Commit 4edaf082 authored by Giovanni La Mura
Browse files

Use compiler flags to enable NVIDIA profiling

parent ee43703f
Loading
Loading
Loading
Loading
+37 −1
Original line number Diff line number Diff line
@@ -22,7 +22,6 @@
#include <exception>
#include <fstream>
#include <string>
#include <nvtx3/nvToolsExt.h>
#ifdef _OPENMP
#include <omp.h>
#endif
@@ -31,6 +30,9 @@
#include <mpi.h>
#endif
#endif
#ifdef USE_NVTX
#include <nvtx3/nvToolsExt.h>
#endif
#ifdef USE_MAGMA
#include <cuda_runtime.h>
#endif
@@ -108,7 +110,9 @@ void cluster(const string& config_file, const string& data_file, const string& o
#endif
  // the following only happens on MPI process 0
  if (mpidata->rank == 0) {
#ifdef USE_NVTX
    nvtxRangePush("Set up");
#endif
    logger->log("INFO: making legacy configuration...", LOG_INFO);
    ScattererConfiguration *sconf = NULL;
    try {
@@ -139,7 +143,9 @@ void cluster(const string& config_file, const string& data_file, const string& o
      return;
    }
    logger->log(" done.\n", LOG_INFO);
#ifdef USE_NVTX
    nvtxRangePop();
#endif
    int s_nsph = sconf->number_of_spheres;
    int nsph = gconf->number_of_spheres;
    if (s_nsph == nsph) {
@@ -225,9 +231,13 @@ void cluster(const string& config_file, const string& data_file, const string& o
	// do the first iteration on jxi488 separately, since it seems to be different from the others
	int jxi488 = 1;
	chrono::time_point<chrono::high_resolution_clock> start_iter_1 = chrono::high_resolution_clock::now();
#ifdef USE_NVTX
	nvtxRangePush("First iteration");
#endif
	int jer = cluster_jxi488_cycle(jxi488, sconf, gconf, p_scattering_angles, cid, output, output_path, tppoan);
#ifdef USE_NVTX
	nvtxRangePop();
#endif
	chrono::time_point<chrono::high_resolution_clock> end_iter_1 = chrono::high_resolution_clock::now();
	elapsed = start_iter_1 - t_start;
	string message = "INFO: Calculation setup took " + to_string(elapsed.count()) + "s.\n";
@@ -263,7 +273,9 @@ void cluster(const string& config_file, const string& data_file, const string& o
	// Create this variable and initialise it with a default here, so that it is defined anyway, with or without OpenMP support enabled
	int ompnumthreads = 1;

#ifdef USE_NVTX
	nvtxRangePush("Parallel loop");
#endif
#pragma omp parallel
	{
	  // Create and initialise this variable here, so that if OpenMP is enabled it is local to the thread, and if OpenMP is not enabled it has a well-defined value anyway
@@ -315,9 +327,11 @@ void cluster(const string& config_file, const string& data_file, const string& o
	    logger->log(message);
	  }
	} // closes pragma omp parallel
#ifdef USE_NVTX
	nvtxRangePop();

	nvtxRangePush("Output concatenation");
#endif
#ifdef _OPENMP
#pragma omp barrier
	{
@@ -391,7 +405,9 @@ void cluster(const string& config_file, const string& data_file, const string& o
	  }
	}
#endif
#ifdef USE_NVTX
	nvtxRangePop();
#endif
	tppoanp->close();
	delete tppoanp;
      } else { // In case TPPOAN could not be opened. Should never happen.
@@ -598,7 +614,9 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
  np_int ndit = 2 * nsph * cid->c4->nlim;
  int isq, ibf;

#ifdef USE_NVTX
  nvtxRangePush("Prepare matrix calculation");
#endif
  fprintf(output, "========== JXI =%3d ====================\n", jxi488);
  double xi = sconf->get_scale(jxi488 - 1);
  double exdc = sconf->exdc;
@@ -655,19 +673,29 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
      //break;
    }
  } // i132 loop
#ifdef USE_NVTX
  nvtxRangePop();
#endif
  interval_start = chrono::high_resolution_clock::now();
#ifdef USE_NVTX
  nvtxRangePush("Calculate inverted matrix");
#endif
  cms(cid->am, cid->c1, cid->c1ao, cid->c4, cid->c6);
#ifdef USE_NVTX
  nvtxRangePop();
#endif
  interval_end = chrono::high_resolution_clock::now();
  elapsed = interval_end - interval_start;
  message = "INFO: matrix calculation for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n";
  logger->log(message);
  interval_start = chrono::high_resolution_clock::now();
#ifdef USE_NVTX
  nvtxRangePush("Invert the matrix");
#endif
  invert_matrix(cid->am, ndit, jer, mxndm);
#ifdef USE_NVTX
  nvtxRangePop();
#endif
  interval_end = chrono::high_resolution_clock::now();
  elapsed = interval_end - interval_start;
  message = "INFO: matrix inversion for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n";
@@ -679,7 +707,9 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
    // break; // jxi488 loop: goes to memory clean
  }
  interval_start = chrono::high_resolution_clock::now();
#ifdef USE_NVTX
  nvtxRangePush("Average calculation");
#endif
  ztm(cid->am, cid->c1, cid->c1ao, cid->c4, cid->c6, cid->c9);
  if (idfc >= 0) {
    if (jxi488 == jwtm) {
@@ -747,13 +777,17 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
  tppoan.write(reinterpret_cast<char *>(&(cid->vk)), sizeof(double));
  pcrsm0(cid->vk, exri, inpol, cid->c1, cid->c1ao, cid->c4);
  apcra(cid->zpv, cid->c4->le, cid->c1ao->am0m, inpol, sqk, cid->gapm, cid->gappm);
#ifdef USE_NVTX
  nvtxRangePop();
#endif
  interval_end = chrono::high_resolution_clock::now();
  elapsed = interval_end - interval_start;
  message = "INFO: average calculation for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n";
  logger->log(message);
  interval_start = chrono::high_resolution_clock::now();
#ifdef USE_NVTX
  nvtxRangePush("Angle loop");
#endif
  double th = sa->th;
  for (int jth486 = 1; jth486 <= sa->nth; jth486++) { // OpenMP portable?
    double ph = sa->ph;
@@ -1238,7 +1272,9 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
    } // jph484 loop
    th += sa->thstp;
  } // jth486 loop
#ifdef USE_NVTX
  nvtxRangePop();
#endif
  interval_end = chrono::high_resolution_clock::now();
  elapsed = interval_end - interval_start;
  message = "INFO: angle loop for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n";
+13 −2
Original line number Diff line number Diff line
@@ -102,13 +102,24 @@ endif
ifdef CUDA_HOME
override MAGMA_LDFLAGS+= -L$(CUDA_HOME)/lib64
endif
override MAGMA_LDFLAGS+= -lmagma -lcudart -lnvToolsExt
#override MAGMA_LDFLAGS+= -lmagma -lcudart -lnvToolsExt
override MAGMA_LDFLAGS+= -lmagma -lcudart
#the next endif is for USE_MAGMA
endif

# Optional NVTX (NVIDIA profiling) instrumentation.
# Define USE_NVTX from outside this makefile (for example on the make command
# line or in the environment) to enable NVIDIA profiling. `ifdef` only checks
# that the variable has a non-empty value, so any non-empty setting enables it.
# NOTE(review): in the visible part of this makefile NVTX_FLAGS is only consumed
# by the default CXXFLAGS definition (inside `ifndef CXXFLAGS`) -- confirm that
# the flags are still applied when CXXFLAGS is supplied externally.
ifdef USE_NVTX
# Preprocessor definition that switches on the code guarded by `#ifdef USE_NVTX`.
override NVTX_FLAGS=-DUSE_NVTX
ifdef CUDA_HOME
# When CUDA_HOME is set, also add its include directory to the header search path.
override NVTX_FLAGS+= -I$(CUDA_HOME)/include
# closes CUDA_HOME
endif
# closes USE_NVTX
endif

# CXXFLAGS defines the default compilation options for the C++ compiler
ifndef CXXFLAGS
override CXXFLAGS=-O3 -ggdb -pg -coverage -I$(HDF5_INCLUDE) $(MPI_CXXFLAGS)
override CXXFLAGS=-O3 -ggdb -pg -coverage -I$(HDF5_INCLUDE) $(MPI_CXXFLAGS) $(NVTX_FLAGS)
ifdef USE_OPENMP
override CXXFLAGS+= -fopenmp
# closes USE_OPENMP