Commit 73c5bf01 authored by Giovanni La Mura
Browse files

Enable NVIDIA markers

parent a603bb7c
Loading
Loading
Loading
Loading
+9 −0
Original line number Diff line number Diff line
@@ -22,6 +22,7 @@
#include <exception>
#include <fstream>
#include <string>
#include <nvtx3/nvToolsExt.h>
#ifdef _OPENMP
#include <omp.h>
#endif
@@ -84,6 +85,7 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
 */
void cluster(const string& config_file, const string& data_file, const string& output_path, const mixMPI *mpidata) {
  chrono::time_point<chrono::high_resolution_clock> t_start = chrono::high_resolution_clock::now();
  nvtxRangePush("Set up starts");
  chrono::duration<double> elapsed;
  string message;
  string timing_name = output_path + "/c_timing_mpi"+ to_string(mpidata->rank) +".log";
@@ -219,10 +221,13 @@ void cluster(const string& config_file, const string& data_file, const string& o
	  fprintf(output, "  VK=%15.7lE, XI IS SCALE FACTOR FOR LENGTHS\n", cid->vk);
	  fprintf(output, " \n");
	}
	nvtxRangePop();
	// do the first iteration on jxi488 separately, since it seems to be different from the others
	int jxi488 = 1;
	chrono::time_point<chrono::high_resolution_clock> start_iter_1 = chrono::high_resolution_clock::now();
	nvtxRangePush("First iteration starts");
	int jer = cluster_jxi488_cycle(jxi488, sconf, gconf, p_scattering_angles, cid, output, output_path, tppoan);
	nvtxRangePop();
	chrono::time_point<chrono::high_resolution_clock> end_iter_1 = chrono::high_resolution_clock::now();
	elapsed = start_iter_1 - t_start;
	string message = "INFO: Calculation setup took " + to_string(elapsed.count()) + "s.\n";
@@ -258,6 +263,7 @@ void cluster(const string& config_file, const string& data_file, const string& o
	// Create this variable and initialise it with a default here, so that it is defined anyway, with or without OpenMP support enabled
	int ompnumthreads = 1;

	nvtxRangePush("Parallel loop starts");
#pragma omp parallel
	{
	  // Create and initialise this variable here, so that if OpenMP is enabled it is local to the thread, and if OpenMP is not enabled it has a well-defined value anyway
@@ -309,7 +315,9 @@ void cluster(const string& config_file, const string& data_file, const string& o
	    logger->log(message);
	  }
	} // closes pragma omp parallel
	nvtxRangePop();

	nvtxRangePush("Output concatenation starts");
#ifdef _OPENMP
#pragma omp barrier
	{
@@ -383,6 +391,7 @@ void cluster(const string& config_file, const string& data_file, const string& o
	  }
	}
#endif
	nvtxRangePop();
	tppoanp->close();
	delete tppoanp;
      } else { // In case TPPOAN could not be opened. Should never happen.
+1 −1
Original line number Diff line number Diff line
@@ -102,7 +102,7 @@ endif
ifdef CUDA_HOME
override MAGMA_LDFLAGS+= -L$(CUDA_HOME)/lib64
endif
override MAGMA_LDFLAGS+= -lmagma -lcudart
override MAGMA_LDFLAGS+= -lmagma -lcudart -lnvToolsExt
#the next endif is for USE_MAGMA
endif