Commit ee43703f authored by Giovanni La Mura
Browse files

Place NVIDIA markers at the same code block level and add them to the parallel loop function

parent 73c5bf01
Loading
Loading
Loading
Loading
+16 −6
Original line number Diff line number Diff line
@@ -85,7 +85,6 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
 */
void cluster(const string& config_file, const string& data_file, const string& output_path, const mixMPI *mpidata) {
  chrono::time_point<chrono::high_resolution_clock> t_start = chrono::high_resolution_clock::now();
  nvtxRangePush("Set up starts");
  chrono::duration<double> elapsed;
  string message;
  string timing_name = output_path + "/c_timing_mpi"+ to_string(mpidata->rank) +".log";
@@ -109,6 +108,7 @@ void cluster(const string& config_file, const string& data_file, const string& o
#endif
  // the following only happens on MPI process 0
  if (mpidata->rank == 0) {
    nvtxRangePush("Set up");
    logger->log("INFO: making legacy configuration...", LOG_INFO);
    ScattererConfiguration *sconf = NULL;
    try {
@@ -139,6 +139,7 @@ void cluster(const string& config_file, const string& data_file, const string& o
      return;
    }
    logger->log(" done.\n", LOG_INFO);
    nvtxRangePop();
    int s_nsph = sconf->number_of_spheres;
    int nsph = gconf->number_of_spheres;
    if (s_nsph == nsph) {
@@ -221,11 +222,10 @@ void cluster(const string& config_file, const string& data_file, const string& o
	  fprintf(output, "  VK=%15.7lE, XI IS SCALE FACTOR FOR LENGTHS\n", cid->vk);
	  fprintf(output, " \n");
	}
	nvtxRangePop();
	// do the first iteration on jxi488 separately, since it seems to be different from the others
	int jxi488 = 1;
	chrono::time_point<chrono::high_resolution_clock> start_iter_1 = chrono::high_resolution_clock::now();
	nvtxRangePush("First iteration starts");
	nvtxRangePush("First iteration");
	int jer = cluster_jxi488_cycle(jxi488, sconf, gconf, p_scattering_angles, cid, output, output_path, tppoan);
	nvtxRangePop();
	chrono::time_point<chrono::high_resolution_clock> end_iter_1 = chrono::high_resolution_clock::now();
@@ -263,7 +263,7 @@ void cluster(const string& config_file, const string& data_file, const string& o
	// Create this variable and initialise it with a default here, so that it is defined anyway, with or without OpenMP support enabled
	int ompnumthreads = 1;

	nvtxRangePush("Parallel loop starts");
	nvtxRangePush("Parallel loop");
#pragma omp parallel
	{
	  // Create and initialise this variable here, so that if OpenMP is enabled it is local to the thread, and if OpenMP is not enabled it has a well-defined value anyway
@@ -317,7 +317,7 @@ void cluster(const string& config_file, const string& data_file, const string& o
	} // closes pragma omp parallel
	nvtxRangePop();

	nvtxRangePush("Output concatenation starts");
	nvtxRangePush("Output concatenation");
#ifdef _OPENMP
#pragma omp barrier
	{
@@ -598,6 +598,7 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
  np_int ndit = 2 * nsph * cid->c4->nlim;
  int isq, ibf;

  nvtxRangePush("Prepare matrix calculation");
  fprintf(output, "========== JXI =%3d ====================\n", jxi488);
  double xi = sconf->get_scale(jxi488 - 1);
  double exdc = sconf->exdc;
@@ -654,14 +655,19 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
      //break;
    }
  } // i132 loop
  nvtxRangePop();
  interval_start = chrono::high_resolution_clock::now();
  nvtxRangePush("Calculate inverted matrix");
  cms(cid->am, cid->c1, cid->c1ao, cid->c4, cid->c6);
  nvtxRangePop();
  interval_end = chrono::high_resolution_clock::now();
  elapsed = interval_end - interval_start;
  message = "INFO: matrix calculation for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n";
  logger->log(message);
  interval_start = chrono::high_resolution_clock::now();
  nvtxRangePush("Invert the matrix");
  invert_matrix(cid->am, ndit, jer, mxndm);
  nvtxRangePop();
  interval_end = chrono::high_resolution_clock::now();
  elapsed = interval_end - interval_start;
  message = "INFO: matrix inversion for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n";
@@ -673,6 +679,7 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
    // break; // jxi488 loop: goes to memory clean
  }
  interval_start = chrono::high_resolution_clock::now();
  nvtxRangePush("Average calculation");
  ztm(cid->am, cid->c1, cid->c1ao, cid->c4, cid->c6, cid->c9);
  if (idfc >= 0) {
    if (jxi488 == jwtm) {
@@ -740,11 +747,13 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
  tppoan.write(reinterpret_cast<char *>(&(cid->vk)), sizeof(double));
  pcrsm0(cid->vk, exri, inpol, cid->c1, cid->c1ao, cid->c4);
  apcra(cid->zpv, cid->c4->le, cid->c1ao->am0m, inpol, sqk, cid->gapm, cid->gappm);
  nvtxRangePop();
  interval_end = chrono::high_resolution_clock::now();
  elapsed = interval_end - interval_start;
  message = "INFO: average calculation for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n";
  logger->log(message);
  interval_start = chrono::high_resolution_clock::now();
  nvtxRangePush("Angle loop");
  double th = sa->th;
  for (int jth486 = 1; jth486 <= sa->nth; jth486++) { // OpenMP portable?
    double ph = sa->ph;
@@ -1229,6 +1238,7 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
    } // jph484 loop
    th += sa->thstp;
  } // jth486 loop
  nvtxRangePop();
  interval_end = chrono::high_resolution_clock::now();
  elapsed = interval_end - interval_start;
  message = "INFO: angle loop for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n";