Merge branch 'unify_iterations' into 'master' (a4c3363e) · Commits · Giacomo Mulas / NP_TMcode

src/cluster/cluster.cpp

+56 −40

Original line number	Diff line number	Diff line
		@@ -288,53 +288,55 @@ void cluster(const string& config_file, const string& data_file, const string& o
		}

		// do the first iteration on jxi488 separately, since it seems to be different from the others
		int jxi488 = 1;
		// not anymore, now this iteration is at the same level as others
		int jxi488;
		int initialmaxrefiters = cid->maxrefiters;
		chrono::time_point<chrono::high_resolution_clock> start_iter_1 = chrono::high_resolution_clock::now();
		#ifdef USE_NVTX
		nvtxRangePush("First iteration");
		#endif
		//chrono::time_point<chrono::high_resolution_clock> start_iter_1 = chrono::high_resolution_clock::now();
		// #ifdef USE_NVTX
		// nvtxRangePush("First iteration");
		// #endif
		// use these pragmas, which should have no effect on parallelism, just to push OMP nested levels at the same level also in the first wavelength iteration
		int jer = 0;
		#pragma omp parallel
		{
		#pragma omp single
		{
		jer = cluster_jxi488_cycle(jxi488, sconf, gconf, p_scattering_angles, cid, p_output, output_path, vtppoanp);
		} // OMP sinlge
		} // OMP parallel
		#ifdef USE_NVTX
		nvtxRangePop();
		#endif
		chrono::time_point<chrono::high_resolution_clock> end_iter_1 = chrono::high_resolution_clock::now();
		elapsed = start_iter_1 - t_start;
		string message = "INFO: Calculation setup took " + to_string(elapsed.count()) + "s.\n";
		logger->log(message);
		time_logger->log(message);
		elapsed = end_iter_1 - start_iter_1;
		message = "INFO: First iteration took " + to_string(elapsed.count()) + "s.\n";
		logger->log(message);
		time_logger->log(message);
		// #pragma omp parallel
		// {
		// #pragma omp single
		// {
		// jer = cluster_jxi488_cycle(jxi488, sconf, gconf, p_scattering_angles, cid, p_output, output_path, vtppoanp);
		// } // OMP single
		// } // OMP parallel
		// #ifdef USE_NVTX
		// nvtxRangePop();
		// #endif
		//chrono::time_point<chrono::high_resolution_clock> end_iter_1 = chrono::high_resolution_clock::now();
		//elapsed = start_iter_1 - t_start;
		// string message = "INFO: Calculation setup took " + to_string(elapsed.count()) + "s.\n";
		// logger->log(message);
		// time_logger->log(message);
		// elapsed = end_iter_1 - start_iter_1;
		// message = "INFO: First iteration took " + to_string(elapsed.count()) + "s.\n";
		// logger->log(message);
		// time_logger->log(message);
		/* for the next iterations, just always do maxiter iterations, assuming the accuracy is good enough */
		cid->refinemode = 0;
		// cid->refinemode = 0;
		/* add an extra iteration for margin, if this does not exceed initialmaxrefiters */
		// if (cid->maxrefiters < initialmaxrefiters) cid->maxrefiters++;
		if (jer != 0) {
		// First loop failed. Halt the calculation.
		fclose(timing_file);
		delete time_logger;
		delete p_output;
		delete p_scattering_angles;
		delete cid;
		delete logger;
		delete sconf;
		delete gconf;
		return;
		}
		// if (jer != 0) {
		// // First loop failed. Halt the calculation.
		// fclose(timing_file);
		// delete time_logger;
		// delete p_output;
		// delete p_scattering_angles;
		// delete cid;
		// delete logger;
		// delete sconf;
		// delete gconf;
		// return;
		// }

		//==================================================
		// do the first outputs here, so that I open here the new files, afterwards I only append
		//==================================================
		// How should we handle this, when first iteration is not treated specially anymore? This should be ok, just write what was put in vtppoanp on initialisation, even if no actual calc was done yet. This creates the file nonetheless,
		vtppoanp->write_to_disk(output_path + "/c_TPPOAN");
		delete vtppoanp;

		@@ -425,7 +427,8 @@ void cluster(const string& config_file, const string& data_file, const string& o
		}
		#pragma omp barrier
		// ok, now I can actually start the parallel calculations
		for (int ixi488=2; ixi488<=cid_2->number_of_scales; ixi488 +=myMPIstride) {
		// we now start the loop from 1, so that the first iteration is run exactly as the others
		for (int ixi488=1; ixi488<=cid_2->number_of_scales; ixi488 +=myMPIstride) {
		// the parallel loop over MPI processes covers a different set of indices for each thread
		#pragma omp barrier
		int myjxi488 = ixi488+myompthread;
		@@ -473,6 +476,10 @@ void cluster(const string& config_file, const string& data_file, const string& o
		// thread 0 writes its virtual files, now including contributions from all threads, to disk, and deletes them
		// p_outarray[0]->append_to_disk(output_path + "/c_OCLU");
		// delete p_outarray[0];
		// ******************************************************
		// if this is the very first time, we should actually use
		// ->write_to_disk, not ->append_to_disk
		// ******************************************************
		vtppoanarray[0]->append_to_disk(output_path + "/c_TPPOAN");
		delete vtppoanarray[0];

		@@ -604,7 +611,8 @@ void cluster(const string& config_file, const string& data_file, const string& o
		// make sure all threads align here: I don't want the following loop to accidentally start for thread 0, possibly modifying some variables before they are copied by all other threads
		#pragma omp barrier
		// ok, now I can actually start the parallel calculations
		for (int ixi488=2; ixi488<=cid_2->number_of_scales; ixi488 +=myMPIstride) {
		// we now start the loop from 1, so that the first iteration is treated exactly on equal footing as others
		for (int ixi488=1; ixi488<=cid_2->number_of_scales; ixi488 +=myMPIstride) {
		// the parallel loop over MPI processes covers a different set of indices for each thread
		#pragma omp barrier
		int myjxi488 = ixi488 + myjxi488startoffset + myompthread;
		@@ -698,7 +706,8 @@ int cluster_jxi488_cycle(
		int jer = 0;
		int lcalc = 0;
		int jaw = 1;
		bool is_first_scale = (jxi488 == 1);
		// this is now part of cid, so don't mess with it here, just copy it by reference
		bool &is_first_scale = cid->is_first_scale;
		const int nsph = sconf->number_of_spheres;
		const np_int mxndm = gconf->mxndm;
		const int iavm = gconf->iavm;
		@@ -1563,6 +1572,8 @@ int cluster_jxi488_cycle(
		} // jph484 loop
		th += sa->thstp;
		} // jth486 loop
		// at the end of this iteration make sure to set is_first_scale to false, the next iteration will reinitialise _only_ if the order changes
		is_first_scale = 0;
		#ifdef USE_NVTX
		nvtxRangePop();
		#endif
		@@ -1679,6 +1690,8 @@ ClusterIterationData::ClusterIterationData(GeometryConfiguration *gconf, Scatter
		proc_device = 0;
		#endif

		// the first time the object is used, it will be a "first iteration", to ensure data structures are properly initialised.
		is_first_scale = 1;
		// In the first iteration, if refinement is enabled, determine the number of refinement iterations required to arrive at the target accuracy (if achievable in a reasonable number of iterations)
		refinemode = 2;
		// maxrefiters and accuracygoal should be configurable and preferably set somewhere else
		@@ -1827,6 +1840,7 @@ ClusterIterationData::ClusterIterationData(const ClusterIterationData& rhs) {
		number_of_scales = rhs.number_of_scales;

		proc_device = rhs.proc_device;
		is_first_scale = rhs.is_first_scale;
		refinemode = rhs.refinemode;
		maxrefiters = rhs.maxrefiters;
		accuracygoal = rhs.accuracygoal;
		@@ -1964,6 +1978,7 @@ ClusterIterationData::ClusterIterationData(const mixMPI *mpidata, const int devi
		proc_device = 0;
		#endif
		MPI_Bcast(&refinemode, 1, MPI_INT, 0, MPI_COMM_WORLD);
		MPI_Bcast(&is_first_scale, 1, MPI_C_BOOL, 0, MPI_COMM_WORLD);
		MPI_Bcast(&maxrefiters, 1, MPI_INT, 0, MPI_COMM_WORLD);
		MPI_Bcast(&accuracygoal, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
		}
		@@ -2035,6 +2050,7 @@ void ClusterIterationData::mpibcast(const mixMPI *mpidata) {
		MPI_Bcast(&xiblock, 1, MPI_INT, 0, MPI_COMM_WORLD);
		MPI_Bcast(&number_of_scales, 1, MPI_INT, 0, MPI_COMM_WORLD);
		MPI_Bcast(&refinemode, 1, MPI_INT, 0, MPI_COMM_WORLD);
		MPI_Bcast(&is_first_scale, 1, MPI_C_BOOL, 0, MPI_COMM_WORLD);
		MPI_Bcast(&maxrefiters, 1, MPI_INT, 0, MPI_COMM_WORLD);
		MPI_Bcast(&accuracygoal, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
		}

src/include/IterationData.h

+6 −0

Original line number	Diff line number	Diff line
		@@ -135,6 +135,8 @@ public:
		int proc_device;
		//! \brief Refinement mode selction flag.
		int refinemode;
		//! \brief flag defining a first iteration
		bool is_first_scale;
		//! \brief Maximum number of refinement iterations.
		int maxrefiters;
		//! \brief Required accuracy level.
		@@ -311,6 +313,8 @@ public:
		int proc_device;
		//! \brief Refinement mode selction flag.
		int refinemode;
		//! \brief flag defining a first iteration
		bool is_first_scale;
		//! \brief Maximum number of refinement iterations.
		int maxrefiters;
		//! \brief Required accuracy level.
		@@ -482,6 +486,8 @@ public:
		double **tqss;
		//! \brief Scattering coefficients tensor.
		double ****zpv;
		//! \brief flag for first time initialisation
		bool is_first_scale;

		/*! \brief `SphereIterationData` default instance constructor.
		*

src/inclusion/inclusion.cpp

+51 −42

Original line number	Diff line number	Diff line
		@@ -278,50 +278,50 @@ void inclusion(const string& config_file, const string& data_file, const string&
		}

		// do the first iteration on jxi488 separately, since it seems to be different from the others
		int jxi488 = 1;
		int jxi488;
		int initialmaxrefiters = cid->maxrefiters;

		chrono::time_point<chrono::high_resolution_clock> start_iter_1 = chrono::high_resolution_clock::now();
		#ifdef USE_NVTX
		nvtxRangePush("First iteration");
		#endif
		// chrono::time_point<chrono::high_resolution_clock> start_iter_1 = chrono::high_resolution_clock::now();
		// #ifdef USE_NVTX
		// nvtxRangePush("First iteration");
		// #endif
		// use these pragmas, which should have no effect on parallelism, just to push OMP nested levels at the same level also in the first wavelength iteration
		int jer = 0;
		#pragma omp parallel
		{
		#pragma omp single
		{
		jer = inclusion_jxi488_cycle(jxi488, sconf, gconf, p_scattering_angles, cid, p_output, output_path, vtppoanp);
		}
		}
		#ifdef USE_NVTX
		nvtxRangePop();
		#endif
		chrono::time_point<chrono::high_resolution_clock> end_iter_1 = chrono::high_resolution_clock::now();
		elapsed = start_iter_1 - t_start;
		string message = "INFO: Calculation setup took " + to_string(elapsed.count()) + "s.\n";
		logger->log(message);
		time_logger->log(message);
		elapsed = end_iter_1 - start_iter_1;
		message = "INFO: First iteration took " + to_string(elapsed.count()) + "s.\n";
		logger->log(message);
		time_logger->log(message);
		// #pragma omp parallel
		// {
		// #pragma omp single
		// {
		// jer = inclusion_jxi488_cycle(jxi488, sconf, gconf, p_scattering_angles, cid, p_output, output_path, vtppoanp);
		// }
		// }
		// #ifdef USE_NVTX
		// nvtxRangePop();
		// #endif
		// chrono::time_point<chrono::high_resolution_clock> end_iter_1 = chrono::high_resolution_clock::now();
		// elapsed = start_iter_1 - t_start;
		// string message = "INFO: Calculation setup took " + to_string(elapsed.count()) + "s.\n";
		// logger->log(message);
		// time_logger->log(message);
		// elapsed = end_iter_1 - start_iter_1;
		// message = "INFO: First iteration took " + to_string(elapsed.count()) + "s.\n";
		// logger->log(message);
		// time_logger->log(message);
		/* for the next iterations, just always do maxiter iterations, assuming the accuracy is good enough */
		cid->refinemode = 0;
		/* add an extra iteration for margin, if this does not exceed initialmaxrefiters */
		// if (cid->maxrefiters < initialmaxrefiters) cid->maxrefiters++;
		if (jer != 0) {
		// First loop failed. Halt the calculation.
		fclose(timing_file);
		delete time_logger;
		delete p_output;
		delete p_scattering_angles;
		delete cid;
		delete logger;
		delete sconf;
		delete gconf;
		return;
		}
		// cid->refinemode = 0;
		// /* add an extra iteration for margin, if this does not exceed initialmaxrefiters */
		// // if (cid->maxrefiters < initialmaxrefiters) cid->maxrefiters++;
		// if (jer != 0) {
		// // First loop failed. Halt the calculation.
		// fclose(timing_file);
		// delete time_logger;
		// delete p_output;
		// delete p_scattering_angles;
		// delete cid;
		// delete logger;
		// delete sconf;
		// delete gconf;
		// return;
		// }

		//==================================================
		// do the first outputs here, so that I open here the new files, afterwards I only append
		@@ -415,7 +415,7 @@ void inclusion(const string& config_file, const string& data_file, const string&
		}
		#pragma omp barrier
		// ok, now I can actually start the parallel calculations
		for (int ixi488=2; ixi488<=cid_2->number_of_scales; ixi488 +=myMPIstride) {
		for (int ixi488=1; ixi488<=cid_2->number_of_scales; ixi488 +=myMPIstride) {
		// the parallel loop over MPI processes covers a different set of indices for each thread
		#pragma omp barrier
		int myjxi488 = ixi488+myompthread;
		@@ -595,7 +595,7 @@ void inclusion(const string& config_file, const string& data_file, const string&
		// make sure all threads align here: I don't want the following loop to accidentally start for thread 0, possibly modifying some variables before they are copied by all other threads
		#pragma omp barrier
		// ok, now I can actually start the parallel calculations
		for (int ixi488=2; ixi488<=cid_2->number_of_scales; ixi488 +=myMPIstride) {
		for (int ixi488=1; ixi488<=cid_2->number_of_scales; ixi488 +=myMPIstride) {
		// the parallel loop over MPI processes covers a different set of indices for each thread
		#pragma omp barrier
		int myjxi488 = ixi488 + myjxi488startoffset + myompthread;
		@@ -685,7 +685,9 @@ int inclusion_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryCo
		int jer = 0;
		int lcalc = 0;
		int jaw = 1;
		bool is_first_scale = (jxi488 == 1);
		// this is now part of cid, so don't mess with it here, just copy it by reference
		bool &is_first_scale = cid->is_first_scale;
		//bool is_first_scale = (jxi488 == 1);
		// int li = cid->c1->li;
		// int le = cid->c1->le;
		// int lm = cid->c1->lm;
		@@ -1414,6 +1416,8 @@ int inclusion_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryCo
		} // jph484 loop
		th += sa->thstp;
		} // jth486 loop
		// at the end of this iteration make sure to set is_first_scale to false, the next iteration will reinitialise _only_ if the order changes
		is_first_scale = 0;
		#ifdef USE_NVTX
		nvtxRangePop();
		#endif
		@@ -1534,6 +1538,8 @@ InclusionIterationData::InclusionIterationData(GeometryConfiguration *gconf, Sca
		proc_device = 0;
		#endif

		// the first time the object is used, it will be a "first iteration", to ensure data structures are properly initialised.
		is_first_scale = 1;
		// In the first iteration, if refinement is enabled, determine the number of refinement iterations required to arrive at the target accuracy (if achievable in a reasonable number of iterations)
		refinemode = 2;
		// maxrefiters and accuracygoal should be configurable and preferably set somewhere else
		@@ -1685,6 +1691,7 @@ InclusionIterationData::InclusionIterationData(const InclusionIterationData& rhs
		extr = rhs.extr;

		proc_device = rhs.proc_device;
		is_first_scale = rhs.is_first_scale;
		refinemode = rhs.refinemode;
		maxrefiters = rhs.maxrefiters;
		accuracygoal = rhs.accuracygoal;
		@@ -1823,6 +1830,7 @@ InclusionIterationData::InclusionIterationData(const mixMPI *mpidata, const int
		proc_device = 0;
		#endif
		MPI_Bcast(&refinemode, 1, MPI_INT, 0, MPI_COMM_WORLD);
		MPI_Bcast(&is_first_scale, 1, MPI_C_BOOL, 0, MPI_COMM_WORLD);
		MPI_Bcast(&maxrefiters, 1, MPI_INT, 0, MPI_COMM_WORLD);
		MPI_Bcast(&accuracygoal, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
		}
		@@ -1888,6 +1896,7 @@ void InclusionIterationData::mpibcast(const mixMPI *mpidata) {
		MPI_Bcast(&nimd, 1, MPI_INT, 0, MPI_COMM_WORLD);
		MPI_Bcast(&extr, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
		MPI_Bcast(&refinemode, 1, MPI_INT, 0, MPI_COMM_WORLD);
		MPI_Bcast(&is_first_scale, 1, MPI_C_BOOL, 0, MPI_COMM_WORLD);
		MPI_Bcast(&maxrefiters, 1, MPI_INT, 0, MPI_COMM_WORLD);
		MPI_Bcast(&accuracygoal, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
		}

src/sphere/sphere.cpp

+32 −21

Original line number	Diff line number	Diff line
		@@ -201,25 +201,25 @@ void sphere(const string& config_file, const string& data_file, const string& ou
		}

		// Do the first wavelength iteration
		int jxi488 = 1;
		// int jxi488 = 1;
		// Use pragmas to put OMP parallelism to second level.
		int jer = 0;
		#pragma omp parallel
		{
		#pragma omp single
		{
		jer = sphere_jxi488_cycle(jxi488 - 1, sconf, gconf, p_sa, sid, p_output, output_path, vtppoanp);
		} // OMP single
		} // OMP parallel
		if (jer != 0) { // First iteration failed. Halt the calculation.
		delete p_output;
		delete p_sa;
		delete sid;
		delete logger;
		delete sconf;
		delete gconf;
		return;
		}
		// #pragma omp parallel
		// {
		// #pragma omp single
		// {
		// jer = sphere_jxi488_cycle(jxi488 - 1, sconf, gconf, p_sa, sid, p_output, output_path, vtppoanp);
		// } // OMP single
		// } // OMP parallel
		// if (jer != 0) { // First iteration failed. Halt the calculation.
		// delete p_output;
		// delete p_sa;
		// delete sid;
		// delete logger;
		// delete sconf;
		// delete gconf;
		// return;
		// }

		//==================================================
		// do the first outputs here, so that I open here the new files, afterwards I only append
		@@ -310,7 +310,7 @@ void sphere(const string& config_file, const string& data_file, const string& ou
		}
		#pragma omp barrier
		// ok, now I can actually start the parallel calculations
		for (int ixi488=2; ixi488<=sid_2->number_of_scales; ixi488 +=myMPIstride) {
		for (int ixi488=1; ixi488<=sid_2->number_of_scales; ixi488 +=myMPIstride) {
		// the parallel loop over MPI processes covers a different set of indices for each thread
		#pragma omp barrier
		int myjxi488 = ixi488+myompthread;
		@@ -463,7 +463,7 @@ void sphere(const string& config_file, const string& data_file, const string& ou
		// make sure all threads align here: I don't want the following loop to accidentally start for thread 0, possibly modifying some variables before they are copied by all other threads
		#pragma omp barrier
		// ok, now I can actually start the parallel calculations
		for (int ixi488=2; ixi488<=sid_2->number_of_scales; ixi488 +=myMPIstride) {
		for (int ixi488=1; ixi488<=sid_2->number_of_scales; ixi488 +=myMPIstride) {
		// the parallel loop over MPI processes covers a different set of indices for each thread
		#pragma omp barrier
		int myjxi488 = ixi488 + myjxi488startoffset + myompthread;
		@@ -546,7 +546,9 @@ int sphere_jxi488_cycle(
		int oindex = 0;
		int jxi = jxi488 + 1;
		int jxindex = jxi - oi->first_xi;
		bool is_first_scale = (jxi == 1);
		// this is now part of sid, so don't mess with it here, just copy it by reference
		bool &is_first_scale = sid->is_first_scale;
		//bool is_first_scale = (jxi == 1);
		int nsph = gconf->number_of_spheres;
		//int l_max = gconf->l_max;
		int in_pol = gconf->in_pol;
		@@ -889,6 +891,8 @@ int sphere_jxi488_cycle(
		} // jph484 loop on elevation
		th += sa->thstp;
		} // jth486 loop on azimuth
		// at the end of this iteration make sure to set is_first_scale to false, the next iteration will reinitialise _only_ if the order changes
		is_first_scale = 0;
		oi->vec_jxi[jxindex] = jxi;
		logger->log("INFO: finished scale iteration " + to_string(jxi) + " of " + to_string(nxi) + ".\n");
		delete logger;
		@@ -977,6 +981,7 @@ SphereIterationData::SphereIterationData(
		zpv[zi][zj][1] = (vec_zpv + vec_index + 2);
		}
		}
		is_first_scale = 1;
		}

		SphereIterationData::SphereIterationData(const SphereIterationData &rhs) {
		@@ -1082,6 +1087,7 @@ SphereIterationData::SphereIterationData(const SphereIterationData &rhs) {
		zpv[zi][zj][1] = (vec_zpv + vec_index + 2);
		}
		}
		is_first_scale = rhs.is_first_scale;
		}

		SphereIterationData::~SphereIterationData() {
		@@ -1207,6 +1213,9 @@ SphereIterationData::SphereIterationData(const mixMPI *mpidata, const int device
		// Collect vectors whose size depend on LM
		vec_zpv = new double[12 * _lm];
		MPI_Bcast(vec_zpv, 12 * _lm, MPI_DOUBLE, 0, MPI_COMM_WORLD);

		MPI_Bcast(&is_first_scale, 1, MPI_C_BOOL, 0, MPI_COMM_WORLD);

		}

		int SphereIterationData::mpibcast(const mixMPI *mpidata) {
		@@ -1270,6 +1279,8 @@ int SphereIterationData::mpibcast(const mixMPI *mpidata) {
		// Broadcast vectors whose size depend on LM
		MPI_Bcast(vec_zpv, 12 * _lm, MPI_DOUBLE, 0, MPI_COMM_WORLD);

		MPI_Bcast(&is_first_scale, 1, MPI_C_BOOL, 0, MPI_COMM_WORLD);

		return 0;
		}
		#endif // MPI_VERSION