MPI artifacts in deconvolution loop fixed (ea97cf42) · Commits · Claudio Gheller / HPC_Imaging

ricklib.h

+2 −1

Original line number	Diff line number	Diff line
		@@ -148,7 +148,8 @@ struct timing_t {
		double clean_iterations; ///< sub-stage: CleanSolver setup, minor cycles, restoration
		double deconv_io; ///< sub-stage: MPI-IO write of the 6 deconvolution output planes
		double total; ///< total runtime
		double write; ///< time spent in writing the final image (MPI-I/O)
		double write; ///< time spent in writing the final image (MPI-I/O), pure I/O only
		double write_imbalance;///< pre-write MPI_Barrier wait = phase-correction load-imbalance skew
		};

		/** @} */

src/decomposition/decomposition.cpp

+7 −0

Original line number	Diff line number	Diff line
		@@ -144,6 +144,11 @@ void write_timings(int rank, timing_t timing)
		MPI_Reduce(&timing.clean_iterations, &time_clean_iter, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
		MPI_Reduce(&timing.deconv_io, &time_deconv_io, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
		MPI_Reduce(&timing.write, &time_write, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
		// MIN and MAX for the pre-write barrier wait: the spread exposes the
		// actual load-skew range across ranks (a single MAX would hide it).
		double time_write_imbalance_min, time_write_imbalance_max;
		MPI_Reduce(&timing.write_imbalance, &time_write_imbalance_min, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
		MPI_Reduce(&timing.write_imbalance, &time_write_imbalance_max, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
		MPI_Reduce(&timing.total, &time_total, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

		MPI_Reduce(&timing.check, &time_check, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
		@@ -164,6 +169,8 @@ void write_timings(int rank, timing_t timing)
		std::cout << std::setw(40) << " CLEAN iterations + restoration" << " time: " << time_clean_iter << " sec" << std::endl;
		std::cout << std::setw(40) << " Deconvolution I/O" << " time: " << time_deconv_io << " sec" << std::endl;
		std::cout << std::setw(40) << "I/O (write)" << " time: " << time_write << " sec" << std::endl;
		std::cout << std::setw(40) << " Write imbalance (min)" << " time: " << time_write_imbalance_min << " sec" << std::endl;
		std::cout << std::setw(40) << " Write imbalance (max)" << " time: " << time_write_imbalance_max << " sec" << std::endl;
		std::cout << std::setw(40) << "Total" << " time: " << time_total << " sec" << std::endl;
		}

src/deconvolution/clean_solver.cpp

+253 −270

File changed.

Preview size limit exceeded, changes collapsed.

src/deconvolution/clean_solver.h

+82 −81

Original line number	Diff line number	Diff line
		@@ -38,7 +38,9 @@
		* Plus 4 bytes padding for 8-byte alignment = 48 bytes total
		*/
		struct PeakInfo {
		double value; ///< Peak magnitude (used for threshold comparison)
		double value; ///< Peak magnitude sqrt(re^2+im^2) (threshold comparison only;
		///< the cross-rank argmax compares the EXACT squared
		///< magnitude recomputed from real_part/imag_part)
		double real_part; ///< Real part of complex peak
		double imag_part; ///< Imaginary part of complex peak
		int local_x; ///< Local x coordinate within rank's domain
		@@ -105,11 +107,12 @@ public:
		* @param rank_east MPI rank owning the slab adjacent to the last local
		* column (higher global x), or MPI_PROC_NULL.
		* @param comm Communicator spanning all ranks of the decomposition;
		* used for the restoration halo exchange and the global
		* kernel-radius agreement (Allreduce).
		* used for the CLEAN-loop collectives and the restoration
		* component gather (Allgatherv).
		*
		* The neighbour defaults (MPI_PROC_NULL everywhere) reproduce the old
		* single-slab behaviour: every halo exchange degenerates to a no-op.
		* The neighbour ranks are retained for API stability but are no longer
		* used by the restoration path (the global component list is replicated
		* instead — see gather_global_components()).
		*/
		CleanSolver(const CleanConfig& config, int grid_width, int grid_height,
		int x_start, int y_start, double pixel_scale_arcsec = 1.0,
		@@ -262,6 +265,12 @@ private:
		// the LOWEST global y, so "north" = the y_start_-side neighbour
		// (neighbors_2d::below in communication.h terms) and "south" = the
		// (y_start_+height_)-side neighbour (neighbors_2d::above).
		//
		// NOTE: since the restoration was rewritten to replicate the (sparse)
		// global clean-component list on every rank (gather_global_components),
		// these are NO LONGER USED by the restoration path. They are retained so
		// the constructor signature (and test_clib.cpp) stays stable and for
		// future distributed kernels that may need face-neighbour communication.
		int rank_north_ = MPI_PROC_NULL;
		int rank_south_ = MPI_PROC_NULL;
		int rank_west_ = MPI_PROC_NULL;
		@@ -288,10 +297,10 @@ private:
		* @param bmaj_pix Major-axis FWHM [pixels]
		* @param bmin_pix Minor-axis FWHM [pixels]
		* @param bpa_rad Position angle of the major axis [radians]
		* @param max_radius Upper clamp on R. MUST be identical on all ranks
		* (restore_image() agrees on it with an Allreduce-MIN over the
		* per-rank slab dimensions) so that every rank builds the SAME
		* kernel and the halo-exchange message sizes match.
		* @param max_radius Upper clamp on R. MUST be identical on all ranks so
		* that every rank builds the SAME kernel. restore_image() derives
		* it from the GLOBAL image dimensions (the replicated PSF size),
		* which is decomposition-independent — never from the local slab.
		* @param kernel_radius Output: kernel half-width R
		*/
		std::vector<double> build_restoring_kernel(double bmaj_pix, double bmin_pix,
		@@ -299,92 +308,84 @@ private:
		int& kernel_radius) const;

		/**
		* @brief Exchange border strips of a component map with the four
		* face-neighbour ranks (restoration halo exchange)
		* @details A clean component within halo_radius pixels of a slab
		* boundary has Gaussian restoring-beam wings that fall into the
		* neighbouring rank's slab. This method gives each rank a copy of the
		* neighbours' border components so convolve_components() can scatter
		* those wings into the local interior.
		*
		* Two-stage shift (Y first, then X): the X-direction column strips are
		* staged from the EXTENDED slab (local columns plus the already-received
		* north/south halo rows), so corner data from DIAGONAL neighbours is
		* propagated in two hops — the four face exchanges then cover the full
		* ring of width halo_radius around the slab, with no separate diagonal
		* messages.
		* @struct GlobalComponent
		* @brief One non-zero clean component in GLOBAL pixel coordinates
		* @details Trivially-copyable POD shipped over MPI as raw bytes by
		* gather_global_components(). Both maps share the entry (a CLEAN
		* iteration deposits real and imaginary flux at the same pixel).
		*/
		struct GlobalComponent {
		int gy; ///< Global row (y) of the component
		int gx; ///< Global column (x) of the component
		double re; ///< Accumulated real component flux [Jy/pixel]
		double im; ///< Accumulated imaginary component flux [Jy/pixel]
		};

		/**
		* @brief Replicate the sparse global clean-component list on every rank
		* @details Each rank extracts its non-zero (re, im) component pixels in
		* local row-major order, the lists are concatenated with an
		* MPI_Allgatherv, and the result is sorted by GLOBAL (gy, gx) — pixel
		* coordinates are unique across the (disjoint) decomposition, so the
		* order is total and bitwise identical on every rank, for ANY
		* decomposition.
		*
		* Buffer layouts (row-major, iy_logical/ix_logical are slab-relative
		* coordinates that extend outside [0,height_) x [0,width_)):
		* - halo_north: halo_radius x width_ ;
		* value(iy,ix) = halo_north[(iy + R)*width_ + ix], iy in [-R, 0)
		* - halo_south: halo_radius x width_ ;
		* value(iy,ix) = halo_south[(iy - height_)*width_ + ix],
		* iy in [height_, height_+R)
		* - halo_west: (height_ + 2R) x halo_radius (EXTENDED: includes the
		* neighbour's own north/south halo rows, i.e. the corner blocks);
		* value(iy,ix) = halo_west[(iy + R)*R + (ix + R)],
		* iy in [-R, height_+R), ix in [-R, 0)
		* - halo_east: (height_ + 2R) x halo_radius ;
		* value(iy,ix) = halo_east[(iy + R)*R + (ix - width_)],
		* iy in [-R, height_+R), ix in [width_, width_+R)
		* This replaces the former face-neighbour halo exchange, whose one-ring
		* reach forced the restoring-kernel radius to be clamped to
		* min(slab)/4 — a DECOMPOSITION-DEPENDENT truncation of the restoring
		* beam, and the root cause of 1D vs 2D vs relaxed restored-image
		* divergence (catastrophic for relaxed decompositions, where slabs can
		* legally be a single pixel thin). The component list is sparse
		* (<= max_iterations entries), so replication costs O(N_comp) bytes —
		* negligible against the full-image PSF that is already replicated.
		*
		* A buffer is left EMPTY when the corresponding neighbour is
		* MPI_PROC_NULL (global image edge): MPI_Sendrecv with a
		* MPI_PROC_NULL destination/source is a no-op for that direction, so
		* edge ranks need no special casing. Collective over comm_ in the sense
		* that all ranks must call it the same number of times.
		* COLLECTIVE over comm_: all ranks must call it (restore_image() does,
		* exactly once).
		*
		* @param comp Input component map (height_ x width_, row-major)
		* @param halo_radius Halo width R (== kernel_radius; globally uniform)
		* @param halo_north Output: north halo (empty if no neighbour)
		* @param halo_south Output: south halo (empty if no neighbour)
		* @param halo_west Output: extended west halo (empty if no neighbour)
		* @param halo_east Output: extended east halo (empty if no neighbour)
		* @return Globally sorted component list, identical on every rank
		*/
		void exchange_component_halos(const std::vector<double>& comp, int halo_radius,
		std::vector<double>& halo_north,
		std::vector<double>& halo_south,
		std::vector<double>& halo_west,
		std::vector<double>& halo_east) const;
		std::vector<GlobalComponent> gather_global_components() const;

		/**
		* @brief Convolve a (sparse) component map with the restoring kernel
		* @details Sparse scatter: only non-zero component pixels emit a kernel
		* footprint, so cost is O(N_components * (2R+1)^2) instead of a dense
		* O(N_pixels * (2R+1)^2) — optimal for Hogbom component counts.
		* @brief Convolve the replicated component list with the restoring kernel
		* @details Deterministic GATHER: each thread owns whole OUTPUT rows of
		* the local slab (single writer, no atomics) and accumulates the
		* contributions of all components within kernel_radius in ascending
		* GLOBAL (gy, gx) order — so the per-pixel floating-point summation
		* order is identical for any rank count, decomposition, thread count
		* and run, and identical to a single-rank gather. Components are
		* addressed in global coordinates and clipped to this slab, so wing
		* flux crossing decomposition boundaries is restored exactly, with NO
		* halo communication and NO constraint linking the kernel radius to the
		* slab dimensions.
		*
		* Both maps are convolved in one pass (they share source positions and
		* the kernel row). The negligibility cutoff (1e-12 of the strongest
		* component, per map) is computed from the replicated list, so it is
		* identical on every rank without communication.
		*
		* In addition to the local slab, components held in the four halo
		* buffers (from exchange_component_halos()) are scattered with the same
		* bounds-clipped kernel footprint: their source pixels lie logically
		* OUTSIDE the slab, but the clipping keeps every write inside
		* [0,height_) x [0,width_) — exactly the wing flux that a local-only
		* convolution loses at decomposition boundaries.
		* Purely LOCAL — no MPI calls (the former Allreduce-MAX cutoff
		* agreement is obsolete: the input list is already replicated).
		*
		* @param component Input component map (width_ x height_, row-major)
		* @param components Globally sorted list from gather_global_components()
		* @param kernel Restoring kernel from build_restoring_kernel()
		* @param kernel_radius Kernel half-width R (== halo width)
		* @param halo_north R rows from rank_north_ (layout: see
		* exchange_component_halos; pass an empty vector for no halo)
		* @param halo_south R rows from rank_south_
		* @param halo_west (height_+2R) x R columns from rank_west_ (extended)
		* @param halo_east (height_+2R) x R columns from rank_east_ (extended)
		* @param convolved Output buffer (width_ x height_), overwritten
		* @param kernel_radius Kernel half-width R
		* @param convolved_real Output (height_ x width_), overwritten
		* @param convolved_imag Output (height_ x width_), overwritten
		*/
		void convolve_components(const std::vector<double>& component,
		void convolve_components(const std::vector<GlobalComponent>& components,
		const std::vector<double>& kernel, int kernel_radius,
		const std::vector<double>& halo_north,
		const std::vector<double>& halo_south,
		const std::vector<double>& halo_west,
		const std::vector<double>& halo_east,
		std::vector<double>& convolved) const;
		std::vector<double>& convolved_real,
		std::vector<double>& convolved_imag) const;

		/**
		* @brief Compute the restored image (WSClean convention)
		* @details Fits (or takes from config) the restoring beam, agrees on a
		* globally uniform kernel radius (Allreduce-MIN over comm_), exchanges
		* component halos with the decomposition neighbours, convolves the
		* component maps with the beam, and adds the residual:
		* @details Fits (or takes from config) the restoring beam, derives the
		* kernel-radius clamp from the GLOBAL image dimensions (psf_width_/
		* psf_height_ — the beam is fully replicated, so this is identical on
		* every rank and INDEPENDENT of the decomposition), replicates the
		* sparse global component list (Allgatherv), convolves it into the
		* local slab, and adds the residual:
		* restored = (components (*) G_clean) + residual.
		* COLLECTIVE over comm_: every rank must call it (run_clean() does).
		* @param mpi_rank Rank id, used to gate the beam-parameter log line

src/phase/phase_correction_library.cpp

+8 −4

Original line number	Diff line number	Diff line
		@@ -533,12 +533,16 @@ if (phase_enabled && num_w_planes > 1)
		#endif
		}

		double write_0 = WALLCLOCK_TIME;

		// Time the pre-write barrier separately: it absorbs the load-imbalance
		// skew from the phase-correction compute (fast-tile ranks wait here for
		// the slowest rank). Folding it into timing.write would misreport idle
		// wait as MPI-IO time under the MPI_MAX reduction in write_timings().
		double barrier_0 = WALLCLOCK_TIME;
		if (size > 1)
		{
		MPI_Barrier(MYMPI_COMM);
		}
		timing.write_imbalance += WALLCLOCK_TIME - barrier_0;

		double write_0 = WALLCLOCK_TIME;


		#ifdef FITSIO