Loading ricklib.h +2 −1 Original line number Diff line number Diff line Loading @@ -148,7 +148,8 @@ struct timing_t { double clean_iterations; ///< sub-stage: CleanSolver setup, minor cycles, restoration double deconv_io; ///< sub-stage: MPI-IO write of the 6 deconvolution output planes double total; ///< total runtime double write; ///< time spent in writing the final image (MPI-I/O) double write; ///< time spent in writing the final image (MPI-I/O), pure I/O only double write_imbalance;///< pre-write MPI_Barrier wait = phase-correction load-imbalance skew }; /** @} */ Loading src/decomposition/decomposition.cpp +7 −0 Original line number Diff line number Diff line Loading @@ -144,6 +144,11 @@ void write_timings(int rank, timing_t timing) MPI_Reduce(&timing.clean_iterations, &time_clean_iter, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); MPI_Reduce(&timing.deconv_io, &time_deconv_io, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); MPI_Reduce(&timing.write, &time_write, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); // MIN and MAX for the pre-write barrier wait: the spread exposes the // actual load-skew range across ranks (a single MAX would hide it). double time_write_imbalance_min, time_write_imbalance_max; MPI_Reduce(&timing.write_imbalance, &time_write_imbalance_min, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD); MPI_Reduce(&timing.write_imbalance, &time_write_imbalance_max, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); MPI_Reduce(&timing.total, &time_total, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); MPI_Reduce(&timing.check, &time_check, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); Loading @@ -164,6 +169,8 @@ void write_timings(int rank, timing_t timing) std::cout << std::setw(40) << " CLEAN iterations + restoration" << " time: " << time_clean_iter << " sec" << std::endl; std::cout << std::setw(40) << " Deconvolution I/O" << " time: " << time_deconv_io << " sec" << std::endl; std::cout << std::setw(40) << "I/O (write)" << " time: " << time_write << " sec" << std::endl; std::cout << std::setw(40) << " Write imbalance (min)" << " time: " << time_write_imbalance_min << " sec" << std::endl; std::cout << std::setw(40) << " Write imbalance (max)" << " time: " << time_write_imbalance_max << " sec" << std::endl; std::cout << std::setw(40) << "Total" << " time: " << time_total << " sec" << std::endl; } Loading src/deconvolution/clean_solver.cpp +253 −270 File changed.Preview size limit exceeded, changes collapsed. Show changes src/deconvolution/clean_solver.h +82 −81 Original line number Diff line number Diff line Loading @@ -38,7 +38,9 @@ * Plus 4 bytes padding for 8-byte alignment = 48 bytes total */ struct PeakInfo { double value; ///< Peak magnitude (used for threshold comparison) double value; ///< Peak magnitude sqrt(re^2+im^2) (threshold comparison only; ///< the cross-rank argmax compares the EXACT squared ///< magnitude recomputed from real_part/imag_part) double real_part; ///< Real part of complex peak double imag_part; ///< Imaginary part of complex peak int local_x; ///< Local x coordinate within rank's domain Loading Loading @@ -105,11 +107,12 @@ public: * @param rank_east MPI rank owning the slab adjacent to the last local * column (higher global x), or MPI_PROC_NULL. * @param comm Communicator spanning all ranks of the decomposition; * used for the restoration halo exchange and the global * kernel-radius agreement (Allreduce). * used for the CLEAN-loop collectives and the restoration * component gather (Allgatherv). * * The neighbour defaults (MPI_PROC_NULL everywhere) reproduce the old * single-slab behaviour: every halo exchange degenerates to a no-op. * The neighbour ranks are retained for API stability but are no longer * used by the restoration path (the global component list is replicated * instead — see gather_global_components()). */ CleanSolver(const CleanConfig& config, int grid_width, int grid_height, int x_start, int y_start, double pixel_scale_arcsec = 1.0, Loading Loading @@ -262,6 +265,12 @@ private: // the LOWEST global y, so "north" = the y_start_-side neighbour // (neighbors_2d::below in communication.h terms) and "south" = the // (y_start_+height_)-side neighbour (neighbors_2d::above). // // NOTE: since the restoration was rewritten to replicate the (sparse) // global clean-component list on every rank (gather_global_components), // these are NO LONGER USED by the restoration path. They are retained so // the constructor signature (and test_clib.cpp) stays stable and for // future distributed kernels that may need face-neighbour communication. int rank_north_ = MPI_PROC_NULL; int rank_south_ = MPI_PROC_NULL; int rank_west_ = MPI_PROC_NULL; Loading @@ -288,10 +297,10 @@ private: * @param bmaj_pix Major-axis FWHM [pixels] * @param bmin_pix Minor-axis FWHM [pixels] * @param bpa_rad Position angle of the major axis [radians] * @param max_radius Upper clamp on R. MUST be identical on all ranks * (restore_image() agrees on it with an Allreduce-MIN over the * per-rank slab dimensions) so that every rank builds the SAME * kernel and the halo-exchange message sizes match. * @param max_radius Upper clamp on R. MUST be identical on all ranks so * that every rank builds the SAME kernel. restore_image() derives * it from the GLOBAL image dimensions (the replicated PSF size), * which is decomposition-independent — never from the local slab. * @param kernel_radius Output: kernel half-width R */ std::vector<double> build_restoring_kernel(double bmaj_pix, double bmin_pix, Loading @@ -299,92 +308,84 @@ private: int& kernel_radius) const; /** * @brief Exchange border strips of a component map with the four * face-neighbour ranks (restoration halo exchange) * @details A clean component within halo_radius pixels of a slab * boundary has Gaussian restoring-beam wings that fall into the * neighbouring rank's slab. This method gives each rank a copy of the * neighbours' border components so convolve_components() can scatter * those wings into the local interior. * * Two-stage shift (Y first, then X): the X-direction column strips are * staged from the EXTENDED slab (local columns plus the already-received * north/south halo rows), so corner data from DIAGONAL neighbours is * propagated in two hops — the four face exchanges then cover the full * ring of width halo_radius around the slab, with no separate diagonal * messages. * @struct GlobalComponent * @brief One non-zero clean component in GLOBAL pixel coordinates * @details Trivially-copyable POD shipped over MPI as raw bytes by * gather_global_components(). Both maps share the entry (a CLEAN * iteration deposits real and imaginary flux at the same pixel). */ struct GlobalComponent { int gy; ///< Global row (y) of the component int gx; ///< Global column (x) of the component double re; ///< Accumulated real component flux [Jy/pixel] double im; ///< Accumulated imaginary component flux [Jy/pixel] }; /** * @brief Replicate the sparse global clean-component list on every rank * @details Each rank extracts its non-zero (re, im) component pixels in * local row-major order, the lists are concatenated with an * MPI_Allgatherv, and the result is sorted by GLOBAL (gy, gx) — pixel * coordinates are unique across the (disjoint) decomposition, so the * order is total and bitwise identical on every rank, for ANY * decomposition. * * Buffer layouts (row-major, iy_logical/ix_logical are slab-relative * coordinates that extend outside [0,height_) x [0,width_)): * - halo_north: halo_radius x width_ ; * value(iy,ix) = halo_north[(iy + R)*width_ + ix], iy in [-R, 0) * - halo_south: halo_radius x width_ ; * value(iy,ix) = halo_south[(iy - height_)*width_ + ix], * iy in [height_, height_+R) * - halo_west: (height_ + 2R) x halo_radius (EXTENDED: includes the * neighbour's own north/south halo rows, i.e. the corner blocks); * value(iy,ix) = halo_west[(iy + R)*R + (ix + R)], * iy in [-R, height_+R), ix in [-R, 0) * - halo_east: (height_ + 2R) x halo_radius ; * value(iy,ix) = halo_east[(iy + R)*R + (ix - width_)], * iy in [-R, height_+R), ix in [width_, width_+R) * This replaces the former face-neighbour halo exchange, whose one-ring * reach forced the restoring-kernel radius to be clamped to * min(slab)/4 — a DECOMPOSITION-DEPENDENT truncation of the restoring * beam, and the root cause of 1D vs 2D vs relaxed restored-image * divergence (catastrophic for relaxed decompositions, where slabs can * legally be a single pixel thin). The component list is sparse * (<= max_iterations entries), so replication costs O(N_comp) bytes — * negligible against the full-image PSF that is already replicated. * * A buffer is left EMPTY when the corresponding neighbour is * MPI_PROC_NULL (global image edge): MPI_Sendrecv with a * MPI_PROC_NULL destination/source is a no-op for that direction, so * edge ranks need no special casing. Collective over comm_ in the sense * that all ranks must call it the same number of times. * COLLECTIVE over comm_: all ranks must call it (restore_image() does, * exactly once). * * @param comp Input component map (height_ x width_, row-major) * @param halo_radius Halo width R (== kernel_radius; globally uniform) * @param halo_north Output: north halo (empty if no neighbour) * @param halo_south Output: south halo (empty if no neighbour) * @param halo_west Output: extended west halo (empty if no neighbour) * @param halo_east Output: extended east halo (empty if no neighbour) * @return Globally sorted component list, identical on every rank */ void exchange_component_halos(const std::vector<double>& comp, int halo_radius, std::vector<double>& halo_north, std::vector<double>& halo_south, std::vector<double>& halo_west, std::vector<double>& halo_east) const; std::vector<GlobalComponent> gather_global_components() const; /** * @brief Convolve a (sparse) component map with the restoring kernel * @details Sparse scatter: only non-zero component pixels emit a kernel * footprint, so cost is O(N_components * (2R+1)^2) instead of a dense * O(N_pixels * (2R+1)^2) — optimal for Hogbom component counts. * @brief Convolve the replicated component list with the restoring kernel * @details Deterministic GATHER: each thread owns whole OUTPUT rows of * the local slab (single writer, no atomics) and accumulates the * contributions of all components within kernel_radius in ascending * GLOBAL (gy, gx) order — so the per-pixel floating-point summation * order is identical for any rank count, decomposition, thread count * and run, and identical to a single-rank gather. Components are * addressed in global coordinates and clipped to this slab, so wing * flux crossing decomposition boundaries is restored exactly, with NO * halo communication and NO constraint linking the kernel radius to the * slab dimensions. * * Both maps are convolved in one pass (they share source positions and * the kernel row). The negligibility cutoff (1e-12 of the strongest * component, per map) is computed from the replicated list, so it is * identical on every rank without communication. * * In addition to the local slab, components held in the four halo * buffers (from exchange_component_halos()) are scattered with the same * bounds-clipped kernel footprint: their source pixels lie logically * OUTSIDE the slab, but the clipping keeps every write inside * [0,height_) x [0,width_) — exactly the wing flux that a local-only * convolution loses at decomposition boundaries. * Purely LOCAL — no MPI calls (the former Allreduce-MAX cutoff * agreement is obsolete: the input list is already replicated). * * @param component Input component map (width_ x height_, row-major) * @param components Globally sorted list from gather_global_components() * @param kernel Restoring kernel from build_restoring_kernel() * @param kernel_radius Kernel half-width R (== halo width) * @param halo_north R rows from rank_north_ (layout: see * exchange_component_halos; pass an empty vector for no halo) * @param halo_south R rows from rank_south_ * @param halo_west (height_+2R) x R columns from rank_west_ (extended) * @param halo_east (height_+2R) x R columns from rank_east_ (extended) * @param convolved Output buffer (width_ x height_), overwritten * @param kernel_radius Kernel half-width R * @param convolved_real Output (height_ x width_), overwritten * @param convolved_imag Output (height_ x width_), overwritten */ void convolve_components(const std::vector<double>& component, void convolve_components(const std::vector<GlobalComponent>& components, const std::vector<double>& kernel, int kernel_radius, const std::vector<double>& halo_north, const std::vector<double>& halo_south, const std::vector<double>& halo_west, const std::vector<double>& halo_east, std::vector<double>& convolved) const; std::vector<double>& convolved_real, std::vector<double>& convolved_imag) const; /** * @brief Compute the restored image (WSClean convention) * @details Fits (or takes from config) the restoring beam, agrees on a * globally uniform kernel radius (Allreduce-MIN over comm_), exchanges * component halos with the decomposition neighbours, convolves the * component maps with the beam, and adds the residual: * @details Fits (or takes from config) the restoring beam, derives the * kernel-radius clamp from the GLOBAL image dimensions (psf_width_/ * psf_height_ — the beam is fully replicated, so this is identical on * every rank and INDEPENDENT of the decomposition), replicates the * sparse global component list (Allgatherv), convolves it into the * local slab, and adds the residual: * restored = (components (*) G_clean) + residual. * COLLECTIVE over comm_: every rank must call it (run_clean() does). * @param mpi_rank Rank id, used to gate the beam-parameter log line Loading src/phase/phase_correction_library.cpp +8 −4 Original line number Diff line number Diff line Loading @@ -533,12 +533,16 @@ if (phase_enabled && num_w_planes > 1) #endif } double write_0 = WALLCLOCK_TIME; // Time the pre-write barrier separately: it absorbs the load-imbalance // skew from the phase-correction compute (fast-tile ranks wait here for // the slowest rank). Folding it into timing.write would misreport idle // wait as MPI-IO time under the MPI_MAX reduction in write_timings(). double barrier_0 = WALLCLOCK_TIME; if (size > 1) { MPI_Barrier(MYMPI_COMM); } timing.write_imbalance += WALLCLOCK_TIME - barrier_0; double write_0 = WALLCLOCK_TIME; #ifdef FITSIO Loading Loading
ricklib.h +2 −1 Original line number Diff line number Diff line Loading @@ -148,7 +148,8 @@ struct timing_t { double clean_iterations; ///< sub-stage: CleanSolver setup, minor cycles, restoration double deconv_io; ///< sub-stage: MPI-IO write of the 6 deconvolution output planes double total; ///< total runtime double write; ///< time spent in writing the final image (MPI-I/O) double write; ///< time spent in writing the final image (MPI-I/O), pure I/O only double write_imbalance;///< pre-write MPI_Barrier wait = phase-correction load-imbalance skew }; /** @} */ Loading
src/decomposition/decomposition.cpp +7 −0 Original line number Diff line number Diff line Loading @@ -144,6 +144,11 @@ void write_timings(int rank, timing_t timing) MPI_Reduce(&timing.clean_iterations, &time_clean_iter, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); MPI_Reduce(&timing.deconv_io, &time_deconv_io, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); MPI_Reduce(&timing.write, &time_write, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); // MIN and MAX for the pre-write barrier wait: the spread exposes the // actual load-skew range across ranks (a single MAX would hide it). double time_write_imbalance_min, time_write_imbalance_max; MPI_Reduce(&timing.write_imbalance, &time_write_imbalance_min, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD); MPI_Reduce(&timing.write_imbalance, &time_write_imbalance_max, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); MPI_Reduce(&timing.total, &time_total, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); MPI_Reduce(&timing.check, &time_check, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); Loading @@ -164,6 +169,8 @@ void write_timings(int rank, timing_t timing) std::cout << std::setw(40) << " CLEAN iterations + restoration" << " time: " << time_clean_iter << " sec" << std::endl; std::cout << std::setw(40) << " Deconvolution I/O" << " time: " << time_deconv_io << " sec" << std::endl; std::cout << std::setw(40) << "I/O (write)" << " time: " << time_write << " sec" << std::endl; std::cout << std::setw(40) << " Write imbalance (min)" << " time: " << time_write_imbalance_min << " sec" << std::endl; std::cout << std::setw(40) << " Write imbalance (max)" << " time: " << time_write_imbalance_max << " sec" << std::endl; std::cout << std::setw(40) << "Total" << " time: " << time_total << " sec" << std::endl; } Loading
src/deconvolution/clean_solver.cpp +253 −270 File changed.Preview size limit exceeded, changes collapsed. Show changes
src/deconvolution/clean_solver.h +82 −81 Original line number Diff line number Diff line Loading @@ -38,7 +38,9 @@ * Plus 4 bytes padding for 8-byte alignment = 48 bytes total */ struct PeakInfo { double value; ///< Peak magnitude (used for threshold comparison) double value; ///< Peak magnitude sqrt(re^2+im^2) (threshold comparison only; ///< the cross-rank argmax compares the EXACT squared ///< magnitude recomputed from real_part/imag_part) double real_part; ///< Real part of complex peak double imag_part; ///< Imaginary part of complex peak int local_x; ///< Local x coordinate within rank's domain Loading Loading @@ -105,11 +107,12 @@ public: * @param rank_east MPI rank owning the slab adjacent to the last local * column (higher global x), or MPI_PROC_NULL. * @param comm Communicator spanning all ranks of the decomposition; * used for the restoration halo exchange and the global * kernel-radius agreement (Allreduce). * used for the CLEAN-loop collectives and the restoration * component gather (Allgatherv). * * The neighbour defaults (MPI_PROC_NULL everywhere) reproduce the old * single-slab behaviour: every halo exchange degenerates to a no-op. * The neighbour ranks are retained for API stability but are no longer * used by the restoration path (the global component list is replicated * instead — see gather_global_components()). */ CleanSolver(const CleanConfig& config, int grid_width, int grid_height, int x_start, int y_start, double pixel_scale_arcsec = 1.0, Loading Loading @@ -262,6 +265,12 @@ private: // the LOWEST global y, so "north" = the y_start_-side neighbour // (neighbors_2d::below in communication.h terms) and "south" = the // (y_start_+height_)-side neighbour (neighbors_2d::above). // // NOTE: since the restoration was rewritten to replicate the (sparse) // global clean-component list on every rank (gather_global_components), // these are NO LONGER USED by the restoration path. They are retained so // the constructor signature (and test_clib.cpp) stays stable and for // future distributed kernels that may need face-neighbour communication. int rank_north_ = MPI_PROC_NULL; int rank_south_ = MPI_PROC_NULL; int rank_west_ = MPI_PROC_NULL; Loading @@ -288,10 +297,10 @@ private: * @param bmaj_pix Major-axis FWHM [pixels] * @param bmin_pix Minor-axis FWHM [pixels] * @param bpa_rad Position angle of the major axis [radians] * @param max_radius Upper clamp on R. MUST be identical on all ranks * (restore_image() agrees on it with an Allreduce-MIN over the * per-rank slab dimensions) so that every rank builds the SAME * kernel and the halo-exchange message sizes match. * @param max_radius Upper clamp on R. MUST be identical on all ranks so * that every rank builds the SAME kernel. restore_image() derives * it from the GLOBAL image dimensions (the replicated PSF size), * which is decomposition-independent — never from the local slab. * @param kernel_radius Output: kernel half-width R */ std::vector<double> build_restoring_kernel(double bmaj_pix, double bmin_pix, Loading @@ -299,92 +308,84 @@ private: int& kernel_radius) const; /** * @brief Exchange border strips of a component map with the four * face-neighbour ranks (restoration halo exchange) * @details A clean component within halo_radius pixels of a slab * boundary has Gaussian restoring-beam wings that fall into the * neighbouring rank's slab. This method gives each rank a copy of the * neighbours' border components so convolve_components() can scatter * those wings into the local interior. * * Two-stage shift (Y first, then X): the X-direction column strips are * staged from the EXTENDED slab (local columns plus the already-received * north/south halo rows), so corner data from DIAGONAL neighbours is * propagated in two hops — the four face exchanges then cover the full * ring of width halo_radius around the slab, with no separate diagonal * messages. * @struct GlobalComponent * @brief One non-zero clean component in GLOBAL pixel coordinates * @details Trivially-copyable POD shipped over MPI as raw bytes by * gather_global_components(). Both maps share the entry (a CLEAN * iteration deposits real and imaginary flux at the same pixel). */ struct GlobalComponent { int gy; ///< Global row (y) of the component int gx; ///< Global column (x) of the component double re; ///< Accumulated real component flux [Jy/pixel] double im; ///< Accumulated imaginary component flux [Jy/pixel] }; /** * @brief Replicate the sparse global clean-component list on every rank * @details Each rank extracts its non-zero (re, im) component pixels in * local row-major order, the lists are concatenated with an * MPI_Allgatherv, and the result is sorted by GLOBAL (gy, gx) — pixel * coordinates are unique across the (disjoint) decomposition, so the * order is total and bitwise identical on every rank, for ANY * decomposition. * * Buffer layouts (row-major, iy_logical/ix_logical are slab-relative * coordinates that extend outside [0,height_) x [0,width_)): * - halo_north: halo_radius x width_ ; * value(iy,ix) = halo_north[(iy + R)*width_ + ix], iy in [-R, 0) * - halo_south: halo_radius x width_ ; * value(iy,ix) = halo_south[(iy - height_)*width_ + ix], * iy in [height_, height_+R) * - halo_west: (height_ + 2R) x halo_radius (EXTENDED: includes the * neighbour's own north/south halo rows, i.e. the corner blocks); * value(iy,ix) = halo_west[(iy + R)*R + (ix + R)], * iy in [-R, height_+R), ix in [-R, 0) * - halo_east: (height_ + 2R) x halo_radius ; * value(iy,ix) = halo_east[(iy + R)*R + (ix - width_)], * iy in [-R, height_+R), ix in [width_, width_+R) * This replaces the former face-neighbour halo exchange, whose one-ring * reach forced the restoring-kernel radius to be clamped to * min(slab)/4 — a DECOMPOSITION-DEPENDENT truncation of the restoring * beam, and the root cause of 1D vs 2D vs relaxed restored-image * divergence (catastrophic for relaxed decompositions, where slabs can * legally be a single pixel thin). The component list is sparse * (<= max_iterations entries), so replication costs O(N_comp) bytes — * negligible against the full-image PSF that is already replicated. * * A buffer is left EMPTY when the corresponding neighbour is * MPI_PROC_NULL (global image edge): MPI_Sendrecv with a * MPI_PROC_NULL destination/source is a no-op for that direction, so * edge ranks need no special casing. Collective over comm_ in the sense * that all ranks must call it the same number of times. * COLLECTIVE over comm_: all ranks must call it (restore_image() does, * exactly once). * * @param comp Input component map (height_ x width_, row-major) * @param halo_radius Halo width R (== kernel_radius; globally uniform) * @param halo_north Output: north halo (empty if no neighbour) * @param halo_south Output: south halo (empty if no neighbour) * @param halo_west Output: extended west halo (empty if no neighbour) * @param halo_east Output: extended east halo (empty if no neighbour) * @return Globally sorted component list, identical on every rank */ void exchange_component_halos(const std::vector<double>& comp, int halo_radius, std::vector<double>& halo_north, std::vector<double>& halo_south, std::vector<double>& halo_west, std::vector<double>& halo_east) const; std::vector<GlobalComponent> gather_global_components() const; /** * @brief Convolve a (sparse) component map with the restoring kernel * @details Sparse scatter: only non-zero component pixels emit a kernel * footprint, so cost is O(N_components * (2R+1)^2) instead of a dense * O(N_pixels * (2R+1)^2) — optimal for Hogbom component counts. * @brief Convolve the replicated component list with the restoring kernel * @details Deterministic GATHER: each thread owns whole OUTPUT rows of * the local slab (single writer, no atomics) and accumulates the * contributions of all components within kernel_radius in ascending * GLOBAL (gy, gx) order — so the per-pixel floating-point summation * order is identical for any rank count, decomposition, thread count * and run, and identical to a single-rank gather. Components are * addressed in global coordinates and clipped to this slab, so wing * flux crossing decomposition boundaries is restored exactly, with NO * halo communication and NO constraint linking the kernel radius to the * slab dimensions. * * Both maps are convolved in one pass (they share source positions and * the kernel row). The negligibility cutoff (1e-12 of the strongest * component, per map) is computed from the replicated list, so it is * identical on every rank without communication. * * In addition to the local slab, components held in the four halo * buffers (from exchange_component_halos()) are scattered with the same * bounds-clipped kernel footprint: their source pixels lie logically * OUTSIDE the slab, but the clipping keeps every write inside * [0,height_) x [0,width_) — exactly the wing flux that a local-only * convolution loses at decomposition boundaries. * Purely LOCAL — no MPI calls (the former Allreduce-MAX cutoff * agreement is obsolete: the input list is already replicated). * * @param component Input component map (width_ x height_, row-major) * @param components Globally sorted list from gather_global_components() * @param kernel Restoring kernel from build_restoring_kernel() * @param kernel_radius Kernel half-width R (== halo width) * @param halo_north R rows from rank_north_ (layout: see * exchange_component_halos; pass an empty vector for no halo) * @param halo_south R rows from rank_south_ * @param halo_west (height_+2R) x R columns from rank_west_ (extended) * @param halo_east (height_+2R) x R columns from rank_east_ (extended) * @param convolved Output buffer (width_ x height_), overwritten * @param kernel_radius Kernel half-width R * @param convolved_real Output (height_ x width_), overwritten * @param convolved_imag Output (height_ x width_), overwritten */ void convolve_components(const std::vector<double>& component, void convolve_components(const std::vector<GlobalComponent>& components, const std::vector<double>& kernel, int kernel_radius, const std::vector<double>& halo_north, const std::vector<double>& halo_south, const std::vector<double>& halo_west, const std::vector<double>& halo_east, std::vector<double>& convolved) const; std::vector<double>& convolved_real, std::vector<double>& convolved_imag) const; /** * @brief Compute the restored image (WSClean convention) * @details Fits (or takes from config) the restoring beam, agrees on a * globally uniform kernel radius (Allreduce-MIN over comm_), exchanges * component halos with the decomposition neighbours, convolves the * component maps with the beam, and adds the residual: * @details Fits (or takes from config) the restoring beam, derives the * kernel-radius clamp from the GLOBAL image dimensions (psf_width_/ * psf_height_ — the beam is fully replicated, so this is identical on * every rank and INDEPENDENT of the decomposition), replicates the * sparse global component list (Allgatherv), convolves it into the * local slab, and adds the residual: * restored = (components (*) G_clean) + residual. * COLLECTIVE over comm_: every rank must call it (run_clean() does). * @param mpi_rank Rank id, used to gate the beam-parameter log line Loading
src/phase/phase_correction_library.cpp +8 −4 Original line number Diff line number Diff line Loading @@ -533,12 +533,16 @@ if (phase_enabled && num_w_planes > 1) #endif } double write_0 = WALLCLOCK_TIME; // Time the pre-write barrier separately: it absorbs the load-imbalance // skew from the phase-correction compute (fast-tile ranks wait here for // the slowest rank). Folding it into timing.write would misreport idle // wait as MPI-IO time under the MPI_MAX reduction in write_timings(). double barrier_0 = WALLCLOCK_TIME; if (size > 1) { MPI_Barrier(MYMPI_COMM); } timing.write_imbalance += WALLCLOCK_TIME - barrier_0; double write_0 = WALLCLOCK_TIME; #ifdef FITSIO Loading