Commit ea97cf42 authored by Giovanni Lacopo's avatar Giovanni Lacopo
Browse files

MPI artifacts in deconvolution loop fixed

parent bde27029
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -148,7 +148,8 @@ struct timing_t {
  double clean_iterations; ///< sub-stage: CleanSolver setup, minor cycles, restoration
  double deconv_io;      ///< sub-stage: MPI-IO write of the 6 deconvolution output planes
  double total;          ///< total runtime
  double write;          ///< time spent in writing the final image (MPI-I/O)
  double write;          ///< time spent in writing the final image (MPI-I/O), pure I/O only
  double write_imbalance;///< pre-write MPI_Barrier wait = phase-correction load-imbalance skew
};

/** @} */
+7 −0
Original line number Diff line number Diff line
@@ -144,6 +144,11 @@ void write_timings(int rank, timing_t timing)
    MPI_Reduce(&timing.clean_iterations, &time_clean_iter, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
    MPI_Reduce(&timing.deconv_io, &time_deconv_io, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
    MPI_Reduce(&timing.write, &time_write, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
    // MIN and MAX for the pre-write barrier wait: the spread exposes the
    // actual load-skew range across ranks (a single MAX would hide it).
    double time_write_imbalance_min, time_write_imbalance_max;
    MPI_Reduce(&timing.write_imbalance, &time_write_imbalance_min, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
    MPI_Reduce(&timing.write_imbalance, &time_write_imbalance_max, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
    MPI_Reduce(&timing.total, &time_total, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    MPI_Reduce(&timing.check, &time_check, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
@@ -164,6 +169,8 @@ void write_timings(int rank, timing_t timing)
        std::cout << std::setw(40) << "  CLEAN iterations + restoration" << " time: " << time_clean_iter << " sec" << std::endl;
        std::cout << std::setw(40) << "  Deconvolution I/O" << " time: " << time_deconv_io << " sec" << std::endl;
        std::cout << std::setw(40) << "I/O (write)" << " time: " << time_write << " sec" << std::endl;
        std::cout << std::setw(40) << "  Write imbalance (min)" << " time: " << time_write_imbalance_min << " sec" << std::endl;
        std::cout << std::setw(40) << "  Write imbalance (max)" << " time: " << time_write_imbalance_max << " sec" << std::endl;
        std::cout << std::setw(40) << "Total" << " time: " << time_total << " sec" << std::endl;
    }

+253 −270

File changed.

Preview size limit exceeded, changes collapsed.

+82 −81
Original line number Diff line number Diff line
@@ -38,7 +38,9 @@
 * Plus 4 bytes padding for 8-byte alignment = 48 bytes total
 */
struct PeakInfo {
    double value;        ///< Peak magnitude (used for threshold comparison)
    double value;        ///< Peak magnitude sqrt(re^2+im^2) (threshold comparison only;
                         ///< the cross-rank argmax compares the EXACT squared
                         ///< magnitude recomputed from real_part/imag_part)
    double real_part;    ///< Real part of complex peak
    double imag_part;    ///< Imaginary part of complex peak
    int local_x;         ///< Local x coordinate within rank's domain
@@ -105,11 +107,12 @@ public:
     * @param rank_east MPI rank owning the slab adjacent to the last local
     *        column (higher global x), or MPI_PROC_NULL.
     * @param comm Communicator spanning all ranks of the decomposition;
     *        used for the restoration halo exchange and the global
     *        kernel-radius agreement (Allreduce).
     *        used for the CLEAN-loop collectives and the restoration
     *        component gather (Allgatherv).
     *
     * The neighbour defaults (MPI_PROC_NULL everywhere) reproduce the old
     * single-slab behaviour: every halo exchange degenerates to a no-op.
     * The neighbour ranks are retained for API stability but are no longer
     * used by the restoration path (the global component list is replicated
     * instead — see gather_global_components()).
     */
    CleanSolver(const CleanConfig& config, int grid_width, int grid_height,
                int x_start, int y_start, double pixel_scale_arcsec = 1.0,
@@ -262,6 +265,12 @@ private:
    // the LOWEST global y, so "north" = the y_start_-side neighbour
    // (neighbors_2d::below in communication.h terms) and "south" = the
    // (y_start_+height_)-side neighbour (neighbors_2d::above).
    //
    // NOTE: since the restoration was rewritten to replicate the (sparse)
    // global clean-component list on every rank (gather_global_components),
    // these are NO LONGER USED by the restoration path. They are retained so
    // the constructor signature (and test_clib.cpp) stays stable and for
    // future distributed kernels that may need face-neighbour communication.
    int rank_north_ = MPI_PROC_NULL;
    int rank_south_ = MPI_PROC_NULL;
    int rank_west_  = MPI_PROC_NULL;
@@ -288,10 +297,10 @@ private:
     * @param bmaj_pix Major-axis FWHM [pixels]
     * @param bmin_pix Minor-axis FWHM [pixels]
     * @param bpa_rad Position angle of the major axis [radians]
     * @param max_radius Upper clamp on R. MUST be identical on all ranks
     *        (restore_image() agrees on it with an Allreduce-MIN over the
     *        per-rank slab dimensions) so that every rank builds the SAME
     *        kernel and the halo-exchange message sizes match.
     * @param max_radius Upper clamp on R. MUST be identical on all ranks so
     *        that every rank builds the SAME kernel. restore_image() derives
     *        it from the GLOBAL image dimensions (the replicated PSF size),
     *        which is decomposition-independent — never from the local slab.
     * @param kernel_radius Output: kernel half-width R
     */
    std::vector<double> build_restoring_kernel(double bmaj_pix, double bmin_pix,
@@ -299,92 +308,84 @@ private:
                                               int& kernel_radius) const;

    /**
     * @brief Exchange border strips of a component map with the four
     *        face-neighbour ranks (restoration halo exchange)
     * @details A clean component within halo_radius pixels of a slab
     * boundary has Gaussian restoring-beam wings that fall into the
     * neighbouring rank's slab. This method gives each rank a copy of the
     * neighbours' border components so convolve_components() can scatter
     * those wings into the local interior.
     *
     * Two-stage shift (Y first, then X): the X-direction column strips are
     * staged from the EXTENDED slab (local columns plus the already-received
     * north/south halo rows), so corner data from DIAGONAL neighbours is
     * propagated in two hops — the four face exchanges then cover the full
     * ring of width halo_radius around the slab, with no separate diagonal
     * messages.
     * @struct GlobalComponent
     * @brief One non-zero clean component in GLOBAL pixel coordinates
     * @details Trivially-copyable POD shipped over MPI as raw bytes by
     * gather_global_components(). Both maps share the entry (a CLEAN
     * iteration deposits real and imaginary flux at the same pixel).
     */
    struct GlobalComponent {
        int gy;      ///< Global row (y) of the component
        int gx;      ///< Global column (x) of the component
        double re;   ///< Accumulated real component flux [Jy/pixel]
        double im;   ///< Accumulated imaginary component flux [Jy/pixel]
    };

    /**
     * @brief Replicate the sparse global clean-component list on every rank
     * @details Each rank extracts its non-zero (re, im) component pixels in
     * local row-major order, the lists are concatenated with an
     * MPI_Allgatherv, and the result is sorted by GLOBAL (gy, gx) — pixel
     * coordinates are unique across the (disjoint) decomposition, so the
     * order is total and bitwise identical on every rank, for ANY
     * decomposition.
     *
     * Buffer layouts (row-major, iy_logical/ix_logical are slab-relative
     * coordinates that extend outside [0,height_) x [0,width_)):
     *  - halo_north: halo_radius x width_ ;
     *      value(iy,ix) = halo_north[(iy + R)*width_ + ix],  iy in [-R, 0)
     *  - halo_south: halo_radius x width_ ;
     *      value(iy,ix) = halo_south[(iy - height_)*width_ + ix],
     *      iy in [height_, height_+R)
     *  - halo_west: (height_ + 2R) x halo_radius (EXTENDED: includes the
     *      neighbour's own north/south halo rows, i.e. the corner blocks);
     *      value(iy,ix) = halo_west[(iy + R)*R + (ix + R)],
     *      iy in [-R, height_+R), ix in [-R, 0)
     *  - halo_east: (height_ + 2R) x halo_radius ;
     *      value(iy,ix) = halo_east[(iy + R)*R + (ix - width_)],
     *      iy in [-R, height_+R), ix in [width_, width_+R)
     * This replaces the former face-neighbour halo exchange, whose one-ring
     * reach forced the restoring-kernel radius to be clamped to
     * min(slab)/4 — a DECOMPOSITION-DEPENDENT truncation of the restoring
     * beam, and the root cause of 1D vs 2D vs relaxed restored-image
     * divergence (catastrophic for relaxed decompositions, where slabs can
     * legally be a single pixel thin). The component list is sparse
     * (<= max_iterations entries), so replication costs O(N_comp) bytes —
     * negligible against the full-image PSF that is already replicated.
     *
     * A buffer is left EMPTY when the corresponding neighbour is
     * MPI_PROC_NULL (global image edge): MPI_Sendrecv with a
     * MPI_PROC_NULL destination/source is a no-op for that direction, so
     * edge ranks need no special casing. Collective over comm_ in the sense
     * that all ranks must call it the same number of times.
     * COLLECTIVE over comm_: all ranks must call it (restore_image() does,
     * exactly once).
     *
     * @param comp Input component map (height_ x width_, row-major)
     * @param halo_radius Halo width R (== kernel_radius; globally uniform)
     * @param halo_north Output: north halo (empty if no neighbour)
     * @param halo_south Output: south halo (empty if no neighbour)
     * @param halo_west Output: extended west halo (empty if no neighbour)
     * @param halo_east Output: extended east halo (empty if no neighbour)
     * @return Globally sorted component list, identical on every rank
     */
    void exchange_component_halos(const std::vector<double>& comp, int halo_radius,
                                  std::vector<double>& halo_north,
                                  std::vector<double>& halo_south,
                                  std::vector<double>& halo_west,
                                  std::vector<double>& halo_east) const;
    std::vector<GlobalComponent> gather_global_components() const;

    /**
     * @brief Convolve a (sparse) component map with the restoring kernel
     * @details Sparse scatter: only non-zero component pixels emit a kernel
     * footprint, so cost is O(N_components * (2R+1)^2) instead of a dense
     * O(N_pixels * (2R+1)^2) — optimal for Hogbom component counts.
     * @brief Convolve the replicated component list with the restoring kernel
     * @details Deterministic GATHER: each thread owns whole OUTPUT rows of
     * the local slab (single writer, no atomics) and accumulates the
     * contributions of all components within kernel_radius in ascending
     * GLOBAL (gy, gx) order — so the per-pixel floating-point summation
     * order is identical for any rank count, decomposition, thread count
     * and run, and identical to a single-rank gather. Components are
     * addressed in global coordinates and clipped to this slab, so wing
     * flux crossing decomposition boundaries is restored exactly, with NO
     * halo communication and NO constraint linking the kernel radius to the
     * slab dimensions.
     *
     * Both maps are convolved in one pass (they share source positions and
     * the kernel row). The negligibility cutoff (1e-12 of the strongest
     * component, per map) is computed from the replicated list, so it is
     * identical on every rank without communication.
     *
     * In addition to the local slab, components held in the four halo
     * buffers (from exchange_component_halos()) are scattered with the same
     * bounds-clipped kernel footprint: their source pixels lie logically
     * OUTSIDE the slab, but the clipping keeps every write inside
     * [0,height_) x [0,width_) — exactly the wing flux that a local-only
     * convolution loses at decomposition boundaries.
     * Purely LOCAL — no MPI calls (the former Allreduce-MAX cutoff
     * agreement is obsolete: the input list is already replicated).
     *
     * @param component Input component map (width_ x height_, row-major)
     * @param components Globally sorted list from gather_global_components()
     * @param kernel Restoring kernel from build_restoring_kernel()
     * @param kernel_radius Kernel half-width R (== halo width)
     * @param halo_north R rows from rank_north_ (layout: see
     *        exchange_component_halos; pass an empty vector for no halo)
     * @param halo_south R rows from rank_south_
     * @param halo_west (height_+2R) x R columns from rank_west_ (extended)
     * @param halo_east (height_+2R) x R columns from rank_east_ (extended)
     * @param convolved Output buffer (width_ x height_), overwritten
     * @param kernel_radius Kernel half-width R
     * @param convolved_real Output (height_ x width_), overwritten
     * @param convolved_imag Output (height_ x width_), overwritten
     */
    void convolve_components(const std::vector<double>& component,
    void convolve_components(const std::vector<GlobalComponent>& components,
                             const std::vector<double>& kernel, int kernel_radius,
                             const std::vector<double>& halo_north,
                             const std::vector<double>& halo_south,
                             const std::vector<double>& halo_west,
                             const std::vector<double>& halo_east,
                             std::vector<double>& convolved) const;
                             std::vector<double>& convolved_real,
                             std::vector<double>& convolved_imag) const;

    /**
     * @brief Compute the restored image (WSClean convention)
     * @details Fits (or takes from config) the restoring beam, agrees on a
     * globally uniform kernel radius (Allreduce-MIN over comm_), exchanges
     * component halos with the decomposition neighbours, convolves the
     * component maps with the beam, and adds the residual:
     * @details Fits (or takes from config) the restoring beam, derives the
     * kernel-radius clamp from the GLOBAL image dimensions (psf_width_/
     * psf_height_ — the beam is fully replicated, so this is identical on
     * every rank and INDEPENDENT of the decomposition), replicates the
     * sparse global component list (Allgatherv), convolves it into the
     * local slab, and adds the residual:
     * restored = (components (*) G_clean) + residual.
     * COLLECTIVE over comm_: every rank must call it (run_clean() does).
     * @param mpi_rank Rank id, used to gate the beam-parameter log line
+8 −4
Original line number Diff line number Diff line
@@ -533,12 +533,16 @@ if (phase_enabled && num_w_planes > 1)
#endif
  }

  double write_0 = WALLCLOCK_TIME;
  
  // Time the pre-write barrier separately: it absorbs the load-imbalance
  // skew from the phase-correction compute (fast-tile ranks wait here for
  // the slowest rank). Folding it into timing.write would misreport idle
  // wait as MPI-IO time under the MPI_MAX reduction in write_timings().
  double barrier_0 = WALLCLOCK_TIME;
  if (size > 1)
  {
    MPI_Barrier(MYMPI_COMM);
  }
  timing.write_imbalance += WALLCLOCK_TIME - barrier_0;

  double write_0 = WALLCLOCK_TIME;


#ifdef FITSIO
Loading