Loading src/gridding/wstack.cpp +148 −27 Original line number Diff line number Diff line Loading @@ -7,6 +7,7 @@ #include <numbers> #include <string> #include <algorithm> #include <vector> #include <cctype> #include <iostream> #include "../../ricklib.h" Loading Loading @@ -248,6 +249,13 @@ void wstack( if (grid_w >= num_w_planes) grid_w = num_w_planes - 1; // H4: clamp max-w vis to top plane // Skip if the vis center cell is outside this rank's local grid. // This is CORRECT (not a bug): the bucket-sort redistribution sends a // boundary visibility to EVERY rank whose ghost-extended slab overlaps // it, so halo copies are duplicates. Only the OWNING rank (center cell // inside its slab) must deposit density, otherwise boundary cells would // be double-counted. The owner always holds ALL visibilities whose // center falls in its cells (ghost band is a superset of the owned // band), so the local density at owned cells is complete. if (grid_u < x_start || grid_u >= x_start + xaxis) continue; if (grid_v < y_start || grid_v >= y_start + yaxis) continue; if (grid_w < 0 || grid_w >= num_w_planes) continue; Loading Loading @@ -301,43 +309,156 @@ void wstack( } } // Compute per-vis effective weight ONCE per visibility, using the cell // at the vis center (NOT inside the kernel-cell inner loop). #pragma omp parallel for num_threads(num_threads) // ========================================================================= // Pass 2: per-vis effective weight computation. // // Owned vis (center in this rank's slab): look up weight_uv[cell] directly. // // Halo vis (center in a neighbouring rank's slab but kernel footprint // overlapping this slab): they must use the SAME Briggs effective weight // that the owning rank would assign — otherwise the halo and its conjugate // baseline get mismatched weights, breaking Hermitian symmetry. // // Protocol: two MPI_Alltoallv rounds. // Round 1: each rank sends (u_local, v_local, w) int-triples to the // rank that owns the density cell (the "owning rank"). // Round 2: each owning rank looks up weight_uv and sends the density // double back to the requester. // After the exchange every vis — owned or halo — has its correct density, // and the per-vis weight formula is applied uniformly. // ========================================================================= // -- D0: Gather all ranks' slab extents. ---------------------------------- struct SlabExt { int x0, xs, y0, ys; }; std::vector<SlabExt> all_ext(size); { SlabExt me = { x_start, xaxis, y_start, yaxis }; MPI_Allgather(&me, 4, MPI_INT, all_ext.data(), 4, MPI_INT, MPI_COMM_WORLD); } // Return the rank that owns cell (gu, gv), or -1 for out-of-range cells. auto cell_owner = [&](int gu, int gv) -> int { for (int r = 0; r < size; r++) { if (gu >= all_ext[r].x0 && gu < all_ext[r].x0 + all_ext[r].xs && gv >= all_ext[r].y0 && gv < all_ext[r].y0 + all_ext[r].ys) return r; } return -1; }; // -- D1: Single pass: owned vis get weights now; halo vis get queued. ----- // halo_vis_idx[r] — local vis indices of halos whose owner is rank r // halo_queries[r] — matching (u_local, v_local, w) triples std::vector<std::vector<myull>> halo_vis_idx(size); // Flattened int-triple storage per destination rank (packed separately) std::vector<std::vector<int>> halo_uvw(size); // 3 ints per entry for (myull ii = 0; ii < num_points; ii++) { // H4 FIX: sentinel is > 1.0 (1.0 is a valid max-w vis) if (ww[ii] > 1.0) { out_vis_weight[ii] = 0.0; continue; } double pos_u = uu[ii] / dx; double pos_v = vv[ii] / dx; double ww_i = ww[ii] / dw; const double pos_u = uu[ii] / dx; const double pos_v = vv[ii] / dx; const double ww_i = ww[ii] / dw; int grid_u = (int)pos_u; int grid_v = (int)pos_v; int grid_w = (int)ww_i; if (grid_w >= num_w_planes) grid_w = num_w_planes - 1; // H4: clamp max-w vis to top plane if (grid_u < x_start || grid_u >= x_start + xaxis || grid_v < y_start || grid_v >= y_start + yaxis || grid_w < 0 || grid_w >= num_w_planes) { out_vis_weight[ii] = 0.0; continue; if (grid_w >= num_w_planes) grid_w = num_w_planes - 1; if (grid_w < 0 || grid_w >= num_w_planes) { out_vis_weight[ii] = 0.0; continue; } myull cell = (myull)(grid_u - x_start) if (grid_u >= x_start && grid_u < x_start + xaxis && grid_v >= y_start && grid_v < y_start + yaxis) { // Owned vis — compute weight directly. const myull cell = (myull)(grid_u - x_start) + (myull)(grid_v - y_start) * (myull)xaxis + (myull)grid_w * (myull)xaxis * (myull)yaxis; const double n_cell = weight_uv[cell]; const double w_vis = (double)weight[ii]; if (n_cell <= 0.0) { out_vis_weight[ii] = 0.0; continue; } if (wmode == WeightingMode::Uniform) { // WSClean Uniform: w_eff = w_vis / Σw_cell. if (wmode == WeightingMode::Uniform) out_vis_weight[ii] = w_vis / n_cell; } else { // Briggs // WSClean Briggs: w_eff = w_vis / (1 + Σw_cell · f²). else out_vis_weight[ii] = w_vis / (1.0 + n_cell * briggs_f2); } else { // Halo vis — queue a density request to the owning rank. const int owner = cell_owner(grid_u, grid_v); if (owner < 0) { out_vis_weight[ii] = 0.0; continue; } halo_vis_idx[owner].push_back(ii); halo_uvw[owner].push_back(grid_u - all_ext[owner].x0); halo_uvw[owner].push_back(grid_v - all_ext[owner].y0); halo_uvw[owner].push_back(grid_w); } } // -- D2: Build Alltoallv send buffer for Round 1 (int triples). ----------- std::vector<int> sq_cnt(size), sq_dsp(size, 0); for (int r = 0; r < size; r++) sq_cnt[r] = (int)halo_uvw[r].size(); for (int r = 1; r < size; r++) sq_dsp[r] = sq_dsp[r-1] + sq_cnt[r-1]; const int sq_total = sq_dsp[size-1] + sq_cnt[size-1]; std::vector<int> send_q(sq_total); for (int r = 0; r < size; r++) { std::copy(halo_uvw[r].begin(), halo_uvw[r].end(), send_q.begin() + sq_dsp[r]); } halo_uvw.clear(); // free early // Exchange counts (Alltoall of single ints), then the query triples. std::vector<int> rq_cnt(size), rq_dsp(size, 0); MPI_Alltoall(sq_cnt.data(), 1, MPI_INT, rq_cnt.data(), 1, MPI_INT, MPI_COMM_WORLD); for (int r = 1; r < size; r++) rq_dsp[r] = rq_dsp[r-1] + rq_cnt[r-1]; const int rq_total = rq_dsp[size-1] + rq_cnt[size-1]; std::vector<int> recv_q(rq_total); MPI_Alltoallv(send_q.data(), sq_cnt.data(), sq_dsp.data(), MPI_INT, recv_q.data(), rq_cnt.data(), rq_dsp.data(), MPI_INT, MPI_COMM_WORLD); send_q.clear(); // free early // -- D3: Service received queries: look up weight_uv for each triple. ----- const int n_recv_queries = rq_total / 3; std::vector<double> send_dens(n_recv_queries); for (int j = 0; j < n_recv_queries; j++) { const int u_l = recv_q[j*3 + 0]; const int v_l = recv_q[j*3 + 1]; const int w = recv_q[j*3 + 2]; const myull cell = (myull)u_l + (myull)v_l * (myull)xaxis + (myull)w * (myull)xaxis * (myull)yaxis; send_dens[j] = (cell < dens_size) ? weight_uv[cell] : 0.0; } recv_q.clear(); // -- D4: Alltoallv Round 2 — send densities back to requesters. ----------- // For round 2 the roles flip: what we received becomes what we send. std::vector<int> sd_cnt(size), sd_dsp(size, 0); std::vector<int> rd_cnt(size), rd_dsp(size, 0); for (int r = 0; r < size; r++) sd_cnt[r] = rq_cnt[r] / 3; // queries from r → densities to r for (int r = 0; r < size; r++) rd_cnt[r] = sq_cnt[r] / 3; // queries to r → densities from r for (int r = 1; r < size; r++) sd_dsp[r] = sd_dsp[r-1] + sd_cnt[r-1]; for (int r = 1; r < size; r++) rd_dsp[r] = rd_dsp[r-1] + rd_cnt[r-1]; const int rd_total = rd_dsp[size-1] + rd_cnt[size-1]; std::vector<double> recv_dens(rd_total); MPI_Alltoallv(send_dens.data(), sd_cnt.data(), sd_dsp.data(), MPI_DOUBLE, recv_dens.data(), rd_cnt.data(), rd_dsp.data(), MPI_DOUBLE, MPI_COMM_WORLD); send_dens.clear(); // -- D5: Apply received densities to halo vis. ---------------------------- for (int r = 0; r < size; r++) { for (int j = 0; j < rd_cnt[r]; j++) { const myull vis_ii = halo_vis_idx[r][j]; const double n_cell = recv_dens[rd_dsp[r] + j]; const double w_vis = (double)weight[vis_ii]; if (n_cell <= 0.0) { out_vis_weight[vis_ii] = 0.0; continue; } if (wmode == WeightingMode::Uniform) out_vis_weight[vis_ii] = w_vis / n_cell; else out_vis_weight[vis_ii] = w_vis / (1.0 + n_cell * briggs_f2); } } } Loading src/gridding/wstack.h +14 −0 Original line number Diff line number Diff line Loading @@ -43,6 +43,20 @@ * weight is then precomputed once: * - Uniform: w_eff = w_vis / Σw_cell * - Briggs: w_eff = w_vis / (1 + Σw_cell · f²) * HALO DENSITY EXCHANGE: visibilities whose center cell lies in a * NEIGHBOURING rank's slab (sent here by the ghost-region bucket sort * because their kernel footprint overlaps this slab) cannot address a * local density cell. Their density Σw_cell is fetched from the OWNING * rank via a two-round MPI_Alltoallv exchange: round 1 sends * (u_local, v_local, w) query triples to the cell owner; round 2 returns * the looked-up weight_uv density. Every vis — owned or halo — then gets * the identical Briggs/Uniform effective weight it would receive on its * owning rank, so conjugate baselines split across slabs stay matched * and Hermitian symmetry of the gridded UV plane is preserved exactly. * (Zeroing halo vis truncated footprints at slab boundaries, ~5% * imaginary dirty image; a raw-Natural fallback mismatched conjugate * pair weights catastrophically, up to ~48% imaginary. Both superseded.) * This makes wstack() COLLECTIVE over MPI_COMM_WORLD for Uniform/Briggs. * 4. Main pass: convolve each visibility (and its frequency samples) onto * the grid using w_eff (Uniform/Briggs) or the raw weight (Natural). The * weighting branch is hoisted to the per-visibility level, out of the hot Loading Loading
src/gridding/wstack.cpp +148 −27 Original line number Diff line number Diff line Loading @@ -7,6 +7,7 @@ #include <numbers> #include <string> #include <algorithm> #include <vector> #include <cctype> #include <iostream> #include "../../ricklib.h" Loading Loading @@ -248,6 +249,13 @@ void wstack( if (grid_w >= num_w_planes) grid_w = num_w_planes - 1; // H4: clamp max-w vis to top plane // Skip if the vis center cell is outside this rank's local grid. // This is CORRECT (not a bug): the bucket-sort redistribution sends a // boundary visibility to EVERY rank whose ghost-extended slab overlaps // it, so halo copies are duplicates. Only the OWNING rank (center cell // inside its slab) must deposit density, otherwise boundary cells would // be double-counted. The owner always holds ALL visibilities whose // center falls in its cells (ghost band is a superset of the owned // band), so the local density at owned cells is complete. if (grid_u < x_start || grid_u >= x_start + xaxis) continue; if (grid_v < y_start || grid_v >= y_start + yaxis) continue; if (grid_w < 0 || grid_w >= num_w_planes) continue; Loading Loading @@ -301,43 +309,156 @@ void wstack( } } // Compute per-vis effective weight ONCE per visibility, using the cell // at the vis center (NOT inside the kernel-cell inner loop). #pragma omp parallel for num_threads(num_threads) // ========================================================================= // Pass 2: per-vis effective weight computation. // // Owned vis (center in this rank's slab): look up weight_uv[cell] directly. // // Halo vis (center in a neighbouring rank's slab but kernel footprint // overlapping this slab): they must use the SAME Briggs effective weight // that the owning rank would assign — otherwise the halo and its conjugate // baseline get mismatched weights, breaking Hermitian symmetry. // // Protocol: two MPI_Alltoallv rounds. // Round 1: each rank sends (u_local, v_local, w) int-triples to the // rank that owns the density cell (the "owning rank"). // Round 2: each owning rank looks up weight_uv and sends the density // double back to the requester. // After the exchange every vis — owned or halo — has its correct density, // and the per-vis weight formula is applied uniformly. // ========================================================================= // -- D0: Gather all ranks' slab extents. ---------------------------------- struct SlabExt { int x0, xs, y0, ys; }; std::vector<SlabExt> all_ext(size); { SlabExt me = { x_start, xaxis, y_start, yaxis }; MPI_Allgather(&me, 4, MPI_INT, all_ext.data(), 4, MPI_INT, MPI_COMM_WORLD); } // Return the rank that owns cell (gu, gv), or -1 for out-of-range cells. auto cell_owner = [&](int gu, int gv) -> int { for (int r = 0; r < size; r++) { if (gu >= all_ext[r].x0 && gu < all_ext[r].x0 + all_ext[r].xs && gv >= all_ext[r].y0 && gv < all_ext[r].y0 + all_ext[r].ys) return r; } return -1; }; // -- D1: Single pass: owned vis get weights now; halo vis get queued. ----- // halo_vis_idx[r] — local vis indices of halos whose owner is rank r // halo_queries[r] — matching (u_local, v_local, w) triples std::vector<std::vector<myull>> halo_vis_idx(size); // Flattened int-triple storage per destination rank (packed separately) std::vector<std::vector<int>> halo_uvw(size); // 3 ints per entry for (myull ii = 0; ii < num_points; ii++) { // H4 FIX: sentinel is > 1.0 (1.0 is a valid max-w vis) if (ww[ii] > 1.0) { out_vis_weight[ii] = 0.0; continue; } double pos_u = uu[ii] / dx; double pos_v = vv[ii] / dx; double ww_i = ww[ii] / dw; const double pos_u = uu[ii] / dx; const double pos_v = vv[ii] / dx; const double ww_i = ww[ii] / dw; int grid_u = (int)pos_u; int grid_v = (int)pos_v; int grid_w = (int)ww_i; if (grid_w >= num_w_planes) grid_w = num_w_planes - 1; // H4: clamp max-w vis to top plane if (grid_u < x_start || grid_u >= x_start + xaxis || grid_v < y_start || grid_v >= y_start + yaxis || grid_w < 0 || grid_w >= num_w_planes) { out_vis_weight[ii] = 0.0; continue; if (grid_w >= num_w_planes) grid_w = num_w_planes - 1; if (grid_w < 0 || grid_w >= num_w_planes) { out_vis_weight[ii] = 0.0; continue; } myull cell = (myull)(grid_u - x_start) if (grid_u >= x_start && grid_u < x_start + xaxis && grid_v >= y_start && grid_v < y_start + yaxis) { // Owned vis — compute weight directly. const myull cell = (myull)(grid_u - x_start) + (myull)(grid_v - y_start) * (myull)xaxis + (myull)grid_w * (myull)xaxis * (myull)yaxis; const double n_cell = weight_uv[cell]; const double w_vis = (double)weight[ii]; if (n_cell <= 0.0) { out_vis_weight[ii] = 0.0; continue; } if (wmode == WeightingMode::Uniform) { // WSClean Uniform: w_eff = w_vis / Σw_cell. if (wmode == WeightingMode::Uniform) out_vis_weight[ii] = w_vis / n_cell; } else { // Briggs // WSClean Briggs: w_eff = w_vis / (1 + Σw_cell · f²). else out_vis_weight[ii] = w_vis / (1.0 + n_cell * briggs_f2); } else { // Halo vis — queue a density request to the owning rank. const int owner = cell_owner(grid_u, grid_v); if (owner < 0) { out_vis_weight[ii] = 0.0; continue; } halo_vis_idx[owner].push_back(ii); halo_uvw[owner].push_back(grid_u - all_ext[owner].x0); halo_uvw[owner].push_back(grid_v - all_ext[owner].y0); halo_uvw[owner].push_back(grid_w); } } // -- D2: Build Alltoallv send buffer for Round 1 (int triples). ----------- std::vector<int> sq_cnt(size), sq_dsp(size, 0); for (int r = 0; r < size; r++) sq_cnt[r] = (int)halo_uvw[r].size(); for (int r = 1; r < size; r++) sq_dsp[r] = sq_dsp[r-1] + sq_cnt[r-1]; const int sq_total = sq_dsp[size-1] + sq_cnt[size-1]; std::vector<int> send_q(sq_total); for (int r = 0; r < size; r++) { std::copy(halo_uvw[r].begin(), halo_uvw[r].end(), send_q.begin() + sq_dsp[r]); } halo_uvw.clear(); // free early // Exchange counts (Alltoall of single ints), then the query triples. std::vector<int> rq_cnt(size), rq_dsp(size, 0); MPI_Alltoall(sq_cnt.data(), 1, MPI_INT, rq_cnt.data(), 1, MPI_INT, MPI_COMM_WORLD); for (int r = 1; r < size; r++) rq_dsp[r] = rq_dsp[r-1] + rq_cnt[r-1]; const int rq_total = rq_dsp[size-1] + rq_cnt[size-1]; std::vector<int> recv_q(rq_total); MPI_Alltoallv(send_q.data(), sq_cnt.data(), sq_dsp.data(), MPI_INT, recv_q.data(), rq_cnt.data(), rq_dsp.data(), MPI_INT, MPI_COMM_WORLD); send_q.clear(); // free early // -- D3: Service received queries: look up weight_uv for each triple. ----- const int n_recv_queries = rq_total / 3; std::vector<double> send_dens(n_recv_queries); for (int j = 0; j < n_recv_queries; j++) { const int u_l = recv_q[j*3 + 0]; const int v_l = recv_q[j*3 + 1]; const int w = recv_q[j*3 + 2]; const myull cell = (myull)u_l + (myull)v_l * (myull)xaxis + (myull)w * (myull)xaxis * (myull)yaxis; send_dens[j] = (cell < dens_size) ? weight_uv[cell] : 0.0; } recv_q.clear(); // -- D4: Alltoallv Round 2 — send densities back to requesters. ----------- // For round 2 the roles flip: what we received becomes what we send. std::vector<int> sd_cnt(size), sd_dsp(size, 0); std::vector<int> rd_cnt(size), rd_dsp(size, 0); for (int r = 0; r < size; r++) sd_cnt[r] = rq_cnt[r] / 3; // queries from r → densities to r for (int r = 0; r < size; r++) rd_cnt[r] = sq_cnt[r] / 3; // queries to r → densities from r for (int r = 1; r < size; r++) sd_dsp[r] = sd_dsp[r-1] + sd_cnt[r-1]; for (int r = 1; r < size; r++) rd_dsp[r] = rd_dsp[r-1] + rd_cnt[r-1]; const int rd_total = rd_dsp[size-1] + rd_cnt[size-1]; std::vector<double> recv_dens(rd_total); MPI_Alltoallv(send_dens.data(), sd_cnt.data(), sd_dsp.data(), MPI_DOUBLE, recv_dens.data(), rd_cnt.data(), rd_dsp.data(), MPI_DOUBLE, MPI_COMM_WORLD); send_dens.clear(); // -- D5: Apply received densities to halo vis. ---------------------------- for (int r = 0; r < size; r++) { for (int j = 0; j < rd_cnt[r]; j++) { const myull vis_ii = halo_vis_idx[r][j]; const double n_cell = recv_dens[rd_dsp[r] + j]; const double w_vis = (double)weight[vis_ii]; if (n_cell <= 0.0) { out_vis_weight[vis_ii] = 0.0; continue; } if (wmode == WeightingMode::Uniform) out_vis_weight[vis_ii] = w_vis / n_cell; else out_vis_weight[vis_ii] = w_vis / (1.0 + n_cell * briggs_f2); } } } Loading
src/gridding/wstack.h +14 −0 Original line number Diff line number Diff line Loading @@ -43,6 +43,20 @@ * weight is then precomputed once: * - Uniform: w_eff = w_vis / Σw_cell * - Briggs: w_eff = w_vis / (1 + Σw_cell · f²) * HALO DENSITY EXCHANGE: visibilities whose center cell lies in a * NEIGHBOURING rank's slab (sent here by the ghost-region bucket sort * because their kernel footprint overlaps this slab) cannot address a * local density cell. Their density Σw_cell is fetched from the OWNING * rank via a two-round MPI_Alltoallv exchange: round 1 sends * (u_local, v_local, w) query triples to the cell owner; round 2 returns * the looked-up weight_uv density. Every vis — owned or halo — then gets * the identical Briggs/Uniform effective weight it would receive on its * owning rank, so conjugate baselines split across slabs stay matched * and Hermitian symmetry of the gridded UV plane is preserved exactly. * (Zeroing halo vis truncated footprints at slab boundaries, ~5% * imaginary dirty image; a raw-Natural fallback mismatched conjugate * pair weights catastrophically, up to ~48% imaginary. Both superseded.) * This makes wstack() COLLECTIVE over MPI_COMM_WORLD for Uniform/Briggs. * 4. Main pass: convolve each visibility (and its frequency samples) onto * the grid using w_eff (Uniform/Briggs) or the raw weight (Natural). The * weighting branch is hoisted to the per-visibility level, out of the hot Loading