run_leo +13 −4

 #!/bin/bash
-#SBATCH --nodes=6
+#SBATCH --nodes=1
 #SBATCH --ntasks-per-node=2
 #SBATCH --cpus-per-task=56
 #SBATCH --time=04:00:00
 #SBATCH --job-name=dadp_test
 #SBATCH --partition=dcgp_usr_prod
-#SBATCH --account=IscrC_dadp
+#SBATCH --account=EUHPC_D18_045
 #SBATCH --output=out_leo
 #SBATCH --error=err_leo
 #SBATCH --mem=480G

@@ -14,8 +14,10 @@
 cd $SLURM_SUBMIT_DIR

-module restore my_gcc
-#module restore my_intel
+module load gcc
+module load openmpi

 make clean
 make

 ulimit -s unlimited

@@ -36,5 +38,12 @@
 IN_DATA=/leonardo_work/IscrC_dadp

 #10^6 points
 time mpirun -n ${SLURM_NTASKS} --map-by ppr:1:socket:PE=${SLURM_CPUS_PER_TASK} ./main -t f32 -i ${IN_DATA}/norm_data/std_LR_091_0001 -d 5 -a ${OUT_ASSIGNMENT} -o ${OUT_DATA}

+#34 * 10^6 points
+#time mpirun -n ${SLURM_NTASKS} --map-by ppr:1:socket:PE=${SLURM_CPUS_PER_TASK} ./main -t f32 -i ${IN_DATA}/norm_data/std_g1212639_091_0001 -d 5 -a ${OUT_ASSIGNMENT} -o ${OUT_DATA}
+
+#88 * 10^6 points
+#time mpirun -n ${SLURM_NTASKS} --map-by ppr:1:socket:PE=${SLURM_CPUS_PER_TASK} ./main -t f32 -i ${IN_DATA}/norm_data/std_g5503149_091_0000 -d 5 -a ${OUT_ASSIGNMENT} -o ${OUT_DATA}
+
+#200 * 10^6 points
+#time mpirun -n ${SLURM_NTASKS} --map-by ppr:1:socket:PE=${SLURM_CPUS_PER_TASK} ./main -t f32 -i ${IN_DATA}/norm_data/std_g2980844_091_0000 -d 5 -a ${OUT_ASSIGNMENT} -o ${OUT_DATA}
src/adp/adp.c +185 −90

@@ -118,8 +118,8 @@ idx_t get_j_ksel_idx(global_context_t* ctx, idx_t j, idx_t ksel, MPI_Win exposed
 }

-void compute_density_kstarnn_rma_v2(global_context_t* ctx, const float_t d, int verbose){
+void compute_density_kstarnn_rma_v2(global_context_t* ctx, const float_t d, int verbose) {
     /*
      * Point density computation:
      * args:

@@ -648,12 +648,32 @@ lock_t h1_lock_free(global_context_t* ctx, MPI_Win lock_window, int owner, idx_t
 }

+void enqueue_center_removal(center_removal_queue_t* queue, center_removal_t element)
+{
+    int default_incr_size = 100;
+    if(queue -> count >= queue -> size)
+    {
+        queue -> size += default_incr_size;
+        queue -> data = realloc(queue -> data, queue -> size * sizeof(center_removal_t));
+    }
+    queue -> data[queue -> count] = element;
+    queue -> count++;
+}
+
+int compare_removal_by_target(const void* a, const void* b)
+{
+    idx_t arg1 = (*(const center_removal_t *)a).target_id;
+    idx_t arg2 = (*(const center_removal_t *)b).target_id;
+    return (arg1 > arg2) - (arg1 < arg2);
+}
+
 clusters_t Heuristic1(global_context_t *ctx)
 {
-    /*
+    /**
      * Heurisitc 1, from paper of Errico, Facco, Laio & Rodriguez
      * ( https://doi.org/10.1016/j.ins.2021.01.010 )
-     */
+     **/

     datapoint_info_t* dp_info = ctx -> local_datapoints;
     idx_t n = ctx -> local_n_points;

@@ -721,36 +741,38 @@ clusters_t Heuristic1(global_context_t *ctx)
      * Generate a mask that keeps track of the point has been eliminating the
      * point considered. Each thread updates this mask, then after the procedure
      * ends, center, removed centers, and max_rho arrays are populated
+     *
+     * optimized v2 use a queue of center removal and then exchange them
      */

-    lock_t* lock_array = (lock_t*)MY_MALLOC(n * sizeof(lock_t));
     heap_node* to_remove_mask = (heap_node*)MY_MALLOC(n*sizeof(heap_node));
     for(idx_t p = 0; p < n; ++p)
     {
         to_remove_mask[p].array_idx = MY_SIZE_MAX;
         to_remove_mask[p].value = 9999999;
-        lock_array[p] = LOCK_FREE;
     }
     qsort(dp_info_ptrs, n, sizeof(datapoint_info_t*), cmpPP);

+    /**
+     * workflow
+     * find the elements we need to eliminate
+     * compact them to a single array
+     * send/recieve
+     **/

     MPI_Win win_to_remove_mask;
     MPI_Win_create(to_remove_mask, n * sizeof(heap_node), 1, MPI_INFO_NULL, ctx -> mpi_communicator, &win_to_remove_mask);
     MPI_Win_fence(0, win_to_remove_mask);

-    MPI_Win win_locks;
-    MPI_Win_create(lock_array, n * sizeof(lock_t), sizeof(lock_t), MPI_INFO_NULL, ctx -> mpi_communicator, &win_locks);
-    MPI_Win_fence(0, win_locks);
+    center_removal_queue_t* removal_queues = (center_removal_queue_t*)MY_MALLOC(n * sizeof(center_removal_queue_t));
+    omp_lock_t* lock_array = (omp_lock_t*)MY_MALLOC(n * sizeof(omp_lock_t));

-    #ifdef EXP_H1
-    MPI_Win_lock_all(0, win_to_remove_mask);
-    MPI_Win_lock_all(0, win_locks);
-    #endif
+    //zero out all queues

-    #ifdef EXP_H1
-    printf("Using experimental h1\n");
-    #endif
+    for(idx_t i = 0; i < n; ++i)
+    {
+        removal_queues[i].count = 0;
+        removal_queues[i].size = 0;
+        removal_queues[i].data = NULL;
+        omp_init_lock(lock_array + i);
+    }

     #if !defined(THREAD_FUNNELED)
     #pragma omp parallel for schedule(dynamic)
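An aside, not from the patch itself: the per-point queues and the omp_lock_t array set up above replace the MPI window locks of the old version. The sketch below is a toy illustration of the per-slot locking pattern only, written with made-up stand-in types (mask_slot_t instead of heap_node) and fake values; unlike the loop in the next hunk it repeats the comparison under the lock, so a weaker eliminator can never overwrite a stronger one.

/* Toy illustration only; types and data are invented for the example.
 * Build with: cc -fopenmp sketch_locks.c */
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct { size_t array_idx; double value; } mask_slot_t;   /* stand-in for heap_node */

int main(void)
{
    const long n = 8, updates = 10000;
    mask_slot_t *mask  = malloc(n * sizeof *mask);
    omp_lock_t  *locks = malloc(n * sizeof *locks);

    for (long i = 0; i < n; ++i) {
        mask[i].array_idx = (size_t)-1;   /* sentinel: no eliminator yet */
        mask[i].value     = -1.0;
        omp_init_lock(&locks[i]);
    }

    /* Many threads try to claim slots; each slot keeps the largest value seen. */
    #pragma omp parallel for schedule(dynamic)
    for (long u = 0; u < updates; ++u) {
        long   slot = u % n;
        double g    = (double)(u % 997);      /* fake "density" of the eliminator */
        if (g > mask[slot].value) {           /* cheap pre-check without the lock */
            omp_set_lock(&locks[slot]);
            if (g > mask[slot].value) {       /* re-check under the lock */
                mask[slot].array_idx = (size_t)u;
                mask[slot].value     = g;
            }
            omp_unset_lock(&locks[slot]);
        }
    }

    for (long i = 0; i < n; ++i) {
        printf("slot %ld kept idx %zu (value %.0f)\n", i, mask[i].array_idx, mask[i].value);
        omp_destroy_lock(&locks[i]);
    }
    free(mask);
    free(locks);
    return 0;
}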
@@ -767,82 +789,140 @@ clusters_t Heuristic1(global_context_t *ctx)
             if(j_point.is_center && i_point.g > j_point.g)
             {
-                /*
-                 * TODO: Implement it without this but using private locks
-                 * use an array of locks, and compare and swap to actually gain control of the thing
-                 */
-                #ifdef EXP_H1
-                #pragma omp critical (h1_exp)
-                {
-                    // set the lock at position i
-                    // actually is the p-th point
-                    int owner = foreign_owner(ctx, jidx);
-                    idx_t jpos = jidx - ctx -> rank_idx_start[owner];
-
-                    lock_t state = LOCK_FREE;
-                    state = h1_lock_acquire(ctx, win_locks, owner, jpos, state);
-
-                    heap_node mask_element;
-                    MPI_Request request;
-                    MPI_Rget(&mask_element, sizeof(heap_node), MPI_BYTE, owner, jpos * sizeof(heap_node), sizeof(heap_node), MPI_BYTE, win_to_remove_mask, &request);
-                    MPI_Wait(&request, MPI_STATUS_IGNORE);
-
-                    int flag = mask_element.array_idx == MY_SIZE_MAX;
-                    if(flag || i_point.g > mask_element.value )
-                    {
-                        heap_node tmp_mask_element = {.array_idx = i_point.array_idx, .value = i_point.g};
-                        MPI_Request request;
-                        MPI_Rput(&tmp_mask_element, sizeof(heap_node), MPI_BYTE, owner, jpos*sizeof(heap_node), sizeof(heap_node), MPI_BYTE, win_to_remove_mask, &request);
-                        MPI_Wait(&request, MPI_STATUS_IGNORE);
-                    }
-
-                    state = h1_lock_free(ctx, win_locks, owner, jpos, state);
-                }
-                #else
-                #pragma omp critical (centers_elimination)
-                {
-                    int owner = foreign_owner(ctx, jidx);
-                    idx_t jpos = jidx - ctx -> rank_idx_start[owner];
-
-                    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, owner, 0, win_to_remove_mask);
-
-                    heap_node mask_element;
-                    MPI_Request request;
-                    MPI_Rget(&mask_element, sizeof(heap_node), MPI_BYTE, owner, jpos * sizeof(heap_node), sizeof(heap_node), MPI_BYTE, win_to_remove_mask, &request);
-                    MPI_Wait(&request, MPI_STATUS_IGNORE);
-
-                    int flag = mask_element.array_idx == MY_SIZE_MAX;
-                    if(flag || i_point.g > mask_element.value )
-                    {
-                        heap_node tmp_mask_element = {.array_idx = i_point.array_idx, .value = i_point.g};
-                        MPI_Request request;
-                        MPI_Rput(&tmp_mask_element, sizeof(heap_node), MPI_BYTE, owner, jpos*sizeof(heap_node), sizeof(heap_node), MPI_BYTE, win_to_remove_mask, &request);
-                        MPI_Wait(&request, MPI_STATUS_IGNORE);
-                    }
-
-                    MPI_Win_unlock(owner, win_to_remove_mask);
-                }
-                #endif
+                int owner = foreign_owner(ctx, jidx);
+
+                //if local process it
+                if(owner == ctx -> mpi_rank)
+                {
+                    idx_t jpos = jidx - ctx -> idx_start;
+                    if(i_point.g > to_remove_mask[jpos].value)
+                    {
+                        omp_set_lock(lock_array + jpos);
+                        to_remove_mask[jpos].array_idx = i_point.array_idx;
+                        to_remove_mask[jpos].value = i_point.g;
+                        omp_unset_lock(lock_array + jpos);
+                    }
+                }
+                //otherwise enqueue for sending
+                else
+                {
+                    center_removal_t element = {.rank = owner, .source_id = i_point.array_idx, .target_id = j_point.array_idx, .source_density = i_point.g};
+                    enqueue_center_removal(removal_queues + p, element);
+                }
             }
         }
    }

+    //assemble arrays into a single buffer
+    idx_t tot_removal = 0;
+    for(idx_t p = 0; p < n; ++p)
+    {
+        tot_removal += removal_queues[p].count;
+    }
+
+    idx_t buffer_idx = 0;
+    center_removal_t* removal_buffer = (center_removal_t*)MY_MALLOC(tot_removal * sizeof(center_removal_t));
+    for(idx_t p = 0; p < n; ++p)
+    {
+        if(removal_queues[p].count > 0)
+        {
+            memcpy(removal_buffer + buffer_idx, removal_queues[p].data, removal_queues[p].count * sizeof(center_removal_t));
+            buffer_idx += removal_queues[p].count;
+        }
+    }
+
+    //sort by array idx (it sorts also by rank)
+    qsort(removal_buffer, tot_removal, sizeof(center_removal_t), compare_removal_by_target);
+
+    //prepare for the sendrcv
+    int* recv_counts = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
+    int* send_counts = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
+    int* recv_displs = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
+    int* send_displs = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
+
+    //zero_out_all
+    for(int i = 0; i < ctx -> mpi_rank; ++i)
+    {
+        recv_counts[i] = 0;
+        send_counts[i] = 0;
+        recv_displs[i] = 0;
+        send_displs[i] = 0;
+    }
+
+    for(idx_t i = 0; i < tot_removal; ++i)
+    {
+        int rank_idx = removal_buffer[i].rank;
+        send_counts[rank_idx]++;
+    }
+
+    // all to all to recieve counts
+    MPI_Alltoall(send_counts, 1, MPI_INT, recv_counts, 1, MPI_INT, ctx -> mpi_communicator);
+
+    // compute displacements
+    for(int i = 1; i < ctx -> world_size; ++i)
+    {
+        recv_displs[i] = recv_displs[i - 1] + recv_counts[i - 1];
+        send_displs[i] = send_displs[i - 1] + send_counts[i - 1];
+    }
+
+    idx_t tot_recv_counts = 0;
+    // count how many elements to recieve
+    for(int i = 0; i < ctx -> world_size; ++i) tot_recv_counts += recv_counts[i];
+
+    if(ctx -> mpi_rank == 0){
+        for(int i = 0; i < ctx -> world_size; ++i){
+            DB_PRINT("%d mpi rank recv_count %d to %d\n", ctx -> mpi_rank, recv_counts[i], i);
+            DB_PRINT("%d mpi rank send_count %d to %d\n", ctx -> mpi_rank, send_counts[i], i);
+        }
+    }
+    DB_PRINT("rank %d: %lu recv counts\n", ctx -> mpi_rank, tot_recv_counts);
+
+    // change dimensions to bytes
+    for(int i = 0; i < ctx -> world_size; ++i)
+    {
+        recv_counts[i] = recv_counts[i] * sizeof(center_removal_t);
+        send_counts[i] = send_counts[i] * sizeof(center_removal_t);
+        recv_displs[i] = recv_displs[i] * sizeof(center_removal_t);
+        send_displs[i] = send_displs[i] * sizeof(center_removal_t);
+    }

-    #ifdef EXP_H1
-    MPI_Win_unlock_all(win_to_remove_mask);
-    MPI_Win_unlock_all(win_locks);
-    #endif
+    //allocate buffer to recieve center elminiations
+    center_removal_t* recv_removals = (center_removal_t*)MY_MALLOC(tot_recv_counts * sizeof(center_removal_t));
+
+    // all to all
+    MPI_Alltoallv(removal_buffer, send_counts, send_displs, MPI_BYTE, recv_removals, recv_counts, recv_displs, MPI_BYTE, ctx -> mpi_communicator);
+
+    // merge into the mask
+    #pragma omp parallel for
+    for(idx_t i = 0; i < tot_recv_counts; ++i)
+    {
+        idx_t el_pos = recv_removals[i].target_id - ctx -> idx_start;
+        if(recv_removals[i].source_density > to_remove_mask[el_pos].value)
+        {
+            omp_set_lock(lock_array + el_pos);
+            to_remove_mask[el_pos].array_idx = recv_removals[i].source_id;
+            to_remove_mask[el_pos].value = recv_removals[i].source_density;
+            omp_unset_lock(lock_array + el_pos);
+        }
+    }

     MPI_Win_fence(0, win_to_remove_mask);
-    MPI_Win_fence(0, win_locks);
     MPI_Barrier(ctx -> mpi_communicator);

     /* populate the usual arrays */
     for(idx_t p = 0; p < all_centers.count; ++p)

@@ -888,11 +968,24 @@ clusters_t Heuristic1(global_context_t *ctx)
         }
     }

     MPI_Win_free(&win_to_remove_mask);
     free(to_remove_mask);
-    MPI_Win_free(&win_locks);
+
+    for(idx_t i = 0; i < n; ++i)
+    {
+        removal_queues[i].count = 0;
+        removal_queues[i].size = 0;
+        removal_queues[i].data = NULL;
+        omp_destroy_lock(lock_array+ i);
+    }
+
+    free(removal_queues);
+    free(removal_buffer);
+    free(send_counts);
+    free(send_displs);
+    free(recv_counts);
+    free(recv_displs);
     free(lock_array);
+    free(recv_removals);

     int n_centers = (int)actual_centers.count;
     int tot_centers;

@@ -900,9 +993,11 @@
     MPI_DB_PRINT("Found %d temporary centers\n", tot_centers);

-    /* bring on master all centers
+    /**
+     * bring on master all centers
      * order them in ascending order of density,
-     * then re-scatter them around to get unique cluster labels */
+     * then re-scatter them around to get unique cluster labels
+     **/

     center_t* private_centers_buffer = (center_t*)MY_MALLOC(actual_centers.count * sizeof(center_t));
     center_t* global_centers_buffer = (center_t*)MY_MALLOC(tot_centers * sizeof(center_t));
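An aside, not from the patch itself: the exchange introduced above follows the standard MPI pattern of counting outgoing elements per destination rank, swapping the counts with MPI_Alltoall, building displacements as exclusive prefix sums, and shipping the payload with MPI_Alltoallv. Below is a minimal, self-contained sketch of just that pattern with made-up data and plain MPI_INT payloads; the patch instead multiplies counts and displacements by sizeof(center_removal_t) and ships MPI_BYTE, which is the same idea for an untyped struct buffer.

/* Minimal count-exchange sketch; compile with mpicc, run with any number of ranks. */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* Fake payload: send (rank + 1) ints to every destination. */
    int *send_counts = calloc(size, sizeof(int));
    int *recv_counts = calloc(size, sizeof(int));
    int *send_displs = calloc(size, sizeof(int));
    int *recv_displs = calloc(size, sizeof(int));
    for (int i = 0; i < size; ++i) send_counts[i] = rank + 1;

    /* 1) exchange counts */
    MPI_Alltoall(send_counts, 1, MPI_INT, recv_counts, 1, MPI_INT, MPI_COMM_WORLD);

    /* 2) exclusive prefix sums give the displacements */
    for (int i = 1; i < size; ++i) {
        send_displs[i] = send_displs[i - 1] + send_counts[i - 1];
        recv_displs[i] = recv_displs[i - 1] + recv_counts[i - 1];
    }
    int tot_send = send_displs[size - 1] + send_counts[size - 1];
    int tot_recv = recv_displs[size - 1] + recv_counts[size - 1];

    int *send_buf = malloc(tot_send * sizeof(int));
    int *recv_buf = malloc(tot_recv * sizeof(int));
    for (int i = 0; i < tot_send; ++i) send_buf[i] = rank;

    /* 3) exchange the payload */
    MPI_Alltoallv(send_buf, send_counts, send_displs, MPI_INT,
                  recv_buf, recv_counts, recv_displs, MPI_INT, MPI_COMM_WORLD);

    printf("rank %d received %d elements\n", rank, tot_recv);

    free(send_counts); free(recv_counts); free(send_displs); free(recv_displs);
    free(send_buf); free(recv_buf);
    MPI_Finalize();
    return 0;
}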
src/adp/adp.h +15 −0

@@ -45,6 +45,21 @@
     float_t density;
 } merge_t;

+typedef struct center_removal_t
+{
+    int rank;
+    idx_t source_id;
+    idx_t target_id;
+    float_t source_density;
+} center_removal_t;
+
+typedef struct center_removal_queue_t
+{
+    center_removal_t* data;
+    idx_t count;
+    idx_t size;
+} center_removal_queue_t;
+
 void compute_density_kstarnn_rma(global_context_t* ctx, const float_t d, int verbose);
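An aside, not from the patch itself: a standalone usage sketch of the queue type declared above. It assumes idx_t and float_t alias size_t and float (the real typedefs live elsewhere in the project) and mirrors the growth policy of enqueue_center_removal, which extends the backing array by 100 slots through realloc whenever it fills up; sorting with the comparator then groups the requests by target id, as Heuristic1 does before compacting and exchanging them.

/* Standalone usage sketch; the typedefs and values below are assumptions, not project code. */
#include <stdio.h>
#include <stdlib.h>

typedef size_t idx_t;                 /* assumed alias */
typedef float  density_t;             /* stands in for the project's float_t */

typedef struct { int rank; idx_t source_id; idx_t target_id; density_t source_density; } center_removal_t;
typedef struct { center_removal_t *data; idx_t count; idx_t size; } center_removal_queue_t;

/* Same growth policy as the patch: extend by 100 slots whenever the queue is full. */
static void enqueue_center_removal(center_removal_queue_t *q, center_removal_t el)
{
    if (q->count >= q->size) {
        q->size += 100;
        q->data  = realloc(q->data, q->size * sizeof *q->data);
    }
    q->data[q->count++] = el;
}

static int compare_removal_by_target(const void *a, const void *b)
{
    idx_t t1 = ((const center_removal_t *)a)->target_id;
    idx_t t2 = ((const center_removal_t *)b)->target_id;
    return (t1 > t2) - (t1 < t2);
}

int main(void)
{
    center_removal_queue_t q = { .data = NULL, .count = 0, .size = 0 };

    /* Enqueue a few fake removal requests aimed at remote targets. */
    enqueue_center_removal(&q, (center_removal_t){ .rank = 1, .source_id = 7, .target_id = 42, .source_density = 3.5f });
    enqueue_center_removal(&q, (center_removal_t){ .rank = 0, .source_id = 2, .target_id = 11, .source_density = 1.0f });
    enqueue_center_removal(&q, (center_removal_t){ .rank = 2, .source_id = 9, .target_id = 42, .source_density = 5.0f });

    /* Group requests by target id before compaction/exchange. */
    qsort(q.data, q.count, sizeof *q.data, compare_removal_by_target);

    for (idx_t i = 0; i < q.count; ++i)
        printf("target %zu eliminated by source %zu (g = %.1f, owner rank %d)\n",
               q.data[i].target_id, q.data[i].source_id, (double)q.data[i].source_density, q.data[i].rank);

    free(q.data);
    return 0;
}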