Loading src/adp/adp.c +40 −15 Original line number Original line Diff line number Diff line Loading @@ -510,8 +510,6 @@ datapoint_info_t find_possibly_halo_datapoint_rma(global_context_t* ctx, idx_t i else else { { datapoint_info_t tmp_dp; datapoint_info_t tmp_dp; #pragma omp critical { idx_t i = idx - ctx -> rank_idx_start[owner]; idx_t i = idx - ctx -> rank_idx_start[owner]; MPI_Request request; MPI_Request request; MPI_Status status; MPI_Status status; Loading @@ -520,8 +518,6 @@ datapoint_info_t find_possibly_halo_datapoint_rma(global_context_t* ctx, idx_t i i * sizeof(datapoint_info_t), sizeof(datapoint_info_t), MPI_BYTE, win_datapoints, &request); i * sizeof(datapoint_info_t), sizeof(datapoint_info_t), MPI_BYTE, win_datapoints, &request); MPI_Wait(&request, MPI_STATUS_IGNORE); MPI_Wait(&request, MPI_STATUS_IGNORE); } return tmp_dp; return tmp_dp; } } } } Loading Loading @@ -680,9 +676,11 @@ clusters_t Heuristic1(global_context_t *ctx) struct timespec start_tot, finish_tot; struct timespec start_tot, finish_tot; double elapsed_tot; double elapsed_tot; double elapsed_time; TIME_DEF; TIME_DEF; TIME_START; lu_dynamic_array_t all_centers, removed_centers, actual_centers, max_rho; lu_dynamic_array_t all_centers, removed_centers, actual_centers, max_rho; lu_dynamic_array_allocate(&all_centers); lu_dynamic_array_allocate(&all_centers); Loading @@ -698,7 +696,7 @@ clusters_t Heuristic1(global_context_t *ctx) MPI_Win_create(ctx -> local_datapoints, ctx -> local_n_points * sizeof(datapoint_info_t), MPI_Win_create(ctx -> local_datapoints, ctx -> local_n_points * sizeof(datapoint_info_t), 1, MPI_INFO_NULL, ctx -> mpi_communicator, &win_datapoints); 1, MPI_INFO_NULL, ctx -> mpi_communicator, &win_datapoints); MPI_Win_fence(0, win_datapoints); MPI_Win_fence(0, win_datapoints); MPI_Win_lock_all(0, win_datapoints); //MPI_Win_lock_all(0, win_datapoints); #if !defined(THREAD_FUNNELED) #if !defined(THREAD_FUNNELED) #pragma omp parallel for #pragma omp parallel for Loading Loading @@ -744,7 +742,6 @@ clusters_t Heuristic1(global_context_t *ctx) * * * optimized v2 use a queue of center removal and then exchange them * optimized v2 use a queue of center removal and then exchange them */ */ heap_node* to_remove_mask = (heap_node*)MY_MALLOC(n*sizeof(heap_node)); heap_node* to_remove_mask = (heap_node*)MY_MALLOC(n*sizeof(heap_node)); for(idx_t p = 0; p < n; ++p) for(idx_t p = 0; p < n; ++p) Loading @@ -752,6 +749,9 @@ clusters_t Heuristic1(global_context_t *ctx) to_remove_mask[p].array_idx = MY_SIZE_MAX; to_remove_mask[p].array_idx = MY_SIZE_MAX; to_remove_mask[p].value = -9999999; to_remove_mask[p].value = -9999999; } } // sort by density qsort(dp_info_ptrs, n, sizeof(datapoint_info_t*), cmpPP); qsort(dp_info_ptrs, n, sizeof(datapoint_info_t*), cmpPP); /** /** Loading @@ -774,6 +774,13 @@ clusters_t Heuristic1(global_context_t *ctx) omp_init_lock(lock_array + i); omp_init_lock(lock_array + i); } } elapsed_time = TIME_STOP; LOG_WRITE("Putative centers", elapsed_time); TIME_START; MPI_Win_fence(MPI_MODE_NOPUT, win_datapoints); #if !defined(THREAD_FUNNELED) #if !defined(THREAD_FUNNELED) #pragma omp parallel for schedule(dynamic) #pragma omp parallel for schedule(dynamic) #endif #endif Loading @@ -793,9 +800,10 @@ clusters_t Heuristic1(global_context_t *ctx) // actually is the p-th point // actually is the p-th point int owner = foreign_owner(ctx, jidx); int owner = foreign_owner(ctx, jidx); //if local process it //if local process it idx_t jpos = jidx - ctx -> idx_start; if(owner == ctx -> mpi_rank) if(owner == ctx -> mpi_rank) { { idx_t jpos = jidx - ctx -> idx_start; //acquire the lock omp_set_lock(lock_array + jpos); omp_set_lock(lock_array + jpos); if(i_point.g > to_remove_mask[jpos].value) if(i_point.g > to_remove_mask[jpos].value) { { Loading @@ -815,8 +823,14 @@ clusters_t Heuristic1(global_context_t *ctx) } } } } MPI_Win_fence(MPI_MODE_NOPUT, win_datapoints); //assemble arrays into a single buffer //assemble arrays into a single buffer elapsed_time = TIME_STOP; LOG_WRITE("Finding centers to prune", elapsed_time); TIME_START; idx_t tot_removal = 0; idx_t tot_removal = 0; for(idx_t p = 0; p < n; ++p) for(idx_t p = 0; p < n; ++p) { { Loading Loading @@ -964,7 +978,11 @@ clusters_t Heuristic1(global_context_t *ctx) // merge into the mask // merge into the mask #pragma omp parallel for elapsed_time = TIME_STOP; LOG_WRITE("Communicating eliminations", elapsed_time); TIME_START; #pragma omp parallel for schedule(dynamic) for(idx_t i = 0; i < tot_recv_counts; ++i) for(idx_t i = 0; i < tot_recv_counts; ++i) { { idx_t el_pos = recv_removals[i].target_id - ctx -> idx_start; idx_t el_pos = recv_removals[i].target_id - ctx -> idx_start; Loading Loading @@ -1046,6 +1064,11 @@ clusters_t Heuristic1(global_context_t *ctx) free(lock_array); free(lock_array); free(recv_removals); free(recv_removals); elapsed_time = TIME_STOP; LOG_WRITE("Merging", elapsed_time); TIME_START; int n_centers = (int)actual_centers.count; int n_centers = (int)actual_centers.count; int tot_centers; int tot_centers; MPI_Allreduce(&n_centers, &tot_centers, 1, MPI_INT, MPI_SUM, ctx -> mpi_communicator); MPI_Allreduce(&n_centers, &tot_centers, 1, MPI_INT, MPI_SUM, ctx -> mpi_communicator); Loading Loading @@ -1188,7 +1211,7 @@ clusters_t Heuristic1(global_context_t *ctx) } } MPI_Win_unlock_all(win_datapoints); //MPI_Win_unlock_all(win_datapoints); MPI_Win_fence(0, win_datapoints); MPI_Win_fence(0, win_datapoints); MPI_Win_free(&win_datapoints); MPI_Win_free(&win_datapoints); Loading @@ -1209,6 +1232,8 @@ clusters_t Heuristic1(global_context_t *ctx) free(ks); free(ks); #endif #endif elapsed_time = TIME_STOP; LOG_WRITE("Cluster assign", elapsed_time); free(actual_centers.data); free(actual_centers.data); actual_centers.size = tot_centers; actual_centers.size = tot_centers; Loading Loading
src/adp/adp.c +40 −15 Original line number Original line Diff line number Diff line Loading @@ -510,8 +510,6 @@ datapoint_info_t find_possibly_halo_datapoint_rma(global_context_t* ctx, idx_t i else else { { datapoint_info_t tmp_dp; datapoint_info_t tmp_dp; #pragma omp critical { idx_t i = idx - ctx -> rank_idx_start[owner]; idx_t i = idx - ctx -> rank_idx_start[owner]; MPI_Request request; MPI_Request request; MPI_Status status; MPI_Status status; Loading @@ -520,8 +518,6 @@ datapoint_info_t find_possibly_halo_datapoint_rma(global_context_t* ctx, idx_t i i * sizeof(datapoint_info_t), sizeof(datapoint_info_t), MPI_BYTE, win_datapoints, &request); i * sizeof(datapoint_info_t), sizeof(datapoint_info_t), MPI_BYTE, win_datapoints, &request); MPI_Wait(&request, MPI_STATUS_IGNORE); MPI_Wait(&request, MPI_STATUS_IGNORE); } return tmp_dp; return tmp_dp; } } } } Loading Loading @@ -680,9 +676,11 @@ clusters_t Heuristic1(global_context_t *ctx) struct timespec start_tot, finish_tot; struct timespec start_tot, finish_tot; double elapsed_tot; double elapsed_tot; double elapsed_time; TIME_DEF; TIME_DEF; TIME_START; lu_dynamic_array_t all_centers, removed_centers, actual_centers, max_rho; lu_dynamic_array_t all_centers, removed_centers, actual_centers, max_rho; lu_dynamic_array_allocate(&all_centers); lu_dynamic_array_allocate(&all_centers); Loading @@ -698,7 +696,7 @@ clusters_t Heuristic1(global_context_t *ctx) MPI_Win_create(ctx -> local_datapoints, ctx -> local_n_points * sizeof(datapoint_info_t), MPI_Win_create(ctx -> local_datapoints, ctx -> local_n_points * sizeof(datapoint_info_t), 1, MPI_INFO_NULL, ctx -> mpi_communicator, &win_datapoints); 1, MPI_INFO_NULL, ctx -> mpi_communicator, &win_datapoints); MPI_Win_fence(0, win_datapoints); MPI_Win_fence(0, win_datapoints); MPI_Win_lock_all(0, win_datapoints); //MPI_Win_lock_all(0, win_datapoints); #if !defined(THREAD_FUNNELED) #if !defined(THREAD_FUNNELED) #pragma omp parallel for #pragma omp parallel for Loading Loading @@ -744,7 +742,6 @@ clusters_t Heuristic1(global_context_t *ctx) * * * optimized v2 use a queue of center removal and then exchange them * optimized v2 use a queue of center removal and then exchange them */ */ heap_node* to_remove_mask = (heap_node*)MY_MALLOC(n*sizeof(heap_node)); heap_node* to_remove_mask = (heap_node*)MY_MALLOC(n*sizeof(heap_node)); for(idx_t p = 0; p < n; ++p) for(idx_t p = 0; p < n; ++p) Loading @@ -752,6 +749,9 @@ clusters_t Heuristic1(global_context_t *ctx) to_remove_mask[p].array_idx = MY_SIZE_MAX; to_remove_mask[p].array_idx = MY_SIZE_MAX; to_remove_mask[p].value = -9999999; to_remove_mask[p].value = -9999999; } } // sort by density qsort(dp_info_ptrs, n, sizeof(datapoint_info_t*), cmpPP); qsort(dp_info_ptrs, n, sizeof(datapoint_info_t*), cmpPP); /** /** Loading @@ -774,6 +774,13 @@ clusters_t Heuristic1(global_context_t *ctx) omp_init_lock(lock_array + i); omp_init_lock(lock_array + i); } } elapsed_time = TIME_STOP; LOG_WRITE("Putative centers", elapsed_time); TIME_START; MPI_Win_fence(MPI_MODE_NOPUT, win_datapoints); #if !defined(THREAD_FUNNELED) #if !defined(THREAD_FUNNELED) #pragma omp parallel for schedule(dynamic) #pragma omp parallel for schedule(dynamic) #endif #endif Loading @@ -793,9 +800,10 @@ clusters_t Heuristic1(global_context_t *ctx) // actually is the p-th point // actually is the p-th point int owner = foreign_owner(ctx, jidx); int owner = foreign_owner(ctx, jidx); //if local process it //if local process it idx_t jpos = jidx - ctx -> idx_start; if(owner == ctx -> mpi_rank) if(owner == ctx -> mpi_rank) { { idx_t jpos = jidx - ctx -> idx_start; //acquire the lock omp_set_lock(lock_array + jpos); omp_set_lock(lock_array + jpos); if(i_point.g > to_remove_mask[jpos].value) if(i_point.g > to_remove_mask[jpos].value) { { Loading @@ -815,8 +823,14 @@ clusters_t Heuristic1(global_context_t *ctx) } } } } MPI_Win_fence(MPI_MODE_NOPUT, win_datapoints); //assemble arrays into a single buffer //assemble arrays into a single buffer elapsed_time = TIME_STOP; LOG_WRITE("Finding centers to prune", elapsed_time); TIME_START; idx_t tot_removal = 0; idx_t tot_removal = 0; for(idx_t p = 0; p < n; ++p) for(idx_t p = 0; p < n; ++p) { { Loading Loading @@ -964,7 +978,11 @@ clusters_t Heuristic1(global_context_t *ctx) // merge into the mask // merge into the mask #pragma omp parallel for elapsed_time = TIME_STOP; LOG_WRITE("Communicating eliminations", elapsed_time); TIME_START; #pragma omp parallel for schedule(dynamic) for(idx_t i = 0; i < tot_recv_counts; ++i) for(idx_t i = 0; i < tot_recv_counts; ++i) { { idx_t el_pos = recv_removals[i].target_id - ctx -> idx_start; idx_t el_pos = recv_removals[i].target_id - ctx -> idx_start; Loading Loading @@ -1046,6 +1064,11 @@ clusters_t Heuristic1(global_context_t *ctx) free(lock_array); free(lock_array); free(recv_removals); free(recv_removals); elapsed_time = TIME_STOP; LOG_WRITE("Merging", elapsed_time); TIME_START; int n_centers = (int)actual_centers.count; int n_centers = (int)actual_centers.count; int tot_centers; int tot_centers; MPI_Allreduce(&n_centers, &tot_centers, 1, MPI_INT, MPI_SUM, ctx -> mpi_communicator); MPI_Allreduce(&n_centers, &tot_centers, 1, MPI_INT, MPI_SUM, ctx -> mpi_communicator); Loading Loading @@ -1188,7 +1211,7 @@ clusters_t Heuristic1(global_context_t *ctx) } } MPI_Win_unlock_all(win_datapoints); //MPI_Win_unlock_all(win_datapoints); MPI_Win_fence(0, win_datapoints); MPI_Win_fence(0, win_datapoints); MPI_Win_free(&win_datapoints); MPI_Win_free(&win_datapoints); Loading @@ -1209,6 +1232,8 @@ clusters_t Heuristic1(global_context_t *ctx) free(ks); free(ks); #endif #endif elapsed_time = TIME_STOP; LOG_WRITE("Cluster assign", elapsed_time); free(actual_centers.data); free(actual_centers.data); actual_centers.size = tot_centers; actual_centers.size = tot_centers; Loading