From c629603de31f82139fa17ff2a77abe76c92169af Mon Sep 17 00:00:00 2001 From: Francesco Tomba Date: Mon, 16 Mar 2026 09:40:58 +0100 Subject: [PATCH 1/2] solved output data problems --- Makefile | 6 +- run_leo | 4 +- src/adp/adp.c | 50 +- src/common/common.c | 17 +- src/common/common.h | 46 +- src/main/main.c | 43 +- src/tree/heap.c | 217 ------ src/tree/heap.h | 176 +++-- src/tree/kdtree.h | 1278 ++++++++++++++++++++++++++++++++++ src/tree/kdtreeV2.c | 378 ---------- src/tree/kdtreeV2.h | 65 -- src/tree/tree.c | 1603 ++++++++++++++++++++++++++++++++++--------- src/tree/tree.h | 30 +- 13 files changed, 2792 insertions(+), 1121 deletions(-) delete mode 100644 src/tree/heap.c create mode 100644 src/tree/kdtree.h delete mode 100644 src/tree/kdtreeV2.c delete mode 100644 src/tree/kdtreeV2.h diff --git a/Makefile b/Makefile index ceb6a13..d9f820e 100644 --- a/Makefile +++ b/Makefile @@ -1,15 +1,15 @@ CC=mpicc #CC=mpiicx -CFLAGS=-O3 -march=native -flto -funroll-loops -fopenmp +CFLAGS=-O3 -march=native -funroll-loops -fopenmp #CFLAGS=-O3 -fopenmp LDFLAGS=-lm all: main -obj=src/main/main.c src/tree/tree.c src/common/common.c src/tree/kdtreeV2.c src/tree/heap.c src/adp/adp.c +obj=src/main/main.c src/tree/tree.c src/common/common.c src/adp/adp.c main: ${obj} - ${CC} ${CFLAGS} ${LDFLAGS} ${obj} -o $@ + ${CC} ${CFLAGS} ${obj} -o $@ ${LDFLAGS} clean: rm main diff --git a/run_leo b/run_leo index 44e834a..497cdb7 100644 --- a/run_leo +++ b/run_leo @@ -6,7 +6,7 @@ #SBATCH --time=04:00:00 #SBATCH --job-name=dadp_test #SBATCH --partition=dcgp_usr_prod -#SBATCH --account=EUHPC_D18_045 +#SBATCH --account=EUHPC_D31_027 #SBATCH --output=out_leo #SBATCH --error=err_leo #SBATCH --mem=480G @@ -37,7 +37,7 @@ OUT_DATA=/leonardo_scratch/large/userexternal/ftomba00/data IN_DATA=/leonardo_work/EUHPC_D18_045 #10^6 points -time mpirun -n ${SLURM_NTASKS} --map-by ppr:1:socket:PE=${SLURM_CPUS_PER_TASK} ./main -t f32 -i ${IN_DATA}/norm_data/std_LR_091_0001 -d 5 -a ${OUT_ASSIGNMENT} -o ${OUT_DATA} +time mpirun -n ${SLURM_NTASKS} --map-by ppr:1:socket:PE=${SLURM_CPUS_PER_TASK} ./main -t f32 -k 100 -i ${IN_DATA}/norm_data/std_LR_091_0001 -d 5 -a ${OUT_ASSIGNMENT} -o ${OUT_DATA} #time mpirun -n ${SLURM_NTASKS} --map-by core ./main -t f32 -i ${IN_DATA}/norm_data/std_LR_091_0001 -d 5 -a ${OUT_ASSIGNMENT} -o ${OUT_DATA} #34 * 10^6 points diff --git a/src/adp/adp.c b/src/adp/adp.c index f28cbde..240153a 100644 --- a/src/adp/adp.c +++ b/src/adp/adp.c @@ -92,10 +92,10 @@ float_t get_j_ksel_dist(global_context_t* ctx, idx_t j, idx_t ksel, MPI_Win expo } else { - heap_node el; + heap_node_t el; idx_t pos = j - ctx -> rank_idx_start[owner]; MPI_Request request; - int err = MPI_Rget(&el, sizeof(heap_node), MPI_BYTE, owner, (MPI_Aint)((pos * k + ksel) * sizeof(heap_node)), sizeof(heap_node), MPI_BYTE, exposed_ngbh, &request); + int err = MPI_Rget(&el, sizeof(heap_node_t), MPI_BYTE, owner, (MPI_Aint)((pos * k + ksel) * sizeof(heap_node_t)), sizeof(heap_node_t), MPI_BYTE, exposed_ngbh, &request); MPI_Wait(&request,MPI_STATUS_IGNORE); return el.value; } @@ -113,10 +113,10 @@ idx_t get_j_ksel_idx(global_context_t* ctx, idx_t j, idx_t ksel, MPI_Win exposed } else { - heap_node el; + heap_node_t el; idx_t pos = j - ctx -> rank_idx_start[owner]; MPI_Request request; - int err = MPI_Rget(&el, sizeof(heap_node), MPI_BYTE, owner, (MPI_Aint)((pos * k + ksel) * sizeof(heap_node)), sizeof(heap_node), MPI_BYTE, exposed_ngbh, &request); + int err = MPI_Rget(&el, sizeof(heap_node_t), MPI_BYTE, owner, (MPI_Aint)((pos * k + ksel) * sizeof(heap_node_t)), sizeof(heap_node_t), MPI_BYTE, exposed_ngbh, &request); MPI_Wait(&request,MPI_STATUS_IGNORE); return el.array_idx; } @@ -176,7 +176,7 @@ void compute_density_kstarnn_rma_v2(global_context_t* ctx, const float_t d, int MPI_Win exposed_ngbh; MPI_Win_create( ctx -> __local_heap_buffers, - ctx -> local_n_points * ctx -> k * sizeof(heap_node), + ctx -> local_n_points * ctx -> k * sizeof(heap_node_t), 1, MPI_INFO_NULL, ctx -> mpi_communicator, &exposed_ngbh); @@ -186,7 +186,7 @@ void compute_density_kstarnn_rma_v2(global_context_t* ctx, const float_t d, int int i_have_finished = 0; int all_have_finished = 0; int finished_points = 0; - heap_node* scratch_heap_nodes = (heap_node*)MY_MALLOC(ctx -> local_n_points * sizeof(heap_node)); + heap_node_t* scratch_heap_node_ts = (heap_node_t*)MY_MALLOC(ctx -> local_n_points * sizeof(heap_node_t)); for(idx_t j = 4; j < kMAX - 1; ++j) { @@ -217,17 +217,17 @@ void compute_density_kstarnn_rma_v2(global_context_t* ctx, const float_t d, int if(owner == ctx -> mpi_rank) { - scratch_heap_nodes[i] = ctx -> local_datapoints[pos].ngbh[ksel]; + scratch_heap_node_ts[i] = ctx -> local_datapoints[pos].ngbh[ksel]; } else { // if(pos > ctx -> rank_n_points[owner]) printf("NOPE HERE from rank %d %lu %d\n", ctx -> mpi_rank, pos, ctx -> rank_n_points[owner]); - MPI_Get(scratch_heap_nodes + i, - sizeof(heap_node), + MPI_Get(scratch_heap_node_ts + i, + sizeof(heap_node_t), MPI_BYTE, owner, - (MPI_Aint)((pos * (idx_t)(ctx -> k) + ksel) * sizeof(heap_node)), - sizeof(heap_node), + (MPI_Aint)((pos * (idx_t)(ctx -> k) + ksel) * sizeof(heap_node_t)), + sizeof(heap_node_t), MPI_BYTE, exposed_ngbh); } @@ -246,7 +246,7 @@ void compute_density_kstarnn_rma_v2(global_context_t* ctx, const float_t d, int { float_t vvi, vvj, vp, dL; vvi = local_datapoints[i].ngbh[ksel].value; - vvj = scratch_heap_nodes[i].value; + vvj = scratch_heap_node_ts[i].value; vp = (vvi + vvj)*(vvi + vvj); dL = -2.0 * ksel * log(4.*vvi*vvj/vp); @@ -293,7 +293,7 @@ void compute_density_kstarnn_rma_v2(global_context_t* ctx, const float_t d, int MPI_Win_fence(0, exposed_ngbh); MPI_Win_free(&exposed_ngbh); - free(scratch_heap_nodes); + free(scratch_heap_node_ts); #if defined(WRITE_DENSITY) /* densities */ @@ -301,24 +301,28 @@ void compute_density_kstarnn_rma_v2(global_context_t* ctx, const float_t d, int float_t* gs = (float_t*)MY_MALLOC(ctx -> local_n_points * sizeof(float_t)); idx_t* ks = (idx_t*)MY_MALLOC(ctx -> local_n_points * sizeof(idx_t)); - for(int i = 0; i < ctx -> local_n_points; ++i) den[i] = ctx -> local_datapoints[i].log_rho; - for(int i = 0; i < ctx -> local_n_points; ++i) ks[i] = ctx -> local_datapoints[i].kstar; - for(int i = 0; i < ctx -> local_n_points; ++i) gs[i] = ctx -> local_datapoints[i].g; + for(int i = 0; i < ctx -> local_n_points; ++i) + { + den[i] = ctx -> local_datapoints[i].log_rho; + ks[i] = ctx -> local_datapoints[i].kstar; + gs[i] = ctx -> local_datapoints[i].g; + } ordered_buffer_to_file(ctx, den, sizeof(float_t), ctx -> local_n_points, "bb/ordered_density.npy"); ordered_buffer_to_file(ctx, ks, sizeof(idx_t), ctx -> local_n_points, "bb/ks.npy"); ordered_buffer_to_file(ctx, gs, sizeof(float_t), ctx -> local_n_points, "bb/g.npy"); - ordered_data_to_file(ctx, "bb/ordered_data.npy"); + free(gs); free(den); free(ks); + #endif return; } -float_t get_j_ksel_dist_v2(global_context_t* ctx, idx_t i, idx_t j, idx_t ksel, int* flags, heap_node* tmp_heap_nodes, MPI_Win* exposed_ngbh) +float_t get_j_ksel_dist_v2(global_context_t* ctx, idx_t i, idx_t j, idx_t ksel, int* flags, heap_node_t* tmp_heap_node_ts, MPI_Win* exposed_ngbh) { if(flags[i]) { @@ -335,14 +339,14 @@ float_t get_j_ksel_dist_v2(global_context_t* ctx, idx_t i, idx_t j, idx_t ksel, //RMA flags[i] = 0; idx_t pos = j - ctx -> rank_idx_start[owner]; - MPI_Get(tmp_heap_nodes + i, sizeof(heap_node), MPI_BYTE, owner, (MPI_Aint)((pos * k + ksel) * sizeof(heap_node)), sizeof(heap_node), MPI_BYTE, *exposed_ngbh); + MPI_Get(tmp_heap_node_ts + i, sizeof(heap_node_t), MPI_BYTE, owner, (MPI_Aint)((pos * k + ksel) * sizeof(heap_node_t)), sizeof(heap_node_t), MPI_BYTE, *exposed_ngbh); return 0; } } else { flags[i] = 1; - return tmp_heap_nodes[i].value; + return tmp_heap_node_ts[i].value; } } @@ -745,7 +749,7 @@ clusters_t Heuristic1(global_context_t *ctx) * * optimized v2 use a queue of center removal and then exchange them **/ - heap_node* to_remove_mask = (heap_node*)MY_MALLOC(n*sizeof(heap_node)); + heap_node_t* to_remove_mask = (heap_node_t*)MY_MALLOC(n*sizeof(heap_node_t)); for(idx_t p = 0; p < n; ++p) { @@ -1401,7 +1405,7 @@ void Heuristic2(global_context_t* ctx, clusters_t* cluster) MPI_Win dp_info_win, ngbh_win; MPI_Win_create(ctx -> local_datapoints, ctx -> local_n_points * sizeof(datapoint_info_t), 1, MPI_INFO_NULL, ctx -> mpi_communicator, &dp_info_win); - MPI_Win_create(ctx -> __local_heap_buffers, ctx -> local_n_points * ctx -> k * sizeof(heap_node), 1, MPI_INFO_NULL, ctx -> mpi_communicator, &ngbh_win); + MPI_Win_create(ctx -> __local_heap_buffers, ctx -> local_n_points * ctx -> k * sizeof(heap_node_t), 1, MPI_INFO_NULL, ctx -> mpi_communicator, &ngbh_win); MPI_Win_fence(0, dp_info_win); MPI_Win_fence(0, ngbh_win); @@ -2478,7 +2482,7 @@ void Heuristic3(global_context_t* ctx, clusters_t* cluster, float_t Z, int halo) for(int i = 0; i < ctx -> local_n_points; ++i) cl[i] = ctx -> local_datapoints[i].cluster_idx; ordered_buffer_to_file(ctx, cl, sizeof(int), ctx -> local_n_points, "bb/final_assignment.npy"); - ordered_data_to_file(ctx, "bb/ordered_data.npy"); + //ordered_data_to_file(ctx, "bb/ordered_data.npy"); free(cl); diff --git a/src/common/common.c b/src/common/common.c index 3ee1a5d..3a5d155 100644 --- a/src/common/common.c +++ b/src/common/common.c @@ -1,5 +1,6 @@ #include "common.h" #include "mpi.h" +#include #include #define ARRAY_INCREMENT 100 @@ -13,8 +14,9 @@ void get_context(global_context_t* ctx) MPI_Get_processor_name(ctx -> processor_mame, &(ctx -> __processor_name_len)); MPI_Comm_rank(ctx -> mpi_communicator, &(ctx -> mpi_rank)); ctx -> local_data = NULL; - ctx -> lb_box = NULL; - ctx -> ub_box = NULL; + ctx -> lb_box = NULL; + ctx -> ub_box = NULL; + ctx -> og_idxs = NULL; ctx -> rank_n_points = (idx_t*)malloc(ctx -> world_size * sizeof(idx_t)); ctx -> rank_idx_start = (idx_t*)malloc(ctx -> world_size * sizeof(idx_t)); ctx -> local_datapoints = NULL; @@ -160,10 +162,10 @@ void print_error_code(int err) void free_context(global_context_t* ctx) { - FREE_NOT_NULL(ctx -> local_data); + // FREE_NOT_NULL(ctx -> local_data); FREE_NOT_NULL(ctx -> ub_box); FREE_NOT_NULL(ctx -> lb_box); - //FREE_NOT_NULL(ctx -> __local_heap_buffers); + FREE_NOT_NULL(ctx -> og_idxs); if(ctx -> __local_heap_buffers) MPI_Free_mem(ctx -> __local_heap_buffers); @@ -179,10 +181,10 @@ void free_context(global_context_t* ctx) void free_pointset(pointset_t* ps) { - if(ps -> data) + if(ps -> datapoints) { - free(ps -> data); - ps -> data = NULL; + free(ps -> datapoints); + ps -> datapoints = NULL; } if(ps -> ub_box) @@ -210,6 +212,7 @@ void mpi_printf(global_context_t* ctx, const char *fmt, ...) // myflush(stdout); va_end(l); } + fflush(stdout); } void generate_random_matrix( diff --git a/src/common/common.h b/src/common/common.h index dec2d61..ad6cd47 100644 --- a/src/common/common.h +++ b/src/common/common.h @@ -10,21 +10,25 @@ #define DEFAULT_MSG_LEN 10000000 //#include -#define PARALLEL_FIX_BORDERS -// #define WRITE_NGBH +// #define PARALLEL_FIX_BORDERS +#define WRITE_SHUFFLED_DATA +#define WRITE_NGBH // #define WRITE_TOP_NODES -// #define WRITE_DENSITY +#define WRITE_DENSITY // #define WRITE_CLUSTER_ASSIGN_H1 // #define WRITE_BORDERS // #define WRITE_CENTERS_PRE_MERGING // #define WRITE_MERGES_INFO // #define WRITE_MERGING_TABLE -// #define WRITE_FINAL_ASSIGNMENT +#define WRITE_FINAL_ASSIGNMENT // #define PRINT_NGBH_EXCHANGE_SCHEME // #define PRINT_H2_COMM_SCHEME // #define PRINT_H1_CLUSTER_ASSIGN_COMPLETION // #define PRINT_ORDERED_BUFFER +// #define PRINT_BALANCE_FACTOR +// #define CHECK_CORRECT_EXCHANGE +// #define DUMP_NGBH_KNN #define DEFAULT_STR_LEN 200 @@ -50,14 +54,17 @@ #define MY_TRUE 1 #define MY_FALSE 0 -#define HERE printf("%d in file %s reached line %d\n", ctx -> mpi_rank, __FILE__, __LINE__); MPI_Barrier(ctx -> mpi_communicator); +#define HERE printf("%d in file %s reached line %d\n", ctx -> mpi_rank, __FILE__, __LINE__); fflush(stdout); MPI_Barrier(ctx -> mpi_communicator); #define CHECK_ALLOCATION(x) if(!x){printf("[!!!] %d rank encountered failed allocation: %s at line %s \n", ctx -> mpi_rank, __FILE__, __LINE__ ); exit(1);}; + #define CHECK_ALLOCATION_NO_CTX(x) if(!x){printf("[!!!] Failed allocation: %s at line %d \n", __FILE__, __LINE__ ); exit(1);} +#ifndef MY_MALLOC #define MY_MALLOC(n) ({void* p = calloc(n,1); CHECK_ALLOCATION_NO_CTX(p); p; }) +#endif -#define DB_PRINT(...) printf(__VA_ARGS__) +#define DB_PRINT(...) printf(__VA_ARGS__); fflush(stdout) #ifdef NDEBUG #undef DB_PRINT(...) #define DB_PRINT(...) @@ -93,11 +100,14 @@ MPI_Reduce(&time, &avg, 1, MPI_DOUBLE, MPI_SUM, 0, ctx -> mpi_communicator); \ MPI_Reduce(&time, &min, 1, MPI_DOUBLE, MPI_MIN, 0, ctx -> mpi_communicator); \ MPI_Reduce(&time, &max, 1, MPI_DOUBLE, MPI_MAX, 0, ctx -> mpi_communicator); \ + MPI_Barrier(ctx->mpi_communicator); \ MPI_DB_PRINT("%50.50s -> [avg: %.2lfs, min: %.2lfs, max: %.2lfs]\n", sec_name, avg/((double)ctx -> world_size), min, max); \ + fflush(stdout); \ } \ else \ { \ MPI_DB_PRINT("%s\n", sec_name);\ + fflush(stdout); \ }\ } @@ -128,11 +138,20 @@ #define LOG_END #endif +#ifndef POINT + #define POINT + typedef struct point_t + { + idx_t array_idx; + float_t* data; + } point_t; +#endif + typedef struct datapoint_info_t { /* * datapoint object */ - heap_node* ngbh; //heap object stores nearest neighbors of each point + heap_node_t* ngbh; //heap object stores nearest neighbors of each point int is_center; //flag signaling if a point is a cluster center int cluster_idx; //cluster index idx_t array_idx; //global index of the point in the dataset @@ -154,8 +173,8 @@ typedef struct global_context_t int __processor_name_len; //processor name len int world_size; int mpi_rank; //rank of the processor - uint32_t dims; //number of dimensions of the dataset idx_t k; //number of neighbors + uint32_t dims; //number of dimensions of the dataset float_t z; //z parameter float_t* local_data; //pointer to the dataset stored into the node float_t* lb_box; //bounding box of the dataset @@ -164,12 +183,13 @@ typedef struct global_context_t size_t idx_start; //starting index of the points on the processor size_t local_n_points; //number of points stored in the current processor datapoint_info_t* local_datapoints; //pointer to the datapoint properties - idx_t* rank_idx_start; //starting index of datapoints in each processor - idx_t* rank_n_points; //processor name - heap_node* __local_heap_buffers; //buffer that stores nearest neighbors + idx_t* rank_idx_start; //starting index of datapoints in each processor + idx_t* rank_n_points; //processor name + idx_t* og_idxs; //original indexes + heap_node_t* __local_heap_buffers; //buffer that stores nearest neighbors MPI_Comm mpi_communicator; //mpi communicator - char input_data_file[DEFAULT_STR_LEN]; int input_data_in_float32; + char input_data_file[DEFAULT_STR_LEN]; char output_assignment_file[DEFAULT_STR_LEN]; char output_data_file[DEFAULT_STR_LEN]; } global_context_t; @@ -184,7 +204,7 @@ typedef struct pointset_t size_t n_points; size_t __capacity; uint32_t dims; - float_t* data; + point_t* datapoints; float_t* lb_box; float_t* ub_box; } pointset_t; diff --git a/src/main/main.c b/src/main/main.c index 2bc6aa2..c82e660 100644 --- a/src/main/main.c +++ b/src/main/main.c @@ -4,6 +4,7 @@ #include "../common/common.h" #include "../tree/tree.h" #include "../adp/adp.h" +#include #include #include @@ -251,7 +252,7 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx) TIME_DEF double elapsed_time; - int halo = MY_FALSE; + int halo = MY_TRUE; float_t tol = 0.002; if(I_AM_MASTER && ctx -> world_size <= 6) @@ -345,14 +346,6 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx) uint64_t *global_bin_counts_int = (uint64_t *)MY_MALLOC(k_global * sizeof(uint64_t)); - pointset_t original_ps; - original_ps.data = ctx->local_data; - original_ps.dims = ctx->dims; - original_ps.n_points = ctx->local_n_points; - original_ps.lb_box = (float_t*)MY_MALLOC(ctx -> dims * sizeof(float_t)); - original_ps.ub_box = (float_t*)MY_MALLOC(ctx -> dims * sizeof(float_t)); - - top_kdtree_t tree; TIME_START; top_tree_init(ctx, &tree); @@ -360,14 +353,14 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx) LOG_WRITE("Initializing global kdtree", elapsed_time); TIME_START; - build_top_kdtree(ctx, &original_ps, &tree, k_global, tol); + parallel_build_top_kdtree(ctx, &tree, 512); exchange_points(ctx, &tree); elapsed_time = TIME_STOP; LOG_WRITE("Top kdtree build and domain decomposition", elapsed_time); TIME_START; - kdtree_v2 local_tree; - kdtree_v2_init( &local_tree, ctx -> local_data, ctx -> local_n_points, (unsigned int)ctx -> dims); + kdtree_t local_tree; + kdtree_initialize( &local_tree, ctx -> local_data, ctx -> local_n_points, (unsigned int)ctx -> dims); datapoint_info_t* dp_info = (datapoint_info_t*)MY_MALLOC(ctx -> local_n_points * sizeof(datapoint_info_t)); for(uint64_t i = 0; i < ctx -> local_n_points; ++i) @@ -385,7 +378,10 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx) } ctx -> local_datapoints = dp_info; - build_local_tree(ctx, &local_tree); + build_tree_kdtree_parallel(&local_tree); + // build_tree_kdtree_sample(&local_tree); + ctx -> local_data = local_tree.data; + elapsed_time = TIME_STOP; LOG_WRITE("Local trees init and build", elapsed_time); @@ -393,14 +389,13 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx) MPI_DB_PRINT("----- Performing ngbh search -----\n"); MPI_Barrier(ctx -> mpi_communicator); - mpi_ngbh_search(ctx, dp_info, &tree, &local_tree, ctx -> local_data, ctx -> k); + mpi_all_knn_search(ctx, dp_info, &tree, &local_tree, ctx -> k); MPI_Barrier(ctx -> mpi_communicator); elapsed_time = TIME_STOP; LOG_WRITE("Total time for all knn search", elapsed_time) - TIME_START; //float_t id = id_estimate(ctx, dp_info, ctx -> local_n_points, 0.9, MY_FALSE); float_t id = compute_ID_two_NN_ML(ctx, dp_info, ctx -> local_n_points, MY_FALSE); @@ -440,12 +435,19 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx) TIME_START; int* cl = (int*)MY_MALLOC(ctx -> local_n_points * sizeof(int)); - for(int i = 0; i < ctx -> local_n_points; ++i) cl[i] = ctx -> local_datapoints[i].cluster_idx; + float_t* data_to_write = (float_t*)MY_MALLOC(ctx -> local_n_points * ctx -> k * sizeof(float_t)); + + for(int i = 0; i < ctx -> local_n_points; ++i) + { + cl[i] = ctx -> local_datapoints[i].cluster_idx; + idx_t idx = local_tree.__points[i].array_idx; + memcpy(data_to_write + idx*ctx -> dims, local_tree.__points[i].data, ctx -> dims*sizeof(float_t)); + } if(ctx -> world_size <= 32) { big_ordered_buffer_to_file(ctx, cl, sizeof(int), ctx -> local_n_points, ctx -> output_assignment_file); - big_ordered_buffer_to_file(ctx, ctx -> local_data, sizeof(double), ctx -> local_n_points * ctx -> dims, ctx -> output_data_file); + big_ordered_buffer_to_file(ctx, data_to_write, sizeof(double), ctx -> local_n_points * ctx -> dims, ctx -> output_data_file); free(cl); @@ -453,7 +455,7 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx) else { distributed_buffer_to_file(ctx, cl, sizeof(int), ctx -> local_n_points, ctx -> output_assignment_file); - distributed_buffer_to_file(ctx, ctx -> local_data, sizeof(double), ctx -> local_n_points * ctx -> dims, ctx -> output_data_file); + distributed_buffer_to_file(ctx, data_to_write, sizeof(double), ctx -> local_n_points * ctx -> dims, ctx -> output_data_file); free(cl); @@ -463,15 +465,14 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx) LOG_WRITE("Write results to file", elapsed_time); + free(data_to_write); top_tree_free(ctx, &tree); - kdtree_v2_free(&local_tree); + kdtree_free(&local_tree); //clusters_free(&clusters); free(send_counts); free(displacements); //free(dp_info); - original_ps.data = NULL; - free_pointset(&original_ps); free(global_bin_counts_int); } diff --git a/src/tree/heap.c b/src/tree/heap.c deleted file mode 100644 index 67d38e5..0000000 --- a/src/tree/heap.c +++ /dev/null @@ -1,217 +0,0 @@ -#include "heap.h" - -void swap_heap_node(heap_node* a, heap_node* b){ - heap_node tmp; - memcpy(&tmp,a,sizeof(heap_node)); - memcpy(a,b,sizeof(heap_node)); - memcpy(b,&tmp,sizeof(heap_node)); - return; -} - -void allocate_heap(heap* H, idx_t n){ - H -> data = (heap_node*)malloc(n*sizeof(heap_node)); - H -> N = n; - H -> count = 0; - return; -} - -void init_heap(heap* H){ - for(idx_t i = 0; i < H -> N; ++i) - { - H -> data[i].value = 0.; - H -> data[i].array_idx = MY_SIZE_MAX; - } - return; -} - -void free_heap(heap * H){ free(H -> data);} - - -void heapify_max_heap(heap* H, idx_t node){ - idx_t nn = node; - idx_t largest = nn; - /* - Found gratest between children of node and boundcheck if the node is a leaf - */ - - while(1) - { - largest = (HEAP_LCH(nn) < H -> N) && - (H -> data[HEAP_LCH(nn)].value > H -> data[largest].value ) ? HEAP_LCH(nn) : largest; - - largest = (HEAP_RCH(nn) < H -> N) && - (H -> data[HEAP_RCH(nn)].value > H -> data[largest].value ) ? HEAP_RCH(nn) : largest; - if(largest != nn) - { - swap_heap_node(H -> data + nn, H -> data + largest); - nn = largest; - } - else - { - break; - } - } - - - /* - if(HEAP_LCH(node) < H -> N){ - //if(H -> data[HEAP_LCH(node)].value > H -> data[largest].value ) largest = HEAP_LCH(node); - largest = (H -> data[HEAP_LCH(nn)].value > H -> data[largest].value ) ? HEAP_LCH(nn) : largest; - } - if(HEAP_RCH(node) < H -> N){ - //if(H -> data[HEAP_RCH(node)].value > H -> data[largest].value ) largest = HEAP_RCH(node); - largest = (H -> data[HEAP_RCH(nn)].value > H -> data[largest].value ) ? HEAP_RCH(nn) : largest; - } - if(largest == node){ - return; - } - else{ - swap_heap_node(H -> data + node, H -> data + largest); - heapify_max_heap(H, largest); - } - */ -} - - -void set_root_max_heap(heap * H, FLOAT_TYPE val, idx_t array_idx){ - H -> data[0].value = val; - H -> data[0].array_idx = array_idx; - heapify_max_heap(H,0); - return; -} - -void insert_max_heap_insertion_sort(heap * H,const FLOAT_TYPE val,const idx_t array_idx){ - heap_node tmpNode = {.value = val, .array_idx = array_idx}; - if(H -> count < H -> N) - { - idx_t idx = H -> count; - H -> data[idx] = tmpNode; - ++(H -> count); - while(idx >= 1) - { - if(H -> data[idx].value < H -> data[idx - 1].value) - { - swap_heap_node((H -> data) + idx, (H -> data) + idx - 1); - idx--; - } - else - { - break; - } - } - - } - else - { - if(H -> data[H -> count - 1].value > val) - { - idx_t idx = H -> count - 1; - H -> data[idx] = tmpNode; - while(idx >= 1) - { - if(H -> data[idx].value < H -> data[idx - 1].value) - { - swap_heap_node(H -> data + idx, H -> data + idx - 1); - idx--; - } - else - { - break; - } - } - } - } - return; -} - -void insert_max_heap(heap * H,const FLOAT_TYPE val,const idx_t array_idx){ - int c1 = H -> count < H -> N; - int c2 = (val < H -> data[0].value) && (!c1); - int ctot = c1 + 2*c2; - switch (ctot) { - case 1: - { - idx_t node = H->count; - ++(H -> count); - H -> data[node].value = val; - H -> data[node].array_idx = array_idx; - /* - * Push up the node through the heap - */ - while(node && H -> data[node].value > H -> data[HEAP_PARENT(node)].value) - { - swap_heap_node(H -> data + node, H -> data + HEAP_PARENT(node)); - node = HEAP_PARENT(node); - //if(node == 0) break; - } - } - break; - - case 2: - { - set_root_max_heap(H,val,array_idx); - } - break; - default: - break; - } - /* alternative solution, left here if something breaks*/ - - //if(H -> count < H -> N){ - // idx_t node = H->count; - // ++(H -> count); - // H -> data[node].value = val; - // H -> data[node].array_idx = array_idx; - // /* - // * Push up the node through the heap - // */ - // while(node && H -> data[node].value > H -> data[HEAP_PARENT(node)].value) - // { - // swap_heap_node(H -> data + node, H -> data + HEAP_PARENT(node)); - // node = HEAP_PARENT(node); - // //if(node == 0) break; - // } - // return; - //} - //else if (val < H -> data[0].value) - //{ - // set_root_max_heap(H,val,array_idx); - // return; - //} - //else - //{ - // return; - //} - - -} - -#ifdef USE_FLOAT32 - #define EPS 5.96e-08 -#else - #define EPS 2.11e-16 -#endif - -int cmp_heap_nodes(const void* a, const void* b) -{ - const heap_node* aa = (const heap_node*)a; - const heap_node* bb = (const heap_node*)b; - int val = (aa -> value > bb -> value) - (aa -> value < bb -> value); - //return vv; - return val; - - -} - - -void heap_sort(heap* H){ - idx_t n = H -> N; - qsort(H -> data, n, sizeof(heap_node),cmp_heap_nodes); - //for(idx_t i= (H -> N) - 1; i > 0; --i) - //{ - // swap_heap_node(H -> data, H -> data + i); - // H -> N = i; - // heapify_max_heap(H,0); - //} - //H -> N = n; -} diff --git a/src/tree/heap.h b/src/tree/heap.h index 117ff8c..5b63bf8 100644 --- a/src/tree/heap.h +++ b/src/tree/heap.h @@ -1,18 +1,16 @@ #pragma once + #include #include #include #include #include #include -#define T double -#define DATA_DIMS 0 - #ifdef USE_FLOAT32 - #define FLOAT_TYPE float + #define float_t float #else - #define FLOAT_TYPE double + #define float_t double #endif #ifdef USE_INT32 @@ -30,49 +28,127 @@ #define HP_LEFT_SIDE 0 #define HP_RIGHT_SIDE 1 - -struct heap_node -{ - FLOAT_TYPE value; - idx_t array_idx; -} ; - -struct heap +#define ALIGNMENT 64 +#define CHECK_ALLOCATION_NO_CTX(x) if(!x){printf("[!!!] Failed allocation: %s at line %d \n", __FILE__, __LINE__ ); exit(1);} +#define MY_MALLOC(n) ({void* p = aligned_alloc(ALIGNMENT,n); CHECK_ALLOCATION_NO_CTX(p); memset(p, 0, n); p; }) + + +typedef struct { + float_t value; + idx_t array_idx; +} heap_node_t; + +typedef struct { + size_t size; + size_t count; + heap_node_t* data; +} heap_t; + +static inline void heap_node_swap(heap_node_t* a, heap_node_t* b){ + heap_node_t tmp = *a; + *a = *b; + *b = tmp; +} + +static void heap_allocate(heap_t* H, idx_t n){ + H -> data = (heap_node_t*)MY_MALLOC(n*sizeof(heap_node_t)); + H -> size = n; + H -> count = 0; + return; +} + +static void heap_initialize(heap_t* H){ + for(idx_t i = 0; i < H->size; ++i) + { + H -> data[i].value = 0.; + H -> data[i].array_idx = MY_SIZE_MAX; + } + return; +} + +static void free_heap(heap_t * H){ free(H -> data);} + + +// Optimized heapify (sift-down) that avoids repeated swaps. +// It correctly uses H->count as the boundary. +static inline void heapify_max_heap(heap_t* H, idx_t i) { + heap_node_t node_to_sift = H->data[i]; // Save the node we are sifting down + idx_t child; + while ((child = HEAP_LCH(i)) < H->count) { + idx_t r_child = HEAP_RCH(i); + // Find the largest child + + // if (r_child < H->count && H->data[r_child].value > H->data[child].value) { + // child = r_child; + // } + + child = (r_child < H->count && H->data[r_child].value > H->data[child].value) ? r_child : child; + // If the node to sift is larger than its largest child, we are done + if (node_to_sift.value >= H->data[child].value) { + break; + } + // Move the larger child up + H->data[i] = H->data[child]; + i = child; + } + // Place the original node in its final position + H->data[i] = node_to_sift; +} + + +static void set_root_max_heap(heap_t * H, const float_t val, const idx_t array_idx){ + H -> data[0].value = val; + H -> data[0].array_idx = array_idx; + heapify_max_heap(H,0); + return; +} + +// Optimized insertion logic inspired by kBoundedQueue. +// It avoids swaps during bubble-up and correctly handles the full/not-full cases. +static inline void max_heap_insert(heap_t * H, const float_t val, const idx_t array_idx){ + if (H->count < H->size) { + // Heap is not full. Add to the end and bubble up. + idx_t i = H->count++; + // Bubble-up without swaps + while (i > 0) { + idx_t parent = HEAP_PARENT(i); + if (val <= H->data[parent].value) { + break; // Found the correct spot + } + // Move parent down + H->data[i] = H->data[parent]; + i = parent; + } + H->data[i].value = val; + H->data[i].array_idx = array_idx; + } else if (val < H->data[0].value) { + // Heap is full, but new value is smaller than the max. + // Replace the root and sift it down using the optimized heapify. + H->data[0].value = val; + H->data[0].array_idx = array_idx; + heapify_max_heap(H, 0); + } + // If heap is full and new value is >= max, do nothing. +} + +static int heap_node_compare(const void* a, const void* b) { - // [LT comment] is it possible to drop this N in favor of a global variable, - // when the tree is compiled to be used in adP? - // if N is constant, then we can save 8 bytes here - idx_t N; - idx_t count; - struct heap_node* data; - -} ; - -struct SimpleHeap -{ - idx_t N; - T * data; -} ; - -typedef struct SimpleHeap SimpleHeap; -typedef struct heap heap; -typedef struct heap_node heap_node; - - -void allocate_heap(heap* H, idx_t n); - -void init_heap(heap* H); - -void free_heap(heap * H); - -void heapify_max_heap(heap* H, idx_t node); - -void set_root_max_heap(heap * H, FLOAT_TYPE val, idx_t array_idx); - -void insert_max_heap(heap * H, FLOAT_TYPE val, idx_t array_idx); - -void heap_sort(heap* H); -void insert_max_heap_insertion_sort(heap * H,const FLOAT_TYPE val,const idx_t array_idx); - - -int cmp_heap_nodes(const void* a, const void* b); + const heap_node_t* aa = (const heap_node_t*)a; + const heap_node_t* bb = (const heap_node_t*)b; + if (aa->value < bb->value) return -1; + if (aa->value > bb->value) return 1; + return 0; +} + + +static inline void heap_sort(heap_t* H){ + idx_t n = H->count; + for(idx_t i = (H->count) - 1; i > 0; --i) + { + heap_node_swap(H->data, H->data + i); + H->count = i; + heapify_max_heap(H,0); + } + H->size = n; + // qsort(H->data, H->count, sizeof(heap_node_t), heap_node_compare); +} diff --git a/src/tree/kdtree.h b/src/tree/kdtree.h new file mode 100644 index 0000000..8a0c9be --- /dev/null +++ b/src/tree/kdtree.h @@ -0,0 +1,1278 @@ +#pragma once +#include "heap.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEFAULT_LEAF_SIZE 32 + +#define ALIGNMENT 64 +#define CHECK_ALLOCATION_NO_CTX(x) if(!x){printf("[!!!] Failed allocation: %s at line %d \n", __FILE__, __LINE__ ); exit(1);} + +#ifndef MY_MALLOC + #define MY_MALLOC(n) ({size_t __n = (n + ALIGNMENT - 1) & ~(ALIGNMENT - 1); void* p = aligned_alloc(ALIGNMENT,__n); CHECK_ALLOCATION_NO_CTX(p); memset(p, 0, __n); p; }) +#endif + +#define FREE(x) if(x); free(x); + +#define HP_LEFT_SIDE 0 +#define HP_RIGHT_SIDE 1 + +#define T double +#define DATA_DIMS 0 + +#ifdef USE_FLOAT32 + #define float_t float +#else + #define float_t double +#endif + +#ifdef USE_INT32 + #define MY_SIZE_MAX UINT32_MAX + #define idx_t uint32_t +#else + #define MY_SIZE_MAX UINT64_MAX + #define idx_t uint64_t +#endif + +#ifndef MIN + #define MIN(x,y) ((x < y) ? (x) : (y)) +#endif + +#ifndef MAX + #define MAX(x,y) ((x > y) ? (x) : (y)) +#endif + +typedef struct +{ + float_t* lb; + float_t* ub; +} bounding_box_t; + +// struct point_t +// { +// idx_t array_idx; +// float_t* data; +// }; +// typedef struct point_t point_t; + +struct pivot_t; + +union pivot_data { + + struct { + float_t split_value; // The value used for the split + uint8_t split_variable; // The dimension used for the split + uint64_t lch_idx; // 4-byte index to the left child in the pivot array + uint64_t rch_idx; // 4-byte index to the right child in the pivot array + } internal; + + // Data for a leaf node + struct { + uint32_t point_list_idx; // Index to the first point in the global point array + uint32_t leaf_point_count; // Number of points in this leaf + } leaf; +}; + +// The new, slimmer pivot_t struct +struct pivot_t { + uint8_t is_leaf; + bounding_box_t bounding_box; + union pivot_data as; +}; + +struct kdtree_t +{ + float_t* data; + uint32_t dims; + idx_t n_points; + idx_t root; + + struct point_t* __points; + float_t* __boxes_data; + struct pivot_t* __pivots; +}; + +typedef struct pivot_t pivot_t; +typedef struct kdtree_t kdtree_t; + + +static inline float_t euclidean_distance(const float_t* p1, const float_t* p2, const idx_t dims) +{ + register float_t d = 0; + for (idx_t i = 0; i < dims; ++i) + { + d += (p1[i] - p2[i])*(p1[i] - p2[i]); + } + return d; + // return sqrt(d); +} + +// #include +// static inline float_t euclidean_distance_simd(float_t* p1, float_t* p2, idx_t dims) +// { +// // Use AVX to process 4 doubles at a time +// __m256d diff, sum = _mm256_setzero_pd(); +// +// idx_t i = 0; +// for (; i <= (idx_t)dims - 4; i += 4) { +// // Load 4 doubles from p1 and p2 +// __m256d v1 = _mm256_loadu_pd(p1 + i); +// __m256d v2 = _mm256_loadu_pd(p2 + i); +// +// // Subtract +// diff = _mm256_sub_pd(v1, v2); +// +// // Square and add to sum (fused multiply-add) +// sum = _mm256_fmadd_pd(diff, diff, sum); +// } +// +// // Horizontal sum of the 4 partial sums in the vector +// __m256d temp = _mm256_hadd_pd(sum, sum); +// __m128d sum_high = _mm256_extractf128_pd(temp, 1); +// __m128d result = _mm_add_pd(_mm256_castpd256_pd128(temp), sum_high); +// +// float_t d = _mm_cvtsd_f64(result); +// +// // Handle any remaining dimensions that aren't a multiple of 4 +// for (; i < dims; ++i) { +// float_t term = p1[i] - p2[i]; +// d += term * term; +// } +// +// return d; +// } + +#ifdef USE_FLOAT32 +typedef float v4f __attribute__((vector_size(16))); +#else +typedef double v4f __attribute__((vector_size(32))); +#endif + +// static inline void point_swap(point_t *x, point_t *y, const idx_t dims) { +// // exchange idxs +// +// idx_t tmp_idx; +// tmp_idx = x->array_idx; +// x->array_idx = y->array_idx; +// y->array_idx = tmp_idx; +// +// //swap datapoint +// +// for(idx_t i = 0; i < dims; ++i) +// { +// float_t tmp_data = x -> data[i]; +// x -> data[i] = y -> data[i]; +// y -> data[i] = tmp_data; +// } +// +// } + +static inline void point_swap(point_t *x, point_t *y, const idx_t dims) +{ + point_t tmp = *x; + *x = *y; + *y = tmp; +} + +static void pivot_initialize_all(pivot_t* node_array, idx_t n) { + for (idx_t i = 0; i < n; ++i) + { + node_array[i].as.internal.lch_idx = -1; + node_array[i].as.internal.rch_idx = -1; + node_array[i].is_leaf = false; + node_array[i].as.internal.split_variable = -1; + node_array[i].as.internal.split_value = FLT_MAX; + } +} + +static inline int faster_point_partition_random(point_t *arr, idx_t low, idx_t high, int split_var, idx_t dims) { + // Seed the random number generator only once for the application's lifetime. + static bool seeded = false; + if (!seeded) { + srand(42); // Using a fixed seed for reproducible behavior. Use time(NULL) for non-deterministic pivots. + seeded = true; + } + + // In a subarray of size > 1, choose a random pivot index. + if (low < high) { + idx_t pivot_idx = low + rand() % (high - low + 1); + // Swap the chosen pivot with the last element to use the standard Lomuto scheme. + point_swap(arr + pivot_idx, arr + high, dims); + } + + // The rest of this function is the standard Lomuto partition scheme. + point_t pivot = arr[high]; + idx_t i = (low - 1); + for (idx_t j = low; j < high; j++) { + // If current element is smaller than or equal to the pivot + if (arr[j].data[split_var] <= pivot.data[split_var]) { + i++; + point_swap(arr + i, arr + j, dims); + } + } + point_swap(arr + i + 1, arr + high, dims); + return (i + 1); +} + + +static int point_compute_nth_element(point_t *a, idx_t left, idx_t right, idx_t rank, int split_var, idx_t dims) +{ + // `k` is the absolute index of the rank-th smallest element in the full array. + // `rank` is the 0-indexed rank in the subarray `a[left...right]`. + int k = left + rank; + + while (left <= right) { + // Base case: if the subarray has only one element, it's the desired element. + if (left == right) { + return left; + } + + // Partition the array using a random pivot and get the pivot's final index. + int pivotIndex = faster_point_partition_random(a, left, right, split_var, dims); + + // If the pivot is the rank-th element, we are done. + if (pivotIndex == k) { + return pivotIndex; + } + // If the rank-th element is in the left subarray, adjust the `right` boundary. + else if (pivotIndex > k) { + right = pivotIndex - 1; + } + // If the rank-th element is in the right subarray, adjust the `left` boundary. + else { + left = pivotIndex + 1; + } + } + return -1; // Should not be reached in a valid scenario. +} + + +static idx_t parallel_make_tree_w_ranks(point_t* points, pivot_t* pivots, + idx_t child_pivot_idx, int left_or_right, + idx_t start, idx_t end, int level, idx_t dims, + idx_t start_rank, idx_t end_rank) +{ + // this only build the first log2(ranks) levels + idx_t parent_pivot_idx = (child_pivot_idx > 0) ? ((child_pivot_idx - 1) / 2) : -1; + pivot_t *pivot = pivots + child_pivot_idx; + pivot_t *parent = (parent_pivot_idx != (idx_t)-1) ? (pivots + parent_pivot_idx) : NULL; + + switch (left_or_right) + { + case HP_LEFT_SIDE: + { + memcpy(pivot->bounding_box.lb, parent->bounding_box.lb, dims*sizeof(float_t)); + memcpy(pivot->bounding_box.ub, parent->bounding_box.ub, dims*sizeof(float_t)); + idx_t parent_splitting_dim = parent->as.internal.split_variable; + float_t parent_split_val = parent->as.internal.split_value; + pivot->bounding_box.ub[parent_splitting_dim] = parent_split_val; + } + break; + + case HP_RIGHT_SIDE: + { + memcpy(pivot->bounding_box.lb, parent->bounding_box.lb, dims*sizeof(float_t)); + memcpy(pivot->bounding_box.ub, parent->bounding_box.ub, dims*sizeof(float_t)); + idx_t parent_splitting_dim = parent->as.internal.split_variable; + float_t parent_split_val = parent->as.internal.split_value; + pivot->bounding_box.lb[parent_splitting_dim] = parent_split_val; + } + break; + + default: + { + for(int i=0; ibounding_box.lb[i] = FLT_MAX; + pivot->bounding_box.ub[i] = -FLT_MAX; + } + for(idx_t i=start; i<=end; ++i) { + for(idx_t j=0; jbounding_box.lb[j] = MIN(pivot->bounding_box.lb[j], points[i].data[j]); + pivot->bounding_box.ub[j] = MAX(pivot->bounding_box.ub[j], points[i].data[j]); + } + } + } + break; + } + + //handle the case it is a leaf + + if(start_rank == end_rank) + { + pivots[child_pivot_idx].is_leaf = true; + // write here the owner!!! + pivots[child_pivot_idx].as.leaf.leaf_point_count = start_rank; + return child_pivot_idx; + } + + int split_var = 0; + float_t max_extension = 0; + for(idx_t d=0; d < dims; ++d) { + float_t box_extension = pivot->bounding_box.ub[d] - pivot->bounding_box.lb[d]; + box_extension = box_extension*box_extension; + if (box_extension > max_extension) { + max_extension = box_extension; + split_var = d; + } + } + + + idx_t middle_rank = (start_rank + end_rank)/2; + + float_t fraction = (float_t)(middle_rank - start_rank + 1)/(float_t)(end_rank - start_rank + 1); + float_t nth_place = (end - start)*fraction; + + int nth_idx = point_compute_nth_element(points, start, end, (idx_t)nth_place, split_var, dims); + + // printf("child_pivot_idx %lu start_rank %lu end_rank %lu middle %lu nth_place %.2lf fraction %.2lf\n", + // child_pivot_idx, start_rank, end_rank, middle_rank, nth_place, fraction); + //printf("child_pivot_idx %lu start %lu end %lu nth %lu\n", child_pivot_idx, start, end, (idx_t)nth_idx); + + if (nth_idx > -1) { + pivots[child_pivot_idx].as.internal.split_variable = split_var; + pivots[child_pivot_idx].as.internal.split_value = points[nth_idx].data[split_var]; + + pivots[child_pivot_idx].as.internal.lch_idx = + parallel_make_tree_w_ranks(points, pivots, child_pivot_idx*2 + 1, + HP_LEFT_SIDE, start, nth_idx, level + 1, + dims, start_rank, middle_rank); + + pivots[child_pivot_idx].as.internal.rch_idx = + parallel_make_tree_w_ranks(points, pivots, child_pivot_idx*2 + 2, + HP_RIGHT_SIDE, nth_idx + 1, end, level + 1, + dims, middle_rank + 1, end_rank); + + } + + return child_pivot_idx; +} + + +static void point_initialize_all(point_t* point_array, float_t* data, idx_t n, idx_t dims) +{ + for (idx_t i = 0; i < n; ++i) + { + point_array[i].array_idx = i; + point_array[i].data = data + i*dims; + } + +} + +static inline int point_compare(const point_t *a, const point_t *b, int var) { + float_t res = a->data[var] - b->data[var]; + return (res > 0); +} + +static void pivot_print(pivot_t node, idx_t node_idx, idx_t dims) { + printf("Node %lu:\n", node_idx); + printf(" data: "); + + printf("\n"); + + if(node.is_leaf) + { + printf(" LEAF node\n"); + printf(" list len: %d\n", node.as.leaf.leaf_point_count); + } + else + { + printf(" INTERNAL node\n"); + printf(" split_value: %.4lf split_variable: %d\n", node.as.internal.split_value, node.as.internal.split_variable); + printf(" lch: %lu\n", node.as.internal.lch_idx); + printf(" rch: %lu\n", node.as.internal.rch_idx); + + } + printf(" lb_box: [ "); + for(int i=0; i k) { + right = pivotIndex - 1; + } + // If the median is in the right subarray, adjust the `left` boundary. + else { + left = pivotIndex + 1; + } + } + return -1; // Should not be reached in a valid scenario. +} + +// Implementation of QuickSelect +static int point_compute_median(point_t *a, idx_t left, idx_t right, int split_var, idx_t dims) +{ + // printf("----------\n"); + int k = left + ((right - left + 1) / 2); + + if (left == right) return left; + if (left == (right - 1)) + { + if (point_compare(a + left, a + right, split_var)) + { + point_swap(a + left, a + right, dims); + } + return right; + } + while (left <= right) { + + // Partition a[left..right] around a pivot + // and find the position of the pivot + int pivotIndex = point_partition(a, left, right, split_var, dims); + // printf("%d %d %d %d\n",left, right, k, pivotIndex); + + // If pivot itself is the k-th smallest element + if (pivotIndex == k) + return pivotIndex; + + // If there are more than k-1 elements on + // left of pivot, then k-th smallest must be + // on left side. + + else if (pivotIndex > k) right = pivotIndex - 1; + + // Else k-th smallest is on right side. + else left = pivotIndex + 1; + } + return -1; +} + +static idx_t recursive_make_tree(point_t* points, pivot_t* pivots, + idx_t child_pivot_idx, int left_or_right, + idx_t start, idx_t end, int level, idx_t dims) +{ + struct timespec start_node, stop_node; + + clock_gettime(CLOCK_MONOTONIC, &start_node); + + // printf("Entering Node: [%d %d] @level %d\n", + // start, end, level ); + + idx_t parent_pivot_idx = (child_pivot_idx > 0) ? ((child_pivot_idx - 1) / 2) : -1; + pivot_t *pivot = pivots + child_pivot_idx; + pivot_t *parent = pivots + parent_pivot_idx; + + switch (left_or_right) + { + case HP_LEFT_SIDE: + { + memcpy(pivot->bounding_box.lb, parent->bounding_box.lb, dims*sizeof(float_t)); + memcpy(pivot->bounding_box.ub, parent->bounding_box.ub, dims*sizeof(float_t)); + idx_t parent_splitting_dim = pivots[parent_pivot_idx].as.internal.split_variable; + float_t parent_split_val = pivots[parent_pivot_idx].as.internal.split_value; + + pivot->bounding_box.ub[parent_splitting_dim] = parent_split_val; + } + break; + + case HP_RIGHT_SIDE: + { + memcpy(pivot->bounding_box.lb, parent->bounding_box.lb, dims*sizeof(float_t)); + memcpy(pivot->bounding_box.ub, parent->bounding_box.ub, dims*sizeof(float_t)); + idx_t parent_splitting_dim = pivots[parent_pivot_idx].as.internal.split_variable; + float_t parent_split_val = pivots[parent_pivot_idx].as.internal.split_value; + + pivot->bounding_box.lb[parent_splitting_dim] = parent_split_val; + } + break; + + default: + { + for(int i=0; ibounding_box.lb[i] = FLT_MAX; + pivot->bounding_box.ub[i] = -FLT_MAX; + } + + // compute master bounding_box + // + + for(idx_t i=start; i<=end; ++i) + { + for(idx_t j=0; jbounding_box.lb[j] = MIN(pivot->bounding_box.lb[j], points[i].data[j]); + pivot->bounding_box.ub[j] = MAX(pivot->bounding_box.ub[j], points[i].data[j]); + } + } + } + break; + + } + + //int split_var = level % dims; + + int split_var = 0; + float_t max_extension = 0; + for(idx_t d=0; d < dims; ++d) + { + float_t box_extension = pivot->bounding_box.ub[d] - pivot->bounding_box.lb[d]; + box_extension = box_extension*box_extension; + + split_var = (box_extension > max_extension) ? d : split_var; + max_extension = MAX(box_extension, max_extension); + } + + if (end - start < DEFAULT_LEAF_SIZE) + { + pivots[child_pivot_idx].is_leaf = 1; + pivots[child_pivot_idx].as.leaf.leaf_point_count = (size_t)(end - start + 1); + pivots[child_pivot_idx].as.leaf.point_list_idx = start; + return child_pivot_idx; + } + + int median_idx = -1; + + median_idx = faster_point_compute_median(points, start, end, split_var, dims); + // printf("%d median idx\n", median_idx); + if (median_idx > -1) + { + // generate a pivot + pivots[child_pivot_idx].as.internal.split_variable = split_var; + pivots[child_pivot_idx].as.internal.split_value = points[median_idx].data[split_var]; + pivots[child_pivot_idx].as.internal.lch_idx = + recursive_make_tree(points, pivots, child_pivot_idx*2 + 1, + HP_LEFT_SIDE, start, median_idx, level + 1, dims); + pivots[child_pivot_idx].as.internal.rch_idx = + recursive_make_tree(points, pivots, child_pivot_idx*2 + 2, + HP_RIGHT_SIDE, median_idx + 1, end, level + 1, dims); + + } + + clock_gettime(CLOCK_MONOTONIC, &stop_node); + + // printf("Exiting Node: [%d %d] @level %d -> elapsed: %f\n", + // start, end, level, ((float)stop_node.tv_sec - (float)start_node.tv_sec) + + // ((float)stop_node.tv_nsec - (float)start_node.tv_nsec)/1e9 ); + + return child_pivot_idx; +} + +static inline float_t hyper_plane_dist(const float_t *p1, const float_t p2, const int var) +{ + return p1[var] - p2; +} + +static inline float_t box_dist(const float_t *p, const float_t *lb, const float_t* ub, const int dims) +{ + float_t r = 0; + for (idx_t i=0; i ub[i]) { + r += (p[i] - ub[i]) * (p[i] - ub[i]); + } + } + return r; +} + +static inline int hyper_plane_side(const float_t *p1, const float_t *p2, const int var) +{ + return p1[var] > p2[var]; +} + +static void knn_sub_tree_search(const float_t *point, const point_t* data_points, const pivot_t* pivots, + const idx_t root_idx, heap_t *H, const idx_t dims, idx_t* vis_nodes) +{ + pivot_t root = pivots[root_idx]; + *vis_nodes = *vis_nodes + 1; + if (root.is_leaf) + { + // take the base pointer + idx_t base_point = root.as.leaf.point_list_idx; + const point_t* leaf_points = data_points + base_point; + float_t* base_data = leaf_points[0].data; + idx_t point_list_count = root.as.leaf.leaf_point_count; + + + float_t dists[DEFAULT_LEAF_SIZE]; + #pragma unroll 4 + for (size_t i = 0; i < point_list_count; ++i) + { + float_t distance = euclidean_distance(point, base_data + i*dims, dims); + dists[i] = distance; + } + + for (size_t i = 0; i < point_list_count; ++i) + { + point_t data_point = leaf_points[i]; + max_heap_insert(H, dists[i], data_point.array_idx); + } + + // CORRECTED an efficient single loop: + // for (size_t i = 0; i < point_list_count; ++i) + // { + // *vis_nodes = *vis_nodes + 1; + // + // // Calculate distance using the correct data pointer + // float_t distance = euclidean_distance(point, base_data + i*dims, dims); + // + // // Immediately use the distance. No temporary array, less memory traffic. + // max_heap_insert(H, distance, leaf_points[i].array_idx); + // } + return; + } + + + + //float_t hp_distance = hyper_plane_dist(point, root->split_value, root->split_variable); + float_t hp_distance = hyper_plane_dist(point, root.as.internal.split_value, root.as.internal.split_variable); + + int side = hp_distance > 0.f; + + switch (side) { + case HP_LEFT_SIDE: + knn_sub_tree_search(point, data_points, pivots, root.as.internal.lch_idx, H, dims, vis_nodes); + break; + + case HP_RIGHT_SIDE: + knn_sub_tree_search(point, data_points, pivots, root.as.internal.rch_idx, H, dims, vis_nodes); + break; + + default: + break; + } + float_t max_d = H->data[0].value; + + + // int c = max_d > (hp_distance * hp_distance); + // if (c || (H->count) < (H->size)) + // { + // switch (side) + // { + // case HP_LEFT_SIDE: + // if (root->rch) knn_sub_tree_search(point, root->rch, H, dims, vis_nodes); + // break; + // + // case HP_RIGHT_SIDE: + // if (root->lch) knn_sub_tree_search(point, root->lch, H, dims, vis_nodes); + // break; + // + // default: + // break; + // } + // } + + int heap_is_not_full = (H->count) < (H->size); + switch (side) + { + case HP_LEFT_SIDE: + { + bounding_box_t box = pivots[root.as.internal.rch_idx].bounding_box; + float_t dist = box_dist(point, box.lb, box.ub, dims); + if((max_d > dist) || heap_is_not_full) + knn_sub_tree_search(point, data_points, pivots, root.as.internal.rch_idx, H, dims, vis_nodes); + } + break; + + case HP_RIGHT_SIDE: + { + bounding_box_t box = pivots[root.as.internal.lch_idx].bounding_box; + float_t dist = box_dist(point, box.lb, box.ub, dims); + if((max_d > dist) || heap_is_not_full) + knn_sub_tree_search(point, data_points, pivots, root.as.internal.lch_idx, H, dims, vis_nodes); + } + break; + + default: + break; + } + return; +} + + + +static void kdtree_initialize(kdtree_t *tree, float_t *data, size_t n_points, unsigned int dims) +{ + tree->dims = dims; + tree->data = data; + tree->n_points = n_points; + tree->__points = (point_t *)MY_MALLOC(n_points * sizeof(point_t)); + tree->__pivots = (pivot_t *)MY_MALLOC(n_points * sizeof(pivot_t)); + tree->__boxes_data = (float_t *)MY_MALLOC(2 * n_points * sizeof(float_t)*dims); + + for(idx_t i=0; i__pivots[i].bounding_box.lb = tree->__boxes_data + ii*dims; + tree->__pivots[i].bounding_box.ub = tree->__boxes_data + (ii+1)*dims; + } + pivot_initialize_all(tree->__pivots, n_points); + point_initialize_all(tree->__points, data, n_points, dims); + tree->root = -1; +} + +static void recursive_print(pivot_t* pivots, idx_t node_idx, idx_t dims, idx_t level) +{ + pivot_t node = pivots[node_idx]; + if(level < 3) + { + printf("---LEVEL %lu ---\n", level); + pivot_print(node, node_idx, dims); + + } + if(!node.is_leaf) + { + recursive_print(pivots, node.as.internal.lch_idx, dims, level+1); + recursive_print(pivots, node.as.internal.rch_idx, dims, level+1); + } +} + +static void kdtree_print(kdtree_t* tree) +{ + idx_t level = 0; + recursive_print(tree->__pivots, tree -> root, tree -> dims, level); +} + +// void kdtree_compact_data(kdtree_t* tree) +// { +// float_t* new_data = (float_t*)calloc(tree->dims * tree->n_points * sizeof(float_t), 1); +// for(idx_t i = 0; i < tree->n_points; ++i) +// { +// kdnode_t* node = tree->_nodes + i; +// memcpy(new_data + i*tree->dims, node->data, tree->dims * sizeof(float_t)); node->data = new_data + i*tree->dims; +// } +// free(tree -> data); +// tree -> data = new_data; +// } + + + + +#include + +#define SERIAL_BUILD_CUTOFF 4096 +#define NLEVELS 6 +#define OVERSAMPLING 32 + +static idx_t recursive_make_pivots(point_t* points, pivot_t* pivots, idx_t current_pivot_idx, idx_t parent_pivot_idx, + int left_or_right, idx_t start, idx_t end, int level, int dims, + idx_t* last_bucket_assigned, idx_t* leaf_pivot_indices) +{ + pivot_t *pivot = pivots + current_pivot_idx; + pivot_t *parent = (parent_pivot_idx != (idx_t)-1) ? (pivots + parent_pivot_idx) : NULL; + + if(level == 0) + { + pivot->is_leaf = true; + // Store the bucket ID in the leaf node so find_bucket can retrieve it + pivot->as.leaf.leaf_point_count = *last_bucket_assigned; + if (leaf_pivot_indices) { + leaf_pivot_indices[*last_bucket_assigned] = current_pivot_idx; + } + (*last_bucket_assigned)++; + return current_pivot_idx; + } + + + switch (left_or_right) + { + case HP_LEFT_SIDE: + { + memcpy(pivot->bounding_box.lb, parent->bounding_box.lb, dims*sizeof(float_t)); + memcpy(pivot->bounding_box.ub, parent->bounding_box.ub, dims*sizeof(float_t)); + idx_t parent_splitting_dim = parent->as.internal.split_variable; + float_t parent_split_val = parent->as.internal.split_value; + pivot->bounding_box.ub[parent_splitting_dim] = parent_split_val; + } + break; + + case HP_RIGHT_SIDE: + { + memcpy(pivot->bounding_box.lb, parent->bounding_box.lb, dims*sizeof(float_t)); + memcpy(pivot->bounding_box.ub, parent->bounding_box.ub, dims*sizeof(float_t)); + idx_t parent_splitting_dim = parent->as.internal.split_variable; + float_t parent_split_val = parent->as.internal.split_value; + pivot->bounding_box.lb[parent_splitting_dim] = parent_split_val; + } + break; + + default: + { + for(int i=0; ibounding_box.lb[i] = FLT_MAX; + pivot->bounding_box.ub[i] = -FLT_MAX; + } + for(idx_t i=start; i<=end; ++i) { + for(idx_t j=0; jbounding_box.lb[j] = MIN(pivot->bounding_box.lb[j], points[i].data[j]); + pivot->bounding_box.ub[j] = MAX(pivot->bounding_box.ub[j], points[i].data[j]); + } + } + } + break; + } + + int split_var = 0; + float_t max_extension = 0; + for(idx_t d=0; d < dims; ++d) { + float_t box_extension = pivot->bounding_box.ub[d] - pivot->bounding_box.lb[d]; + box_extension = box_extension*box_extension; + if (box_extension > max_extension) { + max_extension = box_extension; + split_var = d; + } + } + + int median_idx = faster_point_compute_median(points, start, end, split_var, dims); + + //build tree on sampled points + + if (median_idx > -1) { + pivots[current_pivot_idx].as.internal.split_variable = split_var; + pivots[current_pivot_idx].as.internal.split_value = points[median_idx].data[split_var]; + + pivots[current_pivot_idx].as.internal.lch_idx = recursive_make_pivots(points, pivots, current_pivot_idx * 2 + 1, current_pivot_idx, + HP_LEFT_SIDE, start, median_idx, level - 1, dims, last_bucket_assigned, leaf_pivot_indices); + pivots[current_pivot_idx].as.internal.rch_idx = recursive_make_pivots(points, pivots, current_pivot_idx * 2 + 2, current_pivot_idx, + HP_RIGHT_SIDE, median_idx + 1, end, level - 1, dims, last_bucket_assigned, leaf_pivot_indices); + } + return current_pivot_idx; +} + +static inline idx_t find_bucket(point_t* p, pivot_t* pivots, idx_t root_idx) { + idx_t curr = root_idx; + while (!pivots[curr].is_leaf) { + int split_var = pivots[curr].as.internal.split_variable; + float_t split_val = pivots[curr].as.internal.split_value; + if (p->data[split_var] <= split_val) { + curr = pivots[curr].as.internal.lch_idx; + } else { + curr = pivots[curr].as.internal.rch_idx; + } + } + // The bucket ID was stored in leaf_point_count by recursive_make_pivots + return (idx_t)pivots[curr].as.leaf.leaf_point_count; +} + +static void parallel_partition_points(point_t* points, point_t* scratch_points, + pivot_t* pivots, idx_t root_idx, + idx_t start, idx_t end, + idx_t* bucket_start_indices, + idx_t* bucket_counts, + idx_t n_buckets) { + idx_t n = end - start + 1; + #define PARTITION_BLOCK_SIZE 2048 + idx_t num_blocks = (n + PARTITION_BLOCK_SIZE - 1) / PARTITION_BLOCK_SIZE; + + // Allocate local counts: [num_blocks][n_buckets] + idx_t* block_bucket_counts = (idx_t*)calloc(num_blocks * n_buckets, sizeof(idx_t)); + if (!block_bucket_counts) { + printf("Allocation failed for block_bucket_counts\n"); + exit(1); + } + + // 1. Local histogram + #pragma omp parallel for schedule(static) + for (idx_t b = 0; b < num_blocks; ++b) { + idx_t block_start = b * PARTITION_BLOCK_SIZE; + idx_t block_end = MIN(block_start + PARTITION_BLOCK_SIZE, n); + idx_t* local_counts = &block_bucket_counts[b * n_buckets]; + + for (idx_t i = block_start; i < block_end; ++i) { + // Note: points[start + i] is the correct access + idx_t bucket = find_bucket(&points[start + i], pivots, root_idx); + local_counts[bucket]++; + } + } + + // 2. Global histogram and prefix sum + memset(bucket_counts, 0, n_buckets * sizeof(idx_t)); + for (idx_t b = 0; b < num_blocks; ++b) { + for (idx_t k = 0; k < n_buckets; ++k) { + bucket_counts[k] += block_bucket_counts[b * n_buckets + k]; + } + } + + bucket_start_indices[0] = 0; + for (idx_t k = 1; k < n_buckets; ++k) { + bucket_start_indices[k] = bucket_start_indices[k-1] + bucket_counts[k-1]; + } + + idx_t* block_write_offsets = (idx_t*)malloc(num_blocks * n_buckets * sizeof(idx_t)); + idx_t* current_offsets = (idx_t*)malloc(n_buckets * sizeof(idx_t)); + + memcpy(current_offsets, bucket_start_indices, n_buckets * sizeof(idx_t)); + + for (idx_t b = 0; b < num_blocks; ++b) { + for (idx_t k = 0; k < n_buckets; ++k) { + block_write_offsets[b * n_buckets + k] = current_offsets[k]; + current_offsets[k] += block_bucket_counts[b * n_buckets + k]; + } + } + free(current_offsets); + + // 3. Move points to scratch (using slice at `start`) + #pragma omp parallel for schedule(static) + for (idx_t b = 0; b < num_blocks; ++b) { + idx_t block_start = b * PARTITION_BLOCK_SIZE; + idx_t block_end = MIN(block_start + PARTITION_BLOCK_SIZE, n); + idx_t* local_offsets = &block_write_offsets[b * n_buckets]; + + for (idx_t i = block_start; i < block_end; ++i) { + idx_t bucket = find_bucket(&points[start + i], pivots, root_idx); + idx_t dest_offset = local_offsets[bucket]++; + + // Write to scratch_points at the correct offset relative to `start` + // scratch_points is assumed to be a parallel array to points + scratch_points[start + dest_offset] = points[start + i]; + } + } + + // 4. Copy back to points + #pragma omp parallel for schedule(static) + for(idx_t i=0; i (end - start + 1)) { + return recursive_make_tree(points, pivots, current_pivot_idx, left_or_right, start, end, level, dims); + } + + float_t* samples = (float_t*)calloc(n_samples * dims * sizeof(float_t), 1); + point_t* tmp_points = (point_t*)calloc(n_samples * sizeof(point_t), 1); + + // 1. Random sampling + for(idx_t s=0; sbounding_box.lb, parent->bounding_box.lb, dims*sizeof(float_t)); + memcpy(pivot->bounding_box.ub, parent->bounding_box.ub, dims*sizeof(float_t)); + pivot->bounding_box.ub[parent->as.internal.split_variable] = parent->as.internal.split_value; + break; + case HP_RIGHT_SIDE: + memcpy(pivot->bounding_box.lb, parent->bounding_box.lb, dims*sizeof(float_t)); + memcpy(pivot->bounding_box.ub, parent->bounding_box.ub, dims*sizeof(float_t)); + pivot->bounding_box.lb[parent->as.internal.split_variable] = parent->as.internal.split_value; + break; + default: + for(int i=0; ibounding_box.lb[i] = FLT_MAX; + pivot->bounding_box.ub[i] = -FLT_MAX; + } + for(idx_t i=start; i<=end; ++i) { + for(idx_t j=0; jbounding_box.lb[j] = MIN(pivot->bounding_box.lb[j], points[i].data[j]); + pivot->bounding_box.ub[j] = MAX(pivot->bounding_box.ub[j], points[i].data[j]); + } + } + break; + } + + // 3. Build Sample Tree (Pivots) + idx_t last_bucket_assigned = 0; + idx_t* leaf_pivot_indices = (idx_t*)malloc(n_buckets * sizeof(idx_t)); + + recursive_make_pivots(tmp_points, pivots, current_pivot_idx, parent_pivot_idx, + left_or_right, 0, n_samples - 1, sample_levels, dims, + &last_bucket_assigned, leaf_pivot_indices); + + // 4. Partition Points + idx_t* bucket_start_indices = (idx_t*)malloc(n_buckets * sizeof(idx_t)); + idx_t* bucket_counts = (idx_t*)malloc(n_buckets * sizeof(idx_t)); + + parallel_partition_points(points, scratch_points, pivots, current_pivot_idx, + start, end, bucket_start_indices, bucket_counts, n_buckets); + + // 5. Recursively build subtrees + for(idx_t b = 0; b < n_buckets; ++b) + { + if(bucket_counts[b] > 0) + { + idx_t p_idx = leaf_pivot_indices[b]; + idx_t b_start = start + bucket_start_indices[b]; + idx_t b_end = b_start + bucket_counts[b] - 1; + + // Root is 0. Left child of p is 2p+1. + int side = (p_idx % 2 != 0) ? HP_LEFT_SIDE : HP_RIGHT_SIDE; + if (p_idx == 0) side = -1; + + #pragma omp task + { + pivots[p_idx].is_leaf = 0; + sample_recursive_make_tree(points, scratch_points, pivots, p_idx, + (p_idx > 0) ? (p_idx - 1) / 2 : -1, side, + b_start, b_end, level + sample_levels, dims); + } + } + } + #pragma omp taskwait + + free(leaf_pivot_indices); + free(bucket_start_indices); + free(bucket_counts); + free(samples); + free(tmp_points); + + return current_pivot_idx; +} + +static idx_t parallel_recursive_make_tree(point_t* points, pivot_t* pivots, + idx_t child_pivot_idx, int left_or_right, + idx_t start, idx_t end, int level, idx_t dims); + +static void build_tree_kdtree_parallel(kdtree_t* tree) +{ + /************************************************* + * Wrapper for the parallel make_tree function. * + *************************************************/ + #pragma omp parallel + { + #pragma omp single + { + tree->root = parallel_recursive_make_tree(tree->__points, tree->__pivots, 0, -1, + 0, tree->n_points-1, 0, tree->dims); + } + } + + float_t* tmp_data = (float_t*)MY_MALLOC(tree->n_points * tree->dims * sizeof(float_t)); + + #pragma omp parallel for + for(idx_t i = 0; i < tree->n_points; ++i) + { + float_t* datapoint = tree->__points[i].data; + memcpy(tmp_data + tree->dims * i, datapoint, tree->dims * sizeof(float_t)); + tree->__points[i].data = tmp_data + tree->dims * i; + } + free(tree->data); + tree->data = tmp_data; +} + +static idx_t parallel_recursive_make_tree(point_t* points, pivot_t* pivots, + idx_t child_pivot_idx, int left_or_right, + idx_t start, idx_t end, int level, idx_t dims) +{ + // Fallback to serial version for small subproblems + if ((end - start) < SERIAL_BUILD_CUTOFF) { + return recursive_make_tree(points, pivots, child_pivot_idx, left_or_right, start, end, level, dims); + } + + // This logic is identical to the serial version, as the partitioning step itself remains serial. + // The parallelism comes from recursively calling this function in parallel tasks. + idx_t parent_pivot_idx = (child_pivot_idx > 0) ? ((child_pivot_idx - 1) / 2) : -1; + pivot_t *pivot = pivots + child_pivot_idx; + pivot_t *parent = (parent_pivot_idx != (idx_t)-1) ? (pivots + parent_pivot_idx) : NULL; + + switch (left_or_right) + { + case HP_LEFT_SIDE: + { + memcpy(pivot->bounding_box.lb, parent->bounding_box.lb, dims*sizeof(float_t)); + memcpy(pivot->bounding_box.ub, parent->bounding_box.ub, dims*sizeof(float_t)); + idx_t parent_splitting_dim = parent->as.internal.split_variable; + float_t parent_split_val = parent->as.internal.split_value; + pivot->bounding_box.ub[parent_splitting_dim] = parent_split_val; + } + break; + + case HP_RIGHT_SIDE: + { + memcpy(pivot->bounding_box.lb, parent->bounding_box.lb, dims*sizeof(float_t)); + memcpy(pivot->bounding_box.ub, parent->bounding_box.ub, dims*sizeof(float_t)); + idx_t parent_splitting_dim = parent->as.internal.split_variable; + float_t parent_split_val = parent->as.internal.split_value; + pivot->bounding_box.lb[parent_splitting_dim] = parent_split_val; + } + break; + + default: + { + for(int i=0; ibounding_box.lb[i] = FLT_MAX; + pivot->bounding_box.ub[i] = -FLT_MAX; + } + for(idx_t i=start; i<=end; ++i) { + for(idx_t j=0; jbounding_box.lb[j] = MIN(pivot->bounding_box.lb[j], points[i].data[j]); + pivot->bounding_box.ub[j] = MAX(pivot->bounding_box.ub[j], points[i].data[j]); + } + } + } + break; + } + + int split_var = 0; + float_t max_extension = 0; + for(idx_t d=0; d < dims; ++d) { + float_t box_extension = pivot->bounding_box.ub[d] - pivot->bounding_box.lb[d]; + box_extension = box_extension*box_extension; + if (box_extension > max_extension) { + max_extension = box_extension; + split_var = d; + } + } + + if (end - start < DEFAULT_LEAF_SIZE) { + pivots[child_pivot_idx].is_leaf = 1; + pivots[child_pivot_idx].as.leaf.leaf_point_count = (size_t)(end - start + 1); + pivots[child_pivot_idx].as.leaf.point_list_idx = start; + return child_pivot_idx; + } + + int median_idx = faster_point_compute_median(points, start, end, split_var, dims); + + if (median_idx > -1) { + pivots[child_pivot_idx].as.internal.split_variable = split_var; + pivots[child_pivot_idx].as.internal.split_value = points[median_idx].data[split_var]; + + #pragma omp task + pivots[child_pivot_idx].as.internal.lch_idx = + parallel_recursive_make_tree(points, pivots, child_pivot_idx*2 + 1, + HP_LEFT_SIDE, start, median_idx, level + 1, dims); + + #pragma omp task + pivots[child_pivot_idx].as.internal.rch_idx = + parallel_recursive_make_tree(points, pivots, child_pivot_idx*2 + 2, + HP_RIGHT_SIDE, median_idx + 1, end, level + 1, dims); + + #pragma omp taskwait + } + + return child_pivot_idx; +} + + +static void kdtree_free(kdtree_t *tree) +{ + + FREE(tree->__pivots) + FREE(tree->__points) + FREE(tree->__boxes_data); + FREE(tree->data); +} + +static void build_tree_kdtree(kdtree_t* tree) +{ + /************************************************* + * Wrapper for make_tree function. * + * Simplifies interfaces and takes time measures * + *************************************************/ + // tree->root = recursive_make_tree(tree->__points, tree->__pivots, 0, -1, 0, tree->n_points-1, 0, tree->dims); + tree->root = parallel_recursive_make_tree(tree->__points, tree->__pivots, 0, -1, 0, tree->n_points-1, 0, tree->dims); + + float_t* tmp_data = (float_t*)MY_MALLOC(tree->n_points * tree->dims * sizeof(float_t)); + #pragma omp parallel for + for(idx_t i = 0; i < tree->n_points; ++i) + { + float_t* datapoint = tree->__points[i].data; + memcpy(tmp_data + tree->dims * i, datapoint, tree->dims * sizeof(float_t)); + tree->__points[i].data = tmp_data + tree->dims * i; + } + free(tree->data); + tree->data = tmp_data; +} + +static void build_tree_kdtree_sample(kdtree_t* tree) +{ + point_t* scratch_points = (point_t*)MY_MALLOC(tree->n_points * sizeof(point_t)); + + #pragma omp parallel + { + #pragma omp single + { + tree->root = sample_recursive_make_tree(tree->__points, scratch_points, tree->__pivots, 0, -1, + -1, 0, tree->n_points-1, 0, tree->dims); + } + } + + FREE(scratch_points); + + float_t* tmp_data = (float_t*)MY_MALLOC(tree->n_points * tree->dims * sizeof(float_t)); + + #pragma omp parallel for + for(idx_t i = 0; i < tree->n_points; ++i) + { + float_t* datapoint = tree->__points[i].data; + memcpy(tmp_data + tree->dims * i, datapoint, tree->dims * sizeof(float_t)); + tree->__points[i].data = tmp_data + tree->dims * i; + } + free(tree->data); + tree->data = tmp_data; +} diff --git a/src/tree/kdtreeV2.c b/src/tree/kdtreeV2.c deleted file mode 100644 index f7b5e7d..0000000 --- a/src/tree/kdtreeV2.c +++ /dev/null @@ -1,378 +0,0 @@ -#include "kdtreeV2.h" -#include "heap.h" -#include -#include -#include - -#define DEFAULT_LEAF_SIZE 30 - -extern unsigned int data_dims; - -FLOAT_TYPE eud_kdtree_v2(FLOAT_TYPE *restrict p1, FLOAT_TYPE *restrict p2) { - register FLOAT_TYPE d = 0; - for (unsigned int i = 0; i < data_dims; ++i) { - register FLOAT_TYPE dd = (p1[i] - p2[i]); - d += dd * dd; - } - return d; - // return sqrt(d); -} - -#ifdef USE_FLOAT32 -typedef float v4f __attribute__((vector_size(16))); -#else -typedef double v4f __attribute__((vector_size(32))); -#endif - -FLOAT_TYPE eud_kdtree_v2_2(FLOAT_TYPE *restrict u, FLOAT_TYPE *restrict v) { - register float_t s; - uint32_t i = 0; - // manually unrolled loop, might be vectorized - register v4f acc = {0., 0., 0., 0.}; - for (; i + 4 <= data_dims; i += 4) { - register v4f _u = {u[i], u[i + 1], u[i + 2], u[i + 3]}; - register v4f _v = {v[i], v[i + 1], v[i + 2], v[i + 3]}; - register v4f _diff = _u - _v; - acc = _diff * _diff; - } - - s = acc[0] + acc[1] + acc[2] + acc[3]; - if (i < data_dims) { - for (; i < data_dims; ++i) { - float_t d = u[i] - v[i]; - s += d * d; - } - } - return s; -} - -FLOAT_TYPE *swapMem_kdv2; - -void swap_kdnode_v2(kdnode_v2 *x, kdnode_v2 *y) { - kdnode_v2 tmp; - tmp = *x; - *x = *y; - *y = tmp; - -#ifdef SWMEM - memcpy(swapMem_kdv2, x->data, sizeof(FLOAT_TYPE) * data_dims); - memcpy(x->data, y->data, sizeof(FLOAT_TYPE) * data_dims); - memcpy(y->data, swapMem_kdv2, sizeof(FLOAT_TYPE) * data_dims); - FLOAT_TYPE *tmpPtr = x->data; - x->data = y->data; - y->data = tmpPtr; -#endif -} - -/** - * - * KDtree implementation - * - * - */ - -void initialize_kdnodes_v2(kdnode_v2 *node_array, FLOAT_TYPE *d, idx_t n) { - for (idx_t i = 0; i < n; ++i) { - node_array[i].data = d + (i * data_dims); - node_array[i].array_idx = i; - node_array[i].lch = NULL; - node_array[i].rch = NULL; - node_array[i].parent = NULL; - node_array[i].level = -1; - node_array[i].is_leaf = 0; - node_array[i].split_var = -1; - node_array[i].node_list.data = NULL; - node_array[i].node_list.count = 0; - } -} -/* -void initializePTRS(kdnode_v2** node_ptr_array, kdnode_v2* node_array, idx_t n ) -{ - for(idx_t i = 0; i < n; ++i) - { - node_ptr_array[i] = node_array + i; - } -} -*/ - -int cmpKDnodesV2(kdnode_v2 *a, kdnode_v2 *b, int var) { - - FLOAT_TYPE res = a->data[var] - b->data[var]; - return (res > 0); -} - -void printKDnodeV2(kdnode_v2 *node) { - printf("Node %p:\n", node); - printf("\t array_idx: %lu\n", (uint64_t)(node->array_idx)); - printf("\t data: "); - for (unsigned int i = 0; i < data_dims; ++i) - printf(" %f ", node->data[i]); - printf("\n"); - printf("\t parent: %p\n", node->parent); - printf("\t lch: %p\n", node->lch); - printf("\t rch: %p\n", node->rch); - printf("\t level: %d\n", node->level); - printf("\t split_var: %d\n", node->split_var); - printf("\n"); -} - -// Standard Lomuto partition function - -int partition_kdnode_v2(kdnode_v2 *arr, int low, int high, int split_var) { - kdnode_v2 pivot = arr[high]; - - int i = (low - 1); - for (int j = low; j <= high - 1; j++) { - if (!cmpKDnodesV2(arr + j, &pivot, split_var)) { - i++; - swap_kdnode_v2(arr + i, arr + j); - } - } - swap_kdnode_v2(arr + i + 1, arr + high); - return (i + 1); -} - -// Implementation of QuickSelect -int median_of_nodes_kdnode_v2(kdnode_v2 *a, int left, int right, - int split_var) { - // printf("----------\n"); - int k = left + ((right - left + 1) / 2); - - if (left == right) - return left; - if (left == (right - 1)) { - if (cmpKDnodesV2(a + left, a + right, split_var)) { - swap_kdnode_v2(a + left, a + right); - } - return right; - } - while (left <= right) { - - // Partition a[left..right] around a pivot - // and find the position of the pivot - int pivotIndex = partition_kdnode_v2(a, left, right, split_var); - // printf("%d %d %d %d\n",left, right, k, pivotIndex); - - // If pivot itself is the k-th smallest element - if (pivotIndex == k) - return pivotIndex; - - // If there are more than k-1 elements on - // left of pivot, then k-th smallest must be - // on left side. - - else if (pivotIndex > k) - right = pivotIndex - 1; - - // Else k-th smallest is on right side. - else - left = pivotIndex + 1; - } - return -1; -} - -kdnode_v2 *make_tree_kdnode_v2(kdnode_v2 *t, int start, int end, - kdnode_v2 *parent, int level) { - kdnode_v2 *n = NULL; - int split_var = level % data_dims; - FLOAT_TYPE max_diff = -999999.; - for (unsigned int v = 0; v < data_dims; ++v) { - FLOAT_TYPE max_v = -9999999.; - FLOAT_TYPE min_v = 9999999.; - for (int i = start; i <= end; ++i) { - max_v = t[i].data[v] > max_v ? t[i].data[v] : max_v; - min_v = t[i].data[v] < min_v ? t[i].data[v] : min_v; - } - if ((max_v - min_v) > max_diff) { - max_diff = max_v - min_v; - split_var = v; - } - } - - /* - #ifdef SWMEM - if(parent == NULL) - { - swapMem_kdv2 = (FLOAT_TYPE*)malloc(sizeof(FLOAT_TYPE)*data_dims); - } - #endif - */ - - if (end - start < DEFAULT_LEAF_SIZE) { - n = t + start; - n->is_leaf = 1; - n->parent = parent; - n->lch = NULL; - n->rch = NULL; - size_t j = 0; - n->node_list.count = (size_t)(end - start + 1); - n->node_list.data = - (kdnode_v2 **)malloc(n->node_list.count * sizeof(kdnode_v2 *)); - for (int i = start; i <= end; ++i) { - t[i].parent = n; - t[i].is_leaf = 1; - t[i].lch = NULL; - t[i].rch = NULL; - n->node_list.data[j] = t + i; - ++j; - } - return n; - } - - int median_idx = -1; - - // if ((end - start) < 0) return 0; - // if (end == start) { - // n = t + start; - // n -> split_var = split_var; - // n->parent = parent; - // n->level = level; - // n -> lch = NULL; - // n -> rch = NULL; - // return n; - // } - - median_idx = median_of_nodes_kdnode_v2(t, start, end, split_var); - // printf("%d median idx\n", median_idx); - if (median_idx > -1) { - swap_kdnode_v2(t + start, t + median_idx); - // n = t + median_idx; - n = t + start; - // n->lch = make_tree_kdnode_v2(t, start, median_idx - 1, n, level + 1); - n->lch = make_tree_kdnode_v2(t, start + 1, median_idx, n, level + 1); - // n->rch = make_tree_kdnode_v2(t, median_idx + 1, end, n, level + 1); - n->rch = make_tree_kdnode_v2(t, median_idx + 1, end, n, level + 1); - n->split_var = split_var; - n->parent = parent; - n->level = level; - } - - /* - #ifdef SWMEM - if(parent == NULL) - { - swapMem_kdv2 = malloc(sizeof(FLOAT_TYPE)*data_dims); - } - #endif - */ - - return n; -} - -static inline FLOAT_TYPE hyper_plane_dist(FLOAT_TYPE *p1, FLOAT_TYPE *p2, - int var) { - return p1[var] - p2[var]; -} - -static inline int hyper_plane_side(FLOAT_TYPE *p1, FLOAT_TYPE *p2, int var) { - return p1[var] > p2[var]; -} - -void knn_sub_tree_search_kdtree_v2(FLOAT_TYPE *point, kdnode_v2 *root, - heap *H) { - if (root->is_leaf) { - for (size_t i = 0; i < root->node_list.count; ++i) { - kdnode_v2 *n = root->node_list.data[i]; - //__builtin_prefetch(root -> node_list.data + i + 1, 0, 3); - FLOAT_TYPE distance = eud_kdtree_v2(point, n->data); - insert_max_heap(H, distance, n->array_idx); - } - return; - } - - FLOAT_TYPE current_distance = eud_kdtree_v2(point, root->data); - FLOAT_TYPE hp_distance = hyper_plane_dist(point, root->data, root->split_var); - insert_max_heap(H, current_distance, root->array_idx); - __builtin_prefetch(root->lch, 0, 3); - __builtin_prefetch(root->rch, 0, 3); - - int side = hp_distance > 0.f; - - switch (side) { - case HP_LEFT_SIDE: - if (root->lch) { - knn_sub_tree_search_kdtree_v2(point, root->lch, H); - } - break; - - case HP_RIGHT_SIDE: - if (root->rch) { - knn_sub_tree_search_kdtree_v2(point, root->rch, H); - } - break; - - default: - break; - } - FLOAT_TYPE max_d = H->data[0].value; - int c = max_d > (hp_distance * hp_distance); - - if (c || (H->count) < (H->N)) - // if(c) - { - - switch (side) { - case HP_LEFT_SIDE: - if (root->rch) { - knn_sub_tree_search_kdtree_v2(point, root->rch, H); - } - break; - - case HP_RIGHT_SIDE: - if (root->lch) { - knn_sub_tree_search_kdtree_v2(point, root->lch, H); - } - break; - - default: - break; - } - } - return; -} - -void kdtree_v2_init(kdtree_v2 *tree, FLOAT_TYPE *data, size_t n_nodes, - unsigned int dims) { - data_dims = dims; - tree->n_nodes = n_nodes; - tree->_nodes = (kdnode_v2 *)malloc(n_nodes * sizeof(kdnode_v2)); - initialize_kdnodes_v2(tree->_nodes, data, n_nodes); - tree->root = NULL; -} - -void kdtree_v2_free(kdtree_v2 *tree) { - for (uint64_t i = 0; i < tree->n_nodes; ++i) - if (tree->_nodes[i].node_list.data) - free(tree->_nodes[i].node_list.data); - - free(tree->_nodes); -} - -heap knn_kdtree_v2(FLOAT_TYPE *point, kdnode_v2 *kdtree_root, int maxk) { - heap H; - allocate_heap(&H, maxk); - init_heap(&H); - knn_sub_tree_search_kdtree_v2(point, kdtree_root, &H); - heap_sort(&H); - return H; -} - -heap knn_kdtree_v2_no_heapsort(FLOAT_TYPE *point, kdnode_v2 *kdtree_root, - int maxk) { - heap H; - allocate_heap(&H, maxk); - init_heap(&H); - knn_sub_tree_search_kdtree_v2(point, kdtree_root, &H); - return H; -} - -kdnode_v2 *build_tree_kdtree_v2(kdnode_v2 *kd_ptrs, size_t n, - size_t dimensions) { - /************************************************* - * Wrapper for make_tree function. * - * Simplifies interfaces and takes time measures * - *************************************************/ - data_dims = dimensions; - kdnode_v2 *root = make_tree_kdnode_v2(kd_ptrs, 0, n - 1, NULL, 0); - return root; -} diff --git a/src/tree/kdtreeV2.h b/src/tree/kdtreeV2.h deleted file mode 100644 index 78d7e3a..0000000 --- a/src/tree/kdtreeV2.h +++ /dev/null @@ -1,65 +0,0 @@ -#pragma once -#include "heap.h" -#include -#include -#include -#include -#include -#include -#define T double -#define DATA_DIMS 0 - -#ifdef USE_FLOAT32 - #define FLOAT_TYPE float -#else - #define FLOAT_TYPE double -#endif - -#ifdef USE_INT32 - #define MY_SIZE_MAX UINT32_MAX - #define idx_t uint32_t -#else - #define MY_SIZE_MAX UINT64_MAX - #define idx_t uint64_t -#endif - -struct kdnode_v2_list -{ - size_t count; - struct kdnode_v2** data; -}; - - -struct kdnode_v2 -{ - FLOAT_TYPE * data; - int level; - int split_var; - int is_leaf; - idx_t array_idx; - struct kdnode_v2* parent; - struct kdnode_v2* lch; - struct kdnode_v2* rch; - struct kdnode_v2_list node_list; -}; - -struct kdtree_v2 -{ - size_t n_nodes; - struct kdnode_v2* _nodes; - struct kdnode_v2* root; -}; - - -typedef struct kdnode_v2 kdnode_v2; -typedef struct kdtree_v2 kdtree_v2; - -void initialize_kdnodes_v2(kdnode_v2* node_array, FLOAT_TYPE* d, idx_t n ); - -heap knn_kdtree_v2(FLOAT_TYPE* point, kdnode_v2* kdtree_root, int maxk); -heap knn_kdtree_v2_no_heapsort(FLOAT_TYPE* point, kdnode_v2* kdtree_root, int maxk); - -kdnode_v2 * build_tree_kdtree_v2(kdnode_v2* kd_ptrs, size_t n, size_t dimensions); -void knn_sub_tree_search_kdtree_v2(FLOAT_TYPE* point, kdnode_v2* root, heap * H); -void kdtree_v2_init(kdtree_v2* tree, FLOAT_TYPE* data, size_t n_nodes, unsigned int dims); -void kdtree_v2_free(kdtree_v2* tree); diff --git a/src/tree/tree.c b/src/tree/tree.c index d3e0ada..4dc46cf 100644 --- a/src/tree/tree.c +++ b/src/tree/tree.c @@ -9,7 +9,7 @@ */ #include "tree.h" #include "heap.h" -#include "kdtreeV2.h" +#include "kdtree.h" #include "mpi.h" #include #include @@ -18,8 +18,9 @@ #include #include #include +#include - +#define NBINS 20 /* * Maximum bytes to send with a single mpi send/recv, used * while communicating results of ngbh search @@ -29,8 +30,8 @@ //#define MAX_MSG_SIZE 4294967296 /* Used slices of 10 mb ? Really good? Maybe at the cause of TID thing */ -// #define MAX_MSG_SIZE (10000 * k * sizeof(heap_node)) -#define MAX_MSG_SIZE (100000 * k * sizeof(heap_node)) +// #define MAX_MSG_SIZE (10000 * k * sizeof(heap_node_t)) +#define MAX_MSG_SIZE (100000 * k * sizeof(heap_node_t)) #define TOP_TREE_RCH 1 @@ -51,13 +52,18 @@ int cmp_float_t(const void* a, const void* b) /* quickselect for an element along a dimension */ -void swap_data_element(float_t *a, float_t *b, size_t vec_len) { +void swap_data_element(point_t *a, point_t *b, size_t vec_len) { + idx_t tmp_dp; + tmp_dp = a->array_idx; + a->array_idx = b->array_idx; + a->array_idx = tmp_dp; + float_t tmp; for (size_t i = 0; i < vec_len; ++i) { - tmp = a[i]; - a[i] = b[i]; - b[i] = tmp; + tmp = a->data[i]; + a->data[i] = b->data[i]; + b->data[i] = tmp; } } @@ -65,19 +71,19 @@ int compare_data_element(float_t *a, float_t *b, int compare_dim) { return -((a[compare_dim] - b[compare_dim]) > 0.) + ((a[compare_dim] - b[compare_dim]) < 0.); } -int partition_data_element(float_t *array, int vec_len, int compare_dim, +int partition_data_element(point_t *array, int vec_len, int compare_dim, int left, int right, int pivot_index) { int store_index = left; int i; /* Move pivot to end */ - swap_data_element(array + pivot_index * vec_len, array + right * vec_len, vec_len); + swap_data_element(array + pivot_index, array + right, vec_len); for (i = left; i < right; ++i) { // if(compare_data_element(array + i*vec_len, array + pivot_index*vec_len, // compare_dim ) >= 0){ - if (array[i * vec_len + compare_dim] < array[right * vec_len + compare_dim]) + if (array[i].data[compare_dim] < array[right].data[compare_dim]) { swap_data_element(array + store_index * vec_len, array + i * vec_len, vec_len); store_index += 1; @@ -89,7 +95,7 @@ int partition_data_element(float_t *array, int vec_len, int compare_dim, return store_index; } -int qselect_data_element(float_t *array, int vec_len, int compare_dim, int left, int right, int n) +int qselect_data_element(point_t *array, int vec_len, int compare_dim, int left, int right, int n) { int pivot_index; if (left == right) @@ -113,25 +119,28 @@ int qselect_data_element(float_t *array, int vec_len, int compare_dim, int left, } } -int quickselect_data_element(float_t *array, int vec_len, int array_size, int compare_dim, int k) +int quickselect_data_element(point_t *array, int vec_len, int array_size, int compare_dim, int k) { return qselect_data_element(array, vec_len, compare_dim, 0, array_size - 1, k - 1); } int CMP_DIM; -int compare_data_element_sort(const void *a, const void *b) { +int compare_data_element_sort(const void *a, const void *b) +{ float_t aa = *((float_t *)a + CMP_DIM); float_t bb = *((float_t *)b + CMP_DIM); return ((aa - bb) > 0.) - ((aa - bb) < 0.); } -void compute_bounding_box(global_context_t *ctx) { +void compute_bounding_box(global_context_t *ctx) +{ ctx->lb_box = (float_t *)MY_MALLOC(ctx->dims * sizeof(float_t)); ctx->ub_box = (float_t *)MY_MALLOC(ctx->dims * sizeof(float_t)); - for (size_t d = 0; d < ctx->dims; ++d) { - ctx->lb_box[d] = 99999999.; - ctx->ub_box[d] = -99999999.; + for (size_t d = 0; d < ctx->dims; ++d) + { + ctx->lb_box[d] = FLT_MAX; + ctx->ub_box[d] = -FLT_MAX; } #define local_data ctx->local_data @@ -186,73 +195,70 @@ partition_t dequeue_partition(partition_queue_t *queue) return queue->data[--(queue->count)]; } -void compute_medians_and_check(global_context_t *ctx, float_t *data) { - float_t prop = 0.5; - int k = (int)(ctx->local_n_points * prop); - int d = 1; - - /*quick select on a particular dimension */ - CMP_DIM = d; - int kk = (k - 1) * ctx->dims; - - int count = 0; - // idx = idx - 1; - // - int aaa = quickselect_data_element(ctx->local_data, (int)(ctx->dims), (int)(ctx->local_n_points), d, k); - /* - * sanity check - * check if the median found in each node is - * a median - */ - - float_t *medians_rcv = (float_t *)MY_MALLOC(ctx->dims * ctx->world_size * sizeof(float_t)); - - /* - * MPI_Allgather( const void *sendbuf, - * int sendcount, - * MPI_Datatype sendtype, - * void *recvbuf, - * int recvcount, - * MPI_Datatype recvtype, - * MPI_Comm comm) - */ - - /* Exchange medians */ - - MPI_Allgather(ctx->local_data + kk, ctx->dims, MPI_MY_FLOAT, medians_rcv, ctx->dims, MPI_MY_FLOAT, ctx->mpi_communicator); - - /* sort medians on each node */ - - CMP_DIM = d; - qsort(medians_rcv, ctx->world_size, ctx->dims * sizeof(float_t), compare_data_element_sort); - - /* - * Evaluate goodness of the median on master which has whole dataset - */ - - if (ctx->mpi_rank == 0) { - int count = 0; - int idx = (int)(prop * (ctx->world_size)); - // idx = idx - 1; - for (int i = 0; i < ctx->n_points; ++i) - { - count += data[i * ctx->dims + d] <= medians_rcv[idx * ctx->dims + d]; - } - mpi_printf(ctx, "Choosing %lf percentile on dimension %d: empirical prop %lf\n", prop, d, (float_t)count / (float_t)(ctx->n_points)); - } - free(medians_rcv); -} +// void compute_medians_and_check(global_context_t *ctx, float_t *data) { +// float_t prop = 0.5; +// int k = (int)(ctx->local_n_points * prop); +// int d = 1; +// +// /*quick select on a particular dimension */ +// CMP_DIM = d; +// int kk = (k - 1) * ctx->dims; +// +// int count = 0; +// // idx = idx - 1; +// // +// int aaa = quickselect_data_element(ctx->data_w_idx, (int)(ctx->dims), (int)(ctx->local_n_points), d, k); +// /* +// * sanity check +// * check if the median found in each node is +// * a median +// */ +// +// float_t *medians_rcv = (float_t *)MY_MALLOC(ctx->dims * ctx->world_size * sizeof(float_t)); +// +// /* +// * MPI_Allgather( const void *sendbuf, +// * int sendcount, +// * MPI_Datatype sendtype, +// * void *recvbuf, +// * int recvcount, +// * MPI_Datatype recvtype, +// * MPI_Comm comm) +// */ +// +// /* Exchange medians */ +// +// MPI_Allgather(ctx->local_data + kk, ctx->dims, MPI_MY_FLOAT, medians_rcv, ctx->dims, MPI_MY_FLOAT, ctx->mpi_communicator); +// +// /* sort medians on each node */ +// +// CMP_DIM = d; +// qsort(medians_rcv, ctx->world_size, ctx->dims * sizeof(float_t), compare_data_element_sort); +// +// /* +// * Evaluate goodness of the median on master which has whole dataset +// */ +// +// if (ctx->mpi_rank == 0) { +// int count = 0; +// int idx = (int)(prop * (ctx->world_size)); +// // idx = idx - 1; +// for (int i = 0; i < ctx->n_points; ++i) +// { +// count += data[i * ctx->dims + d] <= medians_rcv[idx * ctx->dims + d]; +// } +// mpi_printf(ctx, "Choosing %lf percentile on dimension %d: empirical prop %lf\n", prop, d, (float_t)count / (float_t)(ctx->n_points)); +// } +// free(medians_rcv); +// } float_t check_pc_pointset_parallel(global_context_t *ctx, pointset_t *ps, guess_t g, int d, float_t prop) { - /* - * ONLY FOR TEST PURPOSES - * gather on master all data - * perform the count on master - */ + size_t pvt_count = 0; + #pragma omp parallel for reduction(+:pvt_count) for (size_t i = 0; i < ps->n_points; ++i) { - pvt_count += ps->data[i * ps->dims + d] <= g.x_guess; + pvt_count += ps->datapoints[i].data[d] <= g.x_guess; } size_t pvt_n_and_tot[2] = {pvt_count, ps->n_points}; @@ -268,55 +274,61 @@ float_t check_pc_pointset_parallel(global_context_t *ctx, pointset_t *ps, guess_ return ep; } -void compute_bounding_box_pointset(global_context_t *ctx, pointset_t *ps) { +void compute_bounding_box_data(global_context_t *ctx, float_t *data, float_t* lb_box, float_t* ub_box, idx_t n_points) +{ + #define lb lb_box + #define ub ub_box - for (size_t d = 0; d < ps->dims; ++d) + for (size_t d = 0; d < ctx->dims; ++d) { - ps->lb_box[d] = 99999999.; - ps->ub_box[d] = -99999999.; + lb[d] = FLT_MAX; + ub[d] = -FLT_MAX; } - #define local_data ps->data - #define lb ps->lb_box - #define ub ps->ub_box /* compute minimum and maximum for each dimensions, store them in local bb */ /* each processor on its own */ - /* TODO: reduction using omp directive */ - - for (size_t i = 0; i < ps->n_points; ++i) + // this moves memory maybe there is a better way + #pragma omp parallel { - for (size_t d = 0; d < ps->dims; ++d) + float_t* pvt_lb = (float_t*)MY_MALLOC(ctx -> dims * sizeof(float_t)); + float_t* pvt_ub = (float_t*)MY_MALLOC(ctx -> dims * sizeof(float_t)); + for (size_t d = 0; d < ctx->dims; ++d) { - lb[d] = MIN(local_data[i * ps->dims + d], lb[d]); - ub[d] = MAX(local_data[i * ps->dims + d], ub[d]); + pvt_lb[d] = FLT_MAX; + pvt_ub[d] = -FLT_MAX; } - } - /* Reduce to obtain bb */ - /* - MPI_Allreduce( const void *sendbuf, - void *recvbuf, - int count, - MPI_Datatype datatype, - MPI_Op op, - MPI_Comm comm) - */ + #pragma omp for + for (size_t i = 0; i < n_points; ++i) + { + for (size_t d = 0; d < ctx->dims; ++d) + { + pvt_lb[d] = MIN(data[i*ctx->dims + d], pvt_lb[d]); + pvt_ub[d] = MAX(data[i*ctx->dims + d], pvt_ub[d]); + } + } - /*get the bounding box */ + #pragma omp critical (bounding_box_reduction) + { + for (size_t d = 0; d < ctx->dims; ++d) + { + lb[d] = MIN(pvt_lb[d], lb[d]); + ub[d] = MAX(pvt_ub[d], ub[d]); + } + } - MPI_Allreduce(MPI_IN_PLACE, lb, ps->dims, MPI_MY_FLOAT, MPI_MIN, ctx->mpi_communicator); - MPI_Allreduce(MPI_IN_PLACE, ub, ps->dims, MPI_MY_FLOAT, MPI_MAX, ctx->mpi_communicator); - /* - DB_PRINT("[RANK %d]:", ctx -> mpi_rank); - for(size_t d = 0; d < ctx -> dims; ++d) - { - DB_PRINT("%lf ", ctx -> ub_box[d]); + free(pvt_lb); + free(pvt_ub); + } - DB_PRINT("\n"); - */ + + /*get the bounding box */ + + MPI_Allreduce(MPI_IN_PLACE, lb, ctx->dims, MPI_MY_FLOAT, MPI_MIN, ctx->mpi_communicator); + MPI_Allreduce(MPI_IN_PLACE, ub, ctx->dims, MPI_MY_FLOAT, MPI_MAX, ctx->mpi_communicator); /* MPI_DB_PRINT("[PS BOUNDING BOX]: "); @@ -324,7 +336,69 @@ void compute_bounding_box_pointset(global_context_t *ctx, pointset_t *ps) { lb[d], ub[d]); MPI_DB_PRINT("\n"); */ + #undef lb + #undef ub +} + +void compute_bounding_box_pointset(global_context_t *ctx, pointset_t *ps) { + #define local_data ps->datapoints + #define lb ps->lb_box + #define ub ps->ub_box + + for (size_t d = 0; d < ps->dims; ++d) + { + ps->lb_box[d] = FLT_MAX; + ps->ub_box[d] = -FLT_MAX; + } + + + /* compute minimum and maximum for each dimensions, store them in local bb */ + /* each processor on its own */ + + // this moves memory maybe there is a better way + #pragma omp parallel + { + float_t* pvt_lb = (float_t*)MY_MALLOC(ctx -> dims * sizeof(float_t)); + float_t* pvt_ub = (float_t*)MY_MALLOC(ctx -> dims * sizeof(float_t)); + for (size_t d = 0; d < ps->dims; ++d) + { + pvt_lb[d] = FLT_MAX; + pvt_ub[d] = -FLT_MAX; + } + + #pragma omp for + for (size_t i = 0; i < ps->n_points; ++i) + { + for (size_t d = 0; d < ps->dims; ++d) + { + pvt_lb[d] = MIN(local_data[i].data[d], pvt_lb[d]); + pvt_ub[d] = MAX(local_data[i].data[d], pvt_ub[d]); + } + } + + #pragma omp critical (bounding_box_reduction) + { + for (size_t d = 0; d < ps->dims; ++d) + { + lb[d] = MIN(pvt_lb[d], lb[d]); + ub[d] = MAX(pvt_ub[d], ub[d]); + } + } + + + free(pvt_lb); + free(pvt_ub); + + } + + /*get the bounding box */ + + MPI_Allreduce(MPI_IN_PLACE, lb, ps->dims, MPI_MY_FLOAT, MPI_MIN, ctx->mpi_communicator); + MPI_Allreduce(MPI_IN_PLACE, ub, ps->dims, MPI_MY_FLOAT, MPI_MAX, ctx->mpi_communicator); + // MPI_DB_PRINT("[PS BOUNDING BOX]: "); + // for(size_t d = 0; d < ps -> dims; ++d) MPI_DB_PRINT("d%d:[%lf, %lf] ",(int)d, + // lb[d], ub[d]); MPI_DB_PRINT("\n"); #undef local_data #undef lb @@ -333,8 +407,8 @@ void compute_bounding_box_pointset(global_context_t *ctx, pointset_t *ps) { guess_t retrieve_guess_pure(global_context_t *ctx, pointset_t *ps, - uint64_t *global_bin_counts, - int k_global, int d, float_t pc) + uint64_t *global_bin_counts, + int k_global, int d, float_t pc) { /* @@ -348,8 +422,8 @@ guess_t retrieve_guess_pure(global_context_t *ctx, pointset_t *ps, MPI_DB_PRINT("[ "); for(int i = 0; i < k_global; ++i) { - MPI_DB_PRINT("%lu %lf --- ", global_bin_counts[i], - (float_t)global_bin_counts[i]/(float_t)total_count); + MPI_DB_PRINT( "%lu %lf --- ", global_bin_counts[i], + (float_t)global_bin_counts[i]/(float_t)total_count); } MPI_DB_PRINT("\n"); */ @@ -376,11 +450,7 @@ guess_t retrieve_guess_pure(global_context_t *ctx, pointset_t *ps, float_t x_guess = (pc - y0) / (y1 - y0) * (x1 - x0) + x0; - /* - MPI_DB_PRINT("[MASTER] best guess @ %lf is %lf on bin %d on dimension %d --- x0 %lf x1 %lf y0 %lf y1 %lf\n",pc, x_guess,idx, d, x0, x1, y0, y1); - */ - - + // MPI_DB_PRINT("[MASTER] best guess @ %lf is %lf on bin %d on dimension %d --- x0 %lf x1 %lf y0 %lf y1 %lf\n",pc, x_guess,idx, d, x0, x1, y0, y1); guess_t g = {.bin_idx = idx, .x_guess = x_guess}; return g; @@ -416,32 +486,32 @@ void global_binning_check(global_context_t *ctx, float_t *data, int d, int k) } } +// TODO: k_global better to define it as a #define void compute_pure_global_binning(global_context_t *ctx, pointset_t *ps, uint64_t *global_bin_counts, int k_global, int d) { /* compute binning of data along dimension d */ - uint64_t *local_bin_count = (uint64_t *)MY_MALLOC(k_global * sizeof(uint64_t)); + uint64_t local_bin_count[NBINS]; for (size_t k = 0; k < k_global; ++k) { local_bin_count[k] = 0; global_bin_counts[k] = 0; } - /* - MPI_DB_PRINT("[PS BOUNDING BOX %d]: ", ctx -> mpi_rank); - for(size_t d = 0; d < ps -> dims; ++d) MPI_DB_PRINT("d%d:[%lf, %lf] ",(int)d, ps -> lb_box[d], ps -> ub_box[d]); MPI_DB_PRINT("\n"); - MPI_DB_PRINT("\n"); - */ + // MPI_DB_PRINT("[PS BOUNDING BOX %d]: ", ctx -> mpi_rank); + // for(size_t d = 0; d < ps -> dims; ++d) MPI_DB_PRINT("d%d:[%lf, %lf] ",(int)d, ps -> lb_box[d], ps -> ub_box[d]); MPI_DB_PRINT("\n"); + // MPI_DB_PRINT("\n"); float_t bin_w = (ps-> ub_box[d] - ps->lb_box[d]) / (float_t)k_global; /* THIS IS problematic when ps -> ub_box == ps -> lb_box */ + // TODO: this should be moved to a thread_private reduction #pragma omp parallel for for (size_t i = 0; i < ps->n_points; ++i) { - float_t p = ps->data[i * ps->dims + d]; + float_t p = ps->datapoints[i].data[d]; /* to prevent the border point in the box to have bin_idx == k_global causing invalid memory access */ int bin_idx = MIN((int)((p - ps->lb_box[d]) / bin_w), k_global - 1); @@ -450,11 +520,16 @@ void compute_pure_global_binning(global_context_t *ctx, pointset_t *ps, } MPI_Allreduce(local_bin_count, global_bin_counts, k_global, MPI_UNSIGNED_LONG, MPI_SUM, ctx->mpi_communicator); - free(local_bin_count); + // MPI_DB_PRINT("BIN COUNT: [ "); + // for(int i = 0; i < k_global; ++i) MPI_DB_PRINT(" %lu ", local_bin_count[i]); + // MPI_DB_PRINT(" ]\n"); + + //free(local_bin_count); } -size_t partition_data_around_value(float_t *array, size_t vec_len, size_t compare_dim, - size_t left, size_t right, float_t pivot_value) + +size_t partition_data_around_value(point_t *array, size_t vec_len, size_t compare_dim, + size_t left, size_t right, float_t pivot_value) { /* * returns the number of elements less than the pivot @@ -462,12 +537,14 @@ size_t partition_data_around_value(float_t *array, size_t vec_len, size_t compar size_t store_index = left; size_t i; /* Move pivot to end */ + + // TODO: Does it exist a way to get a parallel partition? for (i = left; i < right; ++i) { // if(compare_data_element(array + i*vec_len, array + pivot_index*vec_len, compare_dim ) >= 0){ - if (array[i * vec_len + compare_dim] < pivot_value) + if (array[i].data[compare_dim] < pivot_value) { - swap_data_element(array + store_index * vec_len, array + i * vec_len, vec_len); + swap_data_element(array + store_index, array + i, vec_len); store_index += 1; } } @@ -478,6 +555,7 @@ size_t partition_data_around_value(float_t *array, size_t vec_len, size_t compar return store_index; } + guess_t refine_pure_binning(global_context_t *ctx, pointset_t *ps, guess_t best_guess, uint64_t *global_bin_count, int k_global, int d, float_t f, float_t tolerance) @@ -531,41 +609,34 @@ guess_t refine_pure_binning(global_context_t *ctx, pointset_t *ps, * */ - /* - MPI_DB_PRINT("---- ---- ----\n"); - MPI_DB_PRINT("[MASTER] Refining on bin %d lb %lf ub %lf starting c %lf %lf\n", - best_guess.bin_idx, bin_lb, bin_ub, starting_cumulative/total_count, - (tmp_global_bins[best_guess.bin_idx] + starting_cumulative)/total_count); - */ + // MPI_DB_PRINT("---- ---- ----\n"); + // MPI_DB_PRINT("[MASTER] Refining on bin %d lb %lf ub %lf starting c %lf %lf\n", + // best_guess.bin_idx, bin_lb, bin_ub, starting_cumulative/total_count, + // (tmp_global_bins[best_guess.bin_idx] + starting_cumulative)/total_count); for (int i = 0; i < k_global; ++i) tmp_global_bins[i] = 0; pointset_t tmp_ps; - int end_idx = partition_data_around_value(ps->data, (int)ps->dims, d, 0, (int)ps->n_points, bin_ub); - int start_idx = partition_data_around_value(ps->data, (int)ps->dims, d, 0,end_idx, bin_lb); + int end_idx = partition_data_around_value(ps->datapoints, (int)ps->dims, d, 0, (int)ps->n_points, bin_ub); + int start_idx = partition_data_around_value(ps->datapoints, (int)ps->dims, d, 0, end_idx, bin_lb); tmp_ps.n_points = end_idx - start_idx; - tmp_ps.data = ps->data + start_idx * ps->dims; + tmp_ps.datapoints = ps->datapoints + start_idx; tmp_ps.dims = ps->dims; tmp_ps.lb_box = (float_t*)MY_MALLOC(ctx -> dims * sizeof(float_t)); tmp_ps.ub_box = (float_t*)MY_MALLOC(ctx -> dims * sizeof(float_t)); compute_bounding_box_pointset(ctx, &tmp_ps); - /* - MPI_DB_PRINT("[MASTER] searching for %lf of the bin considered\n",ff); - */ + // MPI_DB_PRINT("[MASTER] searching for %lf of the bin considered start_idx %d end_idx %d\n", + // ff, start_idx, end_idx); // DB_PRINT("%lu\n",tmp_ps.n_points ); MPI_Barrier(ctx->mpi_communicator); compute_pure_global_binning(ctx, &tmp_ps, tmp_global_bins, k_global, d); - /* sum to global bins */ - // for(int i = 0; i < k_global; ++i) tmp_global_bins[i] += - // starting_cumulative; - best_guess = retrieve_guess_pure(ctx, &tmp_ps, tmp_global_bins, k_global, d, ff); best_guess.ep = check_pc_pointset_parallel(ctx, ps, best_guess, d, f); @@ -603,24 +674,47 @@ void free_queue(partition_queue_t *pq) { free(pq->data); } void get_pointset_from_partition(pointset_t *ps, partition_t *part) { ps->n_points = part->n_points; - ps->data = part->base_ptr; + ps->datapoints= part->base_ptr; ps->n_points = part->n_points; } -guess_t compute_median_pure_binning(global_context_t *ctx, pointset_t *ps, float_t fraction, int selected_dim, int n_bins, float_t tolerance) +guess_t compute_median_pure_binning(global_context_t *ctx, pointset_t *ps, float_t fraction, int selected_dim, idx_t nbins, float_t tolerance) +{ + int best_bin_idx; + float_t ep; + + + // uint64_t *global_bin_counts_int = (uint64_t *)MY_MALLOC(n_bins * sizeof(uint64_t)); + + uint64_t global_bin_counts_int[NBINS]; + + compute_pure_global_binning(ctx, ps, global_bin_counts_int, NBINS, selected_dim); + guess_t g = retrieve_guess_pure(ctx, ps, global_bin_counts_int, NBINS, selected_dim, fraction); + g.ep = check_pc_pointset_parallel(ctx, ps, g, selected_dim, fraction); + g = refine_pure_binning(ctx, ps, g, global_bin_counts_int, NBINS, selected_dim, fraction, tolerance); + + //free(global_bin_counts_int); + return g; +} + +guess_t parallel_compute_median_pure_binning(global_context_t *ctx, pointset_t *ps, float_t fraction, int selected_dim, idx_t nbins, float_t tolerance) { int best_bin_idx; float_t ep; - uint64_t *global_bin_counts_int = (uint64_t *)MY_MALLOC(n_bins * sizeof(uint64_t)); + + // uint64_t *global_bin_counts_int = (uint64_t *)MY_MALLOC(n_bins * sizeof(uint64_t)); + + uint64_t global_bin_counts_int[NBINS]; compute_bounding_box_pointset(ctx, ps); - compute_pure_global_binning(ctx, ps, global_bin_counts_int, n_bins, selected_dim); - guess_t g = retrieve_guess_pure(ctx, ps, global_bin_counts_int, n_bins, selected_dim, fraction); + compute_pure_global_binning(ctx, ps, global_bin_counts_int, NBINS, selected_dim); + guess_t g = retrieve_guess_pure(ctx, ps, global_bin_counts_int, NBINS, selected_dim, fraction); // check_pc_pointset(ctx, ps, best_guess, d, f); g.ep = check_pc_pointset_parallel(ctx, ps, g, selected_dim, fraction); - g = refine_pure_binning(ctx, ps, g, global_bin_counts_int, n_bins, selected_dim, fraction, tolerance); - free(global_bin_counts_int); + g = refine_pure_binning(ctx, ps, g, global_bin_counts_int, NBINS, selected_dim, fraction, tolerance); + + //free(global_bin_counts_int); return g; } @@ -749,11 +843,220 @@ void tree_print_leaves(global_context_t* ctx, top_kdtree_node_t* root) if(root -> rch) tree_print_leaves(ctx, root -> rch); } -void build_top_kdtree(global_context_t *ctx, pointset_t *og_pointset, top_kdtree_t *tree, int n_bins, float_t tolerance) +top_kdtree_node_t* transfer_tree(global_context_t* ctx, pivot_t local_node, kdtree_t* mini_local_tree, top_kdtree_t* top_tree, + top_kdtree_node_t* parent, int left_or_right) +{ + + top_kdtree_node_t* current_node = top_tree_generate_node(ctx, top_tree); + + switch (left_or_right) + { + case TOP_TREE_LCH: + { + current_node -> parent = parent; + current_node -> parent -> lch = current_node; + /* compute the box */ + + /* + * left child has lb equal to parent + * ub equal to parent except for the dim of splitting + */ + int parent_split_dim = current_node -> parent -> split_dim; + float_t parent_hp = current_node -> parent -> split_val; + + memcpy(current_node -> lb_node_box, current_node -> parent -> lb_node_box, ctx -> dims * sizeof(float_t)); + memcpy(current_node -> ub_node_box, current_node -> parent -> ub_node_box, ctx -> dims * sizeof(float_t)); + + current_node -> ub_node_box[parent_split_dim] = parent_hp; + } + break; + + case TOP_TREE_RCH: + { + current_node -> parent = parent; + current_node -> parent -> rch = current_node; + + int parent_split_dim = current_node -> parent -> split_dim; + float_t parent_hp = current_node -> parent -> split_val; + + /* + * right child has ub equal to parent + * lb equal to parent except for the dim of splitting + */ + + memcpy(current_node -> lb_node_box, current_node -> parent -> lb_node_box, ctx -> dims * sizeof(float_t)); + memcpy(current_node -> ub_node_box, current_node -> parent -> ub_node_box, ctx -> dims * sizeof(float_t)); + + current_node -> lb_node_box[parent_split_dim] = parent_hp; + } + break; + case NO_CHILD: + { + compute_bounding_box_data(ctx, ctx->local_data, current_node->lb_node_box, current_node->ub_node_box, ctx->local_n_points); + } + break; + default: + break; + } + + // if(I_AM_MASTER) + // { + // printf("Processing node\n"); + // pivot_print(local_node, 0, ctx->dims); + // } + + if(local_node.is_leaf) + { + current_node->owner = local_node.as.leaf.leaf_point_count; + } + else + { + current_node->split_dim = local_node.as.internal.split_variable; + current_node->split_val = local_node.as.internal.split_value; + + idx_t lch_idx = local_node.as.internal.lch_idx; + current_node->lch = transfer_tree(ctx, mini_local_tree->__pivots[lch_idx], mini_local_tree, + top_tree, current_node, TOP_TREE_LCH); + + idx_t rch_idx = local_node.as.internal.rch_idx; + current_node->rch = transfer_tree(ctx, mini_local_tree->__pivots[rch_idx], mini_local_tree, + top_tree, current_node, TOP_TREE_RCH); + } + + return current_node; +} + +void parallel_build_top_kdtree(global_context_t *ctx, top_kdtree_t *tree, idx_t oversampling) +{ + TIME_DEF; + TIME_START; + //initializing og_indexes + // size_t tot_n_points = 0; + // MPI_Allreduce(&(ctx->local_n_points), &tot_n_points, 1, MPI_UINT64_T, MPI_SUM, ctx->mpi_communicator); + + MPI_Alltoall(&(ctx->local_n_points), 1, MPI_UINT64_T, ctx->rank_n_points, 1, MPI_UINT64_T, ctx->mpi_communicator); + + ctx->rank_idx_start[0] = 0; + for(int i = 1; i < ctx->world_size; ++i) + { + ctx->rank_idx_start[i] = ctx->rank_idx_start[i-1] + ctx->rank_n_points[i-1]; + } + + ctx -> og_idxs = (idx_t*)MY_MALLOC(ctx->local_n_points * sizeof(idx_t)); + for(idx_t i = 0; i < ctx->local_n_points; ++i) + { + ctx -> og_idxs[i] = i + ctx->rank_idx_start[ctx->mpi_rank]; + } + + // select a subset of points by sampling + + idx_t n_levels = (idx_t)ceil(log2(ctx -> world_size)) + 1; + + // idx_t local_sampled_points = ((1 << n_levels) - 1) * oversampling; + // idx_t total_sampled_points = local_sampled_points * ctx->world_size; + + idx_t base_points = ((1 << n_levels) - 1) * oversampling; + + idx_t total_sampled_points = ((base_points + ctx->world_size - 1) / ctx->world_size) * ctx->world_size; + + idx_t local_sampled_points = total_sampled_points / ctx->world_size; + + + float_t* points_chosen = (float_t*)MY_MALLOC(total_sampled_points * ctx->dims * sizeof(float_t)); + + idx_t start_idx = ctx->mpi_rank * local_sampled_points; + #pragma omp parallel + { + // 1. Initialize a private state for this specific thread + unsigned short state[3]; + int tid = omp_get_thread_num(); + + // Seed using thread ID and a base seed to ensure different sequences + state[0] = (unsigned short)tid; + state[1] = (unsigned short)(tid ^ time(NULL)); + state[2] = (unsigned short)(tid >> 16); + + // 2. Distribute the loop iterations among threads + #pragma omp for + for(idx_t i = 0; i < local_sampled_points; ++i) + { + // 3. Use the reentrant erand48 with the private state + // erand48 returns a double [0.0, 1.0) + double r = erand48(state); + idx_t sampled_idx = (idx_t)(r * ctx->local_n_points); + + // 4. Copy the data (Memory layout remains the same) + memcpy(points_chosen + (start_idx + i) * ctx->dims, + ctx->local_data + (sampled_idx) * ctx->dims, + ctx->dims * sizeof(float_t)); + } + } + float_t elapsed = TIME_STOP; + LOG_WRITE("Sampling", elapsed); + + // it SHOULD be deterministic + + TIME_START; + MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, + points_chosen, local_sampled_points * ctx->dims, MPI_MY_FLOAT, + ctx->mpi_communicator); + + MPI_DB_PRINT("Sampling %lu points\n", total_sampled_points); + kdtree_t mini_tree = {0}; + kdtree_initialize(&mini_tree, points_chosen, total_sampled_points, ctx->dims); + mini_tree.root = parallel_make_tree_w_ranks(mini_tree.__points, mini_tree.__pivots, 0, -1, 0, + mini_tree.n_points-1, 0, mini_tree.dims, 0, ctx->world_size - 1); + + + elapsed = TIME_STOP; + LOG_WRITE("Building top tree", elapsed); + // if(I_AM_MASTER) + // { + // kdtree_print(&mini_tree); + // } + TIME_START; + tree->root = transfer_tree(ctx, mini_tree.__pivots[mini_tree.root], + &mini_tree, tree, NULL, NO_CHILD); + elapsed = TIME_STOP; + LOG_WRITE("Transfering Tree", elapsed); + + kdtree_free(&mini_tree); +} + + +void build_top_kdtree(global_context_t *ctx, top_kdtree_t *tree, idx_t n_bins, float_t tolerance) { + + + // NOTE: For the me of the future + // pointsests are only "helper structs" partitions + + //initializing og_indexes size_t tot_n_points = 0; - MPI_Allreduce(&(og_pointset->n_points), &tot_n_points, 1, MPI_UINT64_T, MPI_SUM, ctx->mpi_communicator); + MPI_Allreduce(&(ctx->local_n_points), &tot_n_points, 1, MPI_UINT64_T, MPI_SUM, ctx->mpi_communicator); + + MPI_Alltoall(&(ctx->local_n_points), 1, MPI_UINT64_T, ctx->rank_n_points, 1, MPI_UINT64_T, ctx->mpi_communicator); + + ctx->rank_idx_start[0] = 0; + for(int i = 1; i < ctx->world_size; ++i) + { + ctx->rank_idx_start[i] = ctx->rank_idx_start[i-1] + ctx->rank_n_points[i-1]; + } + point_t *data_w_idx = (point_t*)MY_MALLOC(ctx->local_n_points * sizeof(point_t)); + for(idx_t i = 0; i < ctx->local_n_points; ++i) + { + data_w_idx[i].array_idx = i + ctx->rank_idx_start[ctx->mpi_rank]; + data_w_idx[i].data = ctx->local_data + i * ctx->dims; + } + + pointset_t og_pointset; + og_pointset.datapoints = data_w_idx; + og_pointset.dims = ctx->dims; + og_pointset.n_points = ctx->local_n_points; + og_pointset.lb_box = (float_t*)MY_MALLOC(ctx -> dims * sizeof(float_t)); + og_pointset.ub_box = (float_t*)MY_MALLOC(ctx -> dims * sizeof(float_t)); + /* MPI_DB_PRINT("[MASTER] Top tree builder invoked\n"); */ @@ -766,24 +1069,21 @@ void build_top_kdtree(global_context_t *ctx, pointset_t *og_pointset, top_kdtree /* enqueue the two partitions */ - compute_bounding_box_pointset(ctx, og_pointset); + compute_bounding_box_pointset(ctx, &og_pointset); partition_queue_t queue; init_queue(&queue); int selected_dim = 0; partition_t current_partition = { .d = selected_dim, - .base_ptr = og_pointset->data, - .n_points = og_pointset->n_points, + .base_ptr = og_pointset.datapoints, + .n_points = og_pointset.n_points, .n_procs = ctx->world_size, .parent = NULL, - .lr = NO_CHILD - }; + .lr = NO_CHILD }; enqueue_partition(&queue, current_partition); pointset_t current_pointset; - current_pointset.lb_box = (float_t*)MY_MALLOC(ctx -> dims * sizeof(float_t)); - current_pointset.ub_box = (float_t*)MY_MALLOC(ctx -> dims * sizeof(float_t)); while (queue.count) { @@ -801,14 +1101,14 @@ void build_top_kdtree(global_context_t *ctx, pointset_t *og_pointset, top_kdtree /* insert node */ // MPI_DB_PRINT( "[RANK %d] Handling partition:\n"\ - // "\tcurrent_node %p,\n"\ - // "\tdim %d,\n"\ - // "\tn_points %lu,\n"\ - // "\tstart_proc %d,\n"\ - // "\tn_procs %d\n"\ - // "\tparent %p\n"\ - // "\tbase_ptr %p\n"\ - // "\tlr %d\n", + // " current_node %p,\n"\ + // " dim %d,\n"\ + // " n_points %lu,\n"\ + // " start_proc %d,\n"\ + // " n_procs %d\n"\ + // " parent %p\n"\ + // " base_ptr %p\n"\ + // " lr %d\n", // ctx -> mpi_rank, // current_node, // current_partition.d, @@ -831,7 +1131,7 @@ void build_top_kdtree(global_context_t *ctx, pointset_t *og_pointset, top_kdtree * ub equal to parent except for the dim of splitting */ int parent_split_dim = current_node -> parent -> split_dim; - float_t parent_hp = current_node -> parent -> split_val; + float_t parent_hp = current_node -> parent -> split_val; memcpy(current_node -> lb_node_box, current_node -> parent -> lb_node_box, ctx -> dims * sizeof(float_t)); memcpy(current_node -> ub_node_box, current_node -> parent -> ub_node_box, ctx -> dims * sizeof(float_t)); @@ -843,11 +1143,11 @@ void build_top_kdtree(global_context_t *ctx, pointset_t *og_pointset, top_kdtree case TOP_TREE_RCH: if(current_partition.parent) { - current_node -> parent = current_partition.parent; + current_node -> parent = current_partition.parent; current_node -> parent -> rch = current_node; int parent_split_dim = current_node -> parent -> split_dim; - float_t parent_hp = current_node -> parent -> split_val; + float_t parent_hp = current_node -> parent -> split_val; /* * right child has ub equal to parent @@ -863,8 +1163,8 @@ void build_top_kdtree(global_context_t *ctx, pointset_t *og_pointset, top_kdtree case NO_CHILD: { tree -> root = current_node; - memcpy(current_node -> lb_node_box, og_pointset -> lb_box, ctx -> dims * sizeof(float_t)); - memcpy(current_node -> ub_node_box, og_pointset -> ub_box, ctx -> dims * sizeof(float_t)); + memcpy(current_node -> lb_node_box, og_pointset.lb_box, ctx -> dims * sizeof(float_t)); + memcpy(current_node -> ub_node_box, og_pointset.ub_box, ctx -> dims * sizeof(float_t)); } break; } @@ -874,13 +1174,16 @@ void build_top_kdtree(global_context_t *ctx, pointset_t *og_pointset, top_kdtree current_node -> lch = NULL; current_node -> rch = NULL; + current_pointset.lb_box = current_node->lb_node_box; + current_pointset.ub_box = current_node->ub_node_box; + MPI_Barrier(ctx -> mpi_communicator); /* handle partition */ if(current_partition.n_procs > 1) { float_t fraction = (current_partition.n_procs / 2) / (float_t)current_partition.n_procs; guess_t g = compute_median_pure_binning(ctx, ¤t_pointset, fraction, current_partition.d, n_bins, tolerance); - size_t pv = partition_data_around_value(current_pointset.data, ctx->dims, current_partition.d, 0, current_pointset.n_points, g.x_guess); + size_t pv = partition_data_around_value(current_pointset.datapoints, ctx->dims, current_partition.d, 0, current_pointset.n_points, g.x_guess); current_node -> split_val = g.x_guess; @@ -896,14 +1199,27 @@ void build_top_kdtree(global_context_t *ctx, pointset_t *og_pointset, top_kdtree - int next_dimension = (++selected_dim) % (ctx->dims); + // use max_strech int next_dimension = (++selected_dim) % (ctx->dims); + int next_dimension = 0; + float_t max_strech = 0.; + + for(int dim=0; dimdims; ++dim) + { + float_t stretch = current_node->ub_node_box[dim] - current_node->lb_node_box[dim]; + if(stretch > max_strech) + { + max_strech = stretch; + next_dimension = dim; + } + } + partition_t left_partition = { .n_points = points_left, .n_procs = procs_left, .start_proc = current_partition.start_proc, .parent = current_node, .lr = TOP_TREE_LCH, - .base_ptr = current_pointset.data, + .base_ptr = current_pointset.datapoints, .d = next_dimension, }; @@ -913,7 +1229,7 @@ void build_top_kdtree(global_context_t *ctx, pointset_t *og_pointset, top_kdtree .start_proc = current_partition.start_proc + procs_left, .parent = current_node, .lr = TOP_TREE_RCH, - .base_ptr = current_pointset.data + pv * current_pointset.dims, + .base_ptr = current_pointset.datapoints + pv, .d = next_dimension }; @@ -936,10 +1252,16 @@ void build_top_kdtree(global_context_t *ctx, pointset_t *og_pointset, top_kdtree } #endif + ctx -> og_idxs = (idx_t*)MY_MALLOC(ctx->local_n_points * sizeof(idx_t)); + for(idx_t i = 0; i < ctx->local_n_points; ++i) + { + ctx -> og_idxs[i] = data_w_idx[i].array_idx; + } + - free(current_pointset.lb_box); - free(current_pointset.ub_box); free_queue(&queue); + og_pointset.datapoints = NULL; + free_pointset(&og_pointset); } @@ -973,8 +1295,22 @@ int compute_point_owner(global_context_t* ctx, top_kdtree_t* tree, float_t* data return owner; } +void swap_data_vec_and_idx(float_t *a, float_t *b, idx_t* idx_a, idx_t* idx_b, size_t vec_len) { + idx_t tmp_idx = *idx_a; + *idx_a = *idx_b; + *idx_b = tmp_idx; + + float_t tmp; + for (size_t i = 0; i < vec_len; ++i) + { + tmp = a[i]; + a[i] = b[i]; + b[i] = tmp; + } +} + /* to partition points around owners */ -int partition_data_around_key(int* key, float_t *val, int vec_len, int ref_key , int left, int right) +int partition_data_around_key(int* key, float_t *values, idx_t* idxs, int vec_len, int ref_key , int left, int right) { /* * returns the number of elements less than the pivot @@ -987,7 +1323,8 @@ int partition_data_around_key(int* key, float_t *val, int vec_len, int ref_key , // if(compare_data_element(array + i*vec_len, array + pivot_index*vec_len, compare_dim ) >= 0){ if (key[i] < ref_key) { - swap_data_element(val + store_index * vec_len, val + i * vec_len, vec_len); + swap_data_vec_and_idx(values + store_index * vec_len, values + i * vec_len, + idxs + store_index, idxs + i, vec_len); /* swap keys */ int tmp = key[i]; key[i] = key[store_index]; @@ -1004,43 +1341,52 @@ int partition_data_around_key(int* key, float_t *val, int vec_len, int ref_key , void exchange_points(global_context_t* ctx, top_kdtree_t* tree) { - int* points_per_proc = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); - int* points_owners = (int*)MY_MALLOC(ctx -> dims * ctx -> local_n_points * sizeof(float_t)); + TIME_DEF + int* points_per_proc = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); + int* points_owners = (int*)MY_MALLOC(ctx -> dims * ctx -> local_n_points * sizeof(float_t)); int* partition_offset = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); /* compute owner */ + TIME_START; #pragma omp parallel for for(size_t i = 0; i < ctx -> local_n_points; ++i) { /* tree walk */ points_owners[i] = compute_point_owner(ctx, tree, ctx -> local_data + (i * ctx -> dims)); } - - + float_t elapsed = TIME_STOP; + LOG_WRITE("Compute owners", elapsed); + TIME_START; + int last_idx = 0; int len = ctx -> local_n_points; float_t* curr_data = ctx -> local_data; + HERE partition_offset[0] = 0; for(int owner = 1; owner < ctx -> world_size; ++owner) { - last_idx = partition_data_around_key(points_owners, ctx -> local_data, ctx -> dims, owner, last_idx, ctx -> local_n_points); + last_idx = partition_data_around_key(points_owners, ctx -> local_data, ctx -> og_idxs, ctx -> dims, owner, last_idx, ctx -> local_n_points); partition_offset[owner] = last_idx; points_per_proc[owner - 1] = last_idx; } + elapsed = TIME_STOP; + LOG_WRITE("Partition", elapsed); + // copy ordered + + TIME_START; points_per_proc[ctx -> world_size - 1] = ctx -> local_n_points; - - + for(int i = ctx -> world_size - 1; i > 0; --i) { points_per_proc[i] = points_per_proc[i] - points_per_proc[i - 1]; } - int* rcv_count = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); - int* rcv_displs = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); - int* send_displs = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); - int* send_count = points_per_proc; + int* rcv_count = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); + int* rcv_displs = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); + int* send_displs = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); + int* send_count = points_per_proc; float_t* rcvbuffer = NULL; int tot_count = 0; @@ -1056,14 +1402,23 @@ void exchange_points(global_context_t* ctx, top_kdtree_t* tree) send_displs[i] = send_displs[i - 1] + send_count[i - 1]; } + for(int i = 0; i < ctx -> world_size; ++i) tot_count += rcv_count[i]; + idx_t* rcv_idxs = (idx_t*)MY_MALLOC(tot_count * sizeof(idx_t)); + // exchange idxs + MPI_Alltoallv( ctx -> og_idxs, send_count, send_displs, MPI_UINT64_T, + rcv_idxs, rcv_count, rcv_displs, MPI_UINT64_T, + ctx -> mpi_communicator); + + tot_count = 0; + /*multiply for number of elements */ for(int i = 0; i < ctx -> world_size; ++i) { send_displs[i]= send_displs[i] * ctx -> dims; send_count[i] = send_count[i] * ctx -> dims; - rcv_displs[i]= rcv_displs[i] * ctx -> dims; - rcv_count[i] = rcv_count[i] * ctx -> dims; + rcv_displs[i] = rcv_displs[i] * ctx -> dims; + rcv_count[i] = rcv_count[i] * ctx -> dims; tot_count += rcv_count[i]; } @@ -1095,18 +1450,42 @@ void exchange_points(global_context_t* ctx, top_kdtree_t* tree) /* free prv pointer */ free(ppp); free(ctx -> local_data); + free(ctx -> og_idxs); ctx -> local_data = rcvbuffer; + ctx -> og_idxs = rcv_idxs; /* check exchange */ - for(size_t i = 0; i < ctx -> local_n_points; ++i) - { - int o = compute_point_owner(ctx, tree, ctx -> local_data + (i * ctx -> dims)); - if(o != ctx -> mpi_rank) DB_PRINT("rank %d got an error\n",ctx -> mpi_rank); - } + #ifdef CHECK_CORRECT_EXCHANGE + for(size_t i = 0; i < ctx -> local_n_points; ++i) + { + int o = compute_point_owner(ctx, tree, ctx -> local_data + (i * ctx -> dims)); + if(o != ctx -> mpi_rank) DB_PRINT("rank %d got an error\n",ctx -> mpi_rank); + } + #endif + elapsed = TIME_STOP; + LOG_WRITE("Exchange", elapsed); + + #ifdef PRINT_BALANCE_FACTOR + DB_PRINT("[RANK %d] balance factor %.4lf\n", ctx->mpi_rank, (float_t)ctx->local_n_points/ctx->n_points); + #endif + float_t balance_factor = (float_t)ctx->local_n_points/ctx->n_points; + float_t sum_balance_factor; + float_t max_balance_factor; + float_t min_balance_factor; + MPI_Reduce(&balance_factor, &sum_balance_factor, 1, MPI_MY_FLOAT, MPI_SUM, 0, ctx -> mpi_communicator); + MPI_Reduce(&balance_factor, &min_balance_factor, 1, MPI_MY_FLOAT, MPI_MIN, 0, ctx -> mpi_communicator); + MPI_Reduce(&balance_factor, &max_balance_factor, 1, MPI_MY_FLOAT, MPI_MAX, 0, ctx -> mpi_communicator); + + if(I_AM_MASTER) + { + MPI_DB_PRINT("Avg Balance Factor %.4lf min %.4lf max %.4lf\n", sum_balance_factor/ctx -> world_size, + min_balance_factor, max_balance_factor); + } + free(points_owners); free(points_per_proc); free(partition_offset); @@ -1120,20 +1499,14 @@ static inline size_t local_to_global_idx(global_context_t* ctx, size_t local_idx return local_idx + ctx -> idx_start; } -void translate_tree_idx_to_global(global_context_t* ctx, kdtree_v2* local_tree) +void translate_tree_idx_to_global(global_context_t* ctx, kdtree_t* local_tree) { for(size_t i = 0; i < ctx -> local_n_points; ++i) { - local_tree -> _nodes[i].array_idx = local_to_global_idx(ctx, local_tree -> _nodes[i].array_idx); + local_tree -> __points[i].array_idx = local_to_global_idx(ctx, local_tree -> __points[i].array_idx); } } -heap ngbh_search_kdtree(global_context_t* ctx, kdtree_v2* local_tree, float_t* data, int k) -{ - data_dims = ctx -> dims; - return knn_kdtree_v2(data, local_tree -> root, k); -} - void tree_walk( global_context_t* ctx, top_kdtree_node_t* root, @@ -1293,33 +1666,58 @@ void tree_walk_v2_find_n_points( break; } - int c = max_dist > (hp_distance * hp_distance); + // int c = max_dist > (hp_distance * hp_distance); + // + // //if(c || (H -> count) < (H -> N)) + // if(c) + // { + // + // switch (side) + // { + // case HP_LEFT_SIDE: + // if(root -> rch) + // { + // /* walk on the right */ + // tree_walk_v2_find_n_points(ctx, root -> rch, point_idx, max_dist, point, point_to_send_capacity); + // } + // break; + // + // case HP_RIGHT_SIDE: + // if(root -> lch) + // { + // /* walk on the left */ + // tree_walk_v2_find_n_points(ctx, root -> lch, point_idx, max_dist, point, point_to_send_capacity); + // } + // break; + // + // default: + // break; + // } + // } //if(c || (H -> count) < (H -> N)) - if(c) + switch (side) { + case HP_LEFT_SIDE: + if(root -> rch) + { + /* walk on the right */ + float_t point_to_box_dist = box_dist(point, root->rch->lb_node_box, root->rch->ub_node_box, ctx->dims); + if(point_to_box_dist < max_dist) tree_walk_v2_find_n_points(ctx, root -> rch, point_idx, max_dist, point, point_to_send_capacity); + } + break; - switch (side) - { - case HP_LEFT_SIDE: - if(root -> rch) - { - /* walk on the right */ - tree_walk_v2_find_n_points(ctx, root -> rch, point_idx, max_dist, point, point_to_send_capacity); - } - break; - - case HP_RIGHT_SIDE: - if(root -> lch) - { - /* walk on the left */ - tree_walk_v2_find_n_points(ctx, root -> lch, point_idx, max_dist, point, point_to_send_capacity); - } - break; + case HP_RIGHT_SIDE: + if(root -> lch) + { + /* walk on the left */ + float_t point_to_box_dist = box_dist(point, root->lch->lb_node_box, root->lch->ub_node_box, ctx->dims); + if(point_to_box_dist < max_dist) tree_walk_v2_find_n_points(ctx, root -> lch, point_idx, max_dist, point, point_to_send_capacity); + } + break; - default: - break; - } + default: + break; } } @@ -1387,42 +1785,66 @@ void tree_walk_v2_append_points( break; } - int c = max_dist > (hp_distance * hp_distance); + // int c = max_dist > (hp_distance * hp_distance); + // if(c) + // { + // + // switch (side) + // { + // case HP_LEFT_SIDE: + // if(root -> rch) + // { + // /* walk on the right */ + // tree_walk_v2_append_points(ctx, root -> rch, point_idx, max_dist, point, + // data_to_send_per_proc, local_idx_of_the_point, point_to_send_count); + // } + // break; + // + // case HP_RIGHT_SIDE: + // if(root -> lch) + // { + // /* walk on the left */ + // tree_walk_v2_append_points(ctx, root -> lch, point_idx, max_dist, point, + // data_to_send_per_proc, local_idx_of_the_point, point_to_send_count); + // } + // break; + // + // default: + // break; + // } + // } - //if(c || (H -> count) < (H -> N)) - if(c) + switch (side) { + case HP_LEFT_SIDE: + if(root -> rch) + { + /* walk on the right */ + float_t point_to_box_dist = box_dist(point, root->rch->lb_node_box, root->rch->ub_node_box, ctx->dims); + if(point_to_box_dist < max_dist) tree_walk_v2_append_points(ctx, root -> rch, point_idx, max_dist, point, + data_to_send_per_proc, local_idx_of_the_point, point_to_send_count); + } + break; - switch (side) - { - case HP_LEFT_SIDE: - if(root -> rch) - { - /* walk on the right */ - tree_walk_v2_append_points(ctx, root -> rch, point_idx, max_dist, point, - data_to_send_per_proc, local_idx_of_the_point, point_to_send_count); - } - break; - - case HP_RIGHT_SIDE: - if(root -> lch) - { - /* walk on the left */ - tree_walk_v2_append_points(ctx, root -> lch, point_idx, max_dist, point, - data_to_send_per_proc, local_idx_of_the_point, point_to_send_count); - } - break; + case HP_RIGHT_SIDE: + if(root -> lch) + { + /* walk on the left */ + float_t point_to_box_dist = box_dist(point, root->lch->lb_node_box, root->lch->ub_node_box, ctx->dims); + if(point_to_box_dist < max_dist) tree_walk_v2_append_points(ctx, root -> lch, point_idx, max_dist, point, + data_to_send_per_proc, local_idx_of_the_point, point_to_send_count); + } + break; - default: - break; - } + default: + break; } } } -void convert_heap_idx_to_global(global_context_t* ctx, heap* H) +void convert_heap_idx_to_global(global_context_t* ctx, heap_t* H) { for(uint64_t i = 0; i < H -> count; ++i) { @@ -1443,7 +1865,7 @@ void print_diagnositcs(global_context_t* ctx, int k) float_t memory_use = (float_t)ctx -> local_n_points * ctx -> dims * sizeof(float_t); memory_use += (float_t)sizeof(datapoint_info_t)* (float_t)(ctx -> local_n_points); /* ngbh */ - memory_use += (float_t)sizeof(heap_node)*(float_t)k * (float_t)(ctx -> local_n_points); + memory_use += (float_t)sizeof(heap_node_t)*(float_t)k * (float_t)(ctx -> local_n_points); memory_use = memory_use / 1e9 * shm_world_size; MPI_DB_PRINT(" Got ~%d points per node and %d ngbh per points\n", ctx -> local_n_points * shm_world_size, k); @@ -1458,45 +1880,40 @@ void print_diagnositcs(global_context_t* ctx, int k) MPI_Barrier(ctx -> mpi_communicator); } - -void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtree_t* top_tree, kdtree_v2* local_tree, float_t* data, int k) +void mpi_knn_search(global_context_t* ctx, top_kdtree_t* top_tree, kdtree_t* local_tree, float_t* data, idx_t n_data, idx_t vec_len, int k) { - /* local search */ - /* print diagnostics */ - print_diagnositcs(ctx, k); - ctx -> k = (idx_t)k; - TIME_DEF; double elapsed_time; TIME_START; + //printf("rank %d local_n_points %lu\n", ctx -> mpi_rank, ctx -> local_n_points); MPI_Barrier(ctx -> mpi_communicator); - //ctx -> __local_heap_buffers = (heap_node*)MY_MALLOC(ctx -> local_n_points * k * sizeof(heap_node)); - MPI_Alloc_mem(ctx -> local_n_points * k * sizeof(heap_node), MPI_INFO_NULL, &(ctx -> __local_heap_buffers)); - - #pragma omp parallel for schedule(dynamic) - for(int p = 0; p < ctx -> local_n_points; ++p) - { - idx_t idx = local_tree -> _nodes[p].array_idx; - /* actually we want to preserve the heap to then insert guesses from other nodes */ - heap tmp_heap; - tmp_heap.data = ctx -> __local_heap_buffers + k * idx; + //ctx -> __local_heap_buffers = (heap_node_t*)MY_MALLOC(ctx -> local_n_points * k * sizeof(heap_node)); + heap_node_t* __local_heap_buffers = (heap_node_t*)MY_MALLOC(n_data * k * sizeof(heap_node_t)); + //MPI_Alloc_mem(n_data * k * sizeof(heap_node_t), MPI_INFO_NULL, &__local_heap_buffers); + //MPI_Alloc_mem(n_data * k * sizeof(heap_node_t), MPI_INFO_NULL, &__local_heap_buffers); + + #pragma omp parallel for schedule(dynamic, 32) + for(int p = 0; p < n_data; ++p) + { + idx_t idx = p; + /* actually we want to preserve the heap_t to then insert guesses from other nodes */ + heap_t tmp_heap; + tmp_heap.data = __local_heap_buffers + k * idx; tmp_heap.count = 0; - tmp_heap.N = k; + tmp_heap.size = k; + + //dp_info[idx].ngbh = knn_kdtree_t_no_heapsort(local_tree -> _nodes[p].data, local_tree -> root, k); + idx_t visited_nodes = 0; + knn_sub_tree_search(data + idx * ctx->dims, local_tree-> __points, local_tree->__pivots, + local_tree->root, &tmp_heap, local_tree->dims, &visited_nodes); - //dp_info[idx].ngbh = knn_kdtree_v2_no_heapsort(local_tree -> _nodes[p].data, local_tree -> root, k); - knn_sub_tree_search_kdtree_v2(local_tree -> _nodes[p].data, local_tree -> root, &tmp_heap); convert_heap_idx_to_global(ctx, &tmp_heap); - dp_info[idx].cluster_idx = -1; - dp_info[idx].is_center = 0; - dp_info[idx].array_idx = idx + ctx -> idx_start; - dp_info[idx].ngbh = tmp_heap.data; } elapsed_time = TIME_STOP; LOG_WRITE("Local neighborhood search", elapsed_time); - - + //printf("rank %d elapsed_time %lf\n", ctx->mpi_rank, elapsed_time); TIME_START; /* find if a points needs a refine on the global tree */ float_t** data_to_send_per_proc = (float_t**)MY_MALLOC(ctx -> world_size * sizeof(float_t*)); @@ -1512,11 +1929,11 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre } /* NEW VERSION double tree walk */ - #pragma omp parallel for - for(int i = 0; i < ctx -> local_n_points; ++i) + #pragma omp parallel for schedule(dynamic, 128) + for(int i = 0; i < n_data; ++i) { - float_t max_dist = dp_info[i].ngbh[0].value; - float_t* point = ctx -> local_data + (i * ctx -> dims); + float_t max_dist = (__local_heap_buffers + i*k)[0].value; + float_t* point = data + (i * ctx -> dims); tree_walk_v2_find_n_points(ctx, top_tree -> root, i, max_dist, point, point_to_snd_capacity); @@ -1531,17 +1948,15 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre } - - #pragma omp parallel for - for(int i = 0; i < ctx -> local_n_points; ++i) + #pragma omp parallel for schedule(dynamic, 128) + for(int i = 0; i < n_data; ++i) { - float_t max_dist = dp_info[i].ngbh[0].value; - float_t* point = ctx -> local_data + (i * ctx -> dims); + float_t max_dist = (__local_heap_buffers + i*k)[0].value; + float_t* point = data + (i * ctx -> dims); tree_walk_v2_append_points(ctx, top_tree -> root, i, max_dist, point, data_to_send_per_proc, local_idx_of_the_point, point_to_snd_count); } - elapsed_time = TIME_STOP; LOG_WRITE("Finding points to refine", elapsed_time); @@ -1587,8 +2002,6 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre float_t* __max_dist_snd = (float_t*)MY_MALLOC(tot_points_snd * sizeof(float_t)); float_t* __max_dist_rcv = (float_t*)MY_MALLOC(tot_points_rcv * sizeof(float_t)); - - /* copy data to send in contiguous memory */ for(int i = 0; i < ctx -> world_size; ++i) { @@ -1603,13 +2016,12 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre idx_t point_idx = local_idx_of_the_point[i][j]; // for each point take the distance of its furthest nearest neighbor // then communicate it in order to prune the search on foreign nodes - __max_dist_snd[dist_idx] = ctx -> local_datapoints[point_idx].ngbh[0].value; + __max_dist_snd[dist_idx] = (__local_heap_buffers + k*point_idx)[0].value; ++dist_idx; } } - MPI_Alltoallv(__snd_points, snd_count, snd_displ, MPI_MY_FLOAT, __rcv_points, rcv_count, rcv_displ, MPI_MY_FLOAT, ctx -> mpi_communicator); @@ -1653,17 +2065,17 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre // } // - /* prepare heap batches */ + /* prepare heap_t batches */ //int work_batch_stride = 1 + ctx -> dims; int work_batch_stride = ctx -> dims; /* Note that I then have to recieve an equal number of heaps as the one I sent out before */ - heap_node* __heap_batches_to_snd = (heap_node*)MY_MALLOC((uint64_t)k * (uint64_t)tot_points_rcv * sizeof(heap_node)); + heap_node_t* __heap_batches_to_snd = (heap_node_t*)MY_MALLOC((uint64_t)k * (uint64_t)tot_points_rcv * sizeof(heap_node_t)); if( __heap_batches_to_snd == NULL) { - DB_PRINT("Rank %d failed to allocate snd_heaps %luB required\n",ctx -> mpi_rank, (uint64_t)k * (uint64_t)tot_points_snd * sizeof(heap_node)); + DB_PRINT("Rank %d failed to allocate snd_heaps %luB required\n",ctx -> mpi_rank, (uint64_t)k * (uint64_t)tot_points_snd * sizeof(heap_node_t)); exit(1); } @@ -1671,8 +2083,8 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre // max dists will contain the maximum distance to have to search - heap_node** heap_batches_per_node = (heap_node**)MY_MALLOC(ctx -> world_size * sizeof(heap_node*)); - float_t** max_dists_per_node = (float_t**)MY_MALLOC(ctx -> world_size * sizeof(float_t*)); + heap_node_t** heap_batches_per_node = (heap_node_t**)MY_MALLOC(ctx -> world_size * sizeof(heap_node_t*)); + float_t** max_dists_per_node = (float_t**)MY_MALLOC(ctx -> world_size * sizeof(float_t*)); for(int p = 0; p < ctx -> world_size; ++p) { @@ -1693,23 +2105,25 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre { if(point_to_rcv_count[p] > 0 && p != ctx -> mpi_rank) { - #pragma omp parallel for schedule(dynamic) + #pragma omp parallel for schedule(dynamic, 128) for(int batch = 0; batch < point_to_rcv_count[p]; ++batch) { - heap H; + heap_t H; H.count = k; - H.N = k; + H.size = k; H.data = heap_batches_per_node[p] + (uint64_t)k * (uint64_t)batch; float_t max_dist = max_dists_per_node[p][batch]; - for(idx_t i = 0; i < H.N; ++i) + for(idx_t i = 0; i < H.size; ++i) { H.data[i].value = max_dist; H.data[i].array_idx = MY_SIZE_MAX; } //float_t* point = rcv_work_batches[p] + batch * work_batch_stride + 1; + idx_t visited_nodes = 0; float_t* point = rcv_work_batches[p] + (uint64_t)batch * (uint64_t)work_batch_stride; - knn_sub_tree_search_kdtree_v2(point, local_tree -> root, &H); + knn_sub_tree_search(point, local_tree-> __points, local_tree->__pivots, + local_tree->root, &H, local_tree->dims, &visited_nodes); convert_heap_idx_to_global(ctx, &H); } } @@ -1736,7 +2150,7 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre TIME_START; MPI_Datatype MPI_my_heap; - MPI_Type_contiguous(k * sizeof(heap_node), MPI_BYTE, &MPI_my_heap); + MPI_Type_contiguous(k * sizeof(heap_node_t), MPI_BYTE, &MPI_my_heap); MPI_Barrier(ctx -> mpi_communicator); MPI_Type_commit(&MPI_my_heap); @@ -1746,7 +2160,7 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre * ------------------------------------- */ MPI_Barrier(ctx -> mpi_communicator); - int default_msg_len = MAX_MSG_SIZE / (k * sizeof(heap_node)); + int default_msg_len = MAX_MSG_SIZE / (k * sizeof(heap_node_t)); int* already_sent_points = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); int* already_rcvd_points = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); @@ -1780,10 +2194,10 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre MPI_DB_PRINT("Using default message length %lu\n", default_msg_len); - heap_node* __heap_batches_to_rcv = (heap_node*)MY_MALLOC((uint64_t)k * (uint64_t)max_n_recv * sizeof(heap_node)); + heap_node_t* __heap_batches_to_rcv = (heap_node_t*)MY_MALLOC((uint64_t)k * (uint64_t)max_n_recv * sizeof(heap_node_t)); if( __heap_batches_to_rcv == NULL) { - DB_PRINT("Rank %d failed to allocate rcv_heaps %luB required\n",ctx -> mpi_rank, (uint64_t)k * (uint64_t)max_n_recv* sizeof(heap_node)); + DB_PRINT("Rank %d failed to allocate rcv_heaps %luB required\n",ctx -> mpi_rank, (uint64_t)k * (uint64_t)max_n_recv* sizeof(heap_node_t)); exit(1); } @@ -1803,7 +2217,7 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre MPI_DB_PRINT("[--- ROUND %d ----]\n", i); MPI_Barrier(ctx -> mpi_communicator); DB_PRINT("[RANK %d] sending to %d tot: %d [%luB]---- recieving from %d %d\n", ctx -> mpi_rank, - rank_to_send, ngbh_to_send[rank_to_send], ngbh_to_send[rank_to_send]*sizeof(heap_node), rank_to_recv, ngbh_to_recv[rank_to_recv]); + rank_to_send, ngbh_to_send[rank_to_send], ngbh_to_send[rank_to_send]*sizeof(heap_node_t), rank_to_recv, ngbh_to_recv[rank_to_recv]); #endif if(ngbh_to_send[rank_to_send] > 0) { @@ -1858,24 +2272,24 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre } /* merge lists */ - #pragma omp parallel for + #pragma omp parallel for schedule(dynamic, 32) for(int b = 0; b < ngbh_to_recv[rank_to_recv]; ++b) { int idx = local_idx_of_the_point[rank_to_recv][b]; - /* retrieve the heap */ - heap H; + /* retrieve the heap_t */ + heap_t H; H.count = k; - H.N = k; + H.size = k; H.data = __heap_batches_to_rcv + k*b; - /* insert the points into the heap */ + /* insert the points into the heap_t */ for(int j = 0; j < k; ++j) { - heap tmp_heap; - tmp_heap.N = k; + heap_t tmp_heap; + tmp_heap.size = k; tmp_heap.count = k; - tmp_heap.data = dp_info[idx].ngbh; + tmp_heap.data = (__local_heap_buffers + idx*k); // insert it only of != from max - if(H.data[j].array_idx != MY_SIZE_MAX) insert_max_heap(&(tmp_heap), H.data[j].value, H.data[j].array_idx); + if(H.data[j].array_idx != MY_SIZE_MAX) max_heap_insert(&(tmp_heap), H.data[j].value, H.data[j].array_idx); } } @@ -1884,6 +2298,9 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre MPI_Barrier(ctx -> mpi_communicator); } + elapsed_time = TIME_STOP; + LOG_WRITE("Merging results", elapsed_time); + MPI_Testall(req_idx, req_array, &flag, MPI_STATUSES_IGNORE); @@ -1899,19 +2316,22 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre /* -------------------------------------------------------- */ /* heapsort them */ - #pragma omp parallel for schedule(dynamic) - for(int i = 0; i < ctx -> local_n_points; ++i) + TIME_START; + + #pragma omp parallel for schedule(dynamic, 128) + for(int i = 0; i < n_data; ++i) { - heap tmp_heap; - tmp_heap.N = k; + heap_t tmp_heap; + tmp_heap.size = k; tmp_heap.count = k; - tmp_heap.data = dp_info[i].ngbh; + tmp_heap.data = (__local_heap_buffers + i*k); heap_sort(&(tmp_heap)); } elapsed_time = TIME_STOP; - LOG_WRITE("Merging results", elapsed_time); + LOG_WRITE("Sorting negihborhoods", elapsed_time); + #if defined(WRITE_NGBH) MPI_DB_PRINT("Writing ngbh to files\n"); @@ -1924,9 +2344,9 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre } else { - for(int i = 0; i < ctx -> local_n_points; ++i) + for(int i = 0; i < n_data; ++i) { - fwrite(dp_info[i].ngbh, sizeof(heap_node), k, file); + fwrite(__local_heap_buffers + i*k, sizeof(heap_node_t), k, file); } fclose(file); } @@ -1962,11 +2382,522 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre free(__max_dist_snd); free(__max_dist_rcv); + free(__local_heap_buffers); +} + + +void mpi_all_knn_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtree_t* top_tree, kdtree_t* local_tree, int k) +{ + /* local search */ + /* print diagnostics */ + print_diagnositcs(ctx, k); + ctx -> k = (idx_t)k; + + TIME_DEF; + double elapsed_time; + + TIME_START; + //printf("rank %d local_n_points %lu\n", ctx -> mpi_rank, ctx -> local_n_points); + MPI_Barrier(ctx -> mpi_communicator); + //ctx -> __local_heap_buffers = (heap_node_t*)MY_MALLOC(ctx -> local_n_points * k * sizeof(heap_node)); + MPI_Alloc_mem(ctx -> local_n_points * k * sizeof(heap_node_t), MPI_INFO_NULL, &(ctx -> __local_heap_buffers)); + + #pragma omp parallel for schedule(dynamic, 32) + for(int p = 0; p < ctx -> local_n_points; ++p) + { + idx_t idx = local_tree -> __points[p].array_idx; + /* actually we want to preserve the heap_t to then insert guesses from other nodes */ + heap_t tmp_heap; + tmp_heap.data = ctx -> __local_heap_buffers + k * idx; + tmp_heap.count = 0; + tmp_heap.size = k; + + //dp_info[idx].ngbh = knn_kdtree_t_no_heapsort(local_tree -> _nodes[p].data, local_tree -> root, k); + + idx_t visited_nodes = 0; + knn_sub_tree_search(local_tree->__points[p].data, local_tree-> __points, local_tree->__pivots, + local_tree->root, &tmp_heap, local_tree->dims, &visited_nodes); + + convert_heap_idx_to_global(ctx, &tmp_heap); + dp_info[idx].cluster_idx = -1; + dp_info[idx].is_center = 0; + dp_info[idx].array_idx = idx + ctx -> idx_start; + dp_info[idx].ngbh = tmp_heap.data; + } + elapsed_time = TIME_STOP; + LOG_WRITE("Local neighborhood search", elapsed_time); + //printf("rank %d elapsed_time %lf\n", ctx->mpi_rank, elapsed_time); + + + TIME_START; + /* find if a points needs a refine on the global tree */ + float_t** data_to_send_per_proc = (float_t**)MY_MALLOC(ctx -> world_size * sizeof(float_t*)); + int** local_idx_of_the_point = (int**)MY_MALLOC(ctx -> world_size * sizeof(int*)); + int* point_to_snd_count = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); + int* point_to_snd_capacity = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); + + for(int i = 0; i < ctx -> world_size; ++i) + { + /* allocate it afterwards */ + point_to_snd_capacity[i] = 0; + point_to_snd_count[i] = 0; + } + + /* NEW VERSION double tree walk */ + #pragma omp parallel for schedule(dynamic, 128) + for(int i = 0; i < ctx -> local_n_points; ++i) + { + idx_t idx = local_tree -> __points[i].array_idx; + float_t max_dist = dp_info[idx].ngbh[0].value; + float_t* point = local_tree->__points[i].data; + + tree_walk_v2_find_n_points(ctx, top_tree -> root, idx, max_dist, point, point_to_snd_capacity); + + } + + /* allocate needed space */ + for(int i = 0; i < ctx -> world_size; ++i) + { + int np = point_to_snd_capacity[i]; + data_to_send_per_proc[i] = (float_t*)MY_MALLOC(np * (ctx -> dims) * sizeof(float_t)); + local_idx_of_the_point[i] = (int*)MY_MALLOC(np * sizeof(int)); + + } + + #pragma omp parallel for schedule(dynamic, 128) + for(int i = 0; i < ctx -> local_n_points; ++i) + { + idx_t idx = local_tree -> __points[i].array_idx; + float_t max_dist = dp_info[idx].ngbh[0].value; + float_t* point = local_tree -> __points[i].data; + + tree_walk_v2_append_points(ctx, top_tree -> root, idx, max_dist, point, data_to_send_per_proc, local_idx_of_the_point, point_to_snd_count); + } + + elapsed_time = TIME_STOP; + LOG_WRITE("Finding points to refine", elapsed_time); + + TIME_START; + int* point_to_rcv_count = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); + + /* exchange points to work on*/ + MPI_Alltoall(point_to_snd_count, 1, MPI_INT, point_to_rcv_count, 1, MPI_INT, ctx -> mpi_communicator); + + int* rcv_count = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); + int* snd_count = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); + int* rcv_displ = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); + int* snd_displ = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); + + /*compute counts and displs*/ + rcv_displ[0] = 0; + snd_displ[0] = 0; + + + rcv_count[0] = point_to_rcv_count[0] * (ctx -> dims); + snd_count[0] = point_to_snd_count[0] * (ctx -> dims); + + int tot_points_rcv = point_to_rcv_count[0]; + int tot_points_snd = point_to_snd_count[0]; + int tot_count = rcv_count[0]; + + for(int i = 1; i < ctx -> world_size; ++i) + { + rcv_count[i] = point_to_rcv_count[i] * (ctx -> dims); + snd_count[i] = point_to_snd_count[i] * (ctx -> dims); + + tot_count += rcv_count[i]; + tot_points_rcv += point_to_rcv_count[i]; + tot_points_snd += point_to_snd_count[i]; + + rcv_displ[i] = rcv_displ[i - 1] + rcv_count[i - 1]; + snd_displ[i] = snd_displ[i - 1] + snd_count[i - 1]; + } + + float_t* __rcv_points = (float_t*)MY_MALLOC(tot_points_rcv * (ctx -> dims) * sizeof(float_t)); + float_t* __snd_points = (float_t*)MY_MALLOC(tot_points_snd * (ctx -> dims) * sizeof(float_t)); + + float_t* __max_dist_snd = (float_t*)MY_MALLOC(tot_points_snd * sizeof(float_t)); + float_t* __max_dist_rcv = (float_t*)MY_MALLOC(tot_points_rcv * sizeof(float_t)); + + + + /* copy data to send in contiguous memory */ + for(int i = 0; i < ctx -> world_size; ++i) + { + memcpy(__snd_points + snd_displ[i], data_to_send_per_proc[i], snd_count[i] * sizeof(float_t)); + } + + size_t dist_idx = 0; + for(idx_t i = 0; i < ctx -> world_size; ++i) + { + for(idx_t j = 0; j < point_to_snd_count[i]; ++j) + { + idx_t point_idx = local_idx_of_the_point[i][j]; + // for each point take the distance of its furthest nearest neighbor + // then communicate it in order to prune the search on foreign nodes + __max_dist_snd[dist_idx] = ctx -> local_datapoints[point_idx].ngbh[0].value; + ++dist_idx; + } + } + + MPI_Alltoallv(__snd_points, snd_count, snd_displ, MPI_MY_FLOAT, + __rcv_points, rcv_count, rcv_displ, MPI_MY_FLOAT, ctx -> mpi_communicator); + + float_t** rcv_work_batches = (float_t**)MY_MALLOC(ctx -> world_size * sizeof(float_t*)); + for(int i = 0; i < ctx -> world_size; ++i) + { + rcv_work_batches[i] = __rcv_points + rcv_displ[i]; + } + + MPI_Status status; + int flag; + + for(int i = 0; i < ctx -> world_size; ++i) + { + rcv_count[i] = rcv_count[i] / ctx -> dims; + rcv_displ[i] = rcv_displ[i] / ctx -> dims; + snd_count[i] = snd_count[i] / ctx -> dims; + snd_displ[i] = snd_displ[i] / ctx -> dims; + + } + + MPI_Alltoallv(__max_dist_snd, snd_count, snd_displ, MPI_MY_FLOAT, + __max_dist_rcv, rcv_count, rcv_displ, MPI_MY_FLOAT, ctx -> mpi_communicator); + + /* prepare heap_t batches */ + + //int work_batch_stride = 1 + ctx -> dims; + int work_batch_stride = ctx -> dims; + + /* Note that I then have to recieve an equal number of heaps as the one I sent out before */ + heap_node_t* __heap_batches_to_snd = (heap_node_t*)MY_MALLOC((uint64_t)k * (uint64_t)tot_points_rcv * sizeof(heap_node_t)); + + if( __heap_batches_to_snd == NULL) + { + DB_PRINT("Rank %d failed to allocate snd_heaps %luB required\n",ctx -> mpi_rank, (uint64_t)k * (uint64_t)tot_points_snd * sizeof(heap_node_t)); + exit(1); + } + + MPI_Barrier(ctx -> mpi_communicator); + + + // max dists will contain the maximum distance to have to search + heap_node_t** heap_batches_per_node = (heap_node_t**)MY_MALLOC(ctx -> world_size * sizeof(heap_node_t*)); + float_t** max_dists_per_node = (float_t**)MY_MALLOC(ctx -> world_size * sizeof(float_t*)); + + for(int p = 0; p < ctx -> world_size; ++p) + { + heap_batches_per_node[p] = __heap_batches_to_snd + (uint64_t)rcv_displ[p] * (uint64_t)k; + max_dists_per_node[p] = __max_dist_rcv + (uint64_t)rcv_displ[p]; + } + + /* compute everything */ + elapsed_time = TIME_STOP; + LOG_WRITE("Exchanging points", elapsed_time); + MPI_Barrier(ctx -> mpi_communicator); + + + TIME_START; + + /* ngbh search on recieved points */ + for(int p = 0; p < ctx -> world_size; ++p) + { + if(point_to_rcv_count[p] > 0 && p != ctx -> mpi_rank) + { + #pragma omp parallel for schedule(dynamic, 128) + for(int batch = 0; batch < point_to_rcv_count[p]; ++batch) + { + heap_t H; + H.count = k; + H.size = k; + H.data = heap_batches_per_node[p] + (uint64_t)k * (uint64_t)batch; + float_t max_dist = max_dists_per_node[p][batch]; + for(idx_t i = 0; i < H.size; ++i) + { + H.data[i].value = max_dist; + H.data[i].array_idx = MY_SIZE_MAX; + } + + //float_t* point = rcv_work_batches[p] + batch * work_batch_stride + 1; + idx_t visited_nodes = 0; + float_t* point = rcv_work_batches[p] + (uint64_t)batch * (uint64_t)work_batch_stride; + knn_sub_tree_search(point, local_tree-> __points, local_tree->__pivots, + local_tree->root, &H, local_tree->dims, &visited_nodes); + convert_heap_idx_to_global(ctx, &H); + } + } + } + + /* sendout results */ + + /* + * dummy pointers to clarify counts in this part + * act like an alias for rcv and snd counts + */ + + int* ngbh_to_send = point_to_rcv_count; + int* ngbh_to_recv = point_to_snd_count; + + /* + * counts are inverted since I have to recieve as many batches as points I + * Have originally sended + */ + + elapsed_time = TIME_STOP; + LOG_WRITE("Ngbh search for foreing points", elapsed_time); + + TIME_START; + + MPI_Datatype MPI_my_heap; + MPI_Type_contiguous(k * sizeof(heap_node_t), MPI_BYTE, &MPI_my_heap); + MPI_Barrier(ctx -> mpi_communicator); + MPI_Type_commit(&MPI_my_heap); + + + /* ------------------------------------- + * ALTERNATIVE TO ALL TO ALL FOR BIG MSG + * ------------------------------------- */ + + MPI_Barrier(ctx -> mpi_communicator); + int default_msg_len = MAX_MSG_SIZE / (k * sizeof(heap_node_t)); + + int* already_sent_points = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); + int* already_rcvd_points = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); + + /* allocate a request array to keep track of all requests going out*/ + MPI_Request* req_array; + int req_num = 0; + for(int i = 0; i < ctx -> world_size; ++i) + { + req_num += ngbh_to_send[i] > 0 ? ngbh_to_send[i]/default_msg_len + 1 : 0; + } + + req_array = (MPI_Request*)MY_MALLOC(req_num * sizeof(MPI_Request)); + + + for(int i = 0; i < ctx -> world_size; ++i) + { + already_sent_points[i] = 0; + already_rcvd_points[i] = 0; + } + + int req_idx = 0; + + // find the maximum number of points to send */ + + idx_t max_n_recv = 0; + for(int i = 0; i < ctx -> world_size; ++i) + { + max_n_recv = MAX(max_n_recv, (idx_t)ngbh_to_recv[i]); + } + + MPI_DB_PRINT("Using default message length %lu\n", default_msg_len); + + heap_node_t* __heap_batches_to_rcv = (heap_node_t*)MY_MALLOC((uint64_t)k * (uint64_t)max_n_recv * sizeof(heap_node_t)); + if( __heap_batches_to_rcv == NULL) + { + DB_PRINT("Rank %d failed to allocate rcv_heaps %luB required\n",ctx -> mpi_rank, (uint64_t)k * (uint64_t)max_n_recv* sizeof(heap_node_t)); + exit(1); + } + + /* make a ring */ + + MPI_Barrier(ctx -> mpi_communicator); + for(int i = 1; i < ctx -> world_size; ++i) + { + int rank_to_send = (ctx -> mpi_rank + i) % (ctx -> world_size); + int rank_to_recv = (ctx -> world_size + ctx -> mpi_rank - i) % (ctx -> world_size); + + /* do things */ + + /* send out one batch */ + + #ifdef PRINT_NGBH_EXCHANGE_SCHEME + MPI_DB_PRINT("[--- ROUND %d ----]\n", i); + MPI_Barrier(ctx -> mpi_communicator); + DB_PRINT("[RANK %d] sending to %d tot: %d [%luB]---- recieving from %d %d\n", ctx -> mpi_rank, + rank_to_send, ngbh_to_send[rank_to_send], ngbh_to_send[rank_to_send]*sizeof(heap_node_t), rank_to_recv, ngbh_to_recv[rank_to_recv]); + #endif + if(ngbh_to_send[rank_to_send] > 0) + { + int count_send = 0; + while(already_sent_points[rank_to_send] < ngbh_to_send[rank_to_send]) + { + MPI_Request request; + count_send = MIN((int)default_msg_len, (int)(ngbh_to_send[rank_to_send] - already_sent_points[rank_to_send] )); + + MPI_Isend( heap_batches_per_node[rank_to_send] + k * already_sent_points[rank_to_send], count_send, + MPI_my_heap, rank_to_send, 0, ctx -> mpi_communicator, &request); + + already_sent_points[rank_to_send] += count_send; + req_array[req_idx] = request; + ++req_idx; + } + } + + if( ngbh_to_send[rank_to_send] != already_sent_points[rank_to_send] || + point_to_rcv_count[rank_to_send] != already_sent_points[rank_to_send]) + { + DB_PRINT("ERROR OCCURRED in sending points [rank %d] got %d expected %d\n", + ctx -> mpi_rank, already_rcvd_points[rank_to_send], point_to_rcv_count[rank_to_send]); + } + + MPI_Barrier(ctx -> mpi_communicator); + + if(ngbh_to_recv[rank_to_recv] > 0) + { + flag = 0; + while(already_rcvd_points[rank_to_recv] < ngbh_to_recv[rank_to_recv]) + { + MPI_Probe(rank_to_recv, MPI_ANY_TAG, ctx -> mpi_communicator, &status); + MPI_Request request; + int count_recv; + int source = status.MPI_SOURCE; + MPI_Get_count(&status, MPI_my_heap, &count_recv); + /* recieve each slice */ + + MPI_Recv(__heap_batches_to_rcv + k * already_rcvd_points[rank_to_recv], + count_recv, MPI_my_heap, source, MPI_ANY_TAG, ctx -> mpi_communicator, &status); + + already_rcvd_points[rank_to_recv] += count_recv; + } + } + + if( ngbh_to_recv[rank_to_recv] != already_rcvd_points[rank_to_recv] || + point_to_snd_count[rank_to_recv] != already_rcvd_points[rank_to_recv]) + { + DB_PRINT("ERROR OCCURRED in recieving points [rank %d] got %d expected %d\n", + ctx -> mpi_rank, already_rcvd_points[rank_to_recv], point_to_snd_count[rank_to_recv]); + } + + /* merge lists */ + #pragma omp parallel for schedule(dynamic, 32) + for(int b = 0; b < ngbh_to_recv[rank_to_recv]; ++b) + { + int idx = local_idx_of_the_point[rank_to_recv][b]; + /* retrieve the heap_t */ + heap_t H; + H.count = k; + H.size = k; + H.data = __heap_batches_to_rcv + k*b; + /* insert the points into the heap_t */ + for(int j = 0; j < k; ++j) + { + heap_t tmp_heap; + tmp_heap.size = k; + tmp_heap.count = k; + tmp_heap.data = dp_info[idx].ngbh; + // insert it only of != from max + if(H.data[j].array_idx != MY_SIZE_MAX) max_heap_insert(&(tmp_heap), H.data[j].value, H.data[j].array_idx); + } + } + + + + MPI_Barrier(ctx -> mpi_communicator); + } + + elapsed_time = TIME_STOP; + LOG_WRITE("Merging results", elapsed_time); + + + MPI_Testall(req_idx, req_array, &flag, MPI_STATUSES_IGNORE); + + if(flag == 0) + { + DB_PRINT("ERROR OCCURRED Rank %d has unfinished communications\n", ctx -> mpi_rank); + exit(1); + } + free(req_array); + free(already_sent_points); + free(already_rcvd_points); + + /* -------------------------------------------------------- */ + /* heapsort them */ + + TIME_START; + + #pragma omp parallel for schedule(dynamic, 128) + for(int i = 0; i < ctx -> local_n_points; ++i) + { + heap_t tmp_heap; + tmp_heap.size = k; + tmp_heap.count = k; + tmp_heap.data = dp_info[i].ngbh; + + heap_sort(&(tmp_heap)); + } + + elapsed_time = TIME_STOP; + LOG_WRITE("Sorting negihborhoods", elapsed_time); + + + #if defined(WRITE_NGBH) + MPI_DB_PRINT("Writing ngbh to files\n"); + char ngbh_out[80]; + sprintf(ngbh_out, "./bb/rank_%d.ngbh",ctx -> mpi_rank); + FILE* file = fopen(ngbh_out,"w"); + if(!file) + { + printf("Cannot open file %s\n",ngbh_out); + } + else + { + for(int i = 0; i < ctx -> local_n_points; ++i) + { + fwrite(dp_info[i].ngbh, sizeof(heap_node_t), k, file); + } + fclose(file); + } + #endif + + MPI_Barrier(ctx -> mpi_communicator); + + + #if defined(WRITE_SHUFFLED_DATA) + float_t* data_to_write = (float_t*)MY_MALLOC(ctx -> local_n_points * ctx -> k * sizeof(float_t)); + for(int i = 0; i < ctx -> local_n_points; ++i) + { + idx_t idx = local_tree -> __points[i].array_idx; + memcpy(data_to_write + idx*ctx -> dims, local_tree -> __points[i].data, ctx -> dims*sizeof(float_t)); + } + + big_ordered_buffer_to_file(ctx, data_to_write, sizeof(float_t), ctx -> local_n_points * ctx -> dims, "bb/ordered_data.npy"); + free(data_to_write); + #endif + + for(int i = 0; i < ctx -> world_size; ++i) + { + if(data_to_send_per_proc[i]) free(data_to_send_per_proc[i]); + if(local_idx_of_the_point[i]) free(local_idx_of_the_point[i]); + } + + free(data_to_send_per_proc); + free(local_idx_of_the_point); + free(heap_batches_per_node); + free(max_dists_per_node); + //free(rcv_heap_batches); + free(rcv_work_batches); + free(point_to_rcv_count); + free(point_to_snd_count); + free(point_to_snd_capacity); + + free(rcv_count); + free(snd_count); + free(rcv_displ); + free(snd_displ); + free(__heap_batches_to_rcv); + free(__heap_batches_to_snd); + free(__rcv_points); + free(__snd_points); + free(__max_dist_snd); + free(__max_dist_rcv); + } -void build_local_tree(global_context_t* ctx, kdtree_v2* local_tree) +void build_local_tree(global_context_t* ctx, kdtree_t* local_tree) { - local_tree -> root = build_tree_kdtree_v2(local_tree -> _nodes, local_tree -> n_nodes, ctx -> dims); + build_tree_kdtree(local_tree); } diff --git a/src/tree/tree.h b/src/tree/tree.h index 1d84445..4fd2fd6 100644 --- a/src/tree/tree.h +++ b/src/tree/tree.h @@ -1,9 +1,9 @@ #pragma once #include "../common/common.h" #include "heap.h" -#include "kdtreeV2.h" - +#include "kdtree.h" #include + #include #include #include @@ -52,9 +52,13 @@ typedef struct partition_t int n_procs; int start_proc; size_t n_points; - float_t* base_ptr; int lr; struct top_kdtree_node_t* parent; + //support for parallel partition to be put into a union maybe + idx_t offset; + //support for single thread paritition to be put into a union maybe + point_t* base_ptr; + } partition_t; typedef struct partition_queue_t @@ -87,19 +91,33 @@ typedef struct top_kdtree_t struct top_kdtree_node_t* root; } top_kdtree_t; +typedef struct partition_utils_t +{ + size_t* lt_count; + size_t* gt_count; + size_t* lt_displ; + size_t* gt_displ; +} partition_utils_t; + void simulate_master_read_and_scatter(int, size_t, global_context_t* ); void top_tree_init(global_context_t *ctx, top_kdtree_t *tree); -void build_top_kdtree(global_context_t *ctx, pointset_t *og_pointset, top_kdtree_t *tree, int n_bins, float_t tolerance); +void build_top_kdtree(global_context_t *ctx, top_kdtree_t *tree, idx_t n_bins, float_t tolerance); void exchange_points(global_context_t* ctx, top_kdtree_t* tree); -void build_local_tree(global_context_t* ctx, kdtree_v2* local_tree); -void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtree_t* top_tree, kdtree_v2* local_tree, float_t* data, int k); +void build_local_tree(global_context_t* ctx, kdtree_t* local_tree); +size_t partition_data_around_value(point_t *array, size_t vec_len, size_t compare_dim, size_t left, size_t right, float_t pivot_value); +size_t parallel_partition_data_around_value(partition_utils_t* p_putils, float_t* in, + float_t* out, size_t vec_len, size_t compare_dim, + size_t left, size_t right, float_t pivot_value); +void mpi_all_knn_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtree_t* top_tree, kdtree_t* local_tree, int k); float_t compute_ID_two_NN_ML(global_context_t* ctx, datapoint_info_t* dp_info, idx_t n, int verbose); void compute_density_kstarnn_rma_v2(global_context_t* ctx, const float_t d, int verbose); void compute_correction(global_context_t* ctx, float_t Z); int foreign_owner(global_context_t*, idx_t); void top_tree_free(global_context_t *ctx, top_kdtree_t *tree); +void parallel_build_top_kdtree(global_context_t *ctx, top_kdtree_t *tree, idx_t oversampling); +void mpi_knn_search(global_context_t* ctx, top_kdtree_t* top_tree, kdtree_t* local_tree, float_t* data, idx_t n_data, idx_t vec_len, int k); -- GitLab From 88d661af5d326620a684986312bcc6013664ce16 Mon Sep 17 00:00:00 2001 From: Francesco Tomba Date: Tue, 14 Apr 2026 11:40:28 +0200 Subject: [PATCH 2/2] got 10d 1B --- run_leo | 13 +- src/common/common.c | 1 + src/common/common.h | 10 +- src/main/main.c | 222 +++++++++++------ src/tree/kdtree.h | 11 +- src/tree/tree.c | 529 ++++++++++++++++++++++++++++++++++++++++- src/tree/tree.h | 6 +- src/tree/vptree.h | 563 ++++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 1267 insertions(+), 88 deletions(-) create mode 100644 src/tree/vptree.h diff --git a/run_leo b/run_leo index 497cdb7..efc8a39 100644 --- a/run_leo +++ b/run_leo @@ -31,13 +31,18 @@ export PSM2_MQ_RECVREQS_MAX=268435456 rm bb/* mkdir bb -OUT_ASSIGNMENT=/leonardo_scratch/large/userexternal/ftomba00/assignment -OUT_DATA=/leonardo_scratch/large/userexternal/ftomba00/data - +OUT_DIR=/leonardo_scratch/large/userexternal/ftomba00/exp1M IN_DATA=/leonardo_work/EUHPC_D18_045 +rm -rf ${OUT_DIR} +mkdir ${OUT_DIR} + + #10^6 points -time mpirun -n ${SLURM_NTASKS} --map-by ppr:1:socket:PE=${SLURM_CPUS_PER_TASK} ./main -t f32 -k 100 -i ${IN_DATA}/norm_data/std_LR_091_0001 -d 5 -a ${OUT_ASSIGNMENT} -o ${OUT_DATA} +time mpirun -n ${SLURM_NTASKS} --map-by ppr:1:socket:PE=${SLURM_CPUS_PER_TASK} ./main -t f32 -k 100 -i ${IN_DATA}/norm_data/std_LR_091_0001 -d 5 -o ${OUT_DIR} + + +## to modify those one below #time mpirun -n ${SLURM_NTASKS} --map-by core ./main -t f32 -i ${IN_DATA}/norm_data/std_LR_091_0001 -d 5 -a ${OUT_ASSIGNMENT} -o ${OUT_DATA} #34 * 10^6 points diff --git a/src/common/common.c b/src/common/common.c index 3a5d155..ee9e766 100644 --- a/src/common/common.c +++ b/src/common/common.c @@ -22,6 +22,7 @@ void get_context(global_context_t* ctx) ctx -> local_datapoints = NULL; ctx -> __local_heap_buffers = NULL; ctx -> input_data_in_float32 = -1; + ctx -> local_tree_type = KD; ctx -> dims = 0; ctx -> k = 300; ctx -> z = 3; diff --git a/src/common/common.h b/src/common/common.h index ad6cd47..c92c2b2 100644 --- a/src/common/common.h +++ b/src/common/common.h @@ -11,16 +11,16 @@ //#include // #define PARALLEL_FIX_BORDERS -#define WRITE_SHUFFLED_DATA -#define WRITE_NGBH +// #define WRITE_SHUFFLED_DATA +// #define WRITE_NGBH // #define WRITE_TOP_NODES -#define WRITE_DENSITY +// #define WRITE_DENSITY // #define WRITE_CLUSTER_ASSIGN_H1 // #define WRITE_BORDERS // #define WRITE_CENTERS_PRE_MERGING // #define WRITE_MERGES_INFO // #define WRITE_MERGING_TABLE -#define WRITE_FINAL_ASSIGNMENT +// #define WRITE_FINAL_ASSIGNMENT // #define PRINT_NGBH_EXCHANGE_SCHEME // #define PRINT_H2_COMM_SCHEME @@ -162,6 +162,7 @@ typedef struct datapoint_info_t { float_t log_rho_err; // } datapoint_info_t; +enum local_tree_type {VP, KD}; typedef struct global_context_t { @@ -188,6 +189,7 @@ typedef struct global_context_t idx_t* og_idxs; //original indexes heap_node_t* __local_heap_buffers; //buffer that stores nearest neighbors MPI_Comm mpi_communicator; //mpi communicator + enum local_tree_type local_tree_type; //local tree strategy int input_data_in_float32; char input_data_file[DEFAULT_STR_LEN]; char output_assignment_file[DEFAULT_STR_LEN]; diff --git a/src/main/main.c b/src/main/main.c index c82e660..039eaa8 100644 --- a/src/main/main.c +++ b/src/main/main.c @@ -8,29 +8,6 @@ #include #include - -#ifdef AMONRA - #pragma message "Hi, you are on amonra" - #define OUT_CLUSTER_ASSIGN "/beegfs/ftomba/phd/results/final_assignment.npy" - #define OUT_DATA "/beegfs/ftomba/phd/results/ordered_data.npy" -#endif - -#ifdef LEONARDO - #define OUT_CLUSTER_ASSIGN "/leonardo_scratch/large/userexternal/ftomba00/out_dadp/final_assignment.npy" - #define OUT_DATA "/leonardo_scratch/large/userexternal/ftomba00/out_dadp/ordered_data.npy" -#endif - -#ifdef LUMI - #define OUT_CLUSTER_ASSIGN "~/scratch_dadp/out_dadp/final_assignment.npy" - #define OUT_DATA "~/scratch_dadp/out_dadp/ordered_data.npy" -#endif - -#ifndef OUT_CLUSTER_ASSIGN - #define OUT_CLUSTER_ASSIGN "final_assignment.npy" - #define OUT_DATA "ordered_data.npy" -#endif - - #ifdef THREAD_FUNNELED #define THREAD_LEVEL MPI_THREAD_FUNNELED #else @@ -43,10 +20,10 @@ struct option long_options[] = {"in-data" , required_argument, 0, 'i'}, {"in-dtype", required_argument, 0, 't'}, {"in-dims" , required_argument, 0, 'd'}, - {"out-data", optional_argument, 0, 'o'}, - {"out-assignment", optional_argument, 0, 'a'}, + {"out-directory", required_argument, 0, 'o'}, {"kngbh", optional_argument, 0, 'k'}, {"z", optional_argument, 0, 'z'}, + {"knn-strategy", optional_argument, 0, 's'}, {"help", optional_argument, 0, 'h'}, {0, 0, 0, 0} }; @@ -58,12 +35,10 @@ const char* help = "Distributed Advanced Density Peak\n"\ " -d --in-dims (required) number of dimensions of the data file, dadp expects something\n"\ " of the form N x d where N is inferred from the lenght of the\n"\ " data file\n"\ - " -o --out-data (optional) output path for the data, the datafile is shuffled between\n"\ - " mpi tasks and datapoints are ordered default is `out_data` \n"\ - " -a --out-assignment (optional) output path for the cluster assignment output ranges [0 ... Nc - 1]\n"\ - " for core points halo points have indices [-Nc ... -1] conversion\n"\ - " of idx for an halo point is cluster_idx = -halo_idx - 1, default is `out_assignment`\n"\ + " -o --out-directory (required) output path for the data, cluster assignment and original indices \n"\ + " of the datapoints. Produces files like `data.[rank]`, `og_idx.[rank]` and `assignment.[rank]`\n"\ " -k --kngbh (optional) number of nearest neighbors to compute\n"\ + " -s --knn-strategy (optional) strategy for local knn (allowed values 'kd' for kdtree, 'vp' for ball-tree (vantage point)\n"\ " -z --z (optional) number of nearest neighbors to compute\n"; void parse_args(global_context_t* ctx, int argc, char** argv) @@ -72,10 +47,8 @@ void parse_args(global_context_t* ctx, int argc, char** argv) int opt; int input_file_set = 0; int input_type_set = 0; - snprintf(ctx -> output_assignment_file, DEFAULT_STR_LEN, "%s", OUT_CLUSTER_ASSIGN); - snprintf(ctx -> output_data_file, DEFAULT_STR_LEN, "%s", OUT_DATA); - while((opt = getopt_long(argc, argv, "i:t:d:o:a:k:z:h", long_options, NULL)) != -1) + while((opt = getopt_long(argc, argv, "i:t:d:o:k:z:hs:", long_options, NULL)) != -1) { switch(opt) { @@ -105,9 +78,6 @@ void parse_args(global_context_t* ctx, int argc, char** argv) case 'o': strncpy(ctx -> output_data_file, optarg, DEFAULT_STR_LEN); break; - case 'a': - strncpy(ctx -> output_assignment_file, optarg, DEFAULT_STR_LEN); - break; case 'k': ctx -> k = atoi(optarg); break; @@ -118,6 +88,22 @@ void parse_args(global_context_t* ctx, int argc, char** argv) mpi_printf(ctx, "%s\n", help); MPI_Finalize(); exit(0); + + case 's': + if(strncmp("vp", optarg, 2) == 0) + { + ctx -> local_tree_type = VP; + } + else if(strncmp("kd", optarg, 2) == 0) + { + ctx -> local_tree_type = KD; + } + else + { + printf("Cannot find valid local tree type selection, exiting\n"); + exit(0); + } + break; default: mpi_printf(ctx, "%s\n", help); @@ -161,8 +147,7 @@ void print_hello(global_context_t* ctx) mpi_printf(ctx, "Data file .............> %s\n", ctx -> input_data_file); mpi_printf(ctx, "Input Type .............> float%d\n", ctx -> input_data_in_float32 ? 32 : 64); mpi_printf(ctx, "Dimensions .............> %d\n", ctx -> dims); - mpi_printf(ctx, "Output data file .......> %s\n", ctx -> output_data_file); - mpi_printf(ctx, "Output assignment file -> %s\n", ctx -> output_assignment_file); + mpi_printf(ctx, "Output directory .......> %s\n", ctx -> output_data_file); mpi_printf(ctx, "k ......................> %lu\n", ctx -> k); mpi_printf(ctx, "Z ......................> %.2lf\n", ctx -> z); mpi_printf(ctx, "\nRUNNING!\n"); @@ -255,15 +240,33 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx) int halo = MY_TRUE; float_t tol = 0.002; - if(I_AM_MASTER && ctx -> world_size <= 6) + if(ctx -> world_size <= 6) { - test_file_path(ctx -> output_data_file); - test_file_path(ctx -> output_assignment_file); + if(I_AM_MASTER) + { + char out_file[200]; + + snprintf(out_file, 200, "%s/og_idx", ctx->output_data_file); + test_file_path(out_file); + + snprintf(out_file, 200, "%s/assignment", ctx->output_data_file); + test_file_path(out_file); + + snprintf(out_file, 200, "%s/data", ctx->output_data_file); + test_file_path(out_file); + } } else { - test_distributed_file_path(ctx, ctx -> output_data_file); - test_distributed_file_path(ctx, ctx -> output_assignment_file); + char out_file[200]; + snprintf(out_file, 200, "%s/og_idx", ctx->output_data_file); + test_distributed_file_path(ctx, out_file); + + snprintf(out_file, 200, "%s/assignment", ctx->output_data_file); + test_distributed_file_path(ctx, out_file); + + snprintf(out_file, 200, "%s/data", ctx->output_data_file); + test_distributed_file_path(ctx, out_file); } @@ -358,10 +361,6 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx) elapsed_time = TIME_STOP; LOG_WRITE("Top kdtree build and domain decomposition", elapsed_time); - TIME_START; - kdtree_t local_tree; - kdtree_initialize( &local_tree, ctx -> local_data, ctx -> local_n_points, (unsigned int)ctx -> dims); - datapoint_info_t* dp_info = (datapoint_info_t*)MY_MALLOC(ctx -> local_n_points * sizeof(datapoint_info_t)); for(uint64_t i = 0; i < ctx -> local_n_points; ++i) { @@ -378,22 +377,64 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx) } ctx -> local_datapoints = dp_info; - build_tree_kdtree_parallel(&local_tree); - // build_tree_kdtree_sample(&local_tree); - ctx -> local_data = local_tree.data; - - elapsed_time = TIME_STOP; - LOG_WRITE("Local trees init and build", elapsed_time); - TIME_START; - MPI_DB_PRINT("----- Performing ngbh search -----\n"); - MPI_Barrier(ctx -> mpi_communicator); - mpi_all_knn_search(ctx, dp_info, &tree, &local_tree, ctx -> k); + void* opaque_local_tree; + + switch(ctx -> local_tree_type) + { + case KD: + { + kdtree_t local_tree; + kdtree_initialize( &local_tree, ctx -> local_data, ctx -> local_n_points, (unsigned int)ctx -> dims); + + + build_tree_kdtree_parallel(&local_tree); + // build_tree_kdtree_sample(&local_tree); + ctx -> local_data = local_tree.data; + + elapsed_time = TIME_STOP; + LOG_WRITE("Local kdtrees init and build", elapsed_time); + + TIME_START; + MPI_DB_PRINT("----- Performing ngbh search -----\n"); + MPI_Barrier(ctx -> mpi_communicator); + + mpi_all_knn_search_kdtree(ctx, dp_info, &tree, &local_tree, ctx -> k); + + MPI_Barrier(ctx -> mpi_communicator); + elapsed_time = TIME_STOP; + LOG_WRITE("Total time for all knn search w. local kdtrees", elapsed_time); + opaque_local_tree = &local_tree; + } + break; + case VP: + { + vptree_t local_tree; + vptree_initialize( &local_tree, ctx -> local_data, ctx -> local_n_points, (unsigned int)ctx -> dims); + + + build_tree_vptree(&local_tree); + // build_tree_kdtree_sample(&local_tree); + ctx -> local_data = local_tree.data; + + elapsed_time = TIME_STOP; + LOG_WRITE("Vptrees init and build", elapsed_time); + + TIME_START; + MPI_DB_PRINT("----- Performing ngbh search -----\n"); + MPI_Barrier(ctx -> mpi_communicator); + + mpi_all_knn_search_vptree(ctx, dp_info, &tree, &local_tree, ctx -> k); + + MPI_Barrier(ctx -> mpi_communicator); + elapsed_time = TIME_STOP; + LOG_WRITE("Total time for all knn search w. vptrees", elapsed_time); + opaque_local_tree = &local_tree; + } + break; + } - MPI_Barrier(ctx -> mpi_communicator); - elapsed_time = TIME_STOP; - LOG_WRITE("Total time for all knn search", elapsed_time) TIME_START; @@ -434,40 +475,77 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx) TIME_START; - int* cl = (int*)MY_MALLOC(ctx -> local_n_points * sizeof(int)); + int* cl = (int*)MY_MALLOC(2 * ctx -> local_n_points * sizeof(int)); float_t* data_to_write = (float_t*)MY_MALLOC(ctx -> local_n_points * ctx -> k * sizeof(float_t)); - for(int i = 0; i < ctx -> local_n_points; ++i) + switch(ctx -> local_tree_type) { - cl[i] = ctx -> local_datapoints[i].cluster_idx; - idx_t idx = local_tree.__points[i].array_idx; - memcpy(data_to_write + idx*ctx -> dims, local_tree.__points[i].data, ctx -> dims*sizeof(float_t)); + case KD: + { + kdtree_t local_tree = *((kdtree_t*)opaque_local_tree); + for(int i = 0; i < ctx -> local_n_points; ++i) + { + cl[i] = ctx -> local_datapoints[i].cluster_idx; + idx_t idx = local_tree.__points[i].array_idx; + memcpy(data_to_write + idx*ctx -> dims, local_tree.__points[i].data, ctx -> dims*sizeof(float_t)); + } + kdtree_free(&local_tree); + } + break; + + case VP: + { + vptree_t local_tree = *((vptree_t*)opaque_local_tree); + for(int i = 0; i < ctx -> local_n_points; ++i) + { + cl[i] = ctx -> local_datapoints[i].cluster_idx; + idx_t idx = local_tree.__points[i].array_idx; + memcpy(data_to_write + idx*ctx -> dims, local_tree.__points[i].data, ctx -> dims*sizeof(float_t)); + } + vptree_free(&local_tree); + } + break; + + default: + break; } if(ctx -> world_size <= 32) { - big_ordered_buffer_to_file(ctx, cl, sizeof(int), ctx -> local_n_points, ctx -> output_assignment_file); - big_ordered_buffer_to_file(ctx, data_to_write, sizeof(double), ctx -> local_n_points * ctx -> dims, ctx -> output_data_file); + char out_file[200]; + + snprintf(out_file, 200, "%s/og_idx", ctx->output_data_file); + big_ordered_buffer_to_file(ctx, ctx->og_idxs, sizeof(idx_t), ctx -> local_n_points, out_file); - free(cl); + snprintf(out_file, 200, "%s/assignment", ctx->output_data_file); + big_ordered_buffer_to_file(ctx, cl, sizeof(int), ctx -> local_n_points, out_file); + snprintf(out_file, 200, "%s/data", ctx->output_data_file); + big_ordered_buffer_to_file(ctx, data_to_write, sizeof(double), ctx -> local_n_points * ctx -> dims, out_file); } else { - distributed_buffer_to_file(ctx, cl, sizeof(int), ctx -> local_n_points, ctx -> output_assignment_file); - distributed_buffer_to_file(ctx, data_to_write, sizeof(double), ctx -> local_n_points * ctx -> dims, ctx -> output_data_file); + char out_file[200]; - free(cl); + snprintf(out_file, 200, "%s/og_idx", ctx->output_data_file); + distributed_buffer_to_file(ctx, ctx->og_idxs, sizeof(idx_t), ctx -> local_n_points, out_file); + snprintf(out_file, 200, "%s/assignment", ctx->output_data_file); + distributed_buffer_to_file(ctx, cl, sizeof(int), ctx -> local_n_points, out_file); + + snprintf(out_file, 200, "%s/data", ctx->output_data_file); + distributed_buffer_to_file(ctx, data_to_write, sizeof(double), ctx -> local_n_points * ctx -> dims, out_file); } + + elapsed_time = TIME_STOP; LOG_WRITE("Write results to file", elapsed_time); + free(cl); free(data_to_write); top_tree_free(ctx, &tree); - kdtree_free(&local_tree); //clusters_free(&clusters); free(send_counts); diff --git a/src/tree/kdtree.h b/src/tree/kdtree.h index 8a0c9be..56062b2 100644 --- a/src/tree/kdtree.h +++ b/src/tree/kdtree.h @@ -10,7 +10,7 @@ #include #include -#define DEFAULT_LEAF_SIZE 32 +#define DEFAULT_LEAF_SIZE 256 #define ALIGNMENT 64 #define CHECK_ALLOCATION_NO_CTX(x) if(!x){printf("[!!!] Failed allocation: %s at line %d \n", __FILE__, __LINE__ ); exit(1);} @@ -49,6 +49,15 @@ #define MAX(x,y) ((x > y) ? (x) : (y)) #endif +#ifndef POINT + #define POINT + typedef struct point_t + { + idx_t array_idx; + float_t* data; + } point_t; +#endif + typedef struct { float_t* lb; diff --git a/src/tree/tree.c b/src/tree/tree.c index 4dc46cf..3f8b3e4 100644 --- a/src/tree/tree.c +++ b/src/tree/tree.c @@ -11,6 +11,7 @@ #include "heap.h" #include "kdtree.h" #include "mpi.h" +#include "vptree.h" #include #include #include @@ -56,7 +57,7 @@ void swap_data_element(point_t *a, point_t *b, size_t vec_len) { idx_t tmp_dp; tmp_dp = a->array_idx; a->array_idx = b->array_idx; - a->array_idx = tmp_dp; + b->array_idx = tmp_dp; float_t tmp; for (size_t i = 0; i < vec_len; ++i) @@ -934,7 +935,7 @@ void parallel_build_top_kdtree(global_context_t *ctx, top_kdtree_t *tree, idx_t // size_t tot_n_points = 0; // MPI_Allreduce(&(ctx->local_n_points), &tot_n_points, 1, MPI_UINT64_T, MPI_SUM, ctx->mpi_communicator); - MPI_Alltoall(&(ctx->local_n_points), 1, MPI_UINT64_T, ctx->rank_n_points, 1, MPI_UINT64_T, ctx->mpi_communicator); + MPI_Allgather(&(ctx->local_n_points), 1, MPI_UINT64_T, ctx->rank_n_points, 1, MPI_UINT64_T, ctx->mpi_communicator); ctx->rank_idx_start[0] = 0; for(int i = 1; i < ctx->world_size; ++i) @@ -1035,7 +1036,7 @@ void build_top_kdtree(global_context_t *ctx, top_kdtree_t *tree, idx_t n_bins, f size_t tot_n_points = 0; MPI_Allreduce(&(ctx->local_n_points), &tot_n_points, 1, MPI_UINT64_T, MPI_SUM, ctx->mpi_communicator); - MPI_Alltoall(&(ctx->local_n_points), 1, MPI_UINT64_T, ctx->rank_n_points, 1, MPI_UINT64_T, ctx->mpi_communicator); + MPI_Allgather(&(ctx->local_n_points), 1, MPI_UINT64_T, ctx->rank_n_points, 1, MPI_UINT64_T, ctx->mpi_communicator); ctx->rank_idx_start[0] = 0; for(int i = 1; i < ctx->world_size; ++i) @@ -1362,7 +1363,6 @@ void exchange_points(global_context_t* ctx, top_kdtree_t* tree) int len = ctx -> local_n_points; float_t* curr_data = ctx -> local_data; - HERE partition_offset[0] = 0; for(int owner = 1; owner < ctx -> world_size; ++owner) { @@ -2385,8 +2385,521 @@ void mpi_knn_search(global_context_t* ctx, top_kdtree_t* top_tree, kdtree_t* loc free(__local_heap_buffers); } +void mpi_all_knn_search_vptree(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtree_t* top_tree, vptree_t* local_tree, int k) +{ + /* local search */ + /* print diagnostics */ + print_diagnositcs(ctx, k); + ctx -> k = (idx_t)k; + + TIME_DEF; + double elapsed_time; + + TIME_START; + //printf("rank %d local_n_points %lu\n", ctx -> mpi_rank, ctx -> local_n_points); + MPI_Barrier(ctx -> mpi_communicator); + //ctx -> __local_heap_buffers = (heap_node_t*)MY_MALLOC(ctx -> local_n_points * k * sizeof(heap_node)); + MPI_Alloc_mem(ctx -> local_n_points * k * sizeof(heap_node_t), MPI_INFO_NULL, &(ctx -> __local_heap_buffers)); + float_t avg_visited_nodes = 0; + + #pragma omp parallel for schedule(dynamic, 32) reduction(+:avg_visited_nodes) + for(int p = 0; p < ctx -> local_n_points; ++p) + { + idx_t idx = local_tree -> __points[p].array_idx; + /* actually we want to preserve the heap_t to then insert guesses from other nodes */ + heap_t tmp_heap; + tmp_heap.data = ctx -> __local_heap_buffers + k * idx; + tmp_heap.count = 0; + tmp_heap.size = k; + + //dp_info[idx].ngbh = knn_kdtree_t_no_heapsort(local_tree -> _nodes[p].data, local_tree -> root, k); + + idx_t visited_nodes = 0; + knn_vptree_search(local_tree->__points[p].data, local_tree-> __points, local_tree->__nodes, + local_tree->root, &tmp_heap, local_tree->dims, &visited_nodes); + + convert_heap_idx_to_global(ctx, &tmp_heap); + dp_info[idx].cluster_idx = -1; + dp_info[idx].is_center = 0; + dp_info[idx].array_idx = idx + ctx -> idx_start; + dp_info[idx].ngbh = tmp_heap.data; + avg_visited_nodes += (float_t)visited_nodes/(float_t)ctx->local_n_points; + } + + MPI_PRINT("AVG visited nodes on master %lf\n", avg_visited_nodes); + elapsed_time = TIME_STOP; + LOG_WRITE("Local neighborhood search", elapsed_time); + //printf("rank %d elapsed_time %lf\n", ctx->mpi_rank, elapsed_time); + + + TIME_START; + /* find if a points needs a refine on the global tree */ + float_t** data_to_send_per_proc = (float_t**)MY_MALLOC(ctx -> world_size * sizeof(float_t*)); + int** local_idx_of_the_point = (int**)MY_MALLOC(ctx -> world_size * sizeof(int*)); + int* point_to_snd_count = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); + int* point_to_snd_capacity = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); + + for(int i = 0; i < ctx -> world_size; ++i) + { + /* allocate it afterwards */ + point_to_snd_capacity[i] = 0; + point_to_snd_count[i] = 0; + } + + /* NEW VERSION double tree walk */ + #pragma omp parallel for schedule(dynamic, 128) + for(int i = 0; i < ctx -> local_n_points; ++i) + { + idx_t idx = local_tree -> __points[i].array_idx; + float_t max_dist = dp_info[idx].ngbh[0].value; + float_t* point = local_tree->__points[i].data; + + tree_walk_v2_find_n_points(ctx, top_tree -> root, idx, max_dist, point, point_to_snd_capacity); + + } + + /* allocate needed space */ + for(int i = 0; i < ctx -> world_size; ++i) + { + int np = point_to_snd_capacity[i]; + data_to_send_per_proc[i] = (float_t*)MY_MALLOC(np * (ctx -> dims) * sizeof(float_t)); + local_idx_of_the_point[i] = (int*)MY_MALLOC(np * sizeof(int)); + + } + + #pragma omp parallel for schedule(dynamic, 128) + for(int i = 0; i < ctx -> local_n_points; ++i) + { + idx_t idx = local_tree -> __points[i].array_idx; + float_t max_dist = dp_info[idx].ngbh[0].value; + float_t* point = local_tree -> __points[i].data; + + tree_walk_v2_append_points(ctx, top_tree -> root, idx, max_dist, point, data_to_send_per_proc, local_idx_of_the_point, point_to_snd_count); + } + + elapsed_time = TIME_STOP; + LOG_WRITE("Finding points to refine", elapsed_time); + + TIME_START; + int* point_to_rcv_count = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); + + /* exchange points to work on*/ + MPI_Alltoall(point_to_snd_count, 1, MPI_INT, point_to_rcv_count, 1, MPI_INT, ctx -> mpi_communicator); + + int* rcv_count = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); + int* snd_count = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); + int* rcv_displ = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); + int* snd_displ = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); + + /*compute counts and displs*/ + rcv_displ[0] = 0; + snd_displ[0] = 0; + + + rcv_count[0] = point_to_rcv_count[0] * (ctx -> dims); + snd_count[0] = point_to_snd_count[0] * (ctx -> dims); + + int tot_points_rcv = point_to_rcv_count[0]; + int tot_points_snd = point_to_snd_count[0]; + int tot_count = rcv_count[0]; + + for(int i = 1; i < ctx -> world_size; ++i) + { + rcv_count[i] = point_to_rcv_count[i] * (ctx -> dims); + snd_count[i] = point_to_snd_count[i] * (ctx -> dims); + + tot_count += rcv_count[i]; + tot_points_rcv += point_to_rcv_count[i]; + tot_points_snd += point_to_snd_count[i]; + + rcv_displ[i] = rcv_displ[i - 1] + rcv_count[i - 1]; + snd_displ[i] = snd_displ[i - 1] + snd_count[i - 1]; + } + + float_t* __rcv_points = (float_t*)MY_MALLOC(tot_points_rcv * (ctx -> dims) * sizeof(float_t)); + float_t* __snd_points = (float_t*)MY_MALLOC(tot_points_snd * (ctx -> dims) * sizeof(float_t)); + + float_t* __max_dist_snd = (float_t*)MY_MALLOC(tot_points_snd * sizeof(float_t)); + float_t* __max_dist_rcv = (float_t*)MY_MALLOC(tot_points_rcv * sizeof(float_t)); + + + + /* copy data to send in contiguous memory */ + for(int i = 0; i < ctx -> world_size; ++i) + { + memcpy(__snd_points + snd_displ[i], data_to_send_per_proc[i], snd_count[i] * sizeof(float_t)); + } + + size_t dist_idx = 0; + for(idx_t i = 0; i < ctx -> world_size; ++i) + { + for(idx_t j = 0; j < point_to_snd_count[i]; ++j) + { + idx_t point_idx = local_idx_of_the_point[i][j]; + // for each point take the distance of its furthest nearest neighbor + // then communicate it in order to prune the search on foreign nodes + __max_dist_snd[dist_idx] = ctx -> local_datapoints[point_idx].ngbh[0].value; + ++dist_idx; + } + } + + MPI_Alltoallv(__snd_points, snd_count, snd_displ, MPI_MY_FLOAT, + __rcv_points, rcv_count, rcv_displ, MPI_MY_FLOAT, ctx -> mpi_communicator); + + float_t** rcv_work_batches = (float_t**)MY_MALLOC(ctx -> world_size * sizeof(float_t*)); + for(int i = 0; i < ctx -> world_size; ++i) + { + rcv_work_batches[i] = __rcv_points + rcv_displ[i]; + } + + MPI_Status status; + int flag; + + for(int i = 0; i < ctx -> world_size; ++i) + { + rcv_count[i] = rcv_count[i] / ctx -> dims; + rcv_displ[i] = rcv_displ[i] / ctx -> dims; + snd_count[i] = snd_count[i] / ctx -> dims; + snd_displ[i] = snd_displ[i] / ctx -> dims; + + } + + MPI_Alltoallv(__max_dist_snd, snd_count, snd_displ, MPI_MY_FLOAT, + __max_dist_rcv, rcv_count, rcv_displ, MPI_MY_FLOAT, ctx -> mpi_communicator); + + /* prepare heap_t batches */ + + //int work_batch_stride = 1 + ctx -> dims; + int work_batch_stride = ctx -> dims; + + /* Note that I then have to recieve an equal number of heaps as the one I sent out before */ + heap_node_t* __heap_batches_to_snd = (heap_node_t*)MY_MALLOC((uint64_t)k * (uint64_t)tot_points_rcv * sizeof(heap_node_t)); + + if( __heap_batches_to_snd == NULL) + { + DB_PRINT("Rank %d failed to allocate snd_heaps %luB required\n",ctx -> mpi_rank, (uint64_t)k * (uint64_t)tot_points_snd * sizeof(heap_node_t)); + exit(1); + } + + MPI_Barrier(ctx -> mpi_communicator); + + + // max dists will contain the maximum distance to have to search + heap_node_t** heap_batches_per_node = (heap_node_t**)MY_MALLOC(ctx -> world_size * sizeof(heap_node_t*)); + float_t** max_dists_per_node = (float_t**)MY_MALLOC(ctx -> world_size * sizeof(float_t*)); + + for(int p = 0; p < ctx -> world_size; ++p) + { + heap_batches_per_node[p] = __heap_batches_to_snd + (uint64_t)rcv_displ[p] * (uint64_t)k; + max_dists_per_node[p] = __max_dist_rcv + (uint64_t)rcv_displ[p]; + } + + /* compute everything */ + elapsed_time = TIME_STOP; + LOG_WRITE("Exchanging points", elapsed_time); + MPI_Barrier(ctx -> mpi_communicator); + + + TIME_START; + + /* ngbh search on recieved points */ + for(int p = 0; p < ctx -> world_size; ++p) + { + if(point_to_rcv_count[p] > 0 && p != ctx -> mpi_rank) + { + #pragma omp parallel for schedule(dynamic, 128) + for(int batch = 0; batch < point_to_rcv_count[p]; ++batch) + { + heap_t H; + H.count = k; + H.size = k; + H.data = heap_batches_per_node[p] + (uint64_t)k * (uint64_t)batch; + float_t max_dist = max_dists_per_node[p][batch]; + for(idx_t i = 0; i < H.size; ++i) + { + H.data[i].value = max_dist; + H.data[i].array_idx = MY_SIZE_MAX; + } + + //float_t* point = rcv_work_batches[p] + batch * work_batch_stride + 1; + idx_t visited_nodes = 0; + float_t* point = rcv_work_batches[p] + (uint64_t)batch * (uint64_t)work_batch_stride; + knn_vptree_search(point, local_tree-> __points, local_tree->__nodes, + local_tree->root, &H, local_tree->dims, &visited_nodes); + convert_heap_idx_to_global(ctx, &H); + } + } + } + + /* sendout results */ + + /* + * dummy pointers to clarify counts in this part + * act like an alias for rcv and snd counts + */ + + int* ngbh_to_send = point_to_rcv_count; + int* ngbh_to_recv = point_to_snd_count; + + /* + * counts are inverted since I have to recieve as many batches as points I + * Have originally sended + */ + + elapsed_time = TIME_STOP; + LOG_WRITE("Ngbh search for foreing points", elapsed_time); + + TIME_START; + + MPI_Datatype MPI_my_heap; + MPI_Type_contiguous(k * sizeof(heap_node_t), MPI_BYTE, &MPI_my_heap); + MPI_Barrier(ctx -> mpi_communicator); + MPI_Type_commit(&MPI_my_heap); + + + /* ------------------------------------- + * ALTERNATIVE TO ALL TO ALL FOR BIG MSG + * ------------------------------------- */ + + MPI_Barrier(ctx -> mpi_communicator); + int default_msg_len = MAX_MSG_SIZE / (k * sizeof(heap_node_t)); + + int* already_sent_points = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); + int* already_rcvd_points = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); + + /* allocate a request array to keep track of all requests going out*/ + MPI_Request* req_array; + int req_num = 0; + for(int i = 0; i < ctx -> world_size; ++i) + { + req_num += ngbh_to_send[i] > 0 ? ngbh_to_send[i]/default_msg_len + 1 : 0; + } + + req_array = (MPI_Request*)MY_MALLOC(req_num * sizeof(MPI_Request)); + + + for(int i = 0; i < ctx -> world_size; ++i) + { + already_sent_points[i] = 0; + already_rcvd_points[i] = 0; + } + + int req_idx = 0; + + // find the maximum number of points to send */ + + idx_t max_n_recv = 0; + for(int i = 0; i < ctx -> world_size; ++i) + { + max_n_recv = MAX(max_n_recv, (idx_t)ngbh_to_recv[i]); + } + + MPI_DB_PRINT("Using default message length %lu\n", default_msg_len); -void mpi_all_knn_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtree_t* top_tree, kdtree_t* local_tree, int k) + heap_node_t* __heap_batches_to_rcv = (heap_node_t*)MY_MALLOC((uint64_t)k * (uint64_t)max_n_recv * sizeof(heap_node_t)); + if( __heap_batches_to_rcv == NULL) + { + DB_PRINT("Rank %d failed to allocate rcv_heaps %luB required\n",ctx -> mpi_rank, (uint64_t)k * (uint64_t)max_n_recv* sizeof(heap_node_t)); + exit(1); + } + + /* make a ring */ + + MPI_Barrier(ctx -> mpi_communicator); + for(int i = 1; i < ctx -> world_size; ++i) + { + int rank_to_send = (ctx -> mpi_rank + i) % (ctx -> world_size); + int rank_to_recv = (ctx -> world_size + ctx -> mpi_rank - i) % (ctx -> world_size); + + /* do things */ + + /* send out one batch */ + + #ifdef PRINT_NGBH_EXCHANGE_SCHEME + MPI_DB_PRINT("[--- ROUND %d ----]\n", i); + MPI_Barrier(ctx -> mpi_communicator); + DB_PRINT("[RANK %d] sending to %d tot: %d [%luB]---- recieving from %d %d\n", ctx -> mpi_rank, + rank_to_send, ngbh_to_send[rank_to_send], ngbh_to_send[rank_to_send]*sizeof(heap_node_t), rank_to_recv, ngbh_to_recv[rank_to_recv]); + #endif + if(ngbh_to_send[rank_to_send] > 0) + { + int count_send = 0; + while(already_sent_points[rank_to_send] < ngbh_to_send[rank_to_send]) + { + MPI_Request request; + count_send = MIN((int)default_msg_len, (int)(ngbh_to_send[rank_to_send] - already_sent_points[rank_to_send] )); + + MPI_Isend( heap_batches_per_node[rank_to_send] + k * already_sent_points[rank_to_send], count_send, + MPI_my_heap, rank_to_send, 0, ctx -> mpi_communicator, &request); + + already_sent_points[rank_to_send] += count_send; + req_array[req_idx] = request; + ++req_idx; + } + } + + if( ngbh_to_send[rank_to_send] != already_sent_points[rank_to_send] || + point_to_rcv_count[rank_to_send] != already_sent_points[rank_to_send]) + { + DB_PRINT("ERROR OCCURRED in sending points [rank %d] got %d expected %d\n", + ctx -> mpi_rank, already_rcvd_points[rank_to_send], point_to_rcv_count[rank_to_send]); + } + + MPI_Barrier(ctx -> mpi_communicator); + + if(ngbh_to_recv[rank_to_recv] > 0) + { + flag = 0; + while(already_rcvd_points[rank_to_recv] < ngbh_to_recv[rank_to_recv]) + { + MPI_Probe(rank_to_recv, MPI_ANY_TAG, ctx -> mpi_communicator, &status); + MPI_Request request; + int count_recv; + int source = status.MPI_SOURCE; + MPI_Get_count(&status, MPI_my_heap, &count_recv); + /* recieve each slice */ + + MPI_Recv(__heap_batches_to_rcv + k * already_rcvd_points[rank_to_recv], + count_recv, MPI_my_heap, source, MPI_ANY_TAG, ctx -> mpi_communicator, &status); + + already_rcvd_points[rank_to_recv] += count_recv; + } + } + + if( ngbh_to_recv[rank_to_recv] != already_rcvd_points[rank_to_recv] || + point_to_snd_count[rank_to_recv] != already_rcvd_points[rank_to_recv]) + { + DB_PRINT("ERROR OCCURRED in recieving points [rank %d] got %d expected %d\n", + ctx -> mpi_rank, already_rcvd_points[rank_to_recv], point_to_snd_count[rank_to_recv]); + } + + /* merge lists */ + #pragma omp parallel for schedule(dynamic, 32) + for(int b = 0; b < ngbh_to_recv[rank_to_recv]; ++b) + { + int idx = local_idx_of_the_point[rank_to_recv][b]; + /* retrieve the heap_t */ + heap_t H; + H.count = k; + H.size = k; + H.data = __heap_batches_to_rcv + k*b; + /* insert the points into the heap_t */ + for(int j = 0; j < k; ++j) + { + heap_t tmp_heap; + tmp_heap.size = k; + tmp_heap.count = k; + tmp_heap.data = dp_info[idx].ngbh; + // insert it only of != from max + if(H.data[j].array_idx != MY_SIZE_MAX) max_heap_insert(&(tmp_heap), H.data[j].value, H.data[j].array_idx); + } + } + + + + MPI_Barrier(ctx -> mpi_communicator); + } + + elapsed_time = TIME_STOP; + LOG_WRITE("Merging results", elapsed_time); + + + MPI_Testall(req_idx, req_array, &flag, MPI_STATUSES_IGNORE); + + if(flag == 0) + { + DB_PRINT("ERROR OCCURRED Rank %d has unfinished communications\n", ctx -> mpi_rank); + exit(1); + } + free(req_array); + free(already_sent_points); + free(already_rcvd_points); + + /* -------------------------------------------------------- */ + /* heapsort them */ + + TIME_START; + + #pragma omp parallel for schedule(dynamic, 128) + for(int i = 0; i < ctx -> local_n_points; ++i) + { + heap_t tmp_heap; + tmp_heap.size = k; + tmp_heap.count = k; + tmp_heap.data = dp_info[i].ngbh; + + heap_sort(&(tmp_heap)); + } + + elapsed_time = TIME_STOP; + LOG_WRITE("Sorting negihborhoods", elapsed_time); + + + #if defined(WRITE_NGBH) + MPI_DB_PRINT("Writing ngbh to files\n"); + char ngbh_out[80]; + sprintf(ngbh_out, "./bb/rank_%d.ngbh",ctx -> mpi_rank); + FILE* file = fopen(ngbh_out,"w"); + if(!file) + { + printf("Cannot open file %s\n",ngbh_out); + } + else + { + for(int i = 0; i < ctx -> local_n_points; ++i) + { + fwrite(dp_info[i].ngbh, sizeof(heap_node_t), k, file); + } + fclose(file); + } + #endif + + MPI_Barrier(ctx -> mpi_communicator); + + + #if defined(WRITE_SHUFFLED_DATA) + float_t* data_to_write = (float_t*)MY_MALLOC(ctx -> local_n_points * ctx -> k * sizeof(float_t)); + for(int i = 0; i < ctx -> local_n_points; ++i) + { + idx_t idx = local_tree -> __points[i].array_idx; + memcpy(data_to_write + idx*ctx -> dims, local_tree -> __points[i].data, ctx -> dims*sizeof(float_t)); + } + + big_ordered_buffer_to_file(ctx, data_to_write, sizeof(float_t), ctx -> local_n_points * ctx -> dims, "bb/ordered_data.npy"); + free(data_to_write); + #endif + + for(int i = 0; i < ctx -> world_size; ++i) + { + if(data_to_send_per_proc[i]) free(data_to_send_per_proc[i]); + if(local_idx_of_the_point[i]) free(local_idx_of_the_point[i]); + } + + free(data_to_send_per_proc); + free(local_idx_of_the_point); + free(heap_batches_per_node); + free(max_dists_per_node); + //free(rcv_heap_batches); + free(rcv_work_batches); + free(point_to_rcv_count); + free(point_to_snd_count); + free(point_to_snd_capacity); + + free(rcv_count); + free(snd_count); + free(rcv_displ); + free(snd_displ); + free(__heap_batches_to_rcv); + free(__heap_batches_to_snd); + free(__rcv_points); + free(__snd_points); + free(__max_dist_snd); + free(__max_dist_rcv); + +} + + +void mpi_all_knn_search_kdtree(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtree_t* top_tree, kdtree_t* local_tree, int k) { /* local search */ /* print diagnostics */ @@ -2402,7 +2915,8 @@ void mpi_all_knn_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kd //ctx -> __local_heap_buffers = (heap_node_t*)MY_MALLOC(ctx -> local_n_points * k * sizeof(heap_node)); MPI_Alloc_mem(ctx -> local_n_points * k * sizeof(heap_node_t), MPI_INFO_NULL, &(ctx -> __local_heap_buffers)); - #pragma omp parallel for schedule(dynamic, 32) + float_t avg_visited_nodes = 0; + #pragma omp parallel for schedule(dynamic, 32) reduction(+:avg_visited_nodes) for(int p = 0; p < ctx -> local_n_points; ++p) { idx_t idx = local_tree -> __points[p].array_idx; @@ -2423,7 +2937,10 @@ void mpi_all_knn_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kd dp_info[idx].is_center = 0; dp_info[idx].array_idx = idx + ctx -> idx_start; dp_info[idx].ngbh = tmp_heap.data; + avg_visited_nodes += (float_t)visited_nodes/(float_t)ctx->local_n_points; } + + MPI_PRINT("AVG visited nodes on master %lf\n", avg_visited_nodes); elapsed_time = TIME_STOP; LOG_WRITE("Local neighborhood search", elapsed_time); //printf("rank %d elapsed_time %lf\n", ctx->mpi_rank, elapsed_time); diff --git a/src/tree/tree.h b/src/tree/tree.h index 4fd2fd6..bf4f318 100644 --- a/src/tree/tree.h +++ b/src/tree/tree.h @@ -2,6 +2,7 @@ #include "../common/common.h" #include "heap.h" #include "kdtree.h" +#include "vptree.h" #include #include @@ -110,7 +111,10 @@ size_t partition_data_around_value(point_t *array, size_t vec_len, size_t comp size_t parallel_partition_data_around_value(partition_utils_t* p_putils, float_t* in, float_t* out, size_t vec_len, size_t compare_dim, size_t left, size_t right, float_t pivot_value); -void mpi_all_knn_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtree_t* top_tree, kdtree_t* local_tree, int k); + +void mpi_all_knn_search_kdtree(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtree_t* top_tree, kdtree_t* local_tree, int k); +void mpi_all_knn_search_vptree(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtree_t* top_tree, vptree_t* local_tree, int k); + float_t compute_ID_two_NN_ML(global_context_t* ctx, datapoint_info_t* dp_info, idx_t n, int verbose); void compute_density_kstarnn_rma_v2(global_context_t* ctx, const float_t d, int verbose); void compute_correction(global_context_t* ctx, float_t Z); diff --git a/src/tree/vptree.h b/src/tree/vptree.h new file mode 100644 index 0000000..88a06c7 --- /dev/null +++ b/src/tree/vptree.h @@ -0,0 +1,563 @@ +#pragma once +/* + * vptree.h — Vantage-Point Tree, single-header, drop-in for kdtree.h + * + * Keeps every convention from kdtree.h: + * - same point_t / idx_t / float_t / heap_t types + * - same MY_MALLOC / FREE / ALIGNMENT macros + * - same OpenMP parallel build strategy + * - same data-layout compaction after build + * - same public interface: vptree_initialize / build_tree_vptree / + * knn_vptree_search / vptree_free + * + * WHY VP-TREE IS FASTER THAN KD-TREE AT d>=8 + * ------------------------------------------- + * KD-tree prunes with a hyperrectangle check: + * min_box_dist(q, box) > tau → prune + * In d=10 a hyperrectangle covers ~2^10 orthants, so the test rarely prunes. + * + * VP-tree prunes with a hypersphere annulus check (ALL squared, zero sqrt): + * vp_can_prune_left / vp_can_prune_right (see below) + * This exploits the metric directly and prunes aggressively even at d=10. + * + * PRUNING DERIVATION (sqrt-free, all squared distances) + * ------------------------------------------------------ + * Notation (all values are SQUARED distances): + * dqv = dist²(query, vp) + * mu = median dist² from vp to subtree points (split radius²) + * tau = heap->data[0].value (current k-th worst neighbour dist²) + * + * LEFT child: points p with dist²(p,vp) <= mu + * RIGHT child: points p with dist²(p,vp) > mu + * + * Triangle inequality in Euclidean space (non-squared): + * dist(q,p) >= |dist(q,vp) - dist(p,vp)| + * + * For LEFT child (dist(p,vp) <= sqrt(mu)): + * min dist(q,p) >= sqrt(dqv) - sqrt(mu) + * Prune LEFT if sqrt(dqv) - sqrt(mu) > sqrt(tau) [only when dqv > mu] + * Squaring both sides (both positive by guard): + * let s = dqv + mu - tau + * prune if s > 0 AND s*s > 4*dqv*mu + * + * For RIGHT child (dist(p,vp) > sqrt(mu)): + * min dist(q,p) >= sqrt(mu) - sqrt(dqv) + * Prune RIGHT if sqrt(mu) - sqrt(dqv) > sqrt(tau) [only when mu > dqv] + * Squaring: + * let s = mu + dqv - tau + * prune if mu > dqv AND s > 0 AND s*s > 4*dqv*mu + * + * Result: zero sqrt calls per node, only multiplications and comparisons. + * + * USAGE + * ----- + * #include "heap.h" + * #include "vptree.h" + * + * vptree_t tree; + * vptree_initialize(&tree, data_ptr, n_points, dims); + * build_tree_vptree(&tree); + * + * heap_t H; // your existing heap, initialised to size k + * idx_t vis = 0; + * knn_vptree_search(query_vec, tree.__points, tree.__nodes, + * tree.root, &H, tree.dims, &vis); + * + * vptree_free(&tree); + */ + +#include "heap.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* ── Reuse all macros from kdtree.h if already defined ───────────────────── */ + +#ifndef ALIGNMENT + #define ALIGNMENT 64 +#endif + +#ifndef CHECK_ALLOCATION_NO_CTX + #define CHECK_ALLOCATION_NO_CTX(x) \ + if(!x){printf("[!!!] Failed allocation: %s at line %d \n", __FILE__, __LINE__ ); exit(1);} +#endif + +#ifndef MY_MALLOC + #define MY_MALLOC(n) ({size_t __n = (n + ALIGNMENT - 1) & ~(ALIGNMENT - 1); \ + void* p = aligned_alloc(ALIGNMENT, __n); \ + CHECK_ALLOCATION_NO_CTX(p); memset(p, 0, __n); p; }) +#endif + +#ifndef FREE + #define FREE(x) if(x) free(x); +#endif + +#ifndef MIN + #define MIN(x,y) ((x) < (y) ? (x) : (y)) +#endif +#ifndef MAX + #define MAX(x,y) ((x) > (y) ? (x) : (y)) +#endif + +#ifndef DEFAULT_LEAF_SIZE + #define DEFAULT_LEAF_SIZE 256 +#endif + +/* float_t / idx_t: honour the same USE_FLOAT32 / USE_INT32 guards */ +#ifndef float_t + #ifdef USE_FLOAT32 + #define float_t float + #else + #define float_t double + #endif +#endif + +#ifndef idx_t + #ifdef USE_INT32 + #define idx_t uint32_t + #define MY_SIZE_MAX UINT32_MAX + #else + #define idx_t uint64_t + #define MY_SIZE_MAX UINT64_MAX + #endif +#endif + +/* ── VP-tree build cutoff (fall back to serial below this) ──────────────── */ +#ifndef VP_SERIAL_BUILD_CUTOFF + #define VP_SERIAL_BUILD_CUTOFF 4096 +#endif + +/* ═══════════════════════════════════════════════════════════════════════════ + * Node layout — flat pool, heap-ordered (left=2i+1, right=2i+2) + * + * mu stores the SQUARED median distance from vp to its subtree points. + * Everything is squared throughout — no sqrt anywhere. + * ═══════════════════════════════════════════════════════════════════════════ */ + +typedef struct { + float_t mu; /* split radius²: median dist²(p,vp) in subtree */ + idx_t vp_point_idx; /* index in __points[] of the vantage point */ + uint32_t leaf_point_idx; /* (leaf) start index in __points[] */ + uint32_t leaf_point_count; /* (leaf) number of points in leaf */ + uint8_t is_leaf; +} vpnode_t; + +typedef struct { + float_t* data; /* raw data (same role as kdtree_t.data) */ + uint32_t dims; + idx_t n_points; + idx_t root; /* always 0 */ + struct point_t* __points; /* permuted point array */ + vpnode_t* __nodes; /* flat node pool */ +} vptree_t; + +/* ═══════════════════════════════════════════════════════════════════════════ + * Distance — squared, no sqrt, same semantics as kdtree.h euclidean_distance + * ═══════════════════════════════════════════════════════════════════════════ */ + +static inline float_t vp_dist_sq(const float_t* __restrict__ a, + const float_t* __restrict__ b, + const idx_t dims) +{ + float_t d = 0.0; + /* restrict + fixed small dims → compiler emits AVX/AVX-512 with -O3 -march=native */ + for (idx_t i = 0; i < dims; ++i) { + float_t diff = a[i] - b[i]; + d += diff * diff; + } + return d; +} + +/* ═══════════════════════════════════════════════════════════════════════════ + * Pruning predicates — ZERO sqrt, pure multiply+compare + * All arguments are squared distances. + * Returns 1 if the subtree can be safely skipped. + * ═══════════════════════════════════════════════════════════════════════════ */ + +/* + * Prune LEFT child (points with dist²(p,vp) <= mu). + * Original: sqrt(dqv) - sqrt(mu) > sqrt(tau) [guard: dqv > mu] + * Squared: s = dqv + mu - tau; prune iff s > 0 && s*s > 4*dqv*mu + */ +static inline int vp_can_prune_left(float_t dqv, float_t mu, float_t tau) +{ + if (dqv <= mu) return 0; + float_t s = dqv + mu - tau; + if (s <= 0.0) return 0; + return (s * s) > (4.0 * dqv * mu); +} + +/* + * Prune RIGHT child (points with dist²(p,vp) > mu). + * Original: sqrt(mu) - sqrt(dqv) > sqrt(tau) [guard: mu > dqv] + * Squared: s = mu + dqv - tau; prune iff s > 0 && s*s > 4*dqv*mu + */ +static inline int vp_can_prune_right(float_t dqv, float_t mu, float_t tau) +{ + if (mu <= dqv) return 0; + float_t s = mu + dqv - tau; + if (s <= 0.0) return 0; + return (s * s) > (4.0 * dqv * mu); +} + +/* ═══════════════════════════════════════════════════════════════════════════ + * point_t helpers + * ═══════════════════════════════════════════════════════════════════════════ */ + +static inline void vp_point_swap(point_t *x, point_t *y) +{ + point_t tmp = *x; *x = *y; *y = tmp; +} + +/* ═══════════════════════════════════════════════════════════════════════════ + * QuickSelect — finds k-th smallest in dists[low..high] + * Simultaneously permutes points[] to match. + * ═══════════════════════════════════════════════════════════════════════════ */ + +static float_t vp_nth_dist(point_t* points, float_t* dists, + idx_t low, idx_t high, idx_t k) +{ + while (low < high) { + idx_t pivot_idx = low + (idx_t)rand() % (high - low + 1); + float_t pivot_d = dists[pivot_idx]; + + /* Move pivot to end */ + vp_point_swap(points + pivot_idx, points + high); + float_t tmp = dists[pivot_idx]; dists[pivot_idx] = dists[high]; dists[high] = tmp; + + /* Lomuto partition */ + idx_t store = low; + for (idx_t j = low; j < high; ++j) { + if (dists[j] < pivot_d) { + vp_point_swap(points + store, points + j); + tmp = dists[store]; dists[store] = dists[j]; dists[j] = tmp; + ++store; + } + } + vp_point_swap(points + store, points + high); + tmp = dists[store]; dists[store] = dists[high]; dists[high] = tmp; + + if (store == k) return dists[store]; + else if (store < k) low = store + 1; + else high = store - 1; + } + return dists[low]; +} + +/* ═══════════════════════════════════════════════════════════════════════════ + * Vantage point selection — random + * + * Variance-maximising selection (32x32 distance probes per node) was the + * original approach but costs ~1B extra distance computations at build time + * for 52M points, causing 82s build vs 2s for KD-tree. Random selection is + * equally correct (VP-tree correctness never depends on VP quality) and + * builds in ~2s. The marginal pruning improvement from better VP selection + * does not justify the build cost at this scale. + * ═══════════════════════════════════════════════════════════════════════════ */ + +static inline idx_t vp_select_vantage_point(point_t* points, + idx_t start, idx_t end) +{ + (void)points; + idx_t n = end - start + 1; + if (n <= 2) return start; + return start + (idx_t)rand() % n; +} + +/* ═══════════════════════════════════════════════════════════════════════════ + * BUILD — serial recursive + * ═══════════════════════════════════════════════════════════════════════════ */ + +static idx_t vp_recursive_build(point_t* points, vpnode_t* nodes, + float_t* dist_scratch, + idx_t node_idx, + idx_t start, idx_t end, idx_t dims); + +static idx_t vp_recursive_build(point_t* points, vpnode_t* nodes, + float_t* dist_scratch, + idx_t node_idx, + idx_t start, idx_t end, idx_t dims) +{ + vpnode_t* node = nodes + node_idx; + idx_t n = end - start + 1; + + /* ── Leaf ─────────────────────────────────────────────────────────── */ + if (n <= DEFAULT_LEAF_SIZE) { + node->is_leaf = 1; + node->leaf_point_idx = (uint32_t)start; + node->leaf_point_count = (uint32_t)n; + node->vp_point_idx = start; + node->mu = 0.0; + return node_idx; + } + + /* ── Pick VP, place at front ──────────────────────────────────────── */ + idx_t vp_local = vp_select_vantage_point(points, start, end); + vp_point_swap(points + start, points + vp_local); + node->vp_point_idx = start; + + const float_t* vp_data = points[start].data; + idx_t inner_start = start + 1; + idx_t inner_n = end - inner_start + 1; + + /* ── Compute squared distances VP → all other points ─────────────── */ + float_t* d = dist_scratch + inner_start; + for (idx_t i = 0; i < inner_n; ++i) + d[i] = vp_dist_sq(vp_data, points[inner_start + i].data, dims); + + /* ── QuickSelect: partition around median ─────────────────────────── */ + idx_t median_rank = inner_n / 2; + float_t mu = vp_nth_dist(points + inner_start, d, + 0, inner_n - 1, median_rank); + node->mu = mu; /* squared distance */ + + /* + * After QuickSelect: + * [inner_start .. inner_start+median_rank] dist² <= mu → LEFT subtree + * [inner_start+median_rank+1 .. end] dist² > mu → RIGHT subtree + */ + idx_t left_end = start + median_rank; + idx_t right_start = left_end + 1; + + if (left_end >= inner_start) + vp_recursive_build(points, nodes, dist_scratch, + node_idx * 2 + 1, inner_start, left_end, dims); + + if (right_start <= end) + vp_recursive_build(points, nodes, dist_scratch, + node_idx * 2 + 2, right_start, end, dims); + + return node_idx; +} + +/* ═══════════════════════════════════════════════════════════════════════════ + * BUILD — parallel (OMP tasks, mirrors kdtree.h parallel_recursive_make_tree) + * ═══════════════════════════════════════════════════════════════════════════ */ + +static idx_t vp_parallel_build(point_t* points, vpnode_t* nodes, + float_t* dist_scratch, + idx_t node_idx, + idx_t start, idx_t end, idx_t dims); + +static idx_t vp_parallel_build(point_t* points, vpnode_t* nodes, + float_t* dist_scratch, + idx_t node_idx, + idx_t start, idx_t end, idx_t dims) +{ + if ((end - start) < VP_SERIAL_BUILD_CUTOFF) + return vp_recursive_build(points, nodes, dist_scratch, + node_idx, start, end, dims); + + vpnode_t* node = nodes + node_idx; + + idx_t vp_local = vp_select_vantage_point(points, start, end); + vp_point_swap(points + start, points + vp_local); + node->vp_point_idx = start; + + const float_t* vp_data = points[start].data; + idx_t inner_start = start + 1; + idx_t inner_n = end - inner_start + 1; + + float_t* d = dist_scratch + inner_start; + #pragma omp simd + for (idx_t i = 0; i < inner_n; ++i) + d[i] = vp_dist_sq(vp_data, points[inner_start + i].data, dims); + + idx_t median_rank = inner_n / 2; + float_t mu = vp_nth_dist(points + inner_start, d, + 0, inner_n - 1, median_rank); + node->mu = mu; + + idx_t left_end = start + median_rank; + idx_t right_start = left_end + 1; + idx_t lch = node_idx * 2 + 1; + idx_t rch = node_idx * 2 + 2; + + #pragma omp task shared(points, nodes, dist_scratch) + { + if (left_end >= inner_start) + vp_parallel_build(points, nodes, dist_scratch, + lch, inner_start, left_end, dims); + } + + #pragma omp task shared(points, nodes, dist_scratch) + { + if (right_start <= end) + vp_parallel_build(points, nodes, dist_scratch, + rch, right_start, end, dims); + } + + #pragma omp taskwait + + return node_idx; +} + +/* ═══════════════════════════════════════════════════════════════════════════ + * SEARCH — exact KNN query + * + * Drop-in for knn_sub_tree_search() in kdtree.h. + * Same signature, same heap semantics, outputs squared distances. + * + * Visit order: near shell first → tau tightens before evaluating far shell. + * dqv <= mu (inside ball) → LEFT is near, visit LEFT first + * dqv > mu (outside ball) → RIGHT is near, visit RIGHT first + * + * After near-side visit, re-read tau before deciding on far side. + * ═══════════════════════════════════════════════════════════════════════════ */ + +static void knn_vptree_search(const float_t* query, + const point_t* points, + const vpnode_t* nodes, + idx_t node_idx, + heap_t* H, + const idx_t dims, + idx_t* vis_nodes) +{ + const vpnode_t* node = nodes + node_idx; + (*vis_nodes)++; + + /* ── Leaf: brute-force, two-pass (compute then insert) ───────────── */ + if (node->is_leaf) { + idx_t base = node->leaf_point_idx; + idx_t count = node->leaf_point_count; + const float_t* dp = points[base].data; + + float_t dists[DEFAULT_LEAF_SIZE]; + #pragma GCC unroll 4 + for (idx_t i = 0; i < count; ++i) + dists[i] = vp_dist_sq(query, dp + i * dims, dims); + + for (idx_t i = 0; i < count; ++i) + max_heap_insert(H, dists[i], points[base + i].array_idx); + + return; + } + + /* ── Internal node ───────────────────────────────────────────────── */ + float_t dqv = vp_dist_sq(query, points[node->vp_point_idx].data, dims); + max_heap_insert(H, dqv, points[node->vp_point_idx].array_idx); + + float_t tau = H->data[0].value; + int heap_not_full = (H->count < H->size); + float_t mu = node->mu; + idx_t lch = node_idx * 2 + 1; + idx_t rch = node_idx * 2 + 2; + + if (dqv <= mu) { + /* Inside ball — LEFT is near side */ + if (heap_not_full || !vp_can_prune_left(dqv, mu, tau)) + knn_vptree_search(query, points, nodes, lch, H, dims, vis_nodes); + + tau = H->data[0].value; /* re-read: may have tightened */ + heap_not_full = (H->count < H->size); + + if (heap_not_full || !vp_can_prune_right(dqv, mu, tau)) + knn_vptree_search(query, points, nodes, rch, H, dims, vis_nodes); + } else { + /* Outside ball — RIGHT is near side */ + if (heap_not_full || !vp_can_prune_right(dqv, mu, tau)) + knn_vptree_search(query, points, nodes, rch, H, dims, vis_nodes); + + tau = H->data[0].value; + heap_not_full = (H->count < H->size); + + if (heap_not_full || !vp_can_prune_left(dqv, mu, tau)) + knn_vptree_search(query, points, nodes, lch, H, dims, vis_nodes); + } +} + +/* ═══════════════════════════════════════════════════════════════════════════ + * PUBLIC API (mirrors kdtree.h) + * ═══════════════════════════════════════════════════════════════════════════ */ + +static void vptree_initialize(vptree_t* tree, float_t* data, + size_t n_points, unsigned int dims) +{ + tree->dims = dims; + tree->data = data; + tree->n_points = n_points; + tree->root = 0; + + tree->__points = (point_t*)MY_MALLOC(n_points * sizeof(point_t)); + /* 4*n_points: safe upper bound for heap-ordered binary tree node pool */ + tree->__nodes = (vpnode_t*)MY_MALLOC(4 * n_points * sizeof(vpnode_t)); + + for (idx_t i = 0; i < n_points; ++i) { + tree->__points[i].array_idx = i; + tree->__points[i].data = data + i * dims; + } +} + +static void build_tree_vptree(vptree_t* tree) +{ + float_t* dist_scratch = (float_t*)MY_MALLOC(tree->n_points * sizeof(float_t)); + + static bool seeded = false; + if (!seeded) { srand(42); seeded = true; } + + #pragma omp parallel + { + #pragma omp single + { + tree->root = vp_parallel_build(tree->__points, tree->__nodes, + dist_scratch, + 0, 0, tree->n_points - 1, + tree->dims); + } + } + + FREE(dist_scratch); + + /* Compact data: permuted __points → contiguous block for cache-friendly + * leaf scans. Mirrors the same step in kdtree.h build_tree_kdtree(). */ + float_t* tmp_data = (float_t*)MY_MALLOC(tree->n_points * tree->dims * sizeof(float_t)); + + #pragma omp parallel for schedule(static) + for (idx_t i = 0; i < tree->n_points; ++i) { + float_t* src = tree->__points[i].data; + memcpy(tmp_data + tree->dims * i, src, tree->dims * sizeof(float_t)); + tree->__points[i].data = tmp_data + tree->dims * i; + } + + free(tree->data); + tree->data = tmp_data; +} + +static void vptree_free(vptree_t* tree) +{ + FREE(tree->__nodes); + FREE(tree->__points); + FREE(tree->data); +} + +/* ── Debug print, first 3 levels (mirrors kdtree_print) ─────────────────── */ + +static void vp_recursive_print(const vpnode_t* nodes, idx_t node_idx, + idx_t dims, idx_t level) +{ + (void)dims; + if (level >= 3) return; + const vpnode_t* node = nodes + node_idx; + printf("--- VP LEVEL %lu ---\n", (unsigned long)level); + if (node->is_leaf) { + printf(" LEAF start=%u count=%u\n", + node->leaf_point_idx, node->leaf_point_count); + } else { + printf(" INTERNAL vp_idx=%lu mu²=%.6f\n", + (unsigned long)node->vp_point_idx, node->mu); + vp_recursive_print(nodes, node_idx * 2 + 1, dims, level + 1); + vp_recursive_print(nodes, node_idx * 2 + 2, dims, level + 1); + } +} + +static void vptree_print(const vptree_t* tree) +{ + vp_recursive_print(tree->__nodes, tree->root, tree->dims, 0); +} -- GitLab