From c629603de31f82139fa17ff2a77abe76c92169af Mon Sep 17 00:00:00 2001
From: Francesco Tomba <francy273998@gmail.com>
Date: Mon, 16 Mar 2026 09:40:58 +0100
Subject: [PATCH 1/2] solved output data problems

---
 Makefile            |    6 +-
 run_leo             |    4 +-
 src/adp/adp.c       |   50 +-
 src/common/common.c |   17 +-
 src/common/common.h |   46 +-
 src/main/main.c     |   43 +-
 src/tree/heap.c     |  217 ------
 src/tree/heap.h     |  176 +++--
 src/tree/kdtree.h   | 1278 ++++++++++++++++++++++++++++++++++
 src/tree/kdtreeV2.c |  378 ----------
 src/tree/kdtreeV2.h |   65 --
 src/tree/tree.c     | 1603 ++++++++++++++++++++++++++++++++++---------
 src/tree/tree.h     |   30 +-
 13 files changed, 2792 insertions(+), 1121 deletions(-)
 delete mode 100644 src/tree/heap.c
 create mode 100644 src/tree/kdtree.h
 delete mode 100644 src/tree/kdtreeV2.c
 delete mode 100644 src/tree/kdtreeV2.h

diff --git a/Makefile b/Makefile
index ceb6a13..d9f820e 100644
--- a/Makefile
+++ b/Makefile
@@ -1,15 +1,15 @@
 CC=mpicc
 #CC=mpiicx
-CFLAGS=-O3 -march=native -flto -funroll-loops -fopenmp
+CFLAGS=-O3 -march=native -funroll-loops -fopenmp
 #CFLAGS=-O3 -fopenmp 
 LDFLAGS=-lm 
 
 all: main
 
-obj=src/main/main.c src/tree/tree.c src/common/common.c src/tree/kdtreeV2.c src/tree/heap.c src/adp/adp.c
+obj=src/main/main.c src/tree/tree.c src/common/common.c src/adp/adp.c
 
 main: ${obj} 
-	${CC} ${CFLAGS} ${LDFLAGS} ${obj} -o $@
+	${CC} ${CFLAGS} ${obj} -o $@ ${LDFLAGS}
 
 clean:
 	rm main
diff --git a/run_leo b/run_leo
index 44e834a..497cdb7 100644
--- a/run_leo
+++ b/run_leo
@@ -6,7 +6,7 @@
 #SBATCH --time=04:00:00
 #SBATCH --job-name=dadp_test
 #SBATCH --partition=dcgp_usr_prod 
-#SBATCH --account=EUHPC_D18_045
+#SBATCH --account=EUHPC_D31_027
 #SBATCH --output=out_leo
 #SBATCH --error=err_leo
 #SBATCH --mem=480G
@@ -37,7 +37,7 @@ OUT_DATA=/leonardo_scratch/large/userexternal/ftomba00/data
 IN_DATA=/leonardo_work/EUHPC_D18_045
 
 #10^6 points 
-time mpirun -n ${SLURM_NTASKS} --map-by ppr:1:socket:PE=${SLURM_CPUS_PER_TASK}  ./main -t f32 -i ${IN_DATA}/norm_data/std_LR_091_0001 -d 5 -a ${OUT_ASSIGNMENT} -o ${OUT_DATA}
+time mpirun -n ${SLURM_NTASKS} --map-by ppr:1:socket:PE=${SLURM_CPUS_PER_TASK}  ./main -t f32 -k 100 -i ${IN_DATA}/norm_data/std_LR_091_0001 -d 5 -a ${OUT_ASSIGNMENT} -o ${OUT_DATA}
 #time mpirun -n ${SLURM_NTASKS} --map-by core  ./main -t f32 -i ${IN_DATA}/norm_data/std_LR_091_0001 -d 5 -a ${OUT_ASSIGNMENT} -o ${OUT_DATA}
 
 #34 * 10^6 points
diff --git a/src/adp/adp.c b/src/adp/adp.c
index f28cbde..240153a 100644
--- a/src/adp/adp.c
+++ b/src/adp/adp.c
@@ -92,10 +92,10 @@ float_t get_j_ksel_dist(global_context_t* ctx, idx_t j, idx_t ksel, MPI_Win expo
     }
     else
     {
-        heap_node el;
+        heap_node_t el;
         idx_t pos  = j - ctx -> rank_idx_start[owner];
         MPI_Request request;
-        int err = MPI_Rget(&el, sizeof(heap_node), MPI_BYTE, owner, (MPI_Aint)((pos * k + ksel) * sizeof(heap_node)), sizeof(heap_node), MPI_BYTE, exposed_ngbh, &request);
+        int err = MPI_Rget(&el, sizeof(heap_node_t), MPI_BYTE, owner, (MPI_Aint)((pos * k + ksel) * sizeof(heap_node_t)), sizeof(heap_node_t), MPI_BYTE, exposed_ngbh, &request);
         MPI_Wait(&request,MPI_STATUS_IGNORE);
         return el.value;
     }                 
@@ -113,10 +113,10 @@ idx_t get_j_ksel_idx(global_context_t* ctx, idx_t j, idx_t ksel, MPI_Win exposed
     }
     else
     {
-        heap_node el;
+        heap_node_t el;
         idx_t pos  = j - ctx -> rank_idx_start[owner];
         MPI_Request request;
-        int err = MPI_Rget(&el, sizeof(heap_node), MPI_BYTE, owner, (MPI_Aint)((pos * k + ksel) * sizeof(heap_node)), sizeof(heap_node), MPI_BYTE, exposed_ngbh, &request);
+        int err = MPI_Rget(&el, sizeof(heap_node_t), MPI_BYTE, owner, (MPI_Aint)((pos * k + ksel) * sizeof(heap_node_t)), sizeof(heap_node_t), MPI_BYTE, exposed_ngbh, &request);
         MPI_Wait(&request,MPI_STATUS_IGNORE);
         return el.array_idx;
     }                 
@@ -176,7 +176,7 @@ void compute_density_kstarnn_rma_v2(global_context_t* ctx, const float_t d, int
 
     MPI_Win exposed_ngbh;
     MPI_Win_create( ctx -> __local_heap_buffers, 
-                    ctx -> local_n_points * ctx -> k * sizeof(heap_node), 
+                    ctx -> local_n_points * ctx -> k * sizeof(heap_node_t), 
                     1, MPI_INFO_NULL, 
                     ctx -> mpi_communicator, 
                     &exposed_ngbh);
@@ -186,7 +186,7 @@ void compute_density_kstarnn_rma_v2(global_context_t* ctx, const float_t d, int
     int i_have_finished = 0;
     int all_have_finished = 0;
     int finished_points = 0;
-    heap_node* scratch_heap_nodes = (heap_node*)MY_MALLOC(ctx -> local_n_points * sizeof(heap_node));  
+    heap_node_t* scratch_heap_node_ts = (heap_node_t*)MY_MALLOC(ctx -> local_n_points * sizeof(heap_node_t));  
 
         for(idx_t j = 4; j < kMAX - 1; ++j)
         {
@@ -217,17 +217,17 @@ void compute_density_kstarnn_rma_v2(global_context_t* ctx, const float_t d, int
 
                     if(owner == ctx -> mpi_rank)
                     {
-                        scratch_heap_nodes[i] = ctx -> local_datapoints[pos].ngbh[ksel];
+                        scratch_heap_node_ts[i] = ctx -> local_datapoints[pos].ngbh[ksel];
                     }
                     else
                     {
                         // if(pos > ctx -> rank_n_points[owner]) printf("NOPE HERE from rank %d %lu %d\n", ctx -> mpi_rank, pos, ctx -> rank_n_points[owner]);
-                        MPI_Get(scratch_heap_nodes + i, 
-                                sizeof(heap_node), 
+                        MPI_Get(scratch_heap_node_ts + i, 
+                                sizeof(heap_node_t), 
                                 MPI_BYTE, 
                                 owner, 
-                                (MPI_Aint)((pos * (idx_t)(ctx -> k) + ksel) * sizeof(heap_node)), 
-                                sizeof(heap_node), 
+                                (MPI_Aint)((pos * (idx_t)(ctx -> k) + ksel) * sizeof(heap_node_t)), 
+                                sizeof(heap_node_t), 
                                 MPI_BYTE, 
                                 exposed_ngbh);
                     }
@@ -246,7 +246,7 @@ void compute_density_kstarnn_rma_v2(global_context_t* ctx, const float_t d, int
                 {
                     float_t vvi, vvj, vp, dL;
                     vvi = local_datapoints[i].ngbh[ksel].value;
-                    vvj = scratch_heap_nodes[i].value;
+                    vvj = scratch_heap_node_ts[i].value;
                     vp = (vvi + vvj)*(vvi + vvj);
                     dL = -2.0 * ksel * log(4.*vvi*vvj/vp);
 
@@ -293,7 +293,7 @@ void compute_density_kstarnn_rma_v2(global_context_t* ctx, const float_t d, int
     MPI_Win_fence(0, exposed_ngbh);
     MPI_Win_free(&exposed_ngbh);
 
-    free(scratch_heap_nodes);
+    free(scratch_heap_node_ts);
 
     #if defined(WRITE_DENSITY)
         /* densities */
@@ -301,24 +301,28 @@ void compute_density_kstarnn_rma_v2(global_context_t* ctx, const float_t d, int
         float_t* gs = (float_t*)MY_MALLOC(ctx -> local_n_points * sizeof(float_t));
         idx_t* ks = (idx_t*)MY_MALLOC(ctx -> local_n_points * sizeof(idx_t));
 
-        for(int i = 0; i < ctx -> local_n_points; ++i) den[i] = ctx -> local_datapoints[i].log_rho;
-        for(int i = 0; i < ctx -> local_n_points; ++i) ks[i]  = ctx -> local_datapoints[i].kstar;
-        for(int i = 0; i < ctx -> local_n_points; ++i) gs[i]  = ctx -> local_datapoints[i].g;
+        for(int i = 0; i < ctx -> local_n_points; ++i) 
+        {
+            den[i] = ctx -> local_datapoints[i].log_rho;
+            ks[i]  = ctx -> local_datapoints[i].kstar;
+            gs[i]  = ctx -> local_datapoints[i].g;
+        }
 
         ordered_buffer_to_file(ctx, den, sizeof(float_t), ctx -> local_n_points, "bb/ordered_density.npy");
         ordered_buffer_to_file(ctx, ks, sizeof(idx_t), ctx -> local_n_points, "bb/ks.npy");
         ordered_buffer_to_file(ctx, gs, sizeof(float_t), ctx -> local_n_points, "bb/g.npy");
 
-        ordered_data_to_file(ctx, "bb/ordered_data.npy");
+        free(gs);
         free(den);
         free(ks);
+
     #endif
     return;
 
 
 }
 
-float_t get_j_ksel_dist_v2(global_context_t* ctx, idx_t i, idx_t j, idx_t ksel, int* flags, heap_node* tmp_heap_nodes, MPI_Win* exposed_ngbh)
+float_t get_j_ksel_dist_v2(global_context_t* ctx, idx_t i, idx_t j, idx_t ksel, int* flags, heap_node_t* tmp_heap_node_ts, MPI_Win* exposed_ngbh)
 {
     if(flags[i])
     {
@@ -335,14 +339,14 @@ float_t get_j_ksel_dist_v2(global_context_t* ctx, idx_t i, idx_t j, idx_t ksel,
             //RMA
             flags[i] = 0;
             idx_t pos  = j - ctx -> rank_idx_start[owner];
-            MPI_Get(tmp_heap_nodes + i, sizeof(heap_node), MPI_BYTE, owner, (MPI_Aint)((pos * k + ksel) * sizeof(heap_node)), sizeof(heap_node), MPI_BYTE, *exposed_ngbh);
+            MPI_Get(tmp_heap_node_ts + i, sizeof(heap_node_t), MPI_BYTE, owner, (MPI_Aint)((pos * k + ksel) * sizeof(heap_node_t)), sizeof(heap_node_t), MPI_BYTE, *exposed_ngbh);
             return 0;
         }                 
     }
     else
     {
         flags[i] = 1;
-        return tmp_heap_nodes[i].value;
+        return tmp_heap_node_ts[i].value;
     }
 }
 
@@ -745,7 +749,7 @@ clusters_t Heuristic1(global_context_t *ctx)
      *
      * optimized v2 use a queue of center removal and then exchange them
 	 **/
-	heap_node* to_remove_mask = (heap_node*)MY_MALLOC(n*sizeof(heap_node));
+	heap_node_t* to_remove_mask = (heap_node_t*)MY_MALLOC(n*sizeof(heap_node_t));
 
     for(idx_t p = 0; p < n; ++p) 
     {
@@ -1401,7 +1405,7 @@ void Heuristic2(global_context_t* ctx, clusters_t* cluster)
 
     MPI_Win dp_info_win, ngbh_win;
     MPI_Win_create(ctx -> local_datapoints, ctx -> local_n_points * sizeof(datapoint_info_t), 1, MPI_INFO_NULL, ctx -> mpi_communicator, &dp_info_win);
-    MPI_Win_create(ctx -> __local_heap_buffers, ctx -> local_n_points * ctx -> k * sizeof(heap_node), 1, MPI_INFO_NULL, ctx -> mpi_communicator, &ngbh_win);
+    MPI_Win_create(ctx -> __local_heap_buffers, ctx -> local_n_points * ctx -> k * sizeof(heap_node_t), 1, MPI_INFO_NULL, ctx -> mpi_communicator, &ngbh_win);
 
     MPI_Win_fence(0, dp_info_win);
     MPI_Win_fence(0, ngbh_win);
@@ -2478,7 +2482,7 @@ void Heuristic3(global_context_t* ctx, clusters_t* cluster, float_t Z, int halo)
         for(int i = 0; i < ctx -> local_n_points; ++i) cl[i] = ctx -> local_datapoints[i].cluster_idx;
 
         ordered_buffer_to_file(ctx, cl, sizeof(int), ctx -> local_n_points, "bb/final_assignment.npy");
-        ordered_data_to_file(ctx, "bb/ordered_data.npy");
+        //ordered_data_to_file(ctx, "bb/ordered_data.npy");
 
         free(cl);
         
diff --git a/src/common/common.c b/src/common/common.c
index 3ee1a5d..3a5d155 100644
--- a/src/common/common.c
+++ b/src/common/common.c
@@ -1,5 +1,6 @@
 #include "common.h"
 #include "mpi.h"
+#include <stdlib.h>
 #include <time.h>
 
 #define ARRAY_INCREMENT 100
@@ -13,8 +14,9 @@ void get_context(global_context_t* ctx)
 	MPI_Get_processor_name(ctx -> processor_mame, &(ctx -> __processor_name_len));
 	MPI_Comm_rank(ctx -> mpi_communicator, &(ctx -> mpi_rank));
 	ctx -> local_data = NULL;
-	ctx -> lb_box 	  = NULL;
-	ctx -> ub_box 	  = NULL;
+	ctx -> lb_box     = NULL;
+	ctx -> ub_box     = NULL;
+    ctx -> og_idxs    = NULL;
     ctx -> rank_n_points  = (idx_t*)malloc(ctx -> world_size * sizeof(idx_t));
     ctx -> rank_idx_start = (idx_t*)malloc(ctx -> world_size * sizeof(idx_t));
     ctx -> local_datapoints = NULL;
@@ -160,10 +162,10 @@ void print_error_code(int err)
 void free_context(global_context_t* ctx)
 {
 
-    FREE_NOT_NULL(ctx -> local_data);
+    // FREE_NOT_NULL(ctx -> local_data);
     FREE_NOT_NULL(ctx -> ub_box);
     FREE_NOT_NULL(ctx -> lb_box);
-    //FREE_NOT_NULL(ctx -> __local_heap_buffers);
+    FREE_NOT_NULL(ctx -> og_idxs);
     if(ctx -> __local_heap_buffers) MPI_Free_mem(ctx -> __local_heap_buffers);
 
 
@@ -179,10 +181,10 @@ void free_context(global_context_t* ctx)
 
 void free_pointset(pointset_t* ps)
 {
-	if(ps -> data) 
+	if(ps -> datapoints) 
 	{
-		free(ps -> data);
-		ps -> data = NULL;
+		free(ps -> datapoints);
+		ps -> datapoints = NULL;
 	}
 
 	if(ps -> ub_box)
@@ -210,6 +212,7 @@ void mpi_printf(global_context_t* ctx, const char *fmt, ...)
 		//        myflush(stdout);
 		va_end(l);
 	}
+    fflush(stdout);
 }
 
 void generate_random_matrix(
diff --git a/src/common/common.h b/src/common/common.h
index dec2d61..ad6cd47 100644
--- a/src/common/common.h
+++ b/src/common/common.h
@@ -10,21 +10,25 @@
 #define DEFAULT_MSG_LEN 10000000
 //#include <stdarg.h>
 
-#define PARALLEL_FIX_BORDERS
-// #define WRITE_NGBH
+// #define PARALLEL_FIX_BORDERS
+#define WRITE_SHUFFLED_DATA
+#define WRITE_NGBH
 // #define WRITE_TOP_NODES
-// #define WRITE_DENSITY
+#define WRITE_DENSITY
 // #define WRITE_CLUSTER_ASSIGN_H1
 // #define WRITE_BORDERS
 // #define WRITE_CENTERS_PRE_MERGING
 // #define WRITE_MERGES_INFO
 // #define WRITE_MERGING_TABLE
-// #define WRITE_FINAL_ASSIGNMENT
+#define WRITE_FINAL_ASSIGNMENT
 
 // #define PRINT_NGBH_EXCHANGE_SCHEME
 // #define PRINT_H2_COMM_SCHEME
 // #define PRINT_H1_CLUSTER_ASSIGN_COMPLETION
 // #define PRINT_ORDERED_BUFFER
+// #define PRINT_BALANCE_FACTOR
+// #define CHECK_CORRECT_EXCHANGE
+// #define DUMP_NGBH_KNN
 
 #define DEFAULT_STR_LEN 200
 
@@ -50,14 +54,17 @@
 #define MY_TRUE  1
 #define MY_FALSE 0
 
-#define HERE printf("%d in file %s reached line %d\n", ctx -> mpi_rank, __FILE__, __LINE__); MPI_Barrier(ctx -> mpi_communicator);
+#define HERE printf("%d in file %s reached line %d\n", ctx -> mpi_rank, __FILE__, __LINE__); fflush(stdout); MPI_Barrier(ctx -> mpi_communicator);
 
 #define CHECK_ALLOCATION(x) if(!x){printf("[!!!] %d rank encountered failed allocation: %s at line %s \n", ctx -> mpi_rank, __FILE__, __LINE__ ); exit(1);};
 
+
 #define CHECK_ALLOCATION_NO_CTX(x) if(!x){printf("[!!!] Failed allocation: %s at line %d \n", __FILE__, __LINE__ ); exit(1);}
+#ifndef MY_MALLOC
 #define MY_MALLOC(n) ({void* p = calloc(n,1); CHECK_ALLOCATION_NO_CTX(p); p; })
+#endif
 
-#define DB_PRINT(...) printf(__VA_ARGS__)
+#define DB_PRINT(...) printf(__VA_ARGS__); fflush(stdout)
 #ifdef NDEBUG
 	#undef DB_PRINT(...)
 	#define DB_PRINT(...)
@@ -93,11 +100,14 @@
             MPI_Reduce(&time, &avg, 1, MPI_DOUBLE, MPI_SUM, 0, ctx -> mpi_communicator); \
             MPI_Reduce(&time, &min, 1, MPI_DOUBLE, MPI_MIN, 0, ctx -> mpi_communicator); \
             MPI_Reduce(&time, &max, 1, MPI_DOUBLE, MPI_MAX, 0, ctx -> mpi_communicator); \
+            MPI_Barrier(ctx->mpi_communicator); \
             MPI_DB_PRINT("%50.50s -> [avg: %.2lfs, min: %.2lfs, max: %.2lfs]\n", sec_name, avg/((double)ctx -> world_size), min, max); \
+            fflush(stdout); \
         } \
         else \
         { \
             MPI_DB_PRINT("%s\n", sec_name);\
+            fflush(stdout); \
         }\
     }
     
@@ -128,11 +138,20 @@
     #define LOG_END
 #endif
 
+#ifndef POINT
+    #define POINT
+    typedef struct point_t
+    {
+        idx_t    array_idx;
+        float_t* data;
+    } point_t;
+#endif
+
 typedef struct datapoint_info_t {
     /*
      * datapoint object 
      */
-    heap_node* ngbh;        //heap object stores nearest neighbors of each point
+    heap_node_t* ngbh;      //heap object stores nearest neighbors of each point
     int is_center;          //flag signaling if a point is a cluster center
     int cluster_idx;        //cluster index
     idx_t array_idx;        //global index of the point in the dataset
@@ -154,8 +173,8 @@ typedef struct global_context_t
     int __processor_name_len;                       //processor name len
     int world_size;  
     int mpi_rank;                                   //rank of the processor
-    uint32_t dims;                                  //number of dimensions of the dataset
     idx_t k;                                        //number of neighbors
+    uint32_t dims;                                  //number of dimensions of the dataset
     float_t z;                                      //z parameter
 	float_t* local_data;                            //pointer to the dataset stored into the node
 	float_t* lb_box;                                //bounding box of the dataset
@@ -164,12 +183,13 @@ typedef struct global_context_t
     size_t idx_start;                               //starting index of the points on the processor
     size_t local_n_points;                          //number of points stored in the current processor
     datapoint_info_t*  local_datapoints;            //pointer to the datapoint properties
-    idx_t* rank_idx_start;                            //starting index of datapoints in each processor
-    idx_t* rank_n_points;                             //processor name
-    heap_node* __local_heap_buffers;                //buffer that stores nearest neighbors
+    idx_t* rank_idx_start;                          //starting index of datapoints in each processor
+    idx_t* rank_n_points;                           //processor name
+    idx_t* og_idxs;                                 //original indexes
+    heap_node_t* __local_heap_buffers;              //buffer that stores nearest neighbors
 	MPI_Comm mpi_communicator;                      //mpi communicator
-    char input_data_file[DEFAULT_STR_LEN];
     int input_data_in_float32;
+    char input_data_file[DEFAULT_STR_LEN];
     char output_assignment_file[DEFAULT_STR_LEN];
     char output_data_file[DEFAULT_STR_LEN];
 } global_context_t;
@@ -184,7 +204,7 @@ typedef struct pointset_t
 	size_t n_points;
 	size_t __capacity;
 	uint32_t dims;
-	float_t* data;
+	point_t* datapoints;
 	float_t* lb_box;
 	float_t* ub_box;
 } pointset_t;
diff --git a/src/main/main.c b/src/main/main.c
index 2bc6aa2..c82e660 100644
--- a/src/main/main.c
+++ b/src/main/main.c
@@ -4,6 +4,7 @@
 #include "../common/common.h"
 #include "../tree/tree.h"
 #include "../adp/adp.h"
+#include <string.h>
 #include <unistd.h>
 #include <getopt.h>
 
@@ -251,7 +252,7 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx)
     TIME_DEF
     double elapsed_time;
 
-    int halo = MY_FALSE;
+    int halo = MY_TRUE;
     float_t tol = 0.002;
 
     if(I_AM_MASTER && ctx -> world_size <= 6)
@@ -345,14 +346,6 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx)
 
     uint64_t *global_bin_counts_int = (uint64_t *)MY_MALLOC(k_global * sizeof(uint64_t));
 
-    pointset_t original_ps;
-    original_ps.data = ctx->local_data;
-    original_ps.dims = ctx->dims;
-    original_ps.n_points = ctx->local_n_points;
-    original_ps.lb_box = (float_t*)MY_MALLOC(ctx -> dims * sizeof(float_t));
-    original_ps.ub_box = (float_t*)MY_MALLOC(ctx -> dims * sizeof(float_t));
-
-
     top_kdtree_t tree;
     TIME_START;
     top_tree_init(ctx, &tree);
@@ -360,14 +353,14 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx)
     LOG_WRITE("Initializing global kdtree", elapsed_time);
 
     TIME_START;
-    build_top_kdtree(ctx, &original_ps, &tree, k_global, tol);
+    parallel_build_top_kdtree(ctx, &tree, 512);
     exchange_points(ctx, &tree);
     elapsed_time = TIME_STOP;
     LOG_WRITE("Top kdtree build and domain decomposition", elapsed_time);
 
     TIME_START;
-    kdtree_v2 local_tree;
-    kdtree_v2_init( &local_tree, ctx -> local_data, ctx -> local_n_points, (unsigned int)ctx -> dims);
+    kdtree_t local_tree;
+    kdtree_initialize( &local_tree, ctx -> local_data, ctx -> local_n_points, (unsigned int)ctx -> dims);
 
     datapoint_info_t* dp_info = (datapoint_info_t*)MY_MALLOC(ctx -> local_n_points * sizeof(datapoint_info_t));            
     for(uint64_t i = 0; i < ctx -> local_n_points; ++i)
@@ -385,7 +378,10 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx)
     }
     ctx -> local_datapoints = dp_info;
 
-    build_local_tree(ctx, &local_tree);
+    build_tree_kdtree_parallel(&local_tree);
+    // build_tree_kdtree_sample(&local_tree);
+    ctx -> local_data = local_tree.data;
+    
     elapsed_time = TIME_STOP;
     LOG_WRITE("Local trees init and build", elapsed_time);
 
@@ -393,14 +389,13 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx)
     MPI_DB_PRINT("----- Performing ngbh search -----\n");
     MPI_Barrier(ctx -> mpi_communicator);
 
-    mpi_ngbh_search(ctx, dp_info, &tree, &local_tree, ctx -> local_data, ctx -> k);
+    mpi_all_knn_search(ctx, dp_info, &tree, &local_tree, ctx -> k);
 
     MPI_Barrier(ctx -> mpi_communicator);
     elapsed_time = TIME_STOP;
     LOG_WRITE("Total time for all knn search", elapsed_time)
 
 
-
     TIME_START;
     //float_t id = id_estimate(ctx, dp_info, ctx -> local_n_points, 0.9, MY_FALSE);
     float_t id = compute_ID_two_NN_ML(ctx, dp_info, ctx -> local_n_points, MY_FALSE);
@@ -440,12 +435,19 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx)
     
     TIME_START;
     int* cl = (int*)MY_MALLOC(ctx -> local_n_points * sizeof(int));
-    for(int i = 0; i < ctx -> local_n_points; ++i) cl[i] = ctx -> local_datapoints[i].cluster_idx;
+    float_t* data_to_write = (float_t*)MY_MALLOC(ctx -> local_n_points * ctx -> k * sizeof(float_t));
+
+    for(int i = 0; i < ctx -> local_n_points; ++i)
+    {
+        cl[i] = ctx -> local_datapoints[i].cluster_idx;
+        idx_t idx = local_tree.__points[i].array_idx;
+        memcpy(data_to_write + idx*ctx -> dims, local_tree.__points[i].data, ctx -> dims*sizeof(float_t));
+    }
 
     if(ctx -> world_size <= 32)
     {
         big_ordered_buffer_to_file(ctx, cl, sizeof(int), ctx -> local_n_points, ctx -> output_assignment_file);
-        big_ordered_buffer_to_file(ctx, ctx -> local_data, sizeof(double), ctx -> local_n_points * ctx -> dims, ctx -> output_data_file);
+        big_ordered_buffer_to_file(ctx, data_to_write, sizeof(double), ctx -> local_n_points * ctx -> dims, ctx -> output_data_file);
 
         free(cl);
 
@@ -453,7 +455,7 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx)
     else
     {
         distributed_buffer_to_file(ctx, cl, sizeof(int), ctx -> local_n_points, ctx -> output_assignment_file);
-        distributed_buffer_to_file(ctx, ctx -> local_data, sizeof(double), ctx -> local_n_points * ctx -> dims, ctx -> output_data_file);
+        distributed_buffer_to_file(ctx, data_to_write, sizeof(double), ctx -> local_n_points * ctx -> dims, ctx -> output_data_file);
 
         free(cl);
 
@@ -463,15 +465,14 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx)
     LOG_WRITE("Write results to file", elapsed_time);
     
     
+    free(data_to_write);
     top_tree_free(ctx, &tree);
-    kdtree_v2_free(&local_tree);
+    kdtree_free(&local_tree);
     //clusters_free(&clusters);
 
     free(send_counts);
     free(displacements);
     //free(dp_info);
     
-    original_ps.data = NULL;
-    free_pointset(&original_ps);
     free(global_bin_counts_int);
 }
diff --git a/src/tree/heap.c b/src/tree/heap.c
deleted file mode 100644
index 67d38e5..0000000
--- a/src/tree/heap.c
+++ /dev/null
@@ -1,217 +0,0 @@
-#include "heap.h"
-
-void swap_heap_node(heap_node* a, heap_node* b){
-    heap_node tmp;
-    memcpy(&tmp,a,sizeof(heap_node));
-    memcpy(a,b,sizeof(heap_node));
-    memcpy(b,&tmp,sizeof(heap_node));
-    return;
-}
-
-void allocate_heap(heap* H, idx_t n){
-    H -> data = (heap_node*)malloc(n*sizeof(heap_node));
-    H -> N = n;
-    H -> count = 0;
-    return;
-}
-
-void init_heap(heap* H){
-    for(idx_t i = 0; i < H -> N; ++i)
-    {
-        H -> data[i].value = 0.;
-        H -> data[i].array_idx = MY_SIZE_MAX; 
-    }
-    return;
-}
-
-void free_heap(heap * H){ free(H -> data);}
-
-
-void heapify_max_heap(heap* H, idx_t node){
-    idx_t nn = node;
-    idx_t largest = nn; 
-    /*
-    Found gratest between children of node and boundcheck if the node is a leaf 
-    */
-    
-    while(1)
-    {
-        largest =   (HEAP_LCH(nn) < H -> N) && 
-                    (H -> data[HEAP_LCH(nn)].value > H -> data[largest].value ) ? HEAP_LCH(nn) : largest;
-
-        largest =   (HEAP_RCH(nn) < H -> N) && 
-                    (H -> data[HEAP_RCH(nn)].value > H -> data[largest].value ) ? HEAP_RCH(nn) : largest;
-        if(largest != nn) 
-        {
-            swap_heap_node(H -> data + nn, H -> data + largest);
-            nn = largest;
-        }
-        else
-        {
-            break;
-        }
-    }
-    
-
-    /*
-    if(HEAP_LCH(node) < H -> N){
-        //if(H -> data[HEAP_LCH(node)].value > H -> data[largest].value ) largest = HEAP_LCH(node);
-        largest = (H -> data[HEAP_LCH(nn)].value > H -> data[largest].value ) ? HEAP_LCH(nn) : largest;
-    }
-    if(HEAP_RCH(node) < H -> N){
-        //if(H -> data[HEAP_RCH(node)].value > H -> data[largest].value ) largest = HEAP_RCH(node);
-        largest = (H -> data[HEAP_RCH(nn)].value > H -> data[largest].value ) ? HEAP_RCH(nn) : largest;
-    }
-    if(largest == node){
-        return;
-    }
-    else{
-        swap_heap_node(H -> data + node, H -> data + largest);
-        heapify_max_heap(H, largest);
-    }
-    */
-}
-
-
-void set_root_max_heap(heap * H, FLOAT_TYPE val, idx_t array_idx){
-    H -> data[0].value = val;
-    H -> data[0].array_idx = array_idx;
-    heapify_max_heap(H,0);
-    return;
-}
-
-void insert_max_heap_insertion_sort(heap * H,const FLOAT_TYPE val,const idx_t array_idx){
-    heap_node tmpNode = {.value = val, .array_idx = array_idx};
-    if(H -> count < H -> N)
-    {
-        idx_t idx = H -> count;
-        H -> data[idx] = tmpNode;
-        ++(H -> count);
-        while(idx >= 1)
-        {
-            if(H -> data[idx].value < H -> data[idx - 1].value)
-            {
-                swap_heap_node((H -> data) + idx, (H -> data) + idx - 1);
-                idx--;
-            }
-            else
-            {
-                break;
-            }
-        }
-
-    }
-    else
-    {
-        if(H -> data[H -> count - 1].value > val)
-        {
-            idx_t idx = H -> count - 1;
-            H -> data[idx] = tmpNode;
-            while(idx >= 1)
-            {
-                if(H -> data[idx].value < H -> data[idx - 1].value)
-                {
-                    swap_heap_node(H -> data + idx, H -> data + idx - 1);
-                    idx--;
-                }
-                else
-                {
-                    break;
-                }
-            }
-        }
-    }
-    return;
-}
-
-void insert_max_heap(heap * H,const FLOAT_TYPE val,const idx_t array_idx){
-    int c1 = H -> count < H -> N;
-    int c2 = (val < H -> data[0].value) && (!c1);
-    int ctot = c1 + 2*c2;
-    switch (ctot) {
-        case 1:
-            {
-                idx_t node = H->count;
-                ++(H -> count);
-                H -> data[node].value = val;
-                H -> data[node].array_idx = array_idx;
-                /*
-                * Push up the node through the heap 
-                */
-                while(node && H -> data[node].value > H -> data[HEAP_PARENT(node)].value)
-                {
-                    swap_heap_node(H -> data + node, H -> data + HEAP_PARENT(node));
-                    node = HEAP_PARENT(node);
-                    //if(node == 0) break;
-                }
-            }
-            break;
-
-        case 2: 
-            {
-                set_root_max_heap(H,val,array_idx);
-            }
-            break;
-        default:
-            break;
-    }
-    /* alternative solution, left here if something breaks*/
-
-    //if(H -> count < H -> N){
-    //    idx_t node = H->count;
-    //    ++(H -> count);
-    //    H -> data[node].value = val;
-    //    H -> data[node].array_idx = array_idx;
-    //    /*
-    //    * Push up the node through the heap 
-    //    */
-    //    while(node && H -> data[node].value > H -> data[HEAP_PARENT(node)].value)
-    //    {
-    //        swap_heap_node(H -> data + node, H -> data + HEAP_PARENT(node));
-    //        node = HEAP_PARENT(node);
-    //        //if(node == 0) break;
-    //    }
-    //    return;
-    //}
-    //else if (val < H -> data[0].value)
-    //{
-    //    set_root_max_heap(H,val,array_idx);
-    //    return;
-    //}
-    //else
-    //{
-    //    return;
-    //}
-    
-    
-}
-
-#ifdef USE_FLOAT32
-    #define EPS 5.96e-08
-#else
-    #define EPS 2.11e-16
-#endif
-
-int cmp_heap_nodes(const void* a, const void* b)
-{
-    const heap_node* aa = (const heap_node*)a;
-    const heap_node* bb = (const heap_node*)b;
-    int val =  (aa -> value > bb -> value) - (aa -> value < bb -> value); 
-    //return vv; 
-    return val;
-
-
-}
-
-
-void heap_sort(heap* H){
-    idx_t n = H -> N;
-    qsort(H -> data, n, sizeof(heap_node),cmp_heap_nodes);
-    //for(idx_t i= (H -> N) - 1; i > 0; --i)
-    //{
-    //    swap_heap_node(H -> data, H -> data + i);
-    //    H -> N = i;
-    //    heapify_max_heap(H,0);
-    //}
-    //H -> N = n;
-}
diff --git a/src/tree/heap.h b/src/tree/heap.h
index 117ff8c..5b63bf8 100644
--- a/src/tree/heap.h
+++ b/src/tree/heap.h
@@ -1,18 +1,16 @@
 #pragma once
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <math.h>
 #include <limits.h>
 #include <stdint.h>
-#define T double
-#define DATA_DIMS 0 
-
 
 #ifdef USE_FLOAT32
-    #define FLOAT_TYPE float
+    #define float_t float
 #else
-    #define FLOAT_TYPE double 
+    #define float_t double 
 #endif
 
 #ifdef USE_INT32
@@ -30,49 +28,127 @@
 #define HP_LEFT_SIDE 0
 #define HP_RIGHT_SIDE 1
 
-
-struct heap_node
-{
-   FLOAT_TYPE value;
-   idx_t array_idx;
-} ;
-
-struct heap
+#define ALIGNMENT 64
+#define CHECK_ALLOCATION_NO_CTX(x) if(!x){printf("[!!!] Failed allocation: %s at line %d \n", __FILE__, __LINE__ ); exit(1);}
+#define MY_MALLOC(n) ({void* p = aligned_alloc(ALIGNMENT,n); CHECK_ALLOCATION_NO_CTX(p); memset(p, 0, n); p; })
+
+
+typedef struct {
+   float_t value;
+   idx_t   array_idx;
+} heap_node_t;
+
+typedef struct  {
+   size_t size; 
+   size_t count;
+   heap_node_t* data;
+} heap_t;
+
+static inline void heap_node_swap(heap_node_t* a, heap_node_t* b){
+    heap_node_t tmp = *a;
+    *a = *b;
+    *b = tmp;
+}
+
+static void heap_allocate(heap_t* H, idx_t n){
+    H -> data = (heap_node_t*)MY_MALLOC(n*sizeof(heap_node_t));
+    H -> size = n;
+    H -> count = 0;
+    return;
+}
+
+static void heap_initialize(heap_t* H){
+    for(idx_t i = 0; i < H->size; ++i)
+    {
+        H -> data[i].value = 0.;
+        H -> data[i].array_idx = MY_SIZE_MAX; 
+    }
+    return;
+}
+
+static void free_heap(heap_t * H){ free(H -> data);}
+
+
+// Optimized heapify (sift-down) that avoids repeated swaps.
+// It correctly uses H->count as the boundary.
+static inline void heapify_max_heap(heap_t* H, idx_t i) {
+    heap_node_t node_to_sift = H->data[i]; // Save the node we are sifting down
+    idx_t child;
+    while ((child = HEAP_LCH(i)) < H->count) {
+        idx_t r_child = HEAP_RCH(i);
+        // Find the largest child
+        
+        // if (r_child < H->count && H->data[r_child].value > H->data[child].value) {
+        //     child = r_child;
+        // }
+        
+        child = (r_child < H->count && H->data[r_child].value > H->data[child].value) ? r_child : child;
+        // If the node to sift is larger than its largest child, we are done
+        if (node_to_sift.value >= H->data[child].value) {
+            break;
+        }
+        // Move the larger child up
+        H->data[i] = H->data[child];
+        i = child;
+    }
+    // Place the original node in its final position
+    H->data[i] = node_to_sift;
+}
+
+
+static void set_root_max_heap(heap_t * H, const float_t val, const idx_t array_idx){
+    H -> data[0].value = val;
+    H -> data[0].array_idx = array_idx;
+    heapify_max_heap(H,0);
+    return;
+}
+
+// Optimized insertion logic inspired by kBoundedQueue.
+// It avoids swaps during bubble-up and correctly handles the full/not-full cases.
+static inline void max_heap_insert(heap_t * H, const float_t val, const idx_t array_idx){
+    if (H->count < H->size) {
+        // Heap is not full. Add to the end and bubble up.
+        idx_t i = H->count++;
+        // Bubble-up without swaps
+        while (i > 0) {
+            idx_t parent = HEAP_PARENT(i);
+            if (val <= H->data[parent].value) {
+                break; // Found the correct spot
+            }
+            // Move parent down
+            H->data[i] = H->data[parent];
+            i = parent;
+        }
+        H->data[i].value = val;
+        H->data[i].array_idx = array_idx;
+    } else if (val < H->data[0].value) {
+        // Heap is full, but new value is smaller than the max.
+        // Replace the root and sift it down using the optimized heapify.
+        H->data[0].value = val;
+        H->data[0].array_idx = array_idx;
+        heapify_max_heap(H, 0);
+    }
+    // If heap is full and new value is >= max, do nothing.
+}
+
+static int heap_node_compare(const void* a, const void* b)
 {
-  // [LT comment] is it possible to drop this N in favor of a global variable,
-  //              when the tree is compiled to be used in adP?
-  //              if N is constant, then we can save 8 bytes here
-   idx_t N; 
-   idx_t count;
-   struct heap_node* data;
-   
-} ;
-
-struct SimpleHeap
-{
-   idx_t N; 
-   T * data;
-} ;
-
-typedef struct SimpleHeap SimpleHeap;
-typedef struct heap heap;
-typedef struct heap_node heap_node;
-
-
-void allocate_heap(heap* H, idx_t n);
-
-void init_heap(heap* H);
-
-void free_heap(heap * H);
-
-void heapify_max_heap(heap* H, idx_t node);
-
-void set_root_max_heap(heap * H, FLOAT_TYPE val, idx_t array_idx);
-
-void insert_max_heap(heap * H, FLOAT_TYPE val, idx_t array_idx);
-
-void heap_sort(heap* H);
-void insert_max_heap_insertion_sort(heap * H,const FLOAT_TYPE val,const idx_t array_idx);
-
-
-int cmp_heap_nodes(const void* a, const void* b);
+    const heap_node_t* aa = (const heap_node_t*)a;
+    const heap_node_t* bb = (const heap_node_t*)b;
+    if (aa->value < bb->value) return -1;
+    if (aa->value > bb->value) return 1;
+    return 0;
+}
+
+
+static inline void heap_sort(heap_t* H){
+    idx_t n = H->count;
+    for(idx_t i = (H->count) - 1; i > 0; --i)
+    {
+        heap_node_swap(H->data, H->data + i);
+        H->count = i;
+        heapify_max_heap(H,0);
+    }
+    H->size = n;
+    // qsort(H->data, H->count, sizeof(heap_node_t), heap_node_compare);
+}
diff --git a/src/tree/kdtree.h b/src/tree/kdtree.h
new file mode 100644
index 0000000..8a0c9be
--- /dev/null
+++ b/src/tree/kdtree.h
@@ -0,0 +1,1278 @@
+#pragma once
+#include "heap.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <math.h>
+#include <limits.h>
+#include <stdint.h>
+#include <time.h>
+#include <float.h>
+
+#define DEFAULT_LEAF_SIZE 32
+
+#define ALIGNMENT 64
+#define CHECK_ALLOCATION_NO_CTX(x) if(!x){printf("[!!!] Failed allocation: %s at line %d \n", __FILE__, __LINE__ ); exit(1);}
+
+#ifndef MY_MALLOC
+    #define MY_MALLOC(n) ({size_t __n = (n + ALIGNMENT - 1) & ~(ALIGNMENT - 1); void* p = aligned_alloc(ALIGNMENT,__n); CHECK_ALLOCATION_NO_CTX(p); memset(p, 0, __n); p; })
+#endif
+
+#define FREE(x) if(x); free(x);
+
+#define HP_LEFT_SIDE  0
+#define HP_RIGHT_SIDE 1
+
+#define T double
+#define DATA_DIMS 0 
+
+#ifdef USE_FLOAT32
+    #define float_t float
+#else
+    #define float_t double 
+#endif
+
+#ifdef USE_INT32
+    #define MY_SIZE_MAX UINT32_MAX
+    #define idx_t uint32_t
+#else
+    #define MY_SIZE_MAX UINT64_MAX
+    #define idx_t uint64_t
+#endif
+
+#ifndef MIN
+    #define MIN(x,y) ((x < y) ? (x) : (y))
+#endif
+
+#ifndef MAX
+    #define MAX(x,y) ((x > y) ? (x) : (y))
+#endif
+
+typedef struct 
+{
+    float_t* lb;
+    float_t* ub;
+} bounding_box_t;
+
+// struct point_t
+// {
+//     idx_t    array_idx;
+//     float_t* data;
+// };
+// typedef struct point_t point_t;
+
+struct pivot_t;
+
+union pivot_data {
+
+    struct {
+        float_t split_value;    // The value used for the split
+        uint8_t split_variable; // The dimension used for the split
+        uint64_t lch_idx;       // 4-byte index to the left child in the pivot array
+        uint64_t rch_idx;       // 4-byte index to the right child in the pivot array
+    } internal;
+
+    // Data for a leaf node
+    struct {
+        uint32_t point_list_idx;   // Index to the first point in the global point array
+        uint32_t leaf_point_count; // Number of points in this leaf
+    } leaf;
+};
+
+// The new, slimmer pivot_t struct
+struct pivot_t {
+    uint8_t is_leaf; 
+    bounding_box_t bounding_box;
+    union pivot_data as;
+};
+
+struct kdtree_t
+{
+    float_t* data;
+    uint32_t  dims;
+    idx_t  n_points;
+    idx_t  root;
+
+    struct point_t* __points;
+    float_t* __boxes_data;
+    struct pivot_t* __pivots;
+};
+
+typedef struct pivot_t pivot_t;
+typedef struct kdtree_t kdtree_t;
+
+
+static inline float_t euclidean_distance(const float_t* p1, const float_t* p2, const idx_t dims) 
+{
+    register float_t d = 0;
+    for (idx_t i = 0; i < dims; ++i) 
+    {
+        d += (p1[i] - p2[i])*(p1[i] - p2[i]);
+    }
+    return d;
+    // return sqrt(d);
+}
+
+// #include <immintrin.h>
+// static inline float_t euclidean_distance_simd(float_t* p1, float_t* p2, idx_t dims) 
+// {
+//     // Use AVX to process 4 doubles at a time
+//     __m256d diff, sum = _mm256_setzero_pd();
+//
+//     idx_t i = 0;
+//     for (; i <= (idx_t)dims - 4; i += 4) {
+//         // Load 4 doubles from p1 and p2
+//         __m256d v1 = _mm256_loadu_pd(p1 + i);
+//         __m256d v2 = _mm256_loadu_pd(p2 + i);
+//
+//         // Subtract
+//         diff = _mm256_sub_pd(v1, v2);
+//
+//         // Square and add to sum (fused multiply-add)
+//         sum = _mm256_fmadd_pd(diff, diff, sum);
+//     }
+//
+//     // Horizontal sum of the 4 partial sums in the vector
+//     __m256d temp = _mm256_hadd_pd(sum, sum);
+//     __m128d sum_high = _mm256_extractf128_pd(temp, 1);
+//     __m128d result = _mm_add_pd(_mm256_castpd256_pd128(temp), sum_high);
+//
+//     float_t d = _mm_cvtsd_f64(result);
+//
+//     // Handle any remaining dimensions that aren't a multiple of 4
+//     for (; i < dims; ++i) {
+//         float_t term = p1[i] - p2[i];
+//         d += term * term;
+//     }
+//
+//     return d;
+// }
+
+#ifdef USE_FLOAT32
+typedef float v4f __attribute__((vector_size(16)));
+#else
+typedef double v4f __attribute__((vector_size(32)));
+#endif
+
+// static inline void point_swap(point_t *x, point_t *y, const idx_t dims) {
+//     // exchange idxs
+//
+//     idx_t tmp_idx;
+//     tmp_idx = x->array_idx;
+//     x->array_idx = y->array_idx;
+//     y->array_idx = tmp_idx;
+//
+//     //swap datapoint
+//
+//     for(idx_t i = 0; i < dims; ++i)
+//     {
+//         float_t tmp_data = x -> data[i];
+//         x -> data[i] = y -> data[i];
+//         y -> data[i] = tmp_data;
+//     }
+//
+// }
+
+static inline void point_swap(point_t *x, point_t *y, const idx_t dims) 
+{
+    point_t tmp = *x;
+    *x = *y;
+    *y = tmp;
+}
+
+static void pivot_initialize_all(pivot_t* node_array, idx_t n) {
+    for (idx_t i = 0; i < n; ++i) 
+    {
+        node_array[i].as.internal.lch_idx = -1;
+        node_array[i].as.internal.rch_idx = -1;
+        node_array[i].is_leaf = false;
+        node_array[i].as.internal.split_variable = -1;
+        node_array[i].as.internal.split_value = FLT_MAX;
+    }
+}
+
+static inline int faster_point_partition_random(point_t *arr, idx_t low, idx_t high, int split_var, idx_t dims) {
+    // Seed the random number generator only once for the application's lifetime.
+    static bool seeded = false;
+    if (!seeded) {
+        srand(42); // Using a fixed seed for reproducible behavior. Use time(NULL) for non-deterministic pivots.
+        seeded = true;
+    }
+    
+    // In a subarray of size > 1, choose a random pivot index.
+    if (low < high) {
+        idx_t pivot_idx = low + rand() % (high - low + 1);
+        // Swap the chosen pivot with the last element to use the standard Lomuto scheme.
+        point_swap(arr + pivot_idx, arr + high, dims);
+    }
+    
+    // The rest of this function is the standard Lomuto partition scheme.
+    point_t pivot = arr[high];
+    idx_t i = (low - 1);
+    for (idx_t j = low; j < high; j++) {
+        // If current element is smaller than or equal to the pivot
+        if (arr[j].data[split_var] <= pivot.data[split_var]) {
+            i++;
+            point_swap(arr + i, arr + j, dims);
+        }
+    }
+    point_swap(arr + i + 1, arr + high, dims);
+    return (i + 1);
+}
+
+
+static int point_compute_nth_element(point_t *a, idx_t left, idx_t right, idx_t rank, int split_var, idx_t dims) 
+{
+    // `k` is the absolute index of the rank-th smallest element in the full array.
+    // `rank` is the 0-indexed rank in the subarray `a[left...right]`.
+    int k = left + rank;
+
+    while (left <= right) {
+        // Base case: if the subarray has only one element, it's the desired element.
+        if (left == right) {
+            return left;
+        }
+
+        // Partition the array using a random pivot and get the pivot's final index.
+        int pivotIndex = faster_point_partition_random(a, left, right, split_var, dims);
+
+        // If the pivot is the rank-th element, we are done.
+        if (pivotIndex == k) {
+            return pivotIndex;
+        }
+        // If the rank-th element is in the left subarray, adjust the `right` boundary.
+        else if (pivotIndex > k) {
+            right = pivotIndex - 1;
+        }
+        // If the rank-th element is in the right subarray, adjust the `left` boundary.
+        else {
+            left = pivotIndex + 1;
+        }
+    }
+    return -1; // Should not be reached in a valid scenario.
+}
+
+
+static idx_t parallel_make_tree_w_ranks(point_t* points, pivot_t* pivots, 
+                                        idx_t child_pivot_idx, int left_or_right, 
+                                        idx_t start, idx_t end, int level, idx_t dims, 
+                                        idx_t start_rank, idx_t end_rank) 
+{
+    // this only build the first log2(ranks) levels
+    idx_t parent_pivot_idx = (child_pivot_idx > 0) ? ((child_pivot_idx - 1) / 2) : -1;
+    pivot_t *pivot = pivots + child_pivot_idx;
+    pivot_t *parent = (parent_pivot_idx != (idx_t)-1) ? (pivots + parent_pivot_idx) : NULL;
+
+    switch (left_or_right) 
+    {
+        case HP_LEFT_SIDE:
+        {
+            memcpy(pivot->bounding_box.lb, parent->bounding_box.lb, dims*sizeof(float_t));
+            memcpy(pivot->bounding_box.ub, parent->bounding_box.ub, dims*sizeof(float_t));
+            idx_t parent_splitting_dim = parent->as.internal.split_variable;
+            float_t parent_split_val = parent->as.internal.split_value;
+            pivot->bounding_box.ub[parent_splitting_dim] = parent_split_val; 
+        }
+        break;
+
+        case HP_RIGHT_SIDE:
+        {
+            memcpy(pivot->bounding_box.lb, parent->bounding_box.lb, dims*sizeof(float_t));
+            memcpy(pivot->bounding_box.ub, parent->bounding_box.ub, dims*sizeof(float_t));
+            idx_t parent_splitting_dim = parent->as.internal.split_variable;
+            float_t parent_split_val = parent->as.internal.split_value;
+            pivot->bounding_box.lb[parent_splitting_dim] = parent_split_val;
+        }
+        break;
+
+        default:
+        {
+            for(int i=0; i<dims; ++i) {
+                pivot->bounding_box.lb[i] = FLT_MAX;
+                pivot->bounding_box.ub[i] = -FLT_MAX;
+            }
+            for(idx_t i=start; i<=end; ++i) {
+                for(idx_t j=0; j<dims; ++j) {
+                    pivot->bounding_box.lb[j] = MIN(pivot->bounding_box.lb[j], points[i].data[j]);
+                    pivot->bounding_box.ub[j] = MAX(pivot->bounding_box.ub[j], points[i].data[j]);
+                }
+            }
+        }
+        break;
+    }
+    
+    //handle the case it is a leaf
+
+    if(start_rank == end_rank)
+    {
+        pivots[child_pivot_idx].is_leaf = true;
+        // write here the owner!!!
+        pivots[child_pivot_idx].as.leaf.leaf_point_count = start_rank;
+        return child_pivot_idx;
+    }
+    
+    int split_var = 0;
+    float_t max_extension = 0;
+    for(idx_t d=0; d < dims; ++d) {
+        float_t box_extension = pivot->bounding_box.ub[d] - pivot->bounding_box.lb[d];
+        box_extension = box_extension*box_extension;
+        if (box_extension > max_extension) {
+            max_extension = box_extension;
+            split_var = d;
+        }
+    }
+
+
+    idx_t middle_rank = (start_rank + end_rank)/2;
+
+    float_t fraction = (float_t)(middle_rank - start_rank + 1)/(float_t)(end_rank - start_rank + 1);
+    float_t nth_place  = (end - start)*fraction;
+
+    int nth_idx = point_compute_nth_element(points, start, end, (idx_t)nth_place, split_var, dims); 
+
+    // printf("child_pivot_idx %lu start_rank %lu end_rank %lu middle %lu nth_place %.2lf fraction %.2lf\n", 
+    //         child_pivot_idx, start_rank, end_rank, middle_rank, nth_place, fraction);
+    //printf("child_pivot_idx %lu start %lu end %lu nth %lu\n", child_pivot_idx, start, end, (idx_t)nth_idx);
+
+    if (nth_idx > -1) {
+        pivots[child_pivot_idx].as.internal.split_variable = split_var;
+        pivots[child_pivot_idx].as.internal.split_value = points[nth_idx].data[split_var];
+        
+        pivots[child_pivot_idx].as.internal.lch_idx = 
+            parallel_make_tree_w_ranks(points, pivots, child_pivot_idx*2 + 1, 
+                                    HP_LEFT_SIDE, start, nth_idx, level + 1, 
+                                    dims, start_rank, middle_rank);
+        
+        pivots[child_pivot_idx].as.internal.rch_idx = 
+            parallel_make_tree_w_ranks(points, pivots, child_pivot_idx*2 + 2, 
+                                        HP_RIGHT_SIDE, nth_idx + 1, end, level + 1, 
+                                        dims, middle_rank + 1, end_rank);
+        
+    }
+
+    return child_pivot_idx;
+}
+
+
+static void point_initialize_all(point_t* point_array, float_t* data, idx_t n, idx_t dims)
+{
+    for (idx_t i = 0; i < n; ++i) 
+    {
+        point_array[i].array_idx = i;
+        point_array[i].data = data + i*dims;
+    }
+
+}
+
+static inline int point_compare(const point_t *a, const point_t *b, int var) {
+    float_t res = a->data[var] - b->data[var];
+    return (res > 0);
+}
+
+static void pivot_print(pivot_t node, idx_t node_idx, idx_t dims) {
+    printf("Node %lu:\n", node_idx);
+    printf(" data: ");
+
+    printf("\n");
+
+    if(node.is_leaf)
+    {
+        printf("  LEAF node\n");
+        printf("  list len: %d\n", node.as.leaf.leaf_point_count);
+    }
+    else 
+    {
+        printf("  INTERNAL node\n");
+        printf("  split_value: %.4lf split_variable: %d\n", node.as.internal.split_value, node.as.internal.split_variable);
+        printf("  lch: %lu\n", node.as.internal.lch_idx);
+        printf("  rch: %lu\n", node.as.internal.rch_idx);
+    
+    }
+    printf("  lb_box: [ ");
+    for(int i=0; i<dims; ++i)
+    {
+        printf("%.4lf ", node.bounding_box.lb[i]);
+    }
+    printf(" ]\n");
+
+    printf("  ub_box: [ ");
+    for(int i=0; i<dims; ++i)
+    {
+        printf("%.4lf ", node.bounding_box.ub[i]);
+    }
+    printf(" ]\n");
+
+
+    printf("\n");
+}
+
+// Standard Lomuto partition function
+
+static inline int point_partition(point_t *arr, idx_t low, idx_t high, int split_var, idx_t dims) {
+    point_t pivot = arr[high];
+
+    int i = (low - 1);
+    for (int j = low; j <= high - 1; j++) {
+        if (!point_compare(arr + j, &pivot, split_var)) 
+        {
+            i++;
+            point_swap(arr + i, arr + j, dims);
+        }
+    }
+    point_swap(arr + i + 1, arr + high, dims);
+    return (i + 1);
+}
+
+
+
+
+static int faster_point_compute_median(point_t *a, idx_t left, idx_t right, int split_var, idx_t dims) 
+{
+    // `k` is the absolute index of the median in the original full array.
+    int k = left + ((right - left + 1) / 2);
+
+    while (left <= right) {
+        // Base case: if the subarray has only one element, it's the median.
+        if (left == right) {
+            return left;
+        }
+
+        // Partition the array using a random pivot and get the pivot's final index.
+        int pivotIndex = faster_point_partition_random(a, left, right, split_var, dims);
+
+        // If the pivot is the median, we are done.
+        if (pivotIndex == k) {
+            return pivotIndex;
+        }
+        // If the median is in the left subarray, adjust the `right` boundary.
+        else if (pivotIndex > k) {
+            right = pivotIndex - 1;
+        }
+        // If the median is in the right subarray, adjust the `left` boundary.
+        else {
+            left = pivotIndex + 1;
+        }
+    }
+    return -1; // Should not be reached in a valid scenario.
+}
+
+// Implementation of QuickSelect
+static int point_compute_median(point_t *a, idx_t left, idx_t right, int split_var, idx_t dims) 
+{
+    // printf("----------\n");
+    int k = left + ((right - left + 1) / 2);
+
+    if (left == right) return left;
+    if (left == (right - 1)) 
+    {
+        if (point_compare(a + left, a + right, split_var)) 
+        {
+            point_swap(a + left, a + right, dims);
+        }
+        return right;
+    }
+    while (left <= right) {
+
+        // Partition a[left..right] around a pivot
+        // and find the position of the pivot
+        int pivotIndex = point_partition(a, left, right, split_var, dims);
+        // printf("%d %d %d %d\n",left, right, k, pivotIndex);
+
+        // If pivot itself is the k-th smallest element
+        if (pivotIndex == k)
+        return pivotIndex;
+
+        // If there are more than k-1 elements on
+        // left of pivot, then k-th smallest must be
+        // on left side.
+
+        else if (pivotIndex > k) right = pivotIndex - 1;
+
+        // Else k-th smallest is on right side.
+        else left = pivotIndex + 1;
+    }
+    return -1;
+}
+
+static idx_t recursive_make_tree(point_t* points, pivot_t* pivots, 
+                             idx_t child_pivot_idx, int left_or_right, 
+                             idx_t start, idx_t end, int level, idx_t dims) 
+{
+    struct timespec start_node, stop_node;
+
+    clock_gettime(CLOCK_MONOTONIC, &start_node);
+
+    // printf("Entering Node: [%d %d] @level %d\n", 
+    //        start, end, level );
+
+    idx_t parent_pivot_idx = (child_pivot_idx > 0) ? ((child_pivot_idx - 1) / 2) : -1;
+    pivot_t *pivot = pivots + child_pivot_idx;
+    pivot_t *parent = pivots + parent_pivot_idx;
+
+    switch (left_or_right) 
+    {
+        case HP_LEFT_SIDE:
+        {
+            memcpy(pivot->bounding_box.lb, parent->bounding_box.lb, dims*sizeof(float_t));
+            memcpy(pivot->bounding_box.ub, parent->bounding_box.ub, dims*sizeof(float_t));
+            idx_t parent_splitting_dim = pivots[parent_pivot_idx].as.internal.split_variable;
+            float_t parent_split_val = pivots[parent_pivot_idx].as.internal.split_value;
+
+            pivot->bounding_box.ub[parent_splitting_dim] = parent_split_val; 
+        }
+        break;
+
+        case HP_RIGHT_SIDE:
+        {
+            memcpy(pivot->bounding_box.lb, parent->bounding_box.lb, dims*sizeof(float_t));
+            memcpy(pivot->bounding_box.ub, parent->bounding_box.ub, dims*sizeof(float_t));
+            idx_t parent_splitting_dim = pivots[parent_pivot_idx].as.internal.split_variable;
+            float_t parent_split_val = pivots[parent_pivot_idx].as.internal.split_value;
+
+            pivot->bounding_box.lb[parent_splitting_dim] = parent_split_val; 
+        }
+        break;
+
+        default:
+        {
+            for(int i=0; i<dims; ++i)
+            {
+                pivot->bounding_box.lb[i] = FLT_MAX;
+                pivot->bounding_box.ub[i] = -FLT_MAX;
+            }
+
+            // compute master bounding_box
+            //
+
+            for(idx_t i=start; i<=end; ++i)
+            {
+                for(idx_t j=0; j<dims; ++j)
+                {
+                    pivot->bounding_box.lb[j] = MIN(pivot->bounding_box.lb[j], points[i].data[j]);
+                    pivot->bounding_box.ub[j] = MAX(pivot->bounding_box.ub[j], points[i].data[j]);
+                }
+            }
+        }
+        break;
+         
+    }
+    
+    //int split_var = level % dims;
+
+    int split_var = 0;
+    float_t max_extension = 0;
+    for(idx_t d=0; d < dims; ++d)
+    {
+        float_t box_extension = pivot->bounding_box.ub[d] - pivot->bounding_box.lb[d];
+        box_extension = box_extension*box_extension;
+
+        split_var = (box_extension > max_extension) ? d : split_var;
+        max_extension = MAX(box_extension, max_extension);
+    }
+
+    if (end - start < DEFAULT_LEAF_SIZE)
+    {
+        pivots[child_pivot_idx].is_leaf = 1;
+        pivots[child_pivot_idx].as.leaf.leaf_point_count = (size_t)(end - start + 1);
+        pivots[child_pivot_idx].as.leaf.point_list_idx = start;
+        return child_pivot_idx;
+    }
+
+    int median_idx = -1;
+
+    median_idx = faster_point_compute_median(points, start, end, split_var, dims);
+    // printf("%d median idx\n", median_idx);
+    if (median_idx > -1) 
+    {
+        // generate a pivot
+        pivots[child_pivot_idx].as.internal.split_variable = split_var;
+        pivots[child_pivot_idx].as.internal.split_value = points[median_idx].data[split_var];
+        pivots[child_pivot_idx].as.internal.lch_idx = 
+            recursive_make_tree(points, pivots, child_pivot_idx*2 + 1, 
+                                HP_LEFT_SIDE, start, median_idx, level + 1, dims);
+        pivots[child_pivot_idx].as.internal.rch_idx = 
+            recursive_make_tree(points, pivots, child_pivot_idx*2 + 2, 
+                                HP_RIGHT_SIDE, median_idx + 1, end, level + 1, dims);
+
+    }
+
+    clock_gettime(CLOCK_MONOTONIC, &stop_node);
+
+    // printf("Exiting Node: [%d %d] @level %d -> elapsed: %f\n", 
+    //        start, end, level, ((float)stop_node.tv_sec - (float)start_node.tv_sec) + 
+    //                           ((float)stop_node.tv_nsec - (float)start_node.tv_nsec)/1e9  );
+
+    return child_pivot_idx;
+}
+
+static inline float_t hyper_plane_dist(const float_t *p1, const float_t p2, const int var) 
+{
+    return p1[var] - p2;
+}
+
+static inline float_t box_dist(const float_t *p, const float_t *lb, const float_t* ub, const int dims) 
+{
+    float_t r = 0;
+    for (idx_t i=0; i<dims; i++) {
+        if(p[i] < lb[i]) {
+            r += (lb[i] - p[i]) * (lb[i] - p[i]);
+        } else if (p[i] > ub[i]) {
+            r += (p[i] - ub[i]) * (p[i] - ub[i]);
+        }
+    }
+    return r;
+}
+
+static inline int hyper_plane_side(const float_t *p1, const float_t *p2, const int var)
+{
+    return p1[var] > p2[var];
+}
+
+static void knn_sub_tree_search(const float_t *point, const point_t* data_points, const pivot_t* pivots, 
+                         const idx_t root_idx, heap_t *H, const idx_t dims, idx_t* vis_nodes) 
+{
+    pivot_t root = pivots[root_idx];
+    *vis_nodes = *vis_nodes + 1;
+    if (root.is_leaf) 
+    {
+        // take the base pointer
+        idx_t base_point = root.as.leaf.point_list_idx;
+        const point_t* leaf_points = data_points + base_point;
+        float_t* base_data   = leaf_points[0].data;
+        idx_t point_list_count = root.as.leaf.leaf_point_count;
+
+
+        float_t dists[DEFAULT_LEAF_SIZE];
+        #pragma unroll 4
+        for (size_t i = 0; i < point_list_count; ++i) 
+        {
+            float_t distance = euclidean_distance(point, base_data + i*dims, dims);
+            dists[i] = distance;
+        }
+
+        for (size_t i = 0; i < point_list_count; ++i) 
+        {
+            point_t data_point = leaf_points[i];
+            max_heap_insert(H, dists[i], data_point.array_idx);
+        }
+
+        // CORRECTED an efficient single loop:
+        // for (size_t i = 0; i < point_list_count; ++i)
+        // {
+        //     *vis_nodes = *vis_nodes + 1;
+        //
+        //     // Calculate distance using the correct data pointer
+        //     float_t distance = euclidean_distance(point, base_data + i*dims, dims);
+        //
+        //     // Immediately use the distance. No temporary array, less memory traffic.
+        //     max_heap_insert(H, distance, leaf_points[i].array_idx);
+        // }
+        return;
+    }
+
+
+
+    //float_t hp_distance = hyper_plane_dist(point, root->split_value, root->split_variable);
+    float_t hp_distance = hyper_plane_dist(point, root.as.internal.split_value, root.as.internal.split_variable);
+
+    int side = hp_distance > 0.f;
+
+    switch (side) {
+    case HP_LEFT_SIDE:
+        knn_sub_tree_search(point, data_points, pivots, root.as.internal.lch_idx, H, dims, vis_nodes);
+        break;
+
+    case HP_RIGHT_SIDE:
+        knn_sub_tree_search(point, data_points, pivots, root.as.internal.rch_idx, H, dims, vis_nodes);
+        break;
+
+    default:
+        break;
+    }
+    float_t max_d = H->data[0].value;
+
+
+    // int c = max_d > (hp_distance * hp_distance);
+    // if (c || (H->count) < (H->size))
+    // {
+    //     switch (side) 
+    //     {
+    //         case HP_LEFT_SIDE:
+    //             if (root->rch) knn_sub_tree_search(point, root->rch, H, dims, vis_nodes);
+    //             break;
+    //
+    //         case HP_RIGHT_SIDE:
+    //             if (root->lch) knn_sub_tree_search(point, root->lch, H, dims, vis_nodes);
+    //             break;
+    //
+    //         default:
+    //             break;
+    //     }
+    // }
+
+    int heap_is_not_full = (H->count) < (H->size);
+    switch (side) 
+    {
+        case HP_LEFT_SIDE:
+            {
+                bounding_box_t box = pivots[root.as.internal.rch_idx].bounding_box; 
+                float_t dist = box_dist(point, box.lb, box.ub, dims);
+                if((max_d > dist) || heap_is_not_full) 
+                    knn_sub_tree_search(point, data_points, pivots, root.as.internal.rch_idx, H, dims, vis_nodes);
+            }
+            break;
+
+        case HP_RIGHT_SIDE:
+            {
+                bounding_box_t box = pivots[root.as.internal.lch_idx].bounding_box; 
+                float_t dist = box_dist(point, box.lb, box.ub, dims);
+                if((max_d > dist) || heap_is_not_full) 
+                    knn_sub_tree_search(point, data_points, pivots, root.as.internal.lch_idx, H, dims, vis_nodes);
+            }
+            break;
+
+        default:
+            break;
+    }
+    return;
+}
+
+
+
+static void kdtree_initialize(kdtree_t *tree, float_t *data, size_t n_points, unsigned int dims) 
+{
+    tree->dims = dims;
+    tree->data = data;
+    tree->n_points = n_points;
+    tree->__points = (point_t *)MY_MALLOC(n_points * sizeof(point_t));
+    tree->__pivots = (pivot_t *)MY_MALLOC(n_points * sizeof(pivot_t));
+    tree->__boxes_data = (float_t *)MY_MALLOC(2 * n_points * sizeof(float_t)*dims);
+
+    for(idx_t i=0; i<n_points; ++i)
+    {
+        idx_t ii = 2*i;
+        tree->__pivots[i].bounding_box.lb = tree->__boxes_data + ii*dims; 
+        tree->__pivots[i].bounding_box.ub = tree->__boxes_data + (ii+1)*dims; 
+    }
+    pivot_initialize_all(tree->__pivots, n_points);
+    point_initialize_all(tree->__points, data, n_points, dims);
+    tree->root = -1;
+}
+
+static void recursive_print(pivot_t* pivots, idx_t node_idx, idx_t dims, idx_t level)
+{
+    pivot_t node =  pivots[node_idx];
+    if(level < 3)
+    {
+        printf("---LEVEL %lu ---\n", level);
+        pivot_print(node, node_idx, dims);
+
+    }
+    if(!node.is_leaf) 
+    {
+        recursive_print(pivots, node.as.internal.lch_idx, dims, level+1);
+        recursive_print(pivots, node.as.internal.rch_idx, dims, level+1);
+    }
+}
+
+static void kdtree_print(kdtree_t* tree)
+{
+    idx_t level = 0;
+    recursive_print(tree->__pivots, tree -> root, tree -> dims, level);
+}
+
+// void kdtree_compact_data(kdtree_t* tree)
+// {
+//     float_t* new_data = (float_t*)calloc(tree->dims * tree->n_points * sizeof(float_t), 1); 
+//     for(idx_t i = 0; i < tree->n_points; ++i)
+//     {
+//         kdnode_t* node = tree->_nodes + i;
+//                     memcpy(new_data + i*tree->dims, node->data, tree->dims * sizeof(float_t));        node->data = new_data + i*tree->dims; 
+//     }
+//     free(tree -> data);
+//     tree -> data = new_data;
+// }
+
+
+
+
+#include <omp.h>
+
+#define SERIAL_BUILD_CUTOFF 4096
+#define NLEVELS 6
+#define OVERSAMPLING 32
+
+static idx_t recursive_make_pivots(point_t* points, pivot_t* pivots, idx_t current_pivot_idx, idx_t parent_pivot_idx, 
+                            int left_or_right, idx_t start, idx_t end, int level, int dims, 
+                            idx_t* last_bucket_assigned, idx_t* leaf_pivot_indices)
+{
+    pivot_t *pivot = pivots + current_pivot_idx;
+    pivot_t *parent = (parent_pivot_idx != (idx_t)-1) ? (pivots + parent_pivot_idx) : NULL;
+    
+    if(level == 0)
+    {
+        pivot->is_leaf = true;
+        // Store the bucket ID in the leaf node so find_bucket can retrieve it
+        pivot->as.leaf.leaf_point_count = *last_bucket_assigned;
+        if (leaf_pivot_indices) {
+            leaf_pivot_indices[*last_bucket_assigned] = current_pivot_idx;
+        }
+        (*last_bucket_assigned)++;
+        return current_pivot_idx;
+    }
+
+
+    switch (left_or_right) 
+    {
+        case HP_LEFT_SIDE:
+        {
+            memcpy(pivot->bounding_box.lb, parent->bounding_box.lb, dims*sizeof(float_t));
+            memcpy(pivot->bounding_box.ub, parent->bounding_box.ub, dims*sizeof(float_t));
+            idx_t parent_splitting_dim = parent->as.internal.split_variable;
+            float_t parent_split_val = parent->as.internal.split_value;
+            pivot->bounding_box.ub[parent_splitting_dim] = parent_split_val; 
+        }
+        break;
+
+        case HP_RIGHT_SIDE:
+        {
+            memcpy(pivot->bounding_box.lb, parent->bounding_box.lb, dims*sizeof(float_t));
+            memcpy(pivot->bounding_box.ub, parent->bounding_box.ub, dims*sizeof(float_t));
+            idx_t parent_splitting_dim = parent->as.internal.split_variable;
+            float_t parent_split_val = parent->as.internal.split_value;
+            pivot->bounding_box.lb[parent_splitting_dim] = parent_split_val;
+        }
+        break;
+
+        default:
+        {
+            for(int i=0; i<dims; ++i) {
+                pivot->bounding_box.lb[i] = FLT_MAX;
+                pivot->bounding_box.ub[i] = -FLT_MAX;
+            }
+            for(idx_t i=start; i<=end; ++i) {
+                for(idx_t j=0; j<dims; ++j) {
+                    pivot->bounding_box.lb[j] = MIN(pivot->bounding_box.lb[j], points[i].data[j]);
+                    pivot->bounding_box.ub[j] = MAX(pivot->bounding_box.ub[j], points[i].data[j]);
+                }
+            }
+        }
+        break;
+    }
+    
+    int split_var = 0;
+    float_t max_extension = 0;
+    for(idx_t d=0; d < dims; ++d) {
+        float_t box_extension = pivot->bounding_box.ub[d] - pivot->bounding_box.lb[d];
+        box_extension = box_extension*box_extension;
+        if (box_extension > max_extension) {
+            max_extension = box_extension;
+            split_var = d;
+        }
+    }
+
+    int median_idx = faster_point_compute_median(points, start, end, split_var, dims);
+
+    //build tree on sampled points
+
+    if (median_idx > -1) {
+        pivots[current_pivot_idx].as.internal.split_variable = split_var;
+        pivots[current_pivot_idx].as.internal.split_value = points[median_idx].data[split_var];
+
+        pivots[current_pivot_idx].as.internal.lch_idx = recursive_make_pivots(points, pivots, current_pivot_idx * 2 + 1, current_pivot_idx, 
+                                                                              HP_LEFT_SIDE, start, median_idx, level - 1, dims, last_bucket_assigned, leaf_pivot_indices);
+        pivots[current_pivot_idx].as.internal.rch_idx = recursive_make_pivots(points, pivots, current_pivot_idx * 2 + 2, current_pivot_idx, 
+                                                                              HP_RIGHT_SIDE, median_idx + 1, end, level - 1, dims, last_bucket_assigned, leaf_pivot_indices);
+    }
+    return current_pivot_idx;
+}
+
+static inline idx_t find_bucket(point_t* p, pivot_t* pivots, idx_t root_idx) {
+    idx_t curr = root_idx;
+    while (!pivots[curr].is_leaf) {
+        int split_var = pivots[curr].as.internal.split_variable;
+        float_t split_val = pivots[curr].as.internal.split_value;
+        if (p->data[split_var] <= split_val) {
+            curr = pivots[curr].as.internal.lch_idx;
+        } else {
+            curr = pivots[curr].as.internal.rch_idx;
+        }
+    }
+    // The bucket ID was stored in leaf_point_count by recursive_make_pivots
+    return (idx_t)pivots[curr].as.leaf.leaf_point_count;
+}
+
+static void parallel_partition_points(point_t* points, point_t* scratch_points, 
+                               pivot_t* pivots, idx_t root_idx, 
+                               idx_t start, idx_t end, 
+                               idx_t* bucket_start_indices, 
+                               idx_t* bucket_counts, 
+                               idx_t n_buckets) {
+    idx_t n = end - start + 1;
+    #define PARTITION_BLOCK_SIZE 2048
+    idx_t num_blocks = (n + PARTITION_BLOCK_SIZE - 1) / PARTITION_BLOCK_SIZE;
+
+    // Allocate local counts: [num_blocks][n_buckets]
+    idx_t* block_bucket_counts = (idx_t*)calloc(num_blocks * n_buckets, sizeof(idx_t));
+    if (!block_bucket_counts) {
+        printf("Allocation failed for block_bucket_counts\n");
+        exit(1);
+    }
+
+    // 1. Local histogram
+    #pragma omp parallel for schedule(static)
+    for (idx_t b = 0; b < num_blocks; ++b) {
+        idx_t block_start = b * PARTITION_BLOCK_SIZE;
+        idx_t block_end = MIN(block_start + PARTITION_BLOCK_SIZE, n);
+        idx_t* local_counts = &block_bucket_counts[b * n_buckets];
+        
+        for (idx_t i = block_start; i < block_end; ++i) {
+            // Note: points[start + i] is the correct access
+            idx_t bucket = find_bucket(&points[start + i], pivots, root_idx);
+            local_counts[bucket]++;
+        }
+    }
+
+    // 2. Global histogram and prefix sum
+    memset(bucket_counts, 0, n_buckets * sizeof(idx_t));
+    for (idx_t b = 0; b < num_blocks; ++b) {
+        for (idx_t k = 0; k < n_buckets; ++k) {
+            bucket_counts[k] += block_bucket_counts[b * n_buckets + k];
+        }
+    }
+
+    bucket_start_indices[0] = 0;
+    for (idx_t k = 1; k < n_buckets; ++k) {
+        bucket_start_indices[k] = bucket_start_indices[k-1] + bucket_counts[k-1];
+    }
+
+    idx_t* block_write_offsets = (idx_t*)malloc(num_blocks * n_buckets * sizeof(idx_t));
+    idx_t* current_offsets = (idx_t*)malloc(n_buckets * sizeof(idx_t));
+    
+    memcpy(current_offsets, bucket_start_indices, n_buckets * sizeof(idx_t));
+
+    for (idx_t b = 0; b < num_blocks; ++b) {
+        for (idx_t k = 0; k < n_buckets; ++k) {
+            block_write_offsets[b * n_buckets + k] = current_offsets[k];
+            current_offsets[k] += block_bucket_counts[b * n_buckets + k];
+        }
+    }
+    free(current_offsets);
+
+    // 3. Move points to scratch (using slice at `start`)
+    #pragma omp parallel for schedule(static)
+    for (idx_t b = 0; b < num_blocks; ++b) {
+        idx_t block_start = b * PARTITION_BLOCK_SIZE;
+        idx_t block_end = MIN(block_start + PARTITION_BLOCK_SIZE, n);
+        idx_t* local_offsets = &block_write_offsets[b * n_buckets];
+        
+        for (idx_t i = block_start; i < block_end; ++i) {
+            idx_t bucket = find_bucket(&points[start + i], pivots, root_idx);
+            idx_t dest_offset = local_offsets[bucket]++;
+            
+            // Write to scratch_points at the correct offset relative to `start`
+            // scratch_points is assumed to be a parallel array to points
+            scratch_points[start + dest_offset] = points[start + i];
+        }
+    }
+
+    // 4. Copy back to points
+    #pragma omp parallel for schedule(static)
+    for(idx_t i=0; i<n; ++i) {
+         points[start + i] = scratch_points[start + i];
+    }
+
+    free(block_bucket_counts);
+    free(block_write_offsets);
+}
+
+static idx_t sample_recursive_make_tree(point_t* points, point_t* scratch_points, pivot_t* pivots, 
+                                 idx_t current_pivot_idx, idx_t parent_pivot_idx, int left_or_right,
+                                 idx_t start, idx_t end, int level, idx_t dims)
+{
+    if ((end - start) < SERIAL_BUILD_CUTOFF) {
+        return recursive_make_tree(points, pivots, current_pivot_idx, left_or_right, start, end, level, dims);
+    }
+
+    pivot_t *pivot = pivots + current_pivot_idx;
+    pivot_t *parent = (parent_pivot_idx != (idx_t)-1) ? (pivots + parent_pivot_idx) : NULL;
+
+    int sample_levels = NLEVELS; 
+    idx_t n_buckets = 1 << sample_levels;
+    idx_t n_samples = (n_buckets - 1) * OVERSAMPLING; 
+
+    // If not enough points to sample, fallback
+    if (n_samples > (end - start + 1)) { 
+        return recursive_make_tree(points, pivots, current_pivot_idx, left_or_right, start, end, level, dims);
+    }
+
+    float_t* samples = (float_t*)calloc(n_samples * dims * sizeof(float_t), 1);
+    point_t* tmp_points = (point_t*)calloc(n_samples * sizeof(point_t), 1);
+    
+    // 1. Random sampling
+    for(idx_t s=0; s<n_samples; ++s)
+    {
+        idx_t ii = rand() % (end - start + 1) + start;
+        memcpy(samples + s * dims, points[ii].data, dims * sizeof(float_t));        
+        tmp_points[s].data = samples + s * dims;
+        tmp_points[s].array_idx = s;
+    }
+
+    // 2. Setup root bounding box
+    switch (left_or_right) 
+    {
+        case HP_LEFT_SIDE:
+            memcpy(pivot->bounding_box.lb, parent->bounding_box.lb, dims*sizeof(float_t));
+            memcpy(pivot->bounding_box.ub, parent->bounding_box.ub, dims*sizeof(float_t));
+            pivot->bounding_box.ub[parent->as.internal.split_variable] = parent->as.internal.split_value; 
+            break;
+        case HP_RIGHT_SIDE:
+            memcpy(pivot->bounding_box.lb, parent->bounding_box.lb, dims*sizeof(float_t));
+            memcpy(pivot->bounding_box.ub, parent->bounding_box.ub, dims*sizeof(float_t));
+            pivot->bounding_box.lb[parent->as.internal.split_variable] = parent->as.internal.split_value;
+            break;
+        default:
+            for(int i=0; i<dims; ++i) {
+                pivot->bounding_box.lb[i] = FLT_MAX;
+                pivot->bounding_box.ub[i] = -FLT_MAX;
+            }
+             for(idx_t i=start; i<=end; ++i) {
+                for(idx_t j=0; j<dims; ++j) {
+                    pivot->bounding_box.lb[j] = MIN(pivot->bounding_box.lb[j], points[i].data[j]);
+                    pivot->bounding_box.ub[j] = MAX(pivot->bounding_box.ub[j], points[i].data[j]);
+                }
+            }
+            break;
+    }
+
+    // 3. Build Sample Tree (Pivots)
+    idx_t last_bucket_assigned = 0;
+    idx_t* leaf_pivot_indices = (idx_t*)malloc(n_buckets * sizeof(idx_t));
+    
+    recursive_make_pivots(tmp_points, pivots, current_pivot_idx, parent_pivot_idx, 
+                          left_or_right, 0, n_samples - 1, sample_levels, dims, 
+                          &last_bucket_assigned, leaf_pivot_indices);
+
+    // 4. Partition Points
+    idx_t* bucket_start_indices = (idx_t*)malloc(n_buckets * sizeof(idx_t));
+    idx_t* bucket_counts = (idx_t*)malloc(n_buckets * sizeof(idx_t));
+    
+    parallel_partition_points(points, scratch_points, pivots, current_pivot_idx, 
+                              start, end, bucket_start_indices, bucket_counts, n_buckets);
+
+    // 5. Recursively build subtrees
+    for(idx_t b = 0; b < n_buckets; ++b) 
+    {
+        if(bucket_counts[b] > 0) 
+        {
+            idx_t p_idx = leaf_pivot_indices[b];
+            idx_t b_start = start + bucket_start_indices[b];
+            idx_t b_end = b_start + bucket_counts[b] - 1;
+            
+            // Root is 0. Left child of p is 2p+1.
+            int side = (p_idx % 2 != 0) ? HP_LEFT_SIDE : HP_RIGHT_SIDE;
+            if (p_idx == 0) side = -1; 
+
+            #pragma omp task
+            {
+                pivots[p_idx].is_leaf = 0;
+                sample_recursive_make_tree(points, scratch_points, pivots, p_idx, 
+                                           (p_idx > 0) ? (p_idx - 1) / 2 : -1, side, 
+                                           b_start, b_end, level + sample_levels, dims);
+            }
+        }
+    }
+    #pragma omp taskwait
+
+    free(leaf_pivot_indices);
+    free(bucket_start_indices);
+    free(bucket_counts);
+    free(samples);
+    free(tmp_points);
+
+    return current_pivot_idx;
+}
+
+static idx_t parallel_recursive_make_tree(point_t* points, pivot_t* pivots, 
+                                   idx_t child_pivot_idx, int left_or_right, 
+                                   idx_t start, idx_t end, int level, idx_t dims);
+
+static void build_tree_kdtree_parallel(kdtree_t* tree) 
+{
+    /*************************************************
+    * Wrapper for the parallel make_tree function.  *
+    *************************************************/
+    #pragma omp parallel
+    {
+        #pragma omp single
+        {
+            tree->root = parallel_recursive_make_tree(tree->__points, tree->__pivots, 0, -1,
+                                                      0, tree->n_points-1, 0, tree->dims);
+        }
+    }
+
+    float_t* tmp_data = (float_t*)MY_MALLOC(tree->n_points * tree->dims * sizeof(float_t));
+
+    #pragma omp parallel for
+    for(idx_t i = 0; i < tree->n_points; ++i)
+    {
+        float_t* datapoint = tree->__points[i].data;
+        memcpy(tmp_data + tree->dims * i, datapoint, tree->dims * sizeof(float_t));
+        tree->__points[i].data = tmp_data + tree->dims * i;
+    }
+    free(tree->data);
+    tree->data = tmp_data;
+}
+
+static idx_t parallel_recursive_make_tree(point_t* points, pivot_t* pivots, 
+                                   idx_t child_pivot_idx, int left_or_right, 
+                                   idx_t start, idx_t end, int level, idx_t dims) 
+{
+    // Fallback to serial version for small subproblems
+    if ((end - start) < SERIAL_BUILD_CUTOFF) {
+        return recursive_make_tree(points, pivots, child_pivot_idx, left_or_right, start, end, level, dims);
+    }
+
+    // This logic is identical to the serial version, as the partitioning step itself remains serial.
+    // The parallelism comes from recursively calling this function in parallel tasks.
+    idx_t parent_pivot_idx = (child_pivot_idx > 0) ? ((child_pivot_idx - 1) / 2) : -1;
+    pivot_t *pivot = pivots + child_pivot_idx;
+    pivot_t *parent = (parent_pivot_idx != (idx_t)-1) ? (pivots + parent_pivot_idx) : NULL;
+
+    switch (left_or_right) 
+    {
+        case HP_LEFT_SIDE:
+        {
+            memcpy(pivot->bounding_box.lb, parent->bounding_box.lb, dims*sizeof(float_t));
+            memcpy(pivot->bounding_box.ub, parent->bounding_box.ub, dims*sizeof(float_t));
+            idx_t parent_splitting_dim = parent->as.internal.split_variable;
+            float_t parent_split_val = parent->as.internal.split_value;
+            pivot->bounding_box.ub[parent_splitting_dim] = parent_split_val; 
+        }
+        break;
+
+        case HP_RIGHT_SIDE:
+        {
+            memcpy(pivot->bounding_box.lb, parent->bounding_box.lb, dims*sizeof(float_t));
+            memcpy(pivot->bounding_box.ub, parent->bounding_box.ub, dims*sizeof(float_t));
+            idx_t parent_splitting_dim = parent->as.internal.split_variable;
+            float_t parent_split_val = parent->as.internal.split_value;
+            pivot->bounding_box.lb[parent_splitting_dim] = parent_split_val;
+        }
+        break;
+
+        default:
+        {
+            for(int i=0; i<dims; ++i) {
+                pivot->bounding_box.lb[i] = FLT_MAX;
+                pivot->bounding_box.ub[i] = -FLT_MAX;
+            }
+            for(idx_t i=start; i<=end; ++i) {
+                for(idx_t j=0; j<dims; ++j) {
+                    pivot->bounding_box.lb[j] = MIN(pivot->bounding_box.lb[j], points[i].data[j]);
+                    pivot->bounding_box.ub[j] = MAX(pivot->bounding_box.ub[j], points[i].data[j]);
+                }
+            }
+        }
+        break;
+    }
+    
+    int split_var = 0;
+    float_t max_extension = 0;
+    for(idx_t d=0; d < dims; ++d) {
+        float_t box_extension = pivot->bounding_box.ub[d] - pivot->bounding_box.lb[d];
+        box_extension = box_extension*box_extension;
+        if (box_extension > max_extension) {
+            max_extension = box_extension;
+            split_var = d;
+        }
+    }
+
+    if (end - start < DEFAULT_LEAF_SIZE) {
+        pivots[child_pivot_idx].is_leaf = 1;
+        pivots[child_pivot_idx].as.leaf.leaf_point_count = (size_t)(end - start + 1);
+        pivots[child_pivot_idx].as.leaf.point_list_idx = start;
+        return child_pivot_idx;
+    }
+
+    int median_idx = faster_point_compute_median(points, start, end, split_var, dims);
+    
+    if (median_idx > -1) {
+        pivots[child_pivot_idx].as.internal.split_variable = split_var;
+        pivots[child_pivot_idx].as.internal.split_value = points[median_idx].data[split_var];
+        
+        #pragma omp task
+        pivots[child_pivot_idx].as.internal.lch_idx = 
+            parallel_recursive_make_tree(points, pivots, child_pivot_idx*2 + 1, 
+                                         HP_LEFT_SIDE, start, median_idx, level + 1, dims);
+        
+        #pragma omp task
+        pivots[child_pivot_idx].as.internal.rch_idx = 
+            parallel_recursive_make_tree(points, pivots, child_pivot_idx*2 + 2, 
+                                         HP_RIGHT_SIDE, median_idx + 1, end, level + 1, dims);
+        
+        #pragma omp taskwait
+    }
+
+    return child_pivot_idx;
+}
+
+
+static void kdtree_free(kdtree_t *tree) 
+{
+
+    FREE(tree->__pivots)
+    FREE(tree->__points)
+    FREE(tree->__boxes_data);
+    FREE(tree->data);
+}
+
+static void build_tree_kdtree(kdtree_t* tree)
+{
+    /*************************************************
+    * Wrapper for make_tree function.               *
+    * Simplifies interfaces and takes time measures *
+    *************************************************/
+    // tree->root = recursive_make_tree(tree->__points, tree->__pivots, 0, -1, 0, tree->n_points-1, 0, tree->dims);
+    tree->root = parallel_recursive_make_tree(tree->__points, tree->__pivots, 0, -1, 0, tree->n_points-1, 0, tree->dims);
+
+    float_t* tmp_data = (float_t*)MY_MALLOC(tree->n_points * tree->dims * sizeof(float_t));
+    #pragma omp parallel for
+    for(idx_t i = 0; i < tree->n_points; ++i)
+    {
+        float_t* datapoint = tree->__points[i].data;
+        memcpy(tmp_data + tree->dims * i, datapoint, tree->dims * sizeof(float_t));
+        tree->__points[i].data = tmp_data + tree->dims * i;
+    }
+    free(tree->data);
+    tree->data = tmp_data;
+}
+
+static void build_tree_kdtree_sample(kdtree_t* tree) 
+{
+    point_t* scratch_points = (point_t*)MY_MALLOC(tree->n_points * sizeof(point_t));
+
+    #pragma omp parallel
+    {
+        #pragma omp single
+        {
+            tree->root = sample_recursive_make_tree(tree->__points, scratch_points, tree->__pivots, 0, -1,
+                                                      -1, 0, tree->n_points-1, 0, tree->dims);
+        }
+    }
+    
+    FREE(scratch_points);
+
+    float_t* tmp_data = (float_t*)MY_MALLOC(tree->n_points * tree->dims * sizeof(float_t));
+
+    #pragma omp parallel for
+    for(idx_t i = 0; i < tree->n_points; ++i)
+    {
+        float_t* datapoint = tree->__points[i].data;
+        memcpy(tmp_data + tree->dims * i, datapoint, tree->dims * sizeof(float_t));
+        tree->__points[i].data = tmp_data + tree->dims * i;
+    }
+    free(tree->data);
+    tree->data = tmp_data;
+}
diff --git a/src/tree/kdtreeV2.c b/src/tree/kdtreeV2.c
deleted file mode 100644
index f7b5e7d..0000000
--- a/src/tree/kdtreeV2.c
+++ /dev/null
@@ -1,378 +0,0 @@
-#include "kdtreeV2.h"
-#include "heap.h"
-#include <math.h>
-#include <stdint.h>
-#include <time.h>
-
-#define DEFAULT_LEAF_SIZE 30
-
-extern unsigned int data_dims;
-
-FLOAT_TYPE eud_kdtree_v2(FLOAT_TYPE *restrict p1, FLOAT_TYPE *restrict p2) {
-  register FLOAT_TYPE d = 0;
-  for (unsigned int i = 0; i < data_dims; ++i) {
-    register FLOAT_TYPE dd = (p1[i] - p2[i]);
-    d += dd * dd;
-  }
-  return d;
-  // return sqrt(d);
-}
-
-#ifdef USE_FLOAT32
-typedef float v4f __attribute__((vector_size(16)));
-#else
-typedef double v4f __attribute__((vector_size(32)));
-#endif
-
-FLOAT_TYPE eud_kdtree_v2_2(FLOAT_TYPE *restrict u, FLOAT_TYPE *restrict v) {
-  register float_t s;
-  uint32_t i = 0;
-  // manually unrolled loop, might be vectorized
-  register v4f acc = {0., 0., 0., 0.};
-  for (; i + 4 <= data_dims; i += 4) {
-    register v4f _u = {u[i], u[i + 1], u[i + 2], u[i + 3]};
-    register v4f _v = {v[i], v[i + 1], v[i + 2], v[i + 3]};
-    register v4f _diff = _u - _v;
-    acc = _diff * _diff;
-  }
-
-  s = acc[0] + acc[1] + acc[2] + acc[3];
-  if (i < data_dims) {
-    for (; i < data_dims; ++i) {
-      float_t d = u[i] - v[i];
-      s += d * d;
-    }
-  }
-  return s;
-}
-
-FLOAT_TYPE *swapMem_kdv2;
-
-void swap_kdnode_v2(kdnode_v2 *x, kdnode_v2 *y) {
-  kdnode_v2 tmp;
-  tmp = *x;
-  *x = *y;
-  *y = tmp;
-
-#ifdef SWMEM
-  memcpy(swapMem_kdv2, x->data, sizeof(FLOAT_TYPE) * data_dims);
-  memcpy(x->data, y->data, sizeof(FLOAT_TYPE) * data_dims);
-  memcpy(y->data, swapMem_kdv2, sizeof(FLOAT_TYPE) * data_dims);
-  FLOAT_TYPE *tmpPtr = x->data;
-  x->data = y->data;
-  y->data = tmpPtr;
-#endif
-}
-
-/**
- *
- * KDtree implementation
- *
- *
- */
-
-void initialize_kdnodes_v2(kdnode_v2 *node_array, FLOAT_TYPE *d, idx_t n) {
-  for (idx_t i = 0; i < n; ++i) {
-    node_array[i].data = d + (i * data_dims);
-    node_array[i].array_idx = i;
-    node_array[i].lch = NULL;
-    node_array[i].rch = NULL;
-    node_array[i].parent = NULL;
-    node_array[i].level = -1;
-    node_array[i].is_leaf = 0;
-    node_array[i].split_var = -1;
-    node_array[i].node_list.data = NULL;
-    node_array[i].node_list.count = 0;
-  }
-}
-/*
-void initializePTRS(kdnode_v2** node_ptr_array, kdnode_v2* node_array, idx_t n )
-{
-    for(idx_t i = 0; i < n; ++i)
-    {
-        node_ptr_array[i] = node_array + i;
-    }
-}
-*/
-
-int cmpKDnodesV2(kdnode_v2 *a, kdnode_v2 *b, int var) {
-
-  FLOAT_TYPE res = a->data[var] - b->data[var];
-  return (res > 0);
-}
-
-void printKDnodeV2(kdnode_v2 *node) {
-  printf("Node %p:\n", node);
-  printf("\t array_idx: %lu\n", (uint64_t)(node->array_idx));
-  printf("\t data: ");
-  for (unsigned int i = 0; i < data_dims; ++i)
-    printf(" %f ", node->data[i]);
-  printf("\n");
-  printf("\t parent: %p\n", node->parent);
-  printf("\t lch: %p\n", node->lch);
-  printf("\t rch: %p\n", node->rch);
-  printf("\t level: %d\n", node->level);
-  printf("\t split_var: %d\n", node->split_var);
-  printf("\n");
-}
-
-// Standard Lomuto partition function
-
-int partition_kdnode_v2(kdnode_v2 *arr, int low, int high, int split_var) {
-  kdnode_v2 pivot = arr[high];
-
-  int i = (low - 1);
-  for (int j = low; j <= high - 1; j++) {
-    if (!cmpKDnodesV2(arr + j, &pivot, split_var)) {
-      i++;
-      swap_kdnode_v2(arr + i, arr + j);
-    }
-  }
-  swap_kdnode_v2(arr + i + 1, arr + high);
-  return (i + 1);
-}
-
-// Implementation of QuickSelect
-int median_of_nodes_kdnode_v2(kdnode_v2 *a, int left, int right,
-                              int split_var) {
-  // printf("----------\n");
-  int k = left + ((right - left + 1) / 2);
-
-  if (left == right)
-    return left;
-  if (left == (right - 1)) {
-    if (cmpKDnodesV2(a + left, a + right, split_var)) {
-      swap_kdnode_v2(a + left, a + right);
-    }
-    return right;
-  }
-  while (left <= right) {
-
-    // Partition a[left..right] around a pivot
-    // and find the position of the pivot
-    int pivotIndex = partition_kdnode_v2(a, left, right, split_var);
-    // printf("%d %d %d %d\n",left, right, k, pivotIndex);
-
-    // If pivot itself is the k-th smallest element
-    if (pivotIndex == k)
-      return pivotIndex;
-
-    // If there are more than k-1 elements on
-    // left of pivot, then k-th smallest must be
-    // on left side.
-
-    else if (pivotIndex > k)
-      right = pivotIndex - 1;
-
-    // Else k-th smallest is on right side.
-    else
-      left = pivotIndex + 1;
-  }
-  return -1;
-}
-
-kdnode_v2 *make_tree_kdnode_v2(kdnode_v2 *t, int start, int end,
-                               kdnode_v2 *parent, int level) {
-  kdnode_v2 *n = NULL;
-  int split_var = level % data_dims;
-  FLOAT_TYPE max_diff = -999999.;
-  for (unsigned int v = 0; v < data_dims; ++v) {
-    FLOAT_TYPE max_v = -9999999.;
-    FLOAT_TYPE min_v = 9999999.;
-    for (int i = start; i <= end; ++i) {
-      max_v = t[i].data[v] > max_v ? t[i].data[v] : max_v;
-      min_v = t[i].data[v] < min_v ? t[i].data[v] : min_v;
-    }
-    if ((max_v - min_v) > max_diff) {
-      max_diff = max_v - min_v;
-      split_var = v;
-    }
-  }
-
-  /*
-  #ifdef SWMEM
-      if(parent == NULL)
-      {
-          swapMem_kdv2 = (FLOAT_TYPE*)malloc(sizeof(FLOAT_TYPE)*data_dims);
-      }
-  #endif
-  */
-
-  if (end - start < DEFAULT_LEAF_SIZE) {
-    n = t + start;
-    n->is_leaf = 1;
-    n->parent = parent;
-    n->lch = NULL;
-    n->rch = NULL;
-    size_t j = 0;
-    n->node_list.count = (size_t)(end - start + 1);
-    n->node_list.data =
-        (kdnode_v2 **)malloc(n->node_list.count * sizeof(kdnode_v2 *));
-    for (int i = start; i <= end; ++i) {
-      t[i].parent = n;
-      t[i].is_leaf = 1;
-      t[i].lch = NULL;
-      t[i].rch = NULL;
-      n->node_list.data[j] = t + i;
-      ++j;
-    }
-    return n;
-  }
-
-  int median_idx = -1;
-
-  // if ((end - start) < 0) return 0;
-  // if (end  == start) {
-  //     n = t + start;
-  //     n -> split_var = split_var;
-  //     n->parent = parent;
-  //     n->level = level;
-  //     n -> lch = NULL;
-  //     n -> rch = NULL;
-  //     return n;
-  // }
-
-  median_idx = median_of_nodes_kdnode_v2(t, start, end, split_var);
-  // printf("%d median idx\n", median_idx);
-  if (median_idx > -1) {
-    swap_kdnode_v2(t + start, t + median_idx);
-    // n = t + median_idx;
-    n = t + start;
-    // n->lch  = make_tree_kdnode_v2(t, start, median_idx - 1, n, level + 1);
-    n->lch = make_tree_kdnode_v2(t, start + 1, median_idx, n, level + 1);
-    // n->rch = make_tree_kdnode_v2(t, median_idx + 1, end, n, level + 1);
-    n->rch = make_tree_kdnode_v2(t, median_idx + 1, end, n, level + 1);
-    n->split_var = split_var;
-    n->parent = parent;
-    n->level = level;
-  }
-
-  /*
-  #ifdef SWMEM
-      if(parent == NULL)
-      {
-          swapMem_kdv2 = malloc(sizeof(FLOAT_TYPE)*data_dims);
-      }
-  #endif
-  */
-
-  return n;
-}
-
-static inline FLOAT_TYPE hyper_plane_dist(FLOAT_TYPE *p1, FLOAT_TYPE *p2,
-                                          int var) {
-  return p1[var] - p2[var];
-}
-
-static inline int hyper_plane_side(FLOAT_TYPE *p1, FLOAT_TYPE *p2, int var) {
-  return p1[var] > p2[var];
-}
-
-void knn_sub_tree_search_kdtree_v2(FLOAT_TYPE *point, kdnode_v2 *root,
-                                   heap *H) {
-  if (root->is_leaf) {
-    for (size_t i = 0; i < root->node_list.count; ++i) {
-      kdnode_v2 *n = root->node_list.data[i];
-      //__builtin_prefetch(root -> node_list.data + i + 1, 0, 3);
-      FLOAT_TYPE distance = eud_kdtree_v2(point, n->data);
-      insert_max_heap(H, distance, n->array_idx);
-    }
-    return;
-  }
-
-  FLOAT_TYPE current_distance = eud_kdtree_v2(point, root->data);
-  FLOAT_TYPE hp_distance = hyper_plane_dist(point, root->data, root->split_var);
-  insert_max_heap(H, current_distance, root->array_idx);
-  __builtin_prefetch(root->lch, 0, 3);
-  __builtin_prefetch(root->rch, 0, 3);
-
-  int side = hp_distance > 0.f;
-
-  switch (side) {
-  case HP_LEFT_SIDE:
-    if (root->lch) {
-      knn_sub_tree_search_kdtree_v2(point, root->lch, H);
-    }
-    break;
-
-  case HP_RIGHT_SIDE:
-    if (root->rch) {
-      knn_sub_tree_search_kdtree_v2(point, root->rch, H);
-    }
-    break;
-
-  default:
-    break;
-  }
-  FLOAT_TYPE max_d = H->data[0].value;
-  int c = max_d > (hp_distance * hp_distance);
-
-  if (c || (H->count) < (H->N))
-  // if(c)
-  {
-
-    switch (side) {
-    case HP_LEFT_SIDE:
-      if (root->rch) {
-        knn_sub_tree_search_kdtree_v2(point, root->rch, H);
-      }
-      break;
-
-    case HP_RIGHT_SIDE:
-      if (root->lch) {
-        knn_sub_tree_search_kdtree_v2(point, root->lch, H);
-      }
-      break;
-
-    default:
-      break;
-    }
-  }
-  return;
-}
-
-void kdtree_v2_init(kdtree_v2 *tree, FLOAT_TYPE *data, size_t n_nodes,
-                    unsigned int dims) {
-  data_dims = dims;
-  tree->n_nodes = n_nodes;
-  tree->_nodes = (kdnode_v2 *)malloc(n_nodes * sizeof(kdnode_v2));
-  initialize_kdnodes_v2(tree->_nodes, data, n_nodes);
-  tree->root = NULL;
-}
-
-void kdtree_v2_free(kdtree_v2 *tree) {
-  for (uint64_t i = 0; i < tree->n_nodes; ++i)
-    if (tree->_nodes[i].node_list.data)
-      free(tree->_nodes[i].node_list.data);
-
-  free(tree->_nodes);
-}
-
-heap knn_kdtree_v2(FLOAT_TYPE *point, kdnode_v2 *kdtree_root, int maxk) {
-  heap H;
-  allocate_heap(&H, maxk);
-  init_heap(&H);
-  knn_sub_tree_search_kdtree_v2(point, kdtree_root, &H);
-  heap_sort(&H);
-  return H;
-}
-
-heap knn_kdtree_v2_no_heapsort(FLOAT_TYPE *point, kdnode_v2 *kdtree_root,
-                               int maxk) {
-  heap H;
-  allocate_heap(&H, maxk);
-  init_heap(&H);
-  knn_sub_tree_search_kdtree_v2(point, kdtree_root, &H);
-  return H;
-}
-
-kdnode_v2 *build_tree_kdtree_v2(kdnode_v2 *kd_ptrs, size_t n,
-                                size_t dimensions) {
-  /*************************************************
-   * Wrapper for make_tree function.               *
-   * Simplifies interfaces and takes time measures *
-   *************************************************/
-  data_dims = dimensions;
-  kdnode_v2 *root = make_tree_kdnode_v2(kd_ptrs, 0, n - 1, NULL, 0);
-  return root;
-}
diff --git a/src/tree/kdtreeV2.h b/src/tree/kdtreeV2.h
deleted file mode 100644
index 78d7e3a..0000000
--- a/src/tree/kdtreeV2.h
+++ /dev/null
@@ -1,65 +0,0 @@
-#pragma once
-#include "heap.h"
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include <limits.h>
-#include <stdint.h>
-#define T double
-#define DATA_DIMS 0 
-
-#ifdef USE_FLOAT32
-    #define FLOAT_TYPE float
-#else
-    #define FLOAT_TYPE double 
-#endif
-
-#ifdef USE_INT32
-    #define MY_SIZE_MAX UINT32_MAX
-    #define idx_t uint32_t
-#else
-    #define MY_SIZE_MAX UINT64_MAX
-    #define idx_t uint64_t
-#endif
-
-struct kdnode_v2_list
-{
-    size_t count;
-    struct kdnode_v2** data;
-};
-
-
-struct kdnode_v2
-{   
-   FLOAT_TYPE * data;
-   int level;
-   int split_var;
-   int is_leaf;
-   idx_t array_idx;
-   struct kdnode_v2* parent;
-   struct kdnode_v2* lch;
-   struct kdnode_v2* rch;
-   struct kdnode_v2_list node_list;
-};
-
-struct kdtree_v2
-{
-    size_t n_nodes;
-    struct kdnode_v2* _nodes;
-    struct kdnode_v2* root;
-};
-
-
-typedef struct kdnode_v2 kdnode_v2;
-typedef struct kdtree_v2 kdtree_v2;
-
-void initialize_kdnodes_v2(kdnode_v2* node_array, FLOAT_TYPE* d, idx_t n );
-
-heap knn_kdtree_v2(FLOAT_TYPE* point, kdnode_v2* kdtree_root, int maxk);
-heap knn_kdtree_v2_no_heapsort(FLOAT_TYPE* point, kdnode_v2* kdtree_root, int maxk);
-
-kdnode_v2 * build_tree_kdtree_v2(kdnode_v2* kd_ptrs, size_t n, size_t dimensions);
-void knn_sub_tree_search_kdtree_v2(FLOAT_TYPE* point, kdnode_v2* root, heap * H);
-void kdtree_v2_init(kdtree_v2* tree, FLOAT_TYPE* data, size_t n_nodes, unsigned int dims);
-void kdtree_v2_free(kdtree_v2* tree);
diff --git a/src/tree/tree.c b/src/tree/tree.c
index d3e0ada..4dc46cf 100644
--- a/src/tree/tree.c
+++ b/src/tree/tree.c
@@ -9,7 +9,7 @@
  */
 #include "tree.h"
 #include "heap.h"
-#include "kdtreeV2.h"
+#include "kdtree.h"
 #include "mpi.h"
 #include <math.h>
 #include <stdint.h>
@@ -18,8 +18,9 @@
 #include <string.h>
 #include <omp.h>
 #include <sys/sysinfo.h>
+#include <float.h>
 
-
+#define NBINS 20
 /* 
  * Maximum bytes to send with a single mpi send/recv, used 
  * while communicating results of ngbh search
@@ -29,8 +30,8 @@
 //#define MAX_MSG_SIZE 4294967296
 
 /* Used slices of 10 mb ? Really good? Maybe at the cause of TID thing */
-// #define MAX_MSG_SIZE (10000 * k * sizeof(heap_node))
-#define MAX_MSG_SIZE (100000 * k * sizeof(heap_node))
+// #define MAX_MSG_SIZE (10000 * k * sizeof(heap_node_t))
+#define MAX_MSG_SIZE (100000 * k * sizeof(heap_node_t))
 
 
 #define TOP_TREE_RCH 1
@@ -51,13 +52,18 @@ int cmp_float_t(const void* a, const void* b)
 
 
 /* quickselect for an element along a dimension */
-void swap_data_element(float_t *a, float_t *b, size_t vec_len) {
+void swap_data_element(point_t *a, point_t *b, size_t vec_len) {
+    idx_t tmp_dp;
+    tmp_dp = a->array_idx;
+    a->array_idx = b->array_idx;
+    a->array_idx = tmp_dp;
+
     float_t tmp;
     for (size_t i = 0; i < vec_len; ++i) 
     {
-        tmp = a[i];
-        a[i] = b[i];
-        b[i] = tmp;
+        tmp = a->data[i];
+        a->data[i] = b->data[i];
+        b->data[i] = tmp;
     }
 }
 
@@ -65,19 +71,19 @@ int compare_data_element(float_t *a, float_t *b, int compare_dim) {
     return -((a[compare_dim] - b[compare_dim]) > 0.) + ((a[compare_dim] - b[compare_dim]) < 0.);
 }
 
-int partition_data_element(float_t *array, int vec_len, int compare_dim,
+int partition_data_element(point_t *array, int vec_len, int compare_dim,
                            int left, int right, int pivot_index) 
 {
     int store_index = left;
     int i;
 
     /* Move pivot to end */
-    swap_data_element(array + pivot_index * vec_len, array + right * vec_len, vec_len);
+    swap_data_element(array + pivot_index, array + right, vec_len);
     for (i = left; i < right; ++i) 
     {
         // if(compare_data_element(array + i*vec_len, array + pivot_index*vec_len,
         // compare_dim ) >= 0){
-        if (array[i * vec_len + compare_dim] < array[right * vec_len + compare_dim]) 
+        if (array[i].data[compare_dim] < array[right].data[compare_dim]) 
         {
             swap_data_element(array + store_index * vec_len, array + i * vec_len, vec_len);
             store_index += 1;
@@ -89,7 +95,7 @@ int partition_data_element(float_t *array, int vec_len, int compare_dim,
     return store_index;
 }
 
-int qselect_data_element(float_t *array, int vec_len, int compare_dim, int left, int right, int n) 
+int qselect_data_element(point_t *array, int vec_len, int compare_dim, int left, int right, int n) 
 {
     int pivot_index;
     if (left == right) 
@@ -113,25 +119,28 @@ int qselect_data_element(float_t *array, int vec_len, int compare_dim, int left,
     }
 }
 
-int quickselect_data_element(float_t *array, int vec_len, int array_size, int compare_dim, int k) 
+int quickselect_data_element(point_t *array, int vec_len, int array_size, int compare_dim, int k) 
 {
     return qselect_data_element(array, vec_len, compare_dim, 0, array_size - 1, k - 1);
 }
 
 int CMP_DIM;
-int compare_data_element_sort(const void *a, const void *b) {
+int compare_data_element_sort(const void *a, const void *b) 
+{
     float_t aa = *((float_t *)a + CMP_DIM);
     float_t bb = *((float_t *)b + CMP_DIM);
     return ((aa - bb) > 0.) - ((aa - bb) < 0.);
 }
 
-void compute_bounding_box(global_context_t *ctx) {
+void compute_bounding_box(global_context_t *ctx)
+{
     ctx->lb_box = (float_t *)MY_MALLOC(ctx->dims * sizeof(float_t));
     ctx->ub_box = (float_t *)MY_MALLOC(ctx->dims * sizeof(float_t));
 
-    for (size_t d = 0; d < ctx->dims; ++d) {
-    ctx->lb_box[d] = 99999999.;
-    ctx->ub_box[d] = -99999999.;
+    for (size_t d = 0; d < ctx->dims; ++d) 
+    {
+        ctx->lb_box[d] =  FLT_MAX;
+        ctx->ub_box[d] = -FLT_MAX;
     }
 
     #define local_data ctx->local_data
@@ -186,73 +195,70 @@ partition_t dequeue_partition(partition_queue_t *queue)
   return queue->data[--(queue->count)];
 }
 
-void compute_medians_and_check(global_context_t *ctx, float_t *data) {
-    float_t prop = 0.5;
-    int k = (int)(ctx->local_n_points * prop);
-    int d = 1;
-
-    /*quick select on a particular dimension */
-    CMP_DIM = d;
-    int kk = (k - 1) * ctx->dims;
-
-    int count = 0;
-    // idx = idx - 1;
-    //
-    int aaa = quickselect_data_element(ctx->local_data, (int)(ctx->dims), (int)(ctx->local_n_points), d, k);
-    /*
-    * sanity check
-    * check if the median found in each node is
-    * a median
-    */
-
-    float_t *medians_rcv = (float_t *)MY_MALLOC(ctx->dims * ctx->world_size * sizeof(float_t));
-
-    /*
-    * MPI_Allgather(     const void *sendbuf,
-    *                     int sendcount,
-    *                     MPI_Datatype sendtype,
-    *                     void *recvbuf,
-    *                     int recvcount,
-    *                     MPI_Datatype recvtype,
-    *                     MPI_Comm comm)
-    */
-
-    /* Exchange medians */
-
-    MPI_Allgather(ctx->local_data + kk, ctx->dims, MPI_MY_FLOAT, medians_rcv, ctx->dims, MPI_MY_FLOAT, ctx->mpi_communicator);
-
-    /* sort medians on each node */
-
-    CMP_DIM = d;
-    qsort(medians_rcv, ctx->world_size, ctx->dims * sizeof(float_t), compare_data_element_sort);
-
-    /*
-    * Evaluate goodness of the median on master which has whole dataset
-    */
-
-    if (ctx->mpi_rank == 0) {
-    int count = 0;
-    int idx = (int)(prop * (ctx->world_size));
-    // idx = idx - 1;
-    for (int i = 0; i < ctx->n_points; ++i) 
-    {
-        count += data[i * ctx->dims + d] <= medians_rcv[idx * ctx->dims + d];
-    }
-    mpi_printf(ctx, "Choosing %lf percentile on dimension %d: empirical prop %lf\n", prop, d, (float_t)count / (float_t)(ctx->n_points));
-    }
-    free(medians_rcv);
-}
+// void compute_medians_and_check(global_context_t *ctx, float_t *data) {
+//     float_t prop = 0.5;
+//     int k = (int)(ctx->local_n_points * prop);
+//     int d = 1;
+//
+//     /*quick select on a particular dimension */
+//     CMP_DIM = d;
+//     int kk = (k - 1) * ctx->dims;
+//
+//     int count = 0;
+//     // idx = idx - 1;
+//     //
+//     int aaa = quickselect_data_element(ctx->data_w_idx, (int)(ctx->dims), (int)(ctx->local_n_points), d, k);
+//     /*
+//     * sanity check
+//     * check if the median found in each node is
+//     * a median
+//     */
+//
+//     float_t *medians_rcv = (float_t *)MY_MALLOC(ctx->dims * ctx->world_size * sizeof(float_t));
+//
+//     /*
+//     * MPI_Allgather(     const void *sendbuf,
+//     *                     int sendcount,
+//     *                     MPI_Datatype sendtype,
+//     *                     void *recvbuf,
+//     *                     int recvcount,
+//     *                     MPI_Datatype recvtype,
+//     *                     MPI_Comm comm)
+//     */
+//
+//     /* Exchange medians */
+//
+//     MPI_Allgather(ctx->local_data + kk, ctx->dims, MPI_MY_FLOAT, medians_rcv, ctx->dims, MPI_MY_FLOAT, ctx->mpi_communicator);
+//
+//     /* sort medians on each node */
+//
+//     CMP_DIM = d;
+//     qsort(medians_rcv, ctx->world_size, ctx->dims * sizeof(float_t), compare_data_element_sort);
+//
+//     /*
+//     * Evaluate goodness of the median on master which has whole dataset
+//     */
+//
+//     if (ctx->mpi_rank == 0) {
+//     int count = 0;
+//     int idx = (int)(prop * (ctx->world_size));
+//     // idx = idx - 1;
+//     for (int i = 0; i < ctx->n_points; ++i) 
+//     {
+//         count += data[i * ctx->dims + d] <= medians_rcv[idx * ctx->dims + d];
+//     }
+//     mpi_printf(ctx, "Choosing %lf percentile on dimension %d: empirical prop %lf\n", prop, d, (float_t)count / (float_t)(ctx->n_points));
+//     }
+//     free(medians_rcv);
+// }
 
 float_t check_pc_pointset_parallel(global_context_t *ctx, pointset_t *ps, guess_t g, int d, float_t prop) {
-    /*
-     * ONLY FOR TEST PURPOSES
-     * gather on master all data
-     * perform the count on master
-     */
+
     size_t pvt_count = 0;
+    #pragma omp parallel for reduction(+:pvt_count)
     for (size_t i = 0; i < ps->n_points; ++i) 
     {
-        pvt_count += ps->data[i * ps->dims + d] <= g.x_guess;
+        pvt_count += ps->datapoints[i].data[d] <= g.x_guess;
     }
 
     size_t pvt_n_and_tot[2] = {pvt_count, ps->n_points};
@@ -268,55 +274,61 @@ float_t check_pc_pointset_parallel(global_context_t *ctx, pointset_t *ps, guess_
     return ep;
 }
 
-void compute_bounding_box_pointset(global_context_t *ctx, pointset_t *ps) {
+void compute_bounding_box_data(global_context_t *ctx, float_t *data, float_t* lb_box, float_t* ub_box, idx_t n_points) 
+{
+    #define lb lb_box
+    #define ub ub_box
 
-    for (size_t d = 0; d < ps->dims; ++d)
+    for (size_t d = 0; d < ctx->dims; ++d)
     {
-        ps->lb_box[d] = 99999999.;
-        ps->ub_box[d] = -99999999.;
+        lb[d] =  FLT_MAX;
+        ub[d] = -FLT_MAX;
     }
 
-    #define local_data ps->data
-    #define lb ps->lb_box
-    #define ub ps->ub_box
 
     /* compute minimum and maximum for each dimensions, store them in local bb */
     /* each processor on its own */
 
-    /* TODO: reduction using omp directive */
-
-    for (size_t i = 0; i < ps->n_points; ++i) 
+    // this moves memory maybe there is a better way
+    #pragma omp parallel 
     {
-        for (size_t d = 0; d < ps->dims; ++d) 
+        float_t* pvt_lb = (float_t*)MY_MALLOC(ctx -> dims * sizeof(float_t));
+        float_t* pvt_ub = (float_t*)MY_MALLOC(ctx -> dims * sizeof(float_t));
+        for (size_t d = 0; d < ctx->dims; ++d)
         {
-            lb[d] = MIN(local_data[i * ps->dims + d], lb[d]);
-            ub[d] = MAX(local_data[i * ps->dims + d], ub[d]);
+            pvt_lb[d] = FLT_MAX;
+            pvt_ub[d] = -FLT_MAX;
         }
-      }
 
-    /* Reduce to obtain bb */
-    /*
-    MPI_Allreduce(  const void *sendbuf,
-                                  void *recvbuf,
-                                  int count,
-                                  MPI_Datatype datatype,
-                                  MPI_Op op,
-                                  MPI_Comm comm)
-    */
+        #pragma omp for
+        for (size_t i = 0; i < n_points; ++i) 
+        {
+            for (size_t d = 0; d < ctx->dims; ++d) 
+            {
+                pvt_lb[d] = MIN(data[i*ctx->dims + d], pvt_lb[d]);
+                pvt_ub[d] = MAX(data[i*ctx->dims + d], pvt_ub[d]);
+            }
+        }
 
-    /*get the bounding box */
+        #pragma omp critical (bounding_box_reduction)
+        {
+            for (size_t d = 0; d < ctx->dims; ++d) 
+            {
+                lb[d] = MIN(pvt_lb[d], lb[d]);
+                ub[d] = MAX(pvt_ub[d], ub[d]);
+            }
+        }
 
-    MPI_Allreduce(MPI_IN_PLACE, lb, ps->dims, MPI_MY_FLOAT, MPI_MIN, ctx->mpi_communicator);
-    MPI_Allreduce(MPI_IN_PLACE, ub, ps->dims, MPI_MY_FLOAT, MPI_MAX, ctx->mpi_communicator);
 
-    /*
-    DB_PRINT("[RANK %d]:", ctx -> mpi_rank);
-    for(size_t d = 0; d < ctx -> dims; ++d)
-    {
-          DB_PRINT("%lf ", ctx -> ub_box[d]);
+        free(pvt_lb);
+        free(pvt_ub);
+
     }
-    DB_PRINT("\n");
-    */
+
+    /*get the bounding box */
+
+    MPI_Allreduce(MPI_IN_PLACE, lb, ctx->dims, MPI_MY_FLOAT, MPI_MIN, ctx->mpi_communicator);
+    MPI_Allreduce(MPI_IN_PLACE, ub, ctx->dims, MPI_MY_FLOAT, MPI_MAX, ctx->mpi_communicator);
 
     /*
     MPI_DB_PRINT("[PS BOUNDING BOX]: ");
@@ -324,7 +336,69 @@ void compute_bounding_box_pointset(global_context_t *ctx, pointset_t *ps) {
     lb[d], ub[d]); MPI_DB_PRINT("\n");
     */
 
+    #undef lb
+    #undef ub
+}
+
+void compute_bounding_box_pointset(global_context_t *ctx, pointset_t *ps) {
+    #define local_data ps->datapoints
+    #define lb ps->lb_box
+    #define ub ps->ub_box
+
+    for (size_t d = 0; d < ps->dims; ++d)
+    {
+        ps->lb_box[d] =  FLT_MAX;
+        ps->ub_box[d] = -FLT_MAX;
+    }
+
+
+    /* compute minimum and maximum for each dimensions, store them in local bb */
+    /* each processor on its own */
+
+    // this moves memory maybe there is a better way
+    #pragma omp parallel 
+    {
+        float_t* pvt_lb = (float_t*)MY_MALLOC(ctx -> dims * sizeof(float_t));
+        float_t* pvt_ub = (float_t*)MY_MALLOC(ctx -> dims * sizeof(float_t));
+        for (size_t d = 0; d < ps->dims; ++d)
+        {
+            pvt_lb[d] = FLT_MAX;
+            pvt_ub[d] = -FLT_MAX;
+        }
+
+        #pragma omp for
+        for (size_t i = 0; i < ps->n_points; ++i) 
+        {
+            for (size_t d = 0; d < ps->dims; ++d) 
+            {
+                pvt_lb[d] = MIN(local_data[i].data[d], pvt_lb[d]);
+                pvt_ub[d] = MAX(local_data[i].data[d], pvt_ub[d]);
+            }
+        }
+
+        #pragma omp critical (bounding_box_reduction)
+        {
+            for (size_t d = 0; d < ps->dims; ++d) 
+            {
+                lb[d] = MIN(pvt_lb[d], lb[d]);
+                ub[d] = MAX(pvt_ub[d], ub[d]);
+            }
+        }
+
+
+        free(pvt_lb);
+        free(pvt_ub);
+
+    }
+
+    /*get the bounding box */
+
+    MPI_Allreduce(MPI_IN_PLACE, lb, ps->dims, MPI_MY_FLOAT, MPI_MIN, ctx->mpi_communicator);
+    MPI_Allreduce(MPI_IN_PLACE, ub, ps->dims, MPI_MY_FLOAT, MPI_MAX, ctx->mpi_communicator);
 
+    // MPI_DB_PRINT("[PS BOUNDING BOX]: ");
+    // for(size_t d = 0; d < ps -> dims; ++d) MPI_DB_PRINT("d%d:[%lf, %lf] ",(int)d,
+    // lb[d], ub[d]); MPI_DB_PRINT("\n");
 
     #undef local_data
     #undef lb
@@ -333,8 +407,8 @@ void compute_bounding_box_pointset(global_context_t *ctx, pointset_t *ps) {
 
 
 guess_t retrieve_guess_pure(global_context_t *ctx, pointset_t *ps,
-                        uint64_t *global_bin_counts, 
-                        int k_global, int d, float_t pc)
+                            uint64_t *global_bin_counts, 
+                            int k_global, int d, float_t pc)
 {
 
     /*
@@ -348,8 +422,8 @@ guess_t retrieve_guess_pure(global_context_t *ctx, pointset_t *ps,
     MPI_DB_PRINT("[ ");
     for(int i = 0; i < k_global; ++i)
     {
-          MPI_DB_PRINT("%lu %lf --- ", global_bin_counts[i],
-    (float_t)global_bin_counts[i]/(float_t)total_count);
+          MPI_DB_PRINT( "%lu %lf --- ", global_bin_counts[i],
+                        (float_t)global_bin_counts[i]/(float_t)total_count);
     }
     MPI_DB_PRINT("\n");
     */
@@ -376,11 +450,7 @@ guess_t retrieve_guess_pure(global_context_t *ctx, pointset_t *ps,
     float_t x_guess = (pc - y0) / (y1 - y0) * (x1 - x0) + x0;
 
         
-    /*
-    MPI_DB_PRINT("[MASTER] best guess @ %lf is %lf on bin %d on dimension %d --- x0 %lf x1 %lf y0 %lf y1 %lf\n",pc, x_guess,idx, d, x0, x1, y0, y1);
-    */
-    
-
+    // MPI_DB_PRINT("[MASTER] best guess @ %lf is %lf on bin %d on dimension %d --- x0 %lf x1 %lf y0 %lf y1 %lf\n",pc, x_guess,idx, d, x0, x1, y0, y1);
 
     guess_t g = {.bin_idx = idx, .x_guess = x_guess};
     return g;
@@ -416,32 +486,32 @@ void global_binning_check(global_context_t *ctx, float_t *data, int d, int k)
     }
 }
 
+// TODO: k_global better to define it as a #define 
 void compute_pure_global_binning(global_context_t *ctx, pointset_t *ps,
                                  uint64_t *global_bin_counts, int k_global,
                                  int d) 
 {
     /* compute binning of data along dimension d */
-    uint64_t *local_bin_count = (uint64_t *)MY_MALLOC(k_global * sizeof(uint64_t));
+    uint64_t local_bin_count[NBINS];
     for (size_t k = 0; k < k_global; ++k) 
     {
         local_bin_count[k] = 0;
         global_bin_counts[k] = 0;
     }
 
-    /*
-    MPI_DB_PRINT("[PS BOUNDING BOX %d]: ", ctx -> mpi_rank);
-    for(size_t d = 0; d < ps -> dims; ++d) MPI_DB_PRINT("d%d:[%lf, %lf] ",(int)d, ps -> lb_box[d], ps -> ub_box[d]); MPI_DB_PRINT("\n");
-    MPI_DB_PRINT("\n");
-    */
+    // MPI_DB_PRINT("[PS BOUNDING BOX %d]: ", ctx -> mpi_rank);
+    // for(size_t d = 0; d < ps -> dims; ++d) MPI_DB_PRINT("d%d:[%lf, %lf] ",(int)d, ps -> lb_box[d], ps -> ub_box[d]); MPI_DB_PRINT("\n");
+    // MPI_DB_PRINT("\n");
 
     float_t bin_w = (ps-> ub_box[d] - ps->lb_box[d]) / (float_t)k_global;
 
     /* THIS IS problematic when ps -> ub_box == ps -> lb_box */
 
+    // TODO: this should be moved to a thread_private reduction
     #pragma omp parallel for
     for (size_t i = 0; i < ps->n_points; ++i) 
     {
-        float_t p = ps->data[i * ps->dims + d];
+        float_t p = ps->datapoints[i].data[d];
         /* to prevent the border point in the box to have bin_idx == k_global causing invalid memory access */
         int bin_idx = MIN((int)((p - ps->lb_box[d]) / bin_w), k_global - 1);
         
@@ -450,11 +520,16 @@ void compute_pure_global_binning(global_context_t *ctx, pointset_t *ps,
     }
 
     MPI_Allreduce(local_bin_count, global_bin_counts, k_global, MPI_UNSIGNED_LONG, MPI_SUM, ctx->mpi_communicator);
-    free(local_bin_count);
+    // MPI_DB_PRINT("BIN COUNT: [ ");
+    // for(int i = 0; i < k_global; ++i) MPI_DB_PRINT(" %lu ", local_bin_count[i]);
+    // MPI_DB_PRINT(" ]\n");
+         
+    //free(local_bin_count);
 }
 
-size_t partition_data_around_value(float_t *array, size_t vec_len, size_t compare_dim,
-                                size_t left, size_t right, float_t pivot_value) 
+
+size_t partition_data_around_value(point_t *array, size_t vec_len, size_t compare_dim,
+                                   size_t left, size_t right, float_t pivot_value) 
 {
     /*
     * returns the number of elements less than the pivot
@@ -462,12 +537,14 @@ size_t partition_data_around_value(float_t *array, size_t vec_len, size_t compar
     size_t store_index = left;
     size_t i;
     /* Move pivot to end */
+
+    // TODO: Does it exist a way to get a parallel partition?
     for (i = left; i < right; ++i) 
     {
         // if(compare_data_element(array + i*vec_len, array + pivot_index*vec_len, compare_dim ) >= 0){
-        if (array[i * vec_len + compare_dim] < pivot_value) 
+        if (array[i].data[compare_dim] < pivot_value) 
         {
-            swap_data_element(array + store_index * vec_len, array + i * vec_len, vec_len);
+            swap_data_element(array + store_index, array + i, vec_len);
             store_index += 1;
         }
     }
@@ -478,6 +555,7 @@ size_t partition_data_around_value(float_t *array, size_t vec_len, size_t compar
     return store_index; 
 }
 
+
 guess_t refine_pure_binning(global_context_t *ctx, pointset_t *ps,
                             guess_t best_guess, uint64_t *global_bin_count,
                             int k_global, int d, float_t f, float_t tolerance)
@@ -531,41 +609,34 @@ guess_t refine_pure_binning(global_context_t *ctx, pointset_t *ps,
          * */
 
         
-        /*
-        MPI_DB_PRINT("---- ---- ----\n");
-        MPI_DB_PRINT("[MASTER] Refining on bin %d lb %lf ub %lf starting c %lf %lf\n", 
-                best_guess.bin_idx, bin_lb, bin_ub, starting_cumulative/total_count,
-                (tmp_global_bins[best_guess.bin_idx] + starting_cumulative)/total_count);
-        */
+        // MPI_DB_PRINT("---- ---- ----\n");
+        // MPI_DB_PRINT("[MASTER] Refining on bin %d lb %lf ub %lf starting c %lf %lf\n", 
+        //         best_guess.bin_idx, bin_lb, bin_ub, starting_cumulative/total_count,
+        //         (tmp_global_bins[best_guess.bin_idx] + starting_cumulative)/total_count);
     
 
         for (int i = 0; i < k_global; ++i)  tmp_global_bins[i] = 0;
 
         pointset_t tmp_ps;
 
-        int end_idx = partition_data_around_value(ps->data, (int)ps->dims, d, 0, (int)ps->n_points, bin_ub);
-        int start_idx = partition_data_around_value(ps->data, (int)ps->dims, d, 0,end_idx, bin_lb);
+        int end_idx   = partition_data_around_value(ps->datapoints, (int)ps->dims, d, 0, (int)ps->n_points, bin_ub);
+        int start_idx = partition_data_around_value(ps->datapoints, (int)ps->dims, d, 0, end_idx, bin_lb);
 
         tmp_ps.n_points = end_idx - start_idx;
-        tmp_ps.data = ps->data + start_idx * ps->dims;
+        tmp_ps.datapoints = ps->datapoints + start_idx;
         tmp_ps.dims = ps->dims;
         tmp_ps.lb_box = (float_t*)MY_MALLOC(ctx -> dims * sizeof(float_t));
         tmp_ps.ub_box = (float_t*)MY_MALLOC(ctx -> dims * sizeof(float_t));
 
         compute_bounding_box_pointset(ctx, &tmp_ps);
 
-        /*
-        MPI_DB_PRINT("[MASTER] searching for %lf of the bin considered\n",ff);
-        */
+        // MPI_DB_PRINT("[MASTER] searching for %lf of the bin considered start_idx %d end_idx %d\n",
+        //              ff, start_idx, end_idx);
 
         // DB_PRINT("%lu\n",tmp_ps.n_points );
         MPI_Barrier(ctx->mpi_communicator);
         compute_pure_global_binning(ctx, &tmp_ps, tmp_global_bins, k_global, d);
 
-        /* sum to global bins */
-        // for(int i = 0; i < k_global; ++i) tmp_global_bins[i] +=
-        // starting_cumulative;
-
         best_guess = retrieve_guess_pure(ctx, &tmp_ps, tmp_global_bins, k_global, d, ff);
 
         best_guess.ep = check_pc_pointset_parallel(ctx, ps, best_guess, d, f);
@@ -603,24 +674,47 @@ void free_queue(partition_queue_t *pq) { free(pq->data); }
 void get_pointset_from_partition(pointset_t *ps, partition_t *part) 
 {
     ps->n_points  = part->n_points;
-    ps->data      = part->base_ptr;
+    ps->datapoints= part->base_ptr;
     ps->n_points  = part->n_points;
 }
 
-guess_t compute_median_pure_binning(global_context_t *ctx, pointset_t *ps, float_t fraction, int selected_dim, int n_bins, float_t tolerance)
+guess_t compute_median_pure_binning(global_context_t *ctx, pointset_t *ps, float_t fraction, int selected_dim, idx_t nbins, float_t tolerance)
+{
+    int best_bin_idx;
+    float_t ep;
+
+
+    // uint64_t *global_bin_counts_int = (uint64_t *)MY_MALLOC(n_bins * sizeof(uint64_t));
+
+    uint64_t global_bin_counts_int[NBINS]; 
+
+    compute_pure_global_binning(ctx, ps, global_bin_counts_int, NBINS, selected_dim);
+    guess_t g = retrieve_guess_pure(ctx, ps, global_bin_counts_int, NBINS, selected_dim, fraction);
+    g.ep = check_pc_pointset_parallel(ctx, ps, g, selected_dim, fraction);
+    g = refine_pure_binning(ctx, ps, g, global_bin_counts_int, NBINS, selected_dim, fraction, tolerance);
+
+    //free(global_bin_counts_int);
+    return g;
+}
+
+guess_t parallel_compute_median_pure_binning(global_context_t *ctx, pointset_t *ps, float_t fraction, int selected_dim, idx_t nbins, float_t tolerance)
 {
     int best_bin_idx;
     float_t ep;
 
-    uint64_t *global_bin_counts_int = (uint64_t *)MY_MALLOC(n_bins * sizeof(uint64_t));
+
+    // uint64_t *global_bin_counts_int = (uint64_t *)MY_MALLOC(n_bins * sizeof(uint64_t));
+
+    uint64_t global_bin_counts_int[NBINS]; 
 
     compute_bounding_box_pointset(ctx, ps);
-    compute_pure_global_binning(ctx, ps, global_bin_counts_int, n_bins, selected_dim);
-    guess_t g = retrieve_guess_pure(ctx, ps, global_bin_counts_int, n_bins, selected_dim, fraction);
+    compute_pure_global_binning(ctx, ps, global_bin_counts_int, NBINS, selected_dim);
+    guess_t g = retrieve_guess_pure(ctx, ps, global_bin_counts_int, NBINS, selected_dim, fraction);
     // check_pc_pointset(ctx, ps, best_guess, d, f);
     g.ep = check_pc_pointset_parallel(ctx, ps, g, selected_dim, fraction);
-    g = refine_pure_binning(ctx, ps, g, global_bin_counts_int, n_bins, selected_dim, fraction, tolerance);
-    free(global_bin_counts_int);
+    g = refine_pure_binning(ctx, ps, g, global_bin_counts_int, NBINS, selected_dim, fraction, tolerance);
+
+    //free(global_bin_counts_int);
     return g;
 }
 
@@ -749,11 +843,220 @@ void tree_print_leaves(global_context_t* ctx, top_kdtree_node_t* root)
     if(root -> rch) tree_print_leaves(ctx, root -> rch);
 }
 
-void build_top_kdtree(global_context_t *ctx, pointset_t *og_pointset, top_kdtree_t *tree, int n_bins, float_t tolerance) 
+top_kdtree_node_t* transfer_tree(global_context_t* ctx, pivot_t local_node, kdtree_t* mini_local_tree, top_kdtree_t* top_tree, 
+                                 top_kdtree_node_t* parent, int left_or_right)
+{
+    
+    top_kdtree_node_t* current_node = top_tree_generate_node(ctx, top_tree);
+
+    switch (left_or_right)
+    {
+        case TOP_TREE_LCH:
+            {
+                current_node -> parent        = parent;
+                current_node -> parent -> lch = current_node;
+                /* compute the box */
+
+                /*
+                * left child has lb equal to parent
+                * ub equal to parent except for the dim of splitting 
+                */
+                int parent_split_dim = current_node -> parent -> split_dim;
+                float_t parent_hp    = current_node -> parent -> split_val;
+
+                memcpy(current_node -> lb_node_box, current_node -> parent -> lb_node_box, ctx -> dims * sizeof(float_t));
+                memcpy(current_node -> ub_node_box, current_node -> parent -> ub_node_box, ctx -> dims * sizeof(float_t));
+
+                current_node -> ub_node_box[parent_split_dim] = parent_hp;
+            }
+            break;
+
+        case TOP_TREE_RCH:
+            {
+                current_node -> parent        = parent;
+                current_node -> parent -> rch = current_node;
+
+                int parent_split_dim = current_node -> parent -> split_dim;
+                float_t parent_hp    = current_node -> parent -> split_val;
+
+                /*
+                * right child has ub equal to parent
+                * lb equal to parent except for the dim of splitting 
+                */
+
+                memcpy(current_node -> lb_node_box, current_node -> parent -> lb_node_box, ctx -> dims * sizeof(float_t));
+                memcpy(current_node -> ub_node_box, current_node -> parent -> ub_node_box, ctx -> dims * sizeof(float_t));
+
+                current_node -> lb_node_box[parent_split_dim] = parent_hp;
+            }
+            break;
+        case NO_CHILD:
+            {
+                compute_bounding_box_data(ctx, ctx->local_data, current_node->lb_node_box, current_node->ub_node_box, ctx->local_n_points);
+            }
+            break;
+        default:
+            break;
+    }
+
+    // if(I_AM_MASTER)
+    // {
+    //     printf("Processing node\n");
+    //     pivot_print(local_node, 0, ctx->dims);
+    // }
+
+    if(local_node.is_leaf)
+    {
+        current_node->owner = local_node.as.leaf.leaf_point_count;
+    }
+    else 
+    {
+        current_node->split_dim = local_node.as.internal.split_variable;
+        current_node->split_val = local_node.as.internal.split_value;
+
+        idx_t lch_idx = local_node.as.internal.lch_idx;
+        current_node->lch = transfer_tree(ctx, mini_local_tree->__pivots[lch_idx], mini_local_tree, 
+                                          top_tree, current_node, TOP_TREE_LCH);
+
+        idx_t rch_idx = local_node.as.internal.rch_idx;
+        current_node->rch = transfer_tree(ctx, mini_local_tree->__pivots[rch_idx], mini_local_tree, 
+                                          top_tree, current_node, TOP_TREE_RCH);
+    }
+
+    return current_node;
+}
+
+void parallel_build_top_kdtree(global_context_t *ctx, top_kdtree_t *tree, idx_t oversampling) 
+{
+    TIME_DEF;
+    TIME_START;
+    //initializing og_indexes
+    // size_t tot_n_points = 0;
+    // MPI_Allreduce(&(ctx->local_n_points), &tot_n_points, 1, MPI_UINT64_T, MPI_SUM, ctx->mpi_communicator);
+
+    MPI_Alltoall(&(ctx->local_n_points), 1, MPI_UINT64_T, ctx->rank_n_points, 1, MPI_UINT64_T, ctx->mpi_communicator);
+
+    ctx->rank_idx_start[0] = 0;
+    for(int i = 1; i < ctx->world_size; ++i)
+    {
+        ctx->rank_idx_start[i] = ctx->rank_idx_start[i-1] + ctx->rank_n_points[i-1];
+    }
+
+    ctx -> og_idxs = (idx_t*)MY_MALLOC(ctx->local_n_points * sizeof(idx_t));
+    for(idx_t i = 0; i < ctx->local_n_points; ++i)
+    {
+        ctx -> og_idxs[i] = i + ctx->rank_idx_start[ctx->mpi_rank];
+    }
+
+    // select a subset of points by sampling
+    
+    idx_t n_levels = (idx_t)ceil(log2(ctx -> world_size)) + 1;
+
+    // idx_t local_sampled_points = ((1 << n_levels) - 1) * oversampling;
+    // idx_t total_sampled_points   = local_sampled_points * ctx->world_size; 
+    
+    idx_t base_points = ((1 << n_levels) - 1) * oversampling;
+
+    idx_t total_sampled_points = ((base_points + ctx->world_size - 1) / ctx->world_size) * ctx->world_size;
+
+    idx_t local_sampled_points = total_sampled_points / ctx->world_size;
+
+
+    float_t* points_chosen = (float_t*)MY_MALLOC(total_sampled_points * ctx->dims * sizeof(float_t));
+
+    idx_t start_idx = ctx->mpi_rank * local_sampled_points;
+    #pragma omp parallel
+    {
+        // 1. Initialize a private state for this specific thread
+        unsigned short state[3];
+        int tid = omp_get_thread_num();
+        
+        // Seed using thread ID and a base seed to ensure different sequences
+        state[0] = (unsigned short)tid;
+        state[1] = (unsigned short)(tid ^ time(NULL));
+        state[2] = (unsigned short)(tid >> 16);
+
+        // 2. Distribute the loop iterations among threads
+        #pragma omp for
+        for(idx_t i = 0; i < local_sampled_points; ++i)
+        {
+            // 3. Use the reentrant erand48 with the private state
+            // erand48 returns a double [0.0, 1.0)
+            double r = erand48(state);
+            idx_t sampled_idx = (idx_t)(r * ctx->local_n_points);
+
+            // 4. Copy the data (Memory layout remains the same)
+            memcpy(points_chosen + (start_idx + i) * ctx->dims, 
+                ctx->local_data + (sampled_idx) * ctx->dims, 
+                ctx->dims * sizeof(float_t));
+        }
+    }
+    float_t elapsed = TIME_STOP;
+    LOG_WRITE("Sampling", elapsed);
+
+    // it SHOULD be deterministic
+    
+    TIME_START;
+    MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, 
+                  points_chosen, local_sampled_points * ctx->dims, MPI_MY_FLOAT, 
+                  ctx->mpi_communicator);
+
+    MPI_DB_PRINT("Sampling %lu points\n", total_sampled_points);
+    kdtree_t mini_tree = {0};
+    kdtree_initialize(&mini_tree, points_chosen, total_sampled_points, ctx->dims);
+    mini_tree.root = parallel_make_tree_w_ranks(mini_tree.__points, mini_tree.__pivots, 0, -1, 0, 
+                                                mini_tree.n_points-1, 0, mini_tree.dims, 0, ctx->world_size - 1);
+
+
+    elapsed = TIME_STOP;
+    LOG_WRITE("Building top tree", elapsed);
+    // if(I_AM_MASTER)
+    // {
+    //     kdtree_print(&mini_tree);
+    // }
+    TIME_START;
+    tree->root = transfer_tree(ctx, mini_tree.__pivots[mini_tree.root], 
+                                &mini_tree, tree, NULL, NO_CHILD);
+    elapsed = TIME_STOP;
+    LOG_WRITE("Transfering Tree", elapsed);
+
+    kdtree_free(&mini_tree);
+}
+
+
+void build_top_kdtree(global_context_t *ctx, top_kdtree_t *tree, idx_t n_bins, float_t tolerance) 
 {
+
+
+    // NOTE: For the me of the future
+    // pointsests are only "helper structs" partitions 
+
+    //initializing og_indexes
     size_t tot_n_points = 0;
-    MPI_Allreduce(&(og_pointset->n_points), &tot_n_points, 1, MPI_UINT64_T, MPI_SUM, ctx->mpi_communicator);
+    MPI_Allreduce(&(ctx->local_n_points), &tot_n_points, 1, MPI_UINT64_T, MPI_SUM, ctx->mpi_communicator);
+
+    MPI_Alltoall(&(ctx->local_n_points), 1, MPI_UINT64_T, ctx->rank_n_points, 1, MPI_UINT64_T, ctx->mpi_communicator);
+
+    ctx->rank_idx_start[0] = 0;
+    for(int i = 1; i < ctx->world_size; ++i)
+    {
+        ctx->rank_idx_start[i] = ctx->rank_idx_start[i-1] + ctx->rank_n_points[i-1];
+    }
 
+    point_t *data_w_idx = (point_t*)MY_MALLOC(ctx->local_n_points * sizeof(point_t));    
+    for(idx_t i = 0; i < ctx->local_n_points; ++i)
+    {
+        data_w_idx[i].array_idx = i + ctx->rank_idx_start[ctx->mpi_rank];
+        data_w_idx[i].data      = ctx->local_data + i * ctx->dims;
+    }
+
+    pointset_t og_pointset;
+    og_pointset.datapoints = data_w_idx;
+    og_pointset.dims = ctx->dims;
+    og_pointset.n_points = ctx->local_n_points;
+    og_pointset.lb_box = (float_t*)MY_MALLOC(ctx -> dims * sizeof(float_t));
+    og_pointset.ub_box = (float_t*)MY_MALLOC(ctx -> dims * sizeof(float_t));
+    
     /*
     MPI_DB_PRINT("[MASTER] Top tree builder invoked\n");
     */
@@ -766,24 +1069,21 @@ void build_top_kdtree(global_context_t *ctx, pointset_t *og_pointset, top_kdtree
 
     /* enqueue the two partitions */
 
-    compute_bounding_box_pointset(ctx, og_pointset);
+    compute_bounding_box_pointset(ctx, &og_pointset);
 
     partition_queue_t queue;
     init_queue(&queue);
 
     int selected_dim = 0;
     partition_t current_partition = {  .d          = selected_dim,
-                                       .base_ptr   = og_pointset->data,
-                                       .n_points   = og_pointset->n_points,
+                                       .base_ptr   = og_pointset.datapoints,
+                                       .n_points   = og_pointset.n_points,
                                        .n_procs    = ctx->world_size,
                                        .parent     = NULL,
-                                       .lr         = NO_CHILD 
-                                    };
+                                       .lr         = NO_CHILD };
 
     enqueue_partition(&queue, current_partition);
     pointset_t current_pointset;
-    current_pointset.lb_box = (float_t*)MY_MALLOC(ctx -> dims * sizeof(float_t));
-    current_pointset.ub_box = (float_t*)MY_MALLOC(ctx -> dims * sizeof(float_t));
 
     while (queue.count) 
     {
@@ -801,14 +1101,14 @@ void build_top_kdtree(global_context_t *ctx, pointset_t *og_pointset, top_kdtree
         /* insert node */
         
         // MPI_DB_PRINT(   "[RANK %d] Handling partition:\n"\
-        //                 "\tcurrent_node %p,\n"\
-        //                 "\tdim %d,\n"\
-        //                 "\tn_points %lu,\n"\
-        //                 "\tstart_proc %d,\n"\
-        //                 "\tn_procs %d\n"\
-        //                 "\tparent %p\n"\
-        //                 "\tbase_ptr %p\n"\
-        //                 "\tlr %d\n", 
+        //                 "    current_node %p,\n"\
+        //                 "    dim %d,\n"\
+        //                 "    n_points %lu,\n"\
+        //                 "    start_proc %d,\n"\
+        //                 "    n_procs %d\n"\
+        //                 "    parent %p\n"\
+        //                 "    base_ptr %p\n"\
+        //                 "    lr %d\n", 
         //         ctx -> mpi_rank,
         //         current_node,
         //         current_partition.d,
@@ -831,7 +1131,7 @@ void build_top_kdtree(global_context_t *ctx, pointset_t *og_pointset, top_kdtree
                      * ub equal to parent except for the dim of splitting 
                      */
                     int parent_split_dim = current_node -> parent -> split_dim;
-                    float_t parent_hp      = current_node -> parent -> split_val;
+                    float_t parent_hp    = current_node -> parent -> split_val;
 
                     memcpy(current_node -> lb_node_box, current_node -> parent -> lb_node_box, ctx -> dims * sizeof(float_t));
                     memcpy(current_node -> ub_node_box, current_node -> parent -> ub_node_box, ctx -> dims * sizeof(float_t));
@@ -843,11 +1143,11 @@ void build_top_kdtree(global_context_t *ctx, pointset_t *og_pointset, top_kdtree
             case TOP_TREE_RCH:
                 if(current_partition.parent)
                 {
-                    current_node -> parent           = current_partition.parent;
+                    current_node -> parent        = current_partition.parent;
                     current_node -> parent -> rch = current_node;
 
                     int parent_split_dim = current_node -> parent -> split_dim;
-                    float_t parent_hp      = current_node -> parent -> split_val;
+                    float_t parent_hp    = current_node -> parent -> split_val;
 
                     /*
                      * right child has ub equal to parent
@@ -863,8 +1163,8 @@ void build_top_kdtree(global_context_t *ctx, pointset_t *og_pointset, top_kdtree
             case NO_CHILD:
                 {
                     tree -> root = current_node;
-                    memcpy(current_node -> lb_node_box, og_pointset -> lb_box, ctx -> dims * sizeof(float_t));
-                    memcpy(current_node -> ub_node_box, og_pointset -> ub_box, ctx -> dims * sizeof(float_t));
+                    memcpy(current_node -> lb_node_box, og_pointset.lb_box, ctx -> dims * sizeof(float_t));
+                    memcpy(current_node -> ub_node_box, og_pointset.ub_box, ctx -> dims * sizeof(float_t));
                 }
                 break;
         }
@@ -874,13 +1174,16 @@ void build_top_kdtree(global_context_t *ctx, pointset_t *og_pointset, top_kdtree
         current_node -> lch = NULL;
         current_node -> rch = NULL;
 
+        current_pointset.lb_box = current_node->lb_node_box;
+        current_pointset.ub_box = current_node->ub_node_box;
+
         MPI_Barrier(ctx -> mpi_communicator);
         /* handle partition */
         if(current_partition.n_procs > 1)
         {
             float_t fraction = (current_partition.n_procs / 2) / (float_t)current_partition.n_procs;
             guess_t g = compute_median_pure_binning(ctx, &current_pointset, fraction, current_partition.d, n_bins, tolerance);
-            size_t pv = partition_data_around_value(current_pointset.data, ctx->dims, current_partition.d, 0, current_pointset.n_points, g.x_guess);
+            size_t pv = partition_data_around_value(current_pointset.datapoints, ctx->dims, current_partition.d, 0, current_pointset.n_points, g.x_guess);
 
             current_node -> split_val = g.x_guess;
 
@@ -896,14 +1199,27 @@ void build_top_kdtree(global_context_t *ctx, pointset_t *og_pointset, top_kdtree
     
 
-            int next_dimension = (++selected_dim) % (ctx->dims);
+            // use max_strech int next_dimension = (++selected_dim) % (ctx->dims);
+            int next_dimension = 0;
+            float_t max_strech = 0.;
+
+            for(int dim=0; dim<ctx->dims; ++dim)
+            {
+                float_t stretch = current_node->ub_node_box[dim] - current_node->lb_node_box[dim];
+                if(stretch > max_strech)
+                {
+                    max_strech = stretch;
+                    next_dimension = dim;
+                }
+            }
+
             partition_t left_partition = {
                 .n_points     = points_left, 
                 .n_procs      = procs_left,
                 .start_proc   = current_partition.start_proc,
                 .parent       = current_node,
                 .lr           = TOP_TREE_LCH,
-                .base_ptr     = current_pointset.data,
+                .base_ptr     = current_pointset.datapoints,
                 .d            = next_dimension,
             };
 
@@ -913,7 +1229,7 @@ void build_top_kdtree(global_context_t *ctx, pointset_t *og_pointset, top_kdtree
                 .start_proc   = current_partition.start_proc + procs_left,
                 .parent       = current_node,
                 .lr           = TOP_TREE_RCH,
-                .base_ptr     = current_pointset.data + pv * current_pointset.dims,
+                .base_ptr     = current_pointset.datapoints + pv,
                 .d            = next_dimension
             };
 
@@ -936,10 +1252,16 @@ void build_top_kdtree(global_context_t *ctx, pointset_t *og_pointset, top_kdtree
         }
     #endif
 
+    ctx -> og_idxs = (idx_t*)MY_MALLOC(ctx->local_n_points * sizeof(idx_t));
+    for(idx_t i = 0; i < ctx->local_n_points; ++i)
+    {
+        ctx -> og_idxs[i] = data_w_idx[i].array_idx;
+    }
+
     
-    free(current_pointset.lb_box);
-    free(current_pointset.ub_box);
     free_queue(&queue);
+    og_pointset.datapoints = NULL;
+    free_pointset(&og_pointset);
 
 }
 
@@ -973,8 +1295,22 @@ int compute_point_owner(global_context_t* ctx, top_kdtree_t* tree, float_t* data
     return owner;
 }
 
+void swap_data_vec_and_idx(float_t *a, float_t *b, idx_t* idx_a, idx_t* idx_b, size_t vec_len) {
+    idx_t tmp_idx = *idx_a;
+    *idx_a = *idx_b;
+    *idx_b = tmp_idx;
+
+    float_t tmp;
+    for (size_t i = 0; i < vec_len; ++i) 
+    {
+        tmp = a[i];
+        a[i] = b[i];
+        b[i] = tmp;
+    }
+}
+
 /* to partition points around owners */
-int partition_data_around_key(int* key, float_t *val, int vec_len, int ref_key , int left, int right) 
+int partition_data_around_key(int* key, float_t *values, idx_t* idxs, int vec_len, int ref_key , int left, int right) 
 {
     /*
     * returns the number of elements less than the pivot
@@ -987,7 +1323,8 @@ int partition_data_around_key(int* key, float_t *val, int vec_len, int ref_key ,
         // if(compare_data_element(array + i*vec_len, array + pivot_index*vec_len, compare_dim ) >= 0){
         if (key[i] < ref_key) 
         {
-            swap_data_element(val + store_index * vec_len, val + i * vec_len, vec_len);
+            swap_data_vec_and_idx(values + store_index * vec_len, values + i * vec_len, 
+                                  idxs + store_index, idxs + i, vec_len);
             /* swap keys */
             int tmp = key[i];
             key[i] = key[store_index];
@@ -1004,43 +1341,52 @@ int partition_data_around_key(int* key, float_t *val, int vec_len, int ref_key ,
 
 void exchange_points(global_context_t* ctx, top_kdtree_t* tree)
 {
-    int* points_per_proc = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));    
-    int* points_owners      = (int*)MY_MALLOC(ctx -> dims * ctx -> local_n_points * sizeof(float_t));
+    TIME_DEF
+    int* points_per_proc  = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));    
+    int* points_owners    = (int*)MY_MALLOC(ctx -> dims * ctx -> local_n_points * sizeof(float_t));
     int* partition_offset = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));    
 
     /* compute owner */
+    TIME_START;
     #pragma omp parallel for
     for(size_t i = 0; i < ctx -> local_n_points; ++i)
     {
         /* tree walk */
         points_owners[i] = compute_point_owner(ctx, tree, ctx -> local_data + (i * ctx -> dims));
     }
-        
-    
+    float_t elapsed = TIME_STOP;
+    LOG_WRITE("Compute owners", elapsed);
+    TIME_START;
+
     int last_idx = 0;
     int len      = ctx -> local_n_points;
     float_t* curr_data = ctx -> local_data;
 
+    HERE
     partition_offset[0] = 0;
     for(int owner = 1; owner < ctx -> world_size; ++owner)
     {
-        last_idx = partition_data_around_key(points_owners, ctx -> local_data, ctx -> dims, owner, last_idx, ctx -> local_n_points);    
+        last_idx = partition_data_around_key(points_owners, ctx -> local_data, ctx -> og_idxs, ctx -> dims, owner, last_idx, ctx -> local_n_points);    
         partition_offset[owner] = last_idx;
         points_per_proc[owner - 1] = last_idx;
     }
+    elapsed = TIME_STOP;
+    LOG_WRITE("Partition", elapsed);
 
+    // copy ordered 
+
+    TIME_START;
     points_per_proc[ctx -> world_size - 1] = ctx -> local_n_points;
-    
-    
+
     for(int i = ctx -> world_size - 1; i > 0; --i)
     {
         points_per_proc[i] = points_per_proc[i] - points_per_proc[i - 1];
     }
 
-    int* rcv_count      = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
-    int* rcv_displs     = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
-    int* send_displs    = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); 
-    int* send_count     = points_per_proc;
+    int* rcv_count   = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
+    int* rcv_displs  = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
+    int* send_displs = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); 
+    int* send_count  = points_per_proc;
 
     float_t* rcvbuffer = NULL;
     int tot_count = 0;
@@ -1056,14 +1402,23 @@ void exchange_points(global_context_t* ctx, top_kdtree_t* tree)
         send_displs[i] = send_displs[i - 1] + send_count[i - 1];
     }
 
+    for(int i = 0; i < ctx -> world_size; ++i) tot_count += rcv_count[i];
+    idx_t* rcv_idxs = (idx_t*)MY_MALLOC(tot_count * sizeof(idx_t));
+    // exchange idxs
+    MPI_Alltoallv(  ctx -> og_idxs, send_count, send_displs, MPI_UINT64_T, 
+                    rcv_idxs, rcv_count, rcv_displs, MPI_UINT64_T, 
+                    ctx -> mpi_communicator);
+
+    tot_count = 0;
+
     /*multiply for number of elements */
     for(int i = 0; i < ctx -> world_size; ++i) 
     {
         send_displs[i]= send_displs[i] * ctx -> dims;
         send_count[i] = send_count[i] * ctx -> dims;
 
-        rcv_displs[i]= rcv_displs[i] * ctx -> dims;
-        rcv_count[i] = rcv_count[i] * ctx -> dims;
+        rcv_displs[i] = rcv_displs[i] * ctx -> dims;
+        rcv_count[i]  = rcv_count[i] * ctx -> dims;
         tot_count += rcv_count[i];
     }
 
@@ -1095,18 +1450,42 @@ void exchange_points(global_context_t* ctx, top_kdtree_t* tree)
     /* free prv pointer */
     free(ppp);
     free(ctx -> local_data);
+    free(ctx -> og_idxs);
     ctx -> local_data = rcvbuffer;
+    ctx -> og_idxs    = rcv_idxs; 
 
     /* check exchange */
     
-    for(size_t i = 0; i < ctx -> local_n_points; ++i)
-    {
-        int o = compute_point_owner(ctx, tree, ctx -> local_data + (i * ctx -> dims));
-        if(o != ctx -> mpi_rank) DB_PRINT("rank %d got an error\n",ctx -> mpi_rank);
-    }
+    #ifdef CHECK_CORRECT_EXCHANGE
+        for(size_t i = 0; i < ctx -> local_n_points; ++i)
+        {
+            int o = compute_point_owner(ctx, tree, ctx -> local_data + (i * ctx -> dims));
+            if(o != ctx -> mpi_rank) DB_PRINT("rank %d got an error\n",ctx -> mpi_rank);
+        }
+    #endif
 
+    elapsed = TIME_STOP;
+    LOG_WRITE("Exchange", elapsed);
+    
+    #ifdef PRINT_BALANCE_FACTOR
+        DB_PRINT("[RANK %d] balance factor %.4lf\n", ctx->mpi_rank, (float_t)ctx->local_n_points/ctx->n_points);
+    #endif
 
+    float_t balance_factor = (float_t)ctx->local_n_points/ctx->n_points;
+    float_t sum_balance_factor;
+    float_t max_balance_factor;
+    float_t min_balance_factor;
+    MPI_Reduce(&balance_factor, &sum_balance_factor, 1, MPI_MY_FLOAT, MPI_SUM, 0, ctx -> mpi_communicator);
+    MPI_Reduce(&balance_factor, &min_balance_factor, 1, MPI_MY_FLOAT, MPI_MIN, 0, ctx -> mpi_communicator);
+    MPI_Reduce(&balance_factor, &max_balance_factor, 1, MPI_MY_FLOAT, MPI_MAX, 0, ctx -> mpi_communicator);
+
+    if(I_AM_MASTER)
+    {
+        MPI_DB_PRINT("Avg Balance Factor %.4lf min %.4lf max %.4lf\n", sum_balance_factor/ctx -> world_size,
+                     min_balance_factor, max_balance_factor);
+    }
 
+    
     free(points_owners);
     free(points_per_proc);
     free(partition_offset);
@@ -1120,20 +1499,14 @@ static inline size_t local_to_global_idx(global_context_t* ctx, size_t local_idx
     return local_idx + ctx -> idx_start; 
 }
 
-void translate_tree_idx_to_global(global_context_t* ctx, kdtree_v2* local_tree) 
+void translate_tree_idx_to_global(global_context_t* ctx, kdtree_t* local_tree) 
 {
     for(size_t i = 0; i < ctx -> local_n_points; ++i)        
     {
-        local_tree -> _nodes[i].array_idx = local_to_global_idx(ctx, local_tree -> _nodes[i].array_idx); 
+        local_tree -> __points[i].array_idx = local_to_global_idx(ctx, local_tree -> __points[i].array_idx); 
     }
 }
 
-heap ngbh_search_kdtree(global_context_t* ctx, kdtree_v2* local_tree, float_t* data, int k)
-{
-    data_dims = ctx -> dims;
-    return knn_kdtree_v2(data, local_tree -> root, k);
-}
-
 void tree_walk(
         global_context_t* ctx, 
         top_kdtree_node_t* root, 
@@ -1293,33 +1666,58 @@ void tree_walk_v2_find_n_points(
                 break;
         }
 
-        int c   = max_dist > (hp_distance * hp_distance);
+        // int c   = max_dist > (hp_distance * hp_distance);
+        //
+        // //if(c || (H -> count) < (H -> N))
+        // if(c)
+        // {
+        //
+        //     switch (side)
+        //     {
+        //         case HP_LEFT_SIDE:
+        //             if(root -> rch) 
+        //             {
+        //                 /* walk on the right */
+        //                 tree_walk_v2_find_n_points(ctx, root -> rch, point_idx, max_dist, point, point_to_send_capacity);
+        //             }
+        //             break;
+        //
+        //         case HP_RIGHT_SIDE:
+        //             if(root -> lch) 
+        //             {
+        //                 /* walk on the left */
+        //                 tree_walk_v2_find_n_points(ctx, root -> lch, point_idx, max_dist, point, point_to_send_capacity);
+        //             }
+        //             break;
+        //
+        //         default:
+        //             break;
+        //     }
+        // }
 
         //if(c || (H -> count) < (H -> N))
-        if(c)
+        switch (side)
         {
+            case HP_LEFT_SIDE:
+                if(root -> rch) 
+                {
+                    /* walk on the right */
+                    float_t point_to_box_dist = box_dist(point, root->rch->lb_node_box, root->rch->ub_node_box, ctx->dims);
+                    if(point_to_box_dist < max_dist) tree_walk_v2_find_n_points(ctx, root -> rch, point_idx, max_dist, point, point_to_send_capacity);
+                }
+                break;
 
-            switch (side)
-            {
-                case HP_LEFT_SIDE:
-                    if(root -> rch) 
-                    {
-                        /* walk on the right */
-                        tree_walk_v2_find_n_points(ctx, root -> rch, point_idx, max_dist, point, point_to_send_capacity);
-                    }
-                    break;
-                
-                case HP_RIGHT_SIDE:
-                    if(root -> lch) 
-                    {
-                        /* walk on the left */
-                        tree_walk_v2_find_n_points(ctx, root -> lch, point_idx, max_dist, point, point_to_send_capacity);
-                    }
-                    break;
+            case HP_RIGHT_SIDE:
+                if(root -> lch) 
+                {
+                    /* walk on the left */
+                    float_t point_to_box_dist = box_dist(point, root->lch->lb_node_box, root->lch->ub_node_box, ctx->dims);
+                    if(point_to_box_dist < max_dist) tree_walk_v2_find_n_points(ctx, root -> lch, point_idx, max_dist, point, point_to_send_capacity);
+                }
+                break;
 
-                default:
-                    break;
-            }
+            default:
+                break;
         }
     }
 
@@ -1387,42 +1785,66 @@ void tree_walk_v2_append_points(
                 break;
         }
 
-        int c   = max_dist > (hp_distance * hp_distance);
+        // int c   = max_dist > (hp_distance * hp_distance);
+        // if(c)
+        // {
+        //
+        //     switch (side)
+        //     {
+        //         case HP_LEFT_SIDE:
+        //             if(root -> rch) 
+        //             {
+        //                 /* walk on the right */
+        //                 tree_walk_v2_append_points(ctx, root -> rch, point_idx, max_dist, point, 
+        //                         data_to_send_per_proc, local_idx_of_the_point, point_to_send_count);
+        //             }
+        //             break;
+        //
+        //         case HP_RIGHT_SIDE:
+        //             if(root -> lch) 
+        //             {
+        //                 /* walk on the left */
+        //                 tree_walk_v2_append_points(ctx, root -> lch, point_idx, max_dist, point, 
+        //                         data_to_send_per_proc, local_idx_of_the_point, point_to_send_count);
+        //             }
+        //             break;
+        //
+        //         default:
+        //             break;
+        //     }
+        // }
 
-        //if(c || (H -> count) < (H -> N))
-        if(c)
+        switch (side)
         {
+            case HP_LEFT_SIDE:
+                if(root -> rch) 
+                {
+                    /* walk on the right */
+                    float_t point_to_box_dist = box_dist(point, root->rch->lb_node_box, root->rch->ub_node_box, ctx->dims);
+                    if(point_to_box_dist < max_dist) tree_walk_v2_append_points(ctx, root -> rch, point_idx, max_dist, point, 
+                                                        data_to_send_per_proc, local_idx_of_the_point, point_to_send_count);
+                }
+                break;
 
-            switch (side)
-            {
-                case HP_LEFT_SIDE:
-                    if(root -> rch) 
-                    {
-                        /* walk on the right */
-                        tree_walk_v2_append_points(ctx, root -> rch, point_idx, max_dist, point, 
-                                data_to_send_per_proc, local_idx_of_the_point, point_to_send_count);
-                    }
-                    break;
-                
-                case HP_RIGHT_SIDE:
-                    if(root -> lch) 
-                    {
-                        /* walk on the left */
-                        tree_walk_v2_append_points(ctx, root -> lch, point_idx, max_dist, point, 
-                                data_to_send_per_proc, local_idx_of_the_point, point_to_send_count);
-                    }
-                    break;
+            case HP_RIGHT_SIDE:
+                if(root -> lch) 
+                {
+                    /* walk on the left */
+                    float_t point_to_box_dist = box_dist(point, root->lch->lb_node_box, root->lch->ub_node_box, ctx->dims);
+                    if(point_to_box_dist < max_dist) tree_walk_v2_append_points(ctx, root -> lch, point_idx, max_dist, point, 
+                                                        data_to_send_per_proc, local_idx_of_the_point, point_to_send_count);
+                }
+                break;
 
-                default:
-                    break;
-            }
+            default:
+                break;
         }
     }
 
 }
 
 
-void convert_heap_idx_to_global(global_context_t* ctx, heap* H)
+void convert_heap_idx_to_global(global_context_t* ctx, heap_t* H)
 {
     for(uint64_t i = 0; i < H -> count; ++i)
     {
@@ -1443,7 +1865,7 @@ void print_diagnositcs(global_context_t* ctx, int k)
     float_t memory_use = (float_t)ctx -> local_n_points * ctx -> dims * sizeof(float_t);
     memory_use += (float_t)sizeof(datapoint_info_t)* (float_t)(ctx -> local_n_points); 
     /* ngbh */
-    memory_use += (float_t)sizeof(heap_node)*(float_t)k * (float_t)(ctx -> local_n_points); 
+    memory_use += (float_t)sizeof(heap_node_t)*(float_t)k * (float_t)(ctx -> local_n_points); 
     memory_use = memory_use / 1e9 * shm_world_size;
 
     MPI_DB_PRINT("       Got ~%d points per node and %d ngbh per points\n", ctx -> local_n_points * shm_world_size, k); 
@@ -1458,45 +1880,40 @@ void print_diagnositcs(global_context_t* ctx, int k)
     MPI_Barrier(ctx -> mpi_communicator);
 }
 
-
-void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtree_t* top_tree, kdtree_v2* local_tree, float_t* data, int k)
+void mpi_knn_search(global_context_t* ctx, top_kdtree_t* top_tree, kdtree_t* local_tree, float_t* data, idx_t n_data, idx_t vec_len, int k)
 {
-    /* local search */
-    /* print diagnostics */
-    print_diagnositcs(ctx, k);
-    ctx -> k = (idx_t)k;
-    
     TIME_DEF;
     double elapsed_time;
 
     TIME_START;
+    //printf("rank %d local_n_points %lu\n", ctx -> mpi_rank, ctx -> local_n_points);
     MPI_Barrier(ctx -> mpi_communicator);
-    //ctx -> __local_heap_buffers = (heap_node*)MY_MALLOC(ctx -> local_n_points * k * sizeof(heap_node));
-    MPI_Alloc_mem(ctx -> local_n_points * k * sizeof(heap_node), MPI_INFO_NULL, &(ctx -> __local_heap_buffers));
-
-    #pragma omp parallel for schedule(dynamic)
-    for(int p = 0; p < ctx -> local_n_points; ++p)
-    {
-        idx_t idx = local_tree -> _nodes[p].array_idx;
-        /* actually we want to preserve the heap to then insert guesses from other nodes */
-        heap tmp_heap;
-        tmp_heap.data  = ctx -> __local_heap_buffers + k * idx;
+    //ctx -> __local_heap_buffers = (heap_node_t*)MY_MALLOC(ctx -> local_n_points * k * sizeof(heap_node));
+    heap_node_t* __local_heap_buffers = (heap_node_t*)MY_MALLOC(n_data * k * sizeof(heap_node_t));
+    //MPI_Alloc_mem(n_data * k * sizeof(heap_node_t), MPI_INFO_NULL, &__local_heap_buffers);
+    //MPI_Alloc_mem(n_data * k * sizeof(heap_node_t), MPI_INFO_NULL, &__local_heap_buffers);
+
+    #pragma omp parallel for schedule(dynamic, 32)
+    for(int p = 0; p < n_data; ++p)
+    {
+        idx_t idx = p;
+        /* actually we want to preserve the heap_t to then insert guesses from other nodes */
+        heap_t tmp_heap;
+        tmp_heap.data  = __local_heap_buffers + k * idx;
         tmp_heap.count = 0;
-        tmp_heap.N     = k;
+        tmp_heap.size  = k;
+
+        //dp_info[idx].ngbh = knn_kdtree_t_no_heapsort(local_tree -> _nodes[p].data, local_tree -> root, k);
+        idx_t visited_nodes = 0;
+        knn_sub_tree_search(data + idx * ctx->dims, local_tree-> __points, local_tree->__pivots,
+                             local_tree->root, &tmp_heap, local_tree->dims, &visited_nodes);
 
-        //dp_info[idx].ngbh = knn_kdtree_v2_no_heapsort(local_tree -> _nodes[p].data, local_tree -> root, k);
-        knn_sub_tree_search_kdtree_v2(local_tree -> _nodes[p].data, local_tree -> root, &tmp_heap);
 
         convert_heap_idx_to_global(ctx, &tmp_heap);
-        dp_info[idx].cluster_idx = -1;
-        dp_info[idx].is_center = 0;
-        dp_info[idx].array_idx = idx + ctx -> idx_start;
-        dp_info[idx].ngbh = tmp_heap.data;
     }
     elapsed_time = TIME_STOP;
     LOG_WRITE("Local neighborhood search", elapsed_time);
-
-
+    //printf("rank %d elapsed_time %lf\n", ctx->mpi_rank, elapsed_time);
     TIME_START;
     /* find if a points needs a refine on the global tree */
     float_t** data_to_send_per_proc    = (float_t**)MY_MALLOC(ctx -> world_size * sizeof(float_t*));
@@ -1512,11 +1929,11 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
     }
 
     /* NEW VERSION double tree walk */
-    #pragma omp parallel for
-    for(int i = 0; i < ctx -> local_n_points; ++i)
+    #pragma omp parallel for schedule(dynamic, 128)
+    for(int i = 0; i < n_data; ++i)
     {
-        float_t max_dist = dp_info[i].ngbh[0].value;
-        float_t* point   = ctx -> local_data + (i * ctx -> dims);
+        float_t max_dist = (__local_heap_buffers + i*k)[0].value;
+        float_t* point   = data + (i * ctx -> dims);
         
         tree_walk_v2_find_n_points(ctx, top_tree -> root, i, max_dist, point, point_to_snd_capacity);
 
@@ -1531,17 +1948,15 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
 
     }
 
-
-    #pragma omp parallel for
-    for(int i = 0; i < ctx -> local_n_points; ++i)
+    #pragma omp parallel for schedule(dynamic, 128)
+    for(int i = 0; i < n_data; ++i)
     {
-        float_t max_dist = dp_info[i].ngbh[0].value;
-        float_t* point   = ctx -> local_data + (i * ctx -> dims);
+        float_t max_dist = (__local_heap_buffers + i*k)[0].value;
+        float_t* point   = data + (i * ctx -> dims);
 
         tree_walk_v2_append_points(ctx, top_tree -> root, i, max_dist, point, data_to_send_per_proc, local_idx_of_the_point, point_to_snd_count);
     }
 
-
     elapsed_time = TIME_STOP;
     LOG_WRITE("Finding points to refine", elapsed_time);
 
@@ -1587,8 +2002,6 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
     float_t* __max_dist_snd = (float_t*)MY_MALLOC(tot_points_snd * sizeof(float_t));
     float_t* __max_dist_rcv = (float_t*)MY_MALLOC(tot_points_rcv * sizeof(float_t));
 
-
-
     /* copy data to send in contiguous memory */
     for(int i = 0; i < ctx -> world_size; ++i)
     {
@@ -1603,13 +2016,12 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
             idx_t point_idx = local_idx_of_the_point[i][j];
             // for each point take the distance of its furthest nearest neighbor
             // then communicate it in order to prune the search on foreign nodes
-            __max_dist_snd[dist_idx] = ctx -> local_datapoints[point_idx].ngbh[0].value; 
+            __max_dist_snd[dist_idx] = (__local_heap_buffers + k*point_idx)[0].value; 
             ++dist_idx;
         }
     }
 
 
-
     MPI_Alltoallv(__snd_points, snd_count, snd_displ, MPI_MY_FLOAT, 
                   __rcv_points, rcv_count, rcv_displ, MPI_MY_FLOAT, ctx -> mpi_communicator); 
 
@@ -1653,17 +2065,17 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
     // }
     //
 
-    /* prepare heap batches */
+    /* prepare heap_t batches */
 
     //int work_batch_stride = 1 + ctx -> dims;
     int work_batch_stride = ctx -> dims;
 
     /* Note that I then have to recieve an equal number of heaps as the one I sent out before */
-    heap_node* __heap_batches_to_snd = (heap_node*)MY_MALLOC((uint64_t)k * (uint64_t)tot_points_rcv * sizeof(heap_node));
+    heap_node_t* __heap_batches_to_snd = (heap_node_t*)MY_MALLOC((uint64_t)k * (uint64_t)tot_points_rcv * sizeof(heap_node_t));
 
     if( __heap_batches_to_snd == NULL)
     {
-        DB_PRINT("Rank %d failed to allocate snd_heaps %luB required\n",ctx -> mpi_rank, (uint64_t)k * (uint64_t)tot_points_snd * sizeof(heap_node));
+        DB_PRINT("Rank %d failed to allocate snd_heaps %luB required\n",ctx -> mpi_rank, (uint64_t)k * (uint64_t)tot_points_snd * sizeof(heap_node_t));
         exit(1);
     }
 
@@ -1671,8 +2083,8 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
 
 
     // max dists will contain the maximum distance to have to search
-    heap_node** heap_batches_per_node = (heap_node**)MY_MALLOC(ctx -> world_size * sizeof(heap_node*));
-    float_t**   max_dists_per_node    = (float_t**)MY_MALLOC(ctx -> world_size * sizeof(float_t*));
+    heap_node_t** heap_batches_per_node = (heap_node_t**)MY_MALLOC(ctx -> world_size * sizeof(heap_node_t*));
+    float_t**     max_dists_per_node      = (float_t**)MY_MALLOC(ctx -> world_size * sizeof(float_t*));
 
     for(int p = 0; p < ctx -> world_size; ++p) 
     {
@@ -1693,23 +2105,25 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
     {
         if(point_to_rcv_count[p] > 0 && p != ctx -> mpi_rank)
         {
-            #pragma omp parallel for schedule(dynamic)
+            #pragma omp parallel for schedule(dynamic, 128)
             for(int batch = 0; batch < point_to_rcv_count[p]; ++batch)
             {
-                heap H;
+                heap_t H;
                 H.count = k;
-                H.N = k;
+                H.size = k;
                 H.data = heap_batches_per_node[p] + (uint64_t)k * (uint64_t)batch; 
                 float_t max_dist = max_dists_per_node[p][batch];
-                for(idx_t i = 0; i < H.N; ++i)
+                for(idx_t i = 0; i < H.size; ++i)
                 {
                     H.data[i].value = max_dist;
                     H.data[i].array_idx = MY_SIZE_MAX; 
                 }
 
                 //float_t* point = rcv_work_batches[p] + batch * work_batch_stride + 1; 
+                idx_t visited_nodes = 0;
                 float_t* point = rcv_work_batches[p] + (uint64_t)batch * (uint64_t)work_batch_stride; 
-                knn_sub_tree_search_kdtree_v2(point, local_tree -> root, &H);
+                knn_sub_tree_search(point, local_tree-> __points, local_tree->__pivots,
+                                    local_tree->root, &H, local_tree->dims, &visited_nodes);
                 convert_heap_idx_to_global(ctx, &H);
             }
         }
@@ -1736,7 +2150,7 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
     TIME_START;
     
     MPI_Datatype MPI_my_heap;
-    MPI_Type_contiguous(k * sizeof(heap_node), MPI_BYTE, &MPI_my_heap);
+    MPI_Type_contiguous(k * sizeof(heap_node_t), MPI_BYTE, &MPI_my_heap);
     MPI_Barrier(ctx -> mpi_communicator);
     MPI_Type_commit(&MPI_my_heap);
 
@@ -1746,7 +2160,7 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
      * ------------------------------------- */
     
     MPI_Barrier(ctx -> mpi_communicator);
-    int default_msg_len = MAX_MSG_SIZE / (k * sizeof(heap_node));
+    int default_msg_len = MAX_MSG_SIZE / (k * sizeof(heap_node_t));
 
     int* already_sent_points = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
     int* already_rcvd_points = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
@@ -1780,10 +2194,10 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
 
     MPI_DB_PRINT("Using default message length %lu\n", default_msg_len);
 
-    heap_node* __heap_batches_to_rcv = (heap_node*)MY_MALLOC((uint64_t)k * (uint64_t)max_n_recv * sizeof(heap_node));
+    heap_node_t* __heap_batches_to_rcv = (heap_node_t*)MY_MALLOC((uint64_t)k * (uint64_t)max_n_recv * sizeof(heap_node_t));
     if( __heap_batches_to_rcv == NULL)
     {
-        DB_PRINT("Rank %d failed to allocate rcv_heaps %luB required\n",ctx -> mpi_rank, (uint64_t)k * (uint64_t)max_n_recv* sizeof(heap_node));
+        DB_PRINT("Rank %d failed to allocate rcv_heaps %luB required\n",ctx -> mpi_rank, (uint64_t)k * (uint64_t)max_n_recv* sizeof(heap_node_t));
         exit(1);
     }
 
@@ -1803,7 +2217,7 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
             MPI_DB_PRINT("[--- ROUND %d ----]\n", i);
             MPI_Barrier(ctx -> mpi_communicator);
             DB_PRINT("[RANK %d] sending to %d tot: %d [%luB]---- recieving from %d %d\n", ctx -> mpi_rank, 
-                    rank_to_send, ngbh_to_send[rank_to_send], ngbh_to_send[rank_to_send]*sizeof(heap_node), rank_to_recv, ngbh_to_recv[rank_to_recv]);
+                    rank_to_send, ngbh_to_send[rank_to_send], ngbh_to_send[rank_to_send]*sizeof(heap_node_t), rank_to_recv, ngbh_to_recv[rank_to_recv]);
         #endif
         if(ngbh_to_send[rank_to_send] > 0)
         {
@@ -1858,24 +2272,24 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
         }
 
         /* merge lists */
-        #pragma omp parallel for
+        #pragma omp parallel for schedule(dynamic, 32)
         for(int b = 0; b < ngbh_to_recv[rank_to_recv]; ++b)
         {
             int idx = local_idx_of_the_point[rank_to_recv][b];
-            /* retrieve the heap */
-            heap H;
+            /* retrieve the heap_t */
+            heap_t H;
             H.count = k;
-            H.N     = k;
+            H.size  = k;
             H.data  = __heap_batches_to_rcv + k*b;
-            /* insert the points into the heap */
+            /* insert the points into the heap_t */
             for(int j = 0; j < k; ++j)
             {
-                heap tmp_heap;
-                tmp_heap.N     = k;
+                heap_t tmp_heap;
+                tmp_heap.size  = k;
                 tmp_heap.count = k;
-                tmp_heap.data  = dp_info[idx].ngbh;
+                tmp_heap.data  = (__local_heap_buffers + idx*k);
                 // insert it only of != from max
-                if(H.data[j].array_idx != MY_SIZE_MAX) insert_max_heap(&(tmp_heap), H.data[j].value, H.data[j].array_idx);
+                if(H.data[j].array_idx != MY_SIZE_MAX) max_heap_insert(&(tmp_heap), H.data[j].value, H.data[j].array_idx);
             }
         }
 
@@ -1884,6 +2298,9 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
         MPI_Barrier(ctx -> mpi_communicator);
     }
 
+    elapsed_time = TIME_STOP;
+    LOG_WRITE("Merging results", elapsed_time);
+
     
     MPI_Testall(req_idx, req_array, &flag, MPI_STATUSES_IGNORE);
 
@@ -1899,19 +2316,22 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
     /* -------------------------------------------------------- */
     /* heapsort them */
 
-    #pragma omp parallel for schedule(dynamic)
-    for(int i = 0; i < ctx -> local_n_points; ++i)
+    TIME_START;
+
+    #pragma omp parallel for schedule(dynamic, 128)
+    for(int i = 0; i < n_data; ++i)
     {
-        heap tmp_heap;
-        tmp_heap.N     = k;
+        heap_t tmp_heap;
+        tmp_heap.size   = k;
         tmp_heap.count = k;
-        tmp_heap.data  = dp_info[i].ngbh;
+        tmp_heap.data  = (__local_heap_buffers + i*k);
 
         heap_sort(&(tmp_heap));
     }
 
     elapsed_time = TIME_STOP;
-    LOG_WRITE("Merging results", elapsed_time);
+    LOG_WRITE("Sorting negihborhoods", elapsed_time);
+
 
     #if defined(WRITE_NGBH)
     MPI_DB_PRINT("Writing ngbh to files\n");
@@ -1924,9 +2344,9 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
         }
         else
         {
-            for(int i = 0; i < ctx -> local_n_points; ++i)
+            for(int i = 0; i < n_data; ++i)
             {
-                fwrite(dp_info[i].ngbh, sizeof(heap_node), k, file);
+                fwrite(__local_heap_buffers + i*k, sizeof(heap_node_t), k, file);
             }
             fclose(file);
         }
@@ -1962,11 +2382,522 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
     free(__max_dist_snd);
     free(__max_dist_rcv);
 
+    free(__local_heap_buffers);
+}
+
+
+void mpi_all_knn_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtree_t* top_tree, kdtree_t* local_tree, int k)
+{
+    /* local search */
+    /* print diagnostics */
+    print_diagnositcs(ctx, k);
+    ctx -> k = (idx_t)k;
+    
+    TIME_DEF;
+    double elapsed_time;
+
+    TIME_START;
+    //printf("rank %d local_n_points %lu\n", ctx -> mpi_rank, ctx -> local_n_points);
+    MPI_Barrier(ctx -> mpi_communicator);
+    //ctx -> __local_heap_buffers = (heap_node_t*)MY_MALLOC(ctx -> local_n_points * k * sizeof(heap_node));
+    MPI_Alloc_mem(ctx -> local_n_points * k * sizeof(heap_node_t), MPI_INFO_NULL, &(ctx -> __local_heap_buffers));
+
+    #pragma omp parallel for schedule(dynamic, 32)
+    for(int p = 0; p < ctx -> local_n_points; ++p)
+    {
+        idx_t idx = local_tree -> __points[p].array_idx;
+        /* actually we want to preserve the heap_t to then insert guesses from other nodes */
+        heap_t tmp_heap;
+        tmp_heap.data  = ctx -> __local_heap_buffers + k * idx;
+        tmp_heap.count = 0;
+        tmp_heap.size  = k;
+
+        //dp_info[idx].ngbh = knn_kdtree_t_no_heapsort(local_tree -> _nodes[p].data, local_tree -> root, k);
+
+        idx_t visited_nodes = 0;
+        knn_sub_tree_search(local_tree->__points[p].data, local_tree-> __points, local_tree->__pivots,
+                            local_tree->root, &tmp_heap, local_tree->dims, &visited_nodes);
+
+        convert_heap_idx_to_global(ctx, &tmp_heap);
+        dp_info[idx].cluster_idx = -1;
+        dp_info[idx].is_center = 0;
+        dp_info[idx].array_idx = idx + ctx -> idx_start;
+        dp_info[idx].ngbh = tmp_heap.data;
+    }
+    elapsed_time = TIME_STOP;
+    LOG_WRITE("Local neighborhood search", elapsed_time);
+    //printf("rank %d elapsed_time %lf\n", ctx->mpi_rank, elapsed_time);
+
+
+    TIME_START;
+    /* find if a points needs a refine on the global tree */
+    float_t** data_to_send_per_proc    = (float_t**)MY_MALLOC(ctx -> world_size * sizeof(float_t*));
+    int**     local_idx_of_the_point   = (int**)MY_MALLOC(ctx -> world_size * sizeof(int*));
+    int*      point_to_snd_count       = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
+    int*      point_to_snd_capacity    = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
+
+    for(int i = 0; i < ctx -> world_size; ++i)
+    {
+        /* allocate it afterwards */
+        point_to_snd_capacity[i] = 0;
+        point_to_snd_count[i]    = 0;
+    }
+
+    /* NEW VERSION double tree walk */
+    #pragma omp parallel for schedule(dynamic, 128)
+    for(int i = 0; i < ctx -> local_n_points; ++i)
+    {
+        idx_t idx = local_tree -> __points[i].array_idx;
+        float_t max_dist = dp_info[idx].ngbh[0].value;
+        float_t* point   = local_tree->__points[i].data;
+        
+        tree_walk_v2_find_n_points(ctx, top_tree -> root, idx, max_dist, point, point_to_snd_capacity);
+
+    }
+
+    /* allocate needed space */
+    for(int i = 0; i < ctx -> world_size; ++i)
+    {
+        int np = point_to_snd_capacity[i];
+        data_to_send_per_proc[i]  = (float_t*)MY_MALLOC(np * (ctx -> dims) * sizeof(float_t));    
+        local_idx_of_the_point[i] = (int*)MY_MALLOC(np * sizeof(int));    
+
+    }
+
+    #pragma omp parallel for schedule(dynamic, 128)
+    for(int i = 0; i < ctx -> local_n_points; ++i)
+    {
+        idx_t idx = local_tree -> __points[i].array_idx;
+        float_t max_dist = dp_info[idx].ngbh[0].value;
+        float_t* point   = local_tree -> __points[i].data;
+
+        tree_walk_v2_append_points(ctx, top_tree -> root, idx, max_dist, point, data_to_send_per_proc, local_idx_of_the_point, point_to_snd_count);
+    }
+
+    elapsed_time = TIME_STOP;
+    LOG_WRITE("Finding points to refine", elapsed_time);
+
+    TIME_START;
+    int* point_to_rcv_count = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
+
+    /* exchange points to work on*/
+    MPI_Alltoall(point_to_snd_count, 1, MPI_INT, point_to_rcv_count, 1, MPI_INT, ctx -> mpi_communicator);
+
+    int* rcv_count = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
+    int* snd_count = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
+    int* rcv_displ = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
+    int* snd_displ = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
+
+    /*compute counts and displs*/
+    rcv_displ[0] = 0;
+    snd_displ[0] = 0;
+
+
+    rcv_count[0] = point_to_rcv_count[0] * (ctx -> dims);
+    snd_count[0] = point_to_snd_count[0] * (ctx -> dims);
+
+    int tot_points_rcv = point_to_rcv_count[0];
+    int tot_points_snd = point_to_snd_count[0];
+    int tot_count = rcv_count[0];
+
+    for(int i = 1; i < ctx -> world_size; ++i)
+    {
+        rcv_count[i] = point_to_rcv_count[i] * (ctx -> dims);        
+        snd_count[i] = point_to_snd_count[i] * (ctx -> dims);        
+
+        tot_count += rcv_count[i];
+        tot_points_rcv += point_to_rcv_count[i];
+        tot_points_snd += point_to_snd_count[i];
+
+        rcv_displ[i] = rcv_displ[i - 1] + rcv_count[i - 1];
+        snd_displ[i] = snd_displ[i - 1] + snd_count[i - 1];
+    }
+
+    float_t* __rcv_points = (float_t*)MY_MALLOC(tot_points_rcv * (ctx -> dims) * sizeof(float_t));
+    float_t* __snd_points = (float_t*)MY_MALLOC(tot_points_snd * (ctx -> dims) * sizeof(float_t)); 
+
+    float_t* __max_dist_snd = (float_t*)MY_MALLOC(tot_points_snd * sizeof(float_t));
+    float_t* __max_dist_rcv = (float_t*)MY_MALLOC(tot_points_rcv * sizeof(float_t));
+
+
+
+    /* copy data to send in contiguous memory */
+    for(int i = 0; i < ctx -> world_size; ++i)
+    {
+        memcpy(__snd_points + snd_displ[i], data_to_send_per_proc[i], snd_count[i] * sizeof(float_t));
+    }
+
+    size_t dist_idx = 0;
+    for(idx_t i = 0; i < ctx -> world_size; ++i)
+    {
+        for(idx_t j = 0; j < point_to_snd_count[i]; ++j)
+        {
+            idx_t point_idx = local_idx_of_the_point[i][j];
+            // for each point take the distance of its furthest nearest neighbor
+            // then communicate it in order to prune the search on foreign nodes
+            __max_dist_snd[dist_idx] = ctx -> local_datapoints[point_idx].ngbh[0].value; 
+            ++dist_idx;
+        }
+    }
+
+    MPI_Alltoallv(__snd_points, snd_count, snd_displ, MPI_MY_FLOAT, 
+                  __rcv_points, rcv_count, rcv_displ, MPI_MY_FLOAT, ctx -> mpi_communicator); 
+
+    float_t** rcv_work_batches = (float_t**)MY_MALLOC(ctx -> world_size * sizeof(float_t*));
+    for(int i = 0; i < ctx -> world_size; ++i) 
+    {
+        rcv_work_batches[i]       = __rcv_points + rcv_displ[i];
+    }
+
+    MPI_Status status;
+    int flag;
+
+    for(int i = 0; i < ctx -> world_size; ++i)
+    {
+        rcv_count[i] = rcv_count[i] / ctx -> dims;
+        rcv_displ[i] = rcv_displ[i] / ctx -> dims;
+        snd_count[i] = snd_count[i] / ctx -> dims;
+        snd_displ[i] = snd_displ[i] / ctx -> dims;
+
+    }
+
+    MPI_Alltoallv(__max_dist_snd, snd_count, snd_displ, MPI_MY_FLOAT, 
+                  __max_dist_rcv, rcv_count, rcv_displ, MPI_MY_FLOAT, ctx -> mpi_communicator); 
+
+    /* prepare heap_t batches */
+
+    //int work_batch_stride = 1 + ctx -> dims;
+    int work_batch_stride = ctx -> dims;
+
+    /* Note that I then have to recieve an equal number of heaps as the one I sent out before */
+    heap_node_t* __heap_batches_to_snd = (heap_node_t*)MY_MALLOC((uint64_t)k * (uint64_t)tot_points_rcv * sizeof(heap_node_t));
+
+    if( __heap_batches_to_snd == NULL)
+    {
+        DB_PRINT("Rank %d failed to allocate snd_heaps %luB required\n",ctx -> mpi_rank, (uint64_t)k * (uint64_t)tot_points_snd * sizeof(heap_node_t));
+        exit(1);
+    }
+
+    MPI_Barrier(ctx -> mpi_communicator);
+
+
+    // max dists will contain the maximum distance to have to search
+    heap_node_t** heap_batches_per_node = (heap_node_t**)MY_MALLOC(ctx -> world_size * sizeof(heap_node_t*));
+    float_t**   max_dists_per_node    = (float_t**)MY_MALLOC(ctx -> world_size * sizeof(float_t*));
+
+    for(int p = 0; p < ctx -> world_size; ++p) 
+    {
+        heap_batches_per_node[p] = __heap_batches_to_snd + (uint64_t)rcv_displ[p] * (uint64_t)k;
+        max_dists_per_node[p]    = __max_dist_rcv        + (uint64_t)rcv_displ[p];
+    }
+
+    /* compute everything */
+    elapsed_time = TIME_STOP;
+    LOG_WRITE("Exchanging points", elapsed_time);
+    MPI_Barrier(ctx -> mpi_communicator);
+
+
+    TIME_START;
+
+    /* ngbh search on recieved points */
+    for(int p = 0; p < ctx -> world_size; ++p)
+    {
+        if(point_to_rcv_count[p] > 0 && p != ctx -> mpi_rank)
+        {
+            #pragma omp parallel for schedule(dynamic, 128)
+            for(int batch = 0; batch < point_to_rcv_count[p]; ++batch)
+            {
+                heap_t H;
+                H.count = k;
+                H.size = k;
+                H.data = heap_batches_per_node[p] + (uint64_t)k * (uint64_t)batch; 
+                float_t max_dist = max_dists_per_node[p][batch];
+                for(idx_t i = 0; i < H.size; ++i)
+                {
+                    H.data[i].value = max_dist;
+                    H.data[i].array_idx = MY_SIZE_MAX; 
+                }
+
+                //float_t* point = rcv_work_batches[p] + batch * work_batch_stride + 1; 
+                idx_t visited_nodes = 0;
+                float_t* point = rcv_work_batches[p] + (uint64_t)batch * (uint64_t)work_batch_stride; 
+                knn_sub_tree_search(point, local_tree-> __points, local_tree->__pivots,
+                                    local_tree->root, &H, local_tree->dims, &visited_nodes);
+                convert_heap_idx_to_global(ctx, &H);
+            }
+        }
+    }
+    
+    /* sendout results */
+
+    /* 
+     * dummy pointers to clarify counts in this part
+     * act like an alias for rcv and snd counts
+     */ 
+
+    int* ngbh_to_send = point_to_rcv_count;
+    int* ngbh_to_recv = point_to_snd_count;
+
+    /*
+     * counts are inverted since I have to recieve as many batches as points I
+     * Have originally sended
+     */
+
+    elapsed_time = TIME_STOP;
+    LOG_WRITE("Ngbh search for foreing points", elapsed_time);
+
+    TIME_START;
+    
+    MPI_Datatype MPI_my_heap;
+    MPI_Type_contiguous(k * sizeof(heap_node_t), MPI_BYTE, &MPI_my_heap);
+    MPI_Barrier(ctx -> mpi_communicator);
+    MPI_Type_commit(&MPI_my_heap);
+
+
+    /* -------------------------------------
+     * ALTERNATIVE TO ALL TO ALL FOR BIG MSG
+     * ------------------------------------- */
+    
+    MPI_Barrier(ctx -> mpi_communicator);
+    int default_msg_len = MAX_MSG_SIZE / (k * sizeof(heap_node_t));
+
+    int* already_sent_points = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
+    int* already_rcvd_points = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
+
+    /* allocate a request array to keep track of all requests going out*/
+    MPI_Request* req_array;
+    int req_num = 0;
+    for(int i = 0; i < ctx -> world_size; ++i)
+    {
+        req_num += ngbh_to_send[i] > 0 ? ngbh_to_send[i]/default_msg_len + 1 : 0;
+    }
+
+    req_array = (MPI_Request*)MY_MALLOC(req_num * sizeof(MPI_Request));
+
+
+    for(int i = 0; i < ctx -> world_size; ++i)
+    {
+        already_sent_points[i] = 0;
+        already_rcvd_points[i] = 0;
+    }
+
+    int req_idx = 0;
+
+    // find the maximum number of points to send */
+    
+    idx_t max_n_recv = 0;
+    for(int i = 0; i < ctx -> world_size; ++i)
+    {
+        max_n_recv = MAX(max_n_recv, (idx_t)ngbh_to_recv[i]);
+    }
+
+    MPI_DB_PRINT("Using default message length %lu\n", default_msg_len);
+
+    heap_node_t* __heap_batches_to_rcv = (heap_node_t*)MY_MALLOC((uint64_t)k * (uint64_t)max_n_recv * sizeof(heap_node_t));
+    if( __heap_batches_to_rcv == NULL)
+    {
+        DB_PRINT("Rank %d failed to allocate rcv_heaps %luB required\n",ctx -> mpi_rank, (uint64_t)k * (uint64_t)max_n_recv* sizeof(heap_node_t));
+        exit(1);
+    }
+
+    /* make a ring */
+
+    MPI_Barrier(ctx -> mpi_communicator);
+    for(int i = 1; i < ctx -> world_size; ++i)
+    {
+        int rank_to_send = (ctx -> mpi_rank + i) % (ctx -> world_size);
+        int rank_to_recv = (ctx -> world_size + ctx -> mpi_rank - i) % (ctx -> world_size);
+
+        /* do things */
+
+        /* send out one batch */
+
+        #ifdef PRINT_NGBH_EXCHANGE_SCHEME
+            MPI_DB_PRINT("[--- ROUND %d ----]\n", i);
+            MPI_Barrier(ctx -> mpi_communicator);
+            DB_PRINT("[RANK %d] sending to %d tot: %d [%luB]---- recieving from %d %d\n", ctx -> mpi_rank, 
+                    rank_to_send, ngbh_to_send[rank_to_send], ngbh_to_send[rank_to_send]*sizeof(heap_node_t), rank_to_recv, ngbh_to_recv[rank_to_recv]);
+        #endif
+        if(ngbh_to_send[rank_to_send] > 0)
+        {
+            int count_send = 0;
+            while(already_sent_points[rank_to_send] < ngbh_to_send[rank_to_send])
+            {
+                MPI_Request request;
+                count_send = MIN((int)default_msg_len, (int)(ngbh_to_send[rank_to_send] - already_sent_points[rank_to_send] ));
+
+                MPI_Isend(  heap_batches_per_node[rank_to_send] + k * already_sent_points[rank_to_send], count_send,  
+                        MPI_my_heap, rank_to_send, 0, ctx -> mpi_communicator, &request);
+
+                already_sent_points[rank_to_send] += count_send;
+                req_array[req_idx] = request;
+                ++req_idx;
+            }
+        }
+
+        if(     ngbh_to_send[rank_to_send] != already_sent_points[rank_to_send] || 
+                point_to_rcv_count[rank_to_send] != already_sent_points[rank_to_send])
+        {
+            DB_PRINT("ERROR OCCURRED in sending points [rank %d] got %d expected %d\n", 
+                    ctx -> mpi_rank, already_rcvd_points[rank_to_send], point_to_rcv_count[rank_to_send]);
+        }
+        
+        MPI_Barrier(ctx -> mpi_communicator);
+
+        if(ngbh_to_recv[rank_to_recv] > 0)
+        {
+            flag = 0;
+            while(already_rcvd_points[rank_to_recv] < ngbh_to_recv[rank_to_recv])
+            {
+                MPI_Probe(rank_to_recv, MPI_ANY_TAG, ctx -> mpi_communicator, &status);
+                MPI_Request request;
+                int count_recv; 
+                int source = status.MPI_SOURCE;
+                MPI_Get_count(&status, MPI_my_heap, &count_recv);
+                /* recieve each slice */
+
+                MPI_Recv(__heap_batches_to_rcv + k * already_rcvd_points[rank_to_recv], 
+                        count_recv, MPI_my_heap, source, MPI_ANY_TAG, ctx -> mpi_communicator, &status);
+
+                already_rcvd_points[rank_to_recv] += count_recv;
+            }
+        }
+
+        if(     ngbh_to_recv[rank_to_recv] != already_rcvd_points[rank_to_recv] || 
+                point_to_snd_count[rank_to_recv] != already_rcvd_points[rank_to_recv])
+        {
+            DB_PRINT("ERROR OCCURRED in recieving points [rank %d] got %d expected %d\n", 
+                    ctx -> mpi_rank, already_rcvd_points[rank_to_recv], point_to_snd_count[rank_to_recv]);
+        }
+
+        /* merge lists */
+        #pragma omp parallel for schedule(dynamic, 32)
+        for(int b = 0; b < ngbh_to_recv[rank_to_recv]; ++b)
+        {
+            int idx = local_idx_of_the_point[rank_to_recv][b];
+            /* retrieve the heap_t */
+            heap_t H;
+            H.count = k;
+            H.size  = k;
+            H.data  = __heap_batches_to_rcv + k*b;
+            /* insert the points into the heap_t */
+            for(int j = 0; j < k; ++j)
+            {
+                heap_t tmp_heap;
+                tmp_heap.size  = k;
+                tmp_heap.count = k;
+                tmp_heap.data  = dp_info[idx].ngbh;
+                // insert it only of != from max
+                if(H.data[j].array_idx != MY_SIZE_MAX) max_heap_insert(&(tmp_heap), H.data[j].value, H.data[j].array_idx);
+            }
+        }
+
+
+
+        MPI_Barrier(ctx -> mpi_communicator);
+    }
+
+    elapsed_time = TIME_STOP;
+    LOG_WRITE("Merging results", elapsed_time);
+
+    
+    MPI_Testall(req_idx, req_array, &flag, MPI_STATUSES_IGNORE);
+
+    if(flag == 0)
+    {
+        DB_PRINT("ERROR OCCURRED Rank %d has unfinished communications\n", ctx -> mpi_rank);
+        exit(1);
+    }
+    free(req_array);
+    free(already_sent_points);
+    free(already_rcvd_points);
+
+    /* -------------------------------------------------------- */
+    /* heapsort them */
+
+    TIME_START;
+
+    #pragma omp parallel for schedule(dynamic, 128)
+    for(int i = 0; i < ctx -> local_n_points; ++i)
+    {
+        heap_t tmp_heap;
+        tmp_heap.size   = k;
+        tmp_heap.count = k;
+        tmp_heap.data  = dp_info[i].ngbh;
+
+        heap_sort(&(tmp_heap));
+    }
+
+    elapsed_time = TIME_STOP;
+    LOG_WRITE("Sorting negihborhoods", elapsed_time);
+
+
+    #if defined(WRITE_NGBH)
+    MPI_DB_PRINT("Writing ngbh to files\n");
+        char ngbh_out[80];
+        sprintf(ngbh_out, "./bb/rank_%d.ngbh",ctx -> mpi_rank);
+        FILE* file = fopen(ngbh_out,"w");
+        if(!file) 
+        {
+            printf("Cannot open file %s\n",ngbh_out);
+        }
+        else
+        {
+            for(int i = 0; i < ctx -> local_n_points; ++i)
+            {
+                fwrite(dp_info[i].ngbh, sizeof(heap_node_t), k, file);
+            }
+            fclose(file);
+        }
+    #endif
+
+    MPI_Barrier(ctx -> mpi_communicator);
+
+
+    #if defined(WRITE_SHUFFLED_DATA)
+        float_t* data_to_write = (float_t*)MY_MALLOC(ctx -> local_n_points * ctx -> k * sizeof(float_t));
+        for(int i = 0; i < ctx -> local_n_points; ++i)
+        {
+            idx_t idx = local_tree -> __points[i].array_idx;
+            memcpy(data_to_write + idx*ctx -> dims, local_tree -> __points[i].data, ctx -> dims*sizeof(float_t));
+        }
+
+        big_ordered_buffer_to_file(ctx, data_to_write, sizeof(float_t), ctx -> local_n_points * ctx -> dims, "bb/ordered_data.npy");
+        free(data_to_write);
+    #endif
+
+    for(int i = 0; i < ctx -> world_size; ++i)
+    {
+        if(data_to_send_per_proc[i])  free(data_to_send_per_proc[i]);
+        if(local_idx_of_the_point[i]) free(local_idx_of_the_point[i]);
+    }
+
+    free(data_to_send_per_proc);
+    free(local_idx_of_the_point);
+    free(heap_batches_per_node);
+    free(max_dists_per_node);
+    //free(rcv_heap_batches);
+    free(rcv_work_batches);
+    free(point_to_rcv_count);
+    free(point_to_snd_count);
+    free(point_to_snd_capacity);
+
+    free(rcv_count);
+    free(snd_count);
+    free(rcv_displ);
+    free(snd_displ);
+    free(__heap_batches_to_rcv);
+    free(__heap_batches_to_snd);
+    free(__rcv_points);
+    free(__snd_points);
+    free(__max_dist_snd);
+    free(__max_dist_rcv);
+
 }
 
-void build_local_tree(global_context_t* ctx, kdtree_v2* local_tree)
+void build_local_tree(global_context_t* ctx, kdtree_t* local_tree)
 {
-    local_tree -> root = build_tree_kdtree_v2(local_tree -> _nodes, local_tree -> n_nodes, ctx -> dims);
+    build_tree_kdtree(local_tree);
 }
 
 
diff --git a/src/tree/tree.h b/src/tree/tree.h
index 1d84445..4fd2fd6 100644
--- a/src/tree/tree.h
+++ b/src/tree/tree.h
@@ -1,9 +1,9 @@
 #pragma once 
 #include "../common/common.h"
 #include "heap.h"
-#include "kdtreeV2.h"
-
+#include "kdtree.h"
 #include <stdlib.h>
+
 #include <stdio.h>
 #include <memory.h>
 #include <math.h>
@@ -52,9 +52,13 @@ typedef struct partition_t
 	int n_procs;
 	int start_proc;
 	size_t n_points;
-	float_t* base_ptr;
 	int lr;
 	struct top_kdtree_node_t* parent;
+    //support for parallel partition to be put into a union maybe
+    idx_t  offset;
+    //support for single thread paritition to be put into a union maybe
+	point_t* base_ptr;
+
 } partition_t;
 
 typedef struct partition_queue_t
@@ -87,19 +91,33 @@ typedef struct top_kdtree_t
 	struct top_kdtree_node_t* root;
 } top_kdtree_t;
 
+typedef struct partition_utils_t
+{
+   size_t* lt_count; 
+   size_t* gt_count; 
+   size_t* lt_displ; 
+   size_t* gt_displ; 
+} partition_utils_t;
+
 
 
 void     simulate_master_read_and_scatter(int, size_t, global_context_t* ); 
 void     top_tree_init(global_context_t *ctx, top_kdtree_t *tree); 
-void     build_top_kdtree(global_context_t *ctx, pointset_t *og_pointset, top_kdtree_t *tree, int n_bins, float_t tolerance);
+void     build_top_kdtree(global_context_t *ctx, top_kdtree_t *tree, idx_t n_bins, float_t tolerance);
 void     exchange_points(global_context_t* ctx, top_kdtree_t* tree);
-void     build_local_tree(global_context_t* ctx, kdtree_v2* local_tree);
-void     mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtree_t* top_tree, kdtree_v2* local_tree, float_t* data, int k);
+void     build_local_tree(global_context_t* ctx, kdtree_t* local_tree);
+size_t   partition_data_around_value(point_t *array, size_t vec_len, size_t compare_dim, size_t left, size_t right, float_t pivot_value);
+size_t   parallel_partition_data_around_value(partition_utils_t* p_putils, float_t* in, 
+                                            float_t* out, size_t vec_len, size_t compare_dim,
+                                            size_t left, size_t right, float_t pivot_value);
+void     mpi_all_knn_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtree_t* top_tree, kdtree_t* local_tree, int k);
 float_t  compute_ID_two_NN_ML(global_context_t* ctx, datapoint_info_t* dp_info, idx_t n, int verbose);
 void     compute_density_kstarnn_rma_v2(global_context_t* ctx, const float_t d, int verbose);
 void     compute_correction(global_context_t* ctx, float_t Z);
 int      foreign_owner(global_context_t*, idx_t);
 void     top_tree_free(global_context_t *ctx, top_kdtree_t *tree);
+void     parallel_build_top_kdtree(global_context_t *ctx, top_kdtree_t *tree, idx_t oversampling);
+void     mpi_knn_search(global_context_t* ctx, top_kdtree_t* top_tree, kdtree_t* local_tree, float_t* data, idx_t n_data, idx_t vec_len, int k);
 
 
-- 
GitLab


From 88d661af5d326620a684986312bcc6013664ce16 Mon Sep 17 00:00:00 2001
From: Francesco Tomba <francy273998@gmail.com>
Date: Tue, 14 Apr 2026 11:40:28 +0200
Subject: [PATCH 2/2] got 10d 1B

---
 run_leo             |  13 +-
 src/common/common.c |   1 +
 src/common/common.h |  10 +-
 src/main/main.c     | 222 +++++++++++------
 src/tree/kdtree.h   |  11 +-
 src/tree/tree.c     | 529 ++++++++++++++++++++++++++++++++++++++++-
 src/tree/tree.h     |   6 +-
 src/tree/vptree.h   | 563 ++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 1267 insertions(+), 88 deletions(-)
 create mode 100644 src/tree/vptree.h

diff --git a/run_leo b/run_leo
index 497cdb7..efc8a39 100644
--- a/run_leo
+++ b/run_leo
@@ -31,13 +31,18 @@ export PSM2_MQ_RECVREQS_MAX=268435456
 rm bb/*
 mkdir bb
 
-OUT_ASSIGNMENT=/leonardo_scratch/large/userexternal/ftomba00/assignment
-OUT_DATA=/leonardo_scratch/large/userexternal/ftomba00/data
-
+OUT_DIR=/leonardo_scratch/large/userexternal/ftomba00/exp1M
 IN_DATA=/leonardo_work/EUHPC_D18_045
 
+rm -rf ${OUT_DIR}
+mkdir ${OUT_DIR}
+
+
 #10^6 points 
-time mpirun -n ${SLURM_NTASKS} --map-by ppr:1:socket:PE=${SLURM_CPUS_PER_TASK}  ./main -t f32 -k 100 -i ${IN_DATA}/norm_data/std_LR_091_0001 -d 5 -a ${OUT_ASSIGNMENT} -o ${OUT_DATA}
+time mpirun -n ${SLURM_NTASKS} --map-by ppr:1:socket:PE=${SLURM_CPUS_PER_TASK}  ./main -t f32 -k 100 -i ${IN_DATA}/norm_data/std_LR_091_0001 -d 5 -o ${OUT_DIR}
+
+
+## to modify those one below
 #time mpirun -n ${SLURM_NTASKS} --map-by core  ./main -t f32 -i ${IN_DATA}/norm_data/std_LR_091_0001 -d 5 -a ${OUT_ASSIGNMENT} -o ${OUT_DATA}
 
 #34 * 10^6 points
diff --git a/src/common/common.c b/src/common/common.c
index 3a5d155..ee9e766 100644
--- a/src/common/common.c
+++ b/src/common/common.c
@@ -22,6 +22,7 @@ void get_context(global_context_t* ctx)
     ctx -> local_datapoints = NULL;
     ctx -> __local_heap_buffers = NULL;
     ctx -> input_data_in_float32 = -1;
+    ctx -> local_tree_type = KD;
     ctx -> dims = 0;
     ctx -> k = 300;
     ctx -> z = 3;
diff --git a/src/common/common.h b/src/common/common.h
index ad6cd47..c92c2b2 100644
--- a/src/common/common.h
+++ b/src/common/common.h
@@ -11,16 +11,16 @@
 //#include <stdarg.h>
 
 // #define PARALLEL_FIX_BORDERS
-#define WRITE_SHUFFLED_DATA
-#define WRITE_NGBH
+// #define WRITE_SHUFFLED_DATA
+// #define WRITE_NGBH
 // #define WRITE_TOP_NODES
-#define WRITE_DENSITY
+// #define WRITE_DENSITY
 // #define WRITE_CLUSTER_ASSIGN_H1
 // #define WRITE_BORDERS
 // #define WRITE_CENTERS_PRE_MERGING
 // #define WRITE_MERGES_INFO
 // #define WRITE_MERGING_TABLE
-#define WRITE_FINAL_ASSIGNMENT
+// #define WRITE_FINAL_ASSIGNMENT
 
 // #define PRINT_NGBH_EXCHANGE_SCHEME
 // #define PRINT_H2_COMM_SCHEME
@@ -162,6 +162,7 @@ typedef struct datapoint_info_t {
     float_t log_rho_err;    //
 } datapoint_info_t;
 
+enum local_tree_type {VP, KD};
 
 typedef struct global_context_t 
 {
@@ -188,6 +189,7 @@ typedef struct global_context_t
     idx_t* og_idxs;                                 //original indexes
     heap_node_t* __local_heap_buffers;              //buffer that stores nearest neighbors
 	MPI_Comm mpi_communicator;                      //mpi communicator
+    enum local_tree_type local_tree_type;           //local tree strategy
     int input_data_in_float32;
     char input_data_file[DEFAULT_STR_LEN];
     char output_assignment_file[DEFAULT_STR_LEN];
diff --git a/src/main/main.c b/src/main/main.c
index c82e660..039eaa8 100644
--- a/src/main/main.c
+++ b/src/main/main.c
@@ -8,29 +8,6 @@
 #include <unistd.h>
 #include <getopt.h>
 
-
-#ifdef AMONRA
-    #pragma message "Hi, you are on amonra"
-    #define OUT_CLUSTER_ASSIGN "/beegfs/ftomba/phd/results/final_assignment.npy"
-    #define OUT_DATA           "/beegfs/ftomba/phd/results/ordered_data.npy"
-#endif
-
-#ifdef LEONARDO
-    #define OUT_CLUSTER_ASSIGN "/leonardo_scratch/large/userexternal/ftomba00/out_dadp/final_assignment.npy"
-    #define OUT_DATA           "/leonardo_scratch/large/userexternal/ftomba00/out_dadp/ordered_data.npy"
-#endif
-
-#ifdef LUMI
-    #define OUT_CLUSTER_ASSIGN "~/scratch_dadp/out_dadp/final_assignment.npy"
-    #define OUT_DATA           "~/scratch_dadp/out_dadp/ordered_data.npy"
-#endif
-
-#ifndef  OUT_CLUSTER_ASSIGN
-    #define OUT_CLUSTER_ASSIGN "final_assignment.npy"
-    #define OUT_DATA           "ordered_data.npy"
-#endif
-
-
 #ifdef THREAD_FUNNELED
     #define THREAD_LEVEL MPI_THREAD_FUNNELED
 #else
@@ -43,10 +20,10 @@ struct option long_options[] =
     {"in-data" , required_argument, 0, 'i'},
     {"in-dtype", required_argument, 0, 't'},
     {"in-dims" , required_argument, 0, 'd'},
-    {"out-data", optional_argument, 0, 'o'},
-    {"out-assignment", optional_argument, 0, 'a'},
+    {"out-directory", required_argument, 0, 'o'},
     {"kngbh", optional_argument, 0, 'k'},
     {"z", optional_argument, 0, 'z'},
+    {"knn-strategy", optional_argument, 0, 's'},
     {"help", optional_argument, 0, 'h'},
     {0, 0, 0, 0}
 };
@@ -58,12 +35,10 @@ const char* help = "Distributed Advanced Density Peak\n"\
                    "    -d --in-dims        (required) number of dimensions of the data file, dadp expects something\n"\
                    "                                   of the form N x d where N is inferred from the lenght of the\n"\
                    "                                   data file\n"\
-                   "    -o --out-data       (optional) output path for the data, the datafile is shuffled between\n"\
-                   "                                   mpi tasks and datapoints are ordered default is `out_data` \n"\
-                   "    -a --out-assignment (optional) output path for the cluster assignment output ranges [0 ... Nc - 1]\n"\
-                   "                                   for core points halo points have indices [-Nc ... -1] conversion\n"\
-                   "                                   of idx for an halo point is cluster_idx = -halo_idx - 1, default is `out_assignment`\n"\
+                   "    -o --out-directory  (required) output path for the data, cluster assignment and original indices  \n"\
+                   "                                   of the datapoints. Produces files like `data.[rank]`, `og_idx.[rank]` and `assignment.[rank]`\n"\
                    "    -k --kngbh          (optional) number of nearest neighbors to compute\n"\
+                   "    -s --knn-strategy   (optional) strategy for local knn (allowed values 'kd' for kdtree, 'vp' for ball-tree (vantage point)\n"\
                    "    -z --z              (optional) number of nearest neighbors to compute\n";
 
 void parse_args(global_context_t* ctx, int argc, char** argv)
@@ -72,10 +47,8 @@ void parse_args(global_context_t* ctx, int argc, char** argv)
     int opt;
     int input_file_set = 0;
     int input_type_set = 0;
-    snprintf(ctx -> output_assignment_file, DEFAULT_STR_LEN, "%s", OUT_CLUSTER_ASSIGN);
-    snprintf(ctx -> output_data_file, DEFAULT_STR_LEN, "%s", OUT_DATA);
 
-    while((opt = getopt_long(argc, argv, "i:t:d:o:a:k:z:h", long_options, NULL)) != -1)
+    while((opt = getopt_long(argc, argv, "i:t:d:o:k:z:hs:", long_options, NULL)) != -1)
     {
         switch(opt)
         {
@@ -105,9 +78,6 @@ void parse_args(global_context_t* ctx, int argc, char** argv)
             case 'o':
                 strncpy(ctx -> output_data_file, optarg, DEFAULT_STR_LEN);
                 break;
-            case 'a':
-                strncpy(ctx -> output_assignment_file, optarg, DEFAULT_STR_LEN);
-                break;
             case 'k':
                 ctx -> k = atoi(optarg);
                 break;
@@ -118,6 +88,22 @@ void parse_args(global_context_t* ctx, int argc, char** argv)
                 mpi_printf(ctx, "%s\n", help);
                 MPI_Finalize();
                 exit(0);
+            
+            case 's':
+                if(strncmp("vp", optarg, 2) == 0)
+                {
+                    ctx -> local_tree_type = VP;
+                }
+                else if(strncmp("kd", optarg, 2) == 0)
+                {
+                    ctx -> local_tree_type = KD;
+                }
+                else
+                {
+                    printf("Cannot find valid local tree type selection, exiting\n");
+                    exit(0);
+                }
+                break;
 
             default:
                 mpi_printf(ctx, "%s\n", help);
@@ -161,8 +147,7 @@ void print_hello(global_context_t* ctx)
     mpi_printf(ctx, "Data file  .............> %s\n", ctx -> input_data_file);
     mpi_printf(ctx, "Input Type .............> float%d\n", ctx -> input_data_in_float32 ? 32 : 64);
     mpi_printf(ctx, "Dimensions .............> %d\n", ctx -> dims);
-    mpi_printf(ctx, "Output data file .......> %s\n", ctx -> output_data_file);
-    mpi_printf(ctx, "Output assignment file -> %s\n", ctx -> output_assignment_file);
+    mpi_printf(ctx, "Output directory .......> %s\n", ctx -> output_data_file);
     mpi_printf(ctx, "k ......................> %lu\n", ctx -> k);
     mpi_printf(ctx, "Z ......................> %.2lf\n", ctx -> z);
     mpi_printf(ctx, "\nRUNNING!\n");
@@ -255,15 +240,33 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx)
     int halo = MY_TRUE;
     float_t tol = 0.002;
 
-    if(I_AM_MASTER && ctx -> world_size <= 6)
+    if(ctx -> world_size <= 6)
     {
-        test_file_path(ctx -> output_data_file);
-        test_file_path(ctx -> output_assignment_file);
+        if(I_AM_MASTER)
+        {
+            char out_file[200];
+
+            snprintf(out_file, 200, "%s/og_idx", ctx->output_data_file);
+            test_file_path(out_file);
+
+            snprintf(out_file, 200, "%s/assignment", ctx->output_data_file);
+            test_file_path(out_file);
+
+            snprintf(out_file, 200, "%s/data", ctx->output_data_file);
+            test_file_path(out_file);
+        }
     }
     else
     {
-        test_distributed_file_path(ctx, ctx -> output_data_file);
-        test_distributed_file_path(ctx, ctx -> output_assignment_file);
+        char out_file[200];
+        snprintf(out_file, 200, "%s/og_idx", ctx->output_data_file);
+        test_distributed_file_path(ctx, out_file);
+
+        snprintf(out_file, 200, "%s/assignment", ctx->output_data_file);
+        test_distributed_file_path(ctx, out_file);
+
+        snprintf(out_file, 200, "%s/data", ctx->output_data_file);
+        test_distributed_file_path(ctx, out_file);
     }
     
 
@@ -358,10 +361,6 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx)
     elapsed_time = TIME_STOP;
     LOG_WRITE("Top kdtree build and domain decomposition", elapsed_time);
 
-    TIME_START;
-    kdtree_t local_tree;
-    kdtree_initialize( &local_tree, ctx -> local_data, ctx -> local_n_points, (unsigned int)ctx -> dims);
-
     datapoint_info_t* dp_info = (datapoint_info_t*)MY_MALLOC(ctx -> local_n_points * sizeof(datapoint_info_t));            
     for(uint64_t i = 0; i < ctx -> local_n_points; ++i)
     {
@@ -378,22 +377,64 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx)
     }
     ctx -> local_datapoints = dp_info;
 
-    build_tree_kdtree_parallel(&local_tree);
-    // build_tree_kdtree_sample(&local_tree);
-    ctx -> local_data = local_tree.data;
-    
-    elapsed_time = TIME_STOP;
-    LOG_WRITE("Local trees init and build", elapsed_time);
-
     TIME_START;
-    MPI_DB_PRINT("----- Performing ngbh search -----\n");
-    MPI_Barrier(ctx -> mpi_communicator);
 
-    mpi_all_knn_search(ctx, dp_info, &tree, &local_tree, ctx -> k);
+    void* opaque_local_tree;
+
+    switch(ctx -> local_tree_type)
+    {
+        case KD:
+            {
+                kdtree_t local_tree;
+                kdtree_initialize( &local_tree, ctx -> local_data, ctx -> local_n_points, (unsigned int)ctx -> dims);
+
+
+                build_tree_kdtree_parallel(&local_tree);
+                // build_tree_kdtree_sample(&local_tree);
+                ctx -> local_data = local_tree.data;
+                
+                elapsed_time = TIME_STOP;
+                LOG_WRITE("Local kdtrees init and build", elapsed_time);
+
+                TIME_START;
+                MPI_DB_PRINT("----- Performing ngbh search -----\n");
+                MPI_Barrier(ctx -> mpi_communicator);
+
+                mpi_all_knn_search_kdtree(ctx, dp_info, &tree, &local_tree, ctx -> k);
+
+                MPI_Barrier(ctx -> mpi_communicator);
+                elapsed_time = TIME_STOP;
+                LOG_WRITE("Total time for all knn search w. local kdtrees", elapsed_time);
+                opaque_local_tree = &local_tree;
+            }
+            break;
+        case VP:
+            {
+                vptree_t local_tree;
+                vptree_initialize( &local_tree, ctx -> local_data, ctx -> local_n_points, (unsigned int)ctx -> dims);
+
+
+                build_tree_vptree(&local_tree);
+                // build_tree_kdtree_sample(&local_tree);
+                ctx -> local_data = local_tree.data;
+
+                elapsed_time = TIME_STOP;
+                LOG_WRITE("Vptrees init and build", elapsed_time);
+
+                TIME_START;
+                MPI_DB_PRINT("----- Performing ngbh search -----\n");
+                MPI_Barrier(ctx -> mpi_communicator);
+
+                mpi_all_knn_search_vptree(ctx, dp_info, &tree, &local_tree, ctx -> k);
+
+                MPI_Barrier(ctx -> mpi_communicator);
+                elapsed_time = TIME_STOP;
+                LOG_WRITE("Total time for all knn search w. vptrees", elapsed_time);
+                opaque_local_tree = &local_tree;
+            }
+            break;
+    }
 
-    MPI_Barrier(ctx -> mpi_communicator);
-    elapsed_time = TIME_STOP;
-    LOG_WRITE("Total time for all knn search", elapsed_time)
 
 
     TIME_START;
@@ -434,40 +475,77 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx)
 
     
     TIME_START;
-    int* cl = (int*)MY_MALLOC(ctx -> local_n_points * sizeof(int));
+    int* cl = (int*)MY_MALLOC(2 * ctx -> local_n_points * sizeof(int));
     float_t* data_to_write = (float_t*)MY_MALLOC(ctx -> local_n_points * ctx -> k * sizeof(float_t));
 
-    for(int i = 0; i < ctx -> local_n_points; ++i)
+    switch(ctx -> local_tree_type)
     {
-        cl[i] = ctx -> local_datapoints[i].cluster_idx;
-        idx_t idx = local_tree.__points[i].array_idx;
-        memcpy(data_to_write + idx*ctx -> dims, local_tree.__points[i].data, ctx -> dims*sizeof(float_t));
+        case KD:
+            {
+                kdtree_t local_tree = *((kdtree_t*)opaque_local_tree);
+                for(int i = 0; i < ctx -> local_n_points; ++i)
+                {
+                    cl[i] = ctx -> local_datapoints[i].cluster_idx;
+                    idx_t idx = local_tree.__points[i].array_idx;
+                    memcpy(data_to_write + idx*ctx -> dims, local_tree.__points[i].data, ctx -> dims*sizeof(float_t));
+                }
+                kdtree_free(&local_tree);
+            }
+            break;
+
+        case VP:
+            {
+                vptree_t local_tree = *((vptree_t*)opaque_local_tree);
+                for(int i = 0; i < ctx -> local_n_points; ++i)
+                {
+                    cl[i] = ctx -> local_datapoints[i].cluster_idx;
+                    idx_t idx = local_tree.__points[i].array_idx;
+                    memcpy(data_to_write + idx*ctx -> dims, local_tree.__points[i].data, ctx -> dims*sizeof(float_t));
+                }
+                vptree_free(&local_tree);
+            }
+            break;
+
+        default:
+            break;
     }
 
     if(ctx -> world_size <= 32)
     {
-        big_ordered_buffer_to_file(ctx, cl, sizeof(int), ctx -> local_n_points, ctx -> output_assignment_file);
-        big_ordered_buffer_to_file(ctx, data_to_write, sizeof(double), ctx -> local_n_points * ctx -> dims, ctx -> output_data_file);
+        char out_file[200];
+
+        snprintf(out_file, 200, "%s/og_idx", ctx->output_data_file);
+        big_ordered_buffer_to_file(ctx, ctx->og_idxs, sizeof(idx_t), ctx -> local_n_points, out_file);
 
-        free(cl);
+        snprintf(out_file, 200, "%s/assignment", ctx->output_data_file);
+        big_ordered_buffer_to_file(ctx, cl, sizeof(int), ctx -> local_n_points, out_file);
 
+        snprintf(out_file, 200, "%s/data", ctx->output_data_file);
+        big_ordered_buffer_to_file(ctx, data_to_write, sizeof(double), ctx -> local_n_points * ctx -> dims, out_file);
     }
     else
     {
-        distributed_buffer_to_file(ctx, cl, sizeof(int), ctx -> local_n_points, ctx -> output_assignment_file);
-        distributed_buffer_to_file(ctx, data_to_write, sizeof(double), ctx -> local_n_points * ctx -> dims, ctx -> output_data_file);
+        char out_file[200];
 
-        free(cl);
+        snprintf(out_file, 200, "%s/og_idx", ctx->output_data_file);
+        distributed_buffer_to_file(ctx, ctx->og_idxs, sizeof(idx_t), ctx -> local_n_points, out_file);
 
+        snprintf(out_file, 200, "%s/assignment", ctx->output_data_file);
+        distributed_buffer_to_file(ctx, cl, sizeof(int), ctx -> local_n_points, out_file);
+
+        snprintf(out_file, 200, "%s/data", ctx->output_data_file);
+        distributed_buffer_to_file(ctx, data_to_write, sizeof(double), ctx -> local_n_points * ctx -> dims, out_file);
     }
 
+
+
     elapsed_time = TIME_STOP;
     LOG_WRITE("Write results to file", elapsed_time);
     
     
+    free(cl);
     free(data_to_write);
     top_tree_free(ctx, &tree);
-    kdtree_free(&local_tree);
     //clusters_free(&clusters);
 
     free(send_counts);
diff --git a/src/tree/kdtree.h b/src/tree/kdtree.h
index 8a0c9be..56062b2 100644
--- a/src/tree/kdtree.h
+++ b/src/tree/kdtree.h
@@ -10,7 +10,7 @@
 #include <time.h>
 #include <float.h>
 
-#define DEFAULT_LEAF_SIZE 32
+#define DEFAULT_LEAF_SIZE 256
 
 #define ALIGNMENT 64
 #define CHECK_ALLOCATION_NO_CTX(x) if(!x){printf("[!!!] Failed allocation: %s at line %d \n", __FILE__, __LINE__ ); exit(1);}
@@ -49,6 +49,15 @@
     #define MAX(x,y) ((x > y) ? (x) : (y))
 #endif
 
+#ifndef POINT
+    #define POINT
+    typedef struct point_t
+    {
+        idx_t    array_idx;
+        float_t* data;
+    } point_t;
+#endif
+
 typedef struct 
 {
     float_t* lb;
diff --git a/src/tree/tree.c b/src/tree/tree.c
index 4dc46cf..3f8b3e4 100644
--- a/src/tree/tree.c
+++ b/src/tree/tree.c
@@ -11,6 +11,7 @@
 #include "heap.h"
 #include "kdtree.h"
 #include "mpi.h"
+#include "vptree.h"
 #include <math.h>
 #include <stdint.h>
 #include <stdio.h>
@@ -56,7 +57,7 @@ void swap_data_element(point_t *a, point_t *b, size_t vec_len) {
     idx_t tmp_dp;
     tmp_dp = a->array_idx;
     a->array_idx = b->array_idx;
-    a->array_idx = tmp_dp;
+    b->array_idx = tmp_dp;
 
     float_t tmp;
     for (size_t i = 0; i < vec_len; ++i) 
@@ -934,7 +935,7 @@ void parallel_build_top_kdtree(global_context_t *ctx, top_kdtree_t *tree, idx_t
     // size_t tot_n_points = 0;
     // MPI_Allreduce(&(ctx->local_n_points), &tot_n_points, 1, MPI_UINT64_T, MPI_SUM, ctx->mpi_communicator);
 
-    MPI_Alltoall(&(ctx->local_n_points), 1, MPI_UINT64_T, ctx->rank_n_points, 1, MPI_UINT64_T, ctx->mpi_communicator);
+    MPI_Allgather(&(ctx->local_n_points), 1, MPI_UINT64_T, ctx->rank_n_points, 1, MPI_UINT64_T, ctx->mpi_communicator);
 
     ctx->rank_idx_start[0] = 0;
     for(int i = 1; i < ctx->world_size; ++i)
@@ -1035,7 +1036,7 @@ void build_top_kdtree(global_context_t *ctx, top_kdtree_t *tree, idx_t n_bins, f
     size_t tot_n_points = 0;
     MPI_Allreduce(&(ctx->local_n_points), &tot_n_points, 1, MPI_UINT64_T, MPI_SUM, ctx->mpi_communicator);
 
-    MPI_Alltoall(&(ctx->local_n_points), 1, MPI_UINT64_T, ctx->rank_n_points, 1, MPI_UINT64_T, ctx->mpi_communicator);
+    MPI_Allgather(&(ctx->local_n_points), 1, MPI_UINT64_T, ctx->rank_n_points, 1, MPI_UINT64_T, ctx->mpi_communicator);
 
     ctx->rank_idx_start[0] = 0;
     for(int i = 1; i < ctx->world_size; ++i)
@@ -1362,7 +1363,6 @@ void exchange_points(global_context_t* ctx, top_kdtree_t* tree)
     int len      = ctx -> local_n_points;
     float_t* curr_data = ctx -> local_data;
 
-    HERE
     partition_offset[0] = 0;
     for(int owner = 1; owner < ctx -> world_size; ++owner)
     {
@@ -2385,8 +2385,521 @@ void mpi_knn_search(global_context_t* ctx, top_kdtree_t* top_tree, kdtree_t* loc
     free(__local_heap_buffers);
 }
 
+void mpi_all_knn_search_vptree(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtree_t* top_tree, vptree_t* local_tree, int k)
+{
+    /* local search */
+    /* print diagnostics */
+    print_diagnositcs(ctx, k);
+    ctx -> k = (idx_t)k;
+    
+    TIME_DEF;
+    double elapsed_time;
+
+    TIME_START;
+    //printf("rank %d local_n_points %lu\n", ctx -> mpi_rank, ctx -> local_n_points);
+    MPI_Barrier(ctx -> mpi_communicator);
+    //ctx -> __local_heap_buffers = (heap_node_t*)MY_MALLOC(ctx -> local_n_points * k * sizeof(heap_node));
+    MPI_Alloc_mem(ctx -> local_n_points * k * sizeof(heap_node_t), MPI_INFO_NULL, &(ctx -> __local_heap_buffers));
+    float_t avg_visited_nodes = 0;
+
+    #pragma omp parallel for schedule(dynamic, 32) reduction(+:avg_visited_nodes)
+    for(int p = 0; p < ctx -> local_n_points; ++p)
+    {
+        idx_t idx = local_tree -> __points[p].array_idx;
+        /* actually we want to preserve the heap_t to then insert guesses from other nodes */
+        heap_t tmp_heap;
+        tmp_heap.data  = ctx -> __local_heap_buffers + k * idx;
+        tmp_heap.count = 0;
+        tmp_heap.size  = k;
+
+        //dp_info[idx].ngbh = knn_kdtree_t_no_heapsort(local_tree -> _nodes[p].data, local_tree -> root, k);
+
+        idx_t visited_nodes = 0;
+        knn_vptree_search(local_tree->__points[p].data, local_tree-> __points, local_tree->__nodes, 
+                          local_tree->root, &tmp_heap, local_tree->dims, &visited_nodes);
+
+        convert_heap_idx_to_global(ctx, &tmp_heap);
+        dp_info[idx].cluster_idx = -1;
+        dp_info[idx].is_center = 0;
+        dp_info[idx].array_idx = idx + ctx -> idx_start;
+        dp_info[idx].ngbh = tmp_heap.data;
+        avg_visited_nodes += (float_t)visited_nodes/(float_t)ctx->local_n_points;
+    }
+
+    MPI_PRINT("AVG visited nodes on master %lf\n", avg_visited_nodes);
+    elapsed_time = TIME_STOP;
+    LOG_WRITE("Local neighborhood search", elapsed_time);
+    //printf("rank %d elapsed_time %lf\n", ctx->mpi_rank, elapsed_time);
+
+
+    TIME_START;
+    /* find if a points needs a refine on the global tree */
+    float_t** data_to_send_per_proc    = (float_t**)MY_MALLOC(ctx -> world_size * sizeof(float_t*));
+    int**     local_idx_of_the_point   = (int**)MY_MALLOC(ctx -> world_size * sizeof(int*));
+    int*      point_to_snd_count       = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
+    int*      point_to_snd_capacity    = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
+
+    for(int i = 0; i < ctx -> world_size; ++i)
+    {
+        /* allocate it afterwards */
+        point_to_snd_capacity[i] = 0;
+        point_to_snd_count[i]    = 0;
+    }
+
+    /* NEW VERSION double tree walk */
+    #pragma omp parallel for schedule(dynamic, 128)
+    for(int i = 0; i < ctx -> local_n_points; ++i)
+    {
+        idx_t idx = local_tree -> __points[i].array_idx;
+        float_t max_dist = dp_info[idx].ngbh[0].value;
+        float_t* point   = local_tree->__points[i].data;
+        
+        tree_walk_v2_find_n_points(ctx, top_tree -> root, idx, max_dist, point, point_to_snd_capacity);
+
+    }
+
+    /* allocate needed space */
+    for(int i = 0; i < ctx -> world_size; ++i)
+    {
+        int np = point_to_snd_capacity[i];
+        data_to_send_per_proc[i]  = (float_t*)MY_MALLOC(np * (ctx -> dims) * sizeof(float_t));    
+        local_idx_of_the_point[i] = (int*)MY_MALLOC(np * sizeof(int));    
+
+    }
+
+    #pragma omp parallel for schedule(dynamic, 128)
+    for(int i = 0; i < ctx -> local_n_points; ++i)
+    {
+        idx_t idx = local_tree -> __points[i].array_idx;
+        float_t max_dist = dp_info[idx].ngbh[0].value;
+        float_t* point   = local_tree -> __points[i].data;
+
+        tree_walk_v2_append_points(ctx, top_tree -> root, idx, max_dist, point, data_to_send_per_proc, local_idx_of_the_point, point_to_snd_count);
+    }
+
+    elapsed_time = TIME_STOP;
+    LOG_WRITE("Finding points to refine", elapsed_time);
+
+    TIME_START;
+    int* point_to_rcv_count = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
+
+    /* exchange points to work on*/
+    MPI_Alltoall(point_to_snd_count, 1, MPI_INT, point_to_rcv_count, 1, MPI_INT, ctx -> mpi_communicator);
+
+    int* rcv_count = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
+    int* snd_count = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
+    int* rcv_displ = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
+    int* snd_displ = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
+
+    /*compute counts and displs*/
+    rcv_displ[0] = 0;
+    snd_displ[0] = 0;
+
+
+    rcv_count[0] = point_to_rcv_count[0] * (ctx -> dims);
+    snd_count[0] = point_to_snd_count[0] * (ctx -> dims);
+
+    int tot_points_rcv = point_to_rcv_count[0];
+    int tot_points_snd = point_to_snd_count[0];
+    int tot_count = rcv_count[0];
+
+    for(int i = 1; i < ctx -> world_size; ++i)
+    {
+        rcv_count[i] = point_to_rcv_count[i] * (ctx -> dims);        
+        snd_count[i] = point_to_snd_count[i] * (ctx -> dims);        
+
+        tot_count += rcv_count[i];
+        tot_points_rcv += point_to_rcv_count[i];
+        tot_points_snd += point_to_snd_count[i];
+
+        rcv_displ[i] = rcv_displ[i - 1] + rcv_count[i - 1];
+        snd_displ[i] = snd_displ[i - 1] + snd_count[i - 1];
+    }
+
+    float_t* __rcv_points = (float_t*)MY_MALLOC(tot_points_rcv * (ctx -> dims) * sizeof(float_t));
+    float_t* __snd_points = (float_t*)MY_MALLOC(tot_points_snd * (ctx -> dims) * sizeof(float_t)); 
+
+    float_t* __max_dist_snd = (float_t*)MY_MALLOC(tot_points_snd * sizeof(float_t));
+    float_t* __max_dist_rcv = (float_t*)MY_MALLOC(tot_points_rcv * sizeof(float_t));
+
+
+
+    /* copy data to send in contiguous memory */
+    for(int i = 0; i < ctx -> world_size; ++i)
+    {
+        memcpy(__snd_points + snd_displ[i], data_to_send_per_proc[i], snd_count[i] * sizeof(float_t));
+    }
+
+    size_t dist_idx = 0;
+    for(idx_t i = 0; i < ctx -> world_size; ++i)
+    {
+        for(idx_t j = 0; j < point_to_snd_count[i]; ++j)
+        {
+            idx_t point_idx = local_idx_of_the_point[i][j];
+            // for each point take the distance of its furthest nearest neighbor
+            // then communicate it in order to prune the search on foreign nodes
+            __max_dist_snd[dist_idx] = ctx -> local_datapoints[point_idx].ngbh[0].value; 
+            ++dist_idx;
+        }
+    }
+
+    MPI_Alltoallv(__snd_points, snd_count, snd_displ, MPI_MY_FLOAT, 
+                  __rcv_points, rcv_count, rcv_displ, MPI_MY_FLOAT, ctx -> mpi_communicator); 
+
+    float_t** rcv_work_batches = (float_t**)MY_MALLOC(ctx -> world_size * sizeof(float_t*));
+    for(int i = 0; i < ctx -> world_size; ++i) 
+    {
+        rcv_work_batches[i]       = __rcv_points + rcv_displ[i];
+    }
+
+    MPI_Status status;
+    int flag;
+
+    for(int i = 0; i < ctx -> world_size; ++i)
+    {
+        rcv_count[i] = rcv_count[i] / ctx -> dims;
+        rcv_displ[i] = rcv_displ[i] / ctx -> dims;
+        snd_count[i] = snd_count[i] / ctx -> dims;
+        snd_displ[i] = snd_displ[i] / ctx -> dims;
+
+    }
+
+    MPI_Alltoallv(__max_dist_snd, snd_count, snd_displ, MPI_MY_FLOAT, 
+                  __max_dist_rcv, rcv_count, rcv_displ, MPI_MY_FLOAT, ctx -> mpi_communicator); 
+
+    /* prepare heap_t batches */
+
+    //int work_batch_stride = 1 + ctx -> dims;
+    int work_batch_stride = ctx -> dims;
+
+    /* Note that I then have to recieve an equal number of heaps as the one I sent out before */
+    heap_node_t* __heap_batches_to_snd = (heap_node_t*)MY_MALLOC((uint64_t)k * (uint64_t)tot_points_rcv * sizeof(heap_node_t));
+
+    if( __heap_batches_to_snd == NULL)
+    {
+        DB_PRINT("Rank %d failed to allocate snd_heaps %luB required\n",ctx -> mpi_rank, (uint64_t)k * (uint64_t)tot_points_snd * sizeof(heap_node_t));
+        exit(1);
+    }
+
+    MPI_Barrier(ctx -> mpi_communicator);
+
+
+    // max dists will contain the maximum distance to have to search
+    heap_node_t** heap_batches_per_node = (heap_node_t**)MY_MALLOC(ctx -> world_size * sizeof(heap_node_t*));
+    float_t**   max_dists_per_node    = (float_t**)MY_MALLOC(ctx -> world_size * sizeof(float_t*));
+
+    for(int p = 0; p < ctx -> world_size; ++p) 
+    {
+        heap_batches_per_node[p] = __heap_batches_to_snd + (uint64_t)rcv_displ[p] * (uint64_t)k;
+        max_dists_per_node[p]    = __max_dist_rcv        + (uint64_t)rcv_displ[p];
+    }
+
+    /* compute everything */
+    elapsed_time = TIME_STOP;
+    LOG_WRITE("Exchanging points", elapsed_time);
+    MPI_Barrier(ctx -> mpi_communicator);
+
+
+    TIME_START;
+
+    /* ngbh search on recieved points */
+    for(int p = 0; p < ctx -> world_size; ++p)
+    {
+        if(point_to_rcv_count[p] > 0 && p != ctx -> mpi_rank)
+        {
+            #pragma omp parallel for schedule(dynamic, 128)
+            for(int batch = 0; batch < point_to_rcv_count[p]; ++batch)
+            {
+                heap_t H;
+                H.count = k;
+                H.size = k;
+                H.data = heap_batches_per_node[p] + (uint64_t)k * (uint64_t)batch; 
+                float_t max_dist = max_dists_per_node[p][batch];
+                for(idx_t i = 0; i < H.size; ++i)
+                {
+                    H.data[i].value = max_dist;
+                    H.data[i].array_idx = MY_SIZE_MAX; 
+                }
+
+                //float_t* point = rcv_work_batches[p] + batch * work_batch_stride + 1; 
+                idx_t visited_nodes = 0;
+                float_t* point = rcv_work_batches[p] + (uint64_t)batch * (uint64_t)work_batch_stride; 
+                knn_vptree_search(point, local_tree-> __points, local_tree->__nodes, 
+                                local_tree->root, &H, local_tree->dims, &visited_nodes);
+                convert_heap_idx_to_global(ctx, &H);
+            }
+        }
+    }
+    
+    /* sendout results */
+
+    /* 
+     * dummy pointers to clarify counts in this part
+     * act like an alias for rcv and snd counts
+     */ 
+
+    int* ngbh_to_send = point_to_rcv_count;
+    int* ngbh_to_recv = point_to_snd_count;
+
+    /*
+     * counts are inverted since I have to recieve as many batches as points I
+     * Have originally sended
+     */
+
+    elapsed_time = TIME_STOP;
+    LOG_WRITE("Ngbh search for foreing points", elapsed_time);
+
+    TIME_START;
+    
+    MPI_Datatype MPI_my_heap;
+    MPI_Type_contiguous(k * sizeof(heap_node_t), MPI_BYTE, &MPI_my_heap);
+    MPI_Barrier(ctx -> mpi_communicator);
+    MPI_Type_commit(&MPI_my_heap);
+
+
+    /* -------------------------------------
+     * ALTERNATIVE TO ALL TO ALL FOR BIG MSG
+     * ------------------------------------- */
+    
+    MPI_Barrier(ctx -> mpi_communicator);
+    int default_msg_len = MAX_MSG_SIZE / (k * sizeof(heap_node_t));
+
+    int* already_sent_points = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
+    int* already_rcvd_points = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
+
+    /* allocate a request array to keep track of all requests going out*/
+    MPI_Request* req_array;
+    int req_num = 0;
+    for(int i = 0; i < ctx -> world_size; ++i)
+    {
+        req_num += ngbh_to_send[i] > 0 ? ngbh_to_send[i]/default_msg_len + 1 : 0;
+    }
+
+    req_array = (MPI_Request*)MY_MALLOC(req_num * sizeof(MPI_Request));
+
+
+    for(int i = 0; i < ctx -> world_size; ++i)
+    {
+        already_sent_points[i] = 0;
+        already_rcvd_points[i] = 0;
+    }
+
+    int req_idx = 0;
+
+    // find the maximum number of points to send */
+    
+    idx_t max_n_recv = 0;
+    for(int i = 0; i < ctx -> world_size; ++i)
+    {
+        max_n_recv = MAX(max_n_recv, (idx_t)ngbh_to_recv[i]);
+    }
+
+    MPI_DB_PRINT("Using default message length %lu\n", default_msg_len);
 
-void mpi_all_knn_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtree_t* top_tree, kdtree_t* local_tree, int k)
+    heap_node_t* __heap_batches_to_rcv = (heap_node_t*)MY_MALLOC((uint64_t)k * (uint64_t)max_n_recv * sizeof(heap_node_t));
+    if( __heap_batches_to_rcv == NULL)
+    {
+        DB_PRINT("Rank %d failed to allocate rcv_heaps %luB required\n",ctx -> mpi_rank, (uint64_t)k * (uint64_t)max_n_recv* sizeof(heap_node_t));
+        exit(1);
+    }
+
+    /* make a ring */
+
+    MPI_Barrier(ctx -> mpi_communicator);
+    for(int i = 1; i < ctx -> world_size; ++i)
+    {
+        int rank_to_send = (ctx -> mpi_rank + i) % (ctx -> world_size);
+        int rank_to_recv = (ctx -> world_size + ctx -> mpi_rank - i) % (ctx -> world_size);
+
+        /* do things */
+
+        /* send out one batch */
+
+        #ifdef PRINT_NGBH_EXCHANGE_SCHEME
+            MPI_DB_PRINT("[--- ROUND %d ----]\n", i);
+            MPI_Barrier(ctx -> mpi_communicator);
+            DB_PRINT("[RANK %d] sending to %d tot: %d [%luB]---- recieving from %d %d\n", ctx -> mpi_rank, 
+                    rank_to_send, ngbh_to_send[rank_to_send], ngbh_to_send[rank_to_send]*sizeof(heap_node_t), rank_to_recv, ngbh_to_recv[rank_to_recv]);
+        #endif
+        if(ngbh_to_send[rank_to_send] > 0)
+        {
+            int count_send = 0;
+            while(already_sent_points[rank_to_send] < ngbh_to_send[rank_to_send])
+            {
+                MPI_Request request;
+                count_send = MIN((int)default_msg_len, (int)(ngbh_to_send[rank_to_send] - already_sent_points[rank_to_send] ));
+
+                MPI_Isend(  heap_batches_per_node[rank_to_send] + k * already_sent_points[rank_to_send], count_send,  
+                        MPI_my_heap, rank_to_send, 0, ctx -> mpi_communicator, &request);
+
+                already_sent_points[rank_to_send] += count_send;
+                req_array[req_idx] = request;
+                ++req_idx;
+            }
+        }
+
+        if(     ngbh_to_send[rank_to_send] != already_sent_points[rank_to_send] || 
+                point_to_rcv_count[rank_to_send] != already_sent_points[rank_to_send])
+        {
+            DB_PRINT("ERROR OCCURRED in sending points [rank %d] got %d expected %d\n", 
+                    ctx -> mpi_rank, already_rcvd_points[rank_to_send], point_to_rcv_count[rank_to_send]);
+        }
+        
+        MPI_Barrier(ctx -> mpi_communicator);
+
+        if(ngbh_to_recv[rank_to_recv] > 0)
+        {
+            flag = 0;
+            while(already_rcvd_points[rank_to_recv] < ngbh_to_recv[rank_to_recv])
+            {
+                MPI_Probe(rank_to_recv, MPI_ANY_TAG, ctx -> mpi_communicator, &status);
+                MPI_Request request;
+                int count_recv; 
+                int source = status.MPI_SOURCE;
+                MPI_Get_count(&status, MPI_my_heap, &count_recv);
+                /* recieve each slice */
+
+                MPI_Recv(__heap_batches_to_rcv + k * already_rcvd_points[rank_to_recv], 
+                        count_recv, MPI_my_heap, source, MPI_ANY_TAG, ctx -> mpi_communicator, &status);
+
+                already_rcvd_points[rank_to_recv] += count_recv;
+            }
+        }
+
+        if(     ngbh_to_recv[rank_to_recv] != already_rcvd_points[rank_to_recv] || 
+                point_to_snd_count[rank_to_recv] != already_rcvd_points[rank_to_recv])
+        {
+            DB_PRINT("ERROR OCCURRED in recieving points [rank %d] got %d expected %d\n", 
+                    ctx -> mpi_rank, already_rcvd_points[rank_to_recv], point_to_snd_count[rank_to_recv]);
+        }
+
+        /* merge lists */
+        #pragma omp parallel for schedule(dynamic, 32)
+        for(int b = 0; b < ngbh_to_recv[rank_to_recv]; ++b)
+        {
+            int idx = local_idx_of_the_point[rank_to_recv][b];
+            /* retrieve the heap_t */
+            heap_t H;
+            H.count = k;
+            H.size  = k;
+            H.data  = __heap_batches_to_rcv + k*b;
+            /* insert the points into the heap_t */
+            for(int j = 0; j < k; ++j)
+            {
+                heap_t tmp_heap;
+                tmp_heap.size  = k;
+                tmp_heap.count = k;
+                tmp_heap.data  = dp_info[idx].ngbh;
+                // insert it only of != from max
+                if(H.data[j].array_idx != MY_SIZE_MAX) max_heap_insert(&(tmp_heap), H.data[j].value, H.data[j].array_idx);
+            }
+        }
+
+
+
+        MPI_Barrier(ctx -> mpi_communicator);
+    }
+
+    elapsed_time = TIME_STOP;
+    LOG_WRITE("Merging results", elapsed_time);
+
+    
+    MPI_Testall(req_idx, req_array, &flag, MPI_STATUSES_IGNORE);
+
+    if(flag == 0)
+    {
+        DB_PRINT("ERROR OCCURRED Rank %d has unfinished communications\n", ctx -> mpi_rank);
+        exit(1);
+    }
+    free(req_array);
+    free(already_sent_points);
+    free(already_rcvd_points);
+
+    /* -------------------------------------------------------- */
+    /* heapsort them */
+
+    TIME_START;
+
+    #pragma omp parallel for schedule(dynamic, 128)
+    for(int i = 0; i < ctx -> local_n_points; ++i)
+    {
+        heap_t tmp_heap;
+        tmp_heap.size   = k;
+        tmp_heap.count = k;
+        tmp_heap.data  = dp_info[i].ngbh;
+
+        heap_sort(&(tmp_heap));
+    }
+
+    elapsed_time = TIME_STOP;
+    LOG_WRITE("Sorting negihborhoods", elapsed_time);
+
+
+    #if defined(WRITE_NGBH)
+    MPI_DB_PRINT("Writing ngbh to files\n");
+        char ngbh_out[80];
+        sprintf(ngbh_out, "./bb/rank_%d.ngbh",ctx -> mpi_rank);
+        FILE* file = fopen(ngbh_out,"w");
+        if(!file) 
+        {
+            printf("Cannot open file %s\n",ngbh_out);
+        }
+        else
+        {
+            for(int i = 0; i < ctx -> local_n_points; ++i)
+            {
+                fwrite(dp_info[i].ngbh, sizeof(heap_node_t), k, file);
+            }
+            fclose(file);
+        }
+    #endif
+
+    MPI_Barrier(ctx -> mpi_communicator);
+
+
+    #if defined(WRITE_SHUFFLED_DATA)
+        float_t* data_to_write = (float_t*)MY_MALLOC(ctx -> local_n_points * ctx -> k * sizeof(float_t));
+        for(int i = 0; i < ctx -> local_n_points; ++i)
+        {
+            idx_t idx = local_tree -> __points[i].array_idx;
+            memcpy(data_to_write + idx*ctx -> dims, local_tree -> __points[i].data, ctx -> dims*sizeof(float_t));
+        }
+
+        big_ordered_buffer_to_file(ctx, data_to_write, sizeof(float_t), ctx -> local_n_points * ctx -> dims, "bb/ordered_data.npy");
+        free(data_to_write);
+    #endif
+
+    for(int i = 0; i < ctx -> world_size; ++i)
+    {
+        if(data_to_send_per_proc[i])  free(data_to_send_per_proc[i]);
+        if(local_idx_of_the_point[i]) free(local_idx_of_the_point[i]);
+    }
+
+    free(data_to_send_per_proc);
+    free(local_idx_of_the_point);
+    free(heap_batches_per_node);
+    free(max_dists_per_node);
+    //free(rcv_heap_batches);
+    free(rcv_work_batches);
+    free(point_to_rcv_count);
+    free(point_to_snd_count);
+    free(point_to_snd_capacity);
+
+    free(rcv_count);
+    free(snd_count);
+    free(rcv_displ);
+    free(snd_displ);
+    free(__heap_batches_to_rcv);
+    free(__heap_batches_to_snd);
+    free(__rcv_points);
+    free(__snd_points);
+    free(__max_dist_snd);
+    free(__max_dist_rcv);
+
+}
+
+
+void mpi_all_knn_search_kdtree(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtree_t* top_tree, kdtree_t* local_tree, int k)
 {
     /* local search */
     /* print diagnostics */
@@ -2402,7 +2915,8 @@ void mpi_all_knn_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kd
     //ctx -> __local_heap_buffers = (heap_node_t*)MY_MALLOC(ctx -> local_n_points * k * sizeof(heap_node));
     MPI_Alloc_mem(ctx -> local_n_points * k * sizeof(heap_node_t), MPI_INFO_NULL, &(ctx -> __local_heap_buffers));
 
-    #pragma omp parallel for schedule(dynamic, 32)
+    float_t avg_visited_nodes = 0;
+    #pragma omp parallel for schedule(dynamic, 32) reduction(+:avg_visited_nodes)
     for(int p = 0; p < ctx -> local_n_points; ++p)
     {
         idx_t idx = local_tree -> __points[p].array_idx;
@@ -2423,7 +2937,10 @@ void mpi_all_knn_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kd
         dp_info[idx].is_center = 0;
         dp_info[idx].array_idx = idx + ctx -> idx_start;
         dp_info[idx].ngbh = tmp_heap.data;
+        avg_visited_nodes += (float_t)visited_nodes/(float_t)ctx->local_n_points;
     }
+
+    MPI_PRINT("AVG visited nodes on master %lf\n", avg_visited_nodes);
     elapsed_time = TIME_STOP;
     LOG_WRITE("Local neighborhood search", elapsed_time);
     //printf("rank %d elapsed_time %lf\n", ctx->mpi_rank, elapsed_time);
diff --git a/src/tree/tree.h b/src/tree/tree.h
index 4fd2fd6..bf4f318 100644
--- a/src/tree/tree.h
+++ b/src/tree/tree.h
@@ -2,6 +2,7 @@
 #include "../common/common.h"
 #include "heap.h"
 #include "kdtree.h"
+#include "vptree.h"
 #include <stdlib.h>
 
 #include <stdio.h>
@@ -110,7 +111,10 @@ size_t   partition_data_around_value(point_t *array, size_t vec_len, size_t comp
 size_t   parallel_partition_data_around_value(partition_utils_t* p_putils, float_t* in, 
                                             float_t* out, size_t vec_len, size_t compare_dim,
                                             size_t left, size_t right, float_t pivot_value);
-void     mpi_all_knn_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtree_t* top_tree, kdtree_t* local_tree, int k);
+
+void     mpi_all_knn_search_kdtree(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtree_t* top_tree, kdtree_t* local_tree, int k);
+void     mpi_all_knn_search_vptree(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtree_t* top_tree, vptree_t* local_tree, int k);
+
 float_t  compute_ID_two_NN_ML(global_context_t* ctx, datapoint_info_t* dp_info, idx_t n, int verbose);
 void     compute_density_kstarnn_rma_v2(global_context_t* ctx, const float_t d, int verbose);
 void     compute_correction(global_context_t* ctx, float_t Z);
diff --git a/src/tree/vptree.h b/src/tree/vptree.h
new file mode 100644
index 0000000..88a06c7
--- /dev/null
+++ b/src/tree/vptree.h
@@ -0,0 +1,563 @@
+#pragma once
+/*
+ * vptree.h  —  Vantage-Point Tree, single-header, drop-in for kdtree.h
+ *
+ * Keeps every convention from kdtree.h:
+ *   - same point_t / idx_t / float_t / heap_t types
+ *   - same MY_MALLOC / FREE / ALIGNMENT macros
+ *   - same OpenMP parallel build strategy
+ *   - same data-layout compaction after build
+ *   - same public interface: vptree_initialize / build_tree_vptree /
+ *                            knn_vptree_search / vptree_free
+ *
+ * WHY VP-TREE IS FASTER THAN KD-TREE AT d>=8
+ * -------------------------------------------
+ * KD-tree prunes with a hyperrectangle check:
+ *     min_box_dist(q, box) > tau   →  prune
+ * In d=10 a hyperrectangle covers ~2^10 orthants, so the test rarely prunes.
+ *
+ * VP-tree prunes with a hypersphere annulus check (ALL squared, zero sqrt):
+ *     vp_can_prune_left / vp_can_prune_right  (see below)
+ * This exploits the metric directly and prunes aggressively even at d=10.
+ *
+ * PRUNING DERIVATION (sqrt-free, all squared distances)
+ * ------------------------------------------------------
+ * Notation (all values are SQUARED distances):
+ *   dqv = dist²(query, vp)
+ *   mu  = median dist² from vp to subtree points  (split radius²)
+ *   tau = heap->data[0].value  (current k-th worst neighbour dist²)
+ *
+ *   LEFT  child: points p with dist²(p,vp) <= mu
+ *   RIGHT child: points p with dist²(p,vp) >  mu
+ *
+ * Triangle inequality in Euclidean space (non-squared):
+ *   dist(q,p) >= |dist(q,vp) - dist(p,vp)|
+ *
+ * For LEFT child (dist(p,vp) <= sqrt(mu)):
+ *   min dist(q,p) >= sqrt(dqv) - sqrt(mu)
+ *   Prune LEFT if sqrt(dqv) - sqrt(mu) > sqrt(tau)   [only when dqv > mu]
+ *   Squaring both sides (both positive by guard):
+ *     let s = dqv + mu - tau
+ *     prune if s > 0  AND  s*s > 4*dqv*mu
+ *
+ * For RIGHT child (dist(p,vp) > sqrt(mu)):
+ *   min dist(q,p) >= sqrt(mu) - sqrt(dqv)
+ *   Prune RIGHT if sqrt(mu) - sqrt(dqv) > sqrt(tau)  [only when mu > dqv]
+ *   Squaring:
+ *     let s = mu + dqv - tau
+ *     prune if mu > dqv  AND  s > 0  AND  s*s > 4*dqv*mu
+ *
+ * Result: zero sqrt calls per node, only multiplications and comparisons.
+ *
+ * USAGE
+ * -----
+ *   #include "heap.h"
+ *   #include "vptree.h"
+ *
+ *   vptree_t tree;
+ *   vptree_initialize(&tree, data_ptr, n_points, dims);
+ *   build_tree_vptree(&tree);
+ *
+ *   heap_t H;   // your existing heap, initialised to size k
+ *   idx_t vis = 0;
+ *   knn_vptree_search(query_vec, tree.__points, tree.__nodes,
+ *                     tree.root, &H, tree.dims, &vis);
+ *
+ *   vptree_free(&tree);
+ */
+
+#include "heap.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <math.h>
+#include <limits.h>
+#include <stdint.h>
+#include <time.h>
+#include <float.h>
+#include <omp.h>
+
+/* ── Reuse all macros from kdtree.h if already defined ───────────────────── */
+
+#ifndef ALIGNMENT
+    #define ALIGNMENT 64
+#endif
+
+#ifndef CHECK_ALLOCATION_NO_CTX
+    #define CHECK_ALLOCATION_NO_CTX(x) \
+        if(!x){printf("[!!!] Failed allocation: %s at line %d \n", __FILE__, __LINE__ ); exit(1);}
+#endif
+
+#ifndef MY_MALLOC
+    #define MY_MALLOC(n) ({size_t __n = (n + ALIGNMENT - 1) & ~(ALIGNMENT - 1); \
+                           void* p = aligned_alloc(ALIGNMENT, __n); \
+                           CHECK_ALLOCATION_NO_CTX(p); memset(p, 0, __n); p; })
+#endif
+
+#ifndef FREE
+    #define FREE(x) if(x) free(x);
+#endif
+
+#ifndef MIN
+    #define MIN(x,y) ((x) < (y) ? (x) : (y))
+#endif
+#ifndef MAX
+    #define MAX(x,y) ((x) > (y) ? (x) : (y))
+#endif
+
+#ifndef DEFAULT_LEAF_SIZE
+    #define DEFAULT_LEAF_SIZE 256
+#endif
+
+/* float_t / idx_t: honour the same USE_FLOAT32 / USE_INT32 guards */
+#ifndef float_t
+    #ifdef USE_FLOAT32
+        #define float_t float
+    #else
+        #define float_t double
+    #endif
+#endif
+
+#ifndef idx_t
+    #ifdef USE_INT32
+        #define idx_t uint32_t
+        #define MY_SIZE_MAX UINT32_MAX
+    #else
+        #define idx_t uint64_t
+        #define MY_SIZE_MAX UINT64_MAX
+    #endif
+#endif
+
+/* ── VP-tree build cutoff (fall back to serial below this) ──────────────── */
+#ifndef VP_SERIAL_BUILD_CUTOFF
+    #define VP_SERIAL_BUILD_CUTOFF 4096
+#endif
+
+/* ═══════════════════════════════════════════════════════════════════════════
+ * Node layout — flat pool, heap-ordered (left=2i+1, right=2i+2)
+ *
+ * mu stores the SQUARED median distance from vp to its subtree points.
+ * Everything is squared throughout — no sqrt anywhere.
+ * ═══════════════════════════════════════════════════════════════════════════ */
+
+typedef struct {
+    float_t  mu;               /* split radius²: median dist²(p,vp) in subtree */
+    idx_t    vp_point_idx;     /* index in __points[] of the vantage point      */
+    uint32_t leaf_point_idx;   /* (leaf) start index in __points[]              */
+    uint32_t leaf_point_count; /* (leaf) number of points in leaf               */
+    uint8_t  is_leaf;
+} vpnode_t;
+
+typedef struct {
+    float_t*        data;      /* raw data (same role as kdtree_t.data)         */
+    uint32_t        dims;
+    idx_t           n_points;
+    idx_t           root;      /* always 0                                      */
+    struct point_t* __points;  /* permuted point array                          */
+    vpnode_t*       __nodes;   /* flat node pool                                */
+} vptree_t;
+
+/* ═══════════════════════════════════════════════════════════════════════════
+ * Distance — squared, no sqrt, same semantics as kdtree.h euclidean_distance
+ * ═══════════════════════════════════════════════════════════════════════════ */
+
+static inline float_t vp_dist_sq(const float_t* __restrict__ a,
+                                  const float_t* __restrict__ b,
+                                  const idx_t dims)
+{
+    float_t d = 0.0;
+    /* restrict + fixed small dims → compiler emits AVX/AVX-512 with -O3 -march=native */
+    for (idx_t i = 0; i < dims; ++i) {
+        float_t diff = a[i] - b[i];
+        d += diff * diff;
+    }
+    return d;
+}
+
+/* ═══════════════════════════════════════════════════════════════════════════
+ * Pruning predicates — ZERO sqrt, pure multiply+compare
+ * All arguments are squared distances.
+ * Returns 1 if the subtree can be safely skipped.
+ * ═══════════════════════════════════════════════════════════════════════════ */
+
+/*
+ * Prune LEFT child (points with dist²(p,vp) <= mu).
+ * Original: sqrt(dqv) - sqrt(mu) > sqrt(tau)  [guard: dqv > mu]
+ * Squared:  s = dqv + mu - tau;  prune iff s > 0 && s*s > 4*dqv*mu
+ */
+static inline int vp_can_prune_left(float_t dqv, float_t mu, float_t tau)
+{
+    if (dqv <= mu) return 0;
+    float_t s = dqv + mu - tau;
+    if (s <= 0.0) return 0;
+    return (s * s) > (4.0 * dqv * mu);
+}
+
+/*
+ * Prune RIGHT child (points with dist²(p,vp) > mu).
+ * Original: sqrt(mu) - sqrt(dqv) > sqrt(tau)  [guard: mu > dqv]
+ * Squared:  s = mu + dqv - tau;  prune iff s > 0 && s*s > 4*dqv*mu
+ */
+static inline int vp_can_prune_right(float_t dqv, float_t mu, float_t tau)
+{
+    if (mu <= dqv) return 0;
+    float_t s = mu + dqv - tau;
+    if (s <= 0.0) return 0;
+    return (s * s) > (4.0 * dqv * mu);
+}
+
+/* ═══════════════════════════════════════════════════════════════════════════
+ * point_t helpers
+ * ═══════════════════════════════════════════════════════════════════════════ */
+
+static inline void vp_point_swap(point_t *x, point_t *y)
+{
+    point_t tmp = *x; *x = *y; *y = tmp;
+}
+
+/* ═══════════════════════════════════════════════════════════════════════════
+ * QuickSelect — finds k-th smallest in dists[low..high]
+ * Simultaneously permutes points[] to match.
+ * ═══════════════════════════════════════════════════════════════════════════ */
+
+static float_t vp_nth_dist(point_t* points, float_t* dists,
+                            idx_t low, idx_t high, idx_t k)
+{
+    while (low < high) {
+        idx_t   pivot_idx = low + (idx_t)rand() % (high - low + 1);
+        float_t pivot_d   = dists[pivot_idx];
+
+        /* Move pivot to end */
+        vp_point_swap(points + pivot_idx, points + high);
+        float_t tmp = dists[pivot_idx]; dists[pivot_idx] = dists[high]; dists[high] = tmp;
+
+        /* Lomuto partition */
+        idx_t store = low;
+        for (idx_t j = low; j < high; ++j) {
+            if (dists[j] < pivot_d) {
+                vp_point_swap(points + store, points + j);
+                tmp = dists[store]; dists[store] = dists[j]; dists[j] = tmp;
+                ++store;
+            }
+        }
+        vp_point_swap(points + store, points + high);
+        tmp = dists[store]; dists[store] = dists[high]; dists[high] = tmp;
+
+        if      (store == k) return dists[store];
+        else if (store  < k) low  = store + 1;
+        else                 high = store - 1;
+    }
+    return dists[low];
+}
+
+/* ═══════════════════════════════════════════════════════════════════════════
+ * Vantage point selection — random
+ *
+ * Variance-maximising selection (32x32 distance probes per node) was the
+ * original approach but costs ~1B extra distance computations at build time
+ * for 52M points, causing 82s build vs 2s for KD-tree. Random selection is
+ * equally correct (VP-tree correctness never depends on VP quality) and
+ * builds in ~2s. The marginal pruning improvement from better VP selection
+ * does not justify the build cost at this scale.
+ * ═══════════════════════════════════════════════════════════════════════════ */
+
+static inline idx_t vp_select_vantage_point(point_t* points,
+                                             idx_t start, idx_t end)
+{
+    (void)points;
+    idx_t n = end - start + 1;
+    if (n <= 2) return start;
+    return start + (idx_t)rand() % n;
+}
+
+/* ═══════════════════════════════════════════════════════════════════════════
+ * BUILD — serial recursive
+ * ═══════════════════════════════════════════════════════════════════════════ */
+
+static idx_t vp_recursive_build(point_t* points, vpnode_t* nodes,
+                                  float_t* dist_scratch,
+                                  idx_t node_idx,
+                                  idx_t start, idx_t end, idx_t dims);
+
+static idx_t vp_recursive_build(point_t* points, vpnode_t* nodes,
+                                  float_t* dist_scratch,
+                                  idx_t node_idx,
+                                  idx_t start, idx_t end, idx_t dims)
+{
+    vpnode_t* node = nodes + node_idx;
+    idx_t     n    = end - start + 1;
+
+    /* ── Leaf ─────────────────────────────────────────────────────────── */
+    if (n <= DEFAULT_LEAF_SIZE) {
+        node->is_leaf          = 1;
+        node->leaf_point_idx   = (uint32_t)start;
+        node->leaf_point_count = (uint32_t)n;
+        node->vp_point_idx     = start;
+        node->mu               = 0.0;
+        return node_idx;
+    }
+
+    /* ── Pick VP, place at front ──────────────────────────────────────── */
+    idx_t vp_local = vp_select_vantage_point(points, start, end);
+    vp_point_swap(points + start, points + vp_local);
+    node->vp_point_idx = start;
+
+    const float_t* vp_data    = points[start].data;
+    idx_t          inner_start = start + 1;
+    idx_t          inner_n     = end - inner_start + 1;
+
+    /* ── Compute squared distances VP → all other points ─────────────── */
+    float_t* d = dist_scratch + inner_start;
+    for (idx_t i = 0; i < inner_n; ++i)
+        d[i] = vp_dist_sq(vp_data, points[inner_start + i].data, dims);
+
+    /* ── QuickSelect: partition around median ─────────────────────────── */
+    idx_t   median_rank = inner_n / 2;
+    float_t mu          = vp_nth_dist(points + inner_start, d,
+                                       0, inner_n - 1, median_rank);
+    node->mu = mu;  /* squared distance */
+
+    /*
+     * After QuickSelect:
+     *   [inner_start .. inner_start+median_rank]  dist² <= mu  → LEFT subtree
+     *   [inner_start+median_rank+1 .. end]         dist² >  mu  → RIGHT subtree
+     */
+    idx_t left_end    = start + median_rank;
+    idx_t right_start = left_end + 1;
+
+    if (left_end >= inner_start)
+        vp_recursive_build(points, nodes, dist_scratch,
+                            node_idx * 2 + 1, inner_start, left_end, dims);
+
+    if (right_start <= end)
+        vp_recursive_build(points, nodes, dist_scratch,
+                            node_idx * 2 + 2, right_start, end, dims);
+
+    return node_idx;
+}
+
+/* ═══════════════════════════════════════════════════════════════════════════
+ * BUILD — parallel (OMP tasks, mirrors kdtree.h parallel_recursive_make_tree)
+ * ═══════════════════════════════════════════════════════════════════════════ */
+
+static idx_t vp_parallel_build(point_t* points, vpnode_t* nodes,
+                                float_t* dist_scratch,
+                                idx_t node_idx,
+                                idx_t start, idx_t end, idx_t dims);
+
+static idx_t vp_parallel_build(point_t* points, vpnode_t* nodes,
+                                float_t* dist_scratch,
+                                idx_t node_idx,
+                                idx_t start, idx_t end, idx_t dims)
+{
+    if ((end - start) < VP_SERIAL_BUILD_CUTOFF)
+        return vp_recursive_build(points, nodes, dist_scratch,
+                                   node_idx, start, end, dims);
+
+    vpnode_t* node = nodes + node_idx;
+
+    idx_t vp_local = vp_select_vantage_point(points, start, end);
+    vp_point_swap(points + start, points + vp_local);
+    node->vp_point_idx = start;
+
+    const float_t* vp_data    = points[start].data;
+    idx_t          inner_start = start + 1;
+    idx_t          inner_n     = end - inner_start + 1;
+
+    float_t* d = dist_scratch + inner_start;
+    #pragma omp simd
+    for (idx_t i = 0; i < inner_n; ++i)
+        d[i] = vp_dist_sq(vp_data, points[inner_start + i].data, dims);
+
+    idx_t   median_rank = inner_n / 2;
+    float_t mu          = vp_nth_dist(points + inner_start, d,
+                                       0, inner_n - 1, median_rank);
+    node->mu = mu;
+
+    idx_t left_end    = start + median_rank;
+    idx_t right_start = left_end + 1;
+    idx_t lch         = node_idx * 2 + 1;
+    idx_t rch         = node_idx * 2 + 2;
+
+    #pragma omp task shared(points, nodes, dist_scratch)
+    {
+        if (left_end >= inner_start)
+            vp_parallel_build(points, nodes, dist_scratch,
+                               lch, inner_start, left_end, dims);
+    }
+
+    #pragma omp task shared(points, nodes, dist_scratch)
+    {
+        if (right_start <= end)
+            vp_parallel_build(points, nodes, dist_scratch,
+                               rch, right_start, end, dims);
+    }
+
+    #pragma omp taskwait
+
+    return node_idx;
+}
+
+/* ═══════════════════════════════════════════════════════════════════════════
+ * SEARCH — exact KNN query
+ *
+ * Drop-in for knn_sub_tree_search() in kdtree.h.
+ * Same signature, same heap semantics, outputs squared distances.
+ *
+ * Visit order: near shell first → tau tightens before evaluating far shell.
+ *   dqv <= mu  (inside ball)  → LEFT  is near, visit LEFT  first
+ *   dqv >  mu  (outside ball) → RIGHT is near, visit RIGHT first
+ *
+ * After near-side visit, re-read tau before deciding on far side.
+ * ═══════════════════════════════════════════════════════════════════════════ */
+
+static void knn_vptree_search(const float_t*  query,
+                               const point_t*  points,
+                               const vpnode_t* nodes,
+                               idx_t           node_idx,
+                               heap_t*         H,
+                               const idx_t     dims,
+                               idx_t*          vis_nodes)
+{
+    const vpnode_t* node = nodes + node_idx;
+    (*vis_nodes)++;
+
+    /* ── Leaf: brute-force, two-pass (compute then insert) ───────────── */
+    if (node->is_leaf) {
+        idx_t          base  = node->leaf_point_idx;
+        idx_t          count = node->leaf_point_count;
+        const float_t* dp    = points[base].data;
+
+        float_t dists[DEFAULT_LEAF_SIZE];
+        #pragma GCC unroll 4
+        for (idx_t i = 0; i < count; ++i)
+            dists[i] = vp_dist_sq(query, dp + i * dims, dims);
+
+        for (idx_t i = 0; i < count; ++i)
+            max_heap_insert(H, dists[i], points[base + i].array_idx);
+
+        return;
+    }
+
+    /* ── Internal node ───────────────────────────────────────────────── */
+    float_t dqv = vp_dist_sq(query, points[node->vp_point_idx].data, dims);
+    max_heap_insert(H, dqv, points[node->vp_point_idx].array_idx);
+
+    float_t tau           = H->data[0].value;
+    int     heap_not_full = (H->count < H->size);
+    float_t mu            = node->mu;
+    idx_t   lch           = node_idx * 2 + 1;
+    idx_t   rch           = node_idx * 2 + 2;
+
+    if (dqv <= mu) {
+        /* Inside ball — LEFT is near side */
+        if (heap_not_full || !vp_can_prune_left(dqv, mu, tau))
+            knn_vptree_search(query, points, nodes, lch, H, dims, vis_nodes);
+
+        tau           = H->data[0].value;   /* re-read: may have tightened */
+        heap_not_full = (H->count < H->size);
+
+        if (heap_not_full || !vp_can_prune_right(dqv, mu, tau))
+            knn_vptree_search(query, points, nodes, rch, H, dims, vis_nodes);
+    } else {
+        /* Outside ball — RIGHT is near side */
+        if (heap_not_full || !vp_can_prune_right(dqv, mu, tau))
+            knn_vptree_search(query, points, nodes, rch, H, dims, vis_nodes);
+
+        tau           = H->data[0].value;
+        heap_not_full = (H->count < H->size);
+
+        if (heap_not_full || !vp_can_prune_left(dqv, mu, tau))
+            knn_vptree_search(query, points, nodes, lch, H, dims, vis_nodes);
+    }
+}
+
+/* ═══════════════════════════════════════════════════════════════════════════
+ * PUBLIC API  (mirrors kdtree.h)
+ * ═══════════════════════════════════════════════════════════════════════════ */
+
+static void vptree_initialize(vptree_t* tree, float_t* data,
+                               size_t n_points, unsigned int dims)
+{
+    tree->dims     = dims;
+    tree->data     = data;
+    tree->n_points = n_points;
+    tree->root     = 0;
+
+    tree->__points = (point_t*)MY_MALLOC(n_points * sizeof(point_t));
+    /* 4*n_points: safe upper bound for heap-ordered binary tree node pool */
+    tree->__nodes  = (vpnode_t*)MY_MALLOC(4 * n_points * sizeof(vpnode_t));
+
+    for (idx_t i = 0; i < n_points; ++i) {
+        tree->__points[i].array_idx = i;
+        tree->__points[i].data      = data + i * dims;
+    }
+}
+
+static void build_tree_vptree(vptree_t* tree)
+{
+    float_t* dist_scratch = (float_t*)MY_MALLOC(tree->n_points * sizeof(float_t));
+
+    static bool seeded = false;
+    if (!seeded) { srand(42); seeded = true; }
+
+    #pragma omp parallel
+    {
+        #pragma omp single
+        {
+            tree->root = vp_parallel_build(tree->__points, tree->__nodes,
+                                            dist_scratch,
+                                            0, 0, tree->n_points - 1,
+                                            tree->dims);
+        }
+    }
+
+    FREE(dist_scratch);
+
+    /* Compact data: permuted __points → contiguous block for cache-friendly
+     * leaf scans. Mirrors the same step in kdtree.h build_tree_kdtree(). */
+    float_t* tmp_data = (float_t*)MY_MALLOC(tree->n_points * tree->dims * sizeof(float_t));
+
+    #pragma omp parallel for schedule(static)
+    for (idx_t i = 0; i < tree->n_points; ++i) {
+        float_t* src = tree->__points[i].data;
+        memcpy(tmp_data + tree->dims * i, src, tree->dims * sizeof(float_t));
+        tree->__points[i].data = tmp_data + tree->dims * i;
+    }
+
+    free(tree->data);
+    tree->data = tmp_data;
+}
+
+static void vptree_free(vptree_t* tree)
+{
+    FREE(tree->__nodes);
+    FREE(tree->__points);
+    FREE(tree->data);
+}
+
+/* ── Debug print, first 3 levels (mirrors kdtree_print) ─────────────────── */
+
+static void vp_recursive_print(const vpnode_t* nodes, idx_t node_idx,
+                                idx_t dims, idx_t level)
+{
+    (void)dims;
+    if (level >= 3) return;
+    const vpnode_t* node = nodes + node_idx;
+    printf("--- VP LEVEL %lu ---\n", (unsigned long)level);
+    if (node->is_leaf) {
+        printf("  LEAF  start=%u  count=%u\n",
+               node->leaf_point_idx, node->leaf_point_count);
+    } else {
+        printf("  INTERNAL  vp_idx=%lu  mu²=%.6f\n",
+               (unsigned long)node->vp_point_idx, node->mu);
+        vp_recursive_print(nodes, node_idx * 2 + 1, dims, level + 1);
+        vp_recursive_print(nodes, node_idx * 2 + 2, dims, level + 1);
+    }
+}
+
+static void vptree_print(const vptree_t* tree)
+{
+    vp_recursive_print(tree->__nodes, tree->root, tree->dims, 0);
+}
-- 
GitLab