Commit bb980b8e authored by lykos98's avatar lykos98
Browse files

added some comments on structs

parent a0caf8af
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
main
sync.sh
leo_sync.sh
lumi_sync.sh
bb
**.ipynb*
scalability_results
+59 −89
Original line number Diff line number Diff line
@@ -817,20 +817,6 @@ clusters_t Heuristic1(global_context_t *ctx)
    MPI_Win_create(to_remove_mask, n * sizeof(heap_node), 1, MPI_INFO_NULL, ctx -> mpi_communicator, &win_to_remove_mask);
    MPI_Win_fence(0, win_to_remove_mask);

    /* 
     * to remove 
     * and to reimplement it using rma
     * for dp in datapoints:
     *  for ngbh in neighbors:
     *   if ngbh is mine:
     *      no_problems, do it as always
     *   else:
     *      ngbh is an external node
     *      do rma things,
     *      in particular, acquire a lock on the window, read and write things
     *      then close
     */

#if defined(THREAD_FUNNELED)
#else
    #pragma omp parallel for
@@ -851,10 +837,8 @@ clusters_t Heuristic1(global_context_t *ctx)
                {
                    /*
                     *
                     * THIS PART CAN BE SUBSTITUTED
                     * we have something like (value, idx) pairs and if we have less than 
                     * MAX_UINT32 particles we can manipulate value and idx to fit into a single
                     * UINT64 and then perform a single atomic max operation
                     * TODO: Implement it without this but using private locks
                     * use an array of locks, and compare and swap to actually gain control of the thing
                     *
                     * */
                    int owner = foreign_owner(ctx, jidx);
@@ -1118,8 +1102,6 @@ clusters_t Heuristic1(global_context_t *ctx)
void Heuristic2(global_context_t* ctx, clusters_t* cluster)
{
    /*
     * Port the current implementation to this using rma
     * then perform the matrix reduction
     *
     * Each one computes its borders, then the borders are shared, and the matrix is 
     * reduced to a single huge matrix of borders
@@ -1659,9 +1641,6 @@ void master_finds_borders(global_context_t* ctx, clusters_t* cluster, float_t Z,
        #define src surviving_clusters[merging_table[m].source]
        #define trg surviving_clusters[merging_table[m].target]

        //int re_check = ( (src != merging_table[m].source) || (trg != merging_table[m].target) );
		//if(re_check)
		{
        /* 
         * Enforce that, in case of a symmetric merging condition, the lowest idx cluster 
         * is merged into the higher idx cluster, only to preserve compatibility with 
@@ -1715,7 +1694,6 @@ void master_finds_borders(global_context_t* ctx, clusters_t* cluster, float_t Z,
        default:
            break;
        }
		}
        
        #undef src
        #undef trg
@@ -1814,14 +1792,6 @@ void Heuristic3(global_context_t* ctx, clusters_t* cluster, float_t Z, int halo)
    /*
     * Heuristic 3, from the paper of Errico, Facco, Laio & Rodriguez
     * ( https://doi.org/10.1016/j.ins.2021.01.010 )
     * Dense implementation: makes use of a dense matrix to store the
     * borders between clusters, so it is more performant when the number of clusters is low.
     *
     * args:
     * - global_context_t* ctx      : global context of the MPI process
     * - clusters_t* cluster        : cluster object storing border info and cluster centers
     * - float_t Z                  : parameter to assess density peak significance
     * - int halo                   : flag to set if you also want to compute the halo points
     */

	#define borders cluster->borders
@@ -1902,6 +1872,8 @@ void Heuristic3(global_context_t* ctx, clusters_t* cluster, float_t Z, int halo)
        clusters_reset(cluster);
    }

    /* broadcast the number of elements on those lists */

    MPI_Bcast(&(cluster -> centers.count), 1, MPI_UINT64_T, 0, ctx -> mpi_communicator);
    MPI_Bcast(&(cluster -> centers.size), 1, MPI_UINT64_T, 0, ctx -> mpi_communicator);

@@ -1992,9 +1964,10 @@ void Heuristic3(global_context_t* ctx, clusters_t* cluster, float_t Z, int halo)
				{
					int cidx = dp_info[i].cluster_idx;
					int halo_flag = dp_info[i].log_rho_c < max_border_den_array[cidx]; 
					//int halo_flag = max_border_den_array[cidx] > dp_info[i].log_rho_c  ; 

                    //changed_here

                    //doing this we can have both the assignment and if it is
                    //part of the halo, without the need for storing other info
                    //halo points have cidx < 0 (old cidx = (c + 1) * -1 )
					dp_info[i].cluster_idx = halo_flag ? (cidx * (-1)) - 1 : cidx;
				}
@@ -2012,9 +1985,6 @@ void Heuristic3(global_context_t* ctx, clusters_t* cluster, float_t Z, int halo)
    free(old_to_new);

    /*free memory and put the correct arrays into place*/
    //free(ipos.data);
    //free(jpos.data);
    //

    #ifdef WRITE_FINAL_ASSIGNMENT
        int* cl = (int*)MY_MALLOC(ctx -> local_n_points * sizeof(int));
+2 −37
Original line number Diff line number Diff line
@@ -17,13 +17,7 @@ void get_context(global_context_t* ctx)
	ctx -> ub_box 	  = NULL;
    ctx -> rank_n_points  = (int*)malloc(ctx -> world_size * sizeof(int));
    ctx -> rank_idx_start = (int*)malloc(ctx -> world_size * sizeof(int));
    ctx -> idx_halo_points_recv = NULL;
    ctx -> idx_halo_points_send = NULL;
    ctx -> n_halo_points_recv = NULL;
    ctx -> n_halo_points_send = NULL;
    ctx -> halo_datapoints  = NULL;
    ctx -> local_datapoints = NULL;
    ctx -> __recv_heap_buffers = NULL;
    ctx -> __local_heap_buffers = NULL;
}

@@ -175,35 +169,6 @@ void free_context(global_context_t* ctx)
    //}

    FREE_NOT_NULL(ctx -> local_datapoints);
    if(ctx -> halo_datapoints)
    {
        for(int i = 0; i < ctx -> world_size; ++i) 
        {
            /*
            for(int j = 0; j < ctx -> n_halo_points_recv[i]; ++j)
            {
                FREE_NOT_NULL(ctx -> halo_datapoints[i][j].ngbh.data);
            }
            */
            FREE_NOT_NULL(ctx -> halo_datapoints[i]);
        }
    }
    FREE_NOT_NULL(ctx -> halo_datapoints);
    FREE_NOT_NULL(ctx -> __recv_heap_buffers);

    if(ctx -> idx_halo_points_recv)
    {
        for(int i = 0; i < ctx -> world_size; ++i) FREE_NOT_NULL(ctx -> idx_halo_points_recv[i]);
    }
    FREE_NOT_NULL(ctx -> idx_halo_points_recv);

    if(ctx -> idx_halo_points_send)
    {
        for(int i = 0; i < ctx -> world_size; ++i) FREE_NOT_NULL(ctx -> idx_halo_points_send[i]);
    }
    FREE_NOT_NULL(ctx -> idx_halo_points_send);
    FREE_NOT_NULL(ctx -> n_halo_points_recv);
    FREE_NOT_NULL(ctx -> n_halo_points_send);
    FREE_NOT_NULL(ctx -> rank_n_points);
    FREE_NOT_NULL(ctx -> rank_idx_start);
}
+48 −45
Original line number Diff line number Diff line
@@ -21,17 +21,6 @@
//#define PRINT_H1_CLUSTER_ASSIGN_COMPLETION
//#define PRINT_ORDERED_BUFFER

/* Per-datapoint record carried through the clustering pipeline. */
typedef struct datapoint_info_t {
    heap ngbh;              /* heap storing the nearest neighbors of this point */
    int is_center;          /* flag signaling if the point is a cluster center */
    int cluster_idx;        /* index of the cluster the point is assigned to */
    idx_t array_idx;        /* global index of the point in the dataset */
    idx_t kstar;            /* kstar value required for the density computation */
    float_t g;              /* density quantity required by the ADP procedure */
    float_t log_rho;        /* log-density estimate -- NOTE(review): inferred from name, confirm */
    float_t log_rho_c;      /* corrected log-density -- TODO confirm */
    float_t log_rho_err;    /* error estimate on log_rho -- TODO confirm */
} datapoint_info_t;

#define MAX(A,B) ((A) > (B) ? (A) : (B))
#define MIN(A,B) ((A) < (B) ? (A) : (B))
@@ -131,55 +120,69 @@ typedef struct datapoint_info_t {
    #define LOG_END
#endif

typedef struct datapoint_info_t {
    /*
     * datapoint object: per-point state carried through the clustering pipeline
     */
    heap ngbh;              //heap object stores nearest neighbors of each point
    int is_center;          //flag signaling if a point is a cluster center
    int cluster_idx;        //cluster index the point is assigned to
    idx_t array_idx;        //global index of the point in the dataset
    idx_t kstar;            //kstar value required for the density computation
    float_t g;              //density quantity, required by the ADP procedure
    float_t log_rho;        //log-density estimate -- NOTE(review): inferred from name, confirm
    float_t log_rho_c;      //corrected log-density -- TODO confirm
    float_t log_rho_err;    //error estimate on log_rho -- TODO confirm
} datapoint_info_t;


struct global_context_t 
typedef struct global_context_t 
{
    /*
     * context object to store info related to each 
     * MPI process
     */
    char processor_mame[MPI_MAX_PROCESSOR_NAME];    //processor name
    int __processor_name_len;                       //processor name len
    int world_size;  
    int mpi_rank;
    int __processor_name_len;
    idx_t k;
	float_t* local_data;
	float_t* lb_box;
	float_t* ub_box;
    int* n_halo_points_recv;
    int* n_halo_points_send;
    idx_t** idx_halo_points_recv;
    idx_t** idx_halo_points_send;
    size_t n_points;
    size_t idx_start;
    size_t local_n_points;
    datapoint_info_t*  local_datapoints;
    datapoint_info_t** halo_datapoints;
    heap_node* __recieved_heap_data;
    uint32_t dims;
    int* rank_idx_start;
    int* rank_n_points;
	char processor_mame[MPI_MAX_PROCESSOR_NAME];
	MPI_Comm mpi_communicator;
    heap_node* __recv_heap_buffers;
    heap_node* __local_heap_buffers;
};

struct pointset_t
    int mpi_rank;                                   //rank of the processor
    uint32_t dims;                                  //number of dimensions of the dataset
    idx_t k;                                        //number of neighbors
	float_t* local_data;                            //pointer to the dataset stored into the node
	float_t* lb_box;                                //bounding box of the dataset
	float_t* ub_box;                                //bounding box of the dataset
    size_t n_points;                                //total number of points
    size_t idx_start;                               //starting index of the points on the processor
    size_t local_n_points;                          //number of points stored in the current processor
    datapoint_info_t*  local_datapoints;            //pointer to the datapoint properties
    int* rank_idx_start;                            //starting index of datapoints in each processor
    int* rank_n_points;                             //number of datapoints held by each processor
    heap_node* __local_heap_buffers;                //buffer that stores nearest neighbors
	MPI_Comm mpi_communicator;                      //mpi communicator
} global_context_t;

typedef struct pointset_t
{
    /*
     * Helper object used to handle top kdtree
     * construction: it represents the portion of
     * the dataset held inside one node of the tree.
     */
	size_t n_points;      //number of points in this set
	size_t __capacity;    //allocated capacity backing `data`
	uint32_t dims;        //dimensionality of each point
	float_t* data;        //point coordinates -- assumes n_points * dims contiguous layout, TODO confirm
	float_t* lb_box;      //lower bound of the bounding box (one entry per dimension)
	float_t* ub_box;      //upper bound of the bounding box (one entry per dimension)
} pointset_t;


struct lu_dynamic_array_t {
typedef struct lu_dynamic_array_t {
  idx_t *data;
  idx_t size;
  idx_t count;
};
} lu_dynamic_array_t;

typedef struct pointset_t pointset_t;
typedef struct global_context_t global_context_t;
typedef struct lu_dynamic_array_t lu_dynamic_array_t;

void mpi_printf(global_context_t*, const char *fmt, ...);
void get_context(global_context_t*);
+23 −4
Original line number Diff line number Diff line
@@ -11,12 +11,28 @@
    #define OUT_CLUSTER_ASSIGN "/beegfs/ftomba/phd/results/final_assignment.npy"
    #define OUT_HALO_FLAGS     "/beegfs/ftomba/phd/results/halo_flags.npy"
    #define OUT_DATA           "/beegfs/ftomba/phd/results/ordered_data.npy"
#else
#endif

#ifdef LEONARDO
    #define OUT_CLUSTER_ASSIGN "/leonardo_scratch/large/userexternal/ftomba00/out_dadp/final_assignment.npy"
    #define OUT_HALO_FLAGS     "/leonardo_scratch/large/userexternal/ftomba00/out_dadp/halo_flags.npy"
    #define OUT_DATA           "/leonardo_scratch/large/userexternal/ftomba00/out_dadp/ordered_data.npy"
#endif

#ifdef LUMI
    #define OUT_CLUSTER_ASSIGN "~/scratch_dadp/out_dadp/final_assignment.npy"
    #define OUT_HALO_FLAGS     "~/scratch_dadp/out_dadp/halo_flags.npy"
    #define OUT_DATA           "~/scratch_dadp/out_dadp/ordered_data.npy"
#endif

#ifndef  OUT_CLUSTER_ASSIGN
    #define OUT_CLUSTER_ASSIGN "final_assignment.npy"
    #define OUT_HALO_FLAGS     "halo_flags.npy"
    #define OUT_DATA           "ordered_data.npy"
#endif



//

#ifdef THREAD_FUNNELED
@@ -25,6 +41,7 @@
    #define THREAD_LEVEL MPI_THREAD_MULTIPLE
#endif


int main(int argc, char** argv) {
    #if defined (_OPENMP)
        int mpi_provided_thread_level;
@@ -54,6 +71,7 @@ int main(int argc, char** argv) {
        MPI_Init(NULL, NULL);
    #endif


    char processor_name[MPI_MAX_PROCESSOR_NAME];
    int name_len;
    MPI_Get_processor_name(processor_name, &name_len);
@@ -147,6 +165,7 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx)
        
        //1B points
        // data = read_data_file(ctx,"../norm_data/eds_box_acc_normalized",5,MY_FALSE);
        //data = read_data_file(ctx,"../norm_data/eds_box_6d",6,MY_FALSE);

        // 190M points
        // std_g2980844_091_0000
@@ -160,7 +179,7 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx)

        /* 8M points */
        
        // data = read_data_file(ctx,"../norm_data/std_g0144846_Me14_091_0001",MY_TRUE);
        // data = read_data_file(ctx,"../norm_data/std_g0144846_Me14_091_0001",5,MY_TRUE);

        //88M 
        // data = read_data_file(ctx,"../norm_data/std_g5503149_091_0000",MY_TRUE);
Loading