Commit aecabbb9 authored by lykos98's avatar lykos98
Browse files

Added some work on a lock-free implementation of H1; still not working as expected, and performance is poor.

parent 5e0399c9
Loading
Loading
Loading
Loading
+40 −32
Original line number Diff line number Diff line
#include "adp.h"
#include <unistd.h>

const border_t border_null = {.density = -1.0, .error = 0, .idx = NOBORDER};
const sparse_border_t sparse_border_null = {.density = -1.0, .error = 0, .idx = NOBORDER, .i = NOBORDER, .j = NOBORDER};
@@ -617,6 +618,8 @@ lock_t h1_lock_acquire(global_context_t* ctx, MPI_Win lock_window, int owner, id
        while(result == LOCK_ACQUIRED && err == MPI_SUCCESS)
        {
            err = MPI_Compare_and_swap(&state, &compare, &result, MPI_LOCK_T, owner, pos, lock_window);
            MPI_Win_flush(owner, lock_window);
            usleep(100);
        }

        if(err != MPI_SUCCESS)
@@ -677,8 +680,7 @@ clusters_t Heuristic1(global_context_t *ctx)
    MPI_Win_fence(0, win_datapoints);
    MPI_Win_lock_all(0,  win_datapoints);

#if defined(THREAD_FUNNELED)
#else
#if !defined(THREAD_FUNNELED)
    #pragma omp parallel for
#endif
    for(idx_t i = 0; i < n; ++i)
@@ -741,14 +743,17 @@ clusters_t Heuristic1(global_context_t *ctx)
    MPI_Win_create(lock_array, n * sizeof(lock_t), sizeof(lock_t), MPI_INFO_NULL, ctx -> mpi_communicator, &win_locks);
    MPI_Win_fence(0, win_locks);

#ifdef EXP_H1
    MPI_Win_lock_all(0, win_to_remove_mask);
    MPI_Win_lock_all(0, win_locks);
#endif

#ifdef EXP_H1
    printf("Using experimental h1\n");
#endif


#if defined(THREAD_FUNNELED)
#else
    #pragma omp parallel for
#if !defined(THREAD_FUNNELED)
    #pragma omp parallel for schedule(dynamic)
#endif
    for(idx_t p = 0; p < n; ++p)
    {
@@ -769,7 +774,9 @@ clusters_t Heuristic1(global_context_t *ctx)
                 *
                 * */

#ifdef EXPERIMENTAL_H1
#ifdef EXP_H1
                #pragma omp critical (h1_exp)
                {
                    int owner = foreign_owner(ctx, jidx);
                    idx_t jpos = jidx - ctx -> rank_idx_start[owner];

@@ -796,9 +803,9 @@ clusters_t Heuristic1(global_context_t *ctx)
                    }

                    state = h1_lock_free(ctx, win_locks, owner, jpos, state);
                }
#else

                #pragma omp critical (h1_centers_elimination)
                #pragma omp critical (centers_elimination)                 
                {
                    int owner = foreign_owner(ctx, jidx);
                    idx_t jpos = jidx - ctx -> rank_idx_start[owner];
@@ -822,13 +829,16 @@ clusters_t Heuristic1(global_context_t *ctx)
                    }

                    MPI_Win_unlock(owner, win_to_remove_mask);
                }
#endif
            }
        }
    }

#ifdef EXP_H1
    MPI_Win_unlock_all(win_to_remove_mask);
    MPI_Win_unlock_all(win_locks);
#endif
    
    MPI_Win_fence(0, win_to_remove_mask);
    MPI_Win_fence(0, win_locks);
@@ -1815,8 +1825,6 @@ void Heuristic3(global_context_t* ctx, clusters_t* cluster, float_t Z, int halo)

        qsort(centers_dp, cluster -> centers.count, sizeof(datapoint_info_t), compare_dp_by_cidx);

        printf("Centers\n");

        master_finds_borders(ctx, cluster, Z, surviving_clusters, centers_dp);
        master_fixes_border_matrix_and_centers(ctx, cluster, Z, old_to_new, surviving_clusters, nclus);
        free(centers_dp);
+0 −47
Original line number Diff line number Diff line
@@ -232,52 +232,7 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx)
    if (ctx->mpi_rank == 0) 
    {
        data = read_data_file(ctx,ctx -> input_data_file, ctx -> dims, ctx -> input_data_in_float32);
        
        //data = read_data_file(ctx, "../norm_data/50_blobs_more_var.npy", MY_TRUE);
        //ctx->dims = 2;
        //data = read_data_file(ctx, "../norm_data/blobs_small.npy", MY_FALSE);
        //data = read_data_file(ctx, "../norm_data/blobs_small.npy", MY_FALSE);
        // std_g0163178_Me14_091_0000
    
        // 100M points
        // 2D
        // std_g2980844_091_0000
        //data = read_data_file(ctx,"../norm_data/huge_blobs.npy",MY_FALSE);
        // 2B points
        // data = read_data_file(ctx,"../norm_data/very_huge_blobs.npy",MY_FALSE);
        // data = read_data_file(ctx,"../norm_data/hd_blobs.npy",5,MY_FALSE);
        
        //1B points
        // data = read_data_file(ctx,"../norm_data/eds_box_acc_normalized",5,MY_FALSE);
        // data = read_data_file(ctx,"../norm_data/eds_box_6d",6,MY_FALSE);

        // 190M points
        // std_g2980844_091_0000
        // data = read_data_file(ctx,"../norm_data/std_g2980844_091_0000",5,MY_TRUE);
        
        /* 1M points ca.*/
        //data = read_data_file(ctx,"../norm_data/std_LR_091_0001",5,MY_TRUE);

        /* BOX */
        // data = read_data_file(ctx,"../norm_data/std_Box_256_30_092_0000",MY_TRUE);

        /* 8M points */
        
        // data = read_data_file(ctx,"../norm_data/std_g0144846_Me14_091_0001",5,MY_TRUE);

        //88M 
        // data = read_data_file(ctx,"../norm_data/std_g5503149_091_0000",MY_TRUE);

        //
        //34 M
        //data = read_data_file(ctx,"../norm_data/std_g1212639_091_0001",MY_TRUE);
        
        //for weak scalability 
        //ctx->n_points = ctx->n_points / 4;
        //ctx->n_points = (ctx->n_points / 32) * ctx -> world_size;

        get_dataset_diagnostics(ctx, data);

    }
    
    /* communicate the total number of points*/
@@ -320,9 +275,7 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx)
                int count_send = MIN(default_msg_len, send_counts[i] - already_sent_points); 
                MPI_Send(data + displacements[i] + already_sent_points, count_send, MPI_MY_FLOAT, i, ctx -> mpi_rank, ctx -> mpi_communicator);
                already_sent_points += count_send;
                //DB_PRINT("[RANK 0] has sent to rank %d %d elements out of %lu\n",i, already_sent_points, send_counts[i]);
            }
            //DB_PRINT("------------------------------------------------\n");
        }
    }
    else
+0 −3
Original line number Diff line number Diff line
@@ -228,9 +228,6 @@ kdnode_v2* make_tree_kdnode_v2(kdnode_v2* t, int start, int end, kdnode_v2* pare
        
    }
    
    


    int median_idx = -1;
    
    //if ((end - start) < 0) return 0;
+4 −3
Original line number Diff line number Diff line
@@ -1478,7 +1478,7 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
    //ctx -> __local_heap_buffers = (heap_node*)MY_MALLOC(ctx -> local_n_points * k * sizeof(heap_node));
    MPI_Alloc_mem(ctx -> local_n_points * k * sizeof(heap_node), MPI_INFO_NULL, &(ctx -> __local_heap_buffers));

    #pragma omp parallel for
    #pragma omp parallel for schedule(dynamic)
    for(int p = 0; p < ctx -> local_n_points; ++p)
    {
        idx_t idx = local_tree -> _nodes[p].array_idx;
@@ -1663,7 +1663,7 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
        //if(count_rcv_work_batches[p] > 0)
        {
            //heap_batches_per_node[p] = (heap_node*)MY_MALLOC(k * point_to_rcv_count[p] * sizeof(heap_node));
            #pragma omp parallel for
            #pragma omp parallel for schedule(dynamic)
            for(int batch = 0; batch < point_to_rcv_count[p]; ++batch)
            {
                heap H;
@@ -1822,7 +1822,7 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
        }

        /* merge lists */
        #pragma omp paralell for
        #pragma omp parallel for
        for(int b = 0; b < ngbh_to_recv[rank_to_recv]; ++b)
        {
            int idx = local_idx_of_the_point[rank_to_recv][b];
@@ -1843,6 +1843,7 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
        }



        MPI_Barrier(ctx -> mpi_communicator);
    }