Loading Makefile +3 −1 Original line number Diff line number Diff line CC=mpicc CFLAGS=-O3 -march=native -flto -funroll-loops -fopenmp #CC=mpiicx #CFLAGS=-O3 -march=native -flto -funroll-loops -fopenmp CFLAGS=-O3 -fopenmp LDFLAGS=-lm all: main Loading run_leo +4 −3 Original line number Diff line number Diff line Loading @@ -15,8 +15,9 @@ cd $SLURM_SUBMIT_DIR module load gcc module load openmpi #module load gcc #module load openmpi module load intel-oneapi-mpi make clean make Loading @@ -33,7 +34,7 @@ mkdir bb OUT_ASSIGNMENT=/leonardo_scratch/large/userexternal/ftomba00/assignment OUT_DATA=/leonardo_scratch/large/userexternal/ftomba00/data IN_DATA=/leonardo_work/IscrC_dadp IN_DATA=/leonardo_work/EUHPC_D18_045 #10^6 points time mpirun -n ${SLURM_NTASKS} --map-by ppr:1:socket:PE=${SLURM_CPUS_PER_TASK} ./main -t f32 -i ${IN_DATA}/norm_data/std_LR_091_0001 -d 5 -a ${OUT_ASSIGNMENT} -o ${OUT_DATA} Loading src/adp/adp.c +15 −11 Original line number Diff line number Diff line Loading @@ -696,8 +696,8 @@ clusters_t Heuristic1(global_context_t *ctx) MPI_Win_create(ctx -> local_datapoints, ctx -> local_n_points * sizeof(datapoint_info_t), 1, MPI_INFO_NULL, ctx -> mpi_communicator, &win_datapoints); MPI_Win_fence(0, win_datapoints); //MPI_Win_lock_all(0, win_datapoints); MPI_Win_lock_all(0, win_datapoints); // #if !defined(THREAD_FUNNELED) #pragma omp parallel for #endif Loading Loading @@ -780,6 +780,7 @@ clusters_t Heuristic1(global_context_t *ctx) TIME_START; //#define EXP_CENTER_PRUNING #if defined(EXP_CENTER_PRUNING) int all_have_finished = 0; Loading Loading @@ -878,12 +879,13 @@ clusters_t Heuristic1(global_context_t *ctx) if(i_have_finished) break; } //MPI_Win_unlock_all(win_datapoints); MPI_Win_unlock_all(win_datapoints); MPI_Win_fence(0, win_datapoints); free(tmp_datapoints); #else MPI_Win_fence(MPI_MODE_NOPUT, win_datapoints); //MPI_Win_unlock_all(win_datapoints); //MPI_Win_fence(MPI_MODE_NOPUT, win_datapoints); idx_t n_foreign_req = 0; idx_t n_local_req = 0; 
Loading Loading @@ -942,16 +944,17 @@ clusters_t Heuristic1(global_context_t *ctx) } } } MPI_Win_fence(MPI_MODE_NOPUT, win_datapoints); //MPI_Win_fence(MPI_MODE_NOPUT, win_datapoints); MPI_Barrier(ctx -> mpi_communicator); DB_PRINT("Rank %d: foreign requests points %lu out of %lu -> fraction %.2lf time %.2lfs\n", ctx -> mpi_rank, n_foreign_req, n_local_req + n_foreign_req, (float)n_foreign_req/(float)n_local_req, elapsed_proc); MPI_Barrier(ctx -> mpi_communicator); // MPI_Barrier(ctx -> mpi_communicator); // DB_PRINT("Rank %d: foreign requests points %lu out of %lu -> fraction %.2lf time %.2lfs\n", // ctx -> mpi_rank, n_foreign_req, n_local_req + n_foreign_req, (float)n_foreign_req/(float)n_local_req, elapsed_proc); // MPI_Barrier(ctx -> mpi_communicator); #endif //assemble arrays into a single buffer elapsed_time = TIME_STOP; LOG_WRITE("Finding centers to prune", elapsed_time); TIME_START; Loading Loading @@ -1025,7 +1028,7 @@ clusters_t Heuristic1(global_context_t *ctx) idx_t tot_recv_counts = 0; // count how many elements to recieve MPI_DB_PRINT("Using centers elimination queue experiment\n"); // MPI_DB_PRINT("Using centers elimination queue experiment\n"); for(int i = 0; i < ctx -> world_size; ++i) tot_recv_counts += recv_counts[i]; /* Loading Loading @@ -1095,6 +1098,7 @@ clusters_t Heuristic1(global_context_t *ctx) //allocate buffer to recieve center elminiations center_removal_t* recv_removals = (center_removal_t*)MY_MALLOC(tot_recv_counts * sizeof(center_removal_t)); // all to all Loading Loading @@ -1336,7 +1340,7 @@ clusters_t Heuristic1(global_context_t *ctx) } //MPI_Win_unlock_all(win_datapoints); MPI_Win_unlock_all(win_datapoints); MPI_Win_fence(0, win_datapoints); MPI_Win_free(&win_datapoints); Loading src/common/common.c +2 −2 Original line number Diff line number Diff line Loading @@ -44,7 +44,7 @@ void get_dataset_diagnostics(global_context_t* ctx, float_t* data) for(int j = 0; j < ctx -> dims; ++j) pvt_mean[j] = 0.; #pragma omp for for(int i = 0; 
i < ctx -> n_points; ++i) for(idx_t i = 0; i < ctx -> n_points; ++i) { int j = 0; for(j = 0; j < jmax; j+=4) Loading Loading @@ -81,7 +81,7 @@ void get_dataset_diagnostics(global_context_t* ctx, float_t* data) for(int j = 0; j < ctx -> dims; ++j) pvt_var[j] = 0.; #pragma omp for for(int i = 0; i < ctx -> n_points; ++i) for(idx_t i = 0; i < ctx -> n_points; ++i) { int j = 0; for(j = 0; j < jmax; j+=4) Loading src/common/common.h +4 −2 Original line number Diff line number Diff line Loading @@ -45,9 +45,11 @@ #define MY_TRUE 1 #define MY_FALSE 0 #define CHECK_ALLOCATION(x) if(!x){printf("[!!!] %d rank encountered failed allocation at line %s \n", ctx -> mpi_rank, __LINE__ ); exit(1);}; #define HERE printf("%d in file %s reached line %d\n", ctx -> mpi_rank, __FILE__, __LINE__); MPI_Barrier(ctx -> mpi_communicator); #define CHECK_ALLOCATION_NO_CTX(x) if(!x){printf("[!!!] Failed allocation at line %d \n", __LINE__ ); exit(1);} #define CHECK_ALLOCATION(x) if(!x){printf("[!!!] %d rank encountered failed allocation: %s at line %s \n", ctx -> mpi_rank, __FILE__, __LINE__ ); exit(1);}; #define CHECK_ALLOCATION_NO_CTX(x) if(!x){printf("[!!!] Failed allocation: %s at line %d \n", __FILE__, __LINE__ ); exit(1);} #define MY_MALLOC(n) ({void* p = calloc(n,1); CHECK_ALLOCATION_NO_CTX(p); p; }) #define DB_PRINT(...) printf(__VA_ARGS__) Loading Loading
Makefile +3 −1 Original line number Diff line number Diff line CC=mpicc CFLAGS=-O3 -march=native -flto -funroll-loops -fopenmp #CC=mpiicx #CFLAGS=-O3 -march=native -flto -funroll-loops -fopenmp CFLAGS=-O3 -fopenmp LDFLAGS=-lm all: main Loading
run_leo +4 −3 Original line number Diff line number Diff line Loading @@ -15,8 +15,9 @@ cd $SLURM_SUBMIT_DIR module load gcc module load openmpi #module load gcc #module load openmpi module load intel-oneapi-mpi make clean make Loading @@ -33,7 +34,7 @@ mkdir bb OUT_ASSIGNMENT=/leonardo_scratch/large/userexternal/ftomba00/assignment OUT_DATA=/leonardo_scratch/large/userexternal/ftomba00/data IN_DATA=/leonardo_work/IscrC_dadp IN_DATA=/leonardo_work/EUHPC_D18_045 #10^6 points time mpirun -n ${SLURM_NTASKS} --map-by ppr:1:socket:PE=${SLURM_CPUS_PER_TASK} ./main -t f32 -i ${IN_DATA}/norm_data/std_LR_091_0001 -d 5 -a ${OUT_ASSIGNMENT} -o ${OUT_DATA} Loading
src/adp/adp.c +15 −11 Original line number Diff line number Diff line Loading @@ -696,8 +696,8 @@ clusters_t Heuristic1(global_context_t *ctx) MPI_Win_create(ctx -> local_datapoints, ctx -> local_n_points * sizeof(datapoint_info_t), 1, MPI_INFO_NULL, ctx -> mpi_communicator, &win_datapoints); MPI_Win_fence(0, win_datapoints); //MPI_Win_lock_all(0, win_datapoints); MPI_Win_lock_all(0, win_datapoints); // #if !defined(THREAD_FUNNELED) #pragma omp parallel for #endif Loading Loading @@ -780,6 +780,7 @@ clusters_t Heuristic1(global_context_t *ctx) TIME_START; //#define EXP_CENTER_PRUNING #if defined(EXP_CENTER_PRUNING) int all_have_finished = 0; Loading Loading @@ -878,12 +879,13 @@ clusters_t Heuristic1(global_context_t *ctx) if(i_have_finished) break; } //MPI_Win_unlock_all(win_datapoints); MPI_Win_unlock_all(win_datapoints); MPI_Win_fence(0, win_datapoints); free(tmp_datapoints); #else MPI_Win_fence(MPI_MODE_NOPUT, win_datapoints); //MPI_Win_unlock_all(win_datapoints); //MPI_Win_fence(MPI_MODE_NOPUT, win_datapoints); idx_t n_foreign_req = 0; idx_t n_local_req = 0; Loading Loading @@ -942,16 +944,17 @@ clusters_t Heuristic1(global_context_t *ctx) } } } MPI_Win_fence(MPI_MODE_NOPUT, win_datapoints); //MPI_Win_fence(MPI_MODE_NOPUT, win_datapoints); MPI_Barrier(ctx -> mpi_communicator); DB_PRINT("Rank %d: foreign requests points %lu out of %lu -> fraction %.2lf time %.2lfs\n", ctx -> mpi_rank, n_foreign_req, n_local_req + n_foreign_req, (float)n_foreign_req/(float)n_local_req, elapsed_proc); MPI_Barrier(ctx -> mpi_communicator); // MPI_Barrier(ctx -> mpi_communicator); // DB_PRINT("Rank %d: foreign requests points %lu out of %lu -> fraction %.2lf time %.2lfs\n", // ctx -> mpi_rank, n_foreign_req, n_local_req + n_foreign_req, (float)n_foreign_req/(float)n_local_req, elapsed_proc); // MPI_Barrier(ctx -> mpi_communicator); #endif //assemble arrays into a single buffer elapsed_time = TIME_STOP; LOG_WRITE("Finding centers to prune", elapsed_time); TIME_START; Loading Loading 
@@ -1025,7 +1028,7 @@ clusters_t Heuristic1(global_context_t *ctx) idx_t tot_recv_counts = 0; // count how many elements to recieve MPI_DB_PRINT("Using centers elimination queue experiment\n"); // MPI_DB_PRINT("Using centers elimination queue experiment\n"); for(int i = 0; i < ctx -> world_size; ++i) tot_recv_counts += recv_counts[i]; /* Loading Loading @@ -1095,6 +1098,7 @@ clusters_t Heuristic1(global_context_t *ctx) //allocate buffer to recieve center elminiations center_removal_t* recv_removals = (center_removal_t*)MY_MALLOC(tot_recv_counts * sizeof(center_removal_t)); // all to all Loading Loading @@ -1336,7 +1340,7 @@ clusters_t Heuristic1(global_context_t *ctx) } //MPI_Win_unlock_all(win_datapoints); MPI_Win_unlock_all(win_datapoints); MPI_Win_fence(0, win_datapoints); MPI_Win_free(&win_datapoints); Loading
src/common/common.c +2 −2 Original line number Diff line number Diff line Loading @@ -44,7 +44,7 @@ void get_dataset_diagnostics(global_context_t* ctx, float_t* data) for(int j = 0; j < ctx -> dims; ++j) pvt_mean[j] = 0.; #pragma omp for for(int i = 0; i < ctx -> n_points; ++i) for(idx_t i = 0; i < ctx -> n_points; ++i) { int j = 0; for(j = 0; j < jmax; j+=4) Loading Loading @@ -81,7 +81,7 @@ void get_dataset_diagnostics(global_context_t* ctx, float_t* data) for(int j = 0; j < ctx -> dims; ++j) pvt_var[j] = 0.; #pragma omp for for(int i = 0; i < ctx -> n_points; ++i) for(idx_t i = 0; i < ctx -> n_points; ++i) { int j = 0; for(j = 0; j < jmax; j+=4) Loading
/* src/common/common.h -- hunk @@ -45,9 +45,11 @@, written out in its post-change state. */

#define MY_TRUE  1
#define MY_FALSE 0

/*
 * HERE: debugging checkpoint.
 * Prints the calling rank, the file and the line, then calls MPI_Barrier on
 * ctx->mpi_communicator. A variable named `ctx` must be in scope where the
 * macro is expanded. It expands to two statements, so it must not be used as
 * the unbraced body of an if/for/while.
 */
#define HERE printf("%d in file %s reached line %d\n", ctx -> mpi_rank, __FILE__, __LINE__); MPI_Barrier(ctx -> mpi_communicator);

/*
 * CHECK_ALLOCATION(x): if x is null/zero, report rank, file and line on
 * stdout and terminate the process with exit(1). Requires `ctx` in scope.
 *
 * __LINE__ is an integer constant, so it is printed with %d (printing it with
 * %s is undefined behaviour). The argument is parenthesised so expressions
 * such as `a || b` are negated as a whole.
 *
 * The trailing ';' and the bare `if` form are kept so existing call sites,
 * with or without their own ';', keep compiling; avoid placing the macro
 * directly before an `else`.
 */
#define CHECK_ALLOCATION(x) if(!(x)){printf("[!!!] %d rank encountered failed allocation: %s at line %d \n", ctx -> mpi_rank, __FILE__, __LINE__ ); exit(1);};

/* Same check as CHECK_ALLOCATION, for code that has no `ctx` in scope. */
#define CHECK_ALLOCATION_NO_CTX(x) if(!(x)){printf("[!!!] Failed allocation: %s at line %d \n", __FILE__, __LINE__ ); exit(1);}

/*
 * MY_MALLOC(n): allocate n zero-initialised bytes with calloc and terminate
 * via CHECK_ALLOCATION_NO_CTX if the allocation fails. Evaluates to the
 * allocated pointer (void*); the caller owns it and releases it with free().
 *
 * Uses a GNU statement expression. The temporary has a deliberately unusual
 * name so that a size expression mentioning a caller variable called `p`
 * is not captured by the macro's own local.
 */
#define MY_MALLOC(n) ({void* my_malloc_ptr_ = calloc((n),1); CHECK_ALLOCATION_NO_CTX(my_malloc_ptr_); my_malloc_ptr_; })

/* DB_PRINT: forwards its arguments unchanged to printf. */
#define DB_PRINT(...) printf(__VA_ARGS__)