Commit 1e241ec6 authored by lykos98's avatar lykos98
Browse files

Added Heuristic1 (h1) optimization; debugged the tree for datasets larger than 1B points

parent c36add8a
Loading
Loading
Loading
Loading
+3 −1
Original line number Diff line number Diff line
CC=mpicc
CFLAGS=-O3 -march=native -flto -funroll-loops -fopenmp
#CC=mpiicx
#CFLAGS=-O3 -march=native -flto -funroll-loops -fopenmp
CFLAGS=-O3 -fopenmp 
LDFLAGS=-lm 

all: main
+4 −3
Original line number Diff line number Diff line
@@ -15,8 +15,9 @@

cd $SLURM_SUBMIT_DIR

module load gcc
module load openmpi
#module load gcc
#module load openmpi
module load intel-oneapi-mpi

make clean
make
@@ -33,7 +34,7 @@ mkdir bb
OUT_ASSIGNMENT=/leonardo_scratch/large/userexternal/ftomba00/assignment
OUT_DATA=/leonardo_scratch/large/userexternal/ftomba00/data

IN_DATA=/leonardo_work/IscrC_dadp
IN_DATA=/leonardo_work/EUHPC_D18_045

#10^6 points 
time mpirun -n ${SLURM_NTASKS} --map-by ppr:1:socket:PE=${SLURM_CPUS_PER_TASK}  ./main -t f32 -i ${IN_DATA}/norm_data/std_LR_091_0001 -d 5 -a ${OUT_ASSIGNMENT} -o ${OUT_DATA}
+15 −11
Original line number Diff line number Diff line
@@ -696,8 +696,8 @@ clusters_t Heuristic1(global_context_t *ctx)
    MPI_Win_create(ctx -> local_datapoints, ctx -> local_n_points * sizeof(datapoint_info_t), 
                   1, MPI_INFO_NULL, ctx -> mpi_communicator, &win_datapoints);
    MPI_Win_fence(0, win_datapoints);
    //MPI_Win_lock_all(0,  win_datapoints);

    MPI_Win_lock_all(0,  win_datapoints);
    //
#if !defined(THREAD_FUNNELED)
    #pragma omp parallel for
#endif
@@ -780,6 +780,7 @@ clusters_t Heuristic1(global_context_t *ctx)
    TIME_START;

    

//#define EXP_CENTER_PRUNING
#if defined(EXP_CENTER_PRUNING)
    int all_have_finished = 0;
@@ -878,12 +879,13 @@ clusters_t Heuristic1(global_context_t *ctx)
        if(i_have_finished) break;
    }

    //MPI_Win_unlock_all(win_datapoints);
    MPI_Win_unlock_all(win_datapoints);
    MPI_Win_fence(0, win_datapoints);
    free(tmp_datapoints);
    
#else
    MPI_Win_fence(MPI_MODE_NOPUT, win_datapoints);
    //MPI_Win_unlock_all(win_datapoints);
    //MPI_Win_fence(MPI_MODE_NOPUT, win_datapoints);

    idx_t n_foreign_req = 0;
    idx_t n_local_req   = 0;
@@ -942,16 +944,17 @@ clusters_t Heuristic1(global_context_t *ctx)
            }
        }
    }
    MPI_Win_fence(MPI_MODE_NOPUT, win_datapoints);
    //MPI_Win_fence(MPI_MODE_NOPUT, win_datapoints);

    MPI_Barrier(ctx -> mpi_communicator);
    DB_PRINT("Rank %d: foreign requests points %lu out of %lu -> fraction %.2lf time %.2lfs\n",
            ctx -> mpi_rank, n_foreign_req, n_local_req + n_foreign_req, (float)n_foreign_req/(float)n_local_req, elapsed_proc);
    MPI_Barrier(ctx -> mpi_communicator);
    // MPI_Barrier(ctx -> mpi_communicator);
    // DB_PRINT("Rank %d: foreign requests points %lu out of %lu -> fraction %.2lf time %.2lfs\n",
    //         ctx -> mpi_rank, n_foreign_req, n_local_req + n_foreign_req, (float)n_foreign_req/(float)n_local_req, elapsed_proc);
    // MPI_Barrier(ctx -> mpi_communicator);
#endif

    //assemble arrays into a single buffer
    

    elapsed_time = TIME_STOP;
    LOG_WRITE("Finding centers to prune", elapsed_time);
    TIME_START;
@@ -1025,7 +1028,7 @@ clusters_t Heuristic1(global_context_t *ctx)
    idx_t tot_recv_counts = 0;

    // count how many elements to receive
    MPI_DB_PRINT("Using centers elimination queue experiment\n");
    // MPI_DB_PRINT("Using centers elimination queue experiment\n");

    for(int i = 0; i < ctx -> world_size; ++i) tot_recv_counts += recv_counts[i];
    /*
@@ -1095,6 +1098,7 @@ clusters_t Heuristic1(global_context_t *ctx)

    //allocate buffer to receive center eliminations
    

    center_removal_t* recv_removals = (center_removal_t*)MY_MALLOC(tot_recv_counts * sizeof(center_removal_t));

    // all to all
@@ -1336,7 +1340,7 @@ clusters_t Heuristic1(global_context_t *ctx)

    }

    //MPI_Win_unlock_all(win_datapoints);
    MPI_Win_unlock_all(win_datapoints);
    MPI_Win_fence(0, win_datapoints);
    MPI_Win_free(&win_datapoints);

+2 −2
Original line number Diff line number Diff line
@@ -44,7 +44,7 @@ void get_dataset_diagnostics(global_context_t* ctx, float_t* data)
        for(int j = 0; j < ctx -> dims; ++j) pvt_mean[j] = 0.;

        #pragma omp for
        for(int i = 0; i < ctx -> n_points; ++i)
        for(idx_t i = 0; i < ctx -> n_points; ++i)
        {
            int j = 0;
            for(j = 0; j < jmax; j+=4)
@@ -81,7 +81,7 @@ void get_dataset_diagnostics(global_context_t* ctx, float_t* data)
        for(int j = 0; j < ctx -> dims; ++j) pvt_var[j] = 0.;

        #pragma omp for
        for(int i = 0; i < ctx -> n_points; ++i)
        for(idx_t i = 0; i < ctx -> n_points; ++i)
        {
            int j = 0;
            for(j = 0; j < jmax; j+=4)
+4 −2
Original line number Diff line number Diff line
@@ -45,9 +45,11 @@
// Project-local boolean constants (used where <stdbool.h> is not pulled in).
#define MY_TRUE  1
#define MY_FALSE 0

// Abort the program when an allocation returned NULL; reports the MPI rank and source line.
// Requires a `global_context_t *ctx` in scope at the call site.
// Fixed: __LINE__ expands to an int, so it must be printed with %d — the old %s was
// undefined behavior. The macro argument is parenthesized so compound expressions bind correctly.
#define CHECK_ALLOCATION(x) if(!(x)){printf("[!!!] %d rank encountered failed allocation at line %d \n", ctx -> mpi_rank, __LINE__ ); exit(1);};
// Debug checkpoint: prints the calling rank, file and line, then synchronizes all ranks
// with an MPI_Barrier so the output marks how far every rank got. Requires `ctx` in scope.
// NOTE(review): expands to two bare statements — unsafe inside an unbraced if/else body.
#define HERE printf("%d in file %s reached line %d\n", ctx -> mpi_rank, __FILE__, __LINE__); MPI_Barrier(ctx -> mpi_communicator);

// Abort the program when an allocation returned NULL; context-free variant (no `ctx` needed).
// Fixed: the macro argument is now parenthesized — with the old `!x`, a compound argument
// such as `a || b` expanded to `!a || b`, inverting the check (CERT PRE01-C).
#define CHECK_ALLOCATION_NO_CTX(x) if(!(x)){printf("[!!!] Failed allocation at line %d \n", __LINE__ ); exit(1);}
// Abort the program when an allocation returned NULL; reports the MPI rank, source file
// and line. Requires a `global_context_t *ctx` in scope at the call site.
// Fixed: __LINE__ expands to an int and must be printed with %d — the old trailing %s was
// undefined behavior. The macro argument is parenthesized so compound expressions bind correctly.
#define CHECK_ALLOCATION(x) if(!(x)){printf("[!!!] %d rank encountered failed allocation: %s at line %d \n", ctx -> mpi_rank, __FILE__, __LINE__ ); exit(1);};

// Abort the program when an allocation returned NULL; context-free variant reporting
// source file and line (no `ctx` needed).
// Fixed: the macro argument is now parenthesized — with the old `!x`, a compound argument
// such as `a || b` expanded to `!a || b`, inverting the check (CERT PRE01-C).
#define CHECK_ALLOCATION_NO_CTX(x) if(!(x)){printf("[!!!] Failed allocation: %s at line %d \n", __FILE__, __LINE__ ); exit(1);}
// Checked allocation of n zero-initialized bytes: evaluates to the new pointer, or aborts
// via CHECK_ALLOCATION_NO_CTX on failure. Uses a GNU statement expression.
#define MY_MALLOC(n) ({void* alloc_ptr_ = calloc((n), 1); CHECK_ALLOCATION_NO_CTX(alloc_ptr_); alloc_ptr_; })

// Debug print helper: forwards its arguments to stdout (fprintf(stdout, ...) is
// defined by the C standard to behave exactly like printf).
#define DB_PRINT(...) fprintf(stdout, __VA_ARGS__)
Loading