Commit 9d256282 authored by Francesco Tomba's avatar Francesco Tomba
Browse files

Merge branch 'h3-optimization' into 'main'

got up to 2.5B points

See merge request !3
parents 915669a1 ab9542d6
Loading
Loading
Loading
Loading
+70 −11
Original line number Diff line number Diff line
@@ -2,6 +2,7 @@
#include "mpi.h"
#include <bits/time.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

@@ -1940,6 +1941,25 @@ void merge_A_into_B(idx_t* who_amI, idx_t cluster_A, idx_t cluster_B, idx_t n)
    return;
}

/*
 * Walk the parent links stored in `cluster`, starting at `index`, until an
 * entry that points to itself is reached, and return that entry's index.
 * The array is only read here; no links are rewritten.
 */
idx_t find_root(idx_t* cluster, idx_t index)
{
    idx_t current = index;

    /* an entry whose parent is itself marks the end of the chain */
    for(idx_t parent = cluster[current]; parent != current; parent = cluster[current])
    {
        current = parent;
    }

    return current;
}

/*
 * Link the set containing `a` under the set containing `b`: the root found
 * for `a` is made to point at the root found for `b`. When both elements
 * already share a root, the root is simply reassigned to itself.
 */
void union_A_into_B(idx_t* cluster, idx_t a, idx_t b)
{
    const idx_t target_root = find_root(cluster, b);
    const idx_t source_root = find_root(cluster, a);

    /* only the source root is rewritten; every other link is left untouched */
    cluster[source_root] = target_root;
}

void master_finds_borders(global_context_t* ctx, clusters_t* cluster, float_t Z, idx_t* surviving_clusters, datapoint_info_t* centers_dp)
{
    datapoint_info_t* dp_info   = ctx -> local_datapoints;
@@ -2016,21 +2036,37 @@ void master_finds_borders(global_context_t* ctx, clusters_t* cluster, float_t Z,
        struct timespec start_merge, end_merge;
    #endif

    struct timespec start_epoch, end_epoch;

    idx_t slice = merge_count / 20;
    idx_t actual_merges = 0;

    clock_gettime(CLOCK_MONOTONIC, &start_epoch);

    for( idx_t m = 0; m < merge_count; m++ )
    {
      
        #if defined(WRITE_MERGES_INFO)
            clock_gettime(CLOCK_MONOTONIC, &start_merge);
        #endif
        // print progress
        if(merge_count > 1e5)
        // print progress diagnostics
        if(merge_count > 1e5 && (m % slice == 0 || m == merge_count - 1))
        {
            int slice = merge_count / 20;
            if(m % slice == 0 || m == merge_count - 1) printf("Merging progress: %lu / %lu -> %.2f \n", 
                                                              m, merge_count, (float)m/(float)merge_count * 100.);
            clock_gettime(CLOCK_MONOTONIC, &end_epoch);

            float elapsed_time = (float)(end_epoch.tv_sec - start_epoch.tv_sec) - 
                                 (float)(end_epoch.tv_nsec - start_epoch.tv_nsec)/1e9;

            printf("Merging progress: %lu / %lu -> %.2f .. elapsed time: %.2f .. eta: %.2f .. avg per merge %e .. frac merges %f\n", 
                                                        m, merge_count, (float)m/(float)merge_count * 100.,
                                                        elapsed_time, elapsed_time/(float)m * (float)merge_count, 
                                                        elapsed_time/m, (float)actual_merges/(float)slice);
            actual_merges = 0;
        }
        #define src surviving_clusters[merging_table[m].source]
        #define trg surviving_clusters[merging_table[m].target]
        // idx_t src = surviving_clusters[merging_table[m].source];
        // idx_t trg = surviving_clusters[merging_table[m].target];
        idx_t src = find_root(surviving_clusters, merging_table[m].source);
        idx_t trg = find_root(surviving_clusters, merging_table[m].target);

        /* 
         * Enforce that, in case of a symmetric merging condition, the lowest idx cluster 
@@ -2058,6 +2094,8 @@ void master_finds_borders(global_context_t* ctx, clusters_t* cluster, float_t Z,
        float_t dens_border_err = b.error;

        int i_have_to_merge = is_a_merging(dens1,dens1_err,dens2,dens2_err,dens_border,dens_border_err,Z);            
        actual_merges += (i_have_to_merge && src != trg);

        switch (i_have_to_merge && src != trg)
        {
            case 1:
@@ -2079,7 +2117,8 @@ void master_finds_borders(global_context_t* ctx, clusters_t* cluster, float_t Z,
                 */

                fix_sparse_borders_A_into_B(new_src, new_trg, cluster);
                merge_A_into_B(surviving_clusters, new_src, new_trg, nclus );	  
                union_A_into_B(surviving_clusters, new_src, new_trg);
                //merge_A_into_B(surviving_clusters, new_src, new_trg, nclus );	  

            }
            break;
@@ -2099,11 +2138,15 @@ void master_finds_borders(global_context_t* ctx, clusters_t* cluster, float_t Z,
            fflush(f);
        #endif

    }

        #undef src
        #undef trg
    #pragma omp parallel for
    for(idx_t i = 0; i < nclus; ++i)
    {
        surviving_clusters[i] = find_root(surviving_clusters, i);
    }


    #if defined(WRITE_MERGES_INFO)
        fclose(f);
    #endif
@@ -2214,6 +2257,22 @@ void Heuristic3(global_context_t* ctx, clusters_t* cluster, float_t Z, int halo)

	clock_gettime(CLOCK_MONOTONIC, &start_tot);

    // if(I_AM_MASTER)
    // {
    //     // reallocate to keep memory nearby
    //     for(idx_t i = 0; i < cluster -> centers.count; ++i)
    //     {
    //         idx_t n_borders = cluster->sparse_borders[i].count;
    //         sparse_border_t* tmp_adj_list = (sparse_border_t*)MY_MALLOC(n_borders * sizeof(sparse_border_t));
    //         memcpy(tmp_adj_list, cluster->sparse_borders[i].data, n_borders * sizeof(sparse_border_t));
    //         free(cluster->sparse_borders[i].data);
    //         cluster->sparse_borders[i].data = tmp_adj_list;
    //
    //         printf("%lu\n", i);
    //         fflush(stdout);
    //     }
    // }


    datapoint_info_t* dp_info   = ctx -> local_datapoints;
	idx_t  nclus                = cluster -> centers.count;  
+3 −0
Original line number Diff line number Diff line
@@ -429,6 +429,9 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx)
    LOG_WRITE("H2", elapsed_time)


    // free ngbh
    MPI_Free_mem(ctx -> __local_heap_buffers);
    ctx -> __local_heap_buffers = NULL;
    TIME_START;
    Heuristic3(ctx, &clusters, ctx -> z, halo);
    elapsed_time = TIME_STOP;
+2 −1
Original line number Diff line number Diff line
@@ -29,7 +29,8 @@
//#define MAX_MSG_SIZE 4294967296

/* Uses slices of about 10 MB. Is that really a good size? It may be the cause of the TID issue. */
#define MAX_MSG_SIZE (10000 * k * sizeof(heap_node))
// #define MAX_MSG_SIZE (10000 * k * sizeof(heap_node))
#define MAX_MSG_SIZE (100000 * k * sizeof(heap_node))


#define TOP_TREE_RCH 1