Loading allvars.c +4 −1 Original line number Original line Diff line number Diff line Loading @@ -7,7 +7,9 @@ struct ip in; struct op out, outparam; struct op out, outparam; struct meta metaData; struct meta metaData; struct time timing; timing_t wt_timing = {0}; timing_t pr_timing = {0}; struct parameter param; struct parameter param; struct fileData data; struct fileData data; Loading @@ -16,6 +18,7 @@ char datapath[900]; int xaxis, yaxis; int xaxis, yaxis; int global_rank; int global_rank; int size; int size; int verbose_level = 0; long nsectors; long nsectors; long startrow; long startrow; double resolution, dx, dw, w_supporth; double resolution, dx, dw, w_supporth; Loading allvars.h +76 −9 Original line number Original line Diff line number Diff line Loading @@ -92,13 +92,24 @@ extern struct meta } metaData; } metaData; extern struct time typedef struct { { double setup; // time spent in initialization, init() double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time, phase_time; double init; // time spent in initializing arrays double setup_time1, process_time1, mpi_time1, fftw_time1, tot_time1, kernel_time1, reduce_time1, compose_time1, phase_time1; double process; // time spent in gridding; double writetime, writetime1; double mpi; // time spent in mpi communications double fftw; // } timing; double kernel; // double mmove; // double reduce; // double reduce_mpi; // double reduce_sh ; // double compose; // double phase; // double write; // double total; } timing_t; extern timing_t wt_timing; // wall-clock timings extern timing_t pr_timing; // process CPU timing extern struct parameter extern struct parameter { { Loading Loading @@ -127,12 +138,11 @@ extern char datapath[900]; extern int xaxis, yaxis; extern int xaxis, yaxis; extern int global_rank; extern int global_rank; extern int size; extern int size; extern int verbose_level; extern long nsectors; extern long nsectors; extern long startrow; extern long startrow; extern double resolution, dx, dw, w_supporth; extern double resolution, dx, dw, w_supporth; extern clock_t start, end, start0, startk, endk; extern struct timespec begin, finish, begin0, begink, finishk; extern long * histo_send, size_of_grid; extern long * histo_send, size_of_grid; extern double * grid, *gridss, *gridss_real, *gridss_img, *gridss_w; extern double * grid, *gridss, *gridss_real, *gridss_img, *gridss_w; Loading @@ -142,3 +152,60 @@ extern double * grid, *gridss, *gridss_real, *gridss_img, *gridss_w; #endif #endif extern long **sectorarray; extern long **sectorarray; #if defined(DEBUG) #define dprintf(LEVEL, T, t, ...) if( (verbose_level >= (LEVEL)) && \ ( ((t) ==-1 ) || ((T)==(t)) ) ) { \ printf(__VA_ARGS__); fflush(stdout); } #else #define dprintf(...) #endif #define CPU_TIME_wt ({ struct timespec myts; (clock_gettime( CLOCK_REALTIME, &myts ), (double)myts.tv_sec + (double)myts.tv_nsec * 1e-9);}) #define CPU_TIME_pr ({ struct timespec myts; (clock_gettime( CLOCK_PROCESS_CPUTIME_ID, &myts ), (double)myts.tv_sec + (double)myts.tv_nsec * 1e-9);}) #define CPU_TIME_th ({ struct timespec myts; (clock_gettime( CLOCK_THREAD_CPUTIME_ID, &myts ), (double)myts.tv_sec + (double)myts.tv_nsec * 1e-9);}) #if defined(_OPENMP) #define TAKE_TIME_START( T ) { \ wt_timing.T = CPU_TIME_wt; \ pr_timing.T = CPU_TIME_pr; } #define TAKE_TIME_STOP( T ) { \ pr_timing.T = CPU_TIME_pr - pr_timing.T; \ wt_timing.T = CPU_TIME_wt - wt_timing.T; } #define TAKE_TIME( Twt, Tpr ) { Twt = CPU_TIME_wt; Tpr = CPU_TIME_pr; } #define ADD_TIME( T, Twt, Tpr ) { \ pr_timing.T += CPU_TIME_pr - Tpr; \ wt_timing.T += CPU_TIME_wt - Twt; \ Tpr = CPU_TIME_pr; Twt = CPU_TIME_wt; } #else #define TAKE_TIME_START( T ) wt_timing.T = CPU_TIME_wt #define TAKE_TIME_STOP( T ) wt_timing.T = CPU_TIME_wt - wt_timing.T #define TAKE_TIME( Twt, ... ) Twt = CPU_TIME_wt; #define ADD_TIME( T, Twt, ... ) { wt_timing.T += CPU_TIME_wt - Twt; Twt = CPU_TIME_wt;} #endif #define TAKE_TIMEwt_START( T) wt_timing.T = CPU_TIME_wt #define TAKE_TIMEwt_STOP( T) wt_timing.T = CPU_TIME_wt - wt_timing.T #define TAKE_TIMEwt( Twt ) Twt = CPU_TIME_wt; #define ADD_TIMEwt( T, Twt ) { wt_timing.T += CPU_TIME_wt - Twt; Twt = CPU_TIME_wt; } #if defined(__GNUC__) && !defined(__ICC) && !defined(__INTEL_COMPILER) #define PRAGMA_IVDEP _Pragma("GCC ivdep") #else #define PRAGMA_IVDEP _Pragma("ivdep") #endif #define STRINGIFY(a) #a #define UNROLL(N) _Pragma(STRINGIFY(unroll(N))) data/paramfile.txt +5 −4 Original line number Original line Diff line number Diff line ndatasets 1 ndatasets 1 Datapath1 /beegfs/lofar/cgheller/L798046_SB244_uv.uncorr_130B27932t_121MHz.pre-cal.binMS/ Datapath1 ../input/ Datapath2 /beegfs/lofar/cgheller/L798046_SB244_uv.uncorr_130B27932t_123MHz.pre-cal.binMS/ Datapath2 ../input/ Datapath3 /beegfs/lofar/cgheller/L798046_SB244_uv.uncorr_130B27932t_125MHz.pre-cal.binMS/ Datapath3 ../input/ num_threads 2 num_threads 2 w_support 7 w_support 7 grid_size_x 2048 grid_size_x 2048 Loading @@ -24,3 +24,4 @@ fftfile3 fft_img.bin logfile run.log logfile run.log extension .txt extension .txt timingfile timings.dat timingfile timings.dat verbose_level 1 fourier_transform.c +209 −203 Original line number Original line Diff line number Diff line Loading @@ -2,13 +2,15 @@ #include "allvars.h" #include "allvars.h" #include "proto.h" #include "proto.h" void fftw_data(){ void fftw_data() { #ifdef USE_FFTW #ifdef USE_FFTW // FFT transform the data (using distributed FFTW) // FFT transform the data (using distributed FFTW) if(global_rank == 0)printf("PERFORMING FFT\n"); if(global_rank == 0)printf("PERFORMING FFT\n"); clock_gettime(CLOCK_MONOTONIC, &begin); start = clock(); TAKE_TIME_START(fftw); fftw_plan plan; fftw_plan plan; fftw_complex *fftwgrid; fftw_complex *fftwgrid; ptrdiff_t alloc_local, local_n0, local_0_start; ptrdiff_t alloc_local, local_n0, local_0_start; Loading Loading @@ -64,21 +66,23 @@ void fftw_data(){ MPI_Barrier(MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); #endif #endif end = clock(); TAKE_TIME_STOP(fftw); clock_gettime(CLOCK_MONOTONIC, &finish); timing.fftw_time = ((double) (end - start)) / CLOCKS_PER_SEC; timing.fftw_time1 = (finish.tv_sec - begin.tv_sec); timing.fftw_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; clock_gettime(CLOCK_MONOTONIC, &begin); #endif #endif } } void write_fftw_data(){ void write_fftw_data(){ // Write results #ifdef USE_FFTW #ifdef USE_FFTW double twt, tpr; #ifdef WRITE_DATA #ifdef WRITE_DATA // Write results TAKE_TIME(twt, tpr); #ifdef USE_MPI #ifdef USE_MPI MPI_Win writewin; MPI_Win writewin; MPI_Win_create(gridss, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &writewin); MPI_Win_create(gridss, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &writewin); Loading Loading @@ -150,27 +154,28 @@ void write_fftw_data(){ MPI_Win_free(&writewin); MPI_Win_free(&writewin); MPI_Barrier(MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); #endif #endif ADD_TIME(write, twt, tpr); #endif //WRITE_DATA #endif //WRITE_DATA // Phase correction // Phase correction clock_gettime(CLOCK_MONOTONIC, &begin); start = clock(); TAKE_TIME_START(phase) if(global_rank == 0)printf("PHASE CORRECTION\n"); if(global_rank == 0)printf("PHASE CORRECTION\n"); double* image_real = (double*) calloc(xaxis*yaxis,sizeof(double)); double* image_real = (double*) calloc(xaxis*yaxis,sizeof(double)); double* image_imag = (double*) calloc(xaxis*yaxis,sizeof(double)); double* image_imag = (double*) calloc(xaxis*yaxis,sizeof(double)); phase_correction(gridss,image_real,image_imag,xaxis,yaxis,param.num_w_planes,param.grid_size_x,param.grid_size_y,resolution,metaData.wmin,metaData.wmax,param.num_threads); phase_correction(gridss,image_real,image_imag,xaxis,yaxis,param.num_w_planes,param.grid_size_x,param.grid_size_y,resolution,metaData.wmin,metaData.wmax,param.num_threads); end = clock(); TAKE_TIME_STOP(phase); clock_gettime(CLOCK_MONOTONIC, &finish); timing.phase_time = ((double) (end - start)) / CLOCKS_PER_SEC; timing.phase_time1 = (finish.tv_sec - begin.tv_sec); timing.phase_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; #ifdef WRITE_IMAGE #ifdef WRITE_IMAGE TAKE_TIME(twt, tpr); if(global_rank == 0) if(global_rank == 0) { { file.pFilereal = fopen (out.fftfile2,"wb"); file.pFilereal = fopen (out.fftfile2,"wb"); Loading Loading @@ -208,6 +213,7 @@ void write_fftw_data(){ MPI_Barrier(MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); #endif #endif ADD_TIME(write, twt, tpr); #endif //WRITE_IMAGE #endif //WRITE_IMAGE #endif //FFTW #endif //FFTW Loading gridding.c +404 −355 Original line number Original line Diff line number Diff line Loading @@ -4,35 +4,35 @@ #include "proto.h" #include "proto.h" void gridding(){ void gridding() { if(global_rank == 0)printf("GRIDDING DATA\n"); if(global_rank == 0)printf("GRIDDING DATA\n"); // Create histograms and linked lists // Create histograms and linked lists clock_gettime(CLOCK_MONOTONIC, &begin); TAKE_TIME_START(init); start = clock(); // Initialize linked list // Initialize linked list initialize_array(); initialize_array(); TAKE_TIME_STOP(init); TAKE_TIME_START(process); // Sector and Gridding data // Sector and Gridding data gridding_data(); gridding_data(); TAKE_TIME_STOP(process); #ifdef USE_MPI #ifdef USE_MPI MPI_Barrier(MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); #endif #endif end = clock(); return; clock_gettime(CLOCK_MONOTONIC, &finish); timing.process_time = ((double) (end - start)) / CLOCKS_PER_SEC; timing.process_time1 = (finish.tv_sec - begin.tv_sec); timing.process_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; clock_gettime(CLOCK_MONOTONIC, &begin); } } void initialize_array(){ void initialize_array() { histo_send = (long*) calloc(nsectors+1,sizeof(long)); histo_send = (long*) calloc(nsectors+1,sizeof(long)); int * boundary = (int*) calloc(metaData.Nmeasures,sizeof(int)); int * boundary = (int*) calloc(metaData.Nmeasures,sizeof(int)); Loading Loading @@ -79,13 +79,16 @@ void initialize_array(){ #ifdef VERBOSE #ifdef VERBOSE for (int iii=0; iii<nsectors+1; iii++)printf("HISTO %d %d %ld\n", global_rank, iii, histo_send[iii]); for (int iii=0; iii<nsectors+1; iii++)printf("HISTO %d %d %ld\n", global_rank, iii, histo_send[iii]); #endif #endif free( boundary); return; } } void gridding_data(){ void gridding_data() { double shift = (double)(dx*yaxis); double shift = (double)(dx*yaxis); // Open the MPI Memory Window for the slab #ifdef USE_MPI #ifdef USE_MPI MPI_Win_create(grid, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &slabwin); MPI_Win_create(grid, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &slabwin); MPI_Win_fence(0,slabwin); MPI_Win_fence(0,slabwin); Loading @@ -95,24 +98,20 @@ void gridding_data(){ file.pFile1 = fopen (out.outfile1,"w"); file.pFile1 = fopen (out.outfile1,"w"); #endif #endif timing.kernel_time = 0.0; timing.kernel_time1 = 0.0; timing.reduce_time = 0.0; timing.reduce_time1 = 0.0; timing.compose_time = 0.0; timing.compose_time1 = 0.0; // calculate the resolution in radians // calculate the resolution in radians resolution = 1.0/MAX(abs(metaData.uvmin),abs(metaData.uvmax)); resolution = 1.0/MAX(abs(metaData.uvmin),abs(metaData.uvmax)); // calculate the resolution in arcsec // calculate the resolution in arcsec double resolution_asec = (3600.0*180.0)/MAX(abs(metaData.uvmin),abs(metaData.uvmax))/PI; double resolution_asec = (3600.0*180.0)/MAX(abs(metaData.uvmin),abs(metaData.uvmax))/PI; if( global_rank == 0 ) printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec); printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec); for (long isector = 0; isector < nsectors; isector++) for (long isector = 0; isector < nsectors; isector++) { { clock_gettime(CLOCK_MONOTONIC, &begink); double twt, tpr; startk = clock(); TAKE_TIME(twt, tpr); // define local destination sector // define local destination sector //isector = (isector_count+rank)%size; // this line must be wrong! [LT] //isector = (isector_count+rank)%size; // this line must be wrong! [LT] Loading @@ -132,6 +131,7 @@ void gridding_data(){ long ip = 0; long ip = 0; long inu = 0; long inu = 0; #warning shall we omp-ize this ? for(long iphi = histo_send[isector]-1; iphi>=0; iphi--) for(long iphi = histo_send[isector]-1; iphi>=0; iphi--) { { long ilocal = sectorarray[isector][iphi]; long ilocal = sectorarray[isector][iphi]; Loading @@ -141,33 +141,32 @@ void gridding_data(){ uus[icount] = data.uu[ilocal]; uus[icount] = data.uu[ilocal]; vvs[icount] = data.vv[ilocal]-isector*shift; vvs[icount] = data.vv[ilocal]-isector*shift; wws[icount] = data.ww[ilocal]; wws[icount] = data.ww[ilocal]; for (long ipol=0; ipol<metaData.polarisations; ipol++) UNROLL(4) PRAGMA_IVDEP for (long ipol=0; ipol<metaData.polarisations; ipol++, ip++) { { weightss[ip] = data.weights[ilocal*metaData.polarisations+ipol]; weightss[ip] = data.weights[ilocal*metaData.polarisations+ipol]; ip++; } } for (long ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++) PRAGMA_IVDEP UNROLL(4) for (long ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++, inu++) { { visreals[inu] = data.visreal[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; visreals[inu] = data.visreal[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; visimgs[inu] = data.visimg[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; visimgs[inu] = data.visimg[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; //if(visimgs[inu]>1e10 || visimgs[inu]<-1e10)printf("%f %f %ld %ld %d %ld %ld\n",visreals[inu],visimgs[inu],inu,Nvissec,rank,ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq,metaData.Nvis); //if(visimgs[inu]>1e10 || visimgs[inu]<-1e10)printf("%f %f %ld %ld %d %ld %ld\n",visreals[inu],visimgs[inu],inu,Nvissec,rank,ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq,metaData.Nvis); inu++; } } icount++; icount++; } } clock_gettime(CLOCK_MONOTONIC, &finishk); ADD_TIME(compose, twt, tpr); endk = clock(); timing.compose_time += ((double) (endk - startk)) / CLOCKS_PER_SEC; timing.compose_time1 += (finishk.tv_sec - begink.tv_sec); timing.compose_time1 += (finishk.tv_sec - begink.tv_sec); timing.compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0; #ifndef USE_MPI #ifndef USE_MPI double vvmin = 1e20; double vvmin = 1e20; double uumax = -1e20; double uumax = -1e20; double vvmax = -1e20; double vvmax = -1e20; #warning shall we omp-ize this ? for (long ipart=0; ipart<Nsec; ipart++) for (long ipart=0; ipart<Nsec; ipart++) { { uumin = MIN(uumin,uus[ipart]); uumin = MIN(uumin,uus[ipart]); Loading @@ -186,8 +185,7 @@ void gridding_data(){ #ifdef VERBOSE #ifdef VERBOSE printf("Processing sector %ld\n",isector); printf("Processing sector %ld\n",isector); #endif #endif clock_gettime(CLOCK_MONOTONIC, &begink); TAKE_TIME(twt, tpr); startk = clock(); wstack(param.num_w_planes, wstack(param.num_w_planes, Nsec, Nsec, Loading @@ -207,6 +205,8 @@ void gridding_data(){ gridss, gridss, param.num_threads); param.num_threads); ADD_TIME(kernel, twt, tpr); /* int z =0 ; /* int z =0 ; * #pragma omp target map(to:test_i_gpu) map(from:z) * #pragma omp target map(to:test_i_gpu) map(from:z) * { * { Loading @@ -215,42 +215,49 @@ void gridding_data(){ * z = x + test_i_gpu; * z = x + test_i_gpu; * }*/ * }*/ clock_gettime(CLOCK_MONOTONIC, &finishk); endk = clock(); timing.kernel_time += ((double) (endk - startk)) / CLOCKS_PER_SEC; timing.kernel_time1 += (finishk.tv_sec - begink.tv_sec); timing.kernel_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0; #ifdef VERBOSE #ifdef VERBOSE printf("Processed sector %ld\n",isector); printf("Processed sector %ld\n",isector); #endif #endif clock_gettime(CLOCK_MONOTONIC, &begink); /* ---------------- startk = clock(); * REDUCE * ---------------- */ //for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++)printf("--> %f\n",gridss[iii]); double twt_r, tpr_r; TAKE_TIME(twt_r, tpr_r); #ifndef USE_MPI // .................. long stride = isector*2*xaxis*yaxis*num_w_planes; #ifndef USE_MPI // REDUCE WITH NO MPI for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++) #pragma omp parallel { long stride = isector * size_of_grid; #pragma omp for for (long iii=0; iii< size_fo_grid; iii++) gridtot[stride+iii] = gridss[iii]; gridtot[stride+iii] = gridss[iii]; #endif } // .................. // REDUCE WITH MPI #else // Write grid in the corresponding remote slab // Write grid in the corresponding remote slab #ifdef USE_MPI // int target_rank = (int)isector; it implied that size >= nsectors // int target_rank = (int)isector; it implied that size >= nsectors int target_rank = (int)(isector % size); int target_rank = (int)(isector % size); #ifdef ONE_SIDE #ifdef ONE_SIDE // MPI_Win_lock(MPI_LOCK_SHARED,target_rank,0,slabwin); // MPI_Accumulate(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,MPI_SUM,slabwin); // MPI_Win_unlock(target_rank,slabwin); // for every task, gridss coincides with the // for every task, gridss coincides with the // that can be avoided if shared window coincides with gridss // that can be avoided if shared window coincides with gridss TAKE_TIME(twt, tpr); memcpy(Me.win.ptr+isector*size_of_grid, gridss, size_of_grid*sizeof(double)); memcpy(Me.win.ptr+isector*size_of_grid, gridss, size_of_grid*sizeof(double)); ADD_TIME(mmove, twt, tpr); dprintf(1, global_rank, 0, "reducing sector %ld..\n", isector); TAKE_TIME( twt, tpr); reduce( isector, target_rank ); // here the reduce is performed within every host reduce( isector, target_rank ); // here the reduce is performed within every host ADD_TIME(reduce_sh, twt, tpr); if ( Me.Nhosts > 1 ) if ( Me.Nhosts > 1 ) { { Loading @@ -272,6 +279,7 @@ void gridding_data(){ if( Sector_Comm != MPI_COMM_NULL ) if( Sector_Comm != MPI_COMM_NULL ) { { double _twt_; int sector_size; int sector_size; int sector_rank = 0; int sector_rank = 0; int sector_target; int sector_target; Loading @@ -281,7 +289,10 @@ void gridding_data(){ if ( global_rank == target_rank) if ( global_rank == target_rank) { { MPI_Send( §or_rank, 1, MPI_INT, 0, 0, Sector_Comm); MPI_Send( §or_rank, 1, MPI_INT, 0, 0, Sector_Comm); memcpy(grid, Me.swins[Me.Rank[myHOST]].ptr+isector*size_of_grid*sizeof(double), size_of_grid * sizeof(double)); TAKE_TIMEwt( _twt_ ); memcpy(gridss, Me.swins[Me.Rank[myHOST]].ptr+isector*size_of_grid*sizeof(double), size_of_grid * sizeof(double)); ADD_TIMEwt( mmove, _twt_); } } if( sector_rank == 0 ) if( sector_rank == 0 ) Loading @@ -290,29 +301,35 @@ void gridding_data(){ MPI_Recv( §or_target, 1, MPI_INT, MPI_ANY_SOURCE, 0, Sector_Comm, &status); MPI_Recv( §or_target, 1, MPI_INT, MPI_ANY_SOURCE, 0, Sector_Comm, &status); } } TAKE_TIMEwt(_twt_); MPI_Bcast( §or_target, 1, MPI_INT, 0, Sector_Comm ); MPI_Bcast( §or_target, 1, MPI_INT, 0, Sector_Comm ); MPI_Reduce(grid, grid, size_of_grid, MPI_DOUBLE,MPI_SUM, sector_target, Sector_Comm); MPI_Reduce(gridss, grid, size_of_grid, MPI_DOUBLE,MPI_SUM, sector_target, Sector_Comm); MPI_Comm_free( &Sector_Comm ); MPI_Comm_free( &Sector_Comm ); ADD_TIMEwt(mpi, _twt_); } } } } ADD_TIME(reduce_mpi, twt, tpr); #else // relates to #ifdef ONE_SIDE #else // relates to #ifdef ONE_SIDE { double _twt_; TAKE_TIMEwt(_twt_); MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD); MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD); ADD_TIMEwt(mpi, _twt_); } #endif // closes #ifdef ONE_SIDE #endif // closes #ifdef ONE_SIDE #endif // closes USE_MPI #endif // closes USE_MPI clock_gettime(CLOCK_MONOTONIC, &finishk); ADD_TIME(reduce, twt_r, tpr_r); endk = clock(); timing.reduce_time += ((double) (endk - startk)) / CLOCKS_PER_SEC; timing.reduce_time1 += (finishk.tv_sec - begink.tv_sec); // wipe before getting to the next sector timing.reduce_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0; memset((void*)gridss, 0, size_of_grid * sizeof(double)); // Go to next sector for (long inull=0; inull<2*param.num_w_planes*xaxis*yaxis; inull++)gridss[inull] = 0.0; // Deallocate all sector arrays // Deallocate all sector arrays free(uus); free(uus); Loading @@ -324,25 +341,18 @@ void gridding_data(){ // End of loop over sector // End of loop over sector } } // Finalize MPI communication #ifdef USE_MPI free( histo_send ); MPI_Win_fence(0,slabwin); #endif #ifndef USE_MPI #ifndef USE_MPI fclose(file.pFile1); fclose(file.pFile1); #endif #endif #ifdef USE_MPI #ifdef USE_MPI MPI_Win_fence(0,slabwin); MPI_Barrier(MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); #endif #endif end = clock(); clock_gettime(CLOCK_MONOTONIC, &finish); timing.process_time = ((double) (end - start)) / CLOCKS_PER_SEC; timing.process_time1 = (finish.tv_sec - begin.tv_sec); timing.process_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; clock_gettime(CLOCK_MONOTONIC, &begin); } } void write_grided_data() void write_grided_data() Loading @@ -355,6 +365,7 @@ void write_grided_data() printf("WRITING GRIDDED DATA\n"); printf("WRITING GRIDDED DATA\n"); file.pFilereal = fopen (out.outfile2,"wb"); file.pFilereal = fopen (out.outfile2,"wb"); file.pFileimg = fopen (out.outfile3,"wb"); file.pFileimg = fopen (out.outfile3,"wb"); #ifdef USE_MPI #ifdef USE_MPI for (int isector=0; isector<nsectors; isector++) for (int isector=0; isector<nsectors; isector++) { { Loading @@ -377,6 +388,7 @@ void write_grided_data() fseek(file.pFilereal, global_index, SEEK_SET); fseek(file.pFilereal, global_index, SEEK_SET); fwrite(&gridss_real[index], 1, sizeof(double), file.pFilereal); fwrite(&gridss_real[index], 1, sizeof(double), file.pFilereal); } } for (int iw=0; iw<param.num_w_planes; iw++) for (int iw=0; iw<param.num_w_planes; iw++) for (int iv=0; iv<yaxis; iv++) for (int iv=0; iv<yaxis; iv++) for (int iu=0; iu<xaxis; iu++) for (int iu=0; iu<xaxis; iu++) Loading Loading @@ -424,7 +436,6 @@ void write_grided_data() #endif #endif #endif //WRITE_DATA #endif //WRITE_DATA } } Loading @@ -434,6 +445,7 @@ void reduce( int sector, int target_rank ) { { int local_rank = Me.Rank[myHOST]; int local_rank = Me.Rank[myHOST]; int target_rank_on_myhost = -1; if( Me.Ranks_to_host[ target_rank ] == Me.myhost ) if( Me.Ranks_to_host[ target_rank ] == Me.myhost ) // exchange rank 0 with target rank // exchange rank 0 with target rank Loading @@ -442,16 +454,32 @@ void reduce( int sector, int target_rank ) // every target rank // every target rank { { int r = 0; target_rank_on_myhost = 0; while( Me.Ranks_to_myhost[r] != target_rank ) while( Me.Ranks_to_myhost[target_rank_on_myhost] != target_rank ) r++; target_rank_on_myhost++; if( r > 0 ) dprintf(2, Me.Rank[myHOST], 0, "[SEC %d] swapping Host master with target rank %d (%d)\n", sector, target_rank, target_rank_on_myhost); if( target_rank_on_myhost > 0 ) // the target is not the task that already has rank 0 // on my host { { if( local_rank == 0 ) if( local_rank == 0 ) local_rank = r; local_rank = target_rank_on_myhost; if( local_rank == r ) else if( local_rank == target_rank_on_myhost ) local_rank = 0; local_rank = 0; win_t temp = Me.swins[target_rank_on_myhost]; Me.swins[target_rank_on_myhost] = Me.swins[0]; Me.swins[0] = temp; temp = Me.scwins[target_rank_on_myhost]; Me.scwins[target_rank_on_myhost] = Me.scwins[0]; Me.scwins[0] = temp; } } } } Loading @@ -470,6 +498,10 @@ void reduce( int sector, int target_rank ) if( local_rank % threshold == 0) if( local_rank % threshold == 0) { { int source = local_rank + (1<<l); int source = local_rank + (1<<l); dprintf(2, 0, 0, "[SEC %d] task %d (%d) getting data from task %d at level %d\n", sector, local_rank, Me.Rank[myHOST], source, l ); while( *(int*)(Me.scwins[source].ptr) < l ) while( *(int*)(Me.scwins[source].ptr) < l ) // sleep 5 usec if the source target is not ready // sleep 5 usec if the source target is not ready NSLEEP( 5000 ); NSLEEP( 5000 ); Loading @@ -481,8 +513,25 @@ void reduce( int sector, int target_rank ) *(int*)(Me.win_ctrl.ptr) = l; *(int*)(Me.win_ctrl.ptr) = l; } } else else { dprintf(2, 0, 0, "[SEC %d] task %d (%d) signaling that level %d is done\n", sector, local_rank, Me.Rank[myHOST], l ); *(int*)(Me.win_ctrl.ptr) = l; *(int*)(Me.win_ctrl.ptr) = l; } } } if ( target_rank_on_myhost > 0 ) { win_t temp = Me.swins[target_rank_on_myhost]; Me.swins[target_rank_on_myhost] = Me.swins[0]; Me.swins[0] = temp; temp = Me.scwins[target_rank_on_myhost]; Me.scwins[target_rank_on_myhost] = Me.scwins[0]; Me.scwins[0] = temp; } return; return; } } Loading Loading
allvars.c +4 −1 Original line number Original line Diff line number Diff line Loading @@ -7,7 +7,9 @@ struct ip in; struct op out, outparam; struct op out, outparam; struct meta metaData; struct meta metaData; struct time timing; timing_t wt_timing = {0}; timing_t pr_timing = {0}; struct parameter param; struct parameter param; struct fileData data; struct fileData data; Loading @@ -16,6 +18,7 @@ char datapath[900]; int xaxis, yaxis; int xaxis, yaxis; int global_rank; int global_rank; int size; int size; int verbose_level = 0; long nsectors; long nsectors; long startrow; long startrow; double resolution, dx, dw, w_supporth; double resolution, dx, dw, w_supporth; Loading
allvars.h +76 −9 Original line number Original line Diff line number Diff line Loading @@ -92,13 +92,24 @@ extern struct meta } metaData; } metaData; extern struct time typedef struct { { double setup; // time spent in initialization, init() double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time, phase_time; double init; // time spent in initializing arrays double setup_time1, process_time1, mpi_time1, fftw_time1, tot_time1, kernel_time1, reduce_time1, compose_time1, phase_time1; double process; // time spent in gridding; double writetime, writetime1; double mpi; // time spent in mpi communications double fftw; // } timing; double kernel; // double mmove; // double reduce; // double reduce_mpi; // double reduce_sh ; // double compose; // double phase; // double write; // double total; } timing_t; extern timing_t wt_timing; // wall-clock timings extern timing_t pr_timing; // process CPU timing extern struct parameter extern struct parameter { { Loading Loading @@ -127,12 +138,11 @@ extern char datapath[900]; extern int xaxis, yaxis; extern int xaxis, yaxis; extern int global_rank; extern int global_rank; extern int size; extern int size; extern int verbose_level; extern long nsectors; extern long nsectors; extern long startrow; extern long startrow; extern double resolution, dx, dw, w_supporth; extern double resolution, dx, dw, w_supporth; extern clock_t start, end, start0, startk, endk; extern struct timespec begin, finish, begin0, begink, finishk; extern long * histo_send, size_of_grid; extern long * histo_send, size_of_grid; extern double * grid, *gridss, *gridss_real, *gridss_img, *gridss_w; extern double * grid, *gridss, *gridss_real, *gridss_img, *gridss_w; Loading @@ -142,3 +152,60 @@ extern double * grid, *gridss, *gridss_real, *gridss_img, *gridss_w; #endif #endif extern long **sectorarray; extern long **sectorarray; #if defined(DEBUG) #define dprintf(LEVEL, T, t, ...) if( (verbose_level >= (LEVEL)) && \ ( ((t) ==-1 ) || ((T)==(t)) ) ) { \ printf(__VA_ARGS__); fflush(stdout); } #else #define dprintf(...) #endif #define CPU_TIME_wt ({ struct timespec myts; (clock_gettime( CLOCK_REALTIME, &myts ), (double)myts.tv_sec + (double)myts.tv_nsec * 1e-9);}) #define CPU_TIME_pr ({ struct timespec myts; (clock_gettime( CLOCK_PROCESS_CPUTIME_ID, &myts ), (double)myts.tv_sec + (double)myts.tv_nsec * 1e-9);}) #define CPU_TIME_th ({ struct timespec myts; (clock_gettime( CLOCK_THREAD_CPUTIME_ID, &myts ), (double)myts.tv_sec + (double)myts.tv_nsec * 1e-9);}) #if defined(_OPENMP) #define TAKE_TIME_START( T ) { \ wt_timing.T = CPU_TIME_wt; \ pr_timing.T = CPU_TIME_pr; } #define TAKE_TIME_STOP( T ) { \ pr_timing.T = CPU_TIME_pr - pr_timing.T; \ wt_timing.T = CPU_TIME_wt - wt_timing.T; } #define TAKE_TIME( Twt, Tpr ) { Twt = CPU_TIME_wt; Tpr = CPU_TIME_pr; } #define ADD_TIME( T, Twt, Tpr ) { \ pr_timing.T += CPU_TIME_pr - Tpr; \ wt_timing.T += CPU_TIME_wt - Twt; \ Tpr = CPU_TIME_pr; Twt = CPU_TIME_wt; } #else #define TAKE_TIME_START( T ) wt_timing.T = CPU_TIME_wt #define TAKE_TIME_STOP( T ) wt_timing.T = CPU_TIME_wt - wt_timing.T #define TAKE_TIME( Twt, ... ) Twt = CPU_TIME_wt; #define ADD_TIME( T, Twt, ... ) { wt_timing.T += CPU_TIME_wt - Twt; Twt = CPU_TIME_wt;} #endif #define TAKE_TIMEwt_START( T) wt_timing.T = CPU_TIME_wt #define TAKE_TIMEwt_STOP( T) wt_timing.T = CPU_TIME_wt - wt_timing.T #define TAKE_TIMEwt( Twt ) Twt = CPU_TIME_wt; #define ADD_TIMEwt( T, Twt ) { wt_timing.T += CPU_TIME_wt - Twt; Twt = CPU_TIME_wt; } #if defined(__GNUC__) && !defined(__ICC) && !defined(__INTEL_COMPILER) #define PRAGMA_IVDEP _Pragma("GCC ivdep") #else #define PRAGMA_IVDEP _Pragma("ivdep") #endif #define STRINGIFY(a) #a #define UNROLL(N) _Pragma(STRINGIFY(unroll(N)))
data/paramfile.txt +5 −4 Original line number Original line Diff line number Diff line ndatasets 1 ndatasets 1 Datapath1 /beegfs/lofar/cgheller/L798046_SB244_uv.uncorr_130B27932t_121MHz.pre-cal.binMS/ Datapath1 ../input/ Datapath2 /beegfs/lofar/cgheller/L798046_SB244_uv.uncorr_130B27932t_123MHz.pre-cal.binMS/ Datapath2 ../input/ Datapath3 /beegfs/lofar/cgheller/L798046_SB244_uv.uncorr_130B27932t_125MHz.pre-cal.binMS/ Datapath3 ../input/ num_threads 2 num_threads 2 w_support 7 w_support 7 grid_size_x 2048 grid_size_x 2048 Loading @@ -24,3 +24,4 @@ fftfile3 fft_img.bin logfile run.log logfile run.log extension .txt extension .txt timingfile timings.dat timingfile timings.dat verbose_level 1
fourier_transform.c +209 −203 Original line number Original line Diff line number Diff line Loading @@ -2,13 +2,15 @@ #include "allvars.h" #include "allvars.h" #include "proto.h" #include "proto.h" void fftw_data(){ void fftw_data() { #ifdef USE_FFTW #ifdef USE_FFTW // FFT transform the data (using distributed FFTW) // FFT transform the data (using distributed FFTW) if(global_rank == 0)printf("PERFORMING FFT\n"); if(global_rank == 0)printf("PERFORMING FFT\n"); clock_gettime(CLOCK_MONOTONIC, &begin); start = clock(); TAKE_TIME_START(fftw); fftw_plan plan; fftw_plan plan; fftw_complex *fftwgrid; fftw_complex *fftwgrid; ptrdiff_t alloc_local, local_n0, local_0_start; ptrdiff_t alloc_local, local_n0, local_0_start; Loading Loading @@ -64,21 +66,23 @@ void fftw_data(){ MPI_Barrier(MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); #endif #endif end = clock(); TAKE_TIME_STOP(fftw); clock_gettime(CLOCK_MONOTONIC, &finish); timing.fftw_time = ((double) (end - start)) / CLOCKS_PER_SEC; timing.fftw_time1 = (finish.tv_sec - begin.tv_sec); timing.fftw_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; clock_gettime(CLOCK_MONOTONIC, &begin); #endif #endif } } void write_fftw_data(){ void write_fftw_data(){ // Write results #ifdef USE_FFTW #ifdef USE_FFTW double twt, tpr; #ifdef WRITE_DATA #ifdef WRITE_DATA // Write results TAKE_TIME(twt, tpr); #ifdef USE_MPI #ifdef USE_MPI MPI_Win writewin; MPI_Win writewin; MPI_Win_create(gridss, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &writewin); MPI_Win_create(gridss, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &writewin); Loading Loading @@ -150,27 +154,28 @@ void write_fftw_data(){ MPI_Win_free(&writewin); MPI_Win_free(&writewin); MPI_Barrier(MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); #endif #endif ADD_TIME(write, twt, tpr); #endif //WRITE_DATA #endif //WRITE_DATA // Phase correction // Phase correction clock_gettime(CLOCK_MONOTONIC, &begin); start = clock(); TAKE_TIME_START(phase) if(global_rank == 0)printf("PHASE CORRECTION\n"); if(global_rank == 0)printf("PHASE CORRECTION\n"); double* image_real = (double*) calloc(xaxis*yaxis,sizeof(double)); double* image_real = (double*) calloc(xaxis*yaxis,sizeof(double)); double* image_imag = (double*) calloc(xaxis*yaxis,sizeof(double)); double* image_imag = (double*) calloc(xaxis*yaxis,sizeof(double)); phase_correction(gridss,image_real,image_imag,xaxis,yaxis,param.num_w_planes,param.grid_size_x,param.grid_size_y,resolution,metaData.wmin,metaData.wmax,param.num_threads); phase_correction(gridss,image_real,image_imag,xaxis,yaxis,param.num_w_planes,param.grid_size_x,param.grid_size_y,resolution,metaData.wmin,metaData.wmax,param.num_threads); end = clock(); TAKE_TIME_STOP(phase); clock_gettime(CLOCK_MONOTONIC, &finish); timing.phase_time = ((double) (end - start)) / CLOCKS_PER_SEC; timing.phase_time1 = (finish.tv_sec - begin.tv_sec); timing.phase_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; #ifdef WRITE_IMAGE #ifdef WRITE_IMAGE TAKE_TIME(twt, tpr); if(global_rank == 0) if(global_rank == 0) { { file.pFilereal = fopen (out.fftfile2,"wb"); file.pFilereal = fopen (out.fftfile2,"wb"); Loading Loading @@ -208,6 +213,7 @@ void write_fftw_data(){ MPI_Barrier(MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); #endif #endif ADD_TIME(write, twt, tpr); #endif //WRITE_IMAGE #endif //WRITE_IMAGE #endif //FFTW #endif //FFTW Loading
gridding.c +404 −355 Original line number Original line Diff line number Diff line Loading @@ -4,35 +4,35 @@ #include "proto.h" #include "proto.h" void gridding(){ void gridding() { if(global_rank == 0)printf("GRIDDING DATA\n"); if(global_rank == 0)printf("GRIDDING DATA\n"); // Create histograms and linked lists // Create histograms and linked lists clock_gettime(CLOCK_MONOTONIC, &begin); TAKE_TIME_START(init); start = clock(); // Initialize linked list // Initialize linked list initialize_array(); initialize_array(); TAKE_TIME_STOP(init); TAKE_TIME_START(process); // Sector and Gridding data // Sector and Gridding data gridding_data(); gridding_data(); TAKE_TIME_STOP(process); #ifdef USE_MPI #ifdef USE_MPI MPI_Barrier(MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); #endif #endif end = clock(); return; clock_gettime(CLOCK_MONOTONIC, &finish); timing.process_time = ((double) (end - start)) / CLOCKS_PER_SEC; timing.process_time1 = (finish.tv_sec - begin.tv_sec); timing.process_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; clock_gettime(CLOCK_MONOTONIC, &begin); } } void initialize_array(){ void initialize_array() { histo_send = (long*) calloc(nsectors+1,sizeof(long)); histo_send = (long*) calloc(nsectors+1,sizeof(long)); int * boundary = (int*) calloc(metaData.Nmeasures,sizeof(int)); int * boundary = (int*) calloc(metaData.Nmeasures,sizeof(int)); Loading Loading @@ -79,13 +79,16 @@ void initialize_array(){ #ifdef VERBOSE #ifdef VERBOSE for (int iii=0; iii<nsectors+1; iii++)printf("HISTO %d %d %ld\n", global_rank, iii, histo_send[iii]); for (int iii=0; iii<nsectors+1; iii++)printf("HISTO %d %d %ld\n", global_rank, iii, histo_send[iii]); #endif #endif free( boundary); return; } } void gridding_data(){ void gridding_data() { double shift = (double)(dx*yaxis); double shift = (double)(dx*yaxis); // Open the MPI Memory Window for the slab #ifdef USE_MPI #ifdef USE_MPI MPI_Win_create(grid, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &slabwin); MPI_Win_create(grid, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &slabwin); MPI_Win_fence(0,slabwin); MPI_Win_fence(0,slabwin); Loading @@ -95,24 +98,20 @@ void gridding_data(){ file.pFile1 = fopen (out.outfile1,"w"); file.pFile1 = fopen (out.outfile1,"w"); #endif #endif timing.kernel_time = 0.0; timing.kernel_time1 = 0.0; timing.reduce_time = 0.0; timing.reduce_time1 = 0.0; timing.compose_time = 0.0; timing.compose_time1 = 0.0; // calculate the resolution in radians // calculate the resolution in radians resolution = 1.0/MAX(abs(metaData.uvmin),abs(metaData.uvmax)); resolution = 1.0/MAX(abs(metaData.uvmin),abs(metaData.uvmax)); // calculate the resolution in arcsec // calculate the resolution in arcsec double resolution_asec = (3600.0*180.0)/MAX(abs(metaData.uvmin),abs(metaData.uvmax))/PI; double resolution_asec = (3600.0*180.0)/MAX(abs(metaData.uvmin),abs(metaData.uvmax))/PI; if( global_rank == 0 ) printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec); printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec); for (long isector = 0; isector < nsectors; isector++) for (long isector = 0; isector < nsectors; isector++) { { clock_gettime(CLOCK_MONOTONIC, &begink); double twt, tpr; startk = clock(); TAKE_TIME(twt, tpr); // define local destination sector // define local destination sector //isector = (isector_count+rank)%size; // this line must be wrong! [LT] //isector = (isector_count+rank)%size; // this line must be wrong! [LT] Loading @@ -132,6 +131,7 @@ void gridding_data(){ long ip = 0; long ip = 0; long inu = 0; long inu = 0; #warning shall we omp-ize this ? for(long iphi = histo_send[isector]-1; iphi>=0; iphi--) for(long iphi = histo_send[isector]-1; iphi>=0; iphi--) { { long ilocal = sectorarray[isector][iphi]; long ilocal = sectorarray[isector][iphi]; Loading @@ -141,33 +141,32 @@ void gridding_data(){ uus[icount] = data.uu[ilocal]; uus[icount] = data.uu[ilocal]; vvs[icount] = data.vv[ilocal]-isector*shift; vvs[icount] = data.vv[ilocal]-isector*shift; wws[icount] = data.ww[ilocal]; wws[icount] = data.ww[ilocal]; for (long ipol=0; ipol<metaData.polarisations; ipol++) UNROLL(4) PRAGMA_IVDEP for (long ipol=0; ipol<metaData.polarisations; ipol++, ip++) { { weightss[ip] = data.weights[ilocal*metaData.polarisations+ipol]; weightss[ip] = data.weights[ilocal*metaData.polarisations+ipol]; ip++; } } for (long ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++) PRAGMA_IVDEP UNROLL(4) for (long ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++, inu++) { { visreals[inu] = data.visreal[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; visreals[inu] = data.visreal[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; visimgs[inu] = data.visimg[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; visimgs[inu] = data.visimg[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; //if(visimgs[inu]>1e10 || visimgs[inu]<-1e10)printf("%f %f %ld %ld %d %ld %ld\n",visreals[inu],visimgs[inu],inu,Nvissec,rank,ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq,metaData.Nvis); //if(visimgs[inu]>1e10 || visimgs[inu]<-1e10)printf("%f %f %ld %ld %d %ld %ld\n",visreals[inu],visimgs[inu],inu,Nvissec,rank,ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq,metaData.Nvis); inu++; } } icount++; icount++; } } clock_gettime(CLOCK_MONOTONIC, &finishk); ADD_TIME(compose, twt, tpr); endk = clock(); timing.compose_time += ((double) (endk - startk)) / CLOCKS_PER_SEC; timing.compose_time1 += (finishk.tv_sec - begink.tv_sec); timing.compose_time1 += (finishk.tv_sec - begink.tv_sec); timing.compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0; #ifndef USE_MPI #ifndef USE_MPI double vvmin = 1e20; double vvmin = 1e20; double uumax = -1e20; double uumax = -1e20; double vvmax = -1e20; double vvmax = -1e20; #warning shall we omp-ize this ? for (long ipart=0; ipart<Nsec; ipart++) for (long ipart=0; ipart<Nsec; ipart++) { { uumin = MIN(uumin,uus[ipart]); uumin = MIN(uumin,uus[ipart]); Loading @@ -186,8 +185,7 @@ void gridding_data(){ #ifdef VERBOSE #ifdef VERBOSE printf("Processing sector %ld\n",isector); printf("Processing sector %ld\n",isector); #endif #endif clock_gettime(CLOCK_MONOTONIC, &begink); TAKE_TIME(twt, tpr); startk = clock(); wstack(param.num_w_planes, wstack(param.num_w_planes, Nsec, Nsec, Loading @@ -207,6 +205,8 @@ void gridding_data(){ gridss, gridss, param.num_threads); param.num_threads); ADD_TIME(kernel, twt, tpr); /* int z =0 ; /* int z =0 ; * #pragma omp target map(to:test_i_gpu) map(from:z) * #pragma omp target map(to:test_i_gpu) map(from:z) * { * { Loading @@ -215,42 +215,49 @@ void gridding_data(){ * z = x + test_i_gpu; * z = x + test_i_gpu; * }*/ * }*/ clock_gettime(CLOCK_MONOTONIC, &finishk); endk = clock(); timing.kernel_time += ((double) (endk - startk)) / CLOCKS_PER_SEC; timing.kernel_time1 += (finishk.tv_sec - begink.tv_sec); timing.kernel_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0; #ifdef VERBOSE #ifdef VERBOSE printf("Processed sector %ld\n",isector); printf("Processed sector %ld\n",isector); #endif #endif clock_gettime(CLOCK_MONOTONIC, &begink); /* ---------------- startk = clock(); * REDUCE * ---------------- */ //for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++)printf("--> %f\n",gridss[iii]); double twt_r, tpr_r; TAKE_TIME(twt_r, tpr_r); #ifndef USE_MPI // .................. long stride = isector*2*xaxis*yaxis*num_w_planes; #ifndef USE_MPI // REDUCE WITH NO MPI for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++) #pragma omp parallel { long stride = isector * size_of_grid; #pragma omp for for (long iii=0; iii< size_fo_grid; iii++) gridtot[stride+iii] = gridss[iii]; gridtot[stride+iii] = gridss[iii]; #endif } // .................. // REDUCE WITH MPI #else // Write grid in the corresponding remote slab // Write grid in the corresponding remote slab #ifdef USE_MPI // int target_rank = (int)isector; it implied that size >= nsectors // int target_rank = (int)isector; it implied that size >= nsectors int target_rank = (int)(isector % size); int target_rank = (int)(isector % size); #ifdef ONE_SIDE #ifdef ONE_SIDE // MPI_Win_lock(MPI_LOCK_SHARED,target_rank,0,slabwin); // MPI_Accumulate(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,MPI_SUM,slabwin); // MPI_Win_unlock(target_rank,slabwin); // for every task, gridss coincides with the // for every task, gridss coincides with the // that can be avoided if shared window coincides with gridss // that can be avoided if shared window coincides with gridss TAKE_TIME(twt, tpr); memcpy(Me.win.ptr+isector*size_of_grid, gridss, size_of_grid*sizeof(double)); memcpy(Me.win.ptr+isector*size_of_grid, gridss, size_of_grid*sizeof(double)); ADD_TIME(mmove, twt, tpr); dprintf(1, global_rank, 0, "reducing sector %ld..\n", isector); TAKE_TIME( twt, tpr); reduce( isector, target_rank ); // here the reduce is performed within every host reduce( isector, target_rank ); // here the reduce is performed within every host ADD_TIME(reduce_sh, twt, tpr); if ( Me.Nhosts > 1 ) if ( Me.Nhosts > 1 ) { { Loading @@ -272,6 +279,7 @@ void gridding_data(){ if( Sector_Comm != MPI_COMM_NULL ) if( Sector_Comm != MPI_COMM_NULL ) { { double _twt_; int sector_size; int sector_size; int sector_rank = 0; int sector_rank = 0; int sector_target; int sector_target; Loading @@ -281,7 +289,10 @@ void gridding_data(){ if ( global_rank == target_rank) if ( global_rank == target_rank) { { MPI_Send( §or_rank, 1, MPI_INT, 0, 0, Sector_Comm); MPI_Send( §or_rank, 1, MPI_INT, 0, 0, Sector_Comm); memcpy(grid, Me.swins[Me.Rank[myHOST]].ptr+isector*size_of_grid*sizeof(double), size_of_grid * sizeof(double)); TAKE_TIMEwt( _twt_ ); memcpy(gridss, Me.swins[Me.Rank[myHOST]].ptr+isector*size_of_grid*sizeof(double), size_of_grid * sizeof(double)); ADD_TIMEwt( mmove, _twt_); } } if( sector_rank == 0 ) if( sector_rank == 0 ) Loading @@ -290,29 +301,35 @@ void gridding_data(){ MPI_Recv( §or_target, 1, MPI_INT, MPI_ANY_SOURCE, 0, Sector_Comm, &status); MPI_Recv( §or_target, 1, MPI_INT, MPI_ANY_SOURCE, 0, Sector_Comm, &status); } } TAKE_TIMEwt(_twt_); MPI_Bcast( §or_target, 1, MPI_INT, 0, Sector_Comm ); MPI_Bcast( §or_target, 1, MPI_INT, 0, Sector_Comm ); MPI_Reduce(grid, grid, size_of_grid, MPI_DOUBLE,MPI_SUM, sector_target, Sector_Comm); MPI_Reduce(gridss, grid, size_of_grid, MPI_DOUBLE,MPI_SUM, sector_target, Sector_Comm); MPI_Comm_free( &Sector_Comm ); MPI_Comm_free( &Sector_Comm ); ADD_TIMEwt(mpi, _twt_); } } } } ADD_TIME(reduce_mpi, twt, tpr); #else // relates to #ifdef ONE_SIDE #else // relates to #ifdef ONE_SIDE { double _twt_; TAKE_TIMEwt(_twt_); MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD); MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD); ADD_TIMEwt(mpi, _twt_); } #endif // closes #ifdef ONE_SIDE #endif // closes #ifdef ONE_SIDE #endif // closes USE_MPI #endif // closes USE_MPI clock_gettime(CLOCK_MONOTONIC, &finishk); ADD_TIME(reduce, twt_r, tpr_r); endk = clock(); timing.reduce_time += ((double) (endk - startk)) / CLOCKS_PER_SEC; timing.reduce_time1 += (finishk.tv_sec - begink.tv_sec); // wipe before getting to the next sector timing.reduce_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0; memset((void*)gridss, 0, size_of_grid * sizeof(double)); // Go to next sector for (long inull=0; inull<2*param.num_w_planes*xaxis*yaxis; inull++)gridss[inull] = 0.0; // Deallocate all sector arrays // Deallocate all sector arrays free(uus); free(uus); Loading @@ -324,25 +341,18 @@ void gridding_data(){ // End of loop over sector // End of loop over sector } } // Finalize MPI communication #ifdef USE_MPI free( histo_send ); MPI_Win_fence(0,slabwin); #endif #ifndef USE_MPI #ifndef USE_MPI fclose(file.pFile1); fclose(file.pFile1); #endif #endif #ifdef USE_MPI #ifdef USE_MPI MPI_Win_fence(0,slabwin); MPI_Barrier(MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); #endif #endif end = clock(); clock_gettime(CLOCK_MONOTONIC, &finish); timing.process_time = ((double) (end - start)) / CLOCKS_PER_SEC; timing.process_time1 = (finish.tv_sec - begin.tv_sec); timing.process_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; clock_gettime(CLOCK_MONOTONIC, &begin); } } void write_grided_data() void write_grided_data() Loading @@ -355,6 +365,7 @@ void write_grided_data() printf("WRITING GRIDDED DATA\n"); printf("WRITING GRIDDED DATA\n"); file.pFilereal = fopen (out.outfile2,"wb"); file.pFilereal = fopen (out.outfile2,"wb"); file.pFileimg = fopen (out.outfile3,"wb"); file.pFileimg = fopen (out.outfile3,"wb"); #ifdef USE_MPI #ifdef USE_MPI for (int isector=0; isector<nsectors; isector++) for (int isector=0; isector<nsectors; isector++) { { Loading @@ -377,6 +388,7 @@ void write_grided_data() fseek(file.pFilereal, global_index, SEEK_SET); fseek(file.pFilereal, global_index, SEEK_SET); fwrite(&gridss_real[index], 1, sizeof(double), file.pFilereal); fwrite(&gridss_real[index], 1, sizeof(double), file.pFilereal); } } for (int iw=0; iw<param.num_w_planes; iw++) for (int iw=0; iw<param.num_w_planes; iw++) for (int iv=0; iv<yaxis; iv++) for (int iv=0; iv<yaxis; iv++) for (int iu=0; iu<xaxis; iu++) for (int iu=0; iu<xaxis; iu++) Loading Loading @@ -424,7 +436,6 @@ void write_grided_data() #endif #endif #endif //WRITE_DATA #endif //WRITE_DATA } } Loading @@ -434,6 +445,7 @@ void reduce( int sector, int target_rank ) { { int local_rank = Me.Rank[myHOST]; int local_rank = Me.Rank[myHOST]; int target_rank_on_myhost = -1; if( Me.Ranks_to_host[ target_rank ] == Me.myhost ) if( Me.Ranks_to_host[ target_rank ] == Me.myhost ) // exchange rank 0 with target rank // exchange rank 0 with target rank Loading @@ -442,16 +454,32 @@ void reduce( int sector, int target_rank ) // every target rank // every target rank { { int r = 0; target_rank_on_myhost = 0; while( Me.Ranks_to_myhost[r] != target_rank ) while( Me.Ranks_to_myhost[target_rank_on_myhost] != target_rank ) r++; target_rank_on_myhost++; if( r > 0 ) dprintf(2, Me.Rank[myHOST], 0, "[SEC %d] swapping Host master with target rank %d (%d)\n", sector, target_rank, target_rank_on_myhost); if( target_rank_on_myhost > 0 ) // the target is not the task that already has rank 0 // on my host { { if( local_rank == 0 ) if( local_rank == 0 ) local_rank = r; local_rank = target_rank_on_myhost; if( local_rank == r ) else if( local_rank == target_rank_on_myhost ) local_rank = 0; local_rank = 0; win_t temp = Me.swins[target_rank_on_myhost]; Me.swins[target_rank_on_myhost] = Me.swins[0]; Me.swins[0] = temp; temp = Me.scwins[target_rank_on_myhost]; Me.scwins[target_rank_on_myhost] = Me.scwins[0]; Me.scwins[0] = temp; } } } } Loading @@ -470,6 +498,10 @@ void reduce( int sector, int target_rank ) if( local_rank % threshold == 0) if( local_rank % threshold == 0) { { int source = local_rank + (1<<l); int source = local_rank + (1<<l); dprintf(2, 0, 0, "[SEC %d] task %d (%d) getting data from task %d at level %d\n", sector, local_rank, Me.Rank[myHOST], source, l ); while( *(int*)(Me.scwins[source].ptr) < l ) while( *(int*)(Me.scwins[source].ptr) < l ) // sleep 5 usec if the source target is not ready // sleep 5 usec if the source target is not ready NSLEEP( 5000 ); NSLEEP( 5000 ); Loading @@ -481,8 +513,25 @@ void reduce( int sector, int target_rank ) *(int*)(Me.win_ctrl.ptr) = l; *(int*)(Me.win_ctrl.ptr) = l; } } else else { dprintf(2, 0, 0, "[SEC %d] task %d (%d) signaling that level %d is done\n", sector, local_rank, Me.Rank[myHOST], l ); *(int*)(Me.win_ctrl.ptr) = l; *(int*)(Me.win_ctrl.ptr) = l; } } } if ( target_rank_on_myhost > 0 ) { win_t temp = Me.swins[target_rank_on_myhost]; Me.swins[target_rank_on_myhost] = Me.swins[0]; Me.swins[0] = temp; temp = Me.scwins[target_rank_on_myhost]; Me.scwins[target_rank_on_myhost] = Me.scwins[0]; Me.scwins[0] = temp; } return; return; } } Loading