Commit 71bccbe1 authored by Nandhana Sakhtivel's avatar Nandhana Sakhtivel
Browse files

Binomial Communication added but not working

parent 34c4d74f
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -45,3 +45,5 @@ int Ntasks_local;
/* Tables of window pointers used by the shared-memory reduction; they are
 * filled in gridding_data() through copy_win_ptrs() from Me.swins (data
 * windows) and Me.scwins (control windows). */
double **swins      = NULL;
int    **cwins      = NULL;

/* Depth used by the binomial reduction; gridding_data() raises it until
 * (1 << max_level) is not smaller than the number of shared-memory tasks. */
int      max_level  = 0;

/* Cursors inside the local window Me.win, set in gridding_data():
 *   end_4      = window start + dsize_4       (end of the 4-way unrolled part)
 *   end_reduce = window start + size_of_grid  (end of the data to be moved)  */
double  *end_4      = NULL;
double  *end_reduce = NULL;

/* size_of_grid rounded down to a multiple of 4 (set in gridding_data()). */
int      dsize_4    = 0;

/* Counter incremented by reduce_binomial() after each inter-host reduce. */
int      iter       = 0;
+2 −0
Original line number Diff line number Diff line
@@ -268,3 +268,5 @@ extern blocks_t blocks;
/* Window-pointer tables for the shared-memory reduction (data / control). */
extern double **swins;
extern int    **cwins;

/* Depth of the binomial reduction. */
extern int      max_level;

/* Cursors inside the local shared window: end of the 4-way unrolled section
 * and end of the data to be moved. */
extern double  *end_4;
extern double  *end_reduce;

/* size_of_grid rounded down to a multiple of 4. */
extern int      dsize_4;

/* Counter of completed inter-host reductions. */
extern int      iter;
+12 −9
Original line number Diff line number Diff line
@@ -136,12 +136,14 @@ void gridding_data()
        copy_win_ptrs( (void***)&swins, Me.swins, Me.Ntasks[Me.SHMEMl] );
        copy_win_ptrs( (void***)&cwins, Me.scwins, Me.Ntasks[Me.SHMEMl] );

        int     dsize_4 = (size_of_grid/4)*4;
        double *end_4   = (double*)Me.win.ptr + dsize_4;
        double *end     = (double*)Me.win.ptr + datasize;
   	MPI_Barrier(MPI_COMM_WORLD);
       // printf("The no of task in shared memory %d, host %d\n", Me.Ntasks[Me.SHMEMl], Me.Ntasks[myHOST]);
        dsize_4 = (size_of_grid/4)*4;
        end_4   = (double*)Me.win.ptr + dsize_4;
        end_reduce  = (double*)Me.win.ptr + size_of_grid;

      	while( (1<< (++max_level) ) < Me.Ntasks[Me.SHMEMl] );
      
       // printf("Max level %d my rank %d\n",max_level, global_rank);      
      	*(int*)Me.win_ctrl.ptr     = DATA_FREE;
      	*((int*)Me.win_ctrl.ptr+1) = FINAL_FREE;
      	MPI_Barrier(*(Me.COMM[myHOST]));
@@ -298,7 +300,8 @@ void gridding_data()
       
     #ifdef ONE_SIDE

     printf("One Side communication active\n");
     printf("One Side communication active");
     //printf("One Side communication active my rank %d target rank %d\n", global_rank, target_rank);

     //MPI_Win_lock(MPI_LOCK_SHARED,target_rank,0,slabwin);
     //MPI_Accumulate(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,MPI_SUM,slabwin);
@@ -349,10 +352,10 @@ void gridding_data()

        MPI_Barrier(MPI_COMM_WORLD);

        if(Me.Nhosts>1)
 //       if(Me.Nhosts>1)
		memcpy(Me.sfwins[Me.Rank[myHOST]].ptr, grid, size_of_grid);
        else
        	memcpy(Me.sfwins[global_rank].ptr, grid, size_of_grid);
   //     else
     //   	memcpy(Me.sfwins[global_rank].ptr, grid, size_of_grid);

    #endif

+3 −1
Original line number Diff line number Diff line
@@ -31,3 +31,5 @@ void write_result();
/* Top-level reduction drivers: reduce the local grid towards `target_rank`. */
int reduce_ring     ( int target_rank );
int reduce_binomial ( int target_rank );

/* Shared-memory (intra-node) reduction kernels called by the drivers above. */
int shmem_reduce_ring    ( int sector, int target_rank, int_t size_of_grid,
                           map_t *Me, double * restrict data, blocks_t *blocks );
int shmem_reduce_binomial( int sector, int target_rank, int dsize,
                           map_t *Me, double * restrict data, int max_level );
+168 −13
Original line number Diff line number Diff line
@@ -165,6 +165,148 @@ int reduce_ring (int target_rank)
  return 0;
}

int reduce_binomial ( int target_rank )
{


	/* -------------------------------------------------
  	 *
  	 *  USE THE SHARED MEMORY WINDOWS TO REDUCE DATA
 	 * ------------------------------------------------- */

      {
	  timing.rtime  = CPU_TIME_rt;
	  timing.ttotal = CPU_TIME_pr;
	 #pragma omp parallel num_threads(2)
	  {
	    int thid = omp_get_thread_num();



		if( thid == 1 )
		  {
		                                                                       // check that the data in Me.win
		                                                                       // can be overwritten by new data 		                                                                       		                                                                       // -> this condition is true when                                                          		                                                                       		                                                                       // win_ctrl has the value "DATA_FREE"

                      ACQUIRE_CTRL((int*)Me.win_ctrl.ptr, DATA_FREE, timing.tspin, != )
                      memcpy(Me.win.ptr, gridss, sizeof(gridss));
                      if( Me.Ntasks[myHOST] > 1 )
		      {
				int value = target_rank * (max_level+1);
				atomic_store((int*)Me.win_ctrl.ptr, value);
			
				double start = CPU_TIME_tr;
                               // printf("Im before shmem_reduce my rank %d target rank %d size_of_grid %d\n", global_rank, target_rank, size_of_grid);
				int ret = shmem_reduce_binomial( target_rank, target_rank, size_of_grid, &Me, (double*)Me.win.ptr, max_level );
				//printf("Im after shmem_reduce my rank %d target rank %d\n", global_rank, target_rank);
				timing.treduce += CPU_TIME_tr - start;
				if( ret != 0 )
			  	{
			    		printf("Task %d : shared-memory reduce for sector %d has returned "
				   		"an error code %d : better stop here\n",
				  		 global_rank, target_rank, ret );
			    		free(cwins);
			    		free(swins);
			    		numa_shutdown(global_rank, 0, &MYMPI_COMM_WORLD, &Me);
			    		MPI_Finalize();
			  	}
				
		      }
		    else
		      atomic_store((int*)Me.win_ctrl.ptr, DATA_FREE);

		    int Im_target = (global_rank == target_rank);
		    int Im_NOT_target_but_Im_master = (Me.Nhosts>1) &&
		      (Me.Ranks_to_host[target_rank]!=Me.myhost) && (Me.Rank[myHOST]==0);

                    if( Im_target || Im_NOT_target_but_Im_master )
		      {
			ACQUIRE_CTRL((int*)Me.win_ctrl.ptr+1, FINAL_FREE, timing.tspin, != );
			double start = CPU_TIME_tr;
			double * restrict final = (double*)Me.win.ptr + size_of_grid;
			double * restrict run   = (double*)Me.win.ptr;
			for( ; run < end_4; run += 4, final += 4 ) {
			  *final     = *run;
			  *(final+1) = *(run+1);
			  *(final+2) = *(run+2);
			  *(final+3) = *(run+3); }
			for( ; run < end_reduce; run++, final++ )
			  *final = *run;
			timing.tmovmemory += CPU_TIME_tr - start;
                         printf("Im inside I'm target my rank %d target rank %d\n", global_rank, target_rank);

			atomic_store(((int*)Me.win_ctrl.ptr+1), target_rank);
			atomic_store((int*)Me.win_ctrl.ptr, DATA_FREE);
			atomic_thread_fence(memory_order_release);
		      }

		  }
                  else
		  {
		    //MPI_Barrier(*Me.COMM[myHOST]);
		    /*
		     *
		     *  REDUCE AMONG HOSTS
		     */

                     if ( (Me.Nhosts > 1) && (Me.Rank[myHOST] == 0) )
		      {
			double start = CPU_TIME_tr;

			int target_task       = Me.Ranks_to_host[target_rank];
			int Im_hosting_target = Me.Ranks_to_host[target_rank] == Me.myhost;
			int target            = 0;

			if( Im_hosting_target )
			  while( (target < Me.Ntasks[Me.SHMEMl]) &&
				 (Me.Ranks_to_myhost[target] != target_rank) )
			    target++;


			int    *ctrl_ptr    = ( target == 0 ? (int*)Me.win_ctrl.ptr+1 : ((int*)Me.scwins[target].ptr)+1 );

			double *send_buffer = ( Im_hosting_target ? (double*)Me.swins[target].ptr+size_of_grid :
						(double*)Me.win.ptr+size_of_grid );
			double *recv_buffer = ( Im_hosting_target ? (double*)Me.sfwins[target].ptr : NULL );

			timingmpi.tmpi_setup += CPU_TIME_tr - start;

			double tstart = CPU_TIME_tr;

			ACQUIRE_CTRL( ctrl_ptr, target_rank, timing.tspin, != );

			timingmpi.tmpi_reduce_wait += CPU_TIME_tr - tstart;

			tstart = CPU_TIME_tr;
			MPI_Ireduce(send_buffer, recv_buffer, size_of_grid, MPI_DOUBLE, MPI_SUM, target_task, COMM[HOSTS], &requests[target_rank]);
			timingmpi.tmpi_reduce += CPU_TIME_tr - tstart;

			MPI_Wait( &requests[target_rank], MPI_STATUS_IGNORE );
			atomic_store(ctrl_ptr, FINAL_FREE);

			iter++;
                        timingmpi.tmpi += CPU_TIME_tr - start;
			fflush(stdout);
		      }

		  } // closes thread 0
                  atomic_thread_fence(memory_order_release);


	  }
	  timing.rtime  = CPU_TIME_rt - timing.rtime;
	  timing.ttotal = CPU_TIME_pr - timing.ttotal;

	  free(cwins);
	  free(swins);


	}

  return 0;
}
      

int shmem_reduce_ring( int sector, int target_rank, int_t size_of_grid, map_t *Me, double * restrict data, blocks_t *blocks )
 {

@@ -315,9 +457,10 @@ int shmem_reduce_ring( int sector, int target_rank, int_t size_of_grid, map_t *M
   return 0;
 }

int shmem_reduce( int sector, int target_rank, int dsize, map_t *Me, double * restrict data, int max_level )
int shmem_reduce_binomial( int sector, int target_rank, int dsize, map_t *Me, double * restrict data, int max_level )
 {

   //printf("Im inside shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);
   int local_rank = Me->Rank[Me->SHMEMl];
   int target_rank_on_myhost = -1;

@@ -354,7 +497,7 @@ int shmem_reduce( int sector, int target_rank, int dsize, map_t *Me, double * re
       if( target_rank_on_myhost == Me->Ntasks[Me->SHMEMl] )
	 return -1;
      }

     // printf("Im after ist if  shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);
      // Here we start the reduction
      //    

@@ -364,6 +507,7 @@ int shmem_reduce( int sector, int target_rank, int dsize, map_t *Me, double * re
      int my_maxlevel = max_level;
      while( (local_rank % (1<<my_maxlevel)) ) my_maxlevel--;

      //printf("my max_level %d max level %d my rank %d\n", my_maxlevel, max_level, global_rank);
      dprintf(1, 0, 0, "@ SEC %d t %d (%d), %d %d\n",
      sector, local_rank, oRank, *(int*)Me->win_ctrl.ptr, my_maxlevel);
     
@@ -383,13 +527,13 @@ int shmem_reduce( int sector, int target_rank, int dsize, map_t *Me, double * re
	 // 
	 {
	   int I_m_target = local_rank < source;
	   
	//   printf("Im inside the 1st if of reduction for loop inside shmem_reduce_binomial my rank %d target rank %d I'm target %d source %d\n", global_rank, target_rank, I_m_target, source);
 	                                                                       // prepare pointers for the summation loop
 	   double * restrict my_source = ( I_m_target ? swins[source] : data + dsize2);
	   double * restrict my_target = ( I_m_target ? data : swins[source]+dsize2 );
	   my_source = __builtin_assume_aligned( my_source, 8);
	   my_target = __builtin_assume_aligned( my_target, 8);

        //   printf("Im inside the 1st if of reduction after source and target assignment for loop inside shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);
	  #if defined(DEBUG)
	   int my_start = ( I_m_target ? 0 : dsize2);	   
	  #endif
@@ -398,7 +542,7 @@ int shmem_reduce( int sector, int target_rank, int dsize, map_t *Me, double * re
	                                                                      // are ready to be used (control tag must have                  	                                                                      	                                                                 // the value of the current sector )
	  int ctrl = sector*(max_level+1)+l;
	   ACQUIRE_CTRL( cwins[source], ctrl, timing.tspin_in, < );

          //  printf("Im inside the 1st if of reduction after aquire ctrl for loop inside shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);
	                                                                      // performs the summation loop
	                                                                      // 	                                                                      
	   double * my_end   = my_source+dsize2;
@@ -414,23 +558,31 @@ int shmem_reduce( int sector, int target_rank, int dsize, map_t *Me, double * re
	   summations += (!sector)*dsize2;
	   if( dsize2 < 16 )
	     {
            //   printf("Im inside the  if dsize2<16 of reduction after aquire ctrl for loop inside shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);
	       for( ; my_source < my_end; my_source++, my_target++)
		 *my_target += *my_source;
             //  printf("Im inside the  if dsize2<16 of reduction after  for loop inside shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);
	     }
	   else
	     {
             //  printf("Im inside the  else dsize2<16 of reduction after aquire ctrl for loop inside shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);	       
	       double * my_end_4 = my_source+dsize2_4;
	       
	     //   printf("Im inside the  else dsize2<16 of reduction after my_end_4 for loop inside shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);    
	       for( ; my_source < my_end_4; my_source+=4, my_target+=4 )
		 {
               //     printf("I'm inside the beginning of the for loop for adding source my source %lf rank %d target rank %d\n",*my_source, global_rank, target_rank);
		    *my_target += *my_source;
		   *(my_target+1) += *(my_source+1);
		   *(my_target+2) += *(my_source+2);
	           *(my_target+3) += *(my_source+3);
                //   printf("I'm inside the for loop for adding source my source %lf rank %d target rank %d\n",*my_source, global_rank, target_rank);
		 }
             //  printf("Im inside the  else dsize2<16 of reduction after 1st for loop inside shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);
	       for( ; my_source < my_end; my_source++, my_target++)
		 *my_target += *my_source;	       
              //  printf("Im inside the  else dsize2<16 of reduction after 2st for loop inside shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);
	     }
            //printf("Im inside the 1st if of reduction after summation for loop inside shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);
	   timing.tsum += CPU_TIME_tr - tstart;
	  #if defined(USE_PAPI)
	   if( sector == 0 )
@@ -450,7 +602,7 @@ int shmem_reduce( int sector, int target_rank, int dsize, map_t *Me, double * re

	   dprintf(1,0,0, "- SEC %d l %d t %d <-> %d done : %d\n",
		   sector, l, local_rank, source, *(int*)(Me->win_ctrl.ptr));
	   
	   //printf("Im at the end of reduction for loop inside shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);
         }
       
       else
@@ -462,6 +614,7 @@ int shmem_reduce( int sector, int target_rank, int dsize, map_t *Me, double * re
	   
         atomic_thread_fence(memory_order_release);
      }
     // printf("Im after reduction for loop inside shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);

      if ( target_rank_on_myhost > 0 )
      {
@@ -472,8 +625,10 @@ int shmem_reduce( int sector, int target_rank, int dsize, map_t *Me, double * re
         temp = (void*)cwins[target_rank_on_myhost];
         cwins[target_rank_on_myhost] = cwins[0];
         cwins[0] = (int*)temp;
       //  printf("Im inside targetrankonmyhost %d  inside shmem_reduce_binomial my rank %d target rank %d\n", target_rank_on_myhost ,global_rank, target_rank);
      }

   return 0;
 }