Commit 0560e16e authored by Giovanni Lacopo's avatar Giovanni Lacopo
Browse files

Merging complete

parent bf2914d9
Loading
Loading
Loading
Loading
+71 −1
Original line number Diff line number Diff line
@@ -220,8 +220,31 @@ void write_fftw_data(){
  
 #ifdef WRITE_IMAGE

  double start_image = CPU_TIME_wt;
  
  if(rank == 0)
    {

     #ifdef FITSIO
      printf("REMOVING RESIDUAL FITS FILE\n");
      remove(testfitsreal);
      remove(testfitsimag);


      printf("FITS CREATION\n");
      status = 0;

      fits_create_file(&fptrimg, testfitsimag, &status);
      fits_create_img(fptrimg, DOUBLE_IMG, naxis, naxes, &status);
      fits_close_file(fptrimg, &status);

      status = 0;

      fits_create_file(&fptreal, testfitsreal, &status);
      fits_create_img(fptreal, DOUBLE_IMG, naxis, naxes, &status);
      fits_close_file(fptreal, &status);
     #endif
      
      file.pFilereal = fopen (out.fftfile2,"wb");
      file.pFileimg = fopen (out.fftfile3,"wb");
      fclose(file.pFilereal);
@@ -231,6 +254,31 @@ void write_fftw_data(){
  MPI_Barrier(MPI_COMM_WORLD);

  if(rank == 0)printf("WRITING IMAGE\n");

 #ifdef FITSIO
  uint * fpixel = (uint *) malloc(sizeof(uint)*naxis);
  uint * lpixel = (uint *) malloc(sizeof(uint)*naxis);
 #endif

 #ifdef FITSIO

  fpixel[0] = 1;
  fpixel[1] = rank*yaxis+1;
  lpixel[0] = xaxis;
  lpixel[1] = (rank+1)*yaxis;

  status = 0;
  fits_open_image(&fptreal, testfitsreal, READWRITE, &status);
  fits_write_subset(fptreal, TDOUBLE, fpixel, lpixel, image_real, &status);
  fits_close_file(fptreal, &status);

  status = 0;
  fits_open_image(&fptrimg, testfitsimag, READWRITE, &status);
  fits_write_subset(fptrimg, TDOUBLE, fpixel, lpixel, image_imag, &status);
  fits_close_file(fptrimg, &status);

 #endif //FITSIO

  for (int isector=0; isector<size; isector++)
    {

@@ -238,7 +286,28 @@ void write_fftw_data(){
      
      if(isector == rank)
	{

	  printf("%d writing\n",isector);

	 #ifdef FITSIO

	  fpixel[0] = 1;
	  fpixel[1] = isector*yaxis+1;
	  lpixel[0] = xaxis;
	  lpixel[1] = (isector+1)*yaxis;

	  status = 0;
	  fits_open_image(&fptreal, testfitsreal, READWRITE, &status);
	  fits_write_subset(fptreal, TDOUBLE, fpixel, lpixel, image_real, &status);
	  fits_close_file(fptreal, &status);

	  status = 0;
	  fits_open_image(&fptrimg, testfitsimag, READWRITE, &status);
	  fits_write_subset(fptrimg, TDOUBLE, fpixel, lpixel, image_imag, &status);
	  fits_close_file(fptrimg, &status);

	 #endif //FITSIO

	  file.pFilereal = fopen (out.fftfile2,"ab");
	  file.pFileimg = fopen (out.fftfile3,"ab");

@@ -256,6 +325,7 @@ void write_fftw_data(){
 
  MPI_Barrier(MPI_COMM_WORLD);

  timing_wt.write += CPU_TIME_wt - start_image;

 #endif //WRITE_IMAGE

+12 −0
Original line number Diff line number Diff line
@@ -109,6 +109,18 @@ int main(int argc, char * argv[])
  }  
 #endif

#ifdef FITSIO
  fitsfile *fptreal;
  fitsfile *fptrimg;
  int status;
  char testfitsreal[NAME_LEN] = "parallel_real.fits";
  char testfitsimag[NAME_LEN] = "parallel_img.fits";

  uint naxis = 2;
  uint naxes[2] = { grid_size_x, grid_size_y };
 #endif

  
  /* Reading Parameter file */

  read_parameter_file(in.paramfile);
+14 −3
Original line number Diff line number Diff line
@@ -26,14 +26,25 @@ void write_result( void )
      printf("%40s time : %g sec\n", "Phase", timing_wt.phase);
     #endif //USE_FFTW

     #if defined(WRITE_IMAGE)
      printf("%40s time : %g sec\n", "Image writing", timing_wt.write);
     #endif
      
      printf("%40s time : %g sec\n", "TOT", timing_wt.total);
  
      file.pFile = fopen (out.timingfile, "w");


     #if defined(USE_FFTW)
     #if !defined(CUFFTMP)
      fprintf(file.pFile, "%g %g %g %g %g %g %g\n",
	      timing_wt.setup, timing_wt.kernel, timing_wt.compose,
	      timing_wt.reduce, timing_wt.fftw, timing_wt.phase, timing_wt.total);
      
	      timing_wt.reduce, timing_wt.fftw, timing_wt.phase, timing_wt.write, timing_wt.total);
     #else
      fprintf(file.pFile, "%g %g %g %g %g %g %g\n",
	      timing_wt.setup, timing_wt.kernel, timing_wt.compose,
	      timing_wt.reduce, timing_wt.cufftmp, timing_wt.phase, timing_wt.write, timing_wt.total);
     #endif
      fclose(file.pFile);         
    }

w-stacking_omp.v2.c

deleted 100644 → 0
+0 −267
Original line number Diff line number Diff line
#include <omp.h>
#include "w-stacking_omp.h"
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#ifdef NVIDIA
#include <cuda_runtime.h>
#endif

#ifdef ACCOMP
#pragma omp  declare target
#endif
double gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
{
     double conv_weight;
     conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
     return conv_weight;
}
#ifdef ACCOMP
#pragma omp end declare target
#endif

//The function has been slightly modified in order to include the ranks of the tasks calling the GPUs [GL]
//The function has been slightly modified in order to include the ranks of the tasks calling the GPUs [GL]
/*
 * Grid ("w-stack") num_points visibilities onto a stack of num_w_planes
 * complex 2D grids of size grid_size_x x grid_size_y, convolving each
 * point with a Gaussian kernel of support w_support.
 *
 * grid holds interleaved (real, imag) doubles:
 *   index 2*(j + k*grid_size_x + grid_w*grid_size_x*grid_size_y).
 *
 * Fixes vs. previous revision:
 *  - removed the per-point grid_buffer: it was accumulated with += before
 *    ever being initialized, indexed out of bounds (buffer_base used
 *    k*buffer_size, with the buffer only buffer_size elements long), and
 *    its flush loop was nested inside the outer k loop, clobbering k;
 *  - partial sums are now added directly into grid under
 *    #pragma omp atomic, which is race-free across host threads;
 *  - removed the unused local `sum`.
 *
 * NOTE(review): on the ACCOMP offload path, correctness across GPU teams
 * relies on the same atomics being honored by the target compiler —
 * verify on the target toolchain.
 */
void wstack(
     int num_w_planes,
     unsigned int num_points,
     unsigned int freq_per_chan,
     unsigned int polarizations,
     double* uu,
     double* vv,
     double* ww,
     float* vis_real,
     float* vis_img,
     float* weight,
     double dx,
     double dw,
     int w_support,
     int grid_size_x,
     int grid_size_y,
     double* grid,
     int num_threads,
     int rank) 
{
    unsigned int visindex;

    // initialize the convolution kernel
    // gaussian:
    int KernelLen = (w_support-1)/2;
    double std = 1.0;
    double std22 = 1.0/(2.0*std*std);
    double norm = std22/PI;

    // Loop over visibilities.
#ifdef _OPENMP
    omp_set_num_threads(num_threads);
#endif

#ifdef ACCOMP
    unsigned int Nvis = num_points*freq_per_chan*polarizations;
    unsigned int gpu_weight_dim = Nvis/freq_per_chan;
    unsigned int gpu_grid_dim = 2*num_w_planes*grid_size_x*grid_size_y;
#pragma omp target teams distribute parallel for private(visindex)	\
  map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points],vis_real[0:Nvis], vis_img[0:Nvis], weight[0:gpu_weight_dim]) \
  map(tofrom: grid[0:gpu_grid_dim]) device(rank % omp_get_num_devices()) //Works also if Ntasks > NGPUs [GL]
#endif
    for (unsigned int i = 0; i < num_points; i++)
    {
        // first fine channel/polarization of this visibility
        visindex = i*freq_per_chan*polarizations;

        /* Convert UV coordinates to grid coordinates. */
        double pos_u = uu[i] / dx;
        double pos_v = vv[i] / dx;
        double ww_i  = ww[i] / dw;
	//Renormalization of ww_i
	ww_i = ww_i/(1 + num_w_planes);

	int grid_w = (int)ww_i;
        int grid_u = (int)pos_u;
        int grid_v = (int)pos_v;

	// clamp the kernel support to the grid boundaries
	unsigned int jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
	unsigned int jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
	unsigned int kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
	unsigned int kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;

        // Convolve this point onto the grid.
        for (unsigned int k = kmin; k <= kmax; k++)
        {
            double v_dist = (double)k+0.5 - pos_v;

            for (unsigned int j = jmin; j <= jmax; j++)
            {
                double u_dist = (double)j+0.5 - pos_u;
		double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);

		// Loops over frequencies and polarizations.
		// DAV: these two loops are performed by each thread
		// separately: no race conditions here.
		double add_term_real = 0.0;
		double add_term_img = 0.0;
		unsigned int ifine = visindex;
		for (unsigned int ifreq=0; ifreq<freq_per_chan; ifreq++)
		{
		   unsigned int iweight = visindex/freq_per_chan;
	           for (unsigned int ipol=0; ipol<polarizations; ipol++)
	           {
                      // skip flagged (NaN) samples
                      if (!isnan(vis_real[ifine]))
                      {
                         add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
		         add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
		      }
		      ifine++;
		      iweight++;
		   }
	        }

		// DAV: this is the critical update in terms of correctness
		// and performance: different visibilities may hit the same
		// cell, so the accumulation must be atomic.
		unsigned int iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);

	       #pragma omp atomic
		grid[iKer] += add_term_real;
	       #pragma omp atomic
		grid[iKer+1] += add_term_img;
            }
	}
    }

   #ifdef ACCOMP
   #pragma omp target exit data map(delete:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:gpu_weight_dim],grid[0:gpu_grid_dim]) device(rank % omp_get_num_devices())
   #endif
}


#ifdef NVIDIA
#define CUDAErrorCheck(funcall)                                         \
do {                                                                    \
  cudaError_t ierr = funcall;                                           \
  if (cudaSuccess != ierr) {                                            \
    fprintf(stderr, "%s(line %d) : CUDA RT API error : %s(%d) -> %s\n", \
    __FILE__, __LINE__, #funcall, ierr, cudaGetErrorString(ierr));      \
    exit(ierr);                                                         \
  }                                                                     \
} while (0)

/**
 * @brief Give the number of CUDA cores per streaming multiprocessor (SM).
 *
 * The number of CUDA cores per SM is determined by the compute capability.
 * Table follows _ConvertSMVer2Cores in the CUDA samples (helper_cuda.h),
 * extended beyond 7.x with Ampere (8.0/8.6/8.7), Ada (8.9) and Hopper (9.0).
 *
 * @param major Major revision number of the compute capability.
 * @param minor Minor revision number of the compute capability.
 *
 * @return The number of CUDA cores per SM, or -1 for an unknown
 *         compute capability.
 */
static inline int _corePerSM(int major, int minor)
{
  if (1 == major) { /* Tesla */
    if (0 == minor || 1 == minor || 2 == minor || 3 == minor) return 8;
  }
  if (2 == major) { /* Fermi */
    if (0 == minor) return 32;
    if (1 == minor) return 48;
  }
  if (3 == major) { /* Kepler */
    if (0 == minor || 5 == minor || 7 == minor) return 192;
  }
  if (5 == major) { /* Maxwell */
    if (0 == minor || 2 == minor) return 128;
  }
  if (6 == major) { /* Pascal */
    if (0 == minor) return 64;
    if (1 == minor || 2 == minor) return 128;
  }
  if (7 == major) { /* Volta (7.0/7.2) and Turing (7.5) */
    if (0 == minor || 2 == minor || 5 == minor) return 64;
  }
  if (8 == major) { /* Ampere (8.0/8.6/8.7) and Ada Lovelace (8.9) */
    if (0 == minor) return 64;
    if (6 == minor || 7 == minor || 9 == minor) return 128;
  }
  if (9 == major) { /* Hopper */
    if (0 == minor) return 128;
  }
  return -1;
}

/* Select CUDA device `iaccel`, query its properties and print a summary
 * (compute capability, SM/core counts, memory sizes, launch limits) to
 * stdout. Aborts via CUDAErrorCheck on any CUDA runtime error. */
void getGPUInfo(int iaccel)
{
  int corePerSM;  /* CUDA cores per SM, -1 if the capability is unknown */

 struct cudaDeviceProp dev;

  CUDAErrorCheck(cudaSetDevice(iaccel));
  CUDAErrorCheck(cudaGetDeviceProperties(&dev, iaccel));
  corePerSM = _corePerSM(dev.major, dev.minor);

  printf("\n");
  printf("============================================================\n");
  printf("CUDA Device name : \"%s\"\n", dev.name);
  printf("------------------------------------------------------------\n");
  printf("Comp. Capability : %d.%d\n", dev.major, dev.minor);
  printf("max clock rate   : %.0f MHz\n", dev.clockRate * 1.e-3f);
  printf("number of SMs    : %d\n", dev.multiProcessorCount);
  printf("cores  /  SM     : %d\n", corePerSM);
  printf("# of CUDA cores  : %d\n", corePerSM * dev.multiProcessorCount);
  printf("------------------------------------------------------------\n");
  printf("global memory    : %5.0f MBytes\n", dev.totalGlobalMem / 1048576.0f);
  printf("shared mem. / SM : %5.1f KBytes\n", dev.sharedMemPerMultiprocessor / 1024.0f);
  printf("32-bit reg. / SM : %d\n", dev.regsPerMultiprocessor);
  printf("------------------------------------------------------------\n");
  printf("max # of threads / SM    : %d\n", dev.maxThreadsPerMultiProcessor);
  printf("max # of threads / block : %d\n", dev.maxThreadsPerBlock);
  printf("max dim. of block        : (%d, %d, %d)\n",
      dev.maxThreadsDim[0], dev.maxThreadsDim[1], dev.maxThreadsDim[2]);
  printf("max dim. of grid         : (%d, %d, %d)\n",
      dev.maxGridSize[0],   dev.maxGridSize[1],   dev.maxGridSize[2]);
  printf("warp size                : %d\n", dev.warpSize);
  printf("============================================================\n");

  /* NOTE(review): the result z is never used — this looks like a
   * minimal OpenMP-offload smoke test / device warm-up; confirm it is
   * intentional before removing. */
  int z = 0, x = 2;
  #pragma omp target map(to:x) map(tofrom:z)
  {
    z=x+100;
  }
}

#endif






/* Trivial sanity-check helper: return the argument incremented by one. */
int test(int nnn)
{
	return nnn + 1;
}

w-stacking_omp.v3.c

deleted 100644 → 0
+0 −281
Original line number Diff line number Diff line
#include <omp.h>
#include "w-stacking_omp.h"
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#ifdef NVIDIA
#include <cuda_runtime.h>
#endif

#ifdef ACCOMP
#pragma omp  declare target
#endif
double gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
{
     double conv_weight;
     conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
     return conv_weight;
}
#ifdef ACCOMP
#pragma omp end declare target
#endif

//The function has been slightly modified in order to include the ranks of the tasks calling the GPUs [GL]
//The function has been slightly modified in order to include the ranks of the tasks calling the GPUs [GL]
/*
 * Grid ("w-stack") num_points visibilities onto a stack of num_w_planes
 * complex 2D grids of size grid_size_x x grid_size_y, convolving each
 * point with a Gaussian kernel of support w_support.
 *
 * grid holds interleaved (real, imag) doubles:
 *   index 2*(j + k*grid_size_x + grid_w*grid_size_x*grid_size_y).
 *
 * Fixes vs. previous revision:
 *  - grid_buffer was declared only under #ifdef ACCOMP but used
 *    unconditionally (compile error in non-ACCOMP builds), malloc'd and
 *    never freed (leak), accumulated with += without ever being zeroed,
 *    shared by all threads without atomics (data race), and indexed with
 *    buffer_base = k*buffer_size, which collides across rows;
 *  - partial sums are now added directly into grid under
 *    #pragma omp atomic, and the buffer (and its malloc) is gone.
 *
 * NOTE(review): on the ACCOMP offload path, correctness across GPU teams
 * relies on the same atomics being honored by the target compiler —
 * verify on the target toolchain.
 */
void wstack(
     int num_w_planes,
     unsigned int num_points,
     unsigned int freq_per_chan,
     unsigned int polarizations,
     double* uu,
     double* vv,
     double* ww,
     float* vis_real,
     float* vis_img,
     float* weight,
     double dx,
     double dw,
     int w_support,
     int grid_size_x,
     int grid_size_y,
     double* grid,
     int num_threads,
     int rank) 
{
    unsigned int visindex;

    // initialize the convolution kernel
    // gaussian:
    int KernelLen = (w_support-1)/2;
    double std = 1.0;
    double std22 = 1.0/(2.0*std*std);
    double norm = std22/PI;

    // Loop over visibilities.
#ifdef _OPENMP
    omp_set_num_threads(num_threads);
#endif

#ifdef ACCOMP
    unsigned int Nvis = num_points*freq_per_chan*polarizations;
    unsigned int gpu_weight_dim = Nvis/freq_per_chan;
    unsigned int gpu_grid_dim = 2*num_w_planes*grid_size_x*grid_size_y;
#pragma omp target teams distribute parallel for private(visindex)	\
  map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:gpu_weight_dim]) \
  map(tofrom: grid[0:gpu_grid_dim]) device(rank % omp_get_num_devices()) //Works also if Ntasks > NGPUs [GL]
#endif
    for (unsigned int i = 0; i < num_points; i++)
    {
        // first fine channel/polarization of this visibility
        visindex = i*freq_per_chan*polarizations;

        /* Convert UV coordinates to grid coordinates. */
        double pos_u = uu[i] / dx;
        double pos_v = vv[i] / dx;
        double ww_i  = ww[i] / dw;
	//Renormalization of ww_i
	ww_i = ww_i/(1+num_w_planes);
        int grid_w = (int)ww_i;
        int grid_u = (int)pos_u;
        int grid_v = (int)pos_v;

	// clamp the kernel support to the grid boundaries
	unsigned int jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
	unsigned int jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
	unsigned int kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
	unsigned int kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;

        // Convolve this point onto the grid.
        for (unsigned int k = kmin; k <= kmax; k++)
        {
            double v_dist = (double)k+0.5 - pos_v;

            for (unsigned int j = jmin; j <= jmax; j++)
            {
                double u_dist = (double)j+0.5 - pos_u;
		double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);

		// Loops over frequencies and polarizations.
		// DAV: these two loops are performed by each thread
		// separately: no race conditions here.
		double add_term_real = 0.0;
		double add_term_img = 0.0;
		unsigned int ifine = visindex;
		for (unsigned int ifreq=0; ifreq<freq_per_chan; ifreq++)
		{
		   unsigned int iweight = visindex/freq_per_chan;
	           for (unsigned int ipol=0; ipol<polarizations; ipol++)
	           {
                      // skip flagged (NaN) samples
                      if (!isnan(vis_real[ifine]))
                      {
                         add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
		         add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
		      }
		      ifine++;
		      iweight++;
		   }
	        }

		// DAV: this is the critical update in terms of correctness
		// and performance: different visibilities may hit the same
		// cell, so the accumulation must be atomic.
		unsigned int iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);

	       #pragma omp atomic
		grid[iKer] += add_term_real;
	       #pragma omp atomic
		grid[iKer+1] += add_term_img;
            }
	}
    }

   #ifdef ACCOMP
   #pragma omp target exit data map(delete:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:gpu_weight_dim],grid[0:gpu_grid_dim]) device(rank % omp_get_num_devices())
   #endif
}


#ifdef NVIDIA
#define CUDAErrorCheck(funcall)                                         \
do {                                                                    \
  cudaError_t ierr = funcall;                                           \
  if (cudaSuccess != ierr) {                                            \
    fprintf(stderr, "%s(line %d) : CUDA RT API error : %s(%d) -> %s\n", \
    __FILE__, __LINE__, #funcall, ierr, cudaGetErrorString(ierr));      \
    exit(ierr);                                                         \
  }                                                                     \
} while (0)

/**
 * @brief Give the number of CUDA cores per streaming multiprocessor (SM).
 *
 * The number of CUDA cores per SM is determined by the compute capability.
 * Table follows _ConvertSMVer2Cores in the CUDA samples (helper_cuda.h),
 * extended beyond 7.x with Ampere (8.0/8.6/8.7), Ada (8.9) and Hopper (9.0).
 *
 * @param major Major revision number of the compute capability.
 * @param minor Minor revision number of the compute capability.
 *
 * @return The number of CUDA cores per SM, or -1 for an unknown
 *         compute capability.
 */
static inline int _corePerSM(int major, int minor)
{
  if (1 == major) { /* Tesla */
    if (0 == minor || 1 == minor || 2 == minor || 3 == minor) return 8;
  }
  if (2 == major) { /* Fermi */
    if (0 == minor) return 32;
    if (1 == minor) return 48;
  }
  if (3 == major) { /* Kepler */
    if (0 == minor || 5 == minor || 7 == minor) return 192;
  }
  if (5 == major) { /* Maxwell */
    if (0 == minor || 2 == minor) return 128;
  }
  if (6 == major) { /* Pascal */
    if (0 == minor) return 64;
    if (1 == minor || 2 == minor) return 128;
  }
  if (7 == major) { /* Volta (7.0/7.2) and Turing (7.5) */
    if (0 == minor || 2 == minor || 5 == minor) return 64;
  }
  if (8 == major) { /* Ampere (8.0/8.6/8.7) and Ada Lovelace (8.9) */
    if (0 == minor) return 64;
    if (6 == minor || 7 == minor || 9 == minor) return 128;
  }
  if (9 == major) { /* Hopper */
    if (0 == minor) return 128;
  }
  return -1;
}

/* Select CUDA device `iaccel`, query its properties and print a summary
 * (compute capability, SM/core counts, memory sizes, launch limits) to
 * stdout. Aborts via CUDAErrorCheck on any CUDA runtime error. */
void getGPUInfo(int iaccel)
{
  int corePerSM;  /* CUDA cores per SM, -1 if the capability is unknown */

 struct cudaDeviceProp dev;

  CUDAErrorCheck(cudaSetDevice(iaccel));
  CUDAErrorCheck(cudaGetDeviceProperties(&dev, iaccel));
  corePerSM = _corePerSM(dev.major, dev.minor);

  printf("\n");
  printf("============================================================\n");
  printf("CUDA Device name : \"%s\"\n", dev.name);
  printf("------------------------------------------------------------\n");
  printf("Comp. Capability : %d.%d\n", dev.major, dev.minor);
  printf("max clock rate   : %.0f MHz\n", dev.clockRate * 1.e-3f);
  printf("number of SMs    : %d\n", dev.multiProcessorCount);
  printf("cores  /  SM     : %d\n", corePerSM);
  printf("# of CUDA cores  : %d\n", corePerSM * dev.multiProcessorCount);
  printf("------------------------------------------------------------\n");
  printf("global memory    : %5.0f MBytes\n", dev.totalGlobalMem / 1048576.0f);
  printf("shared mem. / SM : %5.1f KBytes\n", dev.sharedMemPerMultiprocessor / 1024.0f);
  printf("32-bit reg. / SM : %d\n", dev.regsPerMultiprocessor);
  printf("------------------------------------------------------------\n");
  printf("max # of threads / SM    : %d\n", dev.maxThreadsPerMultiProcessor);
  printf("max # of threads / block : %d\n", dev.maxThreadsPerBlock);
  printf("max dim. of block        : (%d, %d, %d)\n",
      dev.maxThreadsDim[0], dev.maxThreadsDim[1], dev.maxThreadsDim[2]);
  printf("max dim. of grid         : (%d, %d, %d)\n",
      dev.maxGridSize[0],   dev.maxGridSize[1],   dev.maxGridSize[2]);
  printf("warp size                : %d\n", dev.warpSize);
  printf("============================================================\n");

  /* NOTE(review): the result z is never used — this looks like a
   * minimal OpenMP-offload smoke test / device warm-up; confirm it is
   * intentional before removing. */
  int z = 0, x = 2;
  #pragma omp target map(to:x) map(tofrom:z)
  {
    z=x+100;
  }
}

#endif






/* Trivial sanity-check helper: return nnn + 1. */
int test(int nnn)
{
	int incremented = nnn + 1;
	return incremented;
}