Build/Makefile.leo (+9 −9)

@@ -9,8 +9,8 @@
 OPT_PURE_MPI = -O4 -march=native -mavx -mavx2
 OMP_GPU = -mp=multicore,gpu -gpu=cuda11.8 -gpu=cc80
-CUDA_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/cuda/11.8/include
-CUDA_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/cuda/11.8/lib64 -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/cuda/11.8/targets/x86_64-linux/lib/stubs
+CUDA_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/include
+CUDA_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/lib64 -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/targets/x86_64-linux/lib/stubs
 FFTW_INCL=
 FFTW_LIB=

@@ -19,18 +19,18 @@
 ##########################################################
 #NVIDIA CUFFTMP
-CUFFTMP_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/math_libs/11.8/lib64
-CUFFTMP_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/math_libs/11.8/include/cufftmp
+CUFFTMP_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/math_libs/11.8/lib64
+CUFFTMP_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/math_libs/11.8/include/cufftmp
 ##########################################################
-NVSHMEM_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/comm_libs/11.8/nvshmem/include
-NVSHMEM_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/comm_libs/11.8/nvshmem/lib
+NVSHMEM_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nvshmem_cufftmp_compat/include/
+NVSHMEM_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nvshmem_cufftmp_compat/lib/
 ##########################################################
 #NVIDIA NCCL REDUCE
-NCCL_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/comm_libs/11.8/nccl/include
-NCCL_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-8.5.0/nvhpc-23.1-x5lw6edfmfuot2ipna3wseallzl4oolm/Linux_x86_64/23.1/comm_libs/11.8/nccl/lib
+NCCL_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nccl/include
+NCCL_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nccl/lib
 ##########################################################
 NVC = nvc

@@ -44,7 +44,7 @@
 NVLIB_3 = $(CUDA_INC) $(CUDA_LIB) $(MPI_INC) $(MPI_LIB) $(NCCL_INC) $(NCCL_LIB) -lcudart -lnccl
-NVCC = /opt/nvidia/hpc_sdk/Linux_x86_64/23.1/cuda/11.8/bin/nvcc
+NVCC = nvcc
 OPT_NVCC = -std=c++17 --generate-code arch=compute_80,code=sm_80
 CFLAGS +=

Makefile (+7 −7)

@@ -35,10 +35,10 @@
 FFTWLIBS =
 OPT += -DUSE_FFTW
 # use omp-ized version of fftw routines
-OPT += -DHYBRID_FFTW
+#OPT += -DHYBRID_FFTW
 # switch on the OpenMP parallelization
-OPT += -DUSE_OMP
+#OPT += -DUSE_OMP
 # write the full 3D cube of gridded visibilities and its FFT transform
 #OPT += -DWRITE_DATA

@@ -53,7 +53,7 @@
 #OPT += -DFITSIO
 # Perform true parallel images writing
-#OPT += -DPARALLELIO
+OPT += -DPARALLELIO
 # Normalize uvw in case it is not done in the binMS
 #OPT += -DNORMALIZE_UVW

@@ -74,13 +74,13 @@
 #OPT += -DNVIDIA
 #use cuda for GPUs
-#OPT += -DCUDACC
+OPT += -DCUDACC
 # use GPU acceleration via OMP
-OPT += -DACCOMP
+#OPT += -DACCOMP
 # perform stacking on GPUs
-#OPT += -DGPU_STACKING
+OPT += -DGPU_STACKING
 # use NVIDIA GPU to perform the reduce
 #OPT += -DNCCL_REDUCE

@@ -89,7 +89,7 @@
 #OPT += -DRCCL_REDUCE
 # use GPU to perform FFT
-#OPT += -DCUFFTMP
+OPT += -DCUFFTMP
 #support for AMD GPUs
 #OPT += __HIP_PLATFORM_AMD__
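With this Makefile configuration the GPU path switches from OpenMP offload (-DACCOMP) to native CUDA (-DCUDACC, -DGPU_STACKING), with the distributed FFT handled by cuFFTMp (-DCUFFTMP), so the executable has to be rebuilt from scratch against the NVHPC 23.5 toolchain. A minimal rebuild sketch, assuming the module names and the w-stacking target used in scripts/bench.go_cufftmp below:

#!/bin/bash
# Clean rebuild on Leonardo against NVHPC 23.5
# (module names as loaded in scripts/bench.go_cufftmp).
module load openmpi/ fftw/ nvhpc/23.5 cuda/
make -j1 clean          # drop objects built against the old nvhpc 23.1 paths
make -j1 w-stacking     # produces w-stacking_fftw_acc-omp_acc-fft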
README.md (+7 −0)

@@ -50,5 +50,12 @@
 In the case in which the code has been compiled without either -fopenmp or -D_OPENMP,
 the code is forced to use the standard MPI_Reduce implementation,
 since our reduce works only with OpenMP.
+
+To use cufftMp with nvhpc 23.5, add the following paths to the environment variable `LD_LIBRARY_PATH`:
+###########################################
+export LD_LIBRARY_PATH="/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nvshmem_cufftmp_compat/lib/:$LD_LIBRARY_PATH"
+export LD_LIBRARY_PATH="/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/math_libs/11.8/lib64/:$LD_LIBRARY_PATH"
+###########################################
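Since both libraries are resolved only at run time, a quick sanity check after exporting the paths is to point ldd at the binary (a sketch; the executable name is the one built by scripts/bench.go_cufftmp):

# Every cufft/nvshmem dependency should resolve into the 23.5 tree,
# with no entry reported as "not found".
ldd w-stacking_fftw_acc-omp_acc-fft | grep -E 'cufft|nvshmem'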
scripts/bench.go_cufftmp (new file, +111 −0)

#!/bin/bash
#SBATCH -A IscrC_RICK
#SBATCH -p boost_usr_prod
##SBATCH --qos boost_qos_bprod
#SBATCH -J RICK
### number of nodes
#SBATCH -N 1
### number of MPI tasks per node
#SBATCH --ntasks-per-node=4
#SBATCH -n 4
### number of openmp threads
#SBATCH --cpus-per-task=8
### number of allocated GPUs per node
#SBATCH --gpus-per-node=4
#SBATCH --mem=450G
#SBATCH -o test.out
#SBATCH -e test.err
#SBATCH -t 03:00:00

module load openmpi/
module load fftw/
module load nvhpc/23.5
module load cuda/

export LD_LIBRARY_PATH="/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nvshmem_cufftmp_compat/lib/:$LD_LIBRARY_PATH"
export LD_LIBRARY_PATH="/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/math_libs/11.8/lib64/:$LD_LIBRARY_PATH"

export OMPI_CC=gcc
export OMPI_CXX=g++
export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}
export OMP_PLACES=cores

# rebuild the executable from scratch
cd ../
make -j1 clean
rm -f w-stacking_fftw_acc-omp_acc-fft
make -j1 w-stacking

export typestring=omp_gpu_cufftmp
export exe=w-stacking_fftw_acc-omp_acc-fft

OUT_SHM=result_${SLURM_NTASKS}_${typestring}_${SLURM_CPUS_PER_TASK}
OUT_SHM_RES=/leonardo_scratch/large/userexternal/glacopo0/hpc_imaging/scripts/Tests/times_${SLURM_NTASKS}_${typestring}_${SLURM_CPUS_PER_TASK}_large
rm -f ${OUT_SHM} ${OUT_SHM_RES}

export logdir=mpi_${SLURM_NTASKS}_${typestring}_${SLURM_CPUS_PER_TASK}
echo "Creating $logdir"
rm -fr $logdir
mkdir $logdir

for itest in {1..3}
do
    export logfile=test_${itest}_${logdir}.log
    # record the exact command at the top of the log, then run it
    echo "time mpirun -np ${SLURM_NTASKS} --bind-to core --map-by ppr:${SLURM_NTASKS_PER_NODE}:node:pe=${SLURM_CPUS_PER_TASK} -x OMP_NUM_THREADS --mca btl self,vader /leonardo_scratch/large/userexternal/glacopo0/hpc_imaging/${exe} data/paramfile.txt" > $logfile
    time mpirun -np ${SLURM_NTASKS} --bind-to core --map-by ppr:${SLURM_NTASKS_PER_NODE}:node:pe=${SLURM_CPUS_PER_TASK} -x OMP_NUM_THREADS --mca btl self,vader /leonardo_scratch/large/userexternal/glacopo0/hpc_imaging/${exe} data/paramfile.txt >> $logfile
    mv $logfile $logdir
    mv timings.dat ${logdir}/timings_${itest}.dat
    # append this run's timings to the cumulative file
    cat ${logdir}/timings_${itest}.dat >> ${logdir}/timings_all.dat

    # extract the individual timers from the run log
    Reduce_time=$( grep -w 'Reduce time :' $logdir/$logfile | gawk '{print $4}' )
    FFTW_time=$( grep -w 'cufftMP time :' $logdir/$logfile | gawk '{print $4}' )
    Composition_time=$( grep -w 'Array Composition time :' $logdir/$logfile | gawk '{print $5}' )
    Writing_time=$( grep -w ' Image writing time :' $logdir/$logfile | gawk '{print $5}' )
    Total_time=$( grep -w 'TOT time :' $logdir/$logfile | gawk '{print $4}' )
    # Not relevant for the paper
    Setup_time=$( grep -w 'Setup time:' $logdir/$logfile | gawk '{print $3}' )
    Kernel_time=$( grep -w 'Kernel time :' $logdir/$logfile | gawk '{print $4}' )
    Phase_time=$( grep -w 'Phase time :' $logdir/$logfile | gawk '{print $4}' )
    ##########################
    echo $itest $Reduce_time $FFTW_time $Composition_time $Writing_time $Total_time $Setup_time $Kernel_time $Phase_time >> ${OUT_SHM}
done

echo -e "\n\n" >> ${OUT_SHM}

# averages and standard deviations over the 3 runs
avg_red=$( awk '{sum+=$2} END { print sum/3 }' ${OUT_SHM} )
avg_fftw=$( awk '{sum+=$3} END { print sum/3 }' ${OUT_SHM} )
avg_comp=$( awk '{sum+=$4} END { print sum/3 }' ${OUT_SHM} )
avg_write=$( awk '{sum+=$5} END { print sum/3 }' ${OUT_SHM} )
avg_tot=$( awk '{sum+=$6} END { print sum/3 }' ${OUT_SHM} )

std_red=$( awk '{if($2!=""){count++;sum+=$2};y+=$2^2} END{sq=sqrt(y/3-(sum/3)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} )
std_fftw=$( awk '{if($3!=""){count++;sum+=$3};y+=$3^2} END{sq=sqrt(y/3-(sum/3)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} )
std_comp=$( awk '{if($4!=""){count++;sum+=$4};y+=$4^2} END{sq=sqrt(y/3-(sum/3)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} )
std_write=$( awk '{if($5!=""){count++;sum+=$5};y+=$5^2} END{sq=sqrt(y/3-(sum/3)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} )
std_tot=$( awk '{if($6!=""){count++;sum+=$6};y+=$6^2} END{sq=sqrt(y/3-(sum/3)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} )

# Not relevant for the paper
avg_setup=$( awk '{sum+=$7} END { print sum/3 }' ${OUT_SHM} )
avg_ker=$( awk '{sum+=$8} END { print sum/3 }' ${OUT_SHM} )
avg_phase=$( awk '{sum+=$9} END { print sum/3 }' ${OUT_SHM} )
std_setup=$( awk '{if($7!=""){count++;sum+=$7};y+=$7^2} END{sq=sqrt(y/3-(sum/3)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} )
std_ker=$( awk '{if($8!=""){count++;sum+=$8};y+=$8^2} END{sq=sqrt(y/3-(sum/3)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} )
std_phase=$( awk '{if($9!=""){count++;sum+=$9};y+=$9^2} END{sq=sqrt(y/3-(sum/3)^2);sq=sq?sq:0;print sq}' ${OUT_SHM} )

##########################
echo "Averages and standard deviations over 3 shots" >> ${OUT_SHM_RES}
echo -e "\n" >> ${OUT_SHM_RES}
echo "${SLURM_NTASKS} MPI tasks; ${SLURM_CPUS_PER_TASK} OpenMP threads per task; ${SLURM_GPUS_PER_NODE} GPUs per node;" >> ${OUT_SHM_RES}
echo -e "\n\n" >> ${OUT_SHM_RES}
echo $avg_red $std_red $avg_fftw $std_fftw $avg_comp $std_comp $avg_write $std_write $avg_tot $std_tot >> ${OUT_SHM_RES}
echo -e "\n\n" >> ${OUT_SHM_RES}
echo $avg_setup $std_setup $avg_ker $std_ker $avg_phase $std_phase >> ${OUT_SHM_RES}

rm -f ${OUT_SHM}
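Each std_* line above computes the population standard deviation over the three runs, sqrt(E[x^2] - E[x]^2), with a fallback to 0 when the column is empty. A quick check of the formula with three made-up timings (hypothetical values, not real measurements):

# mean = 11, deviations -1/+1/0, so std = sqrt(2/3) ~ 0.816497
printf '1 10.0\n2 12.0\n3 11.0\n' | awk '{sum+=$2; y+=$2^2} END{print sqrt(y/3-(sum/3)^2)}'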
scripts/create_binMS.py (new file, +218 −0)

USE_MPI = 0

import numpy as np
import casacore.tables as pt
import time
import sys
import os

# output binMS directory, named after the input MS
#outpath = '/data/gridding/data/shortgauss_t201806301100_SBH255.binMS/'
print(sys.argv[1])
outpath = "/data/gridding/data/Lofarbig/"+sys.argv[1]+".binMS/"
os.mkdir(outpath)

ufile = 'ucoord.bin'
vfile = 'vcoord.bin'
wfile = 'wcoord.bin'
weights = 'weights.bin'
visrealfile = 'visibilities_real.bin'
visimgfile = 'visibilities_img.bin'
metafile = 'meta.txt'
offset = 0.0

if USE_MPI == 1:
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    print(rank,size)
else:
    comm = 0
    rank = 0
    size = 1
    num_threads = 1

# input MS
readtime0 = time.time()
#msfile = "/data/Lofar-data/results/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.ms"
msfile = "/data/Lofar-Luca/results/"+sys.argv[1]+".ms/"
ms = pt.table(msfile, readonly=True, ack=False)

if rank == 0:
    print("Reading ", msfile)
    print("Writing ", outpath)
    # load data and metadata
    with pt.table(msfile + '::SPECTRAL_WINDOW', ack=False) as freqtab:
        freq = freqtab.getcol('REF_FREQUENCY')[0] / 1000000.0
        freqpersample = np.mean(freqtab.getcol('RESOLUTION'))
    timepersample = ms.getcell('INTERVAL',0)
    print("Frequencies (MHz)   : ",freq)
    print("Time interval (sec) : ",timepersample)
    with pt.taql("SELECT ANTENNA1,ANTENNA2,sqrt(sumsqr(UVW)),GCOUNT() FROM $ms GROUPBY ANTENNA1,ANTENNA2") as BL:
        ants1, ants2 = BL.getcol('ANTENNA1'), BL.getcol('ANTENNA2')
        Ntime = BL.getcol('Col_4')[0]  # number of timesteps
        Nbaselines = len(ants1)
    print("Number of timesteps : ",Ntime)
    print("Total obs time (hrs): ",timepersample*Ntime/3600)
    print("Number of baselines : ",Nbaselines)
    #sp = pt.table(msfile+'::LOFAR_ANTENNA_FIELD', readonly=True, ack=False, memorytable=True).getcol('POSITION')
    ant1, ant2 = ms.getcol('ANTENNA1'), ms.getcol('ANTENNA2')
    number_of_measures = Ntime * Nbaselines
    #nm_pe_aux = int(number_of_measures / size)
    #remaining_aux = number_of_measures % size
    nm_pe = int(number_of_measures / size)
    remaining = number_of_measures % size
    print(nm_pe,remaining)
else:
    nm_pe = None
    remaining = None

if USE_MPI == 1:
    nm_pe = comm.bcast(nm_pe, root=0)
    remaining = comm.bcast(remaining, root=0)

# set the data domain for each MPI rank; the last rank absorbs the remainder
startrow = rank*nm_pe
if rank == size-1:
    nm_pe = nm_pe+remaining
print(rank,nm_pe,remaining)
nrow = nm_pe

# read data
uvw = ms.getcol('UVW',startrow,nrow)
vis = ms.getcol('DATA',startrow,nrow)
weight = ms.getcol('WEIGHT_SPECTRUM',startrow,nrow)
print("Freqs per channel       : ",vis.shape[1])
print("Polarizations           : ",vis.shape[2])
print("Number of observations  : ",uvw.shape[0])
print("Data size (MB)          : ",uvw.shape[0]*vis.shape[1]*vis.shape[2]*2*4/1024.0/1024.0)

# set parameters
num_points = uvw.shape[0]
num_w_planes = 1
grid_size = 100   # number of cells of the grid

# serialize arrays
vis_ser_real = vis.real.flatten()
vis_ser_img = vis.imag.flatten()
print("data types: uvw = ",uvw.dtype," vis = ",vis_ser_real.dtype)
#vis_ser = np.zeros(2*vis_ser_real.size)
#for i in range(vis_ser_real.size):
#    vis_ser[2*i]=vis_ser_real[i]
#    vis_ser[2*i+1]=vis_ser_img[i]
uu_ser = uvw[:,0].flatten()
vv_ser = uvw[:,1].flatten()
ww_ser = uvw[:,2].flatten()
weight_ser = weight.flatten()

grid = np.zeros(2*num_w_planes*grid_size*grid_size)      # complex!
gridtot = np.zeros(2*num_w_planes*grid_size*grid_size)   # complex!
peanokeys = np.empty(vis_ser_real.size,dtype=np.uint64)
gsize = grid.size

hist, bin_edges = np.histogram(ww_ser,num_w_planes)
print(hist)
print(vis_ser_real.dtype)

# normalize uv
minu = np.amin(uu_ser)
maxu = np.amax(abs(uu_ser))
minv = np.amin(vv_ser)
maxv = np.amax(abs(vv_ser))
minw = np.amin(ww_ser)
maxw = np.amax(ww_ser)

if USE_MPI == 1:
    maxu_all = np.array(0,dtype=np.float64)
    maxv_all = np.array(0,dtype=np.float64)
    maxw_all = np.array(0,dtype=np.float64)
    minu_all = np.array(0,dtype=np.float64)
    minv_all = np.array(0,dtype=np.float64)
    minw_all = np.array(0,dtype=np.float64)
    comm.Allreduce(maxu, maxu_all, op=MPI.MAX)
    comm.Allreduce(maxv, maxv_all, op=MPI.MAX)
    comm.Allreduce(maxw, maxw_all, op=MPI.MAX)
    comm.Allreduce(minu, minu_all, op=MPI.MIN)
    comm.Allreduce(minv, minv_all, op=MPI.MIN)
    comm.Allreduce(minw, minw_all, op=MPI.MIN)
    ming = min(minu_all,minv_all)
    maxg = max(maxu_all,maxv_all)
    minw = minw_all
    maxw = maxw_all
    ming = ming-offset*ming
    maxg = maxg+offset*maxg
else:
    ming = min(minu,minv)
    maxg = max(maxu,maxv)
    ming = ming-offset*ming
    maxg = maxg+offset*maxg

print(maxu,maxv,maxg)
#uu_ser = (uu_ser-ming)/(maxg-ming)
#vv_ser = (vv_ser-ming)/(maxg-ming)
uu_ser = (uu_ser+maxg)/(2*maxg)      # map u,v from [-maxg,maxg] to [0,1]
vv_ser = (vv_ser+maxg)/(2*maxg)
ww_ser = (ww_ser-minw)/(maxw-minw)   # map w from [minw,maxw] to [0,1]
#print(uu_ser.shape, vv_ser.dtype, ww_ser.dtype, vis_ser_real.shape, vis_ser_img.dtype, weight_ser.dtype, grid.dtype)
print(np.amin(uu_ser),np.amax(uu_ser))
print(np.amin(vv_ser),np.amax(vv_ser))
print(np.amin(ww_ser),np.amax(ww_ser))

# set normalized uvw - mesh conversion factors
dx = 1.0/grid_size
dw = 1.0/num_w_planes

readtime1 = time.time()

if rank == 0:
    outfile = outpath+ufile
    uu_ser.tofile(outfile,sep='')
    outfile = outpath+vfile
    vv_ser.tofile(outfile,sep='')
    outfile = outpath+wfile
    ww_ser.tofile(outfile,sep='')
    outfile = outpath+weights
    weight_ser.tofile(outfile,sep='')
    outfile = outpath+visrealfile
    vis_ser_real.tofile(outfile,sep='')
    outfile = outpath+visimgfile
    vis_ser_img.tofile(outfile,sep='')

    # plain-text metadata describing the binary files
    outfile = outpath+metafile
    f = open(outfile, 'w')
    f.writelines(str(uu_ser.size)+"\n")
    f.writelines(str(vis_ser_real.size)+"\n")
    f.writelines(str(vis.shape[1])+"\n")
    f.writelines(str(vis.shape[2])+"\n")
    f.writelines(str(Ntime)+"\n")
    f.writelines(str(timepersample)+"\n")
    f.writelines(str(timepersample*Ntime/3600)+"\n")
    f.writelines(str(Nbaselines)+"\n")
    f.writelines(str(ming)+"\n")
    f.writelines(str(maxg)+"\n")
    f.writelines(str(minw)+"\n")
    f.writelines(str(maxw)+"\n")
    f.close()
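The script converts a casacore MeasurementSet into the flat binMS layout: one raw binary file per column, with normalized u,v,w coordinates, plus a plain-text meta.txt. A hedged usage sketch, reusing the dataset name from the commented-out example path in the script and the input/output roots hard-coded there:

# The script expects the MS under /data/Lofar-Luca/results/<name>.ms/ and
# creates /data/gridding/data/Lofarbig/<name>.binMS/ (os.mkdir fails if it exists).
python create_binMS.py L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal
ls /data/gridding/data/Lofarbig/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/
# expected contents: ucoord.bin vcoord.bin wcoord.bin weights.bin
#                    visibilities_real.bin visibilities_img.bin meta.txt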