Commit 3cc43560 authored by Fabio Roberto Vitello

fly5.0.1

#SHELL=/bin/sh
OBJS= fly_h.o fly.o null.o error.o \
sys_init.o read_params.o read_redsh.o \
read_b_asc.o read_b_bin.o init_pars.o init_ew.o \
init_pos.o reset_pos.o inpar_dyn.o step.o step_force.o \
tree_build.o tree_sort.o \
tree_gen.o find_group.o \
cell_prop.o acc_comp.o acc_ex.o \
ilist_group.o force_group.o force.o ilist.o upd_pos.o upd_vel.o \
wr_native.o write_b_asc.o write_b_bin.o out_32.o leapf_corr.o \
io.o dt_comp.o read_s_bin.o
# Tunable parameters
#
# CF        Name of the Fortran compiler to use
# CC        Name of the C compiler to use
# FFLAGS    Flags to the Fortran compiler
# LDFLAGS   Flags to the loader
# LIBS      List of libraries
# CMD       Name of the executable
# PROFLIB   Library needed for profiling
#
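# As a usage sketch (the values here are illustrative, not part of this
# build), any of these tunables can also be overridden on the command
# line without editing this file, e.g.:
#   make CF=mpif90 FFLAGS="-O2" CMD=FLY_test
#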
# CF = ifort
# CF = /opt/share/mvapich2_Working_4GB/bin/mpif90
CF = mpif90
# CF = pgf90
# CF = /opt/share/mpich2/bin/mpif90
#CF = /opt/share/pgi/linux86/6.2/bin/pgf90
# CC = pgcc
CC = mpicc
# LIBS =-L/opt/share/mvapich2/lib -lmpich -lfmpich -lmpichf90 -lmpichcxx -L.
# LIBS =-L/opt/share/mpich2_pgi_01-02/lib -lmpich -lfmpich -lmpichf90 -lmpichcxx
# LDFLAGS= -L/opt/share/intel/fc/9.1.036/lib -L/opt/share/intel/cc/9.1.042/lib -L/opt/share/mpich2/lib
# LDFLAGS= -L/opt/share/mvapich2_Working_4GB/lib
# LDFLAGS= -L/opt/share/mvapich2/lib
# LDFLAGS= -L/opt/share/intel/fc/9.1.036/lib -L/opt/share/intel/cc/9.1.042/lib -L/opt/share/mvapich2/lib
# LDFLAGS= -L/opt/share/mpich2_pgi_01-02/lib
# CMD = ../bin_16mlu/FLY_mpif90_1 # compiled with mpich2 -> mpich2_intel_gb/
# CMD = ../bin_16mlu/FLY_mpif90_1_1 # compiled with mpich2 -> mpich2_intel_gb/, ifort compiler invoked directly
# CMD = ../bin_test/FLY_mpif90_2 # compiled with mpich2-31-01/
# CMD = ../bin_test/FLY_mpif90_3 # compiled with mpich2_pgi_01-02
# CMD = ../bin_test/FLY_mpif90_3_1 # compiled with mpich2_pgi_01-02 -fastsse
# -O3 -fastsse -Mprefetch=w -Msmart -tp k8-32 -Mipa=fast -Minline
# CMD = ../bin_test/FLY_mpif90_3_2 # compiled with mpich2_pgi_01-02 -fastsse
# CMD = ../bin_16mlu/FLY_mpif90_3 # compiled with mpich2_pgi_01-02
# CMD = ../bin_16mlu/FLY_mpi_ifort # compiled with mpich2 + ifort
# CMD = /gpfs/FLY/bin_big_40/FLY_mpi # compiled with mpif90, new version
# CMD = /gpfs/FLY/bin_big/FLY_mpi_file # compiled with acc_comp_file
# CMD = /home01/ube/FLY_4/bin_2m/FLY_mpi4 # compiled with acc_comp_noall
# CMD = /gpfs/FLY/bin_16mlu/FLY_mpi # compiled with acc_comp_noall
# CMD = /gpfs/FLY/bin_big/FLY_mpi_simp # compiled with acc_comp_nodlb_gr
CMD= FLY_mpi
# CMD = ../bin_test/FLY_mpif90_4 # compiled with mvapich2_try4
# CMD = ../bin_test/FLY_mpif90_5 # compiled with mvapich2.tcp
# CMD = ../bin_test/FLY_mpif90_6 # compiled with mvapich2.vapi
# CMD = ../bin_test/FLY_mpif90_7 # compiled with mvapich2_pgi_01-02
# CMD = /gpfs/FLY/bin_big/FLY_mpi
# FFLAGS = -quiet -O0 -r8 -w95 -I/opt/share/mpich2/include/ -I/usr/local/include
# FFLAGS = -O3 -w95 -r8 -I/opt/share/mpich2/include/ -I/usr/local/include -I/usr/include
# FFLAGS = -O3 -I/opt/share/mpich2/include/ -I/usr/local/include -I/usr/include
# FFLAGS = -O3 -fast -Mprefetch=w -Msmart -tp amd64 -Mipa=fast -Minline
# FFLAGS = -O0 -check all
FFLAGS = -O3
# FFLAGS = "-mcmodel=medium"
# FFLAGS = -O3 -I/opt/share/mpich2/include/
# FFLAGS = -O3 -I/opt/share/mvapich2/include/ -I/usr/include -I/usr/local/topspin/include -I/usr/local/topspin/include/vapi
#FFLAGS = -O3 -Bstatic -I./include
#FFLAGS = -fast -O3 -I./include
#FFLAGS = -fastsse -Mprefetch -tp amd64e -Mprefetch=w -Msmart -Mipa=fast -Minline **HERE
#FFLAGS1 = -O0 -w95 -r8 -I/opt/share/mpich2/include/ -I/usr/local/include -I/usr/include
#FFLAGS1 = -O0 -I/opt/share/mvapich2/include/ -I/usr/include -I/usr/local/topspin/include -I/usr/local/topspin/include/vapi
#FFLAGS1 = -O0 -Bstatic -I./include
#FFLAGS1 = -fast -O0 -I./include
#####FFLAGS1 = -O0 -I/opt/share/mpich2/include/ -I/usr/local/include -I/usr/include
# FFLAGS1 = -O3
#FFLAGS1 = "-mcmodel=medium"
CFLAGS = -O3
LD = $(CF)
# Lines from here on down should not need to be changed. They are the
# actual rules which make uses to build a.out.
#
all: $(CMD)
.SUFFIXES :
.SUFFIXES : .o .F90 .c
$(OBJS): fly_h.F90
#read_b_bin.o : read_b_bin.F90 $(CF) $(FFLAGS1) -c $<
read_b_bin.o : read_b_bin.F90
	$(CF) $(FFLAGS1) -c $<
read_redsh.o : read_redsh.F90
	$(CF) $(FFLAGS1) -c $<
.F90.o :
	$(CF) $(FFLAGS) -c $<
.c.o :
	$(CC) $(CFLAGS) -c $<
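# Note: under GNU make the two suffix rules above could equivalently be
# written as pattern rules, e.g. (a sketch, not used by this build):
#   %.o : %.F90
#   	$(CF) $(FFLAGS) -c $<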
$(CMD): $(OBJS)
	$(LD) $(LDFLAGS) -o $(@) $(OBJS) $(LIBS)
# Make the profiled version of the command and call it a.out.prof
#
$(CMD).prof: $(OBJS)
	$(CF) $(LDFLAGS) -o $(@) $(OBJS) $(PROFLIB) $(LIBS)
clean:
	-rm -f $(OBJS)
clobber: clean
	-rm -f $(CMD) $(CMD).prof
void: clobber
	-rm -f $(SRCS) makefile
PROGRAM TestOMP
USE omp_lib
implicit none
INCLUDE 'mpif.h'
! Declaration of local variables.
! -------------------------------
INTEGER(KIND=4), DIMENSION (:) , ALLOCATABLE ::isub
INTEGER :: TID, NTID
INTEGER:: status
INTEGER :: ierr,me,npes, ierror,lname
INTEGER :: asss,bsss,csss,dsss,esss,fsss,gsss,hsss
character*(MPI_MAX_PROCESSOR_NAME) hostname_me
!-----------------------------------------------------------------------
CALL MPI_INIT(ierror)
CALL MPI_COMM_RANK(MPI_COMM_WORLD, me, ierror)
CALL MPI_COMM_SIZE(MPI_COMM_WORLD, npes, ierror)
CALL MPI_GET_PROCESSOR_NAME(hostname_me, lname, ierror)
WRITE(6, *) "TestOMP - RUN. PE=",me," HOSTNAME:",hostname_me(1:lname),' npes=',npes
NTID = 1
NTID = OMP_GET_MAX_THREADS()
!$OMP PARALLEL PRIVATE(isub,TID,status) &
!$OMP PRIVATE(asss,bsss,csss,esss,hsss)
!---!$OMP PRIVATE(dsss,esss,fsss) &
!---!$OMP PRIVATE(gsss,hsss)
ALLOCATE(isub(8), STAT=status)
!-----------------------------------------------------------------------
! Each thread counts how many subcell indices it generates in pcg
!-----------------------------------------------------------------------
TID = 0
TID = OMP_GET_THREAD_NUM()
asss=TID
esss=TID
hsss=-1 ! default, since hsss is assigned below only when tid < 5
if(tid.lt.5) hsss=TID
isub(1:8)=TID
write(6,*)'PE=',me,"TID=",TID, asss,esss,hsss
DEALLOCATE(isub)
!$OMP END PARALLEL
call MPI_FINALIZE(ierror)
STOP
END
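!-----------------------------------------------------------------------
! A build/run sketch for the hybrid MPI+OpenMP test above (the file
! name, flag spelling, process and thread counts are illustrative
! assumptions, not taken from this repository):
!   mpif90 -fopenmp testomp.F90 -o testomp
!   OMP_NUM_THREADS=4 mpirun -np 2 ./testomp
! Each MPI rank then prints one line per OpenMP thread.
!-----------------------------------------------------------------------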
!-----------------------------------------------------------------------
!TEST: performs local grouping only
!
SUBROUTINE acc_comp(option)
!
!
!-----------------------------------------------------------------------
!
!
! Subroutine to compute the gravitational acceleration for all of
! the bodies. Vectorization is achieved by processing all of the
! cells at a given level in the tree simultaneously. The local
! variable option indicates whether the code is to compute the
! potential and/or acceleration.
!
! local_wg_bod is the number of clock cycles needed for a PE-resident
! body having nterms=1
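!
! A call sketch (the literal option value is an assumption for
! illustration; the accepted values are defined by the callers
! elsewhere in the code):
!   CALL acc_comp('acc ')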
!=======================================================================
USE fly_h
!$ USE omp_lib
implicit none
INCLUDE 'mpif.h'
! Declaration of local variables.
! -------------------------------
INTEGER :: TID, NTID,status
INTEGER :: n, q, i
INTEGER(KIND=4) :: ele, nterms, nterms_gr, bcount_ele, j, p, uno
INTEGER(KIND=4) :: mio_ele
INTEGER(KIND=4), DIMENSION (:), ALLOCATABLE :: iterms,iterms_gr
REAL(KIND=8), DIMENSION (:), ALLOCATABLE :: pmass,pmass_gr
REAL(KIND=8), DIMENSION (:), ALLOCATABLE :: drdotdr,dx,dy,dz
REAL(KIND=8), DIMENSION (:), ALLOCATABLE:: drdotdr_gr,dx_gr,dy_gr,dz_gr
REAL(KIND=8), DIMENSION (:,:), ALLOCATABLE ::pquad,pquad_gr
REAL(KIND=8), DIMENSION(:), ALLOCATABLE :: acc_g
REAL(KIND=8), DIMENSION (:), ALLOCATABLE::pos_comm
INTEGER :: count_par
REAL(KIND=8) :: c1a, c2a, c2b, c2c, ctwg, cgs_g, cgs_b, cg1,cg2
REAL(KIND=8) ::cpar_a, cpar_b
CHARACTER(LEN=4) :: option
!=======================================================================
!=======================================================================
! Initialize the interaction list diagnostics.
! --------------------------------------------
!***************************************************
!
! m_sh, max_sh, max_pr_bal
! are computed by load_balance once before the iterations
!
!**************************************************
uno=1
group_access=0
nterms=0
nterms_gr=0
bcount_ele=0
mark_bod_gr=0
ctot_TW=0
ctot_GS_nt=0
!=======================================================================
! GROUPING SECTION
!=======================================================================
! We obtain the force on the bodies of a grouping cell as the sum of
! two components. The first component is the same for all the bodies
! of the grouping cell and is due to the cells and the bodies outside
! the grouping cell. The second component differs from body to body
! and is due to the interactions among the bodies of the grouping
! cell.
!=======================================================================
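!-----------------------------------------------------------------------
! In symbols, for a body i in grouping cell C (the notation is ours,
! for illustration):
!   F(i) = F_far(C) + F_near(i)
! where F_far(C) is evaluated once per grouping cell from the
! interaction list outside C, and F_near(i) sums the direct body-body
! terms inside C.
!-----------------------------------------------------------------------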
ctwg=0
cgs_g=0
cgs_b=0
!-------------------------------------------------------
! count_group_arr(#PE)=cg_loc of the #PE
! ilocal_work_gr: the number of local grouping cells resolved by this PE
! ilocal_save: the maximum number of local grouping cells resolved by each PE
! grouping_rmt: receives the grouping cells not locally resolved by the remote PE
! iblk: used for the atomic update of grouping cells that are not locally resolved
! no_dlb_space: if set to 1 there is no free space to execute the dlb section of the grouping
!-------------------------------------------------------
IF(me .EQ. 0) write(uterm,*)'Grouping section started'
cg1=MPI_WTIME()
!-----------------------------------------------------------------------
! Analysis of the grouping cells in PE=ix_gr
! ix_gr starts from the local PE and cycles over all the PEs
! grouping_rmt is loaded with the grouping cells of a remote PE
!-----------------------------------------------------------------------
count_par=0
cpar_a=MPI_WTIME()
NTID = 1
!$ NTID = OMP_GET_MAX_THREADS()
!$OMP PARALLEL PRIVATE(mio_ele, ele,count_par,acc_g,nterms) &
!$OMP PRIVATE(nterms_gr, bcount_ele,j,q,NTID,TID,status) &
!$OMP PRIVATE(p,iterms,iterms_gr,pmass,pmass_gr) &
!$OMP PRIVATE(drdotdr,dx,dy,dz) &
!$OMP PRIVATE(drdotdr_gr,dx_gr,dy_gr,dz_gr,pquad,pquad_gr,pos_comm)
ALLOCATE(iterms(maxnterm), STAT=status)
ALLOCATE(iterms_gr(maxnterm), STAT=status)
ALLOCATE(pmass(maxnterm), STAT=status)
ALLOCATE(pmass_gr(maxnterm), STAT=status)
ALLOCATE(drdotdr(maxnterm), STAT=status)
ALLOCATE(dx(maxnterm), STAT=status)
ALLOCATE(dy(maxnterm), STAT=status)
ALLOCATE(dz(maxnterm), STAT=status)
ALLOCATE(drdotdr_gr(maxnterm), STAT=status)
ALLOCATE(dx_gr(maxnterm), STAT=status)
ALLOCATE(dy_gr(maxnterm), STAT=status)
ALLOCATE(dz_gr(maxnterm), STAT=status)
ALLOCATE(pquad(2*ndim-1,maxnterm), STAT=status)
ALLOCATE(pquad_gr(2*ndim-1,maxnterm), STAT=status)
ALLOCATE(acc_g(ndim), STAT=status)
ALLOCATE(pos_comm(ndim), STAT=status)
TID = 0
!$ TID = OMP_GET_THREAD_NUM()
!$OMP DO
DO mio_ele=1,cg_loc
!-------------------------------------------------------------------------
! iblk2 is an atomically updated array that contains the number of
! cells already computed on the PE.
! mio_ele contains the number of the element to be processed: locally
! from 1 to ilocal_work_gr, and remotely (or locally for shared
! gr-cells) as computed from the atomically updated iblk2 array
!-------------------------------------------------------------------------
!-------------------------------------------------------------------------
! ele contains the number of the gr-cell to be elaborated: a local or
! a remote cell stored in the grouping_rmt array
!-------------------------------------------------------------------------
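! A minimal sketch of the atomic-counter pattern described above
! (the names here are illustrative, not the actual FLY variables):
!   !$OMP ATOMIC CAPTURE
!   next_cell = iblk2_ctr
!   iblk2_ctr = iblk2_ctr + 1
!   !$OMP END ATOMIC
! so that each thread fetches a unique cell index to process.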
ele=grouping(mio_ele)
count_par=count_par+1
acc_g=0
! Forming the interaction lists.
! ----------------------------
CALL ilist_group (ele,nterms,nterms_gr,bcount_ele,iterms,iterms_gr,&
pmass, pmass_gr, pquad,pquad_gr,drdotdr_gr,dx_gr,dy_gr,dz_gr)
!
! Compute potential and/or acceleration.
! --------------------------------------
!
!
! Computation of the pot. and F_far of a grouping cell
! --------------------------------------------------
!
CALL force_group(ele,nterms_gr,iterms_gr,drdotdr_gr,dx_gr,dy_gr,dz_gr,pmass_gr,acc_g,pquad_gr,option,TID)
!-----------------------------------------------------------------------
! Computation of the pot., F_near and F_tot of each body in the
! grouping cell.
! Mark to 1 (locally, or remotely with a PUT operation) the flag
! (mark_bod_gr) of each body that is computed in this section
!-----------------------------------------------------------------------
DO q=nterms-bcount_ele+1,nterms
j=iterms(q)
mark_bod_gr(j)=uno
CALL force(j,nterms,iterms,pos_comm,dx,dy,dz,drdotdr,pmass,pquad,acc_g,option)
ENDDO !q=nterms-bcount_ele+1,nterms
ENDDO ! mio_ele=1,cg_loc
!$OMP END DO
1100 FORMAT(a,i3,3(a,i9))
1200 FORMAT(a,f5.2,a,f5.2)
!$OMP BARRIER
IF(me.eq.0 .and. TID.eq.0) THEN
cg2=MPI_WTIME()
ctwg=ctwg+(cg2-cg1)
write(uterm,1000)'GROUPING: PE=',me,' TIME=',ctwg,' Tot gr-cells=',cg_loc
call flush(uterm)
ENDIF
1000 FORMAT(a,i3,1(a,g15.4))
!-----------------------------------------------------------------------
! LOCAL FORCE COMPUTATION
!-----------------------------------------------------------------------
! In this section each PE computes the force for the subset of the
! local bodies that was not computed in the grouping part
!-----------------------------------------------------------------------
group_access=1 ! ungrouped flag
IF(TID.eq.0) c2a=MPI_WTIME()
count_par=0
!$OMP DO
DO 100 p=1,nb_res_loc(me+1)
!-----------------------------------------------------------------------
! Forming the interaction lists.
! p is the logical number of body
!-----------------------------------------------------------------------
IF(mark_bod_gr(p).ge.1) CYCLE ! skip this particle. It was already computed in the grouping section
count_par=count_par+1
! numbod_100=numbod_100+1
CALL ilist(p,nterms,iterms,pos_comm,pmass, drdotdr,dx,dy,dz,pquad)
!-----------------------------------------------------------------------
! Compute potential and the Force.
!-----------------------------------------------------------------------
CALL force(p,nterms,iterms,pos_comm,dx,dy,dz,drdotdr,pmass,pquad,acc_g,option)
100 CONTINUE
!$OMP END DO
DEALLOCATE(drdotdr)
DEALLOCATE(dx)
DEALLOCATE(dy)
DEALLOCATE(dz)
DEALLOCATE(drdotdr_gr)
DEALLOCATE(dx_gr)
DEALLOCATE(dy_gr)
DEALLOCATE(dz_gr)
DEALLOCATE(iterms)
DEALLOCATE(iterms_gr)
DEALLOCATE(pmass)
DEALLOCATE(pmass_gr)
DEALLOCATE(pquad)
DEALLOCATE(pquad_gr)
DEALLOCATE(acc_g)
DEALLOCATE(pos_comm)
!$OMP END PARALLEL
c2b=MPI_WTIME()
ctot_TW=ctot_TW+(c2b-c2a)
CALL MPI_BARRIER(MPI_COMM_WORLD,ierror)
RETURN
END
!-----------------------------------------------------------------------
!TEST: performs local grouping only
!
SUBROUTINE acc_comp(option)
!
!
!-----------------------------------------------------------------------
!
!
! Subroutine to compute the gravitational acceleration for all of
! the bodies. Vectorization is achieved by processing all of the
! cells at a given level in the tree simultaneously. The local
! variable option indicates whether the code is to compute the
! potential and/or acceleration.
!
! local_wg_bod is the number of clock cycles needed for a PE-resident
! body having nterms=1
!=======================================================================
USE fly_h
!$ USE omp_lib
implicit none
INCLUDE 'mpif.h'
! Declaration of local variables.
! -------------------------------
INTEGER :: TID, NTID, N_LOC_ELE, status
INTEGER :: n, q
INTEGER(KIND=4) :: ele, nterms, nterms_gr, bcount_ele, j, p, uno
INTEGER(KIND=4) :: mio_ele
INTEGER(KIND=4), DIMENSION (:), ALLOCATABLE :: iterms,iterms_gr
REAL(KIND=8), DIMENSION (:), ALLOCATABLE :: pmass,pmass_gr
REAL(KIND=8), DIMENSION (:), ALLOCATABLE :: drdotdr,dx,dy,dz
REAL(KIND=8), DIMENSION (:), ALLOCATABLE:: drdotdr_gr,dx_gr,dy_gr,dz_gr
REAL(KIND=8), DIMENSION (:,:), ALLOCATABLE ::pquad,pquad_gr
REAL(KIND=8), DIMENSION(:), ALLOCATABLE :: acc_g
REAL(KIND=8), DIMENSION (:), ALLOCATABLE::pos_comm
INTEGER :: count_par
REAL(KIND=8) ::c0a, c1a, c2a, c2b, c2c, ctwg, cgs_g, cgs_b, cg1,cg2
REAL(KIND=8) ::cpar_a, cpar_b
CHARACTER(LEN=4) :: option
!=======================================================================
!=======================================================================
! Initialize the interaction list diagnostics.
! --------------------------------------------
!***************************************************
!
! m_sh, max_sh, max_pr_bal
! are computed by load_balance once before the iterations
!
!**************************************************
uno=1
! numbod=0
! numbod_100=0
group_access=0
nterms=0
nterms_gr=0
bcount_ele=0
mark_bod_gr=0
ctot_TW=0
ctot_GS_nt=0
mio_ele = 0
!=======================================================================
! GROUPING SECTION
!=======================================================================
! We obtain the force on the bodies of a grouping cell as the sum of
! two components. The first component is the same for all the bodies
! of the grouping cell and is due to the cells and the bodies outside
! the grouping cell. The second component differs from body to body
! and is due to the interactions among the bodies of the grouping
! cell.
!=======================================================================
ctwg=0
cgs_g=0
cgs_b=0
!-------------------------------------------------------
! count_group_arr(#PE)=cg_loc of the #PE
! ilocal_work_gr: the number of local grouping cells resolved by this PE
! ilocal_save: the maximum number of local grouping cells resolved by each PE
! grouping_rmt: receives the grouping cells not locally resolved by the remote PE
! iblk: used for the atomic update of grouping cells that are not locally resolved
! no_dlb_space: if set to 1 there is no free space to execute the dlb section of the grouping
!-------------------------------------------------------
IF(me.eq.0) write(uterm,*)'Grouping section started'
cg1=MPI_WTIME()
!-----------------------------------------------------------------------
! Analysis of the grouping cells in PE=ix_gr
! ix_gr starts from the local PE and cycles over all the PEs
! grouping_rmt is loaded with the grouping cells of a remote PE
!-----------------------------------------------------------------------
count_par=0
cpar_a=MPI_WTIME()
NTID = 1
!$ NTID = OMP_GET_MAX_THREADS()
!!! CHECK HERE WHETHER THE & CONTINUATION WORKS !!!
!$OMP PARALLEL PRIVATE(mio_ele, ele,count_par,acc_g,nterms) &
!$OMP PRIVATE(nterms_gr, bcount_ele,j,q,NTID,TID,status) &
!$OMP PRIVATE(N_LOC_ELE,iterms,iterms_gr,pmass,pmass_gr) &
!$OMP PRIVATE(drdotdr,dx,dy,dz) &
!$OMP PRIVATE(drdotdr_gr,dx_gr,dy_gr,dz_gr,pquad,pquad_gr,pos_comm)
ALLOCATE(iterms(maxnterm), STAT=status)
ALLOCATE(iterms_gr(maxnterm), STAT=status)
ALLOCATE(pmass(maxnterm), STAT=status)
ALLOCATE(pmass_gr(maxnterm), STAT=status)
ALLOCATE(drdotdr(maxnterm), STAT=status)
ALLOCATE(dx(maxnterm), STAT=status)