#ifdef OMP #include #endif #include #include #include "mpi.h" //#include "pardef.h" #include "util.h" void aprod(int mode, long int m, long int n, double *vVect, double *knownTerms, double *systemMatrix, long int *matrixIndex, int *instrCol, int *instrConstrIlung, struct comData comlsqr, time_t *ompSec) { // Parallel definitions int myid, nproc; long int *mapNoss, *mapNcoeff; int nthreads, tid, ntasks; long **mapForThread; /// struct comData *comlsqr; // FILE *fk, *fk0; double zero = 0.0; double sum, yi; long int l1, l2; long int i, i1; long int l, j, k; int i2 = 0, j2 = 0, j3 = 0, na = 0; int setBound[4]; double localSum; short nAstroPSolved = comlsqr.nAstroPSolved; long localAstro = comlsqr.VrIdAstroPDim * nAstroPSolved; long localAstroMax = comlsqr.VrIdAstroPDimMax * nAstroPSolved; // Initialize. myid = comlsqr.myid; nproc = comlsqr.nproc; mapNcoeff = comlsqr.mapNcoeff; mapNoss = comlsqr.mapNoss; nthreads = comlsqr.nthreads; ntasks= comlsqr.ntasks; mapForThread = comlsqr.mapForThread; int multMI = comlsqr.multMI; long nparam = comlsqr.parOss; short nAttAxes = comlsqr.nAttAxes; int numOfExtStar = comlsqr.numOfExtStar; int numOfBarStar = comlsqr.numOfBarStar; int numOfExtAttCol = comlsqr.numOfExtAttCol; long VrIdAstroPDimMax = comlsqr.VrIdAstroPDimMax; int startingAttColExtConstr = comlsqr.startingAttColExtConstr; int nOfElextObs = comlsqr.nOfElextObs; int nEqExtConstr = comlsqr.nEqExtConstr; int nOfElBarObs = comlsqr.nOfElBarObs; int nEqBarConstr = comlsqr.nEqBarConstr; int debugMode = comlsqr.debugMode; short nInstrPSolved = comlsqr.nInstrPSolved; int nOfInstrConstr = comlsqr.nOfInstrConstr; int nElemIC = comlsqr.nElemIC; short nAttP = comlsqr.nAttP; short nGlobP = comlsqr.nGlobP; setBound[0] = comlsqr.setBound[0]; setBound[1] = comlsqr.setBound[1]; setBound[2] = comlsqr.setBound[2]; setBound[3] = comlsqr.setBound[3]; long nDegFreedomAtt = comlsqr.nDegFreedomAtt; short nAttParAxis = comlsqr.nAttParAxis; long offsetAttParam = comlsqr.offsetAttParam; long offsetInstrParam = comlsqr.offsetInstrParam; long offsetGlobParam = comlsqr.offsetGlobParam; /* for(int n=0;nmapForThread[n]=(long *) calloc(3,sizeof(long)); */ //nthreads = 1; tid = 0; FILE *fp1, *fp2; // fp1=fopen("test1_aprod","w"); // fp2=fopen("test2_aprod","w"); if (mode != 1 && mode != 2) { printf("ERROR: Invalid mode=%d in aprod function\n", mode); exit(1); } l1 = 0; l2 = 0; myid = comlsqr.myid; if (mode == 1) { time_t startTime = time(NULL); //// #pragma omp parallel private(myid, sum, k, l1, l2, l, j, tid, nthreads, i2, na) shared(mapNoss, instrCol, comlsqr, vVect, systemMatrix, matrixIndex, knownTerms, j2) { myid = comlsqr.myid; if (comlsqr.itn == 1 && debugMode) printf("PE=%d Aprod1 OpenMP num of threads =%d from thread =%d icycle=%ld comlsqr.itn=%d\n", myid, nthreads, tid, i, comlsqr.itn); long miValAstro = 0; long miValAtt = 0; long jstartAtt = 0; long jstartAstro = 0; long lset = 0; long offLocalAstro = 0; long offLocalAtt = 0; long offLocalInstr = 0; //Offset on Instruments long ixInstr = 0; int nInstrVal = 0; offLocalInstr = offsetInstrParam + (localAstroMax - offsetAttParam); //Offset on Instruments nInstrVal = nAstroPSolved + nAttP; offLocalAstro = comlsqr.mapStar[myid][0] * nAstroPSolved; //Offset on my mstars offLocalAtt = localAstroMax - offsetAttParam; //Offset on attitude long offLocalGlob = offsetGlobParam + (localAstroMax - offsetAttParam); //Offset on GlobP int nGlobVal = nAstroPSolved + nAttP + nInstrPSolved; jstartAstro = miValAstro - offLocalAstro; //FV_ EDIT ompSs for(int nt=0; nt < ntasks; nt++ ) { #pragma omp task label(mode1) { for (long ix = mapForThread[nt][0]; ix < mapForThread[nt][2]; ix++) { sum = 0.; ///////////////////////////////////////////////////// /// Mode 1 Astrometric Sect if (nAstroPSolved) { lset = ix * nparam; if (matrixIndex[multMI * ix] != miValAstro) { miValAstro = matrixIndex[multMI * ix]; jstartAstro = miValAstro - offLocalAstro; } for (long jx = jstartAstro; jx < jstartAstro + nAstroPSolved; jx++) { sum = sum + systemMatrix[lset] * vVect[jx]; lset++; } } ////////////////////////////////////////////////////// /// Mode 1 Attitude Sect if (nAttP) { lset = ix * nparam + nAstroPSolved; miValAtt = matrixIndex[multMI * ix + (multMI - 1)]; for (int nax = 0; nax < nAttAxes; nax++) { jstartAtt = miValAtt + offLocalAtt + nax * nDegFreedomAtt; for (long inpax = jstartAtt; inpax < jstartAtt + nAttParAxis; inpax++) { sum = sum + systemMatrix[lset] * vVect[inpax]; lset++; } } } ////////////////////////////////////////////////////// /// Mode 1 Instrument Sect if (nInstrPSolved) { lset = ix * nparam + nInstrVal; long iiVal = ix * nInstrPSolved; for (int inInstr = 0; inInstr < nInstrPSolved; inInstr++) { ixInstr = offLocalInstr + instrCol[iiVal + inInstr]; sum = sum + systemMatrix[lset] * vVect[ixInstr]; lset++; } } ////////////////////////////////////////////////////// /// Mode 1 Global sect if (nGlobP) { lset = ix * nparam + nGlobVal; for (long inGlob = offLocalGlob; inGlob < offLocalGlob + nGlobP; inGlob++) { sum = sum + systemMatrix[lset] * vVect[inGlob]; lset++; } } ////////////////////////////////////////////////////// knownTerms[ix] += sum; } //for ix } } /// Mode 1 ExtConstr if (nEqExtConstr) { long offExtAtt; long offExtAttConstr = VrIdAstroPDimMax * nAstroPSolved + startingAttColExtConstr; long vVIx; long ktIx = mapNoss[myid]; long offExtConstr; for (int iexc = 0; iexc < nEqExtConstr; iexc++) { sum = 0.0; offExtConstr = mapNcoeff[myid] + iexc * nOfElextObs; //FV_ EDIT ompSs //#pragma omp for for (int j3 = 0; j3 < numOfExtStar * nAstroPSolved; j3++) sum += systemMatrix[offExtConstr + j3] * vVect[j3]; for (int nax = 0; nax < nAttAxes; nax++) { offExtAtt = offExtConstr + numOfExtStar * nAstroPSolved + nax * numOfExtAttCol; vVIx = offExtAttConstr + nax * nDegFreedomAtt; //FV_ EDIT ompSs //#pragma omp for for (int j3 = 0; j3 < numOfExtAttCol; j3++) sum += systemMatrix[offExtAtt + j3] * vVect[vVIx + j3]; } //FV_ EDIT ompSs //#pragma omp atomic knownTerms[ktIx + iexc] += sum; } //for iexc } ////////////////////////////////////////////////////// /// Mode 1 BarConstr if (nEqBarConstr) { long offBarConstr = mapNcoeff[myid] + nOfElextObs * nEqExtConstr; long offBarConstrIx; long ktIx = mapNoss[myid] + nEqExtConstr; for (int iexc = 0; iexc < nEqBarConstr; iexc++) { sum = 0.0; offBarConstrIx = offBarConstr + iexc * nOfElBarObs; //FV_ EDIT ompSs //#pragma omp for for (int j3 = 0; j3 < numOfBarStar * nAstroPSolved; j3++) sum += systemMatrix[offBarConstrIx + j3] * vVect[j3]; //FV_ EDIT ompSs //#pragma omp atomic knownTerms[ktIx + iexc] += sum; } //for iexc } ////////////////////////////////////////////////////// /// Mode 1 InstrConstr if (nOfInstrConstr) { long offSetInstr = 0; long offSetInstrInc = 0; long offSetInstrInc1 = 0; long vVix = 0; long offvV = 0; long offSetInstrConstr = mapNcoeff[myid] + nOfElextObs * nEqExtConstr + nOfElBarObs * nEqBarConstr; long offSetInstrConstr1 = VrIdAstroPDimMax * nAstroPSolved + nDegFreedomAtt * nAttAxes; long ktIx = mapNoss[myid] + nEqExtConstr + nEqBarConstr; for (int i1 = myid; i1 < nOfInstrConstr; i1 = i1 + nproc) { sum = 0.0; offSetInstrInc = offSetInstrConstr; offSetInstr = 0; for (int m = 0; m < i1; m++) { offSetInstrInc += instrConstrIlung[m]; offSetInstr += instrConstrIlung[m]; } offvV = mapNoss[myid] * nInstrPSolved + offSetInstr; //FV_ EDIT ompSs //#pragma omp for for (int j3 = 0; j3 < instrConstrIlung[i1]; j3++) { vVix = instrCol[offvV + j3]; sum += systemMatrix[offSetInstrInc + j3] * vVect[offSetInstrConstr1 + vVix]; } //FV_ EDIT ompSs //#pragma omp atomic knownTerms[ktIx + i1] += sum; } } ////////////////////////////////////////////////////// } //pragma *ompSec += time(NULL) - startTime; if (comlsqr.itn <= 2 && (myid == 0 || myid == nproc - 1 || debugMode == 1)) printf("PE=%d AprodTiming: mode=1 OmpSec=%ld\n", myid, time(NULL) - startTime); #pragma omp taskwait } else //mode==2 { //if(mode time_t startTime = time(NULL); int count = 0; myid = comlsqr.myid; printf("vVect[1]: %f:\n", vVect[1] ); //qui lavorare /* #ifdef OMP tid = omp_get_thread_num(); nthreads = omp_get_num_threads(); #endif */ ///////////////////////////////////////////////////// /// Mode 2 Astrometric Sect if (nAstroPSolved) { long offLocalAstro = comlsqr.mapStar[myid][0] * nAstroPSolved; // for (long ix = mapForThread[tid][0]; ix < mapForThread[tid][2]; ix++) long lset; long jstartAstro; for(int nt=0; nt < ntasks; nt++ ) { #pragma omp target device (cuda) copy_deps #pragma omp task label(vVect_nAstroPSolved) //out(vVect[jstartAstro : jstartAstro + nAstroPSolved-1]) { lset = mapForThread[nt][0] * nparam; double taskLocalSum; jstartAstro = 0; for (long ix = mapForThread[nt][0]; ix < mapForThread[nt][2]; ix++) { long miValAstro = 0; if (matrixIndex[multMI * ix] != miValAstro) { miValAstro = matrixIndex[multMI * ix]; jstartAstro = miValAstro - offLocalAstro; } for (long jx = jstartAstro; jx < jstartAstro + nAstroPSolved; jx++) { taskLocalSum = systemMatrix[lset] * knownTerms[ix]; //vVect[jx] = 0.000001; if(jx == 1 && false) { printf("nAstroPSolved nt:%d ix:%ld vVect[jx]:%f\n\n",nt,ix,vVect[jx] ); } vVect[jx] += taskLocalSum; lset++; } //for jx lset += nparam - nAstroPSolved; } //for ix } // #pragma omp taskwait } } ///////////////////////////////////////////////////// ///////////////////////////////////////////////////// /// Mode 2 Attitude Sect if (nAttP) { long mix; long offj = (localAstroMax - offsetAttParam); long vVix; long lset; // printf("ntask: %d\n",ntasks); for(int nt=0; nt < ntasks; nt++ ) { // #pragma omp task shared(nparam,nInstrPSolved,matrixIndex) in (nt) commutative(vVect) // #pragma omp task commutative(vVect) #pragma omp task label(nAttP) commutative(vVect)//out(vVect[mix : mix + (nAttAxes-1) * nDegFreedomAtt]) { for (long ix = mapForThread[nt][0]; ix < mapForThread[nt][2]; ix++) { lset = nparam * ix + nAstroPSolved; mix = matrixIndex[multMI * ix + (multMI - 1)] + offj; for (int ly = 0; ly < nAttAxes; ly++) { vVix = mix + ly * nDegFreedomAtt; if(vVix == 1) { printf("nAttP nt %d %ld\n\n",nt,ix); } for (int lx = 0; lx < nAttParAxis; lx++) { localSum = systemMatrix[lset] * knownTerms[ix]; // #pragma omp atomic // vVect[vVix] = 0.000001; vVect[vVix] += localSum; lset++; vVix++; } //for lx } //for ly } //for ix } } // long lset; ///#pragma omp for } ///////////////////////////////////////////////////// /// Mode 2 Instrument Sect printf("\t\t\t\tnOfInstrConstr: %d", nOfInstrConstr); if (nOfInstrConstr ) { int offLset = nAstroPSolved + nAttP; long iix; long offj = offsetInstrParam + (localAstroMax - offsetAttParam); long vVix; for(int nt=0; nt < ntasks; nt++ ) { // #pragma omp task shared(vVect,nparam,offLset,nInstrPSolved) in (nt) // #pragma omp task commutative(vVect) #pragma omp task label(nOfInstrConstr) commutative(vVect) //out(vVect[mix : mix + (nAttAxes-1) * nDegFreedomAtt]) { long lset; for (long ix = mapForThread[nt][0]; ix < mapForThread[nt][2]; ix++) { lset = nparam * ix + offLset; iix = ix * nInstrPSolved; for (int ly = 0; ly < nInstrPSolved; ly++) { vVix = offj + instrCol[iix + ly]; if(vVix == 1) { printf("nOfInstrConstr nt %d %ld\n\n",nt,ix); } localSum = systemMatrix[lset] * knownTerms[ix]; //#pragma omp atomic //vVect[vVix] = 0.000001; vVect[vVix] += localSum; lset++; } //for ly } //for ix } // #pragma omp taskwait } } ///////////////////////////////////////////////////// ///} //pragma *ompSec += time(NULL) - startTime; if (comlsqr.itn <= 2 && (myid == 0 || myid == nproc - 1 || debugMode == 1)) printf("PE=%d AprodTiming: mode=2.1 OmpSec=%ld\n", myid, time(NULL) - startTime); ///////////////////////////////////////////////////// /// Mode 2 Global Sect if (nGlobP ) { time_t startTime_nGlobP = time(NULL); long lset; int offLset = nAstroPSolved + nAttP + nInstrPSolved; long offj = offsetGlobParam + (localAstroMax - offsetAttParam); long vVix; for (long ix = 0; ix < mapNoss[myid]; ix++) { lset = nparam * ix + offLset; for (long ly = 0; ly < nGlobP; ly++) { vVix = offj + ly; if(vVix == 1) { printf("nGlobP nt %ld\n\n",ix); } localSum = systemMatrix[lset] * knownTerms[ix]; vVect[vVix] += localSum; lset++; } //for ly } //for ix if (comlsqr.itn <= 2 && (myid == 0 || myid == nproc - 1 || debugMode == 1)) printf("\tPE=%d AprodTiming: nGlobP sec=%ld\n", myid, time(NULL) - startTime_nGlobP); } //////////////////////////////////////////////////// startTime = time(NULL); //FV_ EDIT ompSs // #pragma omp parallel private(myid, yi, localSum, tid, nthreads, i2, j2, na) shared(mapNoss, vVect, systemMatrix, knownTerms, k, j3) { myid = comlsqr.myid; //FV_ EDIT ompSs /* //#ifdef OMP tid = omp_get_thread_num(); nthreads = omp_get_num_threads(); #endif */ localSum = 0.0; printf("\t\t\t\nEqExtConstr: %d nEqBarConstr: %d nOfInstrConstr: %d", nEqExtConstr,nEqBarConstr,nOfInstrConstr); #pragma omp taskwait ////////////////////////////////////////////////////// /// Mode 2 ExtConstr if (nEqExtConstr && true) { time_t startTime_nEqExtConstr = time(NULL); long offExtStarConstrEq; long off2; long off3; for (int ix = 0; ix < nEqExtConstr; ix++) { //stars yi = knownTerms[mapNoss[myid] + ix]; offExtStarConstrEq = mapNcoeff[myid] + ix * nOfElextObs; //Star offset on the row ix (all the row) //FV_ EDIT ompSs //#pragma omp for for (long yx = 0; yx < numOfExtStar; yx++) { off3 = yx * nAstroPSolved; off2 = offExtStarConstrEq + off3; for (int j2 = 0; j2 < nAstroPSolved; j2++) { localSum = systemMatrix[off2 + j2] * yi; vVect[j2 + off3] += localSum; } } //FV_ EDIT ompSs //#pragma omp barrier } //for ix long offExtAttConstrEq; long offExtUnk; long off1 = VrIdAstroPDimMax * nAstroPSolved + startingAttColExtConstr; for (int ix = 0; ix < nEqExtConstr; ix++) { //att yi = knownTerms[mapNoss[myid] + ix]; offExtAttConstrEq = mapNcoeff[myid] + ix * nOfElextObs; //Att offset on the row ix (all the row) offExtAttConstrEq += numOfExtStar * nAstroPSolved; //Att offset inside ix row for (int nax = 0; nax < nAttAxes; nax++) { offExtUnk = off1 + nax * nDegFreedomAtt; // Att offset for Unk array on extConstr off2 = offExtAttConstrEq + nax * numOfExtAttCol; //FV_ EDIT ompSs //#pragma omp for for (int jx = 0; jx < numOfExtAttCol; jx++) { localSum = systemMatrix[off2 + jx] * yi; vVect[offExtUnk + jx] += localSum; } //FV_ EDIT ompSs //#pragma omp barrier } } if (comlsqr.itn <= 2 && (myid == 0 || myid == nproc - 1 || debugMode == 1)) printf("\tPE=%d AprodTiming: nEqExtConstr sec=%ld\n", myid, time(NULL) - startTime_nEqExtConstr); } ////////////////////////////////////////////////////// /// Mode 2 BarConstr if (nEqBarConstr && true) { time_t startTime_nEqBarConstr = time(NULL); localSum = 0.0; long off3; long off2; for (int ix = 0; ix < nEqBarConstr; ix++) { //stars yi = knownTerms[mapNoss[myid] + nEqExtConstr + ix]; long offBarStarConstrEq = mapNcoeff[myid] + nEqExtConstr * nOfElextObs + ix * nOfElBarObs; //Star offset on the row i2 (all the row) //FV_ EDIT ompSs //#pragma omp for for (long yx = 0; yx < numOfBarStar; yx++) { off3 = yx * nAstroPSolved; off2 = offBarStarConstrEq + off3; for (int j2 = 0; j2 < nAstroPSolved; j2++) { localSum = systemMatrix[off2 + j2] * yi; vVect[j2 + off3] += localSum; } } //FV_ EDIT ompSs //#pragma omp barrier } //for i2 if (comlsqr.itn <= 2 && (myid == 0 || myid == nproc - 1 || debugMode == 1)) printf("\tPE=%d AprodTiming: nEqBarConstr sec=%ld\n", myid, time(NULL) - startTime_nEqBarConstr); } ////////////////////////////////////////////////////// /// Mode 2 InstrConstr if (nOfInstrConstr && true) { time_t startTime_nOfInstrConstr = time(NULL); localSum = 0.0; int offSetInstr; long off1; long offInstrUnk = VrIdAstroPDimMax * nAstroPSolved + nAttAxes * nDegFreedomAtt; long off2 = mapNoss[myid] + nEqExtConstr + nEqBarConstr; long off3 = mapNcoeff[myid] + nOfElextObs * nEqExtConstr + nOfElBarObs * nEqBarConstr; long off4 = mapNoss[myid] * nInstrPSolved; long off5; long off6; for (int k1 = myid; k1 < nOfInstrConstr; k1 = k1 + nproc) { yi = knownTerms[off2 + k1]; offSetInstr = 0; for (int m = 0; m < k1; m++) offSetInstr += instrConstrIlung[m]; off1 = off3 + offSetInstr; off5 = off4 + offSetInstr; //FV_ EDIT ompSs //#pragma omp for for (int j = 0; j < instrConstrIlung[k1]; j++) { localSum = systemMatrix[off1 + j] * yi; off6 = offInstrUnk + instrCol[off5 + j]; vVect[off6] += localSum; } } if (comlsqr.itn <= 2 && (myid == 0 || myid == nproc - 1 || debugMode == 1)) printf("\tPE=%d AprodTiming: nOfInstrConstr sec=%ld\n", myid, time(NULL) - startTime_nOfInstrConstr); } //////////// } //pragma *ompSec += time(NULL) - startTime; if (comlsqr.itn <= 2 && (myid == 0 || myid == nproc - 1 || debugMode == 1)) printf("PE=%d AprodTiming: mode=2.2 OmpSec=%ld\n", myid, time(NULL) - startTime); } // else if(mode==2) /* else //mode==2 { //if(mode double **vVect_aux_AttP; double **vVect_aux_InstP ; int nElements = (comlsqr.nAttParam + comlsqr.nInstrParam); vVect_aux_AttP =(double **) calloc(omp_get_num_threads(),sizeof(double *)); if (!vVect_aux_AttP) exit(err_malloc("vVect_aux_AttP", myid)); vVect_aux_InstP =(double **) calloc(omp_get_num_threads(),sizeof(double *)); if (!vVect_aux_InstP) exit(err_malloc("vVect_aux_InstP", myid)); for (int i=0;i