lsqr.c


        temp   =   d2norm( alpha, beta );
        temp   =   d2norm( temp , damp );
        anorm  =   d2norm( anorm, temp );

        if (beta > ZERO) {
            cblas_dscal ( mapNoss[myid]+nEqExtConstr+nEqBarConstr+nOfInstrConstr, (ONE / beta), knownTerms, 1 );
            cblas_dscal ( nunkSplit, (- beta), vVect, 1 );
//only processor 0 accumulate and distribute the v array
#pragma omp for
        for(i=0;i<localAstroMax;i++) {
        	vAuxVect[i]=vVect[i];
        	vVect[i]=0;
		 }
	    if(myid!=0) 
	    {
#pragma omp for
	    	for(i=localAstroMax;i<nunkSplit;i++) 
	      		vVect[i]=0;	
		}
 	    
        aprod ( 2, m, n, vVect, knownTerms, systemMatrix, matrixIndex, instrCol, instrConstrIlung, comlsqr,&ompSec);
	    MPI_Barrier(MPI_COMM_WORLD);


//////////////// TEST vVect
/*
if(nproc==2)
{
	printf("LSQR TP9 PE=%d itn=%d VrIdAstroPDim=%ld vVect[0-5]=%lf %lf %lf %lf %lf VrIdAstroPDim/2*5=%ld  vVect[..+4]=%lf %lf %lf %lf %lf VrIdAstroPDim-1*5=%ld  vVect[...+5]=%lf %lf %lf %lf %lf\n",myid,itn,VrIdAstroPDim,vVect[0],vVect[1],vVect[2],vVect[3],vVect[4],(VrIdAstroPDim/2)*5,vVect[(VrIdAstroPDim/2)*5],vVect[(VrIdAstroPDim/2)*5+1],vVect[(VrIdAstroPDim/2)*5+2],vVect[(VrIdAstroPDim/2)*5+3],vVect[(VrIdAstroPDim/2)*5+4],(VrIdAstroPDim-1)*5,vVect[(VrIdAstroPDim-1)*5],vVect[(VrIdAstroPDim-1)*5+1],vVect[(VrIdAstroPDim-1)*5+2],vVect[(VrIdAstroPDim-1)*5+3],vVect[(VrIdAstroPDim-1)*5+4]);
}
if(nproc==1)
{
	printf("LSQR TP9 PE=%d itn=%d VrIdAstroPDim=%ld vVect[0-5]=%lf %lf %lf %lf %lf VrIdAstroPDim/2*5=%ld  vVect[..+4]=%lf %lf %lf %lf %lf VrIdAstroPDim-1*5=%ld  vVect[...+5]=%lf %lf %lf %lf %lf\n",myid,itn,VrIdAstroPDim,vVect[0],vVect[1],vVect[2],vVect[3],vVect[4],(VrIdAstroPDim/2)*5,vVect[(VrIdAstroPDim/2)*5],vVect[(VrIdAstroPDim/2)*5+1],vVect[(VrIdAstroPDim/2)*5+2],vVect[(VrIdAstroPDim/2)*5+3],vVect[(VrIdAstroPDim/2)*5+4],(VrIdAstroPDim-1)*5,vVect[(VrIdAstroPDim-1)*5],vVect[(VrIdAstroPDim-1)*5+1],vVect[(VrIdAstroPDim-1)*5+2],vVect[(VrIdAstroPDim-1)*5+3],vVect[(VrIdAstroPDim-1)*5+4]);
}
*/
//////////////////////////	    
	    

    double *dcopy;
    dcopy=(double *)calloc(comlsqr.nAttParam,sizeof(double));
    if(!dcopy) exit(err_malloc("dcopy",myid));
    mpi_allreduce(&vVect[localAstroMax],dcopy,(long int) comlsqr.nAttParam,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD); 
#pragma omp for
    for(i=0;i<comlsqr.nAttParam;i++) 
    {
		vVect[localAstroMax+i]=dcopy[i];
	}
	free(dcopy);
    dcopy=(double *)calloc(comlsqr.nInstrParam,sizeof(double));
    if(!dcopy) exit(err_malloc("dcopy",myid));
    mpi_allreduce(&vVect[localAstroMax+comlsqr.nAttParam],dcopy,(long int) comlsqr.nInstrParam,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD); 
#pragma omp for
    for(i=0;i<comlsqr.nInstrParam;i++) 
    {
		vVect[localAstroMax+comlsqr.nAttParam+i]=dcopy[i];
	}
	free(dcopy);
    dcopy=(double *)calloc(comlsqr.nGlobalParam,sizeof(double));
    if(!dcopy) exit(err_malloc("dcopy",myid));
    mpi_allreduce(&vVect[localAstroMax+comlsqr.nAttParam+comlsqr.nInstrParam],dcopy,(long int) comlsqr.nGlobalParam,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD); 
    for(i=0;i<comlsqr.nGlobalParam;i++) 
    {
		vVect[localAstroMax+comlsqr.nAttParam+comlsqr.nInstrParam+i]=dcopy[i];
	}
	free(dcopy);
	if(nAstroPSolved) SumCirc(vVect,comlsqr);
#pragma omp for
    for(i=0;i<localAstroMax;i++) {
        	vVect[i]+=vAuxVect[i];
		}

    double alphaLoc=0;
    if(nAstroPSolved) alphaLoc=cblas_dnrm2(nAstroElements*comlsqr.nAstroPSolved,vVect,1);
	double alphaLoc2=alphaLoc*alphaLoc;
	if(myid==0) {
		double alphaOther=cblas_dnrm2(comlsqr.nunkSplit-localAstroMax,&vVect[localAstroMax],1);
		double alphaOther2=alphaOther*alphaOther;
		alphaLoc2=alphaLoc2+alphaOther2;
	}
	double alphaGlob2=0;
	MPI_Allreduce(&alphaLoc2, &alphaGlob2,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);

	alpha=sqrt(alphaGlob2);
            if (alpha > ZERO) {
                cblas_dscal ( nunkSplit, (ONE / alpha), vVect, 1 );
            }
        }

//      ------------------------------------------------------------------
//      Use a plane rotation to eliminate the damping parameter.
//      This alters the diagonal (rhobar) of the lower-bidiagonal matrix.
//      ------------------------------------------------------------------
        rhbar1 = rhobar;
        if ( damped ) {
            rhbar1 = d2norm( rhobar, damp );
            cs1    = rhobar / rhbar1;
            sn1    = damp   / rhbar1;
            psi    = sn1 * phibar;
            phibar = cs1 * phibar;
        }

//      ------------------------------------------------------------------
//      Use a plane rotation to eliminate the subdiagonal element (beta)
//      of the lower-bidiagonal matrix, giving an upper-bidiagonal matrix.
//      ------------------------------------------------------------------
        rho    =   d2norm( rhbar1, beta );
        cs     =   rhbar1 / rho;
        sn     =   beta   / rho;
        theta  =   sn * alpha;
        rhobar = - cs * alpha;
        phi    =   cs * phibar;
        phibar =   sn * phibar;
        tau    =   sn * phi;

//      ------------------------------------------------------------------
//      Update  x, w  and (perhaps) the standard error estimates.
//      ------------------------------------------------------------------
        t1     =   phi   / rho;
        t2     = - theta / rho;
        t3     =   ONE   / rho;
        dknorm =   ZERO;

        for (i = 0; i < nAstroElements*comlsqr.nAstroPSolved; i++) {
                t      =  wVect[i];
                t      = (t3*t)*(t3*t);
                dknorm =  t     +  dknorm;
        }
 		dknormSum=0;
 		MPI_Allreduce(&dknorm,&dknormSum,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
 		dknorm=dknormSum;
 		
        if ( wantse ) {
#pragma omp for
            for (i = 0; i < localAstro; i++) {
                t      =  wVect[i];
                xSolution[i]   =  t1*t  +  xSolution[i];
                wVect[i]   =  t2*t  +  vVect[i];
                t      = (t3*t)*(t3*t);
                standardError[i]  =  t     +  standardError[i];
            }
#pragma omp for
            for (i = localAstroMax; i < localAstroMax+other; i++) {
                t      =  wVect[i];
                xSolution[i]   =  t1*t  +  xSolution[i];
                wVect[i]   =  t2*t  +  vVect[i];
                t      = (t3*t)*(t3*t);
                standardError[i]  =  t     +  standardError[i];
            }
            for (i = localAstroMax; i < localAstroMax+other; i++) {
                t      =  wVect[i];
                t      = (t3*t)*(t3*t);
                dknorm =  t     +  dknorm;
            }
        }
        else {
#pragma omp for
            for (i = 0; i < localAstro; i++) {
                t      =  wVect[i];
                xSolution[i]   =  t1*t  +  xSolution[i];
                wVect[i]   =  t2*t  +  vVect[i];
            }
#pragma omp for
            for (i = localAstroMax; i < localAstroMax+other; i++) {
                t      =  wVect[i];
                xSolution[i]   =  t1*t  +  xSolution[i];
                wVect[i]   =  t2*t  +  vVect[i];
            }
            for (i = localAstroMax; i < localAstroMax+other; i++) {
                t      =  wVect[i];
                dknorm = (t3*t)*(t3*t)  +  dknorm;
            }
        }

//      ------------------------------------------------------------------
//      Monitor the norm of d_k, the update to x.
//      dknorm = norm( d_k )
//      dnorm  = norm( D_k ),        where   D_k = (d_1, d_2, ..., d_k )
//      dxk    = norm( phi_k d_k ),  where new x = x_k + phi_k d_k.
//      ------------------------------------------------------------------
        dknorm = sqrt( dknorm );
        dnorm  = d2norm( dnorm, dknorm );
        dxk    = fabs( phi * dknorm );
        if (dxmax < dxk ) {
            dxmax   =  dxk;
            maxdx   =  itn;
        }

//      ------------------------------------------------------------------
//      Use a plane rotation on the right to eliminate the
//      super-diagonal element (theta) of the upper-bidiagonal matrix.
//      Then use the result to estimate  norm(x).
//      ------------------------------------------------------------------
        delta  =   sn2 * rho;
        gambar = - cs2 * rho;
        rhs    =   phi    - delta * z;
        zbar   =   rhs    / gambar;
        xnorm  =   d2norm( xnorm1, zbar  );
        gamma  =   d2norm( gambar, theta );
        cs2    =   gambar / gamma;
        sn2    =   theta  / gamma;
        z      =   rhs    / gamma;
        xnorm1 =   d2norm( xnorm1, z     );

//      ------------------------------------------------------------------
//      Test for convergence.
//      First, estimate the norm and condition of the matrix  Abar,
//      and the norms of  rbar  and  Abar(transpose)*rbar.
//      ------------------------------------------------------------------
        acond  =   anorm * dnorm;
        res2   =   d2norm( res2 , psi    );
        rnorm  =   d2norm( res2 , phibar );
        arnorm =   alpha * fabs( tau );

//      Now use these norms to estimate certain other quantities,
//      some of which will be small near a solution.

        alfopt =   sqrt( rnorm / (dnorm * xnorm) );
        test1  =   rnorm /  bnorm;
        test2  =   ZERO;
        if (rnorm   > ZERO) test2 = arnorm / (anorm * rnorm);
//      if (arnorm0 > ZERO) test2 = arnorm / arnorm0;  //(Michael Friedlander's modification)
        test3  =   ONE   /  acond;
        t1     =   test1 / (ONE  +  anorm * xnorm / bnorm);
        rtol   =   btol  +  atol *  anorm * xnorm / bnorm;

//      The following tests guard against extremely small values of
//      atol, btol  or  ctol.  (The user may have set any or all of
//      the parameters  atol, btol, conlim  to zero.)
//      The effect is equivalent to the normal tests using
//      atol = relpr,  btol = relpr,  conlim = 1/relpr.

        t3     =   ONE + test3;
        t2     =   ONE + test2;
        t1     =   ONE + t1;
//		printf("LSQR TP8 PE=%d itn=%d test1=%18.16lf, rnorm=%lf bnorm=%lf anorm=%lf xnorm=%lf rtol=%lf t1=%18.16lf\n",myid,itn,test1,rnorm,bnorm,anorm,xnorm,rtol,t1);

        if (itn >= itnlim) istop = 5;
        if (t3  <= ONE   ) istop = 4;
        if (t2  <= ONE   ) istop = 2;
        if (t1  <= ONE   ) istop = 1;
//        if (t1  <= ONE   ) printf("PE=%d t1=%lf\n",myid,t1); 

//      Allow for tolerances set by the user.

        if (test3 <= ctol) istop = 4;
        if (test2 <= atol) istop = 2;
        if (test1 <= rtol) istop = 1;   //(Michael Friedlander had this commented out)
//        if (test1 <= rtol) printf("PE=%d test1=%lf\n",myid,test1); 

//      ------------------------------------------------------------------
//      See if it is time to print something.
//      ------------------------------------------------------------------
//        if (nout  == NULL     ) goto goto_600; // Delete for buffer flush modification??? TBV
        if (n     <= 40       ) goto goto_400;
        if (itn   <= 10       ) goto goto_400;
        if (itn   >= itnlim-10) goto goto_400;
        if (itn % 10 == 0     ) goto goto_400;
        if (test3 <=  2.0*ctol) goto goto_400;
        if (test2 <= 10.0*atol) goto goto_400;
        if (test1 <= 10.0*rtol) goto goto_400;
        if (istop != 0        ) goto goto_400;
        goto goto_600;

//      Print a line for this iteration.
//      "extra" is for experimental purposes.

    goto_400:
       if(!comlsqr.Test)
        if(myid==0) {
			nout=fopen(lsqrOut,"a");
			if ( nout != NULL) {
				if ( extra ) {
					fprintf(nout, fmt_1500_extra,
						itn, xSolution[0], rnorm, test1, test2, anorm,
                    acond, phi, dknorm, dxk, alfopt);
				} else {
					fprintf(nout, fmt_1500,
						itn, xSolution[0], rnorm, test1, test2, anorm, acond);
				}
				if (itn % 10 == 0) fprintf(nout, fmt_1600);
			} else {
				printf("Error while opening %s (3)\n",lsqrOut);
				MPI_Abort(MPI_COMM_WORLD,1);
				exit(EXIT_FAILURE);
			}
			fclose(nout);
        }
//      ------------------------------------------------------------------
//      Stop if appropriate.
//      The convergence criteria are required to be met on  nconv
//      consecutive iterations, where  nconv  is set below.
//      Suggested value:  nconv = 1, 2  or  3.
//      ------------------------------------------------------------------
    goto_600:
        if (istop == 0) {
            nstop  = 0;
        }
        else {
            nconv  = 1;
            nstop  = nstop + 1;
            if (nstop < nconv  &&  itn < itnlim) istop = 0;
        }
        
        if(comlsqr.Test)
        {
			if(itn<comlsqr.itnLimit) istop=0;
			else (istop=5);
		}
		cycleEndMpiTime=MPI_Wtime();
//		printf("lsqr: PE=%d MPI-iteration number %d. Iteration seconds %f\n",myid,itn,cycleEndMpiTime-cycleStartMpiTime);

        if (istop != 0) break;
        
    }
//  ==================================================================
//  End of iteration loop.
//  ==================================================================
//  Finish off the standard error estimates.

    if ( wantse ) {
        t    =   ONE;
        if (m > n)     t = m - n;
        if ( damped )  t = m;
        t    =   rnorm / sqrt( t );
      
        for (i = 0; i < nunkSplit; i++)
            standardError[i]  = t * sqrt( standardError[i] );
        
    }

//  Decide if istop = 2 or 3.
//  Print the stopping condition.
 goto_800:
    if (damped  &&  istop == 2) istop = 3;
	if (myid == 0) {
		nout=fopen(lsqrOut,"a");
		if (nout != NULL) {
			fprintf(nout, fmt_2000,
					exitLsqr, istop, itn,
					exitLsqr, anorm, acond,
					exitLsqr, bnorm, xnorm,
					exitLsqr, rnorm, arnorm);
			fprintf(nout, fmt_2100,
					exitLsqr, dxmax, maxdx,
					exitLsqr, dxmax/(xnorm + 1.0e-20));
			fprintf(nout, fmt_3000,
					exitLsqr, msg[istop]);
		} else {
			printf("Error while opening %s (4)\n",lsqrOut);
			MPI_Abort(MPI_COMM_WORLD,1);
			exit(EXIT_FAILURE);
		}
		fclose(nout);
	}

//  Assign output variables from local copies.
    *istop_out  = istop;
    *itn_out    = itn;
    *anorm_out  = anorm;
    *acond_out  = acond;
    *rnorm_out  = rnorm;
    *arnorm_out = test2;
    *xnorm_out  = xnorm;

    if(nAstroPSolved) free(vAuxVect);
    return;
}