/* mpcc_r PGMRF.c -o psampler.out -lxlf90_r -lm_r -lpthread -lwsmp -lpwsmp -L ~/wsmp/aix4.3/ */

/*
This program do parallel one-block Metropolis-Hastings sampling from latent GMRF models. 
Implemented for two cases geostatistical (Lancaster data as in paper) 
or disease-mapping (not set 
to run here, you will have to change if(0) if(1) statements. 
The program uses MPI for communication and for matrix calculations WSMP, see 
http://www.research.ibm.com/math/OpResearch/wsmp.html, and what version to down load, 
linking etc. has to be adapted to your super-computer set-up. 

If you have any questions, do not hesitate to contact me; 
ingelins@math.ntnu.no. !

Also see comments for each function.
*/
 

#include <assert.h>
#include <stdio.h> 
#include <stdlib.h> 
#include <math.h> 
#include <float.h> 
#include <mpi.h> 
#include <malloc.h> 
#include <time.h>  
#include "rlib.h"
#include <mpi.h>

 
#define FREE(ptr) if(ptr)free((void *)(ptr));ptr=NULL 
#define Calloc(n,type) (type *)calloc((size_t)(n),sizeof(type)) 

#define MAX(a,b)  ((a) > (b) ? (a) : (b))
#define MIN(a,b)  ((a) < (b) ? (a) : (b))
#define ABS(x)    ((x) >= 0.0 ? (x) : (-(x)))

#define COEF_FNMIN "param2.dat"
 
/*
#define GRAPH_FNMIN  "scotland.graph"    
#define DATA_FNMIN "scotland.data" 
*/
/*
#define GRAPH_FNMIN "sardinia.graph"
#define DATA_FNMIN  "sardinia.data"
*/
/*
#define GRAPH_FNMIN "london-10.graph" 
#define DATA_FNMIN  "london.data" 
*/

#define DATA_FNMIN "LA.data"
#define GRAPH_FNMIN "LA.graph"

#define TEST_FNOUT1  "test1.txt"  
#define TEST_FNOUT2  "test2.txt"   

/* 
typedef double Real;
#define MPI_PREC MPI_DOUBLE
*/
#define PRIOR_A 0.25
#define PRIOR_B 0.0005
#define DIAG_TERM 0.0000005
  
#define PI 3.14159 



double gettime()    
{

  

  return((double)clock())/CLOCKS_PER_SEC;    
}    
 
double eu_gamma(double x, double a, double b)
{
/*returns prop. log density of gamma distr. for x */
  return (a-1.0)*log(x)-x*b;
}


double pxQx(int mydim, int ntot, int *ia, int *ja, double *avals, 
            double *diag, double *myx, int myid, MPI_Comm comm, int size) 
{

  /*Calclulate vector-matrix-vector product x^TQx im parallel. 
    Q given by ia, ja and avals*/
  int i, j, ldim, d1, l, idx;
  double *xfull, *lx, prod, myprod;

  xfull = Calloc(ntot, double);
  idx=0;
  myprod = 0;
 
 for(l=0;l<size;l++)
    {
      ldim = ntot/size;     
      d1 = ntot - size*ldim;     
      if(l<d1)     
        ldim++;
      
      if(l==myid)
        {
          lx=myx;
        }
      else
        lx = Calloc(ldim, double);

            
      MPI_Bcast(lx,ldim*sizeof(double),MPI_BYTE,l,comm);
      
      for(i=0;i<ldim;i++)
        {
          xfull[idx]=lx[i];
          idx++;
        }

      if(myid != l)
        FREE(lx);
    }


  if(0 &&  myid==0)
    {
      for(i=0;i<ntot;i++)
        printf("xfull(%d)=%.3f \n",i,xfull[i]);
    }

  for(i=0;i<mydim;i++)
    {
      for(j=0;j<ia[i+1]-ia[i];j++)
        {
          myprod = myprod + myx[i]*2*xfull[ja[ia[i]+j]]*avals[ia[i]+j];
          if(0) printf("myprod %.2f avals[%d] %.3f \n", ia[i]+j, avals[ia[i]+j], myprod);
        }
      myprod = myprod + myx[i]*myx[i]*diag[i];
      if(0) printf("d: i %d j %d myprod %.2f \n", i,j,myprod);
    }

  MPI_Allreduce(&myprod, &prod, 1, MPI_DOUBLE, MPI_SUM, comm);

  if(0) 
    printf("myprod=%.3f, prod=%.3f 0.5*prod %.3f \n", myprod, prod, 0.5*prod); 

  FREE(xfull);

 
  return prod;
}

double ldetQ(int mydim, int ntot, int *ia, int *ja, double *avals, 
            double *diag, int *perm, int *iparm, double *dparm, 
            int *invp, int myid, MPI_Comm comm, int size) 
{
  /*Calculates  log(det(Q)) in parallel. Q given by ia, ja and avals */ 

  int  ldb, nrhs, naux, *aux, mrp, i;
  double my_ldetQ, ldetQ, *b, *ldiag, waltime;

  ldb=mydim;  
  nrhs=1;  
  naux=0;  
  mrp=0; 
  aux=NULL;
  
  my_ldetQ=0;
  ldetQ=0;

  ldiag = Calloc(mydim, double);
  b = Calloc(mydim, double);   

  for(i=0;i<mydim;i++)
    ldiag[i]=diag[i];

  iparm[1]=3;      
  iparm[2]=3;      
  iparm[3]=1;    
  iparm[31]=1;
  waltime = gettime();     
  
  pwssmp(&mydim,ia,ja,avals,ldiag,perm, invp,b,&ldb,&nrhs,&aux,
         &naux,&mrp,iparm,dparm); 
    
  
  
  if(iparm[63] != 0)       
    {       
      printf("The following ERROR was detected: %d \n", iparm[63]);       
      MPI_Abort(comm,0);   
      return 0;       
    }  
      
 
  if(0 && myid==0) 
    printf("Cholesky completed in  %.6f sec. \n", gettime()-waltime);       

  for(i=0;i<mydim;i++)
    my_ldetQ = my_ldetQ + log(ldiag[i]*ldiag[i]);

  MPI_Allreduce(&my_ldetQ, &ldetQ, 1, MPI_DOUBLE, MPI_SUM, comm);

  if(0) printf("my_ldetQ = %.3f ldetQ = %.3f \n",my_ldetQ, ldetQ);
  
  return ldetQ;
    

}

int read_coef(double **pcoef, int *n_range, int nh, int myid, MPI_Comm comm)
{
  /* Read file with coefficient for GRF-approx. File with coeff. available 
     from rue@math.ntnu.no, see www.math.ntnu.no/~hrue. */
     
  const int itag=1; 
  int i, j, r, range=0, error=0;
  double dummy, *coef;
  FILE *fpin;

  MPI_Status mpistat; 

  if(myid==0)
    {
      if((fpin = fopen(COEF_FNMIN,"r")) == NULL)  
        {    
          printf("file_read: Couldn't open input file %s.\n", COEF_FNMIN);    
          MPI_Abort(comm,0);  
        }  

      if(nh!=2)
        {
          printf("Read coeffisients for nh=2 only \n");
          MPI_Abort(comm,0);  
        } 
    
     
      while(error==0)
        {
          for(i=0;(i<nh+1 && !error);i++)
            for(j=0;j<nh+1 && !error;j++)
              if (fscanf(fpin, "%lf", &dummy) != 1)
                {
                  error = 1;
                }
              else
                if(0) printf("range %d coof(%d, %d)=%.8f \n"
                             ,range, i, j, dummy);
          range++;
        }
   
   
      fclose(fpin);
    
      range = range -1;

      coef = Calloc(range*(nh+1)*(nh+1), double);      
      *n_range = range;

      printf("range = %d \n",range);

      MPI_Bcast(n_range,1,MPI_INT,0,comm);

      if((fpin = fopen(COEF_FNMIN,"r")) == NULL)  
        {    
          printf("file_read: Couldn't open input file %s.\n", COEF_FNMIN);    
          MPI_Abort(comm,0);  
        }  

      for(r=0;r<range;r++)
        {
          for(i=0;i<nh+1;i++)
            for(j=0;j<nh+1;j++)
              {
                if(fscanf(fpin, "%lf", &coef[r*(nh+1)*(nh+1)+i*(nh+1)+j]) != 1)
                  {
                    printf("Error reading fileprintf %s.\n", COEF_FNMIN);    
                    MPI_Abort(comm,0);                   
                  }
                else
                  {
                    if(0) printf("range %d coef(%d, %d)=%.5f \n"
                                 ,r, i, j, coef[r*(nh+1)*(nh+1)+i*(nh+1)+j]);
                  }
              }
        }
      MPI_Bcast(coef,(range)*(nh+1)*(nh+1)*sizeof(double),MPI_BYTE,0,comm);
         
    }
    else
    {
      MPI_Bcast(n_range,1,MPI_INT,0,comm);
      range = *n_range;
      coef = Calloc(range*(nh+1)*(nh+1), double);
      
        
      MPI_Bcast(coef,(range)*(nh+1)*(nh+1)*sizeof(double),MPI_BYTE,0,comm);
      
      if(0)
        for(r=0;r<range;r++)
          for(i=0;i<nh+1;i++)
            for(j=0;j<nh+1;j++)
              {
                if(1) printf("range %d coof(%d, %d)=%f \n"
                             ,range, i, j, coef[r*(nh+1)*(nh+1)+i*(nh+1)+j]);
              }
    
    }
  *pcoef = coef;
  return 1;
}

int PreadData(int mydim, int ntot, double **pdata, int **pidata, double **pE,
              int myid, MPI_Comm comm, int size) 
{ 
  /* Processor 0 reads disease-mapping data  
     and distribute them to their dedicated processor.
     (not used in Lancaser example)
  */ 

  const int itag=1; 
  int i, j, l, ldim, *tidata, d1;
  double dummy, *tdata, *tE;
  FILE *fpin; 

  MPI_Status mpistat; 

 
  if(myid==0)  
    {  

      if( (fpin = fopen(DATA_FNMIN,"r")) == NULL)  
        {    
          printf("file_read: Couldn't open input file %s.\n", GRAPH_FNMIN);    
          MPI_Abort(comm,0);  
        }  

      for(l=0;l<size;l++)
        {
          ldim = ntot/size;    
          d1 = ntot - size*ldim;    
          if(l<d1)    
            ldim++; 
         
          tdata = Calloc(ldim, double);
          tE =  Calloc(ldim, double); 
          tidata = Calloc(ldim, int);

          for(i=0;i<ldim;i++)
            {
             
             

             if(fscanf(fpin, "%lf %lf %lf \n", &tdata[i], &tE[i], &dummy) != 3)
               MPI_Abort(comm,0);
             if(0)  printf("Is here 3.33. %.2f %.2f %.2f \n",  tdata[i], tE[i], dummy); 
             
             tidata[i]=i;
            
            }
          
          if(l==myid)
            {
              *pdata = tdata;
              *pidata = tidata;
              *pE = tE;
            }
          else
            {
              MPI_Send(tdata,ldim*sizeof(double),MPI_BYTE,l,itag,comm);  
              MPI_Send(tE,ldim*sizeof(double),MPI_BYTE,l,itag,comm);  
              MPI_Send(tidata,ldim*sizeof(int),MPI_BYTE,l,itag,comm);  
              FREE(tdata);
              FREE(tE);
              FREE(tidata);
            }
        }
    }
  else
    {
      *pdata = Calloc(mydim, double);
      *pE = Calloc(mydim, double);
      *pidata = Calloc(mydim, int);
      MPI_Recv(*pdata,mydim*sizeof(double),MPI_BYTE,0,itag,comm,&mpistat);       
      MPI_Recv(*pE,mydim*sizeof(double),MPI_BYTE,0,itag,comm,&mpistat);
      MPI_Recv(*pidata,mydim*sizeof(int),MPI_BYTE,0,itag,comm,&mpistat);
    }


  return 1;
}
 


int PreadGraph(int *pmydim, int *n, int **pia, int **pja, double **pavals, double **pdiag, int myid, MPI_Comm comm, int size)
{
  /* Processor 0 reads disease-mapping graph  
     and distribute them to their dedicated processor.
     (not used in Lancaser example)
*/
  const int itag=1;
  int dim, i, nneigh, lineno, *neigh, t_ia, j, *ia, *ja, *tia, *tja, 
    *Full_ia, mydim, ldim, li, p, d1, ldimja, mydimja;  
  FILE *fpin;  
  double *avals, *diag, *tavals, *tdiag, *Full_diag;  

  MPI_Status mpistat;

  if(myid==0) 
    { 
       
      if( (fpin = fopen(GRAPH_FNMIN,"r")) == NULL) 
        {   
          printf("file_read: Couldn't open input file %s.\n", GRAPH_FNMIN);   
          MPI_Abort(comm,0); 
        } 
 
      fscanf( fpin, "%d", &dim); 
       
    } 
 
  MPI_Barrier(comm); 
  MPI_Bcast(&dim,1,MPI_INT,0,comm); 
  *n = dim;
  mydim = dim/size;  
  d1 = dim - size*mydim;  
  if(myid<d1)  
    mydim++;  

  *pmydim = mydim;
  
  printf("I am processor no %d and have mydim %d \n", myid, mydim);    
  
  if(myid==0)
    {
      Full_ia = Calloc(dim+1, int);  
      Full_diag = Calloc(dim, double);  
  
      t_ia=0;  
  
      for(i = 0; i < dim; i++)   
        {        
          Full_ia[i]=t_ia;  
          fscanf(fpin, "%d", &lineno);   
          fscanf(fpin, "%d", &nneigh);   
          Full_diag[i] = nneigh;  
          if (nneigh)   
            {   
              neigh  = Calloc(nneigh, int);    
              for(j=0;j<nneigh;j++)   
                {   
                  fscanf(fpin, "%d", &neigh[j]);  
                  if(neigh[j]>i)  
                    t_ia++;  
                   
                }   
              FREE(neigh);   
            }   
        }   
      fclose(fpin);   
    
      Full_ia[dim] = t_ia;  
  
      if( (fpin = fopen(GRAPH_FNMIN,"r")) == NULL)    
        {      
          printf("file_read: Couldn't open input file %s.\n", GRAPH_FNMIN);      
          MPI_Abort(comm,0); 
          return 0;    
        }    
    
      fscanf( fpin, "%d", &dim);    
  
      ja = Calloc(t_ia, int);  
      avals = Calloc(t_ia, double);  
      t_ia=0;  
      p=0;
      li=0;
     
      for(i = 0; i < dim; i++)    
        {
          if(li==0)
            {
              ldim = dim/size;   
              d1 = dim - size*ldim;   
              if(p<d1)   
                ldim++;   
              
              tia = Calloc(ldim+1, int);
              tdiag = Calloc(ldim, double);
              ldimja =  Full_ia[i+ldim]-Full_ia[i];
              tja = Calloc(ldimja, int);
              tavals = Calloc(ldimja, double);
              
              for(j=0;j<=ldim;j++)
                tia[j]=Full_ia[i+j]-Full_ia[i];

              for(j=0;j<ldim;j++)
                tdiag[j] = Full_diag[i+j];
            }
             
          fscanf(fpin, "%d", &lineno);    
          fscanf(fpin, "%d", &nneigh);    
          if (nneigh)    
            {    
              neigh  = Calloc(nneigh, int);     
              for(j=0;j<nneigh;j++)    
                {    
                  fscanf(fpin, "%d", &neigh[j]);   
                  if(neigh[j]>i)   
                    {  
                      
                      tja[t_ia]=neigh[j];  
                      tavals[t_ia]=-1;  
                      t_ia++;  
                    }  
                }    
              FREE(neigh);    
            }
          li++;

          if(li==ldim)
            {
              if(p==myid)
                {
                  ia = tia;
                  ja = tja;
                  avals = tavals;
                  diag = tdiag;
                }
              else
                {
                  MPI_Send(tia,(ldim+1)*sizeof(int),MPI_BYTE,p,itag,comm);
                  MPI_Send(tja,ldimja*sizeof(int),MPI_BYTE,p,itag,comm);
                  MPI_Send(tavals,ldimja*sizeof(double),MPI_BYTE,p,itag,comm);
                  MPI_Send(tdiag,ldim*sizeof(double),MPI_BYTE,p,itag,comm); 
                  FREE(tia);
                  FREE(tja);
                  FREE(tavals);
                  FREE(tdiag);
                }
              t_ia=0;
              li=0;
              p++;
            }
        }    
      fclose(fpin);
    }
  else /*if mydim!=0 */
    {
      ia = Calloc(mydim+1, int);
      MPI_Recv(ia,(mydim+1)*sizeof(int),MPI_BYTE,0,itag,comm,&mpistat); 
      mydimja=ia[mydim];
      ja = Calloc(mydimja, int);
      avals = Calloc(mydimja, double);
      diag = Calloc(mydim, double);
      MPI_Recv(ja,mydimja*sizeof(int),MPI_BYTE,0,itag,comm,&mpistat); 
      MPI_Recv(avals,mydimja*sizeof(double),MPI_BYTE,0,itag,comm,&mpistat); 
      MPI_Recv(diag,mydim*sizeof(double),MPI_BYTE,0,itag,comm,&mpistat); 
    }

  if(0)  
    {  
      for(i=0;i<mydim+1;i++)  
        printf("ia(%d)=%d \n", i, ia[i]);  
    }  
  if(0)  
    {  
      for(i=0;i<ia[mydim];i++)   
        printf("ja(%d)=%d \n", i, ja[i]);  
  
      for(i=0;i<mydim;i++)   
        printf("diag(%d)=%.2f \n", i, diag[i]);  
  
      for(i=0;i<ia[mydim];i++)   
        printf("avals(%d)=%.2f \n", i, avals[i]);   
   
   
    }  

  *pja = ja; 
  *pia = ia; 
  *pavals = avals; 
  *pdiag = diag; 

  return 1;  
}  
 

int Pgetpx(int gx, int gy, int nx, int ny,int size, int *p, int *pi)
{
  /* For a lattice divided between processors. For element with global 
     pixel (gx,gy), returns processor p and local index pi. */  
  int tp, ti, gi, d1, dim, r;

  dim = nx*ny;
  gi = gy*nx+gx;

  d1 = dim/size;
  r = dim - d1*size;

  if(0) printf("d1 %d r %d size %d dim %d \n", d1, r, size, dim);
 
  if(gi < r*(d1+1))
    {
      tp = gi/(d1+1);
      ti = gi - tp*(d1+1);
    }
  else
    {
      tp = r + (gi-r*(d1+1))/d1;
      ti = gi - r*(d1+1) - (tp-r)*d1;
    }

  if(0)
    printf("gx %d gy %d gi %d p %d pi %d \n", gx, gy, gi, tp, ti);

  *p = tp;
  *pi = ti;

  return 0;
}




int PmakeLatticeNH(int *pmydim, int *n, int **pia, int **pja, int **pjai,
                   double **pavals, 
                   double **pdiag,int myid, MPI_Comm comm, int size, 
                   int n1, int n2, int nh)    
{
  /*
    Makes a lattice of dim nx x ny with a nh-order neighbourhood 
    (elements have at most nh^2-1 neighbours), and divide it between 
    procesors.
  */ 
  int dim, i, nneigh, gnneigh, j, *ia, *ja, *jai, *tia, *tja,  mydim; 
  int ldim, ti, p, d1, mydimja, k,l;   
  int over, under, left, right;
  FILE *fpin;   
  double *avals, *diag;   
  

  dim = n1*n2;
  *n = dim;

  d1 = dim/size;


  printf("n1 %d n2 %d nh %d\n",n1,n2,nh);

 
  mydim = dim/size;   
  d1 = dim - size*mydim;   
  if(myid<d1)   
    mydim++;   
 
  *pmydim = mydim; 
   
  if(1)
    printf("I an processor no. %d and have mydim %d \n", myid, mydim);     
 
  ia = Calloc(mydim+1, int);
  diag = Calloc(mydim, double);
  
 
  ia[0]=0;

  i = myid*(dim/size);
  if(d1<=myid)
    i=i+d1;
  else
    i=i+myid;

       
  for(j=0;j<mydim;j++)
    {
      
      over = i/n1;
      if(over>nh)
        over = nh;

      left = i-(i/n1)*n1;
      if(left>nh)
        left=nh;

      right = n1 - (i-(i/n1)*n1) -1;
      if(right>nh)
        right=nh;

      under = n2 - i/n1 -1; 
      if(under>nh)
        under = nh;

      gnneigh = under*(left+right+1) + right;
     
      if(0) printf("over %d, under %d, left %d, right %d gnneigh %d \n",
             over, under, left, right, gnneigh);

      ia[j+1]=ia[j]+gnneigh;
      diag[j]=(over+under+1)*(left+right+1)-1;

      i++;
    }


  mydimja = ia[mydim];
 
  if(1)
    printf("mydimja %d \n", mydimja);
  
  ja = Calloc(mydimja, int);
  jai = Calloc(mydimja, int);
  avals = Calloc(mydimja, double);
 
      
  i = myid*(dim/size); 
  if(d1<=myid) 
    i=i+d1; 
  else 
    i=i+myid; 
  
  ti=0;

  for(j=0;j<mydim;j++) 
    {

      left = i-(i/n1)*n1;
      if(left>nh)
        left=nh;

      right = n1 - (i-(i/n1)*n1) -1;
      if(right>nh)
        right=nh;

      under = n2 - i/n1 -1; 
      if(under>nh)
        under = nh;
      
      if(0) printf("under %d, left %d, right %d \n",
             under, left, right);


      for(k=1;k<right+1;k++)
        {
          ja[ti]=i+k;
          jai[ti]=k;
          ti++;
        }

      for(l=1;l<under+1;l++)
        for(k=-left;k<right+1;k++)
          {
            ja[ti]=i+n1*l+k;
            jai[ti]=l*(nh+1)+abs(k);
            ti++;
          }
     
      i++; 
    } 
 if(0)   
    {   
      for(i=0;i<mydim+1&&i<100;i++)   
        printf("ia(%d)=%d \n", i, ia[i]);   
    }  


  if(0)   
    {   
      for(i=0;i<ia[mydim]&&i<100;i++)    
        printf("ja(%d)=%d jai(%d)=%d \n", i, ja[i],i,jai[i]);   
    }

 
     
  if(0)
    {
      for(i=0;i<mydim;i++)    
        printf("diag(%d)=%.2f \n", i, diag[i]);   
   
      for(i=0;i<ia[mydim];i++)    
        printf("avals(%d)=%.2f \n", i, avals[i]);        
    }   
  

  *pia = ia;  
  *pjai = jai;  
  *pavals = avals;  
  *pdiag = diag;  
  *pja = ja;
  return 1;   
}



int PmakeLattice1(int *pmydim, int *n, int **pia, int **pja, double **pavals, 
                 double **pdiag,int myid, MPI_Comm comm, int size, 
                 int n1, int n2)    
{
  int dim, i, nneigh, gnneigh, j, *ia, *ja, *tia, *tja,  mydim, ldim, ti, p, d1, mydimja;   
  FILE *fpin;   
  double *avals, *diag;   
  /*
    Makes a lattice of dim nx x ny with a first order neighbourhood, 
    and initilise avals to an intrinsic GMRF. Not usen in Lancaster 
    example.*/

  dim = n1*n2;
  *n = dim;

  d1 = dim/size;


  printf("n1 %d n2 %d \n",n1,n2);

  /*  
  if(n1>n2)
    {
      dl=n1;
      n1=n2;
      n2=dl;
      if(myid==0)
        printf("Swaped the directions \n");
    }
  */

  mydim = dim/size;   
  d1 = dim - size*mydim;   
  if(myid<d1)   
    mydim++;   
 
  *pmydim = mydim; 
   
  if(1)
    printf("I am processor no %d and have mydim %d \n", myid, mydim);     
 
  ia = Calloc(mydim+1, int);
  diag = Calloc(mydim, double);
  
 
  ia[0]=0;

  i = myid*(dim/size);
  if(d1<=myid)
    i=i+d1;
  else
    i=i+myid;

       
  for(j=0;j<mydim;j++)
    {
      gnneigh=2; 
      nneigh=4; 
      
      if(i<n1)
        nneigh--;
      if(i%n1==0)
        nneigh--;
      if(i>=n1*(n2-1))
        {
          nneigh--;
          gnneigh--;
        }
      if(i%n1==n1-1)
        { 
          nneigh--; 
          gnneigh--; 
        } 

      diag[j]=nneigh;
      ia[j+1]=ia[j]+gnneigh;

      i++;
    }


  mydimja = ia[mydim];
 
  if(1)
    printf("mydimja %d \n", mydimja);
  
  ja = Calloc(mydimja, int);
  avals = Calloc(mydimja, double);
 
      
  i = myid*(dim/size); 
  if(d1<=myid) 
    i=i+d1; 
  else 
    i=i+myid; 
  
  ti=0;

  for(j=0;j<mydim;j++) 
    {

      if(i%n1<n1-1)  
        {   
          ja[ti]=i+1;
          avals[ti]=-1;
          ti++;
        }   
      if(i<n1*(n2-1)) 
        {
          ja[ti]=i+n1;
          avals[ti]=-1;
          ti++;
        }
         
      i++; 
    } 
 if(0)   
    {   
      for(i=0;i<mydim+1;i++)   
        printf("ia(%d)=%d \n", i, ia[i]);   
    }  

  if(0)   
    {   
      for(i=0;i<mydim+1;i++)   
        printf("ia(%d)=%d \n", i, ia[i]);   
    }   
  if(0)   
    {   
      for(i=0;i<ia[mydim];i++)    
        printf("ja(%d)=%d \n", i, ja[i]);   
         for(i=0;i<mydim;i++)    
        printf("diag(%d)=%.2f \n", i, diag[i]);   
   
      for(i=0;i<ia[mydim];i++)    
        printf("avals(%d)=%.2f \n", i, avals[i]);        
    }   
  

  *pia = ia;  
  *pavals = avals;  
  *pdiag = diag;  
  *pja = ja;
  return 1;   
}

int InitLA(int *pmydim, int *pmyndata, int *pntot, int **pidata, 
           double **pm, double **py,  int **pia, int **pja, int **pjai,
           double **pavals, double **pnndiag, int myid, 
           MPI_Comm comm, int size, int nh)
{ 
  /* Makes lattice and reads data for the Lancaser dataset. */
  const int itag=1; 
  int i, j, l, ldim, *tidata, d1, ntot, ndata=248;
  int *tposx, *tposy, *tm, *ty, xmin, xmax, ymin, ymax, xex, yex;
  int *idata,*ia, *ja, *jai;
  int nx, ny, p, pi, myndata;
  double dummy, *tdata, *tE;
  double *y, *m, *avals, *nndiag;
  FILE *fpin; 

  MPI_Status mpistat; 
  
   
  nx=150;
  ny=210;

  xex=20;
  yex=5;



  tposx = Calloc(ndata, int);
  tposy = Calloc(ndata, int);
  tm = Calloc(ndata, int);
  ty = Calloc(ndata, int);
 
  if(myid==0)  
    {  

      if( (fpin = fopen(DATA_FNMIN,"r")) == NULL)  
        {    
          printf("file_read: Couldn't open input file %s.\n", DATA_FNMIN);    
          MPI_Abort(comm,0);  
        }  

      for(i=0;i<ndata;i++)
        {
          if(fscanf(fpin, "%d %d %d %d \n", 
                    &tposx[i], &tposy[i], &tm[i], &ty[i]) != 4)
            MPI_Abort(comm,0);

          if(0)  printf("Is here ReadLA 1: %d %d %d %d \n", 
                        tposx[i], tposy[i], tm[i], ty[i]); 
             

          tposx[i] =  tposx[i] + xex;
          tposy[i] =  tposy[i] + yex;

          if(i==0)
            {
              xmin = xmax = tposx[i];
              ymin = ymax = tposy[i];
            }

          if(tposx[i]<xmin)
            xmin = tposx[i];

          if(tposx[i]>xmax)
            xmax = tposx[i];

            if(tposy[i]<ymin)
            ymin = tposy[i];

          if(tposy[i]>ymax)
            ymax = tposy[i];
          
        
        }
      if(0)
        printf("xmin=%d, xmax=%d, ymin=%d ymax=%d \n",xmin, xmax, ymin, ymax); 
    }

  MPI_Barrier(comm); 
  MPI_Bcast(tposx,ndata,MPI_INT,0,comm);
  MPI_Bcast(tposy,ndata,MPI_INT,0,comm);
  MPI_Bcast(tm,ndata,MPI_INT,0,comm);
  MPI_Bcast(ty,ndata,MPI_INT,0,comm);

  if(0) PmakeLattice1(pmydim, pntot, &ia, &ja, &avals, &nndiag, 
                      myid, comm, size, nx, ny);  

  if(1) PmakeLatticeNH(pmydim, pntot, &ia, &ja, &jai, &avals, &nndiag,
                      myid, comm, size, nx, ny, nh);  
  
  if(0)   
    {   
      for(i=0;i<*pmydim+1;i++)   
        printf("ia(%d)=%d \n", i, ia[i]);   
    }  

  if(1){
  myndata=0;

  for(i=0;i<ndata;i++)
    {
      Pgetpx(tposx[i], tposy[i], nx, ny, size, &p, &pi);
        
      if(p==myid)
        {
          myndata++;
          if(0)
            printf("tposx[%d] %d, tposy[%d] %d \n", i, tposx[i], i,tposy[i]);
        }
    }

  printf("myndata %d \n", myndata);

  idata = Calloc(myndata, int);
  m = Calloc(myndata, double);
  y = Calloc(myndata, double);

  j=0;

  for(i=0;i<ndata;i++)
    {
      Pgetpx(tposx[i], tposy[i], nx, ny, size, &p, &pi);
      
      if(p==myid)
        {
          idata[j] = pi;
          m[j] = tm[i];
          y[j] = ty[i];

          if(0)
            printf("tposx[%d] %d, tposy[%d] %d idata[%d] %d m(%d) %.2f y(%d) %.2f  \n",
                   i, tposx[i], i,tposy[i],j,idata[j], j, m[j], j, y[j]);
          
          if(0)
            {
              m[j] = 0;
              y[j] = 0;
            }
          j++;
        }
    }

  FREE(tposx);
  FREE(tposy);
  FREE(tm);
  FREE(ty);
  }  
  if(0)   
    {   
      for(i=0;i<(*pmydim)+1;i++)   
        printf("At the end of LAI: ja(%d)=%d \n", i, ja[i]);   
    }  


  *pidata = idata;
  *pm = m;
  *py = y;
  *pia = ia;
  *pja = ja;
  *pjai = jai;
  *pavals = avals;
  *pnndiag = nndiag;
  *pmyndata = myndata;


  return 0;    
}



int SampleGMRF(double *pmy_loglike, int mydim, int totn, int *ia, int *ja, double *avals, double *diag, int *perm, int *invp, int *iparm,  
              double *dparm, double *sample, int myid, MPI_Comm comm, int size)
{
  /* Sample in parallel from the GMRF with prec.matrix Q given by 
     ia, ja and avals */
  int i, ldb, nrhs, naux, mrp, *aux;
  double *ldiag, my_z2, lmy_detQ, waltime, my_loglike;

  my_z2=0;
  lmy_detQ=0;

  ldb=mydim;  
  nrhs=1;  
  naux=0;  
  mrp=0; 
  aux=NULL;

  if(0) printf("SampleGMRF myid %d, mydim %d \n", myid, mydim); 

  if(0)     
    {     
      for(i=0;i<mydim+1;i++)     
        printf("ia(%d)=%d \n", i, ia[i]);     
    }     
  if(0)     
    {     
      for(i=0;i<ia[mydim];i++)      
        printf("ja(%d)=%d \n", i, ja[i]);     
    }
  if(0){     
      for(i=0;i<mydim;i++)      
        printf("diag(%d)=%.2f \n", i, diag[i]);     

      for(i=0;i<ia[mydim];i++)      
        printf("avals(%d)=%.2f \n", i, avals[i]);          
  } 

  ldiag = Calloc(mydim, double);

  for(i=0;i<mydim;i++)
    ldiag[i]=diag[i];

  if(0) printf("SampleGMRF myid %d, mydim %d \n", myid, mydim); 
  iparm[1]=3;      
  iparm[2]=3;      
  iparm[3]=1;    
  iparm[31]=1;
  waltime = gettime();     

  pwssmp(&mydim,ia,ja,avals,ldiag,perm, invp,sample,&ldb,&nrhs,&aux,&naux,&mrp,iparm,dparm); 
  // find Cholesky factor of Q
 if(iparm[63] != 0)       
    {       
      printf("The following ERROR was detected: %d \n", iparm[63]);       
      MPI_Abort(comm,0);   
      return 0;       
    }  
      

  if(0 && myid==0) 
    printf("Cholesky completed in  %.6f sec. \n", gettime()-waltime);       
  // Find log(det(Q))) for the processors part.
  for(i=0;i<mydim;i++)
    lmy_detQ = lmy_detQ + log(ldiag[i]*ldiag[i]);


  if(0) printf("lmy_detQ %.3f \n", lmy_detQ);  
    
  for(i=0;i<mydim;i++)     
    {     
      sample[i]=M_SQRT2*cos(2.0*PI*drand48())*sqrt(-log(1.0-drand48()));
      my_z2 = my_z2 + sample[i]*sample[i];
  
    }

    
  iparm[1]=4;      
  iparm[2]=4;      
  iparm[3]=1;     
  iparm[29]=2;    

  waltime = gettime();     

  //Solve triangular system.    
  pwssmp(&mydim,ia,ja,avals,ldiag,perm, invp,sample,&ldb,&nrhs,&aux,&naux,&mrp,iparm,dparm);       
    
  if(0 && myid==0) 
    printf("Back substitution completed in  %.6f sec. \n", gettime()-waltime);       
     
  iparm[1]=5;      
  iparm[2]=5;      
  iparm[3]=1;     
  
  if(0)
    {
      waltime = gettime();     
      pwssmp(&mydim,ia,ja,avals,ldiag,perm, invp,sample,&ldb,&nrhs,&aux,&naux,&mrp,iparm,dparm);      
      
      if(myid==0 && 0) 
        printf("Iterative refinement completed in  %.6f sec. \n", gettime()-waltime);       
      
      if(iparm[63] != 0)       
        {       
          printf("The following ERROR was detected: %d \n", iparm[63]);       
          MPI_Abort(comm,0);   
          return 0;       
        }       
    }

  if(1) my_loglike = -0.5*my_z2 - 0.5*lmy_detQ;
  if(0) my_loglike = -0.5*my_z2;
   
 
  if(0) 
    { 
      printf("The solution of the systen is: \n");     
 
      for(i=0;i<mydim;i++)     
        printf("sample(%d) = %.3f \n", i, sample[i]);     
    } 
        
  if(0)
    {
      printf("I sampleGMRF: my_loglike = %.5f \n", my_loglike);
    }

  FREE(ldiag);
  *pmy_loglike = my_loglike; //probabiliy density for sample (up to an norm
  //const. not dep. of parameters in Q.
  return 1;
} 






  

int InitTask (int *pmydim, int *totn, int **pia, int **pja, int **pjai, 
              double **pavals, 
              double **pdiag, double **pnndiag, int **pperm,
              int **pinvp, int **piparm,  double **pdparm, int myid, 
              MPI_Comm comm, int size, int n1, int n2, int nh) 
{ 
  /*
    Makes/reads graphs and do steps that only need to be done once.
  */ 
  int *perm, *invp, ldb, nrhs, naux, *aux, mydim, mrp, i, *ja, *jai, 
    *ia, iparm[64];
  double *b, waltime, *avals, *diag, dparm[64], *nndiag;

  b=NULL;


  if(n1==0 || n2==0)
    PreadGraph(&mydim, totn, &ia, &ja, &avals, &nndiag, myid, comm, size);
  else
    {
      if(n1>1 && n2>1)
        {       
          if(0)
            PmakeLattice1(&mydim, totn, &ia, &ja, &avals, &nndiag, 
                          myid, comm, size, n1, n2);
          
          if(1)
            PmakeLatticeNH(&mydim, totn, &ia, &ja, &jai, &avals, &nndiag, 
                          myid, comm, size, n1, n2,nh);
        }
      else
        {
          mydim = *pmydim;
          nndiag = *pnndiag;
          ia = *pia;
          ja = *pja;
          jai = *pjai;
          avals = *pavals;
        }
    }

  diag = Calloc(mydim, double);
 
  printf("myid %d, mydim %d \n", myid, mydim);
  
  if(0)    
    {    
      for(i=0;i<mydim+1;i++)    
        printf("ia(%d)=%d \n", i, ia[i]);    
    }    
  if(0)    
    {    
      for(i=0;i<ia[mydim];i++)     
        printf(" ja(%d)=%d \n", i, ja[i]);    
    
      for(i=0;i<mydim;i++)     
        if(0) printf("diag(%d)=%.2f \n", i, diag[i]);    
    
      for(i=0;i<ia[mydim];i++)     
        if(0) printf("avals(%d)=%.2f \n", i, avals[i]);         
    }    
   

  perm  = Calloc(*totn, int); 
  invp  = Calloc(*totn, int); 
 
  

  ldb=mydim; 
  nrhs=1; 
  naux=0; 
  mrp=0;
 
  waltime = gettime(); 
   
  iparm[0]=0; 
  iparm[1]=0; 
  iparm[2]=0; 
  iparm[3]=1; 
   
  pwssmp(&mydim,ia,ja,avals,diag,perm,invp,b,&ldb,&nrhs,&aux,&naux,&mrp,iparm,dparm);     
      
  if(iparm[63] != 0)     
    {     
      printf("The following ERROR was detected: %d \n", iparm[63]);     
      MPI_Abort(comm,0);    
      return 0;     
    }     
     
  if(myid==0) 
    printf("Inistalization step completed in %.6f sec. \n",gettime()-waltime);     
    
  iparm[1]=1;     
  iparm[2]=1;     
  iparm[3]=1;     
    
  waltime = gettime();    

  pwssmp(&mydim,ia,ja,avals,diag,perm,invp,b,&ldb,&nrhs,
         &aux,&naux,&mrp,iparm,dparm);      
       
  if(iparm[63] != 0)      
    {      
      printf("The following ERROR was detected: %d \n", iparm[63]);      
      MPI_Abort(comm,0);    
      return 0;      
    }      
      
  if(myid==0)  
    printf("Ordering step completed in %.6f sec. \n",gettime()-waltime); 
    
  iparm[1]=2;      
  iparm[2]=2;      
  iparm[3]=1;     
 
  waltime = gettime();      

  pwssmp(&mydim,ia,ja,avals,diag,perm,invp,b,&ldb,&nrhs,
         &aux,&naux,&mrp,iparm,dparm);      
       
  if(iparm[63] != 0)      
    {      
      printf("The following ERROR was detected: %d \n", iparm[63]);      
      MPI_Abort(comm,0);    
      return 0;      
    }     
 
  if(myid==0) 
    printf("Sybmolic factorization completed in  %.6f sec. \n", 
           gettime()-waltime);       
  
  if(0) 
    { 
      printf("Number of non-zeros in factorization L: %d.\n", iparm[23]);    
      printf("Number of FLOPS in factorization: %.3f \n", dparm[22]);    
      printf("Double words needed to factor: %d \n", iparm[22]);    
    } 
       
  *pperm = perm;
  *pinvp = invp;
  *pmydim = mydim;
  *pia = ia;
  *pavals = avals;
  *pja = ja;
  *pjai = jai;
  *pdiag = diag;
  *pnndiag = nndiag; 
  *piparm = iparm;
  *pdparm = dparm;
  return 1;
}


int UpdateQr(int mydim, int myjadim, double kappa, int r, int nh, 
             int *jai, double *diag, double *avals, 
             double *coef)
{
  /* Updates elements in Q/avals according to new hyper-parameters 
     r and kappa. Used for Lancaster dataset
  */

  int i;


  for(i=0;i<myjadim;i++)
    {
      avals[i] = kappa*coef[r*(nh+1)*(nh+1)+jai[i]];
      if(0)
        printf("in f, 1 avals(%d) = %.6f = coef[%d]= %f jai[%d]=%d\n",
               i, avals[i],r*(nh+1)*(nh+1)+jai[i],
               coef[r*(nh+1)*(nh+1)+jai[i]],i,jai[i]);
    }

  for(i=0;i<mydim;i++)
    {
      diag[i] = kappa*coef[r*(nh+1)*(nh+1)]+DIAG_TERM;
      if(0)     
        printf("diag(%d) = %.6f coefcoef %d \n",
               i, diag[i],r*(nh+1)*(nh+1) );
    }
 
  return 1;
}


  

int UpdateQ(int mydim, int myjadim, double kappa, double isigma, 
            double *nndiag, double *diag, double *avals)
{
  /* Updates Q when Q=kappa Q + diag(isigma). Used in disease-mapping, 
     not in Lancaseter example */
  int i;

 
  
  if(0) printf("mydim = %d myjadim= %d \n", mydim, myjadim);

  for(i=0;i<myjadim;i++)
    {
      avals[i] = -kappa;
      if(0)
        printf("in f, 1 avals(%d) = %.6f \n", i, avals[i]);
    }

  for(i=0;i<mydim;i++)
    diag[i] = kappa*(nndiag[i] + isigma) + DIAG_TERM;

  if(0)
    {
      for(i=0;i<mydim;i++)        
        printf(" in f Diag(%d) = %.6f \n", i, diag[i]);        
      for(i=0;i<myjadim;i++)         
        printf("in favals(%d) = %.6f \n", i, avals[i]);
    }

  return 1;
}

int scale_proposal(double *pnewx, double x, double f)
{
  /* propose new (hyperparameter) newx = unif(x/f,fx) */
  double newx, z, len;

  len = f - 1/f; 
  z= drand48();

  
  newx =  (1/f + len*z)*x; 

  /*
  if (z < len/(len+2*log(f)))
    newx =  (1/f + len*z)*x;
  else
    newx = pow(f, 2.0*z-1.0)*x;
  */

  *pnewx = newx;
  
  if (0) 
    printf("z = %.3f, xnew = %.3f \n", z, newx);
  
  return 1;
}

int myPoissonllike12div(double *div1, double *div2, int myddim, double *E, double *y, int *iy, double *x) 
{
  int i;

  printf("Is here 1 \n");

  
  for(i=0;i<myddim;i++)
    {
      printf("Is here 2 i=%d \n",i);
      div1[i] = y[i] - E[i]*exp(x[i]);
      div2[i] = E[i]*exp(x[i]);
      div1[i] = div1[i] + x[i]*div2[i];
      printf("Is here 3 i=%d \n",i);
    }
  
  
  return 1;
}

int myllike12div(double *div1, double *div2, int myddim, double *m, 
                 double *y, int *iy, double *x, double beta) 
{
  int i;

  if(0) printf("I am here 1 myddim %d \n", myddim);

  
  for(i=0;i<myddim;i++)
    {
      div1[i] = y[i] - m[i] * (1 - 1/(1+exp(beta+x[iy[i]])));
      div2[i] = m[i]/(exp(-(beta+x[iy[i]])) + 2 + exp(beta+x[iy[i]]));
      div1[i] = div1[i] + x[iy[i]]*div2[i];

      if(0)
        { 
          printf("x(%d)=%.3f, y(%d)=%.3f, m(%d)=%.3f ,beta=%.3f \n",
                 iy[i], x[iy[i]], i, y[i], i, m[i], beta);       
          printf("div1(%d) %.3f div2(%d) %.3f \n", i,div1[i], i,div2[i]);
        }
    }
  
  
  return 1;
}


int myPoissonloglike(double *ploglike, int myddim, 
		     double *E, double *y, int *iy, double *x)  
{
  /* Returns Poisson loglikelihood for the processors data. 
     Not used for Lancaser ex */
  int i, j;
  double loglike;

  loglike=0;

  for(i=0;i<myddim;i++)
    {
      j=iy[i];

      loglike = loglike + y[i]*x[j] - E[i]*exp(x[j]);
    }

  if(0)
    printf("loglike (in myloglike) = %.3f \n", loglike);

  *ploglike = loglike;

  return 1;

}

int myloglike(double *ploglike, int myddim, double *m, double *y, int *iy, double *x, double beta)  
{
  /* Returns log likelihoos for the proc. data in Lancaser ex. */
  int i, j;
  double loglike;

  loglike = 0;

  for(i=0;i<myddim;i++)
    {
      loglike = loglike + y[i]*(beta+x[iy[i]]) 
        - m[i]*log(1+exp(beta+x[iy[i]]));
    }

  if(0)
    printf("loglike (in myloglike) = %.3f \n", loglike);

  *ploglike = loglike;

  return 1;

}


int find_mode(double *mode,  double *modediag, int mydim, int myddim, 
              double beta,
              int *ia, int *ja, double *avals, double *diag, int *perm, 
              int *invp, int *iparm,   
              double *dparm, double *E, double *y, int *iy, int myid, 
              MPI_Comm comm, int size) 
{
  /* Find mode by Newton-Raphsons method in parallel*/
  int ni,j, ldb, nrhs, naux, mrp, iter;
  double *div1, *div2, *x, *x_old, deltamax, mydeltamax, deltalim, waltime;
  double *fmdiag, *aux;

  iter = 300;

  div1 = Calloc(myddim, double);
  div2 = Calloc(myddim, double);
  fmdiag = Calloc(mydim, double);
  x = Calloc(mydim, double);
  x_old = Calloc(mydim, double);

  ldb=mydim;    
  nrhs=1;    
  naux=0;    
  /*  aux=NULL;*/
  mrp=0;   

  deltalim = 0.01;
  deltamax = 2*deltalim;

  for(j=0;j<mydim;j++)  
    { 
      x[j] = 0; 

    } 

  
  for(ni=0;(ni<=iter && deltamax > deltalim);ni++)
    {


      for(j=0;j<mydim;j++) 
        {
          x_old[j] = x[j];
          if(0)
            printf("x(%d)= %.3f \n", j, x[j]);
        }

     

      myllike12div(div1, div2, myddim, E, y, iy, x,beta);
 
      for(j=0;j<mydim;j++)  
        { 
          x[j] = 0;
          fmdiag[j] = diag[j];    
        } 


      for(j=0;j<myddim;j++)
        {
          x[iy[j]]=div1[j];
          fmdiag[iy[j]] = diag[iy[j]] + div2[j];
          if(0)
            {
              printf("fmdiag[%d]=%.3f \n", iy[j], fmdiag[iy[j]]);
              printf("x(%d)= %.3f \n", iy[j], x[iy[j]]);
              printf("div2[%d]=%.3f \n", iy[j], div2[iy[j]]);
            } 
        }



      iparm[1]=3;       
      iparm[2]=4;       
      iparm[3]=1;
      iparm[29]=0;
      iparm[31]=0;
      

      
      waltime = gettime();      

      pwssmp(&mydim,ia,ja,avals,fmdiag,perm,invp,x,&ldb,&nrhs,&aux,&naux,
             &mrp,iparm,dparm);        
     
      
      /*
      iparm[1]=4;       
      iparm[2]=4;       
      iparm[3]=1;      
      iparm[29]=2;     
      waltime = gettime();      
     
     
      pwssmp(&mydim,ia,ja,avals,ldiag,perm, invp,sample,
      &ldb,&nrhs,&aux,&naux,&mrp,iparm,dparm);        
      */

      
      if(myid==0 && 0)  
        printf("System solves in  %.6f sec. \n", gettime()-waltime);        
      
      mydeltamax = 0;


      for(j=0;j<mydim;j++)
        mydeltamax = MAX(mydeltamax, ABS(x[j]-x_old[j]));  


      MPI_Allreduce(&mydeltamax, &deltamax, 1, MPI_DOUBLE, MPI_MAX, comm);  
    
    
    }


  if(0 && myid==0) 
    printf("ni = %d, deltamax= %.6f \n", ni, deltamax); 

  if(ni >= iter)
    {
      if(myid==0)
        printf("Mode not found, deltamax = %.10f \n", deltamax);
      MPI_Abort(comm,0); 
    }      

  if(1) ("deltamax = %.10f \n", deltamax);

  for(j=0;j<mydim;j++)
    {
      mode[j] = x[j];
    }

  for(j=0;j<mydim;j++) 
    {
      modediag[j] = fmdiag[j];  
    }

  FREE(fmdiag);
  FREE(x);
  FREE(x_old);
  FREE(div1);
  FREE(div2);
    
}
      

int main(int argc, char *argv[])  
{ 
  /* A one-block Metropolise-Hastings algorithm. Run for NMH iterations. */
  int totn, ldb, nrhs, naux, nnzl, wspace, niter, info, i, seed=2002, nmh, j;  
  int *perm, *invp, *iparm, *ja, *jai, *ia, n1, n2, *idata, nacc;  
  int myddim, nh, n_range, r_old, r_new;
  double *b, *dparm, rcond, aux, *avals, *diag, *nndiag, *coef;
  double  my_loglike, loglike, my_dloglike, *new_sample;  
  double *data, *E; 
  double sigma_old, fsigma, sigma_new, sigma_mean, beta_new;
  double beta_old, sigma_beta, ldet; 
  double  lacc, lr;
  double  my_sloglike, lacc_old, lacc_new, *y, *m;
  double  waltime, rtc, *mode, *modediag, prod; 
  int mrp;

  int mperr, myid, mydim, size; 
  MPI_Comm comm; 
  MPI_Status mpistat; 

   
  MPI_Init(&argc, &argv); 
  MPI_Comm_dup(MPI_COMM_WORLD, &comm); 
  MPI_Comm_rank(comm, &myid); 
  MPI_Comm_size(comm, &size); 

  sigma_old=sigma_new = 1;
  fsigma=1;
  sigma_mean=0;

  beta_new = beta_old = 0.6;
  sigma_beta=3;

  nh=2;
  r_new=r_old=100;
  myddim=0;
  n1=-1; 
  n2=-1;

  nmh=25; //Number of iterations, shold be >25!
  nacc=0;

  loglike = 0;
  my_loglike = 0;

  srand48(seed+3*myid); 
  srand(seed+3*myid);
  
  read_coef(&coef, &n_range, nh, myid, comm);  
  
    
  
  InitLA(&mydim, &myddim, &totn, &idata, &m, &y, &ia, &ja, &jai,&avals, 
         &nndiag, myid,comm,size,nh);
  
  
  if(0)
    {
      for(i=0;i<myddim;i++)
        {
          printf("y(%d)=%.3f, m(%d)=%.3f ,beta_old=%.3f \n",
                 i, y[i], i, m[i], beta_old);
        }
    }

   if(0)   
    {   
      for(i=0;i<mydim+1;i++)   
        printf("ia(%d)=%d \n", i, ia[i]);   
    }  

   InitTask (&mydim, &totn, &ia, &ja, &jai, &avals, &diag, &nndiag, 
             &perm, &invp, &iparm, &dparm, myid, comm, size, n1, n2, nh);
   
   
   new_sample = Calloc(mydim, double);

   mode = Calloc(mydim, double);
   modediag = Calloc(mydim, double);

   if(0)  PreadData(mydim, totn,  &data, &idata, &E, myid, comm, size); 
   
   if(0) 
     UpdateQ(mydim, ia[mydim], sigma_old, sigma_old, nndiag, diag, avals);   
  
   
   if(1)
     UpdateQr(mydim, ia[mydim], sigma_old, r_old, nh, 
              jai, diag, avals, coef);


   
   find_mode(mode, modediag, mydim, myddim, beta_old, 
             ia, ja, avals, diag, perm, invp, iparm, dparm, 
             m, y, idata, myid, comm, size);  
    
   
   
   SampleGMRF(&my_sloglike, mydim, totn, ia, ja, avals, 
              modediag, perm, invp, iparm, dparm, new_sample, 
              myid, comm, size);
  
   if(0)   
    { 
      for(i=0;i<10;i++)   
        printf("diag(%d)=%.3f \n", idata[i], diag[idata[i]]);   
    }  
   
   ldet = ldetQ(mydim, totn, ia, ja, avals, diag, perm, iparm, dparm, 
                 invp, myid, comm, size);

       
          
   for(j=0;j<mydim;j++)
     new_sample[j] = new_sample[j] + mode[j];
   

   if(0)  myPoissonloglike(&my_dloglike, mydim, E, data, 
                           idata, new_sample);
       
   if(1) 
     myloglike(&my_dloglike, myddim, m, y,idata, new_sample, beta_old);
       
   my_loglike = my_dloglike - my_sloglike;

   MPI_Allreduce(&my_loglike, &loglike, 1, MPI_DOUBLE, MPI_SUM, comm); 
      
   
       
   if(0) printf("loglike = %.3f \n", loglike);
   
   loglike = loglike - 
     0.5*pxQx(mydim, totn, ia, ja, avals, diag, 
              new_sample, myid, comm, size)-
     0.5*ldet-log(sigma_old)-log(r_old);
   
   if(1) printf("loglike = %.3f \n", loglike);
   
   lacc_old = loglike + eu_gamma(sigma_old, PRIOR_A, PRIOR_B);

   if(1) printf("lacc_old = %.3f  \n", lacc_old);   
  

   for(i=0;i<nmh;i++)
     {
       
       loglike = 0;
       my_loglike = 0;

       if(myid==0)
         scale_proposal(&sigma_new, sigma_old, fsigma);
       
       MPI_Bcast(&sigma_new,1,MPI_INT,0,comm); 
   
       if(0) 
         printf("sigma_old = %.3f, sigma_new = %.3f \n",
                sigma_old, sigma_new); 
       if(1)
         UpdateQr(mydim, ia[mydim], sigma_new, r_new, nh, 
                  jai, diag, avals, coef);

            
      
       find_mode(mode, modediag, mydim, myddim, beta_new, 
                 ia, ja, avals, diag, perm, invp, iparm, dparm, 
                 m, y, idata, myid, comm, size);  
       
       ldet = ldetQ(mydim, totn, ia, ja, avals, diag, perm, iparm, dparm, 
                 invp, myid, comm, size); 
      
       SampleGMRF(&my_sloglike, mydim, totn, ia, ja, avals, 
                  modediag, perm, invp, iparm, dparm, new_sample, 
                  myid, comm, size);  
      
      
       for(j=0;j<mydim;j++)
         new_sample[j] = new_sample[j] + mode[j];
      
       if(0)  myPoissonloglike(&my_dloglike, mydim, E, data, 
                              idata, new_sample);
       
       if(1) myloglike(&my_dloglike, myddim, m, y,idata, new_sample, beta_new);
       
       my_loglike = my_dloglike - my_sloglike;
       
       if(0) printf("my_sloglike = %.3f \n", my_sloglike); 

       MPI_Allreduce(&my_loglike, &loglike, 1, MPI_DOUBLE, MPI_SUM, comm); 
       
      
       if(0) printf("loglike = %.3f \n", loglike);

       loglike = loglike - 
         0.5*pxQx(mydim, totn, ia, ja, avals, diag, 
                  new_sample, myid, comm, size)-
         0.5*ldet-log(sigma_new)-log(r_new);
       
       
       
       if(0) printf("loglike = %.3f \n", loglike);
 
       lacc_new = loglike + eu_gamma(sigma_new, PRIOR_A, PRIOR_B);

       if(0 && myid==0) printf("lacc_new = %.3f  \n", lacc_new);         
       
       lacc = lacc_new - lacc_old;

       if(myid == 0)
         lr = (double)rand()/RAND_MAX;
     
       
       MPI_Bcast(&lr,1,MPI_INT,0,comm); 

       if(myid==0)
         printf("sigma_old = %.4f,sigma_new %.4f,lacc = %.4f ,accrate %.4f \n",
                sigma_old, sigma_new, lacc, (double)nacc/i); 
       if(myid==0 && 0) printf("\n");
       
       if(lr < exp(lacc))
         {
           sigma_old = sigma_new;
           lacc_old = lacc_new;
           nacc++;
           sigma_mean = ((double)(nacc)-1)/nacc*sigma_mean 
             + 1/(double)nacc*sigma_new;
         }
      
     } 
  
   if(0)   
     {   
       printf("Diag is: \n");       
      
       for(i=0;i<mydim;i++)       
         printf("Diag(%d) = %.6f \n", i, diag[i]);       
       for(i=0;i<ja[mydim];i++)
         printf("avals(%d) = %.6f \n", i, avals[i]);
       for(i=0;i<mydim;i++) 
         printf("data(%d)=%.2f idata(%d) %d \n", i, data[i], i, idata[i]); 
      
    
     }
   if(myid==0 && 1)
     {
         printf("sigma_old = %.4f,sigma_new %.4f,lacc = %.4f ,accrate %.4f \n",
                sigma_old, sigma_new, lacc, (double)nacc/i); 
         printf("sigma_mean %.3f \n", sigma_mean);
     }
    
   MPI_Finalize(); 
} 
 

