/****
A Gibbs Sampler algorithm for finding multiple sites in multiple sequences 
****/
#include "gibbs.h"

/*
 * MkGibbs - allocate a sampler object and fill in its run parameters.
 *
 *   nconv  - stored in nconverge (the per-run iteration limit used by
 *            gibbs()); values below 1 are raised to 1
 *   ncycl  - stored in ncycles; values below 1 are raised to 1
 *   pseudo - stored in pseudo after being limited to at most
 *            NSeqsPop(data) and then to at least 0.0000001
 *   qseudo - stored unchanged
 *   A      - alphabet handle, stored as given (not copied)
 *   data   - sequence population, stored as given (not copied)
 *
 * Returns the new object.  sites, nsite, site_pos, nres, fptr and model
 * start out NULL; gibbs() dereferences sites, nsite and site_pos, so the
 * caller has to install those before sampling.
 */
g_type	MkGibbs(int nconv, int ncycl, double pseudo, 
	double qseudo, a_type A, p_type data)
{
	g_type G;

	NEW(G,1,gibbs_type);
	NEW(G->order,MaxSeqPop(data),int);
	NEW(G->pos,MaxSeqPop(data),int);
	G->nconverge = MAX(int, 1, nconv);
	G->ncycles = MAX(int,1,ncycl);
	/* Apply the upper bound first, then the lower bound to the bounded
	   value.  The second step previously re-read the raw argument, which
	   silently threw the upper bound away. */
	G->pseudo = MIN(double,NSeqsPop(data),pseudo);
	G->pseudo = MAX(double,0.0000001,G->pseudo);
	G->qseudo = qseudo;
	G->use_order = FALSE;
	G->nres= NULL;
	G->data = data;
	G->nseq = NSeqsPop(data);
	G->A = A;
	G->fptr=NULL;
	G->sites = NULL;
	G->nsite=NULL;
	G->site_pos=NULL;
	G->model = NULL;
	/* NOTE(review): null is sized from MinSeqPop while the two scratch
	   arrays below use MaxSeqPop -- confirm this is intended. */
	NEW(G->null, MinSeqPop(data)+1,boolean);
	NEWP(G->tmpfreq, MaxSeqPop(data)+1,double);
	NEW(G->tmpratio, MaxSeqPop(data)+1,double);
	return G;
}

/*
 * InitSitesGibbs - give every site a random starting location.
 *
 * Resets G->sites with InitSites, then for each site type, each sequence
 * and each of the G->nsite[type][seq] site slots asks AddRandomSite for a
 * location and records it in G->site_pos[type][seq][slot].  Types,
 * sequences and slots are all numbered from 1.
 *
 * Per the original author's note, adding a site is also expected to block
 * the positions that overlap it -- that happens inside AddRandomSite and
 * is not visible here.
 */
void	InitSitesGibbs(g_type G)
{
	int	type, seq, slot, start;

	InitSites(G->sites);
	type = 1;
	while(type <= nTypeSites(G->sites)){
		seq = 1;
		while(seq <= NSeqsPop(G->data)){
			slot = 1;
			while(slot <= G->nsite[type][seq]){
				start = AddRandomSite(type,seq,G->sites);
				G->site_pos[type][seq][slot] = start;
				slot++;
			}
			seq++;
		}
		type++;
	}
}

/*
 * gibbs - run up to nruns sampling runs and print the best result to fptr.
 *
 *   fptr  - stream that receives the final per-motif report
 *   G     - sampler state; G->sites, G->nsite and G->site_pos are
 *           dereferenced here without a NULL check, so they must be set
 *   nruns - maximum number of runs
 *
 * Each run starts from random site positions (InitSitesGibbs) and performs
 * at most G->nconverge sweeps.  In one sweep every site of every type in
 * every sequence is taken out of the site set and the model, the position
 * weights are recomputed with GetFreqProb, and a replacement position is
 * drawn with ChooseSite.  After the sweep the summed score
 * (LogLikeModel - L0 over all types) is compared with the best score of the
 * run; a run ends early once more than num_conv consecutive sweeps fail to
 * improve on it.
 *
 * Likelihood buffers: Like[0] is filled on every sweep, Like[1] holds the
 * values from the best sweep of the current run, Like[2] those from the
 * best run so far.  The buffers are rotated by swapping pointers.
 *
 * On return G->sites is a new site set built from the best positions,
 * G->model is the model rebuilt from it and G->site_pos holds those
 * positions.  Progress messages go to stderr.
 *
 * NOTE(review): none of the arrays allocated in this function (site_pos,
 * oldsite_pos, null, MissInfo, Like[], df) nor the final model array is
 * released before returning -- confirm whether that is intentional.
 * NOTE(review): b, newsite, hpsz, site and ipp are declared but not used.
 */
void	gibbs(FILE *fptr, g_type G, int nruns)
{
	int	b,n,s,iter,k,newsite,t,N=NSeqsPop(G->data),npos,hpsz=101;
	int	***site_pos,  *df,site, ncycles=G->ncycles;
	int	***oldsite_pos;
	int	nconverg = G->nconverge,num_conv;
	int	*order=G->order,run,number;
	m_type	*model,*finalmodel;
	st_type	finalsites,sites = G->sites,cpsites;
	p_type  data = G->data;
	o_type	R=NULL;
	a_type	A=PopA(G->data);
	boolean	flag,**null,quit;
	double	ipp,L0,*Like[3],*MissInfo,TotLike,*dTemp;
	double	pseudo = G->pseudo,qseudo=G->qseudo;
	double	best,oldbest;
	char	str[50];

	fprintf(stderr,"nconv = %d\n", nconverg);
	/* Number of consecutive non-improving sweeps tolerated before a run
	   is cut short: 7 for a single site type, 10 otherwise. */
	if(nTypeSites(sites) == 1) num_conv = 7; else num_conv = 10;
	/* site_pos    : positions at the best sweep of the current run.
	   oldsite_pos : positions of the best run seen so far.
	   Both are indexed [type][sequence][site], each from 1. */
	NEWPP(site_pos, nTypeSites(sites)+1, int);
	NEWPP(oldsite_pos, nTypeSites(sites)+1, int);
	NEWP(null, nTypeSites(sites)+1, boolean);
	for(t = 1; t <= nTypeSites(sites); t++){
		MEW(null[t], MinSeqPop(data)+1, boolean);
		for(k=1; k <= MinSeqPop(data); k++) null[t][k] = FALSE;
		NEWP(site_pos[t], NSeqsPop(G->data) + 1, int);
		NEWP(oldsite_pos[t], NSeqsPop(G->data) + 1, int);
		for(n=1; n<=NSeqsPop(G->data); n++){
			NEW(site_pos[t][n],G->nsite[t][n]+2,int);
			NEW(oldsite_pos[t][n],G->nsite[t][n]+2,int);
		}
	}
	/* Baseline that is subtracted from every model log-likelihood. */
	L0 = LogL0Pop(data); 
	NEW(MissInfo, nTypeSites(sites)+1, double);
	for(t=0; t<= 2; t++) {
		NEW(Like[t],nTypeSites(sites)+1, double);
	}
	NEW(df, nTypeSites(sites)+1, int);
   /* oldbest starts at 1.0, so a run only becomes "the best so far" when
      its score exceeds 1.0.
      NOTE(review): if no run does, oldsite_pos and Like[2] keep whatever
      NEW initialised them with (presumably zeros -- confirm) and those
      values are used for the final site set and report below. */
   oldbest = 1.0; best = 0.0; 
   for(run = 1; run <= nruns; run++){
	number = 0;
	/* Keep a copy of the site set as it was before this run; it is
	   put back once the run is over. */
	cpsites = CopySites(sites);
	InitSitesGibbs(G);
	/* df[t]: site length times (alphabet size - 1).
	   npos : sum over types of the site count in sequence 1.
	   flag : stays TRUE only if every sequence has the same number of
	          sites of each type as sequence 1. */
	for(npos=0,flag=TRUE,t=1; t<= nTypeSites(sites); t++) {
        	df[t] = (SiteLen(t,sites)*(nAlpha(A) - 1));
		k=nSites(t,1,sites);
		npos += k;
		for(n = 2; n <= N; n++) 
			if(k!=nSites(t,n,sites))flag = FALSE;
	}
	/* The ordering model R is only used when there is more than one
	   site type, the site counts are uniform across sequences and
	   G->use_order is set.  A model left from the previous run is
	   printed to stdout and released before a new one is built. */
	if(nTypeSites(sites) > 1 && flag && G->use_order) {
		if(R != NULL) { PutOrder(stdout,R); NilOrder(R); }
		R=Order(nTypeSites(sites),npos,pseudo*(double)N);
		for(n = 1; n <= NSeqsPop(data); n++) {
			OrderSites(n,order,sites);
			Add2Order(order,R);
		}
		/*** PutOrder(stderr,R); /****/
	} else R = NULL;
	fprintf(stderr,"\r** %d **\n", run);
	model = InitGibbs(sites,data,qseudo,NULL);
	G->model = model;
	for(iter =1; iter <= nconverg; iter++) {
	  /* On every ncycles-th iteration (after the second) call
	     Metropolis for each type and report when it returns true. */
	  if(iter > 2 && iter % ncycles == 0) {
	    for(t = 1; t <= nTypeSites(sites); t++) {
		if(Metropolis(G,t,model[t])){
			fprintf(stderr, " motif %c cycle %d\n",
				t+'A'-1,iter);
		}
		if(iter % 5 == 0) fprintf(stderr,"\r%d",iter);
	    }
	  }
	  /* One sweep: resample every site in turn. */
	  for(n = 1; n <= N; n++) {
	      for(t = 1; t <= nTypeSites(sites); t++) {
		for( k = 1; k <= G->nsite[t][n]; k++){
		    /* Take this sequence's current ordering out of R. */
		    if(R!=NULL) {
			OrderSites(n, order,sites);
			RmOrder(order,R);
		    }
		    /* Remove the site from the site set and the model,
		       recompute the position weights without it, then
		       draw a new position and put the site back there. */
		    s = G->site_pos[t][n][k];
		    VacateSite(t,n,s,sites);
		    RmModel(SeqPop(n,data), s, model[t]);
		    GetFreqProb(t,n,model[t],G,R);
		    s = ChooseSite(t,n,sites);
		    G->site_pos[t][n][k] = s;
		    Add2Model(SeqPop(n,data),s,model[t]);
		    /* Record the sequence's new ordering in R. */
		    if(R!=NULL) {
			OrderSites(n,order,sites);
			Add2Order(order,R);
		    }
		}
	      }
	  }
	  /* Score of this sweep: sum over types of (log-likelihood - L0). */
	  for(TotLike=0.0, t = 1; t <= nTypeSites(sites); t++) {
        	Like[0][t] = LogLikeModel(model[t]);
		TotLike += Like[0][t] - L0; 
	  }
	  /* New best for this run: snapshot the positions and keep the
	     per-type likelihoods by swapping Like[0] into Like[1]. */
	  if(TotLike > best){ 
	        /*** for(t = 1; t <= nTypeSites(sites); t++) 
			PutModel2(stderr,model[t]); /*****/
		number = 0; best = TotLike; 
		GetSitePos(site_pos, sites);
		dTemp = Like[1]; Like[1] = Like[0]; Like[0]= dTemp;
	  } else number++;
	  if(number > num_conv) break;
	}
	/* End of run: release the per-type models and swap the pre-run
	   copy of the site set back in. */
	for(t=1;t<=nTypeSites(sites);t++){
		NilModel(model[t]); model[t] = NULL;
	}
	free(model);
	NilSites(sites); G->sites = sites = cpsites;
	/* Compare this run's best positions with the best so far.  quit
	   stays TRUE only when every position is identical; differing
	   positions are copied over when this run scored higher. */
	for(quit = TRUE,t = 1; t <= nTypeSites(sites); t++) {
	  for(n = 1; n <= N; n++) {
		for( k = 1; k <= G->nsite[t][n]; k++){
	   		if(oldsite_pos[t][n][k]!=site_pos[t][n][k]){
	   		     if(oldbest < best) { /* save the best */
				oldsite_pos[t][n][k]=site_pos[t][n][k];
			     }
			     quit = FALSE;
			}
		}
	  }
	}
	/** fprintf(stderr,"best = %g; oldbest = %g\n",best,oldbest);
	/****/
	/* A higher score promotes Like[1] to Like[2]; an exactly equal
	   score also stops the search. */
	if(oldbest < best) {
		oldbest = best;
		dTemp = Like[2]; Like[2] = Like[1]; Like[1]= dTemp;
	} else if( oldbest == best) quit = TRUE;
	best = 0.0;
	if(quit) break;
    }
	/* Build the final site set from a copy of the current one and the
	   best positions found, and mirror those positions in
	   G->site_pos. */
	finalsites=CopySites(sites);
	G->sites = finalsites;
	NilSites(sites);
	for( t = 1; t <= nTypeSites(finalsites); t++){
	   for( n = 1; n <= NSeqsPop(G->data); n++){
		for(k= 1; k <= G->nsite[t][n]; k++){
		   s = oldsite_pos[t][n][k];	
		   G->site_pos[t][n][k] = s;
		   AddSite(t,n,s,finalsites);
		}
	   }
	}
	/* Rebuild the ordering model from the final site set. */
	if(R != NULL) {
		NilOrder(R);
		R=Order(nTypeSites(finalsites),npos,pseudo*(double)N);
		InitOrder(R);
		for(n = 1; n <= NSeqsPop(data); n++) {
			OrderSites(n,order,finalsites);
			Add2Order(order,R);
		}
	}
	G->model = finalmodel = InitGibbs(finalsites,data,qseudo,null);
	/* Recompute the position weights for every final site: the site is
	   vacated, the weights are computed, and the site is put back at
	   the same position (the model itself is not modified here). */
	for(n = 1; n <= N; n++) {
	    for(t = 1; t <= nTypeSites(finalsites); t++) {
		for( k = 1; k <= G->nsite[t][n]; k++){
		    s = G->site_pos[t][n][k];
		    VacateSite(t,n,s,finalsites);
		    GetFreqProb(t,n,finalmodel[t],G,R);
		    AddSite(t,n,s,finalsites);
		}
	    }
	}
	/* Per-motif report.  From here on `best` is reused as scratch for
	   each motif's log-likelihood ratio statistic and TotLike sums
	   those statistics. */
	for(TotLike=0.0,t = 1; t <= nTypeSites(finalsites); t++) {
		sprintf(str,"MOTIF %c",t + 'A' -1);
		fprintf(fptr,"\n\n%*s%*s%s",
			23,"",(SiteLen(t,finalsites)-7)/2,"",str);
		PutSitesGibbs(fptr,t,finalsites,G,NULL);
		PutModel(fptr,finalmodel[t]);
		fprintf(fptr,
			"\n\nComplete log-likelihood ratio  = %4d bits\n",
                        (int) (Like[2][t] - L0));
		fprintf(fptr,"Missing position information   = %4d bits\n",
		   	(int) (MissInfo[t]=MissInfoSites(t,finalsites)));
        	fprintf(fptr,"Log-likelihood ratio statistic = %4d bits\n",
                        (int)(best = Like[2][t] -  MissInfo[t] - L0));
		TotLike += best;
        	fprintf(fptr,"Degrees of freedom             = %4d\n",df[t]);
        	fprintf(fptr,"Information per parameter      = %1.3g bits\n",
                        ((Like[2][t] -  MissInfo[t] - L0)/(double)df[t]));
		sprintf(str,"Best Sites (std. dev. above mean)");
		fprintf(fptr,"\n%*s%*s%s",
			22,"",(SiteLen(t,finalsites)-7)/2,"",str);
		PutFinalSites(fptr,finalsites,t,finalmodel[t],G,data);
	}
	/* NOTE(review): the heading is written to fptr but the ordering
	   itself goes to stdout -- confirm whether fptr was intended. */
	if(R != NULL) { 
		fprintf(fptr,"Element Order:\n");
		 PutOrder(stdout,R); NilOrder(R); 
	}
	/* The total is rounded by adding 0.5 before truncating to int. */
	fprintf(fptr,"\nTotal log-likelihood ratio  = %4d bits\n\n",
		(int) (TotLike+0.5));
}

/*
 * InitGibbs - build one model per site type from the sites currently in S.
 *
 *   S      - site set supplying the site length, counts and positions
 *   P      - sequence population the sites refer to
 *   qseudo - scale factor for the pseudocount total: each model is created
 *            with sqrt(total number of sites of that type) * qseudo
 *   null   - not used by this function
 *
 * Returns a newly allocated array indexed 1..nTypeSites(S) (slot 0 is left
 * as allocated).  Every site already present in S has been added to the
 * model of its type.  The array is allocated here and handed to the caller.
 */
m_type	*InitGibbs(st_type S, p_type P, double qseudo,boolean **null)
{
	a_type	alphabet = PopA(P);
	int	ntypes, type, seq, j;
	int	*site_start;
	m_type	*models;
	double	site_total, pseudo_total;

	ntypes = nTypeSites(S);
	/* Scratch buffer that PosTSites fills with the positions of one
	   type in one sequence; read from index 1. */
	NEW(site_start,MaxSeqPop(P),int);
	NEW(models,ntypes+1,m_type);
	for(type = 1; type <= ntypes; type++){
		/* Count the sites of this type over all sequences. */
		site_total = 0.0;
		for(seq = 1; seq <= NSeqsPop(P); seq++){
			site_total += (double) nSites(type,seq,S);
		}
		pseudo_total = sqrt(site_total) * qseudo;
		models[type] = Model(SiteLen(type,S),pseudo_total,CntsPop(P),alphabet);

		/* Seed the model with every site of this type. */
		for(seq = 1; seq <= NSeqsPop(P); seq++){
			PosTSites(type,seq,site_start,S);
			for(j = 1; j <= nSites(type,seq,S); j++){
				Add2Model(SeqPop(seq,P), site_start[j],models[type]);
			}
		}
	}
	free(site_start);
	return models;
}

/*
 * GetFreqProb - fill in the per-position weights for placing a site of
 * type t in sequence n.
 *
 *   t - site type
 *   n - sequence number
 *   M - model whose RelProbModel value is evaluated at each position
 *   G - sampler state; supplies the sequences, the site set and the
 *       scratch array used for site ordering
 *   R - ordering model, or NULL to ignore site order
 *
 * The weights are written to the array returned by
 * PosProbSite(t,n,G->sites).  For each start position i in 1..end, where
 * end = sequence length - model length + 1, element i receives
 * RelProbModel(seq+i, M), or 0.0 when OccupiedSite reports the position as
 * taken.  Element 0 receives the sum of all the weights.
 *
 * When R is given, each free position's weight is additionally multiplied
 * by RelProbOrder for the number of site starts passed so far in the
 * sequence.
 *
 * Returns 0.  The function used to fall off the end without returning a
 * value, which is undefined behaviour as soon as a caller reads the
 * result; the callers in this file ignore it.
 */
int	GetFreqProb(int t, int n, m_type M, g_type G, o_type R)
{
	st_type	S=G->sites;
	char	*seq  = SeqPop(n,G->data);
	int	i,end;
	int	pos,*order=G->order;
	double	factor,*freq_prob = PosProbSite(t,n,G->sites);

	/* Last position at which a whole model still fits. */
	end = LenSeqP(n,G->data) - LenModel(M) + 1;
	if(R!=NULL){
		OrderSites(n,order,S);
		pos = 0;
		factor = RelProbOrder(order,t,pos,R);
		freq_prob[0] = 0.0;
		for(i = 1; i <= end; i++){
			if(!OccupiedSite(t,n,i,S)){
				freq_prob[i] = (factor * RelProbModel(seq+i, M));
				freq_prob[0] += freq_prob[i];
			} else {
				freq_prob[i] = 0.0;
				/* Passing the start of an existing site moves
				   on to the next slot in the ordering, so the
				   order factor has to be refreshed. */
				if(StartSite(n,i,S)) {
					pos++;
					factor = RelProbOrder(order,t,pos,R);
				}
			}
		}
	} else {	/* R == NULL: weights come from the model alone */
		freq_prob[0] = 0.0;
		for(i = 1; i <= end; i++){
			if(!OccupiedSite(t,n,i,S)){
				freq_prob[i] = RelProbModel(seq+i, M);
				freq_prob[0] += freq_prob[i];
			} else {
				freq_prob[i] = 0.0;
			}
		}
	}
	return 0;
}

