#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <sys/types.h>
#include <dirent.h>

#ifndef UDM_GUESSER_STANDALONG
#include "udm_config.h"
#endif

#include "udm_common.h"
#include "udm_crc32.h"
#include "udm_guesser.h"

#ifndef UDM_GUESSER_STANDALONG
#include "udm_utils.h"
#include "udm_unicode.h"
#include "udm_log.h"
#endif

/*
 * Load one language map file and append it to Env->LangMapList.
 *
 * File format, as parsed below:
 *   - lines starting with '#', a space or a tab are skipped;
 *   - "Charset:<name>" and "Language:<name>" set the map attributes
 *     (a later line replaces an earlier one);
 *   - any other line is "<ngram>\t<count>", where '_' inside the n-gram
 *     stands for a space. Lines without a tab, with a count that parses
 *     to 0, or with an n-gram that is empty or longer than
 *     UDM_LM_MAXGRAM are ignored.
 *
 * Returns 0 on success. On failure returns -1, sets Env->errcode to 1
 * and stores a message in Env->errstr; the list then still holds exactly
 * the maps it held before the call (nmaps is only incremented on
 * success).
 *
 * NOTE(review): sizeof(Env->errstr) assumes errstr is an array member of
 * UDM_ENV, not a pointer - confirm against udm_common.h.
 */
int UdmLoadLangMapFile(UDM_ENV * Env, const char * filename){
	FILE * f;
	char str[1000];
	UDM_LANGMAP * maps;
	UDM_LANGMAP * map;

	/* Open the file first so that a missing file never touches the list */
	f=fopen(filename,"r");
	if(!f){
		Env->errcode=1;
		snprintf(Env->errstr,sizeof(Env->errstr),"Can't open LangMapFile '%s'\n",filename);
		return -1;
	}

	/* Grow the list through a temporary: a failed realloc() must not lose the old array */
	if(!Env->LangMapList.nmaps){
		maps=(UDM_LANGMAP*)malloc(sizeof(UDM_LANGMAP));
	}else{
		maps=(UDM_LANGMAP*)realloc(Env->LangMapList.maps,(Env->LangMapList.nmaps+1)*sizeof(UDM_LANGMAP));
	}
	if(!maps){
		fclose(f);
		Env->errcode=1;
		snprintf(Env->errstr,sizeof(Env->errstr),"Can't allocate memory for LangMapFile '%s'\n",filename);
		return -1;
	}
	if(Env->LangMapList.nmaps){
		/* realloc() may have moved the array, the old pointer is no longer valid */
		Env->LangMapList.maps=maps;
	}

	/* The new slot sits just past the currently counted maps */
	map=&maps[Env->LangMapList.nmaps];
	memset(map,0,sizeof(UDM_LANGMAP));

	while(fgets(str,sizeof(str),f)){
		if(str[0]=='#'||str[0]==' '||str[0]=='\t')continue;

		if(!strncmp(str,"Charset:",8)){
			char * charset;
#ifndef UDM_GUESSER_STANDALONG
			char * lasttok;
#endif
			UDM_FREE(map->charset);
			if((charset=UdmGetToken(str+8," \t\n\r",&lasttok))){
				map->charset=strdup(charset);
			}
		}else
		if(!strncmp(str,"Language:",9)){
			char * lang;
#ifndef UDM_GUESSER_STANDALONG
			char *lasttok;
#endif
			UDM_FREE(map->lang);
			if((lang=UdmGetToken(str+9," \t\n\r",&lasttok))){
				map->lang=strdup(lang);
			}
		}else{
			char *s;
			int count;
			int hindex;

			/* Split "<ngram>\t<count>" at the first tab */
			if(!(s=strchr(str,'\t')))continue;
			*s='\0';

			if(((count=atoi(s+1))==0)||(strlen(str)<1)||(strlen(str)>UDM_LM_MAXGRAM))
				continue;

			/* Map files spell a space inside an n-gram as '_' */
			for(s=str;*s;s++){
				if(*s=='_')*s=' ';
			}

			/* The n-gram is known to be non-empty here; only its hash slot is kept */
			hindex=((unsigned int)UdmCRC32(str,strlen(str)))&UDM_LM_HASHMASK;
			map->memb[hindex].count=count;
		}
	}
	fclose(f);

	if(!map->lang){
		snprintf(Env->errstr,sizeof(Env->errstr),"No language definition in LangMapFile '%s'\n",filename);
		goto fail;
	}

	if(!map->charset){
		snprintf(Env->errstr,sizeof(Env->errstr),"No charset definition in LangMapFile '%s'\n",filename);
		goto fail;
	}
#ifndef UDM_GUESSER_STANDALONG
	if(!UdmGetCharSet(map->charset)){
		snprintf(Env->errstr,sizeof(Env->errstr),"Unknown charset '%s' in LangMapFile '%s'\n",
			map->charset,filename);
		goto fail;
	}
#endif
	UdmPrepareLangMap(map);

	/* Commit: only now does the new map become part of the list */
	Env->LangMapList.maps=maps;
	Env->LangMapList.nmaps++;
	return 0;

fail:
	Env->errcode=1;
	/* Drop whatever was parsed into the uncommitted slot */
	UDM_FREE(map->charset);
	UDM_FREE(map->lang);
	if(!Env->LangMapList.nmaps){
		/* The array was freshly malloc()ed for this call and never published */
		free(maps);
	}
	return -1;
}


/*
 * Load every language map found in directory mapdir.
 *
 * The list counter Env->LangMapList.nmaps is reset to 0 first. Every
 * directory entry whose name contains ".lm" is then passed to
 * UdmLoadLangMapFile() as "<mapdir>/<name>".
 *
 * Returns 0 on success and also when mapdir cannot be opened. Returns
 * the negative result of the first failing UdmLoadLangMapFile() call,
 * or -1 with Env->errcode/Env->errstr set when a path does not fit the
 * local buffer. The directory handle is closed on every path.
 *
 * NOTE(review): sizeof(Env->errstr) assumes errstr is an array member of
 * UDM_ENV, not a pointer - confirm against udm_common.h.
 */
int UdmLoadLangMapList(UDM_ENV * Env, const char * mapdir){
	DIR * dir;
	struct dirent * item;
	char fullname[1024]="";
	int res=0;

	Env->LangMapList.nmaps=0;
	dir=opendir(mapdir);
	if(!dir)return 0;

	while((item=readdir(dir))){
		int len;

		/* Same filter as before: ".lm" anywhere in the entry name */
		if(!strstr(item->d_name,".lm"))continue;

		len=snprintf(fullname,sizeof(fullname),"%s/%s",mapdir,item->d_name);
		if((len<0)||((size_t)len>=sizeof(fullname))){
			/* A truncated path would name a different file, so report it instead */
			Env->errcode=1;
			snprintf(Env->errstr,sizeof(Env->errstr),"LangMapFile name is too long: '%s/%s'\n",mapdir,item->d_name);
			res=-1;
			break;
		}

		res=UdmLoadLangMapFile(Env,fullname);
		if(res<0)break;
	}
	closedir(dir);
	return (res<0)?res:0;
}


/*
 * Release all language maps held by env: the charset and lang strings
 * of every counted map, then the map array itself.
 *
 * Does nothing when env is NULL. Afterwards the list is reset to the
 * empty state (maps == NULL, nmaps == 0), so a repeated call or a later
 * load does not operate on freed memory.
 */
void UdmFreeLangMapList(UDM_ENV * env){
	size_t i;

	if(!env)return;
	for(i=0;i<env->LangMapList.nmaps;i++){
		UDM_FREE(env->LangMapList.maps[i].charset);
		UDM_FREE(env->LangMapList.maps[i].lang);
	}
	free(env->LangMapList.maps);
	env->LangMapList.maps=NULL;
	env->LangMapList.nmaps=0;
}


/*
 * Add the n-gram statistics of a text fragment to a language map.
 *
 * map     - map whose hash-slot counters are incremented
 * text    - fragment to scan; it does not have to be NUL-terminated
 * textlen - number of bytes of text to scan
 *
 * Every byte that is neither a control character (< ' ') nor a space
 * following another space starts a series of n-grams of length
 * 1..UDM_LM_MAXGRAM. While an n-gram is extended, control characters
 * are skipped and runs of spaces are collapsed into one. Each n-gram is
 * hashed with UdmCRC32() and the counter of the selected slot is
 * incremented.
 *
 * Only bytes text[0..textlen-1] are read. Nothing is counted when
 * textlen is 0.
 */
void UdmBuildLangMap(UDM_LANGMAP * map,const char * text,size_t textlen){
	const char * end=text+textlen;
	int prevb=' ';

	for(;text<end;text++){
		char buf[UDM_LM_MAXGRAM+3];
		size_t buflen=0;
		const char * t;
		int code;
		int prev=0;

		/* Decide whether this byte may start an n-gram */
		code=(unsigned char)(*text);
		if(code<' ')continue;
		if((code==' ')&&(prevb==' '))continue;
		prevb=code;

		t=text;
		for(buflen=0;buflen<UDM_LM_MAXGRAM;buflen++){
			int hindex;

			/* Find the next byte that belongs to the n-gram */
			for(;t<end;t++){
				code=(unsigned char)(*t);
				if(code<' ')continue;
				if((code==' ')&&(prev==' '))continue;
				prev=code;
				break;
			}
			/* Input exhausted: no longer n-gram can start here */
			if(t>=end)break;
			t++;

			buf[buflen]=code;
			buf[buflen+1]='\0';

			hindex=UdmCRC32(buf,buflen+1);
			hindex=((unsigned int)(hindex))&(UDM_LM_HASHMASK);
			map->memb[hindex].count++;

#ifdef DEBUG_GUESSER
			/* Print collision */
			if(map->memb[hindex].str[0]){
				int res;
				res=strcmp(map->memb[hindex].str,buf);
				if(res){
					printf("Coll %04X '%s' '%s'\n",hindex,map->memb[hindex].str,buf);
					strcpy(map->memb[hindex].str,buf);
				}
			}
#endif
			/*strcpy(map->memb[hindex].str,buf);*/
		}
	}
}


/*
 * Compute the summary statistics of a language map and store them in it:
 *   map->expectation - arithmetic mean of all hash-slot counters;
 *   map->dispersion  - square root of the mean squared deviation of the
 *                      counters from that mean.
 */
void UdmPrepareLangMap(UDM_LANGMAP * map){
	int nslots=UDM_LM_HASHMASK+1;
	float mean=0;
	float sqsum=0;
	int slot;

	/* First pass: mean of the counters */
	slot=0;
	while(slot<nslots){
		mean+=map->memb[slot].count;
		slot++;
	}
	mean/=nslots;

	/* Second pass: sum of squared deviations from the mean */
	slot=0;
	while(slot<nslots){
		sqsum+=(map->memb[slot].count-mean)*(map->memb[slot].count-mean);
		slot++;
	}

	map->expectation=mean;
	map->dispersion=sqrt(sqsum/nslots);
}


/*
 * Compare two prepared language maps.
 *
 * Returns the mean product of the per-slot deviations from each map's
 * expectation, divided by both dispersions. Returns 0 when either
 * dispersion is below 0.00001, which also avoids dividing by zero.
 * Both maps must already carry the expectation and dispersion values
 * this function reads.
 */
float UdmCheckLangMap(UDM_LANGMAP * map0,UDM_LANGMAP * map1){
	float nslots=UDM_LM_HASHMASK+1;
	float mean0=0;
	float mean1=0;
	float covar=0;
	float score=0;
	int slot;

	/* Give up if one of the dispersions is (nearly) 0 */
	if(map0->dispersion<0.00001)return 0;
	if(map1->dispersion<0.00001)return 0;

	mean0=map0->expectation;
	mean1=map1->expectation;

	/* Accumulate the products of the paired deviations, slot by slot */
	slot=0;
	while(slot<nslots){
		covar+=(map0->memb[slot].count-mean0)*(map1->memb[slot].count-mean1);
		slot++;
	}
	covar/=nslots;

	score=covar/map0->dispersion/map1->dispersion;
	return score;
}


/*
 * Structure to sort guesser results: pairs one language map with the
 * quality score computed for it, so that an array of these can be
 * ordered with qsort() and statcmp().
 */
typedef struct {
	UDM_LANGMAP * map;	/* Candidate map; points into a map list and is not owned here */
	float quality;		/* Score returned by UdmCheckLangMap() for this map; the sort key */
} UDM_MAPSTAT;

static int statcmp(const void * i1, const void * i2){
	float fres;
	fres = ((const UDM_MAPSTAT*)(i2))->quality - ((const UDM_MAPSTAT*)(i1))->quality;
	if(fres<0)return -1;
	if(fres>0)return +1;
	return 0;
}

/*************************************************************************/

#ifndef UDM_GUESSER_STANDALONG

/*
 * Guess the charset and language of a document from its n-gram map.
 *
 * Doc->LangMap is prepared, compared against every map in
 * Indexer->Conf->LangMapList, and the results are sorted best first.
 * Walking the sorted results while the quality is at least 0.800,
 * Doc->charset and Doc->lang are filled in (with strdup()ed copies)
 * if they are not set yet; up to five of these results are logged at
 * UDM_LOG_EXTRA level.
 *
 * Returns 0 on success, including when no maps are loaded, and -1 when
 * the temporary statistics array cannot be allocated.
 */
int  UdmGuessCharSet(UDM_AGENT * Indexer,UDM_DOCUMENT * Doc){
	int i;
	UDM_MAPSTAT * mapstat;

	/* 
		TODO for Guesser
		
		There are three sources to detect charset:
		1. HTTP header:  Content-Type: ... charset=XXX
		2. <META HTTP-EQUIV="Content-Type" Content="... charset=YYY">
		3. ZZZ[i] - array of guessed charsets in mapstat[]
		good(ZZZ[n]) means that guesser returned good results for n.

		Now we may have various combinations:
		Simple situations, non-guessed and guessed charsets
		seem to be the same. At least one of non-guessed
		charset is the same with the best guessed charset
		and guessed charset gave good results:

		1. XXX==YYY==ZZZ[0],      good(ZZZ[0]). Take XXX value.
		2. No YYY, XXX=ZZZ[0] and good(ZZZ[0]). Take XXX value.
		3. No XXX, YYY=ZZZ[0] and good(ZZZ[0]). Take YYY value.
		4. XXX<>YYY, XXX==ZZZ[0], good(ZZZ[0]). Take XXX value.
		5. XXX<>YYY, YYY==ZZZ[0], good(ZZZ[0]). Take XXX value.
			4 and 5 seem to be webmaster mistake.

		There are the same five situations when ZZZ[x] is still good 
		enough, but it is not the best one, i.e. x>0 
		Choose charset in the same way.
	*/

	/* Prepare document langmap */
	UdmPrepareLangMap(&Doc->LangMap);

	/* Nothing to compare against: avoid a zero-size allocation */
	if(!Indexer->Conf->LangMapList.nmaps)return 0;

	/* Allocate memory for comparison statistics */
	mapstat=(UDM_MAPSTAT *)malloc(Indexer->Conf->LangMapList.nmaps*sizeof(UDM_MAPSTAT));
	if(!mapstat)return -1;

	for(i=0;i<Indexer->Conf->LangMapList.nmaps;i++){
		mapstat[i].quality=UdmCheckLangMap(&Indexer->Conf->LangMapList.maps[i],&Doc->LangMap);
		mapstat[i].map=&Indexer->Conf->LangMapList.maps[i];
	}

	/* Sort statistics in quality order */
	qsort(mapstat,Indexer->Conf->LangMapList.nmaps,sizeof(UDM_MAPSTAT),&statcmp);

	/* Take and log results, best language first */
	for(i=0;i<Indexer->Conf->LangMapList.nmaps;i++){
		/* Results are sorted, so everything from here on is below the threshold */
		if(mapstat[i].quality<0.800)break;
		if(!Doc->charset){
			if(mapstat[i].map->charset){
				Doc->charset=strdup(mapstat[i].map->charset);
			}
		}
		if(!Doc->lang){
			if(mapstat[i].map->lang){
				Doc->lang=strdup(mapstat[i].map->lang);
			}
		}
		/* Log at most the five best candidates */
		if(i>4)break;
		UdmLog(Indexer,UDM_LOG_EXTRA,"Guesser: %.5f %s-%s",mapstat[i].quality,mapstat[i].map->lang,mapstat[i].map->charset);
	}
	free(mapstat);
	return 0;
}

#endif



/***************************************************************/

#ifdef UDM_GUESSER_STANDALONG

/*
 * Standalone guesser: loads all language maps from LMDIR, builds an
 * n-gram map from the text read on stdin and prints one line per loaded
 * map ("quality<TAB>language<TAB>charset"), best match first. Negative
 * qualities are printed as 0.
 *
 * Exit status is 0 on success and 1 when the maps cannot be loaded or
 * memory for the statistics cannot be allocated.
 */
int main(int argc,char ** argv){
	int i;
	char buf[1024]="";
	UDM_ENV env;
	UDM_MAPSTAT * mapstat;
	UDM_LANGMAP mchar;

	/* Command line arguments are not used */
	(void)argc;
	(void)argv;

	/* Init structures */
	memset(&env,0,sizeof(env));
	memset(&mchar,0,sizeof(mchar));

	/* Load all available lang ngram maps */
	UdmLoadLangMapList(&env,LMDIR);
	if(env.errcode){
		printf("Error: '%s'\n",env.errstr);
		UdmFreeLangMapList(&env);
		return 1;
	}

	fprintf(stderr,"%d maps found\n",env.LangMapList.nmaps);

	/* Add each line statistics */
	while(fgets(buf,sizeof(buf),stdin)){
		UdmBuildLangMap(&mchar,buf,strlen(buf));
	}

	/* Prepare map to comparison */
	UdmPrepareLangMap(&mchar);

	/* No maps loaded: nothing to compare or print */
	if(!env.LangMapList.nmaps){
		UdmFreeLangMapList(&env);
		return 0;
	}

	/* Allocate memory for comparison statistics */
	mapstat=(UDM_MAPSTAT *)malloc(env.LangMapList.nmaps*sizeof(UDM_MAPSTAT));
	if(!mapstat){
		fprintf(stderr,"Can't allocate memory for map statistics\n");
		UdmFreeLangMapList(&env);
		return 1;
	}

	/* Calculate each lang map        */
	/* correlation with text          */
	/* and store in mapstat structure */

	for(i=0;i<env.LangMapList.nmaps;i++){
		mapstat[i].quality=UdmCheckLangMap(&env.LangMapList.maps[i],&mchar);
		mapstat[i].map=&env.LangMapList.maps[i];
	}

	/* Sort statistics in quality order */
	qsort(mapstat,env.LangMapList.nmaps,sizeof(UDM_MAPSTAT),&statcmp);


	/* Display results. Best language is shown first. */
	for(i=0;i<env.LangMapList.nmaps;i++){
		printf("%.10f\t%s\t%s\n",mapstat[i].quality<0?0:mapstat[i].quality,mapstat[i].map->lang,mapstat[i].map->charset);
	}

	/* Free variables */
	free(mapstat);
	UdmFreeLangMapList(&env);

	return 0;
}

#endif
