/* Lumberjack
 *
 * FX <fx@phenoelit.de>
 * Phenoelit (http://www.phenoelit.de)
 * (c) 1999, 2k
 *
 */
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

#include "list.h"

#define BADWORDLIST "badword.txt"

#define MIN_LENGTH 3
#define MAX_LENGTH 20

void clean_str(char *a);
void split(char *a);
void join(char *a);
void remove_bad();

/* ldif_line_has()
 * Case-insensitive substring test used to filter LDIF lines.
 *
 * line   - NUL-terminated text to search
 * needle - NUL-terminated pattern; must already be lower case
 *
 * Returns 1 when needle occurs anywhere in line (ignoring the case of
 * line), 0 otherwise.
 */
static int ldif_line_has(const char *line, const char *needle) {
    size_t nlen = strlen(needle);

    if (nlen == 0) {
        return 1;
    }
    for (; *line != '\0'; line++) {
        size_t k = 0;

        /* A NUL in line never equals a needle byte, so the scan cannot
         * run past the end of line. */
        while (k < nlen &&
               tolower((unsigned char)line[k]) == (unsigned char)needle[k]) {
            k++;
        }
        if (k == nlen) {
            return 1;
        }
    }
    return 0;
}

/* ldif_collect()
 * Main function for collecting candidate words from an LDIF file.
 *
 * Reads ldifname line by line.  Lines that mention the userpassword
 * attribute (in any letter case) and lines starting with a space are
 * skipped; every other line is passed through join(), clean_str() and
 * split(), which appends the resulting words to the list.  Afterwards the
 * list is de-duplicated with list_unique() and filtered with remove_bad().
 *
 * When list_verbose is set, a percentage based on the file size is written
 * to stderr after each processed line.
 *
 * Returns 0 on success, -1 if the file cannot be opened.
 */
int ldif_collect(char *ldifname) {
    enum { LINE_BUF = 2048 };

    FILE *ldif;
    char instr[LINE_BUF];
    struct stat s;
    unsigned long int s_done = 0;
    int have_size;

    if ((ldif = fopen(ldifname, "rt")) == NULL) {
        fprintf(stderr,"ldif_collect(): failed to open %s for read\n",ldifname);
        return (-1);
    }

    /* The size is only needed for the progress display; without a usable
     * size the percentage is simply not printed (avoids reading an
     * unset struct and dividing by zero). */
    have_size = (stat(ldifname, &s) == 0 && s.st_size > 0);

    while (fgets(instr, LINE_BUF - 1, ldif) != NULL) {
        s_done += strlen(instr);

        /* Never let password values end up in the word list.  The test is
         * done case-insensitively because the line has not been lower-cased
         * yet at this point. */
        if (ldif_line_has(instr, "userpassword:")) {
            continue;
        }
        /* Lines starting with a space are skipped. */
        if (instr[0] == ' ') {
            continue;
        }

        join(instr);
        clean_str(instr);
        split(instr);

        if (list_verbose && have_size) {
            fprintf(stderr,"\r%5.2f %%",((float)s_done/(float)s.st_size)*100);
        }
    }
    if (list_verbose) {
        fprintf(stderr,"\n");
    }
    fflush(stderr);

    list_unique();
    remove_bad();

    fclose(ldif);

    return 0;
}

/* clean_str()
 * Normalises a line in place so that it only consists of words separated
 * by single spaces.
 *
 *  - one trailing newline, if present, is removed
 *  - every character that is not an ASCII letter or digit becomes a space
 *  - runs of spaces are collapsed to one space (a single leading or
 *    trailing space is kept)
 *
 * a - NUL-terminated, writable string; the result is never longer than the
 *     input.  The bytes between the new and the old end of the string are
 *     set to zero.
 */
void clean_str(char *a) {
    size_t len = strlen(a);
    size_t rd;
    size_t wr = 0;

    /* Guard the empty string: there is no last character to inspect. */
    if (len == 0) {
        return;
    }
    if (a[len - 1] == '\n') {
        a[--len] = '\0';
    }

    for (rd = 0; rd < len; rd++) {
        char c = a[rd];

        if (!((c >= 'A' && c <= 'Z') ||
              (c >= 'a' && c <= 'z') ||
              (c >= '0' && c <= '9'))) {
            c = ' ';
        }
        /* The last character written is a space exactly when the previous
         * input character mapped to a space, so comparing against it is
         * enough to collapse runs. */
        if (rd > 0 && c == ' ' && a[wr - 1] == ' ') {
            continue;
        }
        a[wr++] = c;
    }

    /* Terminate and clear the now unused tail of the original string. */
    memset(a + wr, 0, len - wr + 1);
}

/* join()
 * Lower-cases a line and removes embedded newlines, in place.
 *
 * The first character is copied unchanged: it is neither lower-cased nor
 * removed if it is a newline.  Every following character is dropped when it
 * is '\n' and converted with tolower() otherwise.
 * NOTE(review): leaving the first character untouched looks like an
 * off-by-one, but it is kept because the rest of the pipeline currently
 * sees that output - confirm before changing.
 *
 * a - NUL-terminated, writable string; the result is never longer than the
 *     input.  The bytes between the new and the old end of the string are
 *     set to zero.
 */
void join(char *a) {
    size_t rd;
    size_t wr = 1;

    if (a[0] == '\0') {
        return;
    }

    for (rd = 1; a[rd] != '\0'; rd++) {
        if (a[rd] != '\n') {
            /* tolower() requires a value representable as unsigned char;
             * a plain (possibly negative) char is undefined behaviour. */
            a[wr++] = (char)tolower((unsigned char)a[rd]);
        }
    }

    /* Terminate and clear the now unused tail of the original string. */
    memset(a + wr, 0, rd - wr + 1);
}


/* split()
 * Cuts a space separated line into words and hands the acceptable ones to
 * list_append().
 *
 * Each space in a is overwritten with a NUL, so the buffer is modified.
 * A word is appended only when its length is greater than MIN_LENGTH and
 * at most MAX_LENGTH.
 *
 * a - NUL-terminated, writable string
 */
void split(char *a) {
    char *word = a;
    char *gap;
    size_t len;

    for (;;) {
        gap = strchr(word, ' ');
        if (gap != NULL) {
            *gap = '\0';        /* terminate the current word */
        }

        len = strlen(word);
        if ((len > MIN_LENGTH) && (len <= MAX_LENGTH)) {
            list_append(word);
        }

        if (gap == NULL) {
            break;              /* that was the last word of the line */
        }
        word = gap + 1;
    }
}
	
/* remove_bad()
 * Removes unwanted entries from the word list.
 *
 * First pass: every list entry whose first two characters are both digits
 * is deleted.
 * Second pass: BADWORDLIST is read line by line; for each line (without
 * its line terminator) that list_find() reports as present, list_delete()
 * is called.
 *
 * Progress is reported on stdout.  If BADWORDLIST cannot be opened, a
 * message is written to stderr and the program exits with status -1.
 */
void remove_bad(void) {
    enum { BADWORD_BUF = 2049 };

    char *entry;
    char line[BADWORD_BUF];
    FILE *badl;

    printf("Cleaning ...");
    fflush(stdout);

    /* NOTE(review): entries are deleted while iterating; whether
     * list_next() after list_delete() skips an element depends on list.h -
     * confirm. */
    list_rewind();
    while (list_next()==0) {
        entry=(char *)list_content();
        if ( ((entry[0]>='0')&&(entry[0]<='9')) &&
             ((entry[1]>='0')&&(entry[1]<='9')) ) {
            list_delete();
        }
    }

    if ((badl=fopen(BADWORDLIST,"rt"))==NULL) {
        fprintf(stderr,"failed to open %s.\nmake shure the bad word list is available.\n",BADWORDLIST);
        exit (-1);
    }

    while (fgets(line, BADWORD_BUF - 1, badl) != NULL) {
        /* Cut at the line terminator only.  Blindly dropping the last
         * character would truncate a final line that has no newline and
         * would write before the buffer for an empty string. */
        line[strcspn(line, "\r\n")] = '\0';
        if (list_find(line)) {
            list_delete();
        }
    }
    fclose(badl);

    printf(" done\n");
}
	
