 // // htsearch.cc //9 // Command-line and CGI interface to search the databases H // Expects the databases are generated using htdig, htmerge, and htfuzzyK // Outputs HTML-ized results of the search based on the templates specified  // // // #if RELEASE V static char RCSid[] = "$Id: htsearch.cc,v 1.24.2.9 2000/02/15 22:20:02 grdetil Exp $"; #endif   #include "htsearch.h"  #include "WeightWord.h"  #include "parser.h"  #include "Display.h" #include "../htfuzzy/Fuzzy.h"  #include "cgi.h" #include "WordRecord.h"  #include "WordList.h"  #include "StringList.h"  #include "IntObject.h" #include <time.h>  #include <ctype.h> #include <signal.h>  #include "HtURLCodec.h"  #include "HtWordType.h"   ( // If we have this, we probably want it. #ifdef HAVE_GETOPT_H #include <getopt.h>  #endif   #ifdef __VMS2 #include <unistd.h> // for exit(1) == EXIT_FAILURE #include "vms_access.h"  #endif  % typedef void (*SIGNAL_HANDLER) (...);   / ResultList *htsearch(char *, List &, Parser *);   9 void setupWords(char *, List &, int, Parser *, String &); 4 void createLogicalWords(List &, String &, String &); void reportError(char *); # void convertToBoolean(List &words); + void doFuzzy(WeightWord *, List &, List &); , void addRequiredWords(List &, StringList &);
 void usage();    int			debug = 0; int			minimum_word_length = 3;    O //***************************************************************************** 
 // int main()  // int  main(int ac, char **av)  {      int			c;     extern char		*optarg; #     int		        override_config=0;      List		searchWords;-     String		configFile = DEFAULT_CONFIG_FILE;      int			pageNumber = 1;      StringMatch		limit_to;     StringMatch		exclude_these;      String		logicalWords; $     String              origPattern;'     String              logicalPattern; $     StringMatch		searchWordsPattern;     StringList		requiredWords;     int                 i;        // $      // Parse command line arguments      // /      while ((c = getopt(ac, av, "c:dv")) != -1)       {  	switch (c)  	{   	    case 'c':   		configFile = optarg; #                  override_config=1; 	  		break;   	    case 'v':   		debug++; 	  		break;   	    case 'd':   		debug++; 	  		break;  	    case '?': 	        usage();                  break;  	}       }       //;     // The total search can NEVER take more than 5 minutes.      //     alarm(5 * 60);       //      // Parse the CGI parameters.     //     char	none[] = ""; 0     cgi		input(optind < ac ? av[optind] : none);       //%     // Compile the URL limit pattern.      //!     if (input.exists("restrict"))        {  	char *sep = input["restrict"]; , 	while ((sep = strchr(sep, '\001')) != NULL) 	  *sep++ = '|';% 	limit_to.Pattern(input["restrict"]);      }       if (input.exists("exclude"))     { $        char *sep = input["exclude"];2        while ((sep = strchr(sep, '\001')) != NULL)        	 *sep++ = '|';/        exclude_these.Pattern(input["exclude"]);      }        //N     // Setup the configuration database.  First we read the compiled defaults.J     // Then we override those with defaults read in from the configurationH     // file, and finally we override some attributes with information we     // got from the HTML form.     //"     config.Defaults(&defaults[0]);9     // To allow . in filename while still being 'secure',      // e.g. htdig-f.q.d.n.conf3     if (!override_config && input.exists("config")  , 	&& (strstr(input["config"], "./") == NULL))     {  #ifdef __VMS. 	char	*configDir = getenv("HTDIG_CONFIG_DIR"); #else ( 	char	*configDir = getenv("CONFIG_DIR"); #endif 	if (configDir)  	{ 	    configFile = configDir; 	} 	else  	{ 	    configFile = CONFIG_DIR;  	}" 	if (strlen(input["config"]) == 0)$ 	  configFile = DEFAULT_CONFIG_FILE; 	else  #ifdef __VMS 	{, 	  if (strchr(configFile.get(),'/') != NULL) 	    configFile << '/'; , 	  configFile << input["config"] << ".conf"; 	} #else 3 	  configFile << '/' << input["config"] << ".conf";  #endif     } %     if (access(configFile, R_OK) < 0)      { ; 	reportError(form("Unable to read configuration file '%s'",  			 configFile.get()));      }      config.Read(configFile);       if (input.exists("method")) - 	config.Add("match_method", input["method"]);      if (input.exists("format")) . 	config.Add("template_name", input["format"]);  '     if (input.exists("matchesperpage"))      { H 	// minimum check for a valid int value of "matchesperpage" cgi variable' 	if (atoi(input["matchesperpage"]) > 0) = 	    config.Add("matches_per_page", input["matchesperpage"]);      }        if (input.exists("page")) " 	pageNumber = atoi(input["page"]);     if (input.exists("config")) ' 	config.Add("config", input["config"]); !     if (input.exists("restrict")) + 	config.Add("restrict", input["restrict"]);       if (input.exists("exclude"))) 	config.Add("exclude", input["exclude"]); !     if (input.exists("keywords")) + 	config.Add("keywords", input["keywords"]); <     requiredWords.Create(config["keywords"], " \t\r\n\001");     if (input.exists("sort")) # 	config.Add("sort", input["sort"]);   S     minimum_word_length = config.Value("minimum_word_length", minimum_word_length);   =     StringList form_vars(config["allow_in_form"], " \t\r\n"); *     for (i= 0; i < form_vars.Count(); i++)     { %       if (input.exists(form_vars[i])) / 	config.Add(form_vars[i], input[form_vars[i]]);      }    8     // Ctype-like functions for what constitutes a word.#     HtWordType::Initialize(config);        //6     // Check url_part_aliases and common_url_parts for     // errors.>     String url_part_errors = HtURLCodec::instance()->ErrMsg();  &     if (url_part_errors.length() != 0)J       reportError(form("Invalid url_part_aliases or common_url_parts: %s",/                        url_part_errors.get()));   "     Parser	*parser = new Parser(); 	      //<     // Parse the words to search for from the argument list.6     // This will produce a list of WeightWord objects.     //+     String	 originalWords = input["words"]; "     originalWords.chop(" \t\r\n");*     setupWords(originalWords, searchWords,7 	       strcmp(config["match_method"], "boolean") == 0,  	       parser, origPattern);        //A     // Convert the list of WeightWord objects to a pattern string      // that we can compile.      //B     createLogicalWords(searchWords, logicalWords, logicalPattern);       //  F     // Assemble the full pattern for excerpt matching and highlighting     //"     origPattern += logicalPattern;$     searchWordsPattern.IgnoreCase();%     searchWordsPattern.IgnorePunct(); L     searchWordsPattern.Pattern(logicalPattern);	// this should now be enough.     //searchWordsPattern.Pattern(origPattern);     //if (debug > 2);     //  cout << "Excerpt pattern: " << origPattern << "\n";        //B     // If required keywords were given in the search form, we willB     // modify the current searchWords list to include the required
     // words.      //"     if (requiredWords.Count() > 0)     { . 	addRequiredWords(searchWords, requiredWords);     }           //L     // Perform the actual search.  The function htsearch() is used for this.K     // The Dictionary it returns is then passed on to the Display object to +     // actually render the results in HTML.      //'     String	word_db = config["word_db"]; "     if (access(word_db, R_OK) < 0)     { Q 	reportError(form("Unable to read word database file '%s'\nDid you run htmerge?",  			 word_db.get()));     } A     ResultList	*results = htsearch(word_db, searchWords, parser);   '     String	index = config["doc_index"];       if (access(index, R_OK) < 0)     { R 	reportError(form("Unable to read document index file '%s'\nDid you run htmerge?", 			 index.get()));     } %     String	doc_db = config["doc_db"]; !     if (access(doc_db, R_OK) < 0)      { U 	reportError(form("Unable to read document database file '%s'\nDid you run htmerge?",  			 doc_db.get()));      }   #     Display	display(index, doc_db); #     if (display.hasTemplateError())        { F 	reportError(form("Unable to read template file '%s'\nDoes it exist?",3                          config["template_name"])); 
 	return 0;       } ,     display.setOriginalWords(originalWords);      display.setResults(results);)     display.setSearchWords(&searchWords);       display.setLimit(&limit_to);'     display.setExclude(&exclude_these); 4     display.setAllWordsPattern(&searchWordsPattern);     display.setCGI(&input); *     display.setLogicalWords(logicalWords);     if (parser->hadError()) 7 	display.displaySyntaxError(parser->getErrorMessage());      else 	display.display(pageNumber);        delete results;      delete parser;
     return 0;  }   O //*****************************************************************************  voidG createLogicalWords(List &searchWords, String &logicalWords, String &wm)  {      String		pattern;     int			i;     int			wasHidden = 0;  -     for (i = 0; i < searchWords.Count(); i++)      { 0 	WeightWord	*ww = (WeightWord *) searchWords[i]; 	if (!ww->isHidden)  	{6 	    if (strcmp(ww->word, "&") == 0 && wasHidden == 0) 		logicalWords << " and ";; 	    else if (strcmp(ww->word, "|") == 0 && wasHidden == 0)  		logicalWords << " or "; ; 	    else if (strcmp(ww->word, "!") == 0 && wasHidden == 0)  		logicalWords << " not "; 	    else if (wasHidden == 0)  	    { 		logicalWords << ww->word;  	    } 	    wasHidden = 0;  	} 	else  	    wasHidden = 1; 4 	if (ww->weight > 0			// Ignore boolean syntax stuff5 	    && !ww->isIgnore)			// Ignore short or bad words  	{ 	    if (pattern.length()) 		pattern << '|';  	    pattern << ww->word;  	}     }      wm = pattern;        if (debug)     { 2 	cerr << "LogicalWords: " << logicalWords << endl;( 	cerr << "Pattern: " << pattern << endl;     }  }    void& dumpWords(List &words, char *msg = "") {      if (debug)     {  	cerr << msg << ": '";( 	for (int i = 0; i < words.Count(); i++) 	{. 	    WeightWord	*ww = (WeightWord *) words[i];4 	    cerr << ww->word << ':' << ww->isHidden << ' '; 	} 	cerr << "'\n";      }  }   O //***************************************************************************** 5 // void setupWords(char *allWords, List &searchWords, < //		   int boolean, Parser *parser, String &originalPattern) // voidJ setupWords(char *allWords, List &searchWords, int boolean, Parser *parser, 	   String &originalPattern) {      List	tempWords;      int		i;        //K     // Parse the words we need to search for.  It should be a list of words D     // with optional 'and' and 'or' between them.  The list of wordsF     // will be put in the searchWords list and at the same time in the)     // String pattern separated with '|'.      //>     WordList	badWords;		// Just used to check for valid words.2     badWords.BadWordFile(config["bad_word_list"]);       //G     // Convert the string to a list of WeightWord objects.  The special L     // characters '(' and ')' will be put into their own WeightWord objects.     //3     unsigned char	*pos = (unsigned char*) allWords;      unsigned char	t;     String		word; <     // Why use a char type if String is the new char type!!!<     char		*prefix_suffix = config["prefix_match_character"];     while (*pos)     { 
 	while (1) 	{ 	    t = *pos++; 	    if (isspace(t)) 	    { 		continue;  	    }0 	    else if (boolean && (t == '(' || t == ')')) 	    { 		char	s[2]; 		s[0] = t;  		s[1] = '\0';) 		tempWords.Add(new WeightWord(s, -1.0));  		break; 	    }, 	    else if (HtIsWordChar(t) || t == ':' ||A 			 (strchr(prefix_suffix, t) != NULL) || (t >= 161 && t <= 255))  	    { 		word = 0; ! 		while (t && (HtIsWordChar(t) || R 			     t == ':' || (strchr(prefix_suffix, t) != NULL) || (t >= 161 && t <= 255))) 		{  		    word << (char) t;  		    t = *pos++;  		}    		pos--; 	  	word.lowercase(); 6 		if (boolean && mystrcasecmp(word.get(), "and") == 0) 		{ / 		    tempWords.Add(new WeightWord("&", -1.0));  		} : 		else if (boolean && mystrcasecmp(word.get(), "or") == 0) 		{ / 		    tempWords.Add(new WeightWord("|", -1.0));  		} ; 		else if (boolean && mystrcasecmp(word.get(), "not") == 0)  		{ / 		    tempWords.Add(new WeightWord("!", -1.0));  		}  		else 		{ * 		    // Add word to excerpt matching list% 		    originalPattern << word << "|"; ! 	  	    HtStripPunctuation(word); 1 		    WeightWord	*ww = new WeightWord(word, 1.0); $ 		    if (!badWords.IsValid(word) ||' 			word.length() < minimum_word_length)  		    {  			ww->isIgnore = 1; 			tempWords.Add(ww);  		    } 
 		    else 		    {  			tempWords.Add(ww);  		    }  		}  		break; 	    } 	}     }   &     dumpWords(tempWords, "tempWords"); 	      //D     // If the user specified boolean expression operators, the wholeC     // expression has to be syntactically correct.  If not, we need       // to report a syntax error.     //     if (boolean)     { & 	if (!parser->checkSyntax(&tempWords)) 	{, 	    for (i = 0; i < tempWords.Count(); i++) 	    {  		searchWords.Add(tempWords[i]); 	    } 	    tempWords.Release();  	    return;! //			reportError("Syntax error");  	}     }      else     {  	convertToBoolean(tempWords);      }  	 $     dumpWords(tempWords, "Boolean"); 	      //O     // We need to assign weights to the words according to the search_algorithm      // configuration attribute. H     // For algorithms other than exact, we need to also do word lookups.     //8     StringList	algs(config["search_algorithm"], " \t,");     List		algorithms;      String		name, weight;      double		fweight;     Fuzzy		*fuzzy = 0;       //E     // Generate the list of algorithms to use and associate the given      // weights with them.      //&     for (i = 0; i < algs.Count(); i++)     {  	name = strtok(algs[i], ":");  	weight = strtok(0, ":");  	if (name.length() == 0) 	    name = "exact"; 	if (weight.length() == 0) 	    weight = "1"; 	fweight = atof(weight);  % 	fuzzy = Fuzzy::getFuzzyByName(name);  	if (fuzzy)  	{ 	    fuzzy->setWeight(fweight);  	    fuzzy->openIndex(config); 	    algorithms.Add(fuzzy);  	}     }   &     dumpWords(searchWords, "initial"); 	      //7     // For each of the words, apply all the algorithms.      //+     for (i = 0; i < tempWords.Count(); i++)      { . 	WeightWord	*ww = (WeightWord *) tempWords[i];% 	if (ww->weight > 0 && !ww->isIgnore)  	{ 	    // - 	    // Apply all the algorithms to the word.  	    //  	    if (debug) 0 	      cerr << "Fuzzy on: " << ww->word << endl;* 	    doFuzzy(ww, searchWords, algorithms); 	    delete ww;  	} 	else  	{ 	    // C 	    // This is '(', ')', '&', or '|'.  These will be automatically + 	    // transfered to the searchWords list.  	    //  	    if (debug) & 		cerr << "Add: " << ww->word << endl; 	    searchWords.Add(ww);  	}' 	dumpWords(searchWords, "searchWords");      }      tempWords.Release();!     // Does the next thing work??  //    algorithms.Start_Get(); 7 //    while ((fuzzy = (Fuzzy *) algorithms.Get_Next()))  //	delete fuzzy; }     O //*****************************************************************************  void< doFuzzy(WeightWord *ww, List &searchWords, List &algorithms) {      List		fuzzyWords;      List		weightWords;     Fuzzy		*fuzzy;     WeightWord	*newWw;     String		*word;       algorithms.Start_Get(); 5     while ((fuzzy = (Fuzzy *) algorithms.Get_Next()))      {          if (debug > 1)% 	  cout << "   " << fuzzy->getName(); ' 	fuzzy->getWords(ww->word, fuzzyWords);  	fuzzyWords.Start_Get();2 	while ((word = (String *) fuzzyWords.Get_Next())) 	{ 	    if (debug > 1) " 	      cout << " " << word->get();= 	    newWw = new WeightWord(word->get(), fuzzy->getWeight()); " 	    newWw->isExact = ww->isExact;$ 	    newWw->isHidden = ww->isHidden; 	    weightWords.Add(newWw); 	} 	if (debug > 1)  	  cout << endl; 	fuzzyWords.Destroy();     }        //E     // We now have a list of substitute words.  They need to be added      // to the searchWords.     //     if (weightWords.Count())     {  	if (weightWords.Count() > 1) 0 	    searchWords.Add(new WeightWord("(", -1.0));. 	for (int i = 0; i < weightWords.Count(); i++) 	{ 	    if (i > 0) - 		searchWords.Add(new WeightWord("|", -1.0)); % 	    searchWords.Add(weightWords[i]);  	} 	if (weightWords.Count() > 1)c0 	    searchWords.Add(new WeightWord(")", -1.0));     }g     weightWords.Release(); }a    O //*****************************************************************************e% // void convertToBoolean(List &words)a // void convertToBoolean(List &words)2 {/     List	list;     int		i;;=     int		do_and = strcmp(config["match_method"], "and") == 0;i       if (words.Count() == 0)i 	return;     list.Add(words[0]);y'     for (i = 1; i < words.Count(); i++)o     {  	if (do_and)) 	    list.Add(new WeightWord("&", -1.0));e 	elsej) 	    list.Add(new WeightWord("|", -1.0));. 	list.Add(words[i]);     }c     words.Release();  &     for (i = 0; i < list.Count(); i++)     {e 	words.Add(list[i]);     }V     list.Release();< }o    O //*****************************************************************************iJ // Dictionary *htsearch(char *wordfile, List &searchWords, Parser *parser)F //   This returns a dictionary indexed by document ID and containing a# //   List of WordReference objects.  // ResultList *; htsearch(char *wordfile, List &searchWords, Parser *parser)) {      //1     // Pick the database type we are going to user     //)     ResultList	*matches = new ResultList;&      if (searchWords.Count() > 0)     {	1 	Database	*dbf = Database::getDatabaseInstance();*   	dbf->OpenRead(wordfile);*   	parser->setDatabase(dbf);' 	parser->parse(&searchWords, *matches);  	dbf->Close(); 	delete dbf;     }n 		     return matches;* }a    O //*****************************************************************************gF // Modify the search words list to include the required words as well.G // This is done by putting the existing search words in parenthesis and 5 // appending the required words separated with "and".  void> addRequiredWords(List &searchWords, StringList &requiredWords) {g5     searchWords.Insert(new WeightWord("(", -1.0), 0); /     searchWords.Add(new WeightWord(")", -1.0));/  3     for (int i = 0; i < requiredWords.Count(); i++)      {	, 	searchWords.Add(new WeightWord("&", -1.0));8 	searchWords.Add(new WeightWord(requiredWords[i], 1.0));     }  }     O //***************************************************************************** J // Report an error.  Since we don' know if we are running as a CGI or not,D // we will assume this is the first thing returned by a CGI program. // void reportError(char *msg) { .     cout << "Content-type: text/html\r\n\r\n";A     cout << "<html><head><title>htsearch error</title></head>\n"; +     cout << "<body bgcolor=\"#ffffff\">\n"; (     cout << "<h1>ht://Dig error</h1>\n";J     cout << "<p>htsearch detected an error.  Please report this to the\n";C     cout << "webmaster of this site.  The error message is:</p>\n";"=     cout << "<pre>\n" << msg << "\n</pre>\n</body></html>\n";c     exit(1); }"  O //*****************************************************************************| // void usage()eM //   Display program usage information--assumes we're running from a cmd linea // void usage() { E   cout << "usage: htsearch [-v][-d][-c configfile] [query_string]\n";uC   cout << "This program is part of ht://Dig " << VERSION << "\n\n";i   cout << "Options:\n";nJ   cout << "\t-v -d\tVerbose mode.  This increases the verbosity of the\n";F   cout << "\t\tprogram.  Using more than 2 is probably only useful\n";D   cout << "\t\tfor debugging purposes.  The default verbose mode\n";J   cout << "\t\tgives a progress on what it is doing and where it is.\n\n";   cout << "\t-c configfile\n";F   cout << "\t\tUse the specified configuration file instead on the\n";   cout << "\t\tdefault.\n\n";{P   cout << "\tquery_string\tA CGI-style query string can be given as a single\n";O   cout << "\t\targument, and is only used if the REQUEST_METHOD environment\n"; I   cout << "\t\tvariable is not set.  If no query_string is given, and\n";iS   cout << "\t\tREQUEST_METHOD is not set, htsearch will prompt for the query.\n\n";l
   exit(0); }u"config"] << ".conf";  #endif     } %     if (access(configFile, R                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                