 // // Retriever.cc  // // Implementation of Retriever //B // $Id: Retriever.cc,v 1.36.2.20 2000/02/15 23:10:36 grdetil Exp $ //   #include "Retriever.h" #include "htdig.h" #include "WordList.h"  #include "URLRef.h"  #include "Server.h"  #include "Parsable.h"  #include "Document.h"  #include "StringList.h"  #include <pwd.h> #include <signal.h>  #include <assert.h>  #include <stdio.h> #include "HtWordType.h"    static WordList	words; static int noSignal;   #ifdef __VMS2 #include <unistd.h> // for exit(1) == EXIT_FAILURE   #include <unixlib.h>  // Action routine to decc$to_vms& // Stupid call sequence, if you ask me static char *tmp_GetUnixName; & static int GetUnixName(char *UnixName) { ?     // Hopefully the contents of UnixName is available globally      tmp_GetUnixName = UnixName; 
     return 0;  }  #endif    O //*****************************************************************************  // Retriever::Retriever()  //( Retriever::Retriever(RetrieverLog flags) {      FILE	*urls_parsed;       currenthopcount = 0;:     max_hop_count = config.Value("max_hop_count", 999999); 		     //?     // Initialize the weight factors for words in the different      // HTML headers      //=     factor[0] = config.Double("text_factor"); // Normal words .     factor[1] = config.Double("title_factor");2     factor[2] = config.Double("heading_factor_1");2     factor[3] = config.Double("heading_factor_2");2     factor[4] = config.Double("heading_factor_3");2     factor[5] = config.Double("heading_factor_4");2     factor[6] = config.Double("heading_factor_5");2     factor[7] = config.Double("heading_factor_6");     factor[8] = 0;     factor[9] = 0;2     factor[10] = config.Double("keywords_factor");:     factor[11] = config.Double("meta_description_factor"); 	      //3     // Open the file to which we will append words.      //*     String	filename = config["word_list"];!     words.WordTempFile(filename); /     words.BadWordFile(config["bad_word_list"]);        doc = new Document(); ?     minimumWordLength = config.Value("minimum_word_length", 3);        log = flags;     // if in restart mode !     if (Retriever_noLog != log )       { '     String	filelog = config["url_log"];      char buffer[1000];	// FIXME      int  l;   & 	urls_parsed = fopen(filelog,   "r" ); 	if (0 != urls_parsed)	         { 9   	    // read all url discovered but not fetched before  7 	    while (fgets(buffer, sizeof(buffer), urls_parsed))        	    { 	       l = strlen(buffer); * 	       assert(l && buffer[l -1] == '\n'); 	       buffer[l -1] = 0;  	       Initial(buffer,2);
             }               fclose(urls_parsed); 	}         unlink(filelog);     }  }     O //*****************************************************************************  // Retriever::~Retriever() // Retriever::~Retriever()  {      delete doc;  }     O //***************************************************************************** 9 // void Retriever::setUsernamePassword(char *credentials)  // void1 Retriever::setUsernamePassword(char *credentials)  { *     doc->setUsernamePassword(credentials); }     O //***************************************************************************** 0 // void Retriever::Initial(char *list, int from)3 //   Add a single URL to the list of URLs to visit. K //   Since URLs are stored on a per server basis, we first need to find the 1 //   the correct server to add the URL's path to.  //, //   from == 0 urls in db.docs and no db.logI //   from == 1 urls in start_url add url only if not already in the list  # //   from == 2 add url from db.log  6 //   from == 3 urls in db.docs and there was a db.log  // void( Retriever::Initial(char *list, int from) {      //6     // Split the list of urls up into individual urls.     //#     StringList	tokens(list, " \t");      String	sig;      String      url;     Server	*server;   ,     for (int i = 0; i < tokens.Count(); i++)     {  	URL	u(tokens[i]);, 	server = (Server *) servers[u.signature()]; 	url = u.get();  	if (debug > 2) B            cout << "\t" << from << ":" << (int) log << ":" << url;
 	if (!server)  	{" 	    String robotsURL = "http://";, 	    robotsURL << u.host() << "/robots.txt";> 	    StringList *localRobotsFiles = GetLocal(robotsURL.get());? 	    server = new Server(u.host(), u.port(), localRobotsFiles); ( 	    servers.Add(u.signature(), server); 	    delete localRobotsFiles;  	}' 	else if (from && visited.Exists(url))   	{ 	    if (debug > 2) +                 cout << " skipped" << endl;  	    continue; 	}1         if (Retriever_noLog == log || from != 3)  	         {  	    if (debug > 2) "                 cout << " pushed";6 	    server->push(u.get(), 0, 0, IsLocalURL(u.get()));	         }  	if (debug > 2)             cout << endl; 	visited.Add(url, 0);      }  }     O //***************************************************************************** 0 // void Retriever::Initial(List &list, int from) // void' Retriever::Initial(List &list,int from)  {      list.Start_Get();      String	*str;B     // from == 0 is an optimisation for pushing url in update mode     //  assuming that *     // 1) there's many more urls in docdb      // 2) they're pushed first+     // 3) there's no duplicate url in docdb C     // then they don't need to be check against already pushed urls )     // But 2) can be false with -l option      //2     // FIXME it's nasty, what have to be test is :G     // we have urls to push from db.docs but do we already have them in H     // db.log? For this it's using a side effect with 'visited' and thatR     // urls in db.docs are only pushed via this method, and that db.log are pushed/     // first, db.docs second, start_urls third!      //  #     if (!from && visited.Count())        {         from = 3;     } .     while ((str = (String *) list.Get_Next()))     {  	Initial(str->get(),from);     }  }   O //*****************************************************************************  // static void sigexit(int) {   noSignal=0; }   O //*****************************************************************************  // static void sig_handlers  //	initialise signal handlers  // static void  sig_handlers(void) {  struct sigaction action;    /* SIGINT, SIGQUIT, SIGTERM */   action.sa_handler = sigexit;   sigemptyset(&action.sa_mask);  action.sa_flags = 0; +  if (sigaction(SIGINT, &action, NULL) != 0) 0 	reportError("Cannot install SIGINT handler\n");+  if(sigaction(SIGQUIT, &action, NULL) != 0) 1 	reportError("Cannot install SIGQUIT handler\n"); +  if(sigaction(SIGTERM, &action, NULL) != 0) 1 	reportError("Cannot install SIGTERM handler\n"); *  if(sigaction(SIGHUP, &action, NULL) != 0)0 	reportError("Cannot install SIGHUP handler\n"); }     O //*****************************************************************************  // void Retriever::Start()D //   This is the main loop of the retriever.  We will go through the= //   list of paths stored for each server.  While parsing the E //   retrieved documents, new paths will be added to the servers.  We 2 //   return if no more paths need to be retrieved. // void Retriever::Start() {      //F     // Main digger loop.  The todo list should initialy have the startC     // URL and all the URLs which were seen in a previous dig.  The B     // loop will continue as long as there are more URLs to visit.     //     int		more = 1;     Server	*server;      URLRef	*ref;          //  =     // Always sig . The delay bother me but a bad db is worst      //  "     if ( Retriever_noLog != log )      {  	sig_handlers();     }      noSignal = 1;        while (more && noSignal)     { 
 	more = 0; 		 	// E 	// Go through all the current servers in sequence.  We take only one F 	// URL from each server during this loop.  This ensures that the load) 	// on the servers is distributed evenly.  	//  	servers.Start_Get(); D 	while ( (server = (Server *)servers.Get_NextElement()) && noSignal) 	{ 	    if (debug > 1) ; 		cout << "pick: " << server->host() << ", # servers = " <<  		    servers.Count() << endl; 	      	    ref = server->pop();  	    if (!ref), 		continue;		      // Nothing on this server5 	    // There may be no more documents, or the server , 	    // has passed the server_max_docs limit   	    // = 	    // We have a URL to index, now.  We need to register the ; 	    // fact that we are not done yet by setting the 'more'  	    // variable.  	    //  	    more = 1;   	    // ! 	    // Deal with the actual URL. @ 	    // We'll check with the server to see if we need to sleep() 	    // before parsing it. 	    // G 	    server->delay();   // This will pause if needed and reset the time  	    parse_url(*ref);              delete ref;  	}     }      // if we exited on signal -     if (Retriever_noLog != log && !noSignal)       {      FILE	*urls_parsed;'     String	filelog = config["url_log"]; (         // save url seen but not fetched& 	urls_parsed = fopen(filelog,   "w" ); 	if (0 == urls_parsed) 	{; 	    reportError(form("Unable to create URL log file '%s'",  			     filelog.get())); 	}         else {   	   servers.Start_Get();: 	   while ((server = (Server *)servers.Get_NextElement())) 	   { 0    	      while (NULL != (ref = server->pop()))                { :       	          fprintf(urls_parsed, "%s\n", ref->URL());  		  delete ref;               }             }            fclose(urls_parsed); 	         }      }  }     O //***************************************************************************** & // void Retriever::parse_url(URL &url) // void$ Retriever::parse_url(URLRef &urlRef) {      URL			url;     DocumentRef	*ref;      int			old_document;      time_t		date;      static int	index = 0;      Server		*server;D     static int		local_urls_only = config.Boolean("local_urls_only");  G //	cout << "**** urlRef URL = '" << urlRef.URL() << "', referer = '" <<  //		urlRef.Referer() << "'\n";     url.parse(urlRef.URL()); 	 (     currenthopcount = urlRef.HopCount();     ref = GetRef(url.get());     if (ref)     {  	// ? 	// We already have an entry for this document in our database. ? 	// This means we can get the document ID and last modification  	// time from there. 	//  	current_id = ref->DocID();  	date = ref->DocTime();  	if (ref->DocAccessed()) 	  old_document = 1;D 	else // we haven't retrieved it yet, so we only have the first link 	  old_document = 0;A 	ref->DocBackLinks(ref->DocBackLinks() + 1); // we had a new link  	ref->DocAccessed(time(0)); ! 	ref->DocState(Reference_normal); +         currenthopcount=ref->DocHopCount();      }      else     {  	// : 	// Never seen this document before.  We need to create an? 	// entry for it.  This implies that it gets a new document ID.  	// 
 	date = 0; 	current_id = docs.NextDocID();  	ref = new DocumentRef;  	ref->DocID(current_id); 	ref->DocURL(url.get());! 	ref->DocState(Reference_normal);  	ref->DocAccessed(time(0)); *         ref->DocHopCount(currenthopcount);< 	ref->DocBackLinks(1); // We had to have a link to get here! 	old_document = 0;     }   #     words.DocumentID(ref->DocID());        if (debug > 0)     {  	//  	// Display progress 	//  	cout << index++ <<  	    ':' << current_id <<  	    ':' << currenthopcount << 	    ':' << url.get() << ": "; 	cout.flush();     }      3     // Reset the document to clean out any old data      doc->Reset();        doc->Url(url.get());#     doc->Referer(urlRef.Referer());        base = doc->Url();  D     // Retrive document, first trying local file access if possible.     Document::DocStatus status; 1     server = (Server *) servers[url.signature()]; 6     StringList *local_filenames = GetLocal(url.get());     if (local_filenames)     {            if (debug > 1)* 	    cout << "Trying local files" << endl;;         status = doc->RetrieveLocal(date, local_filenames); 3         if (status == Document::Document_not_local) 	         {  	    if (local_urls_only) ( 		status = Document::Document_not_found;* 	    else if (server && !server->IsDead()) 	    {         	if (debug > 1) < 		    cout << "Local retrieval failed, trying HTTP" << endl;# 		status = doc->RetrieveHTTP(date);  	    }	 	    else ( 		status = Document::Document_no_server;	         }          delete local_filenames;      } )     else if (server && !server->IsDead()) )         status = doc->RetrieveHTTP(date);      else' 	status = Document::Document_no_server;        current_ref = ref; 	      //E     // Determine what to do by looking at the status code returned by &     // the Document retrieval process.     //     switch (status)      {  	case Document::Document_ok: 	    trackWords = 1; 	    if (old_document) 	    {, 	      if (doc->ModTime() == ref->DocTime()) 		{  		  if (debug)3 		    cout << " retrieved but not changed" << endl;  		  words.MarkScanned();
 		  break; 		}  		//7 		// Since we already had a record of this document and 6 		// we were able to retrieve it, it must have changed8 		// since the last time we scanned it.  This means that7 		// we need to assign a new document ID to it and mark  		// the old one as obsolete.  		// 		words.MarkModified(); - 	        int backlinks = ref->DocBackLinks(); 
 		delete ref;   		current_id = docs.NextDocID(); 		words.DocumentID(current_id);  		ref = new DocumentRef; 		ref->DocID(current_id);  		ref->DocURL(url.get()); " 		ref->DocState(Reference_normal); 		ref->DocAccessed(time(0));$ 		ref->DocHopCount(currenthopcount); 		ref->DocBackLinks(backlinks);  		if (debug) 		    cout << " (changed) "; 	    }- 	    RetrievedDocument(*doc, url.get(), ref); B 	    // Hey! If this document is marked noindex, don't even bother> 	    // adding new words. Mark this as gone and get rid of it!. 	    if (ref->DocState() == Reference_noindex) 	      words.MarkModified();	 	    else  	      words.Flush();  	    if (debug) . 		cout << " size = " << doc->Length() << endl; 	    break;   % 	case Document::Document_not_changed:  	    if (debug) ! 		cout << " not changed" << endl;  	    words.MarkScanned();  	    break;   # 	case Document::Document_not_found: ( 	    ref->DocState(Reference_not_found); 	    if (debug)  		cout << " not found" << endl;  	    recordNotFound(url.get(), 			   urlRef.Referer(), $ 			   Document::Document_not_found); 	    words.MarkGone(); 	    break;   ! 	case Document::Document_no_host: ( 	    ref->DocState(Reference_not_found); 	    if (debug) $ 		cout << " host not found" << endl; 	    recordNotFound(url.get(), 			   urlRef.Referer(), " 			   Document::Document_no_host); 	    words.MarkGone(); 	    if (server) 		server->IsDead(1); 	    break;   # 	case Document::Document_no_server: ( 	    ref->DocState(Reference_not_found); 	    if (debug) ' 		cout << " no server running" << endl;  	    recordNotFound(url.get(), 			   urlRef.Referer(), $ 			   Document::Document_no_server); 	    words.MarkGone(); 	    if (server) 		server->IsDead(1); 	    break;   " 	case Document::Document_not_html: 	    if (debug)  		cout << " not HTML" << endl; 	    words.MarkGone(); 	    break;   " 	case Document::Document_redirect: 	    if (debug)  		cout << " redirect" << endl; 	    words.MarkGone();* 	    got_redirect(doc->Redirected(), ref); 	    break;  	     .        case Document::Document_not_authorized: 	    if (debug) ) 	      cout << " not authorized" << endl;  	    break;   (       case Document::Document_not_local: 	   if (debug)# 	     cout << " not local" << endl; 
 	   break;     }      docs.Add(*ref);      delete ref;  }     O //***************************************************************************** P // void Retriever::RetrievedDocument(Document &doc, char *url, DocumentRef *ref)J //   We found a document that needs to be parsed.  Since we don't know theG //   document type, we'll let the Document itself return an appropriate K //   Parsable object which we can call upon to parse the document contents.  // voidE Retriever::RetrievedDocument(Document &doc, char *, DocumentRef *ref)  {      n_links = 0;     current_ref = ref;     current_anchor_number = 0;     current_title = 0;     current_head = 0;      current_meta_dsc = 0;        //C     // Create a parser object and let it have a go at the document. F     // We will pass ourselves as a callback object for all the got_*()     // routines.B     // This will generate the Parsable object as a specific parser     //+     Parsable	*parsable = doc.getParsable();e"     parsable->parse(*this, *base);       //D     // We don't need to dispose of the parsable object since it will     // automatically be reused.      //       //$     // Update the document reference     //     ref->DocHead(current_head);t&     ref->DocMetaDsc(current_meta_dsc);      ref->DocTime(doc.ModTime());!     ref->DocTitle(current_title);      ref->DocSize(doc.Length());      ref->DocAccessed(time(0));     ref->DocLinks(n_links);n:     ref->DocImageSize(ref->DocImageSize() + doc.Length()); }p    O //*****************************************************************************e# // int Retriever::Need2Get(char *u)/A //   Return TRUE if we need to retrieve the given url.  This will*4 //   check the list of urls we have already visited. // intr Retriever::Need2Get(char *u) {E     static String	url;     url = u;        return !visited.Exists(url); }"      O //***************************************************************************** % // int Retriever::IsValidURL(char *u) E //   Return TRUE if we need to retrieve the given url.  We will check  //   for limits here.u // inte Retriever::IsValidURL(char *u) {D$     static Dictionary	*invalids = 0;"     static Dictionary	*valids = 0;       //@     // Invalid extensions will be kept in a dictionary for quickC     // lookup.  Since the dictionary is static to this function, wei8     // need to initialize it the first time we get here.     //     if (!invalids)     {a9 	// A list of bad extensions, separated by spaces or tabso% 	String	t = config["bad_extensions"];e 	String	lowerp;c 	char	*p = strtok(t, " \t"); 	invalids = new Dictionary;p
 	while (p) 	{' 	    // Extensions are case insensitivel 	    lowerp = p; 	    lowerp.lowercase(); 	    invalids->Add(lowerp, 0); 	    p = strtok(0, " \t"); 	}     }e       ///     // Valid extensions are performed similarlyl     //     if (!valids)     { = 	// A list of allowed extensions, separated by spaces or tabs ' 	String	t = config["valid_extensions"];l 	String	lowerp;b 	char	*p = strtok(t, " \t"); 	valids = new Dictionary;p
 	while (p) 	{' 	    // Extensions are case insensitive  	    lowerp = p; 	    lowerp.lowercase(); 	    valids->Add(lowerp, 0); 	    p = strtok(0, " \t"); 	}     })       static String	url;     url = u;       //C     // Currently, we only deal with HTTP URLs.  Gopher and ftp will      // come later...  ***FIX***      //;     if (strstr(u, "/../") || strncmp(u, "http://", 7) != 0)g       {  	if (debug > 2)*? 	  cout << endl <<"   Rejected: Not an http or relative link!";* 	return FALSE;       }t       //C     // If the URL contains any of the patterns in the exclude list,*     // mark it as invalid*     //D     if (excludes.hasPattern()) // Make sure there's an exclude list!       { 1 	int retValue;     // Returned value of findFirsts4 	int myWhich = 0;    // Item # that matched [0 .. n]4 	int myLength = 0;   // Length of the matching value7 	retValue = excludes.FindFirst(url, myWhich, myLength);/ 	if (retValue >= 0)i 	  { 	    if (debug > 2)/ 	      {/ 		myWhich++;         // [0 .. n] --> [1 .. n+1]nA 		cout << endl <<"  Rejected: Item in the exclude list: item # "; 5 		cout << myWhich << " length: " << myLength << endl;/ 	      } 	    return FALSE; 	  }       }g       //?     // See if the path extension is in the list of invalid ones      //"     char	*ext = strrchr(url, '.');     String	lowerext;C     if (ext && strchr(ext, '/'))	// Ignore a dot if it's not in the,3       ext = NULL;			// final component of the path.      if (ext)       {. 	lowerext = ext;< 	int parm = lowerext.indexOf('?');	// chop off URL parameter 	if (parm >= 0)r- 	    lowerext.chop(lowerext.length() - parm);u 	lowerext.lowercase();  	if (invalids->Exists(lowerext)) 	  { 	    if (debug > 2) ; 	      cout << endl <<"   Rejected: Extension is invalid!";< 	    return FALSE; 	  }       }        //'     // Or NOT in the list of valid onesh     //@     if (ext && valids->Count() > 0 && !valids->Exists(lowerext))       {s 	if (debug > 2)r9 	  cout << endl <<"   Rejected: Extension is not valid!";( 	return FALSE;       }        ext = strrchr(url, '?');*     if (ext && badquerystr.hasPattern() &&)        (badquerystr.FindFirst(ext) >= 0)){     {i       if (debug > 2)6 	  cout << endl <<"   Rejected: Invalid Querystring!";        return FALSE;     }o       //5     // If any of the limits are met, we allow the URL      //#     if (limits.FindFirst(url) >= 0)s
 	return TRUE;,       if (debug > 2);       cout << endl <<"   Rejected: URL not in the limits!";s       return FALSE;  }     O //******************************************************************************- // StringList* Retriever::GetLocal(char *url)tH //   Returns a list of strings containing the (possible) local filenames9 //   of the given url, or 0 if it's definitely not local.m3 //   THE CALLER MUST FREE THE STRINGLIST AFTER USE!i // StringList*1 Retriever::GetLocal(char *url) { $     static StringList *prefixes = 0;!     static StringList *paths = 0;d'     static StringList *defaultdocs = 0;c       //=     // Initialize prefix/path list if this is the first time. D     // The list is given in format "prefix1=path1 prefix2=path2 ..."     //     if (!prefixes)     {t!     	prefixes = new StringList();/ 	paths = new StringList();  	defaultdocs = new StringList();  ! 	String t = config["local_urls"];  	char *p = strtok(t, " \t"); 	while (p)	  	{$    	    char *path = strchr(p, '=');    	    if (!path) 	    { 		p = strtok(0, " \t");u    		continue; 	    }    	    *path++ = '\0';h! 	    String *pre = new String(p);t 	    decodeURL(*pre);s 	    prefixes->Add(pre);$ 	    String *pat = new String(path); 	    decodeURL(*pat);* 	    paths->Add(pat);  	    p = strtok(0, " \t"); 	}! 	t = config["local_default_doc"];* 	p = strtok(t, " \t"); 	while (p)	* 	{! 	    String *def = new String(p);i 	    decodeURL(*def);/ 	    defaultdocs->Add(def);  	    p = strtok(0, " \t"); 	} 	if (defaultdocs->Count() == 0); 	    delete defaultdocs;     }*  #     // Begin by hex-decoding URL...g     String hexurl = url;     decodeURL(hexurl);     url = hexurl.get();&  $     // Check first for local user...     if (strchr(url, '~'))"     {(4 	StringList *local = GetLocalUser(url, defaultdocs); 	if (local)a 	    return local;     }i  1     // This shouldn't happen, but check anyway...r     if (strstr(url, ".."))         return 0;i          String *prefix, *path;/     StringList *local_names = new StringList();"     prefixes->Start_Get();     paths->Start_Get();*5     while ((prefix = (String*) prefixes->Get_Next()))e     {t$ 	path = (String*) paths->Get_Next();?         if (mystrncasecmp(*prefix, url, prefix->length()) == 0)  	{; 	    int l = strlen(url)-prefix->length()+path->length()+4;a* 	    String *local = new String(*path, l);& 	    *local += &url[prefix->length()];/ 	    if (local->last() == '/' && defaultdocs) {   	      defaultdocs->Start_Get();G 	      while (String *defaultdoc = (String *)defaultdocs->Get_Next()) {eT 		String *localdefault = new String(*local, local->length()+defaultdoc->length()+1);$ 		localdefault->append(*defaultdoc);! 		local_names->Add(localdefault);R 	      } 	      delete local; 	    }	 	    elsee 	      local_names->Add(local);s 	}	      } !     if (local_names->Count() > 0)          return local_names;        delete local_names; 
     return 0;& }S    O //*****************************************************************************nJ // StringList* Retriever::GetLocalUser(char *url, StringList *defaultdocs)G //   If the URL has ~user part, return a list of strings containing ther> //   (possible) local filenames of the given url, or 0 if it's //   definitely not local.3 //   THE CALLER MUST FREE THE STRINGLIST AFTER USE!> // StringList*r; Retriever::GetLocalUser(char *url, StringList *defaultdocs)f {s;     static StringList *prefixes = 0, *paths = 0, *dirs = 0; !     static Dictionary home_cache;e       //=     // Initialize prefix/path list if this is the first time.s;     // The list is given in format "prefix1=path1,dir1 ..."nC     // If path is zero-length, user's home directory is looked up. g     //     if (!prefixes)     { $         prefixes = new StringList(); 	paths = new StringList(); 	dirs = new StringList();i& 	String t = config["local_user_urls"]; 	char *p = strtok(t, " \t");
 	while (p) 	{! 	    char *path = strchr(p, '=');u 	    if (!path)r 	    { 		p = strtok(0, " \t");e 	        continue; 	    } 	    *path++ = '\0';# 	    char *dir = strchr(path, ',');r 	    if (!dir) 	    { 		p = strtok(0, " \t");I 	        continue; 	    } 	    *dir++ = '\0';l! 	    String *pre = new String(p);b 	    decodeURL(*pre);a 	    prefixes->Add(pre);$ 	    String *pat = new String(path); 	    decodeURL(*pat);l 	    paths->Add(pat);e# 	    String *ptd = new String(dir);} 	    decodeURL(*ptd);  	    dirs->Add(ptd); 	    p = strtok(0, " \t"); 	}     }G  %     // Can we do anything about this?lE     if (!strchr(url, '~') || !prefixes->Count() || strstr(url, ".."))f         return 0;   "     // Split the URL to components     String tmp = url; "     char *name = strchr(tmp, '~');     *name++ = '\0'; #     char *rest = strchr(name, '/');*6     if (!rest || (rest-name <= 1) || (rest-name > 32))         return 0;:     *rest++ = '\0';   .     // Look it up in the prefix/path/dir table     prefixes->Start_Get();     paths->Start_Get();l     dirs->Start_Get();      String *prefix, *path, *dir;/     StringList *local_names = new StringList();r5     while ((prefix = (String*) prefixes->Get_Next()))      {*+         path = (String*) paths->Get_Next();e" 	dir = (String*) dirs->Get_Next();,         if (mystrcasecmp(*prefix, tmp) != 0)   	    continue;   	String *local = new String;# 	// No path, look up home directory  	if (path->length() == 0)  	{/ 	    String *home = (String*) home_cache[name];s 	    if (!home)  	    {0 	        struct passwd *passwd = getpwnam(name);
 		if (passwd)e 		{  #ifdef __VMS= 		    if (decc$from_vms(passwd->pw_dir, GetUnixName, 0) == 0)d 		    { ; 		        reportError("Cannot translate home directory\n");  		        exit(1); 		    } ) 		    home = new String(tmp_GetUnixName);+ #else ( 		    home = new String(passwd->pw_dir); #endif! 		    home_cache.Add(name, home);  		}  	    } 	    if (home) 	        *local += *home; 	 	    else  	        continue; 	} 	elseu 	{ 	    *local += *path;e 	    *local += name; 	} 	*local += *dir; 	*local += rest;+ 	if (local->last() == '/' && defaultdocs) {o 	  defaultdocs->Start_Get();C 	  while (String *defaultdoc = (String *)defaultdocs->Get_Next()) {fW 	    String *localdefault = new String(*local, local->length()+defaultdoc->length()+1);r' 	    localdefault->append(*defaultdoc); $ 	    local_names->Add(localdefault); 	  } 	  delete local; 	} 	elsed 	  local_names->Add(local);      }f  !     if (local_names->Count() > 0)a         return local_names;d       delete local_names;n
     return 0;' }<    O //***************************************************************************** ' // int Retriever::IsLocalURL(char *url)o? //   Returns 1 if the given url has a (possible) local filename-' //   or 0 if it's definitely not local.= // int(  Retriever::IsLocalURL(char *url) {t     int ret;  /     StringList *local_filename = GetLocal(url);t      ret = (local_filename != 0);     delete local_filename;       return ret;i }a     O //***************************************************************************** * // DocumentRef *Retriever::GetRef(char *u)D //   Extract the date field from the given url.  This will require a- //   lookup in the current document database.  // DocumentRef* Retriever::GetRef(char *u) {D     static String	url;     url = u;       return docs[url];I }a    O //***************************************************************************** B // void Retriever::got_word(char *word, int location, int heading)< //   The location is normalized to be in the range 0 - 1000. // void: Retriever::got_word(char *word, int location, int heading) {s     if (debug > 3)5 	cout << "word: " << word << '@' << location << endl;oC     if (heading > 11 || heading < 0) // Current limits for headings 3       heading = 0;  // Assume it's just normal textd     if (trackWords)/     {c       String w = word;       HtStripPunctuation(w);*       if (w.length() >= minimumWordLength)A 	words.Word(w, location, current_anchor_number, factor[heading]);oK       if (strcmp(word, w.get()) != 0)	// have punctuation that was strippedu       {n 	// Check for compound words...n 	String parts = word;  	int added;S 	int nparts = 1; 	doe 	{ 	    added = 0;  	    char *start = parts.get();v 	    char *punctp, *nextp, *p; 	    char  punct;e
 	    int   n;  	    while (*start)/ 	    { 		p = start; 		for (n = 0; n < nparts; n++) 		{t3 		    while (HtIsStrictWordChar((unsigned char)*p))) 			p++;  		    punctp = p;r# 		    if (!*punctp && n+1 < nparts) 	 			break;d: 		    while (*p && !HtIsStrictWordChar((unsigned char)*p)) 			p++;o 		    if (n == 0)D
 			nextp = p;) 		}	 		if (n < nparts)( 		    break; 		punct = *punctp; 		*punctp = '\0';c, 		if (*start && (*p || start > parts.get())) 		{t 		    w = start; 		    HtStripPunctuation(w);* 		    if (w.length() >= minimumWordLength) 		    {vC 			words.Word(w, location, current_anchor_number, factor[heading]);m 			if (debug > 3)tA 			    cout << "word part: " << start << '@' << location << endl;i 		    }  		    added++; 		}e 		start = nextp; 		*punctp = punct; 	    } 	    nparts++; 	} while (added > 2);h       }      }b }     O //*****************************************************************************u) // void Retriever::got_title(char *title)t // void! Retriever::got_title(char *title)S {n     if (debug > 1)& 	cout << "\ntitle: " << title << endl;     current_title = title; }n    O //*****************************************************************************r+ // void Retriever::got_anchor(char *anchor)u // void# Retriever::got_anchor(char *anchor)( {      if (debug > 2)& 	cout << "anchor: " << anchor << endl;#     current_ref->AddAnchor(anchor);      current_anchor_number++; }t    O //*****************************************************************************u' // void Retriever::got_image(char *src)G // void Retriever::got_image(char *src)1 {      URL	url(src, *base);     char	*image = url.get(); 	r     if (debug > 2)$ 	cout << "image: " << image << endl;       if (images_seen)% 	fprintf(images_seen, "%s\n", image);eQ //	current_ref->DocImageSize(current_ref->DocImageSize() + images.Sizeof(image));r }n    O //*****************************************************************************t: // void Retriever::got_href(char *href, char *description) // void0 Retriever::got_href(URL &url, char *description) {e     DocumentRef		*ref;     Server		*server;       if (debug > 2)E 	cout << "href: " << url.get() << " (" << description << ')' << endl;e       n_links++;       if (urls_seen)' 	fprintf(urls_seen, "%s\n", url.get());        //>     // Check if this URL falls within the valid range of URLs.     //     if (IsValidURL(url.get()))     {  	//<> 	// It is valid.  Normalize it (resolve cnames for the server) 	// and check again... 	//  	if (debug > 2)* 	{1 	    cout << "resolving '" << url.get() << "'\n";* 	    cout.flush(); 	}   	url.normalize();D  2 	// If it is a backlink from the current document,4 	// just update that field.  Writing to the database. 	// is meaningless, as it will be overwritten.7 	// Adding it as a new document may even be harmful, ast5 	// that will be a duplicate.  This can happen if the 8 	// current document is never referenced before, as in a 	// start_url.  3 	if (strcmp(url.get(), current_ref->DocURL()) == 0)  	{@ 	    current_ref->DocBackLinks(current_ref->DocBackLinks() + 1);. 	    current_ref->AddDescription(description); 	}, 	else if (limitsn.FindFirst(url.get()) >= 0) 	{ 	    //e- 	    // First add it to the document databasec 	    //f 	    ref = docs[url.get()];u9 	    // if ref exists we have to call AddDescription evens*             // if max_hop_count is reached9     	    if (!ref && currenthopcount + 1 > max_hop_count);	 		return;    	    if (!ref) 	    { 		//6 		// Didn't see this one, yet.  Create a new reference% 		// for it with a unique document ID/ 		// 		ref = new DocumentRef; 		ref->DocID(docs.NextDocID());h( 		ref->DocHopCount(currenthopcount + 1); 	    }= 	    ref->DocBackLinks(ref->DocBackLinks() + 1); // This one!) 	    ref->DocURL(url.get());& 	    ref->AddDescription(description);   	    //cK     	    // If the dig is restricting by hop count, perform the check here   	    // too*1     	    if (currenthopcount + 1 > max_hop_count)* 	    {
 		delete ref; 	 		return;i 	    }  $ 	    if (ref->DocHopCount() != -1 &&+ 		ref->DocHopCount() < currenthopcount + 1) 4 	       // If we had taken the path through this ref3 	       // We'd be here faster than currenthopcountc? 	       currenthopcount = ref->DocHopCount();  // So update it!"   	    docs.Add(*ref);   	    //*6 	    // Now put it in the list of URLs to still visit. 	    //i 	    if (Need2Get(url.get()))  	    { 		if (debug > 1)3 		    cout << "\n   pushing " << url.get() << endl;m/ 		server = (Server *) servers[url.signature()];* 		if (!server) 		{c 		    //3 		    // Hadn't seen this server, yet.  Register it  		    //# 		    String robotsURL = "http://"; / 		    robotsURL << url.host() << "/robots.txt"; > 		    StringList *localRobotsFile = GetLocal(robotsURL.get());C 		    server = new Server(url.host(), url.port(), localRobotsFile); + 		    servers.Add(url.signature(), server);y 		    delete localRobotsFile;o 		}" 		//6 		// Let's just be sure we're not pushing an empty URL 		// 		if (strlen(url.get()))< 		  server->push(url.get(), ref->DocHopCount(), base->get(), 				IsLocalURL(url.get()));.   		String	temp = url.get(); 		visited.Add(temp, 0);s 		if (debug) 		    cout << '+'; 	    } 	    else if (debug) 		cout << '*'; 	    delete ref; 	} 	elsel 	{ 	    //  	    // Not a valid URLe 	    //e 	    if (debug > 1)t; 		cout << "\nurl rejected: (level 2)" << url.get() << endl;p 	    if (debug == 1) 		cout << '-'; 	}     }i     else     {e 	//  	// Not a valid URLs 	//c 	if (debug > 1) > 	    cout << "\nurl rejected: (level 1)" << url.get() << endl; 	if (debug == 1) 	    cout << '-';      }      if (debug) 	cout.flush(); }r    O //***************************************************************************** D // void Retriever::got_redirect(char *new_url, DocumentRef *old_ref) // void< Retriever::got_redirect(char *new_url, DocumentRef *old_ref) {e     URL	url(new_url);        if (debug > 2)+ 	cout << "redirect: " << url.get() << endl;        n_links++;       if (urls_seen)' 	fprintf(urls_seen, "%s\n", url.get());t       //>     // Check if this URL falls within the valid range of URLs.     //     if (IsValidURL(url.get()))     {/ 	//u> 	// It is valid.  Normalize it (resolve cnames for the server) 	// and check again... 	//  	if (debug > 2)o 	{1 	    cout << "resolving '" << url.get() << "'\n";l 	    cout.flush(); 	}   	url.normalize(); ' 	if (limitsn.FindFirst(url.get()) >= 0)y 	{ 	    // - 	    // First add it to the document database< 	    //t( 	    DocumentRef	*ref = docs[url.get()]; 	    if (!ref) 	    { 		//6 		// Didn't see this one, yet.  Create a new reference% 		// for it with a unique document ID  		// 		ref = new DocumentRef; 		ref->DocID(docs.NextDocID());h$ 		ref->DocHopCount(currenthopcount); 	    } 	    ref->DocURL(url.get()); 			' 	    //o; 	    // Copy the descriptions of the old DocRef to this onem 	    // ' 	    List	*d = old_ref->Descriptions();r 	    if (d)i 	    { 		d->Start_Get();' 		String	*str;* 		while ((str = (String *) d->Get_Next())) 		{h& 		    ref->AddDescription(str->get()); 		}e 	    }5 	    if (ref->DocHopCount() > old_ref->DocHopCount())g+ 		ref->DocHopCount(old_ref->DocHopCount());x  $ 	    // Copy the number of backlinks0 	    ref->DocBackLinks(old_ref->DocBackLinks());   	    docs.Add(*ref);   	    //f6 	    // Now put it in the list of URLs to still visit. 	    //  	    if (Need2Get(url.get()))< 	    { 		if (debug > 1)1 		    cout << "   pushing " << url.get() << endl; 7 		Server	*server = (Server *) servers[url.signature()];h 		if (!server) 		{  		    //3 		    // Hadn't seen this server, yet.  Register it  		    //# 		    String robotsURL = "http://";r/ 		    robotsURL << url.host() << "/robots.txt";/> 		    StringList *localRobotsFile = GetLocal(robotsURL.get());C 		    server = new Server(url.host(), url.port(), localRobotsFile);u+ 		    servers.Add(url.signature(), server);L 		    delete localRobotsFile;e 		}F: 		server->push(url.get(), ref->DocHopCount(), base->get(), 				IsLocalURL(url.get()));*   		String	temp = url.get(); 		visited.Add(temp, 0);  	    }   	    delete ref; 	}     }n }h    O //*****************************************************************************l' // void Retriever::got_head(char *head)I // void Retriever::got_head(char *head): {t     if (debug > 4)" 	cout << "head: " << head << endl;     current_head = head; }t  O //*****************************************************************************f) // void Retriever::got_meta_dsc(char *md)  // void! Retriever::got_meta_dsc(char *md)h {r     if (debug > 4), 	cout << "meta description: " << md << endl;     current_meta_dsc = md; }=    O //*****************************************************************************s* // void Retriever::got_meta_email(char *e) // void" Retriever::got_meta_email(char *e) {      if (debug > 1)' 	cout << "\nmeta email: " << e << endl;i     current_ref->DocEmail(e);  }'    O //*****************************************************************************e1 // void Retriever::got_meta_notification(char *e)d // void) Retriever::got_meta_notification(char *e)( {"     if (debug > 1)3 	cout << "\nmeta notification date: " << e << endl;w$     current_ref->DocNotification(e); }r    O //*****************************************************************************\, // void Retriever::got_meta_subject(char *e) // void$ Retriever::got_meta_subject(char *e) {e     if (debug > 1)( 	cout << "\nmeta subect: " << e << endl;     current_ref->DocSubject(e);  }     O //*****************************************************************************l  // void Retriever::got_noindex() // void Retriever::got_noindex() {      if (debug > 1)/       cout << "\nMETA ROBOT: Noindex " << endl;(-     current_ref->DocState(Reference_noindex);  }t    O //*****************************************************************************> // void? Retriever::recordNotFound(char *url, char *referer, int reason)i {-     char	*message = "";p          switch (reason)N     { # 	case Document::Document_not_found:l 	    message = "Not found";{ 	    break;= 	r! 	case Document::Document_no_host:g 	    message = "Unknown host"; 	    break;t 	l# 	case Document::Document_no_server:h* 	    message = "Unable to contact server"; 	    break;      }f  F     notFound << message << ": " << url << " Ref: " << referer << '\n'; }o  O //*****************************************************************************a/ // void Retriever::ReportStatistics(char *name)a // void' Retriever::ReportStatistics(char *name)  { '     cout << name << ": Run complete\n"; 9     cout << name << ": " << servers.Count() << " server";a     if (servers.Count() > 1)
 	cout << "s";n     cout << " seen:\n";l       Server		*server;     String		buffer;*     StringList	results;*     String		newname = name;*       newname << ":    ";* 	t     servers.Start_Get();;     while ((server = (Server *) servers.Get_NextElement()))t     {n 	buffer = 0;+ 	server->reportStatistics(buffer, newname);i 	results.Add(buffer);l     }i     results.Sort();l  -     for (int i = 0; i < results.Count(); i++)G     {T 	cout << results[i] << "\n";     }:       if (notFound.length() > 0)     {u7 	cout << "\n" << name << ": Errors to take note of:\n";  	cout << notFound;     }c }c  ary home_cache;e       //=     // Initialize prefix/path list if this is the first time.s;     // The list is given in format "prefix1=path1,dir1 ..."nC     // If path is zero-length, user's home directory is looked up. g     //     if (!prefixes)     { $         prefixes = new StringList(); 	paths = new StringList(); 	dirs = new StringList();i& 	String t = config["local_user_urls"]; 	char *p = strtok(t, " \t");
 	while (p) 	{! 	    char *p                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                