 // // Document.cc // // Implementation of Document  // // // #if RELEASE W static char RCSid[] = "$Id: Document.cc,v 1.34.2.14 2000/02/15 22:42:20 grdetil Exp $";  #endif   #include <signal.h>  #include <sys/types.h> #include <sys/stat.h>  #include <ctype.h> #include "Document.h"  #include "Connection.h"  #include "StringList.h"  #include "htdig.h" #include "HTML.h"  #include "Plaintext.h" #include "ExternalParser.h"  #include "PDF.h"   #if 1 % typedef void (*SIGNAL_HANDLER) (...);  #else  typedef SIG_PF SIGNAL_HANDLER; #endif  O //*****************************************************************************  // Document::Document(char *u)E //   Initialize with the given url as the location for this document. C //   If the max_size is given, use that for size, otherwise use the  //   config value. //) Document::Document(char *u, int max_size)  {      url = 0;     proxy = 0;     referer = 0;     contents = 0;        if (max_size > 0)  	max_doc_size = max_size;      else- 	max_doc_size = config.Value("max_doc_size");  	 *     char	*proxyURL = config["http_proxy"];     if (proxyURL && *proxyURL)     {  	proxy = new URL(proxyURL);  	proxy->normalize();     }   0     char	*credentials = config["authorization"];$     if (credentials && *credentials)" 	setUsernamePassword(credentials);  *     contents.allocate(max_doc_size + 100);     contentType = "";      contentLength = -1; 
     if (u)     {  	Url(u);     }  }     O //*****************************************************************************  // Document::~Document() // Document::~Document()  {      if (url)       delete url;      if (proxy)       delete proxy; 
 #if MEM_DEBUG      char *p = new char; >     cout << "==== Document deleted: " << this << " new at " << 	((void *) p) << endl;
     delete p;  #endif }     O //*****************************************************************************  // void Document::Reset() 5 //   Restore the Document object to an initial state. L //   We will not reset the authorization information since it can be reused. // void Document::Reset()  {      contentType = 0;     contentLength = -1;      if (url)       delete url;      url = 0;     referer = 0;     modtime = 0;       contents = 0;      document_length = 0;     redirected_to = 0;  G     // Don't reset the authorization since it's a pain to set up again.      //    authorization = 0;=     // Don't reset the proxy since it's a pain to set up too.      //    if (proxy)     //      delete proxy;      //    proxy = 0; }     O //***************************************************************************** 8 // void Document::setUsernamePassword(char *credentials) // void0 Document::setUsernamePassword(char *credentials) {      static char	tbl[64] =      { ( 	'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',( 	'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',( 	'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',( 	'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',( 	'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',( 	'o', 'p', 'q', 'r', 's', 't', 'u', 'v',( 	'w', 'x', 'y', 'z', '0', '1', '2', '3',' 	'4', '5', '6', '7', '8', '9', '+', '/'      };     authorization = 0;     char	*p;!     int		n = strlen(credentials);      int		ch;  0     for (p = credentials; n > 2; n -= 3, p += 3)     {  	ch = *p >> 2;  	authorization << tbl[ch & 077];. 	ch = ((*p << 4) & 060) | ((p[1] >> 4) & 017);  	authorization << tbl[ch & 077];/ 	ch = ((p[1] << 2) & 074) | ((p[2] >> 6) & 03);   	authorization << tbl[ch & 077]; 	ch = p[2] & 077;   	authorization << tbl[ch & 077];     }        if (n != 0)      {  	char c1 = *p; 	char c2 = n == 1 ? 0 : p[1];    	ch = c1 >> 2;  	authorization << tbl[ch & 077];  , 	ch = ((c1 << 4) & 060) | ((c2 >> 4) & 017);  	authorization << tbl[ch & 077];   	if (n == 1) 	    authorization << '='; 	else 	         {  	    ch = (c2 << 2) & 074;$ 	    authorization << tbl[ch & 077];	         }  	authorization << '=';     }  }     O //*****************************************************************************  // void Document::Url(char *u)" //   Set the URL for this document // void Document::Url(char *u) {      if (url)       delete url;      url = new URL(u);  }     O //***************************************************************************** - // time_t Document::getdate(char *datestring) 3 //   Convert a RFC850 date string into a time value  // time_t# Document::getdate(char *datestring)  {      struct tm   tm;      time_t      ret;         char        *s;            //&     // Two possible time designations:+     //      Tuesday, 01-Jul-97 16:48:02 GMT 	     // or )     //      Thu, 01 May 1997 00:40:42 GMT      //:     // We strip off the weekday before sending to strptime2     // because some servers send invalid weekdays!A     // (Some don't even send a weekday, but we'll be flexible...)          s = strchr(datestring, ',');
     if (s)         s++;     else         s = datestring;      while (isspace(*s))          s++;>     if (strchr(s, '-') && mystrptime(s, "%d-%b-%y %T", &tm) ||.             mystrptime(s, "%d %b %Y %T", &tm))       { @ 	// correct for mystrptime, if %Y format saw only a 2 digit year 	if (tm.tm_year < 0) 	  tm.tm_year += 1900;A 	tm.tm_yday = 0;	// clear these to prevent problems in strftime()  	tm.tm_wday = 0; 	  	if (debug > 2)  	  {3 	    cout << "Translated " << datestring << " to ";  	    char	buffer[100];D 	    // Leave out %a for weekday, because we don't set it anymore...@ 	    //strftime(buffer, sizeof(buffer), "%a, %d %b %Y %T", &tm);A 	    // Let's just do away with strftime() altogether for this... < 	    //strftime(buffer, sizeof(buffer), "%d %b %Y %T", &tm);E 	    sprintf(buffer, "%4d-%02d-%02d %02d:%02d:%02d", tm.tm_year+1900, = 		tm.tm_mon+1, tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec); 9 	    cout << buffer << " (" << tm.tm_year << ")" << endl;  	  } #if HAVE_TIMEGM  	ret = timegm(&tm);  #else  	ret = mytimegm(&tm);  #endif       }      else       {  	if (debug > 2)  	  {1 	    cout << "Cannot translate " << datestring << 3                     ", using current time" << endl;  	  }; 	ret = time(0); // This isn't the best, but it works. *fix*        }      if (debug > 2)     { $         cout << "And converted to ";&         struct tm *tm2 = gmtime(&ret);         char    buffer[100];A         strftime(buffer, sizeof(buffer), "%a, %d %b %Y %T", tm2);          cout << buffer << endl;      }      return ret;  }     O //*****************************************************************************  // int Document::UseProxy() I //   Returns 1 if the given url is to be retrieved from the proxy server,  //   or 0 if it's not. // int  Document::UseProxy() { )     static StringMatch *excludeproxy = 0;        //>     // Initialize excludeproxy list if this is the first time.     //     if (!excludeproxy)     { &     	excludeproxy = new StringMatch();3 	StringList l(config["http_proxy_exclude"], " \t");  	excludeproxy->IgnoreCase();$ 	excludeproxy->Pattern(l.Join('|'));
 	l.Release();      }   2     if ((proxy) && (!excludeproxy->hasPattern() ||0 		    excludeproxy->FindFirst(url->get()) < 0 ))D       return 1;    // if the exclude pattern is empty, use the proxy
     return 0;  }     O //***************************************************************************** 0 // DocStatus Document::RetrieveHTTP(time_t date)D //   Attempt to retrieve the document pointed to by our internal URL // Document::DocStatus # Document::RetrieveHTTP(time_t date)  {      Connection	c;      if (c.open() == NOTOK) 	return Document_not_found;        int		useproxy = UseProxy();      if (useproxy)      { + 	if (c.assign_port(proxy->port()) == NOTOK)  	{ 	    c.close();  	    return Document_not_found;  	}- 	if (c.assign_server(proxy->host()) == NOTOK)  	{ 	    if (debug) : 		cout << "Unknown proxy host: " << proxy->host() << endl; 	    c.close();  	    return Document_no_host;  	}     }      else     { ) 	if (c.assign_port(url->port()) == NOTOK)  	{ 	    c.close();  	    return Document_not_found;  	}+ 	if (c.assign_server(url->host()) == NOTOK)  	{ 	    if (debug)c2 		cout << "Unknown host: " << url->host() << endl; 	    c.close();c 	    return Document_no_host;c 	}     }1 	0     if (c.connect(1) == NOTOK)     {d 	if (debug)e 	{] 	    cout << "Unable to build connection with " << url->host() << ':' << url->port() << endl;t 	    if (useproxy) 	    {P 		cout << "(Via proxy " << proxy->host() << ':' << proxy->port() << ')' << endl; 	    } 	} 	c.close();e 	return Document_no_server;.     }i       //3     // Construct and send the request to the serverI     //#     String        command = "GET ";*       if (useproxy)*     {** 	command << url->get() << " HTTP/1.0\r\n";     }D     else     { + 	command << url->path() << " HTTP/1.0\r\n";a     }r=     command << "User-Agent: " << config["user_agent"] << "/" i; 	    << VERSION << " (" <<	config["maintainer"] << ")\r\n";u       //:     // If a referer was provided, we'll send that as well.     //     if (referer.length())      {a- 	command << "Referer: " << referer << "\r\n";l     }x 	c     //<     // If a date was provided, we'll use that in the special&     // 'If-modified-since' URC header.     //     if (date > 0)n     {p 	struct tm	*tm = gmtime(&date);  	char		buffer[100];r= 	strftime(buffer, sizeof(buffer), "%a, %d %h %Y %T GMT", tm);r6 	command << "If-Modified-Since: " << buffer << "\r\n";     }a       //N     // If authorization was provided, send it.  This will happen regardless of4     // whether the server needs it or not.  Oh well.     //     if (authorization.length())*     {*? 	command << "Authorization: Basic " << authorization << "\r\n";      }        //I     // If we are allowed to index virtual hosts, we will send the special F     // 'Host:' header that tells the server what virtual web site this     // request is for.     //1     if (config.Boolean("allow_virtual_hosts", 1))*     {*. 	command << "Host: " << url->host() << "\r\n";     }*          //A     // Finally we can commit the request by sending a blank line.l     //     command << "\r\n";       if (debug > 2)C 	cout << "Retrieval command for " << url->get() << ": " << command;        c.write(command);o       //)     // Setup a timeout for the connection      //'     c.timeout(config.Value("timeout"));   ,     DocStatus   returnStatus = Document_ok;;     switch (readHeader(c))     {n 	case Header_ok:'             returnStatus = Document_ok;n 	    break;  	case Header_not_changed:D) 	    returnStatus = Document_not_changed;e             break; 	case Header_not_found: ' 	    returnStatus = Document_not_found;              break; 	case Header_redirect:& 	    returnStatus = Document_redirect;             break; 	case Header_not_text:& 	    returnStatus = Document_not_html;             break; 	case Header_not_authorized:, 	    returnStatus = Document_not_authorized;             break;     } $     if (returnStatus != Document_ok)     {, 	c.close();'         return returnStatus;     },       //"     // Read in the document itself     //     contents = 0;,     char	docBuffer[8192];v     int		bytesRead;z#     int		bytesToGo = contentLength;'  2     if (bytesToGo < 0 || bytesToGo > max_doc_size)!         bytesToGo = max_doc_size;r     while (bytesToGo > 0)	     { N         int len = bytesToGo<sizeof(docBuffer) ? bytesToGo : sizeof(docBuffer);+         bytesRead = c.read(docBuffer, len);&         if (bytesRead <= 0)              break; 	if (debug > 2) 8 	    cout << "Read " << bytesRead << " from document\n";' 	contents.append(docBuffer, bytesRead);  	bytesToGo -= bytesRead;     }      c.close();(     document_length = contents.length();       if (debug > 2)= 	cout << "Read a total of " << document_length << " bytes\n";   (     if (document_length < contentLength)&       document_length = contentLength;     return Document_ok;< }=    O //*****************************************************************************7* // int Document::readHeader(Connection &c)2 //   Read and interpret the header of the document // int*# Document::readHeader(Connection &c)i {o     String	line;     int		inHeader = 1;)     int		returnStatus = Header_not_found;r       modtime = 0;       while (inHeader)     {  	c.read_line(line, "\n");* 	line.chop('\r');* 	if (debug > 2)*- 	    cout << "Header line: " << line << endl;_ 	if (line.length() == 0) 	    inHeader = 0; 	elsea 	{ 	    char	*token = line.get();' 	    while (*token && !isspace(*token))a
 		token++;& 	    while (*token && isspace(*token))
 		token++;( 	    if (strncmp(line, "HTTP/", 5) == 0) 	    { 		//6 		// Found the status line.  This will determine if we 		// continue or not 		//$ 		char	*status = strtok(token, " ");+ 		if (status && strcmp(status, "200") == 0)e 		{i 		    returnStatus = Header_ok;m 		}v0 		else if (status && strcmp(status, "304") == 0) 		{d( 		    returnStatus = Header_not_changed; 		} 3 		else if (status && strncmp(status, "30", 2) == 0); 		{  		    //8 		    // All 3xx codes other than 304 will be considered0 		    // HTTP redirects that need to look at the 		    // Location header field.t 		    //% 		    returnStatus = Header_redirect;r 		}o0 		else if (status && strcmp(status, "401") == 0) 		{ + 		    returnStatus = Header_not_authorized;. 		}a 	    }$ 	    else if (modtime == 0 && *token9 		     && mystrncasecmp(line, "last-modified:", 14) == 0)  	    {+ 		modtime = getdate(strtok(token, "\n\t"));  	    }+ 	    else if (contentLength == -1 && *tokeny: 		     && mystrncasecmp(line, "content-length:", 15) == 0) 	    {. 		contentLength = atoi(strtok(token, "\n\t")); 	    }F 	    else if (*token && mystrncasecmp(line, "content-type:", 13) == 0) 	    {  		token = strtok(token, "\n\t"); 				* 		if ((returnStatus == Header_not_found ||  			returnStatus == Header_ok) &&) 		    !ExternalParser::canParse(token) &&). 		    mystrncasecmp("text/", token, 5) != 0 &&7 		    mystrncasecmp("application/pdf", token, 15) != 0)) 		    return Header_not_text;) 		contentType = token; 	    }7 	    else if (mystrncasecmp(line, "location:", 9) == 0)a 	    {+ 		redirected_to = strtok(token, "\r\n \t");i 	    } 	}     } )     static int	modification_time_is_now = . 			config.Boolean("modification_time_is_now");1     if (modtime == 0 && modification_time_is_now)d 	modtime = time(NULL);       if (debug > 2)3 	cout << "returnStatus = " << returnStatus << endl;e     return returnStatus; }     O //*****************************************************************************/H // DocStatus Document::RetrieveLocal(time_t date, StringList *filenames)D //   Attempt to retrieve the document pointed to by our internal URLJ //   using a list of potential local filenames given. Returns Document_ok,B //   Document_not_changed or Document_not_local (in which case the* //   retriever tries it again using HTTP). // Document::DocStatust; Document::RetrieveLocal(time_t date, StringList *filenames)x {      struct stat stat_buf;L     String *filename;y       filenames->Start_Get();r  K     // Loop through list of potential filenames until the list is exhausted #     // or a suitable file is found.-:     while ((filename = (String *)filenames->Get_Next()) &&G 	   ((stat(*filename, &stat_buf) == -1) || !S_ISREG(stat_buf.st_mode)))o         if (debug > 1)8 	    cout << "  tried local file " << *filename << endl;          if (!filename)"         return Document_not_local;       if (debug > 1)>         cout << "  found existing file " << *filename << endl;        modtime = stat_buf.st_mtime;     if (modtime <= date)$         return Document_not_changed;  @     // Process only HTML files (this could be changed if we read%     // the server's mime.types file). :     // (...and handle a select few other types for now...)(     char *ext = strrchr(*filename, '.');     if (ext == NULL)!       	return Document_not_local; N     if ((mystrcasecmp(ext, ".html") == 0) || (mystrcasecmp(ext, ".htm") == 0))"         contentType = "text/html";R     else if ((mystrcasecmp(ext, ".txt") == 0) || (mystrcasecmp(ext, ".asc") == 0))#         contentType = "text/plain";o.     else if ((mystrcasecmp(ext, ".pdf") == 0))(         contentType = "application/pdf";Q     else if ((mystrcasecmp(ext, ".ps") == 0) || (mystrcasecmp(ext, ".eps") == 0)) /         contentType = "application/postscript";n	     else O   	return Document_not_local;        // Open it$     FILE *f = fopen(*filename, "r");     if (f == NULL)  	return Document_not_local;       //"     // Read in the document itself     //     contents = 0;y     char	docBuffer[8192];      int		bytesRead;e  G     while ((bytesRead = fread(docBuffer, 1, sizeof(docBuffer), f)) > 0)      {u 	if (debug > 2)I8 	    cout << "Read " << bytesRead << " from document\n";2 	if (contents.length() + bytesRead > max_doc_size)2 	    bytesRead = max_doc_size - contents.length();' 	contents.append(docBuffer, bytesRead); ' 	if (contents.length() >= max_doc_size)e 	    break;"     }      fclose(f);(     document_length = contents.length();%     contentLength = stat_buf.st_size;v       if (debug > 2)= 	cout << "Read a total of " << document_length << " bytes\n";<  (     if (document_length < contentLength)&       document_length = contentLength;     return Document_ok;e }e    O //*****************************************************************************r$ // Parsable *Document::getParsable()E //   Given the content-type of a document, returns a document parser.tG //   This will first look through the list of user supplied parsers and F //   then at our (limited) builtin list of parsers.  The user supplied5 //   parsers are external programs that will be used.. //
 Parsable * Document::getParsable(). {g     static HTML			*html = 0;%     static Plaintext		*plaintext = 0;<.     static ExternalParser	*externalParser = 0;     static PDF			*pdf = 0;          Parsable	*parsable = 0;/  .     if (ExternalParser::canParse(contentType))     {t 	if (externalParser) 	{ 	    delete externalParser;B 	}2 	externalParser = new ExternalParser(contentType); 	parsable = externalParser;r     } =     else if (mystrncasecmp(contentType, "text/html", 9) == 0)      {  	if (!html)l 	    html = new HTML();" 	parsable = html;e     })?     else if (mystrncasecmp(contentType, "text/plain", 10) == 0)a     {  	if (!plaintext)! 	    plaintext = new Plaintext();o 	parsable = plaintext;     } D     else if (mystrncasecmp(contentType, "application/pdf", 15) == 0)     {c
 	if (!pdf) 	    pdf = new PDF();c 	parsable = pdf;     }o     else     {e 	if (!plaintext)! 	    plaintext = new Plaintext();r 	parsable = plaintext; 	if (debug)o 	{" 	    cout << '"' << contentType <</ 		"\" not a recognized type.  Assuming text\n";o 	}     }f  =     parsable->setContents(contents.get(), contents.length());a     return parsable; }           break; 	case Header_not_text:& 	    returnStatus = Document_not_html;             break; 	case Header_not_authorized:, 	    returnStatus = Document_not_authorized;             break;     } $     if (returnStatus != Document_ok)     {, 	c.close();'         return returnStatus;     },       //"     // Read in the document itself     //     contents = 0;,     char	docBuffer                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                