 //
 // HTML.cc // // Implementation of HTML O // Class to parse HTML documents and return useful information to the Retriever  // // #if RELEASE S static char RCSid[] = "$Id: HTML.cc,v 1.30.2.14 2000/02/15 20:11:29 grdetil Exp $";  #endif   #include "htdig.h" #include "HTML.h"  #include "SGMLEntities.h"  #include "Configuration.h" #include <ctype.h> #include "StringMatch.h" #include "StringList.h"  #include "URL.h" #include "HtWordType.h"    static StringMatch	tags; static StringMatch	nobreaktags; # static StringMatch	spacebeforetags; " static StringMatch	spaceaftertags; static StringMatch	attrs;  static StringMatch	srcMatch; static StringMatch	hrefMatch; ! static StringMatch	keywordsMatch;  static int		keywordsCount; static int		max_keywords;  static int		offset;  static int		totlength;    O //***************************************************************************** D // ADDSPACE() macro, to insert space where needed in various strings6 // 		Reduces all multiple whitespace to a single space   #define ADDSPACE(in_space)	\     if (!in_space)							\     {									\  	if (in_title && doindex)					\  	{								\  	    title << ' ';						\  	}								\ ? 	if (in_ref && description.length() < max_description_length)	\  	{								\  	    description << ' ';						\  	}								\ ? 	if (head.length() < max_head_length && doindex && !in_title)	\  	{								\  	    head << ' ';						\ 	}								\  	in_space = 1;							\     }     O //*****************************************************************************  // HTML::HTML()  // HTML::HTML() {      //9     // Initialize the patterns that we will try to match. @     // The tags Match object is used to match tag commands whileG     // the attrs Match object is used to match names of tag parameters.      //     tags.IgnoreCase();     tags.Pattern("title|/title|a|/a|h1|h2|h3|h4|h5|h6|/h1|/h2|/h3|/h4|/h5|/h6|noindex|/noindex|img|li|meta|frame|area|base|embed|object|link|style|/style");       attrs.IgnoreCase(); #     attrs.Pattern("src|href|name");        srcMatch.IgnoreCase();     srcMatch.Pattern("src");       hrefMatch.IgnoreCase();      hrefMatch.Pattern("href");  N     // These tags don't cause a word break.  They may also be in "tags" above,G     // except for the "a" tag, which must be handled as a special case. 9     // Note that <sup> & <sub> should cause a word break.      nobreaktags.IgnoreCase();      nobreaktags.Pattern("font|/font|em|/em|strong|/strong|i|/i|b|/b|u|/u|tt|/tt|abbr|/abbr|code|/code|q|/q|samp|/samp|kbd|/kbd|var|/var|dfn|/dfn|cite|/cite|blink|/blink|big|/big|small|/small|s|/s");  K     // These tags, which may also be in "tags" above, cause word breaks and O     // therefore cause space to be inserted before (or after) do_tag() is done. !     spacebeforetags.IgnoreCase(); {     spacebeforetags.Pattern("title|h1|h2|h3|h4|h5|h6|address|blockquote|noindex|img|li|th|td|dt|dd|p|br|hr|center|spacer");       spaceaftertags.IgnoreCase();R     spaceaftertags.Pattern("/title|/h1|/h2|/h3|/h4|/h5|/h6|/address|/blockquote");  >     //String	keywordNames = config["keywords_meta_tag_names"];%     //keywordNames.replace(' ', '|'); %     //keywordNames.remove(",\t\r\n"); !     //keywordsMatch.IgnoreCase(); *     //keywordsMatch.Pattern(keywordNames);F     StringList keywordNames(config["keywords_meta_tag_names"], " \t");     keywordsMatch.IgnoreCase(); 2     keywordsMatch.Pattern(keywordNames.Join('|'));     keywordNames.Release(); 4     max_keywords = config.Value("max_keywords", -1);     if (max_keywords < 0) / 	max_keywords = (int) ((unsigned int) ~1 >> 1);      
     word = 0; 
     href = 0;      title = 0;     description = 0;
     head = 0;      meta_dsc = 0;      tag = 0;     in_title = 0;      in_ref = 0;      in_heading = 0; 
     base = 0;      doindex = 1;     dofollow = 1; ?     minimumWordLength = config.Value("minimum_word_length", 3);  }     O //*****************************************************************************  // HTML::~HTML() //
 HTML::~HTML()  {  }     O //***************************************************************************** 7 // void HTML::parse(Retriever &retriever, URL &baseURL) N //   Parse the HTML document using the Retriever object for all the callbacks.E //   The HTML document contents are contained in the contents String.  // void/ HTML::parse(Retriever &retriever, URL &baseURL)  { 1     if (contents == 0 || contents->length() == 0)  	return;       base = &baseURL;          //E     // We have some variables which will contain the various items we      // are looking for     //     int			in_space;      int			in_punct;      unsigned char	*q, *start; @     unsigned char	*position = (unsigned char *) contents->get();Q     unsigned char       *text = (unsigned char *) new char[contents->length()+1]; &     unsigned char       *ptext = text;>     static char         *skip_start = config["noindex_start"];:     static char         *skip_end = config["noindex_end"];       keywordsCount = 0;     offset = 0;      title = 0;
     head = 0;      meta_dsc = 0;      doindex = 1;     dofollow = 1;      in_heading = 0;      in_title = 0;      in_ref = 0;      in_space = 0;      in_punct = 0;  	      while (*position)      {          //?       // Filter out section marked to be ignored for indexing.  $       // This can contain any HTML.        //       if (*skip_start &&H 	  mystrncasecmp((char *)position, skip_start, strlen(skip_start)) == 0) 	{@ 	  q = (unsigned char*)mystrcasestr((char *)position, skip_end);
 	  if (!q)C 	    *position = '\0';       // Rest of document will be skipped...  	  else % 	    position = q + strlen(skip_end);  	  continue; 	}  2       if (strncmp((char *)position, "<!", 2) == 0) 	{ 	  // B 	  // Possible comment declaration (but could be DTD declaration!). 	  // A comment can contain other '<' and '>':5 	  // we have to ignore complete comment declarations * 	  // but of course also DTD declarations. 	  // / 	  position += 2;	// Get past declaration start / 	  if (strncmp((char *)position, "--", 2) == 0)  	    {3 	      // Found start of comment - now find the end  	      position += 2; 	 	      do  		{ 7 		  q = (unsigned char*)strstr((char *)position, "--");  		  if (!q)  		    {  		      *position = '\0'; ; 		      break;	// Rest of document seems to be a comment...  		    }  		  else 		    {  		      position = q + 2; 9 		      // Skip extra dashes after a badly formed comment   		      while (*position == '-') 			  position++;6 		      // Skip whitespace after an individual comment" 		      while (isspace(*position)) 			  position++; 		    } @ 		  // if comment declaration hasn't ended, skip another comment 		} - 	      while (*position && *position != '>');  	      if (*position == '>') 		{ - 		  position++;	// End of comment declaration  		}  	    } 	  else  	    {- 	      // Not a comment declaration after all * 	      // but possibly DTD: get to the end9 	      q = (unsigned char*)strstr((char *)position, ">"); 
 	      if (q)  		{  		  position = q + 1; $ 		  // End of (whatever) declaration 		}  	      else  		{ 1 		  *position = '\0'; // Rest of document is DTD?  		}  	    } 	  continue; 	}   	if (*position == '<') 	{ 	    // C 	    // Start of a tag.  Since tags cannot be nested, we can simply " 	    // search for the closing '>' 	    // 7 	    q = (unsigned char*)strchr((char *)position, '>');  	    if (q)  	      { // copy tag 		while (position <= q)  		  *ptext++ = *position++;  	      }	 	    else 2 	      { // copy rest of text, as tag does not end 		while (*position)  		  *ptext++ = *position++;  	      } 	} 	else if (*position == '&')  	{?            *ptext = SGMLEntities::translateAndUpdate(position);             if (*ptext == '<')             {               *ptext = ' ';             }            ptext++; 	         }          else	         { "            *ptext++ = *position++;	         }        }        *ptext++ = '\0';       totlength = ptext - text;          position = text;       start = position;          while (*position)        {  	offset = position - start;/ 	// String = 0 is expensiveC
 	// word = 0;T 	if (*position == '<') 	  { 	    //iC 	    // Start of a tag.  Since tags cannot be nested, we can simplyd" 	    // search for the closing '>' 	    //e7 	    q = (unsigned char*)strchr((char *)position, '>');L 	    if (!q): 	      break; // Syntax error in the doc.  Tag never ends.
 	    tag = 0;e3 	    tag.append((char*)position, q - position + 1);" 	    position++; 	    while (isspace(*position))t
 		position++;cC 	    if (!in_space && spacebeforetags.CompareWord((char *)position)t0 		|| !in_space && !in_punct && *position != '/') 	    {4 		// These opening tags cause a space to be inserted! 		// before anything they insert.d@ 		// Tags processed here (i.e. not in nobreaktags), like <a ...>> 		// tag, are a special case: they don't actually add space in< 		// formatted text, but because in our processing it causes@ 		// a word break, we avoid word concatenation in "head" string. 		ADDSPACE(in_space);a 		in_punct = 0;s 	    } 	    do_tag(retriever, tag);C 	    if (!in_space && spaceaftertags.CompareWord((char *)position))i 	    {4 		// These closing tags cause a space to be inserted  		// after anything they insert. 		ADDSPACE(in_space);  		in_punct = 0;_ 	    } 	    position = q+1; 	  }9 	else if (*position > 0 && HtIsStrictWordChar(*position))m 	{ 	    //h5 	    // Start of a word.  Try to find the whole thing< 	    //	 	    word = 0; 	    in_space = 0; 	    in_punct = 0;1 	    while (*position && HtIsWordChar(*position))* 	    { 		word << (char)*position;
 		position++;T 		if (*position == '<')  		{I 		    q = position+1;t 		    while (isspace(*q))  			q++;e* 		    // Does this tag cause a word break?- 		    if (nobreaktags.CompareWord((char *)q))e 		    {h5 			// These tags just change character formatting and( 			// don't break words.5 			q = (unsigned char*)strchr((char *)position, '>');|	 			if (q)i 			{ 			    tag = 0;a5 			    tag.append((char*)position, q - position + 1);. 			    do_tag(retriever, tag); 			    position = q+1; 			} 		    }n 		}s 	    }   	    if (in_title && doindex)  	    { 		title << word; 	    }   	    if (in_ref) 	    {4 		if (description.length() < max_description_length) 		{  		    description << word; 		}o 		else 		{, 		    description << " ..."; 		    if (dofollow)N/ 		      retriever.got_href(*href, description);  		    in_ref = 0;g 		    description = 0; 		}. 	    }  A 	    if (head.length() < max_head_length && doindex && !in_title)d 	    { 		//  		// Capitalize H1 and H2 blocks8 		// (This is currently disabled until we can captialize% 	        // non-ASCII characters -GRHi1 	        // if (in_heading > 1 && in_heading < 4)e
 	        // {e 	        //   word.uppercase(); 
 	        // }e   		//* 		// Append the word to the head (excerpt) 		// 		  head << word;| 	    }  7 	    if (word.length() >= minimumWordLength && doindex)r 	    { 	      retriever.got_word(word,n$ 				 int(offset * 1000 / totlength), 				 in_heading);3 	    } 	} 	elses 	{ 	    //". 	    // Characters that are not part of a word 	    //a 	    if (isspace(*position)) 	    { 		ADDSPACE(in_space);w 		in_punct = 0;, 	    }	 	    elsee 	    { 		// 		// Not whitespacee 		//> 		if (head.length() < max_head_length && doindex && !in_title) 		{w2 		    // We don't want to add random chars to the ' 		    // excerpt if we're in the title.e 		    head << *position; 		}w> 		if (in_ref && description.length() < max_description_length) 		{  		    description << *position;  		}k 		if (in_title && doindex) 		{1 		    title << *position;0 		}  		in_space = 0;  		in_punct = 1;e 	    } 	    position++; 	}     }e     retriever.got_head(head);        delete text; }r    O //***************************************************************************** 7 // void HTML::do_tag(Retriever &retriever, String &tag)" // void/ HTML::do_tag(Retriever &retriever, String &tag)* {*4     char	*position = tag.get() + 1;		// Skip the '<'     char	*q, *t;     int		which, length;*       while (isspace(*position)) 	position++;       which = -1; 6     if (tags.CompareWord(position, which, length) < 0) 	return; // Nothing matched.       if (debug > 3)> 	cout << "Tag: " << position << ", matched " << which << endl;          switch (which)     {g 	case 0:		// "title" 	    if (title.length()) 	    { 		if (debug)6 		    cout << "More than one <title> tag in document!"4 			 << " (possible search engine spamming)" << endl; 		break; 	    } 	    in_title = 1; 	    in_heading = 1; 	    break;o 			f 	case 1:		// "/title"n 	    if (!in_title)n 		break; 	    in_title = 0; 	    in_heading = 0;  	    retriever.got_title(title); 	    break;> 			; 	case 2:		// "a" 	{ 	    which = -1;
 	    int pos; F 	    while ((pos = attrs.FindFirstWord(position, which, length)) >= 0) 	    { 		position += pos + length;o 		if (debug > 1)L 		    cout << "A tag: pos = " << pos << ", position = " << position << endl; 		switch (which) 		{t 		    case 1:		// "href" 		    {0 			//  			// a href seen  			// ( 			while (*position && *position != '=') 			    position++; 			if (!*position) 			    return; 			position++; 			while (isspace(*position))i 			    position++;                        // 7 		       // Allow either single quotes or double quotesi/                        // around the URL itselfi                        //p?                        if (*position == '"'||*position == '\'')q 			{ 			    position++;* 			    q = strchr(position, position[-1]); 			    if (!q)
 				break;                            //eL                            // We seem to have matched the opening quote charL                            // Mark the end of the quotes as our endpoint, soL                            // that we can continue parsing after the current"                            // text                            //e%                            *q = '\0';l                            // B                            // If a '#' is present in a quoted URL,L                            //  treat that as the end of the URL, but we skipN                            //  past the quote to parse the rest of the anchor.                            // C                            if ((t = strchr(position, '#')) != NULL)/)                                *t = '\0';  			} 			else  			{ 			    q = position; 			    while (*q &&  				   *q != '>' &&s. 				   !isspace(*q) && // *q != '?'  ???? -grh 				   *q != '#')	 				q++; 			    *q = '\0';i 			} 			if (in_ref) 			{ 			    if (debug > 1)h4 				cout << "Terminating previous <a href=...> tag,"4 				     << " which didn't have a closing </a> tag." 				     << endl;  			    if (dofollow)+ 				retriever.got_href(*href, description);= 			    in_ref = 0; 			} 			delete href;m. 			href = new URL(transSGML(position), *base); 			in_ref = 1; 			description = 0;  			position = q + 1;	 			break;y 		    }    		    case 2:		// "name" 		    {* 			//( 			// a name seen> 			// ( 			while (*position && *position != '=') 			    position++; 			if (!*position) 			    return; 			position++; 			while (isspace(*position))m 			    position++;                        // E                        // Allow either single quotes or double quotesg/                        // around the URL itselfh                        ///?                        if (*position == '"'||*position == '\'')  			{ 			    position++;* 			    q = strchr(position, position[-1]); 			    if (!q)
 				break;                            //sL                            // We seem to have matched the opening quote charL                            // Mark the end of the quotes as our endpoint, soL                            // that we can continue parsing after the current"                            // text                            //e%                            *q = '\0';                             //sB                            // If a '#' is present in a quoted URL,L                            //  treat that as the end of the URL, but we skipN                            //  past the quote to parse the rest of the anchor.                            //iC                            if ((t = strchr(position, '#')) != NULL)t)                                *t = '\0';/ 			} 			elsee 			{ 			    q = position;. 			    while (*q && *q != '>' && !isspace(*q)) 				q++;
 			*q = '\0';  			}- 			retriever.got_anchor(transSGML(position));  			position = q + 1;	 			break;c 		    }t 		    default:	 			break;  		}s 	    } 	    break;( 	}   	case 3:		// "/a"o 	    if (in_ref) 	    { 	      if (dofollow)) 		retriever.got_href(*href, description);s 	      in_ref = 0; 	    } 	    break;    	case 4:		// "h1"i 	    in_heading = 2; 	    break;d   	case 5:		// "h2"g 	    in_heading = 3; 	    break;    	case 6:		// "h3"r 	    in_heading = 4; 	    break;r   	case 7:		// "h4"y 	    in_heading = 5; 	    break;f   	case 8:		// "h5"c 	    in_heading = 6; 	    break;	   	case 9:		// "h6"v 	    in_heading = 7; 	    break;r   	case 10:	// "/h1" 	case 11:	// "/h2" 	case 12:	// "/h3" 	case 13:	// "/h4" 	case 14:	// "/h5" 	case 15:	// "/h6" 	    in_heading = 0; 	    break;    	case 16:	// "noindex" 	case 27:	// "style" 	    doindex = 0;t 	    dofollow = 0; 	    break;A   	case 17:	// "/noindex"= 	case 28:	// "/style"t 	    doindex = 1;e 	    dofollow = 1; 	    break;c   	case 18:	// "img" 	{ 	    // Handle alt parameter 	    Configuration	conf;# 	    conf.NameValueSeparators("=");  	    conf.Add(position+length);  	    if (conf["alt"])t 	    {( 		char	*alttxt = transSGML(conf["alt"]); 		if (doindex && in_title) 		    title << alttxt << " "; > 		if (in_ref && description.length() < max_description_length)# 		    description << alttxt << " ";w> 		if (doindex && !in_title && head.length() < max_head_length) 		    head << alttxt << " ";  		char	*w = HtWordToken(alttxt); 		while (w && doindex) 		{=) 		    if (strlen(w) >= minimumWordLength)> 		      retriever.got_word(w,t# 				 int((offset+(w-alttxt)) * 1000s 					/ totlength), 				 in_heading);a 		    w = HtWordToken(0);p 		}o 		w = '\0';} 	    }   	    // Handle src parameter 	    which = -1;< 	    int pos = attrs.FindFirstWord(position, which, length); 	    if (pos < 0 || which != 0)x 		break; 	    position += pos + length;* 	    while (*position && *position != '=')
 		position++;. 	    if (!*position) 		break; 	    position++; 	    while (isspace(*position))n
 		position++; 
            // 9            // Allow either single quotes or double quotesh#            // around the URL itself	
            //l3            if (*position == '"'||*position == '\'')d 	    {
 		position++;e% 		q = strchr(position, position[-1]);i	 		if (!q)/ 		    break;                //g@                // We seem to have matched the opening quote char@                // Mark the end of the quotes as our endpoint, so@                // that we can continue parsing after the current                // text                //v                *q = '\0';t                //t6                // If a '#' is present in a quoted URL,@                //  treat that as the end of the URL, but we skipB                //  past the quote to parse the rest of the anchor.                // 7                if ((t = strchr(position, '#')) != NULL)f                    *t = '\0';t 	    }	 	    else_ 	    { 		q = position; ) 		while (*q && *q != '>' && !isspace(*q)) 
 		    q++; 	    *q = '\0';t 	    }. 	    retriever.got_image(transSGML(position)); 	    break;e 	}   	case 19:	// "li"lA 	    if (doindex && !in_title && head.length() < max_head_length)e 		head << "* ";1 	    break;    	case 20:	// "meta"	 	{	    position += length; 	    Configuration	conf;# 	    conf.NameValueSeparators("=");e 	    conf.Add(position);   	    // ; 	    // First test for old-style meta tags (these break any* 	    // reasonable DTD...) 	    //o 	    if (conf["htdig-noindex"])  	      { 		retriever.got_noindex(); 		doindex = 0; 		dofollow = 0;  	      } 	    if (conf["htdig-index"])k 	      { 		doindex = 1; 		dofollow = 1;h 	      } 	    if (conf["htdig-email"])n 	    {; 		retriever.got_meta_email(transSGML(conf["htdig-email"]));o 	    }) 	    if (conf["htdig-notification-date"]). 	    {N 		retriever.got_meta_notification(transSGML(conf["htdig-notification-date"])); 	    }% 	    if (conf["htdig-email-subject"])" 	    {E 		retriever.got_meta_subject(transSGML(conf["htdig-email-subject"]));n 	    }4 	    if (conf["htdig-keywords"] || conf["keywords"]) 	    { 		//7 		// Keywords are added as being at the very top of thee) 		// document and have a weight factor of/8 		// keywords-factor which is assigned to slot 10 in the 		// factor table. 		//* 		char	*keywords = conf["htdig-keywords"]; 		if (!keywords)" 		    keywords = conf["keywords"];- 		char	*w = HtWordToken(transSGML(keywords));t 		while (w && doindex) 		{)( 		    if (strlen(w) >= minimumWordLength' 				&& ++keywordsCount <= max_keywords)g% 		      retriever.got_word(w, 1, 10);p 		    w = HtWordToken(0);( 		}) 		w = '\0';  	    } 	  	    if (conf["http-equiv"]) 	      {  " 		// <META HTTP-EQUIV=REFRESH case6 		if (mystrcasecmp(conf["http-equiv"], "refresh") == 0 		    && conf["content"])o 		  {	& 		    char *content = conf["content"];. 		    char *q = mystrcasestr(content, "url="); 		    if (q && *q)	 		      {  			q += 4; // skiping "URL=" 			char *qq = q;0 			while (*qq && (*qq != ';') && (*qq != '"') && 			       !isspace(*qq))qq++;n 			*qq = 0;i, 			URL *href = new URL(transSGML(q), *base);7 			// I don't know why anyone would do this, but hey...	 			if (dofollow)# 			  retriever.got_href(*href, "");  			delete href;e	 		      }v 		  }e 	      }   	    //h< 	    // Now check for <meta name=...  content=...> tags that- 	    // fly with any reasonable DTD out there/ 	    //   ) 	    if (conf["name"] && conf["content"])  	    { 		char	*cache = conf["name"];   ! 		which = -1; // What does it do?   / 		  // First of all, check for META description/  0 		  if (mystrcasecmp(cache, "description") == 0 $ 			 && strlen(conf["content"]) != 0) 		  {  		    //= 		    // We need to do two things. First grab the description  		    //, 		    meta_dsc = transSGML(conf["content"]);9 		   if (meta_dsc.length() > max_meta_description_length) E 		     meta_dsc = meta_dsc.sub(0, max_meta_description_length).get();  		   if (debug > 1) ? 		     cout << "META Description: " << conf["content"] << endl; & 		   retriever.got_meta_dsc(meta_dsc);     		   //?* 		   // Now add the words to the word list* 		   // (slot 11 is the new slot for this) 		   //   B 		   char        *words = HtWordToken(transSGML(conf["content"])); 		   char        *w = words;'                    while (w && doindex)  		     {& 			if (strlen(w) >= minimumWordLength) 			  retriever.got_word(w," 				 int((offset+(w-words)) * 1000 					/ totlength),	 				 11);i 			w = HtWordToken(0); 		     } 		 w = '\0'; 		}	  ' 		if (keywordsMatch.CompareWord(cache))  		{ 8 		    char	*w = HtWordToken(transSGML(conf["content"])); 		    while (w && doindex) 		    {i% 			if (strlen(w) >= minimumWordLength!' 				&& ++keywordsCount <= max_keywords);" 			  retriever.got_word(w, 1, 10); 			w = HtWordToken(0); 		    }  		    w = '\0';  		} 3 		else if (mystrcasecmp(cache, "htdig-email") == 0)o 		{ ; 		    retriever.got_meta_email(transSGML(conf["content"]));  		} ? 		else if (mystrcasecmp(cache, "htdig-notification-date") == 0)= 		{'B 		    retriever.got_meta_notification(transSGML(conf["content"])); 		} ; 		else if (mystrcasecmp(cache, "htdig-email-subject") == 0)  		{ = 		    retriever.got_meta_subject(transSGML(conf["content"]));  		} 5 		else if (mystrcasecmp(cache, "htdig-noindex") == 0)s 		  {  		    retriever.got_noindex(); 		    doindex = 0; 		    dofollow = 0;a 		  }e- 		else if (mystrcasecmp(cache, "robots") == 0 # 			 && strlen(conf["content"]) !=0)  		  { / 		    String   content_cache = conf["content"];   1 		    if (content_cache.indexOf("noindex") != -1)n	 		      {e 			doindex = 0;  			retriever.got_noindex();a	 		      }d2 		    if (content_cache.indexOf("nofollow") != -1) 		      dofollow = 0;p. 		    if (content_cache.indexOf("none") != -1)	 		      {i 			doindex = 0;  			dofollow = 0; 			retriever.got_noindex();U	 		      }  		  }  	    } 	    else if (conf["name"] &&e8 		     mystrcasecmp(conf["name"], "htdig-noindex") == 0) 	    {! 	        retriever.got_noindex();\ 	        doindex = 0;r 		dofollow = 0;S 	    } 	    break;p 	}   	case 21:	// frame 	case 24:	// embed 	case 25:	// object  	{ 	    which = -1;? 	    int pos = srcMatch.FindFirstWord(position, which, length);i 	    position += pos + length; 	    switch (which)) 	    { 		case 0:		// "src"} 		{  		    // 		    // src seeni 		    //+ 		    while (*position && *position != '=')2 			position++; 		    if (!*position) 
 			return; 		    position++;e  		    while (isspace(*position)) 			position++;                    // A                    // Allow either single quotes or double quotes +                    // around the URL itself                     //	;                    if (*position == '"'||*position == '\'')/ 		    {c 			position++;& 			q = strchr(position, position[-1]);
 			if (!q)
 			    break;6                        ///H                        // We seem to have matched the opening quote charH                        // Mark the end of the quotes as our endpoint, soH                        // that we can continue parsing after the current                        // text                        //(!                        *q = '\0';t                        //t>                        // If a '#' is present in a quoted URL,H                        //  treat that as the end of the URL, but we skipJ                        //  past the quote to parse the rest of the anchor.                        //&?                        if ((t = strchr(position, '#')) != NULL) %                            *t = '\0';l 		    }o
 		    else 		    {f 			q = position; 			while (*q &&  			       *q != '>' &&2 			       !isspace(*q) && //  *q != '?'   ??? -grh 			       *q != '#') 			    q++;	
 			*q = '\0';o 		    }  		    delete href;1 		    href = new URL(transSGML(position), *base);= 		    if (dofollow)a 		    {F 			description = 0;i* 			retriever.got_href(*href, description); 			in_ref = 0; 		    }o 		    break; 		}  		break; 	    } 	    break;i 	} 	= 	case 22:	// area  	case 26:	// link) 	{ 	    which = -1;@ 	    int pos = hrefMatch.FindFirstWord(position, which, length); 	    position += pos + length; 	    switch (which)o 	    { 		case 0:		// "href" 		{t 		    // 		    // href seen 		    //+ 		    while (*position && *position != '=')  			position++; 		    if (!*position)s
 			return; 		    position++;q  		    while (isspace(*position)) 			position++;                    //aA                    // Allow either single quotes or double quotes +                    // around the URL itself                     //r;                    if (*position == '"'||*position == '\'')  		    {  			position++;& 			q = strchr(position, position[-1]);
 			if (!q)
 			    break;n                        // H                        // We seem to have matched the opening quote charH                        // Mark the end of the quotes as our endpoint, soH                        // that we can continue parsing after the current                        // text                        //h!                        *q = '\0';)                        //\>                        // If a '#' is present in a quoted URL,H                        //  treat that as the end of the URL, but we skipJ                        //  past the quote to parse the rest of the anchor.?                        if ((t = strchr(position, '#')) != NULL) %                            *t = '\0';  		    }d
 		    else 		    {/ 			q = position; 			while (*q &&e 			       *q != '>' &&4 			       !isspace(*q) && //  *q != '?'   ???? --grh 			       *q != '#') 			    q++;	
 			*q = '\0';o 		    }  		    delete href;1 		    href = new URL(transSGML(position), *base);] 		    if (dofollow)e 		    {	 			description = 0; * 			retriever.got_href(*href, description); 			in_ref = 0; 		    }l 		    break; 		}d  
 		default: 		    break; 	    } 	    break;f 	}   	case 23:	// base  	{ 	    which = -1;@ 	    int pos = hrefMatch.FindFirstWord(position, which, length); 	    position += pos + length; 	    switch (which)r 	    { 		case 0:		// "href" 		{n+ 		    while (*position && *position != '=')  			position++; 		    if (!*position)d
 			return; 		    position++;d  		    while (isspace(*position)) 			position++;                    //iA                    // Allow either single quotes or double quotesn+                    // around the URL itselfr                    //d;                    if (*position == '"'||*position == '\'')	 		    {  			position++;& 			q = strchr(position, position[-1]);
 			if (!q)
 			    break;w                        //	H                        // We seem to have matched the opening quote charH                        // Mark the end of the quotes as our endpoint, soH                        // that we can continue parsing after the current                        // text                        //c!                        *q = '\0';r                        //;>                        // If a '#' is present in a quoted URL,H                        //  treat that as the end of the URL, but we skipJ                        //  past the quote to parse the rest of the anchor.                        //nE                        // Is there a better way of looking for these?i                        //l?                        if ((t = strchr(position, '#')) != NULL) %                            *t = '\0';  		    } 
 		    else 		    {. 			q = position; 			while (*q &&  			       *q != '>' &&1 			       !isspace(*q) && // *q != '?'   ??? -grh& 			       *q != '#') 			    q++;* 		    *q = '\0'; 		    }	/ 		    URL tempBase(transSGML(position), *base);o 		    *base = tempBase;c 		}o 	    } 	    break;s 	} 	c	 	default:p 	    return;						// Nothing..."     }" }!    O //***************************************************************************** $ // char * HTML::transSGML(char *str) // char * HTML::transSGML(char *str) {h     static String	convert;/     unsigned char	*text = (unsigned char *)str;a       convert = 0;     while (*text)d     {1 	if (*text == '&') 	{1 	    if (strncmp((char *)text, "&amp;", 5) == 0) r 	    {@ 		// We MUST convert these in URLs, regardless of translate_amp. 		convert << '&';	 		text += 5; 	    } elsel4 		convert << SGMLEntities::translateAndUpdate(text); 	} 	elser 	    convert << *text++;     }c     return convert.get();  }          while (w && doindex)  		     {& 			if (strlen(w) >= minimumWordLength) 			  retriever.got_word(w," 				 int((offset+(w-words)) * 1000 					/ totlength),	 				 11);i 			w = HtWordToken(0); 		     } 		 w = '\0'; 		}	  ' 		if (keywordsMatch.CompareWord(cache))  		{ 8 		    char	*w = HtWordToken(transSGML(conf["content"])); 		    while (w && doindex) 		    {i% 			if (strlen(w) >= minimumWordLength!'                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 