 // // SGMLEntities.cc //! // Implementation of SGMLEntities 2 // Translate SGML entities to ISO 8859 equivalents // // #if RELEASE Z static char RCSid[] = "$Id: SGMLEntities.cc,v 1.10.2.1 1999/11/24 02:06:47 grdetil Exp $"; #endif   #include "SGMLEntities.h"  #include "htString.h"  #include <ctype.h> #include <stdlib.h>  #include "Configuration.h" #include "StringMatch.h"   extern Configuration config;   static SGMLEntities	junk;   
 static struct  {      char			*entity;      unsigned char	equiv; } entities[] =   {      { "lt",	      '<' } ,      { "gt",	      '>' } ,      { "amp",	      '&' } ,     { "quot",	      '"' } , -     { "trade",	      153 } , /* trade mark */ 6     { "nbsp",         160 } , /* non breaking space */=     { "iexcl",        161 } , /* inverted exclamation mark */ -     { "cent",         162 } , /* cent sign */ .     { "pound",        163 } , /* pound sign */1     { "curren",       164 } , /* currency sign */ ,     { "yen",          165 } , /* yen sign */A     { "brvbar",       166 } , /* broken vertical bar, (brkbar) */ 0     { "sect",         167 } , /* section sign */4     { "uml",          168 } , /* spacing diaresis */2     { "copy",         169 } , /* copyright sign */>     { "ordf",         170 } , /* feminine ordinal indicator */>     { "laquo",        171 } , /* angle quotation mark, left */1     { "not",          172 } , /* negation sign */ /     { "shy",          173 } , /* soft hyphen */ =     { "reg",          174 } , /* circled R registered sign */ 2     { "hibar",        175 } , /* spacing macron *//     { "deg",          176 } , /* degree sign */ 6     { "plusmn",       177 } , /* plus-or-minus sign */1     { "sup2",         178 } , /* superscript 2 */ 1     { "sup3",         179 } , /* superscript 3 */ 6     { "acute",        180 } , /* spacing acute (96) */.     { "micro",        181 } , /* micro sign */2     { "para",         182 } , /* paragraph sign */.     { "middot",       183 } , /* middle dot */3     { "cedil",        184 } , /* spacing cedilla */ 1     { "sup1",         185 } , /* superscript 1 */ ?     { "ordm",         186 } , /* masculine ordinal indicator */ ?     { "raquo",        187 } , /* angle quotation mark, right */ 0     { "frac14",       188 } , /* fraction 1/4 */0     { "frac12",       189 } , /* fraction 1/2 */0     { "frac34",       190 } , /* fraction 3/4 */:     { "iquest",       191 } , /* inverted question mark */<     { "Agrave",       192 } , /* capital A, grave accent */ <     { "Aacute",       193 } , /* capital A, acute accent */ A     { "Acirc",        194 } , /* capital A, circumflex accent */  5     { "Atilde",       195 } , /* capital A, tilde */  G     { "Auml",         196 } , /* capital A, dieresis or umlaut mark */  4     { "Aring",        197 } , /* capital A, ring */ D     { "AElig",        198 } , /* capital AE diphthong (ligature) */ 7     { "Ccedil",       199 } , /* capital C, cedilla */  <     { "Egrave",       200 } , /* capital E, grave accent */ <     { "Eacute",       201 } , /* capital E, acute accent */ A     { "Ecirc",        202 } , /* capital E, circumflex accent */  G     { "Euml",         203 } , /* capital E, dieresis or umlaut mark */  <     { "Igrave",       205 } , /* capital I, grave accent */ <     { "Iacute",       204 } , /* capital I, acute accent */ A     { "Icirc",        206 } , /* capital I, circumflex accent */  G     { "Iuml",         207 } , /* capital I, dieresis or umlaut mark */  D     { "ETH",          208 } , /* capital Eth, Icelandic (Dstrok) */ 5     { "Ntilde",       209 } , /* capital N, tilde */  <     { "Ograve",       210 } , /* capital O, grave accent */ <     { "Oacute",       211 } , /* capital O, acute accent */ A     { "Ocirc",        212 } , /* capital O, circumflex accent */  5     { "Otilde",       213 } , /* capital O, tilde */  G     { "Ouml",         214 } , /* capital O, dieresis or umlaut mark */  8     { "times",        215 } , /* multiplication sign */ 5     { "Oslash",       216 } , /* capital O, slash */  <     { "Ugrave",       217 } , /* capital U, grave accent */ <     { "Uacute",       218 } , /* capital U, acute accent */ A     { "Ucirc",        219 } , /* capital U, circumflex accent */  G     { "Uuml",         220 } , /* capital U, dieresis or umlaut mark */  <     { "Yacute",       221 } , /* capital Y, acute accent */ =     { "THORN",        222 } , /* capital THORN, Icelandic */  H     { "szlig",        223 } , /* small sharp s, German (sz ligature) */ :     { "agrave",       224 } , /* small a, grave accent */ :     { "aacute",       225 } , /* small a, acute accent */ ?     { "acirc",        226 } , /* small a, circumflex accent */  2     { "atilde",       227 } , /* small a, tilde */E     { "auml",         228 } , /* small a, dieresis or umlaut mark */  1     { "aring",        229 } , /* small a, ring */ B     { "aelig",        230 } , /* small ae diphthong (ligature) */ 5     { "ccedil",       231 } , /* small c, cedilla */  :     { "egrave",       232 } , /* small e, grave accent */ :     { "eacute",       233 } , /* small e, acute accent */ ?     { "ecirc",        234 } , /* small e, circumflex accent */  E     { "euml",         235 } , /* small e, dieresis or umlaut mark */  :     { "igrave",       236 } , /* small i, grave accent */ :     { "iacute",       237 } , /* small i, acute accent */ ?     { "icirc",        238 } , /* small i, circumflex accent */  E     { "iuml",         239 } , /* small i, dieresis or umlaut mark */  9     { "eth",          240 } , /* small eth, Icelandic */  3     { "ntilde",       241 } , /* small n, tilde */  :     { "ograve",       242 } , /* small o, grave accent */ :     { "oacute",       243 } , /* small o, acute accent */ ?     { "ocirc",        244 } , /* small o, circumflex accent */  3     { "otilde",       245 } , /* small o, tilde */  E     { "ouml",         246 } , /* small o, dieresis or umlaut mark */  1     { "divide",       247 } , /* division sign */ 3     { "oslash",       248 } , /* small o, slash */  :     { "ugrave",       249 } , /* small u, grave accent */ :     { "uacute",       250 } , /* small u, acute accent */ ?     { "ucirc",        251 } , /* small u, circumflex accent */  E     { "uuml",         252 } , /* small u, dieresis or umlaut mark */  :     { "yacute",       253 } , /* small y, acute accent */ ;     { "thorn",        254 } , /* small thorn, Icelandic */  D     { "yuml",         255 } , /* small y, dieresis or umlaut mark */     { 0, 0 }   };  O //*****************************************************************************  SGMLEntities::SGMLEntities() {      trans = new Dictionary();      init();  }     O //*****************************************************************************  SGMLEntities::~SGMLEntities()  {      trans->Release();      delete trans;  }     O //***************************************************************************** 
 unsigned char % SGMLEntities::translate(char *entity)  {      if (!entity || !*entity) 	return ' ';#     if (junk.trans->Exists(entity))      { 6 	return (unsigned char) ((int) (*junk.trans)[entity]);     } 2     else if (*entity == '#' && isdigit(entity[1]))     {  	// 3 	// This looks like a numeric entity.  That's fine.  	//  	return atoi(entity + 1);      }      else     { ? 	return ' ';	// Unrecognized entity.  Change it into a space...      }  }   O //*****************************************************************************  void SGMLEntities::init() { ,     for (int i = 0; entities[i].entity; i++)     { > 	trans->Add(entities[i].entity, (Object *) entities[i].equiv);     }  }   O //***************************************************************************** N // This method does the same as the translate method, but it will also advanceK // the character pointer to the next character after the entity.  This will H // allow us to encapsulate the entity rules inside this routine.  (There8 // was some confusion on how entities are terminated...) //
 unsigned char = SGMLEntities::translateAndUpdate(unsigned char *&entityStart)  {      String		entity; &     unsigned char	*orig = entityStart;B     static int		translate_quot = config.Boolean("translate_quot");@     static int		translate_amp = config.Boolean("translate_amp");D     static int		translate_lt_gt = config.Boolean("translate_lt_gt");          if (*entityStart == '&')= 	entityStart++;		// Don't need the '&' that starts the entityi;     while ((isalnum(*entityStart) || *entityStart == '.' ||o3 	    *entityStart == '-' || *entityStart == '#') &&n 	   entity.length() < 10)t       {  	entity << *entityStart++;       }n       if ( !translate_quot )       {i 	//]. 	// Do NOT translate entities for '"' (quote). 	//   $ 	static StringMatch *quoteMatch = 0; 	if (!quoteMatch)  	  {$ 	    quoteMatch = new StringMatch();% 	    quoteMatch->Pattern("quot|#34");  	  }    	if (quoteMatch->hasPattern() &&2 	    quoteMatch->FindFirstWord(entity.get()) >= 0) 	  { 	    entityStart = orig + 1; 	    return '&'; 	  }       }        if ( !translate_amp )        {" 	// 4 	// Do NOT translate entities for '&' since they can: 	// occur in code samples that might end up in an excerpt. 	//e  " 	static StringMatch *ampMatch = 0; 	if (!ampMatch)  	  {" 	    ampMatch = new StringMatch();" 	    ampMatch->Pattern("amp|#38"); 	  } 	  	if (ampMatch->hasPattern() &&0 	    ampMatch->FindFirstWord(entity.get()) >= 0) 	  { 	    entityStart = orig + 1; 	    return '&'; 	  }       }m       if ( !translate_lt_gt )        {  	//n< 	// Do NOT translate entities for '<' and '>' since they can: 	// occur in code samples that might end up in an excerpt. 	//*  # 	static StringMatch *ltgtMatch = 0;a 	if (!ltgtMatch) 	  {# 	    ltgtMatch = new StringMatch();*) 	    ltgtMatch->Pattern("lt|#60|gt|#62");r 	  }   	if (ltgtMatch->hasPattern() && 1 	    ltgtMatch->FindFirstWord(entity.get()) >= 0)  	  { 	    entityStart = orig + 1; 	    return '&'; 	  }       }u       if (entity.length() >= 10)       {* 	//oE 	// This must be a bogus entity.  It can't be more than 10 charactersdE 	// long.  Well, just assume it was an error and return just the '&'.s 	//g 	entityStart = orig + 1; 	return '&';     }s          if (*entityStart == ';')+ 	entityStart++;		// A final ';' is used up. (     unsigned char e = translate(entity);:     if (e == ' ' && strncmp((char *)orig, "&#32", 4) != 0)     { : 	entityStart = orig + 1;	// Catch unrecognized entities... 	return '&';     }9
     return e;o }/*/:     { "iquest",       191 } , /* inverted question mark */<     { "Agrave",       192 } , /* capital A, grave accent */ <     { "Aacute",       193 } , /* capital A, ac                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                