 // // DocumentRef.cc  //  // Implementation of DocumentRefJ // Reference to an indexed document. Keeps track of all information storedF // on the document, either by the dig or temporary search information. // //   #include "DocumentRef.h" #include "good_strtok.h" #include <stdlib.h>  #include <ctype.h> #include <fstream.h> #include "WordList.h"  #include "Configuration.h" #include "HtURLCodec.h"  #include "HtWordType.h"   . #if defined(HAVE_LIBZ) && defined(HAVE_ZLIB_H) #include <zlib.h>  #endif   extern Configuration config;  . #if defined(HAVE_LIBZ) && defined(HAVE_ZLIB_H)- //unsigned char DocumentRef::c_buffer[32000];  // // Compress Function //& int DocumentRef::Compress(String &s) {8   static int cf=config.Value("compression_level",0);       if (cf) {      //     // Now compress s into c_s     //"     unsigned char c_buffer[16384];     String c_s; /     z_stream c_stream; /* compression stream */ "     c_stream.zalloc=(alloc_func)0;      c_stream.zfree=(free_func)0;     c_stream.opaque=(voidpf)0;.     // Get compression factor, default to best*     if (cf<-1) cf=-1; else if (cf>9) cf=9;&     int err=deflateInit(&c_stream,cf);     if (err!=Z_OK) return 0;     int len=s.length(); '     c_stream.next_in=(Bytef*)(char *)s;      c_stream.avail_in=len;8     while (err==Z_OK && c_stream.total_in!=(uLong)len) {!       c_stream.next_out=c_buffer; *       c_stream.avail_out=sizeof(c_buffer);(       err=deflate(&c_stream,Z_NO_FLUSH);>       c_s.append((char *)c_buffer,c_stream.next_out-c_buffer);     }      // Finish the stream     for (;;) {!       c_stream.next_out=c_buffer; *       c_stream.avail_out=sizeof(c_buffer);&       err=deflate(&c_stream,Z_FINISH);>       c_s.append((char *)c_buffer,c_stream.next_out-c_buffer);#       if (err==Z_STREAM_END) break; "       //CHECK_ERR(err, "deflate");     }      err=deflateEnd(&c_stream);  
     s=c_s;   }    return 1;  }    //? // Decompress routine returns 0 if decompressed 1 if compressed  //( int DocumentRef::Decompress(String &s) {8   static int cf=config.Value("compression_level",0);       if (cf) {      String c_s;      // Decompress stream"     unsigned char c_buffer[16384];     z_stream d_stream;"     d_stream.zalloc=(alloc_func)0;      d_stream.zfree=(free_func)0;     d_stream.opaque=(voidpf)0;          int len=s.length(); '     d_stream.next_in=(Bytef*)(char *)s;      d_stream.avail_in=len;     #     int err=inflateInit(&d_stream);      if (err!=Z_OK) return 1;     0     while (err==Z_OK && d_stream.total_in<len) {!       d_stream.next_out=c_buffer; *       d_stream.avail_out=sizeof(c_buffer);(       err=inflate(&d_stream,Z_NO_FLUSH);>       c_s.append((char *)c_buffer,d_stream.next_out-c_buffer);#       if (err==Z_STREAM_END) break;      }           err=inflateEnd(&d_stream);
     s=c_s;   }    return 0;  }    char *DocumentRef::DocHead() {!   if (docHeadState==Compressed) {      Decompress(docHead);     docHeadState=Uncompressed;   }    return docHead;  }   $ void DocumentRef::DocHead(char *h) {   docHead=h;6   docHeadState=docHead.length()==0?Empty:Uncompressed; }  #else     char *DocumentRef::DocHead() {   return docHead;  }   $ void DocumentRef::DocHead(char *h) {   docHead=h; }  #endif  O //*****************************************************************************  // DocumentRef::DocumentRef()  // DocumentRef::DocumentRef() {      Clear(); }     O //*****************************************************************************  // DocumentRef::~DocumentRef() // DocumentRef::~DocumentRef()  {  }     O //*****************************************************************************  // void DocumentRef::Clear() // void DocumentRef::Clear()  {      docID = 0;     docURL = 0;      docTitle = 0;       docState = Reference_normal;     docTime = 0;     docSize = 0;     docImageSize = 0;      docHead = 0;     docMetaDsc = 0;      docAccessed = 0;     docLinks = 0;      descriptions.Destroy();      docAnchors.Destroy();      docHopCount = 0;     docSig = 0;      docEmail = 0;      docNotification = 0;     docSubject = 0;      docBackLinks = 0; . #if defined(HAVE_LIBZ) && defined(HAVE_ZLIB_H)     docHeadState=Empty;  #endif }      enum {      DOC_ID,				// 0      DOC_TIME,				// 1      DOC_ACCESSED,			// 2     DOC_STATE,				// 3     DOC_SIZE,				// 4      DOC_LINKS,				// 5     DOC_IMAGESIZE,			// 6      DOC_HOPCOUNT,			// 7     DOC_URL,				// 8     DOC_HEAD,				// 9      DOC_TITLE,				// 10 $     DOC_DESCRIPTIONS,	        	// 11     DOC_ANCHORS,			// 12     DOC_EMAIL,				// 13 $     DOC_NOTIFICATION,		        // 14     DOC_SUBJECT,			// 15-     DOC_STRING,                         // 16 -     DOC_METADSC,                        // 17 -     DOC_BACKLINKS,                      // 18 -     DOC_SIG                             // 19  };  < // Must be powers of two never reached by the DOC_... enums. #define CHARSIZE_MARKER_BIT 64  #define SHORTSIZE_MARKER_BIT 128  O //***************************************************************************** ) // void DocumentRef::Serialize(String &s) 5 //   Convert all the data in the object to a string.  . //   The data is in the string is tagged with  //& void DocumentRef::Serialize(String &s) {      int		length;     String	*str;  . #if defined(HAVE_LIBZ) && defined(HAVE_ZLIB_H)%     if (docHeadState==Uncompressed) {        Compress(docHead);       docHeadState=Compressed;     }  #endif //F // The following macros make the serialization process a little easierF // to follow.  Note that if an object to be serialized has the default? // value for this class, it it NOT serialized.  This means that  // storage will be saved...  // #define addnum(id, out, var) \G  if (var != 0)                                                        \ G  {                                                                    \ G    if (var <= (unsigned char) ~1)                                     \ G    {                                                                  \ G      unsigned char _tmp = var;                                        \ G      out << (char) (id | CHARSIZE_MARKER_BIT);                        \ G      out.append((char *) &_tmp, sizeof(_tmp));                        \ G    }                                                                  \ G    else if (var <= (unsigned short int) ~1)                           \ G    {                                                                  \ G      unsigned short int _tmp = var;                                   \ G      out << (char) (id | SHORTSIZE_MARKER_BIT);                       \ G      out.append((char *) &_tmp, sizeof(_tmp));                        \ G    }                                                                  \ G    else                                                               \ G    {                                                                  \ G      out << (char) id;                                                \ G      out.append((char *) &var, sizeof(var));                          \ G    }                                                                  \   }  ! #define	addstring(id, out, str)	\ G  if (str.length())                                                    \ G  {                                                                    \ G    length = str.length();                                             \ G    if (length <= (unsigned char) ~1)                                  \ G    {                                                                  \ G      unsigned char _tmp = length;                                     \ G      out << (char) (id | CHARSIZE_MARKER_BIT);                        \ G      out.append((char *) &_tmp, sizeof(_tmp));                        \ G    }                                                                  \ G    else if (length <= (unsigned short int) ~1)                        \ G    {                                                                  \ G      unsigned short int _tmp = length;                                \ G      out << (char) (id | SHORTSIZE_MARKER_BIT);                       \ G      out.append((char *) &_tmp, sizeof(_tmp));                        \eG    }                                                                  \ G    else                                                               \iG    {                                                                  \iG      out << (char) id;                                                \"G      out.append((char *) &length, sizeof(length));                    \iG    }                                                                  \ G    out.append(str);                                                   \Z  }  9 // To keep compatibility with old databases, don't botherm@ // with long lists at all.  Bloat the size for long strings with< // one char to just keep a ~1 marker since we don't know the7 // endianness; we don't know where to put a endian-safen< // size-marker, and we probably rather want the full char toB // keep the length.  Only strings shorter than (unsigned char) ~1 > // will be "optimized"; trying to optimize strings that fit in? // (unsigned short) does not seem to give anything substantial.-  #define	addlist(id, out, list) \G  if (list.Count())                                                    \nG  {                                                                    \mG    length = list.Count();                                             \ G    if (length <= (unsigned short int) ~1)                             \fG    {                                                                  \cG      if (length <= (unsigned char) ~1)                                \ G      {                                                                \tG        unsigned char _tmp = length;                                   \eG        out << (char) (id | CHARSIZE_MARKER_BIT);                      \_G        out.append((char *) &_tmp, sizeof(_tmp));                      \EG      }                                                                \ G      else                                                             \DG      {                                                                \0G        unsigned short int _tmp = length;                              \iG        out << (char) (id | SHORTSIZE_MARKER_BIT);                     \aG        out.append((char *) &_tmp, sizeof(_tmp));                      \pG      }                                                                \*G      list.Start_Get();                                                \)@      while ((str = (String *) list.Get_Next()))		              \G      {                                                                \mG        length = str->length();                                        \ G        if (length < (unsigned char) ~1)                               \rG        {                                                              \ G          unsigned char _tmp = length;                                 \(G          out.append((char*) &_tmp, sizeof(_tmp));                     \UG        }                                                              \hG        else                                                           \sG        {                                                              \ G          unsigned char _tmp = ~1;                                     \/G          out.append((char*) &_tmp, sizeof(_tmp));                     \*G          out.append((char*) &length, sizeof(length));                 \ G        }                                                              \*G        out.append(*str);                                              \tG      }                                                                \*G    }                                                                  \/G    else                                                               \oG    {                                                                  \SG      out << (char) id;                                                \ G      out.append((char *) &length, sizeof(length));                    \ G      list.Start_Get();                                                \EG      while ((str = (String *) list.Get_Next()))                       \nG      {                                                                \eG        length = str->length();                                        \	G        out.append((char*) &length, sizeof(length));                   \	G        out.append(*str);                                              \UG      }                                                                \	G    }                                                                  \   }       addnum(DOC_ID, s, docID);I!     addnum(DOC_TIME, s, docTime);U)     addnum(DOC_ACCESSED, s, docAccessed); #     addnum(DOC_STATE, s, docState); !     addnum(DOC_SIZE, s, docSize);O#     addnum(DOC_LINKS, s, docLinks);1+     addnum(DOC_BACKLINKS, s, docBackLinks);/+     addnum(DOC_IMAGESIZE, s, docImageSize);a)     addnum(DOC_HOPCOUNT, s, docHopCount);E     addnum(DOC_SIG, s, docSig);_  >     // Use a temporary since the addstring macro will evaluate     // this multiple times.*9     String tmps = HtURLCodec::instance()->encode(docURL);t      addstring(DOC_URL, s, tmps);$     addstring(DOC_HEAD, s, docHead);*     addstring(DOC_METADSC, s, docMetaDsc);&     addstring(DOC_TITLE, s, docTitle);  /     addlist(DOC_DESCRIPTIONS, s, descriptions);i(     addlist(DOC_ANCHORS, s, docAnchors);  &     addstring(DOC_EMAIL, s, docEmail);4     addstring(DOC_NOTIFICATION, s, docNotification);*     addstring(DOC_SUBJECT, s, docSubject); }o    O //*****************************************************************************d0 // void DocumentRef::Deserialize(String &stream)A //   Extract the contents of our private variables from the givendE //   character string.  The character string is expected to have been ( //   created using the Serialize member. //- void DocumentRef::Deserialize(String &stream)  {      Clear();     char	*s = stream.get(); $     char	*end = s + stream.length();     int		length;     int		count;      int		i;      int		x;      String	*str;  9 // There is a problem with getting a numeric value into a : // numeric unknown type that may be an enum (the other way& // around is simply by casting (int)).> //  Supposedly the enum incarnates as a simple type, so we can) // just check the size and copy the bits. ' #define MEMCPY_ASSIGN(to, from, type) \oG  do {                                                                 \ G    type _tmp = (type) (from);                                         \vG    memcpy((char *) &(to), (char *) &_tmp, sizeof(to));                \I  } while (0)   #define NUM_ASSIGN(to, from) \G  do {                                                                 \ G    if (sizeof(to) == sizeof(unsigned long int))                       \sG      MEMCPY_ASSIGN(to, from, unsigned long int);                      \ G    else if (sizeof(to) == sizeof(unsigned int))                       \oG      MEMCPY_ASSIGN(to, from, unsigned int);                           \oG    else if (sizeof(to) == sizeof(unsigned short int))                 \ G      MEMCPY_ASSIGN(to, from, unsigned short int);                     \ G    else if (sizeof(to) == sizeof(unsigned char))                      \ G      MEMCPY_ASSIGN(to, from, unsigned char);                          \ G    /* else fatal error here? */                                       \   } while (0)   #define	getnum(type, in, var) \ G  if (type & CHARSIZE_MARKER_BIT)                                      \ G  {                                                                    \ G    NUM_ASSIGN(var, *(unsigned char *) in);                            \cG    in += sizeof(unsigned char);                                       \nG  }                                                                    \ G  else if (type & SHORTSIZE_MARKER_BIT)                                \eG  {                                                                    \ G    unsigned short int _tmp0;                                          \ G    memcpy((char *) &_tmp0, (char *) (in), sizeof(unsigned short));    \cG    NUM_ASSIGN(var, _tmp0);                                            \nG    in += sizeof(unsigned short int);                                  \ G  }                                                                    \ G  else                                                                 \ G  {                                                                    \cG    memcpy((char *) &var, in, sizeof(var));                            \nG    in += sizeof(var);                                                 \   }  " #define	getstring(type, in, str) \G  getnum(type, in, length);                                            \ G  str = 0;                                                             \tG  str.append(in, length);                                              \h
  in += lengtho  ! #define	getlist(type, in, list) \ G  getnum(type, in, count);                                             \zG  if (type & (CHARSIZE_MARKER_BIT | SHORTSIZE_MARKER_BIT))             \hG  {                                                                    \ G    for (i = 0; i < count; i++)                                        \ G    {                                                                  \sG      unsigned char _tmp = *(unsigned char *) in;                      \ G      in += sizeof(_tmp);                                              \tG      if (_tmp < (unsigned char) ~1)                                   \lG        length = _tmp;                                                 \ G      else                                                             \ G        getnum(~(CHARSIZE_MARKER_BIT | SHORTSIZE_MARKER_BIT), in,      \ G               length);                                                \uG      str = new String;                                                \oG      str->append(in, length);                                         \oG      list.Add(str);                                                   \ G      in += length;                                                    \sG    }                                                                  \ G  }                                                                    \uG  else                                                                 \oG  {                                                                    \oG    for (i = 0; i < count; i++)                                        \ G    {                                                                  \sG      getnum(~(CHARSIZE_MARKER_BIT | SHORTSIZE_MARKER_BIT), in,        \iG             length);                                                  \ G      str = new String;                                                \ G      str->append(in, length);                                         \tG      list.Add(str);                                                   \ G      in += length;                                                    \eG    }                                                                  \p  }       while (s < end)t     { !         x = (unsigned char) *s++; B         switch (x & ~(CHARSIZE_MARKER_BIT | SHORTSIZE_MARKER_BIT))	         {          case DOC_ID:              getnum(x, s, docID);             break;         case DOC_TIME:"             getnum(x, s, docTime);             break;         case DOC_ACCESSED:&             getnum(x, s, docAccessed);             break;         case DOC_STATE: #             getnum(x, s, docState);n             break;         case DOC_SIZE:"             getnum(x, s, docSize);             break;         case DOC_LINKS:t#             getnum(x, s, docLinks);              break;         case DOC_IMAGESIZE: '             getnum(x, s, docImageSize);              break;         case DOC_HOPCOUNT:&             getnum(x, s, docHopCount);             break; 	case DOC_BACKLINKS:  	    getnum(x, s, docBackLinks); 	    break;  	case DOC_SIG: 	    getnum(x, s, docSig); 	    break;o         case DOC_URL:  	    {A 	      // Use a temporary since the addstring macro will evaluateh 	      // this multiple times. 	      String tmps;t 	      getstring(x, s, tmps);   5 	      docURL = HtURLCodec::instance()->decode(tmps);* 	    } 	    break;          case DOC_HEAD:%             getstring(x, s, docHead); . #if defined(HAVE_LIBZ) && defined(HAVE_ZLIB_H)>             docHeadState=docHead.length()==0?Empty:Compressed; #endif             break; 	case DOC_METADSC:! 	    getstring(x, s, docMetaDsc);p 	    break;          case DOC_TITLE: &             getstring(x, s, docTitle);             break;         case DOC_DESCRIPTIONS:(             getlist(x, s, descriptions);             break;         case DOC_ANCHORS:n&             getlist(x, s, docAnchors);             break;         case DOC_EMAIL:s&             getstring(x, s, docEmail);             break;         case DOC_NOTIFICATION:-             getstring(x, s, docNotification);m             break;         case DOC_SUBJECT:D(             getstring(x, s, docSubject);             break; 	case DOC_STRING: 1 	  // This is just a debugging string. Ignore it.a 	    break;a         default:@             cerr << "BAD TAG IN SERIALIZED DATA: " << x << endl;             return;o	         }t     }d }r    O //*****************************************************************************S, // void DocumentRef::AddDescription(char *d) //) void DocumentRef::AddDescription(char *d), {s     if (!d || !*d)         return;,       while (isspace(*d))s         d++;       if (!d || !*d)         return;I       String	desc = d;     desc.chop(" \t");O  G     // Add the description text to the word database with proper factor*J     // Do this first because we may have reached the max_description limitC     // This also ensures we keep the proper weight on descriptions o     // that occur many times       static WordList *words = 0;e     >     if (!words) // Hey... We only want to do this once, right?     {n 	words = new WordList();* 	words->WordTempFile(config["word_list"]);- 	words->BadWordFile(config["bad_word_list"]);(     }        words->DocumentID(docID);           // Parse words. -     char         *p                   = desc;tO     static int    minimum_word_length = config.Value("minimum_word_length", 3);hL     static double description_factor  = config.Double("description_factor");L     static int    max_descriptions    = config.Value("max_descriptions", 5);       String word;       while (*p)     { 7       // Reset contents before adding chars each round.        word = 0;p  $       while (*p && HtIsWordChar(*p))         word << *p++;          HtStripPunctuation(word);   /       if (word.length() >= minimum_word_length)(?         // The wordlist takes care of lowercasing; just add it. 4         words->Word(word, 0, 0, description_factor);  +       while (*p && !HtIsStrictWordChar(*p))          p++;     }   !     // And let's flush the words!g     words->Flush();      /     // Now are we at the max_description limit?)1     if (descriptions.Count() >= max_descriptions)f
   	return;   	)     descriptions.Start_Get();      String	*description;>     while ((description = (String *) descriptions.Get_Next()))     {f8         if (mystrcasecmp(description->get(), desc) == 0)             return;e     }g'     descriptions.Add(new String(desc));M }P    O //*****************************************************************************r' // void DocumentRef::AddAnchor(char *a)  //$ void DocumentRef::AddAnchor(char *a) {,
     if (a)#     	docAnchors.Add(new String(a));  }                             \ G  {                                                                    \ G    NUM_ASSIGN(var, *(unsigned char *) in);                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  