 // // DocumentDB.cc // // Implementation of DocumentDB  // // //   #include "DocumentDB.h"  #include <stdio.h> #include <stdlib.h>  #include <ctype.h> #include <unistd.h>  #include <fstream.h> #include "Database.h"  #include "HtURLCodec.h"    #ifdef __VMS #include <descrip.h> #include <unixlib.h> #include <sor$routines.h>    #define SOR$M_STABLE   1  #define SOR$M_NOSIGNAL 8 /* ? */  S static struct dsc$descriptor_s tmp_dsc = { 0, DSC$K_DTYPE_T, DSC$K_CLASS_S, NULL };     // Action routine to decc$to_vms& // Stupid call sequence, if you ask me. static int GetVMSName(char *VMSName, int type) { ;     if (type == DECC$K_DIRECTORY || type == DECC$K_FOREIGN)           return 1; // try another>     // Hopefully the contents of VMSName is available globally,     tmp_dsc.dsc$w_length  = strlen(VMSName);$     tmp_dsc.dsc$a_pointer = VMSName;
     return 0;  }   & // Utility routine to cmp_int_to_tab()- inline unsigned int times10(unsigned int num)  { %     return ((num << 3) + (num << 1));  }   5 // User compare routine (parameter to sor$begin_sort)  static int cmp_int_to_tab(     char *rec1,      char *rec2,      unsigned short *len1,      unsigned short *len2,      unsigned long *context )  {      unsigned int num1 = 0,                  num2 = 0;!     for (; *rec1 != '\t'; ++rec1) & 	num1 = times10(num1) + (*rec1 - '0');!     for (; *rec2 != '\t'; ++rec2) & 	num2 = times10(num2) + (*rec2 - '0');7     return (num1 < num2) ? -1 : (num1 == num2) ? 0 : 1;  }  #endif  O //*****************************************************************************  // DocumentDB::DocumentDB()  // DocumentDB::DocumentDB() {      isopen = 0;      isread = 0;      nextDocID = 0;     myTryUncoded = 1;  }     O //*****************************************************************************  // DocumentDB::~DocumentDB() // DocumentDB::~DocumentDB()  {      if (isopen) 	 	Close();  }     O //***************************************************************************** ' // int DocumentDB::Open(char *filename) E //   We will attempt to open up an existing document database.  If it D //   doesn't exist, we'll create a new one.  If we are succesfull inA //   opening the database, we need to look for our special record 0 //   which contains the next document ID to use. //$ int DocumentDB::Open(char *filename) { *     dbf = Database::getDatabaseInstance(); 	 1     if (dbf->OpenReadWrite(filename, 0664) == OK)      {  	String		data;' 	if (dbf->Get("nextDocID", data) == OK)  	{ 	    nextDocID = atoi(data); 	} 	isopen = 1; 	return OK;      }      else 	return NOTOK; }     O //***************************************************************************** ' // int DocumentDB::Read(char *filename) > //   We will attempt to open up an existing document database. //$ int DocumentDB::Read(char *filename) { *     dbf = Database::getDatabaseInstance(); 	 &     if (dbf->OpenRead(filename) == OK)     {  	isopen = 1; 	isread = 1; 	return OK;      }      else 	return NOTOK; }     O //*****************************************************************************  // int DocumentDB::Close()E //   Close the database.  Before we close it, we first need to update A //   the special record which keeps track our nextDocID variable.  // int DocumentDB::Close()  {      String	data;       if (!isread)     {  	data << nextDocID; 6 	dbf->Put("nextDocID", data.get(), data.length() + 1);     }        dbf->Close();      delete dbf;      dbf = 0;     isopen = 0;      isread = 0;      return OK; }     O //***************************************************************************** ( // int DocumentDB::Add(DocumentRef &doc) //% int DocumentDB::Add(DocumentRef &doc)  {      String	url;      url = doc.DocURL(); =     // Why would we want to lowercase the URL before storing? "     // URLs can be case sensitive!     //    url.lowercase();
     temp = 0;      doc.Serialize(temp);  >     // If in compatibility-mode, there may be an unencoded url6     // that has to be deleted to avoid duplicate URLs.     if (myTryUncoded)        dbf->Delete(url);   8     dbf->Put(HtURLCodec::instance()->encode(url), temp);     return OK; }     O //***************************************************************************** 1 // DocumentRef *DocumentDB::operator [] (char *u)  //. DocumentRef *DocumentDB::operator [] (char *u) {      String			data;     String			url = u; 6     // Why would we lowercase the URL before using it?"     // URLs can be case sensitive!     // url.lowercase();   D     if (dbf->Get(HtURLCodec::instance()->encode(url), data) == NOTOK<         && (! myTryUncoded || dbf->Get(url, data) == NOTOK))
 	return 0;  (     DocumentRef		*ref = new DocumentRef;     ref->Deserialize(data);      return ref;  }     O //***************************************************************************** " // int DocumentDB::Exists(char *u) // int DocumentDB::Exists(char *u)  {      String			url = u; :     // Why would we lowercase, URLs can be case-sensitive!     //    url.lowercase();  ;     return dbf->Exists(HtURLCodec::instance()->encode(url)) ,       || (myTryUncoded && dbf->Exists(url)); }     O //***************************************************************************** " // int DocumentDB::Delete(char *u) // int DocumentDB::Delete(char *u)  {      String			url = u; B     // Why would we lowercase the URL, they can be case-sensitive!     // url.lowercase(); G     int delete_stat = dbf->Delete(HtURLCodec::instance()->encode(url));   =     // If the deletion was not successful (maybe the item did F     // not exist) delete the unencoded URL if we should be compatible.-     return (delete_stat != 0 && myTryUncoded) '       ? dbf->Delete(url) : delete_stat;  }     O //***************************************************************************** 1 // int DocumentDB::CreateSearchDB(char *filename) A //   Create an extract from our database which can be used by the B //   search engine.  The extract will consist of lines with fields( //   separated by tabs.  The fields are: //        docID  //        docURL //        docTime  //        docHead  //        docMetaDsc* //        descriptions (separated by tabs) //) //   The extract will be sorted by docID.  //. int DocumentDB::CreateSearchDB(char *filename) {      DocumentRef	        *ref; "     List		*descriptions, *anchors;     char		*key;      String		data;      FILE		*fl;&     String		tmpdir = getenv("TMPDIR");   #ifdef __VMS     if (tmpdir.length() == 0)  	tmpdir = "SYS$SCRATCH:"; 3     char *tmpfile = tempnam(tmpdir.get(), "htdig"); $     tmpdir = tmpfile; free(tmpfile);     tmpdir << ".TMP"; I     fl = fopen(tmpdir.get(), "w", "rfm=var", "rat=cr", VMS_OPEN_OPTIONS);  #else !     String		command = SORT_PROG ;   $     command << " -n -o" << filename;     if (tmpdir.length())     {  	command << " -T " << tmpdir;      }      fl = popen(command, "w");  #endif       dbf->Start_Get(); #     while ((key = dbf->Get_Next()))      {  	dbf->Get(key, data); C 	if (strncmp(HtURLCodec::instance()->decode(key), "http:", 5) == 0)  	{ 	    ref = new DocumentRef;  	    ref->Deserialize(data);% 	    fprintf(fl, "%d", ref->DocID()); * 	    fprintf(fl, "\tu:%s", ref->DocURL());, 	    fprintf(fl, "\tt:%s", ref->DocTitle());, 	    fprintf(fl, "\ta:%d", ref->DocState());1 	    fprintf(fl, "\tm:%d", (int) ref->DocTime()); + 	    fprintf(fl, "\ts:%d", ref->DocSize()); + 	    fprintf(fl, "\th:%s", ref->DocHead()); . 	    fprintf(fl, "\th:%s", ref->DocMetaDsc());5 	    fprintf(fl, "\tl:%d", (int) ref->DocAccessed()); , 	    fprintf(fl, "\tL:%d", ref->DocLinks());0 	    fprintf(fl, "\tI:%d", ref->DocImageSize()); 	    fprintf(fl, "\td:"); ( 	    descriptions = ref->Descriptions(); 	    String	*description;  	    descriptions->Start_Get();  	    int		first = 1;@ 	    while ((description = (String *) descriptions->Get_Next())) 	    {
 		if (!first)  		    fprintf(fl, "\001"); 		first = 0;( 		fprintf(fl, "%s", description->get()); 	    } 	    fprintf(fl, "\tA:"); ! 	    anchors = ref->DocAnchors();  	    String	*anchor; 	    anchors->Start_Get(); 	    first = 1; 6 	    while ((anchor = (String *) anchors->Get_Next())) 	    {
 		if (!first)  		    fprintf(fl, "\001"); 		first = 0;# 		fprintf(fl, "%s", anchor->get());  	    } 	    fprintf(fl, "\n");      	    delete ref; 	}     }    #ifdef __VMS     fclose(fl);        String              record;      int                 sortRC; 9     unsigned long       options = 0 /* or SOR$M_STABLE */  #ifdef VMS_SORT_NOSIGNAL1                                   |SOR$M_NOSIGNAL  #endif     ; $     unsigned long       context = 0;(     struct dsc$descriptor_s tmpfile_dsc,)                             filename_dsc;   0     tmpfile_dsc.dsc$w_length  = tmpdir.length();.     tmpfile_dsc.dsc$b_dtype   = DSC$K_DTYPE_T;.     tmpfile_dsc.dsc$b_class   = DSC$K_CLASS_S;-     tmpfile_dsc.dsc$a_pointer = tmpdir.get();   5     if (decc$to_vms(filename, GetVMSName, 0, 0) == 0)      { B 	cerr << "Can't translate output file name" << filename << "\n\n";	 	exit(1);      }      filename_dsc = tmp_dsc;        sortRC = sor$pass_files(C         &tmpfile_dsc,   // input file descriptor (NULL for records) D         &filename_dsc,  // output file descriptor (NULL for records)G         NULL,           // output file organization (default FAB$C_SEQ) H         NULL,           // output file ercord format (default FAB$C_VAR)R         NULL,           // output file bucket size (relative & indexed files only)@         NULL,           // output file block size (magtape only)H         NULL,           // output file max record size (0 = don't check)I         NULL,           // output file preallocated blocks (default 1000) >         NULL,           // file-handling options (default DFW)C         NULL,           // size of fixed portion (VFC records only) "         &context        // context     ); #ifdef VMS_SORT_NOSIGNAL     if (!(sortRC & 1))     { 9 	cerr << "Document sort failed, rc=" << sortRC << "\n\n"; 	 	exit(1);      }  #endif     sortRC = sor$begin_sort(.         NULL,           // description of keysJ         NULL,           // length of longest record (req for record input)"         &options,       // optionsd         NULL,           // input file size in blocks (n/a for record input, optional for file input)/         cmp_int_to_tab, // user compare routine -         NULL,           // user equal routine ?         NULL,           // type of sort (default SOR$GK_RECORD) ;         NULL,           // number of work files (default 2) "         &context        // context     ); #ifdef VMS_SORT_NOSIGNAL     if (!(sortRC & 1))     { 9 	cerr << "Document sort failed, rc=" << sortRC << "\n\n"; 	 	exit(1);      }  #endif  &     sortRC = sor$sort_merge(&context); #ifdef VMS_SORT_NOSIGNAL     if (!(sortRC & 1))     { 9 	cerr << "Document sort failed, rc=" << sortRC << "\n\n"; 	 	exit(1);      }  #endif  $     sortRC = sor$end_sort(&context);       remove(tmpdir.get());  #else      int	sortRC = pclose(fl);     if (sortRC)      { $ 	cerr << "Document sort failed\n\n";	 	exit(1);      }  #endif
     return 0;  }     O //*****************************************************************************  // List *DocumentDB::URLs() 2 //   Return a list of all the URLs in the database // List *DocumentDB::URLs() {      List	*list = new List;     char	*coded_key;       dbf->Start_Get(); )     while ((coded_key = dbf->Get_Next()))      { 8 	String key = HtURLCodec::instance()->decode(coded_key);) 	if (mystrncasecmp(key, "http:", 5) == 0)  	{% 	    DocumentRef	*ref = (*this)[key]; 
 	    if (ref) + 	    	list->Add(new String(ref->DocURL()));              delete ref;  	}     }      return list; }                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               