/*****************************************************************************/
/*
                              SearchHtmlFile.c

VERSION HISTORY
---------------
19-AUG-97  MGD  MapUrl() to MapUrl_Map() for conditional mapping
23-MAY-97  MGD  escape entities (e.g. "&lt;") in HTML text before searching
07-FEB-95  MGD  corrected HTML search bug, removed "/extract" script name
                for HTML files, HTTPD can retrieve these directly
24-NOV-94  MGD  minor revisions, improved <TITLE> retrieval
10-JUN-94  MGD  initial development
*/
/*****************************************************************************/

/* standard C header files */
#include <stdio.h>
#include <ctype.h>

/* VMS related header files */
#include <rmsdef.h>
#include <rms.h>
#include <ssdef.h>
#include <stsdef.h>

/* DEC C pragma applied only to Alpha builds; presumably requests that
   structure members not be naturally aligned (i.e. packed) so that layouts
   match the VAX build -- TODO confirm against the compiler documentation */
#ifdef __ALPHA
#   pragma nomember_alignment
#endif

/* minimal boolean type; values are plain ints */
#define boolean int
#define true 1
#define false 0
 
/* test a VMS status value: VMSok() is non-zero when the STS$M_SUCCESS bit
   is set in 'x', VMSnok() is the logical negation of that test */
#define VMSok(x) ((x) & STS$M_SUCCESS)
#define VMSnok(x) !(((x) & STS$M_SUCCESS))

/* external declarations */

/* when non-zero, diagnostic output is written to stdout by this module */
extern boolean  Debug;
/* stream that all generated HTML is written to */
extern FILE  *HttpOut;

/* called here as (text, search string, case-sensitive flag, &matched length);
   a non-NULL result is treated as a pointer to the hit within the text */
extern char* SearchTextString (char*, char*, boolean, int*);
/* called here as (NULL, file name, NULL, NULL, NULL); the result is printed
   with "%s" as the URL corresponding to the file name */
extern char* MapUrl_Map (char*, char*, char*, char*, void*);

/*****************************************************************************/
/*
Search an HTML marked up file.  The '<' and '>' characters are counted (they 
should be balanced, an odd count meaning "inside a tag") and only text found 
outside of markup tags is searched, with the common character entities 
(e.g. "&lt;") converted back into their characters first.  Text between 
<TITLE> and </TITLE> (if present) is copied into 'DocumentName' and is not 
itself searched.

For the first hit in a file an anchor to the document is written to 'HttpOut'.
Unless 'DocumentOnly' is true each line containing a hit is then written as a 
list item with the matched text underlined; if it is true the search of the 
file stops at the first hit.

Parameters:

   FileName, FileNameLength   VMS file specification passed to RMS
   DocumentName               buffer receiving the title text; up to 255 
                              characters plus a terminating null are stored, 
                              so it must provide at least 256 bytes; whatever 
                              it contains at the time of the first hit is 
                              used as the anchor text
   UriSearchString            not used by this function
   SearchString               string passed to SearchTextString()
   SearchStringLength         not used by this function
   RecordCountPtr             incremented by the number of records read
   FileHitCountPtr            incremented by one if the file had any hit
   TotalHitCountPtr           incremented by one for each record with a hit; 
                              if zero at the first hit the "<P><OL>" opening 
                              the list is also written
   CaseSensitive              passed through to SearchTextString()
   DocumentOnly               report the document only, not each line

Returns a VMS status value; the status of a failed sys$open() or sys$connect(),
SS$_NORMAL at end-of-file or after reporting a record too big for the buffer, 
otherwise the status of the last sys$get().

NOTE(review): no prototypes for strsame() and CopyTextIntoHtml() are visible 
in this file; strsame() is presumably a case-insensitive, length-limited 
comparison - confirm against their definitions.
*/ 

/*
Decode the character entity beginning at the '&' pointed to by 'rptr'.  The 
resulting character is stored at 'ChPtr' and a pointer to the first character 
following what was consumed is returned.  Anything not recognised as an entity 
results in the '&' itself with just that one character consumed.

Numeric entities ("&#nnn;") are only accepted for the values 1 to 255.  A 
value of zero would embed a null character, silently truncating the text being 
searched, and larger values cannot be represented in a single character.
*/ 

static char* SearchHtmlEntity
(
char *rptr,
char *ChPtr
)
{
   int  Value;
   char  *cptr;

   if (strsame (rptr, "&lt;", 4))
   {
      *ChPtr = '<';
      return (rptr + 4);
   }
   if (strsame (rptr, "&gt;", 4))
   {
      *ChPtr = '>';
      return (rptr + 4);
   }
   if (strsame (rptr, "&amp;", 5))
   {
      *ChPtr = '&';
      return (rptr + 5);
   }
   if (strsame (rptr, "&quot;", 6))
   {
      *ChPtr = '\"';
      return (rptr + 6);
   }
   if (strsame (rptr, "&nbsp;", 6))
   {
      *ChPtr = ' ';
      return (rptr + 6);
   }
   if (*(rptr+1) == '#')
   {
      /* accumulate the value, saturating so that it cannot overflow */
      Value = 0;
      for (cptr = rptr+2; isdigit((unsigned char)*cptr); cptr++)
         if (Value <= 255) Value = Value * 10 + (*cptr - '0');
      if (*cptr == ';' && Value >= 1 && Value <= 255)
      {
         *ChPtr = (char)Value;
         return (cptr + 1);
      }
   }

   /* not an entity, just an ampersand */
   *ChPtr = *rptr;
   return (rptr + 1);
}

int SearchHtmlFile
(
char *FileName,
int FileNameLength,
char *DocumentName,
char *UriSearchString,
char *SearchString,
int SearchStringLength,
int *RecordCountPtr,
int *FileHitCountPtr,
int *TotalHitCountPtr,
boolean CaseSensitive,
boolean DocumentOnly
)
{
   /* count of '<' and '>' seen so far, odd when inside a markup tag */
   register int  InsideTag = 0;
   register char  *dptr, *rptr, *sptr, *tptr;

   boolean  RetrievingTitle = false;
   int  status,
        HitCount = 0,
        MatchedLength,
        RecordNumber = 0;
   char  Record [1024+1],
         Text [2048];
   /*
      Receives the HTML-escaped version of a line of up to 1024 characters.
      Sized on the assumption that CopyTextIntoHtml() expands a character to
      no more than 6 bytes (e.g. "&quot;"), plus the fixed markup added here
      -- TODO confirm against the definition of CopyTextIntoHtml().
   */
   char  String [(1024*6)+32];
   struct FAB  FileFAB;
   struct RAB  FileRAB;

   /*********/
   /* begin */
   /*********/

   if (Debug)
      fprintf (stdout, "SearchHtmlFile() |%s|%s|\n", FileName, SearchString);

   FileFAB = cc$rms_fab;
   FileFAB.fab$b_fac = FAB$M_GET;
   FileFAB.fab$l_fna = FileName;  
   FileFAB.fab$b_fns = FileNameLength;
   FileFAB.fab$b_shr = FAB$M_SHRGET;

   if (VMSnok (status = sys$open (&FileFAB, 0, 0)))
   {
      if (Debug) fprintf (stdout, "sys$open() %%X%08.08X\n", status);
      return (status);
   }

   FileRAB = cc$rms_rab;
   FileRAB.rab$l_fab = &FileFAB;
   /* 2 buffers and read ahead performance option */
   FileRAB.rab$b_mbf = 2;
   FileRAB.rab$l_rop = RAB$M_RAH;
   FileRAB.rab$l_ubf = Record;
   /* leave room for the terminating null added after each read */
   FileRAB.rab$w_usz = sizeof(Record)-1;

   if (VMSnok (status = sys$connect (&FileRAB, 0, 0)))
   {
      if (Debug) fprintf (stdout, "sys$connect() %%X%08.08X\n", status);
      sys$close (&FileFAB, 0, 0);
      return (status);
   }

   /**********************/
   /* search all records */
   /**********************/

   while (VMSok (status = sys$get (&FileRAB, 0, 0)))
   {
      RecordNumber++;
      if (!FileRAB.rab$w_rsz) continue;
      Record[FileRAB.rab$w_rsz] = '\0';
      if (Debug) fprintf (stdout, "Record |%s|\n", Record);

      /* terminate on any carriage control that may be in the record */
      for (rptr = Record; *rptr && *rptr != '\r' && *rptr != '\n'; rptr++);
      *rptr = '\0';
      /* continue if empty line */
      if (!Record[0]) continue;

      /**************************************/
      /* retrieve text not inside HTML tags */
      /**************************************/

      /* 'Text' can never receive more characters than the record contains */
      tptr = Text;
      rptr = Record;
      while (*rptr)
      {
         if (*rptr == '<' || *rptr == '>')
         {
            InsideTag++;
            rptr++;
            if (InsideTag & 1)
            {
               /* checks to detect start and end of title */
               if (toupper((unsigned char)*rptr) == 'T')
               {
                  if (strsame (rptr, "TITLE>", 6))
                  {
                     /* leave the '>' to be counted by the next iteration */
                     rptr += 5;
                     dptr = DocumentName;
                     RetrievingTitle = true;
                  }
               }
               if (RetrievingTitle && *rptr == '/')
               {
                  if (strsame (rptr, "/TITLE>", 7))
                  {
                     /* leave the '>' to be counted by the next iteration */
                     rptr += 6;
                     *dptr = '\0';
                     RetrievingTitle = false;
                  }
               }
            }
         }
         else
         if (RetrievingTitle)
         {
            /* title text, silently truncated at 255 characters */
            if (dptr < DocumentName+255)
               *dptr++ = *rptr++;
            else
               rptr++;
         }
         else
         if (InsideTag & 1)
         {
            /* inside a tag, not searchable text */
            rptr++;
         }
         else
         if (*rptr == '&')
         {
            rptr = SearchHtmlEntity (rptr, tptr);
            tptr++;
         }
         else
            *tptr++ = *rptr++;

      }  /* while (*rptr) */
 
      *tptr = '\0';
      if (!Text[0]) continue;

      tptr = SearchTextString (Text, SearchString,
                               CaseSensitive, &MatchedLength);
      if (tptr != NULL)
      {
         /********/
         /* hit! */
         /********/

         if (Debug) fprintf (stdout, "Hit |%s|\n", tptr);
         if (!HitCount++)
         {
            /******************************/
            /* first hit, document anchor */
            /******************************/

            if (!*TotalHitCountPtr) fputs ("<P>\n<OL>\n", HttpOut);

            fprintf (HttpOut, "<LI>HTML document: <A HREF=\"%s\">\"%s\"</A>\n",
                     MapUrl_Map(NULL,FileName,NULL,NULL,NULL), DocumentName);
            if (!DocumentOnly) fprintf (HttpOut, "<UL>\n");
         }
         *TotalHitCountPtr += 1;
         if (DocumentOnly) break;

         /********************************/
         /* display line hit occurred in */
         /********************************/

         sptr = String;
         strcpy (sptr, "<LI>");
         sptr += 4;
         /* copy the text up to the first character of the search string */
         sptr = CopyTextIntoHtml (sptr, Text, tptr-Text);
         /* matched string, highlighted */
         strcpy (sptr, "<U>");
         sptr += 3;
         sptr = CopyTextIntoHtml (sptr, tptr, MatchedLength);
         strcpy (sptr, "</U>");
         sptr += 4;
         /* rest of Text after the matched search string */
         sptr = CopyTextIntoHtml (sptr, tptr+MatchedLength, -1);
         *sptr++ = '\r'; *sptr++ = '\n'; *sptr = '\0';

         if (Debug) fprintf (stdout, "String |%s|\n", String);
         fputs (String, HttpOut);
      }
   }

   /***************/
   /* end of file */
   /***************/

   /* a title never closed would otherwise leave the name unterminated */
   if (RetrievingTitle) *dptr = '\0';

   if (status == RMS$_EOF) status = SS$_NORMAL;
   sys$close (&FileFAB, 0, 0);

   if (status == RMS$_RTB)
   {
      /* the search of this file stopped at the over-length record */
      fprintf (HttpOut,
"<P><B>ERROR!</B> <I>Record (line) %d too big for query search buffer!</I>\
<BR><TT>%s</TT> <!-- %s --><P>\n",
      RecordNumber, MapUrl_Map(NULL,FileName,NULL,NULL,NULL), FileName);
      status = SS$_NORMAL;
   }

   if (HitCount)
   {
      if (!DocumentOnly) fputs ("</UL>\n", HttpOut);
      *FileHitCountPtr += 1;
   }
   if (InsideTag & 1)
   {
      /* must have encountered an opening '<' without a closing '>' */
      fprintf (HttpOut,
      "<P><B>HTML problem</B>, unbalanced &lt;&gt; in <TT>\"%s\"</TT><P>\n",
      MapUrl_Map (NULL,FileName,NULL,NULL,NULL));
   }

   *RecordCountPtr += RecordNumber;

   return (status);
}

/*****************************************************************************/
