/*======================================================================
*     
*  Title:        TOOLS for VMSGREP
*                
*  Version:      1-001
*                
*  Abstract:     Some C routines used by the VMSGREP program
*                
*  Environment:  VMS
*                
*  Date:         21-AUG-1986
*                
*---------------------------------------------------------------------*/

#include <stdio.h>
#include "tools.h"

CHAR    *amatch( lin, pat, boln )
CHAR    *lin, *boln;
TOKEN   *pat;
{
   /*    Scans through the pattern template looking for a match
    * with lin. Each element of lin in compared with the template
    * until either a mis-match is found or the end of the template
    * is reached. In the former case a 0 is returned; in the latter,
    * a pointer into lin (pointing to the last character in the
    * matched pattern) is returned.
    *
    *    "lin"   is a pointer to the line being searched.
    *    "pat"   is a pointer to a template made by makepat().
    *    "boln"  is a pointer into "lin" which points at the
    *                     character at the beginning of line.
    */

   register CHAR  *bocl, *rval, *strstart;

   if (pat == 0)
      return( (CHAR *)0 );

   strstart = lin;

   while ( pat )
   {
      if (pat->tok == CLOSURE && pat->next)
      {
         /*
          *   Process a closure:
          *   First skip over the closure token to the
          *   object to be repeated. This object can be
          *   a character class.
          */

         pat = pat->next;

         /*  Now match as many occurrences of the
          *  closure pattern as possible,
          */

         bocl = lin;

         while (*lin && omatch(&lin, pat, boln))
            ;

         /*     'Lin' now points to the character that made
          *     made us fail. Now go on to the process the 
          *     rest of the string. A problem here is
          *     a character following the closure which
          *     could have been in the closure.
          *     For example, in the pattern "[a-]*t" (which
          *     matches any lower-case word ending in a t),
          *     the final 't' will be sucked up in the while
          *     loop. So, if the match fails, we back up a
          *     notch and try to match the rest of the
          *     string again, repeating this process
          *     recursively until we get back to the
          *     beginning of the closure.  The recursion
          *     goes, at most, two levels deep.
          */

         if (pat = pat->next)
            {
            while ( bocl <= lin )
               {
               if (rval = amatch(lin, pat, boln) )
                  {
                  /* success */
                  return(rval);
                  }
                  else
                  --lin;
               }
               return (0);    /* match failed */
            }
         }
      else if ( omatch(&lin, pat, boln) )
      {
      pat = pat->next;
      }
      else
      {
      return (0);
      }
   }

   /*
    *   Note that omatch() advances lin to point at the next
    *	character to be matched. Consequently, when we reach
    *	the end of the template, lin will be pointing at the
    *	character following the last character matched.
    *	The exceptions are templates containing only a 
    *	BOLN or EOLN token.  In these cases omatch doesn't
    *	advance.
    *
    *	So, decrement lin to make it point at the end of the
    *  	matched string. Then, check to make sure that we haven't
    *	decremented past the beginning of the string.
    *
    *	A philosophical point should be mentioned here.  Is $
    *	a position or a character? (i.e. does $ mean the EOL
    *	character itself or does it mean the character at the end of
    *	the line.) I decided here to make it mean the former, in
    *	order to make the behavior of amatch() consistent. If you
    *   give amatch the pattern ^$ (match all lines consisting only
    * 	of an end of line) then, since something has to be returned,
    *	a pointer to the end of line character itself is returned.
    * 	
    *	The --lin is done outside the return statement because
    *      is often a macro (which has side-effects).
    */

   --lin;
   return ( max(strstart, lin) );
}

/* ------------------------------------------------------------------ */

#ifdef DEBUG

delete( ch, str )
int             ch;
register CHAR   *str;
{
   /*     Delete th first occurrence of character from string
    *     moving everything else over a notch to fill the hole.
    */    

   ch &= 0xff;

   while ( *str && *str != ch)
      str++;
   while ( *str )
      {
      *str =(str+1);
      str++;
      }
}

#endif

/* ------------------------------------------------------------------- */    

setbit( c, field )
int     c;
CHAR    field[];
{
   /*   Set a bit in the bit ASCII bit map corresponding to the
    *   character c. Field must be a least 16 bytes long.
    */

   field[ (c & 0x7f) >> 3 ] |= 1 << (c & 0x07);
}

/* ------------------------------------------------------------------- */    

testbit( c, field )
int     c;
CHAR    *field;
{
   /*   See if a bit corresponding to c in field is set
    */

   return ( field[ (c & 0x7f) >> 3 ] & (1 << (c & 0x07)) );
}

/* ------------------------------------------------------------------ */

CHAR *dodash( delim, src, map )
int     delim;
CHAR    *src, *map;
{
   /*  Expand the set pointer to by "*src" into the bitmap "map."
    *  Stop at delim or end of string. Update *src to point
    *  at terminator. A set can have one element {x} or several
    *  elements ( {abcdefghijklmnopqrstuvwxyz} and {a-z}
    *  are equivalent ). Note that the dash notation is expanded
    *  as sequential numbers. This means (since we are using the
    *  ASCII character set) that a-z will contain the entire alphabet 
    *  plus the symbols:  [\]^_'
    *  The character classes are stored in a 16 byte wide bit field
    *  where each bit corresponds to an ASCII character.
    */

   register int   first, last;
   CHAR           *start;

   start = src;

   while (*src  &&  *src !=delim)
      {
      if (*src != '-')
         setbit( esc( &src ), map );
      else if ( src == start  ||  *(src+1) == delim )
         setbit( '-', map );
      else
         {
         src++;

         if ( *src < *(src - 2) )
            {
            first = *src;
            last  = *(src-2);
            }
            else
            {
            first = *(src - 2);
            last  = *src;
            }

         while( ++first <= last )
            setbit( first, map );

         src++;
         }
      }
   return( src );
}

/*------------------------------------------------------------------------*/

int      esc(s)
CHAR     **s;
{
   /* Map escape sequences into their equivalent symbols. Returns the
    * Correct ASCII character. c is the character following the \.     
    */

   register int    rval;

   if ( **s != ESCAPE )
      rval = *( (*s)++ );
   else
      {
      (*s)++;

      switch( toupper(**s) )
         {
         case '\0':      rval = ESCAPE;      break;
         case 'B':       rval = '\b' ;       break;
         case 'F':       rval = '\f' ;       break;
         case 'N':       rval = '\n' ;       break;      
         case 'R':       rval = '\r' ;       break;
         case 'S':       rval = ' '  ;       break;
         case 'T':       rval = '\t' ;       break;
         default:        rval = **s  ;       break;
         }

      (*s)++;
      }

   return (rval);
}                             
 
/* --------------------------------------------------------------------- */ 

TOKEN  *getpat( arg )
CHAR   *arg;
{
   /*           Translate arg into a TOKEN string
    */

   return ( makepat(arg, '\000') );
}

/* -------------------------------------------------------------------- */

#ifdef DEBUG

insert( ch, str )
int             ch;
register CHAR   *str;
{
   /*     Insert ch into str at the place pointed to by str. Move
    *     everything else over a notch
    */

   register CHAR  *bp;

   bp = str;

   while (*str)                  /*  Find the end of string       */
      str++;
   do                            /*  Move the tail over one notch */
      {
      *(str+1) = *str;
      str--;
      } while (str >= bp);

   *bp = ch;                     /*  Put the CHAR in the hole.    */ 
}

#endif

/* -------------------------------------------------------------------- */

int isalphanum(c)
int     c;
{
   /*   Return true if c is an alphabetic character or digit,
    *   false otherwise.
    */

   return( ('a' <= c && c <= 'z') ||
           ('A' <= c && c <= 'Z') ||
           ('0' <= c && c <= '9') );
}

/* -------------------------------------------------------------------- */

TOKEN    *makepat(arg, delim)
CHAR     *arg;
int      delim;
{
   /*      Make a pattern template from the string pointed to by arg.
    *      Stop when delim or '\000' or '\n' is found in arg.
    *      Return a pointer to the pattern template.
    *   
    *      The pattern templates used here are somewhat different
    *      than those used in the book; each token is a structure
    *      of the form TOKEN (see tools.h). A token consists of
    *      an identifier, a pointer to a string, a literal
    *      character and a pointer to another token. This last is 0 if
    *      there is no subsequent token.
    *
    *      The one strangeness here is caused (again) by CLOSURE which
    *      has to be put in front of the previous token. To make this
    *      insertion a little easier, the 'next' field of the last
    *      token in the chain (the one pointed to by 'tail') is made
    *      to point at the previous node. When we are finished,
    *      tail->next is set to 0.
    */

   TOKEN  *head, *tail;
   TOKEN  *ntok;
   int    error, i ;

   /*     Check for characters that aren't legal at the beginning
    *     of a template.
    */

   if (*arg=='\0' ||  *arg==delim  ||  *arg=='\n' || *arg==CLOSURE)     
      return(0);

   error = 0;
   head  = 0;
   tail  = 0;
          
   for (; *arg  &&  *arg != delim  && *arg != '\n' && !error; arg++)
      {
      ntok = (TOKEN *) malloc(TOKSIZE) ;

      if (error = !ntok)
         {
         fprintf(stderr,
         "Not enough memory for pattern templat\n");
         break;
         }

      switch(*arg)
         {
         case ANY:
            ntok->tok = ANY;
            break;

         case BOL:
            if (head==0)    /* then this is the first symbol */
               ntok->tok = BOL;
            else
               error = 1;
            break;

         case EOL:
            if ( *(arg+1) == delim || *(arg+1) == '\0'
                                   || *(arg+1) == '\n' )
               ntok->tok = EOL;
            else
               error = 1;
            break;

         case CCL:
            if (*(arg+1) == NEGATE)
               {
               ntok->tok = NCCL;
               arg += 2;
               }
               else
               {
               ntok->tok = CCL;
               arg++;
               }
                 
            if (ntok->bitmap = (CHAR *) calloc( 16, 1 ))
               {
               arg = dodash(CCLEND, arg, ntok->bitmap );
               }
               else
               {
               fprintf(stderr, "Not enough memmory for pat\n");
               error = 1;
               }
               break;

         case CLOSURE:
            switch ( tail->tok )
               {
               case BOL:
               case EOL:
               case CLOSURE:
                  return(0);
               default:
                  ntok->tok = CLOSURE;
            }
            break;
            
         default:
            ntok->tok = LITCHAR;
            ntok->lchar = esc( &arg );
            --arg; /* esc advances us past the character */
         }

      if (error)
         {
         unmakepat(head);
         return (0);
         }
      else if (head == 0)
         {
         /* This is the first node in the chain.
          */
         ntok->next = 0;
         head = tail = ntok;
         }
      else if( ntok->tok != CLOSURE)
         {
         /* Insert at end of list (after tail) */
         tail->next = ntok;
         ntok->next = tail;
         tail = ntok;
         }
      else if( head != tail )
         {
         /* More than one node in the chain. Insert the
          * CLOSURE node immediately in front of tail   
          */

         (tail->next)->next = ntok;
         ntok->next = tail;
         }
      else
         {
         /* Only one node in the chain, Insert the CLOSURE
         * node at the head of the linked list.
         */                      
                              
         ntok->next = head;
         tail->next = ntok;
         head = ntok;
         }

      } /* End for loop */
    
   tail->next = 0;
   return (head);
}

/* ---------------------------------------------------------------------- */

CHAR      *matchs(line, pat, ret_endp)
CHAR      *line;
TOKEN     *pat;
int       ret_endp;
{
   /*
    *    Compares line and pattern. Line is a character string while 
    *    pat is a pattern template made by getpat().
    *    Returns:
    *            1. A zero if no math was found.
    *            2. A pointer the last character
    *               satisfying the match if ret_endp is non-zero
    *            3. A pointer to the beginning of the matched string
    *               if ret_endp is 0;
    *
    *    For example:
    *
    *            matchs ("1234567890", getpat("4[0-9]*7"), 0);
    *
    *    will return a pointer to the '4', while
    *
    *            matchs ("1234567890", getpat("4[0-9]*7"), 1);            
    *
    *    will return a pointer to the '7'.
    */

   CHAR  *rval, *bptr;

   bptr = line;

   while (*line)
      {
      if ((rval = amatch(line,pat,bptr)) == 0)
         {
         line++;
         }
         else
         {
         rval = ret_endp ? rval : line ;
         break;
         }
      }
           
   return (rval);
}

/* ------------------------------------------------------------------------ */

CHAR   *stoupper(str)
CHAR   *str;
{
   /*
    *   Map the entire string pointed to by str to upper case
    *   Return str.
    */

   CHAR    *rval;

   rval = str;

   while (*str)
      {
      if ( 'a' <=*str  &&  *str <= 'z' )
         *str -= ('a' - 'A');                    

      str++;
      }

   return(rval);
}

/* 
---------------------------------------------------------------------------- */

int     omatch(linp, pat, boln)
CHAR    **linp, *boln;
TOKEN   *pat;
{
   /*   Match one pattern element, pointed at by pat, with the
    *   character at **limp. Return non-zero on match.
    *   Otherwise, return 0. *Limp is advanced to skip over the
    *   matched character; it is not advanced on failure. The
    *   amount of the advance is 0 for patterns that match null
    *   strings, 1 otherwise. "boln" should point at the position
    *   that will match a BOL token.
    */

   register int    advance;

   advance = -1;

   if ( **linp )
      {
      switch ( pat->tok )
         {
         case LITCHAR:
            if ( **linp == pat->lchar )
               advance = 1;
            break;

         case BOL:
            if ( *linp == boln )
               advance = 0;
            break;

         case ANY:
            if ( **linp != '\n' )
               advance = 1;
            break;

         case EOL:
            if ( **linp == '\n' )
               advance = 0;
            break;

         case CCL:
            if ( testbit( **linp, pat->bitmap ) )
               advance = 1;
            break;  

         case NCCL:                                                           
            if ( !testbit( **linp, pat->bitmap ) )
               advance = 1;
            break;  
                                                                                                                                                      
         default:
            printf("omatch: can't happen\n");
         }
      }

   if (advance >= 0)
      *linp += advance;

   return( ++advance );
}

/* ----------------------------------------------------------------------- */

#ifdef DEBUG

pr_line(ln)
register CHAR  *ln;
{
   /*   Print out ln, if a non-printing character is found, print
    *   out its numerical value in the form "\0x<hex number>".
    *   Again, this is a debugging aid.  It lets you see what's                
    *   really on the line.
    */

   for ( ; *ln ; ln++ )
      {
      if ((' ' <= *ln) && (*ln <= '~'))
         putchar(*ln);
      else
         {
         printf("\\0x%02x", *ln);
         if (*ln == '\n')
            putchar('\n');
         }
      }
}

/* ------------------------------------------------------------------------ */

pr_tok( head )
TOKEN   *head;
{
   /*   Print out the pattern template (linked list of TOKENs)
    *   pointed to by head. This is a useful debugging aid. Note
    *   that pr_tok() just scans along the linked list, terminating
    *   on a null pointer; so, you can't use pr_tok from inside
    *   makepat() because tail->next points to the previous
    *   node instead of being null. Note that NEGATE and OR_SYM
    *   are not listed because they won't occur in a template.
    */
      
   register CHAR  *str;
   register int   i;

   for (; head; head = head->next )
      {
      switch (head->tok)
         {
         case BOL:
            str = "BOL";  
            break;

         case EOL:
            str = "EOL";
            break;                 

         case ANY:
            str = "ANY";
            break;                 

         case LITCHAR:
            str = "LITCHAR";
            break;                 
 
         case ESCAPE:
            str = "ESCAPE";
            break;                 

         case CCL:
            str = "CCL";
            break;                 

         case CCLEND:
            str = "CCLEND";
            break;                 

         case NCCL:
            str = "NCCL";
            break;                 
 
         case CLOSURE:
            str = "CLOSURE";
            break;                 

         default:               
            str = "**** unknown ****"; 
         }

      printf("%-8s at: 0x%x, ", str, head);

      if (head->tok == CCL || head->tok == NCCL)
         {
         printf("string (at 0x%x) =<", head->bitmap );

         for ( i = 0; i < 0x7f ; i++)
             if (testbit(i, head->bitmap))
                putchar(i);

         printf(">, ");
         }

      else if (head->tok == LITCHAR)
         printf("lchar = %c, ", head->lchar);

      printf("next = 0x%x\n", head->next);
      
      }

   putchar('\n');
                  
}

#endif      

/* ---------------------------------------------------------------------- */

unmakepat(head)
TOKEN   *head;
{
   /* Free up the memory used for the token string   */

   register TOKEN *old_head;

   while (head)
      {
      switch (head->tok)
         {
         case CCL:
         case NCCL:
            free(head->bitmap);
            /* no break, fall through to default */

         default:
            old_head = head;
            head = head->next;
            free(old_head);
            break;
         }
      }
}

/* ---------------------------------------------------------------------- */

CHAR  *index( c, str )
CHAR  *str;
{

   /*   Return true if c is in str.
    */

   while (*str)
      if (*str++ == c)
         return str;

   return 0;
}

