/*------------------------------------------------------------
| ps2text.c
|
| - this program extracts text from Adobe PS files
|
| - under VMS, invoke as a foreign symbol
|	ps2text :== $wherever:ps2text.exe
| - under Ultrix, just type the executable name
|
| - first param is PS,
|	if present 2nd param is ascii file
|	if 2nd param is missing, output is to the terminal
| - program is not character set dependent, though it may not
|   work with 2-byte PS
|
| Note that there are no global variables.
|   Please keep it that way.
|
| Notice also that this code is completely portable to Ultrix cc.
|   Please keep it that way.
|
| Program written by David Parmenter, CUP ITG/E
|
| Modified:
|   Mark Goodrich, SDT Base Technologies & Environments - 11/24/93
+------------------------------------------------------------*/

#include <stdio.h>
#include <ctype.h>

#ifdef vms
#include <stdlib.h>
#endif

#include "ps2text.h"

/*
 * Forward declarations for the routines defined below.  They are
 * written without parameter lists, matching the old-style (K&R)
 * function definitions used throughout this file.
 */
int verify_adobe_ps();		/* tests the first line for a PS signature */
int skip_prolog();		/* reads lines up to the begindocument marker */
void skip_to_trailer();		/* reads lines up to TRAILER_C */
int get_creator_name();		/* finds the CREATOR_C line, returns the name */
void print_text();		/* writes one text token, decoding \ddd */
void do_vertical();		/* emits newline/tab for a position change */
void interpret();		/* acts on one operator token */
char *next_token();		/* splits a preprocessed line into tokens */
int preprocess_line();		/* normalises escapes and parens in a line */
int main_loop();		/* processes one (possibly embedded) PS file */


/*------------------------------------------------------------
| Verify that the input file is in fact an Adobe PS file
| First line always '%!...'
|
| Reads one line from ps_fp and tests it with Strnequal against
| the PS_C and MSPS_C signatures.
|
| Returns TRUE when either signature matches.  Returns FALSE when
| neither matches, or when no line could be read at all (empty
| file or read error).
+------------------------------------------------------------*/
int
verify_adobe_ps(ps_fp)
    FILE		*ps_fp;			/* ps file */
{
    char		string[MAX_BUF];	/* text buffer */

	/* a failed read leaves string unset, so it must not be compared */

    if (fgets(string, MAX_BUF, ps_fp) == NULL)
	return(FALSE);

    if (Strnequal(string, PS_C))		/* test for a match */
	return(TRUE);
    return(Strnequal(string, MSPS_C));		/* test for a match */
}


/*------------------------------------------------------------
| prolog skipper.
|
| Reads lines from ps_fp until one matches the creator's
| begindocument marker.
|
| Returns TRUE as soon as that marker is seen.
| Returns FALSE if the creator's enddocument marker or TRAILER_C
| turns up first, or if the file ends (or cannot be read) before
| any of them is found.
+------------------------------------------------------------*/
int skip_prolog(ps_fp, creator_data)
    FILE		*ps_fp;			/* ps file */
    CREATOR		*creator_data;		/* creator-specific data */
{
    char		string[MAX_BUF];	/* utility string */

	/* loop on the result of fgets itself: testing feof() before the */
	/* read would compare a stale buffer after the last line and */
	/* would never terminate on a read error */

    while (fgets(string, MAX_BUF, ps_fp) != NULL)
	{
	if (Strnequal(string, creator_data->begindocument))
	    return(TRUE);
	if (Strnequal(string, creator_data->enddocument) ||
		Strnequal(string, TRAILER_C))
	    return(FALSE);
	}
    return(FALSE);
}


/*-----------------------------------------------------------
| skips to %%Trailer
|
| in the event of an error, we want to skip to the end of the
| document, or included file, and see where we are.
|
| Reads and discards lines from ps_fp until one matches
| TRAILER_C, or until nothing more can be read.  On return the
| stream is positioned just after the matching line (or at the
| end of the file).
+----------------------------------------------------------*/
void
skip_to_trailer(ps_fp)
    FILE		*ps_fp;			/* ps file */
{
    char		string[MAX_BUF];	/* utility string */

	/* stop on a failed read as well as on a match, so that a read */
	/* error cannot keep the loop spinning */

    while (fgets(string, MAX_BUF, ps_fp) != NULL)
	{
	if (Strnequal(string, TRAILER_C))
	    return;
	}
}


/*------------------------------------------------------------
| determine creator
|
| Reads lines from ps_fp until one matches CREATOR_C, then copies
| the remainder of that line (after the tag and any blanks that
| follow it) into creator_name.  Whatever fgets kept at the end of
| the line, including a newline, is copied too.
|
| creator_name must have room for a full line (MAX_BUF bytes).
|
| Returns TRUE on success, FALSE if the file ends or cannot be
| read before a creator line is found; creator_name is left
| untouched in that case.
+------------------------------------------------------------*/
int
get_creator_name(ps_fp, creator_name)
    FILE		*ps_fp;			/* PS file */
    char		creator_name[];		/* receives the name */
{
    int			i;			/* index start of name */
    int			length;			/* length of the line read */
    char		string[MAX_BUF];	/* buffer */

    for (;;)
	{
		/* a NULL from fgets covers both end of file and read */
		/* errors; feof() alone would loop forever on the latter */

	if (fgets(string, MAX_BUF, ps_fp) == NULL)
	    return(FALSE);			/* we couldn't find a creator */
	if (Strnequal(string, CREATOR_C))
	    break;
	}

	/* step over the blanks between the tag and the name; the */
	/* line length is computed once rather than on every pass */

    length = strlen(string);
    for (i = strlen(CREATOR_C); i < length; i++)
	if (string[i] != ' ') break;

    strcpy(creator_name, &string[i]);		/* load in creator name */
    return(TRUE);				/* success */
}


/*------------------------------------------------------------
| print routine.
|
|	print out whatever text is received.
|
|	8-bit chars look like this: \xyz (3 octal digits)
|	Some special characters may have been defined in the
|	specialchars section of the DAT file.  If so, we print
|	the associated string.
|
|	Anything else beginning with '\' we can skip the '\' and print
|	the next byte.
|
|	A '\' that is the very last character of the text is dropped.
|	The routine never reads beyond the terminating '\0', even when
|	an escape is cut short.
|
|	The octal value is reduced to one byte (unsigned char) before
|	it is used to index creator_data->special_chars.
+------------------------------------------------------------*/
void
print_text(ascii_fp, text, creator_data)
    FILE		*ascii_fp;		/* fp to the output file */
    char		*text;			/* text to be printed */
    CREATOR		*creator_data;		/* creator_data */
{
    unsigned char	c;			/* receives octal values */

    while (*text)				/* traverse the string */
	{
	if (*text != '\\')
	    {
	    fputc(*text, ascii_fp);		/* print the character */
	    text++;
	    continue;
	    }

	text++;					/* step over the '\' */
	if (*text == '\0')			/* nothing follows it */
	    break;

		/* the tests run left to right and stop at the first */
		/* failure, so a '\0' inside the escape is never passed */

	if (text[0] >= '0' && text[0] <= '7' &&
		text[1] >= '0' && text[1] <= '7' &&
		text[2] >= '0' && text[2] <= '7')
	    {
		/* convert octal back to single byte */

	    c = (unsigned char) (((text[0] - '0') * 8 + (text[1] - '0')) * 8
				    + (text[2] - '0'));
	    text += 3;

		/* if this is a special char, print the special string */
		/* else just print the character */

	    if (creator_data->special_chars[c] != NULL)
		fprintf(ascii_fp, "%s", creator_data->special_chars[c]);
	    else
		fputc(c, ascii_fp);
	    }
	else
	    {
	    fputc(*text, ascii_fp);		/* not octal: print next byte */
	    text++;
	    }
	}
}


/*------------------------------------------------------------
| Emit the white space implied by a move to the position
| (new_horizontal, new_vertical), then record that position in
| *horizontal and *vertical.
|
| The vertical change is measured as
|	abs(new_vertical) - abs(*vertical)
| (abs() because some ps's use negative positioning), and two
| independent tests are applied to it:
|
|   - larger than paragraph_test:		one '\n'
|   - change + paragraph_test/6 reaches
|     paragraph_test:				one '\n'
|     otherwise, if either coordinate differs
|     from the stored one:			one '\t'
|
| When DEBUGGING_VERTICAL is set in creator_data->flags the new and
| old coordinates are traced on stdout before anything is emitted.
+------------------------------------------------------------*/
void
do_vertical(ascii_fp, vertical, new_vertical, horizontal, new_horizontal, creator_data)
    FILE		*ascii_fp;	/* fp to the output file */
    COORD		*vertical;	/* in/out: stored vertical position */
    COORD		new_vertical;	/* vertical position moved to */
    COORD		*horizontal;	/* in/out: stored horizontal position */
    COORD		new_horizontal;	/* horizontal position moved to */
    CREATOR		*creator_data;	/* supplies flags and paragraph_test */
{
    COORD		change;		/* change in vertical magnitude */
    int			line_break;	/* change is about one line or more */
    int			moved;		/* position differs from stored one */

    if (creator_data->flags & DEBUGGING_VERTICAL)
	{
	fprintf(stdout,"\n\tnew (X,Y) %d %d, old (X,Y) %d %d\n",
		new_horizontal, new_vertical, *horizontal, *vertical);
	fflush(stdout);
	}

    change = abs(new_vertical) - abs(*vertical);

    line_break = ((change + (creator_data->paragraph_test/6)) >= creator_data->paragraph_test);
    moved = (new_vertical != *vertical || new_horizontal != *horizontal);

	/* paragraph-sized jump: this newline is in addition to the */
	/* one issued by the line test below */

    if (change > creator_data->paragraph_test)
	fputc('\n', ascii_fp);

    if (line_break)
	fputc('\n', ascii_fp);
    else if (moved)
	fputc('\t', ascii_fp);

	/* remember where we are now */

    *horizontal = new_horizontal;
    *vertical = new_vertical;
}


/*------------------------------------------------------------
| Act on one non-text token.
|
| token is compared, in the order below, with the operator names
| held in creator_data.  The first match decides what is written
| to ascii_fp and how *vertical / *horizontal are updated; operands
| are taken from the LAST_TOKEN() history via TOKEN_COORD().
|
|   delta_y, delta_xy		relative move   -> do_vertical()
|   absolute_y, absolute_xy	absolute move   -> do_vertical()
|				(an absolute_xy operator spelled "SB"
|				 takes its position from history slots
|				 3 and 4, then prints slot 1 as text and
|				 advances x by slot 0)
|   text			' '
|   delta_x			' ' when the move exceeds kern
|   absolute_x			'\t'
|   beginpage			position reset, "\n\n"
|   endpage			position reset, "\n\f"
|   rule			RULE, position nudged by one unit
|
| A token that matches none of these is taken to be an operand: the
| history is shifted up one slot and the token stored in slot 0.
| An empty token is ignored.
+------------------------------------------------------------*/
void
interpret(ascii_fp, token, vertical, horizontal, creator_data)
    FILE		*ascii_fp;		/* fp to the output file */
    char		*token;			/* token to act on */
    COORD		*vertical;		/* in/out: current y position */
    COORD		*horizontal;		/* in/out: current x position */
    CREATOR		*creator_data;		/* creator specific data */
{
    COORD		new_vertical = 0;	/* receives new value for y */
    COORD		new_horizontal = 0;	/* receives new value for x */
    int			i = 0;			/* history index */
    int			is_sb;			/* operator is spelled "SB" */

    if (!*token)				/* nothing to do for "" */
	return;

	/* relative y movement: add the delta, then do the motion */

    if (Strequal(token, creator_data->delta_y))
	{
	new_vertical = *vertical + TOKEN_COORD(LAST_TOKEN(0));
	new_horizontal = *horizontal;
	do_vertical(ascii_fp, vertical, new_vertical, horizontal, new_horizontal, creator_data);
	return;
	}

    if (Strequal(token, creator_data->delta_xy))
	{
	new_vertical = *vertical + TOKEN_COORD(LAST_TOKEN(0));
	new_horizontal = *horizontal + TOKEN_COORD(LAST_TOKEN(1));
	do_vertical(ascii_fp, vertical, new_vertical, horizontal, new_horizontal, creator_data);
	return;
	}

	/* absolute y movement: take the operand as is, then do the motion */

    if (Strequal(token, creator_data->absolute_y))
	{
	new_vertical = TOKEN_COORD(LAST_TOKEN(0));
	new_horizontal = *horizontal;
	do_vertical(ascii_fp, vertical, new_vertical, horizontal, new_horizontal, creator_data);
	return;
	}

    if (Strequal(token, creator_data->absolute_xy))
	{
	is_sb = Strequal(token, "SB");

	if (is_sb)
	    {
	    new_vertical = TOKEN_COORD(LAST_TOKEN(3));
	    new_horizontal = TOKEN_COORD(LAST_TOKEN(4));
	    }
	else
	    {
	    new_vertical = TOKEN_COORD(LAST_TOKEN(0));
	    new_horizontal = TOKEN_COORD(LAST_TOKEN(1));
	    }

	do_vertical(ascii_fp, vertical, new_vertical, horizontal, new_horizontal, creator_data);

	if (is_sb)
	    {
	    *horizontal += TOKEN_COORD(LAST_TOKEN(0));
	    print_text(ascii_fp, LAST_TOKEN(1), creator_data);
	    }
	return;
	}

	/* the text operator: separate it from what follows with a ' ' */

    if (Strequal(token, creator_data->text))
	{
	*horizontal += 1;
	fputc(' ', ascii_fp);
	return;
	}

	/* relative x movement: a ' ' unless it is small enough to be a kern */

    if (Strequal(token, creator_data->delta_x))
	{
	*horizontal += TOKEN_COORD(LAST_TOKEN(0));
	if (abs(TOKEN_COORD(LAST_TOKEN(0))) > creator_data->kern)
	    fputc(' ', ascii_fp);
	return;
	}

	/* absolute x motion: a tab, to approximate tables */

    if (Strequal(token, creator_data->absolute_x))
	{
	*horizontal = TOKEN_COORD(LAST_TOKEN(0));
	fputc('\t', ascii_fp);
	return;
	}

	/* new page: reset the position and skip two lines */

    if (Strequal(token, creator_data->beginpage))
	{
	*vertical = 0;
	*horizontal = 0;
	fprintf(ascii_fp, "\n\n");
	return;
	}

	/* end of page: reset the position and issue a page eject */

    if (Strequal(token, creator_data->endpage))
	{
	*vertical = 0;
	*horizontal = 0;
	fprintf(ascii_fp, "\n\f");
	return;
	}

	/* horizontal rule: put out RULE as an approximation */

    if (Strequal(token, creator_data->rule))
	{
	*vertical += 1;			/* nudge it forward by one unit */
	*horizontal += 1;
	fprintf(ascii_fp, RULE);
	return;
	}

	/* not an operator we know: remember it as the newest operand */

    i = MAX_TOKENS-1;
    while (i > 0)
	{
	LAST_TOKEN(i) = LAST_TOKEN(i-1);
	i--;
	}
    LAST_TOKEN(0) = token;
}


/*------------------------------------------------------------
| This is a reimplementation of strtok(), specialized to use PS delimiters.
|
| If string is not NULL, it is copied into a private buffer and parsing
| starts at its beginning.  Subsequent tokens of the same string are
| fetched by passing NULL.  Each call returns a pointer to the next
| token (inside the private buffer) and stores its type through
| token_type.  When no more tokens are found it returns NULL and leaves
| *token_type alone.
|
| tok_start indexes to the beginning of the current token.  tok_end
| is advanced until a valid token terminator is seen.  If the token
| is not text, the terminator is a ' '.  If it's text (it starts with
| '('), the terminator is the next ')' and the token returned excludes
| both parens.  A text token whose ')' is missing runs to the end of
| the string instead of beyond it.
|
| A text token is reported as TEXT_TOKEN, unless POSTPROCESS is set in
| flags, in which case it is reported as OPERATOR.
|
| At most MAX_BUF-1 characters of string are kept; anything longer is
| cut off rather than written past the private buffer.
|
| Note that this uses static data and hence is not re-entrant; a
| returned token stays valid only until the next call that passes a
| new string.
+------------------------------------------------------------*/

char
*next_token(string, token_type, flags)		/* pass back a char ptr */
    char		string[];	/* string to parse, or NULL */
    int			*token_type;	/* return type of token */
    unsigned long	flags;		/* process flags */
{
    static char		keep_string[MAX_BUF];
					/* private copy of the string */
    static char		*keep_end;	/* its terminating '\0' */
    static char		*tok_start;	/* beginning of current token */
    static char		*tok_end;	/* end of current token */

    if (string != NULL)			/* starting a new parse */
	{
		/* bounded copy; keep_end finishes on the terminator */

	keep_end = &keep_string[0];
	while (*string != '\0' && keep_end < &keep_string[MAX_BUF - 1])
	    *keep_end++ = *string++;
	*keep_end = '\0';

	tok_start = tok_end = &keep_string[0];
	}
    else				/* a subsequent call */
	{
	if (tok_end == keep_end)	/* previous token ended the string */
	    return(NULL);
	tok_start = ++tok_end;		/* step past the old terminator */
	}

    while (*tok_start == ' ')		/* leading blanks are not tokens */
	tok_start++;

    if (*tok_start == '\0')		/* only blanks were left */
	{
	tok_end = keep_end;		/* stay exhausted on later calls */
	return(NULL);
	}

    if (*tok_start == '(')		/* it's a text token */
	{
	if (flags & POSTPROCESS)
	    *token_type = OPERATOR;
	else
	    *token_type = TEXT_TOKEN;
	tok_end = tok_start++;		/* token begins after the '(' */

		/* parse forward to ')'; also stop at the terminator so */
		/* an unbalanced '(' cannot carry us off the buffer */

	while ((*tok_end != ')') && (*tok_end != '\0'))
	    tok_end++;
	}
    else				/* it's an operator */
	{
	*token_type = OPERATOR;
	tok_end = tok_start;
	while ((*tok_end != ' ') && (*tok_end != '\0'))
	    tok_end++;			/* parse to ' ', or to '\0' */
	}

    *tok_end = '\0';			/* stick in a zero */
    return(tok_start);			/* pass back a pointer to the token */
}


/*------------------------------------------------------------
| - Handle most PS specific text stuff, rewriting string in place:
| - separate tokens which may be adjacent to one another.  Document
|    does this 'XY(foo)S', which is really three tokens.  A space is
|    put in front of every '(' and after every ')'.
| - newlines and returns become spaces.
| - takes PS escapes which begin with a back slash and recodes them so
|   that print_text() and next_token() have an easy time of it.
|   Every escape leaves here as '\' followed by exactly three octal
|   digits giving the character code:
|	\\ \( \)		the code of the escaped character
|	\n \r \t \b \f		the code of that control character
|	\d \dd \ddd		the octal value, zero padded, reduced
|				to one byte
|	\<anything else>	the code of that character
|   A '\' directly before a newline/return (line continuation) is
|   dropped together with the line end, and a '\' that ends the string
|   is dropped.  Then, any further parens will definitely denote
|   strings, and any backslashes will have 3 digits of octal info
|   trailing after.
| - handles dangerous situation of unbalanced parens on one line, by
|   getting next line from ps_fp, until number of left and right parens
|   is balanced or nothing more can be read.
|
| string must be a buffer of MAX_BUF bytes.
|
| Returns TRUE on success.  Returns FALSE, leaving the rewrite
| unfinished, when string is NULL or when the rewritten line does
| not fit in MAX_BUF bytes (main_loop reports FALSE through
| Exit_error).
+------------------------------------------------------------*/

int
preprocess_line(ps_fp, string)
    FILE		*ps_fp;			/* ps file */
    char		string[];		/* string to be preprocessed */
{
    char		*source;		/* read position in string */
    char		*target;		/* write position in buffer */
    char		*limit;			/* slot kept for the final '\0' */
    char		buffer[MAX_BUF];	/* rewritten line is built here */
    char		piece[4];		/* output for one input char */
    int			n;			/* bytes used in piece */
    int			k;			/* index into piece */
    int			value;			/* character code of an escape */
    int			digits;			/* octal digits consumed */

    int			nested_parens = 0;	/* '(' seen minus ')' seen */

    if (!string)
	return(FALSE);				/* just in case */

    source = &string[0];
    target = &buffer[0];
    limit = &buffer[MAX_BUF - 1];

    while (*source)				/* traverse entire string */
	{
	piece[0] = *source;			/* default: copy the char */
	n = 1;

	if (*source == '\n' || *source == '\r')	/* strip linefeeds and returns */
	    piece[0] = ' ';

	else if (*source == '\\')
	    {
	    if (source[1] == '\0')		/* '\' ends the string */
		n = 0;

	    else if (source[1] == '\n' || source[1] == '\r')
		{				/* line continuation */
		source++;
		if (source[0] == '\r' && source[1] == '\n')
		    source++;
		n = 0;
		}

	    else
		{
		source++;			/* the escaped character */

		if (*source >= '0' && *source <= '7')
		    {				/* one to three octal digits */
		    value = *source - '0';
		    digits = 1;
		    while (digits < 3 && source[1] >= '0' && source[1] <= '7')
			{
			source++;
			value = value * 8 + (*source - '0');
			digits++;
			}
		    }
		else
		    {
		    switch(*source)
			{
			case 'n':  value = '\n';  break;
			case 'r':  value = '\r';  break;
			case 't':  value = '\t';  break;
			case 'b':  value = '\b';  break;
			case 'f':  value = '\f';  break;
			default:		/* \\ \( \) and the rest */
			    value = (unsigned char) *source;
			    break;
			}
		    }

		piece[0] = '\\';		/* always '\' + 3 digits */
		piece[1] = (char) ('0' + ((value >> 6) & 3));
		piece[2] = (char) ('0' + ((value >> 3) & 7));
		piece[3] = (char) ('0' + (value & 7));
		n = 4;
		}
	    }

	else if (*source == '(')	/* insert a space before a string */
	    {				/* we know this is not a '\(', */
	    piece[0] = ' ';		/* that's handled above */
	    piece[1] = '(';
	    n = 2;
	    nested_parens++;
	    }

	else if (*source == ')')	/* ditto, space goes after */
	    {
	    piece[0] = ')';
	    piece[1] = ' ';
	    n = 2;
	    nested_parens--;
	    }

	if (n > (int) (limit - target))	/* would not fit with its '\0' */
	    return(FALSE);
	for (k = 0; k < n; k++)
	    *target++ = piece[k];

	source++;

	/* if we have nested parens, then get another line, and keep going. */
	/* everything read so far is already in buffer, so the new line */
	/* can reuse string from its start without running off its end */

	if ((*source == '\0') && (nested_parens != 0))
	    {
	    source = &string[0];
	    if (fgets(source, MAX_BUF, ps_fp) == NULL)
		*source = '\0';		/* nothing more to read */
	    }
	}

    *target = '\0';			/* terminate it */

    strcpy(string, buffer);		/* copy it */
    return(TRUE);			/* pass it back */
}


/*------------------------------------------------------------
| Here's the main loop.  This gets called once for each PS file.  It
| recurses for included PS files.
|
| It finds the creator line, loads the creator data, skips the prolog
| and then gets line after line.  Comment lines are ignored, real lines
| are preprocessed and split into tokens.  Depending on whether we've
| got text or something else we call print_text(), or interpret().
|
| A non-zero debugging value is OR-ed into the creator's flags, and a
| non-zero paragraph_test replaces the creator's own value.
|
| Returns:
|   - TRUE if the setup failed; the input is then skipped up to the
|     trailer so the caller can carry on from there
|   - the result of free_creator_block() when the creator's
|     enddocument marker is reached
|   - FALSE if the input ends before that marker is seen
+------------------------------------------------------------*/
int
main_loop(ps_fp, ascii_fp, debugging, paragraph_test)
    FILE		*ps_fp;			/* ps file */
    FILE		*ascii_fp;		/* ascii output file */
    unsigned long	debugging;		/* debugging flags */
    COORD		paragraph_test;		/* paragraph spacing value */
{
    CREATOR		*creator_data;		/* creator data block */
    char		string[MAX_BUF];	/* buffer */
    char		creator_name[MAX_BUF];	/* receives creator's name */
    char		*token;			/* the current token */

    COORD		vertical;		/* vertical coordinate */
    COORD		horizontal;		/* horizontal coordinate */
    int			token_type;		/* type of token */
    int			error = FALSE;		/* look for errors in setup */
    unsigned long	flags;			/* next_token flags */

    vertical = 0;				/* initialize for safety */
    horizontal = 0;				/* initialize for safety */

	/* get creator name, data, and skip prolog */
	/* if anything goes wrong, skip out and return to calling */
	/* procedure */

    if (!get_creator_name(ps_fp, creator_name))
	error = TRUE;

    else if (!get_creator_data(ps_fp, &creator_data, creator_name))
	error = TRUE;

    else if (!skip_prolog(ps_fp, creator_data))
	error = TRUE;

    if (error)					/* error, so skip to end of */
	{					/* this section */
	/* NOTE(review): if only skip_prolog() failed, creator_data was */
	/* obtained but is not released here - confirm whether */
	/* free_creator_block() should be called on this path */
	skip_to_trailer(ps_fp);
	return(TRUE);
	}

    if (debugging)
	{
	fprintf(stdout,"%s %s\n", CREATOR_C, creator_name);
	creator_data->flags |= debugging;
	}

    if (paragraph_test)
	creator_data->paragraph_test = paragraph_test;
    flags = creator_data->flags;

	/* loop on the result of fgets itself.  Testing feof() before the */
	/* read lets one failed read through, and the line left in string */
	/* from the previous pass would then be processed a second time */

    while (fgets(string, MAX_BUF, ps_fp) != NULL)
	{
	    /* look first to see if we have encountered an embedded PS file */
	    /* if we have, then we call main_loop recursively; its result */
	    /* is not examined */

	if (Strnequal(string, PS_C))
	    main_loop(ps_fp, ascii_fp, debugging, paragraph_test);

	    /* look to see if we've reached the end of the document */
	    /* free up creator block and return */

	else if (Strnequal(string, creator_data->enddocument))
	    return(free_creator_block(creator_data));

	    /* if we see a line beginning with '%', it's a comment line and */
	    /* can be ignored */

	else if (string[0] != COMMENT_CHAR)
	    {
	    if (!preprocess_line(ps_fp, string))	/* clean up line */
		Exit_error("I got confused.");

	    token = next_token(string, &token_type, flags); /* get 1st token */

	    while(token)			/* loop until no tokens left */
		{
		if (debugging & DEBUGGING_TOKENS)
		    {
		    fprintf(stdout, "token_type=%d, token='%s'\n", token_type, token);
		    fflush(stdout);
		    }
		switch(token_type)
		    {
		    case OPERATOR:		/* interpret the operator */
			interpret(ascii_fp, token,
				&vertical, &horizontal, creator_data);
			break;
		    case TEXT_TOKEN:		/* print the text */
			print_text(ascii_fp, token, creator_data);
			break;
		    default:			/* do nothing */
			 break;
		    }
		token = next_token(NULL, &token_type, flags); /* get next token */
		}
	    }
	}

	/* NOTE(review): creator_data is not released on this path either */

    return(FALSE);				/* file ended prematurely */
}


/*------------------------------------------------------------
| Main entry point.
|
| usage:  ps2text [-p<n>] [-d<n>] {ps-file} [ascii-file]
|
| -p<n>, if given, must come first; <n> is passed to main_loop() as
|	the paragraph spacing value.
| -d<n>, if given, comes next; it switches debugging output on
|	(the value passed on is (<n> | 1) << 16).
|
| the next argument is the input file.  It is required.  If it is
|	the only one and starts with "-h" or "?", the help text is
|	shown instead.
|
| the last argument is the output file.  It is optional; without it
|	output goes to stdout.
|
| Returns 0 after a normal run.  Failures are reported through
| Exit_error.
+------------------------------------------------------------*/
int
main(argc, argv)
    int			argc;			/* number of args */
    char		*argv[];		/* array of pointers to args */
{
    int			i = 1;			/* argv index */
    unsigned long	debugging = FALSE;	/* debugging flag */
    COORD		paragraph_test = 0;	/* paragraph spacing value */
    FILE		*ps_fp;			/* PS input file */
    FILE		*ascii_fp;		/* ascii output file */

	/* each option found is counted out of argc, so that argc ends up */
	/* as 1 + the number of file names that remain */

    if (argc > 1)
	{
	if (Strnequal(argv[i], "-p"))
	    {
	    paragraph_test = TOKEN_COORD(&argv[i][2]);
	    argc -= 1;
	    i++;
	    }

		/* -p may have been the last argument, in which case */
		/* there is no argv[i] left to look at */

	if (argc > 1 && Strnequal(argv[i], "-d"))
	    {
	    debugging = (TOKEN_COORD(&argv[i][2]) | 1)<<16;
	    argc -= 1;
	    i++;
	    }
	}

    switch(argc)				/* parse argument list */
	{
	case 0:					/* can't do anything without */
	case 1:					/* an input file */
	    issue_help();
	    Exit_error("ps2text needs one or two parameters: {ps-file} [ascii-file]");
	    break;

	case 2:					/* if no P2, use terminal */
	    if (Strnequal(argv[i], "-h") || (Strnequal(argv[i], "?")))
		{
		issue_help();			/* jump out with help */
		exit(TRUE);
		}
	    ascii_fp = stdout;
	    break;

	default:				/* use p2 as outfile */
	    if ((ascii_fp = fopen(argv[i+1], "w")) == NULL)
		Exit_error("can't open output file");
	    break;
	}

    if ((ps_fp = fopen(argv[i], "r")) == NULL)   /* open the PS file */
	Exit_error("can't open input file")

    else if (feof(ps_fp))			/* something's wrong */
	Exit_error("no input lines seen in PS file")

    else if (!verify_adobe_ps(ps_fp))		/* make sure it's Adobe PS */
	Exit_error("doesn't seem to be an Adobe compliant PS file")

    else if (!main_loop(ps_fp, ascii_fp, debugging, paragraph_test))	/* do the main loop */
	Exit_error("file terminated with errors.  Check output.");

    fclose(ascii_fp);
    fclose(ps_fp);
    return(0);				/* don't leave the status to chance */
}

