/*-
 *	Copyright 1987 Jeff Sparkes
 *	Department of Computer Science
 *	Memorial University of Newfoundland
 *	St. John's, Nfld.
 *	garfield!jeff1,	jeff1@garfield.mun.cdn
 *
 *	Permission is granted to distribute and/or modify this code, provided
 *	this copyright notice remains intact.
 *	If you use it, let me know.  If change it let me know.  If you
 *	make money from it, send me a share.
 */

#include <stdio.h>
#include <ctype.h>
#include "token.h"
#include "table.h"

/*
 * Terminator codes.  tword() returns one of these (or 0) to say how the
 * word ended, and output_special() / output_pending_punct() turn them
 * into the matching TeX punctuation.
 */
#define	SENTENCE	12
#define WORD		13
#define PARAGRAPH	14
#define	SENTENCE2	15
#define	SENTENCE3	16
#define	SENTENCE4	17

/*
 * Getc(x) - fetch the next character of current_word into x via nextchar().
 *
 * Only usable inside tword(): it expands to several statements, returns
 * WORD from the enclosing function when nextchar() yields 0 (end of word)
 * and jumps to the label `top' when nextchar() yields its "special
 * consumed" sentinel.  It is written so that the existing call form
 * `c = Getc(c);' keeps working.
 *
 * nextchar() returns a char, so the sentinel is compared as (char)-1:
 * comparing against a plain -1 can never succeed on platforms where
 * plain char is unsigned.
 *
 * Ungetc(c) - step the read position back by one character.  The argument
 * is ignored; it is kept only so call sites read naturally.
 */
#define	Getc(x)		(x) = nextchar(); if ((x) == 0) return WORD; else if ((x) == (char)-1) goto top;
#define Ungetc(c)	(current_char--)

/*-
 * Parser state shared by every routine in this file.
 *
 * sws: these used to be static; the old declarations were:
static int      current_char, current;
static char     current_word[100];
static int	blank_flag = 0, blank_next = 0;
static int	slashes, slashes_next;
*/
/*
 * current_char: index in current_word of the next character nextchar()
 *               will look at.
 * current:      index in token[] of the token being built; tparse() sets
 *               it to -1 before each word.
 */
int current_char, current;
/* The word being parsed; filled by tparse(), read by nextchar(). */
char current_word[100];
/*
 * blank_next is raised when a '#' asks for a plain blank instead of the
 * usual word separator; output() moves it into blank_flag, which
 * output_pending_punct() consults and clears.
 */
int blank_flag = 0, blank_next = 0;
/*
 * slashes_next counts extra '/' seen for the upcoming sentence end;
 * output() moves it into slashes, which output_pending_punct() uses as a
 * repeat count.
 */
int slashes, slashes_next;

/* Defined further down; declared here because tword() calls it first. */
char nextchar();
/* Set to 1 by output(); output_special() does nothing while it is 0. */
int word_out = 0;
/* Terminator code waiting to be printed by output_pending_punct(), or 0. */
int pending_punct = 0;
/* pos is zeroed by tparse(); neither variable is otherwise used in this file. */
int word_count, pos = 0;

/* Tokens of the word currently being parsed. */
/*struct token token[10];*/
struct token token[20];
/* Tables defined elsewhere; only special[] is referenced directly here. */
extern struct tab base_table[];
extern struct fix superfixes[], subfixes[];
extern struct special special[];

/*
 * isterm - tell whether c is one of the terminator codes tword() can
 * hand back (WORD, PARAGRAPH, SENTENCE, SENTENCE2, SENTENCE3, SENTENCE4).
 *
 * Returns 1 for a terminator code, 0 for anything else.
 */
int
isterm(c)
     char c;
{
  if (c == WORD || c == PARAGRAPH)
    return (1);
  if (c == SENTENCE || c == SENTENCE2 || c == SENTENCE3 || c == SENTENCE4)
    return (1);
  return (0);
}

tparse()
{
  char c, c1;
  int tok, i, j;

  /* sws  initialize */
  slashes = 0;
  slashes_next = 0;
  blank_flag = 0;
  blank_next = 0;
  pos = 0;
  for (i = 0; i < 100; i++) {
    current_word[i] = ' ';
  }

  for (i = 0; i < 10; i++) {
    for (j = 0; j < 10; j++)
      token[i].str[j] = 0;
    /* sws trouble here */
    /* token[i].char_num = 0; */
    token[i].char_num = -1;
    token[i].vowel = V_NONE;
    token[i].special = 0;
  }
  /* sws changed format for readability */
  /*- sws  some definitions moved to tibdef.tex */
  printf("\\bgroup\\tibetan\n");

  while (1) {
    /*-
     * If we get a %, check for another immediately following.
     * This is the end of tibetan mode.  Otherwise, it is just a
     * comment, but I figure it should be left in, just in case
     * some one needs to look at the filter output.
     */

    /* initialize */
    for (i = 0; i < 100; i++)
      current_word[i] = 0;

    /* now get stuff */
    /*- sws  rewritten to allow more general checking - 
             including TeX commands inside 
             now tex stuff moved to getchar
	     */
    while(1) {
      c = getc(input);
      switch(c) {

      case ' ':
	/* ignore blanks here (before a word) */
	continue;

      case '#':
	blank_next = 1;
	continue;

      case '\n':
	c1 = getc(input);
	if (c1 == '\n') {
	  output_special(PARAGRAPH);
	  continue;
	} 
	else
	  ungetc(c1, input);
	continue;

      case '%':
	c1 = getc(input);
	if (c1 == '%') {
	  /* end of tibetan mode */
	  output_pending_punct();
	  printf("\\egroup ");
	  return;
	}
	else {
	  /* a comment within tibetan mode, pass it through */
	  ungetc(c1, input);
	  printf("%% ");
	  while ((c = getc(input)) != '\n')
	    putchar(c);
	  putchar('\n');
	}
	continue;

      case EOF:
	printf("[tparse] Missing closing %%%%\n");
	exit(1);
	continue;

      default:
	/*- ok, we found some stuff that's a word candidate, go to 
	   the next phase */
	ungetc(c, input);
	break;

      }
      /* we get here only from the default */
      break;
    }

    /* we get here from the above default case */
    /* now read a word */
    if (fscanf(input, "%s", current_word) == EOF) {
      printf("Missing %%%%\n");
      exit(1);
    }

    current_char = 0;
    current = -1;
    tok = 0;
    tok = tword();
    output(current);
    if (tok != 0)
      output_special(tok);
  }
}

/*-
  Parse the rest of current_word (from current_char onward) into the
  token[] array, so that later tokens can still modify earlier ones
  before anything is printed.

  tword() takes no arguments; its state lives in the file-scope
  variables current (index of the token being filled), current_char
  (read position) and token[].  Characters are fetched with the Getc
  macro, which makes tword() return WORD when nextchar() yields 0 and
  restarts at the label `top' when nextchar() yields -1.

  Every completed glyph ends in `return (tword())', so the value finally
  returned is the one produced by the innermost call: WORD, SENTENCE,
  SENTENCE2, SENTENCE3, SENTENCE4, or 0 (after a '%' or after
  bad_word()).
*/
tword()
{
  char c, c1, s[10];
  int ind = 0;			/* write position in token[current].str */
  int i, mtch;

  /* sws: value of current before the slash run is scanned */
  int current_save;

top:
  ind = 0;
  c = Getc(c);
  while (1) {
    /*-
     * Check for end of word delimiters.  If it's also end of
     * sentence, then do the appropriate thing.
     */
    switch (c) {

    case '#':
      /* don't output the word separator */
      blank_next = 1;
      return (WORD);

    case '/':
      /*
       * NOTE(review): nextchar() turns a run of '/' into a SPECIAL
       * token and returns -1, so this case looks unreachable through
       * Getc -- confirm before relying on it.
       */

/*#define sldb*/

      /* sws to keep track */
      current_save = current;
#ifdef sldb      
      printf("%%\n%%{\\rm found slash, current = %d}\n", current); 
#endif
      /* sws watch out for nextchar picking up specials */
      slashes_next = 0;
      c1 = nextchar();
      while (c1 == '/') {
	slashes_next++;
	c1 = nextchar();
      }
#ifdef sldb      
	printf("%%\n%%{\\rm found %d slashes, current = %d}\n", 
	       slashes_next,current);
#endif

      /*-
       * If the slashes are at the end of a word,
       * keep the count in slashes_next, and
       * return an end of sentence.
       */

      /* sws old version, loses if we picked up a special */
      /*
			if (isspace(c1) || c1 == 0 || c1 == -1) {
			  return (SENTENCE);
			}
			*/
      /* look for space, formfeed, nl, tab, vtab */
      if (isspace(c1) || c1 == 0 || c1 == -1) {
#ifdef sldb      
	printf("%%\n%%{\\rm found space, current = %d,", current);
	printf(" current_save = %d}\n", current_save);
#endif
	/* current moved, so nextchar() queued a special meanwhile */
	if (current != current_save) {
	  /* sws slashes came first */
	  output_pending_punct();
	  for (i = 0; i <= slashes_next; i++)
	    printf("\\tibsp\\char115\\tibetan");
	  slashes_next = 0;
	  return (WORD);
	} else {
	  /* really end of sentence */
	  return (SENTENCE);
	}
      }

      /* end of sentence, no space */
      if (c1 == '*') {
#ifdef sldb      
	printf("%%\n%%{\\rm found [*], current = %d}\n", current);
#endif
	/* must be end of sentence */
	  return (SENTENCE4);
      }

      /*-
       * Otherwise, the slashes are at the beginning
       * so output them now.
       */

      /* sws  debug*/
      /* printf("\n{\\rm current = %d}\n", current);
			  if ( current_char >= 0)
			    printf("{\\rm current char = %c}\n",
				   current_word[current_char]);
				   */

      /* sws old version */
      output_pending_punct();
      for (i = 0; i <= slashes_next; i++)
	printf("\\tibsp\\char115\\tibetan");

      /*sws alternate... */
      /*-
			output(current);
			for (i=0; i<=slashes; i++)
				printf("\\tibsp\\char115\\tibetan");
			slashes = 0;
			*/

      /* carry on with the character that followed the slashes */
      c = c1;
      slashes_next = 0;
      continue;
    case '!':
      /* flush the tokens collected so far, then report the terminator */
      output(current);
      return (SENTENCE2);
    case '|':
      output(current);
      return (SENTENCE3);
    case '*':
      /*output(current);*/
      return (SENTENCE4);
    case '%':
      /* leave the '%' in place and report "no terminator" */
      Ungetc(c);
      return (0);
    case '\n':
      return (WORD);
    default:
      break;
    }

    if (isspace(c)) {
      return (WORD);
    } current++;		/* c starts a new token */
    /*-
      If we've found a superfix, parse the next token. If it is
      a token that the superfix can modify, then use the
      modified char_num, otherwise the superfix is merely a
      vowel-less base character
     */
    if (super(c)) {
      int cur, tok;

      /* Peek at the raw next character, bypassing nextchar(). */
      if ((c1 = current_word[current_char++]) == 0) {
	Ungetc(c1);
	goto not_super;
      }
      /* A following subfix, the pair "sh", or a vowel: c is not a superfix. */
      if (sub(c1) || (c == 's' && c1 == 'h') ||
	  (vowel(c1) != V_NONE)) {
	Ungetc(c1);
	goto not_super;
      }
      if (!isalpha(c1)) {
	Ungetc(c1);
	goto not_super;
      }
      Ungetc(c1);
      sprintf(s, "%c", c);
      cur = current;		/* slot that belongs to the superfix */
      /* fix so that next_char doesn't overwrite with special */
      token[current].char_num = 0;


      /* sws  attempt to fix this... 
	 note ends get missed here as tok gets forgotten
	 just why is this here?
       */
      /* tword();*/
      /* Parse what follows; the result is stored in tok but not used. */
      tok = tword();
/*      printf("%%\n%% found tok = %c [%d]\n", tok,tok);*/
#if 0
/* looses super or sub base still ...*/
      /* stop if we found an end of sentence etc */
      if (isterm(tok))
	return(tok);
#endif

/* apparently fixed later ? */
#if 0
      if (tok != 0) {
	/* looses super base */
	/*return(tok);*/
	output_special(tok);
      }
#endif

/* sws... need to fix this sometime... */
/* requires a space after a / */
      /*-
       * For some reason, // at the end get bypassed
       */
      /* backup till last char */
      /* sws also include brackets */
      /*while (!isalpha(current_word[current_char])) {
	current_char--;
      }
      current_char++;
      */
#if 0
      while ((NULL == current_word[current_char]) ||
	     ('/' == current_word[current_char]) ||
	     (!isalpha(current_word[current_char])) &&  
	     (!current_word[current_char] == '{' ) &&
	     (!current_word[current_char] == '}' ) )
	     {
	current_char--;
      }
#endif
      /* NOTE(review): left over from the disabled back-up loops above;
	 confirm this advance is still wanted. */
      current_char++;
      /* Try to combine the superfix with the token parsed after it. */
      if ((token[cur].char_num =
	      match(SUPER, s, token[cur + 1].char_num))
	  != -1) {
	int j;

	/* Merge: the following token is folded into slot cur and freed. */
	sprintf(token[cur].str, "%c%s", c, token[cur + 1].str);
	token[cur + 1].str[0] = 0;
	token[cur].vowel = token[cur + 1].vowel;
	token[cur + 1].char_num = -1;
	token[cur + 1].special = 0;
	token[cur + 1].vowel = V_NONE;
	return (tword());
      } else {
	/* No combination: c stands alone as a base character. */
	token[cur].char_num = match(BASE, s, -1);
	strcat(token[cur].str, s);
	token[cur].vowel = V_NONE;
	return (tword());
      }
    }
not_super:
    mtch = 0;			/* number of base matches found so far */
    while (1) {
      /*-
       * Match the g.y case.
       */
      if (c == '.') {
	token[current].str[ind] = 0;
	token[current].vowel = V_A;
	return (tword());
      }
      /*-
       * Check for a subfix..
       */
      if (sub(c)) {
	int t, t1;

	sprintf(s, "%c", c);
	t = match(SUB, s, token[current].char_num);
	if (t != ERROR) {
	  c1 = Getc(c1);
	  if ((t1 = vowel(c1)) == V_NONE) {
	    /* no vowel follows: push both characters back */
	    Ungetc(c1);
	    Ungetc(c);
	    token[current].vowel = V_NONE;
	    return (tword());
	  } else {
	    /* subfix plus vowel: adopt the combined glyph */
	    token[current].vowel = t1;
	    token[current].char_num = t;
	    strcat(token[current].str, s);
	    return (tword());
	  }
	}
      }
      token[current].vowel = vowel(c);
      /*-
       * We've matched the a+ glyph.
       */
      if (mtch == 0 && token[current].vowel != V_NONE) {
	if (ind == 0) {
	  /*-
	   * We've matched a single vowel
	   * glyph.
	   */
	  return (tword());
	}
	token[current].str[ind++] = c;
	token[current].str[ind] = 0;
	/* leaves the inner loop; the outer loop re-examines c */
	break;
	/*-
	 * We've hit a vowel, which is the end of the
	 * glyph.
	 */
      } else if (token[current].vowel != V_NONE) {
	token[current].str[ind] = 0;
	return (tword());
	/*-
	 * Check to see if what we have so far + the
	 * next is a glyph.  If not, then this
	 * character is the beginning of the next
	 * one.
	 */
      } else {
	int n;

	token[current].str[ind++] = c;
	token[current].str[ind] = 0;
	if ((n = match(BASE, token[current].str, -1)) != ERROR) {
	  token[current].char_num = n;
	  mtch++;
	} else if (mtch != 0) {
	  /* the longer string is no glyph: drop c and reparse it */
	  token[current].str[--ind] = 0;
	  Ungetc(c);
	  return (tword());
	} else {
	  /* not even the first character matched */
	  bad_word();
	  return (0);
	}
      }
      c = Getc(c);
    }
  }
}

/* sws
The actual output routine.
1. dumps any pending punctuation
2. reset some parameters
3. print (count) tokens

*/
output(count)
  int count;
{
  int i, shift, cn, ch;
  char fs[20];

  /*-
   * Indicate that output has occurred.
   */
  output_pending_punct();
  blank_flag = blank_next;
  blank_next = 0;
  slashes = slashes_next;
  slashes_next = 0;
  word_out = 1;
  for (i = 0; i <= count; i++) {
    /*-
     * Check for a single vowel glyph.  The output is different
     * for a single vowel since it has nothing to modify.
     */
    shift = 0;
    if (token[i].special == SPECIAL) {
      printf("%s%%\n", token[i].str);
      continue;
    }
    if (token[i].char_num > 127) {
      cn = token[i].char_num - 128;
      strcpy(fs, "\\tibsp");
    } else {
      cn = token[i].char_num;
      strcpy(fs, "\\tibetan");
    }
    if (token[i].str[0] == 0)
      if (token[i].vowel != V_NONE)
	shift = 1;
      else
	continue;
    /* save the character so that we can find the last one */
    ch = token[i].char_num;
    switch (token[i].vowel) {
    case V_A:
    case V_NONE:
      if (shift)
	printf("\\char29");
      else
	printf("\\char%d", cn);
      break;
    case V_E:
      if (shift)
	printf("\\tibsp\\accent127\\tibetan\\char29");
      else
	printf("\\tibsp\\accent127%s\\char%d", fs, cn);
      break;
    case V_I:
      if (shift)
	printf("\\tibsp\\accent126\\tibetan\\char29");
      else
	printf("\\tibsp\\accent126%s\\char%d", fs, cn);
      break;
    case V_O:
      if (shift)
	printf("\\tibsp\\accent125\\tibetan\\char29");
      else
	printf("\\tibsp\\accent125%s\\char%d", fs, cn);
      break;
    case V_U:
      if (shift)
	printf("\\u{\\char29}");
      else
	printf("\\u{\\char%d}", cn);
      break;
    default:
      break;
    }
  }
  for (i = 0; i < 10; i++) {
    int j;

    for (j = 0; j < 10; j++)
      token[i].str[j] = 0;
    token[i].char_num = -1;
    token[i].vowel = V_NONE;
    token[i].special = 0;
  }
}

/*
 * output_special - record the terminator code c for later printing.
 *
 * Nothing happens until output() has written at least one word
 * (word_out).  After that:
 *   - a sentence code (SENTENCE, SENTENCE2, SENTENCE3, SENTENCE4)
 *     becomes the pending punctuation unless another sentence code or a
 *     paragraph is already pending, i.e. it may replace "nothing" or WORD;
 *   - WORD becomes pending only when nothing is pending;
 *   - PARAGRAPH flushes the pending punctuation at once and prints a
 *     blank line.
 * Any other value is ignored.
 */
output_special(c)
     int c;
{
  if (word_out) {
    if (c == PARAGRAPH) {
      output_pending_punct();
      printf("\n\n");
    } else if (c == WORD) {
      if (pending_punct == 0)
	pending_punct = WORD;
    } else if (c == SENTENCE || c == SENTENCE2 ||
	       c == SENTENCE3 || c == SENTENCE4) {
      if (pending_punct == 0 || pending_punct == WORD)
	pending_punct = c;
    }
  }
}

/*
 * output_pending_punct - print whatever terminator is waiting in
 * pending_punct, then clear it.
 *
 *   SENTENCE   `slashes' extra shad glyphs, then a shad and \tspace
 *   SENTENCE2  \char121 and \tspace
 *   SENTENCE3  \tspace only
 *   SENTENCE4  like SENTENCE but with slashes + 1 shads and no space
 *   WORD       a plain blank when blank_flag is set (the flag is then
 *              cleared), otherwise the \char114 word separator
 * Any other value prints nothing.
 */
output_pending_punct()
{
  int n;
  int code = pending_punct;

  if (code == SENTENCE) {
    /* this controls the amount of space at the end of sentences */
    for (n = slashes; n > 0; n--)
      fputs("\\tibsp\\char115\\tibetan", stdout);
    fputs("\\filler\\tibsp\\char115\\tspace\\tibetan\n", stdout);
  } else if (code == SENTENCE2) {
    fputs("\\filler\\tibsp\\char121\\tspace\\tibetan\n", stdout);
  } else if (code == SENTENCE3) {
    fputs("\\filler\\tspace\\tibetan\n", stdout);
  } else if (code == SENTENCE4) {
    /* same as sentence, but no space at end */
    fputs("\\filler", stdout);
    for (n = 0; n <= slashes; n++)
      fputs("\\tibsp\\char115\\tibetan", stdout);
  } else if (code == WORD) {
    if (blank_flag) {
      fputs("\\filler\\tenrm\\ \\tibetan\n", stdout);
      blank_flag = 0;
    } else {
      fputs("\\filler", stdout);
      fputs("\\twspace", stdout);
      fputs("\\tibsp\\char114", stdout);
      fputs("\\twspace", stdout);
      fputs("\\tibetan\n", stdout);
    }
  }
  pending_punct = 0;
}


/*
 * isbracket - tell whether c is a TeX grouping brace.
 *
 * Returns 1 for '{' or '}', 0 for any other character.
 */
int isbracket(c)
char c;
{
  switch (c) {
  case '{':
  case '}':
    return (1);
  default:
    return (0);
  }
}

/*
 * istexterm - tell whether c ends a TeX control sequence that is being
 * copied out of a word.
 *
 * The terminators are blank, ',', '/', '{', '}', '\', '#' and the NUL at
 * the end of the word.  Returns 1 for a terminator, 0 otherwise.
 *
 * The end-of-string test uses '\0': NULL is a pointer constant and must
 * not be compared with a char.
 */
int istexterm(c)
char c;
{
  switch (c) {
  case ' ':
  case ',':
  case '/':
  case '{':
  case '}':
  case '\\':
  case '#':
  case '\0':
    return (1);
  default:
    return (0);
  }
}

/*
 * claim_special_token - make `current' index the token[] slot that a
 * special is about to fill: slot 0 when no token exists yet, the current
 * slot when it is still unused (char_num == -1), otherwise the next one.
 */
static void
claim_special_token()
{
  if (current == -1)
    current = 0;
  else if (token[current].char_num != -1)
    current++;
}

/*
 * nextchar - hand tword() the next ordinary character of current_word.
 *
 * Specials are caught here so that tword() never sees them.  Each one is
 * stored as a SPECIAL token, current_char and current are moved past it,
 * and -1 is returned:
 *   - a run of '/' (shads); a '*' right after the run is swallowed and
 *     suppresses the trailing \tspace;
 *   - a TeX brace '{' or '}';
 *   - a TeX control sequence starting with '\' and running up to the
 *     next istexterm() character;
 *   - any entry of the special[] table that matches at current_char.
 *
 * Returns 0 at the end of the word (current_char is left unchanged),
 * -1 after a special, otherwise the character at current_char, which is
 * then advanced by one.
 *
 * NOTE(review): nothing here checks `current' against the size of
 * token[], nor the built strings against the size of token[].str;
 * confirm both are large enough for the input expected.
 */
char
nextchar()
{
  int i, j, len;
  int no_space;
  char ch, after;

  ch = current_word[current_char];

  /* look for end of string */
  if (ch == '\0')
    return (ch);

  /* look for /'s */
  if (ch == '/') {
    /* count them */
    i = 1;
    while (current_word[current_char + i] == '/')
      i++;
    /* i is now the number of slashes */

    /* what follows the run decides the spacing */
    after = current_word[current_char + i];
    no_space = 0;
    if (after == '\0' || isspace((unsigned char) after)) {
      blank_next = 1;
    } else if (after == '*') {
      /* no space after last shad */
      no_space = 1;
      blank_next = 0;
    }

    claim_special_token();
    token[current].char_num = 0;
    token[current].special = SPECIAL;

    /* see if we need a space to start - slashes after something */
    if (current_char > 0)
      strcpy(token[current].str, "\\filler\\tibsp");
    else
      strcpy(token[current].str, "\\tibsp");
    /* output the shads */
    for (j = 0; j < i; j++)
      strcat(token[current].str, "\\char115");
    /* see if we need a space at the end - end of sentence */
    if (blank_next)
      strcat(token[current].str, "\\tspace\n");
    /* leave ready for tibetan characters */
    strcat(token[current].str, "\\tibetan");

    token[current].vowel = V_NONE;

    current_char += i;
    if (no_space) {
      current_char++;		/* swallow the '*' */
      blank_next = 1;
    }
    current++;
    return (-1);
  }

  /* look for TeX brackets */
  if (isbracket(ch)) {
    claim_special_token();
    token[current].char_num = 0;
    token[current].special = SPECIAL;
    token[current].str[0] = ch;
    token[current].str[1] = '\0';
    token[current].vowel = V_NONE;

    current_char += 1;
    current++;
    return (-1);
  }

  /* a TeX command within tibetan mode, pass it through */
  if (ch == '\\') {
    i = 1;
    while (!istexterm(current_word[current_char + i]))
      i++;
    /* i is now the length, not including the terminating char */

    claim_special_token();
    token[current].char_num = 0;
    token[current].special = SPECIAL;
    for (j = 0; j < i; j++)
      token[current].str[j] = current_word[current_char + j];
    /* terminate directly after the copied text (was str[i+1]) */
    token[current].str[i] = '\0';
    token[current].vowel = V_NONE;

    current_char += i;
    current++;
    return (-1);
  }

  /* look for normal special char */
  for (i = 0; special[i].word != NULL; i++) {
    len = strlen(special[i].word);
    /* an empty table entry would match everywhere without advancing */
    if (len == 0)
      continue;
    if (strncmp(special[i].word, current_word + current_char, len) != 0)
      continue;

    current_char += len;
    claim_special_token();
    token[current].char_num = special[i].char_num;
    token[current].special = SPECIAL;
    strcpy(token[current].str, special[i].cmd);
    /* the vowel is taken from the last character of the matched word */
    token[current].vowel = vowel(special[i].word[len - 1]);
    current++;
    return (-1);
  }

  return (current_word[current_char++]);
}

/*
 * bad_word - report on stderr that the word in current_word contains a
 * character the parser could not place.  Only prints; the caller decides
 * how to carry on.
 */
bad_word()
{
  fprintf(stderr,
	  "bad character in word <%s>\n",
	  current_word);
}


/* end of tparse.c */
