/*****************************************************************************
 * KEYWORD.C
 * WADE 5/23/89
 * This is a quick and dirty program to generate keywords from the titles file.
 * A file is generated with id numbers and keywords for import into Ingres.
  ****************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define FILEID "/mit/pips_dev/work/pips.web"
#define OUTPUT "/mit/pips_dev/work/keyword.dat"

static char    *
next_field(ptr)
	char          **ptr;
{
	register char  *cp = *ptr;

	while (*cp && *cp != ':')
		cp++;
	*cp = '\0';
	return *ptr = ++cp;
}


/*
 * main - read the titles file and write "id keyword" lines for import.
 *
 * Each line read from FILEID is split on ':' with next_field(): the first
 * field is used as the record id and the fifth field as the title.  The
 * title is handed to token() (defined elsewhere) to be broken into words,
 * and every word accepted by valid() is written to OUTPUT together with the
 * first five characters of the id.  The id, title and word count of every
 * record are echoed to stdout, followed by a grand total at the end.
 *
 * Returns 0 on success, or 1 if memory cannot be allocated, either file
 * cannot be opened, or the output file cannot be closed cleanly.
 */
int
main()
{
  enum { MAX_TOKENS = 20,     /* token slots handed to token()          */
         TOKEN_SIZE = 31,     /* bytes allocated for each token slot    */
         LINE_SIZE  = 280,    /* bytes allocated for the record buffer  */
         READ_SIZE  = 250 };  /* limit passed to fgets() for one record */

  int      ctr;
  FILE    *fileaddr;
  FILE    *output;
  char    *titles;
  char    *id;
  char    *title;
  char    *ptr;
  char    *tokens[MAX_TOKENS];
  int      token_count;
  int      total_tokens = 0;
  int      token();
  int      valid();

  /*
   * Allocate every buffer before touching the files, so that an allocation
   * failure cannot leave behind a truncated output file.  The buffers are
   * used for the whole run and are left for process exit to reclaim;
   * NOTE(review): token() is not visible here, so it is not known whether
   * it keeps the tokens[] pointers intact -- hence no free() of them.
   */
  for (ctr = 0; ctr < MAX_TOKENS; ctr++) {
    tokens[ctr] = (char *)malloc(TOKEN_SIZE);
    if (tokens[ctr] == NULL) {
      fprintf(stderr, "keyword: out of memory\n");
      return 1;
    }
  }
  titles = (char *)malloc(LINE_SIZE);
  if (titles == NULL) {
    fprintf(stderr, "keyword: out of memory\n");
    return 1;
  }

  fileaddr = fopen(FILEID, "r");
  if (fileaddr == NULL) {
    perror(FILEID);
    return 1;
  }
  output = fopen(OUTPUT, "w");
  if (output == NULL) {
    perror(OUTPUT);
    fclose(fileaddr);
    return 1;
  }

  /* execute loop for each line/record in the titles file */
  while (fgets(titles, READ_SIZE, fileaddr) != NULL) {

    /* obtain id (field 1) and title (field 5) from record */
    id  = titles;
    ptr = titles;
    (void) next_field(&ptr);          /* terminates the id field      */
    (void) next_field(&ptr);
    (void) next_field(&ptr);
    title = next_field(&ptr);         /* start of the fifth field     */
    (void) next_field(&ptr);          /* terminates the title field   */

    printf("id    =%s\n", id);
    printf("title =%s\n", title);

    /* break title into tokens - max. of 20 */
    token_count = token(title, tokens);
    /*
     * The value returned by token() is reduced by one before use, as in the
     * original code.  NOTE(review): token() is defined elsewhere;
     * presumably its result is one larger than the number of usable
     * entries -- confirm.
     */
    token_count--;
    printf("%d\n", token_count);

    for (ctr = 0; ctr < token_count; ctr++) {
      /* valid() also normalizes the accepted token in place */
      if (valid(tokens[ctr])) {
        fprintf(output, "%5.5s %s\n", id, tokens[ctr]);
        total_tokens++;
      }
    }
  }

  printf("%d tokens created.\n", total_tokens);
  fclose(fileaddr);
  /* fclose flushes buffered output; a failure here means lost keywords */
  if (fclose(output) != 0) {
    perror(OUTPUT);
    return 1;
  }
  return 0;
}
/*
 * valid - decide whether a title word is worth keeping as a keyword.
 *
 * token	NUL-terminated word; rewritten in place when it is accepted.
 *
 * A word is rejected (return 0, word left untouched) when it is shorter
 * than three characters, when its first character sorts below 'A', or when
 * it exactly matches one of the listed stop words.  The stop-word match is
 * case-sensitive and is made before any case folding.
 *
 * An accepted word (return 1) has its letters 'A'..'Z' folded to lower case
 * and every '.', ':' and ',' replaced by a blank.
 */
int
valid(token)
     char *token;
{
  static char *stop_words[] = {
    "for", "For", "and", "the", "The", "etc.", "out", "res.", NULL
  };
  char **stop;
  char  *cp;

  if (strlen(token) < 3 || token[0] < 'A')
    return 0;

  for (stop = stop_words; *stop != NULL; stop++) {
    if (strcmp(token, *stop) == 0)
      return 0;
  }

  /* Normalize in place: fold upper case, blank out punctuation. */
  for (cp = token; *cp != '\0'; cp++) {
    if (*cp >= 'A' && *cp <= 'Z')
      *cp += 'a' - 'A';

    switch (*cp) {
    case '.':
    case ':':
    case ',':
      *cp = ' ';
      break;
    default:
      break;
    }
  }

  return 1;
}

