/* WIDE AREA INFORMATION SERVER SOFTWARE:
   No guarantees or restrictions.  See the readme file for the full standard
   disclaimer.

   Brewster@think.com
*/

/* The memory hashtables for building an index. */
/* -brewster 5/90 */

/* main functions:
 *   add_word
 *   finished_add_word
 *   look_up_word
 *
 * The idea is to store up a bunch of words before going to disk.
 * A word entry points to where it will go on disk, and
 * accumulates the entries before doing it.
 *
 * Some of the policy issues in this file are:
 *   How much weight should the first occurrence of a word in a document get
 *   over the other occurrences.  The first occurrence should be worth more
 *   so that a document with 3 occurrences of "dog" and no "cat"s does not
 *   win out over a document with 1 "dog" and 1 "cat" if the question is
 *   "Tell me about cats torture dogs".
 *   The extra weight is 5 at this point.
 *
 */

/* To Do:
 *  Improve the hashing functions.
 *  done: stop inserting into hash table after max number have been accumulated
 *  done: make flush not flush buffers that are too big.
 */
 
#include <ctype.h>
#include <string.h> 	/* for strlen(), memset() */

#include "panic.h"
#include "cutil.h"
#include "irfiles.h"
#include "irhash.h"
#include "stoplist.h"
#include "irinv.h"

#ifdef UNIX
#define PRINT_AS_INDEXING true /* also defined in irtfiles.c and irfiles.c */
#else 
#define PRINT_AS_INDEXING false
#endif


/*===========================*
 *===  Hashing Functions  ===*
 *===========================*/

/* #define FAST_HASH */

#ifdef FAST_HASH

/* courtesy ses@ccgr.technion.ac.il, but it turns out in
   informal timings that it increases the index time.  sigh. */

/* Per-position multipliers for the FAST_HASH variant of hash_word():
 * a 16-value pattern repeated eight times (128 entries).  Character
 * number l of the word is multiplied by coeff[l]. */
static char coeff[] = {
		61,59,53,47,43,41,37,31,29,23,17,13,11,7,3,1,
		61,59,53,47,43,41,37,31,29,23,17,13,11,7,3,1,
		61,59,53,47,43,41,37,31,29,23,17,13,11,7,3,1,
		61,59,53,47,43,41,37,31,29,23,17,13,11,7,3,1,
		61,59,53,47,43,41,37,31,29,23,17,13,11,7,3,1,
		61,59,53,47,43,41,37,31,29,23,17,13,11,7,3,1,
		61,59,53,47,43,41,37,31,29,23,17,13,11,7,3,1,
		61,59,53,47,43,41,37,31,29,23,17,13,11,7,3,1};

/* FAST_HASH variant of hash_word.
 *
 * Sums, over at most sizeof(coeff) leading characters of wd, the character
 * value multiplied by the coefficient for its position, and returns that
 * sum modulo below_n.
 *
 * Characters are read as unsigned char so that bytes with the high bit set
 * cannot drive the sum (and therefore the returned value) negative on
 * platforms where plain char is signed.
 *
 * wd must be NUL-terminated; below_n must be non-zero (it is a divisor).
 */
long hash_word(wd,below_n)
char *wd;
long below_n;
{
  register unsigned char *foo = (unsigned char *)wd;
  register long hash = 0;
  register size_t l;

  /* stop at the end of the word or when the coefficients run out */
  for(l = 0; l < sizeof(coeff) && *foo; l++)
    hash = hash + ((long)*(foo++) * coeff[l]);

  return (hash % below_n);
}
#endif /* def FAST_HASH*/

#ifndef FAST_HASH

/* these stink -brewster */

/* 256-entry scrambling table read through the random_char_code() macro,
 * which indexes it with (offset + character code) masked to 0..255.
 * NOTE(review): the entries look like a shuffle of 0..255 -- not verified. */
static long random_array_3[256] = 
{142L, 176L, 108L, 210L, 109L, 223L, 214L, 251L, 
   102L, 86L, 91L, 9L, 247L, 139L, 115L, 71L, 
   63L, 35L, 126L, 77L, 209L, 175L, 120L, 28L, 
   44L, 198L, 21L, 125L, 245L, 250L, 10L, 119L, 
   127L, 60L, 81L, 226L, 216L, 182L, 172L, 72L, 
   151L, 178L, 116L, 224L, 244L, 41L, 212L, 73L, 
   190L, 248L, 173L, 18L, 82L, 27L, 97L, 26L, 
   79L, 169L, 74L, 170L, 83L, 189L, 101L, 141L, 
   230L, 55L, 135L, 220L, 187L, 201L, 95L, 39L, 
   186L, 131L, 105L, 36L, 255L, 203L, 155L, 84L, 
   160L, 75L, 254L, 235L, 51L, 243L, 158L, 14L, 
   148L, 167L, 149L, 96L, 68L, 161L, 45L, 233L, 
   11L, 19L, 3L, 38L, 195L, 48L, 144L, 15L, 
   171L, 94L, 180L, 29L, 252L, 181L, 80L, 4L, 
   20L, 213L, 23L, 143L, 7L, 236L, 76L, 110L, 
   22L, 58L, 17L, 253L, 66L, 246L, 40L, 112L, 
   179L, 130L, 87L, 124L, 240L, 193L, 107L, 165L, 
   202L, 31L, 106L, 43L, 93L, 99L, 147L, 199L, 
   129L, 197L, 32L, 229L, 150L, 46L, 157L, 128L, 
   136L, 153L, 121L, 113L, 237L, 194L, 218L, 104L, 
   78L, 184L, 62L, 159L, 227L, 222L, 47L, 53L, 
   1L, 24L, 118L, 177L, 49L, 185L, 98L, 90L, 
   34L, 192L, 200L, 221L, 232L, 146L, 114L, 137L, 
   67L, 225L, 154L, 241L, 50L, 56L, 145L, 5L, 
   188L, 207L, 231L, 228L, 6L, 183L, 219L, 217L, 
   156L, 30L, 174L, 205L, 103L, 37L, 133L, 152L, 
   117L, 196L, 164L, 249L, 239L, 64L, 242L, 59L, 
   168L, 2L, 162L, 13L, 92L, 85L, 70L, 0L, 
   52L, 65L, 166L, 163L, 215L, 69L, 140L, 25L, 
   33L, 100L, 42L, 54L, 88L, 206L, 122L, 57L, 
   16L, 208L, 134L, 132L, 138L, 89L, 8L, 234L, 
   12L, 238L, 111L, 204L, 61L, 211L, 191L, 123L};


/* 256-entry table of per-position constants used by hash_word(): the entry
 * at (character position % 256) is added to the scrambled character code
 * before it is XORed into the running hash.
 * NOTE(review): the entries all appear to be below 1024 -- not verified. */
static long random_array_2[256] = 
{818L, 789L, 854L, 862L, 704L, 1019L, 390L, 887L, 
   93L, 204L, 269L, 59L, 743L, 219L, 191L, 769L, 
   911L, 435L, 805L, 448L, 142L, 1000L, 149L, 264L, 
   639L, 504L, 699L, 934L, 266L, 661L, 318L, 211L, 
   117L, 549L, 90L, 536L, 378L, 944L, 400L, 599L, 
   592L, 883L, 985L, 606L, 759L, 456L, 581L, 119L, 
   106L, 310L, 412L, 931L, 233L, 561L, 973L, 870L, 
   377L, 349L, 334L, 354L, 249L, 585L, 799L, 899L, 
   545L, 553L, 848L, 625L, 438L, 890L, 791L, 1014L, 
   337L, 374L, 489L, 146L, 123L, 907L, 977L, 22L, 
   396L, 241L, 198L, 424L, 136L, 715L, 867L, 684L, 
   560L, 244L, 293L, 1017L, 397L, 778L, 725L, 78L, 
   184L, 656L, 389L, 635L, 982L, 158L, 203L, 878L, 
   323L, 394L, 73L, 18L, 837L, 996L, 58L, 62L, 
   161L, 451L, 534L, 746L, 485L, 222L, 25L, 666L, 
   28L, 21L, 420L, 147L, 522L, 74L, 474L, 362L, 
   253L, 172L, 195L, 622L, 559L, 790L, 288L, 455L, 
   263L, 538L, 355L, 417L, 810L, 576L, 685L, 797L, 
   641L, 315L, 347L, 786L, 487L, 966L, 579L, 181L, 
   499L, 429L, 688L, 140L, 278L, 719L, 186L, 872L, 
   997L, 319L, 173L, 882L, 1008L, 573L, 431L, 830L, 
   774L, 654L, 235L, 121L, 925L, 529L, 593L, 92L, 
   954L, 434L, 213L, 79L, 284L, 510L, 763L, 655L, 
   300L, 447L, 4L, 461L, 506L, 88L, 99L, 459L, 
   220L, 780L, 523L, 178L, 303L, 578L, 287L, 827L, 
   419L, 521L, 114L, 703L, 664L, 892L, 304L, 876L, 
   352L, 331L, 35L, 896L, 341L, 450L, 812L, 350L, 
   316L, 705L, 815L, 935L, 15L, 572L, 503L, 467L, 
   306L, 976L, 118L, 760L, 807L, 809L, 339L, 442L, 
   758L, 546L, 327L, 527L, 537L, 383L, 82L, 531L, 
   728L, 428L, 768L, 675L, 814L, 919L, 133L, 682L, 
   906L, 163L, 716L, 692L, 174L, 464L, 708L, 922L};


/*  
static long random_char_code _AP((long ch,long offset));
static long random_char_code(ch,offset)
long ch;
long offset;
{

	return(random_array_3[ (offset + (ch & 0xFF)) % 256]);
}
*/

/* Looks up random_array_3 at (offset + ch), masked to the low 8 bits. */
#define random_char_code(ch,offset)\
       (random_array_3[ (offset + (ch ) ) & 0xff])

/* Hashes the NUL-terminated word wd and reduces the result modulo below_n.
 *
 * For the character at position p the scrambled code
 * random_char_code(character, p) is taken as-is when p is even and shifted
 * left by 8 bits when p is odd; random_array_2[p % 256] is added and the
 * sum is XORed into the running hash.
 *
 * below_n must be non-zero (it is a divisor).
 * assumes the word has been downcased already
 */
static long hash_word(wd,below_n)
char *wd;
long below_n;
{
	char *cursor;
	long position = 0;
	long result = 0;

	for (cursor = wd; *cursor != '\0'; cursor++) {
		long scrambled = random_char_code((long)*cursor, position);

		if (position & 1)	/* odd positions feed the next byte up */
			scrambled = scrambled << 8;
		result ^= random_array_2[position % 256] + scrambled;
		position++;
	}
	return(result % below_n);
}

#endif /* ndef FAST_HASH */

static long hash_word_2 _AP((char *wd));
/* Secondary hash code stored in each word_entry.
 * Calls hash_word() with the modulus
 * 2^(8 * DICTIONARY_ENTRY_HASH_CODE_SIZE) - 2 and returns that result
 * plus one. */
static long hash_word_2(wd)
char *wd;
{
  long modulus = (1L << (8 * DICTIONARY_ENTRY_HASH_CODE_SIZE)) - 2;

  return(hash_word(wd, modulus) + 1);
}

 
/* ================================
   ===  Word Occurance Buffers  ===
   ================================ */

/* Word occurance buffers
 * This is a simple memory allocator for use with the word memory hashtable.
 * Since the buffers are tiny, this is done as a copy-sweep GC scheme.
 * Oh, I long for the storage system of lisp.
 */
char *first_word_occurance_buffer = NULL;  /* head of the buffer chain; allocate blocks out of this */
char *last_word_occurance_buffer = NULL;   /* buffer that blocks are currently carved from */
long word_occurance_block_length = 256000;  /* maybe this should be larger? */
char * word_occurance_free_ptr = NULL;      /* next unused byte in the last buffer */

/* Allocates a word occurrence block of `size` bytes out of the buffers.
 *
 * Buffers are obtained from s_malloc() and chained together: the first
 * sizeof(char *) bytes of every buffer hold the pointer to the next buffer
 * (NULL for the last one).  Blocks are handed out sequentially from the
 * last buffer; a new buffer of at least word_occurance_block_length bytes
 * (more if `size` needs it) is started when the current one cannot take
 * the request.  Blocks are never released one by one; the whole chain is
 * freed by flush_word_occurance_buffers().
 *
 * Room is measured with a pointer difference, not by casting pointers to
 * long, and the chain header is sized as the char * that is stored in it.
 * A request that is too big for a standard buffer now gets exactly one
 * buffer even when it is the very first request.
 *
 * Returns a pointer to the block; this function does not itself
 * initialize the returned bytes.
 */
char *make_word_occurrance_block(size)
long size;

{
  long header_size = (long)sizeof(char *);
  int need_new_buffer;

  if(NULL == first_word_occurance_buffer){
    /* nothing allocated yet (or everything was flushed) */
    need_new_buffer = 1;
  }
  else{
    long used = (long)(word_occurance_free_ptr - last_word_occurance_buffer);
    need_new_buffer = (used + size >= word_occurance_block_length);
  }

  if(need_new_buffer){
    long buffer_size = MAX(word_occurance_block_length, header_size + size);
    char *new_block = (char *)s_malloc((size_t)buffer_size);

    if(NULL == new_block){
      panic("Out of memory.");
    }
    *(char **)new_block = NULL; /* set the end of the chain */
    if(NULL == first_word_occurance_buffer){
      first_word_occurance_buffer = new_block;
    }
    else{
      *(char **)last_word_occurance_buffer = new_block;
    }
    last_word_occurance_buffer = new_block;
    word_occurance_free_ptr = new_block + header_size;
  }

  /* allocate away */
  { char * answer = word_occurance_free_ptr;
    word_occurance_free_ptr += size;
    return(answer);
  }
}

/* Releases a single word occurrence block.
 * Intentionally does nothing: the block argument is ignored.  The function
 * is kept so that a malloc-per-block scheme can be restored on systems
 * where that is a win. */
void free_word_occurance_block(block)
char *block;
{
  /* old way s_free(block); */
  (void)block;
}

static void flush_word_occur_bufs_internal
  _AP((char* head_of_list));

/* Frees every buffer in the chain starting at head_of_list.
 * The first pointer-sized bytes of each buffer hold the address of the
 * next buffer, so the link is read before the buffer is handed to
 * s_free().  This should be done with care. */
static void flush_word_occur_bufs_internal(head_of_list)
char* head_of_list;
{
  char *current = head_of_list;

  while(NULL != current){
    char *following = *(char **)current;	/* read the link first */

    s_free(current);
    current = following;
  }
}

/* Frees the whole chain of word occurrence buffers and resets the
 * allocator globals to NULL, so that the next allocation starts a fresh
 * chain.  Every block handed out earlier becomes invalid; this should be
 * done with care. */
void flush_word_occurance_buffers()
{
  char *chain = first_word_occurance_buffer;

  first_word_occurance_buffer = NULL;
  last_word_occurance_buffer = NULL;
  word_occurance_free_ptr = NULL;
  flush_word_occur_bufs_internal(chain);
}


/* Placeholder for a copying collector: the plan is to go through the
 * word_memory_hashtable, copy what is still needed into another list of
 * buffers, then flush the old ones.  Not needed yet, so this currently
 * does nothing and ignores its argument. */
void gc_word_occurance_buffers(the_word_memory_hashtable)
word_memory_hashtable * the_word_memory_hashtable;

{
  (void)the_word_memory_hashtable;
}


/* ===============================
   ===  Word Memory Hashtable  ===
   =============================== */

static long find_location _AP((char* word,word_memory_hashtable* 
			       the_word_memory_hashtable));

static long 
find_location(word,the_word_memory_hashtable)
char* word;
word_memory_hashtable* the_word_memory_hashtable;
/* returns the location that the word should go (or is).  returns -1 if 
 * the hashtable is full and the word is not there
 */
{
  long hash_code = hash_word(word, the_word_memory_hashtable->size);
  long i;
  long hash_code_2 = hash_word_2(word);

  for(i = hash_code; i < (hash_code + the_word_memory_hashtable->size); 
      i++){
    long index = i % the_word_memory_hashtable->size; 
    if(NULL == the_word_memory_hashtable->contents[index]){
      /* found an open spot, return it */
      return(index);
    }
    else 
      if(hash_code_2 == the_word_memory_hashtable->contents[index]->hash_code
	 &&
	 strcmp(word, the_word_memory_hashtable->contents[index]->word) == 0){
	/* we win, return it */
	return(index);
      }
    /* keep looking */
  }
  return(-1);
}

/* this pushes all word entries to the top of the word_memory_hashtable
 * therefore messing up the hashing order, but allows for quick sorting
 * just before dumping to disk.
 */
/* Packs the non-NULL slots of the contents array into its leading
 * positions, keeping their relative order.  Slots past the packed region
 * are not cleared, and the result can no longer be used for hashed
 * lookups. */
void collapse_word_memory_hashtable(the_word_memory_hashtable)
word_memory_hashtable *the_word_memory_hashtable;
{
  word_entry **slots = the_word_memory_hashtable->contents;
  long total = the_word_memory_hashtable->size;
  long packed = 0;
  long scan;

  for(scan = 0; scan < total; scan++){
    if(NULL != slots[scan]){
      slots[packed] = slots[scan];
      packed++;
    }
  }
}

static int word_entry_compare _AP((word_entry**i,word_entry** j));

static int word_entry_compare(i,j)
word_entry **i;
word_entry **j;
{
  return(strcmp((*i)->word, (*j)->word));
}

/* assumes that the word_memory_hashtable has been compressed */
void sort_word_memory_hashtable(the_word_memory_hashtable)
word_memory_hashtable *the_word_memory_hashtable;
{
  qsort(the_word_memory_hashtable->contents,
	the_word_memory_hashtable->number_of_entries,
	(size_t)sizeof(char *),
	word_entry_compare);
}

      
/* for	 debugging */
void print_word_memory_hashtable(the_word_memory_hashtable)
word_memory_hashtable* the_word_memory_hashtable;
{
  if (NULL == the_word_memory_hashtable){
    cprintf(PRINT_AS_INDEXING, "No Hashtable allocated\n");
    return;
  }
  cprintf(PRINT_AS_INDEXING, "Number of entries possible: %ld\n", 
	  the_word_memory_hashtable->size);
  cprintf(PRINT_AS_INDEXING, "Number of entries allocated: %ld\n",
	  the_word_memory_hashtable->number_of_entries);
  if(NULL != the_word_memory_hashtable->contents){
    long i;
    /* print the entries */
    printf("The entries are:\n");
    for(i = 0; i < the_word_memory_hashtable->size; i++){
      if(NULL != the_word_memory_hashtable->contents[i]){
	printf(" Position: %ld word: \"%s\" %ld occurances\n", i, 
	       the_word_memory_hashtable->contents[i]->word,
	       the_word_memory_hashtable->contents[i]->number_of_occurances);	
      }
    }
  }
}

static word_entry* look_up_word _AP((char* word,word_memory_hashtable*
				     the_word_memory_hashtable));
  
static word_entry* 
look_up_word(word,the_word_memory_hashtable)
char* word;
word_memory_hashtable* the_word_memory_hashtable;
{
  /* looks up the word in the dictionary and returns
   * a pointer to the word_entry.
   * If is not present, then it mallocs a new word entry.
   */
  /* this is a pretty dumb hashing scheme XXX */
  long index = find_location(word, the_word_memory_hashtable);
  if(-1 == index){
    panic("the hashtable is completely full.  It should have been grown\n");
  }
  if(NULL == the_word_memory_hashtable->contents[index]){
    /* make a new entry */
    word_entry *new_entry = 
      &the_word_memory_hashtable->word_entry_block
	[the_word_memory_hashtable->number_of_entries++];

    if(NULL == new_entry){
      panic("malloc failed for word_entry\n"); 
    }
    strncpy(new_entry->word, word, MAX_WORD_LENGTH);
    new_entry->hash_code = hash_word_2(word);      
    new_entry->number_of_occurances = 0;
    new_entry->memory_ptr = 
      make_word_occurrance_block(WORD_MEMORY_INIT_BLOCK_SIZE);
    new_entry->current_memory_ptr = new_entry->memory_ptr;
    new_entry->memory_size = WORD_MEMORY_INIT_BLOCK_SIZE;
    new_entry->current_doc_id = 0;
	    
    the_word_memory_hashtable->contents[index] = new_entry;
    return(new_entry);
  }
  else{
    return(the_word_memory_hashtable->contents[index]);
  }
}

static unsigned char add_weight _AP((long current_weight,long new_weight));

/* Adds new_weight to current_weight, saturating at 127.
 * 127 is used as the ceiling instead of 255 because the larger value does
 * not work on all compilers.  Sums that do not exceed 127 are returned
 * converted to unsigned char.
 * this should be smarter than this, like doing the log or something */
static unsigned char 
add_weight(current_weight,new_weight)
long current_weight;
long new_weight;
{
  long total = current_weight + new_weight;

  return((unsigned char)((total > 127) ? 127 : total));
}

static char* more_memory _AP((char* current_memory_ptr,
			      long current_memory_size,
			      long new_size));

/* Gives a word_entry a bigger occurrence block.
 * A block of new_size bytes is taken from make_word_occurrance_block(),
 * zero-filled, and the first current_memory_size bytes of the old block
 * are copied into it.  The old block is left as it is.
 * Panics when asked to shrink the block or when no block is obtained.
 * Returns the new block. */
static char* more_memory(current_memory_ptr,current_memory_size,new_size)
char* current_memory_ptr;
long current_memory_size;
long new_size;
{
  char *grown;

  if(new_size < current_memory_size){
    panic("trying to contract a word_entry block.  This is not right\n");
  }

  grown = make_word_occurrance_block(new_size);
  if(NULL == grown){
    panic("Out of memory.");
  }

  memset(grown, 0, new_size);
  memmove(grown, current_memory_ptr, (size_t)current_memory_size); 
  return(grown);
}

static long more_memory_size _AP((long current_size,
				  long number_of_occurances));

/* Chooses the next size for a word_entry's occurrence block: twice the
 * current size, but never less than WORD_MEMORY_INIT_BLOCK_SIZE.
 * number_of_occurances is not consulted.
 * This is pretty important to get right.  This is a place holder */
static long more_memory_size(current_size,number_of_occurances)
long current_size;
long number_of_occurances;
{
  long doubled = 2 * current_size;

  return(MAX(doubled, WORD_MEMORY_INIT_BLOCK_SIZE));
}

static long write_bytes_to_memory _AP((long value,long size,char* ptr));

/* Stores the low `size` bytes of `value` at ptr, most significant byte
 * first: the block is filled from its last byte backwards, so the least
 * significant byte lands at ptr[size - 1].
 * Panics on a negative size.
 * Returns the number of bytes written, i.e. size. */
static long write_bytes_to_memory(value,size,ptr)
long value;
long size;
char* ptr;
{
  char *cursor;
  long remaining;

  if(size < 0) /* paranoia */
    panic("attempting to write a negative number of bytes");

  cursor = ptr + size; /* one past the last byte of the field */
  for(remaining = size; remaining > 0; remaining--){
    *--cursor = value & 0xFF;
    value = value >> 8;
  }
  return(size);
}
			
/* Adds one occurrence of a word to the word_memory_hashtable of db.
 *
 * Before the word is added, the table is flushed with
 * flush_memory_hashtable_to_disk(db, false) when the word entry block is
 * full or when number_of_words_indexed has reached flush_after_n_words.
 *
 * The first occurrence of the word in a given doc_id appends a record of
 * doc_id (DOCUMENT_ID_SIZE bytes), char_pos (CHARACTER_POSITION_SIZE
 * bytes) and weight + 5 (WEIGHT_SIZE bytes) to the entry's occurrence
 * memory, growing that memory first when fewer than
 * DICTIONARY_ELEMENT_SIZE bytes are left.  Further occurrences in the same
 * document only raise the last byte written (the low-order weight byte of
 * the most recent record) through add_weight(); their character position
 * is not recorded.  Once an entry has more than MAX_OCCURANCES
 * occurrences, nothing further is stored for it.
 *
 * line_pos and date are not used by this function.
 * Always returns 0. See irext.h for more documentation.
 *
 * NOTE(review): word is passed to look_up_word() without a NULL test,
 * although the parameter comment below mentions NULL -- confirm that
 * callers never pass NULL here.
 */
long add_word(word, char_pos, line_pos,
	      weight, doc_id, date, db)
     char *word;	/* the word to be indexed, this could be a
			   word pair. If NULL there are no more words
			   to be indexed */
     long char_pos;	/* the position of the start of the
			   word */
     long line_pos;	/* this is passed for the best
			   section calculation */
     long weight;	/* how important the word looks
			   syntactically (such as is it bold)
			   NOT used by signature system */
     long doc_id; 	/* current document, this will never be 0 */
     time_t date; /* display day of this document, 0 if not known */
     database* db; /* database to insert the document */
{
  /* look up the word in the word_memory_hashtable */
  /* creates it if necessary */	
  word_entry* wrd_entry;
  word_memory_hashtable * the_word_memory_hashtable = db->the_word_memory_hashtable;
  /* printf("Word: '%s' doc_id: %ld, pos: %ld, weight: %ld\n",
     word, doc_id, char_pos, weight); */
  
  if(NULL == db->the_word_memory_hashtable){
    panic("The memory word hashtable is not defined.");
  }

  /* if we have filled up the hashtable, or if we have indexed enough words
     flush the memory copies to disk */
  if((the_word_memory_hashtable->number_of_entries ==
      the_word_memory_hashtable->word_entry_block_size) ||
     (the_word_memory_hashtable->number_of_words_indexed ==
      the_word_memory_hashtable->flush_after_n_words))
    flush_memory_hashtable_to_disk(db, false);
  
  /* count the word, then find (or create) its entry */
  the_word_memory_hashtable->number_of_words_indexed ++;
  wrd_entry = look_up_word(word, the_word_memory_hashtable);
  wrd_entry->number_of_occurances ++;

  if(wrd_entry->number_of_occurances > MAX_OCCURANCES){
    /* do nothing. we have enough of that word */
  }
  else{
    /* we have a word to add */
    if(doc_id != wrd_entry->current_doc_id){
      /* then we have a new doc_id to add to the memory block */
      wrd_entry->current_doc_id = doc_id;
          
      /* check to see if we need more memory:
	 bytes still unused in the block versus DICTIONARY_ELEMENT_SIZE */
      if((wrd_entry->memory_size -
	  (wrd_entry->current_memory_ptr - 
	   wrd_entry->memory_ptr) 
	  < 
	  DICTIONARY_ELEMENT_SIZE)){
	/* we need more memory.  more_memory() copies the old bytes into a
	   new block; the old block is not released here */
	char* old_memory_ptr = wrd_entry->memory_ptr;
 
	long new_size = 
	  more_memory_size(wrd_entry->memory_size,
			   wrd_entry->number_of_occurances);
	/* cprintf(PRINT_AS_INDEXING, "Get more memory %ld bytes for %s\n", new_size, word); */
	wrd_entry->memory_ptr = 
	  more_memory(wrd_entry->memory_ptr, wrd_entry->memory_size,
		      new_size);
	/* keep the write position at the same offset inside the new block */
	wrd_entry->current_memory_ptr = 
	  wrd_entry->memory_ptr + /* new offset */
	    (wrd_entry->current_memory_ptr - old_memory_ptr);
	/* just being paranoid... no longer illegal
	   if(wrd_entry->current_memory_ptr == wrd_entry->memory_ptr)
	   panic("After allocating more memory, the size went to 0");
	   */
	wrd_entry->memory_size = new_size;
      }				/* finished making more memory */

      /* add away: each call returns the number of bytes it wrote, which
	 advances the write position */
      wrd_entry->current_memory_ptr +=
	write_bytes_to_memory(doc_id, DOCUMENT_ID_SIZE,
			      wrd_entry->current_memory_ptr);
      wrd_entry->current_memory_ptr +=
	write_bytes_to_memory(char_pos, 
			      CHARACTER_POSITION_SIZE,
			      wrd_entry->current_memory_ptr);
      wrd_entry->current_memory_ptr +=
	write_bytes_to_memory(weight + 5, /* add 5 since for the first one */
			      WEIGHT_SIZE,
			      wrd_entry->current_memory_ptr);
    }
    else{
      /* The word is already there,
       * just increment the weight in the record.
       * This will change when/if position information is kept (for proximity).
       */
      if(wrd_entry->current_memory_ptr == wrd_entry->memory_ptr){
	/* same doc_id as recorded, yet nothing has been written */
	panic("Memory hashtable error. Recorded doc_id %ld, current doc_id %ld\n",
	      wrd_entry->current_doc_id, doc_id);
      }
      /* the byte just before the write position is the last weight byte */
      *(wrd_entry->current_memory_ptr - 1) =
	add_weight(*(wrd_entry->current_memory_ptr - 1), weight);
    }
  }
  return(0L);
}

/* Adds the stop words to the hashtable.  this must be done before
 * adding other words.
 * The stop list is restarted with init_stop_list() and read with
 * next_stop_word() until that returns NULL; every word is entered through
 * look_up_word() and its number_of_occurances is set to STOP_WORD_FLAG. */
void add_stop_words(the_word_memory_hashtable)
word_memory_hashtable *the_word_memory_hashtable;
{
  char *stop_word;

  init_stop_list();
  for(stop_word = next_stop_word();
      NULL != stop_word;
      stop_word = next_stop_word()){
    word_entry *marked = look_up_word(stop_word, the_word_memory_hashtable);

    marked->number_of_occurances = STOP_WORD_FLAG;
  }
}

/* this clears the contents of the word_memory_hashtable:
 * all `size` slots of the contents array are zeroed and the
 * number_of_entries and number_of_words_indexed counters are reset to 0.
 * The word_entry block and the word occurrence buffers are not touched.
 * The byte count uses the size of one slot of the array (a word_entry
 * pointer) rather than sizeof(size_t). */
void clear_word_memory_hashtable(the_word_memory_hashtable)
word_memory_hashtable *the_word_memory_hashtable;
{
  memset((char*)the_word_memory_hashtable->contents, 0,
	 ((size_t)the_word_memory_hashtable->size * 
	  sizeof(the_word_memory_hashtable->contents[0])));
  the_word_memory_hashtable->number_of_entries = 0;
  the_word_memory_hashtable->number_of_words_indexed = 0;
}


/* Creates a word_memory_hashtable and preloads it with the stop words.
 *
 * size is in the number of entries (slots) of the hash table; the table's
 * word_entry block holds size / 2 entries.
 * flush_after_n_words sets the hashtable flush parameter.
 * the_word_memory_hashtable, when not NULL, is an existing table: its
 * contents array and word_entry block are freed and the word occurrence
 * buffers are flushed before the new table is built.
 *
 * Returns a pointer to the newly allocated table (not TRUE), or NULL if
 * an allocation failed and panic() returned.
 *
 * NOTE(review): the old word_memory_hashtable structure itself is not
 * freed here; whether the caller releases it could not be confirmed from
 * this file.
 */
word_memory_hashtable * init_word_memory_hashtable(size,flush_after_n_words,the_word_memory_hashtable)
long size;
long flush_after_n_words;
word_memory_hashtable* the_word_memory_hashtable;
{
  if(NULL != the_word_memory_hashtable){
    /* then dispose of the old one */
    if(NULL != the_word_memory_hashtable->contents)
      s_free(the_word_memory_hashtable->contents);
    if(NULL != the_word_memory_hashtable->word_entry_block)
      s_free(the_word_memory_hashtable->word_entry_block);
    flush_word_occurance_buffers();
  }
  the_word_memory_hashtable = 
    (word_memory_hashtable*)s_malloc((size_t)sizeof(word_memory_hashtable));
  if(NULL == the_word_memory_hashtable){
    panic("Could not malloc for the word hashtable\n");
    return(NULL);
  }

  the_word_memory_hashtable->size = size;
  
  the_word_memory_hashtable->word_entry_block_size = size / 2;
	
  /* one word_entry pointer per slot */
  the_word_memory_hashtable->contents = 
    (word_entry **)s_malloc((size_t)(the_word_memory_hashtable->size
					 * sizeof(word_entry *)));
  the_word_memory_hashtable->word_entry_block =
    (word_entry *)s_malloc((size_t)(the_word_memory_hashtable->word_entry_block_size
				    * sizeof(word_entry)));

  /* check both allocations before anything is written into them; an
     empty entry block is not treated as a failure */
  if(NULL == the_word_memory_hashtable->contents ||
     (the_word_memory_hashtable->word_entry_block_size > 0 &&
      NULL == the_word_memory_hashtable->word_entry_block)){
    panic("Could not malloc for the word hashtable\n");
    return(NULL);
  }

  clear_word_memory_hashtable(the_word_memory_hashtable);

  /* add the stopwords to the index */
  add_stop_words(the_word_memory_hashtable);
	 
  the_word_memory_hashtable->flush_after_n_words = 
    flush_after_n_words;

  the_word_memory_hashtable->growth_factor = 2.0;
  the_word_memory_hashtable->grow_when_this_full = .5;
  
  return(the_word_memory_hashtable);
}