/* WIDE AREA INFORMATION SERVER SOFTWARE:
   No guarantees or restrictions.  See the readme file for the full standard
   disclaimer.

   Brewster@think.com
*/

/* Hash table utilities */

#ifndef HUTIL_H
#define HUTIL_H

#include "irlex.h" /* for MAX_WORD_LENGTH */

/* Initial size of the in-memory word hashtable.
 * The table will be grown if needed.
 * NOTE(review): presumably a count of table slots rather than bytes --
 * confirm where the table is allocated.
 */
#define HASHTABLE_INITIAL_SIZE 65536L

/* The amount of memory for word occurrences, in bytes.
 * NOTE(review): going by the name, presumably the size of the first block
 * allocated for a word -- confirm at the allocation site.
 */
#define WORD_MEMORY_INIT_BLOCK_SIZE 10

/* This is the maximum number of occurrences that will be stored in the
 * disk table.  The number of occurrences will reflect the total number in
 * all files.  The theory is that if a word is very common, then it
 * is not very useful in discriminating between files.  Also, if it
 * is very common, then it takes up a lot of space.
 * Maybe this should be dependent on the number of documents indexed:
 * if a word is in every document, then it probably does not mean
 * much.
 * If this value is increased, the inverted file may not keep all the
 * references, because the maximum length of an index block is governed
 * by a size that can be represented in INDEX_BLOCK_SIZE_SIZE bytes.
 */
#define MAX_OCCURANCES 20000L

#define STOP_WORD_FLAG 0x40000000  /* Flag to be put in the number_of_occurances field of a word_entry so that
the count is always greater than the limit and no words will be collected.
NOTE(review): "the limit" is presumably MAX_OCCURANCES -- confirm. */

/* One word tracked by the in-memory word hashtable: the word text, its
 * occurrence count, and the memory buffer holding what will go into the
 * next block for that word.
 */
typedef struct word_entry{
  char word[MAX_WORD_LENGTH + 1];	/* The word text.  Described as "NULL
					   when empty"; since this is an array,
					   that presumably means an empty
					   string (word[0] == '\0') -- TODO
					   confirm.  Must be the first slot. */
  /* Open question from the original author: should this be the total
     weight? */
  long hash_code;		/* NOTE(review): presumably the hash of `word`;
				 * how it is computed is not visible here */
  long number_of_occurances;	/* total for the whole db.  STOP_WORD_FLAG
				 * is meant to be stored in this field to
				 * mark a word that should not be collected. */

  char* memory_ptr;		/* what will go into the next block */
  char* current_memory_ptr;	/* the fill ptr into memory_ptr */
  long memory_size;		/* the size of memory_ptr */
  long current_doc_id;		/* the last document-id in memory_ptr.
				 * Original note: "this will change a page
				 * pointer eventually".
				 */
} word_entry;

/* The in-memory word hashtable: a table of word_entry pointers plus the
 * block of entries, the counters, and the tuning parameters that control
 * flushing and growth.
 */
typedef struct word_memory_hashtable{
  long size;	/* number of elements that can be in the contents */
  long word_entry_block_size;  /* the maximum number of entries before flushing */
  long number_of_entries;	/* number of elements that are currently in
				 * the contents.
				 */
  word_entry** contents;	/* pointer to the word hashtable memory */
  word_entry *word_entry_block;	/* pointer to the block of entries */
  long number_of_words_indexed; /* total number of words indexed */
  long flush_after_n_words; 	/* number of words that should be accumulated
				 * before flushing.  The original author notes
				 * this should be handled dynamically rather
				 * than with a fixed count.
				 */
  double growth_factor;		/* amount to grow by when growing */
  double grow_when_this_full; 	/* fraction of full that triggers growth */
} word_memory_hashtable;

#endif /* ndef HUTIL_H */
