/* WIDE AREA INFORMATION SERVER SOFTWARE:
   No guarantees or restrictions.  See the readme file for the full standard
   disclaimer.	
  
*/


/* Retrieval part of the serial IR engine.  If you are using a different
   storage system for the documents, replace this file. */

#include "irretrvl.h"
#include <string.h>
#include "futil.h"
#include <ctype.h>  /* for isspace */

/*----------------------------------------------------------------------*/


static boolean parseDocID
  _AP((DocObj* doc,char* filename,long* start_character,
       long* end_character,long* errorCode));

/* parseDocID: split the local part of doc's document id into its three
   whitespace-separated fields: "<start> <end> <filename>".

   doc             - the document whose DocumentID is parsed.
   filename        - receives the file name; must have room for
                     MAX_FILENAME_LEN + 1 characters (both callers in this
                     file declare it that way).  Always NUL-terminated on
                     success; longer names are truncated.
   start_character - receives the first numeric field.
   end_character   - receives the second numeric field.
   errorCode       - set to GDT_BadDocID on failure; untouched on success.

   Returns true on success.  Returns false (after logging) when a field is
   missing, empty, or not a decimal number; the outputs are then
   unspecified.
 */
static boolean
parseDocID(doc,filename,start_character,end_character,errorCode)
DocObj* doc;
char* filename;
long* start_character;
long* end_character;
long* errorCode;
{
  DocID* theDocID = NULL;
  char* local_id = NULL;
  long startSep;		/* index of the separator after <start> */
  long endSep;			/* index of the separator after <end> */
  long i;

  theDocID = docIDFromAny(doc->DocumentID);

  local_id = anyToString(GetLocalID(theDocID));

  freeDocID(theDocID);

  /* NOTE(review): anyToString's failure behaviour is not visible from this
     file; guard defensively rather than dereference NULL below. */
  if (local_id == NULL)
   {
     waislog(WLOG_HIGH, WLOG_ERROR,
	     "attempt to retrieve data for an unreadable doc-id");
     *errorCode = GDT_BadDocID;
     return(false);
   }

  /* Find the end of the <start> field.  The cast keeps isspace() defined
     for characters with the high bit set. */
  for (i = 0;
       local_id[i] != '\0' && !isspace((unsigned char)local_id[i]);
       i++)
    ;
  if (local_id[i] == '\0')
   {
     waislog(WLOG_HIGH, WLOG_ERROR,
	     "Attempt to retrieve data for bad doc-id: '%s'",local_id);
     *errorCode = GDT_BadDocID;
     s_free(local_id);
     return(false);
   }
  startSep = i;

  /* Find the end of the <end> field. */
  for (++i;
       local_id[i] != '\0' && !isspace((unsigned char)local_id[i]);
       i++)
    ;
  if (local_id[i] == '\0')
   {
     waislog(WLOG_HIGH, WLOG_ERROR,
	     "attempt to retrieve data for bad doc-id: '%s'",
	     local_id);
     *errorCode = GDT_BadDocID;
     s_free(local_id);
     return(false);
   }
  endSep = i;

  /* Both fields start on a non-space character and %ld stops at the first
     non-digit, so the id can be scanned in place without cutting it up.
     That keeps the whole id available for the log message.  Empty fields
     are rejected explicitly because %ld would otherwise skip the
     separator and read the following field instead. */
  if (startSep == 0 || endSep == startSep + 1 ||
      sscanf(local_id,"%ld",start_character) != 1 ||
      sscanf(local_id + startSep + 1,"%ld",end_character) != 1)
   {
     waislog(WLOG_HIGH, WLOG_ERROR,
	     "attempt to retrieve data for bad doc-id: '%s'",
	     local_id);
     *errorCode = GDT_BadDocID;
     s_free(local_id);
     return(false);
   }

  /* Everything after the second separator is the file name.  strncpy does
     not terminate the destination when the source fills it, so do it
     here. */
  strncpy(filename,local_id + endSep + 1,MAX_FILENAME_LEN);
  filename[MAX_FILENAME_LEN] = '\0';

  s_free(local_id);
  return(true);
}


/*----------------------------------------------------------------------*/

/* getData: fetch the raw bytes of doc from the file named in its doc id.

   doc          - the document (or byte chunk of it) to fetch.  Only
                  CT_byte and CT_document chunk codes are handled; for
                  CT_byte the chunk positions are offsets relative to the
                  document's start within the file.
   databaseName - only referenced by the disabled log call below.
   errorCode    - set to GDT_NoError on success, otherwise to the reason
                  for failure (GDT_UnsupportedChunkType, GDT_MissingDocID,
                  GDT_BadRange, or whatever parseDocID reported).

   Returns a new WAISDocumentText holding the bytes, or NULL on error.
   The file is closed and the read buffer released on every error path.
 */
WAISDocumentText* getData(doc, databaseName, errorCode)
DocObj* doc;
char *databaseName;
long* errorCode;
/* it isn't text, so we can just grab data */
{
  FILE* file = NULL;
  char fileName[MAX_FILENAME_LEN + 1];
  WAISDocumentText* data = NULL;
  long start,end;		/* position of the document in the file */
  long startByte,endByte,bytes,bytesRead; /* part of the doc that we want */
  char* buffer = NULL;
  any* bufAny = NULL;

  /* we can only handle byte chunks here */
  if ((doc->ChunkCode != CT_byte) &&
      (doc->ChunkCode != CT_document))
    {
      waislog(WLOG_HIGH, WLOG_ERROR,
	      "search engine can only use whole documents or byte offsets for data lookup");
      *errorCode = GDT_UnsupportedChunkType;
      return(NULL);
    }

  if (parseDocID(doc,fileName,&start,&end,errorCode) == false)
    return(NULL);

  file = s_fopen(fileName,"rb");
  if (file == NULL)
    {
      waislog(WLOG_HIGH, WLOG_ERROR,
	      "attempt to retrieve data for missing doc-id: '%s'",
	      fileName);
      *errorCode = GDT_MissingDocID;
      return(NULL);
    }

  if (doc->ChunkCode == CT_byte) {
    startByte = doc->ChunkStart.Pos + start;
    endByte = doc->ChunkEnd.Pos + start;
  }
  else {
    startByte = start;
    endByte = end;
  }

#if 0
  /* Causing problems on vax... (presumably the original %d specifiers,
     which did not match the long arguments; they are %ld now, but the
     call is left disabled) */
  waislog(WLOG_LOW, WLOG_RETRIEVE,
	  "Retrieving DocID: %ld %ld %s, byte: %ld %ld, from database %s",
	  start, end, fileName, startByte, endByte, databaseName);
#endif

  /* an end of 0 disables the upper-bound check */
  if (endByte > end && end != 0)
    {
      waislog(WLOG_HIGH, WLOG_ERROR,
	      "retrieval beyond bounds of document %ld in file <%s>",
	      endByte,fileName);
      *errorCode = GDT_BadRange;
      goto fail;
    }

  /* a negative offset or length must never reach fseek/s_malloc/fread */
  if (startByte < 0 || endByte < startByte)
    {
      waislog(WLOG_HIGH, WLOG_ERROR,
	      "retrieval of invalid byte range %ld-%ld in file <%s>",
	      startByte,endByte,fileName);
      *errorCode = GDT_BadRange;
      goto fail;
    }

  /* get the bytes */
  if (fseek(file,startByte,SEEK_SET) != 0)
    {
      waislog(WLOG_HIGH, WLOG_ERROR,
	      "retrieval can't seek to %ld in file <%s>",startByte,
	      fileName);
      *errorCode = GDT_BadRange;
      goto fail;
    }

  bytes = endByte - startByte;
  buffer = (char*)s_malloc(bytes);

  bytesRead = (long)fread((void*)buffer,(size_t)sizeof(char),
			  (size_t)bytes,file);

  if (bytesRead != bytes)
    {
      waislog(WLOG_HIGH, WLOG_ERROR,
	      "retrieval error in file <%s>",fileName);
      *errorCode = GDT_BadRange;
      goto fail;
    }

  bufAny = makeAny(bytesRead,buffer);

  data = makeWAISDocumentText(duplicateAny(doc->DocumentID),0L,bufAny);

  /* the any and the buffer are freed by freeWAISSearchResponse() */
  s_fclose(file);

  *errorCode = GDT_NoError;

  return(data);

 fail:
  /* errorCode has already been set by whoever jumped here */
  if (buffer != NULL)
    {
      s_free(buffer);
    }
  s_fclose(file);
  return(NULL);
}

/*----------------------------------------------------------------------*/

/* Size, in bytes, of the scratch buffer getDocumentText() reads into while
   it counts line breaks. */
#define BUFSZ	(size_t)5000

/* getDocumentText: find the text for doc, cut out the requested line
   range, and return it as a new WAISDocumentText.

   doc          - the document chunk to fetch.  Only CT_line chunks are
                  handled; ChunkStart.Pos and ChunkEnd.Pos are line numbers
                  counted from the document's start offset in the file.
   databaseName - must not be NULL; used only for logging here.
   errorCode    - set to GDT_NoError on success, otherwise to the reason
                  for failure (GDT_UnsupportedChunkType,
                  GDT_MissingDatabase, GDT_MissingDocID, GDT_BadRange, or
                  whatever parseDocID reported).

   Returns the text, or NULL on error.  The file is closed and any scratch
   buffer released on every error path.

   NOTE(review): the line scan is not limited by the end offset from the
   doc id, so a line range past the document's last line runs on into
   whatever follows it in the file -- confirm whether that is intended.
 */
WAISDocumentText* getDocumentText(doc, databaseName, errorCode)
DocObj* doc;
char *databaseName;
long* errorCode;
{
  WAISDocumentText* text = NULL;
  FILE* file = NULL;
  char* buffer = NULL;
  any* bufAny = NULL;
  char filename[MAX_FILENAME_LEN + 1];
  long start_character;
  long end_character;
  register long i;
  long bytes,bytesRead;
  long startByte,endByte,byte,lines;

  /* we can only handle line chunks for now */
  if (doc->ChunkCode != CT_line)
   {
     waislog(WLOG_HIGH, WLOG_ERROR,
	     "search engine can only use line offsets for now.");

     *errorCode = GDT_UnsupportedChunkType;
     return(NULL);
   }

  if (parseDocID(doc,filename,&start_character,&end_character,errorCode) ==
      false)
    return(NULL);

  /* check the database before its name is handed to a %s conversion */
  if (NULL == databaseName)
   {
     *errorCode = GDT_MissingDatabase;
     return(NULL);
   }

  /* every numeric argument is a long, so the conversions must be %ld */
  waislog(WLOG_LOW, WLOG_RETRIEVE,
	  "Retrieving DocID: %ld %ld %s, line range: %ld %ld, from database %s",
	  start_character, end_character, filename,
	  (long)doc->ChunkStart.Pos, (long)doc->ChunkEnd.Pos,
	  databaseName);

  file = s_fopen(filename,"r");
  if (file == NULL)
   {
     /* log the file name: DocumentID is not a C string (this file hands it
	to docIDFromAny/duplicateAny), so it cannot be printed with %s */
     waislog(WLOG_HIGH, WLOG_ERROR,
	     "attempt to retrieve text for bad doc-id: '%s'",
	     filename);

     *errorCode = GDT_MissingDocID;
     return(NULL);
   }

  if (0 != fseek(file, start_character, SEEK_SET))
   {
     waislog(WLOG_HIGH, WLOG_ERROR,
	     " error on attempt to seek into file");

     *errorCode = GDT_MissingDocID;
     goto fail;
   }

  /* find the start byte: count line breaks until ChunkStart.Pos lines have
     gone by; byte is then the offset, relative to start_character, of the
     first wanted character */
  buffer = (char*)s_malloc(BUFSZ);
  lines = byte = 0;
  while (lines < doc->ChunkStart.Pos)
   { /* search a buffer full */
     bytesRead = (long)fread(buffer,(size_t)sizeof(char),BUFSZ,file);
     for (i = 0; i < bytesRead && lines < doc->ChunkStart.Pos; i++, byte++)
      { if (buffer[i] == '\n' || buffer[i] == '\r')
	  /* \r should not happen because we are reading the file in text
	     mode */
	  lines++;
      }
     if (bytesRead == 0) /* cheap handling of files that don't end with nl:
			    each empty read counts as a line so the loop
			    ends */
       lines++;
   }
  startByte = byte;

  beFriendly();

  /* find the end byte */ /* this could be done while getting the bytes XXX */
  /* search starting from the start pos (the read above may have overshot) */
  if (fseek(file,startByte + start_character,SEEK_SET) != 0)
   {
     waislog(WLOG_HIGH, WLOG_ERROR,
	     "retrieval can't seek to %ld in file <%s>",
	     startByte + start_character,filename);

     *errorCode = GDT_BadRange;
     goto fail;
   }

  beFriendly();

  while (lines < doc->ChunkEnd.Pos)
   { /* search a buffer full */
     bytesRead = (long)fread(buffer,(size_t)sizeof(char),BUFSZ,file);
     for (i = 0; i < bytesRead && lines < doc->ChunkEnd.Pos; i++, byte++)
      { if (buffer[i] == '\n' || buffer[i] == '\r')
	  /* \r should not happen, we are reading the file in text mode */
	  lines++;
      }
     if (bytesRead == 0) /* cheap handling of files that don't end with nl */
       lines++;
   }
  endByte = byte;

  beFriendly();

  /* done with the scan buffer; the result buffer is sized exactly below */
  s_free(buffer);
  buffer = NULL;

  /* get the bytes */
  if (fseek(file,startByte + start_character,SEEK_SET) != 0)
   {
     waislog(WLOG_HIGH, WLOG_ERROR,
	     "retrieval can't seek to %ld in file <%s>",
	     startByte + start_character,filename);

     *errorCode = GDT_BadRange;
     goto fail;
   }

  bytes = endByte - startByte;
  buffer = (char*)s_malloc(bytes);

  bytesRead = (long)fread((void*)buffer,(size_t)sizeof(char),
			  (size_t)bytes,file);

  if (bytesRead != bytes)
   {
     waislog(WLOG_HIGH, WLOG_ERROR,
	     "retrieval error in file <%s>",filename);

     *errorCode = GDT_BadRange;
     goto fail;
   }

  bufAny = makeAny(bytesRead,buffer);

  text = makeWAISDocumentText(duplicateAny(doc->DocumentID),0L,bufAny);

  /* the any and the buffer are freed by freeWAISSearchResponse() */
  s_fclose(file);

  *errorCode = GDT_NoError;

  return(text);

 fail:
  /* errorCode has already been set by whoever jumped here */
  if (buffer != NULL)
   {
     s_free(buffer);
   }
  s_fclose(file);
  return(NULL);
}
