#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdlib.h>
#include <time.h>
#include "ctable.h"
#include "util.h"

/* Counting tables filled in by main() while it scans the log. */
table host_table;   /* hits per host key (updated with long_table_inc) */
table url_table;    /* hits and bytes per document (double_table_inc) */
table agent_table;  /* hits per browser name (single_table_inc) */

/* Log file read when no path is given on the command line. */
#define DEFAULT_LOGFILE "/var/local/www/logs/access_log.1"

/* Size in bytes of the buffer each log line is read into. */
#define BUFFER_SIZE 4096

/*
 * Size arguments handed to table_init() for the three tables.
 * Non-power-of-two sizes such as ((2<<18)-1) and ((2<<15)-3) were tried
 * as well, but they did not make much of a difference.
 */
#define HOST_TABLE_SIZE  (1 << 18)
#define URL_TABLE_SIZE   (1 << 19)
#define AGENT_TABLE_SIZE (1 << 12)

/*
 * Display names for the synthetic host keys main() substitutes for whole
 * address ranges.  The index is the last byte of the key: main() stores
 * htonl(1) for 207.115.0-63.x and htonl(2) for 152.163.x.x.
 */
static char *host_names[] = {
  NULL,       /* 0: not used as a group key */
  "Prodigy",  /* 1 */
  "AOL"       /* 2 */
};

/*
 * Splits one log line held in buf into fields, recording offsets into buf
 * in match_start[] / match_end[].  main() treats a non-zero return as an
 * unparseable line and reads fields 0 (host), 1 (hour), 2 (url),
 * 3 (status), 4 (size), 6 (agent) and the start of field 7 (timestamp).
 * The definition is not in this part of the file.
 */
int parse_entry (char *buf, int *match_start, int *match_end);

int main(int argc, char **argv)
{
  FILE *fp;
  char *logfile, *myname, buf[BUFFER_SIZE], start_time[30], end_time[30],
    *p, *q;
  unsigned long len, line, truncated_lines = 0, bad_lines = 0, status, host,
    size;
  unsigned long long total_hits = 0, total_bytes = 0, hits[24], bytes[24],
    mit_hits = 0;
  int match_start[9], match_end[9], i, j;
  long hour;
  unsigned long long scratch[101];
  generic_table_ent *indexes[101];
  single_table_ent *sent;
  double_table_ent *dent;
  long_table_ent *lent;
  time_t now;
  unsigned char *hostp;

  now = time(0);
  fprintf(stderr, " -- starting at %s", ctime(&now));
  memset(hits, 0, sizeof(hits));
  memset(bytes, 0, sizeof(bytes));
  start_time[0] = end_time[0] = 0;

  if (!(myname = strrchr(argv[0], '/')))
    myname = argv[0];
  else
    myname++;

  if (argc < 2)
    logfile = DEFAULT_LOGFILE;
  else
    logfile = argv[1];

  if (table_init(&host_table, HOST_TABLE_SIZE, "hosts") ||
      table_init(&url_table, URL_TABLE_SIZE, "urls") ||
      table_init(&agent_table, AGENT_TABLE_SIZE, "agents"))
  {
    fprintf(stderr, "%s: failed to allocate memory for tables\n", myname);
    exit(1);
  }

  if (!(fp = fopen(logfile, "r")))
  {
    fprintf(stderr, "%s: failed to open %s (%d)\n", myname, logfile, errno);
    exit(1);
  }

  /* read and handle each line */
  line = 0;
  while (fgets(buf, BUFFER_SIZE, fp) != 0)
  {
    len = strlen(buf);
    if (!len)
      continue;
    line++;
    if (buf[len-1] != '\n')
    {
      if (!feof(fp))
      {
	int i;
	truncated_lines++;
	while (fgets(buf, BUFFER_SIZE, fp) != 0)
	{
	  len += i = strlen(buf);
	  if (!i)
	    continue;
	  if (buf[i-1] == '\n')
	    break;
	}
	fprintf(stderr,
		"%s: line %ld too long (%ld > %d characters)\n",
		myname, line, len, BUFFER_SIZE-1);
	continue;
      }
      else
	fprintf(stderr, "%s: missing eol at eof\n", myname);
    }
    buf[len-1] = 0;

    if (parse_entry(buf, match_start, match_end))
    {
      fprintf(stderr,"%s: bad line %ld\n", myname, line);
      bad_lines++;
      continue;
    }

    if (!total_hits)
    {
      memcpy(start_time, buf+match_start[7], 28);
      start_time[28] = 0;
    }
    memcpy(end_time, buf+match_start[7], 28);
    end_time[28] = 0;

    buf[match_end[0]] = 0; /* host */
    buf[match_end[1]] = 0; /* hour */
    buf[match_end[2]] = 0; /* url */
    buf[match_end[3]] = 0; /* status */
    buf[match_end[4]] = 0; /* size */
    buf[match_end[6]] = 0; /* agent */
#if 0
    fprintf(stderr,"%s %s %s %s %s %s\n",
	    buf+match_start[0],buf+match_start[1],buf+match_start[2],
	    buf+match_start[3],buf+match_start[4],buf+match_start[6]);
#endif

    host = inet_addr(buf + match_start[0]);
    hostp = (unsigned char*)&host;
    if (hostp[0] == 18)
      mit_hits++;
    if (hostp[0] == 207 && hostp[1] == 115 && hostp[2] <= 63)
      host = htonl(1);
    else if (hostp[0] == 152 && hostp[1] == 163)
      host = htonl(2);
    long_table_inc(&host_table, host, 1);

    hour = atoi(buf + match_start[1]);
    if (hour < 0 || hour > 23)
      continue;
    status = atoi(buf + match_start[3]);
    size = atoi(buf + match_start[4]);

    total_hits++;
#if 0
    if (!(total_hits % 5000))
      fprintf(stderr, "processed %ld hits\n", total_hits);
#endif
    total_bytes += size;
    hits[hour]++;
    bytes[hour] += size;

    p = buf + match_start[2];
    len = strlen(p) > 4;
    q = buf + match_end[2];
    if (status == 404)
      p = "404 Not Found";
    else if (!strncmp(p, "/cgi/counter/", 13))
      p = "CGI Counter";
    else if (len &&
	     (!strcasecmp(q-4, ".gif") || !strcasecmp(q-4, ".xbm") ||
	      !strcasecmp(q-4, ".jpg") || !strcasecmp(q-5, ".jpeg")))
      p = "All Pictures";
    else if ((q = strchr(p, '?')) != 0 && q < buf+match_end[2])
      *q = 0;
    double_table_inc(&url_table, p, 1, size);

    p = buf + match_start[6];
    if (!memcmp(p, "Mozilla", 7) && strcasestr(p, "compatible"))
    {
      if (strcasestr(p, "AOL"))
      {
	if (strcasestr(p, "MSIE"))
	  p = "Mozilla compatible (MSIE/AOL)";
	else
	  p = "Mozilla compatible (AOL)";
      }
      else if (strcasestr(p, "MSIE"))
	p = "Mozilla compatible (MSIE)";
      else
	p = "Mozilla compatible";
    }
    else
    {
      if ((p = strchr(p, '/')) != 0)
	*p = 0;
      p = buf + match_start[6];
    }
    single_table_inc(&agent_table, p, 1);
  }

  if (!feof(fp))
    fprintf(stderr, "%s: read error before end of file\n", myname);

  fclose(fp);

  /* output statistics */
  if (start_time[0] && end_time[0])
    printf("<P>Summary period: %s to %s</P>\n\n", start_time, end_time);

  printf("<P>%llu hits, for a total of %llu bytes",total_hits,total_bytes);

  if (total_hits && total_bytes)
  {
    printf(",\nof which %llu (%d%%) hits were from MIT (net 18) hosts.</P>\n\n",
	   mit_hits,(int)((100*mit_hits)/total_hits));

    printf("<P><table border>\n"
	   "<tr><th colspan=5>Hourly Transmission Statistics</th></tr>\n");
    printf("<tr><th>Time </th><th>Hits </th><th>Bytes </th>"
	   "<th>%%Hits </th><th>%%Bytes</th></tr>\n");
    for (i = 0; i < 24; i++)
    {
      printf("<tr><td>%d </td><td>%llu </td><td>%llu </td>"
	     "<td>%.1f </td><td>%.1f </td></tr>\n",
	     i, hits[i], bytes[i], hits[i]*100/(double)total_hits,
	     100*((double)bytes[i]/(double)total_bytes));
    }
    printf("</table></P>\n\n");

    j = table_select(&url_table, 100, indexes, scratch);
    printf("<P><table border>\n"
	   "<tr><th colspan=3>Top Documents</th></tr>\n");
    printf("<tr><th>Hits </th><th>Bytes </th><th>Document </th></tr>\n");
    for (i = 0; i < j; i++)
    {
      dent = (double_table_ent*)indexes[i];
      if (dent->name[0] == '/')
	printf("<tr><td>%llu </td><td>%llu </td>"
	       "<td><a href=http://www.mit.edu%s>%s</a></td></tr>\n",
               dent->val1, dent->val2, dent->name, dent->name);
      else
	printf("<tr><td>%llu </td><td>%llu </td><td>%s</td></tr>\n",
               dent->val1, dent->val2, dent->name);
    }
    printf("</table></P>\n\n");

    j = table_select(&host_table, 20, indexes, scratch);
    printf("<P><table border>\n"
	   "<tr><th colspan=2>Top Hosts</th></tr>\n");
    printf("<tr><th>Hits </th><th>Host</th></tr>\n");
    for (i = 0; i < j; i++)
    {
      lent = (long_table_ent*)indexes[i];
      hostp = (unsigned char*)&lent->name;
      sprintf(buf, "%d.%d.%d.%d", hostp[0], hostp[1], hostp[2], hostp[3]);
      if (hostp[0])
	printf("<tr><td>%llu </td>"
	       "<td><a href=http://www.mit.edu/machine?%s>%s</a></td></tr>\n",
	       lent->val, buf, buf);
      else
	printf("<tr><td>%llu </td><td>%s</td></tr>\n",
	       lent->val, host_names[(int)hostp[3]]);
    }
    printf("</table></P>\n\n");

    j = table_select(&agent_table, 20, indexes, scratch);
    printf("<P><table border>\n"
	   "<tr><th colspan=3>Top Browsers</th></tr>\n");
    printf("<tr><th>%%Hits </th><th>Hits </th><th>Browser</th></tr>\n");
    for (i = 0; i < j; i++)
    {
      sent = (single_table_ent*)indexes[i];
      printf("<tr><td>%.1f </td><td>%llu </td><td>%s</td></tr>\n",
	     sent->val*100/(double)total_hits, sent->val, sent->name);
    }
    printf("</table></P>\n\n");

    j = table_select_min(&agent_table, 20, indexes, scratch);
    printf("<P><table border>\n"
	   "<tr><th>Bottom Browsers</th></tr>\n");
    for (i = 0; i < agent_table.size; i++)
    {
      for (sent = (single_table_ent*)agent_table.data[i];
           sent; sent = sent->next)
        if (sent->val == 1)
          printf("<tr><td>%s</td></tr>\n", sent->name);
    }
    printf("</table></P>\n\n");
  }
  else
    printf("</P>\n");

  fprintf(stderr, "\n%llu hits, for a total of\n%llu bytes",
	  total_hits, total_bytes);
  if (total_hits)
  {
    fprintf(stderr,
	    ", of which\n%llu (%d%%) hits were from MIT (net 18) hosts.",
	    mit_hits, (int)((100*mit_hits)/total_hits));
  }
  fprintf(stderr, "\n\n%lu truncated lines\n%lu bad lines\n\n",
	  truncated_lines, bad_lines);
  table_info(&host_table);
  table_info(&url_table);
  table_info(&agent_table);
  fprintf(stderr, "%lu bytes allocated by mymalloc\n\n",
	  malloc_allocated_bytes);

  now = time(0);
  fprintf(stderr, " -- ending at %s", ctime(&now));
  return 0;
}
