/* Run Idealized Reno over traceroute.
 * Interim copyright notice:
 *
 * (c) Pittsburgh Supercomputing Center, Matthew B Mathis, 1996.
 * This software is under active development and may contain bugs.
 * PLEASE DO NOT REDISTRIBUTE.  Current copies can be obtained
 * from ftp://ftp.psc.edu/pub/net_tools/treno.shar.
 *
 * This software is being developed in connection with "IP Performance Metrics"
 * For more information see: http://www.psc.edu/~mathis/ippm
 *
 * The final version will be published under an unrestricted copyright.
 * 
 * Comments, suggestions and patches welcome.
 *   - Matt Mathis <mathis@psc.edu>
 *
 * Contributors:
 *	Van Jacobson and all authors of Reno TCP
 *	Van Jacobson and all authors of traceroute
 *	Jamshid Mahdavi - MTU discovery for traceroute
 *	Sean Doran - updated signals, bug fixes
 */

#include <stdio.h>
#include <ctype.h>
#include <errno.h>
#include <netdb.h>
#include <signal.h>
#include <stdlib.h>

#if HAVE_FCNTL_H
# include <fcntl.h>
#endif
#if HAVE_MEMORY_H
# include <memory.h>
#endif
#if HAVE_STRING_H 
# include <string.h>
#endif
#if HAVE_UNISTD_H
# include <unistd.h>
#endif

#include <sys/types.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/stat.h>

#if HAVE_SYS_FILE_H
# include <sys/file.h>
#endif
#if TIME_WITH_SYS_TIME
# include <sys/time.h>
# include <time.h>
#else
# if HAVE_SYS_TIME_H
#  include <sys/time.h>
# else
#  include <time.h>
# endif
#endif

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>
#include <netinet/udp.h>
#include <netinet/ip_var.h>

#include "mplib.h"
#define EVFLUSH
#include "eventlog.h"

/* Fixup OS specific stuff */
#ifdef sgi
#undef IP_HDRINCL
#define NOVMC
#endif

/* Prototypes */
int drop_mtu(int os);
void netprintip(u_char **pkt, int *len, int fdata); /* netprintf.c */


/*****************************************************************
 * Parser and control framework
 */

/* command line options */
int a_diag=0;
int a_research=0;
int a_duration=10;
int a_udp=1;
int a_ttl=0;
#define DEF_TTL 32	/* XXX */
int a_autottl=0;
int a_bbytes= 0;	/* default after parsing */
#define BBYTES 8166	/* FDDI MTU */
int a_dack=1;		/* delayed ack? */
int a_thrust=0;
int a_verbose = 0;
#define DEF_VERB 4
/* Verbosity:
 0 = show BW
 2 = show XT
 4 = loss, other instruments
 6 = detail on unexpected events
 8 = show packets
*/
int a_debug = 0;
int a_setfrom = 0;
int a_tos=0;	/* not supported */
struct sockaddr_in a_ToHost;	/* the remote end */
struct sockaddr_in a_FromHost;	/* my selected interface */

extern int errno;
extern int numeric;			/* don't format names */
RETSIGTYPE ring();			/* signal handler */
int runf = 1;				/* signal happened */
long starttick, tick=0;			/* current second */
struct timeval now, ptime;
u_short ident;
u_short port = 32768+666;               /* UDP (raw) port # */
u_int ttl;				/* actual ttl in use */
int on = 1;
int npdepth = 0;

#define VERSIONN 961001


RETSIGTYPE ring(signo)
{
  MP_RESIGNAL(SIGALRM,ring);
  alarm(2);
}

/*****************************************************************
 * TCP state variables.  Variable which mimic true TCP state
 * variables are kept in a tcpcb to facilitate code portability.
 * Since we support multiple CC algorithms, some of the variables are redundant
 */

typedef u_short tcp_seq;

struct tcpcb {
  short t_dupacks;		/* consecutive dup acks recd */
  u_short t_flags;
  tcp_seq snd_nxt;              /* send next */
  tcp_seq snd_fack;             /* Forward (right) most ACK */
  tcp_seq snd_recover;          /* One RTT beyond last good data */
  long snd_retran_data;         /* Signed to catch bugs */
  u_long snd_cwnd;    	        /* congestion-controlled window */
  u_long snd_ssthresh;  	/* slow start threshold */
  u_long snd_lothresh;          /* recovery window floor */
  u_long snd_hithresh;          /* maximum window following recovery */
} pcb;

/* selected t_flags values */
#define TF_SACK_PERMIT  0x0200          /* other side said I could SACK */
#define TF_RECOVERY     0x0400          /* We are recovering from a lost segment
 */
#define TF_REPAIRED     0x0800          /* We have retransmitted something */
#define TF_WHOLD        0x1000          /* we are in the window hold state */
#define TF_TOGGLE       0x2000          /* divide by 2 toggle */

/* TCP local and global variable, mimicing true TCP variables */
int sack_present;
int tcprexmtthresh = 3;

/* TCP variables used by alternate CC implementations */
int repaired_f, t_rcounter, tcprexmtthreshN, hithresh, lothresh, toggle_f, hold_f;
u_int maxseg;				/* TCP mss (slightly less than mtu) */

/* Reassembly queue emulation variables.  These have no natural TCP parallels. */
tcp_seq rseq;			/* current receive sequence */
tcp_seq erseq;			/* expected (in order) receive sequence */
tcp_seq rrseq;			/* fully recovered sequence */
tcp_seq flseq;			/* first loss sequence */
tcp_seq daseq;			/* ACK discarded to emulate delayed-ACKs */
int missing;			/* total missing segments in this loss interval */

/*****************************************************************
 * input buffer and goop
 */
int is;				/* icmp (listen) socket */
char *inbuf;
char *buf;

/*****************************************************************
 *  statistics, calibration and error instruments
 */

/* These MUST agree with statinfo below */
struct stats {
  int s_tin, s_tout;
  int s_ein, s_eout, s_ecnt, s_eloss, s_etime;
  int s_nein, s_neout, s_necnt, s_netime;
  int s_ssmaxwin, s_camaxwin;
  int s_repaired, s_rttcnt;

  int c_unexpected, c_okooo;

  int c_newhole, c_moreholes;
  int c_missing, c_retrans;
  int c_recovered;
  int c_late;
  int c_bytesin;
  int c_rcvbuf, c_sndbuf;
  int c_delayedack;

  int e_fromchanged;
  int e_dupdata, e_wayooo, e_hithresh;
  int e_invalseq1, e_invalseq2;
  int e_nobuff;
  int e_delayACKbotch;
} S;

/* Misc stats */
struct sockaddr_in from, goodfrom, lastfrom;
int havegoodfrom;
tcp_seq rttseq;
u_int minrtt;
struct timeval rttstamp;

/* statinfo supports easy formatting, etc.  Update "stats" above to agree */
#define PrS 1
#define PrC 2
#define PrE 3
#define SI(f, v, n) {f, &(v), n}
struct statinfo {
  int	ctrl;	/* printing/processing control */
  void *v;	/* pointer to the value */
  char *name;	/* The description for printing */
} si[] = {

  /* s_ are statistics about the measurement */
SI(PrS, S.s_tin, "total packets in"),
SI(PrS, S.s_tout, "total packets out"),
SI(PrS, S.s_ein, "equilibrium packets in"),
SI(PrS, S.s_eout, "equilibrium packets out"),
SI(PrS, S.s_ecnt, "equilibrium congestion avoidance cycles"),
SI(PrS, S.s_eloss, "equilibrium packets lost"),
SI(PrS, S.s_etime, "equilibrium us"),
SI(PrS, S.s_nein, "non-equilibrium packets in"),
SI(PrS, S.s_neout, "non-equilibrium packets out"),
SI(PrS, S.s_necnt, "non-equilibrium events"),
SI(PrS, S.s_netime, "non-equilibrium us"),
SI(PrS, S.s_ssmaxwin, "bytes, maximum window during slow-start"),
SI(PrS, S.s_camaxwin, "bytes, maximum window during congestion avoidance"),
SI(PrS, S.s_repaired, "first retransmissions"),
SI(PrS, S.s_rttcnt, "RTT measurements"),

  /* These should all be zero or small.  Large values reflect problems */
SI(PrC, S.c_unexpected, "other ICMP messages received"),
SI(PrC, S.c_okooo, "minor out-of-order data arrived (before tcprexmtthresh)"),

  /* c_ are calibration checks.   They all have constrained values */
SI(PrC, S.c_newhole, "loss epochs (first hole)"),
SI(PrC, S.c_moreholes, "additional holes"),
SI(PrC, S.c_missing, "total missing segments"),
SI(PrC, S.c_retrans, "total virtual retransmissions"),
SI(PrC, S.c_recovered, "normal full recoveries"),
SI(PrC, S.c_late, "segments received late"),
SI(PrC, S.c_bytesin, "(ACK) bytes received"),
SI(PrC, S.c_rcvbuf, "bytes receive socket buffer space"),
SI(PrC, S.c_sndbuf, "bytes send socket buffer space"),
SI(PrC, S.c_delayedack, "delayed ACKs emulated"),

  /* e_ reflect exception events and should be zero */
SI(PrE, S.e_fromchanged, "received replys from other places"),
SI(PrE, S.e_dupdata, "received duplicate segment"),
SI(PrE, S.e_wayooo, "spurious retransmissions triggered by of order data"),
SI(PrE, S.e_hithresh, "bounding parameters hithresh test was true"),
SI(PrE, S.e_invalseq1, "received sequence > snd_next"),
SI(PrE, S.e_invalseq2, "received sequence out-of-order, but not in recovery"),
SI(PrE, S.e_nobuff, "send() got ENOBUF"),
SI(PrE, S.e_delayACKbotch, "timeout was due to faulty delayed ACK")
};
#define NSI (sizeof(si)/sizeof(struct statinfo))

/* equilibrium instrumentations */
int pktin, pktout, pktloss, equilib, priorequl;
struct timeval eqstamp;

void ckequilib(int f)
{
  if (equilib && f) {	/* Counts as equilibrium if before and after are "normal" */
    S.s_tin += pktin;
    S.s_ein += pktin;
    S.s_tout += pktout;
    S.s_eout += pktout;
    S.s_ecnt++;
    S.s_etime += USEC(&eqstamp, &ptime);
    S.s_eloss += pktloss;
  } else {
    S.s_tin += pktin;
    S.s_nein += pktin;
    S.s_tout += pktout;
    S.s_neout += pktout;
    S.s_necnt++;
    S.s_netime += USEC(&eqstamp, &ptime);
  }
  priorequl=equilib;
  equilib=f;
  eqstamp = ptime;
  pktloss = pktin = pktout = 0;
}

char lbanner[] = "\
 hop       packets   min rtt BW\n\
(ttl)     in    out   (mS)  k bps   Address\n";
char longfmt[] = "\n\
= %2d %6d %6d %6.2f = %6g %d.%d.%d.%d %s\n";

char sbanner[] = "\
 hop     BW\n\
(ttl)   k bps   Address\n";
char sfmt[] = "\n\
= %2d = %6g %d.%d.%d.%d %s\n";

char *bwfmt="%s rate: %6g kbp/s (%d pkts in + %d lost = %3.2g%%) in %4.4g s\n";

/*****************************************************************
 * Diagnostic mode stats (from the BTC doc):
          - Statistics over the entire test
            (data transferred, duration and average rate)
          - Statistics from the equilibrium portion of the test
            (data transferred, duration, average rate, and number
            of equilibrium congestion control cycles)
          - Path property statistics (MTU, Min RTT, max cwnd in
            equilibrium and max cwnd during Slow-start)
          - Statistics from the non-equilibrium portion of the
            test (nature and number of non-equilibrium events).
          - Estimated load/BW/buffering used on the return path.
          - Warnings about data transmission abnormalities.
            (e.g packets out-of-order)
          - Warnings about conditions which may effect metric
            accuracy. (e.g insufficient tester buffering)
          - Alarms about serious data transmission abnormalities.
            (e.g. data duplicated in the network)
          - Alarms about tester internal inconsistencies and events
            which might invalidate the results.
          - IP address/name of the responding target.
          - TReno version.
*/

void clearstats()
{
  ckequilib(0);	/* residuals get cleared below */
  memset( (char *)&S, 0, sizeof(S)); 
  memset( (char *)&goodfrom, 0, sizeof(struct sockaddr_in));
  havegoodfrom = 0;
  minrtt = (u_int) -1;
}

void showstats()
{
  float rtt, tbw, ebw, tpct, epct;
  int i;
  char *name="";
  unsigned char *fp = (unsigned char *) &goodfrom.sin_addr;
  struct hostent *hp;
  
  Event2('A', ttl, goodfrom.sin_addr.s_addr);
  if (!numeric) {
    hp = gethostbyaddr((char *) &goodfrom.sin_addr, 4, AF_INET);
    if (hp) name = hp->h_name;
  }
  
  rtt = (minrtt == (u_int) -1) ? -1.0 : minrtt/1000.0;
  tbw = ((float) S.s_tin)*maxseg*8000 / (S.s_etime+S.s_netime);
  ebw = S.s_etime? ((float) S.s_ein)*maxseg*8000 / S.s_etime: 0.0;
  tpct = S.s_tout? ((S.s_tout-S.s_tin)*100.0)/S.s_tout: 0.0;
  epct = S.s_eout? ((S.s_eloss)*100.0)/S.s_eout: 0.0;

  /* Choose the format - Metric mode */
  if (!a_autottl) {
    printf("\n");
    printf("Replies were from %s [%d.%d.%d.%d]\n", name, fp[0], fp[1], fp[2], fp[3]);
    printf(bwfmt, "    Average", tbw, S.s_tin, S.s_tout-S.s_tin, tpct, (S.s_etime+S.s_netime)/1000000.0);
    printf(bwfmt, "Equilibrium", ebw, S.s_ein, S.s_eloss, epct, S.s_etime/1000000.0);
    printf("Path properties: min RTT was %6.2f ms, path MTU was %d bytes\n", rtt, maxseg);
    if (a_diag) printf("Diagnostic mode\n");	/* XXX show switches */

if (!a_verbose)
printf("XXX Calibration checks are still under construction, use -v\n");

    for (i=0; i < NSI; i++) {
      int val = *(int *)si[i].v;
      char *pfmt = 0;

      switch (si[i].ctrl) {
      case PrS: if (a_verbose >= 1) pfmt="There were %d %s\n"; break;
      case PrC:	if (a_verbose >= 1) pfmt="There were %d %s (confirm)\n"; break;
      case PrE: if (val) {pfmt="Alarm: there were %d %s\n"; break;}
		else {if (a_verbose >= 1) pfmt="There were %d %s (no alarm)\n"; break;};
      }
      if (pfmt) printf(pfmt, val, si[i].name);
    }
    return;
  } else { /* Hop-by-hop mode is terse */
    if (a_verbose >= 4) {
      printf(longfmt, ttl,
	     S.s_tin, S.s_tout, rtt,
	     tbw, fp[0], fp[1], fp[2], fp[3], name); }
    else
      printf(sfmt, ttl,
	     tbw, fp[0], fp[1], fp[2], fp[3], name);
  }
}

/*****************************************************************
 * Output buffers and variables
 */
#define MINBUF 1500	/* For unexpected messages */
/* output buffer and goop */
struct opacket {		/* a (udp) probe packet. */
	struct ip ip;
	struct udphdr udp;
};
#define UDP_MINLEN (sizeof(struct opacket)-SIZEOFstructip)
int sndsock;				/* send socket */
u_int mtu;				/* Path MTU */
u_int optlen;				/* option length on the probes */
u_int tcpiplen = (20+20+12);		/* Presumed TCP/IP header w/ options */
u_int probelen;				/* probe packet length */
u_int buflen;				/* largest possible packet */
u_short checksum();
char *outbuf = 0;
struct opacket *op;
struct ip *ip;
struct udphdr *up;
struct icmp *ic;
u_short cksm;	/* precomputed */
#ifdef NOVMC
/*
 * Some systems use virtual memory page flips to save copies.  Unfortunately
 * they are often broken in the sense that VM copy on write is far slower than 
 * it would have been to do conventional mbuf copy in the first place.
 * Work around by not re-using buffers until we think the device driver
 * is done with them.
 */
char **bufflist, **bfl;
#endif /* NOVMC */

/*****************************************************************
 * Pre/reload send buffers with precomputed packets
 */

void setbuffers(sz, tl)
int sz, tl;
{

  /* refine the ttl */
  if (tl) ttl = tl;

  /* refine the size */
  if (sz && (sz != mtu)) {
    if (mtu && (sz > mtu)) {
      printf("Invalid resize: %d\n", sz);
      exit(1);
    }
    mtu = sz;
    maxseg = mtu-tcpiplen;
/*  if (maxseg > 2048) maxseg &= ~1023;     XXX no rounding */
    printf(" MTU=%d ", mtu);
    Event('M', maxseg);		/* set/adjust MTU and mss */
  }

  /* allocate input and output buffers, if needed */
  if (!outbuf) {
#define slop 100
    buflen = maxseg+tcpiplen+optlen+slop;	 /* largest possible message */
    if (buflen < MINBUF) buflen = MINBUF+slop;
    inbuf = (char *)malloc(buflen);
    outbuf = (char *)malloc(buflen);
    memset(outbuf, 0, buflen);
    op = (struct opacket *)outbuf;
    ip = &op->ip;
    up = &op->udp;
    ic = (struct icmp *)op;
    if (a_udp) { 
      ip->ip_hl = SIZEOFstructip >> 2;
      ip->ip_v = IPVERSION;
      ip->ip_tos = a_tos;
      /*  ip->ip_len = ?? see below */
      ip->ip_off = (short) IP_DF;			/* always DF */
      ip->ip_ttl = ttl;
      ip->ip_p = IPPROTO_UDP;
#ifndef CRAY
      if (a_setfrom) ip->ip_src = a_FromHost.sin_addr;
      ip->ip_dst = a_ToHost.sin_addr;
      up->uh_sport = htons(ident);
#else
      if (a_setfrom) ip->ip_src = a_FromHost.sin_addr.s_addr;
      ip->ip_dst = a_ToHost.sin_addr.s_addr;
      ((struct udpiphdr *)ip)->ui_sport = htons(ident);
#endif
      /* up->uh_ulen = ?? see below */
      up->uh_sum = 0;
    } else {
      /* set up ICMP packet */
      ic->icmp_type = ICMP_ECHO;
      ic->icmp_seq = 1; /* Must match snd_nxt initialization elsewhere */
      ic->icmp_id = ident;
      ic->icmp_cksum = 0;
      /* cksm = checksum(outbuf, probelen); see below */
    }
#ifdef NOVMC
#define MAXTRAN 100
    { int i;
      bufflist = (char **) malloc((MAXTRAN+2)*sizeof(char *));
      bufflist[0] = outbuf;
      for (i = 1; i <= MAXTRAN; i++) {
	bufflist[i] = (char *)malloc(buflen);
	memcpy(bufflist[i], outbuf, buflen);
      }
      bufflist[MAXTRAN+1] = 0;
      bfl = bufflist;
    }
#endif /* NOVMC */
  }

  /* "reload buffers" and put in the final touches */
  if (a_udp) {
    probelen = maxseg+tcpiplen-optlen;   /* probe size  w/o options */
#ifdef NOVMC
    for (bfl = bufflist; *bfl; ) {
      outbuf = *bfl++;
      op = (struct opacket *)outbuf;
      ip = &op->ip;
      up = &op->udp;
      ic = (struct icmp *)op;
#endif /* NOVMC */
      ip->ip_len = probelen;
      ip->ip_ttl = ttl;
      up->uh_ulen = htons((u_short)maxseg+tcpiplen-SIZEOFstructip);
/*sizeof(struct opacket));*/
#ifdef NOVMC
    }
#endif /* NOVMC */
  } else {
    probelen = maxseg+tcpiplen-optlen-SIZEOFstructip;
    ic = (struct icmp *)outbuf;
    ic->icmp_cksum = 0;
    cksm = checksum(outbuf, probelen);
  }
}

/*****************************************************************
 * send a probe packet at the specified sequence
 */

int sendsomething(struct tcpcb *tp)
{
  register tcp_seq seq;
  int erf;

  pktout++;
again: ;	/* XXX seq advances if the interface refuses the packet */
#ifdef NOVMC
  if (!*bfl) bfl = bufflist;
  outbuf = *bfl++;
  op = (struct opacket *)outbuf;
  ip = &op->ip;
  up = &op->udp;
  ic = (struct icmp *)op;
#endif /* NOVMC */
  seq = tp->snd_nxt = SHORT((tp->snd_nxt) + 1);
  Event('O', seq);	/* packet out */
  if (a_verbose >= 8){
    int bcnt = mtu;
    char *outb = outbuf;
    printf("Sending:\n");
    netprintip((u_char**)&outb, &bcnt, 2);
  }
  if (a_udp) {
#ifndef CRAY
    up->uh_dport = htons(SHORT(port+seq));
#else
    ((struct udpiphdr *)ip)->ui_dport = htons(SHORT(port+seq));
#endif
    erf = sendto(sndsock, outbuf, probelen, 0,
		 (struct sockaddr *) &a_ToHost, sizeof(a_ToHost));
  } else {
    ic->icmp_seq = seq;
    if (seq && !cksm--) cksm--; /* ones comp magic */
    ic->icmp_cksum = cksm;
    erf = sendto(sndsock, outbuf, probelen, 0,
		 (struct sockaddr *) &a_ToHost, sizeof(a_ToHost));
  }
  if (erf != probelen) {


    if (errno == ENOBUFS) {
      S.e_nobuff++;
      Event('x', 0);			/* sendto ENOBUFS */
      return 0;
    } else if (errno == EMSGSIZE) {
      setbuffers(drop_mtu(mtu), 0);
      goto again;
    } else NOERROR(erf, "sendto");
  }
  /* Check to see it this was a virtual retransmission */
  if (sack_present && repaired_f) {
    if (missing) {
      missing--;
      if (SEQ_GT(tp->snd_nxt, rrseq)) rrseq = tp->snd_nxt;
      S.c_retrans++;
    }
  }

  /* time a segment if not already doing so */
  if (SEQ_LEQ(rttseq, tp->snd_fack)) {
    rttstamp = now;
    rttseq = tp->snd_nxt;
  }
  return 1;
}

/*****************************************************************
 *  Search for the next smaller valid mtu
 */
int drop_mtu(int os)
{
  static int mtulist[] =	/* from RFC1191 */
    {32000, 17914, 8166, 4352, 2002, 1492, 1006, 508, 296, 68, 0};
  int i=0, ns;

  while ((os <= (ns = mtulist[i++])) && ns) ;
  return (ns);
}


/*****************************************************************
 * Receive a segment and get it's sequence number
 * return 1 if success, 0 if timeout
 *	Instrument and discard unexpected ("u") segments
 */
int recvseq(struct tcpcb *tp)
     /* tp is not used */
{
  int fromlen, bcnt;
  int seq; /* XXX potentially uninitialized */
  int type, code, rident;
  struct udphdr32o *up;

 again:
  fromlen = sizeof(from);
  bcnt = recvfrom(is, inbuf, buflen, 0,
		  (struct sockaddr *) &from, &fromlen);
  gettimeofday(&now, 0);
  EventTime('T', &now);

  if (bcnt<0) {
    if (errno != EINTR) perror("recvfrom");
    return 0;
  }
  S.c_bytesin += bcnt;
  type = code = rident = -1;
  buf = inbuf;
  crackip(inbuf);
  up = STRUCTALIGN(struct udphdr32o *, cipclient, 0);
  if (cicmp) {
    type = cicmp->icmp_type;
    code = cicmp->icmp_code;
  };
  if (a_udp && cip2 && up &&
      (type == ICMP_TIMXCEED || type == ICMP_UNREACH) &&
      (cip2->ip_p == IPPROTO_UDP) ){
    seq = SHORT(ntohs(up->uh_dport) - port);
    rident = ntohs(up->uh_sport);
    if ((type == ICMP_UNREACH) && (rident == ident)) {
      if (code == ICMP_UNREACH_NEEDFRAG) {
	int m = ntohl(cicmp->icmp_void) & 0xFFFF;
	if (!m) {
	  m = drop_mtu(mtu);	/* XXX too small if rerouted */
	  goto again;
	}
	if (m != mtu) setbuffers(m, 0);
      } else
	runf = 0;
    }
  } else {
    if (!a_udp &&
	(a_ToHost.sin_addr.s_addr == from.sin_addr.s_addr) &&
	(type == ICMP_ECHOREPLY) ) {
      seq = SHORT(cicmp->icmp_seq);
      rident = cicmp->icmp_id;
    }
  }
  if (!cicmp || (rident != ident)) {
    S.c_unexpected++;
    Event('u', rident);
    if (a_debug || (a_verbose >= 6)) {
      printf ("Unexpected message:\n");
      printf("Ident?, %x != %x\n", rident, ident);
      netprintip((u_char**)&buf, &bcnt, npdepth);
    }
    else {
      if (a_verbose >= 4) putchar('u');
    }
    goto again;
  } else {
    if (havegoodfrom) {
      if (
#ifdef MEMCMP_8BIT_CLEAN
	  memcmp(&from, &goodfrom, fromlen)
#else
	  bcmp(&from, &goodfrom, fromlen)
#endif
	  ) {
	S.e_fromchanged++;
	lastfrom = from;
      }
    } else
      goodfrom = from;
    if (a_verbose >= 8) {
      printf ("Received message:\n");
      netprintip((u_char**)&buf, &bcnt, npdepth);
    }
  }
  ptime = now;
  rseq = seq;	/* store the returned sequence number */
  Event('I', rseq);			/* packet input */
  return 1;
}


/*****************************************************************
 *
 *  Emulate  the receiver's reassembly queue
 *	and the sender's scoreboard.
 *	
 *	Returns 0 if this ack should be ignored for some reason.
 */
int parseack(struct tcpcb *tp)
{
  int ret=1;
  if (SEQ_GT(rseq, tp->snd_nxt)){
    Event('s', rseq);		/* way out of sequence */
    if (a_debug) printf ("%d:%d:%d", tp->snd_fack, rseq, tp->snd_nxt);
    S.e_invalseq1++;
    return(0);
  }
  pktin++;
  
  if (sack_present) {			/* already out of order */
    int loss = SHORT(rseq - flseq) - ++tp->t_dupacks;
    if (rseq == erseq) {		/* expected (next) segment */
      if (SEQ_GEQ(rseq, rrseq)) {	/* normal full recovery or */
	tp->t_dupacks = sack_present = 0;
	if (a_verbose >= 6) printf("%d ",loss);
	S.c_recovered++;
      } 
    } else if (!loss) {	/* no net loss - must have been out-of-order */
	if (!repaired_f) {
	  S.c_okooo++;
	  tp->t_dupacks = sack_present = 0;
	} else S.e_wayooo++;
	if (a_verbose >= 6) printf("ooo");
    } else if (SEQ_LT(rseq, erseq)) {	      	/* miss-ordered ACK... */
      Event('s', rseq);				/* out of sequence */
      if (rseq == tp->snd_fack) {S.e_dupdata++;}
        else {S.c_late++;}	/* should be == S.c_okooo+S.e_wayooo */
      return(0);
    } else {					/* a new hole? */
      rrseq = tp->snd_nxt; 
      missing += SHORT(rseq - erseq);
      S.c_missing += SHORT(rseq - erseq);
      pktloss += SHORT(rseq - erseq);
      S.c_moreholes++;
    }
    if (!repaired_f && (tp->t_dupacks > tcprexmtthresh)) {
      repaired_f++;
      tp->t_flags |= TF_REPAIRED;
      S.s_repaired++;
      ckequilib(1);
    }
  } else {				/* no sack_present */
    if (rseq == erseq) {		/* and the expected segment */
      /* emulate the receivers delayed ACK behavior XXX */
      if (a_dack && (tp->snd_cwnd > (2*maxseg)) && (rseq & 0x1)) {
	daseq = rseq;
	Event('Z', rseq);		/* Delayed ACK emulation */
	S.c_delayedack++;
	ret=0;
      }
    } else if (SEQ_LT(rseq, erseq)) {	      /* miss-ordered ACK... */
      Event('s', rseq);				/* out of sequence */
      if (rseq == tp->snd_fack) {S.e_dupdata++;}
        else {S.e_invalseq2++;}
      return(0);
    } else {
      sack_present = tp->t_dupacks = 1;		/* a new hole */
      flseq = erseq;
      rrseq = tp->snd_nxt;
      missing = SHORT(rseq - erseq);
      pktloss += SHORT(rseq - erseq);
      S.c_missing += SHORT(rseq - erseq);
      S.c_newhole++;
      if (a_verbose >= 2) putchar('X');
    }
  }
  tp->snd_fack = rseq;
  erseq = SHORT(tp->snd_fack + 1);	     /* expected for next time */
  /* do the rtt estimator, XXX use a ring buffer */
  if (SEQ_GEQ(tp->snd_fack, rttseq)) {
    u_int rtt = USEC(&rttstamp, &now);
    if (rtt < minrtt) minrtt = rtt;
    S.s_rttcnt++;
  }
  return(ret);
}

/*****************************************************************
 * The various Congestion Control functions
 */
void stdInit(struct tcpcb *tp)
{
  /* TCP's initial state and first packets */
  memset(tp, 0, sizeof(struct tcpcb));
  tp->snd_nxt = SHORT(1);
  rrseq = tp->snd_fack = tp->snd_nxt;
  sendsomething(tp);		/* If mtu discovery, this sets maxseg */
  rttseq = erseq = tp->snd_nxt;
  rttstamp = now;
  sendsomething(tp);		/* cwnd = 2 packets */
  t_rcounter = sack_present = 0;
  tp->snd_cwnd = 2 * maxseg;
  tp->snd_ssthresh = ((u_long) -1) >> 1;
}

void stdTimo(struct tcpcb *tp)	/* used by most */
{
  u_long win=tp->snd_cwnd / 2 / maxseg;

  if (a_verbose >= 2) putchar('T');
  if (win < 2) win = 2;
  tp->snd_ssthresh = win * maxseg;		/* update ssthresh */
  tp->snd_cwnd = 2*maxseg;	      /* XXX pull cwnd down to one! */
  sack_present = 0;
  Event('t', tp->snd_ssthresh);		/* timeout (show ssthresh) */
  sendsomething(tp);
  erseq = tp->snd_nxt;
}

void stdCC(struct tcpcb *tp)
{
  /* "sack_present" has been previously set */

  int win, awnd;
  
  if ( !t_rcounter && sack_present) {			/* new recovery */
      u_long win = (tp->snd_cwnd + maxseg) / 2 / maxseg;
      if (win < 2) win = 2;
      hithresh = win * maxseg;
      win /= 2;
      if (win < 2) win = 2;
      tp->snd_ssthresh = lothresh = win * maxseg;
      t_rcounter++;
      tp->snd_recover = tp->snd_nxt;
      repaired_f = hold_f = toggle_f = 0;
      Event('R', tp->snd_ssthresh);
  }

  /* Do the Window adjustment */
  awnd = SHORT(tp->snd_nxt-tp->snd_fack) * maxseg;	/* compute the actual window */
  if (t_rcounter) {
    tp->snd_cwnd = awnd;
    if (hold_f) {
      tp->snd_cwnd += maxseg;
      Event('r', tp->snd_cwnd);			/* recovery */
    } else if (SEQ_GEQ(tp->snd_fack, tp->snd_recover) || (awnd <= lothresh)) {
      tp->snd_cwnd += maxseg;
      hold_f = 1;
      Event('r', tp->snd_cwnd);			/* recovery */
    } else if (toggle_f ^= 1) {
      tp->snd_cwnd += maxseg;
      Event('r', tp->snd_cwnd);			/* recovery */
    } else
      Event('d', tp->snd_cwnd);			/* dropped dup ACK */

    /* Check for end of recovery */
    if (!sack_present && (!repaired_f || SEQ_GT(tp->snd_fack, tp->snd_recover))) {
      if (repaired_f && (tp->snd_cwnd > hithresh)) {	/* Window safety check */
	Event('B', hithresh - tp->snd_cwnd);		/* should never happen */
	tp->snd_cwnd = hithresh;
	S.e_hithresh++;
      }
      t_rcounter = repaired_f = 0;
    }
 
  } else {
    if (tp->snd_cwnd < tp->snd_ssthresh) {
      tp->snd_cwnd += maxseg;
      if (tp->snd_cwnd > S.s_ssmaxwin) S.s_ssmaxwin = tp->snd_cwnd;
      Event('e', tp->snd_cwnd);		/* exponential open (aka slow start) */
    } else {
      tp->snd_cwnd += maxseg * maxseg / tp->snd_cwnd;
      if (tp->snd_cwnd > S.s_camaxwin) S.s_camaxwin = tp->snd_cwnd;
      Event('a', tp->snd_cwnd);		/* congestion avoidance  */
    }
  }

  /* send out to cwnd beyond fack */
  win = tp->snd_cwnd - awnd;
  while (win >= maxseg) {
    sendsomething(tp);
    awnd = SHORT(tp->snd_nxt-tp->snd_fack) * maxseg; 
    win = tp->snd_cwnd - awnd;
  }
}

/*****************************************************************
 * Conditional support for alternate congestion control algorithms
 */

typedef void (*ccontrol_t)(struct tcpcb *);

#ifdef ALTCC
#include altcc.c
#endif

ccontrol_t initR = stdInit, timoR = stdTimo, ccR = stdCC;
char *TCPver = "(future) standard CC";


/*****************************************************************
 * setup(argc, argv)
 *	- parse command line switches
 *	- Open and configure the sockets
 */
#define PARSEUINT(v)	{if ((argc <= 0) || (((v) = atoi(*av++)) <= 0) ){goto punt;}; argc--;}
#define PARSEINT(v)	{if (argc <= 0){goto punt;}; (v) = atoi(*av++); argc--;}
#define caseD(c) case (0x100+c)
#define caseR(c) case (0x200+c)
void setup(int argc, char *argv[])
{
  char *sv, **av = argv,  *swp, *erf="Invalid argument to -%s%c\n\n";
  int rlen=0, swb;				
  char usage[] = "\
Usage (metric mode):  treno <options> <host>\n\
      -p <s>       Set the test duration\n\
      -b <mtu>     Specify the (initial) MTU\n\
      -i           Use ICMP Ping instead of traceroute\n\
      -n           Show addresses only\n\
      -v           More verbose output\n\
      -V           Display version information\n\
      -F <addr>    Select a source (from) interface\n\
      -D<switches> Enable diagnostic mode\n\
      <host>       Target host\n\
";
  char usageD[] = "\
Usage (diagnostic mode): treno <options> [<GW1>...<GW>] <host>\n\
      -v<digit>    Adjustable verbosity (-v is -v4)\n\
      -Dt <ttl>    Set the initial TTL\n\
      -Da <ttl>    Auto increment TTL through <ttl> (implys -Dt 1)\n\
      -DT          Use more thrust (raise process priority)\n\
      -Dd          Debug (both socket and internal)\n\
      -DE          Enable event logging (for detailed postmortem)\n\
      -DR<switches> Enable reseach mode\n\
      [<GW1>...<GW>] Gateways to be traversed\n\
      Switches can be combined: -vnDde\n\
";
  char usageR[] = "\
The following switches invoke non-standard TCP behaviors...\n\
      -DRc <ver>   Use the selected CC version: 0=std, look at the source\n\
      -DRr <cnt>   Specify tcprexmtthresh (<0 => first hole only)\n\
      -DRd         Turn off the receiver's delayed ACK emulation\n\
";

  /* no buffering on stdout */
#ifdef SETVBUF_REVERSED
  setvbuf(stdout, _IONBF, 0, 0);
#else
  setvbuf(stdout, 0, _IONBF, 0);
#endif
  numeric=0; 	/* Format names by default */

  /* general switch cracker */
  argc--, av++;
  while (argc > 0 && *av[0] == '-') {
    sv = *av++; argc--; swp=""; swb=0;
    while (*++sv) switch (swb+*sv) {

      /* metric mode switches */
    case  'p': PARSEUINT(a_duration); break;
    case  'b': PARSEINT(a_bbytes); break;	/* check for legal later */
    case  'i': a_udp=0; break;
    case  'n': numeric=1; break;
    case  'v':
    caseD('v'):
    caseR('v'):
      if (isdigit(sv[1])) 
	a_verbose = *++sv - '0';
      else
	a_verbose = DEF_VERB;
      break;	/* No -v is verbose = 0 */
    case  'V': printversion(); break;
    case  'F':
      if (argc <= 0) {goto punt;};
      a_setfrom++;
      setsin(&a_FromHost, *av++); argc--;
      break;

      /* Diagnostic mode switches */
    case  'D': a_diag++; swp="D"; swb = 0x100; break;
    caseD('t'): PARSEUINT(a_ttl); break;
    caseD('a'): PARSEUINT(a_autottl); break;
    caseD('T'): a_thrust=1; break;
    caseD('d'): a_debug++; break;
    caseD('E'):
      evfd = open("event.log", O_WRONLY|O_CREAT|O_TRUNC, 0666);	/* lazy */
      break;

      /* Research mode switches */
    caseD('R'): a_research++; swp="DR"; swb = 0x200; break;
    caseR('c'):
#ifdef ALTCC
      {int cv;
      PARSEUINT(cv);
      if (cv >= NCCV) goto punt;
      initR=ccv[cv].i;
      timoR=ccv[cv].t;
      ccR=ccv[cv].cc;
      break;}
#else
      printf("No alternate Congestion Control\n");
      goto punt;
#endif
    caseR('r'):
      {int trt;
      PARSEINT(trt);
      if (trt > 0) {
	tcprexmtthresh = tcprexmtthreshN = trt;
      } else {
	tcprexmtthresh = -trt;
	tcprexmtthreshN = 1;
      }
      break;}
    caseR('d'): a_dack=0; break;
    default:
      erf="Unknown switch: -%s%c\n\n";
    punt:
      printf(erf, swp, *sv);
      printf(usage);
      if (a_diag) printf(usageD);
      if (a_research) printf(usageR);
      exit(1);
    }
  }
  
  /* Parse and arrange the route */
  while (argc > 0) {
    setsin(&route[rlen++], *av++); argc--;
  }
  if (rlen < 1)  {
    printf(usage);
    printf("You must supply at least one host name\n");
    exit(1);
  }
  memcpy(&a_ToHost, &route[rlen-1], sizeof(struct sockaddr_in));  /* ICMP */
  /* XXX forbid LSRR unless diagnostic mode */

  /* misc configuration */
  Event('V', VERSIONN);	/* Event logger version */
  if (a_thrust) setthrust();
  ident = (getpid() & 0xffff) | 0x8000;
  npdepth = a_debug?-1:(a_udp?3:2);
  if (a_udp) {
    if (!a_ttl) a_ttl = a_autottl?1:DEF_TTL;
  } else {
    if (a_autottl || a_ttl) {
      printf("Switch -i conflicts with TTL switches\n");
      exit(1);
    }
    a_ttl = -1;
  }

  /* set up the icmp socket */
  NOERROR(is = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP), "icmp socket");
  if (a_debug) setsockopt(is, SOL_SOCKET, SO_DEBUG, (char *)&on, sizeof(on));

  /* set up the send socket - UDP if we are using it */
  if (a_udp) {
    NOERROR(sndsock = socket(AF_INET, SOCK_RAW, IPPROTO_RAW), "sndsock");
    if (a_debug) setsockopt(sndsock, SOL_SOCKET, SO_DEBUG,
			  (char *)&on, sizeof(on));
#ifdef IP_HDRINCL
    NOERROR(setsockopt(sndsock, IPPROTO_IP, IP_HDRINCL,
		       (char *)&on, sizeof(on)), "IP_HDRINCL");
#endif
  }
  else sndsock = is;	/* use the ICMP socket for sending */

#define BIGBUF 524288
#define TINYBUF 32768
  { int sz;
    for (sz=BIGBUF; sz >= TINYBUF; sz >>= 1) {
      if (setsockopt(is, SOL_SOCKET, SO_RCVBUF, (char *)&sz, sizeof(sz)) >= 0) break;
    }
    for (sz=BIGBUF; sz >= TINYBUF; sz >>= 1) {
      if (setsockopt(sndsock, SOL_SOCKET, SO_SNDBUF, (char *)&sz, sizeof(sz)) >= 0) break;
    }
  }
#ifndef IP_HDRINCL
  if (a_setfrom) {
    NOERROR(bind(sndsock, (struct sockaddr *)&a_FromHost, sizeof(a_FromHost)),
	    "Bind from");
  }
#endif

  /* set the route and check the MTU */
  optlen = setroute(sndsock, IPOPT_LSRR, rlen, route, 0);
  { int minlen = optlen + SIZEOFstructip + (a_udp?UDP_MINLEN:ICMP_MINLEN);
    if (a_bbytes) {
      if (a_bbytes<minlen) {
	printf(usage);
	printf("MTU (-b) must be greater than %d\n", minlen);
	exit(1);
      }
    } else
      a_bbytes = a_udp?BBYTES:576;	/* udp default is ATM, ICMP is 576 */
  }
}

/*****************************************************************
 * The main
 */
int main(int argc, char *argv[])
{
  setup(argc, argv);

  /* pre loop setup */
  MP_SIGNAL(SIGALRM,ring);
  if (a_autottl) printf((a_verbose >= 4)?lbanner:sbanner);

  /* loop across each hop */
  for (; runf; a_ttl++) {
    Event('H', a_udp?a_ttl:0);	/* which hop */
    setbuffers(a_bbytes, a_ttl);
    a_bbytes = 0;

    /* sync to the system clock */
    gettimeofday(&now, 0);
    tick = now.tv_sec;
    while(tick == now.tv_sec) gettimeofday(&now, 0);
    starttick = tick = now.tv_sec;
    EventTime('T', &now);	/* starting time stamp */
    ptime = now;
    clearstats();
    {int szlen = sizeof(S.c_rcvbuf);
      NOERROR(getsockopt(is, SOL_SOCKET, SO_RCVBUF,
			 (char *)&S.c_rcvbuf, &szlen),"Get SO_RCVBUF");
      szlen = sizeof(S.c_sndbuf);
      NOERROR(getsockopt(sndsock, SOL_SOCKET, SO_SNDBUF,
			 (char *)&S.c_sndbuf, &szlen), "Get SO_SNDBUF");
    }

    /* Set TCP's initial state and send the first packet(s) */
    initR(&pcb);

    /* loop for the test duration XXX */
    while ((++tick <= (starttick+a_duration))) {
      alarm(2);				/* XXX fixed timout: 2 sec HEADWAY */
      if (a_verbose < 2) putchar('.');	/* minimal chatter */
	
      /* Run TCP for one second */
      while (tick > now.tv_sec) {
	if (recvseq(&pcb)) {
	  if (parseack(&pcb)) ccR(&pcb);
	} else {
	  if (a_dack && (rseq & 0x1) && (daseq == rseq)) S.e_delayACKbotch++;
	  ckequilib(0);
	  timoR(&pcb);
	}
      }
    }

    /* drain outstanding packets and report final stats */
    alarm(2);
    while(recvseq(&pcb)) {
      pktin++;	/* XXX parseack()? */
    }
    ckequilib(priorequl);	/* this interval is counted the same as the prior */
    showstats();

    if ((a_autottl?(a_ttl >= a_autottl):1) || (S.s_tin == 0)) runf = 0;
  } /* ttl loop */

  /* close out the event log */
  evflush();
  close(evfd);
  exit(0);
}
