]> git.lizzy.rs Git - plan9front.git/blob - sys/src/9/ip/tcp.c
merge
[plan9front.git] / sys / src / 9 / ip / tcp.c
1 #include        "u.h"
2 #include        "../port/lib.h"
3 #include        "mem.h"
4 #include        "dat.h"
5 #include        "fns.h"
6 #include        "../port/error.h"
7
8 #include        "ip.h"
9
/*
 *  Protocol-wide constants.  They share one anonymous enum so
 *  that every name is a plain integer constant.
 */
enum
{
	QMAX		= 64*1024-1,	/* queue limit and default (unscaled) window */
	IP_TCPPROTO	= 6,		/* value stored in the proto field of the headers */

	/* sizes (bytes) of the pieces of a Tcp4hdr */
	TCP4_IPLEN	= 8,
	TCP4_PHDRSIZE	= 12,
	TCP4_HDRSIZE	= 20,
	TCP4_TCBPHDRSZ	= 40,
	TCP4_PKT	= TCP4_IPLEN+TCP4_PHDRSIZE,

	/* sizes (bytes) of the pieces of a Tcp6hdr */
	TCP6_IPLEN	= 0,
	TCP6_PHDRSIZE	= 40,
	TCP6_HDRSIZE	= 20,
	TCP6_TCBPHDRSZ	= 60,
	TCP6_PKT	= TCP6_IPLEN+TCP6_PHDRSIZE,

	/* values of Tcptimer.state */
	TcptimerOFF	= 0,
	TcptimerON	= 1,
	TcptimerDONE	= 2,

	MAX_TIME	= (1<<20),	/* forever */
	TCP_ACK		= 50,		/* timed ack sequence in ms */
	MAXBACKMS	= 9*60*1000,	/* longest backoff time (ms) before hangup */

	/* header flag bits */
	URG		= 1<<5,		/* data marked urgent */
	ACK		= 1<<4,		/* acknowledge is valid */
	PSH		= 1<<3,		/* whole data pipe is pushed */
	RST		= 1<<2,		/* reset connection */
	SYN		= 1<<1,		/* packet is a synchronise */
	FIN		= 1<<0,		/* start close down */

	/* header options */
	EOLOPT		= 0,
	NOOPOPT		= 1,
	MSSOPT		= 2,
	MSS_LENGTH	= 4,		/* length of the maximum segment size option */
	WSOPT		= 3,
	WS_LENGTH	= 3,		/* length of the window scale option */

	MSL2		= 10,
	MSPTICK		= 50,		/* milliseconds per timer tick */
	DEF_MSS		= 1460,		/* default maximum segment */
	DEF_MSS6	= 1280,		/* default maximum segment (min) for v6 */
	DEF_RTT		= 500,		/* default round trip */
	DEF_KAT		= 120000,	/* default time (ms) between keep alives */
	TCP_LISTEN	= 0,		/* mode: listen connection */
	TCP_CONNECT	= 1,		/* mode: outgoing connection */
	SYNACK_RXTIMER	= 250,		/* ms between SYNACK retransmits */

	TCPREXMTTHRESH	= 3,		/* duplicate ack threshold for retransmit */

	/* flag bits; FORCE, CLONE and ACTIVE are or'ed into Tcpctl.flags below */
	FORCE		= 1<<0,
	CLONE		= 1<<1,
	RETRAN		= 1<<2,
	ACTIVE		= 1<<3,
	SYNACK		= 1<<4,

	LOGAGAIN	= 3,
	LOGDGAIN	= 2,

	/* connection states; the order must match tcpstates[] */
	Closed		= 0,
	Listen		= 1,
	Syn_sent	= 2,
	Syn_received	= 3,
	Established	= 4,
	Finwait1	= 5,
	Finwait2	= 6,
	Close_wait	= 7,
	Closing		= 8,
	Last_ack	= 9,
	Time_wait	= 10,

	Maxlimbo	= 1000,		/* maximum procs waiting for response to SYN ACK */
	NLHT		= 256,		/* hash table size, must be a power of 2 */
	LHTMASK		= NLHT-1,

	HaveWS		= 1<<8,		/* or'ed into the scale value returned by tcpmtu */
};
86
/*
 *  Printable name of each connection state, indexed by the state
 *  value; the order must correspond to the state enumeration above.
 */
char *tcpstates[] =
{
	"Closed",
	"Listen",
	"Syn_sent",
	"Syn_received",
	"Established",
	"Finwait1",
	"Finwait2",
	"Close_wait",
	"Closing",
	"Last_ack",
	"Time_wait"
};
94
/*
 *  One countdown timer.  Running timers are chained on
 *  Tcppriv.timers by timerstate() and ticked by tcpackproc().
 */
typedef struct Tcptimer Tcptimer;
struct Tcptimer
{
	Tcptimer	*next;		/* links in the list of running timers */
	Tcptimer	*prev;
	Tcptimer	*readynext;	/* links timers that expired in the same tick */
	int	state;			/* TcptimerOFF, TcptimerON or TcptimerDONE */
	int	start;			/* ticks loaded into count by tcpgo() */
	int	count;			/* ticks left; decremented by tcpackproc() */
	void	(*func)(void*);		/* called with arg once count reaches zero */
	void	*arg;
};
107
108 /*
109  *  v4 and v6 pseudo headers used for
110  *  checksuming tcp
111  */
112 typedef struct Tcp4hdr Tcp4hdr;
113 struct Tcp4hdr
114 {
115         uchar   vihl;           /* Version and header length */
116         uchar   tos;            /* Type of service */
117         uchar   length[2];      /* packet length */
118         uchar   id[2];          /* Identification */
119         uchar   frag[2];        /* Fragment information */
120         uchar   Unused;
121         uchar   proto;
122         uchar   tcplen[2];
123         uchar   tcpsrc[4];
124         uchar   tcpdst[4];
125         uchar   tcpsport[2];
126         uchar   tcpdport[2];
127         uchar   tcpseq[4];
128         uchar   tcpack[4];
129         uchar   tcpflag[2];
130         uchar   tcpwin[2];
131         uchar   tcpcksum[2];
132         uchar   tcpurg[2];
133         /* Options segment */
134         uchar   tcpopt[1];
135 };
136
137 typedef struct Tcp6hdr Tcp6hdr;
138 struct Tcp6hdr
139 {
140         uchar   vcf[4];
141         uchar   ploadlen[2];
142         uchar   proto;
143         uchar   ttl;
144         uchar   tcpsrc[IPaddrlen];
145         uchar   tcpdst[IPaddrlen];
146         uchar   tcpsport[2];
147         uchar   tcpdport[2];
148         uchar   tcpseq[4];
149         uchar   tcpack[4];
150         uchar   tcpflag[2];
151         uchar   tcpwin[2];
152         uchar   tcpcksum[2];
153         uchar   tcpurg[2];
154         /* Options segment */
155         uchar   tcpopt[1];
156 };
157
158 /*
159  *  this represents the control info
160  *  for a single packet.  It is derived from
161  *  a packet in ntohtcp{4,6}() and stuck into
162  *  a packet in htontcp{4,6}().
163  */
164 typedef struct Tcp Tcp;
165 struct  Tcp
166 {
167         ushort  source;
168         ushort  dest;
169         ulong   seq;
170         ulong   ack;
171         uchar   flags;
172         ushort  ws;     /* window scale option (if not zero) */
173         ulong   wnd;
174         ushort  urg;
175         ushort  mss;    /* max segment size option (if not zero) */
176         ushort  len;    /* size of data */
177 };
178
179 /*
180  *  this header is malloc'd to thread together fragments
181  *  waiting to be coalesced
182  */
183 typedef struct Reseq Reseq;
184 struct Reseq
185 {
186         Reseq   *next;
187         Tcp     seg;
188         Block   *bp;
189         ushort  length;
190 };
191
192 /*
193  *  the qlock in the Conv locks this structure
194  */
195 typedef struct Tcpctl Tcpctl;
196 struct Tcpctl
197 {
198         uchar   state;                  /* Connection state */
199         uchar   type;                   /* Listening or active connection */
200         uchar   code;                   /* Icmp code */
201         struct {
202                 ulong   una;            /* Unacked data pointer */
203                 ulong   nxt;            /* Next sequence expected */
204                 ulong   ptr;            /* Data pointer */
205                 ulong   wnd;            /* Tcp send window */
206                 ulong   urg;            /* Urgent data pointer */
207                 ulong   wl2;
208                 int     scale;          /* how much to right shift window in xmitted packets */
209                 /* to implement tahoe and reno TCP */
210                 ulong   dupacks;        /* number of duplicate acks rcvd */
211                 int     recovery;       /* loss recovery flag */
212                 ulong   rxt;            /* right window marker for recovery */
213         } snd;
214         struct {
215                 ulong   nxt;            /* Receive pointer to next uchar slot */
216                 ulong   wnd;            /* Receive window incoming */
217                 ulong   urg;            /* Urgent pointer */
218                 int     blocked;
219                 int     una;            /* unacked data segs */
220                 int     scale;          /* how much to left shift window in rcved packets */
221         } rcv;
222         ulong   iss;                    /* Initial sequence number */
223         int     sawwsopt;               /* true if we saw a wsopt on the incoming SYN */
224         ulong   cwind;                  /* Congestion window */
225         int     scale;                  /* desired snd.scale */
226         ushort  ssthresh;               /* Slow start threshold */
227         int     resent;                 /* Bytes just resent */
228         int     irs;                    /* Initial received squence */
229         ushort  mss;                    /* Maximum segment size */
230         int     rerecv;                 /* Overlap of data rerecevived */
231         ulong   window;                 /* Receive window */
232         uchar   backoff;                /* Exponential backoff counter */
233         int     backedoff;              /* ms we've backed off for rexmits */
234         uchar   flags;                  /* State flags */
235         Reseq   *reseq;                 /* Resequencing queue */
236         Tcptimer        timer;                  /* Activity timer */
237         Tcptimer        acktimer;               /* Acknowledge timer */
238         Tcptimer        rtt_timer;              /* Round trip timer */
239         Tcptimer        katimer;                /* keep alive timer */
240         ulong   rttseq;                 /* Round trip sequence */
241         int     srtt;                   /* Shortened round trip */
242         int     mdev;                   /* Mean deviation of round trip */
243         int     kacounter;              /* count down for keep alive */
244         uint    sndsyntime;             /* time syn sent */
245         ulong   time;                   /* time Finwait2 or Syn_received was sent */
246         int     nochecksum;             /* non-zero means don't send checksums */
247         int     flgcnt;                 /* number of flags in the sequence (FIN,SEQ) */
248
249         union {
250                 Tcp4hdr tcp4hdr;
251                 Tcp6hdr tcp6hdr;
252         } protohdr;             /* prototype header */
253 };
254
255 /*
256  *  New calls are put in limbo rather than having a conversation structure
257  *  allocated.  Thus, a SYN attack results in lots of limbo'd calls but not
258  *  any real Conv structures mucking things up.  Calls in limbo rexmit their
259  *  SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
260  *
261  *  In particular they aren't on a listener's queue so that they don't figure
262  *  in the input queue limit.
263  *
264  *  If 1/2 of a T3 was attacking SYN packets, we'ld have a permanent queue
265  *  of 70000 limbo'd calls.  Not great for a linear list but doable.  Therefore
266  *  there is no hashing of this list.
267  */
268 typedef struct Limbo Limbo;
269 struct Limbo
270 {
271         Limbo   *next;
272
273         uchar   laddr[IPaddrlen];
274         uchar   raddr[IPaddrlen];
275         ushort  lport;
276         ushort  rport;
277         ulong   irs;            /* initial received sequence */
278         ulong   iss;            /* initial sent sequence */
279         ushort  mss;            /* mss from the other end */
280         ushort  rcvscale;       /* how much to scale rcvd windows */
281         ushort  sndscale;       /* how much to scale sent windows */
282         ulong   lastsend;       /* last time we sent a synack */
283         uchar   version;        /* v4 or v6 */
284         uchar   rexmits;        /* number of retransmissions */
285 };
286
287 int     tcp_irtt = DEF_RTT;     /* Initial guess at round trip time */
288 ushort  tcp_mss = DEF_MSS;      /* Maximum segment size to be sent */
289
/*
 *  Indices into Tcppriv.stats[]; statnames[] holds the matching
 *  labels.  Nstats must stay last: it is the number of counters.
 */
enum {
	/* MIB stats */
	MaxConn		= 0,
	Mss,
	ActiveOpens,
	PassiveOpens,
	EstabResets,
	CurrEstab,
	InSegs,
	OutSegs,
	RetransSegs,
	RetransTimeouts,
	InErrs,
	OutRsts,

	/* non-MIB stats */
	CsumErrs,
	HlenErrs,
	LenErrs,
	OutOfOrder,

	Nstats
};
313
314 static char *statnames[] =
315 {
316 [MaxConn]       "MaxConn",
317 [Mss]           "MaxSegment",
318 [ActiveOpens]   "ActiveOpens",
319 [PassiveOpens]  "PassiveOpens",
320 [EstabResets]   "EstabResets",
321 [CurrEstab]     "CurrEstab",
322 [InSegs]        "InSegs",
323 [OutSegs]       "OutSegs",
324 [RetransSegs]   "RetransSegs",
325 [RetransTimeouts]       "RetransTimeouts",
326 [InErrs]        "InErrs",
327 [OutRsts]       "OutRsts",
328 [CsumErrs]      "CsumErrs",
329 [HlenErrs]      "HlenErrs",
330 [LenErrs]       "LenErrs",
331 [OutOfOrder]    "OutOfOrder",
332 };
333
334 typedef struct Tcppriv Tcppriv;
335 struct Tcppriv
336 {
337         /* List of active timers */
338         QLock   tl;
339         Tcptimer *timers;
340
341         /* hash table for matching conversations */
342         Ipht    ht;
343
344         /* calls in limbo waiting for an ACK to our SYN ACK */
345         int     nlimbo;
346         Limbo   *lht[NLHT];
347
348         /* for keeping track of tcpackproc */
349         QLock   apl;
350         int     ackprocstarted;
351
352         uvlong  stats[Nstats];
353 };
354
/*
 *  Setting tcpporthogdefense to non-zero enables Dong Lin's
 *  solution to hijacked systems staking out ports as a form
 *  of DoS attack.
 *
 *  To avoid stateless Conv hogs, we pick a sequence number at
 *  random.  If that number gets acked by the other end, we shut
 *  down the connection.  Look for tcpporthogdefense in the code.
 */
int	tcpporthogdefense	= 0;
365
366 int     addreseq(Tcpctl*, Tcppriv*, Tcp*, Block*, ushort);
367 void    getreseq(Tcpctl*, Tcp*, Block**, ushort*);
368 void    localclose(Conv*, char*);
369 void    procsyn(Conv*, Tcp*);
370 void    tcpiput(Proto*, Ipifc*, Block*);
371 void    tcpoutput(Conv*);
372 int     tcptrim(Tcpctl*, Tcp*, Block**, ushort*);
373 void    tcpstart(Conv*, int);
374 void    tcptimeout(void*);
375 void    tcpsndsyn(Conv*, Tcpctl*);
376 void    tcprcvwin(Conv*);
377 void    tcpacktimer(void*);
378 void    tcpkeepalive(void*);
379 void    tcpsetkacounter(Tcpctl*);
380 void    tcprxmit(Conv*);
381 void    tcpsettimer(Tcpctl*);
382 void    tcpsynackrtt(Conv*);
383 void    tcpsetscale(Conv*, Tcpctl*, ushort, ushort);
384
385 static void limborexmit(Proto*);
386 static void limbo(Conv*, uchar*, uchar*, Tcp*, int);
387
388 void
389 tcpsetstate(Conv *s, uchar newstate)
390 {
391         Tcpctl *tcb;
392         uchar oldstate;
393         Tcppriv *tpriv;
394
395         tpriv = s->p->priv;
396
397         tcb = (Tcpctl*)s->ptcl;
398
399         oldstate = tcb->state;
400         if(oldstate == newstate)
401                 return;
402
403         if(oldstate == Established)
404                 tpriv->stats[CurrEstab]--;
405         if(newstate == Established)
406                 tpriv->stats[CurrEstab]++;
407
408         /**
409         print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport,
410                 tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab );
411         **/
412
413         switch(newstate) {
414         case Closed:
415                 qclose(s->rq);
416                 qclose(s->wq);
417                 qclose(s->eq);
418                 break;
419
420         case Close_wait:                /* Remote closes */
421                 qhangup(s->rq, nil);
422                 break;
423         }
424
425         tcb->state = newstate;
426
427         if(oldstate == Syn_sent && newstate != Closed)
428                 Fsconnected(s, nil);
429 }
430
431 static char*
432 tcpconnect(Conv *c, char **argv, int argc)
433 {
434         char *e;
435         Tcpctl *tcb;
436
437         tcb = (Tcpctl*)(c->ptcl);
438         if(tcb->state != Closed)
439                 return Econinuse;
440
441         e = Fsstdconnect(c, argv, argc);
442         if(e != nil)
443                 return e;
444         tcpstart(c, TCP_CONNECT);
445
446         return nil;
447 }
448
449 static int
450 tcpstate(Conv *c, char *state, int n)
451 {
452         Tcpctl *s;
453
454         s = (Tcpctl*)(c->ptcl);
455
456         return snprint(state, n,
457                 "%s qin %d qout %d srtt %d mdev %d cwin %lud swin %lud>>%d rwin %lud>>%d timer.start %d timer.count %d rerecv %d katimer.start %d katimer.count %d\n",
458                 tcpstates[s->state],
459                 c->rq ? qlen(c->rq) : 0,
460                 c->wq ? qlen(c->wq) : 0,
461                 s->srtt, s->mdev,
462                 s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd, s->snd.scale,
463                 s->timer.start, s->timer.count, s->rerecv,
464                 s->katimer.start, s->katimer.count);
465 }
466
467 static int
468 tcpinuse(Conv *c)
469 {
470         Tcpctl *s;
471
472         s = (Tcpctl*)(c->ptcl);
473         return s->state != Closed;
474 }
475
476 static char*
477 tcpannounce(Conv *c, char **argv, int argc)
478 {
479         char *e;
480         Tcpctl *tcb;
481
482         tcb = (Tcpctl*)(c->ptcl);
483         if(tcb->state != Closed)
484                 return Econinuse;
485
486         e = Fsstdannounce(c, argv, argc);
487         if(e != nil)
488                 return e;
489         tcpstart(c, TCP_LISTEN);
490         Fsconnected(c, nil);
491
492         return nil;
493 }
494
495 /*
496  *  tcpclose is always called with the q locked
497  */
498 static void
499 tcpclose(Conv *c)
500 {
501         Tcpctl *tcb;
502
503         tcb = (Tcpctl*)c->ptcl;
504
505         qhangup(c->rq, nil);
506         qhangup(c->wq, nil);
507         qhangup(c->eq, nil);
508         qflush(c->rq);
509
510         switch(tcb->state) {
511         case Listen:
512                 /*
513                  *  reset any incoming calls to this listener
514                  */
515                 Fsconnected(c, "Hangup");
516
517                 localclose(c, nil);
518                 break;
519         case Closed:
520         case Syn_sent:
521                 localclose(c, nil);
522                 break;
523         case Syn_received:
524         case Established:
525                 tcb->flgcnt++;
526                 tcb->snd.nxt++;
527                 tcpsetstate(c, Finwait1);
528                 tcpoutput(c);
529                 break;
530         case Close_wait:
531                 tcb->flgcnt++;
532                 tcb->snd.nxt++;
533                 tcpsetstate(c, Last_ack);
534                 tcpoutput(c);
535                 break;
536         }
537 }
538
539 void
540 tcpkick(void *x)
541 {
542         Conv *s = x;
543         Tcpctl *tcb;
544
545         tcb = (Tcpctl*)s->ptcl;
546
547         if(waserror()){
548                 qunlock(s);
549                 nexterror();
550         }
551         qlock(s);
552
553         switch(tcb->state) {
554         case Syn_sent:
555         case Syn_received:
556         case Established:
557         case Close_wait:
558                 /*
559                  * Push data
560                  */
561                 tcprcvwin(s);
562                 tcpoutput(s);
563                 break;
564         default:
565                 localclose(s, "Hangup");
566                 break;
567         }
568
569         qunlock(s);
570         poperror();
571 }
572
573 void
574 tcprcvwin(Conv *s)                              /* Call with tcb locked */
575 {
576         int w;
577         Tcpctl *tcb;
578
579         tcb = (Tcpctl*)s->ptcl;
580         w = tcb->window - qlen(s->rq);
581         if(w < 0)
582                 w = 0;
583         if(w == 0)
584                 netlog(s->p->f, Logtcp, "tcprcvwim: window %lud qlen %d\n", tcb->window, qlen(s->rq));
585         tcb->rcv.wnd = w;
586         if(w == 0)
587                 tcb->rcv.blocked = 1;
588 }
589
590 void
591 tcpacktimer(void *v)
592 {
593         Tcpctl *tcb;
594         Conv *s;
595
596         s = v;
597         tcb = (Tcpctl*)s->ptcl;
598
599         if(waserror()){
600                 qunlock(s);
601                 nexterror();
602         }
603         qlock(s);
604         if(tcb->state != Closed){
605                 tcb->flags |= FORCE;
606                 tcprcvwin(s);
607                 tcpoutput(s);
608         }
609         qunlock(s);
610         poperror();
611 }
612
613 static void
614 tcpcreate(Conv *c)
615 {
616         c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c);
617         c->wq = qopen((3*QMAX)/2, Qkick, tcpkick, c);
618 }
619
620 static void
621 timerstate(Tcppriv *priv, Tcptimer *t, int newstate)
622 {
623         if(newstate != TcptimerON){
624                 if(t->state == TcptimerON){
625                         /* unchain */
626                         if(priv->timers == t){
627                                 priv->timers = t->next;
628                                 if(t->prev != nil)
629                                         panic("timerstate1");
630                         }
631                         if(t->next)
632                                 t->next->prev = t->prev;
633                         if(t->prev)
634                                 t->prev->next = t->next;
635                         t->next = t->prev = nil;
636                 }
637         } else {
638                 if(t->state != TcptimerON){
639                         /* chain */
640                         if(t->prev != nil || t->next != nil)
641                                 panic("timerstate2");
642                         t->prev = nil;
643                         t->next = priv->timers;
644                         if(t->next)
645                                 t->next->prev = t;
646                         priv->timers = t;
647                 }
648         }
649         t->state = newstate;
650 }
651
652 void
653 tcpackproc(void *a)
654 {
655         Tcptimer *t, *tp, *timeo;
656         Proto *tcp;
657         Tcppriv *priv;
658         int loop;
659
660         tcp = a;
661         priv = tcp->priv;
662
663         for(;;) {
664                 tsleep(&up->sleep, return0, 0, MSPTICK);
665
666                 qlock(&priv->tl);
667                 timeo = nil;
668                 loop = 0;
669                 for(t = priv->timers; t != nil; t = tp) {
670                         if(loop++ > 10000)
671                                 panic("tcpackproc1");
672                         tp = t->next;
673                         if(t->state == TcptimerON) {
674                                 t->count--;
675                                 if(t->count == 0) {
676                                         timerstate(priv, t, TcptimerDONE);
677                                         t->readynext = timeo;
678                                         timeo = t;
679                                 }
680                         }
681                 }
682                 qunlock(&priv->tl);
683
684                 loop = 0;
685                 for(t = timeo; t != nil; t = t->readynext) {
686                         if(loop++ > 10000)
687                                 panic("tcpackproc2");
688                         if(t->state == TcptimerDONE && t->func != nil && !waserror()){
689                                 (*t->func)(t->arg);
690                                 poperror();
691                         }
692                 }
693
694                 limborexmit(tcp);
695         }
696 }
697
698 void
699 tcpgo(Tcppriv *priv, Tcptimer *t)
700 {
701         if(t == nil || t->start == 0)
702                 return;
703
704         qlock(&priv->tl);
705         t->count = t->start;
706         timerstate(priv, t, TcptimerON);
707         qunlock(&priv->tl);
708 }
709
710 void
711 tcphalt(Tcppriv *priv, Tcptimer *t)
712 {
713         if(t == nil)
714                 return;
715
716         qlock(&priv->tl);
717         timerstate(priv, t, TcptimerOFF);
718         qunlock(&priv->tl);
719 }
720
/*
 *  Return 2 to the power n, used as an exponential backoff factor.
 *
 *  Shifting by a negative count or by the width of int or more is
 *  undefined, and shifting into the sign bit gives a negative
 *  factor, so n is clamped: negative values are treated as 0 and
 *  large ones as the biggest shift that still yields a positive
 *  int.  Results for counts already in that range are unchanged.
 */
int
backoff(int n)
{
	int maxshift;

	maxshift = 8*(int)sizeof(int) - 2;
	if(n < 0)
		n = 0;
	else if(n > maxshift)
		n = maxshift;
	return 1 << n;
}
726
727 void
728 localclose(Conv *s, char *reason)       /* called with tcb locked */
729 {
730         Tcpctl *tcb;
731         Reseq *rp,*rp1;
732         Tcppriv *tpriv;
733
734         tpriv = s->p->priv;
735         tcb = (Tcpctl*)s->ptcl;
736
737         iphtrem(&tpriv->ht, s);
738
739         tcphalt(tpriv, &tcb->timer);
740         tcphalt(tpriv, &tcb->rtt_timer);
741         tcphalt(tpriv, &tcb->acktimer);
742         tcphalt(tpriv, &tcb->katimer);
743
744         /* Flush reassembly queue; nothing more can arrive */
745         for(rp = tcb->reseq; rp != nil; rp = rp1) {
746                 rp1 = rp->next;
747                 freeblist(rp->bp);
748                 free(rp);
749         }
750         tcb->reseq = nil;
751
752         if(tcb->state == Syn_sent)
753                 Fsconnected(s, reason);
754         if(s->state == Announced)
755                 wakeup(&s->listenr);
756
757         qhangup(s->rq, reason);
758         qhangup(s->wq, reason);
759
760         tcpsetstate(s, Closed);
761 }
762
763 /* mtu (- TCP + IP hdr len) of 1st hop */
764 int
765 tcpmtu(Proto *tcp, uchar *addr, int version, int *scale)
766 {
767         Ipifc *ifc;
768         int mtu;
769
770         ifc = findipifc(tcp->f, addr, 0);
771         switch(version){
772         default:
773         case V4:
774                 mtu = DEF_MSS;
775                 if(ifc != nil)
776                         mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE);
777                 break;
778         case V6:
779                 mtu = DEF_MSS6;
780                 if(ifc != nil)
781                         mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE);
782                 break;
783         }
784         if(ifc != nil){
785                 if(ifc->mbps > 1000)
786                         *scale = HaveWS | 4;
787                 else if(ifc->mbps > 100)
788                         *scale = HaveWS | 3;
789                 else if(ifc->mbps > 10)
790                         *scale = HaveWS | 1;
791                 else
792                         *scale = HaveWS | 0;
793         } else
794                 *scale = HaveWS | 0;
795
796         return mtu;
797 }
798
799 void
800 inittcpctl(Conv *s, int mode)
801 {
802         Tcpctl *tcb;
803         Tcp4hdr* h4;
804         Tcp6hdr* h6;
805         Tcppriv *tpriv;
806         int mss;
807
808         tcb = (Tcpctl*)s->ptcl;
809
810         memset(tcb, 0, sizeof(Tcpctl));
811
812         tcb->ssthresh = 65535;
813         tcb->srtt = tcp_irtt<<LOGAGAIN;
814         tcb->mdev = 0;
815
816         /* setup timers */
817         tcb->timer.start = tcp_irtt / MSPTICK;
818         tcb->timer.func = tcptimeout;
819         tcb->timer.arg = s;
820         tcb->rtt_timer.start = MAX_TIME;
821         tcb->acktimer.start = TCP_ACK / MSPTICK;
822         tcb->acktimer.func = tcpacktimer;
823         tcb->acktimer.arg = s;
824         tcb->katimer.start = DEF_KAT / MSPTICK;
825         tcb->katimer.func = tcpkeepalive;
826         tcb->katimer.arg = s;
827
828         mss = DEF_MSS;
829
830         /* create a prototype(pseudo) header */
831         if(mode != TCP_LISTEN){
832                 if(ipcmp(s->laddr, IPnoaddr) == 0)
833                         findlocalip(s->p->f, s->laddr, s->raddr);
834
835                 switch(s->ipversion){
836                 case V4:
837                         h4 = &tcb->protohdr.tcp4hdr;
838                         memset(h4, 0, sizeof(*h4));
839                         h4->proto = IP_TCPPROTO;
840                         hnputs(h4->tcpsport, s->lport);
841                         hnputs(h4->tcpdport, s->rport);
842                         v6tov4(h4->tcpsrc, s->laddr);
843                         v6tov4(h4->tcpdst, s->raddr);
844                         break;
845                 case V6:
846                         h6 = &tcb->protohdr.tcp6hdr;
847                         memset(h6, 0, sizeof(*h6));
848                         h6->proto = IP_TCPPROTO;
849                         hnputs(h6->tcpsport, s->lport);
850                         hnputs(h6->tcpdport, s->rport);
851                         ipmove(h6->tcpsrc, s->laddr);
852                         ipmove(h6->tcpdst, s->raddr);
853                         mss = DEF_MSS6;
854                         break;
855                 default:
856                         panic("inittcpctl: version %d", s->ipversion);
857                 }
858         }
859
860         tcb->mss = tcb->cwind = mss;
861         tpriv = s->p->priv;
862         tpriv->stats[Mss] = tcb->mss;
863
864         /* default is no window scaling */
865         tcb->window = QMAX;
866         tcb->rcv.wnd = QMAX;
867         tcb->rcv.scale = 0;
868         tcb->snd.scale = 0;
869         qsetlimit(s->rq, QMAX);
870 }
871
872 /*
873  *  called with s qlocked
874  */
875 void
876 tcpstart(Conv *s, int mode)
877 {
878         Tcpctl *tcb;
879         Tcppriv *tpriv;
880         char kpname[KNAMELEN];
881
882         tpriv = s->p->priv;
883
884         if(tpriv->ackprocstarted == 0){
885                 qlock(&tpriv->apl);
886                 if(tpriv->ackprocstarted == 0){
887                         sprint(kpname, "#I%dtcpack", s->p->f->dev);
888                         kproc(kpname, tcpackproc, s->p);
889                         tpriv->ackprocstarted = 1;
890                 }
891                 qunlock(&tpriv->apl);
892         }
893
894         tcb = (Tcpctl*)s->ptcl;
895
896         inittcpctl(s, mode);
897
898         iphtadd(&tpriv->ht, s);
899         switch(mode) {
900         case TCP_LISTEN:
901                 tpriv->stats[PassiveOpens]++;
902                 tcb->flags |= CLONE;
903                 tcpsetstate(s, Listen);
904                 break;
905
906         case TCP_CONNECT:
907                 tpriv->stats[ActiveOpens]++;
908                 tcb->flags |= ACTIVE;
909                 tcpsndsyn(s, tcb);
910                 tcpsetstate(s, Syn_sent);
911                 tcpoutput(s);
912                 break;
913         }
914 }
915
916 static char*
917 tcpflag(ushort flag)
918 {
919         static char buf[128];
920
921         sprint(buf, "%d", flag>>10);    /* Head len */
922         if(flag & URG)
923                 strcat(buf, " URG");
924         if(flag & ACK)
925                 strcat(buf, " ACK");
926         if(flag & PSH)
927                 strcat(buf, " PSH");
928         if(flag & RST)
929                 strcat(buf, " RST");
930         if(flag & SYN)
931                 strcat(buf, " SYN");
932         if(flag & FIN)
933                 strcat(buf, " FIN");
934
935         return buf;
936 }
937
938 Block *
939 htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph, Tcpctl *tcb)
940 {
941         int dlen;
942         Tcp6hdr *h;
943         ushort csum;
944         ushort hdrlen, optpad = 0;
945         uchar *opt;
946
947         hdrlen = TCP6_HDRSIZE;
948         if(tcph->flags & SYN){
949                 if(tcph->mss)
950                         hdrlen += MSS_LENGTH;
951                 if(tcph->ws)
952                         hdrlen += WS_LENGTH;
953                 optpad = hdrlen & 3;
954                 if(optpad)
955                         optpad = 4 - optpad;
956                 hdrlen += optpad;
957         }
958
959         if(data) {
960                 dlen = blocklen(data);
961                 data = padblock(data, hdrlen + TCP6_PKT);
962                 if(data == nil)
963                         return nil;
964         }
965         else {
966                 dlen = 0;
967                 data = allocb(hdrlen + TCP6_PKT + 64);  /* the 64 pad is to meet mintu's */
968                 if(data == nil)
969                         return nil;
970                 data->wp += hdrlen + TCP6_PKT;
971         }
972
973         /* copy in pseudo ip header plus port numbers */
974         h = (Tcp6hdr *)(data->rp);
975         memmove(h, ph, TCP6_TCBPHDRSZ);
976
977         /* compose pseudo tcp header, do cksum calculation */
978         hnputl(h->vcf, hdrlen + dlen);
979         h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
980         h->ttl = ph->proto;
981
982         /* copy in variable bits */
983         hnputl(h->tcpseq, tcph->seq);
984         hnputl(h->tcpack, tcph->ack);
985         hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
986         hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
987         hnputs(h->tcpurg, tcph->urg);
988
989         if(tcph->flags & SYN){
990                 opt = h->tcpopt;
991                 if(tcph->mss != 0){
992                         *opt++ = MSSOPT;
993                         *opt++ = MSS_LENGTH;
994                         hnputs(opt, tcph->mss);
995 //                      print("our outgoing mss %d\n", tcph->mss);
996                         opt += 2;
997                 }
998                 if(tcph->ws != 0){
999                         *opt++ = WSOPT;
1000                         *opt++ = WS_LENGTH;
1001                         *opt++ = tcph->ws;
1002                 }
1003                 while(optpad-- > 0)
1004                         *opt++ = NOOPOPT;
1005         }
1006
1007         if(tcb != nil && tcb->nochecksum){
1008                 h->tcpcksum[0] = h->tcpcksum[1] = 0;
1009         } else {
1010                 csum = ptclcsum(data, TCP6_IPLEN, hdrlen+dlen+TCP6_PHDRSIZE);
1011                 hnputs(h->tcpcksum, csum);
1012         }
1013
1014         /* move from pseudo header back to normal ip header */
1015         memset(h->vcf, 0, 4);
1016         h->vcf[0] = IP_VER6;
1017         hnputs(h->ploadlen, hdrlen+dlen);
1018         h->proto = ph->proto;
1019
1020         return data;
1021 }
1022
1023 Block *
1024 htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph, Tcpctl *tcb)
1025 {
1026         int dlen;
1027         Tcp4hdr *h;
1028         ushort csum;
1029         ushort hdrlen, optpad = 0;
1030         uchar *opt;
1031
1032         hdrlen = TCP4_HDRSIZE;
1033         if(tcph->flags & SYN){
1034                 if(tcph->mss)
1035                         hdrlen += MSS_LENGTH;
1036                 if(tcph->ws)
1037                         hdrlen += WS_LENGTH;
1038                 optpad = hdrlen & 3;
1039                 if(optpad)
1040                         optpad = 4 - optpad;
1041                 hdrlen += optpad;
1042         }
1043
1044         if(data) {
1045                 dlen = blocklen(data);
1046                 data = padblock(data, hdrlen + TCP4_PKT);
1047                 if(data == nil)
1048                         return nil;
1049         }
1050         else {
1051                 dlen = 0;
1052                 data = allocb(hdrlen + TCP4_PKT + 64);  /* the 64 pad is to meet mintu's */
1053                 if(data == nil)
1054                         return nil;
1055                 data->wp += hdrlen + TCP4_PKT;
1056         }
1057
1058         /* copy in pseudo ip header plus port numbers */
1059         h = (Tcp4hdr *)(data->rp);
1060         memmove(h, ph, TCP4_TCBPHDRSZ);
1061
1062         /* copy in variable bits */
1063         hnputs(h->tcplen, hdrlen + dlen);
1064         hnputl(h->tcpseq, tcph->seq);
1065         hnputl(h->tcpack, tcph->ack);
1066         hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
1067         hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
1068         hnputs(h->tcpurg, tcph->urg);
1069
1070         if(tcph->flags & SYN){
1071                 opt = h->tcpopt;
1072                 if(tcph->mss != 0){
1073                         *opt++ = MSSOPT;
1074                         *opt++ = MSS_LENGTH;
1075                         hnputs(opt, tcph->mss);
1076                         opt += 2;
1077                 }
1078                 if(tcph->ws != 0){
1079                         *opt++ = WSOPT;
1080                         *opt++ = WS_LENGTH;
1081                         *opt++ = tcph->ws;
1082                 }
1083                 while(optpad-- > 0)
1084                         *opt++ = NOOPOPT;
1085         }
1086
1087         if(tcb != nil && tcb->nochecksum){
1088                 h->tcpcksum[0] = h->tcpcksum[1] = 0;
1089         } else {
1090                 csum = ptclcsum(data, TCP4_IPLEN, hdrlen+dlen+TCP4_PHDRSIZE);
1091                 hnputs(h->tcpcksum, csum);
1092         }
1093
1094         return data;
1095 }
1096
1097 int
1098 ntohtcp6(Tcp *tcph, Block **bpp)
1099 {
1100         Tcp6hdr *h;
1101         uchar *optr;
1102         ushort hdrlen;
1103         ushort optlen;
1104         int n;
1105
1106         *bpp = pullupblock(*bpp, TCP6_PKT+TCP6_HDRSIZE);
1107         if(*bpp == nil)
1108                 return -1;
1109
1110         h = (Tcp6hdr *)((*bpp)->rp);
1111         tcph->source = nhgets(h->tcpsport);
1112         tcph->dest = nhgets(h->tcpdport);
1113         tcph->seq = nhgetl(h->tcpseq);
1114         tcph->ack = nhgetl(h->tcpack);
1115         hdrlen = (h->tcpflag[0]>>2) & ~3;
1116         if(hdrlen < TCP6_HDRSIZE) {
1117                 freeblist(*bpp);
1118                 return -1;
1119         }
1120
1121         tcph->flags = h->tcpflag[1];
1122         tcph->wnd = nhgets(h->tcpwin);
1123         tcph->urg = nhgets(h->tcpurg);
1124         tcph->mss = 0;
1125         tcph->ws = 0;
1126         tcph->len = nhgets(h->ploadlen) - hdrlen;
1127
1128         *bpp = pullupblock(*bpp, hdrlen+TCP6_PKT);
1129         if(*bpp == nil)
1130                 return -1;
1131
1132         optr = h->tcpopt;
1133         n = hdrlen - TCP6_HDRSIZE;
1134         while(n > 0 && *optr != EOLOPT) {
1135                 if(*optr == NOOPOPT) {
1136                         n--;
1137                         optr++;
1138                         continue;
1139                 }
1140                 optlen = optr[1];
1141                 if(optlen < 2 || optlen > n)
1142                         break;
1143                 switch(*optr) {
1144                 case MSSOPT:
1145                         if(optlen == MSS_LENGTH)
1146                                 tcph->mss = nhgets(optr+2);
1147                         break;
1148                 case WSOPT:
1149                         if(optlen == WS_LENGTH && *(optr+2) <= 14)
1150                                 tcph->ws = HaveWS | *(optr+2);
1151                         break;
1152                 }
1153                 n -= optlen;
1154                 optr += optlen;
1155         }
1156         return hdrlen;
1157 }
1158
1159 int
1160 ntohtcp4(Tcp *tcph, Block **bpp)
1161 {
1162         Tcp4hdr *h;
1163         uchar *optr;
1164         ushort hdrlen;
1165         ushort optlen;
1166         int n;
1167
1168         *bpp = pullupblock(*bpp, TCP4_PKT+TCP4_HDRSIZE);
1169         if(*bpp == nil)
1170                 return -1;
1171
1172         h = (Tcp4hdr *)((*bpp)->rp);
1173         tcph->source = nhgets(h->tcpsport);
1174         tcph->dest = nhgets(h->tcpdport);
1175         tcph->seq = nhgetl(h->tcpseq);
1176         tcph->ack = nhgetl(h->tcpack);
1177
1178         hdrlen = (h->tcpflag[0]>>2) & ~3;
1179         if(hdrlen < TCP4_HDRSIZE) {
1180                 freeblist(*bpp);
1181                 return -1;
1182         }
1183
1184         tcph->flags = h->tcpflag[1];
1185         tcph->wnd = nhgets(h->tcpwin);
1186         tcph->urg = nhgets(h->tcpurg);
1187         tcph->mss = 0;
1188         tcph->ws = 0;
1189         tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1190
1191         *bpp = pullupblock(*bpp, hdrlen+TCP4_PKT);
1192         if(*bpp == nil)
1193                 return -1;
1194
1195         optr = h->tcpopt;
1196         n = hdrlen - TCP4_HDRSIZE;
1197         while(n > 0 && *optr != EOLOPT) {
1198                 if(*optr == NOOPOPT) {
1199                         n--;
1200                         optr++;
1201                         continue;
1202                 }
1203                 optlen = optr[1];
1204                 if(optlen < 2 || optlen > n)
1205                         break;
1206                 switch(*optr) {
1207                 case MSSOPT:
1208                         if(optlen == MSS_LENGTH) {
1209                                 tcph->mss = nhgets(optr+2);
1210 //                              print("new incoming mss %d\n", tcph->mss);
1211                         }
1212                         break;
1213                 case WSOPT:
1214                         if(optlen == WS_LENGTH && *(optr+2) <= 14)
1215                                 tcph->ws = HaveWS | *(optr+2);
1216                         break;
1217                 }
1218                 n -= optlen;
1219                 optr += optlen;
1220         }
1221         return hdrlen;
1222 }
1223
1224 /*
1225  *  For outgiing calls, generate an initial sequence
1226  *  number and put a SYN on the send queue
1227  */
1228 void
1229 tcpsndsyn(Conv *s, Tcpctl *tcb)
1230 {
1231         Tcppriv *tpriv;
1232
1233         tcb->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1234         tcb->rttseq = tcb->iss;
1235         tcb->snd.wl2 = tcb->iss;
1236         tcb->snd.una = tcb->iss;
1237         tcb->snd.ptr = tcb->rttseq;
1238         tcb->snd.nxt = tcb->rttseq;
1239         tcb->flgcnt++;
1240         tcb->flags |= FORCE;
1241         tcb->sndsyntime = NOW;
1242
1243         /* set desired mss and scale */
1244         tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale);
1245         tpriv = s->p->priv;
1246         tpriv->stats[Mss] = tcb->mss;
1247 }
1248
1249 void
1250 sndrst(Proto *tcp, uchar *source, uchar *dest, ushort length, Tcp *seg, uchar version, char *reason)
1251 {
1252         Block *hbp;
1253         uchar rflags;
1254         Tcppriv *tpriv;
1255         Tcp4hdr ph4;
1256         Tcp6hdr ph6;
1257
1258         netlog(tcp->f, Logtcp, "sndrst: %s\n", reason);
1259
1260         tpriv = tcp->priv;
1261
1262         if(seg->flags & RST)
1263                 return;
1264
1265         /* make pseudo header */
1266         switch(version) {
1267         case V4:
1268                 memset(&ph4, 0, sizeof(ph4));
1269                 ph4.vihl = IP_VER4;
1270                 v6tov4(ph4.tcpsrc, dest);
1271                 v6tov4(ph4.tcpdst, source);
1272                 ph4.proto = IP_TCPPROTO;
1273                 hnputs(ph4.tcplen, TCP4_HDRSIZE);
1274                 hnputs(ph4.tcpsport, seg->dest);
1275                 hnputs(ph4.tcpdport, seg->source);
1276                 break;
1277         case V6:
1278                 memset(&ph6, 0, sizeof(ph6));
1279                 ph6.vcf[0] = IP_VER6;
1280                 ipmove(ph6.tcpsrc, dest);
1281                 ipmove(ph6.tcpdst, source);
1282                 ph6.proto = IP_TCPPROTO;
1283                 hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1284                 hnputs(ph6.tcpsport, seg->dest);
1285                 hnputs(ph6.tcpdport, seg->source);
1286                 break;
1287         default:
1288                 panic("sndrst: version %d", version);
1289         }
1290
1291         tpriv->stats[OutRsts]++;
1292         rflags = RST;
1293
1294         /* convince the other end that this reset is in band */
1295         if(seg->flags & ACK) {
1296                 seg->seq = seg->ack;
1297                 seg->ack = 0;
1298         }
1299         else {
1300                 rflags |= ACK;
1301                 seg->ack = seg->seq;
1302                 seg->seq = 0;
1303                 if(seg->flags & SYN)
1304                         seg->ack++;
1305                 seg->ack += length;
1306                 if(seg->flags & FIN)
1307                         seg->ack++;
1308         }
1309         seg->flags = rflags;
1310         seg->wnd = 0;
1311         seg->urg = 0;
1312         seg->mss = 0;
1313         seg->ws = 0;
1314         switch(version) {
1315         case V4:
1316                 hbp = htontcp4(seg, nil, &ph4, nil);
1317                 if(hbp == nil)
1318                         return;
1319                 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1320                 break;
1321         case V6:
1322                 hbp = htontcp6(seg, nil, &ph6, nil);
1323                 if(hbp == nil)
1324                         return;
1325                 ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1326                 break;
1327         default:
1328                 panic("sndrst2: version %d", version);
1329         }
1330 }
1331
1332 /*
1333  *  send a reset to the remote side and close the conversation
1334  *  called with s qlocked
1335  */
1336 char*
1337 tcphangup(Conv *s)
1338 {
1339         Tcp seg;
1340         Tcpctl *tcb;
1341         Block *hbp;
1342
1343         tcb = (Tcpctl*)s->ptcl;
1344         if(waserror())
1345                 return commonerror();
1346         if(ipcmp(s->raddr, IPnoaddr) != 0) {
1347                 if(!waserror()){
1348                         memset(&seg, 0, sizeof seg);
1349                         seg.flags = RST | ACK;
1350                         seg.ack = tcb->rcv.nxt;
1351                         tcb->rcv.una = 0;
1352                         seg.seq = tcb->snd.ptr;
1353                         seg.wnd = 0;
1354                         seg.urg = 0;
1355                         seg.mss = 0;
1356                         seg.ws = 0;
1357                         switch(s->ipversion) {
1358                         case V4:
1359                                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
1360                                 hbp = htontcp4(&seg, nil, &tcb->protohdr.tcp4hdr, tcb);
1361                                 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
1362                                 break;
1363                         case V6:
1364                                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
1365                                 hbp = htontcp6(&seg, nil, &tcb->protohdr.tcp6hdr, tcb);
1366                                 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
1367                                 break;
1368                         default:
1369                                 panic("tcphangup: version %d", s->ipversion);
1370                         }
1371                         poperror();
1372                 }
1373         }
1374         localclose(s, nil);
1375         poperror();
1376         return nil;
1377 }
1378
1379 /*
1380  *  (re)send a SYN ACK
1381  */
1382 int
1383 sndsynack(Proto *tcp, Limbo *lp)
1384 {
1385         Block *hbp;
1386         Tcp4hdr ph4;
1387         Tcp6hdr ph6;
1388         Tcp seg;
1389         int scale;
1390
1391         /* make pseudo header */
1392         switch(lp->version) {
1393         case V4:
1394                 memset(&ph4, 0, sizeof(ph4));
1395                 ph4.vihl = IP_VER4;
1396                 v6tov4(ph4.tcpsrc, lp->laddr);
1397                 v6tov4(ph4.tcpdst, lp->raddr);
1398                 ph4.proto = IP_TCPPROTO;
1399                 hnputs(ph4.tcplen, TCP4_HDRSIZE);
1400                 hnputs(ph4.tcpsport, lp->lport);
1401                 hnputs(ph4.tcpdport, lp->rport);
1402                 break;
1403         case V6:
1404                 memset(&ph6, 0, sizeof(ph6));
1405                 ph6.vcf[0] = IP_VER6;
1406                 ipmove(ph6.tcpsrc, lp->laddr);
1407                 ipmove(ph6.tcpdst, lp->raddr);
1408                 ph6.proto = IP_TCPPROTO;
1409                 hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1410                 hnputs(ph6.tcpsport, lp->lport);
1411                 hnputs(ph6.tcpdport, lp->rport);
1412                 break;
1413         default:
1414                 panic("sndrst: version %d", lp->version);
1415         }
1416
1417         memset(&seg, 0, sizeof seg);
1418         seg.seq = lp->iss;
1419         seg.ack = lp->irs+1;
1420         seg.flags = SYN|ACK;
1421         seg.urg = 0;
1422         seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale);
1423 //      if (seg.mss > lp->mss && lp->mss >= 512)
1424 //              seg.mss = lp->mss;
1425         seg.wnd = QMAX;
1426
1427         /* if the other side set scale, we should too */
1428         if(lp->rcvscale){
1429                 seg.ws = scale;
1430                 lp->sndscale = scale;
1431         } else {
1432                 seg.ws = 0;
1433                 lp->sndscale = 0;
1434         }
1435
1436         switch(lp->version) {
1437         case V4:
1438                 hbp = htontcp4(&seg, nil, &ph4, nil);
1439                 if(hbp == nil)
1440                         return -1;
1441                 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1442                 break;
1443         case V6:
1444                 hbp = htontcp6(&seg, nil, &ph6, nil);
1445                 if(hbp == nil)
1446                         return -1;
1447                 ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1448                 break;
1449         default:
1450                 panic("sndsnack: version %d", lp->version);
1451         }
1452         lp->lastsend = NOW;
1453         return 0;
1454 }
1455
/*
 *  hash for the limbo table: the last two bytes of address a plus p,
 *  reduced with LHTMASK.  every parameter is parenthesized so any
 *  expression may be passed.
 */
#define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + (p) )&LHTMASK )
1457
1458 /*
1459  *  put a call into limbo and respond with a SYN ACK
1460  *
1461  *  called with proto locked
1462  */
1463 static void
1464 limbo(Conv *s, uchar *source, uchar *dest, Tcp *seg, int version)
1465 {
1466         Limbo *lp, **l;
1467         Tcppriv *tpriv;
1468         int h;
1469
1470         tpriv = s->p->priv;
1471         h = hashipa(source, seg->source);
1472
1473         for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1474                 lp = *l;
1475                 if(lp->lport != seg->dest || lp->rport != seg->source || lp->version != version)
1476                         continue;
1477                 if(ipcmp(lp->raddr, source) != 0)
1478                         continue;
1479                 if(ipcmp(lp->laddr, dest) != 0)
1480                         continue;
1481
1482                 /* each new SYN restarts the retransmits */
1483                 lp->irs = seg->seq;
1484                 break;
1485         }
1486         lp = *l;
1487         if(lp == nil){
1488                 if(tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]){
1489                         lp = tpriv->lht[h];
1490                         tpriv->lht[h] = lp->next;
1491                         lp->next = nil;
1492                 } else {
1493                         lp = malloc(sizeof(*lp));
1494                         if(lp == nil)
1495                                 return;
1496                         tpriv->nlimbo++;
1497                 }
1498                 *l = lp;
1499                 lp->version = version;
1500                 ipmove(lp->laddr, dest);
1501                 ipmove(lp->raddr, source);
1502                 lp->lport = seg->dest;
1503                 lp->rport = seg->source;
1504                 lp->mss = seg->mss;
1505                 lp->rcvscale = seg->ws;
1506                 lp->irs = seg->seq;
1507                 lp->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1508         }
1509
1510         if(sndsynack(s->p, lp) < 0){
1511                 *l = lp->next;
1512                 tpriv->nlimbo--;
1513                 free(lp);
1514         }
1515 }
1516
1517 /*
1518  *  resend SYN ACK's once every SYNACK_RXTIMER ms.
1519  */
1520 static void
1521 limborexmit(Proto *tcp)
1522 {
1523         Tcppriv *tpriv;
1524         Limbo **l, *lp;
1525         int h;
1526         int seen;
1527         ulong now;
1528
1529         tpriv = tcp->priv;
1530
1531         if(!canqlock(tcp))
1532                 return;
1533         seen = 0;
1534         now = NOW;
1535         for(h = 0; h < NLHT && seen < tpriv->nlimbo; h++){
1536                 for(l = &tpriv->lht[h]; *l != nil && seen < tpriv->nlimbo; ){
1537                         lp = *l;
1538                         seen++;
1539                         if(now - lp->lastsend < (lp->rexmits+1)*SYNACK_RXTIMER)
1540                                 continue;
1541
1542                         /* time it out after 1 second */
1543                         if(++(lp->rexmits) > 5){
1544                                 tpriv->nlimbo--;
1545                                 *l = lp->next;
1546                                 free(lp);
1547                                 continue;
1548                         }
1549
1550                         /* if we're being attacked, don't bother resending SYN ACK's */
1551                         if(tpriv->nlimbo > 100)
1552                                 continue;
1553
1554                         if(sndsynack(tcp, lp) < 0){
1555                                 tpriv->nlimbo--;
1556                                 *l = lp->next;
1557                                 free(lp);
1558                                 continue;
1559                         }
1560
1561                         l = &lp->next;
1562                 }
1563         }
1564         qunlock(tcp);
1565 }
1566
1567 /*
1568  *  lookup call in limbo.  if found, throw it out.
1569  *
1570  *  called with proto locked
1571  */
1572 static void
1573 limborst(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1574 {
1575         Limbo *lp, **l;
1576         int h;
1577         Tcppriv *tpriv;
1578
1579         tpriv = s->p->priv;
1580
1581         /* find a call in limbo */
1582         h = hashipa(src, segp->source);
1583         for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1584                 lp = *l;
1585                 if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1586                         continue;
1587                 if(ipcmp(lp->laddr, dst) != 0)
1588                         continue;
1589                 if(ipcmp(lp->raddr, src) != 0)
1590                         continue;
1591
1592                 /* RST can only follow the SYN */
1593                 if(segp->seq == lp->irs+1){
1594                         tpriv->nlimbo--;
1595                         *l = lp->next;
1596                         free(lp);
1597                 }
1598                 break;
1599         }
1600 }
1601
1602 /*
1603  *  come here when we finally get an ACK to our SYN-ACK.
1604  *  lookup call in limbo.  if found, create a new conversation
1605  *
1606  *  called with proto locked
1607  */
1608 static Conv*
1609 tcpincoming(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1610 {
1611         Conv *new;
1612         Tcpctl *tcb;
1613         Tcppriv *tpriv;
1614         Tcp4hdr *h4;
1615         Tcp6hdr *h6;
1616         Limbo *lp, **l;
1617         int h;
1618
1619         /* unless it's just an ack, it can't be someone coming out of limbo */
1620         if((segp->flags & SYN) || (segp->flags & ACK) == 0)
1621                 return nil;
1622
1623         tpriv = s->p->priv;
1624
1625         /* find a call in limbo */
1626         h = hashipa(src, segp->source);
1627         for(l = &tpriv->lht[h]; (lp = *l) != nil; l = &lp->next){
1628                 netlog(s->p->f, Logtcp, "tcpincoming s %I!%ud/%I!%ud d %I!%ud/%I!%ud v %d/%d\n",
1629                         src, segp->source, lp->raddr, lp->rport,
1630                         dst, segp->dest, lp->laddr, lp->lport,
1631                         version, lp->version
1632                 );
1633
1634                 if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1635                         continue;
1636                 if(ipcmp(lp->laddr, dst) != 0)
1637                         continue;
1638                 if(ipcmp(lp->raddr, src) != 0)
1639                         continue;
1640
1641                 /* we're assuming no data with the initial SYN */
1642                 if(segp->seq != lp->irs+1 || segp->ack != lp->iss+1){
1643                         netlog(s->p->f, Logtcp, "tcpincoming s %lux/%lux a %lux %lux\n",
1644                                 segp->seq, lp->irs+1, segp->ack, lp->iss+1);
1645                         lp = nil;
1646                 } else {
1647                         tpriv->nlimbo--;
1648                         *l = lp->next;
1649                 }
1650                 break;
1651         }
1652         if(lp == nil)
1653                 return nil;
1654
1655         new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
1656         if(new == nil)
1657                 return nil;
1658
1659         memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
1660         tcb = (Tcpctl*)new->ptcl;
1661         tcb->flags &= ~CLONE;
1662         tcb->timer.arg = new;
1663         tcb->timer.state = TcptimerOFF;
1664         tcb->acktimer.arg = new;
1665         tcb->acktimer.state = TcptimerOFF;
1666         tcb->katimer.arg = new;
1667         tcb->katimer.state = TcptimerOFF;
1668         tcb->rtt_timer.arg = new;
1669         tcb->rtt_timer.state = TcptimerOFF;
1670
1671         tcb->irs = lp->irs;
1672         tcb->rcv.nxt = tcb->irs+1;
1673         tcb->rcv.urg = tcb->rcv.nxt;
1674
1675         tcb->iss = lp->iss;
1676         tcb->rttseq = tcb->iss;
1677         tcb->snd.wl2 = tcb->iss;
1678         tcb->snd.una = tcb->iss+1;
1679         tcb->snd.ptr = tcb->iss+1;
1680         tcb->snd.nxt = tcb->iss+1;
1681         tcb->flgcnt = 0;
1682         tcb->flags |= SYNACK;
1683
1684         /* our sending max segment size cannot be bigger than what he asked for */
1685         if(lp->mss != 0 && lp->mss < tcb->mss) {
1686                 tcb->mss = lp->mss;
1687                 tpriv->stats[Mss] = tcb->mss;
1688         }
1689
1690         /* window scaling */
1691         tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
1692
1693         /* the congestion window always starts out as a single segment */
1694         tcb->snd.wnd = segp->wnd;
1695         tcb->cwind = tcb->mss;
1696
1697         /* set initial round trip time */
1698         tcb->sndsyntime = lp->lastsend+lp->rexmits*SYNACK_RXTIMER;
1699         tcpsynackrtt(new);
1700
1701         free(lp);
1702
1703         /* set up proto header */
1704         switch(version){
1705         case V4:
1706                 h4 = &tcb->protohdr.tcp4hdr;
1707                 memset(h4, 0, sizeof(*h4));
1708                 h4->proto = IP_TCPPROTO;
1709                 hnputs(h4->tcpsport, new->lport);
1710                 hnputs(h4->tcpdport, new->rport);
1711                 v6tov4(h4->tcpsrc, dst);
1712                 v6tov4(h4->tcpdst, src);
1713                 break;
1714         case V6:
1715                 h6 = &tcb->protohdr.tcp6hdr;
1716                 memset(h6, 0, sizeof(*h6));
1717                 h6->proto = IP_TCPPROTO;
1718                 hnputs(h6->tcpsport, new->lport);
1719                 hnputs(h6->tcpdport, new->rport);
1720                 ipmove(h6->tcpsrc, dst);
1721                 ipmove(h6->tcpdst, src);
1722                 break;
1723         default:
1724                 panic("tcpincoming: version %d", new->ipversion);
1725         }
1726
1727         tcpsetstate(new, Established);
1728
1729         iphtadd(&tpriv->ht, new);
1730
1731         return new;
1732 }
1733
/*
 *  Return 1 if x lies in the inclusive range [low, high], 0
 *  otherwise.  When low > high the range is taken to wrap around,
 *  i.e. x is inside if it is at or above low or at or below high.
 */
int
seq_within(ulong x, ulong low, ulong high)
{
	if(low > high)
		return x >= low || x <= high;
	return low <= x && x <= high;
}
1747
/*
 *  Return non-zero if x is before y, judged by the sign of the
 *  difference x-y taken as an int.
 */
int
seq_lt(ulong x, ulong y)
{
	int diff;

	diff = (int)(x-y);
	return diff < 0;
}
1753
/*
 *  Return non-zero if x is before or equal to y, judged by the
 *  sign of the difference x-y taken as an int.
 */
int
seq_le(ulong x, ulong y)
{
	int diff;

	diff = (int)(x-y);
	return diff <= 0;
}
1759
/*
 *  Return non-zero if x is after y, judged by the sign of the
 *  difference x-y taken as an int.
 */
int
seq_gt(ulong x, ulong y)
{
	int diff;

	diff = (int)(x-y);
	return diff > 0;
}
1765
/*
 *  Return non-zero if x is after or equal to y, judged by the
 *  sign of the difference x-y taken as an int.
 */
int
seq_ge(ulong x, ulong y)
{
	int diff;

	diff = (int)(x-y);
	return diff >= 0;
}
1771
1772 /*
1773  *  use the time between the first SYN and it's ack as the
1774  *  initial round trip time
1775  */
1776 void
1777 tcpsynackrtt(Conv *s)
1778 {
1779         Tcpctl *tcb;
1780         int delta;
1781         Tcppriv *tpriv;
1782
1783         tcb = (Tcpctl*)s->ptcl;
1784         tpriv = s->p->priv;
1785
1786         delta = NOW - tcb->sndsyntime;
1787         tcb->srtt = delta<<LOGAGAIN;
1788         tcb->mdev = delta<<LOGDGAIN;
1789
1790         /* halt round trip timer */
1791         tcphalt(tpriv, &tcb->rtt_timer);
1792 }
1793
1794 void
1795 update(Conv *s, Tcp *seg)
1796 {
1797         int rtt, delta;
1798         Tcpctl *tcb;
1799         ulong acked;
1800         ulong expand;
1801         Tcppriv *tpriv;
1802
1803         tpriv = s->p->priv;
1804         tcb = (Tcpctl*)s->ptcl;
1805
1806         /* if everything has been acked, force output(?) */
1807         if(seq_gt(seg->ack, tcb->snd.nxt)) {
1808                 tcb->flags |= FORCE;
1809                 return;
1810         }
1811
1812         /* added by Dong Lin for fast retransmission */
1813         if(seg->ack == tcb->snd.una
1814         && tcb->snd.una != tcb->snd.nxt
1815         && seg->len == 0
1816         && seg->wnd == tcb->snd.wnd) {
1817
1818                 /* this is a pure ack w/o window update */
1819                 netlog(s->p->f, Logtcprxmt, "dupack %lud ack %lud sndwnd %lud advwin %lud\n",
1820                         tcb->snd.dupacks, seg->ack, tcb->snd.wnd, seg->wnd);
1821
1822                 if(++tcb->snd.dupacks == TCPREXMTTHRESH) {
1823                         /*
1824                          *  tahoe tcp rxt the packet, half sshthresh,
1825                          *  and set cwnd to one packet
1826                          */
1827                         tcb->snd.recovery = 1;
1828                         tcb->snd.rxt = tcb->snd.nxt;
1829                         netlog(s->p->f, Logtcprxmt, "fast rxt %lud, nxt %lud\n", tcb->snd.una, tcb->snd.nxt);
1830                         tcprxmit(s);
1831                 } else {
1832                         /* do reno tcp here. */
1833                 }
1834         }
1835
1836         /*
1837          *  update window
1838          */
1839         if(seq_gt(seg->ack, tcb->snd.wl2)
1840         || (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)){
1841                 tcb->snd.wnd = seg->wnd;
1842                 tcb->snd.wl2 = seg->ack;
1843         }
1844
1845         if(!seq_gt(seg->ack, tcb->snd.una)){
1846                 /*
1847                  *  don't let us hangup if sending into a closed window and
1848                  *  we're still getting acks
1849                  */
1850                 if((tcb->flags&RETRAN) && tcb->snd.wnd == 0){
1851                         tcb->backedoff = MAXBACKMS/4;
1852                 }
1853                 return;
1854         }
1855
1856         /*
1857          *  any positive ack turns off fast rxt,
1858          *  (should we do new-reno on partial acks?)
1859          */
1860         if(!tcb->snd.recovery || seq_ge(seg->ack, tcb->snd.rxt)) {
1861                 tcb->snd.dupacks = 0;
1862                 tcb->snd.recovery = 0;
1863         } else
1864                 netlog(s->p->f, Logtcp, "rxt next %lud, cwin %lud\n", seg->ack, tcb->cwind);
1865
1866         /* Compute the new send window size */
1867         acked = seg->ack - tcb->snd.una;
1868
1869         /* avoid slow start and timers for SYN acks */
1870         if((tcb->flags & SYNACK) == 0) {
1871                 tcb->flags |= SYNACK;
1872                 acked--;
1873                 tcb->flgcnt--;
1874                 goto done;
1875         }
1876
1877         /* slow start as long as we're not recovering from lost packets */
1878         if(tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) {
1879                 if(tcb->cwind < tcb->ssthresh) {
1880                         expand = tcb->mss;
1881                         if(acked < expand)
1882                                 expand = acked;
1883                 }
1884                 else
1885                         expand = ((int)tcb->mss * tcb->mss) / tcb->cwind;
1886
1887                 if(tcb->cwind + expand < tcb->cwind)
1888                         expand = tcb->snd.wnd - tcb->cwind;
1889                 if(tcb->cwind + expand > tcb->snd.wnd)
1890                         expand = tcb->snd.wnd - tcb->cwind;
1891                 tcb->cwind += expand;
1892         }
1893
1894         /* Adjust the timers according to the round trip time */
1895         if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
1896                 tcphalt(tpriv, &tcb->rtt_timer);
1897                 if((tcb->flags&RETRAN) == 0) {
1898                         tcb->backoff = 0;
1899                         tcb->backedoff = 0;
1900                         rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
1901                         if(rtt == 0)
1902                                 rtt = 1;        /* otherwise all close systems will rexmit in 0 time */
1903                         rtt *= MSPTICK;
1904                         if(tcb->srtt == 0) {
1905                                 tcb->srtt = rtt << LOGAGAIN;
1906                                 tcb->mdev = rtt << LOGDGAIN;
1907                         } else {
1908                                 delta = rtt - (tcb->srtt>>LOGAGAIN);
1909                                 tcb->srtt += delta;
1910                                 if(tcb->srtt <= 0)
1911                                         tcb->srtt = 1;
1912
1913                                 delta = abs(delta) - (tcb->mdev>>LOGDGAIN);
1914                                 tcb->mdev += delta;
1915                                 if(tcb->mdev <= 0)
1916                                         tcb->mdev = 1;
1917                         }
1918                         tcpsettimer(tcb);
1919                 }
1920         }
1921
1922 done:
1923         if(qdiscard(s->wq, acked) < acked)
1924                 tcb->flgcnt--;
1925
1926         tcb->snd.una = seg->ack;
1927         if(seq_gt(seg->ack, tcb->snd.urg))
1928                 tcb->snd.urg = seg->ack;
1929
1930         if(tcb->snd.una != tcb->snd.nxt)
1931                 tcpgo(tpriv, &tcb->timer);
1932         else
1933                 tcphalt(tpriv, &tcb->timer);
1934
1935         if(seq_lt(tcb->snd.ptr, tcb->snd.una))
1936                 tcb->snd.ptr = tcb->snd.una;
1937
1938         tcb->flags &= ~RETRAN;
1939         tcb->backoff = 0;
1940         tcb->backedoff = 0;
1941 }
1942
/*
 *  tcpiput: handle one received TCP segment carried in bp.
 *
 *  tcp is the protocol instance the segment arrived for; the
 *  interface argument is unnamed and unused.  The work is done
 *  in three stages:
 *   1. pick IPv4 or IPv6 from the version nibble, verify the
 *      checksum, convert the header into seg with ntohtcp4 or
 *      ntohtcp6, and trim bp with trimblock;
 *   2. with the protocol locked, find the conversation with
 *      iphtlook; a Listen conversation passes RSTs to limborst,
 *      SYNs without ACK to limbo, and everything else to
 *      tcpincoming;
 *   3. with the conversation locked, run the per-state input
 *      processing and finish with tcpoutput (label output) or
 *      tcpkick (label raise).
 *
 *  Segments with no matching conversation are answered with sndrst.
 *
 *  NOTE(review): the hdrlen < 0 and trimblock failure paths
 *  return without freeing bp, so ntohtcp4, ntohtcp6 and trimblock
 *  presumably release the block on failure; confirm.
 */
1943 void
1944 tcpiput(Proto *tcp, Ipifc*, Block *bp)
1945 {
1946         Tcp seg;
1947         Tcp4hdr *h4;
1948         Tcp6hdr *h6;
1949         int hdrlen;
1950         Tcpctl *tcb;
1951         ushort length, csum;
1952         uchar source[IPaddrlen], dest[IPaddrlen];
1953         Conv *s;
1954         Fs *f;
1955         Tcppriv *tpriv;
1956         uchar version;
1957
1958         f = tcp->f;
1959         tpriv = tcp->priv;
1960
1961         tpriv->stats[InSegs]++;
1962
             /* both header views alias the start of the block; the version test below picks one */
1963         h4 = (Tcp4hdr*)(bp->rp);
1964         h6 = (Tcp6hdr*)(bp->rp);
1965         memset(&seg, 0, sizeof seg);
1966
1967         if((h4->vihl&0xF0)==IP_VER4) {
1968                 version = V4;
1969                 length = nhgets(h4->length);
1970                 v4tov6(dest, h4->tcpdst);
1971                 v4tov6(source, h4->tcpsrc);
1972
                     /*
                      *  checksum verification is skipped when the block has
                      *  Btcpck set or when the checksum field is zero.
                      *  NOTE(review): Unused and tcplen look like pseudo header
                      *  fields prepared for ptclcsum; confirm against Tcp4hdr.
                      */
1973                 h4->Unused = 0;
1974                 hnputs(h4->tcplen, length-TCP4_PKT);
1975                 if(!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
1976                         ptclcsum(bp, TCP4_IPLEN, length-TCP4_IPLEN)) {
1977                         tpriv->stats[CsumErrs]++;
1978                         tpriv->stats[InErrs]++;
1979                         netlog(f, Logtcp, "bad tcp proto cksum\n");
1980                         freeblist(bp);
1981                         return;
1982                 }
1983
1984                 hdrlen = ntohtcp4(&seg, &bp);
1985                 if(hdrlen < 0){
1986                         tpriv->stats[HlenErrs]++;
1987                         tpriv->stats[InErrs]++;
1988                         netlog(f, Logtcp, "bad tcp hdr len\n");
1989                         return;
1990                 }
1991
1992                 /* trim the packet to the size claimed by the datagram */
                     /* length becomes what is left of the datagram after TCP4_PKT and hdrlen bytes */
1993                 length -= hdrlen+TCP4_PKT;
1994                 bp = trimblock(bp, hdrlen+TCP4_PKT, length);
1995                 if(bp == nil){
1996                         tpriv->stats[LenErrs]++;
1997                         tpriv->stats[InErrs]++;
1998                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
1999                         return;
2000                 }
2001         }
2002         else {
                     /* saved so ttl and proto can be put back after the checksum */
2003                 int ttl = h6->ttl;
2004                 int proto = h6->proto;
2005
2006                 version = V6;
2007                 length = nhgets(h6->ploadlen);
2008                 ipmove(dest, h6->tcpdst);
2009                 ipmove(source, h6->tcpsrc);
2010
                     /*
                      *  the header is rewritten in place for the checksum:
                      *  ploadlen and proto are zeroed, proto goes into ttl and
                      *  length into vcf.  ttl, proto and ploadlen are restored
                      *  after a good checksum; vcf is left overwritten.
                      *  Unlike the v4 path, Btcpck is not consulted here.
                      */
2011                 h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
2012                 h6->ttl = proto;
2013                 hnputl(h6->vcf, length);
2014                 if((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
2015                     (csum = ptclcsum(bp, TCP6_IPLEN, length+TCP6_PHDRSIZE)) != 0) {
2016                         tpriv->stats[CsumErrs]++;
2017                         tpriv->stats[InErrs]++;
2018                         netlog(f, Logtcp,
2019                             "bad tcpv6 proto cksum: got %#ux, computed %#ux\n",
2020                                 h6->tcpcksum[0]<<8 | h6->tcpcksum[1], csum);
2021                         freeblist(bp);
2022                         return;
2023                 }
2024                 h6->ttl = ttl;
2025                 h6->proto = proto;
2026                 hnputs(h6->ploadlen, length);
2027
2028                 hdrlen = ntohtcp6(&seg, &bp);
2029                 if(hdrlen < 0){
2030                         tpriv->stats[HlenErrs]++;
2031                         tpriv->stats[InErrs]++;
2032                         netlog(f, Logtcp, "bad tcpv6 hdr len\n");
2033                         return;
2034                 }
2035
2036                 /* trim the packet to the size claimed by the datagram */
                     /* length came from ploadlen here, so only hdrlen is subtracted */
2037                 length -= hdrlen;
2038                 bp = trimblock(bp, hdrlen+TCP6_PKT, length);
2039                 if(bp == nil){
2040                         tpriv->stats[LenErrs]++;
2041                         tpriv->stats[InErrs]++;
2042                         netlog(f, Logtcp, "tcpv6 len < 0 after trim\n");
2043                         return;
2044                 }
2045         }
2046
2047         /* lock protocol while searching for a conversation */
2048         qlock(tcp);
2049
2050         /* Look for a matching conversation */
2051         s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2052         if(s == nil){
2053                 netlog(f, Logtcp, "iphtlook(src %I!%d, dst %I!%d) failed\n",
2054                         source, seg.source, dest, seg.dest);
             /* reset: also entered by goto when tcpincoming fails below; tcp is still locked on entry */
2055 reset:
2056                 qunlock(tcp);
2057                 sndrst(tcp, source, dest, length, &seg, version, "no conversation");
2058                 freeblist(bp);
2059                 return;
2060         }
2061
2062         /* if it's a listener, look for the right flags and get a new conv */
2063         tcb = (Tcpctl*)s->ptcl;
2064         if(tcb->state == Listen){
2065                 if(seg.flags & RST){
2066                         limborst(s, &seg, source, dest, version);
2067                         qunlock(tcp);
2068                         freeblist(bp);
2069                         return;
2070                 }
2071
2072                 /* if this is a new SYN, put the call into limbo */
2073                 if((seg.flags & SYN) && (seg.flags & ACK) == 0){
2074                         limbo(s, source, dest, &seg, version);
2075                         qunlock(tcp);
2076                         freeblist(bp);
2077                         return;
2078                 }
2079
2080                 /*
2081                  *  if there's a matching call in limbo, tcpincoming will
2082                  *  return it in state Syn_received
2083                  */
2084                 s = tcpincoming(s, &seg, source, dest, version);
2085                 if(s == nil)
2086                         goto reset;
2087         }
2088
2089         /* The rest of the input state machine is run with the control block
2090          * locked and implements the state machine directly out of the RFC.
2091          * Out-of-band data is ignored - it was always a bad idea.
2092          */
             /* tcb is reloaded because s may have been replaced by tcpincoming above */
2093         tcb = (Tcpctl*)s->ptcl;
             /* on error: release the conversation lock and pass the error on */
2094         if(waserror()){
2095                 qunlock(s);
2096                 nexterror();
2097         }
             /* take the conversation lock before letting go of the protocol lock */
2098         qlock(s);
2099         qunlock(tcp);
2100
2101         /* fix up window */
2102         seg.wnd <<= tcb->rcv.scale;
2103
2104         /* every input packet in puts off the keep alive time out */
2105         tcpsetkacounter(tcb);
2106
2107         switch(tcb->state) {
2108         case Closed:
2109                 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2110                 goto raise;
2111         case Syn_sent:
                     /* an ACK outside iss+1 .. snd.nxt is answered with a reset */
2112                 if(seg.flags & ACK) {
2113                         if(!seq_within(seg.ack, tcb->iss+1, tcb->snd.nxt)) {
2114                                 sndrst(tcp, source, dest, length, &seg, version,
2115                                          "bad seq in Syn_sent");
2116                                 goto raise;
2117                         }
2118                 }
                     /* a RST closes the conversation only when it also carries an ACK */
2119                 if(seg.flags & RST) {
2120                         if(seg.flags & ACK)
2121                                 localclose(s, Econrefused);
2122                         goto raise;
2123                 }
2124
                     /* SYN with ACK moves to Established; SYN alone moves to Syn_received */
2125                 if(seg.flags & SYN) {
2126                         procsyn(s, &seg);
2127                         if(seg.flags & ACK){
2128                                 update(s, &seg);
2129                                 tcpsynackrtt(s);
2130                                 tcpsetstate(s, Established);
2131                                 tcpsetscale(s, tcb, seg.ws, tcb->scale);
2132                         }
2133                         else {
2134                                 tcb->time = NOW;
2135                                 tcpsetstate(s, Syn_received);   /* DLP - shouldn't this be a reset? */
2136                         }
2137
                             /* data or FIN in the same segment goes on to the general processing below */
2138                         if(length != 0 || (seg.flags & FIN))
2139                                 break;
2140
2141                         freeblist(bp);
2142                         goto output;
2143                 }
2144                 else
2145                         freeblist(bp);
2146
                     /* neither RST nor SYN: the segment has been dropped, nothing is sent */
2147                 qunlock(s);
2148                 poperror();
2149                 return;
2150         case Syn_received:
2151                 /* doesn't matter if it's the correct ack, we're just trying to set timing */
2152                 if(seg.flags & ACK)
2153                         tcpsynackrtt(s);
2154                 break;
2155         }
2156
2157         /*
2158          *  One DOS attack is to open connections to us and then forget about them,
2159          *  thereby tying up a conv at no long term cost to the attacker.
2160          *  This is an attempt to defeat these stateless DOS attacks.  See
2161          *  corresponding code in tcpsendka().
2162          */
             /* NOTE(review): after localclose here, processing of the segment still continues below */
2163         if(tcb->state != Syn_received && (seg.flags & RST) == 0){
2164                 if(tcpporthogdefense
2165                 && seq_within(seg.ack, tcb->snd.una-(1<<31), tcb->snd.una-(1<<29))){
2166                         print("stateless hog %I.%d->%I.%d f %ux %lux - %lux - %lux\n",
2167                                 source, seg.source, dest, seg.dest, seg.flags,
2168                                 tcb->snd.una-(1<<31), seg.ack, tcb->snd.una-(1<<29));
2169                         localclose(s, "stateless hog");
2170                 }
2171         }
2172
2173         /* Cut the data to fit the receive window */
             /*
              *  when tcptrim returns -1 the ack field is still processed
              *  with update, and an ack is forced out unless the segment
              *  was a RST.
              *  NOTE(review): bp is not freed on this path, so tcptrim
              *  presumably frees it when it returns -1; confirm.
              */
2174         if(tcptrim(tcb, &seg, &bp, &length) == -1) {
2175                 netlog(f, Logtcp, "tcptrim, not accept, seq %lud-%lud win %lud-%lud from %I\n", 
2176                         seg.seq, seg.seq + length - 1, 
2177                         tcb->rcv.nxt, tcb->rcv.nxt + tcb->rcv.wnd-1, s->raddr);
2178                 netlog(f, Logtcp, "tcp len < 0, %lud %d\n", seg.seq, length);
2179                 update(s, &seg);
2180                 if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state == Closing) {
2181                         tcphalt(tpriv, &tcb->rtt_timer);
2182                         tcphalt(tpriv, &tcb->acktimer);
2183                         tcphalt(tpriv, &tcb->katimer);
2184                         tcpsetstate(s, Time_wait);
2185                         tcb->timer.start = MSL2*(1000 / MSPTICK);
2186                         tcpgo(tpriv, &tcb->timer);
2187                 }
2188                 if(!(seg.flags & RST)) {
2189                         tcb->flags |= FORCE;
2190                         goto output;
2191                 }
2192                 qunlock(s);
2193                 poperror();
2194                 return;
2195         }
2196
2197         /* Cannot accept so answer with a rst */
2198         if(length && tcb->state == Closed) {
2199                 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2200                 goto raise;
2201         }
2202
2203         /* The segment is beyond the current receive pointer so
2204          * queue the data in the resequence queue
2205          */
             /* the two ifs nest: only segments with data, SYN or FIN are queued */
2206         if(seg.seq != tcb->rcv.nxt)
2207         if(length != 0 || (seg.flags & (SYN|FIN))) {
2208                 update(s, &seg);
2209                 if(addreseq(tcb, tpriv, &seg, bp, length) < 0)
2210                         print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr, s->lport);
2211                 tcb->flags |= FORCE;
2212                 goto output;
2213         }
2214
2215         /*
2216          *  keep looping till we've processed this packet plus any
2217          *  adjacent packets in the resequence queue
2218          */
2219         for(;;) {
2220                 if(seg.flags & RST) {
2221                         if(tcb->state == Established) {
2222                                 tpriv->stats[EstabResets]++;
2223                                 if(tcb->rcv.nxt != seg.seq)
2224                                         print("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt %lux seq %lux\n", s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt, seg.seq);
2225                         }
2226                         localclose(s, Econrefused);
2227                         goto raise;
2228                 }
2229
                     /* segments without ACK are dropped from here on */
2230                 if((seg.flags&ACK) == 0)
2231                         goto raise;
2232
2233                 switch(tcb->state) {
2234                 case Syn_received:
2235                         if(!seq_within(seg.ack, tcb->snd.una+1, tcb->snd.nxt)){
2236                                 sndrst(tcp, source, dest, length, &seg, version,
2237                                         "bad seq in Syn_received");
2238                                 goto raise;
2239                         }
2240                         update(s, &seg);
2241                         tcpsetstate(s, Established);
                             /* no break: falls into Established, so update runs a second time */
2242                 case Established:
2243                 case Close_wait:
2244                         update(s, &seg);
2245                         break;
2246                 case Finwait1:
2247                         update(s, &seg);
                             /* nothing left queued or unacked: move on to Finwait2 and arm katimer */
2248                         if(qlen(s->wq)+tcb->flgcnt == 0){
2249                                 tcphalt(tpriv, &tcb->rtt_timer);
2250                                 tcphalt(tpriv, &tcb->acktimer);
2251                                 tcpsetkacounter(tcb);
2252                                 tcb->time = NOW;
2253                                 tcpsetstate(s, Finwait2);
2254                                 tcb->katimer.start = MSL2 * (1000 / MSPTICK);
2255                                 tcpgo(tpriv, &tcb->katimer);
2256                         }
2257                         break;
2258                 case Finwait2:
2259                         update(s, &seg);
2260                         break;
2261                 case Closing:
2262                         update(s, &seg);
2263                         if(qlen(s->wq)+tcb->flgcnt == 0) {
2264                                 tcphalt(tpriv, &tcb->rtt_timer);
2265                                 tcphalt(tpriv, &tcb->acktimer);
2266                                 tcphalt(tpriv, &tcb->katimer);
2267                                 tcpsetstate(s, Time_wait);
2268                                 tcb->timer.start = MSL2*(1000 / MSPTICK);
2269                                 tcpgo(tpriv, &tcb->timer);
2270                         }
2271                         break;
2272                 case Last_ack:
2273                         update(s, &seg);
2274                         if(qlen(s->wq)+tcb->flgcnt == 0) {
2275                                 localclose(s, nil);
2276                                 goto raise;
2277                         }
                             /* no break: falls into Time_wait when something is still outstanding */
2278                 case Time_wait:
2279                         tcb->flags |= FORCE;
2280                         if(tcb->timer.state != TcptimerON)
2281                                 tcpgo(tpriv, &tcb->timer);
2282                 }
2283
                     /*
                      *  track the urgent pointer in rcv.urg.
                      *  NOTE(review): pullblock presumably discards the seg.urg
                      *  leading bytes; length is not adjusted for it.
                      */
2284                 if((seg.flags&URG) && seg.urg) {
2285                         if(seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
2286                                 tcb->rcv.urg = seg.urg + seg.seq;
2287                                 pullblock(&bp, seg.urg);
2288                         }
2289                 }
2290                 else
2291                 if(seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
2292                         tcb->rcv.urg = tcb->rcv.nxt;
2293
                     /*
                      *  bp is freed here without being set to nil; the rest of
                      *  this pass either goes to output or replaces bp through
                      *  getreseq (presumably; confirm) before it is used again.
                      */
2294                 if(length == 0) {
2295                         if(bp != nil)
2296                                 freeblist(bp);
2297                 }
2298                 else {
2299                         switch(tcb->state){
2300                         default:
2301                                 /* Ignore segment text */
2302                                 if(bp != nil)
2303                                         freeblist(bp);
2304                                 break;
2305
2306                         case Syn_received:
2307                         case Established:
2308                         case Finwait1:
2309                                 /* If we still have some data place on
2310                                  * receive queue
2311                                  */
2312                                 if(bp) {
2313                                         bp = packblock(bp);
2314                                         if(bp == nil)
2315                                                 panic("tcp packblock");
2316                                         qpassnolim(s->rq, bp);
2317                                         bp = nil;
2318
2319                                         /*
2320                                          *  Force an ack every 2 data messages.  This is
2321                                          *  a hack for rob to make his home system run
2322                                          *  faster.
2323                                          *
2324                                          *  this also keeps the standard TCP congestion
2325                                          *  control working since it needs an ack every
2326                                          *  2 max segs worth.  This is not quite that,
2327                                          *  but under a real stream is equivalent since
2328                                          *  every packet has a max seg in it.
2329                                          */
                                             /* rcv.una counts data messages here; tcpoutput resets it to 0 */
2330                                         if(++(tcb->rcv.una) >= 2)
2331                                                 tcb->flags |= FORCE;
2332                                 }
2333                                 tcb->rcv.nxt += length;
2334
2335                                 /*
2336                                  *  update our rcv window
2337                                  */
2338                                 tcprcvwin(s);
2339
2340                                 /*
2341                                  *  turn on the acktimer if there's something
2342                                  *  to ack
2343                                  */
2344                                 if(tcb->acktimer.state != TcptimerON)
2345                                         tcpgo(tpriv, &tcb->acktimer);
2346
2347                                 break;
2348                         case Finwait2:
2349                                 /* no process to read the data, send a reset */
2350                                 if(bp != nil)
2351                                         freeblist(bp);
2352                                 sndrst(tcp, source, dest, length, &seg, version,
2353                                         "send to Finwait2");
2354                                 qunlock(s);
2355                                 poperror();
2356                                 return;
2357                         }
2358                 }
2359
                     /* a FIN always forces an ack; where it is accepted it advances rcv.nxt by one */
2360                 if(seg.flags & FIN) {
2361                         tcb->flags |= FORCE;
2362
2363                         switch(tcb->state) {
2364                         case Syn_received:
2365                         case Established:
2366                                 tcb->rcv.nxt++;
2367                                 tcpsetstate(s, Close_wait);
2368                                 break;
2369                         case Finwait1:
2370                                 tcb->rcv.nxt++;
                                     /* everything we sent is acked: go to Time_wait, otherwise Closing */
2371                                 if(qlen(s->wq)+tcb->flgcnt == 0) {
2372                                         tcphalt(tpriv, &tcb->rtt_timer);
2373                                         tcphalt(tpriv, &tcb->acktimer);
2374                                         tcphalt(tpriv, &tcb->katimer);
2375                                         tcpsetstate(s, Time_wait);
2376                                         tcb->timer.start = MSL2*(1000/MSPTICK);
2377                                         tcpgo(tpriv, &tcb->timer);
2378                                 }
2379                                 else
2380                                         tcpsetstate(s, Closing);
2381                                 break;
2382                         case Finwait2:
2383                                 tcb->rcv.nxt++;
2384                                 tcphalt(tpriv, &tcb->rtt_timer);
2385                                 tcphalt(tpriv, &tcb->acktimer);
2386                                 tcphalt(tpriv, &tcb->katimer);
2387                                 tcpsetstate(s, Time_wait);
2388                                 tcb->timer.start = MSL2 * (1000/MSPTICK);
2389                                 tcpgo(tpriv, &tcb->timer);
2390                                 break;
2391                         case Close_wait:
2392                         case Closing:
2393                         case Last_ack:
2394                                 break;
2395                         case Time_wait:
                                     /* a FIN seen again in Time_wait restarts the timer */
2396                                 tcpgo(tpriv, &tcb->timer);
2397                                 break;
2398                         }
2399                 }
2400
2401                 /*
2402                  *  get next adjacent segment from the resequence queue.
2403                  *  dump/trim any overlapping segments
2404                  */
                     /* leaves through output when the queue is empty or its head lies beyond rcv.nxt */
2405                 for(;;) {
2406                         if(tcb->reseq == nil)
2407                                 goto output;
2408
2409                         if(seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
2410                                 goto output;
2411
2412                         getreseq(tcb, &seg, &bp, &length);
2413
2414                         if(tcptrim(tcb, &seg, &bp, &length) == 0)
2415                                 break;
2416                 }
2417         }
     /* output: normal exit; send anything now due, then release the conversation lock */
2418 output:
2419         tcpoutput(s);
2420         qunlock(s);
2421         poperror();
2422         return;
     /* raise: drop the segment; release the lock, free bp and call tcpkick */
2423 raise:
2424         qunlock(s);
2425         poperror();
2426         freeblist(bp);
2427         tcpkick(s);
2428 }
2429
2430 /*
2431  *  always enters and exits with the s locked.  We drop
2432  *  the lock to ipoput the packet so some care has to be
2433  *  taken by callers.
2434  */
/*
 *  tcpoutput: send whatever conversation s is currently able to send.
 *
 *  Makes up to 100 passes.  Each pass builds at most one segment,
 *  sized by the send window, the congestion window and the mss, or
 *  a bare ack when FORCE is set, and hands it to ipoput4 or ipoput6.
 *  Returns at once in Listen, Closed and Finwait2, and also when
 *  htontcp4 or htontcp6 return nil.
 *
 *  NOTE(review): in the code below the conversation lock is
 *  released only around sched(), on passes where msgs%4 == 1,
 *  not around the ipoput4/ipoput6 calls themselves.
 */
2435 void
2436 tcpoutput(Conv *s)
2437 {
2438         Tcp seg;
2439         int msgs;
2440         Tcpctl *tcb;
2441         Block *hbp, *bp;
2442         int sndcnt, n;
2443         ulong ssize, dsize, usable, sent;
2444         Fs *f;
2445         Tcppriv *tpriv;
2446         uchar version;
2447
2448         f = s->p->f;
2449         tpriv = s->p->priv;
2450         version = s->ipversion;
2451         memset(&seg, 0, sizeof seg);
2452
2453         for(msgs = 0; msgs < 100; msgs++) {
                     /* reloaded every pass; the lock may have been dropped around sched() at the end of the loop */
2454                 tcb = (Tcpctl*)s->ptcl;
2455
2456                 switch(tcb->state) {
2457                 case Listen:
2458                 case Closed:
2459                 case Finwait2:
2460                         return;
2461                 }
2462
2463                 /* force an ack when a window has opened up */
2464                 if(tcb->rcv.blocked && tcb->rcv.wnd > 0){
2465                         tcb->rcv.blocked = 0;
2466                         tcb->flags |= FORCE;
2467                 }
2468
                     /*
                      *  sndcnt: bytes in the write queue plus flgcnt;
                      *  sent: distance from snd.una to snd.ptr.
                      */
2469                 sndcnt = qlen(s->wq)+tcb->flgcnt;
2470                 sent = tcb->snd.ptr - tcb->snd.una;
2471
2472                 /* Don't send anything else until our SYN has been acked */
2473                 if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
2474                         break;
2475
2476                 /* Compute usable segment based on offered window and limit
2477                  * window probes to one
2478                  */
2479                 if(tcb->snd.wnd == 0){
2480                         if(sent != 0) {
2481                                 if((tcb->flags&FORCE) == 0)
2482                                         break;
2483 //                              tcb->snd.ptr = tcb->snd.una;
2484                         }
2485                         usable = 1;
2486                 }
2487                 else {
                             /* the smaller of cwind and snd.wnd, less what is in flight, clamped at 0 */
2488                         usable = tcb->cwind;
2489                         if(tcb->snd.wnd < usable)
2490                                 usable = tcb->snd.wnd;
2491 //                      usable -= sent;
2492                         usable = usable >= sent? usable - sent: 0;
2493                 }
                     /* ssize ends up as the smallest of the unsent amount, usable and mss */
2494                 ssize = sndcnt-sent;
2495                 if(ssize && usable < 2)
2496                         netlog(s->p->f, Logtcp, "throttled snd.wnd %lud cwind %lud\n",
2497                                 tcb->snd.wnd, tcb->cwind);
2498                 if(usable < ssize)
2499                         ssize = usable;
2500                 if(tcb->mss < ssize)
2501                         ssize = tcb->mss;
                     /* ssize is the amount snd.ptr advances by; dsize is the amount pulled from the write queue */
2502                 dsize = ssize;
2503                 seg.urg = 0;
2504
                     /* nothing to send and no ack forced: done */
2505                 if(ssize == 0)
2506                 if((tcb->flags&FORCE) == 0)
2507                         break;
2508
2509                 tcb->flags &= ~FORCE;
2510                 tcprcvwin(s);
2511
2512                 /* By default we will generate an ack */
2513                 tcphalt(tpriv, &tcb->acktimer);
2514                 tcb->rcv.una = 0;
2515                 seg.source = s->lport;
2516                 seg.dest = s->rport;
2517                 seg.flags = ACK;
2518                 seg.mss = 0;
2519                 seg.ws = 0;
2520                 switch(tcb->state){
2521                 case Syn_sent:
                             /* no ACK in Syn_sent; the SYN takes one unit out of dsize */
2522                         seg.flags = 0;
2523                         if(tcb->snd.ptr == tcb->iss){
2524                                 seg.flags |= SYN;
2525                                 dsize--;
2526                                 seg.mss = tcb->mss;
2527                                 seg.ws = tcb->scale;
2528                         }
2529                         break;
2530                 case Syn_received:
2531                         /*
2532                          *  don't send any data with a SYN/ACK packet
2533                          *  because Linux rejects the packet in its
2534                          *  attempt to solve the SYN attack problem
2535                          */
2536                         if(tcb->snd.ptr == tcb->iss){
2537                                 seg.flags |= SYN;
2538                                 dsize = 0;
2539                                 ssize = 1;
2540                                 seg.mss = tcb->mss;
2541                                 seg.ws = tcb->scale;
2542                         }
2543                         break;
2544                 }
2545                 seg.seq = tcb->snd.ptr;
2546                 seg.ack = tcb->rcv.nxt;
2547                 seg.wnd = tcb->rcv.wnd;
2548
2549                 /* Pull out data to send */
                     /*
                      *  if qcopy hands back a different amount than dsize, the
                      *  segment is marked FIN and dsize is reduced by one.
                      *  NOTE(review): presumably the missing unit is the FIN
                      *  counted in flgcnt; confirm.
                      */
2550                 bp = nil;
2551                 if(dsize != 0) {
2552                         bp = qcopy(s->wq, dsize, sent);
2553                         if(BLEN(bp) != dsize) {
2554                                 seg.flags |= FIN;
2555                                 dsize--;
2556                         }
2557                 }
2558
                     /* set PSH when this segment reaches the end of what is queued */
2559                 if(sent+dsize == sndcnt)
2560                         seg.flags |= PSH;
2561
2562                 /* keep track of balance of resent data */
2563                 if(seq_lt(tcb->snd.ptr, tcb->snd.nxt)) {
2564                         n = tcb->snd.nxt - tcb->snd.ptr;
2565                         if(ssize < n)
2566                                 n = ssize;
2567                         tcb->resent += n;
2568                         netlog(f, Logtcp, "rexmit: %I!%d -> %I!%d ptr %lux nxt %lux\n",
2569                                 s->raddr, s->rport, s->laddr, s->lport, tcb->snd.ptr, tcb->snd.nxt);
2570                         tpriv->stats[RetransSegs]++;
2571                 }
2572
2573                 tcb->snd.ptr += ssize;
2574
2575                 /* Pull up the send pointer so we can accept acks
2576                  * for this window
2577                  */
2578                 if(seq_gt(tcb->snd.ptr,tcb->snd.nxt))
2579                         tcb->snd.nxt = tcb->snd.ptr;
2580
2581                 /* Build header, link data and compute cksum */
                     /* NOTE(review): on a nil return, snd.ptr and snd.nxt have already been advanced above */
2582                 switch(version){
2583                 case V4:
2584                         tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2585                         hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
2586                         if(hbp == nil) {
2587                                 freeblist(bp);
2588                                 return;
2589                         }
2590                         break;
2591                 case V6:
2592                         tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2593                         hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
2594                         if(hbp == nil) {
2595                                 freeblist(bp);
2596                                 return;
2597                         }
2598                         break;
2599                 default:
2600                         hbp = nil;      /* to suppress a warning */
2601                         panic("tcpoutput: version %d", version);
2602                 }
2603
2604                 /* Start the transmission timers if there is new data and we
2605                  * expect acknowledges
2606                  */
2607                 if(ssize != 0){
2608                         if(tcb->timer.state != TcptimerON)
2609                                 tcpgo(tpriv, &tcb->timer);
2610
2611                         /*  If round trip timer isn't running, start it.
2612                          *  measure the longest packet only in case the
2613                          *  transmission time dominates RTT
2614                          */
                             /* rttseq records snd.ptr after this segment; the ack that reaches it is the one timed */
2615                         if(tcb->rtt_timer.state != TcptimerON)
2616                         if(ssize == tcb->mss) {
2617                                 tcpgo(tpriv, &tcb->rtt_timer);
2618                                 tcb->rttseq = tcb->snd.ptr;
2619                         }
2620                 }
2621
2622                 tpriv->stats[OutSegs]++;
2623
2624                 /* put off the next keep alive */
2625                 tcpgo(tpriv, &tcb->katimer);
2626
                     /* the loop is not left on a send failure; the next pass re-examines tcb->state */
2627                 switch(version){
2628                 case V4:
2629                         if(ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0){
2630                                 /* a negative return means no route */
2631                                 localclose(s, "no route");
2632                         }
2633                         break;
2634                 case V6:
2635                         if(ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0){
2636                                 /* a negative return means no route */
2637                                 localclose(s, "no route");
2638                         }
2639                         break;
2640                 default:
2641                         panic("tcpoutput2: version %d", version);
2642                 }
                     /* on passes 1, 5, 9, ... release the conversation lock around sched() */
2643                 if((msgs%4) == 1){
2644                         qunlock(s);
2645                         sched();
2646                         qlock(s);
2647                 }
2648         }
2649 }
2650
2651 /*
2652  *  the BSD convention (hack?) for keep alives.  resend last uchar acked.
2653  */
2654 void
2655 tcpsendka(Conv *s)
2656 {
2657         Tcp seg;
2658         Tcpctl *tcb;
2659         Block *hbp,*dbp;
2660
2661         tcb = (Tcpctl*)s->ptcl;
2662
2663         dbp = nil;
2664         memset(&seg, 0, sizeof seg);
2665         seg.urg = 0;
2666         seg.source = s->lport;
2667         seg.dest = s->rport;
2668         seg.flags = ACK|PSH;
2669         seg.mss = 0;
2670         seg.ws = 0;
2671         if(tcpporthogdefense)
2672                 seg.seq = tcb->snd.una-(1<<30)-nrand(1<<20);
2673         else
2674                 seg.seq = tcb->snd.una-1;
2675         seg.ack = tcb->rcv.nxt;
2676         tcb->rcv.una = 0;
2677         seg.wnd = tcb->rcv.wnd;
2678         if(tcb->state == Finwait2){
2679                 seg.flags |= FIN;
2680         } else {
2681                 dbp = allocb(1);
2682                 dbp->wp++;
2683         }
2684
2685         if(isv4(s->raddr)) {
2686                 /* Build header, link data and compute cksum */
2687                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2688                 hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
2689                 if(hbp == nil) {
2690                         freeblist(dbp);
2691                         return;
2692                 }
2693                 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
2694         }
2695         else {
2696                 /* Build header, link data and compute cksum */
2697                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2698                 hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
2699                 if(hbp == nil) {
2700                         freeblist(dbp);
2701                         return;
2702                 }
2703                 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
2704         }
2705 }
2706
2707 /*
2708  *  set connection to time out after 12 minutes
2709  */
2710 void
2711 tcpsetkacounter(Tcpctl *tcb)
2712 {
2713         tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start*MSPTICK);
2714         if(tcb->kacounter < 3)
2715                 tcb->kacounter = 3;
2716 }
2717
2718 /*
2719  *  if we've timed out, close the connection
2720  *  otherwise, send a keepalive and restart the timer
2721  */
2722 void
2723 tcpkeepalive(void *v)
2724 {
2725         Tcpctl *tcb;
2726         Conv *s;
2727
2728         s = v;
2729         tcb = (Tcpctl*)s->ptcl;
2730         if(waserror()){
2731                 qunlock(s);
2732                 nexterror();
2733         }
2734         qlock(s);
2735         if(tcb->state != Closed){
2736                 if(--(tcb->kacounter) <= 0) {
2737                         localclose(s, Etimedout);
2738                 } else {
2739                         tcpsendka(s);
2740                         tcpgo(s->p->priv, &tcb->katimer);
2741                 }
2742         }
2743         qunlock(s);
2744         poperror();
2745 }
2746
2747 /*
2748  *  start keepalive timer
2749  */
2750 char*
2751 tcpstartka(Conv *s, char **f, int n)
2752 {
2753         Tcpctl *tcb;
2754         int x;
2755
2756         tcb = (Tcpctl*)s->ptcl;
2757         if(tcb->state != Established)
2758                 return "connection must be in Establised state";
2759         if(n > 1){
2760                 x = atoi(f[1]);
2761                 if(x >= MSPTICK)
2762                         tcb->katimer.start = x/MSPTICK;
2763         }
2764         tcpsetkacounter(tcb);
2765         tcpgo(s->p->priv, &tcb->katimer);
2766
2767         return nil;
2768 }
2769
2770 /*
2771  *  turn checksums on/off
2772  */
2773 char*
2774 tcpsetchecksum(Conv *s, char **f, int)
2775 {
2776         Tcpctl *tcb;
2777
2778         tcb = (Tcpctl*)s->ptcl;
2779         tcb->nochecksum = !atoi(f[1]);
2780
2781         return nil;
2782 }
2783
2784 void
2785 tcprxmit(Conv *s)
2786 {
2787         Tcpctl *tcb;
2788
2789         tcb = (Tcpctl*)s->ptcl;
2790
2791         tcb->flags |= RETRAN|FORCE;
2792         tcb->snd.ptr = tcb->snd.una;
2793
2794         /*
2795          *  We should be halving the slow start threshhold (down to one
2796          *  mss) but leaving it at mss seems to work well enough
2797          */
2798         tcb->ssthresh = tcb->mss;
2799
2800         /*
2801          *  pull window down to a single packet
2802          */
2803         tcb->cwind = tcb->mss;
2804         tcpoutput(s);
2805 }
2806
2807 void
2808 tcptimeout(void *arg)
2809 {
2810         Conv *s;
2811         Tcpctl *tcb;
2812         int maxback;
2813         Tcppriv *tpriv;
2814
2815         s = (Conv*)arg;
2816         tpriv = s->p->priv;
2817         tcb = (Tcpctl*)s->ptcl;
2818
2819         if(waserror()){
2820                 qunlock(s);
2821                 nexterror();
2822         }
2823         qlock(s);
2824         switch(tcb->state){
2825         default:
2826                 tcb->backoff++;
2827                 if(tcb->state == Syn_sent)
2828                         maxback = MAXBACKMS/2;
2829                 else
2830                         maxback = MAXBACKMS;
2831                 tcb->backedoff += tcb->timer.start * MSPTICK;
2832                 if(tcb->backedoff >= maxback) {
2833                         localclose(s, Etimedout);
2834                         break;
2835                 }
2836                 netlog(s->p->f, Logtcprxmt, "timeout rexmit %#lux %d/%lud\n", tcb->snd.una, tcb->timer.start, NOW);
2837                 tcpsettimer(tcb);
2838                 tcprxmit(s);
2839                 tpriv->stats[RetransTimeouts]++;
2840                 tcb->snd.dupacks = 0;
2841                 break;
2842         case Time_wait:
2843                 localclose(s, nil);
2844                 break;
2845         case Closed:
2846                 break;
2847         }
2848         qunlock(s);
2849         poperror();
2850 }
2851
2852 int
2853 inwindow(Tcpctl *tcb, int seq)
2854 {
2855         return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt+tcb->rcv.wnd-1);
2856 }
2857
2858 /*
2859  *  set up state for a received SYN (or SYN ACK) packet
2860  */
2861 void
2862 procsyn(Conv *s, Tcp *seg)
2863 {
2864         Tcpctl *tcb;
2865         Tcppriv *tpriv;
2866
2867         tcb = (Tcpctl*)s->ptcl;
2868         tcb->flags |= FORCE;
2869
2870         tcb->rcv.nxt = seg->seq + 1;
2871         tcb->rcv.urg = tcb->rcv.nxt;
2872         tcb->irs = seg->seq;
2873
2874         /* our sending max segment size cannot be bigger than what he asked for */
2875         if(seg->mss != 0 && seg->mss < tcb->mss) {
2876                 tcb->mss = seg->mss;
2877                 tpriv = s->p->priv;
2878                 tpriv->stats[Mss] = tcb->mss;
2879         }
2880
2881         /* the congestion window always starts out as a single segment */
2882         tcb->snd.wnd = seg->wnd;
2883         tcb->cwind = tcb->mss;
2884 }
2885
2886 int
2887 addreseq(Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length)
2888 {
2889         Reseq *rp, *rp1;
2890         int i, rqlen, qmax;
2891
2892         rp = malloc(sizeof(Reseq));
2893         if(rp == nil){
2894                 freeblist(bp);  /* bp always consumed by add_reseq */
2895                 return 0;
2896         }
2897
2898         rp->seg = *seg;
2899         rp->bp = bp;
2900         rp->length = length;
2901
2902         /* Place on reassembly list sorting by starting seq number */
2903         rp1 = tcb->reseq;
2904         if(rp1 == nil || seq_lt(seg->seq, rp1->seg.seq)) {
2905                 rp->next = rp1;
2906                 tcb->reseq = rp;
2907                 if(rp->next != nil)
2908                         tpriv->stats[OutOfOrder]++;
2909                 return 0;
2910         }
2911
2912         rqlen = 0;
2913         for(i = 0;; i++) {
2914                 rqlen += rp1->length;
2915                 if(rp1->next == nil || seq_lt(seg->seq, rp1->next->seg.seq)) {
2916                         rp->next = rp1->next;
2917                         rp1->next = rp;
2918                         if(rp->next != nil)
2919                                 tpriv->stats[OutOfOrder]++;
2920                         break;
2921                 }
2922                 rp1 = rp1->next;
2923         }
2924         qmax = QMAX<<tcb->rcv.scale;
2925         if(rqlen > qmax){
2926                 print("resequence queue > window: %d > %d\n", rqlen, qmax);
2927                 i = 0;
2928                 for(rp1 = tcb->reseq; rp1 != nil; rp1 = rp1->next){
2929                         print("%#lux %#lux %#ux\n", rp1->seg.seq,
2930                                 rp1->seg.ack, rp1->seg.flags);
2931                         if(i++ > 10){
2932                                 print("...\n");
2933                                 break;
2934                         }
2935                 }
2936
2937                 /*
2938                  * delete entire reassembly queue; wait for retransmit.
2939                  * - should we be smarter and only delete the tail?
2940                  */
2941                 for(rp = tcb->reseq; rp != nil; rp = rp1){
2942                         rp1 = rp->next;
2943                         freeblist(rp->bp);
2944                         free(rp);
2945                 }
2946                 tcb->reseq = nil;
2947
2948                 return -1;
2949         }
2950         return 0;
2951 }
2952
2953 void
2954 getreseq(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
2955 {
2956         Reseq *rp;
2957
2958         rp = tcb->reseq;
2959         if(rp == nil)
2960                 return;
2961
2962         tcb->reseq = rp->next;
2963
2964         *seg = rp->seg;
2965         *bp = rp->bp;
2966         *length = rp->length;
2967
2968         free(rp);
2969 }
2970
2971 int
2972 tcptrim(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
2973 {
2974         ushort len;
2975         uchar accept;
2976         int dupcnt, excess;
2977
2978         accept = 0;
2979         len = *length;
2980         if(seg->flags & SYN)
2981                 len++;
2982         if(seg->flags & FIN)
2983                 len++;
2984
2985         if(tcb->rcv.wnd == 0) {
2986                 if(len == 0 && seg->seq == tcb->rcv.nxt)
2987                         return 0;
2988         }
2989         else {
2990                 /* Some part of the segment should be in the window */
2991                 if(inwindow(tcb,seg->seq))
2992                         accept++;
2993                 else
2994                 if(len != 0) {
2995                         if(inwindow(tcb, seg->seq+len-1) ||
2996                         seq_within(tcb->rcv.nxt, seg->seq,seg->seq+len-1))
2997                                 accept++;
2998                 }
2999         }
3000         if(!accept) {
3001                 freeblist(*bp);
3002                 return -1;
3003         }
3004         dupcnt = tcb->rcv.nxt - seg->seq;
3005         if(dupcnt > 0){
3006                 tcb->rerecv += dupcnt;
3007                 if(seg->flags & SYN){
3008                         seg->flags &= ~SYN;
3009                         seg->seq++;
3010
3011                         if(seg->urg > 1)
3012                                 seg->urg--;
3013                         else
3014                                 seg->flags &= ~URG;
3015                         dupcnt--;
3016                 }
3017                 if(dupcnt > 0){
3018                         pullblock(bp, (ushort)dupcnt);
3019                         seg->seq += dupcnt;
3020                         *length -= dupcnt;
3021
3022                         if(seg->urg > dupcnt)
3023                                 seg->urg -= dupcnt;
3024                         else {
3025                                 seg->flags &= ~URG;
3026                                 seg->urg = 0;
3027                         }
3028                 }
3029         }
3030         excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
3031         if(excess > 0) {
3032                 tcb->rerecv += excess;
3033                 *length -= excess;
3034                 *bp = trimblock(*bp, 0, *length);
3035                 if(*bp == nil)
3036                         panic("presotto is a boofhead");
3037                 seg->flags &= ~FIN;
3038         }
3039         return 0;
3040 }
3041
3042 void
3043 tcpadvise(Proto *tcp, Block *bp, char *msg)
3044 {
3045         Tcp4hdr *h4;
3046         Tcp6hdr *h6;
3047         Tcpctl *tcb;
3048         uchar source[IPaddrlen];
3049         uchar dest[IPaddrlen];
3050         ushort psource, pdest;
3051         Conv *s, **p;
3052
3053         h4 = (Tcp4hdr*)(bp->rp);
3054         h6 = (Tcp6hdr*)(bp->rp);
3055
3056         if((h4->vihl&0xF0)==IP_VER4) {
3057                 v4tov6(dest, h4->tcpdst);
3058                 v4tov6(source, h4->tcpsrc);
3059                 psource = nhgets(h4->tcpsport);
3060                 pdest = nhgets(h4->tcpdport);
3061         }
3062         else {
3063                 ipmove(dest, h6->tcpdst);
3064                 ipmove(source, h6->tcpsrc);
3065                 psource = nhgets(h6->tcpsport);
3066                 pdest = nhgets(h6->tcpdport);
3067         }
3068
3069         /* Look for a connection */
3070         qlock(tcp);
3071         for(p = tcp->conv; *p; p++) {
3072                 s = *p;
3073                 tcb = (Tcpctl*)s->ptcl;
3074                 if(s->rport == pdest)
3075                 if(s->lport == psource)
3076                 if(tcb->state != Closed)
3077                 if(ipcmp(s->raddr, dest) == 0)
3078                 if(ipcmp(s->laddr, source) == 0){
3079                         qlock(s);
3080                         qunlock(tcp);
3081                         switch(tcb->state){
3082                         case Syn_sent:
3083                                 localclose(s, msg);
3084                                 break;
3085                         }
3086                         qunlock(s);
3087                         freeblist(bp);
3088                         return;
3089                 }
3090         }
3091         qunlock(tcp);
3092         freeblist(bp);
3093 }
3094
3095 static char*
3096 tcpporthogdefensectl(char *val)
3097 {
3098         if(strcmp(val, "on") == 0)
3099                 tcpporthogdefense = 1;
3100         else if(strcmp(val, "off") == 0)
3101                 tcpporthogdefense = 0;
3102         else
3103                 return "unknown value for tcpporthogdefense";
3104         return nil;
3105 }
3106
3107 /* called with c qlocked */
3108 char*
3109 tcpctl(Conv* c, char** f, int n)
3110 {
3111         if(n == 1 && strcmp(f[0], "hangup") == 0)
3112                 return tcphangup(c);
3113         if(n >= 1 && strcmp(f[0], "keepalive") == 0)
3114                 return tcpstartka(c, f, n);
3115         if(n >= 1 && strcmp(f[0], "checksum") == 0)
3116                 return tcpsetchecksum(c, f, n);
3117         if(n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0)
3118                 return tcpporthogdefensectl(f[1]);
3119         return "unknown control request";
3120 }
3121
3122 int
3123 tcpstats(Proto *tcp, char *buf, int len)
3124 {
3125         Tcppriv *priv;
3126         char *p, *e;
3127         int i;
3128
3129         priv = tcp->priv;
3130         p = buf;
3131         e = p+len;
3132         for(i = 0; i < Nstats; i++)
3133                 p = seprint(p, e, "%s: %llud\n", statnames[i], priv->stats[i]);
3134         return p - buf;
3135 }
3136
3137 /*
3138  *  garbage collect any stale conversations:
3139  *      - SYN received but no SYN-ACK after 5 seconds (could be the SYN attack)
3140  *      - Finwait2 after 5 minutes
3141  *
3142  *  this is called whenever we run out of channels.  Both checks are
3143  *  of questionable validity so we try to use them only when we're
3144  *  up against the wall.
3145  */
3146 int
3147 tcpgc(Proto *tcp)
3148 {
3149         Conv *c, **pp, **ep;
3150         int n;
3151         Tcpctl *tcb;
3152
3153
3154         n = 0;
3155         ep = &tcp->conv[tcp->nc];
3156         for(pp = tcp->conv; pp < ep; pp++) {
3157                 c = *pp;
3158                 if(c == nil)
3159                         break;
3160                 if(!canqlock(c))
3161                         continue;
3162                 tcb = (Tcpctl*)c->ptcl;
3163                 switch(tcb->state){
3164                 case Syn_received:
3165                         if(NOW - tcb->time > 5000){
3166                                 localclose(c, "timed out");
3167                                 n++;
3168                         }
3169                         break;
3170                 case Finwait2:
3171                         if(NOW - tcb->time > 5*60*1000){
3172                                 localclose(c, "timed out");
3173                                 n++;
3174                         }
3175                         break;
3176                 }
3177                 qunlock(c);
3178         }
3179         return n;
3180 }
3181
3182 void
3183 tcpsettimer(Tcpctl *tcb)
3184 {
3185         int x;
3186
3187         /* round trip dependency */
3188         x = backoff(tcb->backoff) *
3189                 (tcb->mdev + (tcb->srtt>>LOGAGAIN) + MSPTICK) / MSPTICK;
3190
3191         /* bounded twixt 1/2 and 64 seconds */
3192         if(x < 500/MSPTICK)
3193                 x = 500/MSPTICK;
3194         else if(x > (64000/MSPTICK))
3195                 x = 64000/MSPTICK;
3196         tcb->timer.start = x;
3197 }
3198
3199 void
3200 tcpinit(Fs *fs)
3201 {
3202         Proto *tcp;
3203         Tcppriv *tpriv;
3204
3205         tcp = smalloc(sizeof(Proto));
3206         tpriv = tcp->priv = smalloc(sizeof(Tcppriv));
3207         tcp->name = "tcp";
3208         tcp->connect = tcpconnect;
3209         tcp->announce = tcpannounce;
3210         tcp->ctl = tcpctl;
3211         tcp->state = tcpstate;
3212         tcp->create = tcpcreate;
3213         tcp->close = tcpclose;
3214         tcp->rcv = tcpiput;
3215         tcp->advise = tcpadvise;
3216         tcp->stats = tcpstats;
3217         tcp->inuse = tcpinuse;
3218         tcp->gc = tcpgc;
3219         tcp->ipproto = IP_TCPPROTO;
3220         tcp->nc = scalednconv();
3221         tcp->ptclsize = sizeof(Tcpctl);
3222         tpriv->stats[MaxConn] = tcp->nc;
3223
3224         Fsproto(fs, tcp);
3225 }
3226
3227 void
3228 tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale)
3229 {
3230         if(rcvscale){
3231                 tcb->rcv.scale = rcvscale & 0xff;
3232                 tcb->snd.scale = sndscale & 0xff;
3233                 tcb->window = QMAX<<tcb->snd.scale;
3234                 qsetlimit(s->rq, tcb->window);
3235         } else {
3236                 tcb->rcv.scale = 0;
3237                 tcb->snd.scale = 0;
3238                 tcb->window = QMAX;
3239                 qsetlimit(s->rq, tcb->window);
3240         }
3241 }