]> git.lizzy.rs Git - plan9front.git/blob - sys/src/9/ip/tcp.c
f241a5c8ef775e7ec9adbe76f47da67893fd44d5
[plan9front.git] / sys / src / 9 / ip / tcp.c
1 #include        "u.h"
2 #include        "../port/lib.h"
3 #include        "mem.h"
4 #include        "dat.h"
5 #include        "fns.h"
6 #include        "../port/error.h"
7
8 #include        "ip.h"
9
/*
 *  Protocol constants: header sizes, timer parameters, TCP flag bits,
 *  option codes, Tcpctl flag bits, connection states and window scaling.
 */
enum
{
        QMAX            = 64*1024-1,    /* queue limit passed to qopen(); also the initial ssthresh */
        IP_TCPPROTO     = 6,

        /* tcpmtu() subtracts TCP4_PKT+TCP4_HDRSIZE from the link MTU for v4 */
        TCP4_IPLEN      = 8,
        TCP4_PHDRSIZE   = 12,
        TCP4_HDRSIZE    = 20,
        TCP4_TCBPHDRSZ  = 40,
        TCP4_PKT        = TCP4_IPLEN+TCP4_PHDRSIZE,

        /* tcpmtu() subtracts TCP6_PKT+TCP6_HDRSIZE from the link MTU for v6 */
        TCP6_IPLEN      = 0,
        TCP6_PHDRSIZE   = 40,
        TCP6_HDRSIZE    = 20,
        TCP6_TCBPHDRSZ  = 60,
        TCP6_PKT        = TCP6_IPLEN+TCP6_PHDRSIZE,

        /* values of Tcptimer.state */
        TcptimerOFF     = 0,
        TcptimerON      = 1,
        TcptimerDONE    = 2,
        MAX_TIME        = (1<<20),      /* Forever */
        TCP_ACK         = 50,           /* Timed ack sequence in ms */
        MAXBACKMS       = 9*60*1000,    /* longest backoff time (ms) before hangup */

        /* TCP header flag bits */
        URG             = 0x20,         /* Data marked urgent */
        ACK             = 0x10,         /* Acknowledge is valid */
        PSH             = 0x08,         /* Whole data pipe is pushed */
        RST             = 0x04,         /* Reset connection */
        SYN             = 0x02,         /* Pkt. is synchronise */
        FIN             = 0x01,         /* Start close down */

        /* TCP option codes and lengths */
        EOLOPT          = 0,
        NOOPOPT         = 1,
        MSSOPT          = 2,
        MSS_LENGTH      = 4,            /* Maximum segment size */
        WSOPT           = 3,
        WS_LENGTH       = 3,            /* Bits to scale window size by */
        MSL2            = 10,
        MSPTICK         = 50,           /* Milliseconds per timer tick */
        DEF_MSS         = 1460,         /* Default maximum segment */
        DEF_MSS6        = 1220,         /* Default maximum segment (min) for v6 */
        DEF_RTT         = 500,          /* Default round trip */
        DEF_KAT         = 120000,       /* Default time (ms) between keep alives */
        TCP_LISTEN      = 0,            /* Listen connection */
        TCP_CONNECT     = 1,            /* Outgoing connection */
        SYNACK_RXTIMER  = 250,          /* ms between SYNACK retransmits */

        TCPREXMTTHRESH  = 3,            /* dupack threshold for rxt */

        /* bits or'ed into Tcpctl.flags (FORCE, CLONE and ACTIVE are set that way in this file) */
        FORCE           = 1,
        CLONE           = 2,
        RETRAN          = 4,
        ACTIVE          = 8,
        SYNACK          = 16,

        LOGAGAIN        = 3,            /* srtt is kept shifted left by this (see inittcpctl) */
        LOGDGAIN        = 2,            /* presumably the matching shift for mdev -- confirm */

        Closed          = 0,            /* Connection states */
        Listen,
        Syn_sent,
        Syn_received,
        Established,
        Finwait1,
        Finwait2,
        Close_wait,
        Closing,
        Last_ack,
        Time_wait,

        Maxlimbo        = 1000,         /* maximum procs waiting for response to SYN ACK */
        NLHT            = 256,          /* hash table size, must be a power of 2 */
        LHTMASK         = NLHT-1,

        /*
         * window is 64kb · 2ⁿ
         * these factors determine the ultimate bandwidth-delay product.
         * 64kb · 2⁵ = 2mb, or 2x overkill for 100mbps · 70ms.
         *
         * NOTE(review): both scales below are 4, which would give
         * 64kb · 2⁴ = 1mb; the 2⁵ figure looks stale -- confirm.
         */
        Maxqscale       = 4,            /* maximum queuing scale */
        Defadvscale     = 4,            /* default advertisement */
};
92
/*
 *  Printable name of each connection state, indexed by the
 *  Closed..Time_wait values of the enumeration above; the two
 *  lists must stay in the same order.
 */
char *tcpstates[] =
{
        "Closed",
        "Listen",
        "Syn_sent",
        "Syn_received",
        "Established",
        "Finwait1",
        "Finwait2",
        "Close_wait",
        "Closing",
        "Last_ack",
        "Time_wait"
};
100
/*
 *  One countdown timer.  Running timers are kept on the doubly linked
 *  list headed by Tcppriv.timers (see timerstate) and are ticked by
 *  tcpackproc.
 */
typedef struct Tcptimer Tcptimer;
struct Tcptimer
{
        Tcptimer        *next;          /* links on the Tcppriv.timers list */
        Tcptimer        *prev;
        Tcptimer        *readynext;     /* links expired timers within one tcpackproc pass */
        int     state;                  /* TcptimerOFF, TcptimerON or TcptimerDONE */
        int     start;                  /* value count is reloaded with by tcpgo; 0 means tcpgo does nothing */
        int     count;                  /* ticks left; decremented by tcpackproc */
        void    (*func)(void*);         /* called with arg when count reaches 0 */
        void    *arg;
};
113
114 /*
 *  v4 and v6 pseudo headers used for
 *  checksumming tcp
117  */
/*
 *  v4 header as a flat byte layout: IP fields first, then the TCP
 *  fields.  All members are byte arrays, so field order and sizes
 *  define the layout and must not be changed.
 */
typedef struct Tcp4hdr Tcp4hdr;
struct Tcp4hdr
{
        uchar   vihl;           /* Version and header length */
        uchar   tos;            /* Type of service */
        uchar   length[2];      /* packet length */
        uchar   id[2];          /* Identification */
        uchar   frag[2];        /* Fragment information */
        uchar   Unused;
        uchar   proto;          /* set to IP_TCPPROTO by inittcpctl */
        uchar   tcplen[2];
        uchar   tcpsrc[4];
        uchar   tcpdst[4];
        uchar   tcpsport[2];
        uchar   tcpdport[2];
        uchar   tcpseq[4];
        uchar   tcpack[4];
        uchar   tcpflag[2];
        uchar   tcpwin[2];
        uchar   tcpcksum[2];
        uchar   tcpurg[2];
        /* Options segment */
        uchar   tcpopt[1];      /* first option byte; presumably more follow in the packet */
};
142
/*
 *  v6 counterpart of Tcp4hdr: IPv6 fields first, then the same TCP
 *  fields.  Field order and sizes define the layout.
 */
typedef struct Tcp6hdr Tcp6hdr;
struct Tcp6hdr
{
        uchar   vcf[4];
        uchar   ploadlen[2];
        uchar   proto;          /* set to IP_TCPPROTO by inittcpctl */
        uchar   ttl;
        uchar   tcpsrc[IPaddrlen];
        uchar   tcpdst[IPaddrlen];
        uchar   tcpsport[2];
        uchar   tcpdport[2];
        uchar   tcpseq[4];
        uchar   tcpack[4];
        uchar   tcpflag[2];
        uchar   tcpwin[2];
        uchar   tcpcksum[2];
        uchar   tcpurg[2];
        /* Options segment */
        uchar   tcpopt[1];      /* first option byte; presumably more follow in the packet */
};
163
164 /*
165  *  this represents the control info
166  *  for a single packet.  It is derived from
167  *  a packet in ntohtcp{4,6}() and stuck into
168  *  a packet in htontcp{4,6}().
169  */
/* Host-side view of one segment's TCP header fields and options. */
typedef struct Tcp Tcp;
struct  Tcp
{
        ushort  source;         /* source port */
        ushort  dest;           /* destination port */
        ulong   seq;
        ulong   ack;
        uchar   flags;
        uchar   update;
        ushort  ws;     /* window scale option */
        ulong   wnd;    /* prescaled window */
        ushort  urg;
        ushort  mss;    /* max segment size option (if not zero) */
        ushort  len;    /* size of data */
};
185
186 /*
187  *  this header is malloc'd to thread together fragments
188  *  waiting to be coalesced
189  */
/* One entry of the resequencing list hung off Tcpctl.reseq. */
typedef struct Reseq Reseq;
struct Reseq
{
        Reseq   *next;          /* next entry on the list */
        Tcp     seg;            /* header info of the held segment */
        Block   *bp;            /* its data */
        ushort  length;
};
198
199 /*
200  *  the qlock in the Conv locks this structure
201  */
/*
 *  Per-conversation TCP control block, reached through Conv.ptcl.
 *  inittcpctl() zeroes the whole structure before filling it in.
 */
typedef struct Tcpctl Tcpctl;
struct Tcpctl
{
        uchar   state;                  /* Connection state */
        uchar   type;                   /* Listening or active connection */
        uchar   code;                   /* Icmp code */
        struct {
                ulong   una;            /* Unacked data pointer */
                ulong   nxt;            /* Next sequence expected */
                ulong   ptr;            /* Data pointer */
                ulong   wnd;            /* Tcp send window */
                ulong   urg;            /* Urgent data pointer */
                ulong   wl2;
                uint    scale;          /* how much to right shift window in xmitted packets */
                /* to implement tahoe and reno TCP */
                ulong   dupacks;        /* number of duplicate acks rcvd */
                ulong   partialack;
                int     recovery;       /* loss recovery flag */
                int     retransmit;     /* retransmit 1 packet @ una flag */
                int     rto;
                ulong   rxt;            /* right window marker for recovery "recover" rfc3782 */
        } snd;
        struct {
                ulong   nxt;            /* Receive pointer to next uchar slot */
                ulong   wnd;            /* Receive window incoming */
                ulong   wsnt;           /* Last wptr sent.  important to track for large bdp */
                ulong   wptr;           /* right edge of the offered window: rcv.nxt + rcv.wnd (see tcprcvwin) */
                ulong   urg;            /* Urgent pointer */
                ulong   ackptr;         /* last acked sequence */
                int     blocked;        /* set by tcprcvwin when the window it computes is too small */
                uint    scale;          /* how much to left shift window in rcv'd packets */
        } rcv;
        ulong   iss;                    /* Initial sequence number */
        ulong   cwind;                  /* Congestion window */
        ulong   abcbytes;               /* appropriate byte counting rfc 3465 */
        uint    scale;                  /* desired snd.scale */
        ulong   ssthresh;               /* Slow start threshold */
        int     resent;                 /* Bytes just resent */
        int     irs;                    /* Initial received sequence */
        ushort  mss;                    /* Maximum segment size */
        int     rerecv;                 /* Overlap of data rereceived */
        ulong   window;                 /* Our receive window (queue) */
        uint    qscale;                 /* Log2 of our receive window (queue) */
        uchar   backoff;                /* Exponential backoff counter */
        int     backedoff;              /* ms we've backed off for rexmits */
        uchar   flags;                  /* State flags */
        Reseq   *reseq;                 /* Resequencing queue */
        int     nreseq;
        int     reseqlen;
        Tcptimer        timer;                  /* Activity timer */
        Tcptimer        acktimer;               /* Acknowledge timer */
        Tcptimer        rtt_timer;              /* Round trip timer */
        Tcptimer        katimer;                /* keep alive timer */
        ulong   rttseq;                 /* Round trip sequence */
        int     srtt;                   /* Smoothed round trip */
        int     mdev;                   /* Mean deviation of round trip */
        int     kacounter;              /* count down for keep alive */
        uint    sndsyntime;             /* time syn sent */
        ulong   time;                   /* time Finwait2 or Syn_received was sent */
        ulong   timeuna;                /* snd.una when time was set */
        int     nochecksum;             /* non-zero means don't send checksums */
        /* NOTE(review): "SEQ" below is probably meant to be SYN -- confirm */
        int     flgcnt;                 /* number of flags in the sequence (FIN,SEQ) */

        union {
                Tcp4hdr tcp4hdr;
                Tcp6hdr tcp6hdr;
        } protohdr;             /* prototype header */
};
270
271 /*
272  *  New calls are put in limbo rather than having a conversation structure
273  *  allocated.  Thus, a SYN attack results in lots of limbo'd calls but not
274  *  any real Conv structures mucking things up.  Calls in limbo rexmit their
275  *  SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
276  *
277  *  In particular they aren't on a listener's queue so that they don't figure
278  *  in the input queue limit.
279  *
 *  If 1/2 of a T3 was attacking SYN packets, we'd have a permanent queue
 *  of 70000 limbo'd calls.  Not great for a linear list but doable.  Therefore
 *  there is no hashing of this list.
283  */
/* State kept for one incoming call whose SYN ACK has not been acked yet. */
typedef struct Limbo Limbo;
struct Limbo
{
        Limbo   *next;          /* next call on the same list */

        uchar   laddr[IPaddrlen];
        uchar   raddr[IPaddrlen];
        ushort  lport;
        ushort  rport;
        ulong   irs;            /* initial received sequence */
        ulong   iss;            /* initial sent sequence */
        ushort  mss;            /* mss from the other end */
        ushort  rcvscale;       /* how much to scale rcvd windows */
        ushort  sndscale;       /* how much to scale sent windows */
        ulong   lastsend;       /* last time we sent a synack */
        uchar   version;        /* v4 or v6 */
        uchar   rexmits;        /* number of retransmissions */
};
302
/* in ms: inittcpctl divides it by MSPTICK to get the timer's tick count */
int     tcp_irtt = DEF_RTT;     /* Initial guess at round trip time */
304
/*
 *  Indices into Tcppriv.stats[] and statnames[].
 *  Nstats must stay last: it sizes both arrays.
 */
enum {
        /* MIB stats */
        MaxConn,
        Mss,
        ActiveOpens,
        PassiveOpens,
        EstabResets,
        CurrEstab,
        InSegs,
        OutSegs,
        RetransSegs,
        RetransSegsSent,
        RetransTimeouts,
        InErrs,
        OutRsts,

        /* non-MIB stats */
        CsumErrs,
        HlenErrs,
        LenErrs,
        Resequenced,
        OutOfOrder,
        ReseqBytelim,
        ReseqPktlim,
        Delayack,
        Wopenack,

        Recovery,
        RecoveryDone,
        RecoveryRTO,
        RecoveryNoSeq,
        RecoveryCwind,
        RecoveryPA,

        Nstats
};
341
342 static char *statnames[Nstats] =
343 {
344 [MaxConn]       "MaxConn",
345 [Mss]           "MaxSegment",
346 [ActiveOpens]   "ActiveOpens",
347 [PassiveOpens]  "PassiveOpens",
348 [EstabResets]   "EstabResets",
349 [CurrEstab]     "CurrEstab",
350 [InSegs]        "InSegs",
351 [OutSegs]       "OutSegs",
352 [RetransSegs]   "RetransSegs",
353 [RetransSegsSent]       "RetransSegsSent",
354 [RetransTimeouts]       "RetransTimeouts",
355 [InErrs]        "InErrs",
356 [OutRsts]       "OutRsts",
357 [CsumErrs]      "CsumErrs",
358 [HlenErrs]      "HlenErrs",
359 [LenErrs]       "LenErrs",
360 [OutOfOrder]    "OutOfOrder",
361 [Resequenced]   "Resequenced",
362 [ReseqBytelim]  "ReseqBytelim",
363 [ReseqPktlim]   "ReseqPktlim",
364 [Delayack]      "Delayack",
365 [Wopenack]      "Wopenack",
366
367 [Recovery]      "Recovery",
368 [RecoveryDone]  "RecoveryDone",
369 [RecoveryRTO]   "RecoveryRTO",
370
371 [RecoveryNoSeq] "RecoveryNoSeq",
372 [RecoveryCwind] "RecoveryCwind",
373 [RecoveryPA]    "RecoveryPA",
374 };
375
/* Per-protocol private TCP state, reached through Proto.priv. */
typedef struct Tcppriv Tcppriv;
struct Tcppriv
{
        /* List of active timers */
        QLock   tl;             /* held around every change to the list (tcpgo, tcphalt, tcpackproc) */
        Tcptimer *timers;

        /* hash table for matching conversations */
        Ipht    ht;

        /* calls in limbo waiting for an ACK to our SYN ACK */
        int     nlimbo;
        Limbo   *lht[NLHT];

        /* for keeping track of tcpackproc */
        QLock   apl;            /* serializes starting the kproc in tcpstart */
        int     ackprocstarted;

        uvlong  stats[Nstats];  /* counters, indexed by the stats enum */
};
396
/*
 *  Setting tcpporthogdefense to non-zero enables Dong Lin's
 *  solution to hijacked systems staking out ports as a form
 *  of DoS attack.
 *
 *  To avoid stateless Conv hogs, we pick a sequence number at random.  If
 *  that number gets acked by the other end, we shut down the connection.
 *  Look for tcpporthogdefense in the code.
 */
int tcpporthogdefense = 0;
407
/*
 *  Forward declarations of file-local functions, so that they can be
 *  called before their definitions (tcpconnect, for instance, calls
 *  tcpstart ahead of its definition).
 */
static  int     addreseq(Fs*, Tcpctl*, Tcppriv*, Tcp*, Block*, ushort);
static  int     dumpreseq(Tcpctl*);
static  void    getreseq(Tcpctl*, Tcp*, Block**, ushort*);
static  void    limbo(Conv*, uchar*, uchar*, Tcp*, int);
static  void    limborexmit(Proto*);
static  void    localclose(Conv*, char*);
static  void    procsyn(Conv*, Tcp*);
static  void    tcpacktimer(void*);
static  void    tcpiput(Proto*, Ipifc*, Block*);
static  void    tcpkeepalive(void*);
static  void    tcpoutput(Conv*);
static  void    tcprcvwin(Conv*);
static  void    tcprxmit(Conv*);
static  void    tcpsetkacounter(Tcpctl*);
static  void    tcpsetscale(Conv*, Tcpctl*, ushort, ushort);
static  void    tcpsettimer(Tcpctl*);
static  void    tcpsndsyn(Conv*, Tcpctl*);
static  void    tcpstart(Conv*, int);
static  void    tcpsynackrtt(Conv*);
static  void    tcptimeout(void*);
static  int     tcptrim(Tcpctl*, Tcp*, Block**, ushort*);
429
430 static void
431 tcpsetstate(Conv *s, uchar newstate)
432 {
433         Tcpctl *tcb;
434         uchar oldstate;
435         Tcppriv *tpriv;
436
437         tpriv = s->p->priv;
438
439         tcb = (Tcpctl*)s->ptcl;
440
441         oldstate = tcb->state;
442         if(oldstate == newstate)
443                 return;
444
445         if(oldstate == Established)
446                 tpriv->stats[CurrEstab]--;
447         if(newstate == Established)
448                 tpriv->stats[CurrEstab]++;
449
450         switch(newstate) {
451         case Closed:
452                 qclose(s->rq);
453                 qclose(s->wq);
454                 qclose(s->eq);
455                 break;
456
457         case Close_wait:                /* Remote closes */
458                 qhangup(s->rq, nil);
459                 break;
460         }
461
462         tcb->state = newstate;
463
464         if(oldstate == Syn_sent && newstate != Closed)
465                 Fsconnected(s, nil);
466 }
467
468 static char*
469 tcpconnect(Conv *c, char **argv, int argc)
470 {
471         char *e;
472         Tcpctl *tcb;
473
474         tcb = (Tcpctl*)(c->ptcl);
475         if(tcb->state != Closed)
476                 return Econinuse;
477
478         e = Fsstdconnect(c, argv, argc);
479         if(e != nil)
480                 return e;
481         tcpstart(c, TCP_CONNECT);
482
483         return nil;
484 }
485
486 static int
487 tcpstate(Conv *c, char *state, int n)
488 {
489         Tcpctl *s;
490
491         s = (Tcpctl*)(c->ptcl);
492
493         return snprint(state, n,
494                 "%s qin %d qout %d rq %d.%d srtt %d mdev %d sst %lud cwin %lud swin %lud>>%d rwin %lud>>%d qscale %d timer.start %d timer.count %d rerecv %d katimer.start %d katimer.count %d\n",
495                 tcpstates[s->state],
496                 c->rq ? qlen(c->rq) : 0,
497                 c->wq ? qlen(c->wq) : 0,
498                 s->nreseq, s->reseqlen,
499                 s->srtt, s->mdev, s->ssthresh,
500                 s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd, s->snd.scale,
501                 s->qscale,
502                 s->timer.start, s->timer.count, s->rerecv,
503                 s->katimer.start, s->katimer.count);
504 }
505
506 static int
507 tcpinuse(Conv *c)
508 {
509         Tcpctl *s;
510
511         s = (Tcpctl*)(c->ptcl);
512         return s->state != Closed;
513 }
514
515 static char*
516 tcpannounce(Conv *c, char **argv, int argc)
517 {
518         char *e;
519         Tcpctl *tcb;
520
521         tcb = (Tcpctl*)(c->ptcl);
522         if(tcb->state != Closed)
523                 return Econinuse;
524
525         e = Fsstdannounce(c, argv, argc);
526         if(e != nil)
527                 return e;
528         tcpstart(c, TCP_LISTEN);
529         Fsconnected(c, nil);
530
531         return nil;
532 }
533
534 /*
535  *  tcpclose is always called with the q locked
536  */
537 static void
538 tcpclose(Conv *c)
539 {
540         Tcpctl *tcb;
541
542         tcb = (Tcpctl*)c->ptcl;
543
544         qhangup(c->rq, nil);
545         qhangup(c->wq, nil);
546         qhangup(c->eq, nil);
547         qflush(c->rq);
548
549         switch(tcb->state) {
550         case Listen:
551                 /*
552                  *  reset any incoming calls to this listener
553                  */
554                 Fsconnected(c, "Hangup");
555
556                 localclose(c, nil);
557                 break;
558         case Closed:
559         case Syn_sent:
560                 localclose(c, nil);
561                 break;
562         case Syn_received:
563         case Established:
564                 tcb->flgcnt++;
565                 tcb->snd.nxt++;
566                 tcpsetstate(c, Finwait1);
567                 tcpoutput(c);
568                 break;
569         case Close_wait:
570                 tcb->flgcnt++;
571                 tcb->snd.nxt++;
572                 tcpsetstate(c, Last_ack);
573                 tcpoutput(c);
574                 break;
575         }
576 }
577
578 static void
579 tcpkick(void *x)
580 {
581         Conv *s = x;
582         Tcpctl *tcb;
583
584         tcb = (Tcpctl*)s->ptcl;
585
586         if(waserror()){
587                 qunlock(s);
588                 nexterror();
589         }
590         qlock(s);
591
592         switch(tcb->state) {
593         case Syn_sent:
594         case Syn_received:
595         case Established:
596         case Close_wait:
597                 /*
598                  * Push data
599                  */
600                 tcpoutput(s);
601                 break;
602         default:
603                 localclose(s, "Hangup");
604                 break;
605         }
606
607         qunlock(s);
608         poperror();
609 }
610
611 static int seq_lt(ulong, ulong);
612
613 static void
614 tcprcvwin(Conv *s)                              /* Call with tcb locked */
615 {
616         int w;
617         Tcpctl *tcb;
618
619         tcb = (Tcpctl*)s->ptcl;
620         w = tcb->window - qlen(s->rq);
621         if(w < 0)
622                 w = 0;
623         /* RFC 1122 § 4.2.2.17 do not move right edge of window left */
624         if(seq_lt(tcb->rcv.nxt + w, tcb->rcv.wptr))
625                 w = tcb->rcv.wptr - tcb->rcv.nxt;
626         if(w != tcb->rcv.wnd)
627         if(w>>tcb->rcv.scale == 0 || tcb->window > 4*tcb->mss && w < tcb->mss/4){
628                 tcb->rcv.blocked = 1;
629                 netlog(s->p->f, Logtcp, "tcprcvwin: window %lud qlen %d ws %ud lport %d\n",
630                         tcb->window, qlen(s->rq), tcb->rcv.scale, s->lport);
631         }
632         tcb->rcv.wnd = w;
633         tcb->rcv.wptr = tcb->rcv.nxt + w;
634 }
635
636 static void
637 tcpacktimer(void *v)
638 {
639         Tcpctl *tcb;
640         Conv *s;
641
642         s = v;
643         tcb = (Tcpctl*)s->ptcl;
644
645         if(waserror()){
646                 qunlock(s);
647                 nexterror();
648         }
649         qlock(s);
650         if(tcb->state != Closed){
651                 tcb->flags |= FORCE;
652                 tcpoutput(s);
653         }
654         qunlock(s);
655         poperror();
656 }
657
658 static void
659 tcpcongestion(Tcpctl *tcb)
660 {
661         ulong inflight;
662
663         inflight = tcb->snd.nxt - tcb->snd.una;
664         if(inflight > tcb->cwind)
665                 inflight = tcb->cwind;
666         tcb->ssthresh = inflight / 2;
667         if(tcb->ssthresh < 2*tcb->mss)
668                 tcb->ssthresh = 2*tcb->mss;
669 }
670
/*
 *  Limit, in segments, on how much tcpabcincr grows cwind per call
 *  during slow start (1 segment is used instead when snd.rto is set).
 *  NOTE(review): the comment below gives the open interval (1.0, 2.0)
 *  although the value used is 2 -- presumably 2 is meant to be included.
 */
enum {
        L               = 2,            /* aggressive slow start; legal values ∈ (1.0, 2.0) */
};
674
675 static void
676 tcpabcincr(Tcpctl *tcb, uint acked)
677 {
678         uint limit;
679
680         tcb->abcbytes += acked;
681         if(tcb->cwind < tcb->ssthresh){
682                 /* slow start */
683                 if(tcb->snd.rto)
684                         limit = 1*tcb->mss;
685                 else
686                         limit = L*tcb->mss;
687                 tcb->cwind += MIN(tcb->abcbytes, limit);
688                 tcb->abcbytes = 0;
689         }
690         else{
691                 tcb->snd.rto = 0;
692                 /* avoidance */
693                 if(tcb->abcbytes >= tcb->cwind){
694                         tcb->abcbytes -= tcb->cwind;
695                         tcb->cwind += tcb->mss;
696                 }
697         }
698 }
699
/*
 *  Create the read and write queues of a new conversation, both
 *  limited to QMAX.  tcpacktimer is passed with the read queue and
 *  tcpkick with the write queue; presumably qopen installs them as
 *  the queues' kick routines, called with c as argument -- confirm
 *  against qopen.
 */
static void
tcpcreate(Conv *c)
{
        c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c);
        c->wq = qopen(QMAX, Qkick, tcpkick, c);
}
706
707 static void
708 timerstate(Tcppriv *priv, Tcptimer *t, int newstate)
709 {
710         if(newstate != TcptimerON){
711                 if(t->state == TcptimerON){
712                         /* unchain */
713                         if(priv->timers == t){
714                                 priv->timers = t->next;
715                                 if(t->prev != nil)
716                                         panic("timerstate1");
717                         }
718                         if(t->next)
719                                 t->next->prev = t->prev;
720                         if(t->prev)
721                                 t->prev->next = t->next;
722                         t->next = t->prev = nil;
723                 }
724         } else {
725                 if(t->state != TcptimerON){
726                         /* chain */
727                         if(t->prev != nil || t->next != nil)
728                                 panic("timerstate2");
729                         t->prev = nil;
730                         t->next = priv->timers;
731                         if(t->next)
732                                 t->next->prev = t;
733                         priv->timers = t;
734                 }
735         }
736         t->state = newstate;
737 }
738
/*
 *  Body of the timer kproc started by tcpstart; a is the Proto.
 *  Every MSPTICK ms it ticks all running timers, calls the functions
 *  of those that expired, and then calls limborexmit.  Never returns.
 */
static void
tcpackproc(void *a)
{
        Tcptimer *t, *tp, *timeo;
        Proto *tcp;
        Tcppriv *priv;
        int loop;

        tcp = a;
        priv = tcp->priv;

        /*
         * presumably re-arms the error label each time an error
         * propagates up to here, so the loop below is re-entered
         * rather than the process dying -- confirm
         */
        while(waserror())
                ;

        for(;;) {
                tsleep(&up->sleep, return0, 0, MSPTICK);

                /* pass 1, under the timer lock: tick and collect expired timers */
                qlock(&priv->tl);
                timeo = nil;
                loop = 0;
                for(t = priv->timers; t != nil; t = tp) {
                        /* bound the walk so a corrupted list panics instead of spinning */
                        if(loop++ > 10000)
                                panic("tcpackproc1");
                        /* save the successor: timerstate() below clears t->next */
                        tp = t->next;
                        if(t->state == TcptimerON) {
                                t->count--;
                                if(t->count == 0) {
                                        timerstate(priv, t, TcptimerDONE);
                                        t->readynext = timeo;
                                        timeo = t;
                                }
                        }
                }
                qunlock(&priv->tl);

                /*
                 * pass 2, lock released: run the expired timers.  The state
                 * is checked again since it may have changed after the lock
                 * was dropped; an error raised by func is caught here.
                 */
                loop = 0;
                for(t = timeo; t != nil; t = t->readynext) {
                        if(loop++ > 10000)
                                panic("tcpackproc2");
                        if(t->state == TcptimerDONE && t->func != nil && !waserror()){
                                (*t->func)(t->arg);
                                poperror();
                        }
                }

                limborexmit(tcp);
        }
}
787
788 static void
789 tcpgo(Tcppriv *priv, Tcptimer *t)
790 {
791         if(t == nil || t->start == 0)
792                 return;
793
794         qlock(&priv->tl);
795         t->count = t->start;
796         timerstate(priv, t, TcptimerON);
797         qunlock(&priv->tl);
798 }
799
800 static void
801 tcphalt(Tcppriv *priv, Tcptimer *t)
802 {
803         if(t == nil)
804                 return;
805
806         qlock(&priv->tl);
807         timerstate(priv, t, TcptimerOFF);
808         qunlock(&priv->tl);
809 }
810
/*
 *  Backoff multiplier 2^n.
 *
 *  The shift count is clamped to 0..30: shifting a 32-bit int by a
 *  negative amount, or far enough to reach the sign bit, is undefined,
 *  and the argument comes from a uchar counter that can in principle
 *  exceed that range.  Results for 0 <= n <= 30 are unchanged; larger
 *  counts saturate at 2^30.
 */
static int
backoff(int n)
{
        if(n < 0)
                n = 0;
        else if(n > 30)
                n = 30;
        return 1 << n;
}
816
817 static void
818 localclose(Conv *s, char *reason)       /* called with tcb locked */
819 {
820         Tcpctl *tcb;
821         Tcppriv *tpriv;
822
823         tpriv = s->p->priv;
824         tcb = (Tcpctl*)s->ptcl;
825
826         iphtrem(&tpriv->ht, s);
827
828         tcphalt(tpriv, &tcb->timer);
829         tcphalt(tpriv, &tcb->rtt_timer);
830         tcphalt(tpriv, &tcb->acktimer);
831         tcphalt(tpriv, &tcb->katimer);
832
833         /* Flush reassembly queue; nothing more can arrive */
834         dumpreseq(tcb);
835
836         if(tcb->state == Syn_sent)
837                 Fsconnected(s, reason);
838         if(s->state == Announced)
839                 wakeup(&s->listenr);
840
841         qhangup(s->rq, reason);
842         qhangup(s->wq, reason);
843
844         tcpsetstate(s, Closed);
845 }
846
847 /* mtu (- TCP + IP hdr len) of 1st hop */
848 static int
849 tcpmtu(Route *r, int version, uint *scale)
850 {
851         Ipifc *ifc;
852         int mtu;
853
854         /*
855          * set the ws.  it doesn't commit us to anything.
856          * ws is the ultimate limit to the bandwidth-delay product.
857          */
858         *scale = Defadvscale;
859
860         /*
861          * currently we do not implement path MTU discovery
862          * so use interface MTU *only* if directly reachable
863          * or when we use V4 which allows routers to fragment.
864          * otherwise, we use the default MSS which assumes a
865          * safe minimum MTU of 1280 bytes for V6.
866          */  
867         if(r != nil){
868                 ifc = r->ifc;
869                 mtu = ifc->maxtu - ifc->m->hsize;
870                 if(version == V4)
871                         return mtu - (TCP4_PKT + TCP4_HDRSIZE);
872                 mtu -= TCP6_PKT + TCP6_HDRSIZE;
873                 if((r->type & (Rifc|Runi)) != 0 || mtu <= DEF_MSS6)
874                         return mtu;
875         }
876         if(version == V6)
877                 return DEF_MSS6;
878         else
879                 return DEF_MSS;
880 }
881
882 static void
883 inittcpctl(Conv *s, int mode)
884 {
885         Tcpctl *tcb;
886         Tcp4hdr* h4;
887         Tcp6hdr* h6;
888         Tcppriv *tpriv;
889         int mss;
890
891         tcb = (Tcpctl*)s->ptcl;
892
893         memset(tcb, 0, sizeof(Tcpctl));
894
895         tcb->ssthresh = QMAX;                   /* reset by tcpsetscale() */
896         tcb->srtt = tcp_irtt<<LOGAGAIN;
897         tcb->mdev = 0;
898
899         /* setup timers */
900         tcb->timer.start = tcp_irtt / MSPTICK;
901         tcb->timer.func = tcptimeout;
902         tcb->timer.arg = s;
903         tcb->rtt_timer.start = MAX_TIME;
904         tcb->acktimer.start = TCP_ACK / MSPTICK;
905         tcb->acktimer.func = tcpacktimer;
906         tcb->acktimer.arg = s;
907         tcb->katimer.start = DEF_KAT / MSPTICK;
908         tcb->katimer.func = tcpkeepalive;
909         tcb->katimer.arg = s;
910
911         mss = DEF_MSS;
912
913         /* create a prototype(pseudo) header */
914         if(mode != TCP_LISTEN){
915                 if(ipcmp(s->laddr, IPnoaddr) == 0)
916                         findlocalip(s->p->f, s->laddr, s->raddr);
917
918                 switch(s->ipversion){
919                 case V4:
920                         h4 = &tcb->protohdr.tcp4hdr;
921                         memset(h4, 0, sizeof(*h4));
922                         h4->proto = IP_TCPPROTO;
923                         hnputs(h4->tcpsport, s->lport);
924                         hnputs(h4->tcpdport, s->rport);
925                         v6tov4(h4->tcpsrc, s->laddr);
926                         v6tov4(h4->tcpdst, s->raddr);
927                         break;
928                 case V6:
929                         h6 = &tcb->protohdr.tcp6hdr;
930                         memset(h6, 0, sizeof(*h6));
931                         h6->proto = IP_TCPPROTO;
932                         hnputs(h6->tcpsport, s->lport);
933                         hnputs(h6->tcpdport, s->rport);
934                         ipmove(h6->tcpsrc, s->laddr);
935                         ipmove(h6->tcpdst, s->raddr);
936                         mss = DEF_MSS6;
937                         break;
938                 default:
939                         panic("inittcpctl: version %d", s->ipversion);
940                 }
941         }
942
943         tcb->mss = tcb->cwind = mss;
944         tcb->abcbytes = 0;
945         tpriv = s->p->priv;
946         tpriv->stats[Mss] = tcb->mss;
947
948         /* default is no window scaling */
949         tcpsetscale(s, tcb, 0, 0);
950 }
951
952 /*
953  *  called with s qlocked
954  */
955 static void
956 tcpstart(Conv *s, int mode)
957 {
958         Tcpctl *tcb;
959         Tcppriv *tpriv;
960         char kpname[KNAMELEN];
961
962         tpriv = s->p->priv;
963
964         if(tpriv->ackprocstarted == 0){
965                 qlock(&tpriv->apl);
966                 if(tpriv->ackprocstarted == 0){
967                         snprint(kpname, sizeof(kpname), "#I%dtcpack", s->p->f->dev);
968                         kproc(kpname, tcpackproc, s->p);
969                         tpriv->ackprocstarted = 1;
970                 }
971                 qunlock(&tpriv->apl);
972         }
973
974         tcb = (Tcpctl*)s->ptcl;
975
976         inittcpctl(s, mode);
977
978         iphtadd(&tpriv->ht, s);
979         switch(mode) {
980         case TCP_LISTEN:
981                 tpriv->stats[PassiveOpens]++;
982                 tcb->flags |= CLONE;
983                 tcpsetstate(s, Listen);
984                 break;
985
986         case TCP_CONNECT:
987                 tpriv->stats[ActiveOpens]++;
988                 tcb->flags |= ACTIVE;
989                 tcpsndsyn(s, tcb);
990                 tcpsetstate(s, Syn_sent);
991                 tcpoutput(s);
992                 break;
993         }
994 }
995
996 static char*
997 tcpflag(char *buf, char *e, ushort flag)
998 {
999         char *p;
1000
1001         p = seprint(buf, e, "%d", flag>>10);    /* Head len */
1002         if(flag & URG)
1003                 p = seprint(p, e, " URG");
1004         if(flag & ACK)
1005                 p = seprint(p, e, " ACK");
1006         if(flag & PSH)
1007                 p = seprint(p, e, " PSH");
1008         if(flag & RST)
1009                 p = seprint(p, e, " RST");
1010         if(flag & SYN)
1011                 p = seprint(p, e, " SYN");
1012         if(flag & FIN)
1013                 p = seprint(p, e, " FIN");
1014         USED(p);
1015         return buf;
1016 }
1017
1018 static Block*
1019 htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph, Tcpctl *tcb)
1020 {
1021         int dlen;
1022         Tcp6hdr *h;
1023         ushort csum;
1024         ushort hdrlen, optpad = 0;
1025         uchar *opt;
1026
1027         hdrlen = TCP6_HDRSIZE;
1028         if(tcph->flags & SYN){
1029                 if(tcph->mss)
1030                         hdrlen += MSS_LENGTH;
1031                 if(tcph->ws)
1032                         hdrlen += WS_LENGTH;
1033                 optpad = hdrlen & 3;
1034                 if(optpad)
1035                         optpad = 4 - optpad;
1036                 hdrlen += optpad;
1037         }
1038
1039         if(data) {
1040                 dlen = blocklen(data);
1041                 data = padblock(data, hdrlen + TCP6_PKT);
1042         }
1043         else {
1044                 dlen = 0;
1045                 data = allocb(hdrlen + TCP6_PKT + 64);  /* the 64 pad is to meet mintu's */
1046                 data->wp += hdrlen + TCP6_PKT;
1047         }
1048
1049         /* copy in pseudo ip header plus port numbers */
1050         h = (Tcp6hdr *)(data->rp);
1051         memmove(h, ph, TCP6_TCBPHDRSZ);
1052
1053         /* compose pseudo tcp header, do cksum calculation */
1054         hnputl(h->vcf, hdrlen + dlen);
1055         h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
1056         h->ttl = ph->proto;
1057
1058         /* copy in variable bits */
1059         hnputl(h->tcpseq, tcph->seq);
1060         hnputl(h->tcpack, tcph->ack);
1061         hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
1062         hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
1063         hnputs(h->tcpurg, tcph->urg);
1064
1065         if(tcph->flags & SYN){
1066                 opt = h->tcpopt;
1067                 if(tcph->mss != 0){
1068                         *opt++ = MSSOPT;
1069                         *opt++ = MSS_LENGTH;
1070                         hnputs(opt, tcph->mss);
1071                         opt += 2;
1072                 }
1073                 if(tcph->ws != 0){
1074                         *opt++ = WSOPT;
1075                         *opt++ = WS_LENGTH;
1076                         *opt++ = tcph->ws;
1077                 }
1078                 while(optpad-- > 0)
1079                         *opt++ = NOOPOPT;
1080         }
1081
1082         if(tcb != nil && tcb->nochecksum){
1083                 h->tcpcksum[0] = h->tcpcksum[1] = 0;
1084         } else {
1085                 csum = ptclcsum(data, TCP6_IPLEN, hdrlen+dlen+TCP6_PHDRSIZE);
1086                 hnputs(h->tcpcksum, csum);
1087         }
1088
1089         /* move from pseudo header back to normal ip header */
1090         memset(h->vcf, 0, 4);
1091         h->vcf[0] = IP_VER6;
1092         hnputs(h->ploadlen, hdrlen+dlen);
1093         h->proto = ph->proto;
1094
1095         return data;
1096 }
1097
1098 static Block*
1099 htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph, Tcpctl *tcb)
1100 {
1101         int dlen;
1102         Tcp4hdr *h;
1103         ushort csum;
1104         ushort hdrlen, optpad = 0;
1105         uchar *opt;
1106
1107         hdrlen = TCP4_HDRSIZE;
1108         if(tcph->flags & SYN){
1109                 if(tcph->mss)
1110                         hdrlen += MSS_LENGTH;
1111                 if(1)
1112                         hdrlen += WS_LENGTH;
1113                 optpad = hdrlen & 3;
1114                 if(optpad)
1115                         optpad = 4 - optpad;
1116                 hdrlen += optpad;
1117         }
1118
1119         if(data) {
1120                 dlen = blocklen(data);
1121                 data = padblock(data, hdrlen + TCP4_PKT);
1122         }
1123         else {
1124                 dlen = 0;
1125                 data = allocb(hdrlen + TCP4_PKT + 64);  /* the 64 pad is to meet mintu's */
1126                 data->wp += hdrlen + TCP4_PKT;
1127         }
1128
1129         /* copy in pseudo ip header plus port numbers */
1130         h = (Tcp4hdr *)(data->rp);
1131         memmove(h, ph, TCP4_TCBPHDRSZ);
1132
1133         /* copy in variable bits */
1134         hnputs(h->tcplen, hdrlen + dlen);
1135         hnputl(h->tcpseq, tcph->seq);
1136         hnputl(h->tcpack, tcph->ack);
1137         hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
1138         hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
1139         hnputs(h->tcpurg, tcph->urg);
1140
1141         if(tcph->flags & SYN){
1142                 opt = h->tcpopt;
1143                 if(tcph->mss != 0){
1144                         *opt++ = MSSOPT;
1145                         *opt++ = MSS_LENGTH;
1146                         hnputs(opt, tcph->mss);
1147                         opt += 2;
1148                 }
1149                 /* always offer.  rfc1323 §2.2 */
1150                 if(1){
1151                         *opt++ = WSOPT;
1152                         *opt++ = WS_LENGTH;
1153                         *opt++ = tcph->ws;
1154                 }
1155                 while(optpad-- > 0)
1156                         *opt++ = NOOPOPT;
1157         }
1158
1159         if(tcb != nil && tcb->nochecksum){
1160                 h->tcpcksum[0] = h->tcpcksum[1] = 0;
1161         } else {
1162                 csum = ptclcsum(data, TCP4_IPLEN, hdrlen+dlen+TCP4_PHDRSIZE);
1163                 hnputs(h->tcpcksum, csum);
1164         }
1165
1166         return data;
1167 }
1168
1169 static int
1170 ntohtcp6(Tcp *tcph, Block **bpp)
1171 {
1172         Tcp6hdr *h;
1173         uchar *optr;
1174         ushort hdrlen;
1175         ushort optlen;
1176         int n;
1177
1178         *bpp = pullupblock(*bpp, TCP6_PKT+TCP6_HDRSIZE);
1179         if(*bpp == nil)
1180                 return -1;
1181
1182         h = (Tcp6hdr *)((*bpp)->rp);
1183         tcph->source = nhgets(h->tcpsport);
1184         tcph->dest = nhgets(h->tcpdport);
1185         tcph->seq = nhgetl(h->tcpseq);
1186         tcph->ack = nhgetl(h->tcpack);
1187         hdrlen = (h->tcpflag[0]>>2) & ~3;
1188         if(hdrlen < TCP6_HDRSIZE) {
1189                 freeblist(*bpp);
1190                 return -1;
1191         }
1192
1193         tcph->flags = h->tcpflag[1];
1194         tcph->wnd = nhgets(h->tcpwin);
1195         tcph->urg = nhgets(h->tcpurg);
1196         tcph->mss = 0;
1197         tcph->ws = 0;
1198         tcph->update = 0;
1199         tcph->len = nhgets(h->ploadlen) - hdrlen;
1200
1201         *bpp = pullupblock(*bpp, hdrlen+TCP6_PKT);
1202         if(*bpp == nil)
1203                 return -1;
1204
1205         optr = h->tcpopt;
1206         n = hdrlen - TCP6_HDRSIZE;
1207         while(n > 0 && *optr != EOLOPT) {
1208                 if(*optr == NOOPOPT) {
1209                         n--;
1210                         optr++;
1211                         continue;
1212                 }
1213                 optlen = optr[1];
1214                 if(optlen < 2 || optlen > n)
1215                         break;
1216                 switch(*optr) {
1217                 case MSSOPT:
1218                         if(optlen == MSS_LENGTH)
1219                                 tcph->mss = nhgets(optr+2);
1220                         break;
1221                 case WSOPT:
1222                         if(optlen == WS_LENGTH && *(optr+2) <= 14)
1223                                 tcph->ws = *(optr+2);
1224                         break;
1225                 }
1226                 n -= optlen;
1227                 optr += optlen;
1228         }
1229         return hdrlen;
1230 }
1231
1232 static int
1233 ntohtcp4(Tcp *tcph, Block **bpp)
1234 {
1235         Tcp4hdr *h;
1236         uchar *optr;
1237         ushort hdrlen;
1238         ushort optlen;
1239         int n;
1240
1241         *bpp = pullupblock(*bpp, TCP4_PKT+TCP4_HDRSIZE);
1242         if(*bpp == nil)
1243                 return -1;
1244
1245         h = (Tcp4hdr *)((*bpp)->rp);
1246         tcph->source = nhgets(h->tcpsport);
1247         tcph->dest = nhgets(h->tcpdport);
1248         tcph->seq = nhgetl(h->tcpseq);
1249         tcph->ack = nhgetl(h->tcpack);
1250
1251         hdrlen = (h->tcpflag[0]>>2) & ~3;
1252         if(hdrlen < TCP4_HDRSIZE) {
1253                 freeblist(*bpp);
1254                 return -1;
1255         }
1256
1257         tcph->flags = h->tcpflag[1];
1258         tcph->wnd = nhgets(h->tcpwin);
1259         tcph->urg = nhgets(h->tcpurg);
1260         tcph->mss = 0;
1261         tcph->ws = 0;
1262         tcph->update = 0;
1263         tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1264
1265         *bpp = pullupblock(*bpp, hdrlen+TCP4_PKT);
1266         if(*bpp == nil)
1267                 return -1;
1268
1269         optr = h->tcpopt;
1270         n = hdrlen - TCP4_HDRSIZE;
1271         while(n > 0 && *optr != EOLOPT) {
1272                 if(*optr == NOOPOPT) {
1273                         n--;
1274                         optr++;
1275                         continue;
1276                 }
1277                 optlen = optr[1];
1278                 if(optlen < 2 || optlen > n)
1279                         break;
1280                 switch(*optr) {
1281                 case MSSOPT:
1282                         if(optlen == MSS_LENGTH)
1283                                 tcph->mss = nhgets(optr+2);
1284                         break;
1285                 case WSOPT:
1286                         if(optlen == WS_LENGTH && *(optr+2) <= 14)
1287                                 tcph->ws = *(optr+2);
1288                         break;
1289                 }
1290                 n -= optlen;
1291                 optr += optlen;
1292         }
1293         return hdrlen;
1294 }
1295
1296 /*
1297  *  For outgoing calls, generate an initial sequence
1298  *  number and put a SYN on the send queue
1299  */
1300 static void
1301 tcpsndsyn(Conv *s, Tcpctl *tcb)
1302 {
1303         Tcppriv *tpriv;
1304
1305         tcb->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1306         tcb->rttseq = tcb->iss;
1307         tcb->snd.wl2 = tcb->iss;
1308         tcb->snd.una = tcb->iss;
1309         tcb->snd.rxt = tcb->iss;
1310         tcb->snd.ptr = tcb->rttseq;
1311         tcb->snd.nxt = tcb->rttseq;
1312         tcb->flgcnt++;
1313         tcb->flags |= FORCE;
1314         tcb->sndsyntime = NOW;
1315
1316         /* set desired mss and scale */
1317         tcb->mss = tcpmtu(v6lookup(s->p->f, s->raddr, s), s->ipversion, &tcb->scale);
1318         tpriv = s->p->priv;
1319         tpriv->stats[Mss] = tcb->mss;
1320 }
1321
1322 void
1323 sndrst(Proto *tcp, uchar *source, uchar *dest, ushort length, Tcp *seg, uchar version, char *reason)
1324 {
1325         Block *hbp;
1326         uchar rflags;
1327         Tcppriv *tpriv;
1328         Tcp4hdr ph4;
1329         Tcp6hdr ph6;
1330
1331         netlog(tcp->f, Logtcp, "sndrst: %s\n", reason);
1332
1333         tpriv = tcp->priv;
1334
1335         if(seg->flags & RST)
1336                 return;
1337
1338         /* make pseudo header */
1339         switch(version) {
1340         case V4:
1341                 memset(&ph4, 0, sizeof(ph4));
1342                 ph4.vihl = IP_VER4;
1343                 v6tov4(ph4.tcpsrc, dest);
1344                 v6tov4(ph4.tcpdst, source);
1345                 ph4.proto = IP_TCPPROTO;
1346                 hnputs(ph4.tcplen, TCP4_HDRSIZE);
1347                 hnputs(ph4.tcpsport, seg->dest);
1348                 hnputs(ph4.tcpdport, seg->source);
1349                 break;
1350         case V6:
1351                 memset(&ph6, 0, sizeof(ph6));
1352                 ph6.vcf[0] = IP_VER6;
1353                 ipmove(ph6.tcpsrc, dest);
1354                 ipmove(ph6.tcpdst, source);
1355                 ph6.proto = IP_TCPPROTO;
1356                 hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1357                 hnputs(ph6.tcpsport, seg->dest);
1358                 hnputs(ph6.tcpdport, seg->source);
1359                 break;
1360         default:
1361                 panic("sndrst: version %d", version);
1362         }
1363
1364         tpriv->stats[OutRsts]++;
1365         rflags = RST;
1366
1367         /* convince the other end that this reset is in band */
1368         if(seg->flags & ACK) {
1369                 seg->seq = seg->ack;
1370                 seg->ack = 0;
1371         }
1372         else {
1373                 rflags |= ACK;
1374                 seg->ack = seg->seq;
1375                 seg->seq = 0;
1376                 if(seg->flags & SYN)
1377                         seg->ack++;
1378                 seg->ack += length;
1379                 if(seg->flags & FIN)
1380                         seg->ack++;
1381         }
1382         seg->flags = rflags;
1383         seg->wnd = 0;
1384         seg->urg = 0;
1385         seg->mss = 0;
1386         seg->ws = 0;
1387         switch(version) {
1388         case V4:
1389                 hbp = htontcp4(seg, nil, &ph4, nil);
1390                 if(hbp == nil)
1391                         return;
1392                 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1393                 break;
1394         case V6:
1395                 hbp = htontcp6(seg, nil, &ph6, nil);
1396                 if(hbp == nil)
1397                         return;
1398                 ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1399                 break;
1400         default:
1401                 panic("sndrst2: version %d", version);
1402         }
1403 }
1404
1405 /*
1406  *  send a reset to the remote side and close the conversation
1407  *  called with s qlocked
1408  */
1409 static char*
1410 tcphangup(Conv *s)
1411 {
1412         Tcp seg;
1413         Tcpctl *tcb;
1414         Block *hbp;
1415
1416         tcb = (Tcpctl*)s->ptcl;
1417         if(waserror())
1418                 return commonerror();
1419         if(ipcmp(s->raddr, IPnoaddr) != 0) {
1420                 if(!waserror()){
1421                         memset(&seg, 0, sizeof seg);
1422                         seg.flags = RST | ACK;
1423                         seg.ack = tcb->rcv.nxt;
1424                         tcb->rcv.ackptr = seg.ack;
1425                         seg.seq = tcb->snd.ptr;
1426                         seg.wnd = 0;
1427                         seg.urg = 0;
1428                         seg.mss = 0;
1429                         seg.ws = 0;
1430                         switch(s->ipversion) {
1431                         case V4:
1432                                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
1433                                 hbp = htontcp4(&seg, nil, &tcb->protohdr.tcp4hdr, tcb);
1434                                 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
1435                                 break;
1436                         case V6:
1437                                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
1438                                 hbp = htontcp6(&seg, nil, &tcb->protohdr.tcp6hdr, tcb);
1439                                 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
1440                                 break;
1441                         default:
1442                                 panic("tcphangup: version %d", s->ipversion);
1443                         }
1444                         poperror();
1445                 }
1446         }
1447         localclose(s, nil);
1448         poperror();
1449         return nil;
1450 }
1451
1452 /*
1453  *  (re)send a SYN ACK
1454  */
1455 static int
1456 sndsynack(Proto *tcp, Limbo *lp)
1457 {
1458         Block *hbp;
1459         Tcp4hdr ph4;
1460         Tcp6hdr ph6;
1461         Tcp seg;
1462         uint scale;
1463
1464         /* make pseudo header */
1465         switch(lp->version) {
1466         case V4:
1467                 memset(&ph4, 0, sizeof(ph4));
1468                 ph4.vihl = IP_VER4;
1469                 v6tov4(ph4.tcpsrc, lp->laddr);
1470                 v6tov4(ph4.tcpdst, lp->raddr);
1471                 ph4.proto = IP_TCPPROTO;
1472                 hnputs(ph4.tcplen, TCP4_HDRSIZE);
1473                 hnputs(ph4.tcpsport, lp->lport);
1474                 hnputs(ph4.tcpdport, lp->rport);
1475                 break;
1476         case V6:
1477                 memset(&ph6, 0, sizeof(ph6));
1478                 ph6.vcf[0] = IP_VER6;
1479                 ipmove(ph6.tcpsrc, lp->laddr);
1480                 ipmove(ph6.tcpdst, lp->raddr);
1481                 ph6.proto = IP_TCPPROTO;
1482                 hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1483                 hnputs(ph6.tcpsport, lp->lport);
1484                 hnputs(ph6.tcpdport, lp->rport);
1485                 break;
1486         default:
1487                 panic("sndrst: version %d", lp->version);
1488         }
1489
1490         memset(&seg, 0, sizeof seg);
1491         seg.seq = lp->iss;
1492         seg.ack = lp->irs+1;
1493         seg.flags = SYN|ACK;
1494         seg.urg = 0;
1495         seg.mss = tcpmtu(v6lookup(tcp->f, lp->raddr, nil), lp->version, &scale);
1496         seg.wnd = QMAX;
1497
1498         /* if the other side set scale, we should too */
1499         if(lp->rcvscale){
1500                 seg.ws = scale;
1501                 lp->sndscale = scale;
1502         } else {
1503                 seg.ws = 0;
1504                 lp->sndscale = 0;
1505         }
1506
1507         switch(lp->version) {
1508         case V4:
1509                 hbp = htontcp4(&seg, nil, &ph4, nil);
1510                 if(hbp == nil)
1511                         return -1;
1512                 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1513                 break;
1514         case V6:
1515                 hbp = htontcp6(&seg, nil, &ph6, nil);
1516                 if(hbp == nil)
1517                         return -1;
1518                 ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1519                 break;
1520         default:
1521                 panic("sndsnack: version %d", lp->version);
1522         }
1523         lp->lastsend = NOW;
1524         return 0;
1525 }
1526
/* limbo hash: last two bytes of the address plus the port, masked to the table size */
#define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + (p) )&LHTMASK )
1528
1529 /*
1530  *  put a call into limbo and respond with a SYN ACK
1531  *
1532  *  called with proto locked
1533  */
1534 static void
1535 limbo(Conv *s, uchar *source, uchar *dest, Tcp *seg, int version)
1536 {
1537         Limbo *lp, **l;
1538         Tcppriv *tpriv;
1539         int h;
1540
1541         tpriv = s->p->priv;
1542         h = hashipa(source, seg->source);
1543
1544         for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1545                 lp = *l;
1546                 if(lp->lport != seg->dest || lp->rport != seg->source || lp->version != version)
1547                         continue;
1548                 if(ipcmp(lp->raddr, source) != 0)
1549                         continue;
1550                 if(ipcmp(lp->laddr, dest) != 0)
1551                         continue;
1552
1553                 /* each new SYN restarts the retransmits */
1554                 lp->irs = seg->seq;
1555                 break;
1556         }
1557         lp = *l;
1558         if(lp == nil){
1559                 if(tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]){
1560                         lp = tpriv->lht[h];
1561                         tpriv->lht[h] = lp->next;
1562                         lp->next = nil;
1563                 } else {
1564                         lp = malloc(sizeof(*lp));
1565                         if(lp == nil)
1566                                 return;
1567                         tpriv->nlimbo++;
1568                 }
1569                 *l = lp;
1570                 lp->version = version;
1571                 ipmove(lp->laddr, dest);
1572                 ipmove(lp->raddr, source);
1573                 lp->lport = seg->dest;
1574                 lp->rport = seg->source;
1575                 lp->mss = seg->mss;
1576                 lp->rcvscale = seg->ws;
1577                 lp->irs = seg->seq;
1578                 lp->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1579         }
1580
1581         if(sndsynack(s->p, lp) < 0){
1582                 *l = lp->next;
1583                 tpriv->nlimbo--;
1584                 free(lp);
1585         }
1586 }
1587
1588 /*
1589  *  resend SYN ACK's once every SYNACK_RXTIMER ms.
1590  */
1591 static void
1592 limborexmit(Proto *tcp)
1593 {
1594         Tcppriv *tpriv;
1595         Limbo **l, *lp;
1596         int h;
1597         int seen;
1598         ulong now;
1599
1600         tpriv = tcp->priv;
1601
1602         if(!canqlock(tcp))
1603                 return;
1604         seen = 0;
1605         now = NOW;
1606         for(h = 0; h < NLHT && seen < tpriv->nlimbo; h++){
1607                 for(l = &tpriv->lht[h]; *l != nil && seen < tpriv->nlimbo; ){
1608                         lp = *l;
1609                         seen++;
1610                         if(now - lp->lastsend < (lp->rexmits+1)*SYNACK_RXTIMER)
1611                                 continue;
1612
1613                         /* time it out after 1 second */
1614                         if(++(lp->rexmits) > 5){
1615                                 tpriv->nlimbo--;
1616                                 *l = lp->next;
1617                                 free(lp);
1618                                 continue;
1619                         }
1620
1621                         /* if we're being attacked, don't bother resending SYN ACK's */
1622                         if(tpriv->nlimbo > 100)
1623                                 continue;
1624
1625                         if(sndsynack(tcp, lp) < 0){
1626                                 tpriv->nlimbo--;
1627                                 *l = lp->next;
1628                                 free(lp);
1629                                 continue;
1630                         }
1631
1632                         l = &lp->next;
1633                 }
1634         }
1635         qunlock(tcp);
1636 }
1637
1638 /*
1639  *  lookup call in limbo.  if found, throw it out.
1640  *
1641  *  called with proto locked
1642  */
1643 static void
1644 limborst(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1645 {
1646         Limbo *lp, **l;
1647         int h;
1648         Tcppriv *tpriv;
1649
1650         tpriv = s->p->priv;
1651
1652         /* find a call in limbo */
1653         h = hashipa(src, segp->source);
1654         for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1655                 lp = *l;
1656                 if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1657                         continue;
1658                 if(ipcmp(lp->laddr, dst) != 0)
1659                         continue;
1660                 if(ipcmp(lp->raddr, src) != 0)
1661                         continue;
1662
1663                 /* RST can only follow the SYN */
1664                 if(segp->seq == lp->irs+1){
1665                         tpriv->nlimbo--;
1666                         *l = lp->next;
1667                         free(lp);
1668                 }
1669                 break;
1670         }
1671 }
1672
1673 static void
1674 initialwindow(Tcpctl *tcb)
1675 {
1676         /* RFC 3390 initial window */
1677         if(tcb->mss < 1095)
1678                 tcb->cwind = 4*tcb->mss;
1679         else if(tcb->mss < 2190)
1680                 tcb->cwind = 4380;
1681         else
1682                 tcb->cwind = 2*tcb->mss;
1683 }
1684
1685 /*
1686  *  come here when we finally get an ACK to our SYN-ACK.
1687  *  lookup call in limbo.  if found, create a new conversation
1688  *
1689  *  called with proto locked
1690  */
1691 static Conv*
1692 tcpincoming(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1693 {
1694         Conv *new;
1695         Tcpctl *tcb;
1696         Tcppriv *tpriv;
1697         Tcp4hdr *h4;
1698         Tcp6hdr *h6;
1699         Limbo *lp, **l;
1700         int h;
1701
1702         /* unless it's just an ack, it can't be someone coming out of limbo */
1703         if((segp->flags & SYN) || (segp->flags & ACK) == 0)
1704                 return nil;
1705
1706         tpriv = s->p->priv;
1707
1708         /* find a call in limbo */
1709         h = hashipa(src, segp->source);
1710         for(l = &tpriv->lht[h]; (lp = *l) != nil; l = &lp->next){
1711                 netlog(s->p->f, Logtcp, "tcpincoming s %I!%ud/%I!%ud d %I!%ud/%I!%ud v %d/%d\n",
1712                         src, segp->source, lp->raddr, lp->rport,
1713                         dst, segp->dest, lp->laddr, lp->lport,
1714                         version, lp->version
1715                 );
1716
1717                 if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1718                         continue;
1719                 if(ipcmp(lp->laddr, dst) != 0)
1720                         continue;
1721                 if(ipcmp(lp->raddr, src) != 0)
1722                         continue;
1723
1724                 /* we're assuming no data with the initial SYN */
1725                 if(segp->seq != lp->irs+1 || segp->ack != lp->iss+1){
1726                         netlog(s->p->f, Logtcp, "tcpincoming s %lux/%lux a %lux %lux\n",
1727                                 segp->seq, lp->irs+1, segp->ack, lp->iss+1);
1728                         lp = nil;
1729                 } else {
1730                         tpriv->nlimbo--;
1731                         *l = lp->next;
1732                 }
1733                 break;
1734         }
1735         if(lp == nil)
1736                 return nil;
1737
1738         new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
1739         if(new == nil)
1740                 return nil;
1741
1742         memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
1743         tcb = (Tcpctl*)new->ptcl;
1744         tcb->flags &= ~CLONE;
1745         tcb->timer.arg = new;
1746         tcb->timer.state = TcptimerOFF;
1747         tcb->acktimer.arg = new;
1748         tcb->acktimer.state = TcptimerOFF;
1749         tcb->katimer.arg = new;
1750         tcb->katimer.state = TcptimerOFF;
1751         tcb->rtt_timer.arg = new;
1752         tcb->rtt_timer.state = TcptimerOFF;
1753
1754         tcb->irs = lp->irs;
1755         tcb->rcv.nxt = tcb->irs+1;
1756         tcb->rcv.wptr = tcb->rcv.nxt;
1757         tcb->rcv.wsnt = 0;
1758         tcb->rcv.urg = tcb->rcv.nxt;
1759
1760         tcb->iss = lp->iss;
1761         tcb->rttseq = tcb->iss;
1762         tcb->snd.wl2 = tcb->iss;
1763         tcb->snd.una = tcb->iss+1;
1764         tcb->snd.ptr = tcb->iss+1;
1765         tcb->snd.nxt = tcb->iss+1;
1766         tcb->snd.rxt = tcb->iss+1;
1767         tcb->flgcnt = 0;
1768         tcb->flags |= SYNACK;
1769
1770         /* set desired mss and scale */
1771         tcb->mss = tcpmtu(v6lookup(s->p->f, src, s), version, &tcb->scale);
1772
1773         /* our sending max segment size cannot be bigger than what he asked for */
1774         if(lp->mss != 0 && lp->mss < tcb->mss)
1775                 tcb->mss = lp->mss;
1776         tpriv->stats[Mss] = tcb->mss;
1777
1778         /* window scaling */
1779         tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
1780
1781         /* congestion window */
1782         tcb->snd.wnd = segp->wnd;
1783         initialwindow(tcb);
1784
1785         /* set initial round trip time */
1786         tcb->sndsyntime = lp->lastsend+lp->rexmits*SYNACK_RXTIMER;
1787         tcpsynackrtt(new);
1788
1789         free(lp);
1790
1791         /* set up proto header */
1792         switch(version){
1793         case V4:
1794                 h4 = &tcb->protohdr.tcp4hdr;
1795                 memset(h4, 0, sizeof(*h4));
1796                 h4->proto = IP_TCPPROTO;
1797                 hnputs(h4->tcpsport, new->lport);
1798                 hnputs(h4->tcpdport, new->rport);
1799                 v6tov4(h4->tcpsrc, dst);
1800                 v6tov4(h4->tcpdst, src);
1801                 break;
1802         case V6:
1803                 h6 = &tcb->protohdr.tcp6hdr;
1804                 memset(h6, 0, sizeof(*h6));
1805                 h6->proto = IP_TCPPROTO;
1806                 hnputs(h6->tcpsport, new->lport);
1807                 hnputs(h6->tcpdport, new->rport);
1808                 ipmove(h6->tcpsrc, dst);
1809                 ipmove(h6->tcpdst, src);
1810                 break;
1811         default:
1812                 panic("tcpincoming: version %d", new->ipversion);
1813         }
1814
1815         tcpsetstate(new, Established);
1816
1817         iphtadd(&tpriv->ht, new);
1818
1819         return new;
1820 }
1821
/*
 *  Return 1 if x lies in the inclusive sequence range [low, high],
 *  where the range may wrap past the top of the number space
 *  (low > high), else 0.
 */
static int
seq_within(ulong x, ulong low, ulong high)
{
	if(low > high)		/* wrapped range */
		return x >= low || x <= high;
	return low <= x && x <= high;
}
1835
/* x precedes y: the difference x-y, taken as a signed int, is negative */
static int
seq_lt(ulong x, ulong y)
{
	int d;

	d = (int)(x - y);
	return d < 0;
}
1841
/* x precedes or equals y: the difference x-y, taken as a signed int, is not positive */
static int
seq_le(ulong x, ulong y)
{
	int d;

	d = (int)(x - y);
	return d <= 0;
}
1847
/* x follows y: the difference x-y, taken as a signed int, is positive */
static int
seq_gt(ulong x, ulong y)
{
	int d;

	d = (int)(x - y);
	return d > 0;
}
1853
/* x follows or equals y: the difference x-y, taken as a signed int, is not negative */
static int
seq_ge(ulong x, ulong y)
{
	int d;

	d = (int)(x - y);
	return d >= 0;
}
1859
1860 /*
1861  *  use the time between the first SYN and it's ack as the
1862  *  initial round trip time
1863  */
1864 static void
1865 tcpsynackrtt(Conv *s)
1866 {
1867         Tcpctl *tcb;
1868         int delta;
1869         Tcppriv *tpriv;
1870
1871         tcb = (Tcpctl*)s->ptcl;
1872         tpriv = s->p->priv;
1873
1874         delta = NOW - tcb->sndsyntime;
1875         tcb->srtt = delta<<LOGAGAIN;
1876         tcb->mdev = delta<<LOGDGAIN;
1877
1878         /* halt round trip timer */
1879         tcphalt(tpriv, &tcb->rtt_timer);
1880 }
1881
/*
 *  Process the acknowledgement and window fields of seg, a segment
 *  received on conversation s.
 *
 *  seg->update is set on the first call for a segment, so further
 *  calls with the same segment return immediately.  Otherwise, in
 *  order: handle zero-window updates and duplicate acks (newreno
 *  fast retransmit), update the send window, and, when the ack
 *  advances snd.una, adjust cwind, take a round trip sample, discard
 *  the acked bytes from s->wq and restart or halt tcb->timer.
 */
1882 static void
1883 update(Conv *s, Tcp *seg)
1884 {
1885         int rtt, delta;
1886         Tcpctl *tcb;
1887         ulong acked;
1888         Tcppriv *tpriv;
1889
        /* a segment is only processed once; later calls for it do nothing */
1890         if(seg->update)
1891                 return;
1892         seg->update = 1;
1893
1894         tpriv = s->p->priv;
1895         tcb = (Tcpctl*)s->ptcl;
1896
1897         /* catch zero-window updates, update window & recover */
        /* (our view of the window is 0, the segment opens it, and its ack is behind snd.ptr) */
1898         if(tcb->snd.wnd == 0 && seg->wnd > 0)
1899         if(seq_lt(seg->ack,  tcb->snd.ptr)){
1900                 netlog(s->p->f, Logtcp, "tcp: zwu ack %lud una %lud ptr %lud win %lud\n",
1901                         seg->ack,  tcb->snd.una, tcb->snd.ptr, seg->wnd);
1902                 tcb->snd.wnd = seg->wnd;
1903                 goto recovery;
1904         }
1905
1906         /* newreno fast retransmit */
        /*
         *  triggers on the third duplicate ack (ack == snd.una) while
         *  unacked data is outstanding (snd.una != snd.nxt).
         */
1907         if(seg->ack == tcb->snd.una)
1908         if(tcb->snd.una != tcb->snd.nxt)
1909         if(++tcb->snd.dupacks == 3){
1910 recovery:
                /* also entered by goto from the zero-window case above */
1911                 if(tcb->snd.recovery){
                        /* already in recovery: inflate cwind by one mss */
1912                         tpriv->stats[RecoveryCwind]++;
1913                         tcb->cwind += tcb->mss;
1914                 }else if(seq_le(tcb->snd.rxt, seg->ack)){
                        /*
                         *  enter recovery: remember snd.nxt in snd.rxt, call
                         *  tcpcongestion(), set cwind to ssthresh + 3 mss and
                         *  retransmit.
                         */
1915                         tpriv->stats[Recovery]++;
1916                         tcb->abcbytes = 0;
1917                         tcb->snd.recovery = 1;
1918                         tcb->snd.partialack = 0;
1919                         tcb->snd.rxt = tcb->snd.nxt;
1920                         tcpcongestion(tcb);
1921                         tcb->cwind = tcb->ssthresh + 3*tcb->mss;
1922                         netlog(s->p->f, Logtcpwin, "recovery inflate %ld ss %ld @%lud\n",
1923                                 tcb->cwind, tcb->ssthresh, tcb->snd.rxt);
1924                         tcprxmit(s);
1925                 }else{
1926                         tpriv->stats[RecoveryNoSeq]++;
1927                         netlog(s->p->f, Logtcpwin, "!recov %lud not ≤ %lud %ld\n",
1928                                 tcb->snd.rxt, seg->ack, tcb->snd.rxt - seg->ack);
1929                         /* do not enter fast retransmit */
1930                         /* do not change ssthresh */
1931                 }
        /*
         *  this else pairs with the innermost if above: a duplicate ack
         *  with data outstanding that is not the third one.  While in
         *  recovery it inflates cwind by one mss.
         */
1932         }else if(tcb->snd.recovery){
1933                 tpriv->stats[RecoveryCwind]++;
1934                 tcb->cwind += tcb->mss;
1935         }
1936
1937         /*
1938          *  update window
         *  (when the ack is newer than snd.wl2, or equal to it with a
         *  larger window; snd.wl2 records the ack used for the update)
1939          */
1940         if(seq_gt(seg->ack, tcb->snd.wl2)
1941         || (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)){
1942                 /* clear dupack if we advance wl2 */
1943                 if(tcb->snd.wl2 != seg->ack)
1944                         tcb->snd.dupacks = 0;
1945                 tcb->snd.wnd = seg->wnd;
1946                 tcb->snd.wl2 = seg->ack;
1947         }
1948
        /* the ack does not advance snd.una: nothing was newly acknowledged */
1949         if(!seq_gt(seg->ack, tcb->snd.una)){
1950                 /*
1951                  *  don't let us hangup if sending into a closed window and
1952                  *  we're still getting acks
1953                  */
1954                 if((tcb->flags&RETRAN) && tcb->snd.wnd == 0)
1955                         tcb->backedoff = MAXBACKMS/4;
1956                 return;
1957         }
1958
1959         /* amount of sequence space newly acknowledged by this segment */
1960         acked = seg->ack - tcb->snd.una;
1961
1962         /* avoid slow start and timers for SYN acks */
        /*
         *  first advancing ack (SYNACK not yet set): one unit is taken
         *  off acked and off flgcnt before skipping to done.
         *  NOTE(review): presumably that unit is the SYN's sequence
         *  number -- confirm.
         */
1963         if((tcb->flags & SYNACK) == 0) {
1964                 tcb->flags |= SYNACK;
1965                 acked--;
1966                 tcb->flgcnt--;
1967                 goto done;
1968         }
1969
1970         /*
1971          *  congestion control
1972          */
1973         if(tcb->snd.recovery){
1974                 if(seq_ge(seg->ack, tcb->snd.rxt)){
1975                         /* recovery finished; deflate window */
                        /* cwind becomes min(ssthresh, outstanding data + one mss) */
1976                         tpriv->stats[RecoveryDone]++;
1977                         tcb->snd.dupacks = 0;
1978                         tcb->snd.recovery = 0;
1979                         tcb->cwind = (tcb->snd.nxt - tcb->snd.una) + tcb->mss;
1980                         if(tcb->ssthresh < tcb->cwind)
1981                                 tcb->cwind = tcb->ssthresh;
1982                         netlog(s->p->f, Logtcpwin, "recovery deflate %ld %ld\n",
1983                                 tcb->cwind, tcb->ssthresh);
1984                 } else {
1985                         /* partial ack; we lost more than one segment */
                        /*
                         *  shrink cwind by the amount acked (falling back to one
                         *  mss if that would not leave it positive), then add
                         *  one mss back if at least a full mss was acked.
                         */
1986                         tpriv->stats[RecoveryPA]++;
1987                         if(tcb->cwind > acked)
1988                                 tcb->cwind -= acked;
1989                         else{
1990                                 netlog(s->p->f, Logtcpwin, "partial ack neg\n");
1991                                 tcb->cwind = tcb->mss;
1992                         }
1993                         netlog(s->p->f, Logtcpwin, "partial ack %ld left %ld cwind %ld\n",
1994                                 acked, tcb->snd.rxt - seg->ack, tcb->cwind);
1995
1996                         if(acked >= tcb->mss)
1997                                 tcb->cwind += tcb->mss;
1998                         tcb->snd.partialack++;
1999                 }
2000         } else
                /* not in recovery: hand the acked count to tcpabcincr() */
2001                 tcpabcincr(tcb, acked);
2002
2003         /* Adjust the timers according to the round trip time */
2004         /* todo: fix sloppy treatment of overflow cases here. */
        /*
         *  a sample is taken when the rtt timer is running and the ack
         *  covers rttseq, but only if RETRAN is clear.
         */
2005         if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
2006                 tcphalt(tpriv, &tcb->rtt_timer);
2007                 if((tcb->flags&RETRAN) == 0) {
2008                         tcb->backoff = 0;
2009                         tcb->backedoff = 0;
                        /* NOTE(review): presumably count runs down from start, making this the elapsed ticks -- confirm */
2010                         rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
2011                         if(rtt == 0)
2012                                 rtt = 1;        /* otherwise all close systems will rexmit in 0 time */
                        /* convert timer ticks to milliseconds */
2013                         rtt *= MSPTICK;
                        /* srtt and mdev are kept shifted left by LOGAGAIN and LOGDGAIN bits */
2014                         if(tcb->srtt == 0) {
                                /* no estimate yet: seed both from this sample */
2015                                 tcb->srtt = rtt << LOGAGAIN;
2016                                 tcb->mdev = rtt << LOGDGAIN;
2017                         } else {
                                /* move srtt and mdev toward the sample, keeping both at least 1 */
2018                                 delta = rtt - (tcb->srtt>>LOGAGAIN);
2019                                 tcb->srtt += delta;
2020                                 if(tcb->srtt <= 0)
2021                                         tcb->srtt = 1;
2022
2023                                 delta = abs(delta) - (tcb->mdev>>LOGDGAIN);
2024                                 tcb->mdev += delta;
2025                                 if(tcb->mdev <= 0)
2026                                         tcb->mdev = 1;
2027                         }
2028                         tcpsettimer(tcb);
2029                 }
2030         }
2031
2032 done:
        /*
         *  drop the acked bytes from the write queue; if the queue held
         *  fewer bytes than were acked, one unit is taken off flgcnt.
         *  NOTE(review): presumably that unit is a FIN -- confirm.
         */
2033         if(qdiscard(s->wq, acked) < acked)
2034                 tcb->flgcnt--;
2035         tcb->snd.una = seg->ack;
2036
2037         /* newreno fast recovery */
2038         if(tcb->snd.recovery)
2039                 tcprxmit(s);
2040
2041         if(seq_gt(seg->ack, tcb->snd.urg))
2042                 tcb->snd.urg = seg->ack;
2043
        /*
         *  data still outstanding: restart tcb->timer, except that during
         *  recovery this is done only for the first partial ack.  With
         *  nothing outstanding the timer is halted.
         */
2044         if(tcb->snd.una != tcb->snd.nxt){
2045                 /* “impatient” variant */
2046                 if(!tcb->snd.recovery || tcb->snd.partialack == 1){
2047                         tcb->time = NOW;
2048                         tcb->timeuna = tcb->snd.una;
2049                         tcpgo(tpriv, &tcb->timer);
2050                 }
2051         }
2052         else
2053                 tcphalt(tpriv, &tcb->timer);
2054
        /* never leave the send pointer behind the acknowledged point */
2055         if(seq_lt(tcb->snd.ptr, tcb->snd.una))
2056                 tcb->snd.ptr = tcb->snd.una;
2057
        /* RETRAN stays set while recovery is in progress; backoff state is always reset */
2058         if(!tcb->snd.recovery)
2059                 tcb->flags &= ~RETRAN;
2060         tcb->backoff = 0;
2061         tcb->backedoff = 0;
2062 }
2063
/*
 *  Receive one tcp segment, held in bp, for protocol tcp.
 *
 *  The checksum is verified (IPv4 or IPv6), the header is converted
 *  into seg by ntohtcp4()/ntohtcp6() and bp is trimmed to the length
 *  the datagram claims.  The conversation is then looked up under the
 *  protocol lock; if there is none a reset is sent.  For a conversation
 *  in Listen, an RST goes to limborst(), a SYN without ACK goes to
 *  limbo(), and anything else must be matched by tcpincoming().
 *
 *  The rest runs with the conversation locked: per-state handling of
 *  the ack, queuing of in-order data on s->rq, FIN handling and
 *  draining of the resequence queue.  Exits are through the output
 *  label (tcpoutput), the raise label (drop the segment and tcpkick),
 *  or a few direct returns.
 *
 *  NOTE(review): the returns taken when hdrlen < 0 and when trimblock()
 *  yields nil do not free bp here; presumably the callee has already
 *  disposed of it -- confirm.
 */
2064 static void
2065 tcpiput(Proto *tcp, Ipifc*, Block *bp)
2066 {
2067         Tcp seg;
2068         Tcp4hdr *h4;
2069         Tcp6hdr *h6;
2070         int hdrlen;
2071         Tcpctl *tcb;
2072         ushort length, csum;
2073         uchar source[IPaddrlen], dest[IPaddrlen];
2074         Conv *s;
2075         Fs *f;
2076         Tcppriv *tpriv;
2077         uchar version;
2078
2079         f = tcp->f;
2080         tpriv = tcp->priv;
2081
2082         tpriv->stats[InSegs]++;
2083
        /* the same bytes are viewed as either header layout; the version nibble picks one */
2084         h4 = (Tcp4hdr*)(bp->rp);
2085         h6 = (Tcp6hdr*)(bp->rp);
2086
2087         if((h4->vihl&0xF0)==IP_VER4) {
2088                 version = V4;
2089                 length = nhgets(h4->length);
2090                 v4tov6(dest, h4->tcpdst);
2091                 v4tov6(source, h4->tcpsrc);
2092
                /*
                 *  fill in the Unused and tcplen fields, then checksum.
                 *  The check is skipped when the block is flagged Btcpck or
                 *  when the checksum field in the packet is zero.
                 */
2093                 h4->Unused = 0;
2094                 hnputs(h4->tcplen, length-TCP4_PKT);
2095                 if(!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
2096                         ptclcsum(bp, TCP4_IPLEN, length-TCP4_IPLEN)) {
2097                         tpriv->stats[CsumErrs]++;
2098                         tpriv->stats[InErrs]++;
2099                         netlog(f, Logtcp, "bad tcp proto cksum\n");
2100                         freeblist(bp);
2101                         return;
2102                 }
2103
2104                 hdrlen = ntohtcp4(&seg, &bp);
2105                 if(hdrlen < 0){
2106                         tpriv->stats[HlenErrs]++;
2107                         tpriv->stats[InErrs]++;
2108                         netlog(f, Logtcp, "bad tcp hdr len\n");
2109                         return;
2110                 }
2111
2112                 /* trim the packet to the size claimed by the datagram */
                /* from here on length is the count of data bytes after the headers */
2113                 length -= hdrlen+TCP4_PKT;
2114                 bp = trimblock(bp, hdrlen+TCP4_PKT, length);
2115                 if(bp == nil){
2116                         tpriv->stats[LenErrs]++;
2117                         tpriv->stats[InErrs]++;
2118                         netlog(f, Logtcp, "tcp len < 0 after trim\n");
2119                         return;
2120                 }
2121         }
2122         else {
2123                 int ttl = h6->ttl;
2124                 int proto = h6->proto;
2125
2126                 version = V6;
2127                 length = nhgets(h6->ploadlen);
2128                 ipmove(dest, h6->tcpdst);
2129                 ipmove(source, h6->tcpsrc);
2130
                /*
                 *  the header is rewritten in place for the checksum:
                 *  ploadlen and proto are zeroed, proto is stored in the ttl
                 *  byte and the length in vcf.  ttl, proto and ploadlen are
                 *  put back once the checksum has passed.
                 */
2131                 h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
2132                 h6->ttl = proto;
2133                 hnputl(h6->vcf, length);
2134                 if((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
2135                     (csum = ptclcsum(bp, TCP6_IPLEN, length+TCP6_PHDRSIZE)) != 0) {
2136                         tpriv->stats[CsumErrs]++;
2137                         tpriv->stats[InErrs]++;
2138                         netlog(f, Logtcp,
2139                             "bad tcpv6 proto cksum: got %#ux, computed %#ux\n",
2140                                 h6->tcpcksum[0]<<8 | h6->tcpcksum[1], csum);
2141                         freeblist(bp);
2142                         return;
2143                 }
2144                 h6->ttl = ttl;
2145                 h6->proto = proto;
2146                 hnputs(h6->ploadlen, length);
2147
2148                 hdrlen = ntohtcp6(&seg, &bp);
2149                 if(hdrlen < 0){
2150                         tpriv->stats[HlenErrs]++;
2151                         tpriv->stats[InErrs]++;
2152                         netlog(f, Logtcp, "bad tcpv6 hdr len\n");
2153                         return;
2154                 }
2155
2156                 /* trim the packet to the size claimed by the datagram */
                /* from here on length is the count of data bytes after the tcp header */
2157                 length -= hdrlen;
2158                 bp = trimblock(bp, hdrlen+TCP6_PKT, length);
2159                 if(bp == nil){
2160                         tpriv->stats[LenErrs]++;
2161                         tpriv->stats[InErrs]++;
2162                         netlog(f, Logtcp, "tcpv6 len < 0 after trim\n");
2163                         return;
2164                 }
2165         }
2166
2167         /* lock protocol while searching for a conversation */
2168         qlock(tcp);
2169
2170         /* Look for a matching conversation */
2171         s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2172         if(s == nil){
2173                 netlog(f, Logtcp, "iphtlook(src %I!%d, dst %I!%d) failed\n",
2174                         source, seg.source, dest, seg.dest);
2175 reset:
                /* also reached by goto when tcpincoming() below finds no match */
2176                 qunlock(tcp);
2177                 sndrst(tcp, source, dest, length, &seg, version, "no conversation");
2178                 freeblist(bp);
2179                 return;
2180         }
2181
2182         /* if it's a listener, look for the right flags and get a new conv */
2183         tcb = (Tcpctl*)s->ptcl;
2184         if(tcb->state == Listen){
2185                 if(seg.flags & RST){
2186                         limborst(s, &seg, source, dest, version);
2187                         qunlock(tcp);
2188                         freeblist(bp);
2189                         return;
2190                 }
2191
2192                 /* if this is a new SYN, put the call into limbo */
2193                 if((seg.flags & SYN) && (seg.flags & ACK) == 0){
2194                         limbo(s, source, dest, &seg, version);
2195                         qunlock(tcp);
2196                         freeblist(bp);
2197                         return;
2198                 }
2199
2200                 /*
2201                  *  if there's a matching call in limbo, tcpincoming will
2202                  *  return a new conversation for it (tcpincoming sets its
                 *  state to Established before returning); nil means no match
2203                  */
2204                 s = tcpincoming(s, &seg, source, dest, version);
2205                 if(s == nil)
2206                         goto reset;
2207         }
2208
2209         /* The rest of the input state machine is run with the control block
2210          * locked and implements the state machine directly out of the RFC.
2211          * Out-of-band data is ignored - it was always a bad idea.
2212          */
2213         tcb = (Tcpctl*)s->ptcl;
        /* if an error is raised while s is locked, unlock it and pass the error on */
2214         if(waserror()){
2215                 qunlock(s);
2216                 nexterror();
2217         }
        /* take the conversation lock before letting go of the protocol lock */
2218         qlock(s);
2219         qunlock(tcp);
2220
2221         /* fix up window */
2222         seg.wnd <<= tcb->rcv.scale;
2223
2224         /* every input packet puts off the keep alive timeout */
2225         tcpsetkacounter(tcb);
2226
        /* states not listed here fall out of the switch to the general processing below */
2227         switch(tcb->state) {
2228         case Closed:
2229                 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2230                 goto raise;
2231         case Syn_sent:
                /* an ack must fall within iss+1 .. snd.nxt, else answer with a reset */
2232                 if(seg.flags & ACK) {
2233                         if(!seq_within(seg.ack, tcb->iss+1, tcb->snd.nxt)) {
2234                                 sndrst(tcp, source, dest, length, &seg, version,
2235                                          "bad seq in Syn_sent");
2236                                 goto raise;
2237                         }
2238                 }
                /* a reset closes the conversation only if it also carries an (acceptable) ack */
2239                 if(seg.flags & RST) {
2240                         if(seg.flags & ACK)
2241                                 localclose(s, Econrefused);
2242                         goto raise;
2243                 }
2244
2245                 if(seg.flags & SYN) {
2246                         procsyn(s, &seg);
2247                         if(seg.flags & ACK){
2248                                 update(s, &seg);
2249                                 tcpsynackrtt(s);
2250                                 tcpsetstate(s, Established);
2251                                 tcpsetscale(s, tcb, seg.ws, tcb->scale);
2252                         }
2253                         else {
2254                                 tcb->time = NOW;
2255                                 tcpsetstate(s, Syn_received);   /* DLP - shouldn't this be a reset? */
2256                         }
2257
                        /* data or a FIN came with the SYN: carry on with the processing below */
2258                         if(length != 0 || (seg.flags & FIN))
2259                                 break;
2260
2261                         freeblist(bp);
2262                         goto output;
2263                 }
2264                 else
2265                         freeblist(bp);
2266
                /* no SYN: the segment has been freed; leave without output */
2267                 qunlock(s);
2268                 poperror();
2269                 return;
2270         case Syn_received:
2271                 /* doesn't matter if it's the correct ack, we're just trying to set timing */
2272                 if(seg.flags & ACK)
2273                         tcpsynackrtt(s);
2274                 break;
2275         }
2276
2277         /*
2278          *  One DOS attack is to open connections to us and then forget about them,
2279          *  thereby tying up a conv at no long term cost to the attacker.
2280          *  This is an attempt to defeat these stateless DOS attacks.  See
2281          *  corresponding code in tcpsendka().
         *  The test is for an ack between snd.una-(1<<31) and snd.una-(1<<29);
         *  processing continues below after the localclose().
2282          */
2283         if(tcb->state != Syn_received && (seg.flags & RST) == 0){
2284                 if(tcpporthogdefense
2285                 && seq_within(seg.ack, tcb->snd.una-(1<<31), tcb->snd.una-(1<<29))){
2286                         print("stateless hog %I.%d->%I.%d f %ux %lux - %lux - %lux\n",
2287                                 source, seg.source, dest, seg.dest, seg.flags,
2288                                 tcb->snd.una-(1<<31), seg.ack, tcb->snd.una-(1<<29));
2289                         localclose(s, "stateless hog");
2290                 }
2291         }
2292
2293         /* Cut the data to fit the receive window */
2294         tcprcvwin(s);
2295         if(tcptrim(tcb, &seg, &bp, &length) == -1) {
                /*
                 *  tcptrim rejected the segment.  It is not logged when it
                 *  is a single byte at rcv.nxt-1.
                 *  NOTE(review): that looks like the shape of a keep alive
                 *  probe -- confirm.
                 */
2296                 if(seg.seq+1 != tcb->rcv.nxt || length != 1)
2297                 netlog(f, Logtcp, "tcp: trim: !inwind: seq %lud-%lud win %lud-%lud l %d from %I\n", 
2298                         seg.seq, seg.seq + length - 1, 
2299                         tcb->rcv.nxt, tcb->rcv.nxt + tcb->rcv.wnd-1, length, s->raddr);
                /* the ack and window fields are still processed */
2300                 update(s, &seg);
2301                 if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state == Closing) {
2302                         tcphalt(tpriv, &tcb->rtt_timer);
2303                         tcphalt(tpriv, &tcb->acktimer);
2304                         tcphalt(tpriv, &tcb->katimer);
2305                         tcpsetstate(s, Time_wait);
2306                         tcb->timer.start = MSL2*(1000 / MSPTICK);
2307                         tcpgo(tpriv, &tcb->timer);
2308                 }
                /* unless it was a reset, force an ack in reply */
2309                 if(!(seg.flags & RST)) {
2310                         tcb->flags |= FORCE;
2311                         goto output;
2312                 }
2313                 qunlock(s);
2314                 poperror();
2315                 return;
2316         }
2317
2318         /* Cannot accept so answer with a rst */
2319         if(length && tcb->state == Closed) {
2320                 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2321                 goto raise;
2322         }
2323
2324         /* The segment is beyond the current receive pointer so
2325          * queue the data in the resequence queue
2326          */
2327         if(seg.seq != tcb->rcv.nxt)
2328         if(length != 0 || (seg.flags & (SYN|FIN))) {
2329                 update(s, &seg);
2330                 if(addreseq(f, tcb, tpriv, &seg, bp, length) < 0)
2331                         print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr, s->lport);
2332                 tcb->flags |= FORCE;            /* force duplicate ack; RFC 5681 §3.2 */
2333                 goto output;
2334         }
2335
2336         if(tcb->nreseq > 0)
2337                 tcb->flags |= FORCE;            /* filled hole in sequence space; RFC 5681 §3.2 */
2338
2339         /*
2340          *  keep looping till we've processed this packet plus any
2341          *  adjacent packets in the resequence queue
2342          */
2343         for(;;) {
2344                 if(seg.flags & RST) {
2345                         if(tcb->state == Established) {
2346                                 tpriv->stats[EstabResets]++;
2347                                 if(tcb->rcv.nxt != seg.seq)
2348                                         print("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt %lux seq %lux\n", s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt, seg.seq);
2349                         }
2350                         localclose(s, Econrefused);
2351                         goto raise;
2352                 }
2353
                /* segments without an ack are dropped at this point */
2354                 if((seg.flags&ACK) == 0)
2355                         goto raise;
2356
                /* per-state handling of the ack */
2357                 switch(tcb->state) {
2358                 case Syn_received:
2359                         if(!seq_within(seg.ack, tcb->snd.una+1, tcb->snd.nxt)){
2360                                 sndrst(tcp, source, dest, length, &seg, version,
2361                                         "bad seq in Syn_received");
2362                                 goto raise;
2363                         }
2364                         update(s, &seg);
2365                         tcpsetstate(s, Established);
                        /* fall through; update() returns at once for a segment it has already seen */
2366                 case Established:
2367                 case Close_wait:
2368                         update(s, &seg);
2369                         break;
2370                 case Finwait1:
2371                         update(s, &seg);
                        /* everything we sent has been acked: move to Finwait2 and start katimer */
2372                         if(qlen(s->wq)+tcb->flgcnt == 0){
2373                                 tcphalt(tpriv, &tcb->rtt_timer);
2374                                 tcphalt(tpriv, &tcb->acktimer);
2375                                 tcpsetkacounter(tcb);
2376                                 tcb->time = NOW;
2377                                 tcpsetstate(s, Finwait2);
2378                                 tcb->katimer.start = MSL2 * (1000 / MSPTICK);
2379                                 tcpgo(tpriv, &tcb->katimer);
2380                         }
2381                         break;
2382                 case Finwait2:
2383                         update(s, &seg);
2384                         break;
2385                 case Closing:
2386                         update(s, &seg);
                        /* everything we sent has been acked: move to Time_wait and start the timer */
2387                         if(qlen(s->wq)+tcb->flgcnt == 0) {
2388                                 tcphalt(tpriv, &tcb->rtt_timer);
2389                                 tcphalt(tpriv, &tcb->acktimer);
2390                                 tcphalt(tpriv, &tcb->katimer);
2391                                 tcpsetstate(s, Time_wait);
2392                                 tcb->timer.start = MSL2*(1000 / MSPTICK);
2393                                 tcpgo(tpriv, &tcb->timer);
2394                         }
2395                         break;
2396                 case Last_ack:
2397                         update(s, &seg);
2398                         if(qlen(s->wq)+tcb->flgcnt == 0) {
2399                                 localclose(s, nil);
2400                                 goto raise;
2401                         }
                        /* fall through */
2402                 case Time_wait:
2403                         tcb->flags |= FORCE;
2404                         if(tcb->timer.state != TcptimerON)
2405                                 tcpgo(tpriv, &tcb->timer);
2406                 }
2407
                /*
                 *  urgent data: when the urgent pointer moves past rcv.urg,
                 *  record it and pull seg.urg bytes off the front of bp;
                 *  otherwise keep rcv.urg from falling behind rcv.nxt.
                 */
2408                 if((seg.flags&URG) && seg.urg) {
2409                         if(seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
2410                                 tcb->rcv.urg = seg.urg + seg.seq;
2411                                 pullblock(&bp, seg.urg);
2412                         }
2413                 }
2414                 else
2415                 if(seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
2416                         tcb->rcv.urg = tcb->rcv.nxt;
2417
                /* dispose of the segment text according to state */
2418                 if(length == 0) {
2419                         if(bp != nil)
2420                                 freeblist(bp);
2421                 }
2422                 else {
2423                         switch(tcb->state){
2424                         default:
2425                                 /* Ignore segment text */
2426                                 if(bp != nil)
2427                                         freeblist(bp);
2428                                 break;
2429
2430                         case Syn_received:
2431                         case Established:
2432                         case Finwait1:
2433                                 /* If we still have some data place on
2434                                  * receive queue
2435                                  */
2436                                 if(bp) {
2437                                         qpassnolim(s->rq, packblock(bp));
2438                                         bp = nil;
2439                                 }
2440                                 tcb->rcv.nxt += length;
2441
2442                                 /*
2443                                  *  turn on the acktimer if there's something
2444                                  *  to ack
2445                                  */
2446                                 if(tcb->acktimer.state != TcptimerON)
2447                                         tcpgo(tpriv, &tcb->acktimer);
2448
2449                                 break;
2450                         case Finwait2:
2451                                 /* no process to read the data, send a reset */
2452                                 if(bp != nil)
2453                                         freeblist(bp);
2454                                 sndrst(tcp, source, dest, length, &seg, version,
2455                                         "send to Finwait2");
2456                                 qunlock(s);
2457                                 poperror();
2458                                 return;
2459                         }
2460                 }
2461
                /* a FIN takes one sequence number (rcv.nxt++) in the states that accept it, and always forces an ack */
2462                 if(seg.flags & FIN) {
2463                         tcb->flags |= FORCE;
2464
2465                         switch(tcb->state) {
2466                         case Syn_received:
2467                         case Established:
2468                                 tcb->rcv.nxt++;
2469                                 tcpsetstate(s, Close_wait);
2470                                 break;
2471                         case Finwait1:
2472                                 tcb->rcv.nxt++;
                                /* Time_wait if nothing of ours is left unacked, else Closing */
2473                                 if(qlen(s->wq)+tcb->flgcnt == 0) {
2474                                         tcphalt(tpriv, &tcb->rtt_timer);
2475                                         tcphalt(tpriv, &tcb->acktimer);
2476                                         tcphalt(tpriv, &tcb->katimer);
2477                                         tcpsetstate(s, Time_wait);
2478                                         tcb->timer.start = MSL2*(1000/MSPTICK);
2479                                         tcpgo(tpriv, &tcb->timer);
2480                                 }
2481                                 else
2482                                         tcpsetstate(s, Closing);
2483                                 break;
2484                         case Finwait2:
2485                                 tcb->rcv.nxt++;
2486                                 tcphalt(tpriv, &tcb->rtt_timer);
2487                                 tcphalt(tpriv, &tcb->acktimer);
2488                                 tcphalt(tpriv, &tcb->katimer);
2489                                 tcpsetstate(s, Time_wait);
2490                                 tcb->timer.start = MSL2 * (1000/MSPTICK);
2491                                 tcpgo(tpriv, &tcb->timer);
2492                                 break;
2493                         case Close_wait:
2494                         case Closing:
2495                         case Last_ack:
2496                                 break;
2497                         case Time_wait:
2498                                 tcpgo(tpriv, &tcb->timer);
2499                                 break;
2500                         }
2501                 }
2502
2503                 /*
2504                  *  get next adjacent segment from the resequence queue.
2505                  *  dump/trim any overlapping segments
                 *  (leave through output when the queue is empty or its head
                 *  starts beyond rcv.nxt; a segment tcptrim accepts goes
                 *  round the outer loop again)
2506                  */
2507                 for(;;) {
2508                         if(tcb->reseq == nil)
2509                                 goto output;
2510
2511                         if(seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
2512                                 goto output;
2513
2514                         getreseq(tcb, &seg, &bp, &length);
2515
2516                         tcprcvwin(s);
2517                         if(tcptrim(tcb, &seg, &bp, &length) == 0){
2518                                 tcb->flags |= FORCE;
2519                                 break;
2520                         }
2521                 }
2522         }
2523 output:
        /* normal exit: run the output side, then release the conversation */
2524         tcpoutput(s);
2525         qunlock(s);
2526         poperror();
2527         return;
2528 raise:
        /* drop exit: release the conversation, free the segment and kick the conversation */
2529         qunlock(s);
2530         poperror();
2531         freeblist(bp);
2532         tcpkick(s);
2533 }
2534
2535 /*
2536  *  always enters and exits with the s locked.  We drop
2537  *  the lock to ipoput the packet so some care has to be
2538  *  taken by callers.
2539  */
2540 static void
2541 tcpoutput(Conv *s)
2542 {
2543         Tcp seg;
2544         uint msgs;
2545         Tcpctl *tcb;
2546         Block *hbp, *bp;
2547         int sndcnt;
2548         ulong ssize, dsize, sent;
2549         Fs *f;
2550         Tcppriv *tpriv;
2551         uchar version;
2552
2553         f = s->p->f;
2554         tpriv = s->p->priv;
2555         version = s->ipversion;
2556
2557         tcb = (Tcpctl*)s->ptcl;
2558
2559         /* force ack every 2*mss */
2560         if((tcb->flags & FORCE) == 0)
2561         if(tcb->rcv.nxt - tcb->rcv.ackptr >= 2*tcb->mss){
2562                 tpriv->stats[Delayack]++;
2563                 tcb->flags |= FORCE;
2564         }
2565
2566         /* force ack if window opening */
2567         if(0)
2568         if((tcb->flags & FORCE) == 0){
2569                 tcprcvwin(s);
2570                 if((int)(tcb->rcv.wptr - tcb->rcv.wsnt) >= 2*tcb->mss){
2571                         tpriv->stats[Wopenack]++;
2572                         tcb->flags |= FORCE;
2573                 }
2574         }
2575
2576         for(msgs = 0; msgs < 100; msgs++) {
2577                 switch(tcb->state) {
2578                 case Listen:
2579                 case Closed:
2580                 case Finwait2:
2581                         return;
2582                 }
2583
2584                 /* Don't send anything else until our SYN has been acked */
2585                 if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
2586                         break;
2587
2588                 /* force an ack when a window has opened up */
2589                 tcprcvwin(s);
2590                 if(tcb->rcv.blocked && tcb->rcv.wnd > 0){
2591                         tcb->rcv.blocked = 0;
2592                         tcb->flags |= FORCE;
2593                 }
2594
2595                 sndcnt = qlen(s->wq)+tcb->flgcnt;
2596                 sent = tcb->snd.ptr - tcb->snd.una;
2597                 ssize = sndcnt;
2598                 if(tcb->snd.wnd == 0){
2599                         /* zero window probe */
2600                         if(sent > 0)
2601                         if(!(tcb->flags & FORCE))
2602                                 break;  /* already probing, rto re-probes */
2603                         if(ssize < sent)
2604                                 ssize = 0;
2605                         else{
2606                                 ssize -= sent;
2607                                 if(ssize > 0)
2608                                         ssize = 1;
2609                         }
2610                 } else {
2611                         /* calculate usable segment size */
2612                         if(ssize > tcb->cwind)
2613                                 ssize = tcb->cwind;
2614                         if(ssize > tcb->snd.wnd)
2615                                 ssize = tcb->snd.wnd;
2616
2617                         if(ssize < sent)
2618                                 ssize = 0;
2619                         else {
2620                                 ssize -= sent;
2621                                 if(ssize > tcb->mss)
2622                                         ssize = tcb->mss;
2623                         }
2624                 }
2625
2626                 dsize = ssize;
2627                 seg.urg = 0;
2628
2629                 if(!(tcb->flags & FORCE)){
2630                         if(ssize == 0)
2631                                 break;
2632                         if(ssize < tcb->mss)
2633                         if(tcb->snd.nxt == tcb->snd.ptr)
2634                         if(sent > TCPREXMTTHRESH*tcb->mss)
2635                                 break;
2636                 }
2637
2638                 tcb->flags &= ~FORCE;
2639
2640                 /* By default we will generate an ack */
2641                 tcphalt(tpriv, &tcb->acktimer);
2642                 seg.source = s->lport;
2643                 seg.dest = s->rport;
2644                 seg.flags = ACK;
2645                 seg.mss = 0;
2646                 seg.ws = 0;
2647                 seg.update = 0;
2648                 switch(tcb->state){
2649                 case Syn_sent:
2650                         seg.flags = 0;
2651                         if(tcb->snd.ptr == tcb->iss){
2652                                 seg.flags |= SYN;
2653                                 dsize--;
2654                                 seg.mss = tcb->mss;
2655                                 seg.ws = tcb->scale;
2656                         }
2657                         break;
2658                 case Syn_received:
2659                         /*
2660                          *  don't send any data with a SYN/ACK packet
2661                          *  because Linux rejects the packet in its
2662                          *  attempt to solve the SYN attack problem
2663                          */
2664                         if(tcb->snd.ptr == tcb->iss){
2665                                 seg.flags |= SYN;
2666                                 dsize = 0;
2667                                 ssize = 1;
2668                                 seg.mss = tcb->mss;
2669                                 seg.ws = tcb->scale;
2670                         }
2671                         break;
2672                 }
2673                 seg.seq = tcb->snd.ptr;
2674                 seg.ack = tcb->rcv.nxt;
2675                 seg.wnd = tcb->rcv.wnd;
2676
2677                 /* Pull out data to send */
2678                 bp = nil;
2679                 if(dsize != 0) {
2680                         bp = qcopy(s->wq, dsize, sent);
2681                         if(BLEN(bp) != dsize) {
2682                                 seg.flags |= FIN;
2683                                 dsize--;
2684                         }
2685                 }
2686
2687                 if(sent+dsize == sndcnt && dsize)
2688                         seg.flags |= PSH;
2689
2690                 tcb->snd.ptr += ssize;
2691
2692                 /* Pull up the send pointer so we can accept acks
2693                  * for this window
2694                  */
2695                 if(seq_gt(tcb->snd.ptr,tcb->snd.nxt))
2696                         tcb->snd.nxt = tcb->snd.ptr;
2697
2698                 /* Build header, link data and compute cksum */
2699                 switch(version){
2700                 case V4:
2701                         tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2702                         hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
2703                         if(hbp == nil) {
2704                                 freeblist(bp);
2705                                 return;
2706                         }
2707                         break;
2708                 case V6:
2709                         tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2710                         hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
2711                         if(hbp == nil) {
2712                                 freeblist(bp);
2713                                 return;
2714                         }
2715                         break;
2716                 default:
2717                         hbp = nil;      /* to suppress a warning */
2718                         panic("tcpoutput: version %d", version);
2719                 }
2720
2721                 /* Start the transmission timers if there is new data and we
2722                  * expect acknowledges
2723                  */
2724                 if(ssize != 0){
2725                         if(tcb->timer.state != TcptimerON){
2726                                 tcb->time = NOW;
2727                                 tcb->timeuna = tcb->snd.una;
2728                                 tcpgo(tpriv, &tcb->timer);
2729                         }
2730
2731                         /*  If round trip timer isn't running, start it.
2732                          *  measure the longest packet only in case the
2733                          *  transmission time dominates RTT
2734                          */
2735                         if(tcb->snd.retransmit == 0)
2736                         if(tcb->rtt_timer.state != TcptimerON)
2737                         if(ssize == tcb->mss) {
2738                                 tcpgo(tpriv, &tcb->rtt_timer);
2739                                 tcb->rttseq = tcb->snd.ptr;
2740                         }
2741                 }
2742
2743                 tpriv->stats[OutSegs]++;
2744                 if(tcb->snd.retransmit)
2745                         tpriv->stats[RetransSegsSent]++;
2746                 tcb->rcv.ackptr = seg.ack;
2747                 tcb->rcv.wsnt = tcb->rcv.wptr;
2748
2749                 /* put off the next keep alive */
2750                 tcpgo(tpriv, &tcb->katimer);
2751
2752                 switch(version){
2753                 case V4:
2754                         if(ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0){
2755                                 /* a negative return means no route */
2756                                 localclose(s, "no route");
2757                         }
2758                         break;
2759                 case V6:
2760                         if(ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0){
2761                                 /* a negative return means no route */
2762                                 localclose(s, "no route");
2763                         }
2764                         break;
2765                 default:
2766                         panic("tcpoutput2: version %d", version);
2767                 }
2768                 if((msgs%4) == 3){
2769                         qunlock(s);
2770                         qlock(s);
2771                 }
2772         }
2773 }
2774
2775 /*
2776  *  the BSD convention (hack?) for keep alives.  resend last uchar acked.
2777  */
2778 static void
2779 tcpsendka(Conv *s)
2780 {
2781         Tcp seg;
2782         Tcpctl *tcb;
2783         Block *hbp,*dbp;
2784
2785         tcb = (Tcpctl*)s->ptcl;
2786
2787         dbp = nil;
2788         memset(&seg, 0, sizeof seg);
2789         seg.urg = 0;
2790         seg.source = s->lport;
2791         seg.dest = s->rport;
2792         seg.flags = ACK|PSH;
2793         seg.mss = 0;
2794         seg.ws = 0;
2795         if(tcpporthogdefense)
2796                 seg.seq = tcb->snd.una-(1<<30)-nrand(1<<20);
2797         else
2798                 seg.seq = tcb->snd.una-1;
2799         seg.ack = tcb->rcv.nxt;
2800         tcb->rcv.ackptr = seg.ack;
2801         tcprcvwin(s);
2802         seg.wnd = tcb->rcv.wnd;
2803         if(tcb->state == Finwait2){
2804                 seg.flags |= FIN;
2805         } else {
2806                 dbp = allocb(1);
2807                 dbp->wp++;
2808         }
2809
2810         if(isv4(s->raddr)) {
2811                 /* Build header, link data and compute cksum */
2812                 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2813                 hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
2814                 if(hbp == nil) {
2815                         freeblist(dbp);
2816                         return;
2817                 }
2818                 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
2819         }
2820         else {
2821                 /* Build header, link data and compute cksum */
2822                 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2823                 hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
2824                 if(hbp == nil) {
2825                         freeblist(dbp);
2826                         return;
2827                 }
2828                 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
2829         }
2830 }
2831
2832 /*
2833  *  set connection to time out after 12 minutes
2834  */
2835 static void
2836 tcpsetkacounter(Tcpctl *tcb)
2837 {
2838         tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start*MSPTICK);
2839         if(tcb->kacounter < 3)
2840                 tcb->kacounter = 3;
2841 }
2842
2843 /*
2844  *  if we've timed out, close the connection
2845  *  otherwise, send a keepalive and restart the timer
2846  */
2847 static void
2848 tcpkeepalive(void *v)
2849 {
2850         Tcpctl *tcb;
2851         Conv *s;
2852
2853         s = v;
2854         tcb = (Tcpctl*)s->ptcl;
2855         if(waserror()){
2856                 qunlock(s);
2857                 nexterror();
2858         }
2859         qlock(s);
2860         if(tcb->state != Closed){
2861                 if(--(tcb->kacounter) <= 0) {
2862                         localclose(s, Etimedout);
2863                 } else {
2864                         tcpsendka(s);
2865                         tcpgo(s->p->priv, &tcb->katimer);
2866                 }
2867         }
2868         qunlock(s);
2869         poperror();
2870 }
2871
2872 /*
2873  *  start keepalive timer
2874  */
2875 static char*
2876 tcpstartka(Conv *s, char **f, int n)
2877 {
2878         Tcpctl *tcb;
2879         int x;
2880
2881         tcb = (Tcpctl*)s->ptcl;
2882         if(tcb->state != Established)
2883                 return "connection must be in Establised state";
2884         if(n > 1){
2885                 x = atoi(f[1]);
2886                 if(x >= MSPTICK)
2887                         tcb->katimer.start = x/MSPTICK;
2888         }
2889         tcpsetkacounter(tcb);
2890         tcpgo(s->p->priv, &tcb->katimer);
2891
2892         return nil;
2893 }
2894
2895 /*
2896  *  turn checksums on/off
2897  */
2898 static char*
2899 tcpsetchecksum(Conv *s, char **f, int)
2900 {
2901         Tcpctl *tcb;
2902
2903         tcb = (Tcpctl*)s->ptcl;
2904         tcb->nochecksum = !atoi(f[1]);
2905
2906         return nil;
2907 }
2908
2909 /*
2910  *  retransmit (at most) one segment at snd.una.
2911  *  preserve cwind & snd.ptr
2912  */
2913 static void
2914 tcprxmit(Conv *s)
2915 {
2916         Tcpctl *tcb;
2917         Tcppriv *tpriv;
2918         ulong tcwind, tptr;
2919
2920         tcb = (Tcpctl*)s->ptcl;
2921         tcb->flags |= RETRAN|FORCE;
2922
2923         tptr = tcb->snd.ptr;
2924         tcwind = tcb->cwind;
2925         tcb->snd.ptr = tcb->snd.una;
2926         tcb->cwind = tcb->mss;
2927         tcb->snd.retransmit = 1;
2928         tcpoutput(s);
2929         tcb->snd.retransmit = 0;
2930         tcb->cwind = tcwind;
2931         tcb->snd.ptr = tptr;
2932
2933         tpriv = s->p->priv;
2934         tpriv->stats[RetransSegs]++;
2935 }
2936
2937 /*
2938  *  todo: RFC 4138 F-RTO
2939  */
2940 static void
2941 tcptimeout(void *arg)
2942 {
2943         Conv *s;
2944         Tcpctl *tcb;
2945         int maxback;
2946         Tcppriv *tpriv;
2947
2948         s = (Conv*)arg;
2949         tpriv = s->p->priv;
2950         tcb = (Tcpctl*)s->ptcl;
2951
2952         if(waserror()){
2953                 qunlock(s);
2954                 nexterror();
2955         }
2956         qlock(s);
2957         switch(tcb->state){
2958         default:
2959                 tcb->backoff++;
2960                 if(tcb->state == Syn_sent)
2961                         maxback = MAXBACKMS/2;
2962                 else
2963                         maxback = MAXBACKMS;
2964                 tcb->backedoff += tcb->timer.start * MSPTICK;
2965                 if(tcb->backedoff >= maxback) {
2966                         localclose(s, Etimedout);
2967                         break;
2968                 }
2969                 netlog(s->p->f, Logtcprxmt, "rxm %d/%d %ldms %lud rto %d %lud %s\n",
2970                         tcb->srtt, tcb->mdev, NOW-tcb->time,
2971                         tcb->snd.una-tcb->timeuna, tcb->snd.rto, tcb->snd.ptr,
2972                         tcpstates[s->state]);
2973                 tcpsettimer(tcb);
2974                 if(tcb->snd.rto == 0)
2975                         tcpcongestion(tcb);
2976                 tcprxmit(s);
2977                 tcb->snd.ptr = tcb->snd.una;
2978                 tcb->cwind = tcb->mss;
2979                 tcb->snd.rto = 1;
2980                 tpriv->stats[RetransTimeouts]++;
2981
2982                 if(tcb->snd.recovery){
2983                         tcb->snd.dupacks = 0;                   /* reno rto */
2984                         tcb->snd.recovery = 0;
2985                         tpriv->stats[RecoveryRTO]++;
2986                         tcb->snd.rxt = tcb->snd.nxt;
2987                         netlog(s->p->f, Logtcpwin,
2988                                 "rto recovery rxt @%lud\n", tcb->snd.nxt);
2989                 }
2990
2991                 tcb->abcbytes = 0;
2992                 break;
2993         case Time_wait:
2994                 localclose(s, nil);
2995                 break;
2996         case Closed:
2997                 break;
2998         }
2999         qunlock(s);
3000         poperror();
3001 }
3002
3003 static int
3004 inwindow(Tcpctl *tcb, int seq)
3005 {
3006         return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt+tcb->rcv.wnd-1);
3007 }
3008
3009 /*
3010  *  set up state for a received SYN (or SYN ACK) packet
3011  */
3012 static void
3013 procsyn(Conv *s, Tcp *seg)
3014 {
3015         Tcpctl *tcb;
3016         Tcppriv *tpriv;
3017
3018         tcb = (Tcpctl*)s->ptcl;
3019         tcb->flags |= FORCE;
3020
3021         tcb->rcv.nxt = seg->seq + 1;
3022         tcb->rcv.wptr = tcb->rcv.nxt;
3023         tcb->rcv.wsnt = 0;
3024         tcb->rcv.urg = tcb->rcv.nxt;
3025         tcb->irs = seg->seq;
3026
3027         /* our sending max segment size cannot be bigger than what he asked for */
3028         if(seg->mss != 0 && seg->mss < tcb->mss) {
3029                 tcb->mss = seg->mss;
3030                 tpriv = s->p->priv;
3031                 tpriv->stats[Mss] = tcb->mss;
3032         }
3033
3034         tcb->snd.wnd = seg->wnd;
3035         initialwindow(tcb);
3036 }
3037
3038 static int
3039 dumpreseq(Tcpctl *tcb)
3040 {
3041         Reseq *r, *next;
3042
3043         for(r = tcb->reseq; r != nil; r = next){
3044                 next = r->next;
3045                 freeblist(r->bp);
3046                 free(r);
3047         }
3048         tcb->reseq = nil;
3049         tcb->nreseq = 0;
3050         tcb->reseqlen = 0;
3051         return -1;
3052 }
3053
3054 static void
3055 logreseq(Fs *f, Reseq *r, ulong n)
3056 {
3057         char *s;
3058
3059         for(; r != nil; r = r->next){
3060                 s = nil;
3061                 if(r->next == nil && r->seg.seq != n)
3062                         s = "hole/end";
3063                 else if(r->next == nil)
3064                         s = "end";
3065                 else if(r->seg.seq != n)
3066                         s = "hole";
3067                 if(s != nil)
3068                         netlog(f, Logtcp, "%s %lud-%lud (%ld) %#ux\n", s,
3069                                 n, r->seg.seq, r->seg.seq-n, r->seg.flags);
3070                 n = r->seg.seq + r->seg.len;
3071         }
3072 }
3073
3074 static int
3075 addreseq(Fs *f, Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length)
3076 {
3077         Reseq *rp, **rr;
3078         int qmax;
3079
3080         rp = malloc(sizeof(Reseq));
3081         if(rp == nil){
3082                 freeblist(bp);  /* bp always consumed by addreseq */
3083                 return 0;
3084         }
3085
3086         rp->seg = *seg;
3087         rp->bp = bp;
3088         rp->length = length;
3089
3090         tcb->reseqlen += length;
3091         tcb->nreseq++;
3092
3093         /* Place on reassembly list sorting by starting seq number */
3094         for(rr = &tcb->reseq;; rr = &(*rr)->next)
3095                 if(*rr == nil || seq_lt(seg->seq, (*rr)->seg.seq)){
3096                         rp->next = *rr;
3097                         *rr = rp;
3098                         tpriv->stats[Resequenced]++;
3099                         if(rp->next != nil)
3100                                 tpriv->stats[OutOfOrder]++;
3101                         break;
3102                 }
3103
3104         qmax = tcb->window;
3105         if(tcb->reseqlen > qmax){
3106                 netlog(f, Logtcp, "tcp: reseq: queue > window: %d > %d; %d packets\n", tcb->reseqlen, qmax, tcb->nreseq);
3107                 logreseq(f, tcb->reseq, tcb->rcv.nxt);
3108                 tpriv->stats[ReseqBytelim]++;
3109                 return dumpreseq(tcb);
3110         }
3111         qmax = tcb->window / tcb->mss;          /* ~190 for qscale==2, 390 for qscale=3 */
3112         if(tcb->nreseq > qmax){
3113                 netlog(f, Logtcp, "resequence queue > packets: %d %d; %d bytes\n", tcb->nreseq, qmax, tcb->reseqlen);
3114                 logreseq(f, tcb->reseq, tcb->rcv.nxt);
3115                 tpriv->stats[ReseqPktlim]++;
3116                 return dumpreseq(tcb);
3117         }
3118
3119         return 0;
3120 }
3121
3122 static void
3123 getreseq(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
3124 {
3125         Reseq *rp;
3126
3127         rp = tcb->reseq;
3128         if(rp == nil)
3129                 return;
3130
3131         tcb->reseq = rp->next;
3132
3133         *seg = rp->seg;
3134         *bp = rp->bp;
3135         *length = rp->length;
3136
3137         tcb->nreseq--;
3138         tcb->reseqlen -= rp->length;
3139
3140         free(rp);
3141 }
3142
3143 static int
3144 tcptrim(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
3145 {
3146         ushort len;
3147         uchar accept;
3148         int dupcnt, excess;
3149
3150         accept = 0;
3151         len = *length;
3152         if(seg->flags & SYN)
3153                 len++;
3154         if(seg->flags & FIN)
3155                 len++;
3156
3157         if(tcb->rcv.wnd == 0) {
3158                 if(len == 0 && seg->seq == tcb->rcv.nxt)
3159                         return 0;
3160         }
3161         else {
3162                 /* Some part of the segment should be in the window */
3163                 if(inwindow(tcb,seg->seq))
3164                         accept++;
3165                 else
3166                 if(len != 0) {
3167                         if(inwindow(tcb, seg->seq+len-1) ||
3168                         seq_within(tcb->rcv.nxt, seg->seq,seg->seq+len-1))
3169                                 accept++;
3170                 }
3171         }
3172         if(!accept) {
3173                 freeblist(*bp);
3174                 return -1;
3175         }
3176         dupcnt = tcb->rcv.nxt - seg->seq;
3177         if(dupcnt > 0){
3178                 tcb->rerecv += dupcnt;
3179                 if(seg->flags & SYN){
3180                         seg->flags &= ~SYN;
3181                         seg->seq++;
3182
3183                         if(seg->urg > 1)
3184                                 seg->urg--;
3185                         else
3186                                 seg->flags &= ~URG;
3187                         dupcnt--;
3188                 }
3189                 if(dupcnt > 0){
3190                         pullblock(bp, (ushort)dupcnt);
3191                         seg->seq += dupcnt;
3192                         *length -= dupcnt;
3193
3194                         if(seg->urg > dupcnt)
3195                                 seg->urg -= dupcnt;
3196                         else {
3197                                 seg->flags &= ~URG;
3198                                 seg->urg = 0;
3199                         }
3200                 }
3201         }
3202         excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
3203         if(excess > 0) {
3204                 tcb->rerecv += excess;
3205                 *length -= excess;
3206                 *bp = trimblock(*bp, 0, *length);
3207                 if(*bp == nil)
3208                         panic("presotto is a boofhead");
3209                 seg->flags &= ~FIN;
3210         }
3211         return 0;
3212 }
3213
3214 static void
3215 tcpadvise(Proto *tcp, Block *bp, char *msg)
3216 {
3217         Tcp4hdr *h4;
3218         Tcp6hdr *h6;
3219         Tcpctl *tcb;
3220         uchar source[IPaddrlen];
3221         uchar dest[IPaddrlen];
3222         ushort psource, pdest;
3223         Conv *s, **p;
3224
3225         h4 = (Tcp4hdr*)(bp->rp);
3226         h6 = (Tcp6hdr*)(bp->rp);
3227
3228         if((h4->vihl&0xF0)==IP_VER4) {
3229                 v4tov6(dest, h4->tcpdst);
3230                 v4tov6(source, h4->tcpsrc);
3231                 psource = nhgets(h4->tcpsport);
3232                 pdest = nhgets(h4->tcpdport);
3233         }
3234         else {
3235                 ipmove(dest, h6->tcpdst);
3236                 ipmove(source, h6->tcpsrc);
3237                 psource = nhgets(h6->tcpsport);
3238                 pdest = nhgets(h6->tcpdport);
3239         }
3240
3241         /* Look for a connection */
3242         qlock(tcp);
3243         for(p = tcp->conv; *p; p++) {
3244                 s = *p;
3245                 tcb = (Tcpctl*)s->ptcl;
3246                 if(s->rport == pdest)
3247                 if(s->lport == psource)
3248                 if(tcb->state != Closed)
3249                 if(ipcmp(s->raddr, dest) == 0)
3250                 if(ipcmp(s->laddr, source) == 0){
3251                         if(s->ignoreadvice)
3252                                 break;
3253                         qlock(s);
3254                         qunlock(tcp);
3255                         switch(tcb->state){
3256                         case Syn_sent:
3257                                 localclose(s, msg);
3258                                 break;
3259                         }
3260                         qunlock(s);
3261                         freeblist(bp);
3262                         return;
3263                 }
3264         }
3265         qunlock(tcp);
3266         freeblist(bp);
3267 }
3268
3269 static char*
3270 tcpporthogdefensectl(char *val)
3271 {
3272         if(strcmp(val, "on") == 0)
3273                 tcpporthogdefense = 1;
3274         else if(strcmp(val, "off") == 0)
3275                 tcpporthogdefense = 0;
3276         else
3277                 return "unknown value for tcpporthogdefense";
3278         return nil;
3279 }
3280
3281 /* called with c qlocked */
3282 static char*
3283 tcpctl(Conv* c, char** f, int n)
3284 {
3285         if(n == 1 && strcmp(f[0], "close") == 0)
3286                 return tcpclose(c), nil;
3287         if(n == 1 && strcmp(f[0], "hangup") == 0)
3288                 return tcphangup(c);
3289         if(n >= 1 && strcmp(f[0], "keepalive") == 0)
3290                 return tcpstartka(c, f, n);
3291         if(n >= 1 && strcmp(f[0], "checksum") == 0)
3292                 return tcpsetchecksum(c, f, n);
3293         if(n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0)
3294                 return tcpporthogdefensectl(f[1]);
3295         return "unknown control request";
3296 }
3297
3298 static int
3299 tcpstats(Proto *tcp, char *buf, int len)
3300 {
3301         Tcppriv *priv;
3302         char *p, *e;
3303         int i;
3304
3305         priv = tcp->priv;
3306         p = buf;
3307         e = p+len;
3308         for(i = 0; i < Nstats; i++)
3309                 p = seprint(p, e, "%s: %llud\n", statnames[i], priv->stats[i]);
3310         return p - buf;
3311 }
3312
3313 /*
3314  *  garbage collect any stale conversations:
3315  *      - SYN received but no SYN-ACK after 5 seconds (could be the SYN attack)
3316  *      - Finwait2 after 5 minutes
3317  *
3318  *  this is called whenever we run out of channels.  Both checks are
3319  *  of questionable validity so we try to use them only when we're
3320  *  up against the wall.
3321  */
3322 static int
3323 tcpgc(Proto *tcp)
3324 {
3325         Conv *c, **pp, **ep;
3326         int n;
3327         Tcpctl *tcb;
3328
3329
3330         n = 0;
3331         ep = &tcp->conv[tcp->nc];
3332         for(pp = tcp->conv; pp < ep; pp++) {
3333                 c = *pp;
3334                 if(c == nil)
3335                         break;
3336                 if(!canqlock(c))
3337                         continue;
3338                 tcb = (Tcpctl*)c->ptcl;
3339                 switch(tcb->state){
3340                 case Syn_received:
3341                         if(NOW - tcb->time > 5000){
3342                                 localclose(c, Etimedout);
3343                                 n++;
3344                         }
3345                         break;
3346                 case Finwait2:
3347                         if(NOW - tcb->time > 5*60*1000){
3348                                 localclose(c, Etimedout);
3349                                 n++;
3350                         }
3351                         break;
3352                 }
3353                 qunlock(c);
3354         }
3355         return n;
3356 }
3357
3358 static void
3359 tcpsettimer(Tcpctl *tcb)
3360 {
3361         int x;
3362
3363         /* round trip dependency */
3364         x = backoff(tcb->backoff) *
3365                 (tcb->mdev + (tcb->srtt>>LOGAGAIN) + MSPTICK) / MSPTICK;
3366
3367         /* bounded twixt 0.3 and 64 seconds */
3368         if(x < 300/MSPTICK)
3369                 x = 300/MSPTICK;
3370         else if(x > (64000/MSPTICK))
3371                 x = 64000/MSPTICK;
3372         tcb->timer.start = x;
3373 }
3374
3375 void
3376 tcpinit(Fs *fs)
3377 {
3378         Proto *tcp;
3379         Tcppriv *tpriv;
3380
3381         tcp = smalloc(sizeof(Proto));
3382         tpriv = tcp->priv = smalloc(sizeof(Tcppriv));
3383         tcp->name = "tcp";
3384         tcp->connect = tcpconnect;
3385         tcp->announce = tcpannounce;
3386         tcp->ctl = tcpctl;
3387         tcp->state = tcpstate;
3388         tcp->create = tcpcreate;
3389         tcp->close = tcpclose;
3390         tcp->rcv = tcpiput;
3391         tcp->advise = tcpadvise;
3392         tcp->stats = tcpstats;
3393         tcp->inuse = tcpinuse;
3394         tcp->gc = tcpgc;
3395         tcp->ipproto = IP_TCPPROTO;
3396         tcp->nc = scalednconv();
3397         tcp->ptclsize = sizeof(Tcpctl);
3398         tpriv->stats[MaxConn] = tcp->nc;
3399
3400         Fsproto(fs, tcp);
3401 }
3402
3403 static void
3404 tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale)
3405 {
3406         /*
3407          * guess at reasonable queue sizes.  there's no current way 
3408          * to know how many nic receive buffers we can safely tie up in the
3409          * tcp stack, and we don't adjust our queues to maximize throughput
3410          * and minimize bufferbloat.  n.b. the offer (rcvscale) needs to be
3411          * respected, but we still control our own buffer commitment by
3412          * keeping a seperate qscale.
3413          */
3414         tcb->rcv.scale = rcvscale & 0xff;
3415         tcb->snd.scale = sndscale & 0xff;
3416         tcb->qscale = rcvscale & 0xff;
3417         if(rcvscale > Maxqscale)
3418                 tcb->qscale = Maxqscale;
3419
3420         if(rcvscale != tcb->rcv.scale)
3421                 netlog(s->p->f, Logtcp, "tcpsetscale: window %lud qlen %d >> window %ud lport %d\n",
3422                         tcb->window, qlen(s->rq), QMAX<<tcb->qscale, s->lport);
3423         tcb->window = QMAX<<tcb->qscale;
3424         tcb->ssthresh = tcb->window;
3425
3426         /*
3427          * it's important to set wq large enough to cover the full
3428          * bandwidth-delay product.  it's possible to be in loss
3429          * recovery with a big window, and we need to keep sending
3430          * into the inflated window.  the difference can be huge
3431          * for even modest (70ms) ping times.
3432          */
3433         qsetlimit(s->rq, QMAX<<tcb->qscale);
3434         qsetlimit(s->wq, QMAX<<tcb->qscale);
3435         tcprcvwin(s);
3436 }