2 #include "../port/lib.h"
6 #include "../port/error.h"
19 TCP4_PKT = TCP4_IPLEN+TCP4_PHDRSIZE,
25 TCP6_PKT = TCP6_IPLEN+TCP6_PHDRSIZE,
30 MAX_TIME = (1<<20), /* Forever */
31 TCP_ACK = 50, /* Timed ack sequence in ms */
32 MAXBACKMS = 9*60*1000, /* longest backoff time (ms) before hangup */
34 URG = 0x20, /* Data marked urgent */
35 ACK = 0x10, /* Acknowledge is valid */
36 PSH = 0x08, /* Whole data pipe is pushed */
37 RST = 0x04, /* Reset connection */
38 SYN = 0x02, /* Pkt. is synchronise */
39 FIN = 0x01, /* Start close down */
44 MSS_LENGTH = 4, /* Maximum segment size */
46 WS_LENGTH = 3, /* Bits to scale window size by */
48 MSPTICK = 50, /* Milliseconds per timer tick */
49 DEF_MSS = 1460, /* Default maximum segment */
50 DEF_MSS6 = 1220, /* Default maximum segment (min) for v6 */
51 DEF_RTT = 500, /* Default round trip */
52 DEF_KAT = 120000, /* Default time (ms) between keep alives */
53 TCP_LISTEN = 0, /* Listen connection */
54 TCP_CONNECT = 1, /* Outgoing connection */
55 SYNACK_RXTIMER = 250, /* ms between SYNACK retransmits */
57 TCPREXMTTHRESH = 3, /* dupack threshhold for rxt */
68 Closed = 0, /* Connection states */
80 Maxlimbo = 1000, /* maximum procs waiting for response to SYN ACK */
81 NLHT = 256, /* hash table size, must be a power of 2 */
86 * these factors determine the ultimate bandwidth-delay product.
87 * 64kb · 2⁵ = 2mb, or 2x overkill for 100mbps · 70ms.
89 Maxqscale = 4, /* maximum queuing scale */
90 Defadvscale = 4, /* default advertisement */
93 /* Must correspond to the enumeration above */
96 "Closed", "Listen", "Syn_sent", "Syn_received",
97 "Established", "Finwait1", "Finwait2", "Close_wait",
98 "Closing", "Last_ack", "Time_wait"
101 typedef struct Tcptimer Tcptimer;
115 * v4 and v6 pseudo headers used for
118 typedef struct Tcp4hdr Tcp4hdr;
121 uchar vihl; /* Version and header length */
122 uchar tos; /* Type of service */
123 uchar length[2]; /* packet length */
124 uchar id[2]; /* Identification */
125 uchar frag[2]; /* Fragment information */
139 /* Options segment */
143 typedef struct Tcp6hdr Tcp6hdr;
150 uchar tcpsrc[IPaddrlen];
151 uchar tcpdst[IPaddrlen];
160 /* Options segment */
165 * this represents the control info
166 * for a single packet. It is derived from
167 * a packet in ntohtcp{4,6}() and stuck into
168 * a packet in htontcp{4,6}().
170 typedef struct Tcp Tcp;
179 ushort ws; /* window scale option */
180 ulong wnd; /* prescaled window*/
182 ushort mss; /* max segment size option (if not zero) */
183 ushort len; /* size of data */
187 * this header is malloc'd to thread together fragments
188 * waiting to be coalesced
190 typedef struct Reseq Reseq;
200 * the qlock in the Conv locks this structure
202 typedef struct Tcpctl Tcpctl;
205 uchar state; /* Connection state */
206 uchar type; /* Listening or active connection */
207 uchar code; /* Icmp code */
209 ulong una; /* Unacked data pointer */
210 ulong nxt; /* Next sequence expected */
211 ulong ptr; /* Data pointer */
212 ulong wnd; /* Tcp send window */
213 ulong urg; /* Urgent data pointer */
215 uint scale; /* how much to right shift window in xmitted packets */
216 /* to implement tahoe and reno TCP */
217 ulong dupacks; /* number of duplicate acks rcvd */
219 int recovery; /* loss recovery flag */
220 int retransmit; /* retransmit 1 packet @ una flag */
222 ulong rxt; /* right window marker for recovery "recover" rfc3782 */
225 ulong nxt; /* Receive pointer to next uchar slot */
226 ulong wnd; /* Receive window incoming */
227 ulong wsnt; /* Last wptr sent. important to track for large bdp */
229 ulong urg; /* Urgent pointer */
230 ulong ackptr; /* last acked sequence */
232 uint scale; /* how much to left shift window in rcv'd packets */
234 ulong iss; /* Initial sequence number */
235 ulong cwind; /* Congestion window */
236 ulong abcbytes; /* appropriate byte counting rfc 3465 */
237 uint scale; /* desired snd.scale */
238 ulong ssthresh; /* Slow start threshold */
239 int resent; /* Bytes just resent */
240 int irs; /* Initial received squence */
241 ushort mss; /* Maximum segment size */
242 int rerecv; /* Overlap of data rerecevived */
243 ulong window; /* Our receive window (queue) */
244 uint qscale; /* Log2 of our receive window (queue) */
245 uchar backoff; /* Exponential backoff counter */
246 int backedoff; /* ms we've backed off for rexmits */
247 uchar flags; /* State flags */
248 Reseq *reseq; /* Resequencing queue */
251 Tcptimer timer; /* Activity timer */
252 Tcptimer acktimer; /* Acknowledge timer */
253 Tcptimer rtt_timer; /* Round trip timer */
254 Tcptimer katimer; /* keep alive timer */
255 ulong rttseq; /* Round trip sequence */
256 int srtt; /* Smoothed round trip */
257 int mdev; /* Mean deviation of round trip */
258 int kacounter; /* count down for keep alive */
259 uint sndsyntime; /* time syn sent */
260 ulong time; /* time Finwait2 or Syn_received was sent */
261 ulong timeuna; /* snd.una when time was set */
262 int nochecksum; /* non-zero means don't send checksums */
263 int flgcnt; /* number of flags in the sequence (FIN,SEQ) */
268 } protohdr; /* prototype header */
272 * New calls are put in limbo rather than having a conversation structure
273 * allocated. Thus, a SYN attack results in lots of limbo'd calls but not
274 * any real Conv structures mucking things up. Calls in limbo rexmit their
275 * SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
277 * In particular they aren't on a listener's queue so that they don't figure
278 * in the input queue limit.
280 * If 1/2 of a T3 was attacking SYN packets, we'd have a permanent queue
281 * of 70000 limbo'd calls. Not great for a linear list but doable. Therefore
282 * there is no hashing of this list.
284 typedef struct Limbo Limbo;
289 uchar laddr[IPaddrlen];
290 uchar raddr[IPaddrlen];
293 ulong irs; /* initial received sequence */
294 ulong iss; /* initial sent sequence */
295 ushort mss; /* mss from the other end */
296 ushort rcvscale; /* how much to scale rcvd windows */
297 ushort sndscale; /* how much to scale sent windows */
298 ulong lastsend; /* last time we sent a synack */
299 uchar version; /* v4 or v6 */
300 uchar rexmits; /* number of retransmissions */
303 int tcp_irtt = DEF_RTT; /* Initial guess at round trip time */
342 static char *statnames[Nstats] =
346 [ActiveOpens] "ActiveOpens",
347 [PassiveOpens] "PassiveOpens",
348 [EstabResets] "EstabResets",
349 [CurrEstab] "CurrEstab",
352 [RetransSegs] "RetransSegs",
353 [RetransSegsSent] "RetransSegsSent",
354 [RetransTimeouts] "RetransTimeouts",
357 [CsumErrs] "CsumErrs",
358 [HlenErrs] "HlenErrs",
360 [OutOfOrder] "OutOfOrder",
361 [Resequenced] "Resequenced",
362 [ReseqBytelim] "ReseqBytelim",
363 [ReseqPktlim] "ReseqPktlim",
364 [Delayack] "Delayack",
365 [Wopenack] "Wopenack",
367 [Recovery] "Recovery",
368 [RecoveryDone] "RecoveryDone",
369 [RecoveryRTO] "RecoveryRTO",
371 [RecoveryNoSeq] "RecoveryNoSeq",
372 [RecoveryCwind] "RecoveryCwind",
373 [RecoveryPA] "RecoveryPA",
376 typedef struct Tcppriv Tcppriv;
379 /* List of active timers */
383 /* hash table for matching conversations */
386 /* calls in limbo waiting for an ACK to our SYN ACK */
390 /* for keeping track of tcpackproc */
394 uvlong stats[Nstats];
398 * Setting tcpporthogdefense to non-zero enables Dong Lin's
399 * solution to hijacked systems staking out ports as a form
402 * To avoid stateless Conv hogs, we pick a sequence number at random. If
403 * that number gets acked by the other end, we shut down the connection.
404 * Look for tcpporthogdefense in the code.
406 int tcpporthogdefense = 0;
408 static int addreseq(Fs*, Tcpctl*, Tcppriv*, Tcp*, Block*, ushort);
409 static int dumpreseq(Tcpctl*);
410 static void getreseq(Tcpctl*, Tcp*, Block**, ushort*);
411 static void limbo(Conv*, uchar*, uchar*, Tcp*, int);
412 static void limborexmit(Proto*);
413 static void localclose(Conv*, char*);
414 static void procsyn(Conv*, Tcp*);
415 static void tcpacktimer(void*);
416 static void tcpiput(Proto*, Ipifc*, Block*);
417 static void tcpkeepalive(void*);
418 static void tcpoutput(Conv*);
419 static void tcprcvwin(Conv*);
420 static void tcprxmit(Conv*);
421 static void tcpsetkacounter(Tcpctl*);
422 static void tcpsetscale(Conv*, Tcpctl*, ushort, ushort);
423 static void tcpsettimer(Tcpctl*);
424 static void tcpsndsyn(Conv*, Tcpctl*);
425 static void tcpstart(Conv*, int);
426 static void tcpsynackrtt(Conv*);
427 static void tcptimeout(void*);
428 static int tcptrim(Tcpctl*, Tcp*, Block**, ushort*);
431 tcpsetstate(Conv *s, uchar newstate)
439 tcb = (Tcpctl*)s->ptcl;
441 oldstate = tcb->state;
442 if(oldstate == newstate)
445 if(oldstate == Established)
446 tpriv->stats[CurrEstab]--;
447 if(newstate == Established)
448 tpriv->stats[CurrEstab]++;
457 case Close_wait: /* Remote closes */
462 tcb->state = newstate;
464 if(oldstate == Syn_sent && newstate != Closed)
469 tcpconnect(Conv *c, char **argv, int argc)
474 tcb = (Tcpctl*)(c->ptcl);
475 if(tcb->state != Closed)
478 e = Fsstdconnect(c, argv, argc);
481 tcpstart(c, TCP_CONNECT);
487 tcpstate(Conv *c, char *state, int n)
491 s = (Tcpctl*)(c->ptcl);
493 return snprint(state, n,
494 "%s qin %d qout %d rq %d.%d srtt %d mdev %d sst %lud cwin %lud swin %lud>>%d rwin %lud>>%d qscale %d timer.start %d timer.count %d rerecv %d katimer.start %d katimer.count %d\n",
496 c->rq ? qlen(c->rq) : 0,
497 c->wq ? qlen(c->wq) : 0,
498 s->nreseq, s->reseqlen,
499 s->srtt, s->mdev, s->ssthresh,
500 s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd, s->snd.scale,
502 s->timer.start, s->timer.count, s->rerecv,
503 s->katimer.start, s->katimer.count);
511 s = (Tcpctl*)(c->ptcl);
512 return s->state != Closed;
516 tcpannounce(Conv *c, char **argv, int argc)
521 tcb = (Tcpctl*)(c->ptcl);
522 if(tcb->state != Closed)
525 e = Fsstdannounce(c, argv, argc);
528 tcpstart(c, TCP_LISTEN);
535 * tcpclose is always called with the q locked
542 tcb = (Tcpctl*)c->ptcl;
552 * reset any incoming calls to this listener
554 Fsconnected(c, "Hangup");
566 tcpsetstate(c, Finwait1);
572 tcpsetstate(c, Last_ack);
584 tcb = (Tcpctl*)s->ptcl;
603 localclose(s, "Hangup");
611 static int seq_lt(ulong, ulong);
614 tcprcvwin(Conv *s) /* Call with tcb locked */
619 tcb = (Tcpctl*)s->ptcl;
620 w = tcb->window - qlen(s->rq);
623 /* RFC 1122 § 4.2.2.17 do not move right edge of window left */
624 if(seq_lt(tcb->rcv.nxt + w, tcb->rcv.wptr))
625 w = tcb->rcv.wptr - tcb->rcv.nxt;
626 if(w != tcb->rcv.wnd)
627 if(w>>tcb->rcv.scale == 0 || tcb->window > 4*tcb->mss && w < tcb->mss/4){
628 tcb->rcv.blocked = 1;
629 netlog(s->p->f, Logtcp, "tcprcvwin: window %lud qlen %d ws %ud lport %d\n",
630 tcb->window, qlen(s->rq), tcb->rcv.scale, s->lport);
633 tcb->rcv.wptr = tcb->rcv.nxt + w;
643 tcb = (Tcpctl*)s->ptcl;
650 if(tcb->state != Closed){
659 tcpcongestion(Tcpctl *tcb)
663 inflight = tcb->snd.nxt - tcb->snd.una;
664 if(inflight > tcb->cwind)
665 inflight = tcb->cwind;
666 tcb->ssthresh = inflight / 2;
667 if(tcb->ssthresh < 2*tcb->mss)
668 tcb->ssthresh = 2*tcb->mss;
672 L = 2, /* aggressive slow start; legal values ∈ (1.0, 2.0) */
676 tcpabcincr(Tcpctl *tcb, uint acked)
680 tcb->abcbytes += acked;
681 if(tcb->cwind < tcb->ssthresh){
687 tcb->cwind += MIN(tcb->abcbytes, limit);
693 if(tcb->abcbytes >= tcb->cwind){
694 tcb->abcbytes -= tcb->cwind;
695 tcb->cwind += tcb->mss;
703 c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c);
704 c->wq = qopen(QMAX, Qkick, tcpkick, c);
708 timerstate(Tcppriv *priv, Tcptimer *t, int newstate)
710 if(newstate != TcptimerON){
711 if(t->state == TcptimerON){
713 if(priv->timers == t){
714 priv->timers = t->next;
716 panic("timerstate1");
719 t->next->prev = t->prev;
721 t->prev->next = t->next;
722 t->next = t->prev = nil;
725 if(t->state != TcptimerON){
727 if(t->prev != nil || t->next != nil)
728 panic("timerstate2");
730 t->next = priv->timers;
742 Tcptimer *t, *tp, *timeo;
754 tsleep(&up->sleep, return0, 0, MSPTICK);
759 for(t = priv->timers; t != nil; t = tp) {
761 panic("tcpackproc1");
763 if(t->state == TcptimerON) {
766 timerstate(priv, t, TcptimerDONE);
767 t->readynext = timeo;
775 for(t = timeo; t != nil; t = t->readynext) {
777 panic("tcpackproc2");
778 if(t->state == TcptimerDONE && t->func != nil && !waserror()){
789 tcpgo(Tcppriv *priv, Tcptimer *t)
791 if(t == nil || t->start == 0)
796 timerstate(priv, t, TcptimerON);
801 tcphalt(Tcppriv *priv, Tcptimer *t)
807 timerstate(priv, t, TcptimerOFF);
818 localclose(Conv *s, char *reason) /* called with tcb locked */
824 tcb = (Tcpctl*)s->ptcl;
826 iphtrem(&tpriv->ht, s);
828 tcphalt(tpriv, &tcb->timer);
829 tcphalt(tpriv, &tcb->rtt_timer);
830 tcphalt(tpriv, &tcb->acktimer);
831 tcphalt(tpriv, &tcb->katimer);
833 /* Flush reassembly queue; nothing more can arrive */
836 if(tcb->state == Syn_sent)
837 Fsconnected(s, reason);
838 if(s->state == Announced)
841 qhangup(s->rq, reason);
842 qhangup(s->wq, reason);
844 tcpsetstate(s, Closed);
847 /* mtu (- TCP + IP hdr len) of 1st hop */
849 tcpmtu(Route *r, int version, uint *scale)
855 * set the ws. it doesn't commit us to anything.
856 * ws is the ultimate limit to the bandwidth-delay product.
858 *scale = Defadvscale;
861 * currently we do not implement path MTU discovery
862 * so use interface MTU *only* if directly reachable
863 * or when we use V4 which allows routers to fragment.
864 * otherwise, we use the default MSS which assumes a
865 * safe minimum MTU of 1280 bytes for V6.
869 mtu = ifc->maxtu - ifc->m->hsize;
871 return mtu - (TCP4_PKT + TCP4_HDRSIZE);
872 mtu -= TCP6_PKT + TCP6_HDRSIZE;
873 if((r->type & (Rifc|Runi)) != 0 || mtu <= DEF_MSS6)
883 inittcpctl(Conv *s, int mode)
891 tcb = (Tcpctl*)s->ptcl;
893 memset(tcb, 0, sizeof(Tcpctl));
895 tcb->ssthresh = QMAX; /* reset by tcpsetscale() */
896 tcb->srtt = tcp_irtt<<LOGAGAIN;
900 tcb->timer.start = tcp_irtt / MSPTICK;
901 tcb->timer.func = tcptimeout;
903 tcb->rtt_timer.start = MAX_TIME;
904 tcb->acktimer.start = TCP_ACK / MSPTICK;
905 tcb->acktimer.func = tcpacktimer;
906 tcb->acktimer.arg = s;
907 tcb->katimer.start = DEF_KAT / MSPTICK;
908 tcb->katimer.func = tcpkeepalive;
909 tcb->katimer.arg = s;
913 /* create a prototype(pseudo) header */
914 if(mode != TCP_LISTEN){
915 if(ipcmp(s->laddr, IPnoaddr) == 0)
916 findlocalip(s->p->f, s->laddr, s->raddr);
918 switch(s->ipversion){
920 h4 = &tcb->protohdr.tcp4hdr;
921 memset(h4, 0, sizeof(*h4));
922 h4->proto = IP_TCPPROTO;
923 hnputs(h4->tcpsport, s->lport);
924 hnputs(h4->tcpdport, s->rport);
925 v6tov4(h4->tcpsrc, s->laddr);
926 v6tov4(h4->tcpdst, s->raddr);
929 h6 = &tcb->protohdr.tcp6hdr;
930 memset(h6, 0, sizeof(*h6));
931 h6->proto = IP_TCPPROTO;
932 hnputs(h6->tcpsport, s->lport);
933 hnputs(h6->tcpdport, s->rport);
934 ipmove(h6->tcpsrc, s->laddr);
935 ipmove(h6->tcpdst, s->raddr);
939 panic("inittcpctl: version %d", s->ipversion);
943 tcb->mss = tcb->cwind = mss;
946 tpriv->stats[Mss] = tcb->mss;
948 /* default is no window scaling */
949 tcpsetscale(s, tcb, 0, 0);
953 * called with s qlocked
956 tcpstart(Conv *s, int mode)
960 char kpname[KNAMELEN];
964 if(tpriv->ackprocstarted == 0){
966 if(tpriv->ackprocstarted == 0){
967 snprint(kpname, sizeof(kpname), "#I%dtcpack", s->p->f->dev);
968 kproc(kpname, tcpackproc, s->p);
969 tpriv->ackprocstarted = 1;
971 qunlock(&tpriv->apl);
974 tcb = (Tcpctl*)s->ptcl;
978 iphtadd(&tpriv->ht, s);
981 tpriv->stats[PassiveOpens]++;
983 tcpsetstate(s, Listen);
987 tpriv->stats[ActiveOpens]++;
988 tcb->flags |= ACTIVE;
990 tcpsetstate(s, Syn_sent);
997 tcpflag(char *buf, char *e, ushort flag)
1001 p = seprint(buf, e, "%d", flag>>10); /* Head len */
1003 p = seprint(p, e, " URG");
1005 p = seprint(p, e, " ACK");
1007 p = seprint(p, e, " PSH");
1009 p = seprint(p, e, " RST");
1011 p = seprint(p, e, " SYN");
1013 p = seprint(p, e, " FIN");
1019 htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph, Tcpctl *tcb)
1024 ushort hdrlen, optpad = 0;
1027 hdrlen = TCP6_HDRSIZE;
1028 if(tcph->flags & SYN){
1030 hdrlen += MSS_LENGTH;
1032 hdrlen += WS_LENGTH;
1033 optpad = hdrlen & 3;
1035 optpad = 4 - optpad;
1040 dlen = blocklen(data);
1041 data = padblock(data, hdrlen + TCP6_PKT);
1045 data = allocb(hdrlen + TCP6_PKT + 64); /* the 64 pad is to meet mintu's */
1046 data->wp += hdrlen + TCP6_PKT;
1049 /* copy in pseudo ip header plus port numbers */
1050 h = (Tcp6hdr *)(data->rp);
1051 memmove(h, ph, TCP6_TCBPHDRSZ);
1053 /* compose pseudo tcp header, do cksum calculation */
1054 hnputl(h->vcf, hdrlen + dlen);
1055 h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
1058 /* copy in variable bits */
1059 hnputl(h->tcpseq, tcph->seq);
1060 hnputl(h->tcpack, tcph->ack);
1061 hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
1062 hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
1063 hnputs(h->tcpurg, tcph->urg);
1065 if(tcph->flags & SYN){
1069 *opt++ = MSS_LENGTH;
1070 hnputs(opt, tcph->mss);
1082 if(tcb != nil && tcb->nochecksum){
1083 h->tcpcksum[0] = h->tcpcksum[1] = 0;
1085 csum = ptclcsum(data, TCP6_IPLEN, hdrlen+dlen+TCP6_PHDRSIZE);
1086 hnputs(h->tcpcksum, csum);
1089 /* move from pseudo header back to normal ip header */
1090 memset(h->vcf, 0, 4);
1091 h->vcf[0] = IP_VER6;
1092 hnputs(h->ploadlen, hdrlen+dlen);
1093 h->proto = ph->proto;
1099 htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph, Tcpctl *tcb)
1104 ushort hdrlen, optpad = 0;
1107 hdrlen = TCP4_HDRSIZE;
1108 if(tcph->flags & SYN){
1110 hdrlen += MSS_LENGTH;
1112 hdrlen += WS_LENGTH;
1113 optpad = hdrlen & 3;
1115 optpad = 4 - optpad;
1120 dlen = blocklen(data);
1121 data = padblock(data, hdrlen + TCP4_PKT);
1125 data = allocb(hdrlen + TCP4_PKT + 64); /* the 64 pad is to meet mintu's */
1126 data->wp += hdrlen + TCP4_PKT;
1129 /* copy in pseudo ip header plus port numbers */
1130 h = (Tcp4hdr *)(data->rp);
1131 memmove(h, ph, TCP4_TCBPHDRSZ);
1133 /* copy in variable bits */
1134 hnputs(h->tcplen, hdrlen + dlen);
1135 hnputl(h->tcpseq, tcph->seq);
1136 hnputl(h->tcpack, tcph->ack);
1137 hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
1138 hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
1139 hnputs(h->tcpurg, tcph->urg);
1141 if(tcph->flags & SYN){
1145 *opt++ = MSS_LENGTH;
1146 hnputs(opt, tcph->mss);
1149 /* always offer. rfc1323 §2.2 */
1159 if(tcb != nil && tcb->nochecksum){
1160 h->tcpcksum[0] = h->tcpcksum[1] = 0;
1162 csum = ptclcsum(data, TCP4_IPLEN, hdrlen+dlen+TCP4_PHDRSIZE);
1163 hnputs(h->tcpcksum, csum);
1170 ntohtcp6(Tcp *tcph, Block **bpp)
1178 *bpp = pullupblock(*bpp, TCP6_PKT+TCP6_HDRSIZE);
1182 h = (Tcp6hdr *)((*bpp)->rp);
1183 tcph->source = nhgets(h->tcpsport);
1184 tcph->dest = nhgets(h->tcpdport);
1185 tcph->seq = nhgetl(h->tcpseq);
1186 tcph->ack = nhgetl(h->tcpack);
1187 hdrlen = (h->tcpflag[0]>>2) & ~3;
1188 if(hdrlen < TCP6_HDRSIZE) {
1193 tcph->flags = h->tcpflag[1];
1194 tcph->wnd = nhgets(h->tcpwin);
1195 tcph->urg = nhgets(h->tcpurg);
1199 tcph->len = nhgets(h->ploadlen) - hdrlen;
1201 *bpp = pullupblock(*bpp, hdrlen+TCP6_PKT);
1206 n = hdrlen - TCP6_HDRSIZE;
1207 while(n > 0 && *optr != EOLOPT) {
1208 if(*optr == NOOPOPT) {
1214 if(optlen < 2 || optlen > n)
1218 if(optlen == MSS_LENGTH)
1219 tcph->mss = nhgets(optr+2);
1222 if(optlen == WS_LENGTH && *(optr+2) <= 14)
1223 tcph->ws = *(optr+2);
1233 ntohtcp4(Tcp *tcph, Block **bpp)
1241 *bpp = pullupblock(*bpp, TCP4_PKT+TCP4_HDRSIZE);
1245 h = (Tcp4hdr *)((*bpp)->rp);
1246 tcph->source = nhgets(h->tcpsport);
1247 tcph->dest = nhgets(h->tcpdport);
1248 tcph->seq = nhgetl(h->tcpseq);
1249 tcph->ack = nhgetl(h->tcpack);
1251 hdrlen = (h->tcpflag[0]>>2) & ~3;
1252 if(hdrlen < TCP4_HDRSIZE) {
1257 tcph->flags = h->tcpflag[1];
1258 tcph->wnd = nhgets(h->tcpwin);
1259 tcph->urg = nhgets(h->tcpurg);
1263 tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1265 *bpp = pullupblock(*bpp, hdrlen+TCP4_PKT);
1270 n = hdrlen - TCP4_HDRSIZE;
1271 while(n > 0 && *optr != EOLOPT) {
1272 if(*optr == NOOPOPT) {
1278 if(optlen < 2 || optlen > n)
1282 if(optlen == MSS_LENGTH)
1283 tcph->mss = nhgets(optr+2);
1286 if(optlen == WS_LENGTH && *(optr+2) <= 14)
1287 tcph->ws = *(optr+2);
1297 * For outgoing calls, generate an initial sequence
1298 * number and put a SYN on the send queue
1301 tcpsndsyn(Conv *s, Tcpctl *tcb)
1305 tcb->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1306 tcb->rttseq = tcb->iss;
1307 tcb->snd.wl2 = tcb->iss;
1308 tcb->snd.una = tcb->iss;
1309 tcb->snd.rxt = tcb->iss;
1310 tcb->snd.ptr = tcb->rttseq;
1311 tcb->snd.nxt = tcb->rttseq;
1313 tcb->flags |= FORCE;
1314 tcb->sndsyntime = NOW;
1316 /* set desired mss and scale */
1317 tcb->mss = tcpmtu(v6lookup(s->p->f, s->raddr, s), s->ipversion, &tcb->scale);
1319 tpriv->stats[Mss] = tcb->mss;
1323 sndrst(Proto *tcp, uchar *source, uchar *dest, ushort length, Tcp *seg, uchar version, char *reason)
1331 netlog(tcp->f, Logtcp, "sndrst: %s\n", reason);
1335 if(seg->flags & RST)
1338 /* make pseudo header */
1341 memset(&ph4, 0, sizeof(ph4));
1343 v6tov4(ph4.tcpsrc, dest);
1344 v6tov4(ph4.tcpdst, source);
1345 ph4.proto = IP_TCPPROTO;
1346 hnputs(ph4.tcplen, TCP4_HDRSIZE);
1347 hnputs(ph4.tcpsport, seg->dest);
1348 hnputs(ph4.tcpdport, seg->source);
1351 memset(&ph6, 0, sizeof(ph6));
1352 ph6.vcf[0] = IP_VER6;
1353 ipmove(ph6.tcpsrc, dest);
1354 ipmove(ph6.tcpdst, source);
1355 ph6.proto = IP_TCPPROTO;
1356 hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1357 hnputs(ph6.tcpsport, seg->dest);
1358 hnputs(ph6.tcpdport, seg->source);
1361 panic("sndrst: version %d", version);
1364 tpriv->stats[OutRsts]++;
1367 /* convince the other end that this reset is in band */
1368 if(seg->flags & ACK) {
1369 seg->seq = seg->ack;
1374 seg->ack = seg->seq;
1376 if(seg->flags & SYN)
1379 if(seg->flags & FIN)
1382 seg->flags = rflags;
1389 hbp = htontcp4(seg, nil, &ph4, nil);
1392 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1395 hbp = htontcp6(seg, nil, &ph6, nil);
1398 ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1401 panic("sndrst2: version %d", version);
1406 * send a reset to the remote side and close the conversation
1407 * called with s qlocked
1416 tcb = (Tcpctl*)s->ptcl;
1418 return commonerror();
1419 if(ipcmp(s->raddr, IPnoaddr) != 0) {
1421 memset(&seg, 0, sizeof seg);
1422 seg.flags = RST | ACK;
1423 seg.ack = tcb->rcv.nxt;
1424 tcb->rcv.ackptr = seg.ack;
1425 seg.seq = tcb->snd.ptr;
1430 switch(s->ipversion) {
1432 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
1433 hbp = htontcp4(&seg, nil, &tcb->protohdr.tcp4hdr, tcb);
1434 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
1437 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
1438 hbp = htontcp6(&seg, nil, &tcb->protohdr.tcp6hdr, tcb);
1439 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
1442 panic("tcphangup: version %d", s->ipversion);
1453 * (re)send a SYN ACK
1456 sndsynack(Proto *tcp, Limbo *lp)
1464 /* make pseudo header */
1465 switch(lp->version) {
1467 memset(&ph4, 0, sizeof(ph4));
1469 v6tov4(ph4.tcpsrc, lp->laddr);
1470 v6tov4(ph4.tcpdst, lp->raddr);
1471 ph4.proto = IP_TCPPROTO;
1472 hnputs(ph4.tcplen, TCP4_HDRSIZE);
1473 hnputs(ph4.tcpsport, lp->lport);
1474 hnputs(ph4.tcpdport, lp->rport);
1477 memset(&ph6, 0, sizeof(ph6));
1478 ph6.vcf[0] = IP_VER6;
1479 ipmove(ph6.tcpsrc, lp->laddr);
1480 ipmove(ph6.tcpdst, lp->raddr);
1481 ph6.proto = IP_TCPPROTO;
1482 hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1483 hnputs(ph6.tcpsport, lp->lport);
1484 hnputs(ph6.tcpdport, lp->rport);
1487 panic("sndrst: version %d", lp->version);
1490 memset(&seg, 0, sizeof seg);
1492 seg.ack = lp->irs+1;
1493 seg.flags = SYN|ACK;
1495 seg.mss = tcpmtu(v6lookup(tcp->f, lp->raddr, nil), lp->version, &scale);
1498 /* if the other side set scale, we should too */
1501 lp->sndscale = scale;
1507 switch(lp->version) {
1509 hbp = htontcp4(&seg, nil, &ph4, nil);
1512 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1515 hbp = htontcp6(&seg, nil, &ph6, nil);
1518 ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1521 panic("sndsnack: version %d", lp->version);
1527 #define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + p )&LHTMASK )
1530 * put a call into limbo and respond with a SYN ACK
1532 * called with proto locked
1535 limbo(Conv *s, uchar *source, uchar *dest, Tcp *seg, int version)
1542 h = hashipa(source, seg->source);
1544 for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1546 if(lp->lport != seg->dest || lp->rport != seg->source || lp->version != version)
1548 if(ipcmp(lp->raddr, source) != 0)
1550 if(ipcmp(lp->laddr, dest) != 0)
1553 /* each new SYN restarts the retransmits */
1559 if(tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]){
1561 tpriv->lht[h] = lp->next;
1564 lp = malloc(sizeof(*lp));
1570 lp->version = version;
1571 ipmove(lp->laddr, dest);
1572 ipmove(lp->raddr, source);
1573 lp->lport = seg->dest;
1574 lp->rport = seg->source;
1576 lp->rcvscale = seg->ws;
1578 lp->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1581 if(sndsynack(s->p, lp) < 0){
1589 * resend SYN ACK's once every SYNACK_RXTIMER ms.
1592 limborexmit(Proto *tcp)
1606 for(h = 0; h < NLHT && seen < tpriv->nlimbo; h++){
1607 for(l = &tpriv->lht[h]; *l != nil && seen < tpriv->nlimbo; ){
1610 if(now - lp->lastsend < (lp->rexmits+1)*SYNACK_RXTIMER)
1613 /* time it out after 1 second */
1614 if(++(lp->rexmits) > 5){
1621 /* if we're being attacked, don't bother resending SYN ACK's */
1622 if(tpriv->nlimbo > 100)
1625 if(sndsynack(tcp, lp) < 0){
1639 * lookup call in limbo. if found, throw it out.
1641 * called with proto locked
1644 limborst(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1652 /* find a call in limbo */
1653 h = hashipa(src, segp->source);
1654 for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1656 if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1658 if(ipcmp(lp->laddr, dst) != 0)
1660 if(ipcmp(lp->raddr, src) != 0)
1663 /* RST can only follow the SYN */
1664 if(segp->seq == lp->irs+1){
1674 initialwindow(Tcpctl *tcb)
1676 /* RFC 3390 initial window */
1678 tcb->cwind = 4*tcb->mss;
1679 else if(tcb->mss < 2190)
1682 tcb->cwind = 2*tcb->mss;
1686 * come here when we finally get an ACK to our SYN-ACK.
1687 * lookup call in limbo. if found, create a new conversation
1689 * called with proto locked
1692 tcpincoming(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1702 /* unless it's just an ack, it can't be someone coming out of limbo */
1703 if((segp->flags & SYN) || (segp->flags & ACK) == 0)
1708 /* find a call in limbo */
1709 h = hashipa(src, segp->source);
1710 for(l = &tpriv->lht[h]; (lp = *l) != nil; l = &lp->next){
1711 netlog(s->p->f, Logtcp, "tcpincoming s %I!%ud/%I!%ud d %I!%ud/%I!%ud v %d/%d\n",
1712 src, segp->source, lp->raddr, lp->rport,
1713 dst, segp->dest, lp->laddr, lp->lport,
1714 version, lp->version
1717 if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1719 if(ipcmp(lp->laddr, dst) != 0)
1721 if(ipcmp(lp->raddr, src) != 0)
1724 /* we're assuming no data with the initial SYN */
1725 if(segp->seq != lp->irs+1 || segp->ack != lp->iss+1){
1726 netlog(s->p->f, Logtcp, "tcpincoming s %lux/%lux a %lux %lux\n",
1727 segp->seq, lp->irs+1, segp->ack, lp->iss+1);
1738 new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
1742 memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
1743 tcb = (Tcpctl*)new->ptcl;
1744 tcb->flags &= ~CLONE;
1745 tcb->timer.arg = new;
1746 tcb->timer.state = TcptimerOFF;
1747 tcb->acktimer.arg = new;
1748 tcb->acktimer.state = TcptimerOFF;
1749 tcb->katimer.arg = new;
1750 tcb->katimer.state = TcptimerOFF;
1751 tcb->rtt_timer.arg = new;
1752 tcb->rtt_timer.state = TcptimerOFF;
1755 tcb->rcv.nxt = tcb->irs+1;
1756 tcb->rcv.wptr = tcb->rcv.nxt;
1758 tcb->rcv.urg = tcb->rcv.nxt;
1761 tcb->rttseq = tcb->iss;
1762 tcb->snd.wl2 = tcb->iss;
1763 tcb->snd.una = tcb->iss+1;
1764 tcb->snd.ptr = tcb->iss+1;
1765 tcb->snd.nxt = tcb->iss+1;
1766 tcb->snd.rxt = tcb->iss+1;
1768 tcb->flags |= SYNACK;
1770 /* set desired mss and scale */
1771 tcb->mss = tcpmtu(v6lookup(s->p->f, src, s), version, &tcb->scale);
1773 /* our sending max segment size cannot be bigger than what he asked for */
1774 if(lp->mss != 0 && lp->mss < tcb->mss)
1776 tpriv->stats[Mss] = tcb->mss;
1778 /* window scaling */
1779 tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
1781 /* congestion window */
1782 tcb->snd.wnd = segp->wnd;
1785 /* set initial round trip time */
1786 tcb->sndsyntime = lp->lastsend+lp->rexmits*SYNACK_RXTIMER;
1791 /* set up proto header */
1794 h4 = &tcb->protohdr.tcp4hdr;
1795 memset(h4, 0, sizeof(*h4));
1796 h4->proto = IP_TCPPROTO;
1797 hnputs(h4->tcpsport, new->lport);
1798 hnputs(h4->tcpdport, new->rport);
1799 v6tov4(h4->tcpsrc, dst);
1800 v6tov4(h4->tcpdst, src);
1803 h6 = &tcb->protohdr.tcp6hdr;
1804 memset(h6, 0, sizeof(*h6));
1805 h6->proto = IP_TCPPROTO;
1806 hnputs(h6->tcpsport, new->lport);
1807 hnputs(h6->tcpdport, new->rport);
1808 ipmove(h6->tcpsrc, dst);
1809 ipmove(h6->tcpdst, src);
1812 panic("tcpincoming: version %d", new->ipversion);
1815 tcpsetstate(new, Established);
1817 iphtadd(&tpriv->ht, new);
1823 seq_within(ulong x, ulong low, ulong high)
1826 if(low <= x && x <= high)
1830 if(x >= low || x <= high)
1837 seq_lt(ulong x, ulong y)
1839 return (int)(x-y) < 0;
1843 seq_le(ulong x, ulong y)
1845 return (int)(x-y) <= 0;
1849 seq_gt(ulong x, ulong y)
1851 return (int)(x-y) > 0;
1855 seq_ge(ulong x, ulong y)
1857 return (int)(x-y) >= 0;
1861 * use the time between the first SYN and its ack as the
1862 * initial round trip time
1865 tcpsynackrtt(Conv *s)
1871 tcb = (Tcpctl*)s->ptcl;
1874 delta = NOW - tcb->sndsyntime;
1875 tcb->srtt = delta<<LOGAGAIN;
1876 tcb->mdev = delta<<LOGDGAIN;
1878 /* halt round trip timer */
1879 tcphalt(tpriv, &tcb->rtt_timer);
1883 update(Conv *s, Tcp *seg)
1895 tcb = (Tcpctl*)s->ptcl;
1897 /* catch zero-window updates, update window & recover */
1898 if(tcb->snd.wnd == 0 && seg->wnd > 0)
1899 if(seq_lt(seg->ack, tcb->snd.ptr)){
1900 netlog(s->p->f, Logtcp, "tcp: zwu ack %lud una %lud ptr %lud win %lud\n",
1901 seg->ack, tcb->snd.una, tcb->snd.ptr, seg->wnd);
1902 tcb->snd.wnd = seg->wnd;
1906 /* newreno fast retransmit */
1907 if(seg->ack == tcb->snd.una)
1908 if(tcb->snd.una != tcb->snd.nxt)
1909 if(++tcb->snd.dupacks == 3){
1911 if(tcb->snd.recovery){
1912 tpriv->stats[RecoveryCwind]++;
1913 tcb->cwind += tcb->mss;
1914 }else if(seq_le(tcb->snd.rxt, seg->ack)){
1915 tpriv->stats[Recovery]++;
1917 tcb->snd.recovery = 1;
1918 tcb->snd.partialack = 0;
1919 tcb->snd.rxt = tcb->snd.nxt;
1921 tcb->cwind = tcb->ssthresh + 3*tcb->mss;
1922 netlog(s->p->f, Logtcpwin, "recovery inflate %ld ss %ld @%lud\n",
1923 tcb->cwind, tcb->ssthresh, tcb->snd.rxt);
1926 tpriv->stats[RecoveryNoSeq]++;
1927 netlog(s->p->f, Logtcpwin, "!recov %lud not ≤ %lud %ld\n",
1928 tcb->snd.rxt, seg->ack, tcb->snd.rxt - seg->ack);
1929 /* do not enter fast retransmit */
1930 /* do not change ssthresh */
1932 }else if(tcb->snd.recovery){
1933 tpriv->stats[RecoveryCwind]++;
1934 tcb->cwind += tcb->mss;
1940 if(seq_gt(seg->ack, tcb->snd.wl2)
1941 || (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)){
1942 /* clear dupack if we advance wl2 */
1943 if(tcb->snd.wl2 != seg->ack)
1944 tcb->snd.dupacks = 0;
1945 tcb->snd.wnd = seg->wnd;
1946 tcb->snd.wl2 = seg->ack;
1949 if(!seq_gt(seg->ack, tcb->snd.una)){
1951 * don't let us hangup if sending into a closed window and
1952 * we're still getting acks
1954 if((tcb->flags&RETRAN) && tcb->snd.wnd == 0)
1955 tcb->backedoff = MAXBACKMS/4;
1959 /* Compute the new send window size */
1960 acked = seg->ack - tcb->snd.una;
1962 /* avoid slow start and timers for SYN acks */
1963 if((tcb->flags & SYNACK) == 0) {
1964 tcb->flags |= SYNACK;
1971 * congestion control
1973 if(tcb->snd.recovery){
1974 if(seq_ge(seg->ack, tcb->snd.rxt)){
1975 /* recovery finished; deflate window */
1976 tpriv->stats[RecoveryDone]++;
1977 tcb->snd.dupacks = 0;
1978 tcb->snd.recovery = 0;
1979 tcb->cwind = (tcb->snd.nxt - tcb->snd.una) + tcb->mss;
1980 if(tcb->ssthresh < tcb->cwind)
1981 tcb->cwind = tcb->ssthresh;
1982 netlog(s->p->f, Logtcpwin, "recovery deflate %ld %ld\n",
1983 tcb->cwind, tcb->ssthresh);
1985 /* partial ack; we lost more than one segment */
1986 tpriv->stats[RecoveryPA]++;
1987 if(tcb->cwind > acked)
1988 tcb->cwind -= acked;
1990 netlog(s->p->f, Logtcpwin, "partial ack neg\n");
1991 tcb->cwind = tcb->mss;
1993 netlog(s->p->f, Logtcpwin, "partial ack %ld left %ld cwind %ld\n",
1994 acked, tcb->snd.rxt - seg->ack, tcb->cwind);
1996 if(acked >= tcb->mss)
1997 tcb->cwind += tcb->mss;
1998 tcb->snd.partialack++;
2001 tcpabcincr(tcb, acked);
2003 /* Adjust the timers according to the round trip time */
2004 /* todo: fix sloppy treatment of overflow cases here. */
2005 if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
2006 tcphalt(tpriv, &tcb->rtt_timer);
2007 if((tcb->flags&RETRAN) == 0) {
2010 rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
2012 rtt = 1; /* otherwise all close systems will rexmit in 0 time */
2014 if(tcb->srtt == 0) {
2015 tcb->srtt = rtt << LOGAGAIN;
2016 tcb->mdev = rtt << LOGDGAIN;
2018 delta = rtt - (tcb->srtt>>LOGAGAIN);
2023 delta = abs(delta) - (tcb->mdev>>LOGDGAIN);
2033 if(qdiscard(s->wq, acked) < acked)
2035 tcb->snd.una = seg->ack;
2037 /* newreno fast recovery */
2038 if(tcb->snd.recovery)
2041 if(seq_gt(seg->ack, tcb->snd.urg))
2042 tcb->snd.urg = seg->ack;
2044 if(tcb->snd.una != tcb->snd.nxt){
2045 /* “impatient” variant */
2046 if(!tcb->snd.recovery || tcb->snd.partialack == 1){
2048 tcb->timeuna = tcb->snd.una;
2049 tcpgo(tpriv, &tcb->timer);
2053 tcphalt(tpriv, &tcb->timer);
2055 if(seq_lt(tcb->snd.ptr, tcb->snd.una))
2056 tcb->snd.ptr = tcb->snd.una;
2058 if(!tcb->snd.recovery)
2059 tcb->flags &= ~RETRAN;
/*
 * tcpiput: input path for one received TCP segment held in bp.
 *
 * Only part of the routine is visible here; the notes describe the
 * statements that are shown.  In order, the visible code:
 *  - counts the segment (InSegs) and views the start of the block as
 *    both a Tcp4hdr and a Tcp6hdr;
 *  - for either IP version, extracts addresses and length, verifies
 *    the checksum with ptclcsum, converts the header (ntohtcp4 or
 *    ntohtcp6) and trims the block with trimblock, bumping the
 *    CsumErrs/HlenErrs/LenErrs and InErrs counters next to the
 *    corresponding error logs;
 *  - looks the conversation up with iphtlook and answers with sndrst
 *    ("no conversation") when none matches;
 *  - for a conversation in Listen, passes RSTs to limborst, new SYNs
 *    (SYN without ACK) to limbo, and anything else to tcpincoming;
 *  - scales the advertised window by rcv.scale, refreshes the keep
 *    alive counter, and dispatches on tcb->state;
 *  - trims the segment to the receive window (tcptrim), queues
 *    out-of-order segments with addreseq, and then processes RST,
 *    ACK, URG, data and FIN, pulling adjacent segments back out of
 *    the resequence queue with getreseq.
 */
2065 tcpiput(Proto *tcp, Ipifc*, Block *bp)
2072 ushort length, csum;
2073 uchar source[IPaddrlen], dest[IPaddrlen];
2082 tpriv->stats[InSegs]++;
2084 h4 = (Tcp4hdr*)(bp->rp);
2085 h6 = (Tcp6hdr*)(bp->rp);
/* the high nibble of the first header byte selects the IPv4 path */
2087 if((h4->vihl&0xF0)==IP_VER4) {
2089 length = nhgets(h4->length);
2090 v4tov6(dest, h4->tcpdst);
2091 v4tov6(source, h4->tcpsrc);
/*
 * store the TCP length in the header, then verify the checksum unless
 * the block has Btcpck set or the checksum field is all zero
 */
2094 hnputs(h4->tcplen, length-TCP4_PKT);
2095 if(!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
2096 ptclcsum(bp, TCP4_IPLEN, length-TCP4_IPLEN)) {
2097 tpriv->stats[CsumErrs]++;
2098 tpriv->stats[InErrs]++;
2099 netlog(f, Logtcp, "bad tcp proto cksum\n");
2104 hdrlen = ntohtcp4(&seg, &bp);
2106 tpriv->stats[HlenErrs]++;
2107 tpriv->stats[InErrs]++;
2108 netlog(f, Logtcp, "bad tcp hdr len\n");
2112 /* trim the packet to the size claimed by the datagram */
2113 length -= hdrlen+TCP4_PKT;
2114 bp = trimblock(bp, hdrlen+TCP4_PKT, length);
2116 tpriv->stats[LenErrs]++;
2117 tpriv->stats[InErrs]++;
2118 netlog(f, Logtcp, "tcp len < 0 after trim\n");
2124 int proto = h6->proto;
2127 length = nhgets(h6->ploadlen);
2128 ipmove(dest, h6->tcpdst);
2129 ipmove(source, h6->tcpsrc);
/*
 * zero ploadlen and proto and put the length in vcf while the v6
 * checksum is computed; ploadlen is written back further down
 */
2131 h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
2133 hnputl(h6->vcf, length);
2134 if((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
2135 (csum = ptclcsum(bp, TCP6_IPLEN, length+TCP6_PHDRSIZE)) != 0) {
2136 tpriv->stats[CsumErrs]++;
2137 tpriv->stats[InErrs]++;
2139 "bad tcpv6 proto cksum: got %#ux, computed %#ux\n",
2140 h6->tcpcksum[0]<<8 | h6->tcpcksum[1], csum);
2146 hnputs(h6->ploadlen, length);
2148 hdrlen = ntohtcp6(&seg, &bp);
2150 tpriv->stats[HlenErrs]++;
2151 tpriv->stats[InErrs]++;
2152 netlog(f, Logtcp, "bad tcpv6 hdr len\n");
2156 /* trim the packet to the size claimed by the datagram */
2158 bp = trimblock(bp, hdrlen+TCP6_PKT, length);
2160 tpriv->stats[LenErrs]++;
2161 tpriv->stats[InErrs]++;
2162 netlog(f, Logtcp, "tcpv6 len < 0 after trim\n");
2167 /* lock protocol while searching for a conversation */
2170 /* Look for a matching conversation */
2171 s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2173 netlog(f, Logtcp, "iphtlook(src %I!%d, dst %I!%d) failed\n",
2174 source, seg.source, dest, seg.dest);
2177 sndrst(tcp, source, dest, length, &seg, version, "no conversation");
2182 /* if it's a listener, look for the right flags and get a new conv */
2183 tcb = (Tcpctl*)s->ptcl;
2184 if(tcb->state == Listen){
2185 if(seg.flags & RST){
2186 limborst(s, &seg, source, dest, version);
2192 /* if this is a new SYN, put the call into limbo */
2193 if((seg.flags & SYN) && (seg.flags & ACK) == 0){
2194 limbo(s, source, dest, &seg, version);
2201 * if there's a matching call in limbo, tcpincoming will
2202 * return it in state Syn_received
2204 s = tcpincoming(s, &seg, source, dest, version);
2209 /* The rest of the input state machine is run with the control block
2210 * locked and implements the state machine directly out of the RFC.
2211 * Out-of-band data is ignored - it was always a bad idea.
2213 tcb = (Tcpctl*)s->ptcl;
2222 seg.wnd <<= tcb->rcv.scale;
2224 /* every input packet puts off the keep alive time out */
2225 tcpsetkacounter(tcb);
2227 switch(tcb->state) {
2229 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2232 if(seg.flags & ACK) {
2233 if(!seq_within(seg.ack, tcb->iss+1, tcb->snd.nxt)) {
2234 sndrst(tcp, source, dest, length, &seg, version,
2235 "bad seq in Syn_sent");
2239 if(seg.flags & RST) {
2241 localclose(s, Econrefused);
2245 if(seg.flags & SYN) {
2247 if(seg.flags & ACK){
2250 tcpsetstate(s, Established);
2251 tcpsetscale(s, tcb, seg.ws, tcb->scale);
2255 tcpsetstate(s, Syn_received); /* DLP - shouldn't this be a reset? */
2258 if(length != 0 || (seg.flags & FIN))
2271 /* doesn't matter if it's the correct ack, we're just trying to set timing */
2278 * One DOS attack is to open connections to us and then forget about them,
2279 * thereby tying up a conv at no long term cost to the attacker.
2280 * This is an attempt to defeat these stateless DOS attacks. See
2281 * corresponding code in tcpsendka().
2283 if(tcb->state != Syn_received && (seg.flags & RST) == 0){
2284 if(tcpporthogdefense
2285 && seq_within(seg.ack, tcb->snd.una-(1<<31), tcb->snd.una-(1<<29))){
2286 print("stateless hog %I.%d->%I.%d f %ux %lux - %lux - %lux\n",
2287 source, seg.source, dest, seg.dest, seg.flags,
2288 tcb->snd.una-(1<<31), seg.ack, tcb->snd.una-(1<<29));
2289 localclose(s, "stateless hog");
2293 /* Cut the data to fit the receive window */
2295 if(tcptrim(tcb, &seg, &bp, &length) == -1) {
2296 if(seg.seq+1 != tcb->rcv.nxt || length != 1)
2297 netlog(f, Logtcp, "tcp: trim: !inwind: seq %lud-%lud win %lud-%lud l %d from %I\n",
2298 seg.seq, seg.seq + length - 1,
2299 tcb->rcv.nxt, tcb->rcv.nxt + tcb->rcv.wnd-1, length, s->raddr);
/*
 * write queue and flgcnt both empty while in Closing: stop the rtt,
 * ack and keep alive timers and move to Time_wait, arming the main
 * timer with MSL2 converted to ticks
 */
2301 if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state == Closing) {
2302 tcphalt(tpriv, &tcb->rtt_timer);
2303 tcphalt(tpriv, &tcb->acktimer);
2304 tcphalt(tpriv, &tcb->katimer);
2305 tcpsetstate(s, Time_wait);
2306 tcb->timer.start = MSL2*(1000 / MSPTICK);
2307 tcpgo(tpriv, &tcb->timer);
2309 if(!(seg.flags & RST)) {
2310 tcb->flags |= FORCE;
2318 /* Cannot accept so answer with a rst */
2319 if(length && tcb->state == Closed) {
2320 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2324 /* The segment is beyond the current receive pointer so
2325 * queue the data in the resequence queue
2327 if(seg.seq != tcb->rcv.nxt)
2328 if(length != 0 || (seg.flags & (SYN|FIN))) {
2330 if(addreseq(f, tcb, tpriv, &seg, bp, length) < 0)
2331 print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr, s->lport);
2332 tcb->flags |= FORCE; /* force duplicate ack; RFC 5681 §3.2 */
2337 tcb->flags |= FORCE; /* filled hole in sequence space; RFC 5681 §3.2 */
2340 * keep looping till we've processed this packet plus any
2341 * adjacent packets in the resequence queue
2344 if(seg.flags & RST) {
2345 if(tcb->state == Established) {
2346 tpriv->stats[EstabResets]++;
2347 if(tcb->rcv.nxt != seg.seq)
2348 print("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt %lux seq %lux\n", s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt, seg.seq);
2350 localclose(s, Econrefused);
2354 if((seg.flags&ACK) == 0)
2357 switch(tcb->state) {
2359 if(!seq_within(seg.ack, tcb->snd.una+1, tcb->snd.nxt)){
2360 sndrst(tcp, source, dest, length, &seg, version,
2361 "bad seq in Syn_received");
2365 tcpsetstate(s, Established);
2372 if(qlen(s->wq)+tcb->flgcnt == 0){
2373 tcphalt(tpriv, &tcb->rtt_timer);
2374 tcphalt(tpriv, &tcb->acktimer);
2375 tcpsetkacounter(tcb);
2377 tcpsetstate(s, Finwait2);
2378 tcb->katimer.start = MSL2 * (1000 / MSPTICK);
2379 tcpgo(tpriv, &tcb->katimer);
2387 if(qlen(s->wq)+tcb->flgcnt == 0) {
2388 tcphalt(tpriv, &tcb->rtt_timer);
2389 tcphalt(tpriv, &tcb->acktimer);
2390 tcphalt(tpriv, &tcb->katimer);
2391 tcpsetstate(s, Time_wait);
2392 tcb->timer.start = MSL2*(1000 / MSPTICK);
2393 tcpgo(tpriv, &tcb->timer);
2398 if(qlen(s->wq)+tcb->flgcnt == 0) {
2403 tcb->flags |= FORCE;
2404 if(tcb->timer.state != TcptimerON)
2405 tcpgo(tpriv, &tcb->timer);
/*
 * urgent pointer present: advance rcv.urg when seg.urg+seg.seq is
 * beyond it and call pullblock with seg.urg; rcv.urg is never left
 * behind rcv.nxt
 */
2408 if((seg.flags&URG) && seg.urg) {
2409 if(seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
2410 tcb->rcv.urg = seg.urg + seg.seq;
2411 pullblock(&bp, seg.urg);
2415 if(seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
2416 tcb->rcv.urg = tcb->rcv.nxt;
2425 /* Ignore segment text */
2433 /* If we still have some data place on
2437 qpassnolim(s->rq, packblock(bp));
2440 tcb->rcv.nxt += length;
2443 * turn on the acktimer if there's something
2446 if(tcb->acktimer.state != TcptimerON)
2447 tcpgo(tpriv, &tcb->acktimer);
2451 /* no process to read the data, send a reset */
2454 sndrst(tcp, source, dest, length, &seg, version,
2455 "send to Finwait2");
2462 if(seg.flags & FIN) {
2463 tcb->flags |= FORCE;
2465 switch(tcb->state) {
2469 tcpsetstate(s, Close_wait);
2473 if(qlen(s->wq)+tcb->flgcnt == 0) {
2474 tcphalt(tpriv, &tcb->rtt_timer);
2475 tcphalt(tpriv, &tcb->acktimer);
2476 tcphalt(tpriv, &tcb->katimer);
2477 tcpsetstate(s, Time_wait);
2478 tcb->timer.start = MSL2*(1000/MSPTICK);
2479 tcpgo(tpriv, &tcb->timer);
2482 tcpsetstate(s, Closing);
2486 tcphalt(tpriv, &tcb->rtt_timer);
2487 tcphalt(tpriv, &tcb->acktimer);
2488 tcphalt(tpriv, &tcb->katimer);
2489 tcpsetstate(s, Time_wait);
2490 tcb->timer.start = MSL2 * (1000/MSPTICK);
2491 tcpgo(tpriv, &tcb->timer);
2498 tcpgo(tpriv, &tcb->timer);
2504 * get next adjacent segment from the resequence queue.
2505 * dump/trim any overlapping segments
2508 if(tcb->reseq == nil)
2511 if(seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
2514 getreseq(tcb, &seg, &bp, &length);
2517 if(tcptrim(tcb, &seg, &bp, &length) == 0){
2518 tcb->flags |= FORCE;
2536 * always enters and exits with the s locked. We drop
2537 * the lock to ipoput the packet so some care has to be
2548 ulong ssize, dsize, sent;
2555 version = s->ipversion;
2557 tcb = (Tcpctl*)s->ptcl;
2559 /* force ack every 2*mss */
2560 if((tcb->flags & FORCE) == 0)
2561 if(tcb->rcv.nxt - tcb->rcv.ackptr >= 2*tcb->mss){
2562 tpriv->stats[Delayack]++;
2563 tcb->flags |= FORCE;
2566 /* force ack if window opening */
2568 if((tcb->flags & FORCE) == 0){
2570 if((int)(tcb->rcv.wptr - tcb->rcv.wsnt) >= 2*tcb->mss){
2571 tpriv->stats[Wopenack]++;
2572 tcb->flags |= FORCE;
2576 for(msgs = 0; msgs < 100; msgs++) {
2577 switch(tcb->state) {
2584 /* Don't send anything else until our SYN has been acked */
2585 if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
2588 /* force an ack when a window has opened up */
2590 if(tcb->rcv.blocked && tcb->rcv.wnd > 0){
2591 tcb->rcv.blocked = 0;
2592 tcb->flags |= FORCE;
2595 sndcnt = qlen(s->wq)+tcb->flgcnt;
2596 sent = tcb->snd.ptr - tcb->snd.una;
2598 if(tcb->snd.wnd == 0){
2599 /* zero window probe */
2601 if(!(tcb->flags & FORCE))
2602 break; /* already probing, rto re-probes */
2611 /* calculate usable segment size */
2612 if(ssize > tcb->cwind)
2614 if(ssize > tcb->snd.wnd)
2615 ssize = tcb->snd.wnd;
2621 if(ssize > tcb->mss)
2629 if(!(tcb->flags & FORCE)){
2632 if(ssize < tcb->mss)
2633 if(tcb->snd.nxt == tcb->snd.ptr)
2634 if(sent > TCPREXMTTHRESH*tcb->mss)
2638 tcb->flags &= ~FORCE;
2640 /* By default we will generate an ack */
2641 tcphalt(tpriv, &tcb->acktimer);
2642 seg.source = s->lport;
2643 seg.dest = s->rport;
2651 if(tcb->snd.ptr == tcb->iss){
2655 seg.ws = tcb->scale;
2660 * don't send any data with a SYN/ACK packet
2661 * because Linux rejects the packet in its
2662 * attempt to solve the SYN attack problem
2664 if(tcb->snd.ptr == tcb->iss){
2669 seg.ws = tcb->scale;
2673 seg.seq = tcb->snd.ptr;
2674 seg.ack = tcb->rcv.nxt;
2675 seg.wnd = tcb->rcv.wnd;
2677 /* Pull out data to send */
2680 bp = qcopy(s->wq, dsize, sent);
2681 if(BLEN(bp) != dsize) {
2687 if(sent+dsize == sndcnt && dsize)
2690 tcb->snd.ptr += ssize;
2692 /* Pull up the send pointer so we can accept acks
2695 if(seq_gt(tcb->snd.ptr,tcb->snd.nxt))
2696 tcb->snd.nxt = tcb->snd.ptr;
2698 /* Build header, link data and compute cksum */
2701 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2702 hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
2709 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2710 hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
2717 hbp = nil; /* to suppress a warning */
2718 panic("tcpoutput: version %d", version);
2721 /* Start the transmission timers if there is new data and we
2722 * expect acknowledges
2725 if(tcb->timer.state != TcptimerON){
2727 tcb->timeuna = tcb->snd.una;
2728 tcpgo(tpriv, &tcb->timer);
2731 /* If round trip timer isn't running, start it.
2732 * measure the longest packet only in case the
2733 * transmission time dominates RTT
2735 if(tcb->snd.retransmit == 0)
2736 if(tcb->rtt_timer.state != TcptimerON)
2737 if(ssize == tcb->mss) {
2738 tcpgo(tpriv, &tcb->rtt_timer);
2739 tcb->rttseq = tcb->snd.ptr;
2743 tpriv->stats[OutSegs]++;
2744 if(tcb->snd.retransmit)
2745 tpriv->stats[RetransSegsSent]++;
2746 tcb->rcv.ackptr = seg.ack;
2747 tcb->rcv.wsnt = tcb->rcv.wptr;
2749 /* put off the next keep alive */
2750 tcpgo(tpriv, &tcb->katimer);
2754 if(ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0){
2755 /* a negative return means no route */
2756 localclose(s, "no route");
2760 if(ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0){
2761 /* a negative return means no route */
2762 localclose(s, "no route");
2766 panic("tcpoutput2: version %d", version);
2776 * the BSD convention (hack?) for keep alives. resend last uchar acked.
2785 tcb = (Tcpctl*)s->ptcl;
2788 memset(&seg, 0, sizeof seg);
2790 seg.source = s->lport;
2791 seg.dest = s->rport;
2792 seg.flags = ACK|PSH;
2795 if(tcpporthogdefense)
2796 seg.seq = tcb->snd.una-(1<<30)-nrand(1<<20);
2798 seg.seq = tcb->snd.una-1;
2799 seg.ack = tcb->rcv.nxt;
2800 tcb->rcv.ackptr = seg.ack;
2802 seg.wnd = tcb->rcv.wnd;
2803 if(tcb->state == Finwait2){
2810 if(isv4(s->raddr)) {
2811 /* Build header, link data and compute cksum */
2812 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2813 hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
2818 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
2821 /* Build header, link data and compute cksum */
2822 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2823 hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
2828 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
2833 * set connection to time out after 12 minutes
/*
 * tcpsetkacounter: recompute tcb->kacounter as 12 minutes (in ms)
 * divided by the keep alive interval, katimer.start ticks of MSPTICK
 * ms each.  A result below 3 is special-cased by the final test; the
 * statement it guards is not shown here.
 * NOTE(review): katimer.start must be non-zero or the division traps;
 * confirm every caller guarantees that.
 */
2836 tcpsetkacounter(Tcpctl *tcb)
2838 tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start*MSPTICK);
2839 if(tcb->kacounter < 3)
2844 * if we've timed out, close the connection
2845 * otherwise, send a keepalive and restart the timer
/*
 * tcpkeepalive: keep alive timer routine.  v is an opaque argument;
 * presumably the Conv whose timer fired (the conversion to s is not
 * shown - TODO confirm).
 * For a conversation that is not Closed, kacounter is decremented and,
 * once it reaches zero or below, the conversation is closed with
 * Etimedout.  The visible tcpgo call restarts katimer.
 */
2848 tcpkeepalive(void *v)
2854 tcb = (Tcpctl*)s->ptcl;
2860 if(tcb->state != Closed){
2861 if(--(tcb->kacounter) <= 0) {
2862 localclose(s, Etimedout);
2865 tcpgo(s->p->priv, &tcb->katimer);
2873 * start keepalive timer
/*
 * tcpstartka: handler for the "keepalive" control request (tcpctl
 * passes its field vector f and count n straight through).
 * Fails with an error string unless the connection is Established.
 * Otherwise the keep alive interval katimer.start is set to x/MSPTICK
 * ticks, kacounter is recomputed and katimer is started.
 * NOTE(review): how x is obtained is not visible here; presumably it
 * is parsed from f[1] in milliseconds - confirm.
 */
2876 tcpstartka(Conv *s, char **f, int n)
2881 tcb = (Tcpctl*)s->ptcl;
2882 if(tcb->state != Established)
2883 return "connection must be in Establised state";
2887 tcb->katimer.start = x/MSPTICK;
2889 tcpsetkacounter(tcb);
2890 tcpgo(s->p->priv, &tcb->katimer);
2896 * turn checksums on/off
/*
 * tcpsetchecksum: handler for the "checksum" control request.
 * f[1] is parsed with atoi; zero sets tcb->nochecksum, any non-zero
 * value clears it.
 * The field count (third parameter) is unnamed and therefore never
 * checked, so the caller must guarantee that f[1] exists.
 */
2899 tcpsetchecksum(Conv *s, char **f, int)
2903 tcb = (Tcpctl*)s->ptcl;
2904 tcb->nochecksum = !atoi(f[1]);
2910 * retransmit (at most) one segment at snd.una.
2911 * preserve cwind & snd.ptr
2920 tcb = (Tcpctl*)s->ptcl;
2921 tcb->flags |= RETRAN|FORCE;
2923 tptr = tcb->snd.ptr;
2924 tcwind = tcb->cwind;
2925 tcb->snd.ptr = tcb->snd.una;
2926 tcb->cwind = tcb->mss;
2927 tcb->snd.retransmit = 1;
2929 tcb->snd.retransmit = 0;
2930 tcb->cwind = tcwind;
2931 tcb->snd.ptr = tptr;
2934 tpriv->stats[RetransSegs]++;
2938 * todo: RFC 4138 F-RTO
/*
 * tcptimeout: retransmission timer routine.  arg is opaque here;
 * presumably the Conv (conversion not shown - TODO confirm).
 *
 * Visible behaviour:
 *  - the give-up limit maxback is MAXBACKMS/2 in Syn_sent and
 *    MAXBACKMS otherwise;
 *  - backedoff accumulates the expired interval (timer.start ticks of
 *    MSPTICK ms); reaching maxback closes the conversation with
 *    Etimedout;
 *  - otherwise the event is logged, snd.ptr is pulled back to snd.una
 *    and cwind is reduced to one mss, and RetransTimeouts is counted;
 *  - if fast recovery was in progress it is abandoned: dupacks and
 *    recovery are cleared, RecoveryRTO is counted and snd.rxt is set
 *    to snd.nxt.
 * NOTE(review): the enclosing control flow (e.g. a switch on state)
 * is not visible, so the exact conditions for each step need
 * confirming against the full routine.
 */
2941 tcptimeout(void *arg)
2950 tcb = (Tcpctl*)s->ptcl;
2960 if(tcb->state == Syn_sent)
2961 maxback = MAXBACKMS/2;
2963 maxback = MAXBACKMS;
2964 tcb->backedoff += tcb->timer.start * MSPTICK;
2965 if(tcb->backedoff >= maxback) {
2966 localclose(s, Etimedout);
2969 netlog(s->p->f, Logtcprxmt, "rxm %d/%d %ldms %lud rto %d %lud %s\n",
2970 tcb->srtt, tcb->mdev, NOW-tcb->time,
2971 tcb->snd.una-tcb->timeuna, tcb->snd.rto, tcb->snd.ptr,
2972 tcpstates[s->state]);
2974 if(tcb->snd.rto == 0)
2977 tcb->snd.ptr = tcb->snd.una;
2978 tcb->cwind = tcb->mss;
2980 tpriv->stats[RetransTimeouts]++;
2982 if(tcb->snd.recovery){
2983 tcb->snd.dupacks = 0; /* reno rto */
2984 tcb->snd.recovery = 0;
2985 tpriv->stats[RecoveryRTO]++;
2986 tcb->snd.rxt = tcb->snd.nxt;
2987 netlog(s->p->f, Logtcpwin,
2988 "rto recovery rxt @%lud\n", tcb->snd.nxt);
/*
 * inwindow: report whether sequence number seq falls in the current
 * receive window, i.e. between rcv.nxt and rcv.nxt+rcv.wnd-1 as
 * judged by seq_within.
 */
3004 inwindow(Tcpctl *tcb, int seq)
3006 return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt+tcb->rcv.wnd-1);
3010 * set up state for a received SYN (or SYN ACK) packet
/*
 * procsyn: initialise receive-side state from a SYN (or SYN ACK)
 * segment seg on conversation s.
 *  - sets FORCE so a reply is generated;
 *  - records the peer's initial sequence number in irs and starts
 *    rcv.nxt, rcv.wptr and rcv.urg at seq+1;
 *  - lowers our mss to the peer's when the peer announced a smaller,
 *    non-zero value, and publishes the mss in stats[Mss];
 *  - takes the peer's advertised window as snd.wnd.
 */
3013 procsyn(Conv *s, Tcp *seg)
3018 tcb = (Tcpctl*)s->ptcl;
3019 tcb->flags |= FORCE;
3021 tcb->rcv.nxt = seg->seq + 1;
3022 tcb->rcv.wptr = tcb->rcv.nxt;
3024 tcb->rcv.urg = tcb->rcv.nxt;
3025 tcb->irs = seg->seq;
3027 /* our sending max segment size cannot be bigger than what he asked for */
3028 if(seg->mss != 0 && seg->mss < tcb->mss) {
3029 tcb->mss = seg->mss;
3031 tpriv->stats[Mss] = tcb->mss;
3034 tcb->snd.wnd = seg->wnd;
/*
 * dumpreseq: walk the whole resequence queue tcb->reseq, advancing
 * through a separately held next pointer (the loop body is not
 * shown).  addreseq returns this routine's result when the queue
 * exceeds its limits.
 * NOTE(review): presumably each entry is freed and the queue emptied -
 * confirm against the loop body.
 */
3039 dumpreseq(Tcpctl *tcb)
3043 for(r = tcb->reseq; r != nil; r = next){
/*
 * logreseq: write one Logtcp line per entry of the resequence list
 * starting at r.  n is the sequence number expected next; for each
 * entry the log shows n, the entry's seq, their difference and the
 * segment flags, after which n moves to the end of that entry
 * (seg.seq + seg.len).  The label s printed first depends on whether
 * the entry is last in the list and whether its seq equals n; the
 * label strings themselves are not shown here.
 */
3055 logreseq(Fs *f, Reseq *r, ulong n)
3059 for(; r != nil; r = r->next){
3061 if(r->next == nil && r->seg.seq != n)
3063 else if(r->next == nil)
3065 else if(r->seg.seq != n)
3068 netlog(f, Logtcp, "%s %lud-%lud (%ld) %#ux\n", s,
3069 n, r->seg.seq, r->seg.seq-n, r->seg.flags);
3070 n = r->seg.seq + r->seg.len;
/*
 * addreseq: queue an out-of-order segment (header seg, data bp,
 * length bytes) on tcb's resequence list.
 *
 * A Reseq is malloc'ed for the segment; bp is always consumed, even
 * on the freeblist path.  The entry is linked in ascending
 * sequence-number order and reseqlen grows by length; the
 * Resequenced and OutOfOrder counters are bumped in the insertion
 * code.
 * Two limits protect the queue: total bytes (reseqlen > qmax) and
 * packet count (nreseq > window/mss).  Exceeding either logs the
 * queue, counts ReseqBytelim or ReseqPktlim, and returns the result
 * of dumpreseq.  tcpiput treats a negative return as a failure worth
 * printing.
 */
3075 addreseq(Fs *f, Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length)
3080 rp = malloc(sizeof(Reseq));
3082 freeblist(bp); /* bp always consumed by addreseq */
3088 rp->length = length;
3090 tcb->reseqlen += length;
3093 /* Place on reassembly list sorting by starting seq number */
3094 for(rr = &tcb->reseq;; rr = &(*rr)->next)
3095 if(*rr == nil || seq_lt(seg->seq, (*rr)->seg.seq)){
3098 tpriv->stats[Resequenced]++;
3100 tpriv->stats[OutOfOrder]++;
3105 if(tcb->reseqlen > qmax){
3106 netlog(f, Logtcp, "tcp: reseq: queue > window: %d > %d; %d packets\n", tcb->reseqlen, qmax, tcb->nreseq);
3107 logreseq(f, tcb->reseq, tcb->rcv.nxt);
3108 tpriv->stats[ReseqBytelim]++;
3109 return dumpreseq(tcb);
3111 qmax = tcb->window / tcb->mss; /* ~190 for qscale==2, 390 for qscale=3 */
3112 if(tcb->nreseq > qmax){
3113 netlog(f, Logtcp, "resequence queue > packets: %d %d; %d bytes\n", tcb->nreseq, qmax, tcb->reseqlen);
3114 logreseq(f, tcb->reseq, tcb->rcv.nxt);
3115 tpriv->stats[ReseqPktlim]++;
3116 return dumpreseq(tcb);
/*
 * getreseq: remove the first entry rp from tcb's resequence queue and
 * hand it back through the out parameters.  The visible code unlinks
 * the head, stores the entry's length in *length and subtracts it
 * from reseqlen; presumably *seg and *bp are filled from the entry as
 * well (those lines are not shown - confirm).
 */
3123 getreseq(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
3131 tcb->reseq = rp->next;
3135 *length = rp->length;
3138 tcb->reseqlen -= rp->length;
/*
 * tcptrim: fit an incoming segment to the receive window, adjusting
 * *seg, *bp and *length in place.  tcpiput compares the result with
 * -1 (segment rejected) and 0.
 *
 * Visible steps:
 *  - the SYN and FIN flags are examined while computing len (what
 *    they do is not shown; presumably each adds one to len - confirm);
 *  - with a zero receive window only an empty segment exactly at
 *    rcv.nxt is considered;
 *  - otherwise some part of the segment must be in the window, tested
 *    with inwindow on its first and last sequence numbers, or the
 *    segment must span rcv.nxt;
 *  - data before rcv.nxt (dupcnt bytes) is counted in rerecv and
 *    removed with pullblock, with seg->urg compared against dupcnt;
 *  - data past the right edge of the window (excess) is counted in
 *    rerecv and cut off with trimblock.
 */
3144 tcptrim(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
3152 if(seg->flags & SYN)
3154 if(seg->flags & FIN)
3157 if(tcb->rcv.wnd == 0) {
3158 if(len == 0 && seg->seq == tcb->rcv.nxt)
3162 /* Some part of the segment should be in the window */
3163 if(inwindow(tcb,seg->seq))
3167 if(inwindow(tcb, seg->seq+len-1) ||
3168 seq_within(tcb->rcv.nxt, seg->seq,seg->seq+len-1))
3176 dupcnt = tcb->rcv.nxt - seg->seq;
3178 tcb->rerecv += dupcnt;
3179 if(seg->flags & SYN){
3190 pullblock(bp, (ushort)dupcnt);
3194 if(seg->urg > dupcnt)
3202 excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
3204 tcb->rerecv += excess;
3206 *bp = trimblock(*bp, 0, *length);
3208 panic("presotto is a boofhead");
/*
 * tcpadvise: find the conversation that a header in bp refers to so
 * that msg can be delivered to it (what is done with a match is not
 * shown here).
 *
 * Addresses and ports are read from the v4 or v6 header at bp->rp,
 * chosen by the version nibble.  A conversation matches when it is
 * not Closed and its remote port/address equal the header's
 * destination and its local port/address equal the header's source.
 * NOTE(review): that reversed comparison suggests bp holds a packet
 * we originally sent (e.g. quoted back in an error report) - confirm
 * with the callers.
 */
3215 tcpadvise(Proto *tcp, Block *bp, char *msg)
3220 uchar source[IPaddrlen];
3221 uchar dest[IPaddrlen];
3222 ushort psource, pdest;
3225 h4 = (Tcp4hdr*)(bp->rp);
3226 h6 = (Tcp6hdr*)(bp->rp);
3228 if((h4->vihl&0xF0)==IP_VER4) {
3229 v4tov6(dest, h4->tcpdst);
3230 v4tov6(source, h4->tcpsrc);
3231 psource = nhgets(h4->tcpsport);
3232 pdest = nhgets(h4->tcpdport);
3235 ipmove(dest, h6->tcpdst);
3236 ipmove(source, h6->tcpsrc);
3237 psource = nhgets(h6->tcpsport);
3238 pdest = nhgets(h6->tcpdport);
3241 /* Look for a connection */
3243 for(p = tcp->conv; *p; p++) {
3245 tcb = (Tcpctl*)s->ptcl;
3246 if(s->rport == pdest)
3247 if(s->lport == psource)
3248 if(tcb->state != Closed)
3249 if(ipcmp(s->raddr, dest) == 0)
3250 if(ipcmp(s->laddr, source) == 0){
/*
 * tcpporthogdefensectl: switch the global tcpporthogdefense flag.
 * val must be the string "on" (flag becomes 1) or "off" (flag becomes
 * 0); any other value yields the error string below.  val is passed
 * to strcmp unchecked, so it must be a valid string.
 */
3270 tcpporthogdefensectl(char *val)
3272 if(strcmp(val, "on") == 0)
3273 tcpporthogdefense = 1;
3274 else if(strcmp(val, "off") == 0)
3275 tcpporthogdefense = 0;
3277 return "unknown value for tcpporthogdefense";
3281 /* called with c qlocked */
3283 tcpctl(Conv* c, char** f, int n)
3285 if(n == 1 && strcmp(f[0], "close") == 0)
3286 return tcpclose(c), nil;
3287 if(n == 1 && strcmp(f[0], "hangup") == 0)
3288 return tcphangup(c);
3289 if(n >= 1 && strcmp(f[0], "keepalive") == 0)
3290 return tcpstartka(c, f, n);
3291 if(n >= 1 && strcmp(f[0], "checksum") == 0)
3292 return tcpsetchecksum(c, f, n);
3293 if(n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0)
3294 return tcpporthogdefensectl(f[1]);
3295 return "unknown control request";
/*
 * tcpstats: format the protocol statistics as text, one
 * "name: value" line per counter, pairing statnames[i] with
 * priv->stats[i] for all Nstats entries.  Output is appended with
 * seprint between p and e.
 * NOTE(review): the setup of p, e and priv is not shown; presumably
 * they derive from buf, len and tcp->priv - confirm.
 */
3299 tcpstats(Proto *tcp, char *buf, int len)
3308 for(i = 0; i < Nstats; i++)
3309 p = seprint(p, e, "%s: %llud\n", statnames[i], priv->stats[i]);
3314 * garbage collect any stale conversations:
3315 * - SYN received but no SYN-ACK after 5 seconds (could be the SYN attack)
3316 * - Finwait2 after 5 minutes
3318 * this is called whenever we run out of channels. Both checks are
3319 * of questionable validity so we try to use them only when we're
3320 * up against the wall.
3325 Conv *c, **pp, **ep;
3331 ep = &tcp->conv[tcp->nc];
3332 for(pp = tcp->conv; pp < ep; pp++) {
3338 tcb = (Tcpctl*)c->ptcl;
3341 if(NOW - tcb->time > 5000){
3342 localclose(c, Etimedout);
3347 if(NOW - tcb->time > 5*60*1000){
3348 localclose(c, Etimedout);
/*
 * tcpsettimer: compute the retransmission timer interval, in ticks,
 * and store it in tcb->timer.start.
 * The base is mdev plus the smoothed round trip (srtt>>LOGAGAIN) plus
 * one tick's worth of ms, scaled by backoff(tcb->backoff) and divided
 * by MSPTICK to convert to ticks.  The result is then clamped; the
 * upper bound visible here is 64000 ms.
 */
3359 tcpsettimer(Tcpctl *tcb)
3363 /* round trip dependency */
3364 x = backoff(tcb->backoff) *
3365 (tcb->mdev + (tcb->srtt>>LOGAGAIN) + MSPTICK) / MSPTICK;
3367 /* bounded twixt 0.3 and 64 seconds */
3370 else if(x > (64000/MSPTICK))
3372 tcb->timer.start = x;
3381 tcp = smalloc(sizeof(Proto));
3382 tpriv = tcp->priv = smalloc(sizeof(Tcppriv));
3384 tcp->connect = tcpconnect;
3385 tcp->announce = tcpannounce;
3387 tcp->state = tcpstate;
3388 tcp->create = tcpcreate;
3389 tcp->close = tcpclose;
3391 tcp->advise = tcpadvise;
3392 tcp->stats = tcpstats;
3393 tcp->inuse = tcpinuse;
3395 tcp->ipproto = IP_TCPPROTO;
3396 tcp->nc = scalednconv();
3397 tcp->ptclsize = sizeof(Tcpctl);
3398 tpriv->stats[MaxConn] = tcp->nc;
/*
 * tcpsetscale: record the negotiated window scales for conversation s
 * and size its queues to match.
 *  rcvscale	stored (low 8 bits) in rcv.scale; also the basis for
 *		qscale, which is capped at Maxqscale
 *  sndscale	stored (low 8 bits) in snd.scale
 * window and ssthresh are both set to QMAX<<qscale, and the read and
 * write queue limits are set to the same value.
 * A log line is emitted when rcvscale did not fit in 8 bits, i.e.
 * when the stored rcv.scale differs from the argument.
 */
3404 tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale)
3407 * guess at reasonable queue sizes. there's no current way
3408 * to know how many nic receive buffers we can safely tie up in the
3409 * tcp stack, and we don't adjust our queues to maximize throughput
3410 * and minimize bufferbloat. n.b. the offer (rcvscale) needs to be
3411 * respected, but we still control our own buffer commitment by
3412 * keeping a seperate qscale.
3414 tcb->rcv.scale = rcvscale & 0xff;
3415 tcb->snd.scale = sndscale & 0xff;
3416 tcb->qscale = rcvscale & 0xff;
/* the cap is tested against the unmasked argument */
3417 if(rcvscale > Maxqscale)
3418 tcb->qscale = Maxqscale;
3420 if(rcvscale != tcb->rcv.scale)
3421 netlog(s->p->f, Logtcp, "tcpsetscale: window %lud qlen %d >> window %ud lport %d\n",
3422 tcb->window, qlen(s->rq), QMAX<<tcb->qscale, s->lport);
3423 tcb->window = QMAX<<tcb->qscale;
3424 tcb->ssthresh = tcb->window;
3427 * it's important to set wq large enough to cover the full
3428 * bandwidth-delay product. it's possible to be in loss
3429 * recovery with a big window, and we need to keep sending
3430 * into the inflated window. the difference can be huge
3431 * for even modest (70ms) ping times.
3433 qsetlimit(s->rq, QMAX<<tcb->qscale);
3434 qsetlimit(s->wq, QMAX<<tcb->qscale);