2 #include "../port/lib.h"
6 #include "../port/error.h"
/*
 * Protocol constants (excerpt): packet sizes, timer values in ms,
 * TCP header flag bits, option lengths, defaults and table sizes.
 */
19 TCP4_PKT = TCP4_IPLEN+TCP4_PHDRSIZE,
25 TCP6_PKT = TCP6_IPLEN+TCP6_PHDRSIZE,
30 MAX_TIME = (1<<20), /* Forever */
31 TCP_ACK = 50, /* Timed ack sequence in ms */
32 MAXBACKMS = 9*60*1000, /* longest backoff time (ms) before hangup */
34 URG = 0x20, /* Data marked urgent */
35 ACK = 0x10, /* Acknowledge is valid */
36 PSH = 0x08, /* Whole data pipe is pushed */
37 RST = 0x04, /* Reset connection */
38 SYN = 0x02, /* Pkt. is synchronise */
39 FIN = 0x01, /* Start close down */
44 MSS_LENGTH = 4, /* Length in bytes of the maximum segment size option */
46 WS_LENGTH = 3, /* Length in bytes of the window scale option */
48 MSPTICK = 50, /* Milliseconds per timer tick */
49 DEF_MSS = 1460, /* Default maximum segment */
50 DEF_MSS6 = 1280, /* Default maximum segment (min) for v6 */
51 DEF_RTT = 500, /* Default round trip */
52 DEF_KAT = 120000, /* Default time (ms) between keep alives */
53 TCP_LISTEN = 0, /* Listen connection */
54 TCP_CONNECT = 1, /* Outgoing connection */
55 SYNACK_RXTIMER = 250, /* ms between SYNACK retransmits */
57 TCPREXMTTHRESH = 3, /* dupack threshold for rxt */
68 Closed = 0, /* Connection states */
80 Maxlimbo = 1000, /* maximum procs waiting for response to SYN ACK */
81 NLHT = 256, /* hash table size, must be a power of 2 */
87 /* Must correspond to the enumeration above */
90 "Closed", "Listen", "Syn_sent", "Syn_received",
91 "Established", "Finwait1", "Finwait2", "Close_wait",
92 "Closing", "Last_ack", "Time_wait"
95 typedef struct Tcptimer Tcptimer;
109 * v4 and v6 pseudo headers used for
112 typedef struct Tcp4hdr Tcp4hdr;
115 uchar vihl; /* Version and header length */
116 uchar tos; /* Type of service */
117 uchar length[2]; /* packet length */
118 uchar id[2]; /* Identification */
119 uchar frag[2]; /* Fragment information */
133 /* Options segment */
137 typedef struct Tcp6hdr Tcp6hdr;
144 uchar tcpsrc[IPaddrlen];
145 uchar tcpdst[IPaddrlen];
154 /* Options segment */
159 * this represents the control info
160 * for a single packet. It is derived from
161 * a packet in ntohtcp{4,6}() and stuck into
162 * a packet in htontcp{4,6}().
164 typedef struct Tcp Tcp;
172 ushort ws; /* window scale option (if not zero) */
175 ushort mss; /* max segment size option (if not zero) */
176 ushort len; /* size of data */
180 * this header is malloc'd to thread together fragments
181 * waiting to be coalesced
183 typedef struct Reseq Reseq;
193 * the qlock in the Conv locks this structure
/*
 * Per-conversation TCP control block.  The rest of the file reaches it
 * as (Tcpctl*)conv->ptcl.
 */
195 typedef struct Tcpctl Tcpctl;
198 uchar state; /* Connection state */
199 uchar type; /* Listening or active connection */
200 uchar code; /* Icmp code */
/* send-side sequence state; the code refers to these as tcb->snd.* */
202 ulong una; /* Unacked data pointer */
203 ulong nxt; /* Next sequence expected */
204 ulong ptr; /* Data pointer */
205 ulong wnd; /* Tcp send window */
206 ulong urg; /* Urgent data pointer */
208 int scale; /* how much to right shift window in xmitted packets */
209 /* to implement tahoe and reno TCP */
210 ulong dupacks; /* number of duplicate acks rcvd */
211 int recovery; /* loss recovery flag */
212 ulong rxt; /* right window marker for recovery */
/* receive-side sequence state; the code refers to these as tcb->rcv.* */
215 ulong nxt; /* Receive pointer to next uchar slot */
216 ulong wnd; /* Receive window incoming */
217 ulong urg; /* Urgent pointer */
219 int una; /* unacked data segs */
220 int scale; /* how much to left shift window in rcved packets */
222 ulong iss; /* Initial sequence number */
223 int sawwsopt; /* true if we saw a wsopt on the incoming SYN */
224 ulong cwind; /* Congestion window */
225 int scale; /* desired snd.scale */
226 ushort ssthresh; /* Slow start threshold */
227 int resent; /* Bytes just resent */
/* NOTE(review): irs is int here but ulong in Limbo - confirm this is intended */
228 int irs; /* Initial received sequence */
229 ushort mss; /* Maximum segment size */
230 int rerecv; /* Overlap of data rereceived */
231 ulong window; /* Receive window */
232 uchar backoff; /* Exponential backoff counter */
233 int backedoff; /* ms we've backed off for rexmits */
234 uchar flags; /* State flags */
235 Reseq *reseq; /* Resequencing queue */
236 Tcptimer timer; /* Activity timer */
237 Tcptimer acktimer; /* Acknowledge timer */
238 Tcptimer rtt_timer; /* Round trip timer */
239 Tcptimer katimer; /* keep alive timer */
240 ulong rttseq; /* Round trip sequence */
241 int srtt; /* Smoothed round trip, stored shifted left by LOGAGAIN */
242 int mdev; /* Mean deviation of round trip, stored shifted left by LOGDGAIN */
243 int kacounter; /* count down for keep alive */
244 uint sndsyntime; /* time syn sent */
245 ulong time; /* time Finwait2 or Syn_received was sent */
246 int nochecksum; /* non-zero means don't send checksums */
247 int flgcnt; /* number of flags in the sequence (FIN,SEQ) */
252 } protohdr; /* prototype header; its tcp4hdr or tcp6hdr member is filled in by inittcpctl and tcpincoming */
256 * New calls are put in limbo rather than having a conversation structure
257 * allocated. Thus, a SYN attack results in lots of limbo'd calls but not
258 * any real Conv structures mucking things up. Calls in limbo rexmit their
259 * SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
261 * In particular they aren't on a listener's queue so that they don't figure
262 * in the input queue limit.
264 * If 1/2 of a T3 was attacking SYN packets, we'd have a permanent queue
265 * of 70000 limbo'd calls. Not great for a linear list but doable. Therefore
266 * there is no hashing of this list.
/*
 * One incoming call waiting for the ACK of our SYN ACK.  Entries are
 * allocated and filled in by limbo() and looked up again by
 * limborst() and tcpincoming().
 */
268 typedef struct Limbo Limbo;
/* limbo() stores the destination of the incoming SYN in laddr and its source in raddr */
273 uchar laddr[IPaddrlen];
274 uchar raddr[IPaddrlen];
277 ulong irs; /* initial received sequence */
278 ulong iss; /* initial sent sequence */
279 ushort mss; /* mss from the other end */
280 ushort rcvscale; /* how much to scale rcvd windows */
281 ushort sndscale; /* how much to scale sent windows */
282 ulong lastsend; /* last time we sent a synack */
283 uchar version; /* v4 or v6 */
284 uchar rexmits; /* number of retransmissions */
287 int tcp_irtt = DEF_RTT; /* Initial guess at round trip time */
288 ushort tcp_mss = DEF_MSS; /* Maximum segment size to be sent */
314 static char *statnames[] =
318 [ActiveOpens] "ActiveOpens",
319 [PassiveOpens] "PassiveOpens",
320 [EstabResets] "EstabResets",
321 [CurrEstab] "CurrEstab",
324 [RetransSegs] "RetransSegs",
325 [RetransTimeouts] "RetransTimeouts",
328 [CsumErrs] "CsumErrs",
329 [HlenErrs] "HlenErrs",
331 [OutOfOrder] "OutOfOrder",
334 typedef struct Tcppriv Tcppriv;
337 /* List of active timers */
341 /* hash table for matching conversations */
344 /* calls in limbo waiting for an ACK to our SYN ACK */
348 /* for keeping track of tcpackproc */
352 uvlong stats[Nstats];
356 * Setting tcpporthogdefense to non-zero enables Dong Lin's
357 * solution to hijacked systems staking out ports as a form
360 * To avoid stateless Conv hogs, we pick a sequence number at random. If
361 * that number gets acked by the other end, we shut down the connection.
362 * Look for tcpporthogdefense in the code.
364 int tcpporthogdefense = 0;
366 int addreseq(Tcpctl*, Tcppriv*, Tcp*, Block*, ushort);
367 void getreseq(Tcpctl*, Tcp*, Block**, ushort*);
368 void localclose(Conv*, char*);
369 void procsyn(Conv*, Tcp*);
370 void tcpiput(Proto*, Ipifc*, Block*);
371 void tcpoutput(Conv*);
372 int tcptrim(Tcpctl*, Tcp*, Block**, ushort*);
373 void tcpstart(Conv*, int);
374 void tcptimeout(void*);
375 void tcpsndsyn(Conv*, Tcpctl*);
376 void tcprcvwin(Conv*);
377 void tcpacktimer(void*);
378 void tcpkeepalive(void*);
379 void tcpsetkacounter(Tcpctl*);
380 void tcprxmit(Conv*);
381 void tcpsettimer(Tcpctl*);
382 void tcpsynackrtt(Conv*);
383 void tcpsetscale(Conv*, Tcpctl*, ushort, ushort);
385 static void limborexmit(Proto*);
386 static void limbo(Conv*, uchar*, uchar*, Tcp*, int);
/*
 * Move conversation s to newstate.  The CurrEstab statistic is
 * decremented when the old state was Established and incremented
 * when the new state is Established.
 */
389 tcpsetstate(Conv *s, uchar newstate)
397 tcb = (Tcpctl*)s->ptcl;
399 oldstate = tcb->state;
400 if(oldstate == newstate)
403 if(oldstate == Established)
404 tpriv->stats[CurrEstab]--;
405 if(newstate == Established)
406 tpriv->stats[CurrEstab]++;
409 print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport,
410 tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab );
420 case Close_wait: /* Remote closes */
425 tcb->state = newstate;
427 if(oldstate == Syn_sent && newstate != Closed)
/*
 * Connect request on conversation c: tests that the control block is
 * in the Closed state, passes argv/argc to Fsstdconnect and then
 * starts the connection with tcpstart(c, TCP_CONNECT).
 */
432 tcpconnect(Conv *c, char **argv, int argc)
437 tcb = (Tcpctl*)(c->ptcl);
438 if(tcb->state != Closed)
441 e = Fsstdconnect(c, argv, argc);
444 tcpstart(c, TCP_CONNECT);
/*
 * Format a one-line status report for conversation c into state
 * (at most n bytes) and return the result of snprint.  Queue lengths
 * are reported as 0 when the corresponding queue pointer is nil.
 */
450 tcpstate(Conv *c, char *state, int n)
454 s = (Tcpctl*)(c->ptcl);
456 return snprint(state, n,
457 "%s qin %d qout %d srtt %d mdev %d cwin %lud swin %lud>>%d rwin %lud>>%d timer.start %d timer.count %d rerecv %d katimer.start %d katimer.count %d\n",
459 c->rq ? qlen(c->rq) : 0,
460 c->wq ? qlen(c->wq) : 0,
/*
 * swin is printed with rcv.scale and rwin with snd.scale: rcv.scale
 * is what tcpiput applies to windows received from the peer, and
 * snd.scale is what htontcp4/htontcp6 apply to windows we transmit.
 */
462 s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd, s->snd.scale,
463 s->timer.start, s->timer.count, s->rerecv,
464 s->katimer.start, s->katimer.count);
472 s = (Tcpctl*)(c->ptcl);
473 return s->state != Closed;
/*
 * Announce request on conversation c: tests that the control block is
 * in the Closed state, passes argv/argc to Fsstdannounce and then
 * starts listening with tcpstart(c, TCP_LISTEN).
 */
477 tcpannounce(Conv *c, char **argv, int argc)
482 tcb = (Tcpctl*)(c->ptcl);
483 if(tcb->state != Closed)
486 e = Fsstdannounce(c, argv, argc);
489 tcpstart(c, TCP_LISTEN);
496 * tcpclose is always called with the q locked
503 tcb = (Tcpctl*)c->ptcl;
513 * reset any incoming calls to this listener
515 Fsconnected(c, "Hangup");
527 tcpsetstate(c, Finwait1);
533 tcpsetstate(c, Last_ack);
545 tcb = (Tcpctl*)s->ptcl;
565 localclose(s, "Hangup");
/*
 * Work out the receive window from the room left in the read queue:
 * tcb->window less the number of bytes currently queued on s->rq.
 * The log message now carries the correct function name (it used to
 * read "tcprcvwim").
 */
574 tcprcvwin(Conv *s) /* Call with tcb locked */
579 tcb = (Tcpctl*)s->ptcl;
580 w = tcb->window - qlen(s->rq);
584 netlog(s->p->f, Logtcp, "tcprcvwin: window %lud qlen %d\n", tcb->window, qlen(s->rq));
587 tcb->rcv.blocked = 1;
597 tcb = (Tcpctl*)s->ptcl;
604 if(tcb->state != Closed){
616 c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c);
617 c->wq = qopen((3*QMAX)/2, Qkick, tcpkick, c);
/*
 * Switch timer t to newstate while keeping priv->timers, a list
 * linked through t->next and t->prev, consistent.  A timer that is
 * TcptimerON is unlinked when it moves to any other state; a timer
 * that is not yet ON is linked in front of priv->timers when it is
 * turned ON.  The panics are consistency checks on the list links.
 */
621 timerstate(Tcppriv *priv, Tcptimer *t, int newstate)
623 if(newstate != TcptimerON){
624 if(t->state == TcptimerON){
626 if(priv->timers == t){
627 priv->timers = t->next;
629 panic("timerstate1");
632 t->next->prev = t->prev;
634 t->prev->next = t->next;
635 t->next = t->prev = nil;
638 if(t->state != TcptimerON){
640 if(t->prev != nil || t->next != nil)
641 panic("timerstate2");
643 t->next = priv->timers;
655 Tcptimer *t, *tp, *timeo;
664 tsleep(&up->sleep, return0, 0, MSPTICK);
669 for(t = priv->timers; t != nil; t = tp) {
671 panic("tcpackproc1");
673 if(t->state == TcptimerON) {
676 timerstate(priv, t, TcptimerDONE);
677 t->readynext = timeo;
685 for(t = timeo; t != nil; t = t->readynext) {
687 panic("tcpackproc2");
688 if(t->state == TcptimerDONE && t->func != nil && !waserror()){
/*
 * Start timer t.  A nil timer or one whose start value is 0 is tested
 * for first; otherwise the timer is switched to TcptimerON.
 */
699 tcpgo(Tcppriv *priv, Tcptimer *t)
701 if(t == nil || t->start == 0)
706 timerstate(priv, t, TcptimerON);
/* Stop timer t by switching it to TcptimerOFF. */
711 tcphalt(Tcppriv *priv, Tcptimer *t)
717 timerstate(priv, t, TcptimerOFF);
/*
 * Tear down conversation s locally: remove it from the conversation
 * hash table, halt its four timers, walk the resequencing queue,
 * hang up the read and write queues with reason and finally enter
 * the Closed state.  A conversation still in Syn_sent has reason
 * passed to Fsconnected.
 */
728 localclose(Conv *s, char *reason) /* called with tcb locked */
735 tcb = (Tcpctl*)s->ptcl;
737 iphtrem(&tpriv->ht, s);
739 tcphalt(tpriv, &tcb->timer);
740 tcphalt(tpriv, &tcb->rtt_timer);
741 tcphalt(tpriv, &tcb->acktimer);
742 tcphalt(tpriv, &tcb->katimer);
744 /* Flush reassembly queue; nothing more can arrive */
745 for(rp = tcb->reseq; rp != nil; rp = rp1) {
752 if(tcb->state == Syn_sent)
753 Fsconnected(s, reason);
754 if(s->state == Announced)
757 qhangup(s->rq, reason);
758 qhangup(s->wq, reason);
760 tcpsetstate(s, Closed);
763 /* mtu (- TCP + IP hdr len) of 1st hop */
/*
 * Looks up the interface for addr with findipifc and computes the
 * usable segment size as ifc->maxtu less the medium header size and
 * the combined IP and TCP header sizes for the given IP version.
 * NOTE(review): the ifc->mbps tests presumably pick the value stored
 * through scale; the assignments are not visible here - confirm.
 */
765 tcpmtu(Proto *tcp, uchar *addr, int version, int *scale)
770 ifc = findipifc(tcp->f, addr, 0);
776 mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE);
781 mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE);
787 else if(ifc->mbps > 100)
789 else if(ifc->mbps > 10)
/*
 * Initialise the control block of conversation s from scratch: zero
 * it, set the slow start threshold and initial round trip estimate,
 * configure the retransmit, round trip, ack and keep alive timers
 * (intervals are expressed in MSPTICK ticks), and, unless mode is
 * TCP_LISTEN, build the prototype header for the conversation's IP
 * version from its local/remote addresses and ports.
 */
800 inittcpctl(Conv *s, int mode)
808 tcb = (Tcpctl*)s->ptcl;
810 memset(tcb, 0, sizeof(Tcpctl));
812 tcb->ssthresh = 65535;
813 tcb->srtt = tcp_irtt<<LOGAGAIN;
817 tcb->timer.start = tcp_irtt / MSPTICK;
818 tcb->timer.func = tcptimeout;
820 tcb->rtt_timer.start = MAX_TIME;
821 tcb->acktimer.start = TCP_ACK / MSPTICK;
822 tcb->acktimer.func = tcpacktimer;
823 tcb->acktimer.arg = s;
824 tcb->katimer.start = DEF_KAT / MSPTICK;
825 tcb->katimer.func = tcpkeepalive;
826 tcb->katimer.arg = s;
830 /* create a prototype(pseudo) header */
831 if(mode != TCP_LISTEN){
/* no local address chosen yet: let findlocalip pick one for this remote address */
832 if(ipcmp(s->laddr, IPnoaddr) == 0)
833 findlocalip(s->p->f, s->laddr, s->raddr);
835 switch(s->ipversion){
837 h4 = &tcb->protohdr.tcp4hdr;
838 memset(h4, 0, sizeof(*h4));
839 h4->proto = IP_TCPPROTO;
840 hnputs(h4->tcpsport, s->lport);
841 hnputs(h4->tcpdport, s->rport);
842 v6tov4(h4->tcpsrc, s->laddr);
843 v6tov4(h4->tcpdst, s->raddr);
846 h6 = &tcb->protohdr.tcp6hdr;
847 memset(h6, 0, sizeof(*h6));
848 h6->proto = IP_TCPPROTO;
849 hnputs(h6->tcpsport, s->lport);
850 hnputs(h6->tcpdport, s->rport);
851 ipmove(h6->tcpsrc, s->laddr);
852 ipmove(h6->tcpdst, s->raddr);
856 panic("inittcpctl: version %d", s->ipversion);
/* the congestion window starts out at one segment */
860 tcb->mss = tcb->cwind = mss;
862 tpriv->stats[Mss] = tcb->mss;
864 /* default is no window scaling */
869 qsetlimit(s->rq, QMAX);
873 * called with s qlocked
/*
 * Start conversation s in the given mode.  The first call also
 * launches the tcpackproc kernel process; ackprocstarted is tested
 * once before and once again around the tpriv->apl lock so that the
 * process is created only once.  The conversation is added to the
 * hash table, the PassiveOpens or ActiveOpens statistic is bumped,
 * and the state becomes Listen or Syn_sent.
 *
 * The kproc name is formatted with snprint bounded by the size of
 * kpname rather than an unbounded sprint.
 */
876 tcpstart(Conv *s, int mode)
880 char kpname[KNAMELEN];
884 if(tpriv->ackprocstarted == 0){
886 if(tpriv->ackprocstarted == 0){
887 snprint(kpname, sizeof kpname, "#I%dtcpack", s->p->f->dev);
888 kproc(kpname, tcpackproc, s->p);
889 tpriv->ackprocstarted = 1;
891 qunlock(&tpriv->apl);
894 tcb = (Tcpctl*)s->ptcl;
898 iphtadd(&tpriv->ht, s);
901 tpriv->stats[PassiveOpens]++;
903 tcpsetstate(s, Listen);
907 tpriv->stats[ActiveOpens]++;
908 tcb->flags |= ACTIVE;
910 tcpsetstate(s, Syn_sent);
919 static char buf[128];
921 sprint(buf, "%d", flag>>10); /* Head len */
/*
 * Build an outgoing TCP/IPv6 packet from the control info in tcph.
 * Header room is made in front of data with padblock; callers in this
 * file also pass nil for data, and the allocb line allocates a fresh
 * block.  The prototype header ph is copied in, the pseudo header is
 * composed for the checksum, the variable TCP fields are filled in
 * (with the MSS option on SYN segments), the checksum is computed
 * unless tcb->nochecksum is set, and finally the pseudo header is
 * turned back into a normal IPv6 header.  tcb may be nil, in which
 * case no window scaling is applied.
 */
939 htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph, Tcpctl *tcb)
944 ushort hdrlen, optpad = 0;
947 hdrlen = TCP6_HDRSIZE;
948 if(tcph->flags & SYN){
950 hdrlen += MSS_LENGTH;
960 dlen = blocklen(data);
961 data = padblock(data, hdrlen + TCP6_PKT);
967 data = allocb(hdrlen + TCP6_PKT + 64); /* the 64 pad is to meet mintu's */
970 data->wp += hdrlen + TCP6_PKT;
973 /* copy in pseudo ip header plus port numbers */
974 h = (Tcp6hdr *)(data->rp);
975 memmove(h, ph, TCP6_TCBPHDRSZ);
977 /* compose pseudo tcp header, do cksum calculation */
978 hnputl(h->vcf, hdrlen + dlen);
979 h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
982 /* copy in variable bits */
983 hnputl(h->tcpseq, tcph->seq);
984 hnputl(h->tcpack, tcph->ack);
/* hdrlen is in bytes; the header length field counts 4-byte words in the top 4 bits, hence <<10 */
985 hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
986 hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
987 hnputs(h->tcpurg, tcph->urg);
989 if(tcph->flags & SYN){
994 hnputs(opt, tcph->mss);
995 // print("our outgoing mss %d\n", tcph->mss);
1007 if(tcb != nil && tcb->nochecksum){
1008 h->tcpcksum[0] = h->tcpcksum[1] = 0;
1010 csum = ptclcsum(data, TCP6_IPLEN, hdrlen+dlen+TCP6_PHDRSIZE);
1011 hnputs(h->tcpcksum, csum);
1014 /* move from pseudo header back to normal ip header */
1015 memset(h->vcf, 0, 4);
1016 h->vcf[0] = IP_VER6;
1017 hnputs(h->ploadlen, hdrlen+dlen);
1018 h->proto = ph->proto;
/*
 * Build an outgoing TCP/IPv4 packet from the control info in tcph.
 * Header room is made in front of data with padblock; callers in this
 * file also pass nil for data, and the allocb line allocates a fresh
 * block.  The prototype pseudo header ph is copied in, the variable
 * TCP fields are filled in (SYN segments carry the MSS option and may
 * carry the window scale option, with the header padded to a multiple
 * of 4 bytes), and the checksum is computed unless tcb->nochecksum is
 * set.  tcb may be nil, in which case no window scaling is applied.
 */
1024 htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph, Tcpctl *tcb)
1029 ushort hdrlen, optpad = 0;
1032 hdrlen = TCP4_HDRSIZE;
1033 if(tcph->flags & SYN){
1035 hdrlen += MSS_LENGTH;
1037 hdrlen += WS_LENGTH;
1038 optpad = hdrlen & 3;
1040 optpad = 4 - optpad;
1045 dlen = blocklen(data);
1046 data = padblock(data, hdrlen + TCP4_PKT);
1052 data = allocb(hdrlen + TCP4_PKT + 64); /* the 64 pad is to meet mintu's */
1055 data->wp += hdrlen + TCP4_PKT;
1058 /* copy in pseudo ip header plus port numbers */
1059 h = (Tcp4hdr *)(data->rp);
1060 memmove(h, ph, TCP4_TCBPHDRSZ);
1062 /* copy in variable bits */
1063 hnputs(h->tcplen, hdrlen + dlen);
1064 hnputl(h->tcpseq, tcph->seq);
1065 hnputl(h->tcpack, tcph->ack);
/* hdrlen is in bytes; the header length field counts 4-byte words in the top 4 bits, hence <<10 */
1066 hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
1067 hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
1068 hnputs(h->tcpurg, tcph->urg);
1070 if(tcph->flags & SYN){
1074 *opt++ = MSS_LENGTH;
1075 hnputs(opt, tcph->mss);
1087 if(tcb != nil && tcb->nochecksum){
1088 h->tcpcksum[0] = h->tcpcksum[1] = 0;
1090 csum = ptclcsum(data, TCP4_IPLEN, hdrlen+dlen+TCP4_PHDRSIZE);
1091 hnputs(h->tcpcksum, csum);
/*
 * Decode the TCP/IPv6 header at the front of *bpp into tcph: ports,
 * sequence and ack numbers, flags, window, urgent pointer and data
 * length, followed by the MSS and window scale options when present.
 * *bpp may be replaced by pullupblock.  tcpiput stores the return
 * value as the header length.
 */
1098 ntohtcp6(Tcp *tcph, Block **bpp)
1106 *bpp = pullupblock(*bpp, TCP6_PKT+TCP6_HDRSIZE);
1110 h = (Tcp6hdr *)((*bpp)->rp);
1111 tcph->source = nhgets(h->tcpsport);
1112 tcph->dest = nhgets(h->tcpdport);
1113 tcph->seq = nhgetl(h->tcpseq);
1114 tcph->ack = nhgetl(h->tcpack);
/* top 4 bits of the first flag byte count 4-byte words; >>2 and masking yields bytes */
1115 hdrlen = (h->tcpflag[0]>>2) & ~3;
1116 if(hdrlen < TCP6_HDRSIZE) {
1121 tcph->flags = h->tcpflag[1];
1122 tcph->wnd = nhgets(h->tcpwin);
1123 tcph->urg = nhgets(h->tcpurg);
1126 tcph->len = nhgets(h->ploadlen) - hdrlen;
1128 *bpp = pullupblock(*bpp, hdrlen+TCP6_PKT);
/* n counts the option bytes that follow the fixed header */
1133 n = hdrlen - TCP6_HDRSIZE;
1134 while(n > 0 && *optr != EOLOPT) {
1135 if(*optr == NOOPOPT) {
1141 if(optlen < 2 || optlen > n)
1145 if(optlen == MSS_LENGTH)
1146 tcph->mss = nhgets(optr+2);
1149 if(optlen == WS_LENGTH && *(optr+2) <= 14)
1150 tcph->ws = HaveWS | *(optr+2);
/*
 * Decode the TCP/IPv4 header at the front of *bpp into tcph: ports,
 * sequence and ack numbers, flags, window, urgent pointer and data
 * length, followed by the MSS and window scale options when present.
 * *bpp may be replaced by pullupblock.  tcpiput stores the return
 * value as the header length.
 */
1160 ntohtcp4(Tcp *tcph, Block **bpp)
1168 *bpp = pullupblock(*bpp, TCP4_PKT+TCP4_HDRSIZE);
1172 h = (Tcp4hdr *)((*bpp)->rp);
1173 tcph->source = nhgets(h->tcpsport);
1174 tcph->dest = nhgets(h->tcpdport);
1175 tcph->seq = nhgetl(h->tcpseq);
1176 tcph->ack = nhgetl(h->tcpack);
/* top 4 bits of the first flag byte count 4-byte words; >>2 and masking yields bytes */
1178 hdrlen = (h->tcpflag[0]>>2) & ~3;
1179 if(hdrlen < TCP4_HDRSIZE) {
1184 tcph->flags = h->tcpflag[1];
1185 tcph->wnd = nhgets(h->tcpwin);
1186 tcph->urg = nhgets(h->tcpurg);
1189 tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1191 *bpp = pullupblock(*bpp, hdrlen+TCP4_PKT);
/* n counts the option bytes that follow the fixed header */
1196 n = hdrlen - TCP4_HDRSIZE;
1197 while(n > 0 && *optr != EOLOPT) {
1198 if(*optr == NOOPOPT) {
1204 if(optlen < 2 || optlen > n)
1208 if(optlen == MSS_LENGTH) {
1209 tcph->mss = nhgets(optr+2);
1210 // print("new incoming mss %d\n", tcph->mss);
1214 if(optlen == WS_LENGTH && *(optr+2) <= 14)
1215 tcph->ws = HaveWS | *(optr+2);
1225 * For outgoing calls, generate an initial sequence
1226 * number and put a SYN on the send queue
/*
 * Prepare tcb for sending a SYN: pick a random initial send sequence
 * number from two 16-bit nrand draws, point every send-side sequence
 * variable at it, set the FORCE flag, record the time in sndsyntime
 * and take the desired mss and scale from tcpmtu.
 */
1229 tcpsndsyn(Conv *s, Tcpctl *tcb)
1233 tcb->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1234 tcb->rttseq = tcb->iss;
1235 tcb->snd.wl2 = tcb->iss;
1236 tcb->snd.una = tcb->iss;
1237 tcb->snd.ptr = tcb->rttseq;
1238 tcb->snd.nxt = tcb->rttseq;
1240 tcb->flags |= FORCE;
1241 tcb->sndsyntime = NOW;
1243 /* set desired mss and scale */
1244 tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale);
1246 tpriv->stats[Mss] = tcb->mss;
/*
 * Send a reset in reply to segment seg, which arrived from source
 * addressed to dest.  The pseudo header is built with the addresses
 * and ports swapped so the reply goes back to the sender.  seg itself
 * is rewritten (seq, ack and flags) before being passed to
 * htontcp4/htontcp6, so the caller's copy is modified.  reason is
 * only used for logging.
 */
1250 sndrst(Proto *tcp, uchar *source, uchar *dest, ushort length, Tcp *seg, uchar version, char *reason)
1258 netlog(tcp->f, Logtcp, "sndrst: %s\n", reason);
1262 if(seg->flags & RST)
1265 /* make pseudo header */
1268 memset(&ph4, 0, sizeof(ph4));
1270 v6tov4(ph4.tcpsrc, dest);
1271 v6tov4(ph4.tcpdst, source);
1272 ph4.proto = IP_TCPPROTO;
1273 hnputs(ph4.tcplen, TCP4_HDRSIZE);
1274 hnputs(ph4.tcpsport, seg->dest);
1275 hnputs(ph4.tcpdport, seg->source);
1278 memset(&ph6, 0, sizeof(ph6));
1279 ph6.vcf[0] = IP_VER6;
1280 ipmove(ph6.tcpsrc, dest);
1281 ipmove(ph6.tcpdst, source);
1282 ph6.proto = IP_TCPPROTO;
1283 hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1284 hnputs(ph6.tcpsport, seg->dest);
1285 hnputs(ph6.tcpdport, seg->source);
1288 panic("sndrst: version %d", version);
1291 tpriv->stats[OutRsts]++;
1294 /* convince the other end that this reset is in band */
1295 if(seg->flags & ACK) {
1296 seg->seq = seg->ack;
1301 seg->ack = seg->seq;
1303 if(seg->flags & SYN)
1306 if(seg->flags & FIN)
1309 seg->flags = rflags;
1316 hbp = htontcp4(seg, nil, &ph4, nil);
1319 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1322 hbp = htontcp6(seg, nil, &ph6, nil);
1325 ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1328 panic("sndrst2: version %d", version);
1333 * send a reset to the remote side and close the conversation
1334 * called with s qlocked
1343 tcb = (Tcpctl*)s->ptcl;
1345 return commonerror();
1346 if(ipcmp(s->raddr, IPnoaddr) != 0) {
1348 memset(&seg, 0, sizeof seg);
1349 seg.flags = RST | ACK;
1350 seg.ack = tcb->rcv.nxt;
1352 seg.seq = tcb->snd.ptr;
1357 switch(s->ipversion) {
1359 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
1360 hbp = htontcp4(&seg, nil, &tcb->protohdr.tcp4hdr, tcb);
1361 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
1364 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
1365 hbp = htontcp6(&seg, nil, &tcb->protohdr.tcp6hdr, tcb);
1366 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
1369 panic("tcphangup: version %d", s->ipversion);
1380 * (re)send a SYN ACK
/*
 * Send (or resend) the SYN ACK for the limbo entry lp.  A pseudo
 * header is built from the addresses and ports stored in lp, the
 * segment acknowledges lp->irs+1 with SYN|ACK set, and the advertised
 * mss comes from tcpmtu for the local address.  Callers treat a
 * negative return value as failure.
 *
 * The two panic messages now name this function; they used to read
 * "sndrst" and "sndsnack", which pointed at the wrong code.
 */
1383 sndsynack(Proto *tcp, Limbo *lp)
1391 /* make pseudo header */
1392 switch(lp->version) {
1394 memset(&ph4, 0, sizeof(ph4));
1396 v6tov4(ph4.tcpsrc, lp->laddr);
1397 v6tov4(ph4.tcpdst, lp->raddr);
1398 ph4.proto = IP_TCPPROTO;
1399 hnputs(ph4.tcplen, TCP4_HDRSIZE);
1400 hnputs(ph4.tcpsport, lp->lport);
1401 hnputs(ph4.tcpdport, lp->rport);
1404 memset(&ph6, 0, sizeof(ph6));
1405 ph6.vcf[0] = IP_VER6;
1406 ipmove(ph6.tcpsrc, lp->laddr);
1407 ipmove(ph6.tcpdst, lp->raddr);
1408 ph6.proto = IP_TCPPROTO;
1409 hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1410 hnputs(ph6.tcpsport, lp->lport);
1411 hnputs(ph6.tcpdport, lp->rport);
1414 panic("sndsynack: version %d", lp->version);
1417 memset(&seg, 0, sizeof seg);
1419 seg.ack = lp->irs+1;
1420 seg.flags = SYN|ACK;
1422 seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale);
1423 // if (seg.mss > lp->mss && lp->mss >= 512)
1424 // seg.mss = lp->mss;
1427 /* if the other side set scale, we should too */
1430 lp->sndscale = scale;
1436 switch(lp->version) {
1438 hbp = htontcp4(&seg, nil, &ph4, nil);
1441 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1444 hbp = htontcp6(&seg, nil, &ph6, nil);
1447 ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1450 panic("sndsynack2: version %d", lp->version);
/*
 * Hash an address/port pair into the limbo hash table: the last two
 * bytes of address a plus port p, masked with LHTMASK.  p is
 * parenthesized so that an argument containing a lower-precedence
 * operator still hashes as intended.
 */
1456 #define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + (p) )&LHTMASK )
1459 * put a call into limbo and respond with a SYN ACK
1461 * called with proto locked
/*
 * Handle a new SYN seg arriving at listener s from source for dest.
 * The limbo hash chain for the sender is searched for an entry with
 * the same ports, IP version and addresses.  A new entry is allocated
 * with malloc and filled in from the segment, with a random initial
 * send sequence number; when the table already holds Maxlimbo entries
 * and this chain is not empty, the head of the chain is unlinked
 * first.  A SYN ACK is then sent with sndsynack.
 */
1464 limbo(Conv *s, uchar *source, uchar *dest, Tcp *seg, int version)
1471 h = hashipa(source, seg->source);
1473 for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1475 if(lp->lport != seg->dest || lp->rport != seg->source || lp->version != version)
1477 if(ipcmp(lp->raddr, source) != 0)
1479 if(ipcmp(lp->laddr, dest) != 0)
1482 /* each new SYN restarts the retransmits */
1488 if(tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]){
1490 tpriv->lht[h] = lp->next;
1493 lp = malloc(sizeof(*lp));
1499 lp->version = version;
1500 ipmove(lp->laddr, dest);
1501 ipmove(lp->raddr, source);
1502 lp->lport = seg->dest;
1503 lp->rport = seg->source;
1505 lp->rcvscale = seg->ws;
1507 lp->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1510 if(sndsynack(s->p, lp) < 0){
1518 * resend SYN ACK's once every SYNACK_RXTIMER ms.
/*
 * Walk every limbo hash chain, stopping early once nlimbo entries
 * have been seen.  An entry is considered only after
 * (rexmits+1)*SYNACK_RXTIMER ms have passed since its last send, so
 * the spacing grows with each retransmission.  Entries whose
 * retransmit count exceeds 5 are timed out; otherwise the SYN ACK is
 * resent unless more than 100 calls are in limbo.
 */
1521 limborexmit(Proto *tcp)
1535 for(h = 0; h < NLHT && seen < tpriv->nlimbo; h++){
1536 for(l = &tpriv->lht[h]; *l != nil && seen < tpriv->nlimbo; ){
1539 if(now - lp->lastsend < (lp->rexmits+1)*SYNACK_RXTIMER)
/* NOTE(review): with the growing spacing above and the limit of 5 below, confirm the 1 second figure */
1542 /* time it out after 1 second */
1543 if(++(lp->rexmits) > 5){
1550 /* if we're being attacked, don't bother resending SYN ACK's */
1551 if(tpriv->nlimbo > 100)
1554 if(sndsynack(tcp, lp) < 0){
1568 * lookup call in limbo. if found, throw it out.
1570 * called with proto locked
/*
 * A RST arrived at listener s: search the limbo hash chain for the
 * entry with matching ports, IP version and addresses.  The entry is
 * acted on only when the segment's sequence number is irs+1, that is,
 * when the RST directly follows the SYN that created it.
 */
1573 limborst(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1581 /* find a call in limbo */
1582 h = hashipa(src, segp->source);
1583 for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1585 if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1587 if(ipcmp(lp->laddr, dst) != 0)
1589 if(ipcmp(lp->raddr, src) != 0)
1592 /* RST can only follow the SYN */
1593 if(segp->seq == lp->irs+1){
1603 * come here when we finally get an ACK to our SYN-ACK.
1604 * lookup call in limbo. if found, create a new conversation
1606 * called with proto locked
/*
 * Try to turn a call in limbo into a real conversation.  Only a
 * segment with ACK set and SYN clear is considered.  The limbo chain
 * for the sender is searched for an entry with matching ports, IP
 * version and addresses whose sequence numbers agree with the segment
 * (seq == irs+1 and ack == iss+1).  On a match a new conversation is
 * created with Fsnewcall, the listener's control block is copied into
 * it, its timers are pointed at the new conversation and switched
 * off, the sequence state, mss, window scaling, congestion window and
 * prototype header are set up, and the conversation is marked
 * Established and added to the hash table.  tcpiput uses the return
 * value as the conversation to continue with.
 */
1609 tcpincoming(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1619 /* unless it's just an ack, it can't be someone coming out of limbo */
1620 if((segp->flags & SYN) || (segp->flags & ACK) == 0)
1625 /* find a call in limbo */
1626 h = hashipa(src, segp->source);
1627 for(l = &tpriv->lht[h]; (lp = *l) != nil; l = &lp->next){
1628 netlog(s->p->f, Logtcp, "tcpincoming s %I!%ud/%I!%ud d %I!%ud/%I!%ud v %d/%d\n",
1629 src, segp->source, lp->raddr, lp->rport,
1630 dst, segp->dest, lp->laddr, lp->lport,
1631 version, lp->version
1634 if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1636 if(ipcmp(lp->laddr, dst) != 0)
1638 if(ipcmp(lp->raddr, src) != 0)
1641 /* we're assuming no data with the initial SYN */
1642 if(segp->seq != lp->irs+1 || segp->ack != lp->iss+1){
1643 netlog(s->p->f, Logtcp, "tcpincoming s %lux/%lux a %lux %lux\n",
1644 segp->seq, lp->irs+1, segp->ack, lp->iss+1);
1655 new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
/* start from a copy of the listener's control block, then detach its timers */
1659 memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
1660 tcb = (Tcpctl*)new->ptcl;
1661 tcb->flags &= ~CLONE;
1662 tcb->timer.arg = new;
1663 tcb->timer.state = TcptimerOFF;
1664 tcb->acktimer.arg = new;
1665 tcb->acktimer.state = TcptimerOFF;
1666 tcb->katimer.arg = new;
1667 tcb->katimer.state = TcptimerOFF;
1668 tcb->rtt_timer.arg = new;
1669 tcb->rtt_timer.state = TcptimerOFF;
1672 tcb->rcv.nxt = tcb->irs+1;
1673 tcb->rcv.urg = tcb->rcv.nxt;
/* our SYN occupied sequence number iss and has been acked, so sending resumes at iss+1 */
1676 tcb->rttseq = tcb->iss;
1677 tcb->snd.wl2 = tcb->iss;
1678 tcb->snd.una = tcb->iss+1;
1679 tcb->snd.ptr = tcb->iss+1;
1680 tcb->snd.nxt = tcb->iss+1;
1682 tcb->flags |= SYNACK;
1684 /* our sending max segment size cannot be bigger than what he asked for */
1685 if(lp->mss != 0 && lp->mss < tcb->mss) {
1687 tpriv->stats[Mss] = tcb->mss;
1690 /* window scaling */
1691 tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
1693 /* the congestion window always starts out as a single segment */
1694 tcb->snd.wnd = segp->wnd;
1695 tcb->cwind = tcb->mss;
1697 /* set initial round trip time */
1698 tcb->sndsyntime = lp->lastsend+lp->rexmits*SYNACK_RXTIMER;
1703 /* set up proto header */
1706 h4 = &tcb->protohdr.tcp4hdr;
1707 memset(h4, 0, sizeof(*h4));
1708 h4->proto = IP_TCPPROTO;
1709 hnputs(h4->tcpsport, new->lport);
1710 hnputs(h4->tcpdport, new->rport);
1711 v6tov4(h4->tcpsrc, dst);
1712 v6tov4(h4->tcpdst, src);
1715 h6 = &tcb->protohdr.tcp6hdr;
1716 memset(h6, 0, sizeof(*h6));
1717 h6->proto = IP_TCPPROTO;
1718 hnputs(h6->tcpsport, new->lport);
1719 hnputs(h6->tcpdport, new->rport);
1720 ipmove(h6->tcpsrc, dst);
1721 ipmove(h6->tcpdst, src);
1724 panic("tcpincoming: version %d", new->ipversion);
1727 tcpsetstate(new, Established);
1729 iphtadd(&tpriv->ht, new);
/*
 * Test whether sequence number x lies between low and high.  Two
 * comparisons are used: a plain range test, and an either-side test
 * for a range that wraps past the top of the sequence space.
 * NOTE(review): the condition selecting between them is not visible
 * here - presumably low <= high; confirm.
 */
1735 seq_within(ulong x, ulong low, ulong high)
1738 if(low <= x && x <= high)
1742 if(x >= low || x <= high)
/* Sequence-space x < y: true when the wrapping difference x-y, viewed as an int, is negative. */
1749 seq_lt(ulong x, ulong y)
1751 return (int)(x-y) < 0;
/* Sequence-space x <= y: true when the wrapping difference x-y, viewed as an int, is not positive. */
1755 seq_le(ulong x, ulong y)
1757 return (int)(x-y) <= 0;
/* Sequence-space x > y: true when the wrapping difference x-y, viewed as an int, is positive. */
1761 seq_gt(ulong x, ulong y)
1763 return (int)(x-y) > 0;
/* Sequence-space x >= y: true when the wrapping difference x-y, viewed as an int, is not negative. */
1767 seq_ge(ulong x, ulong y)
1769 return (int)(x-y) >= 0;
1773 * use the time between the first SYN and its ack as the
1774 * initial round trip time
/*
 * Seed the round trip estimators from the time elapsed since
 * sndsyntime: both srtt and mdev are set to that interval, shifted
 * by LOGAGAIN and LOGDGAIN respectively, and the round trip timer is
 * halted.
 */
1777 tcpsynackrtt(Conv *s)
1783 tcb = (Tcpctl*)s->ptcl;
1786 delta = NOW - tcb->sndsyntime;
1787 tcb->srtt = delta<<LOGAGAIN;
1788 tcb->mdev = delta<<LOGDGAIN;
1790 /* halt round trip timer */
1791 tcphalt(tpriv, &tcb->rtt_timer);
/*
 * Process the acknowledgement carried by seg for conversation s.
 * In the order the visible code runs: count duplicate acks and enter
 * loss recovery after TCPREXMTTHRESH of them, take window updates,
 * leave recovery once the ack reaches snd.rxt, grow the congestion
 * window (never beyond snd.wnd), refresh the round trip estimate when
 * the timed sequence number has been acked and the RETRAN flag is
 * clear, discard the acked bytes from the write queue, advance
 * snd.una/snd.urg/snd.ptr, restart or halt the retransmit timer and
 * clear RETRAN.
 */
1795 update(Conv *s, Tcp *seg)
1804 tcb = (Tcpctl*)s->ptcl;
1806 /* ack is beyond snd.nxt, i.e. for data not yet sent: force output(?) */
1807 if(seq_gt(seg->ack, tcb->snd.nxt)) {
1808 tcb->flags |= FORCE;
1812 /* added by Dong Lin for fast retransmission */
1813 if(seg->ack == tcb->snd.una
1814 && tcb->snd.una != tcb->snd.nxt
1816 && seg->wnd == tcb->snd.wnd) {
1818 /* this is a pure ack w/o window update */
1819 netlog(s->p->f, Logtcprxmt, "dupack %lud ack %lud sndwnd %lud advwin %lud\n",
1820 tcb->snd.dupacks, seg->ack, tcb->snd.wnd, seg->wnd);
1822 if(++tcb->snd.dupacks == TCPREXMTTHRESH) {
1824 * tahoe tcp rxt the packet, half sshthresh,
1825 * and set cwnd to one packet
1827 tcb->snd.recovery = 1;
1828 tcb->snd.rxt = tcb->snd.nxt;
1829 netlog(s->p->f, Logtcprxmt, "fast rxt %lud, nxt %lud\n", tcb->snd.una, tcb->snd.nxt);
1832 /* do reno tcp here. */
1839 if(seq_gt(seg->ack, tcb->snd.wl2)
1840 || (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)){
1841 tcb->snd.wnd = seg->wnd;
1842 tcb->snd.wl2 = seg->ack;
1845 if(!seq_gt(seg->ack, tcb->snd.una)){
1847 * don't let us hangup if sending into a closed window and
1848 * we're still getting acks
1850 if((tcb->flags&RETRAN) && tcb->snd.wnd == 0){
1851 tcb->backedoff = MAXBACKMS/4;
1857 * any positive ack turns off fast rxt,
1858 * (should we do new-reno on partial acks?)
1860 if(!tcb->snd.recovery || seq_ge(seg->ack, tcb->snd.rxt)) {
1861 tcb->snd.dupacks = 0;
1862 tcb->snd.recovery = 0;
1864 netlog(s->p->f, Logtcp, "rxt next %lud, cwin %lud\n", seg->ack, tcb->cwind);
1866 /* Compute the new send window size */
1867 acked = seg->ack - tcb->snd.una;
1869 /* avoid slow start and timers for SYN acks */
1870 if((tcb->flags & SYNACK) == 0) {
1871 tcb->flags |= SYNACK;
1877 /* slow start as long as we're not recovering from lost packets */
1878 if(tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) {
1879 if(tcb->cwind < tcb->ssthresh) {
1885 expand = ((int)tcb->mss * tcb->mss) / tcb->cwind;
1887 if(tcb->cwind + expand < tcb->cwind)
1888 expand = tcb->snd.wnd - tcb->cwind;
1889 if(tcb->cwind + expand > tcb->snd.wnd)
1890 expand = tcb->snd.wnd - tcb->cwind;
1891 tcb->cwind += expand;
1894 /* Adjust the timers according to the round trip time */
1895 if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
1896 tcphalt(tpriv, &tcb->rtt_timer);
1897 if((tcb->flags&RETRAN) == 0) {
1900 rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
1902 rtt = 1; /* otherwise all close systems will rexmit in 0 time */
1904 if(tcb->srtt == 0) {
1905 tcb->srtt = rtt << LOGAGAIN;
1906 tcb->mdev = rtt << LOGDGAIN;
1908 delta = rtt - (tcb->srtt>>LOGAGAIN);
1913 delta = abs(delta) - (tcb->mdev>>LOGDGAIN);
1923 if(qdiscard(s->wq, acked) < acked)
1926 tcb->snd.una = seg->ack;
1927 if(seq_gt(seg->ack, tcb->snd.urg))
1928 tcb->snd.urg = seg->ack;
1930 if(tcb->snd.una != tcb->snd.nxt)
1931 tcpgo(tpriv, &tcb->timer);
1933 tcphalt(tpriv, &tcb->timer);
1935 if(seq_lt(tcb->snd.ptr, tcb->snd.una))
1936 tcb->snd.ptr = tcb->snd.una;
1938 tcb->flags &= ~RETRAN;
1944 tcpiput(Proto *tcp, Ipifc*, Block *bp)
1951 ushort length, csum;
1952 uchar source[IPaddrlen], dest[IPaddrlen];
1961 tpriv->stats[InSegs]++;
1963 h4 = (Tcp4hdr*)(bp->rp);
1964 h6 = (Tcp6hdr*)(bp->rp);
1965 memset(&seg, 0, sizeof seg);
1967 if((h4->vihl&0xF0)==IP_VER4) {
1969 length = nhgets(h4->length);
1970 v4tov6(dest, h4->tcpdst);
1971 v4tov6(source, h4->tcpsrc);
1974 hnputs(h4->tcplen, length-TCP4_PKT);
1975 if(!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
1976 ptclcsum(bp, TCP4_IPLEN, length-TCP4_IPLEN)) {
1977 tpriv->stats[CsumErrs]++;
1978 tpriv->stats[InErrs]++;
1979 netlog(f, Logtcp, "bad tcp proto cksum\n");
1984 hdrlen = ntohtcp4(&seg, &bp);
1986 tpriv->stats[HlenErrs]++;
1987 tpriv->stats[InErrs]++;
1988 netlog(f, Logtcp, "bad tcp hdr len\n");
1992 /* trim the packet to the size claimed by the datagram */
1993 length -= hdrlen+TCP4_PKT;
1994 bp = trimblock(bp, hdrlen+TCP4_PKT, length);
1996 tpriv->stats[LenErrs]++;
1997 tpriv->stats[InErrs]++;
1998 netlog(f, Logtcp, "tcp len < 0 after trim\n");
2004 int proto = h6->proto;
2007 length = nhgets(h6->ploadlen);
2008 ipmove(dest, h6->tcpdst);
2009 ipmove(source, h6->tcpsrc);
2011 h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
2013 hnputl(h6->vcf, length);
2014 if((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
2015 (csum = ptclcsum(bp, TCP6_IPLEN, length+TCP6_PHDRSIZE)) != 0) {
2016 tpriv->stats[CsumErrs]++;
2017 tpriv->stats[InErrs]++;
2019 "bad tcpv6 proto cksum: got %#ux, computed %#ux\n",
2020 h6->tcpcksum[0]<<8 | h6->tcpcksum[1], csum);
2026 hnputs(h6->ploadlen, length);
2028 hdrlen = ntohtcp6(&seg, &bp);
2030 tpriv->stats[HlenErrs]++;
2031 tpriv->stats[InErrs]++;
2032 netlog(f, Logtcp, "bad tcpv6 hdr len\n");
2036 /* trim the packet to the size claimed by the datagram */
2038 bp = trimblock(bp, hdrlen+TCP6_PKT, length);
2040 tpriv->stats[LenErrs]++;
2041 tpriv->stats[InErrs]++;
2042 netlog(f, Logtcp, "tcpv6 len < 0 after trim\n");
2047 /* lock protocol while searching for a conversation */
2050 /* Look for a matching conversation */
2051 s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2053 netlog(f, Logtcp, "iphtlook(src %I!%d, dst %I!%d) failed\n",
2054 source, seg.source, dest, seg.dest);
2057 sndrst(tcp, source, dest, length, &seg, version, "no conversation");
2062 /* if it's a listener, look for the right flags and get a new conv */
2063 tcb = (Tcpctl*)s->ptcl;
2064 if(tcb->state == Listen){
2065 if(seg.flags & RST){
2066 limborst(s, &seg, source, dest, version);
2072 /* if this is a new SYN, put the call into limbo */
2073 if((seg.flags & SYN) && (seg.flags & ACK) == 0){
2074 limbo(s, source, dest, &seg, version);
2081 * if there's a matching call in limbo, tcpincoming will
2082 * return it in state Syn_received
2084 s = tcpincoming(s, &seg, source, dest, version);
2089 /* The rest of the input state machine is run with the control block
2090 * locked and implements the state machine directly out of the RFC.
2091 * Out-of-band data is ignored - it was always a bad idea.
2093 tcb = (Tcpctl*)s->ptcl;
2102 seg.wnd <<= tcb->rcv.scale;
2104 /* every input packet in puts off the keep alive time out */
2105 tcpsetkacounter(tcb);
2107 switch(tcb->state) {
2109 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2112 if(seg.flags & ACK) {
2113 if(!seq_within(seg.ack, tcb->iss+1, tcb->snd.nxt)) {
2114 sndrst(tcp, source, dest, length, &seg, version,
2115 "bad seq in Syn_sent");
2119 if(seg.flags & RST) {
2121 localclose(s, Econrefused);
2125 if(seg.flags & SYN) {
2127 if(seg.flags & ACK){
2130 tcpsetstate(s, Established);
2131 tcpsetscale(s, tcb, seg.ws, tcb->scale);
2135 tcpsetstate(s, Syn_received); /* DLP - shouldn't this be a reset? */
2138 if(length != 0 || (seg.flags & FIN))
2151 /* doesn't matter if it's the correct ack, we're just trying to set timing */
2158 * One DOS attack is to open connections to us and then forget about them,
2159 * thereby tying up a conv at no long term cost to the attacker.
2160 * This is an attempt to defeat these stateless DOS attacks. See
2161 * corresponding code in tcpsendka().
2163 if(tcb->state != Syn_received && (seg.flags & RST) == 0){
2164 if(tcpporthogdefense
2165 && seq_within(seg.ack, tcb->snd.una-(1<<31), tcb->snd.una-(1<<29))){
2166 print("stateless hog %I.%d->%I.%d f %ux %lux - %lux - %lux\n",
2167 source, seg.source, dest, seg.dest, seg.flags,
2168 tcb->snd.una-(1<<31), seg.ack, tcb->snd.una-(1<<29));
2169 localclose(s, "stateless hog");
2173 /* Cut the data to fit the receive window */
2174 if(tcptrim(tcb, &seg, &bp, &length) == -1) {
2175 netlog(f, Logtcp, "tcptrim, not accept, seq %lud-%lud win %lud-%lud from %I\n",
2176 seg.seq, seg.seq + length - 1,
2177 tcb->rcv.nxt, tcb->rcv.nxt + tcb->rcv.wnd-1, s->raddr);
2178 netlog(f, Logtcp, "tcp len < 0, %lud %d\n", seg.seq, length);
2180 if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state == Closing) {
2181 tcphalt(tpriv, &tcb->rtt_timer);
2182 tcphalt(tpriv, &tcb->acktimer);
2183 tcphalt(tpriv, &tcb->katimer);
2184 tcpsetstate(s, Time_wait);
2185 tcb->timer.start = MSL2*(1000 / MSPTICK);
2186 tcpgo(tpriv, &tcb->timer);
2188 if(!(seg.flags & RST)) {
2189 tcb->flags |= FORCE;
2197 /* Cannot accept so answer with a rst */
2198 if(length && tcb->state == Closed) {
2199 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2203 /* The segment is beyond the current receive pointer so
2204 * queue the data in the resequence queue
2206 if(seg.seq != tcb->rcv.nxt)
2207 if(length != 0 || (seg.flags & (SYN|FIN))) {
2209 if(addreseq(tcb, tpriv, &seg, bp, length) < 0)
2210 print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr, s->lport);
2211 tcb->flags |= FORCE;
2216 * keep looping till we've processed this packet plus any
2217 * adjacent packets in the resequence queue
2220 if(seg.flags & RST) {
2221 if(tcb->state == Established) {
2222 tpriv->stats[EstabResets]++;
2223 if(tcb->rcv.nxt != seg.seq)
2224 print("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt %lux seq %lux\n", s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt, seg.seq);
2226 localclose(s, Econrefused);
2230 if((seg.flags&ACK) == 0)
2233 switch(tcb->state) {
2235 if(!seq_within(seg.ack, tcb->snd.una+1, tcb->snd.nxt)){
2236 sndrst(tcp, source, dest, length, &seg, version,
2237 "bad seq in Syn_received");
2241 tcpsetstate(s, Established);
2248 if(qlen(s->wq)+tcb->flgcnt == 0){
2249 tcphalt(tpriv, &tcb->rtt_timer);
2250 tcphalt(tpriv, &tcb->acktimer);
2251 tcpsetkacounter(tcb);
2253 tcpsetstate(s, Finwait2);
2254 tcb->katimer.start = MSL2 * (1000 / MSPTICK);
2255 tcpgo(tpriv, &tcb->katimer);
2263 if(qlen(s->wq)+tcb->flgcnt == 0) {
2264 tcphalt(tpriv, &tcb->rtt_timer);
2265 tcphalt(tpriv, &tcb->acktimer);
2266 tcphalt(tpriv, &tcb->katimer);
2267 tcpsetstate(s, Time_wait);
2268 tcb->timer.start = MSL2*(1000 / MSPTICK);
2269 tcpgo(tpriv, &tcb->timer);
2274 if(qlen(s->wq)+tcb->flgcnt == 0) {
2279 tcb->flags |= FORCE;
2280 if(tcb->timer.state != TcptimerON)
2281 tcpgo(tpriv, &tcb->timer);
2284 if((seg.flags&URG) && seg.urg) {
2285 if(seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
2286 tcb->rcv.urg = seg.urg + seg.seq;
2287 pullblock(&bp, seg.urg);
2291 if(seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
2292 tcb->rcv.urg = tcb->rcv.nxt;
2301 /* Ignore segment text */
2309 /* If we still have some data place on
2315 panic("tcp packblock");
2316 qpassnolim(s->rq, bp);
2320 * Force an ack every 2 data messages. This is
2321 * a hack for rob to make his home system run
2324 * this also keeps the standard TCP congestion
2325 * control working since it needs an ack every
2326 * 2 max segs worth. This is not quite that,
2327 * but under a real stream is equivalent since
2328 * every packet has a max seg in it.
2330 if(++(tcb->rcv.una) >= 2)
2331 tcb->flags |= FORCE;
2333 tcb->rcv.nxt += length;
2336 * update our rcv window
2341 * turn on the acktimer if there's something
2344 if(tcb->acktimer.state != TcptimerON)
2345 tcpgo(tpriv, &tcb->acktimer);
2349 /* no process to read the data, send a reset */
2352 sndrst(tcp, source, dest, length, &seg, version,
2353 "send to Finwait2");
2360 if(seg.flags & FIN) {
2361 tcb->flags |= FORCE;
2363 switch(tcb->state) {
2367 tcpsetstate(s, Close_wait);
2371 if(qlen(s->wq)+tcb->flgcnt == 0) {
2372 tcphalt(tpriv, &tcb->rtt_timer);
2373 tcphalt(tpriv, &tcb->acktimer);
2374 tcphalt(tpriv, &tcb->katimer);
2375 tcpsetstate(s, Time_wait);
2376 tcb->timer.start = MSL2*(1000/MSPTICK);
2377 tcpgo(tpriv, &tcb->timer);
2380 tcpsetstate(s, Closing);
2384 tcphalt(tpriv, &tcb->rtt_timer);
2385 tcphalt(tpriv, &tcb->acktimer);
2386 tcphalt(tpriv, &tcb->katimer);
2387 tcpsetstate(s, Time_wait);
2388 tcb->timer.start = MSL2 * (1000/MSPTICK);
2389 tcpgo(tpriv, &tcb->timer);
2396 tcpgo(tpriv, &tcb->timer);
2402 * get next adjacent segment from the resequence queue.
2403 * dump/trim any overlapping segments
2406 if(tcb->reseq == nil)
2409 if(seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
2412 getreseq(tcb, &seg, &bp, &length);
2414 if(tcptrim(tcb, &seg, &bp, &length) == 0)
/*
 * Output side of a connection.  Each pass of the msgs loop below sizes
 * one segment from the write queue, the congestion window (cwind) and
 * the window offered by the peer (snd.wnd), fills in seg, builds a v4
 * or v6 header with htontcp4/htontcp6 and hands the packet to
 * ipoput4/ipoput6.  A negative ipoput return closes the conversation
 * with the reason "no route".
 */
2431 * always enters and exits with the s locked. We drop
2432 * the lock to ipoput the packet so some care has to be
2443 ulong ssize, dsize, usable, sent;
2450 version = s->ipversion;
2451 memset(&seg, 0, sizeof seg);
/* bound the work done in one call: at most 100 segments */
2453 for(msgs = 0; msgs < 100; msgs++) {
2454 tcb = (Tcpctl*)s->ptcl;
2456 switch(tcb->state) {
2463 /* force an ack when a window has opened up */
2464 if(tcb->rcv.blocked && tcb->rcv.wnd > 0){
2465 tcb->rcv.blocked = 0;
2466 tcb->flags |= FORCE;
/*
 * sndcnt: bytes on the write queue plus tcb->flgcnt;
 * sent: distance from snd.una to the send pointer snd.ptr
 */
2469 sndcnt = qlen(s->wq)+tcb->flgcnt;
2470 sent = tcb->snd.ptr - tcb->snd.una;
2472 /* Don't send anything else until our SYN has been acked */
2473 if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
2476 /* Compute usable segment based on offered window and limit
2477 * window probes to one
2479 if(tcb->snd.wnd == 0){
2481 if((tcb->flags&FORCE) == 0)
2483 // tcb->snd.ptr = tcb->snd.una;
2488 usable = tcb->cwind;
2489 if(tcb->snd.wnd < usable)
2490 usable = tcb->snd.wnd;
2492 usable = usable >= sent? usable - sent: 0;
2494 ssize = sndcnt-sent;
2495 if(ssize && usable < 2)
2496 netlog(s->p->f, Logtcp, "throttled snd.wnd %lud cwind %lud\n",
2497 tcb->snd.wnd, tcb->cwind);
2500 if(tcb->mss < ssize)
2506 if((tcb->flags&FORCE) == 0)
2509 tcb->flags &= ~FORCE;
2512 /* By default we will generate an ack */
2513 tcphalt(tpriv, &tcb->acktimer);
/* ports of this conversation go into the outgoing segment */
2515 seg.source = s->lport;
2516 seg.dest = s->rport;
2523 if(tcb->snd.ptr == tcb->iss){
2527 seg.ws = tcb->scale;
2532 * don't send any data with a SYN/ACK packet
2533 * because Linux rejects the packet in its
2534 * attempt to solve the SYN attack problem
2536 if(tcb->snd.ptr == tcb->iss){
2541 seg.ws = tcb->scale;
2545 seg.seq = tcb->snd.ptr;
2546 seg.ack = tcb->rcv.nxt;
2547 seg.wnd = tcb->rcv.wnd;
2549 /* Pull out data to send */
2552 bp = qcopy(s->wq, dsize, sent);
2553 if(BLEN(bp) != dsize) {
2559 if(sent+dsize == sndcnt)
2562 /* keep track of balance of resent data */
2563 if(seq_lt(tcb->snd.ptr, tcb->snd.nxt)) {
2564 n = tcb->snd.nxt - tcb->snd.ptr;
2568 netlog(f, Logtcp, "rexmit: %I!%d -> %I!%d ptr %lux nxt %lux\n",
2569 s->raddr, s->rport, s->laddr, s->lport, tcb->snd.ptr, tcb->snd.nxt);
2570 tpriv->stats[RetransSegs]++;
2573 tcb->snd.ptr += ssize;
2575 /* Pull up the send pointer so we can accept acks
2578 if(seq_gt(tcb->snd.ptr,tcb->snd.nxt))
2579 tcb->snd.nxt = tcb->snd.ptr;
2581 /* Build header, link data and compute cksum */
2584 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2585 hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
2592 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2593 hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
2600 hbp = nil; /* to suppress a warning */
2601 panic("tcpoutput: version %d", version);
2604 /* Start the transmission timers if there is new data and we
2605 * expect acknowledges
2608 if(tcb->timer.state != TcptimerON)
2609 tcpgo(tpriv, &tcb->timer);
2611 /* If round trip timer isn't running, start it.
2612 * measure the longest packet only in case the
2613 * transmission time dominates RTT
2615 if(tcb->rtt_timer.state != TcptimerON)
2616 if(ssize == tcb->mss) {
2617 tcpgo(tpriv, &tcb->rtt_timer);
2618 tcb->rttseq = tcb->snd.ptr;
2622 tpriv->stats[OutSegs]++;
2624 /* put off the next keep alive */
2625 tcpgo(tpriv, &tcb->katimer);
2629 if(ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0){
2630 /* a negative return means no route */
2631 localclose(s, "no route");
2635 if(ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0){
2636 /* a negative return means no route */
2637 localclose(s, "no route");
2641 panic("tcpoutput2: version %d", version);
/*
 * Build and send a keep alive probe: an ACK|PSH segment that carries
 * the current receive state (rcv.nxt, rcv.wnd) and a sequence number
 * that lies before snd.una.
 */
2652 * the BSD convention (hack?) for keep alives. resend last uchar acked.
2661 tcb = (Tcpctl*)s->ptcl;
2664 memset(&seg, 0, sizeof seg);
2666 seg.source = s->lport;
2667 seg.dest = s->rport;
2668 seg.flags = ACK|PSH;
/*
 * with tcpporthogdefense set the probe sequence number is pushed
 * 1<<30 plus nrand(1<<20) behind snd.una; otherwise it is the
 * sequence number just before snd.una
 */
2671 if(tcpporthogdefense)
2672 seg.seq = tcb->snd.una-(1<<30)-nrand(1<<20);
2674 seg.seq = tcb->snd.una-1;
2675 seg.ack = tcb->rcv.nxt;
2677 seg.wnd = tcb->rcv.wnd;
2678 if(tcb->state == Finwait2){
/*
 * isv4(s->raddr) selects the htontcp4/ipoput4 path, the other path
 * uses htontcp6/ipoput6.  NOTE(review): the ipoput return value is
 * not examined on either path here.
 */
2685 if(isv4(s->raddr)) {
2686 /* Build header, link data and compute cksum */
2687 tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2688 hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
2693 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
2696 /* Build header, link data and compute cksum */
2697 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2698 hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
2703 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
/*
 * Reload tcb->kacounter with the number of keep alive timer periods
 * that fit in 12 minutes.  katimer.start is multiplied by MSPTICK
 * (milliseconds per timer tick) to obtain the period in ms.
 * NOTE(review): the divisor is katimer.start*MSPTICK; confirm that
 * katimer.start can never be 0 when this runs.
 */
2708 * set connection to time out after 12 minutes
2711 tcpsetkacounter(Tcpctl *tcb)
2713 tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start*MSPTICK);
/* small results are special-cased: counters below 3 take the branch below */
2714 if(tcb->kacounter < 3)
/*
 * Keep alive timer routine (presumably the katimer callback; v is an
 * opaque argument).  While the connection is not Closed it counts
 * kacounter down, calls localclose(s, Etimedout) once the counter
 * reaches zero or less, and rearms katimer with tcpgo.
 */
2719 * if we've timed out, close the connection
2720 * otherwise, send a keepalive and restart the timer
2723 tcpkeepalive(void *v)
2729 tcb = (Tcpctl*)s->ptcl;
2735 if(tcb->state != Closed){
2736 if(--(tcb->kacounter) <= 0) {
2737 localclose(s, Etimedout);
2740 tcpgo(s->p->priv, &tcb->katimer);
2748 * start keepalive timer
2751 tcpstartka(Conv *s, char **f, int n)
2756 tcb = (Tcpctl*)s->ptcl;
2757 if(tcb->state != Established)
2758 return "connection must be in Establised state";
2762 tcb->katimer.start = x/MSPTICK;
2764 tcpsetkacounter(tcb);
2765 tcpgo(s->p->priv, &tcb->katimer);
/*
 * Control request routine for checksums: tcb->nochecksum is set to the
 * logical negation of atoi(f[1]), so an argument that parses as 0 sets
 * nochecksum and any nonzero argument clears it.
 * NOTE(review): f[1] is read without any argument count check (the
 * count parameter is unnamed and unused); confirm that callers only
 * get here when f[1] exists.
 */
2771 * turn checksums on/off
2774 tcpsetchecksum(Conv *s, char **f, int)
2778 tcb = (Tcpctl*)s->ptcl;
2779 tcb->nochecksum = !atoi(f[1]);
/*
 * Prepare a retransmission: flag the control block RETRAN|FORCE, move
 * the send pointer back to the oldest unacknowledged sequence number
 * (snd.una), and drop both ssthresh and cwind to a single mss.
 */
2789 tcb = (Tcpctl*)s->ptcl;
2791 tcb->flags |= RETRAN|FORCE;
2792 tcb->snd.ptr = tcb->snd.una;
2795 * We should be halving the slow start threshhold (down to one
2796 * mss) but leaving it at mss seems to work well enough
2798 tcb->ssthresh = tcb->mss;
2801 * pull window down to a single packet
2803 tcb->cwind = tcb->mss;
/*
 * Retransmit timer routine (arg is an opaque argument, presumably the
 * conversation).  The time backed off so far is accumulated in
 * tcb->backedoff as timer.start ticks times MSPTICK ms; once it
 * reaches the limit the conversation is closed with Etimedout.
 * The timeout is also logged, counted in RetransTimeouts, and
 * snd.dupacks is reset.
 */
2808 tcptimeout(void *arg)
2817 tcb = (Tcpctl*)s->ptcl;
/* a connection still in Syn_sent gets half the backoff budget */
2827 if(tcb->state == Syn_sent)
2828 maxback = MAXBACKMS/2;
2830 maxback = MAXBACKMS;
2831 tcb->backedoff += tcb->timer.start * MSPTICK;
2832 if(tcb->backedoff >= maxback) {
2833 localclose(s, Etimedout);
2836 netlog(s->p->f, Logtcprxmt, "timeout rexmit %#lux %d/%lud\n", tcb->snd.una, tcb->timer.start, NOW);
2839 tpriv->stats[RetransTimeouts]++;
2840 tcb->snd.dupacks = 0;
/*
 * Report whether seq lies inside the receive window, that is between
 * rcv.nxt and rcv.nxt+rcv.wnd-1 inclusive as judged by seq_within.
 */
2853 inwindow(Tcpctl *tcb, int seq)
2855 return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt+tcb->rcv.wnd-1);
/*
 * Record the peer side of a SYN in the control block: rcv.nxt and
 * rcv.urg become seg->seq+1, irs becomes seg->seq, the mss is lowered
 * to the value in the segment when that is nonzero and smaller, and
 * snd.wnd and cwind are initialised.  FORCE is set on the tcb flags.
 */
2859 * set up state for a received SYN (or SYN ACK) packet
2862 procsyn(Conv *s, Tcp *seg)
2867 tcb = (Tcpctl*)s->ptcl;
2868 tcb->flags |= FORCE;
/* the SYN occupies one sequence number, hence seq + 1 */
2870 tcb->rcv.nxt = seg->seq + 1;
2871 tcb->rcv.urg = tcb->rcv.nxt;
2872 tcb->irs = seg->seq;
2874 /* our sending max segment size cannot be bigger than what he asked for */
2875 if(seg->mss != 0 && seg->mss < tcb->mss) {
2876 tcb->mss = seg->mss;
2878 tpriv->stats[Mss] = tcb->mss;
2881 /* the congestion window always starts out as a single segment */
2882 tcb->snd.wnd = seg->wnd;
2883 tcb->cwind = tcb->mss;
/*
 * Queue an out of order segment on tcb->reseq, which is kept sorted by
 * starting sequence number.  bp is always consumed.  While walking the
 * list the lengths of the queued entries are summed in rqlen and
 * compared with qmax = QMAX<<rcv.scale; the diagnostic prints and the
 * loop that deletes the whole queue belong to that overflow case.
 * A caller in this file treats a negative return value as failure.
 */
2887 addreseq(Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length)
2892 rp = malloc(sizeof(Reseq));
/* NOTE(review): this freeblist is presumably the malloc failure path */
2894 freeblist(bp); /* bp always consumed by add_reseq */
2900 rp->length = length;
2902 /* Place on reassembly list sorting by starting seq number */
2904 if(rp1 == nil || seq_lt(seg->seq, rp1->seg.seq)) {
2908 tpriv->stats[OutOfOrder]++;
2914 rqlen += rp1->length;
2915 if(rp1->next == nil || seq_lt(seg->seq, rp1->next->seg.seq)) {
2916 rp->next = rp1->next;
2919 tpriv->stats[OutOfOrder]++;
/* the limit grows with the receive window scale */
2924 qmax = QMAX<<tcb->rcv.scale;
2926 print("resequence queue > window: %d > %d\n", rqlen, qmax);
2928 for(rp1 = tcb->reseq; rp1 != nil; rp1 = rp1->next){
2929 print("%#lux %#lux %#ux\n", rp1->seg.seq,
2930 rp1->seg.ack, rp1->seg.flags);
2938 * delete entire reassembly queue; wait for retransmit.
2939 * - should we be smarter and only delete the tail?
2941 for(rp = tcb->reseq; rp != nil; rp = rp1){
/*
 * Unlink the first entry of tcb->reseq (the list head advances to
 * rp->next) and return its length through *length; seg and bp are
 * presumably filled from the same entry.
 */
2954 getreseq(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
2962 tcb->reseq = rp->next;
2966 *length = rp->length;
/*
 * Trim an incoming segment so that it fits the receive window.
 * Callers in this file test the result against -1 (segment not
 * acceptable) and 0.  Data in front of rcv.nxt (dupcnt) is removed
 * with pullblock, data beyond rcv.nxt+rcv.wnd (excess) is cut from the
 * tail with trimblock, and both amounts are added to tcb->rerecv.
 */
2972 tcptrim(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
/* SYN and FIN are examined before the window test below */
2980 if(seg->flags & SYN)
2982 if(seg->flags & FIN)
/* closed window: only an empty segment exactly at rcv.nxt is special-cased */
2985 if(tcb->rcv.wnd == 0) {
2986 if(len == 0 && seg->seq == tcb->rcv.nxt)
2990 /* Some part of the segment should be in the window */
2991 if(inwindow(tcb,seg->seq))
2995 if(inwindow(tcb, seg->seq+len-1) ||
2996 seq_within(tcb->rcv.nxt, seg->seq,seg->seq+len-1))
/* dupcnt: how much of the segment precedes rcv.nxt */
3004 dupcnt = tcb->rcv.nxt - seg->seq;
3006 tcb->rerecv += dupcnt;
3007 if(seg->flags & SYN){
3018 pullblock(bp, (ushort)dupcnt);
3022 if(seg->urg > dupcnt)
/* excess: how far the segment extends past the end of the window */
3030 excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
3032 tcb->rerecv += excess;
3034 *bp = trimblock(*bp, 0, *length);
3036 panic("presotto is a boofhead");
/*
 * Find the conversation that the packet in bp belongs to.  Addresses
 * and ports are read from the v4 or v6 header at bp->rp, then the
 * conversation table is scanned for a conversation that is not Closed
 * and whose remote port/address equal the packet destination and whose
 * local port/address equal the packet source.  The comparison is
 * reversed presumably because bp is a copy of a packet this host
 * sent; msg is presumably the advice delivered to the match.
 */
3043 tcpadvise(Proto *tcp, Block *bp, char *msg)
3048 uchar source[IPaddrlen];
3049 uchar dest[IPaddrlen];
3050 ushort psource, pdest;
/* both header views alias the same bytes; the version nibble picks one */
3053 h4 = (Tcp4hdr*)(bp->rp);
3054 h6 = (Tcp6hdr*)(bp->rp);
3056 if((h4->vihl&0xF0)==IP_VER4) {
/* v4 addresses are widened to the v6 form used for comparison */
3057 v4tov6(dest, h4->tcpdst);
3058 v4tov6(source, h4->tcpsrc);
3059 psource = nhgets(h4->tcpsport);
3060 pdest = nhgets(h4->tcpdport);
3063 ipmove(dest, h6->tcpdst);
3064 ipmove(source, h6->tcpsrc);
3065 psource = nhgets(h6->tcpsport);
3066 pdest = nhgets(h6->tcpdport);
3069 /* Look for a connection */
3071 for(p = tcp->conv; *p; p++) {
3073 tcb = (Tcpctl*)s->ptcl;
/* cheap port tests first, address compares last */
3074 if(s->rport == pdest)
3075 if(s->lport == psource)
3076 if(tcb->state != Closed)
3077 if(ipcmp(s->raddr, dest) == 0)
3078 if(ipcmp(s->laddr, source) == 0){
3096 tcpporthogdefensectl(char *val)
3098 if(strcmp(val, "on") == 0)
3099 tcpporthogdefense = 1;
3100 else if(strcmp(val, "off") == 0)
3101 tcpporthogdefense = 0;
3103 return "unknown value for tcpporthogdefense";
3107 /* called with c qlocked */
3109 tcpctl(Conv* c, char** f, int n)
3111 if(n == 1 && strcmp(f[0], "hangup") == 0)
3112 return tcphangup(c);
3113 if(n >= 1 && strcmp(f[0], "keepalive") == 0)
3114 return tcpstartka(c, f, n);
3115 if(n >= 1 && strcmp(f[0], "checksum") == 0)
3116 return tcpsetchecksum(c, f, n);
3117 if(n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0)
3118 return tcpporthogdefensectl(f[1]);
3119 return "unknown control request";
/*
 * Format the protocol counters into buf: one line per entry of
 * priv->stats, Nstats entries in all, each printed as the name from
 * statnames[] followed by the counter value.  seprint appends at p
 * and is bounded by e.
 */
3123 tcpstats(Proto *tcp, char *buf, int len)
3132 for(i = 0; i < Nstats; i++)
3133 p = seprint(p, e, "%s: %llud\n", statnames[i], priv->stats[i]);
/*
 * Walk all tcp->nc conversation slots and close, via localclose with
 * the reason string timed out, conversations whose tcb->time is more
 * than 5000 ms or more than 5*60*1000 ms in the past; the two limits
 * correspond to the two cases listed in the comment lines below.
 */
3138 * garbage collect any stale conversations:
3139 * - SYN received but no SYN-ACK after 5 seconds (could be the SYN attack)
3140 * - Finwait2 after 5 minutes
3142 * this is called whenever we run out of channels. Both checks are
3143 * of questionable validity so we try to use them only when we're
3144 * up against the wall.
3149 Conv *c, **pp, **ep;
3155 ep = &tcp->conv[tcp->nc];
3156 for(pp = tcp->conv; pp < ep; pp++) {
3162 tcb = (Tcpctl*)c->ptcl;
3165 if(NOW - tcb->time > 5000){
3166 localclose(c, "timed out");
3171 if(NOW - tcb->time > 5*60*1000){
3172 localclose(c, "timed out");
/*
 * Compute the start value, in timer ticks, of the retransmit timer.
 * The estimate mdev + (srtt>>LOGAGAIN) + MSPTICK is in milliseconds;
 * it is multiplied by backoff(tcb->backoff) and divided by MSPTICK to
 * convert to ticks.  The result is bounded before being stored in
 * tcb->timer.start; the visible upper bound is 64000/MSPTICK ticks.
 */
3183 tcpsettimer(Tcpctl *tcb)
3187 /* round trip dependency */
3188 x = backoff(tcb->backoff) *
3189 (tcb->mdev + (tcb->srtt>>LOGAGAIN) + MSPTICK) / MSPTICK;
3191 /* bounded twixt 1/2 and 64 seconds */
3194 else if(x > (64000/MSPTICK))
3196 tcb->timer.start = x;
/*
 * Protocol set-up: allocate the tcp Proto and its private Tcppriv,
 * fill in the protocol entry points, set the IP protocol number and
 * the per conversation control block size, and size the conversation
 * table with scalednconv().  The MaxConn statistic records that size.
 */
3205 tcp = smalloc(sizeof(Proto));
3206 tpriv = tcp->priv = smalloc(sizeof(Tcppriv));
/* entry points called through the Proto structure */
3208 tcp->connect = tcpconnect;
3209 tcp->announce = tcpannounce;
3211 tcp->state = tcpstate;
3212 tcp->create = tcpcreate;
3213 tcp->close = tcpclose;
3215 tcp->advise = tcpadvise;
3216 tcp->stats = tcpstats;
3217 tcp->inuse = tcpinuse;
3219 tcp->ipproto = IP_TCPPROTO;
3220 tcp->nc = scalednconv();
/* each conversation carries one Tcpctl as its protocol state */
3221 tcp->ptclsize = sizeof(Tcpctl);
3222 tpriv->stats[MaxConn] = tcp->nc;
/*
 * Set the window scale factors of a connection.  In the visible
 * assignments rcv.scale and snd.scale keep the low 8 bits of the
 * arguments, tcb->window becomes QMAX<<snd.scale, and the limit of
 * the read queue s->rq is set to tcb->window.
 */
3228 tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale)
3231 tcb->rcv.scale = rcvscale & 0xff;
3232 tcb->snd.scale = sndscale & 0xff;
3233 tcb->window = QMAX<<tcb->snd.scale;
3234 qsetlimit(s->rq, tcb->window);
3239 qsetlimit(s->rq, tcb->window);