NLHT = 256, /* hash table size, must be a power of 2 */
LHTMASK = NLHT-1,
- HaveWS = 1<<8,
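+ /*
+ * window is 64kb · 2ⁿ
+ * these factors determine the ultimate bandwidth-delay product.
+ * 64kb · 2⁵ = 2mb, or 2x overkill for 100mbps · 70ms.
+ * NOTE(review): Maxqscale and Defadvscale below are 4, i.e. 64kb · 2⁴ = 1mb;
+ * confirm whether the example or the constants are intended.
+ */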
+ Maxqscale = 4, /* maximum queuing scale */
+ Defadvscale = 4, /* default advertisement */
};
/* Must correspond to the enumeration above */
ulong seq;
ulong ack;
uchar flags;
- ushort ws; /* window scale option (if not zero) */
- ulong wnd;
+ uchar update;
+ ushort ws; /* window scale option */
+ ulong wnd; /* prescaled window*/
ushort urg;
ushort mss; /* max segment size option (if not zero) */
ushort len; /* size of data */
ulong wnd; /* Tcp send window */
ulong urg; /* Urgent data pointer */
ulong wl2;
- int scale; /* how much to right shift window in xmitted packets */
+ uint scale; /* how much to right shift window in xmitted packets */
/* to implement tahoe and reno TCP */
ulong dupacks; /* number of duplicate acks rcvd */
+ ulong partialack;
int recovery; /* loss recovery flag */
- ulong rxt; /* right window marker for recovery */
+ int retransmit; /* retransmit 1 packet @ una flag */
+ int rto;
+ ulong rxt; /* right window marker for recovery "recover" rfc3782 */
} snd;
struct {
ulong nxt; /* Receive pointer to next uchar slot */
ulong wnd; /* Receive window incoming */
+ ulong wsnt; /* Last wptr sent. important to track for large bdp */
+ ulong wptr;
ulong urg; /* Urgent pointer */
+ ulong ackptr; /* last acked sequence */
int blocked;
- int una; /* unacked data segs */
- int scale; /* how much to left shift window in rcved packets */
+ uint scale; /* how much to left shift window in rcv'd packets */
} rcv;
ulong iss; /* Initial sequence number */
- int sawwsopt; /* true if we saw a wsopt on the incoming SYN */
ulong cwind; /* Congestion window */
- int scale; /* desired snd.scale */
- ushort ssthresh; /* Slow start threshold */
+ ulong abcbytes; /* appropriate byte counting rfc 3465 */
+ uint scale; /* desired snd.scale */
+ ulong ssthresh; /* Slow start threshold */
int resent; /* Bytes just resent */
int irs; /* Initial received squence */
ushort mss; /* Maximum segment size */
int rerecv; /* Overlap of data rerecevived */
- ulong window; /* Receive window */
+ ulong window; /* Our receive window (queue) */
+ uint qscale; /* Log2 of our receive window (queue) */
uchar backoff; /* Exponential backoff counter */
int backedoff; /* ms we've backed off for rexmits */
uchar flags; /* State flags */
Reseq *reseq; /* Resequencing queue */
+ int nreseq;
+ int reseqlen;
Tcptimer timer; /* Activity timer */
Tcptimer acktimer; /* Acknowledge timer */
Tcptimer rtt_timer; /* Round trip timer */
Tcptimer katimer; /* keep alive timer */
ulong rttseq; /* Round trip sequence */
- int srtt; /* Shortened round trip */
+ int srtt; /* Smoothed round trip */
int mdev; /* Mean deviation of round trip */
int kacounter; /* count down for keep alive */
uint sndsyntime; /* time syn sent */
ulong time; /* time Finwait2 or Syn_received was sent */
+ ulong timeuna; /* snd.una when time was set */
int nochecksum; /* non-zero means don't send checksums */
int flgcnt; /* number of flags in the sequence (FIN,SEQ) */
};
int tcp_irtt = DEF_RTT; /* Initial guess at round trip time */
-ushort tcp_mss = DEF_MSS; /* Maximum segment size to be sent */
enum {
/* MIB stats */
InSegs,
OutSegs,
RetransSegs,
+ RetransSegsSent,
RetransTimeouts,
InErrs,
OutRsts,
CsumErrs,
HlenErrs,
LenErrs,
+ Resequenced,
OutOfOrder,
+ ReseqBytelim,
+ ReseqPktlim,
+ Delayack,
+ Wopenack,
+
+ Recovery,
+ RecoveryDone,
+ RecoveryRTO,
+ RecoveryNoSeq,
+ RecoveryCwind,
+ RecoveryPA,
Nstats
};
-static char *statnames[] =
+static char *statnames[Nstats] =
{
[MaxConn] "MaxConn",
[Mss] "MaxSegment",
[InSegs] "InSegs",
[OutSegs] "OutSegs",
[RetransSegs] "RetransSegs",
+[RetransSegsSent] "RetransSegsSent",
[RetransTimeouts] "RetransTimeouts",
[InErrs] "InErrs",
[OutRsts] "OutRsts",
[HlenErrs] "HlenErrs",
[LenErrs] "LenErrs",
[OutOfOrder] "OutOfOrder",
+[Resequenced] "Resequenced",
+[ReseqBytelim] "ReseqBytelim",
+[ReseqPktlim] "ReseqPktlim",
+[Delayack] "Delayack",
+[Wopenack] "Wopenack",
+
+[Recovery] "Recovery",
+[RecoveryDone] "RecoveryDone",
+[RecoveryRTO] "RecoveryRTO",
+
+[RecoveryNoSeq] "RecoveryNoSeq",
+[RecoveryCwind] "RecoveryCwind",
+[RecoveryPA] "RecoveryPA",
};
typedef struct Tcppriv Tcppriv;
*/
int tcpporthogdefense = 0;
-int addreseq(Tcpctl*, Tcppriv*, Tcp*, Block*, ushort);
-void getreseq(Tcpctl*, Tcp*, Block**, ushort*);
-void localclose(Conv*, char*);
-void procsyn(Conv*, Tcp*);
-void tcpiput(Proto*, Ipifc*, Block*);
-void tcpoutput(Conv*);
-int tcptrim(Tcpctl*, Tcp*, Block**, ushort*);
-void tcpstart(Conv*, int);
-void tcptimeout(void*);
-void tcpsndsyn(Conv*, Tcpctl*);
-void tcprcvwin(Conv*);
-void tcpacktimer(void*);
-void tcpkeepalive(void*);
-void tcpsetkacounter(Tcpctl*);
-void tcprxmit(Conv*);
-void tcpsettimer(Tcpctl*);
-void tcpsynackrtt(Conv*);
-void tcpsetscale(Conv*, Tcpctl*, ushort, ushort);
-
-static void limborexmit(Proto*);
-static void limbo(Conv*, uchar*, uchar*, Tcp*, int);
+static int addreseq(Fs*, Tcpctl*, Tcppriv*, Tcp*, Block*, ushort);
+static int dumpreseq(Tcpctl*);
+static void getreseq(Tcpctl*, Tcp*, Block**, ushort*);
+static void limbo(Conv*, uchar*, uchar*, Tcp*, int);
+static void limborexmit(Proto*);
+static void localclose(Conv*, char*);
+static void procsyn(Conv*, Tcp*);
+static void tcpacktimer(void*);
+static void tcpiput(Proto*, Ipifc*, Block*);
+static void tcpkeepalive(void*);
+static void tcpoutput(Conv*);
+static void tcprcvwin(Conv*);
+static void tcprxmit(Conv*);
+static void tcpsetkacounter(Tcpctl*);
+static void tcpsetscale(Conv*, Tcpctl*, ushort, ushort);
+static void tcpsettimer(Tcpctl*);
+static void tcpsndsyn(Conv*, Tcpctl*);
+static void tcpstart(Conv*, int);
+static void tcpsynackrtt(Conv*);
+static void tcptimeout(void*);
+static int tcptrim(Tcpctl*, Tcp*, Block**, ushort*);
-void
+static void
tcpsetstate(Conv *s, uchar newstate)
{
Tcpctl *tcb;
if(newstate == Established)
tpriv->stats[CurrEstab]++;
- /**
- print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport,
- tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab );
- **/
-
switch(newstate) {
case Closed:
qclose(s->rq);
s = (Tcpctl*)(c->ptcl);
return snprint(state, n,
- "%s qin %d qout %d srtt %d mdev %d cwin %lud swin %lud>>%d rwin %lud>>%d timer.start %d timer.count %d rerecv %d katimer.start %d katimer.count %d\n",
+ "%s qin %d qout %d rq %d.%d srtt %d mdev %d sst %lud cwin %lud swin %lud>>%d rwin %lud>>%d qscale %d timer.start %d timer.count %d rerecv %d katimer.start %d katimer.count %d\n",
tcpstates[s->state],
c->rq ? qlen(c->rq) : 0,
c->wq ? qlen(c->wq) : 0,
- s->srtt, s->mdev,
+ s->nreseq, s->reseqlen,
+ s->srtt, s->mdev, s->ssthresh,
s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd, s->snd.scale,
+ s->qscale,
s->timer.start, s->timer.count, s->rerecv,
s->katimer.start, s->katimer.count);
}
}
}
-void
+static void
tcpkick(void *x)
{
Conv *s = x;
/*
* Push data
*/
- tcprcvwin(s);
tcpoutput(s);
break;
default:
poperror();
}
-void
+static int seq_lt(ulong, ulong);
+
+static void
tcprcvwin(Conv *s) /* Call with tcb locked */
{
int w;
w = tcb->window - qlen(s->rq);
if(w < 0)
w = 0;
- if(w == 0)
- netlog(s->p->f, Logtcp, "tcprcvwim: window %lud qlen %d\n", tcb->window, qlen(s->rq));
- tcb->rcv.wnd = w;
- if(w == 0)
+ /* RFC 1122 § 4.2.2.17 do not move right edge of window left */
+ if(seq_lt(tcb->rcv.nxt + w, tcb->rcv.wptr))
+ w = tcb->rcv.wptr - tcb->rcv.nxt;
+ if(w != tcb->rcv.wnd)
+ if(w>>tcb->rcv.scale == 0 || tcb->window > 4*tcb->mss && w < tcb->mss/4){
tcb->rcv.blocked = 1;
+ netlog(s->p->f, Logtcp, "tcprcvwin: window %lud qlen %d ws %ud lport %d\n",
+ tcb->window, qlen(s->rq), tcb->rcv.scale, s->lport);
+ }
+ tcb->rcv.wnd = w;
+ tcb->rcv.wptr = tcb->rcv.nxt + w;
}
-void
+static void
tcpacktimer(void *v)
{
Tcpctl *tcb;
qlock(s);
if(tcb->state != Closed){
tcb->flags |= FORCE;
- tcprcvwin(s);
tcpoutput(s);
}
qunlock(s);
poperror();
}
+static void
+tcpcongestion(Tcpctl *tcb) /* set ssthresh to half the data in flight, but at least 2*mss; cwind is not touched here */
+{
+ ulong inflight;
+
+ inflight = tcb->snd.nxt - tcb->snd.una; /* bytes sent but not yet acked */
+ if(inflight > tcb->cwind)
+ inflight = tcb->cwind; /* never count more than the congestion window */
+ tcb->ssthresh = inflight / 2;
+ if(tcb->ssthresh < 2*tcb->mss)
+ tcb->ssthresh = 2*tcb->mss; /* floor of two segments */
+}
+
+enum {
+ L = 2, /* aggressive slow start: cap on cwind growth per ack, in units of mss (see tcpabcincr); legal values ∈ [1, 2] */
+};
+
+static void
+tcpabcincr(Tcpctl *tcb, uint acked) /* grow cwind from the number of bytes newly acked (byte counting, rfc 3465) */
+{
+ uint limit;
+
+ tcb->abcbytes += acked;
+ if(tcb->cwind < tcb->ssthresh){
+ /* slow start */
+ if(tcb->snd.rto)
+ limit = 1*tcb->mss; /* snd.rto is set: grow by at most one mss per ack */
+ else
+ limit = L*tcb->mss;
+ tcb->cwind += MIN(tcb->abcbytes, limit);
+ tcb->abcbytes = 0; /* bytes beyond the limit are dropped, not carried over */
+ }
+ else{
+ tcb->snd.rto = 0; /* cwind has reached ssthresh; drop the one-mss restriction */
+ /* avoidance */
+ if(tcb->abcbytes >= tcb->cwind){
+ tcb->abcbytes -= tcb->cwind; /* a full window's worth has been acked */
+ tcb->cwind += tcb->mss; /* so open the window by one segment */
+ }
+ }
+}
+
static void
tcpcreate(Conv *c)
{
c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c);
- c->wq = qopen((3*QMAX)/2, Qkick, tcpkick, c);
+ c->wq = qopen(QMAX, Qkick, tcpkick, c);
}
static void
t->state = newstate;
}
-void
+static void
tcpackproc(void *a)
{
Tcptimer *t, *tp, *timeo;
tcp = a;
priv = tcp->priv;
+ while(waserror())
+ ;
+
for(;;) {
tsleep(&up->sleep, return0, 0, MSPTICK);
}
}
-void
+static void
tcpgo(Tcppriv *priv, Tcptimer *t)
{
if(t == nil || t->start == 0)
qunlock(&priv->tl);
}
-void
+static void
tcphalt(Tcppriv *priv, Tcptimer *t)
{
if(t == nil)
qunlock(&priv->tl);
}
-int
+static int
backoff(int n)
{
return 1 << n;
}
-void
+static void
localclose(Conv *s, char *reason) /* called with tcb locked */
{
Tcpctl *tcb;
- Reseq *rp,*rp1;
Tcppriv *tpriv;
tpriv = s->p->priv;
tcphalt(tpriv, &tcb->katimer);
/* Flush reassembly queue; nothing more can arrive */
- for(rp = tcb->reseq; rp != nil; rp = rp1) {
- rp1 = rp->next;
- freeblist(rp->bp);
- free(rp);
- }
- tcb->reseq = nil;
+ dumpreseq(tcb);
if(tcb->state == Syn_sent)
Fsconnected(s, reason);
}
/* mtu (- TCP + IP hdr len) of 1st hop */
-int
-tcpmtu(Proto *tcp, uchar *addr, int version, int *scale)
+static int
+tcpmtu(Proto *tcp, uchar *addr, int version, uint *scale)
{
Ipifc *ifc;
int mtu;
mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE);
break;
}
- if(ifc != nil){
- if(ifc->mbps > 1000)
- *scale = HaveWS | 4;
- else if(ifc->mbps > 100)
- *scale = HaveWS | 3;
- else if(ifc->mbps > 10)
- *scale = HaveWS | 1;
- else
- *scale = HaveWS | 0;
- } else
- *scale = HaveWS | 0;
+ /*
+ * set the ws. it doesn't commit us to anything.
+ * ws is the ultimate limit to the bandwidth-delay product.
+ */
+ *scale = Defadvscale;
return mtu;
}
-void
+static void
inittcpctl(Conv *s, int mode)
{
Tcpctl *tcb;
memset(tcb, 0, sizeof(Tcpctl));
- tcb->ssthresh = 65535;
+ tcb->ssthresh = QMAX; /* reset by tcpsetscale() */
tcb->srtt = tcp_irtt<<LOGAGAIN;
tcb->mdev = 0;
}
tcb->mss = tcb->cwind = mss;
+ tcb->abcbytes = 0;
tpriv = s->p->priv;
tpriv->stats[Mss] = tcb->mss;
/* default is no window scaling */
- tcb->window = QMAX;
- tcb->rcv.wnd = QMAX;
- tcb->rcv.scale = 0;
- tcb->snd.scale = 0;
- qsetlimit(s->rq, QMAX);
+ tcpsetscale(s, tcb, 0, 0);
}
/*
* called with s qlocked
*/
-void
+static void
tcpstart(Conv *s, int mode)
{
Tcpctl *tcb;
if(tpriv->ackprocstarted == 0){
qlock(&tpriv->apl);
if(tpriv->ackprocstarted == 0){
- sprint(kpname, "#I%dtcpack", s->p->f->dev);
+ snprint(kpname, sizeof(kpname), "#I%dtcpack", s->p->f->dev);
kproc(kpname, tcpackproc, s->p);
tpriv->ackprocstarted = 1;
}
}
static char*
-tcpflag(ushort flag)
+tcpflag(char *buf, char *e, ushort flag)
{
- static char buf[128];
+ char *p;
- sprint(buf, "%d", flag>>10); /* Head len */
+ p = seprint(buf, e, "%d", flag>>10); /* Head len */
if(flag & URG)
- strcat(buf, " URG");
+ p = seprint(p, e, " URG");
if(flag & ACK)
- strcat(buf, " ACK");
+ p = seprint(p, e, " ACK");
if(flag & PSH)
- strcat(buf, " PSH");
+ p = seprint(p, e, " PSH");
if(flag & RST)
- strcat(buf, " RST");
+ p = seprint(p, e, " RST");
if(flag & SYN)
- strcat(buf, " SYN");
+ p = seprint(p, e, " SYN");
if(flag & FIN)
- strcat(buf, " FIN");
-
+ p = seprint(p, e, " FIN");
+ USED(p);
return buf;
}
-Block *
+static Block*
htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph, Tcpctl *tcb)
{
int dlen;
if(data) {
dlen = blocklen(data);
data = padblock(data, hdrlen + TCP6_PKT);
- if(data == nil)
- return nil;
}
else {
dlen = 0;
data = allocb(hdrlen + TCP6_PKT + 64); /* the 64 pad is to meet mintu's */
- if(data == nil)
- return nil;
data->wp += hdrlen + TCP6_PKT;
}
*opt++ = MSSOPT;
*opt++ = MSS_LENGTH;
hnputs(opt, tcph->mss);
-// print("our outgoing mss %d\n", tcph->mss);
opt += 2;
}
if(tcph->ws != 0){
return data;
}
-Block *
+static Block*
htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph, Tcpctl *tcb)
{
int dlen;
if(tcph->flags & SYN){
if(tcph->mss)
hdrlen += MSS_LENGTH;
- if(tcph->ws)
+ if(1)
hdrlen += WS_LENGTH;
optpad = hdrlen & 3;
if(optpad)
hnputs(opt, tcph->mss);
opt += 2;
}
- if(tcph->ws != 0){
+ /* always offer. rfc1323 §2.2 */
+ if(1){
*opt++ = WSOPT;
*opt++ = WS_LENGTH;
*opt++ = tcph->ws;
return data;
}
-int
+static int
ntohtcp6(Tcp *tcph, Block **bpp)
{
Tcp6hdr *h;
tcph->urg = nhgets(h->tcpurg);
tcph->mss = 0;
tcph->ws = 0;
+ tcph->update = 0;
tcph->len = nhgets(h->ploadlen) - hdrlen;
*bpp = pullupblock(*bpp, hdrlen+TCP6_PKT);
break;
case WSOPT:
if(optlen == WS_LENGTH && *(optr+2) <= 14)
- tcph->ws = HaveWS | *(optr+2);
+ tcph->ws = *(optr+2);
break;
}
n -= optlen;
return hdrlen;
}
-int
+static int
ntohtcp4(Tcp *tcph, Block **bpp)
{
Tcp4hdr *h;
tcph->urg = nhgets(h->tcpurg);
tcph->mss = 0;
tcph->ws = 0;
+ tcph->update = 0;
tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
*bpp = pullupblock(*bpp, hdrlen+TCP4_PKT);
break;
switch(*optr) {
case MSSOPT:
- if(optlen == MSS_LENGTH) {
+ if(optlen == MSS_LENGTH)
tcph->mss = nhgets(optr+2);
-// print("new incoming mss %d\n", tcph->mss);
- }
break;
case WSOPT:
if(optlen == WS_LENGTH && *(optr+2) <= 14)
- tcph->ws = HaveWS | *(optr+2);
+ tcph->ws = *(optr+2);
break;
}
n -= optlen;
}
/*
- * For outgiing calls, generate an initial sequence
+ * For outgoing calls, generate an initial sequence
* number and put a SYN on the send queue
*/
-void
+static void
tcpsndsyn(Conv *s, Tcpctl *tcb)
{
Tcppriv *tpriv;
tcb->rttseq = tcb->iss;
tcb->snd.wl2 = tcb->iss;
tcb->snd.una = tcb->iss;
+ tcb->snd.rxt = tcb->iss;
tcb->snd.ptr = tcb->rttseq;
tcb->snd.nxt = tcb->rttseq;
tcb->flgcnt++;
* send a reset to the remote side and close the conversation
* called with s qlocked
*/
-char*
+static char*
tcphangup(Conv *s)
{
Tcp seg;
return commonerror();
if(ipcmp(s->raddr, IPnoaddr) != 0) {
if(!waserror()){
+ memset(&seg, 0, sizeof seg);
seg.flags = RST | ACK;
seg.ack = tcb->rcv.nxt;
- tcb->rcv.una = 0;
+ tcb->rcv.ackptr = seg.ack;
seg.seq = tcb->snd.ptr;
seg.wnd = 0;
seg.urg = 0;
/*
* (re)send a SYN ACK
*/
-int
+static int
sndsynack(Proto *tcp, Limbo *lp)
{
Block *hbp;
Tcp4hdr ph4;
Tcp6hdr ph6;
Tcp seg;
- int scale;
+ uint scale;
/* make pseudo header */
switch(lp->version) {
panic("sndrst: version %d", lp->version);
}
+ memset(&seg, 0, sizeof seg);
seg.seq = lp->iss;
seg.ack = lp->irs+1;
seg.flags = SYN|ACK;
seg.urg = 0;
seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale);
-// if (seg.mss > lp->mss && lp->mss >= 512)
-// seg.mss = lp->mss;
seg.wnd = QMAX;
/* if the other side set scale, we should too */
}
}
+static void
+initialwindow(Tcpctl *tcb) /* choose the starting congestion window from tcb->mss */
+{
+ /* RFC 3390 initial window; same result as min(4*mss, max(2*mss, 4380)) */
+ if(tcb->mss < 1095)
+ tcb->cwind = 4*tcb->mss; /* small segments: four of them */
+ else if(tcb->mss < 2190)
+ tcb->cwind = 4380; /* mid-size segments: fixed byte count */
+ else
+ tcb->cwind = 2*tcb->mss; /* large segments: two of them */
+}
+
/*
* come here when we finally get an ACK to our SYN-ACK.
* lookup call in limbo. if found, create a new conversation
tcb->irs = lp->irs;
tcb->rcv.nxt = tcb->irs+1;
+ tcb->rcv.wptr = tcb->rcv.nxt;
+ tcb->rcv.wsnt = 0;
tcb->rcv.urg = tcb->rcv.nxt;
tcb->iss = lp->iss;
tcb->snd.una = tcb->iss+1;
tcb->snd.ptr = tcb->iss+1;
tcb->snd.nxt = tcb->iss+1;
+ tcb->snd.rxt = tcb->iss+1;
tcb->flgcnt = 0;
tcb->flags |= SYNACK;
+ /* set desired mss and scale */
+ tcb->mss = tcpmtu(s->p, dst, s->ipversion, &tcb->scale);
+
/* our sending max segment size cannot be bigger than what he asked for */
- if(lp->mss != 0 && lp->mss < tcb->mss) {
+ if(lp->mss != 0 && lp->mss < tcb->mss)
tcb->mss = lp->mss;
- tpriv->stats[Mss] = tcb->mss;
- }
+ tpriv->stats[Mss] = tcb->mss;
/* window scaling */
tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
- /* the congestion window always starts out as a single segment */
+ /* congestion window */
tcb->snd.wnd = segp->wnd;
- tcb->cwind = tcb->mss;
+ initialwindow(tcb);
/* set initial round trip time */
tcb->sndsyntime = lp->lastsend+lp->rexmits*SYNACK_RXTIMER;
return new;
}
-int
+static int
seq_within(ulong x, ulong low, ulong high)
{
if(low <= high){
return 0;
}
-int
+static int
seq_lt(ulong x, ulong y)
{
return (int)(x-y) < 0;
}
-int
+static int
seq_le(ulong x, ulong y)
{
return (int)(x-y) <= 0;
}
-int
+static int
seq_gt(ulong x, ulong y)
{
return (int)(x-y) > 0;
}
-int
+static int
seq_ge(ulong x, ulong y)
{
return (int)(x-y) >= 0;
* use the time between the first SYN and it's ack as the
* initial round trip time
*/
-void
+static void
tcpsynackrtt(Conv *s)
{
Tcpctl *tcb;
tcphalt(tpriv, &tcb->rtt_timer);
}
-void
+static void
update(Conv *s, Tcp *seg)
{
int rtt, delta;
Tcpctl *tcb;
ulong acked;
- ulong expand;
Tcppriv *tpriv;
+ if(seg->update)
+ return;
+ seg->update = 1;
+
tpriv = s->p->priv;
tcb = (Tcpctl*)s->ptcl;
- /* if everything has been acked, force output(?) */
- if(seq_gt(seg->ack, tcb->snd.nxt)) {
- tcb->flags |= FORCE;
- return;
+ /* catch zero-window updates, update window & recover */
+ if(tcb->snd.wnd == 0 && seg->wnd > 0)
+ if(seq_lt(seg->ack, tcb->snd.ptr)){
+ netlog(s->p->f, Logtcp, "tcp: zwu ack %lud una %lud ptr %lud win %lud\n",
+ seg->ack, tcb->snd.una, tcb->snd.ptr, seg->wnd);
+ tcb->snd.wnd = seg->wnd;
+ goto recovery;
}
- /* added by Dong Lin for fast retransmission */
- if(seg->ack == tcb->snd.una
- && tcb->snd.una != tcb->snd.nxt
- && seg->len == 0
- && seg->wnd == tcb->snd.wnd) {
-
- /* this is a pure ack w/o window update */
- netlog(s->p->f, Logtcprxmt, "dupack %lud ack %lud sndwnd %lud advwin %lud\n",
- tcb->snd.dupacks, seg->ack, tcb->snd.wnd, seg->wnd);
-
- if(++tcb->snd.dupacks == TCPREXMTTHRESH) {
- /*
- * tahoe tcp rxt the packet, half sshthresh,
- * and set cwnd to one packet
- */
+ /* newreno fast retransmit */
+ if(seg->ack == tcb->snd.una)
+ if(tcb->snd.una != tcb->snd.nxt)
+ if(++tcb->snd.dupacks == 3){
+recovery:
+ if(tcb->snd.recovery){
+ tpriv->stats[RecoveryCwind]++;
+ tcb->cwind += tcb->mss;
+ }else if(seq_le(tcb->snd.rxt, seg->ack)){
+ tpriv->stats[Recovery]++;
+ tcb->abcbytes = 0;
tcb->snd.recovery = 1;
+ tcb->snd.partialack = 0;
tcb->snd.rxt = tcb->snd.nxt;
- netlog(s->p->f, Logtcprxmt, "fast rxt %lud, nxt %lud\n", tcb->snd.una, tcb->snd.nxt);
+ tcpcongestion(tcb);
+ tcb->cwind = tcb->ssthresh + 3*tcb->mss;
+ netlog(s->p->f, Logtcpwin, "recovery inflate %ld ss %ld @%lud\n",
+ tcb->cwind, tcb->ssthresh, tcb->snd.rxt);
tcprxmit(s);
- } else {
- /* do reno tcp here. */
+ }else{
+ tpriv->stats[RecoveryNoSeq]++;
+ netlog(s->p->f, Logtcpwin, "!recov %lud not ≤ %lud %ld\n",
+ tcb->snd.rxt, seg->ack, tcb->snd.rxt - seg->ack);
+ /* do not enter fast retransmit */
+ /* do not change ssthresh */
}
+ }else if(tcb->snd.recovery){
+ tpriv->stats[RecoveryCwind]++;
+ tcb->cwind += tcb->mss;
}
/*
*/
if(seq_gt(seg->ack, tcb->snd.wl2)
|| (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)){
+ /* clear dupack if we advance wl2 */
+ if(tcb->snd.wl2 != seg->ack)
+ tcb->snd.dupacks = 0;
tcb->snd.wnd = seg->wnd;
tcb->snd.wl2 = seg->ack;
}
* don't let us hangup if sending into a closed window and
* we're still getting acks
*/
- if((tcb->flags&RETRAN) && tcb->snd.wnd == 0){
+ if((tcb->flags&RETRAN) && tcb->snd.wnd == 0)
tcb->backedoff = MAXBACKMS/4;
- }
return;
}
- /*
- * any positive ack turns off fast rxt,
- * (should we do new-reno on partial acks?)
- */
- if(!tcb->snd.recovery || seq_ge(seg->ack, tcb->snd.rxt)) {
- tcb->snd.dupacks = 0;
- tcb->snd.recovery = 0;
- } else
- netlog(s->p->f, Logtcp, "rxt next %lud, cwin %lud\n", seg->ack, tcb->cwind);
-
/* Compute the new send window size */
acked = seg->ack - tcb->snd.una;
goto done;
}
- /* slow start as long as we're not recovering from lost packets */
- if(tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) {
- if(tcb->cwind < tcb->ssthresh) {
- expand = tcb->mss;
- if(acked < expand)
- expand = acked;
- }
- else
- expand = ((int)tcb->mss * tcb->mss) / tcb->cwind;
+ /*
+ * congestion control
+ */
+ if(tcb->snd.recovery){
+ if(seq_ge(seg->ack, tcb->snd.rxt)){
+ /* recovery finished; deflate window */
+ tpriv->stats[RecoveryDone]++;
+ tcb->snd.dupacks = 0;
+ tcb->snd.recovery = 0;
+ tcb->cwind = (tcb->snd.nxt - tcb->snd.una) + tcb->mss;
+ if(tcb->ssthresh < tcb->cwind)
+ tcb->cwind = tcb->ssthresh;
+ netlog(s->p->f, Logtcpwin, "recovery deflate %ld %ld\n",
+ tcb->cwind, tcb->ssthresh);
+ } else {
+ /* partial ack; we lost more than one segment */
+ tpriv->stats[RecoveryPA]++;
+ if(tcb->cwind > acked)
+ tcb->cwind -= acked;
+ else{
+ netlog(s->p->f, Logtcpwin, "partial ack neg\n");
+ tcb->cwind = tcb->mss;
+ }
+ netlog(s->p->f, Logtcpwin, "partial ack %ld left %ld cwind %ld\n",
+ acked, tcb->snd.rxt - seg->ack, tcb->cwind);
- if(tcb->cwind + expand < tcb->cwind)
- expand = tcb->snd.wnd - tcb->cwind;
- if(tcb->cwind + expand > tcb->snd.wnd)
- expand = tcb->snd.wnd - tcb->cwind;
- tcb->cwind += expand;
- }
+ if(acked >= tcb->mss)
+ tcb->cwind += tcb->mss;
+ tcb->snd.partialack++;
+ }
+ } else
+ tcpabcincr(tcb, acked);
/* Adjust the timers according to the round trip time */
+ /* todo: fix sloppy treatment of overflow cases here. */
if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
tcphalt(tpriv, &tcb->rtt_timer);
if((tcb->flags&RETRAN) == 0) {
done:
if(qdiscard(s->wq, acked) < acked)
tcb->flgcnt--;
-
tcb->snd.una = seg->ack;
+
+ /* newreno fast recovery */
+ if(tcb->snd.recovery)
+ tcprxmit(s);
+
if(seq_gt(seg->ack, tcb->snd.urg))
tcb->snd.urg = seg->ack;
- if(tcb->snd.una != tcb->snd.nxt)
- tcpgo(tpriv, &tcb->timer);
+ if(tcb->snd.una != tcb->snd.nxt){
+ /* “impatient” variant */
+ if(!tcb->snd.recovery || tcb->snd.partialack == 1){
+ tcb->time = NOW;
+ tcb->timeuna = tcb->snd.una;
+ tcpgo(tpriv, &tcb->timer);
+ }
+ }
else
tcphalt(tpriv, &tcb->timer);
if(seq_lt(tcb->snd.ptr, tcb->snd.una))
tcb->snd.ptr = tcb->snd.una;
- tcb->flags &= ~RETRAN;
+ if(!tcb->snd.recovery)
+ tcb->flags &= ~RETRAN;
tcb->backoff = 0;
tcb->backedoff = 0;
}
-void
+static void
tcpiput(Proto *tcp, Ipifc*, Block *bp)
{
Tcp seg;
}
/* Cut the data to fit the receive window */
+ tcprcvwin(s);
if(tcptrim(tcb, &seg, &bp, &length) == -1) {
- netlog(f, Logtcp, "tcptrim, not accept, seq %lud-%lud win %lud-%lud from %I\n",
+ if(seg.seq+1 != tcb->rcv.nxt || length != 1)
+ netlog(f, Logtcp, "tcp: trim: !inwind: seq %lud-%lud win %lud-%lud l %d from %I\n",
seg.seq, seg.seq + length - 1,
- tcb->rcv.nxt, tcb->rcv.nxt + tcb->rcv.wnd-1, s->raddr);
- netlog(f, Logtcp, "tcp len < 0, %lud %d\n", seg.seq, length);
+ tcb->rcv.nxt, tcb->rcv.nxt + tcb->rcv.wnd-1, length, s->raddr);
update(s, &seg);
if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state == Closing) {
tcphalt(tpriv, &tcb->rtt_timer);
if(seg.seq != tcb->rcv.nxt)
if(length != 0 || (seg.flags & (SYN|FIN))) {
update(s, &seg);
- if(addreseq(tcb, tpriv, &seg, bp, length) < 0)
+ if(addreseq(f, tcb, tpriv, &seg, bp, length) < 0)
print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr, s->lport);
- tcb->flags |= FORCE;
+ tcb->flags |= FORCE; /* force duplicate ack; RFC 5681 §3.2 */
goto output;
}
+ if(tcb->nreseq > 0)
+ tcb->flags |= FORCE; /* filled hole in sequence space; RFC 5681 §3.2 */
+
/*
* keep looping till we've processed this packet plus any
* adjacent packets in the resequence queue
* receive queue
*/
if(bp) {
- bp = packblock(bp);
- if(bp == nil)
- panic("tcp packblock");
- qpassnolim(s->rq, bp);
+ qpassnolim(s->rq, packblock(bp));
bp = nil;
-
- /*
- * Force an ack every 2 data messages. This is
- * a hack for rob to make his home system run
- * faster.
- *
- * this also keeps the standard TCP congestion
- * control working since it needs an ack every
- * 2 max segs worth. This is not quite that,
- * but under a real stream is equivalent since
- * every packet has a max seg in it.
- */
- if(++(tcb->rcv.una) >= 2)
- tcb->flags |= FORCE;
}
tcb->rcv.nxt += length;
- /*
- * update our rcv window
- */
- tcprcvwin(s);
-
/*
* turn on the acktimer if there's something
* to ack
getreseq(tcb, &seg, &bp, &length);
- if(tcptrim(tcb, &seg, &bp, &length) == 0)
+ tcprcvwin(s);
+ if(tcptrim(tcb, &seg, &bp, &length) == 0){
+ tcb->flags |= FORCE;
break;
+ }
}
}
output:
* the lock to ipoput the packet so some care has to be
* taken by callers.
*/
-void
+static void
tcpoutput(Conv *s)
{
Tcp seg;
- int msgs;
+ uint msgs;
Tcpctl *tcb;
Block *hbp, *bp;
- int sndcnt, n;
- ulong ssize, dsize, usable, sent;
+ int sndcnt;
+ ulong ssize, dsize, sent;
Fs *f;
Tcppriv *tpriv;
uchar version;
tpriv = s->p->priv;
version = s->ipversion;
- for(msgs = 0; msgs < 100; msgs++) {
- tcb = (Tcpctl*)s->ptcl;
+ tcb = (Tcpctl*)s->ptcl;
+
+ /* force ack every 2*mss */
+ if((tcb->flags & FORCE) == 0)
+ if(tcb->rcv.nxt - tcb->rcv.ackptr >= 2*tcb->mss){
+ tpriv->stats[Delayack]++;
+ tcb->flags |= FORCE;
+ }
+
+ /* force ack if window opening */
+ if(0)
+ if((tcb->flags & FORCE) == 0){
+ tcprcvwin(s);
+ if((int)(tcb->rcv.wptr - tcb->rcv.wsnt) >= 2*tcb->mss){
+ tpriv->stats[Wopenack]++;
+ tcb->flags |= FORCE;
+ }
+ }
+ for(msgs = 0; msgs < 100; msgs++) {
switch(tcb->state) {
case Listen:
case Closed:
return;
}
+ /* Don't send anything else until our SYN has been acked */
+ if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
+ break;
+
/* force an ack when a window has opened up */
+ tcprcvwin(s);
if(tcb->rcv.blocked && tcb->rcv.wnd > 0){
tcb->rcv.blocked = 0;
tcb->flags |= FORCE;
sndcnt = qlen(s->wq)+tcb->flgcnt;
sent = tcb->snd.ptr - tcb->snd.una;
-
- /* Don't send anything else until our SYN has been acked */
- if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
- break;
-
- /* Compute usable segment based on offered window and limit
- * window probes to one
- */
+ ssize = sndcnt;
if(tcb->snd.wnd == 0){
- if(sent != 0) {
- if((tcb->flags&FORCE) == 0)
- break;
-// tcb->snd.ptr = tcb->snd.una;
+ /* zero window probe */
+ if(sent > 0)
+ if(!(tcb->flags & FORCE))
+ break; /* already probing, rto re-probes */
+ if(ssize < sent)
+ ssize = 0;
+ else{
+ ssize -= sent;
+ if(ssize > 0)
+ ssize = 1;
+ }
+ } else {
+ /* calculate usable segment size */
+ if(ssize > tcb->cwind)
+ ssize = tcb->cwind;
+ if(ssize > tcb->snd.wnd)
+ ssize = tcb->snd.wnd;
+
+ if(ssize < sent)
+ ssize = 0;
+ else {
+ ssize -= sent;
+ if(ssize > tcb->mss)
+ ssize = tcb->mss;
}
- usable = 1;
}
- else {
- usable = tcb->cwind;
- if(tcb->snd.wnd < usable)
- usable = tcb->snd.wnd;
-// usable -= sent;
- usable = usable >= sent? usable - sent: 0;
- }
- ssize = sndcnt-sent;
- if(ssize && usable < 2)
- netlog(s->p->f, Logtcp, "throttled snd.wnd %lud cwind %lud\n",
- tcb->snd.wnd, tcb->cwind);
- if(usable < ssize)
- ssize = usable;
- if(tcb->mss < ssize)
- ssize = tcb->mss;
+
dsize = ssize;
seg.urg = 0;
- if(ssize == 0)
- if((tcb->flags&FORCE) == 0)
- break;
+ if(!(tcb->flags & FORCE)){
+ if(ssize == 0)
+ break;
+ if(ssize < tcb->mss)
+ if(tcb->snd.nxt == tcb->snd.ptr)
+ if(sent > TCPREXMTTHRESH*tcb->mss)
+ break;
+ }
tcb->flags &= ~FORCE;
- tcprcvwin(s);
/* By default we will generate an ack */
tcphalt(tpriv, &tcb->acktimer);
- tcb->rcv.una = 0;
seg.source = s->lport;
seg.dest = s->rport;
seg.flags = ACK;
seg.mss = 0;
seg.ws = 0;
+ seg.update = 0;
switch(tcb->state){
case Syn_sent:
seg.flags = 0;
}
}
- if(sent+dsize == sndcnt)
+ if(sent+dsize == sndcnt && dsize)
seg.flags |= PSH;
- /* keep track of balance of resent data */
- if(seq_lt(tcb->snd.ptr, tcb->snd.nxt)) {
- n = tcb->snd.nxt - tcb->snd.ptr;
- if(ssize < n)
- n = ssize;
- tcb->resent += n;
- netlog(f, Logtcp, "rexmit: %I!%d -> %I!%d ptr %lux nxt %lux\n",
- s->raddr, s->rport, s->laddr, s->lport, tcb->snd.ptr, tcb->snd.nxt);
- tpriv->stats[RetransSegs]++;
- }
-
tcb->snd.ptr += ssize;
/* Pull up the send pointer so we can accept acks
* expect acknowledges
*/
if(ssize != 0){
- if(tcb->timer.state != TcptimerON)
+ if(tcb->timer.state != TcptimerON){
+ tcb->time = NOW;
+ tcb->timeuna = tcb->snd.una;
tcpgo(tpriv, &tcb->timer);
+ }
/* If round trip timer isn't running, start it.
* measure the longest packet only in case the
* transmission time dominates RTT
*/
+ if(tcb->snd.retransmit == 0)
if(tcb->rtt_timer.state != TcptimerON)
if(ssize == tcb->mss) {
tcpgo(tpriv, &tcb->rtt_timer);
}
tpriv->stats[OutSegs]++;
+ if(tcb->snd.retransmit)
+ tpriv->stats[RetransSegsSent]++;
+ tcb->rcv.ackptr = seg.ack;
+ tcb->rcv.wsnt = tcb->rcv.wptr;
/* put off the next keep alive */
tcpgo(tpriv, &tcb->katimer);
default:
panic("tcpoutput2: version %d", version);
}
- if((msgs%4) == 1){
+ if((msgs%4) == 3){
qunlock(s);
- sched();
qlock(s);
}
}
/*
* the BSD convention (hack?) for keep alives. resend last uchar acked.
*/
-void
+static void
tcpsendka(Conv *s)
{
Tcp seg;
tcb = (Tcpctl*)s->ptcl;
dbp = nil;
+ memset(&seg, 0, sizeof seg);
seg.urg = 0;
seg.source = s->lport;
seg.dest = s->rport;
else
seg.seq = tcb->snd.una-1;
seg.ack = tcb->rcv.nxt;
- tcb->rcv.una = 0;
+ tcb->rcv.ackptr = seg.ack;
+ tcprcvwin(s);
seg.wnd = tcb->rcv.wnd;
if(tcb->state == Finwait2){
seg.flags |= FIN;
/*
* set connection to time out after 12 minutes
*/
-void
+static void
tcpsetkacounter(Tcpctl *tcb)
{
tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start*MSPTICK);
* if we've timed out, close the connection
* otherwise, send a keepalive and restart the timer
*/
-void
+static void
tcpkeepalive(void *v)
{
Tcpctl *tcb;
/*
* start keepalive timer
*/
-char*
+static char*
tcpstartka(Conv *s, char **f, int n)
{
Tcpctl *tcb;
/*
* turn checksums on/off
*/
-char*
+static char*
tcpsetchecksum(Conv *s, char **f, int)
{
Tcpctl *tcb;
return nil;
}
-void
+/*
+ * retransmit (at most) one segment at snd.una.
+ * preserve cwind & snd.ptr
+ */
+static void
tcprxmit(Conv *s)
{
Tcpctl *tcb;
+ Tcppriv *tpriv;
+ ulong tcwind, tptr;
tcb = (Tcpctl*)s->ptcl;
-
tcb->flags |= RETRAN|FORCE;
- tcb->snd.ptr = tcb->snd.una;
- /*
- * We should be halving the slow start threshhold (down to one
- * mss) but leaving it at mss seems to work well enough
- */
- tcb->ssthresh = tcb->mss;
-
- /*
- * pull window down to a single packet
- */
+ tptr = tcb->snd.ptr;
+ tcwind = tcb->cwind;
+ tcb->snd.ptr = tcb->snd.una;
tcb->cwind = tcb->mss;
+ tcb->snd.retransmit = 1;
tcpoutput(s);
+ tcb->snd.retransmit = 0;
+ tcb->cwind = tcwind;
+ tcb->snd.ptr = tptr;
+
+ tpriv = s->p->priv;
+ tpriv->stats[RetransSegs]++;
}
-void
+/*
+ * todo: RFC 4138 F-RTO
+ */
+static void
tcptimeout(void *arg)
{
Conv *s;
localclose(s, Etimedout);
break;
}
- netlog(s->p->f, Logtcprxmt, "timeout rexmit %#lux %d/%lud\n", tcb->snd.una, tcb->timer.start, NOW);
+ netlog(s->p->f, Logtcprxmt, "rxm %d/%d %ldms %lud rto %d %lud %s\n",
+ tcb->srtt, tcb->mdev, NOW-tcb->time,
+ tcb->snd.una-tcb->timeuna, tcb->snd.rto, tcb->snd.ptr,
+ tcpstates[s->state]);
tcpsettimer(tcb);
+ if(tcb->snd.rto == 0)
+ tcpcongestion(tcb);
tcprxmit(s);
+ tcb->snd.ptr = tcb->snd.una;
+ tcb->cwind = tcb->mss;
+ tcb->snd.rto = 1;
tpriv->stats[RetransTimeouts]++;
- tcb->snd.dupacks = 0;
+
+ if(tcb->snd.recovery){
+ tcb->snd.dupacks = 0; /* reno rto */
+ tcb->snd.recovery = 0;
+ tpriv->stats[RecoveryRTO]++;
+ tcb->snd.rxt = tcb->snd.nxt;
+ netlog(s->p->f, Logtcpwin,
+ "rto recovery rxt @%lud\n", tcb->snd.nxt);
+ }
+
+ tcb->abcbytes = 0;
break;
case Time_wait:
localclose(s, nil);
poperror();
}
-int
+static int
inwindow(Tcpctl *tcb, int seq)
{
return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt+tcb->rcv.wnd-1);
/*
* set up state for a received SYN (or SYN ACK) packet
*/
-void
+static void
procsyn(Conv *s, Tcp *seg)
{
Tcpctl *tcb;
tcb->flags |= FORCE;
tcb->rcv.nxt = seg->seq + 1;
+ tcb->rcv.wptr = tcb->rcv.nxt;
+ tcb->rcv.wsnt = 0;
tcb->rcv.urg = tcb->rcv.nxt;
tcb->irs = seg->seq;
tpriv->stats[Mss] = tcb->mss;
}
- /* the congestion window always starts out as a single segment */
tcb->snd.wnd = seg->wnd;
- tcb->cwind = tcb->mss;
+ initialwindow(tcb);
+}
+
+static int
+dumpreseq(Tcpctl *tcb)	/* discard every entry on tcb->reseq; always returns -1 */
+{
+ Reseq *r, *next;
+
+ for(r = tcb->reseq; r != nil; r = next){
+ next = r->next;	/* save the link before r itself is freed */
+ freeblist(r->bp);	/* release the queued data blocks */
+ free(r);
+ }
+ tcb->reseq = nil;
+ tcb->nreseq = 0;	/* list is empty now: reset the entry count ... */
+ tcb->reseqlen = 0;	/* ... and the byte count */
+ return -1;	/* constant result, so a caller can write "return dumpreseq(tcb)" */
}
-int
-addreseq(Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length)
+static void
+logreseq(Fs *f, Reseq *r, ulong n)	/* netlog the gaps in, and the last entry of, list r; n is the sequence number expected at its head */
+{
+ char *s;
+
+ for(; r != nil; r = r->next){
+ s = nil;	/* nil means: nothing to report for this entry */
+ if(r->next == nil && r->seg.seq != n)
+ s = "hole/end";	/* last entry, and it does not start at n */
+ else if(r->next == nil)
+ s = "end";	/* last entry, starting exactly at n */
+ else if(r->seg.seq != n)
+ s = "hole";	/* interior entry that does not start at n */
+ if(s != nil)
+ netlog(f, Logtcp, "%s %lud-%lud (%ld) %#ux\n", s,
+ n, r->seg.seq, r->seg.seq-n, r->seg.flags);	/* expected seq, actual seq, their difference, segment flags */
+ n = r->seg.seq + r->seg.len;	/* next entry is expected to start where this one ends */
+ }
+}
+
+static int
+addreseq(Fs *f, Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length)	/* queue bp on tcb->reseq in sequence order; returns 0, or -1 when a limit is exceeded and the whole queue is discarded */
{
- Reseq *rp, *rp1;
- int i, rqlen, qmax;
+ Reseq *rp, **rr;
+ int qmax;
rp = malloc(sizeof(Reseq));
if(rp == nil){
- freeblist(bp); /* bp always consumed by add_reseq */
+ freeblist(bp); /* bp always consumed by addreseq, even when the entry cannot be allocated */
return 0;
}
rp->bp = bp;
rp->length = length;
- /* Place on reassembly list sorting by starting seq number */
- rp1 = tcb->reseq;
- if(rp1 == nil || seq_lt(seg->seq, rp1->seg.seq)) {
- rp->next = rp1;
- tcb->reseq = rp;
- if(rp->next != nil)
- tpriv->stats[OutOfOrder]++;
- return 0;
- }
+ tcb->reseqlen += length;	/* running byte total of the queue */
+ tcb->nreseq++;	/* running entry count of the queue */
- rqlen = 0;
- for(i = 0;; i++) {
- rqlen += rp1->length;
- if(rp1->next == nil || seq_lt(seg->seq, rp1->next->seg.seq)) {
- rp->next = rp1->next;
- rp1->next = rp;
+ /* Place on reassembly list sorting by starting seq number */
+ for(rr = &tcb->reseq;; rr = &(*rr)->next)	/* rr is the link that would point at the new entry */
+ if(*rr == nil || seq_lt(seg->seq, (*rr)->seg.seq)){	/* end of list, or seq_lt holds for seg->seq against *rr */
+ rp->next = *rr;
+ *rr = rp;
+ tpriv->stats[Resequenced]++;	/* bumped for every segment queued */
if(rp->next != nil)
tpriv->stats[OutOfOrder]++;
break;
}
- rp1 = rp1->next;
- }
- qmax = QMAX<<tcb->rcv.scale;
- if(rqlen > qmax){
- print("resequence queue > window: %d > %d\n", rqlen, qmax);
- i = 0;
- for(rp1 = tcb->reseq; rp1 != nil; rp1 = rp1->next){
- print("%#lux %#lux %#ux\n", rp1->seg.seq,
- rp1->seg.ack, rp1->seg.flags);
- if(i++ > 10){
- print("...\n");
- break;
- }
- }
-
- /*
- * delete entire reassembly queue; wait for retransmit.
- * - should we be smarter and only delete the tail?
- */
- for(rp = tcb->reseq; rp != nil; rp = rp1){
- rp1 = rp->next;
- freeblist(rp->bp);
- free(rp);
- }
- tcb->reseq = nil;
- return -1;
+ qmax = tcb->window;	/* byte limit: queued data may not exceed tcb->window */
+ if(tcb->reseqlen > qmax){
+ netlog(f, Logtcp, "tcp: reseq: queue > window: %d > %d; %d packets\n", tcb->reseqlen, qmax, tcb->nreseq);
+ logreseq(f, tcb->reseq, tcb->rcv.nxt);	/* log the queue layout relative to rcv.nxt before it is freed */
+ tpriv->stats[ReseqBytelim]++;
+ return dumpreseq(tcb);	/* frees the whole queue, including the entry just inserted; yields -1 */
}
+ qmax = tcb->window / tcb->mss; /* packet limit; ~190 for qscale==2, 390 for qscale=3 */
+ if(tcb->nreseq > qmax){
+ netlog(f, Logtcp, "resequence queue > packets: %d %d; %d bytes\n", tcb->nreseq, qmax, tcb->reseqlen);
+ logreseq(f, tcb->reseq, tcb->rcv.nxt);
+ tpriv->stats[ReseqPktlim]++;
+ return dumpreseq(tcb);	/* same wholesale discard as for the byte limit */
+ }
+
return 0;
}
-void
+static void
getreseq(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
{
Reseq *rp;
*bp = rp->bp;
*length = rp->length;
+ tcb->nreseq--;
+ tcb->reseqlen -= rp->length;
+
free(rp);
}
-int
+static int
tcptrim(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
{
ushort len;
return 0;
}
-void
+static void
tcpadvise(Proto *tcp, Block *bp, char *msg)
{
Tcp4hdr *h4;
if(tcb->state != Closed)
if(ipcmp(s->raddr, dest) == 0)
if(ipcmp(s->laddr, source) == 0){
+ if(s->ignoreadvice)
+ break;
qlock(s);
qunlock(tcp);
switch(tcb->state){
}
/* called with c qlocked */
-char*
+static char*
tcpctl(Conv* c, char** f, int n)
{
if(n == 1 && strcmp(f[0], "hangup") == 0)
return "unknown control request";
}
-int
+static int
tcpstats(Proto *tcp, char *buf, int len)
{
Tcppriv *priv;
* of questionable validity so we try to use them only when we're
* up against the wall.
*/
-int
+static int
tcpgc(Proto *tcp)
{
Conv *c, **pp, **ep;
switch(tcb->state){
case Syn_received:
if(NOW - tcb->time > 5000){
- localclose(c, "timed out");
+ localclose(c, Etimedout);
n++;
}
break;
case Finwait2:
if(NOW - tcb->time > 5*60*1000){
- localclose(c, "timed out");
+ localclose(c, Etimedout);
n++;
}
break;
return n;
}
-void
+static void
tcpsettimer(Tcpctl *tcb)
{
int x;
x = backoff(tcb->backoff) *
(tcb->mdev + (tcb->srtt>>LOGAGAIN) + MSPTICK) / MSPTICK;
- /* bounded twixt 1/2 and 64 seconds */
- if(x < 500/MSPTICK)
- x = 500/MSPTICK;
+ /* bounded twixt 0.3 and 64 seconds */
+ if(x < 300/MSPTICK)
+ x = 300/MSPTICK;
else if(x > (64000/MSPTICK))
x = 64000/MSPTICK;
tcb->timer.start = x;
Fsproto(fs, tcp);
}
-void
+static void	/* tcpsetscale: record the window scales on tcb and size both of s's queues from the capped queue scale */
tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale)
{
- if(rcvscale){
- tcb->rcv.scale = rcvscale & 0xff;
- tcb->snd.scale = sndscale & 0xff;
- tcb->window = QMAX<<tcb->snd.scale;
- qsetlimit(s->rq, tcb->window);
- } else {
- tcb->rcv.scale = 0;
- tcb->snd.scale = 0;
- tcb->window = QMAX;
- qsetlimit(s->rq, tcb->window);
- }
+ /*
+ * guess at reasonable queue sizes. there's no current way
+ * to know how many nic receive buffers we can safely tie up in the
+ * tcp stack, and we don't adjust our queues to maximize throughput
+ * and minimize bufferbloat. n.b. the offer (rcvscale) needs to be
+ * respected, but we still control our own buffer commitment by
+ * keeping a separate qscale.
+ */
+ tcb->rcv.scale = rcvscale & 0xff;
+ tcb->snd.scale = sndscale & 0xff;
+ tcb->qscale = rcvscale & 0xff;
+ if(rcvscale > Maxqscale)	/* compares the unmasked argument */
+ tcb->qscale = Maxqscale;	/* cap the queue scale; rcv.scale keeps the offered value */
+
+ if(rcvscale != tcb->rcv.scale)	/* NOTE(review): rcv.scale was just set to rcvscale&0xff, so this is true only when rcvscale > 0xff -- confirm intent */
+ netlog(s->p->f, Logtcp, "tcpsetscale: window %lud qlen %d >> window %ud lport %d\n",
+ tcb->window, qlen(s->rq), QMAX<<tcb->qscale, s->lport);	/* logs the old window, current rq length, and the new window */
+ tcb->window = QMAX<<tcb->qscale;
+ tcb->ssthresh = tcb->window;	/* slow start threshold starts at the full window */
+
+ /*
+ * it's important to set wq large enough to cover the full
+ * bandwidth-delay product. it's possible to be in loss
+ * recovery with a big window, and we need to keep sending
+ * into the inflated window. the difference can be huge
+ * for even modest (70ms) ping times.
+ */
+ qsetlimit(s->rq, QMAX<<tcb->qscale);	/* same value as tcb->window */
+ qsetlimit(s->wq, QMAX<<tcb->qscale);
+ tcprcvwin(s);	/* defined elsewhere; presumably recomputes the advertised receive window -- confirm */
}