these changes are a first step in preparing tcp to handle modest (1-2MB) bandwidth-delay products. the strategy was to completely implement NewReno. the testing network was a 7/35/70ms by 100Mbit wan emulator with 0/.05/.1% loss. here are the performance comparisons from the current nix tcp "old" to the new nix tcp. the smallest improvement was 80%, the largest was 11x. loss% rtt old new 0.10 7 4.40 7.85 0.10 35 0.88 1.79 0.10 70 0.47 0.84 0.05 7 4.80 9.38 0.05 35 1.00 2.02 0.05 70 0.52 1.77 0.01 7 5.33 11.87 0.01 35 1.14 10.97 0.01 70 0.54 4.75 0.00 7 4.49 11.92 0.00 35 1.04 11.35 0.00 70 0.58 10.56 since the diff is not very easy to read, i wrote a small paper detailing the changes http://www.quanstro.net/plan9/tcp/tcp.pdf - erik Reference: /n/patches.lsub.org/patch/tcpbdp Date: Thu Nov 29 23:26:58 CET 2012 Signed-off-by: quanstro@quanstro.net --- /sys/src/nix/ip/tcp.c Thu Sep 13 19:07:05 2012 +++ /sys/src/nix/ip/tcp.c Thu Nov 29 22:16:59 2012 @@ -80,7 +80,14 @@ Maxlimbo = 1000, /* maximum procs waiting for response to SYN ACK */ NLHT = 256, /* hash table size, must be a power of 2 */ LHTMASK = NLHT-1, - HaveWS = 1<<8, + + /* + * window is 64kb · 2ⁿ + * these factors determine the ultimate bandwidth-delay product. + * 64kb · 2⁵ = 2mb, or 2x overkill for 100mbps · 70ms. 
+ */ + Maxqscale = 4, /* maximum queuing scale */ + Defadvscale = 4, /* default advertisement */ }; /* Must correspond to the enumeration above */ @@ -168,8 +175,9 @@ ulong seq; ulong ack; uchar flags; + uchar update; ushort ws; /* window scale option */ - ulong wnd; + ulong wnd; /* prescaled window*/ ushort urg; ushort mss; /* max segment size option (if not zero) */ ushort len; /* size of data */ @@ -207,20 +215,25 @@ uint scale; /* how much to right shift window in xmitted packets */ /* to implement tahoe and reno TCP */ ulong dupacks; /* number of duplicate acks rcvd */ + ulong partialack; int recovery; /* loss recovery flag */ - ulong rxt; /* right window marker for recovery */ + int retransmit; /* retransmit 1 packet @ una flag */ + int rto; + ulong rxt; /* right window marker for recovery "recover" rfc3782 */ } snd; struct { ulong nxt; /* Receive pointer to next uchar slot */ ulong wnd; /* Receive window incoming */ + ulong wsnt; /* Last wptr sent. important to track for large bdp */ + ulong wptr; ulong urg; /* Urgent pointer */ + ulong ackptr; /* last acked sequence */ int blocked; - int una; /* unacked data segs, for delayed acks */ - uint scale; /* how much to left shift window in rcved packets */ + uint scale; /* how much to left shift window in rcv'd packets */ } rcv; ulong iss; /* Initial sequence number */ ulong cwind; /* Congestion window */ - ulong abcbytes; /* appropriate byte counting rfc 3485 */ + ulong abcbytes; /* appropriate byte counting rfc 3465 */ uint scale; /* desired snd.scale */ ulong ssthresh; /* Slow start threshold */ int resent; /* Bytes just resent */ @@ -245,6 +258,7 @@ int kacounter; /* count down for keep alive */ uint sndsyntime; /* time syn sent */ ulong time; /* time Finwait2 or Syn_received was sent */ + ulong timeuna; /* snd.una when time was set */ int nochecksum; /* non-zero means don't send checksums */ int flgcnt; /* number of flags in the sequence (FIN,SEQ) */ @@ -287,7 +301,6 @@ }; int tcp_irtt = DEF_RTT; /* Initial 
guess at round trip time */ -ushort tcp_mss = DEF_MSS; /* Maximum segment size to be sent */ enum { /* MIB stats */ @@ -300,6 +313,7 @@ InSegs, OutSegs, RetransSegs, + RetransSegsSent, RetransTimeouts, InErrs, OutRsts, @@ -308,15 +322,24 @@ CsumErrs, HlenErrs, LenErrs, - OutOfOrder, Resequenced, + OutOfOrder, ReseqBytelim, ReseqPktlim, + Delayack, + Wopenack, + + Recovery, + RecoveryDone, + RecoveryRTO, + RecoveryNoSeq, + RecoveryCwind, + RecoveryPA, Nstats }; -static char *statnames[] = +static char *statnames[Nstats] = { [MaxConn] "MaxConn", [Mss] "MaxSegment", @@ -327,6 +350,7 @@ [InSegs] "InSegs", [OutSegs] "OutSegs", [RetransSegs] "RetransSegs", +[RetransSegsSent] "RetransSegsSent", [RetransTimeouts] "RetransTimeouts", [InErrs] "InErrs", [OutRsts] "OutRsts", @@ -337,6 +361,16 @@ [Resequenced] "Resequenced", [ReseqBytelim] "ReseqBytelim", [ReseqPktlim] "ReseqPktlim", +[Delayack] "Delayack", +[Wopenack] "Wopenack", + +[Recovery] "Recovery", +[RecoveryDone] "RecoveryDone", +[RecoveryRTO] "RecoveryRTO", + +[RecoveryNoSeq] "RecoveryNoSeq", +[RecoveryCwind] "RecoveryCwind", +[RecoveryPA] "RecoveryPA", }; typedef struct Tcppriv Tcppriv; @@ -457,13 +491,14 @@ s = (Tcpctl*)(c->ptcl); return snprint(state, n, - "%s qin %d qout %d rq %d.%d srtt %d mdev %d cwin %lud swin %lud>>%d rwin %lud>>%d timer.start %d timer.count %d rerecv %d katimer.start %d katimer.count %d\n", + "%s qin %d qout %d rq %d.%d srtt %d mdev %d sst %lud cwin %lud swin %lud>>%d rwin %lud>>%d qscale %d timer.start %d timer.count %d rerecv %d katimer.start %d katimer.count %d\n", tcpstates[s->state], c->rq ? qlen(c->rq) : 0, c->wq ? 
qlen(c->wq) : 0, s->nreseq, s->reseqlen, - s->srtt, s->mdev, + s->srtt, s->mdev, s->ssthresh, s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd, s->snd.scale, + s->qscale, s->timer.start, s->timer.count, s->rerecv, s->katimer.start, s->katimer.count); } @@ -573,6 +608,8 @@ poperror(); } +static int seq_lt(ulong, ulong); + static void tcprcvwin(Conv *s) /* Call with tcb locked */ { @@ -583,6 +620,9 @@ w = tcb->window - qlen(s->rq); if(w < 0) w = 0; + /* RFC 1122 § 4.2.2.17 do not move right edge of window left */ + if(seq_lt(tcb->rcv.nxt + w, tcb->rcv.wptr)) + w = tcb->rcv.wptr - tcb->rcv.nxt; if(w != tcb->rcv.wnd) if(w>>tcb->rcv.scale == 0 || tcb->window > 4*tcb->mss && w < tcb->mss/4){ tcb->rcv.blocked = 1; @@ -590,6 +630,7 @@ tcb->window, qlen(s->rq), tcb->rcv.scale, s->lport); } tcb->rcv.wnd = w; + tcb->rcv.wptr = tcb->rcv.nxt + w; } static void @@ -627,13 +668,32 @@ tcb->ssthresh = 2*tcb->mss; } +enum { + L = 2, /* aggressive slow start; legal values ∈ (1.0, 2.0) */ +}; + static void -tcpabcincr(Tcpctl *tcb, ulong acked, ulong limit) +tcpabcincr(Tcpctl *tcb, uint acked) { + uint limit; + tcb->abcbytes += acked; - if(tcb->abcbytes >= limit){ - tcb->cwind += tcb->mss; - tcb->abcbytes %= limit; + if(tcb->cwind < tcb->ssthresh){ + /* slow start */ + if(tcb->snd.rto) + limit = 1*tcb->mss; + else + limit = L*tcb->mss; + tcb->cwind += MIN(tcb->abcbytes, limit); + tcb->abcbytes = 0; + } + else{ + tcb->snd.rto = 0; + /* avoidance */ + if(tcb->abcbytes >= tcb->cwind){ + tcb->abcbytes -= tcb->cwind; + tcb->cwind += tcb->mss; + } } } @@ -641,7 +701,7 @@ tcpcreate(Conv *c) { c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c); - c->wq = qopen((3*QMAX)/2, Qkick, tcpkick, c); + c->wq = qopen(QMAX, Qkick, tcpkick, c); } static void @@ -794,25 +854,19 @@ case V4: mtu = DEF_MSS; if(ifc != nil) - mtu = ifc->maxtu - ifc->medium->hsize - (TCP4_PKT + TCP4_HDRSIZE); + mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE); break; case V6: mtu = DEF_MSS6; if(ifc != nil) - mtu = 
ifc->maxtu - ifc->medium->hsize - (TCP6_PKT + TCP6_HDRSIZE); + mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE); break; } - if(ifc != nil){ - if(ifc->mbps > 1000) - *scale = HaveWS | 4; - else if(ifc->mbps > 100) - *scale = HaveWS | 3; - else if(ifc->mbps > 10) - *scale = HaveWS | 1; - else - *scale = HaveWS | 0; - } else - *scale = HaveWS | 0; + /* + * set the ws. it doesn't commit us to anything. + * ws is the ultimate limit to the bandwidth-delay product. + */ + *scale = Defadvscale; return mtu; } @@ -902,7 +956,7 @@ if(tpriv->ackprocstarted == 0){ qlock(&tpriv->apl); if(tpriv->ackprocstarted == 0){ - sprint(kpname, "#I%dtcpack", s->p->f->dev); + snprint(kpname, sizeof(kpname), "#I%dtcpack", s->p->f->dev); kproc(kpname, tcpackproc, s->p); tpriv->ackprocstarted = 1; } @@ -932,24 +986,24 @@ } static char* -tcpflag(ushort flag) +tcpflag(char *buf, char *e, ushort flag) { - static char buf[128]; + char *p; - sprint(buf, "%d", flag>>10); /* Head len */ + p = seprint(buf, e, "%d", flag>>10); /* Head len */ if(flag & URG) - strcat(buf, " URG"); + p = seprint(p, e, " URG"); if(flag & ACK) - strcat(buf, " ACK"); + p = seprint(p, e, " ACK"); if(flag & PSH) - strcat(buf, " PSH"); + p = seprint(p, e, " PSH"); if(flag & RST) - strcat(buf, " RST"); + p = seprint(p, e, " RST"); if(flag & SYN) - strcat(buf, " SYN"); + p = seprint(p, e, " SYN"); if(flag & FIN) - strcat(buf, " FIN"); - + p = seprint(p, e, " FIN"); + USED(p); return buf; } @@ -1141,6 +1195,7 @@ tcph->urg = nhgets(h->tcpurg); tcph->mss = 0; tcph->ws = 0; + tcph->update = 0; tcph->len = nhgets(h->ploadlen) - hdrlen; *bpp = pullupblock(*bpp, hdrlen+TCP6_PKT); @@ -1165,7 +1220,7 @@ break; case WSOPT: if(optlen == WS_LENGTH && *(optr+2) <= 14) - tcph->ws = HaveWS | *(optr+2); + tcph->ws = *(optr+2); break; } n -= optlen; @@ -1204,6 +1259,7 @@ tcph->urg = nhgets(h->tcpurg); tcph->mss = 0; tcph->ws = 0; + tcph->update = 0; tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT); *bpp = pullupblock(*bpp, 
hdrlen+TCP4_PKT); @@ -1228,7 +1284,7 @@ break; case WSOPT: if(optlen == WS_LENGTH && *(optr+2) <= 14) - tcph->ws = HaveWS | *(optr+2); + tcph->ws = *(optr+2); break; } n -= optlen; @@ -1250,6 +1306,7 @@ tcb->rttseq = tcb->iss; tcb->snd.wl2 = tcb->iss; tcb->snd.una = tcb->iss; + tcb->snd.rxt = tcb->iss; tcb->snd.ptr = tcb->rttseq; tcb->snd.nxt = tcb->rttseq; tcb->flgcnt++; @@ -1364,7 +1421,7 @@ memset(&seg, 0, sizeof seg); seg.flags = RST | ACK; seg.ack = tcb->rcv.nxt; - tcb->rcv.una = 0; + tcb->rcv.ackptr = seg.ack; seg.seq = tcb->snd.ptr; seg.wnd = 0; seg.urg = 0; @@ -1613,6 +1670,18 @@ } } +static void +initialwindow(Tcpctl *tcb) +{ + /* RFC 3390 initial window */ + if(tcb->mss < 1095) + tcb->cwind = 4*tcb->mss; + else if(tcb->mss < 2190) + tcb->cwind = 4380; + else + tcb->cwind = 2*tcb->mss; +} + /* * come here when we finally get an ACK to our SYN-ACK. * lookup call in limbo. if found, create a new conversation @@ -1684,6 +1753,8 @@ tcb->irs = lp->irs; tcb->rcv.nxt = tcb->irs+1; + tcb->rcv.wptr = tcb->rcv.nxt; + tcb->rcv.wsnt = 0; tcb->rcv.urg = tcb->rcv.nxt; tcb->iss = lp->iss; @@ -1692,6 +1763,7 @@ tcb->snd.una = tcb->iss+1; tcb->snd.ptr = tcb->iss+1; tcb->snd.nxt = tcb->iss+1; + tcb->snd.rxt = tcb->iss+1; tcb->flgcnt = 0; tcb->flags |= SYNACK; @@ -1704,9 +1776,9 @@ /* window scaling */ tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale); - /* the congestion window always starts out as a single segment */ + /* congestion window */ tcb->snd.wnd = segp->wnd; - tcb->cwind = tcb->mss; + initialwindow(tcb); /* set initial round trip time */ tcb->sndsyntime = lp->lastsend+lp->rexmits*SYNACK_RXTIMER; @@ -1813,15 +1885,13 @@ ulong acked; Tcppriv *tpriv; + if(seg->update) + return; + seg->update = 1; + tpriv = s->p->priv; tcb = (Tcpctl*)s->ptcl; - /* if everything has been acked, force output(?) 
*/ - if(seq_gt(seg->ack, tcb->snd.nxt)) { - tcb->flags |= FORCE; - return; - } - /* catch zero-window updates, update window & recover */ if(tcb->snd.wnd == 0 && seg->wnd > 0) if(seq_lt(seg->ack, tcb->snd.ptr)){ @@ -1831,25 +1901,45 @@ goto recovery; } + /* newreno fast retransmit */ if(seg->ack == tcb->snd.una) if(tcb->snd.una != tcb->snd.nxt) - if(seg->len == 0) - if(seg->wnd == tcb->snd.wnd) - if(++tcb->snd.dupacks == TCPREXMTTHRESH){ + if(++tcb->snd.dupacks == 3){ recovery: - tcb->snd.recovery = 1; - tcb->snd.rxt = tcb->snd.nxt; - tcpcongestion(tcb); - tcprxmit(s); - tcb->cwind = tcb->ssthresh; - } else + if(tcb->snd.recovery){ + tpriv->stats[RecoveryCwind]++; + tcb->cwind += tcb->mss; + }else if(seq_le(tcb->snd.rxt, seg->ack)){ + tpriv->stats[Recovery]++; + tcb->abcbytes = 0; + tcb->snd.recovery = 1; + tcb->snd.partialack = 0; + tcb->snd.rxt = tcb->snd.nxt; + tcpcongestion(tcb); + tcb->cwind = tcb->ssthresh + 3*tcb->mss; + netlog(s->p->f, Logtcpwin, "recovery inflate %ld ss %ld @%lud\n", + tcb->cwind, tcb->ssthresh, tcb->snd.rxt); + tcprxmit(s); + }else{ + tpriv->stats[RecoveryNoSeq]++; + netlog(s->p->f, Logtcpwin, "!recov %lud not ≤ %lud %ld\n", + tcb->snd.rxt, seg->ack, tcb->snd.rxt - seg->ack); + /* do not enter fast retransmit */ + /* do not change ssthresh */ + } + }else if(tcb->snd.recovery){ + tpriv->stats[RecoveryCwind]++; tcb->cwind += tcb->mss; + } /* * update window */ if(seq_gt(seg->ack, tcb->snd.wl2) || (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)){ + /* clear dupack if we advance wl2 */ + if(tcb->snd.wl2 != seg->ack) + tcb->snd.dupacks = 0; tcb->snd.wnd = seg->wnd; tcb->snd.wl2 = seg->ack; } @@ -1880,24 +1970,33 @@ */ if(tcb->snd.recovery){ if(seq_ge(seg->ack, tcb->snd.rxt)){ - /* recovery finished */ + /* recovery finished; deflate window */ + tpriv->stats[RecoveryDone]++; tcb->snd.dupacks = 0; tcb->snd.recovery = 0; - tcb->cwind = (tcb->snd.nxt - seg->ack) + tcb->mss; + tcb->cwind = (tcb->snd.nxt - tcb->snd.una) + tcb->mss; 
if(tcb->ssthresh < tcb->cwind) tcb->cwind = tcb->ssthresh; + netlog(s->p->f, Logtcpwin, "recovery deflate %ld %ld\n", + tcb->cwind, tcb->ssthresh); } else { - /* partial ack */ - tcb->cwind -= acked; - tcb->cwind += tcb->mss; + /* partial ack; we lost more than one segment */ + tpriv->stats[RecoveryPA]++; + if(tcb->cwind > acked) + tcb->cwind -= acked; + else{ + netlog(s->p->f, Logtcpwin, "partial ack neg\n"); + tcb->cwind = tcb->mss; + } + netlog(s->p->f, Logtcpwin, "partial ack %ld left %ld cwind %ld\n", + acked, tcb->snd.rxt - seg->ack, tcb->cwind); + + if(acked >= tcb->mss) + tcb->cwind += tcb->mss; + tcb->snd.partialack++; } - } else { - tcb->snd.dupacks = 0; - if(tcb->cwind < tcb->ssthresh) - tcpabcincr(tcb, acked, 2*tcb->mss); /* slow start */ - else - tcpabcincr(tcb, acked, tcb->cwind); /* congestion dance */ - } + } else + tcpabcincr(tcb, acked); /* Adjust the timers according to the round trip time */ /* todo: fix sloppy treatment of overflow cases here. */ @@ -1937,13 +2036,17 @@ if(tcb->snd.recovery) tcprxmit(s); - /*tcplimitmaxburst(tcb);*/ - if(seq_gt(seg->ack, tcb->snd.urg)) tcb->snd.urg = seg->ack; - if(tcb->snd.una != tcb->snd.nxt) - tcpgo(tpriv, &tcb->timer); + if(tcb->snd.una != tcb->snd.nxt){ + /* “impatient” variant */ + if(!tcb->snd.recovery || tcb->snd.partialack == 1){ + tcb->time = NOW; + tcb->timeuna = tcb->snd.una; + tcpgo(tpriv, &tcb->timer); + } + } else tcphalt(tpriv, &tcb->timer); @@ -2186,10 +2289,12 @@ } /* Cut the data to fit the receive window */ + tcprcvwin(s); if(tcptrim(tcb, &seg, &bp, &length) == -1) { - netlog(f, Logtcp, "tcptrim, not accept, seq %lud-%lud win %lud-%lud from %I\n", + if(seg.seq+1 != tcb->rcv.nxt || length != 1) + netlog(f, Logtcp, "tcp: trim: !inwind: seq %lud-%lud win %lud-%lud l %d from %I\n", seg.seq, seg.seq + length - 1, - tcb->rcv.nxt, tcb->rcv.nxt + tcb->rcv.wnd-1, s->raddr); + tcb->rcv.nxt, tcb->rcv.nxt + tcb->rcv.wnd-1, length, s->raddr); update(s, &seg); if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state 
== Closing) { tcphalt(tpriv, &tcb->rtt_timer); @@ -2222,10 +2327,13 @@ update(s, &seg); if(addreseq(f, tcb, tpriv, &seg, bp, length) < 0) print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr, s->lport); - tcb->flags |= FORCE; /* force duplicate ack */ + tcb->flags |= FORCE; /* force duplicate ack; RFC 5681 §3.2 */ goto output; } + if(tcb->nreseq > 0) + tcb->flags |= FORCE; /* filled hole in sequence space; RFC 5681 §3.2 */ + /* * keep looping till we've processed this packet plus any * adjacent packets in the resequence queue @@ -2329,20 +2437,6 @@ panic("tcp packblock"); qpassnolim(s->rq, bp); bp = nil; - - /* - * Force an ack every 2 data messages. This is - * a hack for rob to make his home system run - * faster. - * - * this also keeps the standard TCP congestion - * control working since it needs an ack every - * 2 max segs worth. This is not quite that, - * but under a real stream is equivalent since - * every packet has a max seg in it. - */ - if(++(tcb->rcv.una) >= 2) - tcb->flags |= FORCE; } tcb->rcv.nxt += length; @@ -2420,8 +2514,11 @@ getreseq(tcb, &seg, &bp, &length); - if(tcptrim(tcb, &seg, &bp, &length) == 0) + tcprcvwin(s); + if(tcptrim(tcb, &seg, &bp, &length) == 0){ + tcb->flags |= FORCE; break; + } } } output: @@ -2458,9 +2555,25 @@ tpriv = s->p->priv; version = s->ipversion; - for(msgs = 0; msgs < 100; msgs++) { - tcb = (Tcpctl*)s->ptcl; + tcb = (Tcpctl*)s->ptcl; + + /* force ack every 2*mss */ + if((tcb->flags & FORCE) == 0) + if(tcb->rcv.nxt - tcb->rcv.ackptr >= 2*tcb->mss){ + tpriv->stats[Delayack]++; + tcb->flags |= FORCE; + } + + /* force ack if window opening */ + if((tcb->flags & FORCE) == 0){ + tcprcvwin(s); + if((int)(tcb->rcv.wptr - tcb->rcv.wsnt) >= 2*tcb->mss){ + tpriv->stats[Wopenack]++; + tcb->flags |= FORCE; + } + } + for(msgs = 0; msgs < 100; msgs++) { switch(tcb->state) { case Listen: case Closed: @@ -2526,12 +2639,12 @@ /* By default we will generate an ack */ tcphalt(tpriv, &tcb->acktimer); - tcb->rcv.una = 0; 
seg.source = s->lport; seg.dest = s->rport; seg.flags = ACK; seg.mss = 0; seg.ws = 0; + seg.update = 0; switch(tcb->state){ case Syn_sent: seg.flags = 0; @@ -2609,13 +2722,17 @@ * expect acknowledges */ if(ssize != 0){ - if(tcb->timer.state != TcptimerON) + if(tcb->timer.state != TcptimerON){ + tcb->time = NOW; + tcb->timeuna = tcb->snd.una; tcpgo(tpriv, &tcb->timer); + } /* If round trip timer isn't running, start it. * measure the longest packet only in case the * transmission time dominates RTT */ + if(tcb->snd.retransmit == 0) if(tcb->rtt_timer.state != TcptimerON) if(ssize == tcb->mss) { tcpgo(tpriv, &tcb->rtt_timer); @@ -2624,6 +2741,10 @@ } tpriv->stats[OutSegs]++; + if(tcb->snd.retransmit) + tpriv->stats[RetransSegsSent]++; + tcb->rcv.ackptr = seg.ack; + tcb->rcv.wsnt = tcb->rcv.wptr; /* put off the next keep alive */ tcpgo(tpriv, &tcb->katimer); @@ -2644,9 +2765,8 @@ default: panic("tcpoutput2: version %d", version); } - if((msgs%4) == 1){ + if((msgs%4) == 3){ qunlock(s); - // sched(); qlock(s); } } @@ -2677,7 +2797,8 @@ else seg.seq = tcb->snd.una-1; seg.ack = tcb->rcv.nxt; - tcb->rcv.una = 0; + tcb->rcv.ackptr = seg.ack; + tcprcvwin(s); seg.wnd = tcb->rcv.wnd; if(tcb->state == Finwait2){ seg.flags |= FIN; @@ -2803,7 +2924,9 @@ tcwind = tcb->cwind; tcb->snd.ptr = tcb->snd.una; tcb->cwind = tcb->mss; + tcb->snd.retransmit = 1; tcpoutput(s); + tcb->snd.retransmit = 0; tcb->cwind = tcwind; tcb->snd.ptr = tptr; @@ -2843,16 +2966,29 @@ localclose(s, Etimedout); break; } - netlog(s->p->f, Logtcprxmt, "timeout rexmit %#lux %d/%lud\n", - tcb->snd.una, tcb->timer.start, NOW); + netlog(s->p->f, Logtcprxmt, "rxm %d/%d %ldms %lud rto %d %lud %s\n", + tcb->srtt, tcb->mdev, NOW-tcb->time, + tcb->snd.una-tcb->timeuna, tcb->snd.rto, tcb->snd.ptr, + tcpstates[s->state]); tcpsettimer(tcb); - tcpcongestion(tcb); + if(tcb->snd.rto == 0) + tcpcongestion(tcb); tcprxmit(s); tcb->snd.ptr = tcb->snd.una; tcb->cwind = tcb->mss; + tcb->snd.rto = 1; tpriv->stats[RetransTimeouts]++; 
- tcb->snd.dupacks = 0; - tcb->snd.recovery = 0; + + if(tcb->snd.recovery){ + tcb->snd.dupacks = 0; /* reno rto */ + tcb->snd.recovery = 0; + tpriv->stats[RecoveryRTO]++; + tcb->snd.rxt = tcb->snd.nxt; + netlog(s->p->f, Logtcpwin, + "rto recovery rxt @%lud\n", tcb->snd.nxt); + } + + tcb->abcbytes = 0; break; case Time_wait: localclose(s, nil); @@ -2883,6 +3019,8 @@ tcb->flags |= FORCE; tcb->rcv.nxt = seg->seq + 1; + tcb->rcv.wptr = tcb->rcv.nxt; + tcb->rcv.wsnt = 0; tcb->rcv.urg = tcb->rcv.nxt; tcb->irs = seg->seq; @@ -2894,14 +3032,7 @@ } tcb->snd.wnd = seg->wnd; - - /* RFC 3390 initial window */ - if(tcb->mss < 1095) - tcb->cwind = 4*tcb->mss; - else if(tcb->mss < 2190) - tcb->cwind = 4380; - else - tcb->cwind = 2*tcb->mss; + initialwindow(tcb); } static int @@ -2921,11 +3052,22 @@ } static void -logreseq(Fs *f, Reseq *r) +logreseq(Fs *f, Reseq *r, ulong n) { + char *s; + for(; r != nil; r = r->next){ - netlog(f, Logtcp, "%#lud %ud %#lud %#ux\n", r->seg.seq, r->seg.len, - r->seg.ack, r->seg.flags); + s = nil; + if(r->next == nil && r->seg.seq != n) + s = "hole/end"; + else if(r->next == nil) + s = "end"; + else if(r->seg.seq != n) + s = "hole"; + if(s != nil) + netlog(f, Logtcp, "%s %lud-%lud (%ld) %#ux\n", s, + n, r->seg.seq, r->seg.seq-n, r->seg.flags); + n = r->seg.seq + r->seg.len; } } @@ -2959,17 +3101,17 @@ break; } - qmax = QMAX<<tcb->qscale; + qmax = tcb->window; if(tcb->reseqlen > qmax){ netlog(f, Logtcp, "tcp: reseq: queue > window: %d > %d; %d packets\n", tcb->reseqlen, qmax, tcb->nreseq); - logreseq(f, tcb->reseq); + logreseq(f, tcb->reseq, tcb->rcv.nxt); tpriv->stats[ReseqBytelim]++; return dumpreseq(tcb); } - qmax = 15*(tcb->qscale + 1); + qmax = tcb->window / tcb->mss; /* ~190 for qscale==2, 390 for qscale=3 */ if(tcb->nreseq > qmax){ netlog(f, Logtcp, "resequence queue > packets: %d %d; %d bytes\n", tcb->nreseq, qmax, tcb->reseqlen); - logreseq(f, tcb->reseq); + logreseq(f, tcb->reseq, tcb->rcv.nxt); tpriv->stats[ReseqPktlim]++; return dumpreseq(tcb); } 
@@ -3218,9 +3360,9 @@ x = backoff(tcb->backoff) * (tcb->mdev + (tcb->srtt>>LOGAGAIN) + MSPTICK) / MSPTICK; - /* bounded twixt 1/2 and 64 seconds */ - if(x < 500/MSPTICK) - x = 500/MSPTICK; + /* bounded twixt 0.3 and 64 seconds */ + if(x < 300/MSPTICK) + x = 300/MSPTICK; else if(x > (64000/MSPTICK)) x = 64000/MSPTICK; tcb->timer.start = x; @@ -3254,10 +3396,6 @@ Fsproto(fs, tcp); } -enum { - Maxqscale = 3, /* ½ mb */ -}; - static void tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale) { @@ -3266,12 +3404,12 @@ * to know how many nic receive buffers we can safely tie up in the * tcp stack, and we don't adjust our queues to maximize throughput * and minimize bufferbloat. n.b. the offer (rcvscale) needs to be - * respected, but we still control our own buffer commentment by + * respected, but we still control our own buffer commitment by * keeping a seperate qscale. */ tcb->rcv.scale = rcvscale & 0xff; tcb->snd.scale = sndscale & 0xff; - tcb->qscale = rcvscale; + tcb->qscale = rcvscale & 0xff; if(rcvscale > Maxqscale) tcb->qscale = Maxqscale; @@ -3280,6 +3418,15 @@ tcb->window, qlen(s->rq), QMAX<<tcb->qscale, s->lport); tcb->window = QMAX<<tcb->qscale; tcb->ssthresh = tcb->window; + + /* + * it's important to set wq large enough to cover the full + * bandwidth-delay product. it's possible to be in loss + * recovery with a big window, and we need to keep sending + * into the inflated window. the difference can be huge + * for even modest (70ms) ping times. + */ qsetlimit(s->rq, QMAX<<tcb->qscale); + qsetlimit(s->wq, QMAX<<tcb->qscale); tcprcvwin(s); }