remove misguided comment Reference: /n/patches.lsub.org/patch/tcpwindow Date: Tue Sep 11 17:35:54 CES 2012 Signed-off-by: quanstro@quanstro.net --- /sys/src/nix/ip/tcp.c Fri Jul 20 11:32:38 2012 +++ /sys/src/nix/ip/tcp.c Tue Sep 11 16:39:36 2012 @@ -168,7 +168,7 @@ ulong seq; ulong ack; uchar flags; - ushort ws; /* window scale option (if not zero) */ + ushort ws; /* window scale option */ ulong wnd; ushort urg; ushort mss; /* max segment size option (if not zero) */ @@ -204,7 +204,7 @@ ulong wnd; /* Tcp send window */ ulong urg; /* Urgent data pointer */ ulong wl2; - int scale; /* how much to right shift window in xmitted packets */ + uint scale; /* how much to right shift window in xmitted packets */ /* to implement tahoe and reno TCP */ ulong dupacks; /* number of duplicate acks rcvd */ int recovery; /* loss recovery flag */ @@ -216,19 +216,19 @@ ulong urg; /* Urgent pointer */ int blocked; int una; /* unacked data segs, for delayed acks */ - int scale; /* how much to left shift window in rcved packets */ + uint scale; /* how much to left shift window in rcved packets */ } rcv; ulong iss; /* Initial sequence number */ - int sawwsopt; /* true if we saw a wsopt on the incoming SYN */ ulong cwind; /* Congestion window */ - ulong abcbytes; /* appropriate byte counting */ - int scale; /* desired snd.scale */ + ulong abcbytes; /* appropriate byte counting rfc 3465 */ + uint scale; /* desired snd.scale */ ulong ssthresh; /* Slow start threshold */ int resent; /* Bytes just resent */ int irs; /* Initial received squence */ ushort mss; /* Maximum segment size */ int rerecv; /* Overlap of data rerecevived */ - ulong window; /* Receive window */ + ulong window; /* Our receive window (queue) */ + uint qscale; /* Log2 of our receive window (queue) */ uchar backoff; /* Exponential backoff counter */ int backedoff; /* ms we've backed off for rexmits */ uchar flags; /* State flags */ @@ -240,7 +240,7 @@ Tcptimer rtt_timer; /* Round trip timer */ Tcptimer katimer; /* keep alive 
timer */ ulong rttseq; /* Round trip sequence */ - int srtt; /* Shortened round trip */ + int srtt; /* Smoothed round trip */ int mdev; /* Mean deviation of round trip */ int kacounter; /* count down for keep alive */ uint sndsyntime; /* time syn sent */ @@ -562,7 +562,6 @@ /* * Push data */ - tcprcvwin(s); tcpoutput(s); break; default: @@ -582,10 +581,15 @@ tcb = (Tcpctl*)s->ptcl; w = tcb->window - qlen(s->rq); - if(w <= 0 || tcb->window > 4*tcb->mss && w < tcb->mss/4){ + if(w < 0) + w = 0; + if(w != tcb->rcv.wnd) + if(w>>tcb->rcv.scale == 0 || tcb->window > 4*tcb->mss && w < tcb->mss/4){ tcb->rcv.blocked = 1; - netlog(s->p->f, Logtcp, "tcprcvwim: window %lud qlen %d\n", tcb->window, qlen(s->rq)); + netlog(s->p->f, Logtcp, "tcprcvwin: window %lud qlen %d ws %ud lport %d\n", + tcb->window, qlen(s->rq), tcb->rcv.scale, s->lport); } + tcb->rcv.wnd = w; } static void @@ -604,7 +608,6 @@ qlock(s); if(tcb->state != Closed){ tcb->flags |= FORCE; - tcprcvwin(s); tcpoutput(s); } qunlock(s); @@ -780,7 +783,7 @@ /* mtu (- TCP + IP hdr len) of 1st hop */ static int -tcpmtu(Proto *tcp, uchar *addr, int version, int *scale) +tcpmtu(Proto *tcp, uchar *addr, int version, uint *scale) { Ipifc *ifc; int mtu; @@ -827,7 +830,7 @@ memset(tcb, 0, sizeof(Tcpctl)); - tcb->ssthresh = QMAX; + tcb->ssthresh = QMAX; /* reset by tcpsetscale() */ tcb->srtt = tcp_irtt<<LOGAGAIN; tcb->mdev = 0; @@ -881,11 +884,7 @@ tpriv->stats[Mss] = tcb->mss; /* default is no window scaling */ - tcb->window = QMAX; - tcb->rcv.wnd = QMAX; - tcb->rcv.scale = 0; - tcb->snd.scale = 0; - qsetlimit(s->rq, QMAX); + tcpsetscale(s, tcb, 0, 0); } /* @@ -1051,7 +1050,7 @@ if(tcph->flags & SYN){ if(tcph->mss) hdrlen += MSS_LENGTH; - if(tcph->ws) + if(1) hdrlen += WS_LENGTH; optpad = hdrlen & 3; if(optpad) @@ -1093,7 +1092,8 @@ hnputs(opt, tcph->mss); opt += 2; } - if(tcph->ws != 0){ + /* always offer. 
rfc1323 §2.2 */ + if(1){ *opt++ = WSOPT; *opt++ = WS_LENGTH; *opt++ = tcph->ws; @@ -1402,7 +1402,7 @@ Tcp4hdr ph4; Tcp6hdr ph6; Tcp seg; - int scale; + uint scale; /* make pseudo header */ switch(lp->version) { @@ -1899,14 +1899,6 @@ tcpabcincr(tcb, acked, tcb->cwind); /* congestion dance */ } - if(tcb->cwind > tcb->snd.wnd){ - tcb->cwind = tcb->snd.wnd; - - /* this is a sloppy hack. why isn't this updated when we see the window open? */ - if(tcb->cwind < 10) - tcb->cwind = tcb->mss; - } - /* Adjust the timers according to the round trip time */ /* todo: fix sloppy treatment of overflow cases here. */ if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) { @@ -2230,7 +2222,7 @@ update(s, &seg); if(addreseq(f, tcb, tpriv, &seg, bp, length) < 0) print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr, s->lport); - tcb->flags |= FORCE; + tcb->flags |= FORCE; /* force duplicate ack */ goto output; } @@ -2355,11 +2347,6 @@ tcb->rcv.nxt += length; /* - * update our rcv window - */ - tcprcvwin(s); - - /* * turn on the acktimer if there's something * to ack */ @@ -2486,6 +2473,7 @@ break; /* force an ack when a window has opened up */ + tcprcvwin(s); if(tcb->rcv.blocked && tcb->rcv.wnd > 0){ tcb->rcv.blocked = 0; tcb->flags |= FORCE; @@ -2499,9 +2487,13 @@ if(sent > 0) if(!(tcb->flags & FORCE)) break; /* already probing, rto re-probes */ - ssize -= sent; - if(ssize > 0) - ssize = 1; + if(ssize < sent) + ssize = 0; + else{ + ssize -= sent; + if(ssize > 0) + ssize = 1; + } } else { /* calculate usable segment size */ if(ssize > tcb->cwind) @@ -2531,7 +2523,6 @@ } tcb->flags &= ~FORCE; - tcprcvwin(s); /* By default we will generate an ack */ tcphalt(tpriv, &tcb->acktimer); @@ -2655,7 +2646,7 @@ } if((msgs%4) == 1){ qunlock(s); - sched(); + // sched(); qlock(s); } } @@ -2968,14 +2959,14 @@ break; } - qmax = QMAX<<tcb->rcv.scale; + qmax = QMAX<<tcb->qscale; if(tcb->reseqlen > qmax){ netlog(f, Logtcp, "tcp: reseq: queue > window: %d > %d; %d packets\n", tcb->reseqlen, 
qmax, tcb->nreseq); logreseq(f, tcb->reseq); tpriv->stats[ReseqBytelim]++; return dumpreseq(tcb); } - qmax = 15*(tcb->rcv.scale + 1); + qmax = 15*(tcb->qscale + 1); if(tcb->nreseq > qmax){ netlog(f, Logtcp, "resequence queue > packets: %d %d; %d bytes\n", tcb->nreseq, qmax, tcb->reseqlen); logreseq(f, tcb->reseq); @@ -3263,20 +3254,32 @@ Fsproto(fs, tcp); } +enum { + Maxqscale = 3, /* ½ mb */ +}; + static void tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale) { - if(rcvscale){ - tcb->rcv.scale = rcvscale & 0xff; - tcb->snd.scale = sndscale & 0xff; - tcb->window = QMAX<<tcb->snd.scale; - tcb->ssthresh = tcb->window; - qsetlimit(s->rq, tcb->window); - } else { - tcb->rcv.scale = 0; - tcb->snd.scale = 0; - tcb->window = QMAX; - tcb->ssthresh = tcb->window; - qsetlimit(s->rq, tcb->window); - } + /* + * guess at reasonable queue sizes. there's no current way + * to know how many nic receive buffers we can safely tie up in the + * tcp stack, and we don't adjust our queues to maximize throughput + * and minimize bufferbloat. n.b. the offer (rcvscale) needs to be + * respected, but we still control our own buffer commitment by + * keeping a separate qscale. + */ + tcb->rcv.scale = rcvscale & 0xff; + tcb->snd.scale = sndscale & 0xff; + tcb->qscale = rcvscale; + if(rcvscale > Maxqscale) + tcb->qscale = Maxqscale; + + if(rcvscale != tcb->rcv.scale) + netlog(s->p->f, Logtcp, "tcpsetscale: window %lud qlen %d >> window %ud lport %d\n", + tcb->window, qlen(s->rq), QMAX<<tcb->qscale, s->lport); + tcb->window = QMAX<<tcb->qscale; + tcb->ssthresh = tcb->window; + qsetlimit(s->rq, QMAX<<tcb->qscale); + tcprcvwin(s); }