tcp changes as discussed previously on the list. Reference: /n/patches.lsub.org/patch/tcpwnd Date: Tue Jul 10 17:37:21 CES 2012 Signed-off-by: quanstro@quanstro.net --- /sys/src/9/ip/tcp.c Thu Apr 12 12:25:03 2012 +++ /sys/src/9/ip/tcp.c Sat Jul 14 17:25:28 2012 @@ -80,7 +80,6 @@ Maxlimbo = 1000, /* maximum procs waiting for response to SYN ACK */ NLHT = 256, /* hash table size, must be a power of 2 */ LHTMASK = NLHT-1, - HaveWS = 1<<8, }; @@ -216,14 +215,15 @@ ulong wnd; /* Receive window incoming */ ulong urg; /* Urgent pointer */ int blocked; - int una; /* unacked data segs */ + int una; /* unacked data segs, for delayed acks */ int scale; /* how much to left shift window in rcved packets */ } rcv; ulong iss; /* Initial sequence number */ int sawwsopt; /* true if we saw a wsopt on the incoming SYN */ ulong cwind; /* Congestion window */ + ulong abcbytes; /* appropriate byte counting */ int scale; /* desired snd.scale */ - ushort ssthresh; /* Slow start threshold */ + ulong ssthresh; /* Slow start threshold */ int resent; /* Bytes just resent */ int irs; /* Initial received squence */ ushort mss; /* Maximum segment size */ @@ -233,6 +233,8 @@ int backedoff; /* ms we've backed off for rexmits */ uchar flags; /* State flags */ Reseq *reseq; /* Resequencing queue */ + int nreseq; + int reseqlen; Tcptimer timer; /* Activity timer */ Tcptimer acktimer; /* Acknowledge timer */ Tcptimer rtt_timer; /* Round trip timer */ @@ -307,6 +309,9 @@ HlenErrs, LenErrs, OutOfOrder, + Resequenced, + ReseqBytelim, + ReseqPktlim, Nstats }; @@ -329,6 +334,9 @@ [HlenErrs] "HlenErrs", [LenErrs] "LenErrs", [OutOfOrder] "OutOfOrder", +[Resequenced] "Resequenced", +[ReseqBytelim] "ReseqBytelim", +[ReseqPktlim] "ReseqPktlim", }; typedef struct Tcppriv Tcppriv; @@ -363,29 +371,29 @@ */ int tcpporthogdefense = 0; -int addreseq(Tcpctl*, Tcppriv*, Tcp*, Block*, ushort); -void getreseq(Tcpctl*, Tcp*, Block**, ushort*); -void localclose(Conv*, char*); -void procsyn(Conv*, Tcp*); -void tcpiput(Proto*, Ipifc*, Block*); -void tcpoutput(Conv*); -int tcptrim(Tcpctl*, Tcp*, Block**, ushort*); -void tcpstart(Conv*, int); -void tcptimeout(void*); -void tcpsndsyn(Conv*, Tcpctl*); -void tcprcvwin(Conv*); -void tcpacktimer(void*); -void tcpkeepalive(void*); -void tcpsetkacounter(Tcpctl*); -void tcprxmit(Conv*); -void tcpsettimer(Tcpctl*); -void tcpsynackrtt(Conv*); -void tcpsetscale(Conv*, Tcpctl*, ushort, ushort); - -static void limborexmit(Proto*); -static void limbo(Conv*, uchar*, uchar*, Tcp*, int); +static int addreseq(Fs*, Tcpctl*, Tcppriv*, Tcp*, Block*, ushort); +static int dumpreseq(Tcpctl*); +static void getreseq(Tcpctl*, Tcp*, Block**, ushort*); +static void limbo(Conv*, uchar*, uchar*, Tcp*, int); +static void limborexmit(Proto*); +static void localclose(Conv*, char*); +static void procsyn(Conv*, Tcp*); +static void tcpacktimer(void*); +static void tcpiput(Proto*, Ipifc*, Block*); +static void tcpkeepalive(void*); +static void tcpoutput(Conv*); +static void tcprcvwin(Conv*); +static void tcprxmit(Conv*); +static void tcpsetkacounter(Tcpctl*); +static void tcpsetscale(Conv*, Tcpctl*, ushort, ushort); +static void tcpsettimer(Tcpctl*); +static void tcpsndsyn(Conv*, Tcpctl*); +static void tcpstart(Conv*, int); +static void tcpsynackrtt(Conv*); +static void tcptimeout(void*); +static int tcptrim(Tcpctl*, Tcp*, Block**, ushort*); -void +static void tcpsetstate(Conv *s, uchar newstate) { Tcpctl *tcb; @@ -405,11 +413,6 @@ if(newstate == Established) tpriv->stats[CurrEstab]++; - /** - print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport, - tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab ); - **/ - switch(newstate) { case Closed: qclose(s->rq); @@ -454,10 +457,11 @@ s = (Tcpctl*)(c->ptcl); return snprint(state, n, - "%s qin %d qout %d srtt %d mdev %d cwin %lud swin %lud>>%d rwin %lud>>%d timer.start %d timer.count %d rerecv %d katimer.start %d katimer.count %d\n", + "%s qin %d qout %d rq %d.%d srtt %d mdev %d cwin %lud swin %lud>>%d rwin %lud>>%d timer.start %d timer.count %d rerecv %d katimer.start %d katimer.count %d\n", tcpstates[s->state], c->rq ? qlen(c->rq) : 0, c->wq ? qlen(c->wq) : 0, + s->nreseq, s->reseqlen, s->srtt, s->mdev, s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd, s->snd.scale, s->timer.start, s->timer.count, s->rerecv, @@ -536,7 +540,7 @@ } } -void +static void tcpkick(void *x) { Conv *s = x; @@ -570,7 +574,7 @@ poperror(); } -void +static void tcprcvwin(Conv *s) /* Call with tcb locked */ { int w; @@ -578,16 +582,13 @@ tcb = (Tcpctl*)s->ptcl; w = tcb->window - qlen(s->rq); - if(w < 0) - w = 0; - if(w == 0) - netlog(s->p->f, Logtcp, "tcprcvwim: window %lud qlen %d\n", tcb->window, qlen(s->rq)); - tcb->rcv.wnd = w; - if(w == 0) + if(w <= 0 || tcb->window > 4*tcb->mss && w < tcb->mss/4){ tcb->rcv.blocked = 1; + netlog(s->p->f, Logtcp, "tcprcvwim: window %lud qlen %d\n", tcb->window, qlen(s->rq)); + } } -void +static void tcpacktimer(void *v) { Tcpctl *tcb; @@ -611,6 +612,29 @@ } static void +tcpcongestion(Tcpctl *tcb) +{ + ulong inflight; + + inflight = tcb->snd.nxt - tcb->snd.una; + if(inflight > tcb->cwind) + inflight = tcb->cwind; + tcb->ssthresh = inflight / 2; + if(tcb->ssthresh < 2*tcb->mss) + tcb->ssthresh = 2*tcb->mss; +} + +static void +tcpabcincr(Tcpctl *tcb, ulong acked, ulong limit) +{ + tcb->abcbytes += acked; + if(tcb->abcbytes >= limit){ + tcb->cwind += tcb->mss; + tcb->abcbytes %= limit; + } +} + +static void tcpcreate(Conv *c) { c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c); @@ -649,7 +673,7 @@ t->state = newstate; } -void +static void tcpackproc(void *a) { Tcptimer *t, *tp, *timeo; @@ -695,7 +719,7 @@ } } -void +static void tcpgo(Tcppriv *priv, Tcptimer *t) { if(t == nil || t->start == 0) @@ -707,7 +731,7 @@ qunlock(&priv->tl); } -void +static void tcphalt(Tcppriv *priv, Tcptimer *t) { if(t == nil) @@ -718,17 +742,16 @@ qunlock(&priv->tl); } -int +static int backoff(int n) { return 1 << n; } -void +static void localclose(Conv *s, char *reason) /* called with tcb locked */ { Tcpctl *tcb; - Reseq *rp,*rp1; Tcppriv *tpriv; tpriv = s->p->priv; @@ -742,12 +765,7 @@ tcphalt(tpriv, &tcb->katimer); /* Flush reassembly queue; nothing more can arrive */ - for(rp = tcb->reseq; rp != nil; rp = rp1) { - rp1 = rp->next; - freeblist(rp->bp); - free(rp); - } - tcb->reseq = nil; + dumpreseq(tcb); if(tcb->state == Syn_sent) Fsconnected(s, reason); @@ -761,7 +779,7 @@ } /* mtu (- TCP + IP hdr len) of 1st hop */ -int +static int tcpmtu(Proto *tcp, uchar *addr, int version, int *scale) { Ipifc *ifc; @@ -796,7 +814,7 @@ return mtu; } -void +static void inittcpctl(Conv *s, int mode) { Tcpctl *tcb; @@ -809,7 +827,7 @@ memset(tcb, 0, sizeof(Tcpctl)); - tcb->ssthresh = 65535; + tcb->ssthresh = QMAX; tcb->srtt = tcp_irtt<mdev = 0; @@ -858,6 +876,7 @@ } tcb->mss = tcb->cwind = mss; + tcb->abcbytes = 0; tpriv = s->p->priv; tpriv->stats[Mss] = tcb->mss; @@ -872,7 +891,7 @@ /* * called with s qlocked */ -void +static void tcpstart(Conv *s, int mode) { Tcpctl *tcb; @@ -935,7 +954,7 @@ return buf; } -Block * +static Block* htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph, Tcpctl *tcb) { int dlen; @@ -992,7 +1011,6 @@ *opt++ = MSSOPT; *opt++ = MSS_LENGTH; hnputs(opt, tcph->mss); -// print("our outgoing mss %d\n", tcph->mss); opt += 2; } if(tcph->ws != 0){ @@ -1020,7 +1038,7 @@ return data; } -Block * +static Block* htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph, Tcpctl *tcb) { int dlen; @@ -1094,7 +1112,7 @@ return data; } -int +static int ntohtcp6(Tcp *tcph, Block **bpp) { Tcp6hdr *h; @@ -1156,7 +1174,7 @@ return hdrlen; } -int +static int ntohtcp4(Tcp *tcph, Block **bpp) { Tcp4hdr *h; @@ -1205,10 +1223,8 @@ break; switch(*optr) { case MSSOPT: - if(optlen == MSS_LENGTH) { + if(optlen == MSS_LENGTH) tcph->mss = nhgets(optr+2); -// print("new incoming mss %d\n", tcph->mss); - } break; case WSOPT: if(optlen == WS_LENGTH && *(optr+2) <= 14) @@ -1222,10 +1238,10 @@ } /* - * For outgiing calls, generate an initial sequence + * For outgoing calls, generate an initial sequence * number and put a SYN on the send queue */ -void +static void tcpsndsyn(Conv *s, Tcpctl *tcb) { Tcppriv *tpriv; @@ -1333,7 +1349,7 @@ * send a reset to the remote side and close the conversation * called with s qlocked */ -char* +static char* tcphangup(Conv *s) { Tcp seg; @@ -1379,7 +1395,7 @@ /* * (re)send a SYN ACK */ -int +static int sndsynack(Proto *tcp, Limbo *lp) { Block *hbp; @@ -1420,8 +1436,6 @@ seg.flags = SYN|ACK; seg.urg = 0; seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale); -// if (seg.mss > lp->mss && lp->mss >= 512) -// seg.mss = lp->mss; seg.wnd = QMAX; /* if the other side set scale, we should too */ @@ -1731,7 +1745,7 @@ return new; } -int +static int seq_within(ulong x, ulong low, ulong high) { if(low <= high){ @@ -1745,25 +1759,25 @@ return 0; } -int +static int seq_lt(ulong x, ulong y) { return (int)(x-y) < 0; } -int +static int seq_le(ulong x, ulong y) { return (int)(x-y) <= 0; } -int +static int seq_gt(ulong x, ulong y) { return (int)(x-y) > 0; } -int +static int seq_ge(ulong x, ulong y) { return (int)(x-y) >= 0; @@ -1773,7 +1787,7 @@ * use the time between the first SYN and it's ack as the * initial round trip time */ -void +static void tcpsynackrtt(Conv *s) { Tcpctl *tcb; @@ -1791,13 +1805,12 @@ tcphalt(tpriv, &tcb->rtt_timer); } -void +static void update(Conv *s, Tcp *seg) { int rtt, delta; Tcpctl *tcb; ulong acked; - ulong expand; Tcppriv *tpriv; tpriv = s->p->priv; @@ -1809,30 +1822,29 @@ return; } - /* added by Dong Lin for fast retransmission */ - if(seg->ack == tcb->snd.una - && tcb->snd.una != tcb->snd.nxt - && seg->len == 0 - && seg->wnd == tcb->snd.wnd) { - - /* this is a pure ack w/o window update */ - netlog(s->p->f, Logtcprxmt, "dupack %lud ack %lud sndwnd %lud advwin %lud\n", - tcb->snd.dupacks, seg->ack, tcb->snd.wnd, seg->wnd); - - if(++tcb->snd.dupacks == TCPREXMTTHRESH) { - /* - * tahoe tcp rxt the packet, half sshthresh, - * and set cwnd to one packet - */ - tcb->snd.recovery = 1; - tcb->snd.rxt = tcb->snd.nxt; - netlog(s->p->f, Logtcprxmt, "fast rxt %lud, nxt %lud\n", tcb->snd.una, tcb->snd.nxt); - tcprxmit(s); - } else { - /* do reno tcp here. */ - } + /* catch zero-window updates, update window & recover */ + if(tcb->snd.wnd == 0 && seg->wnd > 0) + if(seq_lt(seg->ack, tcb->snd.ptr)){ + netlog(s->p->f, Logtcp, "tcp: zwu ack %lud una %lud ptr %lud win %lud\n", + seg->ack, tcb->snd.una, tcb->snd.ptr, seg->wnd); + tcb->snd.wnd = seg->wnd; + goto recovery; } + if(seg->ack == tcb->snd.una) + if(tcb->snd.una != tcb->snd.nxt) + if(seg->len == 0) + if(seg->wnd == tcb->snd.wnd) + if(++tcb->snd.dupacks == TCPREXMTTHRESH){ +recovery: + tcb->snd.recovery = 1; + tcb->snd.rxt = tcb->snd.nxt; + tcpcongestion(tcb); + tcprxmit(s); + tcb->cwind = tcb->ssthresh; + } else + tcb->cwind += tcb->mss; + /* * update window */ @@ -1847,22 +1859,11 @@ * don't let us hangup if sending into a closed window and * we're still getting acks */ - if((tcb->flags&RETRAN) && tcb->snd.wnd == 0){ + if((tcb->flags&RETRAN) && tcb->snd.wnd == 0) tcb->backedoff = MAXBACKMS/4; - } return; } - /* - * any positive ack turns off fast rxt, - * (should we do new-reno on partial acks?) - */ - if(!tcb->snd.recovery || seq_ge(seg->ack, tcb->snd.rxt)) { - tcb->snd.dupacks = 0; - tcb->snd.recovery = 0; - } else - netlog(s->p->f, Logtcp, "rxt next %lud, cwin %lud\n", seg->ack, tcb->cwind); - /* Compute the new send window size */ acked = seg->ack - tcb->snd.una; @@ -1874,24 +1875,40 @@ goto done; } - /* slow start as long as we're not recovering from lost packets */ - if(tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) { - if(tcb->cwind < tcb->ssthresh) { - expand = tcb->mss; - if(acked < expand) - expand = acked; + /* + * congestion control + */ + if(tcb->snd.recovery){ + if(seq_ge(seg->ack, tcb->snd.rxt)){ + /* recovery finished */ + tcb->snd.dupacks = 0; + tcb->snd.recovery = 0; + tcb->cwind = (tcb->snd.nxt - seg->ack) + tcb->mss; + if(tcb->ssthresh < tcb->cwind) + tcb->cwind = tcb->ssthresh; + } else { + /* partial ack */ + tcb->cwind -= acked; + tcb->cwind += tcb->mss; } + } else { + tcb->snd.dupacks = 0; + if(tcb->cwind < tcb->ssthresh) + tcpabcincr(tcb, acked, 2*tcb->mss); /* slow start */ else - expand = ((int)tcb->mss * tcb->mss) / tcb->cwind; + tcpabcincr(tcb, acked, tcb->cwind); /* congestion dance */ + } + + if(tcb->cwind > tcb->snd.wnd){ + tcb->cwind = tcb->snd.wnd; - if(tcb->cwind + expand < tcb->cwind) - expand = tcb->snd.wnd - tcb->cwind; - if(tcb->cwind + expand > tcb->snd.wnd) - expand = tcb->snd.wnd - tcb->cwind; - tcb->cwind += expand; + /* this is a sloppy hack. why isn't this updated when we see the window open? */ + if(tcb->cwind < 10) + tcb->cwind = tcb->mss; } /* Adjust the timers according to the round trip time */ + /* todo: fix sloppy treatment of overflow cases here. */ if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) { tcphalt(tpriv, &tcb->rtt_timer); if((tcb->flags&RETRAN) == 0) { @@ -1922,8 +1939,14 @@ done: if(qdiscard(s->wq, acked) < acked) tcb->flgcnt--; - tcb->snd.una = seg->ack; + + /* newreno fast recovery */ + if(tcb->snd.recovery) + tcprxmit(s); + + /*tcplimitmaxburst(tcb);*/ + if(seq_gt(seg->ack, tcb->snd.urg)) tcb->snd.urg = seg->ack; @@ -1935,12 +1958,13 @@ if(seq_lt(tcb->snd.ptr, tcb->snd.una)) tcb->snd.ptr = tcb->snd.una; - tcb->flags &= ~RETRAN; + if(!tcb->snd.recovery) + tcb->flags &= ~RETRAN; tcb->backoff = 0; tcb->backedoff = 0; } -void +static void tcpiput(Proto *tcp, Ipifc*, Block *bp) { Tcp seg; @@ -1962,7 +1986,6 @@ h4 = (Tcp4hdr*)(bp->rp); h6 = (Tcp6hdr*)(bp->rp); - memset(&seg, 0, sizeof seg); if((h4->vihl&0xF0)==IP_VER4) { version = V4; @@ -2175,7 +2198,6 @@ netlog(f, Logtcp, "tcptrim, not accept, seq %lud-%lud win %lud-%lud from %I\n", seg.seq, seg.seq + length - 1, tcb->rcv.nxt, tcb->rcv.nxt + tcb->rcv.wnd-1, s->raddr); - netlog(f, Logtcp, "tcp len < 0, %lud %d\n", seg.seq, length); update(s, &seg); if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state == Closing) { tcphalt(tpriv, &tcb->rtt_timer); @@ -2206,7 +2228,7 @@ if(seg.seq != tcb->rcv.nxt) if(length != 0 || (seg.flags & (SYN|FIN))) { update(s, &seg); - if(addreseq(tcb, tpriv, &seg, bp, length) < 0) + if(addreseq(f, tcb, tpriv, &seg, bp, length) < 0) print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr, s->lport); tcb->flags |= FORCE; goto output; @@ -2432,15 +2454,15 @@ * the lock to ipoput the packet so some care has to be * taken by callers. */ -void +static void tcpoutput(Conv *s) { Tcp seg; - int msgs; + uint msgs; Tcpctl *tcb; Block *hbp, *bp; - int sndcnt, n; - ulong ssize, dsize, usable, sent; + int sndcnt; + ulong ssize, dsize, sent; Fs *f; Tcppriv *tpriv; uchar version; @@ -2448,7 +2470,6 @@ f = s->p->f; tpriv = s->p->priv; version = s->ipversion; - memset(&seg, 0, sizeof seg); for(msgs = 0; msgs < 100; msgs++) { tcb = (Tcpctl*)s->ptcl; @@ -2460,6 +2481,10 @@ return; } + /* Don't send anything else until our SYN has been acked */ + if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0) + break; + /* force an ack when a window has opened up */ if(tcb->rcv.blocked && tcb->rcv.wnd > 0){ tcb->rcv.blocked = 0; @@ -2468,43 +2493,42 @@ sndcnt = qlen(s->wq)+tcb->flgcnt; sent = tcb->snd.ptr - tcb->snd.una; - - /* Don't send anything else until our SYN has been acked */ - if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0) - break; - - /* Compute usable segment based on offered window and limit - * window probes to one - */ + ssize = sndcnt; if(tcb->snd.wnd == 0){ - if(sent != 0) { - if((tcb->flags&FORCE) == 0) - break; -// tcb->snd.ptr = tcb->snd.una; + /* zero window probe */ + if(sent > 0) + if(!(tcb->flags & FORCE)) + break; /* already probing, rto re-probes */ + ssize -= sent; + if(ssize > 0) + ssize = 1; + } else { + /* calculate usable segment size */ + if(ssize > tcb->cwind) + ssize = tcb->cwind; + if(ssize > tcb->snd.wnd) + ssize = tcb->snd.wnd; + + if(ssize < sent) + ssize = 0; + else { + ssize -= sent; + if(ssize > tcb->mss) + ssize = tcb->mss; } - usable = 1; } - else { - usable = tcb->cwind; - if(tcb->snd.wnd < usable) - usable = tcb->snd.wnd; -// usable -= sent; - usable = usable >= sent? usable - sent: 0; - } - ssize = sndcnt-sent; - if(ssize && usable < 2) - netlog(s->p->f, Logtcp, "throttled snd.wnd %lud cwind %lud\n", - tcb->snd.wnd, tcb->cwind); - if(usable < ssize) - ssize = usable; - if(tcb->mss < ssize) - ssize = tcb->mss; + dsize = ssize; seg.urg = 0; - if(ssize == 0) - if((tcb->flags&FORCE) == 0) - break; + if(!(tcb->flags & FORCE)){ + if(ssize == 0) + break; + if(ssize < tcb->mss) + if(tcb->snd.nxt == tcb->snd.ptr) + if(sent > TCPREXMTTHRESH*tcb->mss) + break; + } tcb->flags &= ~FORCE; tcprcvwin(s); @@ -2556,20 +2580,9 @@ } } - if(sent+dsize == sndcnt) + if(sent+dsize == sndcnt && dsize) seg.flags |= PSH; - /* keep track of balance of resent data */ - if(seq_lt(tcb->snd.ptr, tcb->snd.nxt)) { - n = tcb->snd.nxt - tcb->snd.ptr; - if(ssize < n) - n = ssize; - tcb->resent += n; - netlog(f, Logtcp, "rexmit: %I!%d -> %I!%d ptr %lux nxt %lux\n", - s->raddr, s->rport, s->laddr, s->lport, tcb->snd.ptr, tcb->snd.nxt); - tpriv->stats[RetransSegs]++; - } - tcb->snd.ptr += ssize; /* Pull up the send pointer so we can accept acks @@ -2651,7 +2664,7 @@ /* * the BSD convention (hack?) for keep alives. resend last uchar acked. */ -void +static void tcpsendka(Conv *s) { Tcp seg; @@ -2707,7 +2720,7 @@ /* * set connection to time out after 12 minutes */ -void +static void tcpsetkacounter(Tcpctl *tcb) { tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start*MSPTICK); @@ -2719,7 +2732,7 @@ * if we've timed out, close the connection * otherwise, send a keepalive and restart the timer */ -void +static void tcpkeepalive(void *v) { Tcpctl *tcb; @@ -2747,7 +2760,7 @@ /* * start keepalive timer */ -char* +static char* tcpstartka(Conv *s, char **f, int n) { Tcpctl *tcb; @@ -2770,7 +2783,7 @@ /* * turn checksums on/off */ -char* +static char* tcpsetchecksum(Conv *s, char **f, int) { Tcpctl *tcb; @@ -2781,30 +2794,36 @@ return nil; } -void +/* + * retransmit (at most) one segment at snd.una. + * preserve cwind & snd.ptr + */ +static void tcprxmit(Conv *s) { Tcpctl *tcb; + Tcppriv *tpriv; + ulong tcwind, tptr; tcb = (Tcpctl*)s->ptcl; - tcb->flags |= RETRAN|FORCE; - tcb->snd.ptr = tcb->snd.una; - - /* - * We should be halving the slow start threshhold (down to one - * mss) but leaving it at mss seems to work well enough - */ - tcb->ssthresh = tcb->mss; - /* - * pull window down to a single packet - */ + tptr = tcb->snd.ptr; + tcwind = tcb->cwind; + tcb->snd.ptr = tcb->snd.una; tcb->cwind = tcb->mss; tcpoutput(s); + tcb->cwind = tcwind; + tcb->snd.ptr = tptr; + + tpriv = s->p->priv; + tpriv->stats[RetransSegs]++; } -void +/* + * todo: RFC 4138 F-RTO + */ +static void tcptimeout(void *arg) { Conv *s; @@ -2833,11 +2852,16 @@ localclose(s, Etimedout); break; } - netlog(s->p->f, Logtcprxmt, "timeout rexmit %#lux %d/%lud\n", tcb->snd.una, tcb->timer.start, NOW); + netlog(s->p->f, Logtcprxmt, "timeout rexmit %#lux %d/%lud\n", + tcb->snd.una, tcb->timer.start, NOW); tcpsettimer(tcb); + tcpcongestion(tcb); tcprxmit(s); + tcb->snd.ptr = tcb->snd.una; + tcb->cwind = tcb->mss; tpriv->stats[RetransTimeouts]++; tcb->snd.dupacks = 0; + tcb->snd.recovery = 0; break; case Time_wait: localclose(s, nil); @@ -2849,7 +2873,7 @@ poperror(); } -int +static int inwindow(Tcpctl *tcb, int seq) { return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt+tcb->rcv.wnd-1); @@ -2858,7 +2882,7 @@ /* * set up state for a received SYN (or SYN ACK) packet */ -void +static void procsyn(Conv *s, Tcp *seg) { Tcpctl *tcb; @@ -2878,20 +2902,51 @@ tpriv->stats[Mss] = tcb->mss; } - /* the congestion window always starts out as a single segment */ tcb->snd.wnd = seg->wnd; - tcb->cwind = tcb->mss; + + /* RFC 3390 initial window */ + if(tcb->mss < 1095) + tcb->cwind = 4*tcb->mss; + else if(tcb->mss < 2190) + tcb->cwind = 4380; + else + tcb->cwind = 2*tcb->mss; +} + +static int +dumpreseq(Tcpctl *tcb) +{ + Reseq *r, *next; + + for(r = tcb->reseq; r != nil; r = next){ + next = r->next; + freeblist(r->bp); + free(r); + } + tcb->reseq = nil; + tcb->nreseq = 0; + tcb->reseqlen = 0; + return -1; +} + +static void +logreseq(Fs *f, Reseq *r) +{ + for(; r != nil; r = r->next){ + netlog(f, Logtcp, "%#lud %ud %#lud %#ux\n", r->seg.seq, r->seg.len, + r->seg.ack, r->seg.flags); + } } -int -addreseq(Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length) +static int +addreseq(Fs *f, Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length) { - Reseq *rp, *rp1; - int i, rqlen, qmax; + Reseq *rp, **rr; + int qmax; rp = malloc(sizeof(Reseq)); if(rp == nil){ - freeblist(bp); /* bp always consumed by add_reseq */ + freeblist(bp); /* bp always consumed by addreseq */ return 0; } @@ -2899,58 +2954,39 @@ rp->bp = bp; rp->length = length; - /* Place on reassembly list sorting by starting seq number */ - rp1 = tcb->reseq; - if(rp1 == nil || seq_lt(seg->seq, rp1->seg.seq)) { - rp->next = rp1; - tcb->reseq = rp; - if(rp->next != nil) - tpriv->stats[OutOfOrder]++; - return 0; - } + tcb->reseqlen += length; + tcb->nreseq++; - rqlen = 0; - for(i = 0;; i++) { - rqlen += rp1->length; - if(rp1->next == nil || seq_lt(seg->seq, rp1->next->seg.seq)) { - rp->next = rp1->next; - rp1->next = rp; + /* Place on reassembly list sorting by starting seq number */ + for(rr = &tcb->reseq;; rr = &(*rr)->next) + if(*rr == nil || seq_lt(seg->seq, (*rr)->seg.seq)){ + rp->next = *rr; + *rr = rp; + tpriv->stats[Resequenced]++; if(rp->next != nil) tpriv->stats[OutOfOrder]++; break; } - rp1 = rp1->next; - } - qmax = QMAX<rcv.scale; - if(rqlen > qmax){ - print("resequence queue > window: %d > %d\n", rqlen, qmax); - i = 0; - for(rp1 = tcb->reseq; rp1 != nil; rp1 = rp1->next){ - print("%#lux %#lux %#ux\n", rp1->seg.seq, - rp1->seg.ack, rp1->seg.flags); - if(i++ > 10){ - print("...\n"); - break; - } - } - - /* - * delete entire reassembly queue; wait for retransmit. - * - should we be smarter and only delete the tail? - */ - for(rp = tcb->reseq; rp != nil; rp = rp1){ - rp1 = rp->next; - freeblist(rp->bp); - free(rp); - } - tcb->reseq = nil; - return -1; + qmax = QMAX<rcv.scale; + if(tcb->reseqlen > qmax){ + netlog(f, Logtcp, "tcp: reseq: queue > window: %d > %d; %d packets\n", tcb->reseqlen, qmax, tcb->nreseq); + logreseq(f, tcb->reseq); + tpriv->stats[ReseqBytelim]++; + return dumpreseq(tcb); + } + qmax = 15*(tcb->rcv.scale + 1); + if(tcb->nreseq > qmax){ + netlog(f, Logtcp, "resequence queue > packets: %d %d; %d bytes\n", tcb->nreseq, qmax, tcb->reseqlen); + logreseq(f, tcb->reseq); + tpriv->stats[ReseqPktlim]++; + return dumpreseq(tcb); } + return 0; } -void +static void getreseq(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length) { Reseq *rp; @@ -2965,10 +3001,13 @@ *bp = rp->bp; *length = rp->length; + tcb->nreseq--; + tcb->reseqlen -= rp->length; + free(rp); } -int +static int tcptrim(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length) { ushort len; @@ -3039,7 +3078,7 @@ return 0; } -void +static void tcpadvise(Proto *tcp, Block *bp, char *msg) { Tcp4hdr *h4; @@ -3105,7 +3144,7 @@ } /* called with c qlocked */ -char* +static char* tcpctl(Conv* c, char** f, int n) { if(n == 1 && strcmp(f[0], "hangup") == 0) @@ -3119,7 +3158,7 @@ return "unknown control request"; } -int +static int tcpstats(Proto *tcp, char *buf, int len) { Tcppriv *priv; @@ -3143,7 +3182,7 @@ * of questionable validity so we try to use them only when we're * up against the wall. */ -int +static int tcpgc(Proto *tcp) { Conv *c, **pp, **ep; @@ -3163,13 +3202,13 @@ switch(tcb->state){ case Syn_received: if(NOW - tcb->time > 5000){ - localclose(c, "timed out"); + localclose(c, Etimedout); n++; } break; case Finwait2: if(NOW - tcb->time > 5*60*1000){ - localclose(c, "timed out"); + localclose(c, Etimedout); n++; } break; @@ -3179,7 +3218,7 @@ return n; } -void +static void tcpsettimer(Tcpctl *tcb) { int x; @@ -3224,18 +3263,20 @@ Fsproto(fs, tcp); } -void +static void tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale) { if(rcvscale){ tcb->rcv.scale = rcvscale & 0xff; tcb->snd.scale = sndscale & 0xff; tcb->window = QMAX<snd.scale; + tcb->ssthresh = tcb->window; qsetlimit(s->rq, tcb->window); } else { tcb->rcv.scale = 0; tcb->snd.scale = 0; tcb->window = QMAX; + tcb->ssthresh = tcb->window; qsetlimit(s->rq, tcb->window); } } --- /sys/src/nix/ip/tcp.c Thu Apr 12 12:26:27 2012 +++ /sys/src/nix/ip/tcp.c Sat Jul 14 17:25:34 2012 @@ -41,13 +41,13 @@ EOLOPT = 0, NOOPOPT = 1, MSSOPT = 2, - MSS_LENGTH = 4, /* Mean segment size */ + MSS_LENGTH = 4, /* Maximum segment size */ WSOPT = 3, WS_LENGTH = 3, /* Bits to scale window size by */ MSL2 = 10, MSPTICK = 50, /* Milliseconds per timer tick */ - DEF_MSS = 1460, /* Default mean segment */ - DEF_MSS6 = 1280, /* Default mean segment (min) for v6 */ + DEF_MSS = 1460, /* Default maximum segment */ + DEF_MSS6 = 1280, /* Default maximum segment (min) for v6 */ DEF_RTT = 500, /* Default round trip */ DEF_KAT = 120000, /* Default time (ms) between keep alives */ TCP_LISTEN = 0, /* Listen connection */ @@ -55,7 +55,6 @@ SYNACK_RXTIMER = 250, /* ms between SYNACK retransmits */ TCPREXMTTHRESH = 3, /* dupack threshhold for rxt */ - TCPMAXBURST = 4, FORCE = 1, CLONE = 2, @@ -227,13 +226,15 @@ ulong ssthresh; /* Slow start threshold */ int resent; /* Bytes just resent */ int irs; /* Initial received squence */ - ushort mss; /* Mean segment size */ + ushort mss; /* Maximum segment size */ int rerecv; /* Overlap of data rerecevived */ - ulong window; /* Recevive window */ + ulong window; /* Receive window */ uchar backoff; /* Exponential backoff counter */ int backedoff; /* ms we've backed off for rexmits */ uchar flags; /* State flags */ Reseq *reseq; /* Resequencing queue */ + int nreseq; + int reseqlen; Tcptimer timer; /* Activity timer */ Tcptimer acktimer; /* Acknowledge timer */ Tcptimer rtt_timer; /* Round trip timer */ @@ -291,6 +292,7 @@ enum { /* MIB stats */ MaxConn, + Mss, ActiveOpens, PassiveOpens, EstabResets, @@ -307,6 +309,9 @@ HlenErrs, LenErrs, OutOfOrder, + Resequenced, + ReseqBytelim, + ReseqPktlim, Nstats }; @@ -314,6 +319,7 @@ static char *statnames[] = { [MaxConn] "MaxConn", +[Mss] "MaxSegment", [ActiveOpens] "ActiveOpens", [PassiveOpens] "PassiveOpens", [EstabResets] "EstabResets", @@ -328,6 +334,9 @@ [HlenErrs] "HlenErrs", [LenErrs] "LenErrs", [OutOfOrder] "OutOfOrder", +[Resequenced] "Resequenced", +[ReseqBytelim] "ReseqBytelim", +[ReseqPktlim] "ReseqPktlim", }; typedef struct Tcppriv Tcppriv; @@ -348,7 +357,7 @@ QLock apl; int ackprocstarted; - ulong stats[Nstats]; + uvlong stats[Nstats]; }; /* @@ -357,32 +366,32 @@ * of DoS attack. * * To avoid stateless Conv hogs, we pick a sequence number at random. If - * it that number gets acked by the other end, we shut down the connection. - * Look for tcpporthogedefense in the code. + * that number gets acked by the other end, we shut down the connection. + * Look for tcpporthogdefense in the code. */ int tcpporthogdefense = 0; -static int addreseq(Tcpctl*, Tcppriv*, Tcp*, Block*, ushort); -static void getreseq(Tcpctl*, Tcp*, Block**, ushort*); -static void localclose(Conv*, char*); -static void procsyn(Conv*, Tcp*); -static void tcpacktimer(void*); -static void tcpiput(Proto*, Ipifc*, Block*); -static void tcpkeepalive(void*); -static void tcpoutput(Conv*); -static void tcprcvwin(Conv*); -static void tcprxmit(Conv*); -static void tcpsetkacounter(Tcpctl*); -static void tcpsetscale(Conv*, Tcpctl*, ushort, ushort); -static void tcpsettimer(Tcpctl*); -static void tcpsndsyn(Conv*, Tcpctl*); -static void tcpstart(Conv*, int); -static void tcpsynackrtt(Conv*); -static void tcptimeout(void*); -static int tcptrim(Tcpctl*, Tcp*, Block**, ushort*); - -static void limborexmit(Proto*); -static void limbo(Conv*, uchar*, uchar*, Tcp*, int); +static int addreseq(Fs*, Tcpctl*, Tcppriv*, Tcp*, Block*, ushort); +static int dumpreseq(Tcpctl*); +static void getreseq(Tcpctl*, Tcp*, Block**, ushort*); +static void limbo(Conv*, uchar*, uchar*, Tcp*, int); +static void limborexmit(Proto*); +static void localclose(Conv*, char*); +static void procsyn(Conv*, Tcp*); +static void tcpacktimer(void*); +static void tcpiput(Proto*, Ipifc*, Block*); +static void tcpkeepalive(void*); +static void tcpoutput(Conv*); +static void tcprcvwin(Conv*); +static void tcprxmit(Conv*); +static void tcpsetkacounter(Tcpctl*); +static void tcpsetscale(Conv*, Tcpctl*, ushort, ushort); +static void tcpsettimer(Tcpctl*); +static void tcpsndsyn(Conv*, Tcpctl*); +static void tcpstart(Conv*, int); +static void tcpsynackrtt(Conv*); +static void tcptimeout(void*); +static int tcptrim(Tcpctl*, Tcp*, Block**, ushort*); static void tcpsetstate(Conv *s, uchar newstate) @@ -443,30 +452,29 @@ static int tcpstate(Conv *c, char *state, int n) { - Tcpctl *tcb; + Tcpctl *s; - tcb = (Tcpctl*)(c->ptcl); + s = (Tcpctl*)(c->ptcl); return snprint(state, n, - "%s qin %d qout %d srtt %d mdev %d cwin %lud swin %lud>>%d" - " rwin %lud>>%d timer.start %d timer.count %d rerecv %d" - " katimer.start %d katimer.count %d ssthresh %lud\n", - tcpstates[tcb->state], + "%s qin %d qout %d rq %d.%d srtt %d mdev %d cwin %lud swin %lud>>%d rwin %lud>>%d timer.start %d timer.count %d rerecv %d katimer.start %d katimer.count %d\n", + tcpstates[s->state], c->rq ? qlen(c->rq) : 0, c->wq ? qlen(c->wq) : 0, - tcb->srtt, tcb->mdev, - tcb->cwind, tcb->snd.wnd, tcb->rcv.scale, tcb->rcv.wnd, - tcb->snd.scale, tcb->timer.start, tcb->timer.count, tcb->rerecv, - tcb->katimer.start, tcb->katimer.count, tcb->ssthresh); + s->nreseq, s->reseqlen, + s->srtt, s->mdev, + s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd, s->snd.scale, + s->timer.start, s->timer.count, s->rerecv, + s->katimer.start, s->katimer.count); } static int tcpinuse(Conv *c) { - Tcpctl *tcb; + Tcpctl *s; - tcb = (Tcpctl*)(c->ptcl); - return tcb->state != Closed; + s = (Tcpctl*)(c->ptcl); + return s->state != Closed; } static char* @@ -574,11 +582,10 @@ tcb = (Tcpctl*)s->ptcl; w = tcb->window - qlen(s->rq); - if(w < 0) - w = 0; - tcb->rcv.wnd = w; - if(w == 0) + if(w <= 0 || tcb->window > 4*tcb->mss && w < tcb->mss/4){ tcb->rcv.blocked = 1; + netlog(s->p->f, Logtcp, "tcprcvwim: window %lud qlen %d\n", tcb->window, qlen(s->rq)); + } } static void @@ -604,40 +611,11 @@ poperror(); } -/* - * qio.c wakes up writer when queue is half-empty. - * wq holds data until it gets acked, thus wq needs to - * be able to contain twice the full window of data. - * plus more to guarantee saturation. - */ -static int -tcpwqsize(int maxwin) -{ - return (5*maxwin)/2; -} - -/* - * limit maximum packet bursts due to stretched acks - * and application limited periods. increase ssthresh - * if necessary to allow reasonably quick cwind - * reclamation. - */ -static void -tcplimitmaxburst(Tcpctl *tcb) -{ - ulong cwindmax; - cwindmax = tcb->snd.nxt - tcb->snd.una + 3*tcb->mss; - if(tcb->cwind > cwindmax){ - if(tcb->ssthresh < tcb->cwind) - tcb->ssthresh = tcb->cwind; - tcb->cwind = cwindmax; - } -} - static void tcpcongestion(Tcpctl *tcb) { ulong inflight; + inflight = tcb->snd.nxt - tcb->snd.una; if(inflight > tcb->cwind) inflight = tcb->cwind; @@ -656,12 +634,11 @@ } } - static void tcpcreate(Conv *c) { c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c); - c->wq = qopen(tcpwqsize(QMAX), Qkick, tcpkick, c); + c->wq = qopen((3*QMAX)/2, Qkick, tcpkick, c); } static void @@ -775,7 +752,6 @@ localclose(Conv *s, char *reason) /* called with tcb locked */ { Tcpctl *tcb; - Reseq *rp,*rp1; Tcppriv *tpriv; tpriv = s->p->priv; @@ -789,12 +765,7 @@ tcphalt(tpriv, &tcb->katimer); /* Flush reassembly queue; nothing more can arrive */ - for(rp = tcb->reseq; rp != nil; rp = rp1) { - rp1 = rp->next; - freeblist(rp->bp); - free(rp); - } - tcb->reseq = nil; + dumpreseq(tcb); if(tcb->state == Syn_sent) Fsconnected(s, reason); @@ -820,12 +791,12 @@ case V4: mtu = DEF_MSS; if(ifc != nil) - mtu = ifc->maxtu - ifc->medium->hsize - (TCP4_PKT + TCP4_HDRSIZE); + mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE); break; case V6: mtu = DEF_MSS6; if(ifc != nil) - mtu = ifc->maxtu - ifc->medium->hsize - (TCP6_PKT + TCP6_HDRSIZE); + mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE); break; } if(ifc != nil){ @@ -849,6 +820,7 @@ Tcpctl *tcb; Tcp4hdr* h4; Tcp6hdr* h6; + Tcppriv *tpriv; int mss; tcb = (Tcpctl*)s->ptcl; @@ -905,6 +877,8 @@ tcb->mss = tcb->cwind = mss; tcb->abcbytes = 0; + tpriv = s->p->priv; + tpriv->stats[Mss] = tcb->mss; /* default is no window scaling */ tcb->window = QMAX; @@ -912,7 +886,6 @@ tcb->rcv.scale = 0; tcb->snd.scale = 0; qsetlimit(s->rq, QMAX); - qsetlimit(s->wq, tcpwqsize(QMAX)); } /* @@ -1271,6 +1244,8 @@ static void tcpsndsyn(Conv *s, Tcpctl *tcb) { + Tcppriv *tpriv; + tcb->iss = (nrand(1<<16)<<16)|nrand(1<<16); tcb->rttseq = tcb->iss; tcb->snd.wl2 = tcb->iss; @@ -1283,6 +1258,8 @@ /* set desired mss and scale */ tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale); + tpriv = s->p->priv; + tpriv->stats[Mss] = tcb->mss; } void @@ -1384,6 +1361,7 @@ return commonerror(); if(ipcmp(s->raddr, IPnoaddr) != 0) { if(!waserror()){ + memset(&seg, 0, sizeof seg); seg.flags = RST | ACK; seg.ack = tcb->rcv.nxt; tcb->rcv.una = 0; @@ -1452,6 +1430,7 @@ panic("sndrst: version %d", lp->version); } + memset(&seg, 0, sizeof seg); seg.seq = lp->iss; seg.ack = lp->irs+1; seg.flags = SYN|ACK; @@ -1660,7 +1639,7 @@ /* find a call in limbo */ h = hashipa(src, segp->source); for(l = &tpriv->lht[h]; (lp = *l) != nil; l = &lp->next){ - netlog(s->p->f, Logtcp, "tcpincoming s %I,%ux/%I,%ux d %I,%ux/%I,%ux v %d/%d\n", + netlog(s->p->f, Logtcp, "tcpincoming s %I!%ud/%I!%ud d %I!%ud/%I!%ud v %d/%d\n", src, segp->source, lp->raddr, lp->rport, dst, segp->dest, lp->laddr, lp->lport, version, lp->version @@ -1717,8 +1696,10 @@ tcb->flags |= SYNACK; /* our sending max segment size cannot be bigger than what he asked for */ - if(lp->mss != 0 && lp->mss < tcb->mss) + if(lp->mss != 0 && lp->mss < tcb->mss) { tcb->mss = lp->mss; + tpriv->stats[Mss] = tcb->mss; + } /* window scaling */ tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale); @@ -1841,11 +1822,21 @@ return; } + /* catch zero-window updates, update window & recover */ + if(tcb->snd.wnd == 0 && seg->wnd > 0) + if(seq_lt(seg->ack, tcb->snd.ptr)){ + netlog(s->p->f, Logtcp, "tcp: zwu ack %lud una %lud ptr %lud win %lud\n", + seg->ack, tcb->snd.una, tcb->snd.ptr, seg->wnd); + tcb->snd.wnd = seg->wnd; + goto recovery; + } + if(seg->ack == tcb->snd.una) if(tcb->snd.una != tcb->snd.nxt) if(seg->len == 0) if(seg->wnd == tcb->snd.wnd) if(++tcb->snd.dupacks == TCPREXMTTHRESH){ +recovery: tcb->snd.recovery = 1; tcb->snd.rxt = tcb->snd.nxt; tcpcongestion(tcb); @@ -1864,11 +1855,16 @@ } if(!seq_gt(seg->ack, tcb->snd.una)){ + /* + * don't let us hangup if sending into a closed window and + * we're still getting acks + */ if((tcb->flags&RETRAN) && tcb->snd.wnd == 0) tcb->backedoff = MAXBACKMS/4; return; } + /* Compute the new send window size */ acked = seg->ack - tcb->snd.una; /* avoid slow start and timers for SYN acks */ @@ -1903,9 +1899,14 @@ tcpabcincr(tcb, acked, tcb->cwind); /* congestion dance */ } - if(tcb->cwind > tcb->snd.wnd) + if(tcb->cwind > tcb->snd.wnd){ tcb->cwind = tcb->snd.wnd; + /* this is a sloppy hack. why isn't this updated when we see the window open? */ + if(tcb->cwind < 10) + tcb->cwind = tcb->mss; + } + /* Adjust the timers according to the round trip time */ /* todo: fix sloppy treatment of overflow cases here. */ if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) { @@ -1971,7 +1972,7 @@ Tcp6hdr *h6; int hdrlen; Tcpctl *tcb; - ushort length; + ushort length, csum; uchar source[IPaddrlen], dest[IPaddrlen]; Conv *s; Fs *f; @@ -2034,10 +2035,12 @@ h6->ttl = proto; hnputl(h6->vcf, length); if((h6->tcpcksum[0] || h6->tcpcksum[1]) && - ptclcsum(bp, TCP6_IPLEN, length+TCP6_PHDRSIZE)) { + (csum = ptclcsum(bp, TCP6_IPLEN, length+TCP6_PHDRSIZE)) != 0) { tpriv->stats[CsumErrs]++; tpriv->stats[InErrs]++; - netlog(f, Logtcp, "bad tcp proto cksum\n"); + netlog(f, Logtcp, + "bad tcpv6 proto cksum: got %#ux, computed %#ux\n", + h6->tcpcksum[0]<<8 | h6->tcpcksum[1], csum); freeblist(bp); return; } @@ -2049,7 +2052,7 @@ if(hdrlen < 0){ tpriv->stats[HlenErrs]++; tpriv->stats[InErrs]++; - netlog(f, Logtcp, "bad tcp hdr len\n"); + netlog(f, Logtcp, "bad tcpv6 hdr len\n"); return; } @@ -2059,7 +2062,7 @@ if(bp == nil){ tpriv->stats[LenErrs]++; tpriv->stats[InErrs]++; - netlog(f, Logtcp, "tcp len < 0 after trim\n"); + netlog(f, Logtcp, "tcpv6 len < 0 after trim\n"); return; } } @@ -2070,7 +2073,8 @@ /* Look for a matching conversation */ s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest); if(s == nil){ - netlog(f, Logtcp, "iphtlook failed\n"); + netlog(f, Logtcp, "iphtlook(src %I!%d, dst %I!%d) failed\n", + source, seg.source, dest, seg.dest); reset: qunlock(tcp); sndrst(tcp, source, dest, length, &seg, version, "no conversation"); @@ -2182,7 +2186,7 @@ if(tcb->state != Syn_received && (seg.flags & RST) == 0){ if(tcpporthogdefense && seq_within(seg.ack, tcb->snd.una-(1<<31), tcb->snd.una-(1<<29))){ - print("stateless hog %I.%d->%I.%d f %#ux %#lux - %#lux - %#lux\n", + print("stateless hog %I.%d->%I.%d f %ux %lux - %lux - %lux\n", source, seg.source, dest, seg.dest, seg.flags, tcb->snd.una-(1<<31), seg.ack, tcb->snd.una-(1<<29)); localclose(s, "stateless hog"); @@ -2191,7 +2195,9 @@ /* Cut the data to fit the receive window */ if(tcptrim(tcb, &seg, &bp, &length) == -1) { - netlog(f, Logtcp, "tcp len < 0, %lud %d\n", seg.seq, length); + netlog(f, Logtcp, "tcptrim, not accept, seq %lud-%lud win %lud-%lud from %I\n", + seg.seq, seg.seq + length - 1, + tcb->rcv.nxt, tcb->rcv.nxt + tcb->rcv.wnd-1, s->raddr); update(s, &seg); if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state == Closing) { tcphalt(tpriv, &tcb->rtt_timer); @@ -2222,7 +2228,7 @@ if(seg.seq != tcb->rcv.nxt) if(length != 0 || (seg.flags & (SYN|FIN))) { update(s, &seg); - if(addreseq(tcb, tpriv, &seg, bp, length) < 0) + if(addreseq(f, tcb, tpriv, &seg, bp, length) < 0) print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr, s->lport); tcb->flags |= FORCE; goto output; @@ -2237,7 +2243,7 @@ if(tcb->state == Established) { tpriv->stats[EstabResets]++; if(tcb->rcv.nxt != seg.seq) - print("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt %#lux seq %#lux\n", s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt, seg.seq); + print("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt %lux seq %lux\n", s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt, seg.seq); } localclose(s, Econrefused); goto raise; @@ -2452,7 +2458,7 @@ tcpoutput(Conv *s) { Tcp seg; - int msgs; + uint msgs; Tcpctl *tcb; Block *hbp, *bp; int sndcnt; @@ -2574,7 +2580,7 @@ } } - if(sent+dsize == sndcnt) + if(sent+dsize == sndcnt && dsize) seg.flags |= PSH; tcb->snd.ptr += ssize; @@ -2647,6 +2653,11 @@ default: panic("tcpoutput2: version %d", version); } + if((msgs%4) == 1){ + qunlock(s); + sched(); + qlock(s); + } } } @@ -2663,6 +2674,7 @@ tcb = (Tcpctl*)s->ptcl; dbp = nil; + memset(&seg, 0, sizeof seg); seg.urg = 0; seg.source = s->lport; seg.dest = s->rport; @@ -2790,7 +2802,9 @@ tcprxmit(Conv *s) { Tcpctl *tcb; + Tcppriv *tpriv; ulong tcwind, tptr; + tcb = (Tcpctl*)s->ptcl; tcb->flags |= RETRAN|FORCE; @@ -2801,6 +2815,9 @@ tcpoutput(s); tcb->cwind = tcwind; tcb->snd.ptr = tptr; + + tpriv = s->p->priv; + tpriv->stats[RetransSegs]++; } /* @@ -2835,7 +2852,7 @@ localclose(s, Etimedout); break; } - netlog(s->p->f, Logtcprxmt, "timeout rexmit 0x%lux %d/%d\n", + netlog(s->p->f, Logtcprxmt, "timeout rexmit %#lux %d/%lud\n", tcb->snd.una, tcb->timer.start, NOW); tcpsettimer(tcb); tcpcongestion(tcb); @@ -2869,6 +2886,7 @@ procsyn(Conv *s, Tcp *seg) { Tcpctl *tcb; + Tcppriv *tpriv; tcb = (Tcpctl*)s->ptcl; tcb->flags |= FORCE; @@ -2878,8 +2896,11 @@ tcb->irs = seg->seq; /* our sending max segment size cannot be bigger than what he asked for */ - if(seg->mss != 0 && seg->mss < tcb->mss) + if(seg->mss != 0 && seg->mss < tcb->mss) { tcb->mss = seg->mss; + tpriv = s->p->priv; + tpriv->stats[Mss] = tcb->mss; + } tcb->snd.wnd = seg->wnd; @@ -2893,14 +2914,39 @@ } static int -addreseq(Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length) +dumpreseq(Tcpctl *tcb) +{ + Reseq *r, *next; + + for(r = tcb->reseq; r != nil; r = next){ + next = r->next; + freeblist(r->bp); + free(r); + } + tcb->reseq = nil; + tcb->nreseq = 0; + tcb->reseqlen = 0; + return -1; +} + +static void +logreseq(Fs *f, Reseq *r) +{ + for(; r != nil; r = r->next){ + netlog(f, Logtcp, "%#lud %ud %#lud %#ux\n", r->seg.seq, r->seg.len, + r->seg.ack, r->seg.flags); + } +} + +static int +addreseq(Fs *f, Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length) { - Reseq *rp, *rp1; - int i, rqlen, qmax; + Reseq *rp, **rr; + int qmax; rp = malloc(sizeof(Reseq)); if(rp == nil){ - freeblist(bp); /* bp always consumed by add_reseq */ + freeblist(bp); /* bp always consumed by addreseq */ return 0; } @@ -2908,54 +2954,35 @@ rp->bp = bp; rp->length = length; - /* Place on reassembly list sorting by starting seq number */ - rp1 = tcb->reseq; - if(rp1 == nil || seq_lt(seg->seq, rp1->seg.seq)) { - rp->next = rp1; - tcb->reseq = rp; - if(rp->next != nil) - tpriv->stats[OutOfOrder]++; - return 0; - } + tcb->reseqlen += length; + tcb->nreseq++; - rqlen = 0; - for(i = 0;; i++) { - rqlen += rp1->length; - if(rp1->next == nil || seq_lt(seg->seq, rp1->next->seg.seq)) { - rp->next = rp1->next; - rp1->next = rp; + /* Place on reassembly list sorting by starting seq number */ + for(rr = &tcb->reseq;; rr = &(*rr)->next) + if(*rr == nil || seq_lt(seg->seq, (*rr)->seg.seq)){ + rp->next = *rr; + *rr = rp; + tpriv->stats[Resequenced]++; if(rp->next != nil) tpriv->stats[OutOfOrder]++; break; } - rp1 = rp1->next; - } - qmax = QMAX<rcv.scale; - if(rqlen > qmax){ - print("resequence queue > window: %d > %d\n", rqlen, qmax); - i = 0; - for(rp1 = tcb->reseq; rp1 != nil; rp1 = rp1->next){ - print("%#lux %#lux %#ux\n", rp1->seg.seq, - rp1->seg.ack, rp1->seg.flags); - if(i++ > 10){ - print("...\n"); - break; - } - } - - /* - * delete entire reassembly queue; wait for retransmit. - * - should we be smarter and only delete the tail? - */ - for(rp = tcb->reseq; rp != nil; rp = rp1){ - rp1 = rp->next; - freeblist(rp->bp); - free(rp); - } - tcb->reseq = nil; - return -1; + qmax = QMAX<rcv.scale; + if(tcb->reseqlen > qmax){ + netlog(f, Logtcp, "tcp: reseq: queue > window: %d > %d; %d packets\n", tcb->reseqlen, qmax, tcb->nreseq); + logreseq(f, tcb->reseq); + tpriv->stats[ReseqBytelim]++; + return dumpreseq(tcb); + } + qmax = 15*(tcb->rcv.scale + 1); + if(tcb->nreseq > qmax){ + netlog(f, Logtcp, "resequence queue > packets: %d %d; %d bytes\n", tcb->nreseq, qmax, tcb->reseqlen); + logreseq(f, tcb->reseq); + tpriv->stats[ReseqPktlim]++; + return dumpreseq(tcb); } + return 0; } @@ -2974,6 +3001,9 @@ *bp = rp->bp; *length = rp->length; + tcb->nreseq--; + tcb->reseqlen -= rp->length; + free(rp); } @@ -3139,7 +3169,7 @@ p = buf; e = p+len; for(i = 0; i < Nstats; i++) - p = seprint(p, e, "%s: %lud\n", statnames[i], priv->stats[i]); + p = seprint(p, e, "%s: %llud\n", statnames[i], priv->stats[i]); return p - buf; } @@ -3242,14 +3272,11 @@ tcb->window = QMAX<snd.scale; tcb->ssthresh = tcb->window; qsetlimit(s->rq, tcb->window); - qsetlimit(s->wq, tcpwqsize(tcb->window)); } else { tcb->rcv.scale = 0; tcb->snd.scale = 0; tcb->window = QMAX; tcb->ssthresh = tcb->window; qsetlimit(s->rq, tcb->window); - qsetlimit(s->wq, tcpwqsize(tcb->window)); } } -