# HG changeset patch # User Erik Quanstrom # Date 1329923420 -3600 # Node ID e73da93bb1db6c161c6c5b71845f247097f63666 # Parent c7fcc2202d7c9e80d233174f75c66fabdfe92d55 netif: support mtu limits of hardware, specificly for ethernet jumbograms allow the mtu of an ethernet interface to be set between the smallest and largest allowable mtu and optionally call the hardware to do some mtu-related configuration. this is important to get good performance with low cpu usage on bulk transfer protocols such as ATA-over-Ethernet (AoE). this patch should be fully detailed in ether(3). cavet: this patch doesn't change the ip stack. path mtu discovery is not implemented. R=nixiedev, john CC=nix-dev http://codereview.appspot.com/5674045 Committer: Francisco J Ballesteros diff -r c7fcc2202d7c -r e73da93bb1db sys/man/3/ether --- a/sys/man/3/ether Wed Feb 22 16:09:39 2012 +0100 +++ b/sys/man/3/ether Wed Feb 22 16:10:20 2012 +0100 @@ -7,6 +7,7 @@ .BI /net/ether n /clone .BI /net/ether n /addr +.BI /net/ether n /mtu .BI /net/ether n /ifstats .BI /net/ether n /stats .BI /net/ether n /[0-7] @@ -24,6 +25,7 @@ and .BR clone , .BR addr , +.BR mtu , .BR ifstats , and .B stats @@ -35,7 +37,16 @@ Reading .B addr returns the MAC address of this interface in hex with no punctuation -and no trailing newline. +and no trailing newline. Reading +.B mtu +returns 3 12-byte numbers: the hardware's minimum MTU, the current +MTU and the maximum MTU supported by the interface. The MTU +is set by writing +.B mtu +.I n +into the +.B ctl +file. The number .I n (optional in the bind) @@ -54,7 +65,9 @@ .B data file causes a packet to be sent. The Ethernet address of the interface is inserted into -the packet header as the source address. +the packet header as the source address unless +.B l2bridge +is set. .PP A connection is assigned to a packet type by opening its .B ctl @@ -79,6 +92,26 @@ .I wavelan in .IR plan9.ini (8). 
+The control messages described in +.IR ip (3) +under +.B "Configuring interfaces" +from +.L bridge +to +.L headersonly +are understood. +The additional control message +.L nonblocking +makes +.I write +system calls to this interface non-blocking iff +followed by nothing or a non-zero integer; +a following +.L 0 +makes +.I writes +block on a full output queue. .PP Reading the .B ctl @@ -92,11 +125,39 @@ card and general statistics, independent of the interface; .B ifstats contains device-specific data and statistics about the card. +The format of the +.B stats +file is +.IB stat : value ...\fR. +In particular, +if +.B link +is non-zero, +.B mbps +is the current speed of the interface in +megabits-per-second. While +.B in +and +.B out +are the total number of packets input or output +and do differ (in the case of dropped packets or +loop back) from hardware counters. .PP An interface normally receives only those packets whose -destination address is that of the interface or is the +destination address is that of the interface, the broadcast address, -.BR ff:ff:ff:ff:ff:ff . +.BR ff:ff:ff:ff:ff:ff , +or a multicast address assigned to the interface. +Multicast addresses are added by writing +.B addmulti +.I ea +and removed by writing +.B remmulti +.I ea +to the +.B ctl +file. Multicast addresses are automatically dropped when +the connection is closed. The interface can be made to receive all packets on the network by writing the string .B promiscuous @@ -104,8 +165,49 @@ .B ctl file. The interface remains promiscuous until the control file is -closed. -The extra packets are passed up connections only of types \-1 -and \-2. +closed by all connections requesting promiscuous mode. +The extra packets are passed up connections of types \-1 +and \-2 only. +.PP +Writing +.B bridge +to the +.B ctl +file causes loop back packets to be ignored. This is useful for +layer 3 bridging.
Writing +.B l2bridge +also allows the connection to set the source address and +receive packets with any destination address. +Writing +.B scanbs +[ +.I secs +] +starts a scan for wireless base stations and sets the +scanning interval to +.IR secs. +Scanning is terminated when the connection is closed. +The default interval is 5s. +For wired connections, this command is ignored. +.PP +Writing +.B headersonly +causes the connection to return only the first +58 bytes of the packet followed by a two-byte +length and a four-byte time in machine ticks. +Both numbers are in big-endian format. .SH SOURCE .B /sys/src/9/*/devether.c +.br +.B /sys/src/9/port/netif.c +.SH "SEE ALSO" +.IR ip (3). +.SH BUGS +The multicast interface makes removal of multicast +addresses on hardware that relies on hashing difficult. +.PP +Interface MTU settings must be greater than those +used by +.IR ip (3); +this is not enforced. NB: interface MTU is different +than IP stack's MTU. diff -r c7fcc2202d7c -r e73da93bb1db sys/src/nix/386/devether.c --- a/sys/src/nix/386/devether.c Wed Feb 22 16:09:39 2012 +0100 +++ b/sys/src/nix/386/devether.c Wed Feb 22 16:10:20 2012 +0100 @@ -3,8 +3,8 @@ #include "mem.h" #include "dat.h" #include "fns.h" +#include "io.h" #include "../port/error.h" - #include "../port/netif.h" #include "etherif.h" @@ -14,16 +14,14 @@ Chan* etherattach(char* spec) { - int ctlrno; + ulong ctlrno; char *p; Chan *chan; ctlrno = 0; if(spec && *spec){ ctlrno = strtoul(spec, &p, 0); - if((ctlrno == 0 && p == spec) || *p != 0) - error(Ebadarg); - if(ctlrno < 0 || ctlrno >= MaxEther) + if((ctlrno == 0 && p == spec) || *p || (ctlrno >= MaxEther)) error(Ebadarg); } if(etherxx[ctlrno] == 0) @@ -148,7 +146,7 @@ ep = &ether->f[Ntypes]; multi = pkt->d[0] & 1; - /* check for valid multcast addresses */ + /* check for valid multicast addresses */ if(multi && memcmp(pkt->d, ether->bcast, sizeof(pkt->d)) != 0 && ether->prom == 0){ if(!activemulti(ether, pkt->d, sizeof(pkt->d))){ if(fromwire){ @@ -172,7 +170,7 @@
for(fp = ether->f; fp < ep; fp++){ if(f = *fp) if(f->type == type || f->type < 0) - if(tome || multi || f->prom){ + if(tome || multi || f->prom || f->bridge & 2){ /* Don't want to hear bridged packets */ if(f->bridge && !fromwire && !fromme) continue; @@ -266,13 +264,13 @@ return n; } free(cb); - if(ether->ctl!=nil) - return ether->ctl(ether,buf,n); + if(ether->ctl != nil) + return ether->ctl(ether, buf, n); error(Ebadctl); } - if(n > ether->maxmtu) + if(n > ether->mtu) error(Etoobig); if(n < ether->minmtu) error(Etoosmall); @@ -283,7 +281,8 @@ nexterror(); } memmove(bp->rp, buf, n); - memmove(bp->rp+Eaddrlen, ether->ea, Eaddrlen); + if((ether->f[NETID(chan->qid.path)]->bridge & 2) == 0) + memmove(bp->rp+Eaddrlen, ether->ea, Eaddrlen); poperror(); bp->wp += n; @@ -309,7 +308,7 @@ } ether = etherxx[chan->devno]; - if(n > ether->maxmtu){ + if(n > ether->mtu){ freeb(bp); error(Etoobig); } @@ -364,16 +363,17 @@ static Ether* etherprobe(int cardno, int ctlrno) { - int i; + int i, j; Ether *ether; char buf[128], name[32]; ether = malloc(sizeof(Ether)); memset(ether, 0, sizeof(Ether)); ether->ctlrno = ctlrno; - ether->tbdf = -1; + ether->tbdf = BUSUNKNOWN; ether->mbps = 10; ether->minmtu = ETHERMINTU; + ether->mtu = ETHERMAXTU; ether->maxmtu = ETHERMAXTU; if(cardno < 0){ @@ -415,38 +415,31 @@ /* * If ether->irq is <0, it is a hack to indicate no interrupt * used by ethersink. - * Or perhaps the driver has some other way to configure - * interrups for intself, e.g. HyperTransport MSI. 
*/ if(ether->irq >= 0) intrenable(ether->irq, ether->interrupt, ether, ether->tbdf, name); - i = sprint(buf, "#l%d: %s: %dMbps port %#p irq %d", - ctlrno, cards[cardno].type, ether->mbps, ether->port, ether->irq); + i = sprint(buf, "#l%d: %s: %dMbps port %#p irq %d tu %d", + ctlrno, cards[cardno].type, ether->mbps, ether->port, ether->irq, ether->mtu); if(ether->mem) i += sprint(buf+i, " addr %#p", ether->mem); if(ether->size) - i += sprint(buf+i, " size %ld", ether->size); + i += sprint(buf+i, " size 0x%luX", ether->size); i += sprint(buf+i, ": %2.2ux%2.2ux%2.2ux%2.2ux%2.2ux%2.2ux", ether->ea[0], ether->ea[1], ether->ea[2], ether->ea[3], ether->ea[4], ether->ea[5]); sprint(buf+i, "\n"); print(buf); - if (ether->mbps >= 1000) { - netifinit(ether, name, Ntypes, 512*1024); - if(ether->oq == 0) - ether->oq = qopen(512*1024, Qmsg, 0, 0); - } else if(ether->mbps >= 100){ - netifinit(ether, name, Ntypes, 256*1024); - if(ether->oq == 0) - ether->oq = qopen(256*1024, Qmsg, 0, 0); - } - else{ - netifinit(ether, name, Ntypes, 128*1024); - if(ether->oq == 0) - ether->oq = qopen(128*1024, Qmsg, 0, 0); - } + j = ether->mbps; + if(j > 1000) + j *= 10; + for(i = 0; j >= 100; i++) + j /= 10; + i = (128<<i)*1024; + netifinit(ether, name, Ntypes, i); + if(ether->oq == 0) + ether->oq = qopen(i, Qmsg, 0, 0); if(ether->oq == 0) panic("etherreset %s", name); ether->alen = Eaddrlen; @@ -489,17 +482,22 @@ static void ethershutdown(void) { + char name[32]; + int i; Ether *ether; - int i; for(i = 0; i < MaxEther; i++){ ether = etherxx[i]; if(ether == nil) continue; if(ether->shutdown == nil) { - print("#l%d: no shutdown fuction\n", i); + print("#l%d: no shutdown function\n", i); continue; } + snprint(name, sizeof(name), "ether%d", i); + if(ether->irq >= 0){ + // intrdisable(ether->irq, ether->interrupt, ether, ether->tbdf, name); + } (*ether->shutdown)(ether); } } diff -r c7fcc2202d7c -r e73da93bb1db sys/src/nix/k10/etherif.h --- a/sys/src/nix/k10/etherif.h Wed Feb 22 16:09:39 2012 +0100 +++ b/sys/src/nix/k10/etherif.h Wed Feb 22 16:10:20 2012
+0100 @@ -15,8 +15,6 @@ int ctlrno; int tbdf; /* type+busno+devno+funcno */ - int minmtu; - int maxmtu; uchar ea[Eaddrlen]; void (*attach)(Ether*); /* filled in by reset routine */ diff -r c7fcc2202d7c -r e73da93bb1db sys/src/nix/port/netif.c --- a/sys/src/nix/port/netif.c Wed Feb 22 16:09:39 2012 +0100 +++ b/sys/src/nix/port/netif.c Wed Feb 22 16:10:20 2012 +0100 @@ -67,7 +67,8 @@ /* second level contains clone plus all the conversations */ t = NETTYPE(c->qid.path); - if(t == N2ndqid || t == Ncloneqid || t == Naddrqid){ + if(t == N2ndqid || t == Ncloneqid || t == Naddrqid + || t == Nstatqid || t == Nifstatqid || t == Nmtuqid){ switch(i) { case DEVDOTDOT: q.type = QTDIR; @@ -90,15 +91,19 @@ q.path = Nifstatqid; devdir(c, q, "ifstats", 0, eve, 0444, dp); break; + case 4: + q.path = Nmtuqid; + devdir(c, q, "mtu", 0, eve, 0444, dp); + break; default: - i -= 4; + i -= 5; if(i >= nif->nfile) return -1; if(nif->f[i] == 0) return 0; q.type = QTDIR; q.path = NETQID(i, N3rdqid); - sprint(up->genbuf, "%d", i); + snprint(up->genbuf, sizeof up->genbuf, "%d", i); devdir(c, q, up->genbuf, 0, eve, DMDIR|0555, dp); break; } @@ -184,8 +189,10 @@ case Ndataqid: case Nctlqid: f = nif->f[id]; - if(netown(f, up->user, omode&7) < 0) + if(netown(f, up->user, omode&7) < 0){ + netifclose(nif, c); error(Eperm); + } break; } } @@ -216,15 +223,17 @@ return readnum(offset, a, n, NETID(c->qid.path), NUMSIZE); case Nstatqid: p = malloc(READSTR); - j = snprint(p, READSTR, "in: %d\n", nif->inpackets); + if(p == nil) + error(Enomem); + j = snprint(p, READSTR, "in: %llud\n", nif->inpackets); j += snprint(p+j, READSTR-j, "link: %d\n", nif->link); - j += snprint(p+j, READSTR-j, "out: %d\n", nif->outpackets); - j += snprint(p+j, READSTR-j, "crc errs: %d\n", nif->crcs); - j += snprint(p+j, READSTR-j, "overflows: %d\n", nif->overflows); - j += snprint(p+j, READSTR-j, "soft overflows: %d\n", nif->soverflows); - j += snprint(p+j, READSTR-j, "framing errs: %d\n", nif->frames); - j += snprint(p+j, READSTR-j, 
"buffer errs: %d\n", nif->buffs); - j += snprint(p+j, READSTR-j, "output errs: %d\n", nif->oerrs); + j += snprint(p+j, READSTR-j, "out: %llud\n", nif->outpackets); + j += snprint(p+j, READSTR-j, "crc errs: %llud\n", nif->crcs); + j += snprint(p+j, READSTR-j, "overflows: %llud\n", nif->overflows); + j += snprint(p+j, READSTR-j, "soft overflows: %llud\n", nif->soverflows); + j += snprint(p+j, READSTR-j, "framing errs: %llud\n", nif->frames); + j += snprint(p+j, READSTR-j, "buffer errs: %llud\n", nif->buffs); + j += snprint(p+j, READSTR-j, "output errs: %llud\n", nif->oerrs); j += snprint(p+j, READSTR-j, "prom: %d\n", nif->prom); j += snprint(p+j, READSTR-j, "mbps: %d\n", nif->mbps); j += snprint(p+j, READSTR-j, "addr: "); @@ -247,6 +256,9 @@ return readnum(offset, a, n, f->type, NUMSIZE); case Nifstatqid: return 0; + case Nmtuqid: + snprint(up->genbuf, sizeof up->genbuf, "%11.ud %11.ud %11.ud\n", nif->minmtu, nif->mtu, nif->maxmtu); + return readstr(offset, a, n, up->genbuf); } error(Ebadarg); return -1; /* not reached */ @@ -290,7 +302,7 @@ netifwrite(Netif *nif, Chan *c, void *a, long n) { Netfile *f; - int type; + int type, mtu; char *p, buf[64]; uchar binaddr[Nmaxaddr]; @@ -310,12 +322,14 @@ qlock(nif); f = nif->f[NETID(c->qid.path)]; if((p = matchtoken(buf, "connect")) != 0){ + qclose(f->iq); type = atoi(p); if(typeinuse(nif, type)) error(Einuse); f->type = type; if(f->type < 0) nif->all++; + qreopen(f->iq); } else if(matchtoken(buf, "promiscuous")){ if(f->prom == 0){ if(nif->prom == 0 && nif->promiscuous != nil) @@ -334,8 +348,23 @@ f->scan = type; nif->scan++; } + } else if((p = matchtoken(buf, "mtu")) != 0){ + /* poor planning. */ + if(!iseve()) + error(Eperm); + mtu = atoi(p); + /* zero resets default. 
*/ + if(mtu != 0) + if(mtu < nif->minmtu || mtu > nif->maxmtu) + error(Ebadarg); + if(nif->hwmtu) + nif->mtu = nif->hwmtu(nif->arg, mtu); + else + nif->mtu = mtu; + } else if(matchtoken(buf, "l2bridge")){ + f->bridge |= 2; } else if(matchtoken(buf, "bridge")){ - f->bridge = 1; + f->bridge |= 1; } else if(matchtoken(buf, "headersonly")){ f->headersonly = 1; } else if((p = matchtoken(buf, "addmulti")) != 0){ @@ -560,65 +589,6 @@ return p; } -void -hnputv(void *p, uvlong v) -{ - uchar *a; - - a = p; - hnputl(a, v>>32); - hnputl(a+4, v); -} - -void -hnputl(void *p, uint v) -{ - uchar *a; - - a = p; - a[0] = v>>24; - a[1] = v>>16; - a[2] = v>>8; - a[3] = v; -} - -void -hnputs(void *p, ushort v) -{ - uchar *a; - - a = p; - a[0] = v>>8; - a[1] = v; -} - -uvlong -nhgetv(void *p) -{ - uchar *a; - - a = p; - return ((vlong)nhgetl(a) << 32) | nhgetl(a+4); -} - -uint -nhgetl(void *p) -{ - uchar *a; - - a = p; - return (a[0]<<24)|(a[1]<<16)|(a[2]<<8)|(a[3]<<0); -} - -ushort -nhgets(void *p) -{ - uchar *a; - - a = p; - return (a[0]<<8)|(a[1]<<0); -} - static ulong hash(uchar *a, int len) { diff -r c7fcc2202d7c -r e73da93bb1db sys/src/nix/port/netif.h --- a/sys/src/nix/port/netif.h Wed Feb 22 16:09:39 2012 +0100 +++ b/sys/src/nix/port/netif.h Wed Feb 22 16:10:20 2012 +0100 @@ -16,6 +16,7 @@ Nstatqid, Ntypeqid, Nifstatqid, + Nmtuqid, }; /* @@ -75,6 +76,9 @@ int alen; /* address length */ int mbps; /* megabits per sec */ int link; /* link status */ + int minmtu; + int maxmtu; + int mtu; uchar addr[Nmaxaddr]; uchar bcast[Nmaxaddr]; Netaddr *maddr; /* known multicast addresses */ @@ -87,20 +91,21 @@ Queue* oq; /* output */ /* statistics */ - int misses; - int inpackets; - int outpackets; - int crcs; /* input crc errors */ - int oerrs; /* output errors */ - int frames; /* framing errors */ - int overflows; /* packet overflows */ - int buffs; /* buffering errors */ - int soverflows; /* software overflow */ + uvlong misses; + uvlong inpackets; + uvlong outpackets; + uvlong crcs; /* 
input crc errors */ + uvlong oerrs; /* output errors */ + uvlong frames; /* framing errors */ + uvlong overflows; /* packet overflows */ + uvlong buffs; /* buffering errors */ + uvlong soverflows; /* software overflow */ /* routines for touching the hardware */ void *arg; void (*promiscuous)(void*, int); void (*multicast)(void*, uchar*, int); + int (*hwmtu)(void*, int); /* get/set mtu */ void (*scanbs)(void*, uint); /* scan for base stations */ };