--- /sys/man/1/wc Mon Dec 3 00:42:16 2007 +++ /sys/man/1/wc Wed Apr 24 00:12:04 2013 @@ -37,13 +37,6 @@ .SH SOURCE .B /sys/src/cmd/wc.c .SH BUGS -The Unicode Standard has many blank characters scattered through it, -but -.I wc -looks for only -.SM ASCII -space, tab and newline. -.PP .I Wc should have options to count suboptimal .SM UTF --- /sys/src/9/pc/sdiahci.c Tue Apr 9 16:20:30 2013 +++ /sys/src/9/pc/sdiahci.c Tue Apr 23 21:39:09 2013 @@ -1938,7 +1938,8 @@ /* * 0x27c4 is the intel 82801 in compatibility (not sata) mode. */ - if (p->did == 0x24d1 || /* 82801eb/er */ + if (p->did == 0x1e02 || /* c210 */ + p->did == 0x24d1 || /* 82801eb/er */ (p->did & 0xfffb) == 0x27c1 || /* 82801g[bh]m ich7 */ p->did == 0x2821 || /* 82801h[roh] */ (p->did & 0xfffe) == 0x2824 || /* 82801h[b] */ --- /sys/src/9/port/fpi.c Mon Mar 25 21:09:58 2013 +++ /sys/src/9/port/fpi.c Tue Apr 23 21:37:04 2013 @@ -291,6 +291,8 @@ return 0; if(IsInfinity(x) && IsInfinity(y)) return y->s - x->s; + if(IsZero(x) && IsZero(y)) + return 0; if(x->e == y->e && x->h == y->h && x->l == y->l) return y->s - x->s; if(x->e < y->e --- /sys/src/9/port/portdat.h Mon Apr 8 20:18:26 2013 +++ /sys/src/9/port/portdat.h Fri Apr 19 07:15:05 2013 @@ -989,7 +989,7 @@ Qmsg = (1<<1), /* message stream */ Qclosed = (1<<2), /* queue has been closed/hungup */ Qflow = (1<<3), /* producer flow controlled */ - Qcoalesce = (1<<4), /* coallesce packets on read */ + Qcoalesce = (1<<4), /* coalesce packets on read */ Qkick = (1<<5), /* always call the kick routine after qwrite */ }; --- /sys/src/cmd/acme/regx.c Sat Jan 12 22:22:05 2008 +++ /sys/src/cmd/acme/regx.c Wed Apr 24 01:05:49 2013 @@ -20,7 +20,7 @@ typedef struct Inst Inst; struct Inst { - uint type; /* < 0x10000 ==> literal, otherwise action */ + uint type; /* <= Runemax+1 ==> literal, otherwise action */ union { int sid; int subid; @@ -61,25 +61,28 @@ * 0x100xx are operators, value == precedence * 0x200xx are tokens, i.e. operands for operators */ -#define OPERATOR 0x10000 /* Bitmask of all operators */ -#define START 0x10000 /* Start, used for marker on stack */ -#define RBRA 0x10001 /* Right bracket, ) */ -#define LBRA 0x10002 /* Left bracket, ( */ -#define OR 0x10003 /* Alternation, | */ -#define CAT 0x10004 /* Concatentation, implicit operator */ -#define STAR 0x10005 /* Closure, * */ -#define PLUS 0x10006 /* a+ == aa* */ -#define QUEST 0x10007 /* a? == a|nothing, i.e. 0 or 1 a's */ -#define ANY 0x20000 /* Any character but newline, . */ -#define NOP 0x20001 /* No operation, internal use only */ -#define BOL 0x20002 /* Beginning of line, ^ */ -#define EOL 0x20003 /* End of line, $ */ -#define CCLASS 0x20004 /* Character class, [] */ -#define NCCLASS 0x20005 /* Negated character class, [^] */ -#define END 0x20077 /* Terminate: match found */ +enum { + OPERATOR = Runemask+1, /* Bitmask of all operators */ + START = OPERATOR, /* Start, used for marker on stack */ + RBRA, /* Right bracket, ) */ + LBRA, /* Left bracket, ( */ + OR, /* Alternation, | */ + CAT, /* Concatentation, implicit operator */ + STAR, /* Closure, * */ + PLUS, /* a+ == aa* */ + QUEST, /* a? == a|nothing, i.e. 0 or 1 a's */ + + ANY = OPERATOR<<1, /* Any character but newline, . */ + NOP, /* No operation, internal use only */ + BOL, /* Beginning of line, ^ */ + EOL, /* End of line, $ */ + CCLASS, /* Character class, [] */ + NCCLASS, /* Negated character class, [^] */ + END, /* Terminate: match found */ -#define ISATOR 0x10000 -#define ISAND 0x20000 + ISATOR = OPERATOR, + ISAND = OPERATOR<<1, +}; /* * Parser Information @@ -452,7 +455,7 @@ exprp++; return '\n'; } - return *exprp++|0x10000; + return *exprp++|(Runemax+1); } return *exprp++; } @@ -487,7 +490,7 @@ exprp++; /* eat '-' */ if((c2 = nextrec()) == ']') goto Error; - classp[n+0] = 0xFFFF; + classp[n+0] = Runemax; classp[n+1] = c1; classp[n+2] = c2; n += 3; @@ -509,7 +512,7 @@ p = class[classno]; while(*p){ - if(*p == 0xFFFF){ + if(*p == Runemax){ if(p[1]<=c && c<=p[2]) return !negate; p += 3; --- /sys/src/cmd/cc/cc.h Mon Mar 4 22:15:21 2013 +++ /sys/src/cmd/cc/cc.h Wed Apr 24 01:19:17 2013 @@ -20,6 +20,8 @@ typedef struct Init Init; typedef struct Bits Bits; +typedef Rune TRune; /* target system type */ + #define NHUNK 50000L #define BUFSIZ 8192 #define NSYMB 1500 @@ -51,7 +53,7 @@ double fconst; /* fp constant */ vlong vconst; /* non fp const */ char* cstring; /* character string */ - ushort* rstring; /* rune string */ + TRune* rstring; /* rune string */ Sym* sym; Type* type; @@ -336,6 +338,9 @@ TFILE, TOLD, NALLTYPES, + + /* adapt size of Rune to target system's size */ + TRUNE = sizeof(TRune)==4? TUINT: TUSHORT, }; enum { @@ -740,7 +745,7 @@ void gextern(Sym*, Node*, long, long); void ginit(void); long outstring(char*, long); -long outlstring(ushort*, long); +long outlstring(TRune*, long); void xcom(Node*); long exreg(Type*); long align(long, Type*, int); --- /sys/src/cmd/cc/cc.y Thu Nov 17 00:34:38 2011 +++ /sys/src/cmd/cc/cc.y Wed Apr 24 01:19:24 2013 @@ -855,9 +855,9 @@ LLSTRING { $$ = new(OLSTRING, Z, Z); - $$->type = typ(TARRAY, types[TUSHORT]); - $$->type->width = $1.l + sizeof(ushort); - $$->rstring = (ushort*)$1.s; + $$->type = typ(TARRAY, types[TRUNE]); + $$->type->width = $1.l + sizeof(TRune); + $$->rstring = (TRune*)$1.s; $$->sym = symstring; $$->etype = TARRAY; $$->class = CSTATIC; @@ -867,16 +867,16 @@ char *s; int n; - n = $1->type->width - sizeof(ushort); + n = $1->type->width - sizeof(TRune); s = alloc(n+$2.l+MAXALIGN); memcpy(s, $1->rstring, n); memcpy(s+n, $2.s, $2.l); - *(ushort*)(s+n+$2.l) = 0; + *(TRune*)(s+n+$2.l) = 0; $$ = $1; $$->type->width += $2.l; - $$->rstring = (ushort*)s; + $$->rstring = (TRune*)s; } zelist: --- /sys/src/cmd/cc/com.c Thu Nov 17 00:39:03 2011 +++ /sys/src/cmd/cc/com.c Wed Apr 24 01:19:44 2013 @@ -67,6 +67,7 @@ Node *l, *r; Type *t; int o; + static TRune zer; if(n == Z) { diag(Z, "Z in tcom"); @@ -633,10 +634,10 @@ break; case OLSTRING: - if(n->type->link != types[TUSHORT]) { + if(n->type->link != types[TRUNE]) { o = outstring(0, 0); while(o & 3) { - outlstring(L"", sizeof(ushort)); + outlstring(&zer, sizeof(TRune)); o = outlstring(0, 0); } } --- /sys/src/cmd/cc/dcl.c Thu Nov 17 00:40:04 2011 +++ /sys/src/cmd/cc/dcl.c Wed Apr 24 01:19:44 2013 @@ -232,7 +232,7 @@ a->cstring++; } if(a->op == OLSTRING) { - b->vconst = convvtox(*a->rstring, TUSHORT); + b->vconst = convvtox(*a->rstring, TRUNE); a->rstring++; } a->type->width -= b->type->width; --- /sys/src/cmd/cc/lex.c Wed Oct 3 17:56:45 2012 +++ /sys/src/cmd/cc/lex.c Wed Apr 24 01:19:35 2013 @@ -80,7 +80,8 @@ case 'I': p = ARGF(); - setinclude(p); + if(p) + setinclude(p); break; } ARGEND if(argc < 1 && outfile == 0) { @@ -465,7 +466,7 @@ yyerror("missing '"); peekc = c1; } - yylval.vval = convvtox(c, TUSHORT); + yylval.vval = convvtox(c, TRUNE); return LUCONST; } if(c == '"') { @@ -539,15 +540,15 @@ c = escchar('"', 1, 0); if(c == EOF) break; - cp = allocn(cp, c1, sizeof(ushort)); - *(ushort*)(cp + c1) = c; - c1 += sizeof(ushort); + cp = allocn(cp, c1, sizeof(TRune)); + *(TRune*)(cp + c1) = c; + c1 += sizeof(TRune); } yylval.sval.l = c1; do { - cp = allocn(cp, c1, sizeof(ushort)); - *(ushort*)(cp + c1) = 0; - c1 += sizeof(ushort); + cp = allocn(cp, c1, sizeof(TRune)); + *(TRune*)(cp + c1) = 0; + c1 += sizeof(TRune); } while(c1 & MAXALIGN); yylval.sval.s = cp; return LLSTRING; @@ -1025,7 +1026,7 @@ } else c = GETC(); for(;;) { - if(!isspace(c)) + if(c >= Runeself || !isspace(c)) return c; if(c == '\n') { lineno++; --- /sys/src/cmd/cc/pswt.c Wed Mar 13 20:30:01 2013 +++ /sys/src/cmd/cc/pswt.c Wed Apr 24 01:19:31 2013 @@ -132,28 +132,29 @@ } long -outlstring(ushort *s, long n) +outlstring(TRune *s, long n) { - char buf[2]; - int c; + char buf[sizeof(TRune)]; + uint c; + int i; long r; if(suppress) return nstring; - while(nstring & 1) + while(nstring & (sizeof(TRune)-1)) outstring("", 1); r = nstring; while(n > 0) { c = *s++; if(align(0, types[TCHAR], Aarg1)) { - buf[0] = c>>8; - buf[1] = c; + for(i = 0; i < sizeof(TRune); i++) + buf[i] = c>>(8*(sizeof(TRune) - i - 1)); } else { - buf[0] = c; - buf[1] = c>>8; + for(i = 0; i < sizeof(TRune); i++) + buf[i] = c>>(8*i); } - outstring(buf, 2); - n -= sizeof(ushort); + outstring(buf, sizeof(TRune)); + n -= sizeof(TRune); } return r; } --- /sys/src/cmd/cc/sub.c Mon Mar 4 22:27:25 2013 +++ /sys/src/cmd/cc/sub.c Wed Apr 24 01:19:31 2013 @@ -85,7 +85,10 @@ break; case OLSTRING: - print(" \"%S\"", n->rstring); + if(sizeof(TRune) == sizeof(Rune)) + print(" \"%S\"", (Rune*)n->rstring); + else + print(" \"...\""); i = 0; break; --- /sys/src/cmd/ed.c Fri Oct 15 22:53:45 2010 +++ /sys/src/cmd/ed.c Wed Apr 24 01:16:32 2013 @@ -15,7 +15,7 @@ ESIZE = 256, /* max size of reg exp */ GBSIZE = 256, /* max size of global command */ MAXSUB = 9, /* max number of sub reg exp */ - ESCFLG = 0xFFFF, /* escape Rune - user defined code */ + ESCFLG = Runemax, /* escape Rune - user defined code */ EOF = -1, }; @@ -54,7 +54,7 @@ int peekc; int pflag; int rescuing; -Rune rhsbuf[LBSIZE/2]; +Rune rhsbuf[LBSIZE/sizeof(Rune)]; char savedfile[FNSIZE]; jmp_buf savej; int subnewa; @@ -735,7 +735,7 @@ if(c == 0) continue; *p++ = c; - if(p >= &linebuf[LBSIZE-2]) + if(p >= &linebuf[LBSIZE-sizeof(Rune)]) error(Q); } } @@ -988,11 +988,12 @@ lp = linebuf; bp = getblock(tl, OREAD); nl = nleft; - tl &= ~((BLKSIZE/2) - 1); + tl &= ~((BLKSIZE/sizeof(Rune)) - 1); while(*lp++ = *bp++) { nl -= sizeof(Rune); if(nl == 0) { - bp = getblock(tl += BLKSIZE/2, OREAD); + tl += BLKSIZE/sizeof(Rune); + bp = getblock(tl, OREAD); nl = nleft; } } @@ -1010,7 +1011,7 @@ tl = tline; bp = getblock(tl, OWRITE); nl = nleft; - tl &= ~((BLKSIZE/2)-1); + tl &= ~((BLKSIZE/sizeof(Rune))-1); while(*bp = *lp++) { if(*bp++ == '\n') { bp[-1] = 0; @@ -1019,7 +1020,7 @@ } nl -= sizeof(Rune); if(nl == 0) { - tl += BLKSIZE/2; + tl += BLKSIZE/sizeof(Rune); bp = getblock(tl, OWRITE); nl = nleft; } @@ -1046,8 +1047,9 @@ static uchar ibuff[BLKSIZE]; static uchar obuff[BLKSIZE]; - bno = atl / (BLKSIZE/2); - off = (atl<<1) & (BLKSIZE-1) & ~03; + bno = atl / (BLKSIZE/sizeof(Rune)); + /* &~3 so the ptr is aligned to 4 (?) */ + off = (atl*sizeof(Rune)) & (BLKSIZE-1) & ~3; if(bno >= NBLK) { lastc = '\n'; error(T); @@ -1160,7 +1162,7 @@ for(a1=addr1; a1<=addr2; a1++) { lp = getline(*a1); while(*gp = *lp++) - if(gp++ >= &genbuf[LBSIZE-2]) + if(gp++ >= &genbuf[LBSIZE-sizeof(Rune)]) error(Q); } lp = linebuf; @@ -1238,7 +1240,7 @@ if(c == '\\') { c = getchr(); *p++ = ESCFLG; - if(p >= &rhsbuf[LBSIZE/2]) + if(p >= &rhsbuf[LBSIZE/sizeof(Rune)]) error(Q); } else if(c == '\n' && (!globp || !globp[0])) { @@ -1249,7 +1251,7 @@ if(c == seof) break; *p++ = c; - if(p >= &rhsbuf[LBSIZE/2]) + if(p >= &rhsbuf[LBSIZE/sizeof(Rune)]) error(Q); } *p = 0; --- /sys/src/cmd/file.c Mon Jul 30 20:11:53 2012 +++ /sys/src/cmd/file.c Wed Apr 24 01:06:27 2013 @@ -267,64 +267,10 @@ close(fd); } -/* - * Unicode 4.0 4-byte runes. - */ -typedef int Rune1; - -enum { - UTFmax1 = 4, -}; - -int -fullrune1(char *p, int n) -{ - int c; - - if(n >= 1) { - c = *(uchar*)p; - if(c < 0x80) - return 1; - if(n >= 2 && c < 0xE0) - return 1; - if(n >= 3 && c < 0xF0) - return 1; - if(n >= 4) - return 1; - } - return 0; -} - -int -chartorune1(Rune1 *rune, char *str) -{ - int c, c1, c2, c3, n; - Rune r; - - c = *(uchar*)str; - if(c < 0xF0){ - r = 0; - n = chartorune(&r, str); - *rune = r; - return n; - } - c &= ~0xF0; - c1 = *(uchar*)(str+1) & ~0x80; - c2 = *(uchar*)(str+2) & ~0x80; - c3 = *(uchar*)(str+3) & ~0x80; - n = (c<<18) | (c1<<12) | (c2<<6) | c3; - if(n < 0x10000 || n > 0x10FFFF){ - *rune = Runeerror; - return 1; - } - *rune = n; - return 4; -} - void filetype(int fd) { - Rune1 r; + Rune r; int i, f, n; char *p, *eob; @@ -363,9 +309,9 @@ language[i].count = 0; eob = (char *)buf+nbuf; for(n = 0, p = (char *)buf; p < eob; n++) { - if (!fullrune1(p, eob-p) && eob-p < UTFmax1) + if (!fullrune(p, eob-p) && eob-p < UTFmax) break; - p += chartorune1(&r, p); + p += chartorune(&r, p); if (r == 0) f = Cnull; else if (r <= 0x7f) { --- /sys/src/cmd/freq.c Tue Jan 19 22:57:26 2010 +++ /sys/src/cmd/freq.c Wed Apr 24 01:16:32 2013 @@ -2,7 +2,7 @@ #include #include -uvlong count[1<<16]; +uvlong count[Runemax+1]; Biobuf bout; void usage(void); --- /sys/src/cmd/grep/comp.c Fri Jul 23 00:47:10 2010 +++ /sys/src/cmd/grep/comp.c Wed Apr 24 01:05:13 2013 @@ -275,7 +275,7 @@ x = re2or(x, rclass(ov, p[0]-1)); ov = p[1]+1; } - x = re2or(x, rclass(ov, 0xffff)); + x = re2or(x, rclass(ov, Runemask)); } else { x = rclass(p[0], p[1]); for(p+=2; *p; p+=2) --- /sys/src/cmd/grep/grep.h Thu Sep 7 23:49:48 2006 +++ /sys/src/cmd/grep/grep.h Wed Apr 24 01:05:13 2013 @@ -53,7 +53,7 @@ Caselim = 7, Nhunk = 1<<16, - Cbegin = 0x10000, + Cbegin = Runemax+1, Flshcnt = (1<<9)-1, Cflag = 1<<0, --- /sys/src/cmd/htmlroff/char.c Fri Sep 1 05:54:10 2006 +++ /sys/src/cmd/htmlroff/char.c Wed Apr 24 01:16:11 2013 @@ -16,6 +16,12 @@ if(r == '\n') return L("\n"); + if(((uint)r&~0xFFFF) != 0){ + /* The cache must grow a lot to handle them */ + fprint(2, "%s: can't handle rune '%C'\n", argv0, r); + return L("?"); + } + if(tcscache[r>>8] && tcscache[r>>8][r&0xFF]) return tcscache[r>>8][r&0xFF]; @@ -59,7 +65,7 @@ typedef struct Trtab Trtab; struct Trtab { - char t[3]; + char t[UTFmax]; Rune r; }; --- /sys/src/cmd/rc/glob.c Wed Jun 27 06:23:14 2007 +++ /sys/src/cmd/rc/glob.c Wed Apr 24 01:15:45 2013 @@ -111,25 +111,22 @@ else globsort(globv, svglobv); } + /* * Do p and q point at equal utf codes */ - int equtf(uchar *p, uchar *q) { + Rune pr, qr; if(*p!=*q) return 0; - if(twobyte(*p)) return p[1]==q[1]; - if(threebyte(*p)){ - if(p[1]!=q[1]) - return 0; - if(p[1]=='\0') - return 1; /* broken code at end of string! */ - return p[2]==q[2]; - } - return 1; + + chartorune(&pr, (char*)p); + chartorune(&qr, (char*)q); + return pr == qr; } + /* * Return a pointer to the next utf code in the string, * not jumping past nuls in broken utf codes! @@ -138,10 +135,10 @@ uchar* nextutf(uchar *p) { - if(twobyte(*p)) return p[1]=='\0'?p+1:p+2; - if(threebyte(*p)) return p[1]=='\0'?p+1:p[2]=='\0'?p+2:p+3; - return p+1; + Rune dummy; + return p + chartorune(&dummy, (char*)p); } + /* * Convert the utf code at *p to a unicode value */ @@ -149,14 +146,12 @@ int unicode(uchar *p) { - int u = *p; + Rune r; - if(twobyte(u)) - return ((u&0x1f)<<6)|(p[1]&0x3f); - if(threebyte(u)) - return (u<<12)|((p[1]&0x3f)<<6)|(p[2]&0x3f); - return u; + chartorune(&r, (char*)p); + return r; } + /* * Does the string s match the pattern p * . and .. are only matched by patterns starting with . --- /sys/src/cmd/rc/lex.c Sun Mar 25 21:38:29 2007 +++ /sys/src/cmd/rc/lex.c Wed Apr 24 01:15:45 2013 @@ -166,15 +166,25 @@ char* addutf(char *p, int c) { - p = addtok(p, c); - if(twobyte(c)) /* 2-byte escape */ - return addtok(p, advance()); - if(threebyte(c)){ /* 3-byte escape */ + uchar b, m; + int i; + + p = addtok(p, c); /* 1-byte UTF runes are special */ + if(onebyte(c)) + return p; + + m = 0xc0; + b = 0x80; + for(i=1; i < UTFmax; i++){ + if((c&m) == b) + break; p = addtok(p, advance()); - return addtok(p, advance()); + b = m; + m = (m >> 1)|0x80; } return p; } + int lastdol; /* was the last token read '$' or '$#' or '"'? */ int lastword; /* was the last token read a word or compound word terminator? */ --- /sys/src/cmd/rc/rc.h Thu Mar 28 21:42:37 2013 +++ /sys/src/cmd/rc/rc.h Wed Apr 24 01:15:45 2013 @@ -128,13 +128,12 @@ * GLOBGLOB matches GLOB */ #define GLOB ((char)0x01) + /* - * onebyte(c), twobyte(c), threebyte(c) - * Is c the first character of a one- two- or three-byte utf sequence? + * onebyte(c) + * Is c the first character of a one-byte utf sequence? */ #define onebyte(c) ((c&0x80)==0x00) -#define twobyte(c) ((c&0xe0)==0xc0) -#define threebyte(c) ((c&0xf0)==0xe0) char **argp; char **args; --- /sys/src/cmd/sam/cmd.c Sun Nov 20 02:09:35 2005 +++ /sys/src/cmd/sam/cmd.c Wed Apr 24 01:06:05 2013 @@ -71,7 +71,7 @@ inputc(void) { int n, nbuf; - char buf[3]; + char buf[UTFmax]; Rune r; Again: --- /sys/src/cmd/sam/regexp.c Sat Jan 12 22:20:47 2008 +++ /sys/src/cmd/sam/regexp.c Wed Apr 24 01:06:01 2013 @@ -9,7 +9,7 @@ struct Inst { - long type; /* < 0x10000 ==> literal, otherwise action */ + long type; /* <= Runemax ==> literal, otherwise action */ union { int rsid; int rsubid; @@ -46,7 +46,7 @@ #define NLIST 127 -Ilist *tl, *nl; /* This list, next list */ +Ilist *tl, *nl; /* This list, next list */ Ilist list[2][NLIST+1]; /* +1 for trailing null */ static Rangeset sempty; @@ -56,25 +56,28 @@ * 0x100xx are operators, value == precedence * 0x200xx are tokens, i.e. operands for operators */ -#define OPERATOR 0x10000 /* Bitmask of all operators */ -#define START 0x10000 /* Start, used for marker on stack */ -#define RBRA 0x10001 /* Right bracket, ) */ -#define LBRA 0x10002 /* Left bracket, ( */ -#define OR 0x10003 /* Alternation, | */ -#define CAT 0x10004 /* Concatentation, implicit operator */ -#define STAR 0x10005 /* Closure, * */ -#define PLUS 0x10006 /* a+ == aa* */ -#define QUEST 0x10007 /* a? == a|nothing, i.e. 0 or 1 a's */ -#define ANY 0x20000 /* Any character but newline, . */ -#define NOP 0x20001 /* No operation, internal use only */ -#define BOL 0x20002 /* Beginning of line, ^ */ -#define EOL 0x20003 /* End of line, $ */ -#define CCLASS 0x20004 /* Character class, [] */ -#define NCCLASS 0x20005 /* Negated character class, [^] */ -#define END 0x20077 /* Terminate: match found */ +enum { + OPERATOR = Runemask+1, /* Bitmask of all operators */ + START = OPERATOR, /* Start, used for marker on stack */ + RBRA, /* Right bracket, ) */ + LBRA, /* Left bracket, ( */ + OR, /* Alternation, | */ + CAT, /* Concatentation, implicit operator */ + STAR, /* Closure, * */ + PLUS, /* a+ == aa* */ + QUEST, /* a? == a|nothing, i.e. 0 or 1 a's */ + + ANY = OPERATOR<<1, /* Any character but newline, . */ + NOP, /* No operation, internal use only */ + BOL, /* Beginning of line, ^ */ + EOL, /* End of line, $ */ + CCLASS, /* Character class, [] */ + NCCLASS, /* Negated character class, [^] */ + END, /* Terminate: match found */ -#define ISATOR 0x10000 -#define ISAND 0x20000 + ISATOR = OPERATOR, + ISAND = OPERATOR<<1, +}; /* * Parser Information @@ -459,7 +462,7 @@ exprp++; return '\n'; } - return *exprp++|0x10000; + return *exprp++|(Runemax+1); } return *exprp++; } @@ -494,7 +497,7 @@ exprp++; /* eat '-' */ if((c2 = nextrec()) == ']') goto Error; - classp[n+0] = 0xFFFF; + classp[n+0] = Runemax; classp[n+1] = c1; classp[n+2] = c2; n += 3; @@ -516,7 +519,7 @@ p = class[classno]; while(*p){ - if(*p == 0xFFFF){ + if(*p == Runemax){ if(p[1]<=c && c<=p[2]) return !negate; p += 3; --- /sys/src/cmd/sed.c Thu Feb 5 05:24:16 2009 +++ /sys/src/cmd/sed.c Wed Apr 24 01:15:26 2013 @@ -623,7 +623,7 @@ while ((r = *cp++) != '\0') { if(r == '\\') { if (rhs < end) - *rhs++ = 0xFFFF; + *rhs++ = Runemax; else return 0; r = *cp++; @@ -1050,7 +1050,7 @@ sp = place(sp, loc1, loc2); continue; } - if (c == 0xFFFF && (c = *rp++) >= '1' && c < MAXSUB + '0') { + if (c == Runemax && (c = *rp++) >= '1' && c < MAXSUB + '0') { n = c-'0'; if (subexp[n].rsp && subexp[n].rep) { sp = place(sp, subexp[n].rsp, subexp[n].rep); --- /sys/src/cmd/tcs/conv.h Wed Jul 14 22:04:23 2010 +++ /sys/src/cmd/tcs/conv.h Wed Apr 24 01:06:53 2013 @@ -19,6 +19,6 @@ void tune_out(Rune *base, int n, long *notused); #define emit(x) *(*r)++ = (x) -#define NRUNE 65536 +#define NRUNE (Runemax+1) extern long tab[]; /* common table indexed by Runes for reverse mappings */ --- /sys/src/cmd/tr.c Thu Feb 19 17:33:23 2009 +++ /sys/src/cmd/tr.c Wed Apr 24 01:15:08 2013 @@ -15,10 +15,8 @@ #define CLEARBIT(a,c) ((a)[(c)/8] &= ~bits[(c)&07]) #define BITSET(a,c) ((a)[(c)/8] & bits[(c)&07]) -#define MAXRUNE 0xFFFF - -uchar f[(MAXRUNE+1)/8]; -uchar t[(MAXRUNE+1)/8]; +uchar f[(Runemax+1)/8]; +uchar t[(Runemax+1)/8]; char wbuf[4096]; char *wptr; --- /sys/src/cmd/unicode.c Sun Dec 12 02:15:51 1999 +++ /sys/src/cmd/unicode.c Wed Apr 24 01:15:08 2013 @@ -51,13 +51,13 @@ return "bad range"; } min = strtoul(q, &q, 16); - if(min<0 || min>0xFFFF || *q!='-') + if(min<0 || min>Runemax || *q!='-') goto err; q++; if(strchr(hex, *q) == 0) goto err; max = strtoul(q, &q, 16); - if(max<0 || max>0xFFFF || maxRunemax || max0xFFFF || *q!=0) + if(m<0 || m>Runemax || *q!=0) goto err; Bprint(&bout, "%C", m); if(!text) --- /sys/src/cmd/unix/drawterm/libc/rune.c Fri Dec 30 00:56:08 2005 +++ /sys/src/cmd/unix/drawterm/libc/rune.c Wed Apr 24 01:14:58 2013 @@ -14,21 +14,26 @@ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ - Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ - Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ - Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0001 1111 1111 1111 1111 1111 */ Maskx = (1< T2 Tx + * 00080-007FF => T2 Tx */ c1 = *(uchar*)(str+1) ^ Tx; if(c1 & Testx) @@ -60,20 +65,42 @@ /* * three character sequence - * 0800-FFFF => T3 Tx Tx + * 00800-0FFFF => T3 Tx Tx */ c2 = *(uchar*)(str+2) ^ Tx; + if(c2 & Testx) goto bad; if(c < T4) { l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; if(l <= Rune2) goto bad; + if (SurrogateMin <= l && l <= SurrogateMax) + goto bad; *rune = l; return 3; } /* + * four character sequence + * 10000-10FFFF => T4 Tx Tx Tx + */ + if(UTFmax >= 4) { + c3 = *(uchar*)(str+3) ^ Tx; + if(c3 & Testx) + goto bad; + if(c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if(l <= Rune3) + goto bad; + if(l > Runemax) + goto bad; + *rune = l; + return 4; + } + } + + /* * bad decoding */ bad: @@ -105,15 +132,37 @@ str[1] = Tx | (c & Maskx); return 2; } + /* + * If the Rune is out of range or a surrogate half, convert it to the error rune. + * Do this test here because the error rune encodes to three bytes. + * Doing it earlier would duplicate work, since an out of range + * Rune wouldn't have fit in one or two bytes. + */ + if (c > Runemax) + c = Runeerror; + if (SurrogateMin <= c && c <= SurrogateMax) + c = Runeerror; /* * three character sequence * 0800-FFFF => T3 Tx Tx */ - str[0] = T3 | (c >> 2*Bitx); - str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); - return 3; + if (c <= Rune3) { + str[0] = T3 | (c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence (21-bit value) + * 10000-1FFFFF => T4 Tx Tx Tx + */ + str[0] = T4 | (c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; } int @@ -136,11 +185,12 @@ c = *r++; if(c <= Rune1) nb++; - else - if(c <= Rune2) + else if(c <= Rune2) nb += 2; + else if(c <= Rune3) + nb += 3; else - nb += 3; + nb += 4; } return nb; } @@ -149,14 +199,14 @@ fullrune(char *str, int n) { int c; - - if(n > 0) { - c = *(uchar*)str; - if(c < Tx) - return 1; - if(n > 1) - if(c < T3 || n > 2) - return 1; - } - return 0; + if(n <= 0) + return 0; + c = *(uchar*)str; + if(c < Tx) + return 1; + if(c < T3) + return n >= 2; + if(c < T4) + return n >= 3; + return n >= 4; } --- /sys/src/cmd/unix/u9fs/rune.c Sat Mar 2 19:05:53 2002 +++ /sys/src/cmd/unix/u9fs/rune.c Wed Apr 24 01:14:53 2013 @@ -14,21 +14,27 @@ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ + + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0001 1111 1111 1111 1111 1111 */ - Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ - Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ - Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ Maskx = (1< T2 Tx + * 00080-007FF => T2 Tx */ c1 = *(uchar*)(str+1) ^ Tx; if(c1 & Testx) @@ -60,20 +66,42 @@ /* * three character sequence - * 0800-FFFF => T3 Tx Tx + * 00800-0FFFF => T3 Tx Tx */ c2 = *(uchar*)(str+2) ^ Tx; + if(c2 & Testx) goto bad; if(c < T4) { l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; if(l <= Rune2) goto bad; + if (SurrogateMin <= l && l <= SurrogateMax) + goto bad; *rune = l; return 3; } /* + * four character sequence + * 10000-10FFFF => T4 Tx Tx Tx + */ + if(UTFmax >= 4) { + c3 = *(uchar*)(str+3) ^ Tx; + if(c3 & Testx) + goto bad; + if(c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if(l <= Rune3) + goto bad; + if(l > Runemax) + goto bad; + *rune = l; + return 4; + } + } + + /* * bad decoding */ bad: @@ -105,15 +133,37 @@ str[1] = Tx | (c & Maskx); return 2; } + /* + * If the Rune is out of range or a surrogate half, convert it to the error rune. + * Do this test here because the error rune encodes to three bytes. + * Doing it earlier would duplicate work, since an out of range + * Rune wouldn't have fit in one or two bytes. + */ + if (c > Runemax) + c = Runeerror; + if (SurrogateMin <= c && c <= SurrogateMax) + c = Runeerror; /* * three character sequence * 0800-FFFF => T3 Tx Tx */ - str[0] = T3 | (c >> 2*Bitx); - str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); - return 3; + if (c <= Rune3) { + str[0] = T3 | (c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence (21-bit value) + * 10000-1FFFFF => T4 Tx Tx Tx + */ + str[0] = T4 | (c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; } int --- /sys/src/cmd/vnc/devcons.c Sun Nov 20 04:14:48 2005 +++ /sys/src/cmd/vnc/devcons.c Wed Apr 24 01:14:49 2013 @@ -158,7 +158,7 @@ kbdputc(int ch) { int n; - char buf[3]; + char buf[UTFmax]; Rune r; r = ch; --- /sys/src/cmd/wc.c Fri Mar 16 05:42:44 2001 +++ /sys/src/cmd/wc.c Wed Apr 24 03:18:50 2013 @@ -1,26 +1,84 @@ /* - * wc -- count things in utf-encoded text files - * Bugs: - * The only white space characters recognized are ' ', '\t' and '\n', even though - * ISO 10646 has many more blanks scattered through it. - * Should count characters that cannot occur in any rune (hex f0-ff) separately. - * Should count non-canonical runes (e.g. hex c1,80 instead of hex 40). + * Count bytes within runes, if it fits in a uvlong, and other things. */ #include #include -#define NBUF (8*1024) -uvlong nline, tnline, pline; -uvlong nword, tnword, pword; -uvlong nrune, tnrune, prune; -uvlong nbadr, tnbadr, pbadr; -uvlong nchar, tnchar, pchar; -void count(int, char *); -void report(uvlong, uvlong, uvlong, uvlong, uvlong, char *); +#include + +/* flags, per-file counts, and total counts */ +static int pline, pword, prune, pbadr, pchar; +static uvlong nline, nword, nrune, nbadr, nchar; +static uvlong tnline, tnword, tnrune, tnbadr, tnchar; + +enum{Space, Word}; + +static void +wc(Biobuf *bin) +{ + int where; + long r; + + nline = 0; + nword = 0; + nrune = 0; + nbadr = 0; + where = Space; + while ((long)(r = Bgetrune(bin)) >= 0) { + nrune++; + if(r == Runeerror) { + nbadr++; + continue; + } + if(r == '\n') + nline++; + if(where == Word){ + if(isspacerune(r)) + where = Space; + }else + if(isspacerune(r) == 0){ + where = Word; + nword++; + } + } + nchar = Boffset(bin); + tnline += nline; + tnword += nword; + tnrune += nrune; + tnbadr += nbadr; + tnchar += nchar; +} + +static void +report(uvlong nline, uvlong nword, uvlong nrune, uvlong nbadr, uvlong nchar, char *fname) +{ + char line[1024], *s, *e; + + s = line; + e = line + sizeof line; + line[0] = 0; + if(pline) + s = seprint(s, e, " %7llud", nline); + if(pword) + s = seprint(s, e, " %7llud", nword); + if(prune) + s = seprint(s, e, " %7llud", nrune); + if(pbadr) + s = seprint(s, e, " %7llud", nbadr); + if(pchar) + s = seprint(s, e, " %7llud", nchar); + if(fname != nil) + seprint(s, e, " %s", fname); + print("%s\n", line+1); +} + void main(int argc, char *argv[]) { - char *status=""; - int i, f; + char *sts; + Biobuf sin, *bin; + int i; + + sts = nil; ARGBEGIN { case 'l': pline++; break; case 'w': pword++; break; @@ -31,279 +89,30 @@ fprint(2, "Usage: %s [-lwrbc] [file ...]\n", argv0); exits("usage"); } ARGEND - if(pline+pword+prune+pbadr+pchar == 0) { + if(pline+pword+prune+pbadr+pchar == 0){ pline = 1; pword = 1; pchar = 1; } - if(argc==0) - count(0, 0); - else{ - for(i=0;i1) report(tnline, tnword, tnrune, tnbadr, tnchar, "total"); } - exits(status); -} -void -report(uvlong nline, uvlong nword, uvlong nrune, uvlong nbadr, uvlong nchar, char *fname) -{ - char line[1024], word[128]; - line[0] = '\0'; - if(pline){ - sprint(word, " %7llud", nline); - strcat(line, word); - } - if(pword){ - sprint(word, " %7llud", nword); - strcat(line, word); - } - if(prune){ - sprint(word, " %7llud", nrune); - strcat(line, word); - } - if(pbadr){ - sprint(word, " %7llud", nbadr); - strcat(line, word); - } - if(pchar){ - sprint(word, " %7llud", nchar); - strcat(line, word); - } - if(fname){ - sprint(word, " %s", fname); - strcat(line, word); - } - print("%s\n", line+1); -} -/* - * How it works. Start in statesp. Each time we read a character, - * increment various counts, and do state transitions according to the - * following table. If we're not in statesp or statewd when done, the - * file ends with a partial rune. - * | character - * state |09,20| 0a |00-7f|80-bf|c0-df|e0-ef|f0-ff - * -------+-----+-----+-----+-----+-----+-----+----- - * statesp|ASP |ASPN |AWDW |AWDWX|AC2W |AC3W |AWDWX - * statewd|ASP |ASPN |AWD |AWDX |AC2 |AC3 |AWDX - * statec2|ASPX |ASPNX|AWDX |AWDR |AC2X |AC3X |AWDX - * statec3|ASPX |ASPNX|AWDX |AC2R |AC2X |AC3X |AWDX - */ -enum{ /* actions */ - AC2, /* enter statec2 */ - AC2R, /* enter statec2, don't count a rune */ - AC2W, /* enter statec2, count a word */ - AC2X, /* enter statec2, count a bad rune */ - AC3, /* enter statec3 */ - AC3W, /* enter statec3, count a word */ - AC3X, /* enter statec3, count a bad rune */ - ASP, /* enter statesp */ - ASPN, /* enter statesp, count a newline */ - ASPNX, /* enter statesp, count a newline, count a bad rune */ - ASPX, /* enter statesp, count a bad rune */ - AWD, /* enter statewd */ - AWDR, /* enter statewd, don't count a rune */ - AWDW, /* enter statewd, count a word */ - AWDWX, /* enter statewd, count a word, count a bad rune */ - AWDX, /* enter statewd, count a bad rune */ -}; -uchar statesp[256]={ /* looking for the start of a word */ -AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, /* 00-07 */ -AWDW, ASP, ASPN, AWDW, AWDW, AWDW, AWDW, AWDW, /* 08-0f */ -AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, /* 10-17 */ -AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, /* 18-1f */ -ASP, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, /* 20-27 */ -AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, /* 28-2f */ -AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, /* 30-37 */ -AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, /* 38-3f */ -AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, /* 40-47 */ -AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, /* 48-4f */ -AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, /* 50-57 */ -AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, /* 58-5f */ -AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, /* 60-67 */ -AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, /* 68-6f */ -AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, /* 70-77 */ -AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, AWDW, /* 78-7f */ -AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,/* 80-87 */ -AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,/* 88-8f */ -AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,/* 90-97 */ -AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,/* 98-9f */ -AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,/* a0-a7 */ -AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,/* a8-af */ -AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,/* b0-b7 */ -AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,/* b8-bf */ -AC2W, AC2W, AC2W, AC2W, AC2W, AC2W, AC2W, AC2W, /* c0-c7 */ -AC2W, AC2W, AC2W, AC2W, AC2W, AC2W, AC2W, AC2W, /* c8-cf */ -AC2W, AC2W, AC2W, AC2W, AC2W, AC2W, AC2W, AC2W, /* d0-d7 */ -AC2W, AC2W, AC2W, AC2W, AC2W, AC2W, AC2W, AC2W, /* d8-df */ -AC3W, AC3W, AC3W, AC3W, AC3W, AC3W, AC3W, AC3W, /* e0-e7 */ -AC3W, AC3W, AC3W, AC3W, AC3W, AC3W, AC3W, AC3W, /* e8-ef */ -AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,/* f0-f7 */ -AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,AWDWX,/* f8-ff */ -}; -uchar statewd[256]={ /* looking for the next character in a word */ -AWD, AWD, AWD, AWD, AWD, AWD, AWD, AWD, /* 00-07 */ -AWD, ASP, ASPN, AWD, AWD, AWD, AWD, AWD, /* 08-0f */ -AWD, AWD, AWD, AWD, AWD, AWD, AWD, AWD, /* 10-17 */ -AWD, AWD, AWD, AWD, AWD, AWD, AWD, AWD, /* 18-1f */ -ASP, AWD, AWD, AWD, AWD, AWD, AWD, AWD, /* 20-27 */ -AWD, AWD, AWD, AWD, AWD, AWD, AWD, AWD, /* 28-2f */ -AWD, AWD, AWD, AWD, AWD, AWD, AWD, AWD, /* 30-37 */ -AWD, AWD, AWD, AWD, AWD, AWD, AWD, AWD, /* 38-3f */ -AWD, AWD, AWD, AWD, AWD, AWD, AWD, AWD, /* 40-47 */ -AWD, AWD, AWD, AWD, AWD, AWD, AWD, AWD, /* 48-4f */ -AWD, AWD, AWD, AWD, AWD, AWD, AWD, AWD, /* 50-57 */ -AWD, AWD, AWD, AWD, AWD, AWD, AWD, AWD, /* 58-5f */ -AWD, AWD, AWD, AWD, AWD, AWD, AWD, AWD, /* 60-67 */ -AWD, AWD, AWD, AWD, AWD, AWD, AWD, AWD, /* 68-6f */ -AWD, AWD, AWD, AWD, AWD, AWD, AWD, AWD, /* 70-77 */ -AWD, AWD, AWD, AWD, AWD, AWD, AWD, AWD, /* 78-7f */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* 80-87 */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* 88-8f */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* 90-97 */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* 98-9f */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* a0-a7 */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* a8-af */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* b0-b7 */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* b8-bf */ -AC2, AC2, AC2, AC2, AC2, AC2, AC2, AC2, /* c0-c7 */ -AC2, AC2, AC2, AC2, AC2, AC2, AC2, AC2, /* c8-cf */ -AC2, AC2, AC2, AC2, AC2, AC2, AC2, AC2, /* d0-d7 */ -AC2, AC2, AC2, AC2, AC2, AC2, AC2, AC2, /* d8-df */ -AC3, AC3, AC3, AC3, AC3, AC3, AC3, AC3, /* e0-e7 */ -AC3, AC3, AC3, AC3, AC3, AC3, AC3, AC3, /* e8-ef */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* f0-f7 */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* f8-ff */ -}; -uchar statec2[256]={ /* looking for 10xxxxxx to complete a rune */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* 00-07 */ -AWDX, ASPX, ASPNX,AWDX, AWDX, AWDX, AWDX, AWDX, /* 08-0f */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* 10-17 */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* 18-1f */ -ASPX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* 20-27 */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* 28-2f */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* 30-37 */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* 38-3f */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* 40-47 */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* 48-4f */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* 50-57 */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* 58-5f */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* 60-67 */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* 68-6f */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* 70-77 */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* 78-7f */ -AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, /* 80-87 */ -AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, /* 88-8f */ -AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, /* 90-97 */ -AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, /* 98-9f */ -AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, /* a0-a7 */ -AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, /* a8-af */ -AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, /* b0-b7 */ -AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, AWDR, /* b8-bf */ -AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, /* c0-c7 */ -AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, /* c8-cf */ -AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, /* d0-d7 */ -AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, /* d8-df */ -AC3X, AC3X, AC3X, AC3X, AC3X, AC3X, AC3X, AC3X, /* e0-e7 */ -AC3X, AC3X, AC3X, AC3X, AC3X, AC3X, AC3X, AC3X, /* e8-ef */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* f0-f7 */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* f8-ff */ -}; -uchar statec3[256]={ /* looking for 10xxxxxx,10xxxxxx to complete a rune */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* 00-07 */ -AWDX, ASPX, ASPNX,AWDX, AWDX, AWDX, AWDX, AWDX, /* 08-0f */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* 10-17 */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* 18-1f */ -ASPX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* 20-27 */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* 28-2f */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* 30-37 */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* 38-3f */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* 40-47 */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* 48-4f */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* 50-57 */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* 58-5f */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* 60-67 */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* 68-6f */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* 70-77 */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* 78-7f */ -AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, /* 80-87 */ -AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, /* 88-8f */ -AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, /* 90-97 */ -AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, /* 98-9f */ -AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, /* a0-a7 */ -AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, /* a8-af */ -AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, /* b0-b7 */ -AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, AC2R, /* b8-bf */ -AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, /* c0-c7 */ -AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, /* c8-cf */ -AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, /* d0-d7 */ -AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, AC2X, /* d8-df */ -AC3X, AC3X, AC3X, AC3X, AC3X, AC3X, AC3X, AC3X, /* e0-e7 */ -AC3X, AC3X, AC3X, AC3X, AC3X, AC3X, AC3X, AC3X, /* e8-ef */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* f0-f7 */ -AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, AWDX, /* f8-ff */ -}; -void -count(int f, char *name) -{ - int n; - uchar buf[NBUF]; - uchar *bufp, *ebuf; - uchar *state=statesp; - - nline = 0; - nword = 0; - nrune = 0; - nbadr = 0; - nchar = 0; - - for(;;){ - n=read(f, buf, NBUF); - if(n<=0) - break; - nchar+=n; - nrune+=n; /* might be too large, gets decreased later */ - bufp=buf; - ebuf=buf+n; - do{ - switch(state[*bufp]){ - case AC2: state=statec2; break; - case AC2R: state=statec2; --nrune; break; - case AC2W: state=statec2; nword++; break; - case AC2X: state=statec2; nbadr++; break; - case AC3: state=statec3; break; - case AC3W: state=statec3; nword++; break; - case AC3X: state=statec3; nbadr++; break; - case ASP: state=statesp; break; - case ASPN: state=statesp; nline++; break; - case ASPNX: state=statesp; nline++; nbadr++; break; - case ASPX: state=statesp; nbadr++; break; - case AWD: state=statewd; break; - case AWDR: state=statewd; --nrune; break; - case AWDW: state=statewd; nword++; break; - case AWDWX: state=statewd; nword++; nbadr++; break; - case AWDX: state=statewd; nbadr++; break; - } - }while(++bufp!=ebuf); - } - if(state!=statesp && state!=statewd) - nbadr++; - if(n<0) - perror(name); - report(nline, nword, nrune, nbadr, nchar, name); + exits(sts); }