# HG changeset patch # User Erik Quanstrom # Date 1331050110 -3600 # Node ID a170d9d2280834790e621c1ed0ae350c450af3b3 # Parent 6fe89e1c4d071b7c1d975d1cbf4d94ba7c217b70 utf: use UTFmax, Runemax, Rune rather than 3, 0xffff ushort almost all of this is just cleanup, using UTFmax, Runemax, Rune rather than 3, 0xffff ushort there are a few cases (/sys/src/libc/port/rune.c and friends) where the possiblity of 20-bit runes (in a uint) is considered. should we move to 20-bit runes, we would be compatable with go, p9p. change: cc/cc.h incorporate charles' comment change: add libdraw/buildfont.c (oversite) change: correct p?swt.c to align properly (charles' comment) R=nixiedev, ality, charles.forsyth, 0intro, nemo CC=nix-dev http://codereview.appspot.com/5683071 Committer: Francisco J Ballesteros diff -r 6fe89e1c4d07 -r a170d9d22808 sys/include/ape/utf.h --- a/sys/include/ape/utf.h Wed Mar 07 15:41:27 2012 +0000 +++ b/sys/include/ape/utf.h Tue Mar 06 17:08:30 2012 +0100 @@ -14,7 +14,8 @@ UTFmax = 3, /* maximum bytes per rune */ Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ Runeself = 0x80, /* rune and UTF sequences are the same (<) */ - Runeerror = 0x80, /* decoding error in UTF */ + Runeerror = 0xFFFD, /* decoding error in UTF */ + Runemax = 0xFFFF, /* 16 bit rune */ }; /* diff -r 6fe89e1c4d07 -r a170d9d22808 sys/include/libc.h --- a/sys/include/libc.h Wed Mar 07 15:41:27 2012 +0000 +++ b/sys/include/libc.h Tue Mar 06 17:08:30 2012 +0100 @@ -45,6 +45,7 @@ Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ Runeself = 0x80, /* rune and UTF sequences are the same (<) */ Runeerror = 0xFFFD, /* decoding error in UTF */ + Runemax = 0xFFFF, /* 16 bit rune */ }; /* diff -r 6fe89e1c4d07 -r a170d9d22808 sys/src/ape/lib/ap/gen/mbwc.c --- a/sys/src/ape/lib/ap/gen/mbwc.c Wed Mar 07 15:41:27 2012 +0000 +++ b/sys/src/ape/lib/ap/gen/mbwc.c Tue Mar 06 17:08:30 2012 +0100 @@ -1,4 +1,5 @@ #include +#include /* * Use the FSS-UTF transformation proposed by posix. @@ -7,12 +8,14 @@ * Tx 10xxxxxx 6 free bits * T1 110xxxxx 5 free bits * T2 1110xxxx 4 free bits + * T3 11110xxx 3 free bits * * Encoding is as follows. * From hex Thru hex Sequence Bits - * 00000000 0000007F T0 7 - * 00000080 000007FF T1 Tx 11 - * 00000800 0000FFFF T2 Tx Tx 16 + * 00000000 0000007F T1 7 + * 00000080 000007FF T2 Tx 11 + * 00000800 0000FFFF T3 Tx Tx 16 + * 00010000 0010FFFF T4 Tx Tx Tx 20 (and change) */ int @@ -25,7 +28,7 @@ int mbtowc(wchar_t *pwc, const char *s, size_t n) { - int c, c1, c2; + int c, c1, c2, c3; long l; if(!s) @@ -70,6 +73,24 @@ return 3; } + if(n < 4) + goto bad; + if(UTFmax >= 4) { + c3 = (s[3] ^ 0x80) & 0xff; + if(c3 & 0xC0) + goto bad; + if(c < 0xf8) { + l = ((((((c << 6) | c1) << 6) | c2) << 6) | c3) & 0x3fffff; + if(l <= 0x10000) + goto bad; + if(l > Runemax) + goto bad; + if(pwc) + *pwc = l; + return 4; + } + } + /* * bad decoding */ @@ -86,7 +107,9 @@ if(!s) return 0; - c = wchar & 0xFFFF; + c = wchar; + if(c > Runemax) + c = Runeerror; if(c < 0x80) { s[0] = c; return 1; @@ -98,10 +121,17 @@ return 2; } - s[0] = 0xE0 | (c >> 12); - s[1] = 0x80 | ((c >> 6) & 0x3F); - s[2] = 0x80 | (c & 0x3F); - return 3; + if(c < 0x10000){ + s[0] = 0xE0 | (c >> 12); + s[1] = 0x80 | ((c >> 6) & 0x3F); + s[2] = 0x80 | (c & 0x3F); + return 3; + } + s[0] = 0xf0 | c >> 18; + s[1] = 0x80 | (c >> 12) & 0x3F; + s[2] = 0x80 | (c >> 6) & 0x3F; + s[3] = 0x80 | (c & 0x3F); + return 4; } size_t @@ -117,7 +147,7 @@ break; s++; } else { - d = mbtowc(pwcs, s, 3); + d = mbtowc(pwcs, s, UTFmax); if(d <= 0) return (size_t)((d<0) ? -1 : i); s += d; @@ -133,10 +163,10 @@ int i, d; long c; char *p, *pe; - char buf[3]; + char buf[UTFmax]; p = s; - pe = p+n-3; + pe = p+n-UTFmax; while(p < pe) { c = *pwcs++; if(c < 0x80) @@ -146,20 +176,16 @@ if(c == 0) return p-s; } - while(p < pe+3) { + while(p < pe+UTFmax) { c = *pwcs++; d = wctomb(buf, c); - if(p+d <= pe+3) { - *p++ = buf[0]; - if(d > 1) { - *p++ = buf[2]; - if(d > 2) - *p++ = buf[3]; - } + if(p+d <= pe+UTFmax) { + for(i = 0; i < d; i++) + p[i] = buf[i]; + p += d; } if(c == 0) break; } return p-s; } - diff -r 6fe89e1c4d07 -r a170d9d22808 sys/src/ape/lib/utf/rune.c --- a/sys/src/ape/lib/utf/rune.c Wed Mar 07 15:41:27 2012 +0000 +++ b/sys/src/ape/lib/utf/rune.c Tue Mar 06 17:08:30 2012 +0100 @@ -23,16 +23,19 @@ Bit2 = 5, Bit3 = 4, Bit4 = 3, + Bit5 = 2, T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ - Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ - Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ - Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0011 1111 1111 1111 1111 1111 */ Maskx = (1< T4 Tx Tx Tx + */ + if(UTFmax >= 4) { + c3 = *(uchar*)(str+3) ^ Tx; + if(c3 & Testx) + goto bad; + if(c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if(l <= Rune3) + goto bad; + if(l > Runemax) + goto bad; + *rune = l; + return 4; + } + } + + /* * bad decoding */ bad: @@ -113,7 +135,7 @@ /* * two character sequence - * 0080-07FF => T2 Tx + * 00080-007FF => T2 Tx */ if(c <= Rune2) { str[0] = T2 | (c >> 1*Bitx); @@ -123,12 +145,26 @@ /* * three character sequence - * 0800-FFFF => T3 Tx Tx + * 00800-0FFFF => T3 Tx Tx */ - str[0] = T3 | (c >> 2*Bitx); - str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); - return 3; + if(c > Runemax) + c = Runeerror; + if(c <= Rune3) { + str[0] = T3 | (c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence + * 010000-1FFFFF => T4 Tx Tx Tx + */ + str[0] = T4 | (c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; } int @@ -155,7 +191,10 @@ if(c <= Rune2) nb += 2; else + if(c <= Rune3 || c > Runemax) nb += 3; + else + nb += 4; } return nb; } @@ -165,13 +204,14 @@ { int c; - if(n > 0) { - c = *(uchar*)str; - if(c < Tx) - return 1; - if(n > 1) - if(c < T3 || n > 2) - return 1; - } - return 0; + if(n <= 0) + return 0; + c = *(uchar*)str; + if(c < Tx) + return 1; + if(c < T3) + return n >= 2; + if(UTFmax == 3 || c < T4) + return n >= 3; + return n >= 4; } diff -r 6fe89e1c4d07 -r a170d9d22808 sys/src/boot/alphapc/lib.h --- a/sys/src/boot/alphapc/lib.h Wed Mar 07 15:41:27 2012 +0000 +++ b/sys/src/boot/alphapc/lib.h Tue Mar 06 17:08:30 2012 +0100 @@ -27,9 +27,10 @@ enum { UTFmax = 3, /* maximum bytes per rune */ - Runesync = 0x80, /* cannot represent part of a UTF sequence */ - Runeself = 0x80, /* rune and UTF sequences are the same (<) */ - Runeerror = 0x80, /* decoding error in UTF */ + Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ + Runeself = 0x80, /* rune and UTF sequences are the same (<) */ + Runeerror = 0xFFFD, /* decoding error in UTF */ + Runemax = 0xFFFF, /* 16 bit rune */ }; /* diff -r 6fe89e1c4d07 -r a170d9d22808 sys/src/cmd/1c/swt.c --- a/sys/src/cmd/1c/swt.c Wed Mar 07 15:41:27 2012 +0000 +++ b/sys/src/cmd/1c/swt.c Tue Mar 06 17:08:30 2012 +0100 @@ -244,26 +244,28 @@ } long -outlstring(ushort *s, long n) +outlstring(Rune *s, long n) { - char buf[2]; - int c; + char buf[UTFmax]; + int c, i; long r; - while(nstring & 1) +// if(suppress) +// return nstring; + while(nstring & (sizeof(Rune)-1)) outstring("", 1); r = nstring; while(n > 0) { c = *s++; if(align(0, types[TCHAR], Aarg1)) { - buf[0] = c>>8; - buf[1] = c; + for(i = 0; i < sizeof(Rune); i++) + buf[i] = c>>8*(sizeof(Rune) - i - 1); } else { - buf[0] = c; - buf[1] = c>>8; + for(i = 0; i < sizeof(Rune); i++) + buf[i] = c>>8*i; } - outstring(buf, 2); - n -= sizeof(ushort); + outstring(buf, sizeof(Rune)); + n -= sizeof(Rune); } return r; } diff -r 6fe89e1c4d07 -r a170d9d22808 sys/src/cmd/2c/swt.c --- a/sys/src/cmd/2c/swt.c Wed Mar 07 15:41:27 2012 +0000 +++ b/sys/src/cmd/2c/swt.c Tue Mar 06 17:08:30 2012 +0100 @@ -324,26 +324,28 @@ } long -outlstring(ushort *s, long n) +outlstring(Rune *s, long n) { - char buf[2]; - int c; + char buf[UTFmax]; + int c, i; long r; - while(nstring & 1) +// if(suppress) +// return nstring; + while(nstring & (sizeof(Rune)-1)) outstring("", 1); r = nstring; while(n > 0) { c = *s++; if(align(0, types[TCHAR], Aarg1)) { - buf[0] = c>>8; - buf[1] = c; + for(i = 0; i < sizeof(Rune); i++) + buf[i] = c>>8*(sizeof(Rune) - i - 1); } else { - buf[0] = c; - buf[1] = c>>8; + for(i = 0; i < sizeof(Rune); i++) + buf[i] = c>>8*i; } - outstring(buf, 2); - n -= sizeof(ushort); + outstring(buf, sizeof(Rune)); + n -= sizeof(Rune); } return r; } diff -r 6fe89e1c4d07 -r a170d9d22808 sys/src/cmd/cc/cc.h --- a/sys/src/cmd/cc/cc.h Wed Mar 07 15:41:27 2012 +0000 +++ b/sys/src/cmd/cc/cc.h Tue Mar 06 17:08:30 2012 +0100 @@ -51,7 +51,7 @@ double fconst; /* fp constant */ vlong vconst; /* non fp const */ char* cstring; /* character string */ - ushort* rstring; /* rune string */ + Rune* rstring; /* rune string */ Sym* sym; Type* type; @@ -336,6 +336,12 @@ TFILE, TOLD, NALLTYPES, + + /* + * bootstrapping + */ +// TRUNE = TUINT, + TRUNE = sizeof(Rune)==4? TUINT: TUSHORT, }; enum { @@ -739,7 +745,7 @@ void gextern(Sym*, Node*, long, long); void ginit(void); long outstring(char*, long); -long outlstring(ushort*, long); +long outlstring(Rune*, long); void sextern(Sym*, Node*, long, long); void xcom(Node*); long exreg(Type*); diff -r 6fe89e1c4d07 -r a170d9d22808 sys/src/cmd/cc/cc.y --- a/sys/src/cmd/cc/cc.y Wed Mar 07 15:41:27 2012 +0000 +++ b/sys/src/cmd/cc/cc.y Tue Mar 06 17:08:30 2012 +0100 @@ -855,9 +855,9 @@ LLSTRING { $$ = new(OLSTRING, Z, Z); - $$->type = typ(TARRAY, types[TUSHORT]); - $$->type->width = $1.l + sizeof(ushort); - $$->rstring = (ushort*)$1.s; + $$->type = typ(TARRAY, types[TRUNE]); + $$->type->width = $1.l + sizeof(Rune); + $$->rstring = (Rune*)$1.s; $$->sym = symstring; $$->etype = TARRAY; $$->class = CSTATIC; @@ -867,16 +867,16 @@ char *s; int n; - n = $1->type->width - sizeof(ushort); + n = $1->type->width - sizeof(Rune); s = alloc(n+$2.l+MAXALIGN); memcpy(s, $1->rstring, n); memcpy(s+n, $2.s, $2.l); - *(ushort*)(s+n+$2.l) = 0; + *(Rune*)(s+n+$2.l) = 0; $$ = $1; $$->type->width += $2.l; - $$->rstring = (ushort*)s; + $$->rstring = (Rune*)s; } zelist: diff -r 6fe89e1c4d07 -r a170d9d22808 sys/src/cmd/cc/com.c --- a/sys/src/cmd/cc/com.c Wed Mar 07 15:41:27 2012 +0000 +++ b/sys/src/cmd/cc/com.c Tue Mar 06 17:08:30 2012 +0100 @@ -633,10 +633,12 @@ break; case OLSTRING: - if(n->type->link != types[TUSHORT]) { + if(n->type->link != types[TRUNE]) { o = outstring(0, 0); while(o & 3) { - outlstring(L"", sizeof(ushort)); + // outlstring(L"", sizeof(Rune)); + uint str[1] = {0}; + outlstring(str, sizeof(Rune)); o = outlstring(0, 0); } } diff -r 6fe89e1c4d07 -r a170d9d22808 sys/src/cmd/cc/lex.c --- a/sys/src/cmd/cc/lex.c Wed Mar 07 15:41:27 2012 +0000 +++ b/sys/src/cmd/cc/lex.c Tue Mar 06 17:08:30 2012 +0100 @@ -469,7 +469,7 @@ yyerror("missing '"); peekc = c1; } - yylval.vval = convvtox(c, TUSHORT); + yylval.vval = convvtox(c, TRUNE); return LUCONST; } if(c == '"') { @@ -543,15 +543,15 @@ c = escchar('"', 1, 0); if(c == EOF) break; - cp = allocn(cp, c1, sizeof(ushort)); - *(ushort*)(cp + c1) = c; - c1 += sizeof(ushort); + cp = allocn(cp, c1, sizeof(Rune)); + *(Rune*)(cp + c1) = c; + c1 += sizeof(Rune); } yylval.sval.l = c1; do { - cp = allocn(cp, c1, sizeof(ushort)); - *(ushort*)(cp + c1) = 0; - c1 += sizeof(ushort); + cp = allocn(cp, c1, sizeof(Rune)); + *(Rune*)(cp + c1) = 0; + c1 += sizeof(Rune); } while(c1 & MAXALIGN); yylval.sval.s = cp; return LLSTRING; @@ -1029,7 +1029,7 @@ } else c = GETC(); for(;;) { - if(!isspace(c)) + if(c >= Runeself || !isspace(c)) return c; if(c == '\n') { lineno++; diff -r 6fe89e1c4d07 -r a170d9d22808 sys/src/cmd/cc/macbody --- a/sys/src/cmd/cc/macbody Wed Mar 07 15:41:27 2012 +0000 +++ b/sys/src/cmd/cc/macbody Tue Mar 06 17:08:30 2012 +0100 @@ -18,22 +18,39 @@ return n; } -Sym* -getsym(void) +static void +nextsym(int c) { - int c; + int c1; char *cp; +<<<<<<< local +======= c = getnsc(); if(!isalpha(c) && c != '_' && c < Runeself) { unget(c); return S; } +>>>>>>> other for(cp = symb;;) { - if(cp <= symb+NSYMB-4) - *cp++ = c; + if(c >= Runeself) { + for(c1=0;;) { + if(cp <= symb+NSYMB-4) + cp[c1++] = c; + if(fullrune(cp, c1)) + break; + c = getc(); + } + cp += c1; + }else + if(cp <= symb+NSYMB-4) + *cp++ = c; c = getc(); +<<<<<<< local + if(c >= Runeself || isalnum(c) || c == '_') +======= if(isalnum(c) || c == '_' || c >= Runeself) +>>>>>>> other continue; unget(c); break; @@ -41,6 +58,19 @@ *cp = 0; if(cp > symb+NSYMB-4) yyerror("symbol too large: %s", symb); +} + +Sym* +getsym(void) +{ + int c; + + c = getnsc(); + if(c < Runeself && !isalpha(c) && c != '_') { + unget(c); + return S; + } + nextsym(c); return lookup(); } @@ -193,7 +223,7 @@ macdef(void) { Sym *s, *a; - char *args[NARG], *np, *base; + char *args[NARG], *base; int n, i, c, len, dots; int ischr; @@ -235,15 +265,9 @@ len = 1; ischr = 0; for(;;) { - if(isalpha(c) || c == '_') { - np = symb; - *np++ = c; + if(c >= Runeself || isalpha(c) || c == '_') { + nextsym(c); c = getc(); - while(isalnum(c) || c == '_') { - *np++ = c; - c = getc(); - } - *np = 0; for(i=0; iname); break; } diff -r 6fe89e1c4d07 -r a170d9d22808 sys/src/cmd/cc/pswt.c --- a/sys/src/cmd/cc/pswt.c Wed Mar 07 15:41:27 2012 +0000 +++ b/sys/src/cmd/cc/pswt.c Tue Mar 06 17:08:30 2012 +0100 @@ -78,28 +78,28 @@ } long -outlstring(ushort *s, long n) +outlstring(Rune *s, long n) { - char buf[2]; - int c; + char buf[UTFmax]; + int c, i; long r; if(suppress) return nstring; - while(nstring & 1) + while(nstring & (sizeof(Rune)-1)) outstring("", 1); r = nstring; while(n > 0) { c = *s++; if(align(0, types[TCHAR], Aarg1)) { - buf[0] = c>>8; - buf[1] = c; + for(i = 0; i < sizeof(Rune); i++) + buf[i] = c>>8*(sizeof(Rune) - i - 1); } else { - buf[0] = c; - buf[1] = c>>8; + for(i = 0; i < sizeof(Rune); i++) + buf[i] = c>>8*i; } - outstring(buf, 2); - n -= sizeof(ushort); + outstring(buf, sizeof(Rune)); + n -= sizeof(Rune); } return r; } diff -r 6fe89e1c4d07 -r a170d9d22808 sys/src/cmd/db/input.c --- a/sys/src/cmd/db/input.c Wed Mar 07 15:41:27 2012 +0000 +++ b/sys/src/cmd/db/input.c Tue Mar 06 17:08:30 2012 +0100 @@ -46,12 +46,13 @@ int readrune(int fd, Rune *r) { - char buf[UTFmax]; + char buf[UTFmax+1]; int i; for(i=0; iname; + for(i = 0; i < 8; i++){ + c = p[i]; + if(c == 0) + break; + if(c & 0x80 || c < ' ') + c = '-'; + buf[j++] = c; + } + buf[j++] = '.'; + p = (char*)d->ext; + for(i = 0; i < 3; i++){ + c = p[i]; + if(c == 0) + break; + if(c & 0x80 || c < ' ') + c = '-'; + buf[j++] = c; + } + buf[j] = 0; + return seprint(s, e, "\"%s\" ", buf); +} + void dirdump(void *vdbuf) { @@ -1874,7 +1904,8 @@ name = getnamerunes(name, dbuf, 1); seprint(buf, ebuf, "\"%s\" %2.2x %2.2ux %2.2ux %d", name, dbuf[0], dbuf[12], dbuf[13], GSHORT(d->start)); }else{ - s = seprint(buf, ebuf, "\"%.8s.%.3s\" ", (char*)d->name, (char*)d->ext); +// s = seprint(buf, ebuf, "\"%.8s.%.3s\" ", (char*)d->name, (char*)d->ext); + s = sanitize(buf, ebuf, d); for(i=7; i>=0; i--) *s++ = d->attr&(1<= NBLK) { lastc = '\n'; error(T); @@ -1238,7 +1238,7 @@ if(c == '\\') { c = getchr(); *p++ = ESCFLG; - if(p >= &rhsbuf[LBSIZE/2]) + if(p >= &rhsbuf[LBSIZE/sizeof(Rune)]) error(Q); } else if(c == '\n' && (!globp || !globp[0])) { @@ -1249,7 +1249,7 @@ if(c == seof) break; *p++ = c; - if(p >= &rhsbuf[LBSIZE/2]) + if(p >= &rhsbuf[LBSIZE/sizeof(Rune)]) error(Q); } *p = 0; diff -r 6fe89e1c4d07 -r a170d9d22808 sys/src/cmd/eqn/text.c --- a/sys/src/cmd/eqn/text.c Wed Mar 07 15:41:27 2012 +0000 +++ b/sys/src/cmd/eqn/text.c Tue Mar 06 17:08:30 2012 +0100 @@ -1,6 +1,7 @@ #include "e.h" #include "y.tab.h" #include +#include #define CSSIZE 1000 char cs[CSSIZE+20]; /* text string converted into this */ @@ -42,14 +43,14 @@ wchar_t r; int w; - w = mbtowc(&r, psp, 3); + w = mbtowc(&r, psp, UTFmax); if(w == 0){ psp++; return 0; } if(w < 0){ psp += 1; - return 0x80; /* Plan 9-ism */ + return Runeerror; /* Plan 9-ism */ } psp += w; return r; @@ -112,17 +113,13 @@ printf(".ds %d \"%s\n", yyval, p); } -int isalpharune(int c) -{ - return ('a'<=c && c<='z') || ('A'<=c && c<='Z'); -} - int isdigitrune(int c) { return ('0'<=c && c<='9'); } -trans(int c, char *p1) +int +trans(int c, char *) { int f; diff -r 6fe89e1c4d07 -r a170d9d22808 sys/src/cmd/htmlroff/char.c --- a/sys/src/cmd/htmlroff/char.c Wed Mar 07 15:41:27 2012 +0000 +++ b/sys/src/cmd/htmlroff/char.c Tue Mar 06 17:08:30 2012 +0100 @@ -1,6 +1,10 @@ #include "a.h" /* + * hopeless if runes are not 16 bits + */ + +/* * Translate Unicode to HTML by asking tcs(1). * This way we don't have yet another table. */ diff -r 6fe89e1c4d07 -r a170d9d22808 sys/src/cmd/sam/cmd.c --- a/sys/src/cmd/sam/cmd.c Wed Mar 07 15:41:27 2012 +0000 +++ b/sys/src/cmd/sam/cmd.c Tue Mar 06 17:08:30 2012 +0100 @@ -71,7 +71,7 @@ inputc(void) { int n, nbuf; - char buf[3]; + char buf[UTFmax]; Rune r; Again: diff -r 6fe89e1c4d07 -r a170d9d22808 sys/src/cmd/sam/regexp.c --- a/sys/src/cmd/sam/regexp.c Wed Mar 07 15:41:27 2012 +0000 +++ b/sys/src/cmd/sam/regexp.c Tue Mar 06 17:08:30 2012 +0100 @@ -9,7 +9,7 @@ struct Inst { - long type; /* < 0x10000 ==> literal, otherwise action */ + long type; /* < 0x1000000 ==> literal, otherwise action */ union { int rsid; int rsubid; @@ -53,28 +53,28 @@ /* * Actions and Tokens * - * 0x100xx are operators, value == precedence - * 0x200xx are tokens, i.e. operands for operators + * 0x10000xx are operators, value == precedence + * 0x20000xx are tokens, i.e. operands for operators */ -#define OPERATOR 0x10000 /* Bitmask of all operators */ -#define START 0x10000 /* Start, used for marker on stack */ -#define RBRA 0x10001 /* Right bracket, ) */ -#define LBRA 0x10002 /* Left bracket, ( */ -#define OR 0x10003 /* Alternation, | */ -#define CAT 0x10004 /* Concatentation, implicit operator */ -#define STAR 0x10005 /* Closure, * */ -#define PLUS 0x10006 /* a+ == aa* */ -#define QUEST 0x10007 /* a? == a|nothing, i.e. 0 or 1 a's */ -#define ANY 0x20000 /* Any character but newline, . */ -#define NOP 0x20001 /* No operation, internal use only */ -#define BOL 0x20002 /* Beginning of line, ^ */ -#define EOL 0x20003 /* End of line, $ */ -#define CCLASS 0x20004 /* Character class, [] */ -#define NCCLASS 0x20005 /* Negated character class, [^] */ -#define END 0x20077 /* Terminate: match found */ +#define OPERATOR 0x1000000 /* Bitmask of all operators */ +#define START 0x1000000 /* Start, used for marker on stack */ +#define RBRA 0x1000001 /* Right bracket, ) */ +#define LBRA 0x1000002 /* Left bracket, ( */ +#define OR 0x1000003 /* Alternation, | */ +#define CAT 0x1000004 /* Concatentation, implicit operator */ +#define STAR 0x1000005 /* Closure, * */ +#define PLUS 0x1000006 /* a+ == aa* */ +#define QUEST 0x1000007 /* a? == a|nothing, i.e. 0 or 1 a's */ +#define ANY 0x2000000 /* Any character but newline, . */ +#define NOP 0x2000001 /* No operation, internal use only */ +#define BOL 0x2000002 /* Beginning of line, ^ */ +#define EOL 0x2000003 /* End of line, $ */ +#define CCLASS 0x2000004 /* Character class, [] */ +#define NCCLASS 0x2000005 /* Negated character class, [^] */ +#define END 0x2000077 /* Terminate: match found */ -#define ISATOR 0x10000 -#define ISAND 0x20000 +#define ISATOR 0x1000000 +#define ISAND 0x2000000 /* * Parser Information @@ -459,7 +459,7 @@ exprp++; return '\n'; } - return *exprp++|0x10000; + return *exprp++|0x1000000; } return *exprp++; } @@ -494,7 +494,7 @@ exprp++; /* eat '-' */ if((c2 = nextrec()) == ']') goto Error; - classp[n+0] = 0xFFFF; + classp[n+0] = Runemax; classp[n+1] = c1; classp[n+2] = c2; n += 3; @@ -516,7 +516,7 @@ p = class[classno]; while(*p){ - if(*p == 0xFFFF){ + if(*p == Runemax){ if(p[1]<=c && c<=p[2]) return !negate; p += 3; diff -r 6fe89e1c4d07 -r a170d9d22808 sys/src/cmd/sed.c --- a/sys/src/cmd/sed.c Wed Mar 07 15:41:27 2012 +0000 +++ b/sys/src/cmd/sed.c Tue Mar 06 17:08:30 2012 +0100 @@ -623,7 +623,7 @@ while ((r = *cp++) != '\0') { if(r == '\\') { if (rhs < end) - *rhs++ = 0xFFFF; + *rhs++ = Runemax; else return 0; r = *cp++; diff -r 6fe89e1c4d07 -r a170d9d22808 sys/src/cmd/tcs/plan9.h --- a/sys/src/cmd/tcs/plan9.h Wed Mar 07 15:41:27 2012 +0000 +++ b/sys/src/cmd/tcs/plan9.h Tue Mar 06 17:08:30 2012 +0100 @@ -1,6 +1,6 @@ typedef unsigned short Rune; /* 16 bits */ typedef unsigned char uchar; -#define Runeerror 0x80 /* decoding error in UTF */ +#define Runeerror 0xFFFD /* decoding error in UTF */ #define Runeself 0x80 /* rune and UTF sequences are the same (<) */ #define UTFmax 6 /* maximum bytes per rune */ diff -r 6fe89e1c4d07 -r a170d9d22808 sys/src/cmd/tr.c --- a/sys/src/cmd/tr.c Wed Mar 07 15:41:27 2012 +0000 +++ b/sys/src/cmd/tr.c Tue Mar 06 17:08:30 2012 +0100 @@ -15,13 +15,15 @@ #define CLEARBIT(a,c) ((a)[(c)/8] &= ~bits[(c)&07]) #define BITSET(a,c) ((a)[(c)/8] & bits[(c)&07]) -#define MAXRUNE 0xFFFF - -uchar f[(MAXRUNE+1)/8]; -uchar t[(MAXRUNE+1)/8]; +uchar f[(Runemax+1)/8]; +uchar t[(Runemax+1)/8]; char wbuf[4096]; char *wptr; +enum { + Lastc = Runemax + 1, +}; + Pcb pfrom, pto; int cflag; @@ -94,7 +96,7 @@ SETBIT(t, c); } - last = 0x10000; + last = Lastc; while (readrune(0, &c) > 0) { if(!BITSET(f, c) && (c != last || !BITSET(t,c))) { last = c; @@ -134,7 +136,7 @@ else p[i] = i; } if (sflag){ - lastc = 0x10000; + lastc = Lastc; while (readrune(0, &from) > 0) { if (from > high) from = to; @@ -188,7 +190,7 @@ SETBIT(t,to); } if (sflag){ - lastc = 0x10000; + lastc = Lastc; while (readrune(0, &from) > 0) { if (from <= high) from = p[from]; @@ -276,7 +278,7 @@ n = 0; if (*s == 'x') { s++; - for (i = 0; i < 4; i++) { + for (i = 0; i < 6; i++) { save = s; s += chartorune(&r, s); if ('0' <= r && r <= '9') @@ -291,6 +293,8 @@ else *rp = n; return save; } + if(n > Runemax) + sysfatal("character > Runemax"); } } else { for(i = 0; i < 3; i++) { diff -r 6fe89e1c4d07 -r a170d9d22808 sys/src/cmd/unicode.c --- a/sys/src/cmd/unicode.c Wed Mar 07 15:41:27 2012 +0000 +++ b/sys/src/cmd/unicode.c Tue Mar 06 17:08:30 2012 +0100 @@ -51,13 +51,13 @@ return "bad range"; } min = strtoul(q, &q, 16); - if(min<0 || min>0xFFFF || *q!='-') + if(min<0 || min>Runemax || *q!='-') goto err; q++; if(strchr(hex, *q) == 0) goto err; max = strtoul(q, &q, 16); - if(max<0 || max>0xFFFF || maxRunemax || max0xFFFF || *q!=0) + if(m<0 || m>Runemax || *q!=0) goto err; Bprint(&bout, "%C", m); if(!text) diff -r 6fe89e1c4d07 -r a170d9d22808 sys/src/cmd/unix/drawterm/libc/rune.c --- a/sys/src/cmd/unix/drawterm/libc/rune.c Wed Mar 07 15:41:27 2012 +0000 +++ b/sys/src/cmd/unix/drawterm/libc/rune.c Tue Mar 06 17:08:30 2012 +0100 @@ -8,16 +8,19 @@ Bit2 = 5, Bit3 = 4, Bit4 = 3, + Bit5 = 2, T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ - Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ - Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ - Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0011 1111 1111 1111 1111 1111 */ Maskx = (1< T4 Tx Tx Tx + */ + if(UTFmax >= 4) { + c3 = *(uchar*)(str+3) ^ Tx; + if(c3 & Testx) + goto bad; + if(c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if(l <= Rune3) + goto bad; + if(l > Runemax) + goto bad; + *rune = l; + return 4; + } + } + + /* * bad decoding */ bad: @@ -98,7 +120,7 @@ /* * two character sequence - * 0080-07FF => T2 Tx + * 00080-007FF => T2 Tx */ if(c <= Rune2) { str[0] = T2 | (c >> 1*Bitx); @@ -108,12 +130,26 @@ /* * three character sequence - * 0800-FFFF => T3 Tx Tx + * 00800-0FFFF => T3 Tx Tx */ - str[0] = T3 | (c >> 2*Bitx); - str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); - return 3; + if(c > Runemax) + c = Runeerror; + if(c <= Rune3) { + str[0] = T3 | (c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence + * 010000-1FFFFF => T4 Tx Tx Tx + */ + str[0] = T4 | (c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; } int @@ -140,7 +176,10 @@ if(c <= Rune2) nb += 2; else + if(c <= Rune3 || c > Runemax) nb += 3; + else + nb += 4; } return nb; } @@ -150,13 +189,14 @@ { int c; - if(n > 0) { - c = *(uchar*)str; - if(c < Tx) - return 1; - if(n > 1) - if(c < T3 || n > 2) - return 1; - } - return 0; + if(n <= 0) + return 0; + c = *(uchar*)str; + if(c < Tx) + return 1; + if(c < T3) + return n >= 2; + if(UTFmax == 3 || c < T4) + return n >= 3; + return n >= 4; } diff -r 6fe89e1c4d07 -r a170d9d22808 sys/src/cmd/unix/drawterm/libc/utf.h --- a/sys/src/cmd/unix/drawterm/libc/utf.h Wed Mar 07 15:41:27 2012 +0000 +++ b/sys/src/cmd/unix/drawterm/libc/utf.h Tue Mar 06 17:08:30 2012 +0100 @@ -8,7 +8,8 @@ UTFmax = 3, /* maximum bytes per rune */ Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ Runeself = 0x80, /* rune and UTF sequences are the same (<) */ - Runeerror = 0x80, /* decoding error in UTF */ + Runeerror = 0xFFFD, /* decoding error in UTF */ + Runemax = 0xFFFF, /* 16 bit rune */ }; /* diff -r 6fe89e1c4d07 -r a170d9d22808 sys/src/cmd/unix/u9fs/plan9.h --- a/sys/src/cmd/unix/u9fs/plan9.h Wed Mar 07 15:41:27 2012 +0000 +++ b/sys/src/cmd/unix/u9fs/plan9.h Tue Mar 06 17:08:30 2012 +0100 @@ -97,7 +97,8 @@ UTFmax = 3, /* maximum bytes per rune */ Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ Runeself = 0x80, /* rune and UTF sequences are the same (<) */ - Runeerror = 0x80 /* decoding error in UTF */ + Runeerror = 0xFFFD, /* decoding error in UTF */ + Runemax = 0xFFFF, /* 16 bit rune */ }; extern int runetochar(char*, Rune*); diff -r 6fe89e1c4d07 -r a170d9d22808 sys/src/cmd/unix/u9fs/rune.c --- a/sys/src/cmd/unix/u9fs/rune.c Wed Mar 07 15:41:27 2012 +0000 +++ b/sys/src/cmd/unix/u9fs/rune.c Tue Mar 06 17:08:30 2012 +0100 @@ -8,27 +8,30 @@ Bit2 = 5, Bit3 = 4, Bit4 = 3, + Bit5 = 2, T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ - Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ - Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ - Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0011 1111 1111 1111 1111 1111 */ Maskx = (1< T4 Tx Tx Tx + */ + if(UTFmax >= 4) { + c3 = *(uchar*)(str+3) ^ Tx; + if(c3 & Testx) + goto bad; + if(c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if(l <= Rune3) + goto bad; + if(l > Runemax) + goto bad; + *rune = l; + return 4; + } + } + + /* * bad decoding */ bad: @@ -98,7 +120,7 @@ /* * two character sequence - * 0080-07FF => T2 Tx + * 00080-007FF => T2 Tx */ if(c <= Rune2) { str[0] = T2 | (c >> 1*Bitx); @@ -108,12 +130,26 @@ /* * three character sequence - * 0800-FFFF => T3 Tx Tx + * 00800-0FFFF => T3 Tx Tx */ - str[0] = T3 | (c >> 2*Bitx); - str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); - return 3; + if(c > Runemax) + c = Runeerror; + if(c <= Rune3) { + str[0] = T3 | (c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence + * 010000-1FFFFF => T4 Tx Tx Tx + */ + str[0] = T4 | (c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; } int @@ -127,6 +163,45 @@ } int +runenlen(Rune *r, int nrune) +{ + int nb, c; + + nb = 0; + while(nrune--) { + c = *r++; + if(c <= Rune1) + nb++; + else + if(c <= Rune2) + nb += 2; + else + if(c <= Rune3 || c > Runemax) + nb += 3; + else + nb += 4; + } + return nb; +} + +int +fullrune(char *str, int n) +{ + int c; + + if(n <= 0) + return 0; + c = *(uchar*)str; + if(c < Tx) + return 1; + if(c < T3) + return n >= 2; + if(UTFmax == 3 || c < T4) + return n >= 3; + return n >= 4; +} + +int utflen(char *s) { int c; @@ -144,5 +219,4 @@ s += chartorune(&rune, s); n++; } - return 0; } diff -r 6fe89e1c4d07 -r a170d9d22808 sys/src/cmd/vnc/devcons.c --- a/sys/src/cmd/vnc/devcons.c Wed Mar 07 15:41:27 2012 +0000 +++ b/sys/src/cmd/vnc/devcons.c Tue Mar 06 17:08:30 2012 +0100 @@ -158,7 +158,7 @@ kbdputc(int ch) { int n; - char buf[3]; + char buf[UTFmax]; Rune r; r = ch; diff -r 6fe89e1c4d07 -r a170d9d22808 sys/src/cmd/vnc/latin1.c --- a/sys/src/cmd/vnc/latin1.c Wed Mar 07 15:41:27 2012 +0000 +++ b/sys/src/cmd/vnc/latin1.c Tue Mar 06 17:08:30 2012 +0100 @@ -35,7 +35,9 @@ else return -1; } - return c; + if(c <= Runemax) + return c; + return -1; } /* diff -r 6fe89e1c4d07 -r a170d9d22808 sys/src/cmd/vnc/screen.c --- a/sys/src/cmd/vnc/screen.c Wed Mar 07 15:41:27 2012 +0000 +++ b/sys/src/cmd/vnc/screen.c Tue Mar 06 17:08:30 2012 +0100 @@ -356,7 +356,7 @@ { int i; Rune r; - char buf[4]; + char buf[UTFmax + 1]; drawlock(); while(n > 0){ diff -r 6fe89e1c4d07 -r a170d9d22808 sys/src/cmd/yacc.c --- a/sys/src/cmd/yacc.c Wed Mar 07 15:41:27 2012 +0000 +++ b/sys/src/cmd/yacc.c Tue Mar 06 17:08:30 2012 +0100 @@ -142,7 +142,7 @@ char* infile; /* input file name */ int numbval; /* value of an input number */ -char tokname[NAMESIZE+4]; /* input token name, slop for runes and 0 */ +char tokname[NAMESIZE+UTFmax+1]; /* input token name, slop for runes and 0 */ /* structure declarations */ diff -r 6fe89e1c4d07 -r a170d9d22808 sys/src/libbio/bgetrune.c --- a/sys/src/libbio/bgetrune.c Wed Mar 07 15:41:27 2012 +0000 +++ b/sys/src/libbio/bgetrune.c Tue Mar 06 17:08:30 2012 +0100 @@ -7,7 +7,7 @@ { int c, i; Rune rune; - char str[4]; + char str[UTFmax + 1]; c = Bgetc(bp); if(c < Runeself) { /* one char */ diff -r 6fe89e1c4d07 -r a170d9d22808 sys/src/libc/fmt/dofmt.c --- a/sys/src/libc/fmt/dofmt.c Wed Mar 07 15:41:27 2012 +0000 +++ b/sys/src/libc/fmt/dofmt.c Tue Mar 06 17:08:30 2012 +0100 @@ -512,12 +512,13 @@ int _badfmt(Fmt *f) { - char x[3]; + char x[2+UTFmax]; + int n; x[0] = '%'; - x[1] = f->r; - x[2] = '%'; - f->prec = 3; - _fmtcpy(f, x, 3, 3); + n = 1 + runetochar(x+1, (Rune*)&f->r); + x[n++] = '%'; + f->prec = n; + _fmtcpy(f, x, n, n); return 0; } diff -r 6fe89e1c4d07 -r a170d9d22808 sys/src/libc/port/rune.c --- a/sys/src/libc/port/rune.c Wed Mar 07 15:41:27 2012 +0000 +++ b/sys/src/libc/port/rune.c Tue Mar 06 17:08:30 2012 +0100 @@ -8,16 +8,19 @@ Bit2 = 5, Bit3 = 4, Bit4 = 3, + Bit5 = 2, T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ - Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ - Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ - Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0011 1111 1111 1111 1111 1111 */ Maskx = (1< T4 Tx Tx Tx + */ + if(UTFmax >= 4) { + c3 = *(uchar*)(str+3) ^ Tx; + if(c3 & Testx) + goto bad; + if(c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if(l <= Rune3) + goto bad; + if(l > Runemax) + goto bad; + *rune = l; + return 4; + } + } + + /* * bad decoding */ bad: @@ -98,7 +120,7 @@ /* * two character sequence - * 0080-07FF => T2 Tx + * 00080-007FF => T2 Tx */ if(c <= Rune2) { str[0] = T2 | (c >> 1*Bitx); @@ -108,12 +130,26 @@ /* * three character sequence - * 0800-FFFF => T3 Tx Tx + * 00800-0FFFF => T3 Tx Tx */ - str[0] = T3 | (c >> 2*Bitx); - str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); - return 3; + if(c > Runemax) + c = Runeerror; + if(c <= Rune3) { + str[0] = T3 | (c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence + * 010000-1FFFFF => T4 Tx Tx Tx + */ + str[0] = T4 | (c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; } int @@ -140,7 +176,10 @@ if(c <= Rune2) nb += 2; else + if(c <= Rune3 || c > Runemax) nb += 3; + else + nb += 4; } return nb; } @@ -150,13 +189,14 @@ { int c; - if(n > 0) { - c = *(uchar*)str; - if(c < Tx) - return 1; - if(n > 1) - if(c < T3 || n > 2) - return 1; - } - return 0; + if(n <= 0) + return 0; + c = *(uchar*)str; + if(c < Tx) + return 1; + if(c < T3) + return n >= 2; + if(UTFmax == 3 || c < T4) + return n >= 3; + return n >= 4; } diff -r 6fe89e1c4d07 -r a170d9d22808 sys/src/libdraw/buildfont.c --- a/sys/src/libdraw/buildfont.c Wed Mar 07 15:41:27 2012 +0000 +++ b/sys/src/libdraw/buildfont.c Tue Mar 06 17:08:30 2012 +0100 @@ -70,7 +70,7 @@ } max = strtol(s, &s, 0); s = skip(s); - if(*s==0 || min>=65536 || max>=65536 || min>max){ + if(*s==0 || min>Runemax || max>Runemax || min>max){ werrstr("illegal subfont range"); Err3: freefont(fnt); diff -r 6fe89e1c4d07 -r a170d9d22808 sys/src/libdraw/event.c --- a/sys/src/libdraw/event.c Wed Mar 07 15:41:27 2012 +0000 +++ b/sys/src/libdraw/event.c Tue Mar 06 17:08:30 2012 +0100 @@ -4,6 +4,10 @@ #include #include +enum { + Kbdmsgsz = 1 + 4 /* allow for 32-bit runes */ +}; + typedef struct Slave Slave; typedef struct Ebuf Ebuf; @@ -199,7 +203,7 @@ ekeyslave(int fd) { Rune r; - char t[3], k[10]; + char t[Kbdmsgsz], k[10]; int kr, kn, w; if(eforkslave(Ekeyboard) < MAXSLAVE) @@ -218,7 +222,9 @@ memmove(k, &k[w], kn); t[1] = r; t[2] = r>>8; - if(write(epipe[1], t, 3) != 3) + t[3] = r>>16; + t[4] = r>>24; + if(write(epipe[1], t, sizeof t) != sizeof t) break; } breakout:; @@ -302,7 +308,7 @@ s->head = (Ebuf *)1; return; } - if(i == Skeyboard && n != 3) + if(i == Skeyboard && n != Kbdmsgsz) drawerror(display, "events: protocol error: keyboard"); if(i == Smouse){ if(n < 1+1+2*12) @@ -417,13 +423,15 @@ int ekbd(void) { + uchar *t; + int c; Ebuf *eb; - int c; if(Skeyboard < 0) drawerror(display, "events: keyboard not initialzed"); eb = ebread(&eslave[Skeyboard]); - c = eb->buf[0] + (eb->buf[1]<<8); + t = eb->buf; + c = t[0] | t[1]<<8 | t[2]<<16 | t[3]<<24; free(eb); return c; } diff -r 6fe89e1c4d07 -r a170d9d22808 sys/src/libhttpd/httpunesc.c --- a/sys/src/libhttpd/httpunesc.c Wed Mar 07 15:41:27 2012 +0000 +++ b/sys/src/libhttpd/httpunesc.c Tue Mar 06 17:08:30 2012 +0100 @@ -10,26 +10,25 @@ char * httpunesc(HConnect *cc, char *s) { - char *t, *v; - int c; + char *t, *v, *p; + int c, n; Htmlesc *e; + Rune r; v = halloc(cc, UTFmax*strlen(s) + 1); for(t = v; c = *s;){ if(c == '&'){ - if(s[1] == '#' && s[2] && s[3] && s[4] && s[5] == ';'){ - c = atoi(s+2); - if(c < Runeself){ - *t++ = c; - s += 6; - continue; - } - if(c < 256 && c >= 161){ - e = &htmlesc[c-161]; - t += runetochar(t, &e->value); - s += 6; - continue; - } + if(s[1] == '#' && (n = strtoul(s+2, &p, 10)) != 0 && *p == ';'){ + r = n; + t += runetochar(t, &r); + s = p+1; + continue; + }else if(s[1] == '#' && (s[2] == 'x' || s[2] == 'X') && + (n = strtoul(s+3, &p, 16)) != 0 && *p == ';'){ + r = n; + t += runetochar(t, &r); + s = p+1; + continue; } else { for(e = htmlesc; e->name != nil; e++) if(strncmp(e->name, s, strlen(e->name)) == 0)