use UTFmax and Runemax for documentation instead of the constants 3 and 0xffff. in a few cases Runeerror was still mistakenly 0x80. the files /sys/src/cmd/postscript/common/rune* should be removed. 20091128 i realized that i included the function "isdigitrune" by mistake in /sys/include/libc.h in this submission. Notes: Mon Apr 29 20:43:28 EDT 2013 geoff being done independently. Reference: /n/sources/patch/sorry/bloated-rune-size Date: Sat Nov 28 18:27:27 CET 2009 Signed-off-by: quanstro@quanstro.net Reviewed-by: geoff --- /sys/include/ape/utf.h Sat Nov 28 06:56:29 2009 +++ /sys/include/ape/utf.h Sat Nov 28 06:56:28 2009 @@ -14,7 +14,8 @@ UTFmax = 3, /* maximum bytes per rune */ Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ Runeself = 0x80, /* rune and UTF sequences are the same (<) */ - Runeerror = 0x80, /* decoding error in UTF */ + Runeerror = 0xFFFD, /* decoding error in UTF */ + Runemax = 0xFFFF, /* 16 bit rune */ }; /* --- /sys/include/libc.h Sat Nov 28 06:56:37 2009 +++ /sys/include/libc.h Sat Nov 28 06:56:35 2009 @@ -45,6 +45,7 @@ Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ Runeself = 0x80, /* rune and UTF sequences are the same (<) */ Runeerror = 0xFFFD, /* decoding error in UTF */ + Runemax = 0xFFFF, /* 16 bit rune */ }; /* @@ -79,6 +80,7 @@ extern Rune totitlerune(Rune); extern Rune toupperrune(Rune); extern int isalpharune(Rune); +extern int isdigitrune(Rune); extern int islowerrune(Rune); extern int isspacerune(Rune); extern int istitlerune(Rune); --- /sys/src/9/port/chan.c Sat Nov 28 06:56:48 2009 +++ /sys/src/9/port/chan.c Sat Nov 28 06:56:45 2009 @@ -1263,7 +1263,7 @@ if(name <= aname) panic("bad math in namelenerror"); /* walk out of current UTF sequence */ - for(i=0; (*name&0xC0)==0x80 && i<3; i++) + for(i=0; (*name&0xC0)==0x80 && i<UTFmax; i++) name++; } snprint(up->genbuf, sizeof up->genbuf, "...%.*s", @@ -1688,8 +1688,8 @@ if((ulong)name < KZERO){ validaddr((ulong)name, 1, 0); if(!dup) - print("warning: validname called from %lux with 
user pointer", pc); - ename = vmemchr(name, 0, (1<<16)); + print("warning: validname called from %#p with user pointer", pc); + ename = vmemchr(name, 0, 1<<16); }else ename = memchr(name, 0, (1<<16)); --- /sys/src/9/port/lib.h Sat Nov 28 06:56:57 2009 +++ /sys/src/9/port/lib.h Sat Nov 28 06:56:55 2009 @@ -35,10 +35,11 @@ enum { - UTFmax = 3, /* maximum bytes per rune */ - Runesync = 0x80, /* cannot represent part of a UTF sequence */ - Runeself = 0x80, /* rune and UTF sequences are the same (<) */ - Runeerror = 0x80, /* decoding error in UTF */ + UTFmax = 3, /* maximum bytes per rune */ + Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ + Runeself = 0x80, /* rune and UTF sequences are the same (<) */ + Runeerror = 0xFFFD, /* decoding error in UTF */ + Runemax = 0xFFFF, /* 16 bit rune */ }; /* @@ -81,22 +82,30 @@ #pragma varargck argpos fmtprint 2 #pragma varargck argpos print 1 +#pragma varargck argpos fprint 2 #pragma varargck argpos seprint 3 #pragma varargck argpos snprint 3 #pragma varargck argpos sprint 2 +#pragma varargck type "llb" vlong #pragma varargck type "lld" vlong #pragma varargck type "llx" vlong +#pragma varargck type "llb" uvlong #pragma varargck type "lld" uvlong #pragma varargck type "llx" uvlong +#pragma varargck type "lx" void* +#pragma varargck type "lb" long #pragma varargck type "ld" long #pragma varargck type "lx" long +#pragma varargck type "lb" ulong #pragma varargck type "ld" ulong #pragma varargck type "lx" ulong +#pragma varargck type "b" int #pragma varargck type "d" int #pragma varargck type "x" int #pragma varargck type "c" int #pragma varargck type "C" int +#pragma varargck type "b" uint #pragma varargck type "d" uint #pragma varargck type "x" uint #pragma varargck type "c" uint @@ -113,6 +122,7 @@ extern void quotefmtinstall(void); extern int fmtprint(Fmt*, char*, ...); extern int fmtstrcpy(Fmt*, char*); +extern int encodefmt(Fmt*); /* * one-of-a-kind @@ -130,7 +140,6 @@ extern int getfields(char*, char**, int, 
int, char*); extern int tokenize(char*, char**, int); extern int dec64(uchar*, int, char*, int); -extern int encodefmt(Fmt*); extern void qsort(void*, long, long, int (*)(void*, void*)); /* --- /sys/src/boot/alphapc/lib.h Sat Nov 28 06:57:07 2009 +++ /sys/src/boot/alphapc/lib.h Sat Nov 28 06:57:05 2009 @@ -26,10 +26,11 @@ enum { - UTFmax = 3, /* maximum bytes per rune */ - Runesync = 0x80, /* cannot represent part of a UTF sequence */ - Runeself = 0x80, /* rune and UTF sequences are the same (<) */ - Runeerror = 0x80, /* decoding error in UTF */ + UTFmax = 3, /* maximum bytes per rune */ + Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ + Runeself = 0x80, /* rune and UTF sequences are the same (<) */ + Runeerror = 0xFFFD, /* decoding error in UTF */ + Runemax = 0xFFFF, /* 16 bit rune */ }; /* --- /sys/src/cmd/auth/authsrv.c Sat Nov 28 06:57:19 2009 +++ /sys/src/cmd/auth/authsrv.c Sat Nov 28 06:57:17 2009 @@ -725,9 +725,10 @@ { uchar buf[512]; int i; + Rune r; + /* assume 16 bit runes; cf. 
factotum/chap.c */ for (i = 0; *passwd && i + 1 < sizeof(buf);) { - Rune r; passwd += chartorune(&r, passwd); buf[i++] = r; buf[i++] = r >> 8; @@ -809,8 +810,10 @@ snprint(notuser, sizeof notuser, "!%s", user); for(ntp = tp; ntp; ntp = ntp->entry) if(strcmp(ntp->attr, "uid") == 0){ - if(strcmp(ntp->val, notuser) == 0) + if(strcmp(ntp->val, notuser) == 0){ + ok = 0; break; + } if(*ntp->val == '*' || strcmp(ntp->val, user) == 0) ok = 1; } --- /sys/src/cmd/auth/factotum/chap.c Sat Nov 28 06:57:32 2009 +++ /sys/src/cmd/auth/factotum/chap.c Sat Nov 28 06:57:30 2009 @@ -385,10 +385,11 @@ uchar digest[MD4dlen]; uchar *w, unipass[256]; - // Standard says unlimited length, experience says 128 max + /* Standard says unlimited length, experience says 128 max */ if ((n = strlen(pass)) > 128) n = 128; + /* assume 16 bit Rune */ for(i=0, w=unipass; i < n; i++) { pass += chartorune(&r, pass); *w++ = r & 0xff; --- /sys/src/cmd/dossrv/dossubs.c Sat Nov 28 06:57:48 2009 +++ /sys/src/cmd/dossrv/dossubs.c Sat Nov 28 06:57:45 2009 @@ -1129,6 +1129,13 @@ *p = 0; } +Rune +utfbe16(uchar *buf) +{ + /* not extended-plane capable */ + return buf[0] | buf[1]<<8; +} + static char* getnamerunes(char *dst, uchar *buf, int step) { @@ -1139,15 +1146,15 @@ d = dbuf; r = 1; for(i = 1; r && i < 11; i += 2){ - r = buf[i] | (buf[i+1] << 8); + r = utfbe16(buf + i); d += runetochar(d, &r); } for(i = 14; r && i < 26; i += 2){ - r = buf[i] | (buf[i+1] << 8); + r = utfbe16(buf + i); d += runetochar(d, &r); } for(i = 28; r && i < 32; i += 2){ - r = buf[i] | (buf[i+1] << 8); + r = utfbe16(buf + i); d += runetochar(d, &r); } --- /sys/src/cmd/htmlroff/char.c Sat Nov 28 06:58:03 2009 +++ /sys/src/cmd/htmlroff/char.c Sat Nov 28 06:58:02 2009 @@ -1,6 +1,10 @@ #include "a.h" /* + * hopeless if runes are not 16 bits + */ + +/* * Translate Unicode to HTML by asking tcs(1). * This way we don't have yet another table. 
*/ --- /sys/src/cmd/ms2html.c Sat Nov 28 06:58:22 2009 +++ /sys/src/cmd/ms2html.c Sat Nov 28 06:58:18 2009 @@ -647,7 +647,7 @@ free(d); } -/* get next logical byte. from stdin or a defined string */ +/* get next logical rune. from stdin or a defined string */ int getrune(void) { --- /sys/src/cmd/proof/font.c Sat Nov 28 06:58:39 2009 +++ /sys/src/cmd/proof/font.c Sat Nov 28 06:58:38 2009 @@ -362,7 +362,7 @@ return 0; } dprint(2, "map %S to %s font# %d\n", rp, s, font); - s[runetochar(s, &r)] = 0; + s[runetochar(s, &r)] = 0; /* looks wrong -quanstro */ return s; } --- /sys/src/cmd/sam/cmd.c Sat Nov 28 06:58:58 2009 +++ /sys/src/cmd/sam/cmd.c Sat Nov 28 06:58:56 2009 @@ -71,7 +71,7 @@ inputc(void) { int n, nbuf; - char buf[3]; + char buf[UTFmax]; Rune r; Again: --- /sys/src/cmd/tr.c Sat Nov 28 06:59:17 2009 +++ /sys/src/cmd/tr.c Sat Nov 28 06:59:16 2009 @@ -15,10 +15,8 @@ #define CLEARBIT(a,c) ((a)[(c)/8] &= ~bits[(c)&07]) #define BITSET(a,c) ((a)[(c)/8] & bits[(c)&07]) -#define MAXRUNE 0xFFFF - -uchar f[(MAXRUNE+1)/8]; -uchar t[(MAXRUNE+1)/8]; +uchar f[(Runemax+1)/8]; +uchar t[(Runemax+1)/8]; char wbuf[4096]; char *wptr; --- /sys/src/cmd/troff2html/troff2html.c Sat Nov 28 06:59:38 2009 +++ /sys/src/cmd/troff2html/troff2html.c Sat Nov 28 06:59:37 2009 @@ -14,7 +14,7 @@ typedef struct HTMLfont HTMLfont; /* - * a Char is >= 32 bits. low 16 bits are the rune. higher are attributes. + * a Char is >= 32 bits. low 24 bits are the rune. higher are attributes. * must be able to hold a pointer. 
*/ enum @@ -32,7 +32,7 @@ enum /* magic emissions */ { Estring = 0, - Epp = 1<<16, + Epp = 1<<24, }; int attrorder[] = { Indent1, Indent2, Indent3, Heading, Anchor, Italic, Bold, CW }; --- /sys/src/cmd/unicode.c Sat Nov 28 07:00:00 2009 +++ /sys/src/cmd/unicode.c Sat Nov 28 06:59:59 2009 @@ -51,13 +51,13 @@ return "bad range"; } min = strtoul(q, &q, 16); - if(min<0 || min>0xFFFF || *q!='-') + if(min<0 || min>Runemax || *q!='-') goto err; q++; if(strchr(hex, *q) == 0) goto err; max = strtoul(q, &q, 16); - if(max<0 || max>0xFFFF || max<min || *q!=0) + if(max<0 || max>Runemax || max<min || *q!=0) [... trailing context of this hunk, the next hunk header, and its leading context were lost in extraction ...] - if(m<0 || m>0xFFFF || *q!=0) + if(m<0 || m>Runemax || *q!=0) goto err; Bprint(&bout, "%C", m); if(!text) --- /sys/src/cmd/unix/drawterm/libc/utf.h Sat Nov 28 07:00:23 2009 +++ /sys/src/cmd/unix/drawterm/libc/utf.h Sat Nov 28 07:00:21 2009 @@ -8,7 +8,8 @@ UTFmax = 3, /* maximum bytes per rune */ Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ Runeself = 0x80, /* rune and UTF sequences are the same (<) */ - Runeerror = 0x80, /* decoding error in UTF */ + Runeerror = 0xFFFD, /* decoding error in UTF */ + Runemax = 0xFFFF, /* 16 bit rune */ }; /* --- /sys/src/cmd/unix/u9fs/plan9.h Sat Nov 28 07:00:47 2009 +++ /sys/src/cmd/unix/u9fs/plan9.h Sat Nov 28 07:00:45 2009 @@ -97,7 +97,8 @@ UTFmax = 3, /* maximum bytes per rune */ Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ Runeself = 0x80, /* rune and UTF sequences are the same (<) */ - Runeerror = 0x80 /* decoding error in UTF */ + Runeerror = 0xFFFD, /* decoding error in UTF */ + Runemax = 0xFFFF, /* 16 bit rune */ }; extern int runetochar(char*, Rune*); --- /sys/src/cmd/vnc/devcons.c Sat Nov 28 07:01:12 2009 +++ /sys/src/cmd/vnc/devcons.c Sat Nov 28 07:01:10 2009 @@ -158,7 +158,7 @@ kbdputc(int ch) { int n; - char buf[3]; + char buf[UTFmax]; Rune r; r = ch; --- /sys/src/cmd/vnc/screen.c Sat Nov 28 07:01:38 2009 +++ /sys/src/cmd/vnc/screen.c Sat Nov 28 07:01:37 2009 @@ -356,7 +356,7 @@ { int i; Rune r; - char buf[4]; + char buf[UTFmax + 1]; drawlock(); 
while(n > 0){ --- /sys/src/cmd/yacc.c Sat Nov 28 07:02:11 2009 +++ /sys/src/cmd/yacc.c Sat Nov 28 07:02:06 2009 @@ -141,7 +141,7 @@ char* infile; /* input file name */ int numbval; /* value of an input number */ -char tokname[NAMESIZE+4]; /* input token name, slop for runes and 0 */ +char tokname[NAMESIZE+UTFmax+1]; /* input token name, slop for runes and 0 */ /* structure declarations */ @@ -1918,17 +1918,22 @@ /* i is the number of lines skipped */ i = 0; - if(Bgetrune(finput) != '*') - error("illegal comment"); c = Bgetrune(finput); - while(c != Beof) { - while(c == '*') - if((c=Bgetrune(finput)) == '/') - return i; - if(c == '\n') - i++; - c = Bgetrune(finput); - } + if(c == '/'){ + while((c = Bgetrune(finput)) != Beof) + if(c == '\n') + return 1; + }else if(c == '*'){ + while((c = Bgetrune(finput)) != Beof) { + while(c == '*') + if((c=Bgetrune(finput)) == '/') + return i; + if(c == '\n') + i++; + } + }else + error("illegal comment"); + error("EOF inside comment"); return 0; } --- /sys/src/libbio/bgetrune.c Sat Nov 28 07:02:39 2009 +++ /sys/src/libbio/bgetrune.c Sat Nov 28 07:02:38 2009 @@ -7,7 +7,7 @@ { int c, i; Rune rune; - char str[4]; + char str[UTFmax + 1]; c = Bgetc(bp); if(c < Runeself) { /* one char */ --- /sys/src/libhtml/lex.c Sat Nov 28 07:03:11 2009 +++ /sys/src/libhtml/lex.c Sat Nov 28 07:03:08 2009 @@ -1312,7 +1312,7 @@ ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i); n = chartorune(&r, (char*)(buf+ts->i)); if(ok) { - if(warn && c == 0x80) + if(warn && c == Runeerror) fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]); ts->i += n; c = r;