further documentation. add code, largely optimized away, that documents how to support 21-bit runes (code points up to 0x10ffff). this code is not enabled; enabling it would require changing Rune to uint in u.h and Runemax to 0x10ffff in libc.h. Notes: Mon Apr 29 20:43:28 EDT 2013 geoff being done independently. Reference: /n/sources/patch/sorry/bloated-rune-size2 Date: Sat Nov 28 07:16:46 CET 2009 Signed-off-by: quanstro@quanstro.net Reviewed-by: geoff --- /sys/src/cmd/unix/drawterm/libc/rune.c Sat Nov 28 07:08:28 2009 +++ /sys/src/cmd/unix/drawterm/libc/rune.c Sat Nov 28 07:08:27 2009 @@ -8,16 +8,19 @@ Bit2 = 5, Bit3 = 4, Bit4 = 3, + Bit5 = 2, T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ - Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ - Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ - Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0011 1111 1111 1111 1111 1111 */ Maskx = (1< T4 Tx Tx Tx + */ + if(UTFmax >= 4) { + c3 = *(uchar*)(str+3) ^ Tx; + if(c3 & Testx) + goto bad; + if(c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if(l <= Rune3) + goto bad; + if(l > Runemax) + goto bad; + *rune = l; + return 4; + } + } + + /* * bad decoding */ bad: @@ -98,7 +120,7 @@ /* * two character sequence - * 0080-07FF => T2 Tx + * 00080-007FF => T2 Tx */ if(c <= Rune2) { str[0] = T2 | (c >> 1*Bitx); @@ -108,12 +130,26 @@ /* * three character sequence - * 0800-FFFF => T3 Tx Tx + * 00800-0FFFF => T3 Tx Tx + */ + if(c > Runemax) + c = Runeerror; + if(c <= Rune3) { + str[0] = T3 | (c >> 
2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence + * 010000-1FFFFF => T4 Tx Tx Tx */ - str[0] = T3 | (c >> 2*Bitx); - str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); - return 3; + str[0] = T4 | (c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; } int @@ -140,7 +176,10 @@ if(c <= Rune2) nb += 2; else + if(c <= Rune3 || c > Runemax) nb += 3; + else + nb += 4; } return nb; } @@ -150,13 +189,14 @@ { int c; - if(n > 0) { - c = *(uchar*)str; - if(c < Tx) - return 1; - if(n > 1) - if(c < T3 || n > 2) - return 1; - } - return 0; + if(n <= 0) + return 0; + c = *(uchar*)str; + if(c < Tx) + return 1; + if(c < T3) + return n >= 2; + if(UTFmax == 3 || c < T4) + return n >= 3; + return n >= 4; } --- /sys/src/cmd/unix/u9fs/rune.c Sat Nov 28 07:08:34 2009 +++ /sys/src/cmd/unix/u9fs/rune.c Sat Nov 28 07:08:33 2009 @@ -8,27 +8,30 @@ Bit2 = 5, Bit3 = 4, Bit4 = 3, + Bit5 = 2, T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ - Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ - Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ - Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0011 1111 1111 1111 1111 1111 */ Maskx = (1< T4 Tx Tx Tx + */ + if(UTFmax >= 4) { + c3 = *(uchar*)(str+3) ^ Tx; + if(c3 & Testx) + goto bad; + if(c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if(l <= Rune3) + 
goto bad; + if(l > Runemax) + goto bad; + *rune = l; + return 4; + } + } + + /* * bad decoding */ bad: @@ -98,7 +120,7 @@ /* * two character sequence - * 0080-07FF => T2 Tx + * 00080-007FF => T2 Tx */ if(c <= Rune2) { str[0] = T2 | (c >> 1*Bitx); @@ -108,12 +130,26 @@ /* * three character sequence - * 0800-FFFF => T3 Tx Tx + * 00800-0FFFF => T3 Tx Tx + */ + if(c > Runemax) + c = Runeerror; + if(c <= Rune3) { + str[0] = T3 | (c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence + * 010000-1FFFFF => T4 Tx Tx Tx */ - str[0] = T3 | (c >> 2*Bitx); - str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); - return 3; + str[0] = T4 | (c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; } int @@ -127,22 +163,40 @@ } int -utflen(char *s) +runenlen(Rune *r, int nrune) { - int c; - long n; - Rune rune; + int nb, c; - n = 0; - for(;;) { - c = *(uchar*)s; - if(c < Runeself) { - if(c == 0) - return n; - s++; - } else - s += chartorune(&rune, s); - n++; + nb = 0; + while(nrune--) { + c = *r++; + if(c <= Rune1) + nb++; + else + if(c <= Rune2) + nb += 2; + else + if(c <= Rune3 || c > Runemax) + nb += 3; + else + nb += 4; } - return 0; + return nb; +} + +int +fullrune(char *str, int n) +{ + int c; + + if(n <= 0) + return 0; + c = *(uchar*)str; + if(c < Tx) + return 1; + if(c < T3) + return n >= 2; + if(UTFmax == 3 || c < T4) + return n >= 3; + return n >= 4; } --- /sys/src/ape/lib/utf/rune.c Sat Nov 28 07:08:42 2009 +++ /sys/src/ape/lib/utf/rune.c Sat Nov 28 07:08:41 2009 @@ -23,16 +23,19 @@ Bit2 = 5, Bit3 = 4, Bit4 = 3, + Bit5 = 2, T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 
1000 */ - Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ - Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ - Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0011 1111 1111 1111 1111 1111 */ Maskx = (1< T4 Tx Tx Tx + */ + if(UTFmax >= 4) { + c3 = *(uchar*)(str+3) ^ Tx; + if(c3 & Testx) + goto bad; + if(c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if(l <= Rune3) + goto bad; + if(l > Runemax) + goto bad; + *rune = l; + return 4; + } + } + + /* * bad decoding */ bad: @@ -113,7 +135,7 @@ /* * two character sequence - * 0080-07FF => T2 Tx + * 00080-007FF => T2 Tx */ if(c <= Rune2) { str[0] = T2 | (c >> 1*Bitx); @@ -123,12 +145,26 @@ /* * three character sequence - * 0800-FFFF => T3 Tx Tx + * 00800-0FFFF => T3 Tx Tx + */ + if(c > Runemax) + c = Runeerror; + if(c <= Rune3) { + str[0] = T3 | (c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence + * 010000-1FFFFF => T4 Tx Tx Tx */ - str[0] = T3 | (c >> 2*Bitx); - str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); - return 3; + str[0] = T4 | (c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; } int @@ -155,7 +191,10 @@ if(c <= Rune2) nb += 2; else + if(c <= Rune3 || c > Runemax) nb += 3; + else + nb += 4; } return nb; } @@ -165,13 +204,14 @@ { int c; - if(n > 0) { - c = *(uchar*)str; - if(c < Tx) - return 1; - if(n > 1) - if(c < T3 || n > 2) - return 1; - } - return 0; + if(n <= 0) + return 0; + c = *(uchar*)str; + if(c < Tx) + return 1; + if(c < T3) + return n >= 2; + if(UTFmax == 3 || c < T4) + return n >= 3; + return n >= 4; } --- 
/sys/src/libc/port/rune.c Sat Nov 28 07:08:51 2009 +++ /sys/src/libc/port/rune.c Sat Nov 28 07:08:50 2009 @@ -8,16 +8,19 @@ Bit2 = 5, Bit3 = 4, Bit4 = 3, + Bit5 = 2, T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ - Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ - Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ - Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0011 1111 1111 1111 1111 1111 */ Maskx = (1< T4 Tx Tx Tx + */ + if(UTFmax >= 4) { + c3 = *(uchar*)(str+3) ^ Tx; + if(c3 & Testx) + goto bad; + if(c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if(l <= Rune3) + goto bad; + if(l > Runemax) + goto bad; + *rune = l; + return 4; + } + } + + /* * bad decoding */ bad: @@ -98,7 +120,7 @@ /* * two character sequence - * 0080-07FF => T2 Tx + * 00080-007FF => T2 Tx */ if(c <= Rune2) { str[0] = T2 | (c >> 1*Bitx); @@ -108,12 +130,26 @@ /* * three character sequence - * 0800-FFFF => T3 Tx Tx + * 00800-0FFFF => T3 Tx Tx + */ + if(c > Runemax) + c = Runeerror; + if(c <= Rune3) { + str[0] = T3 | (c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence + * 010000-1FFFFF => T4 Tx Tx Tx */ - str[0] = T3 | (c >> 2*Bitx); - str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); - return 3; + str[0] = T4 | (c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; } int @@ -140,7 +176,10 @@ 
if(c <= Rune2) nb += 2; else + if(c <= Rune3 || c > Runemax) nb += 3; + else + nb += 4; } return nb; } @@ -150,13 +189,14 @@ { int c; - if(n > 0) { - c = *(uchar*)str; - if(c < Tx) - return 1; - if(n > 1) - if(c < T3 || n > 2) - return 1; - } - return 0; + if(n <= 0) + return 0; + c = *(uchar*)str; + if(c < Tx) + return 1; + if(c < T3) + return n >= 2; + if(UTFmax == 3 || c < T4) + return n >= 3; + return n >= 4; } --- /sys/src/ape/lib/ap/gen/mbwc.c Sat Nov 28 07:09:01 2009 +++ /sys/src/ape/lib/ap/gen/mbwc.c Sat Nov 28 07:08:59 2009 @@ -1,4 +1,5 @@ #include +#include /* * Use the FSS-UTF transformation proposed by posix. @@ -7,12 +8,14 @@ * Tx 10xxxxxx 6 free bits * T1 110xxxxx 5 free bits * T2 1110xxxx 4 free bits + * T3 11110xxx 3 free bits * * Encoding is as follows. * From hex Thru hex Sequence Bits - * 00000000 0000007F T0 7 - * 00000080 000007FF T1 Tx 11 + * 00000000 0000007F T0 7 + * 00000080 000007FF T1 Tx 11 * 00000800 0000FFFF T2 Tx Tx 16 + * 00010000 001FFFFF T3 Tx Tx Tx 21 */ int @@ -25,7 +28,7 @@ int mbtowc(wchar_t *pwc, const char *s, size_t n) { - int c, c1, c2; + int c, c1, c2, c3; long l; if(!s) @@ -68,6 +71,24 @@ if(pwc) *pwc = l; return 3; + } + + if(n < 4) + goto bad; + if(UTFmax >= 4) { + c3 = (s[3] ^ 0x80) & 0xff; + if(c3 & 0xC0) + goto bad; + if(c < 0xf8) { + l = ((((((c << 6) | c1) << 6) | c2) << 6) | c3) & 0x3fffff; + if(l <= 0x10000) + goto bad; + if(l > Runemax) + goto bad; + if(pwc) + *pwc = l; + return 4; + } } /* --- /sys/src/libdraw/event.c Sat Nov 28 07:09:12 2009 +++ /sys/src/libdraw/event.c Sat Nov 28 07:09:11 2009 @@ -4,6 +4,10 @@ #include #include +enum { + Kbdmsgsz = 1 + 4 /* allow for 32-bit runes */ +}; + typedef struct Slave Slave; typedef struct Ebuf Ebuf; @@ -199,7 +203,7 @@ ekeyslave(int fd) { Rune r; - char t[3], k[10]; + char t[Kbdmsgsz], k[10]; int kr, kn, w; if(eforkslave(Ekeyboard) < MAXSLAVE) @@ -218,7 +222,9 @@ memmove(k, &k[w], kn); t[1] = r; t[2] = r>>8; - if(write(epipe[1], t, 3) != 3) + t[3] = 
r>>16; + t[4] = r>>24; + if(write(epipe[1], t, sizeof t) != sizeof t) break; } breakout:; @@ -302,7 +308,7 @@ s->head = (Ebuf *)1; return; } - if(i == Skeyboard && n != 3) + if(i == Skeyboard && n != Kbdmsgsz) drawerror(display, "events: protocol error: keyboard"); if(i == Smouse){ if(n < 1+1+2*12) @@ -417,13 +423,15 @@ int ekbd(void) { - Ebuf *eb; + uchar *t; int c; + Ebuf *eb; if(Skeyboard < 0) drawerror(display, "events: keyboard not initialzed"); eb = ebread(&eslave[Skeyboard]); - c = eb->buf[0] + (eb->buf[1]<<8); + t = eb->buf; + c = t[0] | t[1]<<8 | t[2]<<16 | t[3]<<24; free(eb); return c; }