use wider runes Reference: /n/sources/patch/applied/rune8-drawterm Date: Mon Apr 29 23:44:26 CES 2013 Signed-off-by: geoff@plan9.bell-labs.com --- /sys/src/cmd/unix/drawterm/libc/utf.h Mon Apr 29 23:44:16 2013 +++ /sys/src/cmd/unix/drawterm/libc/utf.h Mon Apr 29 23:44:16 2013 @@ -1,14 +1,16 @@ #ifndef _UTFH_ #define _UTFH_ 1 -typedef unsigned short Rune; /* 16 bits */ +typedef unsigned int Rune; /* 32 bits */ enum { - UTFmax = 3, /* maximum bytes per rune */ + UTFmax = 4, /* maximum bytes per rune */ Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ Runeself = 0x80, /* rune and UTF sequences are the same (<) */ - Runeerror = 0x80, /* decoding error in UTF */ + Runeerror = 0xFFFD, /* decoding error in UTF */ + Runemax = 0x10FFFF, /* 21-bit rune */ + Runemask = 0x1FFFFF, /* bits used by runes (see grep) */ }; /* --- /sys/src/cmd/unix/drawterm/include/lib.h Mon Apr 29 23:44:16 2013 +++ /sys/src/cmd/unix/drawterm/include/lib.h Mon Apr 29 23:44:16 2013 @@ -27,7 +27,7 @@ typedef int p9_long; typedef signed char p9_schar; typedef unsigned short p9_ushort; -typedef unsigned short Rune; +typedef unsigned int Rune; typedef unsigned int p9_u32int; typedef p9_u32int mpdigit; @@ -50,10 +50,12 @@ enum { - UTFmax = 3, /* maximum bytes per rune */ + UTFmax = 4, /* maximum bytes per rune */ Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ Runeself = 0x80, /* rune and UTF sequences are the same (<) */ - Runeerror = 0x80 /* decoding error in UTF */ + Runeerror = 0xFFFD, /* decoding error in UTF */ + Runemax = 0x10FFFF, /* 21-bit rune */ + Runemask = 0x1FFFFF, /* bits used by runes (see grep) */ }; /* --- /sys/src/cmd/unix/drawterm/libc/rune.c Mon Apr 29 23:44:16 2013 +++ /sys/src/cmd/unix/drawterm/libc/rune.c Mon Apr 29 23:44:16 2013 @@ -1,28 +1,21 @@ #include #include +#define Bit(i) (7-(i)) +/* N 0's preceded by i 1's, T(Bit(2)) is 1100 0000 */ +#define T(i) (((1 << (Bit(i)+1))-1) ^ 0xFF) +/* 0000 0000 0000 0111 1111 1111 */ +#define RuneX(i) ((1 << (Bit(i) + ((i)-1)*Bitx))-1) + enum { - Bit1 = 7, - Bitx = 6, - Bit2 = 5, - Bit3 = 4, - Bit4 = 3, - - T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ - Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ - T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ - T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ - T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ - T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ - - Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ - Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */ - Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */ - Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0001 1111 1111 1111 1111 1111 */ + Bitx = Bit(1), + + Tx = T(1), /* 1000 0000 */ + Rune1 = (1<<(Bit(0)+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ - Maskx = (1< T1 - */ - c = *(uchar*)str; - if(c < Tx) { - *rune = c; - return 1; - } - - /* - * two character sequence * 00080-007FF => T2 Tx - */ - c1 = *(uchar*)(str+1) ^ Tx; - if(c1 & Testx) - goto bad; - if(c < T3) { - if(c < T2) - goto bad; - l = ((c << Bitx) | c1) & Rune2; - if(l <= Rune1) - goto bad; - *rune = l; - return 2; - } - - /* - * three character sequence * 00800-0FFFF => T3 Tx Tx + * 10000-10FFFF => T4 Tx Tx Tx */ - c2 = *(uchar*)(str+2) ^ Tx; - if(c2 & Testx) - goto bad; - if(c < T4) { - l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; - if(l <= Rune2) - goto bad; - if (SurrogateMin <= l && l <= SurrogateMax) - goto bad; - *rune = l; - return 3; + c[0] = *(uchar*)(str); + if(c[0] < Tx){ + *rune = c[0]; + return 1; } + l = c[0]; - /* - * four character sequence - * 10000-10FFFF => T4 Tx Tx Tx - */ - if(UTFmax >= 4) { - c3 = *(uchar*)(str+3) ^ Tx; - if(c3 & Testx) + for(i = 1; i < UTFmax; i++) { + c[i] = *(uchar*)(str+i); + c[i] ^= Tx; + if(c[i] & Testx) goto bad; - if(c < T5) { - l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; - if(l <= Rune3) + l = (l << Bitx) | c[i]; + if(c[0] < T(i + 2)) { + l &= RuneX(i + 1); + if(i == 1) { + if(c[0] < T(2) || l <= Rune1) + goto bad; + } else if(l <= RuneX(i) || l > Runemax) goto bad; - if(l > Runemax) + if (i == 2 && SurrogateMin <= l && l <= SurrogateMax) goto bad; *rune = l; - return 4; + return i + 1; } } @@ -111,12 +75,9 @@ int runetochar(char *str, Rune *rune) { - long c; + int i, j; + Rune c; - /* - * one character sequence - * 00000-0007F => 00-7F - */ c = *rune; if(c <= Rune1) { str[0] = c; @@ -124,45 +85,35 @@ } /* + * one character sequence + * 00000-0007F => 00-7F * two character sequence * 0080-07FF => T2 Tx - */ - if(c <= Rune2) { - str[0] = T2 | (c >> 1*Bitx); - str[1] = Tx | (c & Maskx); - return 2; - } - /* - * If the Rune is out of range or a surrogate half, convert it to the error rune. - * Do this test here because the error rune encodes to three bytes. - * Doing it earlier would duplicate work, since an out of range - * Rune wouldn't have fit in one or two bytes. - */ - if (c > Runemax) - c = Runeerror; - if (SurrogateMin <= c && c <= SurrogateMax) - c = Runeerror; - - /* * three character sequence * 0800-FFFF => T3 Tx Tx - */ - if (c <= Rune3) { - str[0] = T3 | (c >> 2*Bitx); - str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); - return 3; - } - - /* * four character sequence (21-bit value) * 10000-1FFFFF => T4 Tx Tx Tx + * If the Rune is out of range or a surrogate half, + * convert it to the error rune. + * Do this test when i==3 because the error rune encodes to three bytes. + * Doing it earlier would duplicate work, since an out of range + * Rune wouldn't have fit in one or two bytes. */ - str[0] = T4 | (c >> 3*Bitx); - str[1] = Tx | ((c >> 2*Bitx) & Maskx); - str[2] = Tx | ((c >> 1*Bitx) & Maskx); - str[3] = Tx | (c & Maskx); - return 4; + for(i = 2; i < UTFmax + 1; i++){ + if(i == 3){ + if(c > Runemax) + c = Runeerror; + if(SurrogateMin <= c && c <= SurrogateMax) + c = Runeerror; + } + if (c <= RuneX(i) || i == UTFmax ) { + str[0] = T(i) | (c >> (i - 1)*Bitx); + for(j = 1; j < i; j++) + str[j] = Tx | ((c >> (i - j - 1)*Bitx) & Maskx); + return i; + } + } + return UTFmax; } int @@ -178,19 +129,21 @@ int runenlen(Rune *r, int nrune) { - int nb, c; + int nb, i; + Rune c; nb = 0; while(nrune--) { c = *r++; - if(c <= Rune1) + if(c <= Rune1){ nb++; - else if(c <= Rune2) - nb += 2; - else if(c <= Rune3) - nb += 3; - else - nb += 4; + } else { + for(i = 2; i < UTFmax + 1; i++) + if(c <= RuneX(i) || i == UTFmax){ + nb += i; + break; + } + } } return nb; } @@ -198,15 +151,16 @@ int fullrune(char *str, int n) { - int c; + int i; + Rune c; + if(n <= 0) return 0; c = *(uchar*)str; if(c < Tx) return 1; - if(c < T3) - return n >= 2; - if(c < T4) - return n >= 3; - return n >= 4; + for(i = 3; i < UTFmax + 1; i++) + if(c < T(i)) + return n >= i - 1; + return n >= UTFmax; }