improve html entity recognition. 1. recognize the new unicode references like &#[xX][0-9a-fA-F]+. cf. http://www.unicode.org. 2. be very careful about determining the end of an entity reference. entities are a bit more restricted than html/xml CNAMEs, containing only [a-zA-Z0-9]. anything outside that is the end of a reference. this allows us to recognize "&
" as "&
" as the standard indicates. 3. no longer try substrings of the recognized entity name. this prevents us from fouling common cgi arguments like http://site.com?pie=x (interpreted as http://site.com?πe=x). washingtonpost.com has examples of this. Reference: /n/sources/patch/applied/libhtml-entities Date: Fri Jul 14 18:15:44 CES 2006 Signed-off-by: quanstro@quanstro.net --- /sys/src/libhtml/lex.c Fri Jul 14 18:08:58 2006 +++ /sys/src/libhtml/lex.c Fri Jul 14 18:17:04 2006 @@ -1196,8 +1196,7 @@ // We've just read an '&'; look for an entity reference // name, and if found, return translated char. // if there is a complete entity name but it isn't known, -// try prefixes (gets around some buggy HTML out there), -// and if that fails, back up to just past the '&' and return '&'. +// back up to just past the '&' and return '&'. // If the entity can't be completed in the current buffer, back up // to the '&' and return -1. static int @@ -1208,7 +1207,6 @@ int fnd; int ans; int v; - int i; int k; Rune buf[SMALLBUFSIZE]; @@ -1219,12 +1217,23 @@ if(c == '#') { c = getchar(ts); v = 0; - while(c >= 0) { - if(!(c < 256 && isdigit(c))) - break; - v = v*10 + c - 48; - c = getchar(ts); - } + if(c == 'X' || c == 'x') + for(c = getchar(ts); c < 256; c = getchar(ts)) + if(c >= '0' && c <= '9') + v = v*16+c-'0'; + else if(c >= 'A' && c<= 'F') + v = v*16+c-'A'+10; + else if(c >= 'a' && c <= 'f') + v = v*16+c-'a'+10; + else + break; + else + while(c >= 0) { + if(!(c < 256 && isdigit(c))) + break; + v = v*10 + c - 48; + c = getchar(ts); + } if(c >= 0) { if(!(c == ';' || c == '\n' || c == '\r')) ungetchar(ts, c); @@ -1245,7 +1254,7 @@ c = getchar(ts); if(c < 0) break; - if(ISNAMCHAR(c)) { + if(c < 256 && (isalpha(c) || isdigit(c))) { if(k < SMALLBUFSIZE-1) buf[k++] = c; } @@ -1255,25 +1264,8 @@ break; } } - if(c >= 0) { + if(c >= 256 || c != '=' && !(isalpha(c) || isdigit(c))) fnd = _lookup(chartab, NCHARTAB, buf, k, &ans); - if(!fnd) { - // Try prefixes of s - if(c == ';' || c == '\n' || 
c == '\r') - ungetchar(ts, c); - i = k; - while(--k > 0) { - fnd = _lookup(chartab, NCHARTAB, buf, k, &ans); - if(fnd) { - while(i > k) { - i--; - ungetchar(ts, buf[i]); - } - break; - } - } - } - } } if(!fnd) { backup(ts, savei);