improve html entity recognition.
1. recognize the new hexadecimal numeric character references of the form &#[xX][0-9a-fA-F]+. cf. http://www.unicode.org.
2. be very careful about determining the end of an entity reference. entity names are a bit more
restricted than html/xml CNAMEs, containing only [a-zA-Z0-9]. anything outside that set marks the
end of a reference. this allows us to recognize "&amp\n" as "&\n", as the standard indicates.
3. no longer try substrings of the recognized entity name. this prevents us from fouling common cgi
arguments like http://site.com?pie=x (interpreted as http://site.com?πe=x). washingtonpost.com has
examples of this.
Reference: /n/sources/patch/applied/libhtml-entities
Date: Fri Jul 14 18:15:44 CES 2006
Signed-off-by: quanstro@quanstro.net
--- /sys/src/libhtml/lex.c Fri Jul 14 18:08:58 2006
+++ /sys/src/libhtml/lex.c Fri Jul 14 18:17:04 2006
@@ -1196,8 +1196,7 @@
// We've just read an '&'; look for an entity reference
// name, and if found, return translated char.
// if there is a complete entity name but it isn't known,
-// try prefixes (gets around some buggy HTML out there),
-// and if that fails, back up to just past the '&' and return '&'.
+// back up to just past the '&' and return '&'.
// If the entity can't be completed in the current buffer, back up
// to the '&' and return -1.
static int
@@ -1208,7 +1207,6 @@
int fnd;
int ans;
int v;
- int i;
int k;
Rune buf[SMALLBUFSIZE];
@@ -1219,12 +1217,23 @@
if(c == '#') {
c = getchar(ts);
v = 0;
- while(c >= 0) {
- if(!(c < 256 && isdigit(c)))
- break;
- v = v*10 + c - 48;
- c = getchar(ts);
- }
+ if(c == 'X' || c == 'x')
+ for(c = getchar(ts); c < 256; c = getchar(ts))
+ if(c >= '0' && c <= '9')
+ v = v*16+c-'0';
+ else if(c >= 'A' && c<= 'F')
+ v = v*16+c-'A'+10;
+ else if(c >= 'a' && c <= 'f')
+ v = v*16+c-'a'+10;
+ else
+ break;
+ else
+ while(c >= 0) {
+ if(!(c < 256 && isdigit(c)))
+ break;
+ v = v*10 + c - 48;
+ c = getchar(ts);
+ }
if(c >= 0) {
if(!(c == ';' || c == '\n' || c == '\r'))
ungetchar(ts, c);
@@ -1245,7 +1254,7 @@
c = getchar(ts);
if(c < 0)
break;
- if(ISNAMCHAR(c)) {
+ if(c < 256 && (isalpha(c) || isdigit(c))) {
if(k < SMALLBUFSIZE-1)
buf[k++] = c;
}
@@ -1255,25 +1264,8 @@
break;
}
}
- if(c >= 0) {
+ if(c >= 256 || c != '=' && !(isalpha(c) || isdigit(c)))
fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
- if(!fnd) {
- // Try prefixes of s
- if(c == ';' || c == '\n' || c == '\r')
- ungetchar(ts, c);
- i = k;
- while(--k > 0) {
- fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
- if(fnd) {
- while(i > k) {
- i--;
- ungetchar(ts, buf[i]);
- }
- break;
- }
- }
- }
- }
}
if(!fnd) {
backup(ts, savei);