1. _gettoks(): lex styles like scripts (no entities); fix ts leak. 2. getscriptdata(): add findtag argument to accommodate styles and fix comment lexing. 3. ampersand(): change entity recognition to parse Unicode character references and determine the end of reference as per rfc. Changes 1-3 will allow abaco to correctly display wikipedia and yahoo.co.jp. It also improves the output of htmlfmt. Reference: /n/sources/patch/applied/libhtml-lexscript Date: Sun Aug 13 18:11:34 CES 2006 Signed-off-by: quanstro@quanstro.net --- /sys/src/libhtml/lex.c Sun Aug 13 18:03:02 2006 +++ /sys/src/libhtml/lex.c Sun Aug 13 18:02:48 2006 @@ -127,7 +127,7 @@ }; // HTML 4.0 attribute names. -// Keep sorted, and in correspondence with enum in i.h. +// Keep sorted, and in correspondence with enum in impl.h. Rune* attrnames[] = { L"abbr", L"accept-charset", @@ -540,7 +540,7 @@ static void lexinit(); static int getplaindata(TokenSource* ts, Token* a, int* pai); static int getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai); -static int getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai); +static int getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai, int findtag); static int gettag(TokenSource* ts, int starti, Token* a, int* pai); static Rune* buftostr(Rune* s, Rune* buf, int j); static int comment(TokenSource* ts); @@ -620,11 +620,11 @@ break; if(c == '<') { tag = gettag(ts, starti, a, &ai); - if(tag == Tscript) { + if(tag == Tscript || tag == Tstyle) { // special rules for getting Data after.... starti = ts->i; c = getchar(ts); - tag = getscriptdata(ts, c, starti, a, &ai); + tag = getscriptdata(ts, c, starti, a, &ai, tag); } } else @@ -649,6 +649,7 @@ fprint(2, "lex: got token %T\n", &a[ai]); } } + free(ts); if(dbglex) fprint(2, "lex: returning %d tokens\n", ai); *plen = ai; @@ -793,9 +794,9 @@ } // The rules for lexing scripts are different (ugh). -// Gather up everything until see a </SCRIPT>.
+// Gather up everything until see an "</" tagnames[tok] ">" static int -getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai) +getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai, int findtag) { Rune* s; int j; @@ -818,8 +819,10 @@ savei = ts->i; c = getchar(ts); if(c == '!') { - while(c >= 0 && c != '\n' && c != '\r') - c = getchar(ts); +// while(c >= 0 && c != '\n' && c != '\r') +// c = getchar(ts); + if(comment(ts) == -1) + break; if(c == '\r') c = getchar(ts); if(c == '\n') @@ -833,11 +836,11 @@ if(tag != Comment) (*pai)--; backup(ts, tstarti); - if(tag == Tscript + RBRA) { + if(tag == findtag + RBRA) { done = 1; break; } - // here tag was not </SCRIPT>, so take as regular data + // here tag was not the one we were looking for, so take as regular data c = getchar(ts); } } @@ -1196,8 +1199,7 @@ // We've just read an '&'; look for an entity reference // name, and if found, return translated char. // if there is a complete entity name but it isn't known, -// try prefixes (gets around some buggy HTML out there), -// and if that fails, back up to just past the '&' and return '&'. +// back up to just past the '&' and return '&'. // If the entity can't be completed in the current buffer, back up // to the '&' and return -1.
static int @@ -1208,7 +1210,6 @@ int fnd; int ans; int v; - int i; int k; Rune buf[SMALLBUFSIZE]; @@ -1219,12 +1220,23 @@ if(c == '#') { c = getchar(ts); v = 0; - while(c >= 0) { - if(!(c < 256 && isdigit(c))) - break; - v = v*10 + c - 48; - c = getchar(ts); - } + if(c == 'X' || c == 'x') + for(c = getchar(ts); c < 256; c = getchar(ts)) + if(c >= '0' && c <= '9') + v = v*16+c-'0'; + else if(c >= 'A' && c<= 'F') + v = v*16+c-'A'+10; + else if(c >= 'a' && c <= 'f') + v = v*16+c-'a'+10; + else + break; + else + while(c >= 0) { + if(!(c < 256 && isdigit(c))) + break; + v = v*10 + c - 48; + c = getchar(ts); + } if(c >= 0) { if(!(c == ';' || c == '\n' || c == '\r')) ungetchar(ts, c); @@ -1245,7 +1257,7 @@ c = getchar(ts); if(c < 0) break; - if(ISNAMCHAR(c)) { + if(c < 256 && (isalpha(c) || isdigit(c))) { if(k < SMALLBUFSIZE-1) buf[k++] = c; } @@ -1255,25 +1267,8 @@ break; } } - if(c >= 0) { + if(c >= 256 || c != '=' && !(isalpha(c) || isdigit(c))) fnd = _lookup(chartab, NCHARTAB, buf, k, &ans); - if(!fnd) { - // Try prefixes of s - if(c == ';' || c == '\n' || c == '\r') - ungetchar(ts, c); - i = k; - while(--k > 0) { - fnd = _lookup(chartab, NCHARTAB, buf, k, &ans); - if(fnd) { - while(i > k) { - i--; - ungetchar(ts, buf[i]); - } - break; - } - } - } - } } if(!fnd) { backup(ts, savei);