1. _gettoks(): lex styles like scripts (no entities); fix ts leak.
2. getscriptdata(): add findtag argument to accomidate styles and fix comment lexing.
3. amperstand(): change entity recognition to parse unicode character references
and determine the end of reference as per rfc.

changes 1-3 will allow abaco to correctly display wikipedia and yahoo.co.jp.
it also improves the output of htmlfmt.

Reference: /n/sources/patch/applied/libhtml-lexscript
Date: Sun Aug 13 18:11:34 CES 2006
Signed-off-by: quanstro@quanstro.net

--- /sys/src/libhtml/lex.c	Sun Aug 13 18:03:02 2006
+++ /sys/src/libhtml/lex.c	Sun Aug 13 18:02:48 2006
@@ -127,7 +127,7 @@
 };
 
 // HTML 4.0 attribute names.
-// Keep sorted, and in correspondence with enum in i.h.
+// Keep sorted, and in correspondence with enum in impl.h.
 Rune* attrnames[] = {
 	L"abbr",
 	L"accept-charset",
@@ -540,7 +540,7 @@
 static void		lexinit();
 static int		getplaindata(TokenSource* ts, Token* a, int* pai);
 static int		getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
-static int		getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
+static int		getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai, int findtag);
 static int		gettag(TokenSource* ts, int starti, Token* a, int* pai);
 static Rune*	buftostr(Rune* s, Rune* buf, int j);
 static int		comment(TokenSource* ts);
@@ -620,11 +620,11 @@
 				break;
 			if(c == '<') {
 				tag = gettag(ts, starti, a, &ai);
-				if(tag == Tscript) {
+				if(tag == Tscript || tag == Tstyle) {
 					// special rules for getting Data after....
 					starti = ts->i;
 					c = getchar(ts);
-					tag = getscriptdata(ts, c, starti, a, &ai);
+					tag = getscriptdata(ts, c, starti, a, &ai, tag);
 				}
 			}
 			else
@@ -649,6 +649,7 @@
 				fprint(2, "lex: got token %T\n", &a[ai]);
 		}
 	}
+	free(ts);
 	if(dbglex)
 		fprint(2, "lex: returning %d tokens\n", ai);
 	*plen = ai;
@@ -793,9 +794,9 @@
 }
 
 // The rules for lexing scripts are different (ugh).
-// Gather up everything until see a </SCRIPT>.
+// Gather up everything until see an "</" tagnames[tok] ">"
 static int
-getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
+getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai, int findtag)
 {
 	Rune*	s;
 	int	j;
@@ -818,8 +819,10 @@
 			savei = ts->i;
 			c = getchar(ts);
 			if(c == '!') {
-				while(c >= 0 && c != '\n' && c != '\r')
-					c = getchar(ts);
+//				while(c >= 0 && c != '\n' && c != '\r')
+//					c = getchar(ts);
+				if(comment(ts) == -1)
+					break;
 				if(c == '\r')
 					c = getchar(ts);
 				if(c == '\n')
@@ -833,11 +836,11 @@
 				if(tag != Comment)
 					(*pai)--;
 				backup(ts, tstarti);
-				if(tag == Tscript + RBRA) {
+				if(tag == findtag + RBRA) {
 					done = 1;
 					break;
 				}
-				// here tag was not </SCRIPT>, so take as regular data
+				// here tag was not the one we were looking for, so take as regular data
 				c = getchar(ts);
 			}
 		}
@@ -1196,8 +1199,7 @@
 // We've just read an '&'; look for an entity reference
 // name, and if found, return translated char.
 // if there is a complete entity name but it isn't known,
-// try prefixes (gets around some buggy HTML out there),
-// and if that fails, back up to just past the '&' and return '&'.
+// back up to just past the '&' and return '&'.
 // If the entity can't be completed in the current buffer, back up
 // to the '&' and return -1.
 static int
@@ -1208,7 +1210,6 @@
 	int	fnd;
 	int	ans;
 	int	v;
-	int	i;
 	int	k;
 	Rune	buf[SMALLBUFSIZE];
 
@@ -1219,12 +1220,23 @@
 	if(c == '#') {
 		c = getchar(ts);
 		v = 0;
-		while(c >= 0) {
-			if(!(c < 256 && isdigit(c)))
-				break;
-			v = v*10 + c - 48;
-			c = getchar(ts);
-		}
+		if(c == 'X' || c == 'x')
+			for(c = getchar(ts); c < 256; c = getchar(ts))
+				if(c >= '0' && c <= '9')
+					v = v*16+c-'0';
+				else if(c >= 'A' && c<= 'F')
+					v = v*16+c-'A'+10;
+				else if(c >= 'a' && c <= 'f')
+					v = v*16+c-'a'+10;
+				else
+					break;
+		else
+			while(c >= 0) {
+				if(!(c < 256 && isdigit(c)))
+					break;
+				v = v*10 + c - 48;
+				c = getchar(ts);
+			}
 		if(c >= 0) {
 			if(!(c == ';' || c == '\n' || c == '\r'))
 				ungetchar(ts, c);
@@ -1245,7 +1257,7 @@
 			c = getchar(ts);
 			if(c < 0)
 				break;
-			if(ISNAMCHAR(c)) {
+			if(c < 256 && (isalpha(c) || isdigit(c))) {
 				if(k < SMALLBUFSIZE-1)
 					buf[k++] = c;
 			}
@@ -1255,25 +1267,8 @@
 				break;
 			}
 		}
-		if(c >= 0) {
+		if(c >= 256 || c != '=' && !(isalpha(c) || isdigit(c)))
 			fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
-			if(!fnd) {
-				// Try prefixes of s
-				if(c == ';' || c == '\n' || c == '\r')
-					ungetchar(ts, c);
-				i = k;
-				while(--k > 0) {
-					fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
-					if(fnd) {
-						while(i > k) {
-							i--;
-							ungetchar(ts, buf[i]);
-						}
-						break;
-					}
-				}
-			}
-		}
 	}
 	if(!fnd) {
 		backup(ts, savei);