change character classification from unicode first to this priority:
ascii, utf8, binary, latin1.  use a private function to recognize utf8.
these changes allow us to recognize 0x10ffff > utf > 0xffff and latin1.
dbcs recognition is also possible; that code is deferred for a subsequent
patch.

the utf-8 range 0xa0-0xff is now called "latin". not "Extended Latin". 

Notes:
Tue Sep 12 08:10:27 EDT 2006 rsc
    I fixed utf parsing.  I don't want to change 
    "latin ascii" to "latin" because there are a programs
    that look for the string ascii in the output and use
    that to say hey this is a text file.
    

Reference: /n/sources/patch/applied/file-utf-parse
Date: Tue May 23 03:08:42 CES 2006
Signed-off-by: quanstro@quanstro.net
Reviewed-by: rsc

--- /sys/src/cmd/file.c	Tue May 23 02:58:47 2006
+++ /sys/src/cmd/file.c	Tue May 23 02:58:33 2006
@@ -100,7 +100,7 @@
 	
 } language[] =
 {
-	Normal, 0,	0x0080, 0x0080,	"Extended Latin",
+	Normal,	0,	0x00a0,	0x00ff,	"Latin",
 	Normal,	0,	0x0100,	0x01FF,	"Extended Latin",
 	Normal,	0,	0x0370,	0x03FF,	"Greek",
 	Normal,	0,	0x0400,	0x04FF,	"Cyrillic",
@@ -133,7 +133,7 @@
 {
 	Fascii,		/* printable ascii */
 	Flatin,		/* latin 1*/
-	Futf,		/* UTf character set */
+	Futf,		/* UTF character set */
 	Fbinary,	/* binary */
 	Feascii,	/* ASCII with control chars */
 	Fnull,		/* NULL in file */
@@ -263,12 +263,36 @@
 	close(fd);
 }
 
+static int
+utf8len(uchar *s, uchar *e)
+{
+	int c, n, i;
+
+	c = *s++;
+	if ((c&0xe0) == 0xc0)
+		n = 2;
+	else if ((c&0xf0) == 0xe0)
+		n = 3;
+	else if ((c&0xf8) == 0xf0)
+		n = 4;
+	else
+		return -1;
+	i = n-1;
+	if(e-s < i)
+		i = e-s;
+	for(; i-- && (c = *s++);)
+		if(0x80 != (c&0xc0))
+			return -1;
+	return n;
+}
+
 void
 filetype(int fd)
 {
 	Rune r;
 	int i, f, n;
-	char *p, *eob;
+	uchar *p, *eob;
+	uchar c;
 
 	free(mbuf);
 	mbuf = dirfstat(fd);
@@ -303,30 +327,30 @@
 	memset(cfreq, 0, sizeof(cfreq));
 	for (i = 0; language[i].name; i++)
 		language[i].count = 0;
-	eob = (char *)buf+nbuf;
-	for(n = 0, p = (char *)buf; p < eob; n++) {
-		if (!fullrune(p, eob-p) && eob-p < UTFmax)
-			break;
-		p += chartorune(&r, p);
-		if (r == 0)
-			f = Cnull;
-		else if (r <= 0x7f) {
-			if (!isprint(r) && !isspace(r))
+	eob = buf+nbuf;
+	for(n = 0, p = buf; p < eob; n++) {
+		c = *p;
+		if(c < 0x80){
+			if(c == 0)
+				f = Cnull;
+			else if(!isprint(c) && !isspace(c))
 				f = Ceascii;	/* ASCII control char */
-			else f = r;
-		} else if (r == 0x080) {
+			else
+				f = c;
+		} else if((i = utf8len(p, eob)) > 0){
+			// special care for non-basic-plane codepoints
+			chartorune(&r, (char*)p);
+			p += i-1;
 			bump_utf_count(r);
 			f = Cutf;
-		} else if (r < 0xA0)
-				f = Cbinary;	/* Invalid Runes */
-		else if (r <= 0xff)
-				f = Clatin;	/* Latin 1 */
-		else {
-			bump_utf_count(r);
-			f = Cutf;		/* UTF extension */
-		}
+		} else if(c <= 0xa0)
+			f = Cbinary;
+		else
+			f = Clatin;
 		cfreq[f]++;			/* ASCII chars peg directly */
+		p++;
 	}
+
 	/*
 	 * gross classify
 	 */
@@ -338,11 +362,14 @@
 		guess = Flatin;
 	else if (cfreq[Ceascii])
 		guess = Feascii;
-	else if (cfreq[Cnull] == n) {
-		print(mime ? OCTET : "first block all null bytes\n");
-		return;
-	}
-	else guess = Fascii;
+	else if (cfreq[Cnull]){
+		if(cfreq[Cnull] == n) {
+			print(mime ? OCTET : "first block all null bytes\n");
+			return;
+		}
+		guess = Fbinary;
+	} else 
+		guess = Fascii;
 	/*
 	 * lookup dictionary words
 	 */
@@ -367,7 +394,7 @@
 	else if (guess == Feascii)
 		print(mime ? PLAIN : "extended ascii\n");
 	else if (guess == Flatin)
-		print(mime ? PLAIN : "latin ascii\n");
+		print(mime ? PLAIN : "latin\n");
 	else if (guess == Futf && utf_count() < 4)
 		print_utf();
 	else print(mime ? OCTET : "binary\n");