1. in the language table 0x80 is not part of 8859-n and the range
0xa0-0xff is valid latin1.

2. don't call chartorune() first -- it is difficult to tell a bad rune from
ascii/latin1/binary.  deal with 4-byte utf-8.

3. added utf-(16|32)(be|le) detection (when byteorder marks are present).

4. support image bit-depths of 1, 2 and 4.

5. font file parsing now insists on newlines between entries and ignores
short lines.  subfonts that are split into two different images are also
properly located.

i wrote a separate program, chkfont, that slowly checks most font parameters
and takes pains to complain in a meaningful way; it is included.

6. iself's automatic cpu array was redeclared "static" so that non-explicitly
initialized locations would be 0.  added an array for elf type
(core file, relocatable, shared library, executable). 

less important changes
1. -h flag -- prints character histogram.  this is sometimes useful to
see why files like /lib/glass are classified as binary.

2. added rustic support for detecting some dbcs character sets.
although the method is crude, we should not often be fooled on
real examples -- it's uncommon to have even-byte long runs of
latin1 characters.


Notes:
Sun May 21 13:43:49 EDT 2006 rsc
    can we please do these one at a time?
    i already decided not to put in 0-h.
    you forgot 5-byte utf-8.
    i don't think faccess is necessary or even correct.
    if .1 doesn't exist that's fine; it falls back on the no-extension file.
    (it being libdraw)
    
    please submit a patch for one of these things,
    wait for me to deal with it, and then submit the next thing.
    i can't deal with multiple things at once.  it just 
    frustrates me trying to understand a zillion different changes.
    
    thanks.
    russ


Reference: /n/sources/patch/sorry/file-redux
Date: Sat May 20 17:36:18 CES 2006
Signed-off-by: quanstro@quanstro.net
Reviewed-by: rsc

--- /sys/src/cmd/file.c	Sat May 20 17:10:04 2006
+++ /sys/src/cmd/file.c	Sat May 20 17:35:10 2006
@@ -33,6 +33,7 @@
 	Cnull,
 	Ceascii,
 	Cutf,
+	Cdbcs,
 };
 struct
 {
@@ -100,7 +101,7 @@
 	
 } language[] =
 {
-	Normal, 0,	0x0080, 0x0080,	"Extended Latin",
+	Normal,	0,	0x00a0,	0x00ff,	"Latin",
 	Normal,	0,	0x0100,	0x01FF,	"Extended Latin",
 	Normal,	0,	0x0370,	0x03FF,	"Greek",
 	Normal,	0,	0x0400,	0x04FF,	"Cyrillic",
@@ -136,13 +137,14 @@
 	Futf,		/* UTf character set */
 	Fbinary,	/* binary */
 	Feascii,	/* ASCII with control chars */
+	Fdbcs,		/* iso 646/iso 2022-compliant dbcs charset */
 	Fnull,		/* NULL in file */
 } guess;
 
 void	bump_utf_count(Rune);
 int	cistrncmp(char*, char*, int);
 void	filetype(int);
-int	getfontnum(uchar*, uchar**);
+int	getfontnum(char*, char**);
 int	isas(void);
 int	isc(void);
 int	iscint(void);
@@ -197,6 +199,7 @@
 };
 
 int mime;
+int hflag;
 
 #define OCTET	"application/octet-stream\n"
 #define PLAIN	"text/plain\n"
@@ -212,8 +215,11 @@
 	case 'm':
 		mime = 1;
 		break;
+	case 'h':
+		hflag = 1;
+		break;
 	default:
-		fprint(2, "usage: file [-m] [file...]\n");
+		fprint(2, "usage: file [-mh] [file...]\n");
 		exits("usage");
 	}ARGEND;
 
@@ -263,12 +269,36 @@
 	close(fd);
 }
 
+static int
+utf8len(uchar *s, uchar *e)
+{
+	int c, n, i;
+
+	c = *s++;
+	if ((c&0xe0) == 0xc0)
+		n = 2;
+	else if ((c&0xf0) == 0xe0)
+		n = 3;
+	else if ((c&0xf8) == 0xf0)
+		n = 4;
+	else
+		return -1;
+	i = n-1;
+	if(e-s < i)
+		i = e-s;
+	for(; i-- && (c = *s++);)
+		if(0x80 != (c&0xc0))
+			return -1;
+	return n;
+}
+
 void
 filetype(int fd)
 {
 	Rune r;
 	int i, f, n;
-	char *p, *eob;
+	uchar *p, *eob;
+	uchar c;
 
 	free(mbuf);
 	mbuf = dirfstat(fd);
@@ -303,46 +333,68 @@
 	memset(cfreq, 0, sizeof(cfreq));
 	for (i = 0; language[i].name; i++)
 		language[i].count = 0;
-	eob = (char *)buf+nbuf;
-	for(n = 0, p = (char *)buf; p < eob; n++) {
-		if (!fullrune(p, eob-p) && eob-p < UTFmax)
-			break;
-		p += chartorune(&r, p);
-		if (r == 0)
-			f = Cnull;
-		else if (r <= 0x7f) {
-			if (!isprint(r) && !isspace(r))
+	eob = buf+nbuf;
+	f = 0;	// shut up compiler
+	for(n = 0, p = buf; p < eob; n++) {
+		c = *p;
+		if(c < 0x80){
+			if(c == 0)
+				f = Cnull;
+			else if(!isprint(c) && !isspace(c))
 				f = Ceascii;	/* ASCII control char */
-			else f = r;
-		} else if (r == 0x080) {
+			else
+				f = c;
+		} else if((i = utf8len(p, eob)) > 0){
+			// special care for non-basic-plane codepoints
+			chartorune(&r, (char*)p);
+			p += i-1;
 			bump_utf_count(r);
 			f = Cutf;
-		} else if (r < 0xA0)
-				f = Cbinary;	/* Invalid Runes */
-		else if (r <= 0xff)
-				f = Clatin;	/* Latin 1 */
-		else {
-			bump_utf_count(r);
-			f = Cutf;		/* UTF extension */
+		} else if(c <= 0xa0)
+			f = Cbinary;
+		else{
+			if(p[1] > 0x80){
+				p++;
+				f = Cdbcs;
+			} else
+				f = Clatin;
 		}
 		cfreq[f]++;			/* ASCII chars peg directly */
+		p++;
+	}
+
+	// check for dbcs character straddling the end of the buffer.
+	if(nbuf < sizeof(buf)-1 && f == Clatin && cfreq[Clatin] == 1 && cfreq[Cdbcs])
+		cfreq[Clatin]--;
+	if(cfreq[Cdbcs] < 5 || cfreq[Cdbcs] && cfreq[Clatin]){
+		cfreq[Clatin] += cfreq[Cdbcs]*2;
+		cfreq[Cdbcs] = 0;
 	}
+
+	if(hflag)
+		fprint(2, "n = %d, bin = %d, dbcs = %d utf = %d, latin = %d, eascii = %d null = %d\n",
+			n, cfreq[Cbinary], cfreq[Cdbcs], cfreq[Cutf], cfreq[Clatin], cfreq[Ceascii], cfreq[Cnull]);
 	/*
 	 * gross classify
 	 */
 	if (cfreq[Cbinary])
 		guess = Fbinary;
+	else if (cfreq[Cdbcs])
+		guess = Fdbcs;
 	else if (cfreq[Cutf])
 		guess = Futf;
 	else if (cfreq[Clatin])
 		guess = Flatin;
 	else if (cfreq[Ceascii])
 		guess = Feascii;
-	else if (cfreq[Cnull] == n) {
-		print(mime ? OCTET : "first block all null bytes\n");
-		return;
-	}
-	else guess = Fascii;
+	else if (cfreq[Cnull]){
+		if(cfreq[Cnull] == n) {
+			print(mime ? OCTET : "first block all null bytes\n");
+			return;
+		}
+		guess = Fbinary;
+	} else 
+		guess = Fascii;
 	/*
 	 * lookup dictionary words
 	 */
@@ -367,9 +419,11 @@
 	else if (guess == Feascii)
 		print(mime ? PLAIN : "extended ascii\n");
 	else if (guess == Flatin)
-		print(mime ? PLAIN : "latin ascii\n");
+		print(mime ? PLAIN : "latin\n");
 	else if (guess == Futf && utf_count() < 4)
 		print_utf();
+	else if (guess == Fdbcs)
+		print(mime ? PLAIN : "dbcs\n");
 	else print(mime ? OCTET : "binary\n");
 }
 
@@ -528,6 +582,10 @@
 	070707,		0xFFFF,		"cpio archive\n", OCTET,
 	0x2F7,		0xFFFF,		"tex dvi\n", "application/dvi",
 	0xfaff,		0xfeff,		"mp3 audio\n",	"audio/mpeg",
+	0xfeff0000,	0xffffffff,	"utf-32be\n",	"text/utf-32be",
+	0xfffe,		0xffffffff,	"utf-32le\n",	"text/utf-32le",
+	0xfeff,		0xffff,		"utf-16be\n",	"text/utf-16be",
+	0xfffe,		0xffff,		"utf-16le\n",	"text/utf-16le",
 };
 
 int
@@ -567,7 +625,6 @@
 	return 0;
 }
 
-
 /* from tar.c */
 enum { NAMSIZ = 100, TBLOCK = 512 };
 
@@ -1076,6 +1133,9 @@
 	case 24:
 	case 16:
 	case 8:
+	case 4:
+	case 2:
+	case 1:
 		return d;
 	}
 	return -1;
@@ -1163,43 +1223,71 @@
 }
 
 #define	WHITESPACE(c)		((c) == ' ' || (c) == '\t' || (c) == '\n')
+#define	SPACE(c)			((c) == ' ' || (c) == '\t')
+
+int
+faccess(char *p, int m)
+{
+	char* s;
+	int i, r;
 
+	if(access(p, m) == 0)
+		return 0;
+	s = p + strlen(p);
+	for(i = 0; i < 2; i++) {
+		snprint(s, 4, ".%d", i);
+		r = access(p, m);
+		*s = 0;
+		if(r != 0)
+			return -1;
+	}
+	return 0;
+}
+	
 int
 isp9font(void)
 {
-	uchar *cp, *p;
+	char *p, *cp;
 	int i, n;
-	char pathname[1024];
+	char path[1024 + 1 + 3];
 
-	cp = buf;
+	cp = (char*)buf;
 	if (!getfontnum(cp, &cp))	/* height */
 		return 0;
 	if (!getfontnum(cp, &cp))	/* ascent */
 		return 0;
-	for (i = 0;; i++) {
+	while(WHITESPACE(*cp))
+		cp++;
+	for (i = 0; strchr(cp, '\n'); i++) {
 		if (!getfontnum(cp, &cp))	/* min */
 			break;
 		if (!getfontnum(cp, &cp))	/* max */
 			return 0;
-		while (WHITESPACE(*cp))
+		getfontnum(cp, &cp);	/* offset -- not required */
+		while(SPACE(*cp))
 			cp++;
-		for (p = cp; *cp && !WHITESPACE(*cp); cp++)
-				;
+		for(p = cp; !WHITESPACE(*cp); cp++)
+			;
 			/* construct a path name, if needed */
 		n = 0;
 		if (*p != '/' && slash) {
 			n = slash-fname+1;
-			if (n < sizeof(pathname))
-				memcpy(pathname, fname, n);
+			if (n < sizeof path)
+				memcpy(path, fname, n);
 			else n = 0;
 		}
-		if (n+cp-p < sizeof(pathname)) {
-			memcpy(pathname+n, p, cp-p);
+		if (n+cp-p < sizeof path - sizeof ".00") {
+			memcpy(path+n, p, cp-p);
 			n += cp-p;
-			pathname[n] = 0;
-			if (access(pathname, AEXIST) < 0)
+			path[n] = 0;
+			if(faccess(path, AEXIST) < 0){
+				if(hflag)
+					fprint(2, " %s\n", path);
 				return 0;
+			}
 		}
+		while(WHITESPACE(*cp))
+			cp++;
 	}
 	if (i) {
 		print(mime ? "text/plain\n" : "font file\n");
@@ -1209,15 +1297,18 @@
 }
 
 int
-getfontnum(uchar *cp, uchar **rp)
+getfontnum(char *cp, char **rp)
 {
-	while (WHITESPACE(*cp))		/* extract ulong delimited by whitespace */
-		cp++;
-	if (*cp < '0' || *cp > '9')
+	char *p;
+	ulong l;
+
+	*rp = cp;
+	l = strtoul(cp, &p, 0);
+	if(p == cp || !WHITESPACE(*p))
 		return 0;
-	strtoul((char *)cp, (char **)rp, 0);
-	if (!WHITESPACE(**rp))
+	if(l < 0 || l > 0xffff)
 		return 0;
+	*rp = p;
 	return 1;
 }
 
@@ -1244,7 +1335,7 @@
 int
 iself(void)
 {
-	char *cpu[] = {		/* NB: incomplete and arbitary list */
+	static char *cpu[] = {		/* NB: incomplete and arbitary list */
 	[1]	"WE32100",
 	[2]	"SPARC",
 	[3]	"i386",
@@ -1267,12 +1358,18 @@
 	[62]	"AMD64",
 	[75]	"VAX",
 	};
-
+	static char *type[] = {
+	[1]	"relocatable object",
+	[2]	"executable",
+	[3]	"shared library",
+	[4]	"core dump",
+	};
 
 	if (memcmp(buf, "\x7fELF", 4) == 0){
 		if (!mime){
 			int n = (buf[19] << 8) | buf[18];
 			char *p = "unknown";
+			char *t = "unknown";
 
 			if (n > 0 && n < nelem(cpu) && cpu[n])
 				p = cpu[n];
@@ -1282,7 +1379,10 @@
 				if (n > 0 && n < nelem(cpu) && cpu[n])
 					p = cpu[n];
 			}
-			print("%s ELF executable\n", p);
+			n = buf[16];
+			if(n>0 && n < nelem(type) && type[n])
+				t = type[n];
+			print("%s ELF %s\n", p, t);
 		}
 		else
 			print("application/x-elf-executable");