1. in the language table 0x80 is not part of 8859-n and the range 0xa0-0xff is valid latin1. 2. don't call chartorune() first -- it is difficult to tell a bad rune from ascii/latin1/binary. deal with 4-byte utf-8. 3. added utf-(16|32)(be|le) detection (when byte-order marks are present). 4. support image bit-depths of 1, 2 and 4. 5. font file parsing now insists on newlines between entries and ignores short lines. subfonts that are split into two different images are also properly located. i wrote a separate program, chkfont (included), that slowly checks most font parameters and takes pains to complain in a meaningful way. 6. iself's automatic cpu array was redeclared "static" so that non-explicitly initialized locations would be 0. added an array for elf type (core file, relocatable, shared library, executable). less important changes 1. -h flag -- prints character histogram. this is sometimes useful to see why files like /lib/glass are classified as binary. 2. added rustic support for detecting some dbcs character sets. although the method is crude, we should not often be fooled on real examples -- it's uncommon to have even-byte long runs of latin1 characters. Notes: Sun May 21 13:43:49 EDT 2006 rsc can we please do these one at a time? i already decided not to put in 0-h. you forgot 5-byte utf-8. i don't think faccess is necessary or even correct. if .1 doesn't exist that's fine; it falls back on the no-extension file. (it being libdraw) please submit a patch for one of these things, wait for me to deal with it, and then submit the next thing. i can't deal with multiple things at once. it just frustrates me trying to understand a zillion different changes. thanks.
russ Reference: /n/sources/patch/sorry/file-redux Date: Sat May 20 17:36:18 CES 2006 Signed-off-by: quanstro@quanstro.net Reviewed-by: rsc --- /sys/src/cmd/file.c Sat May 20 17:10:04 2006 +++ /sys/src/cmd/file.c Sat May 20 17:35:10 2006 @@ -33,6 +33,7 @@ Cnull, Ceascii, Cutf, + Cdbcs, }; struct { @@ -100,7 +101,7 @@ } language[] = { - Normal, 0, 0x0080, 0x0080, "Extended Latin", + Normal, 0, 0x00a0, 0x00ff, "Latin", Normal, 0, 0x0100, 0x01FF, "Extended Latin", Normal, 0, 0x0370, 0x03FF, "Greek", Normal, 0, 0x0400, 0x04FF, "Cyrillic", @@ -136,13 +137,14 @@ Futf, /* UTf character set */ Fbinary, /* binary */ Feascii, /* ASCII with control chars */ + Fdbcs, /* iso 646/iso 2022-compliant dbcs charset */ Fnull, /* NULL in file */ } guess; void bump_utf_count(Rune); int cistrncmp(char*, char*, int); void filetype(int); -int getfontnum(uchar*, uchar**); +int getfontnum(char*, char**); int isas(void); int isc(void); int iscint(void); @@ -197,6 +199,7 @@ }; int mime; +int hflag; #define OCTET "application/octet-stream\n" #define PLAIN "text/plain\n" @@ -212,8 +215,11 @@ case 'm': mime = 1; break; + case 'h': + hflag = 1; + break; default: - fprint(2, "usage: file [-m] [file...]\n"); + fprint(2, "usage: file [-mh] [file...]\n"); exits("usage"); }ARGEND; @@ -263,12 +269,36 @@ close(fd); } +static int +utf8len(uchar *s, uchar *e) +{ + int c, n, i; + + c = *s++; + if ((c&0xe0) == 0xc0) + n = 2; + else if ((c&0xf0) == 0xe0) + n = 3; + else if ((c&0xf8) == 0xf0) + n = 4; + else + return -1; + i = n-1; + if(e-s < i) + i = e-s; + for(; i-- && (c = *s++);) + if(0x80 != (c&0xc0)) + return -1; + return n; +} + void filetype(int fd) { Rune r; int i, f, n; - char *p, *eob; + uchar *p, *eob; + uchar c; free(mbuf); mbuf = dirfstat(fd); @@ -303,46 +333,68 @@ memset(cfreq, 0, sizeof(cfreq)); for (i = 0; language[i].name; i++) language[i].count = 0; - eob = (char *)buf+nbuf; - for(n = 0, p = (char *)buf; p < eob; n++) { - if (!fullrune(p, eob-p) && eob-p < UTFmax) - break; - p += 
chartorune(&r, p); - if (r == 0) - f = Cnull; - else if (r <= 0x7f) { - if (!isprint(r) && !isspace(r)) + eob = buf+nbuf; + f = 0; // shut up compiler + for(n = 0, p = buf; p < eob; n++) { + c = *p; + if(c < 0x80){ + if(c == 0) + f = Cnull; + else if(!isprint(c) && !isspace(c)) f = Ceascii; /* ASCII control char */ - else f = r; - } else if (r == 0x080) { + else + f = c; + } else if((i = utf8len(p, eob)) > 0){ + // special care for non-basic-plane codepoints + chartorune(&r, (char*)p); + p += i-1; bump_utf_count(r); f = Cutf; - } else if (r < 0xA0) - f = Cbinary; /* Invalid Runes */ - else if (r <= 0xff) - f = Clatin; /* Latin 1 */ - else { - bump_utf_count(r); - f = Cutf; /* UTF extension */ + } else if(c <= 0xa0) + f = Cbinary; + else{ + if(p[1] > 0x80){ + p++; + f = Cdbcs; + } else + f = Clatin; } cfreq[f]++; /* ASCII chars peg directly */ + p++; + } + + // check for dbcs character straddling the end of the buffer. + if(nbuf < sizeof(buf)-1 && f == Clatin && cfreq[Clatin] == 1 && cfreq[Cdbcs]) + cfreq[Clatin]--; + if(cfreq[Cdbcs] < 5 || cfreq[Cdbcs] && cfreq[Clatin]){ + cfreq[Clatin] += cfreq[Cdbcs]*2; + cfreq[Cdbcs] = 0; } + + if(hflag) + fprint(2, "n = %d, bin = %d, dbcs = %d utf = %d, latin = %d, eascii = %d null = %d\n", + n, cfreq[Cbinary], cfreq[Cdbcs], cfreq[Cutf], cfreq[Clatin], cfreq[Ceascii], cfreq[Cnull]); /* * gross classify */ if (cfreq[Cbinary]) guess = Fbinary; + else if (cfreq[Cdbcs]) + guess = Fdbcs; else if (cfreq[Cutf]) guess = Futf; else if (cfreq[Clatin]) guess = Flatin; else if (cfreq[Ceascii]) guess = Feascii; - else if (cfreq[Cnull] == n) { - print(mime ? OCTET : "first block all null bytes\n"); - return; - } - else guess = Fascii; + else if (cfreq[Cnull]){ + if(cfreq[Cnull] == n) { + print(mime ? OCTET : "first block all null bytes\n"); + return; + } + guess = Fbinary; + } else + guess = Fascii; /* * lookup dictionary words */ @@ -367,9 +419,11 @@ else if (guess == Feascii) print(mime ? 
PLAIN : "extended ascii\n"); else if (guess == Flatin) - print(mime ? PLAIN : "latin ascii\n"); + print(mime ? PLAIN : "latin\n"); else if (guess == Futf && utf_count() < 4) print_utf(); + else if (guess == Fdbcs) + print(mime ? PLAIN : "dbcs\n"); else print(mime ? OCTET : "binary\n"); } @@ -528,6 +582,10 @@ 070707, 0xFFFF, "cpio archive\n", OCTET, 0x2F7, 0xFFFF, "tex dvi\n", "application/dvi", 0xfaff, 0xfeff, "mp3 audio\n", "audio/mpeg", + 0xfeff0000, 0xffffffff, "utf-32be\n", "text/utf-32be", + 0xfffe, 0xffffffff, "utf-32le\n", "text/utf-32le", + 0xfeff, 0xffff, "utf-16be\n", "text/utf-16be", + 0xfffe, 0xffff, "utf-16le\n", "text/utf-16le", }; int @@ -567,7 +625,6 @@ return 0; } - /* from tar.c */ enum { NAMSIZ = 100, TBLOCK = 512 }; @@ -1076,6 +1133,9 @@ case 24: case 16: case 8: + case 4: + case 2: + case 1: return d; } return -1; @@ -1163,43 +1223,71 @@ } #define WHITESPACE(c) ((c) == ' ' || (c) == '\t' || (c) == '\n') +#define SPACE(c) ((c) == ' ' || (c) == '\t') + +int +faccess(char *p, int m) +{ + char* s; + int i, r; + if(access(p, m) == 0) + return 0; + s = p + strlen(p); + for(i = 0; i < 2; i++) { + snprint(s, 4, ".%d", i); + r = access(p, m); + *s = 0; + if(r != 0) + return -1; + } + return 0; +} + int isp9font(void) { - uchar *cp, *p; + char *p, *cp; int i, n; - char pathname[1024]; + char path[1024 + 1 + 3]; - cp = buf; + cp = (char*)buf; if (!getfontnum(cp, &cp)) /* height */ return 0; if (!getfontnum(cp, &cp)) /* ascent */ return 0; - for (i = 0;; i++) { + while(WHITESPACE(*cp)) + cp++; + for (i = 0; strchr(cp, '\n'); i++) { if (!getfontnum(cp, &cp)) /* min */ break; if (!getfontnum(cp, &cp)) /* max */ return 0; - while (WHITESPACE(*cp)) + getfontnum(cp, &cp); /* offset -- not required */ + while(SPACE(*cp)) cp++; - for (p = cp; *cp && !WHITESPACE(*cp); cp++) - ; + for(p = cp; !WHITESPACE(*cp); cp++) + ; /* construct a path name, if needed */ n = 0; if (*p != '/' && slash) { n = slash-fname+1; - if (n < sizeof(pathname)) - memcpy(pathname, fname, 
n); + if (n < sizeof path) + memcpy(path, fname, n); else n = 0; } - if (n+cp-p < sizeof(pathname)) { - memcpy(pathname+n, p, cp-p); + if (n+cp-p < sizeof path - sizeof ".00") { + memcpy(path+n, p, cp-p); n += cp-p; - pathname[n] = 0; - if (access(pathname, AEXIST) < 0) + path[n] = 0; + if(faccess(path, AEXIST) < 0){ + if(hflag) + fprint(2, " %s\n", path); return 0; + } } + while(WHITESPACE(*cp)) + cp++; } if (i) { print(mime ? "text/plain\n" : "font file\n"); @@ -1209,15 +1297,18 @@ } int -getfontnum(uchar *cp, uchar **rp) +getfontnum(char *cp, char **rp) { - while (WHITESPACE(*cp)) /* extract ulong delimited by whitespace */ - cp++; - if (*cp < '0' || *cp > '9') + char *p; + ulong l; + + *rp = cp; + l = strtoul(cp, &p, 0); + if(p == cp || !WHITESPACE(*p)) return 0; - strtoul((char *)cp, (char **)rp, 0); - if (!WHITESPACE(**rp)) + if(l < 0 || l > 0xffff) return 0; + *rp = p; return 1; } @@ -1244,7 +1335,7 @@ int iself(void) { - char *cpu[] = { /* NB: incomplete and arbitary list */ + static char *cpu[] = { /* NB: incomplete and arbitary list */ [1] "WE32100", [2] "SPARC", [3] "i386", @@ -1267,12 +1358,18 @@ [62] "AMD64", [75] "VAX", }; - + static char *type[] = { + [1] "relocatable object", + [2] "executable", + [3] "shared library", + [4] "core dump", + }; if (memcmp(buf, "\x7fELF", 4) == 0){ if (!mime){ int n = (buf[19] << 8) | buf[18]; char *p = "unknown"; + char *t = "unknown"; if (n > 0 && n < nelem(cpu) && cpu[n]) p = cpu[n]; @@ -1282,7 +1379,10 @@ if (n > 0 && n < nelem(cpu) && cpu[n]) p = cpu[n]; } - print("%s ELF executable\n", p); + n = buf[16]; + if(n>0 && n < nelem(type) && type[n]) + t = type[n]; + print("%s ELF %s\n", p, t); } else print("application/x-elf-executable");