handle iso-2022-jp-2 as a superset of iso-2022-jp. this fixes the decoding issue with 4657D0DD-A119-4E19-B50D-EBCE5861F9F8@gmail.com from 31 dec 2013 Reference: /n/atom/patch/applied/tcs2022-jp-2 Date: Wed Jan 1 23:09:19 CET 2014 Signed-off-by: quanstro@quanstro.net --- /sys/src/cmd/tcs/mkfile Wed Jan 1 23:07:53 2014 +++ /sys/src/cmd/tcs/mkfile Wed Jan 1 23:07:54 2014 @@ -10,6 +10,7 @@ utf.$O\ html.$O\ kuten208.$O\ + kuten212.$O\ gb.$O\ gbk.$O\ ksc.$O\ --- /sys/src/cmd/tcs/conv.h Wed Jan 1 23:07:55 2014 +++ /sys/src/cmd/tcs/conv.h Wed Jan 1 23:07:56 2014 @@ -1,5 +1,5 @@ void jis_in(int fd, long *notused, struct convert *out); -void jisjis_in(int fd, long *notused, struct convert *out); +void jis2_in(int fd, long *notused, struct convert *out); void msjis_in(int fd, long *notused, struct convert *out); void ujis_in(int fd, long *notused, struct convert *out); void jisjis_out(Rune *base, int n, long *notused); --- /sys/src/cmd/tcs/tcs.c Wed Jan 1 23:07:58 2014 +++ /sys/src/cmd/tcs/tcs.c Wed Jan 1 23:07:59 2014 @@ -525,8 +525,10 @@ { "ibm862", "IBM Code Page 862 (Hebrew)", Table, (void*)tabcp862 }, { "ibm866", "IBM Code Page 866 (Russian)", Table, (void*)tabcp866 }, { "ibm874", "IBM Code Page 874 (Thai)", Table, (void*)tabcp874 }, - { "iso-2022-jp", "alias for jis-kanji (MIME)", From|Func, 0, (Fnptr)jisjis_in }, + { "iso-2022-jp", "alias for jis-kanji (MIME)", From|Func, 0, (Fnptr)jis2_in }, { "iso-2022-jp", "alias for jis-kanji (MIME)", Func, 0, (Fnptr)jisjis_out }, + { "iso-2022-jp-2", "alias for jis-kanji (MIME)", From|Func, 0, (Fnptr)jis2_in }, + { "iso-2022-jp-2", "alias for jis-kanji (MIME)", Func, 0, (Fnptr)jisjis_out }, { "iso-8859-1", "alias for 8859-1 (MIME)", Table, (void *)tab8859_1 }, { "iso-8859-2", "alias for 8859-2 (MIME)", Table, (void *)tab8859_2 }, { "iso-8859-3", "alias for 8859-3 (MIME)", Table, (void *)tab8859_3 }, @@ -539,7 +541,7 @@ { "iso-8859-10", "alias for 8859-10 (MIME)", Table, (void *)tab8859_10 }, { "iso-8859-15", "alias for 8859-15 
(MIME)", Table, (void *)tab8859_15 }, { "jis", "guesses at the JIS encoding", From|Func, 0, (Fnptr)jis_in }, - { "jis-kanji", "ISO 2022-JP (Japanese)", From|Func, 0, (Fnptr)jisjis_in }, + { "jis-kanji", "ISO 2022-JP (Japanese)", From|Func, 0, (Fnptr)jis2_in }, { "jis-kanji", "ISO 2022-JP (Japanese)", Func, 0, (Fnptr)jisjis_out }, { "koi8", "KOI-8 (GOST 19769-74)", Table, (void *)tabkoi8 }, { "koi8-r", "alias for koi8 (MIME)", Table, (void *)tabkoi8 }, --- /sys/src/cmd/tcs/conv_jis.c Wed Jan 1 23:08:00 2014 +++ /sys/src/cmd/tcs/conv_jis.c Wed Jan 1 23:08:02 2014 @@ -10,7 +10,10 @@ #include "hdr.h" #include "conv.h" #include "kuten208.h" +#include "kuten212.h" #include "jis.h" +#include "gb.h" +#include "ksc.h" /* a state machine for interpreting all sorts of encodings @@ -254,56 +257,176 @@ } /* - a state machine for interpreting jis-kanji == 2022-JP -*/ + * a partial state machine for interpreting iso-2022-jp-2 + */ +extern long tab8859_7[]; /* consider a bit of maintenance on tcs! */ + static void -jis(int c, Rune **r, long input_loc) +jis2(int c, Rune **r, long input_loc) { - static enum { state0, state1, state2, state3, state4 } state = state0; - static int set8 = 0; - static int japan646 = 0; - static int lastc; int n; long l; + static enum { state0, state1, staten, state1dot, state2, state21, state3, state4, } state = state0; + static int lastc, japan646; + static enum {iNone, i88591, i88597, } g2 = iNone; + static enum {Cnone, Ckuten208, Ckuten212, Cgb, Cksc, } cs = Cnone; again: - switch(state) - { + + if(c == '\r' || c == '\n') + g2 = iNone; + + switch(state){ case state0: /* idle state */ - if(c == ESC){ state = state1; return; } - if(c < 0) return; - if(!set8 && (c < 128)){ + if(c == ESC){ + state = state1; + return; + } + if(c < 0) + return; + if(cs == Cnone && (c < 128)){ if(japan646){ - switch(c) - { - case '\\': emit(0xA5); return; /* yen */ - case '~': emit(0xAF); return; /* spacing macron */ - default: emit(c); return; + switch(c){ + case '\\': + 
emit(0xA5); /* yen */ + return; + case '~': + emit(0xAF); /* spacing macron */ + return; } - } else { - emit(c); - return; } + emit(c); + return; } - lastc = c; state = state4; return; + lastc = c; + state = state4; + return; - case state1: /* seen an escape */ - if(c == '$'){ state = state2; return; } - if(c == '('){ state = state3; return; } - emit(ESC); state = state0; goto again; + case state1: /* ESC */ + switch(c){ + case '$': + state = state2; + return; + case '(': + state = state3; + return; + case 'N': + state = staten; + return; + case '.': + state = state1dot; + return; + default: + emit(ESC); + state = state0; + goto again; + } - case state2: /* may be shifting into JIS */ - if((c == '@') || (c == 'B')){ - set8 = 1; state = state0; return; + case staten: /* ESC N */ + if((uint)c > 0x80) + goto bad; + switch(g2){ + default: + case iNone: + if(squawk) + EPR "%s: bad char %#x %s\n", argv0, c, file); + goto error; + + case i88591: + emit(c+0x80); + state = state0; + return; + case i88597: + c = tab8859_7[c | 0x80]; + if(c < 0){ + bad: + if(squawk) + EPR "%s: bad char %#x %s\n", argv0, c, file); + error: + nerrors++; + if(!clean) + emit(BADMAP); + emit('('); + emit(c); + emit(')'); + }else + emit(c); + state = state0; + return; + } + + case state1dot: /* ESC . */ + switch(c){ + case 'A': + g2 = i88591; + state = state0; + return; + case 'F': + g2 = i88597; + state = state0; + return; + default: + state = state0; + emit(ESC); + emit('$'); + emit('.'); + goto again; + } + + case state2: /* ESC $ ... 
may be shifting into JIS */ + switch(c){ + case '@': + case 'B': + cs = Ckuten208; + state = state0; + return; + case 'A': + /* handle gb2312-1980 */ + cs = Cgb; + state = state0; + return; + case '(': + state = state21; + return; + default: + emit(ESC); + emit('$'); + state = state0; + goto again; + } + + case state21: /* ESC $ ( */ + switch(c){ + case 'C': + /* handle ksc5601-1987 */ + cs = Cksc; + state = state0; + return; + case 'D': + /* handle jis x 0212-1990 */ + cs = Ckuten212; + japan646 = 0; /* guess */ + state = state0; + return; + default: + emit(ESC); + emit('$'); + emit('('); + state = state0; + goto again; } - emit(ESC); emit('$'); state = state0; goto again; case state3: /* may be shifting out of JIS */ - if((c == 'J') || (c == 'H') || (c == 'B')){ - japan646 = (c == 'J'); - set8 = 0; state = state0; return; + if(c == 'J' || c == 'H' || c == 'B'){ + japan646 = c == 'J'; + cs = Cnone; + state = state0; + return; } - emit(ESC); emit('('); state = state0; goto again; + emit(ESC); + emit('('); + state = state0; + goto again; case state4: /* two part char */ if(c < 0){ @@ -316,8 +439,29 @@ state = state0; goto again; } - n = (lastc&0x7F)*100 + (c&0x7f) - 3232; /* kuten208 */ - if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){ + n = (lastc&0x7F)*100 + (c&0x7f) - 3232; /* kuten208 */ + l = -1; + switch(cs){ + default: + abort(); + case Cgb: + if(n < GBMAX) + l = tabgb[n]; + break; + case Cksc: + if(n < ksc5601max) + l = tabksc5601[n]; + break; + case Ckuten208: + if(n < KUTEN208MAX) + l = tabkuten208[n]; + break; + case Ckuten212: + if(n < KUTEN212MAX) + l = tabkuten212[n]; + break; + } + if(l == -1){ nerrors++; if(squawk) EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file); @@ -388,10 +532,9 @@ } void -jisjis_in(int fd, long *notused, struct convert *out) +jis2_in(int fd, long*, struct convert *out) { - USED(notused); - do_in(fd, jis, out); + do_in(fd, jis2, out); } static int first = 1;