There are two parts to this patch: 1. Remove all the character-set tables from fs.c and mbox.c, and let tcs do the translations. 2. Improve the RFC 2047 decoding: adjacent 2047-encoded "atoms" are now pasted together, as the RFC requires. I also tightened up the processing of 2047 strings; the code had been scanning the insides of 2047 strings for the start of a new 2047 string. - erik Notes: Sat Apr 1 16:46:44 EST 2006 rsc > i also tightened up the processing of 2047 strings. the code had been scanning > the insides of 2047 strings for the start of a new 2047-string. What's wrong with that? It can't happen. Sat Apr 1 17:18:46 EST 2006 rsc Applied, thanks. As usual, I didn't apply the changes that did not actually change the behavior. I'd rather leave old working code as it is, so that the diffs are easier to read and to avoid introducing bugs. Speaking of bugs, I think I found two in the code you sent in. One is that you appear to have reversed the sense of the uscores flag, and the other is that you had a cistrcmp() result compared to 8. Please double-check the current code. I did some limited testing and it seemed okay. Thanks again. 
Russ Reference: /n/sources/patch/applied/upas-tcs-2047 Date: Sat Apr 1 20:29:21 CES 2006 Signed-off-by: quanstro@quanstro.net Reviewed-by: rsc --- /sys/src/cmd/upas/fs/dat.h Sat Apr 1 20:05:34 2006 +++ /sys/src/cmd/upas/fs/dat.h Sat Apr 1 20:05:29 2006 @@ -132,8 +132,6 @@ void decode(Message*); int cistrncmp(char*, char*, int); int cistrcmp(char*, char*); -int latin1toutf(char*, char*, char*); -int windows1257toutf(char*, char*, char*); int decquoted(char*, char*, char*, int); int xtoutf(char*, char**, char*, char*); void countlines(Message*); --- /sys/src/cmd/upas/fs/fs.c Sat Apr 1 20:06:12 2006 +++ /sys/src/cmd/upas/fs/fs.c Sat Apr 1 20:05:55 2006 @@ -2,6 +2,7 @@ #include #include #include +#include #include "dat.h" enum @@ -1350,47 +1351,29 @@ } // rfc2047 non-ascii -typedef struct Charset Charset; -struct Charset { - char *name; - int len; - int convert; - char *tcsname; -} charsets[] = -{ - { "us-ascii", 8, 1, nil, }, - { "utf-8", 5, 0, nil, }, - { "iso-8859-1", 10, 1, nil, }, - { "iso-8859-2", 10, 2, "8859-2", }, - { "big5", 4, 2, "big5", }, - { "iso-2022-jp", 11, 2, "jis", }, - { "windows-1251", 12, 2, "cp1251"}, - { "koi8-r", 6, 2, "koi8"}, -}; - int rfc2047convert(String *s, char *token, int len) { + char charset[41]; // c.f. 
rfc 2278 char decoded[1024]; - char utfbuf[2*1024]; - int i; - char *e, *x; + char *e, *x, *end, *tok0; + int l; - if(len == 0) + if(len < sizeof "=?x?y??=") return -1; + tok0 = token; e = token+len-2; token += 2; - // bail if we don't understand the character set - for(i = 0; i < nelem(charsets); i++) - if(cistrncmp(charsets[i].name, token, charsets[i].len) == 0) - if(token[charsets[i].len] == '?'){ - token += charsets[i].len + 1; - break; - } - if(i >= nelem(charsets)) + x = strchr(token, '?'); + l = x-token; + if (l >= sizeof charset || x>e) return -1; + strncpy(charset, token, l); + charset[l] = 0; + + token = x+1; // bail if it doesn't fit if(e-token > sizeof(decoded)-1) @@ -1399,92 +1382,61 @@ // bail if we don't understand the encoding if(cistrncmp(token, "b?", 2) == 0){ token += 2; - len = dec64((uchar*)decoded, sizeof(decoded), token, e-token); + if ((end = strstr(token, "?=")) == nil) + return -1; + len = dec64((uchar*)decoded, sizeof decoded, token, end-token); decoded[len] = 0; } else if(cistrncmp(token, "q?", 2) == 0){ token += 2; - len = decquoted(decoded, token, e, 1); + if ((end = strstr(token, "?=")) == nil) + return -1; + len = decquoted(decoded, token, end, 1); if(len > 0 && decoded[len-1] == '\n') len--; decoded[len] = 0; } else return -1; - switch(charsets[i].convert){ - case 0: + if(xtoutf(charset, &x, decoded, decoded+len) <= 0){ s_append(s, decoded); - break; - case 1: - latin1toutf(utfbuf, decoded, decoded+len); - s_append(s, utfbuf); - break; - case 2: - if(xtoutf(charsets[i].tcsname, &x, decoded, decoded+len) <= 0){ - s_append(s, decoded); - } else { - s_append(s, x); - free(x); - } - break; - } - - return 0; -} - -char* -rfc2047start(char *start, char *end) -{ - int quests; - - if(*--end != '=') - return nil; - if(*--end != '?') - return nil; - - quests = 0; - for(end--; end >= start; end--){ - switch(*end){ - case '=': - if(quests == 3 && *(end+1) == '?') - return end; - break; - case '?': - ++quests; - break; - case ' ': - case '\t': - 
case '\n': - case '\r': - /* can't have white space in a token */ - return nil; - } + } else { + s_append(s, x); + free(x); } - return nil; + return end-tok0+2; } // convert a header line String* stringconvert(String *s, char *uneaten, int len) { - char *token; - char *p; - int i; + char *p, *end, *pastept; + int c; s = s_reset(s); - p = uneaten; - for(i = 0; i < len; i++){ - if(*p++ == '='){ - token = rfc2047start(uneaten, p); - if(token != nil){ - s_nappend(s, uneaten, token-uneaten); - if(rfc2047convert(s, token, p - token) < 0) - s_nappend(s, token, p - token); - uneaten = p; + end = uneaten+len; + for(p = uneaten; p= 0) { + p += c; + for(pastept = p; isspace(*pastept);) + pastept++; + if(pastept[0] == '=' && pastept[1] == '?') { + p = pastept; + goto paste; } + } else{ + s_append(s, "=?"); + p += 2; } } - if(p > uneaten) - s_nappend(s, uneaten, p-uneaten); return s; } --- /sys/src/cmd/upas/fs/mbox.c Sat Apr 1 20:06:53 2006 +++ /sys/src/cmd/upas/fs/mbox.c Sat Apr 1 20:06:37 2006 @@ -62,7 +62,6 @@ static char* getstring(char*, String*, int); static void setfilename(Message*, char*); static char* lowercase(char*); -static int is8bit(Message*); static int headerline(char**, String*); static void initheaders(void); static void parseattachments(Message*, Mailbox*); @@ -1055,7 +1054,7 @@ m->decoded = 1; } -// convert latin1 to utf +// convert to utf void convert(Message *m) { @@ -1063,117 +1062,17 @@ char *x; // don't convert if we're not a leaf, not text, or already converted - if(m->converted) + if(m->converted || m->part != nil || cistrncmp(s_to_c(m->type), "text", 4) != 0) return; - if(m->part != nil) - return; - if(cistrncmp(s_to_c(m->type), "text", 4) != 0) - return; - - if(cistrcmp(s_to_c(m->charset), "us-ascii") == 0 || - cistrcmp(s_to_c(m->charset), "iso-8859-1") == 0){ - len = is8bit(m); - if(len > 0){ - len = 2*len + m->bend - m->body + 1; - x = emalloc(len); - len = latin1toutf(x, m->body, m->bend); - if(m->ballocd) - free(m->body); - m->body = x; - 
m->bend = x + len; - m->ballocd = 1; - } - } else if(cistrcmp(s_to_c(m->charset), "iso-8859-2") == 0){ - len = xtoutf("8859-2", &x, m->body, m->bend); - if(len != 0){ - if(m->ballocd) - free(m->body); - m->body = x; - m->bend = x + len; - m->ballocd = 1; - } - } else if(cistrcmp(s_to_c(m->charset), "iso-8859-15") == 0){ - len = xtoutf("8859-15", &x, m->body, m->bend); - if(len != 0){ - if(m->ballocd) - free(m->body); - m->body = x; - m->bend = x + len; - m->ballocd = 1; - } - } else if(cistrcmp(s_to_c(m->charset), "big5") == 0){ - len = xtoutf("big5", &x, m->body, m->bend); - if(len != 0){ - if(m->ballocd) - free(m->body); - m->body = x; - m->bend = x + len; - m->ballocd = 1; - } - } else if(cistrcmp(s_to_c(m->charset), "iso-2022-jp") == 0){ - len = xtoutf("jis", &x, m->body, m->bend); - if(len != 0){ - if(m->ballocd) - free(m->body); - m->body = x; - m->bend = x + len; - m->ballocd = 1; - } - } else if(cistrcmp(s_to_c(m->charset), "windows-1257") == 0 - || cistrcmp(s_to_c(m->charset), "windows-1252") == 0){ - len = is8bit(m); - if(len > 0){ - len = 2*len + m->bend - m->body + 1; - x = emalloc(len); - len = windows1257toutf(x, m->body, m->bend); - if(m->ballocd) - free(m->body); - m->body = x; - m->bend = x + len; - m->ballocd = 1; - } - } else if(cistrcmp(s_to_c(m->charset), "windows-1251") == 0){ - len = xtoutf("cp1251", &x, m->body, m->bend); - if(len != 0){ - if(m->ballocd) - free(m->body); - m->body = x; - m->bend = x + len; - m->ballocd = 1; - } - } else if(cistrcmp(s_to_c(m->charset), "koi8-r") == 0){ - len = xtoutf("koi8", &x, m->body, m->bend); - if(len != 0){ - if(m->ballocd) - free(m->body); - m->body = x; - m->bend = x + len; - m->ballocd = 1; - } - } - m->converted = 1; -} - -enum -{ - Self= 1, - Hex= 2, -}; -uchar tableqp[256]; - -static void -initquoted(void) -{ - int c; - - memset(tableqp, 0, 256); - for(c = ' '; c <= '<'; c++) - tableqp[c] = Self; - for(c = '>'; c <= '~'; c++) - tableqp[c] = Self; - tableqp['\t'] = Self; - tableqp['='] = Hex; + len 
= xtoutf(s_to_c(m->charset), &x, m->body, m->bend); + if(len != 0){ + if(m->ballocd) + free(m->body); + m->body = x; + m->bend = x + len; + m->ballocd = 1; + } } static int @@ -1188,13 +1087,14 @@ return 0; } +// underscores are translated in 2047 headers but not in the body static char* decquotedline(char *out, char *in, char *e, int uscores) { int c, soft; /* dump trailing white space */ - while(e >= in && (*e == ' ' || *e == '\t' || *e == '\r' || *e == '\n')) + while(e >= in && isspace(*e)) e--; /* trailing '=' means no newline */ @@ -1206,17 +1106,17 @@ while(in <= e){ c = (*in++) & 0xff; - switch(tableqp[c]){ - case Self: - if(uscores && c == '_') - c = ' '; - *out++ = c; - break; - case Hex: + switch(c){ + case '=': c = hex2int(*in++)<<4; c |= hex2int(*in++); *out++ = c; break; + case '_': + *out++ = uscores ? '_' : ' '; + break; + default: + *out++ = c; } } if(!soft) @@ -1231,9 +1131,6 @@ { char *p, *nl; - if(tableqp[' '] == 0) - initquoted(); - p = out; while((nl = strchr(in, '\n')) != nil && nl < e){ p = decquotedline(p, in, nl, uscores); @@ -1263,35 +1160,31 @@ return op; } -/* - * return number of 8 bit characters - */ +// translate latin1 directly since it fits neatly in utf static int -is8bit(Message *m) +latin1toutf(char **out, char *in, char *e) { - int count = 0; + Rune r; char *p; + int n; - for(p = m->body; p < m->bend; p++) - if(*p & 0x80) - count++; - return count; -} + for(n = 0, p = in; p= 0x7f && r <= 0x9f) - r = winchars[r-0x7f]; - p += runetochar(p, &r); - } - *p = 0; - return p - out; } void *