There are two parts to this patch:

1.  	Remove all the character-set tables from fs.c and mbox.c;
	let tcs do the translations instead.

2.  	Improve the rfc-2047 decoding: adjacent 2047-encoded
	"atoms" separated only by white space are now pasted together, as per the rfc.

I also tightened up the processing of 2047 strings.  The old code scanned
the insides of 2047 strings for the start of a new 2047-string.

- erik

Notes:
Sat Apr 1 16:46:44 EST 2006 rsc
    >	i also tightened up the processing of 2047 strings.  the code had been scanning
    >	the insides of 2047 strings for the start of a new 2047-string.
    	
    what's wrong with that?
    it can't happen.
    

Sat Apr 1 17:18:46 EST 2006 rsc
    Applied, thanks.  As usual I didn't apply the changes
    that were not actually changing the behavior.
    I'd rather leave old working code as it is so that
    the diffs are easier to read and to avoid introducing
    bugs. 
    
    Speaking of bugs, I think I found two in the code
    you sent in.  One is that you appeared to have
    reversed the sense of the uscores flag and the
    other is that you had a cistrcmp() compared to 8.
    
    Please double-check the current code.  I did some
    limited testing and it seemed okay.
    
    Thanks again.
    Russ


Reference: /n/sources/patch/applied/upas-tcs-2047
Date: Sat Apr  1 20:29:21 CES 2006
Signed-off-by: quanstro@quanstro.net
Reviewed-by: rsc

--- /sys/src/cmd/upas/fs/dat.h	Sat Apr  1 20:05:34 2006
+++ /sys/src/cmd/upas/fs/dat.h	Sat Apr  1 20:05:29 2006
@@ -132,8 +132,6 @@
 void		decode(Message*);
 int		cistrncmp(char*, char*, int);
 int		cistrcmp(char*, char*);
-int		latin1toutf(char*, char*, char*);
-int		windows1257toutf(char*, char*, char*);
 int		decquoted(char*, char*, char*, int);
 int		xtoutf(char*, char**, char*, char*);
 void		countlines(Message*);
--- /sys/src/cmd/upas/fs/fs.c	Sat Apr  1 20:06:12 2006
+++ /sys/src/cmd/upas/fs/fs.c	Sat Apr  1 20:05:55 2006
@@ -2,6 +2,7 @@
 #include <auth.h>
 #include <fcall.h>
 #include <libsec.h>
+#include <ctype.h>
 #include "dat.h"
 
 enum
@@ -1350,47 +1351,29 @@
 }
 
 // rfc2047 non-ascii
-typedef struct Charset Charset;
-struct Charset {
-	char *name;
-	int len;
-	int convert;
-	char *tcsname;
-} charsets[] =
-{
-	{ "us-ascii",		8,	1, nil, },
-	{ "utf-8",		5,	0, nil, },
-	{ "iso-8859-1",		10,	1, nil, },
-	{ "iso-8859-2",		10,	2, "8859-2", },
-	{ "big5",		4,	2, "big5", },
-	{ "iso-2022-jp",	11, 2, "jis", },
-	{ "windows-1251",	12,	2, "cp1251"},
-	{ "koi8-r",		6,	2, "koi8"},
-};
-
 int
 rfc2047convert(String *s, char *token, int len)
 {
+	char charset[41];		// c.f. rfc 2278
 	char decoded[1024];
-	char utfbuf[2*1024];
-	int i;
-	char *e, *x;
+	char *e, *x, *end, *tok0;
+	int l;
 
-	if(len == 0)
+	if(len < sizeof "=?x?y??=")
 		return -1;
 
+	tok0 = token;
 	e = token+len-2;
 	token += 2;
 
-	// bail if we don't understand the character set
-	for(i = 0; i < nelem(charsets); i++)
-		if(cistrncmp(charsets[i].name, token, charsets[i].len) == 0)
-		if(token[charsets[i].len] == '?'){
-			token += charsets[i].len + 1;
-			break;
-		}
-	if(i >= nelem(charsets))
+	x = strchr(token, '?');
+	l = x-token;
+	if (l >= sizeof charset || x>e)
 		return -1;
+	strncpy(charset, token, l);
+	charset[l] = 0;
+
+	token = x+1;
 
 	// bail if it doesn't fit 
 	if(e-token > sizeof(decoded)-1)
@@ -1399,92 +1382,61 @@
 	// bail if we don't understand the encoding
 	if(cistrncmp(token, "b?", 2) == 0){
 		token += 2;
-		len = dec64((uchar*)decoded, sizeof(decoded), token, e-token);
+		if ((end = strstr(token, "?=")) == nil)
+			return -1;
+		len = dec64((uchar*)decoded, sizeof decoded, token, end-token);
 		decoded[len] = 0;
 	} else if(cistrncmp(token, "q?", 2) == 0){
 		token += 2;
-		len = decquoted(decoded, token, e, 1);
+		if ((end = strstr(token, "?=")) == nil)
+			return -1;
+		len = decquoted(decoded, token, end, 1);
 		if(len > 0 && decoded[len-1] == '\n')
 			len--;
 		decoded[len] = 0;
 	} else
 		return -1;
 
-	switch(charsets[i].convert){
-	case 0:
+	if(xtoutf(charset, &x, decoded, decoded+len) <= 0){
 		s_append(s, decoded);
-		break;
-	case 1:
-		latin1toutf(utfbuf, decoded, decoded+len);
-		s_append(s, utfbuf);
-		break;
-	case 2:
-		if(xtoutf(charsets[i].tcsname, &x, decoded, decoded+len) <= 0){
-			s_append(s, decoded);
-		} else {
-			s_append(s, x);
-			free(x);
-		}
-		break;
-	}
-
-	return 0;
-}
-
-char*
-rfc2047start(char *start, char *end)
-{
-	int quests;
-
-	if(*--end != '=')
-		return nil;
-	if(*--end != '?')
-		return nil;
-
-	quests = 0;
-	for(end--; end >= start; end--){
-		switch(*end){
-		case '=':
-			if(quests == 3 && *(end+1) == '?')
-				return end;
-			break;
-		case '?':
-			++quests;
-			break;
-		case ' ':
-		case '\t':
-		case '\n':
-		case '\r':
-			/* can't have white space in a token */
-			return nil;
-		}
+	} else {
+		s_append(s, x);
+		free(x);
 	}
-	return nil;
+	return end-tok0+2;
 }
 
 // convert a header line
 String*
 stringconvert(String *s, char *uneaten, int len)
 {
-	char *token;
-	char *p;
-	int i;
+	char *p, *end, *pastept;
+	int c;
 
 	s = s_reset(s);
-	p = uneaten;
-	for(i = 0; i < len; i++){
-		if(*p++ == '='){
-			token = rfc2047start(uneaten, p);
-			if(token != nil){
-				s_nappend(s, uneaten, token-uneaten);
-				if(rfc2047convert(s, token, p - token) < 0)
-					s_nappend(s, token, p - token);
-				uneaten = p;
+	end = uneaten+len;
+	for(p = uneaten; p<end;){
+		c = *p;
+		if(c != '=' || p[1] != '?'){
+			s_putc(s, c);
+			p++;
+			continue;
+		}
+
+paste:
+		if((c = rfc2047convert(s, p, end-p)) >= 0) {
+			p += c;
+			for(pastept = p; isspace(*pastept);)
+				pastept++;
+			if(pastept[0] == '=' && pastept[1] == '?') {
+				p = pastept;
+				goto paste;
 			}
+		} else{
+			s_append(s, "=?");
+			p += 2;
 		}
 	}
-	if(p > uneaten)
-		s_nappend(s, uneaten, p-uneaten);
 	return s;
 }
 
--- /sys/src/cmd/upas/fs/mbox.c	Sat Apr  1 20:06:53 2006
+++ /sys/src/cmd/upas/fs/mbox.c	Sat Apr  1 20:06:37 2006
@@ -62,7 +62,6 @@
 static	char*	getstring(char*, String*, int);
 static	void	setfilename(Message*, char*);
 static	char*	lowercase(char*);
-static	int	is8bit(Message*);
 static	int	headerline(char**, String*);
 static	void	initheaders(void);
 static void	parseattachments(Message*, Mailbox*);
@@ -1055,7 +1054,7 @@
 	m->decoded = 1;
 }
 
-// convert latin1 to utf
+// convert to utf
 void
 convert(Message *m)
 {
@@ -1063,117 +1062,17 @@
 	char *x;
 
 	// don't convert if we're not a leaf, not text, or already converted
-	if(m->converted)
+	if(m->converted || m->part != nil || cistrncmp(s_to_c(m->type), "text", 4) != 0)
 		return;
-	if(m->part != nil)
-		return;
-	if(cistrncmp(s_to_c(m->type), "text", 4) != 0)
-		return;
-
-	if(cistrcmp(s_to_c(m->charset), "us-ascii") == 0 ||
-	   cistrcmp(s_to_c(m->charset), "iso-8859-1") == 0){
-		len = is8bit(m);
-		if(len > 0){
-			len = 2*len + m->bend - m->body + 1;
-			x = emalloc(len);
-			len = latin1toutf(x, m->body, m->bend);
-			if(m->ballocd)
-				free(m->body);
-			m->body = x;
-			m->bend = x + len;
-			m->ballocd = 1;
-		}
-	} else if(cistrcmp(s_to_c(m->charset), "iso-8859-2") == 0){
-		len = xtoutf("8859-2", &x, m->body, m->bend);
-		if(len != 0){
-			if(m->ballocd)
-				free(m->body);
-			m->body = x;
-			m->bend = x + len;
-			m->ballocd = 1;
-		}
-	} else if(cistrcmp(s_to_c(m->charset), "iso-8859-15") == 0){
-		len = xtoutf("8859-15", &x, m->body, m->bend);
-		if(len != 0){
-			if(m->ballocd)
-				free(m->body);
-			m->body = x;
-			m->bend = x + len;
-			m->ballocd = 1;
-		}
-	} else if(cistrcmp(s_to_c(m->charset), "big5") == 0){
-		len = xtoutf("big5", &x, m->body, m->bend);
-		if(len != 0){
-			if(m->ballocd)
-				free(m->body);
-			m->body = x;
-			m->bend = x + len;
-			m->ballocd = 1;
-		}
-	} else if(cistrcmp(s_to_c(m->charset), "iso-2022-jp") == 0){
-		len = xtoutf("jis", &x, m->body, m->bend);
-		if(len != 0){
-			if(m->ballocd)
-				free(m->body);
-			m->body = x;
-			m->bend = x + len;
-			m->ballocd = 1;
-		}
-	} else if(cistrcmp(s_to_c(m->charset), "windows-1257") == 0
-			|| cistrcmp(s_to_c(m->charset), "windows-1252") == 0){
-		len = is8bit(m);
-		if(len > 0){
-			len = 2*len + m->bend - m->body + 1;
-			x = emalloc(len);
-			len = windows1257toutf(x, m->body, m->bend);
-			if(m->ballocd)
-				free(m->body);
-			m->body = x;
-			m->bend = x + len;
-			m->ballocd = 1;
-		}
-	} else if(cistrcmp(s_to_c(m->charset), "windows-1251") == 0){
-		len = xtoutf("cp1251", &x, m->body, m->bend);
-		if(len != 0){
-			if(m->ballocd)
-				free(m->body);
-			m->body = x;
-			m->bend = x + len;
-			m->ballocd = 1;
-		}
-	} else if(cistrcmp(s_to_c(m->charset), "koi8-r") == 0){
-		len = xtoutf("koi8", &x, m->body, m->bend);
-		if(len != 0){
-			if(m->ballocd)
-				free(m->body);
-			m->body = x;
-			m->bend = x + len;
-			m->ballocd = 1;
-		}
-	}
-
 	m->converted = 1;
-}
-
-enum
-{
-	Self=	1,
-	Hex=	2,
-};
-uchar	tableqp[256];
-
-static void
-initquoted(void)
-{
-	int c;
-
-	memset(tableqp, 0, 256);
-	for(c = ' '; c <= '<'; c++)
-		tableqp[c] = Self;
-	for(c = '>'; c <= '~'; c++)
-		tableqp[c] = Self;
-	tableqp['\t'] = Self;
-	tableqp['='] = Hex;
+	len = xtoutf(s_to_c(m->charset), &x, m->body, m->bend);
+	if(len != 0){
+		if(m->ballocd)
+			free(m->body);
+		m->body = x;
+		m->bend = x + len;
+		m->ballocd = 1;
+	}
 }
 
 static int
@@ -1188,13 +1087,14 @@
 	return 0;
 }
 
+// underscores are translated in 2047 headers but not in the body
 static char*
 decquotedline(char *out, char *in, char *e, int uscores)
 {
 	int c, soft;
 
 	/* dump trailing white space */
-	while(e >= in && (*e == ' ' || *e == '\t' || *e == '\r' || *e == '\n'))
+	while(e >= in && isspace(*e))
 		e--;
 
 	/* trailing '=' means no newline */
@@ -1206,17 +1106,17 @@
 
 	while(in <= e){
 		c = (*in++) & 0xff;
-		switch(tableqp[c]){
-		case Self:
-			if(uscores && c == '_')
-				c = ' ';
-			*out++ = c;
-			break;
-		case Hex:
+		switch(c){
+		case '=':
 			c = hex2int(*in++)<<4;
 			c |= hex2int(*in++);
 			*out++ = c;
 			break;
+		case '_':
+			*out++ = uscores ? '_' : ' ';
+			break;
+		default:
+			*out++ = c;
 		}
 	}
 	if(!soft)
@@ -1231,9 +1131,6 @@
 {
 	char *p, *nl;
 
-	if(tableqp[' '] == 0)
-		initquoted();
-
 	p = out;
 	while((nl = strchr(in, '\n')) != nil && nl < e){
 		p = decquotedline(p, in, nl, uscores);
@@ -1263,35 +1160,31 @@
 	return op;
 }
 
-/*
- *  return number of 8 bit characters
- */
+// translate latin1 directly since it fits neatly in utf
 static int
-is8bit(Message *m)
+latin1toutf(char **out, char *in, char *e)
 {
-	int count = 0;
+	Rune r;
 	char *p;
+	int n;
 
-	for(p = m->body; p < m->bend; p++)
-		if(*p & 0x80)
-			count++;
-	return count;
-}
+	for(n = 0, p = in; p<e;)
+		if(*p++ & 0x80)
+			n++;
+	if(n == 0)
+		return 0;
 
-// translate latin1 directly since it fits neatly in utf
-int
-latin1toutf(char *out, char *in, char *e)
-{
-	Rune r;
-	char *p;
+	n += e-in;
+	p = *out = malloc(n+1);
+	if(p == nil)
+		return 0;
 
-	p = out;
 	for(; in < e; in++){
 		r = (*in) & 0xff;
 		p += runetochar(p, &r);
 	}
 	*p = 0;
-	return p - out;
+	return n;
 }
 
 // translate any thing else using the tcs program
@@ -1304,6 +1197,12 @@
 	int n, len, sofar;
 	char *p;
 
+	// small speed hack.
+	if (cistrcmp(charset, "us-ascii") == 0 || cistrcmp(charset, "utf-8") == 8)
+		return 0;
+	if(cistrcmp(charset, "iso-8859-1") == 0)
+		return latin1toutf(out, in, e);
+
 	len = e-in+1;
 	sofar = 0;
 	*out = p = malloc(len+1);
@@ -1370,36 +1269,6 @@
 		break;
 	}
 	return sofar;
-}
-
-enum {
-	Winstart= 0x7f,
-	Winend= 0x9f,
-};
-
-Rune winchars[] = {
-	L'•',
-	L'•', L'•', L'‚', L'ƒ', L'„', L'…', L'†', L'‡',
-	L'ˆ', L'‰', L'Š', L'‹', L'Œ', L'•', L'•', L'•',
-	L'•', L'‘', L'’', L'“', L'”', L'•', L'–', L'—',
-	L'˜', L'™', L'š', L'›', L'œ', L'•', L'•', L'Ÿ',
-};
-
-int
-windows1257toutf(char *out, char *in, char *e)
-{
-	Rune r;
-	char *p;
-
-	p = out;
-	for(; in < e; in++){
-		r = (*in) & 0xff;
-		if(r >= 0x7f && r <= 0x9f)
-			r = winchars[r-0x7f];
-		p += runetochar(p, &r);
-	}
-	*p = 0;
-	return p - out;
 }
 
 void *