This patch extends tcs(1) to understand TUNE (http://www.tunerfc.tn.gov.in). TUNE is a 16-bit encoding scheme for Tamil. Unlike Unicode, TUNE maps each character to a single glyph. This property of TUNE makes it straightforward to view and edit Tamil in Plan 9.

Please use unicode.8.font from http://www.tip9ug.jp/who/art/lib/font/madhavi.tgz to view the Tamil glyphs in tune.c.

Sample text to test tune.c:
http://www.tip9ug.jp/who/art/lib/kp-tune.utf
http://www.tip9ug.jp/who/art/lib/kp-uni.utf

Pointers to the TUNE and Unicode Tamil code charts are available in the Links section of http://www.tip9ug.jp/who/art/tamil

Please let me know if any other information is needed.

Thanks.
Arvindh

Notes:
Wed Mar 29 17:48:54 EST 2006 rsc
Wed Mar 29 17:57:07 EST 2006 rsc

I removed the L'x' constants, as they make it very hard to port to non-Plan 9 systems, and tcs is actually one of the more important programs to be able to port. Some of the rune constants in your program were invalid rune sequences; I translated them to Runeerror. I hope that was right. Please diff /sys/src/cmd/tcs/tune.c against your copy and make sure you agree with the changes.

I don't fully understand TUNE. My basic understanding is that the conversion takes over a bigger Unicode range than is currently allocated to Tamil. What range does it take over?

Thanks.
Russ

P.S. This is a bit of overkill:

	static enum { state0, state1, state2, state3, state4, state5, state6, state7 } state = state0;

Can we just use

	static int state = 0;

instead?
Reference: /n/sources/patch/applied/tcs-tune Date: Wed Mar 29 23:00:55 CES 2006 Reviewed-by: rsc --- /sys/src/cmd/tcs/conv.h Wed Mar 29 22:48:13 2006 +++ /sys/src/cmd/tcs/conv.h Wed Mar 29 22:48:05 2006 @@ -13,6 +13,8 @@ void uksc_out(Rune *base, int n, long *notused); void html_in(int fd, long *notused, struct convert *out); void html_out(Rune *base, int n, long *notused); +void tune_in(int fd, long *notused, struct convert *out); +void tune_out(Rune *base, int n, long *notused); #define emit(x) *(*r)++ = (x) #define NRUNE 65536 --- /sys/src/cmd/tcs/conv_big5.c Wed Mar 29 22:48:53 2006 +++ /sys/src/cmd/tcs/conv_big5.c Wed Mar 29 22:48:45 2006 @@ -110,6 +110,7 @@ big5proc(-1, &r, nin); if(r > ob) OUT(out, ob, r-ob); + OUT(out, ob, 0); } void --- /sys/src/cmd/tcs/conv_gb.c Wed Mar 29 22:49:41 2006 +++ /sys/src/cmd/tcs/conv_gb.c Wed Mar 29 22:49:33 2006 @@ -88,6 +88,7 @@ gbproc(-1, &r, nin); if(r > ob) OUT(out, ob, r-ob); + OUT(out, ob, 0); } void --- /sys/src/cmd/tcs/conv_jis.c Wed Mar 29 22:50:40 2006 +++ /sys/src/cmd/tcs/conv_jis.c Wed Mar 29 22:50:29 2006 @@ -363,6 +363,7 @@ (*procfn)(-1, &r, nin); if(r > ob) OUT(out, ob, r-ob); + OUT(out, ob, 0); } void --- /sys/src/cmd/tcs/conv_ksc.c Wed Mar 29 22:51:45 2006 +++ /sys/src/cmd/tcs/conv_ksc.c Wed Mar 29 22:51:37 2006 @@ -109,6 +109,7 @@ ukscproc(-1, &r, nin); if(r > ob) OUT(out, ob, r-ob); + OUT(out, ob, 0); } void --- /sys/src/cmd/tcs/html.c Wed Mar 29 22:52:59 2006 +++ /sys/src/cmd/tcs/html.c Wed Mar 29 22:52:51 2006 @@ -424,6 +424,7 @@ } if(r > rbuf) OUT(out, rbuf, r-rbuf); + OUT(out, rbuf, 0); } /* --- /sys/src/cmd/tcs/mkfile Wed Mar 29 22:54:21 2006 +++ /sys/src/cmd/tcs/mkfile Wed Mar 29 22:54:13 2006 @@ -11,7 +11,8 @@ kuten208.$O\ gb.$O\ ksc.$O\ - big5.$O + big5.$O\ + tune.$O BIN=/$objtype/bin +#include +#include +#include "hdr.h" +#include "conv.h" + +typedef struct Tmap Tmap; +struct Tmap +{ + Rune u; + Rune t; +}; + +static Tmap t1[] = +{ + {L'அ', L''}, + {L'ஆ', L''}, + {L'இ', L''}, + {L'ஈ', L''}, + 
{L'உ', L''}, + {L'ஊ', L''}, + {L'எ', L''}, + {L'ஏ', L''}, + {L'ஐ', L''}, + {L'ஒ', L''}, + {L'ஓ', L''}, + {L'ஔ', L''}, + {L'ஃ', L''} +}; + +static Rune t2[] = +{ + L'்', + L'்', // filler + L'ா', + L'ி', + L'ீ', + L'ு', + L'ூ', + L'ெ', + L'ே', + L'ை', + L'ொ', + L'ோ', + L'ௌ' +}; + +static Tmap t3[] = +{ + {L'க', L''}, + {L'ங', L''}, + {L'ச', L''}, + {L'ஜ', L''}, + {L'ஞ', L''}, + {L'ட', L''}, + {L'ண', L''}, + {L'த', L''}, + {L'ந', L''}, + {L'ன', L''}, + {L'ப', L''}, + {L'ம', L''}, + {L'ய', L''}, + {L'ர', L''}, + {L'ற', L''}, + {L'ல', L''}, + {L'ள', L''}, + {L'ழ', L''}, + {L'வ', L''}, + {L'ஶ', L''}, + {L'ஷ', L''}, + {L'ஸ', L''}, + {L'ஹ', L''} +}; + +static Rune +findbytune(Tmap *tab, int size, Rune t) +{ + int i; + + for(i = 0; i < size; i++) + if(tab[i].t == t) + return tab[i].u; + return Runeerror; +} + +static Rune +findbyuni(Tmap *tab, int size, Rune u) +{ + int i; + + for(i = 0; i < size; i++) + if(tab[i].u == u) + return tab[i].t; + return Runeerror; +} + +static int +findindex(Rune *rstr, int size, Rune r) +{ + int i; + + for(i = 0; i < size; i++) + if(rstr[i] == r) + return i; + return -1; +} + +void +tune_in(int fd, long *x, struct convert *out) +{ + Biobuf b; + Rune rbuf[N]; + Rune *r, *er, tr; + int c, i; + + USED(x); + r = rbuf; + er = rbuf+N-3; + Binit(&b, fd, OREAD); + while((c = Bgetrune(&b)) != Beof){ + ninput += b.runesize; + if(r >= er){ + OUT(out, rbuf, r-rbuf); + r = rbuf; + } + if(c>=L'' && c <= L'' && (i = c%16) < nelem(t2)){ + if(c >= L''){ + *r++ = L'க'; + *r++ = L'்'; + *r++ = L'ஷ'; + }else + *r++ = findbytune(t3, nelem(t3), c-i+1); + if(i != 1) + *r++ = t2[i]; + }else if((tr = findbytune(t1, nelem(t1), c)) != Runeerror) + *r++ = tr; + else switch(c){ + case L'': + *r++ = L'ண'; *r++ = L'ா'; + break; + case L'': + *r++ = L'ற'; *r++ = L'ா'; + break; + case L'': + *r++ = L'ன'; *r++ = L'ா'; + break; + case L'': + *r++ = L'ண'; *r++ = L'ை'; + break; + case L'': + *r++ = L'ல'; *r++ = L'ை'; + break; + case L'': 
+ *r++ = L'ள'; *r++ = L'ை'; + break; + case L'': + *r++ = L'ன'; *r++ = L'ை'; + break; + case L'': + *r++ = L'ஶ'; *r++ = L'்'; *r++ = L'ர'; *r++ = L'ீ'; + break; + default: + if(c >= 0xe200 && c <= 0xe3ff){ + if(squawk) + EPR( "%s: rune 0x%x not in output cs\n", argv0, c); + nerrors++; + if(clean) + break; + c = BADMAP; + } + *r++ = c; + break; + } + } + if(r > rbuf) + OUT(out, rbuf, r-rbuf); + OUT(out, rbuf, 0); +} + +void +tune_out(Rune *r, int n, long *x) +{ + static enum { state0, state1, state2, state3, state4, state5, state6, state7 } state = state0; + static Rune lastr; + Rune *er, tr; + char *p; + int i; + + USED(x); + nrunes += n; + er = r+n; + for(p = obuf; r < er; r++) + switch(state){ + case state0: + casestate0: + if((tr = findbyuni(t3, nelem(t3), *r)) != Runeerror){ + lastr = tr; + state = state1; + }else if(*r == L'ஒ'){ + lastr = L''; + state = state3; + }else if((tr = findbyuni(t1, nelem(t1), *r)) != Runeerror) + p += runetochar(p, &tr); + else + p += runetochar(p, r); + break; + case state1: + casestate1: + if((i = findindex(t2, nelem(t2), *r)) != -1){ + if(lastr && lastr != L'�') + lastr += i-1; + if(*r ==L'ெ') + state = state5; + else if(*r ==L'ே') + state = state4; + else if(lastr == L'') + state = state2; + else if(lastr == L'') + state = state6; + else{ + if(lastr) + p += runetochar(p, &lastr); + state = state0; + } + }else if(lastr && lastr != L'�' && (*r == L'²' || *r == L'³' || *r == L'⁴')){ + if(squawk) + EPR( "%s: character not in output cs\n", argv0, lastr, *r); + lastr = clean ? 
0 : L'�'; + nerrors++; + }else{ + if(lastr) + p += runetochar(p, &lastr); + state = state0; + goto casestate0; + } + break; + case state2: + if(*r == L'ஷ'){ + lastr = L''; + state = state1; + break; + } + p += runetochar(p, &lastr); + state = state0; + goto casestate0; + case state3: + state = state0; + if(*r == L'ௗ'){ + p += runetochar(p, L""); + break; + } + p += runetochar(p, &lastr); + goto casestate0; + case state4: + state = state0; + if(*r == L'ா'){ + if(lastr){ + if(lastr != L'�') + lastr += 3; + p += runetochar(p, &lastr); + } + break; + } + if(lastr) + p += runetochar(p, &lastr); + goto casestate0; + case state5: + state = state0; + if(*r == L'ா' || *r == L'ௗ'){ + if(lastr){ + if(lastr != L'�') + lastr += *r == L'ா' ? 3 : 5; + p += runetochar(p, &lastr); + } + break; + } + if(lastr) + p += runetochar(p, &lastr); + goto casestate0; + case state6: + if(*r == L'ர'){ + state = state7; + break; + } + p += runetochar(p, &lastr); + state = state0; + goto casestate0; + case state7: + if(*r == L'ீ'){ + p += runetochar(p, L""); + state = state0; + break; + } + p += runetochar(p, &lastr); + lastr = L''; + state = state1; + goto casestate1; + } + if(n == 0 && state != state0){ + if(lastr) + p += runetochar(p, &lastr); + state = state0; + } + noutput += p-obuf; + write(1, obuf, p-obuf); +} --- /sys/src/cmd/tcs/utf.c Wed Mar 29 22:59:14 2006 +++ /sys/src/cmd/tcs/utf.c Wed Mar 29 22:59:06 2006 @@ -61,6 +61,7 @@ if(n == 0) break; } + OUT(out, runes, 0); } void @@ -112,6 +113,7 @@ if(n == 0) break; } + OUT(out, runes, 0); } void