convert to and from surrogate pair format. this is primarily to assist with json formatting. perhaps tcs(1) should perform this duty, but it's not clear at this point that that would be a win. Reference: /n/atom/patch/applied/surrogate Date: Sat May 10 04:57:35 CES 2014 Signed-off-by: quanstro@quanstro.net --- /sys/src/libc/port/surrogatetorune.c Thu Jan 1 00:00:00 1970 +++ /sys/src/libc/port/surrogatetorune.c Sat May 10 04:55:43 2014 @@ -0,0 +1,71 @@ +#include <u.h> +#include <libc.h> + +/* + * leading surrogates span L0 to L0+L0sz and trailing ones L1 to L1+L1sz; + * each half carries 10 bits of the rune's offset from Surmin. + */ +enum { + L0 = 0xd800, /* big endian encoding */ + L0sz = 0x3ff, + L1 = 0xdc00, + L1sz = 0x3ff, + Surmin = 0x10000, +}; + +/* return 1 for a leading surrogate, 2 for a trailing surrogate, else 0 */ +int +issurrogaterune(Rune r) +{ + if(r >= L0 && r <= L0+L0sz) + return 1; + if(r >= L1 && r <= L1+L1sz) + return 2; + return 0; +} + +/* + * decode r[0], and r[1] when r[0] is a leading surrogate, into *o; + * malformed input stores Runeerror. returns the number of input + * runes consumed, 1 or 2. + */ +int +surrogatetorune(Rune *r, Rune *o) +{ + Rune r0, r1; + + r0 = r[0]; + if(r0 < L0 || r0 > L0+L0sz){ + /* not a leading surrogate; a stray trailing one is an error */ + if(r0 >= L1 && r0 <= L1+L1sz) + *o = Runeerror; + else + *o = r0; + return 1; + } + r1 = r[1]; + if(r1 < L1 || r1 > L1+L1sz){ + *o = Runeerror; + return 2; + } + *o = Surmin + ((r0-L0)<<10 | (r1-L1)); + return 2; +} + +/* + * store r0 in r as a single rune, or as a surrogate pair when it + * is Surmin or greater. returns the number of runes stored, 1 or 2. + */ +int +runetosurrogate(Rune r0, Rune *r) +{ + if(r0 < Surmin){ + r[0] = r0; + return 1; + } + r0 -= Surmin; + r[0] = L0 + (r0>>10 & L0sz); + r[1] = L1 + (r0 & L1sz); + return 2; +} --- /sys/man/2/surrogatetorune Thu Jan 1 00:00:00 1970 +++ /sys/man/2/surrogatetorune Sat May 10 04:55:44 2014 @@ -0,0 +1,35 @@ +.TH SURROGATETORUNE 2 +.SH NAME +surrogatetorune, runetosurrogate \- Surrogate pair conversion +.SH SYNOPSIS +.B +#include <u.h> +.br +.B #include <libc.h> +.PP +.B +int surrogatetorune(Rune *in, Rune *out) +.PP +.B +int runetosurrogate(Rune in, Rune *out) +.SH DESCRIPTION +These routines convert runes outside the Basic Plane to surrogate pairs +for interoperation with foreign systems. +.I Surrogatetorune +takes a +pair of input runes, +.IR in , +potentially in surrogate-pair format and returns +the number of input runes consumed. +.I Out +points to the single rune produced.
Malformed surrogate +pairs may consume either one or two input runes; +.I out +will point to +.BR Runeerror . +.PP +.I Runetosurrogate +converts +.I in +to a surrogate pair or a single rune and +returns the number of runes produced. --- /sys/man/2/isalpharune Sat May 10 04:55:45 2014 +++ /sys/man/2/isalpharune Sat May 10 04:55:46 2014 @@ -1,6 +1,6 @@ .TH ISALPHARUNE 2 .SH NAME -isalpharune, isbaserune, islowerrune, ispunctrune, isspacerune, istitlerune, isupperrune, isdigitrune, tobaserune, tolowerrune, totitlerune, toupperrune \- Unicode character classes and cases +isalpharune, isbaserune, islowerrune, ispunctrune, isspacerune, issurrogaterune, istitlerune, isupperrune, isdigitrune, tobaserune, tolowerrune, totitlerune, toupperrune \- Unicode character classes and cases .SH SYNOPSIS .B #include <u.h> .br @@ -13,19 +13,22 @@ int isbaserune(Rune c) .PP .B +int isdigitrune(Rune c) +.PP +.B int islowerrune(Rune c) .PP .B int isspacerune(Rune c) .PP .B -int istitlerune(Rune c) +int issurrogaterune(Rune c) .PP .B -int isupperrune(Rune c) +int istitlerune(Rune c) .PP .B -int isdigitrune(Rune c) +int isupperrune(Rune c) .PP .B int ispunctrune(Rune c) @@ -54,7 +57,7 @@ these routines test types and modify cases for Unicode characters. The names are self-explanatory. -.P +.PP As an extension, .I ispunctrune returns the Unicode punctuation type character, @@ -66,6 +69,9 @@ final punctuation. .PP The case-conversion routines return the character unchanged if it has no case. +.PP +.I Issurrogaterune +returns 1 if the rune is the first (leading) half of a surrogate pair, 2 if it is the second (trailing) half, or 0 if it is not a surrogate.
.SH SOURCE .B /sys/src/libc/port/runetype.c .br --- /sys/src/cmd/runetype/surrogate.c Thu Jan 1 00:00:00 1970 +++ /sys/src/cmd/runetype/surrogate.c Sat May 10 04:55:47 2014 @@ -0,0 +1,61 @@ +#include <u.h> +#include <libc.h> +#include <bio.h> + +/* copy in to out, writing each rune as the one or two runes runetosurrogate produces */ +void +surrogate(Biobuf *in, Biobuf *out) +{ + int n; + long r; + Rune o[2]; + + for(;;){ + r = Bgetrune(in); + if(r == Beof) + return; + n = runetosurrogate(r, o); + Bputrune(out, o[0]); + if(n == 2) + Bputrune(out, o[1]); + } +} + +void +usage(void) +{ + fprint(2, "usage: %s [file ...]\n", argv0); + exits("usage"); +} + +void +main(int argc, char **argv) +{ + int i; + Biobuf *b, in, out; + + ARGBEGIN{ + default: + usage(); + }ARGEND + + if(Binit(&out, 1, OWRITE) == -1) + sysfatal("%s: Binit: %r", argv0); + + if(argc == 0){ + if(Binit(&in, 0, OREAD) == -1) + sysfatal("%s: Binit: %r", argv0); + surrogate(&in, &out); + Bterm(&in); + }else{ + for(i = 0; i < argc; i++){ + b = Bopen(argv[i], OREAD); + if(b == nil) + sysfatal("%s: Bopen: %r", argv0); + surrogate(b, &out); + Bterm(b); + } + } + Bterm(&out); + exits(""); +} --- /sys/src/cmd/runetype/unsurrogate.c Thu Jan 1 00:00:00 1970 +++ /sys/src/cmd/runetype/unsurrogate.c Sat May 10 04:55:47 2014 @@ -0,0 +1,82 @@ +#include <u.h> +#include <libc.h> +#include <bio.h> + +/* + * copy in to out, passing the runes two at a time through surrogatetorune. + * a surrogate left unpaired at end of input is written as Runeerror. + */ +void +unsurrogate(Biobuf *in, Biobuf *out) +{ + int i; + long r0; + Rune r[2], o; + + for(i = 0;;){ + r0 = Bgetrune(in); + if(r0 == Beof) + break; + r[i++] = r0; + if(i != 2) + continue; + /* surrogatetorune stores exactly one rune however many it consumes */ + switch(surrogatetorune(r, &o)){ + case 1: + Bputrune(out, o); + r[0] = r[1]; + i = 1; + break; + case 2: + Bputrune(out, o); + i = 0; + break; + } + } + /* one rune was read but not yet converted when input ended */ + if(i == 1){ + if(issurrogaterune(r[0]) == 0) + Bputrune(out, r[0]); + else + Bputrune(out, Runeerror); + } +} + +void +usage(void) +{ + fprint(2, "usage: %s [file ...]\n", argv0); + exits("usage"); +} + +void +main(int argc, char **argv) +{ + int i; + Biobuf *b, in, out; + + ARGBEGIN{ + default: + usage(); + }ARGEND + + if(Binit(&out, 1, OWRITE) == -1) + sysfatal("%s: Binit: %r", argv0); +
+ if(argc == 0){ + if(Binit(&in, 0, OREAD) == -1) + sysfatal("%s: Binit: %r", argv0); + unsurrogate(&in, &out); + Bterm(&in); + }else{ + for(i = 0; i < argc; i++){ + b = Bopen(argv[i], OREAD); + if(b == nil) + sysfatal("%s: Bopen: %r", argv0); + unsurrogate(b, &out); + Bterm(b); + } + } + Bterm(&out); + exits(""); +} --- /sys/src/cmd/runetype/mkfile Sat May 10 04:55:49 2014 +++ /sys/src/cmd/runetype/mkfile Sat May 10 04:55:50 2014 @@ -4,7 +4,7 @@ <|mkrunesize url=http://www.unicode.org/Public/UNIDATA/ <|sed -n ''''s/^([A-Z])/cpu\1/p''''