Awk (or at least our port) has a bug where substr() mixes up rune position and byte position in its calculations. This is illustrated by the following: ; echo 1/🎄bcdefgh|6.out '{for(i=1; i < 4; i++) print i " " substr($1, i, 1)}' 1 1 2 / 3 � (the third line should print 🎄, not a broken byte sequence). Repairing the algorithm to correctly use byte positions was initially tried, but the code was fussy and slow. This changes the algorithm for substr so that we calculate the byte position of the start and end of the substr as we count off the wide characters (this is ape, so not runes). The nice side effect is that this makes substr() about 30% faster for shortish strings. 0.064u 0.014s 0.098r 6.out {s=substr($0,1,3)} 0.092u 0.017s 0.123r awk {s=substr($0,1,3)} Notes: posting note for visibility before this patch is applied. - quanstro Reference: /n/atom/patch/applied/awksubstr Date: Sun Oct 4 02:12:47 CES 2015 Signed-off-by: quanstro@quanstro.net Reviewed-by: quanstro --- /sys/src/cmd/awk/run.c Sun Oct 4 02:12:37 2015 +++ /sys/src/cmd/awk/run.c Sun Oct 4 02:12:39 2015 @@ -35,7 +35,7 @@ #include "awk.h" #include "y.tab.h" -#define tempfree(x) if (istemp(x)) tfree(x); else +#define tempfree(x) do {if (istemp(x)) tfree(x); }while(0) /* #undef tempfree @@ -731,8 +731,8 @@ Cell *substr(Node **a, int nnn) /* substr(a[0], a[1], a[2]) */ { - int k, m, n; - char *s, *p; + int m, n, c, i; + char *s, *b, *e; int temp; Cell *x, *y, *z = 0; @@ -741,31 +741,39 @@ if (a[2] != 0) z = execute(a[2]); s = getsval(x); - k = countposn(s, strlen(s)) + 1; - if (k <= 1) { - tempfree(x); - tempfree(y); - if (a[2] != 0) { - tempfree(z); - } - x = gettemp(); - setsval(x, ""); - return(x); - } - m = getival(y, 1, k); /* 1 <= m <= k */ + m = getival(y, 1, Imax); tempfree(y); if (a[2] != 0) { - n = getival(z, 0, k-m); - /* n <= 0 <= k-m */ + n = getival(z, 0, Imax); tempfree(z); } else - n = k - 1; + n = Imax; + + c = 0; + b = NULL; + + for (e = s; *e != 0; e += i) { + c++; + if (c == m) + b = e; + if (c == m+n) + break; + i = mblen(e, 100); /* ok because 
mblen is careful */ + if (i < 0) + i = 1; + } + + /* invalid index? */ + if(b == NULL) + b = e; + dprintf( ("substr: m=%d, n=%d, s=%s\n", m, n, s) ); + y = gettemp(); - temp = s[n+m-1]; /* with thanks to John Linderman */ - s[n+m-1] = '\0'; - setsval(y, s + m - 1); - s[n+m-1] = temp; + temp = *e; + *e = 0; /* with thanks to John Linderman */ + setsval(y, b); + *e = temp; tempfree(x); return(y); } @@ -804,7 +812,7 @@ char *p, *t; const char *os; Cell *x; - int flag = 0, n, ch; + int flag, n, ch; int fmtwd; /* format width */ int fmtsz = recsize; char *buf = *pbuf; @@ -1628,8 +1636,7 @@ Cell *nullproc(Node **a, int n) { - n = n; - a = a; + USED(a, n); return 0; } @@ -1727,7 +1734,7 @@ Cell *x; int i, stat; - n = n; + USED(n); x = execute(a[0]); getsval(x); stat = -1;