regexp already merges adjacent overlapping ranges. this patch also merges adjacent, non-overlapping ranges. for example, [ab] -> [a-b]. this patch is built on the changes in regexpclass. the actual change is 2 lines plus debugging code that is compiled only when DEBUG is defined. this patch is a significant performance win for some cases: /tmp/sed is the current sources version (without this patch). ; time /tmp/sed -n '/[α-ω]/p' /dev/null 1.10u 0.02s 1.17r /tmp/sed -n /[α-ω]/p ; time sed -n '/[α-ω]/p' /dev/null 1.09u 0.02s 1.17r sed -n /[α-ω]/p ; time /tmp/sed -n '/[αβγδεζηθικλμνξοπρςστυφχψω]/p' /dev/null 1.72u 0.02s 1.81r /tmp/sed -n /[αβγδεζηθικλμνξοπρςστυφχψω]/p ; time /bin/sed -n '/[αβγδεζηθικλμνξοπρςστυφχψω]/p' /dev/null 1.10u 0.02s 1.17r /bin/sed -n /[αβγδεζηθικλμνξοπρςστυφχψω]/p debugging output (from the DEBUG-only code in the patch) # before change ; echo x | 8.xsed -n '/[abcdefghijklmnopqrstuvwxyz]/p' nspan = 26 a a 0061 0061 b b 0062 0062 c c 0063 0063 d d 0064 0064 e e 0065 0065 f f 0066 0066 g g 0067 0067 h h 0068 0068 i i 0069 0069 j j 006a 006a k k 006b 006b l l 006c 006c m m 006d 006d n n 006e 006e o o 006f 006f p p 0070 0070 q q 0071 0071 r r 0072 0072 s s 0073 0073 t t 0074 0074 u u 0075 0075 v v 0076 0076 w w 0077 0077 x x 0078 0078 y y 0079 0079 z z 007a 007a x # after change ; ; echo x | 8.xsed -n '/[abcdefghijklmnopqrstuvwxyz]/p' nspan = 1 a z 0061 007a x Reference: /n/sources/patch/applied/regexpcmerge Date: Tue Dec 1 03:23:54 CET 2009 Signed-off-by: quanstro@quanstro.net --- /sys/src/libregexp/regcomp.c Tue Dec 1 03:19:00 2009 +++ /sys/src/libregexp/regcomp.c Tue Dec 1 03:18:58 2009 @@ -383,11 +383,26 @@ return RUNE; } +static void +debugspan(void) +{ +#ifdef DEBUG + int i, nspan; + Rune r; + + nspan = yyclassp->end - yyclassp->spans >>1; + fprint(2, "nspan = %d\n", nspan); + p = yyclassp->spans; + for(i = 0; i < nspan; i++) + print("%C %C %.4ux %.4ux\n", p[2*i], p[2*i+1], p[2*i], p[2*i+1]); +#endif +} + static int bldcclass(void) { int type; - Rune r[NCCRUNE]; + Rune r[NSPANS*2]; Rune *p, *ep, *np; Rune 
rune; int quoted; @@ -408,7 +423,11 @@ } /* parse class into a set of spans */ - for(; ep<&r[NCCRUNE];){ + for(;;){ + if(ep == r + nelem(r)){ + rcerror("class too large"); + return 0; + } if(rune == 0){ rcerror("malformed '[]'"); return 0; @@ -455,8 +474,8 @@ np[0] = *p++; np[1] = *p++; for(; p < ep; p += 2) - if(p[0] <= np[1]){ - if(p[1] > np[1]) + if(p[0] <= np[1]+1){ + if(p[1] >= np[1]) np[1] = p[1]; } else { np += 2; @@ -464,6 +483,7 @@ np[1] = p[1]; } yyclassp->end = np+2; + debugspan(); } return type;