both behavior and code indicate that split(1)'s `-e' (split by regular expression) doesn't play along with either `-n' (line count) or `-f' (output file prefix). the former is somewhat understandable, but the latter is strange in light of `-s' (output file suffix) working just fine. is that by accident or is there some rationale? this has been corrected, as have any other errors found. prefixes and suffixes apply regardless. -n is now not documented since it is not compatible with linux. when mixing -n and -e, the line count used is the number of lines since the last split. the first versions used an absolute line count, which doesn't seem as useful. Reference: /n/atom/patch/applied2013/splitupd Date: Mon Dec 30 20:25:57 CET 2013 Signed-off-by: quanstro@quanstro.net --- /sys/src/cmd/split.c Mon Dec 30 20:22:45 2013 +++ /sys/src/cmd/split.c Mon Dec 30 20:22:45 2013 @@ -3,47 +3,60 @@ #include #include -char digit[] = "0123456789"; +enum{ + Digits = 2, + Maxdigits = 7, /* greater than 2³² */ + Npat = 25, + Deflines = 1000, +}; + char *suffix = ""; char *stem = "x"; -char suff[] = "aa"; char name[200]; Biobuf bout; -Biobuf *output = &bout; - -extern int nextfile(void); -extern int matchfile(Resub*); -extern void openf(void); -extern char *fold(char*,int); -extern void usage(void); -extern void badexp(void); +Biobuf *output; +int iflag; +int xflag; +uint digits = Digits; + +void openf(void); +int nextf(void); +int match(Reprog**, int, char*); +char* xlower(char*); +void usage(void); void main(int argc, char *argv[]) { - Reprog *exp; - char *pattern = 0; - int n = 1000; - char *line; - int xflag = 0; - int iflag = 0; - Biobuf bin; - Biobuf *b = &bin; - char buf[256]; + char *pat[Npat], *line, buf[4096]; + int i, n, npat, lineno; + Biobuf bin, *b; + Reprog *re[Npat]; + + n = 0; + b = &bin; + npat = 0; ARGBEGIN { + case 'd': + digits = atoi(EARGF(usage())); + if(digits > Maxdigits) + sysfatal("split: too many digits %d\n", digits); + break; case 'l': case 'n': - 
n=atoi(EARGF(usage())); + n = atoi(EARGF(usage())); break; case 'e': - pattern = strdup(EARGF(usage())); + if(npat == nelem(pat)) + sysfatal("split: too many patterns"); + pat[npat++] = EARGF(usage()); break; case 'f': - stem = strdup(EARGF(usage())); + stem = EARGF(usage()); break; case 's': - suffix = strdup(EARGF(usage())); + suffix = EARGF(usage()); break; case 'x': xflag++; @@ -57,139 +70,153 @@ } ARGEND; - if(argc < 0 || argc > 1) + if(argc > 1) usage(); - - if(argc != 0) { - b = Bopen(argv[0], OREAD); - if(b == nil) { - fprint(2, "split: can't open %s: %r\n", argv[0]); - exits("open"); - } - } else + else if(argc == 0) Binit(b, 0, OREAD); + else{ + b = Bopen(argv[0], OREAD); + if(b == nil) + sysfatal("split: Bopen %s: %r", argv[0]); + } - if(pattern) { - Resub match[2]; - - if(!(exp = regcomp(iflag? fold(pattern, strlen(pattern)): - pattern))) - badexp(); - memset(match, 0, sizeof match); - matchfile(match); - while((line=Brdline(b,'\n')) != 0) { - memset(match, 0, sizeof match); - line[Blinelen(b)-1] = 0; - if(regexec(exp, iflag? 
fold(line, Blinelen(b)-1): line, - match, 2)) { - if(matchfile(match) && xflag) - continue; - } else if(output == 0) - nextfile(); /* at most once */ - Bwrite(output, line, Blinelen(b)-1); - Bputc(output, '\n'); - } - } else { - int linecnt = n; - - while((line=Brdline(b,'\n')) != 0) { - if(++linecnt > n) { - nextfile(); - linecnt = 1; - } - Bwrite(output, line, Blinelen(b)); + /* default */ + if(n == 0 && npat == 0) + n = Deflines; + + /* prepare regular reressions */ + for(i = 0; i < npat; i++){ + re[i] = regcomp(xlower(pat[i])); + if(re[i] == nil) + sysfatal("split: bad regular reression: %s", pat[i]); + } + lineno = 0; + while((line = Brdline(b, '\n')) != nil) { + line[Blinelen(b)-1] = 0; + if(match(re, npat, line)){ + lineno = 0; + if(xflag) + continue; + }else if(n > 0 && lineno == n || output == nil){ + lineno = 0; + nextf(); } + lineno++; + Bwrite(output, line, Blinelen(b)-1); + Bputc(output, '\n'); + } - /* - * in case we didn't end with a newline, tack whatever's - * left onto the last file - */ - while((n = Bread(b, buf, sizeof(buf))) > 0) - Bwrite(output, buf, n); - } - if(b != nil) - Bterm(b); - exits(0); + while((n = Bread(b, buf, sizeof(buf))) > 0) + Bwrite(output, buf, n); + Bterm(b); + exits(""); } int -nextfile(void) +fmt32(uint v, char *buf, int ndig) { - static int canopen = 1; + int i; - if(suff[0] > 'z') { - if(canopen) - fprint(2, "split: file %szz not split\n",stem); - canopen = 0; - } else { - snprint(name, sizeof name, "%s%s", stem, suff); - if(++suff[1] > 'z') - suff[1] = 'a', ++suff[0]; - openf(); + buf[ndig] = 0; + for(i = ndig-1; i >= 0; i--){ + buf[i] = 'a' + v%26; + v /= 26; } - return canopen; + return v; } int -matchfile(Resub *match) +nextf(void) { - if(match[1].sp) { - int len = match[1].ep - match[1].sp; - - strncpy(name, match[1].sp, len); - strcpy(name+len, suffix); - openf(); - return 1; - } - return nextfile(); + char buf[Maxdigits+1]; + int r, d; + static int once, seq; + + /* expand as necessary */ + r = -1; + for(d = 
digits; d <= Maxdigits; d++) + if((r = fmt32(seq, buf, d)) == 0) + break; + snprint(name, sizeof name, "%s%s%s", stem, buf, suffix); + if(r != 0){ + if(!once) + fprint(2, "split: file %s not split\n", name); + once = 1; + return 0; + } + seq++; + openf(); + return 1; } void openf(void) { - static int fd = 0; + static int fd = -1; - Bflush(output); - Bterm(output); - if(fd > 0) + if(fd >= 0){ + Bterm(output); close(fd); - fd = create(name,OWRITE,0666); - if(fd < 0) { - fprint(2, "grep: can't create %s: %r\n", name); - exits("create"); } + fd = create(name, OWRITE, 0666); + if(fd < 0) + sysfatal("split: can't create %s: %r", name); + output = &bout; Binit(output, fd, OWRITE); } -char * -fold(char *s, int n) +int +match(Reprog **re, int nre, char *line) { - static char *fline; - static int linesize = 0; - char *t; - - if(linesize < n+1){ - fline = realloc(fline,n+1); - linesize = n+1; - } - for(t=fline; *t++ = tolower(*s++); ) - continue; - /* we assume the 'A'-'Z' only appear as themselves - * in a utf encoding. 
- */ - return fline; + char *p; + int i, len; + Resub m[2]; + + if(nre == 0) + return 0; + p = xlower(line); + for(i = 0; i < nre; i++){ + memset(m, 0, sizeof m); + if(regexec(re[i], p, m, nelem(m))){ + if(m[1].sp == nil) + return nextf(); + len = m[1].ep - m[1].sp; + snprint(name, sizeof name, "%*s%s", len, m[1].sp, suffix); + openf(); + return 1; + } + } + return 0; } -void -usage(void) +char* +xlower(char *s) { - fprint(2, "usage: split [-n num] [-e exp] [-f stem] [-s suff] [-x] [-i] [file]\n"); - exits("usage"); + char *p; + Rune r; + static char buf[1024*UTFmax]; + + if(!iflag) + return s; + p = buf; + for(;;){ + if((uchar)*s < 0x80){ + *p++ = tolower(*s); + if(*s++ == 0) + break; + } + else{ + s += chartorune(&r, s); + r = tolowerrune(r); + p += runetochar(p, &r); + } + } + return buf; } void -badexp(void) +usage(void) { - fprint(2, "split: bad regular expression\n"); - exits("bad regular expression"); + fprint(2, "usage: split [-n num] [-e exp] [-f stem] [-s suff] [-x] [-i] [file]\n"); + exits("usage"); } --- /sys/man/1/split Mon Dec 30 20:22:46 2013 +++ /sys/man/1/split Mon Dec 30 20:22:47 2013 @@ -25,17 +25,17 @@ .BR xzz . The options are .TP +.BI -d " n" +Use at least +.I n +digits (default 2) per file name. +More digits are added if necessary. +.TP .BI -n " n" Split into .IR n -line pieces. .TP -.BI -l " n" -Synonym for -.B -n -.IR n , -a nod to Unix's syntax. -.TP .BI -e " expression" File divisions occur at each line that matches a regular @@ -44,13 +44,11 @@ .IR regexp (6). Multiple .B -e -options may appear. -If a subexpression of -.I expression -is contained in parentheses -.BR ( ... ) , -the output file name is the portion of the -line which matches the subexpression. +options may appear. The first match is used. +If the first subexpression contained in parentheses +.BR ( ... ) +is a non-empty match, the output file base name is the +matching text. 
.TP .BI -f " stem Use @@ -62,17 +60,26 @@ .BI -s " suffix Append .I suffix -to names identified under -.BR -e . +to file names. .TP .B -x Exclude the matched input line from the output file. .TP .B -i -Ignore case in option -.BR -e ; -force output file names (excluding the suffix) -to lower case. +Case insensitive matching with option +.BR -e . +.PP +Options +.B -e +and +.B -n +may be freely mixed. If +a pattern is specified, by default no splitting is done on the +basis of lines read. However if +.B -n +is also specified, both rules will be used. The line count +is the number of lines since the last split. +Empty files are not produced. .SH SOURCE .B /sys/src/cmd/split.c .SH SEE ALSO