Change replica/updatedb (the heart of replica/scan) to notice remote file system errors, such as the network going down, and bail out. Previously, if one machine (say ethel) was scanning /n/sources/plan9 and the network connection between ethel and the remote file server was lost, ethel's walk of /n/sources/plan9 would never reach the remaining files and would conclude that they had been deleted. Now it notices the network failure and stops. Reference: /n/sources/patch/applied/updatedb Date: Wed Nov 28 14:42:28 CET 2007 Signed-off-by: rsc@swtch.com --- /sys/src/cmd/replica/updatedb.c Wed Nov 28 14:40:06 2007 +++ /sys/src/cmd/replica/updatedb.c Wed Nov 28 14:40:05 2007 @@ -97,6 +97,39 @@ } void +warn(char *msg, void*) +{ + char *p; + + fprint(2, "warning: %s\n", msg); + + /* find the %r in "can't open foo: %r" */ + p = strstr(msg, ": "); + if(p) + p += 2; + + /* + * if the error is about a remote server failing, + * then there's no point in continuing to look + * for changes -- we'll think everything got deleted! + * + * actual errors i see are: + * "i/o on hungup channel" for a local hangup + * "i/o on hungup channel" for a timeout (yank the network wire) + * "'/n/sources/plan9' Hangup" for a remote hangup + * the rest is paranoia. + */ + if(p){ + if(cistrstr(p, "hungup") || cistrstr(p, "Hangup") + || cistrstr(p, "rpc error") + || cistrstr(p, "shut down") + || cistrstr(p, "i/o") + || cistrstr(p, "connection")) + sysfatal("suspected network or i/o error - bailing out"); + } +} + +void usage(void) { fprint(2, "usage: replica/updatedb [-c] [-p proto] [-r root] [-t now n] [-u uid] [-x path]... db [paths]\n"); @@ -151,7 +184,7 @@ nmatch = argc-1; db = opendb(argv[0]); - if(rdproto(proto, root, walk, nil, nil) < 0) + if(rdproto(proto, root, walk, warn, nil) < 0) sysfatal("rdproto: %r"); if(!changesonly){