venti: * stop storing unused wtime, train fields in IEntry. * prefetch arena tocs more gradually; set icacheprefetch=1 * yank out dead lump readahead code * yank out dead dcache readahead code * fix sync bug reported by anothy et al. * new, cleaner icache code mirrorarenas: * fix thread-stack access race (wsync) * new flag -s disables SHA1 checks, useful if destination is remote verifyarena: * accept venti-standard file:range syntax Reference: /n/sources/patch/applied/nventi Date: Mon Oct 1 03:36:25 CES 2007 Signed-off-by: rsc@swtch.com --- /sys/src/cmd/venti/srv/config.c Mon Oct 1 03:35:51 2007 +++ /sys/src/cmd/venti/srv/config.c Mon Oct 1 03:35:50 2007 @@ -245,3 +245,9 @@ return b; } +/* for OS X linker, which only resolves functions, not data */ +void +needmainindex(void) +{ +} + --- /sys/src/cmd/venti/srv/conv.c Mon Oct 1 03:35:51 2007 +++ /sys/src/cmd/venti/srv/conv.c Mon Oct 1 03:35:51 2007 @@ -581,9 +581,9 @@ scorecp(ie->score, p); p += VtScoreSize; - ie->wtime = U32GET(p); + /* ie->wtime = U32GET(p); */ p += U32Size; - ie->train = U16GET(p); + /* ie->train = U16GET(p); */ p += U16Size; if(p - buf != IEntryAddrOff) sysfatal("unpackentry bad IEntryAddrOff amount"); @@ -613,9 +613,9 @@ scorecp(p, ie->score); p += VtScoreSize; - U32PUT(p, ie->wtime); + U32PUT(p, 0); /* wtime */ p += U32Size; - U16PUT(p, ie->train); + U16PUT(p, 0); /* train */ p += U16Size; U64PUT(p, ie->ia.addr, t32); p += U64Size; --- /sys/src/cmd/venti/srv/mirrorarenas.c Mon Oct 1 03:35:52 2007 +++ /sys/src/cmd/venti/srv/mirrorarenas.c Mon Oct 1 03:35:51 2007 @@ -22,13 +22,14 @@ Part *dst; int force; int verbose; +int dosha1 = 1; char *status; uvlong astart, aend; void usage(void) { - fprint(2, "usage: mirrorarenas [-v] src dst [ranges]\n"); + fprint(2, "usage: mirrorarenas [-sv] src dst [ranges]\n"); threadexitsall("usage"); } @@ -92,6 +93,7 @@ * src with writing dst during copy. This is an easy factor of two * (almost) in performance. 
*/ +static Write wsync; static void writeproc(void *v) { @@ -99,7 +101,7 @@ USED(v); while((w = recvp(writechan)) != nil){ - if(w->n == 0) + if(w == &wsync) continue; if(ewritepart(dst, w->o, w->p, w->n) < 0) w->error = 1; @@ -146,11 +148,7 @@ /* * wait for queued write to finish */ - w[i].p = nil; - w[i].o = 0; - w[i].n = 0; - w[i].error = 0; - sendp(writechan, &w[i]); + sendp(writechan, &wsync); i = 1-i; if(w[i].error) return -1; @@ -240,7 +238,7 @@ mirror(Arena *sa, Arena *da) { vlong v, si, di, end; - int clumpmax, blocksize; + int clumpmax, blocksize, sealed; static uchar buf[MaxIoSize]; ArenaHead h; DigestState xds, *ds; @@ -305,7 +303,8 @@ shaoff = 0; ds = nil; - if(sa->diskstats.sealed && scorecmp(sa->score, zeroscore) != 0){ + sealed = sa->diskstats.sealed && scorecmp(sa->score, zeroscore) != 0; + if(sealed && dosha1){ /* start sha1 state with header */ memset(&xds, 0, sizeof xds); ds = &xds; @@ -362,7 +361,7 @@ if(ewritepart(dst, end, buf, blocksize) < 0) return; - if(ds){ + if(sealed){ /* * ... 
but on the final pass, copy the encoding * of the tail information from the source @@ -375,20 +374,27 @@ if(asha1(dst, shaoff, end, ds) < 0 || copy(end, end+blocksize-VtScoreSize, "tail", ds) < 0) return; - memset(buf, 0, VtScoreSize); - sha1(buf, VtScoreSize, da->score, ds); - if(scorecmp(sa->score, da->score) == 0){ + if(dosha1){ + memset(buf, 0, VtScoreSize); + sha1(buf, VtScoreSize, da->score, ds); + if(scorecmp(sa->score, da->score) == 0){ + if(verbose) + chat("%T %s: %V sealed mirrored\n", sa->name, sa->score); + if(ewritepart(dst, end+blocksize-VtScoreSize, da->score, VtScoreSize) < 0) + return; + }else{ + chat("%T %s: sealing dst: score mismatch: %V vs %V\n", sa->name, sa->score, da->score); + memset(&xds, 0, sizeof xds); + asha1(dst, base-blocksize, end+blocksize-VtScoreSize, &xds); + sha1(buf, VtScoreSize, 0, &xds); + chat("%T reseal: %V\n", da->score); + status = "errors"; + } + }else{ if(verbose) - chat("%T %s: %V sealed mirrored\n", sa->name, sa->score); - if(ewritepart(dst, end+blocksize-VtScoreSize, da->score, VtScoreSize) < 0) + chat("%T %s: %V mirrored\n", sa->name, sa->score); + if(ewritepart(dst, end+blocksize-VtScoreSize, sa->score, VtScoreSize) < 0) return; - }else{ - chat("%T %s: sealing dst: score mismatch: %V vs %V\n", sa->name, sa->score, da->score); - memset(&xds, 0, sizeof xds); - asha1(dst, base-blocksize, end+blocksize-VtScoreSize, &xds); - sha1(buf, VtScoreSize, 0, &xds); - chat("%T reseal: %V\n", da->score); - status = "errors"; } }else{ chat("%T %s: %,lld used mirrored\n", @@ -461,6 +467,9 @@ break; case 'v': verbose++; + break; + case 's': + dosha1 = 0; break; default: usage(); --- /sys/src/cmd/venti/srv/verifyarena.c Mon Oct 1 03:35:52 2007 +++ /sys/src/cmd/venti/srv/verifyarena.c Mon Oct 1 03:35:52 2007 @@ -7,6 +7,7 @@ static uchar *data; static int blocksize; static int sleepms; +static vlong offset0; void usage(void) @@ -22,7 +23,7 @@ for(nr = 0; nr < n; nr += m){ m = n - nr; - m = pread(fd, &buf[nr], m, off+nr); + m = pread(fd, 
&buf[nr], m, offset0+off+nr); if(m <= 0){ if(m == 0) werrstr("early eof"); @@ -175,7 +176,8 @@ char *p, *q, *table, *f[10], line[256]; vlong start, stop; ArenaPart ap; - + Part *part; + needzeroscore(); ventifmtinstall(); blocksize = MaxIoSize; @@ -201,8 +203,10 @@ threadexitsall(nil); } - if((fd = open(argv[0], OREAD)) < 0) - sysfatal("open %s: %r", argv[0]); + if((part = initpart(argv[0], OREAD)) == nil) + sysfatal("open partition %s: %r", argv[0]); + fd = part->fd; + offset0 = part->offset; if(preadblock(data, 8192, PartBlank) < 0) sysfatal("read arena part header: %r"); @@ -249,7 +253,7 @@ fprint(2, "%T %s: bad start,stop %lld,%lld\n", f[0], stop, start); continue; } - if(seek(fd, start, 0) < 0) + if(seek(fd, offset0+start, 0) < 0) fprint(2, "%T %s: seek to start: %r\n", f[0]); verifyarena(f[0], stop - start); } --- /sys/src/cmd/venti/srv/hdisk.c Mon Oct 1 03:35:54 2007 +++ /sys/src/cmd/venti/srv/hdisk.c Mon Oct 1 03:35:53 2007 @@ -547,7 +547,7 @@ Lump *u; IAddr ia; IEntry ie; - int i, rac; + int i; Arena *arena; u64int aa; ZBlock *zb; @@ -561,7 +561,7 @@ } hprint(&c->hout, "

index search %V

\n", score);
-	if(_lookupscore(score, -1, &ia, nil) < 0)
+	if(icachelookup(score, -1, &ia) < 0)
 		hprint(&c->hout, "  icache: not found\n");
 	else
 		hprint(&c->hout, "  icache: addr=%#llx size=%d type=%d blocks=%d\n",
@@ -585,12 +585,12 @@
 			hprint(&c->hout, " -cache");
 		putlump(u);
 		
-		if(lookupscore(score, type, &ia, &rac) < 0){
+		if(lookupscore(score, type, &ia) < 0){
 			hprint(&c->hout, " -lookup\n");
 			continue;
 		}
-		hprint(&c->hout, "\n  lookupscore: addr=%#llx size=%d blocks=%d rac=%d\n",
-			ia.addr, ia.size, ia.blocks, rac);
+		hprint(&c->hout, "\n  lookupscore: addr=%#llx size=%d blocks=%d\n",
+			ia.addr, ia.size, ia.blocks);
 		
 		arena = amapitoa(mainindex, ia.addr, &aa);
 		if(arena == nil){
--- /sys/src/cmd/venti/srv/stats.c	Mon Oct  1 03:35:54 2007
+++ /sys/src/cmd/venti/srv/stats.c	Mon Oct  1 03:35:54 2007
@@ -60,6 +60,9 @@
 	{ "index cache flushes", },
 	{ "index cache stalls", },
 	{ "index cache read time", },
+	{ "index cache lookups" },
+	{ "index cache summary hits" },
+	{ "index cache summary prefetches" },
 
 	{ "bloom filter hits", },
 	{ "bloom filter misses", },
@@ -81,6 +84,9 @@
 
 	{ "sum reads", },
 	{ "sum read bytes", },
+	
+	{ "cig loads" },
+	{ "cig load time" },
 };
 
 QLock statslock;
--- /sys/src/cmd/venti/srv/lump.c	Mon Oct  1 03:35:55 2007
+++ /sys/src/cmd/venti/srv/lump.c	Mon Oct  1 03:35:54 2007
@@ -7,7 +7,7 @@
 int			writestodevnull = 0;
 int			verifywrites = 0;
 
-static Packet		*readilump(Lump *u, IAddr *ia, u8int *score, int rac);
+static Packet		*readilump(Lump *u, IAddr *ia, u8int *score);
 
 /*
  * Some of this logic is duplicated in hdisk.c
@@ -19,7 +19,6 @@
 	Packet *p;
 	IAddr ia;
 	u32int n;
-	int rac;
 
 	trace(TraceLump, "readlump enter");
 /*
@@ -49,7 +48,7 @@
 	if(cached)
 		*cached = 0;
 
-	if(lookupscore(score, type, &ia, &rac) < 0){
+	if(lookupscore(score, type, &ia) < 0){
 		/* ZZZ place to check for someone trying to guess scores */
 		seterr(EOk, "no block with score %V/%d exists", score, type);
 
@@ -64,7 +63,7 @@
 	}
 
 	trace(TraceLump, "readlump readilump");
-	p = readilump(u, &ia, score, rac);
+	p = readilump(u, &ia, score);
 	putlump(u);
 
 	trace(TraceLump, "readlump exit");
@@ -134,9 +133,8 @@
 	Packet *old;
 	IAddr ia;
 	int ok;
-	int rac;
 
-	if(lookupscore(u->score, u->type, &ia, &rac) == 0){
+	if(lookupscore(u->score, u->type, &ia) == 0){
 		if(verifywrites == 0){
 			/* assume the data is here! */
 			packetfree(p);
@@ -149,7 +147,7 @@
 		 * if the read fails,
 		 * assume it was corrupted data and store the block again
 		 */
-		old = readilump(u, &ia, u->score, rac);
+		old = readilump(u, &ia, u->score);
 		if(old != nil){
 			ok = 0;
 			if(packetcmp(p, old) != 0){
@@ -176,8 +174,6 @@
 	ok = storeclump(mainindex, flat, u->score, u->type, creator, &ia);
 	freezblock(flat);
 	if(ok == 0)
-		ok = insertscore(u->score, &ia, 1);
-	if(ok == 0)
 		insertlump(u, p);
 	else
 		packetfree(p);
@@ -193,39 +189,14 @@
 	return ok;
 }
 
-static void
-lreadahead(u64int a, Arena *arena, u64int aa, int n)
-{	
-	u8int buf[ClumpSize];
-	Clump cl;
-	IAddr ia;
-
-	while(n > 0) {
-		if (aa >= arena->memstats.used)
-			break;
-		if(readarena(arena, aa, buf, ClumpSize) < ClumpSize)
-			break;
-		if(unpackclump(&cl, buf, arena->clumpmagic) < 0)
-			break;
-		ia.addr = a;
-		ia.type = cl.info.type;
-		ia.size = cl.info.uncsize;
-		ia.blocks = (cl.info.size + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog;
-		insertscore(cl.info.score, &ia, 0);
-		a += ClumpSize + cl.info.size;
-		aa += ClumpSize + cl.info.size;
-		n--;
-	}
-}
-
 static Packet*
-readilump(Lump *u, IAddr *ia, u8int *score, int rac)
+readilump(Lump *u, IAddr *ia, u8int *score)
 {
 	Arena *arena;
 	ZBlock *zb;
 	Packet *p, *pp;
 	Clump cl;
-	u64int a, aa;
+	u64int aa;
 	u8int sc[VtScoreSize];
 
 	trace(TraceLump, "readilump enter");
@@ -256,13 +227,6 @@
 		seterr(ECrash, "score mismatch");
 		freezblock(zb);
 		return nil;
-	}
-
-	if(rac == 0) {
-		trace(TraceLump, "readilump readahead");
-		a = ia->addr + ClumpSize + cl.info.size;
-		aa += ClumpSize + cl.info.size;
-		lreadahead(a, arena, aa, 20);
 	}
 
 	trace(TraceLump, "readilump success");
--- /sys/src/cmd/venti/srv/clump.c	Mon Oct  1 03:35:56 2007
+++ /sys/src/cmd/venti/srv/clump.c	Mon Oct  1 03:35:55 2007
@@ -62,18 +62,16 @@
 	memset(cb->data+ClumpSize+dsize, 0, 4);
 	cl.info.size = dsize;
 
-	ia->addr = 0;
-	ia->type = type;
-	ia->size = size;
-	ia->blocks = (dsize + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog;
-
-	a = writeiclump(ix, &cl, cb->data, &ia->addr);
-
+	a = writeiclump(ix, &cl, cb->data);
 	trace(TraceLump, "storeclump exit %lld", a);
-
 	freezblock(cb);
 	if(a == TWID64)
 		return -1;
+
+	ia->addr = a;
+	ia->type = type;
+	ia->size = size;
+	ia->blocks = (dsize + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog;
 
 /*
 	qlock(&stats.lock);
--- /sys/src/cmd/venti/srv/arena.c	Mon Oct  1 03:35:57 2007
+++ /sys/src/cmd/venti/srv/arena.c	Mon Oct  1 03:35:56 2007
@@ -16,6 +16,7 @@
 static CIBlock	*getcib(Arena *arena, int clump, int writing, CIBlock *rock);
 static void	putcib(Arena *arena, CIBlock *cib);
 static void	sumproc(void *);
+static void loadcig(Arena *arena);
 
 static QLock	sumlock;
 static Rendez	sumwait;
@@ -65,7 +66,7 @@
 	}
 
 	if(arena->diskstats.sealed && scorecmp(zeroscore, arena->score)==0)
-		backsumarena(arena);
+		sealarena(arena);
 
 	return arena;
 }
@@ -137,14 +138,23 @@
 	CIBlock *cib, r;
 	int i;
 
-	for(i = 0; i < n; i++){
+	/*
+	 * because the clump blocks are laid out
+	 * in reverse order at the end of the arena,
+	 * it can be a few percent faster to read
+	 * the clumps backwards, which reads the
+	 * disk blocks forwards.
+	 */
+	for(i = n-1; i >= 0; i--){
 		cib = getcib(arena, clump + i, 0, &r);
-		if(cib == nil)
-			break;
+		if(cib == nil){
+			n = i;
+			continue;
+		}
 		unpackclumpinfo(&cis[i], &cib->data->data[cib->offset]);
 		putcib(arena, cib);
 	}
-	return i;
+	return n;
 }
 
 /*
@@ -283,13 +293,12 @@
 filling up and real errors writing the clump?
  */
 u64int
-writeaclump(Arena *arena, Clump *c, u8int *clbuf, u64int start, u64int *pa)
+writeaclump(Arena *arena, Clump *c, u8int *clbuf)
 {
 	DBlock *b;
 	u64int a, aa;
 	u32int clump, n, nn, m, off, blocksize;
 	int ok;
-	AState as;
 
 	n = c->info.size + ClumpSize + U32Size;
 	qlock(&arena->lock);
@@ -299,10 +308,6 @@
 		if(!arena->memstats.sealed){
 			logerr(EOk, "seal memstats %s", arena->name);
 			arena->memstats.sealed = 1;
-			as.arena = arena;
-			as.aa = start+aa;
-			as.stats = arena->memstats;
-			setdcachestate(&as);
 		}
 		qunlock(&arena->lock);
 		return TWID64;
@@ -349,7 +354,28 @@
 	if(c->info.size < c->info.uncsize)
 		arena->memstats.cclumps++;
 
-	clump = arena->memstats.clumps++;
+	clump = arena->memstats.clumps;
+	if(clump % ArenaCIGSize == 0){
+		if(arena->cig == nil){
+			loadcig(arena);
+			if(arena->cig == nil)
+				goto NoCIG;
+		}
+		/* add aa as start of next cig */
+		if(clump/ArenaCIGSize != arena->ncig){
+			fprint(2, "bad arena cig computation %s: writing clump %d but %d cigs\n",
+				arena->name, clump, arena->ncig);
+			arena->ncig = -1;
+			vtfree(arena->cig);
+			arena->cig = nil;
+			goto NoCIG;
+		}
+		arena->cig = vtrealloc(arena->cig, (arena->ncig+1)*sizeof arena->cig[0]);
+		arena->cig[arena->ncig++].offset = aa;
+	}
+NoCIG:
+	arena->memstats.clumps++;
+
 	if(arena->memstats.clumps == 0)
 		sysfatal("clumps wrapped");
 	arena->wtime = now();
@@ -359,14 +385,6 @@
 	writeclumpinfo(arena, clump, &c->info);
 	wbarena(arena);
 
-	/* set up for call to setdcachestate */
-	as.arena = arena;
-	as.aa = start+arena->memstats.used;
-	as.stats = arena->memstats;
-
-	/* update this before calling setdcachestate so it cannot be behind dcache.diskstate */
-	*pa = start+aa;
-	setdcachestate(&as);
 	qunlock(&arena->lock);
 
 	return aa;
@@ -415,6 +433,7 @@
 	/*
 	 * Look up as->arena to find index.
 	 */
+	needmainindex();	/* OS X linker */
 	ix = mainindex;
 	for(i=0; i<ix->narenas; i++)
 		if(ix->arenas[i] == as->arena)
@@ -515,6 +534,7 @@
 	/*
 	 * read & sum all blocks except the last one
 	 */
+	flushdcache();
 	memset(&s, 0, sizeof s);
 	b = alloczblock(bs, 0, arena->part->blocksize);
 	e = arena->base + arena->size;
@@ -550,24 +570,19 @@
 	sha1(b->data, bs-VtScoreSize, nil, &s);
 	sha1(zeroscore, VtScoreSize, nil, &s);
 	sha1(nil, 0, score, &s);
-
+	
 	/*
 	 * check for no checksum or the same
-	 *
-	 * the writepart is okay because we flushed the dcache in sealarena
 	 */
-	if(scorecmp(score, &b->data[bs - VtScoreSize]) != 0){
-		if(scorecmp(zeroscore, &b->data[bs - VtScoreSize]) != 0)
-			logerr(EOk, "overwriting mismatched checksums for arena=%s, found=%V calculated=%V",
-				arena->name, &b->data[bs - VtScoreSize], score);
-		scorecp(&b->data[bs - VtScoreSize], score);
-		if(writepart(arena->part, e, b->data, bs) < 0)
-			logerr(EOk, "sumarena can't write sum for %s: %r", arena->name);
-	}
+	if(scorecmp(score, &b->data[bs - VtScoreSize]) != 0
+	&& scorecmp(zeroscore, &b->data[bs - VtScoreSize]) != 0)
+		logerr(EOk, "overwriting mismatched checksums for arena=%s, found=%V calculated=%V",
+			arena->name, &b->data[bs - VtScoreSize], score);
 	freezblock(b);
 
 	qlock(&arena->lock);
 	scorecp(arena->score, score);
+	wbarena(arena);
 	qunlock(&arena->lock);
 }
 
@@ -586,6 +601,7 @@
 	}
 	dirtydblock(b, DirtyArenaTrailer);
 	bad = okarena(arena)<0 || packarena(arena, b->data)<0;
+	scorecp(b->data + arena->blocksize - VtScoreSize, arena->score);
 	putdblock(b);
 	if(bad)
 		return -1;
@@ -753,4 +769,158 @@
 
 	putdblock(cib->data);
 	cib->data = nil;
+}
+
+
+/*
+ * For index entry readahead purposes, the arenas are 
+ * broken into smaller subpieces, called clump info groups
+ * or cigs.  Each cig has ArenaCIGSize clumps (ArenaCIGSize
+ * is chosen to make the index entries take up about half
+ * a megabyte).  The index entries do not contain enough
+ * information to determine what the clump index is for
+ * a given address in an arena.  That info is needed both for
+ * figuring out which clump group an address belongs to 
+ * and for prefetching a clump group's index entries from
+ * the arena table of contents.  The first time clump groups
+ * are accessed, we scan the entire arena table of contents
+ * (which might be 10s of megabytes), recording the data 
+ * offset of each clump group.
+ */
+
+/* 
+ * load clump info group information by scanning entire toc.
+ */
+static void
+loadcig(Arena *arena)
+{
+	u32int i, j, ncig, nci;
+	ArenaCIG *cig;
+	ClumpInfo *ci;
+	u64int offset;
+	int ms;
+
+	if(arena->cig || arena->ncig < 0)
+		return;
+
+//	fprint(2, "loadcig %s\n", arena->name);
+	
+	ncig = (arena->memstats.clumps+ArenaCIGSize-1) / ArenaCIGSize;
+	if(ncig == 0){
+		arena->cig = vtmalloc(1);
+		arena->ncig = 0;
+		return;
+	}
+
+	ms = msec();
+	cig = vtmalloc(ncig*sizeof cig[0]);
+	ci = vtmalloc(ArenaCIGSize*sizeof ci[0]);
+	offset = 0;
+	for(i=0; incig = -1;
+				fprint(2, "loadcig %s: got %ud cigs, expected %ud\n", arena->name, i+1, ncig);
+				goto out;
+			}
+		}
+	}
+	vtfree(ci);
+	
+	arena->ncig = ncig;
+	arena->cig = cig;
+
+out:
+	ms = msec() - ms;
+	addstat2(StatCigLoad, 1, StatCigLoadTime, ms);
+}
+
+/*
+ * convert arena address into arena group + data boundaries.
+ */
+int
+arenatog(Arena *arena, u64int addr, u64int *gstart, u64int *glimit, int *g)
+{
+	int r, l, m;
+
+	qlock(&arena->lock);
+	if(arena->cig == nil)
+		loadcig(arena);
+	if(arena->cig == nil || arena->ncig == 0){
+		qunlock(&arena->lock);
+		return -1;
+	}
+
+	l = 1;
+	r = arena->ncig - 1;
+	while(l <= r){
+		m = (r + l) / 2;
+		if(arena->cig[m].offset <= addr)
+			l = m + 1;
+		else
+			r = m - 1;
+	}
+	l--;
+
+	*g = l;
+	*gstart = arena->cig[l].offset;
+	if(l+1 < arena->ncig)
+		*glimit = arena->cig[l+1].offset;
+	else
+		*glimit = arena->memstats.used;
+	qunlock(&arena->lock);
+	return 0;
+}
+
+/*
+ * load the clump info for group g into the index entries.
+ */
+int
+asumload(Arena *arena, int g, IEntry *entries, int nentries)
+{
+	int i, base, limit;
+	u64int addr;
+	ClumpInfo ci;
+	IEntry *ie;
+
+	if(nentries < ArenaCIGSize){
+		fprint(2, "asking for too few entries\n");
+		return -1;
+	}
+	
+	qlock(&arena->lock);
+	if(arena->cig == nil)
+		loadcig(arena);
+	if(arena->cig == nil || arena->ncig == 0 || g >= arena->ncig){
+		qunlock(&arena->lock);
+		return -1;
+	}
+	
+	addr = 0;
+	base = g*ArenaCIGSize;
+	limit = base + ArenaCIGSize;
+	if(base > arena->memstats.clumps)
+		base = arena->memstats.clumps;
+	ie = entries;
+	for(i=base; iscore, ci.score);
+			ie->ia.type = ci.type;
+			ie->ia.size = ci.uncsize;
+			ie->ia.blocks = (ci.size + ClumpSize + (1<<ABlockLog) - 1) >> ABlockLog;
+			ie->ia.addr = addr;
+			ie++;
+		}
+		addr += ClumpSize + ci.size;
+	}
+	qunlock(&arena->lock);
+	return ie - entries;
 }
--- /sys/src/cmd/venti/srv/dcache.c	Mon Oct  1 03:35:57 2007
+++ /sys/src/cmd/venti/srv/dcache.c	Mon Oct  1 03:35:57 2007
@@ -55,15 +55,6 @@
 	u8int		*mem;			/* memory for all block descriptors */
 	int		ndirty;			/* number of dirty blocks */
 	int		maxdirty;		/* max. number of dirty blocks */
-	Channel	*ra;
-	u8int		*rabuf;
-	u32int		ramax;
-	u32int		rasize;
-	u64int		raaddr;
-	Part		*rapart;
-
-	AState	diskstate;
-	AState	state;
 };
 
 typedef struct Ra Ra;
@@ -82,7 +73,6 @@
 static void	fixheap(int i, DBlock *b);
 static void	flushproc(void*);
 static void	writeproc(void*);
-static void raproc(void*);
 
 void
 initdcache(u32int mem)
@@ -109,7 +99,6 @@
 	dcache.blocks = MKNZ(DBlock, nblocks);
 	dcache.write = MKNZ(DBlock*, nblocks);
 	dcache.mem = MKNZ(u8int, (nblocks+1+128) * blocksize);
-	dcache.ra = chancreate(sizeof(Ra), 0);
 
 	last = nil;
 	p = (u8int*)(((ulong)dcache.mem+blocksize-1)&~(ulong)(blocksize-1));
@@ -121,10 +110,6 @@
 		b->next = last;
 		last = b;
 	}
-	dcache.rabuf = &p[i*blocksize];
-	dcache.ramax = 128*blocksize;
-	dcache.raaddr = 0;
-	dcache.rapart = nil;
 
 	dcache.free = last;
 	dcache.nheap = 0;
@@ -133,136 +118,6 @@
 
 	vtproc(flushproc, nil);
 	vtproc(delaykickroundproc, &dcache.round);
-	vtproc(raproc, nil);
-}
-
-void
-setdcachestate(AState *a)
-{
-	trace(TraceBlock, "setdcachestate %s 0x%llux clumps %d", a->arena ? a->arena->name : nil, a->aa, a->stats.clumps);
-	qlock(&dcache.lock);
-	dcache.state = *a;
-	qunlock(&dcache.lock);
-}
-
-AState
-diskstate(void)
-{
-	AState a;
-
-	qlock(&dcache.lock);
-	a = dcache.diskstate;
-	qunlock(&dcache.lock);
-	return a;
-}
-
-static void
-raproc(void *v)
-{
-	Ra ra;
-	DBlock *b;
-
-	USED(v);
-	while(recv(dcache.ra, &ra) == 1){
-		if(ra.part->size <= ra.addr)
-			continue;
-		b = _getdblock(ra.part, ra.addr, OREAD, 2);
-		putdblock(b);
-	}
-}
-
-/*
- * We do readahead a whole arena at a time now,
- * so dreadahead is a no-op.  The original implementation
- * is in unused_dreadahead below.
- */
-void
-dreadahead(Part *part, u64int addr, int miss)
-{
-	USED(part);
-	USED(addr);
-	USED(miss);
-}
-
-void
-unused_dreadahead(Part *part, u64int addr, int miss)
-{
-	Ra ra;
-	static struct {
-		Part *part;
-		u64int addr;
-	} lastmiss;
-	static struct {
-		Part *part;
-		u64int addr;
-		int dir;
-	} lastra;
-
-	if(miss){
-		if(lastmiss.part==part && lastmiss.addr==addr-dcache.size){
-		XRa:
-			lastra.part = part;
-			lastra.dir = addr-lastmiss.addr;
-			lastra.addr = addr+lastra.dir;
-			ra.part = part;
-			ra.addr = lastra.addr;
-			nbsend(dcache.ra, &ra);
-		}else if(lastmiss.part==part && lastmiss.addr==addr+dcache.size){
-			addr -= dcache.size;
-			goto XRa;
-		}
-	}else{
-		if(lastra.part==part && lastra.addr==addr){
-			lastra.addr += lastra.dir;
-			ra.part = part;
-			ra.addr = lastra.addr;
-			nbsend(dcache.ra, &ra);
-		}
-	}
-
-	if(miss){
-		lastmiss.part = part;
-		lastmiss.addr = addr;
-	}
-}
-
-int
-rareadpart(Part *part, u64int addr, u8int *buf, uint n, int load)
-{
-	uint nn;
-	static RWLock ralock;
-
-	rlock(&ralock);
-	if(dcache.rapart==part && dcache.raaddr <= addr && addr+n <= dcache.raaddr+dcache.rasize){
-		memmove(buf, dcache.rabuf+(addr-dcache.raaddr), n);
-		runlock(&ralock);
-		return 0;
-	}
-	if(load != 2 || addr >= part->size){	/* addr >= part->size: let readpart do the error */	
-		runlock(&ralock);
-		diskaccess(0);
-		return readpart(part, addr, buf, n);
-	}
-
-	runlock(&ralock);
-	wlock(&ralock);
-fprint(2, "raread %s %llx\n", part->name, addr);
-	nn = dcache.ramax;
-	if(addr+nn > part->size)
-		nn = part->size - addr;
-	diskaccess(0);
-	if(readpart(part, addr, dcache.rabuf, nn) < 0){
-		wunlock(&ralock);
-		return -1;
-	}
-	memmove(buf, dcache.rabuf, n);	
-	dcache.rapart = part;
-	dcache.rasize = nn;
-	dcache.raaddr = addr;
-	wunlock(&ralock);
-
-	addstat(StatApartReadBytes, nn-n);
-	return 0;
 }
 
 static u32int
@@ -313,16 +168,8 @@
 again:
 	for(b = dcache.heads[h]; b != nil; b = b->next){
 		if(b->part == part && b->addr == addr){
-			/*
-			qlock(&stats.lock);
-			stats.pchit++;
-			qunlock(&stats.lock);
-			*/
-			if(load){
+			if(load)
 				addstat(StatDcacheHit, 1);
-				if(load != 2 && mode != OWRITE)
-					dreadahead(part, b->addr, 0);
-			}
 			goto found;
 		}
 	}
@@ -367,8 +214,6 @@
 	b->addr = addr;
 	b->part = part;
 	b->size = 0;
-	if(load != 2 && mode != OWRITE)
-		dreadahead(part, b->addr, 1);
 
 found:
 	b->ref++;
@@ -405,7 +250,8 @@
 				memset(&b->data[b->size], 0, size - b->size);
 			else{
 				trace(TraceBlock, "getdblock readpart %s 0x%llux", part->name, addr);
-				if(rareadpart(part, addr + b->size, &b->data[b->size], size - b->size, load) < 0){
+				diskaccess(0);
+				if(readpart(part, addr + b->size, &b->data[b->size], size - b->size) < 0){
 					b->mode = ORDWR;	/* so putdblock wunlocks */
 					putdblock(b);
 					return nil;
@@ -768,7 +614,6 @@
 	int i, j, n;
 	ulong t0;
 	DBlock *b, **write;
-	AState as;
 
 	USED(v);
 	threadsetname("flushproc");
@@ -779,10 +624,6 @@
 		t0 = nsec()/1000;
 		trace(TraceProc, "build t=%lud", (ulong)(nsec()/1000)-t0);
 
-		qlock(&dcache.lock);
-		as = dcache.state;
-		qunlock(&dcache.lock);
-
 		write = dcache.write;
 		n = 0;
 		for(i=0; iwriting);
 	for(i = ix->mapalloc; i < ix->narenas; i++){
-		a = writeaclump(ix->arenas[i], c, clbuf, ix->amap[i].start, pa);
+		a = writeaclump(ix->arenas[i], c, clbuf);
 		if(a != TWID64){
-			ix->mapalloc = i;	/* assuming write is atomic, race is okay */
+			ix->mapalloc = i;
+			ia.addr = ix->amap[i].start + a;
+			ia.type = c->info.type;
+			ia.size = c->info.uncsize;
+			ia.blocks = (c->info.size + ClumpSize + (1<<ABlockLog) - 1) >> ABlockLog;
+			as.arena = ix->arenas[i];
+			as.aa = ia.addr;
+			as.stats = as.arena->memstats;
+			insertscore(c->info.score, &ia, IEDirty, &as);
+			qunlock(&ix->writing);
 			trace(TraceLump, "writeiclump exit");
-			return a;
+			return ia.addr;
 		}
 	}
+	qunlock(&ix->writing);
 
 	seterr(EAdmin, "no space left in arenas");
 	trace(TraceLump, "writeiclump failed");
@@ -594,6 +607,25 @@
 	}
 	*aa = a - ix->amap[l].start;
 	return ix->arenas[l];
+}
+
+/*
+ * convert an arena index to the bounds of the containing arena group.
+ */
+Arena*
+amapitoag(Index *ix, u64int a, u64int *gstart, u64int *glimit, int *g)
+{
+	u64int aa;
+	Arena *arena;
+	
+	arena = amapitoa(ix, a, &aa);
+	if(arena == nil)
+		return nil;
+	if(arenatog(arena, aa, gstart, glimit, g) < 0)
+		return nil;
+	*gstart += a - aa;
+	*glimit += a - aa;
+	return arena;
 }
 
 int
--- /sys/src/cmd/venti/srv/icache.c	Mon Oct  1 03:35:59 2007
+++ /sys/src/cmd/venti/srv/icache.c	Mon Oct  1 03:35:59 2007
@@ -2,236 +2,429 @@
 #include "dat.h"
 #include "fns.h"
 
+int icacheprefetch = 1;
+
 typedef struct ICache ICache;
+typedef struct IHash IHash;
+typedef struct ISum ISum;
+
 struct ICache
 {
-	QLock	lock;			/* locks hash table & all associated data */
+	QLock	lock;
 	Rendez	full;
-	IEntry	**heads;		/* heads of all the hash chains */
-	int	bits;			/* bits to use for indexing heads */
-	u32int	size;			/* number of heads; == 1 << bits, should be < entries */
-	IEntry	*base;			/* all allocated hash table entries */
-	IEntry	*free;
-	u32int	entries;		/* elements in base */
-	IEntry	*dirty;		/* chain of dirty elements */
-	u32int	ndirty;
+	IHash	*hash;
+	IEntry	*entries;
+	int		nentries;
+	IEntry	free;
+	IEntry	clean;
+	IEntry	dirty;
 	u32int	maxdirty;
-	u32int	unused;			/* index of first unused element in base */
-	u32int	stolen;			/* last head from which an element was stolen */
+	u32int	ndirty;
+	AState	as;
 
-	Arena	*last[4];
-	Arena	*lastload;
-	int		nlast;
+	ISum	**sum;
+	int		nsum;
+	IHash	*shash;
+	IEntry	*sentries;
+	int		nsentries;
 };
 
-int icacheprefetch = 0;		/* interferes with playing music via vacfs */
-
 static ICache icache;
 
-static IEntry	*icachealloc(IAddr *ia, u8int *score);
-
 /*
- * bits is the number of bits in the icache hash table
- * depth is the average depth
- * memory usage is about (1<table[0]));
+	ih->table = (IEntry**)(ih+1);
+	ih->bits = bits;
+	ih->size = size;
+	return ih;
 }
 
-u32int
-hashbits(u8int *sc, int bits)
+static IEntry*
+ihashlookup(IHash *ih, u8int score[VtScoreSize], int type)
 {
-	u32int v;
-
-	v = (sc[0] << 24) | (sc[1] << 16) | (sc[2] << 8) | sc[3];
-	if(bits < 32)
-		 v >>= (32 - bits);
-	return v;
+	u32int h;
+	IEntry *ie;
+	
+	h = hashbits(score, ih->bits);
+	for(ie=ih->table[h]; ie; ie=ie->nexthash)
+		if((type == -1 || type == ie->ia.type) && scorecmp(score, ie->score) == 0)
+			return ie;
+	return nil;
 }
 
 static void
-loadarenaclumps(Arena *arena, u64int aa)
+ihashdelete(IHash *ih, IEntry *ie, char *what)
 {
-	ulong i;
-	ClumpInfo ci;
-	IAddr ia;
-
-	for(i=0; i<arena->memstats.clumps; i++){
-		if(readclumpinfo(arena, i, &ci) < 0)
-			break;
-		ia.type = ci.type;
-		ia.size = ci.uncsize;
-		ia.blocks = (ci.size + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog;
-		ia.addr = aa;
-		aa += ClumpSize + ci.size;
-		if(ia.type != VtCorruptType)
-			insertscore(ci.score, &ia, 0);
-	}
+	u32int h;
+	IEntry **l;
+	
+	h = hashbits(ie->score, ih->bits);
+	for(l=&ih->table[h]; *l; l=&(*l)->nexthash)
+		if(*l == ie){
+			*l = ie->nexthash;
+			return;
+		}
+	fprint(2, "warning: %s %V not found in ihashdelete\n", what, ie->score);
 }
 
-int
-_lookupscore(u8int *score, int type, IAddr *ia, int *rac)
+static void
+ihashinsert(IHash *ih, IEntry *ie)
 {
 	u32int h;
-	IEntry *ie, *last;
-
-	qlock(&icache.lock);
-	h = hashbits(score, icache.bits);
-	last = nil;
-	for(ie = icache.heads[h]; ie != nil; ie = ie->next){
-		if((ie->ia.type == type || type == -1) && scorecmp(ie->score, score)==0){
-			if(last != nil)
-				last->next = ie->next;
-			else
-				icache.heads[h] = ie->next;
-			addstat(StatIcacheHit, 1);
-			if(rac)
-				ie->rac = 1;
-			trace(TraceLump, "lookupscore incache");
-			ie->next = icache.heads[h];
-			icache.heads[h] = ie;
-
-			*ia = ie->ia;
-			if(rac)
-				*rac = ie->rac;
-			qunlock(&icache.lock);
-			return 0;
-		}
-		last = ie;
-	}
-	addstat(StatIcacheMiss, 1);
-	qunlock(&icache.lock);
-	return -1;
+	
+	h = hashbits(ie->score, ih->bits);
+	ie->nexthash = ih->table[h];
+	ih->table[h] = ie;
 }
 
 
 /*
-ZZZ need to think about evicting the correct IEntry,
-and writing back the wtime.
- * look up data score in the index cache
- * if this fails, pull it in from the disk index table, if it exists.
- *
- * must be called with the lump for this score locked
+ * IEntry lists.
  */
-int
-lookupscore(u8int *score, int type, IAddr *ia, int *rac)
+
+static IEntry*
+popout(IEntry *ie)
 {
-	IEntry d, *ie;
-	u32int h;
-	u64int aa;
-	Arena *load;
-	int i, ret;
-	uint ms;
+	if(ie->prev == nil && ie->next == nil)
+		return ie;
+	ie->prev->next = ie->next;
+	ie->next->prev = ie->prev;
+	ie->next = nil;
+	ie->prev = nil;
+	return ie;
+}
 
-	aa = 0;
-	ms = msec();
-	
-	trace(TraceLump, "lookupscore %V.%d", score, type);
+static IEntry*
+poplast(IEntry *list)
+{
+	if(list->prev == list)
+		return nil;
+	return popout(list->prev);
+}
+
+static IEntry*
+pushfirst(IEntry *list, IEntry *ie)
+{
+	popout(ie);
+	ie->prev = list;
+	ie->next = list->next;
+	ie->prev->next = ie;
+	ie->next->prev = ie;
+	return ie;
+}
 
-	ret = 0;
-	if(_lookupscore(score, type, ia, rac) < 0){
-		if(loadientry(mainindex, score, type, &d) < 0){
-			ret = -1;
-			goto out;
-		}
+/*
+ * Arena summary cache.
+ */
+struct ISum
+{
+	QLock	lock;
+	IEntry	*entries;
+	int	nentries;
+	int	loaded;
+	u64int addr;
+	u64int limit;
+	Arena *arena;
+	int g;
+};
+
+static ISum*
+scachelookup(u64int addr)
+{
+	int i;
+	ISum *s;
 
-		/* failed in cache but found on disk - fill cache. */
-		trace(TraceLump, "lookupscore loaded");
-		addstat(StatIcacheFill, 1);
-
-		/*
-		 * no one else can load an entry for this score,
-		 * since we have this score's lump's lock.
-		 */
-		qlock(&icache.lock);
-	
-		/*
-		 * If we notice that all the hits are coming from one arena,
-		 * load the table of contents for that arena into the cache.
-		 */
-		load = nil;
-		h = hashbits(score, icache.bits);
-		ie = icachealloc(&d.ia, score);
-		if(icacheprefetch){
-			icache.last[icache.nlast++%nelem(icache.last)] = amapitoa(mainindex, ie->ia.addr, &aa);
-			aa = ie->ia.addr - aa;	/* compute base addr of arena */
-			for(i=0; iaddr <= addr && addr < s->limit){
+			if(i > 0){
+				memmove(icache.sum+1, icache.sum, i*sizeof icache.sum[0]);
+				icache.sum[0] = s;
 			}
+			return s;
 		}
+	}
+	return nil;
+}
+
+static void
+sumclear(ISum *s)
+{
+	int i;
+
+	for(i=0; i<s->nentries; i++)
+		ihashdelete(icache.shash, &s->entries[i], "scache");
+	s->nentries = 0;
+	s->loaded = 0;
+	s->addr = 0;
+	s->limit = 0;
+	s->arena = nil;
+	s->g = 0;
+}
+
+static ISum*
+scacheevict(void)
+{
+	ISum *s;
+	int i;
 	
-		ie->next = icache.heads[h];
-		icache.heads[h] = ie;
-	
-		*ia = ie->ia;
-		*rac = ie->rac;
-	
-		qunlock(&icache.lock);
-		if(load){
-			trace(TraceProc, "preload 0x%llux", aa);
-			loadarenaclumps(load, aa);
+	for(i=icache.nsum-1; i>=0; i--){
+		s = icache.sum[i];
+		if(canqlock(&s->lock)){
+			if(i > 0){
+				memmove(icache.sum+1, icache.sum, i*sizeof icache.sum[0]);
+				icache.sum[0] = s;
+			}
+			sumclear(s);
+			return s;
 		}
 	}
+	return nil;
+}
 
-out:
-	ms = msec() - ms;
-	addstat2(StatIcacheRead, 1, StatIcacheReadTime, ms);
+static void
+scachehit(u64int addr)
+{
+	scachelookup(addr);	/* for move-to-front */
+}
 
-	return ret;
+static void
+scachesetup(ISum *s, u64int addr)
+{
+	u64int addr0, limit;
+	int g;
+
+	s->arena = amapitoag(mainindex, addr, &addr0, &limit, &g);
+	s->addr = addr0;
+	s->limit = limit;
+	s->g = g;
+}
+
+static void
+scacheload(ISum *s)
+{
+	int i, n;
+
+	s->loaded = 1;
+	n = asumload(s->arena, s->g, s->entries, ArenaCIGSize);
+	/*
+	 * n can be less than ArenaCIGSize, either if the clump group
+	 * is the last in the arena and is only partially filled, or if there
+	 * are corrupt clumps in the group -- those are not returned.
+	 */
+	for(i=0; i<n; i++){
+		s->entries[i].ia.addr += s->addr;
+		ihashinsert(icache.shash, &s->entries[i]);
+	}
+//fprint(2, "%T scacheload %s %d - %d entries\n", s->arena->name, s->g, n);
+	addstat(StatScachePrefetch, n);
+	s->nentries = n;
+}
+
+static ISum*
+scachemiss(u64int addr)
+{
+	ISum *s;
+
+	s = scachelookup(addr);
+	if(s == nil){
+		/* first time: make an entry in the cache but don't populate it yet */
+		s = scacheevict();
+		if(s == nil)
+			return nil;
+		scachesetup(s, addr);
+		qunlock(&s->lock);
+		return nil;
+	}
+
+	/* second time: load from disk */
+	qlock(&s->lock);
+	if(s->loaded || !icacheprefetch){
+		qunlock(&s->lock);
+		return nil;
+	}
+	
+	return s;	/* locked */
 }
 
 /*
- * insert a new element in the hash table.
+ * Index cache.
  */
-int
-insertscore(u8int *score, IAddr *ia, int write)
+
+void
+initicache(u32int mem0)
 {
-	IEntry *ie, se;
-	u32int h;
+	u32int mem;
+	int i, entries, scache;
+	
+	icache.full.l = &icache.lock;
 
-	trace(TraceLump, "insertscore enter");
-	if(write)
-		addstat(StatIcacheWrite, 1);
-	else
-		addstat(StatIcachePrefetch, 1);
+	mem = mem0;
+	entries = mem / (sizeof(IEntry)+sizeof(IEntry*));
+	scache = (entries/8) / ArenaCIGSize;
+	entries -= entries/8;
+	if(scache < 4)
+		scache = 4;
+	if(scache > 16)
+		scache = 16;
+	if(entries < 1000)
+		entries = 1000;
+fprint(2, "icache %,d bytes = %,d entries; %d scache\n", mem0, entries, scache);
+
+	icache.clean.prev = icache.clean.next = &icache.clean;
+	icache.dirty.prev = icache.dirty.next = &icache.dirty;
+	icache.free.prev = icache.free.next = &icache.free;
+	
+	icache.hash = mkihash(entries);
+	icache.nentries = entries;
+	setstat(StatIcacheSize, entries);
+	icache.entries = vtmallocz(entries*sizeof icache.entries[0]);
+	icache.maxdirty = entries / 2;
+	for(i=0; ientries = icache.sentries + i*ArenaCIGSize;
+	}
+}
 
-	qlock(&icache.lock);
-	h = hashbits(score, icache.bits);
 
-	ie = icachealloc(ia, score);
-	if(write){
+static IEntry*
+evictlru(void)
+{
+	IEntry *ie;
+	
+	ie = poplast(&icache.clean);
+	if(ie == nil)
+		return nil;
+	ihashdelete(icache.hash, ie, "evictlru");
+	return ie;
+}
+
+static void
+icacheinsert(u8int score[VtScoreSize], IAddr *ia, int state)
+{
+	IEntry *ie;
+
+	if((ie = poplast(&icache.free)) == nil && (ie = evictlru()) == nil){
+		addstat(StatIcacheStall, 1);
+		while((ie = poplast(&icache.free)) == nil && (ie = evictlru()) == nil){
+			// Could safely return here if state == IEClean.
+			// But if state == IEDirty, have to wait to make
+			// sure we don't lose an index write.  
+			// Let's wait all the time.
+			flushdcache();
+			kickicache();
+			rsleep(&icache.full);
+		}
+		addstat(StatIcacheStall, -1);
+	}
+
+	memmove(ie->score, score, VtScoreSize);
+	ie->state = state;
+	ie->ia = *ia;
+	if(state == IEClean){
+		addstat(StatIcachePrefetch, 1);
+		pushfirst(&icache.clean, ie);
+	}else{
+		addstat(StatIcacheWrite, 1);
+		assert(state == IEDirty);
 		icache.ndirty++;
 		setstat(StatIcacheDirty, icache.ndirty);
 		delaykickicache();
-		ie->dirty = 1;
+		pushfirst(&icache.dirty, ie);
 	}
-	ie->next = icache.heads[h];
-	icache.heads[h] = ie;
+	ihashinsert(icache.hash, ie);
+}
+
+int
+icachelookup(u8int score[VtScoreSize], int type, IAddr *ia)
+{
+	IEntry *ie;
 
-	se = *ie;
+	qlock(&icache.lock);
+	addstat(StatIcacheLookup, 1);
+	if((ie = ihashlookup(icache.hash, score, type)) != nil){
+		*ia = ie->ia;
+		if(ie->state == IEClean)
+			pushfirst(&icache.clean, ie);
+		addstat(StatIcacheHit, 1);
+		qunlock(&icache.lock);
+		return 0;
+	}
+
+	if((ie = ihashlookup(icache.shash, score, type)) != nil){
+		*ia = ie->ia;
+		icacheinsert(score, &ie->ia, IEClean);
+		scachehit(ie->ia.addr);
+		addstat(StatScacheHit, 1);
+		qunlock(&icache.lock);
+		return 0;
+	}
+	addstat(StatIcacheMiss, 1);
 	qunlock(&icache.lock);
 
-	if(write && icache.ndirty >= icache.maxdirty)
+	return -1;
+}
+
+int
+insertscore(u8int score[VtScoreSize], IAddr *ia, int state, AState *as)
+{
+	ISum *toload;
+
+	qlock(&icache.lock);
+	icacheinsert(score, ia, state);
+	if(state == IEClean)
+		toload = scachemiss(ia->addr);
+	else{
+		assert(state == IEDirty);
+		toload = nil;
+		if(as == nil)
+			fprint(2, "%T insertscore IEDirty without as; called from %lux\n", getcallerpc(&score));
+		else{
+			if(icache.as.aa > as->aa)
+				fprint(2, "%T insertscore: aa moving backward: %#llux -> %#llux\n", icache.as.aa, as->aa);
+			icache.as = *as;
+		}
+	}
+	qunlock(&icache.lock);
+	if(toload){
+		scacheload(toload);
+		qunlock(&toload->lock);
+	}
+	
+	if(icache.ndirty >= icache.maxdirty)
 		kickicache();
 
 	/*
@@ -240,125 +433,81 @@
 	 * the lump, meaning any searches for this block
 	 * will hit in the lump cache until after we return.
 	 */
-	markbloomfilter(mainindex->bloom, score);
+	if(state == IEDirty)
+		markbloomfilter(mainindex->bloom, score);
 
 	return 0;
 }
 
-/*
- * allocate a index cache entry which hasn't been used in a while.
- * must be called with icache.lock locked
- * if the score is already in the table, update the entry.
- */
-static IEntry *
-icachealloc(IAddr *ia, u8int *score)
+static int
+lookupscore_untimed(u8int score[VtScoreSize], int type, IAddr *ia)
 {
-	int i;
-	IEntry *ie, *last, *clean, *lastclean;
-	u32int h;
+	IEntry d;
 
-	h = hashbits(score, icache.bits);
-	last = nil;
-	for(ie = icache.heads[h]; ie != nil; ie = ie->next){
-		if(ie->ia.type == ia->type && scorecmp(ie->score, score)==0){
-			if(last != nil)
-				last->next = ie->next;
-			else
-				icache.heads[h] = ie->next;
-			trace(TraceLump, "icachealloc hit");
-			ie->rac = 1;
-			return ie;
-		}
-		last = ie;
-	}
+	if(icachelookup(score, type, ia) >= 0)
+		return 0;
 
-	h = icache.unused;
-	if(h < icache.entries){
-		ie = &icache.base[h++];
-		icache.unused = h;
-		trace(TraceLump, "icachealloc unused");
-		goto Found;
-	}
-	
-	if((ie = icache.free) != nil){
-		icache.free = ie->next;
-		goto Found;
-	}
-
-	h = icache.stolen;
-	for(i=0;; i++){
-		h++;
-		if(h >= icache.size)
-			h = 0;
-		if(i == icache.size){
-			trace(TraceLump, "icachealloc sleep");
-			addstat(StatIcacheStall, 1);
-			while(icache.ndirty == icache.entries){
-				/*
-				 * This is a bit suspect.  Kickicache will wake up the
-				 * icachewritecoord, but if all the index entries are for
-				 * unflushed disk blocks, icachewritecoord won't be
-				 * able to do much.  It always rewakes everyone when
-				 * it thinks it is done, though, so at least we'll go around
-				 * the while loop again.  Also, if icachewritecoord sees
-				 * that the disk state hasn't change at all since the last
-				 * time around, it kicks the disk.  This needs to be
-				 * rethought, but it shouldn't deadlock anymore.
-				 */
-				kickicache();
-				rsleep(&icache.full);
-			}
-			addstat(StatIcacheStall, -1);
-			i = 0;
-		}
-		lastclean = nil;
-		clean = nil;
-		last = nil;
-		for(ie=icache.heads[h]; ie; last=ie, ie=ie->next){
-			if(!ie->dirty){
-				clean = ie;
-				lastclean = last;
-			}
-		}
-		if(clean){
-			if(lastclean)
-				lastclean->next = clean->next;
-			else
-				icache.heads[h] = clean->next;
-			clean->next = nil;
-			icache.stolen = h;
-			ie = clean;
-			trace(TraceLump, "icachealloc steal");
-			goto Found;
-		}
-	}
+	addstat(StatIcacheFill, 1);
+	if(loadientry(mainindex, score, type, &d) < 0)
+		return -1;
+	
+	insertscore(score, &d.ia, IEClean, nil);
+	*ia = d.ia;
+	return 0;
+}
 
-Found:
-	ie->ia = *ia;
-	scorecp(ie->score, score);
-	ie->rac = 0;	
-	return ie;
+int
+lookupscore(u8int score[VtScoreSize], int type, IAddr *ia)
+{
+	int ms, ret;
+	
+	ms = msec();
+	ret = lookupscore_untimed(score, type, ia);
+	ms = msec() - ms;
+	addstat2(StatIcacheRead, 1, StatIcacheReadTime, ms);
+	return ret;
 }
+	
+u32int
+hashbits(u8int *sc, int bits)
+{
+	u32int v;
 
+	v = (sc[0] << 24) | (sc[1] << 16) | (sc[2] << 8) | sc[3];
+	if(bits < 32)
+		 v >>= (32 - bits);
+	return v;
+}
+
+ulong
+icachedirtyfrac(void)
+{
+	return (vlong)icache.ndirty*IcacheFrac / icache.nentries;
+}
+
+/*
+ * Return a singly-linked list of dirty index entries
+ * with 32-bit hash numbers between lo and hi
+ * and address < limit.
+ */
 IEntry*
 icachedirty(u32int lo, u32int hi, u64int limit)
 {
-	int i;
 	u32int h;
 	IEntry *ie, *dirty;
 
 	dirty = nil;
 	trace(TraceProc, "icachedirty enter");
 	qlock(&icache.lock);
-	for(i=0; inext)
-		if(ie->dirty && ie->ia.addr != 0 && ie->ia.addr < limit){
+	for(ie = icache.dirty.next; ie != &icache.dirty; ie=ie->next){
+		if(ie->state == IEDirty && ie->ia.addr < limit){
 			h = hashbits(ie->score, 32);
 			if(lo <= h && h <= hi){
 				ie->nextdirty = dirty;
 				dirty = ie;
 			}
 		}
+	}
 	qunlock(&icache.lock);
 	trace(TraceProc, "icachedirty exit");
 	if(dirty == nil)
@@ -366,36 +515,59 @@
 	return dirty;
 }
 
+AState
+icachestate(void)
+{
+	AState as;
+
+	qlock(&icache.lock);
+	as = icache.as;
+	qunlock(&icache.lock);
+	return as;
+}
+
+/*
+ * The singly-linked non-circular list of index entries ie
+ * has been written to disk.  Move them to the clean list.
+ */
 void
 icacheclean(IEntry *ie)
 {
-	trace(TraceProc, "icachedirty enter");
+	IEntry *next;
+	
+	trace(TraceProc, "icacheclean enter");
 	qlock(&icache.lock);
-	for(; ie; ie=ie->nextdirty){
+	for(; ie; ie=next){
+		assert(ie->state == IEDirty);
+		next = ie->nextdirty;
+		ie->nextdirty = nil;
+		popout(ie); /* from icache.dirty */
 		icache.ndirty--;
-		ie->dirty = 0;
+		ie->state = IEClean;
+		pushfirst(&icache.clean, ie);
 	}
 	setstat(StatIcacheDirty, icache.ndirty);
 	rwakeupall(&icache.full);
 	qunlock(&icache.lock);
-	trace(TraceProc, "icachedirty exit");
+	trace(TraceProc, "icacheclean exit");
 }
 
 void
 emptyicache(void)
 {
 	int i;
-	IEntry *ie, **lie;
+	IEntry *ie;
+	ISum *s;
 	
 	qlock(&icache.lock);
-	for(i=0; idirty == 0){
-			*lie = ie->next;
-			ie->next = icache.free;
-			icache.free = ie;
-		}else
-			lie = &ie->next;
+	while((ie = evictlru()) != nil)
+		pushfirst(&icache.free, ie);
+	for(i=0; i<icache.nsum; i++){
+		s = icache.sum[i];
+		qlock(&s->lock);
+		sumclear(s);
+		qunlock(&s->lock);
 	}
 	qunlock(&icache.lock);
 }
+
--- /sys/src/cmd/venti/srv/httpd.c	Mon Oct  1 03:36:01 2007
+++ /sys/src/cmd/venti/srv/httpd.c	Mon Oct  1 03:36:00 2007
@@ -565,11 +565,11 @@
 	if(scorecmp(zeroscore, arena->score) != 0)
 		hprint(hout, "\tscore=%V\n", arena->score);
 
-	hprint(hout, "\tmem: clumps=%d compressed clumps=%d data=%,lld compressed data=%,lld storage=%,lld\n",
+	hprint(hout, "\twritten: clumps=%d compressed clumps=%d data=%,lld compressed data=%,lld storage=%,lld\n",
 		arena->memstats.clumps, arena->memstats.cclumps, arena->memstats.uncsize,
 		arena->memstats.used - arena->memstats.clumps * ClumpSize,
 		arena->memstats.used + arena->memstats.clumps * ClumpInfoSize);
-	hprint(hout, "\tdisk: clumps=%d compressed clumps=%d data=%,lld compressed data=%,lld storage=%,lld\n",
+	hprint(hout, "\tindexed: clumps=%d compressed clumps=%d data=%,lld compressed data=%,lld storage=%,lld\n",
 		arena->diskstats.clumps, arena->diskstats.cclumps, arena->diskstats.uncsize,
 		arena->diskstats.used - arena->diskstats.clumps * ClumpSize,
 		arena->diskstats.used + arena->diskstats.clumps * ClumpInfoSize);
@@ -895,7 +895,7 @@
 
 	"icachehit",
 	"icachemiss",
-	"icachelookup",
+	"icacheread",
 	"icachewrite",
 	"icachefill",
 	"icacheprefetch",
@@ -904,6 +904,9 @@
 	"icacheflush",
 	"icachestall",
 	"icachelookuptime",
+	"icachelookup",
+	"scachehit",
+	"scacheprefetch",
 
 	"bloomhit",
 	"bloommiss",
@@ -925,6 +928,9 @@
 
 	"sumread",
 	"sumreadbyte",
+	
+	"cigload",
+	"cigloadtime",
 };
 
 static int
--- /sys/src/cmd/venti/srv/checkarenas.c	Mon Oct  1 03:36:01 2007
+++ /sys/src/cmd/venti/srv/checkarenas.c	Mon Oct  1 03:36:01 2007
@@ -24,7 +24,7 @@
 
 	err = 0;
 	for(;;){
-		e = syncarena(arena, 0, 1000, 0, fix);
+		e = syncarena(arena, 1000, 0, fix);
 		err |= e;
 		if(!(e & SyncHeader))
 			break;
--- /sys/src/cmd/venti/srv/syncarena.c	Mon Oct  1 03:36:02 2007
+++ /sys/src/cmd/venti/srv/syncarena.c	Mon Oct  1 03:36:02 2007
@@ -25,7 +25,7 @@
  * returns 0 if ok, flags if error occurred
  */
 int
-syncarena(Arena *arena, u64int start, u32int n, int zok, int fix)
+syncarena(Arena *arena, u32int n, int zok, int fix)
 {
 	ZBlock *lump;
 	Clump cl;
@@ -53,7 +53,7 @@
 			fprint(2, "%s: illegal clump magic number=%#8.8ux at clump=%d\n", arena->name, magic, clump);
 			/* err |= SyncDataErr; */
 			if(fix && writeclumpmagic(arena, aa, ClumpFreeMagic) < 0){
-				fprint(2, "can't write corrected clump free magic: %r");
+				fprint(2, "%s: can't write corrected clump free magic: %r", arena->name);
 				err |= SyncFixErr;
 			}
 			break;
@@ -136,9 +136,8 @@
 	|| cclumps != arena->memstats.cclumps
 	|| uncsize != arena->memstats.uncsize){
 		err |= SyncHeader;
-		fprint(2, "arena %s: start=%lld fix=%d flush=%d %lld->%lld %ud->%ud %ud->%ud %lld->%lld\n",
+		fprint(2, "arena %s: fix=%d flush=%d %lld->%lld %ud->%ud %ud->%ud %lld->%lld\n",
 			arena->name,
-			start,
 			fix,
 			flush,
 			used, arena->memstats.used,
--- /sys/src/cmd/venti/srv/buildindex.c	Mon Oct  1 03:36:03 2007
+++ /sys/src/cmd/venti/srv/buildindex.c	Mon Oct  1 03:36:02 2007
@@ -36,18 +36,19 @@
 void
 usage(void)
 {
-	fprint(2, "usage: buildindex [-bd] [-i isect]... [-M imem] venti.conf\n");
+	fprint(2, "usage: buildindex [-b] [-i isect]... [-M imem] venti.conf\n");
 	threadexitsall("usage");
 }
 
 void
 threadmain(int argc, char *argv[])
 {
-	int fd, i, napart;
+	int fd, i, napart, nfinish, maxdisks;
 	u32int bcmem, imem;
 	Config conf;
 	Part *p;
 	
+	maxdisks = 100000;
 	ventifmtinstall();
 	imem = 256*1024*1024;
 	ARGBEGIN{
@@ -64,6 +65,9 @@
 	case 'M':
 		imem = unittoull(EARGF(usage()));
 		break;
+	case 'm':	/* temporary - might go away */
+		maxdisks = atoi(EARGF(usage()));
+		break;
 	default:
 		usage();
 		break;
@@ -132,17 +136,21 @@
 	/* start arena procs */
 	p = nil;
 	napart = 0;
+	nfinish = 0;
 	arenadonechan = chancreate(sizeof(void*), 0);
 	for(i=0; i<ix->narenas; i++){
 		if(ix->arenas[i]->part != p){
 			p = ix->arenas[i]->part;
 			vtproc(arenapartproc, p);
-			napart++;
+			if(++napart >= maxdisks){
+				recvp(arenadonechan);
+				nfinish++;
+			}
 		}
 	}
 
 	/* wait for arena procs to finish */
-	for(i=0; imemstats.clumps)
 			fprint(2, "%T arena %s: %d entries\n", 
 				a->name, a->memstats.clumps);
-		addr = ix->amap[i].start;
-		for(clump=0; clump<a->memstats.clumps; clump+=n){
+		/*
+		 * Running the loop backwards accesses the 
+		 * clump info blocks forwards, since they are
+		 * stored in reverse order at the end of the arena.
+		 * This speeds things slightly.
+		 */
+		addr = ix->amap[i].start + a->memstats.used;
+		for(clump=a->memstats.clumps; clump > 0; clump-=n){
 			n = ClumpChunks;
-			if(n > a->memstats.clumps - clump)
-				n = a->memstats.clumps - clump;
-			if(readclumpinfos(a, clump, cis, n) != n){
+			if(n > clump)
+				n = clump;
+			if(readclumpinfos(a, clump-n, cis, n) != n){
 				fprint(2, "%T arena %s: directory read: %r\n", a->name);
 				errors = 1;
 				break;
 			}
-			for(j=0; j<n; j++){
+			for(j=n-1; j>=0; j--){
 				ci = &cis[j];
 				ie.ia.type = ci->type;
 				ie.ia.size = ci->uncsize;
+				addr -= ci->size + ClumpSize;
 				ie.ia.addr = addr;
-				addr += ci->size + ClumpSize;
 				ie.ia.blocks = (ci->size + ClumpSize + (1<<ABlockLog)-1) >> ABlockLog;
 				scorecp(ie.score, ci->score);
 				if(ci->type == VtCorruptType)
@@ -253,6 +267,8 @@
 				}
 			}
 		}
+		if(addr != ix->amap[i].start)
+			fprint(2, "%T arena %s: clump miscalculation %lld != %lld\n", a->name, addr, ix->amap[i].start);
 	}
 	add(&arenaentries, tot);
 	add(&skipentries, nskip);
--- /sys/src/cmd/venti/srv/venti.c	Mon Oct  1 03:36:04 2007
+++ /sys/src/cmd/venti/srv/venti.c	Mon Oct  1 03:36:03 2007
@@ -106,9 +106,6 @@
 	if(configfile == nil)
 		configfile = "venti.conf";
 
-	if(initarenasum() < 0)
-		fprint(2, "warning: can't initialize arena summing process: %r");
-
 	fprint(2, "conf...");
 	if(initventi(configfile, &config) < 0)
 		sysfatal("can't init server: %r");
@@ -146,13 +143,7 @@
 		mem, mem / (8 * 1024));
 	initlumpcache(mem, mem / (8 * 1024));
 
-	icmem = u64log2(icmem / (sizeof(IEntry)+sizeof(IEntry*)) / ICacheDepth);
-	if(icmem < 4)
-		icmem = 4;
-	if(0) fprint(2, "initialize %d bytes of index cache for %d index entries\n",
-		(sizeof(IEntry)+sizeof(IEntry*)) * (1 << icmem) * ICacheDepth,
-		(1 << icmem) * ICacheDepth);
-	initicache(icmem, ICacheDepth);
+	initicache(icmem);
 	initicachewrite();
 
 	/*
@@ -170,7 +161,7 @@
 		startbloomproc(mainindex->bloom);
 
 	fprint(2, "sync...");
-	if(!readonly && syncindex(mainindex, 1, 0, 0) < 0)
+	if(!readonly && syncindex(mainindex) < 0)
 		sysfatal("can't sync server: %r");
 
 	if(!readonly && queuewrites){
@@ -181,6 +172,9 @@
 			queuewrites = 0;
 		}
 	}
+
+	if(initarenasum() < 0)
+		fprint(2, "warning: can't initialize arena summing process: %r");
 
 	fprint(2, "announce %s...", vaddr);
 	ventisrv = vtlisten(vaddr);
--- /sys/src/cmd/venti/srv/fns.h	Mon Oct  1 03:36:04 2007
+++ /sys/src/cmd/venti/srv/fns.h	Mon Oct  1 03:36:04 2007
@@ -6,8 +6,11 @@
 void		addstat2(int, int, int, int);
 ZBlock		*alloczblock(u32int size, int zeroed, uint alignment);
 Arena		*amapitoa(Index *index, u64int a, u64int *aa);
+Arena		*amapitoag(Index *index, u64int a, u64int *gstart, u64int *glimit, int *g);
 u64int		arenadirsize(Arena *arena, u32int clumps);
+int		arenatog(Arena *arena, u64int aa, u64int *gstart, u64int *glimit, int *g);
 void		arenaupdate(Arena *arena, u32int size, u8int *score);
+int		asumload(Arena *arena, int g, IEntry *entries, int maxentries);
 void		backsumarena(Arena *arena);
 void	binstats(long (*fn)(Stats *s0, Stats *s1, void*), void *arg, long t0, long t1, Statbin *bin, int nbin);
 int		bloominit(Bloom*, vlong, uchar*);
@@ -26,7 +29,6 @@
 void		dirtydblock(DBlock*, int);
 void		diskaccess(int);
 void		disksched(void);
-AState	diskstate(void);
 void		*emalloc(ulong);
 void		emptydcache(void);
 void		emptyicache(void);
@@ -64,6 +66,8 @@
 IEntry*	icachedirty(u32int, u32int, u64int);
 ulong	icachedirtyfrac(void);
 void		icacheclean(IEntry*);
+int		icachelookup(u8int *score, int type, IAddr *ia);
+AState	icachestate(void);
 int		ientrycmp(const void *vie1, const void *vie2);
 char		*ifileline(IFile *f);
 int		ifilename(IFile *f, char *dst);
@@ -76,7 +80,7 @@
 int		initarenasum(void);
 void		initbloomfilter(Index*);
 void		initdcache(u32int mem);
-void		initicache(int bits, int depth);
+void		initicache(u32int mem);
 void		initicachewrite(void);
 IEStream	*initiestream(Part *part, u64int off, u64int clumps, u32int size);
 ISect		*initisect(Part *part);
@@ -87,7 +91,7 @@
 void		initround(Round*, char*, int);
 int		initventi(char *config, Config *conf);
 void		insertlump(Lump *lump, Packet *p);
-int		insertscore(u8int *score, IAddr *ia, int write);
+int		insertscore(u8int *score, IAddr *ia, int state, AState *as);
 void		kickdcache(void);
 void		kickicache(void);
 void		kickround(Round*, int wait);
@@ -97,14 +101,14 @@
 int		loadientry(Index *index, u8int *score, int type, IEntry *ie);
 void		logerr(int severity, char *fmt, ...);
 Lump		*lookuplump(u8int *score, int type);
-int		_lookupscore(u8int *score, int type, IAddr *ia, int *rac);
-int		lookupscore(u8int *score, int type, IAddr *ia, int *rac);
+int		lookupscore(u8int *score, int type, IAddr *ia);
 int		maparenas(AMap *am, Arena **arenas, int n, char *what);
 void		markbloomfilter(Bloom*, u8int*);
 uint		msec(void);
 int		namecmp(char *s, char *t);
 void		namecp(char *dst, char *src);
 int		nameok(char *name);
+void		needmainindex(void);
 void		needzeroscore(void);
 Arena		*newarena(Part *part, u32int, char *name, u64int base, u64int size, u32int blocksize);
 ArenaPart	*newarenapart(Part *part, u32int blocksize, u32int tabsize);
@@ -152,7 +156,6 @@
 int		scorecmp(u8int *, u8int *);
 void		scoremem(u8int *score, u8int *buf, int size);
 void		setatailstate(AState*);
-void		setdcachestate(AState*);
 void		seterr(int severity, char *fmt, ...);
 void		setstat(int, long);
 void		settrace(char *type);
@@ -166,9 +169,8 @@
 int		stru32int(char *s, u32int *r);
 int		stru64int(char *s, u64int *r);
 void		sumarena(Arena *arena);
-int		syncarena(Arena *arena, u64int start, u32int n, int zok, int fix);
-int		syncarenaindex(Index *ix, Arena *arena, u32int clump, u64int a, int fix, int *pflush, int check);
-int		syncindex(Index *ix, int fix, int mustflushicache, int check);
+int		syncarena(Arena *arena, u32int n, int zok, int fix);
+int		syncindex(Index *ix);
 void		trace(char *type, char*, ...);
 void		traceinit(void);
 int		u64log2(u64int v);
@@ -197,12 +199,12 @@
 int		wbisect(ISect *is);
 int		wbindex(Index *ix);
 int		whackblock(u8int *dst, u8int *src, int ssize);
-u64int		writeaclump(Arena *a, Clump *c, u8int *clbuf, u64int, u64int*);
+u64int		writeaclump(Arena *a, Clump *c, u8int *clbuf);
 u32int		writearena(Arena *arena, u64int aa, u8int *clbuf, u32int n);
 int		writebloom(Bloom*);
 int		writeclumpinfo(Arena *arean, int clump, ClumpInfo *ci);
 int		writepng(Hio*, Memimage*);
-u64int		writeiclump(Index *ix, Clump *c, u8int *clbuf, u64int*);
+u64int		writeiclump(Index *ix, Clump *c, u8int *clbuf);
 int		writelump(Packet *p, u8int *score, int type, u32int creator, uint ms);
 int		writepart(Part *part, u64int addr, u8int *buf, u32int n);
 int		writeqlump(Lump *u, Packet *p, int creator, uint ms);
--- /sys/src/cmd/venti/srv/syncindex.c	Mon Oct  1 03:36:05 2007
+++ /sys/src/cmd/venti/srv/syncindex.c	Mon Oct  1 03:36:04 2007
@@ -6,7 +6,7 @@
 void
 usage(void)
 {
-	fprint(2, "usage: syncindex [-fv] [-B blockcachesize] config\n");
+	fprint(2, "usage: syncindex [-v] [-B blockcachesize] config\n");
 	threadexitsall("usage");
 }
 
@@ -16,9 +16,7 @@
 threadmain(int argc, char *argv[])
 {
 	u32int bcmem, icmem;
-	int fix;
 
-	fix = 0;
 	bcmem = 0;
 	icmem = 0;
 	ARGBEGIN{
@@ -28,9 +26,6 @@
 	case 'I':
 		icmem = unittoull(EARGF(usage()));
 		break;
-	case 'f':
-		fix++;
-		break;
 	case 'v':
 		verbose++;
 		break;
@@ -39,9 +34,6 @@
 		break;
 	}ARGEND
 
-	if(!fix)
-		readonly = 1;
-
 	if(argc != 1)
 		usage();
 
@@ -56,21 +48,17 @@
 	if(0) fprint(2, "initialize %d bytes of disk block cache\n", bcmem);
 	initdcache(bcmem);
 	initlumpcache(1*1024*1024, 1024/8);
-	icmem = u64log2(icmem / (sizeof(IEntry)+sizeof(IEntry*)) / ICacheDepth);
-	if(icmem < 4)
-		icmem = 4;
-	if(1) fprint(2, "initialize %d bytes of index cache for %d index entries\n",
-		(sizeof(IEntry)+sizeof(IEntry*)) * (1 << icmem) * ICacheDepth,
-		(1 << icmem) * ICacheDepth);
-	initicache(icmem, ICacheDepth);
+	initicache(icmem);
 	initicachewrite();
 	if(mainindex->bloom)
 		startbloomproc(mainindex->bloom);
 
 	if(verbose)
 		printindex(2, mainindex);
-	if(syncindex(mainindex, fix, 1, 0) < 0)
+	if(syncindex(mainindex) < 0)
 		sysfatal("failed to sync index=%s: %r\n", mainindex->name);
+	flushicache();
+	flushdcache();
 
 	threadexitsall(0);
 }
--- /sys/src/cmd/venti/srv/syncindex0.c	Mon Oct  1 03:36:05 2007
+++ /sys/src/cmd/venti/srv/syncindex0.c	Mon Oct  1 03:36:05 2007
@@ -2,184 +2,92 @@
 #include "dat.h"
 #include "fns.h"
 
-enum
+static int
+syncarenaindex(Arena *arena, u64int a0)
 {
-	ClumpChunks	= 32*1024
-};
-
-static int missing, wrong;
-
-/*
- * shell sort is plenty good enough
- * because we're going to do a bunch of disk i/o's
- */
-static void
-sortclumpinfo(ClumpInfo *ci, int *s, int n)
-{
-	int i, j, m, t;
-
-	for(m = (n + 3) / 5; m > 0; m = (m + 1) / 3){
-		for(i = n - m; i-- > 0;){
-			for(j = i + m; j < n; j += m){
-				if(memcmp(ci[s[j - m]].score, ci[s[j]].score, VtScoreSize) <= 0)
-					break;
-				t = s[j];
-				s[j] = s[j - m];
-				s[j - m] = t;
-			}
-		}
-	}
-}
-
-int
-syncarenaindex(Index *ix, Arena *arena, u32int clump, u64int a, int fix, int *pflush, int check)
-{
-	Packet *pack;
-	IEntry ie;
+	int ok;
+	u32int clump;
+	u64int a;
+	ClumpInfo ci;
 	IAddr ia;
-	ClumpInfo *ci, *cis;
-	u64int *addrs;
-	int i, n, ok, *s, flush;
-
-	trace(TraceProc, "syncarenaindex enter");
-
-	flush = 0;
-	cis = MKN(ClumpInfo, ClumpChunks);
-	addrs = MKN(u64int, ClumpChunks);
-	s = MKN(int, ClumpChunks);
+	AState as;
+	
+	if(arena->diskstats.clumps == arena->memstats.clumps)
+		return 0;
+	
+	memset(&as, 0, sizeof as);
+	as.arena = arena;
+	as.stats = arena->diskstats;
+
 	ok = 0;
-	for(; clump < arena->memstats.clumps; clump += n){
-		n = ClumpChunks;
-		if(n > arena->memstats.clumps - clump)
-			n = arena->memstats.clumps - clump;
-		n = readclumpinfos(arena, clump, cis, n);
-		if(n <= 0){
-			fprint(2, "arena directory read failed\n");
+	a = a0 + arena->diskstats.used;
+	for(clump=arena->diskstats.clumps; clump < arena->memstats.clumps; clump++){
+		if(readclumpinfo(arena, clump, &ci) < 0){
+			fprint(2, "%s: clump %d: cannot read clumpinfo\n",
+				arena->name, clump);
 			ok = -1;
 			break;
 		}
 
-		for(i = 0; i < n; i++){
-			addrs[i] = a;
-			a += cis[i].size + ClumpSize;
-			s[i] = i;
-		}
-
-		sortclumpinfo(cis, s, n);
-
-		for(i = 0; i < n; i++){
-			ci = &cis[s[i]];
-			ia.type = ci->type;
-			ia.size = ci->uncsize;
-			ia.addr = addrs[s[i]];
-			ia.blocks = (ci->size + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog;
-
-			if(!check)
-				goto Add;
-			if(loadientry(ix, ci->score, ci->type, &ie) < 0){
-				trace(TraceProc, "syncarenaindex missing block %V.%d", ci->score, ci->type);
-				missing++;
-			if(0)	fprint(2, "missing block type=%d score=%V\n", ci->type, ci->score);
-			}else if(iaddrcmp(&ia, &ie.ia) != 0){
-				trace(TraceProc, "syncarenaindex mismatched entry");
-				fprint(2, "\nmismatched index entry and clump at %d\n", clump + i);
-				fprint(2, "\tclump: type=%d size=%d blocks=%d addr=%lld\n", ia.type, ia.size, ia.blocks, ia.addr);
-				fprint(2, "\tindex: type=%d size=%d block=%d addr=%lld\n", ie.ia.type, ie.ia.size, ie.ia.blocks, ie.ia.addr);
-				pack = readlump(ie.score, ie.ia.type, ie.ia.size, nil);
-				packetfree(pack);
-				if(pack != nil){
-					fprint(2, "duplicated lump\n");
-					continue;
-				}
-				wrong++;
-			}else
-				continue;
-		Add:
-			if(!fix){
-				ok = -1;
-				continue;
-			}
-			flush = 1;
-			trace(TraceProc, "syncarenaindex insert %V", ci->score);
-			insertscore(ci->score, &ia, 1);
-		}
-
-		if(0 && clump / 1000 != (clump + n) / 1000)
-			fprint(2, ".");
-	}
-	free(cis);
-	free(addrs);
-	free(s);
-	if(flush){
-		flushdcache();
-		*pflush = 1;
+		ia.type = ci.type;
+		ia.size = ci.uncsize;
+		ia.addr = a;
+		ia.blocks = (ClumpSize + ci.size + (1 << ABlockLog) - 1) >> ABlockLog;
+		a += ClumpSize + ci.size;
+
+		as.stats.used += ClumpSize + ci.size;
+		as.stats.uncsize += ia.size;
+		as.stats.clumps++;
+		if(ci.uncsize > ci.size)
+			as.stats.cclumps++;
+		as.aa = a;
+		insertscore(ci.score, &ia, IEDirty, &as);
 	}
+	flushdcache();
 	return ok;
 }
 
 int
-syncindex(Index *ix, int fix, int mustflush, int check)
+syncindex(Index *ix)
 {
 	Arena *arena;
-	AState as;
-	u64int a;
-	int i, e, e1, ok, ok1, flush;
+	int i, e, e1, ok;
 
 	ok = 0;
-	flush = 0;
 	for(i = 0; i < ix->narenas; i++){
 		trace(TraceProc, "syncindex start %d", i);
 		arena = ix->arenas[i];
-		/*
-		 * Syncarena will scan through the arena looking for blocks
-		 * that have been forgotten.  It will update arena->memstats.used,
-		 * so save the currenct copy as the place to start the 
-		 * syncarenaindex scan.
-		 */
-		a = arena->memstats.used;
-		e = syncarena(arena, ix->amap[i].start, TWID32, fix, fix);
+		e = syncarena(arena, TWID32, 1, 1);
 		e1 = e;
-		if(fix)
-			e1 &= ~(SyncHeader|SyncCIZero|SyncCIErr);
-		if(e1 == SyncHeader)
+		e1 &= ~(SyncHeader|SyncCIZero|SyncCIErr);
+		if(e & SyncHeader)
 			fprint(2, "arena %s: header is out-of-date\n", arena->name);
-		if(e1)
+		if(e1){
+			fprint(2, "arena %s: %x\n", arena->name, e1);
 			ok = -1;
-		else{
-			/*
-			 * use diskstats not memstats here, because diskstats
-			 * is what has been indexed; memstats is what has 
-			 * made it to disk (confusing names).
-			 */
-			ok1 = syncarenaindex(ix, arena,
-					arena->diskstats.clumps,
-					ix->amap[i].start + arena->diskstats.used,
-					fix, &flush, check);
-			if(ok1 < 0)
-				fprint(2, "syncarenaindex: %r\n");
-			if(fix && ok1==0 && (e & SyncHeader) && wbarena(arena) < 0)
-				fprint(2, "arena=%s header write failed: %r\n", arena->name);
-			ok |= ok1;
-
-			as.arena = arena;
-			as.aa = ix->amap[i].start + arena->memstats.used;
-			as.stats = arena->memstats;
-			setdcachestate(&as);
+			continue;
+		}
+		flushdcache();
+		
+		if(arena->memstats.clumps == arena->diskstats.clumps)
+			continue;
+		
+		fprint(2, "%T %s: indexing %d clumps...\n",
+			arena->name,
+			arena->memstats.clumps - arena->diskstats.clumps);
+
+		if(syncarenaindex(arena, ix->amap[i].start) < 0){
+			fprint(2, "arena %s: syncarenaindex: %r\n", arena->name);
+			ok = -1;
+			continue;
+		}
+		if(wbarena(arena) < 0){
+			fprint(2, "arena %s: wbarena: %r\n", arena->name);
+			ok = -1;
+			continue;
 		}
-	}
-	if(missing || wrong)
-		fprint(2, "syncindex: %d missing entries, %d wrong entries (flush=%d)\n", missing, wrong, flush);
-	if(fix && wbindex(ix) < 0){
-		fprint(2, "can't write back index header for %s: %r\n", ix->name);
-		return -1;
-	}
-	if(fix && flush){
 		flushdcache();
-		if(mustflush){
-			flushicache();
-			flushdcache();
-		}else
-			kickicache();
+		delaykickicache();
 	}
 	return ok;
 }
--- /sys/src/cmd/venti/srv/icachewrite.c	Mon Oct  1 03:36:06 2007
+++ /sys/src/cmd/venti/srv/icachewrite.c	Mon Oct  1 03:36:05 2007
@@ -12,7 +12,7 @@
 static IEntry *iesort(IEntry*);
 
 int icachesleeptime = 1000;	/* milliseconds */
-int minicachesleeptime = 50;
+int minicachesleeptime = 0;
 
 enum
 {
@@ -85,7 +85,7 @@
 static int
 icachewritesect(Index *ix, ISect *is, u8int *buf)
 {
-	int err, h, bsize, t;
+	int err, i, werr, h, bsize, t;
 	u32int lo, hi;
 	u64int addr, naddr;
 	uint nbuf, off;
@@ -115,7 +115,8 @@
 		}
 		if(t < minicachesleeptime)
 			t = minicachesleeptime;
-		sleep(t);
+		if(t > 0)
+			sleep(t);
 		trace(TraceProc, "icachewritesect nextchunk");
 		chunk = nextchunk(ix, is, &iedirty, &addr, &nbuf);
 
@@ -169,33 +170,29 @@
 					break;
 			}
 			packibucket(&ib, buf+off, is->bucketmagic);
-			/*
-			 * XXX This is not quite right - it's good that we 
-			 * update the cached block (if any) here, but
-			 * since the block doesn't get written until writepart
-			 * below, we also need to make sure that the cache 
-			 * doesn't load the stale block before we write it to
-			 * disk below.  We could lock the disk cache during
-			 * the writepart, but that's pretty annoying.
-			 * Another possibility would be never to cache
-			 * index partition blocks.  The hit rate on those is
-			 * miniscule anyway.
-			 */
-			if((b = _getdblock(is->part, naddr, ORDWR, 0)) != nil){
-				memmove(b->data, buf+off, bsize);
-				putdblock(b);
-			}
 		}
 
 		diskaccess(1);
 
 		trace(TraceProc, "icachewritesect writepart", addr, nbuf);
-		if(writepart(is->part, addr, buf, nbuf) < 0 ||
-		    flushpart(is->part) < 0){
+		werr = 0;
+		if(writepart(is->part, addr, buf, nbuf) < 0 || flushpart(is->part) < 0)
+			werr = -1;
+
+		for(i=0; i<nbuf; i+=bsize){
+			if((b = _getdblock(is->part, addr+i, ORDWR, 0)) != nil){
+				memmove(b->data, buf+i, bsize);
+				putdblock(b);
+			}
+		}
+
+		if(werr < 0){
 			fprint(2, "%s: part %s addr 0x%llux: icachewritesect "
 				"writepart: %r\n", argv0, is->part->name, addr);
+			err = -1;
 			continue;
 		}
+		
 		addstat(StatIsectWriteBytes, nbuf);
 		addstat(StatIsectWrite, 1);
 		icacheclean(chunk);
@@ -245,18 +242,20 @@
 	threadsetname("icachewritecoord");
 
 	ix = mainindex;
-	iwrite.as = diskstate();
+	iwrite.as = icachestate();
 
 	for(;;){
 		trace(TraceProc, "icachewritecoord sleep");
 		waitforkick(&iwrite.round);
 		trace(TraceWork, "start");
-		as = diskstate();
+		as = icachestate();
 		if(as.arena==iwrite.as.arena && as.aa==iwrite.as.aa){
 			/* will not be able to do anything more than last flush - kick disk */
+			fprint(2, "icache: nothing to do - kick dcache\n");
 			trace(TraceProc, "icachewritecoord kick dcache");
 			kickdcache();
 			trace(TraceProc, "icachewritecoord kicked dcache");
+			goto SkipWork;	/* won't do anything; don't bother rewriting bloom filter */
 		}
 		iwrite.as = as;
 
@@ -274,9 +273,11 @@
 				err |= recvul(ix->bloom->writedonechan);
 
 			trace(TraceProc, "icachewritecoord donewrite err=%d", err);
-			if(err == 0)
+			if(err == 0){
 				setatailstate(&iwrite.as);
+			}
 		}
+	SkipWork:
 		icacheclean(nil);	/* wake up anyone waiting */
 		trace(TraceWork, "finish");
 		addstat(StatIcacheFlush, 1);