venti: * stop storing unused wtime, train fields in IEntry. * prefetch arena tocs more gradually; set icacheprefetch=1 * yank out dead lump readahead code * yank out dead dcache readahead code * fix sync bug reported by anothy et al. * new, cleaner icache code mirrorarenas: * fix thread-stack access race (wsync) * new flag -s disables SHA1 checks, useful if destination is remote verifyarena: * accept venti-standard file:range syntax Reference: /n/sources/patch/applied/nventi Date: Mon Oct 1 03:36:25 CES 2007 Signed-off-by: rsc@swtch.com --- /sys/src/cmd/venti/srv/config.c Mon Oct 1 03:35:51 2007 +++ /sys/src/cmd/venti/srv/config.c Mon Oct 1 03:35:50 2007 @@ -245,3 +245,9 @@ return b; } +/* for OS X linker, which only resolves functions, not data */ +void +needmainindex(void) +{ +} + --- /sys/src/cmd/venti/srv/conv.c Mon Oct 1 03:35:51 2007 +++ /sys/src/cmd/venti/srv/conv.c Mon Oct 1 03:35:51 2007 @@ -581,9 +581,9 @@ scorecp(ie->score, p); p += VtScoreSize; - ie->wtime = U32GET(p); + /* ie->wtime = U32GET(p); */ p += U32Size; - ie->train = U16GET(p); + /* ie->train = U16GET(p); */ p += U16Size; if(p - buf != IEntryAddrOff) sysfatal("unpackentry bad IEntryAddrOff amount"); @@ -613,9 +613,9 @@ scorecp(p, ie->score); p += VtScoreSize; - U32PUT(p, ie->wtime); + U32PUT(p, 0); /* wtime */ p += U32Size; - U16PUT(p, ie->train); + U16PUT(p, 0); /* train */ p += U16Size; U64PUT(p, ie->ia.addr, t32); p += U64Size; --- /sys/src/cmd/venti/srv/mirrorarenas.c Mon Oct 1 03:35:52 2007 +++ /sys/src/cmd/venti/srv/mirrorarenas.c Mon Oct 1 03:35:51 2007 @@ -22,13 +22,14 @@ Part *dst; int force; int verbose; +int dosha1 = 1; char *status; uvlong astart, aend; void usage(void) { - fprint(2, "usage: mirrorarenas [-v] src dst [ranges]\n"); + fprint(2, "usage: mirrorarenas [-sv] src dst [ranges]\n"); threadexitsall("usage"); } @@ -92,6 +93,7 @@ * src with writing dst during copy. This is an easy factor of two * (almost) in performance. 
*/ +static Write wsync; static void writeproc(void *v) { @@ -99,7 +101,7 @@ USED(v); while((w = recvp(writechan)) != nil){ - if(w->n == 0) + if(w == &wsync) continue; if(ewritepart(dst, w->o, w->p, w->n) < 0) w->error = 1; @@ -146,11 +148,7 @@ /* * wait for queued write to finish */ - w[i].p = nil; - w[i].o = 0; - w[i].n = 0; - w[i].error = 0; - sendp(writechan, &w[i]); + sendp(writechan, &wsync); i = 1-i; if(w[i].error) return -1; @@ -240,7 +238,7 @@ mirror(Arena *sa, Arena *da) { vlong v, si, di, end; - int clumpmax, blocksize; + int clumpmax, blocksize, sealed; static uchar buf[MaxIoSize]; ArenaHead h; DigestState xds, *ds; @@ -305,7 +303,8 @@ shaoff = 0; ds = nil; - if(sa->diskstats.sealed && scorecmp(sa->score, zeroscore) != 0){ + sealed = sa->diskstats.sealed && scorecmp(sa->score, zeroscore) != 0; + if(sealed && dosha1){ /* start sha1 state with header */ memset(&xds, 0, sizeof xds); ds = &xds; @@ -362,7 +361,7 @@ if(ewritepart(dst, end, buf, blocksize) < 0) return; - if(ds){ + if(sealed){ /* * ... 
but on the final pass, copy the encoding * of the tail information from the source @@ -375,20 +374,27 @@ if(asha1(dst, shaoff, end, ds) < 0 || copy(end, end+blocksize-VtScoreSize, "tail", ds) < 0) return; - memset(buf, 0, VtScoreSize); - sha1(buf, VtScoreSize, da->score, ds); - if(scorecmp(sa->score, da->score) == 0){ + if(dosha1){ + memset(buf, 0, VtScoreSize); + sha1(buf, VtScoreSize, da->score, ds); + if(scorecmp(sa->score, da->score) == 0){ + if(verbose) + chat("%T %s: %V sealed mirrored\n", sa->name, sa->score); + if(ewritepart(dst, end+blocksize-VtScoreSize, da->score, VtScoreSize) < 0) + return; + }else{ + chat("%T %s: sealing dst: score mismatch: %V vs %V\n", sa->name, sa->score, da->score); + memset(&xds, 0, sizeof xds); + asha1(dst, base-blocksize, end+blocksize-VtScoreSize, &xds); + sha1(buf, VtScoreSize, 0, &xds); + chat("%T reseal: %V\n", da->score); + status = "errors"; + } + }else{ if(verbose) - chat("%T %s: %V sealed mirrored\n", sa->name, sa->score); - if(ewritepart(dst, end+blocksize-VtScoreSize, da->score, VtScoreSize) < 0) + chat("%T %s: %V mirrored\n", sa->name, sa->score); + if(ewritepart(dst, end+blocksize-VtScoreSize, sa->score, VtScoreSize) < 0) return; - }else{ - chat("%T %s: sealing dst: score mismatch: %V vs %V\n", sa->name, sa->score, da->score); - memset(&xds, 0, sizeof xds); - asha1(dst, base-blocksize, end+blocksize-VtScoreSize, &xds); - sha1(buf, VtScoreSize, 0, &xds); - chat("%T reseal: %V\n", da->score); - status = "errors"; } }else{ chat("%T %s: %,lld used mirrored\n", @@ -461,6 +467,9 @@ break; case 'v': verbose++; + break; + case 's': + dosha1 = 0; break; default: usage(); --- /sys/src/cmd/venti/srv/verifyarena.c Mon Oct 1 03:35:52 2007 +++ /sys/src/cmd/venti/srv/verifyarena.c Mon Oct 1 03:35:52 2007 @@ -7,6 +7,7 @@ static uchar *data; static int blocksize; static int sleepms; +static vlong offset0; void usage(void) @@ -22,7 +23,7 @@ for(nr = 0; nr < n; nr += m){ m = n - nr; - m = pread(fd, &buf[nr], m, off+nr); + m = pread(fd, 
&buf[nr], m, offset0+off+nr); if(m <= 0){ if(m == 0) werrstr("early eof"); @@ -175,7 +176,8 @@ char *p, *q, *table, *f[10], line[256]; vlong start, stop; ArenaPart ap; - + Part *part; + needzeroscore(); ventifmtinstall(); blocksize = MaxIoSize; @@ -201,8 +203,10 @@ threadexitsall(nil); } - if((fd = open(argv[0], OREAD)) < 0) - sysfatal("open %s: %r", argv[0]); + if((part = initpart(argv[0], OREAD)) == nil) + sysfatal("open partition %s: %r", argv[0]); + fd = part->fd; + offset0 = part->offset; if(preadblock(data, 8192, PartBlank) < 0) sysfatal("read arena part header: %r"); @@ -249,7 +253,7 @@ fprint(2, "%T %s: bad start,stop %lld,%lld\n", f[0], stop, start); continue; } - if(seek(fd, start, 0) < 0) + if(seek(fd, offset0+start, 0) < 0) fprint(2, "%T %s: seek to start: %r\n", f[0]); verifyarena(f[0], stop - start); } --- /sys/src/cmd/venti/srv/hdisk.c Mon Oct 1 03:35:54 2007 +++ /sys/src/cmd/venti/srv/hdisk.c Mon Oct 1 03:35:53 2007 @@ -547,7 +547,7 @@ Lump *u; IAddr ia; IEntry ie; - int i, rac; + int i; Arena *arena; u64int aa; ZBlock *zb; @@ -561,7 +561,7 @@ } hprint(&c->hout, "
\n", score);
- if(_lookupscore(score, -1, &ia, nil) < 0)
+ if(icachelookup(score, -1, &ia) < 0)
hprint(&c->hout, " icache: not found\n");
else
hprint(&c->hout, " icache: addr=%#llx size=%d type=%d blocks=%d\n",
@@ -585,12 +585,12 @@
hprint(&c->hout, " -cache");
putlump(u);
- if(lookupscore(score, type, &ia, &rac) < 0){
+ if(lookupscore(score, type, &ia) < 0){
hprint(&c->hout, " -lookup\n");
continue;
}
- hprint(&c->hout, "\n lookupscore: addr=%#llx size=%d blocks=%d rac=%d\n",
- ia.addr, ia.size, ia.blocks, rac);
+ hprint(&c->hout, "\n lookupscore: addr=%#llx size=%d blocks=%d\n",
+ ia.addr, ia.size, ia.blocks);
arena = amapitoa(mainindex, ia.addr, &aa);
if(arena == nil){
--- /sys/src/cmd/venti/srv/stats.c Mon Oct 1 03:35:54 2007
+++ /sys/src/cmd/venti/srv/stats.c Mon Oct 1 03:35:54 2007
@@ -60,6 +60,9 @@
{ "index cache flushes", },
{ "index cache stalls", },
{ "index cache read time", },
+ { "index cache lookups" },
+ { "index cache summary hits" },
+ { "index cache summary prefetches" },
{ "bloom filter hits", },
{ "bloom filter misses", },
@@ -81,6 +84,9 @@
{ "sum reads", },
{ "sum read bytes", },
+
+ { "cig loads" },
+ { "cig load time" },
};
QLock statslock;
--- /sys/src/cmd/venti/srv/lump.c Mon Oct 1 03:35:55 2007
+++ /sys/src/cmd/venti/srv/lump.c Mon Oct 1 03:35:54 2007
@@ -7,7 +7,7 @@
int writestodevnull = 0;
int verifywrites = 0;
-static Packet *readilump(Lump *u, IAddr *ia, u8int *score, int rac);
+static Packet *readilump(Lump *u, IAddr *ia, u8int *score);
/*
* Some of this logic is duplicated in hdisk.c
@@ -19,7 +19,6 @@
Packet *p;
IAddr ia;
u32int n;
- int rac;
trace(TraceLump, "readlump enter");
/*
@@ -49,7 +48,7 @@
if(cached)
*cached = 0;
- if(lookupscore(score, type, &ia, &rac) < 0){
+ if(lookupscore(score, type, &ia) < 0){
/* ZZZ place to check for someone trying to guess scores */
seterr(EOk, "no block with score %V/%d exists", score, type);
@@ -64,7 +63,7 @@
}
trace(TraceLump, "readlump readilump");
- p = readilump(u, &ia, score, rac);
+ p = readilump(u, &ia, score);
putlump(u);
trace(TraceLump, "readlump exit");
@@ -134,9 +133,8 @@
Packet *old;
IAddr ia;
int ok;
- int rac;
- if(lookupscore(u->score, u->type, &ia, &rac) == 0){
+ if(lookupscore(u->score, u->type, &ia) == 0){
if(verifywrites == 0){
/* assume the data is here! */
packetfree(p);
@@ -149,7 +147,7 @@
* if the read fails,
* assume it was corrupted data and store the block again
*/
- old = readilump(u, &ia, u->score, rac);
+ old = readilump(u, &ia, u->score);
if(old != nil){
ok = 0;
if(packetcmp(p, old) != 0){
@@ -176,8 +174,6 @@
ok = storeclump(mainindex, flat, u->score, u->type, creator, &ia);
freezblock(flat);
if(ok == 0)
- ok = insertscore(u->score, &ia, 1);
- if(ok == 0)
insertlump(u, p);
else
packetfree(p);
@@ -193,39 +189,14 @@
return ok;
}
-static void
-lreadahead(u64int a, Arena *arena, u64int aa, int n)
-{
- u8int buf[ClumpSize];
- Clump cl;
- IAddr ia;
-
- while(n > 0) {
- if (aa >= arena->memstats.used)
- break;
- if(readarena(arena, aa, buf, ClumpSize) < ClumpSize)
- break;
- if(unpackclump(&cl, buf, arena->clumpmagic) < 0)
- break;
- ia.addr = a;
- ia.type = cl.info.type;
- ia.size = cl.info.uncsize;
- ia.blocks = (cl.info.size + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog;
- insertscore(cl.info.score, &ia, 0);
- a += ClumpSize + cl.info.size;
- aa += ClumpSize + cl.info.size;
- n--;
- }
-}
-
static Packet*
-readilump(Lump *u, IAddr *ia, u8int *score, int rac)
+readilump(Lump *u, IAddr *ia, u8int *score)
{
Arena *arena;
ZBlock *zb;
Packet *p, *pp;
Clump cl;
- u64int a, aa;
+ u64int aa;
u8int sc[VtScoreSize];
trace(TraceLump, "readilump enter");
@@ -256,13 +227,6 @@
seterr(ECrash, "score mismatch");
freezblock(zb);
return nil;
- }
-
- if(rac == 0) {
- trace(TraceLump, "readilump readahead");
- a = ia->addr + ClumpSize + cl.info.size;
- aa += ClumpSize + cl.info.size;
- lreadahead(a, arena, aa, 20);
}
trace(TraceLump, "readilump success");
--- /sys/src/cmd/venti/srv/clump.c Mon Oct 1 03:35:56 2007
+++ /sys/src/cmd/venti/srv/clump.c Mon Oct 1 03:35:55 2007
@@ -62,18 +62,16 @@
memset(cb->data+ClumpSize+dsize, 0, 4);
cl.info.size = dsize;
- ia->addr = 0;
- ia->type = type;
- ia->size = size;
- ia->blocks = (dsize + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog;
-
- a = writeiclump(ix, &cl, cb->data, &ia->addr);
-
+ a = writeiclump(ix, &cl, cb->data);
trace(TraceLump, "storeclump exit %lld", a);
-
freezblock(cb);
if(a == TWID64)
return -1;
+
+ ia->addr = a;
+ ia->type = type;
+ ia->size = size;
+ ia->blocks = (dsize + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog;
/*
qlock(&stats.lock);
--- /sys/src/cmd/venti/srv/arena.c Mon Oct 1 03:35:57 2007
+++ /sys/src/cmd/venti/srv/arena.c Mon Oct 1 03:35:56 2007
@@ -16,6 +16,7 @@
static CIBlock *getcib(Arena *arena, int clump, int writing, CIBlock *rock);
static void putcib(Arena *arena, CIBlock *cib);
static void sumproc(void *);
+static void loadcig(Arena *arena);
static QLock sumlock;
static Rendez sumwait;
@@ -65,7 +66,7 @@
}
if(arena->diskstats.sealed && scorecmp(zeroscore, arena->score)==0)
- backsumarena(arena);
+ sealarena(arena);
return arena;
}
@@ -137,14 +138,23 @@
CIBlock *cib, r;
int i;
- for(i = 0; i < n; i++){
+ /*
+ * because the clump blocks are laid out
+ * in reverse order at the end of the arena,
+ * it can be a few percent faster to read
+ * the clumps backwards, which reads the
+ * disk blocks forwards.
+ */
+ for(i = n-1; i >= 0; i--){
cib = getcib(arena, clump + i, 0, &r);
- if(cib == nil)
- break;
+ if(cib == nil){
+ n = i;
+ continue;
+ }
unpackclumpinfo(&cis[i], &cib->data->data[cib->offset]);
putcib(arena, cib);
}
- return i;
+ return n;
}
/*
@@ -283,13 +293,12 @@
filling up and real errors writing the clump?
*/
u64int
-writeaclump(Arena *arena, Clump *c, u8int *clbuf, u64int start, u64int *pa)
+writeaclump(Arena *arena, Clump *c, u8int *clbuf)
{
DBlock *b;
u64int a, aa;
u32int clump, n, nn, m, off, blocksize;
int ok;
- AState as;
n = c->info.size + ClumpSize + U32Size;
qlock(&arena->lock);
@@ -299,10 +308,6 @@
if(!arena->memstats.sealed){
logerr(EOk, "seal memstats %s", arena->name);
arena->memstats.sealed = 1;
- as.arena = arena;
- as.aa = start+aa;
- as.stats = arena->memstats;
- setdcachestate(&as);
}
qunlock(&arena->lock);
return TWID64;
@@ -349,7 +354,28 @@
if(c->info.size < c->info.uncsize)
arena->memstats.cclumps++;
- clump = arena->memstats.clumps++;
+ clump = arena->memstats.clumps;
+ if(clump % ArenaCIGSize == 0){
+ if(arena->cig == nil){
+ loadcig(arena);
+ if(arena->cig == nil)
+ goto NoCIG;
+ }
+ /* add aa as start of next cig */
+ if(clump/ArenaCIGSize != arena->ncig){
+ fprint(2, "bad arena cig computation %s: writing clump %d but %d cigs\n",
+ arena->name, clump, arena->ncig);
+ arena->ncig = -1;
+ vtfree(arena->cig);
+ arena->cig = nil;
+ goto NoCIG;
+ }
+ arena->cig = vtrealloc(arena->cig, (arena->ncig+1)*sizeof arena->cig[0]);
+ arena->cig[arena->ncig++].offset = aa;
+ }
+NoCIG:
+ arena->memstats.clumps++;
+
if(arena->memstats.clumps == 0)
sysfatal("clumps wrapped");
arena->wtime = now();
@@ -359,14 +385,6 @@
writeclumpinfo(arena, clump, &c->info);
wbarena(arena);
- /* set up for call to setdcachestate */
- as.arena = arena;
- as.aa = start+arena->memstats.used;
- as.stats = arena->memstats;
-
- /* update this before calling setdcachestate so it cannot be behind dcache.diskstate */
- *pa = start+aa;
- setdcachestate(&as);
qunlock(&arena->lock);
return aa;
@@ -415,6 +433,7 @@
/*
* Look up as->arena to find index.
*/
+ needmainindex(); /* OS X linker */
ix = mainindex;
for(i=0; inarenas; i++)
if(ix->arenas[i] == as->arena)
@@ -515,6 +534,7 @@
/*
* read & sum all blocks except the last one
*/
+ flushdcache();
memset(&s, 0, sizeof s);
b = alloczblock(bs, 0, arena->part->blocksize);
e = arena->base + arena->size;
@@ -550,24 +570,19 @@
sha1(b->data, bs-VtScoreSize, nil, &s);
sha1(zeroscore, VtScoreSize, nil, &s);
sha1(nil, 0, score, &s);
-
+
/*
* check for no checksum or the same
- *
- * the writepart is okay because we flushed the dcache in sealarena
*/
- if(scorecmp(score, &b->data[bs - VtScoreSize]) != 0){
- if(scorecmp(zeroscore, &b->data[bs - VtScoreSize]) != 0)
- logerr(EOk, "overwriting mismatched checksums for arena=%s, found=%V calculated=%V",
- arena->name, &b->data[bs - VtScoreSize], score);
- scorecp(&b->data[bs - VtScoreSize], score);
- if(writepart(arena->part, e, b->data, bs) < 0)
- logerr(EOk, "sumarena can't write sum for %s: %r", arena->name);
- }
+ if(scorecmp(score, &b->data[bs - VtScoreSize]) != 0
+ && scorecmp(zeroscore, &b->data[bs - VtScoreSize]) != 0)
+ logerr(EOk, "overwriting mismatched checksums for arena=%s, found=%V calculated=%V",
+ arena->name, &b->data[bs - VtScoreSize], score);
freezblock(b);
qlock(&arena->lock);
scorecp(arena->score, score);
+ wbarena(arena);
qunlock(&arena->lock);
}
@@ -586,6 +601,7 @@
}
dirtydblock(b, DirtyArenaTrailer);
bad = okarena(arena)<0 || packarena(arena, b->data)<0;
+ scorecp(b->data + arena->blocksize - VtScoreSize, arena->score);
putdblock(b);
if(bad)
return -1;
@@ -753,4 +769,158 @@
putdblock(cib->data);
cib->data = nil;
+}
+
+
+/*
+ * For index entry readahead purposes, the arenas are
+ * broken into smaller subpieces, called clump info groups
+ * or cigs. Each cig has ArenaCIGSize clumps (ArenaCIGSize
+ * is chosen to make the index entries take up about half
+ * a megabyte). The index entries do not contain enough
+ * information to determine what the clump index is for
+ * a given address in an arena. That info is needed both for
+ * figuring out which clump group an address belongs to
+ * and for prefetching a clump group's index entries from
+ * the arena table of contents. The first time clump groups
+ * are accessed, we scan the entire arena table of contents
+ * (which might be 10s of megabytes), recording the data
+ * offset of each clump group.
+ */
+
+/*
+ * load clump info group information by scanning entire toc.
+ */
+static void
+loadcig(Arena *arena)
+{
+ u32int i, j, ncig, nci;
+ ArenaCIG *cig;
+ ClumpInfo *ci;
+ u64int offset;
+ int ms;
+
+ if(arena->cig || arena->ncig < 0)
+ return;
+
+// fprint(2, "loadcig %s\n", arena->name);
+
+ ncig = (arena->memstats.clumps+ArenaCIGSize-1) / ArenaCIGSize;
+ if(ncig == 0){
+ arena->cig = vtmalloc(1);
+ arena->ncig = 0;
+ return;
+ }
+
+ ms = msec();
+ cig = vtmalloc(ncig*sizeof cig[0]);
+ ci = vtmalloc(ArenaCIGSize*sizeof ci[0]);
+ offset = 0;
+ for(i=0; incig = -1;
+ fprint(2, "loadcig %s: got %ud cigs, expected %ud\n", arena->name, i+1, ncig);
+ goto out;
+ }
+ }
+ }
+ vtfree(ci);
+
+ arena->ncig = ncig;
+ arena->cig = cig;
+
+out:
+ ms = msec() - ms;
+ addstat2(StatCigLoad, 1, StatCigLoadTime, ms);
+}
+
+/*
+ * convert arena address into arena group + data boundaries.
+ */
+int
+arenatog(Arena *arena, u64int addr, u64int *gstart, u64int *glimit, int *g)
+{
+ int r, l, m;
+
+ qlock(&arena->lock);
+ if(arena->cig == nil)
+ loadcig(arena);
+ if(arena->cig == nil || arena->ncig == 0){
+ qunlock(&arena->lock);
+ return -1;
+ }
+
+ l = 1;
+ r = arena->ncig - 1;
+ while(l <= r){
+ m = (r + l) / 2;
+ if(arena->cig[m].offset <= addr)
+ l = m + 1;
+ else
+ r = m - 1;
+ }
+ l--;
+
+ *g = l;
+ *gstart = arena->cig[l].offset;
+ if(l+1 < arena->ncig)
+ *glimit = arena->cig[l+1].offset;
+ else
+ *glimit = arena->memstats.used;
+ qunlock(&arena->lock);
+ return 0;
+}
+
+/*
+ * load the clump info for group g into the index entries.
+ */
+int
+asumload(Arena *arena, int g, IEntry *entries, int nentries)
+{
+ int i, base, limit;
+ u64int addr;
+ ClumpInfo ci;
+ IEntry *ie;
+
+ if(nentries < ArenaCIGSize){
+ fprint(2, "asking for too few entries\n");
+ return -1;
+ }
+
+ qlock(&arena->lock);
+ if(arena->cig == nil)
+ loadcig(arena);
+ if(arena->cig == nil || arena->ncig == 0 || g >= arena->ncig){
+ qunlock(&arena->lock);
+ return -1;
+ }
+
+ addr = 0;
+ base = g*ArenaCIGSize;
+ limit = base + ArenaCIGSize;
+ if(base > arena->memstats.clumps)
+ base = arena->memstats.clumps;
+ ie = entries;
+ for(i=base; iscore, ci.score);
+ ie->ia.type = ci.type;
+ ie->ia.size = ci.uncsize;
+ ie->ia.blocks = (ci.size + ClumpSize + (1<> ABlockLog;
+ ie->ia.addr = addr;
+ ie++;
+ }
+ addr += ClumpSize + ci.size;
+ }
+ qunlock(&arena->lock);
+ return ie - entries;
}
--- /sys/src/cmd/venti/srv/dcache.c Mon Oct 1 03:35:57 2007
+++ /sys/src/cmd/venti/srv/dcache.c Mon Oct 1 03:35:57 2007
@@ -55,15 +55,6 @@
u8int *mem; /* memory for all block descriptors */
int ndirty; /* number of dirty blocks */
int maxdirty; /* max. number of dirty blocks */
- Channel *ra;
- u8int *rabuf;
- u32int ramax;
- u32int rasize;
- u64int raaddr;
- Part *rapart;
-
- AState diskstate;
- AState state;
};
typedef struct Ra Ra;
@@ -82,7 +73,6 @@
static void fixheap(int i, DBlock *b);
static void flushproc(void*);
static void writeproc(void*);
-static void raproc(void*);
void
initdcache(u32int mem)
@@ -109,7 +99,6 @@
dcache.blocks = MKNZ(DBlock, nblocks);
dcache.write = MKNZ(DBlock*, nblocks);
dcache.mem = MKNZ(u8int, (nblocks+1+128) * blocksize);
- dcache.ra = chancreate(sizeof(Ra), 0);
last = nil;
p = (u8int*)(((ulong)dcache.mem+blocksize-1)&~(ulong)(blocksize-1));
@@ -121,10 +110,6 @@
b->next = last;
last = b;
}
- dcache.rabuf = &p[i*blocksize];
- dcache.ramax = 128*blocksize;
- dcache.raaddr = 0;
- dcache.rapart = nil;
dcache.free = last;
dcache.nheap = 0;
@@ -133,136 +118,6 @@
vtproc(flushproc, nil);
vtproc(delaykickroundproc, &dcache.round);
- vtproc(raproc, nil);
-}
-
-void
-setdcachestate(AState *a)
-{
- trace(TraceBlock, "setdcachestate %s 0x%llux clumps %d", a->arena ? a->arena->name : nil, a->aa, a->stats.clumps);
- qlock(&dcache.lock);
- dcache.state = *a;
- qunlock(&dcache.lock);
-}
-
-AState
-diskstate(void)
-{
- AState a;
-
- qlock(&dcache.lock);
- a = dcache.diskstate;
- qunlock(&dcache.lock);
- return a;
-}
-
-static void
-raproc(void *v)
-{
- Ra ra;
- DBlock *b;
-
- USED(v);
- while(recv(dcache.ra, &ra) == 1){
- if(ra.part->size <= ra.addr)
- continue;
- b = _getdblock(ra.part, ra.addr, OREAD, 2);
- putdblock(b);
- }
-}
-
-/*
- * We do readahead a whole arena at a time now,
- * so dreadahead is a no-op. The original implementation
- * is in unused_dreadahead below.
- */
-void
-dreadahead(Part *part, u64int addr, int miss)
-{
- USED(part);
- USED(addr);
- USED(miss);
-}
-
-void
-unused_dreadahead(Part *part, u64int addr, int miss)
-{
- Ra ra;
- static struct {
- Part *part;
- u64int addr;
- } lastmiss;
- static struct {
- Part *part;
- u64int addr;
- int dir;
- } lastra;
-
- if(miss){
- if(lastmiss.part==part && lastmiss.addr==addr-dcache.size){
- XRa:
- lastra.part = part;
- lastra.dir = addr-lastmiss.addr;
- lastra.addr = addr+lastra.dir;
- ra.part = part;
- ra.addr = lastra.addr;
- nbsend(dcache.ra, &ra);
- }else if(lastmiss.part==part && lastmiss.addr==addr+dcache.size){
- addr -= dcache.size;
- goto XRa;
- }
- }else{
- if(lastra.part==part && lastra.addr==addr){
- lastra.addr += lastra.dir;
- ra.part = part;
- ra.addr = lastra.addr;
- nbsend(dcache.ra, &ra);
- }
- }
-
- if(miss){
- lastmiss.part = part;
- lastmiss.addr = addr;
- }
-}
-
-int
-rareadpart(Part *part, u64int addr, u8int *buf, uint n, int load)
-{
- uint nn;
- static RWLock ralock;
-
- rlock(&ralock);
- if(dcache.rapart==part && dcache.raaddr <= addr && addr+n <= dcache.raaddr+dcache.rasize){
- memmove(buf, dcache.rabuf+(addr-dcache.raaddr), n);
- runlock(&ralock);
- return 0;
- }
- if(load != 2 || addr >= part->size){ /* addr >= part->size: let readpart do the error */
- runlock(&ralock);
- diskaccess(0);
- return readpart(part, addr, buf, n);
- }
-
- runlock(&ralock);
- wlock(&ralock);
-fprint(2, "raread %s %llx\n", part->name, addr);
- nn = dcache.ramax;
- if(addr+nn > part->size)
- nn = part->size - addr;
- diskaccess(0);
- if(readpart(part, addr, dcache.rabuf, nn) < 0){
- wunlock(&ralock);
- return -1;
- }
- memmove(buf, dcache.rabuf, n);
- dcache.rapart = part;
- dcache.rasize = nn;
- dcache.raaddr = addr;
- wunlock(&ralock);
-
- addstat(StatApartReadBytes, nn-n);
- return 0;
}
static u32int
@@ -313,16 +168,8 @@
again:
for(b = dcache.heads[h]; b != nil; b = b->next){
if(b->part == part && b->addr == addr){
- /*
- qlock(&stats.lock);
- stats.pchit++;
- qunlock(&stats.lock);
- */
- if(load){
+ if(load)
addstat(StatDcacheHit, 1);
- if(load != 2 && mode != OWRITE)
- dreadahead(part, b->addr, 0);
- }
goto found;
}
}
@@ -367,8 +214,6 @@
b->addr = addr;
b->part = part;
b->size = 0;
- if(load != 2 && mode != OWRITE)
- dreadahead(part, b->addr, 1);
found:
b->ref++;
@@ -405,7 +250,8 @@
memset(&b->data[b->size], 0, size - b->size);
else{
trace(TraceBlock, "getdblock readpart %s 0x%llux", part->name, addr);
- if(rareadpart(part, addr + b->size, &b->data[b->size], size - b->size, load) < 0){
+ diskaccess(0);
+ if(readpart(part, addr + b->size, &b->data[b->size], size - b->size) < 0){
b->mode = ORDWR; /* so putdblock wunlocks */
putdblock(b);
return nil;
@@ -768,7 +614,6 @@
int i, j, n;
ulong t0;
DBlock *b, **write;
- AState as;
USED(v);
threadsetname("flushproc");
@@ -779,10 +624,6 @@
t0 = nsec()/1000;
trace(TraceProc, "build t=%lud", (ulong)(nsec()/1000)-t0);
- qlock(&dcache.lock);
- as = dcache.state;
- qunlock(&dcache.lock);
-
write = dcache.write;
n = 0;
for(i=0; iwriting);
for(i = ix->mapalloc; i < ix->narenas; i++){
- a = writeaclump(ix->arenas[i], c, clbuf, ix->amap[i].start, pa);
+ a = writeaclump(ix->arenas[i], c, clbuf);
if(a != TWID64){
- ix->mapalloc = i; /* assuming write is atomic, race is okay */
+ ix->mapalloc = i;
+ ia.addr = ix->amap[i].start + a;
+ ia.type = c->info.type;
+ ia.size = c->info.uncsize;
+ ia.blocks = (c->info.size + ClumpSize + (1<> ABlockLog;
+ as.arena = ix->arenas[i];
+ as.aa = ia.addr;
+ as.stats = as.arena->memstats;
+ insertscore(c->info.score, &ia, IEDirty, &as);
+ qunlock(&ix->writing);
trace(TraceLump, "writeiclump exit");
- return a;
+ return ia.addr;
}
}
+ qunlock(&ix->writing);
seterr(EAdmin, "no space left in arenas");
trace(TraceLump, "writeiclump failed");
@@ -594,6 +607,25 @@
}
*aa = a - ix->amap[l].start;
return ix->arenas[l];
+}
+
+/*
+ * convert an arena index to the bounds of the containing arena group.
+ */
+Arena*
+amapitoag(Index *ix, u64int a, u64int *gstart, u64int *glimit, int *g)
+{
+ u64int aa;
+ Arena *arena;
+
+ arena = amapitoa(ix, a, &aa);
+ if(arena == nil)
+ return nil;
+ if(arenatog(arena, aa, gstart, glimit, g) < 0)
+ return nil;
+ *gstart += a - aa;
+ *glimit += a - aa;
+ return arena;
}
int
--- /sys/src/cmd/venti/srv/icache.c Mon Oct 1 03:35:59 2007
+++ /sys/src/cmd/venti/srv/icache.c Mon Oct 1 03:35:59 2007
@@ -2,236 +2,429 @@
#include "dat.h"
#include "fns.h"
+int icacheprefetch = 1;
+
typedef struct ICache ICache;
+typedef struct IHash IHash;
+typedef struct ISum ISum;
+
struct ICache
{
- QLock lock; /* locks hash table & all associated data */
+ QLock lock;
Rendez full;
- IEntry **heads; /* heads of all the hash chains */
- int bits; /* bits to use for indexing heads */
- u32int size; /* number of heads; == 1 << bits, should be < entries */
- IEntry *base; /* all allocated hash table entries */
- IEntry *free;
- u32int entries; /* elements in base */
- IEntry *dirty; /* chain of dirty elements */
- u32int ndirty;
+ IHash *hash;
+ IEntry *entries;
+ int nentries;
+ IEntry free;
+ IEntry clean;
+ IEntry dirty;
u32int maxdirty;
- u32int unused; /* index of first unused element in base */
- u32int stolen; /* last head from which an element was stolen */
+ u32int ndirty;
+ AState as;
- Arena *last[4];
- Arena *lastload;
- int nlast;
+ ISum **sum;
+ int nsum;
+ IHash *shash;
+ IEntry *sentries;
+ int nsentries;
};
-int icacheprefetch = 0; /* interferes with playing music via vacfs */
-
static ICache icache;
-static IEntry *icachealloc(IAddr *ia, u8int *score);
-
/*
- * bits is the number of bits in the icache hash table
- * depth is the average depth
- * memory usage is about (1<table[0]));
+ ih->table = (IEntry**)(ih+1);
+ ih->bits = bits;
+ ih->size = size;
+ return ih;
}
-u32int
-hashbits(u8int *sc, int bits)
+static IEntry*
+ihashlookup(IHash *ih, u8int score[VtScoreSize], int type)
{
- u32int v;
-
- v = (sc[0] << 24) | (sc[1] << 16) | (sc[2] << 8) | sc[3];
- if(bits < 32)
- v >>= (32 - bits);
- return v;
+ u32int h;
+ IEntry *ie;
+
+ h = hashbits(score, ih->bits);
+ for(ie=ih->table[h]; ie; ie=ie->nexthash)
+ if((type == -1 || type == ie->ia.type) && scorecmp(score, ie->score) == 0)
+ return ie;
+ return nil;
}
static void
-loadarenaclumps(Arena *arena, u64int aa)
+ihashdelete(IHash *ih, IEntry *ie, char *what)
{
- ulong i;
- ClumpInfo ci;
- IAddr ia;
-
- for(i=0; imemstats.clumps; i++){
- if(readclumpinfo(arena, i, &ci) < 0)
- break;
- ia.type = ci.type;
- ia.size = ci.uncsize;
- ia.blocks = (ci.size + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog;
- ia.addr = aa;
- aa += ClumpSize + ci.size;
- if(ia.type != VtCorruptType)
- insertscore(ci.score, &ia, 0);
- }
+ u32int h;
+ IEntry **l;
+
+ h = hashbits(ie->score, ih->bits);
+ for(l=&ih->table[h]; *l; l=&(*l)->nexthash)
+ if(*l == ie){
+ *l = ie->nexthash;
+ return;
+ }
+ fprint(2, "warning: %s %V not found in ihashdelete\n", what, ie->score);
}
-int
-_lookupscore(u8int *score, int type, IAddr *ia, int *rac)
+static void
+ihashinsert(IHash *ih, IEntry *ie)
{
u32int h;
- IEntry *ie, *last;
-
- qlock(&icache.lock);
- h = hashbits(score, icache.bits);
- last = nil;
- for(ie = icache.heads[h]; ie != nil; ie = ie->next){
- if((ie->ia.type == type || type == -1) && scorecmp(ie->score, score)==0){
- if(last != nil)
- last->next = ie->next;
- else
- icache.heads[h] = ie->next;
- addstat(StatIcacheHit, 1);
- if(rac)
- ie->rac = 1;
- trace(TraceLump, "lookupscore incache");
- ie->next = icache.heads[h];
- icache.heads[h] = ie;
-
- *ia = ie->ia;
- if(rac)
- *rac = ie->rac;
- qunlock(&icache.lock);
- return 0;
- }
- last = ie;
- }
- addstat(StatIcacheMiss, 1);
- qunlock(&icache.lock);
- return -1;
+
+ h = hashbits(ie->score, ih->bits);
+ ie->nexthash = ih->table[h];
+ ih->table[h] = ie;
}
/*
-ZZZ need to think about evicting the correct IEntry,
-and writing back the wtime.
- * look up data score in the index cache
- * if this fails, pull it in from the disk index table, if it exists.
- *
- * must be called with the lump for this score locked
+ * IEntry lists.
*/
-int
-lookupscore(u8int *score, int type, IAddr *ia, int *rac)
+
+static IEntry*
+popout(IEntry *ie)
{
- IEntry d, *ie;
- u32int h;
- u64int aa;
- Arena *load;
- int i, ret;
- uint ms;
+ if(ie->prev == nil && ie->next == nil)
+ return ie;
+ ie->prev->next = ie->next;
+ ie->next->prev = ie->prev;
+ ie->next = nil;
+ ie->prev = nil;
+ return ie;
+}
- aa = 0;
- ms = msec();
-
- trace(TraceLump, "lookupscore %V.%d", score, type);
+static IEntry*
+poplast(IEntry *list)
+{
+ if(list->prev == list)
+ return nil;
+ return popout(list->prev);
+}
+
+static IEntry*
+pushfirst(IEntry *list, IEntry *ie)
+{
+ popout(ie);
+ ie->prev = list;
+ ie->next = list->next;
+ ie->prev->next = ie;
+ ie->next->prev = ie;
+ return ie;
+}
- ret = 0;
- if(_lookupscore(score, type, ia, rac) < 0){
- if(loadientry(mainindex, score, type, &d) < 0){
- ret = -1;
- goto out;
- }
+/*
+ * Arena summary cache.
+ */
+struct ISum
+{
+ QLock lock;
+ IEntry *entries;
+ int nentries;
+ int loaded;
+ u64int addr;
+ u64int limit;
+ Arena *arena;
+ int g;
+};
+
+static ISum*
+scachelookup(u64int addr)
+{
+ int i;
+ ISum *s;
- /* failed in cache but found on disk - fill cache. */
- trace(TraceLump, "lookupscore loaded");
- addstat(StatIcacheFill, 1);
-
- /*
- * no one else can load an entry for this score,
- * since we have this score's lump's lock.
- */
- qlock(&icache.lock);
-
- /*
- * If we notice that all the hits are coming from one arena,
- * load the table of contents for that arena into the cache.
- */
- load = nil;
- h = hashbits(score, icache.bits);
- ie = icachealloc(&d.ia, score);
- if(icacheprefetch){
- icache.last[icache.nlast++%nelem(icache.last)] = amapitoa(mainindex, ie->ia.addr, &aa);
- aa = ie->ia.addr - aa; /* compute base addr of arena */
- for(i=0; iaddr <= addr && addr < s->limit){
+ if(i > 0){
+ memmove(icache.sum+1, icache.sum, i*sizeof icache.sum[0]);
+ icache.sum[0] = s;
}
+ return s;
}
+ }
+ return nil;
+}
+
+static void
+sumclear(ISum *s)
+{
+ int i;
+
+ for(i=0; inentries; i++)
+ ihashdelete(icache.shash, &s->entries[i], "scache");
+ s->nentries = 0;
+ s->loaded = 0;
+ s->addr = 0;
+ s->limit = 0;
+ s->arena = nil;
+ s->g = 0;
+}
+
+static ISum*
+scacheevict(void)
+{
+ ISum *s;
+ int i;
- ie->next = icache.heads[h];
- icache.heads[h] = ie;
-
- *ia = ie->ia;
- *rac = ie->rac;
-
- qunlock(&icache.lock);
- if(load){
- trace(TraceProc, "preload 0x%llux", aa);
- loadarenaclumps(load, aa);
+ for(i=icache.nsum-1; i>=0; i--){
+ s = icache.sum[i];
+ if(canqlock(&s->lock)){
+ if(i > 0){
+ memmove(icache.sum+1, icache.sum, i*sizeof icache.sum[0]);
+ icache.sum[0] = s;
+ }
+ sumclear(s);
+ return s;
}
}
+ return nil;
+}
-out:
- ms = msec() - ms;
- addstat2(StatIcacheRead, 1, StatIcacheReadTime, ms);
+static void
+scachehit(u64int addr)
+{
+ scachelookup(addr); /* for move-to-front */
+}
- return ret;
+static void	/* bind summary slot s to the clump info group containing index address addr */
+scachesetup(ISum *s, u64int addr)
+{
+	u64int addr0 = 0, limit = 0;	/* amapitoag leaves its out-parameters unset when it returns nil */
+	int g = 0;
+
+	s->arena = amapitoag(mainindex, addr, &addr0, &limit, &g);
+	s->addr = addr0;	/* on failure addr == limit == 0: an empty range that scachelookup never matches */
+	s->limit = limit;
+	s->g = g;
+}
+
+static void
+scacheload(ISum *s)
+{
+ int i, n;
+
+ s->loaded = 1;
+ n = asumload(s->arena, s->g, s->entries, ArenaCIGSize);
+ /*
+ * n can be less than ArenaCIGSize, either if the clump group
+ * is the last in the arena and is only partially filled, or if there
+ * are corrupt clumps in the group -- those are not returned.
+ */
+ for(i=0; ientries[i].ia.addr += s->addr;
+ ihashinsert(icache.shash, &s->entries[i]);
+ }
+//fprint(2, "%T scacheload %s %d - %d entries\n", s->arena->name, s->g, n);
+ addstat(StatScachePrefetch, n);
+ s->nentries = n;
+}
+
+static ISum*
+scachemiss(u64int addr)
+{
+ ISum *s;
+
+ s = scachelookup(addr);
+ if(s == nil){
+ /* first time: make an entry in the cache but don't populate it yet */
+ s = scacheevict();
+ if(s == nil)
+ return nil;
+ scachesetup(s, addr);
+ qunlock(&s->lock);
+ return nil;
+ }
+
+ /* second time: load from disk */
+ qlock(&s->lock);
+ if(s->loaded || !icacheprefetch){
+ qunlock(&s->lock);
+ return nil;
+ }
+
+ return s; /* locked */
}
/*
- * insert a new element in the hash table.
+ * Index cache.
*/
-int
-insertscore(u8int *score, IAddr *ia, int write)
+
+void
+initicache(u32int mem0)
{
- IEntry *ie, se;
- u32int h;
+ u32int mem;
+ int i, entries, scache;
+
+ icache.full.l = &icache.lock;
- trace(TraceLump, "insertscore enter");
- if(write)
- addstat(StatIcacheWrite, 1);
- else
- addstat(StatIcachePrefetch, 1);
+ mem = mem0;
+ entries = mem / (sizeof(IEntry)+sizeof(IEntry*));
+ scache = (entries/8) / ArenaCIGSize;
+ entries -= entries/8;
+ if(scache < 4)
+ scache = 4;
+ if(scache > 16)
+ scache = 16;
+ if(entries < 1000)
+ entries = 1000;
+fprint(2, "icache %,d bytes = %,d entries; %d scache\n", mem0, entries, scache);
+
+ icache.clean.prev = icache.clean.next = &icache.clean;
+ icache.dirty.prev = icache.dirty.next = &icache.dirty;
+ icache.free.prev = icache.free.next = &icache.free;
+
+ icache.hash = mkihash(entries);
+ icache.nentries = entries;
+ setstat(StatIcacheSize, entries);
+ icache.entries = vtmallocz(entries*sizeof icache.entries[0]);
+ icache.maxdirty = entries / 2;
+ for(i=0; ientries = icache.sentries + i*ArenaCIGSize;
+ }
+}
- qlock(&icache.lock);
- h = hashbits(score, icache.bits);
- ie = icachealloc(ia, score);
- if(write){
+static IEntry*
+evictlru(void)
+{
+ IEntry *ie;
+
+ ie = poplast(&icache.clean);
+ if(ie == nil)
+ return nil;
+ ihashdelete(icache.hash, ie, "evictlru");
+ return ie;
+}
+
+static void
+icacheinsert(u8int score[VtScoreSize], IAddr *ia, int state)
+{
+ IEntry *ie;
+
+ if((ie = poplast(&icache.free)) == nil && (ie = evictlru()) == nil){
+ addstat(StatIcacheStall, 1);
+ while((ie = poplast(&icache.free)) == nil && (ie = evictlru()) == nil){
+ // Could safely return here if state == IEClean.
+ // But if state == IEDirty, have to wait to make
+ // sure we don't lose an index write.
+ // Let's wait all the time.
+ flushdcache();
+ kickicache();
+ rsleep(&icache.full);
+ }
+ addstat(StatIcacheStall, -1);
+ }
+
+ memmove(ie->score, score, VtScoreSize);
+ ie->state = state;
+ ie->ia = *ia;
+ if(state == IEClean){
+ addstat(StatIcachePrefetch, 1);
+ pushfirst(&icache.clean, ie);
+ }else{
+ addstat(StatIcacheWrite, 1);
+ assert(state == IEDirty);
icache.ndirty++;
setstat(StatIcacheDirty, icache.ndirty);
delaykickicache();
- ie->dirty = 1;
+ pushfirst(&icache.dirty, ie);
}
- ie->next = icache.heads[h];
- icache.heads[h] = ie;
+ ihashinsert(icache.hash, ie);
+}
+
+int
+icachelookup(u8int score[VtScoreSize], int type, IAddr *ia)
+{
+ IEntry *ie;
- se = *ie;
+ qlock(&icache.lock);
+ addstat(StatIcacheLookup, 1);
+ if((ie = ihashlookup(icache.hash, score, type)) != nil){
+ *ia = ie->ia;
+ if(ie->state == IEClean)
+ pushfirst(&icache.clean, ie);
+ addstat(StatIcacheHit, 1);
+ qunlock(&icache.lock);
+ return 0;
+ }
+
+ if((ie = ihashlookup(icache.shash, score, type)) != nil){
+ *ia = ie->ia;
+ icacheinsert(score, &ie->ia, IEClean);
+ scachehit(ie->ia.addr);
+ addstat(StatScacheHit, 1);
+ qunlock(&icache.lock);
+ return 0;
+ }
+ addstat(StatIcacheMiss, 1);
qunlock(&icache.lock);
- if(write && icache.ndirty >= icache.maxdirty)
+ return -1;
+}
+
+int
+insertscore(u8int score[VtScoreSize], IAddr *ia, int state, AState *as)
+{
+ ISum *toload;
+
+ qlock(&icache.lock);
+ icacheinsert(score, ia, state);
+ if(state == IEClean)
+ toload = scachemiss(ia->addr);
+ else{
+ assert(state == IEDirty);
+ toload = nil;
+ if(as == nil)
+ fprint(2, "%T insertscore IEDirty without as; called from %lux\n", getcallerpc(&score));
+ else{
+ if(icache.as.aa > as->aa)
+ fprint(2, "%T insertscore: aa moving backward: %#llux -> %#llux\n", icache.as.aa, as->aa);
+ icache.as = *as;
+ }
+ }
+ qunlock(&icache.lock);
+ if(toload){
+ scacheload(toload);
+ qunlock(&toload->lock);
+ }
+
+ if(icache.ndirty >= icache.maxdirty)
kickicache();
/*
@@ -240,125 +433,81 @@
* the lump, meaning any searches for this block
* will hit in the lump cache until after we return.
*/
- markbloomfilter(mainindex->bloom, score);
+ if(state == IEDirty)
+ markbloomfilter(mainindex->bloom, score);
return 0;
}
-/*
- * allocate a index cache entry which hasn't been used in a while.
- * must be called with icache.lock locked
- * if the score is already in the table, update the entry.
- */
-static IEntry *
-icachealloc(IAddr *ia, u8int *score)
+static int
+lookupscore_untimed(u8int score[VtScoreSize], int type, IAddr *ia)
{
- int i;
- IEntry *ie, *last, *clean, *lastclean;
- u32int h;
+ IEntry d;
- h = hashbits(score, icache.bits);
- last = nil;
- for(ie = icache.heads[h]; ie != nil; ie = ie->next){
- if(ie->ia.type == ia->type && scorecmp(ie->score, score)==0){
- if(last != nil)
- last->next = ie->next;
- else
- icache.heads[h] = ie->next;
- trace(TraceLump, "icachealloc hit");
- ie->rac = 1;
- return ie;
- }
- last = ie;
- }
+ if(icachelookup(score, type, ia) >= 0)
+ return 0;
- h = icache.unused;
- if(h < icache.entries){
- ie = &icache.base[h++];
- icache.unused = h;
- trace(TraceLump, "icachealloc unused");
- goto Found;
- }
-
- if((ie = icache.free) != nil){
- icache.free = ie->next;
- goto Found;
- }
-
- h = icache.stolen;
- for(i=0;; i++){
- h++;
- if(h >= icache.size)
- h = 0;
- if(i == icache.size){
- trace(TraceLump, "icachealloc sleep");
- addstat(StatIcacheStall, 1);
- while(icache.ndirty == icache.entries){
- /*
- * This is a bit suspect. Kickicache will wake up the
- * icachewritecoord, but if all the index entries are for
- * unflushed disk blocks, icachewritecoord won't be
- * able to do much. It always rewakes everyone when
- * it thinks it is done, though, so at least we'll go around
- * the while loop again. Also, if icachewritecoord sees
- * that the disk state hasn't change at all since the last
- * time around, it kicks the disk. This needs to be
- * rethought, but it shouldn't deadlock anymore.
- */
- kickicache();
- rsleep(&icache.full);
- }
- addstat(StatIcacheStall, -1);
- i = 0;
- }
- lastclean = nil;
- clean = nil;
- last = nil;
- for(ie=icache.heads[h]; ie; last=ie, ie=ie->next){
- if(!ie->dirty){
- clean = ie;
- lastclean = last;
- }
- }
- if(clean){
- if(lastclean)
- lastclean->next = clean->next;
- else
- icache.heads[h] = clean->next;
- clean->next = nil;
- icache.stolen = h;
- ie = clean;
- trace(TraceLump, "icachealloc steal");
- goto Found;
- }
- }
+ addstat(StatIcacheFill, 1);
+ if(loadientry(mainindex, score, type, &d) < 0)
+ return -1;
+
+ insertscore(score, &d.ia, IEClean, nil);
+ *ia = d.ia;
+ return 0;
+}
-Found:
- ie->ia = *ia;
- scorecp(ie->score, score);
- ie->rac = 0;
- return ie;
+/*
+ * Timed front end for lookupscore_untimed: counts the call in
+ * StatIcacheRead and adds the msec() difference across the call
+ * to StatIcacheReadTime.  Returns lookupscore_untimed's result.
+ */
+int
+lookupscore(u8int score[VtScoreSize], int type, IAddr *ia)
+{
+	int t0, elapsed, found;
+
+	t0 = msec();
+	found = lookupscore_untimed(score, type, ia);
+	elapsed = msec() - t0;
+	addstat2(StatIcacheRead, 1, StatIcacheReadTime, elapsed);
+	return found;
}
+
+/*
+ * Return the top `bits` bits of the first four score bytes,
+ * read as a big-endian 32-bit number.  With bits >= 32 the
+ * whole 32-bit value is returned; with bits <= 0 the result is 0.
+ */
+u32int
+hashbits(u8int *sc, int bits)
+{
+	u32int v;
+
+	/* widen first: a u8int promotes to signed int, and <<24 can overflow it */
+	v = ((u32int)sc[0] << 24) | ((u32int)sc[1] << 16) | ((u32int)sc[2] << 8) | sc[3];
+	if(bits <= 0)
+		return 0;	/* avoid shifting a 32-bit value by 32 or more */
+	if(bits < 32)
+		v >>= (32 - bits);
+	return v;
+}
+
+/*
+ * Dirty share of the index cache, scaled so that a cache whose
+ * entries are all dirty yields IcacheFrac.
+ * Reads icache.ndirty and icache.nentries without taking icache.lock.
+ */
+ulong
+icachedirtyfrac(void)
+{
+	vlong scaled;
+
+	/* do the multiply in vlong, as the product may not fit a narrower type */
+	scaled = (vlong)icache.ndirty * IcacheFrac;
+	return scaled / icache.nentries;
+}
+
+/*
+ * Return a singly-linked list of dirty index entries
+ * with 32-bit hash numbers between lo and hi
+ * and address < limit.
+ */
IEntry*
icachedirty(u32int lo, u32int hi, u64int limit)
{
- int i;
u32int h;
IEntry *ie, *dirty;
dirty = nil;
trace(TraceProc, "icachedirty enter");
qlock(&icache.lock);
- for(i=0; i<icache.size; i++)
- for(ie = icache.heads[i]; ie; ie = ie->next)
- if(ie->dirty && ie->ia.addr != 0 && ie->ia.addr < limit){
+ for(ie = icache.dirty.next; ie != &icache.dirty; ie=ie->next){
+ if(ie->state == IEDirty && ie->ia.addr < limit){
h = hashbits(ie->score, 32);
if(lo <= h && h <= hi){
ie->nextdirty = dirty;
dirty = ie;
}
}
+ }
qunlock(&icache.lock);
trace(TraceProc, "icachedirty exit");
if(dirty == nil)
@@ -366,36 +515,59 @@
return dirty;
}
+/*
+ * Return a copy of icache.as, taken while holding icache.lock.
+ */
+AState
+icachestate(void)
+{
+	AState snapshot;
+
+	qlock(&icache.lock);
+	snapshot = icache.as;
+	qunlock(&icache.lock);
+	return snapshot;
+}
+
+/*
+ * The singly-linked non-circular list of index entries ie
+ * has been written to disk. Move them to the clean list.
+ */
void
icacheclean(IEntry *ie)
{
- trace(TraceProc, "icachedirty enter");
+ IEntry *next;
+
+ trace(TraceProc, "icacheclean enter");
qlock(&icache.lock);
- for(; ie; ie=ie->nextdirty){
+ for(; ie; ie=next){
+ assert(ie->state == IEDirty);
+ next = ie->nextdirty;
+ ie->nextdirty = nil;
+ popout(ie); /* from icache.dirty */
icache.ndirty--;
- ie->dirty = 0;
+ ie->state = IEClean;
+ pushfirst(&icache.clean, ie);
}
setstat(StatIcacheDirty, icache.ndirty);
rwakeupall(&icache.full);
qunlock(&icache.lock);
- trace(TraceProc, "icachedirty exit");
+ trace(TraceProc, "icacheclean exit");
}
void
emptyicache(void)
{
int i;
- IEntry *ie, **lie;
+ IEntry *ie;
+ ISum *s;
qlock(&icache.lock);
- for(i=0; idirty == 0){
- *lie = ie->next;
- ie->next = icache.free;
- icache.free = ie;
- }else
- lie = &ie->next;
+ while((ie = evictlru()) != nil)
+ pushfirst(&icache.free, ie);
+ for(i=0; ilock);
+ sumclear(s);
+ qunlock(&s->lock);
}
qunlock(&icache.lock);
}
+
--- /sys/src/cmd/venti/srv/httpd.c Mon Oct 1 03:36:01 2007
+++ /sys/src/cmd/venti/srv/httpd.c Mon Oct 1 03:36:00 2007
@@ -565,11 +565,11 @@
if(scorecmp(zeroscore, arena->score) != 0)
hprint(hout, "\tscore=%V\n", arena->score);
- hprint(hout, "\tmem: clumps=%d compressed clumps=%d data=%,lld compressed data=%,lld storage=%,lld\n",
+ hprint(hout, "\twritten: clumps=%d compressed clumps=%d data=%,lld compressed data=%,lld storage=%,lld\n",
arena->memstats.clumps, arena->memstats.cclumps, arena->memstats.uncsize,
arena->memstats.used - arena->memstats.clumps * ClumpSize,
arena->memstats.used + arena->memstats.clumps * ClumpInfoSize);
- hprint(hout, "\tdisk: clumps=%d compressed clumps=%d data=%,lld compressed data=%,lld storage=%,lld\n",
+ hprint(hout, "\tindexed: clumps=%d compressed clumps=%d data=%,lld compressed data=%,lld storage=%,lld\n",
arena->diskstats.clumps, arena->diskstats.cclumps, arena->diskstats.uncsize,
arena->diskstats.used - arena->diskstats.clumps * ClumpSize,
arena->diskstats.used + arena->diskstats.clumps * ClumpInfoSize);
@@ -895,7 +895,7 @@
"icachehit",
"icachemiss",
- "icachelookup",
+ "icacheread",
"icachewrite",
"icachefill",
"icacheprefetch",
@@ -904,6 +904,9 @@
"icacheflush",
"icachestall",
"icachelookuptime",
+ "icachelookup",
+ "scachehit",
+ "scacheprefetch",
"bloomhit",
"bloommiss",
@@ -925,6 +928,9 @@
"sumread",
"sumreadbyte",
+
+ "cigload",
+ "cigloadtime",
};
static int
--- /sys/src/cmd/venti/srv/checkarenas.c Mon Oct 1 03:36:01 2007
+++ /sys/src/cmd/venti/srv/checkarenas.c Mon Oct 1 03:36:01 2007
@@ -24,7 +24,7 @@
err = 0;
for(;;){
- e = syncarena(arena, 0, 1000, 0, fix);
+ e = syncarena(arena, 1000, 0, fix);
err |= e;
if(!(e & SyncHeader))
break;
--- /sys/src/cmd/venti/srv/syncarena.c Mon Oct 1 03:36:02 2007
+++ /sys/src/cmd/venti/srv/syncarena.c Mon Oct 1 03:36:02 2007
@@ -25,7 +25,7 @@
* returns 0 if ok, flags if error occurred
*/
int
-syncarena(Arena *arena, u64int start, u32int n, int zok, int fix)
+syncarena(Arena *arena, u32int n, int zok, int fix)
{
ZBlock *lump;
Clump cl;
@@ -53,7 +53,7 @@
fprint(2, "%s: illegal clump magic number=%#8.8ux at clump=%d\n", arena->name, magic, clump);
/* err |= SyncDataErr; */
if(fix && writeclumpmagic(arena, aa, ClumpFreeMagic) < 0){
- fprint(2, "can't write corrected clump free magic: %r");
+ fprint(2, "%s: can't write corrected clump free magic: %r", arena->name);
err |= SyncFixErr;
}
break;
@@ -136,9 +136,8 @@
|| cclumps != arena->memstats.cclumps
|| uncsize != arena->memstats.uncsize){
err |= SyncHeader;
- fprint(2, "arena %s: start=%lld fix=%d flush=%d %lld->%lld %ud->%ud %ud->%ud %lld->%lld\n",
+ fprint(2, "arena %s: fix=%d flush=%d %lld->%lld %ud->%ud %ud->%ud %lld->%lld\n",
arena->name,
- start,
fix,
flush,
used, arena->memstats.used,
--- /sys/src/cmd/venti/srv/buildindex.c Mon Oct 1 03:36:03 2007
+++ /sys/src/cmd/venti/srv/buildindex.c Mon Oct 1 03:36:02 2007
@@ -36,18 +36,19 @@
void
usage(void)
{
- fprint(2, "usage: buildindex [-bd] [-i isect]... [-M imem] venti.conf\n");
+ fprint(2, "usage: buildindex [-b] [-i isect]... [-M imem] venti.conf\n");
threadexitsall("usage");
}
void
threadmain(int argc, char *argv[])
{
- int fd, i, napart;
+ int fd, i, napart, nfinish, maxdisks;
u32int bcmem, imem;
Config conf;
Part *p;
+ maxdisks = 100000;
ventifmtinstall();
imem = 256*1024*1024;
ARGBEGIN{
@@ -64,6 +65,9 @@
case 'M':
imem = unittoull(EARGF(usage()));
break;
+ case 'm': /* temporary - might go away */
+ maxdisks = atoi(EARGF(usage()));
+ break;
default:
usage();
break;
@@ -132,17 +136,21 @@
/* start arena procs */
p = nil;
napart = 0;
+ nfinish = 0;
arenadonechan = chancreate(sizeof(void*), 0);
for(i=0; i<ix->narenas; i++){
if(ix->arenas[i]->part != p){
p = ix->arenas[i]->part;
vtproc(arenapartproc, p);
- napart++;
+ if(++napart >= maxdisks){
+ recvp(arenadonechan);
+ nfinish++;
+ }
}
}
/* wait for arena procs to finish */
- for(i=0; imemstats.clumps)
fprint(2, "%T arena %s: %d entries\n",
a->name, a->memstats.clumps);
- addr = ix->amap[i].start;
- for(clump=0; clump<a->memstats.clumps; clump+=n){
+ /*
+ * Running the loop backwards accesses the
+ * clump info blocks forwards, since they are
+ * stored in reverse order at the end of the arena.
+ * This speeds things slightly.
+ */
+ addr = ix->amap[i].start + a->memstats.used;
+ for(clump=a->memstats.clumps; clump > 0; clump-=n){
n = ClumpChunks;
- if(n > a->memstats.clumps - clump)
- n = a->memstats.clumps - clump;
- if(readclumpinfos(a, clump, cis, n) != n){
+ if(n > clump)
+ n = clump;
+ if(readclumpinfos(a, clump-n, cis, n) != n){
fprint(2, "%T arena %s: directory read: %r\n", a->name);
errors = 1;
break;
}
- for(j=0; j<n; j++){
+ for(j=n-1; j>=0; j--){
ci = &cis[j];
ie.ia.type = ci->type;
ie.ia.size = ci->uncsize;
+ addr -= ci->size + ClumpSize;
ie.ia.addr = addr;
- addr += ci->size + ClumpSize;
ie.ia.blocks = (ci->size + ClumpSize + (1<<ABlockLog) - 1) >> ABlockLog;
scorecp(ie.score, ci->score);
if(ci->type == VtCorruptType)
@@ -253,6 +267,8 @@
}
}
}
+ if(addr != ix->amap[i].start)
+ fprint(2, "%T arena %s: clump miscalculation %lld != %lld\n", a->name, addr, ix->amap[i].start);
}
add(&arenaentries, tot);
add(&skipentries, nskip);
--- /sys/src/cmd/venti/srv/venti.c Mon Oct 1 03:36:04 2007
+++ /sys/src/cmd/venti/srv/venti.c Mon Oct 1 03:36:03 2007
@@ -106,9 +106,6 @@
if(configfile == nil)
configfile = "venti.conf";
- if(initarenasum() < 0)
- fprint(2, "warning: can't initialize arena summing process: %r");
-
fprint(2, "conf...");
if(initventi(configfile, &config) < 0)
sysfatal("can't init server: %r");
@@ -146,13 +143,7 @@
mem, mem / (8 * 1024));
initlumpcache(mem, mem / (8 * 1024));
- icmem = u64log2(icmem / (sizeof(IEntry)+sizeof(IEntry*)) / ICacheDepth);
- if(icmem < 4)
- icmem = 4;
- if(0) fprint(2, "initialize %d bytes of index cache for %d index entries\n",
- (sizeof(IEntry)+sizeof(IEntry*)) * (1 << icmem) * ICacheDepth,
- (1 << icmem) * ICacheDepth);
- initicache(icmem, ICacheDepth);
+ initicache(icmem);
initicachewrite();
/*
@@ -170,7 +161,7 @@
startbloomproc(mainindex->bloom);
fprint(2, "sync...");
- if(!readonly && syncindex(mainindex, 1, 0, 0) < 0)
+ if(!readonly && syncindex(mainindex) < 0)
sysfatal("can't sync server: %r");
if(!readonly && queuewrites){
@@ -181,6 +172,9 @@
queuewrites = 0;
}
}
+
+ if(initarenasum() < 0)
+ fprint(2, "warning: can't initialize arena summing process: %r");
fprint(2, "announce %s...", vaddr);
ventisrv = vtlisten(vaddr);
--- /sys/src/cmd/venti/srv/fns.h Mon Oct 1 03:36:04 2007
+++ /sys/src/cmd/venti/srv/fns.h Mon Oct 1 03:36:04 2007
@@ -6,8 +6,11 @@
void addstat2(int, int, int, int);
ZBlock *alloczblock(u32int size, int zeroed, uint alignment);
Arena *amapitoa(Index *index, u64int a, u64int *aa);
+Arena *amapitoag(Index *index, u64int a, u64int *gstart, u64int *glimit, int *g);
u64int arenadirsize(Arena *arena, u32int clumps);
+int arenatog(Arena *arena, u64int aa, u64int *gstart, u64int *glimit, int *g);
void arenaupdate(Arena *arena, u32int size, u8int *score);
+int asumload(Arena *arena, int g, IEntry *entries, int maxentries);
void backsumarena(Arena *arena);
void binstats(long (*fn)(Stats *s0, Stats *s1, void*), void *arg, long t0, long t1, Statbin *bin, int nbin);
int bloominit(Bloom*, vlong, uchar*);
@@ -26,7 +29,6 @@
void dirtydblock(DBlock*, int);
void diskaccess(int);
void disksched(void);
-AState diskstate(void);
void *emalloc(ulong);
void emptydcache(void);
void emptyicache(void);
@@ -64,6 +66,8 @@
IEntry* icachedirty(u32int, u32int, u64int);
ulong icachedirtyfrac(void);
void icacheclean(IEntry*);
+int icachelookup(u8int *score, int type, IAddr *ia);
+AState icachestate(void);
int ientrycmp(const void *vie1, const void *vie2);
char *ifileline(IFile *f);
int ifilename(IFile *f, char *dst);
@@ -76,7 +80,7 @@
int initarenasum(void);
void initbloomfilter(Index*);
void initdcache(u32int mem);
-void initicache(int bits, int depth);
+void initicache(u32int mem);
void initicachewrite(void);
IEStream *initiestream(Part *part, u64int off, u64int clumps, u32int size);
ISect *initisect(Part *part);
@@ -87,7 +91,7 @@
void initround(Round*, char*, int);
int initventi(char *config, Config *conf);
void insertlump(Lump *lump, Packet *p);
-int insertscore(u8int *score, IAddr *ia, int write);
+int insertscore(u8int *score, IAddr *ia, int state, AState *as);
void kickdcache(void);
void kickicache(void);
void kickround(Round*, int wait);
@@ -97,14 +101,14 @@
int loadientry(Index *index, u8int *score, int type, IEntry *ie);
void logerr(int severity, char *fmt, ...);
Lump *lookuplump(u8int *score, int type);
-int _lookupscore(u8int *score, int type, IAddr *ia, int *rac);
-int lookupscore(u8int *score, int type, IAddr *ia, int *rac);
+int lookupscore(u8int *score, int type, IAddr *ia);
int maparenas(AMap *am, Arena **arenas, int n, char *what);
void markbloomfilter(Bloom*, u8int*);
uint msec(void);
int namecmp(char *s, char *t);
void namecp(char *dst, char *src);
int nameok(char *name);
+void needmainindex(void);
void needzeroscore(void);
Arena *newarena(Part *part, u32int, char *name, u64int base, u64int size, u32int blocksize);
ArenaPart *newarenapart(Part *part, u32int blocksize, u32int tabsize);
@@ -152,7 +156,6 @@
int scorecmp(u8int *, u8int *);
void scoremem(u8int *score, u8int *buf, int size);
void setatailstate(AState*);
-void setdcachestate(AState*);
void seterr(int severity, char *fmt, ...);
void setstat(int, long);
void settrace(char *type);
@@ -166,9 +169,8 @@
int stru32int(char *s, u32int *r);
int stru64int(char *s, u64int *r);
void sumarena(Arena *arena);
-int syncarena(Arena *arena, u64int start, u32int n, int zok, int fix);
-int syncarenaindex(Index *ix, Arena *arena, u32int clump, u64int a, int fix, int *pflush, int check);
-int syncindex(Index *ix, int fix, int mustflushicache, int check);
+int syncarena(Arena *arena, u32int n, int zok, int fix);
+int syncindex(Index *ix);
void trace(char *type, char*, ...);
void traceinit(void);
int u64log2(u64int v);
@@ -197,12 +199,12 @@
int wbisect(ISect *is);
int wbindex(Index *ix);
int whackblock(u8int *dst, u8int *src, int ssize);
-u64int writeaclump(Arena *a, Clump *c, u8int *clbuf, u64int, u64int*);
+u64int writeaclump(Arena *a, Clump *c, u8int *clbuf);
u32int writearena(Arena *arena, u64int aa, u8int *clbuf, u32int n);
int writebloom(Bloom*);
int writeclumpinfo(Arena *arean, int clump, ClumpInfo *ci);
int writepng(Hio*, Memimage*);
-u64int writeiclump(Index *ix, Clump *c, u8int *clbuf, u64int*);
+u64int writeiclump(Index *ix, Clump *c, u8int *clbuf);
int writelump(Packet *p, u8int *score, int type, u32int creator, uint ms);
int writepart(Part *part, u64int addr, u8int *buf, u32int n);
int writeqlump(Lump *u, Packet *p, int creator, uint ms);
--- /sys/src/cmd/venti/srv/syncindex.c Mon Oct 1 03:36:05 2007
+++ /sys/src/cmd/venti/srv/syncindex.c Mon Oct 1 03:36:04 2007
@@ -6,7 +6,7 @@
void
usage(void)
{
- fprint(2, "usage: syncindex [-fv] [-B blockcachesize] config\n");
+ fprint(2, "usage: syncindex [-v] [-B blockcachesize] config\n");
threadexitsall("usage");
}
@@ -16,9 +16,7 @@
threadmain(int argc, char *argv[])
{
u32int bcmem, icmem;
- int fix;
- fix = 0;
bcmem = 0;
icmem = 0;
ARGBEGIN{
@@ -28,9 +26,6 @@
case 'I':
icmem = unittoull(EARGF(usage()));
break;
- case 'f':
- fix++;
- break;
case 'v':
verbose++;
break;
@@ -39,9 +34,6 @@
break;
}ARGEND
- if(!fix)
- readonly = 1;
-
if(argc != 1)
usage();
@@ -56,21 +48,17 @@
if(0) fprint(2, "initialize %d bytes of disk block cache\n", bcmem);
initdcache(bcmem);
initlumpcache(1*1024*1024, 1024/8);
- icmem = u64log2(icmem / (sizeof(IEntry)+sizeof(IEntry*)) / ICacheDepth);
- if(icmem < 4)
- icmem = 4;
- if(1) fprint(2, "initialize %d bytes of index cache for %d index entries\n",
- (sizeof(IEntry)+sizeof(IEntry*)) * (1 << icmem) * ICacheDepth,
- (1 << icmem) * ICacheDepth);
- initicache(icmem, ICacheDepth);
+ initicache(icmem);
initicachewrite();
if(mainindex->bloom)
startbloomproc(mainindex->bloom);
if(verbose)
printindex(2, mainindex);
- if(syncindex(mainindex, fix, 1, 0) < 0)
+ if(syncindex(mainindex) < 0)
sysfatal("failed to sync index=%s: %r\n", mainindex->name);
+ flushicache();
+ flushdcache();
threadexitsall(0);
}
--- /sys/src/cmd/venti/srv/syncindex0.c Mon Oct 1 03:36:05 2007
+++ /sys/src/cmd/venti/srv/syncindex0.c Mon Oct 1 03:36:05 2007
@@ -2,184 +2,92 @@
#include "dat.h"
#include "fns.h"
-enum
+static int
+syncarenaindex(Arena *arena, u64int a0)
{
- ClumpChunks = 32*1024
-};
-
-static int missing, wrong;
-
-/*
- * shell sort is plenty good enough
- * because we're going to do a bunch of disk i/o's
- */
-static void
-sortclumpinfo(ClumpInfo *ci, int *s, int n)
-{
- int i, j, m, t;
-
- for(m = (n + 3) / 5; m > 0; m = (m + 1) / 3){
- for(i = n - m; i-- > 0;){
- for(j = i + m; j < n; j += m){
- if(memcmp(ci[s[j - m]].score, ci[s[j]].score, VtScoreSize) <= 0)
- break;
- t = s[j];
- s[j] = s[j - m];
- s[j - m] = t;
- }
- }
- }
-}
-
-int
-syncarenaindex(Index *ix, Arena *arena, u32int clump, u64int a, int fix, int *pflush, int check)
-{
- Packet *pack;
- IEntry ie;
+ int ok;
+ u32int clump;
+ u64int a;
+ ClumpInfo ci;
IAddr ia;
- ClumpInfo *ci, *cis;
- u64int *addrs;
- int i, n, ok, *s, flush;
-
- trace(TraceProc, "syncarenaindex enter");
-
- flush = 0;
- cis = MKN(ClumpInfo, ClumpChunks);
- addrs = MKN(u64int, ClumpChunks);
- s = MKN(int, ClumpChunks);
+ AState as;
+
+ if(arena->diskstats.clumps == arena->memstats.clumps)
+ return 0;
+
+ memset(&as, 0, sizeof as);
+ as.arena = arena;
+ as.stats = arena->diskstats;
+
ok = 0;
- for(; clump < arena->memstats.clumps; clump += n){
- n = ClumpChunks;
- if(n > arena->memstats.clumps - clump)
- n = arena->memstats.clumps - clump;
- n = readclumpinfos(arena, clump, cis, n);
- if(n <= 0){
- fprint(2, "arena directory read failed\n");
+ a = a0 + arena->diskstats.used;
+ for(clump=arena->diskstats.clumps; clump < arena->memstats.clumps; clump++){
+ if(readclumpinfo(arena, clump, &ci) < 0){
+ fprint(2, "%s: clump %d: cannot read clumpinfo\n",
+ arena->name, clump);
ok = -1;
break;
}
- for(i = 0; i < n; i++){
- addrs[i] = a;
- a += cis[i].size + ClumpSize;
- s[i] = i;
- }
-
- sortclumpinfo(cis, s, n);
-
- for(i = 0; i < n; i++){
- ci = &cis[s[i]];
- ia.type = ci->type;
- ia.size = ci->uncsize;
- ia.addr = addrs[s[i]];
- ia.blocks = (ci->size + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog;
-
- if(!check)
- goto Add;
- if(loadientry(ix, ci->score, ci->type, &ie) < 0){
- trace(TraceProc, "syncarenaindex missing block %V.%d", ci->score, ci->type);
- missing++;
- if(0) fprint(2, "missing block type=%d score=%V\n", ci->type, ci->score);
- }else if(iaddrcmp(&ia, &ie.ia) != 0){
- trace(TraceProc, "syncarenaindex mismatched entry");
- fprint(2, "\nmismatched index entry and clump at %d\n", clump + i);
- fprint(2, "\tclump: type=%d size=%d blocks=%d addr=%lld\n", ia.type, ia.size, ia.blocks, ia.addr);
- fprint(2, "\tindex: type=%d size=%d block=%d addr=%lld\n", ie.ia.type, ie.ia.size, ie.ia.blocks, ie.ia.addr);
- pack = readlump(ie.score, ie.ia.type, ie.ia.size, nil);
- packetfree(pack);
- if(pack != nil){
- fprint(2, "duplicated lump\n");
- continue;
- }
- wrong++;
- }else
- continue;
- Add:
- if(!fix){
- ok = -1;
- continue;
- }
- flush = 1;
- trace(TraceProc, "syncarenaindex insert %V", ci->score);
- insertscore(ci->score, &ia, 1);
- }
-
- if(0 && clump / 1000 != (clump + n) / 1000)
- fprint(2, ".");
- }
- free(cis);
- free(addrs);
- free(s);
- if(flush){
- flushdcache();
- *pflush = 1;
+ ia.type = ci.type;
+ ia.size = ci.uncsize;
+ ia.addr = a;
+ ia.blocks = (ClumpSize + ci.size + (1 << ABlockLog) - 1) >> ABlockLog;
+ a += ClumpSize + ci.size;
+
+ as.stats.used += ClumpSize + ci.size;
+ as.stats.uncsize += ia.size;
+ as.stats.clumps++;
+ if(ci.uncsize > ci.size)
+ as.stats.cclumps++;
+ as.aa = a;
+ insertscore(ci.score, &ia, IEDirty, &as);
}
+ flushdcache();
return ok;
}
int
-syncindex(Index *ix, int fix, int mustflush, int check)
+syncindex(Index *ix)
{
Arena *arena;
- AState as;
- u64int a;
- int i, e, e1, ok, ok1, flush;
+ int i, e, e1, ok;
ok = 0;
- flush = 0;
for(i = 0; i < ix->narenas; i++){
trace(TraceProc, "syncindex start %d", i);
arena = ix->arenas[i];
- /*
- * Syncarena will scan through the arena looking for blocks
- * that have been forgotten. It will update arena->memstats.used,
- * so save the currenct copy as the place to start the
- * syncarenaindex scan.
- */
- a = arena->memstats.used;
- e = syncarena(arena, ix->amap[i].start, TWID32, fix, fix);
+ e = syncarena(arena, TWID32, 1, 1);
e1 = e;
- if(fix)
- e1 &= ~(SyncHeader|SyncCIZero|SyncCIErr);
- if(e1 == SyncHeader)
+ e1 &= ~(SyncHeader|SyncCIZero|SyncCIErr);
+ if(e & SyncHeader)
fprint(2, "arena %s: header is out-of-date\n", arena->name);
- if(e1)
+ if(e1){
+ fprint(2, "arena %s: %x\n", arena->name, e1);
ok = -1;
- else{
- /*
- * use diskstats not memstats here, because diskstats
- * is what has been indexed; memstats is what has
- * made it to disk (confusing names).
- */
- ok1 = syncarenaindex(ix, arena,
- arena->diskstats.clumps,
- ix->amap[i].start + arena->diskstats.used,
- fix, &flush, check);
- if(ok1 < 0)
- fprint(2, "syncarenaindex: %r\n");
- if(fix && ok1==0 && (e & SyncHeader) && wbarena(arena) < 0)
- fprint(2, "arena=%s header write failed: %r\n", arena->name);
- ok |= ok1;
-
- as.arena = arena;
- as.aa = ix->amap[i].start + arena->memstats.used;
- as.stats = arena->memstats;
- setdcachestate(&as);
+ continue;
+ }
+ flushdcache();
+
+ if(arena->memstats.clumps == arena->diskstats.clumps)
+ continue;
+
+ fprint(2, "%T %s: indexing %d clumps...\n",
+ arena->name,
+ arena->memstats.clumps - arena->diskstats.clumps);
+
+ if(syncarenaindex(arena, ix->amap[i].start) < 0){
+ fprint(2, "arena %s: syncarenaindex: %r\n", arena->name);
+ ok = -1;
+ continue;
+ }
+ if(wbarena(arena) < 0){
+ fprint(2, "arena %s: wbarena: %r\n", arena->name);
+ ok = -1;
+ continue;
}
- }
- if(missing || wrong)
- fprint(2, "syncindex: %d missing entries, %d wrong entries (flush=%d)\n", missing, wrong, flush);
- if(fix && wbindex(ix) < 0){
- fprint(2, "can't write back index header for %s: %r\n", ix->name);
- return -1;
- }
- if(fix && flush){
flushdcache();
- if(mustflush){
- flushicache();
- flushdcache();
- }else
- kickicache();
+ delaykickicache();
}
return ok;
}
--- /sys/src/cmd/venti/srv/icachewrite.c Mon Oct 1 03:36:06 2007
+++ /sys/src/cmd/venti/srv/icachewrite.c Mon Oct 1 03:36:05 2007
@@ -12,7 +12,7 @@
static IEntry *iesort(IEntry*);
int icachesleeptime = 1000; /* milliseconds */
-int minicachesleeptime = 50;
+int minicachesleeptime = 0;
enum
{
@@ -85,7 +85,7 @@
static int
icachewritesect(Index *ix, ISect *is, u8int *buf)
{
- int err, h, bsize, t;
+ int err, i, werr, h, bsize, t;
u32int lo, hi;
u64int addr, naddr;
uint nbuf, off;
@@ -115,7 +115,8 @@
}
if(t < minicachesleeptime)
t = minicachesleeptime;
- sleep(t);
+ if(t > 0)
+ sleep(t);
trace(TraceProc, "icachewritesect nextchunk");
chunk = nextchunk(ix, is, &iedirty, &addr, &nbuf);
@@ -169,33 +170,29 @@
break;
}
packibucket(&ib, buf+off, is->bucketmagic);
- /*
- * XXX This is not quite right - it's good that we
- * update the cached block (if any) here, but
- * since the block doesn't get written until writepart
- * below, we also need to make sure that the cache
- * doesn't load the stale block before we write it to
- * disk below. We could lock the disk cache during
- * the writepart, but that's pretty annoying.
- * Another possibility would be never to cache
- * index partition blocks. The hit rate on those is
- * miniscule anyway.
- */
- if((b = _getdblock(is->part, naddr, ORDWR, 0)) != nil){
- memmove(b->data, buf+off, bsize);
- putdblock(b);
- }
}
diskaccess(1);
trace(TraceProc, "icachewritesect writepart", addr, nbuf);
- if(writepart(is->part, addr, buf, nbuf) < 0 ||
- flushpart(is->part) < 0){
+ werr = 0;
+ if(writepart(is->part, addr, buf, nbuf) < 0 || flushpart(is->part) < 0)
+ werr = -1;
+
+ for(i=0; i<nbuf; i+=bsize){
+ if((b = _getdblock(is->part, addr+i, ORDWR, 0)) != nil){
+ memmove(b->data, buf+i, bsize);
+ putdblock(b);
+ }
+ }
+
+ if(werr < 0){
fprint(2, "%s: part %s addr 0x%llux: icachewritesect "
"writepart: %r\n", argv0, is->part->name, addr);
+ err = -1;
continue;
}
+
addstat(StatIsectWriteBytes, nbuf);
addstat(StatIsectWrite, 1);
icacheclean(chunk);
@@ -245,18 +242,20 @@
threadsetname("icachewritecoord");
ix = mainindex;
- iwrite.as = diskstate();
+ iwrite.as = icachestate();
for(;;){
trace(TraceProc, "icachewritecoord sleep");
waitforkick(&iwrite.round);
trace(TraceWork, "start");
- as = diskstate();
+ as = icachestate();
if(as.arena==iwrite.as.arena && as.aa==iwrite.as.aa){
/* will not be able to do anything more than last flush - kick disk */
+ fprint(2, "icache: nothing to do - kick dcache\n");
trace(TraceProc, "icachewritecoord kick dcache");
kickdcache();
trace(TraceProc, "icachewritecoord kicked dcache");
+ goto SkipWork; /* won't do anything; don't bother rewriting bloom filter */
}
iwrite.as = as;
@@ -274,9 +273,11 @@
err |= recvul(ix->bloom->writedonechan);
trace(TraceProc, "icachewritecoord donewrite err=%d", err);
- if(err == 0)
+ if(err == 0){
setatailstate(&iwrite.as);
+ }
}
+ SkipWork:
icacheclean(nil); /* wake up anyone waiting */
trace(TraceWork, "finish");
addstat(StatIcacheFlush, 1);