venti: * stop storing unused wtime, train fields in IEntry. * prefetch arena tocs more gradually; set icacheprefetch=1 * yank out dead lump readahead code * yank out dead dcache readahead code * fix sync bug reported by anothy et al. * new, cleaner icache code mirrorarenas: * fix thread-stack access race (wsync) * new flag -s disables SHA1 checks, useful if destination is remote verifyarena: * accept venti-standard file:range syntax Reference: /n/sources/patch/applied/nventi Date: Mon Oct 1 03:36:25 CES 2007 Signed-off-by: rsc@swtch.com --- /sys/src/cmd/venti/srv/config.c Mon Oct 1 03:35:51 2007 +++ /sys/src/cmd/venti/srv/config.c Mon Oct 1 03:35:50 2007 @@ -245,3 +245,9 @@ return b; } +/* for OS X linker, which only resolves functions, not data */ +void +needmainindex(void) +{ +} + --- /sys/src/cmd/venti/srv/conv.c Mon Oct 1 03:35:51 2007 +++ /sys/src/cmd/venti/srv/conv.c Mon Oct 1 03:35:51 2007 @@ -581,9 +581,9 @@ scorecp(ie->score, p); p += VtScoreSize; - ie->wtime = U32GET(p); + /* ie->wtime = U32GET(p); */ p += U32Size; - ie->train = U16GET(p); + /* ie->train = U16GET(p); */ p += U16Size; if(p - buf != IEntryAddrOff) sysfatal("unpackentry bad IEntryAddrOff amount"); @@ -613,9 +613,9 @@ scorecp(p, ie->score); p += VtScoreSize; - U32PUT(p, ie->wtime); + U32PUT(p, 0); /* wtime */ p += U32Size; - U16PUT(p, ie->train); + U16PUT(p, 0); /* train */ p += U16Size; U64PUT(p, ie->ia.addr, t32); p += U64Size; --- /sys/src/cmd/venti/srv/mirrorarenas.c Mon Oct 1 03:35:52 2007 +++ /sys/src/cmd/venti/srv/mirrorarenas.c Mon Oct 1 03:35:51 2007 @@ -22,13 +22,14 @@ Part *dst; int force; int verbose; +int dosha1 = 1; char *status; uvlong astart, aend; void usage(void) { - fprint(2, "usage: mirrorarenas [-v] src dst [ranges]\n"); + fprint(2, "usage: mirrorarenas [-sv] src dst [ranges]\n"); threadexitsall("usage"); } @@ -92,6 +93,7 @@ * src with writing dst during copy. This is an easy factor of two * (almost) in performance. */ +static Write wsync; static void writeproc(void *v) { @@ -99,7 +101,7 @@ USED(v); while((w = recvp(writechan)) != nil){ - if(w->n == 0) + if(w == &wsync) continue; if(ewritepart(dst, w->o, w->p, w->n) < 0) w->error = 1; @@ -146,11 +148,7 @@ /* * wait for queued write to finish */ - w[i].p = nil; - w[i].o = 0; - w[i].n = 0; - w[i].error = 0; - sendp(writechan, &w[i]); + sendp(writechan, &wsync); i = 1-i; if(w[i].error) return -1; @@ -240,7 +238,7 @@ mirror(Arena *sa, Arena *da) { vlong v, si, di, end; - int clumpmax, blocksize; + int clumpmax, blocksize, sealed; static uchar buf[MaxIoSize]; ArenaHead h; DigestState xds, *ds; @@ -305,7 +303,8 @@ shaoff = 0; ds = nil; - if(sa->diskstats.sealed && scorecmp(sa->score, zeroscore) != 0){ + sealed = sa->diskstats.sealed && scorecmp(sa->score, zeroscore) != 0; + if(sealed && dosha1){ /* start sha1 state with header */ memset(&xds, 0, sizeof xds); ds = &xds; @@ -362,7 +361,7 @@ if(ewritepart(dst, end, buf, blocksize) < 0) return; - if(ds){ + if(sealed){ /* * ... but on the final pass, copy the encoding * of the tail information from the source @@ -375,20 +374,27 @@ if(asha1(dst, shaoff, end, ds) < 0 || copy(end, end+blocksize-VtScoreSize, "tail", ds) < 0) return; - memset(buf, 0, VtScoreSize); - sha1(buf, VtScoreSize, da->score, ds); - if(scorecmp(sa->score, da->score) == 0){ + if(dosha1){ + memset(buf, 0, VtScoreSize); + sha1(buf, VtScoreSize, da->score, ds); + if(scorecmp(sa->score, da->score) == 0){ + if(verbose) + chat("%T %s: %V sealed mirrored\n", sa->name, sa->score); + if(ewritepart(dst, end+blocksize-VtScoreSize, da->score, VtScoreSize) < 0) + return; + }else{ + chat("%T %s: sealing dst: score mismatch: %V vs %V\n", sa->name, sa->score, da->score); + memset(&xds, 0, sizeof xds); + asha1(dst, base-blocksize, end+blocksize-VtScoreSize, &xds); + sha1(buf, VtScoreSize, 0, &xds); + chat("%T reseal: %V\n", da->score); + status = "errors"; + } + }else{ if(verbose) - chat("%T %s: %V sealed mirrored\n", sa->name, sa->score); - if(ewritepart(dst, end+blocksize-VtScoreSize, da->score, VtScoreSize) < 0) + chat("%T %s: %V mirrored\n", sa->name, sa->score); + if(ewritepart(dst, end+blocksize-VtScoreSize, sa->score, VtScoreSize) < 0) return; - }else{ - chat("%T %s: sealing dst: score mismatch: %V vs %V\n", sa->name, sa->score, da->score); - memset(&xds, 0, sizeof xds); - asha1(dst, base-blocksize, end+blocksize-VtScoreSize, &xds); - sha1(buf, VtScoreSize, 0, &xds); - chat("%T reseal: %V\n", da->score); - status = "errors"; } }else{ chat("%T %s: %,lld used mirrored\n", @@ -461,6 +467,9 @@ break; case 'v': verbose++; + break; + case 's': + dosha1 = 0; break; default: usage(); --- /sys/src/cmd/venti/srv/verifyarena.c Mon Oct 1 03:35:52 2007 +++ /sys/src/cmd/venti/srv/verifyarena.c Mon Oct 1 03:35:52 2007 @@ -7,6 +7,7 @@ static uchar *data; static int blocksize; static int sleepms; +static vlong offset0; void usage(void) @@ -22,7 +23,7 @@ for(nr = 0; nr < n; nr += m){ m = n - nr; - m = pread(fd, &buf[nr], m, off+nr); + m = pread(fd, &buf[nr], m, offset0+off+nr); if(m <= 0){ if(m == 0) werrstr("early eof"); @@ -175,7 +176,8 @@ char *p, *q, *table, *f[10], line[256]; vlong start, stop; ArenaPart ap; - + Part *part; + needzeroscore(); ventifmtinstall(); blocksize = MaxIoSize; @@ -201,8 +203,10 @@ threadexitsall(nil); } - if((fd = open(argv[0], OREAD)) < 0) - sysfatal("open %s: %r", argv[0]); + if((part = initpart(argv[0], OREAD)) == nil) + sysfatal("open partition %s: %r", argv[0]); + fd = part->fd; + offset0 = part->offset; if(preadblock(data, 8192, PartBlank) < 0) sysfatal("read arena part header: %r"); @@ -249,7 +253,7 @@ fprint(2, "%T %s: bad start,stop %lld,%lld\n", f[0], stop, start); continue; } - if(seek(fd, start, 0) < 0) + if(seek(fd, offset0+start, 0) < 0) fprint(2, "%T %s: seek to start: %r\n", f[0]); verifyarena(f[0], stop - start); } --- /sys/src/cmd/venti/srv/hdisk.c Mon Oct 1 03:35:54 2007 +++ /sys/src/cmd/venti/srv/hdisk.c Mon Oct 1 03:35:53 2007 @@ -547,7 +547,7 @@ Lump *u; IAddr ia; IEntry ie; - int i, rac; + int i; Arena *arena; u64int aa; ZBlock *zb; @@ -561,7 +561,7 @@ } hprint(&c->hout, "
\n", score); - if(_lookupscore(score, -1, &ia, nil) < 0) + if(icachelookup(score, -1, &ia) < 0) hprint(&c->hout, " icache: not found\n"); else hprint(&c->hout, " icache: addr=%#llx size=%d type=%d blocks=%d\n", @@ -585,12 +585,12 @@ hprint(&c->hout, " -cache"); putlump(u); - if(lookupscore(score, type, &ia, &rac) < 0){ + if(lookupscore(score, type, &ia) < 0){ hprint(&c->hout, " -lookup\n"); continue; } - hprint(&c->hout, "\n lookupscore: addr=%#llx size=%d blocks=%d rac=%d\n", - ia.addr, ia.size, ia.blocks, rac); + hprint(&c->hout, "\n lookupscore: addr=%#llx size=%d blocks=%d\n", + ia.addr, ia.size, ia.blocks); arena = amapitoa(mainindex, ia.addr, &aa); if(arena == nil){ --- /sys/src/cmd/venti/srv/stats.c Mon Oct 1 03:35:54 2007 +++ /sys/src/cmd/venti/srv/stats.c Mon Oct 1 03:35:54 2007 @@ -60,6 +60,9 @@ { "index cache flushes", }, { "index cache stalls", }, { "index cache read time", }, + { "index cache lookups" }, + { "index cache summary hits" }, + { "index cache summary prefetches" }, { "bloom filter hits", }, { "bloom filter misses", }, @@ -81,6 +84,9 @@ { "sum reads", }, { "sum read bytes", }, + + { "cig loads" }, + { "cig load time" }, }; QLock statslock; --- /sys/src/cmd/venti/srv/lump.c Mon Oct 1 03:35:55 2007 +++ /sys/src/cmd/venti/srv/lump.c Mon Oct 1 03:35:54 2007 @@ -7,7 +7,7 @@ int writestodevnull = 0; int verifywrites = 0; -static Packet *readilump(Lump *u, IAddr *ia, u8int *score, int rac); +static Packet *readilump(Lump *u, IAddr *ia, u8int *score); /* * Some of this logic is duplicated in hdisk.c @@ -19,7 +19,6 @@ Packet *p; IAddr ia; u32int n; - int rac; trace(TraceLump, "readlump enter"); /* @@ -49,7 +48,7 @@ if(cached) *cached = 0; - if(lookupscore(score, type, &ia, &rac) < 0){ + if(lookupscore(score, type, &ia) < 0){ /* ZZZ place to check for someone trying to guess scores */ seterr(EOk, "no block with score %V/%d exists", score, type); @@ -64,7 +63,7 @@ } trace(TraceLump, "readlump readilump"); - p = readilump(u, &ia, score, rac); + p = readilump(u, &ia, score); putlump(u); trace(TraceLump, "readlump exit"); @@ -134,9 +133,8 @@ Packet *old; IAddr ia; int ok; - int rac; - if(lookupscore(u->score, u->type, &ia, &rac) == 0){ + if(lookupscore(u->score, u->type, &ia) == 0){ if(verifywrites == 0){ /* assume the data is here! */ packetfree(p); @@ -149,7 +147,7 @@ * if the read fails, * assume it was corrupted data and store the block again */ - old = readilump(u, &ia, u->score, rac); + old = readilump(u, &ia, u->score); if(old != nil){ ok = 0; if(packetcmp(p, old) != 0){ @@ -176,8 +174,6 @@ ok = storeclump(mainindex, flat, u->score, u->type, creator, &ia); freezblock(flat); if(ok == 0) - ok = insertscore(u->score, &ia, 1); - if(ok == 0) insertlump(u, p); else packetfree(p); @@ -193,39 +189,14 @@ return ok; } -static void -lreadahead(u64int a, Arena *arena, u64int aa, int n) -{ - u8int buf[ClumpSize]; - Clump cl; - IAddr ia; - - while(n > 0) { - if (aa >= arena->memstats.used) - break; - if(readarena(arena, aa, buf, ClumpSize) < ClumpSize) - break; - if(unpackclump(&cl, buf, arena->clumpmagic) < 0) - break; - ia.addr = a; - ia.type = cl.info.type; - ia.size = cl.info.uncsize; - ia.blocks = (cl.info.size + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog; - insertscore(cl.info.score, &ia, 0); - a += ClumpSize + cl.info.size; - aa += ClumpSize + cl.info.size; - n--; - } -} - static Packet* -readilump(Lump *u, IAddr *ia, u8int *score, int rac) +readilump(Lump *u, IAddr *ia, u8int *score) { Arena *arena; ZBlock *zb; Packet *p, *pp; Clump cl; - u64int a, aa; + u64int aa; u8int sc[VtScoreSize]; trace(TraceLump, "readilump enter"); @@ -256,13 +227,6 @@ seterr(ECrash, "score mismatch"); freezblock(zb); return nil; - } - - if(rac == 0) { - trace(TraceLump, "readilump readahead"); - a = ia->addr + ClumpSize + cl.info.size; - aa += ClumpSize + cl.info.size; - lreadahead(a, arena, aa, 20); } trace(TraceLump, "readilump success"); --- /sys/src/cmd/venti/srv/clump.c Mon Oct 1 03:35:56 2007 +++ /sys/src/cmd/venti/srv/clump.c Mon Oct 1 03:35:55 2007 @@ -62,18 +62,16 @@ memset(cb->data+ClumpSize+dsize, 0, 4); cl.info.size = dsize; - ia->addr = 0; - ia->type = type; - ia->size = size; - ia->blocks = (dsize + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog; - - a = writeiclump(ix, &cl, cb->data, &ia->addr); - + a = writeiclump(ix, &cl, cb->data); trace(TraceLump, "storeclump exit %lld", a); - freezblock(cb); if(a == TWID64) return -1; + + ia->addr = a; + ia->type = type; + ia->size = size; + ia->blocks = (dsize + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog; /* qlock(&stats.lock); --- /sys/src/cmd/venti/srv/arena.c Mon Oct 1 03:35:57 2007 +++ /sys/src/cmd/venti/srv/arena.c Mon Oct 1 03:35:56 2007 @@ -16,6 +16,7 @@ static CIBlock *getcib(Arena *arena, int clump, int writing, CIBlock *rock); static void putcib(Arena *arena, CIBlock *cib); static void sumproc(void *); +static void loadcig(Arena *arena); static QLock sumlock; static Rendez sumwait; @@ -65,7 +66,7 @@ } if(arena->diskstats.sealed && scorecmp(zeroscore, arena->score)==0) - backsumarena(arena); + sealarena(arena); return arena; } @@ -137,14 +138,23 @@ CIBlock *cib, r; int i; - for(i = 0; i < n; i++){ + /* + * because the clump blocks are laid out + * in reverse order at the end of the arena, + * it can be a few percent faster to read + * the clumps backwards, which reads the + * disk blocks forwards. + */ + for(i = n-1; i >= 0; i--){ cib = getcib(arena, clump + i, 0, &r); - if(cib == nil) - break; + if(cib == nil){ + n = i; + continue; + } unpackclumpinfo(&cis[i], &cib->data->data[cib->offset]); putcib(arena, cib); } - return i; + return n; } /* @@ -283,13 +293,12 @@ filling up and real errors writing the clump? */ u64int -writeaclump(Arena *arena, Clump *c, u8int *clbuf, u64int start, u64int *pa) +writeaclump(Arena *arena, Clump *c, u8int *clbuf) { DBlock *b; u64int a, aa; u32int clump, n, nn, m, off, blocksize; int ok; - AState as; n = c->info.size + ClumpSize + U32Size; qlock(&arena->lock); @@ -299,10 +308,6 @@ if(!arena->memstats.sealed){ logerr(EOk, "seal memstats %s", arena->name); arena->memstats.sealed = 1; - as.arena = arena; - as.aa = start+aa; - as.stats = arena->memstats; - setdcachestate(&as); } qunlock(&arena->lock); return TWID64; @@ -349,7 +354,28 @@ if(c->info.size < c->info.uncsize) arena->memstats.cclumps++; - clump = arena->memstats.clumps++; + clump = arena->memstats.clumps; + if(clump % ArenaCIGSize == 0){ + if(arena->cig == nil){ + loadcig(arena); + if(arena->cig == nil) + goto NoCIG; + } + /* add aa as start of next cig */ + if(clump/ArenaCIGSize != arena->ncig){ + fprint(2, "bad arena cig computation %s: writing clump %d but %d cigs\n", + arena->name, clump, arena->ncig); + arena->ncig = -1; + vtfree(arena->cig); + arena->cig = nil; + goto NoCIG; + } + arena->cig = vtrealloc(arena->cig, (arena->ncig+1)*sizeof arena->cig[0]); + arena->cig[arena->ncig++].offset = aa; + } +NoCIG: + arena->memstats.clumps++; + if(arena->memstats.clumps == 0) sysfatal("clumps wrapped"); arena->wtime = now(); @@ -359,14 +385,6 @@ writeclumpinfo(arena, clump, &c->info); wbarena(arena); - /* set up for call to setdcachestate */ - as.arena = arena; - as.aa = start+arena->memstats.used; - as.stats = arena->memstats; - - /* update this before calling setdcachestate so it cannot be behind dcache.diskstate */ - *pa = start+aa; - setdcachestate(&as); qunlock(&arena->lock); return aa; @@ -415,6 +433,7 @@ /* * Look up as->arena to find index. */ + needmainindex(); /* OS X linker */ ix = mainindex; for(i=0; inarenas; i++) if(ix->arenas[i] == as->arena) @@ -515,6 +534,7 @@ /* * read & sum all blocks except the last one */ + flushdcache(); memset(&s, 0, sizeof s); b = alloczblock(bs, 0, arena->part->blocksize); e = arena->base + arena->size; @@ -550,24 +570,19 @@ sha1(b->data, bs-VtScoreSize, nil, &s); sha1(zeroscore, VtScoreSize, nil, &s); sha1(nil, 0, score, &s); - + /* * check for no checksum or the same - * - * the writepart is okay because we flushed the dcache in sealarena */ - if(scorecmp(score, &b->data[bs - VtScoreSize]) != 0){ - if(scorecmp(zeroscore, &b->data[bs - VtScoreSize]) != 0) - logerr(EOk, "overwriting mismatched checksums for arena=%s, found=%V calculated=%V", - arena->name, &b->data[bs - VtScoreSize], score); - scorecp(&b->data[bs - VtScoreSize], score); - if(writepart(arena->part, e, b->data, bs) < 0) - logerr(EOk, "sumarena can't write sum for %s: %r", arena->name); - } + if(scorecmp(score, &b->data[bs - VtScoreSize]) != 0 + && scorecmp(zeroscore, &b->data[bs - VtScoreSize]) != 0) + logerr(EOk, "overwriting mismatched checksums for arena=%s, found=%V calculated=%V", + arena->name, &b->data[bs - VtScoreSize], score); freezblock(b); qlock(&arena->lock); scorecp(arena->score, score); + wbarena(arena); qunlock(&arena->lock); } @@ -586,6 +601,7 @@ } dirtydblock(b, DirtyArenaTrailer); bad = okarena(arena)<0 || packarena(arena, b->data)<0; + scorecp(b->data + arena->blocksize - VtScoreSize, arena->score); putdblock(b); if(bad) return -1; @@ -753,4 +769,158 @@ putdblock(cib->data); cib->data = nil; +} + + +/* + * For index entry readahead purposes, the arenas are + * broken into smaller subpieces, called clump info groups + * or cigs. Each cig has ArenaCIGSize clumps (ArenaCIGSize + * is chosen to make the index entries take up about half + * a megabyte). The index entries do not contain enough + * information to determine what the clump index is for + * a given address in an arena. That info is needed both for + * figuring out which clump group an address belongs to + * and for prefetching a clump group's index entries from + * the arena table of contents. The first time clump groups + * are accessed, we scan the entire arena table of contents + * (which might be 10s of megabytes), recording the data + * offset of each clump group. + */ + +/* + * load clump info group information by scanning entire toc. + */ +static void +loadcig(Arena *arena) +{ + u32int i, j, ncig, nci; + ArenaCIG *cig; + ClumpInfo *ci; + u64int offset; + int ms; + + if(arena->cig || arena->ncig < 0) + return; + +// fprint(2, "loadcig %s\n", arena->name); + + ncig = (arena->memstats.clumps+ArenaCIGSize-1) / ArenaCIGSize; + if(ncig == 0){ + arena->cig = vtmalloc(1); + arena->ncig = 0; + return; + } + + ms = msec(); + cig = vtmalloc(ncig*sizeof cig[0]); + ci = vtmalloc(ArenaCIGSize*sizeof ci[0]); + offset = 0; + for(i=0; i ncig = -1; + fprint(2, "loadcig %s: got %ud cigs, expected %ud\n", arena->name, i+1, ncig); + goto out; + } + } + } + vtfree(ci); + + arena->ncig = ncig; + arena->cig = cig; + +out: + ms = msec() - ms; + addstat2(StatCigLoad, 1, StatCigLoadTime, ms); +} + +/* + * convert arena address into arena group + data boundaries. + */ +int +arenatog(Arena *arena, u64int addr, u64int *gstart, u64int *glimit, int *g) +{ + int r, l, m; + + qlock(&arena->lock); + if(arena->cig == nil) + loadcig(arena); + if(arena->cig == nil || arena->ncig == 0){ + qunlock(&arena->lock); + return -1; + } + + l = 1; + r = arena->ncig - 1; + while(l <= r){ + m = (r + l) / 2; + if(arena->cig[m].offset <= addr) + l = m + 1; + else + r = m - 1; + } + l--; + + *g = l; + *gstart = arena->cig[l].offset; + if(l+1 < arena->ncig) + *glimit = arena->cig[l+1].offset; + else + *glimit = arena->memstats.used; + qunlock(&arena->lock); + return 0; +} + +/* + * load the clump info for group g into the index entries. + */ +int +asumload(Arena *arena, int g, IEntry *entries, int nentries) +{ + int i, base, limit; + u64int addr; + ClumpInfo ci; + IEntry *ie; + + if(nentries < ArenaCIGSize){ + fprint(2, "asking for too few entries\n"); + return -1; + } + + qlock(&arena->lock); + if(arena->cig == nil) + loadcig(arena); + if(arena->cig == nil || arena->ncig == 0 || g >= arena->ncig){ + qunlock(&arena->lock); + return -1; + } + + addr = 0; + base = g*ArenaCIGSize; + limit = base + ArenaCIGSize; + if(base > arena->memstats.clumps) + base = arena->memstats.clumps; + ie = entries; + for(i=base; i score, ci.score); + ie->ia.type = ci.type; + ie->ia.size = ci.uncsize; + ie->ia.blocks = (ci.size + ClumpSize + (1< > ABlockLog; + ie->ia.addr = addr; + ie++; + } + addr += ClumpSize + ci.size; + } + qunlock(&arena->lock); + return ie - entries; } --- /sys/src/cmd/venti/srv/dcache.c Mon Oct 1 03:35:57 2007 +++ /sys/src/cmd/venti/srv/dcache.c Mon Oct 1 03:35:57 2007 @@ -55,15 +55,6 @@ u8int *mem; /* memory for all block descriptors */ int ndirty; /* number of dirty blocks */ int maxdirty; /* max. number of dirty blocks */ - Channel *ra; - u8int *rabuf; - u32int ramax; - u32int rasize; - u64int raaddr; - Part *rapart; - - AState diskstate; - AState state; }; typedef struct Ra Ra; @@ -82,7 +73,6 @@ static void fixheap(int i, DBlock *b); static void flushproc(void*); static void writeproc(void*); -static void raproc(void*); void initdcache(u32int mem) @@ -109,7 +99,6 @@ dcache.blocks = MKNZ(DBlock, nblocks); dcache.write = MKNZ(DBlock*, nblocks); dcache.mem = MKNZ(u8int, (nblocks+1+128) * blocksize); - dcache.ra = chancreate(sizeof(Ra), 0); last = nil; p = (u8int*)(((ulong)dcache.mem+blocksize-1)&~(ulong)(blocksize-1)); @@ -121,10 +110,6 @@ b->next = last; last = b; } - dcache.rabuf = &p[i*blocksize]; - dcache.ramax = 128*blocksize; - dcache.raaddr = 0; - dcache.rapart = nil; dcache.free = last; dcache.nheap = 0; @@ -133,136 +118,6 @@ vtproc(flushproc, nil); vtproc(delaykickroundproc, &dcache.round); - vtproc(raproc, nil); -} - -void -setdcachestate(AState *a) -{ - trace(TraceBlock, "setdcachestate %s 0x%llux clumps %d", a->arena ? a->arena->name : nil, a->aa, a->stats.clumps); - qlock(&dcache.lock); - dcache.state = *a; - qunlock(&dcache.lock); -} - -AState -diskstate(void) -{ - AState a; - - qlock(&dcache.lock); - a = dcache.diskstate; - qunlock(&dcache.lock); - return a; -} - -static void -raproc(void *v) -{ - Ra ra; - DBlock *b; - - USED(v); - while(recv(dcache.ra, &ra) == 1){ - if(ra.part->size <= ra.addr) - continue; - b = _getdblock(ra.part, ra.addr, OREAD, 2); - putdblock(b); - } -} - -/* - * We do readahead a whole arena at a time now, - * so dreadahead is a no-op. The original implementation - * is in unused_dreadahead below. - */ -void -dreadahead(Part *part, u64int addr, int miss) -{ - USED(part); - USED(addr); - USED(miss); -} - -void -unused_dreadahead(Part *part, u64int addr, int miss) -{ - Ra ra; - static struct { - Part *part; - u64int addr; - } lastmiss; - static struct { - Part *part; - u64int addr; - int dir; - } lastra; - - if(miss){ - if(lastmiss.part==part && lastmiss.addr==addr-dcache.size){ - XRa: - lastra.part = part; - lastra.dir = addr-lastmiss.addr; - lastra.addr = addr+lastra.dir; - ra.part = part; - ra.addr = lastra.addr; - nbsend(dcache.ra, &ra); - }else if(lastmiss.part==part && lastmiss.addr==addr+dcache.size){ - addr -= dcache.size; - goto XRa; - } - }else{ - if(lastra.part==part && lastra.addr==addr){ - lastra.addr += lastra.dir; - ra.part = part; - ra.addr = lastra.addr; - nbsend(dcache.ra, &ra); - } - } - - if(miss){ - lastmiss.part = part; - lastmiss.addr = addr; - } -} - -int -rareadpart(Part *part, u64int addr, u8int *buf, uint n, int load) -{ - uint nn; - static RWLock ralock; - - rlock(&ralock); - if(dcache.rapart==part && dcache.raaddr <= addr && addr+n <= dcache.raaddr+dcache.rasize){ - memmove(buf, dcache.rabuf+(addr-dcache.raaddr), n); - runlock(&ralock); - return 0; - } - if(load != 2 || addr >= part->size){ /* addr >= part->size: let readpart do the error */ - runlock(&ralock); - diskaccess(0); - return readpart(part, addr, buf, n); - } - - runlock(&ralock); - wlock(&ralock); -fprint(2, "raread %s %llx\n", part->name, addr); - nn = dcache.ramax; - if(addr+nn > part->size) - nn = part->size - addr; - diskaccess(0); - if(readpart(part, addr, dcache.rabuf, nn) < 0){ - wunlock(&ralock); - return -1; - } - memmove(buf, dcache.rabuf, n); - dcache.rapart = part; - dcache.rasize = nn; - dcache.raaddr = addr; - wunlock(&ralock); - - addstat(StatApartReadBytes, nn-n); - return 0; } static u32int @@ -313,16 +168,8 @@ again: for(b = dcache.heads[h]; b != nil; b = b->next){ if(b->part == part && b->addr == addr){ - /* - qlock(&stats.lock); - stats.pchit++; - qunlock(&stats.lock); - */ - if(load){ + if(load) addstat(StatDcacheHit, 1); - if(load != 2 && mode != OWRITE) - dreadahead(part, b->addr, 0); - } goto found; } } @@ -367,8 +214,6 @@ b->addr = addr; b->part = part; b->size = 0; - if(load != 2 && mode != OWRITE) - dreadahead(part, b->addr, 1); found: b->ref++; @@ -405,7 +250,8 @@ memset(&b->data[b->size], 0, size - b->size); else{ trace(TraceBlock, "getdblock readpart %s 0x%llux", part->name, addr); - if(rareadpart(part, addr + b->size, &b->data[b->size], size - b->size, load) < 0){ + diskaccess(0); + if(readpart(part, addr + b->size, &b->data[b->size], size - b->size) < 0){ b->mode = ORDWR; /* so putdblock wunlocks */ putdblock(b); return nil; @@ -768,7 +614,6 @@ int i, j, n; ulong t0; DBlock *b, **write; - AState as; USED(v); threadsetname("flushproc"); @@ -779,10 +624,6 @@ t0 = nsec()/1000; trace(TraceProc, "build t=%lud", (ulong)(nsec()/1000)-t0); - qlock(&dcache.lock); - as = dcache.state; - qunlock(&dcache.lock); - write = dcache.write; n = 0; for(i=0; i writing); for(i = ix->mapalloc; i < ix->narenas; i++){ - a = writeaclump(ix->arenas[i], c, clbuf, ix->amap[i].start, pa); + a = writeaclump(ix->arenas[i], c, clbuf); if(a != TWID64){ - ix->mapalloc = i; /* assuming write is atomic, race is okay */ + ix->mapalloc = i; + ia.addr = ix->amap[i].start + a; + ia.type = c->info.type; + ia.size = c->info.uncsize; + ia.blocks = (c->info.size + ClumpSize + (1< > ABlockLog; + as.arena = ix->arenas[i]; + as.aa = ia.addr; + as.stats = as.arena->memstats; + insertscore(c->info.score, &ia, IEDirty, &as); + qunlock(&ix->writing); trace(TraceLump, "writeiclump exit"); - return a; + return ia.addr; } } + qunlock(&ix->writing); seterr(EAdmin, "no space left in arenas"); trace(TraceLump, "writeiclump failed"); @@ -594,6 +607,25 @@ } *aa = a - ix->amap[l].start; return ix->arenas[l]; +} + +/* + * convert an arena index to the bounds of the containing arena group. + */ +Arena* +amapitoag(Index *ix, u64int a, u64int *gstart, u64int *glimit, int *g) +{ + u64int aa; + Arena *arena; + + arena = amapitoa(ix, a, &aa); + if(arena == nil) + return nil; + if(arenatog(arena, aa, gstart, glimit, g) < 0) + return nil; + *gstart += a - aa; + *glimit += a - aa; + return arena; } int --- /sys/src/cmd/venti/srv/icache.c Mon Oct 1 03:35:59 2007 +++ /sys/src/cmd/venti/srv/icache.c Mon Oct 1 03:35:59 2007 @@ -2,236 +2,429 @@ #include "dat.h" #include "fns.h" +int icacheprefetch = 1; + typedef struct ICache ICache; +typedef struct IHash IHash; +typedef struct ISum ISum; + struct ICache { - QLock lock; /* locks hash table & all associated data */ + QLock lock; Rendez full; - IEntry **heads; /* heads of all the hash chains */ - int bits; /* bits to use for indexing heads */ - u32int size; /* number of heads; == 1 << bits, should be < entries */ - IEntry *base; /* all allocated hash table entries */ - IEntry *free; - u32int entries; /* elements in base */ - IEntry *dirty; /* chain of dirty elements */ - u32int ndirty; + IHash *hash; + IEntry *entries; + int nentries; + IEntry free; + IEntry clean; + IEntry dirty; u32int maxdirty; - u32int unused; /* index of first unused element in base */ - u32int stolen; /* last head from which an element was stolen */ + u32int ndirty; + AState as; - Arena *last[4]; - Arena *lastload; - int nlast; + ISum **sum; + int nsum; + IHash *shash; + IEntry *sentries; + int nsentries; }; -int icacheprefetch = 0; /* interferes with playing music via vacfs */ - static ICache icache; -static IEntry *icachealloc(IAddr *ia, u8int *score); - /* - * bits is the number of bits in the icache hash table - * depth is the average depth - * memory usage is about (1< table[0])); + ih->table = (IEntry**)(ih+1); + ih->bits = bits; + ih->size = size; + return ih; } -u32int -hashbits(u8int *sc, int bits) +static IEntry* +ihashlookup(IHash *ih, u8int score[VtScoreSize], int type) { - u32int v; - - v = (sc[0] << 24) | (sc[1] << 16) | (sc[2] << 8) | sc[3]; - if(bits < 32) - v >>= (32 - bits); - return v; + u32int h; + IEntry *ie; + + h = hashbits(score, ih->bits); + for(ie=ih->table[h]; ie; ie=ie->nexthash) + if((type == -1 || type == ie->ia.type) && scorecmp(score, ie->score) == 0) + return ie; + return nil; } static void -loadarenaclumps(Arena *arena, u64int aa) +ihashdelete(IHash *ih, IEntry *ie, char *what) { - ulong i; - ClumpInfo ci; - IAddr ia; - - for(i=0; i memstats.clumps; i++){ - if(readclumpinfo(arena, i, &ci) < 0) - break; - ia.type = ci.type; - ia.size = ci.uncsize; - ia.blocks = (ci.size + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog; - ia.addr = aa; - aa += ClumpSize + ci.size; - if(ia.type != VtCorruptType) - insertscore(ci.score, &ia, 0); - } + u32int h; + IEntry **l; + + h = hashbits(ie->score, ih->bits); + for(l=&ih->table[h]; *l; l=&(*l)->nexthash) + if(*l == ie){ + *l = ie->nexthash; + return; + } + fprint(2, "warning: %s %V not found in ihashdelete\n", what, ie->score); } -int -_lookupscore(u8int *score, int type, IAddr *ia, int *rac) +static void +ihashinsert(IHash *ih, IEntry *ie) { u32int h; - IEntry *ie, *last; - - qlock(&icache.lock); - h = hashbits(score, icache.bits); - last = nil; - for(ie = icache.heads[h]; ie != nil; ie = ie->next){ - if((ie->ia.type == type || type == -1) && scorecmp(ie->score, score)==0){ - if(last != nil) - last->next = ie->next; - else - icache.heads[h] = ie->next; - addstat(StatIcacheHit, 1); - if(rac) - ie->rac = 1; - trace(TraceLump, "lookupscore incache"); - ie->next = icache.heads[h]; - icache.heads[h] = ie; - - *ia = ie->ia; - if(rac) - *rac = ie->rac; - qunlock(&icache.lock); - return 0; - } - last = ie; - } - addstat(StatIcacheMiss, 1); - qunlock(&icache.lock); - return -1; + + h = hashbits(ie->score, ih->bits); + ie->nexthash = ih->table[h]; + ih->table[h] = ie; } /* -ZZZ need to think about evicting the correct IEntry, -and writing back the wtime. - * look up data score in the index cache - * if this fails, pull it in from the disk index table, if it exists. - * - * must be called with the lump for this score locked + * IEntry lists. */ -int -lookupscore(u8int *score, int type, IAddr *ia, int *rac) + +static IEntry* +popout(IEntry *ie) { - IEntry d, *ie; - u32int h; - u64int aa; - Arena *load; - int i, ret; - uint ms; + if(ie->prev == nil && ie->next == nil) + return ie; + ie->prev->next = ie->next; + ie->next->prev = ie->prev; + ie->next = nil; + ie->prev = nil; + return ie; +} - aa = 0; - ms = msec(); - - trace(TraceLump, "lookupscore %V.%d", score, type); +static IEntry* +poplast(IEntry *list) +{ + if(list->prev == list) + return nil; + return popout(list->prev); +} + +static IEntry* +pushfirst(IEntry *list, IEntry *ie) +{ + popout(ie); + ie->prev = list; + ie->next = list->next; + ie->prev->next = ie; + ie->next->prev = ie; + return ie; +} - ret = 0; - if(_lookupscore(score, type, ia, rac) < 0){ - if(loadientry(mainindex, score, type, &d) < 0){ - ret = -1; - goto out; - } +/* + * Arena summary cache. + */ +struct ISum +{ + QLock lock; + IEntry *entries; + int nentries; + int loaded; + u64int addr; + u64int limit; + Arena *arena; + int g; +}; + +static ISum* +scachelookup(u64int addr) +{ + int i; + ISum *s; - /* failed in cache but found on disk - fill cache. */ - trace(TraceLump, "lookupscore loaded"); - addstat(StatIcacheFill, 1); - - /* - * no one else can load an entry for this score, - * since we have this score's lump's lock. - */ - qlock(&icache.lock); - - /* - * If we notice that all the hits are coming from one arena, - * load the table of contents for that arena into the cache. - */ - load = nil; - h = hashbits(score, icache.bits); - ie = icachealloc(&d.ia, score); - if(icacheprefetch){ - icache.last[icache.nlast++%nelem(icache.last)] = amapitoa(mainindex, ie->ia.addr, &aa); - aa = ie->ia.addr - aa; /* compute base addr of arena */ - for(i=0; i addr <= addr && addr < s->limit){ + if(i > 0){ + memmove(icache.sum+1, icache.sum, i*sizeof icache.sum[0]); + icache.sum[0] = s; } + return s; } + } + return nil; +} + +static void +sumclear(ISum *s) +{ + int i; + + for(i=0; i nentries; i++) + ihashdelete(icache.shash, &s->entries[i], "scache"); + s->nentries = 0; + s->loaded = 0; + s->addr = 0; + s->limit = 0; + s->arena = nil; + s->g = 0; +} + +static ISum* +scacheevict(void) +{ + ISum *s; + int i; - ie->next = icache.heads[h]; - icache.heads[h] = ie; - - *ia = ie->ia; - *rac = ie->rac; - - qunlock(&icache.lock); - if(load){ - trace(TraceProc, "preload 0x%llux", aa); - loadarenaclumps(load, aa); + for(i=icache.nsum-1; i>=0; i--){ + s = icache.sum[i]; + if(canqlock(&s->lock)){ + if(i > 0){ + memmove(icache.sum+1, icache.sum, i*sizeof icache.sum[0]); + icache.sum[0] = s; + } + sumclear(s); + return s; } } + return nil; +} -out: - ms = msec() - ms; - addstat2(StatIcacheRead, 1, StatIcacheReadTime, ms); +static void +scachehit(u64int addr) +{ + scachelookup(addr); /* for move-to-front */ +} - return ret; +static void +scachesetup(ISum *s, u64int addr) +{ + u64int addr0, limit; + int g; + + s->arena = amapitoag(mainindex, addr, &addr0, &limit, &g); + s->addr = addr0; + s->limit = limit; + s->g = g; +} + +static void +scacheload(ISum *s) +{ + int i, n; + + s->loaded = 1; + n = asumload(s->arena, s->g, s->entries, ArenaCIGSize); + /* + * n can be less then ArenaCIGSize, either if the clump group + * is the last in the arena and is only partially filled, or if there + * are corrupt clumps in the group -- those are not returned. + */ + for(i=0; i entries[i].ia.addr += s->addr; + ihashinsert(icache.shash, &s->entries[i]); + } +//fprint(2, "%T scacheload %s %d - %d entries\n", s->arena->name, s->g, n); + addstat(StatScachePrefetch, n); + s->nentries = n; +} + +static ISum* +scachemiss(u64int addr) +{ + ISum *s; + + s = scachelookup(addr); + if(s == nil){ + /* first time: make an entry in the cache but don't populate it yet */ + s = scacheevict(); + if(s == nil) + return nil; + scachesetup(s, addr); + qunlock(&s->lock); + return nil; + } + + /* second time: load from disk */ + qlock(&s->lock); + if(s->loaded || !icacheprefetch){ + qunlock(&s->lock); + return nil; + } + + return s; /* locked */ } /* - * insert a new element in the hash table. + * Index cache. */ -int -insertscore(u8int *score, IAddr *ia, int write) + +void +initicache(u32int mem0) { - IEntry *ie, se; - u32int h; + u32int mem; + int i, entries, scache; + + icache.full.l = &icache.lock; - trace(TraceLump, "insertscore enter"); - if(write) - addstat(StatIcacheWrite, 1); - else - addstat(StatIcachePrefetch, 1); + mem = mem0; + entries = mem / (sizeof(IEntry)+sizeof(IEntry*)); + scache = (entries/8) / ArenaCIGSize; + entries -= entries/8; + if(scache < 4) + scache = 4; + if(scache > 16) + scache = 16; + if(entries < 1000) + entries = 1000; +fprint(2, "icache %,d bytes = %,d entries; %d scache\n", mem0, entries, scache); + + icache.clean.prev = icache.clean.next = &icache.clean; + icache.dirty.prev = icache.dirty.next = &icache.dirty; + icache.free.prev = icache.free.next = &icache.free; + + icache.hash = mkihash(entries); + icache.nentries = entries; + setstat(StatIcacheSize, entries); + icache.entries = vtmallocz(entries*sizeof icache.entries[0]); + icache.maxdirty = entries / 2; + for(i=0; i entries = icache.sentries + i*ArenaCIGSize; + } +} - qlock(&icache.lock); - h = hashbits(score, icache.bits); - ie = icachealloc(ia, score); - if(write){ +static IEntry* +evictlru(void) +{ + IEntry *ie; + + ie = poplast(&icache.clean); + if(ie == nil) + return nil; + ihashdelete(icache.hash, ie, "evictlru"); + return ie; +} + +static void +icacheinsert(u8int score[VtScoreSize], IAddr *ia, int state) +{ + IEntry *ie; + + if((ie = poplast(&icache.free)) == nil && (ie = evictlru()) == nil){ + addstat(StatIcacheStall, 1); + while((ie = poplast(&icache.free)) == nil && (ie = evictlru()) == nil){ + // Could safely return here if state == IEClean. + // But if state == IEDirty, have to wait to make + // sure we don't lose an index write. + // Let's wait all the time. + flushdcache(); + kickicache(); + rsleep(&icache.full); + } + addstat(StatIcacheStall, -1); + } + + memmove(ie->score, score, VtScoreSize); + ie->state = state; + ie->ia = *ia; + if(state == IEClean){ + addstat(StatIcachePrefetch, 1); + pushfirst(&icache.clean, ie); + }else{ + addstat(StatIcacheWrite, 1); + assert(state == IEDirty); icache.ndirty++; setstat(StatIcacheDirty, icache.ndirty); delaykickicache(); - ie->dirty = 1; + pushfirst(&icache.dirty, ie); } - ie->next = icache.heads[h]; - icache.heads[h] = ie; + ihashinsert(icache.hash, ie); +} + +int +icachelookup(u8int score[VtScoreSize], int type, IAddr *ia) +{ + IEntry *ie; - se = *ie; + qlock(&icache.lock); + addstat(StatIcacheLookup, 1); + if((ie = ihashlookup(icache.hash, score, type)) != nil){ + *ia = ie->ia; + if(ie->state == IEClean) + pushfirst(&icache.clean, ie); + addstat(StatIcacheHit, 1); + qunlock(&icache.lock); + return 0; + } + + if((ie = ihashlookup(icache.shash, score, type)) != nil){ + *ia = ie->ia; + icacheinsert(score, &ie->ia, IEClean); + scachehit(ie->ia.addr); + addstat(StatScacheHit, 1); + qunlock(&icache.lock); + return 0; + } + addstat(StatIcacheMiss, 1); qunlock(&icache.lock); - if(write && icache.ndirty >= icache.maxdirty) + return -1; +} + +int +insertscore(u8int score[VtScoreSize], IAddr *ia, int state, AState *as) +{ + ISum *toload; + + qlock(&icache.lock); + icacheinsert(score, ia, state); + if(state == IEClean) + toload = scachemiss(ia->addr); + else{ + assert(state == IEDirty); + toload = nil; + if(as == nil) + fprint(2, "%T insertscore IEDirty without as; called from %lux\n", getcallerpc(&score)); + else{ + if(icache.as.aa > as->aa) + fprint(2, "%T insertscore: aa moving backward: %#llux -> %#llux\n", icache.as.aa, as->aa); + icache.as = *as; + } + } + qunlock(&icache.lock); + if(toload){ + scacheload(toload); + qunlock(&toload->lock); + } + + if(icache.ndirty >= icache.maxdirty) kickicache(); /* @@ -240,125 +433,81 @@ * the lump, meaning any searches for this block * will hit in the lump cache until after we return. */ - markbloomfilter(mainindex->bloom, score); + if(state == IEDirty) + markbloomfilter(mainindex->bloom, score); return 0; } -/* - * allocate a index cache entry which hasn't been used in a while. - * must be called with icache.lock locked - * if the score is already in the table, update the entry. - */ -static IEntry * -icachealloc(IAddr *ia, u8int *score) +static int +lookupscore_untimed(u8int score[VtScoreSize], int type, IAddr *ia) { - int i; - IEntry *ie, *last, *clean, *lastclean; - u32int h; + IEntry d; - h = hashbits(score, icache.bits); - last = nil; - for(ie = icache.heads[h]; ie != nil; ie = ie->next){ - if(ie->ia.type == ia->type && scorecmp(ie->score, score)==0){ - if(last != nil) - last->next = ie->next; - else - icache.heads[h] = ie->next; - trace(TraceLump, "icachealloc hit"); - ie->rac = 1; - return ie; - } - last = ie; - } + if(icachelookup(score, type, ia) >= 0) + return 0; - h = icache.unused; - if(h < icache.entries){ - ie = &icache.base[h++]; - icache.unused = h; - trace(TraceLump, "icachealloc unused"); - goto Found; - } - - if((ie = icache.free) != nil){ - icache.free = ie->next; - goto Found; - } - - h = icache.stolen; - for(i=0;; i++){ - h++; - if(h >= icache.size) - h = 0; - if(i == icache.size){ - trace(TraceLump, "icachealloc sleep"); - addstat(StatIcacheStall, 1); - while(icache.ndirty == icache.entries){ - /* - * This is a bit suspect. Kickicache will wake up the - * icachewritecoord, but if all the index entries are for - * unflushed disk blocks, icachewritecoord won't be - * able to do much. It always rewakes everyone when - * it thinks it is done, though, so at least we'll go around - * the while loop again. Also, if icachewritecoord sees - * that the disk state hasn't change at all since the last - * time around, it kicks the disk. This needs to be - * rethought, but it shouldn't deadlock anymore. - */ - kickicache(); - rsleep(&icache.full); - } - addstat(StatIcacheStall, -1); - i = 0; - } - lastclean = nil; - clean = nil; - last = nil; - for(ie=icache.heads[h]; ie; last=ie, ie=ie->next){ - if(!ie->dirty){ - clean = ie; - lastclean = last; - } - } - if(clean){ - if(lastclean) - lastclean->next = clean->next; - else - icache.heads[h] = clean->next; - clean->next = nil; - icache.stolen = h; - ie = clean; - trace(TraceLump, "icachealloc steal"); - goto Found; - } - } + addstat(StatIcacheFill, 1); + if(loadientry(mainindex, score, type, &d) < 0) + return -1; + + insertscore(score, &d.ia, IEClean, nil); + *ia = d.ia; + return 0; +} -Found: - ie->ia = *ia; - scorecp(ie->score, score); - ie->rac = 0; - return ie; +int +lookupscore(u8int score[VtScoreSize], int type, IAddr *ia) +{ + int ms, ret; + + ms = msec(); + ret = lookupscore_untimed(score, type, ia); + ms = msec() - ms; + addstat2(StatIcacheRead, 1, StatIcacheReadTime, ms); + return ret; } + +u32int +hashbits(u8int *sc, int bits) +{ + u32int v; + v = (sc[0] << 24) | (sc[1] << 16) | (sc[2] << 8) | sc[3]; + if(bits < 32) + v >>= (32 - bits); + return v; +} + +ulong +icachedirtyfrac(void) +{ + return (vlong)icache.ndirty*IcacheFrac / icache.nentries; +} + +/* + * Return a singly-linked list of dirty index entries. + * with 32-bit hash numbers between lo and hi + * and address < limit. + */ IEntry* icachedirty(u32int lo, u32int hi, u64int limit) { - int i; u32int h; IEntry *ie, *dirty; dirty = nil; trace(TraceProc, "icachedirty enter"); qlock(&icache.lock); - for(i=0; i next) - if(ie->dirty && ie->ia.addr != 0 && ie->ia.addr < limit){ + for(ie = icache.dirty.next; ie != &icache.dirty; ie=ie->next){ + if(ie->state == IEDirty && ie->ia.addr < limit){ h = hashbits(ie->score, 32); if(lo <= h && h <= hi){ ie->nextdirty = dirty; dirty = ie; } } + } qunlock(&icache.lock); trace(TraceProc, "icachedirty exit"); if(dirty == nil) @@ -366,36 +515,59 @@ return dirty; } +AState +icachestate(void) +{ + AState as; + + qlock(&icache.lock); + as = icache.as; + qunlock(&icache.lock); + return as; +} + +/* + * The singly-linked non-circular list of index entries ie + * has been written to disk. Move them to the clean list. + */ void icacheclean(IEntry *ie) { - trace(TraceProc, "icachedirty enter"); + IEntry *next; + + trace(TraceProc, "icacheclean enter"); qlock(&icache.lock); - for(; ie; ie=ie->nextdirty){ + for(; ie; ie=next){ + assert(ie->state == IEDirty); + next = ie->nextdirty; + ie->nextdirty = nil; + popout(ie); /* from icache.dirty */ icache.ndirty--; - ie->dirty = 0; + ie->state = IEClean; + pushfirst(&icache.clean, ie); } setstat(StatIcacheDirty, icache.ndirty); rwakeupall(&icache.full); qunlock(&icache.lock); - trace(TraceProc, "icachedirty exit"); + trace(TraceProc, "icacheclean exit"); } void emptyicache(void) { int i; - IEntry *ie, **lie; + IEntry *ie; + ISum *s; qlock(&icache.lock); - for(i=0; i dirty == 0){ - *lie = ie->next; - ie->next = icache.free; - icache.free = ie; - }else - lie = &ie->next; + while((ie = evictlru()) != nil) + pushfirst(&icache.free, ie); + for(i=0; i lock); + sumclear(s); + qunlock(&s->lock); } qunlock(&icache.lock); } + --- /sys/src/cmd/venti/srv/httpd.c Mon Oct 1 03:36:01 2007 +++ /sys/src/cmd/venti/srv/httpd.c Mon Oct 1 03:36:00 2007 @@ -565,11 +565,11 @@ if(scorecmp(zeroscore, arena->score) != 0) hprint(hout, "\tscore=%V\n", arena->score); - hprint(hout, "\tmem: clumps=%d compressed clumps=%d data=%,lld compressed data=%,lld storage=%,lld\n", + hprint(hout, "\twritten: clumps=%d compressed clumps=%d data=%,lld compressed data=%,lld storage=%,lld\n", arena->memstats.clumps, arena->memstats.cclumps, arena->memstats.uncsize, arena->memstats.used - arena->memstats.clumps * ClumpSize, arena->memstats.used + arena->memstats.clumps * ClumpInfoSize); - hprint(hout, "\tdisk: clumps=%d compressed clumps=%d data=%,lld compressed data=%,lld storage=%,lld\n", + hprint(hout, "\tindexed: clumps=%d compressed clumps=%d data=%,lld compressed data=%,lld storage=%,lld\n", arena->diskstats.clumps, arena->diskstats.cclumps, arena->diskstats.uncsize, arena->diskstats.used - arena->diskstats.clumps * ClumpSize, arena->diskstats.used + arena->diskstats.clumps * ClumpInfoSize); @@ -895,7 +895,7 @@ "icachehit", "icachemiss", - "icachelookup", + "icacheread", "icachewrite", "icachefill", "icacheprefetch", @@ -904,6 +904,9 @@ "icacheflush", "icachestall", "icachelookuptime", + "icachelookup", + "scachehit", + "scacheprefetch", "bloomhit", "bloommiss", @@ -925,6 +928,9 @@ "sumread", "sumreadbyte", + + "cigload", + "cigloadtime", }; static int --- /sys/src/cmd/venti/srv/checkarenas.c Mon Oct 1 03:36:01 2007 +++ /sys/src/cmd/venti/srv/checkarenas.c Mon Oct 1 03:36:01 2007 @@ -24,7 +24,7 @@ err = 0; for(;;){ - e = syncarena(arena, 0, 1000, 0, fix); + e = syncarena(arena, 1000, 0, fix); err |= e; if(!(e & SyncHeader)) break; --- /sys/src/cmd/venti/srv/syncarena.c Mon Oct 1 03:36:02 2007 +++ /sys/src/cmd/venti/srv/syncarena.c Mon Oct 1 03:36:02 2007 @@ -25,7 +25,7 @@ * returns 0 if ok, flags if error occurred */ int -syncarena(Arena *arena, u64int start, u32int n, int zok, int fix) +syncarena(Arena *arena, u32int n, int zok, int fix) { ZBlock *lump; Clump cl; @@ -53,7 +53,7 @@ fprint(2, "%s: illegal clump magic number=%#8.8ux at clump=%d\n", arena->name, magic, clump); /* err |= SyncDataErr; */ if(fix && writeclumpmagic(arena, aa, ClumpFreeMagic) < 0){ - fprint(2, "can't write corrected clump free magic: %r"); + fprint(2, "%s: can't write corrected clump free magic: %r", arena->name); err |= SyncFixErr; } break; @@ -136,9 +136,8 @@ || cclumps != arena->memstats.cclumps || uncsize != arena->memstats.uncsize){ err |= SyncHeader; - fprint(2, "arena %s: start=%lld fix=%d flush=%d %lld->%lld %ud->%ud %ud->%ud %lld->%lld\n", + fprint(2, "arena %s: fix=%d flush=%d %lld->%lld %ud->%ud %ud->%ud %lld->%lld\n", arena->name, - start, fix, flush, used, arena->memstats.used, --- /sys/src/cmd/venti/srv/buildindex.c Mon Oct 1 03:36:03 2007 +++ /sys/src/cmd/venti/srv/buildindex.c Mon Oct 1 03:36:02 2007 @@ -36,18 +36,19 @@ void usage(void) { - fprint(2, "usage: buildindex [-bd] [-i isect]... [-M imem] venti.conf\n"); + fprint(2, "usage: buildindex [-b] [-i isect]... [-M imem] venti.conf\n"); threadexitsall("usage"); } void threadmain(int argc, char *argv[]) { - int fd, i, napart; + int fd, i, napart, nfinish, maxdisks; u32int bcmem, imem; Config conf; Part *p; + maxdisks = 100000; ventifmtinstall(); imem = 256*1024*1024; ARGBEGIN{ @@ -64,6 +65,9 @@ case 'M': imem = unittoull(EARGF(usage())); break; + case 'm': /* temporary - might go away */ + maxdisks = atoi(EARGF(usage())); + break; default: usage(); break; @@ -132,17 +136,21 @@ /* start arena procs */ p = nil; napart = 0; + nfinish = 0; arenadonechan = chancreate(sizeof(void*), 0); for(i=0; i narenas; i++){ if(ix->arenas[i]->part != p){ p = ix->arenas[i]->part; vtproc(arenapartproc, p); - napart++; + if(++napart >= maxdisks){ + recvp(arenadonechan); + nfinish++; + } } } /* wait for arena procs to finish */ - for(i=0; i memstats.clumps) fprint(2, "%T arena %s: %d entries\n", a->name, a->memstats.clumps); - addr = ix->amap[i].start; - for(clump=0; clump memstats.clumps; clump+=n){ + /* + * Running the loop backwards accesses the + * clump info blocks forwards, since they are + * stored in reverse order at the end of the arena. + * This speeds things slightly. + */ + addr = ix->amap[i].start + a->memstats.used; + for(clump=a->memstats.clumps; clump > 0; clump-=n){ n = ClumpChunks; - if(n > a->memstats.clumps - clump) - n = a->memstats.clumps - clump; - if(readclumpinfos(a, clump, cis, n) != n){ + if(n > clump) + n = clump; + if(readclumpinfos(a, clump-n, cis, n) != n){ fprint(2, "%T arena %s: directory read: %r\n", a->name); errors = 1; break; } - for(j=0; j =0; j--){ ci = &cis[j]; ie.ia.type = ci->type; ie.ia.size = ci->uncsize; + addr -= ci->size + ClumpSize; ie.ia.addr = addr; - addr += ci->size + ClumpSize; ie.ia.blocks = (ci->size + ClumpSize + (1< > ABlockLog; scorecp(ie.score, ci->score); if(ci->type == VtCorruptType) @@ -253,6 +267,8 @@ } } } + if(addr != ix->amap[i].start) + fprint(2, "%T arena %s: clump miscalculation %lld != %lld\n", a->name, addr, ix->amap[i].start); } add(&arenaentries, tot); add(&skipentries, nskip); --- /sys/src/cmd/venti/srv/venti.c Mon Oct 1 03:36:04 2007 +++ /sys/src/cmd/venti/srv/venti.c Mon Oct 1 03:36:03 2007 @@ -106,9 +106,6 @@ if(configfile == nil) configfile = "venti.conf"; - if(initarenasum() < 0) - fprint(2, "warning: can't initialize arena summing process: %r"); - fprint(2, "conf..."); if(initventi(configfile, &config) < 0) sysfatal("can't init server: %r"); @@ -146,13 +143,7 @@ mem, mem / (8 * 1024)); initlumpcache(mem, mem / (8 * 1024)); - icmem = u64log2(icmem / (sizeof(IEntry)+sizeof(IEntry*)) / ICacheDepth); - if(icmem < 4) - icmem = 4; - if(0) fprint(2, "initialize %d bytes of index cache for %d index entries\n", - (sizeof(IEntry)+sizeof(IEntry*)) * (1 << icmem) * ICacheDepth, - (1 << icmem) * ICacheDepth); - initicache(icmem, ICacheDepth); + initicache(icmem); initicachewrite(); /* @@ -170,7 +161,7 @@ startbloomproc(mainindex->bloom); fprint(2, "sync..."); - if(!readonly && syncindex(mainindex, 1, 0, 0) < 0) + if(!readonly && syncindex(mainindex) < 0) sysfatal("can't sync server: %r"); if(!readonly && queuewrites){ @@ -181,6 +172,9 @@ queuewrites = 0; } } + + if(initarenasum() < 0) + fprint(2, "warning: can't initialize arena summing process: %r"); fprint(2, "announce %s...", vaddr); ventisrv = vtlisten(vaddr); --- /sys/src/cmd/venti/srv/fns.h Mon Oct 1 03:36:04 2007 +++ /sys/src/cmd/venti/srv/fns.h Mon Oct 1 03:36:04 2007 @@ -6,8 +6,11 @@ void addstat2(int, int, int, int); ZBlock *alloczblock(u32int size, int zeroed, uint alignment); Arena *amapitoa(Index *index, u64int a, u64int *aa); +Arena *amapitoag(Index *index, u64int a, u64int *gstart, u64int *glimit, int *g); u64int arenadirsize(Arena *arena, u32int clumps); +int arenatog(Arena *arena, u64int aa, u64int *gstart, u64int *glimit, int *g); void arenaupdate(Arena *arena, u32int size, u8int *score); +int asumload(Arena *arena, int g, IEntry *entries, int maxentries); void backsumarena(Arena *arena); void binstats(long (*fn)(Stats *s0, Stats *s1, void*), void *arg, long t0, long t1, Statbin *bin, int nbin); int bloominit(Bloom*, vlong, uchar*); @@ -26,7 +29,6 @@ void dirtydblock(DBlock*, int); void diskaccess(int); void disksched(void); -AState diskstate(void); void *emalloc(ulong); void emptydcache(void); void emptyicache(void); @@ -64,6 +66,8 @@ IEntry* icachedirty(u32int, u32int, u64int); ulong icachedirtyfrac(void); void icacheclean(IEntry*); +int icachelookup(u8int *score, int type, IAddr *ia); +AState icachestate(void); int ientrycmp(const void *vie1, const void *vie2); char *ifileline(IFile *f); int ifilename(IFile *f, char *dst); @@ -76,7 +80,7 @@ int initarenasum(void); void initbloomfilter(Index*); void initdcache(u32int mem); -void initicache(int bits, int depth); +void initicache(u32int mem); void initicachewrite(void); IEStream *initiestream(Part *part, u64int off, u64int clumps, u32int size); ISect *initisect(Part *part); @@ -87,7 +91,7 @@ void initround(Round*, char*, int); int initventi(char *config, Config *conf); void insertlump(Lump *lump, Packet *p); -int insertscore(u8int *score, IAddr *ia, int write); +int insertscore(u8int *score, IAddr *ia, int state, AState *as); void kickdcache(void); void kickicache(void); void kickround(Round*, int wait); @@ -97,14 +101,14 @@ int loadientry(Index *index, u8int *score, int type, IEntry *ie); void logerr(int severity, char *fmt, ...); Lump *lookuplump(u8int *score, int type); -int _lookupscore(u8int *score, int type, IAddr *ia, int *rac); -int lookupscore(u8int *score, int type, IAddr *ia, int *rac); +int lookupscore(u8int *score, int type, IAddr *ia); int maparenas(AMap *am, Arena **arenas, int n, char *what); void markbloomfilter(Bloom*, u8int*); uint msec(void); int namecmp(char *s, char *t); void namecp(char *dst, char *src); int nameok(char *name); +void needmainindex(void); void needzeroscore(void); Arena *newarena(Part *part, u32int, char *name, u64int base, u64int size, u32int blocksize); ArenaPart *newarenapart(Part *part, u32int blocksize, u32int tabsize); @@ -152,7 +156,6 @@ int scorecmp(u8int *, u8int *); void scoremem(u8int *score, u8int *buf, int size); void setatailstate(AState*); -void setdcachestate(AState*); void seterr(int severity, char *fmt, ...); void setstat(int, long); void settrace(char *type); @@ -166,9 +169,8 @@ int stru32int(char *s, u32int *r); int stru64int(char *s, u64int *r); void sumarena(Arena *arena); -int syncarena(Arena *arena, u64int start, u32int n, int zok, int fix); -int syncarenaindex(Index *ix, Arena *arena, u32int clump, u64int a, int fix, int *pflush, int check); -int syncindex(Index *ix, int fix, int mustflushicache, int check); +int syncarena(Arena *arena, u32int n, int zok, int fix); +int syncindex(Index *ix); void trace(char *type, char*, ...); void traceinit(void); int u64log2(u64int v); @@ -197,12 +199,12 @@ int wbisect(ISect *is); int wbindex(Index *ix); int whackblock(u8int *dst, u8int *src, int ssize); -u64int writeaclump(Arena *a, Clump *c, u8int *clbuf, u64int, u64int*); +u64int writeaclump(Arena *a, Clump *c, u8int *clbuf); u32int writearena(Arena *arena, u64int aa, u8int *clbuf, u32int n); int writebloom(Bloom*); int writeclumpinfo(Arena *arean, int clump, ClumpInfo *ci); int writepng(Hio*, Memimage*); -u64int writeiclump(Index *ix, Clump *c, u8int *clbuf, u64int*); +u64int writeiclump(Index *ix, Clump *c, u8int *clbuf); int writelump(Packet *p, u8int *score, int type, u32int creator, uint ms); int writepart(Part *part, u64int addr, u8int *buf, u32int n); int writeqlump(Lump *u, Packet *p, int creator, uint ms); --- /sys/src/cmd/venti/srv/syncindex.c Mon Oct 1 03:36:05 2007 +++ /sys/src/cmd/venti/srv/syncindex.c Mon Oct 1 03:36:04 2007 @@ -6,7 +6,7 @@ void usage(void) { - fprint(2, "usage: syncindex [-fv] [-B blockcachesize] config\n"); + fprint(2, "usage: syncindex [-v] [-B blockcachesize] config\n"); threadexitsall("usage"); } @@ -16,9 +16,7 @@ threadmain(int argc, char *argv[]) { u32int bcmem, icmem; - int fix; - fix = 0; bcmem = 0; icmem = 0; ARGBEGIN{ @@ -28,9 +26,6 @@ case 'I': icmem = unittoull(EARGF(usage())); break; - case 'f': - fix++; - break; case 'v': verbose++; break; @@ -39,9 +34,6 @@ break; }ARGEND - if(!fix) - readonly = 1; - if(argc != 1) usage(); @@ -56,21 +48,17 @@ if(0) fprint(2, "initialize %d bytes of disk block cache\n", bcmem); initdcache(bcmem); initlumpcache(1*1024*1024, 1024/8); - icmem = u64log2(icmem / (sizeof(IEntry)+sizeof(IEntry*)) / ICacheDepth); - if(icmem < 4) - icmem = 4; - if(1) fprint(2, "initialize %d bytes of index cache for %d index entries\n", - (sizeof(IEntry)+sizeof(IEntry*)) * (1 << icmem) * ICacheDepth, - (1 << icmem) * ICacheDepth); - initicache(icmem, ICacheDepth); + initicache(icmem); initicachewrite(); if(mainindex->bloom) startbloomproc(mainindex->bloom); if(verbose) printindex(2, mainindex); - if(syncindex(mainindex, fix, 1, 0) < 0) + if(syncindex(mainindex) < 0) sysfatal("failed to sync index=%s: %r\n", mainindex->name); + flushicache(); + flushdcache(); threadexitsall(0); } --- /sys/src/cmd/venti/srv/syncindex0.c Mon Oct 1 03:36:05 2007 +++ /sys/src/cmd/venti/srv/syncindex0.c Mon Oct 1 03:36:05 2007 @@ -2,184 +2,92 @@ #include "dat.h" #include "fns.h" -enum +static int +syncarenaindex(Arena *arena, u64int a0) { - ClumpChunks = 32*1024 -}; - -static int missing, wrong; - -/* - * shell sort is plenty good enough - * because we're going to do a bunch of disk i/o's - */ -static void -sortclumpinfo(ClumpInfo *ci, int *s, int n) -{ - int i, j, m, t; - - for(m = (n + 3) / 5; m > 0; m = (m + 1) / 3){ - for(i = n - m; i-- > 0;){ - for(j = i + m; j < n; j += m){ - if(memcmp(ci[s[j - m]].score, ci[s[j]].score, VtScoreSize) <= 0) - break; - t = s[j]; - s[j] = s[j - m]; - s[j - m] = t; - } - } - } -} - -int -syncarenaindex(Index *ix, Arena *arena, u32int clump, u64int a, int fix, int *pflush, int check) -{ - Packet *pack; - IEntry ie; + int ok; + u32int clump; + u64int a; + ClumpInfo ci; IAddr ia; - ClumpInfo *ci, *cis; - u64int *addrs; - int i, n, ok, *s, flush; - - trace(TraceProc, "syncarenaindex enter"); - - flush = 0; - cis = MKN(ClumpInfo, ClumpChunks); - addrs = MKN(u64int, ClumpChunks); - s = MKN(int, ClumpChunks); + AState as; + + if(arena->diskstats.clumps == arena->memstats.clumps) + return 0; + + memset(&as, 0, sizeof as); + as.arena = arena; + as.stats = arena->diskstats; + ok = 0; - for(; clump < arena->memstats.clumps; clump += n){ - n = ClumpChunks; - if(n > arena->memstats.clumps - clump) - n = arena->memstats.clumps - clump; - n = readclumpinfos(arena, clump, cis, n); - if(n <= 0){ - fprint(2, "arena directory read failed\n"); + a = a0 + arena->diskstats.used; + for(clump=arena->diskstats.clumps; clump < arena->memstats.clumps; clump++){ + if(readclumpinfo(arena, clump, &ci) < 0){ + fprint(2, "%s: clump %d: cannot read clumpinfo\n", + arena->name, clump); ok = -1; break; } - for(i = 0; i < n; i++){ - addrs[i] = a; - a += cis[i].size + ClumpSize; - s[i] = i; - } - - sortclumpinfo(cis, s, n); - - for(i = 0; i < n; i++){ - ci = &cis[s[i]]; - ia.type = ci->type; - ia.size = ci->uncsize; - ia.addr = addrs[s[i]]; - ia.blocks = (ci->size + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog; - - if(!check) - goto Add; - if(loadientry(ix, ci->score, ci->type, &ie) < 0){ - trace(TraceProc, "syncarenaindex missing block %V.%d", ci->score, ci->type); - missing++; - if(0) fprint(2, "missing block type=%d score=%V\n", ci->type, ci->score); - }else if(iaddrcmp(&ia, &ie.ia) != 0){ - trace(TraceProc, "syncarenaindex mismatched entry"); - fprint(2, "\nmismatched index entry and clump at %d\n", clump + i); - fprint(2, "\tclump: type=%d size=%d blocks=%d addr=%lld\n", ia.type, ia.size, ia.blocks, ia.addr); - fprint(2, "\tindex: type=%d size=%d block=%d addr=%lld\n", ie.ia.type, ie.ia.size, ie.ia.blocks, ie.ia.addr); - pack = readlump(ie.score, ie.ia.type, ie.ia.size, nil); - packetfree(pack); - if(pack != nil){ - fprint(2, "duplicated lump\n"); - continue; - } - wrong++; - }else - continue; - Add: - if(!fix){ - ok = -1; - continue; - } - flush = 1; - trace(TraceProc, "syncarenaindex insert %V", ci->score); - insertscore(ci->score, &ia, 1); - } - - if(0 && clump / 1000 != (clump + n) / 1000) - fprint(2, "."); - } - free(cis); - free(addrs); - free(s); - if(flush){ - flushdcache(); - *pflush = 1; + ia.type = ci.type; + ia.size = ci.uncsize; + ia.addr = a; + ia.blocks = (ClumpSize + ci.size + (1 << ABlockLog) - 1) >> ABlockLog; + a += ClumpSize + ci.size; + + as.stats.used += ClumpSize + ci.size; + as.stats.uncsize += ia.size; + as.stats.clumps++; + if(ci.uncsize > ci.size) + as.stats.cclumps++; + as.aa = a; + insertscore(ci.score, &ia, IEDirty, &as); } + flushdcache(); return ok; } int -syncindex(Index *ix, int fix, int mustflush, int check) +syncindex(Index *ix) { Arena *arena; - AState as; - u64int a; - int i, e, e1, ok, ok1, flush; + int i, e, e1, ok; ok = 0; - flush = 0; for(i = 0; i < ix->narenas; i++){ trace(TraceProc, "syncindex start %d", i); arena = ix->arenas[i]; - /* - * Syncarena will scan through the arena looking for blocks - * that have been forgotten. It will update arena->memstats.used, - * so save the currenct copy as the place to start the - * syncarenaindex scan. - */ - a = arena->memstats.used; - e = syncarena(arena, ix->amap[i].start, TWID32, fix, fix); + e = syncarena(arena, TWID32, 1, 1); e1 = e; - if(fix) - e1 &= ~(SyncHeader|SyncCIZero|SyncCIErr); - if(e1 == SyncHeader) + e1 &= ~(SyncHeader|SyncCIZero|SyncCIErr); + if(e & SyncHeader) fprint(2, "arena %s: header is out-of-date\n", arena->name); - if(e1) + if(e1){ + fprint(2, "arena %s: %x\n", arena->name, e1); ok = -1; - else{ - /* - * use diskstats not memstats here, because diskstats - * is what has been indexed; memstats is what has - * made it to disk (confusing names). - */ - ok1 = syncarenaindex(ix, arena, - arena->diskstats.clumps, - ix->amap[i].start + arena->diskstats.used, - fix, &flush, check); - if(ok1 < 0) - fprint(2, "syncarenaindex: %r\n"); - if(fix && ok1==0 && (e & SyncHeader) && wbarena(arena) < 0) - fprint(2, "arena=%s header write failed: %r\n", arena->name); - ok |= ok1; - - as.arena = arena; - as.aa = ix->amap[i].start + arena->memstats.used; - as.stats = arena->memstats; - setdcachestate(&as); + continue; + } + flushdcache(); + + if(arena->memstats.clumps == arena->diskstats.clumps) + continue; + + fprint(2, "%T %s: indexing %d clumps...\n", + arena->name, + arena->memstats.clumps - arena->diskstats.clumps); + + if(syncarenaindex(arena, ix->amap[i].start) < 0){ + fprint(2, "arena %s: syncarenaindex: %r\n", arena->name); + ok = -1; + continue; + } + if(wbarena(arena) < 0){ + fprint(2, "arena %s: wbarena: %r\n", arena->name); + ok = -1; + continue; } - } - if(missing || wrong) - fprint(2, "syncindex: %d missing entries, %d wrong entries (flush=%d)\n", missing, wrong, flush); - if(fix && wbindex(ix) < 0){ - fprint(2, "can't write back index header for %s: %r\n", ix->name); - return -1; - } - if(fix && flush){ flushdcache(); - if(mustflush){ - flushicache(); - flushdcache(); - }else - kickicache(); + delaykickicache(); } return ok; } --- /sys/src/cmd/venti/srv/icachewrite.c Mon Oct 1 03:36:06 2007 +++ /sys/src/cmd/venti/srv/icachewrite.c Mon Oct 1 03:36:05 2007 @@ -12,7 +12,7 @@ static IEntry *iesort(IEntry*); int icachesleeptime = 1000; /* milliseconds */ -int minicachesleeptime = 50; +int minicachesleeptime = 0; enum { @@ -85,7 +85,7 @@ static int icachewritesect(Index *ix, ISect *is, u8int *buf) { - int err, h, bsize, t; + int err, i, werr, h, bsize, t; u32int lo, hi; u64int addr, naddr; uint nbuf, off; @@ -115,7 +115,8 @@ } if(t < minicachesleeptime) t = minicachesleeptime; - sleep(t); + if(t > 0) + sleep(t); trace(TraceProc, "icachewritesect nextchunk"); chunk = nextchunk(ix, is, &iedirty, &addr, &nbuf); @@ -169,33 +170,29 @@ break; } packibucket(&ib, buf+off, is->bucketmagic); - /* - * XXX This is not quite right - it's good that we - * update the cached block (if any) here, but - * since the block doesn't get written until writepart - * below, we also need to make sure that the cache - * doesn't load the stale block before we write it to - * disk below. We could lock the disk cache during - * the writepart, but that's pretty annoying. - * Another possibility would be never to cache - * index partition blocks. The hit rate on those is - * miniscule anyway. - */ - if((b = _getdblock(is->part, naddr, ORDWR, 0)) != nil){ - memmove(b->data, buf+off, bsize); - putdblock(b); - } } diskaccess(1); trace(TraceProc, "icachewritesect writepart", addr, nbuf); - if(writepart(is->part, addr, buf, nbuf) < 0 || - flushpart(is->part) < 0){ + werr = 0; + if(writepart(is->part, addr, buf, nbuf) < 0 || flushpart(is->part) < 0) + werr = -1; + + for(i=0; i part, addr+i, ORDWR, 0)) != nil){ + memmove(b->data, buf+i, bsize); + putdblock(b); + } + } + + if(werr < 0){ fprint(2, "%s: part %s addr 0x%llux: icachewritesect " "writepart: %r\n", argv0, is->part->name, addr); + err = -1; continue; } + addstat(StatIsectWriteBytes, nbuf); addstat(StatIsectWrite, 1); icacheclean(chunk); @@ -245,18 +242,20 @@ threadsetname("icachewritecoord"); ix = mainindex; - iwrite.as = diskstate(); + iwrite.as = icachestate(); for(;;){ trace(TraceProc, "icachewritecoord sleep"); waitforkick(&iwrite.round); trace(TraceWork, "start"); - as = diskstate(); + as = icachestate(); if(as.arena==iwrite.as.arena && as.aa==iwrite.as.aa){ /* will not be able to do anything more than last flush - kick disk */ + fprint(2, "icache: nothing to do - kick dcache\n"); trace(TraceProc, "icachewritecoord kick dcache"); kickdcache(); trace(TraceProc, "icachewritecoord kicked dcache"); + goto SkipWork; /* won't do anything; don't bother rewriting bloom filter */ } iwrite.as = as; @@ -274,9 +273,11 @@ err |= recvul(ix->bloom->writedonechan); trace(TraceProc, "icachewritecoord donewrite err=%d", err); - if(err == 0) + if(err == 0){ setatailstate(&iwrite.as); + } } + SkipWork: icacheclean(nil); /* wake up anyone waiting */ trace(TraceWork, "finish"); addstat(StatIcacheFlush, 1);