# HG changeset patch # User Francisco J Ballesteros # Date 1328893894 0 # Node ID fb5824366f81c8551b375f7c8cebd392718ed1df # Parent 257203b800f6651da9c0a62c8d3bf2d50194f682 Cfs: experimental, initial version for the creepy file system. Still ongoing. Only fsfmt is included, actually. But most of the machinery of the file system is in there, for others to look. Archival into venti is provided by an external program not included, as is access to the external archive. That's because Cfs keeps the snaps in disk always frozen, and guarantees that at least one is there. All the activity happens in memory, with unix semantics for file access. Linkage to 9p or ix is not included either. R=nixiedev, john, charles.forsyth CC=nix-dev http://codereview.appspot.com/5616074 diff -r 257203b800f6 -r fb5824366f81 sys/src/cmd/Cfs/attr.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/src/cmd/Cfs/attr.c Fri Feb 10 17:11:34 2012 +0000 @@ -0,0 +1,213 @@ +#include +#include +#include +#include +#include +#include + +#include "conf.h" +#include "dbg.h" +#include "dk.h" +#include "fns.h" + +/* + * Attribute handling + */ + +typedef struct Adef Adef; + +struct Adef +{ + char* name; + long (*wattr)(Fsys*, Memblk*, void*, long); + long (*rattr)(Fsys*, Memblk*, void*, long); +}; + +static long wname(Fsys*, Memblk*, void*, long); +static long rname(Fsys*, Memblk*, void*, long); + +static Adef adef[] = +{ + {"name", wname, rname}, +}; + +ulong +embedattrsz(Memblk *f) +{ + ulong sz; + + sz = f->d.asize; + if(sz < Dminattrsz) + sz = Dminattrsz; + if(sz > Embedsz || Embedsz - sz < sizeof(Dentry)) + sz = Embedsz; + return sz; +} + +void +gmeta(Fmeta *meta, void *buf, ulong nbuf) +{ + Dmeta *d; + char *p, *x; + int i; + + if(nbuf < sizeof *d) + error("metadata buffer too small"); + d = buf; + meta->id = d->id; + meta->mode = d->mode; + meta->mtime = d->mtime; + meta->length = d->length; + + if(d->ssz[FMuid] + sizeof *d > nbuf || + d->ssz[FMgid] + sizeof *d > nbuf || + d->ssz[FMmuid] + sizeof 
*d > nbuf ||
+	   d->ssz[FMname] + sizeof *d > nbuf)
+		error("corrupt meta: wrong string size");
+
+	p = (char*)(&d[1]);
+	x = p;	/* x walks the strings packed after the Dmeta header; none may be empty or run past buf */
+	for(i = 0; i < nelem(d->ssz); i++){
+		if(d->ssz[i] == 0 || d->ssz[i] > (char*)buf + nbuf - x || x[d->ssz[i]-1] != 0)
+			error("corrupt meta: unterminated string");
+		x += d->ssz[i];
+	}
+
+	meta->uid = p;
+	p += d->ssz[FMuid];
+	meta->gid = p;
+	p += d->ssz[FMgid];
+	meta->muid = p;
+	p += d->ssz[FMmuid];
+	meta->name = p;
+}
+
+static ulong
+metasize(Fmeta *meta)
+{
+	ulong n;
+
+	n = sizeof(Dmeta);	/* fixed header, then the four strings with their \0 */
+	n += strlen(meta->uid) + 1;
+	n += strlen(meta->gid) + 1;
+	n += strlen(meta->muid) + 1;
+	n += strlen(meta->name) + 1;
+	/*
+	 * BUG: meta->attr
+	 */
+	return n;
+}
+
+/*
+ * Pack the metadata into buf.
+ * pointers in meta are changed to refer to the packed data.
+ * Returns the size of the packed metadata, as computed by metasize().
+ * The caller must ensure that the metadata fits in buf; it is fatal otherwise.
+ */
+ulong
+pmeta(void *buf, ulong nbuf, Fmeta *meta)
+{
+	Dmeta *d;
+	char *p, *bufp;
+	ulong sz;
+
+	sz = metasize(meta);
+	if(sz > nbuf){
+		sysfatal("bug: allocate and use ablk");
+		error("attributes are too long");
+	}
+	d = buf;
+	bufp = buf;
+	d->id = meta->id;
+	d->mode = meta->mode;
+	d->mtime = meta->mtime;
+	d->length = meta->length;
+
+	p = (char*)(&d[1]);
+	d->ssz[FMuid] = strlen(meta->uid) + 1;
+	strcpy(p, meta->uid);
+	meta->uid = p;
+	p += d->ssz[FMuid];
+
+	d->ssz[FMgid] = strlen(meta->gid) + 1;
+	strcpy(p, meta->gid);
+	meta->gid = p;
+	p += d->ssz[FMgid];
+
+	d->ssz[FMmuid] = strlen(meta->muid) + 1;
+	strcpy(p, meta->muid);
+	meta->muid = p;
+	p += d->ssz[FMmuid];
+
+	d->ssz[FMname] = strlen(meta->name) + 1;
+	strcpy(p, meta->name);
+	meta->name = p;
+	p += d->ssz[FMname];
+
+	assert(p - bufp <= sz);	/* can be <, to leave room for growing */
+	return sz;
+}
+
+static long
+wname(Fsys *, Memblk *f, void *buf, long len)
+{
+	char *p, *old;
+	ulong maxsz;
+
+	p = buf;
+	if(len < 1 || p[len-1] != 0)
+		error("name must end in \\0");
+	old = f->mf->name;	/* kept to undo the change if the new name does not fit */
+	f->mf->name = p;
+	maxsz = embedattrsz(f);
+	
if(metasize(f->mf) > maxsz){ + f->mf->name = old; + fprint(2, "%s: bug: no attribute block implemented\n", argv0); + error("no room to grow metadata"); + } + /* name goes last, we can pack in place */ + f->d.asize = pmeta(f->d.embed, maxsz, f->mf); + changed(f); + return len; +} + +static long +rname(Fsys *, Memblk *f, void *buf, long len) +{ + long l; + + l = strlen(f->mf->name) + 1; + if(l > len) + error("buffer too short"); + strcpy(buf, f->mf->name); + return l; +} + +long +dfwattr(Fsys *fs, Memblk *f, char *name, void *val, long nval) +{ + int i; + + isfile(f); + iswlocked(f); + for(i = 0; i < nelem(adef); i++) + if(strcmp(adef[i].name, name) == 0) + return adef[i].wattr(fs, f, val, nval); + error("user defined attributes not yet implemented"); + return -1; +} + +long +dfrattr(Fsys *fs, Memblk *f, char *name, void *val, long count) +{ + int i; + + isfile(f); + isrlocked(f); + for(i = 0; i < nelem(adef); i++) + if(strcmp(adef[i].name, name) == 0) + return adef[i].rattr(fs, f, val, count); + error("user defined attributes not yet implemented"); + return -1; +} + diff -r 257203b800f6 -r fb5824366f81 sys/src/cmd/Cfs/conf.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/src/cmd/Cfs/conf.h Fri Feb 10 17:11:34 2012 +0000 @@ -0,0 +1,59 @@ + + +#define TESTING + +enum +{ + KiB = 1024UL, + MiB = KiB * 1024UL, + GiB = MiB * 1024UL, + +#ifdef TESTING + Incr = 2, + Fmemsz = 1*MiB, /* max size of in-memory file data */ + Fsysmem = 1*MiB, /* size of fsys data in memory */ + Dminfree = 100000, /* min nb. of free blocks in disk */ + + /* disk parameters; don't change */ + Dblksz = 1*KiB, /* disk block size */ + Dblkhdrsz = 2*BIT64SZ, + Ndptr = 2, /* # of direct data pointers */ + Niptr = 2, /* # of indirect data pointers */ + +#else + Incr = 16, + Fmemsz = 64 * MiB, /* max size of in-memory file data */ + Fsysmem = 2*GiB, /* size of fsys data in memory */ + Dminfree = 1000, /* min nb. 
of free blocks in disk */ + + /* disk parameters; don't change */ + Dblksz = 16*KiB, /* disk block size */ + Dblkhdrsz = 2*BIT64SZ, + Ndptr = 8, /* # of direct data pointers */ + Niptr = 4, /* # of indirect data pointers */ + +#endif + + Dminattrsz = Dblksz/2, /* min size for attributes */ + + /* + * The format of the disk is: + * blk 0: unused + * blk 1: super + * Nblkgrpsz blocks (1st is ref, Nblkgrpsz-1 are data) + * ... + * Nblkgrpsz blocks (1st is ref, Nblkgrpsz-1 are data) + * + */ + Nblkgrpsz = (Dblksz - Dblkhdrsz) / BIT64SZ, + Dblk0addr = 2*Dblksz, + + /* + * Caution: Errstack also limits the max tree depth, + * because of recursive routines (in the worst case). + */ + Stack = 32*KiB, /* stack size for threads */ + Errstack = 512, /* max # of nested error labels */ + Fhashsz = 7919, /* size of file hash (plan9 has 35454 files). */ + +}; diff -r 257203b800f6 -r fb5824366f81 sys/src/cmd/Cfs/dbg.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/src/cmd/Cfs/dbg.c Fri Feb 10 17:11:34 2012 +0000 @@ -0,0 +1,3 @@ + +char dbg[256]; + diff -r 257203b800f6 -r fb5824366f81 sys/src/cmd/Cfs/dbg.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/src/cmd/Cfs/dbg.h Fri Feb 10 17:11:34 2012 +0000 @@ -0,0 +1,6 @@ +/* + * 'd': general debug + * 'D': disk + */ +#define dDprint if(!dbg['D']){}else print +extern char dbg[]; diff -r 257203b800f6 -r fb5824366f81 sys/src/cmd/Cfs/dblk.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/src/cmd/Cfs/dblk.c Fri Feb 10 17:11:34 2012 +0000 @@ -0,0 +1,474 @@ +#include +#include +#include +#include +#include +#include + +#include "conf.h" +#include "dbg.h" +#include "dk.h" +#include "fns.h" + +/* + * disk blocks. 
+ * see dk.h
+ */
+
+void
+dbclear(Fsys *fs, u64int addr, int type)
+{
+	static Diskblk d;	/* only tag and epoch are ever set; shared, so lk serializes its use */
+	static QLock lk;
+
+	dDprint("dbclear d%#ullx\n", addr);
+	qlock(&lk);
+	d.tag = TAG(addr, type);
+	d.epoch = now();
+	if(pwrite(fs->fd, &d, sizeof d, addr) != Dblksz){
+		qunlock(&lk);	/* release lk before raising; locking it again would block forever */
+		error("dbclear: %r");
+	}
+	qunlock(&lk);
+}
+
+void
+meltedref(Fsys *fs, Memblk *rb)
+{
+	if(rb->frozen && rb->dirty){	/* frozen and dirty: write it in place before melting */
+		if(catcherror())
+			sysfatal("writing ref: %r");
+		dbwrite(fs, rb);
+		noerror();
+	}
+	rb->frozen = rb->dirty = 0;
+}
+
+/*
+ * BUG: the free list of blocks using entries in the ref blocks
+ * shouldn't span all those blocks as it does now. To prevent
+ * massive losses of free blocks each DBref block must keep its own
+ * little free list, and all blocks with free entries must be linked
+ * in the global list.
+ * This keeps locality and makes it less likely that a failure in the
+ * middle of the sync for the free list destroys the entire list.
+ */
+
+u64int
+newblkaddr(Fsys *fs)
+{
+	u64int addr, naddr;
+
+	qlock(fs);
+
+	/*
+	 * Caution: can't acquire new locks while holding blklk
+	 * only dbgetref may raise an error, but we don't hold the
+	 * lock while calling it.
+	 */
+Again:
+	if(fs->super == nil)
+		addr = Dblksz;
+	else if(fs->super->d.eaddr < fs->limit){
+		addr = fs->super->d.eaddr;
+		fs->super->d.eaddr += Dblksz;
+		changed(fs->super);
+	}else if(fs->super->d.free != 0){
+		addr = fs->super->d.free;
+
+		qunlock(fs);
+		naddr = dbgetref(fs, addr);	/* acquires locks */
+		qlock(fs);
+		if(addr != fs->super->d.free){
+			/* had a race */
+			goto Again;
+		}
+		fs->super->d.free = naddr;
+		fs->super->d.nfree -= 1;
+		changed(fs->super);
+		goto found;
+	}else{
+		qunlock(fs);	/* never raise an error with the fs lock held */
+		error("disk is full");	/* backward compatible with fossil */
+	}
+	/*
+	 * ref blocks are allocated and initialized on demand,
+	 * and they must be zeroed before used.
+	 * do this holding the lock so others find everything
+	 * initialized.
+ */
+	if(((addr-Dblk0addr)/Dblksz)%Nblkgrpsz == 0){
+		if(catcherror()){
+			qunlock(fs);
+			error(nil);
+		}
+		dbclear(fs, addr-Dblksz, DBref);	/* fs initialization; NOTE(review): per refaddr() the ref slot is addr itself, confirm addr-Dblksz */
+		noerror();
+		addr += Dblksz;
+		fs->super->d.eaddr += Dblksz;
+	}
+found:
+	qunlock(fs);
+	okaddr(fs, addr);
+
+	dDprint("newblkaddr = d%#ullx\n", addr);
+	return addr;
+}
+
+u64int
+addrofref(u64int refaddr, int idx)
+{
+	u64int bno;
+
+	bno = (refaddr - Dblk0addr)/Dblksz;	/* block # of the ref block, already a multiple of Nblkgrpsz */
+	bno += idx;	/* entries count blocks starting at the ref block itself */
+	/* this is the inverse of refaddr(): bno must not be scaled by Nblkgrpsz again */
+
+	return Dblk0addr + bno*Dblksz;
+}
+
+u64int
+refaddr(u64int addr, int *idx)
+{
+	u64int bno, refaddr;
+
+	addr -= Dblk0addr;
+	bno = addr/Dblksz;
+	*idx = bno%Nblkgrpsz;	/* position of the block within its group */
+	refaddr = Dblk0addr + bno/Nblkgrpsz * Nblkgrpsz * Dblksz;	/* first block of the group */
+	dDprint("refaddr d%#ullx = d%#ullx[%d]\n",
+		Dblk0addr + addr, refaddr, *idx);
+	return refaddr;
+}
+
+/*
+ * db*ref() functions update the on-disk reference counters.
+ * memory blocks use Memblk.Ref instead.
+ */
+
+u64int
+dbgetref(Fsys *fs, u64int addr)
+{
+	Memblk *rb;
+	u64int raddr, ref;
+	int i;
+
+	if(addr == 0)
+		return 0;
+
+	if(addr == Noaddr )	/* root; 1 ref by the face */
+		return 1;
+
+	raddr = refaddr(addr, &i);
+	rb = dbget(fs, DBref, raddr);
+	qlock(fs);
+	meltedref(fs, rb);
+	ref = rb->d.ref[i];
+	qunlock(fs);
+	mbput(fs, rb);
+	return ref;
+}
+
+void
+dbsetref(Fsys *fs, u64int addr, u64int ref)
+{
+	Memblk *rb;
+	u64int raddr;
+	int i;
+
+	dDprint("dbsetref %#ullx = %ulld\n", addr, ref);
+	if(addr < Dblk0addr)
+		sysfatal("dbsetref");
+	raddr = refaddr(addr, &i);
+	rb = dbget(fs, DBref, raddr);
+	qlock(fs);
+	meltedref(fs, rb);
+	rb->d.ref[i] = ref;
+	changed(rb);
+	qunlock(fs);
+	mbput(fs, rb);
+}
+
+void
+dbincref(Fsys *fs, u64int addr)
+{
+	Memblk *rb;
+	u64int raddr;
+	int i;
+
+	dDprint("dbincref %#ullx\n", addr);
+	raddr = refaddr(addr, &i);
+	rb = dbget(fs, DBref, raddr);
+	qlock(fs);
+	meltedref(fs, rb);
+	rb->d.ref[i]++;
+	changed(rb);
+	qunlock(fs);
+	mbput(fs, rb);
+}
+
+u64int
+dbdecref(Fsys *fs, u64int addr)
+{
+	Memblk *rb;
+	u64int 
raddr, ref;
+	int i;
+
+	if(addr < Dblk0addr)
+		sysfatal("dbdecref");
+	dDprint("dbdecref %#ullx\n", addr);
+	raddr = refaddr(addr, &i);
+	rb = dbget(fs, DBref, raddr);
+	qlock(fs);	/* disk refs and the free list are protected by the fs lock */
+	meltedref(fs, rb);	/* under the lock, as dbgetref, dbsetref and dbincref do */
+	rb->d.ref[i]--;
+	ref = rb->d.ref[i];
+	changed(rb);
+	if(ref == 0){	/* last ref: push addr on the free list; its ref entry links to the next free block */
+		rb->d.ref[i] = fs->super->d.free;
+		fs->super->d.free = addr;
+		fs->super->d.nfree += 1;
+		changed(fs->super);
+		changed(rb);
+	}
+	qunlock(fs);
+	mbput(fs, rb);
+	return ref;
+}
+
+static Mfile*
+mfalloc(Fsys *fs)
+{
+	Mfile *mf;
+
+	qlock(fs);
+	mf = fs->mfree;	/* reuse one from the free list if there is any */
+	if(mf != nil){
+		fs->mfree = mf->next;
+		mf->next = nil;
+	}
+	qunlock(fs);
+	if(mf == nil)
+		mf = mallocz(sizeof *mf, 1);
+	return mf;
+}
+
+Memblk*
+dballoc(Fsys *fs, uint type)
+{
+	Memblk *b;
+	u64int addr;
+	int root;
+
+	root = (type == Noaddr);	/* Noaddr as the type asks for the in-memory root file */
+	addr = Noaddr;
+	if(root)
+		type = DBfile;
+	else
+		addr = newblkaddr(fs);
+	dDprint("dballoc DB%s\n", tname[type]);
+	b = mballoc(fs, addr);
+	b->d.tag = TAG(b->addr,type);
+	changed(b);
+	if(catcherror()){
+		mbput(fs, b);
+		error(nil);
+	}
+	if(addr != Noaddr && addr >= Dblk0addr)
+		dbsetref(fs, addr, 1);
+	if(type == DBfile)
+		b->mf = mfalloc(fs);
+	mbhash(fs, b);
+	dDprint("dballoc DB%s -> %H\n", tname[type], b);
+	noerror();
+	return b;
+}
+
+/*
+ * BUG: these should ensure that all integers are converted between
+ * little endian (disk format) and the machine endianness.
+ * We know the format of all blocks and the type of all file
+ * attributes. Those are the integers to convert to fix the bug.
+ */ +Memblk* +hosttodisk(Memblk *b) +{ + incref(b); + if(!TAGADDROK(b->d.tag, b->addr)) + sysfatal("hosttodisk: bad tag"); + return b; +} +void +disktohost(Memblk *b) +{ + static union + { + u64int i; + uchar m[BIT64SZ]; + } u; + + u.i = 0x1122334455667788ULL; + if(u.m[0] != 0x88) + sysfatal("fix hosttodisk/disktohost for big endian"); + + if(!TAGADDROK(b->d.tag, b->addr)) + error("disktohost: bad tag"); +} + +long +dbwrite(Fsys *fs, Memblk *b) +{ + Memblk *nb; + + dDprint("dbwrite %H",b); + + nb = hosttodisk(b); + nb->d.epoch = now(); + if(pwrite(fs->fd, &nb->d, sizeof nb->d, nb->addr) != Dblksz){ + mbput(fs, nb); + error("dbwrite: %r"); + } + mbput(fs, nb); + return Dblksz; +} + +long +dbread(Fsys *fs, Memblk *b) +{ + long tot, nr; + uchar *p; + + dDprint("dbread m%#p d%#ullx\n", b, b->addr); + + p = b->d.ddata; + for(tot = 0; tot < Dblksz; tot += nr){ + nr = pread(fs->fd, p+tot, Dblksz-tot, b->addr + tot); + if(nr == 0) + error("eof on disk"); + if(nr <= 0){ + error("dbread: %r"); + return -1; + } + } + assert(tot == sizeof b->d); + + disktohost(b); + if(TAGTYPE(b->d.tag) != DBref) + b->frozen = 1; + dDprint("dbread %H", b); + + return tot; +} + +/* + * Directories are fully loaded by dbget. + * Their data is never removed from memory unless the + * entire directory is removed from memory. + */ +Memblk* +dbget(Fsys *fs, uint type, u64int addr) +{ + Memblk *b; + u64int tag; + + dDprint("dbget d%#ullx\n",addr); + okaddr(fs, addr); + b = mbget(fs, addr); + if(b != nil) + return b; + + /* + * others might request the same block while we read it. + * the first one hashing it wins; no locks. 
+ */
+	tag = TAG(addr,type);
+	b = mballoc(fs, addr);
+	if(catcherror()){
+		mbput(fs, b);
+		error(nil);
+	}
+	dbread(fs, b);
+	if(b->d.tag != tag)
+		sysfatal("dbget: wrong tag");
+	if(type == DBfile){
+		assert(b->mf == nil);
+		b->mf = mfalloc(fs);
+		gmeta(b->mf, b->d.embed, Embedsz);
+		b->written = 1;
+	}
+	noerror();
+	b = mbhash(fs, b);
+	return b;
+}
+
+/*
+ * Duplicate b into a newly allocated block; caller responsible for locking.
+ */
+Memblk*
+dbdup(Fsys *fs, Memblk *b)
+{
+	Memblk *nb;
+	uint type;
+	int i;
+	Mfile *nm, *m;
+
+	type = TAGTYPE(b->d.tag);
+	nb = dballoc(fs, type);
+	switch(type){
+	case DBfree:
+	case DBref:
+	case DBsuper:
+	case DBattr:
+		sysfatal("dbdup: DB%s", tname[type]);
+	case DBdata:
+		memmove(nb->d.data, b->d.data, Dblkdatasz);
+		break;
+	case DBptr:
+		for(i = 0; i < Dptrperblk; i++){
+			nb->d.ptr[i] = b->d.ptr[i];
+			if(nb->d.ptr[i] != 0)
+				dbincref(fs, b->d.ptr[i]);
+		}
+		break;
+	case DBfile:
+		isrlocked(b);
+		isloaded(b);
+		nb->d.asize = b->d.asize;
+		nb->d.aptr = b->d.aptr;
+		if(nb->d.aptr != 0)
+			dbincref(fs, b->d.aptr);
+		for(i = 0; i < nelem(b->d.dptr); i++){
+			nb->d.dptr[i] = b->d.dptr[i];
+			if(nb->d.dptr[i] != 0)
+				dbincref(fs, b->d.dptr[i]);
+		}
+		for(i = 0; i < nelem(b->d.iptr); i++){
+			nb->d.iptr[i] = b->d.iptr[i];
+			if(nb->d.iptr[i] != 0)
+				dbincref(fs, b->d.iptr[i]);
+		}
+		memmove(nb->d.embed, b->d.embed, Embedsz);
+		nm = nb->mf;
+		m = b->mf;
+		gmeta(nm, nb->d.embed, Embedsz);
+		nm->nchild = m->nchild;	/* always: nm may be a recycled Mfile with a stale count */
+		if(m->nchild > 0){
+			if(nm->nachild < m->nchild){	/* grow the array only when it is too small */
+				nm->child = realloc(nm->child, m->nchild*sizeof(Child));
+				nm->nachild = m->nchild;
+			}
+			for(i = 0; i < nm->nchild; i++){
+				nm->child[i] = m->child[i];
+				nm->child[i].f->mf->parent = nb;
+				incref(nm->child[i].f);
+				incref(nm->child[i].b);
+			}
+		}
+		break;
+	default:
+		sysfatal("dbdup: type");
+	}
+	changed(nb);
+	return nb;
+}
+
diff -r 257203b800f6 -r fb5824366f81 sys/src/cmd/Cfs/dk.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/src/cmd/Cfs/dk.h	Fri Feb 10 17:11:34 2012 +0000
@@ -0,0 +1,352 @@
+typedef struct Fattr Fattr; +typedef struct Fmeta Fmeta; +typedef struct Child Child; +typedef struct Ddatablk Ddatablk; +typedef struct Dptrblk Dptrblk; +typedef struct Drefblk Drefblk; +typedef struct Dattrblk Dattrblk; +typedef struct Dfileblk Dfileblk; +typedef struct Dsuperblk Dsuperblk; +typedef union Diskblk Diskblk; +typedef struct Diskblkhdr Diskblkhdr; +typedef struct Memblk Memblk; +typedef struct Fsys Fsys; +typedef struct Dentry Dentry; +typedef struct Dmeta Dmeta; +typedef struct Blksl Blksl; +typedef struct Mfile Mfile; + +/* + * Conventions on the data structures: + * + * References: + * - Mem refs include the reference from the hash (to keep the block) + * plus from external structures/users. Thus: + * - those with ref=1 are just kept cached in the hash + * - those with ref=2 are referenced also by the tree + * (or the superblock; does not apply to DBref blocks) + * - those with ref >2 are in use + * - Disk refs count only references within the tree on disk. + * (perhaps loaded in memory waiting for a further sync) + * - Children do not imply new refs to the parents. + * Locking: + * + * Assumptions: + * - /active is *never* found on disk, it's memory-only. + * - b->addr is worm. + * - b->next is locked by the hash bucked lock + * - blocks added to the end of the hash chain. + * - blocks are locked by the file responsible for them, when not frozen. + * - super, disk refs, block allocation, free list, ... protected by fs lock + * - We try not to hold more than one lock, using the + * reference counters when we need to be sure that + * an unlocked resource does not vanish. + * - parents of blocks in memory are in memory + * - reference blocks are never removed from memory. + * - disk refs frozen while waiting to be to disk during a fs freeze. + * in which case db*ref functions write the block in place and melt it. + * - the block epoch number for a on-disk block is the time when it + * was written (thus it's archived "/" has a newer epoch). 
+ * Order: + * fs & super: while locked can't acquire fs or blocks. + * blocks: parent -> child; block -> ref block + */ + +enum +{ + /* block types */ + DBfree = 0, + DBdata, + DBptr, + DBref, + DBattr, + DBfile, + DBsuper, +}; + +/* + * ##### On disk structures. ##### + * + * All on-disk integer values are little endian. + * + * blk 0: unused + * blk 1: super + * ref blk + Nblkgrpsz blocks + * ... + * ref blk + Nblkgrpsz blocks + * + * The code assumes these structures are packed. + * Be careful if they are changed to make things easy for the + * compiler and keep them naturally aligned. + */ + +struct Ddatablk +{ + uchar data[1]; /* raw memory */ +}; + +struct Dptrblk +{ + u64int ptr[1]; /* array of block addresses */ +}; + +struct Drefblk +{ + u64int ref[1]; /* RC or next in free list */ +}; + +struct Dattrblk +{ + u64int next; /* next block used for attribute data */ + uchar attr[1]; /* raw attribute data */ +}; + +/* + * directory entry. contents of data blocks in directories. + * Each block stores only an integral number of Dentries, for simplicity. + */ +struct Dentry +{ + u64int file; /* file address or 0 when archived */ +}; + +/* + * The trailing part of the file block is used to store attributes + * and initial file data. + * At least Dminattrsz is reserved for attributes, at most + * all the remaining embedded space. + * Past the attributes, starts the file data. + * If more attribute space is needed, an attribute block is allocated. + * For huge attributes, it is suggested that a file is allocated and + * the attribute value refers to that file. + * The pointer in iptr[n] is an n-indirect data pointer. + * + * Directories are also files, but their data is simply an array of + * Dentries. 
+ */ +struct Dfileblk +{ + u64int asize; /* attribute size */ + u64int aptr; /* attribute block pointer */ + u64int dptr[Ndptr]; /* direct data pointers */ + u64int iptr[Niptr]; /* indirect data pointers */ + uchar embed[1]; /* embedded attrs and data */ +}; + +enum +{ + FMuid = 0, + FMgid, + FMmuid, + FMname, + FMnstr, +}; + +struct Dmeta +{ + u64int id; /* ctime, actually */ + u64int mode; + u64int mtime; + u64int length; + u16int ssz[FMnstr]; + /* uid\0gid\0muid\0name\0 */ +}; + +/* + * Superblock. + * The stored tree is: + * / + * active/ root of the current or active tree + * archive/ root of the archived tree + * + * ... + * / old root of active as of epoch#1 + * ... + * / old root of active as of epoch#n + */ +struct Dsuperblk +{ + u64int free; /* first free block on list */ + u64int eaddr; /* end of the assigned disk portion */ + u64int root[16]; /* address of /archive in disk */ + u64int nfree; /* # of blocks in free list */ + u64int dblksz; /* only for checking */ + u64int nblkgrpsz; /* only for checking */ + u64int dminattrsz; /* only for checking */ + uchar vac0[24]; /* score for last venti archive + 4pad */ + uchar vac1[24]; /* score for previous venti archive + 4pad */ +}; + +enum +{ + Noaddr = ~0UL +}; + +#define TAG(addr,type) ((addr)<<8|((type)&0x7F)) +#define TAGTYPE(t) ((t)&0x7F) +#define TAGADDROK(t,addr) (((t)&~0xFF) == ((addr)<<8)) + +/* + * disk block + */ +struct Diskblkhdr +{ + u64int tag; /* block tag */ + u64int epoch; /* block epoch */ + /* ref is kept on Dref blocks */ +}; + +union Diskblk +{ + struct{ + Diskblkhdr; + union{ + Ddatablk; /* data block */ + Dptrblk; /* pointer block */ + Drefblk; /* reference counters block */ + Dattrblk; /* attribute block */ + Dfileblk; /* file block */ + Dsuperblk; + }; + }; + uchar ddata[Dblksz]; +}; + +enum +{ + Dblkdatasz = sizeof(Diskblk) - sizeof(Diskblkhdr), + Embedsz = Dblkdatasz - sizeof(Dfileblk), + Dentryperblk = Dblkdatasz / sizeof(Dentry), + Dptrperblk = Dblkdatasz / sizeof(u64int), + 
Drefperblk = Dblkdatasz / sizeof(u64int), +}; + +/* + * File attributes are name/value pairs. + * A few ones have the name implied by their position. + * All integer values are always kept LE. + * addr u64int + * mode u32int + * mtime u64int + * length u64int + * uid [n] + UTF8 + '\0' + * gid [n] + UTF8 + '\0' + * muid [n] + UTF8 + '\0' + * name [n] + UTF8 + '\0' + */ + +/* + * ##### On memory structures. ##### + */ + +/* + * unpacked file attributes point into the Bfile embedded data. + */ +struct Fattr +{ + Fattr *next; + char *name; + uchar *val; + long nval; +}; + +struct Child +{ + Memblk *f; /* actual child */ + Memblk *b; /* data block containing it's dentry */ + Dentry *d; /* loaded dentry */ +}; + +struct Fmeta +{ + Dmeta; + char *uid; + char *gid; + char *muid; + char *name; +}; + +struct Mfile +{ + RWLock; + Fmeta; + union{ + Memblk *parent; /* most recent parent */ + Mfile *next; /* in free Mfile list */ + }; + + Memblk* lastb; /* last returned data block */ + ulong lastbno; /* for the last asked block # */ + + Child *child; /* direct references to loaded children */ + int nchild; /* number of used children in child */ + int nachild; /* number of allocated chilren in child */ +}; + +/* + * memory block + */ +struct Memblk +{ + Ref; + u64int addr; /* block address */ + Memblk *next; /* in hash or free list */ + + /* for DBref only */ + union{ + Memblk *rnext; /* in list of ref blocks */ + Mfile *mf; /* per file mem info */ + }; + + int dirty; /* must be written */ + int frozen; /* is frozen */ + int written; /* no need to scan this for dirties */ + Diskblk d; +}; + +/* + * Slice into a block + */ +struct Blksl +{ + Memblk *b; + void *data; + long len; +}; + +struct Fsys +{ + QLock; + struct{ + RWLock; + Memblk *b; + } fhash[Fhashsz]; + + Memblk *blk; + usize nblk; + usize nablk; + usize nused; + usize nfree; + Memblk *free; + Mfile *mfree; + + Memblk *refs; + + usize limit; + Memblk *super; /* locked by blklk */ + Memblk *root; /* only in memory */ + 
Memblk *active; /* /active */ + Memblk *archive; /* /archive */ + + Memblk *fzsuper; /* frozen super */ + Memblk *fzactive; /* frozen active */ + Memblk *fzarchive; /* frozen archive */ + + char *dev; + int fd; +}; + +#pragma varargck type "H" Memblk* + + +extern char*tname[]; diff -r 257203b800f6 -r fb5824366f81 sys/src/cmd/Cfs/file.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/src/cmd/Cfs/file.c Fri Feb 10 17:11:34 2012 +0000 @@ -0,0 +1,329 @@ +#include +#include +#include +#include +#include +#include + +#include "conf.h" +#include "dbg.h" +#include "dk.h" +#include "fns.h" + +/* + * Interface to handle files. + * see dk.h + */ + +/* + * Ok if nelems is 0. + */ +Memblk* +walkpath(Fsys *fs, Memblk *f, char *elems[], int nelems) +{ + int i; + Memblk *nf; + + isfile(f); + rlock(f->mf); + if(f->mf->length > 0 && f->mf->child == nil){ + runlock(f->mf); + dfloaddir(fs, f, 0); + rlock(f->mf); + } + if(catcherror()){ + runlock(f->mf); + error(nil); + } + for(i = 0; i < nelems; i++){ + if((f->mf->mode&DMDIR) == 0) + error("not a directory"); + nf = dfwalk(fs, f, elems[i], 0); + runlock(f->mf); + f = nf; + USED(&f); /* in case of error() */ + } + noerror(); + incref(f); + runlock(f->mf); + return f; +} + +Memblk* +dfcreate(Fsys *fs, Memblk *parent, char *name, char *uid, ulong mode) +{ + Memblk *b; + Mfile *m; + + dDprint("dfcreate '%s' %M at %H", name, mode, parent); + if(parent != nil){ + isdir(parent); + wlock(parent->mf); + if(parent->frozen){ + wunlock(parent->mf); + parent = dfmelt(fs, parent); + }else + incref(parent); + b = dballoc(fs, DBfile); + }else + b = dballoc(fs, Noaddr); /* root */ + + if(catcherror()){ + wunlock(b->mf); + mbput(fs, b); + if(parent != nil){ + wunlock(parent->mf); + mbput(fs, parent); + } + error("create: %r"); + } + m = b->mf; + m->id = b->d.epoch; + m->mode = mode; + m->mtime = b->d.epoch; + m->length = 0; + m->uid = uid; + m->gid = uid; + m->muid = uid; + m->name = name; + b->d.asize = pmeta(b->d.embed, Embedsz, m); + + 
if(parent != nil){ + m->gid = parent->mf->uid; + dflink(fs, parent, b); + wunlock(parent->mf); + mbput(fs, parent); + } + noerror(); + changed(b); + dDprint("dfcreate-> %H\n", b); + return b; +} + +/* + * returns a slice into a block for reading. + */ +Blksl +dfread(Fsys *fs, Memblk *f, ulong len, uvlong off) +{ + Blksl sl; + + isfile(f); + rlock(f->mf); + if(catcherror()){ + runlock(f->mf); + error("read: %r"); + } + sl = dfslice(fs, f, len, off, 0); + noerror(); + runlock(f->mf); + return sl; +} + +/* + * returns a slice into a block for writing + * the block is returned unlocked. + */ +Blksl +dfwrite(Fsys *fs, Memblk *f, uvlong off) +{ + Blksl sl; + + isnotdir(f); + wlock(f->mf); + if(f->frozen){ + wunlock(f->mf); + f = dfmelt(fs, f); + }else + incref(f); + if(catcherror()){ + wunlock(f->mf); + error(nil); + } + sl = dfslice(fs, f, 0, off, 1); + noerror(); + wunlock(f->mf); + mbput(fs, f); + return sl; +} + +/* + * Freezing should not fail. + * If it does, we can't even freeze the tree to sync to disk, + * so there's not much to do. + * The caller with probably catch the error and sysfatal. + */ + + +/* + * freeze a direct or indirect pointer and everything below it. + */ +static void +ptrfreeze(Fsys *fs, u64int addr, int nind) +{ + int i; + Memblk *b; + + if(addr == 0) + return; + b = mbget(fs, addr); + if(b == nil) + return; /* on disk: frozen */ + if(!b->frozen){ + b->frozen = 1; + if(nind > 0) + for(i = 0; i < Dptrperblk; i++) + ptrfreeze(fs, b->d.ptr[i], nind-1); + } + mbput(fs, b); +} + +/* + * freeze a file. + * Do not recur if children is found frozen. 
+ */
+void
+dffreeze(Fsys *fs, Memblk *f)
+{
+	int i;
+	Memblk *b;
+
+	iswlocked(f);
+	isfile(f);
+	dDprint("dffrezee m%#p\n", f);
+	f->frozen = 1;
+	for(i = 0; i < nelem(f->d.dptr); i++)
+		ptrfreeze(fs, f->d.dptr[i], 0);
+	for(i = 0; i < nelem(f->d.iptr); i++)
+		ptrfreeze(fs, f->d.iptr[i], i+1);	/* the indirect pointers, not dptr again */
+	if((f->mf->mode&DMDIR) == 0)
+		return;
+	for(i = 0; i < f->mf->nchild; i++){
+		b = f->mf->child[i].f;
+		if(!b->frozen){	/* already frozen children are not walked again */
+			wlock(b->mf);
+			dffreeze(fs, b);
+			wunlock(b->mf);
+		}
+	}
+}
+
+/*
+ * write to disk a direct or indirect pointer and everything below it.
+ */
+static void
+ptrsync(Fsys *fs, u64int addr, int nind)
+{
+	int i;
+	Memblk *b;
+
+	if(addr == 0)
+		return;
+	b = mbget(fs, addr);
+	if(b == nil)
+		return;	/* on disk */
+	if(!b->frozen)
+		sysfatal("ptrsync: not frozen\n\t%H", b);
+	if(b->dirty)
+		dbwrite(fs, b);
+	b->dirty = 0;
+	if(nind > 0)
+		for(i = 0; i < Dptrperblk; i++)
+			ptrsync(fs, b->d.ptr[i], nind-1);
+	mbput(fs, b);	/* only now: b->d.ptr is read by the loop above */
+}
+
+/*
+ * Ensure all frozen but dirty blocks are in disk.
+ */
+void
+dfsync(Fsys *fs, Memblk *f)
+{
+	int i;
+
+	isfile(f);
+	if(f->written)
+		return;
+	if(!f->frozen)
+		sysfatal("dfsync: not frozen\n\t%H", f);
+
+	for(i = 0; i < nelem(f->d.dptr); i++)
+		ptrsync(fs, f->d.dptr[i], 0);
+	for(i = 0; i < nelem(f->d.iptr); i++)
+		ptrsync(fs, f->d.iptr[i], i+1);	/* the indirect pointers, not dptr again */
+	for(i = 0; i < f->mf->nchild; i++)
+		dfsync(fs, f->mf->child[i].f);
+
+	rlock(f->mf);
+	if(f->dirty)
+		dbwrite(fs, f);
+	f->dirty = 0;
+	f->written = 1;	/* later calls return early for this file */
+	runlock(f->mf);
+}
+
+/*
+ * release a direct or indirect pointer and everything below it.
+ */
+static int
+ptrreclaim(Fsys *fs, u64int addr, int nind)
+{
+	int i;
+	Memblk *b;
+
+	if(addr == 0)
+		return 0;
+	if(dbdecref(fs, addr) != 0)
+		return 0;
+	b = dbget(fs, nind == 0?
DBdata: DBptr, addr);
+	if(!b->frozen)
+		sysfatal("ptrreclaim: not frozen\n\t%H", b);
+	mbunhash(fs, b);
+	if(b->ref != 1)
+		sysfatal("dfreclaim: bug?");
+	if(nind > 0)
+		for(i = 0; i < Dptrperblk; i++)
+			ptrreclaim(fs, b->d.ptr[i], nind-1);
+	mbput(fs, b);
+	return 1;
+}
+
+/*
+ * remove f and all its children.
+ * It's safe to remove the parent before the children,
+ * because no reference to f is kept in the disk when this
+ * function is called.
+ *
+ * One problem here is that we have to load the blocks
+ * to actually learn their references and remove them.
+ * TODO: do this using an external cleaner program?
+ */
+int
+dfreclaim(Fsys *fs, Memblk *f)
+{
+	int i, tot;
+
+	tot = 0;
+	dDprint("dfreclaim %H", f);
+	if(dbdecref(fs, f->addr) != 0)
+		return 0;	/* still referenced on disk: nothing reclaimed */
+	tot++;
+	if(!f->frozen)
+		sysfatal("dfreclaim: not frozen\n\t%H", f);
+	incref(f);
+	mbunhash(fs, f);
+	if(f->ref != 1)
+		sysfatal("dfreclaim: ref is %d", f->ref);
+	for(i = 0; i < nelem(f->d.dptr); i++)
+		tot += ptrreclaim(fs, f->d.dptr[i], 0);
+	for(i = 0; i < nelem(f->d.iptr); i++)
+		tot += ptrreclaim(fs, f->d.iptr[i], i+1);	/* the indirect pointers, not dptr again */
+	if(f->mf->mode&DMDIR){
+		isloaded(f);
+		for(i = 0; i < f->mf->nchild; i++)
+			tot += dfreclaim(fs, f->mf->child[i].f);
+	}
+	mbput(fs, f);
+	return tot;
+}
+
diff -r 257203b800f6 -r fb5824366f81 sys/src/cmd/Cfs/fsfmt.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/src/cmd/Cfs/fsfmt.c	Fri Feb 10 17:11:34 2012 +0000
@@ -0,0 +1,55 @@
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "conf.h"
+#include "dbg.h"
+#include "dk.h"
+#include "fns.h"
+
+static void
+usage(void)
+{
+	fprint(2, "usage: %s [-DFLAGS] [-dv]\n", argv0);
+	exits("usage");
+}
+
+void
+main(int argc, char *argv[])
+{
+	Fsys *fs;
+	int verb;
+	char *dev;
+
+	dev = "disk";
+	verb = 0;
+	ARGBEGIN{
+	case 'v':
+		verb++;
+		break;
+	default:
+		if(ARGC() >= 'A' && ARGC() <= 'Z'){	/* any upper case flag enables that debug class */
+			dbg['d'] = 1;
+			dbg[ARGC()] = 1;
+		}else
+			usage();
+	}ARGEND;
+	if(argc == 1)
+		dev = argv[0];
+	else if(argc > 0)
+		
usage();
+	fmtinstall('H', mbfmt);
+	fmtinstall('M', dirmodefmt);
+	errinit(Errstack);
+	if(catcherror())
+		sysfatal("error: %r");
+	fs = fsfmt(dev);
+	if(verb)
+		fsdump(fs);
+	noerror();
+	exits(nil);
+}
+
diff -r 257203b800f6 -r fb5824366f81 sys/src/cmd/Cfs/fsys.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sys/src/cmd/Cfs/fsys.c	Fri Feb 10 17:11:34 2012 +0000
@@ -0,0 +1,457 @@
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "conf.h"
+#include "dbg.h"
+#include "dk.h"
+#include "fns.h"
+
+/*
+ * All the code assumes outofmemoryexits = 1.
+ */
+
+int
+iserror(char *s)
+{
+	char err[128];
+
+	rerrstr(err, sizeof err);
+	return strstr(err, s) != nil;	/* true when s is part of the current error string */
+}
+
+uvlong
+now(void)
+{
+	return nsec();
+}
+
+void
+okaddr(Fsys *fs, u64int addr)
+{
+	if(addr < Dblksz || addr >= fs->limit)
+		error("okaddr %#ullx", addr);
+}
+
+/*
+ * NO LOCKS. debug only
+ */
+void
+fsdump(Fsys *fs)
+{
+	int i, flg;
+	Memblk *b;
+	u64int a;
+
+	if(fs != nil){
+		print("\n\nfsys '%s' limit %#ulx super m%#p root m%#p:\n",
+			fs->dev, fs->limit, fs->super, fs->root);
+		for(i = 0; i < nelem(fs->fhash); i++)
+			for(b = fs->fhash[i].b; b != nil; b = b->next)
+				print("h[%d] = %H", i, b);
+		print("nblk %uld nablk %uld used %uld free %uld\n",
+			fs->nblk, fs->nablk, fs->nused, fs->nfree);
+	}
+	b = fs != nil? fs->super: nil;	/* fs, or its super, may be missing: skip the free list then */
+	if(b != nil && b->d.free != 0){
+		print("free:");
+		flg = dbg['D'];
+		dbg['D'] = 0;	/* silence the disk debug prints of dbgetref while walking the list */
+		for(a = b->d.free; a != 0; a = dbgetref(fs, a))
+			print(" d%#ullx", a);
+		dbg['D'] = flg;
+		print("\n");
+	}
+	print("Fsysmem\t= %uld\n", Fsysmem);
+	print("Dminfree\t= %d\n", Dminfree);
+	print("Dblksz\t= %uld\n", Dblksz);
+	print("Dminattrsz\t= %uld\n", Dminattrsz);
+	print("Nblkgrpsz\t= %uld\n", Nblkgrpsz);
+	print("Dblkdatasz\t= %d\n", Dblkdatasz);
+	print("Embedsz\t= %d\n", Embedsz);
+	print("Dentryperblk\t= %d\n", Dentryperblk);
+	print("Dptrperblk\t= %d\n\n", Dptrperblk);
+}
+
+static usize
+disksize(int fd)
+{
+	Dir *d;
+	u64int sz;
+
+	d = dirfstat(fd);
+	if(d == nil)
+		return 0;
+	sz =
d->length; + free(d); + return sz; +} + +static void +freezerefs(Fsys *fs) +{ + Memblk *rb; + + qlock(fs); + for(rb = fs->refs; rb != nil; rb = rb->next) + rb->frozen = 1; + qunlock(fs); +} + +static void +writerefs(Fsys *fs) +{ + Memblk *rb; + + qlock(fs); + for(rb = fs->refs; rb != nil; rb = rb->next) + meltedref(fs, rb); + qunlock(fs); +} + +static void +freezesuper(Fsys *fs) +{ + Memblk *b; + + b = mbdup(fs, fs->super); + qlock(fs); + b->d = fs->super->d; + assert(fs->fzsuper == nil); + fs->fzsuper = b; + fs->fzsuper->frozen = 1; + qunlock(fs); +} + +static void +writesuper(Fsys *fs) +{ + qlock(fs); + assert(fs->fzsuper != nil); + qunlock(fs); + dbwrite(fs, fs->fzsuper); + dDprint("fswrite: %H", fs->fzsuper); + mbput(fs, fs->fzsuper); + fs->fzsuper = nil; +} + +/* + * Write any dirty frozen state after a freeze. + * Only this function and initialization of previously unused DBref + * blocks may write to the disk. + */ +static void +fswrite(Fsys *fs) +{ + if(fs->fzsuper == nil) + sysfatal("can't fswrite if we didn't fsfreeze"); + writerefs(fs); + dfsync(fs, fs->archive); + writesuper(fs); +} + +/* + * Freeze the file tree, keeping active as a new melted file + * that refers to frozen children now in the archive. + * returns the just frozen tree. + */ +Memblk* +fsfreeze(Fsys *fs) +{ + Memblk *na, *oa, *arch, *super; + Child *ac; + char name[50]; + int i; + + wlock(fs->active->mf); + wlock(fs->archive->mf); + if(fs->fzsuper != nil){ + wunlock(fs->archive->mf); + wunlock(fs->active->mf); + error("freeze already in progress"); + } + dfloaddir(fs, fs->active, 1); + dfloaddir(fs, fs->archive, 1); + super = fs->super; + if(catcherror()){ + /* + * Freeze can't fail. If it does, we better + * restart the file system from the last known + * frozen tree. + * The only reasing this should happen is because + * we run out of memory, or out of disk, or + * the disk fails. 
+ */ + sysfatal("freeze: can't recover: %r"); + } + + /* + * move active into /archive/ and create a new melted + * active. + */ + oa = fs->active; + na = dbdup(fs, oa); + wlock(na->mf); + seprint(name, name+sizeof(name), "%ulld", oa->d.epoch); /* the old active is renamed after its epoch */ + dfwattr(fs, oa, "name", name, strlen(name)+1); + ac = fs->root->mf->child; + assert(ac->f == oa); + ac->f = na; /* keeps the ref we have */ + ac->d->file = na->addr; + if(fs->archive->frozen){ + arch = dbdup(fs, fs->archive); + wlock(arch->mf); + wunlock(fs->archive->mf); + mbput(fs, fs->archive); + fs->archive = arch; + for(i = nelem(super->d.root)-1; i > 0; i--) /* shift older roots down; root[0] is the newest */ + super->d.root[i] = super->d.root[i-1]; + super->d.root[0] = fs->archive->addr; + } + dflink(fs, fs->archive, oa); + fs->active = na; + fs->archive->frozen = 1; /* for fsfmt */ + + /* 1. Freeze the entire previously active tree + */ + dffreeze(fs, oa); + wunlock(oa->mf); + + /* 2. Freeze whatever new blocks are found in archive + */ + dffreeze(fs, fs->archive); + + /* 3. Freeze the on-disk reference counters + * and the state of the super-block. + */ + freezerefs(fs); + freezesuper(fs); + + /* + * 4. release locks, all done. + */ + wunlock(na->mf); + wunlock(fs->archive->mf); + noerror(); + return na; /* NOTE(review): na is the new melted active, while the header comment says the frozen tree is returned; visible callers ignore the value */ +} + +static Fsys* +fsinit(char *dev, int nblk) +{ + Fsys *fs; + + fs = mallocz(sizeof *fs, 1); + fs->dev = strdup(dev); + fs->fd = open(dev, ORDWR); + if(fs->fd < 0) + sysfatal("can't open disk: %r"); + + fs->nablk = Fsysmem / sizeof(Memblk); + if(nblk > 0 && nblk < fs->nablk) /* nblk <= 0 means no explicit cap */ + fs->nablk = nblk; + fs->limit = disksize(fs->fd); + if(fs->nablk > fs->limit/Dblksz) + fs->nablk = fs->limit/Dblksz; + fs->limit = fs->nablk * Dblksz; /* the limit is cut down to nablk blocks */ + if(fs->limit < 10*Dblksz) + sysfatal("disk is ridiculous"); + fs->blk = malloc(fs->nablk * sizeof fs->blk[0]); + dDprint("fsys '%s' open\n", fs->dev); + return fs; +} + +/* + * / is only in memory. Its `on-disk' address is Noaddr. + * + * /archive is the root on disk. + * /active is allocated on disk, but not on disk.
It will be linked into + * /archive as a child in the future. + */ +Fsys* +fsfmt(char *dev) +{ + Fsys *fs; + Memblk *super; + + if(catcherror()) + sysfatal("fsfmt: error: %r"); + + fs = fsinit(dev, 16); /* enough # of blocks for fmt */ + + fs->super = dballoc(fs, DBsuper); + super = fs->super; + super->d.eaddr = fs->super->addr + Dblksz; + super->d.dblksz = Dblksz; /* readsuper() checks these three values */ + super->d.nblkgrpsz = Nblkgrpsz; + super->d.dminattrsz = Dminattrsz; + + fs->root = dfcreate(fs, nil, "", getuser(), DMDIR|0555); + fs->active = dfcreate(fs, fs->root, "active", getuser(), DMDIR|0775); + fs->archive = dfcreate(fs, fs->root, "archive", getuser(), DMDIR|0555); + super->d.root[0] = fs->archive->addr; /* /archive is the root recorded in the super block */ + + fsfreeze(fs); + fswrite(fs); /* fswrite insists on a preceding fsfreeze (it checks fs->fzsuper) */ + + noerror(); + return fs; +} + +void +fssync(Fsys *fs) +{ + /* + * TODO: If active has not changed and we are just going + * to dump a new archive for no change, do nothing. + */ + fsfreeze(fs); + fswrite(fs); +} + +static Memblk* +readsuper(Fsys *fs) +{ + Memblk *super; + + fs->super = dbget(fs, DBsuper, Dblksz); + super = fs->super; + if(super->d.dblksz != Dblksz) /* on-disk parameters must match the compiled-in constants */ + error("bad Dblksz"); + if(super->d.nblkgrpsz != Nblkgrpsz) + error("bad Nblkgrpsz"); + if(super->d.dminattrsz != Dminattrsz) + error("bad Dminattrsz"); + return super; +} + +/* + * One process per file system, so consume all the memory + * for the cache. + * To open more file systems, use more processes!
+ */ + +Fsys* +fsopen(char *dev) +{ + Fsys *fs; + + if(catcherror()) + sysfatal("fsopen: error: %r"); + + fs = fsinit(dev, 0); /* 0: fsinit applies no explicit block cap */ + + readsuper(fs); + + fs->root = dfcreate(fs, nil, "", getuser(), DMDIR|0555); + fs->active = dfcreate(fs, fs->root, "active", getuser(), DMDIR|0775); + fs->archive = dbget(fs, DBfile, fs->super->d.root[0]); /* root[0] is where fsfmt/fsfreeze store the address of /archive */ + wlock(fs->root->mf); + wlock(fs->archive->mf); + dflink(fs, fs->root, fs->archive); + wunlock(fs->archive->mf); + wunlock(fs->root->mf); + noerror(); + return fs; +} + +/* + * XXX: must revisit here: + * - there are several things being done multiple times in different + * functions. e.g., writing the super, compare + * fsfmt and fsopen + * fsreclaim and fsfreeze. + * some inner functions are missing there. + * + * - it's not clear references end up ok after fsreclaim, must test that. + * - perhaps we should reclaim in a loop until we are sure that + * at least a min # of blocks are available or we can't reclaim anything else, + * whichever happens first. + * + * - must implement a variant of the fsfmt function that reads an archived + * tree and formats the file system according to it. Although that's + * perhaps just a fsfmt() followed by the inverse of the archival tool, + * and we may leave fsfmt alone. + */ + +/* + * This should be called if fs->super->d.nfree < some number.
+ */ +void +fsreclaim(Fsys *fs) +{ + uvlong nfree; + Child *c, *victim; + Memblk *arch, *gone; + int i, n, tot; + + tot = 0; + for(;;){ + qlock(fs); + nfree = fs->super->d.nfree; + nfree += (fs->limit - fs->super->d.eaddr)/Dblksz; + qunlock(fs); + if(nfree > Dminfree){ + dDprint("fsreclaim: got %ulld free\n", nfree); + break; + } + dDprint("fsreclaim: reclaiming: %ulld free\n", nfree); + arch = fs->archive; + wlock(arch->mf); + dfloaddir(fs, arch, 1); + if(arch->mf->nchild < 2){ + wunlock(arch->mf); + dDprint("nothing to reclaim\n"); + break; + } + victim = arch->mf->child; + for(i = 0; i < arch->mf->nchild; i++){ + c = &arch->mf->child[i]; + if(victim->f->d.epoch > c->f->d.epoch) + victim = c; + } + + gone = victim->f; + fprint(2, "%s: reclaiming /archive/%s\n", argv0, gone->mf->name); + dDprint("victim is %H", gone); + + /* + * Surgery: we don't want to allocate anything by now. + * Just clear the references on disk and memory to the victim. + * If we fail before finishing then RCs will be >= the + * value they should have (the reference is gone from disk). + */ + victim->d->file = 0; + dbwrite(fs, victim->b); + delchild(arch, victim); + wunlock(arch->mf); + + n = dbgetref(fs, gone->addr); + if(n != 1) + sysfatal("reclaim: gone ref is %d != 1", n); + n = dfreclaim(fs, gone); + dDprint("%d block%s reclaimed\n", n, n?"s":""); + tot += n; + + /* + * Hopefully we have some free blocks. + * dump the reference blocks to disk. + * Gone blocks are in the free list, active blocks may end up + * with on-disk refs >= those matching the references on disk, + * the next snap will make the ref list coherent. + * We don't snap here because that is likely to allocate more + * blocks. 
+ */ + freezerefs(fs); + writerefs(fs); + freezesuper(fs); + dDprint("fsreclaim: %H", fs->fzsuper); + writesuper(fs); + } + if(tot > 0) + fprint(2, "%s: %d block%s reclaimed\n", argv0, tot, tot?"s":""); + +} diff -r 257203b800f6 -r fb5824366f81 sys/src/cmd/Cfs/main.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/src/cmd/Cfs/main.c Fri Feb 10 17:11:34 2012 +0000 @@ -0,0 +1,32 @@ +#include +#include +#include +#include +#include +#include + +#include "conf.h" +#include "dbg.h" +#include "dk.h" +#include "fns.h" + +void +main(int, char *argv[]) +{ + Fsys *fs; + + argv0 = argv[0]; + fmtinstall('H', mbfmt); + fmtinstall('M', dirmodefmt); + errinit(Errstack); + if(catcherror()) + sysfatal("error: %r"); + fs = fsopen("disk"); + fsdump(fs); + dbg['D'] = 1; + fsreclaim(fs); + fsdump(fs); + noerror(); + exits(nil); +} + diff -r 257203b800f6 -r fb5824366f81 sys/src/cmd/Cfs/mblk.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/src/cmd/Cfs/mblk.c Fri Feb 10 17:11:34 2012 +0000 @@ -0,0 +1,306 @@ +#include +#include +#include +#include +#include +#include + +#include "conf.h" +#include "dbg.h" +#include "dk.h" +#include "fns.h" + +/* + * memory blocks. + * see dk.h + */ + + +/* + * All the code assumes outofmemoryexits = 1. + */ + +char*tname[] = { +[DBfree] "free", +[DBdata] "data", +[DBptr] "ptr", +[DBref] "ref", +[DBattr] "attr", +[DBfile] "file", +[DBsuper] "super" +}; + +#define EP(e) ((e)&0xFFFFFFFFUL) +/* + * NO LOCKS. 
debug only; format routine for a Memblk* argument (the main functions install it as the 'H' verb). + */ +int +mbfmt(Fmt *fmt) +{ + Memblk *b; + int type, i, n, once; + + b = va_arg(fmt->args, Memblk*); + if(b == nil) + return fmtprint(fmt, "\n"); + type = TAGTYPE(b->d.tag); + fmtprint(fmt, "m%#p d%#ullx", b, b->addr); /* m: memory address, d: disk address */ + if(b->frozen) + fmtprint(fmt, " FZ"); + if(b->dirty) + fmtprint(fmt, " DT"); + if(b->written) + fmtprint(fmt, " WR"); + fmtprint(fmt, " DB%s r%d", tname[type], b->ref); + fmtprint(fmt, " tag %#ullx epoch %#ullx", EP(b->d.tag), EP(b->d.epoch)); /* EP keeps the low 32 bits */ + switch(type){ + case DBfree: + case DBdata: + case DBattr: + fmtprint(fmt, "\n"); + break; + case DBptr: + fmtprint(fmt, "\n"); + for(i = n = 0; i < Dptrperblk; i++) /* only non-zero pointers are shown */ + if(b->d.ptr[i]){ + fmtprint(fmt, "\t[%d]=d%#ullx", i, b->d.ptr[i]); + if(++n%4 == 0) /* four entries per output line */ + fmtprint(fmt, "\n"); + } + if(n%4 != 0) + fmtprint(fmt, "\n"); + break; + case DBref: + fmtprint(fmt, " rnext m%#p\n", b->rnext); + for(i = n = 0; i < Drefperblk; i++) /* only non-zero references are shown */ + if(b->d.ref[i]){ + fmtprint(fmt, "\t[%d]d%#ullx=%#ullx", + i, addrofref(b->addr, i), b->d.ref[i]); + if(++n%5 == 0) /* five entries per output line */ + fmtprint(fmt, "\n"); + } + if(n%5 != 0) + fmtprint(fmt, "\n"); + break; + case DBfile: + fmtprint(fmt, "\n\tasz %#ullx aptr %#ullx\n", b->d.asize, b->d.aptr); + if(b->mf == nil){ + fmtprint(fmt, "\tno mfile\n"); + break; + } + fmtprint(fmt, "\tid %#ullx mode %M mt %#ullx sz %#ullx '%s' '%s'\n", + EP(b->mf->id), (ulong)b->mf->mode, EP(b->mf->mtime), + b->mf->length, b->mf->uid, b->mf->name); + fmtprint(fmt, "\tparent m%#p nr%d nw%d lastb m%#p lastbno %uld\n", + b->mf->parent, b->mf->readers, b->mf->writer, + b->mf->lastb, b->mf->lastbno); + if(b->mf->nchild > 0){ + fmtprint(fmt, "\tchild:"); + for(i = 0; i < b->mf->nchild; i++) + fmtprint(fmt, " m%#p", b->mf->child[i].f); + fmtprint(fmt, "\n"); + } + fmtprint(fmt, "\tdptr:"); + for(i = 0; i < nelem(b->d.dptr); i++) + fmtprint(fmt, " d%#ullx", b->d.dptr[i]); + fmtprint(fmt, "\n"); + fmtprint(fmt, "\tiptr:"); + for(i = 0; i < nelem(b->d.iptr); i++) + fmtprint(fmt, " d%#ullx", b->d.iptr[i]); + fmtprint(fmt, "\n"); +
break; + case DBsuper: + fmtprint(fmt, "\n\tfree d%#ullx eaddr %#ullx root [", b->d.free, b->d.eaddr); + once = 0; + for(i = 0; i < nelem(b->d.root); i++) + if(b->d.root[i] != 0){ + if(once++ != 0) + fmtprint(fmt, " "); + fmtprint(fmt, "d%#ullx", b->d.root[i]); + } + fmtprint(fmt, "]\n"); + break; + } + return 0; +} + +void +clean(Memblk *b) +{ + b->dirty = 0; +} + +void +ismelted(Memblk *b) +{ + if(b->frozen) + sysfatal("frozen at pc %#p", getcallerpc(&b)); +} + +void +changed(Memblk *b) +{ + if(TAGTYPE(b->d.tag) != DBsuper) /* the super block is exempt from the melted check */ + ismelted(b); + b->d.epoch = now(); + b->dirty = 1; +} + +Memblk* +mbhash(Fsys *fs, Memblk *b) +{ + Memblk **h; + uint hv; + + hv = b->addr%nelem(fs->fhash); + wlock(&fs->fhash[hv]); + for(h = &fs->fhash[hv].b; *h != nil; h = &(*h)->next) + if((*h)->addr == b->addr){ + /* concurrent reads, use the first one */ + mbput(fs, b); + b = *h; + goto Found; + } + *h = b; + fs->nused++; /* count only blocks really inserted; mbunhash decrements once per removal */ + if(b->next != nil) + sysfatal("mbhash: next"); + if(TAGTYPE(b->d.tag) == DBref){ + qlock(fs); + b->rnext = fs->refs; /* DBref blocks are also chained on fs->refs */ + fs->refs = b; + qunlock(fs); + } +Found: + incref(b); /* the returned block carries one more reference */ + wunlock(&fs->fhash[hv]); + return b; +} + +static void +mbfree(Fsys *fs, Memblk *b) +{ + Mfile *mf; + + if(b == nil) + return; + dDprint("mbfree %H\n", b); + if(b->ref > 0) + sysfatal("mbfree: has refs"); + if(b->next != nil) + sysfatal("mbfree: has next"); + if(TAGTYPE(b->d.tag) == DBref) + sysfatal("mbfree: is DBref"); + + if(TAGTYPE(b->d.tag) == DBfile && b->mf != nil){ + mf = b->mf; + b->mf = nil; + mf->nchild = 0; + if(mf->lastb != nil) + mbput(fs, mf->lastb); + mf->lastb = nil; + mf->lastbno = 0; + mf->parent = nil; + mf->next = nil; + assert(mf->readers == mf->writer && mf->readers == 0); + qlock(fs); + mf->next = fs->mfree; /* the Mfile goes back to the fs->mfree list */ + fs->mfree = mf; + qunlock(fs); + } + b->d.tag = DBfree; + b->frozen = b->written = b->dirty = 0; + b->addr = 0; + + qlock(fs); + fs->nfree++; + b->next = fs->free; + fs->free = b; + qunlock(fs); +} + +void +mbunhash(Fsys *fs, Memblk *b) +{ + Memblk **h; + uint hv; +
+ if(TAGTYPE(b->d.tag) == DBref) + sysfatal("mbunhash: DBref"); + + hv = b->addr%nelem(fs->fhash); + wlock(&fs->fhash[hv]); + for(h = &fs->fhash[hv].b; *h != nil; h = &(*h)->next) + if((*h)->addr == b->addr){ + if(*h != b) + sysfatal("mbunhash: dup block"); + *h = b->next; + b->next = nil; + fs->nused--; + wunlock(&fs->fhash[hv]); + mbput(fs, b); + return; + } + sysfatal("mbunhash: not found"); +} + +Memblk* +mballoc(Fsys *fs, u64int addr) +{ + Memblk *b; + + b = nil; + qlock(fs); + if(fs->nblk < fs->nablk) + b = &fs->blk[fs->nblk++]; + else if(fs->free != nil){ + b = fs->free; + fs->free = b->next; + fs->nfree--; + }else{ + qunlock(fs); + error("evict block not implemented"); + } + qunlock(fs); + memset(b, 0, sizeof *b); + b->addr = addr; + b->ref = 1; + dDprint("mballoc %#ullx -> %H", addr, b); + return b; +} + +Memblk* +mbget(Fsys *fs, u64int addr) +{ + Memblk *b; + uint hv; + + hv = addr%nelem(fs->fhash); + rlock(&fs->fhash[hv]); + for(b = fs->fhash[hv].b; b != nil; b = b->next) + if(b->addr == b->addr){ + incref(b); + break; + } + runlock(&fs->fhash[hv]); + dDprint("mbget %#ullx -> %H", addr, b); + return b; +} + +void +mbput(Fsys *fs, Memblk *b) +{ + dDprint("mbput m%#p pc=%#p\n", b, getcallerpc(&fs)); + if(decref(b) == 0) + mbfree(fs, b); +} + +Memblk* +mbdup(Fsys *fs, Memblk *b) +{ + Memblk *nb; + + nb = mballoc(fs, b->addr); + memmove(&nb->d, &b->d, sizeof b->d); + return nb; +} diff -r 257203b800f6 -r fb5824366f81 sys/src/cmd/Cfs/mkfile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sys/src/cmd/Cfs/mkfile Fri Feb 10 17:11:34 2012 +0000 @@ -0,0 +1,29 @@ +