2 * Manage tree of VtFiles stored in the block cache.
4 * The single point of truth for the info about the VtFiles themselves
5 * is the block data. Because of this, there is no explicit locking of
6 * VtFile structures, and indeed there may be more than one VtFile
7 * structure for a given Venti file. They synchronize through the block cache.
10 * This is a bit simpler than fossil because there are no epochs
11 * or tags or anything else. Just mutable local blocks and immutable Venti blocks.
19 #define MaxBlock (1UL<<31)
21 static char ENotDir[] = "walk in non-directory";
22 static char ETooBig[] = "file too big";
23 /* static char EBadAddr[] = "bad address"; */
24 static char ELabelMismatch[] = "label mismatch";
26 static int sizetodepth(uvlong s, int psize, int dsize);
27 static VtBlock *fileload(VtFile *r, VtEntry *e);
28 static int shrinkdepth(VtFile*, VtBlock*, VtEntry*, int);
29 static int shrinksize(VtFile*, VtEntry*, uvlong);
30 static int growdepth(VtFile*, VtBlock*, VtEntry*, int);
32 #define ISLOCKED(r) ((r)->b != nil)
33 #define DEPTH(t) ((t)&VtTypeDepthMask)
36 vtfilealloc(VtCache *c, VtBlock *b, VtFile *p, u32int offset, int mode)
43 assert(p==nil || ISLOCKED(p));
49 epb = p->dsize / VtEntrySize;
51 if(b->type != VtDirType){
52 werrstr("bad block type %#uo", b->type);
57 * a non-active entry is the only thing that
58 * can legitimately happen here. all the others
61 if(vtentryunpack(&e, b->data, offset % epb) < 0){
62 fprint(2, "vtentryunpack failed: %r (%.*H)\n", VtEntrySize, b->data+VtEntrySize*(offset%epb));
65 if(!(e.flags & VtEntryActive)){
66 werrstr("entry not active");
70 if(DEPTH(e.type) < sizetodepth(e.size, e.psize, e.dsize)){
71 fprint(2, "depth %ud size %llud psize %ud dsize %ud\n",
72 DEPTH(e.type), e.size, e.psize, e.dsize);
77 size = vtcacheblocksize(c);
78 if(e.dsize > size || e.psize > size){
79 werrstr("block sizes %ud, %ud bigger than cache block size %ud",
80 e.psize, e.dsize, size);
84 r = vtmallocz(sizeof(VtFile));
90 r->dir = (e.type & VtTypeBaseMask) == VtDirType;
95 assert(mode == VtOREAD || p->mode == VtORDWR);
99 assert(b->addr != NilBlock);
102 memmove(r->score, b->score, VtScoreSize);
110 vtfileroot(VtCache *c, u32int addr, int mode)
115 b = vtcachelocal(c, addr, VtDirType);
118 r = vtfilealloc(c, b, nil, 0, mode);
124 vtfileopenroot(VtCache *c, VtEntry *e)
129 b = vtcacheallocblock(c, VtDirType);
133 vtentrypack(e, b->data, 0);
134 f = vtfilealloc(c, b, nil, 0, VtORDWR);
140 vtfilecreateroot(VtCache *c, int psize, int dsize, int type)
144 memset(&e, 0, sizeof e);
145 e.flags = VtEntryActive;
149 memmove(e.score, vtzeroscore, VtScoreSize);
151 return vtfileopenroot(c, &e);
155 vtfileopen(VtFile *r, u32int offset, int mode)
166 bn = offset/(r->dsize/VtEntrySize);
168 b = vtfileblock(r, bn, mode);
171 r = vtfilealloc(r->c, b, r, offset, mode);
177 vtfilecreate(VtFile *r, int psize, int dsize, int type)
179 return _vtfilecreate(r, -1, psize, dsize, type);
183 _vtfilecreate(VtFile *r, int o, int psize, int dsize, int type)
194 assert(psize <= VtMaxLumpSize);
195 assert(dsize <= VtMaxLumpSize);
196 assert(type == VtDirType || type == VtDataType);
203 epb = r->dsize/VtEntrySize;
205 size = vtfilegetdirsize(r);
207 * look at a random block to see if we can find an empty entry
210 offset = lnrand(size+1);
211 offset -= offset % epb;
215 /* try the given block and then try the last block */
218 b = vtfileblock(r, bn, VtORDWR);
221 for(i=offset%r->epb; i<epb; i++){
222 if(vtentryunpack(&e, b->data, i) < 0)
224 if((e.flags&VtEntryActive) == 0 && e.gen != ~0)
229 fprint(2, "vtfilecreate: cannot happen\n");
230 werrstr("vtfilecreate: cannot happen");
237 /* found an entry - gen already set */
240 e.flags = VtEntryActive;
243 memmove(e.score, vtzeroscore, VtScoreSize);
244 vtentrypack(&e, b->data, i);
248 if(vtfilesetdirsize(r, offset+1) < 0){
254 rr = vtfilealloc(r->c, b, r, offset, VtORDWR);
260 vtfilekill(VtFile *r, int doremove)
270 if(doremove==0 && e.size == 0){
271 /* already truncated */
283 e.flags &= ~VtEntryLocal;
286 memmove(e.score, vtzeroscore, VtScoreSize);
287 vtentrypack(&e, b->data, r->offset % r->epb);
299 vtfileremove(VtFile *r)
301 return vtfilekill(r, 1);
305 vtfiletruncate(VtFile *r)
307 return vtfilekill(r, 0);
311 vtfilegetsize(VtFile *r)
326 shrinksize(VtFile *r, VtEntry *e, uvlong size)
328 int i, depth, type, isdir, ppb;
330 uchar score[VtScoreSize];
333 b = vtcacheglobal(r->c, e->score, e->type);
338 ppb = e->psize/VtScoreSize;
341 for(i=0; i+1<depth; i++)
346 if(b->addr == NilBlock){
347 /* not worth copying the block just so we can zero some of it */
353 * invariant: each pointer in the tree rooted at b accounts for ptrsz bytes
356 /* zero the pointers to unnecessary blocks */
357 i = (size+ptrsz-1)/ptrsz;
359 memmove(b->data+i*VtScoreSize, vtzeroscore, VtScoreSize);
361 /* recurse (go around again) on the partially necessary block */
370 memmove(score, b->data+i*VtScoreSize, VtScoreSize);
372 b = vtcacheglobal(r->c, score, type);
377 if(b->addr == NilBlock){
383 * No one ever truncates BtDir blocks.
385 if(depth==0 && !isdir && e->dsize > size)
386 memset(b->data+size, 0, e->dsize-size);
392 vtfilesetsize(VtFile *r, u64int size)
400 return vtfiletruncate(r);
402 if(size > VtMaxFileSize || size > ((uvlong)MaxBlock)*r->dsize){
417 depth = sizetodepth(size, e.psize, e.dsize);
418 edepth = DEPTH(e.type);
420 if(shrinkdepth(r, b, &e, depth) < 0){
424 }else if(depth > edepth){
425 if(growdepth(r, b, &e, depth) < 0){
432 shrinksize(r, &e, size);
435 vtentrypack(&e, b->data, r->offset % r->epb);
442 vtfilesetdirsize(VtFile *r, u32int ds)
448 epb = r->dsize/VtEntrySize;
450 size = (uvlong)r->dsize*(ds/epb);
451 size += VtEntrySize*(ds%epb);
452 return vtfilesetsize(r, size);
456 vtfilegetdirsize(VtFile *r)
463 epb = r->dsize/VtEntrySize;
465 size = vtfilegetsize(r);
466 ds = epb*(size/r->dsize);
467 ds += (size%r->dsize)/VtEntrySize;
472 vtfilegetentry(VtFile *r, VtEntry *e)
486 vtfilesetentry(VtFile *r, VtEntry *e)
492 b = fileload(r, &ee);
495 vtentrypack(e, b->data, r->offset % r->epb);
501 blockwalk(VtBlock *p, int index, VtCache *c, int mode, VtEntry *e)
517 score = p->data+index*VtScoreSize;
520 /*print("walk from %V/%d ty %d to %V ty %d\n", p->score, index, p->type, score, type); */
522 if(mode == VtOWRITE && vtglobaltolocal(score) == NilBlock){
523 b = vtcacheallocblock(c, type);
527 b = vtcacheglobal(c, score, type);
529 if(b == nil || mode == VtOREAD)
532 if(vtglobaltolocal(b->score) != NilBlock)
540 e->flags |= VtEntryLocal;
542 b = vtblockcopy(b/*, e->tag, fs->ehi, fs->elo*/);
547 if(p->type == VtDirType){
548 memmove(e->score, b->score, VtScoreSize);
549 vtentrypack(e, p->data, index);
551 memmove(p->data+index*VtScoreSize, b->score, VtScoreSize);
557 * Change the depth of the VtFile r.
558 * The entry e for r is contained in block p.
561 growdepth(VtFile *r, VtBlock *p, VtEntry *e, int depth)
567 assert(depth <= VtPointerDepth);
569 b = vtcacheglobal(r->c, e->score, e->type);
576 * Keep adding layers until we get to the right depth
577 * or an error occurs.
579 while(DEPTH(e->type) < depth){
580 bb = vtcacheallocblock(r->c, e->type+1);
583 memmove(bb->data, b->score, VtScoreSize);
584 memmove(e->score, bb->score, VtScoreSize);
586 e->flags |= VtEntryLocal;
591 vtentrypack(e, p->data, r->offset % r->epb);
594 if(DEPTH(e->type) == depth)
600 shrinkdepth(VtFile *r, VtBlock *p, VtEntry *e, int depth)
602 VtBlock *b, *nb, *ob, *rb;
606 assert(depth <= VtPointerDepth);
608 rb = vtcacheglobal(r->c, e->score, e->type);
613 * Walk down to the new root block.
614 * We may stop early, but something is better than nothing.
620 for(; DEPTH(e->type) > depth; e->type--){
621 nb = vtcacheglobal(r->c, b->data, e->type-1);
624 if(ob!=nil && ob!=rb)
636 * Right now, e points at the root block rb, b is the new root block,
637 * and ob points at b. To update:
639 * (i) change e to point at b
640 * (ii) zero the pointer ob -> b
641 * (iii) free the root block
643 * p (the block containing e) must be written before
648 memmove(e->score, b->score, VtScoreSize);
649 vtentrypack(e, p->data, r->offset % r->epb);
652 memmove(ob->data, vtzeroscore, VtScoreSize);
656 if(ob!=nil && ob!=rb)
660 if(DEPTH(e->type) == depth)
666 mkindices(VtEntry *e, u32int bn, int *index)
670 memset(index, 0, (VtPointerDepth+1)*sizeof(int));
672 np = e->psize/VtScoreSize;
673 for(i=0; bn > 0; i++){
674 if(i >= VtPointerDepth){
675 werrstr("bad address 0x%lux", (ulong)bn);
685 vtfileblock(VtFile *r, u32int bn, int mode)
688 int index[VtPointerDepth+1];
694 assert(bn != NilBlock);
700 i = mkindices(&e, bn, index);
703 if(i > DEPTH(e.type)){
705 werrstr("bad address 0x%lux", (ulong)bn);
709 if(growdepth(r, b, &e, i) < 0)
713 assert(b->type == VtDirType);
715 index[DEPTH(e.type)] = r->offset % r->epb;
717 /* mode for intermediate block */
722 for(i=DEPTH(e.type); i>=0; i--){
723 bb = blockwalk(b, index[i], r->c, i==0 ? mode : m, &e);
729 b->pc = getcallerpc(&r);
737 vtfileblockscore(VtFile *r, u32int bn, uchar score[VtScoreSize])
740 int index[VtPointerDepth+1];
745 assert(bn != NilBlock);
751 if(DEPTH(e.type) == 0){
752 memmove(score, e.score, VtScoreSize);
757 i = mkindices(&e, bn, index);
762 if(i > DEPTH(e.type)){
763 memmove(score, vtzeroscore, VtScoreSize);
768 index[DEPTH(e.type)] = r->offset % r->epb;
770 for(i=DEPTH(e.type); i>=1; i--){
771 bb = blockwalk(b, index[i], r->c, VtOREAD, &e);
776 if(memcmp(b->score, vtzeroscore, VtScoreSize) == 0)
780 memmove(score, b->data+index[0]*VtScoreSize, VtScoreSize);
790 vtfileincref(VtFile *r)
798 vtfileclose(VtFile *r)
811 vtfileclose(r->parent);
812 memset(r, ~0, sizeof(*r));
817 * Retrieve the block containing the entry for r.
818 * If a snapshot has happened, we might need
819 * to get a new copy of the block. We avoid this
820 * in the common case by caching the score for
821 * the block and the last epoch in which it was valid.
823 * We use r->mode to tell the difference between active
824 * file system VtFiles (VtORDWR) and VtFiles for the
825 * snapshot file system (VtOREAD).
828 fileloadblock(VtFile *r, int mode)
838 assert(r->mode == VtORDWR);
840 b = vtcacheglobal(r->c, r->score, VtDirType);
843 b->pc = getcallerpc(&r);
846 assert(r->parent != nil);
847 if(vtfilelock(r->parent, VtORDWR) < 0)
849 b = vtfileblock(r->parent, r->offset/r->epb, VtORDWR);
850 vtfileunlock(r->parent);
853 memmove(r->score, b->score, VtScoreSize);
859 werrstr("read/write lock of read-only file");
862 addr = vtglobaltolocal(r->score);
864 return vtcacheglobal(r->c, r->score, VtDirType);
866 b = vtcachelocal(r->c, addr, VtDirType);
871 * If it failed because the epochs don't match, the block has been
872 * archived and reclaimed. Rewalk from the parent and get the
873 * new pointer. This can't happen in the VtORDWR case
874 * above because blocks in the current epoch don't get
875 * reclaimed. The fact that we're VtOREAD means we're
876 * a snapshot. (Or else the file system is read-only, but then
877 * the archiver isn't going around deleting blocks.)
879 rerrstr(e, sizeof e);
880 if(strcmp(e, ELabelMismatch) == 0){
881 if(vtfilelock(r->parent, VtOREAD) < 0)
883 b = vtfileblock(r->parent, r->offset/r->epb, VtOREAD);
884 vtfileunlock(r->parent);
886 fprint(2, "vtfilealloc: lost %V found %V\n",
888 memmove(r->score, b->score, VtScoreSize);
897 vtfilelock(VtFile *r, int mode)
904 b = fileloadblock(r, mode);
908 * The fact that we are holding b serves as the
909 * lock entitling us to write to r->b.
913 b->pc = getcallerpc(&r);
918 * Lock two (usually sibling) VtFiles. This needs special care
919 * because the Entries for both vtFiles might be in the same block.
920 * We also try to lock blocks in left-to-right order within the tree.
923 vtfilelock2(VtFile *r, VtFile *rr, int mode)
928 return vtfilelock(r, mode);
933 if(r->parent==rr->parent && r->offset/r->epb == rr->offset/rr->epb){
934 b = fileloadblock(r, mode);
939 }else if(r->parent==rr->parent || r->offset > rr->offset){
940 bb = fileloadblock(rr, mode);
941 b = fileloadblock(r, mode);
943 b = fileloadblock(r, mode);
944 bb = fileloadblock(rr, mode);
946 if(b == nil || bb == nil){
955 * The fact that we are holding b and bb serves
956 * as the lock entitling us to write to r->b and rr->b.
960 b->pc = getcallerpc(&r);
961 bb->pc = getcallerpc(&r);
966 vtfileunlock(VtFile *r)
971 fprint(2, "vtfileunlock: already unlocked\n");
980 fileload(VtFile *r, VtEntry *e)
986 if(vtentryunpack(e, b->data, r->offset % r->epb) < 0)
993 sizetodepth(uvlong s, int psize, int dsize)
998 /* determine pointer depth */
999 np = psize/VtScoreSize;
1000 s = (s + dsize - 1)/dsize;
1001 for(d = 0; s > 1; d++)
1002 s = (s + np - 1)/np;
1007 vtfileread(VtFile *f, void *data, long count, vlong offset)
1013 assert(ISLOCKED(f));
1015 vtfilegetentry(f, &e);
1018 if(count < 0 || offset < 0){
1019 werrstr("vtfileread: bad offset or count");
1022 if(offset >= e.size)
1025 if(offset+count > e.size)
1026 count = e.size - offset;
1028 frag = offset % e.dsize;
1029 if(frag+count > e.dsize)
1030 count = e.dsize - frag;
1032 b = vtfileblock(f, offset/e.dsize, VtOREAD);
1036 memmove(data, b->data+frag, count);
1042 filewrite1(VtFile *f, void *data, long count, vlong offset)
1048 vtfilegetentry(f, &e);
1049 if(count < 0 || offset < 0){
1050 werrstr("vtfilewrite: bad offset or count");
1054 frag = offset % e.dsize;
1055 if(frag+count > e.dsize)
1056 count = e.dsize - frag;
1059 if(frag == 0 && count == e.dsize)
1062 b = vtfileblock(f, offset/e.dsize, m);
1066 memmove(b->data+frag, data, count);
1067 if(m == VtOWRITE && frag+count < e.dsize)
1068 memset(b->data+frag+count, 0, e.dsize-frag-count);
1070 if(offset+count > e.size){
1071 vtfilegetentry(f, &e);
1072 e.size = offset+count;
1073 vtfilesetentry(f, &e);
1081 vtfilewrite(VtFile *f, void *data, long count, vlong offset)
1085 assert(ISLOCKED(f));
1090 m = filewrite1(f, (char*)data+tot, count-tot, offset+tot);
1101 flushblock(VtCache *c, VtBlock *bb, uchar score[VtScoreSize], int ppb, int epb,
1109 addr = vtglobaltolocal(score);
1110 if(addr == NilBlock)
1115 if(memcmp(b->score, score, VtScoreSize) != 0)
1118 if((b = vtcachelocal(c, addr, type)) == nil)
1126 for(i=0; i<epb; i++){
1127 if(vtentryunpack(&e, b->data, i) < 0)
1129 if(!(e.flags&VtEntryActive))
1131 if(flushblock(c, nil, e.score, e.psize/VtScoreSize, e.dsize/VtEntrySize,
1134 vtentrypack(&e, b->data, i);
1138 default: /* VtPointerTypeX */
1139 for(i=0; i<ppb; i++){
1140 if(flushblock(c, nil, b->data+VtScoreSize*i, ppb, epb, type-1) < 0)
1146 if(vtblockwrite(b) < 0)
1148 memmove(score, b->score, VtScoreSize);
1160 vtfileflush(VtFile *f)
1166 assert(ISLOCKED(f));
1167 b = fileload(f, &e);
1168 if(!(e.flags&VtEntryLocal)){
1173 ret = flushblock(f->c, nil, e.score, e.psize/VtScoreSize, e.dsize/VtEntrySize,
1180 vtentrypack(&e, b->data, f->offset % f->epb);
1186 vtfileflushbefore(VtFile *r, u64int offset)
1190 int i, base, depth, ppb, epb, doflush;
1191 int index[VtPointerDepth+1], j, ret;
1192 VtBlock *bi[VtPointerDepth+2];
1195 assert(ISLOCKED(r));
1199 b = fileload(r, &e);
1204 * compute path through tree for the last written byte and the next one.
1207 memset(bi, 0, sizeof bi);
1208 depth = DEPTH(e.type);
1210 i = mkindices(&e, (offset-1)/e.dsize, index);
1215 ppb = e.psize / VtScoreSize;
1216 epb = e.dsize / VtEntrySize;
1219 * load the blocks along the last written byte
1221 index[depth] = r->offset % r->epb;
1222 for(i=depth; i>=0; i--){
1223 bb = blockwalk(b, index[i], r->c, VtORDWR, &e);
1232 * walk up the path from leaf to root, flushing anything that
1233 * has been finished.
1235 base = e.type&~VtTypeDepthMask;
1236 for(i=0; i<=depth; i++){
1239 /* leaf: data or dir block */
1240 if(offset%e.dsize == 0)
1244 * interior node: pointer blocks.
1245 * specifically, b = bi[i] is a block whose index[i-1]'th entry
1246 * points at bi[i-1].
1251 * the index entries up to but not including index[i-1] point at
1252 * finished blocks, so flush them for sure.
1254 for(j=0; j<index[i-1]; j++)
1255 if(flushblock(r->c, nil, b->data+j*VtScoreSize, ppb, epb, base+i-1) < 0)
1259 * if index[i-1] is the last entry in the block and is global
1260 * (i.e. the kid is flushed), then we can flush this block.
1262 if(j==ppb-1 && vtglobaltolocal(b->data+j*VtScoreSize)==NilBlock)
1269 score = bi[i+1]->data+index[i]*VtScoreSize;
1270 if(flushblock(r->c, bi[i], score, ppb, epb, base+i) < 0)
1276 /* top: entry. do this always so that the score is up-to-date */
1277 vtentrypack(&e, bi[depth+1]->data, index[depth]);
1278 for(i=0; i<nelem(bi); i++)