4 * Caches raw disk blocks. Getdblock() gets a block; putdblock() puts it back.
5 * Getdblock has a mode parameter that determines i/o and access to a block:
6 * if mode is OREAD or ORDWR, it is read from disk if not already in memory.
7 * If mode is ORDWR or OWRITE, it is locked for exclusive use before being returned.
8 * It is *not* marked dirty -- once changes have been made, they should be noted
9 * by using dirtydblock() before putdblock().
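 *
 * A typical read-modify-write of a cached block might look like this
 * (an illustrative sketch only; part, addr, buf and n are hypothetical):
 *
 *	b = getdblock(part, addr, ORDWR);
 *	if(b == nil)
 *		return -1;
 *	memmove(b->data, buf, n);
 *	dirtydblock(b, DirtyArena);
 *	putdblock(b);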
11 * There is a global cache lock as well as a lock on each block.
12 * Within a thread, the cache lock can be acquired while holding a block lock,
13 * but not vice versa; and a block cannot be locked if you already hold the lock on another block.
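 *
 * As an illustrative sketch of the allowed ordering only (not code taken
 * from this file):
 *
 *	b = getdblock(part, addr, ORDWR);	block lock held on return
 *	qlock(&dcache.lock);			cache lock taken while holding it: ok
 *	... inspect shared cache state ...
 *	qunlock(&dcache.lock);
 *	putdblock(b);				releases the block lock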
16 * The flush proc writes out dirty blocks in batches, one batch per dirty tag.
17 * For example, the DirtyArena blocks are all written to disk before any of the
18 * DirtyArenaCib blocks.
20 * This code used to be in charge of flushing the dirty index blocks out to
21 * disk, but updating the index turned out to benefit from extra care.
22 * Now cached index blocks are never marked dirty. The index.c code takes
23 * care of updating them behind our back, and uses _getdblock to update any
24 * cached copies of the blocks as it changes them on disk.
31 typedef struct DCache DCache;
36 HashSize = 1<<HashLog,
37 HashMask = HashSize - 1,
43 RWLock dirtylock; /* must be held to inspect or set b->dirty */
46 DBlock *free; /* list of available lumps */
47 u32int now; /* ticks for usage timestamps */
48 int size; /* max. size of any block; allocated to each block */
49 DBlock **heads; /* hash table for finding address */
50 int nheap; /* number of available victims */
51 DBlock **heap; /* heap for locating victims */
52 int nblocks; /* number of blocks allocated */
53 DBlock *blocks; /* array of block descriptors */
54 DBlock **write; /* array of block pointers to be written */
55 u8int *mem; /* memory for all block descriptors */
56 int ndirty; /* number of dirty blocks */
57 int maxdirty; /* max. number of dirty blocks */
69 static int downheap(int i, DBlock *b);
70 static int upheap(int i, DBlock *b);
71 static DBlock *bumpdblock(void);
72 static void delheap(DBlock *db);
73 static void fixheap(int i, DBlock *b);
74 static void flushproc(void*);
75 static void writeproc(void*);
78 initdcache(u32int mem)
81 u32int nblocks, blocksize;
86 sysfatal("no max. block size given for disk cache");
87 if(mem < maxblocksize * 2)
88 sysfatal("need at least 2 max-size blocks (%d bytes) for the disk cache", maxblocksize * 2);
90 blocksize = maxblocksize;
91 nblocks = mem / blocksize;
92 dcache.full.l = &dcache.lock;
93 dcache.nblocks = nblocks;
94 dcache.maxdirty = (nblocks * 2) / 3;
95 trace(TraceProc, "initialize disk cache with %d blocks of %d bytes, maximum %d dirty blocks\n",
96 nblocks, blocksize, dcache.maxdirty);
97 dcache.size = blocksize;
98 dcache.heads = vtbrk(sizeof(DBlock*) * HashSize);
99 dcache.heap = vtbrk(sizeof(DBlock*) * nblocks);
100 dcache.blocks = vtbrk(sizeof(DBlock) * nblocks);
101 dcache.write = vtbrk(sizeof(DBlock*) * nblocks);
102 dcache.mem = vtbrk((nblocks+1+128) * blocksize);
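/* round up to the next blocksize boundary so that every b->data below is block-aligned */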
105 p = (u8int*)(((uintptr)dcache.mem+blocksize-1)&~(uintptr)(blocksize-1));
106 for(i = 0; i < nblocks; i++){
107 b = &dcache.blocks[i];
108 b->data = &p[i * blocksize];
110 b->writedonechan = chancreate(sizeof(void*), 1);
117 setstat(StatDcacheSize, nblocks);
118 initround(&dcache.round, "dcache", 120*1000);
120 vtproc(flushproc, nil);
121 vtproc(delaykickroundproc, &dcache.round);
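/*
 * Hash a 64-bit partition address to a bucket: fold the high and low words
 * together, multiply by an odd constant to mix the bits, and keep the top
 * HashLog bits as the index.
 */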
129 #define hashit(c) ((((c) * 0x6b43a9b5) >> (32 - HashLog)) & HashMask)
130 h = (addr >> 32) ^ addr;
135 getdblock(Part *part, u64int addr, int mode)
139 b = _getdblock(part, addr, mode, 1);
140 if(mode == OREAD || mode == ORDWR)
141 addstat(StatDcacheRead, 1);
142 if(mode == OWRITE || mode == ORDWR)
143 addstat(StatDcacheWrite, 1);
148 _getdblock(Part *part, u64int addr, int mode, int load)
154 trace(TraceBlock, "getdblock enter %s 0x%llux", part->name, addr);
155 size = part->blocksize;
156 if(size > dcache.size){
157 seterr(EAdmin, "block size %d too big for cache with size %d", size, dcache.size);
159 addstat(StatDcacheLookup, 1);
165 * look for the block in the cache
169 for(b = dcache.heads[h]; b != nil; b = b->next){
170 if(b->part == part && b->addr == addr){
172 addstat2(StatDcacheHit, 1, StatDcacheLookup, 1);
178 * missed: locate the block with the oldest second-to-last use.
179 * remove it from the heap, and fix up the heap.
182 qunlock(&dcache.lock);
187 * Only start the timer here, on a cache miss -- calling msec() on plain
188 * cache hits makes them system-call bound.
191 addstat2(StatDcacheLookup, 1, StatDcacheMiss, 1);
195 trace(TraceBlock, "all disk cache blocks in use");
196 addstat(StatDcacheStall, 1);
197 rsleep(&dcache.full);
198 addstat(StatDcacheStall, -1);
205 * the new block has no last use, so assume it happens sometime in the middle
206 ZZZ this is not reasonable
208 b->used = (b->used2 + dcache.now) / 2;
211 * rechain the block on the correct hash chain
213 b->next = dcache.heads[h];
226 b->used = dcache.now++;
227 if(b->heap != TWID32)
230 if((mode == ORDWR || mode == OWRITE) && part->writechan == nil){
231 trace(TraceBlock, "getdblock allocwriteproc %s", part->name);
232 part->writechan = chancreate(sizeof(DBlock*), dcache.nblocks);
233 vtproc(writeproc, part);
235 qunlock(&dcache.lock);
237 trace(TraceBlock, "getdblock lock");
238 addstat(StatDblockStall, 1);
243 addstat(StatDblockStall, -1);
244 trace(TraceBlock, "getdblock locked");
248 addstat(StatDblockStall, 1);
251 addstat(StatDblockStall, -1);
255 memset(&b->data[b->size], 0, size - b->size);
257 trace(TraceBlock, "getdblock readpart %s 0x%llux", part->name, addr);
259 if(readpart(part, addr + b->size, &b->data[b->size], size - b->size) < 0){
260 b->mode = ORDWR; /* so putdblock wunlocks */
264 trace(TraceBlock, "getdblock readpartdone");
265 addstat(StatApartRead, 1);
266 addstat(StatApartReadBytes, size-b->size);
271 addstat(StatDblockStall, 1);
274 addstat(StatDblockStall, -1);
279 trace(TraceBlock, "getdblock exit");
281 addstat(StatDcacheLookupTime, msec() - ms);
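/*
 * putdblock drops a reference; when the last reference goes away and the
 * block is clean, the block is returned to the eviction heap and anyone
 * waiting for a free block is woken.
 */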
291 trace(TraceBlock, "putdblock %s 0x%llux", b->part->name, b->addr);
299 if(--b->ref == 0 && !b->dirty){
300 if(b->heap == TWID32)
301 upheap(dcache.nheap++, b);
302 rwakeupall(&dcache.full);
304 qunlock(&dcache.lock);
308 dirtydblock(DBlock *b, int dirty)
312 trace(TraceBlock, "dirtydblock enter %s 0x%llux %d from 0x%lux",
313 b->part->name, b->addr, dirty, getcallerpc(&b));
315 assert(b->mode==ORDWR || b->mode==OWRITE);
319 assert(b->dirty == dirty);
326 setstat(StatDcacheDirty, dcache.ndirty);
327 if(dcache.ndirty >= dcache.maxdirty)
328 kickround(&dcache.round, 0);
330 delaykickround(&dcache.round);
332 qunlock(&dcache.lock);
345 if(dcache.heads[h] != b)
346 sysfatal("bad hash chains in disk cache");
347 dcache.heads[h] = b->next;
349 b->prev->next = b->next;
351 b->next->prev = b->prev;
355 * remove some block from use and update the free list and counters
362 trace(TraceBlock, "bumpdblock enter");
365 dcache.free = b->next;
369 if(dcache.ndirty >= dcache.maxdirty)
373 * remove blocks until we find one that is unused;
374 * referenced blocks are left in the heap even though
375 * they can't be scavenged; this is simply a speed optimization.
378 if(dcache.nheap == 0){
380 trace(TraceBlock, "bumpdblock gotnothing");
385 if(!b->ref && !b->dirty)
389 trace(TraceBlock, "bumpdblock bumping %s 0x%llux", b->part->name, b->addr);
401 while(dcache.nheap > 0){
404 if(!b->ref && !b->dirty){
406 b->next = dcache.free;
410 qunlock(&dcache.lock);
414 * delete an arbitrary block from the heap
419 if(db->heap == TWID32)
421 fixheap(db->heap, dcache.heap[--dcache.nheap]);
426 * push an element up or down to its correct new location
429 fixheap(int i, DBlock *b)
431 if(upheap(i, b) == i)
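/*
 * The heap is ordered by age (used2 - now) rather than by raw timestamps,
 * so the ordering survives wraparound of the u32int tick counter.
 */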
436 upheap(int i, DBlock *b)
443 for(; i != 0; i = p){
446 if(b->used2 - now >= bb->used2 - now)
458 downheap(int i, DBlock *b)
467 if(k >= dcache.nheap)
469 if(k + 1 < dcache.nheap && dcache.heap[k]->used2 - now > dcache.heap[k + 1]->used2 - now)
472 if(b->used2 - now <= bb->used2 - now)
484 findblock(DBlock *bb)
490 h = pbhash(bb->addr);
491 for(b = dcache.heads[h]; b != nil; b = b->next){
493 sysfatal("bad prev link");
498 sysfatal("block missing from hash table");
506 int i, k, refed, nfree;
511 for(i = 0; i < dcache.nheap; i++){
512 if(dcache.heap[i]->heap != i)
513 sysfatal("dc: mis-heaped at %d: %d", i, dcache.heap[i]->heap);
514 if(i > 0 && dcache.heap[(i - 1) >> 1]->used2 - now > dcache.heap[i]->used2 - now)
515 sysfatal("dc: bad heap ordering");
517 if(k < dcache.nheap && dcache.heap[i]->used2 - now > dcache.heap[k]->used2 - now)
518 sysfatal("dc: bad heap ordering");
520 if(k < dcache.nheap && dcache.heap[i]->used2 - now > dcache.heap[k]->used2 - now)
521 sysfatal("dc: bad heap ordering");
525 for(i = 0; i < dcache.nblocks; i++){
526 b = &dcache.blocks[i];
527 if(b->data != &dcache.mem[i * size])
528 sysfatal("dc: mis-blocked at %d", i);
529 if(b->ref && b->heap == TWID32)
534 && dcache.heap[b->heap] != b)
535 sysfatal("dc: spurious heap value");
539 for(b = dcache.free; b != nil; b = b->next){
540 if(b->addr != 0 || b->heap != TWID32)
541 sysfatal("dc: bad free list");
545 if(dcache.nheap + nfree + refed != dcache.nblocks)
546 sysfatal("dc: missing blocks: %d %d %d", dcache.nheap, refed, dcache.nblocks);
547 qunlock(&dcache.lock);
553 trace(TraceProc, "flushdcache enter");
554 kickround(&dcache.round, 1);
555 trace(TraceProc, "flushdcache exit");
561 kickround(&dcache.round, 0);
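/*
 * parallelwrites hands each leading block carrying the given dirty tag to
 * its partition's write proc, waits for the queued writes to complete,
 * flushes the partitions that were written, and returns the number of
 * blocks handed off.
 */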
565 parallelwrites(DBlock **b, DBlock **eb, int dirty)
570 for(p=b; p<eb && (*p)->dirty == dirty; p++){
571 assert(b<=p && p<eb);
572 sendp((*p)->part->writechan, *p);
576 assert(b<=p && p<eb);
577 recvp((*p)->writedonechan);
581 * Flush the partitions that have been written to.
585 if(part != (*p)->part){
587 flushpart(part); /* what if it fails? */
595 * Sort first by dirty flag, then by partition, then by address in partition.
598 writeblockcmp(const void *va, const void *vb)
605 if(a->dirty != b->dirty)
606 return a->dirty - b->dirty;
607 if(a->part != b->part){
608 if(a->part < b->part)
610 if(a->part > b->part)
613 if(a->addr < b->addr)
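/*
 * flushproc runs once per flush round: it gathers the dirty blocks, sorts
 * them with writeblockcmp, writes them out one dirty tag at a time via
 * parallelwrites, then clears the dirty accounting and returns clean,
 * unreferenced blocks to the eviction heap.
 */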
626 threadsetname("flushproc");
628 waitforkick(&dcache.round);
630 trace(TraceWork, "start");
632 trace(TraceProc, "build t=%lud", (ulong)(nsec()/1000)-t0);
634 write = dcache.write;
636 for(i=0; i<dcache.nblocks; i++){
637 b = &dcache.blocks[i];
642 qsort(write, n, sizeof(write[0]), writeblockcmp);
644 /* Write each stage of blocks out. */
645 trace(TraceProc, "writeblocks t=%lud", (ulong)(nsec()/1000)-t0);
647 for(j=1; j<DirtyMax; j++){
648 trace(TraceProc, "writeblocks.%d t=%lud",
649 j, (ulong)(nsec()/1000)-t0);
650 i += parallelwrites(write+i, write+n, j);
653 fprint(2, "in flushproc i=%d n=%d\n", i, n);
655 fprint(2, "\tblock %d: dirty=%d\n",
661 * b->dirty is protected by b->lock while ndirty is protected
662 * by dcache.lock, so the --ndirty below is the delayed one
663 * from clearing b->dirty in the write proc. It may happen
664 * that some other proc has come along and redirtied b since
665 * the write. That's okay, it just means that ndirty may be
666 * one too high until we catch up and do the decrement.
668 trace(TraceProc, "undirty.%d t=%lud", j, (ulong)(nsec()/1000)-t0);
673 if(b->ref == 0 && b->heap == TWID32){
674 upheap(dcache.nheap++, b);
675 rwakeupall(&dcache.full);
678 setstat(StatDcacheDirty, dcache.ndirty);
679 qunlock(&dcache.lock);
680 addstat(StatDcacheFlush, 1);
681 trace(TraceWork, "finish");
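/*
 * Each partition gets a writeproc of its own: it loops receiving dirty
 * blocks on the partition's writechan, writes each to disk with writepart,
 * and then signals completion on the block's writedonechan.
 */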
693 threadsetname("writeproc:%s", p->name);
695 b = recvp(p->writechan);
696 trace(TraceWork, "start");
697 assert(b->part == p);
698 trace(TraceProc, "wlock %s 0x%llux", p->name, b->addr);
700 trace(TraceProc, "writepart %s 0x%llux", p->name, b->addr);
702 if(writepart(p, b->addr, b->data, b->size) < 0)
703 fprint(2, "%s: writeproc: part %s addr 0x%llux: write error: %r\n",
704 argv0, p->name, b->addr);
705 addstat(StatApartWrite, 1);
706 addstat(StatApartWriteBytes, b->size);
709 trace(TraceProc, "finish %s 0x%llux", p->name, b->addr);
710 trace(TraceWork, "finish");
711 sendp(b->writedonechan, b);