4 * Caches raw disk blocks. Getdblock() gets a block, putdblock puts it back.
5 * Getdblock has a mode parameter that determines i/o and access to a block:
6 * if mode is OREAD or ORDWR, it is read from disk if not already in memory.
7 * If mode is ORDWR or OWRITE, it is locked for exclusive use before being returned.
8 * It is *not* marked dirty -- once changes have been made, they should be noted
9 * by using dirtydblock() before putdblock().
11 * There is a global cache lock as well as a lock on each block.
12 * Within a thread, the cache lock can be acquired while holding a block lock,
13 * but not vice versa; and a block cannot be locked if you already hold the lock
16 * The flush proc writes out dirty blocks in batches, one batch per dirty tag.
17 * For example, the DirtyArena blocks are all written to disk before any of the
18 * DirtyArenaCib blocks.
20 * This code used to be in charge of flushing the dirty index blocks out to
21 * disk, but updating the index turned out to benefit from extra care.
22 * Now cached index blocks are never marked dirty. The index.c code takes
23 * care of updating them behind our back, and uses _getdblock to update any
24 * cached copies of the blocks as it changes them on disk.
/*
 * DCache declarations: typedef, hash-table sizing constants, and the
 * fields of the global cache descriptor.
 * NOTE(review): excerpt — the enum/struct openers and closers are
 * missing from this view.
 */
31 typedef struct DCache DCache;
/* power-of-two bucket count so (hash & HashMask) selects a bucket */
36 HashSize = 1<<HashLog,
37 HashMask = HashSize - 1,
/* cache-wide state; see the file header for the lock-ordering rules */
43 RWLock dirtylock; /* must be held to inspect or set b->dirty */
46 DBlock *free; /* list of available lumps */
47 u32int now; /* ticks for usage timestamps */
48 int size; /* max. size of any block; allocated to each block */
49 DBlock **heads; /* hash table for finding address */
50 int nheap; /* number of available victims */
51 DBlock **heap; /* heap for locating victims */
52 int nblocks; /* number of blocks allocated */
53 DBlock *blocks; /* array of block descriptors */
54 DBlock **write; /* array of block pointers to be written */
55 u8int *mem; /* memory for all block descriptors */
56 int ndirty; /* number of dirty blocks */
57 int maxdirty; /* max. number of dirty blocks */
/*
 * Internal helpers: heap maintenance (downheap/upheap/fixheap/delheap),
 * victim selection (bumpdblock), and the flush/write procs.
 */
69 static int downheap(int i, DBlock *b);
70 static int upheap(int i, DBlock *b);
71 static DBlock *bumpdblock(void);
72 static void delheap(DBlock *db);
73 static void fixheap(int i, DBlock *b);
74 static void flushproc(void*);
75 static void writeproc(void*);
/*
 * initdcache: carve `mem` bytes into fixed-size cache blocks of
 * maxblocksize bytes each, allocate the hash table, victim heap and
 * write list, create per-block completion channels, and start the
 * flush proc.  NOTE(review): excerpt — locals, the second sysfatal's
 * condition, and the loop tail are missing from this view.
 */
78 initdcache(u32int mem)
81 u32int nblocks, blocksize;
/* need room for at least two blocks or the cache cannot operate */
85 if(mem < maxblocksize * 2)
86 sysfatal("need at least %d bytes for the disk cache", maxblocksize * 2);
88 sysfatal("no max. block size given for disk cache");
89 blocksize = maxblocksize;
90 nblocks = mem / blocksize;
/* the "cache full" rendezvous shares the global cache lock */
91 dcache.full.l = &dcache.lock;
92 dcache.nblocks = nblocks;
/* allow at most 2/3 of the blocks to be dirty before forcing a flush */
93 dcache.maxdirty = (nblocks * 2) / 3;
94 trace(TraceProc, "initialize disk cache with %d blocks of %d bytes, maximum %d dirty blocks\n",
95 nblocks, blocksize, dcache.maxdirty);
96 dcache.size = blocksize;
97 dcache.heads = MKNZ(DBlock*, HashSize);
98 dcache.heap = MKNZ(DBlock*, nblocks);
99 dcache.blocks = MKNZ(DBlock, nblocks);
100 dcache.write = MKNZ(DBlock*, nblocks);
/* over-allocate so the data area can be aligned to a blocksize boundary */
101 dcache.mem = MKNZ(u8int, (nblocks+1+128) * blocksize);
/* round the data pointer up to a blocksize boundary */
104 p = (u8int*)(((uintptr)dcache.mem+blocksize-1)&~(uintptr)(blocksize-1));
105 for(i = 0; i < nblocks; i++){
106 b = &dcache.blocks[i];
107 b->data = &p[i * blocksize];
/* writeproc signals completion of each block write on this channel */
109 b->writedonechan = chancreate(sizeof(void*), 1);
116 setstat(StatDcacheSize, nblocks);
/* flush round fires at most every 120 seconds unless kicked */
117 initround(&dcache.round, "dcache", 120*1000);
119 vtproc(flushproc, nil);
120 vtproc(delaykickroundproc, &dcache.round);
/*
 * Multiplicative hash of a block address into a bucket index.
 * The high 32 bits of the 64-bit address are folded into the low
 * before mixing, so whole-partition offsets hash well.
 */
128 #define hashit(c) ((((c) * 0x6b43a9b5) >> (32 - HashLog)) & HashMask)
129 h = (addr >> 32) ^ addr;
/*
 * getdblock: public entry point — _getdblock with load=1 (read the
 * block from disk on a miss).  Counts a cache read for OREAD/ORDWR
 * and a cache write for OWRITE/ORDWR (ORDWR counts as both).
 */
134 getdblock(Part *part, u64int addr, int mode)
138 b = _getdblock(part, addr, mode, 1);
139 if(mode == OREAD || mode == ORDWR)
140 addstat(StatDcacheRead, 1);
141 if(mode == OWRITE || mode == ORDWR)
142 addstat(StatDcacheWrite, 1);
/*
 * _getdblock: core cache lookup.  Find (part, addr) on its hash chain;
 * on a miss, evict a victim via bumpdblock (sleeping on dcache.full
 * when no victim exists), rechain it, and — when `load` is set — read
 * its contents from disk.  The block is returned locked according to
 * `mode` (see the file header).
 * NOTE(review): excerpt — many interior lines (locals, returns, lock
 * calls, brace closers) are missing from this view.
 */
147 _getdblock(Part *part, u64int addr, int mode, int load)
153 trace(TraceBlock, "getdblock enter %s 0x%llux", part->name, addr);
154 size = part->blocksize;
/* every cache block is dcache.size bytes; bigger partitions can't be cached */
155 if(size > dcache.size){
156 seterr(EAdmin, "block size %d too big for cache with size %d", size, dcache.size);
158 addstat(StatDcacheLookup, 1);
164 * look for the block in the cache
168 for(b = dcache.heads[h]; b != nil; b = b->next){
169 if(b->part == part && b->addr == addr){
171 addstat2(StatDcacheHit, 1, StatDcacheLookup, 1);
177 * missed: locate the block with the oldest second to last use.
178 * remove it from the heap, and fix up the heap.
181 qunlock(&dcache.lock);
186 * Only start timer here, on cache miss - calling msec() on plain cache hits
187 * makes cache hits system-call bound.
190 addstat2(StatDcacheLookup, 1, StatDcacheMiss, 1);
/* no victim available: stall until putdblock/flushproc frees a block */
194 trace(TraceBlock, "all disk cache blocks in use");
195 addstat(StatDcacheStall, 1);
196 rsleep(&dcache.full);
197 addstat(StatDcacheStall, -1);
204 * the new block has no last use, so assume it happens sometime in the middle
205 ZZZ this is not reasonable
207 b->used = (b->used2 + dcache.now) / 2;
210 * rechain the block on the correct hash chain
212 b->next = dcache.heads[h];
/* bump the usage clock; TWID32 marks a block not currently in the heap */
225 b->used = dcache.now++;
226 if(b->heap != TWID32)
/* lazily create the per-partition write channel and writer proc on
 * the first write-mode access to this partition */
229 if((mode == ORDWR || mode == OWRITE) && part->writechan == nil){
230 trace(TraceBlock, "getdblock allocwriteproc %s", part->name);
231 part->writechan = chancreate(sizeof(DBlock*), dcache.nblocks);
232 vtproc(writeproc, part);
234 qunlock(&dcache.lock);
/* acquire the per-block lock; stall stats bracket the blocking call */
236 trace(TraceBlock, "getdblock lock");
237 addstat(StatDblockStall, 1);
242 addstat(StatDblockStall, -1);
243 trace(TraceBlock, "getdblock locked");
247 addstat(StatDblockStall, 1);
250 addstat(StatDblockStall, -1);
/* load path: zero the tail, then read the missing bytes from disk */
254 memset(&b->data[b->size], 0, size - b->size);
256 trace(TraceBlock, "getdblock readpart %s 0x%llux", part->name, addr);
258 if(readpart(part, addr + b->size, &b->data[b->size], size - b->size) < 0){
259 b->mode = ORDWR; /* so putdblock wunlocks */
263 trace(TraceBlock, "getdblock readpartdone");
264 addstat(StatApartRead, 1);
265 addstat(StatApartReadBytes, size-b->size);
269 addstat(StatDblockStall, 1);
273 addstat(StatDblockStall, -1);
278 trace(TraceBlock, "getdblock exit");
280 addstat(StatDcacheLookupTime, msec() - ms);
/*
 * putdblock fragment: drop a reference to a cached block.  When the
 * last reference goes away and the block is clean, return it to the
 * victim heap and wake any procs stalled in _getdblock waiting for a
 * free block.  NOTE(review): signature and unlock of the block itself
 * are missing from this excerpt.
 */
290 trace(TraceBlock, "putdblock %s 0x%llux", b->part->name, b->addr);
298 if(--b->ref == 0 && !b->dirty){
299 if(b->heap == TWID32)
300 upheap(dcache.nheap++, b);
301 rwakeupall(&dcache.full);
303 qunlock(&dcache.lock);
/*
 * dirtydblock: mark a block dirty with the given dirty tag.  The block
 * must be held in a write mode (asserted).  When the dirty count
 * reaches maxdirty, kick the flush round immediately; otherwise just
 * schedule a delayed flush.
 * NOTE(review): excerpt — the dirtylock/ndirty bookkeeping lines
 * between the asserts and setstat are missing from this view.
 */
307 dirtydblock(DBlock *b, int dirty)
311 trace(TraceBlock, "dirtydblock enter %s 0x%llux %d from 0x%lux",
312 b->part->name, b->addr, dirty, getcallerpc(&b));
314 assert(b->mode==ORDWR || b->mode==OWRITE);
/* a block already dirty must keep the same dirty tag */
318 assert(b->dirty == dirty);
325 setstat(StatDcacheDirty, dcache.ndirty);
326 if(dcache.ndirty >= dcache.maxdirty)
327 kickround(&dcache.round, 0);
329 delaykickround(&dcache.round);
331 qunlock(&dcache.lock);
/*
 * Fragment removing a block from its doubly-linked hash chain:
 * head-of-chain pops via dcache.heads[h], interior nodes patch their
 * neighbours' prev/next pointers.
 * NOTE(review): the enclosing function's signature is not visible in
 * this excerpt — presumably the hash-unchain helper; confirm in the
 * full source.
 */
344 if(dcache.heads[h] != b)
345 sysfatal("bad hash chains in disk cache");
346 dcache.heads[h] = b->next;
348 b->prev->next = b->next;
350 b->next->prev = b->prev;
/*
 * bumpdblock fragment: choose a victim block for reuse.  Prefer the
 * free list; refuse (return nil, triggering a flush/stall upstream)
 * when too many blocks are dirty; otherwise pop heap entries until an
 * unreferenced, clean block is found.
 */
354 * remove some block from use and update the free list and counters
361 trace(TraceBlock, "bumpdblock enter");
/* fast path: reuse a block from the free list */
364 dcache.free = b->next;
368 if(dcache.ndirty >= dcache.maxdirty)
372 * remove blocks until we find one that is unused
373 * referenced blocks are left in the heap even though
374 * they can't be scavenged; this is simply a speed optimization
377 if(dcache.nheap == 0){
379 trace(TraceBlock, "bumpdblock gotnothing");
384 if(!b->ref && !b->dirty)
388 trace(TraceBlock, "bumpdblock bumping %s 0x%llux", b->part->name, b->addr);
/*
 * Fragment draining the victim heap: clean, unreferenced blocks are
 * moved onto the free list.  NOTE(review): the enclosing function's
 * signature is not visible in this excerpt — confirm against the full
 * source which routine this belongs to.
 */
400 while(dcache.nheap > 0){
403 if(!b->ref && !b->dirty){
405 b->next = dcache.free;
409 qunlock(&dcache.lock);
/*
 * delheap fragment: remove a specific block from the victim heap by
 * replacing it with the last heap element and re-heapifying; blocks
 * not in the heap (heap == TWID32) are ignored.
 */
413 * delete an arbitrary block from the heap
418 if(db->heap == TWID32)
420 fixheap(db->heap, dcache.heap[--dcache.nheap]);
/*
 * fixheap fragment: restore the heap invariant at slot i by trying
 * upheap first; if the element did not move up, downheap follows.
 */
425 * push an element up or down to its correct new location
428 fixheap(int i, DBlock *b)
430 if(upheap(i, b) == i)
/*
 * upheap fragment: bubble b toward the root while it is older than its
 * parent.  Ages compare as (used2 - now) — unsigned subtraction,
 * presumably so the comparison survives tick-counter wraparound;
 * confirm against the full source.
 */
435 upheap(int i, DBlock *b)
442 for(; i != 0; i = p){
445 if(b->used2 - now >= bb->used2 - now)
/*
 * downheap fragment: sink b toward the leaves, always swapping with
 * the older of the two children, until both children are newer.
 * Same wraparound-safe (used2 - now) comparison as upheap.
 */
457 downheap(int i, DBlock *b)
466 if(k >= dcache.nheap)
468 if(k + 1 < dcache.nheap && dcache.heap[k]->used2 - now > dcache.heap[k + 1]->used2 - now)
471 if(b->used2 - now <= bb->used2 - now)
/*
 * findblock fragment: debugging check that bb is present on its hash
 * chain with consistent prev links; sysfatal on any inconsistency.
 */
483 findblock(DBlock *bb)
489 h = pbhash(bb->addr);
490 for(b = dcache.heads[h]; b != nil; b = b->next){
492 sysfatal("bad prev link");
497 sysfatal("block missing from hash table");
/*
 * checkdcache fragment: consistency audit of the whole cache, run
 * under the global lock.  Verifies heap back-pointers and ordering
 * (parent no newer than either child), block/data correspondence,
 * the free list, and that heap + free + referenced blocks account
 * for every allocated block.
 */
505 int i, k, refed, nfree;
510 for(i = 0; i < dcache.nheap; i++){
/* each heap entry must record its own slot index */
511 if(dcache.heap[i]->heap != i)
512 sysfatal("dc: mis-heaped at %d: %d", i, dcache.heap[i]->heap);
/* parent (i-1)/2 must not be newer than child i */
513 if(i > 0 && dcache.heap[(i - 1) >> 1]->used2 - now > dcache.heap[i]->used2 - now)
514 sysfatal("dc: bad heap ordering");
/* symmetric checks against both children */
516 if(k < dcache.nheap && dcache.heap[i]->used2 - now > dcache.heap[k]->used2 - now)
517 sysfatal("dc: bad heap ordering");
519 if(k < dcache.nheap && dcache.heap[i]->used2 - now > dcache.heap[k]->used2 - now)
520 sysfatal("dc: bad heap ordering");
524 for(i = 0; i < dcache.nblocks; i++){
525 b = &dcache.blocks[i];
/* block i's data must sit at slot i of the contiguous data area */
526 if(b->data != &dcache.mem[i * size])
527 sysfatal("dc: mis-blocked at %d", i);
528 if(b->ref && b->heap == TWID32)
533 && dcache.heap[b->heap] != b)
534 sysfatal("dc: spurious heap value");
/* free blocks must be truly unused: zero addr, not in the heap */
538 for(b = dcache.free; b != nil; b = b->next){
539 if(b->addr != 0 || b->heap != TWID32)
540 sysfatal("dc: bad free list");
/* every block is either heaped, free, or referenced */
544 if(dcache.nheap + nfree + refed != dcache.nblocks)
545 sysfatal("dc: missing blocks: %d %d %d", dcache.nheap, refed, dcache.nblocks);
546 qunlock(&dcache.lock);
/*
 * flushdcache fragment: kick the flush round and wait for it to
 * complete (second argument 1 = wait).  The trailing line belongs to
 * the non-waiting variant (presumably kickdcache; signature not
 * visible in this excerpt).
 */
552 trace(TraceProc, "flushdcache enter");
553 kickround(&dcache.round, 1);
554 trace(TraceProc, "flushdcache exit");
/* fire-and-forget kick: do not wait for the round to finish */
560 kickround(&dcache.round, 0);
/*
 * parallelwrites fragment: issue the run of blocks [b, eb) sharing the
 * given dirty tag to their partitions' writer procs, then collect a
 * completion for each from its writedonechan.  Afterwards, flush every
 * distinct partition touched.  Returns (in the full source) how many
 * blocks were written, so the caller can advance to the next tag.
 */
564 parallelwrites(DBlock **b, DBlock **eb, int dirty)
/* the input is sorted by dirty tag, so the run for this tag is a prefix */
569 for(p=b; p<eb && (*p)->dirty == dirty; p++){
570 assert(b<=p && p<eb);
571 sendp((*p)->part->writechan, *p);
/* second pass: wait for each queued write to finish */
575 assert(b<=p && p<eb);
576 recvp((*p)->writedonechan);
580 * Flush the partitions that have been written to.
/* blocks are also sorted by partition, so each part appears once */
584 if(part != (*p)->part){
586 flushpart(part); /* what if it fails? */
/*
 * writeblockcmp fragment: qsort comparator for the flush list.
 * Orders by dirty tag, then by partition (pointer comparison), then by
 * address within the partition, giving parallelwrites contiguous runs.
 */
594 * Sort first by dirty flag, then by partition, then by address in partition.
597 writeblockcmp(const void *va, const void *vb)
604 if(a->dirty != b->dirty)
605 return a->dirty - b->dirty;
606 if(a->part != b->part){
607 if(a->part < b->part)
609 if(a->part > b->part)
612 if(a->addr < b->addr)
/*
 * flushproc fragment: the background flusher.  Each round: snapshot
 * the dirty blocks into dcache.write, sort them with writeblockcmp,
 * write them out one dirty-tag stage at a time via parallelwrites,
 * then (under the cache lock) account for the now-clean blocks,
 * return unreferenced ones to the victim heap, and wake stalled
 * getdblock callers.
 */
625 threadsetname("flushproc");
627 waitforkick(&dcache.round);
629 trace(TraceWork, "start");
631 trace(TraceProc, "build t=%lud", (ulong)(nsec()/1000)-t0);
/* gather the current dirty set into the preallocated write array */
633 write = dcache.write;
635 for(i=0; i<dcache.nblocks; i++){
636 b = &dcache.blocks[i];
641 qsort(write, n, sizeof(write[0]), writeblockcmp);
643 /* Write each stage of blocks out. */
644 trace(TraceProc, "writeblocks t=%lud", (ulong)(nsec()/1000)-t0);
/* one stage per dirty tag, in tag order (see file header) */
646 for(j=1; j<DirtyMax; j++){
647 trace(TraceProc, "writeblocks.%d t=%lud",
648 j, (ulong)(nsec()/1000)-t0);
649 i += parallelwrites(write+i, write+n, j);
/* diagnostic dump: some blocks were left unwritten */
652 fprint(2, "in flushproc i=%d n=%d\n", i, n);
654 fprint(2, "\tblock %d: dirty=%d\n",
660 * b->dirty is protected by b->lock while ndirty is protected
661 * by dcache.lock, so the --ndirty below is the delayed one
662 * from clearing b->dirty in the write proc. It may happen
663 * that some other proc has come along and redirtied b since
664 * the write. That's okay, it just means that ndirty may be
665 * one too high until we catch up and do the decrement.
667 trace(TraceProc, "undirty.%d t=%lud", j, (ulong)(nsec()/1000)-t0);
/* clean and unreferenced: eligible for eviction again */
672 if(b->ref == 0 && b->heap == TWID32){
673 upheap(dcache.nheap++, b);
674 rwakeupall(&dcache.full);
677 setstat(StatDcacheDirty, dcache.ndirty);
678 qunlock(&dcache.lock);
679 addstat(StatDcacheFlush, 1);
680 trace(TraceWork, "finish");
/*
 * writeproc fragment: per-partition writer loop.  Receives blocks on
 * the partition's writechan, writes each to disk with writepart
 * (logging, but not propagating, write errors), updates stats, and
 * signals completion on the block's writedonechan.
 */
692 threadsetname("writeproc:%s", p->name);
694 b = recvp(p->writechan);
695 trace(TraceWork, "start");
696 assert(b->part == p);
697 trace(TraceProc, "wlock %s 0x%llux", p->name, b->addr);
699 trace(TraceProc, "writepart %s 0x%llux", p->name, b->addr);
/* write failures are reported to stderr only; the flush continues */
701 if(writepart(p, b->addr, b->data, b->size) < 0)
702 fprint(2, "%s: writeproc: part %s addr 0x%llux: write error: %r\n",
703 argv0, p->name, b->addr);
704 addstat(StatApartWrite, 1);
705 addstat(StatApartWriteBytes, b->size);
708 trace(TraceProc, "finish %s 0x%llux", p->name, b->addr);
709 trace(TraceWork, "finish");
/* tell parallelwrites this block is done */
710 sendp(b->writedonechan, b);