]> git.lizzy.rs Git - plan9front.git/blob - sys/src/cmd/venti/srv/dcache.c
a50ef0c5c2bbc4b71eae3778d0e25c71316f430f
[plan9front.git] / sys / src / cmd / venti / srv / dcache.c
1 /*
2  * Disk cache.
3  * 
4  * Caches raw disk blocks.  Getdblock() gets a block, putdblock puts it back.
5  * Getdblock has a mode parameter that determines i/o and access to a block:
6  * if mode is OREAD or ORDWR, it is read from disk if not already in memory.
7  * If mode is ORDWR or OWRITE, it is locked for exclusive use before being returned.
8  * It is *not* marked dirty -- once changes have been made, they should be noted
9  * by using dirtydblock() before putdblock().  
10  *
11  * There is a global cache lock as well as a lock on each block. 
12  * Within a thread, the cache lock can be acquired while holding a block lock,
13  * but not vice versa; and a block cannot be locked if you already hold the lock
14  * on another block.
15  * 
16  * The flush proc writes out dirty blocks in batches, one batch per dirty tag.
17  * For example, the DirtyArena blocks are all written to disk before any of the
18  * DirtyArenaCib blocks.
19  *
20  * This code used to be in charge of flushing the dirty index blocks out to 
21  * disk, but updating the index turned out to benefit from extra care.
22  * Now cached index blocks are never marked dirty.  The index.c code takes
23  * care of updating them behind our back, and uses _getdblock to update any
24  * cached copies of the blocks as it changes them on disk.
25  */
26
27 #include "stdinc.h"
28 #include "dat.h"
29 #include "fns.h"
30
31 typedef struct DCache   DCache;
32
33 enum
34 {
35         HashLog         = 9,
36         HashSize        = 1<<HashLog,
37         HashMask        = HashSize - 1,
38 };
39
40 struct DCache
41 {
42         QLock           lock;
43         RWLock          dirtylock;              /* must be held to inspect or set b->dirty */
44         Rendez          full;
45         Round           round;
46         DBlock          *free;                  /* list of available lumps */
47         u32int          now;                    /* ticks for usage timestamps */
48         int             size;                   /* max. size of any block; allocated to each block */
49         DBlock          **heads;                /* hash table for finding address */
50         int             nheap;                  /* number of available victims */
51         DBlock          **heap;                 /* heap for locating victims */
52         int             nblocks;                /* number of blocks allocated */
53         DBlock          *blocks;                /* array of block descriptors */
54         DBlock          **write;                /* array of block pointers to be written */
55         u8int           *mem;                   /* memory for all block descriptors */
56         int             ndirty;                 /* number of dirty blocks */
57         int             maxdirty;               /* max. number of dirty blocks */
58 };
59
60 typedef struct Ra Ra;
61 struct Ra
62 {
63         Part *part;
64         u64int addr;
65 };
66
67 static DCache   dcache;
68
69 static int      downheap(int i, DBlock *b);
70 static int      upheap(int i, DBlock *b);
71 static DBlock   *bumpdblock(void);
72 static void     delheap(DBlock *db);
73 static void     fixheap(int i, DBlock *b);
74 static void     flushproc(void*);
75 static void     writeproc(void*);
76
77 void
78 initdcache(u32int mem)
79 {
80         DBlock *b, *last;
81         u32int nblocks, blocksize;
82         int i;
83         u8int *p;
84
85         if(mem < maxblocksize * 2)
86                 sysfatal("need at least %d bytes for the disk cache", maxblocksize * 2);
87         if(maxblocksize == 0)
88                 sysfatal("no max. block size given for disk cache");
89         blocksize = maxblocksize;
90         nblocks = mem / blocksize;
91         dcache.full.l = &dcache.lock;
92         dcache.nblocks = nblocks;
93         dcache.maxdirty = (nblocks * 2) / 3;
94         trace(TraceProc, "initialize disk cache with %d blocks of %d bytes, maximum %d dirty blocks\n",
95                         nblocks, blocksize, dcache.maxdirty);
96         dcache.size = blocksize;
97         dcache.heads = MKNZ(DBlock*, HashSize);
98         dcache.heap = MKNZ(DBlock*, nblocks);
99         dcache.blocks = MKNZ(DBlock, nblocks);
100         dcache.write = MKNZ(DBlock*, nblocks);
101         dcache.mem = MKNZ(u8int, (nblocks+1+128) * blocksize);
102
103         last = nil;
104         p = (u8int*)(((uintptr)dcache.mem+blocksize-1)&~(uintptr)(blocksize-1));
105         for(i = 0; i < nblocks; i++){
106                 b = &dcache.blocks[i];
107                 b->data = &p[i * blocksize];
108                 b->heap = TWID32;
109                 b->writedonechan = chancreate(sizeof(void*), 1);
110                 b->next = last;
111                 last = b;
112         }
113
114         dcache.free = last;
115         dcache.nheap = 0;
116         setstat(StatDcacheSize, nblocks);
117         initround(&dcache.round, "dcache", 120*1000);
118
119         vtproc(flushproc, nil);
120         vtproc(delaykickroundproc, &dcache.round);
121 }
122
123 static u32int
124 pbhash(u64int addr)
125 {
126         u32int h;
127
128 #define hashit(c)       ((((c) * 0x6b43a9b5) >> (32 - HashLog)) & HashMask)
129         h = (addr >> 32) ^ addr;
130         return hashit(h);
131 }
132
133 DBlock*
134 getdblock(Part *part, u64int addr, int mode)
135 {
136         DBlock *b;
137         
138         b = _getdblock(part, addr, mode, 1);
139         if(mode == OREAD || mode == ORDWR)
140                 addstat(StatDcacheRead, 1);
141         if(mode == OWRITE || mode == ORDWR)
142                 addstat(StatDcacheWrite, 1);
143         return b;
144 }
145
146 DBlock*
147 _getdblock(Part *part, u64int addr, int mode, int load)
148 {
149         DBlock *b;
150         u32int h, size, ms;
151
152         ms = 0;
153         trace(TraceBlock, "getdblock enter %s 0x%llux", part->name, addr);
154         size = part->blocksize;
155         if(size > dcache.size){
156                 seterr(EAdmin, "block size %d too big for cache with size %d", size, dcache.size);
157                 if(load)
158                         addstat(StatDcacheLookup, 1);
159                 return nil;
160         }
161         h = pbhash(addr);
162
163         /*
164          * look for the block in the cache
165          */
166         qlock(&dcache.lock);
167 again:
168         for(b = dcache.heads[h]; b != nil; b = b->next){
169                 if(b->part == part && b->addr == addr){
170                         if(load)
171                                 addstat2(StatDcacheHit, 1, StatDcacheLookup, 1);
172                         goto found;
173                 }
174         }
175
176         /*
177          * missed: locate the block with the oldest second to last use.
178          * remove it from the heap, and fix up the heap.
179          */
180         if(!load){
181                 qunlock(&dcache.lock);
182                 return nil;
183         }
184
185         /*
186          * Only start timer here, on cache miss - calling msec() on plain cache hits
187          * makes cache hits system-call bound.
188          */
189         ms = msec();
190         addstat2(StatDcacheLookup, 1, StatDcacheMiss, 1);
191
192         b = bumpdblock();
193         if(b == nil){
194                 trace(TraceBlock, "all disk cache blocks in use");
195                 addstat(StatDcacheStall, 1);
196                 rsleep(&dcache.full);
197                 addstat(StatDcacheStall, -1);
198                 goto again;
199         }
200
201         assert(!b->dirty);
202
203         /*
204          * the new block has no last use, so assume it happens sometime in the middle
205 ZZZ this is not reasonable
206          */
207         b->used = (b->used2 + dcache.now) / 2;
208
209         /*
210          * rechain the block on the correct hash chain
211          */
212         b->next = dcache.heads[h];
213         dcache.heads[h] = b;
214         if(b->next != nil)
215                 b->next->prev = b;
216         b->prev = nil;
217
218         b->addr = addr;
219         b->part = part;
220         b->size = 0;
221
222 found:
223         b->ref++;
224         b->used2 = b->used;
225         b->used = dcache.now++;
226         if(b->heap != TWID32)
227                 fixheap(b->heap, b);
228
229         if((mode == ORDWR || mode == OWRITE) && part->writechan == nil){
230                 trace(TraceBlock, "getdblock allocwriteproc %s", part->name);
231                 part->writechan = chancreate(sizeof(DBlock*), dcache.nblocks);
232                 vtproc(writeproc, part);
233         }
234         qunlock(&dcache.lock);
235
236         trace(TraceBlock, "getdblock lock");
237         addstat(StatDblockStall, 1);
238         if(mode == OREAD)
239                 rlock(&b->lock);
240         else
241                 wlock(&b->lock);
242         addstat(StatDblockStall, -1);
243         trace(TraceBlock, "getdblock locked");
244
245         if(b->size != size){
246                 if(mode == OREAD){
247                         addstat(StatDblockStall, 1);
248                         runlock(&b->lock);
249                         wlock(&b->lock);
250                         addstat(StatDblockStall, -1);
251                 }
252                 if(b->size < size){
253                         if(mode == OWRITE)
254                                 memset(&b->data[b->size], 0, size - b->size);
255                         else{
256                                 trace(TraceBlock, "getdblock readpart %s 0x%llux", part->name, addr);
257                                 diskaccess(0);
258                                 if(readpart(part, addr + b->size, &b->data[b->size], size - b->size) < 0){
259                                         b->mode = ORDWR;        /* so putdblock wunlocks */
260                                         putdblock(b);
261                                         return nil;
262                                 }
263                                 trace(TraceBlock, "getdblock readpartdone");
264                                 addstat(StatApartRead, 1);
265                                 addstat(StatApartReadBytes, size-b->size);
266                         }
267                 }
268                 b->size = size;
269                 if(mode == OREAD){
270                         addstat(StatDblockStall, 1);
271                         wunlock(&b->lock);
272                         rlock(&b->lock);
273                         addstat(StatDblockStall, -1);
274                 }
275         }
276
277         b->mode = mode;
278         trace(TraceBlock, "getdblock exit");
279         if(ms)
280                 addstat(StatDcacheLookupTime, msec() - ms);
281         return b;
282 }
283
284 void
285 putdblock(DBlock *b)
286 {
287         if(b == nil)
288                 return;
289
290         trace(TraceBlock, "putdblock %s 0x%llux", b->part->name, b->addr);
291
292         if(b->mode == OREAD)
293                 runlock(&b->lock);
294         else
295                 wunlock(&b->lock);
296
297         qlock(&dcache.lock);
298         if(--b->ref == 0 && !b->dirty){
299                 if(b->heap == TWID32)
300                         upheap(dcache.nheap++, b);
301                 rwakeupall(&dcache.full);
302         }
303         qunlock(&dcache.lock);
304 }
305
306 void
307 dirtydblock(DBlock *b, int dirty)
308 {
309         int odirty;
310
311         trace(TraceBlock, "dirtydblock enter %s 0x%llux %d from 0x%lux",
312                 b->part->name, b->addr, dirty, getcallerpc(&b));
313         assert(b->ref != 0);
314         assert(b->mode==ORDWR || b->mode==OWRITE);
315
316         odirty = b->dirty;
317         if(b->dirty)
318                 assert(b->dirty == dirty);
319         else
320                 b->dirty = dirty;
321
322         qlock(&dcache.lock);
323         if(!odirty){
324                 dcache.ndirty++;
325                 setstat(StatDcacheDirty, dcache.ndirty);
326                 if(dcache.ndirty >= dcache.maxdirty)
327                         kickround(&dcache.round, 0);
328                 else
329                         delaykickround(&dcache.round);
330         }
331         qunlock(&dcache.lock);
332 }
333
334 static void
335 unchain(DBlock *b)
336 {
337         ulong h;
338         
339         /*
340          * unchain the block
341          */
342         if(b->prev == nil){
343                 h = pbhash(b->addr);
344                 if(dcache.heads[h] != b)
345                         sysfatal("bad hash chains in disk cache");
346                 dcache.heads[h] = b->next;
347         }else
348                 b->prev->next = b->next;
349         if(b->next != nil)
350                 b->next->prev = b->prev;
351 }
352
353 /*
354  * remove some block from use and update the free list and counters
355  */
356 static DBlock*
357 bumpdblock(void)
358 {
359         DBlock *b;
360
361         trace(TraceBlock, "bumpdblock enter");
362         b = dcache.free;
363         if(b != nil){
364                 dcache.free = b->next;
365                 return b;
366         }
367
368         if(dcache.ndirty >= dcache.maxdirty)
369                 kickdcache();
370
371         /*
372          * remove blocks until we find one that is unused
373          * referenced blocks are left in the heap even though
374          * they can't be scavenged; this is simple a speed optimization
375          */
376         for(;;){
377                 if(dcache.nheap == 0){
378                         kickdcache();
379                         trace(TraceBlock, "bumpdblock gotnothing");
380                         return nil;
381                 }
382                 b = dcache.heap[0];
383                 delheap(b);
384                 if(!b->ref && !b->dirty)
385                         break;
386         }
387
388         trace(TraceBlock, "bumpdblock bumping %s 0x%llux", b->part->name, b->addr);
389
390         unchain(b);
391         return b;
392 }
393
394 void
395 emptydcache(void)
396 {
397         DBlock *b;
398         
399         qlock(&dcache.lock);
400         while(dcache.nheap > 0){
401                 b = dcache.heap[0];
402                 delheap(b);
403                 if(!b->ref && !b->dirty){
404                         unchain(b);
405                         b->next = dcache.free;
406                         dcache.free = b;
407                 }
408         }
409         qunlock(&dcache.lock);
410 }
411
412 /*
413  * delete an arbitrary block from the heap
414  */
415 static void
416 delheap(DBlock *db)
417 {
418         if(db->heap == TWID32)
419                 return;
420         fixheap(db->heap, dcache.heap[--dcache.nheap]);
421         db->heap = TWID32;
422 }
423
424 /*
425  * push an element up or down to it's correct new location
426  */
427 static void
428 fixheap(int i, DBlock *b)
429 {
430         if(upheap(i, b) == i)
431                 downheap(i, b);
432 }
433
434 static int
435 upheap(int i, DBlock *b)
436 {
437         DBlock *bb;
438         u32int now;
439         int p;
440
441         now = dcache.now;
442         for(; i != 0; i = p){
443                 p = (i - 1) >> 1;
444                 bb = dcache.heap[p];
445                 if(b->used2 - now >= bb->used2 - now)
446                         break;
447                 dcache.heap[i] = bb;
448                 bb->heap = i;
449         }
450
451         dcache.heap[i] = b;
452         b->heap = i;
453         return i;
454 }
455
456 static int
457 downheap(int i, DBlock *b)
458 {
459         DBlock *bb;
460         u32int now;
461         int k;
462
463         now = dcache.now;
464         for(; ; i = k){
465                 k = (i << 1) + 1;
466                 if(k >= dcache.nheap)
467                         break;
468                 if(k + 1 < dcache.nheap && dcache.heap[k]->used2 - now > dcache.heap[k + 1]->used2 - now)
469                         k++;
470                 bb = dcache.heap[k];
471                 if(b->used2 - now <= bb->used2 - now)
472                         break;
473                 dcache.heap[i] = bb;
474                 bb->heap = i;
475         }
476
477         dcache.heap[i] = b;
478         b->heap = i;
479         return i;
480 }
481
482 static void
483 findblock(DBlock *bb)
484 {
485         DBlock *b, *last;
486         int h;
487
488         last = nil;
489         h = pbhash(bb->addr);
490         for(b = dcache.heads[h]; b != nil; b = b->next){
491                 if(last != b->prev)
492                         sysfatal("bad prev link");
493                 if(b == bb)
494                         return;
495                 last = b;
496         }
497         sysfatal("block missing from hash table");
498 }
499
500 void
501 checkdcache(void)
502 {
503         DBlock *b;
504         u32int size, now;
505         int i, k, refed, nfree;
506
507         qlock(&dcache.lock);
508         size = dcache.size;
509         now = dcache.now;
510         for(i = 0; i < dcache.nheap; i++){
511                 if(dcache.heap[i]->heap != i)
512                         sysfatal("dc: mis-heaped at %d: %d", i, dcache.heap[i]->heap);
513                 if(i > 0 && dcache.heap[(i - 1) >> 1]->used2 - now > dcache.heap[i]->used2 - now)
514                         sysfatal("dc: bad heap ordering");
515                 k = (i << 1) + 1;
516                 if(k < dcache.nheap && dcache.heap[i]->used2 - now > dcache.heap[k]->used2 - now)
517                         sysfatal("dc: bad heap ordering");
518                 k++;
519                 if(k < dcache.nheap && dcache.heap[i]->used2 - now > dcache.heap[k]->used2 - now)
520                         sysfatal("dc: bad heap ordering");
521         }
522
523         refed = 0;
524         for(i = 0; i < dcache.nblocks; i++){
525                 b = &dcache.blocks[i];
526                 if(b->data != &dcache.mem[i * size])
527                         sysfatal("dc: mis-blocked at %d", i);
528                 if(b->ref && b->heap == TWID32)
529                         refed++;
530                 if(b->addr)
531                         findblock(b);
532                 if(b->heap != TWID32
533                 && dcache.heap[b->heap] != b)
534                         sysfatal("dc: spurious heap value");
535         }
536
537         nfree = 0;
538         for(b = dcache.free; b != nil; b = b->next){
539                 if(b->addr != 0 || b->heap != TWID32)
540                         sysfatal("dc: bad free list");
541                 nfree++;
542         }
543
544         if(dcache.nheap + nfree + refed != dcache.nblocks)
545                 sysfatal("dc: missing blocks: %d %d %d", dcache.nheap, refed, dcache.nblocks);
546         qunlock(&dcache.lock);
547 }
548
549 void
550 flushdcache(void)
551 {
552         trace(TraceProc, "flushdcache enter");
553         kickround(&dcache.round, 1);
554         trace(TraceProc, "flushdcache exit");
555 }
556
557 void
558 kickdcache(void)
559 {
560         kickround(&dcache.round, 0);
561 }
562
563 static int
564 parallelwrites(DBlock **b, DBlock **eb, int dirty)
565 {
566         DBlock **p, **q;
567         Part *part;
568
569         for(p=b; p<eb && (*p)->dirty == dirty; p++){
570                 assert(b<=p && p<eb);
571                 sendp((*p)->part->writechan, *p);
572         }
573         q = p;
574         for(p=b; p<q; p++){
575                 assert(b<=p && p<eb);
576                 recvp((*p)->writedonechan);
577         }
578         
579         /*
580          * Flush the partitions that have been written to.
581          */
582         part = nil;
583         for(p=b; p<q; p++){
584                 if(part != (*p)->part){
585                         part = (*p)->part;
586                         flushpart(part);        /* what if it fails? */
587                 }
588         }
589
590         return p-b;
591 }
592
593 /*
594  * Sort first by dirty flag, then by partition, then by address in partition.
595  */
596 static int
597 writeblockcmp(const void *va, const void *vb)
598 {
599         DBlock *a, *b;
600
601         a = *(DBlock**)va;
602         b = *(DBlock**)vb;
603
604         if(a->dirty != b->dirty)
605                 return a->dirty - b->dirty;
606         if(a->part != b->part){
607                 if(a->part < b->part)
608                         return -1;
609                 if(a->part > b->part)
610                         return 1;
611         }
612         if(a->addr < b->addr)
613                 return -1;
614         return 1;
615 }
616
617 static void
618 flushproc(void *v)
619 {
620         int i, j, n;
621         ulong t0;
622         DBlock *b, **write;
623
624         USED(v);
625         threadsetname("flushproc");
626         for(;;){
627                 waitforkick(&dcache.round);
628
629                 trace(TraceWork, "start");
630                 t0 = nsec()/1000;
631                 trace(TraceProc, "build t=%lud", (ulong)(nsec()/1000)-t0);
632
633                 write = dcache.write;
634                 n = 0;
635                 for(i=0; i<dcache.nblocks; i++){
636                         b = &dcache.blocks[i];
637                         if(b->dirty)
638                                 write[n++] = b;
639                 }
640
641                 qsort(write, n, sizeof(write[0]), writeblockcmp);
642
643                 /* Write each stage of blocks out. */
644                 trace(TraceProc, "writeblocks t=%lud", (ulong)(nsec()/1000)-t0);
645                 i = 0;
646                 for(j=1; j<DirtyMax; j++){
647                         trace(TraceProc, "writeblocks.%d t=%lud",
648                                 j, (ulong)(nsec()/1000)-t0);
649                         i += parallelwrites(write+i, write+n, j);
650                 }
651                 if(i != n){
652                         fprint(2, "in flushproc i=%d n=%d\n", i, n);
653                         for(i=0; i<n; i++)
654                                 fprint(2, "\tblock %d: dirty=%d\n",
655                                         i, write[i]->dirty);
656                         abort();
657                 }
658
659                 /*
660                  * b->dirty is protected by b->lock while ndirty is protected
661                  * by dcache.lock, so the --ndirty below is the delayed one
662                  * from clearing b->dirty in the write proc.  It may happen
663                  * that some other proc has come along and redirtied b since
664                  * the write.  That's okay, it just means that ndirty may be
665                  * one too high until we catch up and do the decrement.
666                  */
667                 trace(TraceProc, "undirty.%d t=%lud", j, (ulong)(nsec()/1000)-t0);
668                 qlock(&dcache.lock);
669                 for(i=0; i<n; i++){
670                         b = write[i];
671                         --dcache.ndirty;
672                         if(b->ref == 0 && b->heap == TWID32){
673                                 upheap(dcache.nheap++, b);
674                                 rwakeupall(&dcache.full);
675                         }
676                 }
677                 setstat(StatDcacheDirty, dcache.ndirty);
678                 qunlock(&dcache.lock);
679                 addstat(StatDcacheFlush, 1);
680                 trace(TraceWork, "finish");
681         }
682 }
683
684 static void
685 writeproc(void *v)
686 {
687         DBlock *b;
688         Part *p;
689
690         p = v;
691
692         threadsetname("writeproc:%s", p->name);
693         for(;;){
694                 b = recvp(p->writechan);
695                 trace(TraceWork, "start");
696                 assert(b->part == p);
697                 trace(TraceProc, "wlock %s 0x%llux", p->name, b->addr);
698                 wlock(&b->lock);
699                 trace(TraceProc, "writepart %s 0x%llux", p->name, b->addr);
700                 diskaccess(0);
701                 if(writepart(p, b->addr, b->data, b->size) < 0)
702                         fprint(2, "%s: writeproc: part %s addr 0x%llux: write error: %r\n",
703                                 argv0, p->name, b->addr);
704                 addstat(StatApartWrite, 1);
705                 addstat(StatApartWriteBytes, b->size);
706                 b->dirty = 0;
707                 wunlock(&b->lock);
708                 trace(TraceProc, "finish %s 0x%llux", p->name, b->addr);
709                 trace(TraceWork, "finish");
710                 sendp(b->writedonechan, b);
711         }
712 }