]> git.lizzy.rs Git - plan9front.git/blob - sys/src/cmd/venti/srv/dcache.c
venti: fix memory layers
[plan9front.git] / sys / src / cmd / venti / srv / dcache.c
1 /*
2  * Disk cache.
3  * 
4  * Caches raw disk blocks.  Getdblock() gets a block, putdblock puts it back.
5  * Getdblock has a mode parameter that determines i/o and access to a block:
6  * if mode is OREAD or ORDWR, it is read from disk if not already in memory.
7  * If mode is ORDWR or OWRITE, it is locked for exclusive use before being returned.
8  * It is *not* marked dirty -- once changes have been made, they should be noted
9  * by using dirtydblock() before putdblock().  
10  *
11  * There is a global cache lock as well as a lock on each block. 
12  * Within a thread, the cache lock can be acquired while holding a block lock,
13  * but not vice versa; and a block cannot be locked if you already hold the lock
14  * on another block.
15  * 
16  * The flush proc writes out dirty blocks in batches, one batch per dirty tag.
17  * For example, the DirtyArena blocks are all written to disk before any of the
18  * DirtyArenaCib blocks.
19  *
20  * This code used to be in charge of flushing the dirty index blocks out to 
21  * disk, but updating the index turned out to benefit from extra care.
22  * Now cached index blocks are never marked dirty.  The index.c code takes
23  * care of updating them behind our back, and uses _getdblock to update any
24  * cached copies of the blocks as it changes them on disk.
25  */
26
27 #include "stdinc.h"
28 #include "dat.h"
29 #include "fns.h"
30
31 typedef struct DCache   DCache;
32
33 enum
34 {
35         HashLog         = 9,
36         HashSize        = 1<<HashLog,
37         HashMask        = HashSize - 1,
38 };
39
40 struct DCache
41 {
42         QLock           lock;
43         RWLock          dirtylock;              /* must be held to inspect or set b->dirty */
44         Rendez          full;
45         Round           round;
46         DBlock          *free;                  /* list of available lumps */
47         u32int          now;                    /* ticks for usage timestamps */
48         int             size;                   /* max. size of any block; allocated to each block */
49         DBlock          **heads;                /* hash table for finding address */
50         int             nheap;                  /* number of available victims */
51         DBlock          **heap;                 /* heap for locating victims */
52         int             nblocks;                /* number of blocks allocated */
53         DBlock          *blocks;                /* array of block descriptors */
54         DBlock          **write;                /* array of block pointers to be written */
55         u8int           *mem;                   /* memory for all block descriptors */
56         int             ndirty;                 /* number of dirty blocks */
57         int             maxdirty;               /* max. number of dirty blocks */
58 };
59
60 typedef struct Ra Ra;
61 struct Ra
62 {
63         Part *part;
64         u64int addr;
65 };
66
67 static DCache   dcache;
68
69 static int      downheap(int i, DBlock *b);
70 static int      upheap(int i, DBlock *b);
71 static DBlock   *bumpdblock(void);
72 static void     delheap(DBlock *db);
73 static void     fixheap(int i, DBlock *b);
74 static void     flushproc(void*);
75 static void     writeproc(void*);
76
77 void
78 initdcache(u32int mem)
79 {
80         DBlock *b, *last;
81         u32int nblocks, blocksize;
82         int i;
83         u8int *p;
84
85         if(maxblocksize == 0)
86                 sysfatal("no max. block size given for disk cache");
87         if(mem < maxblocksize * 2)
88                 sysfatal("need at least 2 max-size blocks (%d bytes) for the disk cache", maxblocksize * 2);
89
90         blocksize = maxblocksize;
91         nblocks = mem / blocksize;
92         dcache.full.l = &dcache.lock;
93         dcache.nblocks = nblocks;
94         dcache.maxdirty = (nblocks * 2) / 3;
95         trace(TraceProc, "initialize disk cache with %d blocks of %d bytes, maximum %d dirty blocks\n",
96                         nblocks, blocksize, dcache.maxdirty);
97         dcache.size = blocksize;
98         dcache.heads = vtbrk(sizeof(DBlock*) * HashSize);
99         dcache.heap = vtbrk(sizeof(DBlock*) * nblocks);
100         dcache.blocks = vtbrk(sizeof(DBlock) * nblocks);
101         dcache.write = vtbrk(sizeof(DBlock*) * nblocks);
102         dcache.mem = vtbrk((nblocks+1+128) * blocksize);
103
104         last = nil;
105         p = (u8int*)(((uintptr)dcache.mem+blocksize-1)&~(uintptr)(blocksize-1));
106         for(i = 0; i < nblocks; i++){
107                 b = &dcache.blocks[i];
108                 b->data = &p[i * blocksize];
109                 b->heap = TWID32;
110                 b->writedonechan = chancreate(sizeof(void*), 1);
111                 b->next = last;
112                 last = b;
113         }
114
115         dcache.free = last;
116         dcache.nheap = 0;
117         setstat(StatDcacheSize, nblocks);
118         initround(&dcache.round, "dcache", 120*1000);
119
120         vtproc(flushproc, nil);
121         vtproc(delaykickroundproc, &dcache.round);
122 }
123
124 static u32int
125 pbhash(u64int addr)
126 {
127         u32int h;
128
129 #define hashit(c)       ((((c) * 0x6b43a9b5) >> (32 - HashLog)) & HashMask)
130         h = (addr >> 32) ^ addr;
131         return hashit(h);
132 }
133
134 DBlock*
135 getdblock(Part *part, u64int addr, int mode)
136 {
137         DBlock *b;
138         
139         b = _getdblock(part, addr, mode, 1);
140         if(mode == OREAD || mode == ORDWR)
141                 addstat(StatDcacheRead, 1);
142         if(mode == OWRITE || mode == ORDWR)
143                 addstat(StatDcacheWrite, 1);
144         return b;
145 }
146
147 DBlock*
148 _getdblock(Part *part, u64int addr, int mode, int load)
149 {
150         DBlock *b;
151         u32int h, size, ms;
152
153         ms = 0;
154         trace(TraceBlock, "getdblock enter %s 0x%llux", part->name, addr);
155         size = part->blocksize;
156         if(size > dcache.size){
157                 seterr(EAdmin, "block size %d too big for cache with size %d", size, dcache.size);
158                 if(load)
159                         addstat(StatDcacheLookup, 1);
160                 return nil;
161         }
162         h = pbhash(addr);
163
164         /*
165          * look for the block in the cache
166          */
167         qlock(&dcache.lock);
168 again:
169         for(b = dcache.heads[h]; b != nil; b = b->next){
170                 if(b->part == part && b->addr == addr){
171                         if(load)
172                                 addstat2(StatDcacheHit, 1, StatDcacheLookup, 1);
173                         goto found;
174                 }
175         }
176
177         /*
178          * missed: locate the block with the oldest second to last use.
179          * remove it from the heap, and fix up the heap.
180          */
181         if(!load){
182                 qunlock(&dcache.lock);
183                 return nil;
184         }
185
186         /*
187          * Only start timer here, on cache miss - calling msec() on plain cache hits
188          * makes cache hits system-call bound.
189          */
190         ms = msec();
191         addstat2(StatDcacheLookup, 1, StatDcacheMiss, 1);
192
193         b = bumpdblock();
194         if(b == nil){
195                 trace(TraceBlock, "all disk cache blocks in use");
196                 addstat(StatDcacheStall, 1);
197                 rsleep(&dcache.full);
198                 addstat(StatDcacheStall, -1);
199                 goto again;
200         }
201
202         assert(!b->dirty);
203
204         /*
205          * the new block has no last use, so assume it happens sometime in the middle
206 ZZZ this is not reasonable
207          */
208         b->used = (b->used2 + dcache.now) / 2;
209
210         /*
211          * rechain the block on the correct hash chain
212          */
213         b->next = dcache.heads[h];
214         dcache.heads[h] = b;
215         if(b->next != nil)
216                 b->next->prev = b;
217         b->prev = nil;
218
219         b->addr = addr;
220         b->part = part;
221         b->size = 0;
222
223 found:
224         b->ref++;
225         b->used2 = b->used;
226         b->used = dcache.now++;
227         if(b->heap != TWID32)
228                 fixheap(b->heap, b);
229
230         if((mode == ORDWR || mode == OWRITE) && part->writechan == nil){
231                 trace(TraceBlock, "getdblock allocwriteproc %s", part->name);
232                 part->writechan = chancreate(sizeof(DBlock*), dcache.nblocks);
233                 vtproc(writeproc, part);
234         }
235         qunlock(&dcache.lock);
236
237         trace(TraceBlock, "getdblock lock");
238         addstat(StatDblockStall, 1);
239         if(mode == OREAD)
240                 rlock(&b->lock);
241         else
242                 wlock(&b->lock);
243         addstat(StatDblockStall, -1);
244         trace(TraceBlock, "getdblock locked");
245
246         if(b->size != size){
247                 if(mode == OREAD){
248                         addstat(StatDblockStall, 1);
249                         runlock(&b->lock);
250                         wlock(&b->lock);
251                         addstat(StatDblockStall, -1);
252                 }
253                 if(b->size < size){
254                         if(mode == OWRITE)
255                                 memset(&b->data[b->size], 0, size - b->size);
256                         else{
257                                 trace(TraceBlock, "getdblock readpart %s 0x%llux", part->name, addr);
258                                 diskaccess(0);
259                                 if(readpart(part, addr + b->size, &b->data[b->size], size - b->size) < 0){
260                                         b->mode = ORDWR;        /* so putdblock wunlocks */
261                                         putdblock(b);
262                                         return nil;
263                                 }
264                                 trace(TraceBlock, "getdblock readpartdone");
265                                 addstat(StatApartRead, 1);
266                                 addstat(StatApartReadBytes, size-b->size);
267                         }
268                 }
269                 b->size = size;
270                 if(mode == OREAD){
271                         addstat(StatDblockStall, 1);
272                         wunlock(&b->lock);
273                         rlock(&b->lock);
274                         addstat(StatDblockStall, -1);
275                 }
276         }
277
278         b->mode = mode;
279         trace(TraceBlock, "getdblock exit");
280         if(ms)
281                 addstat(StatDcacheLookupTime, msec() - ms);
282         return b;
283 }
284
285 void
286 putdblock(DBlock *b)
287 {
288         if(b == nil)
289                 return;
290
291         trace(TraceBlock, "putdblock %s 0x%llux", b->part->name, b->addr);
292
293         if(b->mode == OREAD)
294                 runlock(&b->lock);
295         else
296                 wunlock(&b->lock);
297
298         qlock(&dcache.lock);
299         if(--b->ref == 0 && !b->dirty){
300                 if(b->heap == TWID32)
301                         upheap(dcache.nheap++, b);
302                 rwakeupall(&dcache.full);
303         }
304         qunlock(&dcache.lock);
305 }
306
307 void
308 dirtydblock(DBlock *b, int dirty)
309 {
310         int odirty;
311
312         trace(TraceBlock, "dirtydblock enter %s 0x%llux %d from 0x%lux",
313                 b->part->name, b->addr, dirty, getcallerpc(&b));
314         assert(b->ref != 0);
315         assert(b->mode==ORDWR || b->mode==OWRITE);
316
317         odirty = b->dirty;
318         if(b->dirty)
319                 assert(b->dirty == dirty);
320         else
321                 b->dirty = dirty;
322
323         qlock(&dcache.lock);
324         if(!odirty){
325                 dcache.ndirty++;
326                 setstat(StatDcacheDirty, dcache.ndirty);
327                 if(dcache.ndirty >= dcache.maxdirty)
328                         kickround(&dcache.round, 0);
329                 else
330                         delaykickround(&dcache.round);
331         }
332         qunlock(&dcache.lock);
333 }
334
335 static void
336 unchain(DBlock *b)
337 {
338         ulong h;
339         
340         /*
341          * unchain the block
342          */
343         if(b->prev == nil){
344                 h = pbhash(b->addr);
345                 if(dcache.heads[h] != b)
346                         sysfatal("bad hash chains in disk cache");
347                 dcache.heads[h] = b->next;
348         }else
349                 b->prev->next = b->next;
350         if(b->next != nil)
351                 b->next->prev = b->prev;
352 }
353
354 /*
355  * remove some block from use and update the free list and counters
356  */
357 static DBlock*
358 bumpdblock(void)
359 {
360         DBlock *b;
361
362         trace(TraceBlock, "bumpdblock enter");
363         b = dcache.free;
364         if(b != nil){
365                 dcache.free = b->next;
366                 return b;
367         }
368
369         if(dcache.ndirty >= dcache.maxdirty)
370                 kickdcache();
371
372         /*
373          * remove blocks until we find one that is unused
374          * referenced blocks are left in the heap even though
375          * they can't be scavenged; this is simple a speed optimization
376          */
377         for(;;){
378                 if(dcache.nheap == 0){
379                         kickdcache();
380                         trace(TraceBlock, "bumpdblock gotnothing");
381                         return nil;
382                 }
383                 b = dcache.heap[0];
384                 delheap(b);
385                 if(!b->ref && !b->dirty)
386                         break;
387         }
388
389         trace(TraceBlock, "bumpdblock bumping %s 0x%llux", b->part->name, b->addr);
390
391         unchain(b);
392         return b;
393 }
394
395 void
396 emptydcache(void)
397 {
398         DBlock *b;
399         
400         qlock(&dcache.lock);
401         while(dcache.nheap > 0){
402                 b = dcache.heap[0];
403                 delheap(b);
404                 if(!b->ref && !b->dirty){
405                         unchain(b);
406                         b->next = dcache.free;
407                         dcache.free = b;
408                 }
409         }
410         qunlock(&dcache.lock);
411 }
412
413 /*
414  * delete an arbitrary block from the heap
415  */
416 static void
417 delheap(DBlock *db)
418 {
419         if(db->heap == TWID32)
420                 return;
421         fixheap(db->heap, dcache.heap[--dcache.nheap]);
422         db->heap = TWID32;
423 }
424
425 /*
426  * push an element up or down to it's correct new location
427  */
428 static void
429 fixheap(int i, DBlock *b)
430 {
431         if(upheap(i, b) == i)
432                 downheap(i, b);
433 }
434
435 static int
436 upheap(int i, DBlock *b)
437 {
438         DBlock *bb;
439         u32int now;
440         int p;
441
442         now = dcache.now;
443         for(; i != 0; i = p){
444                 p = (i - 1) >> 1;
445                 bb = dcache.heap[p];
446                 if(b->used2 - now >= bb->used2 - now)
447                         break;
448                 dcache.heap[i] = bb;
449                 bb->heap = i;
450         }
451
452         dcache.heap[i] = b;
453         b->heap = i;
454         return i;
455 }
456
457 static int
458 downheap(int i, DBlock *b)
459 {
460         DBlock *bb;
461         u32int now;
462         int k;
463
464         now = dcache.now;
465         for(; ; i = k){
466                 k = (i << 1) + 1;
467                 if(k >= dcache.nheap)
468                         break;
469                 if(k + 1 < dcache.nheap && dcache.heap[k]->used2 - now > dcache.heap[k + 1]->used2 - now)
470                         k++;
471                 bb = dcache.heap[k];
472                 if(b->used2 - now <= bb->used2 - now)
473                         break;
474                 dcache.heap[i] = bb;
475                 bb->heap = i;
476         }
477
478         dcache.heap[i] = b;
479         b->heap = i;
480         return i;
481 }
482
483 static void
484 findblock(DBlock *bb)
485 {
486         DBlock *b, *last;
487         int h;
488
489         last = nil;
490         h = pbhash(bb->addr);
491         for(b = dcache.heads[h]; b != nil; b = b->next){
492                 if(last != b->prev)
493                         sysfatal("bad prev link");
494                 if(b == bb)
495                         return;
496                 last = b;
497         }
498         sysfatal("block missing from hash table");
499 }
500
501 void
502 checkdcache(void)
503 {
504         DBlock *b;
505         u32int size, now;
506         int i, k, refed, nfree;
507
508         qlock(&dcache.lock);
509         size = dcache.size;
510         now = dcache.now;
511         for(i = 0; i < dcache.nheap; i++){
512                 if(dcache.heap[i]->heap != i)
513                         sysfatal("dc: mis-heaped at %d: %d", i, dcache.heap[i]->heap);
514                 if(i > 0 && dcache.heap[(i - 1) >> 1]->used2 - now > dcache.heap[i]->used2 - now)
515                         sysfatal("dc: bad heap ordering");
516                 k = (i << 1) + 1;
517                 if(k < dcache.nheap && dcache.heap[i]->used2 - now > dcache.heap[k]->used2 - now)
518                         sysfatal("dc: bad heap ordering");
519                 k++;
520                 if(k < dcache.nheap && dcache.heap[i]->used2 - now > dcache.heap[k]->used2 - now)
521                         sysfatal("dc: bad heap ordering");
522         }
523
524         refed = 0;
525         for(i = 0; i < dcache.nblocks; i++){
526                 b = &dcache.blocks[i];
527                 if(b->data != &dcache.mem[i * size])
528                         sysfatal("dc: mis-blocked at %d", i);
529                 if(b->ref && b->heap == TWID32)
530                         refed++;
531                 if(b->addr)
532                         findblock(b);
533                 if(b->heap != TWID32
534                 && dcache.heap[b->heap] != b)
535                         sysfatal("dc: spurious heap value");
536         }
537
538         nfree = 0;
539         for(b = dcache.free; b != nil; b = b->next){
540                 if(b->addr != 0 || b->heap != TWID32)
541                         sysfatal("dc: bad free list");
542                 nfree++;
543         }
544
545         if(dcache.nheap + nfree + refed != dcache.nblocks)
546                 sysfatal("dc: missing blocks: %d %d %d", dcache.nheap, refed, dcache.nblocks);
547         qunlock(&dcache.lock);
548 }
549
550 void
551 flushdcache(void)
552 {
553         trace(TraceProc, "flushdcache enter");
554         kickround(&dcache.round, 1);
555         trace(TraceProc, "flushdcache exit");
556 }
557
558 void
559 kickdcache(void)
560 {
561         kickround(&dcache.round, 0);
562 }
563
564 static int
565 parallelwrites(DBlock **b, DBlock **eb, int dirty)
566 {
567         DBlock **p, **q;
568         Part *part;
569
570         for(p=b; p<eb && (*p)->dirty == dirty; p++){
571                 assert(b<=p && p<eb);
572                 sendp((*p)->part->writechan, *p);
573         }
574         q = p;
575         for(p=b; p<q; p++){
576                 assert(b<=p && p<eb);
577                 recvp((*p)->writedonechan);
578         }
579         
580         /*
581          * Flush the partitions that have been written to.
582          */
583         part = nil;
584         for(p=b; p<q; p++){
585                 if(part != (*p)->part){
586                         part = (*p)->part;
587                         flushpart(part);        /* what if it fails? */
588                 }
589         }
590
591         return p-b;
592 }
593
594 /*
595  * Sort first by dirty flag, then by partition, then by address in partition.
596  */
597 static int
598 writeblockcmp(const void *va, const void *vb)
599 {
600         DBlock *a, *b;
601
602         a = *(DBlock**)va;
603         b = *(DBlock**)vb;
604
605         if(a->dirty != b->dirty)
606                 return a->dirty - b->dirty;
607         if(a->part != b->part){
608                 if(a->part < b->part)
609                         return -1;
610                 if(a->part > b->part)
611                         return 1;
612         }
613         if(a->addr < b->addr)
614                 return -1;
615         return 1;
616 }
617
618 static void
619 flushproc(void *v)
620 {
621         int i, j, n;
622         ulong t0;
623         DBlock *b, **write;
624
625         USED(v);
626         threadsetname("flushproc");
627         for(;;){
628                 waitforkick(&dcache.round);
629
630                 trace(TraceWork, "start");
631                 t0 = nsec()/1000;
632                 trace(TraceProc, "build t=%lud", (ulong)(nsec()/1000)-t0);
633
634                 write = dcache.write;
635                 n = 0;
636                 for(i=0; i<dcache.nblocks; i++){
637                         b = &dcache.blocks[i];
638                         if(b->dirty)
639                                 write[n++] = b;
640                 }
641
642                 qsort(write, n, sizeof(write[0]), writeblockcmp);
643
644                 /* Write each stage of blocks out. */
645                 trace(TraceProc, "writeblocks t=%lud", (ulong)(nsec()/1000)-t0);
646                 i = 0;
647                 for(j=1; j<DirtyMax; j++){
648                         trace(TraceProc, "writeblocks.%d t=%lud",
649                                 j, (ulong)(nsec()/1000)-t0);
650                         i += parallelwrites(write+i, write+n, j);
651                 }
652                 if(i != n){
653                         fprint(2, "in flushproc i=%d n=%d\n", i, n);
654                         for(i=0; i<n; i++)
655                                 fprint(2, "\tblock %d: dirty=%d\n",
656                                         i, write[i]->dirty);
657                         abort();
658                 }
659
660                 /*
661                  * b->dirty is protected by b->lock while ndirty is protected
662                  * by dcache.lock, so the --ndirty below is the delayed one
663                  * from clearing b->dirty in the write proc.  It may happen
664                  * that some other proc has come along and redirtied b since
665                  * the write.  That's okay, it just means that ndirty may be
666                  * one too high until we catch up and do the decrement.
667                  */
668                 trace(TraceProc, "undirty.%d t=%lud", j, (ulong)(nsec()/1000)-t0);
669                 qlock(&dcache.lock);
670                 for(i=0; i<n; i++){
671                         b = write[i];
672                         --dcache.ndirty;
673                         if(b->ref == 0 && b->heap == TWID32){
674                                 upheap(dcache.nheap++, b);
675                                 rwakeupall(&dcache.full);
676                         }
677                 }
678                 setstat(StatDcacheDirty, dcache.ndirty);
679                 qunlock(&dcache.lock);
680                 addstat(StatDcacheFlush, 1);
681                 trace(TraceWork, "finish");
682         }
683 }
684
685 static void
686 writeproc(void *v)
687 {
688         DBlock *b;
689         Part *p;
690
691         p = v;
692
693         threadsetname("writeproc:%s", p->name);
694         for(;;){
695                 b = recvp(p->writechan);
696                 trace(TraceWork, "start");
697                 assert(b->part == p);
698                 trace(TraceProc, "wlock %s 0x%llux", p->name, b->addr);
699                 wlock(&b->lock);
700                 trace(TraceProc, "writepart %s 0x%llux", p->name, b->addr);
701                 diskaccess(0);
702                 if(writepart(p, b->addr, b->data, b->size) < 0)
703                         fprint(2, "%s: writeproc: part %s addr 0x%llux: write error: %r\n",
704                                 argv0, p->name, b->addr);
705                 addstat(StatApartWrite, 1);
706                 addstat(StatApartWriteBytes, b->size);
707                 b->dirty = 0;
708                 wunlock(&b->lock);
709                 trace(TraceProc, "finish %s 0x%llux", p->name, b->addr);
710                 trace(TraceWork, "finish");
711                 sendp(b->writedonechan, b);
712         }
713 }