git.lizzy.rs Git - plan9front.git/blob - sys/src/9/port/devswap.c
kernel: avoid selecting the boot process in killbig()
[plan9front.git] / sys / src / 9 / port / devswap.c
1 #include        "u.h"
2 #include        "../port/lib.h"
3 #include        "mem.h"
4 #include        "dat.h"
5 #include        "fns.h"
6 #include        "../port/error.h"
7
8 #include        <libsec.h>
9 #include        <pool.h>
10
11 static int      canflush(Proc*, Segment*);
12 static void     executeio(void);
13 static void     pageout(Proc*, Segment*);
14 static void     pagepte(int, Page**);
15 static void     pager(void*);
16
17 Image   swapimage = {
18         .notext = 1,
19 };
20
21 static Chan     *swapchan;
22 static uchar    *swapbuf;
23 static AESstate *swapkey;
24
25 static Page     **iolist;
26 static ulong    ioptr;
27
28 static ushort   ageclock;
29
30 static void
31 swapinit(void)
32 {
33         while(conf.nswap && conf.nswppo){
34                 swapalloc.swmap = xalloc(conf.nswap);
35                 if(swapalloc.swmap == nil)
36                         break;
37                 iolist = xalloc(conf.nswppo*sizeof(Page*));
38                 if(iolist == nil){
39                         xfree(swapalloc.swmap);
40                         swapalloc.swmap = nil;
41                 }
42                 break;
43         }
44
45         if(swapalloc.swmap == nil || iolist == nil)
46                 conf.nswap = conf.nswppo = 0;
47
48         swapalloc.top = &swapalloc.swmap[conf.nswap];
49         swapalloc.alloc = swapalloc.swmap;
50         swapalloc.last = swapalloc.swmap;
51         swapalloc.free = conf.nswap;
52         swapalloc.xref = 0;
53
54         kproc("pager", pager, 0);
55 }
56
57 static uintptr
58 newswap(void)
59 {
60         uchar *look;
61
62         lock(&swapalloc);
63         if(swapalloc.free == 0) {
64                 unlock(&swapalloc);
65                 return ~0;
66         }
67         look = memchr(swapalloc.last, 0, swapalloc.top-swapalloc.last);
68         if(look == nil)
69                 look = memchr(swapalloc.swmap, 0, swapalloc.last-swapalloc.swmap);
70         *look = 2;      /* ref for pte + io transaction */
71         swapalloc.last = look;
72         swapalloc.free--;
73         unlock(&swapalloc);
74         return (look-swapalloc.swmap) * BY2PG;
75 }
76
77 void
78 putswap(Page *p)
79 {
80         uchar *idx;
81
82         lock(&swapalloc);
83         idx = &swapalloc.swmap[((uintptr)p)/BY2PG];
84         if(*idx == 0)
85                 panic("putswap %#p ref == 0", p);
86
87         if(*idx == 255) {
88                 if(swapalloc.xref == 0)
89                         panic("putswap %#p xref == 0", p);
90
91                 if(--swapalloc.xref == 0) {
92                         for(idx = swapalloc.swmap; idx < swapalloc.top; idx++) {
93                                 if(*idx == 255) {
94                                         *idx = 0;
95                                         swapalloc.free++;
96                                 }
97                         }
98                 }
99         } else {
100                 if(--(*idx) == 0)
101                         swapalloc.free++;
102         }
103         unlock(&swapalloc);
104 }
105
106 void
107 dupswap(Page *p)
108 {
109         uchar *idx;
110
111         lock(&swapalloc);
112         idx = &swapalloc.swmap[((uintptr)p)/BY2PG];
113         if(*idx == 255)
114                 swapalloc.xref++;
115         else {
116                 if(++(*idx) == 255)
117                         swapalloc.xref += 255;
118         }
119         unlock(&swapalloc);
120 }
121
122 int
123 swapcount(uintptr daddr)
124 {
125         return swapalloc.swmap[daddr/BY2PG];
126 }
127
128 void
129 kickpager(void)
130 {
131         wakeup(&swapalloc.r);
132 }
133
134 static int
135 reclaim(void)
136 {
137         ulong np;
138
139         for(;;){
140                 if((np = pagereclaim(&fscache, 1000)) > 0) {
141                         if(0) print("reclaim: %lud fscache\n", np);
142                 } else if((np = pagereclaim(&swapimage, 1000)) > 0) {
143                         if(0) print("reclaim: %lud swap\n", np);
144                 } else if((np = imagereclaim(1000)) > 0) {
145                         if(0) print("reclaim: %lud image\n", np);
146                 }
147                 if(!needpages(nil))
148                         return 1;       /* have pages, done */
149                 if(np == 0)
150                         return 0;       /* didnt reclaim, need to swap */
151                 sched();
152         }
153 }
154
155 static void
156 pager(void*)
157 {
158         Proc *p;
159         Segment *s;
160         int x, i;
161
162         while(waserror())
163                 ;
164
165         x = -1;
166         for(;;){
167                 up->psstate = "Reclaim";
168                 if(reclaim()){
169                         up->psstate = "Idle";
170                         wakeup(&palloc.pwait[0]);
171                         wakeup(&palloc.pwait[1]);
172                         sleep(&swapalloc.r, needpages, nil);
173                         continue;
174                 }
175
176                 if(swapimage.c == nil || swapalloc.free == 0){
177                 Killbig:
178                         if(!freebroken())
179                                 killbig("out of memory");
180                         sched();
181                         continue;
182                 }
183
184                 i = ageclock;
185                 do {
186                         if(++x >= conf.nproc){
187                                 if(++ageclock == i)
188                                         goto Killbig;
189                                 x = 0;
190                         }
191                         p = proctab(x);
192                 } while(p->state == Dead || p->noswap || !canqlock(&p->seglock));
193                 up->psstate = "Pageout";
194                 for(i = 0; i < NSEG; i++) {
195                         if((s = p->seg[i]) != nil) {
196                                 switch(s->type&SG_TYPE) {
197                                 default:
198                                         break;
199                                 case SG_TEXT:
200                                         pageout(p, s);
201                                         break;
202                                 case SG_DATA:
203                                 case SG_BSS:
204                                 case SG_STACK:
205                                 case SG_SHARED:
206                                         pageout(p, s);
207                                         break;
208                                 }
209                         }
210                 }
211                 qunlock(&p->seglock);
212
213                 if(ioptr) {
214                         up->psstate = "I/O";
215                         executeio();
216                 }
217         }
218 }
219
220 static void
221 pageout(Proc *p, Segment *s)
222 {
223         int type, i, size;
224         short age;
225         Pte *l;
226         Page **pg, *entry;
227
228         if(!canqlock(s))        /* We cannot afford to wait, we will surely deadlock */
229                 return;
230
231         if(!canflush(p, s)      /* Able to invalidate all tlbs with references */
232         || waserror()) {
233                 qunlock(s);
234                 putseg(s);
235                 return;
236         }
237
238         /* Pass through the pte tables looking for memory pages to swap out */
239         type = s->type&SG_TYPE;
240         size = s->mapsize;
241         for(i = 0; i < size; i++) {
242                 l = s->map[i];
243                 if(l == nil)
244                         continue;
245                 for(pg = l->first; pg <= l->last; pg++) {
246                         entry = *pg;
247                         if(pagedout(entry))
248                                 continue;
249                         if(entry->modref & PG_REF) {
250                                 entry->modref &= ~PG_REF;
251                                 entry->refage = ageclock;
252                                 continue;
253                         }
254                         age = (short)(ageclock - entry->refage);
255                         if(age < 16)
256                                 continue;
257                         pagepte(type, pg);
258                 }
259         }
260         poperror();
261         qunlock(s);
262         putseg(s);
263 }
264
265 static int
266 canflush(Proc *p, Segment *s)
267 {
268         int x, i;
269
270         if(incref(s) == 2)              /* Easy if we are the only user */
271                 return canpage(p);
272
273         /*
274          * Now we must do hardwork to ensure all processes which have tlb
275          * entries for this segment will be flushed if we succeed in paging it out
276          */
277         for(x = 0; x < conf.nproc; x++){
278                 p = proctab(x);
279                 if(p->state == Dead)
280                         continue;
281                 for(i = 0; i < NSEG; i++){
282                         if(p->seg[i] == s)
283                                 if(!canpage(p))
284                                         return 0;
285                 }
286         }
287         return 1;
288 }
289
290 static void
291 pagepte(int type, Page **pg)
292 {
293         uintptr daddr;
294         Page *outp;
295
296         outp = *pg;
297         switch(type) {
298         case SG_TEXT:                           /* Revert to demand load */
299                 putpage(outp);
300                 *pg = nil;
301                 break;
302
303         case SG_DATA:
304         case SG_BSS:
305         case SG_STACK:
306         case SG_SHARED:
307                 if(ioptr >= conf.nswppo)
308                         break;
309
310                 /*
311                  *  get a new swap address with swapcount 2, one for the pte
312                  *  and one extra ref for us while we write the page to disk
313                  */
314                 daddr = newswap();
315                 if(daddr == ~0)
316                         break;
317
318                 /* clear any pages referring to it from the cache */
319                 cachedel(&swapimage, daddr);
320
321                 /* forget anything that it used to cache */
322                 uncachepage(outp);
323
324                 /*
325                  *  enter it into the cache so that a fault happening
326                  *  during the write will grab the page from the cache
327                  *  rather than one partially written to the disk
328                  */
329                 outp->daddr = daddr;
330                 cachepage(outp, &swapimage);
331                 *pg = (Page*)(daddr|PG_ONSWAP);
332
333                 /* Add page to IO transaction list */
334                 iolist[ioptr++] = outp;
335                 break;
336         }
337 }
338
339 static void
340 executeio(void)
341 {
342         Page *outp;
343         ulong i, j;
344
345         for(i = j = 0; i < ioptr; i++) {
346                 outp = iolist[i];
347
348                 assert(outp->ref > 0);
349                 assert(outp->image == &swapimage);
350                 assert(outp->daddr != ~0);
351
352                 /* only write when swap address still in use */
353                 if(swapcount(outp->daddr) > 1){
354                         Chan *c = swapimage.c;
355                         KMap *k = kmap(outp);
356                         if(waserror()){
357                                 kunmap(k);
358                                 iolist[j++] = outp;
359                                 continue;
360                         }
361                         if(devtab[c->type]->write(c, (char*)VA(k), BY2PG, outp->daddr) != BY2PG)
362                                 error(Eshort);
363                         kunmap(k);
364                         poperror();
365                 }
366
367                 /* drop our extra swap reference */
368                 putswap((Page*)outp->daddr);
369
370                 /* Free up the page after I/O */
371                 putpage(outp);
372         }
373         ioptr = j;
374         if(j) print("executeio (%lud/%lud): %s\n", j, i, up->errstr);
375 }
376
377 int
378 needpages(void*)
379 {
380         return palloc.freecount < swapalloc.headroom;
381 }
382
383 static void
384 setswapchan(Chan *c)
385 {
386         uchar buf[sizeof(Dir)+100];
387         Dir d;
388         int n;
389
390         if(waserror()){
391                 cclose(c);
392                 nexterror();
393         }
394         if(swapimage.c != nil) {
395                 if(swapalloc.free != conf.nswap)
396                         error(Einuse);
397                 cclose(swapimage.c);
398                 swapimage.c = nil;
399         }
400
401         /*
402          *  if this isn't a file, set the swap space
403          *  to be at most the size of the partition
404          */
405         if(devtab[c->type]->dc != L'M'){
406                 n = devtab[c->type]->stat(c, buf, sizeof buf);
407                 if(n <= 0 || convM2D(buf, n, &d, nil) == 0)
408                         error("stat failed in setswapchan");
409                 if(d.length < (vlong)conf.nswppo*BY2PG)
410                         error("swap device too small");
411                 if(d.length < (vlong)conf.nswap*BY2PG){
412                         conf.nswap = d.length/BY2PG;
413                         swapalloc.top = &swapalloc.swmap[conf.nswap];
414                         swapalloc.free = conf.nswap;
415                 }
416         }
417         c->flag &= ~CCACHE;
418         cclunk(c);
419         poperror();
420
421         swapchan = c;
422         swapimage.c = namec("#¶/swapfile", Aopen, ORDWR, 0);
423 }
424
/* qid paths of the files served by this device */
enum {
        Qdir = 0,       /* "." */
        Qswap,          /* "swap" */
        Qswapfile,      /* "swapfile" */
};
430
431 static Dirtab swapdir[]={
432         ".",            {Qdir, 0, QTDIR},       0,              DMDIR|0555,
433         "swap",         {Qswap},                0,              0664,
434         "swapfile",     {Qswapfile},            0,              0600,
435 };
436
437 static Chan*
438 swapattach(char *spec)
439 {
440         return devattach(L'¶', spec);
441 }
442
443 static Walkqid*
444 swapwalk(Chan *c, Chan *nc, char **name, int nname)
445 {
446         return devwalk(c, nc, name, nname, swapdir, nelem(swapdir), devgen);
447 }
448
449 static int
450 swapstat(Chan *c, uchar *dp, int n)
451 {
452         return devstat(c, dp, n, swapdir, nelem(swapdir), devgen);
453 }
454
455 static Chan*
456 swapopen(Chan *c, int omode)
457 {
458         uchar key[128/8];
459
460         switch((ulong)c->qid.path){
461         case Qswapfile:
462                 if(!iseve() || omode != ORDWR)
463                         error(Eperm);
464                 if(swapimage.c != nil)
465                         error(Einuse);
466                 if(swapchan == nil)
467                         error(Egreg);
468
469                 c->mode = openmode(omode);
470                 c->flag |= COPEN;
471                 c->offset = 0;
472
473                 swapbuf = mallocalign(BY2PG, BY2PG, 0, 0);
474                 swapkey = secalloc(sizeof(AESstate)*2);
475                 if(swapbuf == nil || swapkey == nil)
476                         error(Enomem);
477
478                 genrandom(key, sizeof(key));
479                 setupAESstate(&swapkey[0], key, sizeof(key), nil);
480                 genrandom(key, sizeof(key));
481                 setupAESstate(&swapkey[1], key, sizeof(key), nil);
482                 memset(key, 0, sizeof(key));
483
484                 return c;
485         }
486         return devopen(c, omode, swapdir, nelem(swapdir), devgen);
487 }
488
489 static void
490 swapclose(Chan *c)
491 {
492         if((c->flag & COPEN) == 0)
493                 return;
494         switch((ulong)c->qid.path){
495         case Qswapfile:
496                 cclose(swapchan);
497                 swapchan = nil;
498                 secfree(swapkey);
499                 swapkey = nil;
500                 free(swapbuf);
501                 swapbuf = nil;
502                 break;
503         }
504 }
505
506 static long
507 swapread(Chan *c, void *va, long n, vlong off)
508 {
509         char tmp[256];          /* must be >= 18*NUMSIZE (Qswap) */
510         ulong reclaim;
511
512         switch((ulong)c->qid.path){
513         case Qdir:
514                 return devdirread(c, va, n, swapdir, nelem(swapdir), devgen);
515         case Qswap:
516                 reclaim = imagecached() + fscache.pgref + swapimage.pgref;
517                 snprint(tmp, sizeof tmp,
518                         "%llud memory\n"
519                         "%llud pagesize\n"
520                         "%lud kernel\n"
521                         "%lud/%lud user\n"
522                         "%lud/%lud swap\n"
523                         "%lud/%lud reclaim\n"
524                         "%llud/%llud/%llud kernel malloc\n"
525                         "%llud/%llud/%llud kernel draw\n"
526                         "%llud/%llud/%llud kernel secret\n",
527                         (uvlong)conf.npage*BY2PG,
528                         (uvlong)BY2PG,
529                         conf.npage-conf.upages,
530                         palloc.user-palloc.freecount-reclaim, palloc.user,
531                         conf.nswap-swapalloc.free, conf.nswap,
532                         reclaim, palloc.user,
533                         (uvlong)mainmem->curalloc,
534                         (uvlong)mainmem->cursize,
535                         (uvlong)mainmem->maxsize,
536                         (uvlong)imagmem->curalloc,
537                         (uvlong)imagmem->cursize,
538                         (uvlong)imagmem->maxsize,
539                         (uvlong)secrmem->curalloc,
540                         (uvlong)secrmem->cursize,
541                         (uvlong)secrmem->maxsize);
542                 return readstr((ulong)off, va, n, tmp);
543         case Qswapfile:
544                 if(n != BY2PG)
545                         error(Ebadarg);
546                 if(devtab[swapchan->type]->read(swapchan, va, n, off) != n)
547                         error(Eio);
548                 aes_xts_decrypt(&swapkey[0], &swapkey[1], off, va, va, n);
549                 return n;
550         }
551         error(Egreg);
552         return 0;
553 }
554
555 static long
556 swapwrite(Chan *c, void *va, long n, vlong off)
557 {
558         char buf[256];
559         
560         switch((ulong)c->qid.path){
561         case Qswap:
562                 if(!iseve())
563                         error(Eperm);
564                 if(n >= sizeof buf)
565                         error(Egreg);
566                 memmove(buf, va, n);    /* so we can NUL-terminate */
567                 buf[n] = 0;
568                 /* start a pager if not already started */
569                 if(strncmp(buf, "start", 5) == 0)
570                         kickpager();
571                 else if(buf[0]>='0' && buf[0]<='9')
572                         setswapchan(fdtochan(strtoul(buf, nil, 0), ORDWR, 1, 1));
573                 else
574                         error(Ebadctl);
575                 return n;
576         case Qswapfile:
577                 if(n != BY2PG)
578                         error(Ebadarg);
579                 aes_xts_encrypt(&swapkey[0], &swapkey[1], off, va, swapbuf, n);
580                 if(devtab[swapchan->type]->write(swapchan, swapbuf, n, off) != n)
581                         error(Eio);
582                 return n;
583         }
584         error(Egreg);
585         return 0;
586 }
587
588 Dev swapdevtab = {
589         L'¶',
590         "swap",
591         devreset,
592         swapinit,
593         devshutdown,
594         swapattach,
595         swapwalk,
596         swapstat,
597         swapopen,
598         devcreate,
599         swapclose,
600         swapread,
601         devbread,
602         swapwrite,
603         devbwrite,
604         devremove,
605         devwstat,
606 };