#include	"../port/lib.h"

/*
 * Simple segment descriptors with no translation.
 */
#define	EXECSEGM(p)	{ 0, SEGL|SEGP|SEGPL(p)|SEGEXEC }
#define	DATASEGM(p)	{ 0, SEGB|SEGG|SEGP|SEGPL(p)|SEGDATA|SEGW }
#define	EXEC32SEGM(p)	{ 0xFFFF, SEGG|SEGD|(0xF<<16)|SEGP|SEGPL(p)|SEGEXEC|SEGR }
#define	DATA32SEGM(p)	{ 0xFFFF, SEGB|SEGG|(0xF<<16)|SEGP|SEGPL(p)|SEGDATA|SEGW }
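/*
 * Rough guide to the bits used above (definitions in mem.h): SEGL
 * marks a long-mode code segment, whose base and limit are ignored;
 * the 32-bit descriptors carry an explicit limit of 0xFFFFF (0xFFFF
 * in the low word plus 0xF<<16), which the SEGG granularity bit
 * scales by 4K to cover the full 4GB.
 */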
Segdesc gdt[NGDT] =
{
[NULLSEG]	{ 0, 0},		/* null descriptor */
[KESEG]		EXECSEGM(0),		/* kernel code */
[KDSEG]		DATASEGM(0),		/* kernel data */
[UE32SEG]	EXEC32SEGM(3),		/* user code 32 bit */
[UDSEG]		DATA32SEGM(3),		/* user data/stack */
[UESEG]		EXECSEGM(3),		/* user code */
};
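/*
 * The order of the user descriptors is dictated by the SYSCALL/SYSRET
 * selector arithmetic: SYSRET derives the user selectors from a single
 * base in the Star MSR (32-bit CS at the base, SS at base+8, 64-bit CS
 * at base+16), so UE32SEG, UDSEG and UESEG must stay consecutive.
 */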
enum {
	MAPBITS = 8*sizeof(m->mmumap[0]),
};
static void
loadptr(u16int lim, uintptr off, void (*load)(void*))
static void
taskswitch(uintptr stack)
{
	Tss *tss;

	tss = m->tss;
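	/*
	 * In the 64-bit TSS the RSP fields sit at 4-byte-aligned
	 * offsets, so the Tss structure holds each stack pointer
	 * as a pair of u32int halves.
	 */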
	tss->rsp0[0] = (u32int)stack;
	tss->rsp0[1] = stack >> 32;
	tss->rsp1[0] = (u32int)stack;
	tss->rsp1[1] = stack >> 32;
	tss->rsp2[0] = (u32int)stack;
	tss->rsp2[1] = stack >> 32;
	putcr3(PADDR(m->pml4));
}
static void kernelro(void);

void
mmuinit(void)
{
	uintptr x;
	vlong v;
	int i;

	/* zap double map done by l.s */
	m->pml4[512] = 0;
	m->pml4[0] = 0;
	m->tss = mallocz(sizeof(Tss), 1);
	if(m->tss == nil)
		panic("mmuinit: no memory for Tss");
	m->tss->iomap = 0xDFFF;
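	/*
	 * Point every interrupt-stack-table slot at the top of this
	 * Mach's private stack, so exception handlers always start on
	 * a known-good stack; each 64-bit entry is stored in halves.
	 */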
	for(i = 0; i < nelem(m->tss->ist); i += 2){
		x = (uintptr)m + MACHSIZE;
		m->tss->ist[i] = x;
		m->tss->ist[i+1] = x>>32;
	}
	/*
	 * We used to keep the GDT in the Mach structure, but it
	 * turns out that that slows down access to the rest of the
	 * page.  Since the Mach structure is accessed quite often,
	 * it pays off anywhere from a factor of 1.25 to 2 on real
	 * hardware to separate them (the AMDs are more sensitive
	 * than Intels in this regard).  Under VMware it pays off
	 * a factor of about 10 to 100.
	 */
	memmove(m->gdt, gdt, sizeof gdt);

	x = (uintptr)m->tss;
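	/*
	 * In long mode the TSS descriptor is 16 bytes, so it occupies
	 * two consecutive Segdesc slots: limit and base bits 0-31 in
	 * the first, base bits 32-63 in the second.
	 */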
	m->gdt[TSSSEG+0].d0 = (x<<16)|(sizeof(Tss)-1);
	m->gdt[TSSSEG+0].d1 = (x&0xFF000000)|((x>>16)&0xFF)|SEGTSS|SEGPL(0)|SEGP;
	m->gdt[TSSSEG+1].d0 = x>>32;
	m->gdt[TSSSEG+1].d1 = 0;

	loadptr(sizeof(gdt)-1, (uintptr)m->gdt, lgdt);
	loadptr(sizeof(Segdesc)*512-1, (uintptr)IDTADDR, lidt);
	taskswitch((uintptr)m + MACHSIZE);
	ltr(TSSSEL);
	wrmsr(GSbase, (uvlong)&machp[m->machno]);
	wrmsr(KernelGSbase, 0ull);

	/* enable syscall extension */
	rdmsr(Efer, &v);
	v |= 1ull;
	wrmsr(Efer, v);
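	/*
	 * Star holds the selector bases loaded by SYSCALL (bits 32-47)
	 * and SYSRET (bits 48-63), Lstar the 64-bit entry point; the
	 * 0x200 in Sfmask clears IF in rflags on entry, so a syscall
	 * starts with interrupts off.
	 */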
	wrmsr(Star, ((uvlong)UE32SEL << 48) | ((uvlong)KESEL << 32));
	wrmsr(Lstar, (uvlong)syscallentry);
	wrmsr(Sfmask, 0x200);
}
/*
 * These could go back to being macros once the kernel is debugged,
 * but the extra checking is nice to have.
 */
void*
kaddr(uintptr pa)
{
	if(pa >= (uintptr)-KZERO)
		panic("kaddr: pa=%#p pc=%#p", pa, getcallerpc(&pa));
	return (void*)(pa+KZERO);
}
uintptr
paddr(void *v)
{
	uintptr va;

	va = (uintptr)v;
	if(va >= KZERO)
		return va-KZERO;
	if(va >= VMAP)
		return va-VMAP;
	panic("paddr: va=%#p pc=%#p", va, getcallerpc(&v));
	return 0;
}
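/*
 * For reference, the unchecked macro forms the comment above alludes
 * to would look roughly like this (illustrative sketch, not the
 * shipped definitions):
 *
 *	#define KADDR(a)	((void*)((uintptr)(a)+KZERO))
 *	#define PADDR(a)	((uintptr)(a)-KZERO)
 */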
static MMU*
mmualloc(void)
{
	MMU *p;
	int i, n;

	p = m->mmufree;
	if(p != nil){
		m->mmufree = p->next;
		m->mmucount--;
	} else {
		lock(&mmupool);
		p = mmupool.free;
		if(p != nil){
			mmupool.free = p->next;
			mmupool.nfree--;
		} else {
			unlock(&mmupool);

			n = 256;
			p = malloc(n * sizeof(MMU));
			if(p == nil)
				panic("mmualloc: out of memory for MMU");
			p->page = mallocalign(n * PTSZ, BY2PG, 0, 0);
			if(p->page == nil)
				panic("mmualloc: out of memory for MMU pages");
			for(i=1; i<n; i++){
				p[i].page = p[i-1].page + (1<<PTSHIFT);
				p[i-1].next = &p[i];
			}

			lock(&mmupool);
			p[n-1].next = mmupool.free;
			mmupool.free = p->next;
			mmupool.nfree += n-1;
		}
		unlock(&mmupool);
	}
	p->next = nil;
	return p;
}
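/*
 * mmualloc takes from the per-Mach free list first, then the shared
 * mmupool, and only as a last resort carves out a batch of 256 MMU
 * structures with their page-table pages, returning the first and
 * chaining the rest into the pool.
 */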
static uintptr*
mmucreate(uintptr *table, uintptr va, int level, int index)
{
	uintptr *page, flags;
	MMU *p;

	flags = PTEWRITE|PTEVALID;
	assert((va < USTKTOP) || (va >= KMAP && va < KMAP+KMAPSIZE));
	p = mmualloc();
	if((p->next = up->mmuhead) == nil)
		up->mmutail = p;
	m->mmumap[index/MAPBITS] |= 1ull<<(index%MAPBITS);

	up->mmutail->next = p;

	up->kmaptail->next = p;

	memset(page, 0, PTSZ);
	table[index] = PADDR(page) | flags;
	return page;
}
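/*
 * mmucreate threads user page tables onto up->mmuhead/mmutail and
 * records top-level (PML4E) entries in m->mmumap, so a switch or
 * flush only touches the PML4 slots actually in use; KMAP tables
 * go on the separate kmaphead/kmaptail chain.
 */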
uintptr*
mmuwalk(uintptr* table, uintptr va, int level, int create)
{
	uintptr pte;
	int i, x;

	x = PTLX(va, 3);
	for(i = 2; i >= level; i--){
		pte = table[x];
		if(pte & PTEVALID){
			if(pte & PTESIZE)
				return 0;
			pte = PPN(pte);
			if(pte >= (uintptr)-KZERO)
				table = (void*)(pte + VMAP);
			else
				table = (void*)(pte + KZERO);
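			/*
			 * page-table pages with physical addresses below
			 * the 2GB KZERO window are reached through KZERO;
			 * higher ones (from preallocpages) through VMAP.
			 */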
		} else {
			if(!create)
				return 0;
			table = mmucreate(table, va, i, x);
		}
		x = PTLX(va, i);
	}
	return &table[x];
}
static int
ptecount(uintptr va, int level)
{
	return (1<<PTSHIFT) - (va & PGLSZ(level+1)-1) / PGLSZ(level);
}
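/*
 * ptecount() is the number of level-N entries from va to the end of
 * the enclosing level-N+1 region: with 512-entry tables, a 2M-aligned
 * va at level 0 gives 512, while one 4K short of the next 2M boundary
 * gives 1.
 */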
static void
ptesplit(uintptr* table, uintptr va)
{
	uintptr *pte, pa, off;

	pte = mmuwalk(table, va, 1, 0);
	if(pte == nil || (*pte & PTESIZE) == 0 || (va & PGLSZ(1)-1) == 0)
		return;
	table = rampage();
	va &= -PGLSZ(1);
	pa = *pte & ~PTESIZE;
	for(off = 0; off < PGLSZ(1); off += PGLSZ(0))
		table[PTLX(va + off, 0)] = pa + off;
	*pte = PADDR(table) | PTEVALID|PTEWRITE;
}
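/*
 * pa keeps the original flag bits (only PTESIZE is cleared), so each
 * new 4K entry inherits the superpage's permissions along with its
 * physical address.
 */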
/*
 * map kernel text segment readonly
 * and everything else no-execute.
 */
static void
kernelro(void)
{
	uintptr *pte, psz, va;

	ptesplit(m->pml4, APBOOTSTRAP);
	ptesplit(m->pml4, KTZERO);
	ptesplit(m->pml4, (uintptr)etext-1);
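	/*
	 * The splits above put the text boundaries and the AP
	 * bootstrap page on 4K mappings, so their permissions can be
	 * changed without dragging along the rest of a 2M superpage.
	 */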
	for(va = KZERO; va != 0; va += psz){
		psz = PGLSZ(0);
		pte = mmuwalk(m->pml4, va, 0, 0);
		if(pte == nil){
			pte = mmuwalk(m->pml4, va, 1, 0);
			if(pte == nil)
				continue;
			psz = PGLSZ(1);
		}
		if((*pte & PTEVALID) == 0)
			continue;
		if(va >= KTZERO && va < (uintptr)etext)
			*pte &= ~PTEWRITE;
		else if(va != (APBOOTSTRAP & -BY2PG))
			*pte |= PTENOEXEC;
	}
}
void
pmap(uintptr pa, uintptr va, vlong size)
{
	uintptr *pte, *ptee, flags;
	int z, l;

	if(size <= 0 || va < VMAP)
		panic("pmap: pa=%#p va=%#p size=%lld", pa, va, size);
	flags = pa;
	pa = PPN(pa);
	flags -= pa;
	while(size > 0){
		if(size >= PGLSZ(1) && (va % PGLSZ(1)) == 0)
			flags |= PTESIZE;
		l = (flags & PTESIZE) != 0;
		z = PGLSZ(l);
		pte = mmuwalk(m->pml4, va, l, 1);
		if(pte == 0){
			pte = mmuwalk(m->pml4, va, ++l, 0);
			if(pte && (*pte & PTESIZE)){
				flags |= PTESIZE;
				z = va & (PGLSZ(l)-1);
				va -= z;
				pa -= z;
				size += z;
				continue;
			}
			panic("pmap: pa=%#p va=%#p size=%lld", pa, va, size);
		}
		ptee = pte + ptecount(va, l);
		while(size > 0 && pte < ptee){
			*pte++ = pa | flags;
			pa += z;
			va += z;
			size -= z;
		}
	}
}
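/*
 * pmap's callers pass the page attributes ORed into pa (as vmap and
 * preallocpages do below); the PPN()/flags arithmetic above separates
 * them again and applies the same attributes to every entry written.
 */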
void
punmap(uintptr va, vlong size)
{
	uintptr *pte;
	int l;

	va = PPN(va);
	while(size > 0){
		if((va % PGLSZ(1)) != 0 || size < PGLSZ(1))
			ptesplit(m->pml4, va);
		l = 0;
		pte = mmuwalk(m->pml4, va, l, 0);
		if(pte == nil && (va % PGLSZ(1)) == 0 && size >= PGLSZ(1))
			pte = mmuwalk(m->pml4, va, ++l, 0);
		if(pte != nil){
			*pte = 0;
			invlpg(va);
		}
		va += PGLSZ(l);
		size -= PGLSZ(l);
	}
}
static void
mmuzap(void)
{
	uintptr *pte;
	u64int w;
	int i, x;

	pte = m->pml4;
	pte[PTLX(KMAP, 3)] = 0;

	/* common case */
	pte[PTLX(UTZERO, 3)] = 0;
	pte[PTLX(USTKTOP-1, 3)] = 0;
	m->mmumap[PTLX(UTZERO, 3)/MAPBITS] &= ~(1ull<<(PTLX(UTZERO, 3)%MAPBITS));
	m->mmumap[PTLX(USTKTOP-1, 3)/MAPBITS] &= ~(1ull<<(PTLX(USTKTOP-1, 3)%MAPBITS));

	for(i = 0; i < nelem(m->mmumap); pte += MAPBITS, i++){
		if((w = m->mmumap[i]) == 0)
			continue;
		m->mmumap[i] = 0;
		for(x = 0; w != 0; w >>= 1, x++){
			if(w & 1)
				pte[x] = 0;
		}
	}
}
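/*
 * m->mmumap is a bitmap of the user PML4 slots currently installed,
 * letting the flush above clear just those entries instead of
 * sweeping all 512.
 */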
static void
mmufree(Proc *proc)
{
	MMU *p;

	p = proc->mmutail;
	if(p == nil)
		return;
	if(m->mmucount+proc->mmucount < 256){
		p->next = m->mmufree;
		m->mmufree = proc->mmuhead;
		m->mmucount += proc->mmucount;
	} else {
		lock(&mmupool);
		p->next = mmupool.free;
		mmupool.free = proc->mmuhead;
		mmupool.nfree += proc->mmucount;
		unlock(&mmupool);
	}
	proc->mmuhead = proc->mmutail = nil;
	proc->mmucount = 0;
}
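/*
 * A Mach keeps at most about 256 MMU structures on its private free
 * list; anything beyond that is returned to the shared mmupool.
 */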
void
mmuswitch(Proc *proc)
{
	MMU *p;

	if((p = proc->kmaphead) != nil)
		m->pml4[PTLX(KMAP, 3)] = PADDR(p->page) | PTEWRITE|PTEVALID;
	for(p = proc->mmuhead; p != nil && p->level == PML4E; p = p->next){
		m->mmumap[p->index/MAPBITS] |= 1ull<<(p->index%MAPBITS);
		m->pml4[p->index] = PADDR(p->page) | PTEUSER|PTEWRITE|PTEVALID;
	}
	taskswitch((uintptr)proc->kstack+KSTACK);
}
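/*
 * Only the PML4E-level entries, kept at the head of the process's
 * MMU list, need reinstalling on a switch; the lower-level tables
 * are reachable from them.
 */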
void
mmurelease(Proc *proc)
{
	MMU *p;

	if((p = proc->kmaptail) != nil){
		if((p->next = proc->mmuhead) == nil)
			proc->mmutail = p;
		proc->mmuhead = proc->kmaphead;
		proc->mmucount += proc->kmapcount;
		proc->kmaphead = proc->kmaptail = nil;
		proc->kmapcount = proc->kmapindex = 0;
	}
	mmufree(proc);
	taskswitch((uintptr)m+MACHSIZE);
}
void
putmmu(uintptr va, uintptr pa, Page *)
{
	uintptr *pte;

	pte = mmuwalk(m->pml4, va, 0, 1);
	if(pte == 0)
		panic("putmmu: bug: va=%#p pa=%#p", va, pa);
/*
 * Double-check the user MMU.
 * Error checking only.
 */
void
checkmmu(uintptr va, uintptr pa)
{
	uintptr *pte, old;

	pte = mmuwalk(m->pml4, va, 0, 0);
	if(pte == 0 || ((old = *pte) & PTEVALID) == 0 || PPN(old) == pa)
		return;
	print("%ld %s: va=%#p pa=%#p pte=%#p\n", up->pid, up->text, va, pa, old);
}
KMap*
kmap(Page *page)
{
	uintptr *pte, pa, va;

	pa = page->pa;
	if(cankaddr(pa) != 0)
		return (KMap*)KADDR(pa);

	va = KMAP + (((uintptr)up->kmapindex++ << PGSHIFT) & (KMAPSIZE-1));
	pte = mmuwalk(m->pml4, va, 0, 1);
	if(pte == 0 || (*pte & PTEVALID) != 0)
		panic("kmap: pa=%#p va=%#p", pa, va);
	*pte = pa | PTEWRITE|PTENOEXEC|PTEVALID;
	return (KMap*)va;
}
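/*
 * kmapindex cycles through the KMAP window round-robin; the panic
 * above fires if the window wraps onto a mapping that is still live.
 */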
void
kunmap(KMap *k)
{
	pte = mmuwalk(m->pml4, va, 0, 0);
	if(pte == 0 || (*pte & PTEVALID) == 0)
		panic("kunmap: va=%#p", va);
/*
 * Add a device mapping to the vmap range.
 * note that the VMAP and KZERO PDPs are shared
 * between processors (see mpstartap) so no
 * synchronization is being done.
 */
void*
vmap(uvlong pa, int size)
{
	uintptr va;
	int o;

	if(pa < BY2PG || size <= 0 || -pa < size || pa+size > VMAPSIZE){
		print("vmap pa=%llux size=%d pc=%#p\n", pa, size, getcallerpc(&pa));
		return nil;
	}
	va = pa+VMAP;
	/*
	 * might be asking for less than a page.
	 */
	o = pa & (BY2PG-1);
	pa -= o;
	va -= o;
	size += o;
	pmap(pa | PTEUNCACHED|PTEWRITE|PTENOEXEC|PTEVALID, va, size);
	return (void*)(va+o);
}
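/*
 * Example (hypothetical pa): vmap(0xFED00003, 8) computes o = 3, maps
 * the page at physical 0xFED00000 to VMAP+0xFED00000, and returns that
 * virtual address plus 3, preserving the caller's byte offset within
 * the page.
 */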
void
vunmap(void *v, int size)
{
	paddr(v);	/* will panic on error */
	punmap((uintptr)v, size);
}
/*
 * mark pages as write combining (used for framebuffer)
 */
void
patwc(void *a, int n)
{
	uintptr *pte, mask, attr, va;
	vlong v;
	int z, l;

	/* check if pat is usable */
	if((MACHP(0)->cpuiddx & Pat) == 0
	|| rdmsr(0x277, &v) == -1
	|| ((v >> PATWC*8) & 7) != 1)
		return;

	/* set the bits for all pages in range */
	for(va = (uintptr)a; n > 0; n -= z, va += z){
		l = 0;
		pte = mmuwalk(m->pml4, va, l, 0);
		if(pte == 0)
			pte = mmuwalk(m->pml4, va, ++l, 0);
		if(pte == 0 || (*pte & PTEVALID) == 0)
			panic("patwc: va=%#p", va);
		z = PGLSZ(l);
		z -= va & (z-1);
		mask = l == 0 ? 3<<3 | 1<<7 : 3<<3 | 1<<12;
		attr = (((PATWC&3)<<3) | ((PATWC&4)<<5) | ((PATWC&4)<<10));
		*pte = (*pte & ~mask) | (attr & mask);
	}
}
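/*
 * The PAT index is encoded in PWT (bit 3), PCD (bit 4) and the PAT
 * bit, which sits at bit 7 in 4K entries but at bit 12 in 2M ones;
 * hence the two masks above.
 */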
/*
 * The palloc.pages array and mmupool can be a large chunk
 * out of the 2GB window above KZERO, so we allocate from
 * upages and map in the VMAP window before pageinit()
 */
void
preallocpages(void)
{
	Pallocmem *pm;
	uintptr va, base, top;
	vlong tsize, psize;
	ulong np, nt;
	int i;

	np = 0;
	for(i=0; i<nelem(palloc.mem); i++){
		pm = &palloc.mem[i];
		np += pm->npage;
	}
	nt = np / 50;	/* 2% for mmupool */
	np -= nt;

	nt = (uvlong)nt*BY2PG / (sizeof(MMU)+PTSZ);
	tsize = (uvlong)nt * (sizeof(MMU)+PTSZ);

	psize = (uvlong)np * BY2PG;
	psize += sizeof(Page) + BY2PG;
	psize = (psize / (sizeof(Page)+BY2PG)) * sizeof(Page);
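	/*
	 * Each user page costs BY2PG of memory plus sizeof(Page) of
	 * bookkeeping, so dividing the budget by their sum and
	 * multiplying by sizeof(Page) gives the size of the Page
	 * array for the pages that still fit in the remainder.
	 */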
	psize += tsize;
	psize = ROUND(psize, PGLSZ(1));

	for(i=0; i<nelem(palloc.mem); i++){
		pm = &palloc.mem[i];
		base = ROUND(pm->base, PGLSZ(1));
		top = pm->base + (uvlong)pm->npage * BY2PG;
		if((base + psize) <= VMAPSIZE && (vlong)(top - base) >= psize){
			pm->base = base + psize;
			pm->npage = (top - pm->base)/BY2PG;

			va = base + VMAP;
			pmap(base | PTEGLOBAL|PTEWRITE|PTENOEXEC|PTEVALID, va, psize);

			palloc.pages = (void*)(va + tsize);

			mmupool.nfree = mmupool.nalloc = nt;
			mmupool.free = (void*)(va + (uvlong)nt*PTSZ);
			for(i=0; i<nt; i++){
				mmupool.free[i].page = (uintptr*)va;
				mmupool.free[i].next = &mmupool.free[i+1];
				va += PTSZ;
			}
			mmupool.free[i-1].next = nil;
			break;
		}
	}
}