/*
 * Memory mappings.  Life was easier when 2G of memory was enough.
 *
 * The kernel memory starts at KZERO, with the text loaded at KZERO+1M
 * (9load sits under 1M during the load).  The memory from KZERO to the
 * top of memory is mapped 1-1 with physical memory, starting at physical
 * address 0.  All kernel memory and data structures (i.e., the entries stored
 * into conf.mem) must sit in this physical range: if KZERO is at 0xF0000000,
 * then the kernel can only have 256MB of memory for itself.
 * (A sketch of this address arithmetic follows the includes below.)
 *
 * The 256M below KZERO comprises three parts.  The lowest 4M is the
 * virtual page table, a virtual address representation of the current
 * page table tree.  The second 4M is used for temporary per-process
 * mappings managed by kmap and kunmap.  The remaining 248M is used
 * for global (shared by all procs and all processors) device memory
 * mappings and managed by vmap and vunmap.  The total amount (256M)
 * could probably be reduced somewhat if desired.  The largest device
 * mapping is that of the video card, and even though modern video cards
 * have embarrassing amounts of memory, the video drivers only use one
 * frame buffer worth (at most 16M).  Each is described in more detail below.
 *
 * The VPT is a 4M frame constructed by inserting the pdb into itself.
 * This short-circuits one level of the page tables, with the result that
 * the contents of second-level page tables can be accessed at VPT.
 * We use the VPT to edit the page tables (see mmu) after inserting them
 * into the page directory.  It is a convenient mechanism for mapping what
 * might be otherwise-inaccessible pages.  The idea was borrowed from
 * the Exokernel.  (A lookup sketch follows the vpt macros below.)
 *
 * The VPT doesn't solve all our problems, because we still need to
 * prepare page directories before we can install them.  For that, we
 * use tmpmap/tmpunmap, which map a single page at TMPADDR.
 */
#include	"u.h"
#include	"../port/lib.h"
#include	"mem.h"
#include	"dat.h"
#include	"fns.h"
#include	"io.h"
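/*
 * Editor's sketch (not in the original source): the 1-1 mapping
 * described in the comment above makes kernel address translation
 * pure arithmetic.  The checked KADDR/PADDR helpers near the end of
 * this file reduce to exactly this.
 */
static void*
kaddrsketch(ulong pa)
{
	return (void*)(pa+KZERO);	/* physical -> kernel virtual */
}

static ulong
paddrsketch(void *v)
{
	return (ulong)v-KZERO;		/* kernel virtual -> physical */
}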
/*
 * Simple segment descriptors with no translation.
 */
#define	DATASEGM(p)	{ 0xFFFF, SEGG|SEGB|(0xF<<16)|SEGP|SEGPL(p)|SEGDATA|SEGW }
#define	EXECSEGM(p)	{ 0xFFFF, SEGG|SEGD|(0xF<<16)|SEGP|SEGPL(p)|SEGEXEC|SEGR }
#define	EXEC16SEGM(p)	{ 0xFFFF, SEGG|(0xF<<16)|SEGP|SEGPL(p)|SEGEXEC|SEGR }
#define	TSSSEGM(b,p)	{ ((b)<<16)|sizeof(Tss),\
			((b)&0xFF000000)|(((b)>>16)&0xFF)|SEGTSS|SEGPL(p)|SEGP }
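/*
 * Editor's note: DATASEGM(0), for example, expands to a flat 4GB
 * kernel data segment: word 0 holds the low 16 limit bits (0xFFFF),
 * word 1 supplies the top limit nibble (0xF<<16), page granularity
 * (SEGG), 32-bit operands (SEGB), present (SEGP), DPL 0, writable
 * data (SEGDATA|SEGW); the base is 0.
 */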
Segdesc gdt[NGDT] =
{
[NULLSEG]	{ 0, 0},		/* null descriptor */
[KDSEG]		DATASEGM(0),		/* kernel data/stack */
[KESEG]		EXECSEGM(0),		/* kernel code */
[UDSEG]		DATASEGM(3),		/* user data/stack */
[UESEG]		EXECSEGM(3),		/* user code */
[TSSSEG]	TSSSEGM(0,0),		/* tss segment */
[KESEG16]	EXEC16SEGM(0),		/* kernel code 16-bit */
};
static void taskswitch(ulong, ulong);
static void memglobal(void);

#define	vpt	((ulong*)VPT)
#define	VPTX(va)	(((ulong)(va))>>12)
#define	vpd	(vpt+VPTX(VPT))
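/*
 * Editor's sketch (not in the original): with the recursive pdb entry
 * installed by mmuinit below, the page table entry for any va is a
 * simple array reference, and vpd indexes the current page directory
 * the same way.  This is the pattern putmmu and checkmmu use.
 */
static ulong
vptlookup(ulong va)
{
	if(!(vpd[PDX(va)] & PTEVALID))
		return 0;		/* no second-level table mapped here */
	return vpt[VPTX(va)];		/* the PTE for va */
}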
enum {
	/* PAT entry used for write combining */
	PATWC	= 7,
};
void
mmuinit(void)
{
	ulong x, *p;
	ushort ptr[3];
	vlong v;

	if(0) print("vpt=%#.8ux vpd=%#p kmap=%#.8ux\n",
		VPT, vpd, KMAP);

	memglobal();
	m->pdb[PDX(VPT)] = PADDR(m->pdb)|PTEWRITE|PTEVALID;	/* install the VPT self-map */

	m->tss = mallocz(sizeof(Tss), 1);
	if(m->tss == nil)
		panic("mmuinit: no memory for Tss");
	m->tss->iomap = 0xDFFF<<16;
	/*
	 * We used to keep the GDT in the Mach structure, but it
	 * turns out that that slows down access to the rest of the
	 * page.  Since the Mach structure is accessed quite often,
	 * it pays off anywhere from a factor of 1.25 to 2 on real
	 * hardware to separate them (the AMDs are more sensitive
	 * than Intels in this regard).  Under VMware it pays off
	 * a factor of about 10 to 100.
	 */
	memmove(m->gdt, gdt, sizeof gdt);
	x = (ulong)m->tss;
	m->gdt[TSSSEG].d0 = (x<<16)|sizeof(Tss);
	m->gdt[TSSSEG].d1 = (x&0xFF000000)|((x>>16)&0xFF)|SEGTSS|SEGPL(0)|SEGP;

	/* ptr is the 6-byte pseudo-descriptor: 16-bit limit, 32-bit base */
	ptr[0] = sizeof(gdt)-1;
	x = (ulong)m->gdt;
	ptr[1] = x & 0xFFFF;
	ptr[2] = (x>>16) & 0xFFFF;
	lgdt(ptr);

	ptr[0] = sizeof(Segdesc)*256-1;
	x = IDTADDR;
	ptr[1] = x & 0xFFFF;
	ptr[2] = (x>>16) & 0xFFFF;
	lidt(ptr);
	/* make kernel text unwritable */
	for(x = KTZERO; x < (ulong)etext; x += BY2PG){
		p = mmuwalk(m->pdb, x, 2, 0);
		if(p == nil)
			panic("mmuinit");
		*p &= ~PTEWRITE;
	}

	taskswitch(PADDR(m->pdb), (ulong)m + BY2PG);
	ltr(TSSSEL);
	/* IA32_PAT write combining */
	if((MACHP(0)->cpuiddx & Pat) != 0
	&& rdmsr(0x277, &v) != -1){
		v &= ~(255LL<<(PATWC*8));
		v |= 1LL<<(PATWC*8);	/* WC type for PAT entry PATWC */
		wrmsr(0x277, v);
	}
}
/*
 * On processors that support it, we set the PTEGLOBAL bit in
 * page table and page directory entries that map kernel memory.
 * Doing this tells the processor not to bother flushing them
 * from the TLB when doing the TLB flush associated with a
 * context switch (write to CR3).  Since kernel memory mappings
 * are never removed, this is safe.  (If we ever remove kernel memory
 * mappings, we can do a full flush by turning off the PGE bit in CR4,
 * writing to CR3, and then turning the PGE bit back on; a sketch of
 * that sequence follows memglobal below.)
 *
 * See also mmukmap below.
 *
 * Processor support for the PTEGLOBAL bit is enabled in devarch.c.
 */
static void
memglobal(void)
{
	int i, j;
	ulong *pde, *pte;

	/* only need to do this once, on bootstrap processor */
	if(m->machno != 0)
		return;

	if(!m->havepge)
		return;

	pde = m->pdb;
	for(i=PDX(KZERO); i<1024; i++){
		if(pde[i] & PTEVALID){
			pde[i] |= PTEGLOBAL;
			if(!(pde[i] & PTESIZE)){
				pte = KADDR(pde[i]&~(BY2PG-1));
				for(j=0; j<1024; j++)
					if(pte[j] & PTEVALID)
						pte[j] |= PTEGLOBAL;
			}
		}
	}
}
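/*
 * Editor's sketch (not in the original source): the full flush
 * described in the comment above, should kernel mappings ever be
 * removed.  Assumes a putcr4 helper and that CR4 bit 7 is PGE.
 */
static void
flushglobal(void)
{
	ulong cr4;

	cr4 = getcr4();
	putcr4(cr4 & ~0x80);	/* clear CR4.PGE: global entries become flushable */
	putcr3(getcr3());	/* reload CR3 to flush the whole TLB */
	putcr4(cr4);		/* restore PGE */
}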
/*
 * Flush all the user-space and device-mapping mmu info
 * for this process, because something has been deleted.
 * It will be paged back in on demand.
 */
void
flushmmu(void)
{
	int s;

	s = splhi();
	up->newtlb = 1;
	mmuswitch(up);
	splx(s);
}
/*
 * Flush a single page mapping from the tlb.
 */
void
flushpg(ulong va)
{
	if(m->cpuidfamily >= 4)
		invlpg(va);		/* 486 and later have invlpg */
	else
		putcr3(getcr3());	/* 386: reload cr3 to flush everything */
}
/*
 * Allocate a new page for a page directory.
 * We keep a small cache of pre-initialized
 * page directories in each mach.
 */
static Page*
mmupdballoc(void)
{
	int s;
	Page *page;
	ulong *pdb;

	s = splhi();
	if(m->pdbpool == 0){
		spllo();
		page = newpage(0, 0, 0);
		page->va = (ulong)vpd;
		splhi();
		pdb = tmpmap(page);
		memmove(pdb, m->pdb, BY2PG);
		pdb[PDX(VPT)] = page->pa|PTEWRITE|PTEVALID;	/* set up VPT */
		tmpunmap(pdb);
	}else{
		page = m->pdbpool;
		m->pdbpool = page->next;
	}
	splx(s);
	return page;
}
static void
mmupdbfree(Proc *proc, Page *p)
{
	if(islo())
		panic("mmupdbfree: islo");
	p->daddr = 0;
	if(p->ref != 1){
		/* still referenced: let mmurelease free it */
		p->next = proc->mmufree;
		proc->mmufree = p;
	}else{
		/* clean: cache it for reuse on this processor */
		p->next = m->pdbpool;
		m->pdbpool = p;
	}
}
/*
 * A user-space memory segment has been deleted, or the
 * process is exiting.  Clear all the pde entries for user-space
 * memory mappings and device mappings.  Any entries that
 * are needed will be paged back in as necessary.
 */
static void
mmuptefree(Proc* proc)
{
	Page **last, *page;
	ulong *pdb;

	if(proc->mmupdb == nil || proc->mmuused == nil)
		return;
	pdb = tmpmap(proc->mmupdb);
	last = &proc->mmuused;
	for(page = *last; page; page = page->next){
		pdb[page->daddr] = 0;		/* clear the pde this page backed */
		last = &page->next;
	}
	tmpunmap(pdb);
	*last = proc->mmufree;			/* append mmuused to mmufree */
	proc->mmufree = proc->mmuused;
	proc->mmuused = 0;
}
static void
taskswitch(ulong pdb, ulong stack)
{
	Tss *tss;

	tss = m->tss;
	tss->ss0 = KDSEL;
	tss->esp0 = stack;
	tss->ss1 = KDSEL;
	tss->esp1 = stack;
	tss->ss2 = KDSEL;
	tss->esp2 = stack;
	putcr3(pdb);
}
void
mmuswitch(Proc* proc)
{
	ulong *pdb;
	ulong x;
	int n;

	if(proc->newtlb){
		mmuptefree(proc);
		proc->newtlb = 0;
	}

	if(proc->mmupdb != nil){
		pdb = tmpmap(proc->mmupdb);
		pdb[PDX(MACHADDR)] = m->pdb[PDX(MACHADDR)];	/* keep per-cpu Mach mapping current */
		tmpunmap(pdb);
		taskswitch(proc->mmupdb->pa, (ulong)(proc->kstack+KSTACK));
	}else
		taskswitch(PADDR(m->pdb), (ulong)(proc->kstack+KSTACK));

	memmove(&m->gdt[PROCSEG0], proc->gdt, sizeof(proc->gdt));
	if((x = (ulong)proc->ldt) && (n = proc->nldt) > 0){
		m->gdt[LDTSEG].d0 = (x<<16)|((n * sizeof(Segdesc)) - 1);
		m->gdt[LDTSEG].d1 = (x&0xFF000000)|((x>>16)&0xFF)|SEGLDT|SEGPL(0)|SEGP;
		lldt(LDTSEG<<3);
	} else
		lldt(NULLSEG<<3);
}
/*
 * Release any pages allocated for a page directory base or page-tables
 * for this process:
 *   switch to the prototype pdb for this processor (m->pdb);
 *   call mmuptefree() to place all pages used for page-tables (proc->mmuused)
 *   onto the process' free list (proc->mmufree).  This has the side-effect of
 *   cleaning any user entries in the pdb (proc->mmupdb);
 *   if there's a pdb put it in the cache of pre-initialised pdb's
 *   for this processor (m->pdbpool) or on the process' free list;
 *   finally, place any pages freed back into the free pool (palloc).
 * This routine is only called from schedinit() with palloc locked.
 */
void
mmurelease(Proc* proc)
{
	Page *page, *next;
	ulong *pdb;

	if(islo())
		panic("mmurelease: islo");
	taskswitch(PADDR(m->pdb), (ulong)m + BY2PG);
	if(proc->kmaptable != nil){
		if(proc->mmupdb == nil)
			panic("mmurelease: no mmupdb");
		if(--proc->kmaptable->ref != 0)
			panic("mmurelease: kmap ref %ld", proc->kmaptable->ref);
		if(proc->nkmap)
			panic("mmurelease: nkmap %d", proc->nkmap);
		/*
		 * remove kmaptable from pdb before putting pdb up for reuse.
		 */
		pdb = tmpmap(proc->mmupdb);
		if(PPN(pdb[PDX(KMAP)]) != proc->kmaptable->pa)
			panic("mmurelease: bad kmap pde %#.8lux kmap %#.8lux",
				pdb[PDX(KMAP)], proc->kmaptable->pa);
		pdb[PDX(KMAP)] = 0;
		tmpunmap(pdb);
		/*
		 * move kmaptable to free list.
		 */
		pagechainhead(proc->kmaptable);
		proc->kmaptable = nil;
	}
	if(proc->mmupdb != nil){
		mmuptefree(proc);
		mmupdbfree(proc, proc->mmupdb);
		proc->mmupdb = nil;
	}
	for(page = proc->mmufree; page != nil; page = next){
		next = page->next;
		if(--page->ref != 0)
			panic("mmurelease: page->ref %ld", page->ref);
		pagechainhead(page);
	}
	if(proc->mmufree != nil)
		pagechaindone();
	proc->mmufree = nil;
	if(proc->ldt != nil){
		free(proc->ldt);
		proc->ldt = nil;
		proc->nldt = 0;
	}
}
/*
 * Allocate and install pdb for the current process.
 */
static void
upallocpdb(void)
{
	int s;
	ulong *pdb;
	Page *page;

	if(up->mmupdb != nil)
		return;
	page = mmupdballoc();
	s = splhi();
	if(up->mmupdb != nil){
		/*
		 * Perhaps we got an interrupt while
		 * mmupdballoc was sleeping and that
		 * interrupt allocated an mmupdb?
		 */
		mmupdbfree(up, page);
		splx(s);
		return;
	}
	pdb = tmpmap(page);
	pdb[PDX(MACHADDR)] = m->pdb[PDX(MACHADDR)];
	tmpunmap(pdb);
	up->mmupdb = page;
	putcr3(up->mmupdb->pa);
	splx(s);
}
/*
 * Update the mmu in response to a user fault.  pa may have PTEWRITE set.
 */
void
putmmu(uintptr va, uintptr pa, Page*)
{
	int s;
	ulong old;
	Page *page;

	if(up->mmupdb == nil)
		upallocpdb();

	/*
	 * We should be able to get through this with interrupts
	 * turned on (if we get interrupted we'll just pick up
	 * where we left off) but we get many faults accessing
	 * vpt[] near the end of this function, and they always happen
	 * after the process has been switched out and then
	 * switched back, usually many times in a row (perhaps
	 * it cannot switch back successfully for some reason).
	 *
	 * In any event, I'm tired of searching for this bug.
	 * Turn off interrupts during putmmu even though
	 * we shouldn't need to.		- rsc
	 */
	s = splhi();
	if(!(vpd[PDX(va)]&PTEVALID)){
		if(up->mmufree == 0){
			spllo();
			page = newpage(0, 0, 0);
			splhi();
		}else{
			page = up->mmufree;
			up->mmufree = page->next;
		}
		vpd[PDX(va)] = PPN(page->pa)|PTEUSER|PTEWRITE|PTEVALID;
		/* page is now mapped into the VPT - clear it */
		memset((void*)(VPT+PDX(va)*BY2PG), 0, BY2PG);
		page->daddr = PDX(va);
		page->next = up->mmuused;
		up->mmuused = page;
	}
	old = vpt[VPTX(va)];
	vpt[VPTX(va)] = pa|PTEUSER|PTEVALID;
	if(old&PTEVALID)
		flushpg(va);
	if(getcr3() != up->mmupdb->pa)
		print("bad cr3 %#.8lux %#.8lux\n", getcr3(), up->mmupdb->pa);
	splx(s);
}
/*
 * Double-check the user MMU.
 * Error checking only.
 */
void
checkmmu(uintptr va, uintptr pa)
{
	if(up->mmupdb == nil)
		return;
	if(!(vpd[PDX(va)]&PTEVALID) || !(vpt[VPTX(va)]&PTEVALID))
		return;
	if(PPN(vpt[VPTX(va)]) != pa)
		print("%ld %s: va=%#p pa=%#p pte=%#08lux\n",
			up->pid, up->text,
			va, pa, vpt[VPTX(va)]);
}
/*
 * Walk the page-table pointed to by pdb and return a pointer
 * to the entry for virtual address va at the requested level.
 * If the entry is invalid and create isn't requested then bail
 * out early.  Otherwise, for the 2nd level walk, allocate a new
 * page-table page and register it in the 1st level.  This is used
 * only to edit kernel mappings, which use pages from kernel memory,
 * so it's okay to use KADDR to look at the tables.
 */
ulong*
mmuwalk(ulong* pdb, ulong va, int level, int create)
{
	ulong *table;
	void *map;

	table = &pdb[PDX(va)];
	if(!(*table & PTEVALID) && create == 0)
		return 0;

	switch(level){
	default:
		return 0;
	case 1:
		return table;
	case 2:
		if(*table & PTESIZE)
			panic("mmuwalk2: va %luX entry %luX", va, *table);
		if(!(*table & PTEVALID)){
			map = rampage();
			if(map == nil)
				panic("mmuwalk: page alloc failed");
			memset(map, 0, BY2PG);
			*table = PADDR(map)|PTEWRITE|PTEVALID;
		}
		table = KADDR(PPN(*table));
		return &table[PTX(va)];
	}
}
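/*
 * Editor's sketch (not in the original): a typical mmuwalk call,
 * the same pattern pdbmap below uses - get (or create) the level-2
 * PTE for a kernel va and install a mapping through it.
 */
static void
mmuwalkexample(ulong va, ulong pa)
{
	ulong *pte;

	pte = mmuwalk(MACHP(0)->pdb, va, 2, 1);
	if(pte == nil)
		panic("mmuwalkexample");
	*pte = pa|PTEWRITE|PTEVALID;	/* overwrite any existing entry */
	flushpg(va);
}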
/*
 * Device mappings are shared by all procs and processors and
 * live in the virtual range VMAP to VMAP+VMAPSIZE.  The master
 * copy of the mappings is stored in mach0->pdb, and they are
 * paged in from there as necessary by vmapsync during faults.
 */
static Lock vmaplock;

static int findhole(ulong *a, int n, int count);
static ulong vmapalloc(ulong size);
static int pdbmap(ulong *, ulong, ulong, int);
static void pdbunmap(ulong*, ulong, int);
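/*
 * Editor's sketch (not part of the original source): typical driver
 * use of vmap/vunmap, defined below.  The physical address here is
 * hypothetical - in practice it comes from a PCI BAR.
 */
static void
vmapexample(void)
{
	uchar *regs;

	regs = vmap(0xFE000000, BY2PG);	/* hypothetical device registers */
	if(regs == nil)
		panic("vmapexample: vmap failed");
	/* ... access the device through regs ... */
	vunmap(regs, BY2PG);
}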
/*
 * Add a device mapping to the vmap range.
 */
void*
vmap(ulong pa, int size)
{
	int osize;
	ulong o, va;

	/*
	 * might be asking for less than a page.
	 */
	osize = size;
	o = pa & (BY2PG-1);
	pa -= o;
	size += o;
	size = ROUND(size, BY2PG);

	if(pa == 0){
		print("vmap pa=0 pc=%#p\n", getcallerpc(&pa));
		return nil;
	}
	ilock(&vmaplock);
	if((va = vmapalloc(size)) == 0
	|| pdbmap(MACHP(0)->pdb, pa|PTEUNCACHED|PTEWRITE, va, size) < 0){
		iunlock(&vmaplock);
		return nil;
	}
	iunlock(&vmaplock);
	/* avoid trap on local processor
	for(i=0; i<size; i+=4*MB)
		vmapsync(va+i);
	*/
	USED(osize);
//	print(" vmap %#.8lux %d => %#.8lux\n", pa+o, osize, va+o);
	return (void*)(va + o);
}
static int
findhole(ulong *a, int n, int count)
{
	int have, i;

	have = 0;
	for(i=0; i<n; i++){
		if(a[i] == 0)
			have++;
		else
			have = 0;
		if(have >= count)
			return i+1 - have;
	}
	return -1;
}
/*
 * Look for free space in the vmap.
 */
static ulong
vmapalloc(ulong size)
{
	int i, n, o;
	ulong *vpdb;
	int vpdbsize;

	vpdb = &MACHP(0)->pdb[PDX(VMAP)];
	vpdbsize = VMAPSIZE/(4*MB);

	if(size >= 4*MB){
		n = (size+4*MB-1) / (4*MB);
		if((o = findhole(vpdb, vpdbsize, n)) != -1)
			return VMAP + o*4*MB;
		return 0;
	}
	n = (size+BY2PG-1) / BY2PG;
	for(i=0; i<vpdbsize; i++)
		if((vpdb[i]&PTEVALID) && !(vpdb[i]&PTESIZE))
			if((o = findhole(KADDR(PPN(vpdb[i])), WD2PG, n)) != -1)
				return VMAP + i*4*MB + o*BY2PG;
	if((o = findhole(vpdb, vpdbsize, 1)) != -1)
		return VMAP + o*4*MB;

	/*
	 * could span page directory entries, but not worth the trouble.
	 * not going to be very much contention.
	 */
	return 0;
}
/*
 * Remove a device mapping from the vmap range.
 * Since pdbunmap does not remove page tables, just entries,
 * the call need not be interlocked with vmap.
 */
void
vunmap(void *v, int size)
{
	int i;
	ulong va, o;
	Proc *p;
	Mach *nm;

	/*
	 * might not be aligned
	 */
	va = (ulong)v;
	o = va&(BY2PG-1);
	va -= o;
	size += o;
	size = ROUND(size, BY2PG);

	if(size < 0 || va < VMAP || va+size > VMAP+VMAPSIZE)
		panic("vunmap va=%#.8lux size=%#x pc=%#.8lux",
			va, size, getcallerpc(&v));

	pdbunmap(MACHP(0)->pdb, va, size);
	/*
	 * Flush mapping from all the tlbs and copied pdbs.
	 * This can be (and is) slow, since it is called only rarely.
	 * It is possible for vunmap to be called with up == nil,
	 * e.g. from the reset/init driver routines during system
	 * boot. In that case it suffices to flush the MACH(0) TLB
	 * and return.
	 */
	if(up == nil){
		putcr3(PADDR(MACHP(0)->pdb));
		return;
	}
	for(i=0; i<conf.nproc; i++){
		p = proctab(i);
		if(p->state == Dead)
			continue;
		if(p != up)
			p->newtlb = 1;
	}
	for(i=0; i<conf.nmach; i++){
		nm = MACHP(i);
		if(nm != m)
			nm->flushmmu = 1;
	}
	flushmmu();
	for(i=0; i<conf.nmach; i++){
		nm = MACHP(i);
		if(nm != m)
			while(nm->flushmmu)
				;
	}
}
/*
 * Add kernel mappings for pa -> va for a section of size bytes.
 */
static int
pdbmap(ulong *pdb, ulong pa, ulong va, int size)
{
	int pse;
	ulong pgsz, *pte, *table;
	ulong flag, off;

	flag = pa&0xFFF;
	pa &= ~0xFFF;

	if((MACHP(0)->cpuiddx & Pse) && (getcr4() & 0x10))
		pse = 1;
	else
		pse = 0;

	for(off=0; off<size; off+=pgsz){
		table = &pdb[PDX(va+off)];
		if((*table&PTEVALID) && (*table&PTESIZE))
			panic("vmap: va=%#.8lux pa=%#.8lux pde=%#.8lux",
				va+off, pa+off, *table);

		/*
		 * Check if it can be mapped using a 4MB page:
		 * va, pa aligned and size >= 4MB and processor can do it.
		 */
		if(pse && (pa+off)%(4*MB) == 0 && (va+off)%(4*MB) == 0 && (size-off) >= 4*MB){
			*table = (pa+off)|flag|PTESIZE|PTEVALID;
			pgsz = 4*MB;
		}else{
			pte = mmuwalk(pdb, va+off, 2, 1);
			if(*pte&PTEVALID)
				panic("vmap: va=%#.8lux pa=%#.8lux pte=%#.8lux",
					va+off, pa+off, *pte);
			*pte = (pa+off)|flag|PTEVALID;
			pgsz = BY2PG;
		}
	}
	return 0;
}
/*
 * Remove mappings.  Must already exist, for sanity.
 * Only used for kernel mappings, so okay to use KADDR.
 */
static void
pdbunmap(ulong *pdb, ulong va, int size)
{
	ulong vae;
	ulong *table;

	vae = va+size;
	while(va < vae){
		table = &pdb[PDX(va)];
		if(!(*table & PTEVALID))
			panic("vunmap: not mapped");
		if(*table & PTESIZE){
			if(va & 4*MB-1)
				panic("vunmap: misaligned: %#p", va);
			*table = 0;
			va += 4*MB;
			continue;
		}
		table = KADDR(PPN(*table));
		if(!(table[PTX(va)] & PTEVALID))
			panic("vunmap: not mapped");
		table[PTX(va)] = 0;
		va += BY2PG;
	}
}
void
pmap(ulong pa, ulong va, int size)
{
	pdbmap(MACHP(0)->pdb, pa, va, size);
}

void
punmap(ulong va, int size)
{
	pdbunmap(MACHP(0)->pdb, va, size);
	mmuflushtlb(PADDR(m->pdb));
}
/*
 * Handle a fault by bringing vmap up to date.
 * We only copy pdb entries, and they never go away,
 * so no locking is needed.
 */
int
vmapsync(ulong va)
{
	ulong entry, *table;

	if(va < VMAP || va >= VMAP+VMAPSIZE)
		return 0;

	entry = MACHP(0)->pdb[PDX(va)];
	if(!(entry&PTEVALID))
		return 0;
	if(!(entry&PTESIZE)){
		/* make sure entry will help the fault */
		table = KADDR(PPN(entry));
		if(!(table[PTX(va)]&PTEVALID))
			return 0;
	}
	vpd[PDX(va)] = entry;

	/*
	 * TLB doesn't cache negative results, so no flush needed.
	 */
	return 1;
}
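/*
 * Editor's note: vmapsync is reached from the fault handler when a
 * kernel fault lands in the VMAP range; returning 1 means the pde
 * has been copied into the current pdb and the faulting access can
 * simply be retried.
 */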
/*
 * KMap is used to map individual pages into virtual memory.
 * It is rare to have more than a few KMaps at a time (in the
 * absence of interrupts, only two at a time are ever used,
 * but interrupts can stack).  The mappings are local to a process,
 * so we can use the same range of virtual address space for
 * all processes without any coordination.
 */
#define	kpt	(vpt+VPTX(KMAP))
#define	NKPT	(KMAPSIZE/BY2PG)
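/*
 * Editor's sketch (not part of the original source): typical kmap
 * usage - map a Page, touch its contents, unmap.  Assumes the usual
 * VA() macro from dat.h.
 */
static void
kmapexample(Page *pg)
{
	KMap *k;
	uchar *v;

	k = kmap(pg);
	v = (uchar*)VA(k);
	memset(v, 0, BY2PG);	/* e.g. zero the page */
	kunmap(k);
}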
KMap*
kmap(Page *page)
{
	int i, o, s;

	if(up == nil)
		panic("kmap: up=0 pc=%#.8lux", getcallerpc(&page));
	if(up->mmupdb == nil)
		upallocpdb();
	if(up->nkmap < 0)
		panic("kmap %lud %s: nkmap=%d", up->pid, up->text, up->nkmap);

	/*
	 * Splhi shouldn't be necessary here, but paranoia reigns.
	 * See comment in putmmu above.
	 */
	s = splhi();
	up->nkmap++;
	if(!(vpd[PDX(KMAP)]&PTEVALID)){
		/* allocate page directory */
		if(KMAPSIZE > BY2XPG)
			panic("bad kmapsize");
		if(up->kmaptable != nil)
			panic("kmaptable");
		spllo();
		up->kmaptable = newpage(0, 0, 0);
		splhi();
		vpd[PDX(KMAP)] = up->kmaptable->pa|PTEWRITE|PTEVALID;
		flushpg((ulong)kpt);
		memset(kpt, 0, BY2PG);
		kpt[0] = page->pa|PTEWRITE|PTEVALID;
		up->lastkmap = 0;
		splx(s);
		return (KMap*)KMAP;
	}
	if(up->kmaptable == nil)
		panic("no kmaptable");
	o = up->lastkmap+1;
	for(i=0; i<NKPT; i++){
		if(kpt[(i+o)%NKPT] == 0){
			o = (i+o)%NKPT;
			kpt[o] = page->pa|PTEWRITE|PTEVALID;
			up->lastkmap = o;
			splx(s);
			return (KMap*)(KMAP+o*BY2PG);
		}
	}
	panic("out of kmap");
	return nil;
}
void
kunmap(KMap *k)
{
	ulong va;
	int s;

	va = (ulong)k;
	if(up->mmupdb == nil || !(vpd[PDX(KMAP)]&PTEVALID))
		panic("kunmap: no kmaps");
	if(va < KMAP || va >= KMAP+KMAPSIZE)
		panic("kunmap: bad address %#.8lux pc=%#p", va, getcallerpc(&k));
	if(!(vpt[VPTX(va)]&PTEVALID))
		panic("kunmap: not mapped %#.8lux pc=%#p", va, getcallerpc(&k));
	s = splhi();
	vpt[VPTX(va)] = 0;
	flushpg(va);
	up->nkmap--;
	if(up->nkmap < 0)
		panic("kunmap %lud %s: nkmap=%d", up->pid, up->text, up->nkmap);
	splx(s);
}
/*
 * Temporary one-page mapping used to edit page directories.
 *
 * The fasttmp #define controls whether the code optimizes
 * the case where the page is already mapped in the physical
 * memory window.
 */
#define fasttmp 1
void*
tmpmap(Page *p)
{
	ulong i;
	ulong *entry;

	if(islo())
		panic("tmpmap: islo");

	/* fast path: the page is already visible in the KZERO window */
	if(fasttmp && p->pa < -KZERO)
		return KADDR(p->pa);

	/*
	 * PDX(TMPADDR) == PDX(MACHADDR), so this
	 * entry is private to the processor and shared
	 * between up->mmupdb (if any) and m->pdb.
	 */
	entry = &vpt[VPTX(TMPADDR)];
	if(!(*entry&PTEVALID)){
		for(i=KZERO; i<=CPU0MACH; i+=BY2PG)
			print("%#p: *%#p=%#p (vpt=%#p index=%#p)\n", i, &vpt[VPTX(i)], vpt[VPTX(i)], vpt, VPTX(i));
		panic("tmpmap: no entry");
	}
	if(PPN(*entry) != PPN(TMPADDR-KZERO))
		panic("tmpmap: already mapped entry=%#.8lux", *entry);
	*entry = p->pa|PTEWRITE|PTEVALID;
	flushpg(TMPADDR);
	return (void*)TMPADDR;
}
void
tmpunmap(void *v)
{
	ulong *entry;

	if(islo())
		panic("tmpunmap: islo");
	if(fasttmp && (ulong)v >= KZERO && v != (void*)TMPADDR)
		return;
	if(v != (void*)TMPADDR)
		panic("tmpunmap: bad address");
	entry = &vpt[VPTX(TMPADDR)];
	if(!(*entry&PTEVALID) || PPN(*entry) == PPN(PADDR(TMPADDR)))
		panic("tmpunmap: not mapped entry=%#.8lux", *entry);
	*entry = PPN(TMPADDR-KZERO)|PTEWRITE|PTEVALID;	/* restore the default mapping */
	flushpg(TMPADDR);
}
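/*
 * Editor's sketch (not in the original): the canonical tmpmap usage
 * pattern, as in mmuswitch above - map, edit, unmap, all at splhi.
 */
static void
tmpexample(Page *pg)
{
	ulong *pdb;

	pdb = tmpmap(pg);	/* pg is now visible at TMPADDR */
	pdb[PDX(MACHADDR)] = m->pdb[PDX(MACHADDR)];
	tmpunmap(pdb);
}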
/*
 * These could go back to being macros once the kernel is debugged,
 * but the extra checking is nice to have.
 */
void*
kaddr(ulong pa)
{
	if(pa >= (ulong)-KZERO)
		panic("kaddr: pa=%#.8lux", pa);
	return (void*)(pa+KZERO);
}

ulong
paddr(void *v)
{
	ulong va;

	va = (ulong)v;
	if(va < KZERO)
		panic("paddr: va=%#.8lux pc=%#p", va, getcallerpc(&v));
	return va-KZERO;
}
void
checkfault(ulong, ulong)
{
}
/*
 * Return the number of bytes that can be accessed via KADDR(pa).
 * If pa is not a valid argument to KADDR, return 0.
 */
ulong
cankaddr(ulong pa)
{
	if(pa >= -KZERO)
		return 0;
	return -KZERO - pa;
}
/*
 * Mark pages as write combining (used for the framebuffer).
 */
void
patwc(void *a, int n)
{
	ulong *pte, mask, attr, va;
	vlong v;
	int z;

	/* check if pat is usable */
	if((MACHP(0)->cpuiddx & Pat) == 0
	|| rdmsr(0x277, &v) == -1
	|| ((v >> PATWC*8) & 7) != 1)
		return;

	/* set the bits for all pages in range */
	for(va = (ulong)a; n > 0; n -= z, va += z){
		pte = mmuwalk(MACHP(0)->pdb, va, 1, 0);
		if(pte && (*pte & (PTEVALID|PTESIZE)) == (PTEVALID|PTESIZE)){
			z = 4*MB - (va & (4*MB-1));
			mask = 3<<3 | 1<<12;	/* PWT, PCD, PAT (bit 12) for 4MB pages */
		} else {
			pte = mmuwalk(MACHP(0)->pdb, va, 2, 0);
			if(pte == 0 || (*pte & PTEVALID) == 0)
				panic("patwc: va=%#p", va);
			z = BY2PG - (va & (BY2PG-1));
			mask = 3<<3 | 1<<7;	/* PWT, PCD, PAT (bit 7) for 4KB pages */
		}
		attr = (((PATWC&3)<<3) | ((PATWC&4)<<5) | ((PATWC&4)<<10));
		*pte = (*pte & ~mask) | (attr & mask);
	}
}
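/*
 * Editor's note: with PATWC = 7, attr carries the low two PAT-index
 * bits in PWT/PCD (bits 3-4) and the high bit in both possible
 * PAT-bit positions (bit 7 for 4KB pages, bit 12 for 4MB pages);
 * mask then keeps only the positions valid for the page size, so
 * the entry selects PAT slot 7, which mmuinit programmed as WC.
 */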