author     mischief <mischief@offblast.org>  2014-06-24 18:02:25 -0700
committer  mischief <mischief@offblast.org>  2014-06-24 18:02:25 -0700
commit     5ba95fdb07ddc2c32111a1b2f57f17aa27fcbbf5
tree       c1ec54cb9ecff85b0b820a26d26a10a32a118d0c  /sys/src/9/xen/mmu.c
parent     fa03455b5057675b18d1c87aef2d1071b2088de0
import xen 32 bit paravirtual kernel from /n/sources/xen.
Diffstat (limited to 'sys/src/9/xen/mmu.c')
-rw-r--r--  sys/src/9/xen/mmu.c  595
1 file changed, 595 insertions, 0 deletions
diff --git a/sys/src/9/xen/mmu.c b/sys/src/9/xen/mmu.c
new file mode 100644
index 000000000..b3ce84074
--- /dev/null
+++ b/sys/src/9/xen/mmu.c
@@ -0,0 +1,595 @@
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"io.h"
+
+int paemode;
+uvlong *xenpdpt;	/* this needs to go in Mach for multiprocessor guest */
+
+#define LOG(a)
+#define PUTMMULOG(a)
+#define MFN(pa)		(patomfn[(pa)>>PGSHIFT])
+#define MAPPN(x)	(paemode? matopfn[*(uvlong*)(&x)>>PGSHIFT]<<PGSHIFT : matopfn[(x)>>PGSHIFT]<<PGSHIFT)
+
+#define	DATASEGM(p)	{ 0xFFFF, SEGG|SEGB|(0xF<<16)|SEGP|SEGPL(p)|SEGDATA|SEGW }
+#define	EXECSEGM(p)	{ 0xFFFF, SEGG|SEGD|(0xF<<16)|SEGP|SEGPL(p)|SEGEXEC|SEGR }
+#define	TSSSEGM(b,p)	{ ((b)<<16)|sizeof(Tss),\
+			  ((b)&0xFF000000)|(((b)>>16)&0xFF)|SEGTSS|SEGPL(p)|SEGP }
+
+Segdesc gdt[NGDT] =
+{
+[NULLSEG]	{ 0, 0},		/* null descriptor */
+[KDSEG]		DATASEGM(0),		/* kernel data/stack */
+[KESEG]		EXECSEGM(0),		/* kernel code */
+[UDSEG]		DATASEGM(3),		/* user data/stack */
+[UESEG]		EXECSEGM(3),		/* user code */
+[TSSSEG]	TSSSEGM(0,0),		/* tss segment */
+};
+
+/* note: pdb must already be pinned */
+static void
+taskswitch(Page *pdb, ulong stack)
+{
+	Tss *tss;
+
+	tss = m->tss;
+	tss->ss0 = KDSEL;
+	tss->esp0 = stack;
+	tss->ss1 = KDSEL;
+	tss->esp1 = stack;
+	tss->ss2 = KDSEL;
+	tss->esp2 = stack;
+	//tss->cr3 = pdb;
+	HYPERVISOR_stack_switch(KDSEL, stack);
+	mmuflushtlb(pdb);
+}
+
+void
+mmuflushtlb(Page *pdb)
+{
+	int s, i;
+
+	if(!paemode){
+		if(pdb)
+			xenptswitch(pdb->pa);
+		else
+			xenptswitch(PADDR(m->pdb));
+	}else{
+		if(pdb){
+			s = splhi();
+			for(i = 0; i < 3; i++){
+				xenupdate((ulong*)&xenpdpt[i], pdb->pa | PTEVALID);
+				pdb = pdb->next;
+			}
+			splx(s);
+		}else{
+			s = splhi();
+			for(i = 0; i < 3; i++)
+				xenupdatema((ulong*)&xenpdpt[i], ((uvlong*)m->pdb)[i]);
+			splx(s);
+		}
+		xentlbflush();
+	}
+}
+
+/*
+ * On processors that support it, we set the PTEGLOBAL bit in
+ * page table and page directory entries that map kernel memory.
+ * Doing this tells the processor not to bother flushing them
+ * from the TLB when doing the TLB flush associated with a
+ * context switch (write to CR3).  Since kernel memory mappings
+ * are never removed, this is safe.  (If we ever remove kernel memory
+ * mappings, we can do a full flush by turning off the PGE bit in CR4,
+ * writing to CR3, and then turning the PGE bit back on.)
+ *
+ * See also mmukmap below.
+ *
+ * Processor support for the PTEGLOBAL bit is enabled in devarch.c.
+ */
+static void
+memglobal(void)
+{
+	int i, j;
+	ulong *pde, *pte;
+
+	/* only need to do this once, on bootstrap processor */
+	if(m->machno != 0)
+		return;
+
+	if(!m->havepge)
+		return;
+
+	pde = m->pdb;
+	for(i=512; i<1024; i++){	/* 512: start at entry for virtual 0x80000000 */
+		if(pde[i] & PTEVALID){
+			pde[i] |= PTEGLOBAL;
+			if(!(pde[i] & PTESIZE)){
+				pte = KADDR(pde[i]&~(BY2PG-1));
+				for(j=0; j<1024; j++)
+					if(pte[j] & PTEVALID)
+						pte[j] |= PTEGLOBAL;
+			}
+		}
+	}
+}
+
+ulong
+mmumapframe(ulong va, ulong mfn)
+{
+	ulong *pte, pdbx;
+	uvlong ma;
+
+	/*
+	 * map machine frame number to a virtual address.
+	 * When called the pagedir and page table exist, we just
+	 * need to fill in a page table entry.
+	 */
+	ma = ((uvlong)mfn<<PGSHIFT) | PTEVALID|PTEWRITE;
+	pdbx = PDX(va);
+	pte = KADDR(MAPPN(PDB(m->pdb,va)[pdbx]));
+	xenupdatema(&pte[PTX(va)], ma);
+	return va;
+}
+
+void
+mmumapcpu0(void)
+{
+	ulong *pdb, *pte, va, pa, pdbx;
+
+	if(strstr(xenstart->magic, "x86_32p"))
+		paemode = 1;
+	hypervisor_virt_start = paemode ? 0xF5800000 : 0xFC000000;
+	patomfn = (ulong*)xenstart->mfn_list;
+	matopfn = (ulong*)hypervisor_virt_start;
+	/* Xen bug ? can't touch top entry in PDPT */
+	if(paemode)
+		hypervisor_virt_start = 0xC0000000;
+
+	/*
+	 * map CPU0MACH at MACHADDR.
+	 * When called the pagedir and page table exist, we just
+	 * need to fill in a page table entry.
+	 */
+	pdb = (ulong*)xenstart->pt_base;
+	va = MACHADDR;
+	pa = PADDR(CPU0MACH) | PTEVALID|PTEWRITE;
+	pdbx = PDX(va);
+	pdb = PDB(pdb, va);
+	pte = KADDR(MAPPN(pdb[pdbx]));
+	xenupdate(&pte[PTX(va)], pa);
+}
+
+void
+mmuinit(void)
+{
+//XXX	ulong x;
+//XXX	ushort ptr[3];
+	ulong *pte, npgs, pa;
+	extern int rtsr(void);
+
+	if(paemode){
+		int i;
+		xenpdpt = (uvlong*)m->pdb;
+		m->pdb = xspanalloc(32, 32, 0);
+		/* clear "reserved" bits in initial page directory pointers -- Xen bug? */
+		for(i = 0; i < 4; i++)
+			((uvlong*)m->pdb)[i] = xenpdpt[i] & ~0x1E6LL;
+	}
+
+	/*
+	 * So far only memory up to xentop is mapped, map the rest.
+	 * We cant use large pages because our contiguous PA space
+	 * is not necessarily contiguous in MA.
+	 */
+	npgs = conf.mem[0].npage;
+	for(pa=conf.mem[0].base; npgs; npgs--, pa+=BY2PG) {
+		pte = mmuwalk(m->pdb, (ulong)KADDR(pa), 2, 1);
+		if(!pte)
+			panic("mmuinit");
+		xenupdate(pte, pa|PTEVALID|PTEWRITE);
+	}
+
+	memglobal();
+
+	m->tss = malloc(sizeof(Tss));
+	memset(m->tss, 0, sizeof(Tss));
+	m->tss->iomap = 0xDFFF<<16;
+
+	/*
+	 * We used to keep the GDT in the Mach structure, but it
+	 * turns out that that slows down access to the rest of the
+	 * page.  Since the Mach structure is accessed quite often,
+	 * it pays off anywhere from a factor of 1.25 to 2 on real
+	 * hardware to separate them (the AMDs are more sensitive
+	 * than Intels in this regard).  Under VMware it pays off
+	 * a factor of about 10 to 100.
+	 */
+
+#ifdef we_dont_set_gdt_or_lidt
+	memmove(m->gdt, gdt, sizeof gdt);
+	x = (ulong)m->tss;
+	m->gdt[TSSSEG].d0 = (x<<16)|sizeof(Tss);
+	m->gdt[TSSSEG].d1 = (x&0xFF000000)|((x>>16)&0xFF)|SEGTSS|SEGPL(0)|SEGP;
+
+	ptr[0] = sizeof(gdt)-1;
+	x = (ulong)m->gdt;
+	ptr[1] = x & 0xFFFF;
+	ptr[2] = (x>>16) & 0xFFFF;
+	lgdt(ptr);
+
+	ptr[0] = sizeof(Segdesc)*256-1;
+	x = IDTADDR;
+	ptr[1] = x & 0xFFFF;
+	ptr[2] = (x>>16) & 0xFFFF;
+	lidt(ptr);
+#endif
+
+#ifdef we_may_eventually_want_this
+	/* make kernel text unwritable */
+	for(x = KTZERO; x < (ulong)etext; x += BY2PG){
+		p = mmuwalk(m->pdb, x, 2, 0);
+		if(p == nil)
+			panic("mmuinit");
+		*p &= ~PTEWRITE;
+	}
+#endif
+
+	taskswitch(0, (ulong)m + BY2PG);
+#ifdef we_dont_do_this
+	ltr(TSSSEL);
+#endif
+}
+
+void
+flushmmu(void)
+{
+	int s;
+
+	s = splhi();
+	up->newtlb = 1;
+	mmuswitch(up);
+	splx(s);
+}
+
+static ulong*
+mmupdb(Page *pg, ulong va)
+{
+	int i;
+
+	for(i = PAX(va); i > 0; i -= 2)
+		pg = pg->next;
+	return (ulong*)pg->va;
+}
+
+/* this can be called with an active pdb, so use Xen calls to zero it out.
+ */
+static void
+mmuptefree(Proc* proc)
+{
+	ulong *pdb, va;
+	Page **last, *page;
+
+	if(proc->mmupdb && proc->mmuused){
+		last = &proc->mmuused;
+		for(page = *last; page; page = page->next){
+			/* this is no longer a pte page so make it readwrite */
+			va = page->daddr;
+			pdb = mmupdb(proc->mmupdb, va);
+			xenupdatema(&pdb[PDX(va)], 0);
+			xenptunpin(page->va);
+			last = &page->next;
+		}
+		*last = proc->mmufree;
+		proc->mmufree = proc->mmuused;
+		proc->mmuused = 0;
+	}
+}
+
+void
+mmuswitch(Proc* proc)
+{
+	//ulong *pdb;
+
+	if(proc->newtlb){
+		mmuptefree(proc);
+		proc->newtlb = 0;
+	}
+
+	if(proc->mmupdb){
+		//XXX doesn't work for some reason, but it's not needed for uniprocessor
+		//pdb = (ulong*)proc->mmupdb->va;
+		//xenupdate(&pdb[PDX(MACHADDR)], m->pdb[PDX(MACHADDR)]);
+		taskswitch(proc->mmupdb, (ulong)(proc->kstack+KSTACK));
+	}
+	else
+		taskswitch(0, (ulong)(proc->kstack+KSTACK));
+}
+
+void
+mmurelease(Proc* proc)
+{
+	Page *page, *next;
+
+	/*
+	 * Release any pages allocated for a page directory base or page-tables
+	 * for this process:
+	 *   switch to the prototype pdb for this processor (m->pdb);
+	 *   call mmuptefree() to place all pages used for page-tables (proc->mmuused)
+	 *   onto the process' free list (proc->mmufree). This has the side-effect of
+	 *   cleaning any user entries in the pdb (proc->mmupdb);
+	 *   if there's a pdb put it in the cache of pre-initialised pdb's
+	 *   for this processor (m->pdbpool) or on the process' free list;
+	 *   finally, place any pages freed back into the free pool (palloc).
+	 * This routine is only called from sched() with palloc locked.
+	 */
+	taskswitch(0, (ulong)m + BY2PG);
+	mmuptefree(proc);
+
+	if((page = proc->mmupdb) != 0){
+		proc->mmupdb = 0;
+		while(page){
+			next = page->next;
+			/* its not a page table anymore, mark it rw */
+			xenptunpin(page->va);
+			if(paemode || m->pdbcnt > 10){
+				page->next = proc->mmufree;
+				proc->mmufree = page;
+			}
+			else{
+				page->next = m->pdbpool;
+				m->pdbpool = page;
+				m->pdbcnt++;
+			}
+			page = next;
+		}
+	}
+
+	for(page = proc->mmufree; page; page = next){
+		next = page->next;
+		if(--page->ref)
+			panic("mmurelease: page->ref %d\n", page->ref);
+		pagechainhead(page);
+	}
+	if(proc->mmufree && palloc.r.p)
+		wakeup(&palloc.r);
+	proc->mmufree = 0;
+}
+
+static Page*
+mmupdballoc(ulong va, void *mpdb)
+{
+	int s;
+	Page *page;
+	Page *badpages, *pg;
+
+	s = splhi();
+	/*
+	 * All page tables must be read-only.  We will mark them
+	 * readwrite later when we free them and they are no
+	 * longer used as page tables.
+	 */
+	if(m->pdbpool == 0){
+		spllo();
+		badpages = 0;
+		for (;;) {
+			page = newpage(0, 0, 0);
+			page->va = VA(kmap(page));
+			if(mpdb)
+				memmove((void*)page->va, mpdb, BY2PG);
+			else
+				memset((void*)page->va, 0, BY2PG);
+			if (xenpgdpin(page->va))
+				break;
+			/*
+			 * XXX Plan 9 is a bit lax about putting pages on the free list when they are
+			 * still mapped (r/w) by some process's page table.  From Plan 9's point
+			 * of view this is safe because the any such process will have up->newtlb set,
+			 * so the mapping will be cleared before the process is dispatched.  But the Xen
+			 * hypervisor has no way of knowing this, so it refuses to pin the page for use
+			 * as a pagetable.
+			 */
+			if(0) print("bad pgdpin %lux va %lux copy %lux %s\n", MFN(PADDR(page->va)), va, (ulong)mpdb, up? up->text: "");
+			page->next = badpages;
+			badpages = page;
+		}
+		while (badpages != 0) {
+			pg = badpages;
+			badpages = badpages->next;
+			putpage(pg);
+		}
+	}
+	else{
+		page = m->pdbpool;
+		m->pdbpool = page->next;
+		m->pdbcnt--;
+		if (!xenpgdpin(page->va))
+			panic("xenpgdpin");
+	}
+	splx(s);
+
+	page->next = 0;
+	return page;
+}
+
+void
+checkmmu(ulong va, ulong pa)
+{
+	ulong *pdb, *pte;
+	int pdbx;
+
+	if(up->mmupdb == 0)
+		return;
+
+	pdb = mmupdb(up->mmupdb, va);
+	pdbx = PDX(va);
+	if(MAPPN(pdb[pdbx]) == 0){
+		/* okay to be empty - will fault and get filled */
+		return;
+	}
+
+	pte = KADDR(MAPPN(pdb[pdbx]));
+	if(MAPPN(pte[PTX(va)]) != pa){
+		if(!paemode)
+			print("%ld %s: va=0x%08lux pa=0x%08lux pte=0x%08lux (0x%08lux)\n",
+				up->pid, up->text,
+				va, pa, pte[PTX(va)], MAPPN(pte[PTX(va)]));
+		else
+			print("%ld %s: va=0x%08lux pa=0x%08lux pte=0x%16llux (0x%08lux)\n",
+				up->pid, up->text,
+				va, pa, *(uvlong*)&pte[PTX(va)], MAPPN(pte[PTX(va)]));
+	}
+}
+
+void
+putmmu(ulong va, ulong pa, Page*)
+{
+	int pdbx;
+	Page *page;
+	Page *badpages, *pg;
+	ulong *pdb, *pte;
+	int i, s;
+
+	PUTMMULOG(dprint("putmmu va %lux pa %lux\n", va, pa);)
+	if(up->mmupdb == 0){
+		if(!paemode)
+			up->mmupdb = mmupdballoc(va, m->pdb);
+		else {
+			page = 0;
+			for(i = 4; i >= 0; i -= 2){
+				if(m->pdb[i])
+					pg = mmupdballoc(va, KADDR(MAPPN(m->pdb[i])));
+				else
+					pg = mmupdballoc(va, 0);
+				pg->next = page;
+				page = pg;
+			}
+			up->mmupdb = page;
+		}
+	}
+	pdb = mmupdb(up->mmupdb, va);
+	pdbx = PDX(va);
+
+	if(PPN(pdb[pdbx]) == 0){
+		PUTMMULOG(dprint("new pt page for index %d pdb %lux\n", pdbx, (ulong)pdb);)
+		/* mark page as readonly before using as a page table */
+		if(up->mmufree == 0){
+			badpages = 0;
+			for (;;) {
+				page = newpage(1, 0, 0);
+				page->va = VA(kmap(page));
+				if (xenptpin(page->va))
+					break;
+				if(0) print("bad pin %lux va %lux %s\n", MFN(PADDR(page->va)), va, up->text);
+				page->next = badpages;
+				badpages = page;
+			}
+			while (badpages != 0) {
+				pg = badpages;
+				badpages = badpages->next;
+				putpage(pg);
+			}
+		}
+		else {
+			page = up->mmufree;
+			up->mmufree = page->next;
+			memset((void*)page->va, 0, BY2PG);
+			if (!xenptpin(page->va))
+				panic("xenptpin");
+		}
+
+		xenupdate(&pdb[pdbx], page->pa|PTEVALID|PTEUSER|PTEWRITE);
+
+		page->daddr = va;
+		page->next = up->mmuused;
+		up->mmuused = page;
+	}
+
+	pte = KADDR(MAPPN(pdb[pdbx]));
+	PUTMMULOG(dprint("pte %lux index %lud old %lux new %lux mfn %lux\n", (ulong)pte, PTX(va), pte[PTX(va)], pa|PTEUSER, MFN(pa));)
+	xenupdate(&pte[PTX(va)], pa|PTEUSER);
+
+	s = splhi();
+	//XXX doesn't work for some reason, but it's not needed for uniprocessor
+	//xenupdate(&pdb[PDX(MACHADDR)], m->pdb[PDX(MACHADDR)]);
+	mmuflushtlb(up->mmupdb);
+	splx(s);
+}
+
+ulong*
+mmuwalk(ulong* pdb, ulong va, int level, int create)
+{
+	ulong pa, va2, *table;
+
+	/*
+	 * Walk the page-table pointed to by pdb and return a pointer
+	 * to the entry for virtual address va at the requested level.
+	 * If the entry is invalid and create isn't requested then bail
+	 * out early. Otherwise, for the 2nd level walk, allocate a new
+	 * page-table page and register it in the 1st level.
+	 */
+	if(paemode){
+		pdb = &pdb[PAX(va)];
+		if(!(*pdb & PTEVALID)){
+			if(create == 0)
+				return 0;
+			panic("mmuwalk: missing pgdir ptr for va=%lux\n", va);
+		}
+		pdb = KADDR(MAPPN(*pdb));
+	}
+	table = &pdb[PDX(va)];
+	if(!(*table & PTEVALID) && create == 0)
+		return 0;
+
+	switch(level){
+
+	default:
+		return 0;
+
+	case 1:
+		return table;
+
+	case 2:
+		if(*table & PTESIZE)
+			panic("mmuwalk2: va %luX entry %luX\n", va, *table);
+		if(!(*table & PTEVALID)){
+			va2 = (ulong)xspanalloc(BY2PG, BY2PG, 0);
+			pa = PADDR(va2);
+			xenptpin(va2);
+			xenupdate(table, pa|PTEWRITE|PTEVALID);
+		}
+		table = KADDR(MAPPN(*table));
+
+		return &table[PTX(va)];
+	}
+}
+
+int
+mmukmapsync(ulong va)
+{
+	USED(va);
+	return 0;
+}
+
+/*
+ * More debugging.
+ */
+void
+countpagerefs(ulong *ref, int print)
+{
+	USED(ref);
+	USED(print);
+}
+
+/*
+ * Return the number of bytes that can be accessed via KADDR(pa).
+ * If pa is not a valid argument to KADDR, return 0.
+ */
+ulong
+cankaddr(ulong pa)
+{
+	if(pa >= -KZERO)
+		return 0;
+	return -KZERO - pa;
+}