path: root/sys/src/9/xen/mmu.c
author	mischief <mischief@offblast.org>	2014-06-24 18:02:25 -0700
committer	mischief <mischief@offblast.org>	2014-06-24 18:02:25 -0700
commit	5ba95fdb07ddc2c32111a1b2f57f17aa27fcbbf5 (patch)
tree	c1ec54cb9ecff85b0b820a26d26a10a32a118d0c	/sys/src/9/xen/mmu.c
parent	fa03455b5057675b18d1c87aef2d1071b2088de0 (diff)
import xen 32 bit paravirtual kernel from /n/sources/xen.
Diffstat (limited to 'sys/src/9/xen/mmu.c')
-rw-r--r--	sys/src/9/xen/mmu.c	595
1 file changed, 595 insertions, 0 deletions
diff --git a/sys/src/9/xen/mmu.c b/sys/src/9/xen/mmu.c
new file mode 100644
index 000000000..b3ce84074
--- /dev/null
+++ b/sys/src/9/xen/mmu.c
@@ -0,0 +1,595 @@
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "io.h"
+
+int paemode;
+uvlong *xenpdpt; /* this needs to go in Mach for multiprocessor guest */
+
+#define LOG(a)
+#define PUTMMULOG(a)
+#define MFN(pa) (patomfn[(pa)>>PGSHIFT])
+#define MAPPN(x) (paemode? matopfn[*(uvlong*)(&x)>>PGSHIFT]<<PGSHIFT : matopfn[(x)>>PGSHIFT]<<PGSHIFT)
+
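+/* flat 4GB data/code segment descriptors (limit 0xFFFFF in 4KB pages via SEGG, base 0) and a TSS descriptor */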
+#define DATASEGM(p) { 0xFFFF, SEGG|SEGB|(0xF<<16)|SEGP|SEGPL(p)|SEGDATA|SEGW }
+#define EXECSEGM(p) { 0xFFFF, SEGG|SEGD|(0xF<<16)|SEGP|SEGPL(p)|SEGEXEC|SEGR }
+#define TSSSEGM(b,p) { ((b)<<16)|sizeof(Tss),\
+ ((b)&0xFF000000)|(((b)>>16)&0xFF)|SEGTSS|SEGPL(p)|SEGP }
+
+Segdesc gdt[NGDT] =
+{
+[NULLSEG] { 0, 0}, /* null descriptor */
+[KDSEG] DATASEGM(0), /* kernel data/stack */
+[KESEG] EXECSEGM(0), /* kernel code */
+[UDSEG] DATASEGM(3), /* user data/stack */
+[UESEG] EXECSEGM(3), /* user code */
+[TSSSEG] TSSSEGM(0,0), /* tss segment */
+};
+
+/* note: pdb must already be pinned */
+static void
+taskswitch(Page *pdb, ulong stack)
+{
+ Tss *tss;
+
+ tss = m->tss;
+ tss->ss0 = KDSEL;
+ tss->esp0 = stack;
+ tss->ss1 = KDSEL;
+ tss->esp1 = stack;
+ tss->ss2 = KDSEL;
+ tss->esp2 = stack;
+ //tss->cr3 = pdb;
+ HYPERVISOR_stack_switch(KDSEL, stack);
+ mmuflushtlb(pdb);
+}
+
+void
+mmuflushtlb(Page *pdb)
+{
+ int s, i;
+
+ if(!paemode){
+ if(pdb)
+ xenptswitch(pdb->pa);
+ else
+ xenptswitch(PADDR(m->pdb));
+ }else{
+ if(pdb){
+ s = splhi();
+ for(i = 0; i < 3; i++){
+ xenupdate((ulong*)&xenpdpt[i], pdb->pa | PTEVALID);
+ pdb = pdb->next;
+ }
+ splx(s);
+ }else{
+ s = splhi();
+ for(i = 0; i < 3; i++)
+ xenupdatema((ulong*)&xenpdpt[i], ((uvlong*)m->pdb)[i]);
+ splx(s);
+ }
+ xentlbflush();
+ }
+}
+
+/*
+ * On processors that support it, we set the PTEGLOBAL bit in
+ * page table and page directory entries that map kernel memory.
+ * Doing this tells the processor not to bother flushing them
+ * from the TLB when doing the TLB flush associated with a
+ * context switch (write to CR3). Since kernel memory mappings
+ * are never removed, this is safe. (If we ever remove kernel memory
+ * mappings, we can do a full flush by turning off the PGE bit in CR4,
+ * writing to CR3, and then turning the PGE bit back on.)
+ *
+ * See also mmukmap below.
+ *
+ * Processor support for the PTEGLOBAL bit is enabled in devarch.c.
+ */
+static void
+memglobal(void)
+{
+ int i, j;
+ ulong *pde, *pte;
+
+ /* only need to do this once, on bootstrap processor */
+ if(m->machno != 0)
+ return;
+
+ if(!m->havepge)
+ return;
+
+ pde = m->pdb;
+ for(i=512; i<1024; i++){ /* 512: start at entry for virtual 0x80000000 */
+ if(pde[i] & PTEVALID){
+ pde[i] |= PTEGLOBAL;
+ if(!(pde[i] & PTESIZE)){
+ pte = KADDR(pde[i]&~(BY2PG-1));
+ for(j=0; j<1024; j++)
+ if(pte[j] & PTEVALID)
+ pte[j] |= PTEGLOBAL;
+ }
+ }
+ }
+}
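+
+/*
+ * A minimal sketch of the full flush described in the comment above
+ * memglobal: toggling CR4.PGE invalidates all TLB entries, global
+ * ones included.  getcr4 and putcr4 are assumed helpers (the pc port
+ * has them); under Xen the same effect would need a hypercall, so
+ * this is illustration only.
+ */
+static void
+flushglobals(void)
+{
+ ulong cr4;
+
+ cr4 = getcr4();
+ putcr4(cr4 & ~0x80); /* clear PGE: drops global TLB entries */
+ putcr4(cr4); /* restore PGE */
+}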
+
+ulong
+mmumapframe(ulong va, ulong mfn)
+{
+ ulong *pte, pdbx;
+ uvlong ma;
+
+ /*
+ * map machine frame number to a virtual address.
+ * When called the pagedir and page table exist, we just
+ * need to fill in a page table entry.
+ */
+ ma = ((uvlong)mfn<<PGSHIFT) | PTEVALID|PTEWRITE;
+ pdbx = PDX(va);
+ pte = KADDR(MAPPN(PDB(m->pdb,va)[pdbx]));
+ xenupdatema(&pte[PTX(va)], ma);
+ return va;
+}
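+
+/*
+ * Typical use (illustrative sketch; XENSHARED and the shared_info
+ * field are assumptions, not names taken from this file): map a
+ * hypervisor-provided frame, such as the shared-info page, at a
+ * fixed kernel virtual address:
+ *
+ * mmumapframe(XENSHARED, xenstart->shared_info>>PGSHIFT);
+ */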
+
+void
+mmumapcpu0(void)
+{
+ ulong *pdb, *pte, va, pa, pdbx;
+
+ if(strstr(xenstart->magic, "x86_32p"))
+ paemode = 1;
+ hypervisor_virt_start = paemode ? 0xF5800000 : 0xFC000000;
+ patomfn = (ulong*)xenstart->mfn_list;
+ matopfn = (ulong*)hypervisor_virt_start;
+ /* Xen bug? can't touch top entry in PDPT */
+ if(paemode)
+ hypervisor_virt_start = 0xC0000000;
+
+ /*
+ * map CPU0MACH at MACHADDR.
+ * When called the pagedir and page table exist, we just
+ * need to fill in a page table entry.
+ */
+ pdb = (ulong*)xenstart->pt_base;
+ va = MACHADDR;
+ pa = PADDR(CPU0MACH) | PTEVALID|PTEWRITE;
+ pdbx = PDX(va);
+ pdb = PDB(pdb, va);
+ pte = KADDR(MAPPN(pdb[pdbx]));
+ xenupdate(&pte[PTX(va)], pa);
+}
+
+void
+mmuinit(void)
+{
+//XXX ulong x;
+//XXX ushort ptr[3];
+ ulong *pte, npgs, pa;
+ extern int rtsr(void);
+
+ if(paemode){
+ int i;
+ xenpdpt = (uvlong*)m->pdb;
+ m->pdb = xspanalloc(32, 32, 0);
+ /* clear "reserved" bits in initial page directory pointers -- Xen bug? */
+ for(i = 0; i < 4; i++)
+ ((uvlong*)m->pdb)[i] = xenpdpt[i] & ~0x1E6LL;
+ }
+
+ /*
+ * So far only memory up to xentop is mapped; map the rest.
+ * We can't use large pages because our contiguous PA space
+ * is not necessarily contiguous in MA.
+ */
+ npgs = conf.mem[0].npage;
+ for(pa=conf.mem[0].base; npgs; npgs--, pa+=BY2PG) {
+ pte = mmuwalk(m->pdb, (ulong)KADDR(pa), 2, 1);
+ if(!pte)
+ panic("mmuinit");
+ xenupdate(pte, pa|PTEVALID|PTEWRITE);
+ }
+
+ memglobal();
+
+ m->tss = malloc(sizeof(Tss));
+ memset(m->tss, 0, sizeof(Tss));
+ m->tss->iomap = 0xDFFF<<16;
+
+ /*
+ * We used to keep the GDT in the Mach structure, but it
+ * turns out that that slows down access to the rest of the
+ * page. Since the Mach structure is accessed quite often,
+ * it pays off anywhere from a factor of 1.25 to 2 on real
+ * hardware to separate them (the AMDs are more sensitive
+ * than Intels in this regard). Under VMware it pays off
+ * a factor of about 10 to 100.
+ */
+
+#ifdef we_dont_set_gdt_or_lidt
+ memmove(m->gdt, gdt, sizeof gdt);
+ x = (ulong)m->tss;
+ m->gdt[TSSSEG].d0 = (x<<16)|sizeof(Tss);
+ m->gdt[TSSSEG].d1 = (x&0xFF000000)|((x>>16)&0xFF)|SEGTSS|SEGPL(0)|SEGP;
+
+ ptr[0] = sizeof(gdt)-1;
+ x = (ulong)m->gdt;
+ ptr[1] = x & 0xFFFF;
+ ptr[2] = (x>>16) & 0xFFFF;
+ lgdt(ptr);
+
+ ptr[0] = sizeof(Segdesc)*256-1;
+ x = IDTADDR;
+ ptr[1] = x & 0xFFFF;
+ ptr[2] = (x>>16) & 0xFFFF;
+ lidt(ptr);
+#endif
+
+#ifdef we_may_eventually_want_this
+ /* make kernel text unwritable */
+ for(x = KTZERO; x < (ulong)etext; x += BY2PG){
+ p = mmuwalk(m->pdb, x, 2, 0);
+ if(p == nil)
+ panic("mmuinit");
+ *p &= ~PTEWRITE;
+ }
+#endif
+
+ taskswitch(0, (ulong)m + BY2PG);
+#ifdef we_dont_do_this
+ ltr(TSSSEL);
+#endif
+}
+
+void
+flushmmu(void)
+{
+ int s;
+
+ s = splhi();
+ up->newtlb = 1;
+ mmuswitch(up);
+ splx(s);
+}
+
+static ulong*
+mmupdb(Page *pg, ulong va)
+{
+ int i;
+
+ for(i = PAX(va); i > 0; i -= 2)
+ pg = pg->next;
+ return (ulong*)pg->va;
+}
+
+/*
+ * this can be called with an active pdb, so use Xen calls to zero it out.
+ */
+static void
+mmuptefree(Proc* proc)
+{
+ ulong *pdb, va;
+ Page **last, *page;
+
+ if(proc->mmupdb && proc->mmuused){
+ last = &proc->mmuused;
+ for(page = *last; page; page = page->next){
+ /* this is no longer a pte page so make it readwrite */
+ va = page->daddr;
+ pdb = mmupdb(proc->mmupdb, va);
+ xenupdatema(&pdb[PDX(va)], 0);
+ xenptunpin(page->va);
+ last = &page->next;
+ }
+ *last = proc->mmufree;
+ proc->mmufree = proc->mmuused;
+ proc->mmuused = 0;
+ }
+}
+
+void
+mmuswitch(Proc* proc)
+{
+ //ulong *pdb;
+
+ if(proc->newtlb){
+ mmuptefree(proc);
+ proc->newtlb = 0;
+ }
+
+ if(proc->mmupdb){
+ //XXX doesn't work for some reason, but it's not needed for uniprocessor
+ //pdb = (ulong*)proc->mmupdb->va;
+ //xenupdate(&pdb[PDX(MACHADDR)], m->pdb[PDX(MACHADDR)]);
+ taskswitch(proc->mmupdb, (ulong)(proc->kstack+KSTACK));
+ }
+ else
+ taskswitch(0, (ulong)(proc->kstack+KSTACK));
+}
+
+void
+mmurelease(Proc* proc)
+{
+ Page *page, *next;
+
+ /*
+ * Release any pages allocated for a page directory base or page-tables
+ * for this process:
+ * switch to the prototype pdb for this processor (m->pdb);
+ * call mmuptefree() to place all pages used for page-tables (proc->mmuused)
+ * onto the process' free list (proc->mmufree). This has the side-effect of
+ * cleaning any user entries in the pdb (proc->mmupdb);
+ * if there's a pdb put it in the cache of pre-initialised pdb's
+ * for this processor (m->pdbpool) or on the process' free list;
+ * finally, place any pages freed back into the free pool (palloc).
+ * This routine is only called from sched() with palloc locked.
+ */
+ taskswitch(0, (ulong)m + BY2PG);
+ mmuptefree(proc);
+
+ if((page = proc->mmupdb) != 0){
+ proc->mmupdb = 0;
+ while(page){
+ next = page->next;
+ /* it's not a page table anymore, mark it rw */
+ xenptunpin(page->va);
+ if(paemode || m->pdbcnt > 10){
+ page->next = proc->mmufree;
+ proc->mmufree = page;
+ }
+ else{
+ page->next = m->pdbpool;
+ m->pdbpool = page;
+ m->pdbcnt++;
+ }
+ page = next;
+ }
+ }
+
+ for(page = proc->mmufree; page; page = next){
+ next = page->next;
+ if(--page->ref)
+ panic("mmurelease: page->ref %d\n", page->ref);
+ pagechainhead(page);
+ }
+ if(proc->mmufree && palloc.r.p)
+ wakeup(&palloc.r);
+ proc->mmufree = 0;
+}
+
+static Page*
+mmupdballoc(ulong va, void *mpdb)
+{
+ int s;
+ Page *page;
+ Page *badpages, *pg;
+
+ s = splhi();
+ /*
+ * All page tables must be read-only. We will mark them
+ * readwrite later when we free them and they are no
+ * longer used as page tables.
+ */
+ if(m->pdbpool == 0){
+ spllo();
+ badpages = 0;
+ for (;;) {
+ page = newpage(0, 0, 0);
+ page->va = VA(kmap(page));
+ if(mpdb)
+ memmove((void*)page->va, mpdb, BY2PG);
+ else
+ memset((void*)page->va, 0, BY2PG);
+ if (xenpgdpin(page->va))
+ break;
+ /*
+ * XXX Plan 9 is a bit lax about putting pages on the free list when they are
+ * still mapped (r/w) by some process's page table. From Plan 9's point
+ * of view this is safe because any such process will have up->newtlb set,
+ * so the mapping will be cleared before the process is dispatched. But the Xen
+ * hypervisor has no way of knowing this, so it refuses to pin the page for use
+ * as a pagetable.
+ */
+ if(0) print("bad pgdpin %lux va %lux copy %lux %s\n", MFN(PADDR(page->va)), va, (ulong)mpdb, up? up->text: "");
+ page->next = badpages;
+ badpages = page;
+ }
+ while (badpages != 0) {
+ pg = badpages;
+ badpages = badpages->next;
+ putpage(pg);
+ }
+ }
+ else{
+ page = m->pdbpool;
+ m->pdbpool = page->next;
+ m->pdbcnt--;
+ if (!xenpgdpin(page->va))
+ panic("xenpgdpin");
+ }
+ splx(s);
+
+ page->next = 0;
+ return page;
+}
+
+void
+checkmmu(ulong va, ulong pa)
+{
+ ulong *pdb, *pte;
+ int pdbx;
+
+ if(up->mmupdb == 0)
+ return;
+
+ pdb = mmupdb(up->mmupdb, va);
+ pdbx = PDX(va);
+ if(MAPPN(pdb[pdbx]) == 0){
+ /* okay to be empty - will fault and get filled */
+ return;
+ }
+
+ pte = KADDR(MAPPN(pdb[pdbx]));
+ if(MAPPN(pte[PTX(va)]) != pa){
+ if(!paemode)
+ print("%ld %s: va=0x%08lux pa=0x%08lux pte=0x%08lux (0x%08lux)\n",
+ up->pid, up->text,
+ va, pa, pte[PTX(va)], MAPPN(pte[PTX(va)]));
+ else
+ print("%ld %s: va=0x%08lux pa=0x%08lux pte=0x%16llux (0x%08lux)\n",
+ up->pid, up->text,
+ va, pa, *(uvlong*)&pte[PTX(va)], MAPPN(pte[PTX(va)]));
+ }
+}
+
+void
+putmmu(ulong va, ulong pa, Page*)
+{
+ int pdbx;
+ Page *page;
+ Page *badpages, *pg;
+ ulong *pdb, *pte;
+ int i, s;
+
+ PUTMMULOG(dprint("putmmu va %lux pa %lux\n", va, pa);)
+ if(up->mmupdb == 0){
+ if(!paemode)
+ up->mmupdb = mmupdballoc(va, m->pdb);
+ else {
+ page = 0;
+ for(i = 4; i >= 0; i -= 2){
+ if(m->pdb[i])
+ pg = mmupdballoc(va, KADDR(MAPPN(m->pdb[i])));
+ else
+ pg = mmupdballoc(va, 0);
+ pg->next = page;
+ page = pg;
+ }
+ up->mmupdb = page;
+ }
+ }
+ pdb = mmupdb(up->mmupdb, va);
+ pdbx = PDX(va);
+
+ if(PPN(pdb[pdbx]) == 0){
+ PUTMMULOG(dprint("new pt page for index %d pdb %lux\n", pdbx, (ulong)pdb);)
+ /* mark page as readonly before using as a page table */
+ if(up->mmufree == 0){
+ badpages = 0;
+ for (;;) {
+ page = newpage(1, 0, 0);
+ page->va = VA(kmap(page));
+ if (xenptpin(page->va))
+ break;
+ if(0) print("bad pin %lux va %lux %s\n", MFN(PADDR(page->va)), va, up->text);
+ page->next = badpages;
+ badpages = page;
+ }
+ while (badpages != 0) {
+ pg = badpages;
+ badpages = badpages->next;
+ putpage(pg);
+ }
+ }
+ else {
+ page = up->mmufree;
+ up->mmufree = page->next;
+ memset((void*)page->va, 0, BY2PG);
+ if (!xenptpin(page->va))
+ panic("xenptpin");
+ }
+
+ xenupdate(&pdb[pdbx], page->pa|PTEVALID|PTEUSER|PTEWRITE);
+
+ page->daddr = va;
+ page->next = up->mmuused;
+ up->mmuused = page;
+ }
+
+ pte = KADDR(MAPPN(pdb[pdbx]));
+ PUTMMULOG(dprint("pte %lux index %lud old %lux new %lux mfn %lux\n", (ulong)pte, PTX(va), pte[PTX(va)], pa|PTEUSER, MFN(pa));)
+ xenupdate(&pte[PTX(va)], pa|PTEUSER);
+
+ s = splhi();
+ //XXX doesn't work for some reason, but it's not needed for uniprocessor
+ //xenupdate(&pdb[PDX(MACHADDR)], m->pdb[PDX(MACHADDR)]);
+ mmuflushtlb(up->mmupdb);
+ splx(s);
+}
+
+ulong*
+mmuwalk(ulong* pdb, ulong va, int level, int create)
+{
+ ulong pa, va2, *table;
+
+ /*
+ * Walk the page-table pointed to by pdb and return a pointer
+ * to the entry for virtual address va at the requested level.
+ * If the entry is invalid and create isn't requested then bail
+ * out early. Otherwise, for the 2nd level walk, allocate a new
+ * page-table page and register it in the 1st level.
+ */
+ if(paemode){
+ pdb = &pdb[PAX(va)];
+ if(!(*pdb & PTEVALID)){
+ if(create == 0)
+ return 0;
+ panic("mmuwalk: missing pgdir ptr for va=%lux\n", va);
+ }
+ pdb = KADDR(MAPPN(*pdb));
+ }
+ table = &pdb[PDX(va)];
+ if(!(*table & PTEVALID) && create == 0)
+ return 0;
+
+ switch(level){
+
+ default:
+ return 0;
+
+ case 1:
+ return table;
+
+ case 2:
+ if(*table & PTESIZE)
+ panic("mmuwalk2: va %luX entry %luX\n", va, *table);
+ if(!(*table & PTEVALID)){
+ va2 = (ulong)xspanalloc(BY2PG, BY2PG, 0);
+ pa = PADDR(va2);
+ xenptpin(va2);
+ xenupdate(table, pa|PTEWRITE|PTEVALID);
+ }
+ table = KADDR(MAPPN(*table));
+
+ return &table[PTX(va)];
+ }
+}
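+
+/*
+ * Minimal usage sketch for mmuwalk, mirroring the mapping loop in
+ * mmuinit: mapkpage is a hypothetical helper (nothing here calls it)
+ * that finds or creates the level-2 entry for KADDR(pa) and points
+ * it at pa.
+ */
+static void
+mapkpage(ulong pa)
+{
+ ulong *pte;
+
+ pte = mmuwalk(m->pdb, (ulong)KADDR(pa), 2, 1);
+ if(pte == 0)
+  panic("mapkpage: mmuwalk failed for pa %lux", pa);
+ xenupdate(pte, pa|PTEVALID|PTEWRITE);
+}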
+
+int
+mmukmapsync(ulong va)
+{
+ USED(va);
+ return 0;
+}
+
+/*
+ * More debugging.
+ */
+void
+countpagerefs(ulong *ref, int print)
+{
+ USED(ref);
+ USED(print);
+}
+
+/*
+ * Return the number of bytes that can be accessed via KADDR(pa).
+ * If pa is not a valid argument to KADDR, return 0.
+ */
+ulong
+cankaddr(ulong pa)
+{
+ if(pa >= -KZERO)
+ return 0;
+ return -KZERO - pa;
+}
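+
+/*
+ * Worked example, assuming KZERO is 0x80000000 as in the pc port:
+ * -KZERO == 0x80000000, so cankaddr(0x00100000) == 0x7FF00000 and
+ * cankaddr(0x80000000) == 0 (not reachable through KADDR).
+ */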