summaryrefslogtreecommitdiff
path: root/sys/src/9/xen/trap.c
diff options
context:
space:
mode:
authormischief <mischief@offblast.org>2014-06-24 18:02:25 -0700
committermischief <mischief@offblast.org>2014-06-24 18:02:25 -0700
commit5ba95fdb07ddc2c32111a1b2f57f17aa27fcbbf5 (patch)
treec1ec54cb9ecff85b0b820a26d26a10a32a118d0c /sys/src/9/xen/trap.c
parentfa03455b5057675b18d1c87aef2d1071b2088de0 (diff)
import xen 32 bit paravirtual kernel from /n/sources/xen.
Diffstat (limited to 'sys/src/9/xen/trap.c')
-rw-r--r--sys/src/9/xen/trap.c1108
1 files changed, 1108 insertions, 0 deletions
diff --git a/sys/src/9/xen/trap.c b/sys/src/9/xen/trap.c
new file mode 100644
index 000000000..b47fc2c47
--- /dev/null
+++ b/sys/src/9/xen/trap.c
@@ -0,0 +1,1108 @@
+#include "u.h"
+#include "tos.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "io.h"
+#include "ureg.h"
+#include "../port/error.h"
+#include <trace.h>
+
/*
 * Compile-time debug logging switches: each wraps a dprint() call
 * site, e.g. SETUPLOG(dprint(...);), and currently expands to nothing.
 */
#define INTRLOG(a)
#define SETUPLOG(a)
#define SYSCALLLOG(a)
#define FAULTLOG(a)
#define FAULTLOGFAST(a)
#define POSTNOTELOG(a)
#define TRAPLOG(a)

/* when non-zero, fault386() dumps state and panics instead of paging */
int faultpanic = 0;

enum {
	/* trap_info_t flags */
	SPL0 = 0,		/* trap usable from kernel mode only */
	SPL3 = 3,		/* trap usable from user mode */
	EvDisable = 4,		/* disable event delivery on entry */
};

void noted(Ureg*, ulong);

static void debugbpt(Ureg*, void*);
static void fault386(Ureg*, void*);
static void safe_fault386(Ureg*, void*);
static void doublefault(Ureg*, void*);
static void unexpected(Ureg*, void*);
static void _dumpstack(Ureg*);

/* vctllock guards vctl[], the per-vector chains of trap/intr handlers */
static Lock vctllock;
static Vctl *vctl[256];

enum
{
	Ntimevec = 20		/* number of time buckets for each intr */
};
ulong intrtimes[256][Ntimevec];
+
/*
 * Register f(ureg, a) as a handler for irq.  The arch layer maps the
 * irq to a vector and enables it; handlers sharing a vector are kept
 * on a chain with the newest at the head.  Sharing handlers must agree
 * on isr/eoi.  Logs and returns on a nil handler or enable failure.
 */
void
intrenable(int irq, void (*f)(Ureg*, void*), void* a, int tbdf, char *name)
{
	int vno;
	Vctl *v;

/**/
	SETUPLOG(dprint("intrenable: irq %d, f %p, a %p, tbdf 0x%x, name %s\n",
		irq, f, a, tbdf, name);)
/**/
	if(f == nil){
		print("intrenable: nil handler for %d, tbdf 0x%uX for %s\n",
			irq, tbdf, name);
		return;
	}

	v = xalloc(sizeof(Vctl));
	v->isintr = 1;
	v->irq = irq;
	v->tbdf = tbdf;
	v->f = f;
	v->a = a;
	/* copy at most KNAMELEN-1 chars, then force termination */
	strncpy(v->name, name, KNAMELEN-1);
	v->name[KNAMELEN-1] = 0;

	ilock(&vctllock);
	vno = arch->intrenable(v);
	if(vno == -1){
		iunlock(&vctllock);
		print("intrenable: couldn't enable irq %d, tbdf 0x%uX for %s\n",
			irq, tbdf, v->name);
		xfree(v);
		return;
	}
	if(vctl[vno]){
		/* sharing a vector: isr/eoi must match the installed chain */
		if(vctl[vno]->isr != v->isr || vctl[vno]->eoi != v->eoi)
			panic("intrenable: handler: %s %s %p %p %p %p\n",
				vctl[vno]->name, v->name,
				vctl[vno]->isr, v->isr, vctl[vno]->eoi, v->eoi);
		v->next = vctl[vno];
	}
	vctl[vno] = v;
	SETUPLOG(dprint("INTRENABLE: vctl[%d] is %p\n", vno, vctl[vno]);)
	iunlock(&vctllock);
}
+
/*
 * Remove the handler previously registered with intrenable() that
 * matches (irq, f, a, tbdf, name) exactly; asserts if absent.
 * Disables the irq at the arch level when its chain becomes empty.
 * Returns 0 on success, -1 when irq-to-vector mapping is unavailable.
 */
int
intrdisable(int irq, void (*f)(Ureg *, void *), void *a, int tbdf, char *name)
{
	Vctl **pv, *v;
	int vno;

	/*
	 * For now, none of this will work with the APIC code,
	 * there is no mapping between irq and vector as the IRQ
	 * is pretty meaningless.
	 */
	if(arch->intrvecno == nil)
		return -1;
	vno = arch->intrvecno(irq);
	ilock(&vctllock);
	pv = &vctl[vno];
	/* walk the chain for the exact registration */
	while (*pv &&
		  ((*pv)->irq != irq || (*pv)->tbdf != tbdf || (*pv)->f != f || (*pv)->a != a ||
		   strcmp((*pv)->name, name)))
		pv = &((*pv)->next);
	assert(*pv);

	v = *pv;
	*pv = (*pv)->next;		/* Link out the entry */

	if(vctl[vno] == nil && arch->intrdisable != nil)
		arch->intrdisable(irq);
	iunlock(&vctllock);
	xfree(v);
	return 0;
}
+
+static long
+irqallocread(Chan*, void *vbuf, long n, vlong offset)
+{
+ char *buf, *p, str[2*(11+1)+KNAMELEN+1+1];
+ int m, vno;
+ long oldn;
+ Vctl *v;
+
+ if(n < 0 || offset < 0)
+ error(Ebadarg);
+
+ oldn = n;
+ buf = vbuf;
+ for(vno=0; vno<nelem(vctl); vno++){
+ for(v=vctl[vno]; v; v=v->next){
+ m = snprint(str, sizeof str, "%11d %11d %.*s\n", vno, v->irq, KNAMELEN, v->name);
+ if(m <= offset) /* if do not want this, skip entry */
+ offset -= m;
+ else{
+ /* skip offset bytes */
+ m -= offset;
+ p = str+offset;
+ offset = 0;
+
+ /* write at most max(n,m) bytes */
+ if(m > n)
+ m = n;
+ memmove(buf, p, m);
+ n -= m;
+ buf += m;
+
+ if(n == 0)
+ return oldn;
+ }
+ }
+ }
+ return oldn - n;
+}
+
+void
+trapenable(int vno, void (*f)(Ureg*, void*), void* a, char *name)
+{
+ Vctl *v;
+
+ if(vno < 0 || vno >= VectorPIC)
+ panic("trapenable: vno %d\n", vno);
+ v = xalloc(sizeof(Vctl));
+ v->tbdf = BUSUNKNOWN;
+ v->f = f;
+ v->a = a;
+ strncpy(v->name, name, KNAMELEN);
+ v->name[KNAMELEN-1] = 0;
+
+ lock(&vctllock);
+ if(vctl[vno])
+ v->next = vctl[vno]->next;
+ vctl[vno] = v;
+ unlock(&vctllock);
+}
+
/*
 * Re-arm NMI delivery via the legacy keyboard-controller ports.
 * Compiled out (NOWAY undefined): a paravirtual domU cannot touch
 * these ports; kept in case plan 9 ever runs as dom0.
 */
static void
nmienable(void)
{
	/* leave this here in case plan 9 ever makes it to dom0 */
#ifdef NOWAY
	/*
	 * Hack: should be locked with NVRAM access.
	 */
	outb(0x70, 0x80);		/* NMI latch clear */
	outb(0x70, 0);

	x = inb(0x61) & 0x07;		/* Enable NMI */
	outb(0x61, 0x08|x);
	outb(0x61, x);
#endif
}
+
/* we started out doing the 'giant bulk init' for all traps.
 * we're going to do them one-by-one since error analysis is
 * so much easier that way.
 *
 * Registers the Xen event/failsafe callbacks, installs all 256 trap
 * vectors one hypercall at a time, hooks the special traps, and
 * exposes /dev/irqalloc.
 */
void
trapinit(void)
{
	trap_info_t t[2];
	ulong vaddr;
	int v, flag;

	/* route event and failsafe upcalls from Xen to our stubs */
	HYPERVISOR_set_callbacks(
		KESEL, (ulong)hypervisor_callback,
		KESEL, (ulong)failsafe_callback);

	/* XXX rework as single hypercall once debugged */
	t[1].address = 0;	/* zero address terminates the table */
	vaddr = (ulong)vectortable;
	for(v = 0; v < 256; v++){
		switch(v){
		case VectorBPT:
		case VectorSYSCALL:
			/* user mode may raise these directly */
			flag = SPL3 | EvDisable;
			break;
		default:
			flag = SPL0 | EvDisable;
			break;
		}
		t[0] = (trap_info_t){ v, flag, KESEL, vaddr };
		if(HYPERVISOR_set_trap_table(t) < 0)
			panic("trapinit: FAIL: try to set: 0x%x, 0x%x, 0x%x, 0x%ulx\n",
				t[0].vector, t[0].flags, t[0].cs, t[0].address);
		vaddr += 6;	/* vectortable stubs are 6 bytes apart */
	}

	/*
	 * Special traps.
	 * Syscall() is called directly without going through trap().
	 */
	trapenable(VectorBPT, debugbpt, 0, "debugpt");
	trapenable(VectorPF, fault386, 0, "fault386");
	trapenable(Vector2F, doublefault, 0, "doublefault");
	trapenable(Vector15, unexpected, 0, "unexpected");

	nmienable();
	addarchfile("irqalloc", 0444, irqallocread, nil);
}
+
/* names of the x86 processor exceptions, indexed by vector 0-31 */
static char* excname[32] = {
	"divide error",
	"debug exception",
	"nonmaskable interrupt",
	"breakpoint",
	"overflow",
	"bounds check",
	"invalid opcode",
	"coprocessor not available",
	"double fault",
	"coprocessor segment overrun",
	"invalid TSS",
	"segment not present",
	"stack exception",
	"general protection violation",
	"page fault",
	"15 (reserved)",
	"coprocessor error",
	"alignment check",
	"machine check",
	"19 (reserved)",
	"20 (reserved)",
	"21 (reserved)",
	"22 (reserved)",
	"23 (reserved)",
	"24 (reserved)",
	"25 (reserved)",
	"26 (reserved)",
	"27 (reserved)",
	"28 (reserved)",
	"29 (reserved)",
	"30 (reserved)",
	"31 (reserved)",
};
+
+/*
+ * keep histogram of interrupt service times
+ */
+void
+intrtime(Mach*, int vno)
+{
+ ulong diff;
+ ulong x;
+
+ x = perfticks();
+ diff = x - m->perf.intrts;
+ m->perf.intrts = x;
+
+ m->perf.inintr += diff;
+ if(up == nil && m->perf.inidle > diff)
+ m->perf.inidle -= diff;
+
+ diff /= m->cpumhz*100; // quantum = 100µsec
+ if(diff >= Ntimevec)
+ diff = Ntimevec-1;
+ intrtimes[vno][diff]++;
+}
+
/* go to user space */
void
kexit(Ureg*)
{
	uvlong t;
	Tos *tos;

	/* precise time accounting, kernel exit */
	/* Tos is the kernel/user shared data page at the top of the user stack */
	tos = (Tos*)(USTKTOP-sizeof(Tos));
	cycles(&t);
	tos->kcycles += t - up->kentry;
	tos->pcycles = up->pcycles;
	tos->pid = up->pid;
	INTRLOG(dprint("leave kexit, TOS %p\n", tos);)
}
+
+/*
+ * All traps come here. It is slower to have all traps call trap()
+ * rather than directly vectoring the handler. However, this avoids a
+ * lot of code duplication and possible bugs. The only exception is
+ * VectorSYSCALL.
+ * Trap is called with interrupts (and events) disabled via interrupt-gates.
+ */
+void
+trap(Ureg* ureg)
+{
+ int clockintr, i, vno, user;
+ char buf[ERRMAX];
+ Vctl *ctl, *v;
+ Mach *mach;
+
+ TRAPLOG(dprint("trap ureg %lux %lux\n", (ulong*)ureg, ureg->trap);)
+ m->perf.intrts = perfticks();
+ user = (ureg->cs & 0xFFFF) == UESEL;
+ if(user){
+ up->dbgreg = ureg;
+ cycles(&up->kentry);
+ }
+
+ clockintr = 0;
+
+ vno = ureg->trap;
+ if(vno < 0 || vno >= 256)
+ panic("bad interrupt number %d\n", vno);
+ TRAPLOG(dprint("trap: vno is 0x%x, vctl[%d] is %p\n", vno, vno, vctl[vno]);)
+ if(ctl = vctl[vno]){
+ INTRLOG(dprint("ctl is %p, isintr is %d\n", ctl, ctl->isintr);)
+ if(ctl->isintr){
+ m->intr++;
+ if(vno >= VectorPIC && vno != VectorSYSCALL)
+ m->lastintr = ctl->irq;
+ }
+
+ INTRLOG(dprint("ctl %p, isr %p\n", ctl, ctl->isr);)
+ if(ctl->isr)
+ ctl->isr(vno);
+ for(v = ctl; v != nil; v = v->next){
+ INTRLOG(dprint("ctl %p, f is %p\n", v, v->f);)
+ if(v->f)
+ v->f(ureg, v->a);
+ }
+ INTRLOG(dprint("ctl %p, eoi %p\n", ctl, ctl->eoi);)
+ if(ctl->eoi)
+ ctl->eoi(vno);
+
+ if(ctl->isintr){
+ intrtime(m, vno);
+
+ //if(ctl->irq == IrqCLOCK || ctl->irq == IrqTIMER)
+ if (ctl->tbdf != BUSUNKNOWN && ctl->irq == VIRQ_TIMER)
+ clockintr = 1;
+
+ if(up && !clockintr)
+ preempted();
+ }
+ }
+ else if(vno <= nelem(excname) && user){
+ spllo();
+ sprint(buf, "sys: trap: %s", excname[vno]);
+ postnote(up, 1, buf, NDebug);
+ }
+ else if(vno >= VectorPIC && vno != VectorSYSCALL){
+ /*
+ * An unknown interrupt.
+ * Check for a default IRQ7. This can happen when
+ * the IRQ input goes away before the acknowledge.
+ * In this case, a 'default IRQ7' is generated, but
+ * the corresponding bit in the ISR isn't set.
+ * In fact, just ignore all such interrupts.
+ */
+
+ /* call all interrupt routines, just in case */
+ for(i = VectorPIC; i <= MaxIrqLAPIC; i++){
+ ctl = vctl[i];
+ if(ctl == nil)
+ continue;
+ if(!ctl->isintr)
+ continue;
+ for(v = ctl; v != nil; v = v->next){
+ if(v->f)
+ v->f(ureg, v->a);
+ }
+ /* should we do this? */
+ if(ctl->eoi)
+ ctl->eoi(i);
+ }
+
+ iprint("cpu%d: spurious interrupt %d, last %d\n",
+ m->machno, vno, m->lastintr);
+ if(0)if(conf.nmach > 1){
+ for(i = 0; i < 32; i++){
+ if(!(active.machs & (1<<i)))
+ continue;
+ mach = MACHP(i);
+ if(m->machno == mach->machno)
+ continue;
+ print(" cpu%d: last %d",
+ mach->machno, mach->lastintr);
+ }
+ print("\n");
+ }
+ m->spuriousintr++;
+ if(user)
+ kexit(ureg);
+ return;
+ }
+ else{
+ if(vno == VectorNMI){
+ nmienable();
+ if(m->machno != 0){
+ print("cpu%d: PC %8.8luX\n",
+ m->machno, ureg->pc);
+ for(;;);
+ }
+ }
+ dumpregs(ureg);
+ if(!user){
+ ureg->sp = (ulong)&ureg->sp;
+ _dumpstack(ureg);
+ }
+ if(vno < nelem(excname))
+ panic("%s", excname[vno]);
+ panic("unknown trap/intr: %d\n", vno);
+ }
+ splhi();
+
+ /* delaysched set because we held a lock or because our quantum ended */
+ if(up && up->delaysched && clockintr){
+ INTRLOG(dprint("calling sched in trap? \n");)
+ sched();
+ INTRLOG(dprint("Back from calling sched in trap?\n");)
+ splhi();
+ }
+
+ if(user){
+ if(up->procctl || up->nnote)
+ notify(ureg);
+ kexit(ureg);
+ }
+
+ if (ureg->trap == 0xe) {
+ /*
+ * on page fault, we need to restore the old spl
+ * Xen won't do it for us.
+ * XXX verify this.
+ */
+ if (ureg->flags & 0x200)
+ spllo();
+ }
+}
+
/*
 * Print the general registers and segment selectors from ureg,
 * labelled with the owning process if there is one.
 */
void
dumpregs2(Ureg* ureg)
{
	if(up)
		print("cpu%d: registers for %s %lud\n",
			m->machno, up->text, up->pid);
	else
		print("cpu%d: registers for kernel\n", m->machno);
	print("FLAGS=%luX TRAP=%luX ECODE=%luX PC=%luX",
		ureg->flags, ureg->trap, ureg->ecode, ureg->pc);
	print(" SS=%4.4luX USP=%luX\n", ureg->ss & 0xFFFF, ureg->usp);
	print("  AX %8.8luX  BX %8.8luX  CX %8.8luX  DX %8.8luX\n",
		ureg->ax, ureg->bx, ureg->cx, ureg->dx);
	print("  SI %8.8luX  DI %8.8luX  BP %8.8luX\n",
		ureg->si, ureg->di, ureg->bp);
	print("  CS %4.4luX  DS %4.4luX  ES %4.4luX  FS %4.4luX  GS %4.4luX\n",
		ureg->cs & 0xFFFF, ureg->ds & 0xFFFF, ureg->es & 0xFFFF,
		ureg->fs & 0xFFFF, ureg->gs & 0xFFFF);
}
+
/*
 * dumpregs2() plus (currently skipped) control-register state;
 * reading crN directly is not possible from a paravirtual guest.
 */
void
dumpregs(Ureg* ureg)
{
	extern ulong etext;

	dumpregs2(ureg);

	/*
	 * Processor control registers.
	 * If machine check exception, time stamp counter, page size extensions
	 * or enhanced virtual 8086 mode extensions are supported, there is a
	 * CR4. If there is a CR4 and machine check extensions, read the machine
	 * check address and machine check type registers if RDMSR supported.
	 */
	print("SKIPPING get of crx and other such stuff.\n");/* */
#ifdef NOT
	print("  CR0 %8.8lux CR2 %8.8lux CR3 %8.8lux",
		getcr0(), getcr2(), getcr3());
	if(m->cpuiddx & 0x9A){
		print(" CR4 %8.8lux", getcr4());
		if((m->cpuiddx & 0xA0) == 0xA0){
			rdmsr(0x00, &mca);
			rdmsr(0x01, &mct);
			print("\n  MCA %8.8llux MCT %8.8llux", mca, mct);
		}
	}
#endif
	print("\n  ur %lux up %lux\n", (ulong)ureg, (ulong)up);
}
+
+
+/*
+ * Fill in enough of Ureg to get a stack trace, and call a function.
+ * Used by debugging interface rdb.
+ */
+void
+callwithureg(void (*fn)(Ureg*))
+{
+ Ureg ureg;
+ ureg.pc = getcallerpc(&fn);
+ ureg.sp = (ulong)&fn;
+ fn(&ureg);
+}
+
/*
 * Print a raw kernel stack trace in ktrace(1) format: every word on
 * the stack between the current frame and the top of the stack that
 * looks like a kernel text address.  Silently returns if the current
 * stack is neither the process kernel stack nor the Mach stack.
 */
static void
_dumpstack(Ureg *ureg)
{
	ulong l, v, i, estack;
	extern ulong etext;
	int x;

	if(getconf("*nodumpstack")){
		iprint("dumpstack disabled\n");
		return;
	}
	iprint("dumpstack\n");
	x = 0;
	x += print("ktrace /kernel/path %.8lux %.8lux <<EOF\n", ureg->pc, ureg->sp);
	i = 0;
	/* find the top of whichever kernel stack &l lives on */
	if(up
	&& (ulong)&l >= (ulong)up->kstack
	&& (ulong)&l <= (ulong)up->kstack+KSTACK)
		estack = (ulong)up->kstack+KSTACK;
	else if((ulong)&l >= (ulong)m->stack
	&& (ulong)&l <= (ulong)m+BY2PG)
		estack = (ulong)m+MACHSIZE;
	else
		return;
	x += print("estackx %.8lux\n", estack);

	for(l=(ulong)&l; l<estack; l+=4){
		v = *(ulong*)l;
		/* text addresses, plus everything near the stack top */
		if((KTZERO < v && v < (ulong)&etext) || estack-l<32){
			/*
			 * we could Pick off general CALL (((uchar*)v)[-5] == 0xE8)
			 * and CALL indirect through AX (((uchar*)v)[-2] == 0xFF && ((uchar*)v)[-2] == 0xD0),
			 * but this is too clever and misses faulting address.
			 */
			x += print("%.8lux=%.8lux ", l, v);
			i++;
		}
		if(i == 4){
			i = 0;
			x += print("\n");
		}
	}
	if(i)
		print("\n");
	print("EOF\n");
}
+
/* print a trace of the current kernel stack (see _dumpstack) */
void
dumpstack(void)
{
	callwithureg(_dumpstack);
}
+
+static void
+debugbpt(Ureg* ureg, void*)
+{
+ char buf[ERRMAX];
+ print("debugbpt\n");
+ if(up == 0)
+ panic("kernel bpt");
+ /* restore pc to instruction that caused the trap */
+ ureg->pc--;
+ sprint(buf, "sys: breakpoint");
+ postnote(up, 1, buf, NDebug);
+ print("debugbpt for proc %lud\n", up->pid);
+}
+
/* double fault (vector 8): unrecoverable, just panic */
static void
doublefault(Ureg*, void*)
{
	panic("double fault");
}
+
/* reserved vector 15: log and ignore */
static void
unexpected(Ureg* ureg, void*)
{
	print("unexpected trap %lud; ignoring\n", ureg->trap);
}
+
/*
 * Page-fault handler.  The faulting address comes from the Xen
 * shared-info page (the virtualized cr2), not from the real cr2.
 * Kernel faults may be satisfied by mmukmapsync(); otherwise the
 * generic fault() pager runs, and on failure a user process gets a
 * note while a kernel fault panics.
 */
static void
fault386(Ureg* ureg, void* )
{
	ulong addr;
	int read, user, n, insyscall;
	char buf[ERRMAX];

	addr = HYPERVISOR_shared_info->vcpu_info[m->machno].arch.cr2;
	if (faultpanic) {
		dprint("cr2 is 0x%lx\n", addr);
		//dumpregs(ureg);
		dumpstack();
		panic("fault386");
		exit(1);	/* NOTE(review): unreachable if panic does not return */
	}

	user = (ureg->cs & 0xFFFF) == UESEL;
	if(!user && mmukmapsync(addr))
		return;
	/* ecode bit 1: set for writes, clear for reads */
	read = !(ureg->ecode & 2);
	if(up == nil)
		panic("fault but up is zero; pc 0x%8.8lux addr 0x%8.8lux\n", ureg->pc, addr);
	insyscall = up->insyscall;
	up->insyscall = 1;	/* fault() may sleep; pretend we're in a syscall */
	n = fault(addr, read);
	if(n < 0){
		if(!user){
			dumpregs(ureg);
			panic("fault: 0x%lux\n", addr);
		}
		sprint(buf, "sys: trap: fault %s addr=0x%lux",
			read? "read" : "write", addr);
		dprint("Posting %s to %lud\n", buf, up->pid);
		postnote(up, 1, buf, NDebug);
	}
	up->insyscall = insyscall;
	FAULTLOG(dprint("fault386: all done\n");)
}
+
+/*
+ * system calls
+ */
+#include "../port/systab.h"
+
/*
 * Syscall is called directly from assembler without going through trap().
 * Fetches the syscall arguments from the user stack, dispatches through
 * systab[], stores the return value in ureg->ax, and handles tracing,
 * error recovery (waserror), notes and rescheduling on the way out.
 */
void
syscall(Ureg* ureg)
{
	char *e;
	ulong sp;
	long ret;
	int i, s;
	ulong scallnr;

	SYSCALLLOG(dprint("%d: syscall ...#%ld(%s)\n",
		up->pid, ureg->ax, sysctab[ureg->ax]);)

	if((ureg->cs & 0xFFFF) != UESEL)
		panic("syscall: cs 0x%4.4luX\n", ureg->cs);

	cycles(&up->kentry);

	m->syscall++;
	up->insyscall = 1;
	up->pc = ureg->pc;
	up->dbgreg = ureg;

	/* syscall-entry stop for a tracing debugger */
	if(up->procctl == Proc_tracesyscall){
		up->procctl = Proc_stopme;
		procctl(up);
	}

	scallnr = ureg->ax;	/* syscall number is passed in AX */
	up->scallnr = scallnr;
	if(scallnr == RFORK && up->fpstate == FPactive){
		/* park the FPU state so the child can inherit a clean copy */
		fpsave(&up->fpsave);
		up->fpstate = FPinactive;
	}
	spllo();

	sp = ureg->usp;
	up->nerrlab = 0;
	ret = -1;
	if(!waserror()){
		if(scallnr >= nsyscall || systab[scallnr] == 0){
			pprint("bad sys call number %lud pc %lux\n",
				scallnr, ureg->pc);
			postnote(up, 1, "sys: bad sys call", NDebug);
			error(Ebadarg);
		}

		/* validate the argument block unless it's clearly on-stack */
		if(sp<(USTKTOP-BY2PG) || sp>(USTKTOP-sizeof(Sargs)-BY2WD))
			validaddr(sp, sizeof(Sargs)+BY2WD, 0);

		/* copy args out of user space before use */
		up->s = *((Sargs*)(sp+BY2WD));
		up->psstate = sysctab[scallnr];

		ret = systab[scallnr]((va_list)up->s.args);
		poperror();
	}else{
		/* failure: save the error buffer for errstr */
		e = up->syserrstr;
		up->syserrstr = up->errstr;
		up->errstr = e;
		if(0 && up->pid == 1)
			print("syscall %lud error %s\n", scallnr, up->syserrstr);
	}
	if(up->nerrlab){
		print("bad errstack [%lud]: %d extra\n", scallnr, up->nerrlab);
		for(i = 0; i < NERR; i++)
			print("sp=%lux pc=%lux\n",
				up->errlab[i].sp, up->errlab[i].pc);
		panic("error stack");
	}

	SYSCALLLOG(dprint("%d: Syscall %d returns %d, ureg %p\n", up->pid, scallnr, ret, ureg);)
	/*
	 * Put return value in frame. On the x86 the syscall is
	 * just another trap and the return value from syscall is
	 * ignored. On other machines the return value is put into
	 * the results register by caller of syscall.
	 */
	ureg->ax = ret;

	/* syscall-exit stop for a tracing debugger */
	if(up->procctl == Proc_tracesyscall){
		up->procctl = Proc_stopme;
		s = splhi();
		procctl(up);
		splx(s);
	}

	up->insyscall = 0;
	up->psstate = 0;
	INTRLOG(dprint("cleared insyscall\n");)
	if(scallnr == NOTED)
		noted(ureg, *(ulong*)(sp+BY2WD));

	if(scallnr!=RFORK && (up->procctl || up->nnote)){
		splhi();
		notify(ureg);
	}
	/* if we delayed sched because we held a lock, sched now */
	if(up->delaysched)
		sched();
	INTRLOG(dprint("before kexit\n");)
	kexit(ureg);
}
+
/*
 * Call user, if necessary, with note.
 * Pass user the Ureg struct and the note on his stack.
 * Returns 1 when the user handler has been set up to run, 0 otherwise;
 * may not return at all (pexit) if the note is fatal.
 */
int
notify(Ureg* ureg)
{
	int l;
	ulong s, sp;
	Note *n;

	if(up->procctl)
		procctl(up);
	if(up->nnote == 0)
		return 0;

	if(up->fpstate == FPactive){
		fpsave(&up->fpsave);
		up->fpstate = FPinactive;
	}
	up->fpstate |= FPillegal;	/* no FP use while the note handler runs */

	s = spllo();
	qlock(&up->debug);
	up->notepending = 0;
	n = &up->note[0];
	if(strncmp(n->msg, "sys:", 4) == 0){
		/* append the trapping pc to kernel-generated notes */
		l = strlen(n->msg);
		if(l > ERRMAX-15)	/* " pc=0x12345678\0" */
			l = ERRMAX-15;
		sprint(n->msg+l, " pc=0x%.8lux", ureg->pc);
	}

	/* fatal if not a user note and there is no usable handler */
	if(n->flag!=NUser && (up->notified || up->notify==0)){
		if(n->flag == NDebug)
			pprint("suicide: %s\n", n->msg);
		qunlock(&up->debug);
		pexit(n->msg, n->flag!=NDebug);
	}

	if(up->notified){
		qunlock(&up->debug);
		splhi();
		return 0;
	}

	if(!up->notify){
		qunlock(&up->debug);
		pexit(n->msg, n->flag!=NDebug);
	}
	/* build the handler frame on the user stack: Ureg, message, args */
	sp = ureg->usp;
	sp -= sizeof(Ureg);

	if(!okaddr((ulong)up->notify, 1, 0)
	|| !okaddr(sp-ERRMAX-4*BY2WD, sizeof(Ureg)+ERRMAX+4*BY2WD, 1)){
		pprint("suicide: bad address in notify\n");
		qunlock(&up->debug);
		pexit("Suicide", 0);
	}

	up->ureg = (void*)sp;
	memmove((Ureg*)sp, ureg, sizeof(Ureg));
	*(Ureg**)(sp-BY2WD) = up->ureg;	/* word under Ureg is old up->ureg */
	up->ureg = (void*)sp;
	sp -= BY2WD+ERRMAX;
	memmove((char*)sp, up->note[0].msg, ERRMAX);
	sp -= 3*BY2WD;
	*(ulong*)(sp+2*BY2WD) = sp+3*BY2WD;	/* arg 2 is string */
	*(ulong*)(sp+1*BY2WD) = (ulong)up->ureg;	/* arg 1 is ureg* */
	*(ulong*)(sp+0*BY2WD) = 0;		/* arg 0 is pc */
	ureg->usp = sp;
	/* resume user execution in the note handler */
	ureg->pc = (ulong)up->notify;
	up->notified = 1;
	up->nnote--;
	memmove(&up->lastnote, &up->note[0], sizeof(Note));
	memmove(&up->note[0], &up->note[1], up->nnote*sizeof(Note));

	qunlock(&up->debug);
	splx(s);
	return 1;
}
+
/*
 * Return user to state before notify()
 * arg0 selects the disposition: NCONT/NRSTR resume from the saved
 * Ureg, NSAVE keeps the frame for a later noted(NRSTR), NDFLT (and
 * anything unknown) takes the note's default action.
 */
void
noted(Ureg* ureg, ulong arg0)
{
	Ureg *nureg;
	ulong oureg, sp;

	qlock(&up->debug);
	if(arg0!=NRSTR && !up->notified) {
		qunlock(&up->debug);
		pprint("call to noted() when not notified\n");
		pexit("Suicide", 0);
	}
	up->notified = 0;

	nureg = up->ureg;	/* pointer to user returned Ureg struct */

	up->fpstate &= ~FPillegal;

	/* sanity clause */
	oureg = (ulong)nureg;
	if(!okaddr((ulong)oureg-BY2WD, BY2WD+sizeof(Ureg), 0)){
		pprint("bad ureg in noted or call to noted when not notified\n");
		qunlock(&up->debug);
		pexit("Suicide", 0);
	}

	/*
	 * Check the segment selectors are all valid, otherwise
	 * a fault will be taken on attempting to return to the
	 * user process.
	 * Take care with the comparisons as different processor
	 * generations push segment descriptors in different ways.
	 */
	if((nureg->cs & 0xFFFF) != UESEL || (nureg->ss & 0xFFFF) != UDSEL
	|| (nureg->ds & 0xFFFF) != UDSEL || (nureg->es & 0xFFFF) != UDSEL
	|| (nureg->fs & 0xFFFF) != UDSEL || (nureg->gs & 0xFFFF) != UDSEL){
		pprint("bad segment selector in noted\n");
		pprint("cs is %#lux, wanted %#ux\n", nureg->cs, UESEL);
		pprint("ds is %#lux, wanted %#ux\n", nureg->ds, UDSEL);
		pprint("es is %#lux, fs is %#lux, gs %#lux, wanted %#ux\n",
			ureg->es, ureg->fs, ureg->gs, UDSEL);
		pprint("ss is %#lux, wanted %#ux\n", nureg->ss, UDSEL);
		qunlock(&up->debug);
		pexit("Suicide", 0);
	}

	/* don't let user change system flags */
	nureg->flags = (ureg->flags & ~0xCD5) | (nureg->flags & 0xCD5);

	memmove(ureg, nureg, sizeof(Ureg));

	switch(arg0){
	case NCONT:
	case NRSTR:
		if(!okaddr(nureg->pc, 1, 0) || !okaddr(nureg->usp, BY2WD, 0)){
			qunlock(&up->debug);
			pprint("suicide: trap in noted\n");
			pexit("Suicide", 0);
		}
		/* restore the up->ureg saved under the frame by notify() */
		up->ureg = (Ureg*)(*(ulong*)(oureg-BY2WD));
		qunlock(&up->debug);
		break;

	case NSAVE:
		if(!okaddr(nureg->pc, BY2WD, 0)
		|| !okaddr(nureg->usp, BY2WD, 0)){
			qunlock(&up->debug);
			pprint("suicide: trap in noted\n");
			pexit("Suicide", 0);
		}
		qunlock(&up->debug);
		/* rebuild an argument frame below the saved Ureg */
		sp = oureg-4*BY2WD-ERRMAX;
		splhi();
		ureg->sp = sp;
		((ulong*)sp)[1] = oureg;	/* arg 1 0(FP) is ureg* */
		((ulong*)sp)[0] = 0;		/* arg 0 is pc */
		break;

	default:
		pprint("unknown noted arg 0x%lux\n", arg0);
		up->lastnote.flag = NDebug;
		/* fall through */

	case NDFLT:
		if(up->lastnote.flag == NDebug){
			qunlock(&up->debug);
			pprint("suicide: %s\n", up->lastnote.msg);
		} else
			qunlock(&up->debug);
		pexit(up->lastnote.msg, up->lastnote.flag!=NDebug);
	}
}
+
/*
 * Set up the registers for a fresh exec'd image: push the argument
 * count on the new user stack and point the saved user pc/sp at the
 * entry point.  Returns the address of the kernel/user shared Tos.
 */
uintptr
execregs(uintptr entry, ulong ssize, ulong nargs)
{
	ulong *sp;
	Ureg *ureg;

	up->fpstate = FPinit;
	fpoff();

	sp = (ulong*)(USTKTOP - ssize);
	*--sp = nargs;

	ureg = up->dbgreg;
	ureg->usp = (ulong)sp;
	ureg->pc = entry;
//	print("execregs returns 0x%x\n", USTKTOP-sizeof(Tos));
	return USTKTOP-sizeof(Tos);		/* address of kernel/user shared data */
}
+
+/*
+ * return the userpc the last exception happened at
+ */
+ulong
+userpc(void)
+{
+ Ureg *ureg;
+
+ ureg = (Ureg*)up->dbgreg;
+ return ureg->pc;
+}
+
+/* This routine must save the values of registers the user is not permitted
+ * to write from devproc and then restore the saved values before returning.
+ */
+void
+setregisters(Ureg* ureg, char* pureg, char* uva, int n)
+{
+ ulong flags;
+ ulong cs;
+ ulong ss;
+
+ flags = ureg->flags;
+ cs = ureg->cs;
+ ss = ureg->ss;
+ memmove(pureg, uva, n);
+ ureg->flags = (ureg->flags & 0x00FF) | (flags & 0xFF00);
+ ureg->cs = cs;
+ ureg->ss = ss;
+}
+
/* first code run by a kernel process: call its function, then die */
static void
linkproc(void)
{
	spllo();
	up->kpfun(up->kparg);
	pexit("kproc dying", 0);
}
+
/*
 * Prime a new kernel process so its first schedule runs
 * linkproc(), which invokes func(arg).
 */
void
kprocchild(Proc* p, void (*func)(void*), void* arg)
{
	/*
	 * gotolabel() needs a word on the stack in
	 * which to place the return PC used to jump
	 * to linkproc().
	 */
	p->sched.pc = (ulong)linkproc;
	p->sched.sp = (ulong)p->kstack+KSTACK-BY2WD;

	p->kpfun = func;
	p->kparg = arg;
}
+
/*
 * Prime a forked child: copy the parent's Ureg onto the child's
 * kernel stack and arrange for it to resume in forkret() with a
 * syscall return value of 0.
 */
void
forkchild(Proc *p, Ureg *ureg)
{
	Ureg *cureg;

	/*
	 * Add 2*BY2WD to the stack to account for
	 *  - the return PC
	 *  - trap's argument (ur)
	 */
	p->sched.sp = (ulong)p->kstack+KSTACK-(sizeof(Ureg)+2*BY2WD);
	p->sched.pc = (ulong)forkret;

	cureg = (Ureg*)(p->sched.sp+2*BY2WD);
	memmove(cureg, ureg, sizeof(Ureg));
	/* return value of syscall in child */
	cureg->ax = 0;

	/* Things from bottom of syscall which were never executed */
	p->psstate = 0;
	p->insyscall = 0;
}
+
/* Give enough context in the ureg to produce a kernel stack for
 * a sleeping process
 */
void
setkernur(Ureg* ureg, Proc* p)
{
	ureg->pc = p->sched.pc;
	/* +4 presumably skips the saved return PC word — TODO confirm against gotolabel */
	ureg->sp = p->sched.sp+4;
}
+
+ulong
+dbgpc(Proc *p)
+{
+ Ureg *ureg;
+
+ ureg = p->dbgreg;
+ if(ureg == 0)
+ return 0;
+
+ return ureg->pc;
+}
+
+/*
+ * install_safe_pf_handler / install_normal_pf_handler:
+ *
+ * These are used within the failsafe_callback handler in entry.S to avoid
+ * taking a full page fault when reloading FS and GS. This is because FS and
+ * GS could be invalid at pretty much any point while Xenolinux executes (we
+ * don't set them to safe values on entry to the kernel). At *any* point Xen
+ * may be entered due to a hardware interrupt --- on exit from Xen an invalid
+ * FS/GS will cause our failsafe_callback to be executed. This could occur,
+ * for example, while the mmu_update_queue is in an inconsistent state. This
+ * is disastrous because the normal page-fault handler touches the update
+ * queue!
+ *
+ * Fortunately, within the failsafe handler it is safe to force DS/ES/FS/GS
+ * to zero if they cannot be reloaded -- at this point executing a normal
+ * page fault would not change this effect. The safe page-fault handler
+ * ensures this end result (blow away the selector value) without the dangers
+ * of the normal page-fault handler.
+ *
+ * NB. Perhaps this can all go away after we have implemented writeable
+ * page tables. :-)
+ */
+static void
+safe_fault386(Ureg* , void* ) {
+ panic("DO SAFE PAGE FAULT!\n");
+
+
+
+}
+
+unsigned long install_safe_pf_handler(void)
+{
+ dprint("called from failsafe callback\n");
+ trapenable(VectorPF, safe_fault386, 0, "safe_fault386");
+ return 0;
+}
+
+void install_normal_pf_handler(unsigned long)
+{
+ trapenable(VectorPF, fault386, 0, "fault386");
+}