summaryrefslogtreecommitdiff
path: root/sys/src/9/port/sysproc.c
diff options
context:
space:
mode:
authorTaru Karttunen <taruti@taruti.net>2011-03-30 15:46:40 +0300
committerTaru Karttunen <taruti@taruti.net>2011-03-30 15:46:40 +0300
commite5888a1ffdae813d7575f5fb02275c6bb07e5199 (patch)
treed8d51eac403f07814b9e936eed0c9a79195e2450 /sys/src/9/port/sysproc.c
Import sources from 2011-03-30 iso image
Diffstat (limited to 'sys/src/9/port/sysproc.c')
-rwxr-xr-xsys/src/9/port/sysproc.c1068
1 files changed, 1068 insertions, 0 deletions
diff --git a/sys/src/9/port/sysproc.c b/sys/src/9/port/sysproc.c
new file mode 100755
index 000000000..0048dc0b9
--- /dev/null
+++ b/sys/src/9/port/sysproc.c
@@ -0,0 +1,1068 @@
+#include "u.h"
+#include "tos.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "../port/error.h"
+#include "edf.h"
+
+#include <a.out.h>
+
+int shargs(char*, int, char**);
+
+extern void checkpages(void);
+extern void checkpagerefs(void);
+
+long
+sysr1(ulong*)
+{
+ checkpagerefs();
+ return 0;
+}
+
+long
+sysrfork(ulong *arg)
+{
+ Proc *p;
+ int n, i;
+ Fgrp *ofg;
+ Pgrp *opg;
+ Rgrp *org;
+ Egrp *oeg;
+ ulong pid, flag;
+ Mach *wm;
+
+ flag = arg[0];
+ /* Check flags before we commit */
+ if((flag & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
+ error(Ebadarg);
+ if((flag & (RFNAMEG|RFCNAMEG)) == (RFNAMEG|RFCNAMEG))
+ error(Ebadarg);
+ if((flag & (RFENVG|RFCENVG)) == (RFENVG|RFCENVG))
+ error(Ebadarg);
+
+ if((flag&RFPROC) == 0) {
+ if(flag & (RFMEM|RFNOWAIT))
+ error(Ebadarg);
+ if(flag & (RFFDG|RFCFDG)) {
+ ofg = up->fgrp;
+ if(flag & RFFDG)
+ up->fgrp = dupfgrp(ofg);
+ else
+ up->fgrp = dupfgrp(nil);
+ closefgrp(ofg);
+ }
+ if(flag & (RFNAMEG|RFCNAMEG)) {
+ opg = up->pgrp;
+ up->pgrp = newpgrp();
+ if(flag & RFNAMEG)
+ pgrpcpy(up->pgrp, opg);
+ /* inherit noattach */
+ up->pgrp->noattach = opg->noattach;
+ closepgrp(opg);
+ }
+ if(flag & RFNOMNT)
+ up->pgrp->noattach = 1;
+ if(flag & RFREND) {
+ org = up->rgrp;
+ up->rgrp = newrgrp();
+ closergrp(org);
+ }
+ if(flag & (RFENVG|RFCENVG)) {
+ oeg = up->egrp;
+ up->egrp = smalloc(sizeof(Egrp));
+ up->egrp->ref = 1;
+ if(flag & RFENVG)
+ envcpy(up->egrp, oeg);
+ closeegrp(oeg);
+ }
+ if(flag & RFNOTEG)
+ up->noteid = incref(&noteidalloc);
+ return 0;
+ }
+
+ p = newproc();
+
+ p->fpsave = up->fpsave;
+ p->scallnr = up->scallnr;
+ p->s = up->s;
+ p->nerrlab = 0;
+ p->slash = up->slash;
+ p->dot = up->dot;
+ incref(p->dot);
+
+ memmove(p->note, up->note, sizeof(p->note));
+ p->privatemem = up->privatemem;
+ p->noswap = up->noswap;
+ p->nnote = up->nnote;
+ p->notified = 0;
+ p->lastnote = up->lastnote;
+ p->notify = up->notify;
+ p->ureg = up->ureg;
+ p->dbgreg = 0;
+
+ /* Make a new set of memory segments */
+ n = flag & RFMEM;
+ qlock(&p->seglock);
+ if(waserror()){
+ qunlock(&p->seglock);
+ nexterror();
+ }
+ for(i = 0; i < NSEG; i++)
+ if(up->seg[i])
+ p->seg[i] = dupseg(up->seg, i, n);
+ qunlock(&p->seglock);
+ poperror();
+
+ /* File descriptors */
+ if(flag & (RFFDG|RFCFDG)) {
+ if(flag & RFFDG)
+ p->fgrp = dupfgrp(up->fgrp);
+ else
+ p->fgrp = dupfgrp(nil);
+ }
+ else {
+ p->fgrp = up->fgrp;
+ incref(p->fgrp);
+ }
+
+ /* Process groups */
+ if(flag & (RFNAMEG|RFCNAMEG)) {
+ p->pgrp = newpgrp();
+ if(flag & RFNAMEG)
+ pgrpcpy(p->pgrp, up->pgrp);
+ /* inherit noattach */
+ p->pgrp->noattach = up->pgrp->noattach;
+ }
+ else {
+ p->pgrp = up->pgrp;
+ incref(p->pgrp);
+ }
+ if(flag & RFNOMNT)
+ up->pgrp->noattach = 1;
+
+ if(flag & RFREND)
+ p->rgrp = newrgrp();
+ else {
+ incref(up->rgrp);
+ p->rgrp = up->rgrp;
+ }
+
+ /* Environment group */
+ if(flag & (RFENVG|RFCENVG)) {
+ p->egrp = smalloc(sizeof(Egrp));
+ p->egrp->ref = 1;
+ if(flag & RFENVG)
+ envcpy(p->egrp, up->egrp);
+ }
+ else {
+ p->egrp = up->egrp;
+ incref(p->egrp);
+ }
+ p->hang = up->hang;
+ p->procmode = up->procmode;
+
+ /* Craft a return frame which will cause the child to pop out of
+ * the scheduler in user mode with the return register zero
+ */
+ forkchild(p, up->dbgreg);
+
+ p->parent = up;
+ p->parentpid = up->pid;
+ if(flag&RFNOWAIT)
+ p->parentpid = 0;
+ else {
+ lock(&up->exl);
+ up->nchild++;
+ unlock(&up->exl);
+ }
+ if((flag&RFNOTEG) == 0)
+ p->noteid = up->noteid;
+
+ p->fpstate = up->fpstate;
+ pid = p->pid;
+ memset(p->time, 0, sizeof(p->time));
+ p->time[TReal] = MACHP(0)->ticks;
+
+ kstrdup(&p->text, up->text);
+ kstrdup(&p->user, up->user);
+ /*
+ * since the bss/data segments are now shareable,
+ * any mmu info about this process is now stale
+ * (i.e. has bad properties) and has to be discarded.
+ */
+ flushmmu();
+ p->basepri = up->basepri;
+ p->priority = up->basepri;
+ p->fixedpri = up->fixedpri;
+ p->mp = up->mp;
+ wm = up->wired;
+ if(wm)
+ procwired(p, wm->machno);
+ ready(p);
+ sched();
+ return pid;
+}
+
+static ulong
+l2be(long l)
+{
+ uchar *cp;
+
+ cp = (uchar*)&l;
+ return (cp[0]<<24) | (cp[1]<<16) | (cp[2]<<8) | cp[3];
+}
+
+long
+sysexec(ulong *arg)
+{
+ Segment *s, *ts;
+ ulong t, d, b;
+ int i;
+ Chan *tc;
+ char **argv, **argp;
+ char *a, *charp, *args, *file, *file0;
+ char *progarg[sizeof(Exec)/2+1], *elem, progelem[64];
+ ulong ssize, spage, nargs, nbytes, n, bssend;
+ int indir;
+ Exec exec;
+ char line[sizeof(Exec)];
+ Fgrp *f;
+ Image *img;
+ ulong magic, text, entry, data, bss;
+ Tos *tos;
+
+ indir = 0;
+ elem = nil;
+ validaddr(arg[0], 1, 0);
+ file0 = validnamedup((char*)arg[0], 1);
+ if(waserror()){
+ free(file0);
+ free(elem);
+ nexterror();
+ }
+ file = file0;
+ for(;;){
+ tc = namec(file, Aopen, OEXEC, 0);
+ if(waserror()){
+ cclose(tc);
+ nexterror();
+ }
+ if(!indir)
+ kstrdup(&elem, up->genbuf);
+
+ n = devtab[tc->type]->read(tc, &exec, sizeof(Exec), 0);
+ if(n < 2)
+ error(Ebadexec);
+ magic = l2be(exec.magic);
+ text = l2be(exec.text);
+ entry = l2be(exec.entry);
+ if(n==sizeof(Exec) && (magic == AOUT_MAGIC)){
+ if(text >= USTKTOP-UTZERO
+ || entry < UTZERO+sizeof(Exec)
+ || entry >= UTZERO+sizeof(Exec)+text)
+ error(Ebadexec);
+ break; /* for binary */
+ }
+
+ /*
+ * Process #! /bin/sh args ...
+ */
+ memmove(line, &exec, sizeof(Exec));
+ if(indir || line[0]!='#' || line[1]!='!')
+ error(Ebadexec);
+ n = shargs(line, n, progarg);
+ if(n == 0)
+ error(Ebadexec);
+ indir = 1;
+ /*
+ * First arg becomes complete file name
+ */
+ progarg[n++] = file;
+ progarg[n] = 0;
+ validaddr(arg[1], BY2WD, 1);
+ arg[1] += BY2WD;
+ file = progarg[0];
+ if(strlen(elem) >= sizeof progelem)
+ error(Ebadexec);
+ strcpy(progelem, elem);
+ progarg[0] = progelem;
+ poperror();
+ cclose(tc);
+ }
+
+ data = l2be(exec.data);
+ bss = l2be(exec.bss);
+ t = (UTZERO+sizeof(Exec)+text+(BY2PG-1)) & ~(BY2PG-1);
+ d = (t + data + (BY2PG-1)) & ~(BY2PG-1);
+ bssend = t + data + bss;
+ b = (bssend + (BY2PG-1)) & ~(BY2PG-1);
+ if(t >= KZERO || d >= KZERO || b >= KZERO)
+ error(Ebadexec);
+
+ /*
+ * Args: pass 1: count
+ */
+ nbytes = sizeof(Tos); /* hole for profiling clock at top of stack (and more) */
+ nargs = 0;
+ if(indir){
+ argp = progarg;
+ while(*argp){
+ a = *argp++;
+ nbytes += strlen(a) + 1;
+ nargs++;
+ }
+ }
+ evenaddr(arg[1]);
+ argp = (char**)arg[1];
+ validaddr((ulong)argp, BY2WD, 0);
+ while(*argp){
+ a = *argp++;
+ if(((ulong)argp&(BY2PG-1)) < BY2WD)
+ validaddr((ulong)argp, BY2WD, 0);
+ validaddr((ulong)a, 1, 0);
+ nbytes += ((char*)vmemchr(a, 0, 0x7FFFFFFF) - a) + 1;
+ nargs++;
+ }
+ ssize = BY2WD*(nargs+1) + ((nbytes+(BY2WD-1)) & ~(BY2WD-1));
+
+ /*
+ * 8-byte align SP for those (e.g. sparc) that need it.
+ * execregs() will subtract another 4 bytes for argc.
+ */
+ if((ssize+4) & 7)
+ ssize += 4;
+ spage = (ssize+(BY2PG-1)) >> PGSHIFT;
+
+ /*
+ * Build the stack segment, putting it in kernel virtual for the moment
+ */
+ if(spage > TSTKSIZ)
+ error(Enovmem);
+
+ qlock(&up->seglock);
+ if(waserror()){
+ qunlock(&up->seglock);
+ nexterror();
+ }
+ up->seg[ESEG] = newseg(SG_STACK, TSTKTOP-USTKSIZE, USTKSIZE/BY2PG);
+
+ /*
+ * Args: pass 2: assemble; the pages will be faulted in
+ */
+ tos = (Tos*)(TSTKTOP - sizeof(Tos));
+ tos->cyclefreq = m->cyclefreq;
+ cycles((uvlong*)&tos->pcycles);
+ tos->pcycles = -tos->pcycles;
+ tos->kcycles = tos->pcycles;
+ tos->clock = 0;
+ argv = (char**)(TSTKTOP - ssize);
+ charp = (char*)(TSTKTOP - nbytes);
+ args = charp;
+ if(indir)
+ argp = progarg;
+ else
+ argp = (char**)arg[1];
+
+ for(i=0; i<nargs; i++){
+ if(indir && *argp==0) {
+ indir = 0;
+ argp = (char**)arg[1];
+ }
+ *argv++ = charp + (USTKTOP-TSTKTOP);
+ n = strlen(*argp) + 1;
+ memmove(charp, *argp++, n);
+ charp += n;
+ }
+ free(file0);
+
+ free(up->text);
+ up->text = elem;
+ elem = nil; /* so waserror() won't free elem */
+ USED(elem);
+
+ /* copy args; easiest from new process's stack */
+ n = charp - args;
+ if(n > 128) /* don't waste too much space on huge arg lists */
+ n = 128;
+ a = up->args;
+ up->args = nil;
+ free(a);
+ up->args = smalloc(n);
+ memmove(up->args, args, n);
+ if(n>0 && up->args[n-1]!='\0'){
+ /* make sure last arg is NUL-terminated */
+ /* put NUL at UTF-8 character boundary */
+ for(i=n-1; i>0; --i)
+ if(fullrune(up->args+i, n-i))
+ break;
+ up->args[i] = 0;
+ n = i+1;
+ }
+ up->nargs = n;
+
+ /*
+ * Committed.
+ * Free old memory.
+ * Special segments are maintained across exec
+ */
+ for(i = SSEG; i <= BSEG; i++) {
+ putseg(up->seg[i]);
+ /* prevent a second free if we have an error */
+ up->seg[i] = 0;
+ }
+ for(i = BSEG+1; i < NSEG; i++) {
+ s = up->seg[i];
+ if(s != 0 && (s->type&SG_CEXEC)) {
+ putseg(s);
+ up->seg[i] = 0;
+ }
+ }
+
+ /*
+ * Close on exec
+ */
+ f = up->fgrp;
+ for(i=0; i<=f->maxfd; i++)
+ fdclose(i, CCEXEC);
+
+ /* Text. Shared. Attaches to cache image if possible */
+ /* attachimage returns a locked cache image */
+ img = attachimage(SG_TEXT|SG_RONLY, tc, UTZERO, (t-UTZERO)>>PGSHIFT);
+ ts = img->s;
+ up->seg[TSEG] = ts;
+ ts->flushme = 1;
+ ts->fstart = 0;
+ ts->flen = sizeof(Exec)+text;
+ unlock(img);
+
+ /* Data. Shared. */
+ s = newseg(SG_DATA, t, (d-t)>>PGSHIFT);
+ up->seg[DSEG] = s;
+
+ /* Attached by hand */
+ incref(img);
+ s->image = img;
+ s->fstart = ts->fstart+ts->flen;
+ s->flen = data;
+
+ /* BSS. Zero fill on demand */
+ up->seg[BSEG] = newseg(SG_BSS, d, (b-d)>>PGSHIFT);
+
+ /*
+ * Move the stack
+ */
+ s = up->seg[ESEG];
+ up->seg[ESEG] = 0;
+ up->seg[SSEG] = s;
+ qunlock(&up->seglock);
+ poperror(); /* seglock */
+ poperror(); /* elem */
+ s->base = USTKTOP-USTKSIZE;
+ s->top = USTKTOP;
+ relocateseg(s, USTKTOP-TSTKTOP);
+
+ /*
+ * '/' processes are higher priority (hack to make /ip more responsive).
+ */
+ if(devtab[tc->type]->dc == L'/')
+ up->basepri = PriRoot;
+ up->priority = up->basepri;
+ poperror();
+ cclose(tc);
+
+ /*
+ * At this point, the mmu contains info about the old address
+ * space and needs to be flushed
+ */
+ flushmmu();
+ qlock(&up->debug);
+ up->nnote = 0;
+ up->notify = 0;
+ up->notified = 0;
+ up->privatemem = 0;
+ procsetup(up);
+ qunlock(&up->debug);
+ if(up->hang)
+ up->procctl = Proc_stopme;
+
+ return execregs(entry, ssize, nargs);
+}
+
+int
+shargs(char *s, int n, char **ap)
+{
+ int i;
+
+ s += 2;
+ n -= 2; /* skip #! */
+ for(i=0; s[i]!='\n'; i++)
+ if(i == n-1)
+ return 0;
+ s[i] = 0;
+ *ap = 0;
+ i = 0;
+ for(;;) {
+ while(*s==' ' || *s=='\t')
+ s++;
+ if(*s == 0)
+ break;
+ i++;
+ *ap++ = s;
+ *ap = 0;
+ while(*s && *s!=' ' && *s!='\t')
+ s++;
+ if(*s == 0)
+ break;
+ else
+ *s++ = 0;
+ }
+ return i;
+}
+
+int
+return0(void*)
+{
+ return 0;
+}
+
+long
+syssleep(ulong *arg)
+{
+
+ int n;
+
+ n = arg[0];
+ if(n <= 0) {
+ if (up->edf && (up->edf->flags & Admitted))
+ edfyield();
+ else
+ yield();
+ return 0;
+ }
+ if(n < TK2MS(1))
+ n = TK2MS(1);
+ tsleep(&up->sleep, return0, 0, n);
+ return 0;
+}
+
+long
+sysalarm(ulong *arg)
+{
+ return procalarm(arg[0]);
+}
+
+long
+sysexits(ulong *arg)
+{
+ char *status;
+ char *inval = "invalid exit string";
+ char buf[ERRMAX];
+
+ status = (char*)arg[0];
+ if(status){
+ if(waserror())
+ status = inval;
+ else{
+ validaddr((ulong)status, 1, 0);
+ if(vmemchr(status, 0, ERRMAX) == 0){
+ memmove(buf, status, ERRMAX);
+ buf[ERRMAX-1] = 0;
+ status = buf;
+ }
+ poperror();
+ }
+
+ }
+ pexit(status, 1);
+ return 0; /* not reached */
+}
+
+long
+sys_wait(ulong *arg)
+{
+ int pid;
+ Waitmsg w;
+ OWaitmsg *ow;
+
+ if(arg[0] == 0)
+ return pwait(nil);
+
+ validaddr(arg[0], sizeof(OWaitmsg), 1);
+ evenaddr(arg[0]);
+ pid = pwait(&w);
+ if(pid >= 0){
+ ow = (OWaitmsg*)arg[0];
+ readnum(0, ow->pid, NUMSIZE, w.pid, NUMSIZE);
+ readnum(0, ow->time+TUser*NUMSIZE, NUMSIZE, w.time[TUser], NUMSIZE);
+ readnum(0, ow->time+TSys*NUMSIZE, NUMSIZE, w.time[TSys], NUMSIZE);
+ readnum(0, ow->time+TReal*NUMSIZE, NUMSIZE, w.time[TReal], NUMSIZE);
+ strncpy(ow->msg, w.msg, sizeof(ow->msg));
+ ow->msg[sizeof(ow->msg)-1] = '\0';
+ }
+ return pid;
+}
+
+long
+sysawait(ulong *arg)
+{
+ int i;
+ int pid;
+ Waitmsg w;
+ ulong n;
+
+ n = arg[1];
+ validaddr(arg[0], n, 1);
+ pid = pwait(&w);
+ if(pid < 0)
+ return -1;
+ i = snprint((char*)arg[0], n, "%d %lud %lud %lud %q",
+ w.pid,
+ w.time[TUser], w.time[TSys], w.time[TReal],
+ w.msg);
+
+ return i;
+}
+
+void
+werrstr(char *fmt, ...)
+{
+ va_list va;
+
+ if(up == nil)
+ return;
+
+ va_start(va, fmt);
+ vseprint(up->syserrstr, up->syserrstr+ERRMAX, fmt, va);
+ va_end(va);
+}
+
+static long
+generrstr(char *buf, uint nbuf)
+{
+ char tmp[ERRMAX];
+
+ if(nbuf == 0)
+ error(Ebadarg);
+ validaddr((ulong)buf, nbuf, 1);
+ if(nbuf > sizeof tmp)
+ nbuf = sizeof tmp;
+ memmove(tmp, buf, nbuf);
+
+ /* make sure it's NUL-terminated */
+ tmp[nbuf-1] = '\0';
+ memmove(buf, up->syserrstr, nbuf);
+ buf[nbuf-1] = '\0';
+ memmove(up->syserrstr, tmp, nbuf);
+ return 0;
+}
+
+long
+syserrstr(ulong *arg)
+{
+ return generrstr((char*)arg[0], arg[1]);
+}
+
+/* compatibility for old binaries */
+long
+sys_errstr(ulong *arg)
+{
+ return generrstr((char*)arg[0], 64);
+}
+
+long
+sysnotify(ulong *arg)
+{
+ if(arg[0] != 0)
+ validaddr(arg[0], sizeof(ulong), 0);
+ up->notify = (int(*)(void*, char*))(arg[0]);
+ return 0;
+}
+
+long
+sysnoted(ulong *arg)
+{
+ if(arg[0]!=NRSTR && !up->notified)
+ error(Egreg);
+ return 0;
+}
+
+long
+syssegbrk(ulong *arg)
+{
+ int i;
+ ulong addr;
+ Segment *s;
+
+ addr = arg[0];
+ for(i = 0; i < NSEG; i++) {
+ s = up->seg[i];
+ if(s == 0 || addr < s->base || addr >= s->top)
+ continue;
+ switch(s->type&SG_TYPE) {
+ case SG_TEXT:
+ case SG_DATA:
+ case SG_STACK:
+ error(Ebadarg);
+ default:
+ return ibrk(arg[1], i);
+ }
+ }
+
+ error(Ebadarg);
+ return 0; /* not reached */
+}
+
+long
+syssegattach(ulong *arg)
+{
+ return segattach(up, arg[0], (char*)arg[1], arg[2], arg[3]);
+}
+
+long
+syssegdetach(ulong *arg)
+{
+ int i;
+ ulong addr;
+ Segment *s;
+
+ qlock(&up->seglock);
+ if(waserror()){
+ qunlock(&up->seglock);
+ nexterror();
+ }
+
+ s = 0;
+ addr = arg[0];
+ for(i = 0; i < NSEG; i++)
+ if(s = up->seg[i]) {
+ qlock(&s->lk);
+ if((addr >= s->base && addr < s->top) ||
+ (s->top == s->base && addr == s->base))
+ goto found;
+ qunlock(&s->lk);
+ }
+
+ error(Ebadarg);
+
+found:
+ /*
+ * Check we are not detaching the initial stack segment.
+ */
+ if(s == up->seg[SSEG]){
+ qunlock(&s->lk);
+ error(Ebadarg);
+ }
+ up->seg[i] = 0;
+ qunlock(&s->lk);
+ putseg(s);
+ qunlock(&up->seglock);
+ poperror();
+
+ /* Ensure we flush any entries from the lost segment */
+ flushmmu();
+ return 0;
+}
+
+long
+syssegfree(ulong *arg)
+{
+ Segment *s;
+ ulong from, to;
+
+ from = arg[0];
+ s = seg(up, from, 1);
+ if(s == nil)
+ error(Ebadarg);
+ to = (from + arg[1]) & ~(BY2PG-1);
+ from = PGROUND(from);
+
+ if(to > s->top) {
+ qunlock(&s->lk);
+ error(Ebadarg);
+ }
+
+ mfreeseg(s, from, (to - from) / BY2PG);
+ qunlock(&s->lk);
+ flushmmu();
+
+ return 0;
+}
+
+/* For binary compatibility */
+long
+sysbrk_(ulong *arg)
+{
+ return ibrk(arg[0], BSEG);
+}
+
+long
+sysrendezvous(ulong *arg)
+{
+ uintptr tag, val;
+ Proc *p, **l;
+
+ tag = arg[0];
+ l = &REND(up->rgrp, tag);
+ up->rendval = ~(uintptr)0;
+
+ lock(up->rgrp);
+ for(p = *l; p; p = p->rendhash) {
+ if(p->rendtag == tag) {
+ *l = p->rendhash;
+ val = p->rendval;
+ p->rendval = arg[1];
+
+ while(p->mach != 0)
+ ;
+ ready(p);
+ unlock(up->rgrp);
+ return val;
+ }
+ l = &p->rendhash;
+ }
+
+ /* Going to sleep here */
+ up->rendtag = tag;
+ up->rendval = arg[1];
+ up->rendhash = *l;
+ *l = up;
+ up->state = Rendezvous;
+ unlock(up->rgrp);
+
+ sched();
+
+ return up->rendval;
+}
+
+/*
+ * The implementation of semaphores is complicated by needing
+ * to avoid rescheduling in syssemrelease, so that it is safe
+ * to call from real-time processes. This means syssemrelease
+ * cannot acquire any qlocks, only spin locks.
+ *
+ * Semacquire and semrelease must both manipulate the semaphore
+ * wait list. Lock-free linked lists only exist in theory, not
+ * in practice, so the wait list is protected by a spin lock.
+ *
+ * The semaphore value *addr is stored in user memory, so it
+ * cannot be read or written while holding spin locks.
+ *
+ * Thus, we can access the list only when holding the lock, and
+ * we can access the semaphore only when not holding the lock.
+ * This makes things interesting. Note that sleep's condition function
+ * is called while holding two locks - r and up->rlock - so it cannot
+ * access the semaphore value either.
+ *
+ * An acquirer announces its intention to try for the semaphore
+ * by putting a Sema structure onto the wait list and then
+ * setting Sema.waiting. After one last check of semaphore,
+ * the acquirer sleeps until Sema.waiting==0. A releaser of n
+ * must wake up n acquirers who have Sema.waiting set. It does
+ * this by clearing Sema.waiting and then calling wakeup.
+ *
+ * There are three interesting races here.
+
+ * The first is that in this particular sleep/wakeup usage, a single
+ * wakeup can rouse a process from two consecutive sleeps!
+ * The ordering is:
+ *
+ * (a) set Sema.waiting = 1
+ * (a) call sleep
+ * (b) set Sema.waiting = 0
+ * (a) check Sema.waiting inside sleep, return w/o sleeping
+ * (a) try for semaphore, fail
+ * (a) set Sema.waiting = 1
+ * (a) call sleep
+ * (b) call wakeup(a)
+ * (a) wake up again
+ *
+ * This is okay - semacquire will just go around the loop
+ * again. It does mean that at the top of the for(;;) loop in
+ * semacquire, phore.waiting might already be set to 1.
+ *
+ * The second is that a releaser might wake an acquirer who is
+ * interrupted before he can acquire the lock. Since
+ * release(n) issues only n wakeup calls -- only n can be used
+ * anyway -- if the interrupted process is not going to use his
+ * wakeup call he must pass it on to another acquirer.
+ *
+ * The third race is similar to the second but more subtle. An
+ * acquirer sets waiting=1 and then does a final canacquire()
+ * before going to sleep. The opposite order would result in
+ * missing wakeups that happen between canacquire and
+ * waiting=1. (In fact, the whole point of Sema.waiting is to
+ * avoid missing wakeups between canacquire() and sleep().) But
+ * there can be spurious wakeups between a successful
+ * canacquire() and the following semdequeue(). This wakeup is
+ * not useful to the acquirer, since he has already acquired
+ * the semaphore. Like in the previous case, though, the
+ * acquirer must pass the wakeup call along.
+ *
+ * This is all rather subtle. The code below has been verified
+ * with the spin model /sys/src/9/port/semaphore.p. The
+ * original code anticipated the second race but not the first
+ * or third, which were caught only with spin. The first race
+ * is mentioned in /sys/doc/sleep.ps, but I'd forgotten about it.
+ * It was lucky that my abstract model of sleep/wakeup still managed
+ * to preserve that behavior.
+ *
+ * I remain slightly concerned about memory coherence
+ * outside of locks. The spin model does not take
+ * queued processor writes into account so we have to
+ * think hard. The only variables accessed outside locks
+ * are the semaphore value itself and the boolean flag
+ * Sema.waiting. The value is only accessed with cmpswap,
+ * whose job description includes doing the right thing as
+ * far as memory coherence across processors. That leaves
+ * Sema.waiting. To handle it, we call coherence() before each
+ * read and after each write. - rsc
+ */
+
+/* Add semaphore p with addr a to list in seg. */
+static void
+semqueue(Segment *s, long *a, Sema *p)
+{
+ memset(p, 0, sizeof *p);
+ p->addr = a;
+ lock(&s->sema); /* uses s->sema.Rendez.Lock, but no one else is */
+ p->next = &s->sema;
+ p->prev = s->sema.prev;
+ p->next->prev = p;
+ p->prev->next = p;
+ unlock(&s->sema);
+}
+
+/* Remove semaphore p from list in seg. */
+static void
+semdequeue(Segment *s, Sema *p)
+{
+ lock(&s->sema);
+ p->next->prev = p->prev;
+ p->prev->next = p->next;
+ unlock(&s->sema);
+}
+
+/* Wake up n waiters with addr a on list in seg. */
+static void
+semwakeup(Segment *s, long *a, long n)
+{
+ Sema *p;
+
+ lock(&s->sema);
+ for(p=s->sema.next; p!=&s->sema && n>0; p=p->next){
+ if(p->addr == a && p->waiting){
+ p->waiting = 0;
+ coherence();
+ wakeup(p);
+ n--;
+ }
+ }
+ unlock(&s->sema);
+}
+
+/* Add delta to semaphore and wake up waiters as appropriate. */
+static long
+semrelease(Segment *s, long *addr, long delta)
+{
+ long value;
+
+ do
+ value = *addr;
+ while(!cmpswap(addr, value, value+delta));
+ semwakeup(s, addr, delta);
+ return value+delta;
+}
+
+/* Try to acquire semaphore using compare-and-swap */
+static int
+canacquire(long *addr)
+{
+ long value;
+
+ while((value=*addr) > 0)
+ if(cmpswap(addr, value, value-1))
+ return 1;
+ return 0;
+}
+
+/* Should we wake up? */
+static int
+semawoke(void *p)
+{
+ coherence();
+ return !((Sema*)p)->waiting;
+}
+
+/* Acquire semaphore (subtract 1). */
+static int
+semacquire(Segment *s, long *addr, int block)
+{
+ int acquired;
+ Sema phore;
+
+ if(canacquire(addr))
+ return 1;
+ if(!block)
+ return 0;
+
+ acquired = 0;
+ semqueue(s, addr, &phore);
+ for(;;){
+ phore.waiting = 1;
+ coherence();
+ if(canacquire(addr)){
+ acquired = 1;
+ break;
+ }
+ if(waserror())
+ break;
+ sleep(&phore, semawoke, &phore);
+ poperror();
+ }
+ semdequeue(s, &phore);
+ coherence(); /* not strictly necessary due to lock in semdequeue */
+ if(!phore.waiting)
+ semwakeup(s, addr, 1);
+ if(!acquired)
+ nexterror();
+ return 1;
+}
+
+long
+syssemacquire(ulong *arg)
+{
+ int block;
+ long *addr;
+ Segment *s;
+
+ validaddr(arg[0], sizeof(long), 1);
+ evenaddr(arg[0]);
+ addr = (long*)arg[0];
+ block = arg[1];
+
+ if((s = seg(up, (ulong)addr, 0)) == nil)
+ error(Ebadarg);
+ if(*addr < 0)
+ error(Ebadarg);
+ return semacquire(s, addr, block);
+}
+
+long
+syssemrelease(ulong *arg)
+{
+ long *addr, delta;
+ Segment *s;
+
+ validaddr(arg[0], sizeof(long), 1);
+ evenaddr(arg[0]);
+ addr = (long*)arg[0];
+ delta = arg[1];
+
+ if((s = seg(up, (ulong)addr, 0)) == nil)
+ error(Ebadarg);
+ if(delta < 0 || *addr < 0)
+ error(Ebadarg);
+ return semrelease(s, addr, arg[1]);
+}