summaryrefslogtreecommitdiff
path: root/sys/src/9/port/devswap.c
diff options
context:
space:
mode:
author cinap_lenrek <cinap_lenrek@felloff.net> 2017-10-29 23:09:54 +0100
committer cinap_lenrek <cinap_lenrek@felloff.net> 2017-10-29 23:09:54 +0100
commit f3f93925173d15ca48e90ce1624452d7e3b7726f (patch)
tree d9faed1bb124d767aacca06261c6d51a22595851 /sys/src/9/port/devswap.c
parent 93117262c2e377d9d4f1588924032d1b69e7e2f9 (diff)
kernel: introduce devswap #¶ to serve /dev/swap and handle swapfile encryption
Diffstat (limited to 'sys/src/9/port/devswap.c')
-rw-r--r-- sys/src/9/port/devswap.c 612
1 files changed, 612 insertions, 0 deletions
diff --git a/sys/src/9/port/devswap.c b/sys/src/9/port/devswap.c
new file mode 100644
index 000000000..64c3aa613
--- /dev/null
+++ b/sys/src/9/port/devswap.c
@@ -0,0 +1,612 @@
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "../port/error.h"
+
+#include <libsec.h>
+#include <pool.h>
+
+static int canflush(Proc*, Segment*);
+static void executeio(void);
+static void pageout(Proc*, Segment*);
+static void pagepte(int, Page**);
+static void pager(void*);
+
+Image swapimage = {	/* image that pages queued for swap-out are cached under (see pagepte) */
+ .notext = 1,
+};
+
+static Chan *swapchan;	/* channel given to setswapchan(); the I/O target behind #¶/swapfile */
+static uchar *swapbuf;	/* one-page buffer that receives ciphertext before it is written out */
+static AESstate *swapkey;	/* two AES states, passed as a pair to aes_xts_encrypt/decrypt */
+
+static Page **iolist;	/* pages queued by pagepte() for the next executeio() */
+static int ioptr;	/* number of valid entries in iolist */
+
+static ushort ageclock;	/* advanced by pager(); compared with Page.refage in pageout() */
+
+/*
+ * Device init hook: allocate the swap reference map (one byte per
+ * swap page, conf.nswap entries) and the pageout I/O list
+ * (conf.nswppo entries), and reset the allocator bookkeeping.
+ * Panics if either allocation fails.
+ */
+static void
+swapinit(void)
+{
+	uchar *map;
+
+	map = xalloc(conf.nswap);
+	swapalloc.swmap = map;
+	swapalloc.top = map + conf.nswap;
+	swapalloc.alloc = map;
+	swapalloc.last = map;
+	swapalloc.free = conf.nswap;
+	swapalloc.xref = 0;
+
+	iolist = xalloc(conf.nswppo*sizeof(Page*));
+	if(map == nil || iolist == nil)
+		panic("swapinit: not enough memory");
+}
+
+/*
+ * Allocate one page of swap space.
+ *
+ * Searches the reference map for a zero byte, first from the slot
+ * handed out last up to the top, then from the bottom up to that
+ * slot.  The slot found starts with a count of 2: one reference for
+ * the pte and one for the pending I/O transaction.
+ *
+ * Returns the swap address (slot index times BY2PG), or ~0 when no
+ * swap space is free.
+ */
+static uintptr
+newswap(void)
+{
+	uintptr daddr;
+	uchar *slot;
+
+	daddr = ~0;
+	lock(&swapalloc);
+	if(swapalloc.free != 0){
+		slot = memchr(swapalloc.last, 0, swapalloc.top-swapalloc.last);
+		if(slot == nil)
+			slot = memchr(swapalloc.swmap, 0, swapalloc.last-swapalloc.swmap);
+		*slot = 2;	/* ref for pte + io transaction */
+		swapalloc.last = slot;
+		swapalloc.free--;
+		daddr = (slot-swapalloc.swmap) * BY2PG;
+	}
+	unlock(&swapalloc);
+	return daddr;
+}
+
+/*
+ * Drop one reference to the swap page named by p (a swap address
+ * carried in a Page pointer; the slot is (uintptr)p / BY2PG).
+ *
+ * An ordinary count is decremented and the slot is freed when it
+ * reaches zero.  A count of 255 is not decremented: such slots are
+ * accounted for in swapalloc.xref, and only when xref drops to zero
+ * are all slots holding 255 released together.
+ *
+ * Panics if the slot's count is already zero, or if the slot holds
+ * 255 while xref is zero.
+ */
+void
+putswap(Page *p)
+{
+	uchar *ref, *scan;
+
+	lock(&swapalloc);
+	ref = &swapalloc.swmap[((uintptr)p)/BY2PG];
+	if(*ref == 0)
+		panic("putswap %#p ref == 0", p);
+
+	if(*ref != 255){
+		/* plain counted slot */
+		if(--(*ref) == 0)
+			swapalloc.free++;
+		unlock(&swapalloc);
+		return;
+	}
+
+	/* saturated slot: references are pooled in xref */
+	if(swapalloc.xref == 0)
+		panic("putswap %#p xref == 0", p);
+
+	if(--swapalloc.xref == 0){
+		for(scan = swapalloc.swmap; scan < swapalloc.top; scan++){
+			if(*scan != 255)
+				continue;
+			*scan = 0;
+			swapalloc.free++;
+		}
+	}
+	unlock(&swapalloc);
+}
+
+/*
+ * Add one reference to the swap page named by p (a swap address
+ * carried in a Page pointer).
+ *
+ * If the slot already holds 255 the reference is counted in
+ * swapalloc.xref instead.  When an increment brings the slot to 255,
+ * the 255 references it represents are added to xref.
+ */
+void
+dupswap(Page *p)
+{
+	uchar *ref;
+
+	lock(&swapalloc);
+	ref = &swapalloc.swmap[((uintptr)p)/BY2PG];
+	if(*ref == 255){
+		swapalloc.xref++;
+	}else{
+		(*ref)++;
+		if(*ref == 255)
+			swapalloc.xref += 255;
+	}
+	unlock(&swapalloc);
+}
+
+/*
+ * Return the reference byte stored for swap address daddr.
+ * The map is read without taking the swapalloc lock.
+ */
+int
+swapcount(uintptr daddr)
+{
+	uchar *ref;
+
+	ref = swapalloc.swmap + daddr/BY2PG;
+	return *ref;
+}
+
+/*
+ * Make sure the pager is running.  The first caller to raise the
+ * static reference count to 1 creates the "pager" kproc; every other
+ * call wakes whoever sleeps on swapalloc.r.
+ */
+void
+kickpager(void)
+{
+	static Ref started;
+
+	if(started.ref == 0 && incref(&started) == 1){
+		kproc("pager", pager, 0);
+		return;
+	}
+	wakeup(&swapalloc.r);
+}
+
+/*
+ * Try to satisfy the page shortage without swapping.
+ *
+ * Each round asks, in order, fscache, swapimage and then
+ * imagereclaim() for up to 1000 pages, stopping at the first source
+ * that yields any.  Returns 1 as soon as needpages() reports no
+ * shortage, 0 when a round reclaimed nothing (the caller has to
+ * swap); otherwise yields with sched() and tries again.
+ */
+static int
+reclaim(void)
+{
+	ulong got;
+
+	for(;;){
+		got = pagereclaim(&fscache, 1000);
+		if(got != 0){
+			if(0) print("reclaim: %lud fscache\n", got);
+		}else{
+			got = pagereclaim(&swapimage, 1000);
+			if(got != 0){
+				if(0) print("reclaim: %lud swap\n", got);
+			}else{
+				got = imagereclaim(1000);
+				if(got != 0){
+					if(0) print("reclaim: %lud image\n", got);
+				}
+			}
+		}
+
+		if(!needpages(nil))
+			return 1;	/* have pages, done */
+		if(got == 0)
+			return 0;	/* didnt reclaim, need to swap */
+		sched();
+	}
+}
+
+/*
+ * Body of the "pager" kernel process started by kickpager().
+ *
+ * Loops forever:
+ *  - reclaim() first; if that ended the shortage, wake both
+ *    palloc.pwait queues and sleep on swapalloc.r until needpages()
+ *    is true again.
+ *  - with no swap channel or no free swap space, call freebroken()
+ *    and, if that returns 0, killbig(); then yield.
+ *  - otherwise advance round-robin through the process table to the
+ *    next process that is not Dead, not marked noswap and whose
+ *    seglock can be taken without waiting, run pageout() over its
+ *    text, data, bss, stack and shared segments, and write whatever
+ *    was queued with executeio().
+ */
+static void
+pager(void*)
+{
+	int i;
+	Segment *seg;
+	Proc *p, *end;
+
+	p = proctab(0);
+	end = &p[conf.nproc];
+
+	/* presumably any error() raised below resumes here; just carry on */
+	while(waserror())
+		;
+
+	for(;;){
+		up->psstate = "Reclaim";
+		if(reclaim()){
+			up->psstate = "Idle";
+			wakeup(&palloc.pwait[0]);
+			wakeup(&palloc.pwait[1]);
+			sleep(&swapalloc.r, needpages, nil);
+			continue;
+		}
+
+		if(swapimage.c == nil || swapalloc.free == 0){
+		Killbig:
+			if(!freebroken())
+				killbig("out of memory");
+			sched();
+			continue;
+		}
+
+		/*
+		 * ageclock ticks each time the scan wraps past the end of
+		 * the table; if it comes back round to its starting value
+		 * without finding a candidate, give up.
+		 */
+		i = ageclock;
+		do {
+			if(++p >= end){
+				if(++ageclock == i)
+					goto Killbig;
+				p = proctab(0);
+			}
+		} while(p->state == Dead || p->noswap || !canqlock(&p->seglock));
+
+		up->psstate = "Pageout";
+		for(i = 0; i < NSEG; i++){
+			seg = p->seg[i];
+			if(seg == nil)
+				continue;
+			switch(seg->type&SG_TYPE){
+			case SG_TEXT:
+			case SG_DATA:
+			case SG_BSS:
+			case SG_STACK:
+			case SG_SHARED:
+				pageout(p, seg);
+				break;
+			default:
+				break;
+			}
+		}
+		qunlock(&p->seglock);
+
+		if(ioptr > 0){
+			up->psstate = "I/O";
+			executeio();
+		}
+	}
+}
+
+/*
+ * Walk segment s of process p and hand aged-out pages to pagepte().
+ *
+ * Nothing is done unless the segment lock can be taken without
+ * waiting and canflush() succeeds.  canflush() takes a segment
+ * reference, which is dropped with putseg() on every path after it.
+ *
+ * For each resident page: if PG_REF is set, clear it and stamp the
+ * page with the current ageclock; otherwise page it out once its
+ * stamp is at least 16 ticks behind ageclock.
+ */
+static void
+pageout(Proc *p, Segment *s)
+{
+	int segtype, i, nmap;
+	short age;
+	Pte *pte;
+	Page **pg, *page;
+
+	if(!canqlock(s))	/* We cannot afford to wait, we will surely deadlock */
+		return;
+
+	if(!canflush(p, s)	/* Able to invalidate all tlbs with references */
+	|| waserror()){
+		qunlock(s);
+		putseg(s);
+		return;
+	}
+
+	/* Pass through the pte tables looking for memory pages to swap out */
+	segtype = s->type&SG_TYPE;
+	nmap = s->mapsize;
+	for(i = 0; i < nmap; i++){
+		pte = s->map[i];
+		if(pte == nil)
+			continue;
+		for(pg = pte->first; pg <= pte->last; pg++){
+			page = *pg;
+			if(pagedout(page))
+				continue;
+			if((page->modref & PG_REF) == 0){
+				/* not referenced since last pass: check its age */
+				age = (short)(ageclock - page->refage);
+				if(age >= 16)
+					pagepte(segtype, pg);
+				continue;
+			}
+			/* recently referenced: clear the bit and restart its age */
+			page->modref &= ~PG_REF;
+			page->refage = ageclock;
+		}
+	}
+	poperror();
+	qunlock(s);
+	putseg(s);
+}
+
+/*
+ * Take a reference on s and decide whether it may be paged out.
+ *
+ * If the new reference count is 2 the answer is canpage(p).
+ * Otherwise every process that is not Dead and has s in its segment
+ * table must pass canpage(), once per matching table entry; the
+ * first failure returns 0.
+ *
+ * The reference taken here is not released on any path; the caller
+ * does that.
+ */
+static int
+canflush(Proc *p, Segment *s)
+{
+	int i;
+	Proc *q, *end;
+
+	if(incref(s) == 2)		/* Easy if we are the only user */
+		return canpage(p);
+
+	/*
+	 * Now we must do hardwork to ensure all processes which have tlb
+	 * entries for this segment will be flushed if we succeed in paging it out
+	 */
+	q = proctab(0);
+	end = q + conf.nproc;
+	for(; q < end; q++){
+		if(q->state == Dead)
+			continue;
+		for(i = 0; i < NSEG; i++){
+			if(q->seg[i] == s && !canpage(q))
+				return 0;
+		}
+	}
+	return 1;
+}
+
+/*
+ * Evict the page held in pte slot *pg of a segment of the given type.
+ *
+ * SG_TEXT: release the page and clear the slot.
+ * SG_DATA, SG_BSS, SG_STACK, SG_SHARED: allocate a swap address,
+ * cache the page under swapimage at that address, replace the slot
+ * with the swap address tagged PG_ONSWAP and queue the page on
+ * iolist for executeio().  Nothing happens if iolist is full or no
+ * swap space is available.
+ * Any other type is ignored.
+ */
+static void
+pagepte(int type, Page **pg)
+{
+	uintptr daddr;
+	Page *outp;
+
+	outp = *pg;
+
+	if(type == SG_TEXT){
+		/* Revert to demand load */
+		putpage(outp);
+		*pg = nil;
+		return;
+	}
+
+	if(type != SG_DATA && type != SG_BSS && type != SG_STACK && type != SG_SHARED)
+		return;
+
+	if(ioptr >= conf.nswppo)
+		return;
+
+	/*
+	 * get a new swap address with swapcount 2, one for the pte
+	 * and one extra ref for us while we write the page to disk
+	 */
+	daddr = newswap();
+	if(daddr == ~0)
+		return;
+
+	/* clear any pages referring to it from the cache */
+	cachedel(&swapimage, daddr);
+
+	/* forget anything that it used to cache */
+	uncachepage(outp);
+
+	/*
+	 * enter it into the cache so that a fault happening
+	 * during the write will grab the page from the cache
+	 * rather than one partially written to the disk
+	 */
+	outp->daddr = daddr;
+	cachepage(outp, &swapimage);
+	*pg = (Page*)(daddr|PG_ONSWAP);
+
+	/* Add page to IO transaction list */
+	iolist[ioptr] = outp;
+	ioptr++;
+}
+
+/*
+ * Print one line with user memory in use over total, swap pages in
+ * use over total, and the number of pages queued on iolist.
+ */
+void
+pagersummary(void)
+{
+	ulong memused, swapused;
+
+	memused = palloc.user-palloc.freecount;
+	swapused = conf.nswap-swapalloc.free;
+	print("%lud/%lud memory %lud/%lud swap %d iolist\n",
+		memused, palloc.user,
+		swapused, conf.nswap,
+		ioptr);
+}
+
+/*
+ * Write out every page queued on iolist and empty the list.
+ *
+ * A page is written to swapimage.c at its swap address only while
+ * that address still has more than one reference.  A short write or
+ * an error during the write panics.  After each entry the extra swap
+ * reference taken by newswap() and the page itself are released.
+ */
+static void
+executeio(void)
+{
+	Page *outp;
+	int i, nw;
+	Chan *swapc;
+	char *va;
+	KMap *km;
+
+	swapc = swapimage.c;
+	for(i = 0; i < ioptr; i++){
+		if(ioptr > conf.nswppo)
+			panic("executeio: ioptr %d > %d", ioptr, conf.nswppo);
+		outp = iolist[i];
+
+		assert(outp->ref > 0);
+		assert(outp->image == &swapimage);
+		assert(outp->daddr != ~0);
+
+		/* only write when swap address still in use */
+		if(swapcount(outp->daddr) > 1){
+			km = kmap(outp);
+			va = (char*)VA(km);
+
+			if(waserror())
+				panic("executeio: page outp I/O error");
+
+			nw = devtab[swapc->type]->write(swapc, va, BY2PG, outp->daddr);
+			if(nw != BY2PG)
+				nexterror();
+
+			kunmap(km);
+			poperror();
+		}
+
+		/* drop our extra swap reference */
+		putswap((Page*)outp->daddr);
+
+		/* Free up the page after I/O */
+		putpage(outp);
+	}
+	ioptr = 0;
+}
+
+/*
+ * Return 1 when the free page count is below swapalloc.headroom,
+ * otherwise 0.  The argument is unused; pager() passes this function
+ * to sleep() as its wake-up condition.
+ */
+int
+needpages(void*)
+{
+	if(palloc.freecount < swapalloc.headroom)
+		return 1;
+	return 0;
+}
+
+/*
+ * Install c as the channel backing the swap file.
+ *
+ * An existing swap channel is replaced only if no swap page is in
+ * use (otherwise Einuse).  Unless c belongs to the device whose
+ * character is 'M', c is stat'ed: it must hold at least conf.nswppo
+ * pages, and conf.nswap is shrunk to fit when it is smaller than
+ * the configured swap size.  On success c is stored in swapchan and
+ * swapimage.c is opened on #¶/swapfile.
+ *
+ * If an error is raised before c has been taken over, c is closed
+ * and the error is passed on.
+ *
+ * NOTE(review): the products with BY2PG are formed in the operands'
+ * own integer type before being compared with d.length; confirm
+ * they cannot overflow for large swap sizes.
+ */
+static void
+setswapchan(Chan *c)
+{
+	uchar statbuf[sizeof(Dir)+100];
+	Dir dir;
+	int nstat;
+
+	if(waserror()){
+		cclose(c);
+		nexterror();
+	}
+
+	if(swapimage.c != nil){
+		if(swapalloc.free != conf.nswap)
+			error(Einuse);
+		cclose(swapimage.c);
+		swapimage.c = nil;
+	}
+
+	/*
+	 * if this isn't a file, set the swap space
+	 * to be at most the size of the partition
+	 */
+	if(devtab[c->type]->dc != L'M'){
+		nstat = devtab[c->type]->stat(c, statbuf, sizeof statbuf);
+		if(nstat <= 0 || convM2D(statbuf, nstat, &dir, nil) == 0)
+			error("stat failed in setswapchan");
+		if(dir.length < conf.nswppo*BY2PG)
+			error("swap device too small");
+		if(dir.length < conf.nswap*BY2PG){
+			conf.nswap = dir.length/BY2PG;
+			swapalloc.top = &swapalloc.swmap[conf.nswap];
+			swapalloc.free = conf.nswap;
+		}
+	}
+	c->flag &= ~CCACHE;
+	cclunk(c);
+	poperror();
+
+	swapchan = c;
+	swapimage.c = namec("#¶/swapfile", Aopen, ORDWR, 0);
+}
+
+enum {	/* qid paths of the files served by #¶ */
+ Qdir,	/* the directory itself */
+ Qswap,	/* "swap": status text on read, control messages on write */
+ Qswapfile,	/* "swapfile": page-sized I/O, encrypted on its way to swapchan */
+};
+
+static Dirtab swapdir[]={	/* table passed to devwalk/devstat/devopen/devdirread */
+ ".", {Qdir, 0, QTDIR}, 0, DMDIR|0555,
+ "swap", {Qswap}, 0, 0664,
+ "swapfile", {Qswapfile}, 0, 0600,
+};
+
+/*
+ * Attach to the swap device: hands spec to devattach() together
+ * with the device character '¶'.
+ */
+static Chan*
+swapattach(char *spec)
+{
+	Chan *c;
+
+	c = devattach(L'¶', spec);
+	return c;
+}
+
+/*
+ * Walk within the swap device: devwalk() over the swapdir table
+ * with devgen as the generator.
+ */
+static Walkqid*
+swapwalk(Chan *c, Chan *nc, char **name, int nname)
+{
+	Walkqid *wq;
+
+	wq = devwalk(c, nc, name, nname, swapdir, nelem(swapdir), devgen);
+	return wq;
+}
+
+/*
+ * Stat a file of the swap device: devstat() over the swapdir table
+ * with devgen as the generator.  Returns what devstat() returns.
+ */
+static int
+swapstat(Chan *c, uchar *dp, int n)
+{
+	int r;
+
+	r = devstat(c, dp, n, swapdir, nelem(swapdir), devgen);
+	return r;
+}
+
+/*
+ * Open a file of the swap device.
+ *
+ * Everything except swapfile goes through devopen().  swapfile may
+ * only be opened ORDWR by eve (Eperm), only while swapimage.c is not
+ * set (Einuse) and only after setswapchan() has stored a backing
+ * channel (Egreg).  Opening it allocates the page-aligned ciphertext
+ * buffer and two AES states, each keyed with 128 freshly generated
+ * random bits; the key material on the stack is zeroed afterwards.
+ *
+ * COPEN is set before the allocations, so the channel is already
+ * marked open if Enomem is raised.
+ */
+static Chan*
+swapopen(Chan *c, int omode)
+{
+	uchar key[128/8];
+	int i;
+
+	if((ulong)c->qid.path != Qswapfile)
+		return devopen(c, omode, swapdir, nelem(swapdir), devgen);
+
+	if(!iseve() || omode != ORDWR)
+		error(Eperm);
+	if(swapimage.c != nil)
+		error(Einuse);
+	if(swapchan == nil)
+		error(Egreg);
+
+	c->mode = openmode(omode);
+	c->flag |= COPEN;
+	c->offset = 0;
+
+	swapbuf = mallocalign(BY2PG, BY2PG, 0, 0);
+	swapkey = secalloc(sizeof(AESstate)*2);
+	if(swapbuf == nil || swapkey == nil)
+		error(Enomem);
+
+	/* one independent random key per AES state */
+	for(i = 0; i < 2; i++){
+		genrandom(key, sizeof(key));
+		setupAESstate(&swapkey[i], key, sizeof(key), nil);
+	}
+	memset(key, 0, sizeof(key));
+
+	return c;
+}
+
+/*
+ * Close a file of the swap device.  Only an opened swapfile needs
+ * work: the backing channel is closed and the key states and the
+ * ciphertext buffer are freed, each pointer being reset to nil.
+ */
+static void
+swapclose(Chan *c)
+{
+	if((c->flag & COPEN) == 0)
+		return;
+	if((ulong)c->qid.path != Qswapfile)
+		return;
+
+	cclose(swapchan);
+	swapchan = nil;
+	secfree(swapkey);
+	swapkey = nil;
+	free(swapbuf);
+	swapbuf = nil;
+}
+
+/*
+ * Read handler for #¶.
+ *
+ * directory: generic directory read over swapdir.
+ * swap: a text report of memory size, page size, kernel/user/swap
+ * page counts and the malloc, draw and secret pool statistics,
+ * served through readstr().
+ * swapfile: exactly one page per read (Ebadarg otherwise); the page
+ * is read from swapchan at off into va and decrypted in place with
+ * the two AES states.  A short read raises Eio.
+ *
+ * Any other qid raises Egreg.
+ */
+static long
+swapread(Chan *c, void *va, long n, vlong off)
+{
+	char text[256];	/* must be >= 18*NUMSIZE (Qswap) */
+	ulong path;
+
+	path = (ulong)c->qid.path;
+
+	if(path == Qdir)
+		return devdirread(c, va, n, swapdir, nelem(swapdir), devgen);
+
+	if(path == Qswap){
+		snprint(text, sizeof text,
+			"%llud memory\n"
+			"%llud pagesize\n"
+			"%lud kernel\n"
+			"%lud/%lud user\n"
+			"%lud/%lud swap\n"
+			"%llud/%llud/%llud kernel malloc\n"
+			"%llud/%llud/%llud kernel draw\n"
+			"%llud/%llud/%llud kernel secret\n",
+			(uvlong)conf.npage*BY2PG,
+			(uvlong)BY2PG,
+			conf.npage-conf.upages,
+			palloc.user-palloc.freecount-fscache.pgref-swapimage.pgref, palloc.user,
+			conf.nswap-swapalloc.free, conf.nswap,
+			(uvlong)mainmem->curalloc,
+			(uvlong)mainmem->cursize,
+			(uvlong)mainmem->maxsize,
+			(uvlong)imagmem->curalloc,
+			(uvlong)imagmem->cursize,
+			(uvlong)imagmem->maxsize,
+			(uvlong)secrmem->curalloc,
+			(uvlong)secrmem->cursize,
+			(uvlong)secrmem->maxsize);
+		return readstr((ulong)off, va, n, text);
+	}
+
+	if(path == Qswapfile){
+		if(n != BY2PG)
+			error(Ebadarg);
+		if(devtab[swapchan->type]->read(swapchan, va, n, off) != n)
+			error(Eio);
+		aes_xts_decrypt(&swapkey[0], &swapkey[1], off, va, va, n);
+		return n;
+	}
+
+	error(Egreg);
+	return 0;
+}
+
+/*
+ * Write handler for #¶.
+ *
+ * swap: only eve may write (Eperm).  The message must leave room in
+ * buf for a terminating NUL (Egreg otherwise).  A message starting
+ * with "start" kicks the pager; a message starting with a decimal
+ * digit is parsed by strtoul (base 0) as a file descriptor, which is
+ * converted to a channel and installed with setswapchan(); anything
+ * else raises Ebadctl.
+ *
+ * swapfile: exactly one page per write (Ebadarg otherwise).  The
+ * page at va is encrypted into swapbuf with the two AES states, off
+ * being passed to the cipher as well, and the ciphertext is written
+ * to swapchan at off.  A short write raises Eio.
+ *
+ * Returns n.  Any other qid raises Egreg.
+ */
+static long
+swapwrite(Chan *c, void *va, long n, vlong off)
+{
+	char buf[256];
+
+	switch((ulong)c->qid.path){
+	case Qswap:
+		if(!iseve())
+			error(Eperm);
+		if(n >= sizeof buf)
+			error(Egreg);
+		memmove(buf, va, n);	/* so we can NUL-terminate */
+		buf[n] = 0;
+		/* start a pager if not already started */
+		if(strncmp(buf, "start", 5) == 0)
+			kickpager();
+		/*
+		 * accept any leading digit 0-9; the upper bound used to be
+		 * written '9'<=buf[0], which only ever matched '9'
+		 */
+		else if(buf[0] >= '0' && buf[0] <= '9')
+			setswapchan(fdtochan(strtoul(buf, nil, 0), ORDWR, 1, 1));
+		else
+			error(Ebadctl);
+		return n;
+	case Qswapfile:
+		if(n != BY2PG)
+			error(Ebadarg);
+		aes_xts_encrypt(&swapkey[0], &swapkey[1], off, va, swapbuf, n);
+		if(devtab[swapchan->type]->write(swapchan, swapbuf, n, off) != n)
+			error(Eio);
+		return n;
+	}
+	error(Egreg);
+	return 0;
+}
+
+Dev swapdevtab = {	/* device table entry for the swap device */
+ L'¶',	/* device character: files are reached as #¶/... */
+ "swap",	/* device name */
+ devreset,	/* dev* entries here are generic routines not defined in this file */
+ swapinit,	/* allocates the swap map and the pageout I/O list */
+ devshutdown,
+ swapattach,
+ swapwalk,
+ swapstat,
+ swapopen,	/* sets up the encryption state when swapfile is opened */
+ devcreate,
+ swapclose,	/* tears the encryption state down again */
+ swapread,
+ devbread,
+ swapwrite,
+ devbwrite,
+ devremove,
+ devwstat,
+};