diff options
author | Taru Karttunen <taruti@taruti.net> | 2011-03-30 15:46:40 +0300 |
---|---|---|
committer | Taru Karttunen <taruti@taruti.net> | 2011-03-30 15:46:40 +0300 |
commit | e5888a1ffdae813d7575f5fb02275c6bb07e5199 (patch) | |
tree | d8d51eac403f07814b9e936eed0c9a79195e2450 /sys/src/cmd/venti |
Import sources from 2011-03-30 iso image
Diffstat (limited to 'sys/src/cmd/venti')
88 files changed, 21787 insertions, 0 deletions
diff --git a/sys/src/cmd/venti/copy.c b/sys/src/cmd/venti/copy.c new file mode 100755 index 000000000..db07dcb9d --- /dev/null +++ b/sys/src/cmd/venti/copy.c @@ -0,0 +1,262 @@ +#include <u.h> +#include <libc.h> +#include <venti.h> +#include <libsec.h> +#include <avl.h> +#include <bin.h> + +int changes; +int rewrite; +int ignoreerrors; +int fast; +int verbose; +int nskip; +int nwrite; + +VtConn *zsrc, *zdst; +uchar zeroscore[VtScoreSize]; /* all zeros */ + +typedef struct ScoreTree ScoreTree; +struct ScoreTree +{ + Avl avl; + uchar score[VtScoreSize]; + int type; +}; + +Avltree *scoretree; +Bin *scorebin; + +static int +scoretreecmp(Avl *va, Avl *vb) +{ + ScoreTree *a, *b; + int i; + + a = (ScoreTree*)va; + b = (ScoreTree*)vb; + + i = memcmp(a->score, b->score, VtScoreSize); + if(i != 0) + return i; + return a->type - b->type; +} + +static int +havevisited(uchar score[VtScoreSize], int type) +{ + ScoreTree a; + + if(scoretree == nil) + return 0; + memmove(a.score, score, VtScoreSize); + a.type = type; + return lookupavl(scoretree, &a.avl) != nil; +} + +static void +markvisited(uchar score[VtScoreSize], int type) +{ + ScoreTree *a; + Avl *old; + + if(scoretree == nil) + return; + a = binalloc(&scorebin, sizeof *a, 1); + memmove(a->score, score, VtScoreSize); + a->type = type; + insertavl(scoretree, &a->avl, &old); +} + +void +usage(void) +{ + fprint(2, "usage: %s [-fimrv] [-t type] srchost dsthost score\n", argv0); + exits("usage"); +} + +void +walk(uchar score[VtScoreSize], uint type, int base) +{ + int i, n; + uchar *buf; + uchar nscore[VtScoreSize]; + VtEntry e; + VtRoot root; + + if(memcmp(score, vtzeroscore, VtScoreSize) == 0 || memcmp(score, zeroscore, VtScoreSize) == 0) + return; + + if(havevisited(score, type)){ + nskip++; + return; + } + + buf = vtmallocz(VtMaxLumpSize); + if(fast && vtread(zdst, score, type, buf, VtMaxLumpSize) >= 0){ + if(verbose) + fprint(2, "skip %V\n", score); + free(buf); + return; + } + + n = vtread(zsrc, score, type, buf, 
VtMaxLumpSize); + if(n < 0){ + if(rewrite){ + changes++; + memmove(score, vtzeroscore, VtScoreSize); + }else if(!ignoreerrors) + sysfatal("reading block %V (type %d): %r", score, type); + return; + } + + switch(type){ + case VtRootType: + if(vtrootunpack(&root, buf) < 0){ + fprint(2, "warning: could not unpack root in %V %d\n", score, type); + break; + } + walk(root.prev, VtRootType, 0); + walk(root.score, VtDirType, 0); + if(rewrite) + vtrootpack(&root, buf); /* walk might have changed score */ + break; + + case VtDirType: + for(i=0; i<n/VtEntrySize; i++){ + if(vtentryunpack(&e, buf, i) < 0){ + fprint(2, "warning: could not unpack entry #%d in %V %d\n", i, score, type); + continue; + } + if(!(e.flags & VtEntryActive)) + continue; + walk(e.score, e.type, e.type&VtTypeBaseMask); + /* + * Don't repack unless we're rewriting -- some old + * vac files have psize==0 and dsize==0, and these + * get rewritten by vtentryunpack to have less strange + * block sizes. So vtentryunpack; vtentrypack does not + * guarantee to preserve the exact bytes in buf. 
+ */ + if(rewrite) + vtentrypack(&e, buf, i); + } + break; + + case VtDataType: + break; + + default: /* pointers */ + for(i=0; i<n; i+=VtScoreSize) + if(memcmp(buf+i, vtzeroscore, VtScoreSize) != 0) + walk(buf+i, type-1, base); + break; + } + + nwrite++; + if(vtwrite(zdst, nscore, type, buf, n) < 0){ + /* figure out score for better error message */ + /* can't use input argument - might have changed contents */ + n = vtzerotruncate(type, buf, n); + sha1(buf, n, score, nil); + sysfatal("writing block %V (type %d): %r", score, type); + } + if(!rewrite && memcmp(score, nscore, VtScoreSize) != 0){ + fprint(2, "not rewriting: wrote %V got %V\n", score, nscore); + abort(); + sysfatal("not rewriting: wrote %V got %V", score, nscore); + } + + markvisited(score, type); + free(buf); +} + +void +main(int argc, char *argv[]) +{ + int type, n; + uchar score[VtScoreSize]; + uchar *buf; + char *prefix; + + fmtinstall('F', vtfcallfmt); + fmtinstall('V', vtscorefmt); + + type = -1; + ARGBEGIN{ + case 'V': + chattyventi++; + break; + case 'f': + fast = 1; + break; + case 'i': + if(rewrite) + usage(); + ignoreerrors = 1; + break; + case 'm': + scoretree = mkavltree(scoretreecmp); + break; + case 'r': + if(ignoreerrors) + usage(); + rewrite = 1; + break; + case 't': + type = atoi(EARGF(usage())); + break; + case 'v': + verbose = 1; + break; + default: + usage(); + break; + }ARGEND + + if(argc != 3) + usage(); + + if(vtparsescore(argv[2], &prefix, score) < 0) + sysfatal("could not parse score: %r"); + + buf = vtmallocz(VtMaxLumpSize); + + zsrc = vtdial(argv[0]); + if(zsrc == nil) + sysfatal("could not dial src server: %r"); + if(vtconnect(zsrc) < 0) + sysfatal("vtconnect src: %r"); + + zdst = vtdial(argv[1]); + if(zdst == nil) + sysfatal("could not dial dst server: %r"); + if(vtconnect(zdst) < 0) + sysfatal("vtconnect dst: %r"); + + if(type != -1){ + n = vtread(zsrc, score, type, buf, VtMaxLumpSize); + if(n < 0) + sysfatal("could not read block: %r"); + }else{ + for(type=0; 
type<VtMaxType; type++){ + n = vtread(zsrc, score, type, buf, VtMaxLumpSize); + if(n >= 0) + break; + } + if(type == VtMaxType) + sysfatal("could not find block %V of any type", score); + } + + walk(score, type, VtDirType); + if(changes) + print("%s:%V (%d pointers rewritten)\n", prefix, score, changes); + + if(verbose) + print("%d skipped, %d written\n", nskip, nwrite); + + if(vtsync(zdst) < 0) + sysfatal("could not sync dst server: %r"); + + exits(0); +} diff --git a/sys/src/cmd/venti/devnull.c b/sys/src/cmd/venti/devnull.c new file mode 100755 index 000000000..fdad553e6 --- /dev/null +++ b/sys/src/cmd/venti/devnull.c @@ -0,0 +1,79 @@ +/* Copyright (c) 2004 Russ Cox */ +#include <u.h> +#include <libc.h> +#include <venti.h> +#include <thread.h> +#include <libsec.h> + +#ifndef _UNISTD_H_ +#pragma varargck type "F" VtFcall* +#pragma varargck type "T" void +#endif + +int verbose; + +enum +{ + STACK = 8192 +}; + +void +usage(void) +{ + fprint(2, "usage: venti/devnull [-v] [-a address]\n"); + threadexitsall("usage"); +} + +void +threadmain(int argc, char **argv) +{ + VtReq *r; + VtSrv *srv; + char *address; + + fmtinstall('V', vtscorefmt); + fmtinstall('F', vtfcallfmt); + + address = "tcp!*!venti"; + + ARGBEGIN{ + case 'v': + verbose++; + break; + case 'a': + address = EARGF(usage()); + break; + default: + usage(); + }ARGEND + + srv = vtlisten(address); + if(srv == nil) + sysfatal("vtlisten %s: %r", address); + + while((r = vtgetreq(srv)) != nil){ + r->rx.msgtype = r->tx.msgtype+1; + if(verbose) + fprint(2, "<- %F\n", &r->tx); + switch(r->tx.msgtype){ + case VtTping: + break; + case VtTgoodbye: + break; + case VtTread: + r->rx.error = vtstrdup("no such block"); + r->rx.msgtype = VtRerror; + break; + case VtTwrite: + packetsha1(r->tx.data, r->rx.score); + break; + case VtTsync: + break; + } + if(verbose) + fprint(2, "-> %F\n", &r->rx); + vtrespond(r); + } + threadexitsall(nil); +} + diff --git a/sys/src/cmd/venti/mkfile b/sys/src/cmd/venti/mkfile new file mode 100755 
index 000000000..0c1ed7f9a --- /dev/null +++ b/sys/src/cmd/venti/mkfile @@ -0,0 +1,28 @@ +</$objtype/mkfile + +TARG=\ + copy\ + read\ + ro\ + sync\ + write\ + + +BIN=/$objtype/bin/venti + +</sys/src/cmd/mkmany + +CFLAGS=$CFLAGS -I. + +extra:V: $O.devnull $O.mkroot $O.randtest $O.readlist $O.root + +all:V: srv.all.dir +install:V: srv.install.dir +installall:V: srv.installall.dir +safeinstall:V: srv.safeinstall.dir +safeinstallall:V: srv.safeinstallall.dir +clean:V: srv.clean.dir +nuke:V: srv.nuke.dir + +srv.%.dir:V: + @{ cd srv && mk $stem } diff --git a/sys/src/cmd/venti/mkroot.c b/sys/src/cmd/venti/mkroot.c new file mode 100755 index 000000000..8c38b1d16 --- /dev/null +++ b/sys/src/cmd/venti/mkroot.c @@ -0,0 +1,61 @@ +#include <u.h> +#include <libc.h> +#include <venti.h> +#include <thread.h> + +char *host; + +void +usage(void) +{ + fprint(2, "usage: mkroot [-h host] name type score blocksize prev\n"); + threadexitsall("usage"); +} + +void +threadmain(int argc, char *argv[]) +{ + uchar score[VtScoreSize]; + uchar buf[VtRootSize]; + VtConn *z; + VtRoot root; + + ARGBEGIN{ + case 'h': + host = EARGF(usage()); + break; + default: + usage(); + break; + }ARGEND + + if(argc != 5) + usage(); + + fmtinstall('V', vtscorefmt); + fmtinstall('F', vtfcallfmt); + + strecpy(root.name, root.name+sizeof root.name, argv[0]); + strecpy(root.type, root.type+sizeof root.type, argv[1]); + if(vtparsescore(argv[2], nil, root.score) < 0) + sysfatal("bad score '%s'", argv[2]); + root.blocksize = atoi(argv[3]); + if(vtparsescore(argv[4], nil, root.prev) < 0) + sysfatal("bad score '%s'", argv[4]); + vtrootpack(&root, buf); + + z = vtdial(host); + if(z == nil) + sysfatal("could not connect to server: %r"); + + if(vtconnect(z) < 0) + sysfatal("vtconnect: %r"); + + if(vtwrite(z, score, VtRootType, buf, VtRootSize) < 0) + sysfatal("vtwrite: %r"); + if(vtsync(z) < 0) + sysfatal("vtsync: %r"); + vthangup(z); + print("%V\n", score); + threadexitsall(0); +} diff --git a/sys/src/cmd/venti/randtest.c 
b/sys/src/cmd/venti/randtest.c new file mode 100755 index 000000000..2a1fa6ee7 --- /dev/null +++ b/sys/src/cmd/venti/randtest.c @@ -0,0 +1,335 @@ +#include <u.h> +#include <libc.h> +#include <venti.h> +#include <libsec.h> +#include <thread.h> + + +enum { STACK = 32768 }; +void xxxsrand(long); +long xxxlrand(void); + +Channel *cw; +Channel *cr; +char *host; +int blocksize, seed, randpct; +int doread, dowrite, packets, permute; +vlong totalbytes, cur; +VtConn *z; +int multi; +int maxpackets; +int sequence; +int doublecheck = 1; +uint *order; + +void +usage(void) +{ + fprint(2, "usage: randtest [-q] [-h host] [-s seed] [-b blocksize] [-p randpct] [-n totalbytes] [-M maxblocks] [-P] [-r] [-w]\n"); + threadexitsall("usage"); +} + +void +wr(char *buf, char *buf2) +{ + uchar score[VtScoreSize], score2[VtScoreSize]; + DigestState ds; + + USED(buf2); + memset(&ds, 0, sizeof ds); + if(doublecheck) + sha1((uchar*)buf, blocksize, score, &ds); + if(vtwrite(z, score2, VtDataType, (uchar*)buf, blocksize) < 0) + sysfatal("vtwrite %V at %,lld: %r", score, cur); + if(doublecheck && memcmp(score, score2, VtScoreSize) != 0) + sysfatal("score mismatch! %V %V", score, score2); +} + +void +wrthread(void *v) +{ + char *p; + + USED(v); + while((p = recvp(cw)) != nil){ + wr(p, nil); + free(p); + } +} + +void +rd(char *buf, char *buf2) +{ + uchar score[VtScoreSize]; + DigestState ds; + + memset(&ds, 0, sizeof ds); + sha1((uchar*)buf, blocksize, score, &ds); + if(vtread(z, score, VtDataType, (uchar*)buf2, blocksize) < 0) + sysfatal("vtread %V at %,lld: %r", score, cur); + if(memcmp(buf, buf2, blocksize) != 0) + sysfatal("bad data read! 
%V", score); +} + +void +rdthread(void *v) +{ + char *p, *buf2; + + buf2 = vtmalloc(blocksize); + USED(v); + while((p = recvp(cr)) != nil){ + rd(p, buf2); + free(p); + } +} + +char *template; + +void +run(void (*fn)(char*, char*), Channel *c) +{ + int i, t, j, packets; + char *buf2, *buf; + + buf2 = vtmalloc(blocksize); + buf = vtmalloc(blocksize); + cur = 0; + packets = totalbytes/blocksize; + if(maxpackets == 0) + maxpackets = packets; + order = vtmalloc(packets*sizeof order[0]); + for(i=0; i<packets; i++) + order[i] = i; + if(permute){ + for(i=1; i<packets; i++){ + j = nrand(i+1); + t = order[i]; + order[i] = order[j]; + order[j] = t; + } + } + for(i=0; i<packets && i<maxpackets; i++){ + memmove(buf, template, blocksize); + *(uint*)buf = order[i]; + if(c){ + sendp(c, buf); + buf = vtmalloc(blocksize); + }else + (*fn)(buf, buf2); + cur += blocksize; + } + free(order); +} + +#define TWID64 ((u64int)~(u64int)0) + +u64int +unittoull(char *s) +{ + char *es; + u64int n; + + if(s == nil) + return TWID64; + n = strtoul(s, &es, 0); + if(*es == 'k' || *es == 'K'){ + n *= 1024; + es++; + }else if(*es == 'm' || *es == 'M'){ + n *= 1024*1024; + es++; + }else if(*es == 'g' || *es == 'G'){ + n *= 1024*1024*1024; + es++; + }else if(*es == 't' || *es == 'T'){ + n *= 1024*1024; + n *= 1024*1024; + } + if(*es != '\0') + return TWID64; + return n; +} + +void +threadmain(int argc, char *argv[]) +{ + int i, max; + vlong t0; + double t; + + blocksize = 8192; + seed = 0; + randpct = 50; + host = nil; + doread = 0; + dowrite = 0; + totalbytes = 1*1024*1024*1024; + fmtinstall('V', vtscorefmt); + fmtinstall('F', vtfcallfmt); + + ARGBEGIN{ + case 'b': + blocksize = unittoull(EARGF(usage())); + break; + case 'h': + host = EARGF(usage()); + break; + case 'M': + maxpackets = unittoull(EARGF(usage())); + break; + case 'm': + multi = atoi(EARGF(usage())); + break; + case 'n': + totalbytes = unittoull(EARGF(usage())); + break; + case 'p': + randpct = atoi(EARGF(usage())); + break; + case 'P': + 
permute = 1; + break; + case 'S': + doublecheck = 0; + ventidoublechecksha1 = 0; + break; + case 's': + seed = atoi(EARGF(usage())); + break; + case 'r': + doread = 1; + break; + case 'w': + dowrite = 1; + break; + case 'V': + chattyventi++; + break; + default: + usage(); + }ARGEND + + if(doread==0 && dowrite==0){ + doread = 1; + dowrite = 1; + } + + z = vtdial(host); + if(z == nil) + sysfatal("could not connect to server: %r"); + if(vtconnect(z) < 0) + sysfatal("vtconnect: %r"); + + if(multi){ + cr = chancreate(sizeof(void*), 0); + cw = chancreate(sizeof(void*), 0); + for(i=0; i<multi; i++){ + proccreate(wrthread, nil, STACK); + proccreate(rdthread, nil, STACK); + } + } + + template = vtmalloc(blocksize); + xxxsrand(seed); + max = (256*randpct)/100; + if(max == 0) + max = 1; + for(i=0; i<blocksize; i++) + template[i] = xxxlrand()%max; + if(dowrite){ + t0 = nsec(); + run(wr, cw); + for(i=0; i<multi; i++) + sendp(cw, nil); + t = (nsec() - t0)/1.e9; + print("write: %lld bytes / %.3f seconds = %.6f MB/s\n", + totalbytes, t, (double)totalbytes/1e6/t); + } + if(doread){ + t0 = nsec(); + run(rd, cr); + for(i=0; i<multi; i++) + sendp(cr, nil); + t = (nsec() - t0)/1.e9; + print("read: %lld bytes / %.3f seconds = %.6f MB/s\n", + totalbytes, t, (double)totalbytes/1e6/t); + } + threadexitsall(nil); +} + + +/* + * algorithm by + * D. P. Mitchell & J. A. 
Reeds + */ + +#define LEN 607 +#define TAP 273 +#define MASK 0x7fffffffL +#define A 48271 +#define M 2147483647 +#define Q 44488 +#define R 3399 +#define NORM (1.0/(1.0+MASK)) + +static ulong rng_vec[LEN]; +static ulong* rng_tap = rng_vec; +static ulong* rng_feed = 0; + +static void +isrand(long seed) +{ + long lo, hi, x; + int i; + + rng_tap = rng_vec; + rng_feed = rng_vec+LEN-TAP; + seed = seed%M; + if(seed < 0) + seed += M; + if(seed == 0) + seed = 89482311; + x = seed; + /* + * Initialize by x[n+1] = 48271 * x[n] mod (2**31 - 1) + */ + for(i = -20; i < LEN; i++) { + hi = x / Q; + lo = x % Q; + x = A*lo - R*hi; + if(x < 0) + x += M; + if(i >= 0) + rng_vec[i] = x; + } +} + +void +xxxsrand(long seed) +{ + isrand(seed); +} + +long +xxxlrand(void) +{ + ulong x; + + rng_tap--; + if(rng_tap < rng_vec) { + if(rng_feed == 0) { + isrand(1); + rng_tap--; + } + rng_tap += LEN; + } + rng_feed--; + if(rng_feed < rng_vec) + rng_feed += LEN; + x = (*rng_feed + *rng_tap) & MASK; + *rng_feed = x; + + return x; +} + diff --git a/sys/src/cmd/venti/read.c b/sys/src/cmd/venti/read.c new file mode 100755 index 000000000..a48e62e6a --- /dev/null +++ b/sys/src/cmd/venti/read.c @@ -0,0 +1,74 @@ +#include <u.h> +#include <libc.h> +#include <venti.h> +#include <libsec.h> +#include <thread.h> + +void +usage(void) +{ + fprint(2, "usage: read [-h host] [-t type] score\n"); + threadexitsall("usage"); +} + +void +threadmain(int argc, char *argv[]) +{ + int type, n; + uchar score[VtScoreSize]; + uchar *buf; + VtConn *z; + char *host; + + fmtinstall('F', vtfcallfmt); + fmtinstall('V', vtscorefmt); + + host = nil; + type = -1; + ARGBEGIN{ + case 'h': + host = EARGF(usage()); + break; + case 't': + type = atoi(EARGF(usage())); + break; + default: + usage(); + break; + }ARGEND + + if(argc != 1) + usage(); + + if(vtparsescore(argv[0], nil, score) < 0) + sysfatal("could not parse score '%s': %r", argv[0]); + + buf = vtmallocz(VtMaxLumpSize); + + z = vtdial(host); + if(z == nil) + sysfatal("could not 
connect to server: %r"); + + if(vtconnect(z) < 0) + sysfatal("vtconnect: %r"); + + if(type == -1){ + n = -1; + for(type=0; type<VtMaxType; type++){ + n = vtread(z, score, type, buf, VtMaxLumpSize); + if(n >= 0){ + fprint(2, "venti/read%s%s %V %d\n", host ? " -h" : "", host ? host : "", + score, type); + break; + } + } + }else + n = vtread(z, score, type, buf, VtMaxLumpSize); + + vthangup(z); + if(n < 0) + sysfatal("could not read block: %r"); + if(write(1, buf, n) != n) + sysfatal("write: %r"); + threadexitsall(0); +} diff --git a/sys/src/cmd/venti/readlist.c b/sys/src/cmd/venti/readlist.c new file mode 100755 index 000000000..6d928086a --- /dev/null +++ b/sys/src/cmd/venti/readlist.c @@ -0,0 +1,112 @@ +#include <u.h> +#include <libc.h> +#include <thread.h> +#include <venti.h> +#include <bio.h> + +char *host; +Biobuf b; +VtConn *z; +uchar *buf; +void run(Biobuf*); +int nn; + +void +usage(void) +{ + fprint(2, "usage: readlist [-h host] list\n"); + threadexitsall("usage"); +} + +int +parsescore(uchar *score, char *buf, int n) +{ + int i, c; + + memset(score, 0, VtScoreSize); + + if(n != VtScoreSize*2){ + werrstr("score wrong length %d", n); + return -1; + } + for(i=0; i<VtScoreSize*2; i++) { + if(buf[i] >= '0' && buf[i] <= '9') + c = buf[i] - '0'; + else if(buf[i] >= 'a' && buf[i] <= 'f') + c = buf[i] - 'a' + 10; + else if(buf[i] >= 'A' && buf[i] <= 'F') + c = buf[i] - 'A' + 10; + else { + c = buf[i]; + werrstr("bad score char %d '%c'", c, c); + return -1; + } + + if((i & 1) == 0) + c <<= 4; + + score[i>>1] |= c; + } + return 0; +} + +void +threadmain(int argc, char *argv[]) +{ + int fd, i; + + ARGBEGIN{ + case 'h': + host = EARGF(usage()); + break; + default: + usage(); + break; + }ARGEND + + fmtinstall('V', vtscorefmt); + buf = vtmallocz(VtMaxLumpSize); + z = vtdial(host); + if(z == nil) + sysfatal("could not connect to server: %r"); + if(vtconnect(z) < 0) + sysfatal("vtconnect: %r"); + + if(argc == 0){ + Binit(&b, 0, OREAD); + run(&b); + }else{ + for(i=0; i<argc; 
i++){ + if((fd = open(argv[i], OREAD)) < 0) + sysfatal("open %s: %r", argv[i]); + Binit(&b, fd, OREAD); + run(&b); + } + } + threadexitsall(nil); +} + +void +run(Biobuf *b) +{ + char *p, *f[10]; + int nf; + uchar score[20]; + int type, n; + + while((p = Brdline(b, '\n')) != nil){ + p[Blinelen(b)-1] = 0; + nf = tokenize(p, f, nelem(f)); + if(nf != 2) + sysfatal("syntax error in work list"); + if(parsescore(score, f[0], strlen(f[0])) < 0) + sysfatal("bad score %s in work list", f[0]); + type = atoi(f[1]); + n = vtread(z, score, type, buf, VtMaxLumpSize); + if(n < 0) + sysfatal("could not read %s %s: %r", f[0], f[1]); + /* write(1, buf, n); */ + if(++nn%1000 == 0) + print("%d...", nn); + } +} diff --git a/sys/src/cmd/venti/ro.c b/sys/src/cmd/venti/ro.c new file mode 100755 index 000000000..ee954a32b --- /dev/null +++ b/sys/src/cmd/venti/ro.c @@ -0,0 +1,112 @@ +/* Copyright (c) 2004 Russ Cox */ +#include <u.h> +#include <libc.h> +#include <venti.h> +#include <thread.h> +#include <libsec.h> + +#ifndef _UNISTD_H_ +#pragma varargck type "F" VtFcall* +#pragma varargck type "T" void +#endif + +VtConn *z; +int verbose; + +enum +{ + STACK = 8192 +}; + +void +usage(void) +{ + fprint(2, "usage: venti/ro [-v] [-a address] [-h address]\n"); + threadexitsall("usage"); +} + +void +readthread(void *v) +{ + char err[ERRMAX]; + VtReq *r; + uchar *buf; + int n; + + r = v; + buf = vtmalloc(r->tx.count); + if((n=vtread(z, r->tx.score, r->tx.blocktype, buf, r->tx.count)) < 0){ + r->rx.msgtype = VtRerror; + rerrstr(err, sizeof err); + r->rx.error = vtstrdup(err); + free(buf); + }else{ + r->rx.data = packetforeign(buf, n, free, buf); + } + if(verbose) + fprint(2, "-> %F\n", &r->rx); + vtrespond(r); +} + +void +threadmain(int argc, char **argv) +{ + VtReq *r; + VtSrv *srv; + char *address, *ventiaddress; + + fmtinstall('F', vtfcallfmt); + fmtinstall('V', vtscorefmt); + + address = "tcp!*!venti"; + ventiaddress = nil; + + ARGBEGIN{ + case 'v': + verbose++; + break; + case 'a': + address = 
EARGF(usage()); + break; + case 'h': + ventiaddress = EARGF(usage()); + break; + default: + usage(); + }ARGEND + + if((z = vtdial(ventiaddress)) == nil) + sysfatal("vtdial %s: %r", ventiaddress); + if(vtconnect(z) < 0) + sysfatal("vtconnect: %r"); + + srv = vtlisten(address); + if(srv == nil) + sysfatal("vtlisten %s: %r", address); + + while((r = vtgetreq(srv)) != nil){ + r->rx.msgtype = r->tx.msgtype+1; + if(verbose) + fprint(2, "<- %F\n", &r->tx); + switch(r->tx.msgtype){ + case VtTping: + break; + case VtTgoodbye: + break; + case VtTread: + threadcreate(readthread, r, 16384); + continue; + case VtTwrite: + r->rx.error = vtstrdup("read-only server"); + r->rx.msgtype = VtRerror; + break; + case VtTsync: + break; + } + if(verbose) + fprint(2, "-> %F\n", &r->rx); + vtrespond(r); + } + threadexitsall(nil); +} + diff --git a/sys/src/cmd/venti/root.c b/sys/src/cmd/venti/root.c new file mode 100755 index 000000000..5d67ad316 --- /dev/null +++ b/sys/src/cmd/venti/root.c @@ -0,0 +1,72 @@ +#include <u.h> +#include <libc.h> +#include <venti.h> +#include <libsec.h> +#include <thread.h> + +void +usage(void) +{ + fprint(2, "usage: root [-h host] score\n"); + threadexitsall("usage"); +} + +void +threadmain(int argc, char *argv[]) +{ + int i, n; + uchar score[VtScoreSize]; + uchar *buf; + VtConn *z; + char *host; + VtRoot root; + + fmtinstall('F', vtfcallfmt); + fmtinstall('V', vtscorefmt); + quotefmtinstall(); + + host = nil; + ARGBEGIN{ + case 'h': + host = EARGF(usage()); + break; + default: + usage(); + break; + }ARGEND + + if(argc == 0) + usage(); + + buf = vtmallocz(VtMaxLumpSize); + + z = vtdial(host); + if(z == nil) + sysfatal("could not connect to server: %r"); + + if(vtconnect(z) < 0) + sysfatal("vtconnect: %r"); + + for(i=0; i<argc; i++){ + if(vtparsescore(argv[i], nil, score) < 0){ + fprint(2, "cannot parse score '%s': %r\n", argv[i]); + continue; + } + n = vtread(z, score, VtRootType, buf, VtMaxLumpSize); + if(n < 0){ + fprint(2, "could not read block %V: %r\n", 
score); + continue; + } + if(n != VtRootSize){ + fprint(2, "block %V is wrong size %d != 300\n", score, n); + continue; + } + if(vtrootunpack(&root, buf) < 0){ + fprint(2, "unpacking block %V: %r\n", score); + continue; + } + print("%V: %q %q %V %d %V\n", score, root.name, root.type, root.score, root.blocksize, root.prev); + } + vthangup(z); + threadexitsall(0); +} diff --git a/sys/src/cmd/venti/srv/arena.c b/sys/src/cmd/venti/srv/arena.c new file mode 100755 index 000000000..2a176cded --- /dev/null +++ b/sys/src/cmd/venti/srv/arena.c @@ -0,0 +1,931 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +typedef struct ASum ASum; + +struct ASum +{ + Arena *arena; + ASum *next; +}; + +static void sealarena(Arena *arena); +static int okarena(Arena *arena); +static int loadarena(Arena *arena); +static CIBlock *getcib(Arena *arena, int clump, int writing, CIBlock *rock); +static void putcib(Arena *arena, CIBlock *cib); +static void sumproc(void *); +static void loadcig(Arena *arena); + +static QLock sumlock; +static Rendez sumwait; +static ASum *sumq; +static ASum *sumqtail; +static uchar zero[8192]; + +int arenasumsleeptime; + +int +initarenasum(void) +{ + needzeroscore(); /* OS X */ + + sumwait.l = &sumlock; + + if(vtproc(sumproc, nil) < 0){ + seterr(EOk, "can't start arena checksum slave: %r"); + return -1; + } + return 0; +} + +/* + * make an Arena, and initialize it based upon the disk header and trailer. 
+ */ +Arena* +initarena(Part *part, u64int base, u64int size, u32int blocksize) +{ + Arena *arena; + + arena = MKZ(Arena); + arena->part = part; + arena->blocksize = blocksize; + arena->clumpmax = arena->blocksize / ClumpInfoSize; + arena->base = base + blocksize; + arena->size = size - 2 * blocksize; + + if(loadarena(arena) < 0){ + seterr(ECorrupt, "arena header or trailer corrupted"); + freearena(arena); + return nil; + } + if(okarena(arena) < 0){ + freearena(arena); + return nil; + } + + if(arena->diskstats.sealed && scorecmp(zeroscore, arena->score)==0) + sealarena(arena); + + return arena; +} + +void +freearena(Arena *arena) +{ + if(arena == nil) + return; + free(arena); +} + +Arena* +newarena(Part *part, u32int vers, char *name, u64int base, u64int size, u32int blocksize) +{ + int bsize; + Arena *arena; + + if(nameok(name) < 0){ + seterr(EOk, "illegal arena name", name); + return nil; + } + arena = MKZ(Arena); + arena->part = part; + arena->version = vers; + if(vers == ArenaVersion4) + arena->clumpmagic = _ClumpMagic; + else{ + do + arena->clumpmagic = fastrand(); + while(arena->clumpmagic==_ClumpMagic || arena->clumpmagic==0); + } + arena->blocksize = blocksize; + arena->clumpmax = arena->blocksize / ClumpInfoSize; + arena->base = base + blocksize; + arena->size = size - 2 * blocksize; + + namecp(arena->name, name); + + bsize = sizeof zero; + if(bsize > arena->blocksize) + bsize = arena->blocksize; + + if(wbarena(arena)<0 || wbarenahead(arena)<0 + || writepart(arena->part, arena->base, zero, bsize)<0){ + freearena(arena); + return nil; + } + + return arena; +} + +int +readclumpinfo(Arena *arena, int clump, ClumpInfo *ci) +{ + CIBlock *cib, r; + + cib = getcib(arena, clump, 0, &r); + if(cib == nil) + return -1; + unpackclumpinfo(ci, &cib->data->data[cib->offset]); + putcib(arena, cib); + return 0; +} + +int +readclumpinfos(Arena *arena, int clump, ClumpInfo *cis, int n) +{ + CIBlock *cib, r; + int i; + + /* + * because the clump blocks are laid out + * in 
reverse order at the end of the arena, + * it can be a few percent faster to read + * the clumps backwards, which reads the + * disk blocks forwards. + */ + for(i = n-1; i >= 0; i--){ + cib = getcib(arena, clump + i, 0, &r); + if(cib == nil){ + n = i; + continue; + } + unpackclumpinfo(&cis[i], &cib->data->data[cib->offset]); + putcib(arena, cib); + } + return n; +} + +/* + * write directory information for one clump + * must be called the arena locked + */ +int +writeclumpinfo(Arena *arena, int clump, ClumpInfo *ci) +{ + CIBlock *cib, r; + + cib = getcib(arena, clump, 1, &r); + if(cib == nil) + return -1; + dirtydblock(cib->data, DirtyArenaCib); + packclumpinfo(ci, &cib->data->data[cib->offset]); + putcib(arena, cib); + return 0; +} + +u64int +arenadirsize(Arena *arena, u32int clumps) +{ + return ((clumps / arena->clumpmax) + 1) * arena->blocksize; +} + +/* + * read a clump of data + * n is a hint of the size of the data, not including the header + * make sure it won't run off the end, then return the number of bytes actually read + */ +u32int +readarena(Arena *arena, u64int aa, u8int *buf, long n) +{ + DBlock *b; + u64int a; + u32int blocksize, off, m; + long nn; + + if(n == 0) + return -1; + + qlock(&arena->lock); + a = arena->size - arenadirsize(arena, arena->memstats.clumps); + qunlock(&arena->lock); + if(aa >= a){ + seterr(EOk, "reading beyond arena clump storage: clumps=%d aa=%lld a=%lld -1 clumps=%lld\n", arena->memstats.clumps, aa, a, arena->size - arenadirsize(arena, arena->memstats.clumps - 1)); + return -1; + } + if(aa + n > a) + n = a - aa; + + blocksize = arena->blocksize; + a = arena->base + aa; + off = a & (blocksize - 1); + a -= off; + nn = 0; + for(;;){ + b = getdblock(arena->part, a, OREAD); + if(b == nil) + return -1; + m = blocksize - off; + if(m > n - nn) + m = n - nn; + memmove(&buf[nn], &b->data[off], m); + putdblock(b); + nn += m; + if(nn == n) + break; + off = 0; + a += blocksize; + } + return n; +} + +/* + * write some data to the clump 
section at a given offset
 * used to fix up corrupted arenas.
 */
/*
 * Copies n bytes from clbuf to arena offset aa, holding the arena
 * lock throughout.  Returns n on success; returns -1 (as a u32int)
 * when n is 0, when the range reaches past the clump storage, or
 * when a disk block cannot be obtained.
 */
u32int
writearena(Arena *arena, u64int aa, u8int *clbuf, u32int n)
{
	DBlock *b;
	u64int a;
	u32int blocksize, off, m, nn;

	if(n == 0)
		return -1;

	qlock(&arena->lock);
	/* a = end of clump storage, i.e. where the directory begins */
	a = arena->size - arenadirsize(arena, arena->memstats.clumps);
	if(aa >= a || aa + n > a){
		qunlock(&arena->lock);
		seterr(EOk, "writing beyond arena clump storage");
		return -1;
	}

	blocksize = arena->blocksize;
	a = arena->base + aa;
	off = a & (blocksize - 1);
	a -= off;
	nn = 0;
	for(;;){
		/*
		 * A block we only partly overwrite must be opened ORDWR
		 * so the bytes we leave alone survive.  Decide this from
		 * the bytes still to be written, not from the total n:
		 * otherwise the partial last block of a multi-block write
		 * is opened OWRITE.
		 */
		b = getdblock(arena->part, a, (off != 0 || n - nn < blocksize) ? ORDWR : OWRITE);
		if(b == nil){
			qunlock(&arena->lock);
			return -1;
		}
		dirtydblock(b, DirtyArena);
		m = blocksize - off;
		if(m > n - nn)
			m = n - nn;
		memmove(&b->data[off], &clbuf[nn], m);
		putdblock(b);
		nn += m;
		if(nn == n)
			break;
		off = 0;
		a += blocksize;
	}
	qunlock(&arena->lock);
	return n;
}

/*
 * allocate space for the clump and write it,
 * updating the arena directory
ZZZ question: should this distinguish between an arena
filling up and real errors writing the clump?
+ */ +u64int +writeaclump(Arena *arena, Clump *c, u8int *clbuf) +{ + DBlock *b; + u64int a, aa; + u32int clump, n, nn, m, off, blocksize; + int ok; + + n = c->info.size + ClumpSize + U32Size; + qlock(&arena->lock); + aa = arena->memstats.used; + if(arena->memstats.sealed + || aa + n + U32Size + arenadirsize(arena, arena->memstats.clumps + 1) > arena->size){ + if(!arena->memstats.sealed){ + logerr(EOk, "seal memstats %s", arena->name); + arena->memstats.sealed = 1; + wbarena(arena); + } + qunlock(&arena->lock); + return TWID64; + } + if(packclump(c, &clbuf[0], arena->clumpmagic) < 0){ + qunlock(&arena->lock); + return TWID64; + } + + /* + * write the data out one block at a time + */ + blocksize = arena->blocksize; + a = arena->base + aa; + off = a & (blocksize - 1); + a -= off; + nn = 0; + for(;;){ + b = getdblock(arena->part, a, off != 0 ? ORDWR : OWRITE); + if(b == nil){ + qunlock(&arena->lock); + return TWID64; + } + dirtydblock(b, DirtyArena); + m = blocksize - off; + if(m > n - nn) + m = n - nn; + memmove(&b->data[off], &clbuf[nn], m); + ok = 0; + putdblock(b); + if(ok < 0){ + qunlock(&arena->lock); + return TWID64; + } + nn += m; + if(nn == n) + break; + off = 0; + a += blocksize; + } + + arena->memstats.used += c->info.size + ClumpSize; + arena->memstats.uncsize += c->info.uncsize; + if(c->info.size < c->info.uncsize) + arena->memstats.cclumps++; + + clump = arena->memstats.clumps; + if(clump % ArenaCIGSize == 0){ + if(arena->cig == nil){ + loadcig(arena); + if(arena->cig == nil) + goto NoCIG; + } + /* add aa as start of next cig */ + if(clump/ArenaCIGSize != arena->ncig){ + fprint(2, "bad arena cig computation %s: writing clump %d but %d cigs\n", + arena->name, clump, arena->ncig); + arena->ncig = -1; + vtfree(arena->cig); + arena->cig = nil; + goto NoCIG; + } + arena->cig = vtrealloc(arena->cig, (arena->ncig+1)*sizeof arena->cig[0]); + arena->cig[arena->ncig++].offset = aa; + } +NoCIG: + arena->memstats.clumps++; + + if(arena->memstats.clumps == 0) + 
sysfatal("clumps wrapped"); + arena->wtime = now(); + if(arena->ctime == 0) + arena->ctime = arena->wtime; + + writeclumpinfo(arena, clump, &c->info); + wbarena(arena); + + qunlock(&arena->lock); + + return aa; +} + +int +atailcmp(ATailStats *a, ATailStats *b) +{ + /* good test */ + if(a->used < b->used) + return -1; + if(a->used > b->used) + return 1; + + /* suspect tests - why order this way? (no one cares) */ + if(a->clumps < b->clumps) + return -1; + if(a->clumps > b->clumps) + return 1; + if(a->cclumps < b->cclumps) + return -1; + if(a->cclumps > b->cclumps) + return 1; + if(a->uncsize < b->uncsize) + return -1; + if(a->uncsize > b->uncsize) + return 1; + if(a->sealed < b->sealed) + return -1; + if(a->sealed > b->sealed) + return 1; + + /* everything matches */ + return 0; +} + +void +setatailstate(AState *as) +{ + int i, j, osealed; + Arena *a; + Index *ix; + + trace(0, "setatailstate %s 0x%llux clumps %d", as->arena->name, as->aa, as->stats.clumps); + + /* + * Look up as->arena to find index. + */ + needmainindex(); /* OS X linker */ + ix = mainindex; + for(i=0; i<ix->narenas; i++) + if(ix->arenas[i] == as->arena) + break; + if(i==ix->narenas || as->aa < ix->amap[i].start || as->aa >= ix->amap[i].stop || as->arena != ix->arenas[i]){ + fprint(2, "funny settailstate 0x%llux\n", as->aa); + return; + } + + for(j=0; j<=i; j++){ + a = ix->arenas[j]; + if(atailcmp(&a->diskstats, &a->memstats) == 0) + continue; + qlock(&a->lock); + osealed = a->diskstats.sealed; + if(j == i) + a->diskstats = as->stats; + else + a->diskstats = a->memstats; + wbarena(a); + if(a->diskstats.sealed != osealed && !a->inqueue) + sealarena(a); + qunlock(&a->lock); + } +} + +/* + * once sealed, an arena never has any data added to it. + * it should only be changed to fix errors. + * this also syncs the clump directory. 
+ */ +static void +sealarena(Arena *arena) +{ + arena->inqueue = 1; + backsumarena(arena); +} + +void +backsumarena(Arena *arena) +{ + ASum *as; + + if(sumwait.l == nil) + return; + + as = MK(ASum); + if(as == nil) + return; + qlock(&sumlock); + as->arena = arena; + as->next = nil; + if(sumq) + sumqtail->next = as; + else + sumq = as; + sumqtail = as; + rwakeup(&sumwait); + qunlock(&sumlock); +} + +static void +sumproc(void *unused) +{ + ASum *as; + Arena *arena; + + USED(unused); + + for(;;){ + qlock(&sumlock); + while(sumq == nil) + rsleep(&sumwait); + as = sumq; + sumq = as->next; + qunlock(&sumlock); + arena = as->arena; + free(as); + + sumarena(arena); + } +} + +void +sumarena(Arena *arena) +{ + ZBlock *b; + DigestState s; + u64int a, e; + u32int bs; + int t; + u8int score[VtScoreSize]; + + bs = MaxIoSize; + if(bs < arena->blocksize) + bs = arena->blocksize; + + /* + * read & sum all blocks except the last one + */ + flushdcache(); + memset(&s, 0, sizeof s); + b = alloczblock(bs, 0, arena->part->blocksize); + e = arena->base + arena->size; + for(a = arena->base - arena->blocksize; a + arena->blocksize <= e; a += bs){ + disksched(); + while((t=arenasumsleeptime) == SleepForever){ + sleep(1000); + disksched(); + } + sleep(t); + if(a + bs > e) + bs = arena->blocksize; + if(readpart(arena->part, a, b->data, bs) < 0) + goto ReadErr; + addstat(StatSumRead, 1); + addstat(StatSumReadBytes, bs); + sha1(b->data, bs, nil, &s); + } + + /* + * the last one is special, since it may already have the checksum included + */ + bs = arena->blocksize; + if(readpart(arena->part, e, b->data, bs) < 0){ +ReadErr: + logerr(EOk, "sumarena can't sum %s, read at %lld failed: %r", arena->name, a); + freezblock(b); + return; + } + addstat(StatSumRead, 1); + addstat(StatSumReadBytes, bs); + + sha1(b->data, bs-VtScoreSize, nil, &s); + sha1(zeroscore, VtScoreSize, nil, &s); + sha1(nil, 0, score, &s); + + /* + * check for no checksum or the same + */ + if(scorecmp(score, &b->data[bs - 
VtScoreSize]) != 0 + && scorecmp(zeroscore, &b->data[bs - VtScoreSize]) != 0) + logerr(EOk, "overwriting mismatched checksums for arena=%s, found=%V calculated=%V", + arena->name, &b->data[bs - VtScoreSize], score); + freezblock(b); + + qlock(&arena->lock); + scorecp(arena->score, score); + wbarena(arena); + qunlock(&arena->lock); +} + +/* + * write the arena trailer block to the partition + */ +int +wbarena(Arena *arena) +{ + DBlock *b; + int bad; + + if((b = getdblock(arena->part, arena->base + arena->size, OWRITE)) == nil){ + logerr(EAdmin, "can't write arena trailer: %r"); + return -1; + } + dirtydblock(b, DirtyArenaTrailer); + bad = okarena(arena)<0 || packarena(arena, b->data)<0; + scorecp(b->data + arena->blocksize - VtScoreSize, arena->score); + putdblock(b); + if(bad) + return -1; + return 0; +} + +int +wbarenahead(Arena *arena) +{ + ZBlock *b; + ArenaHead head; + int bad; + + namecp(head.name, arena->name); + head.version = arena->version; + head.size = arena->size + 2 * arena->blocksize; + head.blocksize = arena->blocksize; + head.clumpmagic = arena->clumpmagic; + b = alloczblock(arena->blocksize, 1, arena->part->blocksize); + if(b == nil){ + logerr(EAdmin, "can't write arena header: %r"); +/* ZZZ add error message? */ + return -1; + } + /* + * this writepart is okay because it only happens + * during initialization. 
+ */ + bad = packarenahead(&head, b->data)<0 || + writepart(arena->part, arena->base - arena->blocksize, b->data, arena->blocksize)<0 || + flushpart(arena->part)<0; + freezblock(b); + if(bad) + return -1; + return 0; +} + +/* + * read the arena header and trailer blocks from disk + */ +static int +loadarena(Arena *arena) +{ + ArenaHead head; + ZBlock *b; + + b = alloczblock(arena->blocksize, 0, arena->part->blocksize); + if(b == nil) + return -1; + if(readpart(arena->part, arena->base + arena->size, b->data, arena->blocksize) < 0){ + freezblock(b); + return -1; + } + if(unpackarena(arena, b->data) < 0){ + freezblock(b); + return -1; + } + if(arena->version != ArenaVersion4 && arena->version != ArenaVersion5){ + seterr(EAdmin, "unknown arena version %d", arena->version); + freezblock(b); + return -1; + } + scorecp(arena->score, &b->data[arena->blocksize - VtScoreSize]); + + if(readpart(arena->part, arena->base - arena->blocksize, b->data, arena->blocksize) < 0){ + logerr(EAdmin, "can't read arena header: %r"); + freezblock(b); + return 0; + } + if(unpackarenahead(&head, b->data) < 0) + logerr(ECorrupt, "corrupted arena header: %r"); + else if(namecmp(arena->name, head.name)!=0 + || arena->clumpmagic != head.clumpmagic + || arena->version != head.version + || arena->blocksize != head.blocksize + || arena->size + 2 * arena->blocksize != head.size){ + if(namecmp(arena->name, head.name)!=0) + logerr(ECorrupt, "arena tail name %s head %s", + arena->name, head.name); + else if(arena->clumpmagic != head.clumpmagic) + logerr(ECorrupt, "arena %d tail clumpmagic 0x%lux head 0x%lux", + debugarena, (ulong)arena->clumpmagic, + (ulong)head.clumpmagic); + else if(arena->version != head.version) + logerr(ECorrupt, "arena tail version %d head version %d", + arena->version, head.version); + else if(arena->blocksize != head.blocksize) + logerr(ECorrupt, "arena tail block size %d head %d", + arena->blocksize, head.blocksize); + else if(arena->size+2*arena->blocksize != head.size) + 
logerr(ECorrupt, "arena tail size %lud head %lud", + (ulong)arena->size+2*arena->blocksize, head.size); + else + logerr(ECorrupt, "arena header inconsistent with arena data"); + } + freezblock(b); + + return 0; +} + +static int +okarena(Arena *arena) +{ + u64int dsize; + int ok; + + ok = 0; + dsize = arenadirsize(arena, arena->diskstats.clumps); + if(arena->diskstats.used + dsize > arena->size){ + seterr(ECorrupt, "arena %s used > size", arena->name); + ok = -1; + } + + if(arena->diskstats.cclumps > arena->diskstats.clumps) + logerr(ECorrupt, "arena %s has more compressed clumps than total clumps", arena->name); + + /* + * This need not be true if some of the disk is corrupted. + * + if(arena->diskstats.uncsize + arena->diskstats.clumps * ClumpSize + arena->blocksize < arena->diskstats.used) + logerr(ECorrupt, "arena %s uncompressed size inconsistent with used space %lld %d %lld", arena->name, arena->diskstats.uncsize, arena->diskstats.clumps, arena->diskstats.used); + */ + + /* + * this happens; it's harmless. 
+ * + if(arena->ctime > arena->wtime) + logerr(ECorrupt, "arena %s creation time after last write time", arena->name); + */ + return ok; +} + +static CIBlock* +getcib(Arena *arena, int clump, int writing, CIBlock *rock) +{ + int mode; + CIBlock *cib; + u32int block, off; + + if(clump >= arena->memstats.clumps){ + seterr(EOk, "clump directory access out of range"); + return nil; + } + block = clump / arena->clumpmax; + off = (clump - block * arena->clumpmax) * ClumpInfoSize; + cib = rock; + cib->block = block; + cib->offset = off; + + if(writing){ + if(off == 0 && clump == arena->memstats.clumps-1) + mode = OWRITE; + else + mode = ORDWR; + }else + mode = OREAD; + + cib->data = getdblock(arena->part, + arena->base + arena->size - (block + 1) * arena->blocksize, mode); + if(cib->data == nil) + return nil; + return cib; +} + +static void +putcib(Arena *arena, CIBlock *cib) +{ + USED(arena); + + putdblock(cib->data); + cib->data = nil; +} + + +/* + * For index entry readahead purposes, the arenas are + * broken into smaller subpieces, called clump info groups + * or cigs. Each cig has ArenaCIGSize clumps (ArenaCIGSize + * is chosen to make the index entries take up about half + * a megabyte). The index entries do not contain enough + * information to determine what the clump index is for + * a given address in an arena. That info is needed both for + * figuring out which clump group an address belongs to + * and for prefetching a clump group's index entries from + * the arena table of contents. The first time clump groups + * are accessed, we scan the entire arena table of contents + * (which might be 10s of megabytes), recording the data + * offset of each clump group. + */ + +/* + * load clump info group information by scanning entire toc. 
+ */ +static void +loadcig(Arena *arena) +{ + u32int i, j, ncig, nci; + ArenaCIG *cig; + ClumpInfo *ci; + u64int offset; + int ms; + + if(arena->cig || arena->ncig < 0) + return; + +// fprint(2, "loadcig %s\n", arena->name); + + ncig = (arena->memstats.clumps+ArenaCIGSize-1) / ArenaCIGSize; + if(ncig == 0){ + arena->cig = vtmalloc(1); + arena->ncig = 0; + return; + } + + ms = msec(); + cig = vtmalloc(ncig*sizeof cig[0]); + ci = vtmalloc(ArenaCIGSize*sizeof ci[0]); + offset = 0; + for(i=0; i<ncig; i++){ + nci = readclumpinfos(arena, i*ArenaCIGSize, ci, ArenaCIGSize); + cig[i].offset = offset; + for(j=0; j<nci; j++) + offset += ClumpSize + ci[j].size; + if(nci < ArenaCIGSize){ + if(i != ncig-1){ + vtfree(ci); + vtfree(cig); + arena->ncig = -1; + fprint(2, "loadcig %s: got %ud cigs, expected %ud\n", arena->name, i+1, ncig); + goto out; + } + } + } + vtfree(ci); + + arena->ncig = ncig; + arena->cig = cig; + +out: + ms = msec() - ms; + addstat2(StatCigLoad, 1, StatCigLoadTime, ms); +} + +/* + * convert arena address into arena group + data boundaries. + */ +int +arenatog(Arena *arena, u64int addr, u64int *gstart, u64int *glimit, int *g) +{ + int r, l, m; + + qlock(&arena->lock); + if(arena->cig == nil) + loadcig(arena); + if(arena->cig == nil || arena->ncig == 0){ + qunlock(&arena->lock); + return -1; + } + + l = 1; + r = arena->ncig - 1; + while(l <= r){ + m = (r + l) / 2; + if(arena->cig[m].offset <= addr) + l = m + 1; + else + r = m - 1; + } + l--; + + *g = l; + *gstart = arena->cig[l].offset; + if(l+1 < arena->ncig) + *glimit = arena->cig[l+1].offset; + else + *glimit = arena->memstats.used; + qunlock(&arena->lock); + return 0; +} + +/* + * load the clump info for group g into the index entries. 
+ */ +int +asumload(Arena *arena, int g, IEntry *entries, int nentries) +{ + int i, base, limit; + u64int addr; + ClumpInfo ci; + IEntry *ie; + + if(nentries < ArenaCIGSize){ + fprint(2, "asking for too few entries\n"); + return -1; + } + + qlock(&arena->lock); + if(arena->cig == nil) + loadcig(arena); + if(arena->cig == nil || arena->ncig == 0 || g >= arena->ncig){ + qunlock(&arena->lock); + return -1; + } + + addr = 0; + base = g*ArenaCIGSize; + limit = base + ArenaCIGSize; + if(base > arena->memstats.clumps) + base = arena->memstats.clumps; + ie = entries; + for(i=base; i<limit; i++){ + if(readclumpinfo(arena, i, &ci) < 0) + break; + if(ci.type != VtCorruptType){ + scorecp(ie->score, ci.score); + ie->ia.type = ci.type; + ie->ia.size = ci.uncsize; + ie->ia.blocks = (ci.size + ClumpSize + (1<<ABlockLog) - 1) >> ABlockLog; + ie->ia.addr = addr; + ie++; + } + addr += ClumpSize + ci.size; + } + qunlock(&arena->lock); + return ie - entries; +} diff --git a/sys/src/cmd/venti/srv/arenas.c b/sys/src/cmd/venti/srv/arenas.c new file mode 100755 index 000000000..0316c4c86 --- /dev/null +++ b/sys/src/cmd/venti/srv/arenas.c @@ -0,0 +1,420 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +typedef struct AHash AHash; + +/* + * hash table for finding arena's based on their names. 
+ */ +struct AHash +{ + AHash *next; + Arena *arena; +}; + +enum +{ + AHashSize = 512 +}; + +static AHash *ahash[AHashSize]; + +static u32int +hashstr(char *s) +{ + u32int h; + int c; + + h = 0; + for(; c = *s; s++){ + c ^= c << 6; + h += (c << 11) ^ (c >> 1); + c = *s; + h ^= (c << 14) + (c << 7) + (c << 4) + c; + } + return h; +} + +int +addarena(Arena *arena) +{ + AHash *a; + u32int h; + + h = hashstr(arena->name) & (AHashSize - 1); + a = MK(AHash); + if(a == nil) + return -1; + a->arena = arena; + a->next = ahash[h]; + ahash[h] = a; + return 0; +} + +Arena* +findarena(char *name) +{ + AHash *a; + u32int h; + + h = hashstr(name) & (AHashSize - 1); + for(a = ahash[h]; a != nil; a = a->next) + if(strcmp(a->arena->name, name) == 0) + return a->arena; + return nil; +} + +int +delarena(Arena *arena) +{ + AHash *a, *last; + u32int h; + + h = hashstr(arena->name) & (AHashSize - 1); + last = nil; + for(a = ahash[h]; a != nil; a = a->next){ + if(a->arena == arena){ + if(last != nil) + last->next = a->next; + else + ahash[h] = a->next; + free(a); + return 0; + } + last = a; + } + return -1; +} + +ArenaPart* +initarenapart(Part *part) +{ + AMapN amn; + ArenaPart *ap; + ZBlock *b; + u32int i; + int ok; + + b = alloczblock(HeadSize, 0, 0); + if(b == nil || readpart(part, PartBlank, b->data, HeadSize) < 0){ + seterr(EAdmin, "can't read arena partition header: %r"); + return nil; + } + + ap = MKZ(ArenaPart); + if(ap == nil){ + freezblock(b); + return nil; + } + ap->part = part; + ok = unpackarenapart(ap, b->data); + freezblock(b); + if(ok < 0){ + freearenapart(ap, 0); + return nil; + } + + ap->tabbase = (PartBlank + HeadSize + ap->blocksize - 1) & ~(ap->blocksize - 1); + if(ap->version != ArenaPartVersion){ + seterr(ECorrupt, "unknown arena partition version %d", ap->version); + freearenapart(ap, 0); + return nil; + } + if(ap->blocksize & (ap->blocksize - 1)){ + seterr(ECorrupt, "illegal non-power-of-2 block size %d\n", ap->blocksize); + freearenapart(ap, 0); + return nil; + } 
+ if(ap->tabbase >= ap->arenabase){ + seterr(ECorrupt, "arena partition table overlaps with arena storage"); + freearenapart(ap, 0); + return nil; + } + ap->tabsize = ap->arenabase - ap->tabbase; + partblocksize(part, ap->blocksize); + ap->size = ap->part->size & ~(u64int)(ap->blocksize - 1); + + if(readarenamap(&amn, part, ap->tabbase, ap->tabsize) < 0){ + freearenapart(ap, 0); + return nil; + } + ap->narenas = amn.n; + ap->map = amn.map; + if(okamap(ap->map, ap->narenas, ap->arenabase, ap->size, "arena table") < 0){ + freearenapart(ap, 0); + return nil; + } + + ap->arenas = MKNZ(Arena*, ap->narenas); + for(i = 0; i < ap->narenas; i++){ + debugarena = i; + ap->arenas[i] = initarena(part, ap->map[i].start, ap->map[i].stop - ap->map[i].start, ap->blocksize); + if(ap->arenas[i] == nil){ + seterr(ECorrupt, "%s: %r", ap->map[i].name); + freearenapart(ap, 1); + return nil; + } + if(namecmp(ap->map[i].name, ap->arenas[i]->name) != 0){ + seterr(ECorrupt, "arena name mismatches with expected name: %s vs. 
%s", + ap->map[i].name, ap->arenas[i]->name); + freearenapart(ap, 1); + return nil; + } + if(findarena(ap->arenas[i]->name)){ + seterr(ECorrupt, "duplicate arena name %s in %s", + ap->map[i].name, ap->part->name); + freearenapart(ap, 1); + return nil; + } + } + + for(i = 0; i < ap->narenas; i++) { + debugarena = i; + addarena(ap->arenas[i]); + } + debugarena = -1; + + return ap; +} + +ArenaPart* +newarenapart(Part *part, u32int blocksize, u32int tabsize) +{ + ArenaPart *ap; + + if(blocksize & (blocksize - 1)){ + seterr(ECorrupt, "illegal non-power-of-2 block size %d\n", blocksize); + return nil; + } + ap = MKZ(ArenaPart); + if(ap == nil) + return nil; + + ap->version = ArenaPartVersion; + ap->part = part; + ap->blocksize = blocksize; + partblocksize(part, blocksize); + ap->size = part->size & ~(u64int)(blocksize - 1); + ap->tabbase = (PartBlank + HeadSize + blocksize - 1) & ~(blocksize - 1); + ap->arenabase = (ap->tabbase + tabsize + blocksize - 1) & ~(blocksize - 1); + ap->tabsize = ap->arenabase - ap->tabbase; + ap->narenas = 0; + + if(wbarenapart(ap) < 0){ + freearenapart(ap, 0); + return nil; + } + + return ap; +} + +int +wbarenapart(ArenaPart *ap) +{ + ZBlock *b; + + if(okamap(ap->map, ap->narenas, ap->arenabase, ap->size, "arena table") < 0) + return -1; + b = alloczblock(HeadSize, 1, 0); + if(b == nil) +/* ZZZ set error message? 
*/ + return -1; + + if(packarenapart(ap, b->data) < 0){ + seterr(ECorrupt, "can't make arena partition header: %r"); + freezblock(b); + return -1; + } + if(writepart(ap->part, PartBlank, b->data, HeadSize) < 0 || + flushpart(ap->part) < 0){ + seterr(EAdmin, "can't write arena partition header: %r"); + freezblock(b); + return -1; + } + freezblock(b); + + return wbarenamap(ap->map, ap->narenas, ap->part, ap->tabbase, ap->tabsize); +} + +void +freearenapart(ArenaPart *ap, int freearenas) +{ + int i; + + if(ap == nil) + return; + if(freearenas){ + for(i = 0; i < ap->narenas; i++){ + if(ap->arenas[i] == nil) + continue; + delarena(ap->arenas[i]); + freearena(ap->arenas[i]); + } + } + free(ap->map); + free(ap->arenas); + free(ap); +} + +int +okamap(AMap *am, int n, u64int start, u64int stop, char *what) +{ + u64int last; + u32int i; + + last = start; + for(i = 0; i < n; i++){ + if(am[i].start < last){ + if(i == 0) + seterr(ECorrupt, "invalid start address in %s", what); + else + seterr(ECorrupt, "overlapping ranges in %s", what); + return -1; + } + if(am[i].stop < am[i].start){ + seterr(ECorrupt, "invalid range in %s", what); + return -1; + } + last = am[i].stop; + } + if(last > stop){ + seterr(ECorrupt, "invalid ending address in %s", what); + return -1; + } + return 0; +} + +int +maparenas(AMap *am, Arena **arenas, int n, char *what) +{ + u32int i; + + for(i = 0; i < n; i++){ + arenas[i] = findarena(am[i].name); + if(arenas[i] == nil){ + seterr(EAdmin, "can't find arena '%s' for '%s'\n", am[i].name, what); + return -1; + } + } + return 0; +} + +int +readarenamap(AMapN *amn, Part *part, u64int base, u32int size) +{ + IFile f; + u32int ok; + + if(partifile(&f, part, base, size) < 0) + return -1; + ok = parseamap(&f, amn); + freeifile(&f); + return ok; +} + +int +wbarenamap(AMap *am, int n, Part *part, u64int base, u64int size) +{ + Fmt f; + ZBlock *b; + + b = alloczblock(size, 1, part->blocksize); + if(b == nil) + return -1; + + fmtzbinit(&f, b); + + if(outputamap(&f, 
am, n) < 0){ + seterr(ECorrupt, "arena set size too small"); + freezblock(b); + return -1; + } + if(writepart(part, base, b->data, size) < 0 || flushpart(part) < 0){ + seterr(EAdmin, "can't write arena set: %r"); + freezblock(b); + return -1; + } + freezblock(b); + return 0; +} + +/* + * amap: n '\n' amapelem * n + * n: u32int + * amapelem: name '\t' astart '\t' astop '\n' + * astart, astop: u64int + */ +int +parseamap(IFile *f, AMapN *amn) +{ + AMap *am; + u64int v64; + u32int v; + char *s, *t, *flds[4]; + int i, n; + + /* + * arenas + */ + if(ifileu32int(f, &v) < 0){ + seterr(ECorrupt, "syntax error: bad number of elements in %s", f->name); + return -1; + } + n = v; + if(n > MaxAMap){ + seterr(ECorrupt, "illegal number of elements %d in %s", + n, f->name); + return -1; + } + am = MKNZ(AMap, n); + if(am == nil){ + fprint(2, "out of memory\n"); + return -1; + } + for(i = 0; i < n; i++){ + s = ifileline(f); + if(s) + t = estrdup(s); + else + t = nil; + if(s == nil || getfields(s, flds, 4, 0, "\t") != 3){ + fprint(2, "early eof after %d of %d, %s:#%d: %s\n", i, n, f->name, f->pos, t); + free(t); + return -1; + } + free(t); + if(nameok(flds[0]) < 0) + return -1; + namecp(am[i].name, flds[0]); + if(stru64int(flds[1], &v64) < 0){ + seterr(ECorrupt, "syntax error: bad arena base address in %s", f->name); + free(am); + return -1; + } + am[i].start = v64; + if(stru64int(flds[2], &v64) < 0){ + seterr(ECorrupt, "syntax error: bad arena size in %s", f->name); + free(am); + return -1; + } + am[i].stop = v64; + } + + amn->map = am; + amn->n = n; + return 0; +} + +int +outputamap(Fmt *f, AMap *am, int n) +{ + int i; + + if(fmtprint(f, "%ud\n", n) < 0) + return -1; + for(i = 0; i < n; i++) + if(fmtprint(f, "%s\t%llud\t%llud\n", am[i].name, am[i].start, am[i].stop) < 0) + return -1; + return 0; +} diff --git a/sys/src/cmd/venti/srv/bloom.c b/sys/src/cmd/venti/srv/bloom.c new file mode 100755 index 000000000..1db36bd92 --- /dev/null +++ b/sys/src/cmd/venti/srv/bloom.c @@ -0,0 
+1,256 @@ +/* + * Bloom filter tracking which scores are present in our arenas + * and (more importantly) which are not. + */ + +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +int ignorebloom; + +int +bloominit(Bloom *b, vlong vsize, u8int *data) +{ + ulong size; + + size = vsize; + if(size != vsize){ /* truncation */ + werrstr("bloom data too big"); + return -1; + } + + b->size = size; + b->nhash = 32; /* will be fixed by caller on initialization */ + if(data != nil) + if(unpackbloomhead(b, data) < 0) + return -1; + + b->bitmask = (b->size<<3) - 1; + b->data = data; + return 0; +} + +void +wbbloomhead(Bloom *b) +{ + packbloomhead(b, b->data); +} + +Bloom* +readbloom(Part *p) +{ + uchar buf[512]; + Bloom *b; + + b = vtmallocz(sizeof *b); + if(readpart(p, 0, buf, sizeof buf) < 0) + return nil; + /* + * pass buf as b->data so that bloominit + * can parse header. won't be used for + * accessing bits (cleared below). + */ + if(bloominit(b, 0, buf) < 0){ + vtfree(b); + return nil; + }else{ + /* + * default block size is system page size. + * the bloom filter is usually very big. + * bump the block size up to speed i/o. 
+ */ + if(p->blocksize < (1<<20)){ + p->blocksize = 1<<20; + if(p->blocksize > p->size) + p->blocksize = p->size; + } + } + b->part = p; + b->data = nil; + return b; +} + +int +resetbloom(Bloom *b) +{ + uchar *data; + + data = vtmallocz(b->size); + b->data = data; + if(b->size == MaxBloomSize) /* 2^32 overflows ulong */ + addstat(StatBloomBits, b->size*8-1); + else + addstat(StatBloomBits, b->size*8); + return 0; +} + +int +loadbloom(Bloom *b) +{ + int i, n; + uint ones; + uchar *data; + u32int *a; + + data = vtmallocz(b->size); + if(readpart(b->part, 0, data, b->size) < 0){ + vtfree(b); + vtfree(data); + return -1; + } + b->data = data; + + a = (u32int*)b->data; + n = b->size/4; + ones = 0; + for(i=0; i<n; i++) + ones += countbits(a[i]); + addstat(StatBloomOnes, ones); + + if(b->size == MaxBloomSize) /* 2^32 overflows ulong */ + addstat(StatBloomBits, b->size*8-1); + else + addstat(StatBloomBits, b->size*8); + + return 0; +} + +int +writebloom(Bloom *b) +{ + wbbloomhead(b); + if(writepart(b->part, 0, b->data, b->size) < 0) + return -1; + if(flushpart(b->part) < 0) + return -1; + return 0; +} + +/* + * Derive two random 32-bit quantities a, b from the score + * and then use a+b*i as a sequence of bloom filter indices. + * Michael Mitzenmacher has a recent (2005) paper saying this is okay. + * We reserve the bottom bytes (BloomHeadSize*8 bits) for the header. + */ +static void +gethashes(u8int *score, ulong *h) +{ + int i; + u32int a, b; + + a = 0; + b = 0; + for(i=4; i+8<=VtScoreSize; i+=8){ + a ^= *(u32int*)(score+i); + b ^= *(u32int*)(score+i+4); + } + if(i+4 <= VtScoreSize) /* 20 is not 4-aligned */ + a ^= *(u32int*)(score+i); + for(i=0; i<BloomMaxHash; i++, a+=b) + h[i] = a < BloomHeadSize*8 ? 
BloomHeadSize*8 : a; +} + +static void +_markbloomfilter(Bloom *b, u8int *score) +{ + int i, nnew; + ulong h[BloomMaxHash]; + u32int x, *y, z, *tab; + + trace("markbloomfilter", "markbloomfilter %V", score); + gethashes(score, h); + nnew = 0; + tab = (u32int*)b->data; + for(i=0; i<b->nhash; i++){ + x = h[i]; + y = &tab[(x&b->bitmask)>>5]; + z = 1<<(x&31); + if(!(*y&z)){ + nnew++; + *y |= z; + } + } + if(nnew) + addstat(StatBloomOnes, nnew); + + trace("markbloomfilter", "markbloomfilter exit"); +} + +static int +_inbloomfilter(Bloom *b, u8int *score) +{ + int i; + ulong h[BloomMaxHash], x; + u32int *tab; + + gethashes(score, h); + tab = (u32int*)b->data; + for(i=0; i<b->nhash; i++){ + x = h[i]; + if(!(tab[(x&b->bitmask)>>5] & (1<<(x&31)))) + return 0; + } + return 1; +} + +int +inbloomfilter(Bloom *b, u8int *score) +{ + int r; + + if(b == nil || b->data == nil) + return 1; + + if(ignorebloom) + return 1; + + rlock(&b->lk); + r = _inbloomfilter(b, score); + runlock(&b->lk); + addstat(StatBloomLookup, 1); + if(r) + addstat(StatBloomMiss, 1); + else + addstat(StatBloomHit, 1); + return r; +} + +void +markbloomfilter(Bloom *b, u8int *score) +{ + if(b == nil || b->data == nil) + return; + + rlock(&b->lk); + qlock(&b->mod); + _markbloomfilter(b, score); + qunlock(&b->mod); + runlock(&b->lk); +} + +static void +bloomwriteproc(void *v) +{ + int ret; + Bloom *b; + + threadsetname("bloomwriteproc"); + b = v; + for(;;){ + recv(b->writechan, 0); + if((ret=writebloom(b)) < 0) + fprint(2, "oops! 
writing bloom: %r\n"); + else + ret = 0; + sendul(b->writedonechan, ret); + } +} + +void +startbloomproc(Bloom *b) +{ + b->writechan = chancreate(sizeof(void*), 0); + b->writedonechan = chancreate(sizeof(void*), 0); + vtproc(bloomwriteproc, b); +} diff --git a/sys/src/cmd/venti/srv/buildbuck.c b/sys/src/cmd/venti/srv/buildbuck.c new file mode 100755 index 000000000..73f8056be --- /dev/null +++ b/sys/src/cmd/venti/srv/buildbuck.c @@ -0,0 +1,132 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +/* + * An IEStream is a sorted list of index entries. + */ +struct IEStream +{ + Part *part; + u64int off; /* read position within part */ + u64int n; /* number of valid ientries left to read */ + u32int size; /* allocated space in buffer */ + u8int *buf; + u8int *pos; /* current place in buffer */ + u8int *epos; /* end of valid buffer contents */ +}; + +IEStream* +initiestream(Part *part, u64int off, u64int clumps, u32int size) +{ + IEStream *ies; + +/* out of memory? */ + ies = MKZ(IEStream); + ies->buf = MKN(u8int, size); + ies->epos = ies->buf; + ies->pos = ies->epos; + ies->off = off; + ies->n = clumps; + ies->size = size; + ies->part = part; + return ies; +} + +void +freeiestream(IEStream *ies) +{ + if(ies == nil) + return; + free(ies->buf); + free(ies); +} + +/* + * Return the next IEntry (still packed) in the stream. + */ +static u8int* +peekientry(IEStream *ies) +{ + u32int n, nn; + + n = ies->epos - ies->pos; + if(n < IEntrySize){ + memmove(ies->buf, ies->pos, n); + ies->epos = &ies->buf[n]; + ies->pos = ies->buf; + nn = ies->size; + if(nn > ies->n * IEntrySize) + nn = ies->n * IEntrySize; + nn -= n; + if(nn == 0) + return nil; +//fprint(2, "peek %d from %llud into %p\n", nn, ies->off, ies->epos); + if(readpart(ies->part, ies->off, ies->epos, nn) < 0){ + seterr(EOk, "can't read sorted index entries: %r"); + return nil; + } + ies->epos += nn; + ies->off += nn; + } + return ies->pos; +} + +/* + * Compute the bucket number for the given IEntry. 
+ * Knows that the score is the first thing in the packed + * representation. + */ +static u32int +iebuck(Index *ix, u8int *b, IBucket *ib, IEStream *ies) +{ + USED(ies); + USED(ib); + return hashbits(b, 32) / ix->div; +} + +/* + * Fill ib with the next bucket in the stream. + */ +u32int +buildbucket(Index *ix, IEStream *ies, IBucket *ib, uint maxdata) +{ + IEntry ie1, ie2; + u8int *b; + u32int buck; + + buck = TWID32; + ib->n = 0; + while(ies->n){ + b = peekientry(ies); + if(b == nil) + return TWID32; +/* fprint(2, "b=%p ies->n=%lld ib.n=%d buck=%d score=%V\n", b, ies->n, ib->n, iebuck(ix, b, ib, ies), b); */ + if(ib->n == 0) + buck = iebuck(ix, b, ib, ies); + else{ + if(buck != iebuck(ix, b, ib, ies)) + break; + if(ientrycmp(&ib->data[(ib->n - 1)* IEntrySize], b) == 0){ + /* + * guess that the larger address is the correct one to use + */ + unpackientry(&ie1, &ib->data[(ib->n - 1)* IEntrySize]); + unpackientry(&ie2, b); + seterr(EOk, "duplicate index entry for score=%V type=%d", ie1.score, ie1.ia.type); + ib->n--; + if(ie1.ia.addr > ie2.ia.addr) + memmove(b, &ib->data[ib->n * IEntrySize], IEntrySize); + } + } + if((ib->n+1)*IEntrySize > maxdata){ + seterr(EOk, "bucket overflow"); + return TWID32; + } + memmove(&ib->data[ib->n * IEntrySize], b, IEntrySize); + ib->n++; + ies->n--; + ies->pos += IEntrySize; + } + return buck; +} diff --git a/sys/src/cmd/venti/srv/buildindex.c b/sys/src/cmd/venti/srv/buildindex.c new file mode 100755 index 000000000..2f24055c9 --- /dev/null +++ b/sys/src/cmd/venti/srv/buildindex.c @@ -0,0 +1,966 @@ +/* + * Rebuild the index from scratch, in place. 
+ */ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +enum +{ + MinBufSize = 64*1024, + MaxBufSize = 4*1024*1024, +}; + +int dumb; +int errors; +char **isect; +int nisect; +int bloom; +int zero; + +u32int isectmem; +u64int totalbuckets; +u64int totalclumps; +Channel *arenadonechan; +Channel *isectdonechan; +Index *ix; + +u64int arenaentries; +u64int skipentries; +u64int indexentries; + +static int shouldprocess(ISect*); +static void isectproc(void*); +static void arenapartproc(void*); + +void +usage(void) +{ + fprint(2, "usage: buildindex [-b] [-i isect]... [-M imem] venti.conf\n"); + threadexitsall("usage"); +} + +void +threadmain(int argc, char *argv[]) +{ + int fd, i, napart, nfinish, maxdisks; + u32int bcmem, imem; + Config conf; + Part *p; + + maxdisks = 100000; + ventifmtinstall(); + imem = 256*1024*1024; + ARGBEGIN{ + case 'b': + bloom = 1; + break; + case 'd': /* debugging - make sure to run all 3 passes */ + dumb = 1; + break; + case 'i': + isect = vtrealloc(isect, (nisect+1)*sizeof(isect[0])); + isect[nisect++] = EARGF(usage()); + break; + case 'M': + imem = unittoull(EARGF(usage())); + break; + case 'm': /* temporary - might go away */ + maxdisks = atoi(EARGF(usage())); + break; + default: + usage(); + break; + }ARGEND + + if(argc != 1) + usage(); + + if(initventi(argv[0], &conf) < 0) + sysfatal("can't init venti: %r"); + ix = mainindex; + if(nisect == 0 && ix->bloom) + bloom = 1; + if(bloom && ix->bloom && resetbloom(ix->bloom) < 0) + sysfatal("loadbloom: %r"); + if(bloom && !ix->bloom) + sysfatal("-b specified but no bloom filter"); + if(!bloom) + ix->bloom = nil; + isectmem = imem/ix->nsects; + + /* + * safety first - only need read access to arenas + */ + p = nil; + for(i=0; i<ix->narenas; i++){ + if(ix->arenas[i]->part != p){ + p = ix->arenas[i]->part; + if((fd = open(p->filename, OREAD)) < 0) + sysfatal("cannot reopen %s: %r", p->filename); + dup(fd, p->fd); + close(fd); + } + } + + /* + * need a block for every arena + */ + bcmem = 
maxblocksize * (mainindex->narenas + 16); + if(0) fprint(2, "initialize %d bytes of disk block cache\n", bcmem); + initdcache(bcmem); + + totalclumps = 0; + for(i=0; i<ix->narenas; i++) + totalclumps += ix->arenas[i]->diskstats.clumps; + + totalbuckets = 0; + for(i=0; i<ix->nsects; i++) + totalbuckets += ix->sects[i]->blocks; + fprint(2, "%,lld clumps, %,lld buckets\n", totalclumps, totalbuckets); + + /* start index procs */ + fprint(2, "%T read index\n"); + isectdonechan = chancreate(sizeof(void*), 0); + for(i=0; i<ix->nsects; i++){ + if(shouldprocess(ix->sects[i])){ + ix->sects[i]->writechan = chancreate(sizeof(IEntry), 0); + vtproc(isectproc, ix->sects[i]); + } + } + + for(i=0; i<nisect; i++) + if(isect[i]) + fprint(2, "warning: did not find index section %s\n", isect[i]); + + /* start arena procs */ + p = nil; + napart = 0; + nfinish = 0; + arenadonechan = chancreate(sizeof(void*), 0); + for(i=0; i<ix->narenas; i++){ + if(ix->arenas[i]->part != p){ + p = ix->arenas[i]->part; + vtproc(arenapartproc, p); + if(++napart >= maxdisks){ + recvp(arenadonechan); + nfinish++; + } + } + } + + /* wait for arena procs to finish */ + for(nfinish=0; nfinish<napart; nfinish++) + recvp(arenadonechan); + + /* tell index procs to finish */ + for(i=0; i<ix->nsects; i++) + if(ix->sects[i]->writechan) + send(ix->sects[i]->writechan, nil); + + /* wait for index procs to finish */ + for(i=0; i<ix->nsects; i++) + if(ix->sects[i]->writechan) + recvp(isectdonechan); + + if(ix->bloom && writebloom(ix->bloom) < 0) + fprint(2, "writing bloom filter: %r\n"); + + fprint(2, "%T done arenaentries=%,lld indexed=%,lld (nskip=%,lld)\n", + arenaentries, indexentries, skipentries); + threadexitsall(nil); +} + +static int +shouldprocess(ISect *is) +{ + int i; + + if(nisect == 0) + return 1; + + for(i=0; i<nisect; i++) + if(isect[i] && strcmp(isect[i], is->name) == 0){ + isect[i] = nil; + return 1; + } + return 0; +} + +static void +add(u64int *a, u64int n) +{ + static Lock l; + + lock(&l); + *a += n; 
+ unlock(&l); +} + +/* + * Read through an arena partition and send each of its IEntries + * to the appropriate index section. When finished, send on + * arenadonechan. + */ +enum +{ + ClumpChunks = 32*1024, +}; +static void +arenapartproc(void *v) +{ + int i, j, n, nskip, x; + u32int clump; + u64int addr, tot; + Arena *a; + ClumpInfo *ci, *cis; + IEntry ie; + Part *p; + + p = v; + threadsetname("arenaproc %s", p->name); + + nskip = 0; + tot = 0; + cis = MKN(ClumpInfo, ClumpChunks); + for(i=0; i<ix->narenas; i++){ + a = ix->arenas[i]; + if(a->part != p) + continue; + if(a->memstats.clumps) + fprint(2, "%T arena %s: %d entries\n", + a->name, a->memstats.clumps); + /* + * Running the loop backwards accesses the + * clump info blocks forwards, since they are + * stored in reverse order at the end of the arena. + * This speeds things slightly. + */ + addr = ix->amap[i].start + a->memstats.used; + for(clump=a->memstats.clumps; clump > 0; clump-=n){ + n = ClumpChunks; + if(n > clump) + n = clump; + if(readclumpinfos(a, clump-n, cis, n) != n){ + fprint(2, "%T arena %s: directory read: %r\n", a->name); + errors = 1; + break; + } + for(j=n-1; j>=0; j--){ + ci = &cis[j]; + ie.ia.type = ci->type; + ie.ia.size = ci->uncsize; + addr -= ci->size + ClumpSize; + ie.ia.addr = addr; + ie.ia.blocks = (ci->size + ClumpSize + (1<<ABlockLog)-1) >> ABlockLog; + scorecp(ie.score, ci->score); + if(ci->type == VtCorruptType) + nskip++; + else{ + tot++; + x = indexsect(ix, ie.score); + assert(0 <= x && x < ix->nsects); + if(ix->sects[x]->writechan) + send(ix->sects[x]->writechan, &ie); + if(ix->bloom) + markbloomfilter(ix->bloom, ie.score); + } + } + } + if(addr != ix->amap[i].start) + fprint(2, "%T arena %s: clump miscalculation %lld != %lld\n", a->name, addr, ix->amap[i].start); + } + add(&arenaentries, tot); + add(&skipentries, nskip); + sendp(arenadonechan, p); +} + +/* + * Convert score into relative bucket number in isect. + * Can pass a packed ientry instead of score - score is first. 
+ */ +static u32int +score2bucket(ISect *is, uchar *score) +{ + u32int b; + + b = hashbits(score, 32)/ix->div; + if(b < is->start || b >= is->stop){ + fprint(2, "score2bucket: score=%V div=%d b=%ud start=%ud stop=%ud\n", + score, ix->div, b, is->start, is->stop); + } + assert(is->start <= b && b < is->stop); + return b - is->start; +} + +/* + * Convert offset in index section to bucket number. + */ +static u32int +offset2bucket(ISect *is, u64int offset) +{ + u32int b; + + assert(is->blockbase <= offset); + offset -= is->blockbase; + b = offset/is->blocksize; + assert(b < is->stop-is->start); + return b; +} + +/* + * Convert bucket number to offset. + */ +static u64int +bucket2offset(ISect *is, u32int b) +{ + assert(b <= is->stop-is->start); + return is->blockbase + (u64int)b*is->blocksize; +} + +/* + * IEntry buffers to hold initial round of spraying. + */ +typedef struct Buf Buf; +struct Buf +{ + Part *part; /* partition being written */ + uchar *bp; /* current block */ + uchar *ep; /* end of block */ + uchar *wp; /* write position in block */ + u64int boffset; /* start offset */ + u64int woffset; /* next write offset */ + u64int eoffset; /* end offset */ + u32int nentry; /* number of entries written */ +}; + +static void +bflush(Buf *buf) +{ + u32int bufsize; + + if(buf->woffset >= buf->eoffset) + sysfatal("buf index chunk overflow - need bigger index"); + bufsize = buf->ep - buf->bp; + if(writepart(buf->part, buf->woffset, buf->bp, bufsize) < 0){ + fprint(2, "write %s: %r\n", buf->part->name); + errors = 1; + } + buf->woffset += bufsize; + memset(buf->bp, 0, bufsize); + buf->wp = buf->bp; +} + +static void +bwrite(Buf *buf, IEntry *ie) +{ + if(buf->wp+IEntrySize > buf->ep) + bflush(buf); + assert(buf->bp <= buf->wp && buf->wp < buf->ep); + packientry(ie, buf->wp); + buf->wp += IEntrySize; + assert(buf->bp <= buf->wp && buf->wp <= buf->ep); + buf->nentry++; +} + +/* + * Minibuffer. In-memory data structure holds our place + * in the buffer but has no block data. 
We are writing and + * reading the minibuffers at the same time. (Careful!) + */ +typedef struct Minibuf Minibuf; +struct Minibuf +{ + u64int boffset; /* start offset */ + u64int roffset; /* read offset */ + u64int woffset; /* write offset */ + u64int eoffset; /* end offset */ + u32int nentry; /* # entries left to read */ + u32int nwentry; /* # entries written */ +}; + +/* + * Index entry pool. Used when trying to shuffle around + * the entries in a big buffer into the corresponding M minibuffers. + * Sized to hold M*EntriesPerBlock entries, so that there will always + * either be room in the pool for another block worth of entries + * or there will be an entire block worth of sorted entries to + * write out. + */ +typedef struct IEntryLink IEntryLink; +typedef struct IPool IPool; + +struct IEntryLink +{ + uchar ie[IEntrySize]; /* raw IEntry */ + IEntryLink *next; /* next in chain */ +}; + +struct IPool +{ + ISect *isect; + u32int buck0; /* first bucket in pool */ + u32int mbufbuckets; /* buckets per minibuf */ + IEntryLink *entry; /* all IEntryLinks */ + u32int nentry; /* # of IEntryLinks */ + IEntryLink *free; /* free list */ + u32int nfree; /* # on free list */ + Minibuf *mbuf; /* all minibufs */ + u32int nmbuf; /* # of minibufs */ + IEntryLink **mlist; /* lists for each minibuf */ + u32int *mcount; /* # on each mlist[i] */ + u32int bufsize; /* block buffer size */ + uchar *rbuf; /* read buffer */ + uchar *wbuf; /* write buffer */ + u32int epbuf; /* entries per block buffer */ +}; + +/* +static int +countsokay(IPool *p) +{ + int i; + u64int n; + + n = 0; + for(i=0; i<p->nmbuf; i++) + n += p->mcount[i]; + n += p->nfree; + if(n != p->nentry){ + print("free %ud:", p->nfree); + for(i=0; i<p->nmbuf; i++) + print(" %ud", p->mcount[i]); + print(" = %lld nentry: %ud\n", n, p->nentry); + } + return n == p->nentry; +} +*/ + +static IPool* +mkipool(ISect *isect, Minibuf *mbuf, u32int nmbuf, + u32int mbufbuckets, u32int bufsize) +{ + u32int i, nentry; + uchar *data; + IPool 
*p; + IEntryLink *l; + + nentry = (nmbuf+1)*bufsize / IEntrySize; + p = ezmalloc(sizeof(IPool) + +nentry*sizeof(IEntry) + +nmbuf*sizeof(IEntryLink*) + +nmbuf*sizeof(u32int) + +3*bufsize); + + p->isect = isect; + p->mbufbuckets = mbufbuckets; + p->bufsize = bufsize; + p->entry = (IEntryLink*)(p+1); + p->nentry = nentry; + p->mlist = (IEntryLink**)(p->entry+nentry); + p->mcount = (u32int*)(p->mlist+nmbuf); + p->nmbuf = nmbuf; + p->mbuf = mbuf; + data = (uchar*)(p->mcount+nmbuf); + data += bufsize - (uintptr)data%bufsize; + p->rbuf = data; + p->wbuf = data+bufsize; + p->epbuf = bufsize/IEntrySize; + + for(i=0; i<p->nentry; i++){ + l = &p->entry[i]; + l->next = p->free; + p->free = l; + p->nfree++; + } + return p; +} + +/* + * Add the index entry ie to the pool p. + * Caller must know there is room. + */ +static void +ipoolinsert(IPool *p, uchar *ie) +{ + u32int buck, x; + IEntryLink *l; + + assert(p->free != nil); + + buck = score2bucket(p->isect, ie); + x = (buck-p->buck0) / p->mbufbuckets; + if(x >= p->nmbuf){ + fprint(2, "buck=%ud mbufbucket=%ud x=%ud\n", + buck, p->mbufbuckets, x); + } + assert(x < p->nmbuf); + + l = p->free; + p->free = l->next; + p->nfree--; + memmove(l->ie, ie, IEntrySize); + l->next = p->mlist[x]; + p->mlist[x] = l; + p->mcount[x]++; +} + +/* + * Pull out a block containing as many + * entries as possible for minibuffer x. + */ +static u32int +ipoolgetbuf(IPool *p, u32int x) +{ + uchar *bp, *ep, *wp; + IEntryLink *l; + u32int n; + + bp = p->wbuf; + ep = p->wbuf + p->bufsize; + n = 0; + assert(x < p->nmbuf); + for(wp=bp; wp+IEntrySize<=ep && p->mlist[x]; wp+=IEntrySize){ + l = p->mlist[x]; + p->mlist[x] = l->next; + p->mcount[x]--; + memmove(wp, l->ie, IEntrySize); + l->next = p->free; + p->free = l; + p->nfree++; + n++; + } + memset(wp, 0, ep-wp); + return n; +} + +/* + * Read a block worth of entries from the minibuf + * into the pool. Caller must know there is room. 
+ */ +static void +ipoolloadblock(IPool *p, Minibuf *mb) +{ + u32int i, n; + + assert(mb->nentry > 0); + assert(mb->roffset >= mb->woffset); + assert(mb->roffset < mb->eoffset); + + n = p->bufsize/IEntrySize; + if(n > mb->nentry) + n = mb->nentry; + if(readpart(p->isect->part, mb->roffset, p->rbuf, p->bufsize) < 0) + fprint(2, "readpart %s: %r\n", p->isect->part->name); + else{ + for(i=0; i<n; i++) + ipoolinsert(p, p->rbuf+i*IEntrySize); + } + mb->nentry -= n; + mb->roffset += p->bufsize; +} + +/* + * Write out a block worth of entries to minibuffer x. + * If necessary, pick up the data there before overwriting it. + */ +static void +ipoolflush0(IPool *pool, u32int x) +{ + u32int bufsize; + Minibuf *mb; + + mb = pool->mbuf+x; + bufsize = pool->bufsize; + mb->nwentry += ipoolgetbuf(pool, x); + if(mb->nentry > 0 && mb->roffset == mb->woffset){ + assert(pool->nfree >= pool->bufsize/IEntrySize); + /* + * There will be room in the pool -- we just + * removed a block worth. + */ + ipoolloadblock(pool, mb); + } + if(writepart(pool->isect->part, mb->woffset, pool->wbuf, bufsize) < 0) + fprint(2, "writepart %s: %r\n", pool->isect->part->name); + mb->woffset += bufsize; +} + +/* + * Write out some full block of entries. + * (There must be one -- the pool is almost full!) + */ +static void +ipoolflush1(IPool *pool) +{ + u32int i; + + assert(pool->nfree <= pool->epbuf); + + for(i=0; i<pool->nmbuf; i++){ + if(pool->mcount[i] >= pool->epbuf){ + ipoolflush0(pool, i); + return; + } + } + /* can't be reached - someone must be full */ + sysfatal("ipoolflush1"); +} + +/* + * Flush all the entries in the pool out to disk. + * Nothing more to read from disk. + */ +static void +ipoolflush(IPool *pool) +{ + u32int i; + + for(i=0; i<pool->nmbuf; i++) + while(pool->mlist[i]) + ipoolflush0(pool, i); + assert(pool->nfree == pool->nentry); +} + +/* + * Third pass. Pick up each minibuffer from disk into + * memory and then write out the buckets. + */ + +/* + * Compare two packed index entries. 
+ * Usual ordering except break ties by putting higher + * index addresses first (assumes have duplicates + * due to corruption in the lower addresses). + */ +static int +ientrycmpaddr(const void *va, const void *vb) +{ + int i; + uchar *a, *b; + + a = (uchar*)va; + b = (uchar*)vb; + i = ientrycmp(a, b); + if(i) + return i; + return -memcmp(a+IEntryAddrOff, b+IEntryAddrOff, 8); +} + +static void +zerorange(Part *p, u64int o, u64int e) +{ + static uchar zero[MaxIoSize]; + u32int n; + + for(; o<e; o+=n){ + n = sizeof zero; + if(o+n > e) + n = e-o; + if(writepart(p, o, zero, n) < 0) + fprint(2, "writepart %s: %r\n", p->name); + } +} + +/* + * Load a minibuffer into memory and write out the + * corresponding buckets. + */ +static void +sortminibuffer(ISect *is, Minibuf *mb, uchar *buf, u32int nbuf, u32int bufsize) +{ + uchar *buckdata, *p, *q, *ep; + u32int b, lastb, memsize, n; + u64int o; + IBucket ib; + Part *part; + + part = is->part; + buckdata = emalloc(is->blocksize); + + if(mb->nwentry == 0) + return; + + /* + * read entire buffer. 
+ */ + assert(mb->nwentry*IEntrySize <= mb->woffset-mb->boffset); + assert(mb->woffset-mb->boffset <= nbuf); + if(readpart(part, mb->boffset, buf, mb->woffset-mb->boffset) < 0){ + fprint(2, "readpart %s: %r\n", part->name); + errors = 1; + return; + } + assert(*(uint*)buf != 0xa5a5a5a5); + + /* + * remove fragmentation due to IEntrySize + * not evenly dividing Bufsize + */ + memsize = (bufsize/IEntrySize)*IEntrySize; + for(o=mb->boffset, p=q=buf; o<mb->woffset; o+=bufsize){ + memmove(p, q, memsize); + p += memsize; + q += bufsize; + } + ep = buf + mb->nwentry*IEntrySize; + assert(ep <= buf+nbuf); + + /* + * sort entries + */ + qsort(buf, mb->nwentry, IEntrySize, ientrycmpaddr); + + /* + * write buckets out + */ + n = 0; + lastb = offset2bucket(is, mb->boffset); + for(p=buf; p<ep; p=q){ + b = score2bucket(is, p); + for(q=p; q<ep && score2bucket(is, q)==b; q+=IEntrySize) + ; + if(lastb+1 < b && zero) + zerorange(part, bucket2offset(is, lastb+1), bucket2offset(is, b)); + if(IBucketSize+(q-p) > is->blocksize) + sysfatal("bucket overflow - make index bigger"); + memmove(buckdata+IBucketSize, p, q-p); + ib.n = (q-p)/IEntrySize; + n += ib.n; + packibucket(&ib, buckdata, is->bucketmagic); + if(writepart(part, bucket2offset(is, b), buckdata, is->blocksize) < 0) + fprint(2, "write %s: %r\n", part->name); + lastb = b; + } + if(lastb+1 < is->stop-is->start && zero) + zerorange(part, bucket2offset(is, lastb+1), bucket2offset(is, is->stop - is->start)); + + if(n != mb->nwentry) + fprint(2, "sortminibuffer bug: n=%ud nwentry=%ud have=%ld\n", n, mb->nwentry, (ep-buf)/IEntrySize); + + free(buckdata); +} + +static void +isectproc(void *v) +{ + u32int buck, bufbuckets, bufsize, epbuf, i, j; + u32int mbufbuckets, n, nbucket, nn, space; + u32int nbuf, nminibuf, xminiclump, prod; + u64int blocksize, offset, xclump; + uchar *data, *p; + Buf *buf; + IEntry ie; + IPool *ipool; + ISect *is; + Minibuf *mbuf, *mb; + + is = v; + blocksize = is->blocksize; + nbucket = is->stop - is->start; + + 
/* + * Three passes: + * pass 1 - write index entries from arenas into + * large sequential sections on index disk. + * requires nbuf * bufsize memory. + * + * pass 2 - split each section into minibufs. + * requires nminibuf * bufsize memory. + * + * pass 3 - read each minibuf into memory and + * write buckets out. + * requires entries/minibuf * IEntrySize memory. + * + * The larger we set bufsize the less seeking hurts us. + * + * The fewer sections and minibufs we have, the less + * seeking hurts us. + * + * The fewer sections and minibufs we have, the + * more entries we end up with in each minibuf + * at the end. + * + * Shoot for using half our memory to hold each + * minibuf. The chance of a random distribution + * getting off by 2x is quite low. + * + * Once that is decided, figure out the smallest + * nminibuf and nsection/biggest bufsize we can use + * and still fit in the memory constraints. + */ + + /* expected number of clump index entries we'll see */ + xclump = nbucket * (double)totalclumps/totalbuckets; + + /* number of clumps we want to see in a minibuf */ + xminiclump = isectmem/2/IEntrySize; + + /* total number of minibufs we need */ + prod = (xclump+xminiclump-1) / xminiclump; + + /* if possible, skip second pass */ + if(!dumb && prod*MinBufSize < isectmem){ + nbuf = prod; + nminibuf = 1; + }else{ + /* otherwise use nsection = sqrt(nmini) */ + for(nbuf=1; nbuf*nbuf<prod; nbuf++) + ; + if(nbuf*MinBufSize > isectmem) + sysfatal("not enough memory"); + nminibuf = nbuf; + } + if (nbuf == 0) { + fprint(2, "%s: brand-new index, no work to do\n", argv0); + exits(0); + } + + /* size buffer to use extra memory */ + bufsize = MinBufSize; + while(bufsize*2*nbuf <= isectmem && bufsize < MaxBufSize) + bufsize *= 2; + data = emalloc(nbuf*bufsize); + epbuf = bufsize/IEntrySize; + fprint(2, "%T %s: %,ud buckets, %,ud groups, %,ud minigroups, %,ud buffer\n", + is->part->name, nbucket, nbuf, nminibuf, bufsize); + /* + * Accept index entries from arena procs. 
+ */ + buf = MKNZ(Buf, nbuf); + p = data; + offset = is->blockbase; + bufbuckets = (nbucket+nbuf-1)/nbuf; + for(i=0; i<nbuf; i++){ + buf[i].part = is->part; + buf[i].bp = p; + buf[i].wp = p; + p += bufsize; + buf[i].ep = p; + buf[i].boffset = offset; + buf[i].woffset = offset; + if(i < nbuf-1){ + offset += bufbuckets*blocksize; + buf[i].eoffset = offset; + }else{ + offset = is->blockbase + nbucket*blocksize; + buf[i].eoffset = offset; + } + } + assert(p == data+nbuf*bufsize); + + n = 0; + while(recv(is->writechan, &ie) == 1){ + if(ie.ia.addr == 0) + break; + buck = score2bucket(is, ie.score); + i = buck/bufbuckets; + assert(i < nbuf); + bwrite(&buf[i], &ie); + n++; + } + add(&indexentries, n); + + nn = 0; + for(i=0; i<nbuf; i++){ + bflush(&buf[i]); + buf[i].bp = nil; + buf[i].ep = nil; + buf[i].wp = nil; + nn += buf[i].nentry; + } + if(n != nn) + fprint(2, "isectproc bug: n=%ud nn=%ud\n", n, nn); + + free(data); + + fprint(2, "%T %s: reordering\n", is->part->name); + + /* + * Rearrange entries into minibuffers and then + * split each minibuffer into buckets. + * The minibuffer must be sized so that it is + * a multiple of blocksize -- ipoolloadblock assumes + * that each minibuf starts aligned on a blocksize + * boundary. + */ + mbuf = MKN(Minibuf, nminibuf); + mbufbuckets = (bufbuckets+nminibuf-1)/nminibuf; + while(mbufbuckets*blocksize % bufsize) + mbufbuckets++; + for(i=0; i<nbuf; i++){ + /* + * Set up descriptors. 
+ */ + n = buf[i].nentry; + nn = 0; + offset = buf[i].boffset; + memset(mbuf, 0, nminibuf*sizeof(mbuf[0])); + for(j=0; j<nminibuf; j++){ + mb = &mbuf[j]; + mb->boffset = offset; + offset += mbufbuckets*blocksize; + if(offset > buf[i].eoffset) + offset = buf[i].eoffset; + mb->eoffset = offset; + mb->roffset = mb->boffset; + mb->woffset = mb->boffset; + mb->nentry = epbuf * (mb->eoffset - mb->boffset)/bufsize; + if(mb->nentry > buf[i].nentry) + mb->nentry = buf[i].nentry; + buf[i].nentry -= mb->nentry; + nn += mb->nentry; + } + if(n != nn) + fprint(2, "isectproc bug2: n=%ud nn=%ud (i=%d)\n", n, nn, i);; + /* + * Rearrange. + */ + if(!dumb && nminibuf == 1){ + mbuf[0].nwentry = mbuf[0].nentry; + mbuf[0].woffset = buf[i].woffset; + }else{ + ipool = mkipool(is, mbuf, nminibuf, mbufbuckets, bufsize); + ipool->buck0 = bufbuckets*i; + for(j=0; j<nminibuf; j++){ + mb = &mbuf[j]; + while(mb->nentry > 0){ + if(ipool->nfree < epbuf){ + ipoolflush1(ipool); + /* ipoolflush1 might change mb->nentry */ + continue; + } + assert(ipool->nfree >= epbuf); + ipoolloadblock(ipool, mb); + } + } + ipoolflush(ipool); + nn = 0; + for(j=0; j<nminibuf; j++) + nn += mbuf[j].nwentry; + if(n != nn) + fprint(2, "isectproc bug3: n=%ud nn=%ud (i=%d)\n", n, nn, i); + free(ipool); + } + + /* + * Make buckets. 
+ */ + space = 0; + for(j=0; j<nminibuf; j++) + if(space < mbuf[j].woffset - mbuf[j].boffset) + space = mbuf[j].woffset - mbuf[j].boffset; + + data = emalloc(space); + for(j=0; j<nminibuf; j++){ + mb = &mbuf[j]; + sortminibuffer(is, mb, data, space, bufsize); + } + free(data); + } + + sendp(isectdonechan, is); +} + + + diff --git a/sys/src/cmd/venti/srv/checkarenas.c b/sys/src/cmd/venti/srv/checkarenas.c new file mode 100755 index 000000000..4ad03a297 --- /dev/null +++ b/sys/src/cmd/venti/srv/checkarenas.c @@ -0,0 +1,139 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +static int verbose; + +static void +checkarena(Arena *arena, int scan, int fix) +{ + ATailStats old; + int err, e; + + if(verbose && arena->memstats.clumps) + printarena(2, arena); + + old = arena->memstats; + + if(scan){ + arena->memstats.used = 0; + arena->memstats.clumps = 0; + arena->memstats.cclumps = 0; + arena->memstats.uncsize = 0; + } + + err = 0; + for(;;){ + e = syncarena(arena, 1000, 0, fix); + err |= e; + if(!(e & SyncHeader)) + break; + if(verbose && arena->memstats.clumps) + fprint(2, "."); + } + if(verbose && arena->memstats.clumps) + fprint(2, "\n"); + + err &= ~SyncHeader; + if(arena->memstats.used != old.used + || arena->memstats.clumps != old.clumps + || arena->memstats.cclumps != old.cclumps + || arena->memstats.uncsize != old.uncsize){ + fprint(2, "%s: incorrect arena header fields\n", arena->name); + printarena(2, arena); + err |= SyncHeader; + } + + if(!err || !fix) + return; + + fprint(2, "%s: writing fixed arena header fields\n", arena->name); + arena->diskstats = arena->memstats; + if(wbarena(arena) < 0) + fprint(2, "arena header write failed: %r\n"); + flushdcache(); +} + +void +usage(void) +{ + fprint(2, "usage: checkarenas [-afv] file [arenaname...]\n"); + threadexitsall(0); +} + +int +should(char *name, int argc, char **argv) +{ + int i; + + if(argc == 0) + return 1; + for(i=0; i<argc; i++) + if(strcmp(name, argv[i]) == 0) + return 1; + return 0; +} + 
+void +threadmain(int argc, char *argv[]) +{ + ArenaPart *ap; + Part *part; + char *file; + int i, fix, scan; + + ventifmtinstall(); + statsinit(); + + fix = 0; + scan = 0; + ARGBEGIN{ + case 'f': + fix++; + break; + case 'a': + scan = 1; + break; + case 'v': + verbose++; + break; + default: + usage(); + break; + }ARGEND + + if(!fix) + readonly = 1; + + if(argc < 1) + usage(); + + file = argv[0]; + argc--; + argv++; + + part = initpart(file, (fix ? ORDWR : OREAD)|ODIRECT); + if(part == nil) + sysfatal("can't open partition %s: %r", file); + + ap = initarenapart(part); + if(ap == nil) + sysfatal("can't initialize arena partition in %s: %r", file); + + if(verbose > 1){ + printarenapart(2, ap); + fprint(2, "\n"); + } + + initdcache(8 * MaxDiskBlock); + + for(i = 0; i < ap->narenas; i++) + if(should(ap->arenas[i]->name, argc, argv)) { + debugarena = i; + checkarena(ap->arenas[i], scan, fix); + } + + if(verbose > 1) + printstats(); + threadexitsall(0); +} diff --git a/sys/src/cmd/venti/srv/checkindex.c b/sys/src/cmd/venti/srv/checkindex.c new file mode 100755 index 000000000..ca9557302 --- /dev/null +++ b/sys/src/cmd/venti/srv/checkindex.c @@ -0,0 +1,295 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +static int extra, missing, wrong; + +static void +phdr(DBlock *eb) +{ + static int did; + + if(!did){ + did = 1; + print("# diff actual correct\n"); + } + print("%s block 0x%llux\n", eb->part->name, eb->addr); +} + +static void +pie(IEntry *ie, char c) +{ + print("%c %V %22lld %3d %5d %3d\n", + c, ie->score, ie->ia.addr, ie->ia.type, ie->ia.size, ie->ia.blocks); +} + +static int +checkbucket(Index *ix, u32int buck, IBucket *ib) +{ + ISect *is; + DBlock *eb; + IBucket eib; + IEntry ie, eie; + int i, ei, ok, c, hdr; + + is = ix->sects[indexsect0(ix, buck)]; + if(buck < is->start || buck >= is->stop){ + seterr(EAdmin, "cannot find index section for bucket %lud\n", (ulong)buck); + return -1; + } + buck -= is->start; + eb = getdblock(is->part, is->blockbase + 
((u64int)buck << is->blocklog), OREAD); + if(eb == nil) + return -1; + unpackibucket(&eib, eb->data, is->bucketmagic); + + ok = 0; + ei = 0; + hdr = 0; + for(i = 0; i < ib->n; i++){ + while(ei < eib.n){ + c = ientrycmp(&ib->data[i * IEntrySize], &eib.data[ei * IEntrySize]); + if(c == 0){ + unpackientry(&ie, &ib->data[i * IEntrySize]); + unpackientry(&eie, &eib.data[ei * IEntrySize]); + if(iaddrcmp(&ie.ia, &eie.ia) != 0){ + if(!hdr){ + phdr(eb); + hdr = 1; + } + wrong++; + pie(&eie, '<'); + pie(&ie, '>'); + } + ei++; + goto cont; + } + if(c < 0) + break; + if(!hdr){ + phdr(eb); + hdr = 1; + } + unpackientry(&eie, &eib.data[ei*IEntrySize]); + extra++; + pie(&eie, '<'); + ei++; + ok = -1; + } + if(!hdr){ + phdr(eb); + hdr = 1; + } + unpackientry(&ie, &ib->data[i*IEntrySize]); + missing++; + pie(&ie, '>'); + ok = -1; + cont:; + } + for(; ei < eib.n; ei++){ + if(!hdr){ + phdr(eb); + hdr = 1; + } + unpackientry(&eie, &eib.data[ei*IEntrySize]); + pie(&eie, '<'); + ok = -1; + } + putdblock(eb); + return ok; +} + +int +checkindex(Index *ix, Part *part, u64int off, u64int clumps, int zero) +{ + IEStream *ies; + IBucket ib, zib; + ZBlock *z, *b; + u32int next, buck; + int ok, bok; +u64int found = 0; + +/* ZZZ make buffer size configurable */ + b = alloczblock(ix->blocksize, 0, ix->blocksize); + z = alloczblock(ix->blocksize, 1, ix->blocksize); + ies = initiestream(part, off, clumps, 64*1024); + if(b == nil || z == nil || ies == nil){ + werrstr("allocating: %r"); + ok = -1; + goto out; + } + ok = 0; + next = 0; + memset(&ib, 0, sizeof ib); + ib.data = b->data; + zib.data = z->data; + zib.n = 0; + zib.buck = 0; + for(;;){ + buck = buildbucket(ix, ies, &ib, ix->blocksize-IBucketSize); + found += ib.n; + if(zero){ + for(; next != buck; next++){ + if(next == ix->buckets){ + if(buck != TWID32){ + ok = -1; + werrstr("internal error: bucket out of range"); + } + if(ok < 0) + werrstr("%d spurious entries, %d missing, %d wrong", extra, missing, wrong); + goto out; + } + bok = 
checkbucket(ix, next, &zib); + if(bok < 0) + ok = -1; + } + } + if(buck >= ix->buckets){ + if(buck == TWID32) + break; + werrstr("internal error: bucket out of range"); + ok = -1; + goto out; + } + bok = checkbucket(ix, buck, &ib); + if(bok < 0) + ok = -1; + next = buck + 1; + } +out: + freeiestream(ies); + freezblock(z); + freezblock(b); + return ok; +} + +int +checkbloom(Bloom *b1, Bloom *b2, int fix) +{ + u32int *a1, *a2; + int i, n, extra, missing; + + if(b1==nil && b2==nil) + return 0; + if(b1==nil || b2==nil){ + werrstr("nil/non-nil"); + return -1; + } + wbbloomhead(b1); + wbbloomhead(b2); + if(memcmp(b1->data, b2->data, BloomHeadSize) != 0){ + werrstr("bloom header mismatch"); + return -1; + } + a1 = (u32int*)b1->data; + a2 = (u32int*)b2->data; + n = b1->size/4; + extra = 0; + missing = 0; + for(i=BloomHeadSize/4; i<n; i++){ + if(a1[i] != a2[i]){ +// print("%.8ux/%.8ux.", a1[i], a2[i]); + extra += countbits(a1[i] & ~a2[i]); + missing += countbits(a2[i] & ~a1[i]); + } + } + if(extra || missing) + fprint(2, "bloom filter: %d spurious bits, %d missing bits\n", + extra, missing); + else + fprint(2, "bloom filter: correct\n"); + if(!fix && missing){ + werrstr("missing bits"); + return -1; + } + if(fix && (missing || extra)){ + memmove(b1->data, b2->data, b1->size); + return writebloom(b1); + } + return 0; +} + + +void +usage(void) +{ + fprint(2, "usage: checkindex [-f] [-B blockcachesize] config tmp\n"); + threadexitsall(0); +} + +Config conf; + +void +threadmain(int argc, char *argv[]) +{ + Bloom *oldbloom, *newbloom; + Part *part; + u64int clumps, base; + u32int bcmem; + int fix, skipz, ok; + + fix = 0; + bcmem = 0; + skipz = 0; + ARGBEGIN{ + case 'B': + bcmem = unittoull(ARGF()); + break; + case 'f': + fix++; + break; + case 'Z': + skipz = 1; + break; + default: + usage(); + break; + }ARGEND + + if(argc != 2) + usage(); + + ventifmtinstall(); + + part = initpart(argv[1], ORDWR|ODIRECT); + if(part == nil) + sysfatal("can't initialize temporary partition: %r"); 
+ + if(!fix) + readonly = 1; + + if(initventi(argv[0], &conf) < 0) + sysfatal("can't init venti: %r"); + if(mainindex->bloom && loadbloom(mainindex->bloom) < 0) + sysfatal("can't load bloom filter: %r"); + oldbloom = mainindex->bloom; + newbloom = nil; + if(oldbloom){ + newbloom = vtmallocz(sizeof *newbloom); + bloominit(newbloom, oldbloom->size, nil); + newbloom->data = vtmallocz(oldbloom->size); + } + if(bcmem < maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16)) + bcmem = maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16); + if(0) fprint(2, "initialize %d bytes of disk block cache\n", bcmem); + initdcache(bcmem); + + fprint(2, "checkindex: building entry list\n"); + clumps = sortrawientries(mainindex, part, &base, newbloom); + if(clumps == TWID64) + sysfatal("can't build sorted index: %r"); + fprint(2, "checkindex: checking %lld entries at %lld\n", clumps, base); + ok = 0; + if(checkindex(mainindex, part, base, clumps, !skipz) < 0){ + fprint(2, "checkindex: %r\n"); + ok = -1; + } + if(checkbloom(oldbloom, newbloom, fix) < 0){ + fprint(2, "checkbloom: %r\n"); + ok = -1; + } + if(ok < 0) + sysfatal("errors found"); + fprint(2, "checkindex: index is correct\n"); + threadexitsall(0); +} diff --git a/sys/src/cmd/venti/srv/clump.c b/sys/src/cmd/venti/srv/clump.c new file mode 100755 index 000000000..ed4de34d9 --- /dev/null +++ b/sys/src/cmd/venti/srv/clump.c @@ -0,0 +1,225 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" +#include "whack.h" + +/* + * Write a lump to disk. Updates ia with an index address + * for the newly-written lump. Upon return, the lump will + * have been placed in the disk cache but will likely not be on disk yet. 
+ */ +int +storeclump(Index *ix, ZBlock *zb, u8int *sc, int type, u32int creator, IAddr *ia) +{ + ZBlock *cb; + Clump cl; + u64int a; + u8int bh[VtScoreSize]; + int size, dsize; + + trace(TraceLump, "storeclump enter", sc, type); + size = zb->len; + if(size > VtMaxLumpSize){ + seterr(EStrange, "lump too large"); + return -1; + } + if(vttypevalid(type) < 0){ + seterr(EStrange, "invalid lump type"); + return -1; + } + + if(0){ + scoremem(bh, zb->data, size); + if(scorecmp(sc, bh) != 0){ + seterr(ECorrupt, "storing clump: corrupted; expected=%V got=%V, size=%d", sc, bh, size); + return -1; + } + } + + cb = alloczblock(size + ClumpSize + U32Size, 0, 0); + if(cb == nil) + return -1; + + cl.info.type = type; + cl.info.uncsize = size; + cl.creator = creator; + cl.time = now(); + scorecp(cl.info.score, sc); + + trace(TraceLump, "storeclump whackblock"); + dsize = whackblock(&cb->data[ClumpSize], zb->data, size); + if(dsize > 0 && dsize < size){ + cl.encoding = ClumpECompress; + }else{ + if(dsize > size){ + fprint(2, "whack error: dsize=%d size=%d\n", dsize, size); + abort(); + } + cl.encoding = ClumpENone; + dsize = size; + memmove(&cb->data[ClumpSize], zb->data, size); + } + memset(cb->data+ClumpSize+dsize, 0, 4); + cl.info.size = dsize; + + a = writeiclump(ix, &cl, cb->data); + trace(TraceLump, "storeclump exit %lld", a); + freezblock(cb); + if(a == TWID64) + return -1; + + ia->addr = a; + ia->type = type; + ia->size = size; + ia->blocks = (dsize + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog; + +/* + qlock(&stats.lock); + stats.clumpwrites++; + stats.clumpbwrites += size; + stats.clumpbcomp += dsize; + qunlock(&stats.lock); +*/ + + return 0; +} + +u32int +clumpmagic(Arena *arena, u64int aa) +{ + u8int buf[U32Size]; + + if(readarena(arena, aa, buf, U32Size) == TWID32) + return TWID32; + return unpackmagic(buf); +} + +/* + * fetch a block based at addr. + * score is filled in with the block's score. 
+ * blocks is roughly the length of the clump on disk; + * if zero, the length is unknown. + */ +ZBlock* +loadclump(Arena *arena, u64int aa, int blocks, Clump *cl, u8int *score, int verify) +{ + Unwhack uw; + ZBlock *zb, *cb; + u8int bh[VtScoreSize], *buf; + u32int n; + int nunc; + +/* + qlock(&stats.lock); + stats.clumpreads++; + qunlock(&stats.lock); +*/ + + if(blocks <= 0) + blocks = 1; + + trace(TraceLump, "loadclump enter"); + + cb = alloczblock(blocks << ABlockLog, 0, 0); + if(cb == nil) + return nil; + n = readarena(arena, aa, cb->data, blocks << ABlockLog); + if(n < ClumpSize){ + if(n != 0) + seterr(ECorrupt, "loadclump read less than a header"); + freezblock(cb); + return nil; + } + trace(TraceLump, "loadclump unpack"); + if(unpackclump(cl, cb->data, arena->clumpmagic) < 0){ + seterr(ECorrupt, "loadclump %s %llud: %r", arena->name, aa); + freezblock(cb); + return nil; + } + if(cl->info.type == VtCorruptType){ + seterr(EOk, "clump is marked corrupt"); + freezblock(cb); + return nil; + } + n -= ClumpSize; + if(n < cl->info.size){ + freezblock(cb); + n = cl->info.size; + cb = alloczblock(n, 0, 0); + if(cb == nil) + return nil; + if(readarena(arena, aa + ClumpSize, cb->data, n) != n){ + seterr(ECorrupt, "loadclump read too little data"); + freezblock(cb); + return nil; + } + buf = cb->data; + }else + buf = cb->data + ClumpSize; + + scorecp(score, cl->info.score); + + zb = alloczblock(cl->info.uncsize, 0, 0); + if(zb == nil){ + freezblock(cb); + return nil; + } + switch(cl->encoding){ + case ClumpECompress: + trace(TraceLump, "loadclump decompress"); + unwhackinit(&uw); + nunc = unwhack(&uw, zb->data, cl->info.uncsize, buf, cl->info.size); + if(nunc != cl->info.uncsize){ + if(nunc < 0) + seterr(ECorrupt, "decompression of %llud failed: %s", aa, uw.err); + else + seterr(ECorrupt, "decompression of %llud gave partial block: %d/%d\n", aa, nunc, cl->info.uncsize); + freezblock(cb); + freezblock(zb); + return nil; + } + break; + case ClumpENone: + if(cl->info.size 
!= cl->info.uncsize){ + seterr(ECorrupt, "loading clump: bad uncompressed size for uncompressed block %llud", aa); + freezblock(cb); + freezblock(zb); + return nil; + } + scoremem(bh, buf, cl->info.uncsize); + if(scorecmp(cl->info.score, bh) != 0) + seterr(ECorrupt, "pre-copy sha1 wrong at %s %llud: expected=%V got=%V", arena->name, aa, cl->info.score, bh); + memmove(zb->data, buf, cl->info.uncsize); + break; + default: + seterr(ECorrupt, "unknown encoding in loadlump %llud", aa); + freezblock(cb); + freezblock(zb); + return nil; + } + freezblock(cb); + + if(verify){ + trace(TraceLump, "loadclump verify"); + scoremem(bh, zb->data, cl->info.uncsize); + if(scorecmp(cl->info.score, bh) != 0){ + seterr(ECorrupt, "loading clump: corrupted at %s %llud; expected=%V got=%V", arena->name, aa, cl->info.score, bh); + freezblock(zb); + return nil; + } + if(vttypevalid(cl->info.type) < 0){ + seterr(ECorrupt, "loading lump at %s %llud: invalid lump type %d", arena->name, aa, cl->info.type); + freezblock(zb); + return nil; + } + } + + trace(TraceLump, "loadclump exit"); +/* + qlock(&stats.lock); + stats.clumpbreads += cl->info.size; + stats.clumpbuncomp += cl->info.uncsize; + qunlock(&stats.lock); +*/ + return zb; +} diff --git a/sys/src/cmd/venti/srv/clumpstats.c b/sys/src/cmd/venti/srv/clumpstats.c new file mode 100755 index 000000000..d2cfe251c --- /dev/null +++ b/sys/src/cmd/venti/srv/clumpstats.c @@ -0,0 +1,127 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +int count[VtMaxLumpSize][VtMaxType]; +Config conf; + +enum +{ + ClumpChunks = 32*1024 +}; + +static int +readarenainfo(Arena *arena) +{ + ClumpInfo *ci, *cis; + u32int clump; + int i, n, ok; + + if(arena->memstats.clumps) + fprint(2, "reading directory for arena=%s with %d entries\n", arena->name, arena->memstats.clumps); + + cis = MKN(ClumpInfo, ClumpChunks); + ok = 0; + for(clump = 0; clump < arena->memstats.clumps; clump += n){ + n = ClumpChunks; + + if(n > arena->memstats.clumps - clump) + n = 
arena->memstats.clumps - clump; + + if((i=readclumpinfos(arena, clump, cis, n)) != n){ + seterr(EOk, "arena directory read failed %d not %d: %r", i, n); + ok = -1; + break; + } + + for(i = 0; i < n; i++){ + ci = &cis[i]; + if(ci->type >= VtMaxType || ci->uncsize >= VtMaxLumpSize) { + fprint(2, "bad clump: %d: type = %d: size = %d\n", clump+i, ci->type, ci->uncsize); + continue; + } + count[ci->uncsize][ci->type]++; + } + } + free(cis); + if(ok < 0) + return TWID32; + return clump; +} + +static void +clumpstats(Index *ix) +{ + int ok; + ulong clumps, n; + int i, j, t; + + ok = 0; + clumps = 0; + for(i = 0; i < ix->narenas; i++){ + n = readarenainfo(ix->arenas[i]); + if(n == TWID32){ + ok = -1; + break; + } + clumps += n; + } + + if(ok < 0) + return; + + print("clumps = %ld\n", clumps); + for(i=0; i<VtMaxLumpSize; i++) { + t = 0; + for(j=0; j<VtMaxType; j++) + t += count[i][j]; + if(t == 0) + continue; + print("%d\t%d", i, t); + for(j=0; j<VtMaxType; j++) + print("\t%d", count[i][j]); + print("\n"); + } +} + + +void +usage(void) +{ + fprint(2, "usage: clumpstats [-B blockcachesize] config\n"); + threadexitsall(0); +} + +void +threadmain(int argc, char *argv[]) +{ + u32int bcmem; + + bcmem = 0; + + ARGBEGIN{ + case 'B': + bcmem = unittoull(ARGF()); + break; + default: + usage(); + break; + }ARGEND + + readonly = 1; + + if(argc != 1) + usage(); + + if(initventi(argv[0], &conf) < 0) + sysfatal("can't init venti: %r"); + + if(bcmem < maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16)) + bcmem = maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16); + if(0) fprint(2, "initialize %d bytes of disk block cache\n", bcmem); + initdcache(bcmem); + + clumpstats(mainindex); + + threadexitsall(0); +} diff --git a/sys/src/cmd/venti/srv/cmparenas.c b/sys/src/cmd/venti/srv/cmparenas.c new file mode 100755 index 000000000..322f16ee1 --- /dev/null +++ b/sys/src/cmd/venti/srv/cmparenas.c @@ -0,0 +1,317 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" 
+ +static int verbose; +static int fd; +static int fd1; +static uchar *data; +static uchar *data1; +static int blocksize; +static int sleepms; + +void +usage(void) +{ + fprint(2, "usage: cmparenas [-b blocksize] [-s ms] [-v] arenapart1 arenapart2 [name...]]\n"); + threadexitsall(0); +} + +static int +preadblock(int fd, uchar *buf, int n, vlong off) +{ + int nr, m; + + for(nr = 0; nr < n; nr += m){ + m = n - nr; + m = pread(fd, &buf[nr], m, off+nr); + if(m <= 0){ + if(m == 0) + werrstr("early eof"); + return -1; + } + } + return 0; +} + +static int +readblock(int fd, uchar *buf, int n) +{ + int nr, m; + + for(nr = 0; nr < n; nr += m){ + m = n - nr; + m = read(fd, &buf[nr], m); + if(m <= 0){ + if(m == 0) + werrstr("early eof"); + return -1; + } + } + return 0; +} + +static int +printheader(char *name, ArenaHead *head, int fd) +{ + Arena arena; + vlong baseoff, lo, hi, off; + int clumpmax; + + off = seek(fd, 0, 1); + seek(fd, off + head->size - head->blocksize, 0); + if(readblock(fd, data, head->blocksize) < 0){ + fprint(2, "%s: reading arena tail: %r\n", name); + return -1; + } + seek(fd, off, 0); + + memset(&arena, 0, sizeof arena); + if(unpackarena(&arena, data) < 0){ + fprint(2, "%s: unpack arena tail: %r\n", name); + return -1; + } + arena.blocksize = head->blocksize; + arena.base = off + head->blocksize; + arena.clumpmax = arena.blocksize / ClumpInfoSize; + arena.size = head->size - 2*head->blocksize; + + fprint(2, "%s: base=%llx size=%llx blocksize=%x\n", name, off, head->size, head->blocksize); + + baseoff = head->blocksize; + fprint(2, "\t%llx-%llx: head\n", (vlong)0, baseoff); + lo = baseoff; + hi = baseoff + arena.diskstats.used; + fprint(2, "\t%llx-%llx: data (%llx)\n", lo, hi, hi - lo); + hi = head->size - head->blocksize; + clumpmax = head->blocksize / ClumpInfoSize; + if(clumpmax > 0) + lo = hi - (u64int)arena.diskstats.clumps/clumpmax * head->blocksize; + else + lo = hi; + fprint(2, "\t%llx-%llx: clumps (%llx)\n", lo, hi, hi - lo); + fprint(2, 
"\t%llx-%llx: tail\n", hi, hi + head->blocksize); + + fprint(2, "arena:\n"); + printarena(2, &arena); + return 0; +} + +static void +cmparena(char *name, vlong len) +{ + ArenaHead head; + DigestState s; + u64int n, e; + u32int bs; + int i, j; + char buf[20]; + + fprint(2, "cmp %s\n", name); + + memset(&s, 0, sizeof s); + + /* + * read a little bit, which will include the header + */ + if(readblock(fd, data, HeadSize) < 0){ + fprint(2, "%s: reading header: %r\n", name); + return; + } + if(unpackarenahead(&head, data) < 0){ + fprint(2, "%s: corrupt arena header: %r\n", name); + return; + } + if(head.version != ArenaVersion4 && head.version != ArenaVersion5) + fprint(2, "%s: warning: unknown arena version %d\n", name, head.version); + if(len != 0 && len != head.size) + fprint(2, "%s: warning: unexpected length %lld != %lld\n", name, head.size, len); + if(strcmp(name, "<stdin>") != 0 && strcmp(head.name, name) != 0) + fprint(2, "%s: warning: unexpected name %s\n", name, head.name); + + if(readblock(fd1, data1, HeadSize) < 0){ + fprint(2, "%s: reading header: %r\n", name); + return; + } + if(unpackarenahead(&head, data) < 0){ + fprint(2, "%s: corrupt arena header: %r\n", name); + return; + } + if(head.version != ArenaVersion4 && head.version != ArenaVersion5) + fprint(2, "%s: warning: unknown arena version %d\n", name, head.version); + if(len != 0 && len != head.size) + fprint(2, "%s: warning: unexpected length %lld != %lld\n", name, head.size, len); + if(strcmp(name, "<stdin>") != 0 && strcmp(head.name, name) != 0) + fprint(2, "%s: warning: unexpected name %s\n", name, head.name); + + seek(fd, -HeadSize, 1); + seek(fd1, -HeadSize, 1); + + if(printheader(name, &head, fd) < 0) + return; + + /* + * now we know how much to read + * read everything but the last block, which is special + */ + e = head.size; + bs = blocksize; + for(n = 0; n < e; n += bs){ + if(n + bs > e) + bs = e - n; + if(readblock(fd, data, bs) < 0){ + fprint(2, "%s: read data: %r\n", name); + return; + } 
+ if(readblock(fd1, data1, bs) < 0){ + fprint(2, "%s: read data: %r\n", name); + return; + } + if(memcmp(data, data1, bs) != 0){ + print("mismatch at %llx\n", n); + for(i=0; i<bs; i+=16){ + if(memcmp(data+i, data1+i, 16) != 0){ + snprint(buf, sizeof buf, "%llx", n+i); + print("%s ", buf); + for(j=0; j<16; j++){ + print(" %.2ux", data[i+j]); + if(j == 7) + print(" -"); + } + print("\n"); + print("%*s ", (int)strlen(buf), ""); + for(j=0; j<16; j++){ + print(" %.2ux", data1[i+j]); + if(j == 7) + print(" -"); + } + print("\n"); + } + } + } + } +} + +static int +shouldcheck(char *name, char **s, int n) +{ + int i; + + if(n == 0) + return 1; + + for(i=0; i<n; i++){ + if(s[i] && strcmp(name, s[i]) == 0){ + s[i] = nil; + return 1; + } + } + return 0; +} + +char * +readap(int fd, ArenaPart *ap) +{ + char *table; + + if(preadblock(fd, data, 8192, PartBlank) < 0) + sysfatal("read arena part header: %r"); + if(unpackarenapart(ap, data) < 0) + sysfatal("corrupted arena part header: %r"); + fprint(2, "# arena part version=%d blocksize=%d arenabase=%d\n", + ap->version, ap->blocksize, ap->arenabase); + ap->tabbase = (PartBlank+HeadSize+ap->blocksize-1)&~(ap->blocksize-1); + ap->tabsize = ap->arenabase - ap->tabbase; + table = malloc(ap->tabsize+1); + if(preadblock(fd, (uchar*)table, ap->tabsize, ap->tabbase) < 0) + sysfatal("reading arena part directory: %r"); + table[ap->tabsize] = 0; + return table; +} + +void +threadmain(int argc, char *argv[]) +{ + int i, nline; + char *p, *q, *table, *table1, *f[10], line[256]; + vlong start, stop; + ArenaPart ap; + ArenaPart ap1; + + ventifmtinstall(); + blocksize = MaxIoSize; + ARGBEGIN{ + case 'b': + blocksize = unittoull(EARGF(usage())); + break; + case 's': + sleepms = atoi(EARGF(usage())); + break; + case 'v': + verbose++; + break; + default: + usage(); + break; + }ARGEND + + if(argc < 2) + usage(); + + data = vtmalloc(blocksize); + data1 = vtmalloc(blocksize); + if((fd = open(argv[0], OREAD)) < 0) + sysfatal("open %s: %r", argv[0]); + 
if((fd1 = open(argv[1], OREAD)) < 0) + sysfatal("open %s: %r", argv[0]); + + table = readap(fd, &ap); + table1 = readap(fd1, &ap1); + if(strcmp(table, table1) != 0) + sysfatal("arena partitions do not have identical tables"); + + nline = atoi(table); + p = strchr(table, '\n'); + if(p) + p++; + for(i=0; i<nline; i++){ + if(p == nil){ + fprint(2, "warning: unexpected arena table end\n"); + break; + } + q = strchr(p, '\n'); + if(q) + *q++ = 0; + if(strlen(p) >= sizeof line){ + fprint(2, "warning: long arena table line: %s\n", p); + p = q; + continue; + } + strcpy(line, p); + memset(f, 0, sizeof f); + if(tokenize(line, f, nelem(f)) < 3){ + fprint(2, "warning: bad arena table line: %s\n", p); + p = q; + continue; + } + p = q; + if(shouldcheck(f[0], argv+1, argc-1)){ + start = strtoull(f[1], 0, 0); + stop = strtoull(f[2], 0, 0); + if(stop <= start){ + fprint(2, "%s: bad start,stop %lld,%lld\n", f[0], stop, start); + continue; + } + if(seek(fd, start, 0) < 0) + fprint(2, "%s: seek to start: %r\n", f[0]); + if(seek(fd1, start, 0) < 0) + fprint(2, "%s: seek to start: %r\n", f[0]); + cmparena(f[0], stop - start); + } + } + for(i=2; i<argc; i++) + if(argv[i] != 0) + fprint(2, "%s: did not find arena\n", argv[i]); + + threadexitsall(nil); +} diff --git a/sys/src/cmd/venti/srv/conf.rc b/sys/src/cmd/venti/srv/conf.rc new file mode 100755 index 000000000..cce980fa7 --- /dev/null +++ b/sys/src/cmd/venti/srv/conf.rc @@ -0,0 +1,67 @@ +#!/bin/rc + +# the venti configuration is stored at the 248kB offset in the first index +# partition and extends for at most 8 kB. + +rfork e +fn usage { + echo 'usage: venti/conf [-w] /dev/sdC0/v.arenas' >[1=2] + exit usage +} + +wflag=no +while(! ~ $#* 0 && ~ $1 -* && ! ~ $1 --){ + switch($1){ + case -w + wflag=yes + case * + usage + } + shift +} +if(~ $1 --) + shift + +if(~ $wflag no && ! ~ $#* 1) + usage +if(~ $wflag yes && ! ~ $#* 1 2) + usage + +disk=$1 +if(! 
test -f $disk){ + echo 'unknown disk' $1 >[1=2] + exit nodisk +} + +fn sigexit { + #rm -f /tmp/venticonf.$pid +} + +if(~ $wflag yes){ + {echo venti config; cat $2} >/tmp/venticonf.$pid || exit oops + if(! test -s /tmp/venticonf.$pid){ + echo 'config is empty; will not install' >[1=2] + exit emptyconfig + } + if(test `{ls -l /tmp/venticonf.$pid | awk '{print $6}'} -gt 8192){ + echo 'config is too long; max is a little less than eight kilobytes' >[1=2] + exit toolong + } + dd -quiet 1 -bs 1024 -count 8 -if $disk -iseek 248 \ + >/tmp/_venticonf.old || exit backup + dd -quiet 1 -count 2 </dev/zero >> /tmp/venticonf.$pid || exit dd + dd -quiet 1 -bs 1024 -count 8 -if /tmp/venticonf.$pid \ + -of $disk -trunc 0 -oseek 248 || exit dd2 + exit 0 +} + +dd -quiet 1 -bs 1024 -count 8 -if $disk -iseek 248 | + aux/zerotrunc >/tmp/venticonf.$pid + +if(! cmp <{sed 1q /tmp/venticonf.$pid} <{echo venti config}){ + echo 'config has bad header' >[1=2] + exit badconfig +} + +sed 1d /tmp/venticonf.$pid +exit '' diff --git a/sys/src/cmd/venti/srv/config.c b/sys/src/cmd/venti/srv/config.c new file mode 100755 index 000000000..ba4daba1a --- /dev/null +++ b/sys/src/cmd/venti/srv/config.c @@ -0,0 +1,253 @@
#include "stdinc.h"
#include "dat.h"
#include "fns.h"

Index *mainindex;
int paranoid = 1;		/* should verify hashes on disk read */

static ArenaPart *configarenas(char *file);
static ISect *configisect(char *file);
static Bloom *configbloom(char *file);

/*
 * Read the configuration in file into conf and build the main index
 * from it.  On success mainindex is set (with the configured bloom
 * filter attached) and 0 is returned; on failure -1 is returned.
 */
int
initventi(char *file, Config *conf)
{
	statsinit();

	if(file == nil){
		seterr(EOk, "no configuration file");
		return -1;
	}
	if(runconfig(file, conf) < 0){
		seterr(EOk, "can't initialize venti: %r");
		return -1;
	}
	mainindex = initindex(conf->index, conf->sects, conf->nsects);
	if(mainindex == nil)
		return -1;
	mainindex->bloom = conf->bloom;
	return 0;
}

/*
 * Check that s is a number, optionally followed by exactly one of
 * the unit letters K, M or G (either case).
 * Returns 0 if s is acceptable, -1 otherwise.
 */
static int
numok(char *s)
{
	char *p;

	strtoull(s, &p, 0);
	if(p == s)
		return -1;	/* no digits at all */
	if(*p == 0)
		return 0;	/* plain number */
	if(p[1] == 0 && strchr("MmGgKk", *p))
		return 0;	/* number with a single unit suffix */
	return -1;		/* trailing garbage */
}

/*
 * configs	:
 *		| configs config
 * config	: "isect" filename
 *		| "arenas" filename
 *		| "bloom" filename
 *		| "index" name
 *		| "bcmem" num
 *		| "mem" num
 *		| "icmem" num
 *		| "queuewrites"
 *		| "httpaddr" address
 *		| "webroot" directory
 *		| "addr" address
 *
 * '#' and \n delimit comments
 */
enum
{
	MaxArgs = 2
};

/*
 * Parse the configuration in file into config, which is zeroed
 * first.  Each line is split into at most MaxArgs fields and matched
 * against the grammar above; duplicate single-valued directives and
 * unrecognized lines are errors.
 * Returns 0 on success.  On failure returns -1 with the error set
 * and the section and arena-partition arrays freed.
 */
int
runconfig(char *file, Config *config)
{
	ArenaPart **av;
	ISect **sv;
	IFile f;
	char *s, *line, *flds[MaxArgs + 1];
	int i, nf, ok;

	if(readifile(&f, file) < 0)
		return -1;
	memset(config, 0, sizeof *config);
	config->mem = Unspecified;
	ok = -1;
	line = nil;
	for(;;){
		s = ifileline(&f);
		if(s == nil){
			ok = 0;
			break;
		}
		/* getfields splits s in place; keep a copy for error messages */
		line = estrdup(s);
		nf = getfields(s, flds, MaxArgs + 1, 1, " \t\r");
		if(nf == 2 && strcmp(flds[0], "isect") == 0){
			/* grow the section array by one and open the new section */
			sv = MKN(ISect*, config->nsects + 1);
			for(i = 0; i < config->nsects; i++)
				sv[i] = config->sects[i];
			free(config->sects);
			config->sects = sv;
			config->sects[config->nsects] = configisect(flds[1]);
			if(config->sects[config->nsects] == nil)
				break;
			config->nsects++;
		}else if(nf == 2 && strcmp(flds[0], "arenas") == 0){
			/* grow the arena partition array by one and open the new partition */
			av = MKN(ArenaPart*, config->naparts + 1);
			for(i = 0; i < config->naparts; i++)
				av[i] = config->aparts[i];
			free(config->aparts);
			config->aparts = av;
			config->aparts[config->naparts] = configarenas(flds[1]);
			if(config->aparts[config->naparts] == nil)
				break;
			config->naparts++;
		}else if(nf == 2 && strcmp(flds[0], "bloom") == 0){
			if(config->bloom){
				seterr(EAdmin, "duplicate bloom lines in configuration file %s", file);
				break;
			}
			if((config->bloom = configbloom(flds[1])) == nil)
				break;
		}else if(nf == 2 && strcmp(flds[0], "index") == 0){
			if(nameok(flds[1]) < 0){
				seterr(EAdmin, "illegal index name %s in config file %s", flds[1], file);
				break;
			}
			if(config->index != nil){
				seterr(EAdmin, "duplicate indices in config file %s", file);
				break;
			}
			config->index = estrdup(flds[1]);
		}else if(nf == 2 && strcmp(flds[0], "bcmem") == 0){
			if(numok(flds[1]) < 0){
				seterr(EAdmin, "illegal size %s in config file %s",
					flds[1], file);
				break;
			}
			if(config->bcmem != 0){
				seterr(EAdmin, "duplicate bcmem lines in config file %s", file);
				break;
			}
			config->bcmem = unittoull(flds[1]);
		}else if(nf == 2 && strcmp(flds[0], "mem") == 0){
			if(numok(flds[1]) < 0){
				seterr(EAdmin, "illegal size %s in config file %s",
					flds[1], file);
				break;
			}
			if(config->mem != Unspecified){
				seterr(EAdmin, "duplicate mem lines in config file %s", file);
				break;
			}
			config->mem = unittoull(flds[1]);
		}else if(nf == 2 && strcmp(flds[0], "icmem") == 0){
			if(numok(flds[1]) < 0){
				seterr(EAdmin, "illegal size %s in config file %s",
					flds[1], file);
				break;
			}
			if(config->icmem != 0){
				seterr(EAdmin, "duplicate icmem lines in config file %s", file);
				break;
			}
			config->icmem = unittoull(flds[1]);
		}else if(nf == 1 && strcmp(flds[0], "queuewrites") == 0){
			config->queuewrites = 1;
		}else if(nf == 2 && strcmp(flds[0], "httpaddr") == 0){
			if(config->haddr){
				seterr(EAdmin, "duplicate httpaddr lines in configuration file %s", file);
				break;
			}
			config->haddr = estrdup(flds[1]);
		}else if(nf == 2 && strcmp(flds[0], "webroot") == 0){
			if(config->webroot){
				seterr(EAdmin, "duplicate webroot lines in configuration file %s", file);
				break;
			}
			config->webroot = estrdup(flds[1]);
		}else if(nf == 2 && strcmp(flds[0], "addr") == 0){
			if(config->vaddr){
				seterr(EAdmin, "duplicate addr lines in configuration file %s", file);
				break;
			}
			config->vaddr = estrdup(flds[1]);
		}else{
			seterr(EAdmin, "illegal line '%s' in configuration file %s", line, file);
			break;
		}
		free(line);
		line = nil;
	}
	free(line);
	freeifile(&f);
	if(ok < 0){
		free(config->sects);
		config->sects = nil;
		free(config->aparts);
		config->aparts = nil;
	}
	return ok;
}

/*
 * Open file as a partition and initialize an index section from it.
 * Returns nil on failure, with the file name prefixed to the error.
 * NOTE(review): unlike configbloom, the partition is not released
 * when initisect fails -- confirm whether initisect takes ownership.
 */
static ISect*
configisect(char *file)
{
	Part *part;
	ISect *is;

	if(0) fprint(2, "configure index section in %s\n", file);

	part = initpart(file, ORDWR|ODIRECT);
	if(part == nil)
		return nil;
	is = initisect(part);
	if(is == nil)
		werrstr("%s: %r", file);
	return is;
}

/*
 * Open file as a partition and initialize an arena partition from it.
 * Returns nil on failure, with the file name prefixed to the error.
 * NOTE(review): unlike configbloom, the partition is not released
 * when initarenapart fails -- confirm whether it takes ownership.
 */
static ArenaPart*
configarenas(char *file)
{
	ArenaPart *ap;
	Part *part;

	if(0) fprint(2, "configure arenas in %s\n", file);
	part = initpart(file, ORDWR|ODIRECT);
	if(part == nil)
		return nil;
	ap = initarenapart(part);
	if(ap == nil)
		werrstr("%s: %r", file);
	return ap;
}

/*
 * Open file as a partition and read the bloom filter stored in it.
 * On failure the partition is freed and nil is returned, with the
 * file name prefixed to the error.
 */
static Bloom*
configbloom(char *file)
{
	Bloom *b;
	Part *part;

	if(0) fprint(2, "configure bloom in %s\n", file);
	part = initpart(file, ORDWR|ODIRECT);
	if(part == nil)
		return nil;
	b = readbloom(part);
	if(b == nil){
		werrstr("%s: %r", file);
		freepart(part);
	}
	return b;
}

/* for OS X linker, which only resolves functions, not data */
void
needmainindex(void)
{
}

diff --git a/sys/src/cmd/venti/srv/conv.c b/sys/src/cmd/venti/srv/conv.c new file mode 100755 index 000000000..e6a6cbfe1 --- /dev/null +++ b/sys/src/cmd/venti/srv/conv.c @@ -0,0 +1,730 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +/* + * disk structure conversion routines + */ +#define U8GET(p) ((p)[0]) +#define U16GET(p) (((p)[0]<<8)|(p)[1]) +#define U32GET(p) ((u32int)(((p)[0]<<24)|((p)[1]<<16)|((p)[2]<<8)|(p)[3])) +#define U64GET(p) (((u64int)U32GET(p)<<32)|(u64int)U32GET((p)+4)) + +#define U8PUT(p,v) (p)[0]=(v)&0xFF +#define U16PUT(p,v) (p)[0]=((v)>>8)&0xFF;(p)[1]=(v)&0xFF +#define U32PUT(p,v) (p)[0]=((v)>>24)&0xFF;(p)[1]=((v)>>16)&0xFF;(p)[2]=((v)>>8)&0xFF;(p)[3]=(v)&0xFF +#define U64PUT(p,v,t32) t32=(v)>>32;U32PUT(p,t32);t32=(v);U32PUT((p)+4,t32) + +int debugarena = -1; /* hack to improve error reporting */ + +static struct { + u32int m; + char *s; +} magics[] = { + ArenaPartMagic, "ArenaPartMagic", + ArenaHeadMagic, "ArenaHeadMagic", + ArenaMagic, "ArenaMagic", + ISectMagic, "ISectMagic", + BloomMagic, "BloomMagic", +}; + +static char* +fmtmagic(char *s, u32int m) +{ + int i; + + for(i=0; 
i<nelem(magics); i++) + if(magics[i].m == m) + return magics[i].s; + sprint(s, "%#08ux", m); + return s; +} + +u32int +unpackmagic(u8int *buf) +{ + return U32GET(buf); +} + +void +packmagic(u32int magic, u8int *buf) +{ + U32PUT(buf, magic); +} + +int +unpackarenapart(ArenaPart *ap, u8int *buf) +{ + u8int *p; + u32int m; + char fbuf[20]; + + p = buf; + + m = U32GET(p); + if(m != ArenaPartMagic){ + seterr(ECorrupt, "arena set has wrong magic number: %s expected ArenaPartMagic (%#lux)", fmtmagic(fbuf, m), ArenaPartMagic); + return -1; + } + p += U32Size; + ap->version = U32GET(p); + p += U32Size; + ap->blocksize = U32GET(p); + p += U32Size; + ap->arenabase = U32GET(p); + p += U32Size; + + if(buf + ArenaPartSize != p) + sysfatal("unpackarenapart unpacked wrong amount"); + + return 0; +} + +int +packarenapart(ArenaPart *ap, u8int *buf) +{ + u8int *p; + + p = buf; + + U32PUT(p, ArenaPartMagic); + p += U32Size; + U32PUT(p, ap->version); + p += U32Size; + U32PUT(p, ap->blocksize); + p += U32Size; + U32PUT(p, ap->arenabase); + p += U32Size; + + if(buf + ArenaPartSize != p) + sysfatal("packarenapart packed wrong amount"); + + return 0; +} + +int +unpackarena(Arena *arena, u8int *buf) +{ + int sz; + u8int *p; + u32int m; + char fbuf[20]; + + p = buf; + + m = U32GET(p); + if(m != ArenaMagic){ + seterr(ECorrupt, "arena %d has wrong magic number: %s " + "expected ArenaMagic (%#lux)", debugarena, + fmtmagic(fbuf, m), ArenaMagic); + return -1; + } + p += U32Size; + arena->version = U32GET(p); + p += U32Size; + namecp(arena->name, (char*)p); + p += ANameSize; + arena->diskstats.clumps = U32GET(p); + p += U32Size; + arena->diskstats.cclumps = U32GET(p); + p += U32Size; + arena->ctime = U32GET(p); + p += U32Size; + arena->wtime = U32GET(p); + p += U32Size; + if(arena->version == ArenaVersion5){ + arena->clumpmagic = U32GET(p); + p += U32Size; + } + arena->diskstats.used = U64GET(p); + p += U64Size; + arena->diskstats.uncsize = U64GET(p); + p += U64Size; + arena->diskstats.sealed = 
U8GET(p); + p += U8Size; + switch(arena->version){ + case ArenaVersion4: + sz = ArenaSize4; + arena->clumpmagic = _ClumpMagic; + break; + case ArenaVersion5: + sz = ArenaSize5; + break; + default: + seterr(ECorrupt, "arena has bad version number %d", arena->version); + return -1; + } + /* + * Additional fields for the memstats version of the stats. + * Diskstats reflects what is committed to the index. + * Memstats reflects what is in the arena. Originally intended + * this to be a version 5 extension, but might as well use for + * all the existing version 4 arenas too. + * + * To maintain backwards compatibility with existing venti + * installations using the older format, we define that if + * memstats == diskstats, then the extension fields are not + * included (see packarena below). That is, only partially + * indexed arenas have these fields. Fully indexed arenas + * (in particular, sealed arenas) do not. + */ + if(U8GET(p) == 1){ + sz += ArenaSize5a-ArenaSize5; + p += U8Size; + arena->memstats.clumps = U32GET(p); + p += U32Size; + arena->memstats.cclumps = U32GET(p); + p += U32Size; + arena->memstats.used = U64GET(p); + p += U64Size; + arena->memstats.uncsize = U64GET(p); + p += U64Size; + arena->memstats.sealed = U8GET(p); + p += U8Size; + + /* + * 2008/4/2 + * Packarena (below) used to have a bug in which it would + * not zero out any existing extension fields when writing + * the arena metadata. This would manifest itself as arenas + * with arena->diskstats.sealed == 1 but arena->memstats.sealed == 0 + * after a server restart. Because arena->memstats.sealed wouldn't + * be set, the server might try to fit another block into the arena + * (and succeed), violating the append-only structure of the log + * and invalidating any already-computed seal on the arena. + * + * It might end up that other fields in arena->memstats end up + * behind arena->diskstats too, but that would be considerably + * more rare, and the bug is fixed now. 
The case we need to + * handle is just the sealed mismatch. + * + * If we encounter such a bogus arena, fix the sealed field. + */ + if(arena->diskstats.sealed) + arena->memstats.sealed = 1; + }else + arena->memstats = arena->diskstats; + if(buf + sz != p) + sysfatal("unpackarena unpacked wrong amount"); + + return 0; +} + +int +packarena(Arena *arena, u8int *buf) +{ + return _packarena(arena, buf, 0); +} + +int +_packarena(Arena *arena, u8int *buf, int forceext) +{ + int sz; + u8int *p; + u32int t32; + + switch(arena->version){ + case ArenaVersion4: + sz = ArenaSize4; + if(arena->clumpmagic != _ClumpMagic) + fprint(2, "warning: writing old arena tail loses clump magic 0x%lux != 0x%lux\n", + (ulong)arena->clumpmagic, (ulong)_ClumpMagic); + break; + case ArenaVersion5: + sz = ArenaSize5; + break; + default: + sysfatal("packarena unknown version %d", arena->version); + return -1; + } + + p = buf; + + U32PUT(p, ArenaMagic); + p += U32Size; + U32PUT(p, arena->version); + p += U32Size; + namecp((char*)p, arena->name); + p += ANameSize; + U32PUT(p, arena->diskstats.clumps); + p += U32Size; + U32PUT(p, arena->diskstats.cclumps); + p += U32Size; + U32PUT(p, arena->ctime); + p += U32Size; + U32PUT(p, arena->wtime); + p += U32Size; + if(arena->version == ArenaVersion5){ + U32PUT(p, arena->clumpmagic); + p += U32Size; + } + U64PUT(p, arena->diskstats.used, t32); + p += U64Size; + U64PUT(p, arena->diskstats.uncsize, t32); + p += U64Size; + U8PUT(p, arena->diskstats.sealed); + p += U8Size; + + /* + * Extension fields; see above. 
+ */ + if(forceext + || arena->memstats.clumps != arena->diskstats.clumps + || arena->memstats.cclumps != arena->diskstats.cclumps + || arena->memstats.used != arena->diskstats.used + || arena->memstats.uncsize != arena->diskstats.uncsize + || arena->memstats.sealed != arena->diskstats.sealed){ + sz += ArenaSize5a - ArenaSize5; + U8PUT(p, 1); + p += U8Size; + U32PUT(p, arena->memstats.clumps); + p += U32Size; + U32PUT(p, arena->memstats.cclumps); + p += U32Size; + U64PUT(p, arena->memstats.used, t32); + p += U64Size; + U64PUT(p, arena->memstats.uncsize, t32); + p += U64Size; + U8PUT(p, arena->memstats.sealed); + p += U8Size; + }else{ + /* Clear any extension fields already on disk. */ + memset(p, 0, ArenaSize5a - ArenaSize5); + p += ArenaSize5a - ArenaSize5; + sz += ArenaSize5a - ArenaSize5; + } + + if(buf + sz != p) + sysfatal("packarena packed wrong amount"); + + return 0; +} + +int +unpackarenahead(ArenaHead *head, u8int *buf) +{ + u8int *p; + u32int m; + int sz; + char fbuf[20]; + + p = buf; + + m = U32GET(p); + if(m != ArenaHeadMagic){ + seterr(ECorrupt, "arena %d head has wrong magic number: %s " + "expected ArenaHeadMagic (%#lux)", debugarena, + fmtmagic(fbuf, m), ArenaHeadMagic); + return -1; + } + + p += U32Size; + head->version = U32GET(p); + p += U32Size; + namecp(head->name, (char*)p); + p += ANameSize; + head->blocksize = U32GET(p); + p += U32Size; + head->size = U64GET(p); + p += U64Size; + if(head->version == ArenaVersion5){ + head->clumpmagic = U32GET(p); + p += U32Size; + } + + switch(head->version){ + case ArenaVersion4: + sz = ArenaHeadSize4; + head->clumpmagic = _ClumpMagic; + break; + case ArenaVersion5: + sz = ArenaHeadSize5; + break; + default: + seterr(ECorrupt, "arena head has unexpected version %d", head->version); + return -1; + } + + if(buf + sz != p) + sysfatal("unpackarenahead unpacked wrong amount"); + + return 0; +} + +int +packarenahead(ArenaHead *head, u8int *buf) +{ + u8int *p; + int sz; + u32int t32; + + switch(head->version){ + 
case ArenaVersion4: + sz = ArenaHeadSize4; + if(head->clumpmagic != _ClumpMagic) + fprint(2, "warning: writing old arena header loses clump magic 0x%lux != 0x%lux\n", + (ulong)head->clumpmagic, (ulong)_ClumpMagic); + break; + case ArenaVersion5: + sz = ArenaHeadSize5; + break; + default: + sysfatal("packarenahead unknown version %d", head->version); + return -1; + } + + p = buf; + + U32PUT(p, ArenaHeadMagic); + p += U32Size; + U32PUT(p, head->version); + p += U32Size; + namecp((char*)p, head->name); + p += ANameSize; + U32PUT(p, head->blocksize); + p += U32Size; + U64PUT(p, head->size, t32); + p += U64Size; + if(head->version == ArenaVersion5){ + U32PUT(p, head->clumpmagic); + p += U32Size; + } + if(buf + sz != p) + sysfatal("packarenahead packed wrong amount"); + + return 0; +} + +static int +checkclump(Clump *w) +{ + if(w->encoding == ClumpENone){ + if(w->info.size != w->info.uncsize){ + seterr(ECorrupt, "uncompressed wad size mismatch"); + return -1; + } + }else if(w->encoding == ClumpECompress){ + if(w->info.size >= w->info.uncsize){ + seterr(ECorrupt, "compressed lump has inconsistent block sizes %d %d", w->info.size, w->info.uncsize); + return -1; + } + }else{ + seterr(ECorrupt, "clump has illegal encoding"); + return -1; + } + + return 0; +} + +int +unpackclump(Clump *c, u8int *buf, u32int cmagic) +{ + u8int *p; + u32int magic; + + p = buf; + magic = U32GET(p); + if(magic != cmagic){ + seterr(ECorrupt, "clump has bad magic number=%#8.8ux != %#8.8ux", magic, cmagic); + return -1; + } + p += U32Size; + + c->info.type = vtfromdisktype(U8GET(p)); + p += U8Size; + c->info.size = U16GET(p); + p += U16Size; + c->info.uncsize = U16GET(p); + p += U16Size; + scorecp(c->info.score, p); + p += VtScoreSize; + + c->encoding = U8GET(p); + p += U8Size; + c->creator = U32GET(p); + p += U32Size; + c->time = U32GET(p); + p += U32Size; + + if(buf + ClumpSize != p) + sysfatal("unpackclump unpacked wrong amount"); + + return checkclump(c); +} + +int +packclump(Clump *c, u8int 
*buf, u32int magic) +{ + u8int *p; + + p = buf; + U32PUT(p, magic); + p += U32Size; + + U8PUT(p, vttodisktype(c->info.type)); + p += U8Size; + U16PUT(p, c->info.size); + p += U16Size; + U16PUT(p, c->info.uncsize); + p += U16Size; + scorecp(p, c->info.score); + p += VtScoreSize; + + U8PUT(p, c->encoding); + p += U8Size; + U32PUT(p, c->creator); + p += U32Size; + U32PUT(p, c->time); + p += U32Size; + + if(buf + ClumpSize != p) + sysfatal("packclump packed wrong amount"); + + return checkclump(c); +} + +void +unpackclumpinfo(ClumpInfo *ci, u8int *buf) +{ + u8int *p; + + p = buf; + ci->type = vtfromdisktype(U8GET(p)); + p += U8Size; + ci->size = U16GET(p); + p += U16Size; + ci->uncsize = U16GET(p); + p += U16Size; + scorecp(ci->score, p); + p += VtScoreSize; + + if(buf + ClumpInfoSize != p) + sysfatal("unpackclumpinfo unpacked wrong amount"); +} + +void +packclumpinfo(ClumpInfo *ci, u8int *buf) +{ + u8int *p; + + p = buf; + U8PUT(p, vttodisktype(ci->type)); + p += U8Size; + U16PUT(p, ci->size); + p += U16Size; + U16PUT(p, ci->uncsize); + p += U16Size; + scorecp(p, ci->score); + p += VtScoreSize; + + if(buf + ClumpInfoSize != p) + sysfatal("packclumpinfo packed wrong amount"); +} + +int +unpackisect(ISect *is, u8int *buf) +{ + u8int *p; + u32int m; + char fbuf[20]; + + p = buf; + + + m = U32GET(p); + if(m != ISectMagic){ + seterr(ECorrupt, "index section has wrong magic number: %s expected ISectMagic (%#lux)", + fmtmagic(fbuf, m), ISectMagic); + return -1; + } + p += U32Size; + is->version = U32GET(p); + p += U32Size; + namecp(is->name, (char*)p); + p += ANameSize; + namecp(is->index, (char*)p); + p += ANameSize; + is->blocksize = U32GET(p); + p += U32Size; + is->blockbase = U32GET(p); + p += U32Size; + is->blocks = U32GET(p); + p += U32Size; + is->start = U32GET(p); + p += U32Size; + is->stop = U32GET(p); + p += U32Size; + if(buf + ISectSize1 != p) + sysfatal("unpackisect unpacked wrong amount"); + is->bucketmagic = 0; + if(is->version == ISectVersion2){ + 
is->bucketmagic = U32GET(p); + p += U32Size; + if(buf + ISectSize2 != p) + sysfatal("unpackisect unpacked wrong amount"); + } + + return 0; +} + +int +packisect(ISect *is, u8int *buf) +{ + u8int *p; + + p = buf; + + U32PUT(p, ISectMagic); + p += U32Size; + U32PUT(p, is->version); + p += U32Size; + namecp((char*)p, is->name); + p += ANameSize; + namecp((char*)p, is->index); + p += ANameSize; + U32PUT(p, is->blocksize); + p += U32Size; + U32PUT(p, is->blockbase); + p += U32Size; + U32PUT(p, is->blocks); + p += U32Size; + U32PUT(p, is->start); + p += U32Size; + U32PUT(p, is->stop); + p += U32Size; + if(buf + ISectSize1 != p) + sysfatal("packisect packed wrong amount"); + if(is->version == ISectVersion2){ + U32PUT(p, is->bucketmagic); + p += U32Size; + if(buf + ISectSize2 != p) + sysfatal("packisect packed wrong amount"); + } + + return 0; +} + +void +unpackientry(IEntry *ie, u8int *buf) +{ + u8int *p; + + p = buf; + + scorecp(ie->score, p); + p += VtScoreSize; + /* ie->wtime = U32GET(p); */ + p += U32Size; + /* ie->train = U16GET(p); */ + p += U16Size; + if(p - buf != IEntryAddrOff) + sysfatal("unpackentry bad IEntryAddrOff amount"); + ie->ia.addr = U64GET(p); +if(ie->ia.addr>>56) print("%.8H => %llux\n", p, ie->ia.addr); + p += U64Size; + ie->ia.size = U16GET(p); + p += U16Size; + if(p - buf != IEntryTypeOff) + sysfatal("unpackientry bad IEntryTypeOff amount"); + ie->ia.type = vtfromdisktype(U8GET(p)); + p += U8Size; + ie->ia.blocks = U8GET(p); + p += U8Size; + + if(p - buf != IEntrySize) + sysfatal("unpackientry unpacked wrong amount"); +} + +void +packientry(IEntry *ie, u8int *buf) +{ + u32int t32; + u8int *p; + + p = buf; + + scorecp(p, ie->score); + p += VtScoreSize; + U32PUT(p, 0); /* wtime */ + p += U32Size; + U16PUT(p, 0); /* train */ + p += U16Size; + U64PUT(p, ie->ia.addr, t32); + p += U64Size; + U16PUT(p, ie->ia.size); + p += U16Size; + U8PUT(p, vttodisktype(ie->ia.type)); + p += U8Size; + U8PUT(p, ie->ia.blocks); + p += U8Size; + + if(p - buf != 
IEntrySize) + sysfatal("packientry packed wrong amount"); +} + +void +unpackibucket(IBucket *b, u8int *buf, u32int magic) +{ + b->n = U16GET(buf); + b->data = buf + IBucketSize; + if(magic && magic != U32GET(buf+U16Size)) + b->n = 0; +} + +void +packibucket(IBucket *b, u8int *buf, u32int magic) +{ + U16PUT(buf, b->n); + U32PUT(buf+U16Size, magic); +} + +void +packbloomhead(Bloom *b, u8int *buf) +{ + u8int *p; + + p = buf; + U32PUT(p, BloomMagic); + U32PUT(p+4, BloomVersion); + U32PUT(p+8, b->nhash); + U32PUT(p+12, b->size); +} + +int +unpackbloomhead(Bloom *b, u8int *buf) +{ + u8int *p; + u32int m; + char fbuf[20]; + + p = buf; + + m = U32GET(p); + if(m != BloomMagic){ + seterr(ECorrupt, "bloom filter has wrong magic number: %s expected BloomMagic (%#lux)", fmtmagic(fbuf, m), (ulong)BloomMagic); + return -1; + } + p += U32Size; + + m = U32GET(p); + if(m != BloomVersion){ + seterr(ECorrupt, "bloom filter has wrong version %ud expected %ud", (uint)m, (uint)BloomVersion); + return -1; + } + p += U32Size; + + b->nhash = U32GET(p); + p += U32Size; + + b->size = U32GET(p); + p += U32Size; + if(b->size < BloomHeadSize || b->size > MaxBloomSize || (b->size&(b->size-1))){ + seterr(ECorrupt, "bloom filter has invalid size %#lux", b->size); + return -1; + } + + if(buf + BloomHeadSize != p) + sysfatal("unpackarena unpacked wrong amount"); + + return 0; +} diff --git a/sys/src/cmd/venti/srv/dat.h b/sys/src/cmd/venti/srv/dat.h new file mode 100755 index 000000000..24cc79e8b --- /dev/null +++ b/sys/src/cmd/venti/srv/dat.h @@ -0,0 +1,758 @@ +typedef struct Config Config; +typedef struct AMap AMap; +typedef struct AMapN AMapN; +typedef struct Arena Arena; +typedef struct AState AState; +typedef struct ArenaCIG ArenaCIG; +typedef struct ArenaHead ArenaHead; +typedef struct ArenaPart ArenaPart; +typedef struct ArenaTail ArenaTail; +typedef struct ATailStats ATailStats; +typedef struct CIBlock CIBlock; +typedef struct Clump Clump; +typedef struct ClumpInfo ClumpInfo; +typedef struct 
Graph Graph; +typedef struct IAddr IAddr; +typedef struct IBucket IBucket; +typedef struct IEStream IEStream; +typedef struct IEntry IEntry; +typedef struct IFile IFile; +typedef struct ISect ISect; +typedef struct Index Index; +typedef struct Lump Lump; +typedef struct DBlock DBlock; +typedef struct Part Part; +typedef struct Statbin Statbin; +typedef struct Statdesc Statdesc; +typedef struct Stats Stats; +typedef struct ZBlock ZBlock; +typedef struct Round Round; +typedef struct Bloom Bloom; + +#pragma incomplete IEStream + +#define TWID32 ((u32int)~(u32int)0) +#define TWID64 ((u64int)~(u64int)0) +#define TWID8 ((u8int)~(u8int)0) + +enum +{ + ABlockLog = 9, /* log2(512), the quantum for reading arenas */ + ANameSize = 64, + MaxDiskBlock = 64*1024, /* max. allowed size for a disk block */ + MaxIoSize = 64*1024, /* max. allowed size for a disk io operation */ + PartBlank = 256*1024, /* untouched section at beginning of partition */ + HeadSize = 512, /* size of a header after PartBlank */ + MinArenaSize = 1*1024*1024, /* smallest reasonable arena size */ + IndexBase = 1024*1024, /* initial address to use in an index */ + MaxIo = 64*1024, /* max size of a single read or write operation */ + ICacheBits = 16, /* default bits for indexing icache */ + MaxAMap = 31*1024, /* max. 
allowed arenas in an address mapping; must be < 32*1024 */ + Unspecified = ~0ul, + + /* + * return codes from syncarena + */ + SyncDataErr = 1 << 0, /* problem reading the clump data */ + SyncCIErr = 1 << 1, /* found erroneous clump directory entries */ + SyncCIZero = 1 << 2, /* found unwritten clump directory entries */ + SyncFixErr = 1 << 3, /* error writing fixed data */ + SyncHeader = 1 << 4, /* altered header fields */ + + /* + * error severity + */ + EOk = 0, /* error expected in normal operation */ + EStrange, /* strange error that should be logged */ + ECorrupt, /* corrupted data found in arenas */ + EICorrupt, /* corrupted data found in index */ + EAdmin, /* should be brought to administrators' attention */ + ECrash, /* really bad internal error */ + EBug, /* a limitation which should be fixed */ + EInconsist, /* inconsistencies between index and arena */ + EMax, + + /* + * internal disk formats for the venti archival storage system + */ + /* + * magic numbers on disk + */ + _ClumpMagic = 0xd15cb10cU, /* clump header, deprecated */ + ClumpFreeMagic = 0, /* free clump; terminates active clump log */ + + ArenaPartMagic = 0xa9e4a5e7U, /* arena partition header */ + ArenaMagic = 0xf2a14eadU, /* arena trailer */ + ArenaHeadMagic = 0xd15c4eadU, /* arena header */ + + BloomMagic = 0xb1004eadU, /* bloom filter header */ + BloomMaxHash = 32, + + ISectMagic = 0xd15c5ec7U, /* index header */ + + ArenaPartVersion = 3, + ArenaVersion4 = 4, + ArenaVersion5 = 5, + BloomVersion = 1, + IndexVersion = 1, + ISectVersion1 = 1, + ISectVersion2 = 2, + + /* + * encodings of clumps on disk + */ + ClumpEErr = 0, /* can't happen */ + ClumpENone, /* plain */ + ClumpECompress, /* compressed */ + ClumpEMax, + + /* + * sizes in bytes on disk + */ + U8Size = 1, + U16Size = 2, + U32Size = 4, + U64Size = 8, + + ArenaPartSize = 4 * U32Size, + ArenaSize4 = 2 * U64Size + 6 * U32Size + ANameSize + U8Size, + ArenaSize5 = ArenaSize4 + U32Size, + ArenaSize5a = ArenaSize5 + 2 * U8Size + 2 * 
U32Size + 2 * U64Size, + ArenaHeadSize4 = U64Size + 3 * U32Size + ANameSize, + ArenaHeadSize5 = ArenaHeadSize4 + U32Size, + BloomHeadSize = 4 * U32Size, + ISectSize1 = 7 * U32Size + 2 * ANameSize, + ISectSize2 = ISectSize1 + U32Size, + ClumpInfoSize = U8Size + 2 * U16Size + VtScoreSize, + ClumpSize = ClumpInfoSize + U8Size + 3 * U32Size, + MaxBloomSize = 1<<(32-3), /* 2^32 bits */ + MaxBloomHash = 32, /* bits per score */ + /* + * BUG - The various block copies that manipulate entry buckets + * would be faster if we bumped IBucketSize up to 8 and IEntrySize up to 40, + * so that everything is word-aligned. Buildindex is actually cpu-bound + * by the (byte at a time) copying in qsort. + */ + IBucketSize = U32Size + U16Size, + IEntrySize = U64Size + U32Size + 2*U16Size + 2*U8Size + VtScoreSize, + IEntryTypeOff = VtScoreSize + U32Size + U16Size + U64Size + U16Size, + IEntryAddrOff = VtScoreSize + U32Size + U16Size, + + MaxClumpBlocks = (VtMaxLumpSize + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog, + + IcacheFrac = 1000000, /* denominator */ + + SleepForever = 1000000000, /* magic value for sleep time */ + /* + * dirty flags - order controls disk write order + */ + DirtyArena = 1, + DirtyArenaCib, + DirtyArenaTrailer, + DirtyMax, + + ArenaCIGSize = 10*1024, // about 0.5 MB worth of IEntry. 
+ + VentiZZZZZZZZ +}; + +extern char TraceDisk[]; +extern char TraceLump[]; +extern char TraceBlock[]; +extern char TraceProc[]; +extern char TraceWork[]; +extern char TraceQuiet[]; +extern char TraceRpc[]; + +/* + * results of parsing and initializing a config file + */ +struct Config +{ + char *index; /* name of the index to initialize */ + int naparts; /* arena partitions initialized */ + ArenaPart **aparts; + int nsects; /* index sections initialized */ + ISect **sects; + Bloom *bloom; /* bloom filter */ + u32int bcmem; + u32int mem; + u32int icmem; + int queuewrites; + char* haddr; + char* vaddr; + char* webroot; +}; + +/* + * a Part is the low level interface to files or disks. + * there are two main types of partitions + * arena partitions, which hold some number of arenas, each in a sub-partition. + * index partitions, which have only one subpartition. + */ +struct Part +{ + int fd; /* rock for accessing the disk */ + int mode; + u64int offset; + u64int size; /* size of the partition */ + u32int blocksize; /* block size for reads and writes */ + u32int fsblocksize; /* minimum file system block size */ + char *name; + char *filename; + Channel *writechan; /* chan[dcache.nblocks](DBlock*) */ +}; + +/* + * a cached block from the partition + * yuck -- most of this is internal structure for the cache + * all other routines should only use data + */ +struct DBlock +{ + u8int *data; + + Part *part; /* partition in which cached */ + u64int addr; /* base address on the partition */ + u32int size; /* amount of data available, not amount allocated; should go away */ + u32int mode; + u32int dirty; + u32int dirtying; + DBlock *next; /* doubly linked hash chains */ + DBlock *prev; + u32int heap; /* index in heap table */ + u32int used; /* last reference times */ + u32int used2; + u32int ref; /* reference count */ + RWLock lock; /* for access to data only */ + Channel *writedonechan; + void* chanbuf[1]; /* buffer for the chan! 
*/ +}; + +/* + * a cached block from the partition + * yuck -- most of this is internal structure for the cache + * all other routines should only use data + * double yuck -- this is mostly the same as a DBlock + */ +struct Lump +{ + Packet *data; + + Part *part; /* partition in which cached */ + u8int score[VtScoreSize]; /* score of packet */ + u8int type; /* type of packet */ + u32int size; /* amount of data allocated to hold packet */ + Lump *next; /* doubly linked hash chains */ + Lump *prev; + u32int heap; /* index in heap table */ + u32int used; /* last reference times */ + u32int used2; + u32int ref; /* reference count */ + QLock lock; /* for access to data only */ +}; + +/* + * mapping between names and address ranges + */ +struct AMap +{ + u64int start; + u64int stop; + char name[ANameSize]; +}; + +/* + * an AMap along with a length + */ +struct AMapN +{ + int n; + AMap *map; +}; + +/* + * an ArenaPart is a partition made up of Arenas + * it exists because most os's don't support many partitions, + * and we want to have many different Arenas + */ +struct ArenaPart +{ + Part *part; + u64int size; /* size of underlying partition, rounded down to blocks */ + Arena **arenas; + u32int tabbase; /* base address of arena table on disk */ + u32int tabsize; /* max. bytes in arena table */ + + /* + * fields stored on disk + */ + u32int version; + u32int blocksize; /* "optimal" block size for reads and writes */ + u32int arenabase; /* base address of first arena */ + + /* + * stored in the arena mapping table on disk + */ + AMap *map; + int narenas; +}; + +/* + * info about one block in the clump info cache + */ +struct CIBlock +{ + u32int block; /* blocks in the directory */ + int offset; /* offsets of one clump in the data */ + DBlock *data; +}; + +/* + * Statistics kept in the tail. 
+ */ +struct ATailStats +{ + u32int clumps; /* number of clumps */ + u32int cclumps; /* number of compressed clumps */ + u64int used; + u64int uncsize; + u8int sealed; +}; + +/* + * Arena state - represents a point in the data log + */ +struct AState +{ + Arena *arena; + u64int aa; /* index address */ + ATailStats stats; +}; + +/* + * an Arena is a log of Clumps, preceded by an ArenaHead, + * and followed by an Arena trailer, each in one disk block. + * struct on disk is not always up to date, but should be self-consistent. + * to sync after reboot, follow clumps starting at used until ClumpFreeMagic is found. + * <struct name="Arena" type="Arena *"> + * <field name="name" val="s->name" type="AName"/> + * <field name="version" val="s->version" type="U32int"/> + * <field name="partition" val="s->part->name" type="AName"/> + * <field name="blocksize" val="s->blocksize" type="U32int"/> + * <field name="start" val="s->base" type="U64int"/> + * <field name="stop" val="s->base+2*s->blocksize" type="U64int"/> + * <field name="created" val="s->ctime" type="U32int"/> + * <field name="modified" val="s->wtime" type="U32int"/> + * <field name="sealed" val="s->sealed" type="Sealed"/> + * <field name="score" val="s->score" type="Score"/> + * <field name="clumps" val="s->clumps" type="U32int"/> + * <field name="compressedclumps" val="s->cclumps" type="U32int"/> + * <field name="data" val="s->uncsize" type="U64int"/> + * <field name="compresseddata" val="s->used - s->clumps * ClumpSize" type="U64int"/> + * <field name="storage" val="s->used + s->clumps * ClumpInfoSize" type="U64int"/> + * </struct> + */ +struct Arena +{ + QLock lock; /* lock for arena fields, writing to disk */ + Part *part; /* partition in which arena lives */ + int blocksize; /* size of block to read or write */ + u64int base; /* base address on disk */ + u64int size; /* total space in the arena */ + u8int score[VtScoreSize]; /* score of the entire sealed & summed arena */ + + int clumpmax; /* ClumpInfos per block */ 
+ AState mem; + int inqueue; + + /* + * fields stored on disk + */ + u32int version; + char name[ANameSize]; /* text label */ + ATailStats memstats; + ATailStats diskstats; + u32int ctime; /* first time a block was written */ + u32int wtime; /* last time a block was written */ + u32int clumpmagic; + + ArenaCIG *cig; + int ncig; +}; + +struct ArenaCIG +{ + u64int offset; // from arena base +}; + +/* + * redundant storage of some fields at the beginning of each arena + */ +struct ArenaHead +{ + u32int version; + char name[ANameSize]; + u32int blocksize; + u64int size; + u32int clumpmagic; +}; + +/* + * most interesting meta information for a clump. + * stored in each clump's header and in the Arena's directory, + * stored in reverse order just prior to the arena trailer + */ +struct ClumpInfo +{ + u8int type; + u16int size; /* size of disk data, not including header */ + u16int uncsize; /* size of uncompressed data */ + u8int score[VtScoreSize]; /* score of the uncompressed data only */ +}; + +/* + * header for an immutable clump of data + */ +struct Clump +{ + ClumpInfo info; + u8int encoding; + u32int creator; /* initial client which wrote the block */ + u32int time; /* creation at gmt seconds since 1/1/1970 */ +}; + +/* + * index of all clumps according to their score + * this is just a wrapper to tie together the index sections + * <struct name="Index" type="Index *"> + * <field name="name" val="s->name" type="AName"/> + * <field name="version" val="s->version" type="U32int"/> + * <field name="blocksize" val="s->blocksize" type="U32int"/> + * <field name="tabsize" val="s->tabsize" type="U32int"/> + * <field name="buckets" val="s->buckets" type="U32int"/> + * <field name="buckdiv" val="s->div" type="U32int"/> + * <field name="bitblocks" val="s->div" type="U32int"/> + * <field name="maxdepth" val="s->div" type="U32int"/> + * <field name="bitkeylog" val="s->div" type="U32int"/> + * <field name="bitkeymask" val="s->div" type="U32int"/> + * <array name="sect" 
val="&s->smap[i]" elems="s->nsects" type="Amap"/> + * <array name="amap" val="&s->amap[i]" elems="s->narenas" type="Amap"/> + * <array name="arena" val="s->arenas[i]" elems="s->narenas" type="Arena"/> + * </struct> + * <struct name="Amap" type="AMap *"> + * <field name="name" val="s->name" type="AName"/> + * <field name="start" val="s->start" type="U64int"/> + * <field name="stop" val="s->stop" type="U64int"/> + * </struct> + */ +struct Index +{ + u32int div; /* divisor for mapping score to bucket */ + u32int buckets; /* last bucket used in disk hash table */ + u32int blocksize; + u32int tabsize; /* max. bytes in index config */ + + int mapalloc; /* first arena to check when adding a lump */ + Arena **arenas; /* arenas in the mapping */ + ISect **sects; /* sections which hold the buckets */ + Bloom *bloom; /* bloom filter */ + + /* + * fields stored in config file + */ + u32int version; + char name[ANameSize]; /* text label */ + int nsects; + AMap *smap; /* mapping of buckets to index sections */ + int narenas; + AMap *amap; /* mapping from index addesses to arenas */ + + QLock writing; +}; + +/* + * one part of the bucket storage for an index. + * the index blocks are sequentially allocated + * across all of the sections. + */ +struct ISect +{ + Part *part; + int blocklog; /* log2(blocksize) */ + int buckmax; /* max. entries in a index bucket */ + u32int tabbase; /* base address of index config table on disk */ + u32int tabsize; /* max. 
bytes in index config */ + Channel *writechan; + Channel *writedonechan; + void *ig; /* used by buildindex only */ + int ng; + + /* + * fields stored on disk + */ + u32int version; + u32int bucketmagic; + char name[ANameSize]; /* text label */ + char index[ANameSize]; /* index owning the section */ + u32int blocksize; /* size of hash buckets in index */ + u32int blockbase; /* address of start of on disk index table */ + u32int blocks; /* total blocks on disk; some may be unused */ + u32int start; /* first bucket in this section */ + u32int stop; /* limit of buckets in this section */ +}; + +/* + * externally interesting part of an IEntry + */ +struct IAddr +{ + u64int addr; + u16int size; /* uncompressed size */ + u8int type; /* type of block */ + u8int blocks; /* arena io quanta for Clump + data */ +}; + +/* + * entries in the index + * kept in IBuckets in the disk index table, + * cached in the memory ICache. + */ +struct IEntry +{ + /* on disk data - 32 bytes*/ + u8int score[VtScoreSize]; + IAddr ia; + + IEntry *nexthash; + IEntry *nextdirty; + IEntry *next; + IEntry *prev; + u8int state; +}; +enum { + IEClean = 0, + IEDirty = 1, + IESummary = 2, +}; + +/* + * buckets in the on disk index table + */ +struct IBucket +{ + u16int n; /* number of active indices */ + u32int buck; /* used by buildindex/checkindex only */ + u8int *data; +}; + +/* + * temporary buffers used by individual threads + */ +struct ZBlock +{ + u32int len; + u32int _size; + u8int *data; + u8int *free; +}; + +/* + * simple input buffer for a '\0' terminated text file + */ +struct IFile +{ + char *name; /* name of the file */ + ZBlock *b; /* entire contents of file */ + u32int pos; /* current position in the file */ +}; + +struct Statdesc +{ + char *name; + ulong max; +}; + +/* keep in sync with stats.c:/statdesc and httpd.c:/graphname*/ +enum +{ + StatRpcTotal, + StatRpcRead, + StatRpcReadOk, + StatRpcReadFail, + StatRpcReadBytes, + StatRpcReadTime, + StatRpcReadCached, + StatRpcReadCachedTime, 
+ StatRpcReadUncached, + StatRpcReadUncachedTime, + StatRpcWrite, + StatRpcWriteNew, + StatRpcWriteOld, + StatRpcWriteFail, + StatRpcWriteBytes, + StatRpcWriteTime, + StatRpcWriteNewTime, + StatRpcWriteOldTime, + + StatLcacheHit, + StatLcacheMiss, + StatLcacheRead, + StatLcacheWrite, + StatLcacheSize, + StatLcacheStall, + StatLcacheReadTime, + + StatDcacheHit, + StatDcacheMiss, + StatDcacheLookup, + StatDcacheRead, + StatDcacheWrite, + StatDcacheDirty, + StatDcacheSize, + StatDcacheFlush, + StatDcacheStall, + StatDcacheLookupTime, + + StatDblockStall, + StatLumpStall, + + StatIcacheHit, + StatIcacheMiss, + StatIcacheRead, + StatIcacheWrite, + StatIcacheFill, + StatIcachePrefetch, + StatIcacheDirty, + StatIcacheSize, + StatIcacheFlush, + StatIcacheStall, + StatIcacheReadTime, + StatIcacheLookup, + StatScacheHit, + StatScachePrefetch, + + StatBloomHit, + StatBloomMiss, + StatBloomFalseMiss, + StatBloomLookup, + StatBloomOnes, + StatBloomBits, + + StatApartRead, + StatApartReadBytes, + StatApartWrite, + StatApartWriteBytes, + + StatIsectRead, + StatIsectReadBytes, + StatIsectWrite, + StatIsectWriteBytes, + + StatSumRead, + StatSumReadBytes, + + StatCigLoad, + StatCigLoadTime, + + NStat +}; + +extern Statdesc statdesc[NStat]; + +/* + * statistics about the operation of the server + * mainly for performance monitoring and profiling. 
+ */ +struct Stats +{ + ulong now; + ulong n[NStat]; +}; + +struct Statbin +{ + uint nsamp; + uint min; + uint max; + uint avg; +}; + +struct Graph +{ + long (*fn)(Stats*, Stats*, void*); + void *arg; + long t0; + long t1; + long min; + long max; + long wid; + long ht; + int fill; +}; + +/* + * for kicking background processes that run one round after another after another + */ +struct Round +{ + QLock lock; + Rendez start; + Rendez finish; + Rendez delaywait; + int delaytime; + int delaykick; + char* name; + int last; + int current; + int next; + int doanother; +}; + +/* + * Bloom filter of stored block hashes + */ +struct Bloom +{ + RWLock lk; /* protects nhash, nbits, tab, mb */ + QLock mod; /* one marker at a time, protects nb */ + int nhash; + ulong size; /* bytes in tab */ + ulong bitmask; /* to produce bit index */ + u8int *data; + Part *part; + Channel *writechan; + Channel *writedonechan; +}; + +extern Index *mainindex; +extern u32int maxblocksize; /* max. block size used by any partition */ +extern int paranoid; /* should verify hashes on disk read */ +extern int queuewrites; /* put all lump writes on a queue and finish later */ +extern int readonly; /* only allowed to read the disk data */ +extern Stats stats; +extern u8int zeroscore[VtScoreSize]; +extern int compressblocks; +extern int writestodevnull; /* dangerous - for performance debugging */ +extern int collectstats; +extern QLock memdrawlock; +extern int icachesleeptime; +extern int minicachesleeptime; +extern int arenasumsleeptime; +extern int manualscheduling; +extern int l0quantum; +extern int l1quantum; +extern int ignorebloom; +extern int icacheprefetch; +extern int syncwrites; +extern int debugarena; /* print in arena error msgs; -1==unknown */ + +extern Stats *stathist; +extern int nstathist; +extern ulong stattime; + +#ifndef PLAN9PORT +#pragma varargck type "V" uchar* +#define ODIRECT 0 +#endif + diff --git a/sys/src/cmd/venti/srv/dcache.c b/sys/src/cmd/venti/srv/dcache.c new file mode 
100755 index 000000000..a50ef0c5c --- /dev/null +++ b/sys/src/cmd/venti/srv/dcache.c @@ -0,0 +1,712 @@ +/* + * Disk cache. + * + * Caches raw disk blocks. Getdblock() gets a block, putdblock puts it back. + * Getdblock has a mode parameter that determines i/o and access to a block: + * if mode is OREAD or ORDWR, it is read from disk if not already in memory. + * If mode is ORDWR or OWRITE, it is locked for exclusive use before being returned. + * It is *not* marked dirty -- once changes have been made, they should be noted + * by using dirtydblock() before putdblock(). + * + * There is a global cache lock as well as a lock on each block. + * Within a thread, the cache lock can be acquired while holding a block lock, + * but not vice versa; and a block cannot be locked if you already hold the lock + * on another block. + * + * The flush proc writes out dirty blocks in batches, one batch per dirty tag. + * For example, the DirtyArena blocks are all written to disk before any of the + * DirtyArenaCib blocks. + * + * This code used to be in charge of flushing the dirty index blocks out to + * disk, but updating the index turned out to benefit from extra care. + * Now cached index blocks are never marked dirty. The index.c code takes + * care of updating them behind our back, and uses _getdblock to update any + * cached copies of the blocks as it changes them on disk. + */ + +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +typedef struct DCache DCache; + +enum +{ + HashLog = 9, + HashSize = 1<<HashLog, + HashMask = HashSize - 1, +}; + +struct DCache +{ + QLock lock; + RWLock dirtylock; /* must be held to inspect or set b->dirty */ + Rendez full; + Round round; + DBlock *free; /* list of available lumps */ + u32int now; /* ticks for usage timestamps */ + int size; /* max. 
size of any block; allocated to each block */ + DBlock **heads; /* hash table for finding address */ + int nheap; /* number of available victims */ + DBlock **heap; /* heap for locating victims */ + int nblocks; /* number of blocks allocated */ + DBlock *blocks; /* array of block descriptors */ + DBlock **write; /* array of block pointers to be written */ + u8int *mem; /* memory for all block descriptors */ + int ndirty; /* number of dirty blocks */ + int maxdirty; /* max. number of dirty blocks */ +}; + +typedef struct Ra Ra; +struct Ra +{ + Part *part; + u64int addr; +}; + +static DCache dcache; + +static int downheap(int i, DBlock *b); +static int upheap(int i, DBlock *b); +static DBlock *bumpdblock(void); +static void delheap(DBlock *db); +static void fixheap(int i, DBlock *b); +static void flushproc(void*); +static void writeproc(void*); + +void +initdcache(u32int mem) +{ + DBlock *b, *last; + u32int nblocks, blocksize; + int i; + u8int *p; + + if(mem < maxblocksize * 2) + sysfatal("need at least %d bytes for the disk cache", maxblocksize * 2); + if(maxblocksize == 0) + sysfatal("no max. 
block size given for disk cache"); + blocksize = maxblocksize; + nblocks = mem / blocksize; + dcache.full.l = &dcache.lock; + dcache.nblocks = nblocks; + dcache.maxdirty = (nblocks * 2) / 3; + trace(TraceProc, "initialize disk cache with %d blocks of %d bytes, maximum %d dirty blocks\n", + nblocks, blocksize, dcache.maxdirty); + dcache.size = blocksize; + dcache.heads = MKNZ(DBlock*, HashSize); + dcache.heap = MKNZ(DBlock*, nblocks); + dcache.blocks = MKNZ(DBlock, nblocks); + dcache.write = MKNZ(DBlock*, nblocks); + dcache.mem = MKNZ(u8int, (nblocks+1+128) * blocksize); + + last = nil; + p = (u8int*)(((uintptr)dcache.mem+blocksize-1)&~(uintptr)(blocksize-1)); + for(i = 0; i < nblocks; i++){ + b = &dcache.blocks[i]; + b->data = &p[i * blocksize]; + b->heap = TWID32; + b->writedonechan = chancreate(sizeof(void*), 1); + b->next = last; + last = b; + } + + dcache.free = last; + dcache.nheap = 0; + setstat(StatDcacheSize, nblocks); + initround(&dcache.round, "dcache", 120*1000); + + vtproc(flushproc, nil); + vtproc(delaykickroundproc, &dcache.round); +} + +static u32int +pbhash(u64int addr) +{ + u32int h; + +#define hashit(c) ((((c) * 0x6b43a9b5) >> (32 - HashLog)) & HashMask) + h = (addr >> 32) ^ addr; + return hashit(h); +} + +DBlock* +getdblock(Part *part, u64int addr, int mode) +{ + DBlock *b; + + b = _getdblock(part, addr, mode, 1); + if(mode == OREAD || mode == ORDWR) + addstat(StatDcacheRead, 1); + if(mode == OWRITE || mode == ORDWR) + addstat(StatDcacheWrite, 1); + return b; +} + +DBlock* +_getdblock(Part *part, u64int addr, int mode, int load) +{ + DBlock *b; + u32int h, size, ms; + + ms = 0; + trace(TraceBlock, "getdblock enter %s 0x%llux", part->name, addr); + size = part->blocksize; + if(size > dcache.size){ + seterr(EAdmin, "block size %d too big for cache with size %d", size, dcache.size); + if(load) + addstat(StatDcacheLookup, 1); + return nil; + } + h = pbhash(addr); + + /* + * look for the block in the cache + */ + qlock(&dcache.lock); +again: + for(b = 
dcache.heads[h]; b != nil; b = b->next){ + if(b->part == part && b->addr == addr){ + if(load) + addstat2(StatDcacheHit, 1, StatDcacheLookup, 1); + goto found; + } + } + + /* + * missed: locate the block with the oldest second to last use. + * remove it from the heap, and fix up the heap. + */ + if(!load){ + qunlock(&dcache.lock); + return nil; + } + + /* + * Only start timer here, on cache miss - calling msec() on plain cache hits + * makes cache hits system-call bound. + */ + ms = msec(); + addstat2(StatDcacheLookup, 1, StatDcacheMiss, 1); + + b = bumpdblock(); + if(b == nil){ + trace(TraceBlock, "all disk cache blocks in use"); + addstat(StatDcacheStall, 1); + rsleep(&dcache.full); + addstat(StatDcacheStall, -1); + goto again; + } + + assert(!b->dirty); + + /* + * the new block has no last use, so assume it happens sometime in the middle +ZZZ this is not reasonable + */ + b->used = (b->used2 + dcache.now) / 2; + + /* + * rechain the block on the correct hash chain + */ + b->next = dcache.heads[h]; + dcache.heads[h] = b; + if(b->next != nil) + b->next->prev = b; + b->prev = nil; + + b->addr = addr; + b->part = part; + b->size = 0; + +found: + b->ref++; + b->used2 = b->used; + b->used = dcache.now++; + if(b->heap != TWID32) + fixheap(b->heap, b); + + if((mode == ORDWR || mode == OWRITE) && part->writechan == nil){ + trace(TraceBlock, "getdblock allocwriteproc %s", part->name); + part->writechan = chancreate(sizeof(DBlock*), dcache.nblocks); + vtproc(writeproc, part); + } + qunlock(&dcache.lock); + + trace(TraceBlock, "getdblock lock"); + addstat(StatDblockStall, 1); + if(mode == OREAD) + rlock(&b->lock); + else + wlock(&b->lock); + addstat(StatDblockStall, -1); + trace(TraceBlock, "getdblock locked"); + + if(b->size != size){ + if(mode == OREAD){ + addstat(StatDblockStall, 1); + runlock(&b->lock); + wlock(&b->lock); + addstat(StatDblockStall, -1); + } + if(b->size < size){ + if(mode == OWRITE) + memset(&b->data[b->size], 0, size - b->size); + else{ + 
trace(TraceBlock, "getdblock readpart %s 0x%llux", part->name, addr); + diskaccess(0); + if(readpart(part, addr + b->size, &b->data[b->size], size - b->size) < 0){ + b->mode = ORDWR; /* so putdblock wunlocks */ + putdblock(b); + return nil; + } + trace(TraceBlock, "getdblock readpartdone"); + addstat(StatApartRead, 1); + addstat(StatApartReadBytes, size-b->size); + } + } + b->size = size; + if(mode == OREAD){ + addstat(StatDblockStall, 1); + wunlock(&b->lock); + rlock(&b->lock); + addstat(StatDblockStall, -1); + } + } + + b->mode = mode; + trace(TraceBlock, "getdblock exit"); + if(ms) + addstat(StatDcacheLookupTime, msec() - ms); + return b; +} + +void +putdblock(DBlock *b) +{ + if(b == nil) + return; + + trace(TraceBlock, "putdblock %s 0x%llux", b->part->name, b->addr); + + if(b->mode == OREAD) + runlock(&b->lock); + else + wunlock(&b->lock); + + qlock(&dcache.lock); + if(--b->ref == 0 && !b->dirty){ + if(b->heap == TWID32) + upheap(dcache.nheap++, b); + rwakeupall(&dcache.full); + } + qunlock(&dcache.lock); +} + +void +dirtydblock(DBlock *b, int dirty) +{ + int odirty; + + trace(TraceBlock, "dirtydblock enter %s 0x%llux %d from 0x%lux", + b->part->name, b->addr, dirty, getcallerpc(&b)); + assert(b->ref != 0); + assert(b->mode==ORDWR || b->mode==OWRITE); + + odirty = b->dirty; + if(b->dirty) + assert(b->dirty == dirty); + else + b->dirty = dirty; + + qlock(&dcache.lock); + if(!odirty){ + dcache.ndirty++; + setstat(StatDcacheDirty, dcache.ndirty); + if(dcache.ndirty >= dcache.maxdirty) + kickround(&dcache.round, 0); + else + delaykickround(&dcache.round); + } + qunlock(&dcache.lock); +} + +static void +unchain(DBlock *b) +{ + ulong h; + + /* + * unchain the block + */ + if(b->prev == nil){ + h = pbhash(b->addr); + if(dcache.heads[h] != b) + sysfatal("bad hash chains in disk cache"); + dcache.heads[h] = b->next; + }else + b->prev->next = b->next; + if(b->next != nil) + b->next->prev = b->prev; +} + +/* + * remove some block from use and update the free list and 
counters + */ +static DBlock* +bumpdblock(void) +{ + DBlock *b; + + trace(TraceBlock, "bumpdblock enter"); + b = dcache.free; + if(b != nil){ + dcache.free = b->next; + return b; + } + + if(dcache.ndirty >= dcache.maxdirty) + kickdcache(); + + /* + * remove blocks until we find one that is unused + * referenced blocks are left in the heap even though + * they can't be scavenged; this is simple a speed optimization + */ + for(;;){ + if(dcache.nheap == 0){ + kickdcache(); + trace(TraceBlock, "bumpdblock gotnothing"); + return nil; + } + b = dcache.heap[0]; + delheap(b); + if(!b->ref && !b->dirty) + break; + } + + trace(TraceBlock, "bumpdblock bumping %s 0x%llux", b->part->name, b->addr); + + unchain(b); + return b; +} + +void +emptydcache(void) +{ + DBlock *b; + + qlock(&dcache.lock); + while(dcache.nheap > 0){ + b = dcache.heap[0]; + delheap(b); + if(!b->ref && !b->dirty){ + unchain(b); + b->next = dcache.free; + dcache.free = b; + } + } + qunlock(&dcache.lock); +} + +/* + * delete an arbitrary block from the heap + */ +static void +delheap(DBlock *db) +{ + if(db->heap == TWID32) + return; + fixheap(db->heap, dcache.heap[--dcache.nheap]); + db->heap = TWID32; +} + +/* + * push an element up or down to it's correct new location + */ +static void +fixheap(int i, DBlock *b) +{ + if(upheap(i, b) == i) + downheap(i, b); +} + +static int +upheap(int i, DBlock *b) +{ + DBlock *bb; + u32int now; + int p; + + now = dcache.now; + for(; i != 0; i = p){ + p = (i - 1) >> 1; + bb = dcache.heap[p]; + if(b->used2 - now >= bb->used2 - now) + break; + dcache.heap[i] = bb; + bb->heap = i; + } + + dcache.heap[i] = b; + b->heap = i; + return i; +} + +static int +downheap(int i, DBlock *b) +{ + DBlock *bb; + u32int now; + int k; + + now = dcache.now; + for(; ; i = k){ + k = (i << 1) + 1; + if(k >= dcache.nheap) + break; + if(k + 1 < dcache.nheap && dcache.heap[k]->used2 - now > dcache.heap[k + 1]->used2 - now) + k++; + bb = dcache.heap[k]; + if(b->used2 - now <= bb->used2 - now) + break; 
+ dcache.heap[i] = bb; + bb->heap = i; + } + + dcache.heap[i] = b; + b->heap = i; + return i; +} + +static void +findblock(DBlock *bb) +{ + DBlock *b, *last; + int h; + + last = nil; + h = pbhash(bb->addr); + for(b = dcache.heads[h]; b != nil; b = b->next){ + if(last != b->prev) + sysfatal("bad prev link"); + if(b == bb) + return; + last = b; + } + sysfatal("block missing from hash table"); +} + +void +checkdcache(void) +{ + DBlock *b; + u32int size, now; + int i, k, refed, nfree; + + qlock(&dcache.lock); + size = dcache.size; + now = dcache.now; + for(i = 0; i < dcache.nheap; i++){ /* verify heap back-pointers and parent/child ordering */ + if(dcache.heap[i]->heap != i) + sysfatal("dc: mis-heaped at %d: %d", i, dcache.heap[i]->heap); + if(i > 0 && dcache.heap[(i - 1) >> 1]->used2 - now > dcache.heap[i]->used2 - now) + sysfatal("dc: bad heap ordering"); + k = (i << 1) + 1; + if(k < dcache.nheap && dcache.heap[i]->used2 - now > dcache.heap[k]->used2 - now) + sysfatal("dc: bad heap ordering"); + k++; + if(k < dcache.nheap && dcache.heap[i]->used2 - now > dcache.heap[k]->used2 - now) + sysfatal("dc: bad heap ordering"); + } + + refed = 0; + for(i = 0; i < dcache.nblocks; i++){ + b = &dcache.blocks[i]; + if(b->data != (u8int*)(((uintptr)dcache.mem+size-1)&~(uintptr)(size-1)) + i * size) /* initdcache rounds dcache.mem up to a block-size boundary before carving out block data */ + sysfatal("dc: mis-blocked at %d", i); + if(b->ref && b->heap == TWID32) + refed++; + if(b->addr) + findblock(b); + if(b->heap != TWID32 + && dcache.heap[b->heap] != b) + sysfatal("dc: spurious heap value"); + } + + nfree = 0; + for(b = dcache.free; b != nil; b = b->next){ + if(b->addr != 0 || b->heap != TWID32) + sysfatal("dc: bad free list"); + nfree++; + } + + if(dcache.nheap + nfree + refed != dcache.nblocks) + sysfatal("dc: missing blocks: %d %d %d", dcache.nheap, refed, dcache.nblocks); + qunlock(&dcache.lock); +} + +void +flushdcache(void) +{ + trace(TraceProc, "flushdcache enter"); + kickround(&dcache.round, 1); + trace(TraceProc, "flushdcache exit"); +} + +void +kickdcache(void) +{ + kickround(&dcache.round, 0); +} + +static int +parallelwrites(DBlock **b, DBlock **eb, int dirty) 
+{ + DBlock **p, **q; + Part *part; + + for(p=b; p<eb && (*p)->dirty == dirty; p++){ + assert(b<=p && p<eb); + sendp((*p)->part->writechan, *p); + } + q = p; + for(p=b; p<q; p++){ + assert(b<=p && p<eb); + recvp((*p)->writedonechan); + } + + /* + * Flush the partitions that have been written to. + */ + part = nil; + for(p=b; p<q; p++){ + if(part != (*p)->part){ + part = (*p)->part; + flushpart(part); /* what if it fails? */ + } + } + + return p-b; +} + +/* + * Sort first by dirty flag, then by partition, then by address in partition. + */ +static int +writeblockcmp(const void *va, const void *vb) +{ + DBlock *a, *b; + + a = *(DBlock**)va; + b = *(DBlock**)vb; + + if(a->dirty != b->dirty) + return a->dirty - b->dirty; + if(a->part != b->part){ + if(a->part < b->part) + return -1; + if(a->part > b->part) + return 1; + } + if(a->addr < b->addr) + return -1; + return a->addr > b->addr; /* 1 if greater, 0 if equal: qsort needs 0 for equal keys */ +} + +static void +flushproc(void *v) +{ + int i, j, n; + ulong t0; + DBlock *b, **write; + + USED(v); + threadsetname("flushproc"); + for(;;){ + waitforkick(&dcache.round); + + trace(TraceWork, "start"); + t0 = nsec()/1000; + trace(TraceProc, "build t=%lud", (ulong)(nsec()/1000)-t0); + + write = dcache.write; + n = 0; + for(i=0; i<dcache.nblocks; i++){ + b = &dcache.blocks[i]; + if(b->dirty) + write[n++] = b; + } + + qsort(write, n, sizeof(write[0]), writeblockcmp); + + /* Write each stage of blocks out. */ + trace(TraceProc, "writeblocks t=%lud", (ulong)(nsec()/1000)-t0); + i = 0; + for(j=1; j<DirtyMax; j++){ + trace(TraceProc, "writeblocks.%d t=%lud", + j, (ulong)(nsec()/1000)-t0); + i += parallelwrites(write+i, write+n, j); + } + if(i != n){ + fprint(2, "in flushproc i=%d n=%d\n", i, n); + for(i=0; i<n; i++) + fprint(2, "\tblock %d: dirty=%d\n", + i, write[i]->dirty); + abort(); + } + + /* + * b->dirty is protected by b->lock while ndirty is protected + * by dcache.lock, so the --ndirty below is the delayed one + * from clearing b->dirty in the write proc. 
It may happen + * that some other proc has come along and redirtied b since + * the write. That's okay, it just means that ndirty may be + * one too high until we catch up and do the decrement. + */ + trace(TraceProc, "undirty.%d t=%lud", j, (ulong)(nsec()/1000)-t0); + qlock(&dcache.lock); + for(i=0; i<n; i++){ + b = write[i]; + --dcache.ndirty; + if(b->ref == 0 && b->heap == TWID32){ + upheap(dcache.nheap++, b); + rwakeupall(&dcache.full); + } + } + setstat(StatDcacheDirty, dcache.ndirty); + qunlock(&dcache.lock); + addstat(StatDcacheFlush, 1); + trace(TraceWork, "finish"); + } +} + +static void +writeproc(void *v) +{ + DBlock *b; + Part *p; + + p = v; + + threadsetname("writeproc:%s", p->name); + for(;;){ + b = recvp(p->writechan); + trace(TraceWork, "start"); + assert(b->part == p); + trace(TraceProc, "wlock %s 0x%llux", p->name, b->addr); + wlock(&b->lock); + trace(TraceProc, "writepart %s 0x%llux", p->name, b->addr); + diskaccess(0); + if(writepart(p, b->addr, b->data, b->size) < 0) + fprint(2, "%s: writeproc: part %s addr 0x%llux: write error: %r\n", + argv0, p->name, b->addr); + addstat(StatApartWrite, 1); + addstat(StatApartWriteBytes, b->size); + b->dirty = 0; + wunlock(&b->lock); + trace(TraceProc, "finish %s 0x%llux", p->name, b->addr); + trace(TraceWork, "finish"); + sendp(b->writedonechan, b); + } +} diff --git a/sys/src/cmd/venti/srv/disksched.c b/sys/src/cmd/venti/srv/disksched.c new file mode 100755 index 000000000..d43b64c7f --- /dev/null +++ b/sys/src/cmd/venti/srv/disksched.c @@ -0,0 +1,89 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +ulong lasttime[2]; +int manualscheduling; +int l0quantum = 120; +int l1quantum = 120; +ulong lasticachechange; + +void +disksched(void) +{ + int p, nwrite, nflush, ndirty, tdirty, toflush; + ulong t; + vlong cflush; + Stats *prev; + + /* + * no locks because all the data accesses are atomic. 
+ */ + t = time(0); + if(manualscheduling){ + lasticachechange = t; + return; + } + + if(t-lasttime[0] < l0quantum){ + /* level-0 disk access going on */ + p = icachedirtyfrac(); + if(p < IcacheFrac*5/10){ /* can wait */ + icachesleeptime = SleepForever; + lasticachechange = t; + }else if(p > IcacheFrac*9/10){ /* can't wait */ + icachesleeptime = 0; + lasticachechange = t; + }else if(t-lasticachechange > 60){ + /* have minute worth of data for current rate */ + prev = &stathist[(stattime-60+nstathist)%nstathist]; + + /* # entries written to index cache */ + nwrite = stats.n[StatIcacheWrite] - prev->n[StatIcacheWrite]; + + /* # dirty entries in index cache */ + ndirty = stats.n[StatIcacheDirty] - prev->n[StatIcacheDirty]; + + /* # entries flushed to disk */ + nflush = nwrite - ndirty; + + /* want to stay around 70% dirty */ + tdirty = (vlong)stats.n[StatIcacheSize]*700/1000; + + /* assume nflush*icachesleeptime is a constant */ + cflush = (vlong)nflush*(icachesleeptime+1); + + /* computer number entries to write in next minute */ + toflush = nwrite + (stats.n[StatIcacheDirty] - tdirty); + + /* schedule for that many */ + if(toflush <= 0 || cflush/toflush > 100000) + icachesleeptime = SleepForever; + else + icachesleeptime = cflush/toflush; + } + arenasumsleeptime = SleepForever; + return; + } + if(t-lasttime[1] < l1quantum){ + /* level-1 disk access (icache flush) going on */ + icachesleeptime = 0; + arenasumsleeptime = SleepForever; + return; + } + /* no disk access going on - no holds barred*/ + icachesleeptime = 0; + arenasumsleeptime = 0; +} + +void +diskaccess(int level) +{ + if(level < 0 || level >= nelem(lasttime)){ + fprint(2, "bad level in diskaccess; caller=%#p\n", + getcallerpc(&level)); + return; + } + lasttime[level] = time(0); +} + diff --git a/sys/src/cmd/venti/srv/dump.c b/sys/src/cmd/venti/srv/dump.c new file mode 100755 index 000000000..fa2bfb7d2 --- /dev/null +++ b/sys/src/cmd/venti/srv/dump.c @@ -0,0 +1,47 @@ +#include "stdinc.h" +#include 
"dat.h" +#include "fns.h" + +void +printindex(int fd, Index *ix) +{ + int i; + + fprint(fd, "index=%s version=%d blocksize=%d tabsize=%d\n", + ix->name, ix->version, ix->blocksize, ix->tabsize); + fprint(fd, "\tbuckets=%d div=%d\n", ix->buckets, ix->div); + for(i = 0; i < ix->nsects; i++) + fprint(fd, "\tsect=%s for buckets [%lld,%lld)\n", ix->smap[i].name, ix->smap[i].start, ix->smap[i].stop); + for(i = 0; i < ix->narenas; i++) + fprint(fd, "\tarena=%s at [%lld,%lld)\n", ix->amap[i].name, ix->amap[i].start, ix->amap[i].stop); +} + +void +printarenapart(int fd, ArenaPart *ap) +{ + int i; + + fprint(fd, "arena partition=%s\n\tversion=%d blocksize=%d arenas=%d\n\tsetbase=%d setsize=%d\n", + ap->part->name, ap->version, ap->blocksize, ap->narenas, ap->tabbase, ap->tabsize); + for(i = 0; i < ap->narenas; i++) + fprint(fd, "\tarena=%s at [%lld,%lld)\n", ap->map[i].name, ap->map[i].start, ap->map[i].stop); +} + +void +printarena(int fd, Arena *arena) +{ + fprint(fd, "arena='%s' [%lld,%lld)\n\tversion=%d created=%d modified=%d", + arena->name, arena->base, arena->base + arena->size + 2 * arena->blocksize, + arena->version, arena->ctime, arena->wtime); + if(arena->memstats.sealed) + fprint(2, " sealed\n"); + else + fprint(2, "\n"); + if(scorecmp(zeroscore, arena->score) != 0) + fprint(2, "\tscore=%V\n", arena->score); + + fprint(fd, "\tclumps=%,d compressed clumps=%,d data=%,lld compressed data=%,lld disk storage=%,lld\n", + arena->memstats.clumps, arena->memstats.cclumps, arena->memstats.uncsize, + arena->memstats.used - arena->memstats.clumps * ClumpSize, + arena->memstats.used + arena->memstats.clumps * ClumpInfoSize); +} diff --git a/sys/src/cmd/venti/srv/findscore.c b/sys/src/cmd/venti/srv/findscore.c new file mode 100755 index 000000000..412b07d40 --- /dev/null +++ b/sys/src/cmd/venti/srv/findscore.c @@ -0,0 +1,122 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +enum +{ + ClumpChunks = 32*1024 +}; + +static int verbose; + +int +clumpinfoeq(ClumpInfo 
*c, ClumpInfo *d) +{ + return c->type == d->type + && c->size == d->size + && c->uncsize == d->uncsize + && scorecmp(c->score, d->score)==0; +} + +int +findscore(Arena *arena, uchar *score) +{ + IEntry ie; + ClumpInfo *ci, *cis; + u64int a; + u32int clump; + int i, n, found; + +//ZZZ remove fprint? + if(arena->memstats.clumps) + fprint(2, "reading directory for arena=%s with %d entries\n", + arena->name, arena->memstats.clumps); + + cis = MKN(ClumpInfo, ClumpChunks); + found = 0; + a = 0; + memset(&ie, 0, sizeof(IEntry)); + for(clump = 0; clump < arena->memstats.clumps; clump += n){ + n = ClumpChunks; + if(n > arena->memstats.clumps - clump) + n = arena->memstats.clumps - clump; + if(readclumpinfos(arena, clump, cis, n) != n){ + seterr(EOk, "arena directory read failed: %r"); + break; + } + + for(i = 0; i < n; i++){ + ci = &cis[i]; + if(scorecmp(score, ci->score)==0){ + fprint(2, "found at clump=%d with type=%d size=%d csize=%d position=%lld\n", + clump + i, ci->type, ci->uncsize, ci->size, a); + found++; + } + a += ci->size + ClumpSize; + } + } + free(cis); + return found; +} + +void +usage(void) +{ + fprint(2, "usage: findscore [-v] arenafile score\n"); + threadexitsall(0); +} + +void +threadmain(int argc, char *argv[]) +{ + ArenaPart *ap; + Part *part; + char *file; + u8int score[VtScoreSize]; + int i, found; + + ventifmtinstall(); + + ARGBEGIN{ + case 'v': + verbose++; + break; + default: + usage(); + break; + }ARGEND + + readonly = 1; + + if(argc != 2) + usage(); + + file = argv[0]; + if(strscore(argv[1], score) < 0) + sysfatal("bad score %s", argv[1]); + + part = initpart(file, OREAD|ODIRECT); + if(part == nil) + sysfatal("can't open partition %s: %r", file); + + ap = initarenapart(part); + if(ap == nil) + sysfatal("can't initialize arena partition in %s: %r", file); + + if(verbose > 1){ + printarenapart(2, ap); + fprint(2, "\n"); + } + + initdcache(8 * MaxDiskBlock); + + found = 0; + for(i = 0; i < ap->narenas; i++) + found += findscore(ap->arenas[i], 
score); + + print("found %d occurrences of %V\n", found, score); + + if(verbose > 1) + printstats(); + threadexitsall(0); +} diff --git a/sys/src/cmd/venti/srv/fixarenas.c b/sys/src/cmd/venti/srv/fixarenas.c new file mode 100755 index 000000000..ac05ab8cd --- /dev/null +++ b/sys/src/cmd/venti/srv/fixarenas.c @@ -0,0 +1,1914 @@ +/* + * Check and fix an arena partition. + * + * This is a lot grittier than the rest of Venti because + * it can't just give up if a byte here or there is wrong. + * + * The rule here (hopefully followed!) is that block corruption + * only ever has a local effect -- there are no blocks that you + * can wipe out that will cause large portions of + * uncorrupted data blocks to be useless. + */ + +#include "stdinc.h" +#include "dat.h" +#include "fns.h" +#include "whack.h" + +#define ROUNDUP(x,n) (((x)+(n)-1)&~((n)-1)) + +#pragma varargck type "z" uvlong +#pragma varargck type "z" vlong +#pragma varargck type "t" uint + +enum +{ + K = 1024, + M = 1024*1024, + G = 1024*1024*1024, + + Block = 4096, +}; + +int debugsha1; + +int verbose; +Part *part; +char *file; +char *basename; +char *dumpbase; +int fix; +int badreads; +int unseal; +uchar zero[MaxDiskBlock]; + +Arena lastarena; +ArenaPart ap; +uvlong arenasize; +int nbadread; +int nbad; +uvlong partend; +void checkarena(vlong, int); + +void +usage(void) +{ + fprint(2, "usage: fixarenas [-fv] [-a arenasize] [-b blocksize] file [ranges]\n"); + threadexitsall(0); +} + +/* + * Format number in simplest way that is okay with unittoull. + */ +static int +zfmt(Fmt *fmt) +{ + vlong x; + + x = va_arg(fmt->args, vlong); + if(x == 0) + return fmtstrcpy(fmt, "0"); + if(x%G == 0) + return fmtprint(fmt, "%lldG", x/G); + if(x%M == 0) + return fmtprint(fmt, "%lldM", x/M); + if(x%K == 0) + return fmtprint(fmt, "%lldK", x/K); + return fmtprint(fmt, "%lld", x); +} + +/* + * Format time like ctime without newline. 
+ */ +static int +tfmt(Fmt *fmt) +{ + uint t; + char buf[30]; + + t = va_arg(fmt->args, uint); + strcpy(buf, ctime(t)); + buf[28] = 0; + return fmtstrcpy(fmt, buf); +} + +/* + * Coalesce messages about unreadable sectors into larger ranges. + * bad(0, 0) flushes the buffer. + */ +static void +bad(char *msg, vlong o, int len) +{ + static vlong lb0, lb1; + static char *lmsg; + + if(msg == nil) + msg = lmsg; + if(o == -1){ + lmsg = nil; + lb0 = 0; + lb1 = 0; + return; + } + if(lb1 != o || (msg && lmsg && strcmp(msg, lmsg) != 0)){ + if(lb0 != lb1) + print("%s %#llux+%#llux (%,lld+%,lld)\n", + lmsg, lb0, lb1-lb0, lb0, lb1-lb0); + lb0 = o; + } + lmsg = msg; + lb1 = o+len; +} + +/* + * Read in the len bytes of data at the offset. If can't for whatever reason, + * fill it with garbage but print an error. + */ +static uchar* +readdisk(uchar *buf, vlong offset, int len) +{ + int i, j, k, n; + + if(offset >= partend){ + memset(buf, 0xFB, sizeof buf); + return buf; + } + + if(offset+len > partend){ + memset(buf, 0xFB, sizeof buf); + len = partend - offset; + } + + if(readpart(part, offset, buf, len) >= 0) + return buf; + + /* + * The read failed. Clear the buffer to nonsense, and + * then try reading in smaller pieces. If that fails, + * read in even smaller pieces. And so on down to sectors. + */ + memset(buf, 0xFD, len); + for(i=0; i<len; i+=64*K){ + n = 64*K; + if(i+n > len) + n = len-i; + if(readpart(part, offset+i, buf+i, n) >= 0) + continue; + for(j=i; j<len && j<i+64*K; j+=4*K){ + n = 4*K; + if(j+n > len) + n = len-j; + if(readpart(part, offset+j, buf+j, n) >= 0) + continue; + for(k=j; k<len && k<j+4*K; k+=512){ + if(readpart(part, offset+k, buf+k, 512) >= 0) + continue; + bad("disk read failed at", k, 512); + badreads++; + } + } + } + bad(nil, 0, 0); + return buf; +} + +/* + * Buffer to support running SHA1 hash of the disk. 
+ */ +typedef struct Shabuf Shabuf; +struct Shabuf +{ + int fd; + vlong offset; + DigestState state; + int rollback; + vlong r0; + DigestState *hist; + int nhist; +}; + +void +sbdebug(Shabuf *sb, char *file) +{ + int fd; + + if(sb->fd > 0){ + close(sb->fd); + sb->fd = 0; + } + if((fd = create(file, OWRITE, 0666)) < 0) + return; + if(fd == 0){ + fd = dup(fd, -1); + close(0); + } + sb->fd = fd; +} + +void +sbupdate(Shabuf *sb, uchar *p, vlong offset, int len) +{ + int n, x; + vlong o; + + if(sb->rollback && !sb->hist){ + sb->r0 = offset; + sb->nhist = 1; + sb->hist = vtmalloc(sb->nhist*sizeof *sb->hist); + memset(sb->hist, 0, sizeof sb->hist[0]); + } + if(sb->r0 == 0) + sb->r0 = offset; + + if(sb->offset < offset || sb->offset >= offset+len){ + if(0) print("sbupdate %p %#llux+%d but offset=%#llux\n", + p, offset, len, sb->offset); + return; + } + x = sb->offset - offset; + if(0) print("sbupdate %p %#llux+%d skip %d\n", + sb, offset, len, x); + if(x){ + p += x; + offset += x; + len -= x; + } + assert(sb->offset == offset); + + if(sb->fd > 0) + pwrite(sb->fd, p, len, offset - sb->r0); + + if(!sb->rollback){ + sha1(p, len, nil, &sb->state); + sb->offset += len; + return; + } + + /* save state every 4M so we can roll back quickly */ + o = offset - sb->r0; + while(len > 0){ + n = 4*M - o%(4*M); + if(n > len) + n = len; + sha1(p, n, nil, &sb->state); + sb->offset += n; + o += n; + p += n; + len -= n; + if(o%(4*M) == 0){ + x = o/(4*M); + if(x >= sb->nhist){ + if(x != sb->nhist) + print("oops! 
x=%d nhist=%d\n", x, sb->nhist); + sb->nhist += 32; + sb->hist = vtrealloc(sb->hist, sb->nhist*sizeof *sb->hist); + } + sb->hist[x] = sb->state; + } + } +} + +void +sbdiskhash(Shabuf *sb, vlong eoffset) +{ + static uchar dbuf[4*M]; + int n; + + while(sb->offset < eoffset){ + n = sizeof dbuf; + if(sb->offset+n > eoffset) + n = eoffset - sb->offset; + readdisk(dbuf, sb->offset, n); + sbupdate(sb, dbuf, sb->offset, n); + } +} + +void +sbrollback(Shabuf *sb, vlong offset) +{ + int x; + vlong o; + Dir d; + + if(!sb->rollback || !sb->r0){ + print("cannot rollback sha\n"); + return; + } + if(offset >= sb->offset) + return; + o = offset - sb->r0; + x = o/(4*M); + if(x >= sb->nhist){ + print("cannot rollback sha\n"); + return; + } + sb->state = sb->hist[x]; + sb->offset = sb->r0 + x*4*M; + assert(sb->offset <= offset); + + if(sb->fd > 0){ + nulldir(&d); + d.length = sb->offset - sb->r0; + dirfwstat(sb->fd, &d); + } +} + +void +sbscore(Shabuf *sb, uchar *score) +{ + if(sb->hist){ + free(sb->hist); + sb->hist = nil; + } + sha1(nil, 0, score, &sb->state); +} + +/* + * If we're fixing arenas, then editing this memory edits the disk! + * It will be written back out as new data is paged in. 
+ */ +uchar buf[4*M]; +uchar sbuf[4*M]; +vlong bufoffset; +int buflen; + +static void pageout(void); +static uchar* +pagein(vlong offset, int len) +{ + pageout(); + if(offset >= partend){ + memset(buf, 0xFB, sizeof buf); + return buf; + } + + if(offset+len > partend){ + memset(buf, 0xFB, sizeof buf); + len = partend - offset; + } + bufoffset = offset; + buflen = len; + readdisk(buf, offset, len); + memmove(sbuf, buf, len); + return buf; +} + +static void +pageout(void) +{ + if(buflen==0 || !fix || memcmp(buf, sbuf, buflen) == 0){ + buflen = 0; + return; + } + if(writepart(part, bufoffset, buf, buflen) < 0) + print("disk write failed at %#llux+%#ux (%,lld+%,d)\n", + bufoffset, buflen, bufoffset, buflen); + buflen = 0; +} + +static void +zerorange(vlong offset, int len) +{ + int i; + vlong ooff; + int olen; + enum { MinBlock = 4*K, MaxBlock = 8*K }; + + if(0) + if(bufoffset <= offset && offset+len <= bufoffset+buflen){ + memset(buf+(offset-bufoffset), 0, len); + return; + } + + ooff = bufoffset; + olen = buflen; + + i = offset%MinBlock; + if(i+len < MaxBlock){ + pagein(offset-i, (len+MinBlock-1)&~(MinBlock-1)); + memset(buf+i, 0, len); + }else{ + pagein(offset-i, MaxBlock); + memset(buf+i, 0, MaxBlock-i); + offset += MaxBlock-i; + len -= MaxBlock-i; + while(len >= MaxBlock){ + pagein(offset, MaxBlock); + memset(buf, 0, MaxBlock); + offset += MaxBlock; + len -= MaxBlock; + } + pagein(offset, (len+MinBlock-1)&~(MinBlock-1)); + memset(buf, 0, len); + } + pagein(ooff, olen); +} + +/* + * read/write integers + * +static void +p16(uchar *p, u16int u) +{ + p[0] = (u>>8) & 0xFF; + p[1] = u & 0xFF; +} +*/ + +static u16int +u16(uchar *p) +{ + return (p[0]<<8)|p[1]; +} + +static void +p32(uchar *p, u32int u) +{ + p[0] = (u>>24) & 0xFF; + p[1] = (u>>16) & 0xFF; + p[2] = (u>>8) & 0xFF; + p[3] = u & 0xFF; +} + +static u32int +u32(uchar *p) +{ + return (p[0]<<24)|(p[1]<<16)|(p[2]<<8)|p[3]; +} + +/* +static void +p64(uchar *p, u64int u) +{ + p32(p, u>>32); + p32(p, u); +} +*/ + 
+static u64int +u64(uchar *p) +{ + return ((u64int)u32(p)<<32) | u32(p+4); +} + +static int +vlongcmp(const void *va, const void *vb) +{ + vlong a, b; + + a = *(vlong*)va; + b = *(vlong*)vb; + if(a < b) + return -1; + if(b > a) + return 1; + return 0; +} + +/* D and S are in draw.h */ +#define D VD +#define S VS + +enum +{ + D = 0x10000, + Z = 0x20000, + S = 0x30000, + T = 0x40000, + N = 0xFFFF +}; +typedef struct Info Info; +struct Info +{ + int len; + char *name; +}; + +Info partinfo[] = { + 4, "magic", + D|4, "version", + Z|4, "blocksize", + 4, "arenabase", + 0 +}; + +Info headinfo4[] = { + 4, "magic", + D|4, "version", + S|ANameSize, "name", + Z|4, "blocksize", + Z|8, "size", + 0 +}; + +Info headinfo5[] = { + 4, "magic", + D|4, "version", + S|ANameSize, "name", + Z|4, "blocksize", + Z|8, "size", + 4, "clumpmagic", + 0 +}; + +Info tailinfo4[] = { + 4, "magic", + D|4, "version", + S|ANameSize, "name", + D|4, "clumps", + D|4, "cclumps", + T|4, "ctime", + T|4, "wtime", + D|8, "used", + D|8, "uncsize", + 1, "sealed", + 0 +}; + +Info tailinfo4a[] = { + /* tailinfo 4 */ + 4, "magic", + D|4, "version", + S|ANameSize, "name", + D|4, "clumps", + D|4, "cclumps", + T|4, "ctime", + T|4, "wtime", + D|8, "used", + D|8, "uncsize", + 1, "sealed", + + /* mem stats */ + 1, "extension", + D|4, "mem.clumps", + D|4, "mem.cclumps", + D|8, "mem.used", + D|8, "mem.uncsize", + 1, "mem.sealed", + 0 +}; + +Info tailinfo5[] = { + 4, "magic", + D|4, "version", + S|ANameSize, "name", + D|4, "clumps", + D|4, "cclumps", + T|4, "ctime", + T|4, "wtime", + 4, "clumpmagic", + D|8, "used", + D|8, "uncsize", + 1, "sealed", + 0 +}; + +Info tailinfo5a[] = { + /* tailinfo 5 */ + 4, "magic", + D|4, "version", + S|ANameSize, "name", + D|4, "clumps", + D|4, "cclumps", + T|4, "ctime", + T|4, "wtime", + 4, "clumpmagic", + D|8, "used", + D|8, "uncsize", + 1, "sealed", + + /* mem stats */ + 1, "extension", + D|4, "mem.clumps", + D|4, "mem.cclumps", + D|8, "mem.used", + D|8, "mem.uncsize", + 1, "mem.sealed", + 
0 +}; + +void +showdiffs(uchar *want, uchar *have, int len, Info *info) +{ + int n; + + while(len > 0 && (n=info->len&N) > 0){ + if(memcmp(have, want, n) != 0){ + switch(info->len){ + case 1: + print("\t%s: correct=%d disk=%d\n", + info->name, *want, *have); + break; + case 4: + print("\t%s: correct=%#ux disk=%#ux\n", + info->name, u32(want), u32(have)); + break; + case D|4: + print("\t%s: correct=%,ud disk=%,ud\n", + info->name, u32(want), u32(have)); + break; + case T|4: + print("\t%s: correct=%t\n\t\tdisk=%t\n", + info->name, u32(want), u32(have)); + break; + case Z|4: + print("\t%s: correct=%z disk=%z\n", + info->name, (uvlong)u32(want), (uvlong)u32(have)); + break; + case D|8: + print("\t%s: correct=%,lld disk=%,lld\n", + info->name, u64(want), u64(have)); + break; + case Z|8: + print("\t%s: correct=%z disk=%z\n", + info->name, u64(want), u64(have)); + break; + case S|ANameSize: + print("\t%s: correct=%s disk=%.*s\n", + info->name, (char*)want, + utfnlen((char*)have, ANameSize-1), + (char*)have); + break; + default: + print("\t%s: correct=%.*H disk=%.*H\n", + info->name, n, want, n, have); + break; + } + } + have += n; + want += n; + len -= n; + info++; + } + if(len > 0 && memcmp(have, want, len) != 0){ + if(memcmp(want, zero, len) != 0) + print("!!\textra want data in showdiffs (bug in fixarenas)\n"); + else + print("\tnon-zero data on disk after structure\n"); + if(verbose > 1){ + print("want: %.*H\n", len, want); + print("have: %.*H\n", len, have); + } + } +} + +/* + * Does part begin with an arena? + */ +int +isonearena(void) +{ + return u32(pagein(0, Block)) == ArenaHeadMagic; +} + +static int tabsizes[] = { 16*1024, 64*1024, 512*1024, 768*1024, }; +/* + * Poke around on the disk to guess what the ArenaPart numbers are. 
+ */ +void +guessgeometry(void) +{ + int i, j, n, bestn, ndiff, nhead, ntail; + uchar *p, *ep, *sp; + u64int diff[100], head[20], tail[20]; + u64int offset, bestdiff; + + ap.version = ArenaPartVersion; + + if(arenasize == 0 || ap.blocksize == 0){ + /* + * The ArenaPart block at offset PartBlank may be corrupt or just wrong. + * Instead, look for the individual arena headers and tails, which there + * are many of, and once we've seen enough, infer the spacing. + * + * Of course, nothing in the file format requires that arenas be evenly + * spaced, but fmtarenas always does that for us. + */ + nhead = 0; + ntail = 0; + for(offset=PartBlank; offset<partend; offset+=4*M){ + p = pagein(offset, 4*M); + for(sp=p, ep=p+4*M; p<ep; p+=K){ + if(u32(p) == ArenaHeadMagic && nhead < nelem(head)){ + if(verbose) + print("arena head at %#llx\n", offset+(p-sp)); + head[nhead++] = offset+(p-sp); + } + if(u32(p) == ArenaMagic && ntail < nelem(tail)){ + tail[ntail++] = offset+(p-sp); + if(verbose) + print("arena tail at %#llx\n", offset+(p-sp)); + } + } + if(nhead == nelem(head) && ntail == nelem(tail)) + break; + } + if(nhead < 3 && ntail < 3) + sysfatal("too few intact arenas: %d heads, %d tails", nhead, ntail); + + /* + * Arena size is likely the most common + * inter-head or inter-tail spacing. + */ + ndiff = 0; + for(i=1; i<nhead; i++) + diff[ndiff++] = head[i] - head[i-1]; + for(i=1; i<ntail; i++) + diff[ndiff++] = tail[i] - tail[i-1]; + qsort(diff, ndiff, sizeof diff[0], vlongcmp); + bestn = 0; + bestdiff = 0; + for(i=1, n=1; i<=ndiff; i++, n++){ + if(i==ndiff || diff[i] != diff[i-1]){ + if(n > bestn){ + bestn = n; + bestdiff = diff[i-1]; + } + n = 0; + } + } + print("arena size likely %z (%d of %d)\n", bestdiff, bestn, ndiff); + if(arenasize != 0 && arenasize != bestdiff) + print("using user-specified size %z instead\n", arenasize); + else + arenasize = bestdiff; + + /* + * The arena tail for an arena is arenasize-blocksize from the head. 
+ */ + ndiff = 0; + for(i=j=0; i<nhead && j<ntail; ){ + if(tail[j] < head[i]){ + j++; + continue; + } + if(tail[j] < head[i]+arenasize){ + diff[ndiff++] = head[i]+arenasize - tail[j]; + j++; + continue; + } + i++; + } + if(ndiff < 3) + sysfatal("too few intact arenas: %d head, tail pairs", ndiff); + qsort(diff, ndiff, sizeof diff[0], vlongcmp); + bestn = 0; + bestdiff = 0; + for(i=1, n=1; i<=ndiff; i++, n++){ + if(i==ndiff || diff[i] != diff[i-1]){ + if(n > bestn){ + bestn = n; + bestdiff = diff[i-1]; + } + n = 0; + } + } + print("block size likely %z (%d of %d)\n", bestdiff, bestn, ndiff); + if(ap.blocksize != 0 && ap.blocksize != bestdiff) + print("using user-specified size %z instead\n", (vlong)ap.blocksize); + else + ap.blocksize = bestdiff; + if(ap.blocksize == 0 || ap.blocksize&(ap.blocksize-1)) + sysfatal("block size not a power of two"); + if(ap.blocksize > MaxDiskBlock) + sysfatal("block size too big (max=%d)", MaxDiskBlock); + + /* + * Use head/tail information to deduce arena base. + */ + ndiff = 0; + for(i=0; i<nhead; i++) + diff[ndiff++] = head[i]%arenasize; + for(i=0; i<ntail; i++) + diff[ndiff++] = (tail[i]+ap.blocksize)%arenasize; + qsort(diff, ndiff, sizeof diff[0], vlongcmp); + bestn = 0; + bestdiff = 0; + for(i=1, n=1; i<=ndiff; i++, n++){ + if(i==ndiff || diff[i] != diff[i-1]){ + if(n > bestn){ + bestn = n; + bestdiff = diff[i-1]; + } + n = 0; + } + } + ap.arenabase = bestdiff; + } + + ap.tabbase = ROUNDUP(PartBlank+HeadSize, ap.blocksize); + /* + * XXX pick up table, check arenabase. + * XXX pick up table, record base name. + */ + + /* + * Somewhat standard computation. + * Fmtarenas used to use 64k tab, now uses 512k tab. 
+ */ + if(ap.arenabase == 0){ + print("trying standard arena bases...\n"); + for(i=0; i<nelem(tabsizes); i++){ + ap.arenabase = ROUNDUP(PartBlank+HeadSize+tabsizes[i], ap.blocksize); + p = pagein(ap.arenabase, Block); + if(u32(p) == ArenaHeadMagic) + break; + } + } + p = pagein(ap.arenabase, Block); + print("arena base likely %z%s\n", (vlong)ap.arenabase, + u32(p)!=ArenaHeadMagic ? " (but no arena head there)" : ""); + + ap.tabsize = ap.arenabase - ap.tabbase; +} + +/* + * Check the arena partition blocks and then the arenas listed in range. + */ +void +checkarenas(char *range) +{ + char *s, *t; + int i, lo, hi, narena; + uchar dbuf[HeadSize]; + uchar *p; + + guessgeometry(); + + partend -= partend%ap.blocksize; + + memset(dbuf, 0, sizeof dbuf); + packarenapart(&ap, dbuf); + p = pagein(PartBlank, Block); + if(memcmp(p, dbuf, HeadSize) != 0){ + print("on-disk arena part superblock incorrect\n"); + showdiffs(dbuf, p, HeadSize, partinfo); + } + memmove(p, dbuf, HeadSize); + + narena = (partend-ap.arenabase + arenasize-1)/arenasize; + if(range == nil){ + for(i=0; i<narena; i++) + checkarena(ap.arenabase+(vlong)i*arenasize, i); + }else if(strcmp(range, "none") == 0){ + /* nothing */ + }else{ + /* parse, e.g., -4,8-9,10- */ + for(s=range; *s; s=t){ + t = strchr(s, ','); + if(t) + *t++ = 0; + else + t = s+strlen(s); + if(*s == '-') + lo = 0; + else + lo = strtol(s, &s, 0); + hi = lo; + if(*s == '-'){ + s++; + if(*s == 0) + hi = narena-1; + else + hi = strtol(s, &s, 0); + } + if(*s != 0){ + print("bad arena range: %s\n", s); + continue; + } + for(i=lo; i<=hi; i++) + checkarena(ap.arenabase+(vlong)i*arenasize, i); + } + } +} + +/* + * Is there a clump here at p? 
+ */ +static int +isclump(uchar *p, Clump *cl, u32int *pmagic) +{ + int n; + u32int magic; + uchar score[VtScoreSize], *bp; + Unwhack uw; + uchar ubuf[70*1024]; + + bp = p; + magic = u32(p); + if(magic == 0) + return 0; + p += U32Size; + + cl->info.type = vtfromdisktype(*p); + if(cl->info.type == 0xFF) + return 0; + p++; + cl->info.size = u16(p); + p += U16Size; + cl->info.uncsize = u16(p); + if(cl->info.size > cl->info.uncsize) + return 0; + p += U16Size; + scorecp(cl->info.score, p); + p += VtScoreSize; + cl->encoding = *p; + p++; + cl->creator = u32(p); + p += U32Size; + cl->time = u32(p); + p += U32Size; + + switch(cl->encoding){ + case ClumpENone: + if(cl->info.size != cl->info.uncsize) + return 0; + scoremem(score, p, cl->info.size); + if(scorecmp(score, cl->info.score) != 0) + return 0; + break; + case ClumpECompress: + if(cl->info.size >= cl->info.uncsize) + return 0; + unwhackinit(&uw); + n = unwhack(&uw, ubuf, cl->info.uncsize, p, cl->info.size); + if(n != cl->info.uncsize) + return 0; + scoremem(score, ubuf, cl->info.uncsize); + if(scorecmp(score, cl->info.score) != 0) + return 0; + break; + default: + return 0; + } + p += cl->info.size; + + /* it all worked out in the end */ + *pmagic = magic; + return p - bp; +} + +/* + * All ClumpInfos seen in this arena. + * Kept in binary tree so we can look up by score. 
+ */ +typedef struct Cit Cit; +struct Cit +{ + int left; + int right; + vlong corrupt; + ClumpInfo ci; +}; +Cit *cibuf; +int ciroot; +int ncibuf, mcibuf; + +void +resetcibuf(void) +{ + ncibuf = 0; + ciroot = -1; +} + +int* +ltreewalk(int *p, uchar *score) +{ + int i; + + for(;;){ + if(*p == -1) + return p; + i = scorecmp(cibuf[*p].ci.score, score); + if(i == 0) + return p; + if(i < 0) + p = &cibuf[*p].right; + else + p = &cibuf[*p].left; + } +} + +void +addcibuf(ClumpInfo *ci, vlong corrupt) +{ + Cit *cit; + + if(ncibuf == mcibuf){ + mcibuf += 131072; + cibuf = vtrealloc(cibuf, mcibuf*sizeof cibuf[0]); + } + cit = &cibuf[ncibuf]; + cit->ci = *ci; + cit->left = -1; + cit->right = -1; + cit->corrupt = corrupt; + if(!corrupt) + *ltreewalk(&ciroot, ci->score) = ncibuf; + ncibuf++; +} + +void +addcicorrupt(vlong len) +{ + static ClumpInfo zci; + + addcibuf(&zci, len); +} + +int +haveclump(uchar *score) +{ + int i; + int p; + + p = ciroot; + for(;;){ + if(p == -1) + return 0; + i = scorecmp(cibuf[p].ci.score, score); + if(i == 0) + return 1; + if(i < 0) + p = cibuf[p].right; + else + p = cibuf[p].left; + } +} + +int +matchci(ClumpInfo *ci, uchar *p) +{ + if(ci->type != vtfromdisktype(p[0])) + return 0; + if(ci->size != u16(p+1)) + return 0; + if(ci->uncsize != u16(p+3)) + return 0; + if(scorecmp(ci->score, p+5) != 0) + return 0; + return 1; +} + +int +sealedarena(uchar *p, int blocksize) +{ + int v, n; + + v = u32(p+4); + switch(v){ + default: + return 0; + case ArenaVersion4: + n = ArenaSize4; + break; + case ArenaVersion5: + n = ArenaSize5; + break; + } + if(p[n-1] != 1){ + print("arena tail says not sealed\n"); + return 0; + } + if(memcmp(p+n, zero, blocksize-VtScoreSize-n) != 0){ + print("arena tail followed by non-zero data\n"); + return 0; + } + if(memcmp(p+blocksize-VtScoreSize, zero, VtScoreSize) == 0){ + print("arena score zero\n"); + return 0; + } + return 1; +} + +int +okayname(char *name, int n) +{ + char buf[20]; + + if(nameok(name) < 0) + return 0; + 
sprint(buf, "%d", n); + if(n == 0) + buf[0] = 0; + if(strlen(name) < strlen(buf) + || strcmp(name+strlen(name)-strlen(buf), buf) != 0) + return 0; + return 1; +} + +int +clumpinfocmp(ClumpInfo *a, ClumpInfo *b) +{ + if(a->type != b->type) + return a->type - b->type; + if(a->size != b->size) + return a->size - b->size; + if(a->uncsize != b->uncsize) + return a->uncsize - b->uncsize; + return scorecmp(a->score, b->score); +} + +ClumpInfo* +loadci(vlong offset, Arena *arena, int nci) +{ + int i, j, per; + uchar *p, *sp; + ClumpInfo *bci, *ci; + + per = arena->blocksize/ClumpInfoSize; + bci = vtmalloc(nci*sizeof bci[0]); + ci = bci; + offset += arena->size - arena->blocksize; + p = sp = nil; + for(i=0; i<nci; i+=per){ + if(p == sp){ + sp = pagein(offset-4*M, 4*M); + p = sp+4*M; + } + p -= arena->blocksize; + offset -= arena->blocksize; + for(j=0; j<per && i+j<nci; j++) + unpackclumpinfo(ci++, p+j*ClumpInfoSize); + } + return bci; +} + +vlong +writeci(vlong offset, Arena *arena, ClumpInfo *ci, int nci) +{ + int i, j, per; + uchar *p, *sp; + + per = arena->blocksize/ClumpInfoSize; + offset += arena->size - arena->blocksize; + p = sp = nil; + for(i=0; i<nci; i+=per){ + if(p == sp){ + sp = pagein(offset-4*M, 4*M); + p = sp+4*M; + } + p -= arena->blocksize; + offset -= arena->blocksize; + memset(p, 0, arena->blocksize); + for(j=0; j<per && i+j<nci; j++) + packclumpinfo(ci++, p+j*ClumpInfoSize); + } + pageout(); + return offset; +} + +void +loadarenabasics(vlong offset0, int anum, ArenaHead *head, Arena *arena) +{ + char dname[ANameSize]; + static char lastbase[ANameSize]; + uchar *p; + Arena oarena; + ArenaHead ohead; + + /* + * Fmtarenas makes all arenas the same size + * except the last, which may be smaller. + * It uses the same block size for arenas as for + * the arena partition blocks. 
+ */ + arena->size = arenasize; + if(offset0+arena->size > partend) + arena->size = partend - offset0; + head->size = arena->size; + + arena->blocksize = ap.blocksize; + head->blocksize = arena->blocksize; + + /* + * Look for clump magic and name in head/tail blocks. + * All the other info we will reconstruct just in case. + */ + p = pagein(offset0, arena->blocksize); + memset(&ohead, 0, sizeof ohead); + if(unpackarenahead(&ohead, p) >= 0){ + head->version = ohead.version; + head->clumpmagic = ohead.clumpmagic; + if(okayname(ohead.name, anum)) + strcpy(head->name, ohead.name); + } + + p = pagein(offset0+arena->size-arena->blocksize, + arena->blocksize); + memset(&oarena, 0, sizeof oarena); + if(unpackarena(&oarena, p) >= 0){ + arena->version = oarena.version; + arena->clumpmagic = oarena.clumpmagic; + if(okayname(oarena.name, anum)) + strcpy(arena->name, oarena.name); + arena->diskstats.clumps = oarena.diskstats.clumps; +print("old arena: sealed=%d\n", oarena.diskstats.sealed); + arena->diskstats.sealed = oarena.diskstats.sealed; + } + + /* Head trumps arena. */ + if(head->version){ + arena->version = head->version; + arena->clumpmagic = head->clumpmagic; + } + if(arena->version == 0) + arena->version = ArenaVersion5; + if(basename){ + if(anum == -1) + snprint(arena->name, ANameSize, "%s", basename); + else + snprint(arena->name, ANameSize, "%s%d", basename, anum); + }else if(lastbase[0]) + snprint(arena->name, ANameSize, "%s%d", lastbase, anum); + else if(head->name[0]) + strcpy(arena->name, head->name); + else if(arena->name[0] == 0) + sysfatal("cannot determine base name for arena; use -n"); + strcpy(lastbase, arena->name); + sprint(dname, "%d", anum); + lastbase[strlen(lastbase)-strlen(dname)] = 0; + + /* Was working in arena, now copy to head. 
*/ + head->version = arena->version; + memmove(head->name, arena->name, sizeof head->name); + head->blocksize = arena->blocksize; + head->size = arena->size; +} + +void +shahead(Shabuf *sb, vlong offset0, ArenaHead *head) +{ + uchar headbuf[MaxDiskBlock]; + + sb->offset = offset0; + memset(headbuf, 0, sizeof headbuf); + packarenahead(head, headbuf); + sbupdate(sb, headbuf, offset0, head->blocksize); +} + +u32int +newclumpmagic(int version) +{ + u32int m; + + if(version == ArenaVersion4) + return _ClumpMagic; + do{ + m = fastrand(); + }while(m==0 || m == _ClumpMagic); + return m; +} + +/* + * Poke around in the arena to find the clump data + * and compute the relevant statistics. + */ +void +guessarena(vlong offset0, int anum, ArenaHead *head, Arena *arena, + uchar *oldscore, uchar *score) +{ + uchar dbuf[MaxDiskBlock]; + int needtozero, clumps, nb1, nb2, minclumps; + int inbad, n, ncib, printed, sealing, smart; + u32int magic; + uchar *sp, *ep, *p; + vlong boffset, eoffset, lastclumpend, leaked; + vlong offset, toffset, totalcorrupt, v; + Clump cl; + ClumpInfo *bci, *ci, *eci, *xci; + Cit *bcit, *cit, *ecit; + Shabuf oldsha, newsha; + + /* + * We expect to find an arena, with data, between offset + * and offset+arenasize. With any luck, the data starts at + * offset+ap.blocksize. The blocks have variable size and + * aren't padded at all, which doesn't give us any alignment + * constraints. The blocks are compressed or high entropy, + * but the headers are pretty low entropy (except the score): + * + * type[1] (range 0 thru 9, 13) + * size[2] + * uncsize[2] (<= size) + * + * so we can look for these. We check the scores as we go, + * so we can't make any wrong turns. If we find ourselves + * in a dead end, scan forward looking for a new start. 
+ */ + + resetcibuf(); + memset(head, 0, sizeof *head); + memset(arena, 0, sizeof *arena); + memset(oldscore, 0, VtScoreSize); + memset(score, 0, VtScoreSize); + memset(&oldsha, 0, sizeof oldsha); + memset(&newsha, 0, sizeof newsha); + newsha.rollback = 1; + + if(0){ + sbdebug(&oldsha, "old.sha"); + sbdebug(&newsha, "new.sha"); + } + + loadarenabasics(offset0, anum, head, arena); + + /* start the clump hunt */ + + clumps = 0; + totalcorrupt = 0; + sealing = 1; + boffset = offset0 + arena->blocksize; + offset = boffset; + eoffset = offset0+arena->size - arena->blocksize; + toffset = eoffset; + sp = pagein(offset0, 4*M); + + if(arena->diskstats.sealed){ + oldsha.offset = offset0; + sbupdate(&oldsha, sp, offset0, 4*M); + } + ep = sp+4*M; + p = sp + (boffset - offset0); + ncib = arena->blocksize / ClumpInfoSize; /* ci per block in index */ + lastclumpend = offset; + nbad = 0; + inbad = 0; + needtozero = 0; + minclumps = 0; + while(offset < eoffset){ + /* + * Shift buffer if we're running out of room. + */ + if(p+70*K >= ep){ + /* + * Start the post SHA1 buffer. By now we should know the + * clumpmagic and arena version, so we can create a + * correct head block to get things going. + */ + if(sealing && fix && newsha.offset == 0){ + newsha.offset = offset0; + if(arena->clumpmagic == 0){ + if(arena->version == 0) + arena->version = ArenaVersion5; + arena->clumpmagic = newclumpmagic(arena->version); + } + head->clumpmagic = arena->clumpmagic; + shahead(&newsha, offset0, head); + } + n = 4*M-256*K; + if(sealing && fix){ + sbdiskhash(&newsha, bufoffset); + sbupdate(&newsha, buf, bufoffset, 4*M-256*K); + } + pagein(bufoffset+n, 4*M); + p -= n; + if(arena->diskstats.sealed) + sbupdate(&oldsha, buf, bufoffset, 4*M); + } + + /* + * Check for a clump at p, which is at offset in the disk. + * Duplicate clumps happen in corrupted disks + * (the same pattern gets written many times in a row) + * and should never happen during regular use. 
+ */ + magic = 0; + if((n = isclump(p, &cl, &magic)) > 0){ + /* + * If we were in the middle of some corrupted data, + * flush a warning about it and then add any clump + * info blocks as necessary. + */ + if(inbad){ + inbad = 0; + v = offset-lastclumpend; + if(needtozero){ + zerorange(lastclumpend, v); + sbrollback(&newsha, lastclumpend); + print("corrupt clump data - %#llux+%#llux (%,llud bytes)\n", + lastclumpend, v, v); + } + addcicorrupt(v); + totalcorrupt += v; + nb1 = (minclumps+ncib-1)/ncib; + minclumps += (v+ClumpSize+VtMaxLumpSize-1)/(ClumpSize+VtMaxLumpSize); + nb2 = (minclumps+ncib-1)/ncib; + eoffset -= (nb2-nb1)*arena->blocksize; + } + + if(haveclump(cl.info.score)) + print("warning: duplicate clump %d %V at %#llux+%#d\n", cl.info.type, cl.info.score, offset, n); + + /* + * If clumps use different magic numbers, we don't care. + * We'll just use the first one we find and make the others + * follow suit. + */ + if(arena->clumpmagic == 0){ + print("clump type %d size %d score %V magic %x\n", + cl.info.type, cl.info.size, cl.info.score, magic); + arena->clumpmagic = magic; + if(magic == _ClumpMagic) + arena->version = ArenaVersion4; + else + arena->version = ArenaVersion5; + } + if(magic != arena->clumpmagic) + p32(p, arena->clumpmagic); + if(clumps == 0) + arena->ctime = cl.time; + + /* + * Record the clump, update arena stats, + * grow clump info blocks if needed. + */ + if(verbose > 1) + print("\tclump %d: %d %V at %#llux+%#ux (%d)\n", + clumps, cl.info.type, cl.info.score, offset, n, n); + addcibuf(&cl.info, 0); + if(minclumps%ncib == 0) + eoffset -= arena->blocksize; + minclumps++; + clumps++; + if(cl.encoding != ClumpENone) + arena->diskstats.cclumps++; + arena->diskstats.uncsize += cl.info.uncsize; + arena->wtime = cl.time; + + /* + * Move to next clump. + */ + offset += n; + p += n; + lastclumpend = offset; + }else{ + /* + * Overwrite malformed clump data with zeros later. + * For now, just record whether it needs to be overwritten. 
+ * Bad regions must be of size at least ClumpSize. + * Postponing the overwriting keeps us from writing past + * the end of the arena data (which might be directory data) + * with zeros. + */ + if(!inbad){ + inbad = 1; + needtozero = 0; + if(memcmp(p, zero, ClumpSize) != 0) + needtozero = 1; + p += ClumpSize; + offset += ClumpSize; + nbad++; + }else{ + if(*p != 0) + needtozero = 1; + p++; + offset++; + } + } + } + pageout(); + + if(verbose) + print("readable clumps: %d; min. directory entries: %d\n", + clumps, minclumps); + arena->diskstats.used = lastclumpend - boffset; + leaked = eoffset - lastclumpend; + if(verbose) + print("used from %#llux to %#llux = %,lld (%,lld unused)\n", + boffset, lastclumpend, arena->diskstats.used, leaked); + + /* + * Finish the SHA1 of the old data. + */ + if(arena->diskstats.sealed){ + sbdiskhash(&oldsha, toffset); + readdisk(dbuf, toffset, arena->blocksize); + scorecp(dbuf+arena->blocksize-VtScoreSize, zero); + sbupdate(&oldsha, dbuf, toffset, arena->blocksize); + sbscore(&oldsha, oldscore); + } + + /* + * If we still don't know the clump magic, the arena + * must be empty. It still needs a value, so make + * something up. + */ + if(arena->version == 0) + arena->version = ArenaVersion5; + if(arena->clumpmagic == 0){ + if(arena->version == ArenaVersion4) + arena->clumpmagic = _ClumpMagic; + else{ + do + arena->clumpmagic = fastrand(); + while(arena->clumpmagic==_ClumpMagic + ||arena->clumpmagic==0); + } + head->clumpmagic = arena->clumpmagic; + } + + /* + * Guess at number of clumpinfo blocks to load. + * If we guess high, it's no big deal. If we guess low, + * we'll be forced into rewriting the whole directory. + * Still not such a big deal. 
+ */ + if(clumps == 0 || arena->diskstats.used == totalcorrupt) + goto Nocib; + if(clumps < arena->diskstats.clumps) + clumps = arena->diskstats.clumps; + if(clumps < ncibuf) + clumps = ncibuf; + clumps += totalcorrupt/ + ((arena->diskstats.used - totalcorrupt)/clumps); + clumps += totalcorrupt/2000; + if(clumps < minclumps) + clumps = minclumps; + clumps += ncib-1; + clumps -= clumps%ncib; + + /* + * Can't write into the actual data. + */ + v = offset0 + arena->size - arena->blocksize; + v -= (clumps+ncib-1)/ncib * arena->blocksize; + if(v < lastclumpend){ + v = offset0 + arena->size - arena->blocksize; + clumps = (v-lastclumpend)/arena->blocksize * ncib; + } + + if(clumps < minclumps) + print("cannot happen?\n"); + + /* + * Check clumpinfo blocks against directory we created. + * The tricky part is handling the corrupt sections of arena. + * If possible, we remark just the affected directory entries + * rather than slide everything down. + * + * Allocate clumps+1 blocks and check that we don't need + * the last one at the end. + */ + bci = loadci(offset0, arena, clumps+1); + eci = bci+clumps+1; + bcit = cibuf; + ecit = cibuf+ncibuf; + + smart = 0; /* Somehow the smart code doesn't do corrupt clumps right. */ +Again: + nbad = 0; + ci = bci; + for(cit=bcit; cit<ecit && ci<eci; cit++){ + if(cit->corrupt){ + vlong n, m; + if(smart){ + /* + * If we can, just mark existing entries as corrupt. + */ + n = cit->corrupt; + for(xci=ci; n>0 && xci<eci; xci++) + n -= ClumpSize+xci->size; + if(n > 0 || xci >= eci) + goto Dumb; + printed = 0; + for(; ci<xci; ci++){ + if(verbose && ci->type != VtCorruptType){ + if(!printed){ + print("marking directory %d-%d as corrupt\n", + (int)(ci-bci), (int)(xci-bci)); + printed = 1; + } + print("\ttype=%d size=%d uncsize=%d score=%V\n", + ci->type, ci->size, ci->uncsize, ci->score); + } + ci->type = VtCorruptType; + } + }else{ + Dumb: + print("\trewriting clump directory\n"); + /* + * Otherwise, blaze a new trail. 
+ */ + n = cit->corrupt; + while(n > 0 && ci < eci){ + if(n < ClumpSize) + sysfatal("bad math in clump corrupt"); + if(n <= VtMaxLumpSize+ClumpSize) + m = n; + else{ + m = VtMaxLumpSize+ClumpSize; + if(n-m < ClumpSize) + m -= ClumpSize; + } + ci->type = VtCorruptType; + ci->size = m-ClumpSize; + ci->uncsize = m-ClumpSize; + memset(ci->score, 0, VtScoreSize); + ci++; + n -= m; + } + } + continue; + } + if(clumpinfocmp(&cit->ci, ci) != 0){ + if(verbose && (smart || verbose>1)){ + print("clumpinfo %d\n", (int)(ci-bci)); + print("\twant: %d %d %d %V\n", + cit->ci.type, cit->ci.size, + cit->ci.uncsize, cit->ci.score); + print("\thave: %d %d %d %V\n", + ci->type, ci->size, + ci->uncsize, ci->score); + } + *ci = cit->ci; + nbad++; + } + ci++; + } + if(ci >= eci || cit < ecit){ + print("ran out of space editing existing directory; rewriting\n"); + print("# eci %ld ci %ld ecit %ld cit %ld\n", eci-bci, ci-bci, ecit-bcit, cit-bcit); + assert(smart); /* can't happen second time thru */ + smart = 0; + goto Again; + } + + assert(ci <= eci); + arena->diskstats.clumps = ci-bci; + eoffset = writeci(offset0, arena, bci, ci-bci); + if(sealing && fix) + sbrollback(&newsha, v); +print("eoffset=%lld lastclumpend=%lld diff=%lld unseal=%d\n", eoffset, lastclumpend, eoffset-lastclumpend, unseal); + if(lastclumpend > eoffset) + print("arena directory overwrote blocks! cannot happen!\n"); + free(bci); + if(smart && nbad) + print("arena directory has %d bad or missing entries\n", nbad); +Nocib: + if(eoffset - lastclumpend > 64*1024 && (!arena->diskstats.sealed || unseal)){ + if(arena->diskstats.sealed) + print("unsealing arena\n"); + sealing = 0; + memset(oldscore, 0, VtScoreSize); + } + + /* + * Finish the SHA1 of the new data - only meaningful + * if we've been writing to disk (`fix'). 
+ */ + arena->diskstats.sealed = sealing; + arena->memstats = arena->diskstats; + if(sealing && fix){ + uchar tbuf[MaxDiskBlock]; + + sbdiskhash(&newsha, toffset); + memset(tbuf, 0, sizeof tbuf); + packarena(arena, tbuf); + sbupdate(&newsha, tbuf, toffset, arena->blocksize); + sbscore(&newsha, score); + } +} + +void +dumparena(vlong offset, int anum, Arena *arena) +{ + char buf[1000]; + vlong o, e; + int fd, n; + + snprint(buf, sizeof buf, "%s.%d", dumpbase, anum); + if((fd = create(buf, OWRITE, 0666)) < 0){ + fprint(2, "create %s: %r\n", buf); + return; + } + e = offset+arena->size; + for(o=offset; o<e; o+=n){ + n = 4*M; + if(o+n > e) + n = e-o; + if(pwrite(fd, pagein(o, n), n, o-offset) != n){ + fprint(2, "write %s at %#llux: %r\n", buf, o-offset); + return; + } + } +} + +void +checkarena(vlong offset, int anum) +{ + uchar dbuf[MaxDiskBlock]; + uchar *p, oldscore[VtScoreSize], score[VtScoreSize]; + Arena arena, oarena; + ArenaHead head; + Info *fmt, *fmta; + int sz; + + print("# arena %d: offset %#llux\n", anum, offset); + + if(offset >= partend){ + print("arena offset out of bounds\n"); + return; + } + + guessarena(offset, anum, &head, &arena, oldscore, score); + + if(verbose){ + print("#\tversion=%d name=%s blocksize=%d size=%z", + head.version, head.name, head.blocksize, head.size); + if(head.clumpmagic) + print(" clumpmagic=%#.8ux", head.clumpmagic); + print("\n#\tclumps=%d cclumps=%d used=%,lld uncsize=%,lld\n", + arena.diskstats.clumps, arena.diskstats.cclumps, + arena.diskstats.used, arena.diskstats.uncsize); + print("#\tctime=%t\n", arena.ctime); + print("#\twtime=%t\n", arena.wtime); + if(arena.diskstats.sealed) + print("#\tsealed score=%V\n", score); + } + + if(dumpbase){ + dumparena(offset, anum, &arena); + return; + } + + memset(dbuf, 0, sizeof dbuf); + packarenahead(&head, dbuf); + p = pagein(offset, arena.blocksize); + if(memcmp(dbuf, p, arena.blocksize) != 0){ + print("on-disk arena header incorrect\n"); + showdiffs(dbuf, p, arena.blocksize, + 
arena.version==ArenaVersion4 ? headinfo4 : headinfo5); + } + memmove(p, dbuf, arena.blocksize); + + memset(dbuf, 0, sizeof dbuf); + packarena(&arena, dbuf); + if(arena.diskstats.sealed) + scorecp(dbuf+arena.blocksize-VtScoreSize, score); + p = pagein(offset+arena.size-arena.blocksize, arena.blocksize); + memset(&oarena, 0, sizeof oarena); + unpackarena(&oarena, p); + if(arena.version == ArenaVersion4){ + sz = ArenaSize4; + fmt = tailinfo4; + fmta = tailinfo4a; + }else{ + sz = ArenaSize5; + fmt = tailinfo5; + fmta = tailinfo5a; + } + if(p[sz] == 1){ + fmt = fmta; + if(oarena.diskstats.sealed){ + /* + * some arenas were sealed with the extension + * before we adopted the convention that if it didn't + * add new information it gets dropped. + */ + _packarena(&arena, dbuf, 1); + } + } + if(memcmp(dbuf, p, arena.blocksize-VtScoreSize) != 0){ + print("on-disk arena tail incorrect\n"); + showdiffs(dbuf, p, arena.blocksize-VtScoreSize, fmt); + } + if(arena.diskstats.sealed){ + if(oarena.diskstats.sealed) + if(scorecmp(p+arena.blocksize-VtScoreSize, oldscore) != 0){ + print("on-disk arena seal score incorrect\n"); + print("\tcorrect=%V\n", oldscore); + print("\t disk=%V\n", p+arena.blocksize-VtScoreSize); + } + if(fix && scorecmp(p+arena.blocksize-VtScoreSize, score) != 0){ + print("%ssealing arena%s: %V\n", + oarena.diskstats.sealed ? "re" : "", + scorecmp(oldscore, score) == 0 ? 
+ "" : " after changes", score); + } + } + memmove(p, dbuf, arena.blocksize); + + pageout(); +} + +AMapN* +buildamap(void) +{ + uchar *p; + vlong o; + ArenaHead h; + AMapN *an; + AMap *m; + + an = vtmallocz(sizeof *an); + for(o=ap.arenabase; o<partend; o+=arenasize){ + p = pagein(o, Block); + if(unpackarenahead(&h, p) >= 0){ + an->map = vtrealloc(an->map, (an->n+1)*sizeof an->map[0]); + m = &an->map[an->n++]; + m->start = o; + m->stop = o+h.size; + strcpy(m->name, h.name); + } + } + return an; +} + +void +checkmap(void) +{ + char *s; + uchar *p; + int i, len; + AMapN *an; + Fmt fmt; + + an = buildamap(); + fmtstrinit(&fmt); + fmtprint(&fmt, "%ud\n", an->n); + for(i=0; i<an->n; i++) + fmtprint(&fmt, "%s\t%lld\t%lld\n", + an->map[i].name, an->map[i].start, an->map[i].stop); + s = fmtstrflush(&fmt); + len = strlen(s); + if(len > ap.tabsize){ + print("arena partition map too long: need %z bytes have %z\n", + (vlong)len, (vlong)ap.tabsize); + len = ap.tabsize; + } + + if(ap.tabsize >= 4*M){ /* can't happen - max arenas is 2000 */ + print("arena partition map *way* too long\n"); + return; + } + + p = pagein(ap.tabbase, ap.tabsize); + if(memcmp(p, s, len) != 0){ + print("arena partition map incorrect; rewriting.\n"); + memmove(p, s, len); + } + pageout(); +} + +int mainstacksize = 512*1024; + +void +threadmain(int argc, char **argv) +{ + int mode; + + mode = OREAD; + readonly = 1; + ARGBEGIN{ + case 'U': + unseal = 1; + break; + case 'a': + arenasize = unittoull(EARGF(usage())); + break; + case 'b': + ap.blocksize = unittoull(EARGF(usage())); + break; + case 'f': + fix = 1; + mode = ORDWR; + readonly = 0; + break; + case 'n': + basename = EARGF(usage()); + break; + case 'v': + verbose++; + break; + case 'x': + dumpbase = EARGF(usage()); + break; + default: + usage(); + }ARGEND + + if(argc != 1 && argc != 2) + usage(); + + file = argv[0]; + + ventifmtinstall(); + fmtinstall('z', zfmt); + fmtinstall('t', tfmt); + quotefmtinstall(); + + part = initpart(file, mode|ODIRECT); + 
if(part == nil) + sysfatal("can't open %s: %r", file); + partend = part->size; + + if(isonearena()){ + checkarena(0, -1); + threadexitsall(nil); + } + checkarenas(argc > 1 ? argv[1] : nil); + checkmap(); + threadexitsall(nil); +} + diff --git a/sys/src/cmd/venti/srv/fmtarenas.c b/sys/src/cmd/venti/srv/fmtarenas.c new file mode 100755 index 000000000..f196f22d4 --- /dev/null +++ b/sys/src/cmd/venti/srv/fmtarenas.c @@ -0,0 +1,132 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +void +usage(void) +{ + fprint(2, "usage: fmtarenas [-Z] [-b blocksize] [-a arenasize] name file\n"); + threadexitsall(0); +} + +void +threadmain(int argc, char *argv[]) +{ + int vers; + ArenaPart *ap; + Part *part; + Arena *arena; + u64int addr, limit, asize, apsize; + char *file, *name, aname[ANameSize]; + int i, n, blocksize, tabsize, zero; + + ventifmtinstall(); + statsinit(); + + blocksize = 8 * 1024; + asize = 512 * 1024 *1024; + tabsize = 512 * 1024; /* BUG: should be determine from number of arenas */ + zero = -1; + vers = ArenaVersion5; + ARGBEGIN{ + case 'D': + settrace(EARGF(usage())); + break; + case 'a': + asize = unittoull(EARGF(usage())); + if(asize == TWID64) + usage(); + break; + case 'b': + blocksize = unittoull(EARGF(usage())); + if(blocksize == ~0) + usage(); + if(blocksize > MaxDiskBlock){ + fprint(2, "block size too large, max %d\n", MaxDiskBlock); + threadexitsall("usage"); + } + break; + case '4': + vers = ArenaVersion4; + break; + case 'Z': + zero = 0; + break; + default: + usage(); + break; + }ARGEND + + if(zero == -1){ + if(vers == ArenaVersion4) + zero = 1; + else + zero = 0; + } + + if(argc != 2) + usage(); + + name = argv[0]; + file = argv[1]; + + if(nameok(name) < 0) + sysfatal("illegal name template %s", name); + + part = initpart(file, ORDWR|ODIRECT); + if(part == nil) + sysfatal("can't open partition %s: %r", file); + + if(zero) + zeropart(part, blocksize); + + maxblocksize = blocksize; + initdcache(20*blocksize); + + ap = newarenapart(part, 
blocksize, tabsize); + if(ap == nil) + sysfatal("can't initialize arena: %r"); + + apsize = ap->size - ap->arenabase; + n = apsize / asize; + if(apsize - (n * asize) >= MinArenaSize) + n++; + + fprint(2, "fmtarenas %s: %,d arenas, %,lld bytes storage, %,d bytes for index map\n", + file, n, apsize, ap->tabsize); + + ap->narenas = n; + ap->map = MKNZ(AMap, n); + ap->arenas = MKNZ(Arena*, n); + + addr = ap->arenabase; + for(i = 0; i < n; i++){ + limit = addr + asize; + if(limit >= ap->size || ap->size - limit < MinArenaSize){ + limit = ap->size; + if(limit - addr < MinArenaSize) + sysfatal("bad arena set math: runt arena at %lld,%lld %lld", addr, limit, ap->size); + } + + snprint(aname, ANameSize, "%s%d", name, i); + + if(0) fprint(2, "adding arena %s at [%lld,%lld)\n", aname, addr, limit); + + arena = newarena(part, vers, aname, addr, limit - addr, blocksize); + if(!arena) + fprint(2, "can't make new arena %s: %r", aname); + freearena(arena); + + ap->map[i].start = addr; + ap->map[i].stop = limit; + namecp(ap->map[i].name, aname); + + addr = limit; + } + + if(wbarenapart(ap) < 0) + fprint(2, "can't write back arena partition header for %s: %r\n", file); + + flushdcache(); + threadexitsall(0); +} diff --git a/sys/src/cmd/venti/srv/fmtbloom.c b/sys/src/cmd/venti/srv/fmtbloom.c new file mode 100755 index 000000000..f700d7814 --- /dev/null +++ b/sys/src/cmd/venti/srv/fmtbloom.c @@ -0,0 +1,116 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +Bloom b; + +void +usage(void) +{ + fprint(2, "usage: fmtbloom [-s size] [-n nblocks | -N nhash] file\n"); + threadexitsall(0); +} + +void +threadmain(int argc, char *argv[]) +{ + Part *part; + char *file; + vlong bits, size, size2; + int nhash; + vlong nblocks; + + ventifmtinstall(); + statsinit(); + + size = 0; + nhash = 0; + nblocks = 0; + ARGBEGIN{ + case 'n': + if(nhash || nblocks) + usage(); + nblocks = unittoull(EARGF(usage())); + break; + case 'N': + if(nhash || nblocks) + usage(); + nhash = 
unittoull(EARGF(usage())); + if(nhash > BloomMaxHash){ + fprint(2, "maximum possible is -N %d", BloomMaxHash); + usage(); + } + break; + case 's': + size = unittoull(ARGF()); + if(size == ~0) + usage(); + break; + default: + usage(); + break; + }ARGEND + + if(argc != 1) + usage(); + + file = argv[0]; + + part = initpart(file, ORDWR|ODIRECT); + if(part == nil) + sysfatal("can't open partition %s: %r", file); + + if(size == 0) + size = part->size; + + if(size < 1024*1024) + sysfatal("bloom filter too small"); + + if(size > MaxBloomSize){ + fprint(2, "warning: not using entire %,lld bytes; using only %,lld bytes\n", + size, (vlong)MaxBloomSize); + size = MaxBloomSize; + } + if(size&(size-1)){ + for(size2=1; size2<size; size2*=2) + ; + size = size2/2; + fprint(2, "warning: size not a power of 2; only using %lldMB\n", size/1024/1024); + } + + if(nblocks){ + /* + * no use for more than 32 bits per block + * shoot for less than 64 bits per block + */ + size2 = size; + while(size2*8 >= nblocks*64) + size2 >>= 1; + if(size2 != size){ + size = size2; + fprint(2, "warning: using only %lldMB - not enough blocks to warrant more\n", + size/1024/1024); + } + + /* + * optimal is to use ln 2 times as many hash functions as we have bits per blocks. 
+ */ + bits = (8*size)/nblocks; + nhash = bits*7/10; + if(nhash > BloomMaxHash) + nhash = BloomMaxHash; + } + if(!nhash) + nhash = BloomMaxHash; + if(bloominit(&b, size, nil) < 0) + sysfatal("bloominit: %r"); + b.nhash = nhash; + bits = nhash*10/7; + nblocks = (8*size)/bits; + fprint(2, "fmtbloom: using %lldMB, %d hashes/score, best up to %,lld blocks\n", size/1024/1024, nhash, nblocks); + b.data = vtmallocz(size); + b.part = part; + if(writebloom(&b) < 0) + sysfatal("writing %s: %r", file); + threadexitsall(0); +} diff --git a/sys/src/cmd/venti/srv/fmtindex.c b/sys/src/cmd/venti/srv/fmtindex.c new file mode 100755 index 000000000..2a5148ea4 --- /dev/null +++ b/sys/src/cmd/venti/srv/fmtindex.c @@ -0,0 +1,120 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +void +usage(void) +{ + fprint(2, "usage: fmtindex [-a] config\n"); + threadexitsall(0); +} + +void +threadmain(int argc, char *argv[]) +{ + Config conf; + Index *ix; + ArenaPart *ap; + Arena **arenas; + AMap *amap; + u64int addr; + char *file; + u32int i, j, n, narenas; + int add; + + ventifmtinstall(); + statsinit(); + + add = 0; + ARGBEGIN{ + case 'a': + add = 1; + break; + default: + usage(); + break; + }ARGEND + + if(argc != 1) + usage(); + + file = argv[0]; + + if(runconfig(file, &conf) < 0) + sysfatal("can't initialize config %s: %r", file); + if(conf.index == nil) + sysfatal("no index specified in %s", file); + if(nameok(conf.index) < 0) + sysfatal("illegal index name %s", conf.index); + + narenas = 0; + for(i = 0; i < conf.naparts; i++){ + ap = conf.aparts[i]; + narenas += ap->narenas; + } + + if(add){ + ix = initindex(conf.index, conf.sects, conf.nsects); + if(ix == nil) + sysfatal("can't initialize index %s: %r", conf.index); + }else{ + ix = newindex(conf.index, conf.sects, conf.nsects); + if(ix == nil) + sysfatal("can't create new index %s: %r", conf.index); + + n = 0; + for(i = 0; i < ix->nsects; i++) + n += ix->sects[i]->blocks; + + if(0) fprint(2, "using %ud buckets of %ud; div=%d\n", 
ix->buckets, n, ix->div); + } + amap = MKNZ(AMap, narenas); + arenas = MKNZ(Arena*, narenas); + + addr = IndexBase; + n = 0; + for(i = 0; i < conf.naparts; i++){ + ap = conf.aparts[i]; + for(j = 0; j < ap->narenas; j++){ + if(n >= narenas) + sysfatal("too few slots in index's arena set"); + + arenas[n] = ap->arenas[j]; + if(n < ix->narenas){ + if(arenas[n] != ix->arenas[n]) + sysfatal("mismatched arenas %s and %s at slot %d", + arenas[n]->name, ix->arenas[n]->name, n); + amap[n] = ix->amap[n]; + if(amap[n].start != addr) + sysfatal("mis-located arena %s in index %s", arenas[n]->name, ix->name); + addr = amap[n].stop; + }else{ + amap[n].start = addr; + addr += ap->arenas[j]->size; + amap[n].stop = addr; + namecp(amap[n].name, ap->arenas[j]->name); + if(0) fprint(2, "add arena %s at [%lld,%lld)\n", + amap[n].name, amap[n].start, amap[n].stop); + } + + n++; + } + } + if(0){ + fprint(2, "configured index=%s with arenas=%d and storage=%lld\n", + ix->name, n, addr - IndexBase); + fprint(2, "\tbuckets=%d\n", + ix->buckets); + } + fprint(2, "fmtindex: %,d arenas, %,d index buckets, %,lld bytes storage\n", + n, ix->buckets, addr-IndexBase); + + ix->amap = amap; + ix->arenas = arenas; + ix->narenas = narenas; + + if(wbindex(ix) < 0) + fprint(2, "can't write back arena partition header for %s: %r\n", file); + + threadexitsall(0); +} diff --git a/sys/src/cmd/venti/srv/fmtisect.c b/sys/src/cmd/venti/srv/fmtisect.c new file mode 100755 index 000000000..28b88de61 --- /dev/null +++ b/sys/src/cmd/venti/srv/fmtisect.c @@ -0,0 +1,83 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +void +usage(void) +{ + fprint(2, "usage: fmtisect [-Z] [-b blocksize] name file\n"); + threadexitsall(0); +} + +void +threadmain(int argc, char *argv[]) +{ + int vers; + ISect *is; + Part *part; + char *file, *name; + int blocksize, setsize, zero; + + ventifmtinstall(); + statsinit(); + + blocksize = 8 * 1024; + setsize = 512 * 1024; + zero = -1; + vers = ISectVersion2; + ARGBEGIN{ + case 'b': 
+ blocksize = unittoull(ARGF()); + if(blocksize == ~0) + usage(); + if(blocksize > MaxDiskBlock){ + fprint(2, "block size too large, max %d\n", MaxDiskBlock); + threadexitsall("usage"); + } + break; + case '1': + vers = ISectVersion1; + break; + case 'Z': + zero = 0; + break; + default: + usage(); + break; + }ARGEND + + if(zero == -1){ + if(vers == ISectVersion1) + zero = 1; + else + zero = 0; + } + + if(argc != 2) + usage(); + + name = argv[0]; + file = argv[1]; + + if(nameok(name) < 0) + sysfatal("illegal name %s", name); + + part = initpart(file, ORDWR|ODIRECT); + if(part == nil) + sysfatal("can't open partition %s: %r", file); + + if(zero) + zeropart(part, blocksize); + + is = newisect(part, vers, name, blocksize, setsize); + if(is == nil) + sysfatal("can't initialize new index: %r"); + + fprint(2, "fmtisect %s: %,d buckets of %,d entries, %,d bytes for index map\n", + file, is->blocks, is->buckmax, setsize); + + if(wbisect(is) < 0) + fprint(2, "can't write back index section header for %s: %r\n", file); + + threadexitsall(0); +} diff --git a/sys/src/cmd/venti/srv/fns.h b/sys/src/cmd/venti/srv/fns.h new file mode 100755 index 000000000..398562c27 --- /dev/null +++ b/sys/src/cmd/venti/srv/fns.h @@ -0,0 +1,228 @@ +/* + * sorted by 4,/^$/|sort -bd +1 + */ +int addarena(Arena *name); +void addstat(int, int); +void addstat2(int, int, int, int); +ZBlock *alloczblock(u32int size, int zeroed, uint alignment); +Arena *amapitoa(Index *index, u64int a, u64int *aa); +Arena *amapitoag(Index *index, u64int a, u64int *gstart, u64int *glimit, int *g); +u64int arenadirsize(Arena *arena, u32int clumps); +int arenatog(Arena *arena, u64int aa, u64int *gstart, u64int *glimit, int *g); +void arenaupdate(Arena *arena, u32int size, u8int *score); +int asumload(Arena *arena, int g, IEntry *entries, int maxentries); +void backsumarena(Arena *arena); +void binstats(long (*fn)(Stats *s0, Stats *s1, void*), void *arg, long t0, long t1, Statbin *bin, int nbin); +int bloominit(Bloom*, vlong, 
uchar*); +int bucklook(u8int*, int, u8int*, int); +u32int buildbucket(Index *ix, IEStream *ies, IBucket *ib, uint); +void checkdcache(void); +void checklumpcache(void); +int clumpinfoeq(ClumpInfo *c, ClumpInfo *d); +int clumpinfoeq(ClumpInfo *c, ClumpInfo *d); +u32int clumpmagic(Arena *arena, u64int aa); +uint countbits(uint n); +int delarena(Arena *arena); +void delaykickicache(void); +void delaykickround(Round*); +void delaykickroundproc(void*); +void dirtydblock(DBlock*, int); +void diskaccess(int); +void disksched(void); +void *emalloc(ulong); +void emptydcache(void); +void emptyicache(void); +void emptylumpcache(void); +void *erealloc(void *, ulong); +char *estrdup(char*); +void *ezmalloc(ulong); +Arena *findarena(char *name); +int flushciblocks(Arena *arena); +void flushdcache(void); +void flushicache(void); +int flushpart(Part*); +void flushqueue(void); +void fmtzbinit(Fmt *f, ZBlock *b); +void freearena(Arena *arena); +void freearenapart(ArenaPart *ap, int freearenas); +void freeiestream(IEStream *ies); +void freeifile(IFile *f); +void freeisect(ISect *is); +void freeindex(Index *index); +void freepart(Part *part); +void freezblock(ZBlock *b); +DBlock *_getdblock(Part *part, u64int addr, int mode, int load); +DBlock *getdblock(Part *part, u64int addr, int mode); +u32int hashbits(u8int *score, int nbits); +char *hargstr(HConnect*, char*, char*); +vlong hargint(HConnect*, char*, vlong); +int hdebug(HConnect*); +int hdisk(HConnect*); +int hnotfound(HConnect*); +int hproc(HConnect*); +int hsethtml(HConnect*); +int hsettext(HConnect*); +int httpdinit(char *address, char *webroot); +int iaddrcmp(IAddr *ia1, IAddr *ia2); +IEntry* icachedirty(u32int, u32int, u64int); +ulong icachedirtyfrac(void); +void icacheclean(IEntry*); +int icachelookup(u8int *score, int type, IAddr *ia); +AState icachestate(void); +int ientrycmp(const void *vie1, const void *vie2); +char *ifileline(IFile *f); +int ifilename(IFile *f, char *dst); +int ifileu32int(IFile *f, u32int *r); +int 
inbloomfilter(Bloom*, u8int*); +int indexsect(Index *ix, u8int *score); +int indexsect0(Index *ix, u32int buck); +Arena *initarena(Part *part, u64int base, u64int size, u32int blocksize); +ArenaPart *initarenapart(Part *part); +int initarenasum(void); +void initbloomfilter(Index*); +void initdcache(u32int mem); +void initicache(u32int mem); +void initicachewrite(void); +IEStream *initiestream(Part *part, u64int off, u64int clumps, u32int size); +ISect *initisect(Part *part); +Index *initindex(char *name, ISect **sects, int n); +void initlumpcache(u32int size, u32int nblocks); +int initlumpqueues(int nq); +Part* initpart(char *name, int mode); +void initround(Round*, char*, int); +int initventi(char *config, Config *conf); +void insertlump(Lump *lump, Packet *p); +int insertscore(u8int *score, IAddr *ia, int state, AState *as); +void kickdcache(void); +void kickicache(void); +void kickround(Round*, int wait); +int loadbloom(Bloom*); +ZBlock *loadclump(Arena *arena, u64int aa, int blocks, Clump *cl, u8int *score, int verify); +DBlock *loadibucket(Index *index, u8int *score, ISect **is, u32int *buck, IBucket *ib); +int loadientry(Index *index, u8int *score, int type, IEntry *ie); +void logerr(int severity, char *fmt, ...); +Lump *lookuplump(u8int *score, int type); +int lookupscore(u8int *score, int type, IAddr *ia); +int maparenas(AMap *am, Arena **arenas, int n, char *what); +void markbloomfilter(Bloom*, u8int*); +uint msec(void); +int namecmp(char *s, char *t); +void namecp(char *dst, char *src); +int nameok(char *name); +void needmainindex(void); +void needzeroscore(void); +Arena *newarena(Part *part, u32int, char *name, u64int base, u64int size, u32int blocksize); +ArenaPart *newarenapart(Part *part, u32int blocksize, u32int tabsize); +ISect *newisect(Part *part, u32int vers, char *name, u32int blocksize, u32int tabsize); +Index *newindex(char *name, ISect **sects, int n); +u32int now(void); +int okamap(AMap *am, int n, u64int start, u64int stop, char *what); 
+int okibucket(IBucket*, ISect*); +int outputamap(Fmt *f, AMap *am, int n); +int outputindex(Fmt *f, Index *ix); +int _packarena(Arena *arena, u8int *buf, int); +int packarena(Arena *arena, u8int *buf); +int packarenahead(ArenaHead *head, u8int *buf); +int packarenapart(ArenaPart *as, u8int *buf); +void packbloomhead(Bloom*, u8int*); +int packclump(Clump *c, u8int *buf, u32int); +void packclumpinfo(ClumpInfo *ci, u8int *buf); +void packibucket(IBucket *b, u8int *buf, u32int magic); +void packientry(IEntry *i, u8int *buf); +int packisect(ISect *is, u8int *buf); +void packmagic(u32int magic, u8int *buf); +ZBlock *packet2zblock(Packet *p, u32int size); +int parseamap(IFile *f, AMapN *amn); +int parseindex(IFile *f, Index *ix); +void partblocksize(Part *part, u32int blocksize); +int partifile(IFile *f, Part *part, u64int start, u32int size); +void printarenapart(int fd, ArenaPart *ap); +void printarena(int fd, Arena *arena); +void printindex(int fd, Index *ix); +void printstats(void); +void putdblock(DBlock *b); +void putlump(Lump *b); +int queuewrite(Lump *b, Packet *p, int creator, uint ms); +u32int readarena(Arena *arena, u64int aa, u8int *buf, long n); +int readarenamap(AMapN *amn, Part *part, u64int base, u32int size); +Bloom *readbloom(Part*); +int readclumpinfo(Arena *arena, int clump, ClumpInfo *ci); +int readclumpinfos(Arena *arena, int clump, ClumpInfo *cis, int n); +ZBlock *readfile(char *name); +int readifile(IFile *f, char *name); +Packet *readlump(u8int *score, int type, u32int size, int *cached); +int readpart(Part *part, u64int addr, u8int *buf, u32int n); +int resetbloom(Bloom*); +int runconfig(char *config, Config*); +int scorecmp(u8int *, u8int *); +void scoremem(u8int *score, u8int *buf, int size); +void setatailstate(AState*); +void seterr(int severity, char *fmt, ...); +void setstat(int, long); +void settrace(char *type); +u64int sortrawientries(Index *ix, Part *tmp, u64int *tmpoff, Bloom *bloom); +void startbloomproc(Bloom*); +Memimage* 
statgraph(Graph *g); +void statsinit(void); +int storeclump(Index *index, ZBlock *b, u8int *score, int type, u32int creator, IAddr *ia); +int storeientry(Index *index, IEntry *m); +int strscore(char *s, u8int *score); +int stru32int(char *s, u32int *r); +int stru64int(char *s, u64int *r); +void sumarena(Arena *arena); +int syncarena(Arena *arena, u32int n, int zok, int fix); +int syncindex(Index *ix); +void trace(char *type, char*, ...); +void traceinit(void); +int u64log2(u64int v); +u64int unittoull(char *s); +int unpackarena(Arena *arena, u8int *buf); +int unpackarenahead(ArenaHead *head, u8int *buf); +int unpackarenapart(ArenaPart *as, u8int *buf); +int unpackbloomhead(Bloom*, u8int*); +int unpackclump(Clump *c, u8int *buf, u32int); +void unpackclumpinfo(ClumpInfo *ci, u8int *buf); +void unpackibucket(IBucket *b, u8int *buf, u32int magic); +void unpackientry(IEntry *i, u8int *buf); +int unpackisect(ISect *is, u8int *buf); +u32int unpackmagic(u8int *buf); +void ventifmtinstall(void); +void vtloghdump(Hio*, VtLog*); +void vtloghlist(Hio*); +int vtproc(void(*)(void*), void*); +int vttypevalid(int type); +void waitforkick(Round*); +int wbarena(Arena *arena); +int wbarenahead(Arena *arena); +int wbarenamap(AMap *am, int n, Part *part, u64int base, u64int size); +int wbarenapart(ArenaPart *ap); +void wbbloomhead(Bloom*); +int wbisect(ISect *is); +int wbindex(Index *ix); +int whackblock(u8int *dst, u8int *src, int ssize); +u64int writeaclump(Arena *a, Clump *c, u8int *clbuf); +u32int writearena(Arena *arena, u64int aa, u8int *clbuf, u32int n); +int writebloom(Bloom*); +int writeclumpinfo(Arena *arean, int clump, ClumpInfo *ci); +int writepng(Hio*, Memimage*); +u64int writeiclump(Index *ix, Clump *c, u8int *clbuf); +int writelump(Packet *p, u8int *score, int type, u32int creator, uint ms); +int writepart(Part *part, u64int addr, u8int *buf, u32int n); +int writeqlump(Lump *u, Packet *p, int creator, uint ms); +Packet *zblock2packet(ZBlock *zb, u32int size); +void 
zeropart(Part *part, int blocksize); + +/* +#pragma varargck argpos sysfatal 1 +#pragma varargck argpos logerr 2 +#pragma varargck argpos SetErr 2 +*/ + +#define scorecmp(h1,h2) memcmp((h1),(h2),VtScoreSize) +#define scorecp(h1,h2) memmove((h1),(h2),VtScoreSize) + +#define MK(t) ((t*)emalloc(sizeof(t))) +#define MKZ(t) ((t*)ezmalloc(sizeof(t))) +#define MKN(t,n) ((t*)emalloc((n)*sizeof(t))) +#define MKNZ(t,n) ((t*)ezmalloc((n)*sizeof(t))) +#define MKNA(t,at,n) ((t*)emalloc(sizeof(t) + (n)*sizeof(at))) diff --git a/sys/src/cmd/venti/srv/graph.c b/sys/src/cmd/venti/srv/graph.c new file mode 100755 index 000000000..cbad1ada2 --- /dev/null +++ b/sys/src/cmd/venti/srv/graph.c @@ -0,0 +1,197 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +enum +{ + Top = 1, + Bottom = 1, + Left = 40, + Right = 0, + MinWidth = Left+Right+2, + MinHeight = Top+Bottom+2, + DefaultWidth = Left+Right+500, + DefaultHeight = Top+Bottom+40 +}; + +QLock memdrawlock; +static Memsubfont *smallfont; +static Memimage *black; +static Memimage *blue; +static Memimage *red; +static Memimage *lofill[6]; +static Memimage *hifill[6]; +static Memimage *grid; + +static ulong fill[] = { + 0xFFAAAAFF, 0xBB5D5DFF, /* peach */ + DPalegreygreen, DPurpleblue, /* aqua */ + DDarkyellow, DYellowgreen, /* yellow */ + DMedgreen, DDarkgreen, /* green */ + 0x00AAFFFF, 0x0088CCFF, /* blue */ + 0xCCCCCCFF, 0x888888FF, /* grey */ +}; + +Memimage* +allocrepl(ulong color) +{ + Memimage *m; + + m = allocmemimage(Rect(0,0,1,1), RGB24); + memfillcolor(m, color); + m->flags |= Frepl; + m->clipr = Rect(-1000000, -1000000, 1000000, 1000000); + return m; +} + +static void +ginit(void) +{ + static int first = 1; + int i; + + if(!first) + return; + + first = 0; + memimageinit(); +#ifdef PLAN9PORT + smallfont = openmemsubfont(unsharp("#9/font/lucsans/lstr.10")); +#else + smallfont = openmemsubfont("/lib/font/bit/lucidasans/lstr.10"); +#endif + black = memblack; + blue = allocrepl(DBlue); + red = allocrepl(DRed); + grid = 
allocrepl(0x77777777); + for(i=0; i<nelem(fill)/2 && i<nelem(lofill) && i<nelem(hifill); i++){ + lofill[i] = allocrepl(fill[2*i]); + hifill[i] = allocrepl(fill[2*i+1]); + } +} + +static void +mklabel(char *str, int v) +{ + if(v < 0){ + v = -v; + *str++ = '-'; + } + if(v < 10000) + sprint(str, "%d", v); + else if(v < 10000000) + sprint(str, "%dk", v/1000); + else + sprint(str, "%dM", v/1000000); +} + +static void +drawlabel(Memimage *m, Point p, int n) +{ + char buf[30]; + Point w; + + mklabel(buf, n); + w = memsubfontwidth(smallfont, buf); + memimagestring(m, Pt(p.x-5-w.x, p.y), memblack, ZP, smallfont, buf); +} + +static int +scalept(int val, int valmin, int valmax, int ptmin, int ptmax) +{ + if(val <= valmin) + val = valmin; + if(val >= valmax) + val = valmax; + if(valmax == valmin) + valmax++; + return ptmin + (vlong)(val-valmin)*(ptmax-ptmin)/(valmax-valmin); +} + +Memimage* +statgraph(Graph *g) +{ + int i, nbin, x, lo, hi, min, max, first; + Memimage *m; + Rectangle r; + Statbin *b, bin[2000]; /* 32 kB, but whack is worse */ + + needstack(8192); /* double check that bin didn't kill us */ + + if(g->wid <= MinWidth) + g->wid = DefaultWidth; + if(g->ht <= MinHeight) + g->ht = DefaultHeight; + if(g->wid > nelem(bin)) + g->wid = nelem(bin); + if(g->fill < 0) + g->fill = ((uint)(uintptr)g->arg>>8)%nelem(lofill); + if(g->fill > nelem(lofill)) + g->fill %= nelem(lofill); + + nbin = g->wid - (Left+Right); + binstats(g->fn, g->arg, g->t0, g->t1, bin, nbin); + + /* + * compute bounds + */ + min = g->min; + max = g->max; + if(min < 0 || max <= min){ + min = max = 0; + first = 1; + for(i=0; i<nbin; i++){ + b = &bin[i]; + if(b->nsamp == 0) + continue; + if(first || b->min < min) + min = b->min; + if(first || b->max > max) + max = b->max; + first = 0; + } + } + + qlock(&memdrawlock); + ginit(); + if(smallfont==nil || black==nil || blue==nil || red==nil || hifill==nil || lofill==nil){ + werrstr("graphics initialization failed: %r"); + qunlock(&memdrawlock); + return nil; + } 
+ + /* fresh image */ + m = allocmemimage(Rect(0,0,g->wid,g->ht), ABGR32); + if(m == nil){ + qunlock(&memdrawlock); + return nil; + } + r = Rect(Left, Top, g->wid-Right, g->ht-Bottom); + memfillcolor(m, DTransparent); + + /* x axis */ + memimagedraw(m, Rect(r.min.x, r.max.y, r.max.x, r.max.y+1), black, ZP, memopaque, ZP, S); + + /* y labels */ + drawlabel(m, r.min, max); + if(min != 0) + drawlabel(m, Pt(r.min.x, r.max.y-smallfont->height), min); + + /* actual data */ + for(i=0; i<nbin; i++){ + b = &bin[i]; + if(b->nsamp == 0) + continue; + lo = scalept(b->min, min, max, r.max.y, r.min.y); + hi = scalept(b->max, min, max, r.max.y, r.min.y); + x = r.min.x+i; + hi-=2; + memimagedraw(m, Rect(x, hi, x+1,lo), hifill[g->fill%nelem(hifill)], ZP, memopaque, ZP, S); + memimagedraw(m, Rect(x, lo, x+1, r.max.y), lofill[g->fill%nelem(lofill)], ZP, memopaque, ZP, S); + } + + if(bin[nbin-1].nsamp) + drawlabel(m, Pt(r.max.x, r.min.y+(Dy(r)-smallfont->height)/2), bin[nbin-1].avg); + qunlock(&memdrawlock); + return m; +} diff --git a/sys/src/cmd/venti/srv/hdisk.c b/sys/src/cmd/venti/srv/hdisk.c new file mode 100755 index 000000000..8cf937d1b --- /dev/null +++ b/sys/src/cmd/venti/srv/hdisk.c @@ -0,0 +1,696 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" +#include "whack.h" + +static int disksummary(HConnect*); +static int diskarenapart(HConnect*, char*, Part*); +static int diskbloom(HConnect*, char*, Part*); +static int diskisect(HConnect*, char*, Part*); + +int +hdisk(HConnect *c) +{ + char *disk, *type; + Part *p; + int ret; + + if(hsethtml(c) < 0) + return -1; + + disk = hargstr(c, "disk", ""); + if(!disk[0]) + return disksummary(c); + if((p = initpart(disk, OREAD)) == nil){ + hprint(&c->hout, "open %s: %r", disk); + return 0; + } + + type = hargstr(c, "type", ""); + switch(type[0]){ + case 'a': + ret = diskarenapart(c, disk, p); + break; + case 'b': + ret = diskbloom(c, disk, p); + break; + case 'i': + ret = diskisect(c, disk, p); + break; + default: + 
hprint(&c->hout, "unknown disk type %s", type); + return 0; + } + freepart(p); + return ret; +} + +static int +disksummary(HConnect *c) +{ + int i; + Index *ix; + Part *p; + + hprint(&c->hout, "<h1>venti disks</h1>\n"); + hprint(&c->hout, "<pre>\n"); + ix = mainindex; + p = nil; + for(i=0; i<ix->narenas; i++){ + if(ix->arenas[i]->part == p) + continue; + p = ix->arenas[i]->part; + hprint(&c->hout, "<a href=\"/disk?disk=%s&type=a\">%s</a> %s\n", p->name, p->name, ix->arenas[i]->name); + } + hprint(&c->hout, "\n"); + p = nil; + for(i=0; i<ix->nsects; i++){ + if(ix->sects[i]->part == p) + continue; + p = ix->sects[i]->part; + hprint(&c->hout, "<a href=\"/disk?disk=%s&type=i\">%s</a> %s\n", p->name, p->name, ix->sects[i]->name); + } + hprint(&c->hout, "\n"); + if(ix->bloom){ + p = ix->bloom->part; + hprint(&c->hout, "<a href=\"/disk?disk=%s&type=b\">%s</a> %s\n", p->name, p->name, "bloom filter"); + } + return 0; +} + +static char* +readap(Part *p, ArenaPart *ap) +{ + uchar *blk; + char *table; + + blk = vtmalloc(8192); + if(readpart(p, PartBlank, blk, 8192) != 8192) + return nil; + if(unpackarenapart(ap, blk) < 0){ + werrstr("corrupt arena part header: %r"); + return nil; + } + vtfree(blk); + ap->tabbase = (PartBlank+HeadSize+ap->blocksize-1)&~(ap->blocksize-1); + ap->tabsize = ap->arenabase - ap->tabbase; + table = vtmalloc(ap->tabsize+1); + if(readpart(p, ap->tabbase, (uchar*)table, ap->tabsize) != ap->tabsize){ + werrstr("reading arena part directory: %r"); + return nil; + } + table[ap->tabsize] = 0; + return table; +} + +static int +xfindarena(char *table, char *name, vlong *start, vlong *end) +{ + int i, nline; + char *p, *q, *f[4], line[256]; + + nline = atoi(table); + p = strchr(table, '\n'); + if(p) + p++; + for(i=0; i<nline; i++){ + if(p == nil) + break; + q = strchr(p, '\n'); + if(q) + *q++ = 0; + if(strlen(p) >= sizeof line){ + p = q; + continue; + } + strcpy(line, p); + memset(f, 0, sizeof f); + if(tokenize(line, f, nelem(f)) < 3){ + p = q; + continue; + } 
+ if(strcmp(f[0], name) == 0){ + *start = strtoull(f[1], 0, 0); + *end = strtoull(f[2], 0, 0); + return 0; + } + p = q; + } + return -1; +} + +static void +diskarenatable(HConnect *c, char *disk, char *table) +{ + char *p, *q; + int i, nline; + char *f[4], line[256], base[256]; + + hprint(&c->hout, "<h2>table</h2>\n"); + hprint(&c->hout, "<pre>\n"); + nline = atoi(table); + snprint(base, sizeof base, "/disk?disk=%s&type=a", disk); + p = strchr(table, '\n'); + if(p) + p++; + for(i=0; i<nline; i++){ + if(p == nil){ + hprint(&c->hout, "<b><i>unexpected end of table</i></b>\n"); + break; + } + q = strchr(p, '\n'); + if(q) + *q++ = 0; + if(strlen(p) >= sizeof line){ + hprint(&c->hout, "%s\n", p); + p = q; + continue; + } + strcpy(line, p); + memset(f, 0, sizeof f); + if(tokenize(line, f, 3) < 3){ + hprint(&c->hout, "%s\n", p); + p = q; + continue; + } + p = q; + hprint(&c->hout, "<a href=\"%s&arena=%s\">%s</a> %s %s\n", + base, f[0], f[0], f[1], f[2]); + } + hprint(&c->hout, "</pre>\n"); +} + +static char* +fmttime(char *buf, ulong time) +{ + strcpy(buf, ctime(time)); + buf[28] = 0; + return buf; +} + + +static int diskarenaclump(HConnect*, Arena*, vlong, char*); +static int diskarenatoc(HConnect*, Arena*); + +static int +diskarenapart(HConnect *c, char *disk, Part *p) +{ + char *arenaname; + ArenaPart ap; + ArenaHead head; + Arena arena; + char *table; + char *score; + char *clump; + uchar *blk; + vlong start, end, off; + char tbuf[60]; + + hprint(&c->hout, "<h1>arena partition %s</h1>\n", disk); + + if((table = readap(p, &ap)) == nil){ + hprint(&c->hout, "%r\n"); + goto out; + } + + hprint(&c->hout, "<pre>\n"); + hprint(&c->hout, "version=%d blocksize=%d base=%d\n", + ap.version, ap.blocksize, ap.arenabase); + hprint(&c->hout, "</pre>\n"); + + arenaname = hargstr(c, "arena", ""); + if(arenaname[0] == 0){ + diskarenatable(c, disk, table); + goto out; + } + + if(xfindarena(table, arenaname, &start, &end) < 0){ + hprint(&c->hout, "no such arena %s\n", arenaname); + goto 
out; + } + + hprint(&c->hout, "<h2>arena %s</h2>\n", arenaname); + hprint(&c->hout, "<pre>start=%#llx end=%#llx<pre>\n", start, end); + if(end < start || end - start < HeadSize){ + hprint(&c->hout, "bad size %#llx\n", end - start); + goto out; + } + + // read arena header, tail + blk = vtmalloc(HeadSize); + if(readpart(p, start, blk, HeadSize) != HeadSize){ + hprint(&c->hout, "reading header: %r\n"); + vtfree(blk); + goto out; + } + if(unpackarenahead(&head, blk) < 0){ + hprint(&c->hout, "corrupt arena header: %r\n"); + // hhex(blk, HeadSize); + vtfree(blk); + goto out; + } + vtfree(blk); + + hprint(&c->hout, "head:\n<pre>\n"); + hprint(&c->hout, "version=%d name=%s blocksize=%d size=%#llx clumpmagic=%#ux\n", + head.version, head.name, head.blocksize, head.size, + head.clumpmagic); + hprint(&c->hout, "</pre><br><br>\n"); + + if(head.blocksize > MaxIoSize || head.blocksize >= end - start){ + hprint(&c->hout, "corrupt block size %d\n", head.blocksize); + goto out; + } + + blk = vtmalloc(head.blocksize); + if(readpart(p, end - head.blocksize, blk, head.blocksize) < 0){ + hprint(&c->hout, "reading tail: %r\n"); + vtfree(blk); + goto out; + } + memset(&arena, 0, sizeof arena); + arena.part = p; + arena.blocksize = head.blocksize; + arena.clumpmax = head.blocksize / ClumpInfoSize; + arena.base = start + head.blocksize; + arena.size = end - start - 2 * head.blocksize; + if(unpackarena(&arena, blk) < 0){ + vtfree(blk); + goto out; + } + scorecp(arena.score, blk+head.blocksize - VtScoreSize); + + vtfree(blk); + + hprint(&c->hout, "tail:\n<pre>\n"); + hprint(&c->hout, "version=%d name=%s\n", arena.version, arena.name); + hprint(&c->hout, "ctime=%d %s\n", arena.ctime, fmttime(tbuf, arena.ctime)); + hprint(&c->hout, "wtime=%d %s\n", arena.wtime, fmttime(tbuf, arena.wtime)); + hprint(&c->hout, "clumpmagic=%#ux\n", arena.clumpmagic); + hprint(&c->hout, "score %V\n", arena.score); + hprint(&c->hout, "diskstats:\n"); + hprint(&c->hout, "\tclumps=%,d cclumps=%,d used=%,lld 
uncsize=%,lld sealed=%d\n", + arena.diskstats.clumps, arena.diskstats.cclumps, + arena.diskstats.used, arena.diskstats.uncsize, + arena.diskstats.sealed); + hprint(&c->hout, "memstats:\n"); + hprint(&c->hout, "\tclumps=%,d cclumps=%,d used=%,lld uncsize=%,lld sealed=%d\n", + arena.memstats.clumps, arena.memstats.cclumps, + arena.memstats.used, arena.memstats.uncsize, + arena.memstats.sealed); + if(arena.clumpmax == 0){ + hprint(&c->hout, "bad clumpmax\n"); + goto out; + } + + score = hargstr(c, "score", ""); + clump = hargstr(c, "clump", ""); + + if(clump[0]){ + off = strtoull(clump, 0, 0); + diskarenaclump(c, &arena, off, score[0] ? score : nil); + }else if(score[0]){ + diskarenaclump(c, &arena, -1, score); + }else{ + diskarenatoc(c, &arena); + } + +out: + free(table); + return 0; +} + +static vlong +findintoc(HConnect *c, Arena *arena, uchar *score) +{ + uchar *blk; + int i; + vlong off; + vlong coff; + ClumpInfo ci; + + blk = vtmalloc(arena->blocksize); + off = arena->base + arena->size; + coff = 0; + for(i=0; i<arena->memstats.clumps; i++){ + if(i%arena->clumpmax == 0){ + off -= arena->blocksize; + if(readpart(arena->part, off, blk, arena->blocksize) != arena->blocksize){ + if(c) + hprint(&c->hout, "<i>clump info directory at %#llx: %r</i>\n<br>\n", + off); + break; + } + } + unpackclumpinfo(&ci, blk+(i%arena->clumpmax)*ClumpInfoSize); + if(scorecmp(ci.score, score) == 0){ + vtfree(blk); + return coff; + } + coff += ClumpSize + ci.size; + } + vtfree(blk); + return -1; +} + + +static int +diskarenatoc(HConnect *c, Arena *arena) +{ + uchar *blk; + int i; + vlong off; + vlong coff; + ClumpInfo ci; + char base[512]; + int cib; + + snprint(base, sizeof base, "/disk?disk=%s&type=a&arena=%s", + arena->part->name, arena->name); + + blk = vtmalloc(arena->blocksize); + off = arena->base + arena->size; + hprint(&c->hout, "<h2>table of contents</h2>\n"); + hprint(&c->hout, "<pre>\n"); + hprint(&c->hout, "%5s %6s %7s %s\n", "type", "size", "uncsize", "score"); + coff = 0; + 
cib = hargint(c, "cib", 0); + + for(i=0; i<arena->memstats.clumps; i++){ + if(i%arena->clumpmax == 0){ + off -= arena->blocksize; + if(readpart(arena->part, off, blk, arena->blocksize) != arena->blocksize){ + hprint(&c->hout, "<i>clump info directory at %#llx: %r</i>\n<br>\n", + off); + i += arena->clumpmax-1; + coff = -1; + continue; + } + } + unpackclumpinfo(&ci, blk+(i%arena->clumpmax)*ClumpInfoSize); + if(i/arena->clumpmax == cib || i%arena->clumpmax == 0){ + hprint(&c->hout, "%5d %6d %7d %V", + ci.type, ci.size, ci.uncsize, ci.score); + if(coff >= 0) + hprint(&c->hout, " at <a href=\"%s&clump=%#llx&score=%V\">%#llx</a>", + base, coff, ci.score, coff); + if(i/arena->clumpmax != cib) + hprint(&c->hout, " <font size=-1><a href=\"%s&cib=%d\">more</a></font>", base, i/arena->clumpmax); + hprint(&c->hout, "\n"); + } + if(coff >= 0) + coff += ClumpSize + ci.size; + } + hprint(&c->hout, "</pre>\n"); + return 0; +} + +#define U32GET(p) ((u32int)(((p)[0]<<24)|((p)[1]<<16)|((p)[2]<<8)|(p)[3])) +static int +diskarenaclump(HConnect *c, Arena *arena, vlong off, char *scorestr) +{ + uchar *blk, *blk2; + Clump cl; + char err[ERRMAX]; + uchar xscore[VtScoreSize], score[VtScoreSize]; + Unwhack uw; + int n; + + if(scorestr){ + if(vtparsescore(scorestr, nil, score) < 0){ + hprint(&c->hout, "bad score %s: %r\n", scorestr); + return -1; + } + if(off < 0){ + off = findintoc(c, arena, score); + if(off < 0){ + hprint(&c->hout, "score %V not found in arena %s\n", score, arena->name); + return -1; + } + hprint(&c->hout, "score %V at %#llx\n", score, off); + } + }else + memset(score, 0, sizeof score); + + if(off < 0){ + hprint(&c->hout, "bad offset %#llx\n", off); + return -1; + } + + off += arena->base; + + blk = vtmalloc(ClumpSize + VtMaxLumpSize); + if(readpart(arena->part, off, blk, ClumpSize + VtMaxLumpSize) != ClumpSize + VtMaxLumpSize){ + hprint(&c->hout, "reading at %#llx: %r\n", off); + vtfree(blk); + return -1; + } + + if(unpackclump(&cl, blk, arena->clumpmagic) < 0){ + 
hprint(&c->hout, "unpackclump: %r\n<br>"); + rerrstr(err, sizeof err); + if(strstr(err, "magic")){ + hprint(&c->hout, "trying again with magic=%#ux<br>\n", U32GET(blk)); + if(unpackclump(&cl, blk, U32GET(blk)) < 0){ + hprint(&c->hout, "unpackclump: %r\n<br>\n"); + goto error; + } + }else + goto error; + } + + hprint(&c->hout, "<pre>type=%d size=%d uncsize=%d score=%V\n", cl.info.type, cl.info.size, cl.info.uncsize, cl.info.score); + hprint(&c->hout, "encoding=%d creator=%d time=%d %s</pre>\n", cl.encoding, cl.creator, cl.time, fmttime(err, cl.time)); + + if(cl.info.type == VtCorruptType) + hprint(&c->hout, "clump is marked corrupt<br>\n"); + + if(cl.info.size >= VtMaxLumpSize){ + hprint(&c->hout, "clump too big\n"); + goto error; + } + + switch(cl.encoding){ + case ClumpECompress: + blk2 = vtmalloc(VtMaxLumpSize); + unwhackinit(&uw); + n = unwhack(&uw, blk2, cl.info.uncsize, blk+ClumpSize, cl.info.size); + if(n < 0){ + hprint(&c->hout, "decompression failed\n"); + vtfree(blk2); + goto error; + } + if(n != cl.info.uncsize){ + hprint(&c->hout, "got wrong amount: %d wanted %d\n", n, cl.info.uncsize); + // hhex(blk2, n); + vtfree(blk2); + goto error; + } + scoremem(xscore, blk2, cl.info.uncsize); + vtfree(blk2); + break; + case ClumpENone: + scoremem(xscore, blk+ClumpSize, cl.info.size); + break; + } + + hprint(&c->hout, "score=%V<br>\n", xscore); + if(scorestr && scorecmp(score, xscore) != 0) + hprint(&c->hout, "score does NOT match expected %V\n", score); + + vtfree(blk); + return 0; + +error: + // hhex(blk, ClumpSize + VtMaxLumpSize); + vtfree(blk); + return -1; +} + +static int +diskbloom(HConnect *c, char *disk, Part *p) +{ + USED(c); + USED(disk); + USED(p); + return 0; +} + +static int +diskisect(HConnect *c, char *disk, Part *p) +{ + USED(c); + USED(disk); + USED(p); + return 0; +} + +static void +debugamap(HConnect *c) +{ + int i; + AMap *amap; + + hprint(&c->hout, "<h2>arena map</h2>\n"); + hprint(&c->hout, "<pre>\n"); + + amap = mainindex->amap; + for(i=0; 
i<mainindex->narenas; i++) + hprint(&c->hout, "%s %#llx %#llx\n", + amap[i].name, amap[i].start, amap[i].stop); +} + +static void +debugread(HConnect *c, u8int *score) +{ + int type; + Lump *u; + IAddr ia; + IEntry ie; + int i; + Arena *arena; + u64int aa; + ZBlock *zb; + Clump cl; + vlong off; + u8int sc[VtScoreSize]; + + if(scorecmp(score, zeroscore) == 0){ + hprint(&c->hout, "zero score\n"); + return; + } + + hprint(&c->hout, "<h2>index search %V</h2><pre>\n", score); + if(icachelookup(score, -1, &ia) < 0) + hprint(&c->hout, " icache: not found\n"); + else + hprint(&c->hout, " icache: addr=%#llx size=%d type=%d blocks=%d\n", + ia.addr, ia.size, ia.type, ia.blocks); + + if(loadientry(mainindex, score, -1, &ie) < 0) + hprint(&c->hout, " idisk: not found\n"); + else + hprint(&c->hout, " idisk: addr=%#llx size=%d type=%d blocks=%d\n", + ie.ia.addr, ie.ia.size, ie.ia.type, ie.ia.blocks); + + hprint(&c->hout, "</pre><h2>lookup %V</h2>\n", score); + hprint(&c->hout, "<pre>\n"); + + for(type=0; type < VtMaxType; type++){ + hprint(&c->hout, "%V type %d:", score, type); + u = lookuplump(score, type); + if(u->data != nil) + hprint(&c->hout, " +cache"); + else + hprint(&c->hout, " -cache"); + putlump(u); + + if(lookupscore(score, type, &ia) < 0){ + hprint(&c->hout, " -lookup\n"); + continue; + } + hprint(&c->hout, "\n lookupscore: addr=%#llx size=%d blocks=%d\n", + ia.addr, ia.size, ia.blocks); + + arena = amapitoa(mainindex, ia.addr, &aa); + if(arena == nil){ + hprint(&c->hout, " amapitoa failed: %r\n"); + continue; + } + + hprint(&c->hout, " amapitoa: aa=%#llx arena=" + "<a href=\"/disk?disk=%s&type=a&arena=%s&score=%V\">%s</a>\n", + aa, arena->part->name, arena->name, score, arena->name); + zb = loadclump(arena, aa, ia.blocks, &cl, sc, 1); + if(zb == nil){ + hprint(&c->hout, " loadclump failed: %r\n"); + continue; + } + + hprint(&c->hout, " loadclump: uncsize=%d type=%d score=%V\n", + cl.info.uncsize, cl.info.type, sc); + if(ia.size != cl.info.uncsize || ia.type != 
cl.info.type || scorecmp(score, sc) != 0){ + hprint(&c->hout, " clump info mismatch\n"); + continue; + } + } + + if(hargstr(c, "brute", "")[0] == 'y'){ + hprint(&c->hout, "</pre>\n"); + hprint(&c->hout, "<h2>brute force arena search %V</h2>\n", score); + hprint(&c->hout, "<pre>\n"); + + for(i=0; i<mainindex->narenas; i++){ + arena = mainindex->arenas[i]; + hprint(&c->hout, "%s...\n", arena->name); + hflush(&c->hout); + off = findintoc(nil, arena, score); + if(off >= 0) + hprint(&c->hout, "%s %#llx (%#llx)\n", arena->name, off, mainindex->amap[i].start + off); + } + } + + hprint(&c->hout, "</pre>\n"); +} + +static void +debugmem(HConnect *c) +{ + Index *ix; + + ix = mainindex; + hprint(&c->hout, "<h2>memory</h2>\n"); + + hprint(&c->hout, "<pre>\n"); + hprint(&c->hout, "ix=%p\n", ix); + hprint(&c->hout, "\tarenas=%p\n", ix->arenas); + if(ix->narenas > 0) + hprint(&c->hout, "\tarenas[...] = %p...%p\n", ix->arenas[0], ix->arenas[ix->narenas-1]); + hprint(&c->hout, "\tsmap=%p\n", ix->smap); + hprint(&c->hout, "\tamap=%p\n", ix->amap); + hprint(&c->hout, "\tbloom=%p\n", ix->bloom); + hprint(&c->hout, "\tbloom->data=%p\n", ix->bloom ? ix->bloom->data : nil); + hprint(&c->hout, "\tisects=%p\n", ix->sects); + if(ix->nsects > 0) + hprint(&c->hout, "\tsects[...] 
= %p...%p\n", ix->sects[0], ix->sects[ix->nsects-1]); +} + +int +hdebug(HConnect *c) +{ + char *scorestr, *op; + u8int score[VtScoreSize]; + + if(hsethtml(c) < 0) + return -1; + hprint(&c->hout, "<h1>venti debug</h1>\n"); + + op = hargstr(c, "op", ""); + if(!op[0]){ + hprint(&c->hout, "no op\n"); + return 0; + } + + if(strcmp(op, "amap") == 0){ + debugamap(c); + return 0; + } + + if(strcmp(op, "mem") == 0){ + debugmem(c); + return 0; + } + + if(strcmp(op, "read") == 0){ + scorestr = hargstr(c, "score", ""); + if(vtparsescore(scorestr, nil, score) < 0){ + hprint(&c->hout, "bad score %s: %r\n", scorestr); + return 0; + } + debugread(c, score); + return 0; + } + + hprint(&c->hout, "unknown op %s", op); + return 0; +} diff --git a/sys/src/cmd/venti/srv/hproc.c b/sys/src/cmd/venti/srv/hproc.c new file mode 100755 index 000000000..7a22ec251 --- /dev/null +++ b/sys/src/cmd/venti/srv/hproc.c @@ -0,0 +1,674 @@ +#include "stdinc.h" +#include <bio.h> +#include <mach.h> +#include <ureg.h> +#include "/sys/src/libthread/threadimpl.h" +#include "dat.h" +#include "fns.h" + +typedef struct Ureg Ureg; +typedef struct Debug Debug; + +struct Debug +{ + int textfd; + QLock lock; + Fhdr fhdr; + Map *map; + Fmt *fmt; + int pid; + char *stkprefix; + int pcoff; + int spoff; +}; + +static Debug debug = { -1 }; + +static int +text(int pid) +{ + int fd; + char buf[100]; + + if(debug.textfd >= 0){ + close(debug.textfd); + debug.textfd = -1; + } + memset(&debug.fhdr, 0, sizeof debug.fhdr); + + snprint(buf, sizeof buf, "#p/%d/text", pid); + fd = open(buf, OREAD); + if(fd < 0) + return -1; + if(crackhdr(fd, &debug.fhdr) < 0){ + close(fd); + return -1; + } + if(syminit(fd, &debug.fhdr) < 0){ + memset(&debug.fhdr, 0, sizeof debug.fhdr); + close(fd); + return -1; + } + debug.textfd = fd; + machbytype(debug.fhdr.type); + return 0; +} + +static void +unmap(Map *m) +{ + int i; + + for(i=0; i<m->nsegs; i++) + if(m->seg[i].inuse) + close(m->seg[i].fd); + free(m); +} + +static Map* +map(int pid) +{ + int 
mem; + char buf[100]; + Map *m; + + snprint(buf, sizeof buf, "#p/%d/mem", pid); + mem = open(buf, OREAD); + if(mem < 0) + return nil; + + m = attachproc(pid, 0, mem, &debug.fhdr); + if(m == 0){ + close(mem); + return nil; + } + + if(debug.map) + unmap(debug.map); + debug.map = m; + debug.pid = pid; + return m; +} + +static void +dprint(char *fmt, ...) +{ + va_list arg; + + va_start(arg, fmt); + fmtvprint(debug.fmt, fmt, arg); + va_end(arg); +} + +static void +openfiles(void) +{ + char buf[4096]; + int fd, n; + + snprint(buf, sizeof buf, "#p/%d/fd", getpid()); + if((fd = open(buf, OREAD)) < 0){ + dprint("open %s: %r\n", buf); + return; + } + n = readn(fd, buf, sizeof buf-1); + close(fd); + if(n >= 0){ + buf[n] = 0; + fmtstrcpy(debug.fmt, buf); + } +} + +/* + * dump the raw symbol table + */ +static void +printsym(void) +{ + int i; + Sym *sp; + + for (i = 0; sp = getsym(i); i++) { + switch(sp->type) { + case 't': + case 'l': + dprint("%16#llux t %s\n", sp->value, sp->name); + break; + case 'T': + case 'L': + dprint("%16#llux T %s\n", sp->value, sp->name); + break; + case 'D': + case 'd': + case 'B': + case 'b': + case 'a': + case 'p': + case 'm': + dprint("%16#llux %c %s\n", sp->value, sp->type, sp->name); + break; + default: + break; + } + } +} + +static void +printmap(char *s, Map *map) +{ + int i; + + if (!map) + return; + dprint("%s\n", s); + for (i = 0; i < map->nsegs; i++) { + if (map->seg[i].inuse) + dprint("%-16s %-16#llux %-16#llux %-16#llux\n", + map->seg[i].name, map->seg[i].b, + map->seg[i].e, map->seg[i].f); + } +} + +static void +printlocals(Map *map, Symbol *fn, uintptr fp) +{ + int i; + uintptr w; + Symbol s; + char buf[100]; + + s = *fn; + for (i = 0; localsym(&s, i); i++) { + if (s.class != CAUTO) + continue; + snprint(buf, sizeof buf, "%s%s/", debug.stkprefix, s.name); + if (geta(map, fp - s.value, (uvlong*)&w) > 0) + dprint("\t%-10s %10#p %ld\n", buf, w, w); + else + dprint("\t%-10s ?\n", buf); + } +} + +static void +printparams(Map *map, Symbol 
*fn, uintptr fp) +{ + int i; + Symbol s; + uintptr w; + int first = 0; + + fp += mach->szaddr; /* skip saved pc */ + s = *fn; + for (i = 0; localsym(&s, i); i++) { + if (s.class != CPARAM) + continue; + if (first++) + dprint(", "); + if (geta(map, fp + s.value, (uvlong *)&w) > 0) + dprint("%s=%#p", s.name, w); + } +} + +static void +printsource(uintptr dot) +{ + char str[100]; + + if (fileline(str, sizeof str, dot)) + dprint("%s", str); +} + + +/* + * callback on stack trace + */ +static uintptr nextpc; + +static void +ptrace(Map *map, uvlong pc, uvlong sp, Symbol *sym) +{ + if(nextpc == 0) + nextpc = sym->value; + if(debug.stkprefix == nil) + debug.stkprefix = ""; + dprint("%s%s(", debug.stkprefix, sym->name); + printparams(map, sym, sp); + dprint(")"); + if(nextpc != sym->value) + dprint("+%#llux ", nextpc - sym->value); + printsource(nextpc); + dprint("\n"); + printlocals(map, sym, sp); + nextpc = pc; +} + +static void +stacktracepcsp(Map *m, uintptr pc, uintptr sp) +{ + nextpc = 0; + if(machdata->ctrace==nil) + dprint("no machdata->ctrace\n"); + else if(machdata->ctrace(m, pc, sp, 0, ptrace) <= 0) + dprint("no stack frame: pc=%#p sp=%#p\n", pc, sp); +} + +static void +ureginit(void) +{ + Reglist *r; + + for(r = mach->reglist; r->rname; r++) + if (strcmp(r->rname, "PC") == 0) + debug.pcoff = r->roffs; + else if (strcmp(r->rname, "SP") == 0) + debug.spoff = r->roffs; +} + +static void +stacktrace(Map *m) +{ + uintptr pc, sp; + + if(geta(m, debug.pcoff, (uvlong *)&pc) < 0){ + dprint("geta pc: %r"); + return; + } + if(geta(m, debug.spoff, (uvlong *)&sp) < 0){ + dprint("geta sp: %r"); + return; + } + stacktracepcsp(m, pc, sp); +} + +static uintptr +star(uintptr addr) +{ + uintptr x; + static int warned; + + if(addr == 0) + return 0; + + if(debug.map == nil){ + if(!warned++) + dprint("no debug.map\n"); + return 0; + } + if(geta(debug.map, addr, (uvlong *)&x) < 0){ + dprint("geta %#p (pid=%d): %r\n", addr, debug.pid); + return 0; + } + return x; +} + +static uintptr 
/*
 * Compare the first strlen(pre) bytes of big with pre.
 * Like strncmp, the result is 0 when big begins with pre and
 * non-zero otherwise, so callers test for "== 0".
 */
static int
strprefix(char *big, char *pre)
{
	long npre;

	npre = strlen(pre);
	return strncmp(big, pre, npre);
}
(uchar*)p, 1) < 0) + break; + if(*p == 0) + break; + p++; + } + *p = 0; + return buf; +} + +static char* +threadfmt(uintptr t) +{ + static char buf[4096]; + Fmt fmt; + int s; + + fmtbufinit(&fmt, buf, sizeof buf); + + fmtprint(&fmt, "t=(Thread)%#p ", t); + switch(s = FIELD(Thread, t, state)){ + case Running: + fmtprint(&fmt, " Running "); + break; + case Ready: + fmtprint(&fmt, " Ready "); + break; + case Rendezvous: + fmtprint(&fmt, " Rendez "); + break; + default: + fmtprint(&fmt, " bad state %d ", s); + break; + } + + fmtprint(&fmt, "%s", threadstkline(t)); + + if(FIELD(Thread, t, moribund) == 1) + fmtprint(&fmt, " Moribund"); + if(s = FIELD(Thread, t, cmdname)){ + fmtprint(&fmt, " [%s]", debugstr(s)); + } + + fmtbufflush(&fmt); + return buf; +} + + +static void +thread(uintptr t) +{ + dprint("%s\n", threadfmt(t)); +} + +static void +threadapply(uintptr p, void (*fn)(uintptr)) +{ + int oldpid, pid; + uintptr tq, t; + + oldpid = debug.pid; + pid = FIELD(Proc, p, pid); + if(map(pid) == nil) + return; + tq = FADDR(Proc, p, threads); + t = FIELD(Tqueue, tq, head); + while(t != 0){ + fn(t); + t = FIELD(Thread, t, nextt); + } + map(oldpid); +} + +static void +pthreads1(uintptr t) +{ + dprint("\t"); + thread(t); +} + +static void +pthreads(uintptr p) +{ + threadapply(p, pthreads1); +} + +static void +lproc(uintptr p) +{ + proc(p); + pthreads(p); +} + +static void +procapply(void (*fn)(uintptr)) +{ + uintptr proc, pq; + + pq = resolvev("_threadpq"); + if(pq == 0){ + dprint("no thread run queue\n"); + return; + } + + proc = FIELD(Pqueue, pq, head); + while(proc){ + fn(proc); + proc = FIELD(Proc, proc, next); + } +} + +static void +threads(HConnect *c) +{ + USED(c); + procapply(lproc); +} + +static void +procs(HConnect *c) +{ + USED(c); + procapply(proc); +} + +static void +threadstack(uintptr t) +{ + uintptr pc, sp; + + if(FIELD(Thread, t, state) == Running){ + stacktrace(debug.map); + }else{ + // pc = FIELD(Thread, t, sched[JMPBUFPC]); + pc = resolvef("longjmp"); + sp = 
FIELD(Thread, t, sched[JMPBUFSP]); + stacktracepcsp(debug.map, pc, sp); + } +} + + +static void +tstacks(uintptr t) +{ + dprint("\t"); + thread(t); + threadstack(t); + dprint("\n"); +} + +static void +pstacks(uintptr p) +{ + proc(p); + threadapply(p, tstacks); +} + +static void +stacks(HConnect *c) +{ + USED(c); + debug.stkprefix = "\t\t"; + procapply(pstacks); + debug.stkprefix = ""; +} + +static void +symbols(HConnect *c) +{ + USED(c); + printsym(); +} + +static void +segments(HConnect *c) +{ + USED(c); + printmap("segments", debug.map); +} + +static void +fds(HConnect *c) +{ + USED(c); + openfiles(); +} + +static void +all(HConnect *c) +{ + dprint("/proc/segment\n"); + segments(c); + dprint("\n/proc/fd\n"); + fds(c); + dprint("\n/proc/procs\n"); + procs(c); + dprint("\n/proc/threads\n"); + threads(c); + dprint("\n/proc/stacks\n"); + stacks(c); + dprint("\n# /proc/symbols\n"); + // symbols(c); +} + +int +hproc(HConnect *c) +{ + void (*fn)(HConnect*); + Fmt fmt; + static int beenhere; + static char buf[65536]; + + if (!beenhere) { + beenhere = 1; + ureginit(); + } + if(strcmp(c->req.uri, "/proc/all") == 0) + fn = all; + else if(strcmp(c->req.uri, "/proc/segment") == 0) + fn = segments; + else if(strcmp(c->req.uri, "/proc/fd") == 0) + fn = fds; + else if(strcmp(c->req.uri, "/proc/procs") == 0) + fn = procs; + else if(strcmp(c->req.uri, "/proc/threads") == 0) + fn = threads; + else if(strcmp(c->req.uri, "/proc/stacks") == 0) + fn = stacks; + else if(strcmp(c->req.uri, "/proc/symbols") == 0) + fn = symbols; + else + return hnotfound(c); + + if(hsettext(c) < 0) + return -1; + if(!canqlock(&debug.lock)){ + hprint(&c->hout, "debugger is busy\n"); + return 0; + } + if(debug.textfd < 0){ + if(text(getpid()) < 0){ + hprint(&c->hout, "cannot attach self text: %r\n"); + goto out; + } + } + if(map(getpid()) == nil){ + hprint(&c->hout, "cannot map self: %r\n"); + goto out; + } + + fmtbufinit(&fmt, buf, sizeof buf); + debug.fmt = &fmt; + fn(c); + hprint(&c->hout, "%s\n", 
fmtbufflush(&fmt)); + debug.fmt = nil; +out: + qunlock(&debug.lock); + return 0; +} diff --git a/sys/src/cmd/venti/srv/httpd.c b/sys/src/cmd/venti/srv/httpd.c new file mode 100755 index 000000000..623d4e476 --- /dev/null +++ b/sys/src/cmd/venti/srv/httpd.c @@ -0,0 +1,1177 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" +#include "xml.h" + +typedef struct HttpObj HttpObj; +extern QLock memdrawlock; + +enum +{ + ObjNameSize = 64, + MaxObjs = 64 +}; + +struct HttpObj +{ + char name[ObjNameSize]; + int (*f)(HConnect*); +}; + +static HttpObj objs[MaxObjs]; + +static char *webroot; + +static void listenproc(void*); +static int estats(HConnect *c); +static int dindex(HConnect *c); +static int xindex(HConnect *c); +static int xlog(HConnect *c); +static int sindex(HConnect *c); +static int hempty(HConnect *c); +static int hlcacheempty(HConnect *c); +static int hdcacheempty(HConnect *c); +static int hicacheempty(HConnect *c); +static int hicachekick(HConnect *c); +static int hdcachekick(HConnect *c); +static int hicacheflush(HConnect *c); +static int hdcacheflush(HConnect *c); +static int httpdobj(char *name, int (*f)(HConnect*)); +static int xgraph(HConnect *c); +static int xset(HConnect *c); +static int fromwebdir(HConnect *c); + +int +httpdinit(char *address, char *dir) +{ + fmtinstall('D', hdatefmt); +/* fmtinstall('H', httpfmt); */ + fmtinstall('U', hurlfmt); + + if(address == nil) + address = "tcp!*!http"; + webroot = dir; + + httpdobj("/stats", estats); + httpdobj("/index", dindex); + httpdobj("/storage", sindex); + httpdobj("/xindex", xindex); + httpdobj("/flushicache", hicacheflush); + httpdobj("/flushdcache", hdcacheflush); + httpdobj("/kickicache", hicachekick); + httpdobj("/kickdcache", hdcachekick); + httpdobj("/graph", xgraph); + httpdobj("/set", xset); + httpdobj("/log", xlog); + httpdobj("/empty", hempty); + httpdobj("/emptyicache", hicacheempty); + httpdobj("/emptylumpcache", hlcacheempty); + httpdobj("/emptydcache", hdcacheempty); + 
httpdobj("/disk", hdisk); + httpdobj("/debug", hdebug); + httpdobj("/proc/", hproc); + + if(vtproc(listenproc, address) < 0) + return -1; + return 0; +} + +static int +httpdobj(char *name, int (*f)(HConnect*)) +{ + int i; + + if(name == nil || strlen(name) >= ObjNameSize) + return -1; + for(i = 0; i < MaxObjs; i++){ + if(objs[i].name[0] == '\0'){ + strcpy(objs[i].name, name); + objs[i].f = f; + return 0; + } + if(strcmp(objs[i].name, name) == 0) + return -1; + } + return -1; +} + +static HConnect* +mkconnect(void) +{ + HConnect *c; + + c = mallocz(sizeof(HConnect), 1); + if(c == nil) + sysfatal("out of memory"); + c->replog = nil; + c->hpos = c->header; + c->hstop = c->header; + return c; +} + +void httpproc(void*); + +static void +listenproc(void *vaddress) +{ + HConnect *c; + char *address, ndir[NETPATHLEN], dir[NETPATHLEN]; + int ctl, nctl, data; + + address = vaddress; + ctl = announce(address, dir); + if(ctl < 0){ + fprint(2, "venti: httpd can't announce on %s: %r\n", address); + return; + } + + if(0) print("announce ctl %d dir %s\n", ctl, dir); + for(;;){ + /* + * wait for a call (or an error) + */ + nctl = listen(dir, ndir); + if(0) print("httpd listen %d %s...\n", nctl, ndir); + if(nctl < 0){ + fprint(2, "venti: httpd can't listen on %s: %r\n", address); + return; + } + + data = accept(ctl, ndir); + if(0) print("httpd accept %d...\n", data); + if(data < 0){ + fprint(2, "venti: httpd accept: %r\n"); + close(nctl); + continue; + } + if(0) print("httpd close nctl %d\n", nctl); + close(nctl); + c = mkconnect(); + hinit(&c->hin, data, Hread); + hinit(&c->hout, data, Hwrite); + vtproc(httpproc, c); + } +} + +void +httpproc(void *v) +{ + HConnect *c; + int ok, i, n; + + c = v; + + for(;;){ + /* + * No timeout because the signal appears to hit every + * proc, not just us. 
+ */ + if(hparsereq(c, 0) < 0) + break; + + for(i = 0; i < MaxObjs && objs[i].name[0]; i++){ + n = strlen(objs[i].name); + if((objs[i].name[n-1] == '/' && strncmp(c->req.uri, objs[i].name, n) == 0) + || (objs[i].name[n-1] != '/' && strcmp(c->req.uri, objs[i].name) == 0)){ + ok = (*objs[i].f)(c); + goto found; + } + } + ok = fromwebdir(c); + found: + hflush(&c->hout); + if(c->head.closeit) + ok = -1; + hreqcleanup(c); + + if(ok < 0) + break; + } + hreqcleanup(c); + close(c->hin.fd); + free(c); +} + +char* +hargstr(HConnect *c, char *name, char *def) +{ + HSPairs *p; + + for(p=c->req.searchpairs; p; p=p->next) + if(strcmp(p->s, name) == 0) + return p->t; + return def; +} + +vlong +hargint(HConnect *c, char *name, vlong def) +{ + char *a; + + if((a = hargstr(c, name, nil)) == nil) + return def; + return atoll(a); +} + +static int +percent(ulong v, ulong total) +{ + if(total == 0) + total = 1; + if(v < 1000*1000) + return (v * 100) / total; + total /= 100; + if(total == 0) + total = 1; + return v / total; +} + +static int +preq(HConnect *c) +{ + if(hparseheaders(c, 0) < 0) + return -1; + if(strcmp(c->req.meth, "GET") != 0 + && strcmp(c->req.meth, "HEAD") != 0) + return hunallowed(c, "GET, HEAD"); + if(c->head.expectother || c->head.expectcont) + return hfail(c, HExpectFail, nil); + return 0; +} + +int +hsettype(HConnect *c, char *type) +{ + Hio *hout; + int r; + + r = preq(c); + if(r < 0) + return r; + + hout = &c->hout; + if(c->req.vermaj){ + hokheaders(c); + hprint(hout, "Content-type: %s\r\n", type); + if(http11(c)) + hprint(hout, "Transfer-Encoding: chunked\r\n"); + hprint(hout, "\r\n"); + } + + if(http11(c)) + hxferenc(hout, 1); + else + c->head.closeit = 1; + return 0; +} + +int +hsethtml(HConnect *c) +{ + return hsettype(c, "text/html; charset=utf-8"); +} + +int +hsettext(HConnect *c) +{ + return hsettype(c, "text/plain; charset=utf-8"); +} + +static int +herror(HConnect *c) +{ + int n; + Hio *hout; + + hout = &c->hout; + n = snprint(c->xferbuf, HBufSize, 
"<html><head><title>Error</title></head>\n<body><h1>Error</h1>\n<pre>%r</pre>\n</body></html>"); + hprint(hout, "%s %s\r\n", hversion, "400 Bad Request"); + hprint(hout, "Date: %D\r\n", time(nil)); + hprint(hout, "Server: Venti\r\n"); + hprint(hout, "Content-Type: text/html\r\n"); + hprint(hout, "Content-Length: %d\r\n", n); + if(c->head.closeit) + hprint(hout, "Connection: close\r\n"); + else if(!http11(c)) + hprint(hout, "Connection: Keep-Alive\r\n"); + hprint(hout, "\r\n"); + + if(c->req.meth == nil || strcmp(c->req.meth, "HEAD") != 0) + hwrite(hout, c->xferbuf, n); + + return hflush(hout); +} + +int +hnotfound(HConnect *c) +{ + int r; + + r = preq(c); + if(r < 0) + return r; + return hfail(c, HNotFound, c->req.uri); +} + +struct { + char *ext; + char *type; +} exttab[] = { + ".html", "text/html", + ".txt", "text/plain", + ".xml", "text/xml", + ".png", "image/png", + ".gif", "image/gif", + 0 +}; + +static int +fromwebdir(HConnect *c) +{ + char buf[4096], *p, *ext, *type; + int i, fd, n, defaulted; + Dir *d; + + if(webroot == nil || strstr(c->req.uri, "..")) + return hnotfound(c); + snprint(buf, sizeof buf-20, "%s/%s", webroot, c->req.uri+1); + defaulted = 0; +reopen: + if((fd = open(buf, OREAD)) < 0) + return hnotfound(c); + d = dirfstat(fd); + if(d == nil){ + close(fd); + return hnotfound(c); + } + if(d->mode&DMDIR){ + if(!defaulted){ + defaulted = 1; + strcat(buf, "/index.html"); + free(d); + close(fd); + goto reopen; + } + free(d); + return hnotfound(c); + } + free(d); + p = buf+strlen(buf); + type = "application/octet-stream"; + for(i=0; exttab[i].ext; i++){ + ext = exttab[i].ext; + if(p-strlen(ext) >= buf && strcmp(p-strlen(ext), ext) == 0){ + type = exttab[i].type; + break; + } + } + if(hsettype(c, type) < 0){ + close(fd); + return 0; + } + while((n = read(fd, buf, sizeof buf)) > 0) + if(hwrite(&c->hout, buf, n) < 0) + break; + close(fd); + hflush(&c->hout); + return 0; +} + +static struct +{ + char *name; + int *p; +} namedints[] = +{ + "compress", 
&compressblocks, + "devnull", &writestodevnull, + "logging", &ventilogging, + "stats", &collectstats, + "icachesleeptime", &icachesleeptime, + "minicachesleeptime", &minicachesleeptime, + "arenasumsleeptime", &arenasumsleeptime, + "l0quantum", &l0quantum, + "l1quantum", &l1quantum, + "manualscheduling", &manualscheduling, + "ignorebloom", &ignorebloom, + "syncwrites", &syncwrites, + "icacheprefetch", &icacheprefetch, + 0 +}; + +static int +xset(HConnect *c) +{ + int i, old; + char *name, *value; + + if(hsettext(c) < 0) + return -1; + + if((name = hargstr(c, "name", nil)) == nil || name[0] == 0){ + for(i=0; namedints[i].name; i++) + hprint(&c->hout, "%s = %d\n", namedints[i].name, *namedints[i].p); + hflush(&c->hout); + return 0; + } + + for(i=0; namedints[i].name; i++) + if(strcmp(name, namedints[i].name) == 0) + break; + if(!namedints[i].name){ + hprint(&c->hout, "%s not found\n", name); + hflush(&c->hout); + return 0; + } + + if((value = hargstr(c, "value", nil)) == nil || value[0] == 0){ + hprint(&c->hout, "%s = %d\n", namedints[i].name, *namedints[i].p); + hflush(&c->hout); + return 0; + } + + old = *namedints[i].p; + *namedints[i].p = atoll(value); + hprint(&c->hout, "%s = %d (was %d)\n", name, *namedints[i].p, old); + hflush(&c->hout); + return 0; +} + +static int +estats(HConnect *c) +{ + Hio *hout; + int r; + + r = hsettext(c); + if(r < 0) + return r; + + + hout = &c->hout; +/* + hprint(hout, "lump writes=%,ld\n", stats.lumpwrites); + hprint(hout, "lump reads=%,ld\n", stats.lumpreads); + hprint(hout, "lump cache read hits=%,ld\n", stats.lumphit); + hprint(hout, "lump cache read misses=%,ld\n", stats.lumpmiss); + + hprint(hout, "clump disk writes=%,ld\n", stats.clumpwrites); + hprint(hout, "clump disk bytes written=%,lld\n", stats.clumpbwrites); + hprint(hout, "clump disk bytes compressed=%,lld\n", stats.clumpbcomp); + hprint(hout, "clump disk reads=%,ld\n", stats.clumpreads); + hprint(hout, "clump disk bytes read=%,lld\n", stats.clumpbreads); + hprint(hout, 
"clump disk bytes uncompressed=%,lld\n", stats.clumpbuncomp); + + hprint(hout, "clump directory disk writes=%,ld\n", stats.ciwrites); + hprint(hout, "clump directory disk reads=%,ld\n", stats.cireads); + + hprint(hout, "index disk writes=%,ld\n", stats.indexwrites); + hprint(hout, "index disk reads=%,ld\n", stats.indexreads); + hprint(hout, "index disk bloom filter hits=%,ld %d%% falsemisses=%,ld %d%%\n", + stats.indexbloomhits, + percent(stats.indexbloomhits, stats.indexreads), + stats.indexbloomfalsemisses, + percent(stats.indexbloomfalsemisses, stats.indexreads)); + hprint(hout, "bloom filter bits=%,ld of %,ld %d%%\n", + stats.bloomones, stats.bloombits, percent(stats.bloomones, stats.bloombits)); + hprint(hout, "index disk reads for modify=%,ld\n", stats.indexwreads); + hprint(hout, "index disk reads for allocation=%,ld\n", stats.indexareads); + hprint(hout, "index block splits=%,ld\n", stats.indexsplits); + + hprint(hout, "index cache lookups=%,ld\n", stats.iclookups); + hprint(hout, "index cache hits=%,ld %d%%\n", stats.ichits, + percent(stats.ichits, stats.iclookups)); + hprint(hout, "index cache fills=%,ld %d%%\n", stats.icfills, + percent(stats.icfills, stats.iclookups)); + hprint(hout, "index cache inserts=%,ld\n", stats.icinserts); + + hprint(hout, "disk cache hits=%,ld\n", stats.pchit); + hprint(hout, "disk cache misses=%,ld\n", stats.pcmiss); + hprint(hout, "disk cache reads=%,ld\n", stats.pcreads); + hprint(hout, "disk cache bytes read=%,lld\n", stats.pcbreads); + + hprint(hout, "disk cache writes=%,ld\n", stats.dirtydblocks); + hprint(hout, "disk cache writes absorbed=%,ld %d%%\n", stats.absorbedwrites, + percent(stats.absorbedwrites, stats.dirtydblocks)); + + hprint(hout, "disk cache flushes=%,ld\n", stats.dcacheflushes); + hprint(hout, "disk cache flush writes=%,ld (%,ld per flush)\n", + stats.dcacheflushwrites, + stats.dcacheflushwrites/(stats.dcacheflushes ? 
stats.dcacheflushes : 1)); + + hprint(hout, "disk writes=%,ld\n", stats.diskwrites); + hprint(hout, "disk bytes written=%,lld\n", stats.diskbwrites); + hprint(hout, "disk reads=%,ld\n", stats.diskreads); + hprint(hout, "disk bytes read=%,lld\n", stats.diskbreads); +*/ + + hflush(hout); + return 0; +} + +static int +sindex(HConnect *c) +{ + Hio *hout; + Index *ix; + Arena *arena; + vlong clumps, cclumps, uncsize, used, size; + int i, r, active; + + r = hsettext(c); + if(r < 0) + return r; + hout = &c->hout; + + ix = mainindex; + + hprint(hout, "index=%s\n", ix->name); + + active = 0; + clumps = 0; + cclumps = 0; + uncsize = 0; + used = 0; + size = 0; + for(i = 0; i < ix->narenas; i++){ + arena = ix->arenas[i]; + if(arena != nil && arena->memstats.clumps != 0){ + active++; + clumps += arena->memstats.clumps; + cclumps += arena->memstats.cclumps; + uncsize += arena->memstats.uncsize; + used += arena->memstats.used; + } + size += arena->size; + } + hprint(hout, "total arenas=%,d active=%,d\n", ix->narenas, active); + hprint(hout, "total space=%,lld used=%,lld\n", size, used + clumps * ClumpInfoSize); + hprint(hout, "clumps=%,lld compressed clumps=%,lld data=%,lld compressed data=%,lld\n", + clumps, cclumps, uncsize, used - clumps * ClumpSize); + hflush(hout); + return 0; +} + +static void +darena(Hio *hout, Arena *arena) +{ + hprint(hout, "arena='%s' on %s at [%lld,%lld)\n\tversion=%d created=%d modified=%d", + arena->name, arena->part->name, arena->base, arena->base + arena->size + 2 * arena->blocksize, + arena->version, arena->ctime, arena->wtime); + if(arena->memstats.sealed) + hprint(hout, " mem=sealed"); + if(arena->diskstats.sealed) + hprint(hout, " disk=sealed"); + hprint(hout, "\n"); + if(scorecmp(zeroscore, arena->score) != 0) + hprint(hout, "\tscore=%V\n", arena->score); + + hprint(hout, "\twritten: clumps=%d compressed clumps=%d data=%,lld compressed data=%,lld storage=%,lld\n", + arena->memstats.clumps, arena->memstats.cclumps, arena->memstats.uncsize, + 
arena->memstats.used - arena->memstats.clumps * ClumpSize, + arena->memstats.used + arena->memstats.clumps * ClumpInfoSize); + hprint(hout, "\tindexed: clumps=%d compressed clumps=%d data=%,lld compressed data=%,lld storage=%,lld\n", + arena->diskstats.clumps, arena->diskstats.cclumps, arena->diskstats.uncsize, + arena->diskstats.used - arena->diskstats.clumps * ClumpSize, + arena->diskstats.used + arena->diskstats.clumps * ClumpInfoSize); +} + +static int +hempty(HConnect *c) +{ + Hio *hout; + int r; + + r = hsettext(c); + if(r < 0) + return r; + hout = &c->hout; + + emptylumpcache(); + emptydcache(); + emptyicache(); + hprint(hout, "emptied all caches\n"); + hflush(hout); + return 0; +} + +static int +hlcacheempty(HConnect *c) +{ + Hio *hout; + int r; + + r = hsettext(c); + if(r < 0) + return r; + hout = &c->hout; + + emptylumpcache(); + hprint(hout, "emptied lumpcache\n"); + hflush(hout); + return 0; +} + +static int +hicacheempty(HConnect *c) +{ + Hio *hout; + int r; + + r = hsettext(c); + if(r < 0) + return r; + hout = &c->hout; + + emptyicache(); + hprint(hout, "emptied icache\n"); + hflush(hout); + return 0; +} + +static int +hdcacheempty(HConnect *c) +{ + Hio *hout; + int r; + + r = hsettext(c); + if(r < 0) + return r; + hout = &c->hout; + + emptydcache(); + hprint(hout, "emptied dcache\n"); + hflush(hout); + return 0; +} +static int +hicachekick(HConnect *c) +{ + Hio *hout; + int r; + + r = hsettext(c); + if(r < 0) + return r; + hout = &c->hout; + + kickicache(); + hprint(hout, "kicked icache\n"); + hflush(hout); + return 0; +} + +static int +hdcachekick(HConnect *c) +{ + Hio *hout; + int r; + + r = hsettext(c); + if(r < 0) + return r; + hout = &c->hout; + + kickdcache(); + hprint(hout, "kicked dcache\n"); + hflush(hout); + return 0; +} +static int +hicacheflush(HConnect *c) +{ + Hio *hout; + int r; + + r = hsettext(c); + if(r < 0) + return r; + hout = &c->hout; + + flushicache(); + hprint(hout, "flushed icache\n"); + hflush(hout); + return 0; +} + +static 
int +hdcacheflush(HConnect *c) +{ + Hio *hout; + int r; + + r = hsettext(c); + if(r < 0) + return r; + hout = &c->hout; + + flushdcache(); + hprint(hout, "flushed dcache\n"); + hflush(hout); + return 0; +} + +static int +dindex(HConnect *c) +{ + Hio *hout; + Index *ix; + int i, r; + + r = hsettext(c); + if(r < 0) + return r; + hout = &c->hout; + + + ix = mainindex; + hprint(hout, "index=%s version=%d blocksize=%d tabsize=%d\n", + ix->name, ix->version, ix->blocksize, ix->tabsize); + hprint(hout, "\tbuckets=%d div=%d\n", ix->buckets, ix->div); + for(i = 0; i < ix->nsects; i++) + hprint(hout, "\tsect=%s for buckets [%lld,%lld) buckmax=%d\n", ix->smap[i].name, ix->smap[i].start, ix->smap[i].stop, ix->sects[i]->buckmax); + for(i = 0; i < ix->narenas; i++){ + if(ix->arenas[i] != nil && ix->arenas[i]->memstats.clumps != 0){ + hprint(hout, "arena=%s at index [%lld,%lld)\n\t", ix->amap[i].name, ix->amap[i].start, ix->amap[i].stop); + darena(hout, ix->arenas[i]); + } + } + hflush(hout); + return 0; +} + +typedef struct Arg Arg; +struct Arg +{ + int index; + int index2; +}; + +static long +rawgraph(Stats *s, Stats *t, void *va) +{ + Arg *a; + + USED(s); + a = va; + return t->n[a->index]; +} + +static long +diffgraph(Stats *s, Stats *t, void *va) +{ + Arg *a; + + a = va; + return t->n[a->index] - s->n[a->index]; +} + +static long +pctgraph(Stats *s, Stats *t, void *va) +{ + Arg *a; + + USED(s); + a = va; + return percent(t->n[a->index], t->n[a->index2]); +} + +static long +pctdiffgraph(Stats *s, Stats *t, void *va) +{ + Arg *a; + + a = va; + return percent(t->n[a->index]-s->n[a->index], t->n[a->index2]-s->n[a->index2]); +} + +static long +xdiv(long a, long b) +{ + if(b == 0) + b++; + return a/b; +} + +static long +divdiffgraph(Stats *s, Stats *t, void *va) +{ + Arg *a; + + a = va; + return xdiv(t->n[a->index] - s->n[a->index], t->n[a->index2] - s->n[a->index2]); +} + +static long +netbw(Stats *s) +{ + ulong *n; + + n = s->n; + return n[StatRpcReadBytes]+n[StatRpcWriteBytes]; 
/* not exactly right */ +} + +static long +diskbw(Stats *s) +{ + ulong *n; + + n = s->n; + return n[StatApartReadBytes]+n[StatApartWriteBytes] + + n[StatIsectReadBytes]+n[StatIsectWriteBytes] + + n[StatSumReadBytes]; +} + +static long +iobw(Stats *s) +{ + return netbw(s)+diskbw(s); +} + +static long +diskgraph(Stats *s, Stats *t, void *va) +{ + USED(va); + return diskbw(t)-diskbw(s); +} + +static long +netgraph(Stats *s, Stats *t, void *va) +{ + USED(va); + return netbw(t)-netbw(s); +} + +static long +iograph(Stats *s, Stats *t, void *va) +{ + USED(va); + return iobw(t)-iobw(s); +} + + +static char* graphname[] = +{ + "rpctotal", + "rpcread", + "rpcreadok", + "rpcreadfail", + "rpcreadbyte", + "rpcreadtime", + "rpcreadcached", + "rpcreadcachedtime", + "rpcreaduncached", + "rpcreaduncachedtime", + "rpcwrite", + "rpcwritenew", + "rpcwriteold", + "rpcwritefail", + "rpcwritebyte", + "rpcwritetime", + "rpcwritenewtime", + "rpcwriteoldtime", + + "lcachehit", + "lcachemiss", + "lcachelookup", + "lcachewrite", + "lcachesize", + "lcachestall", + "lcachelookuptime", + + "dcachehit", + "dcachemiss", + "dcachelookup", + "dcacheread", + "dcachewrite", + "dcachedirty", + "dcachesize", + "dcacheflush", + "dcachestall", + "dcachelookuptime", + + "dblockstall", + "lumpstall", + + "icachehit", + "icachemiss", + "icacheread", + "icachewrite", + "icachefill", + "icacheprefetch", + "icachedirty", + "icachesize", + "icacheflush", + "icachestall", + "icachelookuptime", + "icachelookup", + "scachehit", + "scacheprefetch", + + "bloomhit", + "bloommiss", + "bloomfalsemiss", + "bloomlookup", + "bloomones", + "bloombits", + + "apartread", + "apartreadbyte", + "apartwrite", + "apartwritebyte", + + "isectread", + "isectreadbyte", + "isectwrite", + "isectwritebyte", + + "sumread", + "sumreadbyte", + + "cigload", + "cigloadtime", +}; + +static int +findname(char *s) +{ + int i; + + for(i=0; i<nelem(graphname); i++) + if(strcmp(graphname[i], s) == 0) + return i; + return -1; +} + +static void 
+dotextbin(Hio *io, Graph *g) +{ + int i, nbin; + Statbin *b, bin[2000]; /* 32 kB, but whack is worse */ + + needstack(8192); /* double check that bin didn't kill us */ + nbin = 100; + binstats(g->fn, g->arg, g->t0, g->t1, bin, nbin); + + hprint(io, "stats\n\n"); + for(i=0; i<nbin; i++){ + b = &bin[i]; + hprint(io, "%d: nsamp=%d min=%d max=%d avg=%d\n", + i, b->nsamp, b->min, b->max, b->avg); + } +} + +static int +xgraph(HConnect *c) +{ + char *name; + Hio *hout; + Memimage *m; + int dotext; + Graph g; + Arg arg; + char *graph, *a; + + name = hargstr(c, "arg", ""); + if((arg.index = findname(name)) == -1 && strcmp(name, "*") != 0){ + werrstr("unknown name %s", name); + goto error; + } + a = hargstr(c, "arg2", ""); + if(a[0] && (arg.index2 = findname(a)) == -1){ + werrstr("unknown name %s", a); + goto error; + } + + g.arg = &arg; + g.t0 = hargint(c, "t0", -120); + g.t1 = hargint(c, "t1", 0); + g.min = hargint(c, "min", -1); + g.max = hargint(c, "max", -1); + g.wid = hargint(c, "wid", -1); + g.ht = hargint(c, "ht", -1); + dotext = hargstr(c, "text", "")[0] != 0; + g.fill = hargint(c, "fill", -1); + + graph = hargstr(c, "graph", "raw"); + if(strcmp(graph, "raw") == 0) + g.fn = rawgraph; + else if(strcmp(graph, "diskbw") == 0) + g.fn = diskgraph; + else if(strcmp(graph, "iobw") == 0) + g.fn = iograph; + else if(strcmp(graph, "netbw") == 0) + g.fn = netgraph; + else if(strcmp(graph, "diff") == 0) + g.fn = diffgraph; + else if(strcmp(graph, "pct") == 0) + g.fn = pctgraph; + else if(strcmp(graph, "pctdiff") == 0) + g.fn = pctdiffgraph; + else if(strcmp(graph, "divdiff") == 0) + g.fn = divdiffgraph; + else{ + werrstr("unknown graph %s", graph); + goto error; + } + + if(dotext){ + hsettype(c, "text/plain"); + dotextbin(&c->hout, &g); + hflush(&c->hout); + return 0; + } + + m = statgraph(&g); + if(m == nil) + goto error; + + if(hsettype(c, "image/png") < 0) + return -1; + hout = &c->hout; + writepng(hout, m); + qlock(&memdrawlock); + freememimage(m); + qunlock(&memdrawlock); 
+ hflush(hout); + return 0; + +error: + return herror(c); +} + +static int +xloglist(HConnect *c) +{ + if(hsettype(c, "text/html") < 0) + return -1; + vtloghlist(&c->hout); + hflush(&c->hout); + return 0; +} + +static int +xlog(HConnect *c) +{ + char *name; + VtLog *l; + + name = hargstr(c, "log", ""); + if(!name[0]) + return xloglist(c); + l = vtlogopen(name, 0); + if(l == nil) + return hnotfound(c); + if(hsettype(c, "text/html") < 0){ + vtlogclose(l); + return -1; + } + vtloghdump(&c->hout, l); + vtlogclose(l); + hflush(&c->hout); + return 0; +} + +static int +xindex(HConnect *c) +{ + if(hsettype(c, "text/xml") < 0) + return -1; + xmlindex(&c->hout, mainindex, "index", 0); + hflush(&c->hout); + return 0; +} + +void +xmlindent(Hio *hout, int indent) +{ + int i; + + for(i = 0; i < indent; i++) + hputc(hout, '\t'); +} + +void +xmlaname(Hio *hout, char *v, char *tag) +{ + hprint(hout, " %s=\"%s\"", tag, v); +} + +void +xmlscore(Hio *hout, u8int *v, char *tag) +{ + if(scorecmp(zeroscore, v) == 0) + return; + hprint(hout, " %s=\"%V\"", tag, v); +} + +void +xmlsealed(Hio *hout, int v, char *tag) +{ + if(!v) + return; + hprint(hout, " %s=\"yes\"", tag); +} + +void +xmlu32int(Hio *hout, u32int v, char *tag) +{ + hprint(hout, " %s=\"%ud\"", tag, v); +} + +void +xmlu64int(Hio *hout, u64int v, char *tag) +{ + hprint(hout, " %s=\"%llud\"", tag, v); +} + +void +vtloghdump(Hio *h, VtLog *l) +{ + int i; + VtLogChunk *c; + char *name; + + name = l ? 
l->name : "<nil>"; + + hprint(h, "<html><head>\n"); + hprint(h, "<title>Venti Server Log: %s</title>\n", name); + hprint(h, "</head><body>\n"); + hprint(h, "<b>Venti Server Log: %s</b>\n<p>\n", name); + + if(l){ + c = l->w; + for(i=0; i<l->nchunk; i++){ + if(++c == l->chunk+l->nchunk) + c = l->chunk; + hwrite(h, c->p, c->wp-c->p); + } + } + hprint(h, "</body></html>\n"); +} + +static int +strpcmp(const void *va, const void *vb) +{ + return strcmp(*(char**)va, *(char**)vb); +} + +void +vtloghlist(Hio *h) +{ + char **p; + int i, n; + + hprint(h, "<html><head>\n"); + hprint(h, "<title>Venti Server Logs</title>\n"); + hprint(h, "</head><body>\n"); + hprint(h, "<b>Venti Server Logs</b>\n<p>\n"); + + p = vtlognames(&n); + qsort(p, n, sizeof(p[0]), strpcmp); + for(i=0; i<n; i++) + hprint(h, "<a href=\"/log?log=%s\">%s</a><br>\n", p[i], p[i]); + vtfree(p); + hprint(h, "</body></html>\n"); +} diff --git a/sys/src/cmd/venti/srv/icache.c b/sys/src/cmd/venti/srv/icache.c new file mode 100755 index 000000000..67faba209 --- /dev/null +++ b/sys/src/cmd/venti/srv/icache.c @@ -0,0 +1,571 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +int icacheprefetch = 1; + +typedef struct ICache ICache; +typedef struct IHash IHash; +typedef struct ISum ISum; + +struct ICache +{ + QLock lock; + Rendez full; + IHash *hash; + IEntry *entries; + int nentries; + IEntry free; + IEntry clean; + IEntry dirty; + u32int maxdirty; + u32int ndirty; + AState as; + + ISum **sum; + int nsum; + IHash *shash; + IEntry *sentries; + int nsentries; +}; + +static ICache icache; + +/* + * Hash table of IEntries + */ + +struct IHash +{ + int bits; + u32int size; + IEntry **table; +}; + +static IHash* +mkihash(int size1) +{ + u32int size; + int bits; + IHash *ih; + + bits = 0; + size = 1; + while(size < size1){ + bits++; + size <<= 1; + } + + ih = vtmallocz(sizeof(IHash)+size*sizeof(ih->table[0])); + ih->table = (IEntry**)(ih+1); + ih->bits = bits; + ih->size = size; + return ih; +} + +static IEntry* 
+ihashlookup(IHash *ih, u8int score[VtScoreSize], int type) +{ + u32int h; + IEntry *ie; + + h = hashbits(score, ih->bits); + for(ie=ih->table[h]; ie; ie=ie->nexthash) + if((type == -1 || type == ie->ia.type) && scorecmp(score, ie->score) == 0) + return ie; + return nil; +} + +static void +ihashdelete(IHash *ih, IEntry *ie, char *what) +{ + u32int h; + IEntry **l; + + h = hashbits(ie->score, ih->bits); + for(l=&ih->table[h]; *l; l=&(*l)->nexthash) + if(*l == ie){ + *l = ie->nexthash; + return; + } + fprint(2, "warning: %s %V not found in ihashdelete\n", what, ie->score); +} + +static void +ihashinsert(IHash *ih, IEntry *ie) +{ + u32int h; + + h = hashbits(ie->score, ih->bits); + ie->nexthash = ih->table[h]; + ih->table[h] = ie; +} + + +/* + * IEntry lists. + */ + +static IEntry* +popout(IEntry *ie) +{ + if(ie->prev == nil && ie->next == nil) + return ie; + ie->prev->next = ie->next; + ie->next->prev = ie->prev; + ie->next = nil; + ie->prev = nil; + return ie; +} + +static IEntry* +poplast(IEntry *list) +{ + if(list->prev == list) + return nil; + return popout(list->prev); +} + +static IEntry* +pushfirst(IEntry *list, IEntry *ie) +{ + popout(ie); + ie->prev = list; + ie->next = list->next; + ie->prev->next = ie; + ie->next->prev = ie; + return ie; +} + +/* + * Arena summary cache. 
+ */ +struct ISum +{ + QLock lock; + IEntry *entries; + int nentries; + int loaded; + u64int addr; + u64int limit; + Arena *arena; + int g; +}; + +static ISum* +scachelookup(u64int addr) +{ + int i; + ISum *s; + + for(i=0; i<icache.nsum; i++){ + s = icache.sum[i]; + if(s->addr <= addr && addr < s->limit){ + if(i > 0){ + memmove(icache.sum+1, icache.sum, i*sizeof icache.sum[0]); + icache.sum[0] = s; + } + return s; + } + } + return nil; +} + +static void +sumclear(ISum *s) +{ + int i; + + for(i=0; i<s->nentries; i++) + ihashdelete(icache.shash, &s->entries[i], "scache"); + s->nentries = 0; + s->loaded = 0; + s->addr = 0; + s->limit = 0; + s->arena = nil; + s->g = 0; +} + +static ISum* +scacheevict(void) +{ + ISum *s; + int i; + + for(i=icache.nsum-1; i>=0; i--){ + s = icache.sum[i]; + if(canqlock(&s->lock)){ + if(i > 0){ + memmove(icache.sum+1, icache.sum, i*sizeof icache.sum[0]); + icache.sum[0] = s; + } + sumclear(s); + return s; + } + } + return nil; +} + +static void +scachehit(u64int addr) +{ + scachelookup(addr); /* for move-to-front */ +} + +static void +scachesetup(ISum *s, u64int addr) +{ + u64int addr0, limit; + int g; + + s->arena = amapitoag(mainindex, addr, &addr0, &limit, &g); + s->addr = addr0; + s->limit = limit; + s->g = g; +} + +static void +scacheload(ISum *s) +{ + int i, n; + + s->loaded = 1; + n = asumload(s->arena, s->g, s->entries, ArenaCIGSize); + /* + * n can be less then ArenaCIGSize, either if the clump group + * is the last in the arena and is only partially filled, or if there + * are corrupt clumps in the group -- those are not returned. 
+ */ + for(i=0; i<n; i++){ + s->entries[i].ia.addr += s->addr; + ihashinsert(icache.shash, &s->entries[i]); + } +//fprint(2, "%T scacheload %s %d - %d entries\n", s->arena->name, s->g, n); + addstat(StatScachePrefetch, n); + s->nentries = n; +} + +static ISum* +scachemiss(u64int addr) +{ + ISum *s; + + if(!icacheprefetch) + return nil; + s = scachelookup(addr); + if(s == nil){ + /* first time: make an entry in the cache but don't populate it yet */ + s = scacheevict(); + if(s == nil) + return nil; + scachesetup(s, addr); + qunlock(&s->lock); + return nil; + } + + /* second time: load from disk */ + qlock(&s->lock); + if(s->loaded || !icacheprefetch){ + qunlock(&s->lock); + return nil; + } + + return s; /* locked */ +} + +/* + * Index cache. + */ + +void +initicache(u32int mem0) +{ + u32int mem; + int i, entries, scache; + + icache.full.l = &icache.lock; + + mem = mem0; + entries = mem / (sizeof(IEntry)+sizeof(IEntry*)); + scache = (entries/8) / ArenaCIGSize; + entries -= entries/8; + if(scache < 4) + scache = 4; + if(scache > 16) + scache = 16; + if(entries < 1000) + entries = 1000; +fprint(2, "icache %,d bytes = %,d entries; %d scache\n", mem0, entries, scache); + + icache.clean.prev = icache.clean.next = &icache.clean; + icache.dirty.prev = icache.dirty.next = &icache.dirty; + icache.free.prev = icache.free.next = &icache.free; + + icache.hash = mkihash(entries); + icache.nentries = entries; + setstat(StatIcacheSize, entries); + icache.entries = vtmallocz(entries*sizeof icache.entries[0]); + icache.maxdirty = entries / 2; + for(i=0; i<entries; i++) + pushfirst(&icache.free, &icache.entries[i]); + + icache.nsum = scache; + icache.sum = vtmallocz(scache*sizeof icache.sum[0]); + icache.sum[0] = vtmallocz(scache*sizeof icache.sum[0][0]); + icache.nsentries = scache * ArenaCIGSize; + icache.sentries = vtmallocz(scache*ArenaCIGSize*sizeof icache.sentries[0]); + icache.shash = mkihash(scache*ArenaCIGSize); + for(i=0; i<scache; i++){ + icache.sum[i] = icache.sum[0] + i; 
+ icache.sum[i]->entries = icache.sentries + i*ArenaCIGSize; + } +} + + +/* Take an entry off the clean list with poplast and remove it from icache.hash; returns nil if the clean list yields nothing. */ static IEntry* +evictlru(void) +{ + IEntry *ie; + + ie = poplast(&icache.clean); + if(ie == nil) + return nil; + ihashdelete(icache.hash, ie, "evictlru"); + return ie; +} + +/* Store (score, ia) in the index cache with the given state, reusing a free entry or evicting a clean one; while neither exists it flushes, kicks the writer and sleeps on icache.full. Both callers in this file hold icache.lock. */ static void +icacheinsert(u8int score[VtScoreSize], IAddr *ia, int state) +{ + IEntry *ie; + + if((ie = poplast(&icache.free)) == nil && (ie = evictlru()) == nil){ + addstat(StatIcacheStall, 1); + while((ie = poplast(&icache.free)) == nil && (ie = evictlru()) == nil){ + // Could safely return here if state == IEClean. + // But if state == IEDirty, have to wait to make + // sure we don't lose an index write. + // Let's wait all the time. + flushdcache(); + kickicache(); + rsleep(&icache.full); + } + addstat(StatIcacheStall, -1); + } + + memmove(ie->score, score, VtScoreSize); + ie->state = state; + ie->ia = *ia; + if(state == IEClean){ + addstat(StatIcachePrefetch, 1); + pushfirst(&icache.clean, ie); + }else{ + addstat(StatIcacheWrite, 1); + assert(state == IEDirty); + icache.ndirty++; + setstat(StatIcacheDirty, icache.ndirty); + delaykickicache(); + pushfirst(&icache.dirty, ie); + } + ihashinsert(icache.hash, ie); +} + +/* Look score/type up in the index cache and then in the summary-entry hash, under icache.lock. On a hit copies the address to *ia and returns 0 (a summary hit is also inserted into the index cache as clean); returns -1 on a miss. */ int +icachelookup(u8int score[VtScoreSize], int type, IAddr *ia) +{ + IEntry *ie; + + qlock(&icache.lock); + addstat(StatIcacheLookup, 1); + if((ie = ihashlookup(icache.hash, score, type)) != nil){ + *ia = ie->ia; + if(ie->state == IEClean) + pushfirst(&icache.clean, ie); + addstat(StatIcacheHit, 1); + qunlock(&icache.lock); + return 0; + } + + if((ie = ihashlookup(icache.shash, score, type)) != nil){ + *ia = ie->ia; + icacheinsert(score, &ie->ia, IEClean); + scachehit(ie->ia.addr); + addstat(StatScacheHit, 1); + qunlock(&icache.lock); + return 0; + } + addstat(StatIcacheMiss, 1); + qunlock(&icache.lock); + + return -1; +} + +/* Insert (score, ia) into the index cache under icache.lock. A clean insert may hand back a summary slot to load; a dirty insert records *as in icache.as and marks the bloom filter. Always returns 0. */ int +insertscore(u8int score[VtScoreSize], IAddr *ia, int state, AState *as) +{ + ISum *toload; + + qlock(&icache.lock); + icacheinsert(score, ia, state); + if(state == IEClean) + 
toload = scachemiss(ia->addr); + else{ + assert(state == IEDirty); + toload = nil; + if(as == nil) + fprint(2, "%T insertscore IEDirty without as; called from %#p\n", + getcallerpc(&score)); + else{ + if(icache.as.aa > as->aa) + fprint(2, "%T insertscore: aa moving backward: %#llux -> %#llux\n", icache.as.aa, as->aa); + icache.as = *as; + } + } + qunlock(&icache.lock); + if(toload){ + /* scachemiss returned the slot locked; load it outside icache.lock, then release it */ + scacheload(toload); + qunlock(&toload->lock); + } + + if(icache.ndirty >= icache.maxdirty) + kickicache(); + + /* + * It's okay not to do this under icache.lock. + * Calling insertscore only happens when we hold + * the lump, meaning any searches for this block + * will hit in the lump cache until after we return. + */ + if(state == IEDirty) + markbloomfilter(mainindex->bloom, score); + + return 0; +} + +/* + * Find the index address for score/type: try the index cache first, + * then the on-disk index, caching a disk hit as a clean entry. + * Returns 0 with *ia filled in, or -1 if the score is not found. + */ +int +lookupscore(u8int score[VtScoreSize], int type, IAddr *ia) +{ + int ms, ret; + IEntry d; + + if(icachelookup(score, type, ia) >= 0){ + addstat(StatIcacheRead, 1); + return 0; + } + + ms = msec(); + addstat(StatIcacheFill, 1); + if(loadientry(mainindex, score, type, &d) < 0) + ret = -1; + else{ + ret = 0; + insertscore(score, &d.ia, IEClean, nil); + *ia = d.ia; + } + addstat2(StatIcacheRead, 1, StatIcacheReadTime, msec() - ms); + return ret; +} + +/* + * Return the top bits bits of the first four bytes of sc, + * read as a big-endian 32-bit number. + * Each byte is widened to u32int before shifting so that a first + * byte of 0x80 or more cannot overflow a signed int, and + * bits <= 0 yields 0 rather than a shift by the full width. + */ +u32int +hashbits(u8int *sc, int bits) +{ + u32int v; + + if(bits <= 0) + return 0; + v = ((u32int)sc[0] << 24) | ((u32int)sc[1] << 16) | ((u32int)sc[2] << 8) | sc[3]; + if(bits < 32) + v >>= (32 - bits); + return v; +} + +/* + * Number of dirty entries relative to the cache size, scaled by IcacheFrac. + */ +ulong +icachedirtyfrac(void) +{ + return (vlong)icache.ndirty*IcacheFrac / icache.nentries; +} + +/* + * Return a singly-linked list of dirty index entries + * with 32-bit hash numbers between lo and hi (inclusive) + * and address <= limit. 
+ */ +IEntry* +icachedirty(u32int lo, u32int hi, u64int limit) +{ + u32int h; + IEntry *ie, *dirty; + + dirty = nil; + trace(TraceProc, "icachedirty enter"); + qlock(&icache.lock); + for(ie = icache.dirty.next; ie != &icache.dirty; ie=ie->next){ + if(ie->state == IEDirty && ie->ia.addr <= limit){ + h = hashbits(ie->score, 32); + if(lo <= h && h <= hi){ + /* chain through nextdirty; the entry also stays on icache.dirty */ ie->nextdirty = dirty; + dirty = ie; + } + } + } + qunlock(&icache.lock); + trace(TraceProc, "icachedirty exit"); + if(dirty == nil) + flushdcache(); + return dirty; +} + +/* Return a copy of icache.as taken under icache.lock. */ AState +icachestate(void) +{ + AState as; + + qlock(&icache.lock); + as = icache.as; + qunlock(&icache.lock); + return as; +} + +/* + * The singly-linked non-circular list of index entries ie + * has been written to disk. Move them to the clean list. + */ +void +icacheclean(IEntry *ie) +{ + IEntry *next; + + trace(TraceProc, "icacheclean enter"); + qlock(&icache.lock); + for(; ie; ie=next){ + assert(ie->state == IEDirty); + next = ie->nextdirty; + ie->nextdirty = nil; + popout(ie); /* from icache.dirty */ + icache.ndirty--; + ie->state = IEClean; + pushfirst(&icache.clean, ie); + } + setstat(StatIcacheDirty, icache.ndirty); + /* runs even for an empty list, so icacheclean(nil) just wakes sleepers on icache.full */ rwakeupall(&icache.full); + qunlock(&icache.lock); + trace(TraceProc, "icacheclean exit"); +} + +/* Move every entry evictlru can supply (the clean ones) to the free list and clear all summary slots; entries on the dirty list are not touched. */ void +emptyicache(void) +{ + int i; + IEntry *ie; + ISum *s; + + qlock(&icache.lock); + while((ie = evictlru()) != nil) + pushfirst(&icache.free, ie); + for(i=0; i<icache.nsum; i++){ + s = icache.sum[i]; + qlock(&s->lock); + sumclear(s); + qunlock(&s->lock); + } + qunlock(&icache.lock); +} + diff --git a/sys/src/cmd/venti/srv/icachewrite.c b/sys/src/cmd/venti/srv/icachewrite.c new file mode 100755 index 000000000..e1406ef15 --- /dev/null +++ b/sys/src/cmd/venti/srv/icachewrite.c @@ -0,0 +1,358 @@ +/* + * Write the dirty icache entries to disk. Random seeks are + * so expensive that it makes sense to wait until we have + * a lot and then just make a sequential pass over the disk. 
+ */ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +static void icachewriteproc(void*); +static void icachewritecoord(void*); +static IEntry *iesort(IEntry*); + +int icachesleeptime = 1000; /* milliseconds */ +int minicachesleeptime = 0; + +enum +{ + Bufsize = 8*1024*1024 +}; + +typedef struct IWrite IWrite; +struct IWrite +{ + Round round; + AState as; +}; + +static IWrite iwrite; + +/* Create the write and write-done channels and a writer proc for every index section, then start the coordinator proc and the delayed-kick proc for iwrite.round. */ void +initicachewrite(void) +{ + int i; + Index *ix; + + initround(&iwrite.round, "icache", 120*60*1000); + ix = mainindex; + for(i=0; i<ix->nsects; i++){ + ix->sects[i]->writechan = chancreate(sizeof(ulong), 1); + ix->sects[i]->writedonechan = chancreate(sizeof(ulong), 1); + vtproc(icachewriteproc, ix->sects[i]); + } + vtproc(icachewritecoord, nil); + vtproc(delaykickroundproc, &iwrite.round); +} + +/* Address within section is of the bucket that the score of ie hashes to. */ static u64int +ie2diskaddr(Index *ix, ISect *is, IEntry *ie) +{ + u64int bucket, addr; + + bucket = hashbits(ie->score, 32)/ix->div; + addr = is->blockbase + ((bucket - is->start) << is->blocklog); + return addr; +} + +/* Cut the leading run off the list *pie: entries whose bucket lies less than Bufsize bytes past the first entry's bucket. Returns the run (nil-terminated), leaves the remainder in *pie, and reports the run's start address in *paddr and its length in bytes in *pnbuf. */ static IEntry* +nextchunk(Index *ix, ISect *is, IEntry **pie, u64int *paddr, uint *pnbuf) +{ + u64int addr, naddr; + uint nbuf; + int bsize; + IEntry *iefirst, *ie, **l; + + bsize = 1<<is->blocklog; + iefirst = *pie; + addr = ie2diskaddr(ix, is, iefirst); + nbuf = 0; + for(l = &iefirst->nextdirty; (ie = *l) != nil; l = &(*l)->nextdirty){ + naddr = ie2diskaddr(ix, is, ie); + if(naddr - addr >= Bufsize) + break; + nbuf = naddr - addr; + } + /* nbuf was the offset of the last bucket kept; include that bucket itself */ nbuf += bsize; + + *l = nil; + *pie = ie; + *paddr = addr; + *pnbuf = nbuf; + return iefirst; +} + +/* Write the dirty index cache entries that hash into section is to disk, one chunk at a time through buf. Returns 0, or -1 if any chunk or entry could not be written. */ static int +icachewritesect(Index *ix, ISect *is, u8int *buf) +{ + int err, i, werr, h, bsize, t; + u32int lo, hi; + u64int addr, naddr; + uint nbuf, off; + DBlock *b; + IBucket ib; + IEntry *ie, *iedirty, **l, *chunk; + + /* lo..hi is the range of 32-bit hash values covered by this section, capped at TWID32 */ lo = is->start * ix->div; + if(TWID32/ix->div < is->stop) + hi = TWID32; + else + hi = is->stop * ix->div - 1; + + trace(TraceProc, "icachewritesect enter %ud %ud %llud", + lo, hi, iwrite.as.aa); + + iedirty = 
icachedirty(lo, hi, iwrite.as.aa); + iedirty = iesort(iedirty); + bsize = 1 << is->blocklog; + err = 0; + + while(iedirty){ + /* pace the writer: wait while the sleep time is SleepForever, otherwise sleep at least minicachesleeptime */ + disksched(); + while((t = icachesleeptime) == SleepForever){ + sleep(1000); + disksched(); + } + if(t < minicachesleeptime) + t = minicachesleeptime; + if(t > 0) + sleep(t); + trace(TraceProc, "icachewritesect nextchunk"); + chunk = nextchunk(ix, is, &iedirty, &addr, &nbuf); + + trace(TraceProc, "icachewritesect readpart 0x%llux+0x%ux", + addr, nbuf); + if(readpart(is->part, addr, buf, nbuf) < 0){ + fprint(2, "%s: part %s addr 0x%llux: icachewritesect " + "readpart: %r\n", argv0, is->part->name, addr); + err = -1; + continue; + } + trace(TraceProc, "icachewritesect updatebuf"); + addstat(StatIsectReadBytes, nbuf); + addstat(StatIsectRead, 1); + + /* merge every entry of the chunk into its bucket inside buf */ + for(l=&chunk; (ie=*l)!=nil; l=&ie->nextdirty){ +again: + naddr = ie2diskaddr(ix, is, ie); + off = naddr - addr; + if(off+bsize > nbuf){ + fprint(2, "%s: whoops! addr=0x%llux nbuf=%ud " + "addr+nbuf=0x%llux naddr=0x%llux\n", + argv0, addr, nbuf, addr+nbuf, naddr); + assert(off+bsize <= nbuf); + } + unpackibucket(&ib, buf+off, is->bucketmagic); + if(okibucket(&ib, is) < 0){ + fprint(2, "%s: bad bucket XXX\n", argv0); + goto skipit; + } + trace(TraceProc, "icachewritesect add %V at 0x%llux", + ie->score, naddr); + /* low bit of h set: the score is already in the bucket at byte offset h^1; otherwise h is the byte offset at which to insert */ + h = bucklook(ie->score, ie->ia.type, ib.data, ib.n); + if(h & 1){ + h ^= 1; + packientry(ie, &ib.data[h]); + }else if(ib.n < is->buckmax){ + /* shift the entries from h onward up by one slot to open a gap */ + memmove(&ib.data[h + IEntrySize], &ib.data[h], + ib.n*IEntrySize - h); + ib.n++; + packientry(ie, &ib.data[h]); + }else{ + fprint(2, "%s: bucket overflow XXX\n", argv0); +skipit: + /* unlink ie from the chunk so that icacheclean will not mark it clean */ + err = -1; + *l = ie->nextdirty; + ie = *l; + if(ie) + goto again; + else + break; + } + packibucket(&ib, buf+off, is->bucketmagic); + } + + diskaccess(1); + + /* the format used to lack conversions, so addr and nbuf were passed but never shown; match the readpart trace above */ + trace(TraceProc, "icachewritesect writepart 0x%llux+0x%ux", addr, nbuf); + werr = 0; + if(writepart(is->part, addr, buf, nbuf) < 0 || flushpart(is->part) < 0) + werr = -1; + + for(i=0; i<nbuf; i+=bsize){ + if((b = _getdblock(is->part, 
addr+i, ORDWR, 0)) != nil){ + memmove(b->data, buf+i, bsize); + putdblock(b); + } + } + + if(werr < 0){ + fprint(2, "%s: part %s addr 0x%llux: icachewritesect " + "writepart: %r\n", argv0, is->part->name, addr); + err = -1; + continue; + } + + addstat(StatIsectWriteBytes, nbuf); + addstat(StatIsectWrite, 1); + /* only a chunk that reached the disk has its entries moved to the clean list */ icacheclean(chunk); + } + + trace(TraceProc, "icachewritesect done"); + return err; +} + +/* Writer proc for one index section (v is the ISect): every message on is->writechan triggers one icachewritesect pass, whose result is sent on is->writedonechan. Loops forever. */ static void +icachewriteproc(void *v) +{ + int ret; + uint bsize; + ISect *is; + Index *ix; + u8int *buf; + + ix = mainindex; + is = v; + threadsetname("icachewriteproc:%s", is->part->name); + + bsize = 1<<is->blocklog; + buf = emalloc(Bufsize+bsize); + /* round buf up to a multiple of bsize; the allocation above is bsize bytes larger than Bufsize */ buf = (u8int*)(((uintptr)buf+bsize-1)&~(uintptr)(bsize-1)); + + for(;;){ + trace(TraceProc, "icachewriteproc recv"); + recv(is->writechan, 0); + trace(TraceWork, "start"); + ret = icachewritesect(ix, is, buf); + trace(TraceProc, "icachewriteproc send"); + trace(TraceWork, "finish"); + sendul(is->writedonechan, ret); + } +} + +/* Coordinator proc: on each kick of iwrite.round, snapshot the arena state; if it has moved since the last round, start every section writer (and the bloom writer, if any) and collect their results. Loops forever. */ static void +icachewritecoord(void *v) +{ + int i, err; + Index *ix; + AState as; + + USED(v); + + threadsetname("icachewritecoord"); + + ix = mainindex; + iwrite.as = icachestate(); + + for(;;){ + trace(TraceProc, "icachewritecoord sleep"); + waitforkick(&iwrite.round); + trace(TraceWork, "start"); + as = icachestate(); + if(as.arena==iwrite.as.arena && as.aa==iwrite.as.aa){ + /* will not be able to do anything more than last flush - kick disk */ + trace(TraceProc, "icachewritecoord kick dcache"); + kickdcache(); + trace(TraceProc, "icachewritecoord kicked dcache"); + goto SkipWork; /* won't do anything; don't bother rewriting bloom filter */ + } + iwrite.as = as; + + trace(TraceProc, "icachewritecoord start flush"); + if(iwrite.as.arena){ + for(i=0; i<ix->nsects; i++) + send(ix->sects[i]->writechan, 0); + if(ix->bloom) + send(ix->bloom->writechan, 0); + + err = 0; + for(i=0; i<ix->nsects; i++) + err |= recvul(ix->sects[i]->writedonechan); + if(ix->bloom) + err |= recvul(ix->bloom->writedonechan); + 
+ trace(TraceProc, "icachewritecoord donewrite err=%d", err); + if(err == 0){ + setatailstate(&iwrite.as); + } + } + SkipWork: + icacheclean(nil); /* wake up anyone waiting */ + trace(TraceWork, "finish"); + addstat(StatIcacheFlush, 1); + } +} + +void +flushicache(void) +{ + trace(TraceProc, "flushicache enter"); + kickround(&iwrite.round, 1); + trace(TraceProc, "flushicache exit"); +} + +void +kickicache(void) +{ + kickround(&iwrite.round, 0); +} + +void +delaykickicache(void) +{ + delaykickround(&iwrite.round); +} + +static IEntry* +iesort(IEntry *ie) +{ + int cmp; + IEntry **l; + IEntry *ie1, *ie2, *sorted; + + if(ie == nil || ie->nextdirty == nil) + return ie; + + /* split the lists */ + ie1 = ie; + ie2 = ie; + if(ie2) + ie2 = ie2->nextdirty; + if(ie2) + ie2 = ie2->nextdirty; + while(ie1 && ie2){ + ie1 = ie1->nextdirty; + ie2 = ie2->nextdirty; + if(ie2) + ie2 = ie2->nextdirty; + } + if(ie1){ + ie2 = ie1->nextdirty; + ie1->nextdirty = nil; + } + + /* sort the lists */ + ie1 = iesort(ie); + ie2 = iesort(ie2); + + /* merge the lists */ + sorted = nil; + l = &sorted; + cmp = 0; + while(ie1 || ie2){ + if(ie1 && ie2) + cmp = scorecmp(ie1->score, ie2->score); + if(ie1==nil || (ie2 && cmp > 0)){ + *l = ie2; + l = &ie2->nextdirty; + ie2 = ie2->nextdirty; + }else{ + *l = ie1; + l = &ie1->nextdirty; + ie1 = ie1->nextdirty; + } + } + *l = nil; + return sorted; +} + diff --git a/sys/src/cmd/venti/srv/ifile.c b/sys/src/cmd/venti/srv/ifile.c new file mode 100755 index 000000000..36d96b941 --- /dev/null +++ b/sys/src/cmd/venti/srv/ifile.c @@ -0,0 +1,149 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +static char vcmagic[] = "venti config\n"; + +enum { + Maxconfig = 8 * 1024, + Maglen = sizeof vcmagic - 1, +}; + +int +readifile(IFile *f, char *name) +{ + Part *p; + ZBlock *b; + u8int *z; + + p = initpart(name, OREAD); + if(p == nil) + return -1; + b = alloczblock(Maxconfig+1, 1, 0); + if(b == nil){ + seterr(EOk, "can't alloc for %s: %R", name); + return -1; + } + 
if(p->size > PartBlank){ + /* + * this is likely a real venti partition, in which case we're + * looking for the config file stored as 8k at end of PartBlank. + */ + if(readpart(p, PartBlank-Maxconfig, b->data, Maxconfig) < 0){ + seterr(EOk, "can't read %s: %r", name); + freezblock(b); + freepart(p); + return -1; + } + b->data[Maxconfig] = '\0'; + if(memcmp(b->data, vcmagic, Maglen) != 0){ + seterr(EOk, "bad venti config magic in %s", name); + freezblock(b); + freepart(p); + return -1; + } + /* + * if we change b->data+b->_size, freezblock + * will blow an assertion, so don't. + */ + b->data += Maglen; + b->_size -= Maglen; + b->len -= Maglen; + /* the text ends at the first NUL byte, if there is one */ z = memchr(b->data, '\0', b->len); + if(z) + b->len = z - b->data; + }else if(p->size > Maxconfig){ + seterr(EOk, "config file is too large"); + freepart(p); + freezblock(b); + return -1; + }else{ + /* small enough to be a plain file: drop the scratch block and read the file whole */ freezblock(b); + b = readfile(name); + if(b == nil){ + freepart(p); + return -1; + } + } + freepart(p); + f->name = name; + f->b = b; + f->pos = 0; + return 0; +} + +/* Release the block held by f and reset f to empty. */ void +freeifile(IFile *f) +{ + freezblock(f->b); + f->b = nil; + f->pos = 0; +} + +/* Fill f with size bytes read from part starting at offset start. Returns 0, or -1 if the block cannot be allocated or read. */ int +partifile(IFile *f, Part *part, u64int start, u32int size) +{ + ZBlock *b; + + b = alloczblock(size, 0, part->blocksize); + if(b == nil) + return -1; + if(readpart(part, start, b->data, size) < 0){ + seterr(EAdmin, "can't read %s: %r", part->name); + freezblock(b); + return -1; + } + f->name = part->name; + f->b = b; + f->pos = 0; + return 0; +} + +/* + * return the next non-blank input line, + * stripped of leading white space and with # comments eliminated + */ +char* +ifileline(IFile *f) +{ + char *s, *e, *t; + int c; + + for(;;){ + s = (char*)&f->b->data[f->pos]; + e = memchr(s, '\n', f->b->len - f->pos); + /* no newline left: end of input (a final line without a newline is not returned) */ if(e == nil) + return nil; + /* the buffer is edited in place: newline and comment start become NUL */ *e++ = '\0'; + f->pos = e - (char*)f->b->data; + t = strchr(s, '#'); + if(t != nil) + *t = '\0'; + for(; c = *s; s++) + if(c != ' ' && c != '\t' && c != '\r') + return s; + } +} + +/* Copy the next input line into dst as a name. Returns 0, or -1 if there is no line or it is ANameSize characters or longer. */ int +ifilename(IFile *f, char *dst) +{ + char *s; + + s = 
ifileline(f); + /* reject a missing line or a name of ANameSize characters or more */ if(s == nil || strlen(s) >= ANameSize) + return -1; + namecp(dst, s); + return 0; +} + +/* Parse the next input line into *r with stru32int. Returns -1 if there is no line, otherwise whatever stru32int returns. */ int +ifileu32int(IFile *f, u32int *r) +{ + char *s; + + s = ifileline(f); + if(s == nil) + return -1; + return stru32int(s, r); +} diff --git a/sys/src/cmd/venti/srv/index.c b/sys/src/cmd/venti/srv/index.c new file mode 100755 index 000000000..9877893ec --- /dev/null +++ b/sys/src/cmd/venti/srv/index.c @@ -0,0 +1,866 @@ +/* + * Index, mapping scores to log positions. + * + * The index is made up of some number of index sections, each of + * which is typically stored on a different disk. The blocks in all the + * index sections are logically numbered, with each index section + * responsible for a range of blocks. Blocks are typically 8kB. + * + * The N index blocks are treated as a giant hash table. The top 32 bits + * of score are used as the key for a lookup. Each index block holds + * one hash bucket, which is responsible for ceil(2^32 / N) of the key space. + * + * The index is sized so that a particular bucket is extraordinarily + * unlikely to overflow: assuming compressed data blocks are 4kB + * on disk, and assuming each block has a 40 byte index entry, + * the index data will be 1% of the total data. Since scores are essentially + * random, all buckets should be about the same fullness. + * A factor of 5 gives us a wide comfort boundary to account for + * random variation. So the index disk space should be 5% of the arena disk space. + */ + +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +static int initindex1(Index*); +static ISect *initisect1(ISect *is); + +/* KEY: the top d bits of the 32-bit key k, or 0 when d is 0 */ #define KEY(k,d) ((d) ? 
(k)>>(32-(d)) : 0) + +static char IndexMagic[] = "venti index configuration"; + +Index* +initindex(char *name, ISect **sects, int n) +{ + IFile f; + Index *ix; + ISect *is; + u32int last, blocksize, tabsize; + int i; + + if(n <= 0){ +fprint(2, "bad n\n"); + seterr(EOk, "no index sections to initialize index"); + return nil; + } + ix = MKZ(Index); + if(ix == nil){ +fprint(2, "no mem\n"); + seterr(EOk, "can't initialize index: out of memory"); + freeindex(ix); + return nil; + } + + tabsize = sects[0]->tabsize; + if(partifile(&f, sects[0]->part, sects[0]->tabbase, tabsize) < 0) + return nil; + if(parseindex(&f, ix) < 0){ + freeifile(&f); + freeindex(ix); + return nil; + } + freeifile(&f); + if(namecmp(ix->name, name) != 0){ + seterr(ECorrupt, "mismatched index name: found %s expected %s", ix->name, name); + return nil; + } + if(ix->nsects != n){ + seterr(ECorrupt, "mismatched number index sections: found %d expected %d", n, ix->nsects); + freeindex(ix); + return nil; + } + ix->sects = sects; + last = 0; + blocksize = ix->blocksize; + for(i = 0; i < ix->nsects; i++){ + is = sects[i]; + if(namecmp(ix->name, is->index) != 0 + || is->blocksize != blocksize + || is->tabsize != tabsize + || namecmp(is->name, ix->smap[i].name) != 0 + || is->start != ix->smap[i].start + || is->stop != ix->smap[i].stop + || last != is->start + || is->start > is->stop){ + seterr(ECorrupt, "inconsistent index sections in %s", ix->name); + freeindex(ix); + return nil; + } + last = is->stop; + } + ix->tabsize = tabsize; + ix->buckets = last; + + if(initindex1(ix) < 0){ + freeindex(ix); + return nil; + } + + ix->arenas = MKNZ(Arena*, ix->narenas); + if(maparenas(ix->amap, ix->arenas, ix->narenas, ix->name) < 0){ + freeindex(ix); + return nil; + } + + return ix; +} + +static int +initindex1(Index *ix) +{ + u32int buckets; + + ix->div = (((u64int)1 << 32) + ix->buckets - 1) / ix->buckets; + buckets = (((u64int)1 << 32) - 1) / ix->div + 1; + if(buckets != ix->buckets){ + seterr(ECorrupt, "inconsistent 
math for divisor and buckets in %s", ix->name); + return -1; + } + + return 0; +} + +int +wbindex(Index *ix) +{ + Fmt f; + ZBlock *b; + int i; + + if(ix->nsects == 0){ + seterr(EOk, "no sections in index %s", ix->name); + return -1; + } + b = alloczblock(ix->tabsize, 1, ix->blocksize); + if(b == nil){ + seterr(EOk, "can't write index configuration: out of memory"); + return -1; + } + fmtzbinit(&f, b); + if(outputindex(&f, ix) < 0){ + seterr(EOk, "can't make index configuration: table storage too small %d", ix->tabsize); + freezblock(b); + return -1; + } + for(i = 0; i < ix->nsects; i++){ + if(writepart(ix->sects[i]->part, ix->sects[i]->tabbase, b->data, ix->tabsize) < 0 + || flushpart(ix->sects[i]->part) < 0){ + seterr(EOk, "can't write index: %r"); + freezblock(b); + return -1; + } + } + freezblock(b); + + for(i = 0; i < ix->nsects; i++) + if(wbisect(ix->sects[i]) < 0) + return -1; + + return 0; +} + +/* + * index: IndexMagic '\n' version '\n' name '\n' blocksize '\n' [V2: bitblocks '\n'] sections arenas + * version, blocksize: u32int + * name: max. 
ANameSize string + * sections, arenas: AMap + */ +int +outputindex(Fmt *f, Index *ix) +{ + if(fmtprint(f, "%s\n%ud\n%s\n%ud\n", IndexMagic, ix->version, ix->name, ix->blocksize) < 0 + || outputamap(f, ix->smap, ix->nsects) < 0 + || outputamap(f, ix->amap, ix->narenas) < 0) + return -1; + return 0; +} + +/* Parse an index configuration from f into ix: magic, version, name, block size, then the section map and the arena map. Returns 0, or -1 on the first item that is missing or invalid; maps already stored in ix are left for the caller to free. */ int +parseindex(IFile *f, Index *ix) +{ + AMapN amn; + u32int v; + char *s; + + /* + * magic + */ + s = ifileline(f); + if(s == nil || strcmp(s, IndexMagic) != 0){ + seterr(ECorrupt, "bad index magic for %s", f->name); + return -1; + } + + /* + * version + */ + if(ifileu32int(f, &v) < 0){ + seterr(ECorrupt, "syntax error: bad version number in %s", f->name); + return -1; + } + ix->version = v; + if(ix->version != IndexVersion){ + seterr(ECorrupt, "bad version number in %s", f->name); + return -1; + } + + /* + * name + */ + if(ifilename(f, ix->name) < 0){ + seterr(ECorrupt, "syntax error: bad index name in %s", f->name); + return -1; + } + + /* + * block size + */ + if(ifileu32int(f, &v) < 0){ + seterr(ECorrupt, "syntax error: bad block size number in %s", f->name); + return -1; + } + ix->blocksize = v; + + /* section map */ if(parseamap(f, &amn) < 0) + return -1; + ix->nsects = amn.n; + ix->smap = amn.map; + + /* arena map */ if(parseamap(f, &amn) < 0) + return -1; + ix->narenas = amn.n; + ix->amap = amn.map; + + return 0; +} + +/* + * initialize an entirely new index + */ +Index * +newindex(char *name, ISect **sects, int n) +{ + Index *ix; + AMap *smap; + u64int nb; + u32int div, ub, xb, start, stop, blocksize, tabsize; + int i, j; + + if(n < 1){ + seterr(EOk, "creating index with no index sections"); + return nil; + } + + /* + * compute the total buckets available in the index, + * and the total buckets which are used. + */ + nb = 0; + blocksize = sects[0]->blocksize; + tabsize = sects[0]->tabsize; + for(i = 0; i < n; i++){ + /* + * allow index, start, and stop to be set if index is correct + * and start and stop are what we would have picked. 
+ * this allows calling fmtindex to reformat the index after + * replacing a bad index section with a freshly formatted one. + * start and stop are checked below. + */ + if(sects[i]->index[0] != '\0' && strcmp(sects[i]->index, name) != 0){ + seterr(EOk, "creating new index using non-empty section %s", sects[i]->name); + return nil; + } + if(blocksize != sects[i]->blocksize){ + seterr(EOk, "mismatched block sizes in index sections"); + return nil; + } + if(tabsize != sects[i]->tabsize){ + seterr(EOk, "mismatched config table sizes in index sections"); + return nil; + } + nb += sects[i]->blocks; + } + + /* + * check for duplicate names + */ + for(i = 0; i < n; i++){ + for(j = i + 1; j < n; j++){ + if(namecmp(sects[i]->name, sects[j]->name) == 0){ + seterr(EOk, "duplicate section name %s for index %s", sects[i]->name, name); + return nil; + } + } + } + + /* + * nb is the divisor below; sections holding no blocks + * at all would otherwise cause a division by zero. + */ + if(nb == 0){ + seterr(EOk, "no blocks in index sections for index %s", name); + return nil; + } + + if(nb >= ((u64int)1 << 32)){ + fprint(2, "%s: index is 2^32 blocks or more; ignoring some of it\n", + argv0); + nb = ((u64int)1 << 32) - 1; + } + + /* div: width of hash space per bucket, rounded up; ub: buckets actually used; xb: blocks left over */ + div = (((u64int)1 << 32) + nb - 1) / nb; + if(div < 100){ + fprint(2, "%s: index divisor %d too coarse; " + "index larger than needed, ignoring some of it\n", + argv0, div); + div = 100; + nb = (((u64int)1 << 32) - 1) / (100 - 1); + } + ub = (((u64int)1 << 32) - 1) / div + 1; + if(ub > nb){ + seterr(EBug, "index initialization math wrong"); + return nil; + } + xb = nb - ub; + + /* + * initialize each of the index sections + * and the section map table + */ + smap = MKNZ(AMap, n); + if(smap == nil){ + seterr(EOk, "can't create new index: out of memory"); + return nil; + } + start = 0; + for(i = 0; i < n; i++){ + /* each section gives up an equal share of the unused blocks; the last one ends at ub */ + stop = start + sects[i]->blocks - xb / n; + if(i == n - 1) + stop = ub; + + if(sects[i]->start != 0 || sects[i]->stop != 0) + if(sects[i]->start != start || sects[i]->stop != stop){ + seterr(EOk, "creating new index using non-empty section %s", sects[i]->name); + /* smap is not yet owned by an Index, so free it here */ + free(smap); + return nil; + } + + sects[i]->start = start; + sects[i]->stop = stop; + namecp(sects[i]->index, name); + 
+ smap[i].start = start; + smap[i].stop = stop; + namecp(smap[i].name, sects[i]->name); + start = stop; + } + + /* + * initialize the index itself + */ + ix = MKZ(Index); + if(ix == nil){ + seterr(EOk, "can't create new index: out of memory"); + free(smap); + return nil; + } + ix->version = IndexVersion; + namecp(ix->name, name); + ix->sects = sects; + ix->smap = smap; + ix->nsects = n; + ix->blocksize = blocksize; + ix->buckets = ub; + ix->tabsize = tabsize; + ix->div = div; + + if(initindex1(ix) < 0){ + free(smap); + return nil; + } + + return ix; +} + +ISect* +initisect(Part *part) +{ + ISect *is; + ZBlock *b; + int ok; + + b = alloczblock(HeadSize, 0, 0); + if(b == nil || readpart(part, PartBlank, b->data, HeadSize) < 0){ + seterr(EAdmin, "can't read index section header: %r"); + return nil; + } + + is = MKZ(ISect); + if(is == nil){ + freezblock(b); + return nil; + } + is->part = part; + ok = unpackisect(is, b->data); + freezblock(b); + if(ok < 0){ + seterr(ECorrupt, "corrupted index section header: %r"); + freeisect(is); + return nil; + } + + if(is->version != ISectVersion1 && is->version != ISectVersion2){ + seterr(EAdmin, "unknown index section version %d", is->version); + freeisect(is); + return nil; + } + + return initisect1(is); +} + +ISect* +newisect(Part *part, u32int vers, char *name, u32int blocksize, u32int tabsize) +{ + ISect *is; + u32int tabbase; + + is = MKZ(ISect); + if(is == nil) + return nil; + + namecp(is->name, name); + is->version = vers; + is->part = part; + is->blocksize = blocksize; + is->start = 0; + is->stop = 0; + tabbase = (PartBlank + HeadSize + blocksize - 1) & ~(blocksize - 1); + is->blockbase = (tabbase + tabsize + blocksize - 1) & ~(blocksize - 1); + is->blocks = is->part->size / blocksize - is->blockbase / blocksize; + is->bucketmagic = 0; + if(is->version == ISectVersion2){ + do{ + is->bucketmagic = fastrand(); + }while(is->bucketmagic==0); + } + is = initisect1(is); + if(is == nil) + return nil; + + return is; +} + +/* + * 
initialize the computed parameters for an index + */ +static ISect* +initisect1(ISect *is) +{ + u64int v; + + /* on every fatal inconsistency below, is is freed and nil is returned */ + /* buckmax: how many IEntrySize entries fit in a block after IBucketSize bytes */ is->buckmax = (is->blocksize - IBucketSize) / IEntrySize; + is->blocklog = u64log2(is->blocksize); + if(is->blocksize != (1 << is->blocklog)){ + seterr(ECorrupt, "illegal non-power-of-2 bucket size %d\n", is->blocksize); + freeisect(is); + return nil; + } + partblocksize(is->part, is->blocksize); + is->tabbase = (PartBlank + HeadSize + is->blocksize - 1) & ~(is->blocksize - 1); + if(is->tabbase >= is->blockbase){ + seterr(ECorrupt, "index section config table overlaps bucket storage"); + freeisect(is); + return nil; + } + is->tabsize = is->blockbase - is->tabbase; + /* v: partition size rounded down to a whole number of blocks */ v = is->part->size & ~(u64int)(is->blocksize - 1); + if(is->blockbase + (u64int)is->blocks * is->blocksize != v){ + seterr(ECorrupt, "invalid blocks in index section %s", is->name); + /* ZZZ what to do? + freeisect(is); + return nil; + */ + } + + if(is->stop - is->start > is->blocks){ + seterr(ECorrupt, "index section overflows available space"); + freeisect(is); + return nil; + } + if(is->start > is->stop){ + seterr(ECorrupt, "invalid index section range"); + freeisect(is); + return nil; + } + + return is; +} + +/* Pack the header of is and write it at PartBlank on its partition. Returns 0 on success, -1 on failure. */ int +wbisect(ISect *is) +{ + ZBlock *b; + + b = alloczblock(HeadSize, 1, 0); + if(b == nil){ + /* ZZZ set error? 
*/ + return -1; + } + + if(packisect(is, b->data) < 0){ + seterr(ECorrupt, "can't make index section header: %r"); + freezblock(b); + return -1; + } + if(writepart(is->part, PartBlank, b->data, HeadSize) < 0 || flushpart(is->part) < 0){ + seterr(EAdmin, "can't write index section header: %r"); + freezblock(b); + return -1; + } + freezblock(b); + + return 0; +} + +/* Free an index section; nil is accepted. The partition it refers to is not released here. */ void +freeisect(ISect *is) +{ + if(is == nil) + return; + free(is); +} + +/* Free ix together with its arena map, arena table, section map, the sects array and every ISect in that array; nil is accepted. */ void +freeindex(Index *ix) +{ + int i; + + if(ix == nil) + return; + free(ix->amap); + free(ix->arenas); + if(ix->sects) + for(i = 0; i < ix->nsects; i++) + freeisect(ix->sects[i]); + free(ix->sects); + free(ix->smap); + free(ix); +} + +/* + * write a clump to an available arena in the index + * and return the address of the clump within the index. +ZZZ question: should this distinguish between an arena +filling up and real errors writing the clump? + */ +u64int +writeiclump(Index *ix, Clump *c, u8int *clbuf) +{ + u64int a; + int i; + IAddr ia; + AState as; + + trace(TraceLump, "writeiclump enter"); + qlock(&ix->writing); + /* try arenas from ix->mapalloc onward; TWID64 from writeaclump means move on to the next one */ for(i = ix->mapalloc; i < ix->narenas; i++){ + a = writeaclump(ix->arenas[i], c, clbuf); + if(a != TWID64){ + ix->mapalloc = i; + ia.addr = ix->amap[i].start + a; + ia.type = c->info.type; + ia.size = c->info.uncsize; + /* clump size plus ClumpSize, rounded up to whole units of 1<<ABlockLog bytes */ ia.blocks = (c->info.size + ClumpSize + (1<<ABlockLog) - 1) >> ABlockLog; + as.arena = ix->arenas[i]; + as.aa = ia.addr; + as.stats = as.arena->memstats; + insertscore(c->info.score, &ia, IEDirty, &as); + qunlock(&ix->writing); + trace(TraceLump, "writeiclump exit"); + return ia.addr; + } + } + qunlock(&ix->writing); + + seterr(EAdmin, "no space left in arenas"); + trace(TraceLump, "writeiclump failed"); + return TWID64; +} + +/* + * convert an arena index to a relative arena address + */ +Arena* +amapitoa(Index *ix, u64int a, u64int *aa) +{ + int i, r, l, m; + + /* binary search for the last arena map entry whose start is <= a */ l = 1; + r = ix->narenas - 1; + while(l <= r){ + m = (r + l) / 2; + if(ix->amap[m].start <= a) + l = m + 1; + else + r = m - 1; + } + 
l--; + + if(a > ix->amap[l].stop){ +for(i=0; i<ix->narenas; i++) + print("arena %d: %llux - %llux\n", i, ix->amap[i].start, ix->amap[i].stop); +print("want arena %d for %llux\n", l, a); + seterr(ECrash, "unmapped address passed to amapitoa"); + return nil; + } + + if(ix->arenas[l] == nil){ + seterr(ECrash, "unmapped arena selected in amapitoa"); + return nil; + } + *aa = a - ix->amap[l].start; + return ix->arenas[l]; +} + +/* + * convert an arena index to the bounds of the containing arena group. + */ +Arena* +amapitoag(Index *ix, u64int a, u64int *gstart, u64int *glimit, int *g) +{ + u64int aa; + Arena *arena; + + arena = amapitoa(ix, a, &aa); + if(arena == nil) + return nil; + if(arenatog(arena, aa, gstart, glimit, g) < 0) + return nil; + /* a - aa is the start of the arena in the index address space; shift the bounds arenatog filled in by that amount */ *gstart += a - aa; + *glimit += a - aa; + return arena; +} + +/* Return 0 if the two addresses agree in type, size, blocks and addr, nonzero otherwise; the result carries no ordering. */ int +iaddrcmp(IAddr *ia1, IAddr *ia2) +{ + return ia1->type != ia2->type + || ia1->size != ia2->size + || ia1->blocks != ia2->blocks + || ia1->addr != ia2->addr; +} + +/* + * lookup the score in the partition + * + * nothing needs to be explicitly locked: + * only static parts of ix are used, and + * the bucket is locked by the DBlock lock. 
+ */ +int +loadientry(Index *ix, u8int *score, int type, IEntry *ie) +{ + ISect *is; + DBlock *b; + IBucket ib; + u32int buck; + int h, ok; + + ok = -1; + + trace(TraceLump, "loadientry enter"); + + /* + qlock(&stats.lock); + stats.indexreads++; + qunlock(&stats.lock); + */ + + /* NOTE(review): the filter consulted is mainindex->bloom, not ix->bloom; the only caller visible here passes mainindex - confirm this is intended for other callers */ if(!inbloomfilter(mainindex->bloom, score)){ + trace(TraceLump, "loadientry bloomhit"); + return -1; + } + + trace(TraceLump, "loadientry loadibucket"); + b = loadibucket(ix, score, &is, &buck, &ib); + trace(TraceLump, "loadientry loadedibucket"); + if(b == nil) + return -1; + + if(okibucket(&ib, is) < 0){ + trace(TraceLump, "loadientry badbucket"); + goto out; + } + + /* low bit of h set means found; clearing it gives the byte offset of the entry */ h = bucklook(score, type, ib.data, ib.n); + if(h & 1){ + h ^= 1; + trace(TraceLump, "loadientry found"); + unpackientry(ie, &ib.data[h]); + ok = 0; + goto out; + } + trace(TraceLump, "loadientry notfound"); + /* the filter said the score might be present, but the bucket does not hold it */ addstat(StatBloomFalseMiss, 1); +out: + putdblock(b); + trace(TraceLump, "loadientry exit"); + return ok; +} + +/* Check the entry count of a bucket against the capacity of its section. Returns 0 if it fits, otherwise sets a corruption error and returns -1. */ int +okibucket(IBucket *ib, ISect *is) +{ + if(ib->n <= is->buckmax) + return 0; + + seterr(EICorrupt, "corrupted disk index bucket: n=%ud max=%ud, range=[%lud,%lud)", + ib->n, is->buckmax, is->start, is->stop); + return -1; +} + +/* + * look for score within data; + * return 1 | byte index of matching index, + * or 0 | index of least element > score + */ +int +bucklook(u8int *score, int otype, u8int *data, int n) +{ + int i, r, l, m, h, c, cc, type; + + /* an otype of -1 matches any type; otherwise compare against the on-disk type code */ if(otype == -1) + type = -1; + else + type = vttodisktype(otype); + /* binary search over n packed entries of IEntrySize bytes, ordered by score and then type */ l = 0; + r = n - 1; + while(l <= r){ + m = (r + l) >> 1; + h = m * IEntrySize; + for(i = 0; i < VtScoreSize; i++){ + c = score[i]; + cc = data[h + i]; + if(c != cc){ + if(c > cc) + l = m + 1; + else + r = m - 1; + goto cont; + } + } + cc = data[h + IEntryTypeOff]; + if(type != cc && type != -1){ + if(type > cc) + l = m + 1; + else + r = m - 1; + goto cont; + } + /* the flag lives in bit 0 of the offset - presumably IEntrySize is even; confirm */ return h | 1; + cont:; + } + + return l * IEntrySize; +} + +/* + * compare two IEntries; consistent with bucklook + */ +int 
+ientrycmp(const void *vie1, const void *vie2) +{ + u8int *ie1, *ie2; + int i, v1, v2; + + ie1 = (u8int*)vie1; + ie2 = (u8int*)vie2; + for(i = 0; i < VtScoreSize; i++){ + v1 = ie1[i]; + v2 = ie2[i]; + if(v1 != v2){ + if(v1 < v2) + return -1; + return 1; + } + } + v1 = ie1[IEntryTypeOff]; + v2 = ie2[IEntryTypeOff]; + if(v1 != v2){ + if(v1 < v2) + return -1; + return 1; + } + return 0; +} + +/* + * find the number of the index section holding bucket #buck + */ +int +indexsect0(Index *ix, u32int buck) +{ + int r, l, m; + + l = 1; + r = ix->nsects - 1; + while(l <= r){ + m = (r + l) >> 1; + if(ix->sects[m]->start <= buck) + l = m + 1; + else + r = m - 1; + } + return l - 1; +} + +/* + * load the index block at bucket #buck + */ +static DBlock* +loadibucket0(Index *ix, u32int buck, ISect **pis, u32int *pbuck, IBucket *ib, int mode) +{ + ISect *is; + DBlock *b; + + is = ix->sects[indexsect0(ix, buck)]; + if(buck < is->start || is->stop <= buck){ + seterr(EAdmin, "index lookup out of range: %ud not found in index\n", buck); + return nil; + } + + buck -= is->start; + if((b = getdblock(is->part, is->blockbase + ((u64int)buck << is->blocklog), mode)) == nil) + return nil; + + if(pis) + *pis = is; + if(pbuck) + *pbuck = buck; + if(ib) + unpackibucket(ib, b->data, is->bucketmagic); + return b; +} + +/* + * find the number of the index section holding score + */ +int +indexsect1(Index *ix, u8int *score) +{ + return indexsect0(ix, hashbits(score, 32) / ix->div); +} + +/* + * load the index block responsible for score. 
+ */ +static DBlock* +loadibucket1(Index *ix, u8int *score, ISect **pis, u32int *pbuck, IBucket *ib) +{ + return loadibucket0(ix, hashbits(score, 32)/ix->div, pis, pbuck, ib, OREAD); +} + +int +indexsect(Index *ix, u8int *score) +{ + return indexsect1(ix, score); +} + +DBlock* +loadibucket(Index *ix, u8int *score, ISect **pis, u32int *pbuck, IBucket *ib) +{ + return loadibucket1(ix, score, pis, pbuck, ib); +} + + diff --git a/sys/src/cmd/venti/srv/lump.c b/sys/src/cmd/venti/srv/lump.c new file mode 100755 index 000000000..9b244948b --- /dev/null +++ b/sys/src/cmd/venti/srv/lump.c @@ -0,0 +1,240 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +int syncwrites = 0; +int queuewrites = 0; +int writestodevnull = 0; +int verifywrites = 0; + +static Packet *readilump(Lump *u, IAddr *ia, u8int *score); + +/* + * Some of this logic is duplicated in hdisk.c + */ +Packet* +readlump(u8int *score, int type, u32int size, int *cached) +{ + Lump *u; + Packet *p; + IAddr ia; + u32int n; + + trace(TraceLump, "readlump enter"); +/* + qlock(&stats.lock); + stats.lumpreads++; + qunlock(&stats.lock); +*/ + if(scorecmp(score, zeroscore) == 0) + return packetalloc(); + u = lookuplump(score, type); + if(u->data != nil){ + trace(TraceLump, "readlump lookuplump hit"); + if(cached) + *cached = 1; + n = packetsize(u->data); + if(n > size){ + seterr(EOk, "read too small: asked for %d need at least %d", size, n); + putlump(u); + + return nil; + } + p = packetdup(u->data, 0, n); + putlump(u); + return p; + } + + if(cached) + *cached = 0; + + if(lookupscore(score, type, &ia) < 0){ + /* ZZZ place to check for someone trying to guess scores */ + seterr(EOk, "no block with score %V/%d exists", score, type); + + putlump(u); + return nil; + } + if(ia.size > size){ + seterr(EOk, "read too small 1: asked for %d need at least %d", size, ia.size); + + putlump(u); + return nil; + } + + trace(TraceLump, "readlump readilump"); + p = readilump(u, &ia, score); + putlump(u); + + trace(TraceLump, 
"readlump exit"); + return p; +} + +/* + * save away a lump, and return it's score. + * doesn't store duplicates, but checks that the data is really the same. + */ +int +writelump(Packet *p, u8int *score, int type, u32int creator, uint ms) +{ + Lump *u; + int ok; + +/* + qlock(&stats.lock); + stats.lumpwrites++; + qunlock(&stats.lock); +*/ + + packetsha1(p, score); + if(packetsize(p) == 0 || writestodevnull==1){ + packetfree(p); + return 0; + } + + u = lookuplump(score, type); + if(u->data != nil){ + ok = 0; + if(packetcmp(p, u->data) != 0){ + uchar nscore[VtScoreSize]; + + packetsha1(u->data, nscore); + if(scorecmp(u->score, score) != 0) + seterr(EStrange, "lookuplump returned bad score %V not %V", u->score, score); + else if(scorecmp(u->score, nscore) != 0) + seterr(EStrange, "lookuplump returned bad data %V not %V", nscore, u->score); + else + seterr(EStrange, "score collision %V", score); + ok = -1; + } + packetfree(p); + putlump(u); + return ok; + } + + if(writestodevnull==2){ + packetfree(p); + return 0; + } + + if(queuewrites) + return queuewrite(u, p, creator, ms); + + ok = writeqlump(u, p, creator, ms); + + putlump(u); + return ok; +} + +int +writeqlump(Lump *u, Packet *p, int creator, uint ms) +{ + ZBlock *flat; + Packet *old; + IAddr ia; + int ok; + + if(lookupscore(u->score, u->type, &ia) == 0){ + if(verifywrites == 0){ + /* assume the data is here! 
*/ + packetfree(p); + ms = msec() - ms; + addstat2(StatRpcWriteOld, 1, StatRpcWriteOldTime, ms); + return 0; + } + + /* + * if the read fails, + * assume it was corrupted data and store the block again + */ + old = readilump(u, &ia, u->score); + if(old != nil){ + ok = 0; + if(packetcmp(p, old) != 0){ + uchar nscore[VtScoreSize]; + + packetsha1(old, nscore); + if(scorecmp(u->score, nscore) != 0) + seterr(EStrange, "readilump returned bad data %V not %V", nscore, u->score); + else + seterr(EStrange, "score collision %V", u->score); + ok = -1; + } + packetfree(p); + packetfree(old); + + ms = msec() - ms; + addstat2(StatRpcWriteOld, 1, StatRpcWriteOldTime, ms); + return ok; + } + logerr(EAdmin, "writelump: read %V failed, rewriting: %r\n", u->score); + } + + flat = packet2zblock(p, packetsize(p)); + ok = storeclump(mainindex, flat, u->score, u->type, creator, &ia); + freezblock(flat); + if(ok == 0) + insertlump(u, p); + else + packetfree(p); + + if(syncwrites){ + flushdcache(); + flushicache(); + flushdcache(); + } + + ms = msec() - ms; + addstat2(StatRpcWriteNew, 1, StatRpcWriteNewTime, ms); + return ok; +} + +static Packet* +readilump(Lump *u, IAddr *ia, u8int *score) +{ + Arena *arena; + ZBlock *zb; + Packet *p, *pp; + Clump cl; + u64int aa; + u8int sc[VtScoreSize]; + + trace(TraceLump, "readilump enter"); + arena = amapitoa(mainindex, ia->addr, &aa); + if(arena == nil){ + trace(TraceLump, "readilump amapitoa failed"); + return nil; + } + + trace(TraceLump, "readilump loadclump"); + zb = loadclump(arena, aa, ia->blocks, &cl, sc, paranoid); + if(zb == nil){ + trace(TraceLump, "readilump loadclump failed"); + return nil; + } + + if(ia->size != cl.info.uncsize){ + seterr(EInconsist, "index and clump size mismatch"); + freezblock(zb); + return nil; + } + if(ia->type != cl.info.type){ + seterr(EInconsist, "index and clump type mismatch"); + freezblock(zb); + return nil; + } + if(scorecmp(score, sc) != 0){ + seterr(ECrash, "score mismatch"); + freezblock(zb); + return 
nil; + } + + trace(TraceLump, "readilump success"); + p = zblock2packet(zb, cl.info.uncsize); + freezblock(zb); + pp = packetdup(p, 0, packetsize(p)); + trace(TraceLump, "readilump insertlump"); + insertlump(u, pp); + trace(TraceLump, "readilump exit"); + return p; +} diff --git a/sys/src/cmd/venti/srv/lumpcache.c b/sys/src/cmd/venti/srv/lumpcache.c new file mode 100755 index 000000000..d9a6b954e --- /dev/null +++ b/sys/src/cmd/venti/srv/lumpcache.c @@ -0,0 +1,429 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +/* #define CHECK(x) x */ +#define CHECK(x) + +typedef struct LumpCache LumpCache; + +enum +{ + HashLog = 9, + HashSize = 1<<HashLog, + HashMask = HashSize - 1, +}; + +struct LumpCache +{ + QLock lock; + Rendez full; + Lump *free; /* list of available lumps */ + u32int allowed; /* total allowable space for packets */ + u32int avail; /* remaining space for packets */ + u32int now; /* ticks for usage timestamps */ + Lump **heads; /* hash table for finding address */ + int nheap; /* number of available victims */ + Lump **heap; /* heap for locating victims */ + int nblocks; /* number of blocks allocated */ + Lump *blocks; /* array of block descriptors */ +}; + +static LumpCache lumpcache; + +static void delheap(Lump *db); +static int downheap(int i, Lump *b); +static void fixheap(int i, Lump *b); +static int upheap(int i, Lump *b); +static Lump *bumplump(void); + +void +initlumpcache(u32int size, u32int nblocks) +{ + Lump *last, *b; + int i; + + lumpcache.full.l = &lumpcache.lock; + lumpcache.nblocks = nblocks; + lumpcache.allowed = size; + lumpcache.avail = size; + lumpcache.heads = MKNZ(Lump*, HashSize); + lumpcache.heap = MKNZ(Lump*, nblocks); + lumpcache.blocks = MKNZ(Lump, nblocks); + setstat(StatLcacheSize, lumpcache.nblocks); + + last = nil; + for(i = 0; i < nblocks; i++){ + b = &lumpcache.blocks[i]; + b->type = TWID8; + b->heap = TWID32; + b->next = last; + last = b; + } + lumpcache.free = last; + lumpcache.nheap = 0; +} + +Lump* 
+lookuplump(u8int *score, int type) +{ + uint ms; + Lump *b; + u32int h; + + ms = 0; + trace(TraceLump, "lookuplump enter"); + + h = hashbits(score, HashLog); + + /* + * look for the block in the cache + */ + qlock(&lumpcache.lock); + CHECK(checklumpcache()); +again: + for(b = lumpcache.heads[h]; b != nil; b = b->next){ + if(scorecmp(score, b->score)==0 && type == b->type){ + addstat(StatLcacheHit, 1); + trace(TraceLump, "lookuplump hit"); + goto found; + } + } + + trace(TraceLump, "lookuplump miss"); + + /* + * missed: locate the block with the oldest second to last use. + * remove it from the heap, and fix up the heap. + */ + while(lumpcache.free == nil){ + trace(TraceLump, "lookuplump bump"); + CHECK(checklumpcache()); + if(bumplump() == nil){ + CHECK(checklumpcache()); + logerr(EAdmin, "all lump cache blocks in use"); + addstat(StatLcacheStall, 1); + CHECK(checklumpcache()); + rsleep(&lumpcache.full); + CHECK(checklumpcache()); + addstat(StatLcacheStall, -1); + goto again; + } + CHECK(checklumpcache()); + } + + /* start timer on cache miss to avoid system call on cache hit */ + ms = msec(); + + addstat(StatLcacheMiss, 1); + b = lumpcache.free; + lumpcache.free = b->next; + + /* + * the new block has no last use, so assume it happens sometime in the middle +ZZZ this is not reasonable + */ + b->used = (b->used2 + lumpcache.now) / 2; + + /* + * rechain the block on the correct hash chain + */ + b->next = lumpcache.heads[h]; + lumpcache.heads[h] = b; + if(b->next != nil) + b->next->prev = b; + b->prev = nil; + + scorecp(b->score, score); + b->type = type; + b->size = 0; + b->data = nil; + +found: + b->ref++; + b->used2 = b->used; + b->used = lumpcache.now++; + if(b->heap != TWID32) + fixheap(b->heap, b); + CHECK(checklumpcache()); + qunlock(&lumpcache.lock); + + + addstat(StatLumpStall, 1); + qlock(&b->lock); + addstat(StatLumpStall, -1); + + trace(TraceLump, "lookuplump exit"); + addstat2(StatLcacheRead, 1, StatLcacheReadTime, ms ? 
msec()-ms : 0); + return b; +} + +void +insertlump(Lump *b, Packet *p) +{ + u32int size; + + /* + * look for the block in the cache + */ + trace(TraceLump, "insertlump enter"); + qlock(&lumpcache.lock); + CHECK(checklumpcache()); +again: + + addstat(StatLcacheWrite, 1); + + /* + * missed: locate the block with the oldest second to last use. + * remove it from the heap, and fix up the heap. + */ + size = packetasize(p); + while(lumpcache.avail < size){ + trace(TraceLump, "insertlump bump"); + CHECK(checklumpcache()); + if(bumplump() == nil){ + logerr(EAdmin, "all lump cache blocks in use"); + addstat(StatLcacheStall, 1); + CHECK(checklumpcache()); + rsleep(&lumpcache.full); + CHECK(checklumpcache()); + addstat(StatLcacheStall, -1); + goto again; + } + CHECK(checklumpcache()); + } + b->data = p; + b->size = size; + lumpcache.avail -= size; + CHECK(checklumpcache()); + qunlock(&lumpcache.lock); + trace(TraceLump, "insertlump exit"); +} + +void +putlump(Lump *b) +{ + if(b == nil) + return; + + trace(TraceLump, "putlump"); + qunlock(&b->lock); + qlock(&lumpcache.lock); + CHECK(checklumpcache()); + if(--b->ref == 0){ + if(b->heap == TWID32) + upheap(lumpcache.nheap++, b); + trace(TraceLump, "putlump wakeup"); + rwakeupall(&lumpcache.full); + } + CHECK(checklumpcache()); + qunlock(&lumpcache.lock); +} + +/* + * remove some lump from use and update the free list and counters + */ +static Lump* +bumplump(void) +{ + Lump *b; + u32int h; + + /* + * remove blocks until we find one that is unused + * referenced blocks are left in the heap even though + * they can't be scavenged; this is simple a speed optimization + */ + CHECK(checklumpcache()); + for(;;){ + if(lumpcache.nheap == 0){ + trace(TraceLump, "bumplump emptyheap"); + return nil; + } + b = lumpcache.heap[0]; + delheap(b); + if(!b->ref){ + trace(TraceLump, "bumplump wakeup"); + rwakeupall(&lumpcache.full); + break; + } + } + + /* + * unchain the block + */ + trace(TraceLump, "bumplump unchain"); + if(b->prev == nil){ + 
h = hashbits(b->score, HashLog); + if(lumpcache.heads[h] != b) + sysfatal("bad hash chains in lump cache"); + lumpcache.heads[h] = b->next; + }else + b->prev->next = b->next; + if(b->next != nil) + b->next->prev = b->prev; + + if(b->data != nil){ + packetfree(b->data); + b->data = nil; + lumpcache.avail += b->size; + b->size = 0; + } + b->type = TWID8; + + b->next = lumpcache.free; + lumpcache.free = b; + + CHECK(checklumpcache()); + trace(TraceLump, "bumplump exit"); + return b; +} + +void +emptylumpcache(void) +{ + qlock(&lumpcache.lock); + while(bumplump()) + ; + qunlock(&lumpcache.lock); +} + +/* + * delete an arbitrary block from the heap + */ +static void +delheap(Lump *db) +{ + fixheap(db->heap, lumpcache.heap[--lumpcache.nheap]); + db->heap = TWID32; +} + +/* + * push an element up or down to it's correct new location + */ +static void +fixheap(int i, Lump *b) +{ + if(upheap(i, b) == i) + downheap(i, b); +} + +static int +upheap(int i, Lump *b) +{ + Lump *bb; + u32int now; + int p; + + now = lumpcache.now; + for(; i != 0; i = p){ + p = (i - 1) >> 1; + bb = lumpcache.heap[p]; + if(b->used2 - now >= bb->used2 - now) + break; + lumpcache.heap[i] = bb; + bb->heap = i; + } + + lumpcache.heap[i] = b; + b->heap = i; + return i; +} + +static int +downheap(int i, Lump *b) +{ + Lump *bb; + u32int now; + int k; + + now = lumpcache.now; + for(; ; i = k){ + k = (i << 1) + 1; + if(k >= lumpcache.nheap) + break; + if(k + 1 < lumpcache.nheap && lumpcache.heap[k]->used2 - now > lumpcache.heap[k + 1]->used2 - now) + k++; + bb = lumpcache.heap[k]; + if(b->used2 - now <= bb->used2 - now) + break; + lumpcache.heap[i] = bb; + bb->heap = i; + } + + lumpcache.heap[i] = b; + b->heap = i; + return i; +} + +static void +findblock(Lump *bb) +{ + Lump *b, *last; + int h; + + last = nil; + h = hashbits(bb->score, HashLog); + for(b = lumpcache.heads[h]; b != nil; b = b->next){ + if(last != b->prev) + sysfatal("bad prev link"); + if(b == bb) + return; + last = b; + } + sysfatal("block 
score=%V type=%#x missing from hash table", bb->score, bb->type); +} + +void +checklumpcache(void) +{ + Lump *b; + u32int size, now, nfree; + int i, k, refed; + + now = lumpcache.now; + for(i = 0; i < lumpcache.nheap; i++){ + if(lumpcache.heap[i]->heap != i) + sysfatal("lc: mis-heaped at %d: %d", i, lumpcache.heap[i]->heap); + if(i > 0 && lumpcache.heap[(i - 1) >> 1]->used2 - now > lumpcache.heap[i]->used2 - now) + sysfatal("lc: bad heap ordering"); + k = (i << 1) + 1; + if(k < lumpcache.nheap && lumpcache.heap[i]->used2 - now > lumpcache.heap[k]->used2 - now) + sysfatal("lc: bad heap ordering"); + k++; + if(k < lumpcache.nheap && lumpcache.heap[i]->used2 - now > lumpcache.heap[k]->used2 - now) + sysfatal("lc: bad heap ordering"); + } + + refed = 0; + size = 0; + for(i = 0; i < lumpcache.nblocks; i++){ + b = &lumpcache.blocks[i]; + if(b->data == nil && b->size != 0) + sysfatal("bad size: %d data=%p", b->size, b->data); + if(b->ref && b->heap == TWID32) + refed++; + if(b->type != TWID8){ + findblock(b); + size += b->size; + } + if(b->heap != TWID32 + && lumpcache.heap[b->heap] != b) + sysfatal("lc: spurious heap value"); + } + if(lumpcache.avail != lumpcache.allowed - size){ + fprint(2, "mismatched available=%d and allowed=%d - used=%d space", lumpcache.avail, lumpcache.allowed, size); + *(int*)0=0; + } + + nfree = 0; + for(b = lumpcache.free; b != nil; b = b->next){ + if(b->type != TWID8 || b->heap != TWID32) + sysfatal("lc: bad free list"); + nfree++; + } + + if(lumpcache.nheap + nfree + refed != lumpcache.nblocks) + sysfatal("lc: missing blocks: %d %d %d %d", lumpcache.nheap, refed, nfree, lumpcache.nblocks); +} + diff --git a/sys/src/cmd/venti/srv/lumpqueue.c b/sys/src/cmd/venti/srv/lumpqueue.c new file mode 100755 index 000000000..869eaeae0 --- /dev/null +++ b/sys/src/cmd/venti/srv/lumpqueue.c @@ -0,0 +1,171 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +typedef struct LumpQueue LumpQueue; +typedef struct WLump WLump; + +enum +{ + MaxLumpQ = 1 
<< 3 /* max. lumps on a single write queue, must be pow 2 */ +}; + +struct WLump +{ + Lump *u; + Packet *p; + int creator; + int gen; + uint ms; +}; + +struct LumpQueue +{ + QLock lock; + Rendez flush; + Rendez full; + Rendez empty; + WLump q[MaxLumpQ]; + int w; + int r; +}; + +static LumpQueue *lumpqs; +static int nqs; + +static QLock glk; +static int gen; + +static void queueproc(void *vq); + +int +initlumpqueues(int nq) +{ + LumpQueue *q; + + int i; + nqs = nq; + + lumpqs = MKNZ(LumpQueue, nq); + + for(i = 0; i < nq; i++){ + q = &lumpqs[i]; + q->full.l = &q->lock; + q->empty.l = &q->lock; + q->flush.l = &q->lock; + + if(vtproc(queueproc, q) < 0){ + seterr(EOk, "can't start write queue slave: %r"); + return -1; + } + } + + return 0; +} + +/* + * queue a lump & it's packet data for writing + */ +int +queuewrite(Lump *u, Packet *p, int creator, uint ms) +{ + LumpQueue *q; + int i; + + trace(TraceProc, "queuewrite"); + i = indexsect(mainindex, u->score); + if(i < 0 || i >= nqs){ + seterr(EBug, "internal error: illegal index section in queuewrite"); + return -1; + } + + q = &lumpqs[i]; + + qlock(&q->lock); + while(q->r == ((q->w + 1) & (MaxLumpQ - 1))){ + trace(TraceProc, "queuewrite sleep"); + rsleep(&q->full); + } + + q->q[q->w].u = u; + q->q[q->w].p = p; + q->q[q->w].creator = creator; + q->q[q->w].ms = ms; + q->q[q->w].gen = gen; + q->w = (q->w + 1) & (MaxLumpQ - 1); + + trace(TraceProc, "queuewrite wakeup"); + rwakeup(&q->empty); + + qunlock(&q->lock); + + return 0; +} + +void +flushqueue(void) +{ + int i; + LumpQueue *q; + + if(!lumpqs) + return; + + trace(TraceProc, "flushqueue"); + + qlock(&glk); + gen++; + qunlock(&glk); + + for(i=0; i<mainindex->nsects; i++){ + q = &lumpqs[i]; + qlock(&q->lock); + while(q->w != q->r && gen - q->q[q->r].gen > 0){ + trace(TraceProc, "flushqueue sleep q%d", i); + rsleep(&q->flush); + } + qunlock(&q->lock); + } +} + +static void +queueproc(void *vq) +{ + LumpQueue *q; + Lump *u; + Packet *p; + int creator; + uint ms; + + 
threadsetname("queueproc"); + + q = vq; + for(;;){ + qlock(&q->lock); + while(q->w == q->r){ + trace(TraceProc, "queueproc sleep empty"); + rsleep(&q->empty); + } + + u = q->q[q->r].u; + p = q->q[q->r].p; + creator = q->q[q->r].creator; + ms = q->q[q->r].ms; + + q->r = (q->r + 1) & (MaxLumpQ - 1); + trace(TraceProc, "queueproc wakeup flush"); + rwakeupall(&q->flush); + + trace(TraceProc, "queueproc wakeup full"); + rwakeup(&q->full); + + qunlock(&q->lock); + + trace(TraceProc, "queueproc writelump %V", u->score); + if(writeqlump(u, p, creator, ms) < 0) + fprint(2, "failed to write lump for %V: %r", u->score); + trace(TraceProc, "queueproc wrotelump %V", u->score); + + putlump(u); + } +} diff --git a/sys/src/cmd/venti/srv/mirrorarenas.c b/sys/src/cmd/venti/srv/mirrorarenas.c new file mode 100755 index 000000000..8b72f1a51 --- /dev/null +++ b/sys/src/cmd/venti/srv/mirrorarenas.c @@ -0,0 +1,523 @@ +/* + * Mirror one arena partition onto another. + * Be careful to copy only new data. + */ + +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +Channel *writechan; + +typedef struct Write Write; +struct Write +{ + uchar *p; + int n; + uvlong o; + int error; +}; + +Part *src; +Part *dst; +int force; +int verbose; +int dosha1 = 1; +char *status; +uvlong astart, aend; + +void +usage(void) +{ + fprint(2, "usage: mirrorarenas [-sv] src dst [ranges]\n"); + threadexitsall("usage"); +} + +char *tagged; + +void +tag(char *fmt, ...) +{ + va_list arg; + + if(tagged){ + free(tagged); + tagged = nil; + } + va_start(arg, fmt); + tagged = vsmprint(fmt, arg); + va_end(arg); +} + +void +chat(char *fmt, ...) 
+{ + va_list arg; + + if(tagged){ + write(1, tagged, strlen(tagged)); + free(tagged); + tagged = nil; + } + va_start(arg, fmt); + vfprint(1, fmt, arg); + va_end(arg); +} + +#pragma varargck argpos tag 1 +#pragma varargck argpos chat 1 + + +int +ereadpart(Part *p, u64int offset, u8int *buf, u32int count) +{ + if(readpart(p, offset, buf, count) != count){ + chat("%T readpart %s at %#llux+%ud: %r\n", p->name, offset, count); + return -1; + } + return 0; +} + +int +ewritepart(Part *p, u64int offset, u8int *buf, u32int count) +{ + if(writepart(p, offset, buf, count) != count || flushpart(p) < 0){ + chat("%T writepart %s at %#llux+%ud: %r\n", p->name, offset, count); + return -1; + } + return 0; +} + +/* + * Extra proc to do writes to dst, so that we can overlap reading + * src with writing dst during copy. This is an easy factor of two + * (almost) in performance. + */ +static Write wsync; +static void +writeproc(void *v) +{ + Write *w; + + USED(v); + while((w = recvp(writechan)) != nil){ + if(w == &wsync) + continue; + if(ewritepart(dst, w->o, w->p, w->n) < 0) + w->error = 1; + } +} + +int +copy(uvlong start, uvlong end, char *what, DigestState *ds) +{ + int i, n; + uvlong o; + static uchar tmp[2][1024*1024]; + Write w[2]; + + assert(start <= end); + assert(astart <= start && start < aend); + assert(astart <= end && end <= aend); + + if(verbose && start != end) + chat("%T copy %,llud-%,llud %s\n", start, end, what); + + i = 0; + memset(w, 0, sizeof w); + for(o=start; o<end; o+=n){ + if(w[i].error) + goto error; + n = sizeof tmp[i]; + if(o+n > end) + n = end - o; + if(ereadpart(src, o, tmp[i], n) < 0) + goto error; + w[i].p = tmp[i]; + w[i].o = o; + w[i].n = n; + w[i].error = 0; + sendp(writechan, &w[i]); + if(ds) + sha1(tmp[i], n, nil, ds); + i = 1-i; + } + if(w[i].error) + goto error; + + /* + * wait for queued write to finish + */ + sendp(writechan, &wsync); + i = 1-i; + if(w[i].error) + return -1; + return 0; + +error: + /* + * sync with write proc + */ + w[i].p = 
nil; + w[i].o = 0; + w[i].n = 0; + w[i].error = 0; + sendp(writechan, &w[i]); + return -1; +} + +/* single-threaded, for reference */ +int +copy1(uvlong start, uvlong end, char *what, DigestState *ds) +{ + int n; + uvlong o; + static uchar tmp[1024*1024]; + + assert(start <= end); + assert(astart <= start && start < aend); + assert(astart <= end && end <= aend); + + if(verbose && start != end) + chat("%T copy %,llud-%,llud %s\n", start, end, what); + + for(o=start; o<end; o+=n){ + n = sizeof tmp; + if(o+n > end) + n = end - o; + if(ereadpart(src, o, tmp, n) < 0) + return -1; + if(ds) + sha1(tmp, n, nil, ds); + if(ewritepart(dst, o, tmp, n) < 0) + return -1; + } + return 0; +} + +int +asha1(Part *p, uvlong start, uvlong end, DigestState *ds) +{ + int n; + uvlong o; + static uchar tmp[1024*1024]; + + if(start == end) + return 0; + assert(start < end); + + if(verbose) + chat("%T sha1 %,llud-%,llud\n", start, end); + + for(o=start; o<end; o+=n){ + n = sizeof tmp; + if(o+n > end) + n = end - o; + if(ereadpart(p, o, tmp, n) < 0) + return -1; + sha1(tmp, n, nil, ds); + } + return 0; +} + +uvlong +rdown(uvlong a, int b) +{ + return a-a%b; +} + +uvlong +rup(uvlong a, int b) +{ + if(a%b == 0) + return a; + return a+b-a%b; +} + +void +mirror(Arena *sa, Arena *da) +{ + vlong v, si, di, end; + int clumpmax, blocksize, sealed; + static uchar buf[MaxIoSize]; + ArenaHead h; + DigestState xds, *ds; + vlong shaoff, base; + + base = sa->base; + blocksize = sa->blocksize; + end = sa->base + sa->size; + + astart = base - blocksize; + aend = end + blocksize; + + tag("%T %s (%,llud-%,llud)\n", sa->name, astart, aend); + + if(force){ + copy(astart, aend, "all", nil); + return; + } + + if(sa->diskstats.sealed && da->diskstats.sealed && scorecmp(da->score, zeroscore) != 0){ + if(scorecmp(sa->score, da->score) == 0){ + if(verbose) + chat("%T %s: %V sealed mirrored\n", sa->name, sa->score); + return; + } + chat("%T %s: warning: sealed score mismatch %V vs %V\n", sa->name, sa->score, 
da->score); + /* Keep executing; will correct seal if possible. */ + } + if(!sa->diskstats.sealed && da->diskstats.sealed && scorecmp(da->score, zeroscore) != 0){ + chat("%T %s: dst is sealed, src is not\n", sa->name); + status = "errors"; + return; + } + if(sa->diskstats.used < da->diskstats.used){ + chat("%T %s: src used %,lld < dst used %,lld\n", sa->name, sa->diskstats.used, da->diskstats.used); + status = "errors"; + return; + } + + if(da->clumpmagic != sa->clumpmagic){ + /* + * Write this now to reduce the window in which + * the head and tail disagree about clumpmagic. + */ + da->clumpmagic = sa->clumpmagic; + memset(buf, 0, sizeof buf); + packarena(da, buf); + if(ewritepart(dst, end, buf, blocksize) < 0) + return; + } + + memset(&h, 0, sizeof h); + h.version = da->version; + strcpy(h.name, da->name); + h.blocksize = da->blocksize; + h.size = da->size + 2*da->blocksize; + h.clumpmagic = da->clumpmagic; + memset(buf, 0, sizeof buf); + packarenahead(&h, buf); + if(ewritepart(dst, base - blocksize, buf, blocksize) < 0) + return; + + shaoff = 0; + ds = nil; + sealed = sa->diskstats.sealed && scorecmp(sa->score, zeroscore) != 0; + if(sealed && dosha1){ + /* start sha1 state with header */ + memset(&xds, 0, sizeof xds); + ds = &xds; + sha1(buf, blocksize, nil, ds); + shaoff = base; + } + + if(sa->diskstats.used != da->diskstats.used){ + di = base+rdown(da->diskstats.used, blocksize); + si = base+rup(sa->diskstats.used, blocksize); + if(ds && asha1(dst, shaoff, di, ds) < 0) + return; + if(copy(di, si, "data", ds) < 0) + return; + shaoff = si; + } + + clumpmax = sa->clumpmax; + di = end - da->diskstats.clumps/clumpmax * blocksize; + si = end - (sa->diskstats.clumps+clumpmax-1)/clumpmax * blocksize; + + if(sa->diskstats.sealed){ + /* + * might be a small hole between the end of the + * data and the beginning of the directory. 
+ */ + v = base+rup(sa->diskstats.used, blocksize); + if(ds && asha1(dst, shaoff, v, ds) < 0) + return; + if(copy(v, si, "hole", ds) < 0) + return; + shaoff = si; + } + + if(da->diskstats.clumps != sa->diskstats.clumps){ + if(ds && asha1(dst, shaoff, si, ds) < 0) + return; + if(copy(si, di, "directory", ds) < 0) /* si < di because clumpinfo blocks grow down */ + return; + shaoff = di; + } + + da->ctime = sa->ctime; + da->wtime = sa->wtime; + da->diskstats = sa->diskstats; + da->diskstats.sealed = 0; + + /* + * Repack the arena tail information + * and save it for next time... + */ + memset(buf, 0, sizeof buf); + packarena(da, buf); + if(ewritepart(dst, end, buf, blocksize) < 0) + return; + + if(sealed){ + /* + * ... but on the final pass, copy the encoding + * of the tail information from the source + * arena itself. There are multiple possible + * ways to write the tail info out (the exact + * details have changed as venti went through + * revisions), and to keep the SHA1 hash the + * same, we have to use what the disk uses. 
+ */ + if(asha1(dst, shaoff, end, ds) < 0 + || copy(end, end+blocksize-VtScoreSize, "tail", ds) < 0) + return; + if(dosha1){ + memset(buf, 0, VtScoreSize); + sha1(buf, VtScoreSize, da->score, ds); + if(scorecmp(sa->score, da->score) == 0){ + if(verbose) + chat("%T %s: %V sealed mirrored\n", sa->name, sa->score); + if(ewritepart(dst, end+blocksize-VtScoreSize, da->score, VtScoreSize) < 0) + return; + }else{ + chat("%T %s: sealing dst: score mismatch: %V vs %V\n", sa->name, sa->score, da->score); + memset(&xds, 0, sizeof xds); + asha1(dst, base-blocksize, end+blocksize-VtScoreSize, &xds); + sha1(buf, VtScoreSize, 0, &xds); + chat("%T reseal: %V\n", da->score); + status = "errors"; + } + }else{ + if(verbose) + chat("%T %s: %V mirrored\n", sa->name, sa->score); + if(ewritepart(dst, end+blocksize-VtScoreSize, sa->score, VtScoreSize) < 0) + return; + } + }else{ + chat("%T %s: %,lld used mirrored\n", + sa->name, sa->diskstats.used); + } +} + +void +mirrormany(ArenaPart *sp, ArenaPart *dp, char *range) +{ + int i, lo, hi; + char *s, *t; + Arena *sa, *da; + + if(range == nil){ + for(i=0; i<sp->narenas; i++){ + sa = sp->arenas[i]; + da = dp->arenas[i]; + mirror(sa, da); + } + return; + } + if(strcmp(range, "none") == 0) + return; + + for(s=range; *s; s=t){ + t = strchr(s, ','); + if(t) + *t++ = 0; + else + t = s+strlen(s); + if(*s == '-') + lo = 0; + else + lo = strtol(s, &s, 0); + hi = lo; + if(*s == '-'){ + s++; + if(*s == 0) + hi = sp->narenas-1; + else + hi = strtol(s, &s, 0); + } + if(*s != 0){ + chat("%T bad arena range: %s\n", s); + continue; + } + for(i=lo; i<=hi; i++){ + sa = sp->arenas[i]; + da = dp->arenas[i]; + mirror(sa, da); + } + } +} + + +void +threadmain(int argc, char **argv) +{ + int i; + Arena *sa, *da; + ArenaPart *s, *d; + char *ranges; + + ventifmtinstall(); + + ARGBEGIN{ + case 'F': + force = 1; + break; + case 'v': + verbose++; + break; + case 's': + dosha1 = 0; + break; + default: + usage(); + }ARGEND + + if(argc != 2 && argc != 3) + usage(); + 
ranges = nil; + if(argc == 3) + ranges = argv[2]; + + if((src = initpart(argv[0], OREAD)) == nil) + sysfatal("initpart %s: %r", argv[0]); + if((dst = initpart(argv[1], ORDWR)) == nil) + sysfatal("initpart %s: %r", argv[1]); + if((s = initarenapart(src)) == nil) + sysfatal("initarenapart %s: %r", argv[0]); + for(i=0; i<s->narenas; i++) + delarena(s->arenas[i]); + if((d = initarenapart(dst)) == nil) + sysfatal("loadarenapart %s: %r", argv[1]); + for(i=0; i<d->narenas; i++) + delarena(d->arenas[i]); + + /* + * The arena geometries must match or all bets are off. + */ + if(s->narenas != d->narenas) + sysfatal("arena count mismatch: %d vs %d", s->narenas, d->narenas); + for(i=0; i<s->narenas; i++){ + sa = s->arenas[i]; + da = d->arenas[i]; + if(sa->version != da->version) + sysfatal("arena %d: version mismatch: %d vs %d", i, sa->version, da->version); + if(sa->blocksize != da->blocksize) + sysfatal("arena %d: blocksize mismatch: %d vs %d", i, sa->blocksize, da->blocksize); + if(sa->size != da->size) + sysfatal("arena %d: size mismatch: %,lld vs %,lld", i, sa->size, da->size); + if(strcmp(sa->name, da->name) != 0) + sysfatal("arena %d: name mismatch: %s vs %s", i, sa->name, da->name); + } + + /* + * Mirror one arena at a time. 
+ */ + writechan = chancreate(sizeof(void*), 0); + vtproc(writeproc, nil); + mirrormany(s, d, ranges); + sendp(writechan, nil); + threadexitsall(status); +} diff --git a/sys/src/cmd/venti/srv/mkfile b/sys/src/cmd/venti/srv/mkfile new file mode 100755 index 000000000..947710ea0 --- /dev/null +++ b/sys/src/cmd/venti/srv/mkfile @@ -0,0 +1,101 @@ +</$objtype/mkfile + +LIBOFILES=\ + arena.$O\ + arenas.$O\ + bloom.$O\ + buildbuck.$O\ + clump.$O\ + config.$O\ + conv.$O\ + dcache.$O\ + disksched.$O\ + dump.$O\ + graph.$O\ + hdisk.$O\ + hproc.$O\ + httpd.$O\ + icache.$O\ + icachewrite.$O\ + ifile.$O\ + index.$O\ + lump.$O\ + lumpcache.$O\ + lumpqueue.$O\ + part.$O\ + png.$O\ + round.$O\ + score.$O\ + sortientry.$O\ + stats.$O\ + syncarena.$O\ + syncindex0.$O\ + trace.$O\ + unwhack.$O\ + utils.$O\ + unittoull.$O\ + whack.$O\ + xml.$O\ + zblock.$O\ + zeropart.$O\ + +SLIB=libvs.a$O + +LIB=$SLIB # /$objtype/lib/libventi.a + +HFILES= dat.h\ + fns.h\ + stdinc.h\ + /sys/include/venti.h\ + /sys/include/httpd.h\ + +TARG=\ + venti\ + buildindex\ + checkarenas\ + checkindex\ + clumpstats\ + conf\ + findscore\ + fixarenas\ + fmtarenas\ + fmtbloom\ + fmtindex\ + fmtisect\ + mirrorarenas\ + printarena\ + printarenapart\ + rdarena\ + syncindex\ + verifyarena\ + wrarena\ + +OFILES= + +BIN=/$objtype/bin/venti + +it:V: $O.venti + +CLEANFILES=$CLEANFILES $SLIB + +</sys/src/cmd/mkmany + +CFLAGS=$CFLAGS -I. + +$SLIB: $LIBOFILES + ar rvc $SLIB $LIBOFILES + +# xml.c:D: mkxml dat.h +# ./mkxml dat.h > xml.c + +acid:D: lumpcache.acid + cat $prereq >$target + +$O.conf:D: conf.rc + { + echo '#!/bin/rc' + echo '# THIS FILE IS AUTOMATICALLY GENERATED' + echo '# FROM /sys/src/cmd/venti/conf.rc. DO NOT EDIT.' 
+ echo + sed 1d conf.rc + } >$target && chmod +x $target diff --git a/sys/src/cmd/venti/srv/part.c b/sys/src/cmd/venti/srv/part.c new file mode 100755 index 000000000..9f112cf62 --- /dev/null +++ b/sys/src/cmd/venti/srv/part.c @@ -0,0 +1,249 @@ +#include "stdinc.h" +#include <ctype.h> +#include "dat.h" +#include "fns.h" + +u32int maxblocksize; +int readonly; + +static int +strtoullsuf(char *p, char **pp, int rad, u64int *u) +{ + u64int v; + + if(!isdigit((uchar)*p)) + return -1; + v = strtoull(p, &p, rad); + switch(*p){ + case 'k': + case 'K': + v *= 1024; + p++; + break; + case 'm': + case 'M': + v *= 1024*1024; + p++; + break; + case 'g': + case 'G': + v *= 1024*1024*1024; + p++; + break; + case 't': + case 'T': + v *= 1024*1024; + v *= 1024*1024; + p++; + break; + } + *pp = p; + *u = v; + return 0; +} + +static int +parsepart(char *name, char **file, u64int *lo, u64int *hi) +{ + char *p; + + *file = estrdup(name); + if((p = strrchr(*file, ':')) == nil){ + *lo = 0; + *hi = 0; + return 0; + } + *p++ = 0; + if(*p == '-') + *lo = 0; + else{ + if(strtoullsuf(p, &p, 0, lo) < 0){ + free(*file); + return -1; + } + } + if(*p == '-') + p++; + if(*p == 0){ + *hi = 0; + return 0; + } + if(strtoullsuf(p, &p, 0, hi) < 0 || *p != 0){ + free(*file); + return -1; + } + return 0; +} + +Part* +initpart(char *name, int mode) +{ + Part *part; + Dir *dir; + char *file; + u64int lo, hi; + + if(parsepart(name, &file, &lo, &hi) < 0) + return nil; + trace(TraceDisk, "initpart %s file %s lo 0x%llx hi 0x%llx", name, file, lo, hi); + part = MKZ(Part); + part->name = estrdup(name); + part->filename = estrdup(file); + if(readonly){ + mode &= ~(OREAD|OWRITE|ORDWR); + mode |= OREAD; + } + part->fd = open(file, mode); + if(part->fd < 0){ + if((mode&(OREAD|OWRITE|ORDWR)) == ORDWR) + part->fd = open(file, (mode&~ORDWR)|OREAD); + if(part->fd < 0){ + freepart(part); + fprint(2, "can't open partition='%s': %r\n", file); + seterr(EOk, "can't open partition='%s': %r", file); + fprint(2, "%r\n"); + 
free(file); + return nil; + } + fprint(2, "warning: %s opened for reading only\n", name); + } + part->offset = lo; + dir = dirfstat(part->fd); + if(dir == nil){ + freepart(part); + seterr(EOk, "can't stat partition='%s': %r", file); + free(file); + return nil; + } + if(dir->length == 0){ + free(dir); + freepart(part); + seterr(EOk, "can't determine size of partition %s", file); + free(file); + return nil; + } + if(dir->length < hi || dir->length < lo){ + freepart(part); + seterr(EOk, "partition '%s': bounds out of range (max %lld)", name, dir->length); + free(dir); + free(file); + return nil; + } + if(hi == 0) + hi = dir->length; + part->size = hi - part->offset; + free(dir); + return part; +} + +int +flushpart(Part *part) +{ + USED(part); + return 0; +} + +void +freepart(Part *part) +{ + if(part == nil) + return; + if(part->fd >= 0) + close(part->fd); + free(part->name); + free(part); +} + +void +partblocksize(Part *part, u32int blocksize) +{ + if(part->blocksize) + sysfatal("resetting partition=%s's block size", part->name); + part->blocksize = blocksize; + if(blocksize > maxblocksize) + maxblocksize = blocksize; +} + +enum { + Maxxfer = 64*1024, /* for NCR SCSI controllers; was 128K */ +}; + +static int reopen(Part*); + +int +rwpart(Part *part, int isread, u64int offset0, u8int *buf0, u32int count0) +{ + u32int count, opsize; + int n; + u8int *buf; + u64int offset; + + trace(TraceDisk, "%s %s %ud at 0x%llx", + isread ? "read" : "write", part->name, count0, offset0); + if(offset0 >= part->size || offset0+count0 > part->size){ + seterr(EStrange, "out of bounds %s offset 0x%llux count %ud to partition %s size 0x%llux", + isread ? 
"read" : "write", offset0, count0, part->name, + part->size); + return -1; + } + + buf = buf0; + count = count0; + offset = offset0; + while(count > 0){ + opsize = count; + if(opsize > Maxxfer) + opsize = Maxxfer; + if(isread) + n = pread(part->fd, buf, opsize, offset); + else + n = pwrite(part->fd, buf, opsize, offset); + if(n <= 0){ + seterr(EAdmin, "%s %s offset 0x%llux count %ud buf %p returned %d: %r", + isread ? "read" : "write", part->filename, offset, opsize, buf, n); + return -1; + } + offset += n; + count -= n; + buf += n; + } + + return count0; +} + +int +readpart(Part *part, u64int offset, u8int *buf, u32int count) +{ + return rwpart(part, 1, offset, buf, count); +} + +int +writepart(Part *part, u64int offset, u8int *buf, u32int count) +{ + return rwpart(part, 0, offset, buf, count); +} + +ZBlock* +readfile(char *name) +{ + Part *p; + ZBlock *b; + + p = initpart(name, OREAD); + if(p == nil) + return nil; + b = alloczblock(p->size, 0, p->blocksize); + if(b == nil){ + seterr(EOk, "can't alloc %s: %r", name); + freepart(p); + return nil; + } + if(readpart(p, 0, b->data, p->size) < 0){ + seterr(EOk, "can't read %s: %r", name); + freepart(p); + freezblock(b); + return nil; + } + freepart(p); + return b; +} diff --git a/sys/src/cmd/venti/srv/png.c b/sys/src/cmd/venti/srv/png.c new file mode 100755 index 000000000..81ab14c0c --- /dev/null +++ b/sys/src/cmd/venti/srv/png.c @@ -0,0 +1,239 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +enum +{ + IDATSIZE = 20000, + FilterNone = 0 +}; + +typedef struct ZlibR ZlibR; +typedef struct ZlibW ZlibW; + +struct ZlibR +{ + uchar *data; + int width; + int dx; + int dy; + int x; + int y; + int pixwid; +}; + +struct ZlibW +{ + Hio *io; + uchar *buf; + uchar *b; + uchar *e; +}; + +static ulong *crctab; +static uchar PNGmagic[] = { 137, 'P', 'N', 'G', '\r', '\n', 26, '\n'}; + +static void +put4(uchar *a, ulong v) +{ + a[0] = v>>24; + a[1] = v>>16; + a[2] = v>>8; + a[3] = v; +} + +static void +chunk(Hio *io, char 
*type, uchar *d, int n) +{ + uchar buf[4]; + ulong crc = 0; + + if(strlen(type) != 4) + return; + put4(buf, n); + hwrite(io, buf, 4); + hwrite(io, type, 4); + hwrite(io, d, n); + crc = blockcrc(crctab, crc, type, 4); + crc = blockcrc(crctab, crc, d, n); + put4(buf, crc); + hwrite(io, buf, 4); +} + +static int +zread(void *va, void *buf, int n) +{ + int a, i, pixels, pixwid; + uchar *b, *e, *img; + ZlibR *z; + + z = va; + pixwid = z->pixwid; + b = buf; + e = b+n; + while(b+pixwid <= e){ + if(z->y >= z->dy) + break; + if(z->x == 0) + *b++ = FilterNone; + pixels = (e-b)/pixwid; + if(pixels > z->dx - z->x) + pixels = z->dx - z->x; + img = z->data + z->width*z->y + pixwid*z->x; + memmove(b, img, pixwid*pixels); + if(pixwid == 4){ + /* + * Convert to non-premultiplied alpha. + */ + for(i=0; i<pixels; i++, b+=4){ + a = b[3]; + if(a != 0 && a != 255){ + if(b[0] >= a) + b[0] = a; + b[0] = (b[0]*255)/a; + if(b[1] >= a) + b[1] = a; + b[1] = (b[1]*255)/a; + if(b[2] >= a) + b[2] = a; + b[2] = (b[2]*255)/a; + } + } + }else + b += pixwid*pixels; + + z->x += pixels; + if(z->x >= z->dx){ + z->x = 0; + z->y++; + } + } + return b - (uchar*)buf; +} + +static void +IDAT(ZlibW *z) +{ + chunk(z->io, "IDAT", z->buf, z->b - z->buf); + z->b = z->buf; +} + +static int +zwrite(void *va, void *buf, int n) +{ + int m; + uchar *b, *e; + ZlibW *z; + + z = va; + b = buf; + e = b+n; + + while(b < e){ + m = z->e - z->b; + if(m > e - b) + m = e - b; + memmove(z->b, b, m); + z->b += m; + b += m; + if(z->b >= z->e) + IDAT(z); + } + return n; +} + +static Memimage* +memRGBA(Memimage *i) +{ + Memimage *ni; + char buf[32]; + ulong dst; + + /* + * [A]BGR because we want R,G,B,[A] in big-endian order. Sigh. 
+ */ + chantostr(buf, i->chan); + if(strchr(buf, 'a')) + dst = ABGR32; + else + dst = BGR24; + + if(i->chan == dst) + return i; + + qlock(&memdrawlock); + ni = allocmemimage(i->r, dst); + if(ni) + memimagedraw(ni, ni->r, i, i->r.min, nil, i->r.min, S); + qunlock(&memdrawlock); + return ni; +} + +int +writepng(Hio *io, Memimage *m) +{ + static int first = 1; + static QLock lk; + uchar buf[200], *h; + Memimage *rgb; + ZlibR zr; + ZlibW zw; + + if(first){ + qlock(&lk); + if(first){ + deflateinit(); + crctab = mkcrctab(0xedb88320); + first = 0; + } + qunlock(&lk); + } + + rgb = memRGBA(m); + if(rgb == nil) + return -1; + + hwrite(io, PNGmagic, sizeof PNGmagic); + + /* IHDR chunk */ + h = buf; + put4(h, Dx(m->r)); h += 4; + put4(h, Dy(m->r)); h += 4; + *h++ = 8; /* 8 bits per channel */ + if(rgb->chan == BGR24) + *h++ = 2; /* RGB */ + else + *h++ = 6; /* RGBA */ + *h++ = 0; /* compression - deflate */ + *h++ = 0; /* filter - none */ + *h++ = 0; /* interlace - none */ + chunk(io, "IHDR", buf, h-buf); + + /* image data */ + zr.dx = Dx(m->r); + zr.dy = Dy(m->r); + zr.width = rgb->width * sizeof(ulong); + zr.data = rgb->data->bdata; + zr.x = 0; + zr.y = 0; + zr.pixwid = chantodepth(rgb->chan)/8; + zw.io = io; + zw.buf = vtmalloc(IDATSIZE); + zw.b = zw.buf; + zw.e = zw.b + IDATSIZE; + if(deflatezlib(&zw, zwrite, &zr, zread, 6, 0) < 0){ + free(zw.buf); + return -1; + } + if(zw.b > zw.buf) + IDAT(&zw); + free(zw.buf); + chunk(io, "IEND", nil, 0); + + if(m != rgb){ + qlock(&memdrawlock); + freememimage(rgb); + qunlock(&memdrawlock); + } + return 0; +} diff --git a/sys/src/cmd/venti/srv/printarena.c b/sys/src/cmd/venti/srv/printarena.c new file mode 100755 index 000000000..399385caf --- /dev/null +++ b/sys/src/cmd/venti/srv/printarena.c @@ -0,0 +1,126 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +void +usage(void) +{ + fprint(2, "usage: printarena arenafile [offset]\n"); + threadexitsall("usage"); +} + +static void +rdarena(Arena *arena, u64int offset) +{ + 
u64int a, aa, e; + u32int magic; + Clump cl; + uchar score[VtScoreSize]; + ZBlock *lump; + + printarena(2, arena); + + a = arena->base; + e = arena->base + arena->size; + if(offset != ~(u64int)0) { + if(offset >= e-a) + sysfatal("bad offset %llud >= %llud", + offset, e-a); + aa = offset; + } else + aa = 0; + + for(; aa < e; aa += ClumpSize+cl.info.size) { + magic = clumpmagic(arena, aa); + if(magic == ClumpFreeMagic) + break; + if(magic != arena->clumpmagic) { + fprint(2, "illegal clump magic number %#8.8ux offset %llud\n", + magic, aa); + break; + } + lump = loadclump(arena, aa, 0, &cl, score, 0); + if(lump == nil) { + fprint(2, "clump %llud failed to read: %r\n", aa); + break; + } + if(cl.info.type != VtCorruptType) { + scoremem(score, lump->data, cl.info.uncsize); + if(scorecmp(cl.info.score, score) != 0) { + fprint(2, "clump %llud has mismatched score\n", aa); + break; + } + if(vttypevalid(cl.info.type) < 0) { + fprint(2, "clump %llud has bad type %d\n", aa, cl.info.type); + break; + } + } + print("%22llud %V %3d %5d\n", aa, score, cl.info.type, cl.info.uncsize); + freezblock(lump); + } + print("end offset %llud\n", aa); +} + +void +threadmain(int argc, char *argv[]) +{ + char *file; + Arena *arena; + u64int offset, aoffset; + Part *part; + static uchar buf[8192]; + ArenaHead head; + + readonly = 1; /* for part.c */ + aoffset = 0; + ARGBEGIN{ + case 'o': + aoffset = strtoull(EARGF(usage()), 0, 0); + break; + default: + usage(); + break; + }ARGEND + + offset = ~(u64int)0; + switch(argc) { + default: + usage(); + case 2: + offset = strtoull(argv[1], 0, 0); + /* fall through */ + case 1: + file = argv[0]; + } + + + ventifmtinstall(); + statsinit(); + + part = initpart(file, OREAD|ODIRECT); + if(part == nil) + sysfatal("can't open file %s: %r", file); + if(readpart(part, aoffset, buf, sizeof buf) < 0) + sysfatal("can't read file %s: %r", file); + + if(unpackarenahead(&head, buf) < 0) + sysfatal("corrupted arena header: %r"); + + print("# arena head version=%d 
name=%.*s blocksize=%d size=%lld clumpmagic=0x%.8ux\n", + head.version, ANameSize, head.name, head.blocksize, + head.size, head.clumpmagic); + + if(aoffset+head.size > part->size) + sysfatal("arena is truncated: want %llud bytes have %llud", + head.size, part->size); + + partblocksize(part, head.blocksize); + initdcache(8 * MaxDiskBlock); + + arena = initarena(part, aoffset, head.size, head.blocksize); + if(arena == nil) + sysfatal("initarena: %r"); + + rdarena(arena, offset); + threadexitsall(0); +} diff --git a/sys/src/cmd/venti/srv/printarenapart.c b/sys/src/cmd/venti/srv/printarenapart.c new file mode 100755 index 000000000..5367d9669 --- /dev/null +++ b/sys/src/cmd/venti/srv/printarenapart.c @@ -0,0 +1,155 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +uchar buf[64*1024]; + +void +usage(void) +{ + fprint(2, "usage: printarenapart arenafile [offset]\n"); + threadexitsall("usage"); +} + +static void +rdarena(Arena *arena, u64int offset) +{ + u64int a, aa, e; + u32int magic; + Clump cl; + uchar score[VtScoreSize]; + ZBlock *lump; + + printarena(2, arena); + + a = arena->base; + e = arena->base + arena->size; + if(offset != ~(u64int)0) { + if(offset >= e-a) + sysfatal("bad offset %llud >= %llud", + offset, e-a); + aa = offset; + } else + aa = 0; + + for(; aa < e; aa += ClumpSize+cl.info.size) { + magic = clumpmagic(arena, aa); + if(magic == ClumpFreeMagic) + break; + if(magic != arena->clumpmagic) { + fprint(2, "illegal clump magic number %#8.8ux offset %llud\n", + magic, aa); + break; + } + lump = loadclump(arena, aa, 0, &cl, score, 0); + if(lump == nil) { + fprint(2, "clump %llud failed to read: %r\n", aa); + break; + } + if(cl.info.type != VtCorruptType) { + scoremem(score, lump->data, cl.info.uncsize); + if(scorecmp(cl.info.score, score) != 0) { + fprint(2, "clump %llud has mismatched score\n", aa); + break; + } + if(vttypevalid(cl.info.type) < 0) { + fprint(2, "clump %llud has bad type %d\n", aa, cl.info.type); + break; + } + } + 
print("%22llud %V %3d %5d\n", aa, score, cl.info.type, cl.info.uncsize); + freezblock(lump); + } + print("end offset %llud\n", aa); +} + +void +threadmain(int argc, char *argv[]) +{ + char *file, *p, *name; + char *table; + u64int offset; + Part *part; + ArenaPart ap; + ArenaHead head; + Arena tail; + char ct[40], mt[40]; + + readonly = 1; /* for part.c */ + ARGBEGIN{ + default: + usage(); + break; + }ARGEND + + switch(argc) { + default: + usage(); + case 1: + file = argv[0]; + } + + ventifmtinstall(); + statsinit(); + + part = initpart(file, OREAD|ODIRECT); + if(part == nil) + sysfatal("can't open file %s: %r", file); + if(readpart(part, PartBlank, buf, sizeof buf) < 0) + sysfatal("can't read file %s: %r", file); + + if(unpackarenapart(&ap, buf) < 0) + sysfatal("corrupted arena part header: %r"); + + print("# arena part version=%d blocksize=%d arenabase=%d\n", + ap.version, ap.blocksize, ap.arenabase); + ap.tabbase = (PartBlank+HeadSize+ap.blocksize-1)&~(ap.blocksize-1); + ap.tabsize = ap.arenabase - ap.tabbase; + + table = malloc(ap.tabsize+1); + if(readpart(part, ap.tabbase, (uchar*)table, ap.tabsize) < 0) + sysfatal("read %s: %r", file); + table[ap.tabsize] = 0; + + partblocksize(part, ap.blocksize); + initdcache(8 * MaxDiskBlock); + + for(p=table; p && *p; p=strchr(p, '\n')){ + if(*p == '\n') + p++; + name = p; + p = strpbrk(p, " \t"); + if(p == nil){ + fprint(2, "bad line: %s\n", name); + break; + } + offset = strtoull(p, nil, 0); + if(readpart(part, offset, buf, sizeof buf) < 0){ + fprint(2, "%s: read %s: %r\n", argv0, file); + continue; + } + if(unpackarenahead(&head, buf) < 0){ + fprint(2, "%s: unpackarenahead: %r\n", argv0); + continue; + } + if(readpart(part, offset+head.size-head.blocksize, buf, head.blocksize) < 0){ + fprint(2, "%s: read %s: %r\n", argv0, file); + continue; + } + if(unpackarena(&tail, buf) < 0){ + fprint(2, "%s: unpackarena: %r\n", argv0); + continue; + } + print("arena %s %lld clumps=%,d cclumps=%,d used=%,lld uncsize=%,lld%s\n", + 
tail.name, offset, + tail.diskstats.clumps, tail.diskstats.cclumps, + tail.diskstats.used, tail.diskstats.uncsize, + tail.diskstats.sealed ? " sealed" : ""); + strcpy(ct, ctime(tail.ctime)); + ct[28] = 0; + strcpy(mt, ctime(tail.wtime)); + mt[28] = 0; + print("\tctime=%s\n\tmtime=%s\n", ct, mt); + } + threadexitsall(0); +} diff --git a/sys/src/cmd/venti/srv/printarenas.c b/sys/src/cmd/venti/srv/printarenas.c new file mode 100755 index 000000000..111db0187 --- /dev/null +++ b/sys/src/cmd/venti/srv/printarenas.c @@ -0,0 +1,113 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" +#include <bio.h> + +Biobuf bout; + +static void +pie(IEntry *ie) +{ + Bprint(&bout, "%22lld %V %3d %5d\n", + ie->ia.addr, ie->score, ie->ia.type, ie->ia.size); +} + +void +usage(void) +{ + fprint(2, "usage: printarenas [-B blockcachesize] config [arenaname...]\n"); + threadexitsall(0); +} + +Config conf; + +int +shoulddump(char *name, int argc, char **argv) +{ + int i; + + if(argc == 0) + return 1; + for(i=0; i<argc; i++) + if(strcmp(name, argv[i]) == 0) + return 1; + return 0; +} + +enum +{ + ClumpChunks = 32*1024, +}; + +void +dumparena(Arena *arena, u64int a) +{ + IEntry ie; + ClumpInfo *ci, *cis; + u32int clump; + int i, n, nskip; + + cis = MKN(ClumpInfo, ClumpChunks); + nskip = 0; + memset(&ie, 0, sizeof(IEntry)); + for(clump = 0; clump < arena->memstats.clumps; clump += n){ + n = ClumpChunks; + if(n > arena->memstats.clumps - clump) + n = arena->memstats.clumps - clump; + if(readclumpinfos(arena, clump, cis, n) != n){ + fprint(2, "arena directory read failed: %r\n"); + break; + } + + for(i = 0; i < n; i++){ + ci = &cis[i]; + ie.ia.type = ci->type; + ie.ia.size = ci->uncsize; + ie.ia.addr = a; + a += ci->size + ClumpSize; + ie.ia.blocks = (ci->size + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog; + scorecp(ie.score, ci->score); + pie(&ie); + } + } + free(cis); +} + +void +threadmain(int argc, char *argv[]) +{ + int i; + Index *ix; + u32int bcmem; + + bcmem = 0; + ARGBEGIN{ + 
case 'B': + bcmem = unittoull(ARGF()); + break; + default: + usage(); + break; + }ARGEND + + if(argc < 1) + usage(); + + ventifmtinstall(); + + if(initventi(argv[0], &conf) < 0) + sysfatal("can't init venti: %r"); + + if(bcmem < maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16)) + bcmem = maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16); + if(0) fprint(2, "initialize %d bytes of disk block cache\n", bcmem); + initdcache(bcmem); + + Binit(&bout, 1, OWRITE); + ix = mainindex; + for(i=0; i<ix->narenas; i++) + if(shoulddump(ix->arenas[i]->name, argc-1, argv+1)) + dumparena(ix->arenas[i], ix->amap[i].start); + Bterm(&bout); + threadexitsall(0); +} diff --git a/sys/src/cmd/venti/srv/printindex.c b/sys/src/cmd/venti/srv/printindex.c new file mode 100755 index 000000000..edbcf7934 --- /dev/null +++ b/sys/src/cmd/venti/srv/printindex.c @@ -0,0 +1,99 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" +#include <bio.h> + +Biobuf bout; + +static void +pie(IEntry *ie) +{ + Bprint(&bout, "%22lld %V %3d %5d\n", + ie->ia.addr, ie->score, ie->ia.type, ie->ia.size); +} + +void +usage(void) +{ + fprint(2, "usage: printindex [-B blockcachesize] config [isectname...]\n"); + threadexitsall(0); +} + +Config conf; + +int +shoulddump(char *name, int argc, char **argv) +{ + int i; + + if(argc == 0) + return 1; + for(i=0; i<argc; i++) + if(strcmp(name, argv[i]) == 0) + return 1; + return 0; +} + +void +dumpisect(ISect *is) +{ + int j; + uchar *buf; + u32int i; + u64int off; + IBucket ib; + IEntry ie; + + buf = emalloc(is->blocksize); + for(i=0; i<is->blocks; i++){ + off = is->blockbase+(u64int)is->blocksize*i; + if(readpart(is->part, off, buf, is->blocksize) < 0) + fprint(2, "read %s at 0x%llux: %r\n", is->part->name, off); + else{ + unpackibucket(&ib, buf, is->bucketmagic); + for(j=0; j<ib.n; j++){ + unpackientry(&ie, &ib.data[j*IEntrySize]); + pie(&ie); + } + } + } +} + +void +threadmain(int argc, char *argv[]) +{ + int i; + Index *ix; + u32int 
bcmem; + + bcmem = 0; + ARGBEGIN{ + case 'B': + bcmem = unittoull(ARGF()); + break; + default: + usage(); + break; + }ARGEND + + if(argc < 1) + usage(); + + fmtinstall('H', encodefmt); + + if(initventi(argv[0], &conf) < 0) + sysfatal("can't init venti: %r"); + + if(bcmem < maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16)) + bcmem = maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16); + if(0) fprint(2, "initialize %d bytes of disk block cache\n", bcmem); + initdcache(bcmem); + + ix = mainindex; + Binit(&bout, 1, OWRITE); + for(i=0; i<ix->nsects; i++) + if(shoulddump(ix->sects[i]->name, argc-1, argv+1)) + dumpisect(ix->sects[i]); + Bterm(&bout); + threadexitsall(0); +} diff --git a/sys/src/cmd/venti/srv/printmap.c b/sys/src/cmd/venti/srv/printmap.c new file mode 100755 index 000000000..f3392ef81 --- /dev/null +++ b/sys/src/cmd/venti/srv/printmap.c @@ -0,0 +1,42 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +void +usage(void) +{ + fprint(2, "usage: printmap [-B blockcachesize] config\n"); + threadexitsall("usage"); +} + +Config conf; + +void +threadmain(int argc, char *argv[]) +{ + u32int bcmem; + int fix; + + fix = 0; + bcmem = 0; + ARGBEGIN{ + case 'B': + bcmem = unittoull(ARGF()); + break; + default: + usage(); + break; + }ARGEND + + if(!fix) + readonly = 1; + + if(argc != 1) + usage(); + + if(initventi(argv[0], &conf) < 0) + sysfatal("can't init venti: %r"); + + printindex(1, mainindex); + threadexitsall(0); +} diff --git a/sys/src/cmd/venti/srv/rdarena.c b/sys/src/cmd/venti/srv/rdarena.c new file mode 100755 index 000000000..0ccc1d96a --- /dev/null +++ b/sys/src/cmd/venti/srv/rdarena.c @@ -0,0 +1,96 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +static int verbose, quiet; + +void +usage(void) +{ + fprint(2, "usage: rdarena [-v] arenapart arena\n"); + threadexitsall(0); +} + +static void +rdarena(Arena *arena) +{ + ZBlock *b; + u64int a, e; + u32int bs; + + if (!quiet) { + fprint(2, "copying %s to 
standard output\n", arena->name); + printarena(2, arena); + } + + bs = MaxIoSize; + if(bs < arena->blocksize) + bs = arena->blocksize; + + b = alloczblock(bs, 0, arena->blocksize); + e = arena->base + arena->size + arena->blocksize; + for(a = arena->base - arena->blocksize; a + arena->blocksize <= e; a += bs){ + if(a + bs > e) + bs = arena->blocksize; + if(readpart(arena->part, a, b->data, bs) < 0) + fprint(2, "can't copy %s, read at %lld failed: %r\n", arena->name, a); + if(write(1, b->data, bs) != bs) + sysfatal("can't copy %s, write at %lld failed: %r", arena->name, a); + } + + freezblock(b); +} + +void +threadmain(int argc, char *argv[]) +{ + ArenaPart *ap; + Part *part; + char *file, *aname; + int i; + + ventifmtinstall(); + statsinit(); + + ARGBEGIN{ + case 'q': + quiet++; + break; + case 'v': + verbose++; + break; + default: + usage(); + break; + }ARGEND + + readonly = 1; + + if(argc != 2) + usage(); + + file = argv[0]; + aname = argv[1]; + + part = initpart(file, OREAD|ODIRECT); + if(part == nil) + sysfatal("can't open partition %s: %r", file); + + ap = initarenapart(part); + if(ap == nil) + sysfatal("can't initialize arena partition in %s: %r", file); + + if(verbose) + printarenapart(2, ap); + + initdcache(8 * MaxDiskBlock); + + for(i = 0; i < ap->narenas; i++){ + if(strcmp(ap->arenas[i]->name, aname) == 0){ + rdarena(ap->arenas[i]); + threadexitsall(0); + } + } + + sysfatal("couldn't find arena %s", aname); +} diff --git a/sys/src/cmd/venti/srv/readifile.c b/sys/src/cmd/venti/srv/readifile.c new file mode 100755 index 000000000..a822a9878 --- /dev/null +++ b/sys/src/cmd/venti/srv/readifile.c @@ -0,0 +1,29 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +void +usage(void) +{ + fprint(2, "usage: readifile file\n"); + threadexitsall("usage"); +} + +void +threadmain(int argc, char *argv[]) +{ + IFile ifile; + + ARGBEGIN{ + default: + usage(); + }ARGEND + + if(argc != 1) + usage(); + + if(readifile(&ifile, argv[0]) < 0) + sysfatal("readifile %s: 
%r", argv[0]); + write(1, ifile.b->data, ifile.b->len); + threadexitsall(nil); +} diff --git a/sys/src/cmd/venti/srv/reseal.c b/sys/src/cmd/venti/srv/reseal.c new file mode 100755 index 000000000..f7353122e --- /dev/null +++ b/sys/src/cmd/venti/srv/reseal.c @@ -0,0 +1,303 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +static uchar *data; +static uchar *data1; +static int blocksize; +static int sleepms; +static int fd; +static int force; +static vlong offset0; + +void +usage(void) +{ + fprint(2, "usage: reseal [-f] [-b blocksize] [-s ms] arenapart1 [name...]]\n"); + threadexitsall(0); +} + +static int +pwriteblock(uchar *buf, int n, vlong off) +{ + int nr, m; + + for(nr = 0; nr < n; nr += m){ + m = n - nr; + m = pwrite(fd, &buf[nr], m, offset0+off+nr); + if(m <= 0) + return -1; + } + return 0; +} + +static int +preadblock(uchar *buf, int n, vlong off) +{ + int nr, m; + + for(nr = 0; nr < n; nr += m){ + m = n - nr; + m = pread(fd, &buf[nr], m, offset0+off+nr); + if(m <= 0){ + if(m == 0) + werrstr("early eof"); + return -1; + } + } + return 0; +} + +static int +loadheader(char *name, ArenaHead *head, Arena *arena, vlong off) +{ + if(preadblock(data, head->blocksize, off + head->size - head->blocksize) < 0){ + fprint(2, "%s: reading arena tail: %r\n", name); + return -1; + } + + memset(arena, 0, sizeof *arena); + if(unpackarena(arena, data) < 0){ + fprint(2, "%s: unpack arena tail: %r\n", name); + return -1; + } + arena->blocksize = head->blocksize; + arena->base = off + head->blocksize; + arena->clumpmax = arena->blocksize / ClumpInfoSize; + arena->size = head->size - 2*head->blocksize; + + if(arena->diskstats.sealed) + scorecp(arena->score, data + head->blocksize - VtScoreSize); + return 0; +} + +uchar zero[VtScoreSize]; + +static int +verify(Arena *arena, void *data, uchar *newscore) +{ + vlong e, bs, n, o; + DigestState ds, ds1; + uchar score[VtScoreSize]; + + /* + * now we know how much to read + * read everything but the last block, which is 
special + */ + e = arena->size + arena->blocksize; + o = arena->base - arena->blocksize; + bs = arena->blocksize; + memset(&ds, 0, sizeof ds); + for(n = 0; n < e; n += bs){ + if(preadblock(data, bs, o + n) < 0){ + werrstr("read: %r"); + return -1; + } + if(n + bs > e) + bs = e - n; + sha1(data, bs, nil, &ds); + } + + /* last block */ + if(preadblock(data, arena->blocksize, o + e) < 0){ + werrstr("read: %r"); + return -1; + } + ds1 = ds; + sha1(data, bs - VtScoreSize, nil, &ds); + sha1(zero, VtScoreSize, score, &ds); + if(scorecmp(score, arena->score) != 0){ + if(!force){ + werrstr("score mismatch: %V != %V", score, arena->score); + return -1; + } + fprint(2, "warning: score mismatch %V != %V\n", score, arena->score); + } + + /* prepare new last block */ + memset(data, 0, arena->blocksize); + packarena(arena, data); + sha1(data, bs, newscore, &ds1); + scorecp((uchar*)data + arena->blocksize - VtScoreSize, newscore); + + return 0; +} + +static void +resealarena(char *name, vlong len) +{ + ArenaHead head; + Arena arena; + DigestState s; + u64int off; + uchar newscore[VtScoreSize]; + + fprint(2, "%s: begin reseal\n", name); + + memset(&s, 0, sizeof s); + + off = seek(fd, 0, 1); + + /* + * read a little bit, which will include the header + */ + if(preadblock(data, HeadSize, off) < 0){ + fprint(2, "%s: reading header: %r\n", name); + return; + } + if(unpackarenahead(&head, data) < 0){ + fprint(2, "%s: corrupt arena header: %r\n", name); + return; + } + if(head.version != ArenaVersion4 && head.version != ArenaVersion5) + fprint(2, "%s: warning: unknown arena version %d\n", name, head.version); + if(len != 0 && len != head.size) + fprint(2, "%s: warning: unexpected length %lld != %lld\n", name, head.size, len); + if(strcmp(name, "<stdin>") != 0 && strcmp(head.name, name) != 0) + fprint(2, "%s: warning: unexpected name %s\n", name, head.name); + + if(loadheader(name, &head, &arena, off) < 0) + return; + + if(!arena.diskstats.sealed){ + fprint(2, "%s: not sealed\n", name); + 
return; + } + + if(verify(&arena, data, newscore) < 0){ + fprint(2, "%s: failed to verify before reseal: %r\n", name); + return; + } + + if(pwriteblock(data, arena.blocksize, arena.base + arena.size) < 0){ + fprint(2, "%s: writing new tail: %r\n", name); + return; + } + scorecp(arena.score, newscore); + fprint(2, "%s: resealed: %V\n", name, newscore); + + if(verify(&arena, data, newscore) < 0){ + fprint(2, "%s: failed to verify after reseal!: %r\n", name); + return; + } + + fprint(2, "%s: verified: %V\n", name, newscore); +} + +static int +shouldcheck(char *name, char **s, int n) +{ + int i; + + if(n == 0) + return 1; + + for(i=0; i<n; i++){ + if(s[i] && strcmp(name, s[i]) == 0){ + s[i] = nil; + return 1; + } + } + return 0; +} + +char * +readap(ArenaPart *ap) +{ + char *table; + + if(preadblock(data, 8192, PartBlank) < 0) + sysfatal("read arena part header: %r"); + if(unpackarenapart(ap, data) < 0) + sysfatal("corrupted arena part header: %r"); + fprint(2, "# arena part version=%d blocksize=%d arenabase=%d\n", + ap->version, ap->blocksize, ap->arenabase); + ap->tabbase = (PartBlank+HeadSize+ap->blocksize-1)&~(ap->blocksize-1); + ap->tabsize = ap->arenabase - ap->tabbase; + table = malloc(ap->tabsize+1); + if(preadblock((uchar*)table, ap->tabsize, ap->tabbase) < 0) + sysfatal("reading arena part directory: %r"); + table[ap->tabsize] = 0; + return table; +} + +void +threadmain(int argc, char *argv[]) +{ + int i, nline; + char *p, *q, *table, *f[10], line[256]; + vlong start, stop; + ArenaPart ap; + Part *part; + + ventifmtinstall(); + blocksize = MaxIoSize; + ARGBEGIN{ + case 'b': + blocksize = unittoull(EARGF(usage())); + break; + case 'f': + force = 1; + break; + case 's': + sleepms = atoi(EARGF(usage())); + break; + default: + usage(); + break; + }ARGEND + + if(argc < 2) + usage(); + + data = vtmalloc(blocksize); + if((part = initpart(argv[0], ORDWR)) == nil) + sysfatal("open partition %s: %r", argv[0]); + fd = part->fd; + offset0 = part->offset; + + table = 
readap(&ap); + + nline = atoi(table); + p = strchr(table, '\n'); + if(p) + p++; + for(i=0; i<nline; i++){ + if(p == nil){ + fprint(2, "warning: unexpected arena table end\n"); + break; + } + q = strchr(p, '\n'); + if(q) + *q++ = 0; + if(strlen(p) >= sizeof line){ + fprint(2, "warning: long arena table line: %s\n", p); + p = q; + continue; + } + strcpy(line, p); + memset(f, 0, sizeof f); + if(tokenize(line, f, nelem(f)) < 3){ + fprint(2, "warning: bad arena table line: %s\n", p); + p = q; + continue; + } + p = q; + if(shouldcheck(f[0], argv+1, argc-1)){ + start = strtoull(f[1], 0, 0); + stop = strtoull(f[2], 0, 0); + if(stop <= start){ + fprint(2, "%s: bad start,stop %lld,%lld\n", f[0], stop, start); + continue; + } + if(seek(fd, start, 0) < 0) + fprint(2, "%s: seek to start: %r\n", f[0]); + resealarena(f[0], stop - start); + } + } + for(i=2; i<argc; i++) + if(argv[i] != 0) + fprint(2, "%s: did not find arena\n", argv[i]); + + threadexitsall(nil); +} diff --git a/sys/src/cmd/venti/srv/round.c b/sys/src/cmd/venti/srv/round.c new file mode 100755 index 000000000..bbf4a478a --- /dev/null +++ b/sys/src/cmd/venti/srv/round.c @@ -0,0 +1,102 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +void +waitforkick(Round *r) +{ + int n; + + qlock(&r->lock); + r->last = r->current; + assert(r->current+1 == r->next); + rwakeupall(&r->finish); + while(!r->doanother) + rsleep(&r->start); + n = r->next++; + r->current = n; + r->doanother = 0; + qunlock(&r->lock); +} + +static void +_kickround(Round *r, int wait) +{ + int n; + + if(!r->doanother) + trace(TraceProc, "kick %s", r->name); + r->doanother = 1; + rwakeup(&r->start); + if(wait){ + n = r->next; + while((int)(n - r->last) > 0){ + r->doanother = 1; + rwakeup(&r->start); + rsleep(&r->finish); + } + } +} + +void +kickround(Round *r, int wait) +{ + qlock(&r->lock); + _kickround(r, wait); + qunlock(&r->lock); +} + +void +initround(Round *r, char *name, int delay) +{ + memset(r, 0, sizeof *r); + r->name = name; + 
r->start.l = &r->lock; + r->finish.l = &r->lock; + r->delaywait.l = &r->lock; + r->last = 0; + r->current = 0; + r->next = 1; + r->doanother = 0; + r->delaytime = delay; +} + +void +delaykickround(Round *r) +{ + qlock(&r->lock); + r->delaykick = 1; + rwakeup(&r->delaywait); + qunlock(&r->lock); +} + +void +delaykickroundproc(void *v) +{ + Round *r = v; + int n; + + threadsetname("delaykickproc %s", r->name); + qlock(&r->lock); + for(;;){ + while(r->delaykick == 0){ + trace(TraceProc, "sleep"); + rsleep(&r->delaywait); + } + + n = r->next; + qunlock(&r->lock); + + trace(TraceProc, "waitround 0x%ux", (uint)n); + sleep(r->delaytime); + + qlock(&r->lock); + if(n == r->next){ + trace(TraceProc, "kickround 0x%ux", (uint)n); + _kickround(r, 1); + } + + trace(TraceProc, "finishround 0x%ux", (uint)n); + } +} + diff --git a/sys/src/cmd/venti/srv/score.c b/sys/src/cmd/venti/srv/score.c new file mode 100755 index 000000000..f150fd78e --- /dev/null +++ b/sys/src/cmd/venti/srv/score.c @@ -0,0 +1,46 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +u8int zeroscore[VtScoreSize]; + +/* Call this function to force linking of score.o for zeroscore on OS X */ +void needzeroscore(void) { } + +void +scoremem(u8int *score, u8int *buf, int n) +{ + DigestState s; + + memset(&s, 0, sizeof s); + sha1(buf, n, score, &s); +} + +static int +hexv(int c) +{ + if(c >= '0' && c <= '9') + return c - '0'; + if(c >= 'a' && c <= 'f') + return c - 'a' + 10; + if(c >= 'A' && c <= 'F') + return c - 'A' + 10; + return -1; +} + +int +strscore(char *s, u8int *score) +{ + int i, c, d; + + for(i = 0; i < VtScoreSize; i++){ + c = hexv(s[2 * i]); + if(c < 0) + return -1; + d = hexv(s[2 * i + 1]); + if(d < 0) + return -1; + score[i] = (c << 4) + d; + } + return s[2 * i] == '\0'; +} diff --git a/sys/src/cmd/venti/srv/sortientry.c b/sys/src/cmd/venti/srv/sortientry.c new file mode 100755 index 000000000..b8b8e876c --- /dev/null +++ b/sys/src/cmd/venti/srv/sortientry.c @@ -0,0 +1,365 @@ +#include 
"stdinc.h" +#include "dat.h" +#include "fns.h" +#include <bio.h> + +typedef struct IEBuck IEBuck; +typedef struct IEBucks IEBucks; + +enum +{ + ClumpChunks = 32*1024 +}; + +struct IEBuck +{ + u32int head; /* head of chain of chunks on the disk */ + u32int used; /* usage of the last chunk */ + u64int total; /* total number of bytes in this bucket */ + u8int *buf; /* chunk of entries for this bucket */ +}; + +struct IEBucks +{ + Part *part; + u64int off; /* offset for writing data in the partition */ + u32int chunks; /* total chunks written to fd */ + u64int max; /* max bytes entered in any one bucket */ + int bits; /* number of bits in initial bucket sort */ + int nbucks; /* 1 << bits, the number of buckets */ + u32int size; /* bytes in each of the buckets chunks */ + u32int usable; /* amount usable for IEntry data */ + u8int *buf; /* buffer for all chunks */ + u8int *xbuf; + IEBuck *bucks; +}; + +#define U32GET(p) (((p)[0]<<24)|((p)[1]<<16)|((p)[2]<<8)|(p)[3]) +#define U32PUT(p,v) (p)[0]=(v)>>24;(p)[1]=(v)>>16;(p)[2]=(v)>>8;(p)[3]=(v) + +static IEBucks *initiebucks(Part *part, int bits, u32int size); +static int flushiebuck(IEBucks *ib, int b, int reset); +static int flushiebucks(IEBucks *ib); +static u32int sortiebuck(IEBucks *ib, int b); +static u64int sortiebucks(IEBucks *ib); +static int sprayientry(IEBucks *ib, IEntry *ie); +static u32int readarenainfo(IEBucks *ib, Arena *arena, u64int a, Bloom *b); +static u32int readiebuck(IEBucks *ib, int b); +static void freeiebucks(IEBucks *ib); + +/* + * build a sorted file with all IEntries which should be in ix. + * assumes the arenas' directories are up to date. + * reads each, converts the entries to index entries, + * and sorts them. 
+ */ +u64int +sortrawientries(Index *ix, Part *tmp, u64int *base, Bloom *bloom) +{ + IEBucks *ib; + u64int clumps, sorted; + u32int n; + int i, ok; + +/* ZZZ should allow configuration of bits, bucket size */ + ib = initiebucks(tmp, 8, 64*1024); + if(ib == nil){ + seterr(EOk, "can't create sorting buckets: %r"); + return TWID64; + } + ok = 0; + clumps = 0; + fprint(2, "constructing entry list\n"); + for(i = 0; i < ix->narenas; i++){ + n = readarenainfo(ib, ix->arenas[i], ix->amap[i].start, bloom); + if(n == TWID32){ + ok = -1; + break; + } + clumps += n; + } + fprint(2, "sorting %lld entries\n", clumps); + if(ok == 0){ + sorted = sortiebucks(ib); + *base = (u64int)ib->chunks * ib->size; + if(sorted != clumps){ + fprint(2, "sorting messed up: clumps=%lld sorted=%lld\n", clumps, sorted); + ok = -1; + } + } + freeiebucks(ib); + if(ok < 0) + return TWID64; + return clumps; +} + +#define CHECK(cis) if(((ulong*)cis)[-4] != 0xA110C09) xabort(); + +void +xabort(void) +{ + int *x; + + x = 0; + *x = 0; +} + +/* + * read in all of the arena's clump directory, + * convert to IEntry format, and bucket sort based + * on the first few bits. 
+ */ +static u32int +readarenainfo(IEBucks *ib, Arena *arena, u64int a, Bloom *b) +{ + IEntry ie; + ClumpInfo *ci, *cis; + u32int clump; + int i, n, ok, nskip; + + if(arena->memstats.clumps) + fprint(2, "\tarena %s: %d entries\n", arena->name, arena->memstats.clumps); + else + fprint(2, "[%s] ", arena->name); + + cis = MKN(ClumpInfo, ClumpChunks); + ok = 0; + nskip = 0; + memset(&ie, 0, sizeof(IEntry)); + for(clump = 0; clump < arena->memstats.clumps; clump += n){ + n = ClumpChunks; + if(n > arena->memstats.clumps - clump) + n = arena->memstats.clumps - clump; + if(readclumpinfos(arena, clump, cis, n) != n){ + seterr(EOk, "arena directory read failed: %r"); + ok = -1; + break; + } + + for(i = 0; i < n; i++){ + ci = &cis[i]; + ie.ia.type = ci->type; + ie.ia.size = ci->uncsize; + ie.ia.addr = a; + a += ci->size + ClumpSize; + ie.ia.blocks = (ci->size + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog; + scorecp(ie.score, ci->score); + if(ci->type == VtCorruptType){ + if(0) print("! %V %22lld %3d %5d %3d\n", + ie.score, ie.ia.addr, ie.ia.type, ie.ia.size, ie.ia.blocks); + nskip++; + }else + sprayientry(ib, &ie); + markbloomfilter(b, ie.score); + } + } + free(cis); + if(ok < 0) + return TWID32; + return clump - nskip; +} + +/* + * initialize the external bucket sorting data structures + */ +static IEBucks* +initiebucks(Part *part, int bits, u32int size) +{ + IEBucks *ib; + int i; + + ib = MKZ(IEBucks); + if(ib == nil){ + seterr(EOk, "out of memory"); + return nil; + } + ib->bits = bits; + ib->nbucks = 1 << bits; + ib->size = size; + ib->usable = (size - U32Size) / IEntrySize * IEntrySize; + ib->bucks = MKNZ(IEBuck, ib->nbucks); + if(ib->bucks == nil){ + seterr(EOk, "out of memory allocation sorting buckets"); + freeiebucks(ib); + return nil; + } + ib->xbuf = MKN(u8int, size * ((1 << bits)+1)); + ib->buf = (u8int*)(((uintptr)ib->xbuf+size-1)&~(uintptr)(size-1)); + if(ib->buf == nil){ + seterr(EOk, "out of memory allocating sorting buckets' buffers"); + freeiebucks(ib); + 
return nil; + } + for(i = 0; i < ib->nbucks; i++){ + ib->bucks[i].head = TWID32; + ib->bucks[i].buf = &ib->buf[i * size]; + } + ib->part = part; + return ib; +} + +static void +freeiebucks(IEBucks *ib) +{ + if(ib == nil) + return; + free(ib->bucks); + free(ib->buf); + free(ib); +} + +/* + * initial sort: put the entry into the correct bucket + */ +static int +sprayientry(IEBucks *ib, IEntry *ie) +{ + u32int n; + int b; + + b = hashbits(ie->score, ib->bits); + n = ib->bucks[b].used; + if(n + IEntrySize > ib->usable){ + /* should be flushed below, but if flush fails, this can happen */ + seterr(EOk, "out of space in bucket"); + return -1; + } + packientry(ie, &ib->bucks[b].buf[n]); + n += IEntrySize; + ib->bucks[b].used = n; + if(n + IEntrySize <= ib->usable) + return 0; + return flushiebuck(ib, b, 1); +} + +/* + * finish sorting: + * for each bucket, read it in and sort it + * write out the the final file + */ +static u64int +sortiebucks(IEBucks *ib) +{ + u64int tot; + u32int n; + int i; + + if(flushiebucks(ib) < 0) + return TWID64; + for(i = 0; i < ib->nbucks; i++) + ib->bucks[i].buf = nil; + ib->off = (u64int)ib->chunks * ib->size; + free(ib->xbuf); + + ib->buf = MKN(u8int, ib->max + U32Size); + if(ib->buf == nil){ + seterr(EOk, "out of memory allocating final sorting buffer; try more buckets"); + return TWID64; + } + tot = 0; + for(i = 0; i < ib->nbucks; i++){ + n = sortiebuck(ib, i); + if(n == TWID32) + return TWID64; + if(n != ib->bucks[i].total/IEntrySize) + fprint(2, "bucket %d changed count %d => %d\n", + i, (int)(ib->bucks[i].total/IEntrySize), n); + tot += n; + } + return tot; +} + +/* + * sort from bucket b of ib into the output file to + */ +static u32int +sortiebuck(IEBucks *ib, int b) +{ + u32int n; + + n = readiebuck(ib, b); + if(n == TWID32) + return TWID32; + qsort(ib->buf, n, IEntrySize, ientrycmp); + if(writepart(ib->part, ib->off, ib->buf, n*IEntrySize) < 0){ + seterr(EOk, "can't write sorted bucket: %r"); + return TWID32; + } + ib->off += n * 
IEntrySize; + return n; +} + +/* + * write out a single bucket + */ +static int +flushiebuck(IEBucks *ib, int b, int reset) +{ + u32int n; + + if(ib->bucks[b].used == 0) + return 0; + n = ib->bucks[b].used; + U32PUT(&ib->bucks[b].buf[n], ib->bucks[b].head); + n += U32Size; + USED(n); + if(writepart(ib->part, (u64int)ib->chunks * ib->size, ib->bucks[b].buf, ib->size) < 0){ + seterr(EOk, "can't write sorting bucket to file: %r"); +xabort(); + return -1; + } + ib->bucks[b].head = ib->chunks++; + ib->bucks[b].total += ib->bucks[b].used; + if(reset) + ib->bucks[b].used = 0; + return 0; +} + +/* + * write out all of the buckets, and compute + * the maximum size of any bucket + */ +static int +flushiebucks(IEBucks *ib) +{ + int i; + + for(i = 0; i < ib->nbucks; i++){ + if(flushiebuck(ib, i, 0) < 0) + return -1; + if(ib->bucks[i].total > ib->max) + ib->max = ib->bucks[i].total; + } + return 0; +} + +/* + * read in the chained buffers for bucket b, + * and return it's total number of IEntries + */ +static u32int +readiebuck(IEBucks *ib, int b) +{ + u32int head, m, n; + + head = ib->bucks[b].head; + n = 0; + m = ib->bucks[b].used; + if(m == 0) + m = ib->usable; + if(0) if(ib->bucks[b].total) + fprint(2, "\tbucket %d: %lld entries\n", b, ib->bucks[b].total/IEntrySize); + while(head != TWID32){ + if(readpart(ib->part, (u64int)head * ib->size, &ib->buf[n], m+U32Size) < 0){ + seterr(EOk, "can't read index sort bucket: %r"); + return TWID32; + } + n += m; + head = U32GET(&ib->buf[n]); + m = ib->usable; + } + if(n != ib->bucks[b].total) + fprint(2, "\tbucket %d: expected %d entries, got %d\n", + b, (int)ib->bucks[b].total/IEntrySize, n/IEntrySize); + return n / IEntrySize; +} diff --git a/sys/src/cmd/venti/srv/stats.c b/sys/src/cmd/venti/srv/stats.c new file mode 100755 index 000000000..bb944760b --- /dev/null +++ b/sys/src/cmd/venti/srv/stats.c @@ -0,0 +1,212 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +int collectstats = 1; + +/* keep in sync with dat.h:/NStat 
*/ +Statdesc statdesc[NStat] = +{ + { "rpc total", }, + { "rpc reads", }, + { "rpc reads ok", }, + { "rpc reads failed", }, + { "rpc read bytes", }, + { "rpc read time", }, + { "rpc read cached", }, + { "rpc read cached time", }, + { "rpc read uncached", }, + { "rpc read uncached time "}, + + { "rpc writes", }, + { "rpc writes new", }, + { "rpc writes old", }, + { "rpc writes failed", }, + { "rpc write bytes", }, + { "rpc write time", }, + { "rpc write new time", }, + { "rpc write old time", }, + + { "lump cache hits", }, + { "lump cache misses", }, + { "lump cache reads", }, + { "lump cache writes", }, + { "lump cache size", }, + { "lump cache stall", }, + { "lump cache read time", }, + + { "disk cache hits", }, + { "disk cache misses", }, + { "disk cache lookups", }, + { "disk cache reads", }, + { "disk cache writes", }, + { "disk cache dirty", }, + { "disk cache size", }, + { "disk cache flushes", }, + { "disk cache stalls", }, + { "disk cache lookup time", }, + + { "disk block stalls", }, + { "lump stalls", }, + + { "index cache hits", }, + { "index cache misses", }, + { "index cache reads", }, + { "index cache writes", }, + { "index cache fills", }, + { "index cache prefetches", }, + { "index cache dirty", }, + { "index cache size", }, + { "index cache flushes", }, + { "index cache stalls", }, + { "index cache read time", }, + { "index cache lookups" }, + { "index cache summary hits" }, + { "index cache summary prefetches" }, + + { "bloom filter hits", }, + { "bloom filter misses", }, + { "bloom filter false misses", }, + { "bloom filter lookups", }, + { "bloom filter ones", }, + { "bloom filter bits", }, + + { "arena block reads", }, + { "arena block read bytes", }, + { "arena block writes", }, + { "arena block write bytes", }, + + { "isect block reads", }, + { "isect block read bytes", }, + { "isect block writes", }, + { "isect block write bytes", }, + + { "sum reads", }, + { "sum read bytes", }, + + { "cig loads" }, + { "cig load time" }, +}; + +QLock 
statslock; +Stats stats; +Stats *stathist; +int nstathist; +ulong statind; +ulong stattime; + +void +statsproc(void *v) +{ + USED(v); + + for(;;){ + stats.now = time(0); + stathist[stattime%nstathist] = stats; + stattime++; + sleep(1000); + } +} + +void +statsinit(void) +{ + nstathist = 90000; + stathist = MKNZ(Stats, nstathist); + vtproc(statsproc, nil); +} + +void +setstat(int index, long val) +{ + qlock(&statslock); + stats.n[index] = val; + qunlock(&statslock); +} + +void +addstat(int index, int inc) +{ + if(!collectstats) + return; + qlock(&statslock); + stats.n[index] += inc; + qunlock(&statslock); +} + +void +addstat2(int index, int inc, int index1, int inc1) +{ + if(!collectstats) + return; + qlock(&statslock); + stats.n[index] += inc; + stats.n[index1] += inc1; + qunlock(&statslock); +} + +void +printstats(void) +{ +} + +void +binstats(long (*fn)(Stats *s0, Stats *s1, void *arg), void *arg, + long t0, long t1, Statbin *bin, int nbin) +{ + long xt0, t, te, v; + int i, j, lo, hi, m; + vlong tot; + Statbin *b; + + t = stats.now; + + /* negative times mean relative to now. 
*/ + if(t0 <= 0) + t0 += t; + if(t1 <= 0) + t1 += t; + /* ten minute range if none given */ + if(t1 <= t0) + t0 = t1 - 60*10; + if(0) fprint(2, "stats %ld-%ld\n", t0, t1); + + /* binary search to find t0-1 or close */ + lo = stattime; + hi = stattime+nstathist; + while(lo+1 < hi){ + m = (lo+hi)/2; + if(stathist[m%nstathist].now >= t0) + hi = m; + else + lo = m; + } + xt0 = stathist[lo%nstathist].now; + if(xt0 >= t1){ + /* no samples */ + memset(bin, 0, nbin*sizeof bin[0]); + return; + } + + hi = stattime+nstathist; + j = lo+1; + for(i=0; i<nbin; i++){ + te = t0 + (t1-t0)*i/nbin; + b = &bin[i]; + memset(b, 0, sizeof *b); + tot = 0; + for(; j<hi && stathist[j%nstathist].now<te; j++){ + v = fn(&stathist[(j-1)%nstathist], &stathist[j%nstathist], arg); + if(b->nsamp==0 || v < b->min) + b->min = v; + if(b->nsamp==0 || v > b->max) + b->max = v; + tot += v; + b->nsamp++; + } + if(b->nsamp) + b->avg = tot / b->nsamp; + if(b->nsamp==0 && i>0) + *b = bin[i-1]; + } +} diff --git a/sys/src/cmd/venti/srv/stdinc.h b/sys/src/cmd/venti/srv/stdinc.h new file mode 100755 index 000000000..3fd06ccd7 --- /dev/null +++ b/sys/src/cmd/venti/srv/stdinc.h @@ -0,0 +1,9 @@ +#include <u.h> +#include <libc.h> +#include <venti.h> +#include <flate.h> +#include <libsec.h> +#include <thread.h> +#include <httpd.h> +#include <draw.h> +#include <memdraw.h> diff --git a/sys/src/cmd/venti/srv/syncarena.c b/sys/src/cmd/venti/srv/syncarena.c new file mode 100755 index 000000000..0e6cc2019 --- /dev/null +++ b/sys/src/cmd/venti/srv/syncarena.c @@ -0,0 +1,174 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +static int writeclumphead(Arena *arena, u64int aa, Clump *cl); +static int writeclumpmagic(Arena *arena, u64int aa, u32int magic); + +int +clumpinfocmp(ClumpInfo *c, ClumpInfo *d) +{ + return c->type != d->type + || c->size != d->size + || c->uncsize != d->uncsize + || scorecmp(c->score, d->score)!=0; +} + +/* + * synchronize the clump info directory with + * with the clumps actually stored 
in the arena. + * the directory should be at least as up to date + * as the arena's trailer. + * + * checks/updates at most n clumps. + * + * returns 0 if ok, flags if error occurred + */ +int +syncarena(Arena *arena, u32int n, int zok, int fix) +{ + ZBlock *lump; + Clump cl; + ClumpInfo ci; + static ClumpInfo zci = { .type = -1 }; + u8int score[VtScoreSize]; + u64int uncsize, used, aa; + u32int clump, clumps, cclumps, magic; + int err, flush, broken; + + used = arena->memstats.used; + clumps = arena->memstats.clumps; + cclumps = arena->memstats.cclumps; + uncsize = arena->memstats.uncsize; + trace(TraceProc, "syncarena start"); + flush = 0; + err = 0; + for(; n; n--){ + aa = arena->memstats.used; + clump = arena->memstats.clumps; + magic = clumpmagic(arena, aa); + if(magic == ClumpFreeMagic) + break; + if(magic != arena->clumpmagic){ + fprint(2, "%s: illegal clump magic number=%#8.8ux at clump=%d\n", arena->name, magic, clump); + /* err |= SyncDataErr; */ + if(fix && writeclumpmagic(arena, aa, ClumpFreeMagic) < 0){ + fprint(2, "%s: can't write corrected clump free magic: %r", arena->name); + err |= SyncFixErr; + } + break; + } + + broken = 0; + lump = loadclump(arena, aa, 0, &cl, score, 0); + if(lump == nil){ + fprint(2, "%s: clump=%d failed to read correctly: %r\n", arena->name, clump); + break; + }else if(cl.info.type != VtCorruptType){ + scoremem(score, lump->data, cl.info.uncsize); + if(scorecmp(cl.info.score, score) != 0){ + /* ignore partially written block */ + if(cl.encoding == ClumpENone) + break; + fprint(2, "%s: clump=%d has mismatched score\n", arena->name, clump); + err |= SyncDataErr; + broken = 1; + }else if(vttypevalid(cl.info.type) < 0){ + fprint(2, "%s: clump=%d has invalid type %d", arena->name, clump, cl.info.type); + err |= SyncDataErr; + broken = 1; + } + if(broken && fix){ + cl.info.type = VtCorruptType; + if(writeclumphead(arena, aa, &cl) < 0){ + fprint(2, "%s: can't write corrected clump header: %r", arena->name); + err |= SyncFixErr; + } 
+ } + } + freezblock(lump); + arena->memstats.used += ClumpSize + cl.info.size; + + arena->memstats.clumps++; + if(!broken && readclumpinfo(arena, clump, &ci)<0){ + fprint(2, "%s: arena directory read failed\n", arena->name); + broken = 1; + }else if(!broken && clumpinfocmp(&ci, &cl.info)!=0){ + if(clumpinfocmp(&ci, &zci) == 0){ + err |= SyncCIZero; + if(!zok) + fprint(2, "%s: unwritten clump info for clump=%d\n", arena->name, clump); + }else{ + err |= SyncCIErr; + fprint(2, "%s: bad clump info for clump=%d\n", arena->name, clump); + fprint(2, "\texpected score=%V type=%d size=%d uncsize=%d\n", + cl.info.score, cl.info.type, cl.info.size, cl.info.uncsize); + fprint(2, "\tfound score=%V type=%d size=%d uncsize=%d\n", + ci.score, ci.type, ci.size, ci.uncsize); + } + broken = 1; + } + if(broken && fix){ + flush = 1; + ci = cl.info; + if(writeclumpinfo(arena, clump, &ci) < 0){ + fprint(2, "%s: can't write correct clump directory: %r\n", arena->name); + err |= SyncFixErr; + } + } + trace(TraceProc, "syncarena unindexed clump %V %d", cl.info.score, arena->memstats.clumps); + + arena->memstats.uncsize += cl.info.uncsize; + if(cl.info.size < cl.info.uncsize) + arena->memstats.cclumps++; + } + + if(flush){ + trace(TraceProc, "syncarena flush"); + arena->wtime = now(); + if(arena->ctime == 0 && arena->memstats.clumps) + arena->ctime = arena->wtime; + flushdcache(); + } + + if(used != arena->memstats.used + || clumps != arena->memstats.clumps + || cclumps != arena->memstats.cclumps + || uncsize != arena->memstats.uncsize){ + err |= SyncHeader; + fprint(2, "arena %s: fix=%d flush=%d %lld->%lld %ud->%ud %ud->%ud %lld->%lld\n", + arena->name, + fix, + flush, + used, arena->memstats.used, + clumps, arena->memstats.clumps, + cclumps, arena->memstats.cclumps, + uncsize, arena->memstats.uncsize); + } + + return err; +} + +static int +writeclumphead(Arena *arena, u64int aa, Clump *cl) +{ + ZBlock *zb; + int bad; + + zb = alloczblock(ClumpSize, 0, arena->blocksize); + if(zb == nil) + 
return -1; + bad = packclump(cl, zb->data, arena->clumpmagic)<0 + || writearena(arena, aa, zb->data, ClumpSize) != ClumpSize; + freezblock(zb); + return bad ? -1 : 0; +} + +static int +writeclumpmagic(Arena *arena, u64int aa, u32int magic) +{ + u8int buf[U32Size]; + + packmagic(magic, buf); + return writearena(arena, aa, buf, U32Size) == U32Size; +} diff --git a/sys/src/cmd/venti/srv/syncindex.c b/sys/src/cmd/venti/srv/syncindex.c new file mode 100755 index 000000000..6bf996ae4 --- /dev/null +++ b/sys/src/cmd/venti/srv/syncindex.c @@ -0,0 +1,64 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +static int verbose; +void +usage(void) +{ + fprint(2, "usage: syncindex [-v] [-B blockcachesize] config\n"); + threadexitsall("usage"); +} + +Config conf; + +void +threadmain(int argc, char *argv[]) +{ + u32int bcmem, icmem; + + bcmem = 0; + icmem = 0; + ARGBEGIN{ + case 'B': + bcmem = unittoull(EARGF(usage())); + break; + case 'I': + icmem = unittoull(EARGF(usage())); + break; + case 'v': + verbose++; + break; + default: + usage(); + break; + }ARGEND + + if(argc != 1) + usage(); + + ventifmtinstall(); + if(initventi(argv[0], &conf) < 0) + sysfatal("can't init venti: %r"); + if(mainindex->bloom && loadbloom(mainindex->bloom) < 0) + sysfatal("can't load bloom filter: %r"); + + if(bcmem < maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16)) + bcmem = maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16); + if(0) fprint(2, "initialize %d bytes of disk block cache\n", bcmem); + initdcache(bcmem); + initlumpcache(1*1024*1024, 1024/8); + initicache(icmem); + initicachewrite(); + if(mainindex->bloom) + startbloomproc(mainindex->bloom); + + if(verbose) + printindex(2, mainindex); + if(syncindex(mainindex) < 0) + sysfatal("failed to sync index=%s: %r", mainindex->name); + flushicache(); + flushdcache(); + + threadexitsall(0); +} diff --git a/sys/src/cmd/venti/srv/syncindex0.c b/sys/src/cmd/venti/srv/syncindex0.c new file mode 100755 index 
000000000..be3a2ea06 --- /dev/null +++ b/sys/src/cmd/venti/srv/syncindex0.c @@ -0,0 +1,93 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +static int +syncarenaindex(Arena *arena, u64int a0) +{ + int ok; + u32int clump; + u64int a; + ClumpInfo ci; + IAddr ia; + AState as; + + if(arena->diskstats.clumps == arena->memstats.clumps) + return 0; + + memset(&as, 0, sizeof as); + as.arena = arena; + as.stats = arena->diskstats; + + ok = 0; + a = a0 + arena->diskstats.used; + for(clump=arena->diskstats.clumps; clump < arena->memstats.clumps; clump++){ + if(readclumpinfo(arena, clump, &ci) < 0){ + fprint(2, "%s: clump %d: cannot read clumpinfo\n", + arena->name, clump); + ok = -1; + break; + } + + ia.type = ci.type; + ia.size = ci.uncsize; + ia.addr = a; + ia.blocks = (ClumpSize + ci.size + (1 << ABlockLog) - 1) >> ABlockLog; + a += ClumpSize + ci.size; + + as.stats.used += ClumpSize + ci.size; + as.stats.uncsize += ia.size; + as.stats.clumps++; + if(ci.uncsize > ci.size) + as.stats.cclumps++; + as.aa = a; + insertscore(ci.score, &ia, IEDirty, &as); + } + flushdcache(); + return ok; +} + +int +syncindex(Index *ix) +{ + Arena *arena; + int i, e, e1, ok; + + ok = 0; + for(i = 0; i < ix->narenas; i++){ + trace(TraceProc, "syncindex start %d", i); + arena = ix->arenas[i]; + e = syncarena(arena, TWID32, 1, 1); + e1 = e; + e1 &= ~(SyncHeader|SyncCIZero|SyncCIErr); + if(e & SyncHeader) + fprint(2, "arena %s: header is out-of-date\n", arena->name); + if(e1){ + fprint(2, "arena %s: %x\n", arena->name, e1); + ok = -1; + continue; + } + flushdcache(); + + if(arena->memstats.clumps == arena->diskstats.clumps) + continue; + + fprint(2, "%T %s: indexing %d clumps...\n", + arena->name, + arena->memstats.clumps - arena->diskstats.clumps); + + if(syncarenaindex(arena, ix->amap[i].start) < 0){ + fprint(2, "arena %s: syncarenaindex: %r\n", arena->name); + ok = -1; + continue; + } + if(wbarena(arena) < 0){ + fprint(2, "arena %s: wbarena: %r\n", arena->name); + ok = -1; + 
continue; + } + flushdcache(); + delaykickicache(); + } + return ok; +} diff --git a/sys/src/cmd/venti/srv/trace.c b/sys/src/cmd/venti/srv/trace.c new file mode 100755 index 000000000..3c0169557 --- /dev/null +++ b/sys/src/cmd/venti/srv/trace.c @@ -0,0 +1,39 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +char TraceDisk[] = "disk"; +char TraceLump[] = "lump"; +char TraceBlock[] = "block"; +char TraceProc[] = "proc"; +char TraceWork[] = "work"; +char TraceQuiet[] = "quiet"; +char TraceRpc[] = "rpc"; + +void +trace(char *level, char *fmt, ...) +{ + char buf[512]; + va_list arg; + + if(level == nil || !ventilogging) + return; + va_start(arg, fmt); + vsnprint(buf, sizeof buf, fmt, arg); + va_end(arg); + vtlog(level, "<font size=-1>%T %s:</font> %s<br>\n", + threadgetname(), buf); + vtlog("all", "<font size=-1>%T <font color=#777777>%s</font> %s:</font> %s<br>\n", + level, threadgetname(), buf); +} + +void +traceinit(void) +{ +} + +void +settrace(char *trace) +{ + USED(trace); +} diff --git a/sys/src/cmd/venti/srv/unittoull.c b/sys/src/cmd/venti/srv/unittoull.c new file mode 100755 index 000000000..1f7411702 --- /dev/null +++ b/sys/src/cmd/venti/srv/unittoull.c @@ -0,0 +1,30 @@ +#include "stdinc.h" + +#define TWID64 ((u64int)~(u64int)0) + +u64int +unittoull(char *s) +{ + char *es; + u64int n; + + if(s == nil) + return TWID64; + n = strtoul(s, &es, 0); + if(*es == 'k' || *es == 'K'){ + n *= 1024; + es++; + }else if(*es == 'm' || *es == 'M'){ + n *= 1024*1024; + es++; + }else if(*es == 'g' || *es == 'G'){ + n *= 1024*1024*1024; + es++; + }else if(*es == 't' || *es == 'T'){ + n *= 1024*1024; + n *= 1024*1024; + } + if(*es != '\0') + return TWID64; + return n; +} diff --git a/sys/src/cmd/venti/srv/unwhack.c b/sys/src/cmd/venti/srv/unwhack.c new file mode 100755 index 000000000..5530bd07d --- /dev/null +++ b/sys/src/cmd/venti/srv/unwhack.c @@ -0,0 +1,179 @@ +#include "stdinc.h" +#include "whack.h" + +enum +{ + DMaxFastLen = 7, + DBigLenCode = 0x3c, /* minimum 
code for large lenth encoding */ + DBigLenBits = 6, + DBigLenBase = 1 /* starting items to encode for big lens */ +}; + +static uchar lenval[1 << (DBigLenBits - 1)] = +{ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, + 5, + 6, + 255, + 255 +}; + +static uchar lenbits[] = +{ + 0, 0, 0, + 2, 3, 5, 5, +}; + +static uchar offbits[16] = +{ + 5, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 12, 13 +}; + +static ushort offbase[16] = +{ + 0, 0x20, + 0x40, 0x60, + 0x80, 0xc0, + 0x100, 0x180, + 0x200, 0x300, + 0x400, 0x600, + 0x800, 0xc00, + 0x1000, + 0x2000 +}; + +void +unwhackinit(Unwhack *uw) +{ + uw->err[0] = '\0'; +} + +int +unwhack(Unwhack *uw, uchar *dst, int ndst, uchar *src, int nsrc) +{ + uchar *s, *d, *dmax, *smax, lit; + ulong uwbits, lithist; + int i, off, len, bits, use, code, uwnbits, overbits; + + d = dst; + dmax = d + ndst; + + smax = src + nsrc; + uwnbits = 0; + uwbits = 0; + overbits = 0; + lithist = ~0; + while(src < smax || uwnbits - overbits >= MinDecode){ + while(uwnbits <= 24){ + uwbits <<= 8; + if(src < smax) + uwbits |= *src++; + else + overbits += 8; + uwnbits += 8; + } + + /* + * literal + */ + len = lenval[(uwbits >> (uwnbits - 5)) & 0x1f]; + if(len == 0){ + if(lithist & 0xf){ + uwnbits -= 9; + lit = (uwbits >> uwnbits) & 0xff; + lit &= 255; + }else{ + uwnbits -= 8; + lit = (uwbits >> uwnbits) & 0x7f; + if(lit < 32){ + if(lit < 24){ + uwnbits -= 2; + lit = (lit << 2) | ((uwbits >> uwnbits) & 3); + }else{ + uwnbits -= 3; + lit = (lit << 3) | ((uwbits >> uwnbits) & 7); + } + lit = (lit - 64) & 0xff; + } + } + if(d >= dmax){ + snprint(uw->err, WhackErrLen, "too much output"); + return -1; + } + *d++ = lit; + lithist = (lithist << 1) | (lit < 32) | (lit > 127); + continue; + } + + /* + * length + */ + if(len < 255) + uwnbits -= lenbits[len]; + else{ + uwnbits -= DBigLenBits; + code = ((uwbits >> uwnbits) & ((1 << DBigLenBits) - 1)) - DBigLenCode; + len = DMaxFastLen; + use = DBigLenBase; + bits = (DBigLenBits & 
1) ^ 1; + while(code >= use){ + len += use; + code -= use; + code <<= 1; + uwnbits--; + if(uwnbits < 0){ + snprint(uw->err, WhackErrLen, "len out of range"); + return -1; + } + code |= (uwbits >> uwnbits) & 1; + use <<= bits; + bits ^= 1; + } + len += code; + + while(uwnbits <= 24){ + uwbits <<= 8; + if(src < smax) + uwbits |= *src++; + else + overbits += 8; + uwnbits += 8; + } + } + + /* + * offset + */ + uwnbits -= 4; + bits = (uwbits >> uwnbits) & 0xf; + off = offbase[bits]; + bits = offbits[bits]; + + uwnbits -= bits; + off |= (uwbits >> uwnbits) & ((1 << bits) - 1); + off++; + + if(off > d - dst){ + snprint(uw->err, WhackErrLen, "offset out of range: off=%d d=%ld len=%d nbits=%d", off, d - dst, len, uwnbits); + return -1; + } + if(d + len > dmax){ + snprint(uw->err, WhackErrLen, "len out of range"); + return -1; + } + s = d - off; + for(i = 0; i < len; i++) + d[i] = s[i]; + d += len; + } + if(uwnbits < overbits){ + snprint(uw->err, WhackErrLen, "compressed data overrun"); + return -1; + } + + len = d - dst; + + return len; +} diff --git a/sys/src/cmd/venti/srv/utils.c b/sys/src/cmd/venti/srv/utils.c new file mode 100755 index 000000000..d810c53d8 --- /dev/null +++ b/sys/src/cmd/venti/srv/utils.c @@ -0,0 +1,259 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +int +namecmp(char *s, char *t) +{ + return strncmp(s, t, ANameSize); +} + +void +namecp(char *dst, char *src) +{ + strncpy(dst, src, ANameSize - 1); + dst[ANameSize - 1] = '\0'; +} + +int +nameok(char *name) +{ + char *t; + int c; + + if(name == nil) + return -1; + for(t = name; c = *t; t++) + if(t - name >= ANameSize + || c < ' ' || c >= 0x7f) + return -1; + return 0; +} + +int +stru32int(char *s, u32int *r) +{ + char *t; + u32int n, nn, m; + int c; + + m = TWID32 / 10; + n = 0; + for(t = s; ; t++){ + c = *t; + if(c < '0' || c > '9') + break; + if(n > m) + return -1; + nn = n * 10 + c - '0'; + if(nn < n) + return -1; + n = nn; + } + *r = n; + return s != t && *t == '\0'; +} + +int 
+stru64int(char *s, u64int *r) +{ + char *t; + u64int n, nn, m; + int c; + + m = TWID64 / 10; + n = 0; + for(t = s; ; t++){ + c = *t; + if(c < '0' || c > '9') + break; + if(n > m) + return -1; + nn = n * 10 + c - '0'; + if(nn < n) + return -1; + n = nn; + } + *r = n; + return s != t && *t == '\0'; +} + +int +vttypevalid(int type) +{ + return type < VtMaxType; +} + +static char* +logit(int severity, char *fmt, va_list args) +{ + char *s; + + s = vsmprint(fmt, args); + if(s == nil) + return nil; + if(severity != EOk){ + if(argv0 == nil) + fprint(2, "%T %s: err %d: %s\n", argv0, severity, s); + else + fprint(2, "%T err %d: %s\n", severity, s); + } + return s; +} + +void +seterr(int severity, char *fmt, ...) +{ + char *s; + va_list args; + + va_start(args, fmt); + s = logit(severity, fmt, args); + va_end(args); + if(s == nil) + werrstr("error setting error"); + else{ + werrstr("%s", s); + free(s); + } +} + +void +logerr(int severity, char *fmt, ...) +{ + char *s; + va_list args; + + va_start(args, fmt); + s = logit(severity, fmt, args); + va_end(args); + free(s); +} + +u32int +now(void) +{ + return time(nil); +} + +int abortonmem = 1; + +void * +emalloc(ulong n) +{ + void *p; + + p = malloc(n); + if(p == nil){ + if(abortonmem) + abort(); + sysfatal("out of memory allocating %lud", n); + } + memset(p, 0xa5, n); + setmalloctag(p, getcallerpc(&n)); +if(0)print("emalloc %p-%p by %#p\n", p, (char*)p+n, getcallerpc(&n)); + return p; +} + +void * +ezmalloc(ulong n) +{ + void *p; + + p = malloc(n); + if(p == nil){ + if(abortonmem) + abort(); + sysfatal("out of memory allocating %lud", n); + } + memset(p, 0, n); + setmalloctag(p, getcallerpc(&n)); +if(0)print("ezmalloc %p-%p by %#p\n", p, (char*)p+n, getcallerpc(&n)); + return p; +} + +void * +erealloc(void *p, ulong n) +{ + p = realloc(p, n); + if(p == nil){ + if(abortonmem) + abort(); + sysfatal("out of memory allocating %lud", n); + } + setrealloctag(p, getcallerpc(&p)); +if(0)print("erealloc %p-%p by %#p\n", p, (char*)p+n, 
/*
 * start fn(arg) in a new proc with a 256K stack.
 * returns 0 on success, -1 if the proc could not be created.
 */
int
vtproc(void (*fn)(void*), void *arg)
{
	if(proccreate(fn, arg, 256*1024) < 0)
		return -1;
	return 0;
}
+ bp = Bopen("#c/swap", OREAD); + if (bp != nil) { + while ((ln = Brdline(bp, '\n')) != nil) { + ln[Blinelen(bp)-1] = '\0'; + nf = tokenize(ln, fields, nelem(fields)); + if (nf != 2) + continue; + if (strcmp(fields[1], "pagesize") == 0) + pgsize = atoi(fields[0]); + else if (strcmp(fields[1], "user") == 0) { + sl = strchr(fields[0], '/'); + if (sl == nil) + continue; + userpgs = atoll(sl+1); + userused = atoll(fields[0]); + } + } + Bterm(bp); + if (pgsize > 0 && userpgs > 0 && userused > 0) + size = (userpgs - userused) * pgsize; + } + /* cap it to keep the size within 32 bits */ + if (size >= 3840UL * 1024 * 1024) + size = 3840UL * 1024 * 1024; + return size; +} + +static void +allocminima(Allocs *all) /* enforce minima for sanity */ +{ + if (all->icmem < 6 * 1024 * 1024) + all->icmem = 6 * 1024 * 1024; + if (all->mem < 1024 * 1024 || all->mem == Unspecified) /* lumps */ + all->mem = 1024 * 1024; + if (all->bcmem < 2 * 1024 * 1024) + all->bcmem = 2 * 1024 * 1024; +} + +/* automatic memory allocations sizing per venti(8) guidelines */ +static Allocs +allocbypcnt(u32int mempcnt, u32int stfree) +{ + u32int avail; + vlong blmsize; + Allocs all; + static u32int free; + + all.mem = Unspecified; + all.bcmem = all.icmem = 0; + all.mempcnt = mempcnt; + all.stfree = stfree; + + if (free == 0) + free = freemem(); + blmsize = stfree - free; + if (blmsize <= 0) + blmsize = 0; + avail = ((vlong)stfree * mempcnt) / 100; + if (blmsize >= avail || (avail -= blmsize) <= (1 + 2 + 6) * 1024 * 1024) + fprint(2, "%s: bloom filter bigger than mem pcnt; " + "resorting to minimum values (9MB total)\n", argv0); + else { + if (avail >= 3840UL * 1024 * 1024) + avail = 3840UL * 1024 * 1024; /* sanity */ + avail /= 2; + all.icmem = avail; + avail /= 3; + all.mem = avail; + all.bcmem = 2 * avail; + } + return all; +} + +/* + * we compute default values for allocations, + * which can be overridden by (in order): + * configuration file parameters, + * command-line options other than -m, and -m. 
+ */ +static Allocs +sizeallocs(Allocs opt, Config *cfg) +{ + Allocs all; + + /* work out sane defaults */ + all = allocbypcnt(20, opt.stfree); + + /* config file parameters override */ + if (cfg->mem && cfg->mem != Unspecified) + all.mem = cfg->mem; + if (cfg->bcmem) + all.bcmem = cfg->bcmem; + if (cfg->icmem) + all.icmem = cfg->icmem; + + /* command-line options override */ + if (opt.mem && opt.mem != Unspecified) + all.mem = opt.mem; + if (opt.bcmem) + all.bcmem = opt.bcmem; + if (opt.icmem) + all.icmem = opt.icmem; + + /* automatic memory sizing? */ + if(opt.mempcnt > 0) + all = allocbypcnt(opt.mempcnt, opt.stfree); + + allocminima(&all); + return all; +} + +void +usage(void) +{ + fprint(2, "usage: venti [-Ldrsw] [-a ventiaddr] [-c config] " +"[-h httpaddr] [-m %%mem] [-B blockcachesize] [-C cachesize] [-I icachesize] " +"[-W webroot]\n"); + threadexitsall("usage"); +} + +void +threadmain(int argc, char *argv[]) +{ + char *configfile, *haddr, *vaddr, *webroot; + u32int mem, icmem, bcmem, minbcmem, mempcnt, stfree; + Allocs allocs; + Config config; + + traceinit(); + threadsetname("main"); + mempcnt = 0; + vaddr = nil; + haddr = nil; + configfile = nil; + webroot = nil; + mem = Unspecified; + icmem = 0; + bcmem = 0; + ARGBEGIN{ + case 'a': + vaddr = EARGF(usage()); + break; + case 'B': + bcmem = unittoull(EARGF(usage())); + break; + case 'c': + configfile = EARGF(usage()); + break; + case 'C': + mem = unittoull(EARGF(usage())); + break; + case 'D': + settrace(EARGF(usage())); + break; + case 'd': + debug = 1; + nofork = 1; + break; + case 'h': + haddr = EARGF(usage()); + break; + case 'm': + mempcnt = atoi(EARGF(usage())); + if (mempcnt <= 0 || mempcnt >= 100) + usage(); + break; + case 'I': + icmem = unittoull(EARGF(usage())); + break; + case 'L': + ventilogging = 1; + break; + case 'r': + readonly = 1; + break; + case 's': + nofork = 1; + break; + case 'w': /* compatibility with old venti */ + queuewrites = 1; + break; + case 'W': + webroot = EARGF(usage()); + 
break; + default: + usage(); + }ARGEND + + if(argc) + usage(); + + if(!nofork) + rfork(RFNOTEG); + +#ifdef PLAN9PORT + { + /* sigh - needed to avoid signals when writing to hungup networks */ + struct sigaction sa; + memset(&sa, 0, sizeof sa); + sa.sa_handler = SIG_IGN; + sigaction(SIGPIPE, &sa, nil); + } +#endif + + ventifmtinstall(); + trace(TraceQuiet, "venti started"); + fprint(2, "%T venti: "); + + if(configfile == nil) + configfile = "venti.conf"; + + /* remember free memory before initventi & loadbloom, for auto-sizing */ + stfree = freemem(); + fprint(2, "conf..."); + if(initventi(configfile, &config) < 0) + sysfatal("can't init server: %r"); + /* + * load bloom filter + */ + if(mainindex->bloom && loadbloom(mainindex->bloom) < 0) + sysfatal("can't load bloom filter: %r"); + + /* + * size memory allocations; assumes bloom filter is loaded + */ + allocs = sizeallocs((Allocs){mem, bcmem, icmem, stfree, mempcnt}, + &config); + mem = allocs.mem; + bcmem = allocs.bcmem; + icmem = allocs.icmem; + fprint(2, "%s: mem %,ud bcmem %,ud icmem %,ud...", + argv0, mem, bcmem, icmem); + + /* + * default other configuration-file parameters + */ + if(haddr == nil) + haddr = config.haddr; + if(vaddr == nil) + vaddr = config.vaddr; + if(vaddr == nil) + vaddr = "tcp!*!venti"; + if(webroot == nil) + webroot = config.webroot; + if(queuewrites == 0) + queuewrites = config.queuewrites; + + if(haddr){ + fprint(2, "httpd %s...", haddr); + if(httpdinit(haddr, webroot) < 0) + fprint(2, "warning: can't start http server: %r"); + } + fprint(2, "init..."); + + /* + * lump cache + */ + if(0) fprint(2, "initialize %d bytes of lump cache for %d lumps\n", + mem, mem / (8 * 1024)); + initlumpcache(mem, mem / (8 * 1024)); + + /* + * index cache + */ + initicache(icmem); + initicachewrite(); + + /* + * block cache: need a block for every arena and every process + */ + minbcmem = maxblocksize * + (mainindex->narenas + mainindex->nsects*4 + 16); + if(bcmem < minbcmem) + bcmem = minbcmem; + if(0) 
fprint(2, "initialize %d bytes of disk block cache\n", bcmem); + initdcache(bcmem); + + if(mainindex->bloom) + startbloomproc(mainindex->bloom); + + fprint(2, "sync..."); + if(!readonly && syncindex(mainindex) < 0) + sysfatal("can't sync server: %r"); + + if(!readonly && queuewrites){ + fprint(2, "queue..."); + if(initlumpqueues(mainindex->nsects) < 0){ + fprint(2, "can't initialize lump queues," + " disabling write queueing: %r"); + queuewrites = 0; + } + } + + if(initarenasum() < 0) + fprint(2, "warning: can't initialize arena summing process: %r"); + + fprint(2, "announce %s...", vaddr); + ventisrv = vtlisten(vaddr); + if(ventisrv == nil) + sysfatal("can't announce %s: %r", vaddr); + + fprint(2, "serving.\n"); + if(nofork) + ventiserver(nil); + else + vtproc(ventiserver, nil); + + threadexits(nil); +} + +static void +vtrerror(VtReq *r, char *error) +{ + r->rx.msgtype = VtRerror; + r->rx.error = estrdup(error); +} + +static void +ventiserver(void *v) +{ + Packet *p; + VtReq *r; + char err[ERRMAX]; + uint ms; + int cached, ok; + + USED(v); + threadsetname("ventiserver"); + trace(TraceWork, "start"); + while((r = vtgetreq(ventisrv)) != nil){ + trace(TraceWork, "finish"); + trace(TraceWork, "start request %F", &r->tx); + trace(TraceRpc, "<- %F", &r->tx); + r->rx.msgtype = r->tx.msgtype+1; + addstat(StatRpcTotal, 1); + if(0) print("req (arenas[0]=%p sects[0]=%p) %F\n", + mainindex->arenas[0], mainindex->sects[0], &r->tx); + switch(r->tx.msgtype){ + default: + vtrerror(r, "unknown request"); + break; + case VtTread: + ms = msec(); + r->rx.data = readlump(r->tx.score, r->tx.blocktype, r->tx.count, &cached); + ms = msec() - ms; + addstat2(StatRpcRead, 1, StatRpcReadTime, ms); + if(r->rx.data == nil){ + addstat(StatRpcReadFail, 1); + rerrstr(err, sizeof err); + vtrerror(r, err); + }else{ + addstat(StatRpcReadBytes, packetsize(r->rx.data)); + addstat(StatRpcReadOk, 1); + if(cached) + addstat2(StatRpcReadCached, 1, StatRpcReadCachedTime, ms); + else + 
addstat2(StatRpcReadUncached, 1, StatRpcReadUncachedTime, ms); + } + break; + case VtTwrite: + if(readonly){ + vtrerror(r, "read only"); + break; + } + p = r->tx.data; + r->tx.data = nil; + addstat(StatRpcWriteBytes, packetsize(p)); + ms = msec(); + ok = writelump(p, r->rx.score, r->tx.blocktype, 0, ms); + ms = msec() - ms; + addstat2(StatRpcWrite, 1, StatRpcWriteTime, ms); + + if(ok < 0){ + addstat(StatRpcWriteFail, 1); + rerrstr(err, sizeof err); + vtrerror(r, err); + } + break; + case VtTsync: + flushqueue(); + flushdcache(); + break; + } + trace(TraceRpc, "-> %F", &r->rx); + vtrespond(r); + trace(TraceWork, "start"); + } + flushdcache(); + flushicache(); + threadexitsall(0); +} diff --git a/sys/src/cmd/venti/srv/verifyarena.c b/sys/src/cmd/venti/srv/verifyarena.c new file mode 100755 index 000000000..662d53239 --- /dev/null +++ b/sys/src/cmd/venti/srv/verifyarena.c @@ -0,0 +1,266 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +static int verbose; +static int fd; +static uchar *data; +static int blocksize; +static int sleepms; +static vlong offset0; + +void +usage(void) +{ + fprint(2, "usage: verifyarena [-b blocksize] [-s ms] [-v] [arenapart [name...]]\n"); + threadexitsall(0); +} + +static int +preadblock(uchar *buf, int n, vlong off) +{ + int nr, m; + + for(nr = 0; nr < n; nr += m){ + m = n - nr; + m = pread(fd, &buf[nr], m, offset0+off+nr); + if(m <= 0){ + if(m == 0) + werrstr("early eof"); + return -1; + } + } + return 0; +} + +static int +readblock(uchar *buf, int n) +{ + int nr, m; + + for(nr = 0; nr < n; nr += m){ + m = n - nr; + m = read(fd, &buf[nr], m); + if(m <= 0){ + if(m == 0) + werrstr("early eof"); + return -1; + } + } + return 0; +} + +static void +verifyarena(char *name, vlong len) +{ + Arena arena; + ArenaHead head; + DigestState s; + u64int n, e; + u32int bs; + u8int score[VtScoreSize]; + + fprint(2, "%T verify %s\n", name); + + memset(&arena, 0, sizeof arena); + memset(&s, 0, sizeof s); + + /* + * read a little bit, which will 
include the header + */ + if(readblock(data, HeadSize) < 0){ + fprint(2, "%T %s: reading header: %r\n", name); + return; + } + sha1(data, HeadSize, nil, &s); + if(unpackarenahead(&head, data) < 0){ + fprint(2, "%T %s: corrupt arena header: %r\n", name); + return; + } + if(head.version != ArenaVersion4 && head.version != ArenaVersion5) + fprint(2, "%T %s: warning: unknown arena version %d\n", name, head.version); + if(len != 0 && len != head.size) + fprint(2, "%T %s: warning: unexpected length %lld != %lld\n", name, head.size, len); + if(strcmp(name, "<stdin>") != 0 && strcmp(head.name, name) != 0) + fprint(2, "%T %s: warning: unexpected name %s\n", name, head.name); + + /* + * now we know how much to read + * read everything but the last block, which is special + */ + e = head.size - head.blocksize; + bs = blocksize; + for(n = HeadSize; n < e; n += bs){ + if(n + bs > e) + bs = e - n; + if(readblock(data, bs) < 0){ + fprint(2, "%T %s: read data: %r\n", name); + return; + } + sha1(data, bs, nil, &s); + if(sleepms) + sleep(sleepms); + } + + /* + * read the last block and update the sum. + * the sum is calculated assuming the slot for the sum is zero. + */ + bs = head.blocksize; + if(readblock(data, bs) < 0){ + fprint(2, "%T %s: read last block: %r\n", name); + return; + } + sha1(data, bs-VtScoreSize, nil, &s); + sha1(zeroscore, VtScoreSize, nil, &s); + sha1(nil, 0, score, &s); + + /* + * validity check on the trailer + */ + arena.blocksize = head.blocksize; + if(unpackarena(&arena, data) < 0){ + fprint(2, "%T %s: corrupt arena trailer: %r\n", name); + return; + } + scorecp(arena.score, &data[arena.blocksize - VtScoreSize]); + + if(namecmp(arena.name, head.name) != 0){ + fprint(2, "%T %s: wrong name in trailer: %s vs. %s\n", + name, head.name, arena.name); + return; + } + if(arena.version != head.version){ + fprint(2, "%T %s: wrong version in trailer: %d vs. 
%d\n", + name, head.version, arena.version); + return; + } + arena.size = head.size - 2 * head.blocksize; + + /* + * check for no checksum or the same + */ + if(scorecmp(score, arena.score) == 0) + fprint(2, "%T %s: verified score\n", name); + else if(scorecmp(zeroscore, arena.score) == 0) + fprint(2, "%T %s: unsealed\n", name); + else{ + fprint(2, "%T %s: mismatch checksum - found=%V calculated=%V\n", + name, arena.score, score); + return; + } + printarena(2, &arena); +} + +static int +shouldcheck(char *name, char **s, int n) +{ + int i; + + if(n == 0) + return 1; + + for(i=0; i<n; i++){ + if(s[i] && strcmp(name, s[i]) == 0){ + s[i] = nil; + return 1; + } + } + return 0; +} + +void +threadmain(int argc, char *argv[]) +{ + int i, nline; + char *p, *q, *table, *f[10], line[256]; + vlong start, stop; + ArenaPart ap; + Part *part; + + needzeroscore(); + ventifmtinstall(); + blocksize = MaxIoSize; + ARGBEGIN{ + case 'b': + blocksize = unittoull(EARGF(usage())); + break; + case 's': + sleepms = atoi(EARGF(usage())); + break; + case 'v': + verbose++; + break; + default: + usage(); + break; + }ARGEND + + data = vtmalloc(blocksize); + if(argc == 0){ + fd = 0; + verifyarena("<stdin>", 0); + threadexitsall(nil); + } + + if((part = initpart(argv[0], OREAD)) == nil) + sysfatal("open partition %s: %r", argv[0]); + fd = part->fd; + offset0 = part->offset; + + if(preadblock(data, 8192, PartBlank) < 0) + sysfatal("read arena part header: %r"); + if(unpackarenapart(&ap, data) < 0) + sysfatal("corrupted arena part header: %r"); + fprint(2, "%T # arena part version=%d blocksize=%d arenabase=%d\n", + ap.version, ap.blocksize, ap.arenabase); + ap.tabbase = (PartBlank+HeadSize+ap.blocksize-1)&~(ap.blocksize-1); + ap.tabsize = ap.arenabase - ap.tabbase; + table = malloc(ap.tabsize+1); + if(preadblock((uchar*)table, ap.tabsize, ap.tabbase) < 0) + sysfatal("reading arena part directory: %r"); + table[ap.tabsize] = 0; + + nline = atoi(table); + p = strchr(table, '\n'); + if(p) + p++; + 
for(i=0; i<nline; i++){ + if(p == nil){ + fprint(2, "%T warning: unexpected arena table end\n"); + break; + } + q = strchr(p, '\n'); + if(q) + *q++ = 0; + if(strlen(p) >= sizeof line){ + fprint(2, "%T warning: long arena table line: %s\n", p); + p = q; + continue; + } + strcpy(line, p); + memset(f, 0, sizeof f); + if(tokenize(line, f, nelem(f)) < 3){ + fprint(2, "%T warning: bad arena table line: %s\n", p); + p = q; + continue; + } + p = q; + if(shouldcheck(f[0], argv+1, argc-1)){ + start = strtoull(f[1], 0, 0); + stop = strtoull(f[2], 0, 0); + if(stop <= start){ + fprint(2, "%T %s: bad start,stop %lld,%lld\n", f[0], stop, start); + continue; + } + if(seek(fd, offset0+start, 0) < 0) + fprint(2, "%T %s: seek to start: %r\n", f[0]); + verifyarena(f[0], stop - start); + } + } + for(i=1; i<argc; i++) + if(argv[i] != 0) + fprint(2, "%T %s: did not find arena\n", argv[i]); + + threadexitsall(nil); +} diff --git a/sys/src/cmd/venti/srv/whack.c b/sys/src/cmd/venti/srv/whack.c new file mode 100755 index 000000000..ecd290339 --- /dev/null +++ b/sys/src/cmd/venti/srv/whack.c @@ -0,0 +1,331 @@ +#include "stdinc.h" +#include "whack.h" + +typedef struct Huff Huff; +int compressblocks = 1; + +enum +{ + MaxFastLen = 9, + BigLenCode = 0x1f4, /* minimum code for large length encoding */ + BigLenBits = 9, + BigLenBase = 4, /* starting items to encode for big lens */ + + MinOffBits = 6, + MaxOffBits = MinOffBits + 8, + + MaxLen = 2051 /* max. 
length encodable in 24 bits */ +}; + +enum +{ + StatBytes, + StatOutBytes, + StatLits, + StatMatches, + StatLitBits, + StatOffBits, + StatLenBits, + + MaxStat +}; + +struct Huff +{ + short bits; /* length of the code */ + ulong encode; /* the code */ +}; + +static Huff lentab[MaxFastLen] = +{ + {2, 0x2}, /* 10 */ + {3, 0x6}, /* 110 */ + {5, 0x1c}, /* 11100 */ + {5, 0x1d}, /* 11101 */ + {6, 0x3c}, /* 111100 */ + {7, 0x7a}, /* 1111010 */ + {7, 0x7b}, /* 1111011 */ + {8, 0xf8}, /* 11111000 */ + {8, 0xf9}, /* 11111001 */ +}; + +static int thwmaxcheck; + +void +whackinit(Whack *tw, int level) +{ + thwmaxcheck = (1 << level); + thwmaxcheck -= thwmaxcheck >> 2; + if(thwmaxcheck < 2) + thwmaxcheck = 2; + else if(thwmaxcheck > 1024) + thwmaxcheck = 1024; + memset(tw, 0, sizeof *tw); + tw->begin = 2 * WhackMaxOff; +} + +/* + * find a string in the dictionary + */ +static int +whackmatch(Whack *b, uchar **ss, uchar *esrc, ulong h, ulong now) +{ + ushort then, off, last; + int bestoff, bestlen, check; + uchar *s, *t; + + s = *ss; + if(esrc < s + MinMatch) + return -1; + if(s + MaxLen < esrc) + esrc = s + MaxLen; + + bestoff = 0; + bestlen = 0; + check = thwmaxcheck; + last = 0; + for(then = b->hash[h]; check-- > 0; then = b->next[then & (WhackMaxOff - 1)]){ + off = now - then; + if(off <= last || off > WhackMaxOff) + break; + + /* + * don't need to check for the end because + * 1) s too close check above + */ + t = s - off; + if(s[0] == t[0] && s[1] == t[1] && s[2] == t[2]){ + if(!bestlen || esrc - s > bestlen && s[bestlen] == t[bestlen]){ + t += 3; + for(s += 3; s < esrc; s++){ + if(*s != *t) + break; + t++; + } + if(s - *ss > bestlen){ + bestlen = s - *ss; + bestoff = off; + if(bestlen > thwmaxcheck) + break; + } + } + } + s = *ss; + last = off; + } + *ss += bestlen; + return bestoff; +} + +/* + * knuth vol. 
3 multiplicative hashing + * each byte x chosen according to rules + * 1/4 < x < 3/10, 1/3 < x < 3/7, 4/7 < x < 2/3, 7/10 < x < 3/4 + * with reasonable spread between the bytes & their complements + * + * the 3 byte value appears to be almost as good as the 4 byte value, + * and might be faster on some machines + */ +/* +#define hashit(c) ((((ulong)(c) * 0x6b43a9) >> (24 - HashLog)) & HashMask) +*/ +#define hashit(c) (((((ulong)(c) & 0xffffff) * 0x6b43a9b5) >> (32 - HashLog)) & HashMask) + +/* + * lz77 compression with single lookup in a hash table for each block + */ +int +whack(Whack *w, uchar *dst, uchar *src, int n, ulong stats[WhackStats]) +{ + uchar *s, *ss, *sss, *esrc, *half, *wdst, *wdmax; + ulong cont, code, wbits; + ushort now; + int toff, lithist, h, len, bits, use, wnbits, lits, matches, offbits, lenbits; + + if(!compressblocks || n < MinMatch) + return -1; + + wdst = dst; + wdmax = dst + n; + + now = w->begin; + s = src; + w->data = s; + + cont = (s[0] << 16) | (s[1] << 8) | s[2]; + + esrc = s + n; + half = s + (n >> 1); + wnbits = 0; + wbits = 0; + lits = 0; + matches = 0; + offbits = 0; + lenbits = 0; + lithist = ~0; + while(s < esrc){ + h = hashit(cont); + + sss = s; + toff = whackmatch(w, &sss, esrc, h, now); + ss = sss; + + len = ss - s; + for(; wnbits >= 8; wnbits -= 8){ + if(wdst >= wdmax){ + w->begin = now; + return -1; + } + *wdst++ = wbits >> (wnbits - 8); + } + if(len < MinMatch){ + toff = *s; + lithist = (lithist << 1) | toff < 32 | toff > 127; + if(lithist & 0x1e){ + wbits = (wbits << 9) | toff; + wnbits += 9; + }else if(lithist & 1){ + toff = (toff + 64) & 0xff; + if(toff < 96){ + wbits = (wbits << 10) | toff; + wnbits += 10; + }else{ + wbits = (wbits << 11) | toff; + wnbits += 11; + } + }else{ + wbits = (wbits << 8) | toff; + wnbits += 8; + } + lits++; + + /* + * speed hack + * check for compression progress, bail if none achieved + */ + if(s > half){ + if(4 * (s - src) < 5 * lits){ + w->begin = now; + return -1; + } + half = esrc; + } 
+ + if(s + MinMatch <= esrc){ + w->next[now & (WhackMaxOff - 1)] = w->hash[h]; + w->hash[h] = now; + if(s + MinMatch < esrc) + cont = (cont << 8) | s[MinMatch]; + } + now++; + s++; + continue; + } + + matches++; + + /* + * length of match + */ + if(len > MaxLen){ + len = MaxLen; + ss = s + len; + } + len -= MinMatch; + if(len < MaxFastLen){ + bits = lentab[len].bits; + wbits = (wbits << bits) | lentab[len].encode; + wnbits += bits; + lenbits += bits; + }else{ + code = BigLenCode; + bits = BigLenBits; + use = BigLenBase; + len -= MaxFastLen; + while(len >= use){ + len -= use; + code = (code + use) << 1; + use <<= (bits & 1) ^ 1; + bits++; + } + + wbits = (wbits << bits) | (code + len); + wnbits += bits; + lenbits += bits; + + for(; wnbits >= 8; wnbits -= 8){ + if(wdst >= wdmax){ + w->begin = now; + return -1; + } + *wdst++ = wbits >> (wnbits - 8); + } + } + + /* + * offset in history + */ + toff--; + for(bits = MinOffBits; toff >= (1 << bits); bits++) + ; + if(bits < MaxOffBits-1){ + wbits = (wbits << 3) | (bits - MinOffBits); + if(bits != MinOffBits) + bits--; + wnbits += bits + 3; + offbits += bits + 3; + }else{ + wbits = (wbits << 4) | 0xe | (bits - (MaxOffBits-1)); + bits--; + wnbits += bits + 4; + offbits += bits + 4; + } + wbits = (wbits << bits) | toff & ((1 << bits) - 1); + + for(; s != ss; s++){ + if(s + MinMatch <= esrc){ + h = hashit(cont); + w->next[now & (WhackMaxOff - 1)] = w->hash[h]; + w->hash[h] = now; + if(s + MinMatch < esrc) + cont = (cont << 8) | s[MinMatch]; + } + now++; + } + } + + w->begin = now; + + stats[StatBytes] += esrc - src; + stats[StatLits] += lits; + stats[StatMatches] += matches; + stats[StatLitBits] += (wdst - (dst + 2)) * 8 + wnbits - offbits - lenbits; + stats[StatOffBits] += offbits; + stats[StatLenBits] += lenbits; + + if(wnbits & 7){ + wbits <<= 8 - (wnbits & 7); + wnbits += 8 - (wnbits & 7); + } + for(; wnbits >= 8; wnbits -= 8){ + if(wdst >= wdmax) + return -1; + *wdst++ = wbits >> (wnbits - 8); + } + + stats[StatOutBytes] 
+= wdst - dst; + + return wdst - dst; +} + +int +whackblock(uchar *dst, uchar *src, int ssize) +{ + Whack w; + ulong stats[MaxStat]; + int r; + + whackinit(&w, 6); + r = whack(&w, dst, src, ssize, stats); + return r; +} diff --git a/sys/src/cmd/venti/srv/whack.h b/sys/src/cmd/venti/srv/whack.h new file mode 100755 index 000000000..fb966169c --- /dev/null +++ b/sys/src/cmd/venti/srv/whack.h @@ -0,0 +1,40 @@ +typedef struct Whack Whack; +typedef struct Unwhack Unwhack; + +enum +{ + WhackStats = 8, + WhackErrLen = 64, /* max length of error message from thwack or unthwack */ + WhackMaxOff = 16*1024, /* max allowed offset */ + + HashLog = 14, + HashSize = 1<<HashLog, + HashMask = HashSize - 1, + + MinMatch = 3, /* shortest match possible */ + + MinDecode = 8, /* minimum bits to decode a match or lit; >= 8 */ + + MaxSeqMask = 8, /* number of bits in coding block mask */ + MaxSeqStart = 256 /* max offset of initial coding block */ +}; + +struct Whack +{ + ushort begin; /* time of first byte in hash */ + ushort hash[HashSize]; + ushort next[WhackMaxOff]; + uchar *data; +}; + +struct Unwhack +{ + char err[WhackErrLen]; +}; + +void whackinit(Whack*, int level); +void unwhackinit(Unwhack*); +int whack(Whack*, uchar *dst, uchar *src, int nsrc, ulong stats[WhackStats]); +int unwhack(Unwhack*, uchar *dst, int ndst, uchar *src, int nsrc); + +int whackblock(uchar *dst, uchar *src, int ssize); diff --git a/sys/src/cmd/venti/srv/wrarena.c b/sys/src/cmd/venti/srv/wrarena.c new file mode 100755 index 000000000..9ead93704 --- /dev/null +++ b/sys/src/cmd/venti/srv/wrarena.c @@ -0,0 +1,225 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +QLock godot; +char *host; +int readonly = 1; /* for part.c */ +int mainstacksize = 256*1024; +Channel *c; +VtConn *z; +int fast; /* and a bit unsafe; only for benchmarking */ +int haveaoffset; +int maxwrites = -1; +int verbose; + +typedef struct ZClump ZClump; +struct ZClump +{ + ZBlock *lump; + Clump cl; + u64int aa; +}; + +void 
+usage(void) +{ + fprint(2, "usage: wrarena [-h host] arenafile [offset]\n"); + threadexitsall("usage"); +} + +void +vtsendthread(void *v) +{ + ZClump zcl; + + USED(v); + while(recv(c, &zcl) == 1){ + if(zcl.lump == nil) + break; + if(vtwrite(z, zcl.cl.info.score, zcl.cl.info.type, zcl.lump->data, zcl.cl.info.uncsize) < 0) + sysfatal("failed writing clump %llud: %r", zcl.aa); + if(verbose) + print("%V\n", zcl.cl.info.score); + freezblock(zcl.lump); + } + /* + * All the send threads try to exit right when + * threadmain is calling threadexitsall. + * Either libthread or the Linux NPTL pthreads library + * can't handle this condition (I suspect NPTL but have + * not confirmed this) and we get a seg fault in exit. + * I spent a day tracking this down with no success, + * so we're going to work around it instead by just + * sitting here and waiting for the threadexitsall to + * take effect. + */ + qlock(&godot); +} + +static void +rdarena(Arena *arena, u64int offset) +{ + int i; + u64int a, aa, e; + uchar score[VtScoreSize]; + Clump cl; + ClumpInfo ci; + ZBlock *lump; + ZClump zcl; + + fprint(2, "wrarena: copying %s to venti\n", arena->name); + printarena(2, arena); + + a = arena->base; + e = arena->base + arena->size; + if(offset != ~(u64int)0) { + if(offset >= e - a) + sysfatal("bad offset %#llx >= %#llx", offset, e - a); + aa = offset; + } else + aa = 0; + + i = 0; + for(a = 0; maxwrites != 0 && i < arena->memstats.clumps; + a += ClumpSize + ci.size){ + if(readclumpinfo(arena, i++, &ci) < 0) + break; + if(a < aa || ci.type == VtCorruptType){ + if(ci.type == VtCorruptType) + fprint(2, "%s: corrupt clump read at %#llx: +%d\n", + argv0, a, ClumpSize+ci.size); + continue; + } + lump = loadclump(arena, a, 0, &cl, score, 0); + if(lump == nil) { + fprint(2, "clump %#llx failed to read: %r\n", a); + continue; + } + if(!fast && cl.info.type != VtCorruptType) { + scoremem(score, lump->data, cl.info.uncsize); + if(scorecmp(cl.info.score, score) != 0) { + fprint(2, "clump %#llx 
has mismatched score\n", + a); + break; + } + if(vttypevalid(cl.info.type) < 0) { + fprint(2, "clump %#llx has bad type %d\n", + a, cl.info.type); + break; + } + } + if(z && cl.info.type != VtCorruptType){ + zcl.cl = cl; + zcl.lump = lump; + zcl.aa = a; + send(c, &zcl); + }else + freezblock(lump); + if(maxwrites > 0) + --maxwrites; + } + if(a > aa) + aa = a; + if(haveaoffset) + print("end offset %#llx\n", aa); +} + +void +threadmain(int argc, char *argv[]) +{ + int i; + char *file; + Arena *arena; + u64int offset, aoffset; + Part *part; + uchar buf[8192]; + ArenaHead head; + ZClump zerocl; + + ventifmtinstall(); + qlock(&godot); + aoffset = 0; + ARGBEGIN{ + case 'f': + fast = 1; + ventidoublechecksha1 = 0; + break; + case 'h': + host = EARGF(usage()); + break; + case 'o': + haveaoffset = 1; + aoffset = strtoull(EARGF(usage()), 0, 0); + break; + case 'M': + maxwrites = atoi(EARGF(usage())); + break; + case 'v': + verbose = 1; + break; + default: + usage(); + break; + }ARGEND + + offset = ~(u64int)0; + switch(argc) { + default: + usage(); + case 2: + offset = strtoull(argv[1], 0, 0); + /* fall through */ + case 1: + file = argv[0]; + } + + fmtinstall('V', vtscorefmt); + + statsinit(); + + part = initpart(file, OREAD); + if(part == nil) + sysfatal("can't open file %s: %r", file); + if(readpart(part, aoffset, buf, sizeof buf) < 0) + sysfatal("can't read file %s: %r", file); + + if(unpackarenahead(&head, buf) < 0) + sysfatal("corrupted arena header: %r"); + + if(aoffset+head.size > part->size) + sysfatal("arena is truncated: want %llud bytes have %llud", + head.size, part->size); + + partblocksize(part, head.blocksize); + initdcache(8 * MaxDiskBlock); + + arena = initarena(part, aoffset, head.size, head.blocksize); + if(arena == nil) + sysfatal("initarena: %r"); + + z = nil; + if(host==nil || strcmp(host, "/dev/null") != 0){ + z = vtdial(host); + if(z == nil) + sysfatal("could not connect to server: %r"); + if(vtconnect(z) < 0) + sysfatal("vtconnect: %r"); + } + + c = 
chancreate(sizeof(ZClump), 0); + for(i=0; i<12; i++) + vtproc(vtsendthread, nil); + + rdarena(arena, offset); + if(vtsync(z) < 0) + sysfatal("executing sync: %r"); + + memset(&zerocl, 0, sizeof zerocl); + for(i=0; i<12; i++) + send(c, &zerocl); + if(z){ + vthangup(z); + } + threadexitsall(0); +} diff --git a/sys/src/cmd/venti/srv/www/stats.html b/sys/src/cmd/venti/srv/www/stats.html new file mode 100755 index 000000000..e7579394e --- /dev/null +++ b/sys/src/cmd/venti/srv/www/stats.html @@ -0,0 +1,33 @@ +<html> + <head> + <base href="/"> + <META http-equiv="Content-Type" content="text/html; charset=utf-8"> + <script language="javascript" src="stats.js"></script> + <script language="javascript" src="status.js"></script> + </head> + <body bgcolor=#ffffff> + + <center> + <b>venti.your-domain.com – venti server statistics</b> + <p> + <a href="javascript:redraw()">redraw</a> + <p> + <table id="statgraphs"> + <tr><td>JavaScript is required to view the graphs. + </table> + <p> + + <font size=-1>the small graphs show the past ten minutes of operation.</font> + <p> + + <tt>http://venti.yourdomain.com:8001/</tt> + <p id="settings">JavaScript is required to change the settings.</p> + <p> + <p id="debug"></p> + </center> + <script language="javascript"> + loadsettings() + redraw() + </script> + </body> +</html> diff --git a/sys/src/cmd/venti/srv/www/stats.js b/sys/src/cmd/venti/srv/www/stats.js new file mode 100755 index 000000000..76e9f276a --- /dev/null +++ b/sys/src/cmd/venti/srv/www/stats.js @@ -0,0 +1,387 @@ + +biggraph = "arg=rpctotal&graph=diff" + +graphname = new Array( + "arg=*&graph=diskbw", + "<b>disk</b> bytes/second", + "arg=*&graph=netbw", + "<b>network</b> bytes/second", + "arg=*&graph=iobw", + "total: <b>disk+net</b> bytes/second", + + "arg=apartreadbyte&graph=diff", + "arena read bytes/second", + "arg=apartwritebyte&graph=diff", + "arena write bytes/second", + + "arg=bloomfalsemiss&graph=pctdiff&arg2=bloomlookup&max=100", + "bloom false hit %", + 
"arg=bloomhit&graph=pctdiff&arg2=bloomlookup&max=100", + "bloom miss %", + "arg=bloomlookuptime&graph=divdiff&arg2=bloomlookup", + "bloom lookup time", + "arg=bloomones&graph=pct&arg2=bloombits&max=100", + "bloom usage %", + + "arg=dcachedirty&graph=pct&arg2=dcachesize&max=100", + "dcache dirty %", + "arg=dcachehit&graph=pctdiff&arg2=dcachelookup&max=100", + "dcache hit %", + "arg=dcachelookuptime&graph=divdiff&arg2=dcachelookup", + "dcache lookup time", + "arg=dcachelookup&graph=diff", + "dcache lookups/second", + "arg=dcachewrite&graph=diff", + "dcache writes/second", + + "arg=icachedirty&graph=pct&arg2=icachesize&max=100", + "icache dirty %", + "arg=icachehit&graph=pctdiff&arg2=icachelookup&max=100", + "icache hit %", + "arg=icachelookuptime&graph=divdiff&arg2=icachelookup", + "icache lookup time", + "arg=icacheprefetch&graph=diff", + "icache prefetches/second", + "arg=icachewrite&graph=diff", + "icache writes/second", + + "arg=isectreadbyte&graph=diff", + "isect read bytes/second", + "arg=isectwritebyte&graph=diff", + "isect write bytes/second", + + "arg=lcachehit&graph=pctdiff&arg2=lcachelookup&max=100", + "lump cache hit %", + "arg=lcachelookuptime&graph=divdiff&arg2=lcachelookup", + "lump cache lookup time", + "arg=lcachewrite&graph=diff", + "lcache writes/second", + + "arg=rpcreadbyte&graph=diff", + "read RPC bytes/second", + "arg=rpctotal&graph=diff", + "RPCs/second", + "arg=rpcwritebyte&graph=diff", + "write RPC bytes/second", + "arg=rpcreadtime&graph=divdiff&arg2=rpcread", + "read RPC time", + "arg=rpcwritetime&graph=divdiff&arg2=rpcwrite", + "write RPC time", + "arg=rpcreadcachedtime&graph=divdiff&arg2=rpcreadcached", + "cached read RPC time", + "arg=rpcreaduncachedtime&graph=divdiff&arg2=rpcreaduncached", + "uncached read RPC time", + "arg=rpcwritenewtime&graph=divdiff&arg2=rpcwritenew", + "fresh write RPC time", + "arg=rpcwriteoldtime&graph=divdiff&arg2=rpcwriteold", + "dup write RPC time", + + "arg=sumreadbyte&graph=diff", + "checksum bytes/second", 
+ + "arg=dblockstall", + "threads stalled: dblock", + "arg=dcachestall", + "threads stalled: dcache", + "arg=icachestall", + "threads stalled: icache", + "arg=lumpstall", + "threads stalled: lump", + + "arg=END" +) + +column0 = new Array( + "column0", + "!bandwidth", + "arg=*&graph=iobw", + "arg=*&graph=netbw", + "arg=rpcreadbyte&graph=diff", + "arg=rpcwritebyte&graph=diff", + "arg=*&graph=diskbw", + "arg=isectreadbyte&graph=diff", + "arg=isectwritebyte&graph=diff", + "arg=apartreadbyte&graph=diff", + "arg=apartwritebyte&graph=diff", + "arg=sumreadbyte&graph=diff", + + "!bloom filter", + "arg=bloomhit&graph=pctdiff&arg2=bloomlookup&max=100", + "arg=bloomfalsemiss&graph=pctdiff&arg2=bloomlookup&max=100", + "arg=bloomones&graph=pct&arg2=bloombits&max=100", + + "END" +) + +column1 = new Array( + "column1", + "!icache", + "arg=icachedirty&graph=pct&arg2=icachesize&max=100", + "arg=icachehit&graph=pctdiff&arg2=icachelookup&max=100", + "arg=icachewrite&graph=diff", + "arg=icacheprefetch&graph=diff", + + "!dcache", + "arg=dcachedirty&graph=pct&arg2=dcachesize&max=100", + "arg=dcachehit&graph=pctdiff&arg2=dcachelookup&max=100", + "arg=dcachelookup&graph=diff", + "arg=dcachewrite&graph=diff", + + "!lump cache", + "arg=lcachehit&graph=pctdiff&arg2=lcachelookup&max=100", + "arg=lcachewrite&graph=diff", + + "END" +) + +column2 = new Array( + "column2", + + "!stalls", + "arg=icachestall", + "arg=dcachestall", + "arg=dblockstall", + "arg=lumpstall", + + "!timings", + "arg=bloomlookuptime&graph=divdiff&arg2=bloomlookup", + "arg=icachelookuptime&graph=divdiff&arg2=icachelookup", + "arg=lcachelookuptime&graph=divdiff&arg2=lcachelookup", + "arg=dcachelookuptime&graph=divdiff&arg2=dcachelookup", + "arg=rpcreadtime&graph=divdiff&arg2=rpcread", + "arg=rpcwritetime&graph=divdiff&arg2=rpcwrite", + "arg=rpcreadcachedtime&graph=divdiff&arg2=rpcreadcached", + "arg=rpcreaduncachedtime&graph=divdiff&arg2=rpcreaduncached", + "arg=rpcwritenewtime&graph=divdiff&arg2=rpcwritenew", + 
"arg=rpcwriteoldtime&graph=divdiff&arg2=rpcwriteold", + + "END" +) + +col0info = new Array(column0.length) +col1info = new Array(column1.length) +col2info = new Array(column2.length) + +function cleardebug() { + var p = document.getElementById("debug") + p.innerHTML = "" +} + +function debug(s) { + var p = document.getElementById("debug") + if(p.innerHTML == "") + p.innerHTML = "<a href=\"javascript:cleardebug()\">clear</a>\n" + p.innerHTML += "<br>"+s +} + +function Ginfo(y, fill, name) { + var g = new Object() + g.y = y + g.fill = fill + g.name = name + return g +} + +function cleartable(t) { + for(var i=t.rows.length-1; i>=0; i--) + t.deleteRow(i) +} + +function textofname(name) +{ + for(var i=0; i<graphname.length; i+=2) + if(name == graphname[i]) + return graphname[i+1] +} + +function graphrow(row, span, name, dt, wid, ht, fill, text) { + var url = "/graph?"+name + url = url+"&min=0" + url = url+"&t0=-"+dt + url = url+"&wid="+wid + url = url+"&ht="+ht + url = url+"&fill="+fill + + var s = "<td colSpan="+span + s = s+" valign=bottom" + s = s+" align=center" + s = s+" width="+wid + s = s+" height="+ht + s = s+" style=\"background-image: url("+url+");\"" + s = s+">"+textofname(name)+text+"</td>" + row.innerHTML = s +} + + +function graphcell(cell, name, dt, wid, ht, fill) { + cell.vAlign = "bottom" + cell.align = "center" + cell.width = wid + cell.height = ht +} + +function redraw() { + redrawgraphs() + redrawsettings() +} + +function redrawgraphs() { + var t = document.getElementById("statgraphs") + + cleartable(t) + for(var i=0; i<4; i++) + t.insertRow(i) + + graphrow(t.rows[0], 3, biggraph, 86400, 900, 30, 0, " – showing 24 hours") + graphrow(t.rows[1], 3, biggraph, 3600, 900, 30, 1, " – showing 1 hour") + t.rows[2].innerHTML = "<td height=10></td>" + + var r = t.rows[3] + graphtable(r.insertCell(0), column0, col0info, 0) + graphtable(r.insertCell(1), column1, col1info, 2) + graphtable(r.insertCell(2), column2, col2info, 4) +} + +function graphtable(bigcell, 
list, infolist, fill) { + bigcell.innerHTML = "<table id=\""+list[0]+"\"></table>" + bigcell.vAlign = "top" + var t = document.getElementById(list[0]) + t.onclick = columnclick + + for(var i=1; i<list.length; i++){ + var r = t.insertRow(t.rows.length) + name = list[i] + infolist[i] = Ginfo(t.offsetHeight, fill, name) + if(name == "END") + break + if(name.substring(0,1) == "!"){ + name = name.substring(1) + if(i > 1){ + r.innerHTML = "<td height=10></td>" + r = t.insertRow(t.rows.length) + } + r.innerHTML = "<td align=center><b>"+name+"</b>" + }else{ + graphrow(r, 1, name, 600, 300, 30, fill++, "") + } + } +} + +function xpos(obj) { + var x = 0 + if(obj.fixedx) + return obj.fixedx + if(obj.offsetParent){ + while(obj.offsetParent){ + x += obj.offsetLeft + obj = obj.offsetParent + } + }else if(obj.x) + x = obj.x + return x +} + +function ypos(obj) { + var y = 0 + if(obj.fixedy) + return obj.fixedy + if(obj.offsetParent){ + while(obj.offsetParent){ + y += obj.offsetTop + obj = obj.offsetParent + } + }else if(obj.y) + y = obj.y + return y +} + +function scrollleft() { + return document.body.scrollLeft +} + +function scrolltop() { + return document.body.scrollTop +} + +function columnclick(e) { + if(e.which && e.which != 1) + return; + var g = findgraph(scrollleft()+e.clientX, scrolltop()+e.clientY) + if(g && g.name.substring(0,1) != "!"){ + biggraph = g.name + var t = document.getElementById("statgraphs") + graphrow(t.rows[0], 3, biggraph, 86400, 900, 30, 0, " – showing 24 hours") + graphrow(t.rows[1], 3, biggraph, 3600, 900, 30, 1, " – showing 1 hour") + } +} + +function findgraph(x, y) { + var g + + if(g = findgraphin(x, y, "column2", col2info)) + return g + if(g = findgraphin(x, y, "column1", col1info)) + return g + if(g = findgraphin(x, y, "column0", col0info)) + return g + return +} + +function findgraphin(x, y, tname, info) { + var t = document.getElementById(tname) + if(x < xpos(t)) + return + y = y - ypos(t) + for(var i=info.length-2; i>=1; i--){ + if(y > 
info[i].y) + return info[i] + } + return +} + +function setof(name, val, list) { + var s = "" + for(var i=0; i<list.length; i++){ + if(val == list[i]) + s = s+" <b>"+val+"</b>" + else + s = s+" <a href=\"javascript:set('"+name+"', '"+list[i]+"')\">"+list[i]+"</a>" + } + return s +} + +function loglinks(list) { + var s = "" + for(var i=0; i<list.length; i++){ + s = s+" <a href=\"/log/"+list[i]+"\">"+list[i]+"</a>" + } + return s +} + +first = 1 +function redrawsettings() { + if(first){ + loadsettings() + first = 0 + } + var s = "" + s = s+"<font size=-1>\n" + s = s+"logging:"+setof("logging", logging, loggingchoices) + s = s+" " + s = s+"stats:"+setof("stats", stats, statschoices) + s = s+"\n<p/>\n" + s = s+"compression:"+setof("compress", compress, compresschoices1) + s = s+"<br>"+setof("compress", compress, compresschoices2) + s = s+"\n<p/>\n" + s = s+"<a href=/index>index</a> | <a href=/storage>storage</a> | " + s = s+"log:"+loglinks(logs) + s = s+"</font>" + document.getElementById("settings").innerHTML = s +} + +function set(name, value) { + eval(name+"= \""+value+"\"") + redrawsettings() + // Works in FireFox, not in Safari + parent.hidden.location.href = "/set/"+name+"/"+value +} diff --git a/sys/src/cmd/venti/srv/www/status.js b/sys/src/cmd/venti/srv/www/status.js new file mode 100755 index 000000000..48e197d8f --- /dev/null +++ b/sys/src/cmd/venti/srv/www/status.js @@ -0,0 +1,16 @@ +function loadsettings() { + logging = "off" + loggingchoices = new Array("0", "1") + + stats = "on" + statschoices = new Array("0", "1") + + compress = "whack" + compresschoices1 = new Array("none", + "flate1", "flate2", "flate3", "flate4", "flate5", + "flate6", "flate7", "flate8", "flate9") + compresschoices2 = new Array("smack1", "smack2", "smack3", "whack") + + logs = new Array("all", "libventi/server", "disk", "lump", "block", "proc", "quiet", "rpc") +} + diff --git a/sys/src/cmd/venti/srv/www/status1.js b/sys/src/cmd/venti/srv/www/status1.js new file mode 100755 index 
000000000..b0e284e3e --- /dev/null +++ b/sys/src/cmd/venti/srv/www/status1.js @@ -0,0 +1,14 @@ +logging = "on" +loggingchoices = new Array("off", "on") + +stats = "on" +statschoices = new Array("off", "on") + +compress = "whack" +compresschoices1 = new Array("none", + "flate1", "flate2", "flate3", "flate4", "flate5", + "flate6", "flate7", "flate8", "flate9") +compresschoices2 = new Array("smack1", "smack2", "smack3", "whack") + +logs = new Array("all", "libventi/server", "disk", "lump", "block", "proc", "quiet", "rpc") + diff --git a/sys/src/cmd/venti/srv/xml.c b/sys/src/cmd/venti/srv/xml.c new file mode 100755 index 000000000..e91afa054 --- /dev/null +++ b/sys/src/cmd/venti/srv/xml.c @@ -0,0 +1,68 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" +#include "xml.h" + +void xmlarena(Hio *hout, Arena *s, char *tag, int indent){ + xmlindent(hout, indent); + hprint(hout, "<%s", tag); + xmlaname(hout, s->name, "name"); + xmlu32int(hout, s->version, "version"); + xmlaname(hout, s->part->name, "partition"); + xmlu32int(hout, s->blocksize, "blocksize"); + xmlu64int(hout, s->base, "start"); + xmlu64int(hout, s->base+2*s->blocksize, "stop"); + xmlu32int(hout, s->ctime, "created"); + xmlu32int(hout, s->wtime, "modified"); + xmlsealed(hout, s->memstats.sealed, "sealed"); + xmlscore(hout, s->score, "score"); + xmlu32int(hout, s->memstats.clumps, "clumps"); + xmlu32int(hout, s->memstats.cclumps, "compressedclumps"); + xmlu64int(hout, s->memstats.uncsize, "data"); + xmlu64int(hout, s->memstats.used - s->memstats.clumps * ClumpSize, "compresseddata"); + xmlu64int(hout, s->memstats.used + s->memstats.clumps * ClumpInfoSize, "storage"); + hprint(hout, "/>\n"); +} + +void xmlindex(Hio *hout, Index *s, char *tag, int indent){ + int i; + xmlindent(hout, indent); + hprint(hout, "<%s", tag); + xmlaname(hout, s->name, "name"); + xmlu32int(hout, s->version, "version"); + xmlu32int(hout, s->blocksize, "blocksize"); + xmlu32int(hout, s->tabsize, "tabsize"); + xmlu32int(hout, 
s->buckets, "buckets"); + xmlu32int(hout, s->div, "buckdiv"); + hprint(hout, ">\n"); + xmlindent(hout, indent + 1); + hprint(hout, "<sects>\n"); + for(i = 0; i < s->nsects; i++) + xmlamap(hout, &s->smap[i], "sect", indent + 2); + xmlindent(hout, indent + 1); + hprint(hout, "</sects>\n"); + xmlindent(hout, indent + 1); + hprint(hout, "<amaps>\n"); + for(i = 0; i < s->narenas; i++) + xmlamap(hout, &s->amap[i], "amap", indent + 2); + xmlindent(hout, indent + 1); + hprint(hout, "</amaps>\n"); + xmlindent(hout, indent + 1); + hprint(hout, "<arenas>\n"); + for(i = 0; i < s->narenas; i++) + xmlarena(hout, s->arenas[i], "arena", indent + 2); + xmlindent(hout, indent + 1); + hprint(hout, "</arenas>\n"); + xmlindent(hout, indent); + hprint(hout, "</%s>\n", tag); +} + +void xmlamap(Hio *hout, AMap *s, char *tag, int indent){ + xmlindent(hout, indent); + hprint(hout, "<%s", tag); + xmlaname(hout, s->name, "name"); + xmlu64int(hout, s->start, "start"); + xmlu64int(hout, s->stop, "stop"); + hprint(hout, "/>\n"); +} + diff --git a/sys/src/cmd/venti/srv/xml.h b/sys/src/cmd/venti/srv/xml.h new file mode 100755 index 000000000..c9e52b0bb --- /dev/null +++ b/sys/src/cmd/venti/srv/xml.h @@ -0,0 +1,11 @@ +void xmlamap(Hio *hout, AMap *v, char *tag, int indent); +void xmlarena(Hio *hout, Arena *v, char *tag, int indent); +void xmlindex(Hio *hout, Index *v, char *tag, int indent); + +void xmlaname(Hio *hout, char *v, char *tag); +void xmlscore(Hio *hout, u8int *v, char *tag); +void xmlsealed(Hio *hout, int v, char *tag); +void xmlu32int(Hio *hout, u32int v, char *tag); +void xmlu64int(Hio *hout, u64int v, char *tag); + +void xmlindent(Hio *hout, int indent); diff --git a/sys/src/cmd/venti/srv/zblock.c b/sys/src/cmd/venti/srv/zblock.c new file mode 100755 index 000000000..afff08010 --- /dev/null +++ b/sys/src/cmd/venti/srv/zblock.c @@ -0,0 +1,96 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +void +fmtzbinit(Fmt *f, ZBlock *b) +{ + memset(f, 0, sizeof *f); +#ifdef PLAN9PORT 
+ fmtlocaleinit(f, nil, nil, nil); +#endif + f->start = b->data; + f->to = f->start; + f->stop = (char*)f->start + b->len; +} + +#define ROUNDUP(p, n) ((void*)(((uintptr)(p)+(n)-1)&~(uintptr)((n)-1))) + +enum { + OverflowCheck = 32 +}; +static char zmagic[] = "1234567890abcdefghijklmnopqrstuvxyz"; + +ZBlock * +alloczblock(u32int size, int zeroed, uint blocksize) +{ + uchar *p, *data; + ZBlock *b; + static ZBlock z; + int n; + + if(blocksize == 0) + blocksize = 32; /* try for cache line alignment */ + + n = size+OverflowCheck+sizeof(ZBlock)+blocksize+8; + p = malloc(n); + if(p == nil){ + seterr(EOk, "out of memory"); + return nil; + } + + data = ROUNDUP(p, blocksize); + b = ROUNDUP(data+size+OverflowCheck, 8); + if(0) fprint(2, "alloc %p-%p data %p-%p b %p-%p\n", + p, p+n, data, data+size, b, b+1); + *b = z; + b->data = data; + b->free = p; + b->len = size; + b->_size = size; + if(zeroed) + memset(b->data, 0, size); + memmove(b->data+size, zmagic, OverflowCheck); + return b; +} + +void +freezblock(ZBlock *b) +{ + if(b){ + if(memcmp(b->data+b->_size, zmagic, OverflowCheck) != 0) + abort(); + memset(b->data+b->_size, 0, OverflowCheck); + free(b->free); + } +} + +ZBlock* +packet2zblock(Packet *p, u32int size) +{ + ZBlock *b; + + if(p == nil) + return nil; + b = alloczblock(size, 0, 0); + if(b == nil) + return nil; + if(packetcopy(p, b->data, 0, size) < 0){ + freezblock(b); + return nil; + } + return b; +} + +Packet* +zblock2packet(ZBlock *zb, u32int size) +{ + Packet *p; + + if(zb == nil) + return nil; + p = packetalloc(); + packetappend(p, zb->data, size); + return p; +} + diff --git a/sys/src/cmd/venti/srv/zeropart.c b/sys/src/cmd/venti/srv/zeropart.c new file mode 100755 index 000000000..97d6038ee --- /dev/null +++ b/sys/src/cmd/venti/srv/zeropart.c @@ -0,0 +1,30 @@ +#include "stdinc.h" +#include "dat.h" +#include "fns.h" + +void +zeropart(Part *part, int blocksize) +{ + ZBlock *b; + u64int addr; + int w; + + fprint(2, "clearing %s\n", part->name); + b = 
alloczblock(MaxIoSize, 1, blocksize); + if(b == nil) /* allocation failed; b->data below would fault */ + sysfatal("can't allocate buffer to clear %s: %r", part->name); + + w = 0; + for(addr = PartBlank; addr + MaxIoSize <= part->size; addr += MaxIoSize){ + if(writepart(part, addr, b->data, MaxIoSize) < 0) + sysfatal("can't initialize %s, writing block %d failed: %r", part->name, w); + w++; + } + + for(; addr + blocksize <= part->size; addr += blocksize) + if(writepart(part, addr, b->data, blocksize) < 0) + sysfatal("can't initialize %s: %r", part->name); + + if(flushpart(part) < 0) + sysfatal("can't flush writes to %s: %r", part->name); + + freezblock(b); +} diff --git a/sys/src/cmd/venti/sync.c b/sys/src/cmd/venti/sync.c new file mode 100755 index 000000000..9d817a72e --- /dev/null +++ b/sys/src/cmd/venti/sync.c @@ -0,0 +1,54 @@ +#include <u.h> +#include <libc.h> +#include <thread.h> +#include <venti.h> + +char *host; +int donothing; + +void +usage(void) +{ + fprint(2, "usage: sync [-h host]\n"); + threadexitsall("usage"); +} + +void +threadmain(int argc, char *argv[]) +{ + VtConn *z; + + fmtinstall('V', vtscorefmt); + fmtinstall('F', vtfcallfmt); + + ARGBEGIN{ + case 'h': + host = EARGF(usage()); + if(host == nil) + usage(); + break; + case 'x': + donothing = 1; + break; + default: + usage(); + break; + }ARGEND + + if(argc != 0) + usage(); + + z = vtdial(host); + if(z == nil) + sysfatal("could not connect to server: %r"); + + if(vtconnect(z) < 0) + sysfatal("vtconnect: %r"); + + if(!donothing) + if(vtsync(z) < 0) + sysfatal("vtsync: %r"); + + vthangup(z); + threadexitsall(0); +} diff --git a/sys/src/cmd/venti/words/backup.example b/sys/src/cmd/venti/words/backup.example new file mode 100755 index 000000000..5d0f49e3c --- /dev/null +++ b/sys/src/cmd/venti/words/backup.example @@ -0,0 +1,26 @@ +#!/bin/rc + +rfork e +cd /usr/rsc +. bkup.info +fn x { + echo x $* + y=$1 + if(~ $#$y 0){ + $y=0 + } + echo venti/wrarena -o $2 $3 $$y + end=`{venti/wrarena -o $2 $3 $$y | grep '^end offset ' | sed 's/^end offset //'} + if(~ $#end 1 && ! 
~ $$y $end){ + $y=$end + echo '#' `{date} >>bkup.info + whatis $y >>bkup.info + } +} +hget http://127.1:8000/index | +awk ' +/^index=/ { blockSize=0+substr($3, 11); } +/^arena=/ { arena=substr($1, 7); } +/^ arena=/ { start=0+substr($5, 2)-blockSize; printf("x %s %d %s\n", arena, start, $3); } +' |rc + diff --git a/sys/src/cmd/venti/words/dumpvacroots b/sys/src/cmd/venti/words/dumpvacroots new file mode 100755 index 000000000..0b38172d5 --- /dev/null +++ b/sys/src/cmd/venti/words/dumpvacroots @@ -0,0 +1,21 @@ +#!/bin/rc +# dumpvacroots - dumps all the vac scores ever stored to the venti server +# if nothing else, this illustrates that you have to control access +# to the physical disks storing the archive! + +ventihttp=`{ + echo $venti | sed 's/^[a-z]+!([0-9\.]+)![a-z0-9]+$/\1/ + s/^[a-z]+!([0-9\.]+)/\1/; s/$/:8000/' +} + +hget http://$ventihttp/index | + awk ' + /^index=/ { blockSize = 0 + substr($3, 11) } + /^arena=/ { arena = substr($1, 7) } + /^ arena=/ { + start = (0 + substr($5, 2)) - blockSize + printf("venti/printarena -o %.0f %s\n", start, $3 "") + } + ' | + rc | + awk '$3 == 16 { printf("vac:%s\n", $2 "") }' diff --git a/sys/src/cmd/venti/words/notes b/sys/src/cmd/venti/words/notes new file mode 100755 index 000000000..024fae8c0 --- /dev/null +++ b/sys/src/cmd/venti/words/notes @@ -0,0 +1,149 @@ +all data is big-endian on disk. + +arena layout: + +ArenaPart (first at offset PartBlank = 256kB in the disk file) + magic[4] 0xA9E4A5E7 + version[4] 3 + blockSize[4] + arenaBase[4] offset of first ArenaHead structure in the disk file + +the ArenaMap starts at the first block at offset >= PartBlank+512 bytes. +it is a sequence of text lines +/* + * amap: n '\n' amapelem * n + * n: u32int + * amapelem: name '\t' astart '\t' astop '\n' + * astart, astop: u64int + */ + +the astart and astop are byte offsets in the disk file. +they are the offsets to the ArenaHead and the end of the Arena block. 
+ +ArenaHead +[base points here in the C code] +size bytes + Clumps + ClumpInfo blocks +Arena + +Arena + magic[4] 0xF2A14EAD + version[4] 4 + name[64] + clumps[4] + cclumps[4] + ctime[4] + wtime[4] + used[8] + uncsize[8] + sealed[1] + optional score[20] + +once sealed, a sha1 checksum is computed over every block from the +ArenaHead to the Arena, as though +the final score in Arena were the zeroScore. strangely, +the tail of the Arena block (the last one) is not included in the checksum +(i.e., the unused data after the score). + +clumpMax = blocksize/ClumpInfoSize = blocksize/25 +dirsize = ((clumps/clumpMax)+1) * blocksize +want used+dirsize <= size +want cclumps <= clumps +want uncsize+clumps*ClumpSize+blocksize < used +want ctime <= wtime + +clump info is stored packed into blocks in order. +clump info moves forward through a block but the +blocks themselves move backwards. so if cm=clumpMax +and there are two blocks' worth of clumpinfo, the blocks +look like: + + [cm..2*cm-1] [0..cm-1] [Arena] + +with the blocks pushed right up against the Arena trailer. + +ArenaHead + magic[4] 0xD15C4EAD + version[4] = Arena.version + name[64] + blockSize[4] + size[8] + +Clump + magic[4] 0xD15CB10C (0 for an unused clump) + type[1] + size[2] + uncsize[2] + score[20] + encoding[1] raw=1, compress=2 + creator[4] + time[4] + +ClumpInfo + type[1] + size[2] + uncsize[2] + score[20] + +the arenas are mapped into a single address space corresponding +to the index that brings them together. if each arena has 100M bytes +excluding the headers and there are 4 arenas, then there's 400M of +index address space between them. index address space starts at 1M +instead of 0, so the index addresses assigned to the first arena are +1M up to 101M, then 101M to 201M, etc. + +of course, the assignment of addresses has nothing to do with the index, +but that's what they're called. + + +the index is split into index sections, which are put on different disks +to get parallelism of disk heads. 
each index section holds some number +of hash buckets, each in its own disk block. collectively the index sections +hold ix->buckets between them. + +the top 32 bits of the score are used to assign scores to buckets. +div = ceil(2³² / ix->buckets) is the amount of 32-bit score space per bucket. + +to look up a block, take the top 32 bits of score and divide by div +to get the bucket number. then look through the index section headers +to figure out which index section has that bucket. + +then load that block from the index section. it's an IBucket. + +the IBucket has ib.n IEntry structures in it, sorted by score and then by type. +do the lookup and get an IEntry. the ia.addr will be a logical address +that you then use to get the clump from the arenas (see the note on index address space above). + +ISect + magic[4] 0xD15C5EC7 + version[4] + name[64] + index[64] + blockSize[4] + blockBase[4] address in partition where bucket blocks start + blocks[4] + start[4] + stop[4] stop - start <= blocks, but not necessarily == + +IEntry + score[20] + wtime[4] + train[2] + ia.addr[8] index address (see note above) + ia.size[2] size of uncompressed block data + ia.type[1] + ia.blocks[1] number of blocks of clump on disk + +IBucket + n[2] + next[4] not sure; either 0 or inside [start,stop) for the ISect + data[n*IEntrySize] + +final piece: all the disk partitions start with PartBlank=256kB of unused disk +(presumably to avoid problems with boot sectors and layout tables +and the like). + +actually the last 8k of the 256k (that is, at offset 248kB) can hold +a venti config file to help during bootstrap of the venti file server. + diff --git a/sys/src/cmd/venti/words/venti.conf b/sys/src/cmd/venti/words/venti.conf new file mode 100755 index 000000000..03775ea52 --- /dev/null +++ b/sys/src/cmd/venti/words/venti.conf @@ -0,0 +1,20 @@ +# a sample venti configuration file +# +# formatted with +# venti/fmtarenas arena. 
/tmp/disks/arenas +# venti/fmtisect isect0 /tmp/disks/isect0 +# venti/fmtisect isect1 /tmp/disks/isect1 +# venti/fmtindex venti.conf +# +# server is started with +# venti/venti + +# the name of the index +index main + +# the index sections +isect /tmp/disks/isect0 +isect /tmp/disks/isect1 + +# the arenas +arenas /tmp/disks/arenas diff --git a/sys/src/cmd/venti/words/wrtape b/sys/src/cmd/venti/words/wrtape new file mode 100755 index 000000000..7e9490ede --- /dev/null +++ b/sys/src/cmd/venti/words/wrtape @@ -0,0 +1,21 @@ +#!/bin/rc + +tape=$1 + +start=`{echo $tape'*32+1' | hoc} +end=`{echo $start'+31' | hoc} + +echo rewind | scuzz /dev/sd03 + +arenas=`{hget http://iolaire/index | grep '^arena' | sed -n $start,$end^p | sed 's/^.*=//' | sed 's/ .*//'} +for(i in $arenas) { + dev=`{hget http://iolaire/index | grep ''''$i'''' | sed 's/.* on //' | sed 's/ .*//'} + echo `{date} $tape $i + echo `{date} $tape $i >> /sys/log/ventibackup + echo $dev + echo write '''|venti/rdarena $dev $i''' | scuzz -m 8192 /dev/sd03 + echo filemark | scuzz -m 6144 /dev/sd03 +} + +echo rewind | scuzz /dev/sd03 + diff --git a/sys/src/cmd/venti/write.c b/sys/src/cmd/venti/write.c new file mode 100755 index 000000000..c11a5a314 --- /dev/null +++ b/sys/src/cmd/venti/write.c @@ -0,0 +1,62 @@ +#include <u.h> +#include <libc.h> +#include <venti.h> +#include <libsec.h> +#include <thread.h> + +void +usage(void) +{ + fprint(2, "usage: write [-z] [-h host] [-t type] <datablock\n"); + threadexitsall("usage"); +} + +void +threadmain(int argc, char *argv[]) +{ + char *host; + int dotrunc, n, type; + uchar *p, score[VtScoreSize]; + VtConn *z; + + fmtinstall('F', vtfcallfmt); + fmtinstall('V', vtscorefmt); + + host = nil; + dotrunc = 0; + type = VtDataType; + ARGBEGIN{ + case 'z': + dotrunc = 1; + break; + case 'h': + host = EARGF(usage()); + break; + case 't': + type = atoi(EARGF(usage())); + break; + default: + usage(); + break; + }ARGEND + + if(argc != 0) + usage(); + + p = vtmallocz(VtMaxLumpSize+1); + n = 
readn(0, p, VtMaxLumpSize+1); + if(n > VtMaxLumpSize) + sysfatal("input too big: max block size is %d", VtMaxLumpSize); + z = vtdial(host); + if(z == nil) + sysfatal("could not connect to server: %r"); + if(vtconnect(z) < 0) + sysfatal("vtconnect: %r"); + if(dotrunc) + n = vtzerotruncate(type, p, n); + if(vtwrite(z, score, type, p, n) < 0) + sysfatal("vtwrite: %r"); + vthangup(z); + print("%V\n", score); + threadexitsall(0); +} |