summaryrefslogtreecommitdiff
path: root/sys/src/cmd/venti
diff options
context:
space:
mode:
authorTaru Karttunen <taruti@taruti.net>2011-03-30 15:46:40 +0300
committerTaru Karttunen <taruti@taruti.net>2011-03-30 15:46:40 +0300
commite5888a1ffdae813d7575f5fb02275c6bb07e5199 (patch)
treed8d51eac403f07814b9e936eed0c9a79195e2450 /sys/src/cmd/venti
Import sources from 2011-03-30 iso image
Diffstat (limited to 'sys/src/cmd/venti')
-rwxr-xr-xsys/src/cmd/venti/copy.c262
-rwxr-xr-xsys/src/cmd/venti/devnull.c79
-rwxr-xr-xsys/src/cmd/venti/mkfile28
-rwxr-xr-xsys/src/cmd/venti/mkroot.c61
-rwxr-xr-xsys/src/cmd/venti/randtest.c335
-rwxr-xr-xsys/src/cmd/venti/read.c74
-rwxr-xr-xsys/src/cmd/venti/readlist.c112
-rwxr-xr-xsys/src/cmd/venti/ro.c112
-rwxr-xr-xsys/src/cmd/venti/root.c72
-rwxr-xr-xsys/src/cmd/venti/srv/arena.c931
-rwxr-xr-xsys/src/cmd/venti/srv/arenas.c420
-rwxr-xr-xsys/src/cmd/venti/srv/bloom.c256
-rwxr-xr-xsys/src/cmd/venti/srv/buildbuck.c132
-rwxr-xr-xsys/src/cmd/venti/srv/buildindex.c966
-rwxr-xr-xsys/src/cmd/venti/srv/checkarenas.c139
-rwxr-xr-xsys/src/cmd/venti/srv/checkindex.c295
-rwxr-xr-xsys/src/cmd/venti/srv/clump.c225
-rwxr-xr-xsys/src/cmd/venti/srv/clumpstats.c127
-rwxr-xr-xsys/src/cmd/venti/srv/cmparenas.c317
-rwxr-xr-xsys/src/cmd/venti/srv/conf.rc67
-rwxr-xr-xsys/src/cmd/venti/srv/config.c253
-rwxr-xr-xsys/src/cmd/venti/srv/conv.c730
-rwxr-xr-xsys/src/cmd/venti/srv/dat.h758
-rwxr-xr-xsys/src/cmd/venti/srv/dcache.c712
-rwxr-xr-xsys/src/cmd/venti/srv/disksched.c89
-rwxr-xr-xsys/src/cmd/venti/srv/dump.c47
-rwxr-xr-xsys/src/cmd/venti/srv/findscore.c122
-rwxr-xr-xsys/src/cmd/venti/srv/fixarenas.c1914
-rwxr-xr-xsys/src/cmd/venti/srv/fmtarenas.c132
-rwxr-xr-xsys/src/cmd/venti/srv/fmtbloom.c116
-rwxr-xr-xsys/src/cmd/venti/srv/fmtindex.c120
-rwxr-xr-xsys/src/cmd/venti/srv/fmtisect.c83
-rwxr-xr-xsys/src/cmd/venti/srv/fns.h228
-rwxr-xr-xsys/src/cmd/venti/srv/graph.c197
-rwxr-xr-xsys/src/cmd/venti/srv/hdisk.c696
-rwxr-xr-xsys/src/cmd/venti/srv/hproc.c674
-rwxr-xr-xsys/src/cmd/venti/srv/httpd.c1177
-rwxr-xr-xsys/src/cmd/venti/srv/icache.c571
-rwxr-xr-xsys/src/cmd/venti/srv/icachewrite.c358
-rwxr-xr-xsys/src/cmd/venti/srv/ifile.c149
-rwxr-xr-xsys/src/cmd/venti/srv/index.c866
-rwxr-xr-xsys/src/cmd/venti/srv/lump.c240
-rwxr-xr-xsys/src/cmd/venti/srv/lumpcache.c429
-rwxr-xr-xsys/src/cmd/venti/srv/lumpqueue.c171
-rwxr-xr-xsys/src/cmd/venti/srv/mirrorarenas.c523
-rwxr-xr-xsys/src/cmd/venti/srv/mkfile101
-rwxr-xr-xsys/src/cmd/venti/srv/part.c249
-rwxr-xr-xsys/src/cmd/venti/srv/png.c239
-rwxr-xr-xsys/src/cmd/venti/srv/printarena.c126
-rwxr-xr-xsys/src/cmd/venti/srv/printarenapart.c155
-rwxr-xr-xsys/src/cmd/venti/srv/printarenas.c113
-rwxr-xr-xsys/src/cmd/venti/srv/printindex.c99
-rwxr-xr-xsys/src/cmd/venti/srv/printmap.c42
-rwxr-xr-xsys/src/cmd/venti/srv/rdarena.c96
-rwxr-xr-xsys/src/cmd/venti/srv/readifile.c29
-rwxr-xr-xsys/src/cmd/venti/srv/reseal.c303
-rwxr-xr-xsys/src/cmd/venti/srv/round.c102
-rwxr-xr-xsys/src/cmd/venti/srv/score.c46
-rwxr-xr-xsys/src/cmd/venti/srv/sortientry.c365
-rwxr-xr-xsys/src/cmd/venti/srv/stats.c212
-rwxr-xr-xsys/src/cmd/venti/srv/stdinc.h9
-rwxr-xr-xsys/src/cmd/venti/srv/syncarena.c174
-rwxr-xr-xsys/src/cmd/venti/srv/syncindex.c64
-rwxr-xr-xsys/src/cmd/venti/srv/syncindex0.c93
-rwxr-xr-xsys/src/cmd/venti/srv/trace.c39
-rwxr-xr-xsys/src/cmd/venti/srv/unittoull.c30
-rwxr-xr-xsys/src/cmd/venti/srv/unwhack.c179
-rwxr-xr-xsys/src/cmd/venti/srv/utils.c259
-rwxr-xr-xsys/src/cmd/venti/srv/venti.c428
-rwxr-xr-xsys/src/cmd/venti/srv/verifyarena.c266
-rwxr-xr-xsys/src/cmd/venti/srv/whack.c331
-rwxr-xr-xsys/src/cmd/venti/srv/whack.h40
-rwxr-xr-xsys/src/cmd/venti/srv/wrarena.c225
-rwxr-xr-xsys/src/cmd/venti/srv/www/stats.html33
-rwxr-xr-xsys/src/cmd/venti/srv/www/stats.js387
-rwxr-xr-xsys/src/cmd/venti/srv/www/status.js16
-rwxr-xr-xsys/src/cmd/venti/srv/www/status1.js14
-rwxr-xr-xsys/src/cmd/venti/srv/xml.c68
-rwxr-xr-xsys/src/cmd/venti/srv/xml.h11
-rwxr-xr-xsys/src/cmd/venti/srv/zblock.c96
-rwxr-xr-xsys/src/cmd/venti/srv/zeropart.c30
-rwxr-xr-xsys/src/cmd/venti/sync.c54
-rwxr-xr-xsys/src/cmd/venti/words/backup.example26
-rwxr-xr-xsys/src/cmd/venti/words/dumpvacroots21
-rwxr-xr-xsys/src/cmd/venti/words/notes149
-rwxr-xr-xsys/src/cmd/venti/words/venti.conf20
-rwxr-xr-xsys/src/cmd/venti/words/wrtape21
-rwxr-xr-xsys/src/cmd/venti/write.c62
88 files changed, 21787 insertions, 0 deletions
diff --git a/sys/src/cmd/venti/copy.c b/sys/src/cmd/venti/copy.c
new file mode 100755
index 000000000..db07dcb9d
--- /dev/null
+++ b/sys/src/cmd/venti/copy.c
@@ -0,0 +1,262 @@
+#include <u.h>
+#include <libc.h>
+#include <venti.h>
+#include <libsec.h>
+#include <avl.h>
+#include <bin.h>
+
+/* Option flags and counters shared by main() and walk(). */
+int changes;	/* pointers replaced by the zero score (walk, -r only) */
+int rewrite;	/* -r: replace unreadable blocks with the zero score */
+int ignoreerrors;	/* -i: do not abort on source read errors */
+int fast;	/* -f: skip blocks the destination can already read */
+int verbose;	/* -v: report skipped blocks and final counts */
+int nskip;	/* blocks skipped because they were already visited */
+int nwrite;	/* blocks written to the destination */
+
+VtConn *zsrc, *zdst;	/* source and destination venti connections */
+uchar zeroscore[VtScoreSize]; /* all zeros */
+
+/*
+ * One (score, type) pair remembered in the visited set.
+ * avl is the first member: scoretreecmp casts an Avl* straight
+ * back to a ScoreTree*.
+ */
+typedef struct ScoreTree ScoreTree;
+struct ScoreTree
+{
+ Avl avl;
+ uchar score[VtScoreSize];
+ int type;
+};
+
+Avltree *scoretree;
+Bin *scorebin;
+
+/*
+ * Ordering callback for the visited tree: compare the score bytes
+ * first and break ties on the block type.
+ */
+static int
+scoretreecmp(Avl *va, Avl *vb)
+{
+	ScoreTree *x, *y;
+	int d;
+
+	x = (ScoreTree*)va;
+	y = (ScoreTree*)vb;
+	d = memcmp(x->score, y->score, VtScoreSize);
+	if(d == 0)
+		d = x->type - y->type;
+	return d;
+}
+
+/*
+ * Return non-zero if (score, type) is already in the visited tree.
+ * Always returns 0 when no tree exists (scoretree is nil).
+ */
+static int
+havevisited(uchar score[VtScoreSize], int type)
+{
+	ScoreTree key;
+
+	if(scoretree != nil){
+		/* build a stack-allocated key just for the lookup */
+		memmove(key.score, score, VtScoreSize);
+		key.type = type;
+		if(lookupavl(scoretree, &key.avl) != nil)
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * Record (score, type) in the visited tree.  Does nothing when no
+ * tree exists.  Nodes are carved out of scorebin; the entry that
+ * insertavl hands back through its last argument is not used.
+ */
+static void
+markvisited(uchar score[VtScoreSize], int type)
+{
+	ScoreTree *node;
+	Avl *prev;
+
+	if(scoretree != nil){
+		node = binalloc(&scorebin, sizeof *node, 1);
+		memmove(node->score, score, VtScoreSize);
+		node->type = type;
+		insertavl(scoretree, &node->avl, &prev);
+	}
+}
+
+/* Print the command synopsis on fd 2 and exit with status "usage". */
+void
+usage(void)
+{
+ fprint(2, "usage: %s [-fimrv] [-t type] srchost dsthost score\n", argv0);
+ exits("usage");
+}
+
+/*
+ * Copy the block (score, type) from zsrc to zdst after first
+ * recursing into every block it refers to.
+ *
+ * score is the block to copy.  With -r (rewrite) it is overwritten
+ *	with the zero score when the block cannot be read, so the
+ *	caller's copy of the pointer is patched in place.
+ * type is the venti block type; it selects how children are found.
+ * base is handed on unchanged when recursing through pointer blocks
+ *	and is not otherwise used here.
+ *
+ * Zero scores and blocks already in the visited set are skipped.
+ * With -f a block the destination can already read is skipped too.
+ * Write failures and score mismatches (when not rewriting) are fatal.
+ */
+void
+walk(uchar score[VtScoreSize], uint type, int base)
+{
+	int i, n;
+	uchar *buf;
+	uchar nscore[VtScoreSize];
+	VtEntry e;
+	VtRoot root;
+
+	if(memcmp(score, vtzeroscore, VtScoreSize) == 0 || memcmp(score, zeroscore, VtScoreSize) == 0)
+		return;
+
+	if(havevisited(score, type)){
+		nskip++;
+		return;
+	}
+
+	buf = vtmallocz(VtMaxLumpSize);
+	if(fast && vtread(zdst, score, type, buf, VtMaxLumpSize) >= 0){
+		if(verbose)
+			fprint(2, "skip %V\n", score);
+		free(buf);
+		return;
+	}
+
+	n = vtread(zsrc, score, type, buf, VtMaxLumpSize);
+	if(n < 0){
+		if(rewrite){
+			changes++;
+			memmove(score, vtzeroscore, VtScoreSize);
+		}else if(!ignoreerrors)
+			sysfatal("reading block %V (type %d): %r", score, type);
+		/* release the lump buffer; it used to leak on this return */
+		free(buf);
+		return;
+	}
+
+	switch(type){
+	case VtRootType:
+		if(vtrootunpack(&root, buf) < 0){
+			fprint(2, "warning: could not unpack root in %V %d\n", score, type);
+			break;
+		}
+		walk(root.prev, VtRootType, 0);
+		walk(root.score, VtDirType, 0);
+		if(rewrite)
+			vtrootpack(&root, buf);	/* walk might have changed score */
+		break;
+
+	case VtDirType:
+		for(i=0; i<n/VtEntrySize; i++){
+			if(vtentryunpack(&e, buf, i) < 0){
+				fprint(2, "warning: could not unpack entry #%d in %V %d\n", i, score, type);
+				continue;
+			}
+			if(!(e.flags & VtEntryActive))
+				continue;
+			walk(e.score, e.type, e.type&VtTypeBaseMask);
+			/*
+			 * Don't repack unless we're rewriting -- some old
+			 * vac files have psize==0 and dsize==0, and these
+			 * get rewritten by vtentryunpack to have less strange
+			 * block sizes.  So vtentryunpack; vtentrypack does not
+			 * guarantee to preserve the exact bytes in buf.
+			 */
+			if(rewrite)
+				vtentrypack(&e, buf, i);
+		}
+		break;
+
+	case VtDataType:
+		break;
+
+	default:	/* pointers */
+		/* each non-zero score names a child one level down */
+		for(i=0; i<n; i+=VtScoreSize)
+			if(memcmp(buf+i, vtzeroscore, VtScoreSize) != 0)
+				walk(buf+i, type-1, base);
+		break;
+	}
+
+	nwrite++;
+	if(vtwrite(zdst, nscore, type, buf, n) < 0){
+		/* figure out score for better error message */
+		/* can't use input argument - might have changed contents */
+		n = vtzerotruncate(type, buf, n);
+		sha1(buf, n, score, nil);
+		sysfatal("writing block %V (type %d): %r", score, type);
+	}
+	if(!rewrite && memcmp(score, nscore, VtScoreSize) != 0){
+		fprint(2, "not rewriting: wrote %V got %V\n", score, nscore);
+		abort();
+		sysfatal("not rewriting: wrote %V got %V", score, nscore);
+	}
+
+	markvisited(score, type);
+	free(buf);
+}
+
+/*
+ * venti/copy: copy the tree rooted at a score from one venti server
+ * to another.  Arguments after the options are srchost, dsthost and
+ * the score.  -i and -r are mutually exclusive.
+ */
+void
+main(int argc, char *argv[])
+{
+ int type, n;
+ uchar score[VtScoreSize];
+ uchar *buf;
+ char *prefix;
+
+ fmtinstall('F', vtfcallfmt);
+ fmtinstall('V', vtscorefmt);
+
+ type = -1;
+ ARGBEGIN{
+ case 'V':
+ chattyventi++;
+ break;
+ case 'f':
+ fast = 1;
+ break;
+ case 'i':
+ if(rewrite)
+ usage();
+ ignoreerrors = 1;
+ break;
+ case 'm':
+ /* creating the tree is what enables havevisited/markvisited */
+ scoretree = mkavltree(scoretreecmp);
+ break;
+ case 'r':
+ if(ignoreerrors)
+ usage();
+ rewrite = 1;
+ break;
+ case 't':
+ type = atoi(EARGF(usage()));
+ break;
+ case 'v':
+ verbose = 1;
+ break;
+ default:
+ usage();
+ break;
+ }ARGEND
+
+ if(argc != 3)
+ usage();
+
+ if(vtparsescore(argv[2], &prefix, score) < 0)
+ sysfatal("could not parse score: %r");
+
+ buf = vtmallocz(VtMaxLumpSize);
+
+ zsrc = vtdial(argv[0]);
+ if(zsrc == nil)
+ sysfatal("could not dial src server: %r");
+ if(vtconnect(zsrc) < 0)
+ sysfatal("vtconnect src: %r");
+
+ zdst = vtdial(argv[1]);
+ if(zdst == nil)
+ sysfatal("could not dial dst server: %r");
+ if(vtconnect(zdst) < 0)
+ sysfatal("vtconnect dst: %r");
+
+ /*
+ * The reads below only establish the block type; walk reads the
+ * block again itself.  Without -t, try each type in turn.
+ */
+ if(type != -1){
+ n = vtread(zsrc, score, type, buf, VtMaxLumpSize);
+ if(n < 0)
+ sysfatal("could not read block: %r");
+ }else{
+ for(type=0; type<VtMaxType; type++){
+ n = vtread(zsrc, score, type, buf, VtMaxLumpSize);
+ if(n >= 0)
+ break;
+ }
+ if(type == VtMaxType)
+ sysfatal("could not find block %V of any type", score);
+ }
+
+ walk(score, type, VtDirType);
+ /* NOTE(review): prefix comes from vtparsescore; confirm it cannot be nil here */
+ if(changes)
+ print("%s:%V (%d pointers rewritten)\n", prefix, score, changes);
+
+ if(verbose)
+ print("%d skipped, %d written\n", nskip, nwrite);
+
+ if(vtsync(zdst) < 0)
+ sysfatal("could not sync dst server: %r");
+
+ exits(0);
+}
diff --git a/sys/src/cmd/venti/devnull.c b/sys/src/cmd/venti/devnull.c
new file mode 100755
index 000000000..fdad553e6
--- /dev/null
+++ b/sys/src/cmd/venti/devnull.c
@@ -0,0 +1,79 @@
+/* Copyright (c) 2004 Russ Cox */
+#include <u.h>
+#include <libc.h>
+#include <venti.h>
+#include <thread.h>
+#include <libsec.h>
+
+#ifndef _UNISTD_H_
+#pragma varargck type "F" VtFcall*
+#pragma varargck type "T" void
+#endif
+
+int verbose;
+
+/* Stack size constant; not referenced anywhere in this file. */
+enum
+{
+ STACK = 8192
+};
+
+/* Print the command synopsis on fd 2 and exit all threads with status "usage". */
+void
+usage(void)
+{
+ fprint(2, "usage: venti/devnull [-v] [-a address]\n");
+ threadexitsall("usage");
+}
+
+/*
+ * venti/devnull: a venti server that stores nothing.
+ * Listens on address (default tcp!*!venti, -a overrides).  Every read
+ * is answered with the error "no such block"; every write is answered
+ * with the score packetsha1 computes for the data; ping, goodbye and
+ * sync get the default reply.  -v traces requests and replies on fd 2.
+ */
+void
+threadmain(int argc, char **argv)
+{
+ VtReq *r;
+ VtSrv *srv;
+ char *address;
+
+ fmtinstall('V', vtscorefmt);
+ fmtinstall('F', vtfcallfmt);
+
+ address = "tcp!*!venti";
+
+ ARGBEGIN{
+ case 'v':
+ verbose++;
+ break;
+ case 'a':
+ address = EARGF(usage());
+ break;
+ default:
+ usage();
+ }ARGEND
+
+ srv = vtlisten(address);
+ if(srv == nil)
+ sysfatal("vtlisten %s: %r", address);
+
+ while((r = vtgetreq(srv)) != nil){
+ /* default reply type is the request type plus one */
+ r->rx.msgtype = r->tx.msgtype+1;
+ if(verbose)
+ fprint(2, "<- %F\n", &r->tx);
+ switch(r->tx.msgtype){
+ case VtTping:
+ break;
+ case VtTgoodbye:
+ break;
+ case VtTread:
+ r->rx.error = vtstrdup("no such block");
+ r->rx.msgtype = VtRerror;
+ break;
+ case VtTwrite:
+ /* the data itself is not kept anywhere */
+ packetsha1(r->tx.data, r->rx.score);
+ break;
+ case VtTsync:
+ break;
+ }
+ if(verbose)
+ fprint(2, "-> %F\n", &r->rx);
+ vtrespond(r);
+ }
+ threadexitsall(nil);
+}
+
diff --git a/sys/src/cmd/venti/mkfile b/sys/src/cmd/venti/mkfile
new file mode 100755
index 000000000..0c1ed7f9a
--- /dev/null
+++ b/sys/src/cmd/venti/mkfile
@@ -0,0 +1,28 @@
+</$objtype/mkfile
+
+TARG=\
+ copy\
+ read\
+ ro\
+ sync\
+ write\
+
+
+BIN=/$objtype/bin/venti
+
+</sys/src/cmd/mkmany
+
+CFLAGS=$CFLAGS -I.
+
+extra:V: $O.devnull $O.mkroot $O.randtest $O.readlist $O.root
+
+all:V: srv.all.dir
+install:V: srv.install.dir
+installall:V: srv.installall.dir
+safeinstall:V: srv.safeinstall.dir
+safeinstallall:V: srv.safeinstallall.dir
+clean:V: srv.clean.dir
+nuke:V: srv.nuke.dir
+
+srv.%.dir:V:
+ @{ cd srv && mk $stem }
diff --git a/sys/src/cmd/venti/mkroot.c b/sys/src/cmd/venti/mkroot.c
new file mode 100755
index 000000000..8c38b1d16
--- /dev/null
+++ b/sys/src/cmd/venti/mkroot.c
@@ -0,0 +1,61 @@
+#include <u.h>
+#include <libc.h>
+#include <venti.h>
+#include <thread.h>
+
+char *host;
+
+/* Print the command synopsis on fd 2 and exit all threads with status "usage". */
+void
+usage(void)
+{
+ fprint(2, "usage: mkroot [-h host] name type score blocksize prev\n");
+ threadexitsall("usage");
+}
+
+/*
+ * venti/mkroot: build a VtRoot from the command line, write it to the
+ * venti server as a VtRootType block and print the resulting score.
+ *
+ * Arguments: name type score blocksize prev.  -h host selects the
+ * server; host is handed to vtdial unchanged (nil when -h is absent).
+ */
+void
+threadmain(int argc, char *argv[])
+{
+	uchar score[VtScoreSize];
+	uchar buf[VtRootSize];
+	VtConn *z;
+	VtRoot root;
+
+	ARGBEGIN{
+	case 'h':
+		host = EARGF(usage());
+		break;
+	default:
+		usage();
+		break;
+	}ARGEND
+
+	if(argc != 5)
+		usage();
+
+	fmtinstall('V', vtscorefmt);
+	fmtinstall('F', vtfcallfmt);
+
+	/*
+	 * root lives on the stack and strecpy only writes up to the
+	 * terminating NUL.  Clear the whole structure first so no stack
+	 * garbage can end up in the packed block (and so in its score).
+	 */
+	memset(&root, 0, sizeof root);
+	strecpy(root.name, root.name+sizeof root.name, argv[0]);
+	strecpy(root.type, root.type+sizeof root.type, argv[1]);
+	if(vtparsescore(argv[2], nil, root.score) < 0)
+		sysfatal("bad score '%s'", argv[2]);
+	root.blocksize = atoi(argv[3]);
+	if(vtparsescore(argv[4], nil, root.prev) < 0)
+		sysfatal("bad score '%s'", argv[4]);
+	vtrootpack(&root, buf);
+
+	z = vtdial(host);
+	if(z == nil)
+		sysfatal("could not connect to server: %r");
+
+	if(vtconnect(z) < 0)
+		sysfatal("vtconnect: %r");
+
+	if(vtwrite(z, score, VtRootType, buf, VtRootSize) < 0)
+		sysfatal("vtwrite: %r");
+	if(vtsync(z) < 0)
+		sysfatal("vtsync: %r");
+	vthangup(z);
+	print("%V\n", score);
+	threadexitsall(0);
+}
diff --git a/sys/src/cmd/venti/randtest.c b/sys/src/cmd/venti/randtest.c
new file mode 100755
index 000000000..2a1fa6ee7
--- /dev/null
+++ b/sys/src/cmd/venti/randtest.c
@@ -0,0 +1,335 @@
+#include <u.h>
+#include <libc.h>
+#include <venti.h>
+#include <libsec.h>
+#include <thread.h>
+
+
+enum { STACK = 32768 };
+void xxxsrand(long);
+long xxxlrand(void);
+
+Channel *cw;
+Channel *cr;
+char *host;
+int blocksize, seed, randpct;
+int doread, dowrite, packets, permute;
+vlong totalbytes, cur;
+VtConn *z;
+int multi;
+int maxpackets;
+int sequence;
+int doublecheck = 1;
+uint *order;
+
+/*
+ * Print the command synopsis on fd 2 and exit all threads with status
+ * "usage".  The synopsis lists exactly the options threadmain's
+ * ARGBEGIN block accepts.
+ */
+void
+usage(void)
+{
+	fprint(2, "usage: randtest [-PSVrw] [-b blocksize] [-h host] [-M maxblocks] [-m nprocs] [-n totalbytes] [-p randpct] [-s seed]\n");
+	threadexitsall("usage");
+}
+
+/*
+ * Write one blocksize-byte data block to the server.
+ * With doublecheck set, the locally computed SHA1 must match the
+ * score the server returns.  buf2 is unused; it is there so wr has
+ * the same signature as rd.
+ */
+void
+wr(char *buf, char *buf2)
+{
+	uchar score[VtScoreSize], score2[VtScoreSize];
+	DigestState ds;
+
+	USED(buf2);
+	memset(&ds, 0, sizeof ds);
+	/*
+	 * score is printed if vtwrite fails; make sure it holds
+	 * something defined even when doublecheck is off.
+	 */
+	memset(score, 0, sizeof score);
+	if(doublecheck)
+		sha1((uchar*)buf, blocksize, score, &ds);
+	if(vtwrite(z, score2, VtDataType, (uchar*)buf, blocksize) < 0)
+		sysfatal("vtwrite %V at %,lld: %r", score, cur);
+	if(doublecheck && memcmp(score, score2, VtScoreSize) != 0)
+		sysfatal("score mismatch! %V %V", score, score2);
+}
+
+/*
+ * Writer proc: take blocks from channel cw, write and free each one,
+ * and stop when a nil pointer arrives.
+ */
+void
+wrthread(void *v)
+{
+	char *blk;
+
+	USED(v);
+	for(;;){
+		blk = recvp(cw);
+		if(blk == nil)
+			break;
+		wr(blk, nil);
+		free(blk);
+	}
+}
+
+/*
+ * Read back the block whose contents should equal buf: hash buf to
+ * get its score, fetch that score into buf2 and compare the bytes.
+ * Any failure is fatal.
+ */
+void
+rd(char *buf, char *buf2)
+{
+	DigestState state;
+	uchar sum[VtScoreSize];
+	int got;
+
+	memset(&state, 0, sizeof state);
+	sha1((uchar*)buf, blocksize, sum, &state);
+	got = vtread(z, sum, VtDataType, (uchar*)buf2, blocksize);
+	if(got < 0)
+		sysfatal("vtread %V at %,lld: %r", sum, cur);
+	if(memcmp(buf, buf2, blocksize) != 0)
+		sysfatal("bad data read! %V", sum);
+}
+
+/*
+ * Reader proc: take expected blocks from channel cr, verify each one
+ * against the server with rd, free it, and stop when a nil pointer
+ * arrives.  The scratch buffer is released before returning.
+ */
+void
+rdthread(void *v)
+{
+	char *p, *buf2;
+
+	buf2 = vtmalloc(blocksize);
+	USED(v);
+	while((p = recvp(cr)) != nil){
+		rd(p, buf2);
+		free(p);
+	}
+	free(buf2);
+}
+
+char *template;
+
+/*
+ * Generate the test blocks and process them.
+ *
+ * Each block is a copy of template with its first uint replaced by a
+ * block number taken from order[], which is shuffled when permute is
+ * set.  With c == nil, fn(buf, buf2) is called directly for every
+ * block; otherwise each block is sent on c and the receiver owns
+ * (and frees) it.  At most maxpackets blocks are produced; a
+ * maxpackets of 0 is replaced by the full packet count.
+ */
+void
+run(void (*fn)(char*, char*), Channel *c)
+{
+	int i, t, j, packets;
+	char *buf2, *buf;
+
+	buf2 = vtmalloc(blocksize);
+	buf = vtmalloc(blocksize);
+	cur = 0;
+	packets = totalbytes/blocksize;
+	if(maxpackets == 0)
+		maxpackets = packets;
+	order = vtmalloc(packets*sizeof order[0]);
+	for(i=0; i<packets; i++)
+		order[i] = i;
+	if(permute){
+		for(i=1; i<packets; i++){
+			j = nrand(i+1);
+			t = order[i];
+			order[i] = order[j];
+			order[j] = t;
+		}
+	}
+	for(i=0; i<packets && i<maxpackets; i++){
+		memmove(buf, template, blocksize);
+		*(uint*)buf = order[i];
+		if(c){
+			/* ownership passes to the receiver; get a fresh buffer */
+			sendp(c, buf);
+			buf = vtmalloc(blocksize);
+		}else
+			(*fn)(buf, buf2);
+		cur += blocksize;
+	}
+	free(order);
+	/*
+	 * buf is either the reused direct-call buffer or the last,
+	 * never-sent allocation; in both cases it is still ours.
+	 */
+	free(buf);
+	free(buf2);
+}
+
+#define TWID64 ((u64int)~(u64int)0)
+
+/*
+ * Parse a number with an optional single-letter binary suffix:
+ * k/K (2^10), m/M (2^20), g/G (2^30) or t/T (2^40).
+ * Returns TWID64 when s is nil or anything follows the number and
+ * its suffix.
+ */
+u64int
+unittoull(char *s)
+{
+	char *es;
+	u64int n;
+
+	if(s == nil)
+		return TWID64;
+	n = strtoul(s, &es, 0);
+	if(*es == 'k' || *es == 'K'){
+		n *= 1024;
+		es++;
+	}else if(*es == 'm' || *es == 'M'){
+		n *= 1024*1024;
+		es++;
+	}else if(*es == 'g' || *es == 'G'){
+		n *= 1024*1024*1024;
+		es++;
+	}else if(*es == 't' || *es == 'T'){
+		/* two steps so the constant never exceeds int range */
+		n *= 1024*1024;
+		n *= 1024*1024;
+		/* consume the suffix; without this every 't' value was rejected below */
+		es++;
+	}
+	if(*es != '\0')
+		return TWID64;
+	return n;
+}
+
+/*
+ * venti/randtest: write and/or read back a stream of generated blocks
+ * and report the throughput.
+ *
+ * With neither -r nor -w both phases run, write first.  -m n starts
+ * n writer and n reader procs fed through channels; without it the
+ * blocks are processed inline.  The template block is filled from
+ * the seeded generator with byte values below 256*randpct/100.
+ */
+void
+threadmain(int argc, char *argv[])
+{
+	int i, max;
+	vlong t0;
+	double t;
+
+	blocksize = 8192;
+	seed = 0;
+	randpct = 50;
+	host = nil;
+	doread = 0;
+	dowrite = 0;
+	totalbytes = 1*1024*1024*1024;
+	fmtinstall('V', vtscorefmt);
+	fmtinstall('F', vtfcallfmt);
+
+	ARGBEGIN{
+	case 'b':
+		blocksize = unittoull(EARGF(usage()));
+		break;
+	case 'h':
+		host = EARGF(usage());
+		break;
+	case 'M':
+		maxpackets = unittoull(EARGF(usage()));
+		break;
+	case 'm':
+		multi = atoi(EARGF(usage()));
+		break;
+	case 'n':
+		totalbytes = unittoull(EARGF(usage()));
+		break;
+	case 'p':
+		randpct = atoi(EARGF(usage()));
+		break;
+	case 'P':
+		permute = 1;
+		break;
+	case 'S':
+		doublecheck = 0;
+		ventidoublechecksha1 = 0;
+		break;
+	case 's':
+		seed = atoi(EARGF(usage()));
+		break;
+	case 'r':
+		doread = 1;
+		break;
+	case 'w':
+		dowrite = 1;
+		break;
+	case 'V':
+		chattyventi++;
+		break;
+	default:
+		usage();
+	}ARGEND
+
+	/*
+	 * unittoull returns all ones on a parse error, which shows up
+	 * as -1 in these signed variables.  run() divides by blocksize
+	 * and stores a uint at the start of every block, so a block
+	 * smaller than that cannot work either.
+	 */
+	if(blocksize < (int)sizeof(uint))
+		sysfatal("bad block size %d", blocksize);
+	if(totalbytes < 0)
+		sysfatal("bad total byte count");
+
+	if(doread==0 && dowrite==0){
+		doread = 1;
+		dowrite = 1;
+	}
+
+	z = vtdial(host);
+	if(z == nil)
+		sysfatal("could not connect to server: %r");
+	if(vtconnect(z) < 0)
+		sysfatal("vtconnect: %r");
+
+	if(multi){
+		cr = chancreate(sizeof(void*), 0);
+		cw = chancreate(sizeof(void*), 0);
+		for(i=0; i<multi; i++){
+			proccreate(wrthread, nil, STACK);
+			proccreate(rdthread, nil, STACK);
+		}
+	}
+
+	template = vtmalloc(blocksize);
+	xxxsrand(seed);
+	max = (256*randpct)/100;
+	if(max == 0)
+		max = 1;
+	for(i=0; i<blocksize; i++)
+		template[i] = xxxlrand()%max;
+	if(dowrite){
+		t0 = nsec();
+		run(wr, cw);
+		/* one nil per worker tells it to stop */
+		for(i=0; i<multi; i++)
+			sendp(cw, nil);
+		t = (nsec() - t0)/1.e9;
+		print("write: %lld bytes / %.3f seconds = %.6f MB/s\n",
+			totalbytes, t, (double)totalbytes/1e6/t);
+	}
+	if(doread){
+		t0 = nsec();
+		run(rd, cr);
+		for(i=0; i<multi; i++)
+			sendp(cr, nil);
+		t = (nsec() - t0)/1.e9;
+		print("read: %lld bytes / %.3f seconds = %.6f MB/s\n",
+			totalbytes, t, (double)totalbytes/1e6/t);
+	}
+	threadexitsall(nil);
+}
+
+
+/*
+ * algorithm by
+ * D. P. Mitchell & J. A. Reeds
+ */
+
+#define LEN 607
+#define TAP 273
+#define MASK 0x7fffffffL
+#define A 48271
+#define M 2147483647
+#define Q 44488
+#define R 3399
+#define NORM (1.0/(1.0+MASK))
+
+static ulong rng_vec[LEN];
+static ulong* rng_tap = rng_vec;
+static ulong* rng_feed = 0;
+
+/*
+ * Seed the generator state: reset the tap and feed pointers and fill
+ * rng_vec with LEN successive values of the multiplicative recurrence
+ * below, after discarding the first 20.  The seed is reduced mod M,
+ * made non-negative, and replaced by a fixed constant when it is 0.
+ */
+static void
+isrand(long seed)
+{
+ long lo, hi, x;
+ int i;
+
+ rng_tap = rng_vec;
+ rng_feed = rng_vec+LEN-TAP;
+ seed = seed%M;
+ if(seed < 0)
+ seed += M;
+ if(seed == 0)
+ seed = 89482311;
+ x = seed;
+ /*
+ * Initialize by x[n+1] = 48271 * x[n] mod (2**31 - 1)
+ */
+ for(i = -20; i < LEN; i++) {
+ /* split x by Q so A*lo - R*hi needs no wider intermediate */
+ hi = x / Q;
+ lo = x % Q;
+ x = A*lo - R*hi;
+ if(x < 0)
+ x += M;
+ if(i >= 0)
+ rng_vec[i] = x;
+ }
+}
+
+/* Seed the private generator; a thin public wrapper around isrand. */
+void
+xxxsrand(long seed)
+{
+ isrand(seed);
+}
+
+/*
+ * Return the next value from the private generator, masked with MASK.
+ * Both pointers step backwards through rng_vec and wrap to the end;
+ * the sum of the two selected elements becomes the result and is
+ * stored back at the feed position.  If the generator was never
+ * seeded (rng_feed is still 0) it is seeded with 1 on first use.
+ */
+long
+xxxlrand(void)
+{
+ ulong x;
+
+ rng_tap--;
+ if(rng_tap < rng_vec) {
+ if(rng_feed == 0) {
+ /* isrand resets rng_tap, so repeat the decrement */
+ isrand(1);
+ rng_tap--;
+ }
+ rng_tap += LEN;
+ }
+ rng_feed--;
+ if(rng_feed < rng_vec)
+ rng_feed += LEN;
+ x = (*rng_feed + *rng_tap) & MASK;
+ *rng_feed = x;
+
+ return x;
+}
+
diff --git a/sys/src/cmd/venti/read.c b/sys/src/cmd/venti/read.c
new file mode 100755
index 000000000..a48e62e6a
--- /dev/null
+++ b/sys/src/cmd/venti/read.c
@@ -0,0 +1,74 @@
+#include <u.h>
+#include <libc.h>
+#include <venti.h>
+#include <libsec.h>
+#include <thread.h>
+
+/* Print the command synopsis on fd 2 and exit all threads with status "usage". */
+void
+usage(void)
+{
+ fprint(2, "usage: read [-h host] [-t type] score\n");
+ threadexitsall("usage");
+}
+
+/*
+ * venti/read: fetch one block by score and write its bytes to fd 1.
+ * Without -t every block type from 0 to VtMaxType-1 is tried in turn
+ * and the first that succeeds is used; the equivalent command line
+ * (score and type) is then reported on fd 2.
+ */
+void
+threadmain(int argc, char *argv[])
+{
+ int type, n;
+ uchar score[VtScoreSize];
+ uchar *buf;
+ VtConn *z;
+ char *host;
+
+ fmtinstall('F', vtfcallfmt);
+ fmtinstall('V', vtscorefmt);
+
+ host = nil;
+ type = -1;
+ ARGBEGIN{
+ case 'h':
+ host = EARGF(usage());
+ break;
+ case 't':
+ type = atoi(EARGF(usage()));
+ break;
+ default:
+ usage();
+ break;
+ }ARGEND
+
+ if(argc != 1)
+ usage();
+
+ if(vtparsescore(argv[0], nil, score) < 0)
+ sysfatal("could not parse score '%s': %r", argv[0]);
+
+ buf = vtmallocz(VtMaxLumpSize);
+
+ z = vtdial(host);
+ if(z == nil)
+ sysfatal("could not connect to server: %r");
+
+ if(vtconnect(z) < 0)
+ sysfatal("vtconnect: %r");
+
+ if(type == -1){
+ /* n stays negative if no type matches, which is reported below */
+ n = -1;
+ for(type=0; type<VtMaxType; type++){
+ n = vtread(z, score, type, buf, VtMaxLumpSize);
+ if(n >= 0){
+ fprint(2, "venti/read%s%s %V %d\n", host ? " -h" : "", host ? host : "",
+ score, type);
+ break;
+ }
+ }
+ }else
+ n = vtread(z, score, type, buf, VtMaxLumpSize);
+
+ vthangup(z);
+ if(n < 0)
+ sysfatal("could not read block: %r");
+ if(write(1, buf, n) != n)
+ sysfatal("write: %r");
+ threadexitsall(0);
+}
diff --git a/sys/src/cmd/venti/readlist.c b/sys/src/cmd/venti/readlist.c
new file mode 100755
index 000000000..6d928086a
--- /dev/null
+++ b/sys/src/cmd/venti/readlist.c
@@ -0,0 +1,112 @@
+#include <u.h>
+#include <libc.h>
+#include <thread.h>
+#include <venti.h>
+#include <bio.h>
+
+char *host;
+Biobuf b;
+VtConn *z;
+uchar *buf;
+void run(Biobuf*);
+int nn;
+
+/* Print the command synopsis on fd 2 and exit all threads with status "usage". */
+void
+usage(void)
+{
+ fprint(2, "usage: readlist [-h host] list\n");
+ threadexitsall("usage");
+}
+
+/*
+ * Convert n hex digits in buf into the VtScoreSize-byte score.
+ * Returns 0 on success.  On a wrong length or a non-hex character it
+ * sets the error string and returns -1; score is zeroed first, so it
+ * never holds stale data.
+ */
+int
+parsescore(uchar *score, char *buf, int n)
+{
+	int pos, ch, nib;
+
+	memset(score, 0, VtScoreSize);
+
+	if(n != VtScoreSize*2){
+		werrstr("score wrong length %d", n);
+		return -1;
+	}
+	for(pos=0; pos<VtScoreSize*2; pos++){
+		ch = buf[pos];
+		if(ch >= '0' && ch <= '9')
+			nib = ch - '0';
+		else if(ch >= 'a' && ch <= 'f')
+			nib = ch - 'a' + 10;
+		else if(ch >= 'A' && ch <= 'F')
+			nib = ch - 'A' + 10;
+		else{
+			werrstr("bad score char %d '%c'", ch, ch);
+			return -1;
+		}
+
+		/* even positions carry the high nibble of each byte */
+		if((pos & 1) == 0)
+			nib <<= 4;
+		score[pos>>1] |= nib;
+	}
+	return 0;
+}
+
+/*
+ * venti/readlist: read every block named in one or more work lists.
+ * With no file arguments the list is read from fd 0; otherwise each
+ * argument is opened and processed in order.  -h host selects the
+ * server; host is handed to vtdial unchanged.
+ */
+void
+threadmain(int argc, char *argv[])
+{
+	int fd, i;
+
+	ARGBEGIN{
+	case 'h':
+		host = EARGF(usage());
+		break;
+	default:
+		usage();
+		break;
+	}ARGEND
+
+	fmtinstall('V', vtscorefmt);
+	buf = vtmallocz(VtMaxLumpSize);
+	z = vtdial(host);
+	if(z == nil)
+		sysfatal("could not connect to server: %r");
+	if(vtconnect(z) < 0)
+		sysfatal("vtconnect: %r");
+
+	if(argc == 0){
+		Binit(&b, 0, OREAD);
+		run(&b);
+	}else{
+		for(i=0; i<argc; i++){
+			if((fd = open(argv[i], OREAD)) < 0)
+				sysfatal("open %s: %r", argv[i]);
+			Binit(&b, fd, OREAD);
+			run(&b);
+			/*
+			 * Release the buffer and descriptor before b is
+			 * reused; they used to pile up, one per list file.
+			 */
+			Bterm(&b);
+			close(fd);
+		}
+	}
+	threadexitsall(nil);
+}
+
+/*
+ * Process one work list.  Every line holds two fields, a hex score
+ * and a numeric block type; each block is read from the server and
+ * the data discarded.  Any malformed line or failed read is fatal.
+ * A progress count is printed after every 1000 blocks.
+ */
+void
+run(Biobuf *b)
+{
+	char *p, *f[10];
+	int nf;
+	uchar score[VtScoreSize];	/* parsescore fills exactly VtScoreSize bytes */
+	int type, n;
+
+	while((p = Brdline(b, '\n')) != nil){
+		/* replace the trailing newline with a terminator */
+		p[Blinelen(b)-1] = 0;
+		nf = tokenize(p, f, nelem(f));
+		if(nf != 2)
+			sysfatal("syntax error in work list");
+		if(parsescore(score, f[0], strlen(f[0])) < 0)
+			sysfatal("bad score %s in work list", f[0]);
+		type = atoi(f[1]);
+		n = vtread(z, score, type, buf, VtMaxLumpSize);
+		if(n < 0)
+			sysfatal("could not read %s %s: %r", f[0], f[1]);
+		/* write(1, buf, n); */
+		if(++nn%1000 == 0)
+			print("%d...", nn);
+	}
+}
diff --git a/sys/src/cmd/venti/ro.c b/sys/src/cmd/venti/ro.c
new file mode 100755
index 000000000..ee954a32b
--- /dev/null
+++ b/sys/src/cmd/venti/ro.c
@@ -0,0 +1,112 @@
+/* Copyright (c) 2004 Russ Cox */
+#include <u.h>
+#include <libc.h>
+#include <venti.h>
+#include <thread.h>
+#include <libsec.h>
+
+#ifndef _UNISTD_H_
+#pragma varargck type "F" VtFcall*
+#pragma varargck type "T" void
+#endif
+
+VtConn *z;
+int verbose;
+
+/*
+ * Stack size constant.  Not referenced in this file: threadmain
+ * passes a literal 16384 to threadcreate instead.
+ */
+enum
+{
+ STACK = 8192
+};
+
+/* Print the command synopsis on fd 2 and exit all threads with status "usage". */
+void
+usage(void)
+{
+ fprint(2, "usage: venti/ro [-v] [-a address] [-h address]\n");
+ threadexitsall("usage");
+}
+
+/*
+ * Thread body for one read request (v is the VtReq*): forward the
+ * read to the upstream server z and answer with either the data or
+ * an error reply carrying the current error string.
+ */
+void
+readthread(void *v)
+{
+	VtReq *req;
+	uchar *data;
+	char errbuf[ERRMAX];
+	int got;
+
+	req = v;
+	data = vtmalloc(req->tx.count);
+	got = vtread(z, req->tx.score, req->tx.blocktype, data, req->tx.count);
+	if(got >= 0){
+		/* free is registered with the packet, so data is not freed here */
+		req->rx.data = packetforeign(data, got, free, data);
+	}else{
+		req->rx.msgtype = VtRerror;
+		rerrstr(errbuf, sizeof errbuf);
+		req->rx.error = vtstrdup(errbuf);
+		free(data);
+	}
+	if(verbose)
+		fprint(2, "-> %F\n", &req->rx);
+	vtrespond(req);
+}
+
+/*
+ * venti/ro: a read-only front end to another venti server.
+ * Dials the upstream server (-h, nil when absent), then listens on
+ * address (default tcp!*!venti, -a overrides).  Reads are handed to
+ * a new readthread each; writes are refused with "read-only server";
+ * ping, goodbye and sync get the default reply.  -v traces traffic.
+ */
+void
+threadmain(int argc, char **argv)
+{
+ VtReq *r;
+ VtSrv *srv;
+ char *address, *ventiaddress;
+
+ fmtinstall('F', vtfcallfmt);
+ fmtinstall('V', vtscorefmt);
+
+ address = "tcp!*!venti";
+ ventiaddress = nil;
+
+ ARGBEGIN{
+ case 'v':
+ verbose++;
+ break;
+ case 'a':
+ address = EARGF(usage());
+ break;
+ case 'h':
+ ventiaddress = EARGF(usage());
+ break;
+ default:
+ usage();
+ }ARGEND
+
+ if((z = vtdial(ventiaddress)) == nil)
+ sysfatal("vtdial %s: %r", ventiaddress);
+ if(vtconnect(z) < 0)
+ sysfatal("vtconnect: %r");
+
+ srv = vtlisten(address);
+ if(srv == nil)
+ sysfatal("vtlisten %s: %r", address);
+
+ while((r = vtgetreq(srv)) != nil){
+ /* default reply type is the request type plus one */
+ r->rx.msgtype = r->tx.msgtype+1;
+ if(verbose)
+ fprint(2, "<- %F\n", &r->tx);
+ switch(r->tx.msgtype){
+ case VtTping:
+ break;
+ case VtTgoodbye:
+ break;
+ case VtTread:
+ /* readthread responds itself, so skip the vtrespond below */
+ threadcreate(readthread, r, 16384);
+ continue;
+ case VtTwrite:
+ r->rx.error = vtstrdup("read-only server");
+ r->rx.msgtype = VtRerror;
+ break;
+ case VtTsync:
+ break;
+ }
+ if(verbose)
+ fprint(2, "-> %F\n", &r->rx);
+ vtrespond(r);
+ }
+ threadexitsall(nil);
+}
+
diff --git a/sys/src/cmd/venti/root.c b/sys/src/cmd/venti/root.c
new file mode 100755
index 000000000..5d67ad316
--- /dev/null
+++ b/sys/src/cmd/venti/root.c
@@ -0,0 +1,72 @@
+#include <u.h>
+#include <libc.h>
+#include <venti.h>
+#include <libsec.h>
+#include <thread.h>
+
+/* Print the command synopsis on fd 2 and exit all threads with status "usage". */
+void
+usage(void)
+{
+ fprint(2, "usage: root [-h host] score\n");
+ threadexitsall("usage");
+}
+
+/*
+ * venti/root: print the contents of one or more root blocks.
+ * Every argument is a score; each is read as a VtRootType block,
+ * unpacked, and printed as
+ *	score: name type score blocksize prev
+ * A bad score, a failed read, a block of the wrong size or an
+ * unpack failure is reported on fd 2 and that argument is skipped.
+ */
+void
+threadmain(int argc, char *argv[])
+{
+	int i, n;
+	uchar score[VtScoreSize];
+	uchar *buf;
+	VtConn *z;
+	char *host;
+	VtRoot root;
+
+	fmtinstall('F', vtfcallfmt);
+	fmtinstall('V', vtscorefmt);
+	quotefmtinstall();
+
+	host = nil;
+	ARGBEGIN{
+	case 'h':
+		host = EARGF(usage());
+		break;
+	default:
+		usage();
+		break;
+	}ARGEND
+
+	if(argc == 0)
+		usage();
+
+	buf = vtmallocz(VtMaxLumpSize);
+
+	z = vtdial(host);
+	if(z == nil)
+		sysfatal("could not connect to server: %r");
+
+	if(vtconnect(z) < 0)
+		sysfatal("vtconnect: %r");
+
+	for(i=0; i<argc; i++){
+		if(vtparsescore(argv[i], nil, score) < 0){
+			fprint(2, "cannot parse score '%s': %r\n", argv[i]);
+			continue;
+		}
+		n = vtread(z, score, VtRootType, buf, VtMaxLumpSize);
+		if(n < 0){
+			fprint(2, "could not read block %V: %r\n", score);
+			continue;
+		}
+		if(n != VtRootSize){
+			/* print the real expected size rather than a literal copy of it */
+			fprint(2, "block %V is wrong size %d != %d\n", score, n, VtRootSize);
+			continue;
+		}
+		if(vtrootunpack(&root, buf) < 0){
+			fprint(2, "unpacking block %V: %r\n", score);
+			continue;
+		}
+		print("%V: %q %q %V %d %V\n", score, root.name, root.type, root.score, root.blocksize, root.prev);
+	}
+	vthangup(z);
+	threadexitsall(0);
+}
diff --git a/sys/src/cmd/venti/srv/arena.c b/sys/src/cmd/venti/srv/arena.c
new file mode 100755
index 000000000..2a176cded
--- /dev/null
+++ b/sys/src/cmd/venti/srv/arena.c
@@ -0,0 +1,931 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+typedef struct ASum ASum;
+
+struct ASum /* one entry in the queue of arenas waiting for sumproc */
+{
+ Arena *arena; /* arena that sumproc passes to sumarena */
+ ASum *next; /* following entry; nil for the newest one */
+};
+
+static void sealarena(Arena *arena);
+static int okarena(Arena *arena);
+static int loadarena(Arena *arena);
+static CIBlock *getcib(Arena *arena, int clump, int writing, CIBlock *rock);
+static void putcib(Arena *arena, CIBlock *cib);
+static void sumproc(void *);
+static void loadcig(Arena *arena);
+
+static QLock sumlock; /* held while sumq and sumqtail are changed */
+static Rendez sumwait; /* sumproc sleeps here while sumq is nil */
+static ASum *sumq; /* oldest queued checksum request */
+static ASum *sumqtail; /* newest request; only meaningful while sumq is non-nil */
+static uchar zero[8192]; /* zeros that newarena writes at the start of the data area */
+
+int arenasumsleeptime; /* passed to sleep() before each read in sumarena; SleepForever stalls the sum */
+
+/*
+ * prepare the checksum queue and start the process that drains it.
+ * returns 0, or -1 with the error set if the process can't be started.
+ */
+int
+initarenasum(void)
+{
+	needzeroscore();	/* OS X */
+
+	/* backsumarena treats a nil sumwait.l as "summing not set up" */
+	sumwait.l = &sumlock;
+
+	if(vtproc(sumproc, nil) >= 0)
+		return 0;
+
+	seterr(EOk, "can't start arena checksum slave: %r");
+	return -1;
+}
+
+/*
+ * build the in-memory Arena for the size bytes at base in part,
+ * filling it in from the header and trailer blocks found on disk.
+ * the first and last blocks of the range hold the header and trailer,
+ * so base and size are adjusted to describe only what lies between.
+ * returns nil if the blocks can't be loaded or fail okarena.
+ */
+Arena*
+initarena(Part *part, u64int base, u64int size, u32int blocksize)
+{
+	Arena *a;
+
+	a = MKZ(Arena);
+	a->part = part;
+	a->blocksize = blocksize;
+	a->clumpmax = a->blocksize / ClumpInfoSize;
+	a->base = base + blocksize;
+	a->size = size - 2 * blocksize;
+
+	if(loadarena(a) < 0){
+		seterr(ECorrupt, "arena header or trailer corrupted");
+		goto Bad;
+	}
+	if(okarena(a) < 0)
+		goto Bad;
+
+	/* sealed on disk but the stored score is still zero: queue a checksum */
+	if(a->diskstats.sealed && scorecmp(zeroscore, a->score) == 0)
+		sealarena(a);
+
+	return a;
+
+Bad:
+	freearena(a);
+	return nil;
+}
+
+/*
+ * release an Arena structure; a nil arena is ignored.
+ */
+void
+freearena(Arena *arena)
+{
+	if(arena != nil)
+		free(arena);
+}
+
+/*
+ * format a new, empty arena called name in the size bytes at base
+ * of part: write its trailer and header and zero the start of the
+ * data area (at most sizeof zero bytes, never more than one block).
+ * version 4 arenas use the fixed _ClumpMagic; other versions get a
+ * random clump magic that is neither _ClumpMagic nor 0.
+ * returns nil if the name is rejected by nameok or a write fails.
+ */
+Arena*
+newarena(Part *part, u32int vers, char *name, u64int base, u64int size, u32int blocksize)
+{
+	int bsize;
+	Arena *arena;
+
+	if(nameok(name) < 0){
+		/* the format has no verb, so no argument is passed with it */
+		seterr(EOk, "illegal arena name");
+		return nil;
+	}
+	arena = MKZ(Arena);
+	arena->part = part;
+	arena->version = vers;
+	if(vers == ArenaVersion4)
+		arena->clumpmagic = _ClumpMagic;
+	else{
+		do
+			arena->clumpmagic = fastrand();
+		while(arena->clumpmagic==_ClumpMagic || arena->clumpmagic==0);
+	}
+	arena->blocksize = blocksize;
+	arena->clumpmax = arena->blocksize / ClumpInfoSize;
+	arena->base = base + blocksize;
+	arena->size = size - 2 * blocksize;
+
+	namecp(arena->name, name);
+
+	bsize = sizeof zero;
+	if(bsize > arena->blocksize)
+		bsize = arena->blocksize;
+
+	if(wbarena(arena)<0 || wbarenahead(arena)<0
+	|| writepart(arena->part, arena->base, zero, bsize)<0){
+		freearena(arena);
+		return nil;
+	}
+
+	return arena;
+}
+
+/*
+ * unpack the directory entry for one clump into ci.
+ * returns 0, or -1 if the directory block can't be fetched.
+ */
+int
+readclumpinfo(Arena *arena, int clump, ClumpInfo *ci)
+{
+	CIBlock rock, *cib;
+
+	if((cib = getcib(arena, clump, 0, &rock)) == nil)
+		return -1;
+	unpackclumpinfo(ci, &cib->data->data[cib->offset]);
+	putcib(arena, cib);
+	return 0;
+}
+
+/*
+ * unpack the directory entries for clumps clump..clump+n-1 into cis.
+ * returns the index of the lowest entry that could not be fetched,
+ * or n if every entry was read.
+ */
+int
+readclumpinfos(Arena *arena, int clump, ClumpInfo *cis, int n)
+{
+	CIBlock rock, *cib;
+	int i;
+
+	/*
+	 * the directory blocks sit at the end of the arena in
+	 * reverse order, so walking the clumps from last to first
+	 * reads the disk blocks in ascending order, which can be
+	 * a few percent faster.
+	 */
+	for(i = n; i-- > 0; ){
+		cib = getcib(arena, clump + i, 0, &rock);
+		if(cib != nil){
+			unpackclumpinfo(&cis[i], &cib->data->data[cib->offset]);
+			putcib(arena, cib);
+		}else
+			n = i;
+	}
+	return n;
+}
+
+/*
+ * store the directory entry for one clump.
+ * the caller must hold the arena lock.
+ * returns 0, or -1 if the directory block can't be fetched.
+ */
+int
+writeclumpinfo(Arena *arena, int clump, ClumpInfo *ci)
+{
+	CIBlock rock, *cib;
+
+	if((cib = getcib(arena, clump, 1, &rock)) == nil)
+		return -1;
+	/* mark the block dirty before changing its contents */
+	dirtydblock(cib->data, DirtyArenaCib);
+	packclumpinfo(ci, &cib->data->data[cib->offset]);
+	putcib(arena, cib);
+	return 0;
+}
+
+u64int
+arenadirsize(Arena *arena, u32int clumps) /* bytes of clump directory needed for `clumps' entries */
+{
+ return ((clumps / arena->clumpmax) + 1) * arena->blocksize; /* whole blocks; always one more than clumps/clumpmax */
+}
+
+/*
+ * read a clump of data
+ * n is a hint of the size of the data, not including the header
+ * make sure it won't run off the end, then return the number of bytes actually read
+ * (n is cut back so the read stops where the clump directory begins).
+ * aa is an offset within the arena's data area, not a partition address.
+ * returns -1 when n is 0, when aa is at or beyond the directory, or when
+ * a block can't be fetched; since the return type is u32int the caller
+ * sees that -1 as an all-ones value.
+ */
+u32int
+readarena(Arena *arena, u64int aa, u8int *buf, long n)
+{
+ DBlock *b;
+ u64int a;
+ u32int blocksize, off, m;
+ long nn;
+
+ if(n == 0)
+ return -1;
+
+ qlock(&arena->lock);
+ a = arena->size - arenadirsize(arena, arena->memstats.clumps); /* first offset owned by the directory */
+ qunlock(&arena->lock);
+ if(aa >= a){
+ seterr(EOk, "reading beyond arena clump storage: clumps=%d aa=%lld a=%lld -1 clumps=%lld\n", arena->memstats.clumps, aa, a, arena->size - arenadirsize(arena, arena->memstats.clumps - 1)); /* NOTE(review): memstats is read here after the lock was dropped */
+ return -1;
+ }
+ if(aa + n > a)
+ n = a - aa;
+
+ blocksize = arena->blocksize;
+ a = arena->base + aa; /* from here on a is a partition address */
+ off = a & (blocksize - 1); /* offset of the first byte within its block */
+ a -= off;
+ nn = 0; /* bytes copied so far */
+ for(;;){
+ b = getdblock(arena->part, a, OREAD);
+ if(b == nil)
+ return -1;
+ m = blocksize - off;
+ if(m > n - nn)
+ m = n - nn;
+ memmove(&buf[nn], &b->data[off], m);
+ putdblock(b);
+ nn += m;
+ if(nn == n)
+ break;
+ off = 0; /* every block after the first is read from its start */
+ a += blocksize;
+ }
+ return n;
+}
+
+/*
+ * write some data to the clump section at a given offset
+ * used to fix up corrupted arenas.
+ * aa is an offset within the arena's data area; the n bytes must lie
+ * entirely before the clump directory.  the arena lock is held for
+ * the whole write.
+ * returns n, or -1 (seen by the caller as an all-ones u32int) when n is 0,
+ * the range is out of bounds, or a block can't be fetched.
+ */
+u32int
+writearena(Arena *arena, u64int aa, u8int *clbuf, u32int n)
+{
+	DBlock *b;
+	u64int a;
+	u32int blocksize, off, m;
+	long nn;
+
+	if(n == 0)
+		return -1;
+
+	qlock(&arena->lock);
+	a = arena->size - arenadirsize(arena, arena->memstats.clumps);
+	if(aa >= a || aa + n > a){
+		qunlock(&arena->lock);
+		seterr(EOk, "writing beyond arena clump storage");
+		return -1;
+	}
+
+	blocksize = arena->blocksize;
+	a = arena->base + aa;
+	off = a & (blocksize - 1);
+	a -= off;
+	nn = 0;
+	for(;;){
+		m = blocksize - off;
+		if(m > n - nn)
+			m = n - nn;
+		/*
+		 * a block that is only partly overwritten must be read
+		 * first so the bytes around the new data survive.  decide
+		 * from what is written to this block (m), not from the
+		 * total n, so a short final block is also opened ORDWR.
+		 */
+		b = getdblock(arena->part, a, m < blocksize ? ORDWR : OWRITE);
+		if(b == nil){
+			qunlock(&arena->lock);
+			return -1;
+		}
+		dirtydblock(b, DirtyArena);
+		memmove(&b->data[off], &clbuf[nn], m);
+		putdblock(b);
+		nn += m;
+		if(nn == n)
+			break;
+		off = 0;
+		a += blocksize;
+	}
+	qunlock(&arena->lock);
+	return n;
+}
+
+/*
+ * allocate space for the clump and write it,
+ * updating the arena directory
+ZZZ question: should this distinguish between an arena
+filling up and real errors writing the clump?
+ *
+ * c is packed into clbuf and written at the current end of the data
+ * (memstats.used).  returns that arena-relative address, or TWID64 if
+ * the arena is sealed or has no room (in which case it is marked sealed
+ * in memstats), the clump can't be packed, or a block can't be fetched.
+ * the arena lock is held throughout.
+ */
+u64int
+writeaclump(Arena *arena, Clump *c, u8int *clbuf)
+{
+ DBlock *b;
+ u64int a, aa;
+ u32int clump, n, nn, m, off, blocksize;
+ int ok;
+
+ n = c->info.size + ClumpSize + U32Size; /* bytes written: U32Size more than is later added to used */
+ qlock(&arena->lock);
+ aa = arena->memstats.used;
+ if(arena->memstats.sealed
+ || aa + n + U32Size + arenadirsize(arena, arena->memstats.clumps + 1) > arena->size){
+ if(!arena->memstats.sealed){
+ logerr(EOk, "seal memstats %s", arena->name);
+ arena->memstats.sealed = 1;
+ wbarena(arena);
+ }
+ qunlock(&arena->lock);
+ return TWID64;
+ }
+ if(packclump(c, &clbuf[0], arena->clumpmagic) < 0){
+ qunlock(&arena->lock);
+ return TWID64;
+ }
+
+ /*
+ * write the data out one block at a time
+ */
+ blocksize = arena->blocksize;
+ a = arena->base + aa;
+ off = a & (blocksize - 1);
+ a -= off;
+ nn = 0;
+ for(;;){
+ b = getdblock(arena->part, a, off != 0 ? ORDWR : OWRITE); /* only a block entered part way is opened ORDWR */
+ if(b == nil){
+ qunlock(&arena->lock);
+ return TWID64;
+ }
+ dirtydblock(b, DirtyArena);
+ m = blocksize - off;
+ if(m > n - nn)
+ m = n - nn;
+ memmove(&b->data[off], &clbuf[nn], m);
+ ok = 0; /* ok is never set to anything else, so the test below cannot fire */
+ putdblock(b);
+ if(ok < 0){
+ qunlock(&arena->lock);
+ return TWID64;
+ }
+ nn += m;
+ if(nn == n)
+ break;
+ off = 0;
+ a += blocksize;
+ }
+
+ arena->memstats.used += c->info.size + ClumpSize;
+ arena->memstats.uncsize += c->info.uncsize;
+ if(c->info.size < c->info.uncsize) /* stored smaller than its uncompressed size: count as compressed */
+ arena->memstats.cclumps++;
+
+ clump = arena->memstats.clumps; /* index of the clump just written */
+ if(clump % ArenaCIGSize == 0){ /* this clump starts a new clump info group */
+ if(arena->cig == nil){
+ loadcig(arena);
+ if(arena->cig == nil)
+ goto NoCIG;
+ }
+ /* add aa as start of next cig */
+ if(clump/ArenaCIGSize != arena->ncig){
+ fprint(2, "bad arena cig computation %s: writing clump %d but %d cigs\n",
+ arena->name, clump, arena->ncig);
+ arena->ncig = -1; /* loadcig returns at once while ncig is negative */
+ vtfree(arena->cig);
+ arena->cig = nil;
+ goto NoCIG;
+ }
+ arena->cig = vtrealloc(arena->cig, (arena->ncig+1)*sizeof arena->cig[0]);
+ arena->cig[arena->ncig++].offset = aa;
+ }
+NoCIG:
+ arena->memstats.clumps++;
+
+ if(arena->memstats.clumps == 0)
+ sysfatal("clumps wrapped");
+ arena->wtime = now();
+ if(arena->ctime == 0)
+ arena->ctime = arena->wtime;
+
+ writeclumpinfo(arena, clump, &c->info); /* NOTE(review): the results of this and the wbarena below are not checked */
+ wbarena(arena);
+
+ qunlock(&arena->lock);
+
+ return aa;
+}
+
+/*
+ * order two sets of arena tail statistics.
+ * returns -1, 0 or 1 as a sorts before, equal to or after b.
+ */
+int
+atailcmp(ATailStats *a, ATailStats *b)
+{
+	/* good test */
+	if(a->used != b->used)
+		return a->used < b->used ? -1 : 1;
+
+	/* suspect tests - why order this way? (no one cares) */
+	if(a->clumps != b->clumps)
+		return a->clumps < b->clumps ? -1 : 1;
+	if(a->cclumps != b->cclumps)
+		return a->cclumps < b->cclumps ? -1 : 1;
+	if(a->uncsize != b->uncsize)
+		return a->uncsize < b->uncsize ? -1 : 1;
+	if(a->sealed != b->sealed)
+		return a->sealed < b->sealed ? -1 : 1;
+
+	/* everything matches */
+	return 0;
+}
+
+void
+setatailstate(AState *as) /* bring the on-disk tail stats of the arenas up to and including as->arena up to date */
+{
+ int i, j, osealed;
+ Arena *a;
+ Index *ix;
+
+ trace(0, "setatailstate %s 0x%llux clumps %d", as->arena->name, as->aa, as->stats.clumps);
+
+ /*
+ * Look up as->arena to find index.
+ */
+ needmainindex(); /* OS X linker */
+ ix = mainindex;
+ for(i=0; i<ix->narenas; i++)
+ if(ix->arenas[i] == as->arena)
+ break;
+ if(i==ix->narenas || as->aa < ix->amap[i].start || as->aa >= ix->amap[i].stop || as->arena != ix->arenas[i]){ /* unknown arena, or as->aa outside that arena's index range */
+ fprint(2, "funny settailstate 0x%llux\n", as->aa);
+ return;
+ }
+
+ for(j=0; j<=i; j++){
+ a = ix->arenas[j];
+ if(atailcmp(&a->diskstats, &a->memstats) == 0) /* already in step; NOTE(review): compared before the lock is taken */
+ continue;
+ qlock(&a->lock);
+ osealed = a->diskstats.sealed;
+ if(j == i)
+ a->diskstats = as->stats; /* the named arena takes the state passed in */
+ else
+ a->diskstats = a->memstats; /* earlier arenas take their whole in-memory state */
+ wbarena(a);
+ if(a->diskstats.sealed != osealed && !a->inqueue) /* sealed flag changed and no checksum queued yet */
+ sealarena(a);
+ qunlock(&a->lock);
+ }
+}
+
+/*
+ * a sealed arena never has data added to it again;
+ * it should only be changed to repair errors.
+ * sealing marks the arena as queued and asks the
+ * checksum process to sum it.
+ */
+static void
+sealarena(Arena *arena)
+{
+	arena->inqueue = 1;
+	backsumarena(arena);
+}
+
+/*
+ * append arena to the queue read by sumproc and wake sumproc.
+ * does nothing if initarenasum has not set up sumwait.
+ */
+void
+backsumarena(Arena *arena)
+{
+	ASum *req;
+
+	if(sumwait.l == nil)
+		return;
+
+	req = MK(ASum);
+	if(req == nil)
+		return;
+
+	qlock(&sumlock);
+	req->arena = arena;
+	req->next = nil;
+	if(sumq == nil)
+		sumq = req;
+	else
+		sumqtail->next = req;
+	sumqtail = req;
+	rwakeup(&sumwait);
+	qunlock(&sumlock);
+}
+
+/*
+ * checksum process: forever take the oldest request off
+ * the queue, sleeping while it is empty, and sum that arena.
+ */
+static void
+sumproc(void *unused)
+{
+	ASum *req;
+	Arena *arena;
+
+	USED(unused);
+
+	for(;;){
+		qlock(&sumlock);
+		while((req = sumq) == nil)
+			rsleep(&sumwait);
+		sumq = req->next;
+		qunlock(&sumlock);
+
+		/* the request is freed before the (long) sum begins */
+		arena = req->arena;
+		free(req);
+
+		sumarena(arena);
+	}
+}
+
+void
+sumarena(Arena *arena) /* sha1 the arena from its header block through its trailer and store the score in the trailer */
+{
+ ZBlock *b;
+ DigestState s;
+ u64int a, e;
+ u32int bs;
+ int t;
+ u8int score[VtScoreSize];
+
+ bs = MaxIoSize; /* read size: the larger of MaxIoSize and one arena block */
+ if(bs < arena->blocksize)
+ bs = arena->blocksize;
+
+ /*
+ * read & sum all blocks except the last one
+ */
+ flushdcache();
+ memset(&s, 0, sizeof s);
+ b = alloczblock(bs, 0, arena->part->blocksize);
+ e = arena->base + arena->size; /* address of the trailer block */
+ for(a = arena->base - arena->blocksize; a + arena->blocksize <= e; a += bs){ /* starts at the header block */
+ disksched();
+ while((t=arenasumsleeptime) == SleepForever){ /* summing is paused; poll once a second */
+ sleep(1000);
+ disksched();
+ }
+ sleep(t);
+ if(a + bs > e) /* a full read would run into the trailer: finish one block at a time */
+ bs = arena->blocksize;
+ if(readpart(arena->part, a, b->data, bs) < 0)
+ goto ReadErr;
+ addstat(StatSumRead, 1);
+ addstat(StatSumReadBytes, bs);
+ sha1(b->data, bs, nil, &s);
+ }
+
+ /*
+ * the last one is special, since it may already have the checksum included
+ */
+ bs = arena->blocksize;
+ if(readpart(arena->part, e, b->data, bs) < 0){
+ReadErr:
+ logerr(EOk, "sumarena can't sum %s, read at %lld failed: %r", arena->name, a); /* NOTE(review): when the trailer read fails, a is whatever the loop left behind, not necessarily e */
+ freezblock(b);
+ return;
+ }
+ addstat(StatSumRead, 1);
+ addstat(StatSumReadBytes, bs);
+
+ sha1(b->data, bs-VtScoreSize, nil, &s); /* trailer without its final VtScoreSize bytes ... */
+ sha1(zeroscore, VtScoreSize, nil, &s); /* ... which are summed as if they held zeroscore */
+ sha1(nil, 0, score, &s);
+
+ /*
+ * check for no checksum or the same
+ */
+ if(scorecmp(score, &b->data[bs - VtScoreSize]) != 0
+ && scorecmp(zeroscore, &b->data[bs - VtScoreSize]) != 0)
+ logerr(EOk, "overwriting mismatched checksums for arena=%s, found=%V calculated=%V",
+ arena->name, &b->data[bs - VtScoreSize], score);
+ freezblock(b);
+
+ qlock(&arena->lock);
+ scorecp(arena->score, score);
+ wbarena(arena);
+ qunlock(&arena->lock);
+}
+
+/*
+ * write the arena trailer block to the partition.
+ * the block is packed from the Arena and its final VtScoreSize
+ * bytes are set to arena->score.  returns 0, or -1 if the block
+ * can't be fetched or the arena fails okarena or packarena
+ * (the score is still copied and the block released in that case).
+ */
+int
+wbarena(Arena *arena)
+{
+	DBlock *b;
+	int bad;
+
+	b = getdblock(arena->part, arena->base + arena->size, OWRITE);
+	if(b == nil){
+		logerr(EAdmin, "can't write arena trailer: %r");
+		return -1;
+	}
+	dirtydblock(b, DirtyArenaTrailer);
+
+	bad = 0;
+	if(okarena(arena) < 0 || packarena(arena, b->data) < 0)
+		bad = 1;
+	scorecp(b->data + arena->blocksize - VtScoreSize, arena->score);
+	putdblock(b);
+
+	return bad ? -1 : 0;
+}
+
+/*
+ * write the arena header block, the block just before arena->base,
+ * directly to the partition and flush it.
+ * returns 0, or -1 if the block can't be allocated, packed or written.
+ */
+int
+wbarenahead(Arena *arena)
+{
+	ZBlock *b;
+	ArenaHead head;
+	int bad;
+
+	namecp(head.name, arena->name);
+	head.version = arena->version;
+	/* the header records the size including header and trailer blocks */
+	head.size = arena->size + 2 * arena->blocksize;
+	head.blocksize = arena->blocksize;
+	head.clumpmagic = arena->clumpmagic;
+
+	b = alloczblock(arena->blocksize, 1, arena->part->blocksize);
+	if(b == nil){
+		logerr(EAdmin, "can't write arena header: %r");
+/* ZZZ add error message? */
+		return -1;
+	}
+
+	/*
+	 * writing with writepart is acceptable here
+	 * because it happens only during initialization.
+	 */
+	bad = 0;
+	if(packarenahead(&head, b->data) < 0
+	|| writepart(arena->part, arena->base - arena->blocksize, b->data, arena->blocksize) < 0
+	|| flushpart(arena->part) < 0)
+		bad = 1;
+	freezblock(b);
+
+	return bad ? -1 : 0;
+}
+
+/*
+ * read the arena header and trailer blocks from disk
+ *
+ * the trailer is authoritative: the Arena is unpacked from it and
+ * -1 is returned if it can't be read or unpacked or names an unknown
+ * version.  the header is only cross-checked against the trailer;
+ * an unreadable, corrupt or inconsistent header is logged and
+ * 0 is still returned.
+ */
+static int
+loadarena(Arena *arena)
+{
+	ArenaHead head;
+	ZBlock *b;
+
+	b = alloczblock(arena->blocksize, 0, arena->part->blocksize);
+	if(b == nil)
+		return -1;
+	if(readpart(arena->part, arena->base + arena->size, b->data, arena->blocksize) < 0){
+		freezblock(b);
+		return -1;
+	}
+	if(unpackarena(arena, b->data) < 0){
+		freezblock(b);
+		return -1;
+	}
+	if(arena->version != ArenaVersion4 && arena->version != ArenaVersion5){
+		seterr(EAdmin, "unknown arena version %d", arena->version);
+		freezblock(b);
+		return -1;
+	}
+	/* the score occupies the last VtScoreSize bytes of the trailer */
+	scorecp(arena->score, &b->data[arena->blocksize - VtScoreSize]);
+
+	if(readpart(arena->part, arena->base - arena->blocksize, b->data, arena->blocksize) < 0){
+		logerr(EAdmin, "can't read arena header: %r");
+		freezblock(b);
+		return 0;
+	}
+	if(unpackarenahead(&head, b->data) < 0)
+		logerr(ECorrupt, "corrupted arena header: %r");
+	else if(namecmp(arena->name, head.name)!=0
+	     || arena->clumpmagic != head.clumpmagic
+	     || arena->version != head.version
+	     || arena->blocksize != head.blocksize
+	     || arena->size + 2 * arena->blocksize != head.size){
+		/* report only the first field that disagrees */
+		if(namecmp(arena->name, head.name)!=0)
+			logerr(ECorrupt, "arena tail name %s head %s",
+				arena->name, head.name);
+		else if(arena->clumpmagic != head.clumpmagic)
+			logerr(ECorrupt, "arena %d tail clumpmagic 0x%lux head 0x%lux",
+				debugarena, (ulong)arena->clumpmagic,
+				(ulong)head.clumpmagic);
+		else if(arena->version != head.version)
+			logerr(ECorrupt, "arena tail version %d head version %d",
+				arena->version, head.version);
+		else if(arena->blocksize != head.blocksize)
+			logerr(ECorrupt, "arena tail block size %d head %d",
+				arena->blocksize, head.blocksize);
+		else if(arena->size+2*arena->blocksize != head.size)
+			/*
+			 * sizes are 64-bit: print them with %llud and
+			 * explicit casts so each argument matches its verb.
+			 */
+			logerr(ECorrupt, "arena tail size %llud head %llud",
+				(u64int)arena->size+2*arena->blocksize, (u64int)head.size);
+		else
+			logerr(ECorrupt, "arena header inconsistent with arena data");
+	}
+	freezblock(b);
+
+	return 0;
+}
+
+/*
+ * sanity-check the on-disk statistics of an arena.
+ * returns -1 (with the error set) if the used data plus the clump
+ * directory would not fit in the arena, 0 otherwise.  more
+ * compressed clumps than clumps is only logged.
+ */
+static int
+okarena(Arena *arena)
+{
+	u64int dirsize;
+	int bad;
+
+	dirsize = arenadirsize(arena, arena->diskstats.clumps);
+	bad = arena->diskstats.used + dirsize > arena->size;
+	if(bad)
+		seterr(ECorrupt, "arena %s used > size", arena->name);
+
+	if(arena->diskstats.cclumps > arena->diskstats.clumps)
+		logerr(ECorrupt, "arena %s has more compressed clumps than total clumps", arena->name);
+
+	/*
+	 * two further checks are deliberately not made:
+	 * uncompressed size against used space need not be
+	 * consistent if some of the disk is corrupted, and a
+	 * creation time after the last write time happens and
+	 * is harmless.
+	 */
+	return bad ? -1 : 0;
+}
+
+/*
+ * fetch the directory block holding the entry for clump.
+ * rock is caller-supplied storage that is filled in with the block
+ * number, the entry's byte offset within the block and the block
+ * itself, and is returned; release it with putcib.
+ * returns nil if clump is out of range or the block can't be fetched.
+ */
+static CIBlock*
+getcib(Arena *arena, int clump, int writing, CIBlock *rock)
+{
+	int mode;
+	u32int block, off;
+
+	if(clump >= arena->memstats.clumps){
+		seterr(EOk, "clump directory access out of range");
+		return nil;
+	}
+	block = clump / arena->clumpmax;
+	off = (clump - block * arena->clumpmax) * ClumpInfoSize;
+	rock->block = block;
+	rock->offset = off;
+
+	mode = OREAD;
+	if(writing){
+		/*
+		 * the newest clump landing in the first slot of a
+		 * block means nothing else in the block is in use,
+		 * so the block need not be read before writing.
+		 */
+		mode = ORDWR;
+		if(off == 0 && clump == arena->memstats.clumps-1)
+			mode = OWRITE;
+	}
+
+	/* directory blocks grow downward from the end of the arena */
+	rock->data = getdblock(arena->part,
+		arena->base + arena->size - (block + 1) * arena->blocksize, mode);
+	if(rock->data == nil)
+		return nil;
+	return rock;
+}
+
+/*
+ * release the directory block obtained with getcib.
+ */
+static void
+putcib(Arena *arena, CIBlock *cib)
+{
+	USED(arena);
+
+	putdblock(cib->data);
+	cib->data = nil;
+}
+
+
+/*
+ * For index entry readahead purposes, the arenas are
+ * broken into smaller subpieces, called clump info groups
+ * or cigs. Each cig has ArenaCIGSize clumps (ArenaCIGSize
+ * is chosen to make the index entries take up about half
+ * a megabyte). The index entries do not contain enough
+ * information to determine what the clump index is for
+ * a given address in an arena. That info is needed both for
+ * figuring out which clump group an address belongs to
+ * and for prefetching a clump group's index entries from
+ * the arena table of contents. The first time clump groups
+ * are accessed, we scan the entire arena table of contents
+ * (which might be 10s of megabytes), recording the data
+ * offset of each clump group.
+ */
+
+/*
+ * load clump info group information by scanning entire toc.
+ * on success arena->cig[i].offset is the data offset at which
+ * group i starts and arena->ncig is the number of groups.
+ * if a group other than the last comes up short, arena->cig stays
+ * nil and arena->ncig is set to -1, which makes later calls return
+ * immediately.
+ */
+static void
+loadcig(Arena *arena)
+{
+ u32int i, j, ncig, nci;
+ ArenaCIG *cig;
+ ClumpInfo *ci;
+ u64int offset;
+ int ms;
+
+ if(arena->cig || arena->ncig < 0) /* already loaded, or a previous load failed */
+ return;
+
+// fprint(2, "loadcig %s\n", arena->name);
+
+ ncig = (arena->memstats.clumps+ArenaCIGSize-1) / ArenaCIGSize; /* groups needed, rounding up */
+ if(ncig == 0){
+ arena->cig = vtmalloc(1); /* non-nil placeholder so an empty arena counts as loaded */
+ arena->ncig = 0;
+ return;
+ }
+
+ ms = msec();
+ cig = vtmalloc(ncig*sizeof cig[0]);
+ ci = vtmalloc(ArenaCIGSize*sizeof ci[0]);
+ offset = 0; /* running data offset of the next clump */
+ for(i=0; i<ncig; i++){
+ nci = readclumpinfos(arena, i*ArenaCIGSize, ci, ArenaCIGSize);
+ cig[i].offset = offset;
+ for(j=0; j<nci; j++)
+ offset += ClumpSize + ci[j].size;
+ if(nci < ArenaCIGSize){ /* a short group is only acceptable as the last one */
+ if(i != ncig-1){
+ vtfree(ci);
+ vtfree(cig);
+ arena->ncig = -1;
+ fprint(2, "loadcig %s: got %ud cigs, expected %ud\n", arena->name, i+1, ncig);
+ goto out;
+ }
+ }
+ }
+ vtfree(ci);
+
+ arena->ncig = ncig;
+ arena->cig = cig;
+
+out:
+ ms = msec() - ms; /* elapsed time, recorded for failed loads too */
+ addstat2(StatCigLoad, 1, StatCigLoadTime, ms);
+}
+
+/*
+ * convert arena address into arena group + data boundaries.
+ * on success *g is the last group whose starting offset is <= addr
+ * (group 0 if none of the later ones qualify), *gstart is that
+ * group's starting offset and *glimit is the next group's starting
+ * offset, or memstats.used for the final group.
+ * returns 0, or -1 if the group table can't be loaded or is empty.
+ */
+int
+arenatog(Arena *arena, u64int addr, u64int *gstart, u64int *glimit, int *g)
+{
+ int r, l, m;
+
+ qlock(&arena->lock);
+ if(arena->cig == nil)
+ loadcig(arena);
+ if(arena->cig == nil || arena->ncig == 0){
+ qunlock(&arena->lock);
+ return -1;
+ }
+
+ l = 1; /* the search covers groups 1..ncig-1; group 0 is the fallback */
+ r = arena->ncig - 1;
+ while(l <= r){
+ m = (r + l) / 2;
+ if(arena->cig[m].offset <= addr)
+ l = m + 1;
+ else
+ r = m - 1;
+ }
+ l--; /* l stopped one past the last group starting at or before addr */
+
+ *g = l;
+ *gstart = arena->cig[l].offset;
+ if(l+1 < arena->ncig)
+ *glimit = arena->cig[l+1].offset;
+ else
+ *glimit = arena->memstats.used;
+ qunlock(&arena->lock);
+ return 0;
+}
+
+/*
+ * load the clump info for group g into the index entries.
+ * entries must have room for at least ArenaCIGSize elements.
+ * clumps of type VtCorruptType produce no entry but still advance
+ * the address.  each ia.addr is counted from 0 at the first clump
+ * of the group -- presumably the caller adds the group's base
+ * address; verify against the caller.
+ * returns the number of entries filled in, or -1 if nentries is too
+ * small, the group table can't be loaded, or g is out of range.
+ */
+int
+asumload(Arena *arena, int g, IEntry *entries, int nentries)
+{
+ int i, base, limit;
+ u64int addr;
+ ClumpInfo ci;
+ IEntry *ie;
+
+ if(nentries < ArenaCIGSize){
+ fprint(2, "asking for too few entries\n");
+ return -1;
+ }
+
+ qlock(&arena->lock);
+ if(arena->cig == nil)
+ loadcig(arena);
+ if(arena->cig == nil || arena->ncig == 0 || g >= arena->ncig){
+ qunlock(&arena->lock);
+ return -1;
+ }
+
+ addr = 0;
+ base = g*ArenaCIGSize; /* index of the group's first clump */
+ limit = base + ArenaCIGSize;
+ if(base > arena->memstats.clumps)
+ base = arena->memstats.clumps;
+ ie = entries;
+ for(i=base; i<limit; i++){
+ if(readclumpinfo(arena, i, &ci) < 0) /* also ends the loop at the last existing clump */
+ break;
+ if(ci.type != VtCorruptType){
+ scorecp(ie->score, ci.score);
+ ie->ia.type = ci.type;
+ ie->ia.size = ci.uncsize;
+ ie->ia.blocks = (ci.size + ClumpSize + (1<<ABlockLog) - 1) >> ABlockLog; /* stored size in 1<<ABlockLog units, rounded up */
+ ie->ia.addr = addr;
+ ie++;
+ }
+ addr += ClumpSize + ci.size;
+ }
+ qunlock(&arena->lock);
+ return ie - entries;
+}
diff --git a/sys/src/cmd/venti/srv/arenas.c b/sys/src/cmd/venti/srv/arenas.c
new file mode 100755
index 000000000..0316c4c86
--- /dev/null
+++ b/sys/src/cmd/venti/srv/arenas.c
@@ -0,0 +1,420 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+typedef struct AHash AHash;
+
+/*
+ * hash table for finding arena's based on their names.
+ * each bucket is a singly linked chain of AHash entries.
+ */
+struct AHash
+{
+ AHash *next; /* next entry in the same bucket */
+ Arena *arena; /* the arena this entry stands for */
+};
+
+enum
+{
+ AHashSize = 512 /* bucket count; hashes are reduced with & (AHashSize - 1) */
+};
+
+static AHash *ahash[AHashSize]; /* bucket heads, indexed by hashstr(name) & (AHashSize - 1) */
+
+/*
+ * hash a NUL-terminated string to 32 bits.
+ */
+static u32int
+hashstr(char *s)
+{
+	u32int h;
+	int c, mix;
+
+	for(h = 0; *s != '\0'; s++){
+		c = *s;
+		mix = c ^ (c << 6);
+		h += (mix << 11) ^ (mix >> 1);
+		h ^= (c << 14) + (c << 7) + (c << 4) + c;
+	}
+	return h;
+}
+
+/*
+ * enter arena in the name hash table, at the front of its bucket.
+ * returns 0, or -1 if no entry can be allocated.
+ */
+int
+addarena(Arena *arena)
+{
+	AHash *e, **bucket;
+
+	bucket = &ahash[hashstr(arena->name) & (AHashSize - 1)];
+	e = MK(AHash);
+	if(e == nil)
+		return -1;
+	e->arena = arena;
+	e->next = *bucket;
+	*bucket = e;
+	return 0;
+}
+
+/*
+ * look up an arena by name in the hash table.
+ * returns nil if no arena has that name.
+ */
+Arena*
+findarena(char *name)
+{
+	AHash *e;
+
+	e = ahash[hashstr(name) & (AHashSize - 1)];
+	while(e != nil){
+		if(strcmp(e->arena->name, name) == 0)
+			return e->arena;
+		e = e->next;
+	}
+	return nil;
+}
+
+/*
+ * remove arena's entry from the name hash table.
+ * entries are matched by Arena pointer, within the bucket
+ * selected by the arena's name.
+ * returns 0 if an entry was removed, -1 if none was found.
+ */
+int
+delarena(Arena *arena)
+{
+	AHash *e, **link;
+
+	/* link always points at the pointer that refers to e */
+	link = &ahash[hashstr(arena->name) & (AHashSize - 1)];
+	for(; (e = *link) != nil; link = &e->next){
+		if(e->arena == arena){
+			*link = e->next;
+			free(e);
+			return 0;
+		}
+	}
+	return -1;
+}
+
+/*
+ * read an arena partition: unpack and validate its header, read
+ * the arena map, initialize every arena the map lists and enter
+ * them all in the name hash table.
+ * returns nil, with the error set, if anything is unreadable or
+ * inconsistent; nothing is left in the hash table in that case.
+ */
+ArenaPart*
+initarenapart(Part *part)
+{
+	AMapN amn;
+	ArenaPart *ap;
+	ZBlock *b;
+	u32int i;
+	int ok;
+
+	b = alloczblock(HeadSize, 0, 0);
+	if(b == nil){
+		seterr(EAdmin, "can't read arena partition header: %r");
+		return nil;
+	}
+	if(readpart(part, PartBlank, b->data, HeadSize) < 0){
+		/* set the error first so %r still describes the read, then release b */
+		seterr(EAdmin, "can't read arena partition header: %r");
+		freezblock(b);
+		return nil;
+	}
+
+	ap = MKZ(ArenaPart);
+	if(ap == nil){
+		freezblock(b);
+		return nil;
+	}
+	ap->part = part;
+	ok = unpackarenapart(ap, b->data);
+	freezblock(b);
+	if(ok < 0){
+		freearenapart(ap, 0);
+		return nil;
+	}
+
+	ap->tabbase = (PartBlank + HeadSize + ap->blocksize - 1) & ~(ap->blocksize - 1);
+	if(ap->version != ArenaPartVersion){
+		seterr(ECorrupt, "unknown arena partition version %d", ap->version);
+		freearenapart(ap, 0);
+		return nil;
+	}
+	/* 0 passes the x & (x-1) test but is not a usable block size */
+	if(ap->blocksize == 0 || (ap->blocksize & (ap->blocksize - 1))){
+		seterr(ECorrupt, "illegal non-power-of-2 block size %d\n", ap->blocksize);
+		freearenapart(ap, 0);
+		return nil;
+	}
+	if(ap->tabbase >= ap->arenabase){
+		seterr(ECorrupt, "arena partition table overlaps with arena storage");
+		freearenapart(ap, 0);
+		return nil;
+	}
+	ap->tabsize = ap->arenabase - ap->tabbase;
+	partblocksize(part, ap->blocksize);
+	ap->size = ap->part->size & ~(u64int)(ap->blocksize - 1);
+
+	if(readarenamap(&amn, part, ap->tabbase, ap->tabsize) < 0){
+		freearenapart(ap, 0);
+		return nil;
+	}
+	ap->narenas = amn.n;
+	ap->map = amn.map;
+	if(okamap(ap->map, ap->narenas, ap->arenabase, ap->size, "arena table") < 0){
+		freearenapart(ap, 0);
+		return nil;
+	}
+
+	ap->arenas = MKNZ(Arena*, ap->narenas);
+	for(i = 0; i < ap->narenas; i++){
+		debugarena = i;
+		ap->arenas[i] = initarena(part, ap->map[i].start, ap->map[i].stop - ap->map[i].start, ap->blocksize);
+		if(ap->arenas[i] == nil){
+			seterr(ECorrupt, "%s: %r", ap->map[i].name);
+			freearenapart(ap, 1);
+			return nil;
+		}
+		if(namecmp(ap->map[i].name, ap->arenas[i]->name) != 0){
+			seterr(ECorrupt, "arena name mismatches with expected name: %s vs. %s",
+				ap->map[i].name, ap->arenas[i]->name);
+			freearenapart(ap, 1);
+			return nil;
+		}
+		if(findarena(ap->arenas[i]->name)){
+			seterr(ECorrupt, "duplicate arena name %s in %s",
+				ap->map[i].name, ap->part->name);
+			freearenapart(ap, 1);
+			return nil;
+		}
+	}
+
+	/* the arenas are hashed only once all of them have been accepted */
+	for(i = 0; i < ap->narenas; i++) {
+		debugarena = i;
+		addarena(ap->arenas[i]);
+	}
+	debugarena = -1;
+
+	return ap;
+}
+
+/*
+ * format part as an empty arena partition with the given block
+ * size and room for an arena table of at least tabsize bytes,
+ * and write the new header and (empty) table.
+ * returns nil if blocksize is not a power of 2 or the write fails.
+ */
+ArenaPart*
+newarenapart(Part *part, u32int blocksize, u32int tabsize)
+{
+	ArenaPart *ap;
+	u32int mask;
+
+	mask = blocksize - 1;
+	if(blocksize & mask){
+		seterr(ECorrupt, "illegal non-power-of-2 block size %d\n", blocksize);
+		return nil;
+	}
+	if((ap = MKZ(ArenaPart)) == nil)
+		return nil;
+
+	ap->version = ArenaPartVersion;
+	ap->part = part;
+	ap->blocksize = blocksize;
+	partblocksize(part, blocksize);
+	ap->size = part->size & ~(u64int)mask;
+	/* table and arena storage both start on block boundaries */
+	ap->tabbase = (PartBlank + HeadSize + blocksize - 1) & ~mask;
+	ap->arenabase = (ap->tabbase + tabsize + blocksize - 1) & ~mask;
+	ap->tabsize = ap->arenabase - ap->tabbase;
+	ap->narenas = 0;
+
+	if(wbarenapart(ap) >= 0)
+		return ap;
+
+	freearenapart(ap, 0);
+	return nil;
+}
+
+/*
+ * write the arena partition header, then the arena table.
+ * returns 0, or -1 if the map fails okamap or anything can't be
+ * packed or written.
+ */
+int
+wbarenapart(ArenaPart *ap)
+{
+	ZBlock *b;
+	int bad;
+
+	if(okamap(ap->map, ap->narenas, ap->arenabase, ap->size, "arena table") < 0)
+		return -1;
+	b = alloczblock(HeadSize, 1, 0);
+	if(b == nil)
+/* ZZZ set error message? */
+		return -1;
+
+	bad = 1;
+	if(packarenapart(ap, b->data) < 0)
+		seterr(ECorrupt, "can't make arena partition header: %r");
+	else if(writepart(ap->part, PartBlank, b->data, HeadSize) < 0
+	     || flushpart(ap->part) < 0)
+		seterr(EAdmin, "can't write arena partition header: %r");
+	else
+		bad = 0;
+	freezblock(b);
+	if(bad)
+		return -1;
+
+	return wbarenamap(ap->map, ap->narenas, ap->part, ap->tabbase, ap->tabsize);
+}
+
+/*
+ * free an arena partition structure with its map and arena array.
+ * if freearenas is set, each non-nil arena is also removed from
+ * the name hash table and freed.  a nil ap is ignored.
+ */
+void
+freearenapart(ArenaPart *ap, int freearenas)
+{
+	Arena *a;
+	int i;
+
+	if(ap == nil)
+		return;
+	for(i = 0; freearenas && i < ap->narenas; i++){
+		a = ap->arenas[i];
+		if(a != nil){
+			delarena(a);
+			freearena(a);
+		}
+	}
+	free(ap->map);
+	free(ap->arenas);
+	free(ap);
+}
+
+/*
+ * check that the n ranges in am are each well formed, appear in
+ * ascending order without overlapping, and all lie in [start, stop].
+ * what names the table in error messages.
+ * returns 0 if the map is acceptable, -1 with the error set if not.
+ */
+int
+okamap(AMap *am, int n, u64int start, u64int stop, char *what)
+{
+	u64int prev;
+	u32int i;
+
+	/* prev is the lowest address the next range may start at */
+	prev = start;
+	for(i = 0; i < n; i++){
+		if(am[i].start < prev){
+			if(i != 0)
+				seterr(ECorrupt, "overlapping ranges in %s", what);
+			else
+				seterr(ECorrupt, "invalid start address in %s", what);
+			return -1;
+		}
+		if(am[i].stop < am[i].start){
+			seterr(ECorrupt, "invalid range in %s", what);
+			return -1;
+		}
+		prev = am[i].stop;
+	}
+	if(prev <= stop)
+		return 0;
+
+	seterr(ECorrupt, "invalid ending address in %s", what);
+	return -1;
+}
+
+/*
+ * fill arenas[0..n-1] with the arenas named by the map entries.
+ * what names the map's owner in the error message.
+ * returns 0, or -1 at the first name with no known arena.
+ */
+int
+maparenas(AMap *am, Arena **arenas, int n, char *what)
+{
+	u32int i;
+
+	for(i = 0; i < n; i++){
+		arenas[i] = findarena(am[i].name);
+		if(arenas[i] != nil)
+			continue;
+		seterr(EAdmin, "can't find arena '%s' for '%s'\n", am[i].name, what);
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ * read and parse the arena map stored in the size bytes at base
+ * of part.  returns what parseamap returns (0 with *amn filled in,
+ * or -1), or -1 if the table can't be opened.
+ */
+int
+readarenamap(AMapN *amn, Part *part, u64int base, u32int size)
+{
+	IFile f;
+	int ok;		/* signed: carries parseamap's -1 unchanged */
+
+	if(partifile(&f, part, base, size) < 0)
+		return -1;
+	ok = parseamap(&f, amn);
+	freeifile(&f);
+	return ok;
+}
+
+/*
+ * format the n map entries as text into a size-byte buffer and
+ * write it at base of part.
+ * returns 0, or -1 if the text doesn't fit or the write fails.
+ */
+int
+wbarenamap(AMap *am, int n, Part *part, u64int base, u64int size)
+{
+	Fmt f;
+	ZBlock *b;
+	int rv;
+
+	b = alloczblock(size, 1, part->blocksize);
+	if(b == nil)
+		return -1;
+
+	fmtzbinit(&f, b);
+
+	rv = -1;
+	if(outputamap(&f, am, n) < 0)
+		seterr(ECorrupt, "arena set size too small");
+	else if(writepart(part, base, b->data, size) < 0 || flushpart(part) < 0)
+		seterr(EAdmin, "can't write arena set: %r");
+	else
+		rv = 0;
+	freezblock(b);
+	return rv;
+}
+
+/*
+ * amap: n '\n' amapelem * n
+ * n: u32int
+ * amapelem: name '\t' astart '\t' astop '\n'
+ * astart, astop: u64int
+ *
+ * parse an arena map from f.  on success amn->map is a newly
+ * allocated array of amn->n entries and 0 is returned; on any
+ * error -1 is returned and nothing allocated here is left behind.
+ */
+int
+parseamap(IFile *f, AMapN *amn)
+{
+	AMap *am;
+	u64int v64;
+	u32int v;
+	char *s, *t, *flds[4];
+	int i, n;
+
+	/*
+	 * arenas
+	 */
+	if(ifileu32int(f, &v) < 0){
+		seterr(ECorrupt, "syntax error: bad number of elements in %s", f->name);
+		return -1;
+	}
+	/*
+	 * check the count while it is still unsigned: a value too
+	 * big for an int would otherwise turn negative and get by.
+	 */
+	if(v > MaxAMap){
+		seterr(ECorrupt, "illegal number of elements %ud in %s",
+			v, f->name);
+		return -1;
+	}
+	n = v;
+	am = MKNZ(AMap, n);
+	if(am == nil){
+		fprint(2, "out of memory\n");
+		return -1;
+	}
+	for(i = 0; i < n; i++){
+		s = ifileline(f);
+		/* getfields splits s in place, so keep a copy for the message */
+		if(s)
+			t = estrdup(s);
+		else
+			t = nil;
+		if(s == nil || getfields(s, flds, 4, 0, "\t") != 3){
+			fprint(2, "early eof after %d of %d, %s:#%d: %s\n", i, n, f->name, f->pos, t);
+			free(t);
+			goto Bad;
+		}
+		free(t);
+		if(nameok(flds[0]) < 0)
+			goto Bad;
+		namecp(am[i].name, flds[0]);
+		if(stru64int(flds[1], &v64) < 0){
+			seterr(ECorrupt, "syntax error: bad arena base address in %s", f->name);
+			goto Bad;
+		}
+		am[i].start = v64;
+		if(stru64int(flds[2], &v64) < 0){
+			seterr(ECorrupt, "syntax error: bad arena size in %s", f->name);
+			goto Bad;
+		}
+		am[i].stop = v64;
+	}
+
+	amn->map = am;
+	amn->n = n;
+	return 0;
+
+Bad:
+	/* every failure after the allocation releases the partial map */
+	free(am);
+	return -1;
+}
+
+/*
+ * print the map in the form parseamap reads: the count on a line
+ * of its own, then one "name<tab>start<tab>stop" line per entry.
+ * returns 0, or -1 as soon as a print fails.
+ */
+int
+outputamap(Fmt *f, AMap *am, int n)
+{
+	AMap *e;
+
+	if(fmtprint(f, "%ud\n", n) < 0)
+		return -1;
+	for(e = am; e < am + n; e++){
+		if(fmtprint(f, "%s\t%llud\t%llud\n", e->name, e->start, e->stop) < 0)
+			return -1;
+	}
+	return 0;
+}
diff --git a/sys/src/cmd/venti/srv/bloom.c b/sys/src/cmd/venti/srv/bloom.c
new file mode 100755
index 000000000..1db36bd92
--- /dev/null
+++ b/sys/src/cmd/venti/srv/bloom.c
@@ -0,0 +1,256 @@
+/*
+ * Bloom filter tracking which scores are present in our arenas
+ * and (more importantly) which are not.
+ */
+
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+int ignorebloom;
+
+/*
+ * initialize b for a filter of vsize bytes.  if data is non-nil the
+ * header is unpacked from it and data becomes the filter's storage.
+ * returns 0, or -1 if vsize doesn't fit in a ulong or the header
+ * can't be unpacked.
+ */
+int
+bloominit(Bloom *b, vlong vsize, u8int *data)
+{
+	ulong size;
+
+	size = vsize;
+	if(size != vsize){	/* truncation */
+		werrstr("bloom data too big");
+		return -1;
+	}
+
+	b->size = size;
+	b->nhash = 32;	/* will be fixed by caller on initialization */
+	if(data != nil && unpackbloomhead(b, data) < 0)
+		return -1;
+
+	/* mask that reduces a hash to a bit number within the filter */
+	b->bitmask = (b->size<<3) - 1;
+	b->data = data;
+	return 0;
+}
+
+/*
+ * pack the filter's header into the start of its data.
+ */
+void
+wbbloomhead(Bloom *b)
+{
+	packbloomhead(b, b->data);
+}
+
+/*
+ * allocate a Bloom for the filter stored on partition p and fill
+ * in its header fields from the first 512 bytes of the partition.
+ * the filter bits themselves are not loaded: b->data is left nil.
+ * returns nil if the header can't be read or parsed.
+ */
+Bloom*
+readbloom(Part *p)
+{
+	uchar buf[512];
+	Bloom *b;
+
+	b = vtmallocz(sizeof *b);
+	if(readpart(p, 0, buf, sizeof buf) < 0){
+		/* don't leak the Bloom when the header can't be read */
+		vtfree(b);
+		return nil;
+	}
+	/*
+	 * pass buf as b->data so that bloominit
+	 * can parse header.  won't be used for
+	 * accessing bits (cleared below).
+	 */
+	if(bloominit(b, 0, buf) < 0){
+		vtfree(b);
+		return nil;
+	}else{
+		/*
+		 * default block size is system page size.
+		 * the bloom filter is usually very big.
+		 * bump the block size up to speed i/o.
+		 */
+		if(p->blocksize < (1<<20)){
+			p->blocksize = 1<<20;
+			if(p->blocksize > p->size)
+				p->blocksize = p->size;
+		}
+	}
+	b->part = p;
+	b->data = nil;
+	return b;
+}
+
+/*
+ * give b a fresh all-zero bit array of b->size bytes and
+ * account for its bits in the statistics.  always returns 0.
+ */
+int
+resetbloom(Bloom *b)
+{
+	b->data = vtmallocz(b->size);
+
+	/* at MaxBloomSize the bit count 2^32 overflows ulong, so report one less */
+	addstat(StatBloomBits, b->size == MaxBloomSize ? b->size*8-1 : b->size*8);
+	return 0;
+}
+
+int
+loadbloom(Bloom *b) /* read all b->size bytes of the filter from b->part into new storage; 0 on success, -1 on read failure */
+{
+ int i, n;
+ uint ones;
+ uchar *data;
+ u32int *a;
+
+ data = vtmallocz(b->size);
+ if(readpart(b->part, 0, data, b->size) < 0){
+ vtfree(b); /* NOTE(review): frees the caller's Bloom, so b must not be touched after a -1 return -- confirm callers */
+ vtfree(data);
+ return -1;
+ }
+ b->data = data;
+
+ a = (u32int*)b->data;
+ n = b->size/4; /* number of 32-bit words in the filter */
+ ones = 0;
+ for(i=0; i<n; i++)
+ ones += countbits(a[i]); /* the header bytes at the start of data are counted too */
+ addstat(StatBloomOnes, ones);
+
+ if(b->size == MaxBloomSize) /* 2^32 overflows ulong */
+ addstat(StatBloomBits, b->size*8-1);
+ else
+ addstat(StatBloomBits, b->size*8);
+
+ return 0;
+}
+
+/*
+ * refresh the header inside the filter data, then write the
+ * whole filter to its partition and flush it.
+ * returns 0, or -1 if the write or the flush fails.
+ */
+int
+writebloom(Bloom *b)
+{
+	wbbloomhead(b);
+	if(writepart(b->part, 0, b->data, b->size) < 0
+	|| flushpart(b->part) < 0)
+		return -1;
+	return 0;
+}
+
+/*
+ * Derive two random 32-bit quantities a, b from the score
+ * and then use a+b*i as a sequence of bloom filter indices.
+ * Michael Mitzenmacher has a recent (2005) paper saying this is okay.
+ * We reserve the bottom bytes (BloomHeadSize*8 bits) for the header.
+ * h must have room for BloomMaxHash values; all of them are filled in.
+ * The first four bytes of the score are not used.
+ */
+static void
+gethashes(u8int *score, ulong *h)
+{
+ int i;
+ u32int a, b;
+
+ a = 0;
+ b = 0;
+ for(i=4; i+8<=VtScoreSize; i+=8){ /* fold in pairs of 32-bit words, starting at byte 4 */
+ a ^= *(u32int*)(score+i); /* NOTE(review): word loads straight from the byte array; value depends on host byte order, and score+i must be suitably aligned */
+ b ^= *(u32int*)(score+i+4);
+ }
+ if(i+4 <= VtScoreSize) /* 20 is not 4-aligned */
+ a ^= *(u32int*)(score+i);
+ for(i=0; i<BloomMaxHash; i++, a+=b)
+ h[i] = a < BloomHeadSize*8 ? BloomHeadSize*8 : a; /* indices that would land in the header bits are moved to the first bit after them */
+}
+
+/*
+ * set the b->nhash filter bits selected by score and add the
+ * number of bits that were not already set to StatBloomOnes.
+ * no locking is done here.
+ */
+static void
+_markbloomfilter(Bloom *b, u8int *score)
+{
+	int i, nnew;
+	ulong h[BloomMaxHash];
+	u32int x, *y, z, *tab;
+
+	trace("markbloomfilter", "markbloomfilter %V", score);
+	gethashes(score, h);
+	nnew = 0;
+	tab = (u32int*)b->data;
+	for(i=0; i<b->nhash; i++){
+		x = h[i];
+		y = &tab[(x&b->bitmask)>>5];	/* word holding the bit */
+		/* shift an unsigned 1 so that bit 31 is well defined */
+		z = (u32int)1<<(x&31);
+		if(!(*y&z)){
+			nnew++;
+			*y |= z;
+		}
+	}
+	if(nnew)
+		addstat(StatBloomOnes, nnew);
+
+	trace("markbloomfilter", "markbloomfilter exit");
+}
+
+/*
+ * return 1 if every one of the b->nhash filter bits selected by
+ * score is set, 0 as soon as one is found clear.
+ * no locking is done here.
+ */
+static int
+_inbloomfilter(Bloom *b, u8int *score)
+{
+	int i;
+	ulong h[BloomMaxHash], x;
+	u32int *tab;
+
+	gethashes(score, h);
+	tab = (u32int*)b->data;
+	for(i=0; i<b->nhash; i++){
+		x = h[i];
+		/* shift an unsigned 1 so that bit 31 is well defined */
+		if(!(tab[(x&b->bitmask)>>5] & ((u32int)1<<(x&31))))
+			return 0;
+	}
+	return 1;
+}
+
+/*
+ * report whether score may be present: 0 means the filter rules
+ * it out, 1 means it may be there.  without a loaded filter, or
+ * with ignorebloom set, the answer is always 1 and no statistics
+ * are recorded.
+ */
+int
+inbloomfilter(Bloom *b, u8int *score)
+{
+	int r;
+
+	if(b == nil || b->data == nil || ignorebloom)
+		return 1;
+
+	rlock(&b->lk);
+	r = _inbloomfilter(b, score);
+	runlock(&b->lk);
+
+	addstat(StatBloomLookup, 1);
+	/* a "hit" is a lookup the filter was able to rule out */
+	addstat(r ? StatBloomMiss : StatBloomHit, 1);
+	return r;
+}
+
+/*
+ * record score in the filter.  nothing is done
+ * if there is no filter or its bits aren't loaded.
+ */
+void
+markbloomfilter(Bloom *b, u8int *score)
+{
+	if(b != nil && b->data != nil){
+		/* read lock on lk, with mod serializing the bit updates */
+		rlock(&b->lk);
+		qlock(&b->mod);
+		_markbloomfilter(b, score);
+		qunlock(&b->mod);
+		runlock(&b->lk);
+	}
+}
+
+/*
+ * filter writing process: for every message on b->writechan write
+ * the filter out, then send the result (0, or writebloom's negative
+ * value) on b->writedonechan.
+ */
+static void
+bloomwriteproc(void *v)
+{
+	Bloom *b;
+	int ret;
+
+	threadsetname("bloomwriteproc");
+	b = v;
+	for(;;){
+		/* the message's value is discarded; only its arrival matters */
+		recv(b->writechan, 0);
+		ret = writebloom(b);
+		if(ret < 0)
+			fprint(2, "oops! writing bloom: %r\n");
+		else
+			ret = 0;
+		sendul(b->writedonechan, ret);
+	}
+}
+
+/*
+ * create the request and completion channels for b
+ * and start the process that writes the filter out.
+ */
+void
+startbloomproc(Bloom *b)
+{
+	b->writechan = chancreate(sizeof(void*), 0);
+	/*
+	 * bloomwriteproc answers with sendul, so the element
+	 * size must be that of a ulong, which need not be the
+	 * size of a pointer.
+	 */
+	b->writedonechan = chancreate(sizeof(ulong), 0);
+	vtproc(bloomwriteproc, b);
+}
diff --git a/sys/src/cmd/venti/srv/buildbuck.c b/sys/src/cmd/venti/srv/buildbuck.c
new file mode 100755
index 000000000..73f8056be
--- /dev/null
+++ b/sys/src/cmd/venti/srv/buildbuck.c
@@ -0,0 +1,132 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * An IEStream is a sorted list of index entries.
+ * Packed entries are read from part in chunks into buf;
+ * pos and epos delimit the bytes of buf not yet consumed.
+ */
+struct IEStream
+{
+ Part *part; /* partition the entries are read from */
+ u64int off; /* read position within part */
+ u64int n; /* number of valid ientries left to read */
+ u32int size; /* allocated space in buffer */
+ u8int *buf; /* buffer of size bytes, allocated by initiestream */
+ u8int *pos; /* current place in buffer */
+ u8int *epos; /* end of valid buffer contents */
+};
+
+/*
+ * Allocate a stream that will read clumps packed index entries
+ * from part starting at offset off, through a buffer of size bytes.
+ * The buffer starts out empty.  Neither allocation result is
+ * checked here.
+ */
+IEStream*
+initiestream(Part *part, u64int off, u64int clumps, u32int size)
+{
+	IEStream *s;
+	u8int *buf;
+
+	s = MKZ(IEStream);
+	buf = MKN(u8int, size);
+
+	s->part = part;
+	s->off = off;
+	s->n = clumps;
+	s->size = size;
+	s->buf = buf;
+	s->pos = buf;
+	s->epos = buf;
+	return s;
+}
+
+/*
+ * Release a stream and its buffer; a nil stream is ignored.
+ */
+void
+freeiestream(IEStream *ies)
+{
+	if(ies != nil){
+		free(ies->buf);
+		free(ies);
+	}
+}
+
+/*
+ * Return the next IEntry (still packed) in the stream.
+ * The entry is not consumed: the caller advances ies->pos and
+ * decrements ies->n itself.  When fewer than IEntrySize bytes
+ * remain buffered, the leftover is moved to the front of the
+ * buffer and more is read from the partition.
+ * Returns nil if the refill would read nothing, or if readpart
+ * fails (in which case seterr has been called).
+ */
+static u8int*
+peekientry(IEStream *ies)
+{
+ u32int n, nn;
+
+ n = ies->epos - ies->pos; /* unread bytes still buffered */
+ if(n < IEntrySize){
+ /* slide the partial entry to the start of the buffer */
+ memmove(ies->buf, ies->pos, n);
+ ies->epos = &ies->buf[n];
+ ies->pos = ies->buf;
+ /*
+ * read enough to fill the buffer, but no more than the
+ * bytes of the ies->n entries still expected, less the
+ * n bytes already present.
+ */
+ nn = ies->size;
+ if(nn > ies->n * IEntrySize)
+ nn = ies->n * IEntrySize;
+ nn -= n;
+ if(nn == 0)
+ return nil;
+//fprint(2, "peek %d from %llud into %p\n", nn, ies->off, ies->epos);
+ if(readpart(ies->part, ies->off, ies->epos, nn) < 0){
+ seterr(EOk, "can't read sorted index entries: %r");
+ return nil;
+ }
+ ies->epos += nn;
+ ies->off += nn;
+ }
+ return ies->pos;
+}
+
+/*
+ * Map a packed IEntry to its bucket number in index ix.
+ * Relies on the score being the first field of the packed
+ * form, so b can be hashed directly.  ib and ies are unused.
+ */
+static u32int
+iebuck(Index *ix, u8int *b, IBucket *ib, IEStream *ies)
+{
+	USED(ib);
+	USED(ies);
+
+	return hashbits(b, 32) / ix->div;
+}
+
+/*
+ * Fill ib with the next bucket in the stream.
+ * Consecutive entries are copied into ib->data for as long as
+ * they hash to the same bucket as the first one taken.
+ * Returns that bucket number.  Returns TWID32 if the stream has
+ * no entries left, if peekientry fails, or if the entries for
+ * the bucket would exceed maxdata bytes (seterr "bucket overflow").
+ */
+u32int
+buildbucket(Index *ix, IEStream *ies, IBucket *ib, uint maxdata)
+{
+ IEntry ie1, ie2;
+ u8int *b;
+ u32int buck;
+
+ buck = TWID32;
+ ib->n = 0;
+ while(ies->n){
+ b = peekientry(ies);
+ if(b == nil)
+ return TWID32;
+/* fprint(2, "b=%p ies->n=%lld ib.n=%d buck=%d score=%V\n", b, ies->n, ib->n, iebuck(ix, b, ib, ies), b); */
+ if(ib->n == 0)
+ buck = iebuck(ix, b, ib, ies); /* first entry decides the bucket */
+ else{
+ if(buck != iebuck(ix, b, ib, ies))
+ break; /* entry belongs to a later bucket; leave it in the stream */
+ if(ientrycmp(&ib->data[(ib->n - 1)* IEntrySize], b) == 0){
+ /*
+ * guess that the larger address is the correct one to use
+ */
+ unpackientry(&ie1, &ib->data[(ib->n - 1)* IEntrySize]);
+ unpackientry(&ie2, b);
+ seterr(EOk, "duplicate index entry for score=%V type=%d", ie1.score, ie1.ia.type);
+ /*
+ * drop the stored copy; if it had the larger address,
+ * overwrite the stream's copy with it so that the
+ * memmove below stores the survivor back in the same slot.
+ */
+ ib->n--;
+ if(ie1.ia.addr > ie2.ia.addr)
+ memmove(b, &ib->data[ib->n * IEntrySize], IEntrySize);
+ }
+ }
+ if((ib->n+1)*IEntrySize > maxdata){
+ seterr(EOk, "bucket overflow");
+ return TWID32;
+ }
+ memmove(&ib->data[ib->n * IEntrySize], b, IEntrySize);
+ ib->n++;
+ /* consume the entry from the stream */
+ ies->n--;
+ ies->pos += IEntrySize;
+ }
+ return buck;
+}
diff --git a/sys/src/cmd/venti/srv/buildindex.c b/sys/src/cmd/venti/srv/buildindex.c
new file mode 100755
index 000000000..2f24055c9
--- /dev/null
+++ b/sys/src/cmd/venti/srv/buildindex.c
@@ -0,0 +1,966 @@
+/*
+ * Rebuild the index from scratch, in place.
+ */
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+enum
+{
+ MinBufSize = 64*1024,
+ MaxBufSize = 4*1024*1024,
+};
+
+int dumb; /* -d: debugging, run all three passes */
+int errors; /* set to 1 when a read or write fails */
+char **isect; /* -i: names of index sections to rebuild; entries are set to nil once matched */
+int nisect; /* number of entries in isect */
+int bloom; /* rebuild the bloom filter too (-b, or implied when no -i is given) */
+int zero; /* when set, sortminibuffer zeros the unused buckets it skips; no option in this file sets it */
+
+u32int isectmem; /* memory budget per index section: imem/ix->nsects */
+u64int totalbuckets; /* sum of blocks over all index sections */
+u64int totalclumps; /* sum of diskstats.clumps over all arenas */
+Channel *arenadonechan; /* each arenapartproc sends its Part here when finished */
+Channel *isectdonechan; /* each isectproc sends its ISect here when finished */
+Index *ix; /* mainindex */
+
+u64int arenaentries; /* entries read from arenas (updated via add) */
+u64int skipentries; /* VtCorruptType entries skipped (updated via add) */
+u64int indexentries; /* entries received by index section procs (updated via add) */
+
+static int shouldprocess(ISect*);
+static void isectproc(void*);
+static void arenapartproc(void*);
+
+/*
+ * Print the command synopsis on standard error and exit all procs
+ * with status "usage".
+ */
+void
+usage(void)
+{
+ fprint(2, "usage: buildindex [-b] [-i isect]... [-M imem] venti.conf\n");
+ threadexitsall("usage");
+}
+
+/*
+ * Rebuild the index named by the venti configuration file.
+ *
+ * Starts one isectproc per index section selected for rebuilding
+ * and one arenapartproc per arena partition, waits for the arena
+ * procs, tells the section procs to finish by sending a zero entry,
+ * waits for them, then writes the bloom filter if one is in use.
+ *
+ * Options: -b rebuild bloom filter, -d debugging (all passes),
+ * -i isect restrict to named index sections, -M imem memory to use,
+ * -m maxdisks limit on concurrently running arena procs.
+ */
+void
+threadmain(int argc, char *argv[])
+{
+	int fd, i, napart, nfinish, maxdisks;
+	u32int bcmem, imem;
+	Config conf;
+	Part *p;
+
+	maxdisks = 100000;
+	ventifmtinstall();
+	imem = 256*1024*1024;
+	ARGBEGIN{
+	case 'b':
+		bloom = 1;
+		break;
+	case 'd':	/* debugging - make sure to run all 3 passes */
+		dumb = 1;
+		break;
+	case 'i':
+		isect = vtrealloc(isect, (nisect+1)*sizeof(isect[0]));
+		isect[nisect++] = EARGF(usage());
+		break;
+	case 'M':
+		imem = unittoull(EARGF(usage()));
+		break;
+	case 'm':	/* temporary - might go away */
+		maxdisks = atoi(EARGF(usage()));
+		break;
+	default:
+		usage();
+		break;
+	}ARGEND
+
+	if(argc != 1)
+		usage();
+
+	if(initventi(argv[0], &conf) < 0)
+		sysfatal("can't init venti: %r");
+	ix = mainindex;
+	if(nisect == 0 && ix->bloom)
+		bloom = 1;
+	if(bloom && ix->bloom && resetbloom(ix->bloom) < 0)
+		sysfatal("loadbloom: %r");
+	if(bloom && !ix->bloom)
+		sysfatal("-b specified but no bloom filter");
+	if(!bloom)
+		ix->bloom = nil;
+	isectmem = imem/ix->nsects;
+
+	/*
+	 * safety first - only need read access to arenas
+	 */
+	p = nil;
+	for(i=0; i<ix->narenas; i++){
+		if(ix->arenas[i]->part != p){
+			p = ix->arenas[i]->part;
+			if((fd = open(p->filename, OREAD)) < 0)
+				sysfatal("cannot reopen %s: %r", p->filename);
+			dup(fd, p->fd);
+			close(fd);
+		}
+	}
+
+	/*
+	 * need a block for every arena
+	 */
+	bcmem = maxblocksize * (mainindex->narenas + 16);
+	if(0) fprint(2, "initialize %d bytes of disk block cache\n", bcmem);
+	initdcache(bcmem);
+
+	totalclumps = 0;
+	for(i=0; i<ix->narenas; i++)
+		totalclumps += ix->arenas[i]->diskstats.clumps;
+
+	totalbuckets = 0;
+	for(i=0; i<ix->nsects; i++)
+		totalbuckets += ix->sects[i]->blocks;
+	fprint(2, "%,lld clumps, %,lld buckets\n", totalclumps, totalbuckets);
+
+	/* start index procs */
+	fprint(2, "%T read index\n");
+	isectdonechan = chancreate(sizeof(void*), 0);
+	for(i=0; i<ix->nsects; i++){
+		if(shouldprocess(ix->sects[i])){
+			ix->sects[i]->writechan = chancreate(sizeof(IEntry), 0);
+			vtproc(isectproc, ix->sects[i]);
+		}
+	}
+
+	/* shouldprocess nils out every -i name it matched */
+	for(i=0; i<nisect; i++)
+		if(isect[i])
+			fprint(2, "warning: did not find index section %s\n", isect[i]);
+
+	/*
+	 * start arena procs; once maxdisks are running, wait for
+	 * one to finish before starting the next.
+	 */
+	p = nil;
+	napart = 0;
+	nfinish = 0;
+	arenadonechan = chancreate(sizeof(void*), 0);
+	for(i=0; i<ix->narenas; i++){
+		if(ix->arenas[i]->part != p){
+			p = ix->arenas[i]->part;
+			vtproc(arenapartproc, p);
+			if(++napart >= maxdisks){
+				recvp(arenadonechan);
+				nfinish++;
+			}
+		}
+	}
+
+	/*
+	 * wait for the remaining arena procs to finish.
+	 * nfinish already counts the completions consumed above;
+	 * starting again from zero would wait for procs that have
+	 * already reported and never return.
+	 */
+	for(; nfinish<napart; nfinish++)
+		recvp(arenadonechan);
+
+	/* tell index procs to finish */
+	for(i=0; i<ix->nsects; i++)
+		if(ix->sects[i]->writechan)
+			send(ix->sects[i]->writechan, nil);
+
+	/* wait for index procs to finish */
+	for(i=0; i<ix->nsects; i++)
+		if(ix->sects[i]->writechan)
+			recvp(isectdonechan);
+
+	if(ix->bloom && writebloom(ix->bloom) < 0)
+		fprint(2, "writing bloom filter: %r\n");
+
+	fprint(2, "%T done arenaentries=%,lld indexed=%,lld (nskip=%,lld)\n",
+		arenaentries, indexentries, skipentries);
+	threadexitsall(nil);
+}
+
+/*
+ * Decide whether index section is should be rebuilt.
+ * With no -i options every section is processed.  Otherwise a
+ * section is processed only if its name is in the isect list;
+ * the matching list entry is set to nil so that names left over
+ * can be reported as not found.
+ */
+static int
+shouldprocess(ISect *is)
+{
+	char **np;
+
+	if(nisect == 0)
+		return 1;
+
+	for(np=isect; np<isect+nisect; np++){
+		if(*np == nil || strcmp(*np, is->name) != 0)
+			continue;
+		*np = nil;
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * Add n to the counter *a while holding a lock shared by all
+ * callers of add.
+ */
+static void
+add(u64int *a, u64int n)
+{
+	static Lock addlock;
+
+	lock(&addlock);
+	*a = *a + n;
+	unlock(&addlock);
+}
+
+/*
+ * Read through an arena partition and send each of its IEntries
+ * to the appropriate index section.  When finished, send on
+ * arenadonechan.
+ *
+ * v is the Part to scan; every arena of ix that lives on it is
+ * walked.  Entries of type VtCorruptType are counted and skipped.
+ * Each remaining entry is sent to its section's writechan, if that
+ * section is being rebuilt, and marked in the bloom filter, if one
+ * is in use.
+ */
+enum
+{
+	ClumpChunks = 32*1024,
+};
+static void
+arenapartproc(void *v)
+{
+	int i, j, n, nskip, x;
+	u32int clump;
+	u64int addr, tot;
+	Arena *a;
+	ClumpInfo *ci, *cis;
+	IEntry ie;
+	Part *p;
+
+	p = v;
+	threadsetname("arenaproc %s", p->name);
+
+	nskip = 0;
+	tot = 0;
+	cis = MKN(ClumpInfo, ClumpChunks);
+	for(i=0; i<ix->narenas; i++){
+		a = ix->arenas[i];
+		if(a->part != p)
+			continue;
+		if(a->memstats.clumps)
+			fprint(2, "%T arena %s: %d entries\n",
+				a->name, a->memstats.clumps);
+		/*
+		 * Running the loop backwards accesses the
+		 * clump info blocks forwards, since they are
+		 * stored in reverse order at the end of the arena.
+		 * This speeds things slightly.
+		 */
+		addr = ix->amap[i].start + a->memstats.used;
+		for(clump=a->memstats.clumps; clump > 0; clump-=n){
+			n = ClumpChunks;
+			if(n > clump)
+				n = clump;
+			if(readclumpinfos(a, clump-n, cis, n) != n){
+				fprint(2, "%T arena %s: directory read: %r\n", a->name);
+				errors = 1;
+				break;
+			}
+			for(j=n-1; j>=0; j--){
+				ci = &cis[j];
+				ie.ia.type = ci->type;
+				ie.ia.size = ci->uncsize;
+				/* walk addr back over this clump's header and data */
+				addr -= ci->size + ClumpSize;
+				ie.ia.addr = addr;
+				ie.ia.blocks = (ci->size + ClumpSize + (1<<ABlockLog)-1) >> ABlockLog;
+				scorecp(ie.score, ci->score);
+				if(ci->type == VtCorruptType)
+					nskip++;
+				else{
+					tot++;
+					x = indexsect(ix, ie.score);
+					assert(0 <= x && x < ix->nsects);
+					if(ix->sects[x]->writechan)
+						send(ix->sects[x]->writechan, &ie);
+					if(ix->bloom)
+						markbloomfilter(ix->bloom, ie.score);
+				}
+			}
+		}
+		/* after a complete walk addr is back at the arena's start */
+		if(addr != ix->amap[i].start)
+			fprint(2, "%T arena %s: clump miscalculation %lld != %lld\n", a->name, addr, ix->amap[i].start);
+	}
+	/* the clump info buffer is private to this proc; release it */
+	free(cis);
+	add(&arenaentries, tot);
+	add(&skipentries, nskip);
+	sendp(arenadonechan, p);
+}
+
+/*
+ * Convert score into a bucket number relative to the start of
+ * index section is.  A packed ientry may be passed in place of a
+ * score, since the score comes first in the packed form.
+ * The bucket must fall inside the section; if not, the details
+ * are printed and the assertion fails.
+ */
+static u32int
+score2bucket(ISect *is, uchar *score)
+{
+	u32int b;
+
+	b = hashbits(score, 32)/ix->div;
+	if(!(is->start <= b && b < is->stop))
+		fprint(2, "score2bucket: score=%V div=%d b=%ud start=%ud stop=%ud\n",
+			score, ix->div, b, is->start, is->stop);
+	assert(is->start <= b && b < is->stop);
+	return b - is->start;
+}
+
+/*
+ * Convert a byte offset within index section is into the
+ * section-relative number of the bucket containing it.
+ * The offset must lie within the section's buckets.
+ */
+static u32int
+offset2bucket(ISect *is, u64int offset)
+{
+	u32int b;
+
+	assert(is->blockbase <= offset);
+	b = (offset - is->blockbase)/is->blocksize;
+	assert(b < is->stop-is->start);
+	return b;
+}
+
+/*
+ * Convert a section-relative bucket number into its byte offset
+ * in index section is.  b may equal the number of buckets, which
+ * yields the offset just past the last bucket.
+ */
+static u64int
+bucket2offset(ISect *is, u32int b)
+{
+	u64int rel;
+
+	assert(b <= is->stop-is->start);
+	rel = (u64int)b*is->blocksize;
+	return is->blockbase + rel;
+}
+
+/*
+ * IEntry buffers to hold initial round of spraying.
+ * Each Buf owns one in-memory block [bp, ep) and a region
+ * [boffset, eoffset) of the partition; bflush writes the block
+ * at woffset and advances woffset by the block size.
+ */
+typedef struct Buf Buf;
+struct Buf
+{
+ Part *part; /* partition being written */
+ uchar *bp; /* current block */
+ uchar *ep; /* end of block */
+ uchar *wp; /* write position in block */
+ u64int boffset; /* start offset */
+ u64int woffset; /* next write offset */
+ u64int eoffset; /* end offset */
+ u32int nentry; /* number of entries written */
+};
+
+/*
+ * Write buf's whole in-memory block to the partition at woffset,
+ * then advance woffset and reset the block to empty (zeroed).
+ * It is fatal for woffset to have reached eoffset.  A failed
+ * write is reported and recorded in errors, but the buffer is
+ * still advanced and cleared.
+ */
+static void
+bflush(Buf *buf)
+{
+	u32int nbytes;
+
+	if(buf->woffset >= buf->eoffset)
+		sysfatal("buf index chunk overflow - need bigger index");
+
+	nbytes = buf->ep - buf->bp;
+	if(writepart(buf->part, buf->woffset, buf->bp, nbytes) < 0){
+		fprint(2, "write %s: %r\n", buf->part->name);
+		errors = 1;
+	}
+	buf->wp = buf->bp;
+	memset(buf->wp, 0, nbytes);
+	buf->woffset += nbytes;
+}
+
+/*
+ * Append the packed form of ie to buf, flushing the block first
+ * if it has no room for another entry.
+ */
+static void
+bwrite(Buf *buf, IEntry *ie)
+{
+	if(buf->ep - buf->wp < IEntrySize)
+		bflush(buf);
+
+	assert(buf->bp <= buf->wp && buf->wp < buf->ep);
+	packientry(ie, buf->wp);
+	buf->wp += IEntrySize;
+	assert(buf->bp <= buf->wp && buf->wp <= buf->ep);
+
+	buf->nentry++;
+}
+
+/*
+ * Minibuffer. In-memory data structure holds our place
+ * in the buffer but has no block data. We are writing and
+ * reading the minibuffers at the same time. (Careful!)
+ * ipoolloadblock advances roffset and decrements nentry;
+ * ipoolflush0 advances woffset and adds to nwentry.
+ */
+typedef struct Minibuf Minibuf;
+struct Minibuf
+{
+ u64int boffset; /* start offset */
+ u64int roffset; /* read offset */
+ u64int woffset; /* write offset */
+ u64int eoffset; /* end offset */
+ u32int nentry; /* # entries left to read */
+ u32int nwentry; /* # entries written */
+};
+
+/*
+ * Index entry pool. Used when trying to shuffle around
+ * the entries in a big buffer into the corresponding M minibuffers.
+ * Sized to hold M*EntriesPerBlock entries, so that there will always
+ * either be room in the pool for another block worth of entries
+ * or there will be an entire block worth of sorted entries to
+ * write out.
+ *
+ * mkipool allocates the pool, its links, the per-minibuf lists
+ * and the two block buffers in a single allocation.
+ */
+typedef struct IEntryLink IEntryLink;
+typedef struct IPool IPool;
+
+struct IEntryLink
+{
+ uchar ie[IEntrySize]; /* raw IEntry */
+ IEntryLink *next; /* next in chain */
+};
+
+struct IPool
+{
+ ISect *isect;
+ u32int buck0; /* first bucket in pool */
+ u32int mbufbuckets; /* buckets per minibuf */
+ IEntryLink *entry; /* all IEntryLinks */
+ u32int nentry; /* # of IEntryLinks */
+ IEntryLink *free; /* free list */
+ u32int nfree; /* # on free list */
+ Minibuf *mbuf; /* all minibufs */
+ u32int nmbuf; /* # of minibufs */
+ IEntryLink **mlist; /* lists for each minibuf */
+ u32int *mcount; /* # on each mlist[i] */
+ u32int bufsize; /* block buffer size */
+ uchar *rbuf; /* read buffer */
+ uchar *wbuf; /* write buffer */
+ u32int epbuf; /* entries per block buffer */
+};
+
+/*
+static int
+countsokay(IPool *p)
+{
+ int i;
+ u64int n;
+
+ n = 0;
+ for(i=0; i<p->nmbuf; i++)
+ n += p->mcount[i];
+ n += p->nfree;
+ if(n != p->nentry){
+ print("free %ud:", p->nfree);
+ for(i=0; i<p->nmbuf; i++)
+ print(" %ud", p->mcount[i]);
+ print(" = %lld nentry: %ud\n", n, p->nentry);
+ }
+ return n == p->nentry;
+}
+*/
+
+/*
+ * Allocate an entry pool for splitting one big buffer of index
+ * section isect into the nmbuf minibuffers described by mbuf,
+ * each covering mbufbuckets buckets, using bufsize-byte blocks.
+ *
+ * Everything lives in one zeroed allocation, laid out as:
+ *	IPool header
+ *	nentry IEntryLinks	(p->entry)
+ *	nmbuf list heads	(p->mlist)
+ *	nmbuf list counts	(p->mcount)
+ *	up to bufsize bytes of padding, so rbuf is bufsize-aligned
+ *	read block, write block	(p->rbuf, p->wbuf)
+ * The entry array holds IEntryLinks, so it is sized by
+ * sizeof(IEntryLink) rather than by the unrelated in-memory IEntry.
+ * All links start out on the free list.  The caller sets buck0
+ * and frees the pool with free.
+ */
+static IPool*
+mkipool(ISect *isect, Minibuf *mbuf, u32int nmbuf,
+	u32int mbufbuckets, u32int bufsize)
+{
+	u32int i, nentry;
+	uchar *data;
+	IPool *p;
+	IEntryLink *l;
+
+	nentry = (nmbuf+1)*bufsize / IEntrySize;
+	p = ezmalloc(sizeof(IPool)
+		+nentry*sizeof(IEntryLink)
+		+nmbuf*sizeof(IEntryLink*)
+		+nmbuf*sizeof(u32int)
+		+3*bufsize);
+
+	p->isect = isect;
+	p->mbufbuckets = mbufbuckets;
+	p->bufsize = bufsize;
+	p->entry = (IEntryLink*)(p+1);
+	p->nentry = nentry;
+	p->mlist = (IEntryLink**)(p->entry+nentry);
+	p->mcount = (u32int*)(p->mlist+nmbuf);
+	p->nmbuf = nmbuf;
+	p->mbuf = mbuf;
+	data = (uchar*)(p->mcount+nmbuf);
+	/* round up to the next bufsize boundary (skips 1..bufsize bytes) */
+	data += bufsize - (uintptr)data%bufsize;
+	p->rbuf = data;
+	p->wbuf = data+bufsize;
+	p->epbuf = bufsize/IEntrySize;
+
+	for(i=0; i<p->nentry; i++){
+		l = &p->entry[i];
+		l->next = p->free;
+		p->free = l;
+		p->nfree++;
+	}
+	return p;
+}
+
+/*
+ * Add the packed index entry ie to pool p, filing it on the
+ * list of the minibuffer that covers its bucket.
+ * The caller must know the free list is not empty.
+ */
+static void
+ipoolinsert(IPool *p, uchar *ie)
+{
+	u32int b, x;
+	IEntryLink *link;
+
+	assert(p->free != nil);
+
+	/* which minibuffer does the entry belong to? */
+	b = score2bucket(p->isect, ie);
+	x = (b-p->buck0) / p->mbufbuckets;
+	if(x >= p->nmbuf)
+		fprint(2, "buck=%ud mbufbucket=%ud x=%ud\n",
+			b, p->mbufbuckets, x);
+	assert(x < p->nmbuf);
+
+	/* take a link from the free list */
+	link = p->free;
+	p->free = link->next;
+	p->nfree--;
+
+	/* fill it in and push it on list x */
+	memmove(link->ie, ie, IEntrySize);
+	link->next = p->mlist[x];
+	p->mlist[x] = link;
+	p->mcount[x]++;
+}
+
+/*
+ * Move as many entries as will fit from minibuffer x's list
+ * into the pool's write block, returning their links to the
+ * free list.  The unused tail of the block is zeroed.
+ * Returns the number of entries placed in the block.
+ */
+static u32int
+ipoolgetbuf(IPool *p, u32int x)
+{
+	uchar *wp, *ep;
+	IEntryLink *l;
+	u32int n;
+
+	assert(x < p->nmbuf);
+
+	wp = p->wbuf;
+	ep = wp + p->bufsize;
+	n = 0;
+	while(wp+IEntrySize <= ep && (l = p->mlist[x]) != nil){
+		/* unlink from list x */
+		p->mlist[x] = l->next;
+		p->mcount[x]--;
+		memmove(wp, l->ie, IEntrySize);
+		/* return the link to the free list */
+		l->next = p->free;
+		p->free = l;
+		p->nfree++;
+		wp += IEntrySize;
+		n++;
+	}
+	memset(wp, 0, ep-wp);
+	return n;
+}
+
+/*
+ * Read the next block of minibuffer mb from disk and insert its
+ * entries (at most mb->nentry of them) into the pool.
+ * The caller must know the pool has room.  If the read fails the
+ * error is printed and nothing is inserted, but the read position
+ * and remaining-entry count are advanced all the same.
+ */
+static void
+ipoolloadblock(IPool *p, Minibuf *mb)
+{
+	u32int n;
+	uchar *ie, *end;
+
+	assert(mb->nentry > 0);
+	assert(mb->roffset >= mb->woffset);
+	assert(mb->roffset < mb->eoffset);
+
+	n = p->bufsize/IEntrySize;
+	if(mb->nentry < n)
+		n = mb->nentry;
+
+	if(readpart(p->isect->part, mb->roffset, p->rbuf, p->bufsize) >= 0){
+		end = p->rbuf + n*IEntrySize;
+		for(ie=p->rbuf; ie<end; ie+=IEntrySize)
+			ipoolinsert(p, ie);
+	}else
+		fprint(2, "readpart %s: %r\n", p->isect->part->name);
+
+	mb->roffset += p->bufsize;
+	mb->nentry -= n;
+}
+
+/*
+ * Write out a block worth of entries to minibuffer x.
+ * If necessary, pick up the data there before overwriting it.
+ *
+ * The block to write is assembled in pool->wbuf first.  If the
+ * minibuffer still has unread entries and its read position is
+ * the block about to be overwritten, that block is loaded into
+ * the pool (through pool->rbuf) before the write happens.
+ * A failed write is only reported; woffset advances regardless.
+ */
+static void
+ipoolflush0(IPool *pool, u32int x)
+{
+ u32int bufsize;
+ Minibuf *mb;
+
+ mb = pool->mbuf+x;
+ bufsize = pool->bufsize;
+ mb->nwentry += ipoolgetbuf(pool, x);
+ if(mb->nentry > 0 && mb->roffset == mb->woffset){
+ assert(pool->nfree >= pool->bufsize/IEntrySize);
+ /*
+ * There will be room in the pool -- we just
+ * removed a block worth.
+ */
+ ipoolloadblock(pool, mb);
+ }
+ if(writepart(pool->isect->part, mb->woffset, pool->wbuf, bufsize) < 0)
+ fprint(2, "writepart %s: %r\n", pool->isect->part->name);
+ mb->woffset += bufsize;
+}
+
+/*
+ * Write out one full block of entries, taken from the first
+ * minibuffer list that holds at least a block's worth.
+ * Called only when the pool is nearly full, so such a list must
+ * exist; not finding one is fatal.
+ */
+static void
+ipoolflush1(IPool *pool)
+{
+	u32int i;
+
+	assert(pool->nfree <= pool->epbuf);
+
+	i = 0;
+	while(i < pool->nmbuf && pool->mcount[i] < pool->epbuf)
+		i++;
+	if(i == pool->nmbuf){
+		/* can't be reached - someone must be full */
+		sysfatal("ipoolflush1");
+	}
+	ipoolflush0(pool, i);
+}
+
+/*
+ * Drain the pool: write out every entry still held on any
+ * minibuffer list, one block at a time.  Used once there is
+ * nothing more to read from disk.  Afterwards every link must
+ * be back on the free list.
+ */
+static void
+ipoolflush(IPool *pool)
+{
+	u32int i;
+
+	i = 0;
+	while(i < pool->nmbuf){
+		if(pool->mlist[i])
+			ipoolflush0(pool, i);
+		else
+			i++;
+	}
+	assert(pool->nfree == pool->nentry);
+}
+
+/*
+ * Third pass. Pick up each minibuffer from disk into
+ * memory and then write out the buckets.
+ */
+
+/*
+ * qsort comparison for two packed index entries.
+ * Entries are ordered by ientrycmp; entries that compare equal
+ * are ordered by the 8 bytes at IEntryAddrOff in descending
+ * order, so higher index addresses come first (duplicates are
+ * assumed to come from corruption at the lower addresses).
+ */
+static int
+ientrycmpaddr(const void *va, const void *vb)
+{
+	uchar *a, *b;
+	int c;
+
+	a = (uchar*)va;
+	b = (uchar*)vb;
+	c = ientrycmp(a, b);
+	if(c == 0)
+		c = -memcmp(a+IEntryAddrOff, b+IEntryAddrOff, 8);
+	return c;
+}
+
+/*
+ * Write zeros to partition p over the byte range [o, e),
+ * at most MaxIoSize bytes per write.  Write failures are
+ * reported and the loop carries on with the next piece.
+ */
+static void
+zerorange(Part *p, u64int o, u64int e)
+{
+	static uchar zeros[MaxIoSize];
+	u32int n;
+
+	while(o < e){
+		n = sizeof zeros;
+		if(e-o < n)
+			n = e-o;
+		if(writepart(p, o, zeros, n) < 0)
+			fprint(2, "writepart %s: %r\n", p->name);
+		o += n;
+	}
+}
+
+/*
+ * Load a minibuffer into memory and write out the
+ * corresponding buckets.
+ *
+ * is is the index section, mb the minibuffer to process, buf a
+ * scratch area of nbuf bytes large enough for everything written
+ * to the minibuffer, and bufsize the block size the minibuffer was
+ * written with.  Does nothing when the minibuffer is empty; a
+ * failed read is reported, recorded in errors, and abandons the
+ * minibuffer.
+ *
+ * The bucket block is allocated only after those early exits, so
+ * that neither of them leaks it.
+ */
+static void
+sortminibuffer(ISect *is, Minibuf *mb, uchar *buf, u32int nbuf, u32int bufsize)
+{
+	uchar *buckdata, *p, *q, *ep;
+	u32int b, lastb, memsize, n;
+	u64int o;
+	IBucket ib;
+	Part *part;
+
+	part = is->part;
+
+	if(mb->nwentry == 0)
+		return;
+
+	/*
+	 * read entire buffer.
+	 */
+	assert(mb->nwentry*IEntrySize <= mb->woffset-mb->boffset);
+	assert(mb->woffset-mb->boffset <= nbuf);
+	if(readpart(part, mb->boffset, buf, mb->woffset-mb->boffset) < 0){
+		fprint(2, "readpart %s: %r\n", part->name);
+		errors = 1;
+		return;
+	}
+	assert(*(uint*)buf != 0xa5a5a5a5);
+
+	/*
+	 * remove fragmentation due to IEntrySize
+	 * not evenly dividing Bufsize
+	 */
+	memsize = (bufsize/IEntrySize)*IEntrySize;
+	for(o=mb->boffset, p=q=buf; o<mb->woffset; o+=bufsize){
+		memmove(p, q, memsize);
+		p += memsize;
+		q += bufsize;
+	}
+	ep = buf + mb->nwentry*IEntrySize;
+	assert(ep <= buf+nbuf);
+
+	/*
+	 * sort entries
+	 */
+	qsort(buf, mb->nwentry, IEntrySize, ientrycmpaddr);
+
+	/*
+	 * write buckets out
+	 */
+	buckdata = emalloc(is->blocksize);
+	n = 0;
+	lastb = offset2bucket(is, mb->boffset);
+	for(p=buf; p<ep; p=q){
+		/* [p, q) is the run of entries belonging to bucket b */
+		b = score2bucket(is, p);
+		for(q=p; q<ep && score2bucket(is, q)==b; q+=IEntrySize)
+			;
+		if(lastb+1 < b && zero)
+			zerorange(part, bucket2offset(is, lastb+1), bucket2offset(is, b));
+		if(IBucketSize+(q-p) > is->blocksize)
+			sysfatal("bucket overflow - make index bigger");
+		memmove(buckdata+IBucketSize, p, q-p);
+		ib.n = (q-p)/IEntrySize;
+		n += ib.n;
+		packibucket(&ib, buckdata, is->bucketmagic);
+		if(writepart(part, bucket2offset(is, b), buckdata, is->blocksize) < 0)
+			fprint(2, "write %s: %r\n", part->name);
+		lastb = b;
+	}
+	if(lastb+1 < is->stop-is->start && zero)
+		zerorange(part, bucket2offset(is, lastb+1), bucket2offset(is, is->stop - is->start));
+
+	if(n != mb->nwentry)
+		fprint(2, "sortminibuffer bug: n=%ud nwentry=%ud have=%ld\n", n, mb->nwentry, (ep-buf)/IEntrySize);
+
+	free(buckdata);
+}
+
+/*
+ * Proc body for one index section (v is the ISect).
+ * Receives IEntries on is->writechan until an entry with a zero
+ * address arrives, spraying them into nbuf large sequential
+ * buffers on the index disk.  Each big buffer is then split into
+ * nminibuf minibuffers (skipped when there is only one), and each
+ * minibuffer is sorted and written out as buckets.
+ * Sends is on isectdonechan when finished.
+ */
+static void
+isectproc(void *v)
+{
+ u32int buck, bufbuckets, bufsize, epbuf, i, j;
+ u32int mbufbuckets, n, nbucket, nn, space;
+ u32int nbuf, nminibuf, xminiclump, prod;
+ u64int blocksize, offset, xclump;
+ uchar *data, *p;
+ Buf *buf;
+ IEntry ie;
+ IPool *ipool;
+ ISect *is;
+ Minibuf *mbuf, *mb;
+
+ is = v;
+ blocksize = is->blocksize;
+ nbucket = is->stop - is->start;
+
+ /*
+ * Three passes:
+ * pass 1 - write index entries from arenas into
+ * large sequential sections on index disk.
+ * requires nbuf * bufsize memory.
+ *
+ * pass 2 - split each section into minibufs.
+ * requires nminibuf * bufsize memory.
+ *
+ * pass 3 - read each minibuf into memory and
+ * write buckets out.
+ * requires entries/minibuf * IEntrySize memory.
+ *
+ * The larger we set bufsize the less seeking hurts us.
+ *
+ * The fewer sections and minibufs we have, the less
+ * seeking hurts us.
+ *
+ * The fewer sections and minibufs we have, the
+ * more entries we end up with in each minibuf
+ * at the end.
+ *
+ * Shoot for using half our memory to hold each
+ * minibuf. The chance of a random distribution
+ * getting off by 2x is quite low.
+ *
+ * Once that is decided, figure out the smallest
+ * nminibuf and nsection/biggest bufsize we can use
+ * and still fit in the memory constraints.
+ */
+
+ /* expected number of clump index entries we'll see */
+ xclump = nbucket * (double)totalclumps/totalbuckets;
+
+ /* number of clumps we want to see in a minibuf */
+ xminiclump = isectmem/2/IEntrySize;
+
+ /* total number of minibufs we need */
+ prod = (xclump+xminiclump-1) / xminiclump;
+
+ /* if possible, skip second pass */
+ if(!dumb && prod*MinBufSize < isectmem){
+ nbuf = prod;
+ nminibuf = 1;
+ }else{
+ /* otherwise use nsection = sqrt(nmini) */
+ for(nbuf=1; nbuf*nbuf<prod; nbuf++)
+ ;
+ if(nbuf*MinBufSize > isectmem)
+ sysfatal("not enough memory");
+ nminibuf = nbuf;
+ }
+ /*
+ * nbuf is zero only when prod is, i.e. no entries are expected.
+ * NOTE(review): this path leaves via exits without sending on
+ * isectdonechan or draining is->writechan -- confirm that
+ * threadmain cannot be left waiting on this proc.
+ */
+ if (nbuf == 0) {
+ fprint(2, "%s: brand-new index, no work to do\n", argv0);
+ exits(0);
+ }
+
+ /* size buffer to use extra memory */
+ bufsize = MinBufSize;
+ while(bufsize*2*nbuf <= isectmem && bufsize < MaxBufSize)
+ bufsize *= 2;
+ data = emalloc(nbuf*bufsize);
+ epbuf = bufsize/IEntrySize;
+ fprint(2, "%T %s: %,ud buckets, %,ud groups, %,ud minigroups, %,ud buffer\n",
+ is->part->name, nbucket, nbuf, nminibuf, bufsize);
+ /*
+ * Accept index entries from arena procs.
+ * Big buffer i covers bufbuckets consecutive buckets and the
+ * matching range of the section on disk; the last one runs to
+ * the end of the section.
+ */
+ buf = MKNZ(Buf, nbuf);
+ p = data;
+ offset = is->blockbase;
+ bufbuckets = (nbucket+nbuf-1)/nbuf;
+ for(i=0; i<nbuf; i++){
+ buf[i].part = is->part;
+ buf[i].bp = p;
+ buf[i].wp = p;
+ p += bufsize;
+ buf[i].ep = p;
+ buf[i].boffset = offset;
+ buf[i].woffset = offset;
+ if(i < nbuf-1){
+ offset += bufbuckets*blocksize;
+ buf[i].eoffset = offset;
+ }else{
+ offset = is->blockbase + nbucket*blocksize;
+ buf[i].eoffset = offset;
+ }
+ }
+ assert(p == data+nbuf*bufsize);
+
+ n = 0;
+ while(recv(is->writechan, &ie) == 1){
+ /* a zero address marks the end of input */
+ if(ie.ia.addr == 0)
+ break;
+ buck = score2bucket(is, ie.score);
+ i = buck/bufbuckets;
+ assert(i < nbuf);
+ bwrite(&buf[i], &ie);
+ n++;
+ }
+ add(&indexentries, n);
+
+ /* flush the final partial blocks; the in-memory blocks are no longer needed */
+ nn = 0;
+ for(i=0; i<nbuf; i++){
+ bflush(&buf[i]);
+ buf[i].bp = nil;
+ buf[i].ep = nil;
+ buf[i].wp = nil;
+ nn += buf[i].nentry;
+ }
+ if(n != nn)
+ fprint(2, "isectproc bug: n=%ud nn=%ud\n", n, nn);
+
+ free(data);
+
+ fprint(2, "%T %s: reordering\n", is->part->name);
+
+ /*
+ * Rearrange entries into minibuffers and then
+ * split each minibuffer into buckets.
+ * The minibuffer must be sized so that it is
+ * a multiple of blocksize -- ipoolloadblock assumes
+ * that each minibuf starts aligned on a blocksize
+ * boundary.
+ */
+ mbuf = MKN(Minibuf, nminibuf);
+ mbufbuckets = (bufbuckets+nminibuf-1)/nminibuf;
+ while(mbufbuckets*blocksize % bufsize)
+ mbufbuckets++;
+ for(i=0; i<nbuf; i++){
+ /*
+ * Set up descriptors.
+ * The entries written to big buffer i are dealt out to the
+ * minibuffers in disk order, each taking as many as its
+ * byte range can hold.
+ */
+ n = buf[i].nentry;
+ nn = 0;
+ offset = buf[i].boffset;
+ memset(mbuf, 0, nminibuf*sizeof(mbuf[0]));
+ for(j=0; j<nminibuf; j++){
+ mb = &mbuf[j];
+ mb->boffset = offset;
+ offset += mbufbuckets*blocksize;
+ if(offset > buf[i].eoffset)
+ offset = buf[i].eoffset;
+ mb->eoffset = offset;
+ mb->roffset = mb->boffset;
+ mb->woffset = mb->boffset;
+ mb->nentry = epbuf * (mb->eoffset - mb->boffset)/bufsize;
+ if(mb->nentry > buf[i].nentry)
+ mb->nentry = buf[i].nentry;
+ buf[i].nentry -= mb->nentry;
+ nn += mb->nentry;
+ }
+ if(n != nn)
+ fprint(2, "isectproc bug2: n=%ud nn=%ud (i=%d)\n", n, nn, i);;
+ /*
+ * Rearrange.
+ * With a single minibuffer the big buffer already is the
+ * minibuffer, so only its counters need filling in.
+ */
+ if(!dumb && nminibuf == 1){
+ mbuf[0].nwentry = mbuf[0].nentry;
+ mbuf[0].woffset = buf[i].woffset;
+ }else{
+ ipool = mkipool(is, mbuf, nminibuf, mbufbuckets, bufsize);
+ ipool->buck0 = bufbuckets*i;
+ for(j=0; j<nminibuf; j++){
+ mb = &mbuf[j];
+ while(mb->nentry > 0){
+ if(ipool->nfree < epbuf){
+ ipoolflush1(ipool);
+ /* ipoolflush1 might change mb->nentry */
+ continue;
+ }
+ assert(ipool->nfree >= epbuf);
+ ipoolloadblock(ipool, mb);
+ }
+ }
+ ipoolflush(ipool);
+ nn = 0;
+ for(j=0; j<nminibuf; j++)
+ nn += mbuf[j].nwentry;
+ if(n != nn)
+ fprint(2, "isectproc bug3: n=%ud nn=%ud (i=%d)\n", n, nn, i);
+ free(ipool);
+ }
+
+ /*
+ * Make buckets.
+ * The scratch area is sized for the largest minibuffer.
+ */
+ space = 0;
+ for(j=0; j<nminibuf; j++)
+ if(space < mbuf[j].woffset - mbuf[j].boffset)
+ space = mbuf[j].woffset - mbuf[j].boffset;
+
+ data = emalloc(space);
+ for(j=0; j<nminibuf; j++){
+ mb = &mbuf[j];
+ sortminibuffer(is, mb, data, space, bufsize);
+ }
+ free(data);
+ }
+
+ sendp(isectdonechan, is);
+}
+
+
+
diff --git a/sys/src/cmd/venti/srv/checkarenas.c b/sys/src/cmd/venti/srv/checkarenas.c
new file mode 100755
index 000000000..4ad03a297
--- /dev/null
+++ b/sys/src/cmd/venti/srv/checkarenas.c
@@ -0,0 +1,139 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+static int verbose;
+
+/*
+ * Check one arena by running syncarena over it until it no longer
+ * reports SyncHeader.  With scan set, the in-memory statistics are
+ * cleared first so that they are recomputed from scratch.
+ * If the statistics differ afterwards from what they were on entry,
+ * the arena header is reported as incorrect.  When any problem was
+ * found and fix is set, the corrected header is written back.
+ */
+static void
+checkarena(Arena *arena, int scan, int fix)
+{
+	ATailStats saved;
+	int status, bits;
+
+	if(verbose && arena->memstats.clumps)
+		printarena(2, arena);
+
+	saved = arena->memstats;
+
+	if(scan){
+		arena->memstats.used = 0;
+		arena->memstats.clumps = 0;
+		arena->memstats.cclumps = 0;
+		arena->memstats.uncsize = 0;
+	}
+
+	status = 0;
+	do{
+		bits = syncarena(arena, 1000, 0, fix);
+		status |= bits;
+		/* a dot for every round that asks for another */
+		if((bits & SyncHeader) && verbose && arena->memstats.clumps)
+			fprint(2, ".");
+	}while(bits & SyncHeader);
+	if(verbose && arena->memstats.clumps)
+		fprint(2, "\n");
+
+	/* SyncHeader is decided by the comparison below, not by syncarena */
+	status &= ~SyncHeader;
+	if(arena->memstats.used != saved.used
+	|| arena->memstats.clumps != saved.clumps
+	|| arena->memstats.cclumps != saved.cclumps
+	|| arena->memstats.uncsize != saved.uncsize){
+		fprint(2, "%s: incorrect arena header fields\n", arena->name);
+		printarena(2, arena);
+		status |= SyncHeader;
+	}
+
+	if(status != 0 && fix){
+		fprint(2, "%s: writing fixed arena header fields\n", arena->name);
+		arena->diskstats = arena->memstats;
+		if(wbarena(arena) < 0)
+			fprint(2, "arena header write failed: %r\n");
+		flushdcache();
+	}
+}
+
+/*
+ * Print the command synopsis on standard error and exit all procs.
+ */
+void
+usage(void)
+{
+ fprint(2, "usage: checkarenas [-afv] file [arenaname...]\n");
+ threadexitsall(0);
+}
+
+/*
+ * Report whether the arena called name was selected on the
+ * command line: 1 if no names were given at all (argc == 0) or
+ * if name equals one of the argc strings in argv, 0 otherwise.
+ */
+int
+should(char *name, int argc, char **argv)
+{
+	char **ap;
+
+	if(argc == 0)
+		return 1;
+	for(ap=argv; ap<argv+argc; ap++){
+		if(strcmp(name, *ap) == 0)
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * checkarenas [-afv] file [arenaname...]
+ * Open the arena partition in file and check each arena in it,
+ * or only the named ones.
+ * -f write corrected headers back (otherwise the partition is
+ * opened read-only and readonly is set)
+ * -a passed to checkarena as scan
+ * -v increase verbosity; at more than 1 the partition and
+ * statistics are printed as well
+ */
+void
+threadmain(int argc, char *argv[])
+{
+ ArenaPart *ap;
+ Part *part;
+ char *file;
+ int i, fix, scan;
+
+ ventifmtinstall();
+ statsinit();
+
+ fix = 0;
+ scan = 0;
+ ARGBEGIN{
+ case 'f':
+ fix++;
+ break;
+ case 'a':
+ scan = 1;
+ break;
+ case 'v':
+ verbose++;
+ break;
+ default:
+ usage();
+ break;
+ }ARGEND
+
+ if(!fix)
+ readonly = 1;
+
+ if(argc < 1)
+ usage();
+
+ /* first argument is the partition; the rest are arena names */
+ file = argv[0];
+ argc--;
+ argv++;
+
+ part = initpart(file, (fix ? ORDWR : OREAD)|ODIRECT);
+ if(part == nil)
+ sysfatal("can't open partition %s: %r", file);
+
+ ap = initarenapart(part);
+ if(ap == nil)
+ sysfatal("can't initialize arena partition in %s: %r", file);
+
+ if(verbose > 1){
+ printarenapart(2, ap);
+ fprint(2, "\n");
+ }
+
+ initdcache(8 * MaxDiskBlock);
+
+ for(i = 0; i < ap->narenas; i++)
+ if(should(ap->arenas[i]->name, argc, argv)) {
+ debugarena = i;
+ checkarena(ap->arenas[i], scan, fix);
+ }
+
+ if(verbose > 1)
+ printstats();
+ threadexitsall(0);
+}
diff --git a/sys/src/cmd/venti/srv/checkindex.c b/sys/src/cmd/venti/srv/checkindex.c
new file mode 100755
index 000000000..ca9557302
--- /dev/null
+++ b/sys/src/cmd/venti/srv/checkindex.c
@@ -0,0 +1,295 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+static int extra, missing, wrong;
+
+/*
+ * Print the header line for a block that has differences:
+ * the partition name and block address of eb.  The very first
+ * call also prints the column legend.
+ */
+static void
+phdr(DBlock *eb)
+{
+	static int printed;
+
+	if(printed == 0){
+		print("# diff actual correct\n");
+		printed = 1;
+	}
+	print("%s block 0x%llux\n", eb->part->name, eb->addr);
+}
+
+/*
+ * Print one index entry as a diff line: the marker character c,
+ * then the entry's score, address, type, size and block count.
+ */
+static void
+pie(IEntry *ie, char c)
+{
+ print("%c %V %22lld %3d %5d %3d\n",
+ c, ie->score, ie->ia.addr, ie->ia.type, ie->ia.size, ie->ia.blocks);
+}
+
+/*
+ * Compare the expected contents ib of bucket buck against what is
+ * stored in the index, printing a diff of the entries:
+ *	'<' marks an entry found on disk, '>' the expected entry.
+ * Entries on disk but not expected count as extra, expected
+ * entries not on disk as missing, and entries present in both
+ * with different addresses as wrong.
+ * Returns 0 if no entry is extra or missing, -1 otherwise or when
+ * the bucket cannot be located or read.  Address mismatches are
+ * counted in wrong but do not change the return value.
+ *
+ * Entries left over on disk after the expected list is exhausted
+ * are counted in extra as well, so that the summary agrees with
+ * the lines printed.
+ */
+static int
+checkbucket(Index *ix, u32int buck, IBucket *ib)
+{
+	ISect *is;
+	DBlock *eb;
+	IBucket eib;
+	IEntry ie, eie;
+	int i, ei, ok, c, hdr;
+
+	is = ix->sects[indexsect0(ix, buck)];
+	if(buck < is->start || buck >= is->stop){
+		seterr(EAdmin, "cannot find index section for bucket %lud\n", (ulong)buck);
+		return -1;
+	}
+	buck -= is->start;
+	eb = getdblock(is->part, is->blockbase + ((u64int)buck << is->blocklog), OREAD);
+	if(eb == nil)
+		return -1;
+	unpackibucket(&eib, eb->data, is->bucketmagic);
+
+	ok = 0;
+	ei = 0;
+	hdr = 0;
+	for(i = 0; i < ib->n; i++){
+		/* walk the on-disk entries up to expected entry i */
+		while(ei < eib.n){
+			c = ientrycmp(&ib->data[i * IEntrySize], &eib.data[ei * IEntrySize]);
+			if(c == 0){
+				unpackientry(&ie, &ib->data[i * IEntrySize]);
+				unpackientry(&eie, &eib.data[ei * IEntrySize]);
+				if(iaddrcmp(&ie.ia, &eie.ia) != 0){
+					if(!hdr){
+						phdr(eb);
+						hdr = 1;
+					}
+					wrong++;
+					pie(&eie, '<');
+					pie(&ie, '>');
+				}
+				ei++;
+				goto cont;
+			}
+			if(c < 0)
+				break;
+			/* on-disk entry sorts before the expected one: spurious */
+			if(!hdr){
+				phdr(eb);
+				hdr = 1;
+			}
+			unpackientry(&eie, &eib.data[ei*IEntrySize]);
+			extra++;
+			pie(&eie, '<');
+			ei++;
+			ok = -1;
+		}
+		/* expected entry i was not found on disk */
+		if(!hdr){
+			phdr(eb);
+			hdr = 1;
+		}
+		unpackientry(&ie, &ib->data[i*IEntrySize]);
+		missing++;
+		pie(&ie, '>');
+		ok = -1;
+	cont:;
+	}
+	/* whatever is left on disk is spurious too */
+	for(; ei < eib.n; ei++){
+		if(!hdr){
+			phdr(eb);
+			hdr = 1;
+		}
+		unpackientry(&eie, &eib.data[ei*IEntrySize]);
+		extra++;
+		pie(&eie, '<');
+		ok = -1;
+	}
+	putdblock(eb);
+	return ok;
+}
+
+/*
+ * Check index ix against the sorted list of clumps packed index
+ * entries stored in part at offset off.  Buckets are rebuilt one
+ * at a time from the list and compared with the index by
+ * checkbucket.  With zero set, every bucket that has no entries
+ * in the list is also checked to be empty on disk.
+ * Returns 0 if everything matched and -1 otherwise; when buckets
+ * differed, the error string is set to a summary of the counts.
+ * The summary is set on both the zero and the non-zero path, so
+ * the caller never prints a stale error string.
+ */
+int
+checkindex(Index *ix, Part *part, u64int off, u64int clumps, int zero)
+{
+	IEStream *ies;
+	IBucket ib, zib;
+	ZBlock *z, *b;
+	u32int next, buck;
+	int ok, bok;
+
+	/* ZZZ make buffer size configurable */
+	b = alloczblock(ix->blocksize, 0, ix->blocksize);
+	z = alloczblock(ix->blocksize, 1, ix->blocksize);
+	ies = initiestream(part, off, clumps, 64*1024);
+	if(b == nil || z == nil || ies == nil){
+		werrstr("allocating: %r");
+		ok = -1;
+		goto out;
+	}
+	ok = 0;
+	next = 0;
+	memset(&ib, 0, sizeof ib);
+	ib.data = b->data;
+	/* zib is an empty bucket, the expected content of unused buckets */
+	zib.data = z->data;
+	zib.n = 0;
+	zib.buck = 0;
+	for(;;){
+		buck = buildbucket(ix, ies, &ib, ix->blocksize-IBucketSize);
+		if(zero){
+			/* buckets in [next, buck) have no entries in the list */
+			for(; next != buck; next++){
+				if(next == ix->buckets){
+					if(buck != TWID32){
+						ok = -1;
+						werrstr("internal error: bucket out of range");
+					}
+					if(ok < 0)
+						werrstr("%d spurious entries, %d missing, %d wrong", extra, missing, wrong);
+					goto out;
+				}
+				bok = checkbucket(ix, next, &zib);
+				if(bok < 0)
+					ok = -1;
+			}
+		}
+		if(buck >= ix->buckets){
+			if(buck == TWID32)
+				break;
+			werrstr("internal error: bucket out of range");
+			ok = -1;
+			goto out;
+		}
+		bok = checkbucket(ix, buck, &ib);
+		if(bok < 0)
+			ok = -1;
+		next = buck + 1;
+	}
+	/* end of list reached without the zero pass: summarize here instead */
+	if(ok < 0)
+		werrstr("%d spurious entries, %d missing, %d wrong", extra, missing, wrong);
+out:
+	freeiestream(ies);
+	freezblock(z);
+	freezblock(b);
+	return ok;
+}
+
+/*
+ * Compare bloom filter b1 (as loaded) with b2 (as rebuilt).
+ * Both nil is fine; exactly one nil is an error.  After writing
+ * both headers into their data, the headers must match.  The bit
+ * tables are then compared word by word and the number of bits
+ * set only in b1 (spurious) and only in b2 (missing) is reported.
+ * Without fix, missing bits are an error and spurious bits are
+ * tolerated.  With fix, any difference causes b2's data to be
+ * copied over b1's and written out.
+ * Returns 0 on success, -1 on error (or the writebloom result).
+ */
+int
+checkbloom(Bloom *b1, Bloom *b2, int fix)
+{
+	u32int *a1, *a2;
+	int i, nword, spurious, lost;
+
+	if(b1 == nil || b2 == nil){
+		if(b1 == b2)
+			return 0;
+		werrstr("nil/non-nil");
+		return -1;
+	}
+
+	wbbloomhead(b1);
+	wbbloomhead(b2);
+	if(memcmp(b1->data, b2->data, BloomHeadSize) != 0){
+		werrstr("bloom header mismatch");
+		return -1;
+	}
+
+	a1 = (u32int*)b1->data;
+	a2 = (u32int*)b2->data;
+	nword = b1->size/4;
+	spurious = 0;
+	lost = 0;
+	for(i=BloomHeadSize/4; i<nword; i++){
+		if(a1[i] == a2[i])
+			continue;
+		spurious += countbits(a1[i] & ~a2[i]);
+		lost += countbits(a2[i] & ~a1[i]);
+	}
+
+	if(spurious == 0 && lost == 0)
+		fprint(2, "bloom filter: correct\n");
+	else
+		fprint(2, "bloom filter: %d spurious bits, %d missing bits\n",
+			spurious, lost);
+
+	if(fix){
+		if(lost || spurious){
+			memmove(b1->data, b2->data, b1->size);
+			return writebloom(b1);
+		}
+		return 0;
+	}
+	if(lost){
+		werrstr("missing bits");
+		return -1;
+	}
+	return 0;
+}
+
+
+/*
+ * Print the command synopsis on standard error and exit all procs.
+ */
+void
+usage(void)
+{
+ fprint(2, "usage: checkindex [-f] [-B blockcachesize] config tmp\n");
+ threadexitsall(0);
+}
+
+Config conf;
+
+/*
+ * checkindex [-f] [-B blockcachesize] config tmp
+ * Build a sorted list of all index entries from the arenas into
+ * the temporary partition tmp, then compare the index and the
+ * bloom filter against it.
+ *	-B	size of the disk block cache (raised to a minimum if too small)
+ *	-f	fix the bloom filter; without it readonly is set
+ *	-Z	do not check that unused buckets are empty
+ * A missing argument to -B is a usage error.
+ */
+void
+threadmain(int argc, char *argv[])
+{
+	Bloom *oldbloom, *newbloom;
+	Part *part;
+	u64int clumps, base;
+	u32int bcmem;
+	int fix, skipz, ok;
+
+	fix = 0;
+	bcmem = 0;
+	skipz = 0;
+	ARGBEGIN{
+	case 'B':
+		/* EARGF, not ARGF: never hand unittoull a nil argument */
+		bcmem = unittoull(EARGF(usage()));
+		break;
+	case 'f':
+		fix++;
+		break;
+	case 'Z':
+		skipz = 1;
+		break;
+	default:
+		usage();
+		break;
+	}ARGEND
+
+	if(argc != 2)
+		usage();
+
+	ventifmtinstall();
+
+	part = initpart(argv[1], ORDWR|ODIRECT);
+	if(part == nil)
+		sysfatal("can't initialize temporary partition: %r");
+
+	if(!fix)
+		readonly = 1;
+
+	if(initventi(argv[0], &conf) < 0)
+		sysfatal("can't init venti: %r");
+	if(mainindex->bloom && loadbloom(mainindex->bloom) < 0)
+		sysfatal("can't load bloom filter: %r");
+	/* a second filter of the same size is filled in while sorting */
+	oldbloom = mainindex->bloom;
+	newbloom = nil;
+	if(oldbloom){
+		newbloom = vtmallocz(sizeof *newbloom);
+		bloominit(newbloom, oldbloom->size, nil);
+		newbloom->data = vtmallocz(oldbloom->size);
+	}
+	if(bcmem < maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16))
+		bcmem = maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16);
+	if(0) fprint(2, "initialize %d bytes of disk block cache\n", bcmem);
+	initdcache(bcmem);
+
+	fprint(2, "checkindex: building entry list\n");
+	clumps = sortrawientries(mainindex, part, &base, newbloom);
+	if(clumps == TWID64)
+		sysfatal("can't build sorted index: %r");
+	fprint(2, "checkindex: checking %lld entries at %lld\n", clumps, base);
+	ok = 0;
+	if(checkindex(mainindex, part, base, clumps, !skipz) < 0){
+		fprint(2, "checkindex: %r\n");
+		ok = -1;
+	}
+	if(checkbloom(oldbloom, newbloom, fix) < 0){
+		fprint(2, "checkbloom: %r\n");
+		ok = -1;
+	}
+	if(ok < 0)
+		sysfatal("errors found");
+	fprint(2, "checkindex: index is correct\n");
+	threadexitsall(0);
+}
diff --git a/sys/src/cmd/venti/srv/clump.c b/sys/src/cmd/venti/srv/clump.c
new file mode 100755
index 000000000..ed4de34d9
--- /dev/null
+++ b/sys/src/cmd/venti/srv/clump.c
@@ -0,0 +1,225 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+#include "whack.h"
+
+/*
+ * Write a lump to disk. Updates ia with an index address
+ * for the newly-written lump. Upon return, the lump will
+ * have been placed in the disk cache but will likely not be on disk yet.
+ */
+int
+storeclump(Index *ix, ZBlock *zb, u8int *sc, int type, u32int creator, IAddr *ia)
+{
+	ZBlock *packed;
+	Clump cl;
+	u64int addr;
+	u8int check[VtScoreSize];
+	u8int *payload;
+	int size, dsize;
+
+	trace(TraceLump, "storeclump enter", sc, type);
+
+	/* reject lumps that cannot be represented on disk */
+	size = zb->len;
+	if(size > VtMaxLumpSize){
+		seterr(EStrange, "lump too large");
+		return -1;
+	}
+	if(vttypevalid(type) < 0){
+		seterr(EStrange, "invalid lump type");
+		return -1;
+	}
+
+	/* disabled paranoia: recompute the score before storing */
+	if(0){
+		scoremem(check, zb->data, size);
+		if(scorecmp(sc, check) != 0){
+			seterr(ECorrupt, "storing clump: corrupted; expected=%V got=%V, size=%d", sc, check, size);
+			return -1;
+		}
+	}
+
+	/* room for the clump header, the data, and a trailing U32Size word */
+	packed = alloczblock(size + ClumpSize + U32Size, 0, 0);
+	if(packed == nil)
+		return -1;
+	payload = &packed->data[ClumpSize];
+
+	cl.info.type = type;
+	cl.info.uncsize = size;
+	cl.creator = creator;
+	cl.time = now();
+	scorecp(cl.info.score, sc);
+
+	trace(TraceLump, "storeclump whackblock");
+	dsize = whackblock(payload, zb->data, size);
+	if(dsize > size){
+		fprint(2, "whack error: dsize=%d size=%d\n", dsize, size);
+		abort();
+	}
+	if(dsize > 0 && dsize < size)
+		cl.encoding = ClumpECompress;
+	else{
+		/* compression did not shrink the data: store it verbatim */
+		cl.encoding = ClumpENone;
+		dsize = size;
+		memmove(payload, zb->data, size);
+	}
+	memset(payload + dsize, 0, 4);
+	cl.info.size = dsize;
+
+	addr = writeiclump(ix, &cl, packed->data);
+	trace(TraceLump, "storeclump exit %lld", addr);
+	freezblock(packed);
+	if(addr == TWID64)
+		return -1;
+
+	/* report where the clump went; blocks is the on-disk size rounded up to arena blocks */
+	ia->addr = addr;
+	ia->type = type;
+	ia->size = size;
+	ia->blocks = (dsize + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog;
+
+	return 0;
+}
+
+/*
+ * Read U32Size bytes at arena address aa and return them decoded as a
+ * magic number.  Returns TWID32 when readarena() returns TWID32.
+ */
+u32int
+clumpmagic(Arena *arena, u64int aa)
+{
+	u8int raw[U32Size];
+
+	if(readarena(arena, aa, raw, U32Size) != TWID32)
+		return unpackmagic(raw);
+	return TWID32;
+}
+
+/*
+ * fetch a block based at addr.
+ * score is filled in with the block's score.
+ * blocks is roughly the length of the clump on disk;
+ * if zero, the length is unknown.
+ *
+ * cl is filled in with the unpacked clump header.
+ * If verify is set, the score of the decoded data is recomputed and
+ * compared with the header, and the lump type is validated.
+ * Returns a newly allocated ZBlock holding the uncompressed data,
+ * or nil on failure; nothing allocated here is left behind on failure.
+ */
+ZBlock*
+loadclump(Arena *arena, u64int aa, int blocks, Clump *cl, u8int *score, int verify)
+{
+ Unwhack uw;
+ ZBlock *zb, *cb;
+ u8int bh[VtScoreSize], *buf;
+ u32int n;
+ int nunc;
+
+/*
+ qlock(&stats.lock);
+ stats.clumpreads++;
+ qunlock(&stats.lock);
+*/
+
+ /* unknown length: start with a single arena block */
+ if(blocks <= 0)
+ blocks = 1;
+
+ trace(TraceLump, "loadclump enter");
+
+ cb = alloczblock(blocks << ABlockLog, 0, 0);
+ if(cb == nil)
+ return nil;
+ /*
+ * NOTE(review): clumpmagic() treats a TWID32 return from readarena
+ * as failure; here only a short count is checked - confirm which
+ * error convention readarena actually uses.
+ */
+ n = readarena(arena, aa, cb->data, blocks << ABlockLog);
+ if(n < ClumpSize){
+ /* n == 0 leaves whatever error is already set untouched */
+ if(n != 0)
+ seterr(ECorrupt, "loadclump read less than a header");
+ freezblock(cb);
+ return nil;
+ }
+ trace(TraceLump, "loadclump unpack");
+ if(unpackclump(cl, cb->data, arena->clumpmagic) < 0){
+ seterr(ECorrupt, "loadclump %s %llud: %r", arena->name, aa);
+ freezblock(cb);
+ return nil;
+ }
+ if(cl->info.type == VtCorruptType){
+ seterr(EOk, "clump is marked corrupt");
+ freezblock(cb);
+ return nil;
+ }
+ /* n now counts the data bytes read after the header */
+ n -= ClumpSize;
+ if(n < cl->info.size){
+ /* first read was too short: reread just the data, exactly sized */
+ freezblock(cb);
+ n = cl->info.size;
+ cb = alloczblock(n, 0, 0);
+ if(cb == nil)
+ return nil;
+ if(readarena(arena, aa + ClumpSize, cb->data, n) != n){
+ seterr(ECorrupt, "loadclump read too little data");
+ freezblock(cb);
+ return nil;
+ }
+ buf = cb->data;
+ }else
+ buf = cb->data + ClumpSize;
+
+ scorecp(score, cl->info.score);
+
+ zb = alloczblock(cl->info.uncsize, 0, 0);
+ if(zb == nil){
+ freezblock(cb);
+ return nil;
+ }
+ switch(cl->encoding){
+ case ClumpECompress:
+ trace(TraceLump, "loadclump decompress");
+ unwhackinit(&uw);
+ nunc = unwhack(&uw, zb->data, cl->info.uncsize, buf, cl->info.size);
+ if(nunc != cl->info.uncsize){
+ if(nunc < 0)
+ seterr(ECorrupt, "decompression of %llud failed: %s", aa, uw.err);
+ else
+ seterr(ECorrupt, "decompression of %llud gave partial block: %d/%d\n", aa, nunc, cl->info.uncsize);
+ freezblock(cb);
+ freezblock(zb);
+ return nil;
+ }
+ break;
+ case ClumpENone:
+ if(cl->info.size != cl->info.uncsize){
+ seterr(ECorrupt, "loading clump: bad uncompressed size for uncompressed block %llud", aa);
+ freezblock(cb);
+ freezblock(zb);
+ return nil;
+ }
+ /* a mismatch here only records an error; the load continues */
+ scoremem(bh, buf, cl->info.uncsize);
+ if(scorecmp(cl->info.score, bh) != 0)
+ seterr(ECorrupt, "pre-copy sha1 wrong at %s %llud: expected=%V got=%V", arena->name, aa, cl->info.score, bh);
+ memmove(zb->data, buf, cl->info.uncsize);
+ break;
+ default:
+ seterr(ECorrupt, "unknown encoding in loadlump %llud", aa);
+ freezblock(cb);
+ freezblock(zb);
+ return nil;
+ }
+ freezblock(cb);
+
+ if(verify){
+ trace(TraceLump, "loadclump verify");
+ scoremem(bh, zb->data, cl->info.uncsize);
+ if(scorecmp(cl->info.score, bh) != 0){
+ seterr(ECorrupt, "loading clump: corrupted at %s %llud; expected=%V got=%V", arena->name, aa, cl->info.score, bh);
+ freezblock(zb);
+ return nil;
+ }
+ if(vttypevalid(cl->info.type) < 0){
+ seterr(ECorrupt, "loading lump at %s %llud: invalid lump type %d", arena->name, aa, cl->info.type);
+ freezblock(zb);
+ return nil;
+ }
+ }
+
+ trace(TraceLump, "loadclump exit");
+/*
+ qlock(&stats.lock);
+ stats.clumpbreads += cl->info.size;
+ stats.clumpbuncomp += cl->info.uncsize;
+ qunlock(&stats.lock);
+*/
+ return zb;
+}
diff --git a/sys/src/cmd/venti/srv/clumpstats.c b/sys/src/cmd/venti/srv/clumpstats.c
new file mode 100755
index 000000000..d2cfe251c
--- /dev/null
+++ b/sys/src/cmd/venti/srv/clumpstats.c
@@ -0,0 +1,127 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+/* count[size][type]: clumps seen with that uncompressed size and type */
+int count[VtMaxLumpSize][VtMaxType];
+/* configuration filled in by initventi() */
+Config conf;
+
+enum
+{
+ /* number of ClumpInfo entries read per readclumpinfos() call */
+ ClumpChunks = 32*1024
+};
+
+/*
+ * Walk the clump directory of one arena in batches of at most
+ * ClumpChunks entries and tally each entry in count[uncsize][type].
+ * Entries whose type or size falls outside the table are reported on
+ * standard error and skipped.
+ * Returns the number of directory entries covered, or TWID32 if a
+ * directory read came back short.
+ */
+static int
+readarenainfo(Arena *arena)
+{
+	ClumpInfo *dir, *e;
+	u32int clump;
+	int i, n, failed;
+
+	if(arena->memstats.clumps)
+		fprint(2, "reading directory for arena=%s with %d entries\n", arena->name, arena->memstats.clumps);
+
+	dir = MKN(ClumpInfo, ClumpChunks);
+	failed = 0;
+	clump = 0;
+	while(clump < arena->memstats.clumps){
+		/* full batch, or whatever is left */
+		n = ClumpChunks;
+		if(arena->memstats.clumps - clump < n)
+			n = arena->memstats.clumps - clump;
+
+		i = readclumpinfos(arena, clump, dir, n);
+		if(i != n){
+			seterr(EOk, "arena directory read failed %d not %d: %r", i, n);
+			failed = 1;
+			break;
+		}
+
+		for(i = 0; i < n; i++){
+			e = &dir[i];
+			if(e->type < VtMaxType && e->uncsize < VtMaxLumpSize)
+				count[e->uncsize][e->type]++;
+			else
+				fprint(2, "bad clump: %d: type = %d: size = %d\n", clump+i, e->type, e->uncsize);
+		}
+		clump += n;
+	}
+	free(dir);
+	return failed ? TWID32 : clump;
+}
+
+/*
+ * Tally every arena of the index, then print the total clump count
+ * followed by one line per uncompressed size that occurred:
+ * size, total for that size, and the per-type counts, tab separated.
+ * Prints nothing if any arena directory could not be read.
+ */
+static void
+clumpstats(Index *ix)
+{
+	ulong clumps, n;
+	int a, size, type, total;
+
+	clumps = 0;
+	for(a = 0; a < ix->narenas; a++){
+		n = readarenainfo(ix->arenas[a]);
+		if(n == TWID32)
+			return;
+		clumps += n;
+	}
+
+	print("clumps = %ld\n", clumps);
+	for(size = 0; size < VtMaxLumpSize; size++){
+		total = 0;
+		for(type = 0; type < VtMaxType; type++)
+			total += count[size][type];
+		if(total != 0){
+			print("%d\t%d", size, total);
+			for(type = 0; type < VtMaxType; type++)
+				print("\t%d", count[size][type]);
+			print("\n");
+		}
+	}
+}
+
+
+/*
+ * Print the command synopsis and exit.  The exit string is non-nil
+ * so that callers (e.g. rc scripts) see the bad invocation as a failure.
+ */
+void
+usage(void)
+{
+	fprint(2, "usage: clumpstats [-B blockcachesize] config\n");
+	threadexitsall("usage");
+}
+
+/*
+ * clumpstats entry point: open the venti configuration named on the
+ * command line with the global readonly flag set, size the disk block
+ * cache, and print the clump size/type histogram.
+ */
+void
+threadmain(int argc, char *argv[])
+{
+	u32int bcmem;
+
+	bcmem = 0;
+
+	ARGBEGIN{
+	case 'B':
+		/* EARGF: a missing size is a usage error, not a nil argument */
+		bcmem = unittoull(EARGF(usage()));
+		break;
+	default:
+		usage();
+		break;
+	}ARGEND
+
+	readonly = 1;
+
+	if(argc != 1)
+		usage();
+
+	if(initventi(argv[0], &conf) < 0)
+		sysfatal("can't init venti: %r");
+
+	/* never run with less cache than the index layout calls for */
+	if(bcmem < maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16))
+		bcmem = maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16);
+	if(0) fprint(2, "initialize %d bytes of disk block cache\n", bcmem);
+	initdcache(bcmem);
+
+	clumpstats(mainindex);
+
+	threadexitsall(0);
+}
diff --git a/sys/src/cmd/venti/srv/cmparenas.c b/sys/src/cmd/venti/srv/cmparenas.c
new file mode 100755
index 000000000..322f16ee1
--- /dev/null
+++ b/sys/src/cmd/venti/srv/cmparenas.c
@@ -0,0 +1,317 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+/* incremented by -v; not read elsewhere in the code visible here */
+static int verbose;
+/* first and second arena partition, opened read-only by threadmain */
+static int fd;
+static int fd1;
+/* blocksize-byte scratch buffers for fd and fd1 respectively */
+static uchar *data;
+static uchar *data1;
+/* comparison chunk size; MaxIoSize unless overridden with -b */
+static int blocksize;
+/* set by -s; not read elsewhere in the code visible here */
+static int sleepms;
+
+/*
+ * Print the command synopsis and exit.  The exit string is non-nil
+ * so that callers see the bad invocation as a failure.
+ */
+void
+usage(void)
+{
+	fprint(2, "usage: cmparenas [-b blocksize] [-s ms] [-v] arenapart1 arenapart2 [name...]\n");
+	threadexitsall("usage");
+}
+
+/*
+ * Read exactly n bytes from fd at offset off into buf, retrying short
+ * reads.  Returns 0 on success and -1 on error or premature end of
+ * file (in which case the error string is set to "early eof").
+ */
+static int
+preadblock(int fd, uchar *buf, int n, vlong off)
+{
+	int done, got;
+
+	done = 0;
+	while(done < n){
+		got = pread(fd, buf+done, n-done, off+done);
+		if(got == 0)
+			werrstr("early eof");
+		if(got <= 0)
+			return -1;
+		done += got;
+	}
+	return 0;
+}
+
+/*
+ * Read exactly n bytes from the current position of fd into buf,
+ * retrying short reads.  Returns 0 on success and -1 on error or
+ * premature end of file (error string "early eof").
+ */
+static int
+readblock(int fd, uchar *buf, int n)
+{
+	int done, got;
+
+	done = 0;
+	while(done < n){
+		got = read(fd, buf+done, n-done);
+		if(got == 0)
+			werrstr("early eof");
+		if(got <= 0)
+			return -1;
+		done += got;
+	}
+	return 0;
+}
+
+/*
+ * Print the layout of the arena that starts at the current offset of
+ * fd: header, data, clump directory and tail ranges, followed by the
+ * unpacked tail block.  The file offset is restored before returning.
+ * Uses the global buffer data as scratch space.
+ * Returns 0 on success, -1 if the tail cannot be read or unpacked.
+ */
+static int
+printheader(char *name, ArenaHead *head, int fd)
+{
+	Arena tail;
+	vlong start, headend, from, to;
+	int perblock;
+
+	/* note where the arena starts, fetch its last block, then rewind */
+	start = seek(fd, 0, 1);
+	seek(fd, start + head->size - head->blocksize, 0);
+	if(readblock(fd, data, head->blocksize) < 0){
+		fprint(2, "%s: reading arena tail: %r\n", name);
+		return -1;
+	}
+	seek(fd, start, 0);
+
+	memset(&tail, 0, sizeof tail);
+	if(unpackarena(&tail, data) < 0){
+		fprint(2, "%s: unpack arena tail: %r\n", name);
+		return -1;
+	}
+	/* fields that are not stored in the tail block itself */
+	tail.blocksize = head->blocksize;
+	tail.base = start + head->blocksize;
+	tail.clumpmax = tail.blocksize / ClumpInfoSize;
+	tail.size = head->size - 2*head->blocksize;
+
+	fprint(2, "%s: base=%llx size=%llx blocksize=%x\n", name, start, head->size, head->blocksize);
+
+	headend = head->blocksize;
+	fprint(2, "\t%llx-%llx: head\n", (vlong)0, headend);
+
+	from = headend;
+	to = headend + tail.diskstats.used;
+	fprint(2, "\t%llx-%llx: data (%llx)\n", from, to, to - from);
+
+	/* the clump directory sits just below the tail block */
+	to = head->size - head->blocksize;
+	perblock = head->blocksize / ClumpInfoSize;
+	from = to;
+	if(perblock > 0)
+		from = to - (u64int)tail.diskstats.clumps/perblock * head->blocksize;
+	fprint(2, "\t%llx-%llx: clumps (%llx)\n", from, to, to - from);
+	fprint(2, "\t%llx-%llx: tail\n", to, to + head->blocksize);
+
+	fprint(2, "arena:\n");
+	printarena(2, &tail);
+	return 0;
+}
+
+/*
+ * Compare the arena called name in the two partitions.  Both fd and
+ * fd1 must already be positioned at the start of the arena; len is the
+ * length the arena table claims for it (0 if unknown).
+ *
+ * The header of each copy is read and sanity-checked, the layout of
+ * the first copy is printed, and then the two copies are compared
+ * blocksize bytes at a time over the size given by the first header.
+ * Every differing 16-byte row is printed as two lines of hex: first
+ * partition on top, second below.
+ */
+static void
+cmparena(char *name, vlong len)
+{
+	ArenaHead head, head1;
+	u64int n, e;
+	u32int bs;
+	int i, j, w;
+	char buf[20];
+
+	fprint(2, "cmp %s\n", name);
+
+	/*
+	 * read a little bit, which will include the header
+	 */
+	if(readblock(fd, data, HeadSize) < 0){
+		fprint(2, "%s: reading header: %r\n", name);
+		return;
+	}
+	if(unpackarenahead(&head, data) < 0){
+		fprint(2, "%s: corrupt arena header: %r\n", name);
+		return;
+	}
+	if(head.version != ArenaVersion4 && head.version != ArenaVersion5)
+		fprint(2, "%s: warning: unknown arena version %d\n", name, head.version);
+	if(len != 0 && len != head.size)
+		fprint(2, "%s: warning: unexpected length %lld != %lld\n", name, head.size, len);
+	if(strcmp(name, "<stdin>") != 0 && strcmp(head.name, name) != 0)
+		fprint(2, "%s: warning: unexpected name %s\n", name, head.name);
+
+	/*
+	 * same checks for the second partition; its header lives in
+	 * data1, so that is the buffer that must be unpacked
+	 */
+	if(readblock(fd1, data1, HeadSize) < 0){
+		fprint(2, "%s: reading header: %r\n", name);
+		return;
+	}
+	if(unpackarenahead(&head1, data1) < 0){
+		fprint(2, "%s: corrupt arena header: %r\n", name);
+		return;
+	}
+	if(head1.version != ArenaVersion4 && head1.version != ArenaVersion5)
+		fprint(2, "%s: warning: unknown arena version %d\n", name, head1.version);
+	if(len != 0 && len != head1.size)
+		fprint(2, "%s: warning: unexpected length %lld != %lld\n", name, head1.size, len);
+	if(strcmp(name, "<stdin>") != 0 && strcmp(head1.name, name) != 0)
+		fprint(2, "%s: warning: unexpected name %s\n", name, head1.name);
+
+	/* back up so the comparison below starts at the arena header */
+	seek(fd, -HeadSize, 1);
+	seek(fd1, -HeadSize, 1);
+
+	if(printheader(name, &head, fd) < 0)
+		return;
+
+	/*
+	 * now we know how much to read:
+	 * walk the whole arena, shortening the final chunk to fit
+	 */
+	e = head.size;
+	bs = blocksize;
+	for(n = 0; n < e; n += bs){
+		if(n + bs > e)
+			bs = e - n;
+		if(readblock(fd, data, bs) < 0){
+			fprint(2, "%s: read data: %r\n", name);
+			return;
+		}
+		if(readblock(fd1, data1, bs) < 0){
+			fprint(2, "%s: read data: %r\n", name);
+			return;
+		}
+		if(memcmp(data, data1, bs) == 0)
+			continue;
+
+		print("mismatch at %llx\n", n);
+		for(i=0; i<bs; i+=16){
+			/* never look past the bytes just read */
+			w = bs - i;
+			if(w > 16)
+				w = 16;
+			if(memcmp(data+i, data1+i, w) == 0)
+				continue;
+			snprint(buf, sizeof buf, "%llx", n+i);
+			print("%s ", buf);
+			for(j=0; j<w; j++){
+				print(" %.2ux", data[i+j]);
+				if(j == 7)
+					print(" -");
+			}
+			print("\n");
+			/* indent the second row to line up under the first */
+			print("%*s ", (int)strlen(buf), "");
+			for(j=0; j<w; j++){
+				print(" %.2ux", data1[i+j]);
+				if(j == 7)
+					print(" -");
+			}
+			print("\n");
+		}
+	}
+}
+
+/*
+ * Decide whether the arena called name was requested.
+ * s holds the n names given on the command line; with no names every
+ * arena is wanted.  A matching entry is set to nil so that entries
+ * still non-nil afterwards are the names that were never found.
+ */
+static int
+shouldcheck(char *name, char **s, int n)
+{
+	int i;
+
+	if(n == 0)
+		return 1;
+
+	i = 0;
+	while(i < n){
+		if(s[i] == nil || strcmp(name, s[i]) != 0){
+			i++;
+			continue;
+		}
+		s[i] = nil;
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * Read the arena partition header from fd into *ap and return the
+ * arena directory that follows it as a NUL-terminated string, which
+ * the caller owns.  Any failure is fatal.
+ *
+ * The header is read into a private buffer: the global data buffer is
+ * only blocksize bytes long, and blocksize can be set with -b.
+ */
+char *
+readap(int fd, ArenaPart *ap)
+{
+	enum { ApHeadRead = 8192 };	/* bytes read to cover the partition header */
+	uchar *hdr;
+	char *table;
+
+	hdr = malloc(ApHeadRead);
+	if(hdr == nil)
+		sysfatal("out of memory reading arena part header");
+	if(preadblock(fd, hdr, ApHeadRead, PartBlank) < 0)
+		sysfatal("read arena part header: %r");
+	if(unpackarenapart(ap, hdr) < 0)
+		sysfatal("corrupted arena part header: %r");
+	free(hdr);
+
+	fprint(2, "# arena part version=%d blocksize=%d arenabase=%d\n",
+		ap->version, ap->blocksize, ap->arenabase);
+
+	/* the directory starts at the first block boundary after the header */
+	ap->tabbase = (PartBlank+HeadSize+ap->blocksize-1)&~(ap->blocksize-1);
+	if(ap->arenabase < ap->tabbase)
+		sysfatal("corrupted arena part header: arena base precedes directory");
+	ap->tabsize = ap->arenabase - ap->tabbase;
+
+	table = malloc(ap->tabsize+1);
+	if(table == nil)
+		sysfatal("out of memory reading arena part directory");
+	if(preadblock(fd, (uchar*)table, ap->tabsize, ap->tabbase) < 0)
+		sysfatal("reading arena part directory: %r");
+	table[ap->tabsize] = 0;
+	return table;
+}
+
+/*
+ * cmparenas entry point.
+ * Usage: cmparenas [-b blocksize] [-s ms] [-v] arenapart1 arenapart2 [name...]
+ *
+ * Opens both arena partitions, requires their arena tables to be
+ * textually identical, and compares every arena listed in the table
+ * (or only the named ones).  Names that match no arena are reported
+ * at the end.
+ */
+void
+threadmain(int argc, char *argv[])
+{
+	int i, nline;
+	char *p, *q, *table, *table1, *f[10], line[256];
+	vlong start, stop;
+	ArenaPart ap;
+	ArenaPart ap1;
+
+	ventifmtinstall();
+	blocksize = MaxIoSize;
+	ARGBEGIN{
+	case 'b':
+		blocksize = unittoull(EARGF(usage()));
+		break;
+	case 's':
+		sleepms = atoi(EARGF(usage()));
+		break;
+	case 'v':
+		verbose++;
+		break;
+	default:
+		usage();
+		break;
+	}ARGEND
+
+	if(argc < 2)
+		usage();
+
+	data = vtmalloc(blocksize);
+	data1 = vtmalloc(blocksize);
+	if((fd = open(argv[0], OREAD)) < 0)
+		sysfatal("open %s: %r", argv[0]);
+	if((fd1 = open(argv[1], OREAD)) < 0)
+		sysfatal("open %s: %r", argv[1]);
+
+	table = readap(fd, &ap);
+	table1 = readap(fd1, &ap1);
+	if(strcmp(table, table1) != 0)
+		sysfatal("arena partitions do not have identical tables");
+
+	/* first table line is the arena count; one "name start stop" line per arena follows */
+	nline = atoi(table);
+	p = strchr(table, '\n');
+	if(p)
+		p++;
+	for(i=0; i<nline; i++){
+		if(p == nil){
+			fprint(2, "warning: unexpected arena table end\n");
+			break;
+		}
+		q = strchr(p, '\n');
+		if(q)
+			*q++ = 0;
+		if(strlen(p) >= sizeof line){
+			fprint(2, "warning: long arena table line: %s\n", p);
+			p = q;
+			continue;
+		}
+		/* tokenize a copy so p stays printable in the warning */
+		strcpy(line, p);
+		memset(f, 0, sizeof f);
+		if(tokenize(line, f, nelem(f)) < 3){
+			fprint(2, "warning: bad arena table line: %s\n", p);
+			p = q;
+			continue;
+		}
+		p = q;
+		/* arena names start after the two partition arguments */
+		if(shouldcheck(f[0], argv+2, argc-2)){
+			start = strtoull(f[1], 0, 0);
+			stop = strtoull(f[2], 0, 0);
+			if(stop <= start){
+				fprint(2, "%s: bad start,stop %lld,%lld\n", f[0], start, stop);
+				continue;
+			}
+			if(seek(fd, start, 0) < 0)
+				fprint(2, "%s: seek to start: %r\n", f[0]);
+			if(seek(fd1, start, 0) < 0)
+				fprint(2, "%s: seek to start: %r\n", f[0]);
+			cmparena(f[0], stop - start);
+		}
+	}
+	/* shouldcheck() clears each name it matches; the rest were never seen */
+	for(i=2; i<argc; i++)
+		if(argv[i] != 0)
+			fprint(2, "%s: did not find arena\n", argv[i]);
+
+	threadexitsall(nil);
+}
diff --git a/sys/src/cmd/venti/srv/conf.rc b/sys/src/cmd/venti/srv/conf.rc
new file mode 100755
index 000000000..cce980fa7
--- /dev/null
+++ b/sys/src/cmd/venti/srv/conf.rc
@@ -0,0 +1,67 @@
+#!/bin/rc
+
+# venti/conf: print the configuration stored in a venti partition,
+# or with -w install a new one (from the named file, or from standard
+# input when only the partition is given).
+#
+# the venti configuration is stored at the 248kB offset in the first index
+# partition and extends for at most 8 kB.
+
+rfork e
+fn usage {
+ echo 'usage: venti/conf [-w] /dev/sdC0/v.arenas' >[1=2]
+ exit usage
+}
+
+# option parsing: only -w is recognised; -- ends the options
+wflag=no
+while(! ~ $#* 0 && ~ $1 -* && ! ~ $1 --){
+ switch($1){
+ case -w
+ wflag=yes
+ case *
+ usage
+ }
+ shift
+}
+if(~ $1 --)
+ shift
+
+# reading takes exactly one argument; writing takes one or two
+if(~ $wflag no && ! ~ $#* 1)
+ usage
+if(~ $wflag yes && ! ~ $#* 1 2)
+ usage
+
+disk=$1
+if(! test -f $disk){
+ echo 'unknown disk' $1 >[1=2]
+ exit nodisk
+}
+
+# the cleanup is commented out, so /tmp/venticonf.$pid is left behind
+fn sigexit {
+ #rm -f /tmp/venticonf.$pid
+}
+
+if(~ $wflag yes){
+ # the stored config is the text plus a one-line 'venti config' header
+ {echo venti config; cat $2} >/tmp/venticonf.$pid || exit oops
+ if(! test -s /tmp/venticonf.$pid){
+ echo 'config is empty; will not install' >[1=2]
+ exit emptyconfig
+ }
+ if(test `{ls -l /tmp/venticonf.$pid | awk '{print $6}'} -gt 8192){
+ echo 'config is too long; max is a little less than eight kilobytes' >[1=2]
+ exit toolong
+ }
+ # keep a copy of the region about to be overwritten
+ dd -quiet 1 -bs 1024 -count 8 -if $disk -iseek 248 \
+ >/tmp/_venticonf.old || exit backup
+ # append zeros from /dev/zero to the new config before copying it in
+ dd -quiet 1 -count 2 </dev/zero >> /tmp/venticonf.$pid || exit dd
+ dd -quiet 1 -bs 1024 -count 8 -if /tmp/venticonf.$pid \
+ -of $disk -trunc 0 -oseek 248 || exit dd2
+ exit 0
+}
+
+# read mode: extract the 8 kB region, check the header line, print the rest
+dd -quiet 1 -bs 1024 -count 8 -if $disk -iseek 248 |
+ aux/zerotrunc >/tmp/venticonf.$pid
+
+if(! cmp <{sed 1q /tmp/venticonf.$pid} <{echo venti config}){
+ echo 'config has bad header' >[1=2]
+ exit badconfig
+}
+
+sed 1d /tmp/venticonf.$pid
+exit ''
diff --git a/sys/src/cmd/venti/srv/config.c b/sys/src/cmd/venti/srv/config.c
new file mode 100755
index 000000000..ba4daba1a
--- /dev/null
+++ b/sys/src/cmd/venti/srv/config.c
@@ -0,0 +1,253 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+/* the index set up by initventi(); nil until initventi succeeds */
+Index *mainindex;
+int paranoid = 1; /* should verify hashes on disk read */
+
+/* per-directive helpers used by runconfig(); each opens the named partition */
+static ArenaPart *configarenas(char *file);
+static ISect *configisect(char *file);
+static Bloom *configbloom(char *file);
+
+/*
+ * Initialize statistics, parse the configuration in file into *conf,
+ * and build the global mainindex from it, attaching the configured
+ * bloom filter (if any).  Returns 0 on success, -1 with the error
+ * string set on failure.
+ */
+int
+initventi(char *file, Config *conf)
+{
+	Index *ix;
+
+	statsinit();
+
+	if(file == nil){
+		seterr(EOk, "no configuration file");
+		return -1;
+	}
+	if(runconfig(file, conf) < 0){
+		seterr(EOk, "can't initialize venti: %r");
+		return -1;
+	}
+
+	ix = initindex(conf->index, conf->sects, conf->nsects);
+	mainindex = ix;
+	if(ix == nil)
+		return -1;
+	ix->bloom = conf->bloom;
+	return 0;
+}
+
+/*
+ * Check that s is a size: a number in any base strtoull accepts with
+ * base 0, optionally followed by exactly one of the unit letters
+ * K, M or G (either case).
+ * Returns 0 if s is well formed, -1 otherwise.
+ */
+static int
+numok(char *s)
+{
+	char *p;
+
+	strtoull(s, &p, 0);
+	if(p == s)
+		return -1;	/* no digits at all */
+	if(*p == 0)
+		return 0;	/* bare number */
+	if(p[1] == 0 && strchr("MmGgKk", *p))
+		return 0;	/* number plus a single unit letter */
+	return -1;		/* trailing junk */
+}
+
+/*
+ * configs :
+ * | configs config
+ * config : "isect" filename
+ * | "arenas" filename
+ * | "bloom" filename
+ * | "index" name
+ * | "bcmem" num
+ * | "mem" num
+ * | "icmem" num
+ * | "queuewrites"
+ * | "httpaddr" address
+ * | "webroot" directory
+ * | "addr" address
+ *
+ * '#' and \n delimit comments
+ */
+enum
+{
+ /* the most fields any directive takes: keyword plus one argument */
+ MaxArgs = 2
+};
+/*
+ * Parse the configuration in file into *config.
+ *
+ * config is zeroed first (mem is set to Unspecified), then each line
+ * returned by ifileline() is split into at most MaxArgs+1 fields and
+ * matched against the known directives.  isect and arenas may repeat
+ * and are appended to growing arrays; every other directive with an
+ * argument may appear only once.
+ *
+ * Returns 0 when every line was accepted.  On the first bad line the
+ * error string is set and -1 is returned after freeing the sects and
+ * aparts arrays (only the arrays themselves; other fields already
+ * stored in config are left as they are).
+ */
+int
+runconfig(char *file, Config *config)
+{
+ ArenaPart **av;
+ ISect **sv;
+ IFile f;
+ char *s, *line, *flds[MaxArgs + 1];
+ int i, ok;
+
+ if(readifile(&f, file) < 0)
+ return -1;
+ memset(config, 0, sizeof *config);
+ config->mem = Unspecified;
+ /* ok stays -1 unless the loop ends by running out of lines */
+ ok = -1;
+ line = nil;
+ for(;;){
+ s = ifileline(&f);
+ if(s == nil){
+ ok = 0;
+ break;
+ }
+ /* getfields splits s in place; keep a pristine copy for error messages */
+ line = estrdup(s);
+ i = getfields(s, flds, MaxArgs + 1, 1, " \t\r");
+ if(i == 2 && strcmp(flds[0], "isect") == 0){
+ /* grow the section array by one (i is reused as the copy index) */
+ sv = MKN(ISect*, config->nsects + 1);
+ for(i = 0; i < config->nsects; i++)
+ sv[i] = config->sects[i];
+ free(config->sects);
+ config->sects = sv;
+ config->sects[config->nsects] = configisect(flds[1]);
+ if(config->sects[config->nsects] == nil)
+ break;
+ config->nsects++;
+ }else if(i == 2 && strcmp(flds[0], "arenas") == 0){
+ /* grow the arena partition array by one */
+ av = MKN(ArenaPart*, config->naparts + 1);
+ for(i = 0; i < config->naparts; i++)
+ av[i] = config->aparts[i];
+ free(config->aparts);
+ config->aparts = av;
+ config->aparts[config->naparts] = configarenas(flds[1]);
+ if(config->aparts[config->naparts] == nil)
+ break;
+ config->naparts++;
+ }else if(i == 2 && strcmp(flds[0], "bloom") == 0){
+ if(config->bloom){
+ seterr(EAdmin, "duplicate bloom lines in configuration file %s", file);
+ break;
+ }
+ if((config->bloom = configbloom(flds[1])) == nil)
+ break;
+ }else if(i == 2 && strcmp(flds[0], "index") == 0){
+ if(nameok(flds[1]) < 0){
+ seterr(EAdmin, "illegal index name %s in config file %s", flds[1], file);
+ break;
+ }
+ if(config->index != nil){
+ seterr(EAdmin, "duplicate indices in config file %s", file);
+ break;
+ }
+ config->index = estrdup(flds[1]);
+ }else if(i == 2 && strcmp(flds[0], "bcmem") == 0){
+ if(numok(flds[1]) < 0){
+ seterr(EAdmin, "illegal size %s in config file %s",
+ flds[1], file);
+ break;
+ }
+ /* a value of 0 is indistinguishable from "not yet set" */
+ if(config->bcmem != 0){
+ seterr(EAdmin, "duplicate bcmem lines in config file %s", file);
+ break;
+ }
+ config->bcmem = unittoull(flds[1]);
+ }else if(i == 2 && strcmp(flds[0], "mem") == 0){
+ if(numok(flds[1]) < 0){
+ seterr(EAdmin, "illegal size %s in config file %s",
+ flds[1], file);
+ break;
+ }
+ if(config->mem != Unspecified){
+ seterr(EAdmin, "duplicate mem lines in config file %s", file);
+ break;
+ }
+ config->mem = unittoull(flds[1]);
+ }else if(i == 2 && strcmp(flds[0], "icmem") == 0){
+ if(numok(flds[1]) < 0){
+ seterr(EAdmin, "illegal size %s in config file %s",
+ flds[1], file);
+ break;
+ }
+ /* a value of 0 is indistinguishable from "not yet set" */
+ if(config->icmem != 0){
+ seterr(EAdmin, "duplicate icmem lines in config file %s", file);
+ break;
+ }
+ config->icmem = unittoull(flds[1]);
+ }else if(i == 1 && strcmp(flds[0], "queuewrites") == 0){
+ config->queuewrites = 1;
+ }else if(i == 2 && strcmp(flds[0], "httpaddr") == 0){
+ if(config->haddr){
+ seterr(EAdmin, "duplicate httpaddr lines in configuration file %s", file);
+ break;
+ }
+ config->haddr = estrdup(flds[1]);
+ }else if(i == 2 && strcmp(flds[0], "webroot") == 0){
+ if(config->webroot){
+ seterr(EAdmin, "duplicate webroot lines in configuration file %s", file);
+ break;
+ }
+ config->webroot = estrdup(flds[1]);
+ }else if(i == 2 && strcmp(flds[0], "addr") == 0){
+ if(config->vaddr){
+ seterr(EAdmin, "duplicate addr lines in configuration file %s", file);
+ break;
+ }
+ config->vaddr = estrdup(flds[1]);
+ }else{
+ seterr(EAdmin, "illegal line '%s' in configuration file %s", line, file);
+ break;
+ }
+ free(line);
+ line = nil;
+ }
+ /* line is still set only when the loop was left through an error break */
+ free(line);
+ freeifile(&f);
+ if(ok < 0){
+ free(config->sects);
+ config->sects = nil;
+ free(config->aparts);
+ config->aparts = nil;
+ }
+ return ok;
+}
+
+/*
+ * Open the partition named by file and initialize an index section
+ * on it.  Returns nil on failure; if initisect() fails the error
+ * string is prefixed with the file name.
+ */
+static ISect*
+configisect(char *file)
+{
+	Part *part;
+	ISect *is;
+
+	if(0) fprint(2, "configure index section in %s\n", file);
+
+	if((part = initpart(file, ORDWR|ODIRECT)) == nil)
+		return nil;
+	if((is = initisect(part)) != nil)
+		return is;
+	werrstr("%s: %r", file);
+	return nil;
+}
+
+/*
+ * Open the partition named by file and initialize an arena partition
+ * on it.  Returns nil on failure; if initarenapart() fails the error
+ * string is prefixed with the file name.
+ */
+static ArenaPart*
+configarenas(char *file)
+{
+	ArenaPart *ap;
+	Part *part;
+
+	if(0) fprint(2, "configure arenas in %s\n", file);
+
+	if((part = initpart(file, ORDWR|ODIRECT)) == nil)
+		return nil;
+	if((ap = initarenapart(part)) != nil)
+		return ap;
+	werrstr("%s: %r", file);
+	return nil;
+}
+
+/*
+ * Open the partition named by file and read the bloom filter header
+ * from it.  Returns nil on failure; if readbloom() fails the error
+ * string is prefixed with the file name and the partition is released.
+ */
+static Bloom*
+configbloom(char *file)
+{
+	Bloom *b;
+	Part *part;
+
+	if(0) fprint(2, "configure bloom in %s\n", file);
+
+	if((part = initpart(file, ORDWR|ODIRECT)) == nil)
+		return nil;
+	if((b = readbloom(part)) != nil)
+		return b;
+	werrstr("%s: %r", file);
+	freepart(part);
+	return nil;
+}
+
+/*
+ * for OS X linker, which only resolves functions, not data.
+ * Deliberately empty: the function exists only as a symbol to reference.
+ */
+void
+needmainindex(void)
+{
+}
+
diff --git a/sys/src/cmd/venti/srv/conv.c b/sys/src/cmd/venti/srv/conv.c
new file mode 100755
index 000000000..e6a6cbfe1
--- /dev/null
+++ b/sys/src/cmd/venti/srv/conv.c
@@ -0,0 +1,730 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * disk structure conversion routines
+ */
+#define U8GET(p) ((p)[0])
+#define U16GET(p) (((p)[0]<<8)|(p)[1])
+#define U32GET(p) ((u32int)(((p)[0]<<24)|((p)[1]<<16)|((p)[2]<<8)|(p)[3]))
+#define U64GET(p) (((u64int)U32GET(p)<<32)|(u64int)U32GET((p)+4))
+
+#define U8PUT(p,v) (p)[0]=(v)&0xFF
+#define U16PUT(p,v) (p)[0]=((v)>>8)&0xFF;(p)[1]=(v)&0xFF
+#define U32PUT(p,v) (p)[0]=((v)>>24)&0xFF;(p)[1]=((v)>>16)&0xFF;(p)[2]=((v)>>8)&0xFF;(p)[3]=(v)&0xFF
+#define U64PUT(p,v,t32) t32=(v)>>32;U32PUT(p,t32);t32=(v);U32PUT((p)+4,t32)
+
+int debugarena = -1; /* hack to improve error reporting */
+
+static struct {
+ u32int m;
+ char *s;
+} magics[] = {
+ ArenaPartMagic, "ArenaPartMagic",
+ ArenaHeadMagic, "ArenaHeadMagic",
+ ArenaMagic, "ArenaMagic",
+ ISectMagic, "ISectMagic",
+ BloomMagic, "BloomMagic",
+};
+
+/*
+ * Return a printable form of magic number m: the symbolic name if m
+ * is listed in magics[], otherwise its hex value formatted into the
+ * caller-supplied buffer s (which is then returned).
+ */
+static char*
+fmtmagic(char *s, u32int m)
+{
+	int i;
+
+	i = 0;
+	while(i < nelem(magics) && magics[i].m != m)
+		i++;
+	if(i < nelem(magics))
+		return magics[i].s;
+	sprint(s, "%#08ux", m);
+	return s;
+}
+
+/* decode the big-endian 32-bit value held in the first four bytes of buf */
+u32int
+unpackmagic(u8int *buf)
+{
+ return U32GET(buf);
+}
+
+/* store magic big-endian into the first four bytes of buf */
+void
+packmagic(u32int magic, u8int *buf)
+{
+ U32PUT(buf, magic);
+}
+
+/*
+ * Decode an arena partition header from buf into *ap:
+ * magic, version, blocksize and arenabase, each a big-endian u32int.
+ * Returns -1 with the error string set if the magic number is wrong
+ * (ap is left untouched in that case), 0 otherwise.
+ */
+int
+unpackarenapart(ArenaPart *ap, u8int *buf)
+{
+	u8int *p;
+	u32int m;
+	char fbuf[20];
+
+	p = buf;
+	m = U32GET(p);
+	if(m != ArenaPartMagic){
+		seterr(ECorrupt, "arena set has wrong magic number: %s expected ArenaPartMagic (%#lux)", fmtmagic(fbuf, m), ArenaPartMagic);
+		return -1;
+	}
+
+	ap->version = U32GET(p + U32Size);
+	ap->blocksize = U32GET(p + 2*U32Size);
+	ap->arenabase = U32GET(p + 3*U32Size);
+	p += 4*U32Size;
+
+	/* the fields above must account for the whole header */
+	if(p != buf + ArenaPartSize)
+		sysfatal("unpackarenapart unpacked wrong amount");
+
+	return 0;
+}
+
+/*
+ * Encode *ap into buf as an arena partition header:
+ * ArenaPartMagic, version, blocksize and arenabase, each a big-endian
+ * u32int.  Always returns 0.
+ */
+int
+packarenapart(ArenaPart *ap, u8int *buf)
+{
+	u8int *p;
+
+	p = buf;
+	U32PUT(p, ArenaPartMagic);
+	U32PUT(p + U32Size, ap->version);
+	U32PUT(p + 2*U32Size, ap->blocksize);
+	U32PUT(p + 3*U32Size, ap->arenabase);
+	p += 4*U32Size;
+
+	/* the fields above must account for the whole header */
+	if(p != buf + ArenaPartSize)
+		sysfatal("packarenapart packed wrong amount");
+
+	return 0;
+}
+
+/*
+ * Decode an arena tail block from buf into *arena.
+ *
+ * Fills in version, name, diskstats, ctime, wtime and clumpmagic
+ * (read from disk for version 5, set to _ClumpMagic for version 4),
+ * then memstats: from the extension fields when the byte after the
+ * base record is 1, otherwise copied from diskstats.
+ * Returns -1 with the error string set on a bad magic number or an
+ * unknown version, 0 otherwise.  Fields read before the version check
+ * have already been stored in *arena when the version is rejected.
+ */
+int
+unpackarena(Arena *arena, u8int *buf)
+{
+ int sz;
+ u8int *p;
+ u32int m;
+ char fbuf[20];
+
+ p = buf;
+
+ m = U32GET(p);
+ if(m != ArenaMagic){
+ seterr(ECorrupt, "arena %d has wrong magic number: %s "
+ "expected ArenaMagic (%#lux)", debugarena,
+ fmtmagic(fbuf, m), ArenaMagic);
+ return -1;
+ }
+ p += U32Size;
+ arena->version = U32GET(p);
+ p += U32Size;
+ namecp(arena->name, (char*)p);
+ p += ANameSize;
+ arena->diskstats.clumps = U32GET(p);
+ p += U32Size;
+ arena->diskstats.cclumps = U32GET(p);
+ p += U32Size;
+ arena->ctime = U32GET(p);
+ p += U32Size;
+ arena->wtime = U32GET(p);
+ p += U32Size;
+ /* only version 5 stores a per-arena clump magic */
+ if(arena->version == ArenaVersion5){
+ arena->clumpmagic = U32GET(p);
+ p += U32Size;
+ }
+ arena->diskstats.used = U64GET(p);
+ p += U64Size;
+ arena->diskstats.uncsize = U64GET(p);
+ p += U64Size;
+ arena->diskstats.sealed = U8GET(p);
+ p += U8Size;
+ /* sz is the expected length of the base record, checked at the end */
+ switch(arena->version){
+ case ArenaVersion4:
+ sz = ArenaSize4;
+ arena->clumpmagic = _ClumpMagic;
+ break;
+ case ArenaVersion5:
+ sz = ArenaSize5;
+ break;
+ default:
+ seterr(ECorrupt, "arena has bad version number %d", arena->version);
+ return -1;
+ }
+ /*
+ * Additional fields for the memstats version of the stats.
+ * Diskstats reflects what is committed to the index.
+ * Memstats reflects what is in the arena. Originally intended
+ * this to be a version 5 extension, but might as well use for
+ * all the existing version 4 arenas too.
+ *
+ * To maintain backwards compatibility with existing venti
+ * installations using the older format, we define that if
+ * memstats == diskstats, then the extension fields are not
+ * included (see packarena below). That is, only partially
+ * indexed arenas have these fields. Fully indexed arenas
+ * (in particular, sealed arenas) do not.
+ */
+ if(U8GET(p) == 1){
+ sz += ArenaSize5a-ArenaSize5;
+ p += U8Size;
+ arena->memstats.clumps = U32GET(p);
+ p += U32Size;
+ arena->memstats.cclumps = U32GET(p);
+ p += U32Size;
+ arena->memstats.used = U64GET(p);
+ p += U64Size;
+ arena->memstats.uncsize = U64GET(p);
+ p += U64Size;
+ arena->memstats.sealed = U8GET(p);
+ p += U8Size;
+
+ /*
+ * 2008/4/2
+ * Packarena (below) used to have a bug in which it would
+ * not zero out any existing extension fields when writing
+ * the arena metadata. This would manifest itself as arenas
+ * with arena->diskstats.sealed == 1 but arena->memstats.sealed == 0
+ * after a server restart. Because arena->memstats.sealed wouldn't
+ * be set, the server might try to fit another block into the arena
+ * (and succeed), violating the append-only structure of the log
+ * and invalidating any already-computed seal on the arena.
+ *
+ * It might end up that other fields in arena->memstats end up
+ * behind arena->diskstats too, but that would be considerably
+ * more rare, and the bug is fixed now. The case we need to
+ * handle is just the sealed mismatch.
+ *
+ * If we encounter such a bogus arena, fix the sealed field.
+ */
+ if(arena->diskstats.sealed)
+ arena->memstats.sealed = 1;
+ }else
+ arena->memstats = arena->diskstats;
+ /* internal consistency check on the field sizes, not on the data */
+ if(buf + sz != p)
+ sysfatal("unpackarena unpacked wrong amount");
+
+ return 0;
+}
+
+/*
+ * Encode *arena into buf; the memstats extension fields are written
+ * only when _packarena() finds that memstats differs from diskstats.
+ */
+int
+packarena(Arena *arena, u8int *buf)
+{
+ return _packarena(arena, buf, 0);
+}
+
+/*
+ * Encode *arena into buf as an arena tail block.
+ *
+ * The base record is written according to arena->version; an unknown
+ * version is fatal.  The memstats extension fields follow when
+ * forceext is set or when any memstats field differs from diskstats;
+ * otherwise the same number of bytes is written as zeros.
+ * Returns 0.
+ */
+int
+_packarena(Arena *arena, u8int *buf, int forceext)
+{
+ int sz;
+ u8int *p;
+ u32int t32;
+
+ switch(arena->version){
+ case ArenaVersion4:
+ sz = ArenaSize4;
+ /* version 4 has no field for the clump magic, so a custom one cannot be stored */
+ if(arena->clumpmagic != _ClumpMagic)
+ fprint(2, "warning: writing old arena tail loses clump magic 0x%lux != 0x%lux\n",
+ (ulong)arena->clumpmagic, (ulong)_ClumpMagic);
+ break;
+ case ArenaVersion5:
+ sz = ArenaSize5;
+ break;
+ default:
+ sysfatal("packarena unknown version %d", arena->version);
+ return -1;
+ }
+
+ p = buf;
+
+ U32PUT(p, ArenaMagic);
+ p += U32Size;
+ U32PUT(p, arena->version);
+ p += U32Size;
+ namecp((char*)p, arena->name);
+ p += ANameSize;
+ U32PUT(p, arena->diskstats.clumps);
+ p += U32Size;
+ U32PUT(p, arena->diskstats.cclumps);
+ p += U32Size;
+ U32PUT(p, arena->ctime);
+ p += U32Size;
+ U32PUT(p, arena->wtime);
+ p += U32Size;
+ if(arena->version == ArenaVersion5){
+ U32PUT(p, arena->clumpmagic);
+ p += U32Size;
+ }
+ /* t32 is scratch space required by U64PUT */
+ U64PUT(p, arena->diskstats.used, t32);
+ p += U64Size;
+ U64PUT(p, arena->diskstats.uncsize, t32);
+ p += U64Size;
+ U8PUT(p, arena->diskstats.sealed);
+ p += U8Size;
+
+ /*
+ * Extension fields; see above.
+ */
+ if(forceext
+ || arena->memstats.clumps != arena->diskstats.clumps
+ || arena->memstats.cclumps != arena->diskstats.cclumps
+ || arena->memstats.used != arena->diskstats.used
+ || arena->memstats.uncsize != arena->diskstats.uncsize
+ || arena->memstats.sealed != arena->diskstats.sealed){
+ sz += ArenaSize5a - ArenaSize5;
+ /* marker byte 1 tells unpackarena that extension fields follow */
+ U8PUT(p, 1);
+ p += U8Size;
+ U32PUT(p, arena->memstats.clumps);
+ p += U32Size;
+ U32PUT(p, arena->memstats.cclumps);
+ p += U32Size;
+ U64PUT(p, arena->memstats.used, t32);
+ p += U64Size;
+ U64PUT(p, arena->memstats.uncsize, t32);
+ p += U64Size;
+ U8PUT(p, arena->memstats.sealed);
+ p += U8Size;
+ }else{
+ /* Clear any extension fields already on disk. */
+ memset(p, 0, ArenaSize5a - ArenaSize5);
+ p += ArenaSize5a - ArenaSize5;
+ sz += ArenaSize5a - ArenaSize5;
+ }
+
+ /* internal consistency check on the field sizes, not on the data */
+ if(buf + sz != p)
+ sysfatal("packarena packed wrong amount");
+
+ return 0;
+}
+
+/*
+ * Decode an arena header block from buf into *head:
+ * magic, version, name, blocksize, size and, for version 5, the clump
+ * magic (version 4 headers get _ClumpMagic).
+ * Returns -1 with the error string set on a bad magic number or an
+ * unknown version, 0 otherwise.
+ */
+int
+unpackarenahead(ArenaHead *head, u8int *buf)
+{
+	u8int *p;
+	u32int m;
+	int sz;
+	char fbuf[20];
+
+	m = U32GET(buf);
+	if(m != ArenaHeadMagic){
+		seterr(ECorrupt, "arena %d head has wrong magic number: %s "
+			"expected ArenaHeadMagic (%#lux)", debugarena,
+			fmtmagic(fbuf, m), ArenaHeadMagic);
+		return -1;
+	}
+	p = buf + U32Size;
+
+	head->version = U32GET(p);
+	p += U32Size;
+	namecp(head->name, (char*)p);
+	p += ANameSize;
+	head->blocksize = U32GET(p);
+	p += U32Size;
+	head->size = U64GET(p);
+	p += U64Size;
+
+	/* the clump magic is stored on disk only from version 5 on */
+	switch(head->version){
+	case ArenaVersion4:
+		head->clumpmagic = _ClumpMagic;
+		sz = ArenaHeadSize4;
+		break;
+	case ArenaVersion5:
+		head->clumpmagic = U32GET(p);
+		p += U32Size;
+		sz = ArenaHeadSize5;
+		break;
+	default:
+		seterr(ECorrupt, "arena head has unexpected version %d", head->version);
+		return -1;
+	}
+
+	if(p != buf + sz)
+		sysfatal("unpackarenahead unpacked wrong amount");
+
+	return 0;
+}
+
+/*
+ * Encode *head into buf as an arena header block.
+ * An unknown version is fatal.  A version 4 header has no room for
+ * the clump magic, so a warning is printed if head carries one other
+ * than _ClumpMagic.  Returns 0.
+ */
+int
+packarenahead(ArenaHead *head, u8int *buf)
+{
+	u8int *w;
+	int want;
+	u32int t32;
+
+	switch(head->version){
+	case ArenaVersion4:
+		want = ArenaHeadSize4;
+		if(head->clumpmagic != _ClumpMagic)
+			fprint(2, "warning: writing old arena header loses clump magic 0x%lux != 0x%lux\n",
+				(ulong)head->clumpmagic, (ulong)_ClumpMagic);
+		break;
+	case ArenaVersion5:
+		want = ArenaHeadSize5;
+		break;
+	default:
+		sysfatal("packarenahead unknown version %d", head->version);
+		return -1;
+	}
+
+	w = buf;
+	U32PUT(w, ArenaHeadMagic);
+	w += U32Size;
+	U32PUT(w, head->version);
+	w += U32Size;
+	namecp((char*)w, head->name);
+	w += ANameSize;
+	U32PUT(w, head->blocksize);
+	w += U32Size;
+	U64PUT(w, head->size, t32);
+	w += U64Size;
+	if(head->version == ArenaVersion5){
+		U32PUT(w, head->clumpmagic);
+		w += U32Size;
+	}
+
+	if(w != buf + want)
+		sysfatal("packarenahead packed wrong amount");
+
+	return 0;
+}
+
+/*
+ * Check that a clump header's size fields are consistent with its encoding.
+ *
+ * ClumpENone requires size == uncsize; ClumpECompress requires
+ * size < uncsize; any other encoding is rejected.
+ * Returns 0 when consistent, otherwise -1 after seterr(ECorrupt, ...).
+ */
+static int
+checkclump(Clump *w)
+{
+	switch(w->encoding){
+	case ClumpENone:
+		/* stored verbatim: both sizes must agree */
+		if(w->info.size == w->info.uncsize)
+			return 0;
+		seterr(ECorrupt, "uncompressed wad size mismatch");
+		return -1;
+	case ClumpECompress:
+		/* the stored form must be strictly smaller */
+		if(w->info.size < w->info.uncsize)
+			return 0;
+		seterr(ECorrupt, "compressed lump has inconsistent block sizes %d %d", w->info.size, w->info.uncsize);
+		return -1;
+	}
+
+	seterr(ECorrupt, "clump has illegal encoding");
+	return -1;
+}
+
+/*
+ * Decode a clump header from buf into c.
+ *
+ * The leading magic must equal cmagic; otherwise -1 is returned after
+ * seterr(ECorrupt, ...) and c is left untouched.  The remaining fields
+ * follow in disk order: type, size, uncsize, score, encoding, creator,
+ * time.  The on-disk type is converted with vtfromdisktype.
+ *
+ * Returns the result of checkclump on the decoded header.  Calls
+ * sysfatal if the bytes consumed do not add up to ClumpSize.
+ */
+int
+unpackclump(Clump *c, u8int *buf, u32int cmagic)
+{
+	int off;
+	u32int got;
+
+	got = U32GET(buf);
+	if(got != cmagic){
+		seterr(ECorrupt, "clump has bad magic number=%#8.8ux != %#8.8ux", got, cmagic);
+		return -1;
+	}
+	off = U32Size;
+
+	c->info.type = vtfromdisktype(U8GET(buf + off));
+	off += U8Size;
+	c->info.size = U16GET(buf + off);
+	off += U16Size;
+	c->info.uncsize = U16GET(buf + off);
+	off += U16Size;
+	scorecp(c->info.score, buf + off);
+	off += VtScoreSize;
+
+	c->encoding = U8GET(buf + off);
+	off += U8Size;
+	c->creator = U32GET(buf + off);
+	off += U32Size;
+	c->time = U32GET(buf + off);
+	off += U32Size;
+
+	if(off != ClumpSize)
+		sysfatal("unpackclump unpacked wrong amount");
+
+	return checkclump(c);
+}
+
+/*
+ * Serialize the clump header c into buf, prefixed by magic.
+ *
+ * Disk order: magic, type (converted with vttodisktype), size, uncsize,
+ * score, encoding, creator, time.  The header is written before it is
+ * validated; the return value is that of checkclump(c).  Calls sysfatal
+ * if the bytes written do not add up to ClumpSize.
+ */
+int
+packclump(Clump *c, u8int *buf, u32int magic)
+{
+	int off;
+
+	off = 0;
+	U32PUT(buf + off, magic);
+	off += U32Size;
+
+	U8PUT(buf + off, vttodisktype(c->info.type));
+	off += U8Size;
+	U16PUT(buf + off, c->info.size);
+	off += U16Size;
+	U16PUT(buf + off, c->info.uncsize);
+	off += U16Size;
+	scorecp(buf + off, c->info.score);
+	off += VtScoreSize;
+
+	U8PUT(buf + off, c->encoding);
+	off += U8Size;
+	U32PUT(buf + off, c->creator);
+	off += U32Size;
+	U32PUT(buf + off, c->time);
+	off += U32Size;
+
+	if(off != ClumpSize)
+		sysfatal("packclump packed wrong amount");
+
+	return checkclump(c);
+}
+
+/*
+ * Decode a ClumpInfo record from buf into ci.
+ * Disk order: type (converted with vtfromdisktype), size, uncsize, score.
+ * Calls sysfatal if the bytes consumed do not add up to ClumpInfoSize.
+ */
+void
+unpackclumpinfo(ClumpInfo *ci, u8int *buf)
+{
+	int off;
+
+	off = 0;
+	ci->type = vtfromdisktype(U8GET(buf + off));
+	off += U8Size;
+	ci->size = U16GET(buf + off);
+	off += U16Size;
+	ci->uncsize = U16GET(buf + off);
+	off += U16Size;
+	scorecp(ci->score, buf + off);
+	off += VtScoreSize;
+
+	if(off != ClumpInfoSize)
+		sysfatal("unpackclumpinfo unpacked wrong amount");
+}
+
+/*
+ * Serialize the ClumpInfo record ci into buf.
+ * Disk order: type (converted with vttodisktype), size, uncsize, score.
+ * Calls sysfatal if the bytes written do not add up to ClumpInfoSize.
+ */
+void
+packclumpinfo(ClumpInfo *ci, u8int *buf)
+{
+	int off;
+
+	off = 0;
+	U8PUT(buf + off, vttodisktype(ci->type));
+	off += U8Size;
+	U16PUT(buf + off, ci->size);
+	off += U16Size;
+	U16PUT(buf + off, ci->uncsize);
+	off += U16Size;
+	scorecp(buf + off, ci->score);
+	off += VtScoreSize;
+
+	if(off != ClumpInfoSize)
+		sysfatal("packclumpinfo packed wrong amount");
+}
+
+/*
+ * Decode an index section header from buf into is.
+ *
+ * Disk order: magic, version, name, index, blocksize, blockbase, blocks,
+ * start, stop.  A version 2 header additionally stores the bucket magic;
+ * for any other version is->bucketmagic is set to 0.
+ *
+ * Returns 0 on success, or -1 after seterr(ECorrupt, ...) when the magic
+ * number is wrong.  The version itself is not validated here.  Calls
+ * sysfatal if the bytes consumed disagree with ISectSize1/ISectSize2.
+ */
+int
+unpackisect(ISect *is, u8int *buf)
+{
+	u8int *p;
+	u32int m;
+	char fbuf[20];
+
+	p = buf;
+
+	m = U32GET(p);
+	if(m != ISectMagic){
+		/* %lux takes a ulong; cast the constant as unpackbloomhead does */
+		seterr(ECorrupt, "index section has wrong magic number: %s expected ISectMagic (%#lux)",
+			fmtmagic(fbuf, m), (ulong)ISectMagic);
+		return -1;
+	}
+	p += U32Size;
+	is->version = U32GET(p);
+	p += U32Size;
+	namecp(is->name, (char*)p);
+	p += ANameSize;
+	namecp(is->index, (char*)p);
+	p += ANameSize;
+	is->blocksize = U32GET(p);
+	p += U32Size;
+	is->blockbase = U32GET(p);
+	p += U32Size;
+	is->blocks = U32GET(p);
+	p += U32Size;
+	is->start = U32GET(p);
+	p += U32Size;
+	is->stop = U32GET(p);
+	p += U32Size;
+	if(buf + ISectSize1 != p)
+		sysfatal("unpackisect unpacked wrong amount");
+	is->bucketmagic = 0;
+	if(is->version == ISectVersion2){
+		is->bucketmagic = U32GET(p);
+		p += U32Size;
+		if(buf + ISectSize2 != p)
+			sysfatal("unpackisect unpacked wrong amount");
+	}
+
+	return 0;
+}
+
+/*
+ * Serialize the index section header is into buf.
+ *
+ * Disk order: magic, version, name, index, blocksize, blockbase, blocks,
+ * start, stop, and for version 2 only, the bucket magic.
+ * Always returns 0.  Calls sysfatal if the bytes written disagree with
+ * ISectSize1 (and ISectSize2 for version 2).
+ */
+int
+packisect(ISect *is, u8int *buf)
+{
+	int off;
+
+	off = 0;
+	U32PUT(buf + off, ISectMagic);
+	off += U32Size;
+	U32PUT(buf + off, is->version);
+	off += U32Size;
+	namecp((char*)(buf + off), is->name);
+	off += ANameSize;
+	namecp((char*)(buf + off), is->index);
+	off += ANameSize;
+	U32PUT(buf + off, is->blocksize);
+	off += U32Size;
+	U32PUT(buf + off, is->blockbase);
+	off += U32Size;
+	U32PUT(buf + off, is->blocks);
+	off += U32Size;
+	U32PUT(buf + off, is->start);
+	off += U32Size;
+	U32PUT(buf + off, is->stop);
+	off += U32Size;
+	if(off != ISectSize1)
+		sysfatal("packisect packed wrong amount");
+
+	if(is->version == ISectVersion2){
+		U32PUT(buf + off, is->bucketmagic);
+		off += U32Size;
+		if(off != ISectSize2)
+			sysfatal("packisect packed wrong amount");
+	}
+
+	return 0;
+}
+
+/*
+ * Decode a packed index entry from buf into ie.
+ *
+ * Disk order: score, wtime (U32), train (U16), addr (U64), size (U16),
+ * type (U8, converted with vtfromdisktype), blocks (U8).  wtime and
+ * train are skipped: they are not stored in the in-memory IEntry.
+ *
+ * Calls sysfatal if the running offset disagrees with IEntryAddrOff,
+ * IEntryTypeOff or IEntrySize.
+ */
+void
+unpackientry(IEntry *ie, u8int *buf)
+{
+	u8int *p;
+
+	p = buf;
+
+	scorecp(ie->score, p);
+	p += VtScoreSize;
+	/* wtime: present on disk, not kept in memory */
+	p += U32Size;
+	/* train: present on disk, not kept in memory */
+	p += U16Size;
+	if(p - buf != IEntryAddrOff)
+		sysfatal("unpackientry bad IEntryAddrOff amount");
+	ie->ia.addr = U64GET(p);
+	/* diagnostic: report any address that has one of its top 8 bits set */
+	if(ie->ia.addr>>56)
+		print("%.8H => %llux\n", p, ie->ia.addr);
+	p += U64Size;
+	ie->ia.size = U16GET(p);
+	p += U16Size;
+	if(p - buf != IEntryTypeOff)
+		sysfatal("unpackientry bad IEntryTypeOff amount");
+	ie->ia.type = vtfromdisktype(U8GET(p));
+	p += U8Size;
+	ie->ia.blocks = U8GET(p);
+	p += U8Size;
+
+	if(p - buf != IEntrySize)
+		sysfatal("unpackientry unpacked wrong amount");
+}
+
+/*
+ * Serialize the index entry ie into buf.
+ *
+ * Disk order: score, wtime (U32, written as 0), train (U16, written
+ * as 0), addr (U64), size (U16), type (U8, converted with
+ * vttodisktype), blocks (U8).
+ * Calls sysfatal if the bytes written do not add up to IEntrySize.
+ */
+void
+packientry(IEntry *ie, u8int *buf)
+{
+	int off;
+	u32int scratch;		/* temporary required by U64PUT */
+
+	off = 0;
+	scorecp(buf + off, ie->score);
+	off += VtScoreSize;
+	U32PUT(buf + off, 0);	/* wtime */
+	off += U32Size;
+	U16PUT(buf + off, 0);	/* train */
+	off += U16Size;
+	U64PUT(buf + off, ie->ia.addr, scratch);
+	off += U64Size;
+	U16PUT(buf + off, ie->ia.size);
+	off += U16Size;
+	U8PUT(buf + off, vttodisktype(ie->ia.type));
+	off += U8Size;
+	U8PUT(buf + off, ie->ia.blocks);
+	off += U8Size;
+
+	if(off != IEntrySize)
+		sysfatal("packientry packed wrong amount");
+}
+
+/*
+ * Decode the bucket header at buf into b.
+ *
+ * The header is a U16 entry count followed by a U32 magic; b->data is
+ * pointed just past the IBucketSize-byte header, into buf itself.
+ * When magic is nonzero and differs from the stored magic, the bucket
+ * is reported as holding no entries (b->n == 0).
+ */
+void
+unpackibucket(IBucket *b, u8int *buf, u32int magic)
+{
+	u16int count;
+
+	count = U16GET(buf);
+	if(magic != 0 && U32GET(buf+U16Size) != magic)
+		count = 0;
+
+	b->n = count;
+	b->data = buf + IBucketSize;
+}
+
+/*
+ * Write the bucket header for b into buf: the U16 entry count b->n
+ * followed by the U32 magic.  The entries themselves are not touched.
+ */
+void
+packibucket(IBucket *b, u8int *buf, u32int magic)
+{
+	u8int *countp, *magicp;
+
+	countp = buf;
+	magicp = buf + U16Size;
+
+	U16PUT(countp, b->n);
+	U32PUT(magicp, magic);
+}
+
+/*
+ * Serialize the bloom filter header for b into buf as four
+ * consecutive U32 fields: BloomMagic, BloomVersion, nhash, size.
+ */
+void
+packbloomhead(Bloom *b, u8int *buf)
+{
+	U32PUT(buf, BloomMagic);
+	U32PUT(buf + U32Size, BloomVersion);
+	U32PUT(buf + 2*U32Size, b->nhash);
+	U32PUT(buf + 3*U32Size, b->size);
+}
+
+/*
+ * Decode a bloom filter header from buf into b.
+ *
+ * Disk order: magic, version, nhash, size (four U32 fields).
+ * The size is accepted only if it is a power of two in the range
+ * BloomHeadSize..MaxBloomSize.  b->nhash and b->size may already have
+ * been overwritten when the size check fails.
+ *
+ * Returns 0 on success, or -1 after seterr(ECorrupt, ...) for a wrong
+ * magic number, wrong version or invalid size.  Calls sysfatal if the
+ * bytes consumed do not add up to BloomHeadSize.
+ */
+int
+unpackbloomhead(Bloom *b, u8int *buf)
+{
+	u8int *p;
+	u32int m;
+	char fbuf[20];
+
+	p = buf;
+
+	m = U32GET(p);
+	if(m != BloomMagic){
+		seterr(ECorrupt, "bloom filter has wrong magic number: %s expected BloomMagic (%#lux)", fmtmagic(fbuf, m), (ulong)BloomMagic);
+		return -1;
+	}
+	p += U32Size;
+
+	m = U32GET(p);
+	if(m != BloomVersion){
+		seterr(ECorrupt, "bloom filter has wrong version %ud expected %ud", (uint)m, (uint)BloomVersion);
+		return -1;
+	}
+	p += U32Size;
+
+	b->nhash = U32GET(p);
+	p += U32Size;
+
+	b->size = U32GET(p);
+	p += U32Size;
+	/* size&(size-1) is nonzero exactly when size is not a power of two */
+	if(b->size < BloomHeadSize || b->size > MaxBloomSize || (b->size&(b->size-1))){
+		seterr(ECorrupt, "bloom filter has invalid size %#lux", b->size);
+		return -1;
+	}
+
+	if(buf + BloomHeadSize != p)
+		sysfatal("unpackbloomhead unpacked wrong amount");
+
+	return 0;
+}
diff --git a/sys/src/cmd/venti/srv/dat.h b/sys/src/cmd/venti/srv/dat.h
new file mode 100755
index 000000000..24cc79e8b
--- /dev/null
+++ b/sys/src/cmd/venti/srv/dat.h
@@ -0,0 +1,758 @@
+typedef struct Config Config; /* forward typedefs: let the structs refer to one another by bare name */
+typedef struct AMap AMap;
+typedef struct AMapN AMapN;
+typedef struct Arena Arena;
+typedef struct AState AState;
+typedef struct ArenaCIG ArenaCIG;
+typedef struct ArenaHead ArenaHead;
+typedef struct ArenaPart ArenaPart;
+typedef struct ArenaTail ArenaTail;
+typedef struct ATailStats ATailStats;
+typedef struct CIBlock CIBlock;
+typedef struct Clump Clump;
+typedef struct ClumpInfo ClumpInfo;
+typedef struct Graph Graph;
+typedef struct IAddr IAddr;
+typedef struct IBucket IBucket;
+typedef struct IEStream IEStream;
+typedef struct IEntry IEntry;
+typedef struct IFile IFile;
+typedef struct ISect ISect;
+typedef struct Index Index;
+typedef struct Lump Lump;
+typedef struct DBlock DBlock;
+typedef struct Part Part;
+typedef struct Statbin Statbin;
+typedef struct Statdesc Statdesc;
+typedef struct Stats Stats;
+typedef struct ZBlock ZBlock;
+typedef struct Round Round;
+typedef struct Bloom Bloom;
+
+#pragma incomplete IEStream
+
+#define TWID32 ((u32int)~(u32int)0) /* all 32 bits set */
+#define TWID64 ((u64int)~(u64int)0) /* all 64 bits set */
+#define TWID8 ((u8int)~(u8int)0) /* all 8 bits set */
+
+enum
+{
+ ABlockLog = 9, /* log2(512), the quantum for reading arenas */
+ ANameSize = 64, /* bytes occupied by a name field, in memory and on disk */
+ MaxDiskBlock = 64*1024, /* max. allowed size for a disk block */
+ MaxIoSize = 64*1024, /* max. allowed size for a disk io operation */
+ PartBlank = 256*1024, /* untouched section at beginning of partition */
+ HeadSize = 512, /* size of a header after PartBlank */
+ MinArenaSize = 1*1024*1024, /* smallest reasonable arena size */
+ IndexBase = 1024*1024, /* initial address to use in an index */
+ MaxIo = 64*1024, /* max size of a single read or write operation; same value as MaxIoSize */
+ ICacheBits = 16, /* default bits for indexing icache */
+ MaxAMap = 31*1024, /* max. allowed arenas in an address mapping; must be < 32*1024 */
+ Unspecified = ~0ul,
+
+ /*
+ * return codes from syncarena
+ */
+ SyncDataErr = 1 << 0, /* problem reading the clump data */
+ SyncCIErr = 1 << 1, /* found erroneous clump directory entries */
+ SyncCIZero = 1 << 2, /* found unwritten clump directory entries */
+ SyncFixErr = 1 << 3, /* error writing fixed data */
+ SyncHeader = 1 << 4, /* altered header fields */
+
+ /*
+ * error severity
+ */
+ EOk = 0, /* error expected in normal operation */
+ EStrange, /* strange error that should be logged */
+ ECorrupt, /* corrupted data found in arenas */
+ EICorrupt, /* corrupted data found in index */
+ EAdmin, /* should be brought to administrators' attention */
+ ECrash, /* really bad internal error */
+ EBug, /* a limitation which should be fixed */
+ EInconsist, /* inconsistencies between index and arena */
+ EMax,
+
+ /*
+ * internal disk formats for the venti archival storage system
+ */
+ /*
+ * magic numbers on disk
+ */
+ _ClumpMagic = 0xd15cb10cU, /* clump header, deprecated */
+ ClumpFreeMagic = 0, /* free clump; terminates active clump log */
+
+ ArenaPartMagic = 0xa9e4a5e7U, /* arena partition header */
+ ArenaMagic = 0xf2a14eadU, /* arena trailer */
+ ArenaHeadMagic = 0xd15c4eadU, /* arena header */
+
+ BloomMagic = 0xb1004eadU, /* bloom filter header */
+ BloomMaxHash = 32, /* same value as MaxBloomHash below */
+
+ ISectMagic = 0xd15c5ec7U, /* index header */
+
+ ArenaPartVersion = 3,
+ ArenaVersion4 = 4, /* arena header stores no clump magic; _ClumpMagic is assumed */
+ ArenaVersion5 = 5, /* arena header stores its own clump magic */
+ BloomVersion = 1,
+ IndexVersion = 1,
+ ISectVersion1 = 1, /* index section header stores no bucket magic */
+ ISectVersion2 = 2, /* index section header ends with a bucket magic */
+
+ /*
+ * encodings of clumps on disk
+ */
+ ClumpEErr = 0, /* can't happen */
+ ClumpENone, /* plain */
+ ClumpECompress, /* compressed */
+ ClumpEMax,
+
+ /*
+ * sizes in bytes on disk
+ */
+ U8Size = 1,
+ U16Size = 2,
+ U32Size = 4,
+ U64Size = 8,
+
+ ArenaPartSize = 4 * U32Size,
+ ArenaSize4 = 2 * U64Size + 6 * U32Size + ANameSize + U8Size,
+ ArenaSize5 = ArenaSize4 + U32Size,
+ ArenaSize5a = ArenaSize5 + 2 * U8Size + 2 * U32Size + 2 * U64Size, /* ArenaSize5 plus the extension fields */
+ ArenaHeadSize4 = U64Size + 3 * U32Size + ANameSize, /* magic, version, name, blocksize, size */
+ ArenaHeadSize5 = ArenaHeadSize4 + U32Size, /* version 4 fields plus clumpmagic */
+ BloomHeadSize = 4 * U32Size, /* magic, version, nhash, size */
+ ISectSize1 = 7 * U32Size + 2 * ANameSize, /* magic, version, name, index, blocksize, blockbase, blocks, start, stop */
+ ISectSize2 = ISectSize1 + U32Size, /* version 1 fields plus bucketmagic */
+ ClumpInfoSize = U8Size + 2 * U16Size + VtScoreSize, /* type, size, uncsize, score */
+ ClumpSize = ClumpInfoSize + U8Size + 3 * U32Size, /* magic, ClumpInfo, encoding, creator, time */
+ MaxBloomSize = 1<<(32-3), /* 2^32 bits */
+ MaxBloomHash = 32, /* bits per score */
+ /*
+ * BUG - The various block copies that manipulate entry buckets
+ * would be faster if we bumped IBucketSize up to 8 and IEntrySize up to 40,
+ * so that everything is word-aligned. Buildindex is actually cpu-bound
+ * by the (byte at a time) copying in qsort.
+ */
+ IBucketSize = U32Size + U16Size, /* u16 entry count followed by u32 magic */
+ IEntrySize = U64Size + U32Size + 2*U16Size + 2*U8Size + VtScoreSize, /* score, wtime, train, addr, size, type, blocks */
+ IEntryTypeOff = VtScoreSize + U32Size + U16Size + U64Size + U16Size, /* offset of the type byte in a packed IEntry */
+ IEntryAddrOff = VtScoreSize + U32Size + U16Size, /* offset of the address in a packed IEntry */
+
+ MaxClumpBlocks = (VtMaxLumpSize + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog, /* largest lump plus header, rounded up to 1<<ABlockLog byte blocks */
+
+ IcacheFrac = 1000000, /* denominator */
+
+ SleepForever = 1000000000, /* magic value for sleep time */
+ /*
+ * dirty flags - order controls disk write order
+ */
+ DirtyArena = 1,
+ DirtyArenaCib,
+ DirtyArenaTrailer,
+ DirtyMax,
+
+ ArenaCIGSize = 10*1024, // about 0.5 MB worth of IEntry.
+
+ VentiZZZZZZZZ
+};
+
+extern char TraceDisk[]; /* Trace*: char arrays defined elsewhere; presumably trace category names -- confirm */
+extern char TraceLump[];
+extern char TraceBlock[];
+extern char TraceProc[];
+extern char TraceWork[];
+extern char TraceQuiet[];
+extern char TraceRpc[];
+
+/*
+ * results of parsing and initializing a config file
+ */
+struct Config
+{
+ char *index; /* name of the index to initialize */
+ int naparts; /* arena partitions initialized */
+ ArenaPart **aparts; /* presumably naparts entries -- confirm in config.c */
+ int nsects; /* index sections initialized */
+ ISect **sects; /* presumably nsects entries -- confirm in config.c */
+ Bloom *bloom; /* bloom filter */
+ u32int bcmem; /* NOTE(review): bcmem, mem, icmem look like memory budgets; units not visible here */
+ u32int mem;
+ u32int icmem;
+ int queuewrites;
+ char* haddr; /* NOTE(review): haddr/vaddr are presumably listen addresses -- confirm */
+ char* vaddr;
+ char* webroot;
+};
+
+/*
+ * a Part is the low level interface to files or disks.
+ * there are two main types of partitions
+ * arena partitions, which hold some number of arenas, each in a sub-partition.
+ * index partitions, which only have one subpartition.
+ */
+struct Part
+{
+ int fd; /* rock for accessing the disk */
+ int mode;
+ u64int offset;
+ u64int size; /* size of the partition */
+ u32int blocksize; /* block size for reads and writes */
+ u32int fsblocksize; /* minimum file system block size */
+ char *name;
+ char *filename;
+ Channel *writechan; /* chan[dcache.nblock](DBlock*) */
+};
+
+/*
+ * a cached block from the partition
+ * yuck -- most of this is internal structure for the cache
+ * all other routines should only use data
+ */
+struct DBlock
+{
+ u8int *data; /* block contents; the only field meant for use outside the cache */
+
+ Part *part; /* partition in which cached */
+ u64int addr; /* base address on the partition */
+ u32int size; /* amount of data available, not amount allocated; should go away */
+ u32int mode;
+ u32int dirty; /* presumably one of the Dirty* tags, 0 when clean -- confirm in dcache.c */
+ u32int dirtying;
+ DBlock *next; /* doubly linked hash chains */
+ DBlock *prev;
+ u32int heap; /* index in heap table */
+ u32int used; /* last reference times */
+ u32int used2;
+ u32int ref; /* reference count */
+ RWLock lock; /* for access to data only */
+ Channel *writedonechan;
+ void* chanbuf[1]; /* buffer for the chan! */
+};
+
+/*
+ * a cached packet, identified by score and type
+ * yuck -- most of this is internal structure for the cache
+ * all other routines should only use data
+ * double yuck -- this is mostly the same as a DBlock
+ */
+struct Lump
+{
+ Packet *data; /* the cached packet */
+
+ Part *part; /* partition in which cached */
+ u8int score[VtScoreSize]; /* score of packet */
+ u8int type; /* type of packet */
+ u32int size; /* amount of data allocated to hold packet */
+ Lump *next; /* doubly linked hash chains */
+ Lump *prev;
+ u32int heap; /* index in heap table */
+ u32int used; /* last reference times */
+ u32int used2;
+ u32int ref; /* reference count */
+ QLock lock; /* for access to data only */
+};
+
+/*
+ * mapping between names and address ranges
+ */
+struct AMap
+{
+ u64int start; /* beginning of the address range */
+ u64int stop; /* end of the address range; presumably exclusive -- confirm */
+ char name[ANameSize]; /* name the range is mapped to */
+};
+
+/*
+ * an AMap along with a length
+ */
+struct AMapN
+{
+ int n; /* the length: presumably the number of entries in map */
+ AMap *map;
+};
+
+/*
+ * an ArenaPart is a partition made up of Arenas
+ * it exists because most os's don't support many partitions,
+ * and we want to have many different Arenas
+ */
+struct ArenaPart
+{
+ Part *part; /* the underlying partition */
+ u64int size; /* size of underlying partition, rounded down to blocks */
+ Arena **arenas; /* presumably narenas entries -- confirm in arenas.c */
+ u32int tabbase; /* base address of arena table on disk */
+ u32int tabsize; /* max. bytes in arena table */
+
+ /*
+ * fields stored on disk
+ */
+ u32int version;
+ u32int blocksize; /* "optimal" block size for reads and writes */
+ u32int arenabase; /* base address of first arena */
+
+ /*
+ * stored in the arena mapping table on disk
+ */
+ AMap *map;
+ int narenas;
+};
+
+/*
+ * info about one block in the clump info cache
+ */
+struct CIBlock
+{
+ u32int block; /* blocks in the directory */
+ int offset; /* offsets of one clump in the data */
+ DBlock *data; /* presumably the cached disk block backing this entry -- confirm */
+};
+
+/*
+ * Statistics kept in the tail.
+ * The Arena description further down in this file derives
+ * compressed data as used - clumps*ClumpSize and reports uncsize as the data size.
+ */
+struct ATailStats
+{
+ u32int clumps; /* number of clumps */
+ u32int cclumps; /* number of compressed clumps */
+ u64int used; /* bytes of clump log in use, clump headers included */
+ u64int uncsize; /* total uncompressed size of the stored data */
+ u8int sealed; /* sealed flag */
+};
+
+/*
+ * Arena state - represents a point in the data log
+ */
+struct AState
+{
+ Arena *arena; /* arena whose log this state refers to */
+ u64int aa; /* index address */
+ ATailStats stats; /* tail statistics as of this point */
+};
+
+/*
+ * an Arena is a log of Clumps, preceded by an ArenaHead,
+ * and followed by an Arena trailer, each in one disk block.
+ * struct on disk is not always up to date, but should be self-consistent.
+ * to sync after reboot, follow clumps starting at used until ClumpFreeMagic is found.
+ * <struct name="Arena" type="Arena *">
+ * <field name="name" val="s->name" type="AName"/>
+ * <field name="version" val="s->version" type="U32int"/>
+ * <field name="partition" val="s->part->name" type="AName"/>
+ * <field name="blocksize" val="s->blocksize" type="U32int"/>
+ * <field name="start" val="s->base" type="U64int"/>
+ * <field name="stop" val="s->base+2*s->blocksize" type="U64int"/>
+ * <field name="created" val="s->ctime" type="U32int"/>
+ * <field name="modified" val="s->wtime" type="U32int"/>
+ * <field name="sealed" val="s->sealed" type="Sealed"/>
+ * <field name="score" val="s->score" type="Score"/>
+ * <field name="clumps" val="s->clumps" type="U32int"/>
+ * <field name="compressedclumps" val="s->cclumps" type="U32int"/>
+ * <field name="data" val="s->uncsize" type="U64int"/>
+ * <field name="compresseddata" val="s->used - s->clumps * ClumpSize" type="U64int"/>
+ * <field name="storage" val="s->used + s->clumps * ClumpInfoSize" type="U64int"/>
+ * </struct>
+ */
+struct Arena
+{
+ QLock lock; /* lock for arena fields, writing to disk */
+ Part *part; /* partition in which arena lives */
+ int blocksize; /* size of block to read or write */
+ u64int base; /* base address on disk */
+ u64int size; /* total space in the arena */
+ u8int score[VtScoreSize]; /* score of the entire sealed & summed arena */
+
+ int clumpmax; /* ClumpInfos per block */
+ AState mem;
+ int inqueue;
+
+ /*
+ * fields stored on disk
+ */
+ u32int version;
+ char name[ANameSize]; /* text label */
+ ATailStats memstats; /* NOTE(review): memstats/diskstats presumably the in-memory and on-disk views -- confirm in arena.c */
+ ATailStats diskstats;
+ u32int ctime; /* first time a block was written */
+ u32int wtime; /* last time a block was written */
+ u32int clumpmagic;
+
+ ArenaCIG *cig;
+ int ncig; /* presumably the number of entries in cig -- confirm */
+};
+
+struct ArenaCIG
+{
+ u64int offset; // from arena base; NOTE(review): units and what it locates are not visible in this header
+};
+
+/*
+ * redundant storage of some fields at the beginning of each arena
+ * packed and unpacked by packarenahead/unpackarenahead in conv.c
+ */
+struct ArenaHead
+{
+ u32int version; /* ArenaVersion4 or ArenaVersion5; selects the disk layout */
+ char name[ANameSize];
+ u32int blocksize;
+ u64int size;
+ u32int clumpmagic; /* on disk only for ArenaVersion5; _ClumpMagic for ArenaVersion4 */
+};
+
+/*
+ * most interesting meta information for a clump.
+ * stored in each clump's header and in the Arena's directory,
+ * stored in reverse order just prior to the arena trailer
+ */
+struct ClumpInfo
+{
+ u8int type; /* converted with vttodisktype/vtfromdisktype when packed/unpacked */
+ u16int size; /* size of disk data, not including header */
+ u16int uncsize; /* size of uncompressed data */
+ u8int score[VtScoreSize]; /* score of the uncompressed data only */
+};
+
+/*
+ * header for an immutable clump of data
+ */
+struct Clump
+{
+ ClumpInfo info;
+ u8int encoding; /* ClumpENone or ClumpECompress; anything else is rejected by checkclump */
+ u32int creator; /* initial client which wrote the block */
+ u32int time; /* creation at gmt seconds since 1/1/1970 */
+};
+
+/*
+ * index of all clumps according to their score
+ * this is just a wrapper to tie together the index sections
+ * <struct name="Index" type="Index *">
+ * <field name="name" val="s->name" type="AName"/>
+ * <field name="version" val="s->version" type="U32int"/>
+ * <field name="blocksize" val="s->blocksize" type="U32int"/>
+ * <field name="tabsize" val="s->tabsize" type="U32int"/>
+ * <field name="buckets" val="s->buckets" type="U32int"/>
+ * <field name="buckdiv" val="s->div" type="U32int"/>
+ * <field name="bitblocks" val="s->div" type="U32int"/>
+ * <field name="maxdepth" val="s->div" type="U32int"/>
+ * <field name="bitkeylog" val="s->div" type="U32int"/>
+ * <field name="bitkeymask" val="s->div" type="U32int"/>
+ * <array name="sect" val="&s->smap[i]" elems="s->nsects" type="Amap"/>
+ * <array name="amap" val="&s->amap[i]" elems="s->narenas" type="Amap"/>
+ * <array name="arena" val="s->arenas[i]" elems="s->narenas" type="Arena"/>
+ * </struct>
+ * <struct name="Amap" type="AMap *">
+ * <field name="name" val="s->name" type="AName"/>
+ * <field name="start" val="s->start" type="U64int"/>
+ * <field name="stop" val="s->stop" type="U64int"/>
+ * </struct>
+ */
+struct Index
+{
+ u32int div; /* divisor for mapping score to bucket */
+ u32int buckets; /* last bucket used in disk hash table */
+ u32int blocksize;
+ u32int tabsize; /* max. bytes in index config */
+
+ int mapalloc; /* first arena to check when adding a lump */
+ Arena **arenas; /* arenas in the mapping */
+ ISect **sects; /* sections which hold the buckets */
+ Bloom *bloom; /* bloom filter */
+
+ /*
+ * fields stored in config file
+ */
+ u32int version;
+ char name[ANameSize]; /* text label */
+ int nsects;
+ AMap *smap; /* mapping of buckets to index sections */
+ int narenas;
+ AMap *amap; /* mapping from index addresses to arenas */
+
+ QLock writing;
+};
+
+/*
+ * one part of the bucket storage for an index.
+ * the index blocks are sequentially allocated
+ * across all of the sections.
+ */
+struct ISect
+{
+ Part *part;
+ int blocklog; /* log2(blocksize) */
+ int buckmax; /* max. entries in a index bucket */
+ u32int tabbase; /* base address of index config table on disk */
+ u32int tabsize; /* max. bytes in index config */
+ Channel *writechan;
+ Channel *writedonechan;
+ void *ig; /* used by buildindex only */
+ int ng;
+
+ /*
+ * fields stored on disk
+ */
+ u32int version; /* ISectVersion1 or ISectVersion2 */
+ u32int bucketmagic; /* on disk only for ISectVersion2; unpackisect sets 0 otherwise */
+ char name[ANameSize]; /* text label */
+ char index[ANameSize]; /* index owning the section */
+ u32int blocksize; /* size of hash buckets in index */
+ u32int blockbase; /* address of start of on disk index table */
+ u32int blocks; /* total blocks on disk; some may be unused */
+ u32int start; /* first bucket in this section */
+ u32int stop; /* limit of buckets in this section */
+};
+
+/*
+ * externally interesting part of an IEntry
+ */
+struct IAddr
+{
+ u64int addr; /* address of the clump; presumably an index address -- confirm */
+ u16int size; /* uncompressed size */
+ u8int type; /* type of block */
+ u8int blocks; /* arena io quanta for Clump + data */
+};
+
+/*
+ * entries in the index
+ * kept in IBuckets in the disk index table,
+ * cached in the memory ICache.
+ */
+struct IEntry
+{
+ /* on disk data - 32 bytes here; the packed record is IEntrySize bytes (see packientry) */
+ u8int score[VtScoreSize];
+ IAddr ia;
+
+ IEntry *nexthash;
+ IEntry *nextdirty;
+ IEntry *next;
+ IEntry *prev;
+ u8int state; /* presumably IEClean, IEDirty or IESummary -- confirm in icache.c */
+};
+enum { /* presumably the values of IEntry.state -- confirm in icache.c */
+ IEClean = 0,
+ IEDirty = 1,
+ IESummary = 2,
+};
+
+/*
+ * buckets in the on disk index table
+ */
+struct IBucket
+{
+ u16int n; /* number of active indices */
+ u32int buck; /* used by buildindex/checkindex only */
+ u8int *data; /* entries; unpackibucket points this just past the IBucketSize header */
+};
+
+/*
+ * temporary buffers used by individual threads
+ */
+struct ZBlock
+{
+ u32int len; /* NOTE(review): presumably bytes usable at data -- confirm in zblock.c */
+ u32int _size;
+ u8int *data;
+ u8int *free; /* NOTE(review): presumably the pointer to hand back to free -- confirm */
+};
+
+/*
+ * simple input buffer for a '\0' terminated text file
+ * the whole file is held in b; pos is the read cursor
+ */
+struct IFile
+{
+ char *name; /* name of the file */
+ ZBlock *b; /* entire contents of file */
+ u32int pos; /* current position in the file */
+};
+
+struct Statdesc /* one per Stat* index; see statdesc[NStat] */
+{
+ char *name;
+ ulong max; /* NOTE(review): meaning not visible here -- see stats.c */
+};
+
+/*
+ * indices of the individual statistics.
+ * keep in sync with stats.c:/statdesc and httpd.c:/graphname
+ */
+enum
+{
+ StatRpcTotal,
+ StatRpcRead,
+ StatRpcReadOk,
+ StatRpcReadFail,
+ StatRpcReadBytes,
+ StatRpcReadTime,
+ StatRpcReadCached,
+ StatRpcReadCachedTime,
+ StatRpcReadUncached,
+ StatRpcReadUncachedTime,
+ StatRpcWrite,
+ StatRpcWriteNew,
+ StatRpcWriteOld,
+ StatRpcWriteFail,
+ StatRpcWriteBytes,
+ StatRpcWriteTime,
+ StatRpcWriteNewTime,
+ StatRpcWriteOldTime,
+
+ StatLcacheHit,
+ StatLcacheMiss,
+ StatLcacheRead,
+ StatLcacheWrite,
+ StatLcacheSize,
+ StatLcacheStall,
+ StatLcacheReadTime,
+
+ StatDcacheHit,
+ StatDcacheMiss,
+ StatDcacheLookup,
+ StatDcacheRead,
+ StatDcacheWrite,
+ StatDcacheDirty,
+ StatDcacheSize,
+ StatDcacheFlush,
+ StatDcacheStall,
+ StatDcacheLookupTime,
+
+ StatDblockStall,
+ StatLumpStall,
+
+ StatIcacheHit,
+ StatIcacheMiss,
+ StatIcacheRead,
+ StatIcacheWrite,
+ StatIcacheFill,
+ StatIcachePrefetch,
+ StatIcacheDirty,
+ StatIcacheSize,
+ StatIcacheFlush,
+ StatIcacheStall,
+ StatIcacheReadTime,
+ StatIcacheLookup,
+ StatScacheHit,
+ StatScachePrefetch,
+
+ StatBloomHit,
+ StatBloomMiss,
+ StatBloomFalseMiss,
+ StatBloomLookup,
+ StatBloomOnes,
+ StatBloomBits,
+
+ StatApartRead,
+ StatApartReadBytes,
+ StatApartWrite,
+ StatApartWriteBytes,
+
+ StatIsectRead,
+ StatIsectReadBytes,
+ StatIsectWrite,
+ StatIsectWriteBytes,
+
+ StatSumRead,
+ StatSumReadBytes,
+
+ StatCigLoad,
+ StatCigLoadTime,
+
+ NStat /* count of the entries above; sizes statdesc[] and Stats.n[] */
+};
+
+extern Statdesc statdesc[NStat];
+
+/*
+ * statistics about the operation of the server
+ * mainly for performance monitoring and profiling.
+ */
+struct Stats
+{
+ ulong now; /* NOTE(review): presumably when the sample was taken -- confirm in stats.c */
+ ulong n[NStat]; /* one value per Stat* index */
+};
+
+struct Statbin /* NOTE(review): presumably a summary of nsamp samples -- confirm in stats.c */
+{
+ uint nsamp;
+ uint min;
+ uint max;
+ uint avg;
+};
+
+struct Graph /* NOTE(review): presumably the parameters of one statistics plot -- confirm in graph.c */
+{
+ long (*fn)(Stats*, Stats*, void*); /* callback taking two Stats and an opaque pointer */
+ void *arg; /* presumably handed to fn as its last argument -- confirm */
+ long t0;
+ long t1;
+ long min;
+ long max;
+ long wid;
+ long ht;
+ int fill;
+};
+
+/*
+ * for kicking background processes that run one round after another after another
+ * NOTE(review): the field protocol (last/current/next, delaykick) lives in round.c
+ */
+struct Round
+{
+ QLock lock;
+ Rendez start;
+ Rendez finish;
+ Rendez delaywait;
+ int delaytime;
+ int delaykick;
+ char* name;
+ int last;
+ int current;
+ int next;
+ int doanother;
+};
+
+/*
+ * Bloom filter of stored block hashes
+ * nhash and size are the two fields stored in the on-disk header
+ * (see packbloomhead/unpackbloomhead in conv.c).
+ */
+struct Bloom
+{
+ RWLock lk; /* protects nhash, nbits, tab, mb (NOTE(review): nbits, tab, mb are not fields of this struct) */
+ QLock mod; /* one marker at a time, protects nb (NOTE(review): nb is not a field of this struct) */
+ int nhash;
+ ulong size; /* bytes in tab; unpackbloomhead requires a power of two in BloomHeadSize..MaxBloomSize */
+ ulong bitmask; /* to produce bit index */
+ u8int *data;
+ Part *part;
+ Channel *writechan;
+ Channel *writedonechan;
+};
+
+extern Index *mainindex; /* all of these globals are defined elsewhere */
+extern u32int maxblocksize; /* max. block size used by any partition */
+extern int paranoid; /* should verify hashes on disk read */
+extern int queuewrites; /* put all lump writes on a queue and finish later */
+extern int readonly; /* only allowed to read the disk data */
+extern Stats stats;
+extern u8int zeroscore[VtScoreSize];
+extern int compressblocks;
+extern int writestodevnull; /* dangerous - for performance debugging */
+extern int collectstats;
+extern QLock memdrawlock;
+extern int icachesleeptime;
+extern int minicachesleeptime;
+extern int arenasumsleeptime;
+extern int manualscheduling;
+extern int l0quantum;
+extern int l1quantum;
+extern int ignorebloom;
+extern int icacheprefetch;
+extern int syncwrites;
+extern int debugarena; /* print in arena error msgs; -1==unknown */
+
+extern Stats *stathist;
+extern int nstathist;
+extern ulong stattime;
+
+#ifndef PLAN9PORT
+#pragma varargck type "V" uchar*
+#define ODIRECT 0 /* defined as 0 when not building for plan9port */
+#endif
+
diff --git a/sys/src/cmd/venti/srv/dcache.c b/sys/src/cmd/venti/srv/dcache.c
new file mode 100755
index 000000000..a50ef0c5c
--- /dev/null
+++ b/sys/src/cmd/venti/srv/dcache.c
@@ -0,0 +1,712 @@
+/*
+ * Disk cache.
+ *
+ * Caches raw disk blocks. Getdblock() gets a block, putdblock puts it back.
+ * Getdblock has a mode parameter that determines i/o and access to a block:
+ * if mode is OREAD or ORDWR, it is read from disk if not already in memory.
+ * If mode is ORDWR or OWRITE, it is locked for exclusive use before being returned.
+ * It is *not* marked dirty -- once changes have been made, they should be noted
+ * by using dirtydblock() before putdblock().
+ *
+ * There is a global cache lock as well as a lock on each block.
+ * Within a thread, the cache lock can be acquired while holding a block lock,
+ * but not vice versa; and a block cannot be locked if you already hold the lock
+ * on another block.
+ *
+ * The flush proc writes out dirty blocks in batches, one batch per dirty tag.
+ * For example, the DirtyArena blocks are all written to disk before any of the
+ * DirtyArenaCib blocks.
+ *
+ * This code used to be in charge of flushing the dirty index blocks out to
+ * disk, but updating the index turned out to benefit from extra care.
+ * Now cached index blocks are never marked dirty. The index.c code takes
+ * care of updating them behind our back, and uses _getdblock to update any
+ * cached copies of the blocks as it changes them on disk.
+ */
+
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+typedef struct DCache DCache;
+
+enum
+{
+ HashLog = 9,
+ HashSize = 1<<HashLog,
+ HashMask = HashSize - 1,
+};
+
+/*
+ * State of the single global disk cache (see dcache below).
+ * The arrays are all allocated once, in initdcache.
+ */
+struct DCache
+{
+ QLock lock;
+ RWLock dirtylock; /* must be held to inspect or set b->dirty */
+ Rendez full; /* slept on in _getdblock when no block can be bumped */
+ Round round; /* kicks flushproc */
+ DBlock *free; /* list of available blocks */
+ u32int now; /* ticks for usage timestamps */
+ int size; /* max. size of any block; allocated to each block */
+ DBlock **heads; /* hash table for finding address */
+ int nheap; /* number of available victims */
+ DBlock **heap; /* heap for locating victims */
+ int nblocks; /* number of blocks allocated */
+ DBlock *blocks; /* array of block descriptors */
+ DBlock **write; /* array of block pointers to be written */
+ u8int *mem; /* memory backing the data of all blocks (b->data points into it) */
+ int ndirty; /* number of dirty blocks */
+ int maxdirty; /* max. number of dirty blocks */
+};
+
+typedef struct Ra Ra;
+struct Ra
+{
+ Part *part;
+ u64int addr;
+};
+
+static DCache dcache;
+
+static int downheap(int i, DBlock *b);
+static int upheap(int i, DBlock *b);
+static DBlock *bumpdblock(void);
+static void delheap(DBlock *db);
+static void fixheap(int i, DBlock *b);
+static void flushproc(void*);
+static void writeproc(void*);
+
+/*
+ * Set up the global disk cache with mem bytes worth of blocks.
+ * Each block is maxblocksize bytes and all blocks start on the free list.
+ * Also starts flushproc and the delayed-kick proc for the cache's round.
+ * Calls sysfatal if mem is less than two blocks or maxblocksize is zero.
+ */
+void
+initdcache(u32int mem)
+{
+ DBlock *b, *last;
+ u32int nblocks, blocksize;
+ int i;
+ u8int *p;
+
+ if(mem < maxblocksize * 2)
+ sysfatal("need at least %d bytes for the disk cache", maxblocksize * 2);
+ if(maxblocksize == 0)
+ sysfatal("no max. block size given for disk cache");
+ blocksize = maxblocksize;
+ nblocks = mem / blocksize;
+ dcache.full.l = &dcache.lock;
+ dcache.nblocks = nblocks;
+ dcache.maxdirty = (nblocks * 2) / 3;
+ trace(TraceProc, "initialize disk cache with %d blocks of %d bytes, maximum %d dirty blocks\n",
+ nblocks, blocksize, dcache.maxdirty);
+ dcache.size = blocksize;
+ dcache.heads = MKNZ(DBlock*, HashSize);
+ dcache.heap = MKNZ(DBlock*, nblocks);
+ dcache.blocks = MKNZ(DBlock, nblocks);
+ dcache.write = MKNZ(DBlock*, nblocks);
+ /* over-allocate so the block data can start at an aligned address below */
+ dcache.mem = MKNZ(u8int, (nblocks+1+128) * blocksize);
+
+ last = nil;
+ /* round dcache.mem up to a multiple of blocksize (mask form assumes a power of two -- TODO confirm) */
+ p = (u8int*)(((uintptr)dcache.mem+blocksize-1)&~(uintptr)(blocksize-1));
+ for(i = 0; i < nblocks; i++){
+ b = &dcache.blocks[i];
+ b->data = &p[i * blocksize];
+ b->heap = TWID32; /* TWID32 means "not in the heap" */
+ b->writedonechan = chancreate(sizeof(void*), 1);
+ b->next = last;
+ last = b;
+ }
+
+ dcache.free = last;
+ dcache.nheap = 0;
+ setstat(StatDcacheSize, nblocks);
+ initround(&dcache.round, "dcache", 120*1000);
+
+ vtproc(flushproc, nil);
+ vtproc(delaykickroundproc, &dcache.round);
+}
+
+/*
+ * Hash a partition address to an index into dcache.heads.
+ */
+static u32int
+pbhash(u64int addr)
+{
+	u32int folded;
+
+	/* fold the high word into the low one, then multiply and keep the top HashLog bits */
+	folded = (addr >> 32) ^ addr;
+	return ((folded * 0x6b43a9b5) >> (32 - HashLog)) & HashMask;
+}
+
+/*
+ * Fetch a block through _getdblock with loading enabled and count the
+ * request in the read and/or write statistics according to mode.
+ */
+DBlock*
+getdblock(Part *part, u64int addr, int mode)
+{
+	DBlock *blk;
+	int isread, iswrite;
+
+	isread = (mode == OREAD || mode == ORDWR);
+	iswrite = (mode == OWRITE || mode == ORDWR);
+
+	blk = _getdblock(part, addr, mode, 1);
+	if(isread)
+		addstat(StatDcacheRead, 1);
+	if(iswrite)
+		addstat(StatDcacheWrite, 1);
+	return blk;
+}
+
+/*
+ * Look up, and if load is set fetch, the cache block for addr in part.
+ *
+ * mode is OREAD, OWRITE or ORDWR. OREAD returns the block read-locked;
+ * the other modes return it write-locked. Bytes not yet in memory are
+ * read from the partition, except in OWRITE mode where they are zeroed.
+ * With load == 0 only an already cached block is returned: a miss yields
+ * nil without bumping anything, and no lookup/hit/miss stats are counted.
+ *
+ * Returns the locked block with its reference count raised (release with
+ * putdblock), or nil if the partition block size exceeds the cache block
+ * size, on a miss with load == 0, or if the disk read fails.
+ */
+DBlock*
+_getdblock(Part *part, u64int addr, int mode, int load)
+{
+ DBlock *b;
+ u32int h, size, ms;
+
+ ms = 0;
+ trace(TraceBlock, "getdblock enter %s 0x%llux", part->name, addr);
+ size = part->blocksize;
+ if(size > dcache.size){
+ seterr(EAdmin, "block size %d too big for cache with size %d", size, dcache.size);
+ if(load)
+ addstat(StatDcacheLookup, 1);
+ return nil;
+ }
+ h = pbhash(addr);
+
+ /*
+ * look for the block in the cache
+ */
+ qlock(&dcache.lock);
+again:
+ for(b = dcache.heads[h]; b != nil; b = b->next){
+ if(b->part == part && b->addr == addr){
+ if(load)
+ addstat2(StatDcacheHit, 1, StatDcacheLookup, 1);
+ goto found;
+ }
+ }
+
+ /*
+ * missed: locate the block with the oldest second to last use.
+ * remove it from the heap, and fix up the heap.
+ */
+ if(!load){
+ qunlock(&dcache.lock);
+ return nil;
+ }
+
+ /*
+ * Only start timer here, on cache miss - calling msec() on plain cache hits
+ * makes cache hits system-call bound.
+ */
+ ms = msec();
+ addstat2(StatDcacheLookup, 1, StatDcacheMiss, 1);
+
+ b = bumpdblock();
+ if(b == nil){
+ /* nothing to evict: sleep until putdblock or flushproc wakes us, then search again */
+ trace(TraceBlock, "all disk cache blocks in use");
+ addstat(StatDcacheStall, 1);
+ rsleep(&dcache.full);
+ addstat(StatDcacheStall, -1);
+ goto again;
+ }
+
+ assert(!b->dirty);
+
+ /*
+ * the new block has no last use, so assume it happens sometime in the middle
+ * ZZZ this is not reasonable
+ */
+ b->used = (b->used2 + dcache.now) / 2;
+
+ /*
+ * rechain the block on the correct hash chain
+ */
+ b->next = dcache.heads[h];
+ dcache.heads[h] = b;
+ if(b->next != nil)
+ b->next->prev = b;
+ b->prev = nil;
+
+ b->addr = addr;
+ b->part = part;
+ b->size = 0; /* nothing valid in memory yet; forces the fill below */
+
+found:
+ b->ref++;
+ b->used2 = b->used;
+ b->used = dcache.now++;
+ if(b->heap != TWID32)
+ fixheap(b->heap, b);
+
+ /* first writer on this partition: create its write channel and writer proc */
+ if((mode == ORDWR || mode == OWRITE) && part->writechan == nil){
+ trace(TraceBlock, "getdblock allocwriteproc %s", part->name);
+ part->writechan = chancreate(sizeof(DBlock*), dcache.nblocks);
+ vtproc(writeproc, part);
+ }
+ qunlock(&dcache.lock);
+
+ trace(TraceBlock, "getdblock lock");
+ addstat(StatDblockStall, 1);
+ if(mode == OREAD)
+ rlock(&b->lock);
+ else
+ wlock(&b->lock);
+ addstat(StatDblockStall, -1);
+ trace(TraceBlock, "getdblock locked");
+
+ if(b->size != size){
+ /* the contents must be changed, so a reader trades its read lock for the write lock */
+ if(mode == OREAD){
+ addstat(StatDblockStall, 1);
+ runlock(&b->lock);
+ wlock(&b->lock);
+ addstat(StatDblockStall, -1);
+ }
+ if(b->size < size){
+ if(mode == OWRITE)
+ memset(&b->data[b->size], 0, size - b->size);
+ else{
+ trace(TraceBlock, "getdblock readpart %s 0x%llux", part->name, addr);
+ diskaccess(0);
+ if(readpart(part, addr + b->size, &b->data[b->size], size - b->size) < 0){
+ b->mode = ORDWR; /* so putdblock wunlocks */
+ putdblock(b);
+ return nil;
+ }
+ trace(TraceBlock, "getdblock readpartdone");
+ addstat(StatApartRead, 1);
+ addstat(StatApartReadBytes, size-b->size);
+ }
+ }
+ b->size = size;
+ /* give the write lock back and return to the read lock the caller asked for */
+ if(mode == OREAD){
+ addstat(StatDblockStall, 1);
+ wunlock(&b->lock);
+ rlock(&b->lock);
+ addstat(StatDblockStall, -1);
+ }
+ }
+
+ b->mode = mode;
+ trace(TraceBlock, "getdblock exit");
+ /* ms is nonzero only if the miss path started the timer */
+ if(ms)
+ addstat(StatDcacheLookupTime, msec() - ms);
+ return b;
+}
+
+/*
+ * Release a block obtained from getdblock/_getdblock: drop its lock and
+ * its reference. A block that becomes unreferenced and is not dirty goes
+ * back into the victim heap, and procs waiting for a free block are woken.
+ * A nil block is ignored.
+ */
+void
+putdblock(DBlock *b)
+{
+	if(b == nil)
+		return;
+
+	trace(TraceBlock, "putdblock %s 0x%llux", b->part->name, b->addr);
+
+	/* undo whichever lock the recorded mode implies */
+	if(b->mode != OREAD)
+		wunlock(&b->lock);
+	else
+		runlock(&b->lock);
+
+	qlock(&dcache.lock);
+	b->ref--;
+	if(b->ref == 0 && !b->dirty){
+		if(b->heap == TWID32)
+			upheap(dcache.nheap++, b);
+		rwakeupall(&dcache.full);
+	}
+	qunlock(&dcache.lock);
+}
+
+/*
+ * Mark block b as dirty with the given dirty tag.
+ * The caller must hold a reference and have the block in ORDWR or OWRITE mode.
+ * A block that is already dirty must be re-marked with the same tag.
+ * The first time a block becomes dirty, ndirty is bumped and the flush round
+ * is kicked: at once if maxdirty has been reached, otherwise via the delayed kick.
+ */
+void
+dirtydblock(DBlock *b, int dirty)
+{
+ int odirty;
+
+ trace(TraceBlock, "dirtydblock enter %s 0x%llux %d from 0x%lux",
+ b->part->name, b->addr, dirty, getcallerpc(&b));
+ assert(b->ref != 0);
+ assert(b->mode==ORDWR || b->mode==OWRITE);
+
+ odirty = b->dirty;
+ if(b->dirty)
+ assert(b->dirty == dirty);
+ else
+ b->dirty = dirty;
+
+ qlock(&dcache.lock);
+ if(!odirty){
+ dcache.ndirty++;
+ setstat(StatDcacheDirty, dcache.ndirty);
+ if(dcache.ndirty >= dcache.maxdirty)
+ kickround(&dcache.round, 0);
+ else
+ delaykickround(&dcache.round);
+ }
+ qunlock(&dcache.lock);
+}
+
+/*
+ * Take block b off its hash chain.
+ * Calls sysfatal if b claims to be a chain head but the table disagrees.
+ */
+static void
+unchain(DBlock *b)
+{
+	ulong slot;
+
+	if(b->prev != nil)
+		b->prev->next = b->next;
+	else{
+		/* no predecessor: b must be the head of its bucket */
+		slot = pbhash(b->addr);
+		if(dcache.heads[slot] != b)
+			sysfatal("bad hash chains in disk cache");
+		dcache.heads[slot] = b->next;
+	}
+	if(b->next != nil)
+		b->next->prev = b->prev;
+}
+
+/*
+ * remove some block from use and update the free list and counters
+ *
+ * Returns a block that is neither referenced nor dirty and is on no hash
+ * chain, or nil if the free list and the heap are both exhausted (in which
+ * case the flush round is kicked so blocks may become available later).
+ */
+static DBlock*
+bumpdblock(void)
+{
+ DBlock *b;
+
+ trace(TraceBlock, "bumpdblock enter");
+ /* prefer a never-used (or emptied) block from the free list */
+ b = dcache.free;
+ if(b != nil){
+ dcache.free = b->next;
+ return b;
+ }
+
+ if(dcache.ndirty >= dcache.maxdirty)
+ kickdcache();
+
+ /*
+ * remove blocks until we find one that is unused
+ * referenced blocks are left in the heap even though
+ * they can't be scavenged; this is simply a speed optimization
+ */
+ for(;;){
+ if(dcache.nheap == 0){
+ kickdcache();
+ trace(TraceBlock, "bumpdblock gotnothing");
+ return nil;
+ }
+ b = dcache.heap[0];
+ delheap(b);
+ if(!b->ref && !b->dirty)
+ break;
+ }
+
+ trace(TraceBlock, "bumpdblock bumping %s 0x%llux", b->part->name, b->addr);
+
+ unchain(b);
+ return b;
+}
+
+/*
+ * Drain the victim heap. Blocks that are unreferenced and clean are
+ * unchained from the hash table and moved to the free list; the others
+ * are only removed from the heap.
+ * NOTE(review): b->addr and b->part are not cleared here, while checkdcache
+ * expects free-list blocks to have addr == 0 -- confirm whether that matters.
+ */
+void
+emptydcache(void)
+{
+ DBlock *b;
+
+ qlock(&dcache.lock);
+ while(dcache.nheap > 0){
+ b = dcache.heap[0];
+ delheap(b);
+ if(!b->ref && !b->dirty){
+ unchain(b);
+ b->next = dcache.free;
+ dcache.free = b;
+ }
+ }
+ qunlock(&dcache.lock);
+}
+
+/*
+ * delete an arbitrary block from the heap
+ *
+ * Does nothing if db is not in the heap (heap == TWID32). Otherwise the
+ * last heap entry is moved into db's slot and sifted into place, the heap
+ * shrinks by one, and db is marked as not in the heap.
+ */
+static void
+delheap(DBlock *db)
+{
+ if(db->heap == TWID32)
+ return;
+ fixheap(db->heap, dcache.heap[--dcache.nheap]);
+ db->heap = TWID32;
+}
+
+/*
+ * push an element up or down to its correct new location
+ *
+ * b is placed starting at slot i: first try moving it toward the root,
+ * and only if it did not move, try moving it toward the leaves.
+ */
+static void
+fixheap(int i, DBlock *b)
+{
+ if(upheap(i, b) == i)
+ downheap(i, b);
+}
+
+/*
+ * Sift b up from slot i toward the root of dcache.heap.
+ * Blocks are ordered by used2 - now computed in u32int arithmetic;
+ * each parent with a larger key is moved down one level.
+ * Stores b in its final slot, records the slot in b->heap, and returns it.
+ */
+static int
+upheap(int i, DBlock *b)
+{
+ DBlock *bb;
+ u32int now;
+ int p;
+
+ now = dcache.now;
+ for(; i != 0; i = p){
+ p = (i - 1) >> 1; /* parent slot */
+ bb = dcache.heap[p];
+ if(b->used2 - now >= bb->used2 - now)
+ break;
+ dcache.heap[i] = bb;
+ bb->heap = i;
+ }
+
+ dcache.heap[i] = b;
+ b->heap = i;
+ return i;
+}
+
+/*
+ * Sift b down from slot i toward the leaves of dcache.heap.
+ * At each level the child with the smaller key (used2 - now in u32int
+ * arithmetic) is chosen and moved up if its key is smaller than b's.
+ * Stores b in its final slot, records the slot in b->heap, and returns it.
+ */
+static int
+downheap(int i, DBlock *b)
+{
+ DBlock *bb;
+ u32int now;
+ int k;
+
+ now = dcache.now;
+ for(; ; i = k){
+ k = (i << 1) + 1; /* left child */
+ if(k >= dcache.nheap)
+ break;
+ /* switch to the right child if it exists and has the smaller key */
+ if(k + 1 < dcache.nheap && dcache.heap[k]->used2 - now > dcache.heap[k + 1]->used2 - now)
+ k++;
+ bb = dcache.heap[k];
+ if(b->used2 - now <= bb->used2 - now)
+ break;
+ dcache.heap[i] = bb;
+ bb->heap = i;
+ }
+
+ dcache.heap[i] = b;
+ b->heap = i;
+ return i;
+}
+
+/*
+ * Consistency check: walk the hash chain that bb's address maps to,
+ * verifying the prev links along the way, and sysfatal unless bb is on it.
+ */
+static void
+findblock(DBlock *bb)
+{
+	DBlock *cur, *before;
+	int slot;
+
+	slot = pbhash(bb->addr);
+	before = nil;
+	cur = dcache.heads[slot];
+	while(cur != nil){
+		if(cur->prev != before)
+			sysfatal("bad prev link");
+		if(cur == bb)
+			return;
+		before = cur;
+		cur = cur->next;
+	}
+	sysfatal("block missing from hash table");
+}
+
+/*
+ * Verify the internal consistency of the disk cache and sysfatal on the
+ * first problem found: heap back-pointers and ordering, placement of each
+ * block's data, hash chain membership, free list contents, and the total
+ * block count.
+ *
+ * The data placement check uses the same aligned base address that
+ * initdcache computes; comparing against the raw dcache.mem pointer would
+ * report "mis-blocked" whenever the allocation is not already aligned.
+ *
+ * NOTE(review): the final accounting counts only heap, free and referenced
+ * blocks; an unreferenced dirty block is in none of those sets, so this
+ * check presumably can only pass on a cache with no such blocks -- confirm.
+ */
+void
+checkdcache(void)
+{
+	DBlock *b;
+	u32int size, now;
+	int i, k, refed, nfree;
+	u8int *base;
+
+	qlock(&dcache.lock);
+	size = dcache.size;
+	now = dcache.now;
+	for(i = 0; i < dcache.nheap; i++){
+		if(dcache.heap[i]->heap != i)
+			sysfatal("dc: mis-heaped at %d: %d", i, dcache.heap[i]->heap);
+		if(i > 0 && dcache.heap[(i - 1) >> 1]->used2 - now > dcache.heap[i]->used2 - now)
+			sysfatal("dc: bad heap ordering");
+		k = (i << 1) + 1;
+		if(k < dcache.nheap && dcache.heap[i]->used2 - now > dcache.heap[k]->used2 - now)
+			sysfatal("dc: bad heap ordering");
+		k++;
+		if(k < dcache.nheap && dcache.heap[i]->used2 - now > dcache.heap[k]->used2 - now)
+			sysfatal("dc: bad heap ordering");
+	}
+
+	/* same rounding as initdcache: block data starts at the aligned-up address */
+	base = (u8int*)(((uintptr)dcache.mem+size-1)&~(uintptr)(size-1));
+
+	refed = 0;
+	for(i = 0; i < dcache.nblocks; i++){
+		b = &dcache.blocks[i];
+		if(b->data != &base[i * size])
+			sysfatal("dc: mis-blocked at %d", i);
+		if(b->ref && b->heap == TWID32)
+			refed++;
+		if(b->addr)
+			findblock(b);
+		if(b->heap != TWID32
+		&& dcache.heap[b->heap] != b)
+			sysfatal("dc: spurious heap value");
+	}
+
+	nfree = 0;
+	for(b = dcache.free; b != nil; b = b->next){
+		if(b->addr != 0 || b->heap != TWID32)
+			sysfatal("dc: bad free list");
+		nfree++;
+	}
+
+	if(dcache.nheap + nfree + refed != dcache.nblocks)
+		sysfatal("dc: missing blocks: %d %d %d", dcache.nheap, refed, dcache.nblocks);
+	qunlock(&dcache.lock);
+}
+
+/*
+ * Kick the flush round with a second argument of 1
+ * (presumably "wait for the round to finish" -- confirm in kickround).
+ */
+void
+flushdcache(void)
+{
+ trace(TraceProc, "flushdcache enter");
+ kickround(&dcache.round, 1);
+ trace(TraceProc, "flushdcache exit");
+}
+
+/*
+ * Kick the flush round with a second argument of 0
+ * (presumably "do not wait" -- confirm in kickround).
+ */
+void
+kickdcache(void)
+{
+ kickround(&dcache.round, 0);
+}
+
+/*
+ * Write out the leading run of blocks in [b, eb) whose dirty tag equals dirty.
+ * Each block is sent to its partition's write channel, then every block's
+ * writedonechan is received from, and finally each partition touched is
+ * flushed (once per run of adjacent blocks on the same partition).
+ * Returns the number of blocks handled.
+ */
+static int
+parallelwrites(DBlock **b, DBlock **eb, int dirty)
+{
+ DBlock **p, **q;
+ Part *part;
+
+ for(p=b; p<eb && (*p)->dirty == dirty; p++){
+ assert(b<=p && p<eb);
+ sendp((*p)->part->writechan, *p);
+ }
+ q = p; /* one past the last block that was sent */
+ for(p=b; p<q; p++){
+ assert(b<=p && p<eb);
+ recvp((*p)->writedonechan);
+ }
+
+ /*
+ * Flush the partitions that have been written to.
+ */
+ part = nil;
+ for(p=b; p<q; p++){
+ if(part != (*p)->part){
+ part = (*p)->part;
+ flushpart(part); /* what if it fails? */
+ }
+ }
+
+ return p-b;
+}
+
+/*
+ * Sort first by dirty flag, then by partition, then by address in partition.
+ *
+ * qsort comparison function over DBlock* elements. Returns a negative
+ * value, zero, or a positive value; zero is returned when all three keys
+ * are equal, as the qsort contract requires.
+ */
+static int
+writeblockcmp(const void *va, const void *vb)
+{
+	DBlock *a, *b;
+
+	a = *(DBlock**)va;
+	b = *(DBlock**)vb;
+
+	if(a->dirty != b->dirty)
+		return a->dirty - b->dirty;
+	if(a->part < b->part)
+		return -1;
+	if(a->part > b->part)
+		return 1;
+	if(a->addr < b->addr)
+		return -1;
+	if(a->addr > b->addr)
+		return 1;
+	return 0;
+}
+
+/*
+ * Background proc that writes dirty blocks to disk.
+ * Each time the round is kicked it collects all dirty blocks, sorts them
+ * with writeblockcmp, writes them out one dirty tag at a time (tags 1 up to
+ * DirtyMax-1) via parallelwrites, and then updates ndirty and returns
+ * unreferenced blocks to the victim heap. Never returns.
+ */
+static void
+flushproc(void *v)
+{
+ int i, j, n;
+ ulong t0;
+ DBlock *b, **write;
+
+ USED(v);
+ threadsetname("flushproc");
+ for(;;){
+ waitforkick(&dcache.round);
+
+ trace(TraceWork, "start");
+ t0 = nsec()/1000;
+ trace(TraceProc, "build t=%lud", (ulong)(nsec()/1000)-t0);
+
+ /* gather every block that is currently dirty */
+ write = dcache.write;
+ n = 0;
+ for(i=0; i<dcache.nblocks; i++){
+ b = &dcache.blocks[i];
+ if(b->dirty)
+ write[n++] = b;
+ }
+
+ qsort(write, n, sizeof(write[0]), writeblockcmp);
+
+ /* Write each stage of blocks out. */
+ trace(TraceProc, "writeblocks t=%lud", (ulong)(nsec()/1000)-t0);
+ i = 0;
+ for(j=1; j<DirtyMax; j++){
+ trace(TraceProc, "writeblocks.%d t=%lud",
+ j, (ulong)(nsec()/1000)-t0);
+ i += parallelwrites(write+i, write+n, j);
+ }
+ /* every collected block should have been consumed by some stage */
+ if(i != n){
+ fprint(2, "in flushproc i=%d n=%d\n", i, n);
+ for(i=0; i<n; i++)
+ fprint(2, "\tblock %d: dirty=%d\n",
+ i, write[i]->dirty);
+ abort();
+ }
+
+ /*
+ * b->dirty is protected by b->lock while ndirty is protected
+ * by dcache.lock, so the --ndirty below is the delayed one
+ * from clearing b->dirty in the write proc. It may happen
+ * that some other proc has come along and redirtied b since
+ * the write. That's okay, it just means that ndirty may be
+ * one too high until we catch up and do the decrement.
+ */
+ trace(TraceProc, "undirty.%d t=%lud", j, (ulong)(nsec()/1000)-t0);
+ qlock(&dcache.lock);
+ for(i=0; i<n; i++){
+ b = write[i];
+ --dcache.ndirty;
+ if(b->ref == 0 && b->heap == TWID32){
+ upheap(dcache.nheap++, b);
+ rwakeupall(&dcache.full);
+ }
+ }
+ setstat(StatDcacheDirty, dcache.ndirty);
+ qunlock(&dcache.lock);
+ addstat(StatDcacheFlush, 1);
+ trace(TraceWork, "finish");
+ }
+}
+
+/*
+ * Per-partition writer proc; v is the Part it serves.
+ * Receives blocks from the partition's writechan, writes each one to disk
+ * under the block's write lock, clears its dirty tag, and reports completion
+ * on the block's writedonechan. Never returns.
+ * NOTE(review): a failed writepart is only reported on fd 2; the dirty tag
+ * is cleared regardless.
+ */
+static void
+writeproc(void *v)
+{
+ DBlock *b;
+ Part *p;
+
+ p = v;
+
+ threadsetname("writeproc:%s", p->name);
+ for(;;){
+ b = recvp(p->writechan);
+ trace(TraceWork, "start");
+ assert(b->part == p);
+ trace(TraceProc, "wlock %s 0x%llux", p->name, b->addr);
+ wlock(&b->lock);
+ trace(TraceProc, "writepart %s 0x%llux", p->name, b->addr);
+ diskaccess(0);
+ if(writepart(p, b->addr, b->data, b->size) < 0)
+ fprint(2, "%s: writeproc: part %s addr 0x%llux: write error: %r\n",
+ argv0, p->name, b->addr);
+ addstat(StatApartWrite, 1);
+ addstat(StatApartWriteBytes, b->size);
+ b->dirty = 0;
+ wunlock(&b->lock);
+ trace(TraceProc, "finish %s 0x%llux", p->name, b->addr);
+ trace(TraceWork, "finish");
+ sendp(b->writedonechan, b);
+ }
+}
diff --git a/sys/src/cmd/venti/srv/disksched.c b/sys/src/cmd/venti/srv/disksched.c
new file mode 100755
index 000000000..d43b64c7f
--- /dev/null
+++ b/sys/src/cmd/venti/srv/disksched.c
@@ -0,0 +1,89 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+ulong lasttime[2];
+int manualscheduling;
+int l0quantum = 120;
+int l1quantum = 120;
+ulong lasticachechange;
+
+/*
+ * Recompute icachesleeptime and arenasumsleeptime from recent disk activity.
+ * With manualscheduling set, only lasticachechange is refreshed.
+ * If a level-0 access happened within l0quantum seconds, arena summing is
+ * put to sleep and the icache flush rate is chosen from how dirty the
+ * index cache is; if only a level-1 access happened within l1quantum
+ * seconds, the icache flushes freely but arena summing still sleeps;
+ * otherwise both run without delay.
+ */
+void
+disksched(void)
+{
+ int p, nwrite, nflush, ndirty, tdirty, toflush;
+ ulong t;
+ vlong cflush;
+ Stats *prev;
+
+ /*
+ * no locks because all the data accesses are atomic.
+ */
+ t = time(0);
+ if(manualscheduling){
+ lasticachechange = t;
+ return;
+ }
+
+ if(t-lasttime[0] < l0quantum){
+ /* level-0 disk access going on */
+ p = icachedirtyfrac();
+ if(p < IcacheFrac*5/10){ /* can wait */
+ icachesleeptime = SleepForever;
+ lasticachechange = t;
+ }else if(p > IcacheFrac*9/10){ /* can't wait */
+ icachesleeptime = 0;
+ lasticachechange = t;
+ }else if(t-lasticachechange > 60){
+ /* have minute worth of data for current rate */
+ prev = &stathist[(stattime-60+nstathist)%nstathist];
+
+ /* # entries written to index cache */
+ nwrite = stats.n[StatIcacheWrite] - prev->n[StatIcacheWrite];
+
+ /* # dirty entries in index cache */
+ ndirty = stats.n[StatIcacheDirty] - prev->n[StatIcacheDirty];
+
+ /* # entries flushed to disk */
+ nflush = nwrite - ndirty;
+
+ /* want to stay around 70% dirty */
+ tdirty = (vlong)stats.n[StatIcacheSize]*700/1000;
+
+ /* assume nflush*icachesleeptime is a constant */
+ cflush = (vlong)nflush*(icachesleeptime+1);
+
+ /* compute number of entries to write in next minute */
+ toflush = nwrite + (stats.n[StatIcacheDirty] - tdirty);
+
+ /* schedule for that many */
+ if(toflush <= 0 || cflush/toflush > 100000)
+ icachesleeptime = SleepForever;
+ else
+ icachesleeptime = cflush/toflush;
+ }
+ arenasumsleeptime = SleepForever;
+ return;
+ }
+ if(t-lasttime[1] < l1quantum){
+ /* level-1 disk access (icache flush) going on */
+ icachesleeptime = 0;
+ arenasumsleeptime = SleepForever;
+ return;
+ }
+ /* no disk access going on - no holds barred*/
+ icachesleeptime = 0;
+ arenasumsleeptime = 0;
+}
+
+/*
+ * Record the current time as the last disk access at the given level.
+ * An out-of-range level is reported on fd 2 and otherwise ignored.
+ */
+void
+diskaccess(int level)
+{
+	if(level >= 0 && level < nelem(lasttime)){
+		lasttime[level] = time(0);
+		return;
+	}
+	fprint(2, "bad level in diskaccess; caller=%#p\n",
+		getcallerpc(&level));
+}
+
diff --git a/sys/src/cmd/venti/srv/dump.c b/sys/src/cmd/venti/srv/dump.c
new file mode 100755
index 000000000..fa2bfb7d2
--- /dev/null
+++ b/sys/src/cmd/venti/srv/dump.c
@@ -0,0 +1,47 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * Print a summary of index ix to fd: its header fields, then one line
+ * per index section and one line per arena mapping.
+ */
+void
+printindex(int fd, Index *ix)
+{
+	int sect, arena;
+
+	fprint(fd, "index=%s version=%d blocksize=%d tabsize=%d\n",
+		ix->name, ix->version, ix->blocksize, ix->tabsize);
+	fprint(fd, "\tbuckets=%d div=%d\n", ix->buckets, ix->div);
+
+	sect = 0;
+	while(sect < ix->nsects){
+		fprint(fd, "\tsect=%s for buckets [%lld,%lld)\n", ix->smap[sect].name, ix->smap[sect].start, ix->smap[sect].stop);
+		sect++;
+	}
+
+	arena = 0;
+	while(arena < ix->narenas){
+		fprint(fd, "\tarena=%s at [%lld,%lld)\n", ix->amap[arena].name, ix->amap[arena].start, ix->amap[arena].stop);
+		arena++;
+	}
+}
+
+/*
+ * Print a summary of arena partition ap to fd, followed by one line per
+ * entry of its arena map. The values labelled setbase/setsize in the
+ * output are ap->tabbase and ap->tabsize.
+ */
+void
+printarenapart(int fd, ArenaPart *ap)
+{
+ int i;
+
+ fprint(fd, "arena partition=%s\n\tversion=%d blocksize=%d arenas=%d\n\tsetbase=%d setsize=%d\n",
+ ap->part->name, ap->version, ap->blocksize, ap->narenas, ap->tabbase, ap->tabsize);
+ for(i = 0; i < ap->narenas; i++)
+ fprint(fd, "\tarena=%s at [%lld,%lld)\n", ap->map[i].name, ap->map[i].start, ap->map[i].stop);
+}
+
+/*
+ * Print a description of arena to fd: name, address range, version and
+ * times, whether it is sealed, its score if that is not the zero score,
+ * and the clump statistics.
+ *
+ * All output goes to fd. Previously the "sealed" marker, the newline that
+ * ends the first line, and the score were written to fd 2 regardless of
+ * fd, which split the report across two descriptors.
+ */
+void
+printarena(int fd, Arena *arena)
+{
+	fprint(fd, "arena='%s' [%lld,%lld)\n\tversion=%d created=%d modified=%d",
+		arena->name, arena->base, arena->base + arena->size + 2 * arena->blocksize,
+		arena->version, arena->ctime, arena->wtime);
+	if(arena->memstats.sealed)
+		fprint(fd, " sealed\n");
+	else
+		fprint(fd, "\n");
+	if(scorecmp(zeroscore, arena->score) != 0)
+		fprint(fd, "\tscore=%V\n", arena->score);
+
+	fprint(fd, "\tclumps=%,d compressed clumps=%,d data=%,lld compressed data=%,lld disk storage=%,lld\n",
+		arena->memstats.clumps, arena->memstats.cclumps, arena->memstats.uncsize,
+		arena->memstats.used - arena->memstats.clumps * ClumpSize,
+		arena->memstats.used + arena->memstats.clumps * ClumpInfoSize);
+}
diff --git a/sys/src/cmd/venti/srv/findscore.c b/sys/src/cmd/venti/srv/findscore.c
new file mode 100755
index 000000000..412b07d40
--- /dev/null
+++ b/sys/src/cmd/venti/srv/findscore.c
@@ -0,0 +1,122 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+enum
+{
+ ClumpChunks = 32*1024
+};
+
+static int verbose;
+
+/*
+ * Report whether two clump directory entries are identical:
+ * same type, size, uncompressed size and score. Returns 1 or 0.
+ */
+int
+clumpinfoeq(ClumpInfo *c, ClumpInfo *d)
+{
+	if(c->type != d->type)
+		return 0;
+	if(c->size != d->size)
+		return 0;
+	if(c->uncsize != d->uncsize)
+		return 0;
+	return scorecmp(c->score, d->score) == 0;
+}
+
+/*
+ * Scan the clump directory of arena for entries whose score equals score.
+ * The directory is read ClumpChunks entries at a time; every match is
+ * reported on fd 2 together with a running position (the sum of
+ * size + ClumpSize over all preceding entries).
+ * Returns the number of matches found. If a directory read fails, the error
+ * is recorded with seterr and the matches found so far are returned.
+ */
+int
+findscore(Arena *arena, uchar *score)
+{
+ IEntry ie;
+ ClumpInfo *ci, *cis;
+ u64int a;
+ u32int clump;
+ int i, n, found;
+
+//ZZZ remove fprint?
+ if(arena->memstats.clumps)
+ fprint(2, "reading directory for arena=%s with %d entries\n",
+ arena->name, arena->memstats.clumps);
+
+ cis = MKN(ClumpInfo, ClumpChunks);
+ found = 0;
+ a = 0;
+ /* ie is cleared here but not otherwise used in this function */
+ memset(&ie, 0, sizeof(IEntry));
+ for(clump = 0; clump < arena->memstats.clumps; clump += n){
+ /* read a full chunk, or whatever remains at the end */
+ n = ClumpChunks;
+ if(n > arena->memstats.clumps - clump)
+ n = arena->memstats.clumps - clump;
+ if(readclumpinfos(arena, clump, cis, n) != n){
+ seterr(EOk, "arena directory read failed: %r");
+ break;
+ }
+
+ for(i = 0; i < n; i++){
+ ci = &cis[i];
+ if(scorecmp(score, ci->score)==0){
+ fprint(2, "found at clump=%d with type=%d size=%d csize=%d position=%lld\n",
+ clump + i, ci->type, ci->uncsize, ci->size, a);
+ found++;
+ }
+ a += ci->size + ClumpSize;
+ }
+ }
+ free(cis);
+ return found;
+}
+
+/*
+ * Print the findscore command-line synopsis on fd 2 and exit all threads.
+ */
+void
+usage(void)
+{
+ fprint(2, "usage: findscore [-v] arenafile score\n");
+ threadexitsall(0);
+}
+
+/*
+ * findscore entry point: findscore [-v] arenafile score.
+ * Opens arenafile read-only as an arena partition, searches every arena in
+ * it for score with findscore(), and prints the total number of occurrences.
+ * Each -v raises verbose; at verbose > 1 the partition layout and the
+ * statistics are printed as well.
+ */
+void
+threadmain(int argc, char *argv[])
+{
+ ArenaPart *ap;
+ Part *part;
+ char *file;
+ u8int score[VtScoreSize];
+ int i, found;
+
+ ventifmtinstall();
+
+ ARGBEGIN{
+ case 'v':
+ verbose++;
+ break;
+ default:
+ usage();
+ break;
+ }ARGEND
+
+ readonly = 1;
+
+ if(argc != 2)
+ usage();
+
+ file = argv[0];
+ if(strscore(argv[1], score) < 0)
+ sysfatal("bad score %s", argv[1]);
+
+ part = initpart(file, OREAD|ODIRECT);
+ if(part == nil)
+ sysfatal("can't open partition %s: %r", file);
+
+ ap = initarenapart(part);
+ if(ap == nil)
+ sysfatal("can't initialize arena partition in %s: %r", file);
+
+ if(verbose > 1){
+ printarenapart(2, ap);
+ fprint(2, "\n");
+ }
+
+ initdcache(8 * MaxDiskBlock);
+
+ found = 0;
+ for(i = 0; i < ap->narenas; i++)
+ found += findscore(ap->arenas[i], score);
+
+ print("found %d occurrences of %V\n", found, score);
+
+ if(verbose > 1)
+ printstats();
+ threadexitsall(0);
+}
diff --git a/sys/src/cmd/venti/srv/fixarenas.c b/sys/src/cmd/venti/srv/fixarenas.c
new file mode 100755
index 000000000..ac05ab8cd
--- /dev/null
+++ b/sys/src/cmd/venti/srv/fixarenas.c
@@ -0,0 +1,1914 @@
+/*
+ * Check and fix an arena partition.
+ *
+ * This is a lot grittier than the rest of Venti because
+ * it can't just give up if a byte here or there is wrong.
+ *
+ * The rule here (hopefully followed!) is that block corruption
+ * only ever has a local effect -- there are no blocks that you
+ * can wipe out that will cause large portions of
+ * uncorrupted data blocks to be useless.
+ */
+
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+#include "whack.h"
+
+#define ROUNDUP(x,n) (((x)+(n)-1)&~((n)-1))
+
+#pragma varargck type "z" uvlong
+#pragma varargck type "z" vlong
+#pragma varargck type "t" uint
+
+enum
+{
+ K = 1024,
+ M = 1024*1024,
+ G = 1024*1024*1024,
+
+ Block = 4096,
+};
+
+int debugsha1;
+
+int verbose;
+Part *part;
+char *file;
+char *basename;
+char *dumpbase;
+int fix;
+int badreads;
+int unseal;
+uchar zero[MaxDiskBlock];
+
+Arena lastarena;
+ArenaPart ap;
+uvlong arenasize;
+int nbadread;
+int nbad;
+uvlong partend;
+void checkarena(vlong, int);
+
+/*
+ * Print the fixarenas command-line synopsis on fd 2 and exit all threads.
+ */
+void
+usage(void)
+{
+ fprint(2, "usage: fixarenas [-fv] [-a arenasize] [-b blocksize] file [ranges]\n");
+ threadexitsall(0);
+}
+
+/*
+ * Format number in simplest way that is okay with unittoull.
+ *
+ * Print verb taking a vlong: the value is shown with the largest of the
+ * suffixes G, M, K that divides it exactly, or as a plain decimal number.
+ */
+static int
+zfmt(Fmt *fmt)
+{
+	vlong val;
+
+	val = va_arg(fmt->args, vlong);
+	if(val == 0)
+		return fmtstrcpy(fmt, "0");
+	else if(val%G == 0)
+		return fmtprint(fmt, "%lldG", val/G);
+	else if(val%M == 0)
+		return fmtprint(fmt, "%lldM", val/M);
+	else if(val%K == 0)
+		return fmtprint(fmt, "%lldK", val/K);
+	else
+		return fmtprint(fmt, "%lld", val);
+}
+
+/*
+ * Format time like ctime without newline.
+ *
+ * Print verb taking a uint time value.
+ */
+static int
+tfmt(Fmt *fmt)
+{
+ uint t;
+ char buf[30];
+
+ t = va_arg(fmt->args, uint);
+ strcpy(buf, ctime(t));
+ /* cut the string at index 28; assumes ctime's newline sits there and the result fits buf -- TODO confirm */
+ buf[28] = 0;
+ return fmtstrcpy(fmt, buf);
+}
+
+/*
+ * Coalesce messages about unreadable sectors into larger ranges.
+ * bad(nil, 0, 0) flushes the buffer (that is how readdisk calls it);
+ * an offset of -1 discards the pending range without printing it.
+ *
+ * A nil msg means "same message as last time". A pending range is printed
+ * when the new range does not start where the pending one ends or when
+ * the message text changes; otherwise the pending range is extended.
+ */
+static void
+bad(char *msg, vlong o, int len)
+{
+ static vlong lb0, lb1; /* pending range is [lb0, lb1) */
+ static char *lmsg; /* message that goes with the pending range */
+
+ if(msg == nil)
+ msg = lmsg;
+ if(o == -1){
+ lmsg = nil;
+ lb0 = 0;
+ lb1 = 0;
+ return;
+ }
+ if(lb1 != o || (msg && lmsg && strcmp(msg, lmsg) != 0)){
+ if(lb0 != lb1)
+ print("%s %#llux+%#llux (%,lld+%,lld)\n",
+ lmsg, lb0, lb1-lb0, lb0, lb1-lb0);
+ lb0 = o;
+ }
+ lmsg = msg;
+ lb1 = o+len;
+}
+
+/*
+ * Read in the len bytes of data at the offset. If can't for whatever reason,
+ * fill it with garbage but print an error.
+ *
+ * buf must have room for len bytes; it is also the return value.
+ * Bytes at or beyond partend are filled with 0xFB. Bytes that cannot be
+ * read from disk are left as 0xFD, and each failing 512-byte piece is
+ * reported through bad() and counted in badreads.
+ *
+ * The 0xFB fills use len: buf is a pointer parameter here, so sizeof buf
+ * would only cover pointer-size bytes and leave the rest uninitialized.
+ */
+static uchar*
+readdisk(uchar *buf, vlong offset, int len)
+{
+	int i, j, k, n;
+
+	if(offset >= partend){
+		memset(buf, 0xFB, len);
+		return buf;
+	}
+
+	if(offset+len > partend){
+		/* pre-fill everything; the readable prefix is overwritten below */
+		memset(buf, 0xFB, len);
+		len = partend - offset;
+	}
+
+	if(readpart(part, offset, buf, len) >= 0)
+		return buf;
+
+	/*
+	 * The read failed. Clear the buffer to nonsense, and
+	 * then try reading in smaller pieces. If that fails,
+	 * read in even smaller pieces. And so on down to sectors.
+	 */
+	memset(buf, 0xFD, len);
+	for(i=0; i<len; i+=64*K){
+		n = 64*K;
+		if(i+n > len)
+			n = len-i;
+		if(readpart(part, offset+i, buf+i, n) >= 0)
+			continue;
+		for(j=i; j<len && j<i+64*K; j+=4*K){
+			n = 4*K;
+			if(j+n > len)
+				n = len-j;
+			if(readpart(part, offset+j, buf+j, n) >= 0)
+				continue;
+			for(k=j; k<len && k<j+4*K; k+=512){
+				if(readpart(part, offset+k, buf+k, 512) >= 0)
+					continue;
+				/* NOTE(review): k is relative to this read, not a disk offset -- confirm whether offset+k was intended */
+				bad("disk read failed at", k, 512);
+				badreads++;
+			}
+		}
+	}
+	bad(nil, 0, 0);
+	return buf;
+}
+
+/*
+ * Buffer to support running SHA1 hash of the disk.
+ */
+typedef struct Shabuf Shabuf;
+struct Shabuf
+{
+ int fd; /* debug output file for the hashed bytes; in use only when > 0 */
+ vlong offset; /* disk offset of the next byte to be hashed */
+ DigestState state; /* running SHA1 state */
+ int rollback; /* nonzero: keep checkpoints in hist so sbrollback works */
+ vlong r0; /* disk offset that checkpoints and the debug file are relative to */
+ DigestState *hist; /* saved states, one per 4M boundary past r0 */
+ int nhist; /* number of entries allocated in hist */
+};
+
+/*
+ * Direct a copy of the bytes hashed through sb into a newly created file.
+ * Any previously opened debug file is closed first. If the file cannot be
+ * created, sb is left without a debug file.
+ */
+void
+sbdebug(Shabuf *sb, char *file)
+{
+ int fd;
+
+ if(sb->fd > 0){
+ close(sb->fd);
+ sb->fd = 0;
+ }
+ if((fd = create(file, OWRITE, 0666)) < 0)
+ return;
+ /* fd 0 would look like "no file" to the sb->fd > 0 tests, so move it elsewhere */
+ if(fd == 0){
+ fd = dup(fd, -1);
+ close(0);
+ }
+ sb->fd = fd;
+}
+
+/*
+ * Feed the len bytes at p, which hold the disk contents starting at offset,
+ * into the running hash of sb.
+ * Only bytes from sb->offset onward are hashed; if sb->offset does not fall
+ * inside [offset, offset+len) the call is ignored. When a debug file is
+ * open the hashed bytes are also written to it. In rollback mode the hash
+ * state is saved at every 4M boundary (relative to r0) for sbrollback.
+ */
+void
+sbupdate(Shabuf *sb, uchar *p, vlong offset, int len)
+{
+ int n, x;
+ vlong o;
+
+ /* first use in rollback mode: start the checkpoint array */
+ if(sb->rollback && !sb->hist){
+ sb->r0 = offset;
+ sb->nhist = 1;
+ sb->hist = vtmalloc(sb->nhist*sizeof *sb->hist);
+ memset(sb->hist, 0, sizeof sb->hist[0]);
+ }
+ if(sb->r0 == 0)
+ sb->r0 = offset;
+
+ if(sb->offset < offset || sb->offset >= offset+len){
+ if(0) print("sbupdate %p %#llux+%d but offset=%#llux\n",
+ p, offset, len, sb->offset);
+ return;
+ }
+ /* skip the leading bytes that have already been hashed */
+ x = sb->offset - offset;
+ if(0) print("sbupdate %p %#llux+%d skip %d\n",
+ sb, offset, len, x);
+ if(x){
+ p += x;
+ offset += x;
+ len -= x;
+ }
+ assert(sb->offset == offset);
+
+ if(sb->fd > 0)
+ pwrite(sb->fd, p, len, offset - sb->r0);
+
+ if(!sb->rollback){
+ sha1(p, len, nil, &sb->state);
+ sb->offset += len;
+ return;
+ }
+
+ /* save state every 4M so we can roll back quickly */
+ o = offset - sb->r0;
+ while(len > 0){
+ /* hash up to the next 4M boundary, or to the end of the data */
+ n = 4*M - o%(4*M);
+ if(n > len)
+ n = len;
+ sha1(p, n, nil, &sb->state);
+ sb->offset += n;
+ o += n;
+ p += n;
+ len -= n;
+ if(o%(4*M) == 0){
+ x = o/(4*M);
+ if(x >= sb->nhist){
+ if(x != sb->nhist)
+ print("oops! x=%d nhist=%d\n", x, sb->nhist);
+ sb->nhist += 32;
+ sb->hist = vtrealloc(sb->hist, sb->nhist*sizeof *sb->hist);
+ }
+ sb->hist[x] = sb->state;
+ }
+ }
+}
+
+/*
+ * Advance the running hash of sb up to disk offset eoffset by reading the
+ * disk (through readdisk) in pieces of at most 4M and hashing each piece.
+ */
+void
+sbdiskhash(Shabuf *sb, vlong eoffset)
+{
+ static uchar dbuf[4*M];
+ int n;
+
+ while(sb->offset < eoffset){
+ n = sizeof dbuf;
+ if(sb->offset+n > eoffset)
+ n = eoffset - sb->offset;
+ readdisk(dbuf, sb->offset, n);
+ sbupdate(sb, dbuf, sb->offset, n);
+ }
+}
+
+/*
+ * Rewind the running hash of sb to the last saved checkpoint at or before
+ * disk offset offset, so hashing can resume from there.
+ * Prints "cannot rollback sha" and leaves sb unchanged if sb is not in
+ * rollback mode, has no base offset yet, or has no checkpoint for that
+ * position. Does nothing if offset is not behind the current position.
+ * If a debug file is open it is truncated to match.
+ *
+ * The checkpoint offset is computed in vlong arithmetic: x*4*M evaluated
+ * as int overflows once x reaches 512 (2G past r0).
+ */
+void
+sbrollback(Shabuf *sb, vlong offset)
+{
+	int x;
+	vlong o;
+	Dir d;
+
+	if(!sb->rollback || !sb->r0){
+		print("cannot rollback sha\n");
+		return;
+	}
+	if(offset >= sb->offset)
+		return;
+	o = offset - sb->r0;
+	x = o/(4*M);	/* index of the checkpoint at or before offset */
+	if(x >= sb->nhist){
+		print("cannot rollback sha\n");
+		return;
+	}
+	sb->state = sb->hist[x];
+	sb->offset = sb->r0 + (vlong)x*4*M;
+	assert(sb->offset <= offset);
+
+	if(sb->fd > 0){
+		/* shorten the debug file to the bytes that are still hashed */
+		nulldir(&d);
+		d.length = sb->offset - sb->r0;
+		dirfwstat(sb->fd, &d);
+	}
+}
+
+/*
+ * Finish the running hash of sb and store the digest in score.
+ * Any rollback checkpoints are freed first.
+ */
+void
+sbscore(Shabuf *sb, uchar *score)
+{
+ if(sb->hist){
+ free(sb->hist);
+ sb->hist = nil;
+ }
+ sha1(nil, 0, score, &sb->state);
+}
+
+/*
+ * If we're fixing arenas, then editing this memory edits the disk!
+ * It will be written back out as new data is paged in.
+ */
+uchar buf[4*M]; /* current window onto the disk */
+uchar sbuf[4*M]; /* copy of buf as read, so pageout can detect edits */
+vlong bufoffset; /* disk offset of buf[0] */
+int buflen; /* valid bytes in buf; 0 means nothing to write back */
+
+static void pageout(void);
+/*
+ * Make buf hold the len bytes of disk starting at offset and return buf.
+ * The previous window is written back first via pageout. The part of the
+ * request beyond partend reads as 0xFB; a request entirely beyond partend
+ * leaves no window to write back.
+ */
+static uchar*
+pagein(vlong offset, int len)
+{
+ pageout();
+ if(offset >= partend){
+ memset(buf, 0xFB, sizeof buf);
+ return buf;
+ }
+
+ if(offset+len > partend){
+ memset(buf, 0xFB, sizeof buf);
+ len = partend - offset;
+ }
+ bufoffset = offset;
+ buflen = len;
+ readdisk(buf, offset, len);
+ memmove(sbuf, buf, len);
+ return buf;
+}
+
+/*
+ * Write the current window back to disk if fix is set and buf differs
+ * from the copy saved when it was read. Either way the window is then
+ * marked empty. A failed write is reported but not otherwise handled.
+ */
+static void
+pageout(void)
+{
+ if(buflen==0 || !fix || memcmp(buf, sbuf, buflen) == 0){
+ buflen = 0;
+ return;
+ }
+ if(writepart(part, bufoffset, buf, buflen) < 0)
+ print("disk write failed at %#llux+%#ux (%,lld+%,d)\n",
+ bufoffset, buflen, bufoffset, buflen);
+ buflen = 0;
+}
+
+/*
+ * Zero the len bytes of disk starting at offset by paging the range
+ * through buf in windows aligned to MinBlock and at most MaxBlock long,
+ * then page the previously held window back in.
+ * The edits reach the disk only through pageout, i.e. only when fix is set.
+ */
+static void
+zerorange(vlong offset, int len)
+{
+ int i;
+ vlong ooff;
+ int olen;
+ enum { MinBlock = 4*K, MaxBlock = 8*K };
+
+ /* disabled shortcut: zero in place when the range lies inside the current window */
+ if(0)
+ if(bufoffset <= offset && offset+len <= bufoffset+buflen){
+ memset(buf+(offset-bufoffset), 0, len);
+ return;
+ }
+
+ /* remember the caller's window so it can be restored at the end */
+ ooff = bufoffset;
+ olen = buflen;
+
+ i = offset%MinBlock;
+ if(i+len < MaxBlock){
+ pagein(offset-i, (len+MinBlock-1)&~(MinBlock-1));
+ memset(buf+i, 0, len);
+ }else{
+ /* first window: from the aligned start up to MaxBlock */
+ pagein(offset-i, MaxBlock);
+ memset(buf+i, 0, MaxBlock-i);
+ offset += MaxBlock-i;
+ len -= MaxBlock-i;
+ while(len >= MaxBlock){
+ pagein(offset, MaxBlock);
+ memset(buf, 0, MaxBlock);
+ offset += MaxBlock;
+ len -= MaxBlock;
+ }
+ /* whatever is left, rounded up to MinBlock for the read */
+ pagein(offset, (len+MinBlock-1)&~(MinBlock-1));
+ memset(buf, 0, len);
+ }
+ pagein(ooff, olen);
+}
+
+/*
+ * read/write integers
+ *
+static void
+p16(uchar *p, u16int u)
+{
+ p[0] = (u>>8) & 0xFF;
+ p[1] = u & 0xFF;
+}
+*/
+
+/*
+ * Read a 16-bit big-endian integer from p.
+ */
+static u16int
+u16(uchar *p)
+{
+ return (p[0]<<8)|p[1];
+}
+
+/*
+ * Store u at p as a 32-bit big-endian integer.
+ */
+static void
+p32(uchar *p, u32int u)
+{
+ p[0] = (u>>24) & 0xFF;
+ p[1] = (u>>16) & 0xFF;
+ p[2] = (u>>8) & 0xFF;
+ p[3] = u & 0xFF;
+}
+
+/*
+ * Read a 32-bit big-endian integer from p.
+ * Each byte is widened to u32int before shifting so that a top byte of
+ * 0x80 or more is not shifted into the sign bit of a promoted int.
+ */
+static u32int
+u32(uchar *p)
+{
+	return ((u32int)p[0]<<24)|((u32int)p[1]<<16)|((u32int)p[2]<<8)|(u32int)p[3];
+}
+
+/*
+static void
+p64(uchar *p, u64int u)
+{
+ p32(p, u>>32);
+ p32(p, u);
+}
+*/
+
+/*
+ * Read a 64-bit big-endian integer from p, as two 32-bit halves.
+ */
+static u64int
+u64(uchar *p)
+{
+ return ((u64int)u32(p)<<32) | u32(p+4);
+}
+
+/*
+ * qsort comparison function for arrays of vlong, ascending.
+ * Returns -1, 0 or 1. The "greater" test used to read b > a, which repeats
+ * the "less" test, so 1 was never returned and larger values compared equal.
+ */
+static int
+vlongcmp(const void *va, const void *vb)
+{
+	vlong a, b;
+
+	a = *(vlong*)va;
+	b = *(vlong*)vb;
+	if(a < b)
+		return -1;
+	if(a > b)
+		return 1;
+	return 0;
+}
+
+/* D and S are in draw.h */
+#define D VD
+#define S VS
+
+/*
+ * Display flags or'ed into Info.len; the low 16 bits (mask N) hold the
+ * field's length in bytes. showdiffs selects the print format from the
+ * combined value: no flag prints hex (4 bytes) or a small decimal (1 byte),
+ * D prints decimal, Z prints with the %z verb, S prints as a string,
+ * and T prints with the %t verb.
+ */
+enum
+{
+ D = 0x10000,
+ Z = 0x20000,
+ S = 0x30000,
+ T = 0x40000,
+ N = 0xFFFF
+};
+/*
+ * One field of an on-disk header or trailer: its length plus display flag,
+ * and its name. The tables of these end with an entry whose len is 0.
+ */
+typedef struct Info Info;
+struct Info
+{
+ int len;
+ char *name;
+};
+
+Info partinfo[] = {
+ 4, "magic",
+ D|4, "version",
+ Z|4, "blocksize",
+ 4, "arenabase",
+ 0
+};
+
+Info headinfo4[] = {
+ 4, "magic",
+ D|4, "version",
+ S|ANameSize, "name",
+ Z|4, "blocksize",
+ Z|8, "size",
+ 0
+};
+
+Info headinfo5[] = {
+ 4, "magic",
+ D|4, "version",
+ S|ANameSize, "name",
+ Z|4, "blocksize",
+ Z|8, "size",
+ 4, "clumpmagic",
+ 0
+};
+
+/* Arena tail field list used by checkarena when arena.version is ArenaVersion4. */
+Info tailinfo4[] = {
+ 4, "magic",
+ D|4, "version",
+ S|ANameSize, "name",
+ D|4, "clumps",
+ D|4, "cclumps",
+ T|4, "ctime",
+ T|4, "wtime",
+ D|8, "used",
+ D|8, "uncsize",
+ 1, "sealed",
+ 0
+};
+
+/*
+ * Version 4 arena tail followed by the "mem stats" extension.
+ * checkarena selects this table instead of tailinfo4 when the
+ * byte just past the basic tail (p[ArenaSize4]) is 1.
+ */
+Info tailinfo4a[] = {
+ /* tailinfo 4 */
+ 4, "magic",
+ D|4, "version",
+ S|ANameSize, "name",
+ D|4, "clumps",
+ D|4, "cclumps",
+ T|4, "ctime",
+ T|4, "wtime",
+ D|8, "used",
+ D|8, "uncsize",
+ 1, "sealed",
+
+ /* mem stats */
+ 1, "extension",
+ D|4, "mem.clumps",
+ D|4, "mem.cclumps",
+ D|8, "mem.used",
+ D|8, "mem.uncsize",
+ 1, "mem.sealed",
+ 0
+};
+
+/*
+ * Arena tail field list used by checkarena for versions other than
+ * ArenaVersion4; differs from tailinfo4 by the clumpmagic after wtime.
+ */
+Info tailinfo5[] = {
+ 4, "magic",
+ D|4, "version",
+ S|ANameSize, "name",
+ D|4, "clumps",
+ D|4, "cclumps",
+ T|4, "ctime",
+ T|4, "wtime",
+ 4, "clumpmagic",
+ D|8, "used",
+ D|8, "uncsize",
+ 1, "sealed",
+ 0
+};
+
+/*
+ * Version 5 arena tail followed by the "mem stats" extension.
+ * checkarena selects this table instead of tailinfo5 when the
+ * byte just past the basic tail (p[ArenaSize5]) is 1.
+ */
+Info tailinfo5a[] = {
+ /* tailinfo 5 */
+ 4, "magic",
+ D|4, "version",
+ S|ANameSize, "name",
+ D|4, "clumps",
+ D|4, "cclumps",
+ T|4, "ctime",
+ T|4, "wtime",
+ 4, "clumpmagic",
+ D|8, "used",
+ D|8, "uncsize",
+ 1, "sealed",
+
+ /* mem stats */
+ 1, "extension",
+ D|4, "mem.clumps",
+ D|4, "mem.cclumps",
+ D|8, "mem.used",
+ D|8, "mem.uncsize",
+ 1, "mem.sealed",
+ 0
+};
+
+/*
+ * Print, field by field, where the on-disk bytes (have) differ
+ * from the reconstructed bytes (want). Both buffers are len bytes
+ * long and are walked in step with the info table, which ends at
+ * the first entry whose length is 0. Any bytes left after the
+ * table is exhausted are compared as a single trailing region.
+ */
+void
+showdiffs(uchar *want, uchar *have, int len, Info *info)
+{
+ int n;
+
+ while(len > 0 && (n=info->len&N) > 0){
+ if(memcmp(have, want, n) != 0){
+ /* the full len value (size plus format tag) selects the output format */
+ switch(info->len){
+ case 1:
+ print("\t%s: correct=%d disk=%d\n",
+ info->name, *want, *have);
+ break;
+ case 4:
+ print("\t%s: correct=%#ux disk=%#ux\n",
+ info->name, u32(want), u32(have));
+ break;
+ case D|4:
+ print("\t%s: correct=%,ud disk=%,ud\n",
+ info->name, u32(want), u32(have));
+ break;
+ case T|4:
+ print("\t%s: correct=%t\n\t\tdisk=%t\n",
+ info->name, u32(want), u32(have));
+ break;
+ case Z|4:
+ print("\t%s: correct=%z disk=%z\n",
+ info->name, (uvlong)u32(want), (uvlong)u32(have));
+ break;
+ case D|8:
+ print("\t%s: correct=%,lld disk=%,lld\n",
+ info->name, u64(want), u64(have));
+ break;
+ case Z|8:
+ print("\t%s: correct=%z disk=%z\n",
+ info->name, u64(want), u64(have));
+ break;
+ case S|ANameSize:
+ /* the disk copy may lack a terminator, so its printed length is bounded */
+ print("\t%s: correct=%s disk=%.*s\n",
+ info->name, (char*)want,
+ utfnlen((char*)have, ANameSize-1),
+ (char*)have);
+ break;
+ default:
+ print("\t%s: correct=%.*H disk=%.*H\n",
+ info->name, n, want, n, have);
+ break;
+ }
+ }
+ have += n;
+ want += n;
+ len -= n;
+ info++;
+ }
+ /* whatever follows the described fields is expected to be zero in want */
+ if(len > 0 && memcmp(have, want, len) != 0){
+ if(memcmp(want, zero, len) != 0)
+ print("!!\textra want data in showdiffs (bug in fixarenas)\n");
+ else
+ print("\tnon-zero data on disk after structure\n");
+ if(verbose > 1){
+ print("want: %.*H\n", len, want);
+ print("have: %.*H\n", len, have);
+ }
+ }
+}
+
+/*
+ * Report whether the partition itself starts with an arena,
+ * i.e. whether the first four bytes are the arena head magic.
+ */
+int
+isonearena(void)
+{
+	uchar *p;
+
+	p = pagein(0, Block);
+	if(u32(p) == ArenaHeadMagic)
+		return 1;
+	return 0;
+}
+
+/* Candidate arena table sizes tried by guessgeometry when no arena base was deduced. */
+static int tabsizes[] = { 16*1024, 64*1024, 512*1024, 768*1024, };
+/*
+ * Poke around on the disk to guess what the ArenaPart numbers are.
+ *
+ * Fills in ap.version, ap.blocksize, ap.arenabase, ap.tabbase and
+ * ap.tabsize, and the global arenasize. Values of arenasize and
+ * ap.blocksize that are already non-zero on entry take precedence
+ * over the guesses; if both are set the disk scan is skipped.
+ * Calls sysfatal if too few arenas are found or the block size
+ * is unusable.
+ */
+void
+guessgeometry(void)
+{
+ int i, j, n, bestn, ndiff, nhead, ntail;
+ uchar *p, *ep, *sp;
+ u64int diff[100], head[20], tail[20];
+ u64int offset, bestdiff;
+
+ ap.version = ArenaPartVersion;
+
+ if(arenasize == 0 || ap.blocksize == 0){
+ /*
+ * The ArenaPart block at offset PartBlank may be corrupt or just wrong.
+ * Instead, look for the individual arena headers and tails, which there
+ * are many of, and once we've seen enough, infer the spacing.
+ *
+ * Of course, nothing in the file format requires that arenas be evenly
+ * spaced, but fmtarenas always does that for us.
+ */
+ nhead = 0;
+ ntail = 0;
+ /* scan in 4*M windows, testing every K-aligned offset for a magic number */
+ for(offset=PartBlank; offset<partend; offset+=4*M){
+ p = pagein(offset, 4*M);
+ for(sp=p, ep=p+4*M; p<ep; p+=K){
+ if(u32(p) == ArenaHeadMagic && nhead < nelem(head)){
+ if(verbose)
+ print("arena head at %#llx\n", offset+(p-sp));
+ head[nhead++] = offset+(p-sp);
+ }
+ if(u32(p) == ArenaMagic && ntail < nelem(tail)){
+ tail[ntail++] = offset+(p-sp);
+ if(verbose)
+ print("arena tail at %#llx\n", offset+(p-sp));
+ }
+ }
+ if(nhead == nelem(head) && ntail == nelem(tail))
+ break;
+ }
+ /*
+ * NOTE(review): this only fails when both counts are below 3;
+ * the pairing step below applies its own minimum.
+ */
+ if(nhead < 3 && ntail < 3)
+ sysfatal("too few intact arenas: %d heads, %d tails", nhead, ntail);
+
+ /*
+ * Arena size is likely the most common
+ * inter-head or inter-tail spacing.
+ */
+ ndiff = 0;
+ for(i=1; i<nhead; i++)
+ diff[ndiff++] = head[i] - head[i-1];
+ for(i=1; i<ntail; i++)
+ diff[ndiff++] = tail[i] - tail[i-1];
+ qsort(diff, ndiff, sizeof diff[0], vlongcmp);
+ /* find the longest run of equal values in the sorted array */
+ bestn = 0;
+ bestdiff = 0;
+ for(i=1, n=1; i<=ndiff; i++, n++){
+ if(i==ndiff || diff[i] != diff[i-1]){
+ if(n > bestn){
+ bestn = n;
+ bestdiff = diff[i-1];
+ }
+ n = 0;
+ }
+ }
+ print("arena size likely %z (%d of %d)\n", bestdiff, bestn, ndiff);
+ if(arenasize != 0 && arenasize != bestdiff)
+ print("using user-specified size %z instead\n", arenasize);
+ else
+ arenasize = bestdiff;
+
+ /*
+ * The arena tail for an arena is arenasize-blocksize from the head.
+ */
+ ndiff = 0;
+ /* merge-walk heads and tails, pairing each tail with the head it follows */
+ for(i=j=0; i<nhead && j<ntail; ){
+ if(tail[j] < head[i]){
+ j++;
+ continue;
+ }
+ if(tail[j] < head[i]+arenasize){
+ diff[ndiff++] = head[i]+arenasize - tail[j];
+ j++;
+ continue;
+ }
+ i++;
+ }
+ if(ndiff < 3)
+ sysfatal("too few intact arenas: %d head, tail pairs", ndiff);
+ qsort(diff, ndiff, sizeof diff[0], vlongcmp);
+ bestn = 0;
+ bestdiff = 0;
+ for(i=1, n=1; i<=ndiff; i++, n++){
+ if(i==ndiff || diff[i] != diff[i-1]){
+ if(n > bestn){
+ bestn = n;
+ bestdiff = diff[i-1];
+ }
+ n = 0;
+ }
+ }
+ print("block size likely %z (%d of %d)\n", bestdiff, bestn, ndiff);
+ if(ap.blocksize != 0 && ap.blocksize != bestdiff)
+ print("using user-specified size %z instead\n", (vlong)ap.blocksize);
+ else
+ ap.blocksize = bestdiff;
+ if(ap.blocksize == 0 || ap.blocksize&(ap.blocksize-1))
+ sysfatal("block size not a power of two");
+ if(ap.blocksize > MaxDiskBlock)
+ sysfatal("block size too big (max=%d)", MaxDiskBlock);
+
+ /*
+ * Use head/tail information to deduce arena base.
+ */
+ ndiff = 0;
+ for(i=0; i<nhead; i++)
+ diff[ndiff++] = head[i]%arenasize;
+ for(i=0; i<ntail; i++)
+ diff[ndiff++] = (tail[i]+ap.blocksize)%arenasize;
+ qsort(diff, ndiff, sizeof diff[0], vlongcmp);
+ bestn = 0;
+ bestdiff = 0;
+ for(i=1, n=1; i<=ndiff; i++, n++){
+ if(i==ndiff || diff[i] != diff[i-1]){
+ if(n > bestn){
+ bestn = n;
+ bestdiff = diff[i-1];
+ }
+ n = 0;
+ }
+ }
+ ap.arenabase = bestdiff;
+ }
+
+ ap.tabbase = ROUNDUP(PartBlank+HeadSize, ap.blocksize);
+ /*
+ * XXX pick up table, check arenabase.
+ * XXX pick up table, record base name.
+ */
+
+ /*
+ * Somewhat standard computation.
+ * Fmtarenas used to use 64k tab, now uses 512k tab.
+ */
+ if(ap.arenabase == 0){
+ print("trying standard arena bases...\n");
+ /* if no candidate matches, arenabase is left at the last one tried */
+ for(i=0; i<nelem(tabsizes); i++){
+ ap.arenabase = ROUNDUP(PartBlank+HeadSize+tabsizes[i], ap.blocksize);
+ p = pagein(ap.arenabase, Block);
+ if(u32(p) == ArenaHeadMagic)
+ break;
+ }
+ }
+ p = pagein(ap.arenabase, Block);
+ print("arena base likely %z%s\n", (vlong)ap.arenabase,
+ u32(p)!=ArenaHeadMagic ? " (but no arena head there)" : "");
+
+ ap.tabsize = ap.arenabase - ap.tabbase;
+}
+
+/*
+ * Check the arena partition blocks and then the arenas listed in range.
+ *
+ * range is nil (check every arena), the string "none" (check no
+ * arenas), or a comma-separated list of arena numbers and lo-hi
+ * ranges with either end optional. The string is modified in
+ * place while it is parsed.
+ */
+void
+checkarenas(char *range)
+{
+ char *s, *t;
+ int i, lo, hi, narena;
+ uchar dbuf[HeadSize];
+ uchar *p;
+
+ guessgeometry();
+
+ /* ignore any partial block at the end of the partition */
+ partend -= partend%ap.blocksize;
+
+ /* compare the on-disk partition block with one rebuilt from ap */
+ memset(dbuf, 0, sizeof dbuf);
+ packarenapart(&ap, dbuf);
+ p = pagein(PartBlank, Block);
+ if(memcmp(p, dbuf, HeadSize) != 0){
+ print("on-disk arena part superblock incorrect\n");
+ showdiffs(dbuf, p, HeadSize, partinfo);
+ }
+ memmove(p, dbuf, HeadSize);
+
+ narena = (partend-ap.arenabase + arenasize-1)/arenasize;
+ if(range == nil){
+ for(i=0; i<narena; i++)
+ checkarena(ap.arenabase+(vlong)i*arenasize, i);
+ }else if(strcmp(range, "none") == 0){
+ /* nothing */
+ }else{
+ /* parse, e.g., -4,8-9,10- */
+ for(s=range; *s; s=t){
+ /* cut off the current element; t is left at the start of the next */
+ t = strchr(s, ',');
+ if(t)
+ *t++ = 0;
+ else
+ t = s+strlen(s);
+ if(*s == '-')
+ lo = 0;
+ else
+ lo = strtol(s, &s, 0);
+ hi = lo;
+ if(*s == '-'){
+ s++;
+ if(*s == 0)
+ hi = narena-1;
+ else
+ hi = strtol(s, &s, 0);
+ }
+ if(*s != 0){
+ print("bad arena range: %s\n", s);
+ continue;
+ }
+ /* lo and hi are not clamped to narena here; checkarena rejects offsets >= partend */
+ for(i=lo; i<=hi; i++)
+ checkarena(ap.arenabase+(vlong)i*arenasize, i);
+ }
+ }
+}
+
+/*
+ * Is there a clump here at p?
+ *
+ * Decodes a clump header from p into cl and verifies the data that
+ * follows by recomputing its score (after decompressing it when the
+ * encoding is ClumpECompress). On success stores the header's magic
+ * number in *pmagic and returns the number of bytes occupied by the
+ * header plus data; returns 0 if the bytes do not form a valid clump.
+ * cl may be partially filled in even when 0 is returned.
+ */
+static int
+isclump(uchar *p, Clump *cl, u32int *pmagic)
+{
+ int n;
+ u32int magic;
+ uchar score[VtScoreSize], *bp;
+ Unwhack uw;
+ uchar ubuf[70*1024];
+
+ bp = p;
+ magic = u32(p);
+ if(magic == 0)
+ return 0;
+ p += U32Size;
+
+ cl->info.type = vtfromdisktype(*p);
+ if(cl->info.type == 0xFF)
+ return 0;
+ p++;
+ cl->info.size = u16(p);
+ p += U16Size;
+ cl->info.uncsize = u16(p);
+ if(cl->info.size > cl->info.uncsize)
+ return 0;
+ p += U16Size;
+ scorecp(cl->info.score, p);
+ p += VtScoreSize;
+ cl->encoding = *p;
+ p++;
+ cl->creator = u32(p);
+ p += U32Size;
+ cl->time = u32(p);
+ p += U32Size;
+
+ /* p now points at the clump data; check it against the recorded score */
+ switch(cl->encoding){
+ case ClumpENone:
+ if(cl->info.size != cl->info.uncsize)
+ return 0;
+ scoremem(score, p, cl->info.size);
+ if(scorecmp(score, cl->info.score) != 0)
+ return 0;
+ break;
+ case ClumpECompress:
+ if(cl->info.size >= cl->info.uncsize)
+ return 0;
+ unwhackinit(&uw);
+ n = unwhack(&uw, ubuf, cl->info.uncsize, p, cl->info.size);
+ if(n != cl->info.uncsize)
+ return 0;
+ scoremem(score, ubuf, cl->info.uncsize);
+ if(scorecmp(score, cl->info.score) != 0)
+ return 0;
+ break;
+ default:
+ return 0;
+ }
+ p += cl->info.size;
+
+ /* it all worked out in the end */
+ *pmagic = magic;
+ return p - bp;
+}
+
+/*
+ * All ClumpInfos seen in this arena.
+ * Kept in binary tree so we can look up by score.
+ *
+ * Entries live in the growable array cibuf in the order they were
+ * added; left and right are indices into cibuf, with -1 meaning
+ * no child.
+ */
+typedef struct Cit Cit;
+struct Cit
+{
+ int left; /* index of subtree reached when this score compares greater than the key, or -1 */
+ int right; /* index of subtree reached when this score compares less than the key, or -1 */
+ vlong corrupt; /* non-zero: byte length of a corrupt region (see addcicorrupt); such entries are not in the tree */
+ ClumpInfo ci;
+};
+Cit *cibuf; /* the entries */
+int ciroot; /* index of the tree root in cibuf, or -1 when empty */
+int ncibuf, mcibuf; /* entries in use, entries allocated */
+
+/*
+ * Forget every recorded clump info: empty tree, no entries in use.
+ * The cibuf allocation itself is kept for reuse.
+ */
+void
+resetcibuf(void)
+{
+	ciroot = -1;
+	ncibuf = 0;
+}
+
+/*
+ * Walk the score tree starting from the link *p.
+ * Returns a pointer to the link that refers to the entry with
+ * the given score, or to the empty (-1) link where such an
+ * entry would be attached.
+ */
+int*
+ltreewalk(int *p, uchar *score)
+{
+	int c;
+	Cit *t;
+
+	while(*p != -1){
+		t = &cibuf[*p];
+		c = scorecmp(t->ci.score, score);
+		if(c == 0)
+			break;
+		p = c < 0 ? &t->right : &t->left;
+	}
+	return p;
+}
+
+/*
+ * Append ci to cibuf, growing the array as needed.
+ * A non-zero corrupt marks the entry as a placeholder for that
+ * many bytes of corrupt data; such entries are kept in the array
+ * but are not entered in the score tree.
+ *
+ * If an entry with the same score is already in the tree, the new
+ * entry is still appended to the array but the tree is left alone:
+ * replacing the existing link would detach that node together
+ * with its whole subtree and make haveclump miss those scores.
+ */
+void
+addcibuf(ClumpInfo *ci, vlong corrupt)
+{
+	Cit *cit;
+	int *slot;
+
+	if(ncibuf == mcibuf){
+		mcibuf += 131072;
+		cibuf = vtrealloc(cibuf, mcibuf*sizeof cibuf[0]);
+	}
+	cit = &cibuf[ncibuf];
+	cit->ci = *ci;
+	cit->left = -1;
+	cit->right = -1;
+	cit->corrupt = corrupt;
+	if(!corrupt){
+		/* look up only after any realloc, since slot may point into cibuf */
+		slot = ltreewalk(&ciroot, ci->score);
+		if(*slot == -1)
+			*slot = ncibuf;
+	}
+	ncibuf++;
+}
+
+/*
+ * Record a corrupt region of len bytes as an all-zero
+ * ClumpInfo entry tagged with that length.
+ */
+void
+addcicorrupt(vlong len)
+{
+	static ClumpInfo blank;
+
+	addcibuf(&blank, len);
+}
+
+/*
+ * Return 1 if a clump with this score is in the score tree,
+ * 0 otherwise.
+ */
+int
+haveclump(uchar *score)
+{
+	int c, i;
+
+	for(i=ciroot; i != -1; ){
+		c = scorecmp(cibuf[i].ci.score, score);
+		if(c == 0)
+			return 1;
+		i = c < 0 ? cibuf[i].right : cibuf[i].left;
+	}
+	return 0;
+}
+
+/*
+ * Return 1 if the packed bytes at p (type[1] size[2] uncsize[2]
+ * score) describe the same clump as ci, 0 otherwise.
+ */
+int
+matchci(ClumpInfo *ci, uchar *p)
+{
+	return ci->type == vtfromdisktype(p[0])
+		&& ci->size == u16(p+1)
+		&& ci->uncsize == u16(p+3)
+		&& scorecmp(ci->score, p+5) == 0;
+}
+
+/*
+ * Decide whether the arena tail block at p (blocksize bytes)
+ * looks sealed: a known version, sealed byte set to 1, nothing
+ * but zeros between the tail structure and the score, and a
+ * non-zero score in the last VtScoreSize bytes.
+ * Returns 1 if so; otherwise returns 0, printing the reason
+ * except when the version is not recognized.
+ */
+int
+sealedarena(uchar *p, int blocksize)
+{
+	int v, n;
+	uchar *score;
+
+	v = u32(p+4);
+	if(v == ArenaVersion4)
+		n = ArenaSize4;
+	else if(v == ArenaVersion5)
+		n = ArenaSize5;
+	else
+		return 0;
+
+	score = p+blocksize-VtScoreSize;
+	if(p[n-1] != 1){
+		print("arena tail says not sealed\n");
+		return 0;
+	}
+	if(memcmp(p+n, zero, blocksize-VtScoreSize-n) != 0){
+		print("arena tail followed by non-zero data\n");
+		return 0;
+	}
+	if(memcmp(score, zero, VtScoreSize) == 0){
+		print("arena score zero\n");
+		return 0;
+	}
+	return 1;
+}
+
+/*
+ * Return 1 if name passes nameok and ends with the decimal
+ * digits of n, 0 otherwise. For n == 0 no suffix is required,
+ * so any name accepted by nameok is okay.
+ * NOTE(review): callers pass n == -1 for a lone arena, which
+ * requires the name to end in "-1" - confirm that is intended.
+ */
+int
+okayname(char *name, int n)
+{
+	char suffix[20];
+	int nlen, slen;
+
+	if(nameok(name) < 0)
+		return 0;
+	if(n == 0)
+		suffix[0] = 0;
+	else
+		sprint(suffix, "%d", n);
+	nlen = strlen(name);
+	slen = strlen(suffix);
+	if(nlen < slen)
+		return 0;
+	return strcmp(name+nlen-slen, suffix) == 0;
+}
+
+/*
+ * Order two ClumpInfos by type, then size, then uncsize,
+ * then score. Returns 0 only when all four agree.
+ */
+int
+clumpinfocmp(ClumpInfo *a, ClumpInfo *b)
+{
+	int d;
+
+	d = a->type - b->type;
+	if(d != 0)
+		return d;
+	d = a->size - b->size;
+	if(d != 0)
+		return d;
+	d = a->uncsize - b->uncsize;
+	if(d != 0)
+		return d;
+	return scorecmp(a->score, b->score);
+}
+
+/*
+ * Read nci clump info entries from the directory of the arena
+ * that starts at offset. The directory is read one block at a
+ * time, moving backwards from the block just before the arena's
+ * last block, with arena->blocksize/ClumpInfoSize entries taken
+ * from each block. Returns a vtmalloc'ed array of nci entries
+ * which the caller frees.
+ */
+ClumpInfo*
+loadci(vlong offset, Arena *arena, int nci)
+{
+ int i, j, per;
+ uchar *p, *sp;
+ ClumpInfo *bci, *ci;
+
+ per = arena->blocksize/ClumpInfoSize;
+ bci = vtmalloc(nci*sizeof bci[0]);
+ ci = bci;
+ offset += arena->size - arena->blocksize;
+ p = sp = nil;
+ for(i=0; i<nci; i+=per){
+ /* window used up (or first pass): page in the 4*M ending at offset */
+ if(p == sp){
+ sp = pagein(offset-4*M, 4*M);
+ p = sp+4*M;
+ }
+ p -= arena->blocksize;
+ offset -= arena->blocksize;
+ for(j=0; j<per && i+j<nci; j++)
+ unpackclumpinfo(ci++, p+j*ClumpInfoSize);
+ }
+ return bci;
+}
+
+/*
+ * Pack the nci entries of ci into the directory of the arena that
+ * starts at offset, using the same backwards block layout that
+ * loadci reads. Each block touched is zeroed before its entries
+ * are packed, and pageout is called at the end. Returns the disk
+ * offset of the lowest directory block written (the tail block's
+ * offset if nci is 0).
+ */
+vlong
+writeci(vlong offset, Arena *arena, ClumpInfo *ci, int nci)
+{
+ int i, j, per;
+ uchar *p, *sp;
+
+ per = arena->blocksize/ClumpInfoSize;
+ offset += arena->size - arena->blocksize;
+ p = sp = nil;
+ for(i=0; i<nci; i+=per){
+ /* window used up (or first pass): page in the 4*M ending at offset */
+ if(p == sp){
+ sp = pagein(offset-4*M, 4*M);
+ p = sp+4*M;
+ }
+ p -= arena->blocksize;
+ offset -= arena->blocksize;
+ memset(p, 0, arena->blocksize);
+ for(j=0; j<per && i+j<nci; j++)
+ packclumpinfo(ci++, p+j*ClumpInfoSize);
+ }
+ pageout();
+ return offset;
+}
+
+/*
+ * Fill in the basic fields of head and arena for the arena that
+ * starts at offset0: size and blocksize from the partition
+ * geometry; version, clump magic, name, and the on-disk clump
+ * count and sealed flag from whatever the existing head and tail
+ * blocks still hold. anum is the arena number (-1 for a lone
+ * arena) and is used to validate and construct names.
+ * Calls sysfatal if no name can be determined.
+ */
+void
+loadarenabasics(vlong offset0, int anum, ArenaHead *head, Arena *arena)
+{
+	char dname[ANameSize];
+	static char lastbase[ANameSize];
+	uchar *p;
+	Arena oarena;
+	ArenaHead ohead;
+
+	/*
+	 * Fmtarenas makes all arenas the same size
+	 * except the last, which may be smaller.
+	 * It uses the same block size for arenas as for
+	 * the arena partition blocks.
+	 */
+	arena->size = arenasize;
+	if(offset0+arena->size > partend)
+		arena->size = partend - offset0;
+	head->size = arena->size;
+
+	arena->blocksize = ap.blocksize;
+	head->blocksize = arena->blocksize;
+
+	/*
+	 * Look for clump magic and name in head/tail blocks.
+	 * All the other info we will reconstruct just in case.
+	 */
+	p = pagein(offset0, arena->blocksize);
+	memset(&ohead, 0, sizeof ohead);
+	if(unpackarenahead(&ohead, p) >= 0){
+		head->version = ohead.version;
+		head->clumpmagic = ohead.clumpmagic;
+		if(okayname(ohead.name, anum))
+			strcpy(head->name, ohead.name);
+	}
+
+	p = pagein(offset0+arena->size-arena->blocksize,
+		arena->blocksize);
+	memset(&oarena, 0, sizeof oarena);
+	if(unpackarena(&oarena, p) >= 0){
+		arena->version = oarena.version;
+		arena->clumpmagic = oarena.clumpmagic;
+		if(okayname(oarena.name, anum))
+			strcpy(arena->name, oarena.name);
+		arena->diskstats.clumps = oarena.diskstats.clumps;
+print("old arena: sealed=%d\n", oarena.diskstats.sealed);
+		arena->diskstats.sealed = oarena.diskstats.sealed;
+	}
+
+	/* Head trumps arena. */
+	if(head->version){
+		arena->version = head->version;
+		arena->clumpmagic = head->clumpmagic;
+	}
+	if(arena->version == 0)
+		arena->version = ArenaVersion5;
+	/*
+	 * Name, in order of preference: the user-supplied base name,
+	 * the base remembered from the previous call, the name found
+	 * in the head block, the name found in the tail block.
+	 */
+	if(basename){
+		if(anum == -1)
+			snprint(arena->name, ANameSize, "%s", basename);
+		else
+			snprint(arena->name, ANameSize, "%s%d", basename, anum);
+	}else if(lastbase[0])
+		snprint(arena->name, ANameSize, "%s%d", lastbase, anum);
+	else if(head->name[0])
+		strcpy(arena->name, head->name);
+	else if(arena->name[0] == 0)
+		sysfatal("cannot determine base name for arena; use -n");
+	/*
+	 * Remember the base for the next call by chopping as many
+	 * characters as the arena number prints in. A name shorter
+	 * than that (possible for a lone arena, whose name carries
+	 * no number) is left alone rather than indexed out of bounds.
+	 */
+	strcpy(lastbase, arena->name);
+	sprint(dname, "%d", anum);
+	if(strlen(lastbase) >= strlen(dname))
+		lastbase[strlen(lastbase)-strlen(dname)] = 0;
+
+	/* Was working in arena, now copy to head. */
+	head->version = arena->version;
+	memmove(head->name, arena->name, sizeof head->name);
+	head->blocksize = arena->blocksize;
+	head->size = arena->size;
+}
+
+/*
+ * Begin the hash in sb at offset0 with a freshly packed copy of
+ * head, zero padded to a full block, instead of the block that
+ * is currently on disk.
+ */
+void
+shahead(Shabuf *sb, vlong offset0, ArenaHead *head)
+{
+	uchar blk[MaxDiskBlock];
+
+	memset(blk, 0, sizeof blk);
+	packarenahead(head, blk);
+	sb->offset = offset0;
+	sbupdate(sb, blk, offset0, head->blocksize);
+}
+
+/*
+ * Choose a clump magic number for an arena of the given version:
+ * always _ClumpMagic for ArenaVersion4, otherwise a value from
+ * fastrand that is neither 0 nor _ClumpMagic.
+ */
+u32int
+newclumpmagic(int version)
+{
+	u32int m;
+
+	if(version == ArenaVersion4)
+		return _ClumpMagic;
+	for(;;){
+		m = fastrand();
+		if(m != 0 && m != _ClumpMagic)
+			return m;
+	}
+}
+
+/*
+ * Poke around in the arena to find the clump data
+ * and compute the relevant statistics.
+ *
+ * offset0 is the disk offset of the arena and anum its number.
+ * head and arena are cleared and then filled in with the
+ * reconstructed values. oldscore receives the hash of the data
+ * as found on disk (computed only if the tail said the arena was
+ * sealed); score receives the hash of the repaired data (computed
+ * only when the arena ends up sealed and fix is set). Both are
+ * zeroed first.
+ * NOTE(review): nbad is not declared in this function; it is
+ * presumably a file-level variable - confirm.
+ */
+void
+guessarena(vlong offset0, int anum, ArenaHead *head, Arena *arena,
+ uchar *oldscore, uchar *score)
+{
+ uchar dbuf[MaxDiskBlock];
+ int needtozero, clumps, nb1, nb2, minclumps;
+ int inbad, n, ncib, printed, sealing, smart;
+ u32int magic;
+ uchar *sp, *ep, *p;
+ vlong boffset, eoffset, lastclumpend, leaked;
+ vlong offset, toffset, totalcorrupt, v;
+ Clump cl;
+ ClumpInfo *bci, *ci, *eci, *xci;
+ Cit *bcit, *cit, *ecit;
+ Shabuf oldsha, newsha;
+
+ /*
+ * We expect to find an arena, with data, between offset
+ * and offset+arenasize. With any luck, the data starts at
+ * offset+ap.blocksize. The blocks have variable size and
+ * aren't padded at all, which doesn't give us any alignment
+ * constraints. The blocks are compressed or high entropy,
+ * but the headers are pretty low entropy (except the score):
+ *
+ * type[1] (range 0 thru 9, 13)
+ * size[2]
+ * uncsize[2] (<= size)
+ *
+ * so we can look for these. We check the scores as we go,
+ * so we can't make any wrong turns. If we find ourselves
+ * in a dead end, scan forward looking for a new start.
+ */
+
+ resetcibuf();
+ memset(head, 0, sizeof *head);
+ memset(arena, 0, sizeof *arena);
+ memset(oldscore, 0, VtScoreSize);
+ memset(score, 0, VtScoreSize);
+ memset(&oldsha, 0, sizeof oldsha);
+ memset(&newsha, 0, sizeof newsha);
+ newsha.rollback = 1;
+
+ if(0){
+ sbdebug(&oldsha, "old.sha");
+ sbdebug(&newsha, "new.sha");
+ }
+
+ loadarenabasics(offset0, anum, head, arena);
+
+ /* start the clump hunt */
+
+ clumps = 0;
+ totalcorrupt = 0;
+ sealing = 1;
+ boffset = offset0 + arena->blocksize; /* first byte of clump data */
+ offset = boffset;
+ eoffset = offset0+arena->size - arena->blocksize; /* end of data; lowered as directory blocks are accounted for */
+ toffset = eoffset; /* offset of the tail block */
+ sp = pagein(offset0, 4*M);
+
+ if(arena->diskstats.sealed){
+ oldsha.offset = offset0;
+ sbupdate(&oldsha, sp, offset0, 4*M);
+ }
+ ep = sp+4*M;
+ p = sp + (boffset - offset0);
+ ncib = arena->blocksize / ClumpInfoSize; /* ci per block in index */
+ lastclumpend = offset;
+ nbad = 0;
+ inbad = 0;
+ needtozero = 0;
+ minclumps = 0;
+ while(offset < eoffset){
+ /*
+ * Shift buffer if we're running out of room.
+ */
+ if(p+70*K >= ep){
+ /*
+ * Start the post SHA1 buffer. By now we should know the
+ * clumpmagic and arena version, so we can create a
+ * correct head block to get things going.
+ */
+ if(sealing && fix && newsha.offset == 0){
+ newsha.offset = offset0;
+ if(arena->clumpmagic == 0){
+ if(arena->version == 0)
+ arena->version = ArenaVersion5;
+ arena->clumpmagic = newclumpmagic(arena->version);
+ }
+ head->clumpmagic = arena->clumpmagic;
+ shahead(&newsha, offset0, head);
+ }
+ /* slide the window forward by n, keeping the last 256*K in view */
+ n = 4*M-256*K;
+ if(sealing && fix){
+ sbdiskhash(&newsha, bufoffset);
+ sbupdate(&newsha, buf, bufoffset, 4*M-256*K);
+ }
+ pagein(bufoffset+n, 4*M);
+ p -= n;
+ if(arena->diskstats.sealed)
+ sbupdate(&oldsha, buf, bufoffset, 4*M);
+ }
+
+ /*
+ * Check for a clump at p, which is at offset in the disk.
+ * Duplicate clumps happen in corrupted disks
+ * (the same pattern gets written many times in a row)
+ * and should never happen during regular use.
+ */
+ magic = 0;
+ if((n = isclump(p, &cl, &magic)) > 0){
+ /*
+ * If we were in the middle of some corrupted data,
+ * flush a warning about it and then add any clump
+ * info blocks as necessary.
+ */
+ if(inbad){
+ inbad = 0;
+ v = offset-lastclumpend;
+ if(needtozero){
+ zerorange(lastclumpend, v);
+ sbrollback(&newsha, lastclumpend);
+ print("corrupt clump data - %#llux+%#llux (%,llud bytes)\n",
+ lastclumpend, v, v);
+ }
+ addcicorrupt(v);
+ totalcorrupt += v;
+ /* reserve directory room for the entries that will describe the corrupt span */
+ nb1 = (minclumps+ncib-1)/ncib;
+ minclumps += (v+ClumpSize+VtMaxLumpSize-1)/(ClumpSize+VtMaxLumpSize);
+ nb2 = (minclumps+ncib-1)/ncib;
+ eoffset -= (nb2-nb1)*arena->blocksize;
+ }
+
+ if(haveclump(cl.info.score))
+ print("warning: duplicate clump %d %V at %#llux+%#d\n", cl.info.type, cl.info.score, offset, n);
+
+ /*
+ * If clumps use different magic numbers, we don't care.
+ * We'll just use the first one we find and make the others
+ * follow suit.
+ */
+ if(arena->clumpmagic == 0){
+ print("clump type %d size %d score %V magic %x\n",
+ cl.info.type, cl.info.size, cl.info.score, magic);
+ arena->clumpmagic = magic;
+ if(magic == _ClumpMagic)
+ arena->version = ArenaVersion4;
+ else
+ arena->version = ArenaVersion5;
+ }
+ if(magic != arena->clumpmagic)
+ p32(p, arena->clumpmagic);
+ if(clumps == 0)
+ arena->ctime = cl.time;
+
+ /*
+ * Record the clump, update arena stats,
+ * grow clump info blocks if needed.
+ */
+ if(verbose > 1)
+ print("\tclump %d: %d %V at %#llux+%#ux (%d)\n",
+ clumps, cl.info.type, cl.info.score, offset, n, n);
+ addcibuf(&cl.info, 0);
+ if(minclumps%ncib == 0)
+ eoffset -= arena->blocksize;
+ minclumps++;
+ clumps++;
+ if(cl.encoding != ClumpENone)
+ arena->diskstats.cclumps++;
+ arena->diskstats.uncsize += cl.info.uncsize;
+ arena->wtime = cl.time;
+
+ /*
+ * Move to next clump.
+ */
+ offset += n;
+ p += n;
+ lastclumpend = offset;
+ }else{
+ /*
+ * Overwrite malformed clump data with zeros later.
+ * For now, just record whether it needs to be overwritten.
+ * Bad regions must be of size at least ClumpSize.
+ * Postponing the overwriting keeps us from writing past
+ * the end of the arena data (which might be directory data)
+ * with zeros.
+ */
+ if(!inbad){
+ inbad = 1;
+ needtozero = 0;
+ if(memcmp(p, zero, ClumpSize) != 0)
+ needtozero = 1;
+ p += ClumpSize;
+ offset += ClumpSize;
+ nbad++;
+ }else{
+ if(*p != 0)
+ needtozero = 1;
+ p++;
+ offset++;
+ }
+ }
+ }
+ pageout();
+
+ if(verbose)
+ print("readable clumps: %d; min. directory entries: %d\n",
+ clumps, minclumps);
+ arena->diskstats.used = lastclumpend - boffset;
+ leaked = eoffset - lastclumpend;
+ if(verbose)
+ print("used from %#llux to %#llux = %,lld (%,lld unused)\n",
+ boffset, lastclumpend, arena->diskstats.used, leaked);
+
+ /*
+ * Finish the SHA1 of the old data.
+ */
+ if(arena->diskstats.sealed){
+ sbdiskhash(&oldsha, toffset);
+ readdisk(dbuf, toffset, arena->blocksize);
+ /* the tail block is hashed with its score field zeroed */
+ scorecp(dbuf+arena->blocksize-VtScoreSize, zero);
+ sbupdate(&oldsha, dbuf, toffset, arena->blocksize);
+ sbscore(&oldsha, oldscore);
+ }
+
+ /*
+ * If we still don't know the clump magic, the arena
+ * must be empty. It still needs a value, so make
+ * something up.
+ */
+ if(arena->version == 0)
+ arena->version = ArenaVersion5;
+ if(arena->clumpmagic == 0){
+ if(arena->version == ArenaVersion4)
+ arena->clumpmagic = _ClumpMagic;
+ else{
+ do
+ arena->clumpmagic = fastrand();
+ while(arena->clumpmagic==_ClumpMagic
+ ||arena->clumpmagic==0);
+ }
+ head->clumpmagic = arena->clumpmagic;
+ }
+
+ /*
+ * Guess at number of clumpinfo blocks to load.
+ * If we guess high, it's no big deal. If we guess low,
+ * we'll be forced into rewriting the whole directory.
+ * Still not such a big deal.
+ */
+ if(clumps == 0 || arena->diskstats.used == totalcorrupt)
+ goto Nocib;
+ if(clumps < arena->diskstats.clumps)
+ clumps = arena->diskstats.clumps;
+ if(clumps < ncibuf)
+ clumps = ncibuf;
+ /*
+ * NOTE(review): clumps may just have been raised to the count read
+ * from the on-disk tail; if that count exceeds the number of good
+ * bytes, the inner quotient is 0 and this divides by zero - confirm.
+ */
+ clumps += totalcorrupt/
+ ((arena->diskstats.used - totalcorrupt)/clumps);
+ clumps += totalcorrupt/2000;
+ if(clumps < minclumps)
+ clumps = minclumps;
+ /* round up to a whole number of directory blocks */
+ clumps += ncib-1;
+ clumps -= clumps%ncib;
+
+ /*
+ * Can't write into the actual data.
+ */
+ v = offset0 + arena->size - arena->blocksize;
+ v -= (clumps+ncib-1)/ncib * arena->blocksize;
+ if(v < lastclumpend){
+ v = offset0 + arena->size - arena->blocksize;
+ clumps = (v-lastclumpend)/arena->blocksize * ncib;
+ }
+
+ if(clumps < minclumps)
+ print("cannot happen?\n");
+
+ /*
+ * Check clumpinfo blocks against directory we created.
+ * The tricky part is handling the corrupt sections of arena.
+ * If possible, we remark just the affected directory entries
+ * rather than slide everything down.
+ *
+ * Allocate clumps+1 blocks and check that we don't need
+ * the last one at the end.
+ */
+ bci = loadci(offset0, arena, clumps+1);
+ eci = bci+clumps+1;
+ bcit = cibuf;
+ ecit = cibuf+ncibuf;
+
+ smart = 0; /* Somehow the smart code doesn't do corrupt clumps right. */
+Again:
+ nbad = 0;
+ ci = bci;
+ for(cit=bcit; cit<ecit && ci<eci; cit++){
+ if(cit->corrupt){
+ vlong n, m;
+ if(smart){
+ /*
+ * If we can, just mark existing entries as corrupt.
+ */
+ n = cit->corrupt;
+ for(xci=ci; n>0 && xci<eci; xci++)
+ n -= ClumpSize+xci->size;
+ if(n > 0 || xci >= eci)
+ goto Dumb;
+ printed = 0;
+ for(; ci<xci; ci++){
+ if(verbose && ci->type != VtCorruptType){
+ if(!printed){
+ print("marking directory %d-%d as corrupt\n",
+ (int)(ci-bci), (int)(xci-bci));
+ printed = 1;
+ }
+ print("\ttype=%d size=%d uncsize=%d score=%V\n",
+ ci->type, ci->size, ci->uncsize, ci->score);
+ }
+ ci->type = VtCorruptType;
+ }
+ }else{
+ Dumb:
+ print("\trewriting clump directory\n");
+ /*
+ * Otherwise, blaze a new trail.
+ */
+ n = cit->corrupt;
+ /* split the span into entries of at most VtMaxLumpSize+ClumpSize bytes, never leaving a remainder under ClumpSize */
+ while(n > 0 && ci < eci){
+ if(n < ClumpSize)
+ sysfatal("bad math in clump corrupt");
+ if(n <= VtMaxLumpSize+ClumpSize)
+ m = n;
+ else{
+ m = VtMaxLumpSize+ClumpSize;
+ if(n-m < ClumpSize)
+ m -= ClumpSize;
+ }
+ ci->type = VtCorruptType;
+ ci->size = m-ClumpSize;
+ ci->uncsize = m-ClumpSize;
+ memset(ci->score, 0, VtScoreSize);
+ ci++;
+ n -= m;
+ }
+ }
+ continue;
+ }
+ if(clumpinfocmp(&cit->ci, ci) != 0){
+ if(verbose && (smart || verbose>1)){
+ print("clumpinfo %d\n", (int)(ci-bci));
+ print("\twant: %d %d %d %V\n",
+ cit->ci.type, cit->ci.size,
+ cit->ci.uncsize, cit->ci.score);
+ print("\thave: %d %d %d %V\n",
+ ci->type, ci->size,
+ ci->uncsize, ci->score);
+ }
+ *ci = cit->ci;
+ nbad++;
+ }
+ ci++;
+ }
+ if(ci >= eci || cit < ecit){
+ print("ran out of space editing existing directory; rewriting\n");
+ print("# eci %ld ci %ld ecit %ld cit %ld\n", eci-bci, ci-bci, ecit-bcit, cit-bcit);
+ /* NOTE(review): smart is set to 0 before Again, so this assert fails whenever this branch is reached - confirm */
+ assert(smart); /* can't happen second time thru */
+ smart = 0;
+ goto Again;
+ }
+
+ assert(ci <= eci);
+ arena->diskstats.clumps = ci-bci;
+ eoffset = writeci(offset0, arena, bci, ci-bci);
+ if(sealing && fix)
+ sbrollback(&newsha, v);
+print("eoffset=%lld lastclumpend=%lld diff=%lld unseal=%d\n", eoffset, lastclumpend, eoffset-lastclumpend, unseal);
+ if(lastclumpend > eoffset)
+ print("arena directory overwrote blocks! cannot happen!\n");
+ free(bci);
+ if(smart && nbad)
+ print("arena directory has %d bad or missing entries\n", nbad);
+Nocib:
+ /* more than 64K still free: leave the arena unsealed, unless it was sealed on disk and unseal is not set */
+ if(eoffset - lastclumpend > 64*1024 && (!arena->diskstats.sealed || unseal)){
+ if(arena->diskstats.sealed)
+ print("unsealing arena\n");
+ sealing = 0;
+ memset(oldscore, 0, VtScoreSize);
+ }
+
+ /*
+ * Finish the SHA1 of the new data - only meaningful
+ * if we've been writing to disk (`fix').
+ */
+ arena->diskstats.sealed = sealing;
+ arena->memstats = arena->diskstats;
+ if(sealing && fix){
+ uchar tbuf[MaxDiskBlock];
+
+ sbdiskhash(&newsha, toffset);
+ memset(tbuf, 0, sizeof tbuf);
+ packarena(arena, tbuf);
+ sbupdate(&newsha, tbuf, toffset, arena->blocksize);
+ sbscore(&newsha, score);
+ }
+}
+
+/*
+ * Copy the arena->size bytes starting at disk offset into a new
+ * file named dumpbase.anum, 4*M at a time. Failures to create or
+ * write the file are reported on standard error and end the dump;
+ * the file descriptor is closed on every path.
+ */
+void
+dumparena(vlong offset, int anum, Arena *arena)
+{
+	char buf[1000];
+	vlong o, e;
+	int fd, n;
+
+	snprint(buf, sizeof buf, "%s.%d", dumpbase, anum);
+	if((fd = create(buf, OWRITE, 0666)) < 0){
+		fprint(2, "create %s: %r\n", buf);
+		return;
+	}
+	e = offset+arena->size;
+	for(o=offset; o<e; o+=n){
+		n = 4*M;
+		if(o+n > e)
+			n = e-o;
+		if(pwrite(fd, pagein(o, n), n, o-offset) != n){
+			fprint(2, "write %s at %#llux: %r\n", buf, o-offset);
+			break;
+		}
+	}
+	close(fd);
+}
+
+/*
+ * Check the arena numbered anum at disk offset: reconstruct its
+ * head and tail with guessarena, report how the on-disk head, tail
+ * and seal score differ from the reconstruction, and copy the
+ * reconstructed blocks into the page buffer before calling pageout.
+ * If dumpbase is set the arena is dumped to a file instead and
+ * nothing is compared.
+ */
+void
+checkarena(vlong offset, int anum)
+{
+ uchar dbuf[MaxDiskBlock];
+ uchar *p, oldscore[VtScoreSize], score[VtScoreSize];
+ Arena arena, oarena;
+ ArenaHead head;
+ Info *fmt, *fmta;
+ int sz;
+
+ print("# arena %d: offset %#llux\n", anum, offset);
+
+ if(offset >= partend){
+ print("arena offset out of bounds\n");
+ return;
+ }
+
+ guessarena(offset, anum, &head, &arena, oldscore, score);
+
+ if(verbose){
+ print("#\tversion=%d name=%s blocksize=%d size=%z",
+ head.version, head.name, head.blocksize, head.size);
+ if(head.clumpmagic)
+ print(" clumpmagic=%#.8ux", head.clumpmagic);
+ print("\n#\tclumps=%d cclumps=%d used=%,lld uncsize=%,lld\n",
+ arena.diskstats.clumps, arena.diskstats.cclumps,
+ arena.diskstats.used, arena.diskstats.uncsize);
+ print("#\tctime=%t\n", arena.ctime);
+ print("#\twtime=%t\n", arena.wtime);
+ if(arena.diskstats.sealed)
+ print("#\tsealed score=%V\n", score);
+ }
+
+ if(dumpbase){
+ dumparena(offset, anum, &arena);
+ return;
+ }
+
+ /* head block: compare against a freshly packed copy, then install the copy */
+ memset(dbuf, 0, sizeof dbuf);
+ packarenahead(&head, dbuf);
+ p = pagein(offset, arena.blocksize);
+ if(memcmp(dbuf, p, arena.blocksize) != 0){
+ print("on-disk arena header incorrect\n");
+ showdiffs(dbuf, p, arena.blocksize,
+ arena.version==ArenaVersion4 ? headinfo4 : headinfo5);
+ }
+ memmove(p, dbuf, arena.blocksize);
+
+ /* tail block: same treatment, with the score kept in the last VtScoreSize bytes */
+ memset(dbuf, 0, sizeof dbuf);
+ packarena(&arena, dbuf);
+ if(arena.diskstats.sealed)
+ scorecp(dbuf+arena.blocksize-VtScoreSize, score);
+ p = pagein(offset+arena.size-arena.blocksize, arena.blocksize);
+ memset(&oarena, 0, sizeof oarena);
+ unpackarena(&oarena, p);
+ if(arena.version == ArenaVersion4){
+ sz = ArenaSize4;
+ fmt = tailinfo4;
+ fmta = tailinfo4a;
+ }else{
+ sz = ArenaSize5;
+ fmt = tailinfo5;
+ fmta = tailinfo5a;
+ }
+ /* a 1 in the byte after the basic tail marks the mem-stats extension */
+ if(p[sz] == 1){
+ fmt = fmta;
+ if(oarena.diskstats.sealed){
+ /*
+ * some arenas were sealed with the extension
+ * before we adopted the convention that if it didn't
+ * add new information it gets dropped.
+ */
+ _packarena(&arena, dbuf, 1);
+ }
+ }
+ if(memcmp(dbuf, p, arena.blocksize-VtScoreSize) != 0){
+ print("on-disk arena tail incorrect\n");
+ showdiffs(dbuf, p, arena.blocksize-VtScoreSize, fmt);
+ }
+ if(arena.diskstats.sealed){
+ if(oarena.diskstats.sealed)
+ if(scorecmp(p+arena.blocksize-VtScoreSize, oldscore) != 0){
+ print("on-disk arena seal score incorrect\n");
+ print("\tcorrect=%V\n", oldscore);
+ print("\t disk=%V\n", p+arena.blocksize-VtScoreSize);
+ }
+ if(fix && scorecmp(p+arena.blocksize-VtScoreSize, score) != 0){
+ print("%ssealing arena%s: %V\n",
+ oarena.diskstats.sealed ? "re" : "",
+ scorecmp(oldscore, score) == 0 ?
+ "" : " after changes", score);
+ }
+ }
+ memmove(p, dbuf, arena.blocksize);
+
+ pageout();
+}
+
+/*
+ * Build an arena map by stepping from ap.arenabase to partend in
+ * units of arenasize and recording name, start and stop (start
+ * plus the header's size) for every offset at which an arena
+ * header unpacks successfully. Returns a newly allocated AMapN.
+ */
+AMapN*
+buildamap(void)
+{
+	vlong o;
+	ArenaHead h;
+	AMapN *an;
+	AMap *m;
+
+	an = vtmallocz(sizeof *an);
+	for(o=ap.arenabase; o<partend; o+=arenasize){
+		if(unpackarenahead(&h, pagein(o, Block)) < 0)
+			continue;
+		an->map = vtrealloc(an->map, (an->n+1)*sizeof an->map[0]);
+		m = &an->map[an->n];
+		an->n++;
+		m->start = o;
+		m->stop = o+h.size;
+		strcpy(m->name, h.name);
+	}
+	return an;
+}
+
+/*
+ * Compare the arena map stored in the partition's table area with
+ * one rebuilt from the arena headers (buildamap), and overwrite the
+ * stored text in the page buffer if it differs. The map is text:
+ * a count line followed by one "name start stop" line per arena.
+ * The formatted text and the rebuilt map are freed before returning.
+ */
+void
+checkmap(void)
+{
+	char *s;
+	uchar *p;
+	int i, len;
+	AMapN *an;
+	Fmt fmt;
+
+	an = buildamap();
+	fmtstrinit(&fmt);
+	fmtprint(&fmt, "%ud\n", an->n);
+	for(i=0; i<an->n; i++)
+		fmtprint(&fmt, "%s\t%lld\t%lld\n",
+			an->map[i].name, an->map[i].start, an->map[i].stop);
+	s = fmtstrflush(&fmt);
+	/* fmtstrflush returns nil if the string could not be allocated */
+	if(s == nil)
+		sysfatal("out of memory formatting arena map");
+	free(an->map);
+	free(an);
+
+	len = strlen(s);
+	if(len > ap.tabsize){
+		print("arena partition map too long: need %z bytes have %z\n",
+			(vlong)len, (vlong)ap.tabsize);
+		len = ap.tabsize;
+	}
+
+	if(ap.tabsize >= 4*M){	/* can't happen - max arenas is 2000 */
+		print("arena partition map *way* too long\n");
+		free(s);
+		return;
+	}
+
+	p = pagein(ap.tabbase, ap.tabsize);
+	if(memcmp(p, s, len) != 0){
+		print("arena partition map incorrect; rewriting.\n");
+		memmove(p, s, len);
+	}
+	free(s);
+	pageout();
+}
+
+/*
+ * Stack size requested for the main thread (presumably read by the
+ * thread library - confirm). Functions in this file keep large
+ * buffers on the stack, e.g. ubuf[70*1024] in isclump and the
+ * MaxDiskBlock-sized buffers in guessarena and checkarena.
+ */
+int mainstacksize = 512*1024;
+
+/*
+ * Entry point. Options:
+ * -U set unseal
+ * -a size set arenasize instead of guessing it
+ * -b size set ap.blocksize instead of guessing it
+ * -f set fix and open the partition read/write
+ * -n name set basename, used to construct arena names
+ * -v increase verbose
+ * -x base set dumpbase; arenas are dumped to files
+ * argv[0] is the partition file and the optional argv[1] is the
+ * arena range passed to checkarenas.
+ * NOTE(review): the unittoull results for -a and -b are not checked
+ * for failure here - confirm whether that matters.
+ */
+void
+threadmain(int argc, char **argv)
+{
+ int mode;
+
+ mode = OREAD;
+ readonly = 1;
+ ARGBEGIN{
+ case 'U':
+ unseal = 1;
+ break;
+ case 'a':
+ arenasize = unittoull(EARGF(usage()));
+ break;
+ case 'b':
+ ap.blocksize = unittoull(EARGF(usage()));
+ break;
+ case 'f':
+ fix = 1;
+ mode = ORDWR;
+ readonly = 0;
+ break;
+ case 'n':
+ basename = EARGF(usage());
+ break;
+ case 'v':
+ verbose++;
+ break;
+ case 'x':
+ dumpbase = EARGF(usage());
+ break;
+ default:
+ usage();
+ }ARGEND
+
+ if(argc != 1 && argc != 2)
+ usage();
+
+ file = argv[0];
+
+ ventifmtinstall();
+ fmtinstall('z', zfmt);
+ fmtinstall('t', tfmt);
+ quotefmtinstall();
+
+ part = initpart(file, mode|ODIRECT);
+ if(part == nil)
+ sysfatal("can't open %s: %r", file);
+ partend = part->size;
+
+ /* a partition that starts with an arena head is treated as one lone arena */
+ if(isonearena()){
+ checkarena(0, -1);
+ threadexitsall(nil);
+ }
+ checkarenas(argc > 1 ? argv[1] : nil);
+ checkmap();
+ threadexitsall(nil);
+}
+
diff --git a/sys/src/cmd/venti/srv/fmtarenas.c b/sys/src/cmd/venti/srv/fmtarenas.c
new file mode 100755
index 000000000..f196f22d4
--- /dev/null
+++ b/sys/src/cmd/venti/srv/fmtarenas.c
@@ -0,0 +1,132 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * Print the command synopsis and exit with a non-empty ("usage")
+ * status, matching the other error exit in this program.
+ */
+void
+usage(void)
+{
+	fprint(2, "usage: fmtarenas [-4Z] [-a arenasize] [-b blocksize] name file\n");
+	threadexitsall("usage");
+}
+
+/*
+ * fmtarenas: format an arena partition.
+ *
+ * Creates a new arena partition on file and fills it with arenas of
+ * asize bytes named name0, name1, ...; the last arena absorbs any
+ * remainder that is too small to stand alone (< MinArenaSize).
+ * The partition is zeroed first only for version 4 arenas unless -Z
+ * is given.  Exits with a non-empty status if an arena could not be
+ * created or the partition header could not be written.
+ */
+void
+threadmain(int argc, char *argv[])
+{
+	int vers;
+	ArenaPart *ap;
+	Part *part;
+	Arena *arena;
+	u64int addr, limit, asize, apsize;
+	char *file, *name, *status, aname[ANameSize];
+	int i, n, blocksize, tabsize, zero;
+
+	ventifmtinstall();
+	statsinit();
+
+	blocksize = 8 * 1024;
+	asize = 512 * 1024 *1024;
+	tabsize = 512 * 1024;		/* BUG: should be determine from number of arenas */
+	zero = -1;
+	vers = ArenaVersion5;
+	status = nil;
+	ARGBEGIN{
+	case 'D':
+		settrace(EARGF(usage()));
+		break;
+	case 'a':
+		asize = unittoull(EARGF(usage()));
+		/* a zero arena size would divide by zero below */
+		if(asize == TWID64 || asize == 0)
+			usage();
+		break;
+	case 'b':
+		blocksize = unittoull(EARGF(usage()));
+		if(blocksize == ~0)
+			usage();
+		if(blocksize > MaxDiskBlock){
+			fprint(2, "block size too large, max %d\n", MaxDiskBlock);
+			threadexitsall("usage");
+		}
+		break;
+	case '4':
+		vers = ArenaVersion4;
+		break;
+	case 'Z':
+		zero = 0;
+		break;
+	default:
+		usage();
+		break;
+	}ARGEND
+
+	/* -Z not given: zero the partition only for the old format */
+	if(zero == -1){
+		if(vers == ArenaVersion4)
+			zero = 1;
+		else
+			zero = 0;
+	}
+
+	if(argc != 2)
+		usage();
+
+	name = argv[0];
+	file = argv[1];
+
+	if(nameok(name) < 0)
+		sysfatal("illegal name template %s", name);
+
+	part = initpart(file, ORDWR|ODIRECT);
+	if(part == nil)
+		sysfatal("can't open partition %s: %r", file);
+
+	if(zero)
+		zeropart(part, blocksize);
+
+	maxblocksize = blocksize;
+	initdcache(20*blocksize);
+
+	ap = newarenapart(part, blocksize, tabsize);
+	if(ap == nil)
+		sysfatal("can't initialize arena: %r");
+
+	/* count whole arenas, plus one more if the remainder is big enough */
+	apsize = ap->size - ap->arenabase;
+	n = apsize / asize;
+	if(apsize - (n * asize) >= MinArenaSize)
+		n++;
+
+	fprint(2, "fmtarenas %s: %,d arenas, %,lld bytes storage, %,d bytes for index map\n",
+		file, n, apsize, ap->tabsize);
+
+	ap->narenas = n;
+	ap->map = MKNZ(AMap, n);
+	ap->arenas = MKNZ(Arena*, n);
+
+	addr = ap->arenabase;
+	for(i = 0; i < n; i++){
+		limit = addr + asize;
+		/* stretch the last arena to the end rather than leave a runt */
+		if(limit >= ap->size || ap->size - limit < MinArenaSize){
+			limit = ap->size;
+			if(limit - addr < MinArenaSize)
+				sysfatal("bad arena set math: runt arena at %lld,%lld %lld", addr, limit, ap->size);
+		}
+
+		snprint(aname, ANameSize, "%s%d", name, i);
+
+		if(0) fprint(2, "adding arena %s at [%lld,%lld)\n", aname, addr, limit);
+
+		arena = newarena(part, vers, aname, addr, limit - addr, blocksize);
+		if(!arena){
+			fprint(2, "can't make new arena %s: %r\n", aname);
+			status = "arena error";
+		}
+		freearena(arena);
+
+		ap->map[i].start = addr;
+		ap->map[i].stop = limit;
+		namecp(ap->map[i].name, aname);
+
+		addr = limit;
+	}
+
+	if(wbarenapart(ap) < 0){
+		fprint(2, "can't write back arena partition header for %s: %r\n", file);
+		status = "write error";
+	}
+
+	flushdcache();
+	threadexitsall(status);
+}
diff --git a/sys/src/cmd/venti/srv/fmtbloom.c b/sys/src/cmd/venti/srv/fmtbloom.c
new file mode 100755
index 000000000..f700d7814
--- /dev/null
+++ b/sys/src/cmd/venti/srv/fmtbloom.c
@@ -0,0 +1,116 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+Bloom b;
+
+/*
+ * Print the command synopsis and exit with a non-empty ("usage")
+ * status so callers can tell the invocation failed.
+ */
+void
+usage(void)
+{
+	fprint(2, "usage: fmtbloom [-s size] [-n nblocks | -N nhash] file\n");
+	threadexitsall("usage");
+}
+
+/*
+ * fmtbloom: format a bloom filter partition.
+ *
+ * The filter size defaults to the partition size, is capped at
+ * MaxBloomSize and rounded down to a power of two.  The number of
+ * hash functions is either given directly (-N) or derived from the
+ * expected number of blocks (-n); with neither, BloomMaxHash is used.
+ * A zero-filled filter of that size is then written to file.
+ */
+void
+threadmain(int argc, char *argv[])
+{
+	Part *part;
+	char *file;
+	vlong bits, size, size2, v;
+	int nhash;
+	vlong nblocks;
+
+	ventifmtinstall();
+	statsinit();
+
+	size = 0;
+	nhash = 0;
+	nblocks = 0;
+	ARGBEGIN{
+	case 'n':
+		if(nhash || nblocks)
+			usage();
+		nblocks = unittoull(EARGF(usage()));
+		/*
+		 * an unparsable or huge count shows up negative here and
+		 * would keep the size-halving loop below from terminating
+		 */
+		if(nblocks < 0)
+			usage();
+		break;
+	case 'N':
+		if(nhash || nblocks)
+			usage();
+		/* range-check in 64 bits before narrowing to int */
+		v = unittoull(EARGF(usage()));
+		if(v < 0)
+			usage();
+		if(v > BloomMaxHash){
+			fprint(2, "maximum possible is -N %d\n", BloomMaxHash);
+			usage();
+		}
+		nhash = v;
+		break;
+	case 's':
+		size = unittoull(EARGF(usage()));
+		if(size == ~0)
+			usage();
+		break;
+	default:
+		usage();
+		break;
+	}ARGEND
+
+	if(argc != 1)
+		usage();
+
+	file = argv[0];
+
+	part = initpart(file, ORDWR|ODIRECT);
+	if(part == nil)
+		sysfatal("can't open partition %s: %r", file);
+
+	if(size == 0)
+		size = part->size;
+
+	if(size < 1024*1024)
+		sysfatal("bloom filter too small");
+
+	if(size > MaxBloomSize){
+		fprint(2, "warning: not using entire %,lld bytes; using only %,lld bytes\n",
+			size, (vlong)MaxBloomSize);
+		size = MaxBloomSize;
+	}
+	/* round down to the largest power of two not exceeding size */
+	if(size&(size-1)){
+		for(size2=1; size2<size; size2*=2)
+			;
+		size = size2/2;
+		fprint(2, "warning: size not a power of 2; only using %lldMB\n", size/1024/1024);
+	}
+
+	if(nblocks){
+		/*
+		 * no use for more than 32 bits per block
+		 * shoot for less than 64 bits per block
+		 */
+		size2 = size;
+		while(size2*8 >= nblocks*64)
+			size2 >>= 1;
+		if(size2 != size){
+			size = size2;
+			fprint(2, "warning: using only %lldMB - not enough blocks to warrant more\n",
+				size/1024/1024);
+		}
+
+		/*
+		 * optimal is to use ln 2 times as many hash functions as we have bits per blocks.
+		 */
+		bits = (8*size)/nblocks;
+		nhash = bits*7/10;
+		if(nhash > BloomMaxHash)
+			nhash = BloomMaxHash;
+	}
+	if(!nhash)
+		nhash = BloomMaxHash;
+	if(bloominit(&b, size, nil) < 0)
+		sysfatal("bloominit: %r");
+	b.nhash = nhash;
+	/* invert the ln 2 rule above to report the block count this suits */
+	bits = nhash*10/7;
+	nblocks = (8*size)/bits;
+	fprint(2, "fmtbloom: using %lldMB, %d hashes/score, best up to %,lld blocks\n", size/1024/1024, nhash, nblocks);
+	b.data = vtmallocz(size);
+	b.part = part;
+	if(writebloom(&b) < 0)
+		sysfatal("writing %s: %r", file);
+	threadexitsall(0);
+}
diff --git a/sys/src/cmd/venti/srv/fmtindex.c b/sys/src/cmd/venti/srv/fmtindex.c
new file mode 100755
index 000000000..2a5148ea4
--- /dev/null
+++ b/sys/src/cmd/venti/srv/fmtindex.c
@@ -0,0 +1,120 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * Print the command synopsis and exit with a non-empty ("usage")
+ * status so callers can tell the invocation failed.
+ */
+void
+usage(void)
+{
+	fprint(2, "usage: fmtindex [-a] config\n");
+	threadexitsall("usage");
+}
+
+/*
+ * fmtindex: format (or, with -a, extend) the index named in a venti
+ * configuration file.
+ *
+ * Collects every arena of every arena partition in the configuration,
+ * lays them out back to back in the index address space starting at
+ * IndexBase, and writes the resulting arena map into the index.
+ * With -a the existing index is loaded and its leading arenas must
+ * match the configured ones slot for slot; new arenas are appended.
+ * Exits with a non-empty status if the index cannot be written.
+ */
+void
+threadmain(int argc, char *argv[])
+{
+	Config conf;
+	Index *ix;
+	ArenaPart *ap;
+	Arena **arenas;
+	AMap *amap;
+	u64int addr;
+	char *file, *status;
+	u32int i, j, n, narenas;
+	int add;
+
+	ventifmtinstall();
+	statsinit();
+
+	add = 0;
+	status = nil;
+	ARGBEGIN{
+	case 'a':
+		add = 1;
+		break;
+	default:
+		usage();
+		break;
+	}ARGEND
+
+	if(argc != 1)
+		usage();
+
+	file = argv[0];
+
+	if(runconfig(file, &conf) < 0)
+		sysfatal("can't initialize config %s: %r", file);
+	if(conf.index == nil)
+		sysfatal("no index specified in %s", file);
+	if(nameok(conf.index) < 0)
+		sysfatal("illegal index name %s", conf.index);
+
+	/* total number of arenas across all configured partitions */
+	narenas = 0;
+	for(i = 0; i < conf.naparts; i++){
+		ap = conf.aparts[i];
+		narenas += ap->narenas;
+	}
+
+	if(add){
+		ix = initindex(conf.index, conf.sects, conf.nsects);
+		if(ix == nil)
+			sysfatal("can't initialize index %s: %r", conf.index);
+	}else{
+		ix = newindex(conf.index, conf.sects, conf.nsects);
+		if(ix == nil)
+			sysfatal("can't create new index %s: %r", conf.index);
+
+		n = 0;
+		for(i = 0; i < ix->nsects; i++)
+			n += ix->sects[i]->blocks;
+
+		if(0) fprint(2, "using %ud buckets of %ud; div=%d\n", ix->buckets, n, ix->div);
+	}
+	amap = MKNZ(AMap, narenas);
+	arenas = MKNZ(Arena*, narenas);
+
+	addr = IndexBase;
+	n = 0;
+	for(i = 0; i < conf.naparts; i++){
+		ap = conf.aparts[i];
+		for(j = 0; j < ap->narenas; j++){
+			if(n >= narenas)
+				sysfatal("too few slots in index's arena set");
+
+			arenas[n] = ap->arenas[j];
+			if(n < ix->narenas){
+				/* slot already in the index: it must agree with the config */
+				if(arenas[n] != ix->arenas[n])
+					sysfatal("mismatched arenas %s and %s at slot %d",
+						arenas[n]->name, ix->arenas[n]->name, n);
+				amap[n] = ix->amap[n];
+				if(amap[n].start != addr)
+					sysfatal("mis-located arena %s in index %s", arenas[n]->name, ix->name);
+				addr = amap[n].stop;
+			}else{
+				/* new slot: place the arena right after the previous one */
+				amap[n].start = addr;
+				addr += ap->arenas[j]->size;
+				amap[n].stop = addr;
+				namecp(amap[n].name, ap->arenas[j]->name);
+				if(0) fprint(2, "add arena %s at [%lld,%lld)\n",
+					amap[n].name, amap[n].start, amap[n].stop);
+			}
+
+			n++;
+		}
+	}
+	if(0){
+		fprint(2, "configured index=%s with arenas=%d and storage=%lld\n",
+			ix->name, n, addr - IndexBase);
+		fprint(2, "\tbuckets=%d\n",
+			ix->buckets);
+	}
+	fprint(2, "fmtindex: %,d arenas, %,d index buckets, %,lld bytes storage\n",
+		n, ix->buckets, addr-IndexBase);
+
+	ix->amap = amap;
+	ix->arenas = arenas;
+	ix->narenas = narenas;
+
+	if(wbindex(ix) < 0){
+		fprint(2, "can't write back index header for %s: %r\n", file);
+		status = "write error";
+	}
+
+	threadexitsall(status);
+}
diff --git a/sys/src/cmd/venti/srv/fmtisect.c b/sys/src/cmd/venti/srv/fmtisect.c
new file mode 100755
index 000000000..28b88de61
--- /dev/null
+++ b/sys/src/cmd/venti/srv/fmtisect.c
@@ -0,0 +1,83 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * Print the command synopsis and exit with a non-empty ("usage")
+ * status, matching the other error exit in this program.
+ */
+void
+usage(void)
+{
+	fprint(2, "usage: fmtisect [-1Z] [-b blocksize] name file\n");
+	threadexitsall("usage");
+}
+
+/*
+ * fmtisect: format an index section.
+ *
+ * Creates a new index section called name on file and writes its
+ * header.  The partition is zeroed first only for version 1 sections
+ * unless -Z is given.  Exits with a non-empty status if the header
+ * cannot be written.
+ */
+void
+threadmain(int argc, char *argv[])
+{
+	int vers;
+	ISect *is;
+	Part *part;
+	char *file, *name, *status;
+	int blocksize, setsize, zero;
+
+	ventifmtinstall();
+	statsinit();
+
+	blocksize = 8 * 1024;
+	setsize = 512 * 1024;
+	zero = -1;
+	vers = ISectVersion2;
+	status = nil;
+	ARGBEGIN{
+	case 'b':
+		/* EARGF: a missing argument is a usage error, not a nil string */
+		blocksize = unittoull(EARGF(usage()));
+		if(blocksize == ~0)
+			usage();
+		if(blocksize > MaxDiskBlock){
+			fprint(2, "block size too large, max %d\n", MaxDiskBlock);
+			threadexitsall("usage");
+		}
+		break;
+	case '1':
+		vers = ISectVersion1;
+		break;
+	case 'Z':
+		zero = 0;
+		break;
+	default:
+		usage();
+		break;
+	}ARGEND
+
+	/* -Z not given: zero the partition only for the old format */
+	if(zero == -1){
+		if(vers == ISectVersion1)
+			zero = 1;
+		else
+			zero = 0;
+	}
+
+	if(argc != 2)
+		usage();
+
+	name = argv[0];
+	file = argv[1];
+
+	if(nameok(name) < 0)
+		sysfatal("illegal name %s", name);
+
+	part = initpart(file, ORDWR|ODIRECT);
+	if(part == nil)
+		sysfatal("can't open partition %s: %r", file);
+
+	if(zero)
+		zeropart(part, blocksize);
+
+	is = newisect(part, vers, name, blocksize, setsize);
+	if(is == nil)
+		sysfatal("can't initialize new index: %r");
+
+	fprint(2, "fmtisect %s: %,d buckets of %,d entries, %,d bytes for index map\n",
+		file, is->blocks, is->buckmax, setsize);
+
+	if(wbisect(is) < 0){
+		fprint(2, "can't write back index section header for %s: %r\n", file);
+		status = "write error";
+	}
+
+	threadexitsall(status);
+}
diff --git a/sys/src/cmd/venti/srv/fns.h b/sys/src/cmd/venti/srv/fns.h
new file mode 100755
index 000000000..398562c27
--- /dev/null
+++ b/sys/src/cmd/venti/srv/fns.h
@@ -0,0 +1,228 @@
+/*
+ * sorted by 4,/^$/|sort -bd +1
+ */
+int addarena(Arena *name);
+void addstat(int, int);
+void addstat2(int, int, int, int);
+ZBlock *alloczblock(u32int size, int zeroed, uint alignment);
+Arena *amapitoa(Index *index, u64int a, u64int *aa);
+Arena *amapitoag(Index *index, u64int a, u64int *gstart, u64int *glimit, int *g);
+u64int arenadirsize(Arena *arena, u32int clumps);
+int arenatog(Arena *arena, u64int aa, u64int *gstart, u64int *glimit, int *g);
+void arenaupdate(Arena *arena, u32int size, u8int *score);
+int asumload(Arena *arena, int g, IEntry *entries, int maxentries);
+void backsumarena(Arena *arena);
+void binstats(long (*fn)(Stats *s0, Stats *s1, void*), void *arg, long t0, long t1, Statbin *bin, int nbin);
+int bloominit(Bloom*, vlong, uchar*);
+int bucklook(u8int*, int, u8int*, int);
+u32int buildbucket(Index *ix, IEStream *ies, IBucket *ib, uint);
+void checkdcache(void);
+void checklumpcache(void);
+int clumpinfoeq(ClumpInfo *c, ClumpInfo *d);
+u32int clumpmagic(Arena *arena, u64int aa);
+uint countbits(uint n);
+int delarena(Arena *arena);
+void delaykickicache(void);
+void delaykickround(Round*);
+void delaykickroundproc(void*);
+void dirtydblock(DBlock*, int);
+void diskaccess(int);
+void disksched(void);
+void *emalloc(ulong);
+void emptydcache(void);
+void emptyicache(void);
+void emptylumpcache(void);
+void *erealloc(void *, ulong);
+char *estrdup(char*);
+void *ezmalloc(ulong);
+Arena *findarena(char *name);
+int flushciblocks(Arena *arena);
+void flushdcache(void);
+void flushicache(void);
+int flushpart(Part*);
+void flushqueue(void);
+void fmtzbinit(Fmt *f, ZBlock *b);
+void freearena(Arena *arena);
+void freearenapart(ArenaPart *ap, int freearenas);
+void freeiestream(IEStream *ies);
+void freeifile(IFile *f);
+void freeisect(ISect *is);
+void freeindex(Index *index);
+void freepart(Part *part);
+void freezblock(ZBlock *b);
+DBlock *_getdblock(Part *part, u64int addr, int mode, int load);
+DBlock *getdblock(Part *part, u64int addr, int mode);
+u32int hashbits(u8int *score, int nbits);
+char *hargstr(HConnect*, char*, char*);
+vlong hargint(HConnect*, char*, vlong);
+int hdebug(HConnect*);
+int hdisk(HConnect*);
+int hnotfound(HConnect*);
+int hproc(HConnect*);
+int hsethtml(HConnect*);
+int hsettext(HConnect*);
+int httpdinit(char *address, char *webroot);
+int iaddrcmp(IAddr *ia1, IAddr *ia2);
+IEntry* icachedirty(u32int, u32int, u64int);
+ulong icachedirtyfrac(void);
+void icacheclean(IEntry*);
+int icachelookup(u8int *score, int type, IAddr *ia);
+AState icachestate(void);
+int ientrycmp(const void *vie1, const void *vie2);
+char *ifileline(IFile *f);
+int ifilename(IFile *f, char *dst);
+int ifileu32int(IFile *f, u32int *r);
+int inbloomfilter(Bloom*, u8int*);
+int indexsect(Index *ix, u8int *score);
+int indexsect0(Index *ix, u32int buck);
+Arena *initarena(Part *part, u64int base, u64int size, u32int blocksize);
+ArenaPart *initarenapart(Part *part);
+int initarenasum(void);
+void initbloomfilter(Index*);
+void initdcache(u32int mem);
+void initicache(u32int mem);
+void initicachewrite(void);
+IEStream *initiestream(Part *part, u64int off, u64int clumps, u32int size);
+ISect *initisect(Part *part);
+Index *initindex(char *name, ISect **sects, int n);
+void initlumpcache(u32int size, u32int nblocks);
+int initlumpqueues(int nq);
+Part* initpart(char *name, int mode);
+void initround(Round*, char*, int);
+int initventi(char *config, Config *conf);
+void insertlump(Lump *lump, Packet *p);
+int insertscore(u8int *score, IAddr *ia, int state, AState *as);
+void kickdcache(void);
+void kickicache(void);
+void kickround(Round*, int wait);
+int loadbloom(Bloom*);
+ZBlock *loadclump(Arena *arena, u64int aa, int blocks, Clump *cl, u8int *score, int verify);
+DBlock *loadibucket(Index *index, u8int *score, ISect **is, u32int *buck, IBucket *ib);
+int loadientry(Index *index, u8int *score, int type, IEntry *ie);
+void logerr(int severity, char *fmt, ...);
+Lump *lookuplump(u8int *score, int type);
+int lookupscore(u8int *score, int type, IAddr *ia);
+int maparenas(AMap *am, Arena **arenas, int n, char *what);
+void markbloomfilter(Bloom*, u8int*);
+uint msec(void);
+int namecmp(char *s, char *t);
+void namecp(char *dst, char *src);
+int nameok(char *name);
+void needmainindex(void);
+void needzeroscore(void);
+Arena *newarena(Part *part, u32int, char *name, u64int base, u64int size, u32int blocksize);
+ArenaPart *newarenapart(Part *part, u32int blocksize, u32int tabsize);
+ISect *newisect(Part *part, u32int vers, char *name, u32int blocksize, u32int tabsize);
+Index *newindex(char *name, ISect **sects, int n);
+u32int now(void);
+int okamap(AMap *am, int n, u64int start, u64int stop, char *what);
+int okibucket(IBucket*, ISect*);
+int outputamap(Fmt *f, AMap *am, int n);
+int outputindex(Fmt *f, Index *ix);
+int _packarena(Arena *arena, u8int *buf, int);
+int packarena(Arena *arena, u8int *buf);
+int packarenahead(ArenaHead *head, u8int *buf);
+int packarenapart(ArenaPart *as, u8int *buf);
+void packbloomhead(Bloom*, u8int*);
+int packclump(Clump *c, u8int *buf, u32int);
+void packclumpinfo(ClumpInfo *ci, u8int *buf);
+void packibucket(IBucket *b, u8int *buf, u32int magic);
+void packientry(IEntry *i, u8int *buf);
+int packisect(ISect *is, u8int *buf);
+void packmagic(u32int magic, u8int *buf);
+ZBlock *packet2zblock(Packet *p, u32int size);
+int parseamap(IFile *f, AMapN *amn);
+int parseindex(IFile *f, Index *ix);
+void partblocksize(Part *part, u32int blocksize);
+int partifile(IFile *f, Part *part, u64int start, u32int size);
+void printarenapart(int fd, ArenaPart *ap);
+void printarena(int fd, Arena *arena);
+void printindex(int fd, Index *ix);
+void printstats(void);
+void putdblock(DBlock *b);
+void putlump(Lump *b);
+int queuewrite(Lump *b, Packet *p, int creator, uint ms);
+u32int readarena(Arena *arena, u64int aa, u8int *buf, long n);
+int readarenamap(AMapN *amn, Part *part, u64int base, u32int size);
+Bloom *readbloom(Part*);
+int readclumpinfo(Arena *arena, int clump, ClumpInfo *ci);
+int readclumpinfos(Arena *arena, int clump, ClumpInfo *cis, int n);
+ZBlock *readfile(char *name);
+int readifile(IFile *f, char *name);
+Packet *readlump(u8int *score, int type, u32int size, int *cached);
+int readpart(Part *part, u64int addr, u8int *buf, u32int n);
+int resetbloom(Bloom*);
+int runconfig(char *config, Config*);
+int scorecmp(u8int *, u8int *);
+void scoremem(u8int *score, u8int *buf, int size);
+void setatailstate(AState*);
+void seterr(int severity, char *fmt, ...);
+void setstat(int, long);
+void settrace(char *type);
+u64int sortrawientries(Index *ix, Part *tmp, u64int *tmpoff, Bloom *bloom);
+void startbloomproc(Bloom*);
+Memimage* statgraph(Graph *g);
+void statsinit(void);
+int storeclump(Index *index, ZBlock *b, u8int *score, int type, u32int creator, IAddr *ia);
+int storeientry(Index *index, IEntry *m);
+int strscore(char *s, u8int *score);
+int stru32int(char *s, u32int *r);
+int stru64int(char *s, u64int *r);
+void sumarena(Arena *arena);
+int syncarena(Arena *arena, u32int n, int zok, int fix);
+int syncindex(Index *ix);
+void trace(char *type, char*, ...);
+void traceinit(void);
+int u64log2(u64int v);
+u64int unittoull(char *s);
+int unpackarena(Arena *arena, u8int *buf);
+int unpackarenahead(ArenaHead *head, u8int *buf);
+int unpackarenapart(ArenaPart *as, u8int *buf);
+int unpackbloomhead(Bloom*, u8int*);
+int unpackclump(Clump *c, u8int *buf, u32int);
+void unpackclumpinfo(ClumpInfo *ci, u8int *buf);
+void unpackibucket(IBucket *b, u8int *buf, u32int magic);
+void unpackientry(IEntry *i, u8int *buf);
+int unpackisect(ISect *is, u8int *buf);
+u32int unpackmagic(u8int *buf);
+void ventifmtinstall(void);
+void vtloghdump(Hio*, VtLog*);
+void vtloghlist(Hio*);
+int vtproc(void(*)(void*), void*);
+int vttypevalid(int type);
+void waitforkick(Round*);
+int wbarena(Arena *arena);
+int wbarenahead(Arena *arena);
+int wbarenamap(AMap *am, int n, Part *part, u64int base, u64int size);
+int wbarenapart(ArenaPart *ap);
+void wbbloomhead(Bloom*);
+int wbisect(ISect *is);
+int wbindex(Index *ix);
+int whackblock(u8int *dst, u8int *src, int ssize);
+u64int writeaclump(Arena *a, Clump *c, u8int *clbuf);
+u32int writearena(Arena *arena, u64int aa, u8int *clbuf, u32int n);
+int writebloom(Bloom*);
+int writeclumpinfo(Arena *arena, int clump, ClumpInfo *ci);
+int writepng(Hio*, Memimage*);
+u64int writeiclump(Index *ix, Clump *c, u8int *clbuf);
+int writelump(Packet *p, u8int *score, int type, u32int creator, uint ms);
+int writepart(Part *part, u64int addr, u8int *buf, u32int n);
+int writeqlump(Lump *u, Packet *p, int creator, uint ms);
+Packet *zblock2packet(ZBlock *zb, u32int size);
+void zeropart(Part *part, int blocksize);
+
+/*
+#pragma varargck argpos sysfatal 1
+#pragma varargck argpos logerr 2
+#pragma varargck argpos SetErr 2
+*/
+
+#define scorecmp(h1,h2) memcmp((h1),(h2),VtScoreSize) /* compare two scores bytewise; 0 means equal */
+#define scorecp(h1,h2) memmove((h1),(h2),VtScoreSize) /* copy score h2 into h1 */
+
+#define MK(t) ((t*)emalloc(sizeof(t))) /* allocate one t with emalloc */
+#define MKZ(t) ((t*)ezmalloc(sizeof(t))) /* allocate one t with ezmalloc (presumably zero-filled - confirm) */
+#define MKN(t,n) ((t*)emalloc((n)*sizeof(t))) /* allocate an array of n t's with emalloc */
+#define MKNZ(t,n) ((t*)ezmalloc((n)*sizeof(t))) /* allocate an array of n t's with ezmalloc */
+#define MKNA(t,at,n) ((t*)emalloc(sizeof(t) + (n)*sizeof(at))) /* one t followed by room for n at's */
diff --git a/sys/src/cmd/venti/srv/graph.c b/sys/src/cmd/venti/srv/graph.c
new file mode 100755
index 000000000..cbad1ada2
--- /dev/null
+++ b/sys/src/cmd/venti/srv/graph.c
@@ -0,0 +1,197 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+enum
+{
+ Top = 1, /* Top, Bottom, Left, Right: insets of the plot rectangle from the image edges (see statgraph) */
+ Bottom = 1,
+ Left = 40, /* the y labels are drawn to the left of the plot rectangle */
+ Right = 0,
+ MinWidth = Left+Right+2, /* statgraph replaces widths at or below this with DefaultWidth */
+ MinHeight = Top+Bottom+2, /* statgraph replaces heights at or below this with DefaultHeight */
+ DefaultWidth = Left+Right+500,
+ DefaultHeight = Top+Bottom+40
+};
+
+QLock memdrawlock; /* held by statgraph around ginit and all drawing */
+static Memsubfont *smallfont; /* label font, opened in ginit */
+static Memimage *black; /* the images below are set up once by ginit */
+static Memimage *blue;
+static Memimage *red;
+static Memimage *lofill[6]; /* lofill[i] is built from fill[2*i] */
+static Memimage *hifill[6]; /* hifill[i] is built from fill[2*i+1] */
+static Memimage *grid;
+
+static ulong fill[] = { /* colour pairs: first of each pair feeds lofill, second feeds hifill */
+ 0xFFAAAAFF, 0xBB5D5DFF, /* peach */
+ DPalegreygreen, DPurpleblue, /* aqua */
+ DDarkyellow, DYellowgreen, /* yellow */
+ DMedgreen, DDarkgreen, /* green */
+ 0x00AAFFFF, 0x0088CCFF, /* blue */
+ 0xCCCCCCFF, 0x888888FF, /* grey */
+};
+
+/*
+ * Allocate a 1x1 replicated image filled with color, with a very
+ * large clip rectangle so it can be used as a solid colour source.
+ * Returns nil if the image cannot be allocated.
+ */
+Memimage*
+allocrepl(ulong color)
+{
+	Memimage *m;
+
+	m = allocmemimage(Rect(0,0,1,1), RGB24);
+	if(m == nil)
+		return nil;	/* let the caller's nil checks report the failure */
+	memfillcolor(m, color);
+	m->flags |= Frepl;
+	m->clipr = Rect(-1000000, -1000000, 1000000, 1000000);
+	return m;
+}
+
+/*
+ * One-time graphics setup: initialise memdraw, open the label font
+ * and allocate the solid colour images, including one lofill/hifill
+ * pair per colour pair in fill[].  Later calls return immediately.
+ */
+static void
+ginit(void)
+{
+	static int done;
+	int k;
+
+	if(done)
+		return;
+	done = 1;
+
+	memimageinit();
+#ifdef PLAN9PORT
+	smallfont = openmemsubfont(unsharp("#9/font/lucsans/lstr.10"));
+#else
+	smallfont = openmemsubfont("/lib/font/bit/lucidasans/lstr.10");
+#endif
+	black = memblack;
+	blue = allocrepl(DBlue);
+	red = allocrepl(DRed);
+	grid = allocrepl(0x77777777);
+
+	/* stop at whichever of the three tables runs out first */
+	k = 0;
+	while(k<nelem(fill)/2 && k<nelem(lofill) && k<nelem(hifill)){
+		lofill[k] = allocrepl(fill[2*k]);
+		hifill[k] = allocrepl(fill[2*k+1]);
+		k++;
+	}
+}
+
+/*
+ * Format v into str as a short axis label: plain digits below
+ * 10000, thousands with a "k" suffix below 10000000, otherwise
+ * millions with an "M" suffix.  Negative values get a leading '-'.
+ */
+static void
+mklabel(char *str, int v)
+{
+	if(v < 0){
+		*str++ = '-';
+		v = -v;
+	}
+	if(v >= 10000000){
+		sprint(str, "%dM", v/1000000);
+		return;
+	}
+	if(v >= 10000){
+		sprint(str, "%dk", v/1000);
+		return;
+	}
+	sprint(str, "%d", v);
+}
+
+/*
+ * Draw the label for n in black so that its right edge ends
+ * 5 pixels to the left of p.
+ */
+static void
+drawlabel(Memimage *m, Point p, int n)
+{
+	Point size, at;
+	char text[30];
+
+	mklabel(text, n);
+	size = memsubfontwidth(smallfont, text);
+	at = Pt(p.x-5-size.x, p.y);
+	memimagestring(m, at, memblack, ZP, smallfont, text);
+}
+
+/*
+ * Map val linearly from [valmin, valmax] onto [ptmin, ptmax].
+ * val is clamped to the value range first, and an empty range is
+ * widened by one so the division is always defined.
+ */
+static int
+scalept(int val, int valmin, int valmax, int ptmin, int ptmax)
+{
+	vlong scaled;
+
+	if(val < valmin)
+		val = valmin;
+	if(val > valmax)
+		val = valmax;
+	if(valmin == valmax)
+		valmax++;
+	/* multiply in 64 bits before dividing */
+	scaled = (vlong)(val-valmin)*(ptmax-ptmin);
+	return ptmin + scaled/(valmax-valmin);
+}
+
+/*
+ * Render the statistics described by g as a bar graph and return the
+ * new image, or nil (with the error string set) on failure.
+ *
+ * g->wid, g->ht and g->fill are normalised in place: sizes at or below
+ * the minimum become the defaults, the width is capped at the number
+ * of bins available, and the fill index is reduced to a valid index
+ * into lofill/hifill.  If g->min/g->max do not describe a usable range
+ * the bounds are computed from the sampled bins.  One column is drawn
+ * per bin; bins without samples are left transparent.
+ */
+Memimage*
+statgraph(Graph *g)
+{
+	int i, nbin, x, lo, hi, min, max, first;
+	Memimage *m, *lofillim, *hifillim;
+	Rectangle r;
+	Statbin *b, bin[2000];	/* 32 kB, but whack is worse */
+
+	needstack(8192);	/* double check that bin didn't kill us */
+
+	if(g->wid <= MinWidth)
+		g->wid = DefaultWidth;
+	if(g->ht <= MinHeight)
+		g->ht = DefaultHeight;
+	if(g->wid > nelem(bin))
+		g->wid = nelem(bin);
+	if(g->fill < 0)
+		g->fill = ((uint)(uintptr)g->arg>>8)%nelem(lofill);
+	/* >= so that an index one past the end is reduced as well */
+	if(g->fill >= nelem(lofill))
+		g->fill %= nelem(lofill);
+
+	nbin = g->wid - (Left+Right);
+	binstats(g->fn, g->arg, g->t0, g->t1, bin, nbin);
+
+	/*
+	 * compute bounds
+	 */
+	min = g->min;
+	max = g->max;
+	if(min < 0 || max <= min){
+		min = max = 0;
+		first = 1;
+		for(i=0; i<nbin; i++){
+			b = &bin[i];
+			if(b->nsamp == 0)
+				continue;
+			if(first || b->min < min)
+				min = b->min;
+			if(first || b->max > max)
+				max = b->max;
+			first = 0;
+		}
+	}
+
+	qlock(&memdrawlock);
+	ginit();
+	/*
+	 * check the images actually used for this graph; the arrays
+	 * themselves are never nil, only their elements can be
+	 */
+	lofillim = lofill[g->fill%nelem(lofill)];
+	hifillim = hifill[g->fill%nelem(hifill)];
+	if(smallfont==nil || black==nil || blue==nil || red==nil || hifillim==nil || lofillim==nil){
+		werrstr("graphics initialization failed: %r");
+		qunlock(&memdrawlock);
+		return nil;
+	}
+
+	/* fresh image */
+	m = allocmemimage(Rect(0,0,g->wid,g->ht), ABGR32);
+	if(m == nil){
+		qunlock(&memdrawlock);
+		return nil;
+	}
+	r = Rect(Left, Top, g->wid-Right, g->ht-Bottom);
+	memfillcolor(m, DTransparent);
+
+	/* x axis */
+	memimagedraw(m, Rect(r.min.x, r.max.y, r.max.x, r.max.y+1), black, ZP, memopaque, ZP, S);
+
+	/* y labels */
+	drawlabel(m, r.min, max);
+	if(min != 0)
+		drawlabel(m, Pt(r.min.x, r.max.y-smallfont->height), min);
+
+	/* actual data */
+	for(i=0; i<nbin; i++){
+		b = &bin[i];
+		if(b->nsamp == 0)
+			continue;
+		/* y grows downward, so the value range maps from r.max.y up to r.min.y */
+		lo = scalept(b->min, min, max, r.max.y, r.min.y);
+		hi = scalept(b->max, min, max, r.max.y, r.min.y);
+		x = r.min.x+i;
+		hi-=2;
+		memimagedraw(m, Rect(x, hi, x+1,lo), hifillim, ZP, memopaque, ZP, S);
+		memimagedraw(m, Rect(x, lo, x+1, r.max.y), lofillim, ZP, memopaque, ZP, S);
+	}
+
+	/* label the most recent bin with its average */
+	if(bin[nbin-1].nsamp)
+		drawlabel(m, Pt(r.max.x, r.min.y+(Dy(r)-smallfont->height)/2), bin[nbin-1].avg);
+	qunlock(&memdrawlock);
+	return m;
+}
diff --git a/sys/src/cmd/venti/srv/hdisk.c b/sys/src/cmd/venti/srv/hdisk.c
new file mode 100755
index 000000000..8cf937d1b
--- /dev/null
+++ b/sys/src/cmd/venti/srv/hdisk.c
@@ -0,0 +1,696 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+#include "whack.h"
+
+static int disksummary(HConnect*);
+static int diskarenapart(HConnect*, char*, Part*);
+static int diskbloom(HConnect*, char*, Part*);
+static int diskisect(HConnect*, char*, Part*);
+
+/*
+ * HTTP handler for the disk pages.  Without a "disk" argument it
+ * prints the summary of all disks; otherwise it opens the named
+ * partition read-only and dispatches on the first letter of the
+ * "type" argument (a: arena partition, b: bloom filter, i: index
+ * section).  The partition is released on every path once opened.
+ * Returns -1 if the HTML header cannot be set, else the handler's
+ * result (0 for the error pages printed here).
+ */
+int
+hdisk(HConnect *c)
+{
+	char *disk, *type;
+	Part *p;
+	int ret;
+
+	if(hsethtml(c) < 0)
+		return -1;
+
+	disk = hargstr(c, "disk", "");
+	if(!disk[0])
+		return disksummary(c);
+	if((p = initpart(disk, OREAD)) == nil){
+		hprint(&c->hout, "open %s: %r", disk);
+		return 0;
+	}
+
+	type = hargstr(c, "type", "");
+	switch(type[0]){
+	case 'a':
+		ret = diskarenapart(c, disk, p);
+		break;
+	case 'b':
+		ret = diskbloom(c, disk, p);
+		break;
+	case 'i':
+		ret = diskisect(c, disk, p);
+		break;
+	default:
+		hprint(&c->hout, "unknown disk type %s", type);
+		ret = 0;
+		break;
+	}
+	/* reached by every case, so the partition is never leaked */
+	freepart(p);
+	return ret;
+}
+
+/*
+ * Print the list of partitions used by the main index: one link per
+ * distinct arena partition, one per distinct index section partition
+ * and one for the bloom filter if there is one.  Consecutive entries
+ * on the same partition are collapsed into the first.
+ */
+static int
+disksummary(HConnect *c)
+{
+	int i;
+	Index *ix;
+	Part *p;
+
+	hprint(&c->hout, "<h1>venti disks</h1>\n");
+	hprint(&c->hout, "<pre>\n");
+	ix = mainindex;
+	p = nil;
+	for(i=0; i<ix->narenas; i++){
+		if(ix->arenas[i]->part == p)
+			continue;
+		p = ix->arenas[i]->part;
+		hprint(&c->hout, "<a href=\"/disk?disk=%s&type=a\">%s</a> %s\n", p->name, p->name, ix->arenas[i]->name);
+	}
+	hprint(&c->hout, "\n");
+	p = nil;
+	for(i=0; i<ix->nsects; i++){
+		if(ix->sects[i]->part == p)
+			continue;
+		p = ix->sects[i]->part;
+		hprint(&c->hout, "<a href=\"/disk?disk=%s&type=i\">%s</a> %s\n", p->name, p->name, ix->sects[i]->name);
+	}
+	hprint(&c->hout, "\n");
+	if(ix->bloom){
+		p = ix->bloom->part;
+		hprint(&c->hout, "<a href=\"/disk?disk=%s&type=b\">%s</a> %s\n", p->name, p->name, "bloom filter");
+	}
+	/* close the block opened above */
+	hprint(&c->hout, "</pre>\n");
+	return 0;
+}
+
+/*
+ * Read the arena partition header of p into ap and return the arena
+ * table that follows it as a NUL-terminated string allocated with
+ * vtmalloc; the caller frees it.  ap->tabbase and ap->tabsize are
+ * derived from the header.  Returns nil with the error string set if
+ * the header or table cannot be read or the header is inconsistent.
+ */
+static char*
+readap(Part *p, ArenaPart *ap)
+{
+	uchar *blk;
+	char *table;
+
+	blk = vtmalloc(8192);
+	if(readpart(p, PartBlank, blk, 8192) != 8192){
+		vtfree(blk);
+		return nil;
+	}
+	if(unpackarenapart(ap, blk) < 0){
+		werrstr("corrupt arena part header: %r");
+		vtfree(blk);
+		return nil;
+	}
+	vtfree(blk);
+	/* the table starts at the first block boundary after the header */
+	ap->tabbase = (PartBlank+HeadSize+ap->blocksize-1)&~(ap->blocksize-1);
+	if(ap->arenabase < ap->tabbase){
+		/* the subtraction below would wrap and ask for a huge table */
+		werrstr("corrupt arena part header: arena base before table");
+		return nil;
+	}
+	ap->tabsize = ap->arenabase - ap->tabbase;
+	table = vtmalloc(ap->tabsize+1);
+	if(readpart(p, ap->tabbase, (uchar*)table, ap->tabsize) != ap->tabsize){
+		werrstr("reading arena part directory: %r");
+		vtfree(table);
+		return nil;
+	}
+	table[ap->tabsize] = 0;
+	return table;
+}
+
+/*
+ * Look up name in the textual arena table: a count line followed by
+ * lines of "name start stop".  On a match the two addresses are
+ * stored in *start and *end and 0 is returned; otherwise -1.
+ * Lines that are too long or have fewer than three fields are
+ * skipped.  The table is modified: each visited line is terminated
+ * in place.
+ */
+static int
+xfindarena(char *table, char *name, vlong *start, vlong *end)
+{
+	int i, nline;
+	char *cur, *next, *f[4], line[256];
+
+	nline = atoi(table);
+	cur = strchr(table, '\n');
+	if(cur != nil)
+		cur++;
+	for(i=0; i<nline && cur!=nil; i++, cur=next){
+		next = strchr(cur, '\n');
+		if(next != nil)
+			*next++ = 0;
+		if(strlen(cur) >= sizeof line)
+			continue;
+		/* tokenize a copy so the table line itself stays intact */
+		strcpy(line, cur);
+		memset(f, 0, sizeof f);
+		if(tokenize(line, f, nelem(f)) < 3)
+			continue;
+		if(strcmp(f[0], name) != 0)
+			continue;
+		*start = strtoull(f[1], 0, 0);
+		*end = strtoull(f[2], 0, 0);
+		return 0;
+	}
+	return -1;
+}
+
+/*
+ * Print the arena table of disk as HTML, one line per arena with a
+ * link to that arena's page.  Lines that are too long or have fewer
+ * than three fields are printed verbatim.  The table is modified:
+ * each visited line is terminated in place.
+ */
+static void
+diskarenatable(HConnect *c, char *disk, char *table)
+{
+	char *cur, *next;
+	int i, nline, parsed;
+	char *f[4], line[256], base[256];
+
+	hprint(&c->hout, "<h2>table</h2>\n");
+	hprint(&c->hout, "<pre>\n");
+	nline = atoi(table);
+	snprint(base, sizeof base, "/disk?disk=%s&type=a", disk);
+	cur = strchr(table, '\n');
+	if(cur != nil)
+		cur++;
+	for(i=0; i<nline; i++, cur=next){
+		if(cur == nil){
+			hprint(&c->hout, "<b><i>unexpected end of table</i></b>\n");
+			break;
+		}
+		next = strchr(cur, '\n');
+		if(next != nil)
+			*next++ = 0;
+
+		/* split a copy into fields; cur keeps the raw line for the fallback */
+		parsed = 0;
+		if(strlen(cur) < sizeof line){
+			strcpy(line, cur);
+			memset(f, 0, sizeof f);
+			parsed = tokenize(line, f, 3) >= 3;
+		}
+		if(!parsed){
+			hprint(&c->hout, "%s\n", cur);
+			continue;
+		}
+		hprint(&c->hout, "<a href=\"%s&arena=%s\">%s</a> %s %s\n",
+			base, f[0], f[0], f[1], f[2]);
+	}
+	hprint(&c->hout, "</pre>\n");
+}
+
+/*
+ * Copy the ctime text for time into buf, cut it off at index 28,
+ * and return buf.
+ */
+static char*
+fmttime(char *buf, ulong time)
+{
+	char *stamp;
+
+	stamp = ctime(time);
+	strcpy(buf, stamp);
+	*(buf+28) = 0;	/* presumably drops the trailing newline - confirm */
+	return buf;
+}
+
+
+static int diskarenaclump(HConnect*, Arena*, vlong, char*);
+static int diskarenatoc(HConnect*, Arena*);
+
+/*
+ * Print the page for arena partition disk.  Without an "arena"
+ * argument this is the partition header and the arena table.  With
+ * one, the arena's header and tail blocks are read and printed,
+ * followed by either a single clump (the "clump" and/or "score"
+ * arguments) or the arena's table of contents.  Errors are reported
+ * in the page itself; the function always returns 0.
+ */
+static int
+diskarenapart(HConnect *c, char *disk, Part *p)
+{
+	char *arenaname;
+	ArenaPart ap;
+	ArenaHead head;
+	Arena arena;
+	char *table;
+	char *score;
+	char *clump;
+	uchar *blk;
+	vlong start, end, off;
+	char tbuf[60];
+
+	hprint(&c->hout, "<h1>arena partition %s</h1>\n", disk);
+
+	if((table = readap(p, &ap)) == nil){
+		hprint(&c->hout, "%r\n");
+		goto out;
+	}
+
+	hprint(&c->hout, "<pre>\n");
+	hprint(&c->hout, "version=%d blocksize=%d base=%d\n",
+		ap.version, ap.blocksize, ap.arenabase);
+	hprint(&c->hout, "</pre>\n");
+
+	arenaname = hargstr(c, "arena", "");
+	if(arenaname[0] == 0){
+		diskarenatable(c, disk, table);
+		goto out;
+	}
+
+	if(xfindarena(table, arenaname, &start, &end) < 0){
+		hprint(&c->hout, "no such arena %s\n", arenaname);
+		goto out;
+	}
+
+	hprint(&c->hout, "<h2>arena %s</h2>\n", arenaname);
+	hprint(&c->hout, "<pre>start=%#llx end=%#llx</pre>\n", start, end);
+	if(end < start || end - start < HeadSize){
+		hprint(&c->hout, "bad size %#llx\n", end - start);
+		goto out;
+	}
+
+	/* read arena header, tail */
+	blk = vtmalloc(HeadSize);
+	if(readpart(p, start, blk, HeadSize) != HeadSize){
+		hprint(&c->hout, "reading header: %r\n");
+		vtfree(blk);
+		goto out;
+	}
+	if(unpackarenahead(&head, blk) < 0){
+		hprint(&c->hout, "corrupt arena header: %r\n");
+		vtfree(blk);
+		goto out;
+	}
+	vtfree(blk);
+
+	hprint(&c->hout, "head:\n<pre>\n");
+	hprint(&c->hout, "version=%d name=%s blocksize=%d size=%#llx clumpmagic=%#ux\n",
+		head.version, head.name, head.blocksize, head.size,
+		head.clumpmagic);
+	hprint(&c->hout, "</pre><br><br>\n");
+
+	/*
+	 * the block size comes from disk: reject values too small to hold
+	 * even one clump info entry (including 0) before the tail block
+	 * is allocated and unpacked
+	 */
+	if(head.blocksize < ClumpInfoSize || head.blocksize > MaxIoSize
+	|| head.blocksize >= end - start){
+		hprint(&c->hout, "corrupt block size %d\n", head.blocksize);
+		goto out;
+	}
+
+	blk = vtmalloc(head.blocksize);
+	if(readpart(p, end - head.blocksize, blk, head.blocksize) != head.blocksize){
+		hprint(&c->hout, "reading tail: %r\n");
+		vtfree(blk);
+		goto out;
+	}
+	memset(&arena, 0, sizeof arena);
+	arena.part = p;
+	arena.blocksize = head.blocksize;
+	arena.clumpmax = head.blocksize / ClumpInfoSize;
+	/* the data area lies between the head block and the tail block */
+	arena.base = start + head.blocksize;
+	arena.size = end - start - 2 * head.blocksize;
+	if(unpackarena(&arena, blk) < 0){
+		hprint(&c->hout, "corrupt arena tail: %r\n");
+		vtfree(blk);
+		goto out;
+	}
+	/* the arena score occupies the last VtScoreSize bytes of the tail block */
+	scorecp(arena.score, blk+head.blocksize - VtScoreSize);
+
+	vtfree(blk);
+
+	hprint(&c->hout, "tail:\n<pre>\n");
+	hprint(&c->hout, "version=%d name=%s\n", arena.version, arena.name);
+	hprint(&c->hout, "ctime=%d %s\n", arena.ctime, fmttime(tbuf, arena.ctime));
+	hprint(&c->hout, "wtime=%d %s\n", arena.wtime, fmttime(tbuf, arena.wtime));
+	hprint(&c->hout, "clumpmagic=%#ux\n", arena.clumpmagic);
+	hprint(&c->hout, "score %V\n", arena.score);
+	hprint(&c->hout, "diskstats:\n");
+	hprint(&c->hout, "\tclumps=%,d cclumps=%,d used=%,lld uncsize=%,lld sealed=%d\n",
+		arena.diskstats.clumps, arena.diskstats.cclumps,
+		arena.diskstats.used, arena.diskstats.uncsize,
+		arena.diskstats.sealed);
+	hprint(&c->hout, "memstats:\n");
+	hprint(&c->hout, "\tclumps=%,d cclumps=%,d used=%,lld uncsize=%,lld sealed=%d\n",
+		arena.memstats.clumps, arena.memstats.cclumps,
+		arena.memstats.used, arena.memstats.uncsize,
+		arena.memstats.sealed);
+	hprint(&c->hout, "</pre>\n");
+	if(arena.clumpmax == 0){
+		hprint(&c->hout, "bad clumpmax\n");
+		goto out;
+	}
+
+	score = hargstr(c, "score", "");
+	clump = hargstr(c, "clump", "");
+
+	if(clump[0]){
+		off = strtoull(clump, 0, 0);
+		diskarenaclump(c, &arena, off, score[0] ? score : nil);
+	}else if(score[0]){
+		diskarenaclump(c, &arena, -1, score);
+	}else{
+		diskarenatoc(c, &arena);
+	}
+
+out:
+	free(table);
+	return 0;
+}
+
+/*
+ * Search the clump info directory of arena for score.  The directory
+ * is read one block at a time, walking backward from the end of the
+ * arena's data area.  Returns the sum of ClumpSize plus the recorded
+ * size of every clump that precedes the match, or -1 if the score is
+ * not found or a directory block cannot be read (reported on c if
+ * c is not nil).
+ */
+static vlong
+findintoc(HConnect *c, Arena *arena, uchar *score)
+{
+	uchar *dirblk;
+	int i;
+	vlong diroff, clumpoff, found;
+	ClumpInfo ci;
+
+	found = -1;
+	clumpoff = 0;
+	diroff = arena->base + arena->size;
+	dirblk = vtmalloc(arena->blocksize);
+	for(i=0; i<arena->memstats.clumps; i++){
+		/* first entry of a directory block: fetch the next block down */
+		if(i%arena->clumpmax == 0){
+			diroff -= arena->blocksize;
+			if(readpart(arena->part, diroff, dirblk, arena->blocksize) != arena->blocksize){
+				if(c)
+					hprint(&c->hout, "<i>clump info directory at %#llx: %r</i>\n<br>\n",
+						diroff);
+				break;
+			}
+		}
+		unpackclumpinfo(&ci, dirblk+(i%arena->clumpmax)*ClumpInfoSize);
+		if(scorecmp(ci.score, score) == 0){
+			found = clumpoff;
+			break;
+		}
+		clumpoff += ClumpSize + ci.size;
+	}
+	vtfree(dirblk);
+	return found;
+}
+
+
+/*
+ * Print the arena's table of contents on c as HTML.
+ * The clump info directory is walked block by block (clumpmax
+ * entries per block).  The first entry of every block is always
+ * shown; all entries are shown for the block selected by the
+ * "cib" request argument (default 0).  Each shown entry links to
+ * the clump page, and entries outside the selected block carry a
+ * "more" link that selects their block.
+ * Once a directory block cannot be read, clump offsets are no
+ * longer known (coff = -1) and the offset links are omitted.
+ * Always returns 0.
+ */
+static int
+diskarenatoc(HConnect *c, Arena *arena)
+{
+	uchar *blk;
+	int i;
+	vlong off;
+	vlong coff;
+	ClumpInfo ci;
+	char base[512];
+	int cib;
+
+	snprint(base, sizeof base, "/disk?disk=%s&type=a&arena=%s",
+		arena->part->name, arena->name);
+
+	blk = vtmalloc(arena->blocksize);
+	off = arena->base + arena->size;
+	hprint(&c->hout, "<h2>table of contents</h2>\n");
+	hprint(&c->hout, "<pre>\n");
+	hprint(&c->hout, "%5s %6s %7s %s\n", "type", "size", "uncsize", "score");
+	coff = 0;
+	cib = hargint(c, "cib", 0);
+
+	for(i=0; i<arena->memstats.clumps; i++){
+		if(i%arena->clumpmax == 0){
+			off -= arena->blocksize;
+			if(readpart(arena->part, off, blk, arena->blocksize) != arena->blocksize){
+				hprint(&c->hout, "<i>clump info directory at %#llx: %r</i>\n<br>\n",
+					off);
+				/* skip the rest of this block; offsets are lost from here on */
+				i += arena->clumpmax-1;
+				coff = -1;
+				continue;
+			}
+		}
+		unpackclumpinfo(&ci, blk+(i%arena->clumpmax)*ClumpInfoSize);
+		if(i/arena->clumpmax == cib || i%arena->clumpmax == 0){
+			hprint(&c->hout, "%5d %6d %7d %V",
+				ci.type, ci.size, ci.uncsize, ci.score);
+			if(coff >= 0)
+				hprint(&c->hout, " at <a href=\"%s&clump=%#llx&score=%V\">%#llx</a>",
+					base, coff, ci.score, coff);
+			if(i/arena->clumpmax != cib)
+				hprint(&c->hout, " <font size=-1><a href=\"%s&cib=%d\">more</a></font>", base, i/arena->clumpmax);
+			hprint(&c->hout, "\n");
+		}
+		if(coff >= 0)
+			coff += ClumpSize + ci.size;
+	}
+	hprint(&c->hout, "</pre>\n");
+	/* the directory buffer was previously leaked on every request */
+	vtfree(blk);
+	return 0;
+}
+
+/* big-endian 32-bit load; each byte is widened before shifting */
+#define U32GET(p) (((u32int)(p)[0]<<24)|((u32int)(p)[1]<<16)|((u32int)(p)[2]<<8)|(u32int)(p)[3])
+
+/*
+ * Print on c a description of the clump at offset off within the
+ * arena's data (off is added to arena->base before reading).
+ * If scorestr is non-nil it is parsed as the expected score; when
+ * off is negative the clump is first located with findintoc.
+ * The clump header is decoded (retrying with the magic found on
+ * disk if the arena's magic does not match), the data is
+ * uncompressed if needed, and the computed score is printed and
+ * compared with the expected one.
+ * Returns 0 on success, -1 after printing a diagnostic on failure.
+ */
+static int
+diskarenaclump(HConnect *c, Arena *arena, vlong off, char *scorestr)
+{
+	uchar *blk, *blk2;
+	Clump cl;
+	char err[ERRMAX];
+	uchar xscore[VtScoreSize], score[VtScoreSize];
+	Unwhack uw;
+	int n;
+
+	if(scorestr){
+		if(vtparsescore(scorestr, nil, score) < 0){
+			hprint(&c->hout, "bad score %s: %r\n", scorestr);
+			return -1;
+		}
+		if(off < 0){
+			off = findintoc(c, arena, score);
+			if(off < 0){
+				hprint(&c->hout, "score %V not found in arena %s\n", score, arena->name);
+				return -1;
+			}
+			hprint(&c->hout, "score %V at %#llx\n", score, off);
+		}
+	}else
+		memset(score, 0, sizeof score);
+
+	if(off < 0){
+		hprint(&c->hout, "bad offset %#llx\n", off);
+		return -1;
+	}
+
+	off += arena->base;
+
+	blk = vtmalloc(ClumpSize + VtMaxLumpSize);
+	if(readpart(arena->part, off, blk, ClumpSize + VtMaxLumpSize) != ClumpSize + VtMaxLumpSize){
+		hprint(&c->hout, "reading at %#llx: %r\n", off);
+		vtfree(blk);
+		return -1;
+	}
+
+	if(unpackclump(&cl, blk, arena->clumpmagic) < 0){
+		hprint(&c->hout, "unpackclump: %r\n<br>");
+		rerrstr(err, sizeof err);
+		if(strstr(err, "magic")){
+			/* wrong magic: retry with whatever magic is stored in the header */
+			hprint(&c->hout, "trying again with magic=%#ux<br>\n", U32GET(blk));
+			if(unpackclump(&cl, blk, U32GET(blk)) < 0){
+				hprint(&c->hout, "unpackclump: %r\n<br>\n");
+				goto error;
+			}
+		}else
+			goto error;
+	}
+
+	hprint(&c->hout, "<pre>type=%d size=%d uncsize=%d score=%V\n", cl.info.type, cl.info.size, cl.info.uncsize, cl.info.score);
+	/* err is reused here only as scratch space for fmttime */
+	hprint(&c->hout, "encoding=%d creator=%d time=%d %s</pre>\n", cl.encoding, cl.creator, cl.time, fmttime(err, cl.time));
+
+	if(cl.info.type == VtCorruptType)
+		hprint(&c->hout, "clump is marked corrupt<br>\n");
+
+	if(cl.info.size >= VtMaxLumpSize){
+		hprint(&c->hout, "clump too big\n");
+		goto error;
+	}
+	/* blk2 below holds VtMaxLumpSize bytes; never let unwhack write more */
+	if(cl.info.uncsize > VtMaxLumpSize){
+		hprint(&c->hout, "uncompressed size too big\n");
+		goto error;
+	}
+
+	switch(cl.encoding){
+	case ClumpECompress:
+		blk2 = vtmalloc(VtMaxLumpSize);
+		unwhackinit(&uw);
+		n = unwhack(&uw, blk2, cl.info.uncsize, blk+ClumpSize, cl.info.size);
+		if(n < 0){
+			hprint(&c->hout, "decompression failed\n");
+			vtfree(blk2);
+			goto error;
+		}
+		if(n != cl.info.uncsize){
+			hprint(&c->hout, "got wrong amount: %d wanted %d\n", n, cl.info.uncsize);
+			// hhex(blk2, n);
+			vtfree(blk2);
+			goto error;
+		}
+		scoremem(xscore, blk2, cl.info.uncsize);
+		vtfree(blk2);
+		break;
+	case ClumpENone:
+		scoremem(xscore, blk+ClumpSize, cl.info.size);
+		break;
+	default:
+		/* without this, xscore would be printed uninitialized */
+		hprint(&c->hout, "unknown encoding %d\n", cl.encoding);
+		goto error;
+	}
+
+	hprint(&c->hout, "score=%V<br>\n", xscore);
+	if(scorestr && scorecmp(score, xscore) != 0)
+		hprint(&c->hout, "score does NOT match expected %V\n", score);
+
+	vtfree(blk);
+	return 0;
+
+error:
+	// hhex(blk, ClumpSize + VtMaxLumpSize);
+	vtfree(blk);
+	return -1;
+}
+
+/*
+ * Placeholder: ignores all of its arguments, prints nothing
+ * and returns 0.
+ */
+static int
+diskbloom(HConnect *c, char *disk, Part *p)
+{
+ USED(c);
+ USED(disk);
+ USED(p);
+ return 0;
+}
+
+/*
+ * Placeholder: ignores all of its arguments, prints nothing
+ * and returns 0.
+ */
+static int
+diskisect(HConnect *c, char *disk, Part *p)
+{
+ USED(c);
+ USED(disk);
+ USED(p);
+ return 0;
+}
+
+/*
+ * Print the main index's arena map on c: one line per arena
+ * giving its name and its start and stop addresses.
+ */
+static void
+debugamap(HConnect *c)
+{
+	int i;
+	AMap *amap;
+
+	hprint(&c->hout, "<h2>arena map</h2>\n");
+	hprint(&c->hout, "<pre>\n");
+
+	amap = mainindex->amap;
+	for(i=0; i<mainindex->narenas; i++)
+		hprint(&c->hout, "%s %#llx %#llx\n",
+			amap[i].name, amap[i].start, amap[i].stop);
+
+	/* close the element opened above */
+	hprint(&c->hout, "</pre>\n");
+}
+
+/*
+ * Trace on c how score is resolved.
+ * First the index cache (icachelookup) and the on-disk index
+ * (loadientry) are queried.  Then, for every type below VtMaxType,
+ * the lump cache, lookupscore, amapitoa and loadclump are tried in
+ * turn and each result is printed; the walk for a type stops at
+ * the first step that fails.  If the request has brute=y, every
+ * arena's clump directory is additionally searched with findintoc.
+ */
+static void
+debugread(HConnect *c, u8int *score)
+{
+	int type;
+	Lump *u;
+	IAddr ia;
+	IEntry ie;
+	int i;
+	Arena *arena;
+	u64int aa;
+	ZBlock *zb;
+	Clump cl;
+	vlong off;
+	u8int sc[VtScoreSize];
+
+	if(scorecmp(score, zeroscore) == 0){
+		hprint(&c->hout, "zero score\n");
+		return;
+	}
+
+	hprint(&c->hout, "<h2>index search %V</h2><pre>\n", score);
+	if(icachelookup(score, -1, &ia) < 0)
+		hprint(&c->hout, " icache: not found\n");
+	else
+		hprint(&c->hout, " icache: addr=%#llx size=%d type=%d blocks=%d\n",
+			ia.addr, ia.size, ia.type, ia.blocks);
+
+	if(loadientry(mainindex, score, -1, &ie) < 0)
+		hprint(&c->hout, " idisk: not found\n");
+	else
+		hprint(&c->hout, " idisk: addr=%#llx size=%d type=%d blocks=%d\n",
+			ie.ia.addr, ie.ia.size, ie.ia.type, ie.ia.blocks);
+
+	hprint(&c->hout, "</pre><h2>lookup %V</h2>\n", score);
+	hprint(&c->hout, "<pre>\n");
+
+	for(type=0; type < VtMaxType; type++){
+		hprint(&c->hout, "%V type %d:", score, type);
+		u = lookuplump(score, type);
+		if(u->data != nil)
+			hprint(&c->hout, " +cache");
+		else
+			hprint(&c->hout, " -cache");
+		putlump(u);
+
+		if(lookupscore(score, type, &ia) < 0){
+			hprint(&c->hout, " -lookup\n");
+			continue;
+		}
+		hprint(&c->hout, "\n lookupscore: addr=%#llx size=%d blocks=%d\n",
+			ia.addr, ia.size, ia.blocks);
+
+		arena = amapitoa(mainindex, ia.addr, &aa);
+		if(arena == nil){
+			hprint(&c->hout, " amapitoa failed: %r\n");
+			continue;
+		}
+
+		hprint(&c->hout, " amapitoa: aa=%#llx arena="
+			"<a href=\"/disk?disk=%s&type=a&arena=%s&score=%V\">%s</a>\n",
+			aa, arena->part->name, arena->name, score, arena->name);
+		zb = loadclump(arena, aa, ia.blocks, &cl, sc, 1);
+		if(zb == nil){
+			hprint(&c->hout, " loadclump failed: %r\n");
+			continue;
+		}
+
+		hprint(&c->hout, " loadclump: uncsize=%d type=%d score=%V\n",
+			cl.info.uncsize, cl.info.type, sc);
+		if(ia.size != cl.info.uncsize || ia.type != cl.info.type || scorecmp(score, sc) != 0)
+			hprint(&c->hout, " clump info mismatch\n");
+		/* only the header and score are reported; release the data block */
+		freezblock(zb);
+	}
+
+	if(hargstr(c, "brute", "")[0] == 'y'){
+		hprint(&c->hout, "</pre>\n");
+		hprint(&c->hout, "<h2>brute force arena search %V</h2>\n", score);
+		hprint(&c->hout, "<pre>\n");
+
+		for(i=0; i<mainindex->narenas; i++){
+			arena = mainindex->arenas[i];
+			hprint(&c->hout, "%s...\n", arena->name);
+			/* flush so progress is visible while the search runs */
+			hflush(&c->hout);
+			off = findintoc(nil, arena, score);
+			if(off >= 0)
+				hprint(&c->hout, "%s %#llx (%#llx)\n", arena->name, off, mainindex->amap[i].start + off);
+		}
+	}
+
+	hprint(&c->hout, "</pre>\n");
+}
+
+/*
+ * Print on c the addresses of the main index and of the tables
+ * hanging off it (arenas, smap, amap, bloom filter, index sections).
+ */
+static void
+debugmem(HConnect *c)
+{
+	Index *ix;
+
+	ix = mainindex;
+	hprint(&c->hout, "<h2>memory</h2>\n");
+
+	hprint(&c->hout, "<pre>\n");
+	hprint(&c->hout, "ix=%p\n", ix);
+	hprint(&c->hout, "\tarenas=%p\n", ix->arenas);
+	if(ix->narenas > 0)
+		hprint(&c->hout, "\tarenas[...] = %p...%p\n", ix->arenas[0], ix->arenas[ix->narenas-1]);
+	hprint(&c->hout, "\tsmap=%p\n", ix->smap);
+	hprint(&c->hout, "\tamap=%p\n", ix->amap);
+	hprint(&c->hout, "\tbloom=%p\n", ix->bloom);
+	hprint(&c->hout, "\tbloom->data=%p\n", ix->bloom ? ix->bloom->data : nil);
+	hprint(&c->hout, "\tisects=%p\n", ix->sects);
+	if(ix->nsects > 0)
+		hprint(&c->hout, "\tsects[...] = %p...%p\n", ix->sects[0], ix->sects[ix->nsects-1]);
+	/* close the element opened above */
+	hprint(&c->hout, "</pre>\n");
+}
+
+/*
+ * HTTP handler for the debug page.  The "op" request argument
+ * selects the report: "amap", "mem", or "read" (which also needs a
+ * "score" argument).  Returns -1 only if the HTML response could
+ * not be started; every other outcome, including a bad or missing
+ * op, is reported in the page and returns 0.
+ */
+int
+hdebug(HConnect *c)
+{
+	char *op, *s;
+	u8int score[VtScoreSize];
+
+	if(hsethtml(c) < 0)
+		return -1;
+	hprint(&c->hout, "<h1>venti debug</h1>\n");
+
+	op = hargstr(c, "op", "");
+	if(op[0] == '\0')
+		hprint(&c->hout, "no op\n");
+	else if(strcmp(op, "amap") == 0)
+		debugamap(c);
+	else if(strcmp(op, "mem") == 0)
+		debugmem(c);
+	else if(strcmp(op, "read") == 0){
+		s = hargstr(c, "score", "");
+		if(vtparsescore(s, nil, score) < 0)
+			hprint(&c->hout, "bad score %s: %r\n", s);
+		else
+			debugread(c, score);
+	}else
+		hprint(&c->hout, "unknown op %s", op);
+	return 0;
+}
diff --git a/sys/src/cmd/venti/srv/hproc.c b/sys/src/cmd/venti/srv/hproc.c
new file mode 100755
index 000000000..7a22ec251
--- /dev/null
+++ b/sys/src/cmd/venti/srv/hproc.c
@@ -0,0 +1,674 @@
+#include "stdinc.h"
+#include <bio.h>
+#include <mach.h>
+#include <ureg.h>
+#include "/sys/src/libthread/threadimpl.h"
+#include "dat.h"
+#include "fns.h"
+
+typedef struct Ureg Ureg;
+typedef struct Debug Debug;
+
+/*
+ * State shared by the self-debugging routines in this file.
+ */
+struct Debug
+{
+ int textfd; /* text file opened by text(); -1 when none is open */
+ QLock lock; /* taken by hproc for the duration of a request */
+ Fhdr fhdr; /* executable header filled in by text() */
+ Map *map; /* address map installed by map() */
+ Fmt *fmt; /* destination of dprint output */
+ int pid; /* pid that debug.map was attached to */
+ char *stkprefix; /* printed before each stack trace line */
+ int pcoff; /* offset of the "PC" register, set by ureginit() */
+ int spoff; /* offset of the "SP" register, set by ureginit() */
+};
+
+/* only the first member is given: textfd starts out as -1 */
+static Debug debug = { -1 };
+
+/*
+ * Open #p/pid/text and load its header and symbol table into
+ * debug.fhdr, then select the machine type with machbytype.
+ * Any text file left open by an earlier call is closed first.
+ * Returns 0 on success, -1 on failure.
+ */
+static int
+text(int pid)
+{
+	char path[100];
+	int tfd;
+
+	if(debug.textfd >= 0){
+		close(debug.textfd);
+		debug.textfd = -1;
+	}
+	memset(&debug.fhdr, 0, sizeof debug.fhdr);
+
+	snprint(path, sizeof path, "#p/%d/text", pid);
+	if((tfd = open(path, OREAD)) < 0)
+		return -1;
+	if(crackhdr(tfd, &debug.fhdr) >= 0){
+		if(syminit(tfd, &debug.fhdr) >= 0){
+			debug.textfd = tfd;
+			machbytype(debug.fhdr.type);
+			return 0;
+		}
+		/* header without symbols is of no use: wipe it */
+		memset(&debug.fhdr, 0, sizeof debug.fhdr);
+	}
+	close(tfd);
+	return -1;
+}
+
+/*
+ * Release a map obtained from attachproc: close the descriptor
+ * of every in-use segment and free the map.  A descriptor that
+ * appears in several segments is closed only once, so that a
+ * number already reused by another open is not closed by mistake.
+ * A nil map is ignored.
+ */
+static void
+unmap(Map *m)
+{
+	int i, j;
+
+	if(m == nil)
+		return;
+	for(i=0; i<m->nsegs; i++){
+		if(!m->seg[i].inuse)
+			continue;
+		/* was this descriptor already closed for an earlier segment? */
+		for(j=0; j<i; j++)
+			if(m->seg[j].inuse && m->seg[j].fd == m->seg[i].fd)
+				break;
+		if(j == i)
+			close(m->seg[i].fd);
+	}
+	free(m);
+}
+
+/*
+ * Attach to the memory of process pid through #p/pid/mem and make
+ * the resulting map the current one (debug.map, debug.pid),
+ * releasing the previous map if there was one.
+ * Returns the new map, or nil on failure; on failure the current
+ * map is left untouched.
+ */
+static Map*
+map(int pid)
+{
+	char path[100];
+	int memfd;
+	Map *nm;
+
+	snprint(path, sizeof path, "#p/%d/mem", pid);
+	if((memfd = open(path, OREAD)) < 0)
+		return nil;
+
+	nm = attachproc(pid, 0, memfd, &debug.fhdr);
+	if(nm == nil){
+		close(memfd);
+		return nil;
+	}
+
+	if(debug.map != nil)
+		unmap(debug.map);
+	debug.map = nm;
+	debug.pid = pid;
+	return nm;
+}
+
+/*
+ * Formatted print into debug.fmt, which the caller must have set.
+ */
+static void
+dprint(char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	fmtvprint(debug.fmt, fmt, ap);
+	va_end(ap);
+}
+
+/*
+ * Copy the contents of this process's #p/pid/fd file (at most
+ * sizeof data - 1 bytes) to the debug output.
+ */
+static void
+openfiles(void)
+{
+	char data[4096];	/* holds the path first, then the file contents */
+	int fd, n;
+
+	snprint(data, sizeof data, "#p/%d/fd", getpid());
+	fd = open(data, OREAD);
+	if(fd < 0){
+		dprint("open %s: %r\n", data);
+		return;
+	}
+	n = readn(fd, data, sizeof data-1);
+	close(fd);
+	if(n < 0)
+		return;
+	data[n] = 0;
+	fmtstrcpy(debug.fmt, data);
+}
+
+/*
+ * Dump the raw symbol table: value, type letter and name of each
+ * symbol returned by getsym.  Types 't'/'l' are shown as 't',
+ * 'T'/'L' as 'T'; D d B b a p m are shown as they are; symbols of
+ * any other type are omitted.
+ */
+static void
+printsym(void)
+{
+	int n;
+	Sym *sym;
+
+	for(n = 0; (sym = getsym(n)) != nil; n++){
+		switch(sym->type){
+		default:
+			continue;
+		case 't':
+		case 'l':
+			dprint("%16#llux t %s\n", sym->value, sym->name);
+			continue;
+		case 'T':
+		case 'L':
+			dprint("%16#llux T %s\n", sym->value, sym->name);
+			continue;
+		case 'D':
+		case 'd':
+		case 'B':
+		case 'b':
+		case 'a':
+		case 'p':
+		case 'm':
+			dprint("%16#llux %c %s\n", sym->value, sym->type, sym->name);
+			continue;
+		}
+	}
+}
+
+/*
+ * Print the heading s followed by one line per in-use segment of
+ * map giving its name and its b, e and f fields.
+ * Prints nothing at all when map is nil.
+ */
+static void
+printmap(char *s, Map *map)
+{
+	int n;
+
+	if(map == nil)
+		return;
+	dprint("%s\n", s);
+	for(n = 0; n < map->nsegs; n++){
+		if(!map->seg[n].inuse)
+			continue;
+		dprint("%-16s %-16#llux %-16#llux %-16#llux\n",
+			map->seg[n].name, map->seg[n].b,
+			map->seg[n].e, map->seg[n].f);
+	}
+}
+
+/*
+ * Print the automatic (CAUTO) variables of function fn for the
+ * frame at fp: one line per variable with its name, prefixed by
+ * debug.stkprefix, and the word read at fp - s.value, or "?" when
+ * the word cannot be read.
+ */
+static void
+printlocals(Map *map, Symbol *fn, uintptr fp)
+{
+	int i;
+	uintptr w;
+	uvlong v;
+	Symbol s;
+	char buf[100];
+
+	s = *fn;
+	for (i = 0; localsym(&s, i); i++) {
+		if (s.class != CAUTO)
+			continue;
+		snprint(buf, sizeof buf, "%s%s/", debug.stkprefix, s.name);
+		/*
+		 * geta stores through a uvlong*; read into a real uvlong
+		 * rather than a cast uintptr, which is too small when
+		 * pointers are 32 bits.
+		 */
+		if (geta(map, fp - s.value, &v) > 0){
+			w = v;
+			dprint("\t%-10s %10#p %ld\n", buf, w, w);
+		}else
+			dprint("\t%-10s ?\n", buf);
+	}
+}
+
+/*
+ * Print the parameters (CPARAM) of function fn for the frame at
+ * fp as a comma-separated list of name=value.  Parameters whose
+ * value cannot be read are left out (their separator is still
+ * printed).
+ */
+static void
+printparams(Map *map, Symbol *fn, uintptr fp)
+{
+	int i;
+	Symbol s;
+	uintptr w;
+	uvlong v;
+	int first = 0;
+
+	fp += mach->szaddr; /* skip saved pc */
+	s = *fn;
+	for (i = 0; localsym(&s, i); i++) {
+		if (s.class != CPARAM)
+			continue;
+		if (first++)
+			dprint(", ");
+		/*
+		 * geta stores through a uvlong*; read into a real uvlong
+		 * rather than a cast uintptr, which is too small when
+		 * pointers are 32 bits.
+		 */
+		if (geta(map, fp + s.value, &v) > 0){
+			w = v;
+			dprint("%s=%#p", s.name, w);
+		}
+	}
+}
+
+/*
+ * Print whatever fileline reports for address dot;
+ * print nothing if fileline fails.
+ */
+static void
+printsource(uintptr dot)
+{
+	char where[100];
+
+	if(!fileline(where, sizeof where, dot))
+		return;
+	dprint("%s", where);
+}
+
+
+/*
+ * Stack trace callback, invoked once per frame by machdata->ctrace.
+ * nextpc carries the pc to report for the frame being printed: it
+ * starts as the function's entry (when zero) and is replaced by the
+ * pc passed to each call, for use by the following one.
+ */
+static uintptr nextpc;
+
+static void
+ptrace(Map *map, uvlong pc, uvlong sp, Symbol *sym)
+{
+	if(nextpc == 0)
+		nextpc = sym->value;
+	if(debug.stkprefix == nil)
+		debug.stkprefix = "";
+
+	/* name(params)+offset source */
+	dprint("%s%s(", debug.stkprefix, sym->name);
+	printparams(map, sym, sp);
+	dprint(")");
+	if(sym->value != nextpc)
+		dprint("+%#llux ", nextpc - sym->value);
+	printsource(nextpc);
+	dprint("\n");
+
+	printlocals(map, sym, sp);
+	nextpc = pc;
+}
+
+/*
+ * Print a stack trace of map m starting from the given pc and sp,
+ * using ptrace as the per-frame callback.
+ */
+static void
+stacktracepcsp(Map *m, uintptr pc, uintptr sp)
+{
+	nextpc = 0;
+	if(machdata->ctrace == nil){
+		dprint("no machdata->ctrace\n");
+		return;
+	}
+	if(machdata->ctrace(m, pc, sp, 0, ptrace) <= 0)
+		dprint("no stack frame: pc=%#p sp=%#p\n", pc, sp);
+}
+
+/*
+ * Record in debug.pcoff and debug.spoff the offsets of the
+ * registers named "PC" and "SP" in mach->reglist.
+ */
+static void
+ureginit(void)
+{
+	Reglist *reg;
+
+	for(reg = mach->reglist; reg->rname; reg++){
+		if(strcmp(reg->rname, "PC") == 0){
+			debug.pcoff = reg->roffs;
+			continue;
+		}
+		if(strcmp(reg->rname, "SP") == 0)
+			debug.spoff = reg->roffs;
+	}
+}
+
+/*
+ * Print a stack trace of map m starting from the pc and sp found
+ * at the register offsets debug.pcoff and debug.spoff.
+ */
+static void
+stacktrace(Map *m)
+{
+	/*
+	 * geta stores through a uvlong*, so these must be uvlongs:
+	 * a cast uintptr is too small when pointers are 32 bits.
+	 */
+	uvlong pc, sp;
+
+	if(geta(m, debug.pcoff, &pc) < 0){
+		dprint("geta pc: %r\n");
+		return;
+	}
+	if(geta(m, debug.spoff, &sp) < 0){
+		dprint("geta sp: %r\n");
+		return;
+	}
+	stacktracepcsp(m, pc, sp);
+}
+
+/*
+ * Read the address-sized word at addr through debug.map.
+ * Returns 0 when addr is 0, when no map is installed (reported
+ * once), or when the read fails (reported every time).
+ */
+static uintptr
+star(uintptr addr)
+{
+	/*
+	 * geta stores through a uvlong*, so this must be a uvlong:
+	 * a cast uintptr is too small when pointers are 32 bits.
+	 */
+	uvlong x;
+	static int warned;
+
+	if(addr == 0)
+		return 0;
+
+	if(debug.map == nil){
+		if(!warned++)
+			dprint("no debug.map\n");
+		return 0;
+	}
+	if(geta(debug.map, addr, &x) < 0){
+		dprint("geta %#p (pid=%d): %r\n", addr, debug.pid);
+		return 0;
+	}
+	return x;
+}
+
+/*
+ * Value of the symbol found by lookup(nil, name), or 0 if the
+ * lookup fails.
+ */
+static uintptr
+resolvev(char *name)
+{
+	Symbol sym;
+
+	if(lookup(nil, name, &sym) != 0)
+		return sym.value;
+	return 0;
+}
+
+/*
+ * Value of the symbol found by lookup(name, nil), or 0 if the
+ * lookup fails.
+ */
+static uintptr
+resolvef(char *name)
+{
+	Symbol sym;
+
+	if(lookup(name, nil, &sym) != 0)
+		return sym.value;
+	return 0;
+}
+
+/*
+ * FADDR gives the address of member name inside the object of the
+ * given type located at address p; FIELD reads the word stored
+ * there with star(), i.e. through debug.map.
+ */
+#define FADDR(type, p, name) ((p) + offsetof(type, name))
+#define FIELD(type, p, name) star(FADDR(type, p, name))
+
+/* pc chosen by tptrace; zero until one has been chosen */
+static uintptr threadpc;
+
+/*
+ * Compare the first strlen(pre) bytes of big against pre.
+ * Returns 0 exactly when big starts with pre, like strncmp.
+ */
+static int
+strprefix(char *big, char *pre)
+{
+	int n;
+
+	n = strlen(pre);
+	return strncmp(big, pre, n);
+}
+/*
+ * Stack trace callback used by threadstkline: remember in
+ * threadpc the first pc whose source file, as reported by
+ * fileline, is not under /sys/src/libc/ or /sys/src/libthread/.
+ * Once threadpc is set, later frames are ignored.
+ */
+static void
+tptrace(Map *map, uvlong pc, uvlong sp, Symbol *sym)
+{
+	char where[512];
+
+	USED(map);
+	USED(sym);
+	USED(sp);
+
+	if(threadpc != 0 || !fileline(where, sizeof where, pc))
+		return;
+	if(strprefix(where, "/sys/src/libc/") != 0
+	&& strprefix(where, "/sys/src/libthread/") != 0)
+		threadpc = pc;
+}
+
+/*
+ * Return a file:line description of where thread t is executing,
+ * i.e. the first frame outside libc and libthread as selected by
+ * tptrace.  A Running thread is traced from the registers at
+ * debug.pcoff/debug.spoff; any other thread is traced from the
+ * address of longjmp and the sp saved in its sched buffer.
+ * The result is "" when no trace function is available or
+ * fileline fails; otherwise it points to a static buffer that is
+ * overwritten by the next call.
+ */
+static char*
+threadstkline(uintptr t)
+{
+	uintptr pc, sp;
+	uvlong upc, usp;
+	static char buf[500];
+
+	if(FIELD(Thread, t, state) == Running){
+		/*
+		 * geta stores through a uvlong*, so read into uvlongs;
+		 * they start at 0 so that a failed read cannot leave
+		 * pc or sp uninitialized.
+		 */
+		upc = 0;
+		usp = 0;
+		geta(debug.map, debug.pcoff, &upc);
+		geta(debug.map, debug.spoff, &usp);
+		pc = upc;
+		sp = usp;
+	}else{
+		// pc = FIELD(Thread, t, sched[JMPBUFPC]);
+		pc = resolvef("longjmp");
+		sp = FIELD(Thread, t, sched[JMPBUFSP]);
+	}
+	if(machdata->ctrace == nil)
+		return "";
+	threadpc = 0;
+	machdata->ctrace(debug.map, pc, sp, 0, tptrace);
+	if(!fileline(buf, sizeof buf, threadpc))
+		buf[0] = 0;
+	return buf;
+}
+
+/*
+ * Print one line for the Proc at address p: its address, its pid,
+ * and "Sched" when its thread field is zero, "Running" otherwise.
+ */
+static void
+proc(uintptr p)
+{
+	/* FIELD yields a uintptr; %d takes an int */
+	dprint("p=(Proc)%#p pid %d ", p, (int)FIELD(Proc, p, pid));
+	if(FIELD(Proc, p, thread) == 0)
+		dprint(" Sched\n");
+	else
+		dprint(" Running\n");
+}
+
+/*
+ * Set up f to format into the len-byte buffer buf with no flush
+ * routine.  The last byte of buf is kept out of f's reach so that
+ * fmtbufflush can always store a terminating NUL.
+ */
+static void
+fmtbufinit(Fmt *f, char *buf, int len)
+{
+	memset(f, 0, sizeof *f);
+	f->start = buf;
+	f->to = buf;
+	f->stop = buf + len - 1;
+	f->runes = 0;
+	f->nfmt = 0;
+	f->flush = nil;
+	f->farg = nil;
+}
+
+/*
+ * NUL-terminate the text formatted into f so far and return the
+ * start of its buffer.
+ */
+static char*
+fmtbufflush(Fmt *f)
+{
+	char *end;
+
+	end = (char*)f->to;
+	*end = 0;
+	return (char*)f->start;
+}
+
+/*
+ * Copy the NUL-terminated string at address s, read one byte at a
+ * time through debug.map, into a static buffer.  Copying stops at
+ * the first NUL, at the first byte that cannot be read, or when
+ * the buffer is full.  The returned buffer is overwritten by the
+ * next call.
+ */
+static char*
+debugstr(uintptr s)
+{
+	static char buf[4096];
+	char *w;
+
+	for(w = buf; w < buf+sizeof buf - 1; w++)
+		if(get1(debug.map, s++, (uchar*)w, 1) < 0 || *w == 0)
+			break;
+	*w = 0;
+	return buf;
+}
+
+/*
+ * Format a one-line description of the Thread at address t: its
+ * address, state, source position (threadstkline), a Moribund
+ * marker and, when set, its cmdname.  Returns a pointer to a
+ * static buffer that is overwritten by the next call.
+ */
+static char*
+threadfmt(uintptr t)
+{
+	static char buf[4096];
+	Fmt fmt;
+	int s;
+	uintptr name;
+
+	fmtbufinit(&fmt, buf, sizeof buf);
+
+	fmtprint(&fmt, "t=(Thread)%#p ", t);
+	switch(s = FIELD(Thread, t, state)){
+	case Running:
+		fmtprint(&fmt, " Running ");
+		break;
+	case Ready:
+		fmtprint(&fmt, " Ready ");
+		break;
+	case Rendezvous:
+		fmtprint(&fmt, " Rendez ");
+		break;
+	default:
+		fmtprint(&fmt, " bad state %d ", s);
+		break;
+	}
+
+	fmtprint(&fmt, "%s", threadstkline(t));
+
+	if(FIELD(Thread, t, moribund) == 1)
+		fmtprint(&fmt, " Moribund");
+	/*
+	 * cmdname is an address: keep it in a uintptr rather than
+	 * squeezing it through the int used for the state.
+	 */
+	name = FIELD(Thread, t, cmdname);
+	if(name != 0)
+		fmtprint(&fmt, " [%s]", debugstr(name));
+
+	fmtbufflush(&fmt);
+	return buf;
+}
+
+
+/*
+ * Print the threadfmt description of the Thread at address t,
+ * followed by a newline.
+ */
+static void
+thread(uintptr t)
+{
+ dprint("%s\n", threadfmt(t));
+}
+
+/*
+ * Call fn on every Thread in the threads queue of the Proc at
+ * address p.  The memory of that proc's pid is mapped while the
+ * list is walked and the previously mapped pid is mapped again
+ * afterwards.  Does nothing if the proc cannot be mapped.
+ */
+static void
+threadapply(uintptr p, void (*fn)(uintptr))
+{
+	int savedpid, pid;
+	uintptr q, th;
+
+	savedpid = debug.pid;
+	pid = FIELD(Proc, p, pid);
+	if(map(pid) == nil)
+		return;
+	q = FADDR(Proc, p, threads);
+	for(th = FIELD(Tqueue, q, head); th != 0; th = FIELD(Thread, th, nextt))
+		fn(th);
+	map(savedpid);
+}
+
+/*
+ * Print the description of thread t indented by one tab.
+ */
+static void
+pthreads1(uintptr t)
+{
+ dprint("\t");
+ thread(t);
+}
+
+/*
+ * Print every thread of the Proc at address p, one per line.
+ */
+static void
+pthreads(uintptr p)
+{
+ threadapply(p, pthreads1);
+}
+
+/*
+ * Print the Proc at address p followed by its threads.
+ */
+static void
+lproc(uintptr p)
+{
+ proc(p);
+ pthreads(p);
+}
+
+/*
+ * Call fn on every Proc in the queue found at the symbol
+ * _threadpq, following the next links.  Reports "no thread run
+ * queue" if the symbol cannot be resolved.
+ */
+static void
+procapply(void (*fn)(uintptr))
+{
+	uintptr q, p;
+
+	q = resolvev("_threadpq");
+	if(q == 0){
+		dprint("no thread run queue\n");
+		return;
+	}
+
+	for(p = FIELD(Pqueue, q, head); p != 0; p = FIELD(Proc, p, next))
+		fn(p);
+}
+
+/*
+ * /proc/threads page: every proc followed by its threads.
+ * The connection is unused; output goes through dprint.
+ */
+static void
+threads(HConnect *c)
+{
+ USED(c);
+ procapply(lproc);
+}
+
+/*
+ * /proc/procs page: one line per proc.
+ * The connection is unused; output goes through dprint.
+ */
+static void
+procs(HConnect *c)
+{
+ USED(c);
+ procapply(proc);
+}
+
+/*
+ * Print a stack trace for the Thread at address t.  A Running
+ * thread is traced from the mapped registers; any other thread is
+ * traced from the address of longjmp and the sp saved in its
+ * sched buffer.
+ */
+static void
+threadstack(uintptr t)
+{
+	uintptr pc, sp;
+
+	if(FIELD(Thread, t, state) == Running){
+		stacktrace(debug.map);
+		return;
+	}
+	// pc = FIELD(Thread, t, sched[JMPBUFPC]);
+	pc = resolvef("longjmp");
+	sp = FIELD(Thread, t, sched[JMPBUFSP]);
+	stacktracepcsp(debug.map, pc, sp);
+}
+
+
+/*
+ * Print the description of thread t, indented by one tab,
+ * followed by its stack trace and a blank line.
+ */
+static void
+tstacks(uintptr t)
+{
+ dprint("\t");
+ thread(t);
+ threadstack(t);
+ dprint("\n");
+}
+
+/*
+ * Print the Proc at address p followed by the stack of each of
+ * its threads.
+ */
+static void
+pstacks(uintptr p)
+{
+ proc(p);
+ threadapply(p, tstacks);
+}
+
+/*
+ * /proc/stacks page: stack traces of every thread of every proc.
+ * Trace lines are prefixed with two tabs for the duration of the
+ * dump; the prefix is reset to "" afterwards.
+ */
+static void
+stacks(HConnect *c)
+{
+ USED(c);
+ debug.stkprefix = "\t\t";
+ procapply(pstacks);
+ debug.stkprefix = "";
+}
+
+/*
+ * /proc/symbols page: dump the symbol table.
+ */
+static void
+symbols(HConnect *c)
+{
+ USED(c);
+ printsym();
+}
+
+/*
+ * /proc/segment page: list the segments of the current map.
+ */
+static void
+segments(HConnect *c)
+{
+ USED(c);
+ printmap("segments", debug.map);
+}
+
+/*
+ * /proc/fd page: show this process's fd file.
+ */
+static void
+fds(HConnect *c)
+{
+ USED(c);
+ openfiles();
+}
+
+/*
+ * /proc/all page: the segment, fd, procs, threads and stacks
+ * reports in that order, each under a heading naming its own
+ * page.  The symbols heading is printed too, but the symbol dump
+ * itself is disabled.
+ */
+static void
+all(HConnect *c)
+{
+ dprint("/proc/segment\n");
+ segments(c);
+ dprint("\n/proc/fd\n");
+ fds(c);
+ dprint("\n/proc/procs\n");
+ procs(c);
+ dprint("\n/proc/threads\n");
+ threads(c);
+ dprint("\n/proc/stacks\n");
+ stacks(c);
+ dprint("\n# /proc/symbols\n");
+ // symbols(c);
+}
+
+/*
+ * HTTP handler for the /proc/ pages.  The request URI selects the
+ * report; unknown URIs get hnotfound.  The report is formatted
+ * into a static buffer while debug.lock is held and then sent as
+ * plain text.  If the lock is already taken the reply is
+ * "debugger is busy".  Returns -1 only when the text response
+ * could not be started.
+ */
+int
+hproc(HConnect *c)
+{
+	static struct {
+		char *uri;
+		void (*fn)(HConnect*);
+	} pages[] = {
+		{ "/proc/all", all },
+		{ "/proc/segment", segments },
+		{ "/proc/fd", fds },
+		{ "/proc/procs", procs },
+		{ "/proc/threads", threads },
+		{ "/proc/stacks", stacks },
+		{ "/proc/symbols", symbols },
+	};
+	static int beenhere;
+	static char buf[65536];
+	void (*fn)(HConnect*);
+	Fmt fmt;
+	int i;
+
+	if(!beenhere){
+		beenhere = 1;
+		ureginit();
+	}
+
+	fn = nil;
+	for(i = 0; i < sizeof pages/sizeof pages[0]; i++)
+		if(strcmp(c->req.uri, pages[i].uri) == 0){
+			fn = pages[i].fn;
+			break;
+		}
+	if(fn == nil)
+		return hnotfound(c);
+
+	if(hsettext(c) < 0)
+		return -1;
+	if(!canqlock(&debug.lock)){
+		hprint(&c->hout, "debugger is busy\n");
+		return 0;
+	}
+	/* the text file is opened once and kept for later requests */
+	if(debug.textfd < 0 && text(getpid()) < 0){
+		hprint(&c->hout, "cannot attach self text: %r\n");
+		goto out;
+	}
+	if(map(getpid()) == nil){
+		hprint(&c->hout, "cannot map self: %r\n");
+		goto out;
+	}
+
+	fmtbufinit(&fmt, buf, sizeof buf);
+	debug.fmt = &fmt;
+	fn(c);
+	hprint(&c->hout, "%s\n", fmtbufflush(&fmt));
+	debug.fmt = nil;
+out:
+	qunlock(&debug.lock);
+	return 0;
+}
diff --git a/sys/src/cmd/venti/srv/httpd.c b/sys/src/cmd/venti/srv/httpd.c
new file mode 100755
index 000000000..623d4e476
--- /dev/null
+++ b/sys/src/cmd/venti/srv/httpd.c
@@ -0,0 +1,1177 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+#include "xml.h"
+
+typedef struct HttpObj HttpObj;
+extern QLock memdrawlock;
+
+/* limits of the handler table below */
+enum
+{
+ ObjNameSize = 64,
+ MaxObjs = 64
+};
+
+/*
+ * One registered page: the URI it answers to and its handler.
+ * An entry whose name is empty is unused.
+ */
+struct HttpObj
+{
+ char name[ObjNameSize];
+ int (*f)(HConnect*);
+};
+
+/* handler table, filled front to back by httpdobj */
+static HttpObj objs[MaxObjs];
+
+/* directory served by fromwebdir; nil disables file serving */
+static char *webroot;
+
+static void listenproc(void*);
+static int estats(HConnect *c);
+static int dindex(HConnect *c);
+static int xindex(HConnect *c);
+static int xlog(HConnect *c);
+static int sindex(HConnect *c);
+static int hempty(HConnect *c);
+static int hlcacheempty(HConnect *c);
+static int hdcacheempty(HConnect *c);
+static int hicacheempty(HConnect *c);
+static int hicachekick(HConnect *c);
+static int hdcachekick(HConnect *c);
+static int hicacheflush(HConnect *c);
+static int hdcacheflush(HConnect *c);
+static int httpdobj(char *name, int (*f)(HConnect*));
+static int xgraph(HConnect *c);
+static int xset(HConnect *c);
+static int fromwebdir(HConnect *c);
+
+/*
+ * Start the HTTP server: install the date and URL print formats,
+ * register the built-in pages and start a proc listening on
+ * address (default "tcp!*!http").  dir becomes the web root used
+ * for requests that match no registered page.
+ * Returns 0 on success, -1 if the listener proc cannot be started.
+ */
+int
+httpdinit(char *address, char *dir)
+{
+	static struct {
+		char *name;
+		int (*f)(HConnect*);
+	} pages[] = {
+		{ "/stats", estats },
+		{ "/index", dindex },
+		{ "/storage", sindex },
+		{ "/xindex", xindex },
+		{ "/flushicache", hicacheflush },
+		{ "/flushdcache", hdcacheflush },
+		{ "/kickicache", hicachekick },
+		{ "/kickdcache", hdcachekick },
+		{ "/graph", xgraph },
+		{ "/set", xset },
+		{ "/log", xlog },
+		{ "/empty", hempty },
+		{ "/emptyicache", hicacheempty },
+		{ "/emptylumpcache", hlcacheempty },
+		{ "/emptydcache", hdcacheempty },
+		{ "/disk", hdisk },
+		{ "/debug", hdebug },
+		{ "/proc/", hproc },
+	};
+	int i;
+
+	fmtinstall('D', hdatefmt);
+/*	fmtinstall('H', httpfmt); */
+	fmtinstall('U', hurlfmt);
+
+	if(address == nil)
+		address = "tcp!*!http";
+	webroot = dir;
+
+	for(i = 0; i < sizeof pages/sizeof pages[0]; i++)
+		httpdobj(pages[i].name, pages[i].f);
+
+	if(vtproc(listenproc, address) < 0)
+		return -1;
+	return 0;
+}
+
+/*
+ * Register handler f for the page called name in the first free
+ * slot of objs.  Returns 0 on success; -1 if name is nil or too
+ * long for a slot, is already registered, or the table is full.
+ */
+static int
+httpdobj(char *name, int (*f)(HConnect*))
+{
+	HttpObj *o;
+
+	if(name == nil || strlen(name) >= ObjNameSize)
+		return -1;
+	for(o = objs; o < objs+MaxObjs; o++){
+		if(o->name[0] == '\0'){
+			/* length was checked above, so the copy fits */
+			strcpy(o->name, name);
+			o->f = f;
+			return 0;
+		}
+		if(strcmp(o->name, name) == 0)
+			return -1;
+	}
+	return -1;
+}
+
+/*
+ * Allocate a zeroed HConnect whose header pointers refer to the
+ * start of its own header buffer.  Exits via sysfatal if memory
+ * runs out.
+ */
+static HConnect*
+mkconnect(void)
+{
+	HConnect *hc;
+
+	hc = mallocz(sizeof(HConnect), 1);
+	if(hc == nil)
+		sysfatal("out of memory");
+	hc->hstop = hc->header;
+	hc->hpos = hc->header;
+	hc->replog = nil;
+	return hc;
+}
+
+void httpproc(void*);
+
+/*
+ * Listener proc: announce on the address passed as argument and,
+ * for every incoming call, accept it and start an httpproc on a
+ * fresh HConnect.  Returns (ending the proc) if the announce or a
+ * listen fails; a failed accept only drops that call.
+ */
+static void
+listenproc(void *vaddress)
+{
+	HConnect *c;
+	char *address, ndir[NETPATHLEN], dir[NETPATHLEN];
+	int ctl, nctl, data;
+
+	address = vaddress;
+	ctl = announce(address, dir);
+	if(ctl < 0){
+		fprint(2, "venti: httpd can't announce on %s: %r\n", address);
+		return;
+	}
+
+	if(0) print("announce ctl %d dir %s\n", ctl, dir);
+	for(;;){
+		/*
+		 * wait for a call (or an error)
+		 */
+		nctl = listen(dir, ndir);
+		if(0) print("httpd listen %d %s...\n", nctl, ndir);
+		if(nctl < 0){
+			fprint(2, "venti: httpd can't listen on %s: %r\n", address);
+			/* the listener is going away: release the announcement */
+			close(ctl);
+			return;
+		}
+
+		data = accept(ctl, ndir);
+		if(0) print("httpd accept %d...\n", data);
+		if(data < 0){
+			fprint(2, "venti: httpd accept: %r\n");
+			close(nctl);
+			continue;
+		}
+		if(0) print("httpd close nctl %d\n", nctl);
+		close(nctl);
+		c = mkconnect();
+		hinit(&c->hin, data, Hread);
+		hinit(&c->hout, data, Hwrite);
+		if(vtproc(httpproc, c) < 0){
+			/* nobody will serve this call: do not leak it */
+			fprint(2, "venti: httpd can't start proc: %r\n");
+			close(data);
+			free(c);
+		}
+	}
+}
+
+/*
+ * Serve one HTTP connection; v is the HConnect set up by
+ * listenproc.  Requests are handled one after another until a
+ * request cannot be parsed, a handler returns a negative value, or
+ * the connection is marked to be closed.  The connection's file
+ * descriptor is then closed and the HConnect freed.
+ */
+void
+httpproc(void *v)
+{
+ HConnect *c;
+ int ok, i, n;
+
+ c = v;
+
+ for(;;){
+ /*
+ * No timeout because the signal appears to hit every
+ * proc, not just us.
+ */
+ if(hparsereq(c, 0) < 0)
+ break;
+
+ /*
+ * A registered name ending in '/' matches any URI that
+ * starts with it; any other name must match the URI exactly.
+ */
+ for(i = 0; i < MaxObjs && objs[i].name[0]; i++){
+ n = strlen(objs[i].name);
+ if((objs[i].name[n-1] == '/' && strncmp(c->req.uri, objs[i].name, n) == 0)
+ || (objs[i].name[n-1] != '/' && strcmp(c->req.uri, objs[i].name) == 0)){
+ ok = (*objs[i].f)(c);
+ goto found;
+ }
+ }
+ /* no registered page: try a file under the web root */
+ ok = fromwebdir(c);
+ found:
+ hflush(&c->hout);
+ if(c->head.closeit)
+ ok = -1;
+ hreqcleanup(c);
+
+ if(ok < 0)
+ break;
+ }
+ hreqcleanup(c);
+ close(c->hin.fd);
+ free(c);
+}
+
+/*
+ * Return the value of the first search pair of the request whose
+ * key equals name, or def if there is none.
+ */
+char*
+hargstr(HConnect *c, char *name, char *def)
+{
+	HSPairs *sp;
+
+	sp = c->req.searchpairs;
+	while(sp != nil){
+		if(strcmp(sp->s, name) == 0)
+			return sp->t;
+		sp = sp->next;
+	}
+	return def;
+}
+
+/*
+ * Return the request argument name converted with atoll, or def
+ * if the argument is absent.
+ */
+vlong
+hargint(HConnect *c, char *name, vlong def)
+{
+	char *s;
+
+	s = hargstr(c, name, nil);
+	if(s == nil)
+		return def;
+	return atoll(s);
+}
+
+/*
+ * v as a percentage of total, with a total of 0 treated as 1.
+ * For v of a million or more, total is divided by 100 instead of
+ * multiplying v by 100.
+ */
+static int
+percent(ulong v, ulong total)
+{
+	if(total == 0)
+		total = 1;
+	if(v >= 1000*1000){
+		total /= 100;
+		if(total == 0)
+			total = 1;
+		return v / total;
+	}
+	return (v * 100) / total;
+}
+
+/*
+ * Parse the request headers and check that the request can be
+ * served: only GET and HEAD are allowed and Expect headers are
+ * refused.  Returns 0 if the request is acceptable, -1 if the
+ * headers cannot be parsed, and otherwise the result of the
+ * error reply (hunallowed or hfail).
+ */
+static int
+preq(HConnect *c)
+{
+	char *meth;
+
+	if(hparseheaders(c, 0) < 0)
+		return -1;
+	meth = c->req.meth;
+	if(strcmp(meth, "GET") != 0 && strcmp(meth, "HEAD") != 0)
+		return hunallowed(c, "GET, HEAD");
+	if(c->head.expectother || c->head.expectcont)
+		return hfail(c, HExpectFail, nil);
+	return 0;
+}
+
+/*
+ * Validate the request with preq and start a reply of the given
+ * content type.  When the request carries a version number the OK
+ * headers are written; HTTP/1.1 replies use chunked transfer
+ * encoding, other replies mark the connection to be closed.
+ * Returns 0 on success or preq's negative result.
+ */
+int
+hsettype(HConnect *c, char *type)
+{
+	Hio *out;
+	int rc;
+
+	if((rc = preq(c)) < 0)
+		return rc;
+
+	out = &c->hout;
+	if(c->req.vermaj != 0){
+		hokheaders(c);
+		hprint(out, "Content-type: %s\r\n", type);
+		if(http11(c))
+			hprint(out, "Transfer-Encoding: chunked\r\n");
+		hprint(out, "\r\n");
+	}
+
+	if(!http11(c)){
+		c->head.closeit = 1;
+		return 0;
+	}
+	hxferenc(out, 1);
+	return 0;
+}
+
+/*
+ * Start a UTF-8 HTML reply; see hsettype for the return value.
+ */
+int
+hsethtml(HConnect *c)
+{
+ return hsettype(c, "text/html; charset=utf-8");
+}
+
+/*
+ * Start a UTF-8 plain text reply; see hsettype for the return value.
+ */
+int
+hsettext(HConnect *c)
+{
+ return hsettype(c, "text/plain; charset=utf-8");
+}
+
+/*
+ * Send a "400 Bad Request" reply whose HTML body shows the
+ * current error string (%r).  The body is formatted into
+ * c->xferbuf first so that its length can be sent in the
+ * Content-Length header; it is omitted for HEAD requests.
+ * Returns the result of the final hflush.
+ */
+static int
+herror(HConnect *c)
+{
+ int n;
+ Hio *hout;
+
+ hout = &c->hout;
+ n = snprint(c->xferbuf, HBufSize, "<html><head><title>Error</title></head>\n<body><h1>Error</h1>\n<pre>%r</pre>\n</body></html>");
+ hprint(hout, "%s %s\r\n", hversion, "400 Bad Request");
+ hprint(hout, "Date: %D\r\n", time(nil));
+ hprint(hout, "Server: Venti\r\n");
+ hprint(hout, "Content-Type: text/html\r\n");
+ hprint(hout, "Content-Length: %d\r\n", n);
+ if(c->head.closeit)
+ hprint(hout, "Connection: close\r\n");
+ else if(!http11(c))
+ hprint(hout, "Connection: Keep-Alive\r\n");
+ hprint(hout, "\r\n");
+
+ /* a request without a method still gets the body */
+ if(c->req.meth == nil || strcmp(c->req.meth, "HEAD") != 0)
+ hwrite(hout, c->xferbuf, n);
+
+ return hflush(hout);
+}
+
+/*
+ * Reply "not found" for the requested URI.  The request is
+ * validated with preq first; if that fails its negative result
+ * is returned instead.
+ */
+int
+hnotfound(HConnect *c)
+{
+	int rc;
+
+	if((rc = preq(c)) < 0)
+		return rc;
+	return hfail(c, HNotFound, c->req.uri);
+}
+
+/*
+ * File name suffix to content type, used by fromwebdir.
+ * The list ends at the entry whose ext is 0.
+ */
+struct {
+ char *ext;
+ char *type;
+} exttab[] = {
+ ".html", "text/html",
+ ".txt", "text/plain",
+ ".xml", "text/xml",
+ ".png", "image/png",
+ ".gif", "image/gif",
+ 0
+};
+
+/*
+ * Serve the requested URI as a file below webroot.  URIs that
+ * contain ".." are refused, as is everything when no web root is
+ * configured.  A directory is served through its index.html.
+ * The content type is chosen from the file name suffix using
+ * exttab, defaulting to application/octet-stream.
+ * Returns the result of hnotfound when the file cannot be served
+ * and 0 otherwise.
+ */
+static int
+fromwebdir(HConnect *c)
+{
+	char buf[4096], *p, *ext, *type;
+	int i, fd, n, defaulted;
+	Dir *d;
+
+	if(webroot == nil || strstr(c->req.uri, ".."))
+		return hnotfound(c);
+	/* the 20 spare bytes leave room for "/index.html" below */
+	snprint(buf, sizeof buf-20, "%s/%s", webroot, c->req.uri+1);
+	defaulted = 0;
+reopen:
+	if((fd = open(buf, OREAD)) < 0)
+		return hnotfound(c);
+	d = dirfstat(fd);
+	if(d == nil){
+		close(fd);
+		return hnotfound(c);
+	}
+	if(d->mode&DMDIR){
+		free(d);
+		/* close in both cases; the second one used to leak fd */
+		close(fd);
+		if(!defaulted){
+			defaulted = 1;
+			strcat(buf, "/index.html");
+			goto reopen;
+		}
+		return hnotfound(c);
+	}
+	free(d);
+	p = buf+strlen(buf);
+	type = "application/octet-stream";
+	for(i=0; exttab[i].ext; i++){
+		ext = exttab[i].ext;
+		if(p-strlen(ext) >= buf && strcmp(p-strlen(ext), ext) == 0){
+			type = exttab[i].type;
+			break;
+		}
+	}
+	if(hsettype(c, type) < 0){
+		close(fd);
+		return 0;
+	}
+	/* buf is reused as the copy buffer from here on */
+	while((n = read(fd, buf, sizeof buf)) > 0)
+		if(hwrite(&c->hout, buf, n) < 0)
+			break;
+	close(fd);
+	hflush(&c->hout);
+	return 0;
+}
+
+/*
+ * Integer variables that xset can show and change, by name.
+ * The list ends at the entry whose name is 0.
+ */
+static struct
+{
+ char *name;
+ int *p;
+} namedints[] =
+{
+ "compress", &compressblocks,
+ "devnull", &writestodevnull,
+ "logging", &ventilogging,
+ "stats", &collectstats,
+ "icachesleeptime", &icachesleeptime,
+ "minicachesleeptime", &minicachesleeptime,
+ "arenasumsleeptime", &arenasumsleeptime,
+ "l0quantum", &l0quantum,
+ "l1quantum", &l1quantum,
+ "manualscheduling", &manualscheduling,
+ "ignorebloom", &ignorebloom,
+ "syncwrites", &syncwrites,
+ "icacheprefetch", &icacheprefetch,
+ 0
+};
+
+/*
+ * Handler for /set.  Without a "name" argument every variable in
+ * namedints is listed.  With a name only, that variable is shown.
+ * With a name and a "value", the variable is set to atoll(value)
+ * and both the new and the old value are shown.
+ * Returns -1 only if the text reply could not be started.
+ */
+static int
+xset(HConnect *c)
+{
+	int i, was;
+	int *var;
+	char *name, *value;
+
+	if(hsettext(c) < 0)
+		return -1;
+
+	name = hargstr(c, "name", nil);
+	if(name == nil || name[0] == 0){
+		for(i=0; namedints[i].name; i++)
+			hprint(&c->hout, "%s = %d\n", namedints[i].name, *namedints[i].p);
+		goto done;
+	}
+
+	for(i=0; namedints[i].name; i++)
+		if(strcmp(name, namedints[i].name) == 0)
+			break;
+	if(namedints[i].name == nil){
+		hprint(&c->hout, "%s not found\n", name);
+		goto done;
+	}
+	var = namedints[i].p;
+
+	value = hargstr(c, "value", nil);
+	if(value == nil || value[0] == 0){
+		hprint(&c->hout, "%s = %d\n", namedints[i].name, *var);
+		goto done;
+	}
+
+	was = *var;
+	*var = atoll(value);
+	hprint(&c->hout, "%s = %d (was %d)\n", name, *var, was);
+
+done:
+	hflush(&c->hout);
+	return 0;
+}
+
+/*
+ * Handler for /stats.  Starts a plain text reply and flushes it;
+ * the statistics report itself is commented out below, so the
+ * reply has no body.  Returns hsettext's result if the reply
+ * cannot be started, 0 otherwise.
+ */
+static int
+estats(HConnect *c)
+{
+ Hio *hout;
+ int r;
+
+ r = hsettext(c);
+ if(r < 0)
+ return r;
+
+
+ hout = &c->hout;
+/*
+ hprint(hout, "lump writes=%,ld\n", stats.lumpwrites);
+ hprint(hout, "lump reads=%,ld\n", stats.lumpreads);
+ hprint(hout, "lump cache read hits=%,ld\n", stats.lumphit);
+ hprint(hout, "lump cache read misses=%,ld\n", stats.lumpmiss);
+
+ hprint(hout, "clump disk writes=%,ld\n", stats.clumpwrites);
+ hprint(hout, "clump disk bytes written=%,lld\n", stats.clumpbwrites);
+ hprint(hout, "clump disk bytes compressed=%,lld\n", stats.clumpbcomp);
+ hprint(hout, "clump disk reads=%,ld\n", stats.clumpreads);
+ hprint(hout, "clump disk bytes read=%,lld\n", stats.clumpbreads);
+ hprint(hout, "clump disk bytes uncompressed=%,lld\n", stats.clumpbuncomp);
+
+ hprint(hout, "clump directory disk writes=%,ld\n", stats.ciwrites);
+ hprint(hout, "clump directory disk reads=%,ld\n", stats.cireads);
+
+ hprint(hout, "index disk writes=%,ld\n", stats.indexwrites);
+ hprint(hout, "index disk reads=%,ld\n", stats.indexreads);
+ hprint(hout, "index disk bloom filter hits=%,ld %d%% falsemisses=%,ld %d%%\n",
+ stats.indexbloomhits,
+ percent(stats.indexbloomhits, stats.indexreads),
+ stats.indexbloomfalsemisses,
+ percent(stats.indexbloomfalsemisses, stats.indexreads));
+ hprint(hout, "bloom filter bits=%,ld of %,ld %d%%\n",
+ stats.bloomones, stats.bloombits, percent(stats.bloomones, stats.bloombits));
+ hprint(hout, "index disk reads for modify=%,ld\n", stats.indexwreads);
+ hprint(hout, "index disk reads for allocation=%,ld\n", stats.indexareads);
+ hprint(hout, "index block splits=%,ld\n", stats.indexsplits);
+
+ hprint(hout, "index cache lookups=%,ld\n", stats.iclookups);
+ hprint(hout, "index cache hits=%,ld %d%%\n", stats.ichits,
+ percent(stats.ichits, stats.iclookups));
+ hprint(hout, "index cache fills=%,ld %d%%\n", stats.icfills,
+ percent(stats.icfills, stats.iclookups));
+ hprint(hout, "index cache inserts=%,ld\n", stats.icinserts);
+
+ hprint(hout, "disk cache hits=%,ld\n", stats.pchit);
+ hprint(hout, "disk cache misses=%,ld\n", stats.pcmiss);
+ hprint(hout, "disk cache reads=%,ld\n", stats.pcreads);
+ hprint(hout, "disk cache bytes read=%,lld\n", stats.pcbreads);
+
+ hprint(hout, "disk cache writes=%,ld\n", stats.dirtydblocks);
+ hprint(hout, "disk cache writes absorbed=%,ld %d%%\n", stats.absorbedwrites,
+ percent(stats.absorbedwrites, stats.dirtydblocks));
+
+ hprint(hout, "disk cache flushes=%,ld\n", stats.dcacheflushes);
+ hprint(hout, "disk cache flush writes=%,ld (%,ld per flush)\n",
+ stats.dcacheflushwrites,
+ stats.dcacheflushwrites/(stats.dcacheflushes ? stats.dcacheflushes : 1));
+
+ hprint(hout, "disk writes=%,ld\n", stats.diskwrites);
+ hprint(hout, "disk bytes written=%,lld\n", stats.diskbwrites);
+ hprint(hout, "disk reads=%,ld\n", stats.diskreads);
+ hprint(hout, "disk bytes read=%,lld\n", stats.diskbreads);
+*/
+
+ hflush(hout);
+ return 0;
+}
+
+/*
+ * Handler for /storage: print a plain text summary of the main
+ * index, giving the number of arenas, how many of them hold
+ * clumps, and the totals of their sizes and in-memory statistics.
+ * Returns hsettext's result if the reply cannot be started, 0
+ * otherwise.
+ */
+static int
+sindex(HConnect *c)
+{
+	Hio *hout;
+	Index *ix;
+	Arena *arena;
+	vlong clumps, cclumps, uncsize, used, size;
+	int i, r, active;
+
+	r = hsettext(c);
+	if(r < 0)
+		return r;
+	hout = &c->hout;
+
+	ix = mainindex;
+
+	hprint(hout, "index=%s\n", ix->name);
+
+	active = 0;
+	clumps = 0;
+	cclumps = 0;
+	uncsize = 0;
+	used = 0;
+	size = 0;
+	for(i = 0; i < ix->narenas; i++){
+		arena = ix->arenas[i];
+		/*
+		 * A nil slot contributes nothing; arena->size used to
+		 * be read outside the nil check.
+		 */
+		if(arena == nil)
+			continue;
+		if(arena->memstats.clumps != 0){
+			active++;
+			clumps += arena->memstats.clumps;
+			cclumps += arena->memstats.cclumps;
+			uncsize += arena->memstats.uncsize;
+			used += arena->memstats.used;
+		}
+		size += arena->size;
+	}
+	hprint(hout, "total arenas=%,d active=%,d\n", ix->narenas, active);
+	hprint(hout, "total space=%,lld used=%,lld\n", size, used + clumps * ClumpInfoSize);
+	hprint(hout, "clumps=%,lld compressed clumps=%,lld data=%,lld compressed data=%,lld\n",
+		clumps, cclumps, uncsize, used - clumps * ClumpSize);
+	hflush(hout);
+	return 0;
+}
+
+/*
+ * Describe one arena on hout: name, partition and address range,
+ * version and timestamps, the sealed flags of memstats and
+ * diskstats, the score when it is not the zero score, and the
+ * clump counters of memstats ("written") and diskstats ("indexed").
+ */
+static void
+darena(Hio *hout, Arena *arena)
+{
+	hprint(hout, "arena='%s' on %s at [%lld,%lld)\n\tversion=%d created=%d modified=%d",
+		arena->name,
+		arena->part->name,
+		arena->base,
+		arena->base + arena->size + 2 * arena->blocksize,
+		arena->version,
+		arena->ctime,
+		arena->wtime);
+	if(arena->memstats.sealed)
+		hprint(hout, " mem=sealed");
+	if(arena->diskstats.sealed)
+		hprint(hout, " disk=sealed");
+	hprint(hout, "\n");
+
+	if(scorecmp(zeroscore, arena->score) != 0)
+		hprint(hout, "\tscore=%V\n", arena->score);
+
+	/* compressed data excludes the clump headers; storage adds the per-clump info entries */
+	hprint(hout, "\twritten: clumps=%d compressed clumps=%d data=%,lld compressed data=%,lld storage=%,lld\n",
+		arena->memstats.clumps,
+		arena->memstats.cclumps,
+		arena->memstats.uncsize,
+		arena->memstats.used - arena->memstats.clumps * ClumpSize,
+		arena->memstats.used + arena->memstats.clumps * ClumpInfoSize);
+	hprint(hout, "\tindexed: clumps=%d compressed clumps=%d data=%,lld compressed data=%,lld storage=%,lld\n",
+		arena->diskstats.clumps,
+		arena->diskstats.cclumps,
+		arena->diskstats.uncsize,
+		arena->diskstats.used - arena->diskstats.clumps * ClumpSize,
+		arena->diskstats.used + arena->diskstats.clumps * ClumpInfoSize);
+}
+
+/*
+ * HTTP handler: call emptylumpcache, emptydcache and emptyicache,
+ * then report it in the reply.
+ * Returns 0, or the negative result of hsettext on failure.
+ */
+static int
+hempty(HConnect *c)
+{
+	int r;
+
+	if((r = hsettext(c)) < 0)
+		return r;
+
+	emptylumpcache();
+	emptydcache();
+	emptyicache();
+	hprint(&c->hout, "emptied all caches\n");
+	hflush(&c->hout);
+	return 0;
+}
+
+/*
+ * HTTP handler: call emptylumpcache and report it in the reply.
+ * Returns 0, or the negative result of hsettext on failure.
+ */
+static int
+hlcacheempty(HConnect *c)
+{
+	int r;
+
+	if((r = hsettext(c)) < 0)
+		return r;
+
+	emptylumpcache();
+	hprint(&c->hout, "emptied lumpcache\n");
+	hflush(&c->hout);
+	return 0;
+}
+
+/*
+ * HTTP handler: call emptyicache and report it in the reply.
+ * Returns 0, or the negative result of hsettext on failure.
+ */
+static int
+hicacheempty(HConnect *c)
+{
+	int r;
+
+	if((r = hsettext(c)) < 0)
+		return r;
+
+	emptyicache();
+	hprint(&c->hout, "emptied icache\n");
+	hflush(&c->hout);
+	return 0;
+}
+
+/*
+ * HTTP handler: call emptydcache and report it in the reply.
+ * Returns 0, or the negative result of hsettext on failure.
+ */
+static int
+hdcacheempty(HConnect *c)
+{
+	int r;
+
+	if((r = hsettext(c)) < 0)
+		return r;
+
+	emptydcache();
+	hprint(&c->hout, "emptied dcache\n");
+	hflush(&c->hout);
+	return 0;
+}
+/*
+ * HTTP handler: call kickicache and report it in the reply.
+ * Returns 0, or the negative result of hsettext on failure.
+ */
+static int
+hicachekick(HConnect *c)
+{
+	int r;
+
+	if((r = hsettext(c)) < 0)
+		return r;
+
+	kickicache();
+	hprint(&c->hout, "kicked icache\n");
+	hflush(&c->hout);
+	return 0;
+}
+
+/*
+ * HTTP handler: call kickdcache and report it in the reply.
+ * Returns 0, or the negative result of hsettext on failure.
+ */
+static int
+hdcachekick(HConnect *c)
+{
+	int r;
+
+	if((r = hsettext(c)) < 0)
+		return r;
+
+	kickdcache();
+	hprint(&c->hout, "kicked dcache\n");
+	hflush(&c->hout);
+	return 0;
+}
+/*
+ * HTTP handler: call flushicache and report it in the reply.
+ * Returns 0, or the negative result of hsettext on failure.
+ */
+static int
+hicacheflush(HConnect *c)
+{
+	int r;
+
+	if((r = hsettext(c)) < 0)
+		return r;
+
+	flushicache();
+	hprint(&c->hout, "flushed icache\n");
+	hflush(&c->hout);
+	return 0;
+}
+
+/*
+ * HTTP handler: call flushdcache and report it in the reply.
+ * Returns 0, or the negative result of hsettext on failure.
+ */
+static int
+hdcacheflush(HConnect *c)
+{
+	int r;
+
+	if((r = hsettext(c)) < 0)
+		return r;
+
+	flushdcache();
+	hprint(&c->hout, "flushed dcache\n");
+	hflush(&c->hout);
+	return 0;
+}
+
+/*
+ * HTTP handler: dump the main index in detail -- its parameters,
+ * every index section, and (via darena) every arena that holds
+ * at least one clump.
+ * Returns 0, or the negative result of hsettext on failure.
+ */
+static int
+dindex(HConnect *c)
+{
+	Hio *hout;
+	Index *ix;
+	Arena *arena;
+	int i, r;
+
+	if((r = hsettext(c)) < 0)
+		return r;
+	hout = &c->hout;
+	ix = mainindex;
+
+	hprint(hout, "index=%s version=%d blocksize=%d tabsize=%d\n",
+		ix->name, ix->version, ix->blocksize, ix->tabsize);
+	hprint(hout, "\tbuckets=%d div=%d\n", ix->buckets, ix->div);
+
+	for(i = 0; i < ix->nsects; i++)
+		hprint(hout, "\tsect=%s for buckets [%lld,%lld) buckmax=%d\n",
+			ix->smap[i].name, ix->smap[i].start, ix->smap[i].stop,
+			ix->sects[i]->buckmax);
+
+	for(i = 0; i < ix->narenas; i++){
+		arena = ix->arenas[i];
+		/* skip empty slots and arenas without clumps */
+		if(arena == nil || arena->memstats.clumps == 0)
+			continue;
+		hprint(hout, "arena=%s at index [%lld,%lld)\n\t",
+			ix->amap[i].name, ix->amap[i].start, ix->amap[i].stop);
+		darena(hout, arena);
+	}
+	hflush(hout);
+	return 0;
+}
+
+typedef struct Arg Arg;
+struct Arg
+{
+ int index; /* statistic to graph: subscript into Stats.n, as returned by findname */
+ int index2; /* second statistic: the divisor in the pct and divdiff graphs */
+};
+
+/* Graph function: the current value of the selected statistic in t; s is unused. */
+static long
+rawgraph(Stats *s, Stats *t, void *va)
+{
+	USED(s);
+	return t->n[((Arg*)va)->index];
+}
+
+/* Graph function: change of the selected statistic from sample s to sample t. */
+static long
+diffgraph(Stats *s, Stats *t, void *va)
+{
+	int i;
+
+	i = ((Arg*)va)->index;
+	return t->n[i] - s->n[i];
+}
+
+/* Graph function: percent() of statistic index relative to statistic index2, both taken from t. */
+static long
+pctgraph(Stats *s, Stats *t, void *va)
+{
+	int i, j;
+
+	USED(s);
+	i = ((Arg*)va)->index;
+	j = ((Arg*)va)->index2;
+	return percent(t->n[i], t->n[j]);
+}
+
+/*
+ * Graph function: percent() of the change in statistic index
+ * relative to the change in statistic index2 between s and t.
+ */
+static long
+pctdiffgraph(Stats *s, Stats *t, void *va)
+{
+	int i, j;
+
+	i = ((Arg*)va)->index;
+	j = ((Arg*)va)->index2;
+	return percent(t->n[i]-s->n[i], t->n[j]-s->n[j]);
+}
+
+/* Return a/b, using 1 as the divisor when b is zero. */
+static long
+xdiv(long a, long b)
+{
+	return a / (b == 0 ? 1 : b);
+}
+
+/*
+ * Graph function: change in statistic index divided (with xdiv)
+ * by the change in statistic index2 between s and t.
+ */
+static long
+divdiffgraph(Stats *s, Stats *t, void *va)
+{
+	int i, j;
+
+	i = ((Arg*)va)->index;
+	j = ((Arg*)va)->index2;
+	return xdiv(t->n[i] - s->n[i], t->n[j] - s->n[j]);
+}
+
+/* Sum of the RPC read and write byte counters in s. */
+static long
+netbw(Stats *s)
+{
+	/* not exactly right */
+	return s->n[StatRpcReadBytes] + s->n[StatRpcWriteBytes];
+}
+
+/*
+ * Sum of the byte counters in s for arena partitions and index
+ * sections (reads and writes) plus summary reads.
+ */
+static long
+diskbw(Stats *s)
+{
+	ulong total;
+
+	total = s->n[StatApartReadBytes];
+	total += s->n[StatApartWriteBytes];
+	total += s->n[StatIsectReadBytes];
+	total += s->n[StatIsectWriteBytes];
+	total += s->n[StatSumReadBytes];
+	return total;
+}
+
+/* Combined byte count: netbw(s) plus diskbw(s). */
+static long
+iobw(Stats *s)
+{
+	long net, disk;
+
+	net = netbw(s);
+	disk = diskbw(s);
+	return net+disk;
+}
+
+/* Graph function: growth of diskbw from sample s to sample t; va is unused. */
+static long
+diskgraph(Stats *s, Stats *t, void *va)
+{
+	long after, before;
+
+	USED(va);
+	after = diskbw(t);
+	before = diskbw(s);
+	return after-before;
+}
+
+/* Graph function: growth of netbw from sample s to sample t; va is unused. */
+static long
+netgraph(Stats *s, Stats *t, void *va)
+{
+	long after, before;
+
+	USED(va);
+	after = netbw(t);
+	before = netbw(s);
+	return after-before;
+}
+
+/* Graph function: growth of iobw from sample s to sample t; va is unused. */
+static long
+iograph(Stats *s, Stats *t, void *va)
+{
+	long after, before;
+
+	USED(va);
+	after = iobw(t);
+	before = iobw(s);
+	return after-before;
+}
+
+
+static char* graphname[] = /* names accepted by findname; an entry's position is used as the subscript into Stats.n, so presumably the order must match the Stat* enum -- confirm against dat.h */
+{
+ "rpctotal",
+ "rpcread",
+ "rpcreadok",
+ "rpcreadfail",
+ "rpcreadbyte",
+ "rpcreadtime",
+ "rpcreadcached",
+ "rpcreadcachedtime",
+ "rpcreaduncached",
+ "rpcreaduncachedtime",
+ "rpcwrite",
+ "rpcwritenew",
+ "rpcwriteold",
+ "rpcwritefail",
+ "rpcwritebyte",
+ "rpcwritetime",
+ "rpcwritenewtime",
+ "rpcwriteoldtime",
+
+ "lcachehit",
+ "lcachemiss",
+ "lcachelookup",
+ "lcachewrite",
+ "lcachesize",
+ "lcachestall",
+ "lcachelookuptime",
+
+ "dcachehit",
+ "dcachemiss",
+ "dcachelookup",
+ "dcacheread",
+ "dcachewrite",
+ "dcachedirty",
+ "dcachesize",
+ "dcacheflush",
+ "dcachestall",
+ "dcachelookuptime",
+
+ "dblockstall",
+ "lumpstall",
+
+ "icachehit",
+ "icachemiss",
+ "icacheread",
+ "icachewrite",
+ "icachefill",
+ "icacheprefetch",
+ "icachedirty",
+ "icachesize",
+ "icacheflush",
+ "icachestall",
+ "icachelookuptime",
+ "icachelookup",
+ "scachehit",
+ "scacheprefetch",
+
+ "bloomhit",
+ "bloommiss",
+ "bloomfalsemiss",
+ "bloomlookup",
+ "bloomones",
+ "bloombits",
+
+ "apartread",
+ "apartreadbyte",
+ "apartwrite",
+ "apartwritebyte",
+
+ "isectread",
+ "isectreadbyte",
+ "isectwrite",
+ "isectwritebyte",
+
+ "sumread",
+ "sumreadbyte",
+
+ "cigload",
+ "cigloadtime",
+};
+
+/* Return the position of s in graphname[], or -1 if it is not listed. */
+static int
+findname(char *s)
+{
+	int i;
+
+	i = 0;
+	while(i < nelem(graphname)){
+		if(strcmp(graphname[i], s) == 0)
+			return i;
+		i++;
+	}
+	return -1;
+}
+
+/*
+ * Text form of a graph: bin the statistics selected by g into
+ * 100 bins with binstats and print one line per bin.
+ */
+static void
+dotextbin(Hio *io, Graph *g)
+{
+	int nbin;
+	Statbin *b, bin[2000];	/* 32 kB, but whack is worse */
+
+	needstack(8192);	/* double check that bin didn't kill us */
+	nbin = 100;
+	binstats(g->fn, g->arg, g->t0, g->t1, bin, nbin);
+
+	hprint(io, "stats\n\n");
+	for(b = bin; b < bin+nbin; b++)
+		hprint(io, "%d: nsamp=%d min=%d max=%d avg=%d\n",
+			(int)(b - bin), b->nsamp, b->min, b->max, b->avg);
+}
+
+/*
+ * HTTP handler: render a statistics graph as a PNG, or as text
+ * bins when the "text" argument is non-empty.
+ *
+ * Arguments: arg names the statistic ("*" when the graph needs
+ * none), arg2 names the second statistic, graph selects the graph
+ * function (default "raw"); t0, t1, min, max, wid, ht and fill are
+ * copied into the Graph.
+ *
+ * Returns 0 on success, -1 if the content type cannot be set,
+ * otherwise the result of herror.
+ */
+static int
+xgraph(HConnect *c)
+{
+	char *name;
+	Hio *hout;
+	Memimage *m;
+	int dotext, nindex;
+	Graph g;
+	Arg arg;
+	char *graph, *a;
+
+	name = hargstr(c, "arg", "");
+	if((arg.index = findname(name)) == -1 && strcmp(name, "*") != 0){
+		werrstr("unknown name %s", name);
+		goto error;
+	}
+	a = hargstr(c, "arg2", "");
+	arg.index2 = -1;	/* never leave it uninitialized when arg2 is absent */
+	if(a[0] && (arg.index2 = findname(a)) == -1){
+		werrstr("unknown name %s", a);
+		goto error;
+	}
+
+	g.arg = &arg;
+	g.t0 = hargint(c, "t0", -120);
+	g.t1 = hargint(c, "t1", 0);
+	g.min = hargint(c, "min", -1);
+	g.max = hargint(c, "max", -1);
+	g.wid = hargint(c, "wid", -1);
+	g.ht = hargint(c, "ht", -1);
+	dotext = hargstr(c, "text", "")[0] != 0;
+	g.fill = hargint(c, "fill", -1);
+
+	/* nindex: how many of arg.index, arg.index2 the graph function subscripts with */
+	nindex = 0;
+	graph = hargstr(c, "graph", "raw");
+	if(strcmp(graph, "raw") == 0){
+		g.fn = rawgraph;
+		nindex = 1;
+	}else if(strcmp(graph, "diskbw") == 0)
+		g.fn = diskgraph;
+	else if(strcmp(graph, "iobw") == 0)
+		g.fn = iograph;
+	else if(strcmp(graph, "netbw") == 0)
+		g.fn = netgraph;
+	else if(strcmp(graph, "diff") == 0){
+		g.fn = diffgraph;
+		nindex = 1;
+	}else if(strcmp(graph, "pct") == 0){
+		g.fn = pctgraph;
+		nindex = 2;
+	}else if(strcmp(graph, "pctdiff") == 0){
+		g.fn = pctdiffgraph;
+		nindex = 2;
+	}else if(strcmp(graph, "divdiff") == 0){
+		g.fn = divdiffgraph;
+		nindex = 2;
+	}else{
+		werrstr("unknown graph %s", graph);
+		goto error;
+	}
+
+	/* refuse to subscript Stats.n with -1 ("*" or a missing arg2) */
+	if(nindex >= 1 && arg.index == -1){
+		werrstr("graph %s needs arg", graph);
+		goto error;
+	}
+	if(nindex >= 2 && arg.index2 == -1){
+		werrstr("graph %s needs arg2", graph);
+		goto error;
+	}
+
+	if(dotext){
+		if(hsettype(c, "text/plain") < 0)
+			return -1;
+		dotextbin(&c->hout, &g);
+		hflush(&c->hout);
+		return 0;
+	}
+
+	m = statgraph(&g);
+	if(m == nil)
+		goto error;
+
+	if(hsettype(c, "image/png") < 0){
+		/* do not leak the image when the reply cannot be started */
+		qlock(&memdrawlock);
+		freememimage(m);
+		qunlock(&memdrawlock);
+		return -1;
+	}
+	hout = &c->hout;
+	writepng(hout, m);
+	qlock(&memdrawlock);
+	freememimage(m);
+	qunlock(&memdrawlock);
+	hflush(hout);
+	return 0;
+
+error:
+	return herror(c);
+}
+
+/* HTTP handler: send the HTML list of logs produced by vtloghlist. */
+static int
+xloglist(HConnect *c)
+{
+	Hio *hout;
+
+	if(hsettype(c, "text/html") < 0)
+		return -1;
+	hout = &c->hout;
+	vtloghlist(hout);
+	hflush(hout);
+	return 0;
+}
+
+/*
+ * HTTP handler: dump the log named by the "log" argument as HTML.
+ * With no name, fall back to the log list; an unknown name gives
+ * hnotfound.  The log is closed on every path.
+ */
+static int
+xlog(HConnect *c)
+{
+	char *name;
+	VtLog *l;
+	int r;
+
+	name = hargstr(c, "log", "");
+	if(name[0] == '\0')
+		return xloglist(c);
+	if((l = vtlogopen(name, 0)) == nil)
+		return hnotfound(c);
+
+	r = hsettype(c, "text/html");
+	if(r >= 0)
+		vtloghdump(&c->hout, l);
+	vtlogclose(l);
+	if(r < 0)
+		return -1;
+	hflush(&c->hout);
+	return 0;
+}
+
+/* HTTP handler: send the main index as XML, produced by xmlindex. */
+static int
+xindex(HConnect *c)
+{
+	Hio *hout;
+
+	if(hsettype(c, "text/xml") < 0)
+		return -1;
+	hout = &c->hout;
+	xmlindex(hout, mainindex, "index", 0);
+	hflush(hout);
+	return 0;
+}
+
+/* Write indent tab characters to hout; nothing is written when indent <= 0. */
+void
+xmlindent(Hio *hout, int indent)
+{
+	while(indent-- > 0)
+		hputc(hout, '\t');
+}
+
+void
+xmlaname(Hio *hout, char *v, char *tag) /* write the attribute  tag="v"  with a leading space; v is not escaped */
+{
+ hprint(hout, " %s=\"%s\"", tag, v);
+}
+
+/* Write the attribute tag="score" (formatted with %V) unless v is the zero score. */
+void
+xmlscore(Hio *hout, u8int *v, char *tag)
+{
+	if(scorecmp(zeroscore, v) != 0)
+		hprint(hout, " %s=\"%V\"", tag, v);
+}
+
+/* Write the attribute tag="yes" when v is non-zero; write nothing otherwise. */
+void
+xmlsealed(Hio *hout, int v, char *tag)
+{
+	if(v)
+		hprint(hout, " %s=\"yes\"", tag);
+}
+
+void
+xmlu32int(Hio *hout, u32int v, char *tag) /* write the attribute  tag="v"  with v in unsigned decimal */
+{
+ hprint(hout, " %s=\"%ud\"", tag, v);
+}
+
+void
+xmlu64int(Hio *hout, u64int v, char *tag) /* write the attribute  tag="v"  with v in unsigned 64-bit decimal */
+{
+ hprint(hout, " %s=\"%llud\"", tag, v);
+}
+
+/*
+ * Write log l to h as an HTML page.  The chunks are written in
+ * ring order starting with the one after l->w, so the chunk at
+ * l->w comes last.  A nil log produces an empty page titled
+ * "<nil>".
+ */
+void
+vtloghdump(Hio *h, VtLog *l)
+{
+	int i;
+	VtLogChunk *c, *end;
+	char *name;
+
+	name = "&lt;nil&gt;";
+	if(l != nil)
+		name = l->name;
+
+	hprint(h, "<html><head>\n");
+	hprint(h, "<title>Venti Server Log: %s</title>\n", name);
+	hprint(h, "</head><body>\n");
+	hprint(h, "<b>Venti Server Log: %s</b>\n<p>\n", name);
+
+	if(l != nil){
+		end = l->chunk + l->nchunk;
+		c = l->w;
+		for(i = 0; i < l->nchunk; i++){
+			c++;
+			if(c == end)
+				c = l->chunk;	/* wrap around the ring */
+			hwrite(h, c->p, c->wp - c->p);
+		}
+	}
+	hprint(h, "</body></html>\n");
+}
+
+/* qsort comparison for an array of char*: order by strcmp of the strings pointed to. */
+static int
+strpcmp(const void *va, const void *vb)
+{
+	char *a, *b;
+
+	a = *(char**)va;
+	b = *(char**)vb;
+	return strcmp(a, b);
+}
+
+/*
+ * Write an HTML page to h listing the names returned by
+ * vtlognames, sorted, each linking to /log?log=name.
+ */
+void
+vtloghlist(Hio *h)
+{
+	char **names;
+	int i, n;
+
+	hprint(h, "<html><head>\n");
+	hprint(h, "<title>Venti Server Logs</title>\n");
+	hprint(h, "</head><body>\n");
+	hprint(h, "<b>Venti Server Logs</b>\n<p>\n");
+
+	names = vtlognames(&n);
+	qsort(names, n, sizeof(names[0]), strpcmp);
+	i = 0;
+	while(i < n){
+		hprint(h, "<a href=\"/log?log=%s\">%s</a><br>\n", names[i], names[i]);
+		i++;
+	}
+	vtfree(names);
+	hprint(h, "</body></html>\n");
+}
diff --git a/sys/src/cmd/venti/srv/icache.c b/sys/src/cmd/venti/srv/icache.c
new file mode 100755
index 000000000..67faba209
--- /dev/null
+++ b/sys/src/cmd/venti/srv/icache.c
@@ -0,0 +1,571 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+int icacheprefetch = 1;
+
+typedef struct ICache ICache;
+typedef struct IHash IHash;
+typedef struct ISum ISum;
+
+struct ICache
+{
+ QLock lock; /* held around lookups, inserts and list moves in this file */
+ Rendez full; /* icacheinsert sleeps here when no entry is available; icacheclean wakes all */
+ IHash *hash; /* cached entries, hashed by score */
+ IEntry *entries; /* backing array of nentries entries, allocated by initicache */
+ int nentries;
+ IEntry free; /* circular list head: unused entries */
+ IEntry clean; /* circular list head: entries with state IEClean, most recently used first */
+ IEntry dirty; /* circular list head: entries with state IEDirty */
+ u32int maxdirty; /* insertscore kicks the writer once ndirty reaches this */
+ u32int ndirty; /* number of entries on the dirty list */
+ AState as; /* arena state passed with the most recent dirty insertscore */
+
+ ISum **sum; /* nsum summary slots, most recently used first */
+ int nsum;
+ IHash *shash; /* entries loaded into the summary slots, hashed by score */
+ IEntry *sentries; /* backing array of nsentries entries shared out among the slots */
+ int nsentries;
+};
+
+static ICache icache;
+
+/*
+ * Hash table of IEntries
+ */
+
+struct IHash
+{
+ int bits; /* number of leading score bits used as the bucket number (see hashbits) */
+ u32int size; /* number of buckets, 1<<bits */
+ IEntry **table; /* bucket heads; entries are chained through nexthash */
+};
+
+/*
+ * Allocate a zeroed hash table with at least size1 buckets,
+ * rounded up to a power of two.  The bucket array lives in the
+ * same allocation, directly after the IHash header.
+ */
+static IHash*
+mkihash(int size1)
+{
+	u32int size;
+	int bits;
+	IHash *ih;
+
+	/* smallest power of two >= size1, and its log2 */
+	for(bits = 0, size = 1; size < size1; size <<= 1)
+		bits++;
+
+	ih = vtmallocz(sizeof(IHash)+size*sizeof(ih->table[0]));
+	ih->bits = bits;
+	ih->size = size;
+	ih->table = (IEntry**)(ih+1);
+	return ih;
+}
+
+/*
+ * Find the entry with the given score in ih.  A type of -1
+ * matches any type; otherwise the entry's type must match too.
+ * Returns nil when there is no such entry.
+ */
+static IEntry*
+ihashlookup(IHash *ih, u8int score[VtScoreSize], int type)
+{
+	IEntry *ie;
+
+	for(ie = ih->table[hashbits(score, ih->bits)]; ie != nil; ie = ie->nexthash){
+		if(type != -1 && type != ie->ia.type)
+			continue;
+		if(scorecmp(score, ie->score) == 0)
+			return ie;
+	}
+	return nil;
+}
+
+/*
+ * Unlink ie from its bucket in ih.  If it is not there, print a
+ * warning tagged with what and leave the table unchanged.
+ */
+static void
+ihashdelete(IHash *ih, IEntry *ie, char *what)
+{
+	IEntry **l;
+
+	l = &ih->table[hashbits(ie->score, ih->bits)];
+	while(*l != nil){
+		if(*l == ie){
+			*l = ie->nexthash;
+			return;
+		}
+		l = &(*l)->nexthash;
+	}
+	fprint(2, "warning: %s %V not found in ihashdelete\n", what, ie->score);
+}
+
+/* Insert ie at the head of its bucket in ih. */
+static void
+ihashinsert(IHash *ih, IEntry *ie)
+{
+	IEntry **head;
+
+	head = &ih->table[hashbits(ie->score, ih->bits)];
+	ie->nexthash = *head;
+	*head = ie;
+}
+
+
+/*
+ * IEntry lists.
+ */
+
+/*
+ * Remove ie from whatever circular list it is on and clear its
+ * links.  An entry with both links nil is on no list and is
+ * returned untouched.  Returns ie.
+ */
+static IEntry*
+popout(IEntry *ie)
+{
+	if(ie->prev != nil || ie->next != nil){
+		ie->prev->next = ie->next;
+		ie->next->prev = ie->prev;
+		ie->next = nil;
+		ie->prev = nil;
+	}
+	return ie;
+}
+
+/* Remove and return the last entry of the circular list, or nil if the list is empty. */
+static IEntry*
+poplast(IEntry *list)
+{
+	return list->prev == list ? nil : popout(list->prev);
+}
+
+/*
+ * Move ie to the front of the circular list: it is first taken
+ * off any list it is on, then linked in right after the head.
+ * Returns ie.
+ */
+static IEntry*
+pushfirst(IEntry *list, IEntry *ie)
+{
+	IEntry *oldfirst;
+
+	popout(ie);
+	/* read the head only now: popout may just have changed list->next */
+	oldfirst = list->next;
+	ie->prev = list;
+	ie->next = oldfirst;
+	list->next = ie;
+	oldfirst->prev = ie;
+	return ie;
+}
+
+/*
+ * Arena summary cache.
+ */
+struct ISum
+{
+ QLock lock; /* held while the slot is set up, loaded or cleared */
+ IEntry *entries; /* this slot's ArenaCIGSize-entry slice of icache.sentries */
+ int nentries; /* number of entries currently inserted in icache.shash */
+ int loaded; /* set by scacheload; scachemiss will not hand the slot out again */
+ u64int addr; /* the slot covers index addresses in [addr, limit) */
+ u64int limit;
+ Arena *arena; /* arena and clump group reported by amapitoag, handed to asumload */
+ int g;
+};
+
+/*
+ * Find the summary slot whose range [addr, limit) contains addr
+ * and move it to the front of icache.sum.  Returns nil if no
+ * slot covers addr.
+ */
+static ISum*
+scachelookup(u64int addr)
+{
+	int i;
+	ISum *s;
+
+	for(i=0; i<icache.nsum; i++){
+		s = icache.sum[i];
+		if(addr < s->addr || addr >= s->limit)
+			continue;
+		if(i > 0){
+			/* shift slots 0..i-1 down one place and put s first */
+			memmove(icache.sum+1, icache.sum, i*sizeof icache.sum[0]);
+			icache.sum[0] = s;
+		}
+		return s;
+	}
+	return nil;
+}
+
+/*
+ * Empty summary slot s: take its entries out of icache.shash
+ * and reset every field except the lock and the entries pointer.
+ */
+static void
+sumclear(ISum *s)
+{
+	int i;
+
+	i = 0;
+	while(i < s->nentries){
+		ihashdelete(icache.shash, &s->entries[i], "scache");
+		i++;
+	}
+	s->arena = nil;
+	s->g = 0;
+	s->addr = 0;
+	s->limit = 0;
+	s->loaded = 0;
+	s->nentries = 0;
+}
+
+/*
+ * Pick a summary slot for reuse: scanning from the back of
+ * icache.sum, take the first slot whose lock can be acquired
+ * without blocking, move it to the front and clear it.
+ * Returns the slot with its lock held, or nil if every slot
+ * is locked.
+ */
+static ISum*
+scacheevict(void)
+{
+	ISum *s;
+	int i;
+
+	for(i=icache.nsum-1; i>=0; i--){
+		s = icache.sum[i];
+		if(!canqlock(&s->lock))
+			continue;
+		if(i > 0){
+			memmove(icache.sum+1, icache.sum, i*sizeof icache.sum[0]);
+			icache.sum[0] = s;
+		}
+		sumclear(s);
+		return s;
+	}
+	return nil;
+}
+
+static void
+scachehit(u64int addr) /* note a hit at addr; the lookup result is discarded on purpose */
+{
+ scachelookup(addr); /* for move-to-front */
+}
+
+/*
+ * Point slot s at the clump group containing index address addr:
+ * amapitoag supplies the arena, the group's address range and
+ * the group number.  Nothing is loaded here.
+ */
+static void
+scachesetup(ISum *s, u64int addr)
+{
+	u64int start, end;
+	int group;
+
+	s->arena = amapitoag(mainindex, addr, &start, &end, &group);
+	s->addr = start;
+	s->limit = end;
+	s->g = group;
+}
+
+/*
+ * Fill slot s from disk with asumload, turn the returned
+ * addresses into index addresses by adding s->addr, and insert
+ * the entries into icache.shash.
+ */
+static void
+scacheload(ISum *s)
+{
+	int i, n;
+	IEntry *ie;
+
+	s->loaded = 1;
+	n = asumload(s->arena, s->g, s->entries, ArenaCIGSize);
+	/*
+	 * n can be less than ArenaCIGSize, either if the clump group
+	 * is the last in the arena and is only partially filled, or if
+	 * there are corrupt clumps in the group -- those are not returned.
+	 */
+	for(i=0; i<n; i++){
+		ie = &s->entries[i];
+		ie->ia.addr += s->addr;
+		ihashinsert(icache.shash, ie);
+	}
+//fprint(2, "%T scacheload %s %d - %d entries\n", s->arena->name, s->g, n);
+	addstat(StatScachePrefetch, n);
+	s->nentries = n;
+}
+
+/*
+ * Note an index cache miss at addr.  The first miss in a clump
+ * group only reserves a summary slot for it; a later miss in
+ * the same group returns the slot, locked, for the caller to
+ * load and unlock.  Returns nil when there is nothing to load
+ * or prefetching is turned off.
+ */
+static ISum*
+scachemiss(u64int addr)
+{
+	ISum *s;
+
+	if(!icacheprefetch)
+		return nil;
+
+	s = scachelookup(addr);
+	if(s != nil){
+		/* second time: load from disk */
+		qlock(&s->lock);
+		if(!s->loaded && icacheprefetch)
+			return s;	/* locked */
+		qunlock(&s->lock);
+		return nil;
+	}
+
+	/* first time: make an entry in the cache but don't populate it yet */
+	s = scacheevict();
+	if(s != nil){
+		scachesetup(s, addr);
+		qunlock(&s->lock);
+	}
+	return nil;
+}
+
+/*
+ * Index cache.
+ */
+
+/*
+ * Set up the index cache for a budget of mem0 bytes.  One eighth
+ * of the entries that fit is set aside to size the arena summary
+ * cache (clamped to 4..16 slots); the rest, at least 1000,
+ * become index cache entries, all starting on the free list.
+ */
+void
+initicache(u32int mem0)
+{
+	int i, entries, scache;
+	ISum *sums;
+
+	icache.full.l = &icache.lock;
+
+	entries = mem0 / (sizeof(IEntry)+sizeof(IEntry*));
+	scache = (entries/8) / ArenaCIGSize;
+	entries -= entries/8;
+	if(scache < 4)
+		scache = 4;
+	if(scache > 16)
+		scache = 16;
+	if(entries < 1000)
+		entries = 1000;
+fprint(2, "icache %,d bytes = %,d entries; %d scache\n", mem0, entries, scache);
+
+	/* empty circular lists point at themselves */
+	icache.clean.prev = icache.clean.next = &icache.clean;
+	icache.dirty.prev = icache.dirty.next = &icache.dirty;
+	icache.free.prev = icache.free.next = &icache.free;
+
+	icache.hash = mkihash(entries);
+	icache.nentries = entries;
+	setstat(StatIcacheSize, entries);
+	icache.entries = vtmallocz(entries*sizeof icache.entries[0]);
+	icache.maxdirty = entries / 2;
+	for(i=0; i<entries; i++)
+		pushfirst(&icache.free, &icache.entries[i]);
+
+	/* summary cache: one array of slots, one array of entries cut into per-slot pieces */
+	icache.nsum = scache;
+	icache.sum = vtmallocz(scache*sizeof icache.sum[0]);
+	sums = vtmallocz(scache*sizeof sums[0]);
+	icache.nsentries = scache * ArenaCIGSize;
+	icache.sentries = vtmallocz(scache*ArenaCIGSize*sizeof icache.sentries[0]);
+	icache.shash = mkihash(scache*ArenaCIGSize);
+	for(i=0; i<scache; i++){
+		icache.sum[i] = &sums[i];
+		icache.sum[i]->entries = icache.sentries + i*ArenaCIGSize;
+	}
+}
+
+
+/*
+ * Take the entry at the tail of the clean list and remove it
+ * from the hash table.  Returns nil if the clean list is empty.
+ */
+static IEntry*
+evictlru(void)
+{
+	IEntry *ie;
+
+	ie = poplast(&icache.clean);
+	if(ie != nil)
+		ihashdelete(icache.hash, ie, "evictlru");
+	return ie;
+}
+
+/*
+ * Add (score, ia) to the index cache in the given state (IEClean
+ * or IEDirty).  The entry comes from the free list or, failing
+ * that, from the tail of the clean list; when neither has one,
+ * flush and sleep on icache.full until one appears.
+ * The callers in this file hold icache.lock.
+ */
+static void
+icacheinsert(u8int score[VtScoreSize], IAddr *ia, int state)
+{
+	IEntry *ie;
+
+	ie = poplast(&icache.free);
+	if(ie == nil)
+		ie = evictlru();
+	if(ie == nil){
+		addstat(StatIcacheStall, 1);
+		for(;;){
+			ie = poplast(&icache.free);
+			if(ie == nil)
+				ie = evictlru();
+			if(ie != nil)
+				break;
+			// Could safely return here if state == IEClean.
+			// But if state == IEDirty, have to wait to make
+			// sure we don't lose an index write.
+			// Let's wait all the time.
+			flushdcache();
+			kickicache();
+			rsleep(&icache.full);
+		}
+		addstat(StatIcacheStall, -1);
+	}
+
+	memmove(ie->score, score, VtScoreSize);
+	ie->state = state;
+	ie->ia = *ia;
+	if(state != IEClean){
+		addstat(StatIcacheWrite, 1);
+		assert(state == IEDirty);
+		icache.ndirty++;
+		setstat(StatIcacheDirty, icache.ndirty);
+		delaykickicache();
+		pushfirst(&icache.dirty, ie);
+	}else{
+		addstat(StatIcachePrefetch, 1);
+		pushfirst(&icache.clean, ie);
+	}
+	ihashinsert(icache.hash, ie);
+}
+
+/*
+ * Look up score/type in the index cache, then in the arena
+ * summary cache.  A summary hit is copied into the index cache
+ * as a clean entry.  On success *ia is filled in and 0 is
+ * returned; on a miss the result is -1.
+ */
+int
+icachelookup(u8int score[VtScoreSize], int type, IAddr *ia)
+{
+	IEntry *ie;
+	int ret;
+
+	ret = 0;
+	qlock(&icache.lock);
+	addstat(StatIcacheLookup, 1);
+	if((ie = ihashlookup(icache.hash, score, type)) != nil){
+		*ia = ie->ia;
+		/* refresh the LRU position; dirty entries stay where they are */
+		if(ie->state == IEClean)
+			pushfirst(&icache.clean, ie);
+		addstat(StatIcacheHit, 1);
+	}else if((ie = ihashlookup(icache.shash, score, type)) != nil){
+		*ia = ie->ia;
+		icacheinsert(score, &ie->ia, IEClean);
+		scachehit(ie->ia.addr);
+		addstat(StatScacheHit, 1);
+	}else{
+		addstat(StatIcacheMiss, 1);
+		ret = -1;
+	}
+	qunlock(&icache.lock);
+
+	return ret;
+}
+
+/*
+ * Record score -> ia in the index cache.  A clean insert may
+ * trigger an arena summary load, done after icache.lock is
+ * released.  A dirty insert records the arena state as, and the
+ * score is added to the bloom filter.  Always returns 0.
+ */
+int
+insertscore(u8int score[VtScoreSize], IAddr *ia, int state, AState *as)
+{
+	ISum *toload;
+
+	toload = nil;
+	qlock(&icache.lock);
+	icacheinsert(score, ia, state);
+	if(state == IEClean){
+		toload = scachemiss(ia->addr);
+	}else{
+		assert(state == IEDirty);
+		if(as != nil){
+			if(icache.as.aa > as->aa)
+				fprint(2, "%T insertscore: aa moving backward: %#llux -> %#llux\n", icache.as.aa, as->aa);
+			icache.as = *as;
+		}else
+			fprint(2, "%T insertscore IEDirty without as; called from %#p\n",
+				getcallerpc(&score));
+	}
+	qunlock(&icache.lock);
+
+	if(toload != nil){
+		/* scachemiss returned the slot locked */
+		scacheload(toload);
+		qunlock(&toload->lock);
+	}
+
+	if(icache.ndirty >= icache.maxdirty)
+		kickicache();
+
+	/*
+	 * It's okay not to do this under icache.lock.
+	 * Calling insertscore only happens when we hold
+	 * the lump, meaning any searches for this block
+	 * will hit in the lump cache until after we return.
+	 */
+	if(state == IEDirty)
+		markbloomfilter(mainindex->bloom, score);
+
+	return 0;
+}
+
+/*
+ * Find the index address for score/type: first in the cache,
+ * then with loadientry on the main index, caching the answer as
+ * a clean entry.  Returns 0 with *ia filled in, or -1 if
+ * loadientry fails.
+ */
+int
+lookupscore(u8int score[VtScoreSize], int type, IAddr *ia)
+{
+	int ms, ret;
+	IEntry d;
+
+	if(icachelookup(score, type, ia) >= 0){
+		addstat(StatIcacheRead, 1);
+		return 0;
+	}
+
+	ms = msec();
+	addstat(StatIcacheFill, 1);
+	ret = -1;
+	if(loadientry(mainindex, score, type, &d) >= 0){
+		ret = 0;
+		insertscore(score, &d.ia, IEClean, nil);
+		*ia = d.ia;
+	}
+	addstat2(StatIcacheRead, 1, StatIcacheReadTime, msec() - ms);
+	return ret;
+}
+
+/*
+ * Return the top bits bits of the first four bytes of score sc,
+ * read as a big-endian 32-bit number.  bits >= 32 yields the
+ * whole number; bits <= 0 yields 0, the only bucket of a
+ * one-entry table.
+ */
+u32int
+hashbits(u8int *sc, int bits)
+{
+	u32int v;
+
+	/* a shift by 32 or more is undefined, so answer directly */
+	if(bits <= 0)
+		return 0;
+	/* widen before shifting: a promoted int would shift into its sign bit */
+	v = ((u32int)sc[0] << 24) | ((u32int)sc[1] << 16) | ((u32int)sc[2] << 8) | (u32int)sc[3];
+	if(bits < 32)
+		v >>= (32 - bits);
+	return v;
+}
+
+ulong
+icachedirtyfrac(void) /* dirty share of the cache, scaled so that a fully dirty cache gives IcacheFrac */
+{
+ return (vlong)icache.ndirty*IcacheFrac / icache.nentries; /* widened to vlong before multiplying */
+}
+
+/*
+ * Return a singly-linked list (through nextdirty) of the dirty
+ * index entries whose 32-bit hash lies in [lo, hi] and whose
+ * address is <= limit.  If there are none, flush the disk cache.
+ */
+IEntry*
+icachedirty(u32int lo, u32int hi, u64int limit)
+{
+	u32int h;
+	IEntry *ie, *dirty;
+
+	dirty = nil;
+	trace(TraceProc, "icachedirty enter");
+	qlock(&icache.lock);
+	for(ie = icache.dirty.next; ie != &icache.dirty; ie=ie->next){
+		if(ie->state != IEDirty || ie->ia.addr > limit)
+			continue;
+		h = hashbits(ie->score, 32);
+		if(h < lo || h > hi)
+			continue;
+		ie->nextdirty = dirty;
+		dirty = ie;
+	}
+	qunlock(&icache.lock);
+	trace(TraceProc, "icachedirty exit");
+	if(dirty == nil)
+		flushdcache();
+	return dirty;
+}
+
+/* Return a copy of icache.as, taken under icache.lock. */
+AState
+icachestate(void)
+{
+	AState snapshot;
+
+	qlock(&icache.lock);
+	snapshot = icache.as;
+	qunlock(&icache.lock);
+	return snapshot;
+}
+
+/*
+ * The entries on the singly-linked, nil-terminated list ie
+ * (linked through nextdirty) have been written to disk: move
+ * each to the clean list.  Everyone sleeping on icache.full is
+ * woken, even when the list is empty.
+ */
+void
+icacheclean(IEntry *ie)
+{
+	IEntry *next;
+
+	trace(TraceProc, "icacheclean enter");
+	qlock(&icache.lock);
+	while(ie != nil){
+		assert(ie->state == IEDirty);
+		next = ie->nextdirty;
+		ie->nextdirty = nil;
+		popout(ie);	/* from icache.dirty */
+		icache.ndirty--;
+		ie->state = IEClean;
+		pushfirst(&icache.clean, ie);
+		ie = next;
+	}
+	setstat(StatIcacheDirty, icache.ndirty);
+	rwakeupall(&icache.full);
+	qunlock(&icache.lock);
+	trace(TraceProc, "icacheclean exit");
+}
+
+/*
+ * Drop everything that can be dropped: every clean entry goes
+ * back on the free list and every arena summary slot is
+ * cleared.  Dirty entries are left alone.
+ */
+void
+emptyicache(void)
+{
+	int i;
+	IEntry *ie;
+	ISum *s;
+
+	qlock(&icache.lock);
+	for(;;){
+		ie = evictlru();
+		if(ie == nil)
+			break;
+		pushfirst(&icache.free, ie);
+	}
+	for(i=0; i<icache.nsum; i++){
+		s = icache.sum[i];
+		qlock(&s->lock);
+		sumclear(s);
+		qunlock(&s->lock);
+	}
+	qunlock(&icache.lock);
+}
+
diff --git a/sys/src/cmd/venti/srv/icachewrite.c b/sys/src/cmd/venti/srv/icachewrite.c
new file mode 100755
index 000000000..e1406ef15
--- /dev/null
+++ b/sys/src/cmd/venti/srv/icachewrite.c
@@ -0,0 +1,358 @@
+/*
+ * Write the dirty icache entries to disk. Random seeks are
+ * so expensive that it makes sense to wait until we have
+ * a lot and then just make a sequential pass over the disk.
+ */
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+static void icachewriteproc(void*);
+static void icachewritecoord(void*);
+static IEntry *iesort(IEntry*);
+
+int icachesleeptime = 1000; /* milliseconds */
+int minicachesleeptime = 0;
+
+enum
+{
+ Bufsize = 8*1024*1024 /* nextchunk ends a chunk before it would span this many bytes; the write buffer is this plus one block */
+};
+
+typedef struct IWrite IWrite;
+struct IWrite
+{
+ Round round; /* kick/wait state shared by kickicache, flushicache and icachewritecoord */
+ AState as; /* arena state captured by icachewritecoord for the current flush; its aa bounds icachedirty */
+};
+
+static IWrite iwrite;
+
+/*
+ * Start the index cache writer: one icachewriteproc per index
+ * section, each with its own request and completion channel,
+ * plus the coordinator and the delayed-kick proc for the round.
+ */
+void
+initicachewrite(void)
+{
+	int i;
+	Index *ix;
+	ISect *is;
+
+	initround(&iwrite.round, "icache", 120*60*1000);
+	ix = mainindex;
+	for(i=0; i<ix->nsects; i++){
+		is = ix->sects[i];
+		is->writechan = chancreate(sizeof(ulong), 1);
+		is->writedonechan = chancreate(sizeof(ulong), 1);
+		vtproc(icachewriteproc, is);
+	}
+	vtproc(icachewritecoord, nil);
+	vtproc(delaykickroundproc, &iwrite.round);
+}
+
+/*
+ * Return the address within section is of the index bucket
+ * that holds ie: bucket number is the 32-bit hash divided by
+ * ix->div, taken relative to the section's first bucket.
+ */
+static u64int
+ie2diskaddr(Index *ix, ISect *is, IEntry *ie)
+{
+	u64int bucket;
+
+	bucket = hashbits(ie->score, 32)/ix->div;
+	return is->blockbase + ((bucket - is->start) << is->blocklog);
+}
+
+/*
+ * Cut the leading run of entries off the sorted list *pie whose
+ * buckets start less than Bufsize bytes after the first entry's
+ * bucket.  Returns that run, nil-terminated; *pie is left at the
+ * rest of the list, *paddr is the address of the first bucket
+ * and *pnbuf the number of bytes through the end of the last
+ * bucket in the run.
+ */
+static IEntry*
+nextchunk(Index *ix, ISect *is, IEntry **pie, u64int *paddr, uint *pnbuf)
+{
+	u64int addr, naddr;
+	uint nbuf;
+	int bsize;
+	IEntry *first, *ie, **link;
+
+	bsize = 1<<is->blocklog;
+	first = *pie;
+	addr = ie2diskaddr(ix, is, first);
+	nbuf = 0;
+	link = &first->nextdirty;
+	while((ie = *link) != nil){
+		naddr = ie2diskaddr(ix, is, ie);
+		if(naddr - addr >= Bufsize)
+			break;
+		nbuf = naddr - addr;
+		link = &ie->nextdirty;
+	}
+	nbuf += bsize;	/* include the last bucket itself */
+
+	*link = nil;
+	*pie = ie;
+	*paddr = addr;
+	*pnbuf = nbuf;
+	return first;
+}
+
+/*
+ * Write the dirty index entries belonging to section is to disk.
+ *
+ * The entries are collected with icachedirty, sorted, and cut
+ * into chunks by nextchunk.  For each chunk the bucket range is
+ * read into buf, every entry is merged into its bucket, and the
+ * range is written back; the disk cache copies of those blocks
+ * are refreshed and the entries are marked clean.
+ *
+ * buf must hold Bufsize plus one block.
+ * Returns 0, or -1 if any read, write or bucket update failed;
+ * entries that could not be written stay dirty.
+ */
+static int
+icachewritesect(Index *ix, ISect *is, u8int *buf)
+{
+	int err, i, werr, h, bsize, t;
+	u32int lo, hi;
+	u64int addr, naddr;
+	uint nbuf, off;
+	DBlock *b;
+	IBucket ib;
+	IEntry *ie, *iedirty, **l, *chunk;
+
+	/* range of 32-bit hash values stored in this section */
+	lo = is->start * ix->div;
+	if(TWID32/ix->div < is->stop)
+		hi = TWID32;
+	else
+		hi = is->stop * ix->div - 1;
+
+	trace(TraceProc, "icachewritesect enter %ud %ud %llud",
+		lo, hi, iwrite.as.aa);
+
+	iedirty = icachedirty(lo, hi, iwrite.as.aa);
+	iedirty = iesort(iedirty);
+	bsize = 1 << is->blocklog;
+	err = 0;
+
+	while(iedirty){
+		/* pace the writes: wait while writing is suspended, then sleep between chunks */
+		disksched();
+		while((t = icachesleeptime) == SleepForever){
+			sleep(1000);
+			disksched();
+		}
+		if(t < minicachesleeptime)
+			t = minicachesleeptime;
+		if(t > 0)
+			sleep(t);
+		trace(TraceProc, "icachewritesect nextchunk");
+		chunk = nextchunk(ix, is, &iedirty, &addr, &nbuf);
+
+		trace(TraceProc, "icachewritesect readpart 0x%llux+0x%ux",
+			addr, nbuf);
+		if(readpart(is->part, addr, buf, nbuf) < 0){
+			fprint(2, "%s: part %s addr 0x%llux: icachewritesect "
+				"readpart: %r\n", argv0, is->part->name, addr);
+			err = -1;
+			continue;
+		}
+		trace(TraceProc, "icachewritesect updatebuf");
+		addstat(StatIsectReadBytes, nbuf);
+		addstat(StatIsectRead, 1);
+
+		/* l always points at the link that holds ie, so skipit can unlink it */
+		for(l=&chunk; (ie=*l)!=nil; l=&ie->nextdirty){
+again:
+			naddr = ie2diskaddr(ix, is, ie);
+			off = naddr - addr;
+			if(off+bsize > nbuf){
+				fprint(2, "%s: whoops! addr=0x%llux nbuf=%ud "
+					"addr+nbuf=0x%llux naddr=0x%llux\n",
+					argv0, addr, nbuf, addr+nbuf, naddr);
+				assert(off+bsize <= nbuf);
+			}
+			unpackibucket(&ib, buf+off, is->bucketmagic);
+			if(okibucket(&ib, is) < 0){
+				fprint(2, "%s: bad bucket XXX\n", argv0);
+				goto skipit;
+			}
+			trace(TraceProc, "icachewritesect add %V at 0x%llux",
+				ie->score, naddr);
+			h = bucklook(ie->score, ie->ia.type, ib.data, ib.n);
+			if(h & 1){
+				/* low bit set: the entry is already in the bucket; overwrite it */
+				h ^= 1;
+				packientry(ie, &ib.data[h]);
+			}else if(ib.n < is->buckmax){
+				/* open a gap at offset h and insert */
+				memmove(&ib.data[h + IEntrySize], &ib.data[h],
+					ib.n*IEntrySize - h);
+				ib.n++;
+				packientry(ie, &ib.data[h]);
+			}else{
+				fprint(2, "%s: bucket overflow XXX\n", argv0);
+skipit:
+				/* drop ie from the chunk so it is not marked clean */
+				err = -1;
+				*l = ie->nextdirty;
+				ie = *l;
+				if(ie)
+					goto again;
+				else
+					break;
+			}
+			packibucket(&ib, buf+off, is->bucketmagic);
+		}
+
+		diskaccess(1);
+
+		trace(TraceProc, "icachewritesect writepart 0x%llux+0x%ux",
+			addr, nbuf);
+		werr = 0;
+		if(writepart(is->part, addr, buf, nbuf) < 0 || flushpart(is->part) < 0)
+			werr = -1;
+
+		/* refresh any copies of these blocks held in the disk cache */
+		for(i=0; i<nbuf; i+=bsize){
+			if((b = _getdblock(is->part, addr+i, ORDWR, 0)) != nil){
+				memmove(b->data, buf+i, bsize);
+				putdblock(b);
+			}
+		}
+
+		if(werr < 0){
+			fprint(2, "%s: part %s addr 0x%llux: icachewritesect "
+				"writepart: %r\n", argv0, is->part->name, addr);
+			err = -1;
+			continue;
+		}
+
+		addstat(StatIsectWriteBytes, nbuf);
+		addstat(StatIsectWrite, 1);
+		icacheclean(chunk);
+	}
+
+	trace(TraceProc, "icachewritesect done");
+	return err;
+}
+
+/*
+ * Writer proc for one index section (v is the ISect).  Each
+ * message on writechan triggers one icachewritesect pass; the
+ * result is sent back on writedonechan.  Never returns.
+ */
+static void
+icachewriteproc(void *v)
+{
+	int ret;
+	uint bsize;
+	uintptr mask;
+	ISect *is;
+	Index *ix;
+	u8int *buf;
+
+	ix = mainindex;
+	is = v;
+	threadsetname("icachewriteproc:%s", is->part->name);
+
+	/* one extra block of slack so buf can be rounded up to a block boundary */
+	bsize = 1<<is->blocklog;
+	mask = (uintptr)(bsize-1);
+	buf = emalloc(Bufsize+bsize);
+	buf = (u8int*)(((uintptr)buf+bsize-1) & ~mask);
+
+	for(;;){
+		trace(TraceProc, "icachewriteproc recv");
+		recv(is->writechan, 0);
+		trace(TraceWork, "start");
+		ret = icachewritesect(ix, is, buf);
+		trace(TraceProc, "icachewriteproc send");
+		trace(TraceWork, "finish");
+		sendul(is->writedonechan, ret);
+	}
+}
+
+/*
+ * Coordinator proc.  On every kick of iwrite.round it snapshots
+ * the arena state; if that has not moved since the last flush it
+ * only kicks the disk cache.  Otherwise it starts every section
+ * writer (and the bloom filter writer, if any), waits for all of
+ * them, and records the new tail state when none reported an
+ * error.  Sleepers on the index cache are woken either way.
+ * Never returns.
+ */
+static void
+icachewritecoord(void *v)
+{
+	int i, err;
+	Index *ix;
+	AState as;
+
+	USED(v);
+
+	threadsetname("icachewritecoord");
+
+	ix = mainindex;
+	iwrite.as = icachestate();
+
+	for(;;){
+		trace(TraceProc, "icachewritecoord sleep");
+		waitforkick(&iwrite.round);
+		trace(TraceWork, "start");
+		as = icachestate();
+		if(as.arena==iwrite.as.arena && as.aa==iwrite.as.aa){
+			/* will not be able to do anything more than last flush - kick disk */
+			trace(TraceProc, "icachewritecoord kick dcache");
+			kickdcache();
+			trace(TraceProc, "icachewritecoord kicked dcache");
+			/* won't do anything; don't bother rewriting bloom filter */
+		}else{
+			iwrite.as = as;
+
+			trace(TraceProc, "icachewritecoord start flush");
+			if(iwrite.as.arena){
+				for(i=0; i<ix->nsects; i++)
+					send(ix->sects[i]->writechan, 0);
+				if(ix->bloom)
+					send(ix->bloom->writechan, 0);
+
+				err = 0;
+				for(i=0; i<ix->nsects; i++)
+					err |= recvul(ix->sects[i]->writedonechan);
+				if(ix->bloom)
+					err |= recvul(ix->bloom->writedonechan);
+
+				trace(TraceProc, "icachewritecoord donewrite err=%d", err);
+				if(err == 0)
+					setatailstate(&iwrite.as);
+			}
+		}
+		icacheclean(nil);	/* wake up anyone waiting */
+		trace(TraceWork, "finish");
+		addstat(StatIcacheFlush, 1);
+	}
+}
+
+void
+flushicache(void) /* kick the write round with a second argument of 1 -- presumably "wait until it completes"; confirm in round.c */
+{
+ trace(TraceProc, "flushicache enter");
+ kickround(&iwrite.round, 1);
+ trace(TraceProc, "flushicache exit");
+}
+
+void
+kickicache(void) /* kick the write round with a second argument of 0 -- presumably without waiting; confirm in round.c */
+{
+ kickround(&iwrite.round, 0);
+}
+
+void
+delaykickicache(void) /* request a delayed kick of the write round; icacheinsert calls this for every dirty insert */
+{
+ delaykickround(&iwrite.round);
+}
+
+/*
+ * Merge sort the nil-terminated list ie (linked through
+ * nextdirty) into ascending score order and return its new head.
+ * On equal scores the entry from the front half comes first.
+ */
+static IEntry*
+iesort(IEntry *ie)
+{
+	IEntry **tail;
+	IEntry *slow, *fast, *front, *back, *sorted;
+
+	if(ie == nil || ie->nextdirty == nil)
+		return ie;
+
+	/* split: fast starts two ahead and moves two links for each one of slow */
+	slow = ie;
+	fast = ie->nextdirty->nextdirty;
+	while(fast != nil){
+		slow = slow->nextdirty;
+		fast = fast->nextdirty;
+		if(fast != nil)
+			fast = fast->nextdirty;
+	}
+	back = slow->nextdirty;
+	slow->nextdirty = nil;
+
+	/* sort the halves */
+	front = iesort(ie);
+	back = iesort(back);
+
+	/* merge while both halves have entries */
+	sorted = nil;
+	tail = &sorted;
+	while(front != nil && back != nil){
+		if(scorecmp(front->score, back->score) > 0){
+			*tail = back;
+			tail = &back->nextdirty;
+			back = back->nextdirty;
+		}else{
+			*tail = front;
+			tail = &front->nextdirty;
+			front = front->nextdirty;
+		}
+	}
+	/* whichever half is left is already sorted and nil-terminated */
+	if(front != nil)
+		*tail = front;
+	else
+		*tail = back;
+	return sorted;
+}
+
diff --git a/sys/src/cmd/venti/srv/ifile.c b/sys/src/cmd/venti/srv/ifile.c
new file mode 100755
index 000000000..36d96b941
--- /dev/null
+++ b/sys/src/cmd/venti/srv/ifile.c
@@ -0,0 +1,149 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+/* magic that must start a config stored inside a venti partition (checked by readifile) */
+static char vcmagic[] = "venti config\n";
+
+enum {
+ Maxconfig = 8 * 1024, /* bytes of config read from a partition; also the size limit for a plain config file */
+ Maglen = sizeof vcmagic - 1, /* length of vcmagic without its terminating NUL */
+};
+
+/*
+ * load the venti configuration called name into f.
+ *
+ * if name is larger than PartBlank it is treated as a venti partition:
+ * the config is the Maxconfig bytes ending at offset PartBlank and must
+ * begin with vcmagic, which is skipped.  a smaller name must be at most
+ * Maxconfig bytes and is read whole with readfile.
+ *
+ * returns 0 on success, -1 on failure (error set by seterr or the failing
+ * helper).  on success f->name aliases name (it is not copied) and f->b
+ * owns the buffer; release it with freeifile.
+ */
+int
+readifile(IFile *f, char *name)
+{
+	Part *p;
+	ZBlock *b;
+	u8int *z;
+
+	p = initpart(name, OREAD);
+	if(p == nil)
+		return -1;
+	b = alloczblock(Maxconfig+1, 1, 0);
+	if(b == nil){
+		/* format the error before freepart can disturb the error string */
+		seterr(EOk, "can't alloc for %s: %r", name);
+		freepart(p);
+		return -1;
+	}
+	if(p->size > PartBlank){
+		/*
+		 * this is likely a real venti partition, in which case we're
+		 * looking for the config file stored as 8k at end of PartBlank.
+		 */
+		if(readpart(p, PartBlank-Maxconfig, b->data, Maxconfig) < 0){
+			seterr(EOk, "can't read %s: %r", name);
+			freezblock(b);
+			freepart(p);
+			return -1;
+		}
+		/* the extra byte allocated above guarantees termination */
+		b->data[Maxconfig] = '\0';
+		if(memcmp(b->data, vcmagic, Maglen) != 0){
+			seterr(EOk, "bad venti config magic in %s", name);
+			freezblock(b);
+			freepart(p);
+			return -1;
+		}
+		/*
+		 * if we change b->data+b->_size, freezblock
+		 * will blow an assertion, so don't: data moves forward
+		 * and _size shrinks by the same amount.
+		 */
+		b->data += Maglen;
+		b->_size -= Maglen;
+		b->len -= Maglen;
+		/* the config ends at the first NUL */
+		z = memchr(b->data, '\0', b->len);
+		if(z)
+			b->len = z - b->data;
+	}else if(p->size > Maxconfig){
+		seterr(EOk, "config file is too large");
+		freepart(p);
+		freezblock(b);
+		return -1;
+	}else{
+		/* plain file: drop our buffer and let readfile supply one */
+		freezblock(b);
+		b = readfile(name);
+		if(b == nil){
+			freepart(p);
+			return -1;
+		}
+	}
+	freepart(p);
+	f->name = name;
+	f->b = b;
+	f->pos = 0;
+	return 0;
+}
+
+/*
+ * release the buffer held by f and reset the read position.
+ * f itself and f->name are not freed.
+ */
+void
+freeifile(IFile *f)
+{
+ freezblock(f->b);
+ f->b = nil;
+ f->pos = 0;
+}
+
+/*
+ * fill f with size bytes read from part starting at offset start.
+ * returns 0 on success, -1 if the buffer can't be allocated or the
+ * read fails.  on success f->name aliases part->name and f->b owns
+ * the buffer (release with freeifile).
+ */
+int
+partifile(IFile *f, Part *part, u64int start, u32int size)
+{
+ ZBlock *b;
+
+ b = alloczblock(size, 0, part->blocksize);
+ if(b == nil)
+ return -1;
+ if(readpart(part, start, b->data, size) < 0){
+ seterr(EAdmin, "can't read %s: %r", part->name);
+ freezblock(b);
+ return -1;
+ }
+ f->name = part->name;
+ f->b = b;
+ f->pos = 0;
+ return 0;
+}
+
+/*
+ * return the next non-blank input line,
+ * stripped of leading white space and with # comments eliminated.
+ *
+ * the line is terminated in place: the newline (and any '#') in the
+ * buffer is overwritten with '\0', and the returned pointer points
+ * into f->b->data.  trailing white space is not removed.
+ * returns nil when no newline remains after f->pos, so a final
+ * line without a newline is never returned.
+ */
+char*
+ifileline(IFile *f)
+{
+ char *s, *e, *t;
+ int c;
+
+ for(;;){
+ s = (char*)&f->b->data[f->pos];
+ e = memchr(s, '\n', f->b->len - f->pos);
+ if(e == nil)
+ return nil;
+ *e++ = '\0';
+ /* advance past this line before deciding whether to return it */
+ f->pos = e - (char*)f->b->data;
+ t = strchr(s, '#');
+ if(t != nil)
+ *t = '\0';
+ /* skip leading blanks; an all-blank line falls through to the next one */
+ for(; c = *s; s++)
+ if(c != ' ' && c != '\t' && c != '\r')
+ return s;
+ }
+}
+
+/*
+ * read the next line of f as a name and copy it to dst with namecp.
+ * returns -1 if there is no line or it is ANameSize characters or longer,
+ * 0 otherwise.
+ * NOTE(review): dst presumably must hold at least ANameSize bytes -- confirm with namecp.
+ */
+int
+ifilename(IFile *f, char *dst)
+{
+ char *s;
+
+ s = ifileline(f);
+ if(s == nil || strlen(s) >= ANameSize)
+ return -1;
+ namecp(dst, s);
+ return 0;
+}
+
+/*
+ * read the next line of f and convert it with stru32int into *r.
+ * returns -1 if there is no line, otherwise whatever stru32int returns.
+ */
+int
+ifileu32int(IFile *f, u32int *r)
+{
+ char *s;
+
+ s = ifileline(f);
+ if(s == nil)
+ return -1;
+ return stru32int(s, r);
+}
diff --git a/sys/src/cmd/venti/srv/index.c b/sys/src/cmd/venti/srv/index.c
new file mode 100755
index 000000000..9877893ec
--- /dev/null
+++ b/sys/src/cmd/venti/srv/index.c
@@ -0,0 +1,866 @@
+/*
+ * Index, mapping scores to log positions.
+ *
+ * The index is made up of some number of index sections, each of
+ * which is typically stored on a different disk. The blocks in all the
+ * index sections are logically numbered, with each index section
+ * responsible for a range of blocks. Blocks are typically 8kB.
+ *
+ * The N index blocks are treated as a giant hash table. The top 32 bits
+ * of score are used as the key for a lookup. Each index block holds
+ * one hash bucket, which is responsible for ceil(2^32 / N) of the key space.
+ *
+ * The index is sized so that a particular bucket is extraordinarily
+ * unlikely to overflow: assuming compressed data blocks are 4kB
+ * on disk, and assuming each block has a 40 byte index entry,
+ * the index data will be 1% of the total data. Since scores are essentially
+ * random, all buckets should be about the same fullness.
+ * A factor of 5 gives us a wide comfort boundary to account for
+ * random variation. So the index disk space should be 5% of the arena disk space.
+ */
+
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+static int initindex1(Index*);
+static ISect *initisect1(ISect *is);
+
+/* k shifted right by 32-d (the top d bits of a 32-bit k); yields 0 for d == 0 rather than shifting by 32 */
+#define KEY(k,d) ((d) ? (k)>>(32-(d)) : 0)
+
+static char IndexMagic[] = "venti index configuration";
+
+/*
+ * build the in-memory Index called name from n index sections.
+ *
+ * the index configuration is parsed from the config table stored in
+ * sects[0]; every section is then checked against it (index name, block
+ * size, table size, section name, bucket range, and that the ranges are
+ * contiguous starting at 0).  finally the arena map is resolved with
+ * maparenas.
+ *
+ * returns the new Index, or nil with the error set.  every failure after
+ * the Index is allocated releases it with freeindex; once ix->sects has
+ * been set that also frees the sections and the sects array.
+ */
+Index*
+initindex(char *name, ISect **sects, int n)
+{
+	IFile f;
+	Index *ix;
+	ISect *is;
+	u32int last, blocksize, tabsize;
+	int i;
+
+	if(n <= 0){
+fprint(2, "bad n\n");
+		seterr(EOk, "no index sections to initialize index");
+		return nil;
+	}
+	ix = MKZ(Index);
+	if(ix == nil){
+fprint(2, "no mem\n");
+		seterr(EOk, "can't initialize index: out of memory");
+		return nil;
+	}
+
+	tabsize = sects[0]->tabsize;
+	if(partifile(&f, sects[0]->part, sects[0]->tabbase, tabsize) < 0){
+		freeindex(ix);
+		return nil;
+	}
+	if(parseindex(&f, ix) < 0){
+		freeifile(&f);
+		freeindex(ix);
+		return nil;
+	}
+	freeifile(&f);
+	if(namecmp(ix->name, name) != 0){
+		seterr(ECorrupt, "mismatched index name: found %s expected %s", ix->name, name);
+		freeindex(ix);
+		return nil;
+	}
+	if(ix->nsects != n){
+		/* "found" is what the stored configuration says, as in the name check above */
+		seterr(ECorrupt, "mismatched number index sections: found %d expected %d", ix->nsects, n);
+		freeindex(ix);
+		return nil;
+	}
+	ix->sects = sects;
+	last = 0;
+	blocksize = ix->blocksize;
+	for(i = 0; i < ix->nsects; i++){
+		is = sects[i];
+		if(namecmp(ix->name, is->index) != 0
+		|| is->blocksize != blocksize
+		|| is->tabsize != tabsize
+		|| namecmp(is->name, ix->smap[i].name) != 0
+		|| is->start != ix->smap[i].start
+		|| is->stop != ix->smap[i].stop
+		|| last != is->start
+		|| is->start > is->stop){
+			seterr(ECorrupt, "inconsistent index sections in %s", ix->name);
+			freeindex(ix);
+			return nil;
+		}
+		last = is->stop;
+	}
+	ix->tabsize = tabsize;
+	/* the last section's stop is the total number of buckets */
+	ix->buckets = last;
+
+	if(initindex1(ix) < 0){
+		freeindex(ix);
+		return nil;
+	}
+
+	ix->arenas = MKNZ(Arena*, ix->narenas);
+	if(maparenas(ix->amap, ix->arenas, ix->narenas, ix->name) < 0){
+		freeindex(ix);
+		return nil;
+	}
+
+	return ix;
+}
+
+/*
+ * compute ix->div, the amount of 32-bit key space per bucket, as
+ * ceil(2^32 / ix->buckets), then recompute the bucket count from div
+ * and fail if it does not reproduce ix->buckets.
+ * returns 0 on success, -1 with the error set otherwise.
+ * NOTE(review): ix->buckets == 0 would divide by zero here; callers
+ * appear to rely on earlier checks -- confirm.
+ */
+static int
+initindex1(Index *ix)
+{
+ u32int buckets;
+
+ ix->div = (((u64int)1 << 32) + ix->buckets - 1) / ix->buckets;
+ buckets = (((u64int)1 << 32) - 1) / ix->div + 1;
+ if(buckets != ix->buckets){
+ seterr(ECorrupt, "inconsistent math for divisor and buckets in %s", ix->name);
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * write the index configuration back to disk: format it with outputindex
+ * into a tabsize buffer, write that buffer to the config table (tabbase)
+ * of every section and flush, then rewrite every section header with
+ * wbisect.  returns 0 on success, -1 on the first failure.
+ */
+int
+wbindex(Index *ix)
+{
+ Fmt f;
+ ZBlock *b;
+ int i;
+
+ if(ix->nsects == 0){
+ seterr(EOk, "no sections in index %s", ix->name);
+ return -1;
+ }
+ b = alloczblock(ix->tabsize, 1, ix->blocksize);
+ if(b == nil){
+ seterr(EOk, "can't write index configuration: out of memory");
+ return -1;
+ }
+ fmtzbinit(&f, b);
+ if(outputindex(&f, ix) < 0){
+ seterr(EOk, "can't make index configuration: table storage too small %d", ix->tabsize);
+ freezblock(b);
+ return -1;
+ }
+ /* every section carries an identical copy of the configuration */
+ for(i = 0; i < ix->nsects; i++){
+ if(writepart(ix->sects[i]->part, ix->sects[i]->tabbase, b->data, ix->tabsize) < 0
+ || flushpart(ix->sects[i]->part) < 0){
+ seterr(EOk, "can't write index: %r");
+ freezblock(b);
+ return -1;
+ }
+ }
+ freezblock(b);
+
+ for(i = 0; i < ix->nsects; i++)
+ if(wbisect(ix->sects[i]) < 0)
+ return -1;
+
+ return 0;
+}
+
+/*
+ * format the index configuration into f; the inverse of parseindex.
+ *
+ * index: IndexMagic '\n' version '\n' name '\n' blocksize '\n' [V2: bitblocks '\n'] sections arenas
+ * version, blocksize: u32int
+ * name: max. ANameSize string
+ * sections, arenas: AMap
+ *
+ * NOTE(review): this code neither writes nor (in parseindex) reads the
+ * optional V2 bitblocks line mentioned above.
+ *
+ * returns 0 on success, -1 if any of the formatting calls fails.
+ */
+int
+outputindex(Fmt *f, Index *ix)
+{
+ if(fmtprint(f, "%s\n%ud\n%s\n%ud\n", IndexMagic, ix->version, ix->name, ix->blocksize) < 0
+ || outputamap(f, ix->smap, ix->nsects) < 0
+ || outputamap(f, ix->amap, ix->narenas) < 0)
+ return -1;
+ return 0;
+}
+
+/*
+ * parse an index configuration from f into ix: magic line, version
+ * (must equal IndexVersion), name, block size, then the section map
+ * and the arena map.
+ * returns 0 on success, -1 with the error set on failure.  on failure
+ * ix may be left partly filled in (for example smap set but amap not);
+ * nothing is freed here, so the caller must release ix.
+ */
+int
+parseindex(IFile *f, Index *ix)
+{
+ AMapN amn;
+ u32int v;
+ char *s;
+
+ /*
+ * magic
+ */
+ s = ifileline(f);
+ if(s == nil || strcmp(s, IndexMagic) != 0){
+ seterr(ECorrupt, "bad index magic for %s", f->name);
+ return -1;
+ }
+
+ /*
+ * version
+ */
+ if(ifileu32int(f, &v) < 0){
+ seterr(ECorrupt, "syntax error: bad version number in %s", f->name);
+ return -1;
+ }
+ ix->version = v;
+ if(ix->version != IndexVersion){
+ seterr(ECorrupt, "bad version number in %s", f->name);
+ return -1;
+ }
+
+ /*
+ * name
+ */
+ if(ifilename(f, ix->name) < 0){
+ seterr(ECorrupt, "syntax error: bad index name in %s", f->name);
+ return -1;
+ }
+
+ /*
+ * block size
+ */
+ if(ifileu32int(f, &v) < 0){
+ seterr(ECorrupt, "syntax error: bad block size number in %s", f->name);
+ return -1;
+ }
+ ix->blocksize = v;
+
+ /*
+ * section map
+ */
+ if(parseamap(f, &amn) < 0)
+ return -1;
+ ix->nsects = amn.n;
+ ix->smap = amn.map;
+
+ /*
+ * arena map
+ */
+ if(parseamap(f, &amn) < 0)
+ return -1;
+ ix->narenas = amn.n;
+ ix->amap = amn.map;
+
+ return 0;
+}
+
+/*
+ * initialize an entirely new index called name over the n sections
+ * in sects.
+ *
+ * all sections must share one block size and one config table size,
+ * have distinct names, and either be unassigned or already belong to
+ * this index with exactly the bucket range computed here.  the total
+ * number of blocks determines the divisor and the number of buckets
+ * actually used; unused blocks are spread across the sections.
+ *
+ * returns the new Index, or nil with the error set.  the Index refers
+ * to sects (it is not copied) and owns the freshly allocated section map.
+ * NOTE: sections handled before a failure in the assignment loop keep
+ * the start, stop and index name written to them.
+ */
+Index *
+newindex(char *name, ISect **sects, int n)
+{
+	Index *ix;
+	AMap *smap;
+	u64int nb;
+	u32int div, ub, xb, start, stop, blocksize, tabsize;
+	int i, j;
+
+	if(n < 1){
+		seterr(EOk, "creating index with no index sections");
+		return nil;
+	}
+
+	/*
+	 * compute the total buckets available in the index,
+	 * and the total buckets which are used.
+	 */
+	nb = 0;
+	blocksize = sects[0]->blocksize;
+	tabsize = sects[0]->tabsize;
+	for(i = 0; i < n; i++){
+		/*
+		 * allow index, start, and stop to be set if index is correct
+		 * and start and stop are what we would have picked.
+		 * this allows calling fmtindex to reformat the index after
+		 * replacing a bad index section with a freshly formatted one.
+		 * start and stop are checked below.
+		 */
+		if(sects[i]->index[0] != '\0' && strcmp(sects[i]->index, name) != 0){
+			seterr(EOk, "creating new index using non-empty section %s", sects[i]->name);
+			return nil;
+		}
+		if(blocksize != sects[i]->blocksize){
+			seterr(EOk, "mismatched block sizes in index sections");
+			return nil;
+		}
+		if(tabsize != sects[i]->tabsize){
+			seterr(EOk, "mismatched config table sizes in index sections");
+			return nil;
+		}
+		nb += sects[i]->blocks;
+	}
+
+	/*
+	 * check for duplicate names
+	 */
+	for(i = 0; i < n; i++){
+		for(j = i + 1; j < n; j++){
+			if(namecmp(sects[i]->name, sects[j]->name) == 0){
+				seterr(EOk, "duplicate section name %s for index %s", sects[i]->name, name);
+				return nil;
+			}
+		}
+	}
+
+	/* bucket numbers are 32 bits, so at most 2^32-1 blocks can be used */
+	if(nb >= ((u64int)1 << 32)){
+		fprint(2, "%s: index is 2^32 blocks or more; ignoring some of it\n",
+			argv0);
+		nb = ((u64int)1 << 32) - 1;
+	}
+
+	/* div = ceil(2^32 / nb): key space covered by each bucket */
+	div = (((u64int)1 << 32) + nb - 1) / nb;
+	if(div < 100){
+		fprint(2, "%s: index divisor %d too coarse; "
+			"index larger than needed, ignoring some of it\n",
+			argv0, div);
+		div = 100;
+		nb = (((u64int)1 << 32) - 1) / (100 - 1);
+	}
+	/* ub: buckets actually used with this divisor; xb: blocks left over */
+	ub = (((u64int)1 << 32) - 1) / div + 1;
+	if(ub > nb){
+		seterr(EBug, "index initialization math wrong");
+		return nil;
+	}
+	xb = nb - ub;
+
+	/*
+	 * initialize each of the index sections
+	 * and the section map table
+	 */
+	smap = MKNZ(AMap, n);
+	if(smap == nil){
+		seterr(EOk, "can't create new index: out of memory");
+		return nil;
+	}
+	start = 0;
+	for(i = 0; i < n; i++){
+		/* each section gives up an equal share of the unused blocks */
+		stop = start + sects[i]->blocks - xb / n;
+		if(i == n - 1)
+			stop = ub;
+
+		if(sects[i]->start != 0 || sects[i]->stop != 0)
+		if(sects[i]->start != start || sects[i]->stop != stop){
+			seterr(EOk, "creating new index using non-empty section %s", sects[i]->name);
+			free(smap);
+			return nil;
+		}
+
+		sects[i]->start = start;
+		sects[i]->stop = stop;
+		namecp(sects[i]->index, name);
+
+		smap[i].start = start;
+		smap[i].stop = stop;
+		namecp(smap[i].name, sects[i]->name);
+		start = stop;
+	}
+
+	/*
+	 * initialize the index itself
+	 */
+	ix = MKZ(Index);
+	if(ix == nil){
+		seterr(EOk, "can't create new index: out of memory");
+		free(smap);
+		return nil;
+	}
+	ix->version = IndexVersion;
+	namecp(ix->name, name);
+	ix->sects = sects;
+	ix->smap = smap;
+	ix->nsects = n;
+	ix->blocksize = blocksize;
+	ix->buckets = ub;
+	ix->tabsize = tabsize;
+	ix->div = div;
+
+	if(initindex1(ix) < 0){
+		/*
+		 * not freeindex: that would also free the caller's sections.
+		 */
+		free(smap);
+		free(ix);
+		return nil;
+	}
+
+	return ix;
+}
+
+/*
+ * read and validate the index section stored on part.
+ *
+ * the header is the HeadSize bytes at offset PartBlank; it is unpacked
+ * with unpackisect, must carry version ISectVersion1 or ISectVersion2,
+ * and is then completed and checked by initisect1.
+ *
+ * returns the new ISect (which refers to part), or nil with the error set.
+ */
+ISect*
+initisect(Part *part)
+{
+	ISect *is;
+	ZBlock *b;
+	int ok;
+
+	b = alloczblock(HeadSize, 0, 0);
+	if(b == nil){
+		seterr(EAdmin, "can't read index section header: %r");
+		return nil;
+	}
+	if(readpart(part, PartBlank, b->data, HeadSize) < 0){
+		seterr(EAdmin, "can't read index section header: %r");
+		freezblock(b);
+		return nil;
+	}
+
+	is = MKZ(ISect);
+	if(is == nil){
+		freezblock(b);
+		return nil;
+	}
+	is->part = part;
+	ok = unpackisect(is, b->data);
+	/* the raw header is no longer needed once unpacked */
+	freezblock(b);
+	if(ok < 0){
+		seterr(ECorrupt, "corrupted index section header: %r");
+		freeisect(is);
+		return nil;
+	}
+
+	if(is->version != ISectVersion1 && is->version != ISectVersion2){
+		seterr(EAdmin, "unknown index section version %d", is->version);
+		freeisect(is);
+		return nil;
+	}
+
+	/* initisect1 frees is itself on failure */
+	return initisect1(is);
+}
+
+/*
+ * create the in-memory description of a brand new index section called
+ * name on part, with the given version, block size and config table size.
+ * the bucket range starts out empty (start = stop = 0); version 2
+ * sections get a random non-zero bucketmagic.
+ * returns the section, or nil if allocation or initisect1 fails.
+ */
+ISect*
+newisect(Part *part, u32int vers, char *name, u32int blocksize, u32int tabsize)
+{
+	ISect *is;
+	u32int mask, tabbase;
+
+	is = MKZ(ISect);
+	if(is == nil)
+		return nil;
+
+	is->part = part;
+	is->version = vers;
+	is->blocksize = blocksize;
+	namecp(is->name, name);
+	is->start = 0;
+	is->stop = 0;
+
+	/*
+	 * layout: config table at the first block boundary after the
+	 * header, buckets at the first block boundary after the table.
+	 */
+	mask = blocksize - 1;
+	tabbase = (PartBlank + HeadSize + mask) & ~mask;
+	is->blockbase = (tabbase + tabsize + mask) & ~mask;
+	is->blocks = is->part->size / blocksize - is->blockbase / blocksize;
+
+	/* zero means no magic, so version 2 draws until it gets something else */
+	is->bucketmagic = 0;
+	if(is->version == ISectVersion2)
+		while(is->bucketmagic == 0)
+			is->bucketmagic = fastrand();
+
+	/* initisect1 returns nil (after freeing is) on failure */
+	return initisect1(is);
+}
+
+/*
+ * initialize the computed parameters for an index section
+ * (buckmax, blocklog, tabbase, tabsize) and validate its layout.
+ * returns is on success; on failure sets the error, frees is and
+ * returns nil.
+ */
+static ISect*
+initisect1(ISect *is)
+{
+ u64int v;
+
+ is->buckmax = (is->blocksize - IBucketSize) / IEntrySize;
+ is->blocklog = u64log2(is->blocksize);
+ if(is->blocksize != (1 << is->blocklog)){
+ seterr(ECorrupt, "illegal non-power-of-2 bucket size %d\n", is->blocksize);
+ freeisect(is);
+ return nil;
+ }
+ partblocksize(is->part, is->blocksize);
+ /* config table starts at the first block boundary after the header */
+ is->tabbase = (PartBlank + HeadSize + is->blocksize - 1) & ~(is->blocksize - 1);
+ if(is->tabbase >= is->blockbase){
+ seterr(ECorrupt, "index section config table overlaps bucket storage");
+ freeisect(is);
+ return nil;
+ }
+ is->tabsize = is->blockbase - is->tabbase;
+ /* v: partition size rounded down to a whole number of blocks */
+ v = is->part->size & ~(u64int)(is->blocksize - 1);
+ if(is->blockbase + (u64int)is->blocks * is->blocksize != v){
+ /* only records the error; the section is still accepted (rejection is commented out) */
+ seterr(ECorrupt, "invalid blocks in index section %s", is->name);
+ /* ZZZ what to do?
+ freeisect(is);
+ return nil;
+ */
+ }
+
+ /*
+ * NOTE(review): start and stop look unsigned, so start > stop would
+ * wrap the subtraction and be reported by this first check -- confirm.
+ */
+ if(is->stop - is->start > is->blocks){
+ seterr(ECorrupt, "index section overflows available space");
+ freeisect(is);
+ return nil;
+ }
+ if(is->start > is->stop){
+ seterr(ECorrupt, "invalid index section range");
+ freeisect(is);
+ return nil;
+ }
+
+ return is;
+}
+
+/*
+ * write the header of index section is back to its partition:
+ * pack it with packisect into a HeadSize buffer, write that at
+ * offset PartBlank and flush.
+ * returns 0 on success, -1 on failure; no error is set when the
+ * buffer allocation itself fails.
+ */
+int
+wbisect(ISect *is)
+{
+ ZBlock *b;
+
+ b = alloczblock(HeadSize, 1, 0);
+ if(b == nil){
+ /* ZZZ set error? */
+ return -1;
+ }
+
+ if(packisect(is, b->data) < 0){
+ seterr(ECorrupt, "can't make index section header: %r");
+ freezblock(b);
+ return -1;
+ }
+ if(writepart(is->part, PartBlank, b->data, HeadSize) < 0 || flushpart(is->part) < 0){
+ seterr(EAdmin, "can't write index section header: %r");
+ freezblock(b);
+ return -1;
+ }
+ freezblock(b);
+
+ return 0;
+}
+
+/*
+ * free an index section descriptor; nil is ignored.
+ * is->part is not released here.
+ */
+void
+freeisect(ISect *is)
+{
+ if(is == nil)
+ return;
+ free(is);
+}
+
+/*
+ * free an Index and what hangs off it: the arena map, the arena
+ * pointer array, the section map and, when ix->sects is set, every
+ * section in it plus the sects array itself.  nil is ignored.
+ * the arenas themselves are not freed here.
+ */
+void
+freeindex(Index *ix)
+{
+ int i;
+
+ if(ix == nil)
+ return;
+ free(ix->amap);
+ free(ix->arenas);
+ if(ix->sects)
+ for(i = 0; i < ix->nsects; i++)
+ freeisect(ix->sects[i]);
+ free(ix->sects);
+ free(ix->smap);
+ free(ix);
+}
+
+/*
+ * write a clump to an available arena in the index
+ * and return the address of the clump within the index.
+ *
+ * arenas are tried in order starting at ix->mapalloc; the first one
+ * that accepts the clump becomes the new mapalloc.  the clump's score
+ * is then entered with insertscore as IEDirty together with the arena
+ * state.  ix->writing is held for the whole operation.
+ * returns TWID64, with the error set, if no arena accepts the clump.
+ *
+ * ZZZ question: should this distinguish between an arena
+ * filling up and real errors writing the clump?
+ */
+u64int
+writeiclump(Index *ix, Clump *c, u8int *clbuf)
+{
+ u64int a;
+ int i;
+ IAddr ia;
+ AState as;
+
+ trace(TraceLump, "writeiclump enter");
+ qlock(&ix->writing);
+ for(i = ix->mapalloc; i < ix->narenas; i++){
+ a = writeaclump(ix->arenas[i], c, clbuf);
+ if(a != TWID64){
+ ix->mapalloc = i;
+ /* a is relative to the arena; the index address adds the arena's start */
+ ia.addr = ix->amap[i].start + a;
+ ia.type = c->info.type;
+ ia.size = c->info.uncsize;
+ /* on-disk size (header plus data) rounded up to whole arena blocks */
+ ia.blocks = (c->info.size + ClumpSize + (1<<ABlockLog) - 1) >> ABlockLog;
+ as.arena = ix->arenas[i];
+ as.aa = ia.addr;
+ as.stats = as.arena->memstats;
+ insertscore(c->info.score, &ia, IEDirty, &as);
+ qunlock(&ix->writing);
+ trace(TraceLump, "writeiclump exit");
+ return ia.addr;
+ }
+ }
+ qunlock(&ix->writing);
+
+ seterr(EAdmin, "no space left in arenas");
+ trace(TraceLump, "writeiclump failed");
+ return TWID64;
+}
+
+/*
+ * convert an index address to the arena containing it and
+ * an address relative to that arena.
+ *
+ * binary searches ix->amap for the last entry whose start is <= a
+ * (entry 0 is the fallback, so the search begins at 1), stores
+ * a - start in *aa and returns the matching arena.
+ * returns nil with the error set if a lies beyond that entry's stop
+ * or the arena pointer is nil.
+ * NOTE(review): the bound check is a > stop, so a == stop is accepted;
+ * confirm whether stop is meant to be inclusive.
+ */
+Arena*
+amapitoa(Index *ix, u64int a, u64int *aa)
+{
+ int i, r, l, m;
+
+ l = 1;
+ r = ix->narenas - 1;
+ while(l <= r){
+ m = (r + l) / 2;
+ if(ix->amap[m].start <= a)
+ l = m + 1;
+ else
+ r = m - 1;
+ }
+ /* l is one past the last entry with start <= a */
+ l--;
+
+ if(a > ix->amap[l].stop){
+ /* dump the whole arena map to help diagnose the bad address */
+for(i=0; i<ix->narenas; i++)
+ print("arena %d: %llux - %llux\n", i, ix->amap[i].start, ix->amap[i].stop);
+print("want arena %d for %llux\n", l, a);
+ seterr(ECrash, "unmapped address passed to amapitoa");
+ return nil;
+ }
+
+ if(ix->arenas[l] == nil){
+ seterr(ECrash, "unmapped arena selected in amapitoa");
+ return nil;
+ }
+ *aa = a - ix->amap[l].start;
+ return ix->arenas[l];
+}
+
+/*
+ * convert an index address to the bounds of the containing arena group.
+ *
+ * maps a to its arena with amapitoa, asks arenatog for the group
+ * around the arena-relative address, then shifts the returned bounds
+ * by the arena's base (a - aa) so *gstart and *glimit are index
+ * addresses.  returns the arena, or nil if either step fails.
+ */
+Arena*
+amapitoag(Index *ix, u64int a, u64int *gstart, u64int *glimit, int *g)
+{
+ u64int aa;
+ Arena *arena;
+
+ arena = amapitoa(ix, a, &aa);
+ if(arena == nil)
+ return nil;
+ if(arenatog(arena, aa, gstart, glimit, g) < 0)
+ return nil;
+ *gstart += a - aa;
+ *glimit += a - aa;
+ return arena;
+}
+
+/*
+ * compare two index addresses field by field.
+ * returns 0 when type, size, blocks and addr all match, 1 otherwise;
+ * the result expresses equality only, not an ordering.
+ */
+int
+iaddrcmp(IAddr *ia1, IAddr *ia2)
+{
+	if(ia1->type == ia2->type
+	&& ia1->size == ia2->size
+	&& ia1->blocks == ia2->blocks
+	&& ia1->addr == ia2->addr)
+		return 0;
+	return 1;
+}
+
+/*
+ * lookup the score in the partition
+ *
+ * nothing needs to be explicitly locked:
+ * only static parts of ix are used, and
+ * the bucket is locked by the DBlock lock.
+ *
+ * the bloom filter is consulted first; if it rules the score out the
+ * disk is not touched.  otherwise the bucket is loaded, checked with
+ * okibucket and searched with bucklook.  on a match the entry is
+ * unpacked into ie and 0 is returned; every other outcome returns -1.
+ *
+ * NOTE(review): the bloom filter is taken from the global mainindex,
+ * not from ix, while the bucket is loaded through ix -- confirm that
+ * callers always pass mainindex.
+ */
+int
+loadientry(Index *ix, u8int *score, int type, IEntry *ie)
+{
+ ISect *is;
+ DBlock *b;
+ IBucket ib;
+ u32int buck;
+ int h, ok;
+
+ ok = -1;
+
+ trace(TraceLump, "loadientry enter");
+
+ /*
+ qlock(&stats.lock);
+ stats.indexreads++;
+ qunlock(&stats.lock);
+ */
+
+ if(!inbloomfilter(mainindex->bloom, score)){
+ trace(TraceLump, "loadientry bloomhit");
+ return -1;
+ }
+
+ trace(TraceLump, "loadientry loadibucket");
+ b = loadibucket(ix, score, &is, &buck, &ib);
+ trace(TraceLump, "loadientry loadedibucket");
+ if(b == nil)
+ return -1;
+
+ if(okibucket(&ib, is) < 0){
+ trace(TraceLump, "loadientry badbucket");
+ goto out;
+ }
+
+ /* bucklook sets the low bit of its result when the entry was found */
+ h = bucklook(score, type, ib.data, ib.n);
+ if(h & 1){
+ h ^= 1;
+ trace(TraceLump, "loadientry found");
+ unpackientry(ie, &ib.data[h]);
+ ok = 0;
+ goto out;
+ }
+ trace(TraceLump, "loadientry notfound");
+ /* the bloom filter let the score through but the bucket does not hold it */
+ addstat(StatBloomFalseMiss, 1);
+out:
+ putdblock(b);
+ trace(TraceLump, "loadientry exit");
+ return ok;
+}
+
+/*
+ * sanity check a bucket read from disk: its entry count must not
+ * exceed the most the section's block size can hold (is->buckmax).
+ * returns 0 if the bucket is plausible, -1 with the error set otherwise.
+ */
+int
+okibucket(IBucket *ib, ISect *is)
+{
+	if(ib->n > is->buckmax){
+		seterr(EICorrupt, "corrupted disk index bucket: n=%ud max=%ud, range=[%lud,%lud)",
+			ib->n, is->buckmax, is->start, is->stop);
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ * look for score within data;
+ * return 1 | byte index of matching index,
+ * or 0 | index of least element > score
+ *
+ * data holds n packed entries of IEntrySize bytes each, searched by
+ * binary search on the VtScoreSize score bytes and then on the type
+ * byte at IEntryTypeOff.  otype is converted with vttodisktype before
+ * comparing; otype == -1 matches an entry of any type.
+ * NOTE(review): using the low bit of the byte offset as the "found"
+ * flag presumably relies on IEntrySize being even -- confirm.
+ */
+int
+bucklook(u8int *score, int otype, u8int *data, int n)
+{
+ int i, r, l, m, h, c, cc, type;
+
+ if(otype == -1)
+ type = -1;
+ else
+ type = vttodisktype(otype);
+ l = 0;
+ r = n - 1;
+ while(l <= r){
+ m = (r + l) >> 1;
+ /* h: byte offset of entry m */
+ h = m * IEntrySize;
+ for(i = 0; i < VtScoreSize; i++){
+ c = score[i];
+ cc = data[h + i];
+ if(c != cc){
+ if(c > cc)
+ l = m + 1;
+ else
+ r = m - 1;
+ goto cont;
+ }
+ }
+ /* scores are equal; entries with the same score are ordered by type */
+ cc = data[h + IEntryTypeOff];
+ if(type != cc && type != -1){
+ if(type > cc)
+ l = m + 1;
+ else
+ r = m - 1;
+ goto cont;
+ }
+ return h | 1;
+ cont:;
+ }
+
+ /* not found: byte offset at which the entry would be inserted */
+ return l * IEntrySize;
+}
+
+/*
+ * compare two packed IEntries; consistent with bucklook.
+ * orders by the VtScoreSize score bytes first and, for equal scores,
+ * by the type byte at IEntryTypeOff.  returns -1, 0 or 1.
+ */
+int
+ientrycmp(const void *vie1, const void *vie2)
+{
+	u8int *a, *b;
+	int i;
+
+	a = (u8int*)vie1;
+	b = (u8int*)vie2;
+
+	/* the first differing score byte decides */
+	for(i = 0; i < VtScoreSize; i++)
+		if(a[i] != b[i])
+			return a[i] < b[i] ? -1 : 1;
+
+	/* same score: fall back to the type byte */
+	if(a[IEntryTypeOff] != b[IEntryTypeOff])
+		return a[IEntryTypeOff] < b[IEntryTypeOff] ? -1 : 1;
+	return 0;
+}
+
+/*
+ * find the number of the index section holding bucket #buck
+ *
+ * binary search for the last section whose start is <= buck;
+ * section 0 is the fallback, so the search begins at 1.  whether
+ * buck is below that section's stop is left for the caller to check.
+ */
+int
+indexsect0(Index *ix, u32int buck)
+{
+ int r, l, m;
+
+ l = 1;
+ r = ix->nsects - 1;
+ while(l <= r){
+ m = (r + l) >> 1;
+ if(ix->sects[m]->start <= buck)
+ l = m + 1;
+ else
+ r = m - 1;
+ }
+ return l - 1;
+}
+
+/*
+ * load the index block at bucket #buck
+ *
+ * finds the owning section, checks buck against its [start, stop)
+ * range and fetches the block with getdblock using mode.
+ * each of the optional outputs is filled in only if non-nil:
+ * *pis the section, *pbuck the bucket number relative to that section,
+ * *ib the bucket unpacked from the block.
+ * returns the DBlock (loadientry releases it with putdblock),
+ * or nil on failure.
+ */
+static DBlock*
+loadibucket0(Index *ix, u32int buck, ISect **pis, u32int *pbuck, IBucket *ib, int mode)
+{
+ ISect *is;
+ DBlock *b;
+
+ is = ix->sects[indexsect0(ix, buck)];
+ if(buck < is->start || is->stop <= buck){
+ seterr(EAdmin, "index lookup out of range: %ud not found in index\n", buck);
+ return nil;
+ }
+
+ /* from here on buck is relative to the section */
+ buck -= is->start;
+ if((b = getdblock(is->part, is->blockbase + ((u64int)buck << is->blocklog), mode)) == nil)
+ return nil;
+
+ if(pis)
+ *pis = is;
+ if(pbuck)
+ *pbuck = buck;
+ if(ib)
+ unpackibucket(ib, b->data, is->bucketmagic);
+ return b;
+}
+
+/*
+ * find the number of the index section holding score:
+ * the bucket is hashbits(score, 32) divided by ix->div.
+ */
+int
+indexsect1(Index *ix, u8int *score)
+{
+ return indexsect0(ix, hashbits(score, 32) / ix->div);
+}
+
+/*
+ * load the index block responsible for score.
+ * the bucket is hashbits(score, 32) divided by ix->div and the
+ * block is requested with mode OREAD.
+ */
+static DBlock*
+loadibucket1(Index *ix, u8int *score, ISect **pis, u32int *pbuck, IBucket *ib)
+{
+ return loadibucket0(ix, hashbits(score, 32)/ix->div, pis, pbuck, ib, OREAD);
+}
+
+/*
+ * public entry point: number of the index section holding score.
+ * simply forwards to indexsect1.
+ */
+int
+indexsect(Index *ix, u8int *score)
+{
+ return indexsect1(ix, score);
+}
+
+/*
+ * public entry point: load the index block responsible for score.
+ * simply forwards to loadibucket1.
+ */
+DBlock*
+loadibucket(Index *ix, u8int *score, ISect **pis, u32int *pbuck, IBucket *ib)
+{
+ return loadibucket1(ix, score, pis, pbuck, ib);
+}
+
+
diff --git a/sys/src/cmd/venti/srv/lump.c b/sys/src/cmd/venti/srv/lump.c
new file mode 100755
index 000000000..9b244948b
--- /dev/null
+++ b/sys/src/cmd/venti/srv/lump.c
@@ -0,0 +1,240 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+int syncwrites = 0; /* writeqlump: flush the disk and index caches after storing each block */
+int queuewrites = 0; /* writelump: hand new blocks to queuewrite instead of writing them directly */
+int writestodevnull = 0; /* writelump: 1 discards every write; 2 discards writes that miss the lump cache */
+int verifywrites = 0; /* writeqlump: read back and compare a block whose score is already indexed */
+
+static Packet *readilump(Lump *u, IAddr *ia, u8int *score);
+
+/*
+ * read the block with the given score and type; the caller accepts
+ * at most size bytes.
+ *
+ * the zero score yields an empty packet.  otherwise the lump cache is
+ * tried first; on a miss the score is looked up in the index and the
+ * clump is read from its arena with readilump, which also enters it
+ * in the cache.  if cached is non-nil it is set to 1 on a cache hit
+ * and 0 otherwise (it is left untouched for the zero score).
+ * returns the packet, or nil with the error set.
+ *
+ * Some of this logic is duplicated in hdisk.c
+ */
+Packet*
+readlump(u8int *score, int type, u32int size, int *cached)
+{
+ Lump *u;
+ Packet *p;
+ IAddr ia;
+ u32int n;
+
+ trace(TraceLump, "readlump enter");
+/*
+ qlock(&stats.lock);
+ stats.lumpreads++;
+ qunlock(&stats.lock);
+*/
+ if(scorecmp(score, zeroscore) == 0)
+ return packetalloc();
+ /* every return below must release u with putlump */
+ u = lookuplump(score, type);
+ if(u->data != nil){
+ trace(TraceLump, "readlump lookuplump hit");
+ if(cached)
+ *cached = 1;
+ n = packetsize(u->data);
+ if(n > size){
+ seterr(EOk, "read too small: asked for %d need at least %d", size, n);
+ putlump(u);
+
+ return nil;
+ }
+ /* hand out a duplicate; the cache keeps u->data */
+ p = packetdup(u->data, 0, n);
+ putlump(u);
+ return p;
+ }
+
+ if(cached)
+ *cached = 0;
+
+ if(lookupscore(score, type, &ia) < 0){
+ /* ZZZ place to check for someone trying to guess scores */
+ seterr(EOk, "no block with score %V/%d exists", score, type);
+
+ putlump(u);
+ return nil;
+ }
+ if(ia.size > size){
+ seterr(EOk, "read too small 1: asked for %d need at least %d", size, ia.size);
+
+ putlump(u);
+ return nil;
+ }
+
+ trace(TraceLump, "readlump readilump");
+ p = readilump(u, &ia, score);
+ putlump(u);
+
+ trace(TraceLump, "readlump exit");
+ return p;
+}
+
+/*
+ * save away a lump, and return its score.
+ * doesn't store duplicates, but checks that the data is really the same.
+ *
+ * score is computed here as the SHA1 of p and written to the caller's
+ * buffer.  empty packets, and all packets when writestodevnull is 1,
+ * are freed and reported as success without being stored.  a block
+ * already in the lump cache is compared against p; any difference is
+ * reported as an error.  otherwise the block is queued (queuewrites)
+ * or written immediately with writeqlump.
+ * returns 0 on success, -1 on error.  p is freed on every path
+ * visible here except when it is handed to queuewrite or writeqlump.
+ * NOTE(review): on the queuewrite path u is not released here, so
+ * queuewrite presumably takes over both u and p -- confirm.
+ */
+int
+writelump(Packet *p, u8int *score, int type, u32int creator, uint ms)
+{
+ Lump *u;
+ int ok;
+
+/*
+ qlock(&stats.lock);
+ stats.lumpwrites++;
+ qunlock(&stats.lock);
+*/
+
+ packetsha1(p, score);
+ if(packetsize(p) == 0 || writestodevnull==1){
+ packetfree(p);
+ return 0;
+ }
+
+ u = lookuplump(score, type);
+ if(u->data != nil){
+ ok = 0;
+ if(packetcmp(p, u->data) != 0){
+ uchar nscore[VtScoreSize];
+
+ /* work out which of the three possible inconsistencies this is */
+ packetsha1(u->data, nscore);
+ if(scorecmp(u->score, score) != 0)
+ seterr(EStrange, "lookuplump returned bad score %V not %V", u->score, score);
+ else if(scorecmp(u->score, nscore) != 0)
+ seterr(EStrange, "lookuplump returned bad data %V not %V", nscore, u->score);
+ else
+ seterr(EStrange, "score collision %V", score);
+ ok = -1;
+ }
+ packetfree(p);
+ putlump(u);
+ return ok;
+ }
+
+ /*
+ * NOTE(review): this return does not putlump(u), unlike the cache-hit
+ * path above, so u appears to stay locked and referenced -- confirm.
+ */
+ if(writestodevnull==2){
+ packetfree(p);
+ return 0;
+ }
+
+ if(queuewrites)
+ return queuewrite(u, p, creator, ms);
+
+ ok = writeqlump(u, p, creator, ms);
+
+ putlump(u);
+ return ok;
+}
+
+/*
+ * store packet p as the data of lump u.
+ *
+ * if the score is already in the index the block is assumed present
+ * (verifywrites == 0) or read back and compared with p
+ * (verifywrites != 0); when that read fails the block is stored again.
+ * otherwise p is flattened and stored with storeclump; on success p
+ * is handed to the lump cache with insertlump, on failure it is freed.
+ * with syncwrites set the disk and index caches are flushed afterwards.
+ * ms is the time the request started and is used only for statistics.
+ * returns 0 on success, -1 on error.  u is not released here.
+ */
+int
+writeqlump(Lump *u, Packet *p, int creator, uint ms)
+{
+ ZBlock *flat;
+ Packet *old;
+ IAddr ia;
+ int ok;
+
+ if(lookupscore(u->score, u->type, &ia) == 0){
+ if(verifywrites == 0){
+ /* assume the data is here! */
+ packetfree(p);
+ ms = msec() - ms;
+ addstat2(StatRpcWriteOld, 1, StatRpcWriteOldTime, ms);
+ return 0;
+ }
+
+ /*
+ * if the read fails,
+ * assume it was corrupted data and store the block again
+ */
+ old = readilump(u, &ia, u->score);
+ if(old != nil){
+ ok = 0;
+ if(packetcmp(p, old) != 0){
+ uchar nscore[VtScoreSize];
+
+ packetsha1(old, nscore);
+ if(scorecmp(u->score, nscore) != 0)
+ seterr(EStrange, "readilump returned bad data %V not %V", nscore, u->score);
+ else
+ seterr(EStrange, "score collision %V", u->score);
+ ok = -1;
+ }
+ packetfree(p);
+ packetfree(old);
+
+ ms = msec() - ms;
+ addstat2(StatRpcWriteOld, 1, StatRpcWriteOldTime, ms);
+ return ok;
+ }
+ logerr(EAdmin, "writelump: read %V failed, rewriting: %r\n", u->score);
+ }
+
+ flat = packet2zblock(p, packetsize(p));
+ ok = storeclump(mainindex, flat, u->score, u->type, creator, &ia);
+ freezblock(flat);
+ if(ok == 0)
+ insertlump(u, p);
+ else
+ packetfree(p);
+
+ if(syncwrites){
+ flushdcache();
+ flushicache();
+ flushdcache();
+ }
+
+ ms = msec() - ms;
+ addstat2(StatRpcWriteNew, 1, StatRpcWriteNewTime, ms);
+ return ok;
+}
+
+/*
+ * read the clump at index address ia from its arena, check it against
+ * the index entry (uncompressed size, type) and against score, and
+ * return it as a packet.  a duplicate of the packet is entered in the
+ * lump cache under u, so the returned packet belongs to the caller.
+ * returns nil on failure.
+ */
+static Packet*
+readilump(Lump *u, IAddr *ia, u8int *score)
+{
+ Arena *arena;
+ ZBlock *zb;
+ Packet *p, *pp;
+ Clump cl;
+ u64int aa;
+ u8int sc[VtScoreSize];
+
+ trace(TraceLump, "readilump enter");
+ arena = amapitoa(mainindex, ia->addr, &aa);
+ if(arena == nil){
+ trace(TraceLump, "readilump amapitoa failed");
+ return nil;
+ }
+
+ trace(TraceLump, "readilump loadclump");
+ zb = loadclump(arena, aa, ia->blocks, &cl, sc, paranoid);
+ if(zb == nil){
+ trace(TraceLump, "readilump loadclump failed");
+ return nil;
+ }
+
+ if(ia->size != cl.info.uncsize){
+ seterr(EInconsist, "index and clump size mismatch");
+ freezblock(zb);
+ return nil;
+ }
+ if(ia->type != cl.info.type){
+ seterr(EInconsist, "index and clump type mismatch");
+ freezblock(zb);
+ return nil;
+ }
+ if(scorecmp(score, sc) != 0){
+ seterr(ECrash, "score mismatch");
+ freezblock(zb);
+ return nil;
+ }
+
+ trace(TraceLump, "readilump success");
+ p = zblock2packet(zb, cl.info.uncsize);
+ freezblock(zb);
+ /* the cache keeps its own copy (pp); p goes back to the caller */
+ pp = packetdup(p, 0, packetsize(p));
+ trace(TraceLump, "readilump insertlump");
+ insertlump(u, pp);
+ trace(TraceLump, "readilump exit");
+ return p;
+}
diff --git a/sys/src/cmd/venti/srv/lumpcache.c b/sys/src/cmd/venti/srv/lumpcache.c
new file mode 100755
index 000000000..d9a6b954e
--- /dev/null
+++ b/sys/src/cmd/venti/srv/lumpcache.c
@@ -0,0 +1,429 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+/* #define CHECK(x) x */
+#define CHECK(x)
+
+typedef struct LumpCache LumpCache;
+
+/* geometry of the hash table used to find lumps by score */
+enum
+{
+ HashLog = 9, /* number of score bits taken by hashbits for the hash */
+ HashSize = 1<<HashLog, /* number of hash chains in lumpcache.heads */
+ HashMask = HashSize - 1,
+};
+
+/*
+ * state of the lump cache.  the functions in this file take lock
+ * before touching the other fields; full is tied to lock and is
+ * slept on when nothing can be evicted.
+ */
+struct LumpCache
+{
+ QLock lock;
+ Rendez full;
+ Lump *free; /* list of available lumps */
+ u32int allowed; /* total allowable space for packets */
+ u32int avail; /* remaining space for packets */
+ u32int now; /* ticks for usage timestamps */
+ Lump **heads; /* hash table for finding address */
+ int nheap; /* number of available victims */
+ Lump **heap; /* heap for locating victims */
+ int nblocks; /* number of blocks allocated */
+ Lump *blocks; /* array of block descriptors */
+};
+
+static LumpCache lumpcache;
+
+static void delheap(Lump *db);
+static int downheap(int i, Lump *b);
+static void fixheap(int i, Lump *b);
+static int upheap(int i, Lump *b);
+static Lump *bumplump(void);
+
+/*
+ * set up the lump cache with room for nblocks lumps and a budget of
+ * size for their packets (compared against packetasize in insertlump).
+ * allocates the hash table, the victim heap and the lump descriptors,
+ * and chains every descriptor onto the free list.
+ */
+void
+initlumpcache(u32int size, u32int nblocks)
+{
+ Lump *last, *b;
+ int i;
+
+ lumpcache.full.l = &lumpcache.lock;
+ lumpcache.nblocks = nblocks;
+ lumpcache.allowed = size;
+ lumpcache.avail = size;
+ lumpcache.heads = MKNZ(Lump*, HashSize);
+ lumpcache.heap = MKNZ(Lump*, nblocks);
+ lumpcache.blocks = MKNZ(Lump, nblocks);
+ setstat(StatLcacheSize, lumpcache.nblocks);
+
+ last = nil;
+ for(i = 0; i < nblocks; i++){
+ b = &lumpcache.blocks[i];
+ b->type = TWID8; /* no valid type: the lump holds nothing yet */
+ b->heap = TWID32; /* not in the victim heap */
+ b->next = last;
+ last = b;
+ }
+ lumpcache.free = last;
+ lumpcache.nheap = 0;
+}
+
+/*
+ * find or create the cache entry for (score, type).
+ *
+ * on a hit the existing lump is returned.  on a miss a lump is taken
+ * from the free list, evicting with bumplump when the list is empty
+ * and sleeping on lumpcache.full when nothing can be evicted; it is
+ * then entered in the hash table with no data.
+ * the returned lump has its reference count raised and b->lock held;
+ * the caller releases both with putlump.  b->data is nil when the
+ * block's contents are not cached.
+ */
+Lump*
+lookuplump(u8int *score, int type)
+{
+ uint ms;
+ Lump *b;
+ u32int h;
+
+ ms = 0;
+ trace(TraceLump, "lookuplump enter");
+
+ h = hashbits(score, HashLog);
+
+ /*
+ * look for the block in the cache
+ */
+ qlock(&lumpcache.lock);
+ CHECK(checklumpcache());
+again:
+ for(b = lumpcache.heads[h]; b != nil; b = b->next){
+ if(scorecmp(score, b->score)==0 && type == b->type){
+ addstat(StatLcacheHit, 1);
+ trace(TraceLump, "lookuplump hit");
+ goto found;
+ }
+ }
+
+ trace(TraceLump, "lookuplump miss");
+
+ /*
+ * missed: locate the block with the oldest second to last use.
+ * remove it from the heap, and fix up the heap.
+ */
+ while(lumpcache.free == nil){
+ trace(TraceLump, "lookuplump bump");
+ CHECK(checklumpcache());
+ if(bumplump() == nil){
+ CHECK(checklumpcache());
+ logerr(EAdmin, "all lump cache blocks in use");
+ addstat(StatLcacheStall, 1);
+ CHECK(checklumpcache());
+ rsleep(&lumpcache.full);
+ CHECK(checklumpcache());
+ addstat(StatLcacheStall, -1);
+ /* after sleeping, search the hash chain again from the top */
+ goto again;
+ }
+ CHECK(checklumpcache());
+ }
+
+ /* start timer on cache miss to avoid system call on cache hit */
+ ms = msec();
+
+ addstat(StatLcacheMiss, 1);
+ b = lumpcache.free;
+ lumpcache.free = b->next;
+
+ /*
+ * the new block has no last use, so assume it happens sometime in the middle
+ZZZ this is not reasonable
+ */
+ b->used = (b->used2 + lumpcache.now) / 2;
+
+ /*
+ * rechain the block on the correct hash chain
+ */
+ b->next = lumpcache.heads[h];
+ lumpcache.heads[h] = b;
+ if(b->next != nil)
+ b->next->prev = b;
+ b->prev = nil;
+
+ scorecp(b->score, score);
+ b->type = type;
+ b->size = 0;
+ b->data = nil;
+
+found:
+ b->ref++;
+ /* shift the usage history: used2 is the second to last use */
+ b->used2 = b->used;
+ b->used = lumpcache.now++;
+ if(b->heap != TWID32)
+ fixheap(b->heap, b);
+ CHECK(checklumpcache());
+ qunlock(&lumpcache.lock);
+
+
+ /* take the lump's own lock only after dropping the cache lock */
+ addstat(StatLumpStall, 1);
+ qlock(&b->lock);
+ addstat(StatLumpStall, -1);
+
+ trace(TraceLump, "lookuplump exit");
+ addstat2(StatLcacheRead, 1, StatLcacheReadTime, ms ? msec()-ms : 0);
+ return b;
+}
+
+/*
+ * attach packet p to the cached lump b and charge its allocated size
+ * (packetasize) against the cache's space budget, evicting lumps with
+ * bumplump until it fits.  if nothing can be evicted, sleep on
+ * lumpcache.full until putlump or bumplump signals a change.
+ *
+ * the cache takes over p: bumplump frees it when b is evicted.
+ * NOTE(review): a packet larger than lumpcache.allowed can never fit,
+ * so this would loop or sleep forever -- confirm callers bound the size.
+ */
+void
+insertlump(Lump *b, Packet *p)
+{
+	u32int size;
+
+	trace(TraceLump, "insertlump enter");
+	qlock(&lumpcache.lock);
+	CHECK(checklumpcache());
+
+	/* count the write once, even if we have to sleep and retry below */
+	addstat(StatLcacheWrite, 1);
+
+	/* nothing below touches p, so measure it once */
+	size = packetasize(p);
+
+	/*
+	 * make room: evict the lump with the oldest second to last use
+	 * until the packet fits in the space budget.
+	 */
+	while(lumpcache.avail < size){
+		trace(TraceLump, "insertlump bump");
+		CHECK(checklumpcache());
+		if(bumplump() == nil){
+			logerr(EAdmin, "all lump cache blocks in use");
+			addstat(StatLcacheStall, 1);
+			CHECK(checklumpcache());
+			rsleep(&lumpcache.full);
+			CHECK(checklumpcache());
+			addstat(StatLcacheStall, -1);
+			/* re-test the budget after waking */
+			continue;
+		}
+		CHECK(checklumpcache());
+	}
+	b->data = p;
+	b->size = size;
+	lumpcache.avail -= size;
+	CHECK(checklumpcache());
+	qunlock(&lumpcache.lock);
+	trace(TraceLump, "insertlump exit");
+}
+
+/*
+ * release a lump obtained from lookuplump: drop b->lock and one
+ * reference.  when the last reference goes away the lump is added to
+ * the victim heap (unless it is already there) and everyone sleeping
+ * on lumpcache.full is woken.  nil is ignored.
+ */
+void
+putlump(Lump *b)
+{
+ if(b == nil)
+ return;
+
+ trace(TraceLump, "putlump");
+ qunlock(&b->lock);
+ qlock(&lumpcache.lock);
+ CHECK(checklumpcache());
+ if(--b->ref == 0){
+ if(b->heap == TWID32)
+ upheap(lumpcache.nheap++, b);
+ trace(TraceLump, "putlump wakeup");
+ rwakeupall(&lumpcache.full);
+ }
+ CHECK(checklumpcache());
+ qunlock(&lumpcache.lock);
+}
+
+/*
+ * remove some lump from use and update the free list and counters
+ *
+ * pops lumps off the victim heap until an unreferenced one turns up,
+ * unchains it from its hash chain, frees its packet (returning the
+ * space to lumpcache.avail) and puts it on the free list.
+ * returns the evicted lump, or nil if the heap ran empty first.
+ * the callers in this file hold lumpcache.lock around the call.
+ */
+static Lump*
+bumplump(void)
+{
+ Lump *b;
+ u32int h;
+
+ /*
+ * remove blocks until we find one that is unused
+ * referenced blocks are left in the heap even though
+ * they can't be scavenged; this is simply a speed optimization
+ */
+ CHECK(checklumpcache());
+ for(;;){
+ if(lumpcache.nheap == 0){
+ trace(TraceLump, "bumplump emptyheap");
+ return nil;
+ }
+ b = lumpcache.heap[0];
+ delheap(b);
+ if(!b->ref){
+ trace(TraceLump, "bumplump wakeup");
+ rwakeupall(&lumpcache.full);
+ break;
+ }
+ }
+
+ /*
+ * unchain the block
+ */
+ trace(TraceLump, "bumplump unchain");
+ if(b->prev == nil){
+ /* no predecessor: b must be the head of its hash chain */
+ h = hashbits(b->score, HashLog);
+ if(lumpcache.heads[h] != b)
+ sysfatal("bad hash chains in lump cache");
+ lumpcache.heads[h] = b->next;
+ }else
+ b->prev->next = b->next;
+ if(b->next != nil)
+ b->next->prev = b->prev;
+
+ if(b->data != nil){
+ packetfree(b->data);
+ b->data = nil;
+ lumpcache.avail += b->size;
+ b->size = 0;
+ }
+ b->type = TWID8;
+
+ b->next = lumpcache.free;
+ lumpcache.free = b;
+
+ CHECK(checklumpcache());
+ trace(TraceLump, "bumplump exit");
+ return b;
+}
+
+/*
+ * Evict lumps, holding the cache lock, until bumplump()
+ * reports that nothing more can be removed.
+ */
+void
+emptylumpcache(void)
+{
+	qlock(&lumpcache.lock);
+	for(;;){
+		if(bumplump() == nil)
+			break;
+	}
+	qunlock(&lumpcache.lock);
+}
+
+/*
+ * delete an arbitrary block from the heap: the last heap element
+ * takes over the vacated slot and is sifted to its proper place;
+ * the deleted block is marked as being outside the heap.
+ */
+static void
+delheap(Lump *db)
+{
+	Lump *last;
+
+	lumpcache.nheap--;
+	last = lumpcache.heap[lumpcache.nheap];
+	fixheap(db->heap, last);
+	db->heap = TWID32;
+}
+
+/*
+ * push an element up or down to its correct new location:
+ * try sifting up from slot i first; only if that left the
+ * element where it was is a downward sift attempted.
+ */
+static void
+fixheap(int i, Lump *b)
+{
+	int landed;
+
+	landed = upheap(i, b);
+	if(landed == i)
+		downheap(i, b);
+}
+
+/*
+ * Sift b up from slot i: parents whose used2 (measured relative to
+ * lumpcache.now) is larger than b's are moved down one level.
+ * Stores b in its final slot and returns that slot's index.
+ */
+static int
+upheap(int i, Lump *b)
+{
+	Lump *parent;
+	u32int now;
+	int p;
+
+	now = lumpcache.now;
+	while(i != 0){
+		p = (i - 1) >> 1;
+		parent = lumpcache.heap[p];
+		if(b->used2 - now >= parent->used2 - now)
+			break;
+		/* parent is younger than b: pull it down into slot i */
+		lumpcache.heap[i] = parent;
+		parent->heap = i;
+		i = p;
+	}
+
+	lumpcache.heap[i] = b;
+	b->heap = i;
+	return i;
+}
+
+/*
+ * Sift b down from slot i: at each level the child with the smaller
+ * used2 (measured relative to lumpcache.now) is picked, and it is
+ * moved up while it is smaller than b.
+ * Stores b in its final slot and returns that slot's index.
+ */
+static int
+downheap(int i, Lump *b)
+{
+	Lump *child;
+	u32int now;
+	int k;
+
+	now = lumpcache.now;
+	for(;;){
+		k = (i << 1) + 1;
+		if(k >= lumpcache.nheap)
+			break;
+		/* prefer the right child when it is the smaller of the two */
+		if(k + 1 < lumpcache.nheap && lumpcache.heap[k]->used2 - now > lumpcache.heap[k + 1]->used2 - now)
+			k++;
+		child = lumpcache.heap[k];
+		if(b->used2 - now <= child->used2 - now)
+			break;
+		lumpcache.heap[i] = child;
+		child->heap = i;
+		i = k;
+	}
+
+	lumpcache.heap[i] = b;
+	b->heap = i;
+	return i;
+}
+
+/*
+ * Consistency check: walk the hash chain selected by bb's score,
+ * verifying every prev link on the way, and die with sysfatal
+ * unless bb itself is found on that chain.
+ */
+static void
+findblock(Lump *bb)
+{
+	Lump *cur, *before;
+	int h;
+
+	h = hashbits(bb->score, HashLog);
+	before = nil;
+	cur = lumpcache.heads[h];
+	while(cur != nil){
+		if(cur->prev != before)
+			sysfatal("bad prev link");
+		if(cur == bb)
+			return;
+		before = cur;
+		cur = cur->next;
+	}
+	sysfatal("block score=%V type=%#x missing from hash table", bb->score, bb->type);
+}
+
+/*
+ * Verify the lump cache's internal invariants and abort the program
+ * on the first violation found.  Checked, in order: heap back-pointers
+ * and ordering, per-block size/hash-chain/heap consistency, the space
+ * accounting, the free list, and that every block is accounted for.
+ */
+void
+checklumpcache(void)
+{
+ Lump *b;
+ u32int size, now, nfree;
+ int i, k, refed;
+
+ /* heap: each entry knows its slot and is ordered against parent and children */
+ now = lumpcache.now;
+ for(i = 0; i < lumpcache.nheap; i++){
+ if(lumpcache.heap[i]->heap != i)
+ sysfatal("lc: mis-heaped at %d: %d", i, lumpcache.heap[i]->heap);
+ if(i > 0 && lumpcache.heap[(i - 1) >> 1]->used2 - now > lumpcache.heap[i]->used2 - now)
+ sysfatal("lc: bad heap ordering");
+ k = (i << 1) + 1;
+ if(k < lumpcache.nheap && lumpcache.heap[i]->used2 - now > lumpcache.heap[k]->used2 - now)
+ sysfatal("lc: bad heap ordering");
+ k++;
+ if(k < lumpcache.nheap && lumpcache.heap[i]->used2 - now > lumpcache.heap[k]->used2 - now)
+ sysfatal("lc: bad heap ordering");
+ }
+
+ /*
+ * blocks: count referenced blocks that are outside the heap, and
+ * total the size of every block that holds a lump (type != TWID8)
+ */
+ refed = 0;
+ size = 0;
+ for(i = 0; i < lumpcache.nblocks; i++){
+ b = &lumpcache.blocks[i];
+ if(b->data == nil && b->size != 0)
+ sysfatal("bad size: %d data=%p", b->size, b->data);
+ if(b->ref && b->heap == TWID32)
+ refed++;
+ if(b->type != TWID8){
+ findblock(b);
+ size += b->size;
+ }
+ if(b->heap != TWID32
+ && lumpcache.heap[b->heap] != b)
+ sysfatal("lc: spurious heap value");
+ }
+ /* space accounting; the nil store below deliberately faults after the message is printed */
+ if(lumpcache.avail != lumpcache.allowed - size){
+ fprint(2, "mismatched available=%d and allowed=%d - used=%d space", lumpcache.avail, lumpcache.allowed, size);
+ *(int*)0=0;
+ }
+
+ /* free list: entries must hold no lump and must not be in the heap */
+ nfree = 0;
+ for(b = lumpcache.free; b != nil; b = b->next){
+ if(b->type != TWID8 || b->heap != TWID32)
+ sysfatal("lc: bad free list");
+ nfree++;
+ }
+
+ /* every block is in the heap, on the free list, or referenced outside the heap */
+ if(lumpcache.nheap + nfree + refed != lumpcache.nblocks)
+ sysfatal("lc: missing blocks: %d %d %d %d", lumpcache.nheap, refed, nfree, lumpcache.nblocks);
+}
+
diff --git a/sys/src/cmd/venti/srv/lumpqueue.c b/sys/src/cmd/venti/srv/lumpqueue.c
new file mode 100755
index 000000000..869eaeae0
--- /dev/null
+++ b/sys/src/cmd/venti/srv/lumpqueue.c
@@ -0,0 +1,171 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+typedef struct LumpQueue LumpQueue;
+typedef struct WLump WLump;
+
+enum
+{
+ MaxLumpQ = 1 << 3 /* max. lumps on a single write queue, must be pow 2 */
+};
+
+/*
+ * One pending write: the arguments of a queuewrite() call,
+ * saved until queueproc hands them to writeqlump().
+ */
+struct WLump
+{
+ Lump *u; /* lump to write; released with putlump() once written */
+ Packet *p; /* packet data passed along to writeqlump() */
+ int creator; /* passed through to writeqlump() */
+ int gen; /* value of the global gen when the entry was queued */
+ uint ms; /* passed through to writeqlump() */
+};
+
+/*
+ * Ring buffer of pending writes, one per queueproc.
+ * The queue is empty when w == r and treated as full when advancing w
+ * would make it equal r, so at most MaxLumpQ-1 entries are held.
+ */
+struct LumpQueue
+{
+ QLock lock; /* protects every field below; all three Rendez use it */
+ Rendez flush; /* flushqueue() sleeps here; woken whenever an entry is dequeued */
+ Rendez full; /* queuewrite() sleeps here while the ring is full */
+ Rendez empty; /* queueproc() sleeps here while the ring is empty */
+ WLump q[MaxLumpQ];
+ int w; /* index of the next slot to fill */
+ int r; /* index of the next slot to drain */
+};
+
+static LumpQueue *lumpqs;
+static int nqs;
+
+static QLock glk;
+static int gen;
+
+static void queueproc(void *vq);
+
+/*
+ * Allocate nq write queues and start one queueproc for each.
+ * Returns 0 on success, or -1 (after seterr) as soon as a
+ * proc cannot be started.
+ */
+int
+initlumpqueues(int nq)
+{
+	LumpQueue *q;
+	int i;
+
+	nqs = nq;
+	lumpqs = MKNZ(LumpQueue, nq);
+
+	for(i = 0; i < nq; i++){
+		q = lumpqs + i;
+
+		/* all three rendezvous points share the queue's lock */
+		q->flush.l = &q->lock;
+		q->empty.l = &q->lock;
+		q->full.l = &q->lock;
+
+		if(vtproc(queueproc, q) >= 0)
+			continue;
+		seterr(EOk, "can't start write queue slave: %r");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * queue a lump & its packet data for writing.
+ * the queue is chosen by the index section of the lump's score;
+ * the caller sleeps while that queue is full.
+ * returns 0 once queued, -1 if the index section is out of range.
+ */
+int
+queuewrite(Lump *u, Packet *p, int creator, uint ms)
+{
+	LumpQueue *q;
+	WLump *slot;
+	int i;
+
+	trace(TraceProc, "queuewrite");
+	i = indexsect(mainindex, u->score);
+	if(i < 0 || i >= nqs){
+		seterr(EBug, "internal error: illegal index section in queuewrite");
+		return -1;
+	}
+	q = lumpqs + i;
+
+	qlock(&q->lock);
+	for(;;){
+		/* full when advancing w would run into r */
+		if(q->r != ((q->w + 1) & (MaxLumpQ - 1)))
+			break;
+		trace(TraceProc, "queuewrite sleep");
+		rsleep(&q->full);
+	}
+
+	slot = &q->q[q->w];
+	slot->u = u;
+	slot->p = p;
+	slot->creator = creator;
+	slot->ms = ms;
+	slot->gen = gen;
+	q->w = (q->w + 1) & (MaxLumpQ - 1);
+
+	trace(TraceProc, "queuewrite wakeup");
+	rwakeup(&q->empty);
+
+	qunlock(&q->lock);
+
+	return 0;
+}
+
+/*
+ * Advance the global generation, then for each index section's queue
+ * wait until it is empty or the entry at its head was queued at the
+ * new generation or later.  Does nothing before the queues exist.
+ */
+void
+flushqueue(void)
+{
+	LumpQueue *q;
+	int i;
+
+	if(lumpqs == nil)
+		return;
+
+	trace(TraceProc, "flushqueue");
+
+	qlock(&glk);
+	gen++;
+	qunlock(&glk);
+
+	for(i=0; i<mainindex->nsects; i++){
+		q = lumpqs + i;
+		qlock(&q->lock);
+		for(;;){
+			if(q->w == q->r)
+				break;
+			if(gen - q->q[q->r].gen <= 0)
+				break;
+			trace(TraceProc, "flushqueue sleep q%d", i);
+			rsleep(&q->flush);
+		}
+		qunlock(&q->lock);
+	}
+}
+
+/*
+ * Body of a write-queue proc: forever take the entry at the head of
+ * queue vq, wake flushers and one blocked writer, then write the lump
+ * with the queue unlocked and release it with putlump().
+ */
+static void
+queueproc(void *vq)
+{
+	LumpQueue *q;
+	WLump job;
+
+	threadsetname("queueproc");
+
+	q = vq;
+	for(;;){
+		qlock(&q->lock);
+		for(;;){
+			if(q->w != q->r)
+				break;
+			trace(TraceProc, "queueproc sleep empty");
+			rsleep(&q->empty);
+		}
+
+		/* copy the entry out so the slot can be reused right away */
+		job = q->q[q->r];
+		q->r = (q->r + 1) & (MaxLumpQ - 1);
+
+		trace(TraceProc, "queueproc wakeup flush");
+		rwakeupall(&q->flush);
+
+		trace(TraceProc, "queueproc wakeup full");
+		rwakeup(&q->full);
+
+		qunlock(&q->lock);
+
+		trace(TraceProc, "queueproc writelump %V", job.u->score);
+		if(writeqlump(job.u, job.p, job.creator, job.ms) < 0)
+			fprint(2, "failed to write lump for %V: %r", job.u->score);
+		trace(TraceProc, "queueproc wrotelump %V", job.u->score);
+
+		putlump(job.u);
+	}
+}
diff --git a/sys/src/cmd/venti/srv/mirrorarenas.c b/sys/src/cmd/venti/srv/mirrorarenas.c
new file mode 100755
index 000000000..8b72f1a51
--- /dev/null
+++ b/sys/src/cmd/venti/srv/mirrorarenas.c
@@ -0,0 +1,523 @@
+/*
+ * Mirror one arena partition onto another.
+ * Be careful to copy only new data.
+ */
+
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+Channel *writechan;
+
+/*
+ * A write request handed from copy() to writeproc over writechan.
+ */
+typedef struct Write Write;
+struct Write
+{
+ uchar *p; /* data to write */
+ int n; /* number of bytes at p */
+ uvlong o; /* offset in dst to write at */
+ int error; /* set to 1 by writeproc when the write fails */
+};
+
+Part *src;
+Part *dst;
+int force;
+int verbose;
+int dosha1 = 1;
+char *status;
+uvlong astart, aend;
+
+/*
+ * Print the command synopsis and exit.
+ * The flag list matches the cases handled in threadmain (-F, -s, -v).
+ */
+void
+usage(void)
+{
+	fprint(2, "usage: mirrorarenas [-Fsv] src dst [ranges]\n");
+	threadexitsall("usage");
+}
+
+char *tagged;
+
+/*
+ * Format a message and remember it in tagged, replacing any
+ * message saved earlier.  It is printed by the next chat() call.
+ */
+void
+tag(char *fmt, ...)
+{
+	va_list ap;
+
+	if(tagged != nil){
+		free(tagged);
+		tagged = nil;
+	}
+
+	va_start(ap, fmt);
+	tagged = vsmprint(fmt, ap);
+	va_end(ap);
+}
+
+/*
+ * Print a formatted message on standard output.  If tag() left a
+ * message pending, that message is written first and then discarded.
+ */
+void
+chat(char *fmt, ...)
+{
+	va_list ap;
+
+	if(tagged != nil){
+		write(1, tagged, strlen(tagged));
+		free(tagged);
+		tagged = nil;
+	}
+
+	va_start(ap, fmt);
+	vfprint(1, fmt, ap);
+	va_end(ap);
+}
+
+#pragma varargck argpos tag 1
+#pragma varargck argpos chat 1
+
+
+/*
+ * readpart wrapper: returns 0 when exactly count bytes were read,
+ * otherwise reports the failure through chat() and returns -1.
+ */
+int
+ereadpart(Part *p, u64int offset, u8int *buf, u32int count)
+{
+	if(readpart(p, offset, buf, count) == count)
+		return 0;
+
+	chat("%T readpart %s at %#llux+%ud: %r\n", p->name, offset, count);
+	return -1;
+}
+
+/*
+ * writepart wrapper: returns 0 when exactly count bytes were written
+ * and the following flushpart succeeded, otherwise reports the
+ * failure through chat() and returns -1.
+ */
+int
+ewritepart(Part *p, u64int offset, u8int *buf, u32int count)
+{
+	if(writepart(p, offset, buf, count) == count && flushpart(p) >= 0)
+		return 0;
+
+	chat("%T writepart %s at %#llux+%ud: %r\n", p->name, offset, count);
+	return -1;
+}
+
+/*
+ * Extra proc to do writes to dst, so that we can overlap reading
+ * src with writing dst during copy.  This is an easy factor of two
+ * (almost) in performance.
+ *
+ * Requests arrive on writechan.  &wsync is a marker that is received
+ * and otherwise ignored; a nil pointer ends the proc.
+ */
+static Write wsync;
+static void
+writeproc(void *v)
+{
+	Write *req;
+
+	USED(v);
+	for(;;){
+		req = recvp(writechan);
+		if(req == nil)
+			break;
+		if(req == &wsync)
+			continue;
+		if(ewritepart(dst, req->o, req->p, req->n) < 0)
+			req->error = 1;
+	}
+}
+
+/*
+ * Copy bytes [start, end) from src to dst, reading in chunks of up to
+ * 1MB and handing each chunk to writeproc through writechan.  Two
+ * buffers are used alternately so a read can proceed while the
+ * previous chunk is being written.  If ds is non-nil every chunk is
+ * also fed to sha1.  Returns 0 on success, -1 on a read or write error.
+ * The range must lie within [astart, aend].
+ */
+int
+copy(uvlong start, uvlong end, char *what, DigestState *ds)
+{
+ int i, n;
+ uvlong o;
+ static uchar tmp[2][1024*1024];
+ Write w[2];
+
+ assert(start <= end);
+ assert(astart <= start && start < aend);
+ assert(astart <= end && end <= aend);
+
+ if(verbose && start != end)
+ chat("%T copy %,llud-%,llud %s\n", start, end, what);
+
+ i = 0;
+ memset(w, 0, sizeof w);
+ for(o=start; o<end; o+=n){
+ /* w[i] is about to be reused: check how its previous write went */
+ if(w[i].error)
+ goto error;
+ n = sizeof tmp[i];
+ if(o+n > end)
+ n = end - o;
+ if(ereadpart(src, o, tmp[i], n) < 0)
+ goto error;
+ w[i].p = tmp[i];
+ w[i].o = o;
+ w[i].n = n;
+ w[i].error = 0;
+ sendp(writechan, &w[i]);
+ if(ds)
+ sha1(tmp[i], n, nil, ds);
+ i = 1-i;
+ }
+ if(w[i].error)
+ goto error;
+
+ /*
+ * wait for queued write to finish
+ */
+ sendp(writechan, &wsync);
+ i = 1-i;
+ if(w[i].error)
+ return -1;
+ return 0;
+
+error:
+ /*
+ * sync with write proc: a zero-length request is sent in w[i]
+ * before returning the failure
+ */
+ w[i].p = nil;
+ w[i].o = 0;
+ w[i].n = 0;
+ w[i].error = 0;
+ sendp(writechan, &w[i]);
+ return -1;
+}
+
+/*
+ * single-threaded, for reference:
+ * copy bytes [start, end) from src to dst one chunk at a time,
+ * feeding each chunk to sha1 when ds is non-nil.
+ * returns 0 on success, -1 on the first read or write failure.
+ */
+int
+copy1(uvlong start, uvlong end, char *what, DigestState *ds)
+{
+	int n;
+	uvlong o;
+	static uchar tmp[1024*1024];
+
+	assert(start <= end);
+	assert(astart <= start && start < aend);
+	assert(astart <= end && end <= aend);
+
+	if(verbose && start != end)
+		chat("%T copy %,llud-%,llud %s\n", start, end, what);
+
+	o = start;
+	while(o < end){
+		/* full buffer, or whatever is left of the range */
+		n = sizeof tmp;
+		if(o+n > end)
+			n = end - o;
+		if(ereadpart(src, o, tmp, n) < 0)
+			return -1;
+		if(ds)
+			sha1(tmp, n, nil, ds);
+		if(ewritepart(dst, o, tmp, n) < 0)
+			return -1;
+		o += n;
+	}
+	return 0;
+}
+
+/*
+ * Feed bytes [start, end) of partition p into the sha1 state ds.
+ * An empty range is a no-op.  Returns 0 on success, -1 on a read failure.
+ */
+int
+asha1(Part *p, uvlong start, uvlong end, DigestState *ds)
+{
+	int n;
+	uvlong o;
+	static uchar tmp[1024*1024];
+
+	if(start == end)
+		return 0;
+	assert(start < end);
+
+	if(verbose)
+		chat("%T sha1 %,llud-%,llud\n", start, end);
+
+	o = start;
+	while(o < end){
+		/* full buffer, or whatever is left of the range */
+		n = sizeof tmp;
+		if(o+n > end)
+			n = end - o;
+		if(ereadpart(p, o, tmp, n) < 0)
+			return -1;
+		sha1(tmp, n, nil, ds);
+		o += n;
+	}
+	return 0;
+}
+
+/*
+ * Round a down to a multiple of b.
+ */
+uvlong
+rdown(uvlong a, int b)
+{
+	uvlong rem;
+
+	rem = a%b;
+	return a-rem;
+}
+
+/*
+ * Round a up to a multiple of b; exact multiples are returned unchanged.
+ */
+uvlong
+rup(uvlong a, int b)
+{
+	uvlong rem;
+
+	rem = a%b;
+	if(rem != 0)
+		return a+b-rem;
+	return a;
+}
+
+/*
+ * Bring destination arena da up to date with source arena sa.
+ * Sets the globals astart/aend to the arena's extent including its
+ * head and tail blocks, then copies only what differs: new data
+ * blocks, new clump directory blocks and, for a sealed source, the
+ * hole between them and the tail.  With force set the whole extent is
+ * copied instead.  Problems are reported with chat(); some of them
+ * also set status to "errors".
+ */
+void
+mirror(Arena *sa, Arena *da)
+{
+ vlong v, si, di, end;
+ int clumpmax, blocksize, sealed;
+ static uchar buf[MaxIoSize];
+ ArenaHead h;
+ DigestState xds, *ds;
+ vlong shaoff, base;
+
+ base = sa->base;
+ blocksize = sa->blocksize;
+ end = sa->base + sa->size;
+
+ /* one block before base (head) and one block after end (tail) */
+ astart = base - blocksize;
+ aend = end + blocksize;
+
+ tag("%T %s (%,llud-%,llud)\n", sa->name, astart, aend);
+
+ /* NOTE(review): copy's return value is ignored here, so a failed forced copy does not set status */
+ if(force){
+ copy(astart, aend, "all", nil);
+ return;
+ }
+
+ if(sa->diskstats.sealed && da->diskstats.sealed && scorecmp(da->score, zeroscore) != 0){
+ if(scorecmp(sa->score, da->score) == 0){
+ if(verbose)
+ chat("%T %s: %V sealed mirrored\n", sa->name, sa->score);
+ return;
+ }
+ chat("%T %s: warning: sealed score mismatch %V vs %V\n", sa->name, sa->score, da->score);
+ /* Keep executing; will correct seal if possible. */
+ }
+ if(!sa->diskstats.sealed && da->diskstats.sealed && scorecmp(da->score, zeroscore) != 0){
+ chat("%T %s: dst is sealed, src is not\n", sa->name);
+ status = "errors";
+ return;
+ }
+ if(sa->diskstats.used < da->diskstats.used){
+ chat("%T %s: src used %,lld < dst used %,lld\n", sa->name, sa->diskstats.used, da->diskstats.used);
+ status = "errors";
+ return;
+ }
+
+ if(da->clumpmagic != sa->clumpmagic){
+ /*
+ * Write this now to reduce the window in which
+ * the head and tail disagree about clumpmagic.
+ */
+ da->clumpmagic = sa->clumpmagic;
+ memset(buf, 0, sizeof buf);
+ packarena(da, buf);
+ if(ewritepart(dst, end, buf, blocksize) < 0)
+ return;
+ }
+
+ /* rebuild the arena head from da's fields and write it to dst */
+ memset(&h, 0, sizeof h);
+ h.version = da->version;
+ strcpy(h.name, da->name);
+ h.blocksize = da->blocksize;
+ h.size = da->size + 2*da->blocksize;
+ h.clumpmagic = da->clumpmagic;
+ memset(buf, 0, sizeof buf);
+ packarenahead(&h, buf);
+ if(ewritepart(dst, base - blocksize, buf, blocksize) < 0)
+ return;
+
+ /*
+ * shaoff tracks how far the running sha1 (ds) has got;
+ * it is only maintained when the source is sealed and dosha1 is set
+ */
+ shaoff = 0;
+ ds = nil;
+ sealed = sa->diskstats.sealed && scorecmp(sa->score, zeroscore) != 0;
+ if(sealed && dosha1){
+ /* start sha1 state with header */
+ memset(&xds, 0, sizeof xds);
+ ds = &xds;
+ sha1(buf, blocksize, nil, ds);
+ shaoff = base;
+ }
+
+ /* data: hash what dst already has, then copy the blocks it lacks */
+ if(sa->diskstats.used != da->diskstats.used){
+ di = base+rdown(da->diskstats.used, blocksize);
+ si = base+rup(sa->diskstats.used, blocksize);
+ if(ds && asha1(dst, shaoff, di, ds) < 0)
+ return;
+ if(copy(di, si, "data", ds) < 0)
+ return;
+ shaoff = si;
+ }
+
+ /*
+ * directory extents, measured back from end: di from the number of
+ * complete directory blocks in dst, si from the source's clump
+ * count rounded up to whole blocks
+ */
+ clumpmax = sa->clumpmax;
+ di = end - da->diskstats.clumps/clumpmax * blocksize;
+ si = end - (sa->diskstats.clumps+clumpmax-1)/clumpmax * blocksize;
+
+ if(sa->diskstats.sealed){
+ /*
+ * might be a small hole between the end of the
+ * data and the beginning of the directory.
+ */
+ v = base+rup(sa->diskstats.used, blocksize);
+ if(ds && asha1(dst, shaoff, v, ds) < 0)
+ return;
+ if(copy(v, si, "hole", ds) < 0)
+ return;
+ shaoff = si;
+ }
+
+ if(da->diskstats.clumps != sa->diskstats.clumps){
+ if(ds && asha1(dst, shaoff, si, ds) < 0)
+ return;
+ if(copy(si, di, "directory", ds) < 0) /* si < di because clumpinfo blocks grow down */
+ return;
+ shaoff = di;
+ }
+
+ /* dst now mirrors src's contents; it is recorded as unsealed for the moment */
+ da->ctime = sa->ctime;
+ da->wtime = sa->wtime;
+ da->diskstats = sa->diskstats;
+ da->diskstats.sealed = 0;
+
+ /*
+ * Repack the arena tail information
+ * and save it for next time...
+ */
+ memset(buf, 0, sizeof buf);
+ packarena(da, buf);
+ if(ewritepart(dst, end, buf, blocksize) < 0)
+ return;
+
+ if(sealed){
+ /*
+ * ... but on the final pass, copy the encoding
+ * of the tail information from the source
+ * arena itself. There are multiple possible
+ * ways to write the tail info out (the exact
+ * details have changed as venti went through
+ * revisions), and to keep the SHA1 hash the
+ * same, we have to use what the disk uses.
+ */
+ if(asha1(dst, shaoff, end, ds) < 0
+ || copy(end, end+blocksize-VtScoreSize, "tail", ds) < 0)
+ return;
+ if(dosha1){
+ /* finish the hash over VtScoreSize zero bytes in place of the stored score */
+ memset(buf, 0, VtScoreSize);
+ sha1(buf, VtScoreSize, da->score, ds);
+ if(scorecmp(sa->score, da->score) == 0){
+ if(verbose)
+ chat("%T %s: %V sealed mirrored\n", sa->name, sa->score);
+ if(ewritepart(dst, end+blocksize-VtScoreSize, da->score, VtScoreSize) < 0)
+ return;
+ }else{
+ chat("%T %s: sealing dst: score mismatch: %V vs %V\n", sa->name, sa->score, da->score);
+ /*
+ * NOTE(review): the hash recomputed below is not stored anywhere
+ * (the digest argument is 0) and asha1's result is unchecked;
+ * the "reseal" line prints da->score as computed above - confirm intent.
+ */
+ memset(&xds, 0, sizeof xds);
+ asha1(dst, base-blocksize, end+blocksize-VtScoreSize, &xds);
+ sha1(buf, VtScoreSize, 0, &xds);
+ chat("%T reseal: %V\n", da->score);
+ status = "errors";
+ }
+ }else{
+ /* hashing disabled (-s): trust the source's score and copy it over */
+ if(verbose)
+ chat("%T %s: %V mirrored\n", sa->name, sa->score);
+ if(ewritepart(dst, end+blocksize-VtScoreSize, sa->score, VtScoreSize) < 0)
+ return;
+ }
+ }else{
+ chat("%T %s: %,lld used mirrored\n",
+ sa->name, sa->diskstats.used);
+ }
+}
+
+/*
+ * Mirror a selection of arenas from sp to dp.
+ * range == nil mirrors every arena and "none" mirrors nothing.
+ * Otherwise range is a comma-separated list whose elements are
+ * "n", "lo-hi", "-hi" (from 0) or "lo-" (to the last arena); the
+ * string is modified in place while it is parsed.
+ * Malformed elements and elements naming arenas that do not exist
+ * are reported and skipped instead of indexing past the arena arrays.
+ */
+void
+mirrormany(ArenaPart *sp, ArenaPart *dp, char *range)
+{
+	int i, lo, hi;
+	char *s, *t;
+
+	if(range == nil){
+		for(i=0; i<sp->narenas; i++)
+			mirror(sp->arenas[i], dp->arenas[i]);
+		return;
+	}
+	if(strcmp(range, "none") == 0)
+		return;
+
+	for(s=range; *s; s=t){
+		/* cut off this element; t is where the next one starts */
+		t = strchr(s, ',');
+		if(t)
+			*t++ = 0;
+		else
+			t = s+strlen(s);
+		if(*s == '-')
+			lo = 0;
+		else
+			lo = strtol(s, &s, 0);
+		hi = lo;
+		if(*s == '-'){
+			s++;
+			if(*s == 0)
+				hi = sp->narenas-1;
+			else
+				hi = strtol(s, &s, 0);
+		}
+		if(*s != 0){
+			chat("%T bad arena range: %s\n", s);
+			continue;
+		}
+		/* the numbers come from the command line: never index outside the arrays */
+		if(lo < 0 || hi >= sp->narenas || hi >= dp->narenas){
+			chat("%T arena range %d-%d out of bounds (%d arenas)\n", lo, hi, sp->narenas);
+			status = "errors";
+			continue;
+		}
+		for(i=lo; i<=hi; i++)
+			mirror(sp->arenas[i], dp->arenas[i]);
+	}
+}
+
+
+/*
+ * mirrorarenas [-Fsv] src dst [ranges]
+ *
+ * Open both arena partitions, check that their geometries agree,
+ * start the writer proc and mirror the selected arenas.
+ * The exit status is the global status string.
+ */
+void
+threadmain(int argc, char **argv)
+{
+	int i;
+	Arena *sa, *da;
+	ArenaPart *s, *d;
+	char *ranges;
+
+	ventifmtinstall();
+
+	ARGBEGIN{
+	case 'F':
+		force = 1;
+		break;
+	case 'v':
+		verbose++;
+		break;
+	case 's':
+		dosha1 = 0;
+		break;
+	default:
+		usage();
+	}ARGEND
+
+	if(argc != 2 && argc != 3)
+		usage();
+	ranges = nil;
+	if(argc == 3)
+		ranges = argv[2];
+
+	if((src = initpart(argv[0], OREAD)) == nil)
+		sysfatal("initpart %s: %r", argv[0]);
+	if((dst = initpart(argv[1], ORDWR)) == nil)
+		sysfatal("initpart %s: %r", argv[1]);
+	if((s = initarenapart(src)) == nil)
+		sysfatal("initarenapart %s: %r", argv[0]);
+	for(i=0; i<s->narenas; i++)
+		delarena(s->arenas[i]);
+	if((d = initarenapart(dst)) == nil)
+		sysfatal("initarenapart %s: %r", argv[1]);
+	for(i=0; i<d->narenas; i++)
+		delarena(d->arenas[i]);
+
+	/*
+	 * The arena geometries must match or all bets are off.
+	 */
+	if(s->narenas != d->narenas)
+		sysfatal("arena count mismatch: %d vs %d", s->narenas, d->narenas);
+	for(i=0; i<s->narenas; i++){
+		sa = s->arenas[i];
+		da = d->arenas[i];
+		if(sa->version != da->version)
+			sysfatal("arena %d: version mismatch: %d vs %d", i, sa->version, da->version);
+		if(sa->blocksize != da->blocksize)
+			sysfatal("arena %d: blocksize mismatch: %d vs %d", i, sa->blocksize, da->blocksize);
+		if(sa->size != da->size)
+			sysfatal("arena %d: size mismatch: %,lld vs %,lld", i, sa->size, da->size);
+		if(strcmp(sa->name, da->name) != 0)
+			sysfatal("arena %d: name mismatch: %s vs %s", i, sa->name, da->name);
+	}
+
+	/*
+	 * Mirror one arena at a time.
+	 * Without the channel and the writer proc, copy() would block
+	 * forever in sendp, so fail loudly if either cannot be set up.
+	 */
+	writechan = chancreate(sizeof(void*), 0);
+	if(writechan == nil)
+		sysfatal("chancreate: %r");
+	if(vtproc(writeproc, nil) < 0)
+		sysfatal("can't start write proc: %r");
+	mirrormany(s, d, ranges);
+	sendp(writechan, nil);	/* nil tells writeproc to finish */
+	threadexitsall(status);
+}
diff --git a/sys/src/cmd/venti/srv/mkfile b/sys/src/cmd/venti/srv/mkfile
new file mode 100755
index 000000000..947710ea0
--- /dev/null
+++ b/sys/src/cmd/venti/srv/mkfile
@@ -0,0 +1,101 @@
+</$objtype/mkfile
+
+LIBOFILES=\
+ arena.$O\
+ arenas.$O\
+ bloom.$O\
+ buildbuck.$O\
+ clump.$O\
+ config.$O\
+ conv.$O\
+ dcache.$O\
+ disksched.$O\
+ dump.$O\
+ graph.$O\
+ hdisk.$O\
+ hproc.$O\
+ httpd.$O\
+ icache.$O\
+ icachewrite.$O\
+ ifile.$O\
+ index.$O\
+ lump.$O\
+ lumpcache.$O\
+ lumpqueue.$O\
+ part.$O\
+ png.$O\
+ round.$O\
+ score.$O\
+ sortientry.$O\
+ stats.$O\
+ syncarena.$O\
+ syncindex0.$O\
+ trace.$O\
+ unwhack.$O\
+ utils.$O\
+ unittoull.$O\
+ whack.$O\
+ xml.$O\
+ zblock.$O\
+ zeropart.$O\
+
+SLIB=libvs.a$O
+
+LIB=$SLIB # /$objtype/lib/libventi.a
+
+HFILES= dat.h\
+ fns.h\
+ stdinc.h\
+ /sys/include/venti.h\
+ /sys/include/httpd.h\
+
+TARG=\
+ venti\
+ buildindex\
+ checkarenas\
+ checkindex\
+ clumpstats\
+ conf\
+ findscore\
+ fixarenas\
+ fmtarenas\
+ fmtbloom\
+ fmtindex\
+ fmtisect\
+ mirrorarenas\
+ printarena\
+ printarenapart\
+ rdarena\
+ syncindex\
+ verifyarena\
+ wrarena\
+
+OFILES=
+
+BIN=/$objtype/bin/venti
+
+it:V: $O.venti
+
+CLEANFILES=$CLEANFILES $SLIB
+
+</sys/src/cmd/mkmany
+
+CFLAGS=$CFLAGS -I.
+
+$SLIB: $LIBOFILES
+ ar rvc $SLIB $LIBOFILES
+
+# xml.c:D: mkxml dat.h
+# ./mkxml dat.h > xml.c
+
+acid:D: lumpcache.acid
+ cat $prereq >$target
+
+$O.conf:D: conf.rc
+ {
+ echo '#!/bin/rc'
+ echo '# THIS FILE IS AUTOMATICALLY GENERATED'
+ echo '# FROM /sys/src/cmd/venti/conf.rc. DO NOT EDIT.'
+ echo
+ sed 1d conf.rc
+ } >$target && chmod +x $target
diff --git a/sys/src/cmd/venti/srv/part.c b/sys/src/cmd/venti/srv/part.c
new file mode 100755
index 000000000..9f112cf62
--- /dev/null
+++ b/sys/src/cmd/venti/srv/part.c
@@ -0,0 +1,249 @@
+#include "stdinc.h"
+#include <ctype.h>
+#include "dat.h"
+#include "fns.h"
+
+u32int maxblocksize;
+int readonly;
+
+/*
+ * Parse an unsigned number in radix rad with an optional size suffix:
+ * k/K, m/M, g/G or t/T scale it by 2^10, 2^20, 2^30 or 2^40.
+ * The text must begin with a digit.  On success the value is stored
+ * in *u, *pp is set just past what was consumed, and 0 is returned;
+ * otherwise -1 is returned and nothing is stored.
+ */
+static int
+strtoullsuf(char *p, char **pp, int rad, u64int *u)
+{
+	u64int v;
+	int shift;
+
+	if(!isdigit((uchar)*p))
+		return -1;
+	v = strtoull(p, &p, rad);
+
+	shift = 0;
+	switch(*p){
+	case 'k':
+	case 'K':
+		shift = 10;
+		break;
+	case 'm':
+	case 'M':
+		shift = 20;
+		break;
+	case 'g':
+	case 'G':
+		shift = 30;
+		break;
+	case 't':
+	case 'T':
+		shift = 40;
+		break;
+	}
+	if(shift != 0){
+		/* a suffix was recognized: scale and step over it */
+		v <<= shift;
+		p++;
+	}
+
+	*pp = p;
+	*u = v;
+	return 0;
+}
+
+/*
+ * Split a partition name of the form "file", "file:lo", "file:lo-hi"
+ * or "file:-hi" at its last colon.  *file receives a freshly
+ * allocated copy of the file part; missing bounds are stored as 0.
+ * Returns 0 on success.  On a malformed range the copy is freed
+ * and -1 is returned.
+ */
+static int
+parsepart(char *name, char **file, u64int *lo, u64int *hi)
+{
+	char *p;
+
+	*file = estrdup(name);
+	p = strrchr(*file, ':');
+	if(p == nil){
+		*lo = 0;
+		*hi = 0;
+		return 0;
+	}
+	*p++ = 0;
+
+	if(*p == '-')
+		*lo = 0;
+	else if(strtoullsuf(p, &p, 0, lo) < 0)
+		goto bad;
+
+	if(*p == '-')
+		p++;
+	if(*p == 0){
+		*hi = 0;
+		return 0;
+	}
+	if(strtoullsuf(p, &p, 0, hi) < 0 || *p != 0)
+		goto bad;
+	return 0;
+
+bad:
+	free(*file);
+	return -1;
+}
+
+/*
+ * Open the partition described by name ("file" or "file:lo-hi", see
+ * parsepart) with the given open mode and return a new Part for it.
+ * When the global readonly is set the file is always opened OREAD.
+ * If an ORDWR open fails, a read-only open is tried and a warning is
+ * printed.  Returns nil, after seterr, if the name cannot be parsed,
+ * the file cannot be opened or stat'ed, its length is 0, or the
+ * bounds do not fit in the file.
+ */
+Part*
+initpart(char *name, int mode)
+{
+	Part *part;
+	Dir *dir;
+	char *file;
+	u64int lo, hi;
+
+	if(parsepart(name, &file, &lo, &hi) < 0)
+		return nil;
+	trace(TraceDisk, "initpart %s file %s lo 0x%llx hi 0x%llx", name, file, lo, hi);
+	part = MKZ(Part);
+	part->name = estrdup(name);
+	part->filename = estrdup(file);
+	if(readonly){
+		mode &= ~(OREAD|OWRITE|ORDWR);
+		mode |= OREAD;
+	}
+	part->fd = open(file, mode);
+	if(part->fd < 0){
+		if((mode&(OREAD|OWRITE|ORDWR)) == ORDWR)
+			part->fd = open(file, (mode&~ORDWR)|OREAD);
+		if(part->fd < 0){
+			freepart(part);
+			fprint(2, "can't open partition='%s': %r\n", file);
+			seterr(EOk, "can't open partition='%s': %r", file);
+			fprint(2, "%r\n");
+			free(file);
+			return nil;
+		}
+		fprint(2, "warning: %s opened for reading only\n", name);
+	}
+	part->offset = lo;
+	dir = dirfstat(part->fd);
+	if(dir == nil){
+		freepart(part);
+		seterr(EOk, "can't stat partition='%s': %r", file);
+		free(file);
+		return nil;
+	}
+	if(dir->length == 0){
+		free(dir);
+		freepart(part);
+		seterr(EOk, "can't determine size of partition %s", file);
+		free(file);
+		return nil;
+	}
+	/*
+	 * hi == 0 means "up to the end of the file".  An explicit hi
+	 * below lo would make the unsigned size computed below wrap around.
+	 */
+	if(dir->length < hi || dir->length < lo || (hi != 0 && hi < lo)){
+		freepart(part);
+		seterr(EOk, "partition '%s': bounds out of range (max %lld)", name, dir->length);
+		free(dir);
+		free(file);
+		return nil;
+	}
+	if(hi == 0)
+		hi = dir->length;
+	part->size = hi - part->offset;
+	free(dir);
+	/* part keeps its own copies of the names; the parsed one is no longer needed */
+	free(file);
+	return part;
+}
+
+/*
+ * Flush pending writes to the partition.
+ * This implementation does nothing and always reports success (0).
+ */
+int
+flushpart(Part *part)
+{
+ USED(part);
+ return 0;
+}
+
+/*
+ * Release a Part: close its file descriptor if one is open and free
+ * both name strings along with the structure itself.
+ * A nil part is ignored.
+ */
+void
+freepart(Part *part)
+{
+	if(part == nil)
+		return;
+	if(part->fd >= 0)
+		close(part->fd);
+	free(part->name);
+	/* filename is a separate estrdup made in initpart */
+	free(part->filename);
+	free(part);
+}
+
+/*
+ * Record the block size of a partition and keep the global
+ * maxblocksize at the largest size seen so far.
+ * Setting the size of a partition a second time is fatal.
+ */
+void
+partblocksize(Part *part, u32int blocksize)
+{
+	if(part->blocksize != 0)
+		sysfatal("resetting partition=%s's block size", part->name);
+
+	part->blocksize = blocksize;
+	if(maxblocksize < blocksize)
+		maxblocksize = blocksize;
+}
+
+enum {
+ Maxxfer = 64*1024, /* for NCR SCSI controllers; was 128K */
+};
+
+static int reopen(Part*);
+
+/*
+ * Read (isread != 0) or write count0 bytes at offset0 within the
+ * partition, in transfers of at most Maxxfer bytes.
+ * offset0 is relative to the start of the partition: it is checked
+ * against part->size and then translated to a file offset by adding
+ * part->offset, so that "file:lo-hi" partitions address the bytes
+ * starting at lo rather than the start of the file.
+ * Returns count0 on success; on an out-of-bounds request or a failed
+ * or zero-length transfer it calls seterr and returns -1.
+ */
+int
+rwpart(Part *part, int isread, u64int offset0, u8int *buf0, u32int count0)
+{
+	u32int count, opsize;
+	int n;
+	u8int *buf;
+	u64int offset;
+
+	trace(TraceDisk, "%s %s %ud at 0x%llx",
+		isread ? "read" : "write", part->name, count0, offset0);
+	if(offset0 >= part->size || offset0+count0 > part->size){
+		seterr(EStrange, "out of bounds %s offset 0x%llux count %ud to partition %s size 0x%llux",
+			isread ? "read" : "write", offset0, count0, part->name,
+			part->size);
+		return -1;
+	}
+
+	buf = buf0;
+	count = count0;
+	/* from here on offset is an absolute position in part->filename */
+	offset = part->offset + offset0;
+	while(count > 0){
+		opsize = count;
+		if(opsize > Maxxfer)
+			opsize = Maxxfer;
+		if(isread)
+			n = pread(part->fd, buf, opsize, offset);
+		else
+			n = pwrite(part->fd, buf, opsize, offset);
+		if(n <= 0){
+			seterr(EAdmin, "%s %s offset 0x%llux count %ud buf %p returned %d: %r",
+				isread ? "read" : "write", part->filename, offset, opsize, buf, n);
+			return -1;
+		}
+		offset += n;
+		count -= n;
+		buf += n;
+	}
+
+	return count0;
+}
+
+/*
+ * Read count bytes at offset from the partition into buf.
+ * Returns whatever rwpart returns: count on success, -1 on failure.
+ */
+int
+readpart(Part *part, u64int offset, u8int *buf, u32int count)
+{
+ return rwpart(part, 1, offset, buf, count);
+}
+
+/*
+ * Write count bytes from buf to the partition at offset.
+ * Returns whatever rwpart returns: count on success, -1 on failure.
+ */
+int
+writepart(Part *part, u64int offset, u8int *buf, u32int count)
+{
+ return rwpart(part, 0, offset, buf, count);
+}
+
+/*
+ * Read an entire partition (named as for initpart) into a newly
+ * allocated ZBlock.  Returns the block, or nil if the partition
+ * cannot be opened, the block cannot be allocated, or the read fails.
+ * The partition is closed again in every case.
+ */
+ZBlock*
+readfile(char *name)
+{
+	Part *p;
+	ZBlock *b;
+
+	p = initpart(name, OREAD);
+	if(p == nil)
+		return nil;
+
+	b = alloczblock(p->size, 0, p->blocksize);
+	if(b == nil)
+		seterr(EOk, "can't alloc %s: %r", name);
+	else if(readpart(p, 0, b->data, p->size) < 0){
+		seterr(EOk, "can't read %s: %r", name);
+		freezblock(b);
+		b = nil;
+	}
+	freepart(p);
+	return b;
+}
diff --git a/sys/src/cmd/venti/srv/png.c b/sys/src/cmd/venti/srv/png.c
new file mode 100755
index 000000000..81ab14c0c
--- /dev/null
+++ b/sys/src/cmd/venti/srv/png.c
@@ -0,0 +1,239 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+enum
+{
+ IDATSIZE = 20000,
+ FilterNone = 0
+};
+
+typedef struct ZlibR ZlibR;
+typedef struct ZlibW ZlibW;
+
+/*
+ * Read-side state for deflatezlib: where zread is in the image.
+ */
+struct ZlibR
+{
+ uchar *data; /* start of the pixel data */
+ int width; /* bytes from one row of data to the next */
+ int dx; /* pixels per row */
+ int dy; /* number of rows */
+ int x; /* next pixel to emit within the current row */
+ int y; /* current row */
+ int pixwid; /* bytes per pixel */
+};
+
+/*
+ * Write-side state for deflatezlib: compressed output is collected
+ * in buf and sent out as an IDAT chunk whenever the buffer fills.
+ */
+struct ZlibW
+{
+ Hio *io; /* where chunks are written */
+ uchar *buf; /* start of the collection buffer */
+ uchar *b; /* next free byte in buf */
+ uchar *e; /* end of buf */
+};
+
+static ulong *crctab;
+static uchar PNGmagic[] = { 137, 'P', 'N', 'G', '\r', '\n', 26, '\n'};
+
+/*
+ * Store the low 32 bits of v at a[0..3], most significant byte first.
+ */
+static void
+put4(uchar *a, ulong v)
+{
+	int i;
+
+	for(i = 3; i >= 0; i--){
+		a[i] = v;
+		v >>= 8;
+	}
+}
+
+/*
+ * Write one PNG chunk to io: a 4-byte length, the 4-character type,
+ * n bytes of data from d, and a 4-byte crc computed over the type
+ * and the data.  A type that is not exactly 4 characters long is
+ * silently ignored.
+ */
+static void
+chunk(Hio *io, char *type, uchar *d, int n)
+{
+	uchar word[4];
+	ulong crc;
+
+	if(strlen(type) != 4)
+		return;
+
+	put4(word, n);
+	hwrite(io, word, 4);
+	hwrite(io, type, 4);
+	hwrite(io, d, n);
+
+	crc = blockcrc(crctab, 0, type, 4);
+	crc = blockcrc(crctab, crc, d, n);
+	put4(word, crc);
+	hwrite(io, word, 4);
+}
+
+/*
+ * Read callback for deflatezlib: fill buf with up to n bytes of
+ * uncompressed PNG image data taken from the ZlibR state va.
+ * Every row is preceded by a FilterNone byte; 4-byte pixels are
+ * converted from premultiplied to non-premultiplied alpha on the way.
+ * Returns the number of bytes stored, which is 0 once all rows
+ * have been emitted.
+ */
+static int
+zread(void *va, void *buf, int n)
+{
+ int a, i, pixels, pixwid;
+ uchar *b, *e, *img;
+ ZlibR *z;
+
+ z = va;
+ pixwid = z->pixwid;
+ b = buf;
+ e = b+n;
+ while(b+pixwid <= e){
+ if(z->y >= z->dy)
+ break;
+ /*
+ * NOTE(review): if exactly pixwid bytes remain here at the start
+ * of a row, the filter byte is stored but no pixel fits after it,
+ * so z->x stays 0 and the next call appears to store a second
+ * filter byte for the same row - confirm against the buffer sizes
+ * deflatezlib actually passes.
+ */
+ if(z->x == 0)
+ *b++ = FilterNone;
+ /* as many whole pixels as fit, but no more than remain in this row */
+ pixels = (e-b)/pixwid;
+ if(pixels > z->dx - z->x)
+ pixels = z->dx - z->x;
+ img = z->data + z->width*z->y + pixwid*z->x;
+ memmove(b, img, pixwid*pixels);
+ if(pixwid == 4){
+ /*
+ * Convert to non-premultiplied alpha.
+ * Each of the first three bytes is clamped to the alpha
+ * value before being scaled by 255/alpha.
+ */
+ for(i=0; i<pixels; i++, b+=4){
+ a = b[3];
+ if(a != 0 && a != 255){
+ if(b[0] >= a)
+ b[0] = a;
+ b[0] = (b[0]*255)/a;
+ if(b[1] >= a)
+ b[1] = a;
+ b[1] = (b[1]*255)/a;
+ if(b[2] >= a)
+ b[2] = a;
+ b[2] = (b[2]*255)/a;
+ }
+ }
+ }else
+ b += pixwid*pixels;
+
+ /* advance, moving on to the next row when this one is complete */
+ z->x += pixels;
+ if(z->x >= z->dx){
+ z->x = 0;
+ z->y++;
+ }
+ }
+ return b - (uchar*)buf;
+}
+
+/*
+ * Emit whatever has been collected in z's buffer as one
+ * IDAT chunk and mark the buffer empty again.
+ */
+static void
+IDAT(ZlibW *z)
+{
+	int len;
+
+	len = z->b - z->buf;
+	chunk(z->io, "IDAT", z->buf, len);
+	z->b = z->buf;
+}
+
+/*
+ * Write callback for deflatezlib: append n bytes from buf to the
+ * ZlibW buffer va, emitting an IDAT chunk every time the buffer
+ * fills up.  Always returns n.
+ */
+static int
+zwrite(void *va, void *buf, int n)
+{
+	int m;
+	uchar *b, *e;
+	ZlibW *z;
+
+	z = va;
+	b = buf;
+	for(e = b+n; b < e; b += m){
+		/* copy as much as fits in the space left in the buffer */
+		m = z->e - z->b;
+		if(m > e - b)
+			m = e - b;
+		memmove(z->b, b, m);
+		z->b += m;
+		if(z->b >= z->e)
+			IDAT(z);
+	}
+	return n;
+}
+
+/*
+ * Return an image with the same contents as i in channel format
+ * ABGR32 (when i's channel string contains 'a') or BGR24 (otherwise).
+ * If i already has that format, i itself is returned; otherwise a
+ * new image is allocated and drawn under memdrawlock.
+ * Returns nil if the allocation fails.
+ */
+static Memimage*
+memRGBA(Memimage *i)
+{
+	Memimage *conv;
+	char chanstr[32];
+	ulong want;
+
+	/*
+	 * [A]BGR because we want R,G,B,[A] in big-endian order.  Sigh.
+	 */
+	chantostr(chanstr, i->chan);
+	want = BGR24;
+	if(strchr(chanstr, 'a') != nil)
+		want = ABGR32;
+
+	if(i->chan == want)
+		return i;
+
+	qlock(&memdrawlock);
+	conv = allocmemimage(i->r, want);
+	if(conv != nil)
+		memimagedraw(conv, conv->r, i, i->r.min, nil, i->r.min, S);
+	qunlock(&memdrawlock);
+	return conv;
+}
+
+/*
+ * Write image m to io as a PNG: signature, IHDR, one or more IDAT
+ * chunks holding the deflated pixels, and IEND.
+ * The first call initializes the deflate library and the crc table.
+ * Returns 0 on success, -1 if the image cannot be converted or the
+ * compression fails.  The converted copy of the image, when one had
+ * to be made, is freed on both the success and the failure path.
+ */
+int
+writepng(Hio *io, Memimage *m)
+{
+	static int first = 1;
+	static QLock lk;
+	uchar buf[200], *h;
+	Memimage *rgb;
+	ZlibR zr;
+	ZlibW zw;
+	int rv;
+
+	/* one-time setup; first is checked again with the lock held */
+	if(first){
+		qlock(&lk);
+		if(first){
+			deflateinit();
+			crctab = mkcrctab(0xedb88320);
+			first = 0;
+		}
+		qunlock(&lk);
+	}
+
+	rgb = memRGBA(m);
+	if(rgb == nil)
+		return -1;
+
+	hwrite(io, PNGmagic, sizeof PNGmagic);
+
+	/* IHDR chunk */
+	h = buf;
+	put4(h, Dx(m->r)); h += 4;
+	put4(h, Dy(m->r)); h += 4;
+	*h++ = 8;	/* 8 bits per channel */
+	if(rgb->chan == BGR24)
+		*h++ = 2;	/* RGB */
+	else
+		*h++ = 6;	/* RGBA */
+	*h++ = 0;	/* compression - deflate */
+	*h++ = 0;	/* filter - none */
+	*h++ = 0;	/* interlace - none */
+	chunk(io, "IHDR", buf, h-buf);
+
+	/* image data */
+	zr.dx = Dx(m->r);
+	zr.dy = Dy(m->r);
+	zr.width = rgb->width * sizeof(ulong);
+	zr.data = rgb->data->bdata;
+	zr.x = 0;
+	zr.y = 0;
+	zr.pixwid = chantodepth(rgb->chan)/8;
+	zw.io = io;
+	zw.buf = vtmalloc(IDATSIZE);
+	zw.b = zw.buf;
+	zw.e = zw.b + IDATSIZE;
+
+	rv = 0;
+	if(deflatezlib(&zw, zwrite, &zr, zread, 6, 0) < 0)
+		rv = -1;
+	else{
+		/* flush the partly filled last buffer, then finish the file */
+		if(zw.b > zw.buf)
+			IDAT(&zw);
+		chunk(io, "IEND", nil, 0);
+	}
+	free(zw.buf);
+
+	/* rgb is a private copy unless memRGBA handed back m itself */
+	if(m != rgb){
+		qlock(&memdrawlock);
+		freememimage(rgb);
+		qunlock(&memdrawlock);
+	}
+	return rv;
+}
diff --git a/sys/src/cmd/venti/srv/printarena.c b/sys/src/cmd/venti/srv/printarena.c
new file mode 100755
index 000000000..399385caf
--- /dev/null
+++ b/sys/src/cmd/venti/srv/printarena.c
@@ -0,0 +1,126 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * Print the command synopsis and exit.
+ * threadmain also accepts -o with an argument, the offset of the
+ * arena within the file.
+ */
+void
+usage(void)
+{
+	fprint(2, "usage: printarena [-o fileoffset] arenafile [offset]\n");
+	threadexitsall("usage");
+}
+
+/*
+ * Print the arena's summary on fd 2, then walk its clumps starting at
+ * offset (or at 0 when offset is ~0), printing one line per clump:
+ * address, score, type and uncompressed size.  The walk stops at the
+ * first free or bad clump magic, read failure, score mismatch or
+ * invalid type, and ends by printing the offset it reached.
+ */
+static void
+rdarena(Arena *arena, u64int offset)
+{
+ u64int a, aa, e;
+ u32int magic;
+ Clump cl;
+ uchar score[VtScoreSize];
+ ZBlock *lump;
+
+ printarena(2, arena);
+
+ a = arena->base;
+ e = arena->base + arena->size;
+ if(offset != ~(u64int)0) {
+ if(offset >= e-a)
+ sysfatal("bad offset %llud >= %llud",
+ offset, e-a);
+ aa = offset;
+ } else
+ aa = 0;
+
+ /*
+ * NOTE(review): the check above bounds offset by e-a (the arena
+ * size) while this loop bounds aa by e (base+size) - confirm
+ * which of the two is intended for aa.
+ */
+ for(; aa < e; aa += ClumpSize+cl.info.size) {
+ magic = clumpmagic(arena, aa);
+ if(magic == ClumpFreeMagic)
+ break;
+ if(magic != arena->clumpmagic) {
+ fprint(2, "illegal clump magic number %#8.8ux offset %llud\n",
+ magic, aa);
+ break;
+ }
+ lump = loadclump(arena, aa, 0, &cl, score, 0);
+ if(lump == nil) {
+ fprint(2, "clump %llud failed to read: %r\n", aa);
+ break;
+ }
+ /* the two breaks below leave the loop without freeing lump */
+ if(cl.info.type != VtCorruptType) {
+ scoremem(score, lump->data, cl.info.uncsize);
+ if(scorecmp(cl.info.score, score) != 0) {
+ fprint(2, "clump %llud has mismatched score\n", aa);
+ break;
+ }
+ if(vttypevalid(cl.info.type) < 0) {
+ fprint(2, "clump %llud has bad type %d\n", aa, cl.info.type);
+ break;
+ }
+ }
+ print("%22llud %V %3d %5d\n", aa, score, cl.info.type, cl.info.uncsize);
+ freezblock(lump);
+ }
+ print("end offset %llud\n", aa);
+}
+
+/*
+ * printarena [-o aoffset] arenafile [offset]
+ *
+ * Read and print the arena head found aoffset bytes into arenafile,
+ * set up the arena, and list its clumps with rdarena, starting at
+ * offset when one is given.
+ */
+void
+threadmain(int argc, char *argv[])
+{
+ char *file;
+ Arena *arena;
+ u64int offset, aoffset;
+ Part *part;
+ static uchar buf[8192];
+ ArenaHead head;
+
+ readonly = 1; /* for part.c */
+ aoffset = 0;
+ ARGBEGIN{
+ case 'o':
+ aoffset = strtoull(EARGF(usage()), 0, 0);
+ break;
+ default:
+ usage();
+ break;
+ }ARGEND
+
+ /* ~0 tells rdarena that no starting offset was given */
+ offset = ~(u64int)0;
+ switch(argc) {
+ default:
+ usage();
+ case 2:
+ offset = strtoull(argv[1], 0, 0);
+ /* fall through */
+ case 1:
+ file = argv[0];
+ }
+
+
+ ventifmtinstall();
+ statsinit();
+
+ part = initpart(file, OREAD|ODIRECT);
+ if(part == nil)
+ sysfatal("can't open file %s: %r", file);
+ if(readpart(part, aoffset, buf, sizeof buf) < 0)
+ sysfatal("can't read file %s: %r", file);
+
+ if(unpackarenahead(&head, buf) < 0)
+ sysfatal("corrupted arena header: %r");
+
+ print("# arena head version=%d name=%.*s blocksize=%d size=%lld clumpmagic=0x%.8ux\n",
+ head.version, ANameSize, head.name, head.blocksize,
+ head.size, head.clumpmagic);
+
+ /* the size recorded in the head must fit in what the file holds */
+ if(aoffset+head.size > part->size)
+ sysfatal("arena is truncated: want %llud bytes have %llud",
+ head.size, part->size);
+
+ partblocksize(part, head.blocksize);
+ initdcache(8 * MaxDiskBlock);
+
+ arena = initarena(part, aoffset, head.size, head.blocksize);
+ if(arena == nil)
+ sysfatal("initarena: %r");
+
+ rdarena(arena, offset);
+ threadexitsall(0);
+}
diff --git a/sys/src/cmd/venti/srv/printarenapart.c b/sys/src/cmd/venti/srv/printarenapart.c
new file mode 100755
index 000000000..5367d9669
--- /dev/null
+++ b/sys/src/cmd/venti/srv/printarenapart.c
@@ -0,0 +1,155 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+uchar buf[64*1024];
+
+ /*
+  * Print the command synopsis on standard error and exit with status "usage".
+  * threadmain accepts exactly one argument, so no offset is advertised.
+  */
+ void
+ usage(void)
+ {
+ 	fprint(2, "usage: printarenapart arenafile\n");
+ 	threadexitsall("usage");
+ }
+
+ /*
+  * Walk the clumps of arena starting at offset (or at 0 when offset is
+  * all-ones), printing one line per clump: address, score, type and
+  * uncompressed size.  The walk stops at the first free-clump magic, bad
+  * magic, unreadable clump, score mismatch or invalid type; the address
+  * reached is then printed as "end offset".
+  * The loaded clump is released on every exit from the loop body.
+  */
+ static void
+ rdarena(Arena *arena, u64int offset)
+ {
+ 	u64int a, aa, e;
+ 	u32int magic;
+ 	Clump cl;
+ 	uchar score[VtScoreSize];
+ 	ZBlock *lump;
+
+ 	printarena(2, arena);
+
+ 	a = arena->base;
+ 	e = arena->base + arena->size;
+ 	if(offset != ~(u64int)0) {
+ 		if(offset >= e-a)
+ 			sysfatal("bad offset %llud >= %llud",
+ 				offset, e-a);
+ 		aa = offset;
+ 	} else
+ 		aa = 0;
+
+ 	/* NOTE(review): aa starts at an arena-relative offset but is bounded by base+size here -- confirm intended */
+ 	for(; aa < e; aa += ClumpSize+cl.info.size) {
+ 		magic = clumpmagic(arena, aa);
+ 		if(magic == ClumpFreeMagic)
+ 			break;
+ 		if(magic != arena->clumpmagic) {
+ 			fprint(2, "illegal clump magic number %#8.8ux offset %llud\n",
+ 				magic, aa);
+ 			break;
+ 		}
+ 		lump = loadclump(arena, aa, 0, &cl, score, 0);
+ 		if(lump == nil) {
+ 			fprint(2, "clump %llud failed to read: %r\n", aa);
+ 			break;
+ 		}
+ 		if(cl.info.type != VtCorruptType) {
+ 			scoremem(score, lump->data, cl.info.uncsize);
+ 			if(scorecmp(cl.info.score, score) != 0) {
+ 				fprint(2, "clump %llud has mismatched score\n", aa);
+ 				freezblock(lump);	/* don't leak the block when giving up */
+ 				break;
+ 			}
+ 			if(vttypevalid(cl.info.type) < 0) {
+ 				fprint(2, "clump %llud has bad type %d\n", aa, cl.info.type);
+ 				freezblock(lump);
+ 				break;
+ 			}
+ 		}
+ 		print("%22llud %V %3d %5d\n", aa, score, cl.info.type, cl.info.uncsize);
+ 		freezblock(lump);
+ 	}
+ 	print("end offset %llud\n", aa);
+ }
+
+ /*
+  * Entry point: open the arena partition read-only, unpack its header,
+  * read the arena table that sits between the header and arenabase, and for
+  * each table entry read the arena's head and tail blocks and print the
+  * statistics and times stored in the tail.
+  * Entries whose head or tail cannot be read or unpacked are reported on
+  * standard error and skipped.
+  */
+ void
+ threadmain(int argc, char *argv[])
+ {
+ 	char *file, *p, *name;
+ 	char *table;
+ 	u64int offset;
+ 	Part *part;
+ 	ArenaPart ap;
+ 	ArenaHead head;
+ 	Arena tail;
+ 	char ct[40], mt[40];
+
+ 	readonly = 1;	/* for part.c */
+ 	ARGBEGIN{
+ 	default:
+ 		usage();
+ 		break;
+ 	}ARGEND
+
+ 	switch(argc) {
+ 	default:
+ 		usage();
+ 	case 1:
+ 		file = argv[0];
+ 	}
+
+ 	ventifmtinstall();
+ 	statsinit();
+
+ 	part = initpart(file, OREAD|ODIRECT);
+ 	if(part == nil)
+ 		sysfatal("can't open file %s: %r", file);
+ 	if(readpart(part, PartBlank, buf, sizeof buf) < 0)
+ 		sysfatal("can't read file %s: %r", file);
+
+ 	if(unpackarenapart(&ap, buf) < 0)
+ 		sysfatal("corrupted arena part header: %r");
+
+ 	print("# arena part version=%d blocksize=%d arenabase=%d\n",
+ 		ap.version, ap.blocksize, ap.arenabase);
+ 	/* the table starts at the first block boundary after the header */
+ 	ap.tabbase = (PartBlank+HeadSize+ap.blocksize-1)&~(ap.blocksize-1);
+ 	ap.tabsize = ap.arenabase - ap.tabbase;
+
+ 	table = malloc(ap.tabsize+1);
+ 	if(table == nil)
+ 		sysfatal("out of memory reading arena table: %r");
+ 	if(readpart(part, ap.tabbase, (uchar*)table, ap.tabsize) < 0)
+ 		sysfatal("read %s: %r", file);
+ 	table[ap.tabsize] = 0;	/* make the table a C string */
+
+ 	partblocksize(part, ap.blocksize);
+ 	initdcache(8 * MaxDiskBlock);
+
+ 	for(p=table; p && *p; p=strchr(p, '\n')){
+ 		if(*p == '\n')
+ 			p++;
+ 		name = p;
+ 		/* the arena's starting offset follows the first blank or tab */
+ 		p = strpbrk(p, " \t");
+ 		if(p == nil){
+ 			fprint(2, "bad line: %s\n", name);
+ 			break;
+ 		}
+ 		offset = strtoull(p, nil, 0);
+ 		if(readpart(part, offset, buf, sizeof buf) < 0){
+ 			fprint(2, "%s: read %s: %r\n", argv0, file);
+ 			continue;
+ 		}
+ 		if(unpackarenahead(&head, buf) < 0){
+ 			fprint(2, "%s: unpackarenahead: %r\n", argv0);
+ 			continue;
+ 		}
+ 		/* the tail is the last block of the arena */
+ 		if(readpart(part, offset+head.size-head.blocksize, buf, head.blocksize) < 0){
+ 			fprint(2, "%s: read %s: %r\n", argv0, file);
+ 			continue;
+ 		}
+ 		if(unpackarena(&tail, buf) < 0){
+ 			fprint(2, "%s: unpackarena: %r\n", argv0);
+ 			continue;
+ 		}
+ 		print("arena %s %lld clumps=%,d cclumps=%,d used=%,lld uncsize=%,lld%s\n",
+ 			tail.name, offset,
+ 			tail.diskstats.clumps, tail.diskstats.cclumps,
+ 			tail.diskstats.used, tail.diskstats.uncsize,
+ 			tail.diskstats.sealed ? " sealed" : "");
+ 		/* copy each ctime result before the next call and cut it at 28 characters */
+ 		strcpy(ct, ctime(tail.ctime));
+ 		ct[28] = 0;
+ 		strcpy(mt, ctime(tail.wtime));
+ 		mt[28] = 0;
+ 		print("\tctime=%s\n\tmtime=%s\n", ct, mt);
+ 	}
+ 	threadexitsall(0);
+ }
diff --git a/sys/src/cmd/venti/srv/printarenas.c b/sys/src/cmd/venti/srv/printarenas.c
new file mode 100755
index 000000000..111db0187
--- /dev/null
+++ b/sys/src/cmd/venti/srv/printarenas.c
@@ -0,0 +1,113 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+#include <bio.h>
+
+Biobuf bout;
+
+ /*
+  * Write one index entry to bout: address, score, type and size.
+  */
+ static void
+ pie(IEntry *ie)
+ {
+ 	Bprint(&bout, "%22lld %V %3d %5d\n", ie->ia.addr, ie->score, ie->ia.type, ie->ia.size);
+ }
+
+ /*
+  * Print the command synopsis on standard error and exit with the
+  * non-empty status "usage" so callers can tell the invocation failed.
+  */
+ void
+ usage(void)
+ {
+ 	fprint(2, "usage: printarenas [-B blockcachesize] config [arenaname...]\n");
+ 	threadexitsall("usage");
+ }
+
+Config conf;
+
+ /*
+  * Decide whether the object called name was selected on the command line.
+  * An empty argument list selects everything; otherwise name must equal
+  * one of the argc strings in argv.  Returns 1 if selected, 0 if not.
+  */
+ int
+ shoulddump(char *name, int argc, char **argv)
+ {
+ 	if(argc == 0)
+ 		return 1;
+ 	while(argc-- > 0)
+ 		if(strcmp(name, *argv++) == 0)
+ 			return 1;
+ 	return 0;
+ }
+
+ enum
+ {
+ ClumpChunks = 32*1024, /* number of ClumpInfo entries dumparena asks readclumpinfos for at a time */
+ };
+
+ /*
+  * Print every entry of arena's clump directory through pie.
+  * a is the address assigned to the arena's first clump; each later clump
+  * gets the previous address plus that clump's size and ClumpSize.
+  * The directory is read ClumpChunks entries at a time; a failed read is
+  * reported on standard error and ends the dump of this arena.
+  */
+ void
+ dumparena(Arena *arena, u64int a)
+ {
+ 	IEntry ie;
+ 	ClumpInfo *ci, *cis;
+ 	u32int clump;
+ 	int i, n;
+
+ 	cis = MKN(ClumpInfo, ClumpChunks);
+ 	memset(&ie, 0, sizeof(IEntry));
+ 	for(clump = 0; clump < arena->memstats.clumps; clump += n){
+ 		/* the last chunk may be short */
+ 		n = ClumpChunks;
+ 		if(n > arena->memstats.clumps - clump)
+ 			n = arena->memstats.clumps - clump;
+ 		if(readclumpinfos(arena, clump, cis, n) != n){
+ 			fprint(2, "arena directory read failed: %r\n");
+ 			break;
+ 		}
+
+ 		for(i = 0; i < n; i++){
+ 			ci = &cis[i];
+ 			ie.ia.type = ci->type;
+ 			ie.ia.size = ci->uncsize;
+ 			ie.ia.addr = a;
+ 			a += ci->size + ClumpSize;
+ 			/* round the on-disk size up to whole 1<<ABlockLog units */
+ 			ie.ia.blocks = (ci->size + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog;
+ 			scorecp(ie.score, ci->score);
+ 			pie(&ie);
+ 		}
+ 	}
+ 	free(cis);
+ }
+
+ /*
+  * Entry point: load the venti configuration named by the first argument,
+  * size the disk block cache (at least enough for the index's arenas and
+  * sections), and dump the clump directory of every arena selected by the
+  * remaining arguments to standard output.
+  * -B without a value is a usage error rather than a nil dereference.
+  */
+ void
+ threadmain(int argc, char *argv[])
+ {
+ 	int i;
+ 	Index *ix;
+ 	u32int bcmem;
+
+ 	bcmem = 0;
+ 	ARGBEGIN{
+ 	case 'B':
+ 		bcmem = unittoull(EARGF(usage()));
+ 		break;
+ 	default:
+ 		usage();
+ 		break;
+ 	}ARGEND
+
+ 	if(argc < 1)
+ 		usage();
+
+ 	ventifmtinstall();
+
+ 	if(initventi(argv[0], &conf) < 0)
+ 		sysfatal("can't init venti: %r");
+
+ 	/* never run with less cache than the index layout needs */
+ 	if(bcmem < maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16))
+ 		bcmem = maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16);
+ 	if(0) fprint(2, "initialize %d bytes of disk block cache\n", bcmem);
+ 	initdcache(bcmem);
+
+ 	Binit(&bout, 1, OWRITE);
+ 	ix = mainindex;
+ 	for(i=0; i<ix->narenas; i++)
+ 		if(shoulddump(ix->arenas[i]->name, argc-1, argv+1))
+ 			dumparena(ix->arenas[i], ix->amap[i].start);
+ 	Bterm(&bout);
+ 	threadexitsall(0);
+ }
diff --git a/sys/src/cmd/venti/srv/printindex.c b/sys/src/cmd/venti/srv/printindex.c
new file mode 100755
index 000000000..edbcf7934
--- /dev/null
+++ b/sys/src/cmd/venti/srv/printindex.c
@@ -0,0 +1,99 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+#include <bio.h>
+
+Biobuf bout;
+
+ /*
+  * Write one index entry to bout: address, score, type and size.
+  */
+ static void
+ pie(IEntry *ie)
+ {
+ 	Bprint(&bout, "%22lld %V %3d %5d\n", ie->ia.addr, ie->score, ie->ia.type, ie->ia.size);
+ }
+
+ /*
+  * Print the command synopsis on standard error and exit with the
+  * non-empty status "usage" so callers can tell the invocation failed.
+  */
+ void
+ usage(void)
+ {
+ 	fprint(2, "usage: printindex [-B blockcachesize] config [isectname...]\n");
+ 	threadexitsall("usage");
+ }
+
+Config conf;
+
+ /*
+  * Decide whether the object called name was selected on the command line.
+  * An empty argument list selects everything; otherwise name must equal
+  * one of the argc strings in argv.  Returns 1 if selected, 0 if not.
+  */
+ int
+ shoulddump(char *name, int argc, char **argv)
+ {
+ 	if(argc == 0)
+ 		return 1;
+ 	while(argc-- > 0)
+ 		if(strcmp(name, *argv++) == 0)
+ 			return 1;
+ 	return 0;
+ }
+
+ /*
+  * Print every index entry stored in index section is.
+  * Each of the section's blocks is read directly from its partition,
+  * unpacked as a bucket, and its entries are printed through pie.
+  * A block that cannot be read is reported on standard error and skipped.
+  * The block buffer is released before returning.
+  */
+ void
+ dumpisect(ISect *is)
+ {
+ 	int j;
+ 	uchar *buf;
+ 	u32int i;
+ 	u64int off;
+ 	IBucket ib;
+ 	IEntry ie;
+
+ 	buf = emalloc(is->blocksize);
+ 	for(i=0; i<is->blocks; i++){
+ 		off = is->blockbase+(u64int)is->blocksize*i;
+ 		if(readpart(is->part, off, buf, is->blocksize) < 0)
+ 			fprint(2, "read %s at 0x%llux: %r\n", is->part->name, off);
+ 		else{
+ 			unpackibucket(&ib, buf, is->bucketmagic);
+ 			for(j=0; j<ib.n; j++){
+ 				unpackientry(&ie, &ib.data[j*IEntrySize]);
+ 				pie(&ie);
+ 			}
+ 		}
+ 	}
+ 	free(buf);
+ }
+
+ /*
+  * Entry point: load the venti configuration named by the first argument,
+  * size the disk block cache (at least enough for the index's arenas and
+  * sections), and dump every index section selected by the remaining
+  * arguments to standard output.
+  * -B without a value is a usage error rather than a nil dereference.
+  */
+ void
+ threadmain(int argc, char *argv[])
+ {
+ 	int i;
+ 	Index *ix;
+ 	u32int bcmem;
+
+ 	bcmem = 0;
+ 	ARGBEGIN{
+ 	case 'B':
+ 		bcmem = unittoull(EARGF(usage()));
+ 		break;
+ 	default:
+ 		usage();
+ 		break;
+ 	}ARGEND
+
+ 	if(argc < 1)
+ 		usage();
+
+ 	/* NOTE(review): pie prints with %V but only 'H' is installed here -- confirm initventi installs 'V' */
+ 	fmtinstall('H', encodefmt);
+
+ 	if(initventi(argv[0], &conf) < 0)
+ 		sysfatal("can't init venti: %r");
+
+ 	/* never run with less cache than the index layout needs */
+ 	if(bcmem < maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16))
+ 		bcmem = maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16);
+ 	if(0) fprint(2, "initialize %d bytes of disk block cache\n", bcmem);
+ 	initdcache(bcmem);
+
+ 	ix = mainindex;
+ 	Binit(&bout, 1, OWRITE);
+ 	for(i=0; i<ix->nsects; i++)
+ 		if(shoulddump(ix->sects[i]->name, argc-1, argv+1))
+ 			dumpisect(ix->sects[i]);
+ 	Bterm(&bout);
+ 	threadexitsall(0);
+ }
diff --git a/sys/src/cmd/venti/srv/printmap.c b/sys/src/cmd/venti/srv/printmap.c
new file mode 100755
index 000000000..f3392ef81
--- /dev/null
+++ b/sys/src/cmd/venti/srv/printmap.c
@@ -0,0 +1,42 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+ /*
+  * Report the command synopsis on standard error, then exit with status "usage".
+  */
+ void
+ usage(void)
+ {
+ 	char *synopsis;
+
+ 	synopsis = "usage: printmap [-B blockcachesize] config\n";
+ 	fprint(2, "%s", synopsis);
+ 	threadexitsall("usage");
+ }
+
+Config conf;
+
+ /*
+  * Entry point: load the venti configuration read-only and print the
+  * main index description on standard output.
+  * -B is still accepted and parsed, but its value is not used by this
+  * program; a missing value is a usage error.
+  */
+ void
+ threadmain(int argc, char *argv[])
+ {
+ 	u32int bcmem;
+
+ 	bcmem = 0;
+ 	ARGBEGIN{
+ 	case 'B':
+ 		bcmem = unittoull(EARGF(usage()));
+ 		break;
+ 	default:
+ 		usage();
+ 		break;
+ 	}ARGEND
+ 	USED(bcmem);
+
+ 	/* this tool only reads, so the partitions are always opened read-only */
+ 	readonly = 1;
+
+ 	if(argc != 1)
+ 		usage();
+
+ 	if(initventi(argv[0], &conf) < 0)
+ 		sysfatal("can't init venti: %r");
+
+ 	printindex(1, mainindex);
+ 	threadexitsall(0);
+ }
diff --git a/sys/src/cmd/venti/srv/rdarena.c b/sys/src/cmd/venti/srv/rdarena.c
new file mode 100755
index 000000000..0ccc1d96a
--- /dev/null
+++ b/sys/src/cmd/venti/srv/rdarena.c
@@ -0,0 +1,96 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+static int verbose, quiet;
+
+ /*
+  * Print the command synopsis on standard error and exit with the
+  * non-empty status "usage".  Both flags parsed by threadmain (-q and -v)
+  * are listed.
+  */
+ void
+ usage(void)
+ {
+ 	fprint(2, "usage: rdarena [-qv] arenapart arena\n");
+ 	threadexitsall("usage");
+ }
+
+ /*
+  * Copy arena, including the block before its base and the block after
+  * its data, to standard output.  Data moves in chunks of MaxIoSize or the
+  * arena block size, whichever is larger, dropping to single blocks once a
+  * full chunk would run past the end.
+  * A failed read is only reported; a failed or short write is fatal.
+  */
+ static void
+ rdarena(Arena *arena)
+ {
+ 	ZBlock *zb;
+ 	u64int addr, end;
+ 	u32int chunk;
+
+ 	if(!quiet){
+ 		fprint(2, "copying %s to standard output\n", arena->name);
+ 		printarena(2, arena);
+ 	}
+
+ 	chunk = arena->blocksize;
+ 	if(chunk < MaxIoSize)
+ 		chunk = MaxIoSize;
+
+ 	zb = alloczblock(chunk, 0, arena->blocksize);
+ 	end = arena->base + arena->size + arena->blocksize;
+ 	addr = arena->base - arena->blocksize;
+ 	while(addr + arena->blocksize <= end){
+ 		if(addr + chunk > end)
+ 			chunk = arena->blocksize;
+ 		if(readpart(arena->part, addr, zb->data, chunk) < 0)
+ 			fprint(2, "can't copy %s, read at %lld failed: %r\n", arena->name, addr);
+ 		if(write(1, zb->data, chunk) != chunk)
+ 			sysfatal("can't copy %s, write at %lld failed: %r", arena->name, addr);
+ 		addr += chunk;
+ 	}
+
+ 	freezblock(zb);
+ }
+
+ /*
+  * Entry point: open the arena partition read-only, look up the arena
+  * named by the second argument and copy it to standard output.
+  * -q suppresses the progress report, -v prints the partition layout.
+  */
+ void
+ threadmain(int argc, char *argv[])
+ {
+ 	ArenaPart *ap;
+ 	Part *pt;
+ 	char *partname, *want;
+ 	int i;
+
+ 	ventifmtinstall();
+ 	statsinit();
+
+ 	ARGBEGIN{
+ 	case 'q':
+ 		quiet++;
+ 		break;
+ 	case 'v':
+ 		verbose++;
+ 		break;
+ 	default:
+ 		usage();
+ 		break;
+ 	}ARGEND
+
+ 	readonly = 1;
+
+ 	if(argc != 2)
+ 		usage();
+ 	partname = argv[0];
+ 	want = argv[1];
+
+ 	if((pt = initpart(partname, OREAD|ODIRECT)) == nil)
+ 		sysfatal("can't open partition %s: %r", partname);
+
+ 	if((ap = initarenapart(pt)) == nil)
+ 		sysfatal("can't initialize arena partition in %s: %r", partname);
+
+ 	if(verbose)
+ 		printarenapart(2, ap);
+
+ 	initdcache(8 * MaxDiskBlock);
+
+ 	/* stop at the first arena whose name matches */
+ 	for(i = 0; i < ap->narenas; i++)
+ 		if(strcmp(ap->arenas[i]->name, want) == 0)
+ 			break;
+ 	if(i >= ap->narenas)
+ 		sysfatal("couldn't find arena %s", want);
+
+ 	rdarena(ap->arenas[i]);
+ 	threadexitsall(0);
+ }
diff --git a/sys/src/cmd/venti/srv/readifile.c b/sys/src/cmd/venti/srv/readifile.c
new file mode 100755
index 000000000..a822a9878
--- /dev/null
+++ b/sys/src/cmd/venti/srv/readifile.c
@@ -0,0 +1,29 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+ /*
+  * Report the command synopsis on standard error, then exit with status "usage".
+  */
+ void
+ usage(void)
+ {
+ 	char *synopsis;
+
+ 	synopsis = "usage: readifile file\n";
+ 	fprint(2, "%s", synopsis);
+ 	threadexitsall("usage");
+ }
+
+ /*
+  * Entry point: load the file named by the single argument with
+  * readifile and copy its contents to standard output.
+  * A failed or short write is fatal instead of being silently ignored.
+  */
+ void
+ threadmain(int argc, char *argv[])
+ {
+ 	IFile ifile;
+
+ 	ARGBEGIN{
+ 	default:
+ 		usage();
+ 	}ARGEND
+
+ 	if(argc != 1)
+ 		usage();
+
+ 	if(readifile(&ifile, argv[0]) < 0)
+ 		sysfatal("readifile %s: %r", argv[0]);
+ 	if(write(1, ifile.b->data, ifile.b->len) != ifile.b->len)
+ 		sysfatal("write: %r");
+ 	threadexitsall(nil);
+ }
diff --git a/sys/src/cmd/venti/srv/reseal.c b/sys/src/cmd/venti/srv/reseal.c
new file mode 100755
index 000000000..f7353122e
--- /dev/null
+++ b/sys/src/cmd/venti/srv/reseal.c
@@ -0,0 +1,303 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+static uchar *data;
+static uchar *data1;
+static int blocksize;
+static int sleepms;
+static int fd;
+static int force;
+static vlong offset0;
+
+ /*
+  * Print the command synopsis on standard error and exit with the
+  * non-empty status "usage".  threadmain requires the partition and at
+  * least one arena name, so the names are shown as mandatory.
+  */
+ void
+ usage(void)
+ {
+ 	fprint(2, "usage: reseal [-f] [-b blocksize] [-s ms] arenapart name...\n");
+ 	threadexitsall("usage");
+ }
+
+ /*
+  * Write all n bytes of buf to fd at partition-relative offset off
+  * (offset0 is added), retrying after short writes.
+  * Returns 0 on success, -1 as soon as pwrite returns 0 or an error.
+  */
+ static int
+ pwriteblock(uchar *buf, int n, vlong off)
+ {
+ 	int done, w;
+
+ 	done = 0;
+ 	while(done < n){
+ 		w = pwrite(fd, buf + done, n - done, offset0 + off + done);
+ 		if(w <= 0)
+ 			return -1;
+ 		done += w;
+ 	}
+ 	return 0;
+ }
+
+ /*
+  * Read exactly n bytes into buf from fd at partition-relative offset off
+  * (offset0 is added), retrying after short reads.
+  * Returns 0 on success and -1 on error; hitting end of file first sets
+  * the error string to "early eof".
+  */
+ static int
+ preadblock(uchar *buf, int n, vlong off)
+ {
+ 	int done, r;
+
+ 	done = 0;
+ 	while(done < n){
+ 		r = pread(fd, buf + done, n - done, offset0 + off + done);
+ 		if(r == 0)
+ 			werrstr("early eof");
+ 		if(r <= 0)
+ 			return -1;
+ 		done += r;
+ 	}
+ 	return 0;
+ }
+
+ /*
+  * Read the arena's tail block (the last block of the head->size bytes
+  * starting at off) and unpack it into arena, then fill in the geometry
+  * derived from head: block size, base, clumps per directory block, and
+  * data size.  For a sealed arena the score stored in the last VtScoreSize
+  * bytes of the tail block is copied into arena->score.
+  * Returns 0 on success; on failure reports on standard error and returns -1.
+  */
+ static int
+ loadheader(char *name, ArenaHead *head, Arena *arena, vlong off)
+ {
+ 	vlong tailoff;
+
+ 	tailoff = off + head->size - head->blocksize;
+ 	if(preadblock(data, head->blocksize, tailoff) < 0){
+ 		fprint(2, "%s: reading arena tail: %r\n", name);
+ 		return -1;
+ 	}
+
+ 	memset(arena, 0, sizeof *arena);
+ 	if(unpackarena(arena, data) < 0){
+ 		fprint(2, "%s: unpack arena tail: %r\n", name);
+ 		return -1;
+ 	}
+
+ 	/* one block of head before the data, one block of tail after it */
+ 	arena->base = off + head->blocksize;
+ 	arena->size = head->size - 2*head->blocksize;
+ 	arena->blocksize = head->blocksize;
+ 	arena->clumpmax = arena->blocksize / ClumpInfoSize;
+
+ 	if(arena->diskstats.sealed)
+ 		scorecp(arena->score, &data[head->blocksize - VtScoreSize]);
+ 	return 0;
+ }
+
+uchar zero[VtScoreSize];
+
+ static int /*
+  * Check the seal of arena and build a replacement tail block in data.
+  * Hashes the head block and the arena data, then the tail block with
+  * its score field replaced by zeros, and compares the result with
+  * arena->score.  A mismatch fails with -1 unless force is set, in which
+  * case it is only reported.  Afterwards data holds a freshly packed tail
+  * block with its new score in the last VtScoreSize bytes, and newscore
+  * holds that score.  Returns 0 on success, -1 with the error string set
+  * on a read failure or an unforced mismatch.
+  */
+ verify(Arena *arena, void *data, uchar *newscore)
+ {
+ vlong e, bs, n, o;
+ DigestState ds, ds1;
+ uchar score[VtScoreSize];
+
+ /*
+ * now we know how much to read
+ * read everything but the last block, which is special
+ */
+ e = arena->size + arena->blocksize; /* bytes to hash before the tail: head block plus data */
+ o = arena->base - arena->blocksize; /* offset of the head block */
+ bs = arena->blocksize;
+ memset(&ds, 0, sizeof ds);
+ for(n = 0; n < e; n += bs){
+ if(preadblock(data, bs, o + n) < 0){
+ werrstr("read: %r");
+ return -1;
+ }
+ if(n + bs > e)
+ bs = e - n;
+ sha1(data, bs, nil, &ds);
+ }
+
+ /* last block */
+ if(preadblock(data, arena->blocksize, o + e) < 0){
+ werrstr("read: %r");
+ return -1;
+ }
+ ds1 = ds; /* keep the digest state as of the start of the tail for the new score */
+ sha1(data, bs - VtScoreSize, nil, &ds);
+ sha1(zero, VtScoreSize, score, &ds); /* the stored score itself is hashed as zeros */
+ if(scorecmp(score, arena->score) != 0){
+ if(!force){
+ werrstr("score mismatch: %V != %V", score, arena->score);
+ return -1;
+ }
+ fprint(2, "warning: score mismatch %V != %V\n", score, arena->score);
+ }
+
+ /* prepare new last block */
+ memset(data, 0, arena->blocksize);
+ packarena(arena, data);
+ sha1(data, bs, newscore, &ds1); /* continue from the saved state over the repacked tail */
+ scorecp((uchar*)data + arena->blocksize - VtScoreSize, newscore);
+
+ return 0;
+ }
+
+ /*
+  * Reseal one arena.  The arena is the one starting at the current seek
+  * position of fd; name and len are what the arena table says about it
+  * and are only used for messages and sanity warnings (len 0 disables the
+  * length check).
+  * Steps: read and unpack the header, load the tail, require the arena to
+  * be sealed, verify the old score, write the repacked tail with the new
+  * score, then verify again.  Every failure is reported on standard error
+  * and abandons this arena.
+  */
+ static void
+ resealarena(char *name, vlong len)
+ {
+ 	ArenaHead head;
+ 	Arena arena;
+ 	u64int off;
+ 	uchar newscore[VtScoreSize];
+
+ 	fprint(2, "%s: begin reseal\n", name);
+
+ 	off = seek(fd, 0, 1);
+
+ 	/*
+ 	 * read a little bit, which will include the header
+ 	 */
+ 	if(preadblock(data, HeadSize, off) < 0){
+ 		fprint(2, "%s: reading header: %r\n", name);
+ 		return;
+ 	}
+ 	if(unpackarenahead(&head, data) < 0){
+ 		fprint(2, "%s: corrupt arena header: %r\n", name);
+ 		return;
+ 	}
+ 	if(head.version != ArenaVersion4 && head.version != ArenaVersion5)
+ 		fprint(2, "%s: warning: unknown arena version %d\n", name, head.version);
+ 	if(len != 0 && len != head.size)
+ 		fprint(2, "%s: warning: unexpected length %lld != %lld\n", name, head.size, len);
+ 	if(strcmp(name, "<stdin>") != 0 && strcmp(head.name, name) != 0)
+ 		fprint(2, "%s: warning: unexpected name %s\n", name, head.name);
+
+ 	if(loadheader(name, &head, &arena, off) < 0)
+ 		return;
+
+ 	if(!arena.diskstats.sealed){
+ 		fprint(2, "%s: not sealed\n", name);
+ 		return;
+ 	}
+
+ 	/* leaves the new tail block in data and its score in newscore */
+ 	if(verify(&arena, data, newscore) < 0){
+ 		fprint(2, "%s: failed to verify before reseal: %r\n", name);
+ 		return;
+ 	}
+
+ 	if(pwriteblock(data, arena.blocksize, arena.base + arena.size) < 0){
+ 		fprint(2, "%s: writing new tail: %r\n", name);
+ 		return;
+ 	}
+ 	scorecp(arena.score, newscore);
+ 	fprint(2, "%s: resealed: %V\n", name, newscore);
+
+ 	/* read everything back to confirm what is now on disk */
+ 	if(verify(&arena, data, newscore) < 0){
+ 		fprint(2, "%s: failed to verify after reseal!: %r\n", name);
+ 		return;
+ 	}
+
+ 	fprint(2, "%s: verified: %V\n", name, newscore);
+ }
+
+ /*
+  * Decide whether the arena called name was requested.  With no names
+  * (n == 0) every arena qualifies.  Otherwise name must match one of the
+  * n entries of s; the matching entry is set to nil so that names never
+  * matched can be reported afterwards.  Returns 1 if requested, else 0.
+  */
+ static int
+ shouldcheck(char *name, char **s, int n)
+ {
+ 	int left;
+
+ 	if(n == 0)
+ 		return 1;
+
+ 	for(left = n; left > 0; left--, s++){
+ 		if(*s == nil || strcmp(name, *s) != 0)
+ 			continue;
+ 		*s = nil;
+ 		return 1;
+ 	}
+ 	return 0;
+ }
+
+ /*
+  * Read and unpack the arena partition header into ap, compute where the
+  * arena table lives (first block boundary after the header, up to
+  * arenabase), and return the table as a freshly allocated,
+  * NUL-terminated string.  Any failure, including running out of memory,
+  * is fatal.
+  * NOTE(review): reads 8192 bytes into data, which threadmain sizes from
+  * -b blocksize -- confirm blocksize can never be smaller than 8192.
+  */
+ char *
+ readap(ArenaPart *ap)
+ {
+ 	char *table;
+
+ 	if(preadblock(data, 8192, PartBlank) < 0)
+ 		sysfatal("read arena part header: %r");
+ 	if(unpackarenapart(ap, data) < 0)
+ 		sysfatal("corrupted arena part header: %r");
+ 	fprint(2, "# arena part version=%d blocksize=%d arenabase=%d\n",
+ 		ap->version, ap->blocksize, ap->arenabase);
+ 	ap->tabbase = (PartBlank+HeadSize+ap->blocksize-1)&~(ap->blocksize-1);
+ 	ap->tabsize = ap->arenabase - ap->tabbase;
+ 	table = malloc(ap->tabsize+1);
+ 	if(table == nil)
+ 		sysfatal("out of memory reading arena part directory: %r");
+ 	if(preadblock((uchar*)table, ap->tabsize, ap->tabbase) < 0)
+ 		sysfatal("reading arena part directory: %r");
+ 	table[ap->tabsize] = 0;
+ 	return table;
+ }
+
+ /*
+  * Entry point: open the arena partition for writing, read its arena
+  * table, and reseal every listed arena whose name was given on the
+  * command line.  The table's first line is the number of entries; each
+  * following line is "name start stop".
+  * Names that were never matched are reported at the end, starting with
+  * the first name (argv[1]).
+  */
+ void
+ threadmain(int argc, char *argv[])
+ {
+ 	int i, nline;
+ 	char *p, *q, *table, *f[10], line[256];
+ 	vlong start, stop;
+ 	ArenaPart ap;
+ 	Part *part;
+
+ 	ventifmtinstall();
+ 	blocksize = MaxIoSize;
+ 	ARGBEGIN{
+ 	case 'b':
+ 		blocksize = unittoull(EARGF(usage()));
+ 		break;
+ 	case 'f':
+ 		force = 1;
+ 		break;
+ 	case 's':
+ 		sleepms = atoi(EARGF(usage()));
+ 		break;
+ 	default:
+ 		usage();
+ 		break;
+ 	}ARGEND
+
+ 	if(argc < 2)
+ 		usage();
+
+ 	data = vtmalloc(blocksize);
+ 	if((part = initpart(argv[0], ORDWR)) == nil)
+ 		sysfatal("open partition %s: %r", argv[0]);
+ 	fd = part->fd;
+ 	offset0 = part->offset;
+
+ 	table = readap(&ap);
+
+ 	nline = atoi(table);
+ 	p = strchr(table, '\n');
+ 	if(p)
+ 		p++;
+ 	for(i=0; i<nline; i++){
+ 		if(p == nil){
+ 			fprint(2, "warning: unexpected arena table end\n");
+ 			break;
+ 		}
+ 		/* split off the current line; q is the start of the next one */
+ 		q = strchr(p, '\n');
+ 		if(q)
+ 			*q++ = 0;
+ 		if(strlen(p) >= sizeof line){
+ 			fprint(2, "warning: long arena table line: %s\n", p);
+ 			p = q;
+ 			continue;
+ 		}
+ 		/* tokenize a copy so the original line can still be printed */
+ 		strcpy(line, p);
+ 		memset(f, 0, sizeof f);
+ 		if(tokenize(line, f, nelem(f)) < 3){
+ 			fprint(2, "warning: bad arena table line: %s\n", p);
+ 			p = q;
+ 			continue;
+ 		}
+ 		p = q;
+ 		if(shouldcheck(f[0], argv+1, argc-1)){
+ 			start = strtoull(f[1], 0, 0);
+ 			stop = strtoull(f[2], 0, 0);
+ 			if(stop <= start){
+ 				fprint(2, "%s: bad start,stop %lld,%lld\n", f[0], start, stop);
+ 				continue;
+ 			}
+ 			if(seek(fd, start, 0) < 0)
+ 				fprint(2, "%s: seek to start: %r\n", f[0]);
+ 			resealarena(f[0], stop - start);
+ 		}
+ 	}
+ 	/* shouldcheck cleared every name it matched; the names start at argv[1] */
+ 	for(i=1; i<argc; i++)
+ 		if(argv[i] != 0)
+ 			fprint(2, "%s: did not find arena\n", argv[i]);
+
+ 	threadexitsall(nil);
+ }
diff --git a/sys/src/cmd/venti/srv/round.c b/sys/src/cmd/venti/srv/round.c
new file mode 100755
index 000000000..bbf4a478a
--- /dev/null
+++ b/sys/src/cmd/venti/srv/round.c
@@ -0,0 +1,102 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+ void /*
+  * Record the current round of r as finished, wake everyone sleeping on
+  * r->finish, then sleep on r->start until doanother is set.  On wakeup
+  * the next round number becomes the current one and doanother is cleared.
+  * Takes and releases r->lock itself.
+  */
+ waitforkick(Round *r)
+ {
+ int n;
+
+ qlock(&r->lock);
+ r->last = r->current; /* the round that was running is now complete */
+ assert(r->current+1 == r->next);
+ rwakeupall(&r->finish);
+ while(!r->doanother)
+ rsleep(&r->start);
+ n = r->next++;
+ r->current = n;
+ r->doanother = 0;
+ qunlock(&r->lock);
+ }
+
+ static void /*
+  * Request another round of r by setting doanother and waking one sleeper
+  * on r->start.  With wait set, keep re-requesting and sleeping on
+  * r->finish until r->last has caught up with the value r->next had on entry.
+  * Both callers in this file hold r->lock across the call.
+  */
+ _kickround(Round *r, int wait)
+ {
+ int n;
+
+ if(!r->doanother)
+ trace(TraceProc, "kick %s", r->name);
+ r->doanother = 1;
+ rwakeup(&r->start);
+ if(wait){
+ n = r->next;
+ while((int)(n - r->last) > 0){ /* signed difference of the two round counters */
+ r->doanother = 1;
+ rwakeup(&r->start);
+ rsleep(&r->finish);
+ }
+ }
+ }
+
+ void /* locked wrapper: run _kickround(r, wait) while holding r->lock */
+ kickround(Round *r, int wait)
+ {
+ qlock(&r->lock);
+ _kickround(r, wait);
+ qunlock(&r->lock);
+ }
+
+ /*
+  * Reset r to its initial state: all three rendezvous points share
+  * r->lock, no round has run (last and current are 0, the next round is
+  * number 1), no round is requested, and delay is remembered as the
+  * delayed-kick interval.  name is stored by reference, not copied.
+  */
+ void
+ initround(Round *r, char *name, int delay)
+ {
+ 	memset(r, 0, sizeof *r);
+
+ 	/* identification and timing */
+ 	r->name = name;
+ 	r->delaytime = delay;
+
+ 	/* every Rendez sleeps under the same lock */
+ 	r->start.l = &r->lock;
+ 	r->finish.l = &r->lock;
+ 	r->delaywait.l = &r->lock;
+
+ 	/* round counters */
+ 	r->doanother = 0;
+ 	r->last = 0;
+ 	r->current = 0;
+ 	r->next = 1;
+ }
+
+ void /* set r->delaykick and wake one sleeper on r->delaywait, under r->lock */
+ delaykickround(Round *r)
+ {
+ qlock(&r->lock);
+ r->delaykick = 1;
+ rwakeup(&r->delaywait);
+ qunlock(&r->lock);
+ }
+
+ void /*
+  * Thread body; v is the Round to serve.  Loops forever: sleep on
+  * r->delaywait until delaykick is set, note the next round number, drop
+  * the lock and sleep for r->delaytime, then, if no round was started in
+  * the meantime (r->next unchanged), kick a round and wait for it.
+  * NOTE(review): nothing in this function clears r->delaykick -- confirm
+  * whether it is reset elsewhere.
+  */
+ delaykickroundproc(void *v)
+ {
+ Round *r = v;
+ int n;
+
+ threadsetname("delaykickproc %s", r->name);
+ qlock(&r->lock);
+ for(;;){
+ while(r->delaykick == 0){
+ trace(TraceProc, "sleep");
+ rsleep(&r->delaywait);
+ }
+
+ n = r->next;
+ qunlock(&r->lock); /* do not hold the lock across the delay */
+
+ trace(TraceProc, "waitround 0x%ux", (uint)n);
+ sleep(r->delaytime);
+
+ qlock(&r->lock);
+ if(n == r->next){ /* nobody started a round while we slept */
+ trace(TraceProc, "kickround 0x%ux", (uint)n);
+ _kickround(r, 1);
+ }
+
+ trace(TraceProc, "finishround 0x%ux", (uint)n);
+ }
+ }
+
diff --git a/sys/src/cmd/venti/srv/score.c b/sys/src/cmd/venti/srv/score.c
new file mode 100755
index 000000000..f150fd78e
--- /dev/null
+++ b/sys/src/cmd/venti/srv/score.c
@@ -0,0 +1,46 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+u8int zeroscore[VtScoreSize];
+
+ /*
+  * Deliberately empty.  Calling it forces score.o, and with it
+  * zeroscore, to be linked in on OS X.
+  */
+ void
+ needzeroscore(void)
+ {
+ }
+
+ /*
+  * Store in score the SHA1 digest of the n bytes at buf,
+  * starting from a zeroed digest state.
+  */
+ void
+ scoremem(u8int *score, u8int *buf, int n)
+ {
+ 	DigestState ds;
+
+ 	memset(&ds, 0, sizeof ds);
+ 	sha1(buf, n, score, &ds);
+ }
+
+ /*
+  * Return the value (0-15) of the hexadecimal digit c, accepting both
+  * letter cases, or -1 if c is not a hexadecimal digit.
+  */
+ static int
+ hexv(int c)
+ {
+ 	int v;
+
+ 	if('a' <= c && c <= 'f')
+ 		v = 10 + (c - 'a');
+ 	else if('A' <= c && c <= 'F')
+ 		v = 10 + (c - 'A');
+ 	else if('0' <= c && c <= '9')
+ 		v = c - '0';
+ 	else
+ 		v = -1;
+ 	return v;
+ }
+
+ /*
+  * Convert the first 2*VtScoreSize hexadecimal characters of s into
+  * VtScoreSize bytes in score.
+  * Returns -1 at the first character that is not a hexadecimal digit;
+  * otherwise returns 1 if the string ends right after the digits and 0
+  * if more characters follow.
+  */
+ int
+ strscore(char *s, u8int *score)
+ {
+ 	int i, hi, lo;
+
+ 	for(i = 0; i < VtScoreSize; i++){
+ 		hi = hexv(s[0]);
+ 		if(hi < 0)
+ 			return -1;
+ 		lo = hexv(s[1]);
+ 		if(lo < 0)
+ 			return -1;
+ 		score[i] = (hi << 4) + lo;
+ 		s += 2;
+ 	}
+ 	return *s == '\0';
+ }
diff --git a/sys/src/cmd/venti/srv/sortientry.c b/sys/src/cmd/venti/srv/sortientry.c
new file mode 100755
index 000000000..b8b8e876c
--- /dev/null
+++ b/sys/src/cmd/venti/srv/sortientry.c
@@ -0,0 +1,365 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+#include <bio.h>
+
+typedef struct IEBuck IEBuck;
+typedef struct IEBucks IEBucks;
+
+ enum
+ {
+ ClumpChunks = 32*1024 /* number of ClumpInfo entries readarenainfo asks readclumpinfos for at a time */
+ };
+
+ struct IEBuck /* one bucket of the external sort; flushed chunks are chained on disk through head */
+ {
+ u32int head; /* head of chain of chunks on the disk; TWID32 when nothing has been written */
+ u32int used; /* usage of the last chunk, in bytes */
+ u64int total; /* total number of bytes in this bucket */
+ u8int *buf; /* chunk of entries for this bucket */
+ };
+
+ struct IEBucks /* state of one external bucket sort over a scratch partition */
+ {
+ Part *part; /* partition the chunks and the sorted output are written to */
+ u64int off; /* offset for writing data in the partition */
+ u32int chunks; /* total chunks written to fd */
+ u64int max; /* max bytes entered in any one bucket */
+ int bits; /* number of bits in initial bucket sort */
+ int nbucks; /* 1 << bits, the number of buckets */
+ u32int size; /* bytes in each of the buckets chunks */
+ u32int usable; /* amount usable for IEntry data */
+ u8int *buf; /* buffer for all chunks */
+ u8int *xbuf; /* raw allocation that initiebucks aligns to produce buf */
+ IEBuck *bucks; /* the nbucks buckets */
+ };
+
+#define U32GET(p) (((p)[0]<<24)|((p)[1]<<16)|((p)[2]<<8)|(p)[3])
+#define U32PUT(p,v) (p)[0]=(v)>>24;(p)[1]=(v)>>16;(p)[2]=(v)>>8;(p)[3]=(v)
+
+static IEBucks *initiebucks(Part *part, int bits, u32int size);
+static int flushiebuck(IEBucks *ib, int b, int reset);
+static int flushiebucks(IEBucks *ib);
+static u32int sortiebuck(IEBucks *ib, int b);
+static u64int sortiebucks(IEBucks *ib);
+static int sprayientry(IEBucks *ib, IEntry *ie);
+static u32int readarenainfo(IEBucks *ib, Arena *arena, u64int a, Bloom *b);
+static u32int readiebuck(IEBucks *ib, int b);
+static void freeiebucks(IEBucks *ib);
+
+/*
+ * build a sorted file with all IEntries which should be in ix.
+ * assumes the arenas' directories are up to date.
+ * reads each, converts the entries to index entries,
+ * and sorts them.
+ */
+u64int
+sortrawientries(Index *ix, Part *tmp, u64int *base, Bloom *bloom)
+{
+ IEBucks *ib;
+ u64int clumps, sorted;
+ u32int n;
+ int i, ok;
+
+/* ZZZ should allow configuration of bits, bucket size */
+ ib = initiebucks(tmp, 8, 64*1024);
+ if(ib == nil){
+ seterr(EOk, "can't create sorting buckets: %r");
+ return TWID64;
+ }
+ ok = 0;
+ clumps = 0;
+ fprint(2, "constructing entry list\n");
+ for(i = 0; i < ix->narenas; i++){
+ n = readarenainfo(ib, ix->arenas[i], ix->amap[i].start, bloom);
+ if(n == TWID32){
+ ok = -1;
+ break;
+ }
+ clumps += n;
+ }
+ fprint(2, "sorting %lld entries\n", clumps);
+ if(ok == 0){
+ sorted = sortiebucks(ib);
+ *base = (u64int)ib->chunks * ib->size;
+ if(sorted != clumps){
+ fprint(2, "sorting messed up: clumps=%lld sorted=%lld\n", clumps, sorted);
+ ok = -1;
+ }
+ }
+ freeiebucks(ib);
+ if(ok < 0)
+ return TWID64;
+ return clumps;
+}
+
+ #define CHECK(cis) if(((ulong*)cis)[-4] != 0xA110C09) xabort(); /* debugging aid: faults if the word 4 ulongs before cis is not 0xA110C09 -- presumably an allocator header magic, TODO confirm */
+
+ void /* debugging aid: stop the program at once by storing through a nil pointer */
+ xabort(void)
+ {
+ int *x;
+
+ x = 0;
+ *x = 0; /* deliberate fault */
+ }
+
+/*
+ * read in all of the arena's clump directory,
+ * convert to IEntry format, and bucket sort based
+ * on the first few bits.
+ */
+static u32int
+readarenainfo(IEBucks *ib, Arena *arena, u64int a, Bloom *b)
+{
+ IEntry ie;
+ ClumpInfo *ci, *cis;
+ u32int clump;
+ int i, n, ok, nskip;
+
+ if(arena->memstats.clumps)
+ fprint(2, "\tarena %s: %d entries\n", arena->name, arena->memstats.clumps);
+ else
+ fprint(2, "[%s] ", arena->name);
+
+ cis = MKN(ClumpInfo, ClumpChunks);
+ ok = 0;
+ nskip = 0;
+ memset(&ie, 0, sizeof(IEntry));
+ for(clump = 0; clump < arena->memstats.clumps; clump += n){
+ n = ClumpChunks;
+ if(n > arena->memstats.clumps - clump)
+ n = arena->memstats.clumps - clump;
+ if(readclumpinfos(arena, clump, cis, n) != n){
+ seterr(EOk, "arena directory read failed: %r");
+ ok = -1;
+ break;
+ }
+
+ for(i = 0; i < n; i++){
+ ci = &cis[i];
+ ie.ia.type = ci->type;
+ ie.ia.size = ci->uncsize;
+ ie.ia.addr = a;
+ a += ci->size + ClumpSize;
+ ie.ia.blocks = (ci->size + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog;
+ scorecp(ie.score, ci->score);
+ if(ci->type == VtCorruptType){
+ if(0) print("! %V %22lld %3d %5d %3d\n",
+ ie.score, ie.ia.addr, ie.ia.type, ie.ia.size, ie.ia.blocks);
+ nskip++;
+ }else
+ sprayientry(ib, &ie);
+ markbloomfilter(b, ie.score);
+ }
+ }
+ free(cis);
+ if(ok < 0)
+ return TWID32;
+ return clump - nskip;
+}
+
+/*
+ * initialize the external bucket sorting data structures
+ */
+static IEBucks*
+initiebucks(Part *part, int bits, u32int size)
+{
+ IEBucks *ib;
+ int i;
+
+ ib = MKZ(IEBucks);
+ if(ib == nil){
+ seterr(EOk, "out of memory");
+ return nil;
+ }
+ ib->bits = bits;
+ ib->nbucks = 1 << bits;
+ ib->size = size;
+ ib->usable = (size - U32Size) / IEntrySize * IEntrySize;
+ ib->bucks = MKNZ(IEBuck, ib->nbucks);
+ if(ib->bucks == nil){
+ seterr(EOk, "out of memory allocation sorting buckets");
+ freeiebucks(ib);
+ return nil;
+ }
+ ib->xbuf = MKN(u8int, size * ((1 << bits)+1));
+ ib->buf = (u8int*)(((uintptr)ib->xbuf+size-1)&~(uintptr)(size-1));
+ if(ib->buf == nil){
+ seterr(EOk, "out of memory allocating sorting buckets' buffers");
+ freeiebucks(ib);
+ return nil;
+ }
+ for(i = 0; i < ib->nbucks; i++){
+ ib->bucks[i].head = TWID32;
+ ib->bucks[i].buf = &ib->buf[i * size];
+ }
+ ib->part = part;
+ return ib;
+}
+
+ /*
+  * Release everything owned by ib; ib may be nil.
+  * While xbuf is set, buf is only an aligned pointer into xbuf and must
+  * not be passed to free; sortiebucks clears xbuf once buf becomes an
+  * allocation of its own.
+  */
+ static void
+ freeiebucks(IEBucks *ib)
+ {
+ 	if(ib == nil)
+ 		return;
+ 	free(ib->bucks);
+ 	if(ib->xbuf != nil)
+ 		free(ib->xbuf);
+ 	else
+ 		free(ib->buf);
+ 	free(ib);
+ }
+
+ /*
+  * initial sort: put the entry into the correct bucket,
+  * flushing the bucket's chunk to disk once it cannot hold another entry.
+  * returns 0 on success, -1 with the error string set on failure.
+  */
+ static int
+ sprayientry(IEBucks *ib, IEntry *ie)
+ {
+ 	u32int n;
+ 	int b;
+
+ 	b = hashbits(ie->score, ib->bits);
+ 	n = ib->bucks[b].used;
+ 	if(n + IEntrySize > ib->usable){
+ 		/* should be flushed below, but if flush fails, this can happen */
+ 		seterr(EOk, "out of space in bucket");
+ 		return -1;
+ 	}
+ 	packientry(ie, &ib->bucks[b].buf[n]);
+ 	n += IEntrySize;
+ 	ib->bucks[b].used = n;
+ 	if(n + IEntrySize <= ib->usable)
+ 		return 0;
+ 	return flushiebuck(ib, b, 1);
+ }
+
+ /*
+  * finish sorting:
+  * for each bucket, read it in and sort it
+  * write out the final file
+  * returns the total number of entries written, or TWID64 on failure.
+  */
+ static u64int
+ sortiebucks(IEBucks *ib)
+ {
+ 	u64int tot;
+ 	u32int n;
+ 	int i;
+
+ 	if(flushiebucks(ib) < 0)
+ 		return TWID64;
+ 	for(i = 0; i < ib->nbucks; i++)
+ 		ib->bucks[i].buf = nil;
+ 	/* the sorted output goes after all the chunks written so far */
+ 	ib->off = (u64int)ib->chunks * ib->size;
+
+ 	/* the chunk buffers are done; from here on buf is its own allocation */
+ 	free(ib->xbuf);
+ 	ib->xbuf = nil;
+
+ 	ib->buf = MKN(u8int, ib->max + U32Size);
+ 	if(ib->buf == nil){
+ 		seterr(EOk, "out of memory allocating final sorting buffer; try more buckets");
+ 		return TWID64;
+ 	}
+ 	tot = 0;
+ 	for(i = 0; i < ib->nbucks; i++){
+ 		n = sortiebuck(ib, i);
+ 		if(n == TWID32)
+ 			return TWID64;
+ 		if(n != ib->bucks[i].total/IEntrySize)
+ 			fprint(2, "bucket %d changed count %d => %d\n",
+ 				i, (int)(ib->bucks[i].total/IEntrySize), n);
+ 		tot += n;
+ 	}
+ 	return tot;
+ }
+
+/*
+ * sort from bucket b of ib into the output file to
+ */
+static u32int
+sortiebuck(IEBucks *ib, int b)
+{
+ u32int n;
+
+ n = readiebuck(ib, b);
+ if(n == TWID32)
+ return TWID32;
+ qsort(ib->buf, n, IEntrySize, ientrycmp);
+ if(writepart(ib->part, ib->off, ib->buf, n*IEntrySize) < 0){
+ seterr(EOk, "can't write sorted bucket: %r");
+ return TWID32;
+ }
+ ib->off += n * IEntrySize;
+ return n;
+}
+
+/*
+ * write out a single bucket
+ */
+static int
+flushiebuck(IEBucks *ib, int b, int reset)
+{
+ u32int n;
+
+ if(ib->bucks[b].used == 0)
+ return 0;
+ n = ib->bucks[b].used;
+ U32PUT(&ib->bucks[b].buf[n], ib->bucks[b].head);
+ n += U32Size;
+ USED(n);
+ if(writepart(ib->part, (u64int)ib->chunks * ib->size, ib->bucks[b].buf, ib->size) < 0){
+ seterr(EOk, "can't write sorting bucket to file: %r");
+xabort();
+ return -1;
+ }
+ ib->bucks[b].head = ib->chunks++;
+ ib->bucks[b].total += ib->bucks[b].used;
+ if(reset)
+ ib->bucks[b].used = 0;
+ return 0;
+}
+
+/*
+ * write out all of the buckets, and compute
+ * the maximum size of any bucket
+ */
+static int
+flushiebucks(IEBucks *ib)
+{
+ int i;
+
+ for(i = 0; i < ib->nbucks; i++){
+ if(flushiebuck(ib, i, 0) < 0)
+ return -1;
+ if(ib->bucks[i].total > ib->max)
+ ib->max = ib->bucks[i].total;
+ }
+ return 0;
+}
+
+/*
+ * read in the chained buffers for bucket b,
+ * and return it's total number of IEntries
+ */
+static u32int
+readiebuck(IEBucks *ib, int b)
+{
+ u32int head, m, n;
+
+ head = ib->bucks[b].head;
+ n = 0;
+ m = ib->bucks[b].used;
+ if(m == 0)
+ m = ib->usable;
+ if(0) if(ib->bucks[b].total)
+ fprint(2, "\tbucket %d: %lld entries\n", b, ib->bucks[b].total/IEntrySize);
+ while(head != TWID32){
+ if(readpart(ib->part, (u64int)head * ib->size, &ib->buf[n], m+U32Size) < 0){
+ seterr(EOk, "can't read index sort bucket: %r");
+ return TWID32;
+ }
+ n += m;
+ head = U32GET(&ib->buf[n]);
+ m = ib->usable;
+ }
+ if(n != ib->bucks[b].total)
+ fprint(2, "\tbucket %d: expected %d entries, got %d\n",
+ b, (int)ib->bucks[b].total/IEntrySize, n/IEntrySize);
+ return n / IEntrySize;
+}
diff --git a/sys/src/cmd/venti/srv/stats.c b/sys/src/cmd/venti/srv/stats.c
new file mode 100755
index 000000000..bb944760b
--- /dev/null
+++ b/sys/src/cmd/venti/srv/stats.c
@@ -0,0 +1,212 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+int collectstats = 1;
+
+/*
+ * human-readable names of the statistics counters, one per slot.
+ * keep in sync with dat.h:/NStat
+ */
+Statdesc statdesc[NStat] =
+{
+	{ "rpc total", },
+	{ "rpc reads", },
+	{ "rpc reads ok", },
+	{ "rpc reads failed", },
+	{ "rpc read bytes", },
+	{ "rpc read time", },
+	{ "rpc read cached", },
+	{ "rpc read cached time", },
+	{ "rpc read uncached", },
+	{ "rpc read uncached time", },
+
+	{ "rpc writes", },
+	{ "rpc writes new", },
+	{ "rpc writes old", },
+	{ "rpc writes failed", },
+	{ "rpc write bytes", },
+	{ "rpc write time", },
+	{ "rpc write new time", },
+	{ "rpc write old time", },
+
+	{ "lump cache hits", },
+	{ "lump cache misses", },
+	{ "lump cache reads", },
+	{ "lump cache writes", },
+	{ "lump cache size", },
+	{ "lump cache stall", },
+	{ "lump cache read time", },
+
+	{ "disk cache hits", },
+	{ "disk cache misses", },
+	{ "disk cache lookups", },
+	{ "disk cache reads", },
+	{ "disk cache writes", },
+	{ "disk cache dirty", },
+	{ "disk cache size", },
+	{ "disk cache flushes", },
+	{ "disk cache stalls", },
+	{ "disk cache lookup time", },
+
+	{ "disk block stalls", },
+	{ "lump stalls", },
+
+	{ "index cache hits", },
+	{ "index cache misses", },
+	{ "index cache reads", },
+	{ "index cache writes", },
+	{ "index cache fills", },
+	{ "index cache prefetches", },
+	{ "index cache dirty", },
+	{ "index cache size", },
+	{ "index cache flushes", },
+	{ "index cache stalls", },
+	{ "index cache read time", },
+	{ "index cache lookups" },
+	{ "index cache summary hits" },
+	{ "index cache summary prefetches" },
+
+	{ "bloom filter hits", },
+	{ "bloom filter misses", },
+	{ "bloom filter false misses", },
+	{ "bloom filter lookups", },
+	{ "bloom filter ones", },
+	{ "bloom filter bits", },
+
+	{ "arena block reads", },
+	{ "arena block read bytes", },
+	{ "arena block writes", },
+	{ "arena block write bytes", },
+
+	{ "isect block reads", },
+	{ "isect block read bytes", },
+	{ "isect block writes", },
+	{ "isect block write bytes", },
+
+	{ "sum reads", },
+	{ "sum read bytes", },
+
+	{ "cig loads" },
+	{ "cig load time" },
+};
+
+QLock statslock;
+Stats stats;
+Stats *stathist;
+int nstathist;
+ulong statind;
+ulong stattime;
+
+/*
+ * sampling loop: once a second, stamp the live counters with the
+ * current time and copy them into the next slot of the circular
+ * history buffer.  never returns.
+ */
+void
+statsproc(void *v)
+{
+	ulong slot;
+
+	USED(v);
+
+	while(1){
+		stats.now = time(0);
+		slot = stattime % nstathist;
+		stathist[slot] = stats;
+		stattime++;
+		sleep(1000);
+	}
+}
+
+/*
+ * allocate the statistics history (90000 samples, taken one per
+ * second by statsproc) and start the sampling proc.
+ */
+void
+statsinit(void)
+{
+	int nhist;
+
+	nhist = 90000;
+	nstathist = nhist;
+	stathist = MKNZ(Stats, nstathist);
+	vtproc(statsproc, nil);
+}
+
+/*
+ * set counter `index' to the absolute value val, under statslock.
+ * unlike addstat, this does not look at collectstats.
+ */
+void
+setstat(int index, long val)
+{
+	QLock *l;
+
+	l = &statslock;
+	qlock(l);
+	stats.n[index] = val;
+	qunlock(l);
+}
+
+/*
+ * add inc to counter `index' under statslock;
+ * does nothing when statistics collection is turned off.
+ */
+void
+addstat(int index, int inc)
+{
+	if(collectstats){
+		qlock(&statslock);
+		stats.n[index] += inc;
+		qunlock(&statslock);
+	}
+}
+
+/*
+ * like addstat, but bumps two counters while holding
+ * statslock only once.
+ */
+void
+addstat2(int index, int inc, int index1, int inc1)
+{
+	if(collectstats){
+		qlock(&statslock);
+		stats.n[index] += inc;
+		stats.n[index1] += inc1;
+		qunlock(&statslock);
+	}
+}
+
+/*
+ * placeholder: intentionally does nothing.
+ */
+void
+printstats(void)
+{
+}
+
+/*
+ * summarize the recorded statistics history between times t0 and t1
+ * into nbin bins.  fn is called on each pair of consecutive history
+ * samples and returns the value to be binned; each bin records the
+ * minimum, maximum, average and number of values that fell into it.
+ * t0/t1 <= 0 are taken relative to the newest sample time; if the
+ * resulting range is empty, a ten minute window ending at t1 is used.
+ * a bin that receives no samples is filled with a copy of the
+ * previous bin.
+ */
+void
+binstats(long (*fn)(Stats *s0, Stats *s1, void *arg), void *arg,
+ long t0, long t1, Statbin *bin, int nbin)
+{
+ long xt0, t, te, v;
+ int i, j, lo, hi, m;
+ vlong tot;
+ Statbin *b;
+
+ t = stats.now;
+
+ /* negative times mean relative to now. */
+ if(t0 <= 0)
+ t0 += t;
+ if(t1 <= 0)
+ t1 += t;
+ /* ten minute range if none given */
+ if(t1 <= t0)
+ t0 = t1 - 60*10;
+ if(0) fprint(2, "stats %ld-%ld\n", t0, t1);
+
+ /* binary search to find t0-1 or close */
+ /* indices are taken modulo nstathist; stattime is the next slot statsproc will write */
+ lo = stattime;
+ hi = stattime+nstathist;
+ while(lo+1 < hi){
+ m = (lo+hi)/2;
+ if(stathist[m%nstathist].now >= t0)
+ hi = m;
+ else
+ lo = m;
+ }
+ xt0 = stathist[lo%nstathist].now;
+ if(xt0 >= t1){
+ /* no samples */
+ memset(bin, 0, nbin*sizeof bin[0]);
+ return;
+ }
+
+ hi = stattime+nstathist;
+ j = lo+1;
+ for(i=0; i<nbin; i++){
+ /*
+  * te is the time limit for bin i.
+  * NOTE(review): computed with i rather than i+1, so bin 0 is
+  * limited to samples older than t0 — confirm this is intended.
+  */
+ te = t0 + (t1-t0)*i/nbin;
+ b = &bin[i];
+ memset(b, 0, sizeof *b);
+ tot = 0;
+ /* j is not reset between bins: each sample is consumed by at most one bin */
+ for(; j<hi && stathist[j%nstathist].now<te; j++){
+ v = fn(&stathist[(j-1)%nstathist], &stathist[j%nstathist], arg);
+ if(b->nsamp==0 || v < b->min)
+ b->min = v;
+ if(b->nsamp==0 || v > b->max)
+ b->max = v;
+ tot += v;
+ b->nsamp++;
+ }
+ if(b->nsamp)
+ b->avg = tot / b->nsamp;
+ /* empty bin: repeat the previous bin's values */
+ if(b->nsamp==0 && i>0)
+ *b = bin[i-1];
+ }
+}
diff --git a/sys/src/cmd/venti/srv/stdinc.h b/sys/src/cmd/venti/srv/stdinc.h
new file mode 100755
index 000000000..3fd06ccd7
--- /dev/null
+++ b/sys/src/cmd/venti/srv/stdinc.h
@@ -0,0 +1,9 @@
+#include <u.h>
+#include <libc.h>
+#include <venti.h>
+#include <flate.h>
+#include <libsec.h>
+#include <thread.h>
+#include <httpd.h>
+#include <draw.h>
+#include <memdraw.h>
diff --git a/sys/src/cmd/venti/srv/syncarena.c b/sys/src/cmd/venti/srv/syncarena.c
new file mode 100755
index 000000000..0e6cc2019
--- /dev/null
+++ b/sys/src/cmd/venti/srv/syncarena.c
@@ -0,0 +1,174 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+static int writeclumphead(Arena *arena, u64int aa, Clump *cl);
+static int writeclumpmagic(Arena *arena, u64int aa, u32int magic);
+
+/*
+ * compare two clump info entries field by field.
+ * returns 0 if type, size, uncsize and score all match, 1 otherwise
+ * (an equality test, not an ordering).
+ */
+int
+clumpinfocmp(ClumpInfo *c, ClumpInfo *d)
+{
+	if(c->type != d->type)
+		return 1;
+	if(c->size != d->size)
+		return 1;
+	if(c->uncsize != d->uncsize)
+		return 1;
+	if(scorecmp(c->score, d->score) != 0)
+		return 1;
+	return 0;
+}
+
+/*
+ * synchronize the clump info directory
+ * with the clumps actually stored in the arena.
+ * the directory should be at least as up to date
+ * as the arena's trailer.
+ *
+ * checks/updates at most n clumps, starting at the
+ * end of the data described by arena->memstats and
+ * advancing memstats past every clump accepted.
+ *
+ * zok suppresses the message for directory entries
+ * that were never written; fix enables writing
+ * corrections (clump headers, free magic, directory
+ * entries) back to the arena.
+ *
+ * returns 0 if ok, flags if error occurred
+ */
+int
+syncarena(Arena *arena, u32int n, int zok, int fix)
+{
+	ZBlock *lump;
+	Clump cl;
+	ClumpInfo ci;
+	static ClumpInfo zci = { .type = -1 };
+	u8int score[VtScoreSize];
+	u64int uncsize, used, aa;
+	u32int clump, clumps, cclumps, magic;
+	int err, flush, broken;
+
+	/* remember the starting statistics to detect a stale header below */
+	used = arena->memstats.used;
+	clumps = arena->memstats.clumps;
+	cclumps = arena->memstats.cclumps;
+	uncsize = arena->memstats.uncsize;
+	trace(TraceProc, "syncarena start");
+	flush = 0;
+	err = 0;
+	for(; n; n--){
+		aa = arena->memstats.used;
+		clump = arena->memstats.clumps;
+		magic = clumpmagic(arena, aa);
+		if(magic == ClumpFreeMagic)
+			break;
+		if(magic != arena->clumpmagic){
+			fprint(2, "%s: illegal clump magic number=%#8.8ux at clump=%d\n", arena->name, magic, clump);
+			/* err |= SyncDataErr; */
+			if(fix && writeclumpmagic(arena, aa, ClumpFreeMagic) < 0){
+				fprint(2, "%s: can't write corrected clump free magic: %r\n", arena->name);
+				err |= SyncFixErr;
+			}
+			break;
+		}
+
+		broken = 0;
+		lump = loadclump(arena, aa, 0, &cl, score, 0);
+		if(lump == nil){
+			fprint(2, "%s: clump=%d failed to read correctly: %r\n", arena->name, clump);
+			break;
+		}else if(cl.info.type != VtCorruptType){
+			scoremem(score, lump->data, cl.info.uncsize);
+			if(scorecmp(cl.info.score, score) != 0){
+				/* ignore partially written block */
+				if(cl.encoding == ClumpENone)
+					break;
+				fprint(2, "%s: clump=%d has mismatched score\n", arena->name, clump);
+				err |= SyncDataErr;
+				broken = 1;
+			}else if(!vttypevalid(cl.info.type)){
+				/* vttypevalid returns a boolean, not -1, so test for false */
+				fprint(2, "%s: clump=%d has invalid type %d\n", arena->name, clump, cl.info.type);
+				err |= SyncDataErr;
+				broken = 1;
+			}
+			if(broken && fix){
+				cl.info.type = VtCorruptType;
+				if(writeclumphead(arena, aa, &cl) < 0){
+					fprint(2, "%s: can't write corrected clump header: %r\n", arena->name);
+					err |= SyncFixErr;
+				}
+			}
+		}
+		freezblock(lump);
+		arena->memstats.used += ClumpSize + cl.info.size;
+
+		arena->memstats.clumps++;
+		/* now check the directory entry against the clump header */
+		if(!broken && readclumpinfo(arena, clump, &ci)<0){
+			fprint(2, "%s: arena directory read failed\n", arena->name);
+			broken = 1;
+		}else if(!broken && clumpinfocmp(&ci, &cl.info)!=0){
+			if(clumpinfocmp(&ci, &zci) == 0){
+				err |= SyncCIZero;
+				if(!zok)
+					fprint(2, "%s: unwritten clump info for clump=%d\n", arena->name, clump);
+			}else{
+				err |= SyncCIErr;
+				fprint(2, "%s: bad clump info for clump=%d\n", arena->name, clump);
+				fprint(2, "\texpected score=%V type=%d size=%d uncsize=%d\n",
+					cl.info.score, cl.info.type, cl.info.size, cl.info.uncsize);
+				fprint(2, "\tfound score=%V type=%d size=%d uncsize=%d\n",
+					ci.score, ci.type, ci.size, ci.uncsize);
+			}
+			broken = 1;
+		}
+		if(broken && fix){
+			flush = 1;
+			ci = cl.info;
+			if(writeclumpinfo(arena, clump, &ci) < 0){
+				fprint(2, "%s: can't write correct clump directory: %r\n", arena->name);
+				err |= SyncFixErr;
+			}
+		}
+		trace(TraceProc, "syncarena unindexed clump %V %d", cl.info.score, arena->memstats.clumps);
+
+		arena->memstats.uncsize += cl.info.uncsize;
+		if(cl.info.size < cl.info.uncsize)
+			arena->memstats.cclumps++;
+	}
+
+	if(flush){
+		trace(TraceProc, "syncarena flush");
+		arena->wtime = now();
+		if(arena->ctime == 0 && arena->memstats.clumps)
+			arena->ctime = arena->wtime;
+		flushdcache();
+	}
+
+	/* any change in the statistics means the header was behind the data */
+	if(used != arena->memstats.used
+	|| clumps != arena->memstats.clumps
+	|| cclumps != arena->memstats.cclumps
+	|| uncsize != arena->memstats.uncsize){
+		err |= SyncHeader;
+		fprint(2, "arena %s: fix=%d flush=%d %lld->%lld %ud->%ud %ud->%ud %lld->%lld\n",
+			arena->name,
+			fix,
+			flush,
+			used, arena->memstats.used,
+			clumps, arena->memstats.clumps,
+			cclumps, arena->memstats.cclumps,
+			uncsize, arena->memstats.uncsize);
+	}
+
+	return err;
+}
+
+/*
+ * pack the clump header cl and write it to the arena at address aa.
+ * returns 0 on success, -1 if allocation, packing or the write fails.
+ */
+static int
+writeclumphead(Arena *arena, u64int aa, Clump *cl)
+{
+	ZBlock *zb;
+	int ret;
+
+	zb = alloczblock(ClumpSize, 0, arena->blocksize);
+	if(zb == nil)
+		return -1;
+
+	ret = 0;
+	if(packclump(cl, zb->data, arena->clumpmagic) < 0)
+		ret = -1;
+	else if(writearena(arena, aa, zb->data, ClumpSize) != ClumpSize)
+		ret = -1;
+	freezblock(zb);
+	return ret;
+}
+
+/*
+ * overwrite the magic number of the clump at arena address aa.
+ * returns 0 on success, -1 if the write fails, matching
+ * writeclumphead and the `< 0' test made by the caller.
+ */
+static int
+writeclumpmagic(Arena *arena, u64int aa, u32int magic)
+{
+	u8int buf[U32Size];
+
+	packmagic(magic, buf);
+	if(writearena(arena, aa, buf, U32Size) != U32Size)
+		return -1;
+	return 0;
+}
diff --git a/sys/src/cmd/venti/srv/syncindex.c b/sys/src/cmd/venti/srv/syncindex.c
new file mode 100755
index 000000000..6bf996ae4
--- /dev/null
+++ b/sys/src/cmd/venti/srv/syncindex.c
@@ -0,0 +1,64 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+static int verbose;
+/*
+ * print the command synopsis (including the -I option accepted
+ * by threadmain) and exit with status "usage".
+ */
+void
+usage(void)
+{
+	fprint(2, "usage: syncindex [-v] [-B blockcachesize] [-I icachesize] config\n");
+	threadexitsall("usage");
+}
+
+Config conf;
+
+/*
+ * syncindex command: load the venti configuration named on the
+ * command line, bring up the caches, and run syncindex over the
+ * main index, flushing the index and disk caches before exiting.
+ */
+void
+threadmain(int argc, char *argv[])
+{
+ u32int bcmem, icmem;
+
+ bcmem = 0;
+ icmem = 0;
+ ARGBEGIN{
+ case 'B':
+ /* disk block cache size; the u64int result is narrowed to u32int */
+ bcmem = unittoull(EARGF(usage()));
+ break;
+ case 'I':
+ /* index cache size; narrowed to u32int in the same way */
+ icmem = unittoull(EARGF(usage()));
+ break;
+ case 'v':
+ verbose++;
+ break;
+ default:
+ usage();
+ break;
+ }ARGEND
+
+ /* exactly one argument: the configuration file */
+ if(argc != 1)
+ usage();
+
+ ventifmtinstall();
+ if(initventi(argv[0], &conf) < 0)
+ sysfatal("can't init venti: %r");
+ if(mainindex->bloom && loadbloom(mainindex->bloom) < 0)
+ sysfatal("can't load bloom filter: %r");
+
+ /* raise the block cache to the minimum derived from the arena and index section counts */
+ if(bcmem < maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16))
+ bcmem = maxblocksize * (mainindex->narenas + mainindex->nsects * 4 + 16);
+ if(0) fprint(2, "initialize %d bytes of disk block cache\n", bcmem);
+ initdcache(bcmem);
+ initlumpcache(1*1024*1024, 1024/8);
+ initicache(icmem);
+ initicachewrite();
+ if(mainindex->bloom)
+ startbloomproc(mainindex->bloom);
+
+ if(verbose)
+ printindex(2, mainindex);
+ if(syncindex(mainindex) < 0)
+ sysfatal("failed to sync index=%s: %r", mainindex->name);
+ flushicache();
+ flushdcache();
+
+ threadexitsall(0);
+}
diff --git a/sys/src/cmd/venti/srv/syncindex0.c b/sys/src/cmd/venti/srv/syncindex0.c
new file mode 100755
index 000000000..be3a2ea06
--- /dev/null
+++ b/sys/src/cmd/venti/srv/syncindex0.c
@@ -0,0 +1,93 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * insert into the index every clump the arena holds in memory
+ * (memstats) beyond what its on-disk statistics (diskstats) cover.
+ * a0 is the index address at which the arena starts.
+ * returns 0 on success, -1 if a directory entry cannot be read.
+ */
+static int
+syncarenaindex(Arena *arena, u64int a0)
+{
+	int ret;
+	u32int n;
+	u64int addr;
+	ClumpInfo ci;
+	IAddr ia;
+	AState as;
+
+	if(arena->memstats.clumps == arena->diskstats.clumps)
+		return 0;
+
+	/* arena state handed to insertscore, advanced with each clump */
+	memset(&as, 0, sizeof as);
+	as.arena = arena;
+	as.stats = arena->diskstats;
+
+	ret = 0;
+	addr = a0 + arena->diskstats.used;
+	n = arena->diskstats.clumps;
+	while(n < arena->memstats.clumps){
+		if(readclumpinfo(arena, n, &ci) < 0){
+			fprint(2, "%s: clump %d: cannot read clumpinfo\n",
+				arena->name, n);
+			ret = -1;
+			break;
+		}
+
+		ia.type = ci.type;
+		ia.size = ci.uncsize;
+		ia.addr = addr;
+		/* clump header plus data, rounded up to whole arena blocks */
+		ia.blocks = (ClumpSize + ci.size + (1 << ABlockLog) - 1) >> ABlockLog;
+		addr += ClumpSize + ci.size;
+		as.aa = addr;
+
+		as.stats.used += ClumpSize + ci.size;
+		as.stats.uncsize += ia.size;
+		as.stats.clumps++;
+		if(ci.size < ci.uncsize)
+			as.stats.cclumps++;
+		insertscore(ci.score, &ia, IEDirty, &as);
+		n++;
+	}
+	flushdcache();
+	return ret;
+}
+
+/*
+ * bring the index up to date with every arena: repair each
+ * arena's clump directory with syncarena, index the clumps not
+ * yet covered by the on-disk statistics, and write the arena back.
+ * an arena that fails is reported and skipped.
+ * returns 0 if all arenas were handled, -1 if any failed.
+ */
+int
+syncindex(Index *ix)
+{
+	Arena *arena;
+	int i, flags, fatal, ret;
+
+	ret = 0;
+	for(i = 0; i < ix->narenas; i++){
+		trace(TraceProc, "syncindex start %d", i);
+		arena = ix->arenas[i];
+		flags = syncarena(arena, TWID32, 1, 1);
+		if(flags & SyncHeader)
+			fprint(2, "arena %s: header is out-of-date\n", arena->name);
+
+		/* header and clump-info discrepancies are not treated as failures */
+		fatal = flags & ~(SyncHeader|SyncCIZero|SyncCIErr);
+		if(fatal){
+			fprint(2, "arena %s: %x\n", arena->name, fatal);
+			ret = -1;
+			continue;
+		}
+		flushdcache();
+
+		if(arena->memstats.clumps == arena->diskstats.clumps)
+			continue;
+
+		fprint(2, "%T %s: indexing %d clumps...\n",
+			arena->name,
+			arena->memstats.clumps - arena->diskstats.clumps);
+
+		if(syncarenaindex(arena, ix->amap[i].start) < 0){
+			fprint(2, "arena %s: syncarenaindex: %r\n", arena->name);
+			ret = -1;
+			continue;
+		}
+		if(wbarena(arena) < 0){
+			fprint(2, "arena %s: wbarena: %r\n", arena->name);
+			ret = -1;
+			continue;
+		}
+		flushdcache();
+		delaykickicache();
+	}
+	return ret;
+}
diff --git a/sys/src/cmd/venti/srv/trace.c b/sys/src/cmd/venti/srv/trace.c
new file mode 100755
index 000000000..3c0169557
--- /dev/null
+++ b/sys/src/cmd/venti/srv/trace.c
@@ -0,0 +1,39 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+char TraceDisk[] = "disk";
+char TraceLump[] = "lump";
+char TraceBlock[] = "block";
+char TraceProc[] = "proc";
+char TraceWork[] = "work";
+char TraceQuiet[] = "quiet";
+char TraceRpc[] = "rpc";
+
+/*
+ * format a message and append it, as an HTML fragment, both to
+ * the log named by level and to the "all" log.
+ * does nothing if level is nil or logging is disabled.
+ */
+void
+trace(char *level, char *fmt, ...)
+{
+	char msg[512];
+	va_list arg;
+
+	if(level != nil && ventilogging){
+		va_start(arg, fmt);
+		vsnprint(msg, sizeof msg, fmt, arg);
+		va_end(arg);
+		vtlog(level, "<font size=-1>%T %s:</font> %s<br>\n",
+			threadgetname(), msg);
+		vtlog("all", "<font size=-1>%T <font color=#777777>%s</font> %s:</font> %s<br>\n",
+			level, threadgetname(), msg);
+	}
+}
+
+/*
+ * placeholder: no initialization is performed.
+ */
+void
+traceinit(void)
+{
+}
+
+/*
+ * placeholder: the requested trace name is accepted and ignored.
+ */
+void
+settrace(char *trace)
+{
+ USED(trace);
+}
diff --git a/sys/src/cmd/venti/srv/unittoull.c b/sys/src/cmd/venti/srv/unittoull.c
new file mode 100755
index 000000000..1f7411702
--- /dev/null
+++ b/sys/src/cmd/venti/srv/unittoull.c
@@ -0,0 +1,30 @@
+#include "stdinc.h"
+
+#define TWID64 ((u64int)~(u64int)0)
+
+/*
+ * convert a size such as "512", "0x200", "64k", "8M", "2g" or "1T"
+ * to a byte count.  the number is parsed by strtoull with base 0
+ * and may be followed by a single k, m, g or t suffix (either
+ * case) multiplying it by a power of 1024.
+ * returns TWID64 if s is nil or anything follows the number and
+ * its optional suffix.
+ */
+u64int
+unittoull(char *s)
+{
+	char *es;
+	u64int n;
+
+	if(s == nil)
+		return TWID64;
+	/* parse the full 64-bit range; strtoul would stop at ulong */
+	n = strtoull(s, &es, 0);
+	switch(*es){
+	case 'k':
+	case 'K':
+		n *= 1024;
+		es++;
+		break;
+	case 'm':
+	case 'M':
+		n *= 1024*1024;
+		es++;
+		break;
+	case 'g':
+	case 'G':
+		n *= 1024*1024*1024;
+		es++;
+		break;
+	case 't':
+	case 'T':
+		/* two steps: 1024^4 does not fit in an int constant */
+		n *= 1024*1024;
+		n *= 1024*1024;
+		/* consume the suffix so the trailing-garbage check accepts it */
+		es++;
+		break;
+	}
+	if(*es != '\0')
+		return TWID64;
+	return n;
+}
diff --git a/sys/src/cmd/venti/srv/unwhack.c b/sys/src/cmd/venti/srv/unwhack.c
new file mode 100755
index 000000000..5530bd07d
--- /dev/null
+++ b/sys/src/cmd/venti/srv/unwhack.c
@@ -0,0 +1,179 @@
+#include "stdinc.h"
+#include "whack.h"
+
+/* constants for decoding match lengths in unwhack */
+enum
+{
+ DMaxFastLen = 7,
+ DBigLenCode = 0x3c, /* minimum code for large length encoding */
+ DBigLenBits = 6,
+ DBigLenBase = 1 /* starting items to encode for big lens */
+};
+
+/*
+ * indexed by the next 5 bits of input: 0 means a literal follows,
+ * 255 means a big length code follows, anything else is the match
+ * length itself.
+ */
+static uchar lenval[1 << (DBigLenBits - 1)] =
+{
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 4, 4, 4, 4,
+ 5,
+ 6,
+ 255,
+ 255
+};
+
+/* number of input bits consumed by the code for each short match length */
+static uchar lenbits[] =
+{
+ 0, 0, 0,
+ 2, 3, 5, 5,
+};
+
+/* indexed by the 4-bit offset code: number of extra offset bits that follow */
+static uchar offbits[16] =
+{
+ 5, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 12, 13
+};
+
+/* indexed by the 4-bit offset code: base value the extra offset bits are or'ed into */
+static ushort offbase[16] =
+{
+ 0, 0x20,
+ 0x40, 0x60,
+ 0x80, 0xc0,
+ 0x100, 0x180,
+ 0x200, 0x300,
+ 0x400, 0x600,
+ 0x800, 0xc00,
+ 0x1000,
+ 0x2000
+};
+
+/*
+ * prepare a decompressor state: clear its error string.
+ */
+void
+unwhackinit(Unwhack *uw)
+{
+	char *e;
+
+	e = uw->err;
+	e[0] = '\0';
+}
+
+/*
+ * decompress nsrc bytes at src into dst, which has room for ndst bytes.
+ * the input is a bit stream of literals and (length, offset) copies
+ * from earlier output.
+ * returns the number of bytes produced, or -1 with a message in
+ * uw->err if the data is malformed or does not fit in dst.
+ */
+int
+unwhack(Unwhack *uw, uchar *dst, int ndst, uchar *src, int nsrc)
+{
+ uchar *s, *d, *dmax, *smax, lit;
+ ulong uwbits, lithist;
+ int i, off, len, bits, use, code, uwnbits, overbits;
+
+ d = dst;
+ dmax = d + ndst;
+
+ smax = src + nsrc;
+ /* uwbits holds uwnbits unread bits; overbits counts padding bits shifted in past the end of src */
+ uwnbits = 0;
+ uwbits = 0;
+ overbits = 0;
+ lithist = ~0;
+ while(src < smax || uwnbits - overbits >= MinDecode){
+ /* refill the bit buffer a byte at a time, padding with zeros once src runs out */
+ while(uwnbits <= 24){
+ uwbits <<= 8;
+ if(src < smax)
+ uwbits |= *src++;
+ else
+ overbits += 8;
+ uwnbits += 8;
+ }
+
+ /*
+ * literal
+ */
+ len = lenval[(uwbits >> (uwnbits - 5)) & 0x1f];
+ if(len == 0){
+ /* lithist records which recent literals were < 32 or > 127; it selects the literal coding */
+ if(lithist & 0xf){
+ uwnbits -= 9;
+ lit = (uwbits >> uwnbits) & 0xff;
+ lit &= 255;
+ }else{
+ uwnbits -= 8;
+ lit = (uwbits >> uwnbits) & 0x7f;
+ if(lit < 32){
+ /* short codes take 2 or 3 more bits and are rebased by 64 */
+ if(lit < 24){
+ uwnbits -= 2;
+ lit = (lit << 2) | ((uwbits >> uwnbits) & 3);
+ }else{
+ uwnbits -= 3;
+ lit = (lit << 3) | ((uwbits >> uwnbits) & 7);
+ }
+ lit = (lit - 64) & 0xff;
+ }
+ }
+ if(d >= dmax){
+ snprint(uw->err, WhackErrLen, "too much output");
+ return -1;
+ }
+ *d++ = lit;
+ lithist = (lithist << 1) | (lit < 32) | (lit > 127);
+ continue;
+ }
+
+ /*
+ * length
+ */
+ if(len < 255)
+ uwnbits -= lenbits[len];
+ else{
+ /* big length: a DBigLenBits code, extended one bit at a time while it is >= use */
+ uwnbits -= DBigLenBits;
+ code = ((uwbits >> uwnbits) & ((1 << DBigLenBits) - 1)) - DBigLenCode;
+ len = DMaxFastLen;
+ use = DBigLenBase;
+ bits = (DBigLenBits & 1) ^ 1;
+ while(code >= use){
+ len += use;
+ code -= use;
+ code <<= 1;
+ uwnbits--;
+ if(uwnbits < 0){
+ snprint(uw->err, WhackErrLen, "len out of range");
+ return -1;
+ }
+ code |= (uwbits >> uwnbits) & 1;
+ use <<= bits;
+ bits ^= 1;
+ }
+ len += code;
+
+ /* refill before reading the offset */
+ while(uwnbits <= 24){
+ uwbits <<= 8;
+ if(src < smax)
+ uwbits |= *src++;
+ else
+ overbits += 8;
+ uwnbits += 8;
+ }
+ }
+
+ /*
+ * offset
+ */
+ /* a 4-bit code selects the base and how many extra bits follow */
+ uwnbits -= 4;
+ bits = (uwbits >> uwnbits) & 0xf;
+ off = offbase[bits];
+ bits = offbits[bits];
+
+ uwnbits -= bits;
+ off |= (uwbits >> uwnbits) & ((1 << bits) - 1);
+ off++;
+
+ /* the copy source must lie within the output produced so far */
+ if(off > d - dst){
+ snprint(uw->err, WhackErrLen, "offset out of range: off=%d d=%ld len=%d nbits=%d", off, d - dst, len, uwnbits);
+ return -1;
+ }
+ if(d + len > dmax){
+ snprint(uw->err, WhackErrLen, "len out of range");
+ return -1;
+ }
+ s = d - off;
+ /* byte-by-byte forward copy: source and destination may overlap */
+ for(i = 0; i < len; i++)
+ d[i] = s[i];
+ d += len;
+ }
+ /* fewer bits left than padding added means real data was read past the end of src */
+ if(uwnbits < overbits){
+ snprint(uw->err, WhackErrLen, "compressed data overrun");
+ return -1;
+ }
+
+ len = d - dst;
+
+ return len;
+}
diff --git a/sys/src/cmd/venti/srv/utils.c b/sys/src/cmd/venti/srv/utils.c
new file mode 100755
index 000000000..d810c53d8
--- /dev/null
+++ b/sys/src/cmd/venti/srv/utils.c
@@ -0,0 +1,259 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * compare two names, looking at no more than ANameSize bytes;
+ * result is as for strncmp.
+ */
+int
+namecmp(char *s, char *t)
+{
+	int r;
+
+	r = strncmp(s, t, ANameSize);
+	return r;
+}
+
+/*
+ * copy src into the ANameSize-byte buffer dst, truncating if
+ * necessary; dst is always NUL-terminated.
+ */
+void
+namecp(char *dst, char *src)
+{
+	enum { Last = ANameSize - 1 };
+
+	strncpy(dst, src, Last);
+	dst[Last] = '\0';
+}
+
+/*
+ * check a name: it must be non-nil, shorter than ANameSize + 1
+ * bytes, and made only of characters from ' ' up to but not
+ * including 0x7f.
+ * returns 0 if acceptable, -1 otherwise.
+ */
+int
+nameok(char *name)
+{
+	char *p;
+	int c;
+
+	if(name == nil)
+		return -1;
+	p = name;
+	while((c = *p) != 0){
+		if(p - name >= ANameSize)
+			return -1;
+		if(c < ' ' || c >= 0x7f)
+			return -1;
+		p++;
+	}
+	return 0;
+}
+
+/*
+ * parse s as an unsigned decimal number into *r.
+ * returns 1 if s is a non-empty string of digits, 0 if it is
+ * empty or has trailing non-digits (*r is still set to the
+ * leading digits' value), and -1 without touching *r if the
+ * value overflows 32 bits.
+ */
+int
+stru32int(char *s, u32int *r)
+{
+	char *p;
+	u32int acc, next, limit;
+
+	limit = TWID32 / 10;
+	acc = 0;
+	for(p = s; *p >= '0' && *p <= '9'; p++){
+		/* acc*10 would already overflow */
+		if(acc > limit)
+			return -1;
+		next = acc * 10 + *p - '0';
+		/* adding the digit wrapped around */
+		if(next < acc)
+			return -1;
+		acc = next;
+	}
+	*r = acc;
+	return p != s && *p == '\0';
+}
+
+/*
+ * parse s as an unsigned decimal number into *r.
+ * returns 1 if s is a non-empty string of digits, 0 if it is
+ * empty or has trailing non-digits (*r is still set to the
+ * leading digits' value), and -1 without touching *r if the
+ * value overflows 64 bits.
+ */
+int
+stru64int(char *s, u64int *r)
+{
+	char *p;
+	u64int acc, next, limit;
+
+	limit = TWID64 / 10;
+	acc = 0;
+	for(p = s; *p >= '0' && *p <= '9'; p++){
+		/* acc*10 would already overflow */
+		if(acc > limit)
+			return -1;
+		next = acc * 10 + *p - '0';
+		/* adding the digit wrapped around */
+		if(next < acc)
+			return -1;
+		acc = next;
+	}
+	*r = acc;
+	return p != s && *p == '\0';
+}
+
+/*
+ * report whether type is below VtMaxType.
+ * the result is a boolean (1 valid, 0 invalid), never negative.
+ */
+int
+vttypevalid(int type)
+{
+	if(type < VtMaxType)
+		return 1;
+	return 0;
+}
+
+/*
+ * format an error message and, unless severity is EOk, print it
+ * on standard error, prefixed with the program name when argv0
+ * is set.
+ * returns the formatted message, which the caller must free,
+ * or nil if it could not be allocated.
+ */
+static char*
+logit(int severity, char *fmt, va_list args)
+{
+	char *s;
+
+	s = vsmprint(fmt, args);
+	if(s == nil)
+		return nil;
+	if(severity != EOk){
+		/* only name the program when we actually know its name */
+		if(argv0 != nil)
+			fprint(2, "%T %s: err %d: %s\n", argv0, severity, s);
+		else
+			fprint(2, "%T err %d: %s\n", severity, s);
+	}
+	return s;
+}
+
+/*
+ * log a formatted error (see logit) and install it as the
+ * current error string.
+ */
+void
+seterr(int severity, char *fmt, ...)
+{
+	char *msg;
+	va_list args;
+
+	va_start(args, fmt);
+	msg = logit(severity, fmt, args);
+	va_end(args);
+	if(msg != nil){
+		werrstr("%s", msg);
+		free(msg);
+		return;
+	}
+	werrstr("error setting error");
+}
+
+/*
+ * log a formatted error (see logit) without changing the
+ * current error string.
+ */
+void
+logerr(int severity, char *fmt, ...)
+{
+	char *msg;
+	va_list args;
+
+	va_start(args, fmt);
+	msg = logit(severity, fmt, args);
+	va_end(args);
+	/* only the logging side effect is wanted */
+	free(msg);
+}
+
+/*
+ * current time as returned by time(), narrowed to 32 bits.
+ */
+u32int
+now(void)
+{
+	u32int t;
+
+	t = time(nil);
+	return t;
+}
+
+int abortonmem = 1;
+
+/*
+ * allocate n bytes or die: on failure abort() if abortonmem is
+ * set, otherwise sysfatal.  the memory is filled with 0xa5 and
+ * tagged with the caller's pc.
+ */
+void *
+emalloc(ulong n)
+{
+	void *p;
+
+	if((p = malloc(n)) == nil){
+		if(abortonmem)
+			abort();
+		sysfatal("out of memory allocating %lud", n);
+	}
+	/* fill with a recognizable non-zero pattern */
+	memset(p, 0xa5, n);
+	setmalloctag(p, getcallerpc(&n));
+if(0)print("emalloc %p-%p by %#p\n", p, (char*)p+n, getcallerpc(&n));
+	return p;
+}
+
+/*
+ * like emalloc, but the memory returned is zeroed.
+ */
+void *
+ezmalloc(ulong n)
+{
+	void *p;
+
+	if((p = malloc(n)) == nil){
+		if(abortonmem)
+			abort();
+		sysfatal("out of memory allocating %lud", n);
+	}
+	memset(p, 0, n);
+	setmalloctag(p, getcallerpc(&n));
+if(0)print("ezmalloc %p-%p by %#p\n", p, (char*)p+n, getcallerpc(&n));
+	return p;
+}
+
+/*
+ * resize p to n bytes or die: a nil result from realloc leads to
+ * abort() if abortonmem is set, otherwise sysfatal.
+ * the block is tagged with the caller's pc.
+ */
+void *
+erealloc(void *p, ulong n)
+{
+	if((p = realloc(p, n)) == nil){
+		if(abortonmem)
+			abort();
+		sysfatal("out of memory allocating %lud", n);
+	}
+	setrealloctag(p, getcallerpc(&p));
+if(0)print("erealloc %p-%p by %#p\n", p, (char*)p+n, getcallerpc(&p));
+	return p;
+}
+
+/*
+ * return a copy of s in memory obtained from emalloc,
+ * tagged with the caller's pc.
+ */
+char *
+estrdup(char *s)
+{
+	char *t;
+	int n;
+
+	/* length including the terminating NUL */
+	n = strlen(s) + 1;
+	t = emalloc(n);
+	memmove(t, s, n);
+	setmalloctag(t, getcallerpc(&s));
+if(0)print("estrdup %p-%p by %#p\n", t, (char*)t+n, getcallerpc(&s));
+	return t;
+}
+
+/*
+ * return floor(log2(v)), i.e. the position of v's highest
+ * set bit; yields 0 for v == 0 as well as v == 1.
+ */
+int
+u64log2(u64int v)
+{
+	int i;
+
+	i = 0;
+	while(i < 64 && (v >> i) > 1)
+		i++;
+	return i;
+}
+
+/*
+ * start fn(arg) in a new proc with a 256k stack.
+ * always returns 0.
+ */
+int
+vtproc(void (*fn)(void*), void *arg)
+{
+	enum { Stack = 256*1024 };
+
+	proccreate(fn, arg, Stack);
+	return 0;
+}
+
+/*
+ * print format routine for an IEntry* argument (installed as %I):
+ * prints the score, address, type, size and block count.
+ */
+int
+ientryfmt(Fmt *fmt)
+{
+	IEntry *e;
+
+	e = va_arg(fmt->args, IEntry*);
+	return fmtprint(fmt, "%V %22lld %3d %5d %3d",
+		e->score,
+		e->ia.addr,
+		e->ia.type,
+		e->ia.size,
+		e->ia.blocks);
+}
+
+/*
+ * install the print verbs used throughout venti.
+ */
+void
+ventifmtinstall(void)
+{
+	fmtinstall('F', vtfcallfmt);	/* venti protocol message */
+	fmtinstall('H', encodefmt);
+	fmtinstall('I', ientryfmt);	/* index entry */
+	fmtinstall('T', vttimefmt);	/* time stamp */
+	fmtinstall('V', vtscorefmt);	/* score */
+}
+
+/*
+ * nsec() scaled from nanoseconds to milliseconds and
+ * narrowed to uint.
+ */
+uint
+msec(void)
+{
+	enum { NsPerMs = 1000000 };
+
+	return nsec()/NsPerMs;
+}
+
+/*
+ * count the one bits in the low 32 bits of n by summing
+ * neighbouring fields of 1, 2, 4, 8 and 16 bits in turn.
+ */
+uint
+countbits(uint n)
+{
+	static uint mask[] = {
+		0x55555555,
+		0x33333333,
+		0x0F0F0F0F,
+		0x00FF00FF,
+		0x0000FFFF,
+	};
+	int i;
+
+	/* step i adds fields of width 1<<i pairwise */
+	for(i = 0; i < nelem(mask); i++)
+		n = (n & mask[i]) + ((n >> (1 << i)) & mask[i]);
+	return n;
+}
diff --git a/sys/src/cmd/venti/srv/venti.c b/sys/src/cmd/venti/srv/venti.c
new file mode 100755
index 000000000..1cf67a1c4
--- /dev/null
+++ b/sys/src/cmd/venti/srv/venti.c
@@ -0,0 +1,428 @@
+#ifdef PLAN9PORT
+#include <u.h>
+#include <signal.h>
+#endif
+#include "stdinc.h"
+#include <bio.h>
+#include "dat.h"
+#include "fns.h"
+
+#include "whack.h"
+
+/* memory sizing parameters gathered from options, configuration and auto-sizing */
+typedef struct Allocs Allocs;
+struct Allocs {
+ u32int mem; /* lump cache size; may be Unspecified */
+ u32int bcmem; /* disk block cache size */
+ u32int icmem; /* index cache size */
+ u32int stfree; /* free memory at start */
+ uint mempcnt; /* percentage given with -m; 0 if not given */
+};
+
+int debug;
+int nofork;
+int mainstacksize = 256*1024;
+VtSrv *ventisrv;
+
+static void ventiserver(void*);
+
+/*
+ * estimate free user memory in bytes from #c/swap, using its
+ * "pagesize" line and the used/total page counts on its "user"
+ * line.  if the file cannot be opened or lacks usable values,
+ * 64MB is assumed.  the result is capped at 3840MB.
+ */
+static ulong
+freemem(void)
+{
+	int nfld, pagesz;
+	uvlong avail, total, inuse;
+	char *line, *slash;
+	char *fld[2];
+	Biobuf *b;
+
+	pagesz = 0;
+	total = 0;
+	inuse = 0;
+	avail = 64*1024*1024;
+	b = Bopen("#c/swap", OREAD);
+	if (b != nil) {
+		while ((line = Brdline(b, '\n')) != nil) {
+			/* replace the newline so the line can be tokenized */
+			line[Blinelen(b)-1] = '\0';
+			nfld = tokenize(line, fld, nelem(fld));
+			if (nfld != 2)
+				continue;
+			if (strcmp(fld[1], "pagesize") == 0) {
+				pagesz = atoi(fld[0]);
+				continue;
+			}
+			if (strcmp(fld[1], "user") != 0)
+				continue;
+			/* value has the form used/total */
+			slash = strchr(fld[0], '/');
+			if (slash == nil)
+				continue;
+			total = atoll(slash+1);
+			inuse = atoll(fld[0]);
+		}
+		Bterm(b);
+		if (pagesz > 0 && total > 0 && inuse > 0)
+			avail = (total - inuse) * pagesz;
+	}
+	/* cap it to keep the size within 32 bits */
+	if (avail >= 3840UL * 1024 * 1024)
+		avail = 3840UL * 1024 * 1024;
+	return avail;
+}
+
+/*
+ * enforce minima for sanity: at least 6MB of index cache,
+ * 1MB of lump cache (also used when unspecified) and
+ * 2MB of block cache.
+ */
+static void
+allocminima(Allocs *all)
+{
+	enum { MB = 1024 * 1024 };
+
+	if (all->icmem < 6 * MB)
+		all->icmem = 6 * MB;
+	if (all->mem < MB || all->mem == Unspecified)	/* lumps */
+		all->mem = MB;
+	if (all->bcmem < 2 * MB)
+		all->bcmem = 2 * MB;
+}
+
+/*
+ * automatic memory allocations sizing per venti(8) guidelines.
+ *
+ * mempcnt percent of stfree (free memory at startup) is the budget.
+ * the memory consumed since startup (taken to be the bloom filter)
+ * is subtracted, and the remainder is split: half to the index
+ * cache, a sixth to the lump cache and a third to the block cache.
+ * if too little remains, the sizes are left for allocminima to
+ * raise to the minimum values.
+ */
+static Allocs
+allocbypcnt(u32int mempcnt, u32int stfree)
+{
+	u32int avail;
+	vlong blmsize;
+	Allocs all;
+	static u32int curfree;	/* free memory now; sampled on first call only */
+
+	all.mem = Unspecified;
+	all.bcmem = all.icmem = 0;
+	all.mempcnt = mempcnt;
+	all.stfree = stfree;
+
+	if (curfree == 0)
+		curfree = freemem();
+	/*
+	 * subtract in signed 64 bits: in u32int arithmetic the
+	 * difference would wrap to a huge value instead of going
+	 * negative when more memory is free now than at startup.
+	 */
+	blmsize = (vlong)stfree - curfree;
+	if (blmsize < 0)
+		blmsize = 0;
+	avail = ((vlong)stfree * mempcnt) / 100;
+	if (blmsize >= avail || (avail -= blmsize) <= (1 + 2 + 6) * 1024 * 1024)
+		fprint(2, "%s: bloom filter bigger than mem pcnt; "
+			"resorting to minimum values (9MB total)\n", argv0);
+	else {
+		if (avail >= 3840UL * 1024 * 1024)
+			avail = 3840UL * 1024 * 1024;	/* sanity */
+		avail /= 2;
+		all.icmem = avail;
+		avail /= 3;
+		all.mem = avail;
+		all.bcmem = 2 * avail;
+	}
+	return all;
+}
+
+/*
+ * decide the cache sizes.  defaults are computed from 20% of
+ * the free memory and can be overridden by, in increasing
+ * priority: configuration file parameters, command-line
+ * options other than -m, and -m (which recomputes everything).
+ * minima are enforced on the final result.
+ */
+static Allocs
+sizeallocs(Allocs opt, Config *cfg)
+{
+	Allocs res;
+
+	/* work out sane defaults */
+	res = allocbypcnt(20, opt.stfree);
+
+	/* config file parameters override */
+	if (cfg->mem != 0 && cfg->mem != Unspecified)
+		res.mem = cfg->mem;
+	if (cfg->bcmem != 0)
+		res.bcmem = cfg->bcmem;
+	if (cfg->icmem != 0)
+		res.icmem = cfg->icmem;
+
+	/* command-line options override */
+	if (opt.mem != 0 && opt.mem != Unspecified)
+		res.mem = opt.mem;
+	if (opt.bcmem != 0)
+		res.bcmem = opt.bcmem;
+	if (opt.icmem != 0)
+		res.icmem = opt.icmem;
+
+	/* automatic memory sizing? */
+	if (opt.mempcnt > 0)
+		res = allocbypcnt(opt.mempcnt, opt.stfree);
+
+	allocminima(&res);
+	return res;
+}
+
+/*
+ * print the command synopsis and exit with status "usage".
+ */
+void
+usage(void)
+{
+	fprint(2,
+		"usage: venti [-Ldrsw] [-a ventiaddr] [-c config] "
+		"[-h httpaddr] [-m %%mem] [-B blockcachesize] [-C cachesize] [-I icachesize] "
+		"[-W webroot]\n");
+	threadexitsall("usage");
+}
+
+/*
+ * venti server entry point: parse options, load the configuration
+ * and bloom filter, size and initialize the caches, sync the index,
+ * then announce the service and start serving requests.
+ * progress is reported on standard error as each stage starts.
+ */
+void
+threadmain(int argc, char *argv[])
+{
+ char *configfile, *haddr, *vaddr, *webroot;
+ u32int mem, icmem, bcmem, minbcmem, mempcnt, stfree;
+ Allocs allocs;
+ Config config;
+
+ traceinit();
+ threadsetname("main");
+ mempcnt = 0;
+ vaddr = nil;
+ haddr = nil;
+ configfile = nil;
+ webroot = nil;
+ mem = Unspecified;
+ icmem = 0;
+ bcmem = 0;
+ ARGBEGIN{
+ case 'a':
+ vaddr = EARGF(usage());
+ break;
+ case 'B':
+ bcmem = unittoull(EARGF(usage()));
+ break;
+ case 'c':
+ configfile = EARGF(usage());
+ break;
+ case 'C':
+ mem = unittoull(EARGF(usage()));
+ break;
+ case 'D':
+ settrace(EARGF(usage()));
+ break;
+ case 'd':
+ debug = 1;
+ nofork = 1;
+ break;
+ case 'h':
+ haddr = EARGF(usage());
+ break;
+ case 'm':
+ /* mempcnt is unsigned: a negative argument wraps and is caught by the >= 100 test */
+ mempcnt = atoi(EARGF(usage()));
+ if (mempcnt <= 0 || mempcnt >= 100)
+ usage();
+ break;
+ case 'I':
+ icmem = unittoull(EARGF(usage()));
+ break;
+ case 'L':
+ ventilogging = 1;
+ break;
+ case 'r':
+ readonly = 1;
+ break;
+ case 's':
+ nofork = 1;
+ break;
+ case 'w': /* compatibility with old venti */
+ queuewrites = 1;
+ break;
+ case 'W':
+ webroot = EARGF(usage());
+ break;
+ default:
+ usage();
+ }ARGEND
+
+ /* no positional arguments are accepted */
+ if(argc)
+ usage();
+
+ if(!nofork)
+ rfork(RFNOTEG);
+
+#ifdef PLAN9PORT
+ {
+ /* sigh - needed to avoid signals when writing to hungup networks */
+ struct sigaction sa;
+ memset(&sa, 0, sizeof sa);
+ sa.sa_handler = SIG_IGN;
+ sigaction(SIGPIPE, &sa, nil);
+ }
+#endif
+
+ ventifmtinstall();
+ trace(TraceQuiet, "venti started");
+ fprint(2, "%T venti: ");
+
+ if(configfile == nil)
+ configfile = "venti.conf";
+
+ /* remember free memory before initventi & loadbloom, for auto-sizing */
+ stfree = freemem();
+ fprint(2, "conf...");
+ if(initventi(configfile, &config) < 0)
+ sysfatal("can't init server: %r");
+ /*
+ * load bloom filter
+ */
+ if(mainindex->bloom && loadbloom(mainindex->bloom) < 0)
+ sysfatal("can't load bloom filter: %r");
+
+ /*
+ * size memory allocations; assumes bloom filter is loaded
+ */
+ allocs = sizeallocs((Allocs){mem, bcmem, icmem, stfree, mempcnt},
+ &config);
+ mem = allocs.mem;
+ bcmem = allocs.bcmem;
+ icmem = allocs.icmem;
+ fprint(2, "%s: mem %,ud bcmem %,ud icmem %,ud...",
+ argv0, mem, bcmem, icmem);
+
+ /*
+ * default other configuration-file parameters
+ */
+ if(haddr == nil)
+ haddr = config.haddr;
+ if(vaddr == nil)
+ vaddr = config.vaddr;
+ if(vaddr == nil)
+ vaddr = "tcp!*!venti";
+ if(webroot == nil)
+ webroot = config.webroot;
+ if(queuewrites == 0)
+ queuewrites = config.queuewrites;
+
+ /* the http server is optional: failure to start it is only a warning */
+ if(haddr){
+ fprint(2, "httpd %s...", haddr);
+ if(httpdinit(haddr, webroot) < 0)
+ fprint(2, "warning: can't start http server: %r");
+ }
+ fprint(2, "init...");
+
+ /*
+ * lump cache
+ */
+ if(0) fprint(2, "initialize %d bytes of lump cache for %d lumps\n",
+ mem, mem / (8 * 1024));
+ initlumpcache(mem, mem / (8 * 1024));
+
+ /*
+ * index cache
+ */
+ initicache(icmem);
+ initicachewrite();
+
+ /*
+ * block cache: need a block for every arena and every process
+ */
+ minbcmem = maxblocksize *
+ (mainindex->narenas + mainindex->nsects*4 + 16);
+ if(bcmem < minbcmem)
+ bcmem = minbcmem;
+ if(0) fprint(2, "initialize %d bytes of disk block cache\n", bcmem);
+ initdcache(bcmem);
+
+ if(mainindex->bloom)
+ startbloomproc(mainindex->bloom);
+
+ /* the index is only synced when the server may write */
+ fprint(2, "sync...");
+ if(!readonly && syncindex(mainindex) < 0)
+ sysfatal("can't sync server: %r");
+
+ /* write queueing is optional: fall back to unqueued writes if it cannot be set up */
+ if(!readonly && queuewrites){
+ fprint(2, "queue...");
+ if(initlumpqueues(mainindex->nsects) < 0){
+ fprint(2, "can't initialize lump queues,"
+ " disabling write queueing: %r");
+ queuewrites = 0;
+ }
+ }
+
+ if(initarenasum() < 0)
+ fprint(2, "warning: can't initialize arena summing process: %r");
+
+ fprint(2, "announce %s...", vaddr);
+ ventisrv = vtlisten(vaddr);
+ if(ventisrv == nil)
+ sysfatal("can't announce %s: %r", vaddr);
+
+ fprint(2, "serving.\n");
+ /* with -d or -s the request loop runs here; otherwise in a proc of its own */
+ if(nofork)
+ ventiserver(nil);
+ else
+ vtproc(ventiserver, nil);
+
+ threadexits(nil);
+}
+
+/*
+ * turn the reply of request r into an error reply carrying
+ * a copy of the given message.
+ */
+static void
+vtrerror(VtReq *r, char *error)
+{
+	char *msg;
+
+	msg = estrdup(error);
+	r->rx.msgtype = VtRerror;
+	r->rx.error = msg;
+}
+
+/*
+ * request loop: take requests from ventisrv one at a time, handle
+ * reads, writes and syncs, record statistics for each, and send
+ * the reply.  when no more requests can be had, flush the caches
+ * and exit all procs.
+ */
+static void
+ventiserver(void *v)
+{
+ Packet *p;
+ VtReq *r;
+ char err[ERRMAX];
+ uint ms;
+ int cached, ok;
+
+ USED(v);
+ threadsetname("ventiserver");
+ trace(TraceWork, "start");
+ while((r = vtgetreq(ventisrv)) != nil){
+ trace(TraceWork, "finish");
+ trace(TraceWork, "start request %F", &r->tx);
+ trace(TraceRpc, "<- %F", &r->tx);
+ /* default reply type is request type + 1; vtrerror overrides it on failure */
+ r->rx.msgtype = r->tx.msgtype+1;
+ addstat(StatRpcTotal, 1);
+ if(0) print("req (arenas[0]=%p sects[0]=%p) %F\n",
+ mainindex->arenas[0], mainindex->sects[0], &r->tx);
+ switch(r->tx.msgtype){
+ default:
+ vtrerror(r, "unknown request");
+ break;
+ case VtTread:
+ /* time the read and account it as cached or uncached */
+ ms = msec();
+ r->rx.data = readlump(r->tx.score, r->tx.blocktype, r->tx.count, &cached);
+ ms = msec() - ms;
+ addstat2(StatRpcRead, 1, StatRpcReadTime, ms);
+ if(r->rx.data == nil){
+ addstat(StatRpcReadFail, 1);
+ rerrstr(err, sizeof err);
+ vtrerror(r, err);
+ }else{
+ addstat(StatRpcReadBytes, packetsize(r->rx.data));
+ addstat(StatRpcReadOk, 1);
+ if(cached)
+ addstat2(StatRpcReadCached, 1, StatRpcReadCachedTime, ms);
+ else
+ addstat2(StatRpcReadUncached, 1, StatRpcReadUncachedTime, ms);
+ }
+ break;
+ case VtTwrite:
+ if(readonly){
+ vtrerror(r, "read only");
+ break;
+ }
+ /* detach the packet from the request before passing it to writelump */
+ p = r->tx.data;
+ r->tx.data = nil;
+ addstat(StatRpcWriteBytes, packetsize(p));
+ ms = msec();
+ ok = writelump(p, r->rx.score, r->tx.blocktype, 0, ms);
+ ms = msec() - ms;
+ addstat2(StatRpcWrite, 1, StatRpcWriteTime, ms);
+
+ if(ok < 0){
+ addstat(StatRpcWriteFail, 1);
+ rerrstr(err, sizeof err);
+ vtrerror(r, err);
+ }
+ break;
+ case VtTsync:
+ flushqueue();
+ flushdcache();
+ break;
+ }
+ trace(TraceRpc, "-> %F", &r->rx);
+ vtrespond(r);
+ trace(TraceWork, "start");
+ }
+ flushdcache();
+ flushicache();
+ threadexitsall(0);
+}
diff --git a/sys/src/cmd/venti/srv/verifyarena.c b/sys/src/cmd/venti/srv/verifyarena.c
new file mode 100755
index 000000000..662d53239
--- /dev/null
+++ b/sys/src/cmd/venti/srv/verifyarena.c
@@ -0,0 +1,266 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+static int verbose;
+static int fd;
+static uchar *data;
+static int blocksize;
+static int sleepms;
+static vlong offset0;
+
+/*
+ * Print the command synopsis on standard error and terminate every
+ * thread.  The exit status is the non-empty string "usage" so that the
+ * invoker can tell the command line was rejected.
+ */
+void
+usage(void)
+{
+	fprint(2, "usage: verifyarena [-b blocksize] [-s ms] [-v] [arenapart [name...]]\n");
+	threadexitsall("usage");
+}
+
+/*
+ * Read exactly n bytes into buf from the global fd, starting at byte
+ * offset0+off, retrying after short reads.  Returns 0 on success and -1
+ * on error; hitting end of file first sets the error string "early eof".
+ */
+static int
+preadblock(uchar *buf, int n, vlong off)
+{
+	int done, got;
+
+	done = 0;
+	while(done < n){
+		got = pread(fd, buf+done, n-done, offset0+off+done);
+		if(got == 0)
+			werrstr("early eof");
+		if(got <= 0)
+			return -1;
+		done += got;
+	}
+	return 0;
+}
+
+/*
+ * Read exactly n bytes into buf from the current position of the global
+ * fd, retrying after short reads.  Returns 0 on success and -1 on error;
+ * hitting end of file first sets the error string "early eof".
+ */
+static int
+readblock(uchar *buf, int n)
+{
+	int done, got;
+
+	done = 0;
+	while(done < n){
+		got = read(fd, buf+done, n-done);
+		if(got == 0)
+			werrstr("early eof");
+		if(got <= 0)
+			return -1;
+		done += got;
+	}
+	return 0;
+}
+
+/*
+ * Verify one arena read sequentially from the global fd, which must
+ * already be positioned at the arena header.
+ *
+ * name is the expected arena name ("<stdin>" disables the name check) and
+ * len the expected total length (0 disables the length check); mismatches
+ * there only produce warnings.  The whole arena is run through SHA1 with
+ * the score slot at the end of the last block treated as zero, and the
+ * result is compared with the score stored in the trailer.  Every problem
+ * is reported on standard error; nothing is returned.
+ */
+static void
+verifyarena(char *name, vlong len)
+{
+	Arena arena;
+	ArenaHead head;
+	DigestState s;
+	u64int n, e;
+	u32int bs;
+	u8int score[VtScoreSize];
+
+	fprint(2, "%T verify %s\n", name);
+
+	memset(&arena, 0, sizeof arena);
+	memset(&s, 0, sizeof s);
+
+	/*
+	 * read a little bit, which will include the header
+	 */
+	if(readblock(data, HeadSize) < 0){
+		fprint(2, "%T %s: reading header: %r\n", name);
+		return;
+	}
+	sha1(data, HeadSize, nil, &s);
+	if(unpackarenahead(&head, data) < 0){
+		fprint(2, "%T %s: corrupt arena header: %r\n", name);
+		return;
+	}
+	if(head.version != ArenaVersion4 && head.version != ArenaVersion5)
+		fprint(2, "%T %s: warning: unknown arena version %d\n", name, head.version);
+	if(len != 0 && len != head.size)
+		fprint(2, "%T %s: warning: unexpected length %lld != %lld\n", name, head.size, len);
+	if(strcmp(name, "<stdin>") != 0 && strcmp(head.name, name) != 0)
+		fprint(2, "%T %s: warning: unexpected name %s\n", name, head.name);
+
+	/*
+	 * the sizes below come straight from the (possibly corrupt) header.
+	 * the last block is read whole into data, which holds only blocksize
+	 * bytes, and the arithmetic that follows must not wrap around.
+	 */
+	if(head.blocksize < VtScoreSize
+	|| head.blocksize > (u32int)blocksize
+	|| head.size < 2*(u64int)head.blocksize){
+		fprint(2, "%T %s: unusable arena geometry: size=%llud blocksize=%ud buffer=%d\n",
+			name, (u64int)head.size, (u32int)head.blocksize, blocksize);
+		return;
+	}
+
+	/*
+	 * now we know how much to read
+	 * read everything but the last block, which is special
+	 */
+	e = head.size - head.blocksize;
+	bs = blocksize;
+	for(n = HeadSize; n < e; n += bs){
+		if(n + bs > e)
+			bs = e - n;
+		if(readblock(data, bs) < 0){
+			fprint(2, "%T %s: read data: %r\n", name);
+			return;
+		}
+		sha1(data, bs, nil, &s);
+		if(sleepms)
+			sleep(sleepms);
+	}
+
+	/*
+	 * read the last block update the sum.
+	 * the sum is calculated assuming the slot for the sum is zero.
+	 */
+	bs = head.blocksize;
+	if(readblock(data, bs) < 0){
+		fprint(2, "%T %s: read last block: %r\n", name);
+		return;
+	}
+	sha1(data, bs-VtScoreSize, nil, &s);
+	sha1(zeroscore, VtScoreSize, nil, &s);
+	sha1(nil, 0, score, &s);
+
+	/*
+	 * validity check on the trailer
+	 */
+	arena.blocksize = head.blocksize;
+	if(unpackarena(&arena, data) < 0){
+		fprint(2, "%T %s: corrupt arena trailer: %r\n", name);
+		return;
+	}
+	scorecp(arena.score, &data[arena.blocksize - VtScoreSize]);
+
+	if(namecmp(arena.name, head.name) != 0){
+		fprint(2, "%T %s: wrong name in trailer: %s vs. %s\n",
+			name, head.name, arena.name);
+		return;
+	}
+	if(arena.version != head.version){
+		fprint(2, "%T %s: wrong version in trailer: %d vs. %d\n",
+			name, head.version, arena.version);
+		return;
+	}
+	arena.size = head.size - 2 * head.blocksize;
+
+	/*
+	 * check for no checksum or the same
+	 */
+	if(scorecmp(score, arena.score) == 0)
+		fprint(2, "%T %s: verified score\n", name);
+	else if(scorecmp(zeroscore, arena.score) == 0)
+		fprint(2, "%T %s: unsealed\n", name);
+	else{
+		fprint(2, "%T %s: mismatch checksum - found=%V calculated=%V\n",
+			name, arena.score, score);
+		return;
+	}
+	printarena(2, &arena);
+}
+
+/*
+ * Decide whether the arena called name was requested.  s holds the n
+ * names given on the command line; with no names at all every arena is
+ * checked.  A name that matches is set to nil in s, so whatever is
+ * still non-nil afterwards was never found.  Returns 1 to check, 0 to skip.
+ */
+static int
+shouldcheck(char *name, char **s, int n)
+{
+	int k;
+
+	if(n == 0)
+		return 1;
+
+	k = 0;
+	while(k < n){
+		if(s[k] != nil && strcmp(name, s[k]) == 0){
+			s[k] = nil;
+			return 1;
+		}
+		k++;
+	}
+	return 0;
+}
+
+/*
+ * verifyarena entry point.
+ *
+ * With no arguments a single arena is read from standard input.
+ * Otherwise argv[0] names an arena partition: its header and arena
+ * directory are read, and every listed arena (or only those named in
+ * argv[1..]) is verified in directory order.  Names that were asked for
+ * but never found are reported at the end.
+ */
+void
+threadmain(int argc, char *argv[])
+{
+	enum { PartHeaderSize = 8192 };	/* bytes read for the partition header */
+	int i, nline, bufsize;
+	char *p, *q, *table, *f[10], line[256];
+	vlong start, stop;
+	ArenaPart ap;
+	Part *part;
+
+	needzeroscore();
+	ventifmtinstall();
+	blocksize = MaxIoSize;
+	ARGBEGIN{
+	case 'b':
+		blocksize = unittoull(EARGF(usage()));
+		break;
+	case 's':
+		sleepms = atoi(EARGF(usage()));
+		break;
+	case 'v':
+		verbose++;
+		break;
+	default:
+		usage();
+		break;
+	}ARGEND
+
+	/* a zero block size would make the read loop in verifyarena spin forever */
+	if(blocksize <= 0)
+		sysfatal("bad block size %d", blocksize);
+
+	/*
+	 * data is also the target of two fixed-size reads (the partition
+	 * header and the arena header), so it must never be smaller than
+	 * those, however small a -b the user asked for.
+	 */
+	bufsize = blocksize;
+	if(bufsize < PartHeaderSize)
+		bufsize = PartHeaderSize;
+	if(bufsize < HeadSize)
+		bufsize = HeadSize;
+	data = vtmalloc(bufsize);
+
+	if(argc == 0){
+		fd = 0;
+		verifyarena("<stdin>", 0);
+		threadexitsall(nil);
+	}
+
+	if((part = initpart(argv[0], OREAD)) == nil)
+		sysfatal("open partition %s: %r", argv[0]);
+	fd = part->fd;
+	offset0 = part->offset;
+
+	if(preadblock(data, PartHeaderSize, PartBlank) < 0)
+		sysfatal("read arena part header: %r");
+	if(unpackarenapart(&ap, data) < 0)
+		sysfatal("corrupted arena part header: %r");
+	fprint(2, "%T # arena part version=%d blocksize=%d arenabase=%d\n",
+		ap.version, ap.blocksize, ap.arenabase);
+	ap.tabbase = (PartBlank+HeadSize+ap.blocksize-1)&~(ap.blocksize-1);
+	if(ap.arenabase < ap.tabbase)
+		sysfatal("corrupted arena part header: arena base %d precedes directory", ap.arenabase);
+	ap.tabsize = ap.arenabase - ap.tabbase;
+	table = malloc(ap.tabsize+1);
+	if(table == nil)
+		sysfatal("out of memory reading arena part directory");
+	if(preadblock((uchar*)table, ap.tabsize, ap.tabbase) < 0)
+		sysfatal("reading arena part directory: %r");
+	table[ap.tabsize] = 0;
+
+	/* first line is the number of entries; each later line is "name start stop" */
+	nline = atoi(table);
+	p = strchr(table, '\n');
+	if(p)
+		p++;
+	for(i=0; i<nline; i++){
+		if(p == nil){
+			fprint(2, "%T warning: unexpected arena table end\n");
+			break;
+		}
+		q = strchr(p, '\n');
+		if(q)
+			*q++ = 0;
+		if(strlen(p) >= sizeof line){
+			fprint(2, "%T warning: long arena table line: %s\n", p);
+			p = q;
+			continue;
+		}
+		/* tokenize a copy so the warning below can still print the line intact */
+		strcpy(line, p);
+		memset(f, 0, sizeof f);
+		if(tokenize(line, f, nelem(f)) < 3){
+			fprint(2, "%T warning: bad arena table line: %s\n", p);
+			p = q;
+			continue;
+		}
+		p = q;
+		if(shouldcheck(f[0], argv+1, argc-1)){
+			start = strtoull(f[1], 0, 0);
+			stop = strtoull(f[2], 0, 0);
+			if(stop <= start){
+				fprint(2, "%T %s: bad start,stop %lld,%lld\n", f[0], start, stop);
+				continue;
+			}
+			/* verifyarena reads sequentially, so a failed seek means wrong data */
+			if(seek(fd, offset0+start, 0) < 0){
+				fprint(2, "%T %s: seek to start: %r\n", f[0]);
+				continue;
+			}
+			verifyarena(f[0], stop - start);
+		}
+	}
+	/* shouldcheck clears each name it matches; the rest were never seen */
+	for(i=1; i<argc; i++)
+		if(argv[i] != 0)
+			fprint(2, "%T %s: did not find arena\n", argv[i]);
+
+	threadexitsall(nil);
+}
diff --git a/sys/src/cmd/venti/srv/whack.c b/sys/src/cmd/venti/srv/whack.c
new file mode 100755
index 000000000..ecd290339
--- /dev/null
+++ b/sys/src/cmd/venti/srv/whack.c
@@ -0,0 +1,331 @@
+#include "stdinc.h"
+#include "whack.h"
+
+typedef struct Huff Huff;
+int compressblocks = 1;
+
+enum
+{
+ MaxFastLen = 9,
+ BigLenCode = 0x1f4, /* minimum code for large length encoding */
+ BigLenBits = 9,
+ BigLenBase = 4, /* starting items to encode for big lens */
+
+ MinOffBits = 6,
+ MaxOffBits = MinOffBits + 8,
+
+ MaxLen = 2051 /* max. length encodable in 24 bits */
+};
+
+enum
+{
+ StatBytes,
+ StatOutBytes,
+ StatLits,
+ StatMatches,
+ StatLitBits,
+ StatOffBits,
+ StatLenBits,
+
+ MaxStat
+};
+
+struct Huff
+{
+ short bits; /* length of the code */
+ ulong encode; /* the code */
+};
+
+static Huff lentab[MaxFastLen] =
+{
+ {2, 0x2}, /* 10 */
+ {3, 0x6}, /* 110 */
+ {5, 0x1c}, /* 11100 */
+ {5, 0x1d}, /* 11101 */
+ {6, 0x3c}, /* 111100 */
+ {7, 0x7a}, /* 1111010 */
+ {7, 0x7b}, /* 1111011 */
+ {8, 0xf8}, /* 11111000 */
+ {8, 0xf9}, /* 11111001 */
+};
+
+static int thwmaxcheck;
+
+/*
+ * reset the compressor state tw and derive the match search limit
+ * thwmaxcheck from level: three quarters of 1<<level, kept within
+ * the range 2..1024.
+ */
+void
+whackinit(Whack *tw, int level)
+{
+	int limit;
+
+	limit = 1 << level;
+	limit -= limit >> 2;
+	if(limit > 1024)
+		limit = 1024;
+	else if(limit < 2)
+		limit = 2;
+	thwmaxcheck = limit;
+
+	memset(tw, 0, sizeof *tw);
+	tw->begin = 2 * WhackMaxOff;
+}
+
+/*
+ * find a string in the dictionary
+ *
+ * walks the chain of earlier positions that starts at b->hash[h] and
+ * continues through b->next, looking at no more than thwmaxcheck
+ * candidates, and remembers the longest run of bytes equal to the text
+ * at *ss.  the comparison never runs past esrc nor past MaxLen bytes.
+ * the walk ends early when a candidate's distance from now is not larger
+ * than the previous candidate's, when it exceeds WhackMaxOff, or once a
+ * match longer than thwmaxcheck has been found.
+ *
+ * returns -1 if fewer than MinMatch bytes remain before esrc.
+ * otherwise *ss is advanced by the best length found and the distance
+ * back to that match is returned; both are 0 when nothing matched.
+ */
+static int
+whackmatch(Whack *b, uchar **ss, uchar *esrc, ulong h, ulong now)
+{
+ ushort then, off, last;
+ int bestoff, bestlen, check;
+ uchar *s, *t;
+
+ s = *ss;
+ if(esrc < s + MinMatch)
+ return -1;
+ if(s + MaxLen < esrc)
+ esrc = s + MaxLen;
+
+ bestoff = 0;
+ bestlen = 0;
+ check = thwmaxcheck;
+ last = 0;
+ for(then = b->hash[h]; check-- > 0; then = b->next[then & (WhackMaxOff - 1)]){
+ off = now - then;
+ if(off <= last || off > WhackMaxOff)
+ break;
+
+ /*
+ * don't need to check for the end because
+ * 1) s too close check above
+ */
+ t = s - off;
+ if(s[0] == t[0] && s[1] == t[1] && s[2] == t[2]){
+ if(!bestlen || esrc - s > bestlen && s[bestlen] == t[bestlen]){
+ t += 3;
+ for(s += 3; s < esrc; s++){
+ if(*s != *t)
+ break;
+ t++;
+ }
+ if(s - *ss > bestlen){
+ bestlen = s - *ss;
+ bestoff = off;
+ if(bestlen > thwmaxcheck)
+ break;
+ }
+ }
+ }
+ s = *ss;
+ last = off;
+ }
+ *ss += bestlen;
+ return bestoff;
+}
+
+/*
+ * knuth vol. 3 multiplicative hashing
+ * each byte x chosen according to rules
+ * 1/4 < x < 3/10, 1/3 < x < 3/7, 4/7 < x < 2/3, 7/10 < x < 3/4
+ * with reasonable spread between the bytes & their complements
+ *
+ * the 3 byte value appears to be almost as good as the 4 byte value,
+ * and might be faster on some machines
+ */
+/*
+#define hashit(c) ((((ulong)(c) * 0x6b43a9) >> (24 - HashLog)) & HashMask)
+*/
+#define hashit(c) (((((ulong)(c) & 0xffffff) * 0x6b43a9b5) >> (32 - HashLog)) & HashMask)
+
+/*
+ * lz77 compression with single lookup in a hash table for each block
+ *
+ * compresses the n bytes at src into dst and never writes more than n
+ * bytes of output.  returns the number of bytes written, or -1 when
+ *	- compressblocks is zero or n is less than MinMatch,
+ *	- the output would not fit in n bytes, or
+ *	- the one-time progress check, made once the input is more than half
+ *	  consumed, finds that literals make up more than 4/5 of the bytes
+ *	  consumed so far.
+ * the counters in stats are incremented only after the whole input has
+ * been consumed, and StatOutBytes only after the final flush succeeds.
+ */
+int
+whack(Whack *w, uchar *dst, uchar *src, int n, ulong stats[WhackStats])
+{
+ uchar *s, *ss, *sss, *esrc, *half, *wdst, *wdmax;
+ ulong cont, code, wbits;
+ ushort now;
+ int toff, lithist, h, len, bits, use, wnbits, lits, matches, offbits, lenbits;
+
+ if(!compressblocks || n < MinMatch)
+ return -1;
+
+ wdst = dst;
+ wdmax = dst + n;
+
+ now = w->begin;
+ s = src;
+ w->data = s;
+
+ cont = (s[0] << 16) | (s[1] << 8) | s[2];
+
+ esrc = s + n;
+ half = s + (n >> 1);
+ wnbits = 0;
+ wbits = 0;
+ lits = 0;
+ matches = 0;
+ offbits = 0;
+ lenbits = 0;
+ lithist = ~0;
+ while(s < esrc){
+ h = hashit(cont);
+
+ sss = s;
+ toff = whackmatch(w, &sss, esrc, h, now);
+ ss = sss;
+
+ len = ss - s;
+ for(; wnbits >= 8; wnbits -= 8){ /* flush complete bytes from the bit accumulator */
+ if(wdst >= wdmax){
+ w->begin = now;
+ return -1;
+ }
+ *wdst++ = wbits >> (wnbits - 8);
+ }
+ if(len < MinMatch){
+ toff = *s; /* no usable match: emit this byte as a literal */
+ lithist = (lithist << 1) | toff < 32 | toff > 127;
+ if(lithist & 0x1e){
+ wbits = (wbits << 9) | toff;
+ wnbits += 9;
+ }else if(lithist & 1){
+ toff = (toff + 64) & 0xff;
+ if(toff < 96){
+ wbits = (wbits << 10) | toff;
+ wnbits += 10;
+ }else{
+ wbits = (wbits << 11) | toff;
+ wnbits += 11;
+ }
+ }else{
+ wbits = (wbits << 8) | toff;
+ wnbits += 8;
+ }
+ lits++;
+
+ /*
+ * speed hack
+ * check for compression progress, bail if none achieved
+ */
+ if(s > half){
+ if(4 * (s - src) < 5 * lits){
+ w->begin = now;
+ return -1;
+ }
+ half = esrc;
+ }
+
+ if(s + MinMatch <= esrc){
+ w->next[now & (WhackMaxOff - 1)] = w->hash[h];
+ w->hash[h] = now;
+ if(s + MinMatch < esrc)
+ cont = (cont << 8) | s[MinMatch];
+ }
+ now++;
+ s++;
+ continue;
+ }
+
+ matches++;
+
+ /*
+ * length of match
+ */
+ if(len > MaxLen){
+ len = MaxLen;
+ ss = s + len;
+ }
+ len -= MinMatch;
+ if(len < MaxFastLen){
+ bits = lentab[len].bits;
+ wbits = (wbits << bits) | lentab[len].encode;
+ wnbits += bits;
+ lenbits += bits;
+ }else{
+ code = BigLenCode;
+ bits = BigLenBits;
+ use = BigLenBase;
+ len -= MaxFastLen;
+ while(len >= use){
+ len -= use;
+ code = (code + use) << 1;
+ use <<= (bits & 1) ^ 1;
+ bits++;
+ }
+
+ wbits = (wbits << bits) | (code + len);
+ wnbits += bits;
+ lenbits += bits;
+
+ for(; wnbits >= 8; wnbits -= 8){
+ if(wdst >= wdmax){
+ w->begin = now;
+ return -1;
+ }
+ *wdst++ = wbits >> (wnbits - 8);
+ }
+ }
+
+ /*
+ * offset in history
+ */
+ toff--;
+ for(bits = MinOffBits; toff >= (1 << bits); bits++)
+ ;
+ if(bits < MaxOffBits-1){
+ wbits = (wbits << 3) | (bits - MinOffBits);
+ if(bits != MinOffBits)
+ bits--;
+ wnbits += bits + 3;
+ offbits += bits + 3;
+ }else{
+ wbits = (wbits << 4) | 0xe | (bits - (MaxOffBits-1));
+ bits--;
+ wnbits += bits + 4;
+ offbits += bits + 4;
+ }
+ wbits = (wbits << bits) | toff & ((1 << bits) - 1);
+
+ for(; s != ss; s++){ /* enter every position covered by the match into the hash chains */
+ if(s + MinMatch <= esrc){
+ h = hashit(cont);
+ w->next[now & (WhackMaxOff - 1)] = w->hash[h];
+ w->hash[h] = now;
+ if(s + MinMatch < esrc)
+ cont = (cont << 8) | s[MinMatch];
+ }
+ now++;
+ }
+ }
+
+ w->begin = now;
+
+ stats[StatBytes] += esrc - src;
+ stats[StatLits] += lits;
+ stats[StatMatches] += matches;
+ stats[StatLitBits] += (wdst - (dst + 2)) * 8 + wnbits - offbits - lenbits;
+ stats[StatOffBits] += offbits;
+ stats[StatLenBits] += lenbits;
+
+ if(wnbits & 7){ /* pad the last partial byte with zero bits */
+ wbits <<= 8 - (wnbits & 7);
+ wnbits += 8 - (wnbits & 7);
+ }
+ for(; wnbits >= 8; wnbits -= 8){
+ if(wdst >= wdmax)
+ return -1;
+ *wdst++ = wbits >> (wnbits - 8);
+ }
+
+ stats[StatOutBytes] += wdst - dst;
+
+ return wdst - dst;
+}
+
+/*
+ * compress the ssize bytes at src into dst using a freshly initialized
+ * compressor state (level 6).  returns whatever whack returns: the
+ * number of bytes written to dst, or -1 if the block was not compressed.
+ * the statistics whack gathers are thrown away.
+ */
+int
+whackblock(uchar *dst, uchar *src, int ssize)
+{
+	Whack w;
+	ulong stats[WhackStats];	/* sized as whack's prototype declares it */
+
+	/* whack only ever adds to the counters, so they must start out defined */
+	memset(stats, 0, sizeof stats);
+	whackinit(&w, 6);
+	return whack(&w, dst, src, ssize, stats);
+}
diff --git a/sys/src/cmd/venti/srv/whack.h b/sys/src/cmd/venti/srv/whack.h
new file mode 100755
index 000000000..fb966169c
--- /dev/null
+++ b/sys/src/cmd/venti/srv/whack.h
@@ -0,0 +1,40 @@
+typedef struct Whack Whack;
+typedef struct Unwhack Unwhack;
+
+enum
+{
+ WhackStats = 8,
+ WhackErrLen = 64, /* max length of error message from whack or unwhack */
+ WhackMaxOff = 16*1024, /* max allowed offset */
+
+ HashLog = 14,
+ HashSize = 1<<HashLog,
+ HashMask = HashSize - 1,
+
+ MinMatch = 3, /* shortest match possible */
+
+ MinDecode = 8, /* minimum bits to decode a match or lit; >= 8 */
+
+ MaxSeqMask = 8, /* number of bits in coding block mask */
+ MaxSeqStart = 256 /* max offset of initial coding block */
+};
+
+struct Whack
+{
+ ushort begin; /* time of first byte in hash */
+ ushort hash[HashSize]; /* most recent time entered for each hash value */
+ ushort next[WhackMaxOff]; /* indexed by time mod WhackMaxOff: the hash entry that time replaced */
+ uchar *data; /* source buffer given to the latest whack call */
+};
+
+struct Unwhack
+{
+ char err[WhackErrLen];
+};
+
+void whackinit(Whack*, int level);
+void unwhackinit(Unwhack*);
+int whack(Whack*, uchar *dst, uchar *src, int nsrc, ulong stats[WhackStats]);
+int unwhack(Unwhack*, uchar *dst, int ndst, uchar *src, int nsrc);
+
+int whackblock(uchar *dst, uchar *src, int ssize);
diff --git a/sys/src/cmd/venti/srv/wrarena.c b/sys/src/cmd/venti/srv/wrarena.c
new file mode 100755
index 000000000..9ead93704
--- /dev/null
+++ b/sys/src/cmd/venti/srv/wrarena.c
@@ -0,0 +1,225 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+QLock godot;
+char *host;
+int readonly = 1; /* for part.c */
+int mainstacksize = 256*1024;
+Channel *c;
+VtConn *z;
+int fast; /* and a bit unsafe; only for benchmarking */
+int haveaoffset;
+int maxwrites = -1;
+int verbose;
+
+typedef struct ZClump ZClump;
+struct ZClump
+{
+ ZBlock *lump;
+ Clump cl;
+ u64int aa;
+};
+
+/*
+ * Print the command synopsis, listing every flag threadmain parses, on
+ * standard error and terminate all threads with exit status "usage".
+ */
+void
+usage(void)
+{
+	fprint(2, "usage: wrarena [-fv] [-h host] [-M maxwrites] [-o fileoffset] arenafile [clumpoffset]\n");
+	threadexitsall("usage");
+}
+
+/*
+ * Sender proc: take clumps off channel c and write each one to the venti
+ * connection z, freeing the block afterwards.  A clump whose lump is nil
+ * is the signal to stop.  A failed write is fatal.
+ */
+void
+vtsendthread(void *v)
+{
+	ZClump item;
+
+	USED(v);
+	for(;;){
+		if(recv(c, &item) != 1)
+			break;
+		if(item.lump == nil)
+			break;
+		if(vtwrite(z, item.cl.info.score, item.cl.info.type, item.lump->data, item.cl.info.uncsize) < 0)
+			sysfatal("failed writing clump %llud: %r", item.aa);
+		if(verbose)
+			print("%V\n", item.cl.info.score);
+		freezblock(item.lump);
+	}
+	/*
+	 * Do not return from here.  If the senders exit at the very moment
+	 * threadmain calls threadexitsall, the process dies with a seg
+	 * fault in exit; either libthread or the Linux NPTL pthreads
+	 * library cannot cope (NPTL is suspected, not confirmed).  A day
+	 * of debugging did not find the cause, so as a workaround each
+	 * sender parks on a lock that is never released and waits for
+	 * threadexitsall to take effect.
+	 */
+	qlock(&godot);
+}
+
+/*
+ * Walk the clumps of arena in directory order and queue each one on
+ * channel c for the sender procs.  Clumps that lie before offset (a
+ * clump address relative to the start of the arena data; all ones means
+ * "from the start") and clumps marked corrupt are skipped.  Unless fast
+ * is set every clump's score and type are re-checked first, and the walk
+ * stops at the first one that fails.  maxwrites, when positive, bounds
+ * the number of clumps handled.  With -o the final clump address is printed.
+ */
+static void
+rdarena(Arena *arena, u64int offset)
+{
+	int idx;
+	u64int first, last, pos, resume;
+	uchar score[VtScoreSize];
+	Clump cl;
+	ClumpInfo ci;
+	ZBlock *lump;
+	ZClump job;
+
+	fprint(2, "wrarena: copying %s to venti\n", arena->name);
+	printarena(2, arena);
+
+	first = arena->base;
+	last = arena->base + arena->size;
+	resume = 0;
+	if(offset != ~(u64int)0){
+		if(offset >= last - first)
+			sysfatal("bad offset %#llx >= %#llx", offset, last - first);
+		resume = offset;
+	}
+
+	idx = 0;
+	for(pos = 0; maxwrites != 0 && idx < arena->memstats.clumps;
+	    pos += ClumpSize + ci.size){
+		if(readclumpinfo(arena, idx++, &ci) < 0)
+			break;
+		if(ci.type == VtCorruptType){
+			fprint(2, "%s: corrupt clump read at %#llx: +%d\n",
+				argv0, pos, ClumpSize+ci.size);
+			continue;
+		}
+		if(pos < resume)
+			continue;
+
+		lump = loadclump(arena, pos, 0, &cl, score, 0);
+		if(lump == nil){
+			fprint(2, "clump %#llx failed to read: %r\n", pos);
+			continue;
+		}
+		if(!fast && cl.info.type != VtCorruptType){
+			scoremem(score, lump->data, cl.info.uncsize);
+			if(scorecmp(cl.info.score, score) != 0){
+				fprint(2, "clump %#llx has mismatched score\n",
+					pos);
+				break;
+			}
+			if(vttypevalid(cl.info.type) < 0){
+				fprint(2, "clump %#llx has bad type %d\n",
+					pos, cl.info.type);
+				break;
+			}
+		}
+		if(!z || cl.info.type == VtCorruptType)
+			freezblock(lump);
+		else{
+			/* the sender that receives the block frees it */
+			job.cl = cl;
+			job.lump = lump;
+			job.aa = pos;
+			send(c, &job);
+		}
+		if(maxwrites > 0)
+			--maxwrites;
+	}
+	if(pos > resume)
+		resume = pos;
+	if(haveaoffset)
+		print("end offset %#llx\n", resume);
+}
+
+/*
+ * wrarena entry point: open the arena stored in arenafile (at byte
+ * offset -o within it), connect to the venti server named by -h unless
+ * the host is "/dev/null", and copy the arena's clumps to it through a
+ * pool of sender procs, optionally starting at the given clump offset.
+ */
+void
+threadmain(int argc, char *argv[])
+{
+	enum { Nsenders = 12 };
+	int i;
+	char *file;
+	Arena *arena;
+	u64int offset, aoffset;
+	Part *part;
+	uchar buf[8192];
+	ArenaHead head;
+	ZClump zerocl;
+
+	ventifmtinstall();
+	/* held for the life of the process; the senders park on it when done */
+	qlock(&godot);
+	aoffset = 0;
+	ARGBEGIN{
+	case 'f':
+		fast = 1;
+		ventidoublechecksha1 = 0;
+		break;
+	case 'h':
+		host = EARGF(usage());
+		break;
+	case 'o':
+		haveaoffset = 1;
+		aoffset = strtoull(EARGF(usage()), 0, 0);
+		break;
+	case 'M':
+		maxwrites = atoi(EARGF(usage()));
+		break;
+	case 'v':
+		verbose = 1;
+		break;
+	default:
+		usage();
+		break;
+	}ARGEND
+
+	/* all ones tells rdarena that no clump offset was given */
+	offset = ~(u64int)0;
+	switch(argc) {
+	default:
+		usage();
+	case 2:
+		offset = strtoull(argv[1], 0, 0);
+		/* fall through */
+	case 1:
+		file = argv[0];
+	}
+
+	fmtinstall('V', vtscorefmt);
+
+	statsinit();
+
+	part = initpart(file, OREAD);
+	if(part == nil)
+		sysfatal("can't open file %s: %r", file);
+	if(readpart(part, aoffset, buf, sizeof buf) < 0)
+		sysfatal("can't read file %s: %r", file);
+
+	if(unpackarenahead(&head, buf) < 0)
+		sysfatal("corrupted arena header: %r");
+
+	if(aoffset+head.size > part->size)
+		sysfatal("arena is truncated: want %llud bytes have %llud",
+			head.size, part->size);
+
+	partblocksize(part, head.blocksize);
+	initdcache(8 * MaxDiskBlock);
+
+	arena = initarena(part, aoffset, head.size, head.blocksize);
+	if(arena == nil)
+		sysfatal("initarena: %r");
+
+	z = nil;
+	if(host==nil || strcmp(host, "/dev/null") != 0){
+		z = vtdial(host);
+		if(z == nil)
+			sysfatal("could not connect to server: %r");
+		if(vtconnect(z) < 0)
+			sysfatal("vtconnect: %r");
+	}
+
+	c = chancreate(sizeof(ZClump), 0);
+	for(i=0; i<Nsenders; i++)
+		vtproc(vtsendthread, nil);
+
+	rdarena(arena, offset);
+
+	/*
+	 * Hand every sender its stop marker before syncing.  The channel is
+	 * unbuffered and a sender only comes back to receive once its
+	 * current vtwrite has returned, so when all the markers have been
+	 * taken no write is still outstanding and the sync covers them all.
+	 */
+	memset(&zerocl, 0, sizeof zerocl);
+	for(i=0; i<Nsenders; i++)
+		send(c, &zerocl);
+
+	/* z stays nil when the host is /dev/null: nothing to sync or hang up */
+	if(z != nil){
+		if(vtsync(z) < 0)
+			sysfatal("executing sync: %r");
+		vthangup(z);
+	}
+	threadexitsall(0);
+}
diff --git a/sys/src/cmd/venti/srv/www/stats.html b/sys/src/cmd/venti/srv/www/stats.html
new file mode 100755
index 000000000..e7579394e
--- /dev/null
+++ b/sys/src/cmd/venti/srv/www/stats.html
@@ -0,0 +1,33 @@
+<html>
+ <head>
+ <base href="/">
+ <META http-equiv="Content-Type" content="text/html; charset=utf-8">
+ <script language="javascript" src="stats.js"></script>
+ <script language="javascript" src="status.js"></script>
+ </head>
+ <body bgcolor=#ffffff>
+
+ <center>
+ <b>venti.your-domain.com &ndash; venti server statistics</b>
+ <p>
+ <a href="javascript:redraw()">redraw</a>
+ <p>
+ <table id="statgraphs">
+ <tr><td>JavaScript is required to view the graphs.
+ </table>
+ <p>
+
+ <font size=-1>the small graphs show the past ten minutes of operation.</font>
+ <p>
+
+ <tt>http://venti.yourdomain.com:8001/</tt>
+ <p id="settings">JavaScript is required to change the settings.</p>
+ <p>
+ <p id="debug"></p>
+ </center>
+ <script language="javascript">
+ loadsettings()
+ redraw()
+ </script>
+ </body>
+</html>
diff --git a/sys/src/cmd/venti/srv/www/stats.js b/sys/src/cmd/venti/srv/www/stats.js
new file mode 100755
index 000000000..76e9f276a
--- /dev/null
+++ b/sys/src/cmd/venti/srv/www/stats.js
@@ -0,0 +1,387 @@
+
+biggraph = "arg=rpctotal&graph=diff"
+
+graphname = new Array(
+ "arg=*&graph=diskbw",
+ "<b>disk</b> bytes/second",
+ "arg=*&graph=netbw",
+ "<b>network</b> bytes/second",
+ "arg=*&graph=iobw",
+ "total: <b>disk+net</b> bytes/second",
+
+ "arg=apartreadbyte&graph=diff",
+ "arena read bytes/second",
+ "arg=apartwritebyte&graph=diff",
+ "arena write bytes/second",
+
+ "arg=bloomfalsemiss&graph=pctdiff&arg2=bloomlookup&max=100",
+ "bloom false hit %",
+ "arg=bloomhit&graph=pctdiff&arg2=bloomlookup&max=100",
+ "bloom miss %",
+ "arg=bloomlookuptime&graph=divdiff&arg2=bloomlookup",
+ "bloom lookup time",
+ "arg=bloomones&graph=pct&arg2=bloombits&max=100",
+ "bloom usage %",
+
+ "arg=dcachedirty&graph=pct&arg2=dcachesize&max=100",
+ "dcache dirty %",
+ "arg=dcachehit&graph=pctdiff&arg2=dcachelookup&max=100",
+ "dcache hit %",
+ "arg=dcachelookuptime&graph=divdiff&arg2=dcachelookup",
+ "dcache lookup time",
+ "arg=dcachelookup&graph=diff",
+ "dcache lookups/second",
+ "arg=dcachewrite&graph=diff",
+ "dcache writes/second",
+
+ "arg=icachedirty&graph=pct&arg2=icachesize&max=100",
+ "icache dirty %",
+ "arg=icachehit&graph=pctdiff&arg2=icachelookup&max=100",
+ "icache hit %",
+ "arg=icachelookuptime&graph=divdiff&arg2=icachelookup",
+ "icache lookup time",
+ "arg=icacheprefetch&graph=diff",
+ "icache prefetches/second",
+ "arg=icachewrite&graph=diff",
+ "icache writes/second",
+
+ "arg=isectreadbyte&graph=diff",
+ "isect read bytes/second",
+ "arg=isectwritebyte&graph=diff",
+ "isect write bytes/second",
+
+ "arg=lcachehit&graph=pctdiff&arg2=lcachelookup&max=100",
+ "lump cache hit %",
+ "arg=lcachelookuptime&graph=divdiff&arg2=lcachelookup",
+ "lump cache lookup time",
+ "arg=lcachewrite&graph=diff",
+ "lcache writes/second",
+
+ "arg=rpcreadbyte&graph=diff",
+ "read RPC bytes/second",
+ "arg=rpctotal&graph=diff",
+ "RPCs/second",
+ "arg=rpcwritebyte&graph=diff",
+ "write RPC bytes/second",
+ "arg=rpcreadtime&graph=divdiff&arg2=rpcread",
+ "read RPC time",
+ "arg=rpcwritetime&graph=divdiff&arg2=rpcwrite",
+ "write RPC time",
+ "arg=rpcreadcachedtime&graph=divdiff&arg2=rpcreadcached",
+ "cached read RPC time",
+ "arg=rpcreaduncachedtime&graph=divdiff&arg2=rpcreaduncached",
+ "uncached read RPC time",
+ "arg=rpcwritenewtime&graph=divdiff&arg2=rpcwritenew",
+ "fresh write RPC time",
+ "arg=rpcwriteoldtime&graph=divdiff&arg2=rpcwriteold",
+ "dup write RPC time",
+
+ "arg=sumreadbyte&graph=diff",
+ "checksum bytes/second",
+
+ "arg=dblockstall",
+ "threads stalled: dblock",
+ "arg=dcachestall",
+ "threads stalled: dcache",
+ "arg=icachestall",
+ "threads stalled: icache",
+ "arg=lumpstall",
+ "threads stalled: lump",
+
+ "arg=END"
+)
+
+column0 = new Array(
+ "column0",
+ "!bandwidth",
+ "arg=*&graph=iobw",
+ "arg=*&graph=netbw",
+ "arg=rpcreadbyte&graph=diff",
+ "arg=rpcwritebyte&graph=diff",
+ "arg=*&graph=diskbw",
+ "arg=isectreadbyte&graph=diff",
+ "arg=isectwritebyte&graph=diff",
+ "arg=apartreadbyte&graph=diff",
+ "arg=apartwritebyte&graph=diff",
+ "arg=sumreadbyte&graph=diff",
+
+ "!bloom filter",
+ "arg=bloomhit&graph=pctdiff&arg2=bloomlookup&max=100",
+ "arg=bloomfalsemiss&graph=pctdiff&arg2=bloomlookup&max=100",
+ "arg=bloomones&graph=pct&arg2=bloombits&max=100",
+
+ "END"
+)
+
+column1 = new Array(
+ "column1",
+ "!icache",
+ "arg=icachedirty&graph=pct&arg2=icachesize&max=100",
+ "arg=icachehit&graph=pctdiff&arg2=icachelookup&max=100",
+ "arg=icachewrite&graph=diff",
+ "arg=icacheprefetch&graph=diff",
+
+ "!dcache",
+ "arg=dcachedirty&graph=pct&arg2=dcachesize&max=100",
+ "arg=dcachehit&graph=pctdiff&arg2=dcachelookup&max=100",
+ "arg=dcachelookup&graph=diff",
+ "arg=dcachewrite&graph=diff",
+
+ "!lump cache",
+ "arg=lcachehit&graph=pctdiff&arg2=lcachelookup&max=100",
+ "arg=lcachewrite&graph=diff",
+
+ "END"
+)
+
+column2 = new Array(
+ "column2",
+
+ "!stalls",
+ "arg=icachestall",
+ "arg=dcachestall",
+ "arg=dblockstall",
+ "arg=lumpstall",
+
+ "!timings",
+ "arg=bloomlookuptime&graph=divdiff&arg2=bloomlookup",
+ "arg=icachelookuptime&graph=divdiff&arg2=icachelookup",
+ "arg=lcachelookuptime&graph=divdiff&arg2=lcachelookup",
+ "arg=dcachelookuptime&graph=divdiff&arg2=dcachelookup",
+ "arg=rpcreadtime&graph=divdiff&arg2=rpcread",
+ "arg=rpcwritetime&graph=divdiff&arg2=rpcwrite",
+ "arg=rpcreadcachedtime&graph=divdiff&arg2=rpcreadcached",
+ "arg=rpcreaduncachedtime&graph=divdiff&arg2=rpcreaduncached",
+ "arg=rpcwritenewtime&graph=divdiff&arg2=rpcwritenew",
+ "arg=rpcwriteoldtime&graph=divdiff&arg2=rpcwriteold",
+
+ "END"
+)
+
+col0info = new Array(column0.length)
+col1info = new Array(column1.length)
+col2info = new Array(column2.length)
+
+function cleardebug() {
+ var p = document.getElementById("debug")
+ p.innerHTML = ""
+}
+
+function debug(s) {
+ var p = document.getElementById("debug")
+ if(p.innerHTML == "")
+ p.innerHTML = "<a href=\"javascript:cleardebug()\">clear</a>\n"
+ p.innerHTML += "<br>"+s
+}
+
+function Ginfo(y, fill, name) {
+ var g = new Object()
+ g.y = y
+ g.fill = fill
+ g.name = name
+ return g
+}
+
+function cleartable(t) {
+ for(var i=t.rows.length-1; i>=0; i--)
+ t.deleteRow(i)
+}
+
+function textofname(name)
+{
+ for(var i=0; i<graphname.length; i+=2)
+ if(name == graphname[i])
+ return graphname[i+1]
+}
+
+function graphrow(row, span, name, dt, wid, ht, fill, text) {
+ var url = "/graph?"+name
+ url = url+"&min=0"
+ url = url+"&t0=-"+dt
+ url = url+"&wid="+wid
+ url = url+"&ht="+ht
+ url = url+"&fill="+fill
+
+ var s = "<td colSpan="+span
+ s = s+" valign=bottom"
+ s = s+" align=center"
+ s = s+" width="+wid
+ s = s+" height="+ht
+ s = s+" style=\"background-image: url("+url+");\""
+ s = s+">"+textofname(name)+text+"</td>"
+ row.innerHTML = s
+}
+
+
+function graphcell(cell, name, dt, wid, ht, fill) {
+ cell.vAlign = "bottom"
+ cell.align = "center"
+ cell.width = wid
+ cell.height = ht
+}
+
+function redraw() {
+ redrawgraphs()
+ redrawsettings()
+}
+
+function redrawgraphs() {
+ var t = document.getElementById("statgraphs")
+
+ cleartable(t)
+ for(var i=0; i<4; i++)
+ t.insertRow(i)
+
+ graphrow(t.rows[0], 3, biggraph, 86400, 900, 30, 0, " &ndash; showing 24 hours")
+ graphrow(t.rows[1], 3, biggraph, 3600, 900, 30, 1, " &ndash; showing 1 hour")
+ t.rows[2].innerHTML = "<td height=10></td>"
+
+ var r = t.rows[3]
+ graphtable(r.insertCell(0), column0, col0info, 0)
+ graphtable(r.insertCell(1), column1, col1info, 2)
+ graphtable(r.insertCell(2), column2, col2info, 4)
+}
+
+function graphtable(bigcell, list, infolist, fill) {
+ bigcell.innerHTML = "<table id=\""+list[0]+"\"></table>"
+ bigcell.vAlign = "top"
+ var t = document.getElementById(list[0])
+ t.onclick = columnclick
+
+ for(var i=1; i<list.length; i++){
+ var r = t.insertRow(t.rows.length)
+ name = list[i]
+ infolist[i] = Ginfo(t.offsetHeight, fill, name)
+ if(name == "END")
+ break
+ if(name.substring(0,1) == "!"){
+ name = name.substring(1)
+ if(i > 1){
+ r.innerHTML = "<td height=10></td>"
+ r = t.insertRow(t.rows.length)
+ }
+ r.innerHTML = "<td align=center><b>"+name+"</b>"
+ }else{
+ graphrow(r, 1, name, 600, 300, 30, fill++, "")
+ }
+ }
+}
+
+function xpos(obj) {
+ var x = 0
+ if(obj.fixedx)
+ return obj.fixedx
+ if(obj.offsetParent){
+ while(obj.offsetParent){
+ x += obj.offsetLeft
+ obj = obj.offsetParent
+ }
+ }else if(obj.x)
+ x = obj.x
+ return x
+}
+
+function ypos(obj) {
+ var y = 0
+ if(obj.fixedy)
+ return obj.fixedy
+ if(obj.offsetParent){
+ while(obj.offsetParent){
+ y += obj.offsetTop
+ obj = obj.offsetParent
+ }
+ }else if(obj.y)
+ y = obj.y
+ return y
+}
+
+function scrollleft() {
+ return document.body.scrollLeft
+}
+
+function scrolltop() {
+ return document.body.scrollTop
+}
+
+function columnclick(e) {
+ if(e.which && e.which != 1)
+ return;
+ var g = findgraph(scrollleft()+e.clientX, scrolltop()+e.clientY)
+ if(g && g.name.substring(0,1) != "!"){
+ biggraph = g.name
+ var t = document.getElementById("statgraphs")
+ graphrow(t.rows[0], 3, biggraph, 86400, 900, 30, 0, " &ndash; showing 24 hours")
+ graphrow(t.rows[1], 3, biggraph, 3600, 900, 30, 1, " &ndash; showing 1 hour")
+ }
+}
+
+function findgraph(x, y) {
+ var g
+
+ if(g = findgraphin(x, y, "column2", col2info))
+ return g
+ if(g = findgraphin(x, y, "column1", col1info))
+ return g
+ if(g = findgraphin(x, y, "column0", col0info))
+ return g
+ return
+}
+
+function findgraphin(x, y, tname, info) {
+ var t = document.getElementById(tname)
+ if(x < xpos(t))
+ return
+ y = y - ypos(t)
+ for(var i=info.length-2; i>=1; i--){
+ if(y > info[i].y)
+ return info[i]
+ }
+ return
+}
+
+function setof(name, val, list) {
+ var s = ""
+ for(var i=0; i<list.length; i++){
+ if(val == list[i])
+ s = s+" <b>"+val+"</b>"
+ else
+ s = s+" <a href=\"javascript:set('"+name+"', '"+list[i]+"')\">"+list[i]+"</a>"
+ }
+ return s
+}
+
+function loglinks(list) {
+ var s = ""
+ for(var i=0; i<list.length; i++){
+ s = s+" <a href=\"/log/"+list[i]+"\">"+list[i]+"</a>"
+ }
+ return s
+}
+
+first = 1
+function redrawsettings() {
+ if(first){
+ loadsettings()
+ first = 0
+ }
+ var s = ""
+ s = s+"<font size=-1>\n"
+ s = s+"logging:"+setof("logging", logging, loggingchoices)
+ s = s+" &nbsp;&nbsp;&nbsp;&nbsp;&nbsp; "
+ s = s+"stats:"+setof("stats", stats, statschoices)
+ s = s+"\n<p/>\n"
+ s = s+"compression:"+setof("compress", compress, compresschoices1)
+ s = s+"<br>"+setof("compress", compress, compresschoices2)
+ s = s+"\n<p/>\n"
+ s = s+"<a href=/index>index</a> | <a href=/storage>storage</a> | "
+ s = s+"log:"+loglinks(logs)
+ s = s+"</font>"
+ document.getElementById("settings").innerHTML = s
+}
+
+function set(name, value) {
+ eval(name+"= \""+value+"\"")
+ redrawsettings()
+ // Works in FireFox, not in Safari
+ parent.hidden.location.href = "/set/"+name+"/"+value
+}
diff --git a/sys/src/cmd/venti/srv/www/status.js b/sys/src/cmd/venti/srv/www/status.js
new file mode 100755
index 000000000..48e197d8f
--- /dev/null
+++ b/sys/src/cmd/venti/srv/www/status.js
@@ -0,0 +1,16 @@
+function loadsettings() {
+ logging = "off"
+ loggingchoices = new Array("0", "1")
+
+ stats = "on"
+ statschoices = new Array("0", "1")
+
+ compress = "whack"
+ compresschoices1 = new Array("none",
+ "flate1", "flate2", "flate3", "flate4", "flate5",
+ "flate6", "flate7", "flate8", "flate9")
+ compresschoices2 = new Array("smack1", "smack2", "smack3", "whack")
+
+ logs = new Array("all", "libventi/server", "disk", "lump", "block", "proc", "quiet", "rpc")
+}
+
diff --git a/sys/src/cmd/venti/srv/www/status1.js b/sys/src/cmd/venti/srv/www/status1.js
new file mode 100755
index 000000000..b0e284e3e
--- /dev/null
+++ b/sys/src/cmd/venti/srv/www/status1.js
@@ -0,0 +1,14 @@
+logging = "on"
+loggingchoices = new Array("off", "on")
+
+stats = "on"
+statschoices = new Array("off", "on")
+
+compress = "whack"
+compresschoices1 = new Array("none",
+ "flate1", "flate2", "flate3", "flate4", "flate5",
+ "flate6", "flate7", "flate8", "flate9")
+compresschoices2 = new Array("smack1", "smack2", "smack3", "whack")
+
+logs = new Array("all", "libventi/server", "disk", "lump", "block", "proc", "quiet", "rpc")
+
diff --git a/sys/src/cmd/venti/srv/xml.c b/sys/src/cmd/venti/srv/xml.c
new file mode 100755
index 000000000..e91afa054
--- /dev/null
+++ b/sys/src/cmd/venti/srv/xml.c
@@ -0,0 +1,68 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+#include "xml.h"
+
+/*
+ * Write arena s to hout as one self-closing XML element named tag,
+ * after calling xmlindent with the given indent level.  Each property
+ * is handed to an xml* helper together with its attribute name, in a
+ * fixed order.
+ */
+void
+xmlarena(Hio *hout, Arena *s, char *tag, int indent)
+{
+	xmlindent(hout, indent);
+	hprint(hout, "<%s", tag);
+
+	/* identity and placement */
+	xmlaname(hout, s->name, "name");
+	xmlu32int(hout, s->version, "version");
+	xmlaname(hout, s->part->name, "partition");
+	xmlu32int(hout, s->blocksize, "blocksize");
+	xmlu64int(hout, s->base, "start");
+	/*
+	 * NOTE(review): "stop" is reported as base plus two blocks,
+	 * independent of how large the arena is -- confirm this is intended.
+	 */
+	xmlu64int(hout, s->base+2*s->blocksize, "stop");
+
+	/* timestamps and seal state */
+	xmlu32int(hout, s->ctime, "created");
+	xmlu32int(hout, s->wtime, "modified");
+	xmlsealed(hout, s->memstats.sealed, "sealed");
+	xmlscore(hout, s->score, "score");
+
+	/* usage figures taken from the in-memory statistics */
+	xmlu32int(hout, s->memstats.clumps, "clumps");
+	xmlu32int(hout, s->memstats.cclumps, "compressedclumps");
+	xmlu64int(hout, s->memstats.uncsize, "data");
+	/* used bytes without the per-clump headers */
+	xmlu64int(hout, s->memstats.used - s->memstats.clumps * ClumpSize, "compresseddata");
+	/* used bytes plus one ClumpInfo entry per clump */
+	xmlu64int(hout, s->memstats.used + s->memstats.clumps * ClumpInfoSize, "storage");
+
+	hprint(hout, "/>\n");
+}
+
+/*
+ * Write index s to hout as an XML element named tag.  The element
+ * carries the index's own properties as attributes and contains three
+ * child elements: <sects> (one "sect" per index section map entry),
+ * <amaps> (one "amap" per arena map entry) and <arenas> (one "arena"
+ * per arena).  indent is the nesting level of the outer element.
+ */
+void
+xmlindex(Hio *hout, Index *s, char *tag, int indent)
+{
+	int i, inner, leaf;
+
+	inner = indent + 1;	/* level of <sects>, <amaps>, <arenas> */
+	leaf = indent + 2;	/* level of their children */
+
+	/* opening tag with attributes */
+	xmlindent(hout, indent);
+	hprint(hout, "<%s", tag);
+	xmlaname(hout, s->name, "name");
+	xmlu32int(hout, s->version, "version");
+	xmlu32int(hout, s->blocksize, "blocksize");
+	xmlu32int(hout, s->tabsize, "tabsize");
+	xmlu32int(hout, s->buckets, "buckets");
+	xmlu32int(hout, s->div, "buckdiv");
+	hprint(hout, ">\n");
+
+	/* index sections */
+	xmlindent(hout, inner);
+	hprint(hout, "<sects>\n");
+	for(i = 0; i < s->nsects; i++){
+		xmlamap(hout, &s->smap[i], "sect", leaf);
+	}
+	xmlindent(hout, inner);
+	hprint(hout, "</sects>\n");
+
+	/* arena address map */
+	xmlindent(hout, inner);
+	hprint(hout, "<amaps>\n");
+	for(i = 0; i < s->narenas; i++){
+		xmlamap(hout, &s->amap[i], "amap", leaf);
+	}
+	xmlindent(hout, inner);
+	hprint(hout, "</amaps>\n");
+
+	/* the arenas themselves */
+	xmlindent(hout, inner);
+	hprint(hout, "<arenas>\n");
+	for(i = 0; i < s->narenas; i++){
+		xmlarena(hout, s->arenas[i], "arena", leaf);
+	}
+	xmlindent(hout, inner);
+	hprint(hout, "</arenas>\n");
+
+	/* closing tag */
+	xmlindent(hout, indent);
+	hprint(hout, "</%s>\n", tag);
+}
+
+/*
+ * Write address map entry s to hout as one self-closing XML element
+ * named tag with its name, start and stop, at nesting level indent.
+ */
+void
+xmlamap(Hio *hout, AMap *s, char *tag, int indent)
+{
+	xmlindent(hout, indent);
+	hprint(hout, "<%s", tag);
+
+	xmlaname(hout, s->name, "name");
+	xmlu64int(hout, s->start, "start");
+	xmlu64int(hout, s->stop, "stop");
+
+	hprint(hout, "/>\n");
+}
+
diff --git a/sys/src/cmd/venti/srv/xml.h b/sys/src/cmd/venti/srv/xml.h
new file mode 100755
index 000000000..c9e52b0bb
--- /dev/null
+++ b/sys/src/cmd/venti/srv/xml.h
@@ -0,0 +1,11 @@
+/*
+ * Element writers (defined in xml.c): write an XML element named tag
+ * describing v to hout, at nesting level indent.
+ */
+void xmlamap(Hio *hout, AMap *v, char *tag, int indent);
+void xmlarena(Hio *hout, Arena *v, char *tag, int indent);
+void xmlindex(Hio *hout, Index *v, char *tag, int indent);
+
+/*
+ * Value writers: the element writers call these between writing
+ * "<tag" and the closing ">" or "/>", passing a value and a name.
+ * NOTE(review): presumably each emits one name="value" attribute;
+ * the definitions are not in xml.c -- confirm.
+ */
+void xmlaname(Hio *hout, char *v, char *tag);
+void xmlscore(Hio *hout, u8int *v, char *tag);
+void xmlsealed(Hio *hout, int v, char *tag);
+void xmlu32int(Hio *hout, u32int v, char *tag);
+void xmlu64int(Hio *hout, u64int v, char *tag);
+
+/*
+ * Called with the nesting level before each element tag is written.
+ * NOTE(review): presumably emits the leading whitespace -- confirm.
+ */
+void xmlindent(Hio *hout, int indent);
diff --git a/sys/src/cmd/venti/srv/zblock.c b/sys/src/cmd/venti/srv/zblock.c
new file mode 100755
index 000000000..afff08010
--- /dev/null
+++ b/sys/src/cmd/venti/srv/zblock.c
@@ -0,0 +1,96 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * Reset f and point it at b's data: the start and current write
+ * positions are set to b->data and the stop position to b->len bytes
+ * past it.
+ */
+void
+fmtzbinit(Fmt *f, ZBlock *b)
+{
+	memset(f, 0, sizeof(*f));
+#ifdef PLAN9PORT
+	fmtlocaleinit(f, nil, nil, nil);
+#endif
+	/* output window is exactly the block's len bytes */
+	f->start = b->data;
+	f->stop = (char*)f->start + b->len;
+	/* nothing written yet */
+	f->to = f->start;
+}
+
+/*
+ * Round pointer p up to the next multiple of n.  The mask arithmetic
+ * is only correct when n is a power of two.
+ */
+#define ROUNDUP(p, n) ((void*)(((uintptr)(p)+(n)-1)&~(uintptr)((n)-1)))
+
+/* number of guard bytes that alloczblock places right after a block's data */
+enum {
+ OverflowCheck = 32
+};
+/*
+ * Guard pattern: alloczblock copies its first OverflowCheck bytes
+ * after the data and freezblock aborts if they have changed.
+ */
+static char zmagic[] = "1234567890abcdefghijklmnopqrstuvxyz";
+
+/*
+ * Allocate a ZBlock with room for size bytes of data.
+ *
+ * One malloc holds everything, laid out as
+ *	[padding] data[size] guard[OverflowCheck] [padding] ZBlock
+ * with data aligned to blocksize (32 if blocksize is 0) and the
+ * ZBlock header aligned to 8 bytes.  ROUNDUP needs blocksize to be
+ * a power of two.  The data is cleared only when zeroed is non-zero.
+ *
+ * Returns nil, after calling seterr, if malloc fails.
+ */
+ZBlock *
+alloczblock(u32int size, int zeroed, uint blocksize)
+{
+	static ZBlock zero;
+	uchar *raw, *payload;
+	ZBlock *zb;
+	int total;
+
+	/* try for cache line alignment */
+	if(blocksize == 0)
+		blocksize = 32;
+
+	/* data + guard + header, plus slack for both alignments */
+	total = size+OverflowCheck+sizeof(ZBlock)+blocksize+8;
+	raw = malloc(total);
+	if(raw == nil){
+		seterr(EOk, "out of memory");
+		return nil;
+	}
+
+	payload = ROUNDUP(raw, blocksize);
+	zb = ROUNDUP(payload+size+OverflowCheck, 8);
+	if(0) fprint(2, "alloc %p-%p data %p-%p b %p-%p\n",
+		raw, raw+total, payload, payload+size, zb, zb+1);
+
+	*zb = zero;
+	zb->data = payload;
+	zb->free = raw;		/* what freezblock hands back to free */
+	zb->len = size;
+	zb->_size = size;	/* freezblock finds the guard at data+_size */
+
+	if(zeroed)
+		memset(payload, 0, size);
+	/* plant the guard bytes directly after the data */
+	memmove(payload+size, zmagic, OverflowCheck);
+	return zb;
+}
+
+/*
+ * Release a block obtained from alloczblock; a nil b is ignored.
+ * Aborts if the guard bytes after the data no longer match zmagic.
+ */
+void
+freezblock(ZBlock *b)
+{
+	if(b == nil)
+		return;
+
+	/* a changed guard means something wrote past the end of the data */
+	if(memcmp(b->data+b->_size, zmagic, OverflowCheck) != 0)
+		abort();
+
+	/*
+	 * Wipe the guard before freeing.
+	 * NOTE(review): presumably so that freeing the same block twice
+	 * trips the check above -- confirm.
+	 */
+	memset(b->data+b->_size, 0, OverflowCheck);
+
+	/* the ZBlock header lives inside this allocation, so b goes too */
+	free(b->free);
+}
+
+/*
+ * Copy the first size bytes of packet p into a newly allocated ZBlock.
+ * Returns nil if p is nil, if the allocation fails, or if packetcopy
+ * reports an error (the new block is freed in that case).
+ */
+ZBlock*
+packet2zblock(Packet *p, u32int size)
+{
+	ZBlock *zb;
+
+	zb = nil;
+	if(p != nil)
+		zb = alloczblock(size, 0, 0);
+	if(zb != nil && packetcopy(p, zb->data, 0, size) < 0){
+		freezblock(zb);
+		zb = nil;
+	}
+	return zb;
+}
+
+/*
+ * Build a new packet and append the first size bytes of zb's data
+ * to it.  Returns nil if zb is nil.
+ */
+Packet*
+zblock2packet(ZBlock *zb, u32int size)
+{
+	Packet *pkt;
+
+	pkt = nil;
+	if(zb != nil){
+		pkt = packetalloc();
+		packetappend(pkt, zb->data, size);
+	}
+	return pkt;
+}
+
diff --git a/sys/src/cmd/venti/srv/zeropart.c b/sys/src/cmd/venti/srv/zeropart.c
new file mode 100755
index 000000000..97d6038ee
--- /dev/null
+++ b/sys/src/cmd/venti/srv/zeropart.c
@@ -0,0 +1,30 @@
+#include "stdinc.h"
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * Overwrite partition part with zeros, starting at offset PartBlank.
+ *
+ * The bulk is written in MaxIoSize chunks; what remains is written
+ * blocksize bytes at a time.  A tail shorter than blocksize is left
+ * untouched.  blocksize is also passed to alloczblock as the alignment
+ * of the zero buffer.
+ *
+ * Any failure (buffer allocation, write, flush) is fatal.
+ */
+void
+zeropart(Part *part, int blocksize)
+{
+	ZBlock *b;
+	u64int addr;
+	int w;
+
+	fprint(2, "clearing %s\n", part->name);
+	b = alloczblock(MaxIoSize, 1, blocksize);
+	/* alloczblock returns nil when out of memory; don't dereference it */
+	if(b == nil)
+		sysfatal("can't allocate buffer to clear %s: %r", part->name);
+
+	/* w counts the large blocks written, for the error message only */
+	w = 0;
+	for(addr = PartBlank; addr + MaxIoSize <= part->size; addr += MaxIoSize){
+		if(writepart(part, addr, b->data, MaxIoSize) < 0)
+			sysfatal("can't initialize %s, writing block %d failed: %r", part->name, w);
+		w++;
+	}
+
+	/* finish with blocksize-sized writes from the same zero buffer */
+	for(; addr + blocksize <= part->size; addr += blocksize)
+		if(writepart(part, addr, b->data, blocksize) < 0)
+			sysfatal("can't initialize %s: %r", part->name);
+
+	if(flushpart(part) < 0)
+		sysfatal("can't flush writes to %s: %r", part->name);
+
+	freezblock(b);
+}
diff --git a/sys/src/cmd/venti/sync.c b/sys/src/cmd/venti/sync.c
new file mode 100755
index 000000000..9d817a72e
--- /dev/null
+++ b/sys/src/cmd/venti/sync.c
@@ -0,0 +1,54 @@
+#include <u.h>
+#include <libc.h>
+#include <thread.h>
+#include <venti.h>
+
+/* argument of -h, passed to vtdial; stays nil when -h is not given */
+char *host;
+/* set by -x: connect and hang up without calling vtsync */
+int donothing;
+
+/*
+ * Print the command synopsis on standard error and exit with status
+ * "usage".  The synopsis lists -x as well, since threadmain accepts it.
+ */
+void
+usage(void)
+{
+	fprint(2, "usage: sync [-x] [-h host]\n");
+	threadexitsall("usage");
+}
+
+/*
+ * sync: connect to a venti server and call vtsync on the connection.
+ *
+ * Options:
+ *	-h host	server to dial (passed to vtdial)
+ *	-x	connect and hang up without calling vtsync
+ *
+ * Any other option or any operand prints the usage message.  Failure
+ * to dial, connect or sync is fatal.
+ */
+void
+threadmain(int argc, char *argv[])
+{
+	VtConn *z;
+
+	fmtinstall('V', vtscorefmt);
+	fmtinstall('F', vtfcallfmt);
+
+	ARGBEGIN{
+	case 'h':
+		/*
+		 * EARGF runs usage(), which exits, when the argument is
+		 * missing, so no separate nil check is needed here.
+		 */
+		host = EARGF(usage());
+		break;
+	case 'x':
+		donothing = 1;
+		break;
+	default:
+		usage();
+		break;
+	}ARGEND
+
+	if(argc != 0)
+		usage();
+
+	z = vtdial(host);
+	if(z == nil)
+		sysfatal("could not connect to server: %r");
+
+	if(vtconnect(z) < 0)
+		sysfatal("vtconnect: %r");
+
+	if(!donothing)
+		if(vtsync(z) < 0)
+			sysfatal("vtsync: %r");
+
+	vthangup(z);
+	threadexitsall(0);
+}
diff --git a/sys/src/cmd/venti/words/backup.example b/sys/src/cmd/venti/words/backup.example
new file mode 100755
index 000000000..5d0f49e3c
--- /dev/null
+++ b/sys/src/cmd/venti/words/backup.example
@@ -0,0 +1,26 @@
+#!/bin/rc
+
+# work on a private copy of the environment (rc rfork flag e)
+rfork e
+cd /usr/rsc
+# bkup.info is sourced for its variable settings; x appends new ones to it
+. bkup.info
+# x name offset file
+#	run venti/wrarena on one arena and remember how far it got.
+#	name is used as the name of the variable holding the last end
+#	offset; offset and file are passed to wrarena as -o offset file.
+fn x {
+ echo x $*
+ # y is the name of the per-arena variable; $$y is its value
+ y=$1
+ # no saved value for this arena yet: start from 0
+ if(~ $#$y 0){
+ $y=0
+ }
+ # show the command, then run it and pick the number out of
+ # the "end offset " line of its output
+ echo venti/wrarena -o $2 $3 $$y
+ end=`{venti/wrarena -o $2 $3 $$y | grep '^end offset ' | sed 's/^end offset //'}
+ # exactly one end offset was found and it differs from the saved
+ # one: store it and append a dated comment plus the new variable
+ # definition to bkup.info
+ if(~ $#end 1 && ! ~ $$y $end){
+ $y=$end
+ echo '#' `{date} >>bkup.info
+ whatis $y >>bkup.info
+ }
+}
+# Fetch the index page from the local venti http server and turn every
+# indented arena= line into a call "x name start file" that rc executes:
+# name comes from the preceding arena= line, start is the number in the
+# fifth field minus the index block size, and file is the third field.
+# start is printed with %.0f rather than %d so that byte offsets too
+# large for an integer conversion are not truncated (the dumpvacroots
+# script formats the same value the same way).
+hget http://127.1:8000/index |
+awk '
+/^index=/ { blockSize=0+substr($3, 11); }
+/^arena=/ { arena=substr($1, 7); }
+/^ arena=/ { start=0+substr($5, 2)-blockSize; printf("x %s %.0f %s\n", arena, start, $3); }
+' |rc
+
diff --git a/sys/src/cmd/venti/words/dumpvacroots b/sys/src/cmd/venti/words/dumpvacroots
new file mode 100755
index 000000000..0b38172d5
--- /dev/null
+++ b/sys/src/cmd/venti/words/dumpvacroots
@@ -0,0 +1,21 @@
+#!/bin/rc
+# dumpvacroots - dumps all the vac scores ever stored to the venti server
+# if nothing else, this illustrates that you have to control access
+# to the physical disks storing the archive!
+
+# Derive the address of the venti http server from $venti.
+# The first sed command reduces proto!a.b.c.d!port to a.b.c.d, the
+# second reduces proto!a.b.c.d to a.b.c.d, and the third appends :8000.
+# Both patterns only match numeric addresses ([0-9\.]+).
+ventihttp=`{
+ echo $venti | sed 's/^[a-z]+!([0-9\.]+)![a-z0-9]+$/\1/
+ s/^[a-z]+!([0-9\.]+)/\1/; s/$/:8000/'
+}
+
+# Read the index page; for every indented arena= line generate a
+# venti/printarena command whose -o offset is the number in the fifth
+# field minus the index block size and whose argument is the third
+# field.  rc runs the generated commands, and the final awk prints
+# vac:<second field> for every output line whose third field is 16.
+# NOTE(review): 16 is presumably the venti type of a root block, with
+# the second field being its score -- confirm against printarena output.
+hget http://$ventihttp/index |
+ awk '
+ /^index=/ { blockSize = 0 + substr($3, 11) }
+ /^arena=/ { arena = substr($1, 7) }
+ /^ arena=/ {
+ start = (0 + substr($5, 2)) - blockSize
+ printf("venti/printarena -o %.0f %s\n", start, $3 "")
+ }
+ ' |
+ rc |
+ awk '$3 == 16 { printf("vac:%s\n", $2 "") }'
diff --git a/sys/src/cmd/venti/words/notes b/sys/src/cmd/venti/words/notes
new file mode 100755
index 000000000..024fae8c0
--- /dev/null
+++ b/sys/src/cmd/venti/words/notes
@@ -0,0 +1,149 @@
+all data is big-endian on disk.
+
+arena layout:
+
+ArenaPart (first at offset PartBlank = 256kB in the disk file)
+ magic[4] 0xA9E4A5E7
+ version[4] 3
+ blockSize[4]
+ arenaBase[4] offset of first ArenaHead structure in the disk file
+
+the ArenaMap starts at the first block at offset >= PartBlank+512 bytes.
+it is a sequence of text lines
+/*
+ * amap: n '\n' amapelem * n
+ * n: u32int
+ * amapelem: name '\t' astart '\t' astop '\n'
+ * astart, astop: u64int
+ */
+
+the astart and astop are byte offsets in the disk file.
+they are the offsets to the ArenaHead and the end of the Arena block.
+
+ArenaHead
+[base points here in the C code]
+size bytes
+ Clumps
+ ClumpInfo blocks
+Arena
+
+Arena
+ magic[4] 0xF2A14EAD
+ version[4] 4
+ name[64]
+ clumps[4]
+ cclumps[4]
+ ctime[4]
+ wtime[4]
+ used[8]
+ uncsize[8]
+ sealed[1]
+ optional score[20]
+
+once sealed, a single sha1 hash is computed over every block from the
+ArenaHead to the Arena, as though
+the final score in Arena were the zeroScore. strangely,
+the tail of the Arena block (the last one) is not included in the hash
+(i.e., the unused data after the score).
+
+clumpMax = blocksize/ClumpInfoSize = blocksize/25
+dirsize = ((clumps/clumpMax)+1) * blocksize
+want used+dirsize <= size
+want cclumps <= clumps
+want uncsize+clumps*ClumpSize+blocksize < used
+want ctime <= wtime
+
+clump info is stored packed into blocks in order.
+clump info moves forward through a block but the
+blocks themselves move backwards. so if cm=clumpMax
+and there are two blocks worth of clumpinfo, the blocks
+look like:
+
+ [cm..2*cm-1] [0..cm-1] [Arena]
+
+with the blocks pushed right up against the Arena trailer.
+
+ArenaHead
+ magic[4] 0xD15C4EAD
+ version[4] = Arena.version
+ name[64]
+ blockSize[4]
+ size[8]
+
+Clump
+ magic[4] 0xD15CB10C (0 for an unused clump)
+ type[1]
+ size[2]
+ uncsize[2]
+ score[20]
+ encoding[1] raw=1, compress=2
+ creator[4]
+ time[4]
+
+ClumpInfo
+ type[1]
+ size[2]
+ uncsize[2]
+ score[20]
+
+the arenas are mapped into a single address space corresponding
+to the index that brings them together. if each arena has 100M bytes
+excluding the headers and there are 4 arenas, then there's 400M of
+index address space between them. index address space starts at 1M
+instead of 0, so the index addresses assigned to the first arena are
+1M up to 101M, then 101M to 201M, etc.
+
+of course, the assignment of addresses has nothing to do with the index,
+but that's what they're called.
+
+
+the index is split into index sections, which are put on different disks
+to get parallelism of disk heads. each index section holds some number
+of hash buckets, each in its own disk block. collectively the index sections
+hold ix->buckets between them.
+
+the top 32 bits of the score are used to assign scores to buckets.
+div = ceil(2³² / ix->buckets) is the amount of 32-bit score space per bucket.
+
+to look up a block, take the top 32 bits of score and divide by div
+to get the bucket number. then look through the index section headers
+to figure out which index section has that bucket.
+
+then load that block from the index section. it's an IBucket.
+
+the IBucket has ib.n IEntry structures in it, sorted by score and then by type.
+do the lookup and get an IEntry. the ia.addr will be a logical address
+(an address in the index address space described above)
+that you then use to get the clump from the arenas.
+
+ISect
+ magic[4] 0xD15C5EC7
+ version[4]
+ name[64]
+ index[64]
+ blockSize[4]
+ blockBase[4] address in partition where bucket blocks start
+ blocks[4]
+ start[4]
+ stop[4] stop - start <= blocks, but not necessarily ==
+
+IEntry
+ score[20]
+ wtime[4]
+ train[2]
+ ia.addr[8] index address (see note above)
+ ia.size[2] size of uncompressed block data
+ ia.type[1]
+ ia.blocks[1] number of blocks of clump on disk
+
+IBucket
+ n[2]
+ next[4] not sure; either 0 or inside [start,stop) for the ISect
+ data[n*IEntrySize]
+
+final piece: all the disk partitions start with PartBlank=256kB of unused disk
+(presumably to avoid problems with boot sectors and layout tables
+and the like).
+
+actually the last 8k of the 256k (that is, at offset 248kB) can hold
+a venti config file to help during bootstrap of the venti file server.
+
diff --git a/sys/src/cmd/venti/words/venti.conf b/sys/src/cmd/venti/words/venti.conf
new file mode 100755
index 000000000..03775ea52
--- /dev/null
+++ b/sys/src/cmd/venti/words/venti.conf
@@ -0,0 +1,20 @@
+# a sample venti configuration file
+#
+# formatted with
+# venti/fmtarenas arena. /tmp/disks/arenas
+# venti/fmtisect isect0 /tmp/disks/isect0
+# venti/fmtisect isect1 /tmp/disks/isect1
+# venti/fmtindex venti.conf
+#
+# server is started with
+# venti/venti
+
+# the name of the index
+index main
+
+# the index sections
+isect /tmp/disks/isect0
+isect /tmp/disks/isect1
+
+# the arenas
+arenas /tmp/disks/arenas
diff --git a/sys/src/cmd/venti/words/wrtape b/sys/src/cmd/venti/words/wrtape
new file mode 100755
index 000000000..7e9490ede
--- /dev/null
+++ b/sys/src/cmd/venti/words/wrtape
@@ -0,0 +1,21 @@
+#!/bin/rc
+
+# usage: wrtape n
+# Write the n-th group of 32 arenas (n counts from 0) to the tape
+# drive driven through scuzz on /dev/sd03.
+tape=$1
+
+# line numbers, within the list of arena lines, of the first and last
+# arena of this group: n*32+1 through n*32+32
+start=`{echo $tape'*32+1' | hoc}
+end=`{echo $start'+31' | hoc}
+
+echo rewind | scuzz /dev/sd03
+
+# names of the arenas in this group: take lines start..end of the
+# ^arena lines on the index page, keeping the text after the last =
+# up to the first space
+arenas=`{hget http://iolaire/index | grep '^arena' | sed -n $start,$end^p | sed 's/^.*=//' | sed 's/ .*//'}
+for(i in $arenas) {
+ # the word following " on " on the index line that mentions the
+ # quoted arena name
+ dev=`{hget http://iolaire/index | grep ''''$i'''' | sed 's/.* on //' | sed 's/ .*//'}
+ # report progress on the terminal and in the backup log
+ echo `{date} $tape $i
+ echo `{date} $tape $i >> /sys/log/ventibackup
+ echo $dev
+ # send scuzz a write command whose source is the pipe
+ # "venti/rdarena $dev $i", then a filemark command
+ echo write '''|venti/rdarena $dev $i''' | scuzz -m 8192 /dev/sd03
+ echo filemark | scuzz -m 6144 /dev/sd03
+}
+
+echo rewind | scuzz /dev/sd03
+
diff --git a/sys/src/cmd/venti/write.c b/sys/src/cmd/venti/write.c
new file mode 100755
index 000000000..c11a5a314
--- /dev/null
+++ b/sys/src/cmd/venti/write.c
@@ -0,0 +1,62 @@
+#include <u.h>
+#include <libc.h>
+#include <venti.h>
+#include <libsec.h>
+#include <thread.h>
+
+/*
+ * Print the command synopsis on standard error and exit with
+ * status "usage".
+ */
+void
+usage(void)
+{
+ fprint(2, "usage: write [-z] [-h host] [-t type] <datablock\n");
+ threadexitsall("usage");
+}
+
+/*
+ * write: read one block from standard input, store it on a venti
+ * server with vtwrite, and print the resulting score.
+ *
+ * Options:
+ *	-z	pass the block through vtzerotruncate before writing
+ *	-h host	server to dial (passed to vtdial; nil when not given)
+ *	-t type	block type as a decimal number (default VtDataType)
+ *
+ * A read error, input longer than VtMaxLumpSize, and any venti
+ * failure are fatal.
+ */
+void
+threadmain(int argc, char *argv[])
+{
+	char *host;
+	int dotrunc, n, type;
+	uchar *p, score[VtScoreSize];
+	VtConn *z;
+
+	fmtinstall('F', vtfcallfmt);
+	fmtinstall('V', vtscorefmt);
+
+	host = nil;
+	dotrunc = 0;
+	type = VtDataType;
+	ARGBEGIN{
+	case 'z':
+		dotrunc = 1;
+		break;
+	case 'h':
+		host = EARGF(usage());
+		break;
+	case 't':
+		type = atoi(EARGF(usage()));
+		break;
+	default:
+		usage();
+		break;
+	}ARGEND
+
+	if(argc != 0)
+		usage();
+
+	/* ask for one byte more than the maximum so oversized input shows up */
+	p = vtmallocz(VtMaxLumpSize+1);
+	n = readn(0, p, VtMaxLumpSize+1);
+	/* a failed read must not reach vtwrite as a negative length */
+	if(n < 0)
+		sysfatal("read: %r");
+	if(n > VtMaxLumpSize)
+		sysfatal("input too big: max block size is %d", VtMaxLumpSize);
+	z = vtdial(host);
+	if(z == nil)
+		sysfatal("could not connect to server: %r");
+	if(vtconnect(z) < 0)
+		sysfatal("vtconnect: %r");
+	if(dotrunc)
+		n = vtzerotruncate(type, p, n);
+	if(vtwrite(z, score, type, p, n) < 0)
+		sysfatal("vtwrite: %r");
+	vthangup(z);
+	print("%V\n", score);
+	threadexitsall(0);
+}