summary | refs | log | tree | commit | diff
path: root/sys/src/cmd/spell/pcode.c
diff options
context:
space:
mode:
author: Taru Karttunen <taruti@taruti.net> 2011-03-30 15:46:40 +0300
committer: Taru Karttunen <taruti@taruti.net> 2011-03-30 15:46:40 +0300
commit: e5888a1ffdae813d7575f5fb02275c6bb07e5199 (patch)
tree: d8d51eac403f07814b9e936eed0c9a79195e2450 /sys/src/cmd/spell/pcode.c
Import sources from 2011-03-30 iso image
Diffstat (limited to 'sys/src/cmd/spell/pcode.c')
-rwxr-xr-x sys/src/cmd/spell/pcode.c 336
1 files changed, 336 insertions, 0 deletions
diff --git a/sys/src/cmd/spell/pcode.c b/sys/src/cmd/spell/pcode.c
new file mode 100755
index 000000000..1d058a3d1
--- /dev/null
+++ b/sys/src/cmd/spell/pcode.c
@@ -0,0 +1,336 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include <ctype.h>
+#include "code.h"
+
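+/*
+ * Read an annotated spelling list with lines of the form
+ *	word <tab> affixcode [ , affixcode ] ...
+ * and write a re-encoded version to standard output.
+ * This comment originally gave the output form as
+ *	octal <tab> word
+ * but the code below writes the binary layout described in the
+ * comment above pdict().
+ */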
+
+/*
+ * One dictionary entry: a word and the index of its affix-code set.
+ */
+typedef struct Dict Dict;
+struct Dict
+{
+ char* word;	/* NUL-terminated word; readinput points this into space[] */
+ int encode;	/* value returned by typecode(): an index into encodes[] */
+};
+
+/*
+ * Program-wide state.  The arrays are fixed-size; readinput exits with an
+ * error when words[] or space[] fills up.
+ */
+Dict words[200000];	/* dictionary entries, filled by readinput and sorted in main */
+char space[500000];	/* storage for the strings that words[].word point into */
+long encodes[4094];	/* distinct OR-ed affix bit sets; typecode returns indices into this */
+long nspace;	/* number of bytes of space[] in use */
+long nwords;	/* number of entries of words[] in use */
+int ncodes;	/* number of entries of encodes[] in use */
+Biobuf bout;	/* buffered output; main initialises it on file descriptor 1 */
+
+void readinput(int f);	/* load one input file into words[]/space[] */
+long typecode(char *str);	/* map a comma-separated code list to an encodes[] index */
+int wcmp(void*, void*);	/* qsort comparison on Dict.word */
+void pdict(void);	/* write the encoded dictionary to bout */
+void sput(int);	/* write the low 16 bits of the argument, high byte first */
+
+/*
+ * Entry point.  Reads the word list from each file named on the command
+ * line (or from standard input when none is given), reports the table
+ * sizes on file descriptor 2, sorts the words and writes the encoded
+ * dictionary to file descriptor 1 through bout.
+ *
+ * Exits with status "open" if an input file cannot be opened.
+ */
+void
+main(int argc, char *argv[])
+{
+	int f, i;
+
+	Binit(&bout, 1, OWRITE);
+	nwords = 0;
+	nspace = 0;
+	ncodes = 0;
+
+	if(argc <= 1)
+		readinput(0);
+	for(i = 1; i < argc; i++) {
+		f = open(argv[i], OREAD);
+		if(f < 0) {
+			fprint(2, "Cannot open %s\n", argv[i]);
+			exits("open");
+		}
+		readinput(f);
+		/* readinput does not close f itself, so release the descriptor here */
+		close(f);
+	}
+
+	fprint(2, "words = %ld; space = %ld; codes = %d\n",
+		nwords, nspace, ncodes);
+	qsort(words, nwords, sizeof(words[0]), wcmp);
+	pdict();
+	exits(0);
+}
+
+/*
+ * qsort comparison function: orders two Dict entries by their word
+ * strings.  Returns a value less than, equal to or greater than zero,
+ * exactly as strcmp does for the two words.
+ */
+int
+wcmp(void *a, void *b)
+{
+	Dict *da, *db;
+
+	da = a;
+	db = b;
+	return strcmp(da->word, db->word);
+}
+
+/*
+ * Read the word list on file descriptor f and append its entries to
+ * words[] and space[].
+ *
+ * Each line is split at white space: the first field is the word, which
+ * is copied into space[]; the text after it is handed to typecode() and
+ * the result is stored as the word's encode value.  Lines that contain
+ * no word are skipped.
+ *
+ * Exits with status "space" or "words" when the corresponding array
+ * would become full.  This function does not call close on f.
+ */
+void
+readinput(int f)
+{
+	long len;
+	char *code, *line, *bword;
+	Biobuf buf;
+
+	Binit(&buf, f, OREAD);
+	while((line = Brdline(&buf, '\n')) != 0) {
+		/* overwrite the trailing newline with a terminator */
+		line[Blinelen(&buf)-1] = 0;
+
+		/* bword..code brackets the first white-space-delimited field */
+		bword = line;
+		while(isspace(*bword))
+			bword++;
+		code = bword;
+		while(*code && !isspace(*code))
+			code++;
+		len = code - bword;
+
+		/* a blank line has no word; skip it rather than record an empty entry */
+		if(len == 0)
+			continue;
+
+		/* check before copying so a long word cannot run past the end of space[] */
+		if(nspace + len + 1 >= sizeof(space)/sizeof(space[0])) {
+			fprint(2, "space array too small\n");
+			exits("space");
+		}
+		memmove(space+nspace, bword, len);
+		words[nwords].word = space+nspace;
+		nspace += len;
+		space[nspace++] = 0;
+
+		/* cut the line after the word and step to the start of the code field */
+		if(*code) {
+			*code++ = 0;
+			while(isspace(*code))
+				code++;
+		}
+		words[nwords].encode = typecode(code);
+		nwords++;
+		if(nwords >= sizeof(words)/sizeof(words[0])) {
+			fprint(2, "words array too small\n");
+			exits("words");
+		}
+	}
+	Bterm(&buf);
+}
+
+
+/*
+ * One affix code name and the bits it contributes to a word's code set.
+ */
+typedef struct Class Class;
+struct Class
+{
+ char* codename;	/* spelling of the code in the input; 0 marks the end of a table */
+ long bits;	/* OR-ed into the word's code by typecode (constants presumably from code.h) */
+};
+/*
+ * Affix code tables, one per initial letter.  Each table ends with an
+ * entry whose codename is 0; typecode stops scanning there.
+ *
+ * Entry order matters: typecode compares only as many characters as the
+ * input code has and takes the first entry that agrees, so an input that
+ * is a prefix of several names selects the earliest one listed.
+ */
+Class codea[] =
+{
+ { "a", ADJ },
+ { "adv", ADV },
+ 0
+};
+Class codec[] =
+{
+ { "comp", COMP },
+ 0
+};
+Class coded[] =
+{
+ { "d", DONT_TOUCH},
+ 0
+};
+
+Class codee[] =
+{
+ { "ed", ED },
+ { "er", ACTOR },
+ 0
+};
+
+Class codei[] =
+{
+ { "in", IN },
+ { "ion", ION },
+ 0
+};
+
+Class codem[] =
+{
+ { "man", MAN },
+ { "ms", MONO },
+ 0
+};
+
+Class coden[] =
+{
+ { "n", NOUN },
+ { "na", N_AFFIX },
+ { "nopref", NOPREF },
+ 0
+};
+
+Class codep[] =
+{
+ { "pc", PROP_COLLECT },
+ 0
+};
+Class codes[] =
+{
+ { "s", STOP },
+ 0
+};
+
+Class codev[] =
+{
+ { "v", VERB },
+ { "va", V_AFFIX },
+ { "vi", V_IRREG },
+ 0
+};
+
+Class codey[] =
+{
+ { "y", _Y },
+ 0
+};
+
+/* empty table, used for every letter that starts no affix code */
+Class codez[] =
+{
+ 0
+};
+/*
+ * Table selector: typecode indexes this array with
+ * (first character of the code - 'a'), so there is one slot per letter
+ * a..z, in alphabetical order.  Letters that start no code use codez.
+ */
+Class* codetab[] =
+{
+ codea,	/* a */
+ codez,	/* b */
+ codec,	/* c */
+ coded,	/* d */
+ codee,	/* e */
+ codez,	/* f */
+ codez,	/* g */
+ codez,	/* h */
+ codei,	/* i */
+ codez,	/* j */
+ codez,	/* k */
+ codez,	/* l */
+ codem,	/* m */
+ coden,	/* n */
+ codez,	/* o */
+ codep,	/* p */
+ codez,	/* q */
+ codez,	/* r */
+ codes,	/* s */
+ codez,	/* t */
+ codez,	/* u */
+ codev,	/* v */
+ codez,	/* w */
+ codez,	/* x */
+ codey,	/* y */
+ codez,	/* z */
+};
+
+/*
+ * Translate a comma-separated list of affix code names into an index
+ * into encodes[].
+ *
+ * Each name is looked up in the table for its first letter; the first
+ * entry whose leading characters equal the whole name is taken (so a
+ * name may be a prefix of the table spelling).  The bits of all names
+ * are OR-ed together, and the combined value is looked up in encodes[],
+ * being appended there if it has not been seen before.
+ *
+ * str: NUL-terminated code list, names separated by ',' only.
+ *
+ * Returns the encodes[] index of the combined bit set.  If a name is
+ * not recognised (including an empty name or one that does not start
+ * with a lower-case letter), a message is printed and 0 is returned
+ * without adding anything to encodes[].  Exits with status "codes" if a
+ * new bit set would not fit in encodes[] or in the 0x07ff index field
+ * of the output header written by pdict.
+ */
+long
+typecode(char *str)
+{
+	Class *p;
+	long code;
+	int n, i;
+	char *s;
+
+	code = 0;
+	for(;;) {
+		/* s ends up on the ',' or NUL that terminates the current name */
+		for(s = str; *s != 0 && *s != ','; s++)
+			;
+		n = s - str;
+
+		/*
+		 * codetab only has slots for 'a'..'z'; anything else, including
+		 * an empty name, must not be used as an index.
+		 */
+		p = 0;
+		if(*str >= 'a' && *str <= 'z')
+			for(p = codetab[*str-'a']; p->codename != 0; p++)
+				if(strncmp(str, p->codename, n) == 0)
+					break;
+		if(p == 0 || p->codename == 0) {
+			fprint(2, "Unknown affix code \"%s\"\n", str);
+			return 0;
+		}
+
+		code |= p->bits;
+		if(*s == 0)
+			break;
+		str = s+1;
+	}
+
+	/* reuse an existing slot for this bit set if there is one */
+	for(i = 0; i < ncodes; i++)
+		if(encodes[i] == code)
+			return i;
+
+	/* i == ncodes here: make sure the new slot exists and fits the header field */
+	if(i >= sizeof(encodes)/sizeof(encodes[0]) || i > 0x07ff) {
+		fprint(2, "too many distinct affix code sets\n");
+		exits("codes");
+	}
+	encodes[i] = code;
+	ncodes++;
+	return i;
+}
+
+/*
+ * Write the low 16 bits of s to bout as two bytes, high byte first.
+ */
+void
+sput(int s)
+{
+	int hi, lo;
+
+	hi = s>>8;
+	lo = s;
+	Bputc(&bout, hi);
+	Bputc(&bout, lo);
+}
+
+/*
+ * Write the low 32 bits of l to bout as four bytes, high byte first.
+ */
+void
+lput(long l)
+{
+	int shift;
+
+	for(shift = 24; shift >= 0; shift -= 8)
+		Bputc(&bout, l>>shift);
+}
+
+/*
+ * Write the encoded dictionary to bout.
+ * All numbers are written big-endian.  Layout:
+ *	struct
+ *	{
+ *		short	ncodes;
+ *		long	encodes[ncodes];
+ *		struct
+ *		{
+ *			short	encode;
+ *			char	word[*];
+ *		} words[*];
+ *	};
+ * Bits of each per-word encode header:
+ *	0x8000	flag for code word (always set)
+ *	0x7800	count of number of common bytes with previous word (at most 15)
+ *	0x07ff	index into codes array for affixes
+ * Only the bytes of a word after the shared prefix are written, and no
+ * terminator follows them.  A word equal to its predecessor is reported
+ * on file descriptor 2 but still written.  The total number of bytes
+ * produced is reported on file descriptor 2 at the end.
+ */
+void
+pdict(void)
+{
+	long w, nbytes;
+	int prefix, hdr, ch;
+	char *prev, *cur, *p, *q;
+
+	/* code table: count, then each bit set */
+	sput(ncodes);
+	for(w = 0; w < ncodes; w++)
+		lput(encodes[w]);
+	nbytes = 2 + 4*ncodes;
+
+	prev = "";
+	for(w = 0; w < nwords; w++) {
+		cur = words[w].word;
+
+		/* count the leading bytes this word shares with the previous one */
+		prefix = 0;
+		p = cur;
+		q = prev;
+		while(*p == *q) {
+			if(*p == 0) {
+				fprint(2, "identical words: %s\n", cur);
+				break;
+			}
+			p++;
+			q++;
+			prefix++;
+		}
+		/* the count field is four bits wide */
+		if(prefix > 15)
+			prefix = 15;
+
+		hdr = (1<<15) | (prefix<<11) | words[w].encode;
+		sput(hdr);
+		nbytes += 2;
+
+		/* emit only the part of the word after the shared prefix */
+		for(p = cur+prefix; (ch = *p) != 0; p++) {
+			Bputc(&bout, ch);
+			nbytes++;
+		}
+		prev = cur;
+	}
+	fprint(2, "output bytes = %ld\n", nbytes);
+}