diff options
author | Taru Karttunen <taruti@taruti.net> | 2011-03-30 15:46:40 +0300 |
---|---|---|
committer | Taru Karttunen <taruti@taruti.net> | 2011-03-30 15:46:40 +0300 |
commit | e5888a1ffdae813d7575f5fb02275c6bb07e5199 (patch) | |
tree | d8d51eac403f07814b9e936eed0c9a79195e2450 /sys/src/cmd/spell/pcode.c |
Import sources from 2011-03-30 iso image
Diffstat (limited to 'sys/src/cmd/spell/pcode.c')
-rwxr-xr-x | sys/src/cmd/spell/pcode.c | 336 |
1 files changed, 336 insertions, 0 deletions
diff --git a/sys/src/cmd/spell/pcode.c b/sys/src/cmd/spell/pcode.c new file mode 100755 index 000000000..1d058a3d1 --- /dev/null +++ b/sys/src/cmd/spell/pcode.c @@ -0,0 +1,336 @@ +#include <u.h> +#include <libc.h> +#include <bio.h> +#include <ctype.h> +#include "code.h" + +/* read an annotated spelling list in the form + word <tab> affixcode [ , affixcode ] ... + print a reencoded version + octal <tab> word + */ + +typedef struct Dict Dict; +struct Dict +{ + char* word; + int encode; +}; + +Dict words[200000]; +char space[500000]; +long encodes[4094]; +long nspace; +long nwords; +int ncodes; +Biobuf bout; + +void readinput(int f); +long typecode(char *str); +int wcmp(void*, void*); +void pdict(void); +void sput(int); + +void +main(int argc, char *argv[]) +{ + int f; + + Binit(&bout, 1, OWRITE); + nwords = 0; + nspace = 0; + ncodes = 0; + if(argc <= 1) + readinput(0); + while(argc > 1) { + f = open(argv[1], 0); + if(f < 0) { + fprint(2, "Cannot open %s\n", argv[1]); + exits("open"); + } + readinput(f); + argc--; + argv++; + } + fprint(2, "words = %ld; space = %ld; codes = %d\n", + nwords, nspace, ncodes); + qsort(words, nwords, sizeof(words[0]), wcmp); + pdict(); + exits(0); +} + +wcmp(void *a, void *b) +{ + + return strcmp(((Dict*)a)->word, ((Dict*)b)->word); +} + +void +readinput(int f) +{ + long i; + char *code, *line, *bword; + Biobuf buf; + long lineno = 0; + + Binit(&buf, f, OREAD); + while(line = Brdline(&buf, '\n')) { + line[Blinelen(&buf)-1] = 0; + lineno++; + code = line; + while(isspace(*code)) + code++; + bword = code; + while(*code && !isspace(*code)) + code++; + + i = code-bword; + memmove(space+nspace, bword, i); + words[nwords].word = space+nspace; + nspace += i; + space[nspace] = 0; + nspace++; + + if(*code) { + *code++ = 0; + while(isspace(*code)) + code++; + } + words[nwords].encode = typecode(code); + nwords++; + if(nwords >= sizeof(words)/sizeof(words[0])) { + fprint(2, "words array too small\n"); + exits("words"); + } + if(nspace >= sizeof(space)/sizeof(space[0])) { + fprint(2, "space array too small\n"); + exits("space"); + } + } + Bterm(&buf); +} + + +typedef struct Class Class; +struct Class +{ + char* codename; + long bits; +}; +Class codea[] = +{ + { "a", ADJ }, + { "adv", ADV }, + 0 +}; +Class codec[] = +{ + { "comp", COMP }, + 0 +}; +Class coded[] = +{ + { "d", DONT_TOUCH}, + 0 +}; + +Class codee[] = +{ + { "ed", ED }, + { "er", ACTOR }, + 0 +}; + +Class codei[] = +{ + { "in", IN }, + { "ion", ION }, + 0 +}; + +Class codem[] = +{ + { "man", MAN }, + { "ms", MONO }, + 0 +}; + +Class coden[] = +{ + { "n", NOUN }, + { "na", N_AFFIX }, + { "nopref", NOPREF }, + 0 +}; + +Class codep[] = +{ + { "pc", PROP_COLLECT }, + 0 +}; +Class codes[] = +{ + { "s", STOP }, + 0 +}; + +Class codev[] = +{ + { "v", VERB }, + { "va", V_AFFIX }, + { "vi", V_IRREG }, + 0 +}; + +Class codey[] = +{ + { "y", _Y }, + 0 +}; + +Class codez[] = +{ + 0 +}; +Class* codetab[] = +{ + codea, + codez, + codec, + coded, + codee, + codez, + codez, + codez, + codei, + codez, + codez, + codez, + codem, + coden, + codez, + codep, + codez, + codez, + codes, + codez, + codez, + codev, + codez, + codez, + codey, + codez, +}; + +long +typecode(char *str) +{ + Class *p; + long code; + int n, i; + char *s, *sp, *st; + + code = 0; + +loop: + for(s=str; *s != 0 && *s != ','; s++) + ; + for(p = codetab[*str-'a']; sp = p->codename; p++) { + st = str; + for(n=s-str;; st++,sp++) { + if(*st != *sp) + goto cont; + n--; + if(n == 0) + break; + } + code |= p->bits; + if(*s == 0) + goto out; + str = s+1; + goto loop; + cont:; + } + fprint(2, "Unknown affix code \"%s\"\n", str); + return 0; +out: + for(i=0; i<ncodes; i++) + if(encodes[i] == code) + return i; + encodes[i] = code; + ncodes++; + return i; +} + +void +sput(int s) +{ + + Bputc(&bout, s>>8); + Bputc(&bout, s); +} + +void +lput(long l) +{ + Bputc(&bout, l>>24); + Bputc(&bout, l>>16); + Bputc(&bout, l>>8); + Bputc(&bout, l); +} + +/* + * spit out the encoded dictionary + * all numbers are encoded big-endian. + * struct + * { + * short ncodes; + * long encodes[ncodes]; + * struct + * { + * short encode; + * char word[*]; + * } words[*]; + * }; + * 0x8000 flag for code word + * 0x7800 count of number of common bytes with previous word + * 0x07ff index into codes array for affixes + */ +void +pdict(void) +{ + long i, count; + int encode, j, c; + char *lastword, *thisword, *word; + + sput(ncodes); + for(i=0; i<ncodes; i++) + lput(encodes[i]); + + count = ncodes*4 + 2; + lastword = ""; + for(i=0; i<nwords; i++) { + word = words[i].word; + thisword = word; + for(j=0; *thisword == *lastword; j++) { + if(*thisword == 0) { + fprint(2, "identical words: %s\n", word); + break; + } + thisword++; + lastword++; + } + if(j > 15) + j = 15; + encode = words[i].encode; + c = (1<<15) | (j<<11) | encode; + sput(c); + count += 2; + for(thisword=word+j; c = *thisword; thisword++) { + Bputc(&bout, c); + count++; + } + lastword = word; + } + fprint(2, "output bytes = %ld\n", count); +} |