diff options
author | Taru Karttunen <taruti@taruti.net> | 2011-03-30 15:46:40 +0300 |
---|---|---|
committer | Taru Karttunen <taruti@taruti.net> | 2011-03-30 15:46:40 +0300 |
commit | e5888a1ffdae813d7575f5fb02275c6bb07e5199 (patch) | |
tree | d8d51eac403f07814b9e936eed0c9a79195e2450 /sys/src/cmd/dict/utils.c |
Import sources from 2011-03-30 iso image
Diffstat (limited to 'sys/src/cmd/dict/utils.c')
-rwxr-xr-x | sys/src/cmd/dict/utils.c | 580 |
1 files changed, 580 insertions, 0 deletions
diff --git a/sys/src/cmd/dict/utils.c b/sys/src/cmd/dict/utils.c new file mode 100755 index 000000000..d13a71333 --- /dev/null +++ b/sys/src/cmd/dict/utils.c @@ -0,0 +1,580 @@ +#include <u.h> +#include <libc.h> +#include <bio.h> +#include "dict.h" + +Dict dicts[] = { + {"oed", "Oxford English Dictionary, 2nd Ed.", + "/lib/dict/oed2", "/lib/dict/oed2index", + oednextoff, oedprintentry, oedprintkey}, + {"ahd", "American Heritage Dictionary, 2nd College Ed.", + "/lib/ahd/DICT.DB", "/lib/ahd/index", + ahdnextoff, ahdprintentry, ahdprintkey}, + {"pgw", "Project Gutenberg Webster Dictionary", + "/lib/dict/pgw", "/lib/dict/pgwindex", + pgwnextoff, pgwprintentry, pgwprintkey}, + {"thesaurus", "Collins Thesaurus", + "/lib/dict/thesaurus", "/lib/dict/thesindex", + thesnextoff, thesprintentry, thesprintkey}, + {"roget", "Project Gutenberg Roget's Thesaurus", + "/lib/dict/roget", "/lib/dict/rogetindex", + rogetnextoff, rogetprintentry, rogetprintkey}, + + {"ce", "Gendai Chinese->English", + "/lib/dict/world/sansdata/sandic24.dat", + "/lib/dict/world/sansdata/ceindex", + worldnextoff, worldprintentry, worldprintkey}, + {"ceh", "Gendai Chinese->English (Hanzi index)", + "/lib/dict/world/sansdata/sandic24.dat", + "/lib/dict/world/sansdata/cehindex", + worldnextoff, worldprintentry, worldprintkey}, + {"ec", "Gendai English->Chinese", + "/lib/dict/world/sansdata/sandic24.dat", + "/lib/dict/world/sansdata/ecindex", + worldnextoff, worldprintentry, worldprintkey}, + + {"dae", "Gyldendal Danish->English", + "/lib/dict/world/gylddata/sandic30.dat", + "/lib/dict/world/gylddata/daeindex", + worldnextoff, worldprintentry, worldprintkey}, + {"eda", "Gyldendal English->Danish", + "/lib/dict/world/gylddata/sandic29.dat", + "/lib/dict/world/gylddata/edaindex", + worldnextoff, worldprintentry, worldprintkey}, + + {"due", "Wolters-Noordhoff Dutch->English", + "/lib/dict/world/woltdata/sandic07.dat", + "/lib/dict/world/woltdata/deindex", + worldnextoff, worldprintentry, worldprintkey}, + {"edu", "Wolters-Noordhoff English->Dutch", + "/lib/dict/world/woltdata/sandic06.dat", + "/lib/dict/world/woltdata/edindex", + worldnextoff, worldprintentry, worldprintkey}, + + {"fie", "WSOY Finnish->English", + "/lib/dict/world/werndata/sandic32.dat", + "/lib/dict/world/werndata/fieindex", + worldnextoff, worldprintentry, worldprintkey}, + {"efi", "WSOY English->Finnish", + "/lib/dict/world/werndata/sandic31.dat", + "/lib/dict/world/werndata/efiindex", + worldnextoff, worldprintentry, worldprintkey}, + + {"fe", "Collins French->English", + "/lib/dict/fe", "/lib/dict/feindex", + pcollnextoff, pcollprintentry, pcollprintkey}, + {"ef", "Collins English->French", + "/lib/dict/ef", "/lib/dict/efindex", + pcollnextoff, pcollprintentry, pcollprintkey}, + + {"ge", "Collins German->English", + "/lib/dict/ge", "/lib/dict/geindex", + pcollgnextoff, pcollgprintentry, pcollgprintkey}, + {"eg", "Collins English->German", + "/lib/dict/eg", "/lib/dict/egindex", + pcollgnextoff, pcollgprintentry, pcollgprintkey}, + + {"ie", "Collins Italian->English", + "/lib/dict/ie", "/lib/dict/ieindex", + pcollnextoff, pcollprintentry, pcollprintkey}, + {"ei", "Collins English->Italian", + "/lib/dict/ei", "/lib/dict/eiindex", + pcollnextoff, pcollprintentry, pcollprintkey}, + + {"je", "Sanshusha Japanese->English", + "/lib/dict/world/sansdata/sandic18.dat", + "/lib/dict/world/sansdata/jeindex", + worldnextoff, worldprintentry, worldprintkey}, + {"jek", "Sanshusha Japanese->English (Kanji index)", + "/lib/dict/world/sansdata/sandic18.dat", + "/lib/dict/world/sansdata/jekindex", + worldnextoff, worldprintentry, worldprintkey}, + {"ej", "Sanshusha English->Japanese", + "/lib/dict/world/sansdata/sandic18.dat", + "/lib/dict/world/sansdata/ejindex", + worldnextoff, worldprintentry, worldprintkey}, + + {"tjeg", "Sanshusha technical Japanese->English,German", + "/lib/dict/world/sansdata/sandic16.dat", + "/lib/dict/world/sansdata/tjegindex", + worldnextoff, worldprintentry, worldprintkey}, + {"tjegk", "Sanshusha technical Japanese->English,German (Kanji index)", + "/lib/dict/world/sansdata/sandic16.dat", + "/lib/dict/world/sansdata/tjegkindex", + worldnextoff, worldprintentry, worldprintkey}, + {"tegj", "Sanshusha technical English->German,Japanese", + "/lib/dict/world/sansdata/sandic16.dat", + "/lib/dict/world/sansdata/tegjindex", + worldnextoff, worldprintentry, worldprintkey}, + {"tgje", "Sanshusha technical German->Japanese,English", + "/lib/dict/world/sansdata/sandic16.dat", + "/lib/dict/world/sansdata/tgjeindex", + worldnextoff, worldprintentry, worldprintkey}, + + {"ne", "Kunnskapforlaget Norwegian->English", + "/lib/dict/world/kunndata/sandic28.dat", + "/lib/dict/world/kunndata/neindex", + worldnextoff, worldprintentry, worldprintkey}, + {"en", "Kunnskapforlaget English->Norwegian", + "/lib/dict/world/kunndata/sandic27.dat", + "/lib/dict/world/kunndata/enindex", + worldnextoff, worldprintentry, worldprintkey}, + + {"re", "Leon Ungier Russian->English", + "/lib/dict/re", "/lib/dict/reindex", + simplenextoff, simpleprintentry, simpleprintkey}, + {"er", "Leon Ungier English->Russian", + "/lib/dict/re", "/lib/dict/erindex", + simplenextoff, simpleprintentry, simpleprintkey}, + + {"se", "Collins Spanish->English", + "/lib/dict/se", "/lib/dict/seindex", + pcollnextoff, pcollprintentry, pcollprintkey}, + {"es", "Collins English->Spanish", + "/lib/dict/es", "/lib/dict/esindex", + pcollnextoff, pcollprintentry, pcollprintkey}, + + {"swe", "Esselte Studium Swedish->English", + "/lib/dict/world/essedata/sandic34.dat", + "/lib/dict/world/essedata/sweindex", + worldnextoff, worldprintentry, worldprintkey}, + {"esw", "Esselte Studium English->Swedish", + "/lib/dict/world/essedata/sandic33.dat", + "/lib/dict/world/essedata/eswindex", + worldnextoff, worldprintentry, worldprintkey}, + + {"movie", "Movies -- by title", + "/lib/movie/data", "/lib/dict/movtindex", + movienextoff, movieprintentry, movieprintkey}, + {"moviea", "Movies -- by actor", + "/lib/movie/data", "/lib/dict/movaindex", + movienextoff, movieprintentry, movieprintkey}, + {"movied", "Movies -- by director", + "/lib/movie/data", "/lib/dict/movdindex", + movienextoff, movieprintentry, movieprintkey}, + + {"slang", "English Slang", + "/lib/dict/slang", "/lib/dict/slangindex", + slangnextoff, slangprintentry, slangprintkey}, + + {"robert", "Robert Électronique", + "/lib/dict/robert/_pointers", "/lib/dict/robert/_index", + robertnextoff, robertindexentry, robertprintkey}, + {"robertv", "Robert Électronique - formes des verbes", + "/lib/dict/robert/flex.rob", "/lib/dict/robert/_flexindex", + robertnextflex, robertflexentry, robertprintkey}, + + {0, 0, 0, 0, 0} +}; + +typedef struct Lig Lig; +struct Lig { + Rune start; /* accent rune */ + Rune *pairs; /* <char,accented version> pairs */ +}; + +static Lig ligtab[Nligs] = { +[LACU-LIGS] {L'´', L"AÁaáCĆcćEÉeégģIÍiíıíLĹlĺNŃnńOÓoóRŔrŕSŚsśUÚuúYÝyýZŹzź"}, +[LGRV-LIGS] {L'ˋ', L"AÀaàEÈeèIÌiìıìOÒoòUÙuù"}, +[LUML-LIGS] {L'¨', L"AÄaäEËeëIÏiïOÖoöUÜuüYŸyÿ"}, +[LCED-LIGS] {L'¸', L"CÇcçGĢKĶkķLĻlļNŅnņRŖrŗSŞsşTŢtţ"}, +[LTIL-LIGS] {L'˜', L"AÃaãIĨiĩıĩNÑnñOÕoõUŨuũ"}, +[LBRV-LIGS] {L'˘', L"AĂaăEĔeĕGĞgğIĬiĭıĭOŎoŏUŬuŭ"}, +[LRNG-LIGS] {L'˚', L"AÅaåUŮuů"}, +[LDOT-LIGS] {L'˙', L"CĊcċEĖeėGĠgġIİLĿlŀZŻzż"}, +[LDTB-LIGS] {L'.', L""}, +[LFRN-LIGS] {L'⌢', L"AÂaâCĈcĉEÊeêGĜgĝHĤhĥIÎiîıîJĴjĵOÔoôSŜsŝUÛuûWŴwŵYŶyŷ"}, +[LFRB-LIGS] {L'̯', L""}, +[LOGO-LIGS] {L'˛', L"AĄaąEĘeęIĮiįıįUŲuų"}, +[LMAC-LIGS] {L'¯', L"AĀaāEĒeēIĪiīıīOŌoōUŪuū"}, +[LHCK-LIGS] {L'ˇ', L"CČcčDĎdďEĚeěLĽlľNŇnňRŘrřSŠsšTŤtťZŽzž"}, +[LASP-LIGS] {L'ʽ', L""}, +[LLEN-LIGS] {L'ʼ', L""}, +[LBRB-LIGS] {L'̮', L""} +}; + +Rune *multitab[Nmulti] = { +[MAAS-MULTI] L"ʽα", +[MALN-MULTI] L"ʼα", +[MAND-MULTI] L"and", +[MAOQ-MULTI] L"a/q", +[MBRA-MULTI] L"<|", +[MDD-MULTI] L"..", +[MDDD-MULTI] L"...", +[MEAS-MULTI] L"ʽε", +[MELN-MULTI] L"ʼε", +[MEMM-MULTI] L"——", +[MHAS-MULTI] L"ʽη", +[MHLN-MULTI] L"ʼη", +[MIAS-MULTI] L"ʽι", +[MILN-MULTI] L"ʼι", +[MLCT-MULTI] L"ct", +[MLFF-MULTI] L"ff", +[MLFFI-MULTI] L"ffi", +[MLFFL-MULTI] L"ffl", +[MLFL-MULTI] L"fl", +[MLFI-MULTI] L"fi", +[MLLS-MULTI] L"ɫɫ", +[MLST-MULTI] L"st", +[MOAS-MULTI] L"ʽο", +[MOLN-MULTI] L"ʼο", +[MOR-MULTI] L"or", +[MRAS-MULTI] L"ʽρ", +[MRLN-MULTI] L"ʼρ", +[MTT-MULTI] L"~~", +[MUAS-MULTI] L"ʽυ", +[MULN-MULTI] L"ʼυ", +[MWAS-MULTI] L"ʽω", +[MWLN-MULTI] L"ʼω", +[MOE-MULTI] L"oe", +[MES-MULTI] L" ", +}; + +#define risupper(r) (L'A' <= (r) && (r) <= L'Z') +#define rislatin1(r) (0xC0 <= (r) && (r) <= 0xFF) +#define rtolower(r) ((r)-'A'+'a') + +static Rune latin_fold_tab[] = +{ +/* Table to fold latin 1 characters to ASCII equivalents + based at Rune value 0xc0 + + À Á Â Ã Ä Å Æ Ç + È É Ê Ë Ì Í Î Ï + Ð Ñ Ò Ó Ô Õ Ö × + Ø Ù Ú Û Ü Ý Þ ß + à á â ã ä å æ ç + è é ê ë ì í î ï + ð ñ ò ó ô õ ö ÷ + ø ù ú û ü ý þ ÿ +*/ + 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', + 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i', + 'd', 'n', 'o', 'o', 'o', 'o', 'o', 0 , + 'o', 'u', 'u', 'u', 'u', 'y', 0 , 0 , + 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', + 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i', + 'd', 'n', 'o', 'o', 'o', 'o', 'o', 0 , + 'o', 'u', 'u', 'u', 'u', 'y', 0 , 'y', +}; + +static Rune *ttabstack[20]; +static int ntt; + +/* + * tab is an array of n Assoc's, sorted by key. + * Look for key in tab, and return corresponding val + * or -1 if not there + */ +long +lookassoc(Assoc *tab, int n, char *key) +{ + Assoc *q; + long i, low, high; + int r; + + for(low = -1, high = n; high > low+1; ){ + i = (high+low)/2; + q = &tab[i]; + if((r=strcmp(key, q->key))<0) + high = i; + else if(r == 0) + return q->val; + else + low=i; + } + return -1; +} + +long +looknassoc(Nassoc *tab, int n, long key) +{ + Nassoc *q; + long i, low, high; + + for(low = -1, high = n; high > low+1; ){ + i = (high+low)/2; + q = &tab[i]; + if(key < q->key) + high = i; + else if(key == q->key) + return q->val; + else + low=i; + } + return -1; +} + +void +err(char *fmt, ...) +{ + char buf[1000]; + va_list v; + + va_start(v, fmt); + vsnprint(buf, sizeof(buf), fmt, v); + va_end(v); + fprint(2, "%s: %s\n", argv0, buf); +} + +/* + * Write the rune r to bout, keeping track of line length + * and breaking the lines (at blanks) when they get too long + */ +void +outrune(long r) +{ + if(outinhibit) + return; + if(++linelen > breaklen && r == L' ') { + Bputc(bout, '\n'); + linelen = 0; + } else + Bputrune(bout, r); +} + +void +outrunes(Rune *rp) +{ + Rune r; + + while((r = *rp++) != 0) + outrune(r); +} + +/* like outrune, but when arg is know to be a char */ +void +outchar(int c) +{ + if(outinhibit) + return; + if(++linelen > breaklen && c == ' ') { + c ='\n'; + linelen = 0; + } + Bputc(bout, c); +} + +void +outchars(char *s) +{ + char c; + + while((c = *s++) != 0) + outchar(c); +} + +void +outprint(char *fmt, ...) +{ + char buf[1000]; + va_list v; + + va_start(v, fmt); + vsnprint(buf, sizeof(buf), fmt, v); + va_end(v); + outchars(buf); +} + +void +outpiece(char *b, char *e) +{ + int c, lastc; + + lastc = 0; + while(b < e) { + c = *b++; + if(c == '\n') + c = ' '; + if(!(c == ' ' && lastc == ' ')) + outchar(c); + lastc = c; + } +} + +/* + * Go to new line if not already there; indent if ind != 0. + * If ind > 1, leave a blank line too. + * Slight hack: assume if current line is only one or two + * characters long, then they were spaces. + */ +void +outnl(int ind) +{ + if(outinhibit) + return; + if(ind) { + if(ind > 1) { + if(linelen > 2) + Bputc(bout, '\n'); + Bprint(bout, "\n "); + } else if(linelen == 0) + Bprint(bout, " "); + else if(linelen == 1) + Bputc(bout, ' '); + else if(linelen != 2) + Bprint(bout, "\n "); + linelen = 2; + } else { + if(linelen) { + Bputc(bout, '\n'); + linelen = 0; + } + } +} + +/* + * Fold the runes in null-terminated rp. + * Use the sort(1) definition of folding (uppercase to lowercase, + * latin1-accented characters to corresponding unaccented chars) + */ +void +fold(Rune *rp) +{ + Rune r; + + while((r = *rp) != 0) { + if (rislatin1(r) && latin_fold_tab[r-0xc0]) + r = latin_fold_tab[r-0xc0]; + if(risupper(r)) + r = rtolower(r); + *rp++ = r; + } +} + +/* + * Like fold, but put folded result into new + * (assumed to have enough space). + * old is a regular expression, but we know that + * metacharacters aren't affected + */ +void +foldre(char *new, char *old) +{ + Rune r; + + while(*old) { + old += chartorune(&r, old); + if (rislatin1(r) && latin_fold_tab[r-0xc0]) + r = latin_fold_tab[r-0xc0]; + if(risupper(r)) + r = rtolower(r); + new += runetochar(new, &r); + } + *new = 0; +} + +/* + * acomp(s, t) returns: + * -2 if s strictly precedes t + * -1 if s is a prefix of t + * 0 if s is the same as t + * 1 if t is a prefix of s + * 2 if t strictly precedes s + */ + +int +acomp(Rune *s, Rune *t) +{ + int cs, ct; + + for(;;) { + cs = *s; + ct = *t; + if(cs != ct) + break; + if(cs == 0) + return 0; + s++; + t++; + } + if(cs == 0) + return -1; + if(ct == 0) + return 1; + if(cs < ct) + return -2; + return 2; +} + +/* + * Copy null terminated Runes from 'from' to 'to'. + */ +void +runescpy(Rune *to, Rune *from) +{ + while((*to++ = *from++) != 0) + continue; +} + +/* + * Conversion of unsigned number to long, no overflow detection + */ +long +runetol(Rune *r) +{ + int c; + long n; + + n = 0; + for(;; r++){ + c = *r; + if(L'0'<=c && c<=L'9') + c -= '0'; + else + break; + n = n*10 + c; + } + return n; +} + +/* + * See if there is a rune corresponding to the accented + * version of r with accent acc (acc in [LIGS..LIGE-1]), + * and return it if so, else return NONE. + */ +Rune +liglookup(Rune acc, Rune r) +{ + Rune *p; + + if(acc < LIGS || acc >= LIGE) + return NONE; + for(p = ligtab[acc-LIGS].pairs; *p; p += 2) + if(*p == r) + return *(p+1); + return NONE; +} + +/* + * Maintain a translation table stack (a translation table + * is an array of Runes indexed by bytes or 7-bit bytes). + * If starting is true, push the curtab onto the stack + * and return newtab; else pop the top of the stack and + * return it. + * If curtab is 0, initialize the stack and return. + */ +Rune * +changett(Rune *curtab, Rune *newtab, int starting) +{ + if(curtab == 0) { + ntt = 0; + return 0; + } + if(starting) { + if(ntt >= asize(ttabstack)) { + if(debug) + err("translation stack overflow"); + return curtab; + } + ttabstack[ntt++] = curtab; + return newtab; + } else { + if(ntt == 0) { + if(debug) + err("translation stack underflow"); + return curtab; + } + return ttabstack[--ntt]; + } +} |