Import sources from 2011-03-30 iso image

author: Taru Karttunen <taruti@taruti.net> 2011-03-30 15:46:40 +0300
committer: Taru Karttunen <taruti@taruti.net> 2011-03-30 15:46:40 +0300
commit: e5888a1ffdae813d7575f5fb02275c6bb07e5199 (patch)
tree: d8d51eac403f07814b9e936eed0c9a79195e2450 /sys/src/cmd/dict/utils.c
1 files changed, 580 insertions, 0 deletions
diff --git a/sys/src/cmd/dict/utils.c b/sys/src/cmd/dict/utils.c
new file mode 100755
index 000000000..d13a71333
--- /dev/null
+++ b/sys/src/cmd/dict/utils.c
@@ -0,0 +1,580 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include "dict.h"
+
+Dict dicts[] = {
+	{"oed",		"Oxford English Dictionary, 2nd Ed.",
+	 "/lib/dict/oed2",	"/lib/dict/oed2index",
+	 oednextoff,	oedprintentry,		oedprintkey},
+	{"ahd",		"American Heritage Dictionary, 2nd College Ed.",
+	 "/lib/ahd/DICT.DB",	"/lib/ahd/index",
+	 ahdnextoff,	ahdprintentry,		ahdprintkey},
+	{"pgw",		"Project Gutenberg Webster Dictionary",
+	 "/lib/dict/pgw",	"/lib/dict/pgwindex",
+	 pgwnextoff,	pgwprintentry,		pgwprintkey},
+	{"thesaurus",	"Collins Thesaurus",
+	 "/lib/dict/thesaurus",	"/lib/dict/thesindex",
+	 thesnextoff,	thesprintentry,	thesprintkey},
+	{"roget",		"Project Gutenberg Roget's Thesaurus",
+	 "/lib/dict/roget", "/lib/dict/rogetindex",
+	 rogetnextoff,	rogetprintentry,	rogetprintkey},
+
+	{"ce",		"Gendai Chinese->English",
+	 "/lib/dict/world/sansdata/sandic24.dat",
+	 "/lib/dict/world/sansdata/ceindex",
+	 worldnextoff,	worldprintentry,	worldprintkey},
+	{"ceh",		"Gendai Chinese->English (Hanzi index)",
+	 "/lib/dict/world/sansdata/sandic24.dat",
+	 "/lib/dict/world/sansdata/cehindex",
+	 worldnextoff,	worldprintentry,	worldprintkey},
+	{"ec",		"Gendai English->Chinese",
+	 "/lib/dict/world/sansdata/sandic24.dat",
+	 "/lib/dict/world/sansdata/ecindex",
+	 worldnextoff,	worldprintentry,	worldprintkey},
+
+	{"dae",		"Gyldendal Danish->English",
+	 "/lib/dict/world/gylddata/sandic30.dat",
+	 "/lib/dict/world/gylddata/daeindex",
+	 worldnextoff,	worldprintentry,	worldprintkey},
+	{"eda",		"Gyldendal English->Danish",
+	 "/lib/dict/world/gylddata/sandic29.dat",
+	 "/lib/dict/world/gylddata/edaindex",
+	 worldnextoff,	worldprintentry,	worldprintkey},
+
+	{"due",		"Wolters-Noordhoff Dutch->English",
+	 "/lib/dict/world/woltdata/sandic07.dat",
+	 "/lib/dict/world/woltdata/deindex",
+	 worldnextoff,	worldprintentry,	worldprintkey},
+	{"edu",		"Wolters-Noordhoff English->Dutch",
+	 "/lib/dict/world/woltdata/sandic06.dat",
+	 "/lib/dict/world/woltdata/edindex",
+	 worldnextoff,	worldprintentry,	worldprintkey},
+
+	{"fie",		"WSOY Finnish->English",
+	 "/lib/dict/world/werndata/sandic32.dat",
+	 "/lib/dict/world/werndata/fieindex",
+	 worldnextoff,	worldprintentry,	worldprintkey},
+	{"efi",		"WSOY English->Finnish",
+	 "/lib/dict/world/werndata/sandic31.dat",
+	 "/lib/dict/world/werndata/efiindex",
+	 worldnextoff,	worldprintentry,	worldprintkey},
+
+	{"fe",		"Collins French->English",
+	 "/lib/dict/fe",	"/lib/dict/feindex",
+	 pcollnextoff,	pcollprintentry,	pcollprintkey},
+	{"ef",		"Collins English->French",
+	 "/lib/dict/ef",	"/lib/dict/efindex",
+	 pcollnextoff,	pcollprintentry,	pcollprintkey},
+
+	{"ge",		"Collins German->English",
+	 "/lib/dict/ge",	"/lib/dict/geindex",
+	 pcollgnextoff,	pcollgprintentry,	pcollgprintkey},
+	{"eg",		"Collins English->German",
+	 "/lib/dict/eg",	"/lib/dict/egindex",
+	 pcollgnextoff,	pcollgprintentry,	pcollgprintkey},
+
+	{"ie",		"Collins Italian->English",
+	 "/lib/dict/ie",	"/lib/dict/ieindex",
+	 pcollnextoff,	pcollprintentry,	pcollprintkey},
+	{"ei",		"Collins English->Italian",
+	 "/lib/dict/ei",	"/lib/dict/eiindex",
+	 pcollnextoff,	pcollprintentry,	pcollprintkey},
+
+	{"je",		"Sanshusha Japanese->English",
+	 "/lib/dict/world/sansdata/sandic18.dat",
+	 "/lib/dict/world/sansdata/jeindex",
+	 worldnextoff,	worldprintentry,	worldprintkey},
+	{"jek",		"Sanshusha Japanese->English (Kanji index)",
+	 "/lib/dict/world/sansdata/sandic18.dat",
+	 "/lib/dict/world/sansdata/jekindex",
+	 worldnextoff,	worldprintentry,	worldprintkey},
+	{"ej",		"Sanshusha English->Japanese",
+	 "/lib/dict/world/sansdata/sandic18.dat",
+	 "/lib/dict/world/sansdata/ejindex",
+	 worldnextoff,	worldprintentry,	worldprintkey},
+
+	{"tjeg",	"Sanshusha technical Japanese->English,German",
+	 "/lib/dict/world/sansdata/sandic16.dat",
+	 "/lib/dict/world/sansdata/tjegindex",
+	 worldnextoff,	worldprintentry,	worldprintkey},
+	{"tjegk",	"Sanshusha technical Japanese->English,German (Kanji index)",
+	 "/lib/dict/world/sansdata/sandic16.dat",
+	 "/lib/dict/world/sansdata/tjegkindex",
+	 worldnextoff,	worldprintentry,	worldprintkey},
+	{"tegj",	"Sanshusha technical English->German,Japanese",
+	 "/lib/dict/world/sansdata/sandic16.dat",
+	 "/lib/dict/world/sansdata/tegjindex",
+	 worldnextoff,	worldprintentry,	worldprintkey},
+	{"tgje",	"Sanshusha technical German->Japanese,English",
+	 "/lib/dict/world/sansdata/sandic16.dat",
+	 "/lib/dict/world/sansdata/tgjeindex",
+	 worldnextoff,	worldprintentry,	worldprintkey},
+
+	{"ne",		"Kunnskapforlaget Norwegian->English",
+	 "/lib/dict/world/kunndata/sandic28.dat",
+	 "/lib/dict/world/kunndata/neindex",
+	 worldnextoff,	worldprintentry,	worldprintkey},
+	{"en",		"Kunnskapforlaget English->Norwegian",
+	 "/lib/dict/world/kunndata/sandic27.dat",
+	 "/lib/dict/world/kunndata/enindex",
+	 worldnextoff,	worldprintentry,	worldprintkey},
+
+	{"re",		"Leon Ungier Russian->English",
+	 "/lib/dict/re",	"/lib/dict/reindex",
+	 simplenextoff,	simpleprintentry,	simpleprintkey},
+	{"er",		"Leon Ungier English->Russian",
+	 "/lib/dict/re",	"/lib/dict/erindex",
+	 simplenextoff,	simpleprintentry,	simpleprintkey},
+
+	{"se",		"Collins Spanish->English",
+	 "/lib/dict/se",	"/lib/dict/seindex",
+	 pcollnextoff,	pcollprintentry,	pcollprintkey},
+	{"es",		"Collins English->Spanish",
+	 "/lib/dict/es",	"/lib/dict/esindex",
+	 pcollnextoff,	pcollprintentry,	pcollprintkey},
+
+	{"swe",		"Esselte Studium Swedish->English",
+	 "/lib/dict/world/essedata/sandic34.dat",
+	 "/lib/dict/world/essedata/sweindex",
+	 worldnextoff,	worldprintentry,	worldprintkey},
+	{"esw",		"Esselte Studium English->Swedish",
+	 "/lib/dict/world/essedata/sandic33.dat",
+	 "/lib/dict/world/essedata/eswindex",
+	 worldnextoff,	worldprintentry,	worldprintkey},
+
+	{"movie",	"Movies -- by title",
+	 "/lib/movie/data",	"/lib/dict/movtindex",
+	 movienextoff,	movieprintentry,	movieprintkey},
+	{"moviea",	"Movies -- by actor",
+	 "/lib/movie/data",	"/lib/dict/movaindex",
+	 movienextoff,	movieprintentry,	movieprintkey},
+	{"movied",	"Movies -- by director",
+	 "/lib/movie/data",	"/lib/dict/movdindex",
+	 movienextoff,	movieprintentry,	movieprintkey},
+
+	{"slang",	"English Slang",
+	 "/lib/dict/slang",	"/lib/dict/slangindex",
+	 slangnextoff,	slangprintentry,	slangprintkey},
+
+	{"robert",	"Robert Électronique",
+	 "/lib/dict/robert/_pointers",	"/lib/dict/robert/_index",
+	 robertnextoff,	robertindexentry,	robertprintkey},
+	{"robertv",	"Robert Électronique - formes des verbes",
+	 "/lib/dict/robert/flex.rob",	"/lib/dict/robert/_flexindex",
+	 robertnextflex,	robertflexentry,	robertprintkey},
+
+	{0, 0, 0, 0, 0}
+};
+
+typedef struct Lig Lig;
+struct Lig {
+	Rune	start;		/* accent rune */
+	Rune	*pairs;		/* <char,accented version> pairs */
+};
+
+static Lig ligtab[Nligs] = {
+[LACU-LIGS]	{L'´',	L"AÁaáCĆcćEÉeégģIÍiíıíLĹlĺNŃnńOÓoóRŔrŕSŚsśUÚuúYÝyýZŹzź"},
+[LGRV-LIGS]	{L'ˋ',	L"AÀaàEÈeèIÌiìıìOÒoòUÙuù"},
+[LUML-LIGS]	{L'¨',	L"AÄaäEËeëIÏiïOÖoöUÜuüYŸyÿ"},
+[LCED-LIGS]	{L'¸',	L"CÇcçGĢKĶkķLĻlļNŅnņRŖrŗSŞsşTŢtţ"},
+[LTIL-LIGS]	{L'˜',	L"AÃaãIĨiĩıĩNÑnñOÕoõUŨuũ"},
+[LBRV-LIGS]	{L'˘',	L"AĂaăEĔeĕGĞgğIĬiĭıĭOŎoŏUŬuŭ"},
+[LRNG-LIGS]	{L'˚',	L"AÅaåUŮuů"},
+[LDOT-LIGS]	{L'˙',	L"CĊcċEĖeėGĠgġIİLĿlŀZŻzż"},
+[LDTB-LIGS]	{L'.',	L""},
+[LFRN-LIGS]	{L'⌢',	L"AÂaâCĈcĉEÊeêGĜgĝHĤhĥIÎiîıîJĴjĵOÔoôSŜsŝUÛuûWŴwŵYŶyŷ"},
+[LFRB-LIGS]	{L'̯',	L""},
+[LOGO-LIGS]	{L'˛',	L"AĄaąEĘeęIĮiįıįUŲuų"},
+[LMAC-LIGS]	{L'¯',	L"AĀaāEĒeēIĪiīıīOŌoōUŪuū"},
+[LHCK-LIGS]	{L'ˇ',	L"CČcčDĎdďEĚeěLĽlľNŇnňRŘrřSŠsšTŤtťZŽzž"},
+[LASP-LIGS]	{L'ʽ',	L""},
+[LLEN-LIGS]	{L'ʼ',	L""},
+[LBRB-LIGS]	{L'̮',	L""}
+};
+
+Rune *multitab[Nmulti] = {
+[MAAS-MULTI]	L"ʽα",
+[MALN-MULTI]	L"ʼα",
+[MAND-MULTI]	L"and",
+[MAOQ-MULTI]	L"a/q",
+[MBRA-MULTI]	L"<|",
+[MDD-MULTI]	L"..",
+[MDDD-MULTI]	L"...",
+[MEAS-MULTI]	L"ʽε",
+[MELN-MULTI]	L"ʼε",
+[MEMM-MULTI]	L"——",
+[MHAS-MULTI]	L"ʽη",
+[MHLN-MULTI]	L"ʼη",
+[MIAS-MULTI]	L"ʽι",
+[MILN-MULTI]	L"ʼι",
+[MLCT-MULTI]	L"ct",
+[MLFF-MULTI]	L"ff",
+[MLFFI-MULTI]	L"ffi",
+[MLFFL-MULTI]	L"ffl",
+[MLFL-MULTI]	L"fl",
+[MLFI-MULTI]	L"fi",
+[MLLS-MULTI]	L"ɫɫ",
+[MLST-MULTI]	L"st",
+[MOAS-MULTI]	L"ʽο",
+[MOLN-MULTI]	L"ʼο",
+[MOR-MULTI]	L"or",
+[MRAS-MULTI]	L"ʽρ",
+[MRLN-MULTI]	L"ʼρ",
+[MTT-MULTI]	L"~~",
+[MUAS-MULTI]	L"ʽυ",
+[MULN-MULTI]	L"ʼυ",
+[MWAS-MULTI]	L"ʽω",
+[MWLN-MULTI]	L"ʼω",
+[MOE-MULTI]	L"oe",
+[MES-MULTI]	L"  ",
+};
+
+#define	risupper(r)	(L'A' <= (r) && (r) <= L'Z')
+#define	rislatin1(r)	(0xC0 <= (r) && (r) <= 0xFF)
+#define	rtolower(r)	((r)-'A'+'a')
+
+static Rune latin_fold_tab[] =
+{
+/*	Table to fold latin 1 characters to ASCII equivalents
+			based at Rune value 0xc0
+
+	 À    Á    Â    Ã    Ä    Å    Æ    Ç
+	 È    É    Ê    Ë    Ì    Í    Î    Ï
+	 Ð    Ñ    Ò    Ó    Ô    Õ    Ö    ×
+	 Ø    Ù    Ú    Û    Ü    Ý    Þ    ß
+	 à    á    â    ã    ä    å    æ    ç
+	 è    é    ê    ë    ì    í    î    ï
+	 ð    ñ    ò    ó    ô    õ    ö    ÷
+	 ø    ù    ú    û    ü    ý    þ    ÿ
+*/
+	'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c',
+	'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
+	'd', 'n', 'o', 'o', 'o', 'o', 'o',  0 ,
+	'o', 'u', 'u', 'u', 'u', 'y',  0 ,  0 ,
+	'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c',
+	'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
+	'd', 'n', 'o', 'o', 'o', 'o', 'o',  0 ,
+	'o', 'u', 'u', 'u', 'u', 'y',  0 , 'y',
+};
+
+static Rune 	*ttabstack[20];
+static int	ntt;
+
+/*
+ * tab is an array of n Assoc's, sorted by key.
+ * Look for key in tab, and return corresponding val
+ * or -1 if not there
+ */
+long
+lookassoc(Assoc *tab, int n, char *key)
+{
+	Assoc *q;
+	long i, low, high;
+	int r;
+
+	for(low = -1, high = n; high > low+1; ){
+		i = (high+low)/2;
+		q = &tab[i];
+		if((r=strcmp(key, q->key))<0)
+			high = i;
+		else if(r == 0)
+			return q->val;
+		else
+			low=i;
+	}
+	return -1;
+}
+
+long
+looknassoc(Nassoc *tab, int n, long key)
+{
+	Nassoc *q;
+	long i, low, high;
+
+	for(low = -1, high = n; high > low+1; ){
+		i = (high+low)/2;
+		q = &tab[i];
+		if(key < q->key)
+			high = i;
+		else if(key == q->key)
+			return q->val;
+		else
+			low=i;
+	}
+	return -1;
+}
+
+void
+err(char *fmt, ...)
+{
+	char buf[1000];
+	va_list v;
+
+	va_start(v, fmt);
+	vsnprint(buf, sizeof(buf), fmt, v);
+	va_end(v);
+	fprint(2, "%s: %s\n", argv0, buf);
+}
+
+/*
+ * Write the rune r to bout, keeping track of line length
+ * and breaking the lines (at blanks) when they get too long
+ */
+void
+outrune(long r)
+{
+	if(outinhibit)
+		return;
+	if(++linelen > breaklen && r == L' ') {
+		Bputc(bout, '\n');
+		linelen = 0;
+	} else
+		Bputrune(bout, r);
+}
+
+void
+outrunes(Rune *rp)
+{
+	Rune r;
+
+	while((r = *rp++) != 0)
+		outrune(r);
+}
+
+/* like outrune, but when arg is know to be a char */
+void
+outchar(int c)
+{
+	if(outinhibit)
+		return;
+	if(++linelen > breaklen && c == ' ') {
+		c ='\n';
+		linelen = 0;
+	}
+	Bputc(bout, c);
+}
+
+void
+outchars(char *s)
+{
+	char c;
+
+	while((c = *s++) != 0)
+		outchar(c);
+}
+
+void
+outprint(char *fmt, ...)
+{
+	char buf[1000];
+	va_list v;
+
+	va_start(v, fmt);
+	vsnprint(buf, sizeof(buf), fmt, v);
+	va_end(v);
+	outchars(buf);
+}
+
+void
+outpiece(char *b, char *e)
+{
+	int c, lastc;
+
+	lastc = 0;
+	while(b < e) {
+		c = *b++;
+		if(c == '\n')
+			c = ' ';
+		if(!(c == ' ' && lastc == ' '))
+			outchar(c);
+		lastc = c;
+	}
+}
+
+/*
+ * Go to new line if not already there; indent if ind != 0.
+ * If ind > 1, leave a blank line too.
+ * Slight hack: assume if current line is only one or two
+ * characters long, then they were spaces.
+ */
+void
+outnl(int ind)
+{
+	if(outinhibit)
+		return;
+	if(ind) {
+		if(ind > 1) {
+			if(linelen > 2)
+				Bputc(bout, '\n');
+			Bprint(bout, "\n  ");
+		} else if(linelen == 0)
+			Bprint(bout, "  ");
+		else if(linelen == 1)
+			Bputc(bout, ' ');
+		else if(linelen != 2)
+			Bprint(bout, "\n  ");
+		linelen = 2;
+	} else {
+		if(linelen) {
+			Bputc(bout, '\n');
+			linelen = 0;
+		}
+	}
+}
+
+/*
+ * Fold the runes in null-terminated rp.
+ * Use the sort(1) definition of folding (uppercase to lowercase,
+ * latin1-accented characters to corresponding unaccented chars)
+ */
+void
+fold(Rune *rp)
+{
+	Rune r;
+
+	while((r = *rp) != 0) {
+		if (rislatin1(r) && latin_fold_tab[r-0xc0])
+				r = latin_fold_tab[r-0xc0];
+		if(risupper(r))
+			r = rtolower(r);
+		*rp++ = r;
+	}
+}
+
+/*
+ * Like fold, but put folded result into new
+ * (assumed to have enough space).
+ * old is a regular expression, but we know that
+ * metacharacters aren't affected
+ */
+void
+foldre(char *new, char *old)
+{
+	Rune r;
+
+	while(*old) {
+		old += chartorune(&r, old);
+		if (rislatin1(r) && latin_fold_tab[r-0xc0])
+				r = latin_fold_tab[r-0xc0];
+		if(risupper(r))
+			r = rtolower(r);
+		new += runetochar(new, &r);
+	}
+	*new = 0;
+}
+
+/*
+ *	acomp(s, t) returns:
+ *		-2 if s strictly precedes t
+ *		-1 if s is a prefix of t
+ *		0 if s is the same as t
+ *		1 if t is a prefix of s
+ *		2 if t strictly precedes s
+ */
+
+int
+acomp(Rune *s, Rune *t)
+{
+	int cs, ct;
+
+	for(;;) {
+		cs = *s;
+		ct = *t;
+		if(cs != ct)
+			break;
+		if(cs == 0)
+			return 0;
+		s++;
+		t++;
+	}
+	if(cs == 0)
+		return -1;
+	if(ct == 0)
+		return 1;
+	if(cs < ct)
+		return -2;
+	return 2;
+}
+
+/*
+ * Copy null terminated Runes from 'from' to 'to'.
+ */
+void
+runescpy(Rune *to, Rune *from)
+{
+	while((*to++ = *from++) != 0)
+		continue;
+}
+
+/*
+ * Conversion of unsigned number to long, no overflow detection
+ */
+long
+runetol(Rune *r)
+{
+	int c;
+	long n;
+
+	n = 0;
+	for(;; r++){
+		c = *r;
+		if(L'0'<=c && c<=L'9')
+			c -= '0';
+		else
+			break;
+		n = n*10 + c;
+	}
+	return n;
+}
+
+/*
+ * See if there is a rune corresponding to the accented
+ * version of r with accent acc (acc in [LIGS..LIGE-1]),
+ * and return it if so, else return NONE.
+ */
+Rune
+liglookup(Rune acc, Rune r)
+{
+	Rune *p;
+
+	if(acc < LIGS || acc >= LIGE)
+		return NONE;
+	for(p = ligtab[acc-LIGS].pairs; *p; p += 2)
+		if(*p == r)
+			return *(p+1);
+	return NONE;
+}
+
+/*
+ * Maintain a translation table stack (a translation table
+ * is an array of Runes indexed by bytes or 7-bit bytes).
+ * If starting is true, push the curtab onto the stack
+ * and return newtab; else pop the top of the stack and
+ * return it.
+ * If curtab is 0, initialize the stack and return.
+ */
+Rune *
+changett(Rune *curtab, Rune *newtab, int starting)
+{
+	if(curtab == 0) {
+		ntt = 0;
+		return 0;
+	}
+	if(starting) {
+		if(ntt >= asize(ttabstack)) {
+			if(debug)
+				err("translation stack overflow");
+			return curtab;
+		}
+		ttabstack[ntt++] = curtab;
+		return newtab;
+	} else {
+		if(ntt == 0) {
+			if(debug)
+				err("translation stack underflow");
+			return curtab;
+		}
+		return ttabstack[--ntt];
+	}
+}
author	Taru Karttunen <taruti@taruti.net>	2011-03-30 15:46:40 +0300
committer	Taru Karttunen <taruti@taruti.net>	2011-03-30 15:46:40 +0300
commit	e5888a1ffdae813d7575f5fb02275c6bb07e5199 (patch)
tree	d8d51eac403f07814b9e936eed0c9a79195e2450 /sys/src/cmd/dict/utils.c