Import sources from 2011-03-30 iso image

author: Taru Karttunen <taruti@taruti.net> 2011-03-30 15:46:40 +0300
committer: Taru Karttunen <taruti@taruti.net> 2011-03-30 15:46:40 +0300
commit: e5888a1ffdae813d7575f5fb02275c6bb07e5199 (patch)
tree: d8d51eac403f07814b9e936eed0c9a79195e2450 /sys/src/cmd/upas/scanmail/common.c
1 files changed, 667 insertions, 0 deletions
diff --git a/sys/src/cmd/upas/scanmail/common.c b/sys/src/cmd/upas/scanmail/common.c
new file mode 100755
index 000000000..b6ea720d1
--- /dev/null
+++ b/sys/src/cmd/upas/scanmail/common.c
@@ -0,0 +1,667 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include <regexp.h>
+#include "spam.h"
+
+/*
+ *	Sizing constants used while reading and scanning a message.
+ */
+enum {
+	Quanta	= 8192,		/* bytes requested per Bread in readmsg's header loop */
+	Minbody = 6000,		/* readmsg tries to have at least this many body bytes */
+	HdrMax	= 15,		/* size of the lookahead buffer in endofhdr */
+};
+
+typedef struct keyword Keyword;
+typedef struct word Word;
+
+/*
+ *	An entry in a prefix table searched by isword.
+ *	A table ends with an entry whose string is 0.
+ */
+struct word{
+	char	*string;	/* prefix text to look for */
+	int	n;		/* number of bytes of string that isword compares */
+};
+
+/*
+ *	An entry in the table searched by findkey.
+ *	A table ends with an entry whose string is 0.
+ */
+struct	keyword{
+	char	*string;	/* keyword text */
+	int	value;		/* value findkey returns for this keyword */
+};
+
+/*
+ *	Tag prefixes that htmlchk uses to decide that HTML has started.
+ *	Each entry is the canonical (lower case) text and its length;
+ *	the table ends with a 0 string.
+ */
+Word	htmlcmds[] =
+{
+	{ "html",		4 },
+	{ "!doctype html",	13 },
+	{ 0,			0 },
+};
+
+/*
+ *	Tag prefixes that htmlchk copies through rather than skips
+ *	once HTML has been detected.  Each entry is the canonical
+ *	(lower case) text and its length; the table ends with a 0 string.
+ */
+Word	hrefs[] =
+{
+	{ "a href=",		7 },
+	{ "a title=",		8 },
+	{ "a target=",		9 },
+	{ "base href=",		10 },
+	{ "img src=",		8 },
+	{ "img border=",	11 },
+	{ "form action=",	12 },
+	{ "!--",		3 },
+	{ 0,			0 },
+};
+
+/*
+ *	RFC822 header keywords that, when found right after a blank
+ *	line, mean the header has been fractured and continues.
+ *	Every length must be less than HdrMax, the size of the
+ *	lookahead buffer in endofhdr.
+ */
+Word	hdrwords[] =
+{
+	{ "cc:",	3 },
+	{ "bcc:",	4 },
+	{ "to:",	3 },
+	{ 0,		0 },
+};
+
+/*
+ *	Action keywords accepted in the pattern file, with the action
+ *	each one selects.  The terminating entry carries Nactions, which
+ *	is what findkey hands back for an unknown keyword.
+ */
+Keyword	keywords[] =
+{
+	{ "header",	HoldHeader },
+	{ "line",	SaveLine },
+	{ "hold",	Hold },
+	{ "dump",	Dump },
+	{ "loff",	Lineoff },
+	{ 0,		Nactions },
+};
+
+/*
+ *	Per-action pattern table, indexed by the action constants.
+ *	Each slot starts with a label string and two zero members;
+ *	parsepats later fills in patterns[action].regexps and
+ *	patterns[action].strings.
+ *	NOTE(review): the Patterns layout is declared outside this file,
+ *	so which positional member is which cannot be confirmed here.
+ */
+Patterns patterns[] = {
+[Dump]		{ "DUMP:", 0, 0 },
+[HoldHeader]	{ "HEADER:", 0, 0 },
+[Hold]		{ "HOLD:", 0, 0 },
+[SaveLine]	{ "LINE:", 0, 0 },
+[Lineoff]	{ "LINEOFF:", 0, 0 },
+[Nactions]	{ 0, 0, 0 },		/* terminator */
+};
+
+/*
+ *	Forward declarations for file-local helpers that are used
+ *	before their definitions appear below.
+ */
+static char*	endofhdr(char*, char*);
+static	int	escape(char**);
+static	int	extract(char*);
+static	int	findkey(char*);
+static	int	hash(int);
+static	int	isword(Word*, char*, int);
+static	void	parsealt(Biobuf*, char*, Spat**);
+
+/*
+ *	The canonicalizer: convert input to canonical representation
+ */
+/*
+ *	Read the beginning of a message from bp into an allocated,
+ *	NUL-terminated buffer and return it.
+ *
+ *	bp	input to read from
+ *	hsize	if non-zero, the header is located and *hsize receives the
+ *		offset of the first byte after it; if zero, no header
+ *		search is done and a single chunk is read before the
+ *		body top-up below
+ *	bufsize	if non-zero, *bufsize receives the number of bytes read
+ *
+ *	Chunks of Quanta bytes are read until endofhdr finds the end of
+ *	the header; then, if fewer than Minbody bytes of body are in hand,
+ *	one more read tries to make up the difference.  If the header is
+ *	still unfinished after Maxread bytes, everything read so far is
+ *	returned as header.
+ *
+ *	Returns 0 on a read error or when the input is empty; in both
+ *	cases the buffer is freed here.  Otherwise the buffer comes from
+ *	Realloc and is handed to the caller.
+ */
+char*
+readmsg(Biobuf *bp, int *hsize, int *bufsize)
+{
+	char *p, *buf;
+	int n, offset, eoh, bsize, delta;
+
+	buf = 0;
+	offset = 0;
+	if(bufsize)
+		*bufsize = 0;
+	if(hsize)
+		*hsize = 0;
+	for(;;) {
+		buf = Realloc(buf, offset+Quanta+1);	/* +1 for the NUL */
+		n = Bread(bp, buf+offset, Quanta);
+		if(n < 0){
+			free(buf);
+			return 0;
+		}
+		p = buf+offset;			/* start of this chunk */
+		offset += n;			/* end of this chunk */
+		buf[offset] = 0;
+		if(n == 0){
+			if(offset == 0){
+				free(buf);	/* empty input: don't leak the chunk */
+				return 0;
+			}
+			break;
+		}
+
+		if(hsize == 0)			/* don't process header */
+			break;
+		if(p != buf && p[-1] == '\n')	/* check for EOH across buffer split */
+			p--;
+		p = endofhdr(p, buf+offset);
+		if(p)
+			break;
+		if(offset >= Maxread)		/* gargantuan header - just punt*/
+		{
+			*hsize = offset;	/* hsize is non-zero here */
+			if(bufsize)
+				*bufsize = offset;
+			return buf;
+		}
+	}
+	eoh = p-buf;				/* End of header */
+	bsize = offset - eoh;			/* amount of body already read */
+
+		/* Read at least Minbody bytes of the body */
+	if (bsize < Minbody){
+		delta = Minbody-bsize;
+		buf = Realloc(buf, offset+delta+1);
+		n = Bread(bp, buf+offset, delta);
+		if(n > 0) {			/* best effort: a short or failed read is ignored */
+			offset += n;
+			buf[offset] = 0;
+		}
+	}
+	if(hsize)
+		*hsize = eoh;
+	if(bufsize)
+		*bufsize = offset;
+	return buf;
+}
+
+/*
+ *	Report whether text begins with any entry of the table wp.
+ *	len is the number of usable bytes in text; entries longer
+ *	than that are not compared.  The table ends at the first
+ *	entry whose string is 0.  Returns 1 on a match, else 0.
+ */
+static	int
+isword(Word *wp, char *text, int len)
+{
+	Word *w;
+
+	w = wp;
+	while(w->string != 0){
+		if(w->n <= len)
+		if(strncmp(text, w->string, w->n) == 0)
+			return 1;
+		w++;
+	}
+	return 0;
+}
+
+/*
+ *	Find the end of the header in the bytes from raw up to end.
+ *	The header ends at a blank line (two adjacent newlines) unless
+ *	the text right after it starts with one of hdrwords, in which
+ *	case the header is taken to be fractured and the search goes on.
+ *	Returns a pointer to the first byte after the blank line, or 0
+ *	if no end of header was found.
+ *
+ *	The scan reads one byte beyond the current position and the
+ *	lookahead stops at a NUL, so the caller's buffer must be
+ *	NUL-terminated at end.
+ */
+static char*
+endofhdr(char *raw, char *end)
+{
+	char word[HdrMax];
+	char *nl, *look;
+	int len;
+
+	/*
+	 * step through by hand rather than with strchr: the
+	 * text may contain embedded NUL bytes.
+	 */
+	for(nl = raw; nl < end; nl++){
+		if(nl[0] == '\n' && nl[1] == '\n'){
+			nl++;			/* now on the second newline */
+
+			/* lower-case copy of what follows, through ':' or newline */
+			len = 0;
+			look = nl+1;
+			while(len < sizeof(word) && *look != 0){
+				word[len++] = tolower(*look);
+				if(*look == ':' || *look == '\n')
+					break;
+				look++;
+			}
+			if(isword(hdrwords, word, len) == 0)
+				return nl+1;
+		}
+	}
+	return 0;
+}
+
+/*
+ *	Canonicalize the text of a tag, starting at text and stopping
+ *	at '>' or end, and report whether it begins with an entry of wp.
+ *
+ *	The canonical form is lower case with runs of blank, tab and
+ *	newline squeezed to one blank, NUL and carriage return dropped,
+ *	and each '=' passed through escape.  At most MaxHtml-1
+ *	canonical bytes are kept.
+ *
+ *	If n is non-zero, *n receives the number of source bytes
+ *	consumed, counting the '>' when one was reached.
+ *	Returns the result of isword on the canonical text.
+ */
+static	int
+htmlmatch(Word *wp, char *text, char *end, int *n)
+{
+	char canon[MaxHtml];
+	char *src;
+	int len, ch, prev;
+
+	len = 0;
+	prev = 0;
+	src = text;
+	while (src < end && len < sizeof(canon)-1){
+		ch = *src++;
+		if(ch == '=')
+			ch = escape(&src);
+		if(ch == 0 || ch == '\r')
+			continue;
+		if(ch == '>')
+			break;
+		if(ch == '\n' || ch == ' ' || ch == '\t'){
+			if(prev == ' ')		/* already emitted a blank */
+				continue;
+			ch = ' ';
+		} else
+			ch = tolower(ch);
+		prev = ch;
+		canon[len++] = ch;
+	}
+	canon[len] = 0;
+	if(n != 0)
+		*n = src-text;
+	return isword(wp, canon, len);
+}
+
+/*
+ *	Decode what follows an '=' that the caller has just consumed.
+ *	*msg points at the byte after the '='; it is advanced past
+ *	whatever was decoded.
+ *
+ *	"=\n"	returns the byte after the newline, consuming both
+ *	"=2e"	returns '.'	(hex digit in either case)
+ *	"=2f"	returns '/'
+ *	"=20"	returns ' '
+ *	"=3d"	returns '='
+ *	anything else returns '=' and consumes nothing.
+ */
+static int
+escape(char **msg)
+{
+	char *s;
+	int r;
+
+	s = *msg;
+	r = '=';
+	switch(*s){
+	case '\n':
+		r = s[1];
+		s += 2;
+		break;
+	case '2':
+		switch(tolower(s[1])){
+		case 'e':
+			r = '.';
+			s += 2;
+			break;
+		case 'f':
+			r = '/';
+			s += 2;
+			break;
+		case '0':
+			r = ' ';
+			s += 2;
+			break;
+		}
+		break;
+	case '3':
+		if(tolower(s[1]) == 'd')
+			s += 2;
+		break;
+	}
+	*msg = s;
+	return r;
+}
+
+/*
+ *	Decide what to do with a '<' the caller has just consumed.
+ *	*msg points at the byte after the '<'.
+ *
+ *	Returns '<' with *msg untouched when the tag is to be copied:
+ *	before any HTML has been seen, anything that is neither an
+ *	htmlcmds entry nor starts with '!'; afterwards, any tag that
+ *	matches hrefs.
+ *	Otherwise the tag is skipped: the byte following it is
+ *	returned and *msg is left just past that byte.
+ *
+ *	ishtml is static, so once an htmlcmds tag has been seen the
+ *	"HTML seen" state persists across calls.
+ */
+static int
+htmlchk(char **msg, char *end)
+{
+	static int ishtml;
+	char *tag;
+	int skip;
+
+	tag = *msg;
+	if(ishtml != 0){
+		if(htmlmatch(hrefs, tag, end, &skip))	/* interesting tag */
+			return '<';
+	} else {
+		ishtml = htmlmatch(htmlcmds, tag, end, &skip);
+		if(ishtml == 0 && *tag != '!')		/* not HTML, not a comment */
+			return '<';
+	}
+
+	/* uninteresting tag: step over it */
+	tag += skip;
+	*msg = tag+1;
+	return *tag;
+}
+
+/*
+ * decode a base 64 encoded body and canonicalize the result.
+ *
+ *	msg, end	the encoded text
+ *	buf, bufsize	destination, passed through to convert
+ *
+ * The decoded text is held in a temporary buffer that is freed
+ * before returning.  It is NUL-terminated because convert's helpers
+ * (escape, htmlchk) look at the byte at or just after the end of
+ * their input.
+ */
+void
+conv64(char *msg, char *end, char *buf, int bufsize)
+{
+	int len, i;
+	char *cp;
+
+	len = end - msg;
+	i = (len*3)/4+1;	// room for max chars
+	cp = Malloc(i+1);	// and one more for the null
+	len = dec64((uchar*)cp, i, msg, len);
+	if(len < 0)		/* guard in case dec64 reports failure */
+		len = 0;
+	cp[len] = 0;
+	convert(cp, cp+len, buf, bufsize, 1);
+	free(cp);
+}
+
+/*
+ *	Copy the text from msg up to end into buf in canonical form:
+ *	lower case, NUL and carriage return dropped, and runs of
+ *	blank, tab and newline squeezed to one blank.
+ *
+ *	msg, end	source text
+ *	buf		destination; at most bufsize bytes are stored and a
+ *			NUL is written after them, so buf needs room for
+ *			bufsize+1 bytes
+ *	isbody		non-zero: also strip HTML tags (htmlchk) and decode
+ *			'=' escapes (escape)
+ *			zero: watch for a base64 transfer-encoding header
+ *
+ *	Returns 1 if, with isbody zero, a
+ *	"content-transfer-encoding: base64" header was seen; else 0.
+ */
+int
+convert(char *msg, char *end, char *buf, int bufsize, int isbody)
+{
+
+	char *p;
+	int c, lastc, base64;
+
+	lastc = 0;
+	base64 = 0;
+	while(msg < end && bufsize > 0){
+		c = *msg++;
+
+		/*
+		 * In the body only, try to strip most HTML and
+		 * replace certain MIME escape sequences with the character.
+		 * Repeat while either helper consumed input, so that the
+		 * byte produced by one step is itself examined.
+		 */
+		if(isbody) {
+			do{
+				p = msg;
+				if(c == '<')
+					c = htmlchk(&msg, end);
+				if(c == '=')
+					c = escape(&msg);
+			} while(p != msg && p < end);
+		}
+		switch(c){
+		case 0:
+		case '\r':
+			continue;		/* dropped */
+		case '\t':
+		case ' ':
+		case '\n':
+			if(lastc == ' ')	/* squeeze runs of white space */
+				continue;
+			c = ' ';
+			break;
+		case 'C':	/* check for MIME base 64 encoding in header */
+		case 'c':
+			/* the "on" must be lower case; the rest is compared without regard to case */
+			if(isbody == 0)
+			if(msg < end-32 && *msg == 'o' && msg[1] == 'n')
+			if(cistrncmp(msg+2, "tent-transfer-encoding: base64", 30) == 0)
+				base64 = 1;
+			c = 'c';
+			break;
+		default:
+			c = tolower(c);
+			break;
+		}
+		*buf++ = c;
+		lastc = c;
+		bufsize--;
+	}
+	*buf = 0;
+	return base64;
+}
+
+/*
+ *	The pattern parser: build data structures from the pattern file
+ */
+
+/*
+ *	Map a character to a string-pattern hash bucket by keeping
+ *	its low seven bits; the result is in the range 0 to 127.
+ */
+static int
+hash(int c)
+{
+	return c & 0x7F;
+}
+
+/*
+ *	Look val up in the keywords table and return the value of the
+ *	matching entry.  If nothing matches, the value of the table's
+ *	terminating entry is returned.
+ */
+static	int
+findkey(char *val)
+{
+	Keyword *kp;
+
+	kp = keywords;
+	while(kp->string != 0){
+		if(strcmp(val, kp->string) == 0)
+			return kp->value;
+		kp++;
+	}
+	return kp->value;	/* terminator's value */
+}
+
+/* true for a blank or a tab (not newline); c is evaluated twice */
+#define	whitespace(c)	((c) == ' ' || (c) == '\t')
+
+/*
+ *	Read the pattern file on bp and add every usable pattern to
+ *	the global patterns table.
+ *
+ *	Each line has the form
+ *		[*]keyword: pattern[~~alternate...]
+ *	A leading '*' makes the pattern a literal string; otherwise it is
+ *	compiled as a regular expression.  The keyword is looked up with
+ *	findkey.  The text after the colon is canonicalized in place by
+ *	extract.  Text after "~~" is handed to parsealt as a list of
+ *	alternates.  Blank lines, lines starting with '#', lines with no
+ *	colon, unknown keywords, empty patterns and regular expressions
+ *	that fail to compile are skipped.
+ */
+void
+parsepats(Biobuf *bp)
+{
+	Pattern *p, *new;
+	char *cp, *qp;
+	int type, action, n, h;
+	Spat *spat;
+
+	for(;;){
+		cp = Brdline(bp, '\n');
+		if(cp == 0)
+			break;
+		cp[Blinelen(bp)-1] = 0;		/* replace the newline */
+		while(*cp == ' ' || *cp == '\t')
+			cp++;
+		if(*cp == '#' || *cp == 0)
+			continue;
+		type = regexp;
+		if(*cp == '*'){			/* literal string pattern */
+			type = string;
+			cp++;
+		}
+		qp = strchr(cp, ':');
+		if(qp == 0)
+			continue;
+		*qp = 0;			/* cp is now just the keyword */
+		if(debug)
+			fprint(2, "action = %s\n", cp);
+		action = findkey(cp);
+		if(action >= Nactions)		/* unknown keyword */
+			continue;
+		cp = qp+1;
+		n = extract(cp);
+		if(n <= 0 || *cp == 0)
+			continue;
+
+		qp = strstr(cp, "~~");
+		if(qp){				/* cut the pattern off at the alternates */
+			*qp = 0;
+			n = strlen(cp);
+		}
+		if(debug)
+			fprint(2, " Pattern: `%s'\n", cp);
+
+			/* Hook regexps into a chain */
+		if(type == regexp) {
+			new = Malloc(sizeof(Pattern));
+			new->action = action;
+			new->pat = regcomp(cp);
+			if(new->pat == 0){
+				free(new);
+				continue;
+			}
+			new->type = regexp;
+			new->alt = 0;
+			new->next = 0;
+
+			if(qp)
+				parsealt(bp, qp+2, &new->alt);
+
+			/* push on the front of this action's regexp list */
+			new->next = patterns[action].regexps;
+			patterns[action].regexps = new;
+			continue;
+
+		}
+			/* not a Regexp - hook strings into Pattern hash chain */
+		spat = Malloc(sizeof(*spat));
+		spat->next = 0;
+		spat->alt = 0;
+		spat->len = n;
+		spat->string = Malloc(n+1);
+		spat->c1 = cp[1];		/* second character, checked first by matchpat */
+		/*
+		 * copy before parsealt runs: parsealt may read further lines
+		 * with Brdline, which presumably reuses the buffer cp points
+		 * into (see bio(2)).
+		 */
+		strcpy(spat->string, cp);
+
+		if(qp)
+			parsealt(bp, qp+2, &spat->alt);
+
+		p = patterns[action].strings;
+		if(p == 0) {			/* first string for this action */
+			p = Malloc(sizeof(Pattern));
+			memset(p, 0, sizeof(*p));
+			p->action = action;
+			p->type = string;
+			patterns[action].strings = p;
+		}
+		h = hash(*spat->string);	/* bucket chosen by first character */
+		spat->next = p->spat[h];
+		p->spat[h] = spat;
+	}
+}
+
+/*
+ *	Split cp at each "~~" and push every non-empty piece onto the
+ *	front of the list *head.  When the text runs out right after a
+ *	"~~" (an escaped newline), further lines are read from bp,
+ *	canonicalized with extract, and parsing continues with the
+ *	first one that is not empty.  Returns when a piece has no "~~"
+ *	after it or when bp has no more lines.
+ *
+ *	Only the string and next members of each new Spat are set.
+ */
+static void
+parsealt(Biobuf *bp, char *cp, Spat** head)
+{
+	char *piece, *sep;
+	Spat *node;
+
+	while(cp != 0){
+		if(*cp == 0){		/*escaped newline*/
+			for(;;){
+				cp = Brdline(bp, '\n');
+				if(cp == 0)
+					return;
+				cp[Blinelen(bp)-1] = 0;
+				if(extract(cp) > 0 && *cp != 0)
+					break;
+			}
+		}
+
+		piece = cp;
+		sep = strstr(piece, "~~");
+		if(sep != 0){
+			*sep = 0;
+			cp = sep+2;
+		} else
+			cp = 0;		/* last piece: leave the loop afterwards */
+
+		if(strlen(piece) != 0){
+			node = Malloc(sizeof(*node));
+			node->string = strdup(piece);
+			node->next = *head;
+			*head = node;
+		}
+	}
+}
+
+/*
+ *	Canonicalize a pattern in place and return its new length.
+ *
+ *	Leading blanks and tabs are dropped, upper case ASCII letters
+ *	are folded to lower case, and the text stops at an unquoted '#'.
+ *	Text inside double quotes is copied without the quotes;
+ *	within quotes \" stands for a double quote.  Trailing blanks and
+ *	tabs are trimmed, but never back into text that came from a
+ *	quoted string.
+ */
+static int
+extract(char *cp)
+{
+	char *src, *dst, *keep;
+	int ch;
+
+	src = cp;
+	dst = cp;
+	keep = cp;		/* trimming may not go below this point */
+	while(whitespace(*src))
+		src++;
+	for(;;){
+		ch = *src++;
+		if(ch == 0 || ch == '#')
+			break;
+		if(ch != '"'){
+			if(ch >= 'A' && ch <= 'Z')
+				ch += 'a'-'A';
+			*dst++ = ch;
+			continue;
+		}
+
+		/* quoted string: copy up to the closing quote */
+		for(; *src != 0 && *src != '"'; src++){
+			if(src[0] == '\\' && src[1] == '"')
+				src++;
+			if(*src >= 'A' && *src <= 'Z')
+				*dst++ = *src + ('a'-'A');
+			else
+				*dst++ = *src;
+		}
+		if(*src != 0)
+			src++;		/* step over the closing quote */
+		keep = dst;		/* never back up over a quoted string */
+	}
+	while(dst > keep && whitespace(dst[-1]))
+		dst--;
+	*dst = 0;
+	return dst-cp;
+}
+
+/*
+ *	The matching engine: compare canonical input to pattern structures
+ */
+
+/*
+ *	Return the first entry of the list alt whose string occurs in
+ *	any of the texts below, or 0 if none does.
+ *
+ *	- cmd, when it is not empty and is not message itself
+ *	- header+1, when that is not message itself
+ *	- message
+ */
+static Spat*
+isalt(char *message, Spat *alt)
+{
+	for(; alt != 0; alt = alt->next){
+		if(*cmd != 0 && message != cmd && strstr(cmd, alt->string) != 0)
+			return alt;
+		if(message != header+1 && strstr(header+1, alt->string) != 0)
+			return alt;
+		if(strstr(message, alt->string) != 0)
+			return alt;
+	}
+	return 0;
+}
+
+/*
+ *	Match the canonical text message against the pattern p.
+ *
+ *	For a string pattern, every position of message is tried
+ *	against the hash bucket selected by the character there.  On
+ *	the first hit that no alternate cancels (see isalt), m->sp and
+ *	m->ep are set to the start and end of the matched text.
+ *	For a regular expression, regexec fills in m; a match is
+ *	rejected if an alternate cancels it.
+ *
+ *	Returns 1 on a match, 0 otherwise.
+ */
+int
+matchpat(Pattern *p, char *message, Resub *m)
+{
+	Spat *spat;
+	char *s;
+	int c, c1;
+
+	if(p->type == string){
+		c1 = *message;
+		for(s=message; c=c1; s++){
+			c1 = s[1];
+			for(spat=p->spat[hash(c)]; spat; spat=spat->next){
+				if(spat->len < 1)	/* an empty pattern matches nothing */
+					continue;
+				/*
+				 * quick reject on the second character.  a pattern of
+				 * one character has no second character (its c1 is the
+				 * terminating NUL), so the test is skipped for it;
+				 * otherwise it could only match at the end of message.
+				 */
+				if(spat->len > 1 && c1 != spat->c1)
+					continue;
+				/* strncmp stops at message's NUL; memcmp could read past it */
+				if(strncmp(s, spat->string, spat->len) != 0)
+					continue;
+				if(isalt(message, spat->alt))
+					continue;
+				m->sp = s;
+				m->ep = s + spat->len;
+				return 1;
+			}
+		}
+		return 0;
+	}
+	m->sp = m->ep = 0;
+	if(regexec(p->pat, message, m, 1) == 0)
+		return 0;
+	if(isalt(message, p->alt))
+		return 0;
+	return 1;
+}
+
+
+/*
+ *	Print the match described by m on fd, together with some of
+ *	the text around it, as
+ *		type before~match~after
+ *	Nothing is printed if m has no start or end pointer.
+ *
+ *	The context extends about 30 characters each way and then on
+ *	to the next blank or NUL.
+ *	NOTE(review): the backward scan stops only at a NUL or a blank,
+ *	so it relies on one being stored ahead of the text - confirm
+ *	against the callers.
+ */
+void
+xprint(int fd, char *type, Resub *m)
+{
+	char *lo, *hi;
+	int k;
+
+	if(m->sp == 0 || m->ep == 0)
+		return;
+
+		/* back up approx 30 characters to whitespace */
+	lo = m->sp;
+	k = 0;
+	while(*lo && k < 30){
+		k++;
+		lo--;
+	}
+	while(*lo && *lo != ' ')
+		lo--;
+	lo++;
+
+		/* grab about 30 more chars beyond the end of the match */
+	hi = m->ep;
+	k = 0;
+	while(*hi && k < 30){
+		k++;
+		hi++;
+	}
+	while(*hi && *hi != ' ')
+		hi++;
+
+	fprint(fd, "%s %.*s~%.*s~%.*s\n", type, (int)(m->sp-lo), lo, (int)(m->ep-m->sp), m->sp, (int)(hi-m->ep), m->ep);
+}
+
+/*
+ *	Marker stored in t64d for bytes that have no base 64 value.
+ */
+enum {
+	INVAL=	255
+};
+
+/*
+ *	Base 64 decoding table, indexed by byte value.
+ *	'A'-'Z' map to 0-25, 'a'-'z' to 26-51, '0'-'9' to 52-61,
+ *	'+' to 62 and '/' to 63; every other byte maps to INVAL.
+ *	The comment at the start of each pair of rows is the hex
+ *	index of its first entry.
+ *	NOTE(review): nothing in this file refers to the table;
+ *	conv64 calls dec64 instead.
+ */
+static uchar t64d[256] = {
+/*00 */	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+/*10*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+/*20*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+	INVAL, INVAL, INVAL,    62, INVAL, INVAL, INVAL,    63,
+/*30*/	   52,	  53,	 54,	55,    56,    57,    58,    59,
+	   60,	  61, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+/*40*/	INVAL,    0,      1,     2,     3,     4,     5,     6,
+	    7,    8,      9,    10,    11,    12,    13,    14,
+/*50*/	   15,   16,     17,    18,    19,    20,    21,    22,
+	   23,   24,     25, INVAL, INVAL, INVAL, INVAL, INVAL,
+/*60*/	INVAL,   26,     27,    28,    29,    30,    31,    32,
+	   33,   34,     35,    36,    37,    38,    39,    40,
+/*70*/	   41,   42,     43,    44,    45,    46,    47,    48,
+	   49,   50,     51, INVAL, INVAL, INVAL, INVAL, INVAL,
+/*80*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+/*90*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+/*A0*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+/*B0*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+/*C0*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+/*D0*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+/*E0*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+/*F0*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+};
author	Taru Karttunen <taruti@taruti.net>	2011-03-30 15:46:40 +0300
committer	Taru Karttunen <taruti@taruti.net>	2011-03-30 15:46:40 +0300
commit	e5888a1ffdae813d7575f5fb02275c6bb07e5199 (patch)
tree	d8d51eac403f07814b9e936eed0c9a79195e2450 /sys/src/cmd/upas/scanmail/common.c