summary refs log tree commit diff
path: root/sys/src/cmd/upas/scanmail/common.c
diff options
context:
space:
mode:
author Taru Karttunen <taruti@taruti.net> 2011-03-30 15:46:40 +0300
committer Taru Karttunen <taruti@taruti.net> 2011-03-30 15:46:40 +0300
commit e5888a1ffdae813d7575f5fb02275c6bb07e5199 (patch)
tree d8d51eac403f07814b9e936eed0c9a79195e2450 /sys/src/cmd/upas/scanmail/common.c
Import sources from 2011-03-30 iso image
Diffstat (limited to 'sys/src/cmd/upas/scanmail/common.c')
-rwxr-xr-x sys/src/cmd/upas/scanmail/common.c 667
1 files changed, 667 insertions, 0 deletions
diff --git a/sys/src/cmd/upas/scanmail/common.c b/sys/src/cmd/upas/scanmail/common.c
new file mode 100755
index 000000000..b6ea720d1
--- /dev/null
+++ b/sys/src/cmd/upas/scanmail/common.c
@@ -0,0 +1,667 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include <regexp.h>
+#include "spam.h"
+
+enum {
+ Quanta = 8192, /* bytes requested per Bread while readmsg scans for the end of the header */
+ Minbody = 6000, /* readmsg tries to have at least this many body bytes read */
+ HdrMax = 15, /* size of the keyword buffer filled by endofhdr */
+};
+
+typedef struct keyword Keyword;
+typedef struct word Word;
+
+struct word{ /* one entry of a prefix table searched by isword */
+ char *string; /* prefix text; a nil string marks the end of a table */
+ int n; /* number of bytes of string that isword compares */
+};
+
+struct keyword{ /* one entry of the keywords table searched by findkey */
+ char *string; /* action name; a nil string marks the end of the table */
+ int value; /* action code that findkey returns for this name */
+};
+
+Word htmlcmds[] = /* tag prefixes that make htmlchk treat the message as HTML */
+{
+ "html", 4,
+ "!doctype html", 13,
+ 0, /* terminator: nil string ends the table */
+
+};
+
+Word hrefs[] = /* tag prefixes that htmlchk keeps (returns '<' for) once the message is known to be HTML */
+{
+ "a href=", 7,
+ "a title=", 8,
+ "a target=", 9,
+ "base href=", 10,
+ "img src=", 8,
+ "img border=", 11,
+ "form action=", 12,
+ "!--", 3,
+ 0, /* terminator: nil string ends the table */
+
+};
+
+/*
+ * RFC822 header keywords that, when found right after a blank line, mean the header is fractured and continues (see endofhdr).
+ * all lengths must be less than HdrMax defined above.
+ */
+Word hdrwords[] =
+{
+ "cc:", 3,
+ "bcc:", 4,
+ "to:", 3,
+ 0, 0, /* terminator: nil string ends the table */
+
+};
+
+Keyword keywords[] = /* action names accepted by findkey; compared exactly with strcmp */
+{
+ "header", HoldHeader,
+ "line", SaveLine,
+ "hold", Hold,
+ "dump", Dump,
+ "loff", Lineoff,
+ 0, Nactions, /* terminator; findkey returns this value for a name that is not listed */
+};
+
+Patterns patterns[] = { /* indexed by action code; parsepats hangs the regexps and strings chains off each entry */
+[Dump] { "DUMP:", 0, 0 }, /* NOTE(review): first field looks like a label for the action; Patterns is declared outside this file - confirm */
+[HoldHeader] { "HEADER:", 0, 0 },
+[Hold] { "HOLD:", 0, 0 },
+[SaveLine] { "LINE:", 0, 0 },
+[Lineoff] { "LINEOFF:", 0, 0 },
+[Nactions] { 0, 0, 0 }, /* all-zero entry in the Nactions slot */
+};
+
+static char* endofhdr(char*, char*); /* find the blank line that ends the header */
+static int escape(char**); /* decode the text following an '=' */
+static int extract(char*); /* canonicalize a pattern in place, return its length */
+static int findkey(char*); /* action name to action code */
+static int hash(int); /* character to bucket index */
+static int isword(Word*, char*, int); /* prefix lookup in a Word table */
+static void parsealt(Biobuf*, char*, Spat**); /* collect the "~~"-separated alternatives of a pattern */
+
+/*
+ * The canonicalizer: convert input to canonical representation
+ */
+/*
+ * Read a message from bp into an allocated, NUL-terminated buffer owned by the caller:
+ * the header (as delimited by endofhdr) plus at least Minbody bytes of body when available.
+ * If non-nil, *hsize gets the header length and *bufsize the bytes read; returns 0 on read error or empty input.
+ */
+char*
+readmsg(Biobuf *bp, int *hsize, int *bufsize)
+{
+ char *p, *buf;
+ int n, offset, eoh, bsize, delta;
+
+ buf = 0;
+ offset = 0;
+ if(bufsize)
+ *bufsize = 0;
+ if(hsize)
+ *hsize = 0;
+ for(;;) {
+ buf = Realloc(buf, offset+Quanta+1); /* +1 leaves room for the terminating NUL */
+ n = Bread(bp, buf+offset, Quanta);
+ if(n < 0 || (n == 0 && offset == 0)){ /* read error, or EOF before any data */
+ free(buf); /* both paths must release the chunk just allocated */
+ return 0;
+ }
+ p = buf+offset; /* start of this chunk */
+ offset += n; /* end of this chunk */
+ buf[offset] = 0;
+ if(n == 0) /* EOF with no end of header found: everything read is header */
+ break;
+
+ if(hsize == 0) /* don't process header */
+ break;
+ if(p != buf && p[-1] == '\n') /* check for EOH across buffer split */
+ p--;
+ p = endofhdr(p, buf+offset);
+ if(p)
+ break;
+ if(offset >= Maxread){ /* gargantuan header - just punt */
+ *hsize = offset; /* hsize is non-nil here: the nil case left the loop above */
+ if(bufsize)
+ *bufsize = offset;
+ return buf;
+ }
+ }
+ eoh = p-buf; /* End of header */
+ bsize = offset - eoh; /* amount of body already read */
+
+ /* Read at least Minbody bytes of the body */
+ if (bsize < Minbody){
+ delta = Minbody-bsize;
+ buf = Realloc(buf, offset+delta+1);
+ n = Bread(bp, buf+offset, delta);
+ if(n > 0) {
+ offset += n;
+ buf[offset] = 0;
+ }
+ }
+ if(hsize)
+ *hsize = eoh;
+ if(bufsize)
+ *bufsize = offset;
+ return buf;
+}
+
+static int
+isword(Word *wp, char *text, int len) /* 1 if text (len bytes) begins with some entry of table wp, else 0 */
+{
+ for(;wp->string; wp++) /* the table ends at the entry whose string is nil */
+ if(len >= wp->n && strncmp(text, wp->string, wp->n) == 0) /* compares at most wp->n <= len bytes of text */
+ return 1;
+ return 0;
+}
+
+static char*
+endofhdr(char *raw, char *end) /* returns a pointer just past the blank line ending the header in [raw, end), or 0 if none is found */
+{
+ int i;
+ char *p, *q;
+ char buf[HdrMax];
+
+ /*
+ * can't use strchr to search for newlines because
+ * there may be embedded NULL's.
+ */
+ for(p = raw; p < end; p++){
+ if(*p != '\n' || p[1] != '\n') /* want two consecutive newlines; p[1] can be *end, which readmsg sets to NUL */
+ continue;
+ p++; /* p is now at the second newline */
+ for(i = 0, q = p+1; i < sizeof(buf) && *q; q++){ /* copy the lower-cased text that follows, through the first ':' or newline */
+ buf[i++] = tolower(*q);
+ if(*q == ':' || *q == '\n')
+ break;
+ }
+ if(!isword(hdrwords, buf, i)) /* a hdrwords keyword here means the header continues after the gap; keep scanning */
+ return p+1;
+ }
+ return 0;
+}
+
+static int
+htmlmatch(Word *wp, char *text, char *end, int *n) /* 1 if the canonicalized tag text at text begins with an entry of wp; *n (if non-nil) gets the bytes consumed */
+{
+ char *cp;
+ int i, c, lastc;
+ char buf[MaxHtml];
+
+ /*
+ * extract a string up to '>', lower-cased, with runs of white space collapsed to one blank
+ */
+
+ i = lastc = 0;
+ cp = text;
+ while (cp < end && i < sizeof(buf)-1){ /* stop at end of input or when buf is full (one byte kept for the NUL) */
+ c = *cp++;
+ if(c == '=') /* decode '=' escapes before classifying the character */
+ c = escape(&cp);
+ switch(c){
+ case 0:
+ case '\r':
+ continue; /* NULs and carriage returns are dropped */
+ case '>':
+ goto out; /* end of tag; cp already points past the '>' */
+ case '\n':
+ case ' ':
+ case '\t':
+ if(lastc == ' ')
+ continue;
+ c = ' ';
+ break;
+ default:
+ c = tolower(c);
+ break;
+ }
+ buf[i++] = lastc = c;
+ }
+out:
+ buf[i] = 0;
+ if(n)
+ *n = cp-text; /* source bytes consumed, including the '>' when one was found */
+ return isword(wp, buf, i);
+}
+
+static int
+escape(char **msg) /* decode the text following an '='; *msg points just past the '=' and is advanced over whatever is consumed */
+{
+ int c;
+ char *p;
+
+ p = *msg;
+ c = *p;
+ if(c == '\n'){ /* '=' at end of line: drop the newline, then consume and return the character after it */
+ p++;
+ c = *p++;
+ } else
+ if(c == '2'){
+ c = tolower(p[1]);
+ if(c == 'e'){ /* "=2e" yields '.' */
+ p += 2;
+ c = '.';
+ }else
+ if(c == 'f'){ /* "=2f" yields '/' */
+ p += 2;
+ c = '/';
+ }else
+ if(c == '0'){ /* "=20" yields a blank */
+ p += 2;
+ c = ' ';
+ }
+ else c = '='; /* any other "=2x": consume nothing and return the '=' itself */
+ } else {
+ if(c == '3' && tolower(p[1]) == 'd') /* "=3d" is an encoded '=': skip the two hex digits */
+ p += 2;
+ c = '='; /* either way the result is '=' */
+ }
+ *msg = p;
+ return c;
+}
+
+static int
+htmlchk(char **msg, char *end) /* *msg points just past a '<'; returns '<' to keep the tag, else skips the tag and returns the character after it */
+{
+ int n;
+ char *p;
+
+ static int ishtml; /* becomes nonzero once an htmlcmds tag has matched; persists across calls */
+
+ p = *msg;
+ if(ishtml == 0){
+ ishtml = htmlmatch(htmlcmds, p, end, &n);
+
+ /* If not an HTML keyword, check if it's
+ * an HTML comment (<!comment>). if so,
+ * skip over it; otherwise copy it in.
+ */
+ if(ishtml == 0 && *p != '!') /* not comment */
+ return '<'; /* copy it */
+
+ } else if(htmlmatch(hrefs, p, end, &n)) /* if special HTML string */
+ return '<'; /* copy it */
+
+ /*
+ * this is an uninteresting HTML command; skip over it.
+ */
+ p += n; /* n is the byte count htmlmatch consumed, including the '>' when one was found */
+ *msg = p+1; /* the caller resumes after the character being returned */
+ return *p;
+}
+
+/*
+ * decode the base 64 encoded body in [msg, end) and canonicalize the decoded text into buf via convert
+ */
+void
+conv64(char *msg, char *end, char *buf, int bufsize)
+{
+ int len, i;
+ char *cp;
+
+ len = end - msg;
+ i = (len*3)/4+1; // room for max chars + null
+ cp = Malloc(i); /* scratch buffer for the decoded bytes; freed below */
+ len = dec64((uchar*)cp, i, msg, len); /* NOTE(review): if dec64 reports failure with a negative length, convert sees end < msg and only writes the terminator - confirm */
+ convert(cp, cp+len, buf, bufsize, 1); /* decoded text is treated as body (isbody = 1); the return value is ignored */
+ free(cp);
+}
+
+int
+convert(char *msg, char *end, char *buf, int bufsize, int isbody) /* canonicalize [msg, end) into buf; returns 1 if a base64 transfer-encoding header was seen (header mode only) */
+{
+
+ char *p;
+ int c, lastc, base64;
+
+ lastc = 0;
+ base64 = 0;
+ while(msg < end && bufsize > 0){ /* bufsize counts the characters that may still be stored */
+ c = *msg++;
+
+ /*
+ * In the body only, try to strip most HTML and
+ * replace certain MIME escape sequences with the character
+ */
+ if(isbody) {
+ do{
+ p = msg;
+ if(c == '<')
+ c = htmlchk(&msg, end);
+ if(c == '=')
+ c = escape(&msg);
+ } while(p != msg && p < end); /* repeat while input was consumed, so the new c is examined in turn */
+ }
+ switch(c){
+ case 0:
+ case '\r':
+ continue; /* NULs and carriage returns are dropped */
+ case '\t':
+ case ' ':
+ case '\n':
+ if(lastc == ' ') /* collapse a run of white space to a single blank */
+ continue;
+ c = ' ';
+ break;
+ case 'C': /* check for MIME base 64 encoding in header */
+ case 'c':
+ if(isbody == 0)
+ if(msg < end-32 && *msg == 'o' && msg[1] == 'n') /* "on" is matched case-sensitively, the rest without regard to case */
+ if(cistrncmp(msg+2, "tent-transfer-encoding: base64", 30) == 0)
+ base64 = 1;
+ c = 'c';
+ break;
+ default:
+ c = tolower(c);
+ break;
+ }
+ *buf++ = c;
+ lastc = c;
+ bufsize--;
+ }
+ *buf = 0; /* written after up to bufsize characters, so buf must hold bufsize+1 bytes */
+ return base64;
+}
+
+/*
+ * The pattern parser: build data structures from the pattern file
+ */
+
+static int
+hash(int c) /* bucket index for a pattern's first character */
+{
+ return c & 127; /* keeps the low 7 bits, so the result is in 0..127 */
+}
+
+static int
+findkey(char *val) /* action code for the name val, or Nactions if val is not in keywords */
+{
+ Keyword *kp;
+
+ for(kp = keywords; kp->string; kp++)
+ if(strcmp(val, kp->string) == 0) /* exact, case-sensitive comparison */
+ break;
+ return kp->value; /* on a miss kp is the terminator entry, whose value is Nactions */
+}
+
+#define whitespace(c) ((c) == ' ' || (c) == '\t') /* true for blank or tab; note that c is evaluated twice */
+
+void
+parsepats(Biobuf *bp) /* read the pattern file from bp and add each pattern to the patterns table entry for its action */
+{
+ Pattern *p, *new;
+ char *cp, *qp;
+ int type, action, n, h;
+ Spat *spat;
+
+ for(;;){
+ cp = Brdline(bp, '\n');
+ if(cp == 0)
+ break;
+ cp[Blinelen(bp)-1] = 0; /* overwrite the last byte of the line (the delimiter) with a NUL */
+ while(*cp == ' ' || *cp == '\t')
+ cp++;
+ if(*cp == '#' || *cp == 0) /* skip comment lines and blank lines */
+ continue;
+ type = regexp;
+ if(*cp == '*'){ /* a leading '*' marks a literal string pattern instead of a regexp */
+ type = string;
+ cp++;
+ }
+ qp = strchr(cp, ':'); /* the action name runs up to the first ':' */
+ if(qp == 0)
+ continue;
+ *qp = 0;
+ if(debug)
+ fprint(2, "action = %s\n", cp);
+ action = findkey(cp); /* name is matched as written; extract has not lower-cased it */
+ if(action >= Nactions) /* unknown action name: ignore the line */
+ continue;
+ cp = qp+1;
+ n = extract(cp); /* canonicalize the pattern text in place; n is its length */
+ if(n <= 0 || *cp == 0)
+ continue;
+
+ qp = strstr(cp, "~~"); /* "~~" separates the pattern from its alternatives */
+ if(qp){
+ *qp = 0;
+ n = strlen(cp);
+ }
+ if(debug)
+ fprint(2, " Pattern: `%s'\n", cp);
+
+ /* Hook regexps into a chain */
+ if(type == regexp) {
+ new = Malloc(sizeof(Pattern));
+ new->action = action;
+ new->pat = regcomp(cp);
+ if(new->pat == 0){ /* pattern did not compile: drop it */
+ free(new);
+ continue;
+ }
+ new->type = regexp;
+ new->alt = 0;
+ new->next = 0;
+
+ if(qp)
+ parsealt(bp, qp+2, &new->alt);
+
+ new->next = patterns[action].regexps; /* push on the front of this action's regexp chain */
+ patterns[action].regexps = new;
+ continue;
+
+ }
+ /* not a Regexp - hook strings into Pattern hash chain */
+ spat = Malloc(sizeof(*spat));
+ spat->next = 0;
+ spat->alt = 0;
+ spat->len = n;
+ spat->string = Malloc(n+1);
+ spat->c1 = cp[1]; /* second character, used by matchpat as a quick pre-check */
+ strcpy(spat->string, cp);
+
+ if(qp)
+ parsealt(bp, qp+2, &spat->alt);
+
+ p = patterns[action].strings;
+ if(p == 0) { /* first string pattern for this action: create the Pattern that holds the hash chains */
+ p = Malloc(sizeof(Pattern));
+ memset(p, 0, sizeof(*p));
+ p->action = action;
+ p->type = string;
+ patterns[action].strings = p;
+ }
+ h = hash(*spat->string); /* chain is selected by the pattern's first character */
+ spat->next = p->spat[h];
+ p->spat[h] = spat;
+ }
+}
+
+static void
+parsealt(Biobuf *bp, char *cp, Spat** head) /* push each non-empty "~~"-separated alternative starting at cp onto the list *head */
+{
+ char *p;
+ Spat *alt;
+
+ while(cp){
+ if(*cp == 0){ /*escaped newline*/
+ do{ /* the text ended right after a "~~": continue with the next usable line from bp */
+ cp = Brdline(bp, '\n');
+ if(cp == 0)
+ return;
+ cp[Blinelen(bp)-1] = 0;
+ } while(extract(cp) <= 0 || *cp == 0); /* skip lines that are empty once canonicalized */
+ }
+
+ p = cp;
+ cp = strstr(p, "~~"); /* nil here ends the outer loop after this alternative */
+ if(cp){
+ *cp = 0;
+ cp += 2;
+ }
+ if(strlen(p)){
+ alt = Malloc(sizeof(*alt));
+ alt->string = strdup(p); /* only string and next are set on an alternative */
+ alt->next = *head;
+ *head = alt;
+ }
+ }
+}
+
+static int
+extract(char *cp) /* canonicalize the pattern text at cp in place and return the resulting length */
+{
+ int c;
+ char *p, *q, *r;
+
+ p = q = r = cp; /* p reads, q writes, r marks the end of the last quoted string */
+ while(whitespace(*p)) /* drop leading blanks and tabs */
+ p++;
+ while(c = *p++){
+ if (c == '#') /* an unquoted '#' starts a comment: stop */
+ break;
+ if(c == '"'){
+ while(*p && *p != '"'){ /* copy the quoted text without its quotes */
+ if(*p == '\\' && p[1] == '"') /* backslash-quote stands for a literal quote */
+ p++;
+ if('A' <= *p && *p <= 'Z')
+ *q++ = *p++ + ('a'-'A');
+ else
+ *q++ = *p++;
+ }
+ if(*p) /* step over the closing quote if there is one */
+ p++;
+ r = q; /* never back up over a quoted string */
+ } else {
+ if('A' <= c && c <= 'Z') /* fold upper case to lower case */
+ c += ('a'-'A');
+ *q++ = c;
+ }
+ }
+ while(q > r && whitespace(q[-1])) /* trim trailing blanks and tabs, but not inside quoted text */
+ q--;
+ *q = 0;
+ return q-cp;
+}
+
+/*
+ * The matching engine: compare canonical input to pattern structures
+ */
+
+static Spat*
+isalt(char *message, Spat *alt) /* first alternative whose string occurs in cmd, header+1 or message, or nil; matchpat treats a hit as vetoing the match */
+{
+ while(alt) {
+ if(*cmd) /* cmd and header are defined outside this file; cmd is searched only when non-empty */
+ if(message != cmd && strstr(cmd, alt->string))
+ break;
+ if(message != header+1 && strstr(header+1, alt->string)) /* skipped when message is itself header+1; the next test covers that case */
+ break;
+ if(strstr(message, alt->string))
+ break;
+ alt = alt->next;
+ }
+ return alt;
+}
+
+int
+matchpat(Pattern *p, char *message, Resub *m) /* 1 if pattern p matches message and no alternative vetoes it, else 0; m describes the match */
+{
+ Spat *spat;
+ char *s;
+ int c, c1;
+
+ if(p->type == string){
+ c1 = *message;
+ for(s=message; c=c1; s++){ /* try every starting position until the terminating NUL */
+ c1 = s[1];
+ for(spat=p->spat[hash(c)]; spat; spat=spat->next){ /* candidates are chained by first character */
+ if(c1 == spat->c1) /* cheap second-character check before the full compare */
+ if(memcmp(s, spat->string, spat->len) == 0)
+ if(!isalt(message, spat->alt)){
+ m->sp = s;
+ m->ep = s + spat->len;
+ return 1;
+ }
+ }
+ }
+ return 0;
+ }
+ m->sp = m->ep = 0; /* regexp case: clear m before handing it to regexec */
+ if(regexec(p->pat, message, m, 1) == 0)
+ return 0;
+ if(isalt(message, p->alt)) /* an alternative found in the text cancels the match */
+ return 0;
+ return 1;
+}
+
+
+void
+xprint(int fd, char *type, Resub *m) /* print type and the match in m with surrounding context to fd, as before~match~after */
+{
+ char *p, *q;
+ int i;
+
+ if(m->sp == 0 || m->ep == 0) /* nothing recorded in m: print nothing */
+ return;
+
+ /* back up approx 30 characters to whitespace */
+ for(p = m->sp, i = 0; *p && i < 30; i++, p--) /* NOTE(review): stops only at a NUL, so it assumes a NUL byte precedes the text - confirm against callers */
+ ;
+ while(*p && *p != ' ')
+ p--;
+ p++; /* step forward off the blank or NUL that stopped the scan */
+
+ /* grab about 30 more chars beyond the end of the match */
+ for(q = m->ep, i = 0; *q && i < 30; i++, q++)
+ ;
+ while(*q && *q != ' ') /* extend to the next blank or the end of the text */
+ q++;
+
+ fprint(fd, "%s %.*s~%.*s~%.*s\n", type, (int)(m->sp-p), p, (int)(m->ep-m->sp), m->sp, (int)(q-m->ep), m->ep);
+}
+
+enum {
+ INVAL= 255 /* t64d entry for a byte value that has no digit value assigned */
+};
+
+static uchar t64d[256] = { /* byte value to base 64 digit value: A-Z 0..25, a-z 26..51, 0-9 52..61, '+' 62, '/' 63, INVAL otherwise; not referenced elsewhere in the visible source */
+/*00 */ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+/*10*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+/*20*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+ INVAL, INVAL, INVAL, 62, INVAL, INVAL, INVAL, 63,
+/*30*/ 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+/*40*/ INVAL, 0, 1, 2, 3, 4, 5, 6,
+ 7, 8, 9, 10, 11, 12, 13, 14,
+/*50*/ 15, 16, 17, 18, 19, 20, 21, 22,
+ 23, 24, 25, INVAL, INVAL, INVAL, INVAL, INVAL,
+/*60*/ INVAL, 26, 27, 28, 29, 30, 31, 32,
+ 33, 34, 35, 36, 37, 38, 39, 40,
+/*70*/ 41, 42, 43, 44, 45, 46, 47, 48,
+ 49, 50, 51, INVAL, INVAL, INVAL, INVAL, INVAL,
+/*80*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+/*90*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+/*A0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+/*B0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+/*C0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+/*D0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+/*E0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+/*F0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
+};