diff options
author | Taru Karttunen <taruti@taruti.net> | 2011-03-30 15:46:40 +0300 |
---|---|---|
committer | Taru Karttunen <taruti@taruti.net> | 2011-03-30 15:46:40 +0300 |
commit | e5888a1ffdae813d7575f5fb02275c6bb07e5199 (patch) | |
tree | d8d51eac403f07814b9e936eed0c9a79195e2450 /sys/src/cmd/upas/scanmail/common.c |
Import sources from 2011-03-30 iso image
Diffstat (limited to 'sys/src/cmd/upas/scanmail/common.c')
-rwxr-xr-x | sys/src/cmd/upas/scanmail/common.c | 667 |
1 files changed, 667 insertions, 0 deletions
diff --git a/sys/src/cmd/upas/scanmail/common.c b/sys/src/cmd/upas/scanmail/common.c new file mode 100755 index 000000000..b6ea720d1 --- /dev/null +++ b/sys/src/cmd/upas/scanmail/common.c @@ -0,0 +1,667 @@ +#include <u.h> +#include <libc.h> +#include <bio.h> +#include <regexp.h> +#include "spam.h" + +enum { + Quanta = 8192, + Minbody = 6000, + HdrMax = 15, +}; + +typedef struct keyword Keyword; +typedef struct word Word; + +struct word{ + char *string; + int n; +}; + +struct keyword{ + char *string; + int value; +}; + +Word htmlcmds[] = +{ + "html", 4, + "!doctype html", 13, + 0, + +}; + +Word hrefs[] = +{ + "a href=", 7, + "a title=", 8, + "a target=", 9, + "base href=", 10, + "img src=", 8, + "img border=", 11, + "form action=", 12, + "!--", 3, + 0, + +}; + +/* + * RFC822 header keywords to look for for fractured header. + * all lengths must be less than HdrMax defined above. + */ +Word hdrwords[] = +{ + "cc:", 3, + "bcc:", 4, + "to:", 3, + 0, 0, + +}; + +Keyword keywords[] = +{ + "header", HoldHeader, + "line", SaveLine, + "hold", Hold, + "dump", Dump, + "loff", Lineoff, + 0, Nactions, +}; + +Patterns patterns[] = { +[Dump] { "DUMP:", 0, 0 }, +[HoldHeader] { "HEADER:", 0, 0 }, +[Hold] { "HOLD:", 0, 0 }, +[SaveLine] { "LINE:", 0, 0 }, +[Lineoff] { "LINEOFF:", 0, 0 }, +[Nactions] { 0, 0, 0 }, +}; + +static char* endofhdr(char*, char*); +static int escape(char**); +static int extract(char*); +static int findkey(char*); +static int hash(int); +static int isword(Word*, char*, int); +static void parsealt(Biobuf*, char*, Spat**); + +/* + * The canonicalizer: convert input to canonical representation + */ +char* +readmsg(Biobuf *bp, int *hsize, int *bufsize) +{ + char *p, *buf; + int n, offset, eoh, bsize, delta; + + buf = 0; + offset = 0; + if(bufsize) + *bufsize = 0; + if(hsize) + *hsize = 0; + for(;;) { + buf = Realloc(buf, offset+Quanta+1); + n = Bread(bp, buf+offset, Quanta); + if(n < 0){ + free(buf); + return 0; + } + p = buf+offset; /* start of this chunk */ + offset += n; /* end of this chunk */ + buf[offset] = 0; + if(n == 0){ + if(offset == 0) + return 0; + break; + } + + if(hsize == 0) /* don't process header */ + break; + if(p != buf && p[-1] == '\n') /* check for EOH across buffer split */ + p--; + p = endofhdr(p, buf+offset); + if(p) + break; + if(offset >= Maxread) /* gargantuan header - just punt*/ + { + if(hsize) + *hsize = offset; + if(bufsize) + *bufsize = offset; + return buf; + } + } + eoh = p-buf; /* End of header */ + bsize = offset - eoh; /* amount of body already read */ + + /* Read at least Minbody bytes of the body */ + if (bsize < Minbody){ + delta = Minbody-bsize; + buf = Realloc(buf, offset+delta+1); + n = Bread(bp, buf+offset, delta); + if(n > 0) { + offset += n; + buf[offset] = 0; + } + } + if(hsize) + *hsize = eoh; + if(bufsize) + *bufsize = offset; + return buf; +} + +static int +isword(Word *wp, char *text, int len) +{ + for(;wp->string; wp++) + if(len >= wp->n && strncmp(text, wp->string, wp->n) == 0) + return 1; + return 0; +} + +static char* +endofhdr(char *raw, char *end) +{ + int i; + char *p, *q; + char buf[HdrMax]; + + /* + * can't use strchr to search for newlines because + * there may be embedded NULL's. + */ + for(p = raw; p < end; p++){ + if(*p != '\n' || p[1] != '\n') + continue; + p++; + for(i = 0, q = p+1; i < sizeof(buf) && *q; q++){ + buf[i++] = tolower(*q); + if(*q == ':' || *q == '\n') + break; + } + if(!isword(hdrwords, buf, i)) + return p+1; + } + return 0; +} + +static int +htmlmatch(Word *wp, char *text, char *end, int *n) +{ + char *cp; + int i, c, lastc; + char buf[MaxHtml]; + + /* + * extract a string up to '>' + */ + + i = lastc = 0; + cp = text; + while (cp < end && i < sizeof(buf)-1){ + c = *cp++; + if(c == '=') + c = escape(&cp); + switch(c){ + case 0: + case '\r': + continue; + case '>': + goto out; + case '\n': + case ' ': + case '\t': + if(lastc == ' ') + continue; + c = ' '; + break; + default: + c = tolower(c); + break; + } + buf[i++] = lastc = c; + } +out: + buf[i] = 0; + if(n) + *n = cp-text; + return isword(wp, buf, i); +} + +static int +escape(char **msg) +{ + int c; + char *p; + + p = *msg; + c = *p; + if(c == '\n'){ + p++; + c = *p++; + } else + if(c == '2'){ + c = tolower(p[1]); + if(c == 'e'){ + p += 2; + c = '.'; + }else + if(c == 'f'){ + p += 2; + c = '/'; + }else + if(c == '0'){ + p += 2; + c = ' '; + } + else c = '='; + } else { + if(c == '3' && tolower(p[1]) == 'd') + p += 2; + c = '='; + } + *msg = p; + return c; +} + +static int +htmlchk(char **msg, char *end) +{ + int n; + char *p; + + static int ishtml; + + p = *msg; + if(ishtml == 0){ + ishtml = htmlmatch(htmlcmds, p, end, &n); + + /* If not an HTML keyword, check if it's + * an HTML comment (<!comment>). if so, + * skip over it; otherwise copy it in. + */ + if(ishtml == 0 && *p != '!') /* not comment */ + return '<'; /* copy it */ + + } else if(htmlmatch(hrefs, p, end, &n)) /* if special HTML string */ + return '<'; /* copy it */ + + /* + * this is an uninteresting HTML command; skip over it. + */ + p += n; + *msg = p+1; + return *p; +} + +/* + * decode a base 64 encode body + */ +void +conv64(char *msg, char *end, char *buf, int bufsize) +{ + int len, i; + char *cp; + + len = end - msg; + i = (len*3)/4+1; // room for max chars + null + cp = Malloc(i); + len = dec64((uchar*)cp, i, msg, len); + convert(cp, cp+len, buf, bufsize, 1); + free(cp); +} + +int +convert(char *msg, char *end, char *buf, int bufsize, int isbody) +{ + + char *p; + int c, lastc, base64; + + lastc = 0; + base64 = 0; + while(msg < end && bufsize > 0){ + c = *msg++; + + /* + * In the body only, try to strip most HTML and + * replace certain MIME escape sequences with the character + */ + if(isbody) { + do{ + p = msg; + if(c == '<') + c = htmlchk(&msg, end); + if(c == '=') + c = escape(&msg); + } while(p != msg && p < end); + } + switch(c){ + case 0: + case '\r': + continue; + case '\t': + case ' ': + case '\n': + if(lastc == ' ') + continue; + c = ' '; + break; + case 'C': /* check for MIME base 64 encoding in header */ + case 'c': + if(isbody == 0) + if(msg < end-32 && *msg == 'o' && msg[1] == 'n') + if(cistrncmp(msg+2, "tent-transfer-encoding: base64", 30) == 0) + base64 = 1; + c = 'c'; + break; + default: + c = tolower(c); + break; + } + *buf++ = c; + lastc = c; + bufsize--; + } + *buf = 0; + return base64; +} + +/* + * The pattern parser: build data structures from the pattern file + */ + +static int +hash(int c) +{ + return c & 127; +} + +static int +findkey(char *val) +{ + Keyword *kp; + + for(kp = keywords; kp->string; kp++) + if(strcmp(val, kp->string) == 0) + break; + return kp->value; +} + +#define whitespace(c) ((c) == ' ' || (c) == '\t') + +void +parsepats(Biobuf *bp) +{ + Pattern *p, *new; + char *cp, *qp; + int type, action, n, h; + Spat *spat; + + for(;;){ + cp = Brdline(bp, '\n'); + if(cp == 0) + break; + cp[Blinelen(bp)-1] = 0; + while(*cp == ' ' || *cp == '\t') + cp++; + if(*cp == '#' || *cp == 0) + continue; + type = regexp; + if(*cp == '*'){ + type = string; + cp++; + } + qp = strchr(cp, ':'); + if(qp == 0) + continue; + *qp = 0; + if(debug) + fprint(2, "action = %s\n", cp); + action = findkey(cp); + if(action >= Nactions) + continue; + cp = qp+1; + n = extract(cp); + if(n <= 0 || *cp == 0) + continue; + + qp = strstr(cp, "~~"); + if(qp){ + *qp = 0; + n = strlen(cp); + } + if(debug) + fprint(2, " Pattern: `%s'\n", cp); + + /* Hook regexps into a chain */ + if(type == regexp) { + new = Malloc(sizeof(Pattern)); + new->action = action; + new->pat = regcomp(cp); + if(new->pat == 0){ + free(new); + continue; + } + new->type = regexp; + new->alt = 0; + new->next = 0; + + if(qp) + parsealt(bp, qp+2, &new->alt); + + new->next = patterns[action].regexps; + patterns[action].regexps = new; + continue; + + } + /* not a Regexp - hook strings into Pattern hash chain */ + spat = Malloc(sizeof(*spat)); + spat->next = 0; + spat->alt = 0; + spat->len = n; + spat->string = Malloc(n+1); + spat->c1 = cp[1]; + strcpy(spat->string, cp); + + if(qp) + parsealt(bp, qp+2, &spat->alt); + + p = patterns[action].strings; + if(p == 0) { + p = Malloc(sizeof(Pattern)); + memset(p, 0, sizeof(*p)); + p->action = action; + p->type = string; + patterns[action].strings = p; + } + h = hash(*spat->string); + spat->next = p->spat[h]; + p->spat[h] = spat; + } +} + +static void +parsealt(Biobuf *bp, char *cp, Spat** head) +{ + char *p; + Spat *alt; + + while(cp){ + if(*cp == 0){ /*escaped newline*/ + do{ + cp = Brdline(bp, '\n'); + if(cp == 0) + return; + cp[Blinelen(bp)-1] = 0; + } while(extract(cp) <= 0 || *cp == 0); + } + + p = cp; + cp = strstr(p, "~~"); + if(cp){ + *cp = 0; + cp += 2; + } + if(strlen(p)){ + alt = Malloc(sizeof(*alt)); + alt->string = strdup(p); + alt->next = *head; + *head = alt; + } + } +} + +static int +extract(char *cp) +{ + int c; + char *p, *q, *r; + + p = q = r = cp; + while(whitespace(*p)) + p++; + while(c = *p++){ + if (c == '#') + break; + if(c == '"'){ + while(*p && *p != '"'){ + if(*p == '\\' && p[1] == '"') + p++; + if('A' <= *p && *p <= 'Z') + *q++ = *p++ + ('a'-'A'); + else + *q++ = *p++; + } + if(*p) + p++; + r = q; /* never back up over a quoted string */ + } else { + if('A' <= c && c <= 'Z') + c += ('a'-'A'); + *q++ = c; + } + } + while(q > r && whitespace(q[-1])) + q--; + *q = 0; + return q-cp; +} + +/* + * The matching engine: compare canonical input to pattern structures + */ + +static Spat* +isalt(char *message, Spat *alt) +{ + while(alt) { + if(*cmd) + if(message != cmd && strstr(cmd, alt->string)) + break; + if(message != header+1 && strstr(header+1, alt->string)) + break; + if(strstr(message, alt->string)) + break; + alt = alt->next; + } + return alt; +} + +int +matchpat(Pattern *p, char *message, Resub *m) +{ + Spat *spat; + char *s; + int c, c1; + + if(p->type == string){ + c1 = *message; + for(s=message; c=c1; s++){ + c1 = s[1]; + for(spat=p->spat[hash(c)]; spat; spat=spat->next){ + if(c1 == spat->c1) + if(memcmp(s, spat->string, spat->len) == 0) + if(!isalt(message, spat->alt)){ + m->sp = s; + m->ep = s + spat->len; + return 1; + } + } + } + return 0; + } + m->sp = m->ep = 0; + if(regexec(p->pat, message, m, 1) == 0) + return 0; + if(isalt(message, p->alt)) + return 0; + return 1; +} + + +void +xprint(int fd, char *type, Resub *m) +{ + char *p, *q; + int i; + + if(m->sp == 0 || m->ep == 0) + return; + + /* back up approx 30 characters to whitespace */ + for(p = m->sp, i = 0; *p && i < 30; i++, p--) + ; + while(*p && *p != ' ') + p--; + p++; + + /* grab about 30 more chars beyond the end of the match */ + for(q = m->ep, i = 0; *q && i < 30; i++, q++) + ; + while(*q && *q != ' ') + q++; + + fprint(fd, "%s %.*s~%.*s~%.*s\n", type, (int)(m->sp-p), p, (int)(m->ep-m->sp), m->sp, (int)(q-m->ep), m->ep); +} + +enum { + INVAL= 255 +}; + +static uchar t64d[256] = { +/*00 */ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, + INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, +/*10*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, + INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, +/*20*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, + INVAL, INVAL, INVAL, 62, INVAL, INVAL, INVAL, 63, +/*30*/ 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, +/*40*/ INVAL, 0, 1, 2, 3, 4, 5, 6, + 7, 8, 9, 10, 11, 12, 13, 14, +/*50*/ 15, 16, 17, 18, 19, 20, 21, 22, + 23, 24, 25, INVAL, INVAL, INVAL, INVAL, INVAL, +/*60*/ INVAL, 26, 27, 28, 29, 30, 31, 32, + 33, 34, 35, 36, 37, 38, 39, 40, +/*70*/ 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, INVAL, INVAL, INVAL, INVAL, INVAL, +/*80*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, + INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, +/*90*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, + INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, +/*A0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, + INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, +/*B0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, + INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, +/*C0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, + INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, +/*D0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, + INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, +/*E0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, + INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, +/*F0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, + INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, +}; |