summaryrefslogtreecommitdiff
path: root/sys/src/libregexp
diff options
context:
space:
mode:
authorTaru Karttunen <taruti@taruti.net>2011-03-30 15:46:40 +0300
committerTaru Karttunen <taruti@taruti.net>2011-03-30 15:46:40 +0300
commite5888a1ffdae813d7575f5fb02275c6bb07e5199 (patch)
treed8d51eac403f07814b9e936eed0c9a79195e2450 /sys/src/libregexp
Import sources from 2011-03-30 iso image
Diffstat (limited to 'sys/src/libregexp')
-rwxr-xr-xsys/src/libregexp/mkfile28
-rwxr-xr-xsys/src/libregexp/regaux.c113
-rwxr-xr-xsys/src/libregexp/regcomp.c567
-rwxr-xr-xsys/src/libregexp/regcomp.h63
-rwxr-xr-xsys/src/libregexp/regerror.c15
-rwxr-xr-xsys/src/libregexp/regexec.c232
-rwxr-xr-xsys/src/libregexp/regsub.c63
-rwxr-xr-xsys/src/libregexp/rregexec.c212
-rwxr-xr-xsys/src/libregexp/rregsub.c64
9 files changed, 1357 insertions, 0 deletions
diff --git a/sys/src/libregexp/mkfile b/sys/src/libregexp/mkfile
new file mode 100755
index 000000000..dcc457028
--- /dev/null
+++ b/sys/src/libregexp/mkfile
@@ -0,0 +1,28 @@
+</$objtype/mkfile
+
+LIB=/$objtype/lib/libregexp.a
+OFILES=\
+ regcomp.$O\
+ regerror.$O\
+ regexec.$O\
+ regsub.$O\
+ regaux.$O\
+ rregexec.$O\
+ rregsub.$O\
+
+HFILES=/sys/include/regexp.h\
+ regcomp.h\
+
+UPDATE=\
+ mkfile\
+ $HFILES\
+ ${OFILES:%.$O=%.c}\
+ ${LIB:/$objtype/%=/386/%}\
+
+</sys/src/cmd/mksyslib
+
+test: test.$O $OFILES
+ $LD -o test $prereq
+
+test2: test2.$O $OFILES
+ $LD -o test2 $prereq
diff --git a/sys/src/libregexp/regaux.c b/sys/src/libregexp/regaux.c
new file mode 100755
index 000000000..a7d52eca8
--- /dev/null
+++ b/sys/src/libregexp/regaux.c
@@ -0,0 +1,113 @@
+#include <u.h>
+#include <libc.h>
+#include "regexp.h"
+#include "regcomp.h"
+
+
+/*
+ * save a new match in mp
+ */
+extern void
+_renewmatch(Resub *mp, int ms, Resublist *sp)
+{
+ int i;
+
+ if(mp==0 || ms<=0)
+ return;
+ if(mp[0].sp==0 || sp->m[0].sp<mp[0].sp ||
+ (sp->m[0].sp==mp[0].sp && sp->m[0].ep>mp[0].ep)){
+ for(i=0; i<ms && i<NSUBEXP; i++)
+ mp[i] = sp->m[i];
+ for(; i<ms; i++)
+ mp[i].sp = mp[i].ep = 0;
+ }
+}
+
+/*
+ * Note optimization in _renewthread:
+ * *lp must be pending when _renewthread called; if *l has been looked
+ * at already, the optimization is a bug.
+ */
+extern Relist*
+_renewthread(Relist *lp, /* _relist to add to */
+ Reinst *ip, /* instruction to add */
+ int ms,
+ Resublist *sep) /* pointers to subexpressions */
+{
+ Relist *p;
+
+ for(p=lp; p->inst; p++){
+ if(p->inst == ip){
+ if(sep->m[0].sp < p->se.m[0].sp){
+ if(ms > 1)
+ p->se = *sep;
+ else
+ p->se.m[0] = sep->m[0];
+ }
+ return 0;
+ }
+ }
+ p->inst = ip;
+ if(ms > 1)
+ p->se = *sep;
+ else
+ p->se.m[0] = sep->m[0];
+ (++p)->inst = 0;
+ return p;
+}
+
+/*
+ * same as renewthread, but called with
+ * initial empty start pointer.
+ */
+extern Relist*
+_renewemptythread(Relist *lp, /* _relist to add to */
+ Reinst *ip, /* instruction to add */
+ int ms,
+ char *sp) /* pointers to subexpressions */
+{
+ Relist *p;
+
+ for(p=lp; p->inst; p++){
+ if(p->inst == ip){
+ if(sp < p->se.m[0].sp) {
+ if(ms > 1)
+ memset(&p->se, 0, sizeof(p->se));
+ p->se.m[0].sp = sp;
+ }
+ return 0;
+ }
+ }
+ p->inst = ip;
+ if(ms > 1)
+ memset(&p->se, 0, sizeof(p->se));
+ p->se.m[0].sp = sp;
+ (++p)->inst = 0;
+ return p;
+}
+
+extern Relist*
+_rrenewemptythread(Relist *lp, /* _relist to add to */
+ Reinst *ip, /* instruction to add */
+ int ms,
+ Rune *rsp) /* pointers to subexpressions */
+{
+ Relist *p;
+
+ for(p=lp; p->inst; p++){
+ if(p->inst == ip){
+ if(rsp < p->se.m[0].rsp) {
+ if(ms > 1)
+ memset(&p->se, 0, sizeof(p->se));
+ p->se.m[0].rsp = rsp;
+ }
+ return 0;
+ }
+ }
+ p->inst = ip;
+ if(ms > 1)
+ memset(&p->se, 0, sizeof(p->se));
+ p->se.m[0].rsp = rsp;
+ (++p)->inst = 0;
+ return p;
+}
diff --git a/sys/src/libregexp/regcomp.c b/sys/src/libregexp/regcomp.c
new file mode 100755
index 000000000..e48ef15ac
--- /dev/null
+++ b/sys/src/libregexp/regcomp.c
@@ -0,0 +1,567 @@
+#include <u.h>
+#include <libc.h>
+#include "regexp.h"
+#include "regcomp.h"
+
+#define TRUE 1
+#define FALSE 0
+
+/*
+ * Parser Information
+ */
+typedef
+struct Node
+{
+ Reinst* first;
+ Reinst* last;
+}Node;
+
+/* max character classes per program is nelem(reprog->class) */
+static Reprog *reprog;
+
+/* max rune ranges per character class is nelem(classp->spans)/2 */
+#define NCCRUNE nelem(classp->spans)
+
+#define NSTACK 20
+static Node andstack[NSTACK];
+static Node *andp;
+static int atorstack[NSTACK];
+static int* atorp;
+static int cursubid; /* id of current subexpression */
+static int subidstack[NSTACK]; /* parallel to atorstack */
+static int* subidp;
+static int lastwasand; /* Last token was operand */
+static int nbra;
+static char* exprp; /* pointer to next character in source expression */
+static int lexdone;
+static int nclass;
+static Reclass*classp;
+static Reinst* freep;
+static int errors;
+static Rune yyrune; /* last lex'd rune */
+static Reclass*yyclassp; /* last lex'd class */
+
+/* predeclared crap */
+static void operator(int);
+static void pushand(Reinst*, Reinst*);
+static void pushator(int);
+static void evaluntil(int);
+static int bldcclass(void);
+
+static jmp_buf regkaboom;
+
+static void
+rcerror(char *s)
+{
+ errors++;
+ regerror(s);
+ longjmp(regkaboom, 1);
+}
+
+static Reinst*
+newinst(int t)
+{
+ freep->type = t;
+ freep->left = 0;
+ freep->right = 0;
+ return freep++;
+}
+
+static void
+operand(int t)
+{
+ Reinst *i;
+
+ if(lastwasand)
+ operator(CAT); /* catenate is implicit */
+ i = newinst(t);
+
+ if(t == CCLASS || t == NCCLASS)
+ i->cp = yyclassp;
+ if(t == RUNE)
+ i->r = yyrune;
+
+ pushand(i, i);
+ lastwasand = TRUE;
+}
+
+static void
+operator(int t)
+{
+ if(t==RBRA && --nbra<0)
+ rcerror("unmatched right paren");
+ if(t==LBRA){
+ if(++cursubid >= NSUBEXP)
+ rcerror ("too many subexpressions");
+ nbra++;
+ if(lastwasand)
+ operator(CAT);
+ } else
+ evaluntil(t);
+ if(t != RBRA)
+ pushator(t);
+ lastwasand = FALSE;
+ if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
+ lastwasand = TRUE; /* these look like operands */
+}
+
+static void
+regerr2(char *s, int c)
+{
+ char buf[100];
+ char *cp = buf;
+ while(*s)
+ *cp++ = *s++;
+ *cp++ = c;
+ *cp = '\0';
+ rcerror(buf);
+}
+
+static void
+cant(char *s)
+{
+ char buf[100];
+ strcpy(buf, "can't happen: ");
+ strcat(buf, s);
+ rcerror(buf);
+}
+
+static void
+pushand(Reinst *f, Reinst *l)
+{
+ if(andp >= &andstack[NSTACK])
+ cant("operand stack overflow");
+ andp->first = f;
+ andp->last = l;
+ andp++;
+}
+
+static void
+pushator(int t)
+{
+ if(atorp >= &atorstack[NSTACK])
+ cant("operator stack overflow");
+ *atorp++ = t;
+ *subidp++ = cursubid;
+}
+
+static Node*
+popand(int op)
+{
+ Reinst *inst;
+
+ if(andp <= &andstack[0]){
+ regerr2("missing operand for ", op);
+ inst = newinst(NOP);
+ pushand(inst,inst);
+ }
+ return --andp;
+}
+
+static int
+popator(void)
+{
+ if(atorp <= &atorstack[0])
+ cant("operator stack underflow");
+ --subidp;
+ return *--atorp;
+}
+
+static void
+evaluntil(int pri)
+{
+ Node *op1, *op2;
+ Reinst *inst1, *inst2;
+
+ while(pri==RBRA || atorp[-1]>=pri){
+ switch(popator()){
+ default:
+ rcerror("unknown operator in evaluntil");
+ break;
+ case LBRA: /* must have been RBRA */
+ op1 = popand('(');
+ inst2 = newinst(RBRA);
+ inst2->subid = *subidp;
+ op1->last->next = inst2;
+ inst1 = newinst(LBRA);
+ inst1->subid = *subidp;
+ inst1->next = op1->first;
+ pushand(inst1, inst2);
+ return;
+ case OR:
+ op2 = popand('|');
+ op1 = popand('|');
+ inst2 = newinst(NOP);
+ op2->last->next = inst2;
+ op1->last->next = inst2;
+ inst1 = newinst(OR);
+ inst1->right = op1->first;
+ inst1->left = op2->first;
+ pushand(inst1, inst2);
+ break;
+ case CAT:
+ op2 = popand(0);
+ op1 = popand(0);
+ op1->last->next = op2->first;
+ pushand(op1->first, op2->last);
+ break;
+ case STAR:
+ op2 = popand('*');
+ inst1 = newinst(OR);
+ op2->last->next = inst1;
+ inst1->right = op2->first;
+ pushand(inst1, inst1);
+ break;
+ case PLUS:
+ op2 = popand('+');
+ inst1 = newinst(OR);
+ op2->last->next = inst1;
+ inst1->right = op2->first;
+ pushand(op2->first, inst1);
+ break;
+ case QUEST:
+ op2 = popand('?');
+ inst1 = newinst(OR);
+ inst2 = newinst(NOP);
+ inst1->left = inst2;
+ inst1->right = op2->first;
+ op2->last->next = inst2;
+ pushand(inst1, inst2);
+ break;
+ }
+ }
+}
+
+static Reprog*
+optimize(Reprog *pp)
+{
+ Reinst *inst, *target;
+ int size;
+ Reprog *npp;
+ Reclass *cl;
+ int diff;
+
+ /*
+ * get rid of NOOP chains
+ */
+ for(inst=pp->firstinst; inst->type!=END; inst++){
+ target = inst->next;
+ while(target->type == NOP)
+ target = target->next;
+ inst->next = target;
+ }
+
+ /*
+ * The original allocation is for an area larger than
+ * necessary. Reallocate to the actual space used
+ * and then relocate the code.
+ */
+ size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);
+ npp = realloc(pp, size);
+ if(npp==0 || npp==pp)
+ return pp;
+ diff = (char *)npp - (char *)pp;
+ freep = (Reinst *)((char *)freep + diff);
+ for(inst=npp->firstinst; inst<freep; inst++){
+ switch(inst->type){
+ case OR:
+ case STAR:
+ case PLUS:
+ case QUEST:
+ *(char **)&inst->right += diff;
+ break;
+ case CCLASS:
+ case NCCLASS:
+ *(char **)&inst->right += diff;
+ cl = inst->cp;
+ *(char **)&cl->end += diff;
+ break;
+ }
+ *(char **)&inst->left += diff;
+ }
+ *(char **)&npp->startinst += diff;
+ return npp;
+}
+
+#ifdef DEBUG
+static void
+dumpstack(void){
+ Node *stk;
+ int *ip;
+
+ print("operators\n");
+ for(ip=atorstack; ip<atorp; ip++)
+ print("0%o\n", *ip);
+ print("operands\n");
+ for(stk=andstack; stk<andp; stk++)
+ print("0%o\t0%o\n", stk->first->type, stk->last->type);
+}
+
+static void
+dump(Reprog *pp)
+{
+ Reinst *l;
+ Rune *p;
+
+ l = pp->firstinst;
+ do{
+ print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
+ l->left-pp->firstinst, l->right-pp->firstinst);
+ if(l->type == RUNE)
+ print("\t%C\n", l->r);
+ else if(l->type == CCLASS || l->type == NCCLASS){
+ print("\t[");
+ if(l->type == NCCLASS)
+ print("^");
+ for(p = l->cp->spans; p < l->cp->end; p += 2)
+ if(p[0] == p[1])
+ print("%C", p[0]);
+ else
+ print("%C-%C", p[0], p[1]);
+ print("]\n");
+ } else
+ print("\n");
+ }while(l++->type);
+}
+#endif
+
+static Reclass*
+newclass(void)
+{
+ if(nclass >= nelem(reprog->class))
+ rcerror("too many character classes; increase Reprog.class size");
+ return &(classp[nclass++]);
+}
+
+static int
+nextc(Rune *rp)
+{
+ if(lexdone){
+ *rp = 0;
+ return 1;
+ }
+ exprp += chartorune(rp, exprp);
+ if(*rp == L'\\'){
+ exprp += chartorune(rp, exprp);
+ return 1;
+ }
+ if(*rp == 0)
+ lexdone = 1;
+ return 0;
+}
+
+static int
+lex(int literal, int dot_type)
+{
+ int quoted;
+
+ quoted = nextc(&yyrune);
+ if(literal || quoted){
+ if(yyrune == 0)
+ return END;
+ return RUNE;
+ }
+
+ switch(yyrune){
+ case 0:
+ return END;
+ case L'*':
+ return STAR;
+ case L'?':
+ return QUEST;
+ case L'+':
+ return PLUS;
+ case L'|':
+ return OR;
+ case L'.':
+ return dot_type;
+ case L'(':
+ return LBRA;
+ case L')':
+ return RBRA;
+ case L'^':
+ return BOL;
+ case L'$':
+ return EOL;
+ case L'[':
+ return bldcclass();
+ }
+ return RUNE;
+}
+
+static int
+bldcclass(void)
+{
+ int type;
+ Rune r[NCCRUNE];
+ Rune *p, *ep, *np;
+ Rune rune;
+ int quoted;
+
+ /* we have already seen the '[' */
+ type = CCLASS;
+ yyclassp = newclass();
+
+ /* look ahead for negation */
+ /* SPECIAL CASE!!! negated classes don't match \n */
+ ep = r;
+ quoted = nextc(&rune);
+ if(!quoted && rune == L'^'){
+ type = NCCLASS;
+ quoted = nextc(&rune);
+ *ep++ = L'\n';
+ *ep++ = L'\n';
+ }
+
+ /* parse class into a set of spans */
+ while(ep < &r[NCCRUNE-1]){
+ if(rune == 0){
+ rcerror("malformed '[]'");
+ return 0;
+ }
+ if(!quoted && rune == L']')
+ break;
+ if(!quoted && rune == L'-'){
+ if(ep == r){
+ rcerror("malformed '[]'");
+ return 0;
+ }
+ quoted = nextc(&rune);
+ if((!quoted && rune == L']') || rune == 0){
+ rcerror("malformed '[]'");
+ return 0;
+ }
+ *(ep-1) = rune;
+ } else {
+ *ep++ = rune;
+ *ep++ = rune;
+ }
+ quoted = nextc(&rune);
+ }
+ if(ep >= &r[NCCRUNE-1]) {
+ rcerror("char class too large; increase Reclass.spans size");
+ return 0;
+ }
+
+ /* sort on span start */
+ for(p = r; p < ep; p += 2){
+ for(np = p; np < ep; np += 2)
+ if(*np < *p){
+ rune = np[0];
+ np[0] = p[0];
+ p[0] = rune;
+ rune = np[1];
+ np[1] = p[1];
+ p[1] = rune;
+ }
+ }
+
+ /* merge spans */
+ np = yyclassp->spans;
+ p = r;
+ if(r == ep)
+ yyclassp->end = np;
+ else {
+ np[0] = *p++;
+ np[1] = *p++;
+ for(; p < ep; p += 2)
+ /* overlapping or adjacent ranges? */
+ if(p[0] <= np[1] + 1){
+ if(p[1] >= np[1])
+ np[1] = p[1]; /* coalesce */
+ } else {
+ np += 2;
+ np[0] = p[0];
+ np[1] = p[1];
+ }
+ yyclassp->end = np+2;
+ }
+
+ return type;
+}
+
+static Reprog*
+regcomp1(char *s, int literal, int dot_type)
+{
+ int token;
+ Reprog *pp;
+
+ /* get memory for the program */
+ pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
+ if(pp == 0){
+ regerror("out of memory");
+ return 0;
+ }
+ freep = pp->firstinst;
+ classp = pp->class;
+ errors = 0;
+
+ if(setjmp(regkaboom))
+ goto out;
+
+ /* go compile the sucker */
+ lexdone = 0;
+ exprp = s;
+ nclass = 0;
+ nbra = 0;
+ atorp = atorstack;
+ andp = andstack;
+ subidp = subidstack;
+ lastwasand = FALSE;
+ cursubid = 0;
+
+ /* Start with a low priority operator to prime parser */
+ pushator(START-1);
+ while((token = lex(literal, dot_type)) != END){
+ if((token&0300) == OPERATOR)
+ operator(token);
+ else
+ operand(token);
+ }
+
+ /* Close with a low priority operator */
+ evaluntil(START);
+
+ /* Force END */
+ operand(END);
+ evaluntil(START);
+#ifdef DEBUG
+ dumpstack();
+#endif
+ if(nbra)
+ rcerror("unmatched left paren");
+ --andp; /* points to first and only operand */
+ pp->startinst = andp->first;
+#ifdef DEBUG
+ dump(pp);
+#endif
+ pp = optimize(pp);
+#ifdef DEBUG
+ print("start: %d\n", andp->first-pp->firstinst);
+ dump(pp);
+#endif
+out:
+ if(errors){
+ free(pp);
+ pp = 0;
+ }
+ return pp;
+}
+
+extern Reprog*
+regcomp(char *s)
+{
+ return regcomp1(s, 0, ANY);
+}
+
+extern Reprog*
+regcomplit(char *s)
+{
+ return regcomp1(s, 1, ANY);
+}
+
+extern Reprog*
+regcompnl(char *s)
+{
+ return regcomp1(s, 0, ANYNL);
+}
diff --git a/sys/src/libregexp/regcomp.h b/sys/src/libregexp/regcomp.h
new file mode 100755
index 000000000..402fe7d5c
--- /dev/null
+++ b/sys/src/libregexp/regcomp.h
@@ -0,0 +1,63 @@
+/*
+ * substitution list
+ */
+#define NSUBEXP 32
+typedef struct Resublist Resublist;
+struct Resublist
+{
+ Resub m[NSUBEXP];
+};
+
+/*
+ * Actions and Tokens (Reinst types)
+ *
+ * 02xx are operators, value == precedence
+ * 03xx are tokens, i.e. operands for operators
+ */
+#define RUNE 0177
+#define OPERATOR 0200 /* Bitmask of all operators */
+#define START 0200 /* Start, used for marker on stack */
+#define RBRA 0201 /* Right bracket, ) */
+#define LBRA 0202 /* Left bracket, ( */
+#define OR 0203 /* Alternation, | */
+#define CAT 0204 /* Concatentation, implicit operator */
+#define STAR 0205 /* Closure, * */
+#define PLUS 0206 /* a+ == aa* */
+#define QUEST 0207 /* a? == a|nothing, i.e. 0 or 1 a's */
+#define ANY 0300 /* Any character except newline, . */
+#define ANYNL 0301 /* Any character including newline, . */
+#define NOP 0302 /* No operation, internal use only */
+#define BOL 0303 /* Beginning of line, ^ */
+#define EOL 0304 /* End of line, $ */
+#define CCLASS 0305 /* Character class, [] */
+#define NCCLASS 0306 /* Negated character class, [] */
+#define END 0377 /* Terminate: match found */
+
+/*
+ * regexec execution lists
+ */
+#define LISTSIZE 10
+#define BIGLISTSIZE (25*LISTSIZE)
+typedef struct Relist Relist;
+struct Relist
+{
+ Reinst* inst; /* Reinstruction of the thread */
+ Resublist se; /* matched subexpressions in this thread */
+};
+typedef struct Reljunk Reljunk;
+struct Reljunk
+{
+ Relist* relist[2];
+ Relist* reliste[2];
+ int starttype;
+ Rune startchar;
+ char* starts;
+ char* eol;
+ Rune* rstarts;
+ Rune* reol;
+};
+
+extern Relist* _renewthread(Relist*, Reinst*, int, Resublist*);
+extern void _renewmatch(Resub*, int, Resublist*);
+extern Relist* _renewemptythread(Relist*, Reinst*, int, char*);
+extern Relist* _rrenewemptythread(Relist*, Reinst*, int, Rune*);
diff --git a/sys/src/libregexp/regerror.c b/sys/src/libregexp/regerror.c
new file mode 100755
index 000000000..92b73a0d9
--- /dev/null
+++ b/sys/src/libregexp/regerror.c
@@ -0,0 +1,15 @@
+#include <u.h>
+#include <libc.h>
+#include "regexp.h"
+
+void
+regerror(char *s)
+{
+ char buf[132];
+
+ strcpy(buf, "regerror: ");
+ strcat(buf, s);
+ strcat(buf, "\n");
+ write(2, buf, strlen(buf));
+ exits("regerr");
+}
diff --git a/sys/src/libregexp/regexec.c b/sys/src/libregexp/regexec.c
new file mode 100755
index 000000000..b806ba60a
--- /dev/null
+++ b/sys/src/libregexp/regexec.c
@@ -0,0 +1,232 @@
+#include <u.h>
+#include <libc.h>
+#include "regexp.h"
+#include "regcomp.h"
+
+
+/*
+ * return 0 if no match
+ * >0 if a match
+ * <0 if we ran out of _relist space
+ */
+static int
+regexec1(Reprog *progp, /* program to run */
+ char *bol, /* string to run machine on */
+ Resub *mp, /* subexpression elements */
+ int ms, /* number of elements at mp */
+ Reljunk *j
+)
+{
+ int flag=0;
+ Reinst *inst;
+ Relist *tlp;
+ char *s;
+ int i, checkstart;
+ Rune r, *rp, *ep;
+ int n;
+ Relist* tl; /* This list, next list */
+ Relist* nl;
+ Relist* tle; /* ends of this and next list */
+ Relist* nle;
+ int match;
+ char *p;
+
+ match = 0;
+ checkstart = j->starttype;
+ if(mp)
+ for(i=0; i<ms; i++) {
+ mp[i].sp = 0;
+ mp[i].ep = 0;
+ }
+ j->relist[0][0].inst = 0;
+ j->relist[1][0].inst = 0;
+
+ /* Execute machine once for each character, including terminal NUL */
+ s = j->starts;
+ do{
+ /* fast check for first char */
+ if(checkstart) {
+ switch(j->starttype) {
+ case RUNE:
+ p = utfrune(s, j->startchar);
+ if(p == 0 || s == j->eol)
+ return match;
+ s = p;
+ break;
+ case BOL:
+ if(s == bol)
+ break;
+ p = utfrune(s, '\n');
+ if(p == 0 || s == j->eol)
+ return match;
+ s = p+1;
+ break;
+ }
+ }
+ r = *(uchar*)s;
+ if(r < Runeself)
+ n = 1;
+ else
+ n = chartorune(&r, s);
+
+ /* switch run lists */
+ tl = j->relist[flag];
+ tle = j->reliste[flag];
+ nl = j->relist[flag^=1];
+ nle = j->reliste[flag];
+ nl->inst = 0;
+
+ /* Add first instruction to current list */
+ if(match == 0)
+ _renewemptythread(tl, progp->startinst, ms, s);
+
+ /* Execute machine until current list is empty */
+ for(tlp=tl; tlp->inst; tlp++){ /* assignment = */
+ for(inst = tlp->inst; ; inst = inst->next){
+ switch(inst->type){
+ case RUNE: /* regular character */
+ if(inst->r == r){
+ if(_renewthread(nl, inst->next, ms, &tlp->se)==nle)
+ return -1;
+ }
+ break;
+ case LBRA:
+ tlp->se.m[inst->subid].sp = s;
+ continue;
+ case RBRA:
+ tlp->se.m[inst->subid].ep = s;
+ continue;
+ case ANY:
+ if(r != '\n')
+ if(_renewthread(nl, inst->next, ms, &tlp->se)==nle)
+ return -1;
+ break;
+ case ANYNL:
+ if(_renewthread(nl, inst->next, ms, &tlp->se)==nle)
+ return -1;
+ break;
+ case BOL:
+ if(s == bol || *(s-1) == '\n')
+ continue;
+ break;
+ case EOL:
+ if(s == j->eol || r == 0 || r == '\n')
+ continue;
+ break;
+ case CCLASS:
+ ep = inst->cp->end;
+ for(rp = inst->cp->spans; rp < ep; rp += 2)
+ if(r >= rp[0] && r <= rp[1]){
+ if(_renewthread(nl, inst->next, ms, &tlp->se)==nle)
+ return -1;
+ break;
+ }
+ break;
+ case NCCLASS:
+ ep = inst->cp->end;
+ for(rp = inst->cp->spans; rp < ep; rp += 2)
+ if(r >= rp[0] && r <= rp[1])
+ break;
+ if(rp == ep)
+ if(_renewthread(nl, inst->next, ms, &tlp->se)==nle)
+ return -1;
+ break;
+ case OR:
+ /* evaluate right choice later */
+ if(_renewthread(tlp, inst->right, ms, &tlp->se) == tle)
+ return -1;
+ /* efficiency: advance and re-evaluate */
+ continue;
+ case END: /* Match! */
+ match = 1;
+ tlp->se.m[0].ep = s;
+ if(mp != 0)
+ _renewmatch(mp, ms, &tlp->se);
+ break;
+ }
+ break;
+ }
+ }
+ if(s == j->eol)
+ break;
+ checkstart = j->starttype && nl->inst==0;
+ s += n;
+ }while(r);
+ return match;
+}
+
+static int
+regexec2(Reprog *progp, /* program to run */
+ char *bol, /* string to run machine on */
+ Resub *mp, /* subexpression elements */
+ int ms, /* number of elements at mp */
+ Reljunk *j
+)
+{
+ int rv;
+ Relist *relist0, *relist1;
+
+ /* mark space */
+ relist0 = malloc(BIGLISTSIZE*sizeof(Relist));
+ if(relist0 == nil)
+ return -1;
+ relist1 = malloc(BIGLISTSIZE*sizeof(Relist));
+ if(relist1 == nil){
+ free(relist1);
+ return -1;
+ }
+ j->relist[0] = relist0;
+ j->relist[1] = relist1;
+ j->reliste[0] = relist0 + BIGLISTSIZE - 2;
+ j->reliste[1] = relist1 + BIGLISTSIZE - 2;
+
+ rv = regexec1(progp, bol, mp, ms, j);
+ free(relist0);
+ free(relist1);
+ return rv;
+}
+
+extern int
+regexec(Reprog *progp, /* program to run */
+ char *bol, /* string to run machine on */
+ Resub *mp, /* subexpression elements */
+ int ms) /* number of elements at mp */
+{
+ Reljunk j;
+ Relist relist0[LISTSIZE], relist1[LISTSIZE];
+ int rv;
+
+ /*
+ * use user-specified starting/ending location if specified
+ */
+ j.starts = bol;
+ j.eol = 0;
+ if(mp && ms>0){
+ if(mp->sp)
+ j.starts = mp->sp;
+ if(mp->ep)
+ j.eol = mp->ep;
+ }
+ j.starttype = 0;
+ j.startchar = 0;
+ if(progp->startinst->type == RUNE && progp->startinst->r < Runeself) {
+ j.starttype = RUNE;
+ j.startchar = progp->startinst->r;
+ }
+ if(progp->startinst->type == BOL)
+ j.starttype = BOL;
+
+ /* mark space */
+ j.relist[0] = relist0;
+ j.relist[1] = relist1;
+ j.reliste[0] = relist0 + nelem(relist0) - 2;
+ j.reliste[1] = relist1 + nelem(relist1) - 2;
+
+ rv = regexec1(progp, bol, mp, ms, &j);
+ if(rv >= 0)
+ return rv;
+ rv = regexec2(progp, bol, mp, ms, &j);
+ if(rv >= 0)
+ return rv;
+ return -1;
+}
diff --git a/sys/src/libregexp/regsub.c b/sys/src/libregexp/regsub.c
new file mode 100755
index 000000000..2474d2989
--- /dev/null
+++ b/sys/src/libregexp/regsub.c
@@ -0,0 +1,63 @@
+#include <u.h>
+#include <libc.h>
+#include "regexp.h"
+
+/* substitute into one string using the matches from the last regexec() */
+extern void
+regsub(char *sp, /* source string */
+ char *dp, /* destination string */
+ int dlen,
+ Resub *mp, /* subexpression elements */
+ int ms) /* number of elements pointed to by mp */
+{
+ char *ssp, *ep;
+ int i;
+
+ ep = dp+dlen-1;
+ while(*sp != '\0'){
+ if(*sp == '\\'){
+ switch(*++sp){
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ i = *sp-'0';
+ if(mp!=0 && mp[i].sp != 0 && ms>i)
+ for(ssp = mp[i].sp;
+ ssp < mp[i].ep;
+ ssp++)
+ if(dp < ep)
+ *dp++ = *ssp;
+ break;
+ case '\\':
+ if(dp < ep)
+ *dp++ = '\\';
+ break;
+ case '\0':
+ sp--;
+ break;
+ default:
+ if(dp < ep)
+ *dp++ = *sp;
+ break;
+ }
+ }else if(*sp == '&'){
+ if(mp!=0 && mp[0].sp != 0 && ms>0)
+ for(ssp = mp[0].sp;
+ ssp < mp[0].ep; ssp++)
+ if(dp < ep)
+ *dp++ = *ssp;
+ }else{
+ if(dp < ep)
+ *dp++ = *sp;
+ }
+ sp++;
+ }
+ *dp = '\0';
+}
diff --git a/sys/src/libregexp/rregexec.c b/sys/src/libregexp/rregexec.c
new file mode 100755
index 000000000..301a3589b
--- /dev/null
+++ b/sys/src/libregexp/rregexec.c
@@ -0,0 +1,212 @@
+#include <u.h>
+#include <libc.h>
+#include "regexp.h"
+#include "regcomp.h"
+
+/*
+ * return 0 if no match
+ * >0 if a match
+ * <0 if we ran out of _relist space
+ */
+static int
+rregexec1(Reprog *progp, /* program to run */
+ Rune *bol, /* string to run machine on */
+ Resub *mp, /* subexpression elements */
+ int ms, /* number of elements at mp */
+ Reljunk *j)
+{
+ int flag=0;
+ Reinst *inst;
+ Relist *tlp;
+ Rune *s;
+ int i, checkstart;
+ Rune r, *rp, *ep;
+ Relist* tl; /* This list, next list */
+ Relist* nl;
+ Relist* tle; /* ends of this and next list */
+ Relist* nle;
+ int match;
+ Rune *p;
+
+ match = 0;
+ checkstart = j->startchar;
+ if(mp)
+ for(i=0; i<ms; i++) {
+ mp[i].rsp = 0;
+ mp[i].rep = 0;
+ }
+ j->relist[0][0].inst = 0;
+ j->relist[1][0].inst = 0;
+
+ /* Execute machine once for each character, including terminal NUL */
+ s = j->rstarts;
+ do{
+ /* fast check for first char */
+ if(checkstart) {
+ switch(j->starttype) {
+ case RUNE:
+ p = runestrchr(s, j->startchar);
+ if(p == 0 || s == j->reol)
+ return match;
+ s = p;
+ break;
+ case BOL:
+ if(s == bol)
+ break;
+ p = runestrchr(s, '\n');
+ if(p == 0 || s == j->reol)
+ return match;
+ s = p+1;
+ break;
+ }
+ }
+
+ r = *s;
+
+ /* switch run lists */
+ tl = j->relist[flag];
+ tle = j->reliste[flag];
+ nl = j->relist[flag^=1];
+ nle = j->reliste[flag];
+ nl->inst = 0;
+
+ /* Add first instruction to current list */
+ _rrenewemptythread(tl, progp->startinst, ms, s);
+
+ /* Execute machine until current list is empty */
+ for(tlp=tl; tlp->inst; tlp++){
+ for(inst=tlp->inst; ; inst = inst->next){
+ switch(inst->type){
+ case RUNE: /* regular character */
+ if(inst->r == r)
+ if(_renewthread(nl, inst->next, ms, &tlp->se)==nle)
+ return -1;
+ break;
+ case LBRA:
+ tlp->se.m[inst->subid].rsp = s;
+ continue;
+ case RBRA:
+ tlp->se.m[inst->subid].rep = s;
+ continue;
+ case ANY:
+ if(r != '\n')
+ if(_renewthread(nl, inst->next, ms, &tlp->se)==nle)
+ return -1;
+ break;
+ case ANYNL:
+ if(_renewthread(nl, inst->next, ms, &tlp->se)==nle)
+ return -1;
+ break;
+ case BOL:
+ if(s == bol || *(s-1) == '\n')
+ continue;
+ break;
+ case EOL:
+ if(s == j->reol || r == 0 || r == '\n')
+ continue;
+ break;
+ case CCLASS:
+ ep = inst->cp->end;
+ for(rp = inst->cp->spans; rp < ep; rp += 2)
+ if(r >= rp[0] && r <= rp[1]){
+ if(_renewthread(nl, inst->next, ms, &tlp->se)==nle)
+ return -1;
+ break;
+ }
+ break;
+ case NCCLASS:
+ ep = inst->cp->end;
+ for(rp = inst->cp->spans; rp < ep; rp += 2)
+ if(r >= rp[0] && r <= rp[1])
+ break;
+ if(rp == ep)
+ if(_renewthread(nl, inst->next, ms, &tlp->se)==nle)
+ return -1;
+ break;
+ case OR:
+ /* evaluate right choice later */
+ if(_renewthread(tlp, inst->right, ms, &tlp->se) == tle)
+ return -1;
+ /* efficiency: advance and re-evaluate */
+ continue;
+ case END: /* Match! */
+ match = 1;
+ tlp->se.m[0].rep = s;
+ if(mp != 0)
+ _renewmatch(mp, ms, &tlp->se);
+ break;
+ }
+ break;
+ }
+ }
+ if(s == j->reol)
+ break;
+ checkstart = j->startchar && nl->inst==0;
+ s++;
+ }while(r);
+ return match;
+}
+
+static int
+rregexec2(Reprog *progp, /* program to run */
+ Rune *bol, /* string to run machine on */
+ Resub *mp, /* subexpression elements */
+ int ms, /* number of elements at mp */
+ Reljunk *j
+)
+{
+ Relist relist0[5*LISTSIZE], relist1[5*LISTSIZE];
+
+ /* mark space */
+ j->relist[0] = relist0;
+ j->relist[1] = relist1;
+ j->reliste[0] = relist0 + nelem(relist0) - 2;
+ j->reliste[1] = relist1 + nelem(relist1) - 2;
+
+ return rregexec1(progp, bol, mp, ms, j);
+}
+
+extern int
+rregexec(Reprog *progp, /* program to run */
+ Rune *bol, /* string to run machine on */
+ Resub *mp, /* subexpression elements */
+ int ms) /* number of elements at mp */
+{
+ Reljunk j;
+ Relist relist0[LISTSIZE], relist1[LISTSIZE];
+ int rv;
+
+ /*
+ * use user-specified starting/ending location if specified
+ */
+ j.rstarts = bol;
+ j.reol = 0;
+ if(mp && ms>0){
+ if(mp->sp)
+ j.rstarts = mp->rsp;
+ if(mp->ep)
+ j.reol = mp->rep;
+ }
+ j.starttype = 0;
+ j.startchar = 0;
+ if(progp->startinst->type == RUNE && progp->startinst->r < Runeself) {
+ j.starttype = RUNE;
+ j.startchar = progp->startinst->r;
+ }
+ if(progp->startinst->type == BOL)
+ j.starttype = BOL;
+
+ /* mark space */
+ j.relist[0] = relist0;
+ j.relist[1] = relist1;
+ j.reliste[0] = relist0 + nelem(relist0) - 2;
+ j.reliste[1] = relist1 + nelem(relist1) - 2;
+
+ rv = rregexec1(progp, bol, mp, ms, &j);
+ if(rv >= 0)
+ return rv;
+ rv = rregexec2(progp, bol, mp, ms, &j);
+ if(rv >= 0)
+ return rv;
+ return -1;
+}
diff --git a/sys/src/libregexp/rregsub.c b/sys/src/libregexp/rregsub.c
new file mode 100755
index 000000000..c395583d0
--- /dev/null
+++ b/sys/src/libregexp/rregsub.c
@@ -0,0 +1,64 @@
+#include <u.h>
+#include <libc.h>
+#include "regexp.h"
+
+/* substitute into one string using the matches from the last regexec() */
+extern void
+rregsub(Rune *sp, /* source string */
+ Rune *dp, /* destination string */
+ int dlen,
+ Resub *mp, /* subexpression elements */
+ int ms) /* number of elements pointed to by mp */
+{
+ Rune *ssp, *ep;
+ int i;
+
+ ep = dp+(dlen/sizeof(Rune))-1;
+ while(*sp != '\0'){
+ if(*sp == '\\'){
+ switch(*++sp){
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ i = *sp-'0';
+ if(mp[i].rsp != 0 && mp!=0 && ms>i)
+ for(ssp = mp[i].rsp;
+ ssp < mp[i].rep;
+ ssp++)
+ if(dp < ep)
+ *dp++ = *ssp;
+ break;
+ case '\\':
+ if(dp < ep)
+ *dp++ = '\\';
+ break;
+ case '\0':
+ sp--;
+ break;
+ default:
+ if(dp < ep)
+ *dp++ = *sp;
+ break;
+ }
+ }else if(*sp == '&'){
+ if(mp[0].rsp != 0 && mp!=0 && ms>0)
+ if(mp[0].rsp != 0)
+ for(ssp = mp[0].rsp;
+ ssp < mp[0].rep; ssp++)
+ if(dp < ep)
+ *dp++ = *ssp;
+ }else{
+ if(dp < ep)
+ *dp++ = *sp;
+ }
+ sp++;
+ }
+ *dp = '\0';
+}