summaryrefslogtreecommitdiff
path: root/sys/src/libregexp
diff options
context:
space:
mode:
authorben <ben@rana>2016-04-26 22:23:44 -0500
committerben <ben@rana>2016-04-26 22:23:44 -0500
commit0a460e1722c50e31653359f8a86fe0b606d2b513 (patch)
treeb5a00bbfc883aa98709db012e0a7bacc67e234af /sys/src/libregexp
parent651d6c2bc68e7e5224c3ba41b094e37b1c1890ed (diff)
New libregexp and APE ported to native
Diffstat (limited to 'sys/src/libregexp')
-rw-r--r--sys/src/libregexp/mkfile12
-rw-r--r--sys/src/libregexp/regcomp.c971
-rw-r--r--sys/src/libregexp/regerror.c2
-rw-r--r--sys/src/libregexp/regexec.c390
-rw-r--r--sys/src/libregexp/regimpl.h104
-rw-r--r--sys/src/libregexp/regprint.c66
-rw-r--r--sys/src/libregexp/regsub.c109
-rw-r--r--sys/src/libregexp/rregexec.c369
-rw-r--r--sys/src/libregexp/rregsub.c110
9 files changed, 1128 insertions, 1005 deletions
diff --git a/sys/src/libregexp/mkfile b/sys/src/libregexp/mkfile
index dcc457028..f724310ae 100644
--- a/sys/src/libregexp/mkfile
+++ b/sys/src/libregexp/mkfile
@@ -6,12 +6,12 @@ OFILES=\
regerror.$O\
regexec.$O\
regsub.$O\
- regaux.$O\
rregexec.$O\
rregsub.$O\
+ regprint.$O\
HFILES=/sys/include/regexp.h\
- regcomp.h\
+ regimpl.h\
UPDATE=\
mkfile\
@@ -21,8 +21,8 @@ UPDATE=\
</sys/src/cmd/mksyslib
-test: test.$O $OFILES
- $LD -o test $prereq
+$O.regextest: tests/regextest.$O $LIB
+ $LD -o $target regextest.$O
-test2: test2.$O $OFILES
- $LD -o test2 $prereq
+$O.sysregextest: tests/sysregextest.$O
+ $LD -o $target sysregextest.$O
diff --git a/sys/src/libregexp/regcomp.c b/sys/src/libregexp/regcomp.c
index e48ef15ac..58bf864d7 100644
--- a/sys/src/libregexp/regcomp.c
+++ b/sys/src/libregexp/regcomp.c
@@ -1,567 +1,580 @@
#include <u.h>
#include <libc.h>
-#include "regexp.h"
-#include "regcomp.h"
-
-#define TRUE 1
-#define FALSE 0
-
-/*
- * Parser Information
- */
-typedef
-struct Node
-{
- Reinst* first;
- Reinst* last;
-}Node;
-
-/* max character classes per program is nelem(reprog->class) */
-static Reprog *reprog;
-
-/* max rune ranges per character class is nelem(classp->spans)/2 */
-#define NCCRUNE nelem(classp->spans)
-
-#define NSTACK 20
-static Node andstack[NSTACK];
-static Node *andp;
-static int atorstack[NSTACK];
-static int* atorp;
-static int cursubid; /* id of current subexpression */
-static int subidstack[NSTACK]; /* parallel to atorstack */
-static int* subidp;
-static int lastwasand; /* Last token was operand */
-static int nbra;
-static char* exprp; /* pointer to next character in source expression */
-static int lexdone;
-static int nclass;
-static Reclass*classp;
-static Reinst* freep;
-static int errors;
-static Rune yyrune; /* last lex'd rune */
-static Reclass*yyclassp; /* last lex'd class */
-
-/* predeclared crap */
-static void operator(int);
-static void pushand(Reinst*, Reinst*);
-static void pushator(int);
-static void evaluntil(int);
-static int bldcclass(void);
-
-static jmp_buf regkaboom;
-
-static void
-rcerror(char *s)
+#include <regexp.h>
+#include "regimpl.h"
+
+int instrcnt[] = {
+ [TANY] 2,
+ [TBOL] 1,
+ [TCAT] 0,
+ [TCLASS] 1,
+ [TEOL] 1,
+ [TNOTNL] 1,
+ [TOR] 2,
+ [TPLUS] 1,
+ [TQUES] 1,
+ [TRUNE] 1,
+ [TSTAR] 2,
+ [TSUB] 2
+};
+
+static Renode*
+node(Parselex *plex, int op, Renode *l, Renode *r)
{
- errors++;
- regerror(s);
- longjmp(regkaboom, 1);
+ Renode *n;
+
+ plex->instrs += instrcnt[op];
+ n = plex->next++;
+ n->op = op;
+ n->left = l;
+ n->right = r;
+ return n;
}
-static Reinst*
-newinst(int t)
+static Renode*
+e3(Parselex *plex)
{
- freep->type = t;
- freep->left = 0;
- freep->right = 0;
- return freep++;
+ char error[128];
+ Renode *n;
+
+ switch(lex(plex)) {
+ case LANY:
+ return node(plex, TANY, nil, nil);
+ case LBOL:
+ return node(plex, TBOL, nil, nil);
+ case LEOL:
+ return node(plex, TEOL, nil, nil);
+ case LRUNE:
+ n = node(plex, TRUNE, nil, nil);
+ n->r = plex->rune;
+ return n;
+ case LCLASS:
+ if(plex->nc)
+ return buildclassn(plex);
+ return buildclass(plex);
+ case LLPAR:
+ n = e0(plex);
+ n = node(plex, TSUB, n, nil);
+ if(lex(plex) != LRPAR) {
+ snprint(error, sizeof(error), "regexp %s: no matching parenthesis", plex->orig);
+ regerror(error);
+ longjmp(plex->exitenv, 1);
+ }
+ return n;
+ default:
+ if(plex->rune)
+ snprint(error, sizeof(error), "regexp %s: syntax error: %C", plex->orig, plex->rune);
+ else
+ snprint(error, sizeof(error), "regexp %s: parsing error", plex->orig);
+ regerror(error);
+ longjmp(plex->exitenv, 1);
+ }
+ return nil;
}
-static void
-operand(int t)
+static Renode*
+e2(Parselex *plex)
{
- Reinst *i;
-
- if(lastwasand)
- operator(CAT); /* catenate is implicit */
- i = newinst(t);
-
- if(t == CCLASS || t == NCCLASS)
- i->cp = yyclassp;
- if(t == RUNE)
- i->r = yyrune;
-
- pushand(i, i);
- lastwasand = TRUE;
+ Renode *n;
+
+ n = e3(plex);
+ if(lex(plex) == LREP) {
+ switch(plex->rune) {
+ case L'*':
+ return node(plex, TSTAR, n, nil);
+ case L'+':
+ return node(plex, TPLUS, n, nil);
+ case L'?':
+ return node(plex, TQUES, n, nil);
+ }
+ }
+ plex->peek = 1;
+ return n;
}
-static void
-operator(int t)
+static Renode*
+invert(Renode *n)
{
- if(t==RBRA && --nbra<0)
- rcerror("unmatched right paren");
- if(t==LBRA){
- if(++cursubid >= NSUBEXP)
- rcerror ("too many subexpressions");
- nbra++;
- if(lastwasand)
- operator(CAT);
- } else
- evaluntil(t);
- if(t != RBRA)
- pushator(t);
- lastwasand = FALSE;
- if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
- lastwasand = TRUE; /* these look like operands */
+ Renode *n1;
+
+ if(n->op != TCAT)
+ return n;
+ while(n->left->op == TCAT) {
+ n1 = n->left;
+ n->left = n1->right;
+ n1->right = n;
+ n = n1;
+ }
+ return n;
}
-static void
-regerr2(char *s, int c)
+static Renode*
+e1(Parselex *plex)
{
- char buf[100];
- char *cp = buf;
- while(*s)
- *cp++ = *s++;
- *cp++ = c;
- *cp = '\0';
- rcerror(buf);
+ Renode *n;
+ int sym;
+
+ n = e2(plex);
+ for(;;) {
+ sym = lex(plex);
+ if(sym == LEND || sym == LOR || sym == LRPAR)
+ break;
+ plex->peek = 1;
+ n = node(plex, TCAT, n, e2(plex));
+ }
+ plex->peek = 1;
+ return invert(n);
}
-static void
-cant(char *s)
+static Renode*
+e0(Parselex *plex)
{
- char buf[100];
- strcpy(buf, "can't happen: ");
- strcat(buf, s);
- rcerror(buf);
+ Renode *n;
+
+ n = e1(plex);
+ for(;;) {
+ if(lex(plex) != LOR)
+ break;
+ n = node(plex, TOR, n, e1(plex));
+ }
+ plex->peek = 1;
+ return n;
}
-static void
-pushand(Reinst *f, Reinst *l)
+static Parselex*
+initplex(Parselex *plex, char *regstr, int lit)
{
- if(andp >= &andstack[NSTACK])
- cant("operand stack overflow");
- andp->first = f;
- andp->last = l;
- andp++;
+ plex->getnextr = lit ? getnextrlit : getnextr;
+ plex->rawexp = plex->orig = regstr;
+ plex->sub = 0;
+ plex->instrs = 0;
+ plex->peek = 0;
+ plex->done = 0;
+ return plex;
}
-static void
-pushator(int t)
+static int
+maxthreads(Renode *tree)
{
- if(atorp >= &atorstack[NSTACK])
- cant("operator stack overflow");
- *atorp++ = t;
- *subidp++ = cursubid;
+ tree = tree->left;
+ if(tree->op == TCAT)
+ tree = tree->left;
+ if(tree->op == TBOL)
+ return 2;
+ return -1;
}
-static Node*
-popand(int op)
+static Reprog*
+regcomp1(char *regstr, int nl, int lit)
{
- Reinst *inst;
-
- if(andp <= &andstack[0]){
- regerr2("missing operand for ", op);
- inst = newinst(NOP);
- pushand(inst,inst);
+ Reprog *reprog;
+ Parselex plex;
+ Renode *parsetr;
+ int regstrlen, maxthr;
+
+ regstrlen = utflen(regstr);
+ initplex(&plex, regstr, lit);
+ plex.nodes = calloc(sizeof(*plex.nodes), regstrlen*2);
+ if(plex.nodes == nil)
+ return nil;
+ plex.next = plex.nodes;
+
+ if(setjmp(plex.exitenv) != 0) {
+ free(plex.nodes);
+ return nil;
}
- return --andp;
+
+ parsetr = node(&plex, TSUB, e0(&plex), nil);
+ maxthr = maxthreads(parsetr);
+ if(maxthr == -1)
+ maxthr = regstrlen;
+
+ prtree(parsetr, 0, 1);
+ reprog = malloc(sizeof(Reprog) +
+ sizeof(Reinst) * plex.instrs +
+ sizeof(Rethread) * maxthr +
+ sizeof(Rethread*) * maxthr);
+ reprog->len = plex.instrs;
+ reprog->nthr = maxthr;
+ reprog->startinst = compile(parsetr, reprog, nl);
+ reprog->threads = (Rethread*)(reprog->startinst + reprog->len);
+ reprog->thrpool = (Rethread**)(reprog->threads + reprog->nthr);
+ reprog->regstr = regstr;
+
+ free(plex.nodes);
+ return reprog;
}
-static int
-popator(void)
+Reprog*
+regcomp(char *str)
{
- if(atorp <= &atorstack[0])
- cant("operator stack underflow");
- --subidp;
- return *--atorp;
+ return regcomp1(str, 0, 0);
}
-static void
-evaluntil(int pri)
+Reprog*
+regcomplit(char *str)
{
- Node *op1, *op2;
- Reinst *inst1, *inst2;
-
- while(pri==RBRA || atorp[-1]>=pri){
- switch(popator()){
- default:
- rcerror("unknown operator in evaluntil");
- break;
- case LBRA: /* must have been RBRA */
- op1 = popand('(');
- inst2 = newinst(RBRA);
- inst2->subid = *subidp;
- op1->last->next = inst2;
- inst1 = newinst(LBRA);
- inst1->subid = *subidp;
- inst1->next = op1->first;
- pushand(inst1, inst2);
- return;
- case OR:
- op2 = popand('|');
- op1 = popand('|');
- inst2 = newinst(NOP);
- op2->last->next = inst2;
- op1->last->next = inst2;
- inst1 = newinst(OR);
- inst1->right = op1->first;
- inst1->left = op2->first;
- pushand(inst1, inst2);
- break;
- case CAT:
- op2 = popand(0);
- op1 = popand(0);
- op1->last->next = op2->first;
- pushand(op1->first, op2->last);
- break;
- case STAR:
- op2 = popand('*');
- inst1 = newinst(OR);
- op2->last->next = inst1;
- inst1->right = op2->first;
- pushand(inst1, inst1);
- break;
- case PLUS:
- op2 = popand('+');
- inst1 = newinst(OR);
- op2->last->next = inst1;
- inst1->right = op2->first;
- pushand(op2->first, inst1);
- break;
- case QUEST:
- op2 = popand('?');
- inst1 = newinst(OR);
- inst2 = newinst(NOP);
- inst1->left = inst2;
- inst1->right = op2->first;
- op2->last->next = inst2;
- pushand(inst1, inst2);
- break;
- }
- }
+ return regcomp1(str, 0, 1);
}
-static Reprog*
-optimize(Reprog *pp)
+Reprog*
+regcompnl(char *str)
{
- Reinst *inst, *target;
- int size;
- Reprog *npp;
- Reclass *cl;
- int diff;
-
- /*
- * get rid of NOOP chains
- */
- for(inst=pp->firstinst; inst->type!=END; inst++){
- target = inst->next;
- while(target->type == NOP)
- target = target->next;
- inst->next = target;
- }
-
- /*
- * The original allocation is for an area larger than
- * necessary. Reallocate to the actual space used
- * and then relocate the code.
- */
- size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);
- npp = realloc(pp, size);
- if(npp==0 || npp==pp)
- return pp;
- diff = (char *)npp - (char *)pp;
- freep = (Reinst *)((char *)freep + diff);
- for(inst=npp->firstinst; inst<freep; inst++){
- switch(inst->type){
- case OR:
- case STAR:
- case PLUS:
- case QUEST:
- *(char **)&inst->right += diff;
- break;
- case CCLASS:
- case NCCLASS:
- *(char **)&inst->right += diff;
- cl = inst->cp;
- *(char **)&cl->end += diff;
- break;
- }
- *(char **)&inst->left += diff;
- }
- *(char **)&npp->startinst += diff;
- return npp;
+ return regcomp1(str, 1, 0);
}
-#ifdef DEBUG
-static void
-dumpstack(void){
- Node *stk;
- int *ip;
-
- print("operators\n");
- for(ip=atorstack; ip<atorp; ip++)
- print("0%o\n", *ip);
- print("operands\n");
- for(stk=andstack; stk<andp; stk++)
- print("0%o\t0%o\n", stk->first->type, stk->last->type);
+static Reinst*
+compile1(Renode *renode, Reinst *reinst, int *sub, int nl)
+{
+ Reinst *i;
+ int s;
+
+Tailcall:
+ if(renode == nil)
+ return reinst;
+ switch(renode->op) {
+ case TCLASS:
+ reinst->op = OCLASS;
+ reinst->r = renode->r;
+ reinst->r1 = renode->r1;
+ reinst->a = reinst + 1 + renode->nclass;
+ renode = renode->left;
+ reinst++;
+ goto Tailcall;
+ case TCAT:
+ reinst = compile1(renode->left, reinst, sub, nl);
+ renode = renode->right;
+ goto Tailcall;
+ case TOR:
+ reinst->op = OSPLIT;
+ reinst->a = reinst + 1;
+ i = compile1(renode->left, reinst->a, sub, nl);
+ reinst->b = i + 1;
+ i->op = OJMP;
+ i->a = compile1(renode->right, reinst->b, sub, nl);
+ return i->a;
+ case TSTAR:
+ reinst->op = OSPLIT;
+ reinst->a = reinst + 1;
+ i = compile1(renode->left, reinst->a, sub, nl);
+ reinst->b = i + 1;
+ i->op = OJMP;
+ i->a = reinst;
+ return reinst->b;
+ case TPLUS:
+ i = reinst;
+ reinst = compile1(renode->left, reinst, sub, nl);
+ reinst->op = OSPLIT;
+ reinst->a = i;
+ reinst->b = reinst + 1;
+ return reinst->b;
+ case TQUES:
+ reinst->op = OSPLIT;
+ reinst->a = reinst + 1;
+ reinst->b = compile1(renode->left, reinst->a, sub, nl);
+ return reinst->b;
+ case TSUB:
+ reinst->op = OSAVE;
+ reinst->sub = s = (*sub)++;
+ reinst = compile1(renode->left, reinst+1, sub, nl);
+ reinst->op = OUNSAVE;
+ reinst->sub = s;
+ return reinst + 1;
+ case TANY:
+ if(nl == 0)
+ reinst++->op = ONOTNL;
+ reinst->op = OANY;
+ return reinst + 1;
+ case TRUNE:
+ reinst->op = ORUNE;
+ reinst->r = renode->r;
+ return reinst + 1;
+ case TNOTNL:
+ reinst->op = ONOTNL;
+ return reinst + 1;
+ case TEOL:
+ reinst->op = OEOL;
+ return reinst + 1;
+ case TBOL:
+ reinst->op = OBOL;
+ return reinst + 1;
+ }
+ return nil;
}
-static void
-dump(Reprog *pp)
+static Reinst*
+compile(Renode *parsetr, Reprog *reprog, int nl)
{
- Reinst *l;
- Rune *p;
+ Reinst *reinst;
+ int sub;
- l = pp->firstinst;
- do{
- print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
- l->left-pp->firstinst, l->right-pp->firstinst);
- if(l->type == RUNE)
- print("\t%C\n", l->r);
- else if(l->type == CCLASS || l->type == NCCLASS){
- print("\t[");
- if(l->type == NCCLASS)
- print("^");
- for(p = l->cp->spans; p < l->cp->end; p += 2)
- if(p[0] == p[1])
- print("%C", p[0]);
- else
- print("%C-%C", p[0], p[1]);
- print("]\n");
- } else
- print("\n");
- }while(l++->type);
+ sub = 0;
+ reinst = (Reinst*)(reprog+1);
+ compile1(parsetr, reinst, &sub, nl);
+ return reinst;
}
-#endif
-static Reclass*
-newclass(void)
+static void
+getnextr(Parselex *l)
{
- if(nclass >= nelem(reprog->class))
- rcerror("too many character classes; increase Reprog.class size");
- return &(classp[nclass++]);
+ l->literal = 0;
+ if(l->done) {
+ l->rune = 0;
+ return;
+ }
+ l->rawexp += chartorune(&l->rune, l->rawexp);
+ if(l->rune == L'\\') {
+ l->rawexp += chartorune(&l->rune, l->rawexp);
+ l->literal = 1;
+ }
+ if(*l->rawexp == 0)
+ l->done = 1;
+ return;
}
-static int
-nextc(Rune *rp)
+static void
+getnextrlit(Parselex *l)
{
- if(lexdone){
- *rp = 0;
- return 1;
- }
- exprp += chartorune(rp, exprp);
- if(*rp == L'\\'){
- exprp += chartorune(rp, exprp);
- return 1;
+ l->literal = 1;
+ if(l->done) {
+ l->literal = 0;
+ l->rune = 0;
+ return;
}
- if(*rp == 0)
- lexdone = 1;
- return 0;
+ l->rawexp += chartorune(&l->rune, l->rawexp);
+ if(*l->rawexp == 0)
+ l->done = 1;
+ return;
}
-static int
-lex(int literal, int dot_type)
+static int
+lex(Parselex *l)
{
- int quoted;
-
- quoted = nextc(&yyrune);
- if(literal || quoted){
- if(yyrune == 0)
- return END;
- return RUNE;
+ if(l->peek) {
+ l->peek = 0;
+ return l->peeklex;
}
-
- switch(yyrune){
+ l->getnextr(l);
+ if(l->literal)
+ return l->peeklex = LRUNE;
+ switch(l->rune){
case 0:
- return END;
+ return l->peeklex = LEND;
case L'*':
- return STAR;
case L'?':
- return QUEST;
case L'+':
- return PLUS;
+ return l->peeklex = LREP;
case L'|':
- return OR;
+ return l->peeklex = LOR;
case L'.':
- return dot_type;
+ return l->peeklex = LANY;
case L'(':
- return LBRA;
+ return l->peeklex = LLPAR;
case L')':
- return RBRA;
+ return l->peeklex = LRPAR;
case L'^':
- return BOL;
+ return l->peeklex = LBOL;
case L'$':
- return EOL;
+ return l->peeklex = LEOL;
case L'[':
- return bldcclass();
+ getclass(l);
+ return l->peeklex = LCLASS;
}
- return RUNE;
+ return l->peeklex = LRUNE;
}
static int
-bldcclass(void)
+pcmp(void *va, void *vb)
{
- int type;
- Rune r[NCCRUNE];
- Rune *p, *ep, *np;
- Rune rune;
- int quoted;
-
- /* we have already seen the '[' */
- type = CCLASS;
- yyclassp = newclass();
-
- /* look ahead for negation */
- /* SPECIAL CASE!!! negated classes don't match \n */
- ep = r;
- quoted = nextc(&rune);
- if(!quoted && rune == L'^'){
- type = NCCLASS;
- quoted = nextc(&rune);
- *ep++ = L'\n';
- *ep++ = L'\n';
- }
-
- /* parse class into a set of spans */
- while(ep < &r[NCCRUNE-1]){
- if(rune == 0){
- rcerror("malformed '[]'");
- return 0;
- }
- if(!quoted && rune == L']')
- break;
- if(!quoted && rune == L'-'){
- if(ep == r){
- rcerror("malformed '[]'");
- return 0;
- }
- quoted = nextc(&rune);
- if((!quoted && rune == L']') || rune == 0){
- rcerror("malformed '[]'");
- return 0;
- }
- *(ep-1) = rune;
- } else {
- *ep++ = rune;
- *ep++ = rune;
- }
- quoted = nextc(&rune);
- }
- if(ep >= &r[NCCRUNE-1]) {
- rcerror("char class too large; increase Reclass.spans size");
- return 0;
- }
-
- /* sort on span start */
- for(p = r; p < ep; p += 2){
- for(np = p; np < ep; np += 2)
- if(*np < *p){
- rune = np[0];
- np[0] = p[0];
- p[0] = rune;
- rune = np[1];
- np[1] = p[1];
- p[1] = rune;
- }
- }
+ vlong n;
+ Rune *a, *b;
- /* merge spans */
- np = yyclassp->spans;
- p = r;
- if(r == ep)
- yyclassp->end = np;
- else {
- np[0] = *p++;
- np[1] = *p++;
- for(; p < ep; p += 2)
- /* overlapping or adjacent ranges? */
- if(p[0] <= np[1] + 1){
- if(p[1] >= np[1])
- np[1] = p[1]; /* coalesce */
- } else {
- np += 2;
- np[0] = p[0];
- np[1] = p[1];
- }
- yyclassp->end = np+2;
- }
+ a = va;
+ b = vb;
- return type;
+ n = (vlong)b[0] - (vlong)a[0];
+ if(n)
+ return n;
+ return (vlong)b[1] - (vlong)a[1];
}
-static Reprog*
-regcomp1(char *s, int literal, int dot_type)
+static void
+getclass(Parselex *l)
{
- int token;
- Reprog *pp;
-
- /* get memory for the program */
- pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
- if(pp == 0){
- regerror("out of memory");
- return 0;
+ Rune *p, *q, t;
+
+ l->nc = 0;
+ getnextrlit(l);
+ if(l->rune == L'^') {
+ l->nc = 1;
+ getnextrlit(l);
}
- freep = pp->firstinst;
- classp = pp->class;
- errors = 0;
-
- if(setjmp(regkaboom))
- goto out;
-
- /* go compile the sucker */
- lexdone = 0;
- exprp = s;
- nclass = 0;
- nbra = 0;
- atorp = atorstack;
- andp = andstack;
- subidp = subidstack;
- lastwasand = FALSE;
- cursubid = 0;
-
- /* Start with a low priority operator to prime parser */
- pushator(START-1);
- while((token = lex(literal, dot_type)) != END){
- if((token&0300) == OPERATOR)
- operator(token);
- else
- operand(token);
+ p = l->cpairs;
+ for(;;) {
+ *p = l->rune;
+ if(l->rune == L']')
+ break;
+ if(l->rune == L'-') {
+ regerror("Malformed class");
+ longjmp(l->exitenv, 1);
+ }
+ if(l->rune == '\\') {
+ getnextrlit(l);
+ *p = l->rune;
+ }
+ if(l->rune == 0) {
+ regerror("No closing ] for class");
+ longjmp(l->exitenv, 1);
+ }
+ getnextrlit(l);
+ if(l->rune == L'-') {
+ getnextrlit(l);
+ if(l->rune == L']') {
+ regerror("Malformed class");
+ longjmp(l->exitenv, 1);
+ }
+ if(l->rune == L'-') {
+ regerror("Malformed class");
+ longjmp(l->exitenv, 1);
+ }
+ if(l->rune == L'\\')
+ getnextrlit(l);
+ p[1] = l->rune;
+ if(p[0] > p[1]) {
+ t = p[0];
+ p[0] = p[1];
+ p[1] = t;
+ }
+ getnextrlit(l);
+ } else
+ p[1] = p[0];
+ if(p >= l->cpairs + nelem(l->cpairs) - 2) {
+ regerror("Class too big\n");
+ longjmp(l->exitenv, 1);
+ }
+ p += 2;
}
-
- /* Close with a low priority operator */
- evaluntil(START);
-
- /* Force END */
- operand(END);
- evaluntil(START);
-#ifdef DEBUG
- dumpstack();
-#endif
- if(nbra)
- rcerror("unmatched left paren");
- --andp; /* points to first and only operand */
- pp->startinst = andp->first;
-#ifdef DEBUG
- dump(pp);
-#endif
- pp = optimize(pp);
-#ifdef DEBUG
- print("start: %d\n", andp->first-pp->firstinst);
- dump(pp);
-#endif
-out:
- if(errors){
- free(pp);
- pp = 0;
+ *p = L'\0';
+ qsort(l->cpairs, (p - l->cpairs)/2, 2*sizeof(*l->cpairs), pcmp);
+ q = l->cpairs;
+ for(p = l->cpairs+2; *p != 0; p += 2) {
+ if(p[1] < q[0] - 1) {
+ q[2] = p[0];
+ q[3] = p[1];
+ q += 2;
+ continue;
+ }
+ q[0] = p[0];
+ if(p[1] > q[1])
+ q[1] = p[1];
}
- return pp;
+ q[2] = 0;
}
-extern Reprog*
-regcomp(char *s)
+/* classes are in descending order */
+static Renode*
+buildclassn(Parselex *l)
{
- return regcomp1(s, 0, ANY);
+ Renode *n;
+ Rune *p;
+ int i;
+
+ i = 0;
+ p = l->cpairs;
+ n = node(l, TCLASS, nil, nil);
+ n->r = p[1] + 1;
+ n->r1 = Runemax;
+ n->nclass = i++;
+
+ for(; *p != 0; p += 2) {
+ n = node(l, TCLASS, n, nil);
+ n->r = p[3] + 1;
+ n->r1 = p[0] - 1;
+ n->nclass = i++;
+ }
+ n->r = 0;
+ return node(l, TCAT, node(l, TNOTNL, nil, nil), n);
}
-extern Reprog*
-regcomplit(char *s)
+static Renode*
+buildclass(Parselex *l)
{
- return regcomp1(s, 1, ANY);
+ Renode *n;
+ Rune *p;
+ int i;
+
+ i = 0;
+ n = node(l, TCLASS, nil, nil);
+ n->r = Runemax + 1;
+ n->nclass = i++;
+
+ for(p = l->cpairs; *p != 0; p += 2) {
+ n = node(l, TCLASS, n, nil);
+ n->r = p[0];
+ n->r1 = p[1];
+ n->nclass = i++;
+ }
+ return n;
}
-extern Reprog*
-regcompnl(char *s)
+static void
+prtree(Renode *tree, int d, int f)
{
- return regcomp1(s, 0, ANYNL);
+ int i;
+
+ if(tree == nil)
+ return;
+ if(f)
+ for(i = 0; i < d; i++)
+ print("\t");
+ switch(tree->op) {
+ case TCAT:
+ prtree(tree->left, d, 0);
+ prtree(tree->right, d, 1);
+ break;
+ case TOR:
+ print("TOR\n");
+ prtree(tree->left, d+1, 1);
+ for(i = 0; i < d; i++)
+ print("\t");
+ print("|\n");
+ prtree(tree->right, d+1, 1);
+ break;
+ case TSTAR:
+ print("*\n");
+ prtree(tree->left, d+1, 1);
+ break;
+ case TPLUS:
+ print("+\n");
+ prtree(tree->left, d+1, 1);
+ break;
+ case TQUES:
+ print("?\n");
+ prtree(tree->left, d+1, 1);
+ break;
+ case TANY:
+ print(".\n");
+ prtree(tree->left, d+1, 1);
+ break;
+ case TBOL:
+ print("^\n");
+ break;
+ case TEOL:
+ print("$\n");
+ break;
+ case TSUB:
+ print("TSUB\n");
+ prtree(tree->left, d+1, 1);
+ break;
+ case TRUNE:
+ print("TRUNE: %C\n", tree->r);
+ break;
+ case TNOTNL:
+ print("TNOTNL: !\\n\n");
+ break;
+ case TCLASS:
+ print("CLASS: %C-%C\n", tree->r, tree->r1);
+ prtree(tree->left, d, 1);
+ break;
+ }
}
diff --git a/sys/src/libregexp/regerror.c b/sys/src/libregexp/regerror.c
index 92b73a0d9..1e0eef175 100644
--- a/sys/src/libregexp/regerror.c
+++ b/sys/src/libregexp/regerror.c
@@ -1,6 +1,6 @@
#include <u.h>
#include <libc.h>
-#include "regexp.h"
+#include <regexp.h>
void
regerror(char *s)
diff --git a/sys/src/libregexp/regexec.c b/sys/src/libregexp/regexec.c
index b806ba60a..a153d77e9 100644
--- a/sys/src/libregexp/regexec.c
+++ b/sys/src/libregexp/regexec.c
@@ -1,232 +1,190 @@
#include <u.h>
#include <libc.h>
-#include "regexp.h"
-#include "regcomp.h"
+#include <regexp.h>
+#include "regimpl.h"
+typedef struct RethreadQ RethreadQ;
+struct RethreadQ
+{
+ Rethread *head;
+ Rethread **tail;
+};
-/*
- * return 0 if no match
- * >0 if a match
- * <0 if we ran out of _relist space
- */
-static int
-regexec1(Reprog *progp, /* program to run */
- char *bol, /* string to run machine on */
- Resub *mp, /* subexpression elements */
- int ms, /* number of elements at mp */
- Reljunk *j
-)
+int
+regexec(Reprog *prog, char *str, Resub *sem, int msize)
{
- int flag=0;
- Reinst *inst;
- Relist *tlp;
- char *s;
- int i, checkstart;
- Rune r, *rp, *ep;
- int n;
- Relist* tl; /* This list, next list */
- Relist* nl;
- Relist* tle; /* ends of this and next list */
- Relist* nle;
- int match;
- char *p;
+ RethreadQ lists[2], *clist, *nlist, *tmp;
+ Rethread *t, *nextthr, **availthr;
+ Reinst *curinst;
+ Rune r;
+ char *sp, *ep, endc;
+ int i, match, first, gen, matchpri, pri;
- match = 0;
- checkstart = j->starttype;
- if(mp)
- for(i=0; i<ms; i++) {
- mp[i].sp = 0;
- mp[i].ep = 0;
- }
- j->relist[0][0].inst = 0;
- j->relist[1][0].inst = 0;
+ if(msize > NSUBEXPM)
+ msize = NSUBEXPM;
- /* Execute machine once for each character, including terminal NUL */
- s = j->starts;
- do{
- /* fast check for first char */
- if(checkstart) {
- switch(j->starttype) {
- case RUNE:
- p = utfrune(s, j->startchar);
- if(p == 0 || s == j->eol)
- return match;
- s = p;
- break;
- case BOL:
- if(s == bol)
- break;
- p = utfrune(s, '\n');
- if(p == 0 || s == j->eol)
- return match;
- s = p+1;
- break;
- }
- }
- r = *(uchar*)s;
- if(r < Runeself)
- n = 1;
- else
- n = chartorune(&r, s);
+ if(prog->startinst->gen != 0) {
+ for(curinst = prog->startinst; curinst < prog->startinst + prog->len; curinst++)
+ curinst->gen = 0;
+ }
- /* switch run lists */
- tl = j->relist[flag];
- tle = j->reliste[flag];
- nl = j->relist[flag^=1];
- nle = j->reliste[flag];
- nl->inst = 0;
+ clist = lists;
+ clist->head = nil;
+ clist->tail = &clist->head;
+ nlist = lists + 1;
+ nlist->head = nil;
+ nlist->tail = &nlist->head;
- /* Add first instruction to current list */
- if(match == 0)
- _renewemptythread(tl, progp->startinst, ms, s);
+ for(i = 0; i < prog->nthr; i++)
+ prog->thrpool[i] = prog->threads + i;
+ availthr = prog->thrpool + prog->nthr;
- /* Execute machine until current list is empty */
- for(tlp=tl; tlp->inst; tlp++){ /* assignment = */
- for(inst = tlp->inst; ; inst = inst->next){
- switch(inst->type){
- case RUNE: /* regular character */
- if(inst->r == r){
- if(_renewthread(nl, inst->next, ms, &tlp->se)==nle)
- return -1;
- }
- break;
- case LBRA:
- tlp->se.m[inst->subid].sp = s;
- continue;
- case RBRA:
- tlp->se.m[inst->subid].ep = s;
- continue;
- case ANY:
- if(r != '\n')
- if(_renewthread(nl, inst->next, ms, &tlp->se)==nle)
- return -1;
- break;
- case ANYNL:
- if(_renewthread(nl, inst->next, ms, &tlp->se)==nle)
- return -1;
- break;
- case BOL:
- if(s == bol || *(s-1) == '\n')
- continue;
- break;
- case EOL:
- if(s == j->eol || r == 0 || r == '\n')
- continue;
- break;
- case CCLASS:
- ep = inst->cp->end;
- for(rp = inst->cp->spans; rp < ep; rp += 2)
- if(r >= rp[0] && r <= rp[1]){
- if(_renewthread(nl, inst->next, ms, &tlp->se)==nle)
- return -1;
- break;
- }
- break;
- case NCCLASS:
- ep = inst->cp->end;
- for(rp = inst->cp->spans; rp < ep; rp += 2)
- if(r >= rp[0] && r <= rp[1])
- break;
- if(rp == ep)
- if(_renewthread(nl, inst->next, ms, &tlp->se)==nle)
- return -1;
- break;
- case OR:
- /* evaluate right choice later */
- if(_renewthread(tlp, inst->right, ms, &tlp->se) == tle)
- return -1;
- /* efficiency: advance and re-evaluate */
- continue;
- case END: /* Match! */
- match = 1;
- tlp->se.m[0].ep = s;
- if(mp != 0)
- _renewmatch(mp, ms, &tlp->se);
- break;
- }
+ pri = matchpri = gen = match = 0;
+ sp = str;
+ ep = nil;
+ endc = '\0';
+ if(sem != nil && msize > 0) {
+ if(sem->sp != nil)
+ sp = sem->sp;
+ if(sem->ep != nil && *sem->ep != '\0') {
+ ep = sem->ep;
+ endc = *sem->ep;
+ *sem->ep = '\0';
+ }
+ }
+ r = Runemax + 1;
+ for(; r != L'\0'; sp += i) {
+ gen++;
+ i = chartorune(&r, sp);
+ first = 1;
+ t = clist->head;
+ if(t == nil)
+ goto Start;
+ curinst = t->pc;
+Again:
+ if(curinst->gen == gen)
+ goto Done;
+ curinst->gen = gen;
+ switch(curinst->op) {
+ case ORUNE:
+ if(r != curinst->r)
+ goto Done;
+ case OANY: /* fallthrough */
+ Any:
+ nextthr = t->next;
+ t->pc = curinst + 1;
+ t->next = nil;
+ *nlist->tail = t;
+ nlist->tail = &t->next;
+ if(nextthr == nil)
+ break;
+ t = nextthr;
+ curinst = t->pc;
+ goto Again;
+ case OCLASS:
+ Class:
+ if(r < curinst->r)
+ goto Done;
+ if(r > curinst->r1) {
+ curinst++;
+ goto Class;
+ }
+ nextthr = t->next;
+ t->pc = curinst->a;
+ t->next = nil;
+ *nlist->tail = t;
+ nlist->tail = &t->next;
+ if(nextthr == nil)
break;
+ t = nextthr;
+ curinst = t->pc;
+ goto Again;
+ case ONOTNL:
+ if(r != L'\n') {
+ curinst++;
+ goto Again;
+ }
+ goto Done;
+ case OBOL:
+ if(sp == str || sp[-1] == '\n') {
+ curinst++;
+ goto Again;
}
+ goto Done;
+ case OEOL:
+ if(r == L'\0' && ep == nil) {
+ curinst++;
+ goto Again;
+ }
+ if(r == L'\n')
+ goto Any;
+ goto Done;
+ case OJMP:
+ curinst = curinst->a;
+ goto Again;
+ case OSPLIT:
+ nextthr = *--availthr;
+ nextthr->pc = curinst->b;
+ if(msize > 0)
+ memcpy(nextthr->sem, t->sem, sizeof(Resub)*msize);
+ nextthr->pri = t->pri;
+ nextthr->next = t->next;
+ t->next = nextthr;
+ curinst = curinst->a;
+ goto Again;
+ case OSAVE:
+ if(curinst->sub < msize)
+ t->sem[curinst->sub].sp = sp;
+ curinst++;
+ goto Again;
+ case OUNSAVE:
+ if(curinst->sub == 0) {
+ /* "Highest" priority is the left-most longest. */
+ if (t->pri > matchpri)
+ goto Done;
+ match = 1;
+ matchpri = t->pri;
+ if(sem != nil && msize > 0) {
+ memcpy(sem, t->sem, sizeof(Resub)*msize);
+ sem->ep = sp;
+ }
+ goto Done;
+ }
+ if(curinst->sub < msize)
+ t->sem[curinst->sub].ep = sp;
+ curinst++;
+ goto Again;
+ Done:
+ *availthr++ = t;
+ t = t->next;
+ if(t == nil)
+ break;
+ curinst = t->pc;
+ goto Again;
}
- if(s == j->eol)
+Start:
+ /* Start again once if we haven't found anything. */
+ if(first == 1 && match == 0) {
+ first = 0;
+ t = *--availthr;
+ if(msize > 0)
+ memset(t->sem, 0, sizeof(Resub)*msize);
+ /* "Lower" priority thread */
+ t->pri = matchpri = pri++;
+ t->next = nil;
+ curinst = prog->startinst;
+ goto Again;
+ }
+ /* If we have a match and no extant threads, we are done. */
+ if(match == 1 && nlist->head == nil)
break;
- checkstart = j->starttype && nl->inst==0;
- s += n;
- }while(r);
- return match;
-}
-
-static int
-regexec2(Reprog *progp, /* program to run */
- char *bol, /* string to run machine on */
- Resub *mp, /* subexpression elements */
- int ms, /* number of elements at mp */
- Reljunk *j
-)
-{
- int rv;
- Relist *relist0, *relist1;
-
- /* mark space */
- relist0 = malloc(BIGLISTSIZE*sizeof(Relist));
- if(relist0 == nil)
- return -1;
- relist1 = malloc(BIGLISTSIZE*sizeof(Relist));
- if(relist1 == nil){
- free(relist1);
- return -1;
- }
- j->relist[0] = relist0;
- j->relist[1] = relist1;
- j->reliste[0] = relist0 + BIGLISTSIZE - 2;
- j->reliste[1] = relist1 + BIGLISTSIZE - 2;
-
- rv = regexec1(progp, bol, mp, ms, j);
- free(relist0);
- free(relist1);
- return rv;
-}
-
-extern int
-regexec(Reprog *progp, /* program to run */
- char *bol, /* string to run machine on */
- Resub *mp, /* subexpression elements */
- int ms) /* number of elements at mp */
-{
- Reljunk j;
- Relist relist0[LISTSIZE], relist1[LISTSIZE];
- int rv;
-
- /*
- * use user-specified starting/ending location if specified
- */
- j.starts = bol;
- j.eol = 0;
- if(mp && ms>0){
- if(mp->sp)
- j.starts = mp->sp;
- if(mp->ep)
- j.eol = mp->ep;
+ tmp = clist;
+ clist = nlist;
+ nlist = tmp;
+ nlist->head = nil;
+ nlist->tail = &nlist->head;
}
- j.starttype = 0;
- j.startchar = 0;
- if(progp->startinst->type == RUNE && progp->startinst->r < Runeself) {
- j.starttype = RUNE;
- j.startchar = progp->startinst->r;
- }
- if(progp->startinst->type == BOL)
- j.starttype = BOL;
-
- /* mark space */
- j.relist[0] = relist0;
- j.relist[1] = relist1;
- j.reliste[0] = relist0 + nelem(relist0) - 2;
- j.reliste[1] = relist1 + nelem(relist1) - 2;
-
- rv = regexec1(progp, bol, mp, ms, &j);
- if(rv >= 0)
- return rv;
- rv = regexec2(progp, bol, mp, ms, &j);
- if(rv >= 0)
- return rv;
- return -1;
+ if(ep != nil)
+ *ep = endc;
+ return match;
}
diff --git a/sys/src/libregexp/regimpl.h b/sys/src/libregexp/regimpl.h
new file mode 100644
index 000000000..f173a970c
--- /dev/null
+++ b/sys/src/libregexp/regimpl.h
@@ -0,0 +1,104 @@
+enum
+{
+ LANY = 0,
+ LBOL,
+ LCLASS,
+ LEND,
+ LEOL,
+ LLPAR,
+ LOR,
+ LREP,
+ LRPAR,
+ LRUNE,
+
+ TANY = 0,
+ TBOL,
+ TCAT,
+ TCLASS,
+ TEOL,
+ TNOTNL,
+ TOR,
+ TPLUS,
+ TQUES,
+ TRUNE,
+ TSTAR,
+ TSUB,
+
+ NSUBEXPM = 32
+};
+
+typedef struct Parselex Parselex;
+typedef struct Renode Renode;
+
+struct Parselex
+{
+ /* Parse */
+ Renode *next;
+ Renode *nodes;
+ int sub;
+ int instrs;
+ jmp_buf exitenv;
+ /* Lex */
+ void (*getnextr)(Parselex*);
+ char *rawexp;
+ char *orig;
+ Rune rune;
+ Rune peek;
+ int peeklex;
+ int done;
+ int literal;
+ Rune cpairs[400+2];
+ int nc;
+};
+struct Renode
+{
+ int op;
+ Renode *left;
+ Rune r;
+ union
+ {
+ Rune r1;
+ int sub;
+ Renode *right;
+ };
+ int nclass;
+};
+struct Rethread
+{
+ Reinst *pc;
+ Resub sem[NSUBEXPM];
+ int pri;
+ Rethread *next;
+};
+struct Reinst
+{
+ char op;
+ int gen;
+ Reinst *a;
+ union
+ {
+ Rune r;
+ int sub;
+ };
+ union
+ {
+ Rune r1;
+ Reinst *b;
+ };
+};
+
+static int lex(Parselex*);
+static void getnextr(Parselex*);
+static void getnextrlit(Parselex*);
+static void getclass(Parselex*);
+static Renode *e0(Parselex*);
+static Renode *e1(Parselex*);
+static Renode *e2(Parselex*);
+static Renode *e3(Parselex*);
+static Renode *buildclass(Parselex*);
+static Renode *buildclassn(Parselex*);
+static int pcmp(void*, void*);
+static Reprog *regcomp1(char*, int, int);
+static Reinst *compile(Renode*, Reprog*, int);
+static Reinst *compile1(Renode*, Reinst*, int*, int);
+static void prtree(Renode*, int, int);
diff --git a/sys/src/libregexp/regprint.c b/sys/src/libregexp/regprint.c
new file mode 100644
index 000000000..673ca1f7b
--- /dev/null
+++ b/sys/src/libregexp/regprint.c
@@ -0,0 +1,66 @@
+#include <u.h>
+#include <libc.h>
+#include <regexp.h>
+#include <regimpl.h>
+
+static int
+fmtprinst(Fmt *f, Reinst *inst)
+{
+ int r;
+
+ r = fmtprint(f, "%p ", inst);
+ switch(inst->op) {
+ case ORUNE:
+ r += fmtprint(f, "ORUNE\t%C\n", inst->r);
+ break;
+ case ONOTNL:
+ r += fmtprint(f, "ONOTNL\n");
+ break;
+ case OCLASS:
+ r += fmtprint(f, "OCLASS\t%C-%C %p\n", inst->r, inst->r1, inst->a);
+ break;
+ case OSPLIT:
+ r += fmtprint(f, "OSPLIT\t%p %p\n", inst->a, inst->b);
+ break;
+ case OJMP:
+ r += fmtprint(f, "OJMP \t%p\n", inst->a);
+ break;
+ case OSAVE:
+ r += fmtprint(f, "OSAVE\t%d\n", inst->sub);
+ break;
+ case OUNSAVE:
+ r += fmtprint(f, "OUNSAVE\t%d\n", inst->sub);
+ break;
+ case OANY:
+ r += fmtprint(f, "OANY \t.\n");
+ break;
+ case OEOL:
+ r += fmtprint(f, "OEOL \t$\n");
+ break;
+ case OBOL:
+ r += fmtprint(f, "OBOL \t^\n");
+ break;
+ }
+ return r;
+}
+
+static int
+fmtprprog(Fmt *f, Reprog *reprog)
+{
+ Reinst *inst;
+ int r;
+
+ r = 0;
+ for(inst = reprog->startinst; inst < reprog->startinst + reprog->len; inst++)
+ r += fmtprinst(f, inst);
+ return r;
+}
+
+int
+reprogfmt(Fmt *f)
+{
+ Reprog *r;
+
+ r = va_arg(f->args, Reprog*);
+ return fmtprprog(f, r);
+}
diff --git a/sys/src/libregexp/regsub.c b/sys/src/libregexp/regsub.c
index 2474d2989..d11d86f25 100644
--- a/sys/src/libregexp/regsub.c
+++ b/sys/src/libregexp/regsub.c
@@ -1,63 +1,66 @@
#include <u.h>
#include <libc.h>
-#include "regexp.h"
+#include <regexp.h>
-/* substitute into one string using the matches from the last regexec() */
-extern void
-regsub(char *sp, /* source string */
- char *dp, /* destination string */
- int dlen,
- Resub *mp, /* subexpression elements */
- int ms) /* number of elements pointed to by mp */
+void
+regsub(char *src, char *dst, int dlen, Resub *match, int msize)
{
- char *ssp, *ep;
int i;
+ char *ep, c;
- ep = dp+dlen-1;
- while(*sp != '\0'){
- if(*sp == '\\'){
- switch(*++sp){
- case '0':
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- case '8':
- case '9':
- i = *sp-'0';
- if(mp!=0 && mp[i].sp != 0 && ms>i)
- for(ssp = mp[i].sp;
- ssp < mp[i].ep;
- ssp++)
- if(dp < ep)
- *dp++ = *ssp;
- break;
- case '\\':
- if(dp < ep)
- *dp++ = '\\';
- break;
- case '\0':
- sp--;
- break;
- default:
- if(dp < ep)
- *dp++ = *sp;
- break;
+ ep = dst + dlen-1;
+ for(;*src != '\0'; src++) switch(*src) {
+ case '\\':
+ switch(*++src) {
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ i = *src - '0';
+ if(match != nil && i < msize && match[i].ep != nil) {
+ c = *match[i].ep;
+ *match[i].ep = '\0';
+ dst = strecpy(dst, ep+1, match[i].sp);
+ *match[i].ep = c;
}
- }else if(*sp == '&'){
- if(mp!=0 && mp[0].sp != 0 && ms>0)
- for(ssp = mp[0].sp;
- ssp < mp[0].ep; ssp++)
- if(dp < ep)
- *dp++ = *ssp;
- }else{
- if(dp < ep)
- *dp++ = *sp;
+ break;
+ case '\\':
+ if(dst < ep)
+ *dst++ = '\\';
+ else
+ goto End;
+ break;
+ case '\0':
+ goto End;
+ default:
+ if(dst < ep)
+ *dst++ = *src;
+ else
+ goto End;
+ break;
}
- sp++;
+ break;
+ case '&':
+ if(match != nil && msize > 0 && match[0].sp != nil) {
+ c = *match[0].ep;
+ *match[0].ep = '\0';
+ dst = strecpy(dst, ep+1, match[0].sp);
+ *match[0].ep = c;
+ }
+ break;
+ default:
+ if(dst < ep)
+ *dst++ = *src;
+ else
+ goto End;
+ break;
}
- *dp = '\0';
+End:
+ *dst = '\0';
}
diff --git a/sys/src/libregexp/rregexec.c b/sys/src/libregexp/rregexec.c
index 301a3589b..572718733 100644
--- a/sys/src/libregexp/rregexec.c
+++ b/sys/src/libregexp/rregexec.c
@@ -1,212 +1,189 @@
#include <u.h>
#include <libc.h>
-#include "regexp.h"
-#include "regcomp.h"
+#include <regexp.h>
+#include "regimpl.h"
-/*
- * return 0 if no match
- * >0 if a match
- * <0 if we ran out of _relist space
- */
-static int
-rregexec1(Reprog *progp, /* program to run */
- Rune *bol, /* string to run machine on */
- Resub *mp, /* subexpression elements */
- int ms, /* number of elements at mp */
- Reljunk *j)
+typedef struct RethreadQ RethreadQ;
+struct RethreadQ
{
- int flag=0;
- Reinst *inst;
- Relist *tlp;
- Rune *s;
- int i, checkstart;
- Rune r, *rp, *ep;
- Relist* tl; /* This list, next list */
- Relist* nl;
- Relist* tle; /* ends of this and next list */
- Relist* nle;
- int match;
- Rune *p;
+ Rethread *head;
+ Rethread **tail;
+};
- match = 0;
- checkstart = j->startchar;
- if(mp)
- for(i=0; i<ms; i++) {
- mp[i].rsp = 0;
- mp[i].rep = 0;
- }
- j->relist[0][0].inst = 0;
- j->relist[1][0].inst = 0;
+int
+rregexec(Reprog *prog, Rune *str, Resub *sem, int msize)
+{
+ RethreadQ lists[2], *clist, *nlist, *tmp;
+ Rethread *t, *nextthr, **availthr;
+ Reinst *curinst;
+ Rune *rsp, *rep, endr, last;
+ int i, match, first, gen, pri, matchpri;
- /* Execute machine once for each character, including terminal NUL */
- s = j->rstarts;
- do{
- /* fast check for first char */
- if(checkstart) {
- switch(j->starttype) {
- case RUNE:
- p = runestrchr(s, j->startchar);
- if(p == 0 || s == j->reol)
- return match;
- s = p;
- break;
- case BOL:
- if(s == bol)
- break;
- p = runestrchr(s, '\n');
- if(p == 0 || s == j->reol)
- return match;
- s = p+1;
- break;
- }
- }
+ if(msize > NSUBEXPM)
+ msize = NSUBEXPM;
- r = *s;
+ if(prog->startinst->gen != 0) {
+ for(curinst = prog->startinst; curinst < prog->startinst + prog->len; curinst++)
+ curinst->gen = 0;
+ }
- /* switch run lists */
- tl = j->relist[flag];
- tle = j->reliste[flag];
- nl = j->relist[flag^=1];
- nle = j->reliste[flag];
- nl->inst = 0;
+ clist = lists;
+ clist->head = nil;
+ clist->tail = &clist->head;
+ nlist = lists + 1;
+ nlist->head = nil;
+ nlist->tail = &nlist->head;
- /* Add first instruction to current list */
- _rrenewemptythread(tl, progp->startinst, ms, s);
+ for(i = 0; i < prog->nthr; i++)
+ prog->thrpool[i] = prog->threads + i;
+ availthr = prog->thrpool + prog->nthr;
- /* Execute machine until current list is empty */
- for(tlp=tl; tlp->inst; tlp++){
- for(inst=tlp->inst; ; inst = inst->next){
- switch(inst->type){
- case RUNE: /* regular character */
- if(inst->r == r)
- if(_renewthread(nl, inst->next, ms, &tlp->se)==nle)
- return -1;
- break;
- case LBRA:
- tlp->se.m[inst->subid].rsp = s;
- continue;
- case RBRA:
- tlp->se.m[inst->subid].rep = s;
- continue;
- case ANY:
- if(r != '\n')
- if(_renewthread(nl, inst->next, ms, &tlp->se)==nle)
- return -1;
- break;
- case ANYNL:
- if(_renewthread(nl, inst->next, ms, &tlp->se)==nle)
- return -1;
- break;
- case BOL:
- if(s == bol || *(s-1) == '\n')
- continue;
- break;
- case EOL:
- if(s == j->reol || r == 0 || r == '\n')
- continue;
- break;
- case CCLASS:
- ep = inst->cp->end;
- for(rp = inst->cp->spans; rp < ep; rp += 2)
- if(r >= rp[0] && r <= rp[1]){
- if(_renewthread(nl, inst->next, ms, &tlp->se)==nle)
- return -1;
- break;
- }
- break;
- case NCCLASS:
- ep = inst->cp->end;
- for(rp = inst->cp->spans; rp < ep; rp += 2)
- if(r >= rp[0] && r <= rp[1])
- break;
- if(rp == ep)
- if(_renewthread(nl, inst->next, ms, &tlp->se)==nle)
- return -1;
- break;
- case OR:
- /* evaluate right choice later */
- if(_renewthread(tlp, inst->right, ms, &tlp->se) == tle)
- return -1;
- /* efficiency: advance and re-evaluate */
- continue;
- case END: /* Match! */
- match = 1;
- tlp->se.m[0].rep = s;
- if(mp != 0)
- _renewmatch(mp, ms, &tlp->se);
- break;
- }
+ pri = matchpri = gen = match = 0;
+ rsp = str;
+ rep = nil;
+ endr = L'\0';
+ if(sem != nil && msize > 0) {
+ if(sem->rsp != nil)
+ rsp = sem->rsp;
+ if(sem->rep != nil && *sem->rep != L'\0') {
+ rep = sem->rep;
+ endr = *sem->rep;
+ *sem->rep = '\0';
+ }
+ }
+ last = 1;
+ for(; last != L'\0'; rsp++) {
+ gen++;
+ last = *rsp;
+ first = 1;
+ t = clist->head;
+ if(t == nil)
+ goto Start;
+ curinst = t->pc;
+Again:
+ if(curinst->gen == gen)
+ goto Done;
+ curinst->gen = gen;
+ switch(curinst->op) {
+ case ORUNE:
+ if(*rsp != curinst->r)
+ goto Done;
+ case OANY: /* fallthrough */
+ Any:
+ nextthr = t->next;
+ t->pc = curinst + 1;
+ t->next = nil;
+ *nlist->tail = t;
+ nlist->tail = &t->next;
+ if(nextthr == nil)
+ break;
+ t = nextthr;
+ curinst = t->pc;
+ goto Again;
+ case OCLASS:
+ Class:
+ if(*rsp < curinst->r)
+ goto Done;
+ if(*rsp > curinst->r1) {
+ curinst++;
+ goto Class;
+ }
+ nextthr = t->next;
+ t->pc = curinst->a;
+ t->next = nil;
+ *nlist->tail = t;
+ nlist->tail = &t->next;
+ if(nextthr == nil)
break;
+ t = nextthr;
+ curinst = t->pc;
+ goto Again;
+ case ONOTNL:
+ if(*rsp != L'\n') {
+ curinst++;
+ goto Again;
+ }
+ goto Done;
+ case OBOL:
+ if(rsp == str || rsp[-1] == L'\n') {
+ curinst++;
+ goto Again;
+ }
+ goto Done;
+ case OEOL:
+ if(*rsp == L'\0' && rep == nil) {
+ curinst++;
+ goto Again;
+ }
+ if(*rsp == '\n')
+ goto Any;
+ goto Done;
+ case OJMP:
+ curinst = curinst->a;
+ goto Again;
+ case OSPLIT:
+ nextthr = *--availthr;
+ nextthr->pc = curinst->b;
+ if(msize > 0)
+ memcpy(nextthr->sem, t->sem, sizeof(Resub)*msize);
+ nextthr->pri = t->pri;
+ nextthr->next = t->next;
+ t->next = nextthr;
+ curinst = curinst->a;
+ goto Again;
+ case OSAVE:
+ if(curinst->sub < msize)
+ t->sem[curinst->sub].rsp = rsp;
+ curinst++;
+ goto Again;
+ case OUNSAVE:
+ if(curinst->sub == 0) {
+ /* "Highest" priority is the left-most longest. */
+ if (t->pri > matchpri)
+ goto Done;
+ match = 1;
+ matchpri = t->pri;
+ if(sem != nil && msize > 0) {
+ memcpy(sem, t->sem, sizeof(Resub)*msize);
+ sem->rep = rsp;
+ }
+ goto Done;
}
+ if(curinst->sub < msize)
+ t->sem[curinst->sub].rep = rsp;
+ curinst++;
+ goto Again;
+ Done:
+ *availthr++ = t;
+ t = t->next;
+ if(t == nil)
+ break;
+ curinst = t->pc;
+ goto Again;
+ }
+Start:
+ /* Start again once if we haven't found anything. */
+ if(first == 1 && match == 0) {
+ first = 0;
+ t = *--availthr;
+ if(msize > 0)
+ memset(t->sem, 0, sizeof(Resub)*msize);
+ /* "Lower" priority thread */
+ t->pri = matchpri = pri++;
+ t->next = nil;
+ curinst = prog->startinst;
+ goto Again;
}
- if(s == j->reol)
+ /* If we have a match and no extant threads, we are done. */
+ if(match == 1 && nlist->head == nil)
break;
- checkstart = j->startchar && nl->inst==0;
- s++;
- }while(r);
- return match;
-}
-
-static int
-rregexec2(Reprog *progp, /* program to run */
- Rune *bol, /* string to run machine on */
- Resub *mp, /* subexpression elements */
- int ms, /* number of elements at mp */
- Reljunk *j
-)
-{
- Relist relist0[5*LISTSIZE], relist1[5*LISTSIZE];
-
- /* mark space */
- j->relist[0] = relist0;
- j->relist[1] = relist1;
- j->reliste[0] = relist0 + nelem(relist0) - 2;
- j->reliste[1] = relist1 + nelem(relist1) - 2;
-
- return rregexec1(progp, bol, mp, ms, j);
-}
-
-extern int
-rregexec(Reprog *progp, /* program to run */
- Rune *bol, /* string to run machine on */
- Resub *mp, /* subexpression elements */
- int ms) /* number of elements at mp */
-{
- Reljunk j;
- Relist relist0[LISTSIZE], relist1[LISTSIZE];
- int rv;
-
- /*
- * use user-specified starting/ending location if specified
- */
- j.rstarts = bol;
- j.reol = 0;
- if(mp && ms>0){
- if(mp->sp)
- j.rstarts = mp->rsp;
- if(mp->ep)
- j.reol = mp->rep;
+ tmp = clist;
+ clist = nlist;
+ nlist = tmp;
+ nlist->head = nil;
+ nlist->tail = &nlist->head;
}
- j.starttype = 0;
- j.startchar = 0;
- if(progp->startinst->type == RUNE && progp->startinst->r < Runeself) {
- j.starttype = RUNE;
- j.startchar = progp->startinst->r;
- }
- if(progp->startinst->type == BOL)
- j.starttype = BOL;
-
- /* mark space */
- j.relist[0] = relist0;
- j.relist[1] = relist1;
- j.reliste[0] = relist0 + nelem(relist0) - 2;
- j.reliste[1] = relist1 + nelem(relist1) - 2;
-
- rv = rregexec1(progp, bol, mp, ms, &j);
- if(rv >= 0)
- return rv;
- rv = rregexec2(progp, bol, mp, ms, &j);
- if(rv >= 0)
- return rv;
- return -1;
+ if(rep != nil)
+ *rep = endr;
+ return match;
}
diff --git a/sys/src/libregexp/rregsub.c b/sys/src/libregexp/rregsub.c
index c395583d0..88cbbfc89 100644
--- a/sys/src/libregexp/rregsub.c
+++ b/sys/src/libregexp/rregsub.c
@@ -1,64 +1,66 @@
#include <u.h>
#include <libc.h>
-#include "regexp.h"
+#include <regexp.h>
-/* substitute into one string using the matches from the last regexec() */
-extern void
-rregsub(Rune *sp, /* source string */
- Rune *dp, /* destination string */
- int dlen,
- Resub *mp, /* subexpression elements */
- int ms) /* number of elements pointed to by mp */
+void
+rregsub(Rune *src, Rune *dst, int dlen, Resub *match, int msize)
{
- Rune *ssp, *ep;
int i;
+ Rune *ep, r;
- ep = dp+(dlen/sizeof(Rune))-1;
- while(*sp != '\0'){
- if(*sp == '\\'){
- switch(*++sp){
- case '0':
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- case '8':
- case '9':
- i = *sp-'0';
- if(mp[i].rsp != 0 && mp!=0 && ms>i)
- for(ssp = mp[i].rsp;
- ssp < mp[i].rep;
- ssp++)
- if(dp < ep)
- *dp++ = *ssp;
- break;
- case '\\':
- if(dp < ep)
- *dp++ = '\\';
- break;
- case '\0':
- sp--;
- break;
- default:
- if(dp < ep)
- *dp++ = *sp;
- break;
+ ep = dst + dlen-1;
+ for(;*src != L'\0'; src++) switch(*src) {
+ case L'\\':
+ switch(*++src) {
+ case L'0':
+ case L'1':
+ case L'2':
+ case L'3':
+ case L'4':
+ case L'5':
+ case L'6':
+ case L'7':
+ case L'8':
+ case L'9':
+ i = *src - L'0';
+ if(match != nil && i < msize && match[i].rsp != nil) {
+ r = *match[i].rep;
+ *match[i].rep = L'\0';
+ dst = runestrecpy(dst, ep+1, match[i].rsp);
+ *match[i].rep = r;
}
- }else if(*sp == '&'){
- if(mp[0].rsp != 0 && mp!=0 && ms>0)
- if(mp[0].rsp != 0)
- for(ssp = mp[0].rsp;
- ssp < mp[0].rep; ssp++)
- if(dp < ep)
- *dp++ = *ssp;
- }else{
- if(dp < ep)
- *dp++ = *sp;
+ break;
+ case L'\\':
+ if(dst < ep)
+ *dst++ = L'\\';
+ else
+ goto End;
+ break;
+ case L'\0':
+ goto End;
+ default:
+ if(dst < ep)
+ *dst++ = *src;
+ else
+ goto End;
+ break;
}
- sp++;
+ break;
+ case L'&':
+ if(match != nil && msize > 0 && match[0].rsp != nil) {
+ r = *match[0].rep;
+ *match[0].rep = L'\0';
+ dst = runestrecpy(dst, ep+1, match[0].rsp);
+ *match[0].rep = r;
+ }
+ break;
+ default:
+ if(dst < ep)
+ *dst++ = *src;
+ else
+ goto End;
+ break;
}
- *dp = '\0';
+End:
+ *dst = L'\0';
}