diff options
author | ben <ben@rana> | 2016-04-26 22:23:44 -0500 |
---|---|---|
committer | ben <ben@rana> | 2016-04-26 22:23:44 -0500 |
commit | 0a460e1722c50e31653359f8a86fe0b606d2b513 (patch) | |
tree | b5a00bbfc883aa98709db012e0a7bacc67e234af /sys/src/libregexp | |
parent | 651d6c2bc68e7e5224c3ba41b094e37b1c1890ed (diff) |
New libregexp and APE ported to native
Diffstat (limited to 'sys/src/libregexp')
-rw-r--r-- | sys/src/libregexp/mkfile | 12 | ||||
-rw-r--r-- | sys/src/libregexp/regcomp.c | 971 | ||||
-rw-r--r-- | sys/src/libregexp/regerror.c | 2 | ||||
-rw-r--r-- | sys/src/libregexp/regexec.c | 390 | ||||
-rw-r--r-- | sys/src/libregexp/regimpl.h | 104 | ||||
-rw-r--r-- | sys/src/libregexp/regprint.c | 66 | ||||
-rw-r--r-- | sys/src/libregexp/regsub.c | 109 | ||||
-rw-r--r-- | sys/src/libregexp/rregexec.c | 369 | ||||
-rw-r--r-- | sys/src/libregexp/rregsub.c | 110 |
9 files changed, 1128 insertions, 1005 deletions
diff --git a/sys/src/libregexp/mkfile b/sys/src/libregexp/mkfile index dcc457028..f724310ae 100644 --- a/sys/src/libregexp/mkfile +++ b/sys/src/libregexp/mkfile @@ -6,12 +6,12 @@ OFILES=\ regerror.$O\ regexec.$O\ regsub.$O\ - regaux.$O\ rregexec.$O\ rregsub.$O\ + regprint.$O\ HFILES=/sys/include/regexp.h\ - regcomp.h\ + regimpl.h\ UPDATE=\ mkfile\ @@ -21,8 +21,8 @@ UPDATE=\ </sys/src/cmd/mksyslib -test: test.$O $OFILES - $LD -o test $prereq +$O.regextest: tests/regextest.$O $LIB + $LD -o $target regextest.$O -test2: test2.$O $OFILES - $LD -o test2 $prereq +$O.sysregextest: tests/sysregextest.$O + $LD -o $target sysregextest.$O diff --git a/sys/src/libregexp/regcomp.c b/sys/src/libregexp/regcomp.c index e48ef15ac..58bf864d7 100644 --- a/sys/src/libregexp/regcomp.c +++ b/sys/src/libregexp/regcomp.c @@ -1,567 +1,580 @@ #include <u.h> #include <libc.h> -#include "regexp.h" -#include "regcomp.h" - -#define TRUE 1 -#define FALSE 0 - -/* - * Parser Information - */ -typedef -struct Node -{ - Reinst* first; - Reinst* last; -}Node; - -/* max character classes per program is nelem(reprog->class) */ -static Reprog *reprog; - -/* max rune ranges per character class is nelem(classp->spans)/2 */ -#define NCCRUNE nelem(classp->spans) - -#define NSTACK 20 -static Node andstack[NSTACK]; -static Node *andp; -static int atorstack[NSTACK]; -static int* atorp; -static int cursubid; /* id of current subexpression */ -static int subidstack[NSTACK]; /* parallel to atorstack */ -static int* subidp; -static int lastwasand; /* Last token was operand */ -static int nbra; -static char* exprp; /* pointer to next character in source expression */ -static int lexdone; -static int nclass; -static Reclass*classp; -static Reinst* freep; -static int errors; -static Rune yyrune; /* last lex'd rune */ -static Reclass*yyclassp; /* last lex'd class */ - -/* predeclared crap */ -static void operator(int); -static void pushand(Reinst*, Reinst*); -static void pushator(int); -static void evaluntil(int); -static int bldcclass(void); - -static jmp_buf regkaboom; - -static void -rcerror(char *s) +#include <regexp.h> +#include "regimpl.h" + +int instrcnt[] = { + [TANY] 2, + [TBOL] 1, + [TCAT] 0, + [TCLASS] 1, + [TEOL] 1, + [TNOTNL] 1, + [TOR] 2, + [TPLUS] 1, + [TQUES] 1, + [TRUNE] 1, + [TSTAR] 2, + [TSUB] 2 +}; + +static Renode* +node(Parselex *plex, int op, Renode *l, Renode *r) { - errors++; - regerror(s); - longjmp(regkaboom, 1); + Renode *n; + + plex->instrs += instrcnt[op]; + n = plex->next++; + n->op = op; + n->left = l; + n->right = r; + return n; } -static Reinst* -newinst(int t) +static Renode* +e3(Parselex *plex) { - freep->type = t; - freep->left = 0; - freep->right = 0; - return freep++; + char error[128]; + Renode *n; + + switch(lex(plex)) { + case LANY: + return node(plex, TANY, nil, nil); + case LBOL: + return node(plex, TBOL, nil, nil); + case LEOL: + return node(plex, TEOL, nil, nil); + case LRUNE: + n = node(plex, TRUNE, nil, nil); + n->r = plex->rune; + return n; + case LCLASS: + if(plex->nc) + return buildclassn(plex); + return buildclass(plex); + case LLPAR: + n = e0(plex); + n = node(plex, TSUB, n, nil); + if(lex(plex) != LRPAR) { + snprint(error, sizeof(error), "regexp %s: no matching parenthesis", plex->orig); + regerror(error); + longjmp(plex->exitenv, 1); + } + return n; + default: + if(plex->rune) + snprint(error, sizeof(error), "regexp %s: syntax error: %C", plex->orig, plex->rune); + else + snprint(error, sizeof(error), "regexp %s: parsing error", plex->orig); + regerror(error); + longjmp(plex->exitenv, 1); + } + return nil; } -static void -operand(int t) +static Renode* +e2(Parselex *plex) { - Reinst *i; - - if(lastwasand) - operator(CAT); /* catenate is implicit */ - i = newinst(t); - - if(t == CCLASS || t == NCCLASS) - i->cp = yyclassp; - if(t == RUNE) - i->r = yyrune; - - pushand(i, i); - lastwasand = TRUE; + Renode *n; + + n = e3(plex); + if(lex(plex) == LREP) { + switch(plex->rune) { + case L'*': + return node(plex, TSTAR, n, nil); + case L'+': + return node(plex, TPLUS, n, nil); + case L'?': + return node(plex, TQUES, n, nil); + } + } + plex->peek = 1; + return n; } -static void -operator(int t) +static Renode* +invert(Renode *n) { - if(t==RBRA && --nbra<0) - rcerror("unmatched right paren"); - if(t==LBRA){ - if(++cursubid >= NSUBEXP) - rcerror ("too many subexpressions"); - nbra++; - if(lastwasand) - operator(CAT); - } else - evaluntil(t); - if(t != RBRA) - pushator(t); - lastwasand = FALSE; - if(t==STAR || t==QUEST || t==PLUS || t==RBRA) - lastwasand = TRUE; /* these look like operands */ + Renode *n1; + + if(n->op != TCAT) + return n; + while(n->left->op == TCAT) { + n1 = n->left; + n->left = n1->right; + n1->right = n; + n = n1; + } + return n; } -static void -regerr2(char *s, int c) +static Renode* +e1(Parselex *plex) { - char buf[100]; - char *cp = buf; - while(*s) - *cp++ = *s++; - *cp++ = c; - *cp = '\0'; - rcerror(buf); + Renode *n; + int sym; + + n = e2(plex); + for(;;) { + sym = lex(plex); + if(sym == LEND || sym == LOR || sym == LRPAR) + break; + plex->peek = 1; + n = node(plex, TCAT, n, e2(plex)); + } + plex->peek = 1; + return invert(n); } -static void -cant(char *s) +static Renode* +e0(Parselex *plex) { - char buf[100]; - strcpy(buf, "can't happen: "); - strcat(buf, s); - rcerror(buf); + Renode *n; + + n = e1(plex); + for(;;) { + if(lex(plex) != LOR) + break; + n = node(plex, TOR, n, e1(plex)); + } + plex->peek = 1; + return n; } -static void -pushand(Reinst *f, Reinst *l) +static Parselex* +initplex(Parselex *plex, char *regstr, int lit) { - if(andp >= &andstack[NSTACK]) - cant("operand stack overflow"); - andp->first = f; - andp->last = l; - andp++; + plex->getnextr = lit ? getnextrlit : getnextr; + plex->rawexp = plex->orig = regstr; + plex->sub = 0; + plex->instrs = 0; + plex->peek = 0; + plex->done = 0; + return plex; } -static void -pushator(int t) +static int +maxthreads(Renode *tree) { - if(atorp >= &atorstack[NSTACK]) - cant("operator stack overflow"); - *atorp++ = t; - *subidp++ = cursubid; + tree = tree->left; + if(tree->op == TCAT) + tree = tree->left; + if(tree->op == TBOL) + return 2; + return -1; } -static Node* -popand(int op) +static Reprog* +regcomp1(char *regstr, int nl, int lit) { - Reinst *inst; - - if(andp <= &andstack[0]){ - regerr2("missing operand for ", op); - inst = newinst(NOP); - pushand(inst,inst); + Reprog *reprog; + Parselex plex; + Renode *parsetr; + int regstrlen, maxthr; + + regstrlen = utflen(regstr); + initplex(&plex, regstr, lit); + plex.nodes = calloc(sizeof(*plex.nodes), regstrlen*2); + if(plex.nodes == nil) + return nil; + plex.next = plex.nodes; + + if(setjmp(plex.exitenv) != 0) { + free(plex.nodes); + return nil; } - return --andp; + + parsetr = node(&plex, TSUB, e0(&plex), nil); + maxthr = maxthreads(parsetr); + if(maxthr == -1) + maxthr = regstrlen; + + prtree(parsetr, 0, 1); + reprog = malloc(sizeof(Reprog) + + sizeof(Reinst) * plex.instrs + + sizeof(Rethread) * maxthr + + sizeof(Rethread*) * maxthr); + reprog->len = plex.instrs; + reprog->nthr = maxthr; + reprog->startinst = compile(parsetr, reprog, nl); + reprog->threads = (Rethread*)(reprog->startinst + reprog->len); + reprog->thrpool = (Rethread**)(reprog->threads + reprog->nthr); + reprog->regstr = regstr; + + free(plex.nodes); + return reprog; } -static int -popator(void) +Reprog* +regcomp(char *str) { - if(atorp <= &atorstack[0]) - cant("operator stack underflow"); - --subidp; - return *--atorp; + return regcomp1(str, 0, 0); } -static void -evaluntil(int pri) +Reprog* +regcomplit(char *str) { - Node *op1, *op2; - Reinst *inst1, *inst2; - - while(pri==RBRA || atorp[-1]>=pri){ - switch(popator()){ - default: - rcerror("unknown operator in evaluntil"); - break; - case LBRA: /* must have been RBRA */ - op1 = popand('('); - inst2 = newinst(RBRA); - inst2->subid = *subidp; - op1->last->next = inst2; - inst1 = newinst(LBRA); - inst1->subid = *subidp; - inst1->next = op1->first; - pushand(inst1, inst2); - return; - case OR: - op2 = popand('|'); - op1 = popand('|'); - inst2 = newinst(NOP); - op2->last->next = inst2; - op1->last->next = inst2; - inst1 = newinst(OR); - inst1->right = op1->first; - inst1->left = op2->first; - pushand(inst1, inst2); - break; - case CAT: - op2 = popand(0); - op1 = popand(0); - op1->last->next = op2->first; - pushand(op1->first, op2->last); - break; - case STAR: - op2 = popand('*'); - inst1 = newinst(OR); - op2->last->next = inst1; - inst1->right = op2->first; - pushand(inst1, inst1); - break; - case PLUS: - op2 = popand('+'); - inst1 = newinst(OR); - op2->last->next = inst1; - inst1->right = op2->first; - pushand(op2->first, inst1); - break; - case QUEST: - op2 = popand('?'); - inst1 = newinst(OR); - inst2 = newinst(NOP); - inst1->left = inst2; - inst1->right = op2->first; - op2->last->next = inst2; - pushand(inst1, inst2); - break; - } - } + return regcomp1(str, 0, 1); } -static Reprog* -optimize(Reprog *pp) +Reprog* +regcompnl(char *str) { - Reinst *inst, *target; - int size; - Reprog *npp; - Reclass *cl; - int diff; - - /* - * get rid of NOOP chains - */ - for(inst=pp->firstinst; inst->type!=END; inst++){ - target = inst->next; - while(target->type == NOP) - target = target->next; - inst->next = target; - } - - /* - * The original allocation is for an area larger than - * necessary. Reallocate to the actual space used - * and then relocate the code. - */ - size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst); - npp = realloc(pp, size); - if(npp==0 || npp==pp) - return pp; - diff = (char *)npp - (char *)pp; - freep = (Reinst *)((char *)freep + diff); - for(inst=npp->firstinst; inst<freep; inst++){ - switch(inst->type){ - case OR: - case STAR: - case PLUS: - case QUEST: - *(char **)&inst->right += diff; - break; - case CCLASS: - case NCCLASS: - *(char **)&inst->right += diff; - cl = inst->cp; - *(char **)&cl->end += diff; - break; - } - *(char **)&inst->left += diff; - } - *(char **)&npp->startinst += diff; - return npp; + return regcomp1(str, 1, 0); } -#ifdef DEBUG -static void -dumpstack(void){ - Node *stk; - int *ip; - - print("operators\n"); - for(ip=atorstack; ip<atorp; ip++) - print("0%o\n", *ip); - print("operands\n"); - for(stk=andstack; stk<andp; stk++) - print("0%o\t0%o\n", stk->first->type, stk->last->type); +static Reinst* +compile1(Renode *renode, Reinst *reinst, int *sub, int nl) +{ + Reinst *i; + int s; + +Tailcall: + if(renode == nil) + return reinst; + switch(renode->op) { + case TCLASS: + reinst->op = OCLASS; + reinst->r = renode->r; + reinst->r1 = renode->r1; + reinst->a = reinst + 1 + renode->nclass; + renode = renode->left; + reinst++; + goto Tailcall; + case TCAT: + reinst = compile1(renode->left, reinst, sub, nl); + renode = renode->right; + goto Tailcall; + case TOR: + reinst->op = OSPLIT; + reinst->a = reinst + 1; + i = compile1(renode->left, reinst->a, sub, nl); + reinst->b = i + 1; + i->op = OJMP; + i->a = compile1(renode->right, reinst->b, sub, nl); + return i->a; + case TSTAR: + reinst->op = OSPLIT; + reinst->a = reinst + 1; + i = compile1(renode->left, reinst->a, sub, nl); + reinst->b = i + 1; + i->op = OJMP; + i->a = reinst; + return reinst->b; + case TPLUS: + i = reinst; + reinst = compile1(renode->left, reinst, sub, nl); + reinst->op = OSPLIT; + reinst->a = i; + reinst->b = reinst + 1; + return reinst->b; + case TQUES: + reinst->op = OSPLIT; + reinst->a = reinst + 1; + reinst->b = compile1(renode->left, reinst->a, sub, nl); + return reinst->b; + case TSUB: + reinst->op = OSAVE; + reinst->sub = s = (*sub)++; + reinst = compile1(renode->left, reinst+1, sub, nl); + reinst->op = OUNSAVE; + reinst->sub = s; + return reinst + 1; + case TANY: + if(nl == 0) + reinst++->op = ONOTNL; + reinst->op = OANY; + return reinst + 1; + case TRUNE: + reinst->op = ORUNE; + reinst->r = renode->r; + return reinst + 1; + case TNOTNL: + reinst->op = ONOTNL; + return reinst + 1; + case TEOL: + reinst->op = OEOL; + return reinst + 1; + case TBOL: + reinst->op = OBOL; + return reinst + 1; + } + return nil; } -static void -dump(Reprog *pp) +static Reinst* +compile(Renode *parsetr, Reprog *reprog, int nl) { - Reinst *l; - Rune *p; + Reinst *reinst; + int sub; - l = pp->firstinst; - do{ - print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type, - l->left-pp->firstinst, l->right-pp->firstinst); - if(l->type == RUNE) - print("\t%C\n", l->r); - else if(l->type == CCLASS || l->type == NCCLASS){ - print("\t["); - if(l->type == NCCLASS) - print("^"); - for(p = l->cp->spans; p < l->cp->end; p += 2) - if(p[0] == p[1]) - print("%C", p[0]); - else - print("%C-%C", p[0], p[1]); - print("]\n"); - } else - print("\n"); - }while(l++->type); + sub = 0; + reinst = (Reinst*)(reprog+1); + compile1(parsetr, reinst, &sub, nl); + return reinst; } -#endif -static Reclass* -newclass(void) +static void +getnextr(Parselex *l) { - if(nclass >= nelem(reprog->class)) - rcerror("too many character classes; increase Reprog.class size"); - return &(classp[nclass++]); + l->literal = 0; + if(l->done) { + l->rune = 0; + return; + } + l->rawexp += chartorune(&l->rune, l->rawexp); + if(l->rune == L'\\') { + l->rawexp += chartorune(&l->rune, l->rawexp); + l->literal = 1; + } + if(*l->rawexp == 0) + l->done = 1; + return; } -static int -nextc(Rune *rp) +static void +getnextrlit(Parselex *l) { - if(lexdone){ - *rp = 0; - return 1; - } - exprp += chartorune(rp, exprp); - if(*rp == L'\\'){ - exprp += chartorune(rp, exprp); - return 1; + l->literal = 1; + if(l->done) { + l->literal = 0; + l->rune = 0; + return; } - if(*rp == 0) - lexdone = 1; - return 0; + l->rawexp += chartorune(&l->rune, l->rawexp); + if(*l->rawexp == 0) + l->done = 1; + return; } -static int -lex(int literal, int dot_type) +static int +lex(Parselex *l) { - int quoted; - - quoted = nextc(&yyrune); - if(literal || quoted){ - if(yyrune == 0) - return END; - return RUNE; + if(l->peek) { + l->peek = 0; + return l->peeklex; } - - switch(yyrune){ + l->getnextr(l); + if(l->literal) + return l->peeklex = LRUNE; + switch(l->rune){ case 0: - return END; + return l->peeklex = LEND; case L'*': - return STAR; case L'?': - return QUEST; case L'+': - return PLUS; + return l->peeklex = LREP; case L'|': - return OR; + return l->peeklex = LOR; case L'.': - return dot_type; + return l->peeklex = LANY; case L'(': - return LBRA; + return l->peeklex = LLPAR; case L')': - return RBRA; + return l->peeklex = LRPAR; case L'^': - return BOL; + return l->peeklex = LBOL; case L'$': - return EOL; + return l->peeklex = LEOL; case L'[': - return bldcclass(); + getclass(l); + return l->peeklex = LCLASS; } - return RUNE; + return l->peeklex = LRUNE; } static int -bldcclass(void) +pcmp(void *va, void *vb) { - int type; - Rune r[NCCRUNE]; - Rune *p, *ep, *np; - Rune rune; - int quoted; - - /* we have already seen the '[' */ - type = CCLASS; - yyclassp = newclass(); - - /* look ahead for negation */ - /* SPECIAL CASE!!! negated classes don't match \n */ - ep = r; - quoted = nextc(&rune); - if(!quoted && rune == L'^'){ - type = NCCLASS; - quoted = nextc(&rune); - *ep++ = L'\n'; - *ep++ = L'\n'; - } - - /* parse class into a set of spans */ - while(ep < &r[NCCRUNE-1]){ - if(rune == 0){ - rcerror("malformed '[]'"); - return 0; - } - if(!quoted && rune == L']') - break; - if(!quoted && rune == L'-'){ - if(ep == r){ - rcerror("malformed '[]'"); - return 0; - } - quoted = nextc(&rune); - if((!quoted && rune == L']') || rune == 0){ - rcerror("malformed '[]'"); - return 0; - } - *(ep-1) = rune; - } else { - *ep++ = rune; - *ep++ = rune; - } - quoted = nextc(&rune); - } - if(ep >= &r[NCCRUNE-1]) { - rcerror("char class too large; increase Reclass.spans size"); - return 0; - } - - /* sort on span start */ - for(p = r; p < ep; p += 2){ - for(np = p; np < ep; np += 2) - if(*np < *p){ - rune = np[0]; - np[0] = p[0]; - p[0] = rune; - rune = np[1]; - np[1] = p[1]; - p[1] = rune; - } - } + vlong n; + Rune *a, *b; - /* merge spans */ - np = yyclassp->spans; - p = r; - if(r == ep) - yyclassp->end = np; - else { - np[0] = *p++; - np[1] = *p++; - for(; p < ep; p += 2) - /* overlapping or adjacent ranges? */ - if(p[0] <= np[1] + 1){ - if(p[1] >= np[1]) - np[1] = p[1]; /* coalesce */ - } else { - np += 2; - np[0] = p[0]; - np[1] = p[1]; - } - yyclassp->end = np+2; - } + a = va; + b = vb; - return type; + n = (vlong)b[0] - (vlong)a[0]; + if(n) + return n; + return (vlong)b[1] - (vlong)a[1]; } -static Reprog* -regcomp1(char *s, int literal, int dot_type) +static void +getclass(Parselex *l) { - int token; - Reprog *pp; - - /* get memory for the program */ - pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s)); - if(pp == 0){ - regerror("out of memory"); - return 0; + Rune *p, *q, t; + + l->nc = 0; + getnextrlit(l); + if(l->rune == L'^') { + l->nc = 1; + getnextrlit(l); } - freep = pp->firstinst; - classp = pp->class; - errors = 0; - - if(setjmp(regkaboom)) - goto out; - - /* go compile the sucker */ - lexdone = 0; - exprp = s; - nclass = 0; - nbra = 0; - atorp = atorstack; - andp = andstack; - subidp = subidstack; - lastwasand = FALSE; - cursubid = 0; - - /* Start with a low priority operator to prime parser */ - pushator(START-1); - while((token = lex(literal, dot_type)) != END){ - if((token&0300) == OPERATOR) - operator(token); - else - operand(token); + p = l->cpairs; + for(;;) { + *p = l->rune; + if(l->rune == L']') + break; + if(l->rune == L'-') { + regerror("Malformed class"); + longjmp(l->exitenv, 1); + } + if(l->rune == '\\') { + getnextrlit(l); + *p = l->rune; + } + if(l->rune == 0) { + regerror("No closing ] for class"); + longjmp(l->exitenv, 1); + } + getnextrlit(l); + if(l->rune == L'-') { + getnextrlit(l); + if(l->rune == L']') { + regerror("Malformed class"); + longjmp(l->exitenv, 1); + } + if(l->rune == L'-') { + regerror("Malformed class"); + longjmp(l->exitenv, 1); + } + if(l->rune == L'\\') + getnextrlit(l); + p[1] = l->rune; + if(p[0] > p[1]) { + t = p[0]; + p[0] = p[1]; + p[1] = t; + } + getnextrlit(l); + } else + p[1] = p[0]; + if(p >= l->cpairs + nelem(l->cpairs) - 2) { + regerror("Class too big\n"); + longjmp(l->exitenv, 1); + } + p += 2; } - - /* Close with a low priority operator */ - evaluntil(START); - - /* Force END */ - operand(END); - evaluntil(START); -#ifdef DEBUG - dumpstack(); -#endif - if(nbra) - rcerror("unmatched left paren"); - --andp; /* points to first and only operand */ - pp->startinst = andp->first; -#ifdef DEBUG - dump(pp); -#endif - pp = optimize(pp); -#ifdef DEBUG - print("start: %d\n", andp->first-pp->firstinst); - dump(pp); -#endif -out: - if(errors){ - free(pp); - pp = 0; + *p = L'\0'; + qsort(l->cpairs, (p - l->cpairs)/2, 2*sizeof(*l->cpairs), pcmp); + q = l->cpairs; + for(p = l->cpairs+2; *p != 0; p += 2) { + if(p[1] < q[0] - 1) { + q[2] = p[0]; + q[3] = p[1]; + q += 2; + continue; + } + q[0] = p[0]; + if(p[1] > q[1]) + q[1] = p[1]; } - return pp; + q[2] = 0; } -extern Reprog* -regcomp(char *s) +/* classes are in descending order */ +static Renode* +buildclassn(Parselex *l) { - return regcomp1(s, 0, ANY); + Renode *n; + Rune *p; + int i; + + i = 0; + p = l->cpairs; + n = node(l, TCLASS, nil, nil); + n->r = p[1] + 1; + n->r1 = Runemax; + n->nclass = i++; + + for(; *p != 0; p += 2) { + n = node(l, TCLASS, n, nil); + n->r = p[3] + 1; + n->r1 = p[0] - 1; + n->nclass = i++; + } + n->r = 0; + return node(l, TCAT, node(l, TNOTNL, nil, nil), n); } -extern Reprog* -regcomplit(char *s) +static Renode* +buildclass(Parselex *l) { - return regcomp1(s, 1, ANY); + Renode *n; + Rune *p; + int i; + + i = 0; + n = node(l, TCLASS, nil, nil); + n->r = Runemax + 1; + n->nclass = i++; + + for(p = l->cpairs; *p != 0; p += 2) { + n = node(l, TCLASS, n, nil); + n->r = p[0]; + n->r1 = p[1]; + n->nclass = i++; + } + return n; } -extern Reprog* -regcompnl(char *s) +static void +prtree(Renode *tree, int d, int f) { - return regcomp1(s, 0, ANYNL); + int i; + + if(tree == nil) + return; + if(f) + for(i = 0; i < d; i++) + print("\t"); + switch(tree->op) { + case TCAT: + prtree(tree->left, d, 0); + prtree(tree->right, d, 1); + break; + case TOR: + print("TOR\n"); + prtree(tree->left, d+1, 1); + for(i = 0; i < d; i++) + print("\t"); + print("|\n"); + prtree(tree->right, d+1, 1); + break; + case TSTAR: + print("*\n"); + prtree(tree->left, d+1, 1); + break; + case TPLUS: + print("+\n"); + prtree(tree->left, d+1, 1); + break; + case TQUES: + print("?\n"); + prtree(tree->left, d+1, 1); + break; + case TANY: + print(".\n"); + prtree(tree->left, d+1, 1); + break; + case TBOL: + print("^\n"); + break; + case TEOL: + print("$\n"); + break; + case TSUB: + print("TSUB\n"); + prtree(tree->left, d+1, 1); + break; + case TRUNE: + print("TRUNE: %C\n", tree->r); + break; + case TNOTNL: + print("TNOTNL: !\\n\n"); + break; + case TCLASS: + print("CLASS: %C-%C\n", tree->r, tree->r1); + prtree(tree->left, d, 1); + break; + } } diff --git a/sys/src/libregexp/regerror.c b/sys/src/libregexp/regerror.c index 92b73a0d9..1e0eef175 100644 --- a/sys/src/libregexp/regerror.c +++ b/sys/src/libregexp/regerror.c @@ -1,6 +1,6 @@ #include <u.h> #include <libc.h> -#include "regexp.h" +#include <regexp.h> void regerror(char *s) diff --git a/sys/src/libregexp/regexec.c b/sys/src/libregexp/regexec.c index b806ba60a..a153d77e9 100644 --- a/sys/src/libregexp/regexec.c +++ b/sys/src/libregexp/regexec.c @@ -1,232 +1,190 @@ #include <u.h> #include <libc.h> -#include "regexp.h" -#include "regcomp.h" +#include <regexp.h> +#include "regimpl.h" +typedef struct RethreadQ RethreadQ; +struct RethreadQ +{ + Rethread *head; + Rethread **tail; +}; -/* - * return 0 if no match - * >0 if a match - * <0 if we ran out of _relist space - */ -static int -regexec1(Reprog *progp, /* program to run */ - char *bol, /* string to run machine on */ - Resub *mp, /* subexpression elements */ - int ms, /* number of elements at mp */ - Reljunk *j -) +int +regexec(Reprog *prog, char *str, Resub *sem, int msize) { - int flag=0; - Reinst *inst; - Relist *tlp; - char *s; - int i, checkstart; - Rune r, *rp, *ep; - int n; - Relist* tl; /* This list, next list */ - Relist* nl; - Relist* tle; /* ends of this and next list */ - Relist* nle; - int match; - char *p; + RethreadQ lists[2], *clist, *nlist, *tmp; + Rethread *t, *nextthr, **availthr; + Reinst *curinst; + Rune r; + char *sp, *ep, endc; + int i, match, first, gen, matchpri, pri; - match = 0; - checkstart = j->starttype; - if(mp) - for(i=0; i<ms; i++) { - mp[i].sp = 0; - mp[i].ep = 0; - } - j->relist[0][0].inst = 0; - j->relist[1][0].inst = 0; + if(msize > NSUBEXPM) + msize = NSUBEXPM; - /* Execute machine once for each character, including terminal NUL */ - s = j->starts; - do{ - /* fast check for first char */ - if(checkstart) { - switch(j->starttype) { - case RUNE: - p = utfrune(s, j->startchar); - if(p == 0 || s == j->eol) - return match; - s = p; - break; - case BOL: - if(s == bol) - break; - p = utfrune(s, '\n'); - if(p == 0 || s == j->eol) - return match; - s = p+1; - break; - } - } - r = *(uchar*)s; - if(r < Runeself) - n = 1; - else - n = chartorune(&r, s); + if(prog->startinst->gen != 0) { + for(curinst = prog->startinst; curinst < prog->startinst + prog->len; curinst++) + curinst->gen = 0; + } - /* switch run lists */ - tl = j->relist[flag]; - tle = j->reliste[flag]; - nl = j->relist[flag^=1]; - nle = j->reliste[flag]; - nl->inst = 0; + clist = lists; + clist->head = nil; + clist->tail = &clist->head; + nlist = lists + 1; + nlist->head = nil; + nlist->tail = &nlist->head; - /* Add first instruction to current list */ - if(match == 0) - _renewemptythread(tl, progp->startinst, ms, s); + for(i = 0; i < prog->nthr; i++) + prog->thrpool[i] = prog->threads + i; + availthr = prog->thrpool + prog->nthr; - /* Execute machine until current list is empty */ - for(tlp=tl; tlp->inst; tlp++){ /* assignment = */ - for(inst = tlp->inst; ; inst = inst->next){ - switch(inst->type){ - case RUNE: /* regular character */ - if(inst->r == r){ - if(_renewthread(nl, inst->next, ms, &tlp->se)==nle) - return -1; - } - break; - case LBRA: - tlp->se.m[inst->subid].sp = s; - continue; - case RBRA: - tlp->se.m[inst->subid].ep = s; - continue; - case ANY: - if(r != '\n') - if(_renewthread(nl, inst->next, ms, &tlp->se)==nle) - return -1; - break; - case ANYNL: - if(_renewthread(nl, inst->next, ms, &tlp->se)==nle) - return -1; - break; - case BOL: - if(s == bol || *(s-1) == '\n') - continue; - break; - case EOL: - if(s == j->eol || r == 0 || r == '\n') - continue; - break; - case CCLASS: - ep = inst->cp->end; - for(rp = inst->cp->spans; rp < ep; rp += 2) - if(r >= rp[0] && r <= rp[1]){ - if(_renewthread(nl, inst->next, ms, &tlp->se)==nle) - return -1; - break; - } - break; - case NCCLASS: - ep = inst->cp->end; - for(rp = inst->cp->spans; rp < ep; rp += 2) - if(r >= rp[0] && r <= rp[1]) - break; - if(rp == ep) - if(_renewthread(nl, inst->next, ms, &tlp->se)==nle) - return -1; - break; - case OR: - /* evaluate right choice later */ - if(_renewthread(tlp, inst->right, ms, &tlp->se) == tle) - return -1; - /* efficiency: advance and re-evaluate */ - continue; - case END: /* Match! */ - match = 1; - tlp->se.m[0].ep = s; - if(mp != 0) - _renewmatch(mp, ms, &tlp->se); - break; - } + pri = matchpri = gen = match = 0; + sp = str; + ep = nil; + endc = '\0'; + if(sem != nil && msize > 0) { + if(sem->sp != nil) + sp = sem->sp; + if(sem->ep != nil && *sem->ep != '\0') { + ep = sem->ep; + endc = *sem->ep; + *sem->ep = '\0'; + } + } + r = Runemax + 1; + for(; r != L'\0'; sp += i) { + gen++; + i = chartorune(&r, sp); + first = 1; + t = clist->head; + if(t == nil) + goto Start; + curinst = t->pc; +Again: + if(curinst->gen == gen) + goto Done; + curinst->gen = gen; + switch(curinst->op) { + case ORUNE: + if(r != curinst->r) + goto Done; + case OANY: /* fallthrough */ + Any: + nextthr = t->next; + t->pc = curinst + 1; + t->next = nil; + *nlist->tail = t; + nlist->tail = &t->next; + if(nextthr == nil) + break; + t = nextthr; + curinst = t->pc; + goto Again; + case OCLASS: + Class: + if(r < curinst->r) + goto Done; + if(r > curinst->r1) { + curinst++; + goto Class; + } + nextthr = t->next; + t->pc = curinst->a; + t->next = nil; + *nlist->tail = t; + nlist->tail = &t->next; + if(nextthr == nil) break; + t = nextthr; + curinst = t->pc; + goto Again; + case ONOTNL: + if(r != L'\n') { + curinst++; + goto Again; + } + goto Done; + case OBOL: + if(sp == str || sp[-1] == '\n') { + curinst++; + goto Again; } + goto Done; + case OEOL: + if(r == L'\0' && ep == nil) { + curinst++; + goto Again; + } + if(r == L'\n') + goto Any; + goto Done; + case OJMP: + curinst = curinst->a; + goto Again; + case OSPLIT: + nextthr = *--availthr; + nextthr->pc = curinst->b; + if(msize > 0) + memcpy(nextthr->sem, t->sem, sizeof(Resub)*msize); + nextthr->pri = t->pri; + nextthr->next = t->next; + t->next = nextthr; + curinst = curinst->a; + goto Again; + case OSAVE: + if(curinst->sub < msize) + t->sem[curinst->sub].sp = sp; + curinst++; + goto Again; + case OUNSAVE: + if(curinst->sub == 0) { + /* "Highest" priority is the left-most longest. */ + if (t->pri > matchpri) + goto Done; + match = 1; + matchpri = t->pri; + if(sem != nil && msize > 0) { + memcpy(sem, t->sem, sizeof(Resub)*msize); + sem->ep = sp; + } + goto Done; + } + if(curinst->sub < msize) + t->sem[curinst->sub].ep = sp; + curinst++; + goto Again; + Done: + *availthr++ = t; + t = t->next; + if(t == nil) + break; + curinst = t->pc; + goto Again; } - if(s == j->eol) +Start: + /* Start again once if we haven't found anything. */ + if(first == 1 && match == 0) { + first = 0; + t = *--availthr; + if(msize > 0) + memset(t->sem, 0, sizeof(Resub)*msize); + /* "Lower" priority thread */ + t->pri = matchpri = pri++; + t->next = nil; + curinst = prog->startinst; + goto Again; + } + /* If we have a match and no extant threads, we are done. */ + if(match == 1 && nlist->head == nil) break; - checkstart = j->starttype && nl->inst==0; - s += n; - }while(r); - return match; -} - -static int -regexec2(Reprog *progp, /* program to run */ - char *bol, /* string to run machine on */ - Resub *mp, /* subexpression elements */ - int ms, /* number of elements at mp */ - Reljunk *j -) -{ - int rv; - Relist *relist0, *relist1; - - /* mark space */ - relist0 = malloc(BIGLISTSIZE*sizeof(Relist)); - if(relist0 == nil) - return -1; - relist1 = malloc(BIGLISTSIZE*sizeof(Relist)); - if(relist1 == nil){ - free(relist1); - return -1; - } - j->relist[0] = relist0; - j->relist[1] = relist1; - j->reliste[0] = relist0 + BIGLISTSIZE - 2; - j->reliste[1] = relist1 + BIGLISTSIZE - 2; - - rv = regexec1(progp, bol, mp, ms, j); - free(relist0); - free(relist1); - return rv; -} - -extern int -regexec(Reprog *progp, /* program to run */ - char *bol, /* string to run machine on */ - Resub *mp, /* subexpression elements */ - int ms) /* number of elements at mp */ -{ - Reljunk j; - Relist relist0[LISTSIZE], relist1[LISTSIZE]; - int rv; - - /* - * use user-specified starting/ending location if specified - */ - j.starts = bol; - j.eol = 0; - if(mp && ms>0){ - if(mp->sp) - j.starts = mp->sp; - if(mp->ep) - j.eol = mp->ep; + tmp = clist; + clist = nlist; + nlist = tmp; + nlist->head = nil; + nlist->tail = &nlist->head; } - j.starttype = 0; - j.startchar = 0; - if(progp->startinst->type == RUNE && progp->startinst->r < Runeself) { - j.starttype = RUNE; - j.startchar = progp->startinst->r; - } - if(progp->startinst->type == BOL) - j.starttype = BOL; - - /* mark space */ - j.relist[0] = relist0; - j.relist[1] = relist1; - j.reliste[0] = relist0 + nelem(relist0) - 2; - j.reliste[1] = relist1 + nelem(relist1) - 2; - - rv = regexec1(progp, bol, mp, ms, &j); - if(rv >= 0) - return rv; - rv = regexec2(progp, bol, mp, ms, &j); - if(rv >= 0) - return rv; - return -1; + if(ep != nil) + *ep = endc; + return match; } diff --git a/sys/src/libregexp/regimpl.h b/sys/src/libregexp/regimpl.h new file mode 100644 index 000000000..f173a970c --- /dev/null +++ b/sys/src/libregexp/regimpl.h @@ -0,0 +1,104 @@ +enum +{ + LANY = 0, + LBOL, + LCLASS, + LEND, + LEOL, + LLPAR, + LOR, + LREP, + LRPAR, + LRUNE, + + TANY = 0, + TBOL, + TCAT, + TCLASS, + TEOL, + TNOTNL, + TOR, + TPLUS, + TQUES, + TRUNE, + TSTAR, + TSUB, + + NSUBEXPM = 32 +}; + +typedef struct Parselex Parselex; +typedef struct Renode Renode; + +struct Parselex +{ + /* Parse */ + Renode *next; + Renode *nodes; + int sub; + int instrs; + jmp_buf exitenv; + /* Lex */ + void (*getnextr)(Parselex*); + char *rawexp; + char *orig; + Rune rune; + Rune peek; + int peeklex; + int done; + int literal; + Rune cpairs[400+2]; + int nc; +}; +struct Renode +{ + int op; + Renode *left; + Rune r; + union + { + Rune r1; + int sub; + Renode *right; + }; + int nclass; +}; +struct Rethread +{ + Reinst *pc; + Resub sem[NSUBEXPM]; + int pri; + Rethread *next; +}; +struct Reinst +{ + char op; + int gen; + Reinst *a; + union + { + Rune r; + int sub; + }; + union + { + Rune r1; + Reinst *b; + }; +}; + +static int lex(Parselex*); +static void getnextr(Parselex*); +static void getnextrlit(Parselex*); +static void getclass(Parselex*); +static Renode *e0(Parselex*); +static Renode *e1(Parselex*); +static Renode *e2(Parselex*); +static Renode *e3(Parselex*); +static Renode *buildclass(Parselex*); +static Renode *buildclassn(Parselex*); +static int pcmp(void*, void*); +static Reprog *regcomp1(char*, int, int); +static Reinst *compile(Renode*, Reprog*, int); +static Reinst *compile1(Renode*, Reinst*, int*, int); +static void prtree(Renode*, int, int); diff --git a/sys/src/libregexp/regprint.c b/sys/src/libregexp/regprint.c new file mode 100644 index 000000000..673ca1f7b --- /dev/null +++ b/sys/src/libregexp/regprint.c @@ -0,0 +1,66 @@ +#include <u.h> +#include <libc.h> +#include <regexp.h> +#include <regimpl.h> + +static int +fmtprinst(Fmt *f, Reinst *inst) +{ + int r; + + r = fmtprint(f, "%p ", inst); + switch(inst->op) { + case ORUNE: + r += fmtprint(f, "ORUNE\t%C\n", inst->r); + break; + case ONOTNL: + r += fmtprint(f, "ONOTNL\n"); + break; + case OCLASS: + r += fmtprint(f, "OCLASS\t%C-%C %p\n", inst->r, inst->r1, inst->a); + break; + case OSPLIT: + r += fmtprint(f, "OSPLIT\t%p %p\n", inst->a, inst->b); + break; + case OJMP: + r += fmtprint(f, "OJMP \t%p\n", inst->a); + break; + case OSAVE: + r += fmtprint(f, "OSAVE\t%d\n", inst->sub); + break; + case OUNSAVE: + r += fmtprint(f, "OUNSAVE\t%d\n", inst->sub); + break; + case OANY: + r += fmtprint(f, "OANY \t.\n"); + break; + case OEOL: + r += fmtprint(f, "OEOL \t$\n"); + break; + case OBOL: + r += fmtprint(f, "OBOL \t^\n"); + break; + } + return r; +} + +static int +fmtprprog(Fmt *f, Reprog *reprog) +{ + Reinst *inst; + int r; + + r = 0; + for(inst = reprog->startinst; inst < reprog->startinst + reprog->len; inst++) + r += fmtprinst(f, inst); + return r; +} + +int +reprogfmt(Fmt *f) +{ + Reprog *r; + + r = va_arg(f->args, Reprog*); + return fmtprprog(f, r); +} diff --git a/sys/src/libregexp/regsub.c b/sys/src/libregexp/regsub.c index 2474d2989..d11d86f25 100644 --- a/sys/src/libregexp/regsub.c +++ b/sys/src/libregexp/regsub.c @@ -1,63 +1,66 @@ #include <u.h> #include <libc.h> -#include "regexp.h" +#include <regexp.h> -/* substitute into one string using the matches from the last regexec() */ -extern void -regsub(char *sp, /* source string */ - char *dp, /* destination string */ - int dlen, - Resub *mp, /* subexpression elements */ - int ms) /* number of elements pointed to by mp */ +void +regsub(char *src, char *dst, int dlen, Resub *match, int msize) { - char *ssp, *ep; int i; + char *ep, c; - ep = dp+dlen-1; - while(*sp != '\0'){ - if(*sp == '\\'){ - switch(*++sp){ - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - i = *sp-'0'; - if(mp!=0 && mp[i].sp != 0 && ms>i) - for(ssp = mp[i].sp; - ssp < mp[i].ep; - ssp++) - if(dp < ep) - *dp++ = *ssp; - break; - case '\\': - if(dp < ep) - *dp++ = '\\'; - break; - case '\0': - sp--; - break; - default: - if(dp < ep) - *dp++ = *sp; - break; + ep = dst + dlen-1; + for(;*src != '\0'; src++) switch(*src) { + case '\\': + switch(*++src) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + i = *src - '0'; + if(match != nil && i < msize && match[i].ep != nil) { + c = *match[i].ep; + *match[i].ep = '\0'; + dst = strecpy(dst, ep+1, match[i].sp); + *match[i].ep = c; } - }else if(*sp == '&'){ - if(mp!=0 && mp[0].sp != 0 && ms>0) - for(ssp = mp[0].sp; - ssp < mp[0].ep; ssp++) - if(dp < ep) - *dp++ = *ssp; - }else{ - if(dp < ep) - *dp++ = *sp; + break; + case '\\': + if(dst < ep) + *dst++ = '\\'; + else + goto End; + break; + case '\0': + goto End; + default: + if(dst < ep) + *dst++ = *src; + else + goto End; + break; } - sp++; + break; + case '&': + if(match != nil && msize > 0 && match[0].sp != nil) { + c = *match[0].ep; + *match[0].ep = '\0'; + dst = strecpy(dst, ep+1, match[0].sp); + *match[0].ep = c; + } + break; + default: + if(dst < ep) + *dst++ = *src; + else + goto End; + break; } - *dp = '\0'; +End: + *dst = '\0'; } diff --git a/sys/src/libregexp/rregexec.c b/sys/src/libregexp/rregexec.c index 301a3589b..572718733 100644 --- a/sys/src/libregexp/rregexec.c +++ b/sys/src/libregexp/rregexec.c @@ -1,212 +1,189 @@ #include <u.h> #include <libc.h> -#include "regexp.h" -#include "regcomp.h" +#include <regexp.h> +#include "regimpl.h" -/* - * return 0 if no match - * >0 if a match - * <0 if we ran out of _relist space - */ -static int -rregexec1(Reprog *progp, /* program to run */ - Rune *bol, /* string to run machine on */ - Resub *mp, /* subexpression elements */ - int ms, /* number of elements at mp */ - Reljunk *j) +typedef struct RethreadQ RethreadQ; +struct RethreadQ { - int flag=0; - Reinst *inst; - Relist *tlp; - Rune *s; - int i, checkstart; - Rune r, *rp, *ep; - Relist* tl; /* This list, next list */ - Relist* nl; - Relist* tle; /* ends of this and next list */ - Relist* nle; - int match; - Rune *p; + Rethread *head; + Rethread **tail; +}; - match = 0; - checkstart = j->startchar; - if(mp) - for(i=0; i<ms; i++) { - mp[i].rsp = 0; - mp[i].rep = 0; - } - j->relist[0][0].inst = 0; - j->relist[1][0].inst = 0; +int +rregexec(Reprog *prog, Rune *str, Resub *sem, int msize) +{ + RethreadQ lists[2], *clist, *nlist, *tmp; + Rethread *t, *nextthr, **availthr; + Reinst *curinst; + Rune *rsp, *rep, endr, last; + int i, match, first, gen, pri, matchpri; - /* Execute machine once for each character, including terminal NUL */ - s = j->rstarts; - do{ - /* fast check for first char */ - if(checkstart) { - switch(j->starttype) { - case RUNE: - p = runestrchr(s, j->startchar); - if(p == 0 || s == j->reol) - return match; - s = p; - break; - case BOL: - if(s == bol) - break; - p = runestrchr(s, '\n'); - if(p == 0 || s == j->reol) - return match; - s = p+1; - break; - } - } + if(msize > NSUBEXPM) + msize = NSUBEXPM; - r = *s; + if(prog->startinst->gen != 0) { + for(curinst = prog->startinst; curinst < prog->startinst + prog->len; curinst++) + curinst->gen = 0; + } - /* switch run lists */ - tl = j->relist[flag]; - tle = j->reliste[flag]; - nl = j->relist[flag^=1]; - nle = j->reliste[flag]; - nl->inst = 0; + clist = lists; + clist->head = nil; + clist->tail = &clist->head; + nlist = lists + 1; + nlist->head = nil; + nlist->tail = &nlist->head; - /* Add first instruction to current list */ - _rrenewemptythread(tl, progp->startinst, ms, s); + for(i = 0; i < prog->nthr; i++) + prog->thrpool[i] = prog->threads + i; + availthr = prog->thrpool + prog->nthr; - /* Execute machine until current list is empty */ - for(tlp=tl; tlp->inst; tlp++){ - for(inst=tlp->inst; ; inst = inst->next){ - switch(inst->type){ - case RUNE: /* regular character */ - if(inst->r == r) - if(_renewthread(nl, inst->next, ms, &tlp->se)==nle) - return -1; - break; - case LBRA: - tlp->se.m[inst->subid].rsp = s; - continue; - case RBRA: - tlp->se.m[inst->subid].rep = s; - continue; - case ANY: - if(r != '\n') - if(_renewthread(nl, inst->next, ms, &tlp->se)==nle) - return -1; - break; - case ANYNL: - if(_renewthread(nl, inst->next, ms, &tlp->se)==nle) - return -1; - break; - case BOL: - if(s == bol || *(s-1) == '\n') - continue; - break; - case EOL: - if(s == j->reol || r == 0 || r == '\n') - continue; - break; - case CCLASS: - ep = inst->cp->end; - for(rp = inst->cp->spans; rp < ep; rp += 2) - if(r >= rp[0] && r <= rp[1]){ - if(_renewthread(nl, inst->next, ms, &tlp->se)==nle) - return -1; - break; - } - break; - case NCCLASS: - ep = inst->cp->end; - for(rp = inst->cp->spans; rp < ep; rp += 2) - if(r >= rp[0] && r <= rp[1]) - break; - if(rp == ep) - if(_renewthread(nl, inst->next, ms, &tlp->se)==nle) - return -1; - break; - case OR: - /* evaluate right choice later */ - if(_renewthread(tlp, inst->right, ms, &tlp->se) == tle) - return -1; - /* efficiency: advance and re-evaluate */ - continue; - case END: /* Match! */ - match = 1; - tlp->se.m[0].rep = s; - if(mp != 0) - _renewmatch(mp, ms, &tlp->se); - break; - } + pri = matchpri = gen = match = 0; + rsp = str; + rep = nil; + endr = L'\0'; + if(sem != nil && msize > 0) { + if(sem->rsp != nil) + rsp = sem->rsp; + if(sem->rep != nil && *sem->rep != L'\0') { + rep = sem->rep; + endr = *sem->rep; + *sem->rep = '\0'; + } + } + last = 1; + for(; last != L'\0'; rsp++) { + gen++; + last = *rsp; + first = 1; + t = clist->head; + if(t == nil) + goto Start; + curinst = t->pc; +Again: + if(curinst->gen == gen) + goto Done; + curinst->gen = gen; + switch(curinst->op) { + case ORUNE: + if(*rsp != curinst->r) + goto Done; + case OANY: /* fallthrough */ + Any: + nextthr = t->next; + t->pc = curinst + 1; + t->next = nil; + *nlist->tail = t; + nlist->tail = &t->next; + if(nextthr == nil) + break; + t = nextthr; + curinst = t->pc; + goto Again; + case OCLASS: + Class: + if(*rsp < curinst->r) + goto Done; + if(*rsp > curinst->r1) { + curinst++; + goto Class; + } + nextthr = t->next; + t->pc = curinst->a; + t->next = nil; + *nlist->tail = t; + nlist->tail = &t->next; + if(nextthr == nil) break; + t = nextthr; + curinst = t->pc; + goto Again; + case ONOTNL: + if(*rsp != L'\n') { + curinst++; + goto Again; + } + goto Done; + case OBOL: + if(rsp == str || rsp[-1] == L'\n') { + curinst++; + goto Again; + } + goto Done; + case OEOL: + if(*rsp == L'\0' && rep == nil) { + curinst++; + goto Again; + } + if(*rsp == '\n') + goto Any; + goto Done; + case OJMP: + curinst = curinst->a; + goto Again; + case OSPLIT: + nextthr = *--availthr; + nextthr->pc = curinst->b; + if(msize > 0) + memcpy(nextthr->sem, t->sem, sizeof(Resub)*msize); + nextthr->pri = t->pri; + nextthr->next = t->next; + t->next = nextthr; + curinst = curinst->a; + goto Again; + case OSAVE: + if(curinst->sub < msize) + t->sem[curinst->sub].rsp = rsp; + curinst++; + goto Again; + case OUNSAVE: + if(curinst->sub == 0) { + /* "Highest" priority is the left-most longest. */ + if (t->pri > matchpri) + goto Done; + match = 1; + matchpri = t->pri; + if(sem != nil && msize > 0) { + memcpy(sem, t->sem, sizeof(Resub)*msize); + sem->rep = rsp; + } + goto Done; } + if(curinst->sub < msize) + t->sem[curinst->sub].rep = rsp; + curinst++; + goto Again; + Done: + *availthr++ = t; + t = t->next; + if(t == nil) + break; + curinst = t->pc; + goto Again; + } +Start: + /* Start again once if we haven't found anything. */ + if(first == 1 && match == 0) { + first = 0; + t = *--availthr; + if(msize > 0) + memset(t->sem, 0, sizeof(Resub)*msize); + /* "Lower" priority thread */ + t->pri = matchpri = pri++; + t->next = nil; + curinst = prog->startinst; + goto Again; } - if(s == j->reol) + /* If we have a match and no extant threads, we are done. */ + if(match == 1 && nlist->head == nil) break; - checkstart = j->startchar && nl->inst==0; - s++; - }while(r); - return match; -} - -static int -rregexec2(Reprog *progp, /* program to run */ - Rune *bol, /* string to run machine on */ - Resub *mp, /* subexpression elements */ - int ms, /* number of elements at mp */ - Reljunk *j -) -{ - Relist relist0[5*LISTSIZE], relist1[5*LISTSIZE]; - - /* mark space */ - j->relist[0] = relist0; - j->relist[1] = relist1; - j->reliste[0] = relist0 + nelem(relist0) - 2; - j->reliste[1] = relist1 + nelem(relist1) - 2; - - return rregexec1(progp, bol, mp, ms, j); -} - -extern int -rregexec(Reprog *progp, /* program to run */ - Rune *bol, /* string to run machine on */ - Resub *mp, /* subexpression elements */ - int ms) /* number of elements at mp */ -{ - Reljunk j; - Relist relist0[LISTSIZE], relist1[LISTSIZE]; - int rv; - - /* - * use user-specified starting/ending location if specified - */ - j.rstarts = bol; - j.reol = 0; - if(mp && ms>0){ - if(mp->sp) - j.rstarts = mp->rsp; - if(mp->ep) - j.reol = mp->rep; + tmp = clist; + clist = nlist; + nlist = tmp; + nlist->head = nil; + nlist->tail = &nlist->head; } - j.starttype = 0; - j.startchar = 0; - if(progp->startinst->type == RUNE && progp->startinst->r < Runeself) { - j.starttype = RUNE; - j.startchar = progp->startinst->r; - } - if(progp->startinst->type == BOL) - j.starttype = BOL; - - /* mark space */ - j.relist[0] = relist0; - j.relist[1] = relist1; - j.reliste[0] = relist0 + nelem(relist0) - 2; - j.reliste[1] = relist1 + nelem(relist1) - 2; - - rv = rregexec1(progp, bol, mp, ms, &j); - if(rv >= 0) - return rv; - rv = rregexec2(progp, bol, mp, ms, &j); - if(rv >= 0) - return rv; - return -1; + if(rep != nil) + *rep = endr; + return match; } diff --git a/sys/src/libregexp/rregsub.c b/sys/src/libregexp/rregsub.c index c395583d0..88cbbfc89 100644 --- a/sys/src/libregexp/rregsub.c +++ b/sys/src/libregexp/rregsub.c @@ -1,64 +1,66 @@ #include <u.h> #include <libc.h> -#include "regexp.h" +#include <regexp.h> -/* substitute into one string using the matches from the last regexec() */ -extern void -rregsub(Rune *sp, /* source string */ - Rune *dp, /* destination string */ - int dlen, - Resub *mp, /* subexpression elements */ - int ms) /* number of elements pointed to by mp */ +void +rregsub(Rune *src, Rune *dst, int dlen, Resub *match, int msize) { - Rune *ssp, *ep; int i; + Rune *ep, r; - ep = dp+(dlen/sizeof(Rune))-1; - while(*sp != '\0'){ - if(*sp == '\\'){ - switch(*++sp){ - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - i = *sp-'0'; - if(mp[i].rsp != 0 && mp!=0 && ms>i) - for(ssp = mp[i].rsp; - ssp < mp[i].rep; - ssp++) - if(dp < ep) - *dp++ = *ssp; - break; - case '\\': - if(dp < ep) - *dp++ = '\\'; - break; - case '\0': - sp--; - break; - default: - if(dp < ep) - *dp++ = *sp; - break; + ep = dst + dlen-1; + for(;*src != L'\0'; src++) switch(*src) { + case L'\\': + switch(*++src) { + case L'0': + case L'1': + case L'2': + case L'3': + case L'4': + case L'5': + case L'6': + case L'7': + case L'8': + case L'9': + i = *src - L'0'; + if(match != nil && i < msize && match[i].rsp != nil) { + r = *match[i].rep; + *match[i].rep = L'\0'; + dst = runestrecpy(dst, ep+1, match[i].rsp); + *match[i].rep = r; } - }else if(*sp == '&'){ - if(mp[0].rsp != 0 && mp!=0 && ms>0) - if(mp[0].rsp != 0) - for(ssp = mp[0].rsp; - ssp < mp[0].rep; ssp++) - if(dp < ep) - *dp++ = *ssp; - }else{ - if(dp < ep) - *dp++ = *sp; + break; + case L'\\': + if(dst < ep) + *dst++ = L'\\'; + else + goto End; + break; + case L'\0': + goto End; + default: + if(dst < ep) + *dst++ = *src; + else + goto End; + break; } - sp++; + break; + case L'&': + if(match != nil && msize > 0 && match[0].rsp != nil) { + r = *match[0].rep; + *match[0].rep = L'\0'; + dst = runestrecpy(dst, ep+1, match[0].rsp); + *match[0].rep = r; + } + break; + default: + if(dst < ep) + *dst++ = *src; + else + goto End; + break; } - *dp = '\0'; +End: + *dst = L'\0'; } |