diff options
author | Taru Karttunen <taruti@taruti.net> | 2011-03-30 15:46:40 +0300 |
---|---|---|
committer | Taru Karttunen <taruti@taruti.net> | 2011-03-30 15:46:40 +0300 |
commit | e5888a1ffdae813d7575f5fb02275c6bb07e5199 (patch) | |
tree | d8d51eac403f07814b9e936eed0c9a79195e2450 /sys/src/ape/lib/regexp |
Import sources from 2011-03-30 iso image
Diffstat (limited to 'sys/src/ape/lib/regexp')
-rwxr-xr-x | sys/src/ape/lib/regexp/mkfile | 15 | ||||
-rwxr-xr-x | sys/src/ape/lib/regexp/regaux.c | 56 | ||||
-rwxr-xr-x | sys/src/ape/lib/regexp/regcomp.c | 560 | ||||
-rwxr-xr-x | sys/src/ape/lib/regexp/regcomp.h | 61 | ||||
-rwxr-xr-x | sys/src/ape/lib/regexp/regerror.c | 16 | ||||
-rwxr-xr-x | sys/src/ape/lib/regexp/regexec.c | 191 | ||||
-rwxr-xr-x | sys/src/ape/lib/regexp/regsub.c | 64 | ||||
-rwxr-xr-x | sys/src/ape/lib/regexp/rregexec.c | 181 | ||||
-rwxr-xr-x | sys/src/ape/lib/regexp/rregsub.c | 64 |
9 files changed, 1208 insertions, 0 deletions
diff --git a/sys/src/ape/lib/regexp/mkfile b/sys/src/ape/lib/regexp/mkfile new file mode 100755 index 000000000..e5d8bc209 --- /dev/null +++ b/sys/src/ape/lib/regexp/mkfile @@ -0,0 +1,15 @@ +APE=/sys/src/ape +<$APE/config + +LIB=/$objtype/lib/ape/libregexp.a +OFILES=regcomp.$O\ + regerror.$O\ + regexec.$O\ + regsub.$O\ + regaux.$O\ + rregexec.$O\ + rregsub.$O\ + +</sys/src/cmd/mksyslib + +CFLAGS=-c -DUTF -D_REGEXP_EXTENSION diff --git a/sys/src/ape/lib/regexp/regaux.c b/sys/src/ape/lib/regexp/regaux.c new file mode 100755 index 000000000..6643b8ff8 --- /dev/null +++ b/sys/src/ape/lib/regexp/regaux.c @@ -0,0 +1,56 @@ +#include <stdlib.h> +#include <stdio.h> +#include "regexp.h" +#include "regcomp.h" + +/* + * Machine state + */ +Relist* _relist[2]; +Relist* _reliste[2]; +int _relistsize = LISTINCREMENT; + +/* + * save a new match in mp + */ +extern void +_renewmatch(Resub *mp, int ms, Resublist *sp) +{ + int i; + + if(mp==0 || ms<=0) + return; + if(mp[0].s.sp==0 || sp->m[0].s.sp<mp[0].s.sp || + (sp->m[0].s.sp==mp[0].s.sp && sp->m[0].e.ep>mp[0].e.ep)){ + for(i=0; i<ms && i<NSUBEXP; i++) + mp[i] = sp->m[i]; + for(; i<ms; i++) + mp[i].s.sp = mp[i].e.ep = 0; + } +} + +/* + * Note optimization in _renewthread: + * *lp must be pending when _renewthread called; if *l has been looked + * at already, the optimization is a bug. + */ +extern Relist* +_renewthread(Relist *lp, /* _relist to add to */ + Reinst *ip, /* instruction to add */ + Resublist *sep) /* pointers to subexpressions */ +{ + Relist *p; + + for(p=lp; p->inst; p++){ + if(p->inst == ip){ + if((sep)->m[0].s.sp < p->se.m[0].s.sp) + p->se = *sep; + return 0; + } + } + p->inst = ip; + p->se = *sep; + (++p)->inst = 0; + return p; +} + diff --git a/sys/src/ape/lib/regexp/regcomp.c b/sys/src/ape/lib/regexp/regcomp.c new file mode 100755 index 000000000..5d31fc2f6 --- /dev/null +++ b/sys/src/ape/lib/regexp/regcomp.c @@ -0,0 +1,560 @@ +#include <stdlib.h> +#include <stdio.h> +#include <setjmp.h> +#include <string.h> +#include "regexp.h" +#include "regcomp.h" + +#define TRUE 1 +#define FALSE 0 + +/* + * Parser Information + */ +typedef +struct Node +{ + Reinst* first; + Reinst* last; +}Node; + +#define NSTACK 20 +static Node andstack[NSTACK]; +static Node *andp; +static int atorstack[NSTACK]; +static int* atorp; +static int cursubid; /* id of current subexpression */ +static int subidstack[NSTACK]; /* parallel to atorstack */ +static int* subidp; +static int lastwasand; /* Last token was operand */ +static int nbra; +static char* exprp; /* pointer to next character in source expression */ +static int lexdone; +static int nclass; +static Reclass*classp; +static Reinst* freep; +static int errors; +static wchar_t yyrune; /* last lex'd rune */ +static Reclass*yyclassp; /* last lex'd class */ + +/* predeclared crap */ +static void operator(int); +static void pushand(Reinst*, Reinst*); +static void pushator(int); +static void evaluntil(int); +static int bldcclass(void); + +static jmp_buf regkaboom; + +static void +rcerror(char *s) +{ + errors++; + regerror(s); + longjmp(regkaboom, 1); +} + +static Reinst* +newinst(int t) +{ + freep->type = t; + freep->l.left = 0; + freep->r.right = 0; + return freep++; +} + +static void +operand(int t) +{ + Reinst *i; + + if(lastwasand) + operator(CAT); /* catenate is implicit */ + i = newinst(t); + + if(t == CCLASS || t == NCCLASS) + i->r.cp = yyclassp; + if(t == RUNE) + i->r.r = yyrune; + + pushand(i, i); + lastwasand = TRUE; +} + +static void +operator(int t) +{ + if(t==RBRA && --nbra<0) + rcerror("unmatched right paren"); + if(t==LBRA){ + if(++cursubid >= NSUBEXP) + rcerror ("too many subexpressions"); + nbra++; + if(lastwasand) + operator(CAT); + } else + evaluntil(t); + if(t != RBRA) + pushator(t); + lastwasand = FALSE; + if(t==STAR || t==QUEST || t==PLUS || t==RBRA) + lastwasand = TRUE; /* these look like operands */ +} + +static void +regerr2(char *s, int c) +{ + char buf[100]; + char *cp = buf; + while(*s) + *cp++ = *s++; + *cp++ = c; + *cp = '\0'; + rcerror(buf); +} + +static void +cant(char *s) +{ + char buf[100]; + strcpy(buf, "can't happen: "); + strcat(buf, s); + rcerror(buf); +} + +static void +pushand(Reinst *f, Reinst *l) +{ + if(andp >= &andstack[NSTACK]) + cant("operand stack overflow"); + andp->first = f; + andp->last = l; + andp++; +} + +static void +pushator(int t) +{ + if(atorp >= &atorstack[NSTACK]) + cant("operator stack overflow"); + *atorp++ = t; + *subidp++ = cursubid; +} + +static Node* +popand(int op) +{ + Reinst *inst; + + if(andp <= &andstack[0]){ + regerr2("missing operand for ", op); + inst = newinst(NOP); + pushand(inst,inst); + } + return --andp; +} + +static int +popator(void) +{ + if(atorp <= &atorstack[0]) + cant("operator stack underflow"); + --subidp; + return *--atorp; +} + +static void +evaluntil(int pri) +{ + Node *op1, *op2; + Reinst *inst1, *inst2; + + while(pri==RBRA || atorp[-1]>=pri){ + switch(popator()){ + default: + rcerror("unknown operator in evaluntil"); + break; + case LBRA: /* must have been RBRA */ + op1 = popand('('); + inst2 = newinst(RBRA); + inst2->r.subid = *subidp; + op1->last->l.next = inst2; + inst1 = newinst(LBRA); + inst1->r.subid = *subidp; + inst1->l.next = op1->first; + pushand(inst1, inst2); + return; + case OR: + op2 = popand('|'); + op1 = popand('|'); + inst2 = newinst(NOP); + op2->last->l.next = inst2; + op1->last->l.next = inst2; + inst1 = newinst(OR); + inst1->r.right = op1->first; + inst1->l.left = op2->first; + pushand(inst1, inst2); + break; + case CAT: + op2 = popand(0); + op1 = popand(0); + op1->last->l.next = op2->first; + pushand(op1->first, op2->last); + break; + case STAR: + op2 = popand('*'); + inst1 = newinst(OR); + op2->last->l.next = inst1; + inst1->r.right = op2->first; + pushand(inst1, inst1); + break; + case PLUS: + op2 = popand('+'); + inst1 = newinst(OR); + op2->last->l.next = inst1; + inst1->r.right = op2->first; + pushand(op2->first, inst1); + break; + case QUEST: + op2 = popand('?'); + inst1 = newinst(OR); + inst2 = newinst(NOP); + inst1->l.left = inst2; + inst1->r.right = op2->first; + op2->last->l.next = inst2; + pushand(inst1, inst2); + break; + } + } +} + +static Reprog* +optimize(Reprog *pp) +{ + Reinst *inst, *target; + int size; + Reprog *npp; + int diff; + + /* + * get rid of NOOP chains + */ + for(inst=pp->firstinst; inst->type!=END; inst++){ + target = inst->l.next; + while(target->type == NOP) + target = target->l.next; + inst->l.next = target; + } + + /* + * The original allocation is for an area larger than + * necessary. Reallocate to the actual space used + * and then relocate the code. + */ + size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst); + npp = realloc(pp, size); + if(npp==0 || npp==pp) + return pp; + diff = (char *)npp - (char *)pp; + freep = (Reinst *)((char *)freep + diff); + for(inst=npp->firstinst; inst<freep; inst++){ + switch(inst->type){ + case OR: + case STAR: + case PLUS: + case QUEST: + case CCLASS: + case NCCLASS: + *(char **)&inst->r.right += diff; + break; + } + *(char **)&inst->l.left += diff; + } + *(char **)&npp->startinst += diff; + return npp; +} + +#ifdef DEBUG +static void +dumpstack(void){ + Node *stk; + int *ip; + + print("operators\n"); + for(ip=atorstack; ip<atorp; ip++) + print("0%o\n", *ip); + print("operands\n"); + for(stk=andstack; stk<andp; stk++) + print("0%o\t0%o\n", stk->first->type, stk->last->type); +} + +static void +dump(Reprog *pp) +{ + Reinst *l; + wchar_t *p; + + l = pp->firstinst; + do{ + print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type, + l->l.left-pp->firstinst, l->l.right-pp->firstinst); + if(l->type == RUNE) + print("\t%C\n", l->r); + else if(l->type == CCLASS || l->type == NCCLASS){ + print("\t["); + if(l->type == NCCLASS) + print("^"); + for(p = l->r.cp->spans; p < l->r.cp->end; p += 2) + if(p[0] == p[1]) + print("%C", p[0]); + else + print("%C-%C", p[0], p[1]); + print("]\n"); + } else + print("\n"); + }while(l++->type); +} +#endif + +static Reclass* +newclass(void) +{ + if(nclass >= NCLASS) + regerr2("too many character classes; limit", NCLASS+'0'); + return &(classp[nclass++]); +} + +static int +nextc(wchar_t *rp) +{ + int n; + + if(lexdone){ + *rp = 0; + return 1; + } + n = mbtowc(rp, exprp, MB_CUR_MAX); + if (n <= 0) + n = 1; + exprp += n; + if(*rp == L'\\'){ + n = mbtowc(rp, exprp, MB_CUR_MAX); + if (n <= 0) + n = 1; + exprp += n; + return 1; + } + if(*rp == 0) + lexdone = 1; + return 0; +} + +static int +lex(int literal, int dot_type) +{ + int quoted; + + quoted = nextc(&yyrune); + if(literal || quoted){ + if(yyrune == 0) + return END; + return RUNE; + } + + switch(yyrune){ + case 0: + return END; + case L'*': + return STAR; + case L'?': + return QUEST; + case L'+': + return PLUS; + case L'|': + return OR; + case L'.': + return dot_type; + case L'(': + return LBRA; + case L')': + return RBRA; + case L'^': + return BOL; + case L'$': + return EOL; + case L'[': + return bldcclass(); + } + return RUNE; +} + +static int +bldcclass(void) +{ + int type; + wchar_t r[NCCRUNE]; + wchar_t *p, *ep, *np; + wchar_t rune; + int quoted; + + /* we have already seen the '[' */ + type = CCLASS; + yyclassp = newclass(); + + /* look ahead for negation */ + ep = r; + quoted = nextc(&rune); + if(!quoted && rune == L'^'){ + type = NCCLASS; + quoted = nextc(&rune); + *ep++ = L'\n'; + *ep++ = L'\n'; + } + + /* parse class into a set of spans */ + for(; ep<&r[NCCRUNE];){ + if(rune == 0){ + rcerror("malformed '[]'"); + return 0; + } + if(!quoted && rune == L']') + break; + if(!quoted && rune == L'-'){ + if(ep == r){ + rcerror("malformed '[]'"); + return 0; + } + quoted = nextc(&rune); + if((!quoted && rune == L']') || rune == 0){ + rcerror("malformed '[]'"); + return 0; + } + *(ep-1) = rune; + } else { + *ep++ = rune; + *ep++ = rune; + } + quoted = nextc(&rune); + } + + /* sort on span start */ + for(p = r; p < ep; p += 2){ + for(np = p; np < ep; np += 2) + if(*np < *p){ + rune = np[0]; + np[0] = p[0]; + p[0] = rune; + rune = np[1]; + np[1] = p[1]; + p[1] = rune; + } + } + + /* merge spans */ + np = yyclassp->spans; + p = r; + if(r == ep) + yyclassp->end = np; + else { + np[0] = *p++; + np[1] = *p++; + for(; p < ep; p += 2) + if(p[0] <= np[1]){ + if(p[1] > np[1]) + np[1] = p[1]; + } else { + np += 2; + np[0] = p[0]; + np[1] = p[1]; + } + yyclassp->end = np+2; + } + + return type; +} + +static Reprog* +regcomp1(char *s, int literal, int dot_type) +{ + int token; + Reprog *pp; + + /* get memory for the program */ + pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s)); + if(pp == 0){ + regerror("out of memory"); + return 0; + } + freep = pp->firstinst; + classp = pp->class; + errors = 0; + + if(setjmp(regkaboom)) + goto out; + + /* go compile the sucker */ + lexdone = 0; + exprp = s; + nclass = 0; + nbra = 0; + atorp = atorstack; + andp = andstack; + subidp = subidstack; + lastwasand = FALSE; + cursubid = 0; + + /* Start with a low priority operator to prime parser */ + pushator(START-1); + while((token = lex(literal, dot_type)) != END){ + if((token&0300) == OPERATOR) + operator(token); + else + operand(token); + } + + /* Close with a low priority operator */ + evaluntil(START); + + /* Force END */ + operand(END); + evaluntil(START); +#ifdef DEBUG + dumpstack(); +#endif + if(nbra) + rcerror("unmatched left paren"); + --andp; /* points to first and only operand */ + pp->startinst = andp->first; +#ifdef DEBUG + dump(pp); +#endif + pp = optimize(pp); +#ifdef DEBUG + print("start: %d\n", andp->first-pp->firstinst); + dump(pp); +#endif +out: + if(errors){ + free(pp); + pp = 0; + } + return pp; +} + +extern Reprog* +regcomp(char *s) +{ + return regcomp1(s, 0, ANY); +} + +extern Reprog* +regcomplit(char *s) +{ + return regcomp1(s, 1, ANY); +} + +extern Reprog* +regcompnl(char *s) +{ + return regcomp1(s, 0, ANYNL); +} diff --git a/sys/src/ape/lib/regexp/regcomp.h b/sys/src/ape/lib/regexp/regcomp.h new file mode 100755 index 000000000..082e6137e --- /dev/null +++ b/sys/src/ape/lib/regexp/regcomp.h @@ -0,0 +1,61 @@ +/* + * substitution list + */ +typedef struct Resublist Resublist; +struct Resublist +{ + Resub m[32]; +}; + +/* max subexpressions per program */ +Resublist ReSuBlIsT; +#define NSUBEXP (sizeof(ReSuBlIsT.m)/sizeof(Resub)) + +/* max character classes per program */ +Reprog RePrOg; +#define NCLASS (sizeof(RePrOg.class)/sizeof(Reclass)) + +/* max rune ranges per character class */ +#define NCCRUNE (sizeof(Reclass)/sizeof(wchar_t)) + +/* + * Actions and Tokens (Reinst types) + * + * 02xx are operators, value == precedence + * 03xx are tokens, i.e. operands for operators + */ +#define RUNE 0177 +#define OPERATOR 0200 /* Bitmask of all operators */ +#define START 0200 /* Start, used for marker on stack */ +#define RBRA 0201 /* Right bracket, ) */ +#define LBRA 0202 /* Left bracket, ( */ +#define OR 0203 /* Alternation, | */ +#define CAT 0204 /* Concatentation, implicit operator */ +#define STAR 0205 /* Closure, * */ +#define PLUS 0206 /* a+ == aa* */ +#define QUEST 0207 /* a? == a|nothing, i.e. 0 or 1 a's */ +#define ANY 0300 /* Any character except newline, . */ +#define ANYNL 0301 /* Any character including newline, . */ +#define NOP 0302 /* No operation, internal use only */ +#define BOL 0303 /* Beginning of line, ^ */ +#define EOL 0304 /* End of line, $ */ +#define CCLASS 0305 /* Character class, [] */ +#define NCCLASS 0306 /* Negated character class, [] */ +#define END 0377 /* Terminate: match found */ + +/* + * regexec execution lists + */ +#define LISTINCREMENT 8 +typedef struct Relist Relist; +struct Relist +{ + Reinst *inst; /* Reinstruction of the thread */ + Resublist se; /* matched subexpressions in this thread */ +}; +extern Relist* _relist[2]; +extern Relist* _reliste[2]; +extern int _relistsize; + +extern Relist* _renewthread(Relist*, Reinst*, Resublist*); +extern void _renewmatch(Resub*, int, Resublist*); diff --git a/sys/src/ape/lib/regexp/regerror.c b/sys/src/ape/lib/regexp/regerror.c new file mode 100755 index 000000000..bfa158299 --- /dev/null +++ b/sys/src/ape/lib/regexp/regerror.c @@ -0,0 +1,16 @@ +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include "regexp.h" + +void +regerror(char *s) +{ + char buf[132]; + + strcpy(buf, "regerror: "); + strcat(buf, s); + strcat(buf, "\n"); + fwrite(buf, 1, strlen(buf), stderr); + exit(1); +} diff --git a/sys/src/ape/lib/regexp/regexec.c b/sys/src/ape/lib/regexp/regexec.c new file mode 100755 index 000000000..dee888b09 --- /dev/null +++ b/sys/src/ape/lib/regexp/regexec.c @@ -0,0 +1,191 @@ +#include <stdlib.h> +#include <stdio.h> +#include "regexp.h" +#include "regcomp.h" + +static Resublist sempty; /* empty set of matches */ + +/* + * return 0 if no match + * >0 if a match + * <0 if we ran out of _relist space + */ +static int +regexec1(Reprog *progp, /* program to run */ + char *bol, /* string to run machine on */ + Resub *mp, /* subexpression elements */ + int ms, /* number of elements at mp */ + char *starts, + char *eol, + wchar_t startchar) +{ + int flag=0; + Reinst *inst; + Relist *tlp; + char *s; + int i, checkstart; + wchar_t r, *rp, *ep; + int n; + Relist* tl; /* This list, next list */ + Relist* nl; + Relist* tle; /* ends of this and next list */ + Relist* nle; + int match; + + match = 0; + checkstart = startchar; + sempty.m[0].s.sp = 0; + if(mp!=0) + for(i=0; i<ms; i++) + mp[i].s.sp = mp[i].e.ep = 0; + _relist[0][0].inst = _relist[1][0].inst = 0; + + /* Execute machine once for each character, including terminal NUL */ + s = starts; + do{ + /* fast check for first char */ + r = *(unsigned char*)s; + if(checkstart && r != startchar){ + s++; + continue; + } + + if(r < Runeself) + n = 1; + else { + n = mbtowc(&r, s, MB_CUR_MAX); + if (n <= 0) + n = 1; + } + + /* switch run lists */ + tl = _relist[flag]; + tle = _reliste[flag]; + nl = _relist[flag^=1]; + nle = _reliste[flag]; + nl->inst = 0; + + /* Add first instruction to current list */ + if(match == 0){ + sempty.m[0].s.sp = s; + _renewthread(tl, progp->startinst, &sempty); + } + + /* Execute machine until current list is empty */ + for(tlp=tl; tlp->inst; tlp++){ /* assignment = */ + if(s == eol) + break; + + for(inst = tlp->inst; ; inst = inst->l.next){ + switch(inst->type){ + case RUNE: /* regular character */ + if(inst->r.r == r) + if(_renewthread(nl, inst->l.next, &tlp->se)==nle) + return -1; + break; + case LBRA: + tlp->se.m[inst->r.subid].s.sp = s; + continue; + case RBRA: + tlp->se.m[inst->r.subid].e.ep = s; + continue; + case ANY: + if(r != '\n') + if(_renewthread(nl, inst->l.next, &tlp->se)==nle) + return -1; + break; + case ANYNL: + if(_renewthread(nl, inst->l.next, &tlp->se)==nle) + return -1; + break; + case BOL: + if(s == bol || *(s-1) == '\n') + continue; + break; + case EOL: + if(r == 0 || r == '\n') + continue; + break; + case CCLASS: + ep = inst->r.cp->end; + for(rp = inst->r.cp->spans; rp < ep; rp += 2) + if(r >= rp[0] && r <= rp[1]){ + if(_renewthread(nl, inst->l.next, &tlp->se)==nle) + return -1; + break; + } + break; + case NCCLASS: + ep = inst->r.cp->end; + for(rp = inst->r.cp->spans; rp < ep; rp += 2) + if(r >= rp[0] && r <= rp[1]) + break; + if(rp == ep) + if(_renewthread(nl, inst->l.next, &tlp->se)==nle) + return -1; + break; + case OR: + /* evaluate right choice later */ + if(_renewthread(tlp, inst->r.right, &tlp->se) == tle) + return -1; + /* efficiency: advance and re-evaluate */ + continue; + case END: /* Match! */ + match = 1; + tlp->se.m[0].e.ep = s; + if(mp != 0) + _renewmatch(mp, ms, &tlp->se); + break; + } + break; + } + } + checkstart = startchar && nl->inst==0; + s += n; + }while(r); + return match; +} + +extern int +regexec(Reprog *progp, /* program to run */ + char *bol, /* string to run machine on */ + Resub *mp, /* subexpression elements */ + int ms) /* number of elements at mp */ +{ + char *starts; /* where to start match */ + char *eol; /* where to end match */ + wchar_t startchar; + int rv; + + /* + * use user-specified starting/ending location if specified + */ + starts = bol; + eol = 0; + if(mp && ms>0){ + if(mp->s.sp) + starts = mp->s.sp; + if(mp->e.ep) + eol = mp->e.ep; + } + startchar = (progp->startinst->type == RUNE && progp->startinst->r.r < Runeself) + ? progp->startinst->r.r : 0; + + /* keep trying till we have enough list space to terminate */ + for(;;){ + if(_relist[0] == 0){ + _relist[0] = malloc(2*_relistsize*sizeof(Relist)); + _relist[1] = _relist[0] + _relistsize; + _reliste[0] = _relist[0] + _relistsize - 1; + _reliste[1] = _relist[1] + _relistsize - 1; + if(_relist[0] == 0) + regerror("_relist overflow"); + } + rv = regexec1(progp, bol, mp, ms, starts, eol, startchar); + if(rv >= 0) + return rv; + free(_relist[0]); + _relist[0] = 0; + _relistsize += LISTINCREMENT; + } +} diff --git a/sys/src/ape/lib/regexp/regsub.c b/sys/src/ape/lib/regexp/regsub.c new file mode 100755 index 000000000..0c891e9c6 --- /dev/null +++ b/sys/src/ape/lib/regexp/regsub.c @@ -0,0 +1,64 @@ +#include <stdlib.h> +#include <stdio.h> +#include "regexp.h" + +/* substitute into one string using the matches from the last regexec() */ +extern void +regsub(char *sp, /* source string */ + char *dp, /* destination string */ + int dlen, + Resub *mp, /* subexpression elements */ + int ms) /* number of elements pointed to by mp */ +{ + char *ssp, *ep; + int i; + + ep = dp+dlen-1; + while(*sp != '\0'){ + if(*sp == '\\'){ + switch(*++sp){ + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + i = *sp-'0'; + if(mp[i].s.sp != 0 && mp!=0 && ms>i) + for(ssp = mp[i].s.sp; + ssp < mp[i].e.ep; + ssp++) + if(dp < ep) + *dp++ = *ssp; + break; + case '\\': + if(dp < ep) + *dp++ = '\\'; + break; + case '\0': + sp--; + break; + default: + if(dp < ep) + *dp++ = *sp; + break; + } + }else if(*sp == '&'){ + if(mp[0].s.sp != 0 && mp!=0 && ms>0) + if(mp[0].s.sp != 0) + for(ssp = mp[0].s.sp; + ssp < mp[0].e.ep; ssp++) + if(dp < ep) + *dp++ = *ssp; + }else{ + if(dp < ep) + *dp++ = *sp; + } + sp++; + } + *dp = '\0'; +} diff --git a/sys/src/ape/lib/regexp/rregexec.c b/sys/src/ape/lib/regexp/rregexec.c new file mode 100755 index 000000000..519ac25ad --- /dev/null +++ b/sys/src/ape/lib/regexp/rregexec.c @@ -0,0 +1,181 @@ +#include <stdlib.h> +#include <stdio.h> +#include "regexp.h" +#include "regcomp.h" + +static Resublist sempty; /* empty set of matches */ + +/* + * return 0 if no match + * >0 if a match + * <0 if we ran out of _relist space + */ +static int +rregexec1(Reprog *progp, /* program to run */ + wchar_t *bol, /* string to run machine on */ + Resub *mp, /* subexpression elements */ + int ms, /* number of elements at mp */ + wchar_t *starts, + wchar_t *eol, + wchar_t startchar) +{ + int flag=0; + Reinst *inst; + Relist *tlp; + wchar_t *s; + int i, checkstart; + wchar_t r, *rp, *ep; + int n; + Relist* tl; /* This list, next list */ + Relist* nl; + Relist* tle; /* ends of this and next list */ + Relist* nle; + int match; + + match = 0; + checkstart = startchar; + sempty.m[0].s.rsp = 0; + if(mp!=0) + for(i=0; i<ms; i++) + mp[i].s.rsp = mp[i].e.rep = 0; + _relist[0][0].inst = _relist[1][0].inst = 0; + + /* Execute machine once for each character, including terminal NUL */ + s = starts; + do{ + r = *s; + + /* fast check for first char */ + if(checkstart && r!=startchar){ + s++; + continue; + } + + /* switch run lists */ + tl = _relist[flag]; + tle = _reliste[flag]; + nl = _relist[flag^=1]; + nle = _reliste[flag]; + nl->inst = 0; + + /* Add first instruction to current list */ + sempty.m[0].s.rsp = s; + _renewthread(tl, progp->startinst, &sempty); + + /* Execute machine until current list is empty */ + for(tlp=tl; tlp->inst; tlp++){ /* assignment = */ + if(s == eol) + break; + + for(inst=tlp->inst; ; inst = inst->l.next){ + switch(inst->type){ + case RUNE: /* regular character */ + if(inst->r.r == r) + if(_renewthread(nl, inst->l.next, &tlp->se)==nle) + return -1; + break; + case LBRA: + tlp->se.m[inst->r.subid].s.rsp = s; + continue; + case RBRA: + tlp->se.m[inst->r.subid].e.rep = s; + continue; + case ANY: + if(r != '\n') + if(_renewthread(nl, inst->l.next, &tlp->se)==nle) + return -1; + break; + case ANYNL: + if(_renewthread(nl, inst->l.next, &tlp->se)==nle) + return -1; + break; + case BOL: + if(s == bol || *(s-1) == '\n') + continue; + break; + case EOL: + if(r == 0 || r == '\n') + continue; + break; + case CCLASS: + ep = inst->r.cp->end; + for(rp = inst->r.cp->spans; rp < ep; rp += 2) + if(r >= rp[0] && r <= rp[1]){ + if(_renewthread(nl, inst->l.next, &tlp->se)==nle) + return -1; + break; + } + break; + case NCCLASS: + ep = inst->r.cp->end; + for(rp = inst->r.cp->spans; rp < ep; rp += 2) + if(r >= rp[0] && r <= rp[1]) + break; + if(rp == ep) + if(_renewthread(nl, inst->l.next, &tlp->se)==nle) + return -1; + break; + case OR: + /* evaluate right choice later */ + if(_renewthread(tlp, inst->r.right, &tlp->se) == tle) + return -1; + /* efficiency: advance and re-evaluate */ + continue; + case END: /* Match! */ + match = 1; + tlp->se.m[0].e.rep = s; + if(mp != 0) + _renewmatch(mp, ms, &tlp->se); + break; + } + break; + } + } + checkstart = startchar && nl->inst==0; + s++; + }while(r); + return match; +} + +extern int +rregexec(Reprog *progp, /* program to run */ + wchar_t *bol, /* string to run machine on */ + Resub *mp, /* subexpression elements */ + int ms) /* number of elements at mp */ +{ + wchar_t *starts; /* where to start match */ + wchar_t *eol; /* where to end match */ + wchar_t startchar; + int rv; + + /* + * use user-specified starting/ending location if specified + */ + starts = bol; + eol = 0; + if(mp && ms>0){ + if(mp->s.rsp) + starts = mp->s.rsp; + if(mp->e.rep) + eol = mp->e.rep; + } + startchar = progp->startinst->type == RUNE ? progp->startinst->r.r : 0; + + /* keep trying till we have enough list space to terminate */ + for(;;){ + if(_relist[0] == 0){ + _relist[0] = malloc(2*_relistsize*sizeof(Relist)); + _relist[1] = _relist[0] + _relistsize; + _reliste[0] = _relist[0] + _relistsize - 1; + _reliste[1] = _relist[1] + _relistsize - 1; + if(_relist[0] == 0) + regerror("_relist overflow"); + } + rv = rregexec1(progp, bol, mp, ms, starts, eol, startchar); + if(rv >= 0) + return rv; + free(_relist[0]); + _relist[0] = 0; + _relistsize += LISTINCREMENT; + } +} diff --git a/sys/src/ape/lib/regexp/rregsub.c b/sys/src/ape/lib/regexp/rregsub.c new file mode 100755 index 000000000..5fc3e5960 --- /dev/null +++ b/sys/src/ape/lib/regexp/rregsub.c @@ -0,0 +1,64 @@ +#include <stdlib.h> +#include <stdio.h> +#include "regexp.h" + +/* substitute into one string using the matches from the last regexec() */ +extern void +rregsub(wchar_t *sp, /* source string */ + wchar_t *dp, /* destination string */ + int dlen, + Resub *mp, /* subexpression elements */ + int ms) /* number of elements pointed to by mp */ +{ + wchar_t *ssp, *ep; + int i; + + ep = dp+(dlen/sizeof(wchar_t))-1; + while(*sp != '\0'){ + if(*sp == '\\'){ + switch(*++sp){ + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + i = *sp-'0'; + if(mp[i].s.rsp != 0 && mp!=0 && ms>i) + for(ssp = mp[i].s.rsp; + ssp < mp[i].e.rep; + ssp++) + if(dp < ep) + *dp++ = *ssp; + break; + case '\\': + if(dp < ep) + *dp++ = '\\'; + break; + case '\0': + sp--; + break; + default: + if(dp < ep) + *dp++ = *sp; + break; + } + }else if(*sp == '&'){ + if(mp[0].s.rsp != 0 && mp!=0 && ms>0) + if(mp[0].s.rsp != 0) + for(ssp = mp[0].s.rsp; + ssp < mp[0].e.rep; ssp++) + if(dp < ep) + *dp++ = *ssp; + }else{ + if(dp < ep) + *dp++ = *sp; + } + sp++; + } + *dp = '\0'; +} |