diff options
| | | |
|---|---|---|
| author | cinap_lenrek <cinap_lenrek@gmx.de> | 2013-04-24 20:13:18 +0200 |
| committer | cinap_lenrek <cinap_lenrek@gmx.de> | 2013-04-24 20:13:18 +0200 |
| commit | 667010554b30c46e35b9cad62edcfa01e37e1576 (patch) | |
| tree | 418f828288c6c5c5ba0e6a18775af855966579f0 /sys/src | |
| parent | 78c7ba36a1a732c08fbb7e4f8b19d1bc825c5b7e (diff) | |
make all the commands agnostic about Rune width. (from sources)
Diffstat (limited to 'sys/src')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | sys/src/cmd/acme/regx.c | 45 |
| -rw-r--r-- | sys/src/cmd/ed.c | 6 |
| -rw-r--r-- | sys/src/cmd/file.c | 60 |
| -rw-r--r-- | sys/src/cmd/freq.c | 2 |
| -rw-r--r-- | sys/src/cmd/grep/comp.c | 2 |
| -rw-r--r-- | sys/src/cmd/grep/grep.h | 2 |
| -rw-r--r-- | sys/src/cmd/htmlroff/char.c | 8 |
| -rw-r--r-- | sys/src/cmd/rc/glob.c | 35 |
| -rw-r--r-- | sys/src/cmd/rc/lex.c | 20 |
| -rw-r--r-- | sys/src/cmd/rc/rc.h | 6 |
| -rw-r--r-- | sys/src/cmd/sam/regexp.c | 47 |
| -rw-r--r-- | sys/src/cmd/tr.c | 6 |
12 files changed, 100 insertions, 139 deletions
diff --git a/sys/src/cmd/acme/regx.c b/sys/src/cmd/acme/regx.c index 09ad0290b..af1197f8e 100644 --- a/sys/src/cmd/acme/regx.c +++ b/sys/src/cmd/acme/regx.c @@ -20,7 +20,7 @@ Rune *lastregexp; typedef struct Inst Inst; struct Inst { - uint type; /* < 0x10000 ==> literal, otherwise action */ + uint type; /* <= Runemax+1 ==> literal, otherwise action */ union { int sid; int subid; @@ -61,25 +61,28 @@ static Rangeset sempty; * 0x100xx are operators, value == precedence * 0x200xx are tokens, i.e. operands for operators */ -#define OPERATOR 0x10000 /* Bitmask of all operators */ -#define START 0x10000 /* Start, used for marker on stack */ -#define RBRA 0x10001 /* Right bracket, ) */ -#define LBRA 0x10002 /* Left bracket, ( */ -#define OR 0x10003 /* Alternation, | */ -#define CAT 0x10004 /* Concatentation, implicit operator */ -#define STAR 0x10005 /* Closure, * */ -#define PLUS 0x10006 /* a+ == aa* */ -#define QUEST 0x10007 /* a? == a|nothing, i.e. 0 or 1 a's */ -#define ANY 0x20000 /* Any character but newline, . */ -#define NOP 0x20001 /* No operation, internal use only */ -#define BOL 0x20002 /* Beginning of line, ^ */ -#define EOL 0x20003 /* End of line, $ */ -#define CCLASS 0x20004 /* Character class, [] */ -#define NCCLASS 0x20005 /* Negated character class, [^] */ -#define END 0x20077 /* Terminate: match found */ - -#define ISATOR 0x10000 -#define ISAND 0x20000 +enum { + OPERATOR = Runemask+1, /* Bitmask of all operators */ + START = OPERATOR, /* Start, used for marker on stack */ + RBRA, /* Right bracket, ) */ + LBRA, /* Left bracket, ( */ + OR, /* Alternation, | */ + CAT, /* Concatentation, implicit operator */ + STAR, /* Closure, * */ + PLUS, /* a+ == aa* */ + QUEST, /* a? == a|nothing, i.e. 0 or 1 a's */ + + ANY = OPERATOR<<1, /* Any character but newline, . 
*/ + NOP, /* No operation, internal use only */ + BOL, /* Beginning of line, ^ */ + EOL, /* End of line, $ */ + CCLASS, /* Character class, [] */ + NCCLASS, /* Negated character class, [^] */ + END, /* Terminate: match found */ + + ISATOR = OPERATOR, + ISAND = OPERATOR<<1, +}; /* * Parser Information @@ -452,7 +455,7 @@ nextrec(void) exprp++; return '\n'; } - return *exprp++|0x10000; + return *exprp++|(Runemax+1); } return *exprp++; } diff --git a/sys/src/cmd/ed.c b/sys/src/cmd/ed.c index 0f18fadc0..35ce3d343 100644 --- a/sys/src/cmd/ed.c +++ b/sys/src/cmd/ed.c @@ -15,7 +15,7 @@ enum ESIZE = 256, /* max size of reg exp */ GBSIZE = 256, /* max size of global command */ MAXSUB = 9, /* max number of sub reg exp */ - ESCFLG = 0xFFFF, /* escape Rune - user defined code */ + ESCFLG = Runemax, /* escape Rune - user defined code */ EOF = -1, }; @@ -737,7 +737,7 @@ gety(void) if(c == 0) continue; *p++ = c; - if(p >= &linebuf[LBSIZE-2]) + if(p >= &linebuf[LBSIZE-sizeof(Rune)]) error(Q); } } @@ -1162,7 +1162,7 @@ join(void) for(a1=addr1; a1<=addr2; a1++) { lp = getline(*a1); while(*gp = *lp++) - if(gp++ >= &genbuf[LBSIZE-2]) + if(gp++ >= &genbuf[LBSIZE-sizeof(Rune)]) error(Q); } lp = linebuf; diff --git a/sys/src/cmd/file.c b/sys/src/cmd/file.c index 4256850b0..b92f088ad 100644 --- a/sys/src/cmd/file.c +++ b/sys/src/cmd/file.c @@ -273,60 +273,6 @@ type(char *file, int nlen) close(fd); } -/* - * Unicode 4.0 4-byte runes. 
- */ -typedef int Rune1; - -enum { - UTFmax1 = 4, -}; - -int -fullrune1(char *p, int n) -{ - int c; - - if(n >= 1) { - c = *(uchar*)p; - if(c < 0x80) - return 1; - if(n >= 2 && c < 0xE0) - return 1; - if(n >= 3 && c < 0xF0) - return 1; - if(n >= 4) - return 1; - } - return 0; -} - -int -chartorune1(Rune1 *rune, char *str) -{ - int c, c1, c2, c3, n; - Rune r; - - c = *(uchar*)str; - if(c < 0xF0){ - r = 0; - n = chartorune(&r, str); - *rune = r; - return n; - } - c &= ~0xF0; - c1 = *(uchar*)(str+1) & ~0x80; - c2 = *(uchar*)(str+2) & ~0x80; - c3 = *(uchar*)(str+3) & ~0x80; - n = (c<<18) | (c1<<12) | (c2<<6) | c3; - if(n < 0x10000 || n > 0x10FFFF){ - *rune = Runeerror; - return 1; - } - *rune = n; - return 4; -} - void utfconv(void) { @@ -392,7 +338,7 @@ utfconv(void) void filetype(int fd) { - Rune1 r; + Rune r; int i, f, n; char *p, *eob; @@ -435,9 +381,9 @@ filetype(int fd) language[i].count = 0; eob = (char *)buf+nbuf; for(n = 0, p = (char *)buf; p < eob; n++) { - if (!fullrune1(p, eob-p) && eob-p < UTFmax1) + if (!fullrune(p, eob-p) && eob-p < UTFmax) break; - p += chartorune1(&r, p); + p += chartorune(&r, p); if (r == 0) f = Cnull; else if (r <= 0x7f) { diff --git a/sys/src/cmd/freq.c b/sys/src/cmd/freq.c index b5e075dab..05ac51ee8 100644 --- a/sys/src/cmd/freq.c +++ b/sys/src/cmd/freq.c @@ -2,7 +2,7 @@ #include <libc.h> #include <bio.h> -uvlong count[1<<16]; +uvlong count[Runemax+1]; Biobuf bout; void usage(void); diff --git a/sys/src/cmd/grep/comp.c b/sys/src/cmd/grep/comp.c index 7f807e87c..6be061bbc 100644 --- a/sys/src/cmd/grep/comp.c +++ b/sys/src/cmd/grep/comp.c @@ -275,7 +275,7 @@ re2class(char *s) x = re2or(x, rclass(ov, p[0]-1)); ov = p[1]+1; } - x = re2or(x, rclass(ov, 0xffff)); + x = re2or(x, rclass(ov, Runemask)); } else { x = rclass(p[0], p[1]); for(p+=2; *p; p+=2) diff --git a/sys/src/cmd/grep/grep.h b/sys/src/cmd/grep/grep.h index f1f02d4c7..ba7b26776 100644 --- a/sys/src/cmd/grep/grep.h +++ b/sys/src/cmd/grep/grep.h @@ -53,7 +53,7 @@ enum Caselim 
= 7, Nhunk = 1<<16, - Cbegin = 0x10000, + Cbegin = Runemax+1, Flshcnt = (1<<9)-1, Cflag = 1<<0, diff --git a/sys/src/cmd/htmlroff/char.c b/sys/src/cmd/htmlroff/char.c index 842d08eaa..7a1bc6a8b 100644 --- a/sys/src/cmd/htmlroff/char.c +++ b/sys/src/cmd/htmlroff/char.c @@ -16,6 +16,12 @@ rune2html(Rune r) if(r == '\n') return L("\n"); + if(((uint)r&~0xFFFF) != 0){ + /* The cache must grow a lot to handle them */ + fprint(2, "%s: can't handle rune '%C'\n", argv0, r); + return L("?"); + } + if(tcscache[r>>8] && tcscache[r>>8][r&0xFF]) return tcscache[r>>8][r&0xFF]; @@ -59,7 +65,7 @@ rune2html(Rune r) typedef struct Trtab Trtab; struct Trtab { - char t[3]; + char t[UTFmax]; Rune r; }; diff --git a/sys/src/cmd/rc/glob.c b/sys/src/cmd/rc/glob.c index 1c4983e40..295d7b6a6 100644 --- a/sys/src/cmd/rc/glob.c +++ b/sys/src/cmd/rc/glob.c @@ -118,18 +118,16 @@ glob(void *ap) int equtf(uchar *p, uchar *q) { + Rune pr, qr; + if(*p!=*q) - return 0; - if(twobyte(*p)) return p[1]==q[1]; - if(threebyte(*p)){ - if(p[1]!=q[1]) - return 0; - if(p[1]=='\0') - return 1; /* broken code at end of string! */ - return p[2]==q[2]; - } - return 1; + return 0; + + chartorune(&pr, (char*)p); + chartorune(&qr, (char*)q); + return pr == qr; } + /* * Return a pointer to the next utf code in the string, * not jumping past nuls in broken utf codes! @@ -138,10 +136,11 @@ equtf(uchar *p, uchar *q) uchar* nextutf(uchar *p) { - if(twobyte(*p)) return p[1]=='\0'?p+1:p+2; - if(threebyte(*p)) return p[1]=='\0'?p+1:p[2]=='\0'?p+2:p+3; - return p+1; + Rune dummy; + + return p + chartorune(&dummy, (char*)p); } + /* * Convert the utf code at *p to a unicode value */ @@ -149,14 +148,12 @@ nextutf(uchar *p) int unicode(uchar *p) { - int u = *p; + Rune r; - if(twobyte(u)) - return ((u&0x1f)<<6)|(p[1]&0x3f); - if(threebyte(u)) - return (u<<12)|((p[1]&0x3f)<<6)|(p[2]&0x3f); - return u; + chartorune(&r, (char*)p); + return r; } + /* * Does the string s match the pattern p * . and .. 
are only matched by patterns starting with . diff --git a/sys/src/cmd/rc/lex.c b/sys/src/cmd/rc/lex.c index 369348328..fecd0ec64 100644 --- a/sys/src/cmd/rc/lex.c +++ b/sys/src/cmd/rc/lex.c @@ -166,15 +166,25 @@ addtok(char *p, int val) char* addutf(char *p, int c) { - p = addtok(p, c); - if(twobyte(c)) /* 2-byte escape */ - return addtok(p, advance()); - if(threebyte(c)){ /* 3-byte escape */ + uchar b, m; + int i; + + p = addtok(p, c); /* 1-byte UTF runes are special */ + if(onebyte(c)) + return p; + + m = 0xc0; + b = 0x80; + for(i=1; i < UTFmax; i++){ + if((c&m) == b) + break; p = addtok(p, advance()); - return addtok(p, advance()); + b = m; + m = (m >> 1)|0x80; } return p; } + int lastdol; /* was the last token read '$' or '$#' or '"'? */ int lastword; /* was the last token read a word or compound word terminator? */ diff --git a/sys/src/cmd/rc/rc.h b/sys/src/cmd/rc/rc.h index 242a9b5ea..2e1d9ae59 100644 --- a/sys/src/cmd/rc/rc.h +++ b/sys/src/cmd/rc/rc.h @@ -123,12 +123,10 @@ int mypid; */ #define GLOB ((char)0x01) /* - * onebyte(c), twobyte(c), threebyte(c) - * Is c the first character of a one- two- or three-byte utf sequence? + * onebyte(c) + * Is c the first character of a one-byte utf sequence? 
*/ #define onebyte(c) ((c&0x80)==0x00) -#define twobyte(c) ((c&0xe0)==0xc0) -#define threebyte(c) ((c&0xf0)==0xe0) char **argp; char **args; diff --git a/sys/src/cmd/sam/regexp.c b/sys/src/cmd/sam/regexp.c index 4c655dda3..2bf540636 100644 --- a/sys/src/cmd/sam/regexp.c +++ b/sys/src/cmd/sam/regexp.c @@ -9,7 +9,7 @@ typedef struct Inst Inst; struct Inst { - long type; /* < 0x10000 ==> literal, otherwise action */ + long type; /* <= Runemax ==> literal, otherwise action */ union { int rsid; int rsubid; @@ -46,7 +46,7 @@ struct Ilist #define NLIST 127 -Ilist *tl, *nl; /* This list, next list */ +Ilist *tl, *nl; /* This list, next list */ Ilist list[2][NLIST+1]; /* +1 for trailing null */ static Rangeset sempty; @@ -56,25 +56,28 @@ static Rangeset sempty; * 0x100xx are operators, value == precedence * 0x200xx are tokens, i.e. operands for operators */ -#define OPERATOR 0x10000 /* Bitmask of all operators */ -#define START 0x10000 /* Start, used for marker on stack */ -#define RBRA 0x10001 /* Right bracket, ) */ -#define LBRA 0x10002 /* Left bracket, ( */ -#define OR 0x10003 /* Alternation, | */ -#define CAT 0x10004 /* Concatentation, implicit operator */ -#define STAR 0x10005 /* Closure, * */ -#define PLUS 0x10006 /* a+ == aa* */ -#define QUEST 0x10007 /* a? == a|nothing, i.e. 0 or 1 a's */ -#define ANY 0x20000 /* Any character but newline, . 
*/ -#define NOP 0x20001 /* No operation, internal use only */ -#define BOL 0x20002 /* Beginning of line, ^ */ -#define EOL 0x20003 /* End of line, $ */ -#define CCLASS 0x20004 /* Character class, [] */ -#define NCCLASS 0x20005 /* Negated character class, [^] */ -#define END 0x20077 /* Terminate: match found */ - -#define ISATOR 0x10000 -#define ISAND 0x20000 +enum { + OPERATOR = Runemask+1, /* Bitmask of all operators */ + START = OPERATOR, /* Start, used for marker on stack */ + RBRA, /* Right bracket, ) */ + LBRA, /* Left bracket, ( */ + OR, /* Alternation, | */ + CAT, /* Concatentation, implicit operator */ + STAR, /* Closure, * */ + PLUS, /* a+ == aa* */ + QUEST, /* a? == a|nothing, i.e. 0 or 1 a's */ + + ANY = OPERATOR<<1, /* Any character but newline, . */ + NOP, /* No operation, internal use only */ + BOL, /* Beginning of line, ^ */ + EOL, /* End of line, $ */ + CCLASS, /* Character class, [] */ + NCCLASS, /* Negated character class, [^] */ + END, /* Terminate: match found */ + + ISATOR = OPERATOR, + ISAND = OPERATOR<<1, +}; /* * Parser Information @@ -459,7 +462,7 @@ nextrec(void){ exprp++; return '\n'; } - return *exprp++|0x10000; + return *exprp++|(Runemax+1); } return *exprp++; } diff --git a/sys/src/cmd/tr.c b/sys/src/cmd/tr.c index adea05c25..85c76935b 100644 --- a/sys/src/cmd/tr.c +++ b/sys/src/cmd/tr.c @@ -15,10 +15,8 @@ uchar bits[] = { 1, 2, 4, 8, 16, 32, 64, 128 }; #define CLEARBIT(a,c) ((a)[(c)/8] &= ~bits[(c)&07]) #define BITSET(a,c) ((a)[(c)/8] & bits[(c)&07]) -#define MAXRUNE Runemax - -uchar f[(MAXRUNE+1)/8]; -uchar t[(MAXRUNE+1)/8]; +uchar f[(Runemax+1)/8]; +uchar t[(Runemax+1)/8]; char wbuf[4096]; char *wptr; |