summary refs log tree commit diff
path: root/sys/src
diff options
context:
space:
mode:
author cinap_lenrek <cinap_lenrek@gmx.de> 2013-04-24 20:13:18 +0200
committer cinap_lenrek <cinap_lenrek@gmx.de> 2013-04-24 20:13:18 +0200
commit 667010554b30c46e35b9cad62edcfa01e37e1576 (patch)
tree 418f828288c6c5c5ba0e6a18775af855966579f0 /sys/src
parent 78c7ba36a1a732c08fbb7e4f8b19d1bc825c5b7e (diff)
make all the commands agnostic about Rune width. (from sources)
Diffstat (limited to 'sys/src')
-rw-r--r-- sys/src/cmd/acme/regx.c 45
-rw-r--r-- sys/src/cmd/ed.c 6
-rw-r--r-- sys/src/cmd/file.c 60
-rw-r--r-- sys/src/cmd/freq.c 2
-rw-r--r-- sys/src/cmd/grep/comp.c 2
-rw-r--r-- sys/src/cmd/grep/grep.h 2
-rw-r--r-- sys/src/cmd/htmlroff/char.c 8
-rw-r--r-- sys/src/cmd/rc/glob.c 35
-rw-r--r-- sys/src/cmd/rc/lex.c 20
-rw-r--r-- sys/src/cmd/rc/rc.h 6
-rw-r--r-- sys/src/cmd/sam/regexp.c 47
-rw-r--r-- sys/src/cmd/tr.c 6
12 files changed, 100 insertions, 139 deletions
diff --git a/sys/src/cmd/acme/regx.c b/sys/src/cmd/acme/regx.c
index 09ad0290b..af1197f8e 100644
--- a/sys/src/cmd/acme/regx.c
+++ b/sys/src/cmd/acme/regx.c
@@ -20,7 +20,7 @@ Rune *lastregexp;
typedef struct Inst Inst;
struct Inst
{
- uint type; /* < 0x10000 ==> literal, otherwise action */
+ uint type; /* <= Runemax+1 ==> literal, otherwise action */
union {
int sid;
int subid;
@@ -61,25 +61,28 @@ static Rangeset sempty;
* 0x100xx are operators, value == precedence
* 0x200xx are tokens, i.e. operands for operators
*/
-#define OPERATOR 0x10000 /* Bitmask of all operators */
-#define START 0x10000 /* Start, used for marker on stack */
-#define RBRA 0x10001 /* Right bracket, ) */
-#define LBRA 0x10002 /* Left bracket, ( */
-#define OR 0x10003 /* Alternation, | */
-#define CAT 0x10004 /* Concatentation, implicit operator */
-#define STAR 0x10005 /* Closure, * */
-#define PLUS 0x10006 /* a+ == aa* */
-#define QUEST 0x10007 /* a? == a|nothing, i.e. 0 or 1 a's */
-#define ANY 0x20000 /* Any character but newline, . */
-#define NOP 0x20001 /* No operation, internal use only */
-#define BOL 0x20002 /* Beginning of line, ^ */
-#define EOL 0x20003 /* End of line, $ */
-#define CCLASS 0x20004 /* Character class, [] */
-#define NCCLASS 0x20005 /* Negated character class, [^] */
-#define END 0x20077 /* Terminate: match found */
-
-#define ISATOR 0x10000
-#define ISAND 0x20000
+enum {
+ OPERATOR = Runemask+1, /* Bitmask of all operators */
+ START = OPERATOR, /* Start, used for marker on stack */
+ RBRA, /* Right bracket, ) */
+ LBRA, /* Left bracket, ( */
+ OR, /* Alternation, | */
+ CAT, /* Concatentation, implicit operator */
+ STAR, /* Closure, * */
+ PLUS, /* a+ == aa* */
+ QUEST, /* a? == a|nothing, i.e. 0 or 1 a's */
+
+ ANY = OPERATOR<<1, /* Any character but newline, . */
+ NOP, /* No operation, internal use only */
+ BOL, /* Beginning of line, ^ */
+ EOL, /* End of line, $ */
+ CCLASS, /* Character class, [] */
+ NCCLASS, /* Negated character class, [^] */
+ END, /* Terminate: match found */
+
+ ISATOR = OPERATOR,
+ ISAND = OPERATOR<<1,
+};
/*
* Parser Information
@@ -452,7 +455,7 @@ nextrec(void)
exprp++;
return '\n';
}
- return *exprp++|0x10000;
+ return *exprp++|(Runemax+1);
}
return *exprp++;
}
diff --git a/sys/src/cmd/ed.c b/sys/src/cmd/ed.c
index 0f18fadc0..35ce3d343 100644
--- a/sys/src/cmd/ed.c
+++ b/sys/src/cmd/ed.c
@@ -15,7 +15,7 @@ enum
ESIZE = 256, /* max size of reg exp */
GBSIZE = 256, /* max size of global command */
MAXSUB = 9, /* max number of sub reg exp */
- ESCFLG = 0xFFFF, /* escape Rune - user defined code */
+ ESCFLG = Runemax, /* escape Rune - user defined code */
EOF = -1,
};
@@ -737,7 +737,7 @@ gety(void)
if(c == 0)
continue;
*p++ = c;
- if(p >= &linebuf[LBSIZE-2])
+ if(p >= &linebuf[LBSIZE-sizeof(Rune)])
error(Q);
}
}
@@ -1162,7 +1162,7 @@ join(void)
for(a1=addr1; a1<=addr2; a1++) {
lp = getline(*a1);
while(*gp = *lp++)
- if(gp++ >= &genbuf[LBSIZE-2])
+ if(gp++ >= &genbuf[LBSIZE-sizeof(Rune)])
error(Q);
}
lp = linebuf;
diff --git a/sys/src/cmd/file.c b/sys/src/cmd/file.c
index 4256850b0..b92f088ad 100644
--- a/sys/src/cmd/file.c
+++ b/sys/src/cmd/file.c
@@ -273,60 +273,6 @@ type(char *file, int nlen)
close(fd);
}
-/*
- * Unicode 4.0 4-byte runes.
- */
-typedef int Rune1;
-
-enum {
- UTFmax1 = 4,
-};
-
-int
-fullrune1(char *p, int n)
-{
- int c;
-
- if(n >= 1) {
- c = *(uchar*)p;
- if(c < 0x80)
- return 1;
- if(n >= 2 && c < 0xE0)
- return 1;
- if(n >= 3 && c < 0xF0)
- return 1;
- if(n >= 4)
- return 1;
- }
- return 0;
-}
-
-int
-chartorune1(Rune1 *rune, char *str)
-{
- int c, c1, c2, c3, n;
- Rune r;
-
- c = *(uchar*)str;
- if(c < 0xF0){
- r = 0;
- n = chartorune(&r, str);
- *rune = r;
- return n;
- }
- c &= ~0xF0;
- c1 = *(uchar*)(str+1) & ~0x80;
- c2 = *(uchar*)(str+2) & ~0x80;
- c3 = *(uchar*)(str+3) & ~0x80;
- n = (c<<18) | (c1<<12) | (c2<<6) | c3;
- if(n < 0x10000 || n > 0x10FFFF){
- *rune = Runeerror;
- return 1;
- }
- *rune = n;
- return 4;
-}
-
void
utfconv(void)
{
@@ -392,7 +338,7 @@ utfconv(void)
void
filetype(int fd)
{
- Rune1 r;
+ Rune r;
int i, f, n;
char *p, *eob;
@@ -435,9 +381,9 @@ filetype(int fd)
language[i].count = 0;
eob = (char *)buf+nbuf;
for(n = 0, p = (char *)buf; p < eob; n++) {
- if (!fullrune1(p, eob-p) && eob-p < UTFmax1)
+ if (!fullrune(p, eob-p) && eob-p < UTFmax)
break;
- p += chartorune1(&r, p);
+ p += chartorune(&r, p);
if (r == 0)
f = Cnull;
else if (r <= 0x7f) {
diff --git a/sys/src/cmd/freq.c b/sys/src/cmd/freq.c
index b5e075dab..05ac51ee8 100644
--- a/sys/src/cmd/freq.c
+++ b/sys/src/cmd/freq.c
@@ -2,7 +2,7 @@
#include <libc.h>
#include <bio.h>
-uvlong count[1<<16];
+uvlong count[Runemax+1];
Biobuf bout;
void usage(void);
diff --git a/sys/src/cmd/grep/comp.c b/sys/src/cmd/grep/comp.c
index 7f807e87c..6be061bbc 100644
--- a/sys/src/cmd/grep/comp.c
+++ b/sys/src/cmd/grep/comp.c
@@ -275,7 +275,7 @@ re2class(char *s)
x = re2or(x, rclass(ov, p[0]-1));
ov = p[1]+1;
}
- x = re2or(x, rclass(ov, 0xffff));
+ x = re2or(x, rclass(ov, Runemask));
} else {
x = rclass(p[0], p[1]);
for(p+=2; *p; p+=2)
diff --git a/sys/src/cmd/grep/grep.h b/sys/src/cmd/grep/grep.h
index f1f02d4c7..ba7b26776 100644
--- a/sys/src/cmd/grep/grep.h
+++ b/sys/src/cmd/grep/grep.h
@@ -53,7 +53,7 @@ enum
Caselim = 7,
Nhunk = 1<<16,
- Cbegin = 0x10000,
+ Cbegin = Runemax+1,
Flshcnt = (1<<9)-1,
Cflag = 1<<0,
diff --git a/sys/src/cmd/htmlroff/char.c b/sys/src/cmd/htmlroff/char.c
index 842d08eaa..7a1bc6a8b 100644
--- a/sys/src/cmd/htmlroff/char.c
+++ b/sys/src/cmd/htmlroff/char.c
@@ -16,6 +16,12 @@ rune2html(Rune r)
if(r == '\n')
return L("\n");
+ if(((uint)r&~0xFFFF) != 0){
+ /* The cache must grow a lot to handle them */
+ fprint(2, "%s: can't handle rune '%C'\n", argv0, r);
+ return L("?");
+ }
+
if(tcscache[r>>8] && tcscache[r>>8][r&0xFF])
return tcscache[r>>8][r&0xFF];
@@ -59,7 +65,7 @@ rune2html(Rune r)
typedef struct Trtab Trtab;
struct Trtab
{
- char t[3];
+ char t[UTFmax];
Rune r;
};
diff --git a/sys/src/cmd/rc/glob.c b/sys/src/cmd/rc/glob.c
index 1c4983e40..295d7b6a6 100644
--- a/sys/src/cmd/rc/glob.c
+++ b/sys/src/cmd/rc/glob.c
@@ -118,18 +118,16 @@ glob(void *ap)
int
equtf(uchar *p, uchar *q)
{
+ Rune pr, qr;
+
if(*p!=*q)
- return 0;
- if(twobyte(*p)) return p[1]==q[1];
- if(threebyte(*p)){
- if(p[1]!=q[1])
- return 0;
- if(p[1]=='\0')
- return 1; /* broken code at end of string! */
- return p[2]==q[2];
- }
- return 1;
+ return 0;
+
+ chartorune(&pr, (char*)p);
+ chartorune(&qr, (char*)q);
+ return pr == qr;
}
+
/*
* Return a pointer to the next utf code in the string,
* not jumping past nuls in broken utf codes!
@@ -138,10 +136,11 @@ equtf(uchar *p, uchar *q)
uchar*
nextutf(uchar *p)
{
- if(twobyte(*p)) return p[1]=='\0'?p+1:p+2;
- if(threebyte(*p)) return p[1]=='\0'?p+1:p[2]=='\0'?p+2:p+3;
- return p+1;
+ Rune dummy;
+
+ return p + chartorune(&dummy, (char*)p);
}
+
/*
* Convert the utf code at *p to a unicode value
*/
@@ -149,14 +148,12 @@ nextutf(uchar *p)
int
unicode(uchar *p)
{
- int u = *p;
+ Rune r;
- if(twobyte(u))
- return ((u&0x1f)<<6)|(p[1]&0x3f);
- if(threebyte(u))
- return (u<<12)|((p[1]&0x3f)<<6)|(p[2]&0x3f);
- return u;
+ chartorune(&r, (char*)p);
+ return r;
}
+
/*
* Does the string s match the pattern p
* . and .. are only matched by patterns starting with .
diff --git a/sys/src/cmd/rc/lex.c b/sys/src/cmd/rc/lex.c
index 369348328..fecd0ec64 100644
--- a/sys/src/cmd/rc/lex.c
+++ b/sys/src/cmd/rc/lex.c
@@ -166,15 +166,25 @@ addtok(char *p, int val)
char*
addutf(char *p, int c)
{
- p = addtok(p, c);
- if(twobyte(c)) /* 2-byte escape */
- return addtok(p, advance());
- if(threebyte(c)){ /* 3-byte escape */
+ uchar b, m;
+ int i;
+
+ p = addtok(p, c); /* 1-byte UTF runes are special */
+ if(onebyte(c))
+ return p;
+
+ m = 0xc0;
+ b = 0x80;
+ for(i=1; i < UTFmax; i++){
+ if((c&m) == b)
+ break;
p = addtok(p, advance());
- return addtok(p, advance());
+ b = m;
+ m = (m >> 1)|0x80;
}
return p;
}
+
int lastdol; /* was the last token read '$' or '$#' or '"'? */
int lastword; /* was the last token read a word or compound word terminator? */
diff --git a/sys/src/cmd/rc/rc.h b/sys/src/cmd/rc/rc.h
index 242a9b5ea..2e1d9ae59 100644
--- a/sys/src/cmd/rc/rc.h
+++ b/sys/src/cmd/rc/rc.h
@@ -123,12 +123,10 @@ int mypid;
*/
#define GLOB ((char)0x01)
/*
- * onebyte(c), twobyte(c), threebyte(c)
- * Is c the first character of a one- two- or three-byte utf sequence?
+ * onebyte(c)
+ * Is c the first character of a one-byte utf sequence?
*/
#define onebyte(c) ((c&0x80)==0x00)
-#define twobyte(c) ((c&0xe0)==0xc0)
-#define threebyte(c) ((c&0xf0)==0xe0)
char **argp;
char **args;
diff --git a/sys/src/cmd/sam/regexp.c b/sys/src/cmd/sam/regexp.c
index 4c655dda3..2bf540636 100644
--- a/sys/src/cmd/sam/regexp.c
+++ b/sys/src/cmd/sam/regexp.c
@@ -9,7 +9,7 @@ typedef struct Inst Inst;
struct Inst
{
- long type; /* < 0x10000 ==> literal, otherwise action */
+ long type; /* <= Runemax ==> literal, otherwise action */
union {
int rsid;
int rsubid;
@@ -46,7 +46,7 @@ struct Ilist
#define NLIST 127
-Ilist *tl, *nl; /* This list, next list */
+Ilist *tl, *nl; /* This list, next list */
Ilist list[2][NLIST+1]; /* +1 for trailing null */
static Rangeset sempty;
@@ -56,25 +56,28 @@ static Rangeset sempty;
* 0x100xx are operators, value == precedence
* 0x200xx are tokens, i.e. operands for operators
*/
-#define OPERATOR 0x10000 /* Bitmask of all operators */
-#define START 0x10000 /* Start, used for marker on stack */
-#define RBRA 0x10001 /* Right bracket, ) */
-#define LBRA 0x10002 /* Left bracket, ( */
-#define OR 0x10003 /* Alternation, | */
-#define CAT 0x10004 /* Concatentation, implicit operator */
-#define STAR 0x10005 /* Closure, * */
-#define PLUS 0x10006 /* a+ == aa* */
-#define QUEST 0x10007 /* a? == a|nothing, i.e. 0 or 1 a's */
-#define ANY 0x20000 /* Any character but newline, . */
-#define NOP 0x20001 /* No operation, internal use only */
-#define BOL 0x20002 /* Beginning of line, ^ */
-#define EOL 0x20003 /* End of line, $ */
-#define CCLASS 0x20004 /* Character class, [] */
-#define NCCLASS 0x20005 /* Negated character class, [^] */
-#define END 0x20077 /* Terminate: match found */
-
-#define ISATOR 0x10000
-#define ISAND 0x20000
+enum {
+ OPERATOR = Runemask+1, /* Bitmask of all operators */
+ START = OPERATOR, /* Start, used for marker on stack */
+ RBRA, /* Right bracket, ) */
+ LBRA, /* Left bracket, ( */
+ OR, /* Alternation, | */
+ CAT, /* Concatentation, implicit operator */
+ STAR, /* Closure, * */
+ PLUS, /* a+ == aa* */
+ QUEST, /* a? == a|nothing, i.e. 0 or 1 a's */
+
+ ANY = OPERATOR<<1, /* Any character but newline, . */
+ NOP, /* No operation, internal use only */
+ BOL, /* Beginning of line, ^ */
+ EOL, /* End of line, $ */
+ CCLASS, /* Character class, [] */
+ NCCLASS, /* Negated character class, [^] */
+ END, /* Terminate: match found */
+
+ ISATOR = OPERATOR,
+ ISAND = OPERATOR<<1,
+};
/*
* Parser Information
@@ -459,7 +462,7 @@ nextrec(void){
exprp++;
return '\n';
}
- return *exprp++|0x10000;
+ return *exprp++|(Runemax+1);
}
return *exprp++;
}
diff --git a/sys/src/cmd/tr.c b/sys/src/cmd/tr.c
index adea05c25..85c76935b 100644
--- a/sys/src/cmd/tr.c
+++ b/sys/src/cmd/tr.c
@@ -15,10 +15,8 @@ uchar bits[] = { 1, 2, 4, 8, 16, 32, 64, 128 };
#define CLEARBIT(a,c) ((a)[(c)/8] &= ~bits[(c)&07])
#define BITSET(a,c) ((a)[(c)/8] & bits[(c)&07])
-#define MAXRUNE Runemax
-
-uchar f[(MAXRUNE+1)/8];
-uchar t[(MAXRUNE+1)/8];
+uchar f[(Runemax+1)/8];
+uchar t[(Runemax+1)/8];
char wbuf[4096];
char *wptr;