diff options
author | cinap_lenrek <cinap_lenrek@gmx.de> | 2012-12-31 21:09:46 +0100 |
---|---|---|
committer | cinap_lenrek <cinap_lenrek@gmx.de> | 2012-12-31 21:09:46 +0100 |
commit | 6cadd03bbeace1c256ba875c2e6a877f924877cd (patch) | |
tree | 8079ea6f6ccdb1c2cbb2b7813f618837617cb33e /sys/src/cmd | |
parent | 6d99096136278f06f6333f927da34105a8dfe0bf (diff) |
fix utf and rune handling in preparation for 32bit runes
Diffstat (limited to 'sys/src/cmd')
37 files changed, 351 insertions, 181 deletions
diff --git a/sys/src/cmd/1c/swt.c b/sys/src/cmd/1c/swt.c index 66f4d36dc..85622a4df 100644 --- a/sys/src/cmd/1c/swt.c +++ b/sys/src/cmd/1c/swt.c @@ -244,26 +244,26 @@ outstring(char *s, long n) } long -outlstring(ushort *s, long n) +outlstring(Rune *s, long n) { - char buf[2]; - int c; + char buf[sizeof(Rune)]; + int c, i; long r; - while(nstring & 1) + while(nstring % sizeof buf) outstring("", 1); r = nstring; while(n > 0) { c = *s++; if(align(0, types[TCHAR], Aarg1)) { - buf[0] = c>>8; - buf[1] = c; + for(i = sizeof buf; i > 0; c >>= 8) + buf[--i] = c; } else { - buf[0] = c; - buf[1] = c>>8; + for(i = 0; i < sizeof buf; c >>= 8) + buf[i++] = c; } - outstring(buf, 2); - n -= sizeof(ushort); + outstring(buf, sizeof buf); + n -= sizeof buf; } return r; } diff --git a/sys/src/cmd/2c/swt.c b/sys/src/cmd/2c/swt.c index a2a94ea08..ce180d67f 100644 --- a/sys/src/cmd/2c/swt.c +++ b/sys/src/cmd/2c/swt.c @@ -324,26 +324,26 @@ outstring(char *s, long n) } long -outlstring(ushort *s, long n) +outlstring(Rune *s, long n) { - char buf[2]; - int c; + char buf[sizeof(Rune)]; + int c, i; long r; - while(nstring & 1) + while(nstring % sizeof buf) outstring("", 1); r = nstring; while(n > 0) { c = *s++; if(align(0, types[TCHAR], Aarg1)) { - buf[0] = c>>8; - buf[1] = c; + for(i = sizeof buf; i > 0; c >>= 8) + buf[--i] = c; } else { - buf[0] = c; - buf[1] = c>>8; + for(i = 0; i < sizeof buf; c >>= 8) + buf[i++] = c; } - outstring(buf, 2); - n -= sizeof(ushort); + outstring(buf, sizeof buf); + n -= sizeof buf; } return r; } diff --git a/sys/src/cmd/acme/regx.c b/sys/src/cmd/acme/regx.c index f18d395f0..09ad0290b 100644 --- a/sys/src/cmd/acme/regx.c +++ b/sys/src/cmd/acme/regx.c @@ -487,7 +487,7 @@ bldcclass(void) exprp++; /* eat '-' */ if((c2 = nextrec()) == ']') goto Error; - classp[n+0] = 0xFFFF; + classp[n+0] = Runemax; classp[n+1] = c1; classp[n+2] = c2; n += 3; @@ -509,7 +509,7 @@ classmatch(int classno, int c, int negate) p = class[classno]; while(*p){ - if(*p == 0xFFFF){ + if(*p == Runemax){ if(p[1]<=c && c<=p[2]) return !negate; p += 3; diff --git a/sys/src/cmd/auth/convkeys.c b/sys/src/cmd/auth/convkeys.c index 200c5ef21..dcd2c027f 100644 --- a/sys/src/cmd/auth/convkeys.c +++ b/sys/src/cmd/auth/convkeys.c @@ -121,7 +121,7 @@ badname(char *s) for (; *s != '\0'; s += n) { n = chartorune(&r, s); - if (n == 1 && r == Runeerror) + if (r == Runeerror) return 1; } return 0; diff --git a/sys/src/cmd/bitsy/keyboard.c b/sys/src/cmd/bitsy/keyboard.c index 0972bcd75..aaa811436 100644 --- a/sys/src/cmd/bitsy/keyboard.c +++ b/sys/src/cmd/bitsy/keyboard.c @@ -395,7 +395,7 @@ threadmain(int argc, char *argv[]) if(strcmp(args[0], "keyboard:")==0 || strcmp(args[0], "scribble:")==0) if(strcmp(args[1], "value") == 0){ n = atoi(args[2]); - if(n <= 0xFFFF){ + if(n <= Runemax){ r = n; i = runetochar(str, &r); write(kbdfd, str, i); diff --git a/sys/src/cmd/bitsy/prompter.c b/sys/src/cmd/bitsy/prompter.c index 9a76a0d0e..df8e546f5 100644 --- a/sys/src/cmd/bitsy/prompter.c +++ b/sys/src/cmd/bitsy/prompter.c @@ -282,7 +282,7 @@ threadmain(int argc, char *argv[]) n = atoi(args[2]); if(n == '\033') /* Escape exits */ break; - if(n <= 0xFFFF){ + if(n <= Runemax){ r = n; send(kbdctl->c, &r); } diff --git a/sys/src/cmd/cc/cc.h b/sys/src/cmd/cc/cc.h index d66faaa11..01eb04562 100644 --- a/sys/src/cmd/cc/cc.h +++ b/sys/src/cmd/cc/cc.h @@ -51,7 +51,7 @@ struct Node double fconst; /* fp constant */ vlong vconst; /* non fp const */ char* cstring; /* character string */ - ushort* rstring; /* rune string */ + Rune* rstring; /* rune string */ Sym* sym; Type* type; @@ -336,6 +336,8 @@ enum TFILE, TOLD, NALLTYPES, + + TRUNE = sizeof(Rune)==4? TUINT: TUSHORT, }; enum { @@ -740,7 +742,7 @@ void gclean(void); void gextern(Sym*, Node*, long, long); void ginit(void); long outstring(char*, long); -long outlstring(ushort*, long); +long outlstring(Rune*, long); void sextern(Sym*, Node*, long, long); void xcom(Node*); long exreg(Type*); diff --git a/sys/src/cmd/cc/cc.y b/sys/src/cmd/cc/cc.y index 09b788598..eff930b23 100644 --- a/sys/src/cmd/cc/cc.y +++ b/sys/src/cmd/cc/cc.y @@ -855,9 +855,9 @@ lstring: LLSTRING { $$ = new(OLSTRING, Z, Z); - $$->type = typ(TARRAY, types[TUSHORT]); - $$->type->width = $1.l + sizeof(ushort); - $$->rstring = (ushort*)$1.s; + $$->type = typ(TARRAY, types[TRUNE]); + $$->type->width = $1.l + sizeof(Rune); + $$->rstring = (Rune*)$1.s; $$->sym = symstring; $$->etype = TARRAY; $$->class = CSTATIC; @@ -867,16 +867,16 @@ lstring: char *s; int n; - n = $1->type->width - sizeof(ushort); + n = $1->type->width - sizeof(Rune); s = alloc(n+$2.l+MAXALIGN); memcpy(s, $1->rstring, n); memcpy(s+n, $2.s, $2.l); - *(ushort*)(s+n+$2.l) = 0; + *(Rune*)(s+n+$2.l) = 0; $$ = $1; $$->type->width += $2.l; - $$->rstring = (ushort*)s; + $$->rstring = (Rune*)s; } zelist: diff --git a/sys/src/cmd/cc/com.c b/sys/src/cmd/cc/com.c index 8ff7c4663..a957c3acd 100644 --- a/sys/src/cmd/cc/com.c +++ b/sys/src/cmd/cc/com.c @@ -633,10 +633,11 @@ tcomo(Node *n, int f) break; case OLSTRING: - if(n->type->link != types[TUSHORT]) { + if(n->type->link != types[TRUNE]) { o = outstring(0, 0); while(o & 3) { - outlstring(L"", sizeof(ushort)); + Rune str[1] = {0}; + outlstring(str, sizeof(Rune)); o = outlstring(0, 0); } } diff --git a/sys/src/cmd/cc/dpchk.c b/sys/src/cmd/cc/dpchk.c index 99a49ee5a..aa7d468b8 100644 --- a/sys/src/cmd/cc/dpchk.c +++ b/sys/src/cmd/cc/dpchk.c @@ -67,13 +67,14 @@ getflag(char *s) { Bits flag; int f; - char *fmt; + char *fmt, *e; Rune c; fmt = fmtbuf; + e = fmtbuf + sizeof(fmtbuf)-1; flag = zbits; nstar = 0; - for(;;) { + while(fmt < e){ s += chartorune(&c, s); fmt += runetochar(fmt, &c); if(c == 0 || c >= nelem(flagbits)) @@ -175,7 +176,7 @@ pragvararg(void) { Sym *s; int n, c; - char *t; + char *t, *e; Rune r; Type *ty; @@ -225,12 +226,15 @@ cktype: if(c != '"') goto bad; t = fmtbuf; + e = t + sizeof(fmtbuf)-1; for(;;) { r = getr(); if(r == ' ' || r == '\n') goto bad; if(r == '"') break; + if(t >= e) + goto bad; t += runetochar(t, &r); } *t = 0; diff --git a/sys/src/cmd/cc/lex.c b/sys/src/cmd/cc/lex.c index 68a566ba0..48da7339f 100644 --- a/sys/src/cmd/cc/lex.c +++ b/sys/src/cmd/cc/lex.c @@ -467,7 +467,7 @@ l1: yyerror("missing '"); peekc = c1; } - yylval.vval = convvtox(c, TUSHORT); + yylval.vval = convvtox(c, TRUNE); return LUCONST; } if(c == '"') { @@ -541,15 +541,15 @@ l1: c = escchar('"', 1, 0); if(c == EOF) break; - cp = allocn(cp, c1, sizeof(ushort)); - *(ushort*)(cp + c1) = c; - c1 += sizeof(ushort); + cp = allocn(cp, c1, sizeof(Rune)); + *(Rune*)(cp + c1) = c; + c1 += sizeof(Rune); } yylval.sval.l = c1; do { - cp = allocn(cp, c1, sizeof(ushort)); - *(ushort*)(cp + c1) = 0; - c1 += sizeof(ushort); + cp = allocn(cp, c1, sizeof(Rune)); + *(Rune*)(cp + c1) = 0; + c1 += sizeof(Rune); } while(c1 & MAXALIGN); yylval.sval.s = cp; return LLSTRING; @@ -1027,7 +1027,7 @@ getnsc(void) } else c = GETC(); for(;;) { - if(!isspace(c)) + if(c >= Runeself || !isspace(c)) return c; if(c == '\n') { lineno++; diff --git a/sys/src/cmd/cc/pswt.c b/sys/src/cmd/cc/pswt.c index df1cda4a4..1eb495595 100644 --- a/sys/src/cmd/cc/pswt.c +++ b/sys/src/cmd/cc/pswt.c @@ -132,28 +132,28 @@ casf(void) } long -outlstring(ushort *s, long n) +outlstring(Rune *s, long n) { - char buf[2]; - int c; + char buf[sizeof(Rune)]; + int c, i; long r; if(suppress) return nstring; - while(nstring & 1) + while(nstring % sizeof buf) outstring("", 1); r = nstring; while(n > 0) { c = *s++; if(align(0, types[TCHAR], Aarg1)) { - buf[0] = c>>8; - buf[1] = c; + for(i = sizeof buf; i > 0; c >>= 8) + buf[--i] = c; } else { - buf[0] = c; - buf[1] = c>>8; + for(i = 0; i < sizeof buf; c >>= 8) + buf[i++] = c; } - outstring(buf, 2); - n -= sizeof(ushort); + outstring(buf, sizeof buf); + n -= sizeof buf; } return r; } diff --git a/sys/src/cmd/disk/9660/cdrdwr.c b/sys/src/cmd/disk/9660/cdrdwr.c index 36e849377..b80195100 100644 --- a/sys/src/cmd/disk/9660/cdrdwr.c +++ b/sys/src/cmd/disk/9660/cdrdwr.c @@ -503,7 +503,6 @@ Cputrscvt(Cdimg *cd, char *s, int size) { Rune r[256]; - strtorune(r, s); Cputrs(cd, strtorune(r, s), size); } diff --git a/sys/src/cmd/disk/9660/jchar.c b/sys/src/cmd/disk/9660/jchar.c index c49da6351..9836b610c 100644 --- a/sys/src/cmd/disk/9660/jchar.c +++ b/sys/src/cmd/disk/9660/jchar.c @@ -45,8 +45,7 @@ isbadjoliet(char *s) if(utflen(s) > 64) return 1; - strtorune(r, s); - for(p=r; *p; p++) + for(p=strtorune(r, s); *p; p++) if(isjolietfrog(*p)) return 1; return 0; diff --git a/sys/src/cmd/ed.c b/sys/src/cmd/ed.c index 9864dd3bf..0f18fadc0 100644 --- a/sys/src/cmd/ed.c +++ b/sys/src/cmd/ed.c @@ -54,7 +54,7 @@ Reprog *pattern; int peekc; int pflag; int rescuing; -Rune rhsbuf[LBSIZE/2]; +Rune rhsbuf[LBSIZE/sizeof(Rune)]; char savedfile[FNSIZE]; jmp_buf savej; int subnewa; @@ -990,11 +990,11 @@ getline(int tl) lp = linebuf; bp = getblock(tl, OREAD); nl = nleft; - tl &= ~((BLKSIZE/2) - 1); + tl &= ~((BLKSIZE/sizeof(Rune)) - 1); while(*lp++ = *bp++) { nl -= sizeof(Rune); if(nl == 0) { - bp = getblock(tl += BLKSIZE/2, OREAD); + bp = getblock(tl += BLKSIZE/sizeof(Rune), OREAD); nl = nleft; } } @@ -1012,7 +1012,7 @@ putline(void) tl = tline; bp = getblock(tl, OWRITE); nl = nleft; - tl &= ~((BLKSIZE/2)-1); + tl &= ~((BLKSIZE/sizeof(Rune))-1); while(*bp = *lp++) { if(*bp++ == '\n') { bp[-1] = 0; @@ -1021,7 +1021,7 @@ putline(void) } nl -= sizeof(Rune); if(nl == 0) { - tl += BLKSIZE/2; + tl += BLKSIZE/sizeof(Rune); bp = getblock(tl, OWRITE); nl = nleft; } @@ -1048,8 +1048,8 @@ getblock(int atl, int iof) static uchar ibuff[BLKSIZE]; static uchar obuff[BLKSIZE]; - bno = atl / (BLKSIZE/2); - off = (atl<<1) & (BLKSIZE-1) & ~03; + bno = atl / (BLKSIZE/sizeof(Rune)); + off = (atl*sizeof(Rune)) & (BLKSIZE-1) & ~03; if(bno >= NBLK) { lastc = '\n'; error(T); @@ -1240,7 +1240,7 @@ compsub(void) if(c == '\\') { c = getchr(); *p++ = ESCFLG; - if(p >= &rhsbuf[LBSIZE/2]) + if(p >= &rhsbuf[nelem(rhsbuf)]) error(Q); } else if(c == '\n' && (!globp || !globp[0])) { @@ -1251,7 +1251,7 @@ compsub(void) if(c == seof) break; *p++ = c; - if(p >= &rhsbuf[LBSIZE/2]) + if(p >= &rhsbuf[nelem(rhsbuf)]) error(Q); } *p = 0; diff --git a/sys/src/cmd/file.c b/sys/src/cmd/file.c index dec241f96..cbc2227d8 100644 --- a/sys/src/cmd/file.c +++ b/sys/src/cmd/file.c @@ -359,7 +359,7 @@ utfconv(void) rb = malloc(nbuf+1); memmove(rb, buf+2, nbuf); p = (char*)buf; - e = p+nbuf-4; + e = p+sizeof(buf)-UTFmax-1; for(i=0; i<nbuf && p < e; i+=2){ r = rb[i+1] | rb[i]<<8; p += runetochar(p, &r); @@ -376,7 +376,7 @@ utfconv(void) rb = malloc(nbuf+1); memmove(rb, buf+2, nbuf); p = (char*)buf; - e = p+nbuf-4; + e = p+sizeof(buf)-UTFmax-1; for(i=0; i<nbuf && p < e; i+=2){ r = rb[i] | rb[i+1]<<8; p += runetochar(p, &r); diff --git a/sys/src/cmd/ip/ftpfs/proto.c b/sys/src/cmd/ip/ftpfs/proto.c index 0e4737952..d96ac036f 100644 --- a/sys/src/cmd/ip/ftpfs/proto.c +++ b/sys/src/cmd/ip/ftpfs/proto.c @@ -1525,7 +1525,7 @@ fromlatin1(char *from) if(*p == 0) return nil; - to = malloc(3*strlen(from)+2); + to = malloc(UTFmax*strlen(from)+2); if(to == nil) return nil; for(p = to; *from; from++){ diff --git a/sys/src/cmd/ip/httpd/wikipost.c b/sys/src/cmd/ip/httpd/wikipost.c index 31fbedda1..6d9279c47 100644 --- a/sys/src/cmd/ip/httpd/wikipost.c +++ b/sys/src/cmd/ip/httpd/wikipost.c @@ -59,7 +59,7 @@ _urlunesc(char *s) t = v; while(*s){ /* in decoding error, assume latin1 */ - if((n=chartorune(&r, s)) == 1 && r == 0x80) + if((n=chartorune(&r, s)) == 1 && r == Runeerror) r = *s; s += n; t += runetochar(t, &r); diff --git a/sys/src/cmd/join.c b/sys/src/cmd/join.c index 5a527a4d4..4c85b0886 100644 --- a/sys/src/cmd/join.c +++ b/sys/src/cmd/join.c @@ -286,7 +286,7 @@ output(int on1, int on2) /* print items from olist */ { int i; Rune *temp; - char buf[BUFSIZ]; + char buf[BUFSIZ*UTFmax+1]; if (no <= 0) { /* default case */ printf("%s", runetostr(buf, on1? ppi[F1][j1]: ppi[F2][j2])); diff --git a/sys/src/cmd/postscript/common/rune.c b/sys/src/cmd/postscript/common/rune.c index 01ee6ba81..4227d84bb 100644 --- a/sys/src/cmd/postscript/common/rune.c +++ b/sys/src/cmd/postscript/common/rune.c @@ -7,16 +7,19 @@ enum Bit2 = 5, Bit3 = 4, Bit4 = 3, + Bit5 = 2, T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ - Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ - Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ - Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0011 1111 1111 1111 1111 1111 */ Maskx = (1<<Bitx)-1, /* 0011 1111 */ Testx = Maskx ^ 0xFF, /* 1100 0000 */ @@ -27,7 +30,7 @@ enum int chartorune(Rune *rune, char *str) { - int c, c1, c2; + int c, c1, c2, c3; long l; /* @@ -72,6 +75,25 @@ chartorune(Rune *rune, char *str) return 3; } + /* + * four character sequence + * 10000-10FFFF => T4 Tx Tx Tx + */ + if(UTFmax >= 4) { + c3 = *(unsigned char*)(str+3) ^ Tx; + if(c3 & Testx) + goto bad; + if(c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if(l <= Rune3) + goto bad; + if(l > Runemax) + goto bad; + *rune = l; + return 4; + } + } + /* * bad decoding */ @@ -85,11 +107,14 @@ runetochar(char *str, Rune *rune) { long c; + c = *rune; + if(c > Runemax) + c = Runeerror; + /* * one character sequence * 00000-0007F => 00-7F */ - c = *rune; if(c <= Rune1) { str[0] = c; return 1; @@ -109,34 +134,70 @@ runetochar(char *str, Rune *rune) * three character sequence * 0800-FFFF => T3 Tx Tx */ - str[0] = T3 | (c >> 2*Bitx); - str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); - return 3; + if(c <= Rune3) { + str[0] = T3 | (c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence + * 10000-1FFFFF => T4 Tx Tx Tx + */ + str[0] = T4 | (c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; } int runelen(long c) { Rune rune; - char str[10]; + char str[UTFmax]; rune = c; return runetochar(str, &rune); } int +runenlen(Rune *r, int nrune) +{ + int nb, c; + + nb = 0; + while(nrune--) { + c = *r++; + if(c <= Rune1) + nb++; + else + if(c <= Rune2) + nb += 2; + else + if(c <= Rune3 || c > Runemax) + nb += 3; + else + nb += 4; + } + return nb; +} + +int fullrune(char *str, int n) { int c; - if(n > 0) { - c = *(unsigned char*)str; - if(c < Tx) - return 1; - if(n > 1) - if(c < T3 || n > 2) - return 1; - } - return 0; + if(n <= 0) + return 0; + c = *(unsigned char*)str; + if(c < Tx) + return 1; + if(c < T3) + return n >= 2; + if(UTFmax == 3 || c < T4) + return n >= 3; + return n >= 4; } + diff --git a/sys/src/cmd/postscript/common/rune.h b/sys/src/cmd/postscript/common/rune.h index 9c1fd4fd0..84301a8ea 100644 --- a/sys/src/cmd/postscript/common/rune.h +++ b/sys/src/cmd/postscript/common/rune.h @@ -14,6 +14,7 @@ enum UTFmax = 3, /* maximum bytes per rune */ Runesync = 0x80, /* cannot represent part of a utf sequence (<) */ Runeself = 0x80, /* rune and utf sequences are the same (<) */ - Runeerror = 0xFFFD, /* decoding error in utf */ + Runeerror = 0xFFFD, /* decoding error in utf */ + Runemax = 0xFFFF, /* 16 bit rune */ }; #endif diff --git a/sys/src/cmd/sam/cmd.c b/sys/src/cmd/sam/cmd.c index d34333d18..a0e336f01 100644 --- a/sys/src/cmd/sam/cmd.c +++ b/sys/src/cmd/sam/cmd.c @@ -71,7 +71,7 @@ int inputc(void) { int n, nbuf; - char buf[3]; + char buf[UTFmax]; Rune r; Again: diff --git a/sys/src/cmd/sam/regexp.c b/sys/src/cmd/sam/regexp.c index 3fd05a0b5..4c655dda3 100644 --- a/sys/src/cmd/sam/regexp.c +++ b/sys/src/cmd/sam/regexp.c @@ -494,7 +494,7 @@ bldcclass(void) exprp++; /* eat '-' */ if((c2 = nextrec()) == ']') goto Error; - classp[n+0] = 0xFFFF; + classp[n+0] = Runemax; classp[n+1] = c1; classp[n+2] = c2; n += 3; @@ -516,7 +516,7 @@ classmatch(int classno, int c, int negate) p = class[classno]; while(*p){ - if(*p == 0xFFFF){ + if(*p == Runemax){ if(p[1]<=c && c<=p[2]) return !negate; p += 3; diff --git a/sys/src/cmd/samterm/mesg.c b/sys/src/cmd/samterm/mesg.c index be306a0f6..99831a9e2 100644 --- a/sys/src/cmd/samterm/mesg.c +++ b/sys/src/cmd/samterm/mesg.c @@ -429,7 +429,7 @@ outTv(Tmesg type, vlong v1) void outTslS(Tmesg type, int s1, long l1, Rune *s) { - char buf[DATASIZE*3+1]; + char buf[DATASIZE*UTFmax+1]; char *c; outstart(type); diff --git a/sys/src/cmd/sed.c b/sys/src/cmd/sed.c index 96c3eb493..790f2ed51 100644 --- a/sys/src/cmd/sed.c +++ b/sys/src/cmd/sed.c @@ -625,7 +625,7 @@ compsub(Rune *rhs, Rune *end) while ((r = *cp++) != '\0') { if(r == '\\') { if (rhs < end) - *rhs++ = 0xFFFF; + *rhs++ = Runemax; else return 0; r = *cp++; @@ -1055,7 +1055,7 @@ dosub(Rune *rhsbuf) sp = place(sp, loc1, loc2); continue; } - if (c == 0xFFFF && (c = *rp++) >= '1' && c < MAXSUB + '0') { + if (c == Runemax && (c = *rp++) >= '1' && c < MAXSUB + '0') { n = c-'0'; if (subexp[n].rsp && subexp[n].rep) { sp = place(sp, subexp[n].rsp, subexp[n].rep); @@ -1336,7 +1336,7 @@ void arout(void) { int c; - char *s; + char *s, *e; char buf[128]; Rune *p1; Biobuf *fi; @@ -1347,7 +1347,7 @@ arout(void) Bputrune(&fout, *p1); Bputc(&fout, '\n'); } else { - for(s = buf, p1 = (*aptr)->text; *p1; p1++) + for(s = buf, e = buf+sizeof(buf)-UTFmax-1, p1 = (*aptr)->text; *p1 && s < e; p1++) s += runetochar(s, p1); *s = '\0'; if((fi = Bopen(buf, OREAD)) == 0) diff --git a/sys/src/cmd/tcs/utf.c b/sys/src/cmd/tcs/utf.c index 56e91890a..764ef9f7b 100644 --- a/sys/src/cmd/tcs/utf.c +++ b/sys/src/cmd/tcs/utf.c @@ -93,7 +93,7 @@ isoutf_in(int fd, long *notused, struct convert *out) if(!fullisorune(buf+i, tot-i)) break; c = isochartorune(&runes[j], buf+i); - if(runes[j] == Runeerror && c == 1){ + if(runes[j] == Runeerror){ if(squawk) EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i); if(clean){ diff --git a/sys/src/cmd/tr.c b/sys/src/cmd/tr.c index da6fedf5f..adea05c25 100644 --- a/sys/src/cmd/tr.c +++ b/sys/src/cmd/tr.c @@ -15,7 +15,7 @@ uchar bits[] = { 1, 2, 4, 8, 16, 32, 64, 128 }; #define CLEARBIT(a,c) ((a)[(c)/8] &= ~bits[(c)&07]) #define BITSET(a,c) ((a)[(c)/8] & bits[(c)&07]) -#define MAXRUNE 0xFFFF +#define MAXRUNE Runemax uchar f[(MAXRUNE+1)/8]; uchar t[(MAXRUNE+1)/8]; diff --git a/sys/src/cmd/tweak.c b/sys/src/cmd/tweak.c index 54ce2f678..ef4256889 100644 --- a/sys/src/cmd/tweak.c +++ b/sys/src/cmd/tweak.c @@ -803,13 +803,14 @@ attext(Thing *t, Point p, char *buf) } int -type(char *buf, char *tag) +type(char *buf, int nbuf, char *tag) { Rune r; - char *p; + char *p, *e; esetcursor(&busy); p = buf; + e = buf + nbuf-UTFmax-1; for(;;){ *p = 0; mesg("%s: %s", tag, buf); @@ -827,7 +828,8 @@ type(char *buf, char *tag) --p; break; default: - p += runetochar(p, &r); + if(p < e) + p += runetochar(p, &r); } } } @@ -846,7 +848,7 @@ textedit(Thing *t, char *tag) Thing *nt; buttons(Up); - if(type(buf, tag) == 0) + if(type(buf, sizeof(buf), tag) == 0) return; if(strcmp(tag, "file") == 0){ for(s=buf; *s; s++) @@ -1174,7 +1176,7 @@ cntledit(char *tag) long l; buttons(Up); - if(type(buf, tag) == 0) + if(type(buf, sizeof(buf), tag) == 0) return; if(strcmp(tag, "mag") == 0){ if(buf[0]<'0' || '9'<buf[0] || (l=atoi(buf))<=0 || l>Maxmag){ @@ -1806,7 +1808,7 @@ tchar(Thing *t) return; } } - if(type(buf, "char (hex or character or hex-hex)") == 0) + if(type(buf, sizeof(buf), "char (hex or character or hex-hex)") == 0) return; if(utflen(buf) == 1){ chartorune(&r, buf); @@ -2000,7 +2002,7 @@ menu(void) sel = emenuhit(3, &mouse, &menu3); switch(sel){ case Mopen: - if(type(buf, "file")){ + if(type(buf, sizeof(buf), "file")){ t = tget(buf); if(t) drawthing(t, 1); diff --git a/sys/src/cmd/unicode.c b/sys/src/cmd/unicode.c index a04472711..aec44b750 100644 --- a/sys/src/cmd/unicode.c +++ b/sys/src/cmd/unicode.c @@ -51,13 +51,13 @@ range(char *argv[]) return "bad range"; } min = strtoul(q, &q, 16); - if(min<0 || min>0xFFFF || *q!='-') + if(min<0 || min>Runemax || *q!='-') goto err; q++; if(strchr(hex, *q) == 0) goto err; max = strtoul(q, &q, 16); - if(max<0 || max>0xFFFF || max<min || *q!=0) + if(max<0 || max>Runemax || max<min || *q!=0) goto err; i = 0; do{ @@ -111,7 +111,7 @@ chars(char *argv[]) return "bad char"; } m = strtoul(q, &q, 16); - if(m<0 || m>0xFFFF || *q!=0) + if(m<0 || m>Runemax || *q!=0) goto err; Bprint(&bout, "%C", m); if(!text) diff --git a/sys/src/cmd/unix/drawterm/libc/dofmt.c b/sys/src/cmd/unix/drawterm/libc/dofmt.c index 826360d0a..f905c0ad8 100644 --- a/sys/src/cmd/unix/drawterm/libc/dofmt.c +++ b/sys/src/cmd/unix/drawterm/libc/dofmt.c @@ -528,12 +528,15 @@ __flagfmt(Fmt *f) int __badfmt(Fmt *f) { - char x[3]; + char x[2+UTFmax]; + Rune r; + int n; + r = f->r; x[0] = '%'; - x[1] = f->r; - x[2] = '%'; - f->prec = 3; - __fmtcpy(f, (const void*)x, 3, 3); + n = 1+runetochar(x+1, &r); + x[n++] = '%'; + f->prec = n; + _fmtcpy(f, x, n, n); return 0; } diff --git a/sys/src/cmd/unix/drawterm/libc/rune.c b/sys/src/cmd/unix/drawterm/libc/rune.c index b62da9e66..0bb49a745 100644 --- a/sys/src/cmd/unix/drawterm/libc/rune.c +++ b/sys/src/cmd/unix/drawterm/libc/rune.c @@ -8,16 +8,19 @@ enum Bit2 = 5, Bit3 = 4, Bit4 = 3, + Bit5 = 2, T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ - Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ - Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ - Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0011 1111 1111 1111 1111 1111 */ Maskx = (1<<Bitx)-1, /* 0011 1111 */ Testx = Maskx ^ 0xFF, /* 1100 0000 */ @@ -28,7 +31,7 @@ enum int chartorune(Rune *rune, char *str) { - int c, c1, c2; + int c, c1, c2, c3; long l; /* @@ -73,6 +76,25 @@ chartorune(Rune *rune, char *str) return 3; } + /* + * four character sequence + * 10000-10FFFF => T4 Tx Tx Tx + */ + if(UTFmax >= 4) { + c3 = *(uchar*)(str+3) ^ Tx; + if(c3 & Testx) + goto bad; + if(c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if(l <= Rune3) + goto bad; + if(l > Runemax) + goto bad; + *rune = l; + return 4; + } + } + /* * bad decoding */ @@ -86,11 +108,14 @@ runetochar(char *str, Rune *rune) { long c; + c = *rune; + if(c > Runemax) + c = Runeerror; + /* * one character sequence * 00000-0007F => 00-7F */ - c = *rune; if(c <= Rune1) { str[0] = c; return 1; @@ -110,10 +135,22 @@ runetochar(char *str, Rune *rune) * three character sequence * 0800-FFFF => T3 Tx Tx */ - str[0] = T3 | (c >> 2*Bitx); - str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); - return 3; + if(c <= Rune3) { + str[0] = T3 | (c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence + * 10000-1FFFFF => T4 Tx Tx Tx + */ + str[0] = T4 | (c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; } int @@ -140,7 +177,10 @@ runenlen(Rune *r, int nrune) if(c <= Rune2) nb += 2; else + if(c <= Rune3 || c > Runemax) nb += 3; + else + nb += 4; } return nb; } @@ -150,13 +190,15 @@ fullrune(char *str, int n) { int c; - if(n > 0) { - c = *(uchar*)str; - if(c < Tx) - return 1; - if(n > 1) - if(c < T3 || n > 2) - return 1; - } - return 0; + if(n <= 0) + return 0; + c = *(uchar*)str; + if(c < Tx) + return 1; + if(c < T3) + return n >= 2; + if(UTFmax == 3 || c < T4) + return n >= 3; + return n >= 4; } + diff --git a/sys/src/cmd/unix/drawterm/libc/utf.h b/sys/src/cmd/unix/drawterm/libc/utf.h index 623bfda94..f7c3ebd83 100644 --- a/sys/src/cmd/unix/drawterm/libc/utf.h +++ b/sys/src/cmd/unix/drawterm/libc/utf.h @@ -8,7 +8,8 @@ enum UTFmax = 3, /* maximum bytes per rune */ Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ Runeself = 0x80, /* rune and UTF sequences are the same (<) */ - Runeerror = 0x80, /* decoding error in UTF */ + Runeerror = 0xFFFD, /* decoding error in UTF */ + Runemax = 0xFFFF, /* 16 bit rune */ }; /* diff --git a/sys/src/cmd/unix/u9fs/rune.c b/sys/src/cmd/unix/u9fs/rune.c index a0822d625..b8f73ba94 100644 --- a/sys/src/cmd/unix/u9fs/rune.c +++ b/sys/src/cmd/unix/u9fs/rune.c @@ -1,6 +1,7 @@ #include <plan9.h> char *argv0; + enum { Bit1 = 7, @@ -8,27 +9,30 @@ enum Bit2 = 5, Bit3 = 4, Bit4 = 3, + Bit5 = 2, T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ - Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ - Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ - Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0011 1111 1111 1111 1111 1111 */ Maskx = (1<<Bitx)-1, /* 0011 1111 */ Testx = Maskx ^ 0xFF, /* 1100 0000 */ - Bad = Runeerror + Bad = Runeerror, }; int chartorune(Rune *rune, char *str) { - int c, c1, c2; + int c, c1, c2, c3; long l; /* @@ -73,6 +77,25 @@ chartorune(Rune *rune, char *str) return 3; } + /* + * four character sequence + * 10000-10FFFF => T4 Tx Tx Tx + */ + if(UTFmax >= 4) { + c3 = *(uchar*)(str+3) ^ Tx; + if(c3 & Testx) + goto bad; + if(c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if(l <= Rune3) + goto bad; + if(l > Runemax) + goto bad; + *rune = l; + return 4; + } + } + /* * bad decoding */ @@ -86,11 +109,14 @@ runetochar(char *str, Rune *rune) { long c; + c = *rune; + if(c > Runemax) + c = Runeerror; + /* * one character sequence * 00000-0007F => 00-7F */ - c = *rune; if(c <= Rune1) { str[0] = c; return 1; @@ -110,39 +136,70 @@ runetochar(char *str, Rune *rune) * three character sequence * 0800-FFFF => T3 Tx Tx */ - str[0] = T3 | (c >> 2*Bitx); - str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); - return 3; + if(c <= Rune3) { + str[0] = T3 | (c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence + * 10000-1FFFFF => T4 Tx Tx Tx + */ + str[0] = T4 | (c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; } int runelen(long c) { Rune rune; - char str[10]; + char str[UTFmax]; rune = c; return runetochar(str, &rune); } int -utflen(char *s) +runenlen(Rune *r, int nrune) +{ + int nb, c; + + nb = 0; + while(nrune--) { + c = *r++; + if(c <= Rune1) + nb++; + else + if(c <= Rune2) + nb += 2; + else + if(c <= Rune3 || c > Runemax) + nb += 3; + else + nb += 4; + } + return nb; +} + +int +fullrune(char *str, int n) { int c; - long n; - Rune rune; - n = 0; - for(;;) { - c = *(uchar*)s; - if(c < Runeself) { - if(c == 0) - return n; - s++; - } else - s += chartorune(&rune, s); - n++; - } - return 0; + if(n <= 0) + return 0; + c = *(uchar*)str; + if(c < Tx) + return 1; + if(c < T3) + return n >= 2; + if(UTFmax == 3 || c < T4) + return n >= 3; + return n >= 4; } + diff --git a/sys/src/cmd/upas/fs/mbox.c b/sys/src/cmd/upas/fs/mbox.c index 71dab3d0c..21f786f0c 100644 --- a/sys/src/cmd/upas/fs/mbox.c +++ b/sys/src/cmd/upas/fs/mbox.c @@ -1223,12 +1223,12 @@ latin1toutf(char **out, char *in, char *e) return 0; n += e-in; - *out = p = malloc(n+1); + *out = p = malloc(UTFmax*n+1); if(p == nil) return 0; for(; in < e; in++){ - r = (uchar)*in; + r = (*in) & 0xff; p += runetochar(p, &r); } *p = 0; diff --git a/sys/src/cmd/upas/vf/vf.c b/sys/src/cmd/upas/vf/vf.c index 376488f7e..d73fbe47f 100644 --- a/sys/src/cmd/upas/vf/vf.c +++ b/sys/src/cmd/upas/vf/vf.c @@ -954,7 +954,7 @@ tokenconvert(String *t) { String *s; char decoded[1024]; - char utfbuf[2*1024]; + char utfbuf[UTFmax*1024]; int i, len; char *e; char *token; diff --git a/sys/src/cmd/vnc/screen.c b/sys/src/cmd/vnc/screen.c index e11155ac9..ce8a7e818 100644 --- a/sys/src/cmd/vnc/screen.c +++ b/sys/src/cmd/vnc/screen.c @@ -335,6 +335,8 @@ screenputc(char *buf) addflush(r); curpos.x = *xp; break; + case '\0': + break; default: p = memsubfontwidth(memdefont, buf); w = p.x; @@ -354,23 +356,19 @@ screenputc(char *buf) void screenputs(char *s, int n) { - int i; - Rune r; - char buf[4]; + static char rb[UTFmax+1]; + static int nrb; + char *e; drawlock(); - while(n > 0){ - i = chartorune(&r, s); - if(i == 0){ - s++; - --n; - continue; + e = s + n; + while(s < e){ + rb[nrb++] = *s++; + if(nrb >= UTFmax || fullrune(rb, nrb)){ + rb[nrb] = 0; + screenputc(rb); + nrb = 0; } - memmove(buf, s, i); - buf[i] = 0; - n -= i; - s += i; - screenputc(buf); } screenflush(); drawunlock(); diff --git a/sys/src/cmd/yacc.c b/sys/src/cmd/yacc.c index 7aef1a281..b25594df9 100644 --- a/sys/src/cmd/yacc.c +++ b/sys/src/cmd/yacc.c @@ -141,7 +141,7 @@ Biobuf* foutput; /* y.output file */ char* infile; /* input file name */ int numbval; /* value of an input number */ -char tokname[NAMESIZE+4]; /* input token name, slop for runes and 0 */ +char tokname[NAMESIZE+UTFmax+1]; /* input token name, slop for runes and 0 */ /* structure declarations */ |