diff options
author | cinap_lenrek <cinap_lenrek@gmx.de> | 2012-12-31 21:09:46 +0100 |
---|---|---|
committer | cinap_lenrek <cinap_lenrek@gmx.de> | 2012-12-31 21:09:46 +0100 |
commit | 6cadd03bbeace1c256ba875c2e6a877f924877cd (patch) | |
tree | 8079ea6f6ccdb1c2cbb2b7813f618837617cb33e /sys/src/libc | |
parent | 6d99096136278f06f6333f927da34105a8dfe0bf (diff) |
fix utf and rune handling in preparation for 32bit runes
Diffstat (limited to 'sys/src/libc')
-rw-r--r-- | sys/src/libc/fmt/dofmt.c | 13 | ||||
-rw-r--r-- | sys/src/libc/port/rune.c | 80 |
2 files changed, 69 insertions, 24 deletions
diff --git a/sys/src/libc/fmt/dofmt.c b/sys/src/libc/fmt/dofmt.c index 2a4b42959..95852f5c2 100644 --- a/sys/src/libc/fmt/dofmt.c +++ b/sys/src/libc/fmt/dofmt.c @@ -512,12 +512,15 @@ _flagfmt(Fmt *f) int _badfmt(Fmt *f) { - char x[3]; + char x[2+UTFmax]; + Rune r; + int n; + r = f->r; x[0] = '%'; - x[1] = f->r; - x[2] = '%'; - f->prec = 3; - _fmtcpy(f, x, 3, 3); + n = 1+runetochar(x+1, &r); + x[n++] = '%'; + f->prec = n; + _fmtcpy(f, x, n, n); return 0; } diff --git a/sys/src/libc/port/rune.c b/sys/src/libc/port/rune.c index b62da9e66..d69d59515 100644 --- a/sys/src/libc/port/rune.c +++ b/sys/src/libc/port/rune.c @@ -8,16 +8,19 @@ enum Bit2 = 5, Bit3 = 4, Bit4 = 3, + Bit5 = 2, T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ - Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ - Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ - Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0011 1111 1111 1111 1111 1111 */ Maskx = (1<<Bitx)-1, /* 0011 1111 */ Testx = Maskx ^ 0xFF, /* 1100 0000 */ @@ -28,7 +31,7 @@ enum int chartorune(Rune *rune, char *str) { - int c, c1, c2; + int c, c1, c2, c3; long l; /* @@ -73,6 +76,25 @@ chartorune(Rune *rune, char *str) return 3; } + /* + * four character sequence + * 10000-10FFFF => T4 Tx Tx Tx + */ + if(UTFmax >= 4) { + c3 = *(uchar*)(str+3) ^ Tx; + if(c3 & Testx) + goto bad; + if(c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if(l <= Rune3) + goto bad; + if(l > Runemax) + goto bad; + *rune = l; + return 4; + } + } + /* * bad decoding */ @@ -86,11 +108,14 @@ runetochar(char *str, Rune *rune) { long c; + c = *rune; + if(c > Runemax) + c = Runeerror; + /* * one character sequence * 00000-0007F => 00-7F */ - c = *rune; if(c <= Rune1) { str[0] = c; return 1; @@ -110,17 +135,29 @@ runetochar(char *str, Rune *rune) * three character sequence * 0800-FFFF => T3 Tx Tx */ - str[0] = T3 | (c >> 2*Bitx); - str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); - return 3; + if(c <= Rune3) { + str[0] = T3 | (c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence + * 10000-1FFFFF => T4 Tx Tx Tx + */ + str[0] = T4 | (c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; } int runelen(long c) { Rune rune; - char str[10]; + char str[UTFmax]; rune = c; return runetochar(str, &rune); @@ -140,7 +177,10 @@ runenlen(Rune *r, int nrune) if(c <= Rune2) nb += 2; else + if(c <= Rune3 || c > Runemax) nb += 3; + else + nb += 4; } return nb; } @@ -150,13 +190,15 @@ fullrune(char *str, int n) { int c; - if(n > 0) { - c = *(uchar*)str; - if(c < Tx) - return 1; - if(n > 1) - if(c < T3 || n > 2) - return 1; - } - return 0; + if(n <= 0) + return 0; + c = *(uchar*)str; + if(c < Tx) + return 1; + if(c < T3) + return n >= 2; + if(UTFmax == 3 || c < T4) + return n >= 3; + return n >= 4; } + |