diff options
author | cinap_lenrek <cinap_lenrek@gmx.de> | 2012-12-31 21:09:46 +0100 |
---|---|---|
committer | cinap_lenrek <cinap_lenrek@gmx.de> | 2012-12-31 21:09:46 +0100 |
commit | 6cadd03bbeace1c256ba875c2e6a877f924877cd (patch) | |
tree | 8079ea6f6ccdb1c2cbb2b7813f618837617cb33e /sys/src/ape/lib/ap | |
parent | 6d99096136278f06f6333f927da34105a8dfe0bf (diff) |
fix utf and rune handling in preparation for 32bit runes
Diffstat (limited to 'sys/src/ape/lib/ap')
-rw-r--r-- | sys/src/ape/lib/ap/gen/mbwc.c | 67 |
1 file changed, 48 insertions, 19 deletions
diff --git a/sys/src/ape/lib/ap/gen/mbwc.c b/sys/src/ape/lib/ap/gen/mbwc.c
index 66a982193..416ab03cf 100644
--- a/sys/src/ape/lib/ap/gen/mbwc.c
+++ b/sys/src/ape/lib/ap/gen/mbwc.c
@@ -1,4 +1,5 @@
 #include <stdlib.h>
+#include <utf.h>
 
 /*
  * Use the FSS-UTF transformation proposed by posix.
@@ -7,12 +8,14 @@
  *	Tx	10xxxxxx	6 free bits
  *	T1	110xxxxx	5 free bits
  *	T2	1110xxxx	4 free bits
+ *	T3	11110xxx	3 free bits
  *
  *	Encoding is as follows.
  *	From hex	Thru hex	Sequence		Bits
  *	00000000	0000007F	T0			7
  *	00000080	000007FF	T1 Tx			11
  *	00000800	0000FFFF	T2 Tx Tx		16
+ *	00010000	0010FFFF	T3 Tx Tx Tx		20 (and change)
  */
 
 int
@@ -25,7 +28,7 @@ mblen(const char *s, size_t n)
 int
 mbtowc(wchar_t *pwc, const char *s, size_t n)
 {
-	int c, c1, c2;
+	int c, c1, c2, c3;
 	long l;
 
 	if(!s)
@@ -70,7 +73,25 @@ mbtowc(wchar_t *pwc, const char *s, size_t n)
 		return 3;
 	}
 
-	/*
+	if(n < 4)
+		goto bad;
+	if(UTFmax >= 4) {
+		c3 = (s[3] ^ 0x80) & 0xff;
+		if(c3 & 0xC0)
+			goto bad;
+		if(c < 0xf8) {
+			l = ((((((c << 6) | c1) << 6) | c2) << 6) | c3) & 0x3fffff;
+			if(l <= 0x10000)
+				goto bad;
+			if(l > Runemax)
+				goto bad;
+			if(pwc)
+				*pwc = l;
+			return 4;
+		}
+	}
+
+	/*
 	 * bad decoding
 	 */
 bad:
@@ -86,7 +107,10 @@ wctomb(char *s, wchar_t wchar)
 	if(!s)
 		return 0;
 
-	c = wchar & 0xFFFF;
+	c = wchar;
+	if(c > Runemax)
+		c = Runeerror;
+
 	if(c < 0x80) {
 		s[0] = c;
 		return 1;
@@ -98,10 +122,18 @@ wctomb(char *s, wchar_t wchar)
 		return 2;
 	}
 
-	s[0] = 0xE0 | (c >> 12);
-	s[1] = 0x80 | ((c >> 6) & 0x3F);
-	s[2] = 0x80 | (c & 0x3F);
-	return 3;
+	if(c < 0x10000) {
+		s[0] = 0xE0 | (c >> 12);
+		s[1] = 0x80 | ((c >> 6) & 0x3F);
+		s[2] = 0x80 | (c & 0x3F);
+		return 3;
+	}
+
+	s[0] = 0xf0 | c >> 18;
+	s[1] = 0x80 | (c >> 12) & 0x3F;
+	s[2] = 0x80 | (c >> 6) & 0x3F;
+	s[3] = 0x80 | (c & 0x3F);
+	return 4;
 }
 
 size_t
@@ -117,7 +149,7 @@ mbstowcs(wchar_t *pwcs, const char *s, size_t n)
 				break;
 			s++;
 		} else {
-			d = mbtowc(pwcs, s, 3);
+			d = mbtowc(pwcs, s, UTFmax);
 			if(d <= 0)
 				return (size_t)((d<0) ? -1 : i);
 			s += d;
@@ -133,10 +165,10 @@ wcstombs(char *s, const wchar_t *pwcs, size_t n)
 	int i, d;
 	long c;
 	char *p, *pe;
-	char buf[3];
+	char buf[UTFmax];
 
 	p = s;
-	pe = p+n-3;
+	pe = p+n-UTFmax;
 	while(p < pe) {
 		c = *pwcs++;
 		if(c < 0x80)
@@ -146,17 +178,14 @@ wcstombs(char *s, const wchar_t *pwcs, size_t n)
 		if(c == 0)
 			return p-s;
 	}
-	while(p < pe+3) {
+	while(p < pe+UTFmax) {
 		c = *pwcs++;
 		d = wctomb(buf, c);
-		if(p+d <= pe+3) {
-			*p++ = buf[0];
-			if(d > 1) {
-				*p++ = buf[2];
-				if(d > 2)
-					*p++ = buf[3];
-			}
-		}
+		if(p+d <= pe+UTFmax) {
+			for(i = 0; i < d; i++)
+				p[i] = buf[i];
+			p += d;
+		}
 		if(c == 0)
 			break;
 	}