diff options
author | cinap_lenrek <cinap_lenrek@gmx.de> | 2012-12-31 21:09:46 +0100 |
---|---|---|
committer | cinap_lenrek <cinap_lenrek@gmx.de> | 2012-12-31 21:09:46 +0100 |
commit | 6cadd03bbeace1c256ba875c2e6a877f924877cd (patch) | |
tree | 8079ea6f6ccdb1c2cbb2b7813f618837617cb33e /sys/src/cmd/postscript | |
parent | 6d99096136278f06f6333f927da34105a8dfe0bf (diff) |
fix utf and rune handling in preparation for 32bit runes
Diffstat (limited to 'sys/src/cmd/postscript')
-rw-r--r-- | sys/src/cmd/postscript/common/rune.c | 99 | ||||
-rw-r--r-- | sys/src/cmd/postscript/common/rune.h | 3 |
2 files changed, 82 insertions, 20 deletions
diff --git a/sys/src/cmd/postscript/common/rune.c b/sys/src/cmd/postscript/common/rune.c index 01ee6ba81..4227d84bb 100644 --- a/sys/src/cmd/postscript/common/rune.c +++ b/sys/src/cmd/postscript/common/rune.c @@ -7,16 +7,19 @@ enum Bit2 = 5, Bit3 = 4, Bit4 = 3, + Bit5 = 2, T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ - Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ - Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ - Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0011 1111 1111 1111 1111 1111 */ Maskx = (1<<Bitx)-1, /* 0011 1111 */ Testx = Maskx ^ 0xFF, /* 1100 0000 */ @@ -27,7 +30,7 @@ enum int chartorune(Rune *rune, char *str) { - int c, c1, c2; + int c, c1, c2, c3; long l; /* @@ -72,6 +75,25 @@ chartorune(Rune *rune, char *str) return 3; } + /* + * four character sequence + * 10000-10FFFF => T4 Tx Tx Tx + */ + if(UTFmax >= 4) { + c3 = *(unsigned char*)(str+3) ^ Tx; + if(c3 & Testx) + goto bad; + if(c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if(l <= Rune3) + goto bad; + if(l > Runemax) + goto bad; + *rune = l; + return 4; + } + } + /* * bad decoding */ @@ -85,11 +107,14 @@ runetochar(char *str, Rune *rune) { long c; + c = *rune; + if(c > Runemax) + c = Runeerror; + /* * one character sequence * 00000-0007F => 00-7F */ - c = *rune; if(c <= Rune1) { str[0] = c; return 1; @@ -109,34 +134,70 @@ runetochar(char *str, Rune *rune) * three character sequence * 0800-FFFF => T3 Tx Tx */ - str[0] = T3 | (c >> 2*Bitx); - str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); - return 3; + if(c <= Rune3) { + str[0] = T3 | (c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence + * 10000-1FFFFF => T4 Tx Tx Tx + */ + str[0] = T4 | (c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; } int runelen(long c) { Rune rune; - char str[10]; + char str[UTFmax]; rune = c; return runetochar(str, &rune); } int +runenlen(Rune *r, int nrune) +{ + int nb, c; + + nb = 0; + while(nrune--) { + c = *r++; + if(c <= Rune1) + nb++; + else + if(c <= Rune2) + nb += 2; + else + if(c <= Rune3 || c > Runemax) + nb += 3; + else + nb += 4; + } + return nb; +} + +int fullrune(char *str, int n) { int c; - if(n > 0) { - c = *(unsigned char*)str; - if(c < Tx) - return 1; - if(n > 1) - if(c < T3 || n > 2) - return 1; - } - return 0; + if(n <= 0) + return 0; + c = *(unsigned char*)str; + if(c < Tx) + return 1; + if(c < T3) + return n >= 2; + if(UTFmax == 3 || c < T4) + return n >= 3; + return n >= 4; } + diff --git a/sys/src/cmd/postscript/common/rune.h b/sys/src/cmd/postscript/common/rune.h index 9c1fd4fd0..84301a8ea 100644 --- a/sys/src/cmd/postscript/common/rune.h +++ b/sys/src/cmd/postscript/common/rune.h @@ -14,6 +14,7 @@ enum UTFmax = 3, /* maximum bytes per rune */ Runesync = 0x80, /* cannot represent part of a utf sequence (<) */ Runeself = 0x80, /* rune and utf sequences are the same (<) */ - Runeerror = 0xFFFD, /* decoding error in utf */ + Runeerror = 0xFFFD, /* decoding error in utf */ + Runemax = 0xFFFF, /* 16 bit rune */ }; #endif |