summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: cinap_lenrek <cinap_lenrek@felloff.net> 2015-09-24 12:23:17 +0200
committer: cinap_lenrek <cinap_lenrek@felloff.net> 2015-09-24 12:23:17 +0200
commit: 917da0089dcaa013979a69aaeaeff0c08cbc7e26 (patch)
tree: 52067c34ac225ec92cc684c29551308e244140ee
parent: 8003c8b1e2d5d6e2a22ca7e552b53e631db86df4 (diff)
cpp: handle 4 byte utf sequences (21-bit runes)
-rw-r--r-- sys/src/cmd/cpp/lex.c 9
1 files changed, 7 insertions, 2 deletions
diff --git a/sys/src/cmd/cpp/lex.c b/sys/src/cmd/cpp/lex.c
index e90423e93..226097b35 100644
--- a/sys/src/cmd/cpp/lex.c
+++ b/sys/src/cmd/cpp/lex.c
@@ -29,6 +29,7 @@
#define UTF2(c) ((c)>=0xA0 && (c)<0xE0) /* 2-char UTF seq */
#define UTF3(c) ((c)>=0xE0 && (c)<0xF0) /* 3-char UTF seq */
+#define UTF4(c) ((c)>=0xF0 && (c)<0xF8) /* 4-char UTF seq */
/* character classes */
#define C_WS 1
@@ -259,7 +260,7 @@ expandlex(void)
case C_ALPH:
for (j=0; j<=256; j++)
if ('a'<=j&&j<='z' || 'A'<=j&&j<='Z'
- || UTF2(j) || UTF3(j) || j=='_')
+ || UTF2(j) || UTF3(j) || UTF4(j) || j=='_')
bigfsm[j][fp->state] = nstate;
continue;
case C_NUM:
@@ -274,7 +275,7 @@ expandlex(void)
/* install special cases for ? (trigraphs), \ (splicing), runes */
for (i=0; i<MAXSTATE; i++) {
for (j=0; j<0xFF; j++)
- if (j=='?' || j=='\\' || UTF2(j) || UTF3(j)) {
+ if (j=='?' || j=='\\' || UTF2(j) || UTF3(j) || UTF4(j)) {
if (bigfsm[j][i]>0)
bigfsm[j][i] = ~bigfsm[j][i];
bigfsm[j][i] &= ~QBSBIT;
@@ -393,6 +394,10 @@ gettokens(Tokenrow *trp, int reset)
runelen = 3;
goto reswitch;
}
+ if (UTF4(c)) {
+ runelen = 4;
+ goto reswitch;
+ }
error(WARNING, "Lexical botch in cpp");
ip += runelen;
runelen = 1;