diff options
author | Jacob Moody <moody@posixcafe.org> | 2023-03-26 01:02:20 +0000 |
---|---|---|
committer | Jacob Moody <moody@posixcafe.org> | 2023-03-26 01:02:20 +0000 |
commit | 04759ec9af6dcc78ea5873ceaf6db2e3b3920b22 (patch) | |
tree | 0e20684eee0f36fff952c46bc14828f1005dfb31 /sys/src/libc | |
parent | 2163aebcb85e8214869a2c026b3fc5bd9ddac22c (diff) |
runecomp(2)
Diffstat (limited to 'sys/src/libc')
-rw-r--r-- | sys/src/libc/port/mkfile | 28 | ||||
-rw-r--r-- | sys/src/libc/port/mkrunetype.c | 748 | ||||
-rw-r--r-- | sys/src/libc/port/runebreak.c | 293 | ||||
-rw-r--r-- | sys/src/libc/port/runeistype.c | 40 | ||||
-rw-r--r-- | sys/src/libc/port/runenorm.c | 334 | ||||
-rw-r--r-- | sys/src/libc/port/runetotype.c | 22 | ||||
-rw-r--r-- | sys/src/libc/test/mkfile | 8 | ||||
-rw-r--r-- | sys/src/libc/test/runebreak.c | 112 | ||||
-rw-r--r-- | sys/src/libc/test/runenorm.c | 92 |
9 files changed, 1676 insertions, 1 deletions
diff --git a/sys/src/libc/port/mkfile b/sys/src/libc/port/mkfile index 31c7c8c2d..3388f7551 100644 --- a/sys/src/libc/port/mkfile +++ b/sys/src/libc/port/mkfile @@ -62,6 +62,9 @@ CFILES=\ rand.c\ readn.c\ rune.c\ + runebreak.c\ + runeistype.c\ + runenorm.c\ runestrcat.c\ runestrchr.c\ runestrcmp.c\ @@ -74,7 +77,7 @@ CFILES=\ runestrrchr.c\ runestrlen.c\ runestrstr.c\ - runetype.c\ + runetotype.c\ sin.c\ sinh.c\ sqrt.c\ @@ -127,3 +130,26 @@ UPDATE=mkfile\ </sys/src/cmd/mksyslib profile.$O: /sys/include/tos.h + +runenorm.$O: runenormdata runenorm.c +runetotype.$O: runetotypedata runetotype.c +runeistype.$O: runeistypedata runeistype.c +runebreak.$O: runebreakdata runebreak.c + +UCD=\ + /lib/ucd/WordBreakProperty.txt\ + /lib/ucd/GraphemeBreakProperty.txt\ + /lib/ucd/emoji-data.txt\ + /lib/ucd/CompositionExclusions.txt\ + /lib/ucd/UnicodeData.txt\ + +/lib/ucd/%: + cd /lib/ucd && mk $stem + +runenormdata runetotypedata runeistypedata runebreakdata: mkrunetype.c $UCD + @{ + eval `{grep '^[A-Z]' /$cputype/mkfile} + $CC $CFLAGS -o mkrunetype.$O mkrunetype.c + $LD $LDFLAGS -o $O.mkrunetype mkrunetype.$O + $O.mkrunetype + } diff --git a/sys/src/libc/port/mkrunetype.c b/sys/src/libc/port/mkrunetype.c new file mode 100644 index 000000000..d8bfc60dd --- /dev/null +++ b/sys/src/libc/port/mkrunetype.c @@ -0,0 +1,748 @@ +#include <u.h> +#include <libc.h> +#include <bio.h> + +enum{ + NRUNES = 1<<21 +}; + +typedef struct Param Param; +typedef struct Lvl Lvl; +struct Lvl{ + int bits; + int max; + int mask; +}; +struct Param{ + Lvl idx1; + Lvl idx2; + Lvl data; + + int round1max; +}; + +static void +derive(Lvl *l) +{ + l->max = 1 << l->bits; + l->mask = l->max - 1; +} + +static void +param(Param *p, int idx1, int idx2) +{ + + assert(idx1 + idx2 < 21); + p->idx1.bits = idx1; + p->idx2.bits = idx2; + p->data.bits = 21 - idx1 - idx2; + derive(&p->idx1); + derive(&p->idx2); + derive(&p->data); + + p->round1max = NRUNES/p->data.max; +} + +static int +lkup(Param *p, int *idx1, int *idx2, int *data, int x) +{ + int y, z; + + y = (((x)>>(p->data.bits+p->idx2.bits))&p->idx1.mask); + z = (((x)>>p->data.bits)&p->idx2.mask); + return data[idx2[idx1[y] + z] + (x&p->data.mask)]; +} + +static int +mkarrvar(int fd, char *name, int *d, int len) +{ + int i, sz; + int max, min; + char *t; + + max = min = 0; + for(i = 0; i < len; i++){ + if(d[i] > max) + max = d[i]; + if(d[i] < min) + min = d[i]; + } + if(min == 0){ + if(max < 0xFF) + t = "uchar", sz = 1; + else if(max < 0xFFFF) + t = "ushort", sz = 2; + else + t = "uint", sz = 4; + } else { + if(max < 1<<7) + t = "char", sz = 1; + else if(max < 1<<15) + t = "short", sz = 2; + else + t = "int", sz = 4; + } + if(fd < 0) + return sz * len; + + fprint(fd, "static\n%s\t%s[%d] =\n{\n\t", t, name, len); + for(i = 0; i < len; i++){ + fprint(fd, "%d,", d[i]); + if((i+1) % 16 == 0) + fprint(fd, "\n\t"); + } + fprint(fd, "\n};\n"); + + return sz * len; +} + +static int +mkexceptarr(int fd, char *name, int *d, int n, int all) +{ + int i; + fprint(fd, "static\nRune %s[][%d] =\n{\n\t", name, all ? 3 : 2); + for(i = 0; i < n*3; i += 3){ + if(all && d[i] != 0) + fprint(fd, "{0x%X, 0x%X, 0x%X},", d[i], d[i+1], d[i+2]); + else if(!all) + fprint(fd, "{0x%X, 0x%X},", d[i+1], d[i+2]); + if((i+3) % (8*3) == 0) + fprint(fd, "\n\t"); + } + fprint(fd, "\n};\n"); + return n * sizeof(Rune) * 2; +} + +static int +compact(int *data, int *idx, int nidx, int *src, int chunksize) +{ + int i, n, ndata, best; + int *dot, *lp, *rp; + + dot = src; + ndata = 0; + idx[0] = 0; + for(i = 1; i <= nidx; i++){ + rp = dot + chunksize; + lp = rp - 1; + + for(best = 0, n = 0; i != nidx && n < chunksize; n++, lp--){ + if(memcmp(lp, rp, (n+1) * sizeof data[0]) == 0) + best = n+1; + } + memmove(data + ndata, dot, (chunksize - best) * sizeof data[0]); + ndata += (chunksize - best); + idx[i] = idx[i - 1] + (chunksize - best); + dot = rp; + } + return ndata; +} + + +static int +mklkup(int fd, char *label, int *map, Param *p) +{ + static int data[NRUNES]; + static int idx2[NRUNES]; + static int idx2dest[NRUNES]; + static int idx1[NRUNES]; + int i, nidx2, ndata; + int size; + + ndata = compact(data, idx2, p->round1max, map, p->data.max); + nidx2 = compact(idx2dest, idx1, p->idx1.max, idx2, p->idx2.max); + + if(fd >= 0){ + for(i = 0; i < NRUNES; i++) + if(map[i] != lkup(p, idx1, idx2dest, data, i)) + sysfatal("mismatch in %s at %d %d %d\n", label, i, map[i], lkup(p, idx1, idx2dest, data, i)); + } + + size = mkarrvar(fd, smprint("_%sdata", label), data, ndata); + size += mkarrvar(fd, smprint("_%sidx2", label), idx2dest, nidx2); + size += mkarrvar(fd, smprint("_%sidx1", label), idx1, p->idx1.max); + if(fd >= 0){ + fprint(fd, "\n"); + fprint(fd, "#define %sindex1(x) (((x)>>(%d+%d))&0x%X)\n", label, p->data.bits, p->idx2.bits, p->idx1.mask); + fprint(fd, "#define %sindex2(x) (((x)>>%d)&0x%X)\n", label, p->data.bits, p->idx2.mask); + fprint(fd, "#define %soffset(x) ((x)&0x%X)\n", label, p->data.mask); + fprint(fd, "#define %slkup(x) (_%sdata[_%sidx2[_%sidx1[%sindex1(x)] + %sindex2(x)] + %soffset(x)] )\n\n", + label, label, label, label, label, label, label); + } + return size; +} + +static void +mklkupmatrix(char *label, int *map, Param *p) +{ + int bestsize, size, bestx, besty; + int x, y; + + bestsize = bestx = besty = -1; + for(x = 4; x <= 12; x++) + for(y=4; y <= (19 - x); y++){ + param(p, x, y); + size = mklkup(-1, label, map, p); + if(bestsize == -1 || size < bestsize){ + bestx = x; + besty = y; + bestsize = size; + } + } + + assert(bestsize != -1); + fprint(2, "label: %s best: %d %d (%d)\n", label, bestx, besty, bestsize); + param(p, bestx, besty); +} + +static int myismerged[NRUNES]; +static int mytoupper[NRUNES]; +static int mytolower[NRUNES]; +static int mytotitle[NRUNES]; +static int mybreak[NRUNES]; + +enum{ DSTART = 0xEEEE }; +static int mydecomp[NRUNES]; +static int mydespecial[256*3]; +static int nspecial; +static int myccc[NRUNES]; + +typedef struct KV KV; +struct KV{ + uint key; + uint val; + ushort next; +}; + +static KV myrecomp[2000]; +static int nrecomp; + +static int recompext[256*3]; +static int nrecompext; + +static uint +hash(uint x) +{ + x ^= x >> 16; + x *= 0x21f0aaad; + x ^= x >> 15; + x *= 0xd35a2d97; + x ^= x >> 15; + return x; +} + +static void +mkrecomp(int fd) +{ + int i; + KV *p; + static KV vals[512]; + static KV coll[1000]; + int over; + int maxchain; + + for(i = 0; i < nelem(vals); i++) + vals[i] = (KV){0, 0, 0}; + for(i = 0; i < nelem(coll); i++) + coll[i] = (KV){0, 0, 0}; + over = 1; + for(i = 0; i < nrecomp; i++){ + p = vals + (hash(myrecomp[i].key) % nelem(vals)); + maxchain = 0; + while(p->key != 0){ + maxchain++; + if(p->next == 0){ + p->next = over; + p = coll + over - 1; + over++; + } else + p = coll + p->next - 1; + } + p->key = myrecomp[i].key; + p->val = myrecomp[i].val; + } + fprint(2, "recomp map [%d][%d]: %d\n", nelem(vals), over-1, (nelem(vals) + over-1) * (4+2+2)); + fprint(fd, "static\nuint\t_recompdata[] =\n{\n\t"); + for(p = vals, i = 0;; i++){ + assert(p->val < 0xFFFF); + assert(p->next < 0xFFFF); + fprint(fd, "%udU,%udU,", p->key, p->val | (p->next<<16)); + if((i+1) % 8 == 0) + fprint(fd, "\n\t"); + + if(p == vals+nelem(vals)-1) + p = coll; + else if(p == coll + over - 2) + break; + else + p++; + } + fprint(fd, "\n};\n"); + fprint(fd, "static uint *_recompcoll = _recompdata+%d*2;\n", nelem(vals)); +} + +static void +mktables(void) +{ + Param p; + int tofd, isfd, normfd, breakfd; + int size; + + tofd = create("runetotypedata", OWRITE, 0664); + if(tofd < 0) + sysfatal("could not create runetotypedata: %r"); + param(&p, 10, 7); + size = mklkup(tofd, "upper", mytoupper, &p); + fprint(2, "%s: %d\n", "upper", size); + + size = mklkup(tofd, "lower", mytolower, &p); + fprint(2, "%s: %d\n", "lower", size); + + size = mklkup(tofd, "title", mytotitle, &p); + fprint(2, "%s: %d\n", "title", size); + close(tofd); + + isfd = create("runeistypedata", OWRITE, 0664); + if(isfd < 0) + sysfatal("could not create runeistypedata: %r"); + param(&p, 11, 6); + size = mklkup(isfd, "merged", myismerged, &p); + fprint(2, "%s: %d\n", "merged", size); + fprint(isfd, "static\nenum {\n"); + fprint(isfd, "\tL%s = %s,\n", "space", "1<<0"); + fprint(isfd, "\tL%s = %s,\n", "alpha", "1<<1"); + fprint(isfd, "\tL%s = %s,\n", "digit", "1<<2"); + fprint(isfd, "\tL%s = %s,\n", "upper", "1<<3"); + fprint(isfd, "\tL%s = %s,\n", "lower", "1<<4"); + fprint(isfd, "\tL%s = %s,\n", "title", "1<<5"); + fprint(isfd, "};\n"); + close(isfd); + + normfd = create("runenormdata", OWRITE, 0664); + if(normfd < 0) + sysfatal("could not create runenormdata: %r"); + param(&p, 10, 7); + size = mklkup(normfd, "decomp", mydecomp, &p); + fprint(2, "%s: %d\n", "decomp", size); + + param(&p, 9, 7); + size = mklkup(normfd, "ccc", myccc, &p); + fprint(2, "%s: %d\n", "ccc", size); + + mkexceptarr(normfd, "_decompexceptions", mydespecial, nspecial, 0); + mkexceptarr(normfd, "_recompexceptions", recompext, nrecompext, 1); + mkrecomp(normfd); + close(normfd); + + param(&p, 10, 6); + breakfd = create("runebreakdata", OWRITE, 0644); + if(breakfd < 0) + sysfatal("could not create runebreakdata: %r"); + size = mklkup(breakfd, "break", mybreak, &p); + fprint(2, "%s: %d\n", "break", size); +} + +enum { + FIELD_CODE, + FIELD_NAME, + FIELD_CATEGORY, + FIELD_COMBINING, + FIELD_BIDIR, + FIELD_DECOMP, + FIELD_DECIMAL_DIG, + FIELD_DIG, + FIELD_NUMERIC_VAL, + FIELD_MIRRORED, + FIELD_UNICODE_1_NAME, + FIELD_COMMENT, + FIELD_UPPER, + FIELD_LOWER, + FIELD_TITLE, + NFIELDS, +}; + +static int +getunicodeline(Biobuf *in, char **fields) +{ + char *p; + + if((p = Brdline(in, '\n')) == nil) + return 0; + + p[Blinelen(in)-1] = '\0'; + + if (getfields(p, fields, NFIELDS + 1, 0, ";") != NFIELDS) + sysfatal("bad number of fields"); + + return 1; +} + +static int +estrtoul(char *s, int base) +{ + char *epr; + Rune code; + + code = strtoul(s, &epr, base); + if(s == epr) + sysfatal("bad code point hex string"); + return code; +} + +enum { + OTHER, + Hebrew_Letter, Newline, Extend, Format, + Katakana, ALetter, MidLetter, MidNum, + MidNumLet, Numeric, ExtendNumLet, WSegSpace, + PREPEND = 0x10, CONTROL = 0x20, EXTEND = 0x30, REGION = 0x40, + L = 0x50, V = 0x60, T = 0x70, LV = 0x80, LVT = 0x90, SPACEMK = 0xA0, + EMOJIEX = 0xB0, +}; + +static void +markbreak(void) +{ + Biobuf *b; + char *p, *dot; + int i, s, e; + uchar v; + + b = Bopen("/lib/ucd/WordBreakProperty.txt", OREAD); + if(b == nil) + sysfatal("could not load word breaks: %r"); + + while((p = Brdline(b, '\n')) != nil){ + p[Blinelen(b)-1] = 0; + if(p[0] == 0 || p[0] == '#') + continue; + if((dot = strstr(p, "..")) != nil){ + *dot = 0; + dot += 2; + s = estrtoul(p, 16); + e = estrtoul(dot, 16); + } else { + s = e = estrtoul(p, 16); + dot = p; + } + v = 0; + if(strstr(dot, "ExtendNumLet") != nil) + v = ExtendNumLet; + else if(strstr(dot, "Hebrew_Letter") != nil) + v = Hebrew_Letter; + else if(strstr(dot, "Newline") != nil) + v = Newline; + else if(strstr(dot, "Extend") != nil) + v = Extend; + else if(strstr(dot, "Format") != nil) + v = Format; + else if(strstr(dot, "Katakana") != nil) + v = Katakana; + else if(strstr(dot, "ALetter") != nil) + v = ALetter; + else if(strstr(dot, "MidLetter") != nil) + v = MidLetter; + else if(strstr(dot, "MidNum") != nil) + v = MidNum; + else if(strstr(dot, "Numeric") != nil) + v = Numeric; + else if(strstr(dot, "WSegSpace") != nil) + v = WSegSpace; + for(i = s; i <= e; i++) + mybreak[i] = v; + } + Bterm(b); + b = Bopen("/lib/ucd/GraphemeBreakProperty.txt", OREAD); + if(b == nil) + sysfatal("could not load Grapheme breaks: %r"); + + while((p = Brdline(b, '\n')) != nil){ + p[Blinelen(b)-1] = 0; + if(p[0] == 0 || p[0] == '#') + continue; + if((dot = strstr(p, "..")) != nil){ + *dot = 0; + dot += 2; + s = estrtoul(p, 16); + e = estrtoul(dot, 16); + } else { + s = e = estrtoul(p, 16); + dot = p; + } + v = 0; + if(strstr(dot, "; Prepend #") != nil) + v = PREPEND; + else if(strstr(dot, "; Control #") != nil) + v = CONTROL; + else if(strstr(dot, "; Extend #") != nil) + v = EXTEND; + else if(strstr(dot, "; Regional_Indicator #") != nil) + v = REGION; + else if(strstr(dot, "; SpacingMark #") != nil) + v = SPACEMK; + else if(strstr(dot, "; L #") != nil) + v = L; + else if(strstr(dot, "; V #") != nil) + v = V; + else if(strstr(dot, "; T #") != nil) + v = T; + else if(strstr(dot, "; LV #") != nil) + v = LV; + else if(strstr(dot, "; LVT #") != nil) + v = LVT; + for(i = s; i <= e; i++) + mybreak[i] |= v; + } + Bterm(b); + + b = Bopen("/lib/ucd/emoji-data.txt", OREAD); + if(b == nil) + sysfatal("could not load emoji-data: %r"); + + while((p = Brdline(b, '\n')) != nil){ + p[Blinelen(b)-1] = 0; + if(p[0] == 0 || p[0] == '#') + continue; + if((dot = strstr(p, "..")) != nil){ + *dot = 0; + dot += 2; + s = estrtoul(p, 16); + e = estrtoul(dot, 16); + } else { + s = e = estrtoul(p, 16); + dot = p; + } + v = 0; + if(strstr(dot, "; Extended_Pictographic") != nil) + v = EMOJIEX; + for(i = s; i <= e; i++) + mybreak[i] |= v; + } + Bterm(b); +} + +static void +markexclusions(void) +{ + Biobuf *b; + char *p; + int i; + uint x; + + b = Bopen("/lib/ucd/CompositionExclusions.txt", OREAD); + if(b == nil) + sysfatal("could not load composition exclusions: %r"); + + while((p = Brdline(b, '\n')) != nil){ + p[Blinelen(b)-1] = 0; + if(p[0] == 0 || p[0] == '#') + continue; + x = estrtoul(p, 16); + for(i = 0; i < nrecomp; i++){ + if(myrecomp[i].val == x){ + myrecomp[i].val = 0; + break; + } + } + if(i == nrecomp){ + for(i = 0; i < nrecompext; i++){ + if(recompext[i*3] == x){ + recompext[i*3] = 0; + break; + } + } + } + } + Bterm(b); +} + +void +main(int, char) +{ + static char myisspace[NRUNES]; + static char myisalpha[NRUNES]; + static char myisdigit[NRUNES]; + static char myisupper[NRUNES]; + static char myislower[NRUNES]; + static char myistitle[NRUNES]; + Biobuf *in; + char *fields[NFIELDS + 1], *fields2[NFIELDS + 1]; + char *p, *d; + int i, code, last; + int decomp[2], *ip; + + in = Bopen("/lib/ucd/UnicodeData.txt", OREAD); + if(in == nil) + sysfatal("can't open UnicodeData.txt: %r"); + + for(i = 0; i < NRUNES; i++){ + mytoupper[i] = -1; + mytolower[i] = -1; + mytotitle[i] = -1; + mydecomp[i] = 0; + myccc[i] = 0; + mybreak[i] = 0; + } + + myisspace['\t'] = 1; + myisspace['\n'] = 1; + myisspace['\r'] = 1; + myisspace['\f'] = 1; + myisspace['\v'] = 1; + myisspace[0x85] = 1; /* control char, "next line" */ + myisspace[0xfeff] = 1; /* zero-width non-break space */ + + last = -1; + nspecial = nrecomp = nrecompext = 0; + while(getunicodeline(in, fields)){ + code = estrtoul(fields[FIELD_CODE], 16); + if (code >= NRUNES) + sysfatal("code-point value too big: %x", code); + if(code <= last) + sysfatal("bad code sequence: %x then %x", last, code); + last = code; + + p = fields[FIELD_CATEGORY]; + if(strstr(fields[FIELD_NAME], ", First>") != nil){ + if(!getunicodeline(in, fields2)) + sysfatal("range start at eof"); + if (strstr(fields2[FIELD_NAME], ", Last>") == nil) + sysfatal("range start not followed by range end"); + last = estrtoul(fields2[FIELD_CODE], 16); + if(last <= code) + sysfatal("range out of sequence: %x then %x", code, last); + if(strcmp(p, fields2[FIELD_CATEGORY]) != 0) + sysfatal("range with mismatched category"); + } + + d = fields[FIELD_DECOMP]; + if(strlen(d) > 0 && strstr(d, "<") == nil){ + decomp[0] = estrtoul(d, 16); + d = strstr(d, " "); + if(d == nil){ + /* singleton recompositions are verboden */ + decomp[1] = 0; + if(decomp[0] > 0xFFFF){ + ip = mydespecial + nspecial*3; + ip[0] = code; + ip[1] = decomp[0]; + ip[2] = 0; + mydecomp[code] = (DSTART+nspecial)<<16; + nspecial++; + } else + mydecomp[code] = decomp[0]<<16; + } else { + d++; + decomp[1] = estrtoul(d, 16); + if(decomp[0] > 0xFFFF || decomp[1] > 0xFFFF){ + ip = mydespecial + nspecial*3; + ip[0] = code; + ip[1] = decomp[0]; + ip[2] = decomp[1]; + mydecomp[code] = (DSTART+nspecial)<<16; + nspecial++; + ip = recompext + nrecompext*3; + ip[0] = code; + ip[1] = decomp[0]; + ip[2] = decomp[1]; + nrecompext++; + } else { + mydecomp[code] = decomp[0]<<16 | decomp[1]; + myrecomp[nrecomp++] = (KV){decomp[0]<<16 | decomp[1], code, 0}; + } + } + } + + for (; code <= last; code++){ + if(p[0] == 'L') + myisalpha[code] = 1; + if(p[0] == 'Z') + myisspace[code] = 1; + + if(strcmp(p, "Lu") == 0) + myisupper[code] = 1; + if(strcmp(p, "Ll") == 0) + myislower[code] = 1; + + if(strcmp(p, "Lt") == 0) + myistitle[code] = 1; + + if(strcmp(p, "Nd") == 0) + myisdigit[code] = 1; + + if(fields[FIELD_UPPER][0] != '\0') + mytoupper[code] = estrtoul(fields[FIELD_UPPER], 16); + + if(fields[FIELD_LOWER][0] != '\0') + mytolower[code] = estrtoul(fields[FIELD_LOWER], 16); + + if(fields[FIELD_TITLE][0] != '\0') + mytotitle[code] = estrtoul(fields[FIELD_TITLE], 16); + + myccc[code] = estrtoul(fields[FIELD_COMBINING], 10); + } + } + + Bterm(in); + markexclusions(); + + /* + * according to standard, if totitle(x) is not defined in ucd + * but toupper(x) is, then totitle is defined to be toupper(x) + */ + for(i = 0; i < NRUNES; i++){ + if(mytotitle[i] == -1 + && mytoupper[i] != -1 + && !myistitle[i]) + mytotitle[i] = mytoupper[i]; + } + + /* + * A couple corrections: + * is*(to*(x)) should be true. + * restore undefined transformations. + * store offset instead of value, makes them sparse. + */ + for(i = 0; i < NRUNES; i++){ + if(mytoupper[i] != -1) + myisupper[mytoupper[i]] = 1; + else + mytoupper[i] = i; + + if(mytolower[i] != -1) + myislower[mytolower[i]] = 1; + else + mytolower[i] = i; + + if(mytotitle[i] != -1) + myistitle[mytotitle[i]] = 1; + else + mytotitle[i] = i; + + mytoupper[i] = mytoupper[i] - i; + mytolower[i] = mytolower[i] - i; + mytotitle[i] = mytotitle[i] - i; + } + + uchar b; + for(i = 0; i < NRUNES; i++){ + b = 0; + if(myisspace[i]) + b |= 1<<0; + if(myisalpha[i]) + b |= 1<<1; + if(myisdigit[i]) + b |= 1<<2; + if(myisupper[i]) + b |= 1<<3; + if(myislower[i]) + b |= 1<<4; + if(myistitle[i]) + b |= 1<<5; + + myismerged[i] = b; + } + + markbreak(); + mktables(); + exits(nil); +} diff --git a/sys/src/libc/port/runebreak.c b/sys/src/libc/port/runebreak.c new file mode 100644 index 000000000..1d113b38f --- /dev/null +++ b/sys/src/libc/port/runebreak.c @@ -0,0 +1,293 @@ +#include <u.h> +#include <libc.h> + +#include "runebreakdata" + +enum { + OTHER, + Hebrew_Letter, Newline, Extend, Format, + Katakana, ALetter, MidLetter, MidNum, + MidNumLet, Numeric, ExtendNumLet, WSegSpace, + PREPEND = 0x10, CONTROL = 0x20, EXTEND = 0x30, REGION = 0x40, + L = 0x50, V = 0x60, T = 0x70, LV = 0x80, LVT = 0x90, SPACEMK = 0xA0, + EMOJIEX = 0xB0, + + ZWJ = 0x200DU, + LINETAB = 0xB, +}; + +#define IS(x, y) ((x&0xf) == y) +#define ISG(x, y) ((x&0xf0) == y) + +Rune* +runegbreak(Rune *s) +{ + Rune l, r; + uchar lt, rt; + Rune *p; + + p = s; + if((l = *p++) == 0) + return s; + if((r = *p) == 0) + return s; + lt = breaklkup(l); + rt = breaklkup(r); + if(l == '\r' && r == '\n') + goto Done; + if(ISG(lt, CONTROL) || l == '\r' || l == '\n') + return p; + if(ISG(rt, CONTROL) || r == '\r' || r == '\n') + return p; + if(ISG(lt, L) && (ISG(rt, L) || ISG(rt, V) || ISG(rt, LV) || ISG(rt, LVT))) + goto Done; + if((ISG(lt, LV) || ISG(lt, V)) && (ISG(rt, V) || ISG(rt, T))) + goto Done; + if((ISG(lt, LVT) || ISG(lt, T)) && (ISG(rt, T) || ISG(rt, T))) + goto Done; + if(ISG(rt, SPACEMK) || ISG(lt, PREPEND)) + goto Done; + if(ISG(lt, EMOJIEX) && (ISG(rt, EXTEND) || r == ZWJ)){ + while(ISG(rt, EXTEND)){ + p++; + if((r = *p) == 0) + return s; + rt = breaklkup(r); + } + if(r != ZWJ) + return p; + p++; + if((r = *p) == 0) + return s; + rt = breaklkup(r); + if(ISG(rt, EMOJIEX)) + goto Done; + return p; + } + if(ISG(rt, EXTEND) || r == ZWJ) + goto Done; + if(ISG(lt, REGION) && ISG(rt, REGION)) + goto Done; + + return p; + +Done: + if(p[1] == 0) + return s; + return p + 1; +} + +char* +utfgbreak(char *s) +{ + Rune l, r; + uchar lt, rt; + char *p; + + p = s; + p += chartorune(&l, p); + if(l == 0) + return s; + chartorune(&r, p); + if(r == 0) + return s; + lt = breaklkup(l); + rt = breaklkup(r); + if(l == '\r' && r == '\n') + goto Done; + if(ISG(lt, CONTROL) || l == '\r' || l == '\n') + return p; + if(ISG(rt, CONTROL) || r == '\r' || r == '\n') + return p; + if(ISG(lt, L) && (ISG(rt, L) || ISG(rt, V) || ISG(rt, LV) || ISG(rt, LVT))) + goto Done; + if((ISG(lt, LV) || ISG(lt, V)) && (ISG(rt, V) || ISG(rt, T))) + goto Done; + if((ISG(lt, LVT) || ISG(lt, T)) && (ISG(rt, T) || ISG(rt, T))) + goto Done; + if(ISG(rt, SPACEMK) || ISG(lt, PREPEND)) + goto Done; + if(ISG(lt, EMOJIEX) && (ISG(rt, EXTEND) || r == ZWJ)){ + while(ISG(rt, EXTEND)){ + p += chartorune(&r, p); + chartorune(&r, p); + if(r == 0) + return s; + rt = breaklkup(r); + } + if(r != ZWJ) + return p; + + p += chartorune(&r, p); + chartorune(&r, p); + if(r == 0) + return s; + rt = breaklkup(r); + if(ISG(rt, EMOJIEX)) + goto Done; + return p; + } + if(ISG(rt, EXTEND) || r == ZWJ) + goto Done; + if(ISG(lt, REGION) && ISG(rt, REGION)) + goto Done; + + return p; + +Done: + p += chartorune(&r, p); + chartorune(&r, p); + if(r == 0) + return s; + return p; +} + +#define AH(x) (IS(x, ALetter) || IS(x, Hebrew_Letter)) +#define MNLQ(x) (IS(x, MidNumLet) || x == '\'') + +Rune* +runewbreak(Rune *s) +{ + Rune l, r; + uchar lt, rt; + Rune *p; + + p = s; + if((l = *p++) == 0) + return s; + if((r = *p) == 0) + return s; + lt = breaklkup(l); + rt = breaklkup(r); + if(l == '\r' && r == '\n') + goto Done; + if(l == '\r' || l == '\n' || l == LINETAB) + return p; + if(r == '\r' || r == '\n' || l == LINETAB) + return p; + if(IS(lt, WSegSpace) && IS(rt, WSegSpace)) + goto Done; + if(IS(rt, Format) || IS(rt, Extend)) + goto Done; + if(AH(lt)){ + if(AH(rt)) + goto Done; + if((IS(rt, MidLetter) || MNLQ(rt)) && p[1] != 0 && AH(breaklkup(p[1]))) + goto Done; + if(IS(lt, Hebrew_Letter) && r == '\'') + goto Done; + if(IS(lt, Hebrew_Letter) && r == '"' && p[1] != 0 && IS(breaklkup(p[1]), Hebrew_Letter)) + goto Done; + if(IS(rt, Numeric)) + goto Done; + } + if(IS(lt, Numeric) && (AH(rt) || IS(rt, Numeric))) + goto Done; + if(IS(lt, Numeric) && (IS(rt, MidNum) || MNLQ(rt)) && p[1] != 0 && IS(breaklkup(p[1]), Numeric)) + goto Done; + if(IS(lt, Katakana) && IS(rt, Katakana)) + goto Done; + if(AH(lt) || IS(lt, Numeric) || IS(lt, Katakana) || IS(lt, ExtendNumLet)) + if(IS(rt, ExtendNumLet)) + goto Done; + if(IS(lt, ExtendNumLet) && (AH(rt) || IS(rt, Numeric) || IS(rt, Katakana))) + goto Done; + if(ISG(lt, REGION)){ + if(ISG(rt, REGION)) + goto Done; + if(r != ZWJ) + return p; + p++; + if((r = *p) == 0) + return s; + rt = breaklkup(r); + if(ISG(rt, REGION)) + goto Done; + } + + return p; + +Done: + if(p[1] == 0) + return s; + return p + 1; +} + +char* +utfwbreak(char *s) +{ + Rune l, r; + Rune peek; + uchar lt, rt; + char *p; + + p = s; + p += chartorune(&l, p); + if(l == 0) + return s; + chartorune(&peek, p+chartorune(&r, p)); + if(r == 0) + return s; + lt = breaklkup(l); + rt = breaklkup(r); + if(l == '\r' && r == '\n') + goto Done; + if(l == '\r' || l == '\n' || l == LINETAB) + return p; + if(r == '\r' || r == '\n' || l == LINETAB) + return p; + if(IS(lt, WSegSpace) && IS(rt, WSegSpace)) + goto Done; + if(IS(rt, Format) || IS(rt, Extend)) + goto Done; + if(AH(lt)){ + if(AH(rt)) + goto Done; + if(IS(rt, MidLetter) || MNLQ(rt)) + if(peek != 0 && AH(breaklkup(peek))) + goto Done; + + if(IS(lt, Hebrew_Letter) && r == '\'') + goto Done; + + if(IS(lt, Hebrew_Letter) && r == '"') + if(peek != 0 && IS(breaklkup(peek), Hebrew_Letter)) + goto Done; + + if(IS(rt, Numeric)) + goto Done; + } + if(IS(lt, Numeric) && (AH(rt) || IS(rt, Numeric))) + goto Done; + if(IS(lt, Numeric) && (IS(rt, MidNum) || MNLQ(rt)) && peek != 0 && IS(breaklkup(peek), Numeric)) + goto Done; + if(IS(lt, Katakana) && IS(rt, Katakana)) + goto Done; + if(AH(lt) || IS(lt, Numeric) || IS(lt, Katakana) || IS(lt, ExtendNumLet)) + if(IS(rt, ExtendNumLet)) + goto Done; + if(IS(lt, ExtendNumLet) && (AH(rt) || IS(rt, Numeric) || IS(rt, Katakana))) + goto Done; + if(ISG(lt, REGION)){ + if(ISG(rt, REGION)) + goto Done; + if(r != ZWJ) + return p; + p += chartorune(&r, p); + chartorune(&r, p); + if(r == 0) + return s; + rt = breaklkup(r); + if(ISG(rt, REGION)) + goto Done; + } + + return p; + +Done: + p += chartorune(&r, p); + chartorune(&r, p); + if(r == 0) + return s; + return p; +} diff --git a/sys/src/libc/port/runeistype.c b/sys/src/libc/port/runeistype.c new file mode 100644 index 000000000..3ec2dd0f9 --- /dev/null +++ b/sys/src/libc/port/runeistype.c @@ -0,0 +1,40 @@ +#include <u.h> +#include <libc.h> + +#include "runeistypedata" + +int +isspacerune(Rune c) +{ + return (mergedlkup(c) & Lspace) == Lspace; +} + +int +isalpharune(Rune c) +{ + return (mergedlkup(c) & Lalpha) == Lalpha; +} + +int +isdigitrune(Rune c) +{ + return (mergedlkup(c) & Ldigit) == Ldigit; +} + +int +isupperrune(Rune c) +{ + return (mergedlkup(c) & Lupper) == Lupper; +} + +int +islowerrune(Rune c) +{ + return (mergedlkup(c) & Llower) == Llower; +} + +int +istitlerune(Rune c) +{ + return (mergedlkup(c) & Ltitle) == Ltitle; +} diff --git a/sys/src/libc/port/runenorm.c b/sys/src/libc/port/runenorm.c new file mode 100644 index 000000000..942ea4d49 --- /dev/null +++ b/sys/src/libc/port/runenorm.c @@ -0,0 +1,334 @@ +#include <u.h> +#include <libc.h> + +#include "runenormdata" + +//Unicode Standard: Section 3.12 Conjoining Jamo Behavior +enum { + SBase = 0xAC00, + LBase = 0x1100, + VBase = 0x1161, + TBase = 0x11A7, + + LCount = 19, + VCount = 21, + TCount = 28, + NCount = VCount * TCount, + SCount = LCount * NCount, + + LLast = LBase + LCount - 1, + SLast = SBase + SCount - 1, + VLast = VBase + VCount - 1, + TLast = TBase + TCount - 1, +}; + +static void +_runedecomp(Rune dst[2], Rune c) +{ + uint x; + + if(c >= SBase && c <= SLast){ + c -= SBase; + x = c % TCount; + if(x){ + dst[0] = SBase + ((c / TCount) * TCount); + dst[1] = TBase + x; + return; + } + dst[0] = LBase + (c / NCount); + dst[1] = VBase + ((c % NCount) / TCount); + return; + } + x = decomplkup(c); + if((x & 0xFFFF) != 0){ + dst[0] = x>>16; + dst[1] = x & 0xFFFF; + return; + } + x >>= 16; + if(x >= 0xEEEE && x <0xF8FF){ + memmove(dst, _decompexceptions[x - 0xEEEE], sizeof(Rune)*2); + return; + } + dst[0] = x; + dst[1] = 0; +} + +static Rune +_runerecomp(Rune r[2]) +{ + uint x, y, *p, next; + + if(r[0] >= LBase && r[0] <= LLast){ + if(r[1] < VBase || r[1] > VLast) + return 0; + x = (r[0] - LBase) * NCount + (r[1] - VBase) * TCount; + return SBase + x; + } + if(r[0] >= SBase && r[0] <= SLast && (r[0] - SBase) % TCount == 0){ + if(r[1] > TBase && r[1] <= TLast) + return r[0] + (r[1] - TBase); + return 0; + } + if(r[0] > 0xFFFF || r[1] > 0xFFFF){ + for(x = 0; x < nelem(_recompexceptions); x++) + if(r[0] == _recompexceptions[x][1] && r[1] == _recompexceptions[x][2]) + return _recompexceptions[x][0]; + return 0; + } + y = x = r[0]<<16 | r[1]; + x ^= x >> 16; + x *= 0x21f0aaad; + x ^= x >> 15; + x *= 0xd35a2d97; + x ^= x >> 15; + p = _recompdata + (x%512)*2; + while(p[0] != y){ + next = p[1]>>16; + if(!next) + return 0; + p = _recompcoll + (next-1)*2; + } + return p[1] & 0xFFFF; +} + +static void +runecccsort(Rune *a, int len) +{ + Rune r; + int i; + int fail; + + do { + fail = 0; + for(i = 0; i < len - 1; i++){ + if(ccclkup(a[i]) > ccclkup(a[i+1]) > 0){ + r = a[i]; + a[i] = a[i+1]; + a[i + 1] = r; + fail = 1; + } + } + } while(fail); +} + +char* +fullutfnorm(char *s, int n) +{ + Rune r, peek; + char *p, *p2; + + p = s; + if(fullrune(p, n) == 0) + return s; + + p += chartorune(&r, p); + n -= (p - s); + + if((r >= LBase && r <= LLast) || (r >= SBase && r <= SLast)){ + do { + if(fullrune(p, n) == 0) + return s; + p2 = p + chartorune(&peek, p); + n -= (p2 - p); + p = p2; + } while(n > 0 && (peek >= VBase && peek <= VLast) || (peek > TBase && peek <= TLast)); + if(n <= 0) + return s; + return p; + } + + do { + if(fullrune(p, n) == 0) + return s; + p2 = p + chartorune(&peek, p); + n -= (p2 - p); + p = p2; + if(ccclkup(peek) == 0) + return p; + } while(n > 0); + + return s; +} + +Rune* +fullrunenorm(Rune *r, int n) +{ + Rune *e, *p; + + p = r; + e = p + n; + + if((*p >= LBase && *p <= LLast) || (*p >= SBase && *p <= SLast)){ + p++; + while(p < e && (*p >= VBase && *p <= VLast) || (*p > TBase && *p <= TLast)) + p++; + + if(p >= e) + return r; + return p; + } + + for(; p < e && p + 1 < e; p++) + if(ccclkup(p[1]) == 0) + return p + 1; + + return r; +} + +static int +runenorm(Rune *dst, Rune *src, char *sdst, char *ssrc, int max, int compose) +{ + Rune c, r[2], _stack[32]; + Rune *p, *stack, *sp, *tp; + char *strp, *strstop; + Rune *rp, *rrp; + Rune *stop; + Rune peek; + int w, w2, size; + int mode; + + if(src){ + mode = 1; + p = src; + stop = dst + (max - 1); + strp = ""; + strstop = nil; + } else { + mode = 0; + p = L""; + stop = nil; + strp = ssrc; + strstop = sdst + (max - 1); + } + + stack = _stack + nelem(_stack)/2; + size = 0; + w = w2 = 0; + while(*strp || *p){ + if(mode) + c = *p; + else + w = chartorune(&c, strp); + + sp = stack - 1; + tp = stack; + _runedecomp(r, c); + while(r[0] != 0){ + c = r[0]; + if(r[1] != 0){ + *sp-- = r[1]; + if(sp == _stack) + break; + } + _runedecomp(r, c); + } + + *sp = c; + if(mode) + peek = p[1]; + else + w2 = chartorune(&peek, strp+w); + + if((*sp >= LBase && *sp <= LLast) || (*sp >= SBase && *sp <= SLast)){ + while(peek != 0 && (peek >= VBase && peek <= VLast) || (peek > TBase && peek <= TLast)){ + *tp++ = peek; + if(mode){ + p++; + peek = p[1]; + } else { + strp += w; + w = w2; + w2 = chartorune(&peek, strp+w); + } + if(tp == _stack + nelem(_stack)) + break; + } + } + while(peek != 0 && ccclkup(peek) != 0){ + _runedecomp(r, peek); + if(r[1] != 0){ + if(tp+1 >= _stack + nelem(_stack)) + break; + *tp++ = r[0]; + *tp++ = r[1]; + } else if(r[0] != 0) + *tp++ = r[0]; + else + *tp++ = peek; + + if(mode){ + p++; + peek = p[1]; + } else { + strp += w; + w = w2; + w2 = chartorune(&peek, strp+w); + } + if(tp == _stack + nelem(_stack)) + break; + } + runecccsort(sp, tp - sp); + + if(compose && ccclkup(*sp) == 0){ + for(rp = sp + 1; rp < tp; rp++){ + r[0] = *sp; + r[1] = *rp; + c = _runerecomp(r); + if(c != 0){ + *sp = c; + for(rrp = rp; rrp > sp; rrp--) + *rrp = rrp[-1]; + sp++; + } else while(rp + 1 < tp && ccclkup(*rp) == ccclkup(*(rp+1))) + rp++; + } + } + + for(; sp < tp; sp++){ + if(mode){ + if(dst < stop) + *dst++ = *sp; + size++; + } else { + w2 = runelen(*sp); + if(sdst+w2 < strstop) + sdst += runetochar(sdst, sp); + size += w2; + } + } + if(mode) + p++; + else + strp += w; + } + if(mode) + *dst = 0; + else + *sdst = 0; + return size; +} + +int +runecomp(Rune *dst, Rune *src, int max) +{ + return runenorm(dst, src, nil, nil, max, 1); +} + +int +runedecomp(Rune *dst, Rune *src, int max) +{ + return runenorm(dst, src, nil, nil, max, 0); +} + +int +utfcomp(char *dst, char *src, int max) +{ + return runenorm(nil, nil, dst, src, max, 1); +} + +int +utfdecomp(char *dst, char *src, int max) +{ + return runenorm(nil, nil, dst, src, max, 0); +} diff --git a/sys/src/libc/port/runetotype.c b/sys/src/libc/port/runetotype.c new file mode 100644 index 000000000..6161858cc --- /dev/null +++ b/sys/src/libc/port/runetotype.c @@ -0,0 +1,22 @@ +#include <u.h> +#include <libc.h> + +#include "runetotypedata" + +Rune +toupperrune(Rune c) +{ + return c + upperlkup(c); +} + +Rune +tolowerrune(Rune c) +{ + return c + lowerlkup(c); +} + +Rune +totitlerune(Rune c) +{ + return c + titlelkup(c); +} diff --git a/sys/src/libc/test/mkfile b/sys/src/libc/test/mkfile index b6b5cce33..2bbc1fa13 100644 --- a/sys/src/libc/test/mkfile +++ b/sys/src/libc/test/mkfile @@ -3,6 +3,14 @@ TEST=\ date\ pow\ + runebreak\ + runenorm\ strchr\ </sys/src/cmd/mktest + +/lib/ucd/%: + cd /lib/ucd && mk $stem + +runebreak.test: /lib/ucd/GraphemeBreakTest.txt /lib/ucd/WordBreakTest.txt +runenorm.test: /lib/ucd/NormalizationTest.txt diff --git a/sys/src/libc/test/runebreak.c b/sys/src/libc/test/runebreak.c new file mode 100644 index 000000000..c9cca51f4 --- /dev/null +++ b/sys/src/libc/test/runebreak.c @@ -0,0 +1,112 @@ +#include <u.h> +#include <libc.h> +#include <bio.h> + +static int +estrtoul(char *s) +{ + char *epr; + Rune code; + + code = strtoul(s, &epr, 16); + if(s == epr) + sysfatal("bad code point hex string"); + return code; +} + +static Rune* +check(Rune *r, Rune* (*fn)(Rune*), char* (*fn2)(char*)) +{ + Rune *r2, *tmp; + char *p, *p2; + + p = smprint("%S", r); + r2 = fn(r); + p2 = fn2(p); + + tmp = runesmprint("%.*s", (int)(p2-p), p); + if(memcmp(r, tmp, r2-r) != 0) + print("utf mismstach\n"); + + free(p); + free(tmp); + return r2; +} + +static void +run(char *file, Rune* (*fn)(Rune*), char* (*fn2)(char*)) +{ + Biobuf *b; + char *p, *dot; + char *pieces[16]; + int i, j, n; + Rune stack[16], ops[16]; + int nstack, nops; + Rune r, *rp, *rp2; + char *line; + + b = Bopen(file, OREAD); + if(b == nil) + sysfatal("could not load composition exclusions: %r"); + + for(;(p = Brdline(b, '\n')) != nil; free(line)){ + p[Blinelen(b)-1] = 0; + line = strdup(p); + if(p[0] == 0 || p[0] == '#') + continue; + if((dot = strstr(p, "#")) != nil) + *dot = 0; + n = getfields(p, pieces, nelem(pieces), 0, " "); + nstack = nops = 0; + for(i = 0; i < n; i++){ + chartorune(&r, pieces[i]); + if(r != L'÷' && r != L'×'){ + r = estrtoul(pieces[i]); + stack[nstack++] = r; + stack[nstack] = 0; + } else { + ops[nops++] = r; + ops[nops] = 0; + } + } + + rp = stack; + for(i = 1; i < nops-1;){ + rp2 = check(rp, fn, fn2); + switch(ops[i]){ + case L'÷': + if(rp2 != rp+1){ + print("break fail %X %X || %s\n", rp[0], rp[1], line); + goto Break; + } + rp++; + i++; + break; + case L'×': + if(rp2 - rp == 0){ + for(j = i; j < nops - 1; j++) + if(ops[j] != L'×') + print("skipped %d %d %s\n", i, nops, line); + goto Break; + } + for(; rp < (rp2-1); rp++, i++){ + if(ops[i] != L'×') + print("skipped %d %d %s\n", i, nops, line); + } + rp = rp2; + i++; + break; + } + } +Break: + ; + } +} + +void +main(int, char) +{ + run("/lib/ucd/GraphemeBreakTest.txt", runegbreak, utfgbreak); + run("/lib/ucd/WordBreakTest.txt", runewbreak, utfwbreak); + exits(nil); +} diff --git a/sys/src/libc/test/runenorm.c b/sys/src/libc/test/runenorm.c new file mode 100644 index 000000000..fb4168e28 --- /dev/null +++ b/sys/src/libc/test/runenorm.c @@ -0,0 +1,92 @@ +#include <u.h> +#include <libc.h> +#include <bio.h> + +static int +estrtoul(char *s) +{ + char *epr; + Rune code; + + code = strtoul(s, &epr, 16); + if(s == epr) + sysfatal("bad code point hex string"); + return code; +} + +void +main(int, char) +{ + Rune buffer1[64]; + Rune buffer2[64]; + char utfbuff1[128]; + char utfbuff2[128]; + char srctmp[128], tmp1[128], tmp2[128]; + char *fields[10]; + char *runes[32]; + char *p; + int n, n2; + int i; + uint fail; + Biobuf *b; + + b = Bopen("/lib/ucd/NormalizationTest.txt", OREAD); + if(b == nil) + sysfatal("could not load composition exclusions: %r"); + + struct { + Rune src[32]; + Rune nfc[32]; + Rune nfd[32]; + } test; + while((p = Brdline(b, '\n')) != nil){ + p[Blinelen(b)-1] = 0; + if(p[0] == 0 || p[0] == '#' || p[0] == '@') + continue; + getfields(p, fields, 6 + 1, 0, ";"); + n = getfields(fields[0], runes, nelem(runes), 0, " "); + for(i = 0; i < n; i++) + test.src[i] = estrtoul(runes[i]); + test.src[i] = 0; + + n = getfields(fields[1], runes, nelem(runes), 0, " "); + for(i = 0; i < n; i++) + test.nfc[i] = estrtoul(runes[i]); + test.nfc[i] = 0; + + n = getfields(fields[2], runes, nelem(runes), 0, " "); + for(i = 0; i < n; i++) + test.nfd[i] = estrtoul(runes[i]); + test.nfd[i] = 0; + + n = runecomp(buffer1, test.src, nelem(buffer1)); + n2 = runedecomp(buffer2, test.src, nelem(buffer2)); + fail = 0; + + if(runestrcmp(buffer1, test.nfc) != 0) + fail |= 1<<0; + if(runestrcmp(buffer2, test.nfd) != 0) + fail |= 1<<1; + if(fail) + print("%d %d %S %S %S %S %S\n", fail, i, test.src, test.nfd, test.nfc, buffer2, buffer1); + assert(n == runestrlen(test.nfc)); + assert(n2 == runestrlen(test.nfd)); + + snprint(srctmp, sizeof tmp1, "%S", test.src); + snprint(tmp1, sizeof tmp1, "%S", test.nfc); + snprint(tmp2, sizeof tmp2, "%S", test.nfd); + + n = utfcomp(utfbuff1, srctmp, nelem(utfbuff1)); + n2 = utfdecomp(utfbuff2, srctmp, nelem(utfbuff2)); + + if(strcmp(utfbuff1, tmp1) != 0) + fail |= 1<<2; + if(strcmp(utfbuff2, tmp2) != 0) + fail |= 1<<3; + if(fail) + print("%d %d %s %s %s %s %s\n", fail, i, srctmp, tmp2, tmp1, utfbuff2, utfbuff1); + assert(n == strlen(tmp1)); + assert(n2 == strlen(tmp2)); + } + exits(nil); +} |