diff options
author | cinap_lenrek <cinap_lenrek@centraldogma> | 2011-09-24 17:06:45 +0200 |
---|---|---|
committer | cinap_lenrek <cinap_lenrek@centraldogma> | 2011-09-24 17:06:45 +0200 |
commit | 13304b7b967c6172cfaa6b31dd4f92348056ed1a (patch) | |
tree | 4c0e56aa2313735a847f529366dee45ee6110a5d | |
parent | 6d6880cec936a13e67e43357538394a5c7f09010 (diff) |
html2ms, tcs, mothra, uhtml: treat ' as special entity, add uhtml(1)
-rw-r--r-- | sys/man/1/uhtml | 46 | ||||
-rw-r--r-- | sys/src/cmd/html2ms.c | 2 | ||||
-rw-r--r-- | sys/src/cmd/mothra/rdhtml.c | 2 | ||||
-rw-r--r-- | sys/src/cmd/tcs/html.c | 25 | ||||
-rw-r--r-- | sys/src/cmd/uhtml.c | 49 |
5 files changed, 95 insertions, 29 deletions
diff --git a/sys/man/1/uhtml b/sys/man/1/uhtml new file mode 100644 index 000000000..5e91a5608 --- /dev/null +++ b/sys/man/1/uhtml @@ -0,0 +1,46 @@ +.TH UHTML 1 +.SH NAME +uhtml \- convert foreign character set HTML file to unicode +.SH SYNOPSIS +.B uhtml +[ +.B -p +] [ +.B -c +.I charset +] [ +.I file +] +.SH DESCRIPTION +HTML comes in various character set encodings +and has special forms to encode characters. To +make it easier to process html, uhtml is used +to normalize it to a unicode only form. +.LP +Uhtml detects the character set of the html input +.I file +and calls +.IR tcs (1) +to convert it to utf replacing html-entity forms +by their unicode character representations except for +.B lt +.B gt +.B amp +.B quot +and +.B apos . +The converted html is written to +standard output. If no +.I file +was given, it is read from standard input. If the +.B -p +option is given, the detected character set is printed and +the program exits without conversion. +In case character set detection fails, the default (utf) +is assumed. This default can be changed with the +.B -c +option. +.SH SOURCE +.B /sys/src/cmd/uhtml.c +.SH SEE ALSO +.IR tcs (1) diff --git a/sys/src/cmd/html2ms.c b/sys/src/cmd/html2ms.c index 9caabe636..770882858 100644 --- a/sys/src/cmd/html2ms.c +++ b/sys/src/cmd/html2ms.c @@ -680,6 +680,8 @@ parserune(int c) return '>'; if(strcmp(buf, "quot") == 0) return '"'; + if(strcmp(buf, "apos") == 0) + return '\''; if(strcmp(buf, "amp") == 0) return '&'; /* use tcs -f html to handle the rest. 
*/ diff --git a/sys/src/cmd/mothra/rdhtml.c b/sys/src/cmd/mothra/rdhtml.c index 121a25589..44ba88c82 100644 --- a/sys/src/cmd/mothra/rdhtml.c +++ b/sys/src/cmd/mothra/rdhtml.c @@ -272,6 +272,8 @@ void pl_rmentities(Hglob *g, char *s){ *t++='>'; else if(strcmp(u, "quot") == 0) *t++='"'; + else if(strcmp(u, "apos") == 0) + *t++='\''; else if(strcmp(u, "amp") == 0) *t++='&'; else { diff --git a/sys/src/cmd/tcs/html.c b/sys/src/cmd/tcs/html.c index 607168092..abf964d0a 100644 --- a/sys/src/cmd/tcs/html.c +++ b/sys/src/cmd/tcs/html.c @@ -11,8 +11,6 @@ struct Hchar Rune r; }; -/* <, >, ", & intentionally omitted */ - /* * Names beginning with _ are names we recognize * (without the underscore) but will not generate, @@ -86,7 +84,7 @@ static Hchar byname[] = {"agrave", 224}, {"alefsym", 8501}, {"alpha", 945}, - /* {"amp", 38}, */ + {"amp", 38}, {"and", 8743}, {"ang", 8736}, {"aring", 229}, @@ -141,7 +139,7 @@ static Hchar byname[] = {"frasl", 8260}, {"gamma", 947}, {"ge", 8805}, - /* {"gt", 62}, */ + {"gt", 62}, {"hArr", 8660}, {"harr", 8596}, {"hearts", 9829}, @@ -173,7 +171,7 @@ static Hchar byname[] = {"lrm", 8206}, {"lsaquo", 8249}, {"lsquo", 8216}, - /* {"lt", 60}, */ + {"lt", 60}, {"macr", 175}, {"mdash", 8212}, {"micro", 181}, @@ -219,7 +217,7 @@ static Hchar byname[] = {"prop", 8733}, {"psi", 968}, {"quad", 8193}, - /* {"quot", 34}, */ + {"quot", 34}, {"rArr", 8658}, {"radic", 8730}, {"rang", 9002}, @@ -416,10 +414,8 @@ html_in(int fd, long *x, struct convert *out) } buf[i] = 0; if(i > 1){ - if((c = findbyname(buf+1)) != Runeerror){ - *r++ = c; - continue; - } + if((c = findbyname(buf+1)) != Runeerror) + goto out; if(i > 2 && buf[1] == '#'){ if(i > 3 && strchr("xX", buf[2])) c = strtol(buf+3, &p, 16); @@ -427,8 +423,7 @@ html_in(int fd, long *x, struct convert *out) c = strtol(buf+2, &p, 10); if(*p || c >= NRUNE || c < 0) goto bad; - *r++ = c; - continue; + goto out; } } bad: @@ -442,6 +437,12 @@ html_in(int fd, long *x, struct convert *out) } } continue; + out: + 
if(strchr("<>&\"'", c)){ + s = ';'; + i = sprint(buf, "&%s", findbyrune(c)); + goto bad; + } } *r++ = c; } diff --git a/sys/src/cmd/uhtml.c b/sys/src/cmd/uhtml.c index 35fc041ae..f21604503 100644 --- a/sys/src/cmd/uhtml.c +++ b/sys/src/cmd/uhtml.c @@ -41,7 +41,7 @@ void main(int argc, char *argv[]) { int pfd[2], pflag = 0; - char *arg[4], *s; + char *arg[4], *s, *p; ARGBEGIN { case 'h': @@ -59,42 +59,54 @@ main(int argc, char *argv[]) if(open(*argv, OREAD) != 1) sysfatal("open: %r"); } - if((nbuf = read(0, buf, sizeof(buf)-1)) < 0) + if((nbuf = readn(0, buf, sizeof(buf)-1)) < 0) sysfatal("read: %r"); buf[nbuf] = 0; - - /* useless BOM marker */ - if(memcmp(buf, "\xEF\xBB\xBF", 3)==0) - memmove(buf, buf+3, nbuf-3); - - for(;;){ - if(s = cistrstr(buf, "encoding=")) + p = buf; + while(nbuf > 0){ + if(memcmp(p, "\xEF\xBB\xBF", 3)==0){ + p += 3; + cset = "utf"; + break; + } + if(memcmp(p, "\xFE\xFF", 2) == 0){ + p += 2; + cset = "unicode-be"; + break; + } + if(memcmp(p, "\xFF\xFE", 2) == 0){ + p += 2; + cset = "unicode-le"; + break; + } + if(s = cistrstr(p, "encoding=")) if(s = strval(s+9)){ cset = s; break; } - if(s = cistrstr(buf, "charset=")) + if(s = cistrstr(p, "charset=")) if(s = strval(s+8)){ cset = s; break; } break; } + nbuf -= p - buf; if(pflag){ print("%s\n", cset); exits(0); } - if(pipe(pfd) < 0) - sysfatal("pipe: %r"); - if(nbuf == 0){ - write(1, buf, 0); + write(1, p, 0); exits(0); } - switch(rfork(RFFDG|RFREND|RFPROC|RFNOWAIT)){ + if(pipe(pfd) < 0) + sysfatal("pipe: %r"); + + switch(rfork(RFFDG|RFREND|RFPROC)){ case -1: sysfatal("fork: %r"); case 0: @@ -114,10 +126,13 @@ main(int argc, char *argv[]) close(pfd[1]); while(nbuf > 0){ - if(write(1, buf, nbuf) != nbuf) + if(write(1, p, nbuf) != nbuf) sysfatal("write: %r"); - if((nbuf = read(0, buf, sizeof(buf))) < 0) + p = buf; + if((nbuf = read(0, p, sizeof(buf))) < 0) sysfatal("read: %r"); } + close(1); + waitpid(); exits(0); } |