summary | refs | log | tree | commit | diff
path: root/sys/src/cmd/html2ms.c
diff options
context:
space:
mode:
author: cinap_lenrek <cinap_lenrek@centraldogma> 2011-09-20 00:38:28 +0200
committer: cinap_lenrek <cinap_lenrek@centraldogma> 2011-09-20 00:38:28 +0200
commit: a31e4f61a4c9afb9696a7ff4ff09e02b3281a2f3 (patch)
tree: b011661fc3c4e6f98d8ff53ce0b103a579853ff1 /sys/src/cmd/html2ms.c
parent: e7df0daa66531eccb2d37f7b66e27d16c9ae4391 (diff)
uhtml: add html to unicode converter, used by mothra and page/html2ms
Diffstat (limited to 'sys/src/cmd/html2ms.c')
-rw-r--r-- sys/src/cmd/html2ms.c 93
1 files changed, 10 insertions, 83 deletions
diff --git a/sys/src/cmd/html2ms.c b/sys/src/cmd/html2ms.c
index 229836216..fd175d72f 100644
--- a/sys/src/cmd/html2ms.c
+++ b/sys/src/cmd/html2ms.c
@@ -296,79 +296,6 @@ parsetag(Tag *t)
return n > 0;
}
-struct {
- char *entity;
- Rune rune;
-} entities[] = {
- "AElig", 198, "Aacute", 193, "Acirc", 194, "Agrave", 192,
- "Alpha", 913, "Aring", 197, "Atilde", 195, "Auml", 196,
- "Beta", 914, "Ccedil", 199, "Chi", 935, "Dagger", 8225,
- "Delta", 916, "ETH", 208, "Eacute", 201, "Ecirc", 202,
- "Egrave", 200, "Epsilon", 917, "Eta", 919, "Euml", 203,
- "Gamma", 915, "Iacute", 205, "Icirc", 206, "Igrave", 204,
- "Iota", 921, "Iuml", 207, "Kappa", 922, "Lambda", 923,
- "Mu", 924, "Ntilde", 209, "Nu", 925, "OElig", 338,
- "Oacute", 211, "Ocirc", 212, "Ograve", 210, "Omega", 937,
- "Omicron", 927, "Oslash", 216, "Otilde", 213, "Ouml", 214,
- "Phi", 934, "Pi", 928, "Prime", 8243, "Psi", 936,
- "Rho", 929, "Scaron", 352, "Sigma", 931, "THORN", 222,
- "Tau", 932, "Theta", 920, "Uacute", 218, "Ucirc", 219,
- "Ugrave", 217, "Upsilon", 933, "Uuml", 220, "Xi", 926,
- "Yacute", 221, "Yuml", 376, "Zeta", 918, "aacute", 225,
- "acirc", 226, "acute", 180, "aelig", 230, "agrave", 224,
- "alefsym", 8501,"alpha", 945, "amp", 38, "and", 8743,
- "ang", 8736, "aring", 229, "asymp", 8776, "atilde", 227,
- "auml", 228, "bdquo", 8222, "beta", 946, "brvbar", 166,
- "bull", 8226, "cap", 8745, "ccedil", 231, "cdots", 8943,
- "cedil", 184, "cent", 162, "chi", 967, "circ", 710,
- "clubs", 9827, "cong", 8773, "copy", 169, "crarr", 8629,
- "cup", 8746, "curren", 164, "dArr", 8659, "dagger", 8224,
- "darr", 8595, "ddots", 8945, "deg", 176, "delta", 948,
- "diams", 9830, "divide", 247, "eacute", 233, "ecirc", 234,
- "egrave", 232, "emdash", 8212, "empty", 8709, "emsp", 8195,
- "endash", 8211, "ensp", 8194, "epsilon", 949, "equiv", 8801,
- "eta", 951, "eth", 240, "euml", 235, "euro", 8364,
- "exist", 8707, "fnof", 402, "forall", 8704, "frac12", 189,
- "frac14", 188, "frac34", 190, "frasl", 8260, "gamma", 947,
- "ge", 8805, "gt", 62, "hArr", 8660, "harr", 8596,
- "hearts", 9829, "hellip", 8230, "iacute", 237, "icirc", 238,
- "iexcl", 161, "igrave", 236, "image", 8465, "infin", 8734,
- "int", 8747, "iota", 953, "iquest", 191, "isin", 8712,
- "iuml", 239, "kappa", 954, "lArr", 8656, "lambda", 955,
- "lang", 9001, "laquo", 171, "larr", 8592, "lceil", 8968,
- "ldots", 8230, "ldquo", 8220, "le", 8804, "lfloor", 8970,
- "lowast", 8727, "loz", 9674, "lrm", 8206, "lsaquo", 8249,
- "lsquo", 8216, "lt", 60, "macr", 175, "mdash", 8212,
- "micro", 181, "middot", 183, "minus", 8722, "mu", 956,
- "nabla", 8711, "nbsp", 160, "ndash", 8211, "ne", 8800,
- "ni", 8715, "not", 172, "notin", 8713, "nsub", 8836,
- "ntilde", 241, "nu", 957, "oacute", 243, "ocirc", 244,
- "oelig", 339, "ograve", 242, "oline", 8254, "omega", 969,
- "omicron", 959, "oplus", 8853, "or", 8744, "ordf", 170,
- "ordm", 186, "oslash", 248, "otilde", 245, "otimes", 8855,
- "ouml", 246, "para", 182, "part", 8706, "permil", 8240,
- "perp", 8869, "phi", 966, "pi", 960, "piv", 982,
- "plusmn", 177, "pound", 163, "prime", 8242, "prod", 8719,
- "prop", 8733, "psi", 968, "quad", 8193, "quot", 34,
- "rArr", 8658, "radic", 8730, "rang", 9002, "raquo", 187,
- "rarr", 8594, "rceil", 8969, "rdquo", 8221, "real", 8476,
- "reg", 174, "rfloor", 8971, "rho", 961, "rlm", 8207,
- "rsaquo", 8250, "rsquo", 8217, "sbquo", 8218, "scaron", 353,
- "sdot", 8901, "sect", 167, "shy", 173, "sigma", 963,
- "sigmaf", 962, "sim", 8764, "sp", 8194, "spades", 9824,
- "sub", 8834, "sube", 8838, "sum", 8721, "sup", 8835,
- "sup1", 185, "sup2", 178, "sup3", 179, "supe", 8839,
- "szlig", 223, "tau", 964, "there4", 8756, "theta", 952,
- "thetasym", 977,"thinsp", 8201, "thorn", 254, "tilde", 732,
- "times", 215, "trade", 8482, "uArr", 8657, "uacute", 250,
- "uarr", 8593, "ucirc", 251, "ugrave", 249, "uml", 168,
- "upsih", 978, "upsilon", 965, "uuml", 252, "varepsilon", 8712,
- "varphi", 981, "varpi", 982, "varrho", 1009, "vdots", 8942,
- "vsigma", 962, "vtheta", 977, "weierp", 8472, "xi", 958,
- "yacute", 253, "yen", 165, "yuml", 255, "zeta", 950,
- "zwj", 8205, "zwnj", 8204,
-};
-
Rune
parserune(int c)
{
@@ -379,7 +306,7 @@ parserune(int c)
n = 0;
if(c == '&'){
while((c = Bgetc(&in)) > 0){
- if(strchr("\n\r\t ;</>", c)){
+ if(strchr(";&</>\n\r\t ", c)){
if(c != ';')
Bungetc(&in);
if(n == 0)
@@ -391,15 +318,15 @@ parserune(int c)
buf[n++] = c;
}
buf[n] = 0;
- if(buf[0] == '#')
- return atoi(buf+1);
- for(i=0; i<nelem(entities); i++){
- n = strcmp(buf, entities[i].entity);
- if(n == 0)
- return entities[i].rune;
- if(n < 0)
- break;
- }
+ if(strcmp(buf, "lt") == 0)
+ return '<';
+ if(strcmp(buf, "gt") == 0)
+ return '>';
+ if(strcmp(buf, "quot") == 0)
+ return '"';
+ if(strcmp(buf, "amp") == 0)
+ return '&';
+ /* use tcs -f html to handle the rest. */
} else {
do {
buf[n++] = c;