From a31e4f61a4c9afb9696a7ff4ff09e02b3281a2f3 Mon Sep 17 00:00:00 2001 From: cinap_lenrek Date: Tue, 20 Sep 2011 00:38:28 +0200 Subject: uhtml: add html to unicode converter, used by mothra and page/html2ms --- sys/src/cmd/html2ms.c | 93 ++++++--------------------------------------------- 1 file changed, 10 insertions(+), 83 deletions(-) (limited to 'sys/src/cmd/html2ms.c') diff --git a/sys/src/cmd/html2ms.c b/sys/src/cmd/html2ms.c index 229836216..fd175d72f 100644 --- a/sys/src/cmd/html2ms.c +++ b/sys/src/cmd/html2ms.c @@ -296,79 +296,6 @@ parsetag(Tag *t) return n > 0; } -struct { - char *entity; - Rune rune; -} entities[] = { - "AElig", 198, "Aacute", 193, "Acirc", 194, "Agrave", 192, - "Alpha", 913, "Aring", 197, "Atilde", 195, "Auml", 196, - "Beta", 914, "Ccedil", 199, "Chi", 935, "Dagger", 8225, - "Delta", 916, "ETH", 208, "Eacute", 201, "Ecirc", 202, - "Egrave", 200, "Epsilon", 917, "Eta", 919, "Euml", 203, - "Gamma", 915, "Iacute", 205, "Icirc", 206, "Igrave", 204, - "Iota", 921, "Iuml", 207, "Kappa", 922, "Lambda", 923, - "Mu", 924, "Ntilde", 209, "Nu", 925, "OElig", 338, - "Oacute", 211, "Ocirc", 212, "Ograve", 210, "Omega", 937, - "Omicron", 927, "Oslash", 216, "Otilde", 213, "Ouml", 214, - "Phi", 934, "Pi", 928, "Prime", 8243, "Psi", 936, - "Rho", 929, "Scaron", 352, "Sigma", 931, "THORN", 222, - "Tau", 932, "Theta", 920, "Uacute", 218, "Ucirc", 219, - "Ugrave", 217, "Upsilon", 933, "Uuml", 220, "Xi", 926, - "Yacute", 221, "Yuml", 376, "Zeta", 918, "aacute", 225, - "acirc", 226, "acute", 180, "aelig", 230, "agrave", 224, - "alefsym", 8501,"alpha", 945, "amp", 38, "and", 8743, - "ang", 8736, "aring", 229, "asymp", 8776, "atilde", 227, - "auml", 228, "bdquo", 8222, "beta", 946, "brvbar", 166, - "bull", 8226, "cap", 8745, "ccedil", 231, "cdots", 8943, - "cedil", 184, "cent", 162, "chi", 967, "circ", 710, - "clubs", 9827, "cong", 8773, "copy", 169, "crarr", 8629, - "cup", 8746, "curren", 164, "dArr", 8659, "dagger", 8224, - "darr", 8595, "ddots", 8945, 
"deg", 176, "delta", 948, - "diams", 9830, "divide", 247, "eacute", 233, "ecirc", 234, - "egrave", 232, "emdash", 8212, "empty", 8709, "emsp", 8195, - "endash", 8211, "ensp", 8194, "epsilon", 949, "equiv", 8801, - "eta", 951, "eth", 240, "euml", 235, "euro", 8364, - "exist", 8707, "fnof", 402, "forall", 8704, "frac12", 189, - "frac14", 188, "frac34", 190, "frasl", 8260, "gamma", 947, - "ge", 8805, "gt", 62, "hArr", 8660, "harr", 8596, - "hearts", 9829, "hellip", 8230, "iacute", 237, "icirc", 238, - "iexcl", 161, "igrave", 236, "image", 8465, "infin", 8734, - "int", 8747, "iota", 953, "iquest", 191, "isin", 8712, - "iuml", 239, "kappa", 954, "lArr", 8656, "lambda", 955, - "lang", 9001, "laquo", 171, "larr", 8592, "lceil", 8968, - "ldots", 8230, "ldquo", 8220, "le", 8804, "lfloor", 8970, - "lowast", 8727, "loz", 9674, "lrm", 8206, "lsaquo", 8249, - "lsquo", 8216, "lt", 60, "macr", 175, "mdash", 8212, - "micro", 181, "middot", 183, "minus", 8722, "mu", 956, - "nabla", 8711, "nbsp", 160, "ndash", 8211, "ne", 8800, - "ni", 8715, "not", 172, "notin", 8713, "nsub", 8836, - "ntilde", 241, "nu", 957, "oacute", 243, "ocirc", 244, - "oelig", 339, "ograve", 242, "oline", 8254, "omega", 969, - "omicron", 959, "oplus", 8853, "or", 8744, "ordf", 170, - "ordm", 186, "oslash", 248, "otilde", 245, "otimes", 8855, - "ouml", 246, "para", 182, "part", 8706, "permil", 8240, - "perp", 8869, "phi", 966, "pi", 960, "piv", 982, - "plusmn", 177, "pound", 163, "prime", 8242, "prod", 8719, - "prop", 8733, "psi", 968, "quad", 8193, "quot", 34, - "rArr", 8658, "radic", 8730, "rang", 9002, "raquo", 187, - "rarr", 8594, "rceil", 8969, "rdquo", 8221, "real", 8476, - "reg", 174, "rfloor", 8971, "rho", 961, "rlm", 8207, - "rsaquo", 8250, "rsquo", 8217, "sbquo", 8218, "scaron", 353, - "sdot", 8901, "sect", 167, "shy", 173, "sigma", 963, - "sigmaf", 962, "sim", 8764, "sp", 8194, "spades", 9824, - "sub", 8834, "sube", 8838, "sum", 8721, "sup", 8835, - "sup1", 185, "sup2", 178, "sup3", 179, "supe", 8839, 
- "szlig", 223, "tau", 964, "there4", 8756, "theta", 952, - "thetasym", 977,"thinsp", 8201, "thorn", 254, "tilde", 732, - "times", 215, "trade", 8482, "uArr", 8657, "uacute", 250, - "uarr", 8593, "ucirc", 251, "ugrave", 249, "uml", 168, - "upsih", 978, "upsilon", 965, "uuml", 252, "varepsilon", 8712, - "varphi", 981, "varpi", 982, "varrho", 1009, "vdots", 8942, - "vsigma", 962, "vtheta", 977, "weierp", 8472, "xi", 958, - "yacute", 253, "yen", 165, "yuml", 255, "zeta", 950, - "zwj", 8205, "zwnj", 8204, -}; - Rune parserune(int c) { @@ -379,7 +306,7 @@ parserune(int c) n = 0; if(c == '&'){ while((c = Bgetc(&in)) > 0){ - if(strchr("\n\r\t ;", c)){ + if(strchr(";&\n\r\t ", c)){ if(c != ';') Bungetc(&in); if(n == 0) @@ -391,15 +318,15 @@ parserune(int c) buf[n++] = c; } buf[n] = 0; - if(buf[0] == '#') - return atoi(buf+1); - for(i=0; i<nelem(entities); i++){ - n = strcmp(buf, entities[i].entity); - if(n == 0) - return entities[i].rune; - if(n < 0) - break; - } + if(strcmp(buf, "lt") == 0) + return '<'; + if(strcmp(buf, "gt") == 0) + return '>'; + if(strcmp(buf, "quot") == 0) + return '"'; + if(strcmp(buf, "amp") == 0) + return '&'; + /* use tcs -f html to handle the rest. */ } else { do { buf[n++] = c; -- cgit v1.2.3