diff options
author | Taru Karttunen <taruti@taruti.net> | 2011-03-30 15:46:40 +0300 |
---|---|---|
committer | Taru Karttunen <taruti@taruti.net> | 2011-03-30 15:46:40 +0300 |
commit | e5888a1ffdae813d7575f5fb02275c6bb07e5199 (patch) | |
tree | d8d51eac403f07814b9e936eed0c9a79195e2450 /sys/src/libhtml/lex.c |
Import sources from 2011-03-30 iso image
Diffstat (limited to 'sys/src/libhtml/lex.c')
-rwxr-xr-x | sys/src/libhtml/lex.c | 1502 |
1 files changed, 1502 insertions, 0 deletions
diff --git a/sys/src/libhtml/lex.c b/sys/src/libhtml/lex.c new file mode 100755 index 000000000..ca8fc77d6 --- /dev/null +++ b/sys/src/libhtml/lex.c @@ -0,0 +1,1502 @@ +#include <u.h> +#include <libc.h> +#include <draw.h> +#include <ctype.h> +#include <html.h> +#include "impl.h" + +typedef struct TokenSource TokenSource; +struct TokenSource +{ + int i; // index of next byte to use + uchar* data; // all the data + int edata; // data[0:edata] is valid + int chset; // one of US_Ascii, etc. + int mtype; // TextHtml or TextPlain +}; + +enum { + EOF = -2, + EOB = -1 +}; + +#define ISNAMCHAR(c) ((c)<256 && (isalpha(c) || isdigit(c) || (c) == '-' || (c) == '.')) + +#define SMALLBUFSIZE 240 +#define BIGBUFSIZE 2000 + +// HTML 4.0 tag names. +// Keep sorted, and in correspondence with enum in iparse.h. +Rune* tagnames[] = { + L" ", + L"!", + L"a", + L"abbr", + L"acronym", + L"address", + L"applet", + L"area", + L"b", + L"base", + L"basefont", + L"bdo", + L"big", + L"blink", + L"blockquote", + L"body", + L"bq", + L"br", + L"button", + L"caption", + L"center", + L"cite", + L"code", + L"col", + L"colgroup", + L"dd", + L"del", + L"dfn", + L"dir", + L"div", + L"dl", + L"dt", + L"em", + L"fieldset", + L"font", + L"form", + L"frame", + L"frameset", + L"h1", + L"h2", + L"h3", + L"h4", + L"h5", + L"h6", + L"head", + L"hr", + L"html", + L"i", + L"iframe", + L"img", + L"input", + L"ins", + L"isindex", + L"kbd", + L"label", + L"legend", + L"li", + L"link", + L"map", + L"menu", + L"meta", + L"nobr", + L"noframes", + L"noscript", + L"object", + L"ol", + L"optgroup", + L"option", + L"p", + L"param", + L"pre", + L"q", + L"s", + L"samp", + L"script", + L"select", + L"small", + L"span", + L"strike", + L"strong", + L"style", + L"sub", + L"sup", + L"table", + L"tbody", + L"td", + L"textarea", + L"tfoot", + L"th", + L"thead", + L"title", + L"tr", + L"tt", + L"u", + L"ul", + L"var" +}; + +// HTML 4.0 attribute names. +// Keep sorted, and in correspondence with enum in impl.h. +Rune* attrnames[] = { + L"abbr", + L"accept-charset", + L"access-key", + L"action", + L"align", + L"alink", + L"alt", + L"archive", + L"axis", + L"background", + L"bgcolor", + L"border", + L"cellpadding", + L"cellspacing", + L"char", + L"charoff", + L"charset", + L"checked", + L"cite", + L"class", + L"classid", + L"clear", + L"code", + L"codebase", + L"codetype", + L"color", + L"cols", + L"colspan", + L"compact", + L"content", + L"coords", + L"data", + L"datetime", + L"declare", + L"defer", + L"dir", + L"disabled", + L"enctype", + L"face", + L"for", + L"frame", + L"frameborder", + L"headers", + L"height", + L"href", + L"hreflang", + L"hspace", + L"http-equiv", + L"id", + L"ismap", + L"label", + L"lang", + L"link", + L"longdesc", + L"marginheight", + L"marginwidth", + L"maxlength", + L"media", + L"method", + L"multiple", + L"name", + L"nohref", + L"noresize", + L"noshade", + L"nowrap", + L"object", + L"onblur", + L"onchange", + L"onclick", + L"ondblclick", + L"onfocus", + L"onkeypress", + L"onkeyup", + L"onload", + L"onmousedown", + L"onmousemove", + L"onmouseout", + L"onmouseover", + L"onmouseup", + L"onreset", + L"onselect", + L"onsubmit", + L"onunload", + L"profile", + L"prompt", + L"readonly", + L"rel", + L"rev", + L"rows", + L"rowspan", + L"rules", + L"scheme", + L"scope", + L"scrolling", + L"selected", + L"shape", + L"size", + L"span", + L"src", + L"standby", + L"start", + L"style", + L"summary", + L"tabindex", + L"target", + L"text", + L"title", + L"type", + L"usemap", + L"valign", + L"value", + L"valuetype", + L"version", + L"vlink", + L"vspace", + L"width" +}; + + +// Character entity to unicode character number map. +// Keep sorted by name. +StringInt chartab[]= { + {L"AElig", 198}, + {L"Aacute", 193}, + {L"Acirc", 194}, + {L"Agrave", 192}, + {L"Alpha", 913}, + {L"Aring", 197}, + {L"Atilde", 195}, + {L"Auml", 196}, + {L"Beta", 914}, + {L"Ccedil", 199}, + {L"Chi", 935}, + {L"Dagger", 8225}, + {L"Delta", 916}, + {L"ETH", 208}, + {L"Eacute", 201}, + {L"Ecirc", 202}, + {L"Egrave", 200}, + {L"Epsilon", 917}, + {L"Eta", 919}, + {L"Euml", 203}, + {L"Gamma", 915}, + {L"Iacute", 205}, + {L"Icirc", 206}, + {L"Igrave", 204}, + {L"Iota", 921}, + {L"Iuml", 207}, + {L"Kappa", 922}, + {L"Lambda", 923}, + {L"Mu", 924}, + {L"Ntilde", 209}, + {L"Nu", 925}, + {L"OElig", 338}, + {L"Oacute", 211}, + {L"Ocirc", 212}, + {L"Ograve", 210}, + {L"Omega", 937}, + {L"Omicron", 927}, + {L"Oslash", 216}, + {L"Otilde", 213}, + {L"Ouml", 214}, + {L"Phi", 934}, + {L"Pi", 928}, + {L"Prime", 8243}, + {L"Psi", 936}, + {L"Rho", 929}, + {L"Scaron", 352}, + {L"Sigma", 931}, + {L"THORN", 222}, + {L"Tau", 932}, + {L"Theta", 920}, + {L"Uacute", 218}, + {L"Ucirc", 219}, + {L"Ugrave", 217}, + {L"Upsilon", 933}, + {L"Uuml", 220}, + {L"Xi", 926}, + {L"Yacute", 221}, + {L"Yuml", 376}, + {L"Zeta", 918}, + {L"aacute", 225}, + {L"acirc", 226}, + {L"acute", 180}, + {L"aelig", 230}, + {L"agrave", 224}, + {L"alefsym", 8501}, + {L"alpha", 945}, + {L"amp", 38}, + {L"and", 8743}, + {L"ang", 8736}, + {L"aring", 229}, + {L"asymp", 8776}, + {L"atilde", 227}, + {L"auml", 228}, + {L"bdquo", 8222}, + {L"beta", 946}, + {L"brvbar", 166}, + {L"bull", 8226}, + {L"cap", 8745}, + {L"ccedil", 231}, + {L"cdots", 8943}, + {L"cedil", 184}, + {L"cent", 162}, + {L"chi", 967}, + {L"circ", 710}, + {L"clubs", 9827}, + {L"cong", 8773}, + {L"copy", 169}, + {L"crarr", 8629}, + {L"cup", 8746}, + {L"curren", 164}, + {L"dArr", 8659}, + {L"dagger", 8224}, + {L"darr", 8595}, + {L"ddots", 8945}, + {L"deg", 176}, + {L"delta", 948}, + {L"diams", 9830}, + {L"divide", 247}, + {L"eacute", 233}, + {L"ecirc", 234}, + {L"egrave", 232}, + {L"emdash", 8212}, /* non-standard but commonly used */ + {L"empty", 8709}, + {L"emsp", 8195}, + {L"endash", 8211}, /* non-standard but commonly used */ + {L"ensp", 8194}, + {L"epsilon", 949}, + {L"equiv", 8801}, + {L"eta", 951}, + {L"eth", 240}, + {L"euml", 235}, + {L"euro", 8364}, + {L"exist", 8707}, + {L"fnof", 402}, + {L"forall", 8704}, + {L"frac12", 189}, + {L"frac14", 188}, + {L"frac34", 190}, + {L"frasl", 8260}, + {L"gamma", 947}, + {L"ge", 8805}, + {L"gt", 62}, + {L"hArr", 8660}, + {L"harr", 8596}, + {L"hearts", 9829}, + {L"hellip", 8230}, + {L"iacute", 237}, + {L"icirc", 238}, + {L"iexcl", 161}, + {L"igrave", 236}, + {L"image", 8465}, + {L"infin", 8734}, + {L"int", 8747}, + {L"iota", 953}, + {L"iquest", 191}, + {L"isin", 8712}, + {L"iuml", 239}, + {L"kappa", 954}, + {L"lArr", 8656}, + {L"lambda", 955}, + {L"lang", 9001}, + {L"laquo", 171}, + {L"larr", 8592}, + {L"lceil", 8968}, + {L"ldots", 8230}, + {L"ldquo", 8220}, + {L"le", 8804}, + {L"lfloor", 8970}, + {L"lowast", 8727}, + {L"loz", 9674}, + {L"lrm", 8206}, + {L"lsaquo", 8249}, + {L"lsquo", 8216}, + {L"lt", 60}, + {L"macr", 175}, + {L"mdash", 8212}, + {L"micro", 181}, + {L"middot", 183}, + {L"minus", 8722}, + {L"mu", 956}, + {L"nabla", 8711}, + {L"nbsp", 160}, + {L"ndash", 8211}, + {L"ne", 8800}, + {L"ni", 8715}, + {L"not", 172}, + {L"notin", 8713}, + {L"nsub", 8836}, + {L"ntilde", 241}, + {L"nu", 957}, + {L"oacute", 243}, + {L"ocirc", 244}, + {L"oelig", 339}, + {L"ograve", 242}, + {L"oline", 8254}, + {L"omega", 969}, + {L"omicron", 959}, + {L"oplus", 8853}, + {L"or", 8744}, + {L"ordf", 170}, + {L"ordm", 186}, + {L"oslash", 248}, + {L"otilde", 245}, + {L"otimes", 8855}, + {L"ouml", 246}, + {L"para", 182}, + {L"part", 8706}, + {L"permil", 8240}, + {L"perp", 8869}, + {L"phi", 966}, + {L"pi", 960}, + {L"piv", 982}, + {L"plusmn", 177}, + {L"pound", 163}, + {L"prime", 8242}, + {L"prod", 8719}, + {L"prop", 8733}, + {L"psi", 968}, + {L"quad", 8193}, + {L"quot", 34}, + {L"rArr", 8658}, + {L"radic", 8730}, + {L"rang", 9002}, + {L"raquo", 187}, + {L"rarr", 8594}, + {L"rceil", 8969}, + {L"rdquo", 8221}, + {L"real", 8476}, + {L"reg", 174}, + {L"rfloor", 8971}, + {L"rho", 961}, + {L"rlm", 8207}, + {L"rsaquo", 8250}, + {L"rsquo", 8217}, + {L"sbquo", 8218}, + {L"scaron", 353}, + {L"sdot", 8901}, + {L"sect", 167}, + {L"shy", 173}, + {L"sigma", 963}, + {L"sigmaf", 962}, + {L"sim", 8764}, + {L"sp", 8194}, + {L"spades", 9824}, + {L"sub", 8834}, + {L"sube", 8838}, + {L"sum", 8721}, + {L"sup", 8835}, + {L"sup1", 185}, + {L"sup2", 178}, + {L"sup3", 179}, + {L"supe", 8839}, + {L"szlig", 223}, + {L"tau", 964}, + {L"there4", 8756}, + {L"theta", 952}, + {L"thetasym", 977}, + {L"thinsp", 8201}, + {L"thorn", 254}, + {L"tilde", 732}, + {L"times", 215}, + {L"trade", 8482}, + {L"uArr", 8657}, + {L"uacute", 250}, + {L"uarr", 8593}, + {L"ucirc", 251}, + {L"ugrave", 249}, + {L"uml", 168}, + {L"upsih", 978}, + {L"upsilon", 965}, + {L"uuml", 252}, + {L"varepsilon", 8712}, + {L"varphi", 981}, + {L"varpi", 982}, + {L"varrho", 1009}, + {L"vdots", 8942}, + {L"vsigma", 962}, + {L"vtheta", 977}, + {L"weierp", 8472}, + {L"xi", 958}, + {L"yacute", 253}, + {L"yen", 165}, + {L"yuml", 255}, + {L"zeta", 950}, + {L"zwj", 8205}, + {L"zwnj", 8204} +}; +#define NCHARTAB (sizeof(chartab)/sizeof(chartab[0])) + +// Characters Winstart..Winend are those that Windows +// uses interpolated into the Latin1 set. +// They aren't supposed to appear in HTML, but they do.... +enum { + Winstart = 127, + Winend = 159 +}; + +static int winchars[]= { 8226, // 8226 is a bullet + 8226, 8226, 8218, 402, 8222, 8230, 8224, 8225, + 710, 8240, 352, 8249, 338, 8226, 8226, 8226, + 8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212, + 732, 8482, 353, 8250, 339, 8226, 8226, 376}; + +static StringInt* tagtable; // initialized from tagnames +static StringInt* attrtable; // initialized from attrnames + +static void lexinit(void); +static int getplaindata(TokenSource* ts, Token* a, int* pai); +static int getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai); +static int getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai, int findtag); +static int gettag(TokenSource* ts, int starti, Token* a, int* pai); +static Rune* buftostr(Rune* s, Rune* buf, int j); +static int comment(TokenSource* ts); +static int findstr(TokenSource* ts, Rune* s); +static int ampersand(TokenSource* ts); +static int lowerc(int c); +static int getchar(TokenSource* ts); +static void ungetchar(TokenSource* ts, int c); +static void backup(TokenSource* ts, int savei); +static void freeinsidetoken(Token* t); +static void freeattrs(Attr* ahead); +static Attr* newattr(int attid, Rune* value, Attr* link); +static int Tconv(Fmt* f); + +int dbglex = 0; +static int lexinited = 0; + +static void +lexinit(void) +{ + tagtable = _makestrinttab(tagnames, Numtags); + attrtable = _makestrinttab(attrnames, Numattrs); + fmtinstall('T', Tconv); + lexinited = 1; +} + +static TokenSource* +newtokensource(uchar* data, int edata, int chset, int mtype) +{ + TokenSource* ans; + + assert(chset == US_Ascii || chset == ISO_8859_1 || + chset == UTF_8 || chset == Unicode); + ans = (TokenSource*)emalloc(sizeof(TokenSource)); + ans->i = 0; + ans->data = data; + ans->edata = edata; + ans->chset = chset; + ans->mtype = mtype; + return ans; +} + +enum { + ToksChunk = 500, +}; + +// Call this to get the tokens. +// The number of returned tokens is returned in *plen. +Token* +_gettoks(uchar* data, int datalen, int chset, int mtype, int* plen) +{ + TokenSource* ts; + Token* a; + int alen; + int ai; + int starti; + int c; + int tag; + + if(!lexinited) + lexinit(); + ts = newtokensource(data, datalen, chset, mtype); + if(dbglex) + fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata); + alen = 0; + ai = 0; + a = 0; + if(ts->mtype == TextHtml) { + for(;;) { + if(alen - ai < ToksChunk/32) { + alen += ToksChunk; + a = erealloc(a, alen*sizeof *a); + } + starti = ts->i; + c = getchar(ts); + if(c < 0) + break; + if(c == '<') { + tag = gettag(ts, starti, a, &ai); + if(tag == Tscript || tag == Tstyle) { + // special rules for getting Data after.... + starti = ts->i; + c = getchar(ts); + tag = getscriptdata(ts, c, starti, a, &ai, tag); + } + } + else + tag = getdata(ts, c, starti, a, &ai); + if(tag == -1) + break; + else if(dbglex > 1 && tag != Comment) + fprint(2, "lex: got token %T\n", &a[ai-1]); + } + } + else { + // plain text (non-html) tokens + for(;;) { + if(alen - ai < ToksChunk/32) { + alen += ToksChunk; + a = erealloc(a, alen*sizeof *a); + } + tag = getplaindata(ts, a, &ai); + if(tag == -1) + break; + if(dbglex > 1) + fprint(2, "lex: got token %T\n", &a[ai]); + } + } + free(ts); + if(dbglex) + fprint(2, "lex: returning %d tokens\n", ai); + *plen = ai; + if(ai == 0){ + free(a); + a = 0; + } + return a; +} + +// For case where source isn't HTML. +// Just make data tokens, one per line (or partial line, +// at end of buffer), ignoring non-whitespace control +// characters and dumping \r's. +// If find non-empty token, fill in a[*pai], bump *pai, and return Data. +// Otherwise return -1; +static int +getplaindata(TokenSource* ts, Token* a, int* pai) +{ + Rune* s; + int j; + int starti; + int c; + Token* tok; + Rune buf[BIGBUFSIZE]; + + s = nil; + j = 0; + starti = ts->i; + for(c = getchar(ts); c >= 0; c = getchar(ts)) { + if(c < ' ') { + if(isspace(c)) { + if(c == '\r') { + // ignore it unless no following '\n', + // in which case treat it like '\n' + c = getchar(ts); + if(c != '\n') { + if(c >= 0) + ungetchar(ts, c); + c = '\n'; + } + } + } + else + c = 0; + } + if(c != 0) { + buf[j++] = c; + if(j == nelem(buf)-1) { + s = buftostr(s, buf, j); + j = 0; + } + } + if(c == '\n') + break; + } + s = buftostr(s, buf, j); + if(s == nil) + return -1; + tok = &a[(*pai)++]; + tok->tag = Data; + tok->text = s; + tok->attr = nil; + tok->starti = starti; + return Data; +} + +// Return concatenation of s and buf[0:j] +static Rune* +buftostr(Rune* s, Rune* buf, int j) +{ + int i; + + if(s == nil) + s = _Strndup(buf, j); + else { + i = _Strlen(s); + s = realloc(s, ( i+j+1)*sizeof *s); + memcpy(&s[i], buf, j*sizeof *s); + s[i+j] = 0; + } + return s; +} + +// Gather data up to next start-of-tag or end-of-buffer. +// Translate entity references (&). +// Ignore non-whitespace control characters and get rid of \r's. +// If find non-empty token, fill in a[*pai], bump *pai, and return Data. +// Otherwise return -1; +static int +getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai) +{ + Rune* s; + int j; + int c; + Token* tok; + Rune buf[SMALLBUFSIZE]; + + s = nil; + j = 0; + for(c = firstc; c >= 0; c = getchar(ts)){ + if(c == '&') { + c = ampersand(ts); + if(c < 0) + break; + } + else if(c < ' ') { + if(isspace(c)) { + if(c == '\r') { + // ignore it unless no following '\n', + // in which case treat it like '\n' + c = getchar(ts); + if(c != '\n') { + if(c >= 0) + ungetchar(ts, c); + c = '\n'; + } + } + } + else { + if(warn) + fprint(2, "warning: non-whitespace control character %d ignored\n", c); + c = 0; + } + } + else if(c == '<') { + ungetchar(ts, c); + break; + } + if(c != 0) { + buf[j++] = c; + if(j == nelem(buf)-1) { + s = buftostr(s, buf, j); + j = 0; + } + } + } + s = buftostr(s, buf, j); + if(s == nil) + return -1; + tok = &a[(*pai)++]; + tok->tag = Data; + tok->text = s; + tok->attr = nil; + tok->starti = starti; + return Data; +} + +// The rules for lexing scripts are different (ugh). +// Gather up everything until see an "</" tagnames[tok] ">" +static int +getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai, int findtag) +{ + Rune* s; + int j; + int tstarti; + int savei; + int c; + int tag; + int done; + Token* tok; + Rune buf[BIGBUFSIZE]; + + s = nil; + j = 0; + tstarti = starti; + c = firstc; + done = 0; + while(c >= 0) { + if(c == '<') { + // other browsers ignore stuff to end of line after <! + savei = ts->i; + c = getchar(ts); + if(c == '!') { + if(comment(ts) == -1) + break; + if(c == '\r') + c = getchar(ts); + if(c == '\n') + c = getchar(ts); + } + else if(c >= 0) { + backup(ts, savei); + tag = gettag(ts, tstarti, a, pai); + if(tag == -1) + break; + if(tag != Comment) + (*pai)--; + backup(ts, tstarti); + if(tag == findtag + RBRA) { + done = 1; + break; + } + // here tag was not the one we were looking for, so take as regular data + c = getchar(ts); + } + } + if(c < 0) + break; + if(c != 0) { + buf[j++] = c; + if(j == nelem(buf)-1) { + s = buftostr(s, buf, j); + j = 0; + } + } + tstarti = ts->i; + c = getchar(ts); + } + if(done || ts->i == ts->edata) { + s = buftostr(s, buf, j); + tok = &a[(*pai)++]; + tok->tag = Data; + tok->text = s; + tok->attr = nil; + tok->starti = starti; + return Data; + } + free(s); + backup(ts, starti); + return -1; +} + +// We've just seen a '<'. Gather up stuff to closing '>' (if buffer +// ends before then, return -1). +// If it's a tag, look up the name, gather the attributes, and return +// the appropriate token. +// Else it's either just plain data or some kind of ignorable stuff: +// return Data or Comment as appropriate. +// If it's not a Comment, put it in a[*pai] and bump *pai. +static int +gettag(TokenSource* ts, int starti, Token* a, int* pai) +{ + int rbra; + int ans; + Attr* al; + int nexti; + int c; + int ti; + int afnd; + int attid; + int quote; + Rune* val; + int nv; + int i; + int tag; + Token* tok; + Rune buf[BIGBUFSIZE]; + + rbra = 0; + nexti = ts->i; + tok = &a[*pai]; + tok->tag = Notfound; + tok->text = nil; + tok->attr = nil; + tok->starti = starti; + c = getchar(ts); + if(c == '/') { + rbra = RBRA; + c = getchar(ts); + } + if(c < 0) + goto eob_done; + if(c >= 256 || !isalpha(c)) { + // not a tag + if(c == '!') { + ans = comment(ts); + if(ans != -1) + return ans; + goto eob_done; + } + else { + backup(ts, nexti); + tok->tag = Data; + tok->text = _Strdup(L"<"); + (*pai)++; + return Data; + } + } + // c starts a tagname + buf[0] = c; + i = 1; + while(1) { + c = getchar(ts); + if(c < 0) + goto eob_done; + if(!ISNAMCHAR(c)) + break; + // if name is bigger than buf it won't be found anyway... + if(i < BIGBUFSIZE) + buf[i++] = c; + } + if(_lookup(tagtable, Numtags, buf, i, &tag)) + tok->tag = tag + rbra; + else + tok->text = _Strndup(buf, i); // for warning print, in build + // attribute gathering loop + al = nil; + while(1) { + // look for "ws name" or "ws name ws = ws val" (ws=whitespace) + // skip whitespace +attrloop_continue: + while(c < 256 && isspace(c)) { + c = getchar(ts); + if(c < 0) + goto eob_done; + } + if(c == '>') + goto attrloop_done; + if(c == '<') { + if(warn) + fprint(2, "warning: unclosed tag\n"); + ungetchar(ts, c); + goto attrloop_done; + } + if(c >= 256 || !isalpha(c)) { + if(warn) + fprint(2, "warning: expected attribute name\n"); + // skipt to next attribute name + while(1) { + c = getchar(ts); + if(c < 0) + goto eob_done; + if(c < 256 && isalpha(c)) + goto attrloop_continue; + if(c == '<') { + if(warn) + fprint(2, "warning: unclosed tag\n"); + ungetchar(ts, 60); + goto attrloop_done; + } + if(c == '>') + goto attrloop_done; + } + } + // gather attribute name + buf[0] = c; + i = 1; + while(1) { + c = getchar(ts); + if(c < 0) + goto eob_done; + if(!ISNAMCHAR(c)) + break; + if(i < BIGBUFSIZE-1) + buf[i++] = c; + } + afnd = _lookup(attrtable, Numattrs, buf, i, &attid); + if(warn && !afnd) { + buf[i] = 0; + fprint(2, "warning: unknown attribute name %S\n", buf); + } + // skip whitespace + while(c < 256 && isspace(c)) { + c = getchar(ts); + if(c < 0) + goto eob_done; + } + if(c != '=') { + if(afnd) + al = newattr(attid, nil, al); + goto attrloop_continue; + } + //# c is '=' here; skip whitespace + while(1) { + c = getchar(ts); + if(c < 0) + goto eob_done; + if(c >= 256 || !isspace(c)) + break; + } + quote = 0; + if(c == '\'' || c == '"') { + quote = c; + c = getchar(ts); + if(c < 0) + goto eob_done; + } + val = nil; + nv = 0; + while(1) { +valloop_continue: + if(c < 0) + goto eob_done; + if(c == '>') { + if(quote) { + // c might be part of string (though not good style) + // but if line ends before close quote, assume + // there was an unmatched quote + ti = ts->i; + while(1) { + c = getchar(ts); + if(c < 0) + goto eob_done; + if(c == quote) { + backup(ts, ti); + buf[nv++] = '>'; + if(nv == BIGBUFSIZE-1) { + val = buftostr(val, buf, nv); + nv = 0; + } + c = getchar(ts); + goto valloop_continue; + } + if(c == '\n') { + if(warn) + fprint(2, "warning: apparent unmatched quote\n"); + backup(ts, ti); + c = '>'; + goto valloop_done; + } + } + } + else + goto valloop_done; + } + if(quote) { + if(c == quote) { + c = getchar(ts); + if(c < 0) + goto eob_done; + goto valloop_done; + } + if(c == '\r') { + c = getchar(ts); + goto valloop_continue; + } + if(c == '\t' || c == '\n') + c = ' '; + } + else { + if(c < 256 && isspace(c)) + goto valloop_done; + } + if(c == '&') { + c = ampersand(ts); + if(c == -1) + goto eob_done; + } + buf[nv++] = c; + if(nv == BIGBUFSIZE-1) { + val = buftostr(val, buf, nv); + nv = 0; + } + c = getchar(ts); + } +valloop_done: + if(afnd) { + val = buftostr(val, buf, nv); + al = newattr(attid, val, al); + } + } + +attrloop_done: + tok->attr = al; + (*pai)++; + return tok->tag; + +eob_done: + if(warn) + fprint(2, "warning: incomplete tag at end of page\n"); + backup(ts, nexti); + tok->tag = Data; + tok->text = _Strdup(L"<"); + return Data; +} + +// We've just read a '<!' at position starti, +// so this may be a comment or other ignored section, or it may +// be just a literal string if there is no close before end of file +// (other browsers do that). +// The accepted practice seems to be (note: contrary to SGML spec!): +// If see <!--, look for --> to close, or if none, > to close. +// If see <!(not --), look for > to close. +// If no close before end of file, leave original characters in as literal data. +// +// If we see ignorable stuff, return Comment. +// Else return nil (caller should back up and try again when more data arrives, +// unless at end of file, in which case caller should just make '<' a data token). +static int +comment(TokenSource* ts) +{ + int nexti; + int havecomment; + int c; + + nexti = ts->i; + havecomment = 0; + c = getchar(ts); + if(c == '-') { + c = getchar(ts); + if(c == '-') { + if(findstr(ts, L"-->")) + havecomment = 1; + else + backup(ts, nexti); + } + } + if(!havecomment) { + if(c == '>') + havecomment = 1; + else if(c >= 0) { + if(findstr(ts, L">")) + havecomment = 1; + } + } + if(havecomment) + return Comment; + return -1; +} + +// Look for string s in token source. +// If found, return 1, with buffer at next char after s, +// else return 0 (caller should back up). +static int +findstr(TokenSource* ts, Rune* s) +{ + int c0; + int n; + int nexti; + int i; + int c; + + c0 = s[0]; + n = runestrlen(s); + while(1) { + c = getchar(ts); + if(c < 0) + break; + if(c == c0) { + if(n == 1) + return 1; + nexti = ts->i; + for(i = 1; i < n; i++) { + c = getchar(ts); + if(c < 0) + goto mainloop_done; + if(c != s[i]) + break; + } + if(i == n) + return 1; + backup(ts, nexti); + } + } +mainloop_done: + return 0; +} + +// We've just read an '&'; look for an entity reference +// name, and if found, return translated char. +// if there is a complete entity name but it isn't known, +// back up to just past the '&' and return '&'. +// If the entity can't be completed in the current buffer, back up +// to the '&' and return -1. +static int +ampersand(TokenSource* ts) +{ + int savei; + int c; + int fnd; + int ans; + int v; + int k; + Rune buf[25]; + + savei = ts->i; + c = getchar(ts); + fnd = 0; + ans = -1; + if(c == '#') { + c = getchar(ts); + v = 0; + if(c == 'X' || c == 'x') + for(c = getchar(ts); c < 256; c = getchar(ts)) + if(c >= '0' && c <= '9') + v = v*16+c-'0'; + else if(c >= 'A' && c<= 'F') + v = v*16+c-'A'+10; + else if(c >= 'a' && c <= 'f') + v = v*16+c-'a'+10; + else + break; + else + while(c >= 0) { + if(!(c < 256 && isdigit(c))) + break; + v = v*10 + c - 48; + c = getchar(ts); + } + if(c >= 0) { + if(!(c == ';' || c == '\n' || c == '\r')) + ungetchar(ts, c); + c = v; + if(c == 160) + c = 160; + if(c >= Winstart && c <= Winend) { + c = winchars[c - Winstart]; + } + ans = c; + fnd = 1; + } + } + else if(c < 256 && isalpha(c)) { + buf[0] = c; + k = 1; + while(1) { + c = getchar(ts); + if(c < 0) + break; + if(c < 256 && (isalpha(c) || isdigit(c))) { + if(k < nelem(buf)-1) + buf[k++] = c; + } + else { + if(!(c == ';' || c == '\n' || c == '\r')) + ungetchar(ts, c); + break; + } + } + if(c >= 256 || c != '=' && !(isalpha(c) || isdigit(c))) + fnd = _lookup(chartab, NCHARTAB, buf, k, &ans); + } + if(!fnd) { + backup(ts, savei); + ans = '&'; + } + return ans; +} + +// Get next char, obeying ts.chset. +// Returns -1 if no complete character left before current end of data. +static int +getchar(TokenSource* ts) +{ + uchar* buf; + int c; + int n; + int ok; + Rune r; + + if(ts->i >= ts->edata) + return -1; + buf = ts->data; + c = buf[ts->i]; + switch(ts->chset) { + case ISO_8859_1: + if(c >= Winstart && c <= Winend) + c = winchars[c - Winstart]; + ts->i++; + break; + case US_Ascii: + if(c > 127) { + if(warn) + fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c); + } + ts->i++; + break; + case UTF_8: + ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i); + n = chartorune(&r, (char*)(buf+ts->i)); + if(ok) { + if(warn && c == 0x80) + fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]); + ts->i += n; + c = r; + } + else { + // not enough bytes in buf to complete utf-8 char + ts->i = ts->edata; // mark "all used" + c = -1; + } + break; + case Unicode: + if(ts->i < ts->edata - 1) { + //standards say most-significant byte first + c = (c << 8)|(buf[ts->i + 1]); + ts->i += 2; + } + else { + ts->i = ts->edata; // mark "all used" + c = -1; + } + break; + default: + return -1; + } + return c; +} + +// Assuming c was the last character returned by getchar, set +// things up so that next getchar will get that same character +// followed by the current 'next character', etc. +static void +ungetchar(TokenSource* ts, int c) +{ + int n; + Rune r; + char a[UTFmax]; + + n = 1; + switch(ts->chset) { + case UTF_8: + if(c >= 128) { + r = c; + n = runetochar(a, &r); + } + break; + case Unicode: + n = 2; + break; + } + ts->i -= n; +} + +// Restore ts so that it is at the state where the index was savei. +static void +backup(TokenSource* ts, int savei) +{ + if(dbglex) + fprint(2, "lex: backup; i=%d, savei=%d\n", ts->i, savei); + ts->i = savei; +} + + +// Look for value associated with attribute attid in token t. +// If there is one, return 1 and put the value in *pans, +// else return 0. +// If xfer is true, transfer ownership of the string to the caller +// (nil it out here); otherwise, caller must duplicate the answer +// if it needs to save it. +// OK to have pans==0, in which case this is just looking +// to see if token is present. +int +_tokaval(Token* t, int attid, Rune** pans, int xfer) +{ + Attr* attr; + + attr = t->attr; + while(attr != nil) { + if(attr->attid == attid) { + if(pans != nil) + *pans = attr->value; + if(xfer) + attr->value = nil; + return 1; + } + attr = attr->next; + } + if(pans != nil) + *pans = nil; + return 0; +} + +static int +Tconv(Fmt *f) +{ + Token* t; + int i; + int tag; + char* srbra; + Rune* aname; + Rune* tname; + Attr* a; + char buf[BIGBUFSIZE]; + + t = va_arg(f->args, Token*); + if(t == nil) + sprint(buf, "<null>"); + else { + i = 0; + if(dbglex > 1) + i = snprint(buf, sizeof(buf), "[%d]", t->starti); + tag = t->tag; + if(tag == Data) { + i += snprint(buf+i, sizeof(buf)-i-1, "'%S'", t->text); + } + else { + srbra = ""; + if(tag >= RBRA) { + tag -= RBRA; + srbra = "/"; + } + tname = tagnames[tag]; + if(tag == Notfound) + tname = L"?"; + i += snprint(buf+i, sizeof(buf)-i-1, "<%s%S", srbra, tname); + for(a = t->attr; a != nil; a = a->next) { + aname = attrnames[a->attid]; + i += snprint(buf+i, sizeof(buf)-i-1, " %S", aname); + if(a->value != nil) + i += snprint(buf+i, sizeof(buf)-i-1, "=%S", a->value); + } + i += snprint(buf+i, sizeof(buf)-i-1, ">"); + } + buf[i] = 0; + } + return fmtstrcpy(f, buf); +} + +// Attrs own their constituent strings, but build may eventually +// transfer some values to its items and nil them out in the Attr. +static Attr* +newattr(int attid, Rune* value, Attr* link) +{ + Attr* ans; + + ans = (Attr*)emalloc(sizeof(Attr)); + ans->attid = attid; + ans->value = value; + ans->next = link; + return ans; +} + +// Free list of Attrs linked through next field +static void +freeattrs(Attr* ahead) +{ + Attr* a; + Attr* nexta; + + a = ahead; + while(a != nil) { + nexta = a->next; + free(a->value); + free(a); + a = nexta; + } +} + +// Free array of Tokens. +// Allocated space might have room for more than n tokens, +// but only n of them are initialized. +// If caller has transferred ownership of constitutent strings +// or attributes, it must have nil'd out the pointers in the Tokens. +void +_freetokens(Token* tarray, int n) +{ + int i; + Token* t; + + if(tarray == nil) + return; + for(i = 0; i < n; i++) { + t = &tarray[i]; + free(t->text); + freeattrs(t->attr); + } + free(tarray); +} |