Import sources from 2011-03-30 iso image

author: Taru Karttunen <taruti@taruti.net> 2011-03-30 15:46:40 +0300
committer: Taru Karttunen <taruti@taruti.net> 2011-03-30 15:46:40 +0300
commit: e5888a1ffdae813d7575f5fb02275c6bb07e5199 (patch)
tree: d8d51eac403f07814b9e936eed0c9a79195e2450 /sys/src/libhtml/lex.c
1 files changed, 1502 insertions, 0 deletions
diff --git a/sys/src/libhtml/lex.c b/sys/src/libhtml/lex.c
new file mode 100755
index 000000000..ca8fc77d6
--- /dev/null
+++ b/sys/src/libhtml/lex.c
@@ -0,0 +1,1502 @@
+#include <u.h>
+#include <libc.h>
+#include <draw.h>
+#include <ctype.h>
+#include <html.h>
+#include "impl.h"
+
+// Cursor over one input buffer being lexed.
+// Created by newtokensource; i is advanced by getchar and moved
+// back by ungetchar and backup.
+typedef struct TokenSource TokenSource;
+struct TokenSource
+{
+	int			i;		// index of next byte to use
+	uchar*		data;		// all the data
+	int			edata;	// data[0:edata] is valid
+	int			chset;	// one of US_Ascii, etc.
+	int			mtype;	// TextHtml or TextPlain
+};
+
+// Sentinel character values.
+// NOTE(review): neither name is referenced anywhere else in this file;
+// getchar reports every "no more characters" condition as a bare -1.
+enum {
+	EOF = -2,
+	EOB = -1
+};
+
+// True if c may continue a tag or attribute name: a letter or digit
+// below 256, '-' or '.'.
+// c is evaluated several times, so the argument must have no side effects.
+#define ISNAMCHAR(c)	((c)<256 && (isalpha(c) || isdigit(c) || (c) == '-' || (c) == '.'))
+
+// Element counts of the on-stack scratch buffers used by the lexer routines.
+#define SMALLBUFSIZE 240
+#define BIGBUFSIZE 2000
+
+// HTML 4.0 tag names.
+// Keep sorted, and in correspondence with the tag enum
+// (originally said to be in iparse.h).
+// NOTE(review): the attrnames comment cites impl.h, which is the header
+// this file actually includes -- confirm where the tag enum lives.
+// lexinit turns this array into tagtable; Tconv indexes it by tag id.
+Rune* tagnames[] = {
+	L" ",
+	L"!",
+	L"a", 
+	L"abbr",
+	L"acronym",
+	L"address",
+	L"applet", 
+	L"area",
+	L"b",
+	L"base",
+	L"basefont",
+	L"bdo",
+	L"big",
+	L"blink",
+	L"blockquote",
+	L"body",
+	L"bq",
+	L"br",
+	L"button",
+	L"caption",
+	L"center",
+	L"cite",
+	L"code",
+	L"col",
+	L"colgroup",
+	L"dd",
+	L"del",
+	L"dfn",
+	L"dir",
+	L"div",
+	L"dl",
+	L"dt",
+	L"em",
+	L"fieldset",
+	L"font",
+	L"form",
+	L"frame",
+	L"frameset",
+	L"h1",
+	L"h2",
+	L"h3",
+	L"h4",
+	L"h5",
+	L"h6",
+	L"head",
+	L"hr",
+	L"html",
+	L"i",
+	L"iframe",
+	L"img",
+	L"input",
+	L"ins",
+	L"isindex",
+	L"kbd",
+	L"label",
+	L"legend",
+	L"li",
+	L"link",
+	L"map",
+	L"menu",
+	L"meta",
+	L"nobr",
+	L"noframes",
+	L"noscript",
+	L"object",
+	L"ol",
+	L"optgroup",
+	L"option",
+	L"p",
+	L"param",
+	L"pre",
+	L"q",
+	L"s",
+	L"samp",
+	L"script",
+	L"select",
+	L"small",
+	L"span",
+	L"strike",
+	L"strong",
+	L"style",
+	L"sub",
+	L"sup",
+	L"table",
+	L"tbody",
+	L"td",
+	L"textarea",
+	L"tfoot",
+	L"th",
+	L"thead",
+	L"title",
+	L"tr",
+	L"tt",
+	L"u",
+	L"ul",
+	L"var"
+};
+
+// HTML 4.0 attribute names.
+// Keep sorted, and in correspondence with enum in impl.h.
+// lexinit turns this array into attrtable; Tconv indexes it by attribute id.
+// The second-from-top entry is spelled "accesskey", as in the HTML 4
+// attribute index; the hyphenated spelling used previously could never
+// match a real document.  Its position (and so its id) is unchanged.
+Rune* attrnames[] = {
+	L"abbr",
+	L"accept-charset",
+	L"accesskey",
+	L"action",
+	L"align",
+	L"alink",
+	L"alt",
+	L"archive",
+	L"axis",
+	L"background",
+	L"bgcolor",
+	L"border",
+	L"cellpadding",
+	L"cellspacing",
+	L"char",
+	L"charoff",
+	L"charset",
+	L"checked",
+	L"cite",
+	L"class",
+	L"classid",
+	L"clear",
+	L"code",
+	L"codebase",
+	L"codetype",
+	L"color",
+	L"cols",
+	L"colspan",
+	L"compact",
+	L"content",
+	L"coords",
+	L"data",
+	L"datetime",
+	L"declare",
+	L"defer",
+	L"dir",
+	L"disabled",
+	L"enctype",
+	L"face",
+	L"for",
+	L"frame",
+	L"frameborder",
+	L"headers",
+	L"height",
+	L"href",
+	L"hreflang",
+	L"hspace",
+	L"http-equiv",
+	L"id",
+	L"ismap",
+	L"label",
+	L"lang",
+	L"link",
+	L"longdesc",
+	L"marginheight",
+	L"marginwidth",
+	L"maxlength",
+	L"media",
+	L"method",
+	L"multiple",
+	L"name",
+	L"nohref",
+	L"noresize",
+	L"noshade",
+	L"nowrap",
+	L"object",
+	L"onblur",
+	L"onchange",
+	L"onclick",
+	L"ondblclick",
+	L"onfocus",
+	L"onkeypress",
+	L"onkeyup",
+	L"onload",
+	L"onmousedown",
+	L"onmousemove",
+	L"onmouseout",
+	L"onmouseover",
+	L"onmouseup",
+	L"onreset",
+	L"onselect",
+	L"onsubmit",
+	L"onunload",
+	L"profile",
+	L"prompt",
+	L"readonly",
+	L"rel",
+	L"rev",
+	L"rows",
+	L"rowspan",
+	L"rules",
+	L"scheme",
+	L"scope",
+	L"scrolling",
+	L"selected",
+	L"shape",
+	L"size",
+	L"span",
+	L"src",
+	L"standby",
+	L"start",
+	L"style",
+	L"summary",
+	L"tabindex",
+	L"target",
+	L"text",
+	L"title",
+	L"type",
+	L"usemap",
+	L"valign",
+	L"value",
+	L"valuetype",
+	L"version",
+	L"vlink",
+	L"vspace",
+	L"width"
+};
+
+
+// Character entity to unicode character number map.
+// Each entry pairs an entity name (as written between '&' and ';')
+// with the code point that ampersand() substitutes for it.
+// Keep sorted by name (presumably _lookup relies on the order --
+// its implementation is not in this file).
+StringInt	chartab[]= {
+	{L"AElig", 198},
+	{L"Aacute", 193},
+	{L"Acirc", 194},
+	{L"Agrave", 192},
+	{L"Alpha", 913},
+	{L"Aring", 197},
+	{L"Atilde", 195},
+	{L"Auml", 196},
+	{L"Beta", 914},
+	{L"Ccedil", 199},
+	{L"Chi", 935},
+	{L"Dagger", 8225},
+	{L"Delta", 916},
+	{L"ETH", 208},
+	{L"Eacute", 201},
+	{L"Ecirc", 202},
+	{L"Egrave", 200},
+	{L"Epsilon", 917},
+	{L"Eta", 919},
+	{L"Euml", 203},
+	{L"Gamma", 915},
+	{L"Iacute", 205},
+	{L"Icirc", 206},
+	{L"Igrave", 204},
+	{L"Iota", 921},
+	{L"Iuml", 207},
+	{L"Kappa", 922},
+	{L"Lambda", 923},
+	{L"Mu", 924},
+	{L"Ntilde", 209},
+	{L"Nu", 925},
+	{L"OElig", 338},
+	{L"Oacute", 211},
+	{L"Ocirc", 212},
+	{L"Ograve", 210},
+	{L"Omega", 937},
+	{L"Omicron", 927},
+	{L"Oslash", 216},
+	{L"Otilde", 213},
+	{L"Ouml", 214},
+	{L"Phi", 934},
+	{L"Pi", 928},
+	{L"Prime", 8243},
+	{L"Psi", 936},
+	{L"Rho", 929},
+	{L"Scaron", 352},
+	{L"Sigma", 931},
+	{L"THORN", 222},
+	{L"Tau", 932},
+	{L"Theta", 920},
+	{L"Uacute", 218},
+	{L"Ucirc", 219},
+	{L"Ugrave", 217},
+	{L"Upsilon", 933},
+	{L"Uuml", 220},
+	{L"Xi", 926},
+	{L"Yacute", 221},
+	{L"Yuml", 376},
+	{L"Zeta", 918},
+	{L"aacute", 225},
+	{L"acirc", 226},
+	{L"acute", 180},
+	{L"aelig", 230},
+	{L"agrave", 224},
+	{L"alefsym", 8501},
+	{L"alpha", 945},
+	{L"amp", 38},
+	{L"and", 8743},
+	{L"ang", 8736},
+	{L"aring", 229},
+	{L"asymp", 8776},
+	{L"atilde", 227},
+	{L"auml", 228},
+	{L"bdquo", 8222},
+	{L"beta", 946},
+	{L"brvbar", 166},
+	{L"bull", 8226},
+	{L"cap", 8745},
+	{L"ccedil", 231},
+	{L"cdots", 8943},
+	{L"cedil", 184},
+	{L"cent", 162},
+	{L"chi", 967},
+	{L"circ", 710},
+	{L"clubs", 9827},
+	{L"cong", 8773},
+	{L"copy", 169},
+	{L"crarr", 8629},
+	{L"cup", 8746},
+	{L"curren", 164},
+	{L"dArr", 8659},
+	{L"dagger", 8224},
+	{L"darr", 8595},
+	{L"ddots", 8945},
+	{L"deg", 176},
+	{L"delta", 948},
+	{L"diams", 9830},
+	{L"divide", 247},
+	{L"eacute", 233},
+	{L"ecirc", 234},
+	{L"egrave", 232},
+	{L"emdash", 8212},	/* non-standard but commonly used */
+	{L"empty", 8709},
+	{L"emsp", 8195},
+	{L"endash", 8211},	/* non-standard but commonly used */
+	{L"ensp", 8194},
+	{L"epsilon", 949},
+	{L"equiv", 8801},
+	{L"eta", 951},
+	{L"eth", 240},
+	{L"euml", 235},
+	{L"euro", 8364},
+	{L"exist", 8707},
+	{L"fnof", 402},
+	{L"forall", 8704},
+	{L"frac12", 189},
+	{L"frac14", 188},
+	{L"frac34", 190},
+	{L"frasl", 8260},
+	{L"gamma", 947},
+	{L"ge", 8805},
+	{L"gt", 62},
+	{L"hArr", 8660},
+	{L"harr", 8596},
+	{L"hearts", 9829},
+	{L"hellip", 8230},
+	{L"iacute", 237},
+	{L"icirc", 238},
+	{L"iexcl", 161},
+	{L"igrave", 236},
+	{L"image", 8465},
+	{L"infin", 8734},
+	{L"int", 8747},
+	{L"iota", 953},
+	{L"iquest", 191},
+	{L"isin", 8712},
+	{L"iuml", 239},
+	{L"kappa", 954},
+	{L"lArr", 8656},
+	{L"lambda", 955},
+	{L"lang", 9001},
+	{L"laquo", 171},
+	{L"larr", 8592},
+	{L"lceil", 8968},
+	{L"ldots", 8230},
+	{L"ldquo", 8220},
+	{L"le", 8804},
+	{L"lfloor", 8970},
+	{L"lowast", 8727},
+	{L"loz", 9674},
+	{L"lrm", 8206},
+	{L"lsaquo", 8249},
+	{L"lsquo", 8216},
+	{L"lt", 60},
+	{L"macr", 175},
+	{L"mdash", 8212},
+	{L"micro", 181},
+	{L"middot", 183},
+	{L"minus", 8722},
+	{L"mu", 956},
+	{L"nabla", 8711},
+	{L"nbsp", 160},
+	{L"ndash", 8211},
+	{L"ne", 8800},
+	{L"ni", 8715},
+	{L"not", 172},
+	{L"notin", 8713},
+	{L"nsub", 8836},
+	{L"ntilde", 241},
+	{L"nu", 957},
+	{L"oacute", 243},
+	{L"ocirc", 244},
+	{L"oelig", 339},
+	{L"ograve", 242},
+	{L"oline", 8254},
+	{L"omega", 969},
+	{L"omicron", 959},
+	{L"oplus", 8853},
+	{L"or", 8744},
+	{L"ordf", 170},
+	{L"ordm", 186},
+	{L"oslash", 248},
+	{L"otilde", 245},
+	{L"otimes", 8855},
+	{L"ouml", 246},
+	{L"para", 182},
+	{L"part", 8706},
+	{L"permil", 8240},
+	{L"perp", 8869},
+	{L"phi", 966},
+	{L"pi", 960},
+	{L"piv", 982},
+	{L"plusmn", 177},
+	{L"pound", 163},
+	{L"prime", 8242},
+	{L"prod", 8719},
+	{L"prop", 8733},
+	{L"psi", 968},
+	{L"quad", 8193},
+	{L"quot", 34},
+	{L"rArr", 8658},
+	{L"radic", 8730},
+	{L"rang", 9002},
+	{L"raquo", 187},
+	{L"rarr", 8594},
+	{L"rceil", 8969},
+	{L"rdquo", 8221},
+	{L"real", 8476},
+	{L"reg", 174},
+	{L"rfloor", 8971},
+	{L"rho", 961},
+	{L"rlm", 8207},
+	{L"rsaquo", 8250},
+	{L"rsquo", 8217},
+	{L"sbquo", 8218},
+	{L"scaron", 353},
+	{L"sdot", 8901},
+	{L"sect", 167},
+	{L"shy", 173},
+	{L"sigma", 963},
+	{L"sigmaf", 962},
+	{L"sim", 8764},
+	{L"sp", 8194},
+	{L"spades", 9824},
+	{L"sub", 8834},
+	{L"sube", 8838},
+	{L"sum", 8721},
+	{L"sup", 8835},
+	{L"sup1", 185},
+	{L"sup2", 178},
+	{L"sup3", 179},
+	{L"supe", 8839},
+	{L"szlig", 223},
+	{L"tau", 964},
+	{L"there4", 8756},
+	{L"theta", 952},
+	{L"thetasym", 977},
+	{L"thinsp", 8201},
+	{L"thorn", 254},
+	{L"tilde", 732},
+	{L"times", 215},
+	{L"trade", 8482},
+	{L"uArr", 8657},
+	{L"uacute", 250},
+	{L"uarr", 8593},
+	{L"ucirc", 251},
+	{L"ugrave", 249},
+	{L"uml", 168},
+	{L"upsih", 978},
+	{L"upsilon", 965},
+	{L"uuml", 252},
+	{L"varepsilon", 8712},
+	{L"varphi", 981},
+	{L"varpi", 982},
+	{L"varrho", 1009},
+	{L"vdots", 8942},
+	{L"vsigma", 962},
+	{L"vtheta", 977},
+	{L"weierp", 8472},
+	{L"xi", 958},
+	{L"yacute", 253},
+	{L"yen", 165},
+	{L"yuml", 255},
+	{L"zeta", 950},
+	{L"zwj", 8205},
+	{L"zwnj", 8204}
+};
+// Number of entries in chartab; passed to _lookup by ampersand().
+#define NCHARTAB (sizeof(chartab)/sizeof(chartab[0]))
+
+// Characters Winstart..Winend are those that Windows
+// uses interpolated into the Latin1 set.
+// They aren't supposed to appear in HTML, but they do....
+// winchars[c - Winstart] is the code point substituted for such a c;
+// the table therefore has Winend-Winstart+1 (33) entries.
+// It is applied by getchar for ISO_8859_1 input and by ampersand
+// for numeric character references.
+enum {
+	Winstart = 127,
+	Winend = 159
+};
+
+static int	winchars[]= { 8226,	// 8226 is a bullet
+	8226, 8226, 8218, 402, 8222, 8230, 8224, 8225,
+	710, 8240, 352, 8249, 338, 8226, 8226, 8226,
+	8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212,
+	732, 8482, 353, 8250, 339, 8226, 8226, 376};
+
+// Lookup tables built once by lexinit.
+static StringInt*	tagtable;		// initialized from tagnames
+static StringInt*	attrtable;		// initialized from attrnames
+
+// Forward declarations for the file-local lexer routines.
+static void	lexinit(void);
+static int		getplaindata(TokenSource* ts, Token* a, int* pai);
+static int		getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
+static int		getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai, int findtag);
+static int		gettag(TokenSource* ts, int starti, Token* a, int* pai);
+static Rune*	buftostr(Rune* s, Rune* buf, int j);
+static int		comment(TokenSource* ts);
+static int		findstr(TokenSource* ts, Rune* s);
+static int		ampersand(TokenSource* ts);
+// NOTE(review): lowerc is declared here but neither defined nor called in this file.
+static int		lowerc(int c);
+static int		getchar(TokenSource* ts);
+static void		ungetchar(TokenSource* ts, int c);
+static void		backup(TokenSource* ts, int savei);
+// NOTE(review): freeinsidetoken is declared here but neither defined nor called in this file.
+static void		freeinsidetoken(Token* t);
+static void		freeattrs(Attr* ahead);
+static Attr*	newattr(int attid, Rune* value, Attr* link);
+static int		Tconv(Fmt* f);
+
+// Debug level: nonzero prints lexer progress on fd 2,
+// values above 1 also print every token produced.
+int	dbglex = 0;
+// Set to 1 by lexinit so that _gettoks runs the setup only once.
+static int lexinited = 0;
+
+// One-time setup, run by _gettoks on first use:
+// installs the %T token formatter and builds the tag and
+// attribute lookup tables from the name arrays.
+static void
+lexinit(void)
+{
+	fmtinstall('T', Tconv);
+	attrtable = _makestrinttab(attrnames, Numattrs);
+	tagtable = _makestrinttab(tagnames, Numtags);
+	lexinited = 1;
+}
+
+// Allocate a TokenSource positioned at the start of data[0:edata].
+// chset must be one of the four character sets getchar understands
+// (asserted); mtype is stored as given.
+// The result comes from emalloc; _gettoks releases it with free.
+static TokenSource*
+newtokensource(uchar* data, int edata, int chset, int mtype)
+{
+	TokenSource*	ts;
+
+	assert(chset == US_Ascii || chset == ISO_8859_1 ||
+			chset == UTF_8 || chset == Unicode);
+	ts = (TokenSource*)emalloc(sizeof *ts);
+	ts->data = data;
+	ts->edata = edata;
+	ts->i = 0;
+	ts->mtype = mtype;
+	ts->chset = chset;
+	return ts;
+}
+
+enum {
+	ToksChunk = 500,	// the token array grows by this many slots at a time
+};
+
+// Call this to get the tokens.
+//  data[0:datalen] is the input, in character set chset;
+//  mtype selects HTML lexing (TextHtml) or one-token-per-line
+//  plain text lexing (anything else).
+//  The number of returned tokens is returned in *plen.
+//  Returns the token array (release with _freetokens), or 0 when
+//  no tokens were produced.
+Token*
+_gettoks(uchar* data, int datalen, int chset, int mtype, int* plen)
+{
+	TokenSource*	ts;
+	Token*		a;
+	int	alen;
+	int	ai;
+	int	previ;
+	int	starti;
+	int	c;
+	int	tag;
+
+	if(!lexinited)
+		lexinit();
+	ts = newtokensource(data, datalen, chset, mtype);
+	if(dbglex)
+		fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata);
+	alen = 0;
+	ai = 0;
+	a = 0;
+	if(ts->mtype == TextHtml) {
+		for(;;) {
+			// keep some free slots: one pass can add more than one token,
+			// and gettag uses a[ai] as scratch even when it adds none
+			if(alen - ai < ToksChunk/32) {
+				alen += ToksChunk;
+				a = erealloc(a, alen*sizeof *a);
+			}
+			previ = ai;
+			starti = ts->i;
+			c = getchar(ts);
+			if(c < 0)
+				break;
+			if(c == '<') {
+				tag = gettag(ts, starti, a, &ai);
+				if(tag == Tscript || tag == Tstyle) {
+					// special rules for getting Data after....
+					starti = ts->i;
+					c = getchar(ts);
+					tag = getscriptdata(ts, c, starti, a, &ai, tag);
+				}
+			}
+			else {
+				tag = getdata(ts, c, starti, a, &ai);
+				// -1 with input left over only means that nothing but
+				// ignored characters preceded the next '<'; the
+				// characters are consumed, so just carry on lexing
+				// instead of dropping the rest of the document
+				if(tag == -1 && ts->i < ts->edata)
+					continue;
+			}
+			if(tag == -1)
+				break;
+			// only print when a token was really appended
+			else if(dbglex > 1 && tag != Comment && ai > previ)
+				fprint(2, "lex: got token %T\n", &a[ai-1]);
+		}
+	}
+	else {
+		// plain text (non-html) tokens
+		for(;;) {
+			if(alen - ai < ToksChunk/32) {
+				alen += ToksChunk;
+				a = erealloc(a, alen*sizeof *a);
+			}
+			tag = getplaindata(ts, a, &ai);
+			if(tag == -1)
+				break;
+			// getplaindata has already bumped ai past the new token
+			if(dbglex > 1)
+				fprint(2, "lex: got token %T\n", &a[ai-1]);
+		}
+	}
+	free(ts);
+	if(dbglex)
+		fprint(2, "lex: returning %d tokens\n", ai);
+	*plen = ai;
+	if(ai == 0){
+		free(a);
+		a = 0;
+	}
+	return a;
+}
+
+// Lexer for non-HTML input.
+// Produces one Data token per line (or per partial line at the end
+// of the buffer).  Control characters that are not whitespace are
+// dropped; "\r\n" and a lone '\r' both become '\n'.
+// If any text was gathered, fills in a[*pai], bumps *pai and
+// returns Data; otherwise returns -1.
+static int
+getplaindata(TokenSource* ts, Token* a, int* pai)
+{
+	Rune	buf[BIGBUFSIZE];
+	Rune*	text;
+	Token*	tok;
+	int	n;
+	int	first;
+	int	c;
+	int	nextc;
+
+	text = nil;
+	n = 0;
+	first = ts->i;
+	while((c = getchar(ts)) >= 0) {
+		if(c == '\r') {
+			// swallow a following '\n'; anything else is put back
+			nextc = getchar(ts);
+			if(nextc != '\n' && nextc >= 0)
+				ungetchar(ts, nextc);
+			c = '\n';
+		}
+		else if(c < ' ' && !isspace(c))
+			continue;	// ignored control character
+		buf[n++] = c;
+		// flush to the heap string before the scratch buffer fills up
+		if(n == nelem(buf)-1) {
+			text = buftostr(text, buf, n);
+			n = 0;
+		}
+		if(c == '\n')
+			break;
+	}
+	text = buftostr(text, buf, n);
+	if(text == nil)
+		return -1;
+	tok = &a[*pai];
+	(*pai)++;
+	tok->starti = first;
+	tok->attr = nil;
+	tok->text = text;
+	tok->tag = Data;
+	return Data;
+}
+
+// Return concatenation of s and buf[0:j].
+// s is either nil or a heap string owned by the caller; ownership
+// passes to the returned string, which may have moved.
+// With s == nil the result is whatever _Strndup(buf, j) yields
+// (callers treat a nil result as "no text"; presumably that is what
+// _Strndup returns for j == 0 -- its source is not in this file).
+// Growth goes through erealloc, like the token array in _gettoks,
+// so an allocation failure is not silently followed by a copy
+// through a nil pointer.
+static Rune*
+buftostr(Rune* s, Rune* buf, int j)
+{
+	int i;
+
+	if(s == nil)
+		return _Strndup(buf, j);
+	i = _Strlen(s);
+	s = erealloc(s, (i+j+1)*sizeof *s);
+	memcpy(&s[i], buf, j*sizeof *s);
+	s[i+j] = 0;
+	return s;
+}
+
+// Collect character data, starting with firstc, up to the next
+// start-of-tag or the end of the buffer.
+// Entity references (&amp; etc.) are translated through ampersand.
+// Control characters that are not whitespace are dropped (with a
+// warning if warn is set); "\r\n" and a lone '\r' both become '\n'.
+// A terminating '<' is pushed back for the caller.
+// If any text was gathered, fills in a[*pai], bumps *pai and
+// returns Data; otherwise returns -1.
+static int
+getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
+{
+	Rune	buf[SMALLBUFSIZE];
+	Rune*	text;
+	Token*	tok;
+	int	n;
+	int	c;
+	int	nextc;
+
+	text = nil;
+	n = 0;
+	for(c = firstc; c >= 0; c = getchar(ts)){
+		if(c == '<') {
+			ungetchar(ts, c);
+			break;
+		}
+		if(c == '&') {
+			// the translated character is taken literally, even if it is '<'
+			c = ampersand(ts);
+			if(c < 0)
+				break;
+		}
+		else if(c == '\r') {
+			// swallow a following '\n'; anything else is put back
+			nextc = getchar(ts);
+			if(nextc != '\n' && nextc >= 0)
+				ungetchar(ts, nextc);
+			c = '\n';
+		}
+		else if(c < ' ' && !isspace(c)) {
+			if(warn)
+				fprint(2, "warning: non-whitespace control character %d ignored\n", c);
+			c = 0;
+		}
+		if(c == 0)
+			continue;
+		buf[n++] = c;
+		// flush to the heap string before the scratch buffer fills up
+		if(n == nelem(buf)-1) {
+			text = buftostr(text, buf, n);
+			n = 0;
+		}
+	}
+	text = buftostr(text, buf, n);
+	if(text == nil)
+		return -1;
+	tok = &a[*pai];
+	(*pai)++;
+	tok->starti = starti;
+	tok->attr = nil;
+	tok->text = text;
+	tok->tag = Data;
+	return Data;
+}
+
+// The rules for lexing scripts are different (ugh).
+// Gather up everything until see an "</" tagnames[findtag] ">",
+// starting with firstc, which was read at index starti.
+// On success the text is stored as a Data token in a[*pai], *pai is
+// bumped, the input is left positioned at the '<' of the end tag (or
+// at the end of the data) and Data is returned.
+// Otherwise the input is backed up to starti and -1 is returned.
+//
+// A "<!...>" section inside the script is dropped together with one
+// line terminator following it; nothing of it reaches the token text.
+// Any other '<' is handed to gettag only as a probe: whatever token
+// gettag builds is released again and *pai is restored, whether or
+// not gettag advanced it.
+static int
+getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai, int findtag)
+{
+	Rune*	s;
+	int	j;
+	int	tstarti;
+	int	savei;
+	int	savepai;
+	int	c;
+	int	tag;
+	int	done;
+	Token*	tok;
+	Rune	buf[BIGBUFSIZE];
+
+	s = nil;
+	j = 0;
+	tstarti = starti;	// index at which the current c was read
+	c = firstc;
+	done = 0;
+	while(c >= 0) {
+		if(c == '<') {
+			// other browsers ignore stuff to end of line after <!
+			savei = ts->i;
+			c = getchar(ts);
+			if(c == '!') {
+				if(comment(ts) == -1)
+					break;
+				// fetch the character after the comment, skipping one
+				// "\r", "\n" or "\r\n", and re-examine it from the top
+				// of the loop (it may itself be a '<')
+				tstarti = ts->i;
+				c = getchar(ts);
+				if(c == '\r') {
+					tstarti = ts->i;
+					c = getchar(ts);
+				}
+				if(c == '\n') {
+					tstarti = ts->i;
+					c = getchar(ts);
+				}
+				continue;
+			}
+			else if(c >= 0) {
+				backup(ts, savei);
+				savepai = *pai;
+				tag = gettag(ts, tstarti, a, pai);
+				// gettag initializes a[savepai] on entry, so its text
+				// and attr are either nil or owned by that scratch token
+				tok = &a[savepai];
+				free(tok->text);
+				freeattrs(tok->attr);
+				tok->text = nil;
+				tok->attr = nil;
+				*pai = savepai;
+				if(tag == -1)
+					break;
+				backup(ts, tstarti);
+				if(tag == findtag + RBRA) {
+					done = 1;
+					break;
+				}
+				// here tag was not the one we were looking for, so take as regular data
+				c = getchar(ts);
+			}
+		}
+		if(c < 0)
+			break;
+		if(c != 0) {
+			buf[j++] = c;
+			if(j == nelem(buf)-1) {
+				s = buftostr(s, buf, j);
+				j = 0;
+			}
+		}
+		tstarti = ts->i;
+		c = getchar(ts);
+	}
+	if(done || ts->i == ts->edata) {
+		s = buftostr(s, buf, j);
+		tok = &a[(*pai)++];
+		tok->tag = Data;
+		tok->text = s;
+		tok->attr = nil;
+		tok->starti = starti;
+		return Data;
+	}
+	free(s);
+	backup(ts, starti);
+	return -1;
+}
+
+// We've just seen a '<' (read at index starti).  Gather up stuff to
+// the closing '>'.
+// If it's a tag, look up the name, gather the attributes, and return
+// the tag id (plus RBRA for an end tag); a name that is not in the
+// tag table yields Notfound, with the name kept in the token text.
+// Else it's either just plain data or some kind of ignorable stuff:
+// return Data or Comment as appropriate.
+// If it's not a Comment, put it in a[*pai] and bump *pai.
+// If the buffer ends before the tag is complete, the input is backed
+// up to just after the '<', everything gathered so far is released,
+// and the '<' alone is returned as a Data token (also bumping *pai).
+// a[*pai] is always initialized on entry, even when Comment is returned.
+static int
+gettag(TokenSource* ts, int starti, Token* a, int* pai)
+{
+	int	rbra;
+	int	ans;
+	Attr*	al;
+	int	nexti;
+	int	c;
+	int	ti;
+	int	afnd;
+	int	attid;
+	int	quote;
+	Rune*	val;
+	int	nv;
+	int	i;
+	int	tag;
+	Token*	tok;
+	Rune	buf[BIGBUFSIZE];
+
+	rbra = 0;
+	// al and val are owned here until handed over to the token/attribute;
+	// keeping them nil otherwise lets eob_done free them unconditionally
+	al = nil;
+	val = nil;
+	nexti = ts->i;
+	tok = &a[*pai];
+	tok->tag = Notfound;
+	tok->text = nil;
+	tok->attr = nil;
+	tok->starti = starti;
+	c = getchar(ts);
+	if(c == '/') {
+		rbra = RBRA;
+		c = getchar(ts);
+	}
+	if(c < 0)
+		goto eob_done;
+	if(c >= 256 || !isalpha(c)) {
+		// not a tag
+		if(c == '!') {
+			ans = comment(ts);
+			if(ans != -1)
+				return ans;
+			goto eob_done;
+		}
+		else {
+			backup(ts, nexti);
+			tok->tag = Data;
+			tok->text = _Strdup(L"<");
+			(*pai)++;
+			return Data;
+		}
+	}
+	// c starts a tagname
+	buf[0] = c;
+	i = 1;
+	while(1) {
+		c = getchar(ts);
+		if(c < 0)
+			goto eob_done;
+		if(!ISNAMCHAR(c))
+			break;
+		// if name is bigger than buf it won't be found anyway...
+		if(i < BIGBUFSIZE)
+			buf[i++] = c;
+	}
+	if(_lookup(tagtable, Numtags, buf, i, &tag))
+		tok->tag = tag + rbra;
+	else
+		tok->text = _Strndup(buf, i);	// for warning print, in build
+	// attribute gathering loop
+	while(1) {
+		// look for "ws name" or "ws name ws = ws val"  (ws=whitespace)
+		// skip whitespace
+attrloop_continue:
+		while(c < 256 && isspace(c)) {
+			c = getchar(ts);
+			if(c < 0)
+				goto eob_done;
+		}
+		if(c == '>')
+			goto attrloop_done;
+		if(c == '<') {
+			if(warn)
+				fprint(2, "warning: unclosed tag\n");
+			ungetchar(ts, c);
+			goto attrloop_done;
+		}
+		if(c >= 256 || !isalpha(c)) {
+			if(warn)
+				fprint(2, "warning: expected attribute name\n");
+			// skip to next attribute name
+			while(1) {
+				c = getchar(ts);
+				if(c < 0)
+					goto eob_done;
+				if(c < 256 && isalpha(c))
+					goto attrloop_continue;
+				if(c == '<') {
+					if(warn)
+						fprint(2, "warning: unclosed tag\n");
+					ungetchar(ts, c);
+					goto attrloop_done;
+				}
+				if(c == '>')
+					goto attrloop_done;
+			}
+		}
+		// gather attribute name
+		buf[0] = c;
+		i = 1;
+		while(1) {
+			c = getchar(ts);
+			if(c < 0)
+				goto eob_done;
+			if(!ISNAMCHAR(c))
+				break;
+			// leave room for the terminator written before the warning print
+			if(i < BIGBUFSIZE-1)
+				buf[i++] = c;
+		}
+		afnd = _lookup(attrtable, Numattrs, buf, i, &attid);
+		if(warn && !afnd) {
+			buf[i] = 0;
+			fprint(2, "warning: unknown attribute name %S\n", buf);
+		}
+		// skip whitespace
+		while(c < 256 && isspace(c)) {
+			c = getchar(ts);
+			if(c < 0)
+				goto eob_done;
+		}
+		if(c != '=') {
+			if(afnd)
+				al = newattr(attid, nil, al);
+			goto attrloop_continue;
+		}
+		//# c is '=' here;  skip whitespace
+		while(1) {
+			c = getchar(ts);
+			if(c < 0)
+				goto eob_done;
+			if(c >= 256 || !isspace(c))
+				break;
+		}
+		quote = 0;
+		if(c == '\'' || c == '"') {
+			quote = c;
+			c = getchar(ts);
+			if(c < 0)
+				goto eob_done;
+		}
+		val = nil;
+		nv = 0;
+		while(1) {
+valloop_continue:
+			if(c < 0)
+				goto eob_done;
+			if(c == '>') {
+				if(quote) {
+					// c might be part of string (though not good style)
+					// but if line ends before close quote, assume
+					// there was an unmatched quote
+					ti = ts->i;
+					while(1) {
+						c = getchar(ts);
+						if(c < 0)
+							goto eob_done;
+						if(c == quote) {
+							backup(ts, ti);
+							buf[nv++] = '>';
+							if(nv == BIGBUFSIZE-1) {
+								val = buftostr(val, buf, nv);
+								nv = 0;
+							}
+							c = getchar(ts);
+							goto valloop_continue;
+						}
+						if(c == '\n') {
+							if(warn)
+								fprint(2, "warning: apparent unmatched quote\n");
+							backup(ts, ti);
+							c = '>';
+							goto valloop_done;
+						}
+					}
+				}
+				else
+					goto valloop_done;
+			}
+			if(quote) {
+				if(c == quote) {
+					c = getchar(ts);
+					if(c < 0)
+						goto eob_done;
+					goto valloop_done;
+				}
+				if(c == '\r') {
+					c = getchar(ts);
+					goto valloop_continue;
+				}
+				if(c == '\t' || c == '\n')
+					c = ' ';
+			}
+			else {
+				if(c < 256 && isspace(c))
+					goto valloop_done;
+			}
+			if(c == '&') {
+				c = ampersand(ts);
+				if(c < 0)
+					goto eob_done;
+			}
+			buf[nv++] = c;
+			if(nv == BIGBUFSIZE-1) {
+				val = buftostr(val, buf, nv);
+				nv = 0;
+			}
+			c = getchar(ts);
+		}
+valloop_done:
+		if(afnd) {
+			val = buftostr(val, buf, nv);
+			al = newattr(attid, val, al);
+		}
+		else
+			free(val);	// value of an unknown attribute: drop any flushed part
+		val = nil;		// now owned by the attribute, or freed
+	}
+
+attrloop_done:
+	tok->attr = al;
+	(*pai)++;
+	return tok->tag;
+
+eob_done:
+	if(warn)
+		fprint(2, "warning: incomplete tag at end of page\n");
+	// release everything gathered for the incomplete tag
+	freeattrs(al);
+	free(val);
+	free(tok->text);
+	backup(ts, nexti);
+	tok->tag = Data;
+	tok->text = _Strdup(L"<");
+	tok->attr = nil;
+	// the '<' is a real token: count it, so that it is neither lost nor leaked
+	(*pai)++;
+	return Data;
+}
+
+// Called just after a "<!" has been read.
+// This may be a comment or some other ignorable section, or, if it
+// is never closed, just literal text (other browsers do that).
+// Accepted practice (note: contrary to the SGML spec!):
+//   after "<!--", look for "-->" to close, or failing that, '>';
+//   after "<!" followed by anything else, look for '>' to close.
+//
+// Returns Comment, with the input positioned after the closing
+// string, if an ignorable section was found.
+// Returns -1 if no close was found; the caller then treats the '<'
+// as literal data.
+static int
+comment(TokenSource* ts)
+{
+	int	mark;
+	int	c;
+
+	mark = ts->i;
+	c = getchar(ts);
+	if(c == '-') {
+		c = getchar(ts);
+		if(c == '-') {
+			if(findstr(ts, L"-->"))
+				return Comment;
+			// no "-->": rescan from just after the "<!" for a plain '>'
+			backup(ts, mark);
+		}
+	}
+	if(c == '>')
+		return Comment;
+	if(c >= 0 && findstr(ts, L">"))
+		return Comment;
+	return -1;
+}
+
+// Scan forward in the token source for the string s.
+// Returns 1 with the input positioned just after s if it is found,
+// 0 if the data runs out first (the caller should back up).
+static int
+findstr(TokenSource* ts, Rune* s)
+{
+	int	first;
+	int	len;
+	int	resume;
+	int	k;
+	int	c;
+
+	first = s[0];
+	len = runestrlen(s);
+	for(;;) {
+		c = getchar(ts);
+		if(c < 0)
+			return 0;
+		if(c != first)
+			continue;
+		if(len == 1)
+			return 1;
+		// candidate match: remember where to resume if it falls through
+		resume = ts->i;
+		k = 1;
+		while(k < len) {
+			c = getchar(ts);
+			if(c < 0)
+				return 0;
+			if(c != s[k])
+				break;
+			k++;
+		}
+		if(k == len)
+			return 1;
+		backup(ts, resume);
+	}
+}
+
+// We've just read an '&'; look for a character or entity reference
+// and, if one is found, return the translated character.
+// Numeric references are "#" decimal-digits or "#x"/"#X" hex-digits;
+// named references are looked up in chartab.  A ';', '\n' or '\r'
+// ending the reference is consumed, any other character is put back.
+// Numeric values in Winstart..Winend are mapped through winchars;
+// values beyond the Unicode range become Runeerror (this also keeps
+// the accumulator from overflowing on absurdly long digit strings).
+// If there is no usable reference here (unknown name, no digits,
+// a name followed by '=', or a numeric reference cut off by the end
+// of the data), back up to just past the '&' and return '&'.
+// The result is therefore never negative.
+static int
+ampersand(TokenSource* ts)
+{
+	enum { Maxcode = 0x10FFFF };
+	int	savei;
+	int	c;
+	int	fnd;
+	int	ans;
+	int	v;
+	int	d;
+	int	ndig;
+	int	hex;
+	int	k;
+	Rune	buf[25];
+
+	savei = ts->i;
+	c = getchar(ts);
+	fnd = 0;
+	ans = -1;
+	if(c == '#') {
+		c = getchar(ts);
+		v = 0;
+		ndig = 0;
+		hex = (c == 'X' || c == 'x');
+		if(hex)
+			c = getchar(ts);
+		for(; c >= 0; c = getchar(ts)) {
+			if(c >= '0' && c <= '9')
+				d = c - '0';
+			else if(hex && c >= 'A' && c <= 'F')
+				d = c - 'A' + 10;
+			else if(hex && c >= 'a' && c <= 'f')
+				d = c - 'a' + 10;
+			else
+				break;
+			// once out of range, stay there instead of overflowing
+			if(v <= Maxcode)
+				v = v*(hex ? 16 : 10) + d;
+			ndig++;
+		}
+		if(c >= 0 && ndig > 0) {
+			if(!(c == ';' || c == '\n' || c == '\r'))
+				ungetchar(ts, c);
+			if(v > Maxcode)
+				v = Runeerror;
+			if(v >= Winstart && v <= Winend)
+				v = winchars[v - Winstart];
+			ans = v;
+			fnd = 1;
+		}
+	}
+	else if(c < 256 && isalpha(c)) {
+		buf[0] = c;
+		k = 1;
+		while(1) {
+			c = getchar(ts);
+			if(c < 0)
+				break;
+			if(c < 256 && (isalpha(c) || isdigit(c))) {
+				// over-long names are truncated; they won't be found anyway
+				if(k < nelem(buf)-1)
+					buf[k++] = c;
+			}
+			else {
+				if(!(c == ';' || c == '\n' || c == '\r'))
+					ungetchar(ts, c);
+				break;
+			}
+		}
+		// c can no longer be a letter or digit here.  A name directly
+		// followed by '=' (as in "a=1&copy=2") is not taken as an entity.
+		if(c != '=')
+			fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
+	}
+	if(!fnd) {
+		backup(ts, savei);
+		ans = '&';
+	}
+	return ans;
+}
+
+// Get next char, obeying ts.chset, and advance past it.
+// Returns -1 if no complete character is left before the end of the
+// data; for UTF_8 and Unicode a trailing partial character is skipped
+// so that the source then reads as exhausted.
+//  ISO_8859_1: bytes Winstart..Winend are mapped through winchars.
+//  US_Ascii: bytes above 127 are passed through (warned about if warn is set).
+//  UTF_8: decoded with chartorune; an undecodable byte comes back as
+//   whatever chartorune reports for it and is warned about if warn is set.
+//  Unicode: two bytes per character, most significant first.
+static int
+getchar(TokenSource* ts)
+{
+	uchar*	buf;
+	int	c;
+	int	n;
+	Rune	r;
+
+	if(ts->i >= ts->edata)
+		return -1;
+	buf = ts->data;
+	c = buf[ts->i];
+	switch(ts->chset) {
+	case ISO_8859_1:
+		if(c >= Winstart && c <= Winend)
+			c = winchars[c - Winstart];
+		ts->i++;
+		break;
+	case US_Ascii:
+		if(c > 127) {
+			if(warn)
+				fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c);
+		}
+		ts->i++;
+		break;
+	case UTF_8:
+		// only decode when the whole sequence is inside the data;
+		// chartorune would otherwise read beyond data[edata]
+		if(fullrune((char*)(buf+ts->i), ts->edata-ts->i)) {
+			n = chartorune(&r, (char*)(buf+ts->i));
+			// a one-byte Runeerror is a decoding failure; a genuine
+			// encoded Runeerror occupies more than one byte
+			if(warn && r == Runeerror && n == 1)
+				fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]);
+			ts->i += n;
+			c = r;
+		}
+		else {
+			// not enough bytes in buf to complete utf-8 char
+			ts->i = ts->edata;	// mark "all used"
+			c = -1;
+		}
+		break;
+	case Unicode:
+		if(ts->i < ts->edata - 1) {
+			//standards say most-significant byte first
+			c = (c << 8)|(buf[ts->i + 1]);
+			ts->i += 2;
+		}
+		else {
+			ts->i = ts->edata;	// mark "all used"
+			c = -1;
+		}
+		break;
+	default:
+		return -1;
+	}
+	return c;
+}
+
+// Assuming c was the last character returned by getchar, set
+// things up so that next getchar will get that same character
+// followed by the current 'next character', etc.
+// The step back is one byte, two bytes for Unicode, and for UTF_8
+// the length of c's encoding -- but only if those bytes really
+// precede the current position.  If they do not, c was produced from
+// a single undecodable byte, and stepping back by the encoded length
+// would rewind too far (possibly making the lexer re-read the same
+// input forever), so only one byte is given back.
+static void
+ungetchar(TokenSource* ts, int c)
+{
+	int	n;
+	Rune	r;
+	char	a[UTFmax];
+
+	n = 1;
+	switch(ts->chset) {
+	case UTF_8:
+		if(c >= 128) {
+			r = c;
+			n = runetochar(a, &r);
+			if(n > ts->i || memcmp(ts->data + ts->i - n, a, n) != 0)
+				n = 1;
+		}
+		break;
+	case Unicode:
+		n = 2;
+		break;
+	}
+	ts->i -= n;
+}
+
+// Move the read position of ts back (or forward) to byte index savei,
+// normally a value of ts->i remembered earlier.
+// Logs the move on fd 2 when dbglex is set.
+static void
+backup(TokenSource* ts, int savei)
+{
+	if(dbglex != 0) {
+		fprint(2, "lex: backup; i=%d, savei=%d\n", ts->i, savei);
+	}
+	ts->i = savei;
+}
+
+
+// Search token t for an attribute with id attid.
+// Returns 1 if it is present, 0 if not.
+// When pans is non-nil, *pans receives the attribute's value
+// (nil if the attribute is absent).
+// With xfer set, the string becomes the caller's property and is
+// nil'd out in the attribute; without it, the caller must copy the
+// answer if it wants to keep it.
+// pans may be 0 when only presence is of interest.
+int
+_tokaval(Token* t, int attid, Rune** pans, int xfer)
+{
+	Attr*	p;
+
+	for(p = t->attr; p != nil; p = p->next) {
+		if(p->attid != attid)
+			continue;
+		if(pans != nil)
+			*pans = p->value;
+		if(xfer)
+			p->value = nil;
+		return 1;
+	}
+	if(pans != nil)
+		*pans = nil;
+	return 0;
+}
+
+// Formatter for the %T verb (installed by lexinit); the argument is
+// a Token*.
+// Data tokens print as 'text'; tags print as <name attr=value ...>,
+// with a leading '/' for end tags and "?" for unknown tag names.
+// When dbglex > 1 the token's start index is prefixed as [n].
+// The text is assembled in a fixed-size local buffer.
+static int
+Tconv(Fmt *f)
+{
+	Token*	t;
+	Attr*	a;
+	Rune*	tname;
+	char*	slash;
+	int	n;
+	int	tag;
+	char	buf[BIGBUFSIZE];
+
+	t = va_arg(f->args, Token*);
+	if(t == nil) {
+		sprint(buf, "<null>");
+		return fmtstrcpy(f, buf);
+	}
+	n = 0;
+	if(dbglex > 1)
+		n = snprint(buf, sizeof(buf), "[%d]", t->starti);
+	tag = t->tag;
+	if(tag == Data)
+		n += snprint(buf+n, sizeof(buf)-n-1, "'%S'", t->text);
+	else {
+		slash = "";
+		if(tag >= RBRA) {
+			slash = "/";
+			tag -= RBRA;
+		}
+		tname = tagnames[tag];
+		if(tag == Notfound)
+			tname = L"?";
+		n += snprint(buf+n, sizeof(buf)-n-1, "<%s%S", slash, tname);
+		a = t->attr;
+		while(a != nil) {
+			n += snprint(buf+n, sizeof(buf)-n-1, " %S", attrnames[a->attid]);
+			if(a->value != nil)
+				n += snprint(buf+n, sizeof(buf)-n-1, "=%S", a->value);
+			a = a->next;
+		}
+		n += snprint(buf+n, sizeof(buf)-n-1, ">");
+	}
+	buf[n] = 0;
+	return fmtstrcpy(f, buf);
+}
+
+// Allocate an attribute node for attid holding value and put it in
+// front of the list link; returns the new list head.
+// Attrs own their value strings, but build may later take some of
+// them over and nil them out in the Attr.
+static Attr*
+newattr(int attid, Rune* value, Attr* link)
+{
+	Attr* node;
+
+	node = (Attr*)emalloc(sizeof *node);
+	node->next = link;
+	node->value = value;
+	node->attid = attid;
+	return node;
+}
+
+// Release every Attr on the list starting at ahead (linked through
+// the next field), together with the value string each one holds.
+static void
+freeattrs(Attr* ahead)
+{
+	Attr* cur;
+	Attr* following;
+
+	for(cur = ahead; cur != nil; cur = following) {
+		// read the link before the node goes away
+		following = cur->next;
+		free(cur->value);
+		free(cur);
+	}
+}
+
+// Release an array of Tokens and everything its first n entries own.
+// The allocation may have room for more than n tokens, but only
+// the first n are initialized and looked at.
+// A caller that has taken over a token's text or attributes must
+// have nil'd out the corresponding pointers beforehand.
+// A nil tarray is ignored.
+void
+_freetokens(Token* tarray, int n)
+{
+	Token* t;
+
+	if(tarray == nil)
+		return;
+	for(t = tarray; n > 0; n--, t++) {
+		free(t->text);
+		freeattrs(t->attr);
+	}
+	free(tarray);
+}
author	Taru Karttunen <taruti@taruti.net>	2011-03-30 15:46:40 +0300
committer	Taru Karttunen <taruti@taruti.net>	2011-03-30 15:46:40 +0300
commit	e5888a1ffdae813d7575f5fb02275c6bb07e5199 (patch)
tree	d8d51eac403f07814b9e936eed0c9a79195e2450 /sys/src/libhtml/lex.c