Import sources from 2011-03-30 iso image

author: Taru Karttunen <taruti@taruti.net> 2011-03-30 15:46:40 +0300
committer: Taru Karttunen <taruti@taruti.net> 2011-03-30 15:46:40 +0300
commit: e5888a1ffdae813d7575f5fb02275c6bb07e5199 (patch)
tree: d8d51eac403f07814b9e936eed0c9a79195e2450 /sys/src/cmd/webfs/url.c
1 files changed, 1092 insertions, 0 deletions
diff --git a/sys/src/cmd/webfs/url.c b/sys/src/cmd/webfs/url.c
new file mode 100755
index 000000000..f46c8b47b
--- /dev/null
+++ b/sys/src/cmd/webfs/url.c
@@ -0,0 +1,1092 @@
+/*
+ * This is a URL parser, written to parse "Common Internet Scheme" URL
+ * syntax as described in RFC1738 and updated by RFC2396.  Only absolute URLs 
+ * are supported, using "server-based" naming authorities in the schemes.
+ * Support for literal IPv6 addresses is included, per RFC2732.
+ *
+ * Current "known" schemes: http, ftp, file.
+ *
+ * We can do all the parsing operations without Runes since URLs are
+ * defined to be composed of US-ASCII printable characters.
+ * See RFC1738, RFC2396.
+ */
+
+#include <u.h>
+#include <libc.h>
+#include <ctype.h>
+#include <regexp.h>
+#include <plumb.h>
+#include <thread.h>
+#include <fcall.h>
+#include <9p.h>
+#include "dat.h"
+#include "fns.h"
+
+int urldebug;
+
+/* If set, relative paths with leading ".." segments will have them trimmed */
+#define RemoveExtraRelDotDots	0
+#define ExpandCurrentDocUrls	1
+
+static char*
+schemestrtab[] =
+{
+	nil,
+	"http",
+	"https",
+	"ftp",
+	"file",
+};
+
+static int
+ischeme(char *s)
+{
+	int i;
+
+	for(i=0; i<nelem(schemestrtab); i++)
+		if(schemestrtab[i] && strcmp(s, schemestrtab[i])==0)
+			return i;
+	return USunknown;
+}
+
+/*
+ * URI splitting regexp is from RFC2396, Appendix B: 
+ *		^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
+ *		 12            3  4          5       6  7        8 9
+ *
+ * Example: "http://www.ics.uci.edu/pub/ietf/uri/#Related"
+ * $2 = scheme			"http"
+ * $4 = authority		"www.ics.uci.edu"
+ * $5 = path			"/pub/ietf/uri/"
+ * $7 = query			<undefined>
+ * $9 = fragment		"Related"
+ */
+
+/*
+ * RFC2396, Sec 3.1, contains:
+ *
+ * Scheme names consist of a sequence of characters beginning with a
+ * lower case letter and followed by any combination of lower case
+ * letters, digits, plus ("+"), period ("."), or hyphen ("-").  For
+ * resiliency, programs interpreting URI should treat upper case letters
+ * as equivalent to lower case in scheme names (e.g., allow "HTTP" as
+ * well as "http").
+ */
+
+/*
+ * For server-based naming authorities (RFC2396 Sec 3.2.2):
+ *    server        = [ [ userinfo "@" ] hostport ]
+ *    userinfo      = *( unreserved | escaped |
+ *                      ";" | ":" | "&" | "=" | "+" | "$" | "," )
+ *    hostport      = host [ ":" port ]
+ *    host          = hostname | IPv4address
+ *    hostname      = *( domainlabel "." ) toplabel [ "." ]
+ *    domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum
+ *    toplabel      = alpha | alpha *( alphanum | "-" ) alphanum
+ *    IPv4address   = 1*digit "." 1*digit "." 1*digit "." 1*digit
+ *    port          = *digit
+ *
+ *  The host is a domain name of a network host, or its IPv4 address as a
+ *  set of four decimal digit groups separated by ".".  Literal IPv6
+ *  addresses are not supported.
+ *
+ * Note that literal IPv6 address support is outlined in RFC2732:
+ *    host          = hostname | IPv4address | IPv6reference
+ *    ipv6reference = "[" IPv6address "]"		(RFC2373)
+ *
+ * Since hostnames and numbers will have to be resolved by the OS anyway,
+ * we don't have to parse them too pedantically (counting '.'s, checking 
+ * for well-formed literal IP addresses, etc.).
+ *
+ * In FTP/file paths, we reject most ";param"s and querys.  In HTTP paths,
+ * we just pass them through.
+ *
+ * Instead of letting a "path" be 0-or-more characters as RFC2396 suggests, 
+ * we'll say it's 1-or-more characters, 0-or-1 times.  This way, an absent
+ * path yields a nil substring match, instead of an empty one.
+ *
+ * We're more restrictive than RFC2396 indicates with "userinfo" strings,
+ * insisting they have the form "[user[:password]]".  This may need to
+ * change at some point, however.
+ */
+
+/* RE character-class components -- these go in brackets */
+#define PUNCT			"\\-_.!~*'()"
+#define RES			";/?:@&=+$,"
+#define ALNUM		"a-zA-Z0-9"
+#define HEX			"0-9a-fA-F"
+#define UNRES			ALNUM PUNCT
+
+/* RE components; _N => has N parenthesized subexpressions when expanded */
+#define ESCAPED_1			"(%[" HEX "][" HEX "])"
+#define URIC_2			"([" RES UNRES "]|" ESCAPED_1 ")"
+#define URICNOSLASH_2		"([" UNRES ";?:@&=+$,]|" ESCAPED_1 ")"
+#define USERINFO_2		"([" UNRES ";:&=+$,]|" ESCAPED_1 ")"
+#define PCHAR_2			"([" UNRES ":@&=+$,]|" ESCAPED_1 ")"
+#define PSEGCHAR_3		"([/;]|" PCHAR_2 ")"
+
+typedef struct Retab Retab;
+struct Retab
+{
+	char	*str;
+	Reprog	*prog;
+	int		size;
+	int		ind[5];
+};
+
+enum
+{
+	REsplit = 0,
+	REscheme,
+	REunknowndata,
+	REauthority,
+	REhost,
+	REuserinfo,
+	REabspath,
+	REquery,
+	REfragment,
+	REhttppath,
+	REftppath,
+	REfilepath,
+
+	MaxResub=	20,
+};
+
+Retab retab[] =	/* view in constant width Font */
+{
+[REsplit]
+	"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]+)?(\\?([^#]*))?(#(.*))?$", nil, 0,
+	/* |-scheme-|      |-auth.-|  |path--|    |query|     |--|frag */
+	{  2,              4,         5,          7,          9},
+
+[REscheme]
+	"^[a-z][a-z0-9+-.]*$", nil, 0,
+	{ 0, },
+
+[REunknowndata]
+	"^" URICNOSLASH_2 URIC_2 "*$", nil, 0,
+	{ 0, },
+
+[REauthority]
+	"^(((" USERINFO_2 "*)@)?(((\\[[^\\]@]+\\])|([^:\\[@]+))(:([0-9]*))?)?)?$", nil, 0,
+	/* |----user info-----|  |--------host----------------|  |-port-| */
+	{  3,                    7,                              11, },
+
+[REhost]
+	"^(([a-zA-Z0-9\\-.]+)|(\\[([a-fA-F0-9.:]+)\\]))$", nil, 0,
+	/* |--regular host--|     |-IPv6 literal-| */
+	{  2,                     4, },
+
+[REuserinfo]
+	"^(([^:]*)(:([^:]*))?)$", nil, 0,
+	/* |user-|  |pass-| */
+	{  2,       4, },
+
+[REabspath]
+	"^/" PSEGCHAR_3 "*$", nil, 0,
+	{ 0, },
+
+[REquery]
+	"^" URIC_2 "*$", nil, 0,
+	{ 0, },
+
+[REfragment]
+	"^" URIC_2 "*$", nil, 0,
+	{ 0, },
+
+[REhttppath]
+	"^.*$", nil, 0,
+	{ 0, },
+
+[REftppath]
+	"^(.+)(;[tT][yY][pP][eE]=([aAiIdD]))?$", nil, 0,
+	/*|--|-path              |ftptype-| */
+	{ 1,                     3, }, 
+
+[REfilepath]
+	"^.*$", nil, 0,
+	{ 0, },
+};
+
+static int
+countleftparen(char *s)
+{
+	int n;
+
+	n = 0;
+	for(; *s; s++)
+		if(*s == '(')
+			n++;
+	return n;
+}
+
+void
+initurl(void)
+{
+	int i, j;
+
+	for(i=0; i<nelem(retab); i++){
+		retab[i].prog = regcomp(retab[i].str);
+		if(retab[i].prog == nil)
+			sysfatal("recomp(%s): %r", retab[i].str);
+		retab[i].size = countleftparen(retab[i].str)+1;
+		for(j=0; j<nelem(retab[i].ind); j++)
+			if(retab[i].ind[j] >= retab[i].size)
+				sysfatal("bad index in regexp table: retab[%d].ind[%d] = %d >= %d",
+					i, j, retab[i].ind[j], retab[i].size);
+		if(MaxResub < retab[i].size)
+			sysfatal("MaxResub too small: %d < %d", MaxResub, retab[i].size);
+	}
+}
+
+typedef struct SplitUrl SplitUrl;
+struct SplitUrl
+{
+	struct {
+		char *s;
+		char *e;
+	} url, scheme, authority, path, query, fragment;
+};
+
+/*
+ * Implements the algorithm in RFC2396 sec 5.2 step 6.
+ * Returns number of chars written, excluding NUL terminator.
+ * dest is known to be >= strlen(base)+rel_len.
+ */
+static void
+merge_relative_path(char *base, char *rel_st, int rel_len, char *dest)
+{
+	char *s, *p, *e, *pdest;
+
+	pdest = dest;
+
+	/* 6a: start with base, discard last segment */
+	if(base && base[0]){
+		/* Empty paths don't match in our scheme; 'base' should be nil */
+		assert(base[0] == '/');
+		e = strrchr(base, '/');
+		e++;
+		memmove(pdest, base, e-base);
+		pdest += e-base;
+	}else{
+		/* Artistic license on my part */
+		*pdest++ = '/';
+	}
+
+	/* 6b: append relative component */
+	if(rel_st){
+		memmove(pdest, rel_st, rel_len);
+		pdest += rel_len;
+	}
+
+	/* 6c: remove any occurrences of "./" as a complete segment */
+	s = dest;
+	*pdest = '\0';
+	while(e = strstr(s, "./")){
+		if((e == dest) || (*(e-1) == '/')){
+ 			memmove(e, e+2, pdest+1-(e+2));	/* +1 for NUL */
+			pdest -= 2;
+		}else
+			s = e+1;
+	}
+
+	/* 6d: remove a trailing "." as a complete segment */
+	if(pdest>dest && *(pdest-1)=='.' && 
+	  (pdest==dest+1 || *(pdest-2)=='/'))
+		*--pdest = '\0';
+
+	/* 6e: remove occurences of "seg/../", where seg != "..", left->right */
+	s = dest+1;
+	while(e = strstr(s, "/../")){
+		p = e - 1;
+		while(p >= dest && *p != '/')
+			p--;
+		if(memcmp(p, "/../", 4) != 0){
+			memmove(p+1, e+4, pdest+1-(e+4));
+			pdest -= (e+4) - (p+1);
+		}else
+			s = e+1;
+	}
+
+	/* 6f: remove a trailing "seg/..", where seg isn't ".."  */
+	if(pdest-3 > dest && memcmp(pdest-3, "/..", 3)==0){
+		p = pdest-3 - 1;
+		while(p >= dest && *p != '/')
+			p--;
+		if(memcmp(p, "/../", 4) != 0){
+			pdest = p+1;
+			*pdest = '\0';
+		}
+	}
+
+	/* 6g: leading ".." segments are errors -- we'll just blat them out. */
+	if(RemoveExtraRelDotDots){
+		p = dest;
+		if (p[0] == '/')
+			p++;
+		s = p;
+		while(s[0]=='.' && s[1]=='.' && (s[2]==0 || s[2]=='/'))
+			s += 3;
+		if(s > p){
+			memmove(p, s, pdest+1-s);
+			pdest -= s-p;
+		}
+	}
+	USED(pdest);
+
+	if(urldebug)
+		fprint(2, "merge_relative_path: '%s' + '%.*s' -> '%s'\n", base, rel_len, 
+			rel_st, dest);
+}
+
+/*
+ * See RFC2396 sec 5.2 for info on resolving relative URIs to absolute form.
+ *
+ * If successful, this just ends up freeing and replacing "u->url".
+ */
+static int
+resolve_relative(SplitUrl *su, Url *base, Url *u)
+{
+	char *url, *path;
+	char *purl, *ppath;
+	int currentdoc, ulen, plen;
+
+	if(base == nil){
+		werrstr("relative URI given without base");
+		return -1;
+	}
+	if(base->scheme == nil){
+		werrstr("relative URI given with no scheme");
+		return -1;
+	}
+	if(base->ischeme == USunknown){
+		werrstr("relative URI given with unknown scheme");
+		return -1;
+	}
+	if(base->ischeme == UScurrent){
+		werrstr("relative URI given with incomplete base");
+		return -1;
+	}
+	assert(su->scheme.s == nil);
+
+	/* Sec 5.2 step 2 */
+	currentdoc = 0;
+	if(su->path.s==nil && su->scheme.s==nil && su->authority.s==nil && su->query.s==nil){
+		/* Reference is to current document */
+		if(urldebug)
+			fprint(2, "url %s is relative to current document\n", u->url);
+		u->ischeme = UScurrent;
+		if(!ExpandCurrentDocUrls)
+			return 0;
+		currentdoc = 1;
+	}
+	
+	/* Over-estimate the maximum lengths, for allocation purposes */
+	/* (constants are for separators) */
+	plen = 1;
+	if(base->path)
+		plen += strlen(base->path);
+	if(su->path.s)
+		plen += 1 + (su->path.e - su->path.s);
+
+	ulen = 0;
+	ulen += strlen(base->scheme) + 1;
+	if(su->authority.s)
+		ulen += 2 + (su->authority.e - su->authority.s);
+	else
+		ulen += 2 + ((base->authority) ? strlen(base->authority) : 0);
+	ulen += plen;
+	if(su->query.s)
+		ulen += 1 + (su->query.e - su->query.s);
+	else if(currentdoc && base->query)
+		ulen += 1 + strlen(base->query);
+	if(su->fragment.s)
+		ulen += 1 + (su->fragment.e - su->fragment.s);
+	else if(currentdoc && base->fragment)
+		ulen += 1 + strlen(base->fragment);
+	url = emalloc(ulen+1);
+	path = emalloc(plen+1);
+
+	url[0] = '\0';
+	purl = url;
+	path[0] = '\0';
+	ppath = path;
+
+	if(su->authority.s || (su->path.s && (su->path.s[0] == '/'))){
+		/* Is a "network-path" or "absolute-path"; don't merge with base path */
+		/* Sec 5.2 steps 4,5 */
+		if(su->path.s){
+			memmove(ppath, su->path.s, su->path.e - su->path.s);
+			ppath += su->path.e - su->path.s;
+			*ppath = '\0';
+		}
+	}else if(currentdoc){
+		/* Is a current-doc reference; just copy the path from the base URL */
+		if(base->path){
+			strcpy(ppath, base->path);
+			ppath += strlen(ppath);
+		}
+		USED(ppath);
+	}else{
+		/* Is a relative-path reference; we have to merge it */
+		/* Sec 5.2 step 6 */
+		merge_relative_path(base->path,
+			su->path.s, su->path.e - su->path.s, ppath);
+	}
+
+	/* Build new URL from pieces, inheriting from base where needed */
+	strcpy(purl, base->scheme);
+	purl += strlen(purl);
+	*purl++ = ':';
+	if(su->authority.s){
+		strcpy(purl, "//");
+		purl += strlen(purl);
+		memmove(purl, su->authority.s, su->authority.e - su->authority.s);
+		purl += su->authority.e - su->authority.s;
+	}else if(base->authority){
+		strcpy(purl, "//");
+		purl += strlen(purl);
+		strcpy(purl, base->authority);
+		purl += strlen(purl);
+	}
+	assert((path[0] == '\0') || (path[0] == '/'));
+	strcpy(purl, path);
+	purl += strlen(purl);
+
+	/*
+	 * The query and fragment are not inherited from the base,
+	 * except in case of "current document" URLs, which inherit any query
+	 * and may inherit the fragment.
+	 */
+	if(su->query.s){
+		*purl++ = '?';
+		memmove(purl, su->query.s, su->query.e - su->query.s);
+		purl += su->query.e - su->query.s;
+	}else if(currentdoc && base->query){
+		*purl++ = '?';
+		strcpy(purl, base->query);
+		purl += strlen(purl);
+	}
+
+	if(su->fragment.s){
+		*purl++ = '#';
+		memmove(purl, su->query.s, su->query.e - su->query.s);
+		purl += su->fragment.e - su->fragment.s;
+	}else if(currentdoc && base->fragment){
+		*purl++ = '#';
+		strcpy(purl, base->fragment);
+		purl += strlen(purl);
+	}
+	USED(purl);
+
+	if(urldebug)
+		fprint(2, "resolve_relative: '%s' + '%s' -> '%s'\n", base->url, u->url, url);
+	free(u->url);
+	u->url = url;
+	free(path);
+	return 0;
+}
+
+int
+regx(Reprog *prog, char *s, Resub *m, int nm)
+{
+	int i;
+
+	if(s == nil)
+		s = m[0].sp;	/* why is this necessary? */
+
+	i = regexec(prog, s, m, nm);
+/*
+	if(i >= 0)
+		for(j=0; j<nm; j++)
+			fprint(2, "match%d: %.*s\n", j, utfnlen(m[j].sp, m[j].ep-m[j].sp), m[j].sp);
+*/
+	return i;
+}
+
+static int
+ismatch(int i, char *s, char *desc)
+{
+	Resub m[1];
+
+	m[0].sp = m[0].ep = nil;
+	if(!regx(retab[i].prog, s, m, 1)){
+		werrstr("malformed %s: %q", desc, s);
+		return 0;
+	}
+	return 1;
+}
+
+static int
+spliturl(char *url, SplitUrl *su)
+{
+	Resub m[MaxResub];
+	Retab *t;
+
+	/*
+	 * Newlines are not valid in a URI, but regexp(2) treats them specially 
+	 * so it's best to make sure there are none before proceeding.
+	 */
+	if(strchr(url, '\n')){
+		werrstr("newline in URI");
+		return -1;
+	}
+
+	/*
+	 * Because we use NUL-terminated strings, as do many client and server
+	 * implementations, an escaped NUL ("%00") will quite likely cause problems
+	 * when unescaped.  We can check for such a sequence once before examining
+ 	 * the components because, per RFC2396 sec. 2.4.1 - 2.4.2, '%' is reserved
+	 * in URIs to _always_ indicate escape sequences.  Something like "%2500"
+	 * will still get by, but that's legitimate, and if it ends up causing
+	 * a NUL then someone is unescaping too many times.
+	 */
+	if(strstr(url, "%00")){
+		werrstr("escaped NUL in URI");
+		return -1;
+	}
+
+	m[0].sp = m[0].ep = nil;
+	t = &retab[REsplit];
+	if(!regx(t->prog, url, m, t->size)){
+		werrstr("malformed URI: %q", url);
+		return -1;
+	}
+
+	su->url.s = m[0].sp;
+	su->url.e = m[0].ep;
+	su->scheme.s = m[t->ind[0]].sp;
+	su->scheme.e = m[t->ind[0]].ep;
+	su->authority.s = m[t->ind[1]].sp;
+	su->authority.e = m[t->ind[1]].ep;
+	su->path.s = m[t->ind[2]].sp;
+	su->path.e = m[t->ind[2]].ep;
+	su->query.s = m[t->ind[3]].sp;
+	su->query.e = m[t->ind[3]].ep;
+	su->fragment.s = m[t->ind[4]].sp;
+	su->fragment.e = m[t->ind[4]].ep;
+
+	if(urldebug)
+		fprint(2, "split url %s into %.*q %.*q %.*q %.*q %.*q %.*q\n",
+			url,
+			su->url.s ? utfnlen(su->url.s, su->url.e-su->url.s) : 10, su->url.s ? su->url.s : "",
+			su->scheme.s ? utfnlen(su->scheme.s, su->scheme.e-su->scheme.s) : 10, su->scheme.s ? su->scheme.s : "",
+			su->authority.s ? utfnlen(su->authority.s, su->authority.e-su->authority.s) : 10, su->authority.s ? su->authority.s : "",
+			su->path.s ? utfnlen(su->path.s, su->path.e-su->path.s) : 10, su->path.s ? su->path.s : "",
+			su->query.s ? utfnlen(su->query.s, su->query.e-su->query.s) : 10, su->query.s ? su->query.s : "",
+			su->fragment.s ? utfnlen(su->fragment.s, su->fragment.e-su->fragment.s) : 10, su->fragment.s ? su->fragment.s : "");
+
+	return 0;
+}
+
+static int
+parse_scheme(SplitUrl *su, Url *u)
+{
+	if(su->scheme.s == nil){
+		werrstr("missing scheme");
+		return -1;
+	}
+	u->scheme = estredup(su->scheme.s, su->scheme.e);
+	strlower(u->scheme);
+
+	if(!ismatch(REscheme, u->scheme, "scheme"))
+		return -1;
+
+	u->ischeme = ischeme(u->scheme);
+	if(urldebug)
+		fprint(2, "parse_scheme %s => %d\n", u->scheme, u->ischeme);
+	return 0;
+}
+
+static int
+parse_unknown_part(SplitUrl *su, Url *u)
+{
+	char *s, *e;
+
+	assert(u->ischeme == USunknown);
+	assert(su->scheme.e[0] == ':');
+
+	s = su->scheme.e+1;
+	if(su->fragment.s){
+		e = su->fragment.s-1;
+		assert(*e == '#');
+	}else
+		e = s+strlen(s);
+
+	u->schemedata = estredup(s, e);
+	if(!ismatch(REunknowndata, u->schemedata, "unknown scheme data"))
+		return -1;
+	return 0;
+}
+
+static int
+parse_userinfo(char *s, char *e, Url *u)
+{
+	Resub m[MaxResub];
+	Retab *t;
+
+	m[0].sp = s;
+	m[0].ep = e;
+	t = &retab[REuserinfo];
+	if(!regx(t->prog, nil, m, t->size)){
+		werrstr("malformed userinfo: %.*q", utfnlen(s, e-s), s);
+		return -1;
+	}
+	if(m[t->ind[0]].sp)
+		u->user = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
+	if(m[t->ind[1]].sp)
+		u->user = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
+	return 0;
+}
+
+static int
+parse_host(char *s, char *e, Url *u)
+{
+	Resub m[MaxResub];
+	Retab *t;
+
+	m[0].sp = s;
+	m[0].ep = e;
+	t = &retab[REhost];
+	if(!regx(t->prog, nil, m, t->size)){
+		werrstr("malformed host: %.*q", utfnlen(s, e-s), s);
+		return -1;
+	}
+
+	assert(m[t->ind[0]].sp || m[t->ind[1]].sp);
+
+	if(m[t->ind[0]].sp)	/* regular */
+		u->host = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
+	else
+		u->host = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
+	return 0;
+}
+
+static int
+parse_authority(SplitUrl *su, Url *u)
+{
+	Resub m[MaxResub];
+	Retab *t;
+	char *host;
+	char *userinfo;
+
+	if(su->authority.s == nil)
+		return 0;
+
+	u->authority = estredup(su->authority.s, su->authority.e);
+	m[0].sp = m[0].ep = nil;
+	t = &retab[REauthority];
+	if(!regx(t->prog, u->authority, m, t->size)){
+		werrstr("malformed authority: %q", u->authority);
+		return -1;
+	}
+
+	if(m[t->ind[0]].sp)
+		if(parse_userinfo(m[t->ind[0]].sp, m[t->ind[0]].ep, u) < 0)
+			return -1;
+	if(m[t->ind[1]].sp)
+		if(parse_host(m[t->ind[1]].sp, m[t->ind[1]].ep, u) < 0)
+			return -1;
+	if(m[t->ind[2]].sp)
+		u->port = estredup(m[t->ind[2]].sp, m[t->ind[2]].ep);
+
+
+	if(urldebug > 0){
+		userinfo = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep); 
+		host = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
+		fprint(2, "port: %q, authority %q\n", u->port, u->authority);
+		fprint(2, "host %q, userinfo %q\n", host, userinfo);
+		free(host);
+		free(userinfo);
+	}
+	return 0;
+}
+
+static int
+parse_abspath(SplitUrl *su, Url *u)
+{
+	if(su->path.s == nil)
+		return 0;
+	u->path = estredup(su->path.s, su->path.e);
+	if(!ismatch(REabspath, u->path, "absolute path"))
+		return -1;
+	return 0;
+}
+
+static int
+parse_query(SplitUrl *su, Url *u)
+{
+	if(su->query.s == nil)
+		return 0;
+	u->query = estredup(su->query.s, su->query.e);
+	if(!ismatch(REquery, u->query, "query"))
+		return -1;
+	return 0;
+}
+
+static int
+parse_fragment(SplitUrl *su, Url *u)
+{
+	if(su->fragment.s == nil)
+		return 0;
+	u->fragment = estredup(su->fragment.s, su->fragment.e);
+	if(!ismatch(REfragment, u->fragment, "fragment"))
+		return -1;
+	return 0;
+}
+
+static int
+postparse_http(Url *u)
+{
+	u->open = httpopen;
+	u->read = httpread;
+	u->close = httpclose;
+
+	if(u->authority==nil){
+		werrstr("missing authority (hostname, port, etc.)");
+		return -1;
+	}
+	if(u->host == nil){
+		werrstr("missing host specification");
+		return -1;
+	}
+
+	if(u->path == nil){
+		u->http.page_spec = estrdup("/");
+		return 0;
+	}
+
+	if(!ismatch(REhttppath, u->path, "http path"))
+		return -1;
+	if(u->query){
+		u->http.page_spec = emalloc(strlen(u->path)+1+strlen(u->query)+1);
+		strcpy(u->http.page_spec, u->path);
+		strcat(u->http.page_spec, "?");
+		strcat(u->http.page_spec, u->query);
+	}else
+		u->http.page_spec = estrdup(u->path);
+
+	return 0;
+}
+
+static int
+postparse_ftp(Url *u)
+{
+	Resub m[MaxResub];
+	Retab *t;
+
+	if(u->authority==nil){
+		werrstr("missing authority (hostname, port, etc.)");
+		return -1;
+	}
+	if(u->query){
+		werrstr("unexpected \"?query\" in ftp path");
+		return -1;
+	}
+	if(u->host == nil){
+		werrstr("missing host specification");
+		return -1;
+	}
+
+	if(u->path == nil){
+		u->ftp.path_spec = estrdup("/");
+		return 0;
+	}
+
+	m[0].sp = m[0].ep = nil;
+	t = &retab[REftppath];
+	if(!regx(t->prog, u->path, m, t->size)){
+		werrstr("malformed ftp path: %q", u->path);
+		return -1;
+	}
+
+	if(m[t->ind[0]].sp){
+		u->ftp.path_spec = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
+		if(strchr(u->ftp.path_spec, ';')){
+			werrstr("unexpected \";param\" in ftp path");
+			return -1;
+		}
+	}else
+		u->ftp.path_spec = estrdup("/");
+
+	if(m[t->ind[1]].sp){
+		u->ftp.type = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
+		strlower(u->ftp.type);
+	}
+	return 0;
+}
+
+static int
+postparse_file(Url *u)
+{
+	if(u->user || u->passwd){
+		werrstr("user information not valid with file scheme");
+		return -1;
+	}
+	if(u->query){
+		werrstr("unexpected \"?query\" in file path");
+		return -1;
+	}
+	if(u->port){
+		werrstr("port not valid with file scheme");
+		return -1;
+	}
+	if(u->path == nil){
+		werrstr("missing path in file scheme");
+		return -1;
+	}
+	if(strchr(u->path, ';')){
+		werrstr("unexpected \";param\" in file path");
+		return -1;
+	}
+
+	if(!ismatch(REfilepath, u->path, "file path"))
+		return -1;
+
+	/* "localhost" is equivalent to no host spec, we'll chose the latter */
+	if(u->host && cistrcmp(u->host, "localhost") == 0){
+		free(u->host);
+		u->host = nil;
+	}
+	return 0;
+}
+
+static int (*postparse[])(Url*) = {
+	nil,
+	postparse_http,
+	postparse_http,
+	postparse_ftp,
+	postparse_file,
+};
+
+Url*
+parseurl(char *url, Url *base)
+{
+	Url *u;
+	SplitUrl su;
+
+	if(urldebug)
+		fprint(2, "parseurl %s with base %s\n", url, base ? base->url : "<none>");
+
+	u = emalloc(sizeof(Url));
+	u->url = estrdup(url);
+	if(spliturl(u->url, &su) < 0){
+	Fail:
+		freeurl(u);
+		return nil;
+	}
+
+	/* RFC2396 sec 3.1 says relative URIs are distinguished by absent scheme */ 
+	if(su.scheme.s==nil){
+		if(urldebug)
+			fprint(2, "parseurl has nil scheme\n");
+		if(resolve_relative(&su, base, u) < 0 || spliturl(u->url, &su) < 0)
+			goto Fail;
+		if(u->ischeme == UScurrent){
+			/* 'u.url' refers to current document; set fragment and return */
+			if(parse_fragment(&su, u) < 0)
+				goto Fail;
+			return u;
+		}
+	}
+
+	if(parse_scheme(&su, u) < 0
+	|| parse_fragment(&su, u) < 0)
+		goto Fail;
+
+	if(u->ischeme == USunknown){
+		if(parse_unknown_part(&su, u) < 0)
+			goto Fail;
+		return u;
+	}
+
+	if(parse_query(&su, u) < 0
+	|| parse_authority(&su, u) < 0
+	|| parse_abspath(&su, u) < 0)
+		goto Fail;
+
+	if(u->ischeme < nelem(postparse) && postparse[u->ischeme])
+		if((*postparse[u->ischeme])(u) < 0)
+			goto Fail;
+
+	setmalloctag(u, getcallerpc(&url));
+	return u;
+}
+
+void
+freeurl(Url *u)
+{
+	if(u == nil)
+		return;
+	free(u->url);
+	free(u->scheme);
+	free(u->schemedata);
+	free(u->authority);
+	free(u->user);
+	free(u->passwd);
+	free(u->host);
+	free(u->port);
+	free(u->path);
+	free(u->query);
+	free(u->fragment);
+	switch(u->ischeme){
+	case UShttp:
+		free(u->http.page_spec);
+		break;
+	case USftp:
+		free(u->ftp.path_spec);
+		free(u->ftp.type);
+		break;
+	}
+	free(u);
+}
+
+void
+rewriteurl(Url *u)
+{
+	char *s;
+
+	if(u->schemedata)
+		s = estrmanydup(u->scheme, ":", u->schemedata, nil);
+	else
+		s = estrmanydup(u->scheme, "://", 
+			u->user ? u->user : "",
+			u->passwd ? ":" : "", u->passwd ? u->passwd : "",
+			u->user ? "@" : "", u->host ? u->host : "", 
+			u->port ? ":" : "", u->port ? u->port : "",
+			u->path,
+			u->query ? "?" : "", u->query ? u->query : "",
+			u->fragment ? "#" : "", u->fragment ? u->fragment : "",
+			nil);
+	free(u->url);
+	u->url = s;
+}
+
+int
+seturlquery(Url *u, char *query)
+{
+	if(query == nil){
+		free(u->query);
+		u->query = nil;
+		return 0;
+	}
+
+	if(!ismatch(REquery, query, "query"))
+		return -1;
+
+	free(u->query);
+	u->query = estrdup(query);
+	return 0;
+}
+
+static void
+dupp(char **p)
+{
+	if(*p)
+		*p = estrdup(*p);
+}
+
+Url*
+copyurl(Url *u)
+{
+	Url *v;
+
+	v = emalloc(sizeof(Url));
+	*v = *u;
+	dupp(&v->url);
+	dupp(&v->scheme);
+	dupp(&v->schemedata);
+	dupp(&v->authority);
+	dupp(&v->user);
+	dupp(&v->passwd);
+	dupp(&v->host);
+	dupp(&v->port);
+	dupp(&v->path);
+	dupp(&v->query);
+	dupp(&v->fragment);
+
+	switch(v->ischeme){
+	case UShttp:
+		dupp(&v->http.page_spec);
+		break;
+	case USftp:
+		dupp(&v->ftp.path_spec);
+		dupp(&v->ftp.type);
+		break;
+	}
+	return v;
+}
+
+static int
+dhex(char c)
+{
+	if('0' <= c && c <= '9')
+		return c-'0';
+	if('a' <= c && c <= 'f')
+		return c-'a'+10;
+	if('A' <= c && c <= 'F')
+		return c-'A'+10;
+	return 0;
+}
+
+char*
+escapeurl(char *s, int (*needesc)(int))
+{
+	int n;
+	char *t, *u;
+	Rune r;
+	static char *hex = "0123456789abcdef";
+
+	n = 0;
+	for(t=s; *t; t++)
+		if((*needesc)(*t))
+			n++;
+
+	u = emalloc(strlen(s)+2*n+1);
+	t = u;
+	for(; *s; s++){
+		s += chartorune(&r, s);
+		if(r >= 0xFF){
+			werrstr("URLs cannot contain Runes > 0xFF");
+			free(t);
+			return nil;
+		}
+		if((*needesc)(r)){
+			*u++ = '%';
+			*u++ = hex[(r>>4)&0xF];
+			*u++ = hex[r&0xF];
+		}else
+			*u++ = r;
+	}
+	*u = '\0';
+	return t;
+}
+
+char*
+unescapeurl(char *s)
+{
+	char *r, *w;
+	Rune rune;
+
+	s = estrdup(s);
+	for(r=w=s; *r; r++){
+		if(*r=='%'){
+			r++;
+			if(!isxdigit(r[0]) || !isxdigit(r[1])){
+				werrstr("bad escape sequence '%.3s' in URL", r);
+				return nil;
+			}
+			if(r[0]=='0' && r[2]=='0'){
+				werrstr("escaped NUL in URL");
+				return nil;
+			}
+			rune = (dhex(r[0])<<4)|dhex(r[1]);	/* latin1 */
+			w += runetochar(w, &rune);
+			r += 2;
+		}else
+			*w++ = *r;
+	}
+	*w = '\0';
+	return s;
+}
+
author	Taru Karttunen <taruti@taruti.net>	2011-03-30 15:46:40 +0300
committer	Taru Karttunen <taruti@taruti.net>	2011-03-30 15:46:40 +0300
commit	e5888a1ffdae813d7575f5fb02275c6bb07e5199 (patch)
tree	d8d51eac403f07814b9e936eed0c9a79195e2450 /sys/src/cmd/webfs/url.c