new webfs, rc based hget

author: cinap_lenrek <cinap_lenrek@rei2.9hal> 2012-01-11 16:17:54 +0100
committer: cinap_lenrek <cinap_lenrek@rei2.9hal> 2012-01-11 16:17:54 +0100
commit: 75e1ef0ab60acb6bccc54254b82770aec5786ead (patch)
tree: d273fc755a20e67801aa0a13df30ab75b2883419 /sys/src/cmd/webfs/url.c
parent: 62fb4f97177d8e76f1fd49bb9d0073007b7c9bcc (diff)
1 files changed, 288 insertions, 948 deletions
diff --git a/sys/src/cmd/webfs/url.c b/sys/src/cmd/webfs/url.c
index c6c5695f0..2137010d1 100644
--- a/sys/src/cmd/webfs/url.c
+++ b/sys/src/cmd/webfs/url.c
@@ -1,871 +1,360 @@
-/*
- * This is a URL parser, written to parse "Common Internet Scheme" URL
- * syntax as described in RFC1738 and updated by RFC2396.  Only absolute URLs 
- * are supported, using "server-based" naming authorities in the schemes.
- * Support for literal IPv6 addresses is included, per RFC2732.
- *
- * Current "known" schemes: http, ftp, file.
- *
- * We can do all the parsing operations without Runes since URLs are
- * defined to be composed of US-ASCII printable characters.
- * See RFC1738, RFC2396.
- */
-
 #include <u.h>
 #include <libc.h>
 #include <ctype.h>
-#include <regexp.h>
-#include <plumb.h>
-#include <thread.h>
 #include <fcall.h>
+#include <thread.h>
 #include <9p.h>
+
 #include "dat.h"
 #include "fns.h"
 
-int urldebug;
-
-/* If set, relative paths with leading ".." segments will have them trimmed */
-#define RemoveExtraRelDotDots	0
-#define ExpandCurrentDocUrls	1
-
-static char*
-schemestrtab[] =
-{
-	nil,
-	"http",
-	"https",
-	"ftp",
-	"file",
-};
-
 static int
-ischeme(char *s)
-{
-	int i;
-
-	for(i=0; i<nelem(schemestrtab); i++)
-		if(schemestrtab[i] && strcmp(s, schemestrtab[i])==0)
-			return i;
-	return USunknown;
-}
-
-/*
- * URI splitting regexp is from RFC2396, Appendix B: 
- *		^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
- *		 12            3  4          5       6  7        8 9
- *
- * Example: "http://www.ics.uci.edu/pub/ietf/uri/#Related"
- * $2 = scheme			"http"
- * $4 = authority		"www.ics.uci.edu"
- * $5 = path			"/pub/ietf/uri/"
- * $7 = query			<undefined>
- * $9 = fragment		"Related"
- */
-
-/*
- * RFC2396, Sec 3.1, contains:
- *
- * Scheme names consist of a sequence of characters beginning with a
- * lower case letter and followed by any combination of lower case
- * letters, digits, plus ("+"), period ("."), or hyphen ("-").  For
- * resiliency, programs interpreting URI should treat upper case letters
- * as equivalent to lower case in scheme names (e.g., allow "HTTP" as
- * well as "http").
- */
-
-/*
- * For server-based naming authorities (RFC2396 Sec 3.2.2):
- *    server        = [ [ userinfo "@" ] hostport ]
- *    userinfo      = *( unreserved | escaped |
- *                      ";" | ":" | "&" | "=" | "+" | "$" | "," )
- *    hostport      = host [ ":" port ]
- *    host          = hostname | IPv4address
- *    hostname      = *( domainlabel "." ) toplabel [ "." ]
- *    domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum
- *    toplabel      = alpha | alpha *( alphanum | "-" ) alphanum
- *    IPv4address   = 1*digit "." 1*digit "." 1*digit "." 1*digit
- *    port          = *digit
- *
- *  The host is a domain name of a network host, or its IPv4 address as a
- *  set of four decimal digit groups separated by ".".  Literal IPv6
- *  addresses are not supported.
- *
- * Note that literal IPv6 address support is outlined in RFC2732:
- *    host          = hostname | IPv4address | IPv6reference
- *    ipv6reference = "[" IPv6address "]"		(RFC2373)
- *
- * Since hostnames and numbers will have to be resolved by the OS anyway,
- * we don't have to parse them too pedantically (counting '.'s, checking 
- * for well-formed literal IP addresses, etc.).
- *
- * In FTP/file paths, we reject most ";param"s and querys.  In HTTP paths,
- * we just pass them through.
- *
- * Instead of letting a "path" be 0-or-more characters as RFC2396 suggests, 
- * we'll say it's 1-or-more characters, 0-or-1 times.  This way, an absent
- * path yields a nil substring match, instead of an empty one.
- *
- * We're more restrictive than RFC2396 indicates with "userinfo" strings,
- * insisting they have the form "[user[:password]]".  This may need to
- * change at some point, however.
- */
-
-/* RE character-class components -- these go in brackets */
-#define PUNCT			"\\-_.!~*'()"
-#define ALNUM		"a-zA-Z0-9"
-#define HEX			"0-9a-fA-F"
-#define UNRES			ALNUM PUNCT
-
-/* RE components; _N => has N parenthesized subexpressions when expanded */
-#define USERINFO_2		"([" UNRES ";:&=+$,]|(%[" HEX "][" HEX "]))"
-
-typedef struct Retab Retab;
-struct Retab
-{
-	char	*str;
-	Reprog	*prog;
-	int		size;
-	int		ind[5];
-};
-
-enum
-{
-	REsplit = 0,
-	REscheme,
-	REauthority,
-	REhost,
-	REuserinfo,
-	REftppath,
-
-	MaxResub=	20,
-};
-
-Retab retab[] =	/* view in constant width Font */
-{
-[REsplit]
-	"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]+)?(\\?([^#]*))?(#(.*))?$", nil, 0,
-	/* |-scheme-|      |-auth.-|  |path--|    |query|     |--|frag */
-	{  2,              4,         5,          7,          9},
-
-[REscheme]
-	"^[a-z][a-z0-9+-.]*$", nil, 0,
-	{ 0, },
-
-[REauthority]
-	"^(((" USERINFO_2 "*)@)?(((\\[[^\\]@]+\\])|([^:\\[@]+))(:([0-9]*))?)?)?$", nil, 0,
-	/* |----user info-----|  |--------host----------------|  |-port-| */
-	{  3,                    7,                              11, },
-
-[REhost]
-	"^(([a-zA-Z0-9\\-.]+)|(\\[([a-fA-F0-9.:]+)\\]))$", nil, 0,
-	/* |--regular host--|     |-IPv6 literal-| */
-	{  2,                     4, },
-
-[REuserinfo]
-	"^(([^:]*)(:([^:]*))?)$", nil, 0,
-	/* |user-|  |pass-| */
-	{  2,       4, },
-
-[REftppath]
-	"^(.+)(;[tT][yY][pP][eE]=([aAiIdD]))?$", nil, 0,
-	/*|--|-path              |ftptype-| */
-	{ 1,                     3, }, 
-};
-
-static int
-countleftparen(char *s)
-{
-	int n;
-
-	n = 0;
-	for(; *s; s++)
-		if(*s == '(')
-			n++;
-	return n;
-}
-
-void
-initurl(void)
+dhex(char c)
 {
-	int i, j;
-
-	for(i=0; i<nelem(retab); i++){
-		retab[i].prog = regcomp(retab[i].str);
-		if(retab[i].prog == nil)
-			sysfatal("recomp(%s): %r", retab[i].str);
-		retab[i].size = countleftparen(retab[i].str)+1;
-		for(j=0; j<nelem(retab[i].ind); j++)
-			if(retab[i].ind[j] >= retab[i].size)
-				sysfatal("bad index in regexp table: retab[%d].ind[%d] = %d >= %d",
-					i, j, retab[i].ind[j], retab[i].size);
-		if(MaxResub < retab[i].size)
-			sysfatal("MaxResub too small: %d < %d", MaxResub, retab[i].size);
-	}
+	if('0' <= c && c <= '9')
+		return c-'0';
+	if('a' <= c && c <= 'f')
+		return c-'a'+10;
+	if('A' <= c && c <= 'F')
+		return c-'A'+10;
+	return 0;
 }
 
-typedef struct SplitUrl SplitUrl;
-struct SplitUrl
-{
-	struct {
-		char *s;
-		char *e;
-	} url, scheme, authority, path, query, fragment;
-};
-
-/*
- * Implements the algorithm in RFC2396 sec 5.2 step 6.
- * Returns number of chars written, excluding NUL terminator.
- * dest is known to be >= strlen(base)+rel_len.
- */
-static void
-merge_relative_path(char *base, char *rel_st, int rel_len, char *dest)
+static char*
+unescape(char *s, char *spec)
 {
-	char *s, *p, *e, *pdest;
-
-	pdest = dest;
-
-	/* 6a: start with base, discard last segment */
-	if(base && base[0]){
-		/* Empty paths don't match in our scheme; 'base' should be nil */
-		assert(base[0] == '/');
-		e = strrchr(base, '/');
-		e++;
-		memmove(pdest, base, e-base);
-		pdest += e-base;
-	}else{
-		/* Artistic license on my part */
-		*pdest++ = '/';
-	}
-
-	/* 6b: append relative component */
-	if(rel_st){
-		memmove(pdest, rel_st, rel_len);
-		pdest += rel_len;
-	}
-
-	/* 6c: remove any occurrences of "./" as a complete segment */
-	s = dest;
-	*pdest = '\0';
-	while(e = strstr(s, "./")){
-		if((e == dest) || (*(e-1) == '/')){
- 			memmove(e, e+2, pdest+1-(e+2));	/* +1 for NUL */
-			pdest -= 2;
-		}else
-			s = e+1;
-	}
-
-	/* 6d: remove a trailing "." as a complete segment */
-	if(pdest>dest && *(pdest-1)=='.' && 
-	  (pdest==dest+1 || *(pdest-2)=='/'))
-		*--pdest = '\0';
-
-	/* 6e: remove occurences of "seg/../", where seg != "..", left->right */
-	s = dest+1;
-	while(e = strstr(s, "/../")){
-		p = e - 1;
-		while(p >= dest && *p != '/')
-			p--;
-		if(memcmp(p, "/../", 4) != 0){
-			memmove(p+1, e+4, pdest+1-(e+4));
-			pdest -= (e+4) - (p+1);
-		}else
-			s = e+1;
-	}
-
-	/* 6f: remove a trailing "seg/..", where seg isn't ".."  */
-	if(pdest-3 > dest && memcmp(pdest-3, "/..", 3)==0){
-		p = pdest-3 - 1;
-		while(p >= dest && *p != '/')
-			p--;
-		if(memcmp(p, "/../", 4) != 0){
-			pdest = p+1;
-			*pdest = '\0';
-		}
-	}
+	char *r, *w;
+	uchar x;
 
-	/* 6g: leading ".." segments are errors -- we'll just blat them out. */
-	if(RemoveExtraRelDotDots){
-		p = dest;
-		if (p[0] == '/')
-			p++;
-		s = p;
-		while(s[0]=='.' && s[1]=='.' && (s[2]==0 || s[2]=='/'))
-			s += 3;
-		if(s > p){
-			memmove(p, s, pdest+1-s);
-			pdest -= s-p;
+	if(s == nil)
+		return s;
+	for(r=w=s; x = *r; r++){
+		if(x == '%' && isxdigit(r[1]) && isxdigit(r[2])){
+			x = (dhex(r[1])<<4)|dhex(r[2]);
+			if(x == 0 || (x > 0x1F && x < 0x7F && strchr(spec, x))){
+				*w++ = '%';
+				*w++ = toupper(r[1]);
+				*w++ = toupper(r[2]);
+			}
+			else
+				*w++ = x;
+			r += 2;
+			continue;
 		}
+		if(x == '+')
+			x = ' ';
+		*w++ = x;
 	}
-	USED(pdest);
-
-	if(urldebug)
-		fprint(2, "merge_relative_path: '%s' + '%.*s' -> '%s'\n", base, rel_len, 
-			rel_st, dest);
+	*w = 0;
+	return s;
 }
 
-/*
- * See RFC2396 sec 5.2 for info on resolving relative URIs to absolute form.
- *
- * If successful, this just ends up freeing and replacing "u->url".
- */
-static int
-resolve_relative(SplitUrl *su, Url *base, Url *u)
+int
+Efmt(Fmt *f)
 {
-	char *url, *path;
-	char *purl, *ppath;
-	int currentdoc, ulen, plen;
-
-	if(base == nil){
-		werrstr("relative URI given without base");
-		return -1;
-	}
-	if(base->scheme == nil){
-		werrstr("relative URI given with no scheme");
-		return -1;
-	}
-	if(base->ischeme == USunknown){
-		werrstr("relative URI given with unknown scheme");
-		return -1;
-	}
-	if(base->ischeme == UScurrent){
-		werrstr("relative URI given with incomplete base");
-		return -1;
-	}
-	assert(su->scheme.s == nil);
-
-	/* Sec 5.2 step 2 */
-	currentdoc = 0;
-	if(su->path.s==nil && su->scheme.s==nil && su->authority.s==nil && su->query.s==nil){
-		/* Reference is to current document */
-		if(urldebug)
-			fprint(2, "url %s is relative to current document\n", u->url);
-		u->ischeme = UScurrent;
-		if(!ExpandCurrentDocUrls)
-			return 0;
-		currentdoc = 1;
-	}
-	
-	/* Over-estimate the maximum lengths, for allocation purposes */
-	/* (constants are for separators) */
-	plen = 1;
-	if(base->path)
-		plen += strlen(base->path);
-	if(su->path.s)
-		plen += 1 + (su->path.e - su->path.s);
-
-	ulen = 0;
-	ulen += strlen(base->scheme) + 1;
-	if(su->authority.s)
-		ulen += 2 + (su->authority.e - su->authority.s);
-	else
-		ulen += 2 + ((base->authority) ? strlen(base->authority) : 0);
-	ulen += plen;
-	if(su->query.s)
-		ulen += 1 + (su->query.e - su->query.s);
-	else if(currentdoc && base->query)
-		ulen += 1 + strlen(base->query);
-	if(su->fragment.s)
-		ulen += 1 + (su->fragment.e - su->fragment.s);
-	else if(currentdoc && base->fragment)
-		ulen += 1 + strlen(base->fragment);
-	url = emalloc(ulen+1);
-	path = emalloc(plen+1);
+	char *s, *spec;
+	Str2 s2;
 
-	url[0] = '\0';
-	purl = url;
-	path[0] = '\0';
-	ppath = path;
-
-	if(su->authority.s || (su->path.s && (su->path.s[0] == '/'))){
-		/* Is a "network-path" or "absolute-path"; don't merge with base path */
-		/* Sec 5.2 steps 4,5 */
-		if(su->path.s){
-			memmove(ppath, su->path.s, su->path.e - su->path.s);
-			ppath += su->path.e - su->path.s;
-			*ppath = '\0';
-		}
-	}else if(currentdoc){
-		/* Is a current-doc reference; just copy the path from the base URL */
-		if(base->path){
-			strcpy(ppath, base->path);
-			ppath += strlen(ppath);
+	s2 = va_arg(f->args, Str2);
+	s = s2.s1;
+	spec = s2.s2;
+	for(; *s; s++)
+		if(*s == '%' && isxdigit(s[1]) && isxdigit(s[2])){
+			fmtprint(f, "%%%c%c", toupper(s[1]), toupper(s[2]));
+			s += 2;
 		}
-		USED(ppath);
-	}else{
-		/* Is a relative-path reference; we have to merge it */
-		/* Sec 5.2 step 6 */
-		merge_relative_path(base->path,
-			su->path.s, su->path.e - su->path.s, ppath);
-	}
-
-	/* Build new URL from pieces, inheriting from base where needed */
-	strcpy(purl, base->scheme);
-	purl += strlen(purl);
-	*purl++ = ':';
-	if(su->authority.s){
-		strcpy(purl, "//");
-		purl += strlen(purl);
-		memmove(purl, su->authority.s, su->authority.e - su->authority.s);
-		purl += su->authority.e - su->authority.s;
-	}else if(base->authority){
-		strcpy(purl, "//");
-		purl += strlen(purl);
-		strcpy(purl, base->authority);
-		purl += strlen(purl);
-	}
-	assert((path[0] == '\0') || (path[0] == '/'));
-	strcpy(purl, path);
-	purl += strlen(purl);
-
-	/*
-	 * The query and fragment are not inherited from the base,
-	 * except in case of "current document" URLs, which inherit any query
-	 * and may inherit the fragment.
-	 */
-	if(su->query.s){
-		*purl++ = '?';
-		memmove(purl, su->query.s, su->query.e - su->query.s);
-		purl += su->query.e - su->query.s;
-	}else if(currentdoc && base->query){
-		*purl++ = '?';
-		strcpy(purl, base->query);
-		purl += strlen(purl);
-	}
-
-	if(su->fragment.s){
-		*purl++ = '#';
-		memmove(purl, su->fragment.s, su->fragment.e - su->fragment.s);
-		purl += su->fragment.e - su->fragment.s;
-	}else if(currentdoc && base->fragment){
-		*purl++ = '#';
-		strcpy(purl, base->fragment);
-		purl += strlen(purl);
-	}
-	USED(purl);
-
-	if(urldebug)
-		fprint(2, "resolve_relative: '%s' + '%s' -> '%s'\n", base->url, u->url, url);
-	free(u->url);
-	u->url = url;
-	free(path);
+		else if(isalnum(*s) || strchr(".-_~!$&'()*,;=", *s) || strchr(spec, *s))
+			fmtprint(f, "%c", *s);
+		else
+			fmtprint(f, "%%%.2X", *s & 0xff);
 	return 0;
 }
 
 int
-regx(Reprog *prog, char *s, Resub *m, int nm)
-{
-	int i;
-
-	if(s == nil)
-		s = m[0].sp;	/* why is this necessary? */
-
-	i = regexec(prog, s, m, nm);
-/*
-	if(i >= 0)
-		for(j=0; j<nm; j++)
-			fprint(2, "match%d: %.*s\n", j, utfnlen(m[j].sp, m[j].ep-m[j].sp), m[j].sp);
-*/
-	return i;
-}
-
-static int
-ismatch(int i, char *s, char *desc)
+Ufmt(Fmt *f)
 {
-	Resub m[1];
-
-	m[0].sp = m[0].ep = nil;
-	if(!regx(retab[i].prog, s, m, 1)){
-		werrstr("malformed %s: %q", desc, s);
-		return 0;
-	}
-	return 1;
-}
-
-static int
-spliturl(char *url, SplitUrl *su)
-{
-	Resub m[MaxResub];
-	Retab *t;
-
-	/*
-	 * Newlines are not valid in a URI, but regexp(2) treats them specially 
-	 * so it's best to make sure there are none before proceeding.
-	 */
-	if(strchr(url, '\n')){
-		werrstr("newline in URI");
-		return -1;
-	}
-
-	m[0].sp = m[0].ep = nil;
-	t = &retab[REsplit];
-	if(!regx(t->prog, url, m, t->size)){
-		werrstr("malformed URI: %q", url);
-		return -1;
-	}
-
-	su->url.s = m[0].sp;
-	su->url.e = m[0].ep;
-	su->scheme.s = m[t->ind[0]].sp;
-	su->scheme.e = m[t->ind[0]].ep;
-	su->authority.s = m[t->ind[1]].sp;
-	su->authority.e = m[t->ind[1]].ep;
-	su->path.s = m[t->ind[2]].sp;
-	su->path.e = m[t->ind[2]].ep;
-	su->query.s = m[t->ind[3]].sp;
-	su->query.e = m[t->ind[3]].ep;
-	su->fragment.s = m[t->ind[4]].sp;
-	su->fragment.e = m[t->ind[4]].ep;
-
-	if(urldebug)
-		fprint(2, "split url %s into %.*q %.*q %.*q %.*q %.*q %.*q\n",
-			url,
-			su->url.s ? utfnlen(su->url.s, su->url.e-su->url.s) : 10, su->url.s ? su->url.s : "",
-			su->scheme.s ? utfnlen(su->scheme.s, su->scheme.e-su->scheme.s) : 10, su->scheme.s ? su->scheme.s : "",
-			su->authority.s ? utfnlen(su->authority.s, su->authority.e-su->authority.s) : 10, su->authority.s ? su->authority.s : "",
-			su->path.s ? utfnlen(su->path.s, su->path.e-su->path.s) : 10, su->path.s ? su->path.s : "",
-			su->query.s ? utfnlen(su->query.s, su->query.e-su->query.s) : 10, su->query.s ? su->query.s : "",
-			su->fragment.s ? utfnlen(su->fragment.s, su->fragment.e-su->fragment.s) : 10, su->fragment.s ? su->fragment.s : "");
+	char *s;
+	Url *u;
 
+	if((u = va_arg(f->args, Url*)) == nil)
+		return fmtprint(f, "nil");
+	if(u->scheme)
+		fmtprint(f, "%s:", u->scheme);
+	if(u->user || u->host)
+		fmtprint(f, "//");
+	if(u->user){
+		fmtprint(f, "%E", (Str2){u->user, ""});
+		if(u->pass)
+			fmtprint(f, ":%E", (Str2){u->pass, ""});
+		fmtprint(f, "@");
+	}
+	if(u->host){
+		fmtprint(f, strchr(u->host, ':') ? "[%s]" : "%s", u->host);
+		if(u->port)
+			fmtprint(f, ":%s", u->port);
+	}
+	if(s = Upath(u))
+		fmtprint(f, "%E", (Str2){s, "/:@"});
+	if(u->query)
+		fmtprint(f, "?%E", (Str2){u->query, "/:@"});
+	if(u->fragment)
+		fmtprint(f, "#%E", (Str2){u->fragment, "/:@?"});
 	return 0;
 }
 
-static int
-parse_scheme(SplitUrl *su, Url *u)
+char*
+Upath(Url *u)
 {
-	if(su->scheme.s == nil){
-		werrstr("missing scheme");
-		return -1;
+	if(u){
+		if(u->path)
+			return u->path;
+		if(u->user || u->host)
+			return "/";
 	}
-	u->scheme = estredup(su->scheme.s, su->scheme.e);
-	strlower(u->scheme);
-
-	if(!ismatch(REscheme, u->scheme, "scheme"))
-		return -1;
-
-	u->ischeme = ischeme(u->scheme);
-	if(urldebug)
-		fprint(2, "parse_scheme %s => %d\n", u->scheme, u->ischeme);
-	return 0;
+	return nil;
 }
 
-static int
-parse_unknown_part(SplitUrl *su, Url *u)
-{
-	char *s, *e;
-
-	assert(u->ischeme == USunknown);
-	assert(su->scheme.e[0] == ':');
-
-	s = su->scheme.e+1;
-	if(su->fragment.s){
-		e = su->fragment.s-1;
-		assert(*e == '#');
-	}else
-		e = s+strlen(s);
-
-	u->schemedata = estredup(s, e);
-	return 0;
+static char*	/* normalize path s in place: drops "." segments, resolves ".." and collapses runs of '/'; returns s */
+remdot(char *s)
+{
+	char *b, *d, *p;	/* b: start of buffer, d: write position, p: start of next segment or nil */
+	int dir, n;	/* dir: at loop exit, nonzero only when the final segment was "." or ".." */
+
+	dir = 1;
+	b = d = s;
+	while(*s == '/')	/* skip leading slashes; output is rebuilt from b */
+		s++;
+	for(; s; s = p){	/* one segment per iteration; p is nil after the last segment */
+		if(p = strchr(s, '/'))
+			while(*p == '/')	/* terminate this segment and swallow consecutive slashes */
+				*p++ = 0;
+		if(*s == '.' && ((s[1] == 0) || (s[1] == '.' && s[2] == 0))){	/* segment is "." or ".." */
+			if(s[1] == '.')
+				while(d > b)	/* "..": back up to the previous '/', never before b */
+					if(*--d == '/')
+						break;
+			dir = 1;
+			continue;
+		} else
+			dir = (p != nil);
+		n = strlen(s);
+		memmove(d+1, s, n);	/* memmove: source and destination lie in the same buffer and may overlap */
+		*d = '/';
+		d += n+1;
+	}
+	if(dir)
+		*d++ = '/';	/* result of "." or ".." names a directory, so end it with '/' */
+	*d = 0;
+	return b;
 }
 
-static int
-parse_userinfo(char *s, char *e, Url *u)
+static char*	/* resolve path s against base path b; returns a new string (url() frees it) or nil if neither yields a path */
+abspath(char *s, char *b)
 {
-	Resub m[MaxResub];
-	Retab *t;
+	char *x, *a;	/* x: last '/' in b, a: buffer for the merged path */
 
-	m[0].sp = s;
-	m[0].ep = e;
-	t = &retab[REuserinfo];
-	if(!regx(t->prog, nil, m, t->size)){
-		werrstr("malformed userinfo: %.*q", utfnlen(s, e-s), s);
-		return -1;
+	if(b && *b){
+		if(s == nil || *s == 0)	/* empty reference: the base path is used as is */
+			return estrdup(b);
+		if(*s != '/' && (x = strrchr(b, '/'))){	/* relative s: replace the last segment of b */
+			a = emalloc((x - b) + strlen(s) + 4);	/* +4: two added '/', a possible trailing '/' from remdot, NUL */
+			sprint(a, "/%.*s/%s", utfnlen(b, x - b), b, s);	/* precision counts runes, not bytes, so convert the byte length */
+			return remdot(a);
+		}
 	}
-	if(m[t->ind[0]].sp)
-		u->user = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
-	if(m[t->ind[1]].sp)
-		u->user = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
-	return 0;
-}
-
-static int
-parse_host(char *s, char *e, Url *u)
-{
-	Resub m[MaxResub];
-	Retab *t;
-
-	m[0].sp = s;
-	m[0].ep = e;
-	t = &retab[REhost];
-	if(!regx(t->prog, nil, m, t->size)){
-		werrstr("malformed host: %.*q", utfnlen(s, e-s), s);
-		return -1;
+	if(s && *s){
+		if(*s != '/')	/* relative s with no usable base: returned unchanged */
+			return estrdup(s);
+		a = emalloc(strlen(s) + 4);
+		sprint(a, "/%s", s);	/* absolute s: only normalized by remdot */
+		return remdot(a);
 	}
-
-	assert(m[t->ind[0]].sp || m[t->ind[1]].sp);
-
-	if(m[t->ind[0]].sp)	/* regular */
-		u->host = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
-	else
-		u->host = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
-	return 0;
+	return nil;
 }
 
-static int
-parse_authority(SplitUrl *su, Url *u)
+static void
+pstrdup(char **p)
 {
-	Resub m[MaxResub];
-	Retab *t;
-	char *host;
-	char *userinfo;
-
-	if(su->authority.s == nil)
-		return 0;
-
-	u->authority = estredup(su->authority.s, su->authority.e);
-	m[0].sp = m[0].ep = nil;
-	t = &retab[REauthority];
-	if(!regx(t->prog, u->authority, m, t->size)){
-		werrstr("malformed authority: %q", u->authority);
-		return -1;
-	}
-
-	if(m[t->ind[0]].sp)
-		if(parse_userinfo(m[t->ind[0]].sp, m[t->ind[0]].ep, u) < 0)
-			return -1;
-	if(m[t->ind[1]].sp)
-		if(parse_host(m[t->ind[1]].sp, m[t->ind[1]].ep, u) < 0)
-			return -1;
-	if(m[t->ind[2]].sp)
-		u->port = estredup(m[t->ind[2]].sp, m[t->ind[2]].ep);
-
-
-	if(urldebug > 0){
-		userinfo = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep); 
-		host = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
-		fprint(2, "port: %q, authority %q\n", u->port, u->authority);
-		fprint(2, "host %q, userinfo %q\n", host, userinfo);
-		free(host);
-		free(userinfo);
+	if(p == nil || *p == nil)
+		return;
+	if(**p == 0){
+		*p = nil;
+		return;
 	}
-	return 0;
+	*p = estrdup(*p);
 }
 
-static int
-parse_abspath(SplitUrl *su, Url *u)
-{
-	char *s;
-
-	if(su->path.s == nil)
-		return 0;
-	s = estredup(su->path.s, su->path.e);
-	u->path = unescapeurl(s, "/");
-	free(s);
-	return 0;
-}
-
-static int
-parse_query(SplitUrl *su, Url *u)
-{
-	char *s;
-
-	if(su->query.s == nil)
-		return 0;
-	s = estredup(su->query.s, su->query.e);
-	u->query = unescapeurl(s, "&;=/");
-	free(s);
-	return 0;
-}
-
-static int
-parse_fragment(SplitUrl *su, Url *u)
-{
-	char *s;
-
-	if(su->fragment.s == nil)
-		return 0;
-	s = estredup(su->fragment.s, su->fragment.e);
-	u->fragment = unescapeurl(s, "");
-	free(s);
-	return 0;
-}
-
-static int
-postparse_http(Url *u)
+static char*
+mklowcase(char *s)
 {
-	char *p, *q;
-
-	u->open = httpopen;
-	u->read = httpread;
-	u->close = httpclose;
+	char *p;
 
-	if(u->authority==nil){
-		werrstr("missing authority (hostname, port, etc.)");
-		return -1;
-	}
-	if(u->host == nil){
-		werrstr("missing host specification");
-		return -1;
-	}
-
-	if(u->path == nil){
-		u->http.page_spec = estrdup("/");
-		return 0;
-	}
-	p = escapeurl(u->path, "/");
-	if(u->query){
-		q = escapeurl(u->query, "&;=/");
-		u->http.page_spec = emalloc(strlen(p)+1+strlen(q)+1);
-		strcpy(u->http.page_spec, p);
-		strcat(u->http.page_spec, "?");
-		strcat(u->http.page_spec, q);
-		free(q);
-		free(p);
-	}else
-		u->http.page_spec = p;
-	return 0;
+	if(s == nil)
+		return s;
+	for(p = s; *p; p++)
+		*p = tolower(*p);
+	return s;
 }
 
-static int
-postparse_ftp(Url *u)
+Url*
+url(char *s, Url *b)
 {
-	Resub m[MaxResub];
-	Retab *t;
+	char *t, *p, *x, *y;
+	Url *u;
 
-	if(u->authority==nil){
-		werrstr("missing authority (hostname, port, etc.)");
-		return -1;
-	}
-	if(u->query){
-		werrstr("unexpected \"?query\" in ftp path");
-		return -1;
+	if(s == nil)
+		s = "";
+	t = nil;
+	s = p = estrdup(s);
+	u = emalloc(sizeof(*u));
+	for(; *p; p++){
+		if(*p == ':'){
+			if(p == s)
+				break;
+			*p++ = 0;
+			u->scheme = s;
+			b = nil;
+			goto Abs;
+		}
+		if(!isalpha(*p))
+			if((p == s) || ((!isdigit(*p) && strchr("+-.", *p) == nil)))
+				break;
+	}
+	p = s;
+	if(b){
+		switch(*p){
+		case 0:
+			memmove(u, b, sizeof(*u));
+			goto Out;
+		case '#':
+			memmove(u, b, sizeof(*u));
+			u->fragment = p+1;
+			goto Out;
+		case '?':
+			memmove(u, b, sizeof(*u));
+			u->fragment = u->query = nil;
+			break;
+		case '/':
+			if(p[1] == '/'){
+				u->scheme = b->scheme;
+				b = nil;
+				break;
+			}
+		default:
+			memmove(u, b, sizeof(*u));
+			u->fragment = u->query = u->path = nil;
+			break;
+		}
 	}
-	if(u->host == nil){
-		werrstr("missing host specification");
-		return -1;
+Abs:
+	if(x = strchr(p, '#')){
+		*x = 0;
+		u->fragment = x+1;
 	}
-
-	if(u->path == nil){
-		u->ftp.path_spec = estrdup("/");
-		return 0;
+	if(x = strchr(p, '?')){
+		*x = 0;
+		u->query = x+1;
 	}
-
-	m[0].sp = m[0].ep = nil;
-	t = &retab[REftppath];
-	if(!regx(t->prog, u->path, m, t->size)){
-		werrstr("malformed ftp path: %q", u->path);
-		return -1;
-	}
-
-	if(m[t->ind[0]].sp){
-		u->ftp.path_spec = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
-		if(strchr(u->ftp.path_spec, ';')){
-			werrstr("unexpected \";param\" in ftp path");
-			return -1;
+	if(p[0] == '/' && p[1] == '/'){
+		p += 2;
+		if(x = strchr(p, '/')){
+			u->path = t = abspath(x, Upath(b));
+			*x = 0;
 		}
-	}else
-		u->ftp.path_spec = estrdup("/");
-
-	if(m[t->ind[1]].sp){
-		u->ftp.type = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
-		strlower(u->ftp.type);
-	}
-	return 0;
-}
+		if(x = strchr(p, '@')){
+			*x = 0;
+			if(y = strchr(p, ':')){
+				*y = 0;
+				u->pass = y+1;
+			}
+			u->user = p;
+			p = x+1;
+		}
+		if((x = strrchr(p, ']')) == nil)
+			x = p;
+		if(x = strrchr(x, ':')){
+			*x = 0;
+			u->port = x+1;
+		}
+		if(x = strchr(p, '[')){
+			p = x+1;
+			if(y = strchr(p, ']'))
+				*y = 0;
+		}
+		u->host = p;
+	} else {
+		u->path = t = abspath(p, Upath(b));
+	}
+Out:
+	pstrdup(&u->scheme);
+	pstrdup(&u->user);
+	pstrdup(&u->pass);
+	pstrdup(&u->host);
+	pstrdup(&u->port);
+	pstrdup(&u->path);
+	pstrdup(&u->query);
+	pstrdup(&u->fragment);
+	free(s);
+	free(t);
 
-static int
-postparse_file(Url *u)
-{
-	if(u->user || u->passwd){
-		werrstr("user information not valid with file scheme");
-		return -1;
-	}
-	if(u->query){
-		werrstr("unexpected \"?query\" in file path");
-		return -1;
-	}
-	if(u->port){
-		werrstr("port not valid with file scheme");
-		return -1;
-	}
-	if(u->path == nil){
-		werrstr("missing path in file scheme");
-		return -1;
-	}
-	if(strchr(u->path, ';')){
-		werrstr("unexpected \";param\" in file path");
-		return -1;
-	}
+	unescape(u->user, "");
+	unescape(u->pass, "");
+	unescape(u->path, "/");
+	unescape(u->query, "&;=/?#");
+	unescape(u->fragment, "");
+	mklowcase(u->scheme);
+	mklowcase(u->host);
+	mklowcase(u->port);
 
-	/* "localhost" is equivalent to no host spec, we'll chose the latter */
-	if(u->host && cistrcmp(u->host, "localhost") == 0){
-		free(u->host);
-		u->host = nil;
-	}
-	return 0;
+	return u;
 }
 
-static int (*postparse[])(Url*) = {
-	nil,
-	postparse_http,
-	postparse_http,
-	postparse_ftp,
-	postparse_file,
-};
-
 Url*
-parseurl(char *url, Url *base)
+saneurl(Url *u)
 {
-	Url *u;
-	SplitUrl su;
-
-	if(urldebug)
-		fprint(2, "parseurl %s with base %s\n", url, base ? base->url : "<none>");
-
-	u = emalloc(sizeof(Url));
-	u->url = estrdup(url);
-	if(spliturl(u->url, &su) < 0){
-	Fail:
+	if(u == nil || u->scheme == nil || u->host == nil || Upath(u) == nil){
 		freeurl(u);
 		return nil;
 	}
-
-	/* RFC2396 sec 3.1 says relative URIs are distinguished by absent scheme */ 
-	if(su.scheme.s==nil){
-		if(urldebug)
-			fprint(2, "parseurl has nil scheme\n");
-		if(resolve_relative(&su, base, u) < 0 || spliturl(u->url, &su) < 0)
-			goto Fail;
-		if(u->ischeme == UScurrent){
-			/* 'u.url' refers to current document; set fragment and return */
-			if(parse_fragment(&su, u) < 0)
-				goto Fail;
-			goto Done;
+	if(u->port){
+		/* remove default ports */
+		switch(atoi(u->port)){
+		case 21:	if(!strcmp(u->scheme, "ftp"))	goto Defport; break;
+		case 70:	if(!strcmp(u->scheme, "gopher"))goto Defport; break;
+		case 80:	if(!strcmp(u->scheme, "http"))	goto Defport; break;
+		case 443:	if(!strcmp(u->scheme, "https"))	goto Defport; break;
+		default:	if(!strcmp(u->scheme, u->port))	goto Defport; break;
+		Defport:
+			free(u->port);
+			u->port = nil;
 		}
 	}
+	return u;
+}
 
-	if(parse_scheme(&su, u) < 0
-	|| parse_fragment(&su, u) < 0)
-		goto Fail;
+int
+matchurl(Url *u, Url *s)
+{
+	if(u){
+		char *a, *b;
 
-	if(u->ischeme == USunknown){
-		if(parse_unknown_part(&su, u) < 0)
-			goto Fail;
-		goto Done;
+		if(s == nil)
+			return 0;
+		if(u->scheme && (s->scheme == nil || strcmp(u->scheme, s->scheme)))
+			return 0;
+		if(u->user && (s->user == nil || strcmp(u->user, s->user)))
+			return 0;
+		if(u->host && (s->host == nil || strcmp(u->host, s->host)))
+			return 0;
+		if(u->port && (s->port == nil || strcmp(u->port, s->port)))
+			return 0;
+		if(a = Upath(u)){
+			b = Upath(s);
+			if(b == nil || strncmp(a, b, strlen(a)))
+				return 0;
+		}
 	}
-
-	if(parse_query(&su, u) < 0
-	|| parse_authority(&su, u) < 0
-	|| parse_abspath(&su, u) < 0)
-		goto Fail;
-
-	if(u->ischeme < nelem(postparse) && postparse[u->ischeme])
-		if((*postparse[u->ischeme])(u) < 0)
-			goto Fail;
-
-Done:
-	setmalloctag(u, getcallerpc(&url));
-	rewriteurl(u);
-	return u;
+	return 1;
 }
 
 void
@@ -873,162 +362,13 @@ freeurl(Url *u)
 {
 	if(u == nil)
 		return;
-	free(u->url);
 	free(u->scheme);
-	free(u->schemedata);
-	free(u->authority);
 	free(u->user);
-	free(u->passwd);
+	free(u->pass);
 	free(u->host);
 	free(u->port);
 	free(u->path);
 	free(u->query);
 	free(u->fragment);
-	switch(u->ischeme){
-	case UShttp:
-	case UShttps:
-		free(u->http.page_spec);
-		break;
-	case USftp:
-		free(u->ftp.path_spec);
-		free(u->ftp.type);
-		break;
-	}
 	free(u);
 }
-
-void
-rewriteurl(Url *u)
-{
-	char *s;
-
-	if(u->scheme == nil)
-		return;
-	if(u->schemedata)
-		s = estrmanydup(u->scheme, ":", u->schemedata, nil);
-	else
-		s = estrmanydup(u->scheme, "://", 
-			u->user ? u->user : "",
-			u->passwd ? ":" : "", u->passwd ? u->passwd : "",
-			u->user ? "@" : "", u->host ? u->host : "", 
-			u->port ? ":" : "", u->port ? u->port : "",
-			u->path ? u->path : "",
-			u->query ? "?" : "", u->query ? u->query : "",
-			u->fragment ? "#" : "", u->fragment ? u->fragment : "",
-			nil);
-	free(u->url);
-	u->url = s;
-}
-
-int
-seturlquery(Url *u, char *query)
-{
-	if(query == nil){
-		free(u->query);
-		u->query = nil;
-		return 0;
-	}
-	free(u->query);
-	u->query = unescapeurl(query, "&;=/");
-	return 0;
-}
-
-static void
-dupp(char **p)
-{
-	if(*p)
-		*p = estrdup(*p);
-}
-
-Url*
-copyurl(Url *u)
-{
-	Url *v;
-
-	v = emalloc(sizeof(Url));
-	*v = *u;
-	dupp(&v->url);
-	dupp(&v->scheme);
-	dupp(&v->schemedata);
-	dupp(&v->authority);
-	dupp(&v->user);
-	dupp(&v->passwd);
-	dupp(&v->host);
-	dupp(&v->port);
-	dupp(&v->path);
-	dupp(&v->query);
-	dupp(&v->fragment);
-
-	switch(v->ischeme){
-	case UShttp:
-	case UShttps:
-		dupp(&v->http.page_spec);
-		break;
-	case USftp:
-		dupp(&v->ftp.path_spec);
-		dupp(&v->ftp.type);
-		break;
-	}
-	return v;
-}
-
-static int
-dhex(char c)
-{
-	if('0' <= c && c <= '9')
-		return c-'0';
-	if('a' <= c && c <= 'f')
-		return c-'a'+10;
-	if('A' <= c && c <= 'F')
-		return c-'A'+10;
-	return 0;
-}
-
-char*
-escapeurl(char *s, char *special)
-{
-	static char *hex = "0123456789abcdef";
-	char *t, *u;
-
-	t = u = emalloc(strlen(s)*3+1);
-	for(; *s; s++){
-		if((s[0] == '%' && isxdigit(s[1]) && isxdigit(s[2])) ||
-			(*s >= '0' && *s <= '9') || 
-			(*s >= 'a' && *s <= 'z') ||
-			(*s >= 'A' && *s <= 'Z') || 
-			strchr(".-_~", *s) || strchr(special, *s))
-			*u++ = *s;
-		else if(s[0] == ' ')
-			*u++ = '+';
-		else {
-			*u++ = '%';
-			*u++ = hex[(*s>>4)&0xF];
-			*u++ = hex[*s&0xF];
-		}
-	}
-	*u = '\0';
-	return t;
-}
-
-char*
-unescapeurl(char *s, char *special)
-{
-	char *r, *w;
-	Rune x;
-
-	s = estrdup(s);
-	for(r=w=s; x = *r; r++){
-		if(x=='%' && isxdigit(r[1]) && isxdigit(r[2])){
-			x = (dhex(r[1])<<4)|dhex(r[2]);
-			if(x == 0 || (x > 0x1F && x < 0x7F && strchr(special, x)))
-				x = *r;
-			else
-				r += 2;
-		} else if(x=='+')
-			x = ' ';
-		*w++ = x;
-	}
-	*w = '\0';
-	return s;
-}
-
author	cinap_lenrek <cinap_lenrek@rei2.9hal>	2012-01-11 16:17:54 +0100
committer	cinap_lenrek <cinap_lenrek@rei2.9hal>	2012-01-11 16:17:54 +0100
commit	75e1ef0ab60acb6bccc54254b82770aec5786ead (patch)
tree	d273fc755a20e67801aa0a13df30ab75b2883419 /sys/src/cmd/webfs/url.c
parent	62fb4f97177d8e76f1fd49bb9d0073007b7c9bcc (diff)