summary refs log tree commit diff
path: root/sys/src/cmd/webfs/url.c
diff options
context:
space:
mode:
author cinap_lenrek <cinap_lenrek@rei2.9hal> 2012-01-11 16:17:54 +0100
committer cinap_lenrek <cinap_lenrek@rei2.9hal> 2012-01-11 16:17:54 +0100
commit 75e1ef0ab60acb6bccc54254b82770aec5786ead (patch)
tree d273fc755a20e67801aa0a13df30ab75b2883419 /sys/src/cmd/webfs/url.c
parent 62fb4f97177d8e76f1fd49bb9d0073007b7c9bcc (diff)
new webfs, rc based hget
Diffstat (limited to 'sys/src/cmd/webfs/url.c')
-rw-r--r-- sys/src/cmd/webfs/url.c 1236
1 files changed, 288 insertions, 948 deletions
diff --git a/sys/src/cmd/webfs/url.c b/sys/src/cmd/webfs/url.c
index c6c5695f0..2137010d1 100644
--- a/sys/src/cmd/webfs/url.c
+++ b/sys/src/cmd/webfs/url.c
@@ -1,871 +1,360 @@
-/*
- * This is a URL parser, written to parse "Common Internet Scheme" URL
- * syntax as described in RFC1738 and updated by RFC2396. Only absolute URLs
- * are supported, using "server-based" naming authorities in the schemes.
- * Support for literal IPv6 addresses is included, per RFC2732.
- *
- * Current "known" schemes: http, ftp, file.
- *
- * We can do all the parsing operations without Runes since URLs are
- * defined to be composed of US-ASCII printable characters.
- * See RFC1738, RFC2396.
- */
-
#include <u.h>
#include <libc.h>
#include <ctype.h>
-#include <regexp.h>
-#include <plumb.h>
-#include <thread.h>
#include <fcall.h>
+#include <thread.h>
#include <9p.h>
+
#include "dat.h"
#include "fns.h"
-int urldebug;
-
-/* If set, relative paths with leading ".." segments will have them trimmed */
-#define RemoveExtraRelDotDots 0
-#define ExpandCurrentDocUrls 1
-
-static char*
-schemestrtab[] =
-{
- nil,
- "http",
- "https",
- "ftp",
- "file",
-};
-
static int
-ischeme(char *s)
-{
- int i;
-
- for(i=0; i<nelem(schemestrtab); i++)
- if(schemestrtab[i] && strcmp(s, schemestrtab[i])==0)
- return i;
- return USunknown;
-}
-
-/*
- * URI splitting regexp is from RFC2396, Appendix B:
- * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
- * 12 3 4 5 6 7 8 9
- *
- * Example: "http://www.ics.uci.edu/pub/ietf/uri/#Related"
- * $2 = scheme "http"
- * $4 = authority "www.ics.uci.edu"
- * $5 = path "/pub/ietf/uri/"
- * $7 = query <undefined>
- * $9 = fragment "Related"
- */
-
-/*
- * RFC2396, Sec 3.1, contains:
- *
- * Scheme names consist of a sequence of characters beginning with a
- * lower case letter and followed by any combination of lower case
- * letters, digits, plus ("+"), period ("."), or hyphen ("-"). For
- * resiliency, programs interpreting URI should treat upper case letters
- * as equivalent to lower case in scheme names (e.g., allow "HTTP" as
- * well as "http").
- */
-
-/*
- * For server-based naming authorities (RFC2396 Sec 3.2.2):
- * server = [ [ userinfo "@" ] hostport ]
- * userinfo = *( unreserved | escaped |
- * ";" | ":" | "&" | "=" | "+" | "$" | "," )
- * hostport = host [ ":" port ]
- * host = hostname | IPv4address
- * hostname = *( domainlabel "." ) toplabel [ "." ]
- * domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum
- * toplabel = alpha | alpha *( alphanum | "-" ) alphanum
- * IPv4address = 1*digit "." 1*digit "." 1*digit "." 1*digit
- * port = *digit
- *
- * The host is a domain name of a network host, or its IPv4 address as a
- * set of four decimal digit groups separated by ".". Literal IPv6
- * addresses are not supported.
- *
- * Note that literal IPv6 address support is outlined in RFC2732:
- * host = hostname | IPv4address | IPv6reference
- * ipv6reference = "[" IPv6address "]" (RFC2373)
- *
- * Since hostnames and numbers will have to be resolved by the OS anyway,
- * we don't have to parse them too pedantically (counting '.'s, checking
- * for well-formed literal IP addresses, etc.).
- *
- * In FTP/file paths, we reject most ";param"s and querys. In HTTP paths,
- * we just pass them through.
- *
- * Instead of letting a "path" be 0-or-more characters as RFC2396 suggests,
- * we'll say it's 1-or-more characters, 0-or-1 times. This way, an absent
- * path yields a nil substring match, instead of an empty one.
- *
- * We're more restrictive than RFC2396 indicates with "userinfo" strings,
- * insisting they have the form "[user[:password]]". This may need to
- * change at some point, however.
- */
-
-/* RE character-class components -- these go in brackets */
-#define PUNCT "\\-_.!~*'()"
-#define ALNUM "a-zA-Z0-9"
-#define HEX "0-9a-fA-F"
-#define UNRES ALNUM PUNCT
-
-/* RE components; _N => has N parenthesized subexpressions when expanded */
-#define USERINFO_2 "([" UNRES ";:&=+$,]|(%[" HEX "][" HEX "]))"
-
-typedef struct Retab Retab;
-struct Retab
-{
- char *str;
- Reprog *prog;
- int size;
- int ind[5];
-};
-
-enum
-{
- REsplit = 0,
- REscheme,
- REauthority,
- REhost,
- REuserinfo,
- REftppath,
-
- MaxResub= 20,
-};
-
-Retab retab[] = /* view in constant width Font */
-{
-[REsplit]
- "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]+)?(\\?([^#]*))?(#(.*))?$", nil, 0,
- /* |-scheme-| |-auth.-| |path--| |query| |--|frag */
- { 2, 4, 5, 7, 9},
-
-[REscheme]
- "^[a-z][a-z0-9+-.]*$", nil, 0,
- { 0, },
-
-[REauthority]
- "^(((" USERINFO_2 "*)@)?(((\\[[^\\]@]+\\])|([^:\\[@]+))(:([0-9]*))?)?)?$", nil, 0,
- /* |----user info-----| |--------host----------------| |-port-| */
- { 3, 7, 11, },
-
-[REhost]
- "^(([a-zA-Z0-9\\-.]+)|(\\[([a-fA-F0-9.:]+)\\]))$", nil, 0,
- /* |--regular host--| |-IPv6 literal-| */
- { 2, 4, },
-
-[REuserinfo]
- "^(([^:]*)(:([^:]*))?)$", nil, 0,
- /* |user-| |pass-| */
- { 2, 4, },
-
-[REftppath]
- "^(.+)(;[tT][yY][pP][eE]=([aAiIdD]))?$", nil, 0,
- /*|--|-path |ftptype-| */
- { 1, 3, },
-};
-
-static int
-countleftparen(char *s)
-{
- int n;
-
- n = 0;
- for(; *s; s++)
- if(*s == '(')
- n++;
- return n;
-}
-
-void
-initurl(void)
+dhex(char c)
{
- int i, j;
-
- for(i=0; i<nelem(retab); i++){
- retab[i].prog = regcomp(retab[i].str);
- if(retab[i].prog == nil)
- sysfatal("recomp(%s): %r", retab[i].str);
- retab[i].size = countleftparen(retab[i].str)+1;
- for(j=0; j<nelem(retab[i].ind); j++)
- if(retab[i].ind[j] >= retab[i].size)
- sysfatal("bad index in regexp table: retab[%d].ind[%d] = %d >= %d",
- i, j, retab[i].ind[j], retab[i].size);
- if(MaxResub < retab[i].size)
- sysfatal("MaxResub too small: %d < %d", MaxResub, retab[i].size);
- }
+ if('0' <= c && c <= '9')
+ return c-'0';
+ if('a' <= c && c <= 'f')
+ return c-'a'+10;
+ if('A' <= c && c <= 'F')
+ return c-'A'+10;
+ return 0;
}
-typedef struct SplitUrl SplitUrl;
-struct SplitUrl
-{
- struct {
- char *s;
- char *e;
- } url, scheme, authority, path, query, fragment;
-};
-
-/*
- * Implements the algorithm in RFC2396 sec 5.2 step 6.
- * Returns number of chars written, excluding NUL terminator.
- * dest is known to be >= strlen(base)+rel_len.
- */
-static void
-merge_relative_path(char *base, char *rel_st, int rel_len, char *dest)
+static char*
+unescape(char *s, char *spec)
{
- char *s, *p, *e, *pdest;
-
- pdest = dest;
-
- /* 6a: start with base, discard last segment */
- if(base && base[0]){
- /* Empty paths don't match in our scheme; 'base' should be nil */
- assert(base[0] == '/');
- e = strrchr(base, '/');
- e++;
- memmove(pdest, base, e-base);
- pdest += e-base;
- }else{
- /* Artistic license on my part */
- *pdest++ = '/';
- }
-
- /* 6b: append relative component */
- if(rel_st){
- memmove(pdest, rel_st, rel_len);
- pdest += rel_len;
- }
-
- /* 6c: remove any occurrences of "./" as a complete segment */
- s = dest;
- *pdest = '\0';
- while(e = strstr(s, "./")){
- if((e == dest) || (*(e-1) == '/')){
- memmove(e, e+2, pdest+1-(e+2)); /* +1 for NUL */
- pdest -= 2;
- }else
- s = e+1;
- }
-
- /* 6d: remove a trailing "." as a complete segment */
- if(pdest>dest && *(pdest-1)=='.' &&
- (pdest==dest+1 || *(pdest-2)=='/'))
- *--pdest = '\0';
-
- /* 6e: remove occurences of "seg/../", where seg != "..", left->right */
- s = dest+1;
- while(e = strstr(s, "/../")){
- p = e - 1;
- while(p >= dest && *p != '/')
- p--;
- if(memcmp(p, "/../", 4) != 0){
- memmove(p+1, e+4, pdest+1-(e+4));
- pdest -= (e+4) - (p+1);
- }else
- s = e+1;
- }
-
- /* 6f: remove a trailing "seg/..", where seg isn't ".." */
- if(pdest-3 > dest && memcmp(pdest-3, "/..", 3)==0){
- p = pdest-3 - 1;
- while(p >= dest && *p != '/')
- p--;
- if(memcmp(p, "/../", 4) != 0){
- pdest = p+1;
- *pdest = '\0';
- }
- }
+ char *r, *w;
+ uchar x;
- /* 6g: leading ".." segments are errors -- we'll just blat them out. */
- if(RemoveExtraRelDotDots){
- p = dest;
- if (p[0] == '/')
- p++;
- s = p;
- while(s[0]=='.' && s[1]=='.' && (s[2]==0 || s[2]=='/'))
- s += 3;
- if(s > p){
- memmove(p, s, pdest+1-s);
- pdest -= s-p;
+ if(s == nil)
+ return s;
+ for(r=w=s; x = *r; r++){
+ if(x == '%' && isxdigit(r[1]) && isxdigit(r[2])){
+ x = (dhex(r[1])<<4)|dhex(r[2]);
+ if(x == 0 || (x > 0x1F && x < 0x7F && strchr(spec, x))){
+ *w++ = '%';
+ *w++ = toupper(r[1]);
+ *w++ = toupper(r[2]);
+ }
+ else
+ *w++ = x;
+ r += 2;
+ continue;
}
+ if(x == '+')
+ x = ' ';
+ *w++ = x;
}
- USED(pdest);
-
- if(urldebug)
- fprint(2, "merge_relative_path: '%s' + '%.*s' -> '%s'\n", base, rel_len,
- rel_st, dest);
+ *w = 0;
+ return s;
}
-/*
- * See RFC2396 sec 5.2 for info on resolving relative URIs to absolute form.
- *
- * If successful, this just ends up freeing and replacing "u->url".
- */
-static int
-resolve_relative(SplitUrl *su, Url *base, Url *u)
+int
+Efmt(Fmt *f)
{
- char *url, *path;
- char *purl, *ppath;
- int currentdoc, ulen, plen;
-
- if(base == nil){
- werrstr("relative URI given without base");
- return -1;
- }
- if(base->scheme == nil){
- werrstr("relative URI given with no scheme");
- return -1;
- }
- if(base->ischeme == USunknown){
- werrstr("relative URI given with unknown scheme");
- return -1;
- }
- if(base->ischeme == UScurrent){
- werrstr("relative URI given with incomplete base");
- return -1;
- }
- assert(su->scheme.s == nil);
-
- /* Sec 5.2 step 2 */
- currentdoc = 0;
- if(su->path.s==nil && su->scheme.s==nil && su->authority.s==nil && su->query.s==nil){
- /* Reference is to current document */
- if(urldebug)
- fprint(2, "url %s is relative to current document\n", u->url);
- u->ischeme = UScurrent;
- if(!ExpandCurrentDocUrls)
- return 0;
- currentdoc = 1;
- }
-
- /* Over-estimate the maximum lengths, for allocation purposes */
- /* (constants are for separators) */
- plen = 1;
- if(base->path)
- plen += strlen(base->path);
- if(su->path.s)
- plen += 1 + (su->path.e - su->path.s);
-
- ulen = 0;
- ulen += strlen(base->scheme) + 1;
- if(su->authority.s)
- ulen += 2 + (su->authority.e - su->authority.s);
- else
- ulen += 2 + ((base->authority) ? strlen(base->authority) : 0);
- ulen += plen;
- if(su->query.s)
- ulen += 1 + (su->query.e - su->query.s);
- else if(currentdoc && base->query)
- ulen += 1 + strlen(base->query);
- if(su->fragment.s)
- ulen += 1 + (su->fragment.e - su->fragment.s);
- else if(currentdoc && base->fragment)
- ulen += 1 + strlen(base->fragment);
- url = emalloc(ulen+1);
- path = emalloc(plen+1);
+ char *s, *spec;
+ Str2 s2;
- url[0] = '\0';
- purl = url;
- path[0] = '\0';
- ppath = path;
-
- if(su->authority.s || (su->path.s && (su->path.s[0] == '/'))){
- /* Is a "network-path" or "absolute-path"; don't merge with base path */
- /* Sec 5.2 steps 4,5 */
- if(su->path.s){
- memmove(ppath, su->path.s, su->path.e - su->path.s);
- ppath += su->path.e - su->path.s;
- *ppath = '\0';
- }
- }else if(currentdoc){
- /* Is a current-doc reference; just copy the path from the base URL */
- if(base->path){
- strcpy(ppath, base->path);
- ppath += strlen(ppath);
+ s2 = va_arg(f->args, Str2);
+ s = s2.s1;
+ spec = s2.s2;
+ for(; *s; s++)
+ if(*s == '%' && isxdigit(s[1]) && isxdigit(s[2])){
+ fmtprint(f, "%%%c%c", toupper(s[1]), toupper(s[2]));
+ s += 2;
}
- USED(ppath);
- }else{
- /* Is a relative-path reference; we have to merge it */
- /* Sec 5.2 step 6 */
- merge_relative_path(base->path,
- su->path.s, su->path.e - su->path.s, ppath);
- }
-
- /* Build new URL from pieces, inheriting from base where needed */
- strcpy(purl, base->scheme);
- purl += strlen(purl);
- *purl++ = ':';
- if(su->authority.s){
- strcpy(purl, "//");
- purl += strlen(purl);
- memmove(purl, su->authority.s, su->authority.e - su->authority.s);
- purl += su->authority.e - su->authority.s;
- }else if(base->authority){
- strcpy(purl, "//");
- purl += strlen(purl);
- strcpy(purl, base->authority);
- purl += strlen(purl);
- }
- assert((path[0] == '\0') || (path[0] == '/'));
- strcpy(purl, path);
- purl += strlen(purl);
-
- /*
- * The query and fragment are not inherited from the base,
- * except in case of "current document" URLs, which inherit any query
- * and may inherit the fragment.
- */
- if(su->query.s){
- *purl++ = '?';
- memmove(purl, su->query.s, su->query.e - su->query.s);
- purl += su->query.e - su->query.s;
- }else if(currentdoc && base->query){
- *purl++ = '?';
- strcpy(purl, base->query);
- purl += strlen(purl);
- }
-
- if(su->fragment.s){
- *purl++ = '#';
- memmove(purl, su->fragment.s, su->fragment.e - su->fragment.s);
- purl += su->fragment.e - su->fragment.s;
- }else if(currentdoc && base->fragment){
- *purl++ = '#';
- strcpy(purl, base->fragment);
- purl += strlen(purl);
- }
- USED(purl);
-
- if(urldebug)
- fprint(2, "resolve_relative: '%s' + '%s' -> '%s'\n", base->url, u->url, url);
- free(u->url);
- u->url = url;
- free(path);
+ else if(isalnum(*s) || strchr(".-_~!$&'()*,;=", *s) || strchr(spec, *s))
+ fmtprint(f, "%c", *s);
+ else
+ fmtprint(f, "%%%.2X", *s & 0xff);
return 0;
}
int
-regx(Reprog *prog, char *s, Resub *m, int nm)
-{
- int i;
-
- if(s == nil)
- s = m[0].sp; /* why is this necessary? */
-
- i = regexec(prog, s, m, nm);
-/*
- if(i >= 0)
- for(j=0; j<nm; j++)
- fprint(2, "match%d: %.*s\n", j, utfnlen(m[j].sp, m[j].ep-m[j].sp), m[j].sp);
-*/
- return i;
-}
-
-static int
-ismatch(int i, char *s, char *desc)
+Ufmt(Fmt *f)
{
- Resub m[1];
-
- m[0].sp = m[0].ep = nil;
- if(!regx(retab[i].prog, s, m, 1)){
- werrstr("malformed %s: %q", desc, s);
- return 0;
- }
- return 1;
-}
-
-static int
-spliturl(char *url, SplitUrl *su)
-{
- Resub m[MaxResub];
- Retab *t;
-
- /*
- * Newlines are not valid in a URI, but regexp(2) treats them specially
- * so it's best to make sure there are none before proceeding.
- */
- if(strchr(url, '\n')){
- werrstr("newline in URI");
- return -1;
- }
-
- m[0].sp = m[0].ep = nil;
- t = &retab[REsplit];
- if(!regx(t->prog, url, m, t->size)){
- werrstr("malformed URI: %q", url);
- return -1;
- }
-
- su->url.s = m[0].sp;
- su->url.e = m[0].ep;
- su->scheme.s = m[t->ind[0]].sp;
- su->scheme.e = m[t->ind[0]].ep;
- su->authority.s = m[t->ind[1]].sp;
- su->authority.e = m[t->ind[1]].ep;
- su->path.s = m[t->ind[2]].sp;
- su->path.e = m[t->ind[2]].ep;
- su->query.s = m[t->ind[3]].sp;
- su->query.e = m[t->ind[3]].ep;
- su->fragment.s = m[t->ind[4]].sp;
- su->fragment.e = m[t->ind[4]].ep;
-
- if(urldebug)
- fprint(2, "split url %s into %.*q %.*q %.*q %.*q %.*q %.*q\n",
- url,
- su->url.s ? utfnlen(su->url.s, su->url.e-su->url.s) : 10, su->url.s ? su->url.s : "",
- su->scheme.s ? utfnlen(su->scheme.s, su->scheme.e-su->scheme.s) : 10, su->scheme.s ? su->scheme.s : "",
- su->authority.s ? utfnlen(su->authority.s, su->authority.e-su->authority.s) : 10, su->authority.s ? su->authority.s : "",
- su->path.s ? utfnlen(su->path.s, su->path.e-su->path.s) : 10, su->path.s ? su->path.s : "",
- su->query.s ? utfnlen(su->query.s, su->query.e-su->query.s) : 10, su->query.s ? su->query.s : "",
- su->fragment.s ? utfnlen(su->fragment.s, su->fragment.e-su->fragment.s) : 10, su->fragment.s ? su->fragment.s : "");
+ char *s;
+ Url *u;
+ if((u = va_arg(f->args, Url*)) == nil)
+ return fmtprint(f, "nil");
+ if(u->scheme)
+ fmtprint(f, "%s:", u->scheme);
+ if(u->user || u->host)
+ fmtprint(f, "//");
+ if(u->user){
+ fmtprint(f, "%E", (Str2){u->user, ""});
+ if(u->pass)
+ fmtprint(f, ":%E", (Str2){u->pass, ""});
+ fmtprint(f, "@");
+ }
+ if(u->host){
+ fmtprint(f, strchr(u->host, ':') ? "[%s]" : "%s", u->host);
+ if(u->port)
+ fmtprint(f, ":%s", u->port);
+ }
+ if(s = Upath(u))
+ fmtprint(f, "%E", (Str2){s, "/:@"});
+ if(u->query)
+ fmtprint(f, "?%E", (Str2){u->query, "/:@"});
+ if(u->fragment)
+ fmtprint(f, "#%E", (Str2){u->fragment, "/:@?"});
return 0;
}
-static int
-parse_scheme(SplitUrl *su, Url *u)
+char*
+Upath(Url *u)
{
- if(su->scheme.s == nil){
- werrstr("missing scheme");
- return -1;
+ if(u){
+ if(u->path)
+ return u->path;
+ if(u->user || u->host)
+ return "/";
}
- u->scheme = estredup(su->scheme.s, su->scheme.e);
- strlower(u->scheme);
-
- if(!ismatch(REscheme, u->scheme, "scheme"))
- return -1;
-
- u->ischeme = ischeme(u->scheme);
- if(urldebug)
- fprint(2, "parse_scheme %s => %d\n", u->scheme, u->ischeme);
- return 0;
+ return nil;
}
-static int
-parse_unknown_part(SplitUrl *su, Url *u)
-{
- char *s, *e;
-
- assert(u->ischeme == USunknown);
- assert(su->scheme.e[0] == ':');
-
- s = su->scheme.e+1;
- if(su->fragment.s){
- e = su->fragment.s-1;
- assert(*e == '#');
- }else
- e = s+strlen(s);
-
- u->schemedata = estredup(s, e);
- return 0;
+static char*
+remdot(char *s)
+{
+ char *b, *d, *p;
+ int dir, n;
+
+ dir = 1;
+ b = d = s;
+ while(*s == '/')
+ s++;
+ for(; s; s = p){
+ if(p = strchr(s, '/'))
+ while(*p == '/')
+ *p++ = 0;
+ if(*s == '.' && ((s[1] == 0) || (s[1] == '.' && s[2] == 0))){
+ if(s[1] == '.')
+ while(d > b)
+ if(*--d == '/')
+ break;
+ dir = 1;
+ continue;
+ } else
+ dir = (p != nil);
+ n = strlen(s);
+ memmove(d+1, s, n);
+ *d = '/';
+ d += n+1;
+ }
+ if(dir)
+ *d++ = '/';
+ *d = 0;
+ return b;
}
-static int
-parse_userinfo(char *s, char *e, Url *u)
+static char*
+abspath(char *s, char *b)
{
- Resub m[MaxResub];
- Retab *t;
+ char *x, *a;
- m[0].sp = s;
- m[0].ep = e;
- t = &retab[REuserinfo];
- if(!regx(t->prog, nil, m, t->size)){
- werrstr("malformed userinfo: %.*q", utfnlen(s, e-s), s);
- return -1;
+ if(b && *b){
+ if(s == nil || *s == 0)
+ return estrdup(b);
+ if(*s != '/' && (x = strrchr(b, '/'))){
+ a = emalloc((x - b) + strlen(s) + 4);
+ sprint(a, "/%.*s/%s", (int)(x - b), b, s);
+ return remdot(a);
+ }
}
- if(m[t->ind[0]].sp)
- u->user = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
- if(m[t->ind[1]].sp)
- u->user = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
- return 0;
-}
-
-static int
-parse_host(char *s, char *e, Url *u)
-{
- Resub m[MaxResub];
- Retab *t;
-
- m[0].sp = s;
- m[0].ep = e;
- t = &retab[REhost];
- if(!regx(t->prog, nil, m, t->size)){
- werrstr("malformed host: %.*q", utfnlen(s, e-s), s);
- return -1;
+ if(s && *s){
+ if(*s != '/')
+ return estrdup(s);
+ a = emalloc(strlen(s) + 4);
+ sprint(a, "/%s", s);
+ return remdot(a);
}
-
- assert(m[t->ind[0]].sp || m[t->ind[1]].sp);
-
- if(m[t->ind[0]].sp) /* regular */
- u->host = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
- else
- u->host = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
- return 0;
+ return nil;
}
-static int
-parse_authority(SplitUrl *su, Url *u)
+static void
+pstrdup(char **p)
{
- Resub m[MaxResub];
- Retab *t;
- char *host;
- char *userinfo;
-
- if(su->authority.s == nil)
- return 0;
-
- u->authority = estredup(su->authority.s, su->authority.e);
- m[0].sp = m[0].ep = nil;
- t = &retab[REauthority];
- if(!regx(t->prog, u->authority, m, t->size)){
- werrstr("malformed authority: %q", u->authority);
- return -1;
- }
-
- if(m[t->ind[0]].sp)
- if(parse_userinfo(m[t->ind[0]].sp, m[t->ind[0]].ep, u) < 0)
- return -1;
- if(m[t->ind[1]].sp)
- if(parse_host(m[t->ind[1]].sp, m[t->ind[1]].ep, u) < 0)
- return -1;
- if(m[t->ind[2]].sp)
- u->port = estredup(m[t->ind[2]].sp, m[t->ind[2]].ep);
-
-
- if(urldebug > 0){
- userinfo = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
- host = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
- fprint(2, "port: %q, authority %q\n", u->port, u->authority);
- fprint(2, "host %q, userinfo %q\n", host, userinfo);
- free(host);
- free(userinfo);
+ if(p == nil || *p == nil)
+ return;
+ if(**p == 0){
+ *p = nil;
+ return;
}
- return 0;
+ *p = estrdup(*p);
}
-static int
-parse_abspath(SplitUrl *su, Url *u)
-{
- char *s;
-
- if(su->path.s == nil)
- return 0;
- s = estredup(su->path.s, su->path.e);
- u->path = unescapeurl(s, "/");
- free(s);
- return 0;
-}
-
-static int
-parse_query(SplitUrl *su, Url *u)
-{
- char *s;
-
- if(su->query.s == nil)
- return 0;
- s = estredup(su->query.s, su->query.e);
- u->query = unescapeurl(s, "&;=/");
- free(s);
- return 0;
-}
-
-static int
-parse_fragment(SplitUrl *su, Url *u)
-{
- char *s;
-
- if(su->fragment.s == nil)
- return 0;
- s = estredup(su->fragment.s, su->fragment.e);
- u->fragment = unescapeurl(s, "");
- free(s);
- return 0;
-}
-
-static int
-postparse_http(Url *u)
+static char*
+mklowcase(char *s)
{
- char *p, *q;
-
- u->open = httpopen;
- u->read = httpread;
- u->close = httpclose;
+ char *p;
- if(u->authority==nil){
- werrstr("missing authority (hostname, port, etc.)");
- return -1;
- }
- if(u->host == nil){
- werrstr("missing host specification");
- return -1;
- }
-
- if(u->path == nil){
- u->http.page_spec = estrdup("/");
- return 0;
- }
- p = escapeurl(u->path, "/");
- if(u->query){
- q = escapeurl(u->query, "&;=/");
- u->http.page_spec = emalloc(strlen(p)+1+strlen(q)+1);
- strcpy(u->http.page_spec, p);
- strcat(u->http.page_spec, "?");
- strcat(u->http.page_spec, q);
- free(q);
- free(p);
- }else
- u->http.page_spec = p;
- return 0;
+ if(s == nil)
+ return s;
+ for(p = s; *p; p++)
+ *p = tolower(*p);
+ return s;
}
-static int
-postparse_ftp(Url *u)
+Url*
+url(char *s, Url *b)
{
- Resub m[MaxResub];
- Retab *t;
+ char *t, *p, *x, *y;
+ Url *u;
- if(u->authority==nil){
- werrstr("missing authority (hostname, port, etc.)");
- return -1;
- }
- if(u->query){
- werrstr("unexpected \"?query\" in ftp path");
- return -1;
+ if(s == nil)
+ s = "";
+ t = nil;
+ s = p = estrdup(s);
+ u = emalloc(sizeof(*u));
+ for(; *p; p++){
+ if(*p == ':'){
+ if(p == s)
+ break;
+ *p++ = 0;
+ u->scheme = s;
+ b = nil;
+ goto Abs;
+ }
+ if(!isalpha(*p))
+ if((p == s) || ((!isdigit(*p) && strchr("+-.", *p) == nil)))
+ break;
+ }
+ p = s;
+ if(b){
+ switch(*p){
+ case 0:
+ memmove(u, b, sizeof(*u));
+ goto Out;
+ case '#':
+ memmove(u, b, sizeof(*u));
+ u->fragment = p+1;
+ goto Out;
+ case '?':
+ memmove(u, b, sizeof(*u));
+ u->fragment = u->query = nil;
+ break;
+ case '/':
+ if(p[1] == '/'){
+ u->scheme = b->scheme;
+ b = nil;
+ break;
+ }
+ default:
+ memmove(u, b, sizeof(*u));
+ u->fragment = u->query = u->path = nil;
+ break;
+ }
}
- if(u->host == nil){
- werrstr("missing host specification");
- return -1;
+Abs:
+ if(x = strchr(p, '#')){
+ *x = 0;
+ u->fragment = x+1;
}
-
- if(u->path == nil){
- u->ftp.path_spec = estrdup("/");
- return 0;
+ if(x = strchr(p, '?')){
+ *x = 0;
+ u->query = x+1;
}
-
- m[0].sp = m[0].ep = nil;
- t = &retab[REftppath];
- if(!regx(t->prog, u->path, m, t->size)){
- werrstr("malformed ftp path: %q", u->path);
- return -1;
- }
-
- if(m[t->ind[0]].sp){
- u->ftp.path_spec = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
- if(strchr(u->ftp.path_spec, ';')){
- werrstr("unexpected \";param\" in ftp path");
- return -1;
+ if(p[0] == '/' && p[1] == '/'){
+ p += 2;
+ if(x = strchr(p, '/')){
+ u->path = t = abspath(x, Upath(b));
+ *x = 0;
}
- }else
- u->ftp.path_spec = estrdup("/");
-
- if(m[t->ind[1]].sp){
- u->ftp.type = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
- strlower(u->ftp.type);
- }
- return 0;
-}
+ if(x = strchr(p, '@')){
+ *x = 0;
+ if(y = strchr(p, ':')){
+ *y = 0;
+ u->pass = y+1;
+ }
+ u->user = p;
+ p = x+1;
+ }
+ if((x = strrchr(p, ']')) == nil)
+ x = p;
+ if(x = strrchr(x, ':')){
+ *x = 0;
+ u->port = x+1;
+ }
+ if(x = strchr(p, '[')){
+ p = x+1;
+ if(y = strchr(p, ']'))
+ *y = 0;
+ }
+ u->host = p;
+ } else {
+ u->path = t = abspath(p, Upath(b));
+ }
+Out:
+ pstrdup(&u->scheme);
+ pstrdup(&u->user);
+ pstrdup(&u->pass);
+ pstrdup(&u->host);
+ pstrdup(&u->port);
+ pstrdup(&u->path);
+ pstrdup(&u->query);
+ pstrdup(&u->fragment);
+ free(s);
+ free(t);
-static int
-postparse_file(Url *u)
-{
- if(u->user || u->passwd){
- werrstr("user information not valid with file scheme");
- return -1;
- }
- if(u->query){
- werrstr("unexpected \"?query\" in file path");
- return -1;
- }
- if(u->port){
- werrstr("port not valid with file scheme");
- return -1;
- }
- if(u->path == nil){
- werrstr("missing path in file scheme");
- return -1;
- }
- if(strchr(u->path, ';')){
- werrstr("unexpected \";param\" in file path");
- return -1;
- }
+ unescape(u->user, "");
+ unescape(u->pass, "");
+ unescape(u->path, "/");
+ unescape(u->query, "&;=/?#");
+ unescape(u->fragment, "");
+ mklowcase(u->scheme);
+ mklowcase(u->host);
+ mklowcase(u->port);
- /* "localhost" is equivalent to no host spec, we'll chose the latter */
- if(u->host && cistrcmp(u->host, "localhost") == 0){
- free(u->host);
- u->host = nil;
- }
- return 0;
+ return u;
}
-static int (*postparse[])(Url*) = {
- nil,
- postparse_http,
- postparse_http,
- postparse_ftp,
- postparse_file,
-};
-
Url*
-parseurl(char *url, Url *base)
+saneurl(Url *u)
{
- Url *u;
- SplitUrl su;
-
- if(urldebug)
- fprint(2, "parseurl %s with base %s\n", url, base ? base->url : "<none>");
-
- u = emalloc(sizeof(Url));
- u->url = estrdup(url);
- if(spliturl(u->url, &su) < 0){
- Fail:
+ if(u == nil || u->scheme == nil || u->host == nil || Upath(u) == nil){
freeurl(u);
return nil;
}
-
- /* RFC2396 sec 3.1 says relative URIs are distinguished by absent scheme */
- if(su.scheme.s==nil){
- if(urldebug)
- fprint(2, "parseurl has nil scheme\n");
- if(resolve_relative(&su, base, u) < 0 || spliturl(u->url, &su) < 0)
- goto Fail;
- if(u->ischeme == UScurrent){
- /* 'u.url' refers to current document; set fragment and return */
- if(parse_fragment(&su, u) < 0)
- goto Fail;
- goto Done;
+ if(u->port){
+ /* remove default ports */
+ switch(atoi(u->port)){
+ case 21: if(!strcmp(u->scheme, "ftp")) goto Defport; break;
+ case 70: if(!strcmp(u->scheme, "gopher"))goto Defport; break;
+ case 80: if(!strcmp(u->scheme, "http")) goto Defport; break;
+ case 443: if(!strcmp(u->scheme, "https")) goto Defport; break;
+ default: if(!strcmp(u->scheme, u->port)) goto Defport; break;
+ Defport:
+ free(u->port);
+ u->port = nil;
}
}
+ return u;
+}
- if(parse_scheme(&su, u) < 0
- || parse_fragment(&su, u) < 0)
- goto Fail;
+int
+matchurl(Url *u, Url *s)
+{
+ if(u){
+ char *a, *b;
- if(u->ischeme == USunknown){
- if(parse_unknown_part(&su, u) < 0)
- goto Fail;
- goto Done;
+ if(s == nil)
+ return 0;
+ if(u->scheme && (s->scheme == nil || strcmp(u->scheme, s->scheme)))
+ return 0;
+ if(u->user && (s->user == nil || strcmp(u->user, s->user)))
+ return 0;
+ if(u->host && (s->host == nil || strcmp(u->host, s->host)))
+ return 0;
+ if(u->port && (s->port == nil || strcmp(u->port, s->port)))
+ return 0;
+ if(a = Upath(u)){
+ b = Upath(s);
+ if(b == nil || strncmp(a, b, strlen(a)))
+ return 0;
+ }
}
-
- if(parse_query(&su, u) < 0
- || parse_authority(&su, u) < 0
- || parse_abspath(&su, u) < 0)
- goto Fail;
-
- if(u->ischeme < nelem(postparse) && postparse[u->ischeme])
- if((*postparse[u->ischeme])(u) < 0)
- goto Fail;
-
-Done:
- setmalloctag(u, getcallerpc(&url));
- rewriteurl(u);
- return u;
+ return 1;
}
void
@@ -873,162 +362,13 @@ freeurl(Url *u)
{
if(u == nil)
return;
- free(u->url);
free(u->scheme);
- free(u->schemedata);
- free(u->authority);
free(u->user);
- free(u->passwd);
+ free(u->pass);
free(u->host);
free(u->port);
free(u->path);
free(u->query);
free(u->fragment);
- switch(u->ischeme){
- case UShttp:
- case UShttps:
- free(u->http.page_spec);
- break;
- case USftp:
- free(u->ftp.path_spec);
- free(u->ftp.type);
- break;
- }
free(u);
}
-
-void
-rewriteurl(Url *u)
-{
- char *s;
-
- if(u->scheme == nil)
- return;
- if(u->schemedata)
- s = estrmanydup(u->scheme, ":", u->schemedata, nil);
- else
- s = estrmanydup(u->scheme, "://",
- u->user ? u->user : "",
- u->passwd ? ":" : "", u->passwd ? u->passwd : "",
- u->user ? "@" : "", u->host ? u->host : "",
- u->port ? ":" : "", u->port ? u->port : "",
- u->path ? u->path : "",
- u->query ? "?" : "", u->query ? u->query : "",
- u->fragment ? "#" : "", u->fragment ? u->fragment : "",
- nil);
- free(u->url);
- u->url = s;
-}
-
-int
-seturlquery(Url *u, char *query)
-{
- if(query == nil){
- free(u->query);
- u->query = nil;
- return 0;
- }
- free(u->query);
- u->query = unescapeurl(query, "&;=/");
- return 0;
-}
-
-static void
-dupp(char **p)
-{
- if(*p)
- *p = estrdup(*p);
-}
-
-Url*
-copyurl(Url *u)
-{
- Url *v;
-
- v = emalloc(sizeof(Url));
- *v = *u;
- dupp(&v->url);
- dupp(&v->scheme);
- dupp(&v->schemedata);
- dupp(&v->authority);
- dupp(&v->user);
- dupp(&v->passwd);
- dupp(&v->host);
- dupp(&v->port);
- dupp(&v->path);
- dupp(&v->query);
- dupp(&v->fragment);
-
- switch(v->ischeme){
- case UShttp:
- case UShttps:
- dupp(&v->http.page_spec);
- break;
- case USftp:
- dupp(&v->ftp.path_spec);
- dupp(&v->ftp.type);
- break;
- }
- return v;
-}
-
-static int
-dhex(char c)
-{
- if('0' <= c && c <= '9')
- return c-'0';
- if('a' <= c && c <= 'f')
- return c-'a'+10;
- if('A' <= c && c <= 'F')
- return c-'A'+10;
- return 0;
-}
-
-char*
-escapeurl(char *s, char *special)
-{
- static char *hex = "0123456789abcdef";
- char *t, *u;
-
- t = u = emalloc(strlen(s)*3+1);
- for(; *s; s++){
- if((s[0] == '%' && isxdigit(s[1]) && isxdigit(s[2])) ||
- (*s >= '0' && *s <= '9') ||
- (*s >= 'a' && *s <= 'z') ||
- (*s >= 'A' && *s <= 'Z') ||
- strchr(".-_~", *s) || strchr(special, *s))
- *u++ = *s;
- else if(s[0] == ' ')
- *u++ = '+';
- else {
- *u++ = '%';
- *u++ = hex[(*s>>4)&0xF];
- *u++ = hex[*s&0xF];
- }
- }
- *u = '\0';
- return t;
-}
-
-char*
-unescapeurl(char *s, char *special)
-{
- char *r, *w;
- Rune x;
-
- s = estrdup(s);
- for(r=w=s; x = *r; r++){
- if(x=='%' && isxdigit(r[1]) && isxdigit(r[2])){
- x = (dhex(r[1])<<4)|dhex(r[2]);
- if(x == 0 || (x > 0x1F && x < 0x7F && strchr(special, x)))
- x = *r;
- else
- r += 2;
- } else if(x=='+')
- x = ' ';
- *w++ = x;
- }
- *w = '\0';
- return s;
-}
-