diff options
author | cinap_lenrek <cinap_lenrek@rei2.9hal> | 2012-01-11 16:17:54 +0100 |
---|---|---|
committer | cinap_lenrek <cinap_lenrek@rei2.9hal> | 2012-01-11 16:17:54 +0100 |
commit | 75e1ef0ab60acb6bccc54254b82770aec5786ead (patch) | |
tree | d273fc755a20e67801aa0a13df30ab75b2883419 /sys/src/cmd/webfs/url.c | |
parent | 62fb4f97177d8e76f1fd49bb9d0073007b7c9bcc (diff) |
new webfs, rc based hget
Diffstat (limited to 'sys/src/cmd/webfs/url.c')
-rw-r--r-- | sys/src/cmd/webfs/url.c | 1236 |
1 files changed, 288 insertions, 948 deletions
diff --git a/sys/src/cmd/webfs/url.c b/sys/src/cmd/webfs/url.c index c6c5695f0..2137010d1 100644 --- a/sys/src/cmd/webfs/url.c +++ b/sys/src/cmd/webfs/url.c @@ -1,871 +1,360 @@ -/* - * This is a URL parser, written to parse "Common Internet Scheme" URL - * syntax as described in RFC1738 and updated by RFC2396. Only absolute URLs - * are supported, using "server-based" naming authorities in the schemes. - * Support for literal IPv6 addresses is included, per RFC2732. - * - * Current "known" schemes: http, ftp, file. - * - * We can do all the parsing operations without Runes since URLs are - * defined to be composed of US-ASCII printable characters. - * See RFC1738, RFC2396. - */ - #include <u.h> #include <libc.h> #include <ctype.h> -#include <regexp.h> -#include <plumb.h> -#include <thread.h> #include <fcall.h> +#include <thread.h> #include <9p.h> + #include "dat.h" #include "fns.h" -int urldebug; - -/* If set, relative paths with leading ".." segments will have them trimmed */ -#define RemoveExtraRelDotDots 0 -#define ExpandCurrentDocUrls 1 - -static char* -schemestrtab[] = -{ - nil, - "http", - "https", - "ftp", - "file", -}; - static int -ischeme(char *s) -{ - int i; - - for(i=0; i<nelem(schemestrtab); i++) - if(schemestrtab[i] && strcmp(s, schemestrtab[i])==0) - return i; - return USunknown; -} - -/* - * URI splitting regexp is from RFC2396, Appendix B: - * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? - * 12 3 4 5 6 7 8 9 - * - * Example: "http://www.ics.uci.edu/pub/ietf/uri/#Related" - * $2 = scheme "http" - * $4 = authority "www.ics.uci.edu" - * $5 = path "/pub/ietf/uri/" - * $7 = query <undefined> - * $9 = fragment "Related" - */ - -/* - * RFC2396, Sec 3.1, contains: - * - * Scheme names consist of a sequence of characters beginning with a - * lower case letter and followed by any combination of lower case - * letters, digits, plus ("+"), period ("."), or hyphen ("-"). For - * resiliency, programs interpreting URI should treat upper case letters - * as equivalent to lower case in scheme names (e.g., allow "HTTP" as - * well as "http"). - */ - -/* - * For server-based naming authorities (RFC2396 Sec 3.2.2): - * server = [ [ userinfo "@" ] hostport ] - * userinfo = *( unreserved | escaped | - * ";" | ":" | "&" | "=" | "+" | "$" | "," ) - * hostport = host [ ":" port ] - * host = hostname | IPv4address - * hostname = *( domainlabel "." ) toplabel [ "." ] - * domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum - * toplabel = alpha | alpha *( alphanum | "-" ) alphanum - * IPv4address = 1*digit "." 1*digit "." 1*digit "." 1*digit - * port = *digit - * - * The host is a domain name of a network host, or its IPv4 address as a - * set of four decimal digit groups separated by ".". Literal IPv6 - * addresses are not supported. - * - * Note that literal IPv6 address support is outlined in RFC2732: - * host = hostname | IPv4address | IPv6reference - * ipv6reference = "[" IPv6address "]" (RFC2373) - * - * Since hostnames and numbers will have to be resolved by the OS anyway, - * we don't have to parse them too pedantically (counting '.'s, checking - * for well-formed literal IP addresses, etc.). - * - * In FTP/file paths, we reject most ";param"s and querys. In HTTP paths, - * we just pass them through. - * - * Instead of letting a "path" be 0-or-more characters as RFC2396 suggests, - * we'll say it's 1-or-more characters, 0-or-1 times. This way, an absent - * path yields a nil substring match, instead of an empty one. - * - * We're more restrictive than RFC2396 indicates with "userinfo" strings, - * insisting they have the form "[user[:password]]". This may need to - * change at some point, however. - */ - -/* RE character-class components -- these go in brackets */ -#define PUNCT "\\-_.!~*'()" -#define ALNUM "a-zA-Z0-9" -#define HEX "0-9a-fA-F" -#define UNRES ALNUM PUNCT - -/* RE components; _N => has N parenthesized subexpressions when expanded */ -#define USERINFO_2 "([" UNRES ";:&=+$,]|(%[" HEX "][" HEX "]))" - -typedef struct Retab Retab; -struct Retab -{ - char *str; - Reprog *prog; - int size; - int ind[5]; -}; - -enum -{ - REsplit = 0, - REscheme, - REauthority, - REhost, - REuserinfo, - REftppath, - - MaxResub= 20, -}; - -Retab retab[] = /* view in constant width Font */ -{ -[REsplit] - "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]+)?(\\?([^#]*))?(#(.*))?$", nil, 0, - /* |-scheme-| |-auth.-| |path--| |query| |--|frag */ - { 2, 4, 5, 7, 9}, - -[REscheme] - "^[a-z][a-z0-9+-.]*$", nil, 0, - { 0, }, - -[REauthority] - "^(((" USERINFO_2 "*)@)?(((\\[[^\\]@]+\\])|([^:\\[@]+))(:([0-9]*))?)?)?$", nil, 0, - /* |----user info-----| |--------host----------------| |-port-| */ - { 3, 7, 11, }, - -[REhost] - "^(([a-zA-Z0-9\\-.]+)|(\\[([a-fA-F0-9.:]+)\\]))$", nil, 0, - /* |--regular host--| |-IPv6 literal-| */ - { 2, 4, }, - -[REuserinfo] - "^(([^:]*)(:([^:]*))?)$", nil, 0, - /* |user-| |pass-| */ - { 2, 4, }, - -[REftppath] - "^(.+)(;[tT][yY][pP][eE]=([aAiIdD]))?$", nil, 0, - /*|--|-path |ftptype-| */ - { 1, 3, }, -}; - -static int -countleftparen(char *s) -{ - int n; - - n = 0; - for(; *s; s++) - if(*s == '(') - n++; - return n; -} - -void -initurl(void) +dhex(char c) { - int i, j; - - for(i=0; i<nelem(retab); i++){ - retab[i].prog = regcomp(retab[i].str); - if(retab[i].prog == nil) - sysfatal("recomp(%s): %r", retab[i].str); - retab[i].size = countleftparen(retab[i].str)+1; - for(j=0; j<nelem(retab[i].ind); j++) - if(retab[i].ind[j] >= retab[i].size) - sysfatal("bad index in regexp table: retab[%d].ind[%d] = %d >= %d", - i, j, retab[i].ind[j], retab[i].size); - if(MaxResub < retab[i].size) - sysfatal("MaxResub too small: %d < %d", MaxResub, retab[i].size); - } + if('0' <= c && c <= '9') + return c-'0'; + if('a' <= c && c <= 'f') + return c-'a'+10; + if('A' <= c && c <= 'F') + return c-'A'+10; + return 0; } -typedef struct SplitUrl SplitUrl; -struct SplitUrl -{ - struct { - char *s; - char *e; - } url, scheme, authority, path, query, fragment; -}; - -/* - * Implements the algorithm in RFC2396 sec 5.2 step 6. - * Returns number of chars written, excluding NUL terminator. - * dest is known to be >= strlen(base)+rel_len. - */ -static void -merge_relative_path(char *base, char *rel_st, int rel_len, char *dest) +static char* +unescape(char *s, char *spec) { - char *s, *p, *e, *pdest; - - pdest = dest; - - /* 6a: start with base, discard last segment */ - if(base && base[0]){ - /* Empty paths don't match in our scheme; 'base' should be nil */ - assert(base[0] == '/'); - e = strrchr(base, '/'); - e++; - memmove(pdest, base, e-base); - pdest += e-base; - }else{ - /* Artistic license on my part */ - *pdest++ = '/'; - } - - /* 6b: append relative component */ - if(rel_st){ - memmove(pdest, rel_st, rel_len); - pdest += rel_len; - } - - /* 6c: remove any occurrences of "./" as a complete segment */ - s = dest; - *pdest = '\0'; - while(e = strstr(s, "./")){ - if((e == dest) || (*(e-1) == '/')){ - memmove(e, e+2, pdest+1-(e+2)); /* +1 for NUL */ - pdest -= 2; - }else - s = e+1; - } - - /* 6d: remove a trailing "." as a complete segment */ - if(pdest>dest && *(pdest-1)=='.' && - (pdest==dest+1 || *(pdest-2)=='/')) - *--pdest = '\0'; - - /* 6e: remove occurences of "seg/../", where seg != "..", left->right */ - s = dest+1; - while(e = strstr(s, "/../")){ - p = e - 1; - while(p >= dest && *p != '/') - p--; - if(memcmp(p, "/../", 4) != 0){ - memmove(p+1, e+4, pdest+1-(e+4)); - pdest -= (e+4) - (p+1); - }else - s = e+1; - } - - /* 6f: remove a trailing "seg/..", where seg isn't ".." */ - if(pdest-3 > dest && memcmp(pdest-3, "/..", 3)==0){ - p = pdest-3 - 1; - while(p >= dest && *p != '/') - p--; - if(memcmp(p, "/../", 4) != 0){ - pdest = p+1; - *pdest = '\0'; - } - } + char *r, *w; + uchar x; - /* 6g: leading ".." segments are errors -- we'll just blat them out. */ - if(RemoveExtraRelDotDots){ - p = dest; - if (p[0] == '/') - p++; - s = p; - while(s[0]=='.' && s[1]=='.' && (s[2]==0 || s[2]=='/')) - s += 3; - if(s > p){ - memmove(p, s, pdest+1-s); - pdest -= s-p; + if(s == nil) + return s; + for(r=w=s; x = *r; r++){ + if(x == '%' && isxdigit(r[1]) && isxdigit(r[2])){ + x = (dhex(r[1])<<4)|dhex(r[2]); + if(x == 0 || (x > 0x1F && x < 0x7F && strchr(spec, x))){ + *w++ = '%'; + *w++ = toupper(r[1]); + *w++ = toupper(r[2]); + } + else + *w++ = x; + r += 2; + continue; } + if(x == '+') + x = ' '; + *w++ = x; } - USED(pdest); - - if(urldebug) - fprint(2, "merge_relative_path: '%s' + '%.*s' -> '%s'\n", base, rel_len, - rel_st, dest); + *w = 0; + return s; } -/* - * See RFC2396 sec 5.2 for info on resolving relative URIs to absolute form. - * - * If successful, this just ends up freeing and replacing "u->url". - */ -static int -resolve_relative(SplitUrl *su, Url *base, Url *u) +int +Efmt(Fmt *f) { - char *url, *path; - char *purl, *ppath; - int currentdoc, ulen, plen; - - if(base == nil){ - werrstr("relative URI given without base"); - return -1; - } - if(base->scheme == nil){ - werrstr("relative URI given with no scheme"); - return -1; - } - if(base->ischeme == USunknown){ - werrstr("relative URI given with unknown scheme"); - return -1; - } - if(base->ischeme == UScurrent){ - werrstr("relative URI given with incomplete base"); - return -1; - } - assert(su->scheme.s == nil); - - /* Sec 5.2 step 2 */ - currentdoc = 0; - if(su->path.s==nil && su->scheme.s==nil && su->authority.s==nil && su->query.s==nil){ - /* Reference is to current document */ - if(urldebug) - fprint(2, "url %s is relative to current document\n", u->url); - u->ischeme = UScurrent; - if(!ExpandCurrentDocUrls) - return 0; - currentdoc = 1; - } - - /* Over-estimate the maximum lengths, for allocation purposes */ - /* (constants are for separators) */ - plen = 1; - if(base->path) - plen += strlen(base->path); - if(su->path.s) - plen += 1 + (su->path.e - su->path.s); - - ulen = 0; - ulen += strlen(base->scheme) + 1; - if(su->authority.s) - ulen += 2 + (su->authority.e - su->authority.s); - else - ulen += 2 + ((base->authority) ? strlen(base->authority) : 0); - ulen += plen; - if(su->query.s) - ulen += 1 + (su->query.e - su->query.s); - else if(currentdoc && base->query) - ulen += 1 + strlen(base->query); - if(su->fragment.s) - ulen += 1 + (su->fragment.e - su->fragment.s); - else if(currentdoc && base->fragment) - ulen += 1 + strlen(base->fragment); - url = emalloc(ulen+1); - path = emalloc(plen+1); + char *s, *spec; + Str2 s2; - url[0] = '\0'; - purl = url; - path[0] = '\0'; - ppath = path; - - if(su->authority.s || (su->path.s && (su->path.s[0] == '/'))){ - /* Is a "network-path" or "absolute-path"; don't merge with base path */ - /* Sec 5.2 steps 4,5 */ - if(su->path.s){ - memmove(ppath, su->path.s, su->path.e - su->path.s); - ppath += su->path.e - su->path.s; - *ppath = '\0'; - } - }else if(currentdoc){ - /* Is a current-doc reference; just copy the path from the base URL */ - if(base->path){ - strcpy(ppath, base->path); - ppath += strlen(ppath); + s2 = va_arg(f->args, Str2); + s = s2.s1; + spec = s2.s2; + for(; *s; s++) + if(*s == '%' && isxdigit(s[1]) && isxdigit(s[2])){ + fmtprint(f, "%%%c%c", toupper(s[1]), toupper(s[2])); + s += 2; } - USED(ppath); - }else{ - /* Is a relative-path reference; we have to merge it */ - /* Sec 5.2 step 6 */ - merge_relative_path(base->path, - su->path.s, su->path.e - su->path.s, ppath); - } - - /* Build new URL from pieces, inheriting from base where needed */ - strcpy(purl, base->scheme); - purl += strlen(purl); - *purl++ = ':'; - if(su->authority.s){ - strcpy(purl, "//"); - purl += strlen(purl); - memmove(purl, su->authority.s, su->authority.e - su->authority.s); - purl += su->authority.e - su->authority.s; - }else if(base->authority){ - strcpy(purl, "//"); - purl += strlen(purl); - strcpy(purl, base->authority); - purl += strlen(purl); - } - assert((path[0] == '\0') || (path[0] == '/')); - strcpy(purl, path); - purl += strlen(purl); - - /* - * The query and fragment are not inherited from the base, - * except in case of "current document" URLs, which inherit any query - * and may inherit the fragment. - */ - if(su->query.s){ - *purl++ = '?'; - memmove(purl, su->query.s, su->query.e - su->query.s); - purl += su->query.e - su->query.s; - }else if(currentdoc && base->query){ - *purl++ = '?'; - strcpy(purl, base->query); - purl += strlen(purl); - } - - if(su->fragment.s){ - *purl++ = '#'; - memmove(purl, su->fragment.s, su->fragment.e - su->fragment.s); - purl += su->fragment.e - su->fragment.s; - }else if(currentdoc && base->fragment){ - *purl++ = '#'; - strcpy(purl, base->fragment); - purl += strlen(purl); - } - USED(purl); - - if(urldebug) - fprint(2, "resolve_relative: '%s' + '%s' -> '%s'\n", base->url, u->url, url); - free(u->url); - u->url = url; - free(path); + else if(isalnum(*s) || strchr(".-_~!$&'()*,;=", *s) || strchr(spec, *s)) + fmtprint(f, "%c", *s); + else + fmtprint(f, "%%%.2X", *s & 0xff); return 0; } int -regx(Reprog *prog, char *s, Resub *m, int nm) -{ - int i; - - if(s == nil) - s = m[0].sp; /* why is this necessary? */ - - i = regexec(prog, s, m, nm); -/* - if(i >= 0) - for(j=0; j<nm; j++) - fprint(2, "match%d: %.*s\n", j, utfnlen(m[j].sp, m[j].ep-m[j].sp), m[j].sp); -*/ - return i; -} - -static int -ismatch(int i, char *s, char *desc) +Ufmt(Fmt *f) { - Resub m[1]; - - m[0].sp = m[0].ep = nil; - if(!regx(retab[i].prog, s, m, 1)){ - werrstr("malformed %s: %q", desc, s); - return 0; - } - return 1; -} - -static int -spliturl(char *url, SplitUrl *su) -{ - Resub m[MaxResub]; - Retab *t; - - /* - * Newlines are not valid in a URI, but regexp(2) treats them specially - * so it's best to make sure there are none before proceeding. - */ - if(strchr(url, '\n')){ - werrstr("newline in URI"); - return -1; - } - - m[0].sp = m[0].ep = nil; - t = &retab[REsplit]; - if(!regx(t->prog, url, m, t->size)){ - werrstr("malformed URI: %q", url); - return -1; - } - - su->url.s = m[0].sp; - su->url.e = m[0].ep; - su->scheme.s = m[t->ind[0]].sp; - su->scheme.e = m[t->ind[0]].ep; - su->authority.s = m[t->ind[1]].sp; - su->authority.e = m[t->ind[1]].ep; - su->path.s = m[t->ind[2]].sp; - su->path.e = m[t->ind[2]].ep; - su->query.s = m[t->ind[3]].sp; - su->query.e = m[t->ind[3]].ep; - su->fragment.s = m[t->ind[4]].sp; - su->fragment.e = m[t->ind[4]].ep; - - if(urldebug) - fprint(2, "split url %s into %.*q %.*q %.*q %.*q %.*q %.*q\n", - url, - su->url.s ? utfnlen(su->url.s, su->url.e-su->url.s) : 10, su->url.s ? su->url.s : "", - su->scheme.s ? utfnlen(su->scheme.s, su->scheme.e-su->scheme.s) : 10, su->scheme.s ? su->scheme.s : "", - su->authority.s ? utfnlen(su->authority.s, su->authority.e-su->authority.s) : 10, su->authority.s ? su->authority.s : "", - su->path.s ? utfnlen(su->path.s, su->path.e-su->path.s) : 10, su->path.s ? su->path.s : "", - su->query.s ? utfnlen(su->query.s, su->query.e-su->query.s) : 10, su->query.s ? su->query.s : "", - su->fragment.s ? utfnlen(su->fragment.s, su->fragment.e-su->fragment.s) : 10, su->fragment.s ? su->fragment.s : ""); + char *s; + Url *u; + if((u = va_arg(f->args, Url*)) == nil) + return fmtprint(f, "nil"); + if(u->scheme) + fmtprint(f, "%s:", u->scheme); + if(u->user || u->host) + fmtprint(f, "//"); + if(u->user){ + fmtprint(f, "%E", (Str2){u->user, ""}); + if(u->pass) + fmtprint(f, ":%E", (Str2){u->pass, ""}); + fmtprint(f, "@"); + } + if(u->host){ + fmtprint(f, strchr(u->host, ':') ? "[%s]" : "%s", u->host); + if(u->port) + fmtprint(f, ":%s", u->port); + } + if(s = Upath(u)) + fmtprint(f, "%E", (Str2){s, "/:@"}); + if(u->query) + fmtprint(f, "?%E", (Str2){u->query, "/:@"}); + if(u->fragment) + fmtprint(f, "#%E", (Str2){u->fragment, "/:@?"}); return 0; } -static int -parse_scheme(SplitUrl *su, Url *u) +char* +Upath(Url *u) { - if(su->scheme.s == nil){ - werrstr("missing scheme"); - return -1; + if(u){ + if(u->path) + return u->path; + if(u->user || u->host) + return "/"; } - u->scheme = estredup(su->scheme.s, su->scheme.e); - strlower(u->scheme); - - if(!ismatch(REscheme, u->scheme, "scheme")) - return -1; - - u->ischeme = ischeme(u->scheme); - if(urldebug) - fprint(2, "parse_scheme %s => %d\n", u->scheme, u->ischeme); - return 0; + return nil; } -static int -parse_unknown_part(SplitUrl *su, Url *u) -{ - char *s, *e; - - assert(u->ischeme == USunknown); - assert(su->scheme.e[0] == ':'); - - s = su->scheme.e+1; - if(su->fragment.s){ - e = su->fragment.s-1; - assert(*e == '#'); - }else - e = s+strlen(s); - - u->schemedata = estredup(s, e); - return 0; +static char* +remdot(char *s) +{ + char *b, *d, *p; + int dir, n; + + dir = 1; + b = d = s; + while(*s == '/') + s++; + for(; s; s = p){ + if(p = strchr(s, '/')) + while(*p == '/') + *p++ = 0; + if(*s == '.' && ((s[1] == 0) || (s[1] == '.' && s[2] == 0))){ + if(s[1] == '.') + while(d > b) + if(*--d == '/') + break; + dir = 1; + continue; + } else + dir = (p != nil); + n = strlen(s); + memmove(d+1, s, n); + *d = '/'; + d += n+1; + } + if(dir) + *d++ = '/'; + *d = 0; + return b; } -static int -parse_userinfo(char *s, char *e, Url *u) +static char* +abspath(char *s, char *b) { - Resub m[MaxResub]; - Retab *t; + char *x, *a; - m[0].sp = s; - m[0].ep = e; - t = &retab[REuserinfo]; - if(!regx(t->prog, nil, m, t->size)){ - werrstr("malformed userinfo: %.*q", utfnlen(s, e-s), s); - return -1; + if(b && *b){ + if(s == nil || *s == 0) + return estrdup(b); + if(*s != '/' && (x = strrchr(b, '/'))){ + a = emalloc((x - b) + strlen(s) + 4); + sprint(a, "/%.*s/%s", (int)(x - b), b, s); + return remdot(a); + } } - if(m[t->ind[0]].sp) - u->user = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep); - if(m[t->ind[1]].sp) - u->user = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep); - return 0; -} - -static int -parse_host(char *s, char *e, Url *u) -{ - Resub m[MaxResub]; - Retab *t; - - m[0].sp = s; - m[0].ep = e; - t = &retab[REhost]; - if(!regx(t->prog, nil, m, t->size)){ - werrstr("malformed host: %.*q", utfnlen(s, e-s), s); - return -1; + if(s && *s){ + if(*s != '/') + return estrdup(s); + a = emalloc(strlen(s) + 4); + sprint(a, "/%s", s); + return remdot(a); } - - assert(m[t->ind[0]].sp || m[t->ind[1]].sp); - - if(m[t->ind[0]].sp) /* regular */ - u->host = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep); - else - u->host = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep); - return 0; + return nil; } -static int -parse_authority(SplitUrl *su, Url *u) +static void +pstrdup(char **p) { - Resub m[MaxResub]; - Retab *t; - char *host; - char *userinfo; - - if(su->authority.s == nil) - return 0; - - u->authority = estredup(su->authority.s, su->authority.e); - m[0].sp = m[0].ep = nil; - t = &retab[REauthority]; - if(!regx(t->prog, u->authority, m, t->size)){ - werrstr("malformed authority: %q", u->authority); - return -1; - } - - if(m[t->ind[0]].sp) - if(parse_userinfo(m[t->ind[0]].sp, m[t->ind[0]].ep, u) < 0) - return -1; - if(m[t->ind[1]].sp) - if(parse_host(m[t->ind[1]].sp, m[t->ind[1]].ep, u) < 0) - return -1; - if(m[t->ind[2]].sp) - u->port = estredup(m[t->ind[2]].sp, m[t->ind[2]].ep); - - - if(urldebug > 0){ - userinfo = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep); - host = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep); - fprint(2, "port: %q, authority %q\n", u->port, u->authority); - fprint(2, "host %q, userinfo %q\n", host, userinfo); - free(host); - free(userinfo); + if(p == nil || *p == nil) + return; + if(**p == 0){ + *p = nil; + return; } - return 0; + *p = estrdup(*p); } -static int -parse_abspath(SplitUrl *su, Url *u) -{ - char *s; - - if(su->path.s == nil) - return 0; - s = estredup(su->path.s, su->path.e); - u->path = unescapeurl(s, "/"); - free(s); - return 0; -} - -static int -parse_query(SplitUrl *su, Url *u) -{ - char *s; - - if(su->query.s == nil) - return 0; - s = estredup(su->query.s, su->query.e); - u->query = unescapeurl(s, "&;=/"); - free(s); - return 0; -} - -static int -parse_fragment(SplitUrl *su, Url *u) -{ - char *s; - - if(su->fragment.s == nil) - return 0; - s = estredup(su->fragment.s, su->fragment.e); - u->fragment = unescapeurl(s, ""); - free(s); - return 0; -} - -static int -postparse_http(Url *u) +static char* +mklowcase(char *s) { - char *p, *q; - - u->open = httpopen; - u->read = httpread; - u->close = httpclose; + char *p; - if(u->authority==nil){ - werrstr("missing authority (hostname, port, etc.)"); - return -1; - } - if(u->host == nil){ - werrstr("missing host specification"); - return -1; - } - - if(u->path == nil){ - u->http.page_spec = estrdup("/"); - return 0; - } - p = escapeurl(u->path, "/"); - if(u->query){ - q = escapeurl(u->query, "&;=/"); - u->http.page_spec = emalloc(strlen(p)+1+strlen(q)+1); - strcpy(u->http.page_spec, p); - strcat(u->http.page_spec, "?"); - strcat(u->http.page_spec, q); - free(q); - free(p); - }else - u->http.page_spec = p; - return 0; + if(s == nil) + return s; + for(p = s; *p; p++) + *p = tolower(*p); + return s; } -static int -postparse_ftp(Url *u) +Url* +url(char *s, Url *b) { - Resub m[MaxResub]; - Retab *t; + char *t, *p, *x, *y; + Url *u; - if(u->authority==nil){ - werrstr("missing authority (hostname, port, etc.)"); - return -1; - } - if(u->query){ - werrstr("unexpected \"?query\" in ftp path"); - return -1; + if(s == nil) + s = ""; + t = nil; + s = p = estrdup(s); + u = emalloc(sizeof(*u)); + for(; *p; p++){ + if(*p == ':'){ + if(p == s) + break; + *p++ = 0; + u->scheme = s; + b = nil; + goto Abs; + } + if(!isalpha(*p)) + if((p == s) || ((!isdigit(*p) && strchr("+-.", *p) == nil))) + break; + } + p = s; + if(b){ + switch(*p){ + case 0: + memmove(u, b, sizeof(*u)); + goto Out; + case '#': + memmove(u, b, sizeof(*u)); + u->fragment = p+1; + goto Out; + case '?': + memmove(u, b, sizeof(*u)); + u->fragment = u->query = nil; + break; + case '/': + if(p[1] == '/'){ + u->scheme = b->scheme; + b = nil; + break; + } + default: + memmove(u, b, sizeof(*u)); + u->fragment = u->query = u->path = nil; + break; + } } - if(u->host == nil){ - werrstr("missing host specification"); - return -1; +Abs: + if(x = strchr(p, '#')){ + *x = 0; + u->fragment = x+1; } - - if(u->path == nil){ - u->ftp.path_spec = estrdup("/"); - return 0; + if(x = strchr(p, '?')){ + *x = 0; + u->query = x+1; } - - m[0].sp = m[0].ep = nil; - t = &retab[REftppath]; - if(!regx(t->prog, u->path, m, t->size)){ - werrstr("malformed ftp path: %q", u->path); - return -1; - } - - if(m[t->ind[0]].sp){ - u->ftp.path_spec = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep); - if(strchr(u->ftp.path_spec, ';')){ - werrstr("unexpected \";param\" in ftp path"); - return -1; + if(p[0] == '/' && p[1] == '/'){ + p += 2; + if(x = strchr(p, '/')){ + u->path = t = abspath(x, Upath(b)); + *x = 0; } - }else - u->ftp.path_spec = estrdup("/"); - - if(m[t->ind[1]].sp){ - u->ftp.type = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep); - strlower(u->ftp.type); - } - return 0; -} + if(x = strchr(p, '@')){ + *x = 0; + if(y = strchr(p, ':')){ + *y = 0; + u->pass = y+1; + } + u->user = p; + p = x+1; + } + if((x = strrchr(p, ']')) == nil) + x = p; + if(x = strrchr(x, ':')){ + *x = 0; + u->port = x+1; + } + if(x = strchr(p, '[')){ + p = x+1; + if(y = strchr(p, ']')) + *y = 0; + } + u->host = p; + } else { + u->path = t = abspath(p, Upath(b)); + } +Out: + pstrdup(&u->scheme); + pstrdup(&u->user); + pstrdup(&u->pass); + pstrdup(&u->host); + pstrdup(&u->port); + pstrdup(&u->path); + pstrdup(&u->query); + pstrdup(&u->fragment); + free(s); + free(t); -static int -postparse_file(Url *u) -{ - if(u->user || u->passwd){ - werrstr("user information not valid with file scheme"); - return -1; - } - if(u->query){ - werrstr("unexpected \"?query\" in file path"); - return -1; - } - if(u->port){ - werrstr("port not valid with file scheme"); - return -1; - } - if(u->path == nil){ - werrstr("missing path in file scheme"); - return -1; - } - if(strchr(u->path, ';')){ - werrstr("unexpected \";param\" in file path"); - return -1; - } + unescape(u->user, ""); + unescape(u->pass, ""); + unescape(u->path, "/"); + unescape(u->query, "&;=/?#"); + unescape(u->fragment, ""); + mklowcase(u->scheme); + mklowcase(u->host); + mklowcase(u->port); - /* "localhost" is equivalent to no host spec, we'll chose the latter */ - if(u->host && cistrcmp(u->host, "localhost") == 0){ - free(u->host); - u->host = nil; - } - return 0; + return u; } -static int (*postparse[])(Url*) = { - nil, - postparse_http, - postparse_http, - postparse_ftp, - postparse_file, -}; - Url* -parseurl(char *url, Url *base) +saneurl(Url *u) { - Url *u; - SplitUrl su; - - if(urldebug) - fprint(2, "parseurl %s with base %s\n", url, base ? base->url : "<none>"); - - u = emalloc(sizeof(Url)); - u->url = estrdup(url); - if(spliturl(u->url, &su) < 0){ - Fail: + if(u == nil || u->scheme == nil || u->host == nil || Upath(u) == nil){ freeurl(u); return nil; } - - /* RFC2396 sec 3.1 says relative URIs are distinguished by absent scheme */ - if(su.scheme.s==nil){ - if(urldebug) - fprint(2, "parseurl has nil scheme\n"); - if(resolve_relative(&su, base, u) < 0 || spliturl(u->url, &su) < 0) - goto Fail; - if(u->ischeme == UScurrent){ - /* 'u.url' refers to current document; set fragment and return */ - if(parse_fragment(&su, u) < 0) - goto Fail; - goto Done; + if(u->port){ + /* remove default ports */ + switch(atoi(u->port)){ + case 21: if(!strcmp(u->scheme, "ftp")) goto Defport; break; + case 70: if(!strcmp(u->scheme, "gopher"))goto Defport; break; + case 80: if(!strcmp(u->scheme, "http")) goto Defport; break; + case 443: if(!strcmp(u->scheme, "https")) goto Defport; break; + default: if(!strcmp(u->scheme, u->port)) goto Defport; break; + Defport: + free(u->port); + u->port = nil; } } + return u; +} - if(parse_scheme(&su, u) < 0 - || parse_fragment(&su, u) < 0) - goto Fail; +int +matchurl(Url *u, Url *s) +{ + if(u){ + char *a, *b; - if(u->ischeme == USunknown){ - if(parse_unknown_part(&su, u) < 0) - goto Fail; - goto Done; + if(s == nil) + return 0; + if(u->scheme && (s->scheme == nil || strcmp(u->scheme, s->scheme))) + return 0; + if(u->user && (s->user == nil || strcmp(u->user, s->user))) + return 0; + if(u->host && (s->host == nil || strcmp(u->host, s->host))) + return 0; + if(u->port && (s->port == nil || strcmp(u->port, s->port))) + return 0; + if(a = Upath(u)){ + b = Upath(s); + if(b == nil || strncmp(a, b, strlen(a))) + return 0; + } } - - if(parse_query(&su, u) < 0 - || parse_authority(&su, u) < 0 - || parse_abspath(&su, u) < 0) - goto Fail; - - if(u->ischeme < nelem(postparse) && postparse[u->ischeme]) - if((*postparse[u->ischeme])(u) < 0) - goto Fail; - -Done: - setmalloctag(u, getcallerpc(&url)); - rewriteurl(u); - return u; + return 1; } void @@ -873,162 +362,13 @@ freeurl(Url *u) { if(u == nil) return; - free(u->url); free(u->scheme); - free(u->schemedata); - free(u->authority); free(u->user); - free(u->passwd); + free(u->pass); free(u->host); free(u->port); free(u->path); free(u->query); free(u->fragment); - switch(u->ischeme){ - case UShttp: - case UShttps: - free(u->http.page_spec); - break; - case USftp: - free(u->ftp.path_spec); - free(u->ftp.type); - break; - } free(u); } - -void -rewriteurl(Url *u) -{ - char *s; - - if(u->scheme == nil) - return; - if(u->schemedata) - s = estrmanydup(u->scheme, ":", u->schemedata, nil); - else - s = estrmanydup(u->scheme, "://", - u->user ? u->user : "", - u->passwd ? ":" : "", u->passwd ? u->passwd : "", - u->user ? "@" : "", u->host ? u->host : "", - u->port ? ":" : "", u->port ? u->port : "", - u->path ? u->path : "", - u->query ? "?" : "", u->query ? u->query : "", - u->fragment ? "#" : "", u->fragment ? u->fragment : "", - nil); - free(u->url); - u->url = s; -} - -int -seturlquery(Url *u, char *query) -{ - if(query == nil){ - free(u->query); - u->query = nil; - return 0; - } - free(u->query); - u->query = unescapeurl(query, "&;=/"); - return 0; -} - -static void -dupp(char **p) -{ - if(*p) - *p = estrdup(*p); -} - -Url* -copyurl(Url *u) -{ - Url *v; - - v = emalloc(sizeof(Url)); - *v = *u; - dupp(&v->url); - dupp(&v->scheme); - dupp(&v->schemedata); - dupp(&v->authority); - dupp(&v->user); - dupp(&v->passwd); - dupp(&v->host); - dupp(&v->port); - dupp(&v->path); - dupp(&v->query); - dupp(&v->fragment); - - switch(v->ischeme){ - case UShttp: - case UShttps: - dupp(&v->http.page_spec); - break; - case USftp: - dupp(&v->ftp.path_spec); - dupp(&v->ftp.type); - break; - } - return v; -} - -static int -dhex(char c) -{ - if('0' <= c && c <= '9') - return c-'0'; - if('a' <= c && c <= 'f') - return c-'a'+10; - if('A' <= c && c <= 'F') - return c-'A'+10; - return 0; -} - -char* -escapeurl(char *s, char *special) -{ - static char *hex = "0123456789abcdef"; - char *t, *u; - - t = u = emalloc(strlen(s)*3+1); - for(; *s; s++){ - if((s[0] == '%' && isxdigit(s[1]) && isxdigit(s[2])) || - (*s >= '0' && *s <= '9') || - (*s >= 'a' && *s <= 'z') || - (*s >= 'A' && *s <= 'Z') || - strchr(".-_~", *s) || strchr(special, *s)) - *u++ = *s; - else if(s[0] == ' ') - *u++ = '+'; - else { - *u++ = '%'; - *u++ = hex[(*s>>4)&0xF]; - *u++ = hex[*s&0xF]; - } - } - *u = '\0'; - return t; -} - -char* -unescapeurl(char *s, char *special) -{ - char *r, *w; - Rune x; - - s = estrdup(s); - for(r=w=s; x = *r; r++){ - if(x=='%' && isxdigit(r[1]) && isxdigit(r[2])){ - x = (dhex(r[1])<<4)|dhex(r[2]); - if(x == 0 || (x > 0x1F && x < 0x7F && strchr(special, x))) - x = *r; - else - r += 2; - } else if(x=='+') - x = ' '; - *w++ = x; - } - *w = '\0'; - return s; -} - |