2 * This is a URL parser, written to parse "Common Internet Scheme" URL
3 * syntax as described in RFC1738 and updated by RFC2396. Only absolute URLs
4 * are supported, using "server-based" naming authorities in the schemes.
5 * Support for literal IPv6 addresses is included, per RFC2732.
7 * Current "known" schemes: http, ftp, file.
9 * We can do all the parsing operations without Runes since URLs are
10 * defined to be composed of US-ASCII printable characters.
11 * See RFC1738, RFC2396.
27 /* If set, relative paths with leading ".." segments will have them trimmed */
28 #define RemoveExtraRelDotDots 0
29 #define ExpandCurrentDocUrls 1
46 for(i=0; i<nelem(schemestrtab); i++)
47 if(schemestrtab[i] && strcmp(s, schemestrtab[i])==0)
53 * URI splitting regexp is from RFC2396, Appendix B:
54 * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
57 * Example: "http://www.ics.uci.edu/pub/ietf/uri/#Related"
59 * $4 = authority "www.ics.uci.edu"
60 * $5 = path "/pub/ietf/uri/"
61 * $7 = query <undefined>
62 * $9 = fragment "Related"
66 * RFC2396, Sec 3.1, contains:
68 * Scheme names consist of a sequence of characters beginning with a
69 * lower case letter and followed by any combination of lower case
70 * letters, digits, plus ("+"), period ("."), or hyphen ("-"). For
71 * resiliency, programs interpreting URI should treat upper case letters
72 * as equivalent to lower case in scheme names (e.g., allow "HTTP" as
77 * For server-based naming authorities (RFC2396 Sec 3.2.2):
78 * server = [ [ userinfo "@" ] hostport ]
79 * userinfo = *( unreserved | escaped |
80 * ";" | ":" | "&" | "=" | "+" | "$" | "," )
81 * hostport = host [ ":" port ]
82 * host = hostname | IPv4address
83 * hostname = *( domainlabel "." ) toplabel [ "." ]
84 * domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum
85 * toplabel = alpha | alpha *( alphanum | "-" ) alphanum
86 * IPv4address = 1*digit "." 1*digit "." 1*digit "." 1*digit
89 * The host is a domain name of a network host, or its IPv4 address as a
90 * set of four decimal digit groups separated by ".". Literal IPv6
91 * addresses are not supported.
93 * Note that literal IPv6 address support is outlined in RFC2732:
94 * host = hostname | IPv4address | IPv6reference
95 * ipv6reference = "[" IPv6address "]" (RFC2373)
97 * Since hostnames and numbers will have to be resolved by the OS anyway,
98 * we don't have to parse them too pedantically (counting '.'s, checking
99 * for well-formed literal IP addresses, etc.).
101 * In FTP/file paths, we reject most ";param"s and queries. In HTTP paths,
102 * we just pass them through.
104 * Instead of letting a "path" be 0-or-more characters as RFC2396 suggests,
105 * we'll say it's 1-or-more characters, 0-or-1 times. This way, an absent
106 * path yields a nil substring match, instead of an empty one.
108 * We're more restrictive than RFC2396 indicates with "userinfo" strings,
109 * insisting they have the form "[user[:password]]". This may need to
110 * change at some point, however.
113 /* RE character-class components -- these go in brackets */
114 #define PUNCT "\\-_.!~*'()"
115 #define ALNUM "a-zA-Z0-9"
116 #define HEX "0-9a-fA-F"
117 #define UNRES ALNUM PUNCT
119 /* RE components; _N => has N parenthesized subexpressions when expanded */
120 #define USERINFO_2 "([" UNRES ";:&=+$,]|(%[" HEX "][" HEX "]))"
122 typedef struct Retab Retab;
143 Retab retab[] = /* view in constant width Font */
146 "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]+)?(\\?([^#]*))?(#(.*))?$", nil, 0,
147 /* |-scheme-| |-auth.-| |path--| |query| |--|frag */
151 "^[a-z][a-z0-9+.\\-]*$", nil, 0, /* '-' escaped: "+-." was a range that also admitted ',' */
155 "^(((" USERINFO_2 "*)@)?(((\\[[^\\]@]+\\])|([^:\\[@]+))(:([0-9]*))?)?)?$", nil, 0,
156 /* |----user info-----| |--------host----------------| |-port-| */
160 "^(([a-zA-Z0-9\\-.]+)|(\\[([a-fA-F0-9.:]+)\\]))$", nil, 0,
161 /* |--regular host--| |-IPv6 literal-| */
165 "^(([^:]*)(:([^:]*))?)$", nil, 0,
166 /* |user-| |pass-| */
170 "^(.+)(;[tT][yY][pP][eE]=([aAiIdD]))?$", nil, 0,
171 /*|--|-path |ftptype-| */
176 countleftparen(char *s)
192 for(i=0; i<nelem(retab); i++){
193 retab[i].prog = regcomp(retab[i].str);
194 if(retab[i].prog == nil)
195 sysfatal("recomp(%s): %r", retab[i].str);
196 retab[i].size = countleftparen(retab[i].str)+1;
197 for(j=0; j<nelem(retab[i].ind); j++)
198 if(retab[i].ind[j] >= retab[i].size)
199 sysfatal("bad index in regexp table: retab[%d].ind[%d] = %d >= %d",
200 i, j, retab[i].ind[j], retab[i].size);
201 if(MaxResub < retab[i].size)
202 sysfatal("MaxResub too small: %d < %d", MaxResub, retab[i].size);
206 typedef struct SplitUrl SplitUrl;
212 } url, scheme, authority, path, query, fragment;
216 * Implements the algorithm in RFC2396 sec 5.2 step 6.
217 * Returns number of chars written, excluding NUL terminator.
218 * dest is known to be >= strlen(base)+rel_len.
221 merge_relative_path(char *base, char *rel_st, int rel_len, char *dest)
223 char *s, *p, *e, *pdest;
227 /* 6a: start with base, discard last segment */
229 /* Empty paths don't match in our scheme; 'base' should be nil */
230 assert(base[0] == '/');
231 e = strrchr(base, '/');
233 memmove(pdest, base, e-base);
236 /* Artistic license on my part */
240 /* 6b: append relative component */
242 memmove(pdest, rel_st, rel_len);
246 /* 6c: remove any occurrences of "./" as a complete segment */
249 while(e = strstr(s, "./")){
250 if((e == dest) || (*(e-1) == '/')){
251 memmove(e, e+2, pdest+1-(e+2)); /* +1 for NUL */
257 /* 6d: remove a trailing "." as a complete segment */
258 if(pdest>dest && *(pdest-1)=='.' &&
259 (pdest==dest+1 || *(pdest-2)=='/'))
262 /* 6e: remove occurrences of "seg/../", where seg != "..", left->right */
264 while(e = strstr(s, "/../")){
266 while(p >= dest && *p != '/')
268 if(memcmp(p, "/../", 4) != 0){
269 memmove(p+1, e+4, pdest+1-(e+4));
270 pdest -= (e+4) - (p+1);
275 /* 6f: remove a trailing "seg/..", where seg isn't ".." */
276 if(pdest-3 > dest && memcmp(pdest-3, "/..", 3)==0){
278 while(p >= dest && *p != '/')
280 if(memcmp(p, "/../", 4) != 0){
286 /* 6g: leading ".." segments are errors -- we'll just blat them out. */
287 if(RemoveExtraRelDotDots){
292 while(s[0]=='.' && s[1]=='.' && (s[2]==0 || s[2]=='/'))
295 memmove(p, s, pdest+1-s);
302 fprint(2, "merge_relative_path: '%s' + '%.*s' -> '%s'\n", base, rel_len,
307 * See RFC2396 sec 5.2 for info on resolving relative URIs to absolute form.
309 * If successful, this just ends up freeing and replacing "u->url".
312 resolve_relative(SplitUrl *su, Url *base, Url *u)
316 int currentdoc, ulen, plen;
319 werrstr("relative URI given without base");
322 if(base->scheme == nil){
323 werrstr("relative URI given with no scheme");
326 if(base->ischeme == USunknown){
327 werrstr("relative URI given with unknown scheme");
330 if(base->ischeme == UScurrent){
331 werrstr("relative URI given with incomplete base");
334 assert(su->scheme.s == nil);
338 if(su->path.s==nil && su->scheme.s==nil && su->authority.s==nil && su->query.s==nil){
339 /* Reference is to current document */
341 fprint(2, "url %s is relative to current document\n", u->url);
342 u->ischeme = UScurrent;
343 if(!ExpandCurrentDocUrls)
348 /* Over-estimate the maximum lengths, for allocation purposes */
349 /* (constants are for separators) */
352 plen += strlen(base->path);
354 plen += 1 + (su->path.e - su->path.s);
357 ulen += strlen(base->scheme) + 1;
359 ulen += 2 + (su->authority.e - su->authority.s);
361 ulen += 2 + ((base->authority) ? strlen(base->authority) : 0);
364 ulen += 1 + (su->query.e - su->query.s);
365 else if(currentdoc && base->query)
366 ulen += 1 + strlen(base->query);
368 ulen += 1 + (su->fragment.e - su->fragment.s);
369 else if(currentdoc && base->fragment)
370 ulen += 1 + strlen(base->fragment);
371 url = emalloc(ulen+1);
372 path = emalloc(plen+1);
379 if(su->authority.s || (su->path.s && (su->path.s[0] == '/'))){
380 /* Is a "network-path" or "absolute-path"; don't merge with base path */
381 /* Sec 5.2 steps 4,5 */
383 memmove(ppath, su->path.s, su->path.e - su->path.s);
384 ppath += su->path.e - su->path.s;
387 }else if(currentdoc){
388 /* Is a current-doc reference; just copy the path from the base URL */
390 strcpy(ppath, base->path);
391 ppath += strlen(ppath);
395 /* Is a relative-path reference; we have to merge it */
397 merge_relative_path(base->path,
398 su->path.s, su->path.e - su->path.s, ppath);
401 /* Build new URL from pieces, inheriting from base where needed */
402 strcpy(purl, base->scheme);
403 purl += strlen(purl);
407 purl += strlen(purl);
408 memmove(purl, su->authority.s, su->authority.e - su->authority.s);
409 purl += su->authority.e - su->authority.s;
410 }else if(base->authority){
412 purl += strlen(purl);
413 strcpy(purl, base->authority);
414 purl += strlen(purl);
416 assert((path[0] == '\0') || (path[0] == '/'));
418 purl += strlen(purl);
421 * The query and fragment are not inherited from the base,
422 * except in case of "current document" URLs, which inherit any query
423 * and may inherit the fragment.
427 memmove(purl, su->query.s, su->query.e - su->query.s);
428 purl += su->query.e - su->query.s;
429 }else if(currentdoc && base->query){
431 strcpy(purl, base->query);
432 purl += strlen(purl);
437 memmove(purl, su->fragment.s, su->fragment.e - su->fragment.s); /* copy the fragment itself, not the query */
438 purl += su->fragment.e - su->fragment.s;
439 }else if(currentdoc && base->fragment){
441 strcpy(purl, base->fragment);
442 purl += strlen(purl);
447 fprint(2, "resolve_relative: '%s' + '%s' -> '%s'\n", base->url, u->url, url);
455 regx(Reprog *prog, char *s, Resub *m, int nm)
460 s = m[0].sp; /* why is this necessary? */
462 i = regexec(prog, s, m, nm);
466 fprint(2, "match%d: %.*s\n", j, utfnlen(m[j].sp, m[j].ep-m[j].sp), m[j].sp);
472 ismatch(int i, char *s, char *desc)
476 m[0].sp = m[0].ep = nil;
477 if(!regx(retab[i].prog, s, m, 1)){
478 werrstr("malformed %s: %q", desc, s);
485 spliturl(char *url, SplitUrl *su)
491 * Newlines are not valid in a URI, but regexp(2) treats them specially
492 * so it's best to make sure there are none before proceeding.
494 if(strchr(url, '\n')){
495 werrstr("newline in URI");
499 m[0].sp = m[0].ep = nil;
501 if(!regx(t->prog, url, m, t->size)){
502 werrstr("malformed URI: %q", url);
508 su->scheme.s = m[t->ind[0]].sp;
509 su->scheme.e = m[t->ind[0]].ep;
510 su->authority.s = m[t->ind[1]].sp;
511 su->authority.e = m[t->ind[1]].ep;
512 su->path.s = m[t->ind[2]].sp;
513 su->path.e = m[t->ind[2]].ep;
514 su->query.s = m[t->ind[3]].sp;
515 su->query.e = m[t->ind[3]].ep;
516 su->fragment.s = m[t->ind[4]].sp;
517 su->fragment.e = m[t->ind[4]].ep;
520 fprint(2, "split url %s into %.*q %.*q %.*q %.*q %.*q %.*q\n",
522 su->url.s ? utfnlen(su->url.s, su->url.e-su->url.s) : 10, su->url.s ? su->url.s : "",
523 su->scheme.s ? utfnlen(su->scheme.s, su->scheme.e-su->scheme.s) : 10, su->scheme.s ? su->scheme.s : "",
524 su->authority.s ? utfnlen(su->authority.s, su->authority.e-su->authority.s) : 10, su->authority.s ? su->authority.s : "",
525 su->path.s ? utfnlen(su->path.s, su->path.e-su->path.s) : 10, su->path.s ? su->path.s : "",
526 su->query.s ? utfnlen(su->query.s, su->query.e-su->query.s) : 10, su->query.s ? su->query.s : "",
527 su->fragment.s ? utfnlen(su->fragment.s, su->fragment.e-su->fragment.s) : 10, su->fragment.s ? su->fragment.s : "");
533 parse_scheme(SplitUrl *su, Url *u)
535 if(su->scheme.s == nil){
536 werrstr("missing scheme");
539 u->scheme = estredup(su->scheme.s, su->scheme.e);
542 if(!ismatch(REscheme, u->scheme, "scheme"))
545 u->ischeme = ischeme(u->scheme);
547 fprint(2, "parse_scheme %s => %d\n", u->scheme, u->ischeme);
552 parse_unknown_part(SplitUrl *su, Url *u)
556 assert(u->ischeme == USunknown);
557 assert(su->scheme.e[0] == ':');
561 e = su->fragment.s-1;
566 u->schemedata = estredup(s, e);
571 parse_userinfo(char *s, char *e, Url *u)
578 t = &retab[REuserinfo];
579 if(!regx(t->prog, nil, m, t->size)){
580 werrstr("malformed userinfo: %.*q", utfnlen(s, e-s), s);
584 u->user = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
586 u->user = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
591 parse_host(char *s, char *e, Url *u)
599 if(!regx(t->prog, nil, m, t->size)){
600 werrstr("malformed host: %.*q", utfnlen(s, e-s), s);
604 assert(m[t->ind[0]].sp || m[t->ind[1]].sp);
606 if(m[t->ind[0]].sp) /* regular */
607 u->host = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
609 u->host = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
614 parse_authority(SplitUrl *su, Url *u)
621 if(su->authority.s == nil)
624 u->authority = estredup(su->authority.s, su->authority.e);
625 m[0].sp = m[0].ep = nil;
626 t = &retab[REauthority];
627 if(!regx(t->prog, u->authority, m, t->size)){
628 werrstr("malformed authority: %q", u->authority);
633 if(parse_userinfo(m[t->ind[0]].sp, m[t->ind[0]].ep, u) < 0)
636 if(parse_host(m[t->ind[1]].sp, m[t->ind[1]].ep, u) < 0)
639 u->port = estredup(m[t->ind[2]].sp, m[t->ind[2]].ep);
643 userinfo = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
644 host = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
645 fprint(2, "port: %q, authority %q\n", u->port, u->authority);
646 fprint(2, "host %q, userinfo %q\n", host, userinfo);
654 parse_abspath(SplitUrl *su, Url *u)
658 if(su->path.s == nil)
660 s = estredup(su->path.s, su->path.e);
661 u->path = unescapeurl(s, "/");
667 parse_query(SplitUrl *su, Url *u)
671 if(su->query.s == nil)
673 s = estredup(su->query.s, su->query.e);
674 u->query = unescapeurl(s, "&=");
680 parse_fragment(SplitUrl *su, Url *u)
684 if(su->fragment.s == nil)
686 s = estredup(su->fragment.s, su->fragment.e);
687 u->fragment = unescapeurl(s, "");
693 postparse_http(Url *u)
699 u->close = httpclose;
701 if(u->authority==nil){
702 werrstr("missing authority (hostname, port, etc.)");
706 werrstr("missing host specification");
711 u->http.page_spec = estrdup("/");
714 p = escapeurl(u->path, " \"<>#%\\");
716 q = escapeurl(u->query, " \"<>#%\\");
717 u->http.page_spec = emalloc(strlen(p)+1+strlen(q)+1);
718 strcpy(u->http.page_spec, p);
719 strcat(u->http.page_spec, "?");
720 strcat(u->http.page_spec, q);
724 u->http.page_spec = p;
729 postparse_ftp(Url *u)
734 if(u->authority==nil){
735 werrstr("missing authority (hostname, port, etc.)");
739 werrstr("unexpected \"?query\" in ftp path");
743 werrstr("missing host specification");
748 u->ftp.path_spec = estrdup("/");
752 m[0].sp = m[0].ep = nil;
753 t = &retab[REftppath];
754 if(!regx(t->prog, u->path, m, t->size)){
755 werrstr("malformed ftp path: %q", u->path);
760 u->ftp.path_spec = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
761 if(strchr(u->ftp.path_spec, ';')){
762 werrstr("unexpected \";param\" in ftp path");
766 u->ftp.path_spec = estrdup("/");
769 u->ftp.type = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
770 strlower(u->ftp.type);
776 postparse_file(Url *u)
778 if(u->user || u->passwd){
779 werrstr("user information not valid with file scheme");
783 werrstr("unexpected \"?query\" in file path");
787 werrstr("port not valid with file scheme");
791 werrstr("missing path in file scheme");
794 if(strchr(u->path, ';')){
795 werrstr("unexpected \";param\" in file path");
799 /* "localhost" is equivalent to no host spec; we'll choose the latter */
800 if(u->host && cistrcmp(u->host, "localhost") == 0){
807 static int (*postparse[])(Url*) = {
816 parseurl(char *url, Url *base)
822 fprint(2, "parseurl %s with base %s\n", url, base ? base->url : "<none>");
824 u = emalloc(sizeof(Url));
825 u->url = estrdup(url);
826 if(spliturl(u->url, &su) < 0){
832 /* RFC2396 sec 3.1 says relative URIs are distinguished by absent scheme */
833 if(su.scheme.s==nil){
835 fprint(2, "parseurl has nil scheme\n");
836 if(resolve_relative(&su, base, u) < 0 || spliturl(u->url, &su) < 0)
838 if(u->ischeme == UScurrent){
839 /* 'u.url' refers to current document; set fragment and return */
840 if(parse_fragment(&su, u) < 0)
846 if(parse_scheme(&su, u) < 0
847 || parse_fragment(&su, u) < 0)
850 if(u->ischeme == USunknown){
851 if(parse_unknown_part(&su, u) < 0)
856 if(parse_query(&su, u) < 0
857 || parse_authority(&su, u) < 0
858 || parse_abspath(&su, u) < 0)
861 if(u->ischeme < nelem(postparse) && postparse[u->ischeme])
862 if((*postparse[u->ischeme])(u) < 0)
866 setmalloctag(u, getcallerpc(&url));
889 free(u->http.page_spec);
892 free(u->ftp.path_spec);
905 s = estrmanydup(u->scheme, ":", u->schemedata, nil);
907 s = estrmanydup(u->scheme, "://",
908 u->user ? u->user : "",
909 u->passwd ? ":" : "", u->passwd ? u->passwd : "",
910 u->user ? "@" : "", u->host ? u->host : "",
911 u->port ? ":" : "", u->port ? u->port : "",
913 u->query ? "?" : "", u->query ? u->query : "",
914 u->fragment ? "#" : "", u->fragment ? u->fragment : "",
921 seturlquery(Url *u, char *query)
929 u->query = unescapeurl(query, "&=");
945 v = emalloc(sizeof(Url));
949 dupp(&v->schemedata);
961 dupp(&v->http.page_spec);
964 dupp(&v->ftp.path_spec);
974 if('0' <= c && c <= '9')
976 if('a' <= c && c <= 'f')
978 if('A' <= c && c <= 'F')
984 escapeurl(char *s, char *special)
988 static char *hex = "0123456789abcdef";
992 if(*t <= 0x1F || *t >= 0x7F || strchr(special, *t))
994 u = emalloc(strlen(s)+2*n+1);
997 if(s[0] == '%' && isxdigit(s[1]) && isxdigit(s[2]))
999 else if(*s <= 0x1F || *s >= 0x7F || strchr(special, *s)){
1001 *u++ = hex[(*s>>4)&0xF];
1011 unescapeurl(char *s, char *special)
1016 for(r=w=s; x = *r; r++){
1017 if(x=='%' && isxdigit(r[1]) && isxdigit(r[2])){
1018 x = (dhex(r[1])<<4)|dhex(r[2]);
1019 if(x == 0 || (x > 0x1F && x < 0x7F && strchr(special, x)))