sys/src/cmd/webfs/url.c

/*
 * This is a URL parser, written to parse "Common Internet Scheme" URL
 * syntax as described in RFC1738 and updated by RFC2396.  Only absolute URLs
 * are parsed directly, using "server-based" naming authorities in the
 * schemes; a relative reference is accepted only when a base URL is
 * supplied, and is resolved against that base before being parsed.
 * Support for literal IPv6 addresses is included, per RFC2732.
 *
 * Current "known" schemes: http, https, ftp, file.
 *
 * We can do all the parsing operations without Runes since URLs are
 * defined to be composed of US-ASCII printable characters.
 * See RFC1738, RFC2396.
 */
  13
  14 #include <u.h>
  15 #include <libc.h>
  16 #include <ctype.h>
  17 #include <regexp.h>
  18 #include <plumb.h>
  19 #include <thread.h>
  20 #include <fcall.h>
  21 #include <9p.h>
  22 #include "dat.h"
  23 #include "fns.h"
  24
  25 int urldebug;
  26
  27 /* If set, relative paths with leading ".." segments will have them trimmed */
  28 #define RemoveExtraRelDotDots   0
  29 #define ExpandCurrentDocUrls    1
  30
  31 static char*
  32 schemestrtab[] =
  33 {
  34         nil,
  35         "http",
  36         "https",
  37         "ftp",
  38         "file",
  39 };
  40
  41 static int
  42 ischeme(char *s)
  43 {
  44         int i;
  45
  46         for(i=0; i<nelem(schemestrtab); i++)
  47                 if(schemestrtab[i] && strcmp(s, schemestrtab[i])==0)
  48                         return i;
  49         return USunknown;
  50 }
  51
  52 /*
  53  * URI splitting regexp is from RFC2396, Appendix B:
  54  *              ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
  55  *               12            3  4          5       6  7        8 9
  56  *
  57  * Example: "http://www.ics.uci.edu/pub/ietf/uri/#Related"
  58  * $2 = scheme                  "http"
  59  * $4 = authority               "www.ics.uci.edu"
  60  * $5 = path                    "/pub/ietf/uri/"
  61  * $7 = query                   <undefined>
  62  * $9 = fragment                "Related"
  63  */
  64
  65 /*
  66  * RFC2396, Sec 3.1, contains:
  67  *
  68  * Scheme names consist of a sequence of characters beginning with a
  69  * lower case letter and followed by any combination of lower case
  70  * letters, digits, plus ("+"), period ("."), or hyphen ("-").  For
  71  * resiliency, programs interpreting URI should treat upper case letters
  72  * as equivalent to lower case in scheme names (e.g., allow "HTTP" as
  73  * well as "http").
  74  */
  75
  76 /*
  77  * For server-based naming authorities (RFC2396 Sec 3.2.2):
  78  *    server        = [ [ userinfo "@" ] hostport ]
  79  *    userinfo      = *( unreserved | escaped |
  80  *                      ";" | ":" | "&" | "=" | "+" | "$" | "," )
  81  *    hostport      = host [ ":" port ]
  82  *    host          = hostname | IPv4address
  83  *    hostname      = *( domainlabel "." ) toplabel [ "." ]
  84  *    domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum
  85  *    toplabel      = alpha | alpha *( alphanum | "-" ) alphanum
  86  *    IPv4address   = 1*digit "." 1*digit "." 1*digit "." 1*digit
  87  *    port          = *digit
  88  *
  89  *  The host is a domain name of a network host, or its IPv4 address as a
  90  *  set of four decimal digit groups separated by ".".  Literal IPv6
  91  *  addresses are not supported.
  92  *
  93  * Note that literal IPv6 address support is outlined in RFC2732:
  94  *    host          = hostname | IPv4address | IPv6reference
  95  *    ipv6reference = "[" IPv6address "]"               (RFC2373)
  96  *
  97  * Since hostnames and numbers will have to be resolved by the OS anyway,
  98  * we don't have to parse them too pedantically (counting '.'s, checking
  99  * for well-formed literal IP addresses, etc.).
 100  *
 101  * In FTP/file paths, we reject most ";param"s and querys.  In HTTP paths,
 102  * we just pass them through.
 103  *
 104  * Instead of letting a "path" be 0-or-more characters as RFC2396 suggests,
 105  * we'll say it's 1-or-more characters, 0-or-1 times.  This way, an absent
 106  * path yields a nil substring match, instead of an empty one.
 107  *
 108  * We're more restrictive than RFC2396 indicates with "userinfo" strings,
 109  * insisting they have the form "[user[:password]]".  This may need to
 110  * change at some point, however.
 111  */
 112
 113 /* RE character-class components -- these go in brackets */
 114 #define PUNCT                   "\\-_.!~*'()"
 115 #define RES                     ";/?:@&=+$,"
 116 #define ALNUM           "a-zA-Z0-9"
 117 #define HEX                     "0-9a-fA-F"
 118 #define UNRES                   ALNUM PUNCT
 119
 120 /* RE components; _N => has N parenthesized subexpressions when expanded */
 121 #define ESCAPED_1                       "(%[" HEX "][" HEX "])"
 122 #define URIC_2                  "([" RES UNRES "]|" ESCAPED_1 ")"
 123 #define URICNOSLASH_2           "([" UNRES ";?:@&=+$,]|" ESCAPED_1 ")"
 124 #define USERINFO_2              "([" UNRES ";:&=+$,]|" ESCAPED_1 ")"
 125 #define PCHAR_2                 "([" UNRES ":@&=+$,]|" ESCAPED_1 ")"
 126 #define PSEGCHAR_3              "([/;]|" PCHAR_2 ")"
 127
 128 typedef struct Retab Retab;
 129 struct Retab
 130 {
 131         char    *str;
 132         Reprog  *prog;
 133         int             size;
 134         int             ind[5];
 135 };
 136
 137 enum
 138 {
 139         REsplit = 0,
 140         REscheme,
 141         REunknowndata,
 142         REauthority,
 143         REhost,
 144         REuserinfo,
 145         REabspath,
 146         REquery,
 147         REfragment,
 148         REhttppath,
 149         REftppath,
 150         REfilepath,
 151
 152         MaxResub=       20,
 153 };
 154
 155 Retab retab[] = /* view in constant width Font */
 156 {
 157 [REsplit]
 158         "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]+)?(\\?([^#]*))?(#(.*))?$", nil, 0,
 159         /* |-scheme-|      |-auth.-|  |path--|    |query|     |--|frag */
 160         {  2,              4,         5,          7,          9},
 161
 162 [REscheme]
 163         "^[a-z][a-z0-9+-.]*$", nil, 0,
 164         { 0, },
 165
 166 [REunknowndata]
 167         "^" URICNOSLASH_2 URIC_2 "*$", nil, 0,
 168         { 0, },
 169
 170 [REauthority]
 171         "^(((" USERINFO_2 "*)@)?(((\\[[^\\]@]+\\])|([^:\\[@]+))(:([0-9]*))?)?)?$", nil, 0,
 172         /* |----user info-----|  |--------host----------------|  |-port-| */
 173         {  3,                    7,                              11, },
 174
 175 [REhost]
 176         "^(([a-zA-Z0-9\\-.]+)|(\\[([a-fA-F0-9.:]+)\\]))$", nil, 0,
 177         /* |--regular host--|     |-IPv6 literal-| */
 178         {  2,                     4, },
 179
 180 [REuserinfo]
 181         "^(([^:]*)(:([^:]*))?)$", nil, 0,
 182         /* |user-|  |pass-| */
 183         {  2,       4, },
 184
 185 [REabspath]
 186         "^/" PSEGCHAR_3 "*$", nil, 0,
 187         { 0, },
 188
 189 [REquery]
 190         "^" URIC_2 "*$", nil, 0,
 191         { 0, },
 192
 193 [REfragment]
 194         "^" URIC_2 "*$", nil, 0,
 195         { 0, },
 196
 197 [REhttppath]
 198         "^.*$", nil, 0,
 199         { 0, },
 200
 201 [REftppath]
 202         "^(.+)(;[tT][yY][pP][eE]=([aAiIdD]))?$", nil, 0,
 203         /*|--|-path              |ftptype-| */
 204         { 1,                     3, },
 205
 206 [REfilepath]
 207         "^.*$", nil, 0,
 208         { 0, },
 209 };
 210
 211 static int
 212 countleftparen(char *s)
 213 {
 214         int n;
 215
 216         n = 0;
 217         for(; *s; s++)
 218                 if(*s == '(')
 219                         n++;
 220         return n;
 221 }
 222
 223 void
 224 initurl(void)
 225 {
 226         int i, j;
 227
 228         for(i=0; i<nelem(retab); i++){
 229                 retab[i].prog = regcomp(retab[i].str);
 230                 if(retab[i].prog == nil)
 231                         sysfatal("recomp(%s): %r", retab[i].str);
 232                 retab[i].size = countleftparen(retab[i].str)+1;
 233                 for(j=0; j<nelem(retab[i].ind); j++)
 234                         if(retab[i].ind[j] >= retab[i].size)
 235                                 sysfatal("bad index in regexp table: retab[%d].ind[%d] = %d >= %d",
 236                                         i, j, retab[i].ind[j], retab[i].size);
 237                 if(MaxResub < retab[i].size)
 238                         sysfatal("MaxResub too small: %d < %d", MaxResub, retab[i].size);
 239         }
 240 }
 241
 242 typedef struct SplitUrl SplitUrl;
 243 struct SplitUrl
 244 {
 245         struct {
 246                 char *s;
 247                 char *e;
 248         } url, scheme, authority, path, query, fragment;
 249 };
 250
 251 /*
 252  * Implements the algorithm in RFC2396 sec 5.2 step 6.
 253  * Returns number of chars written, excluding NUL terminator.
 254  * dest is known to be >= strlen(base)+rel_len.
 255  */
 256 static void
 257 merge_relative_path(char *base, char *rel_st, int rel_len, char *dest)
 258 {
 259         char *s, *p, *e, *pdest;
 260
 261         pdest = dest;
 262
 263         /* 6a: start with base, discard last segment */
 264         if(base && base[0]){
 265                 /* Empty paths don't match in our scheme; 'base' should be nil */
 266                 assert(base[0] == '/');
 267                 e = strrchr(base, '/');
 268                 e++;
 269                 memmove(pdest, base, e-base);
 270                 pdest += e-base;
 271         }else{
 272                 /* Artistic license on my part */
 273                 *pdest++ = '/';
 274         }
 275
 276         /* 6b: append relative component */
 277         if(rel_st){
 278                 memmove(pdest, rel_st, rel_len);
 279                 pdest += rel_len;
 280         }
 281
 282         /* 6c: remove any occurrences of "./" as a complete segment */
 283         s = dest;
 284         *pdest = '\0';
 285         while(e = strstr(s, "./")){
 286                 if((e == dest) || (*(e-1) == '/')){
 287                         memmove(e, e+2, pdest+1-(e+2)); /* +1 for NUL */
 288                         pdest -= 2;
 289                 }else
 290                         s = e+1;
 291         }
 292
 293         /* 6d: remove a trailing "." as a complete segment */
 294         if(pdest>dest && *(pdest-1)=='.' &&
 295           (pdest==dest+1 || *(pdest-2)=='/'))
 296                 *--pdest = '\0';
 297
 298         /* 6e: remove occurences of "seg/../", where seg != "..", left->right */
 299         s = dest+1;
 300         while(e = strstr(s, "/../")){
 301                 p = e - 1;
 302                 while(p >= dest && *p != '/')
 303                         p--;
 304                 if(memcmp(p, "/../", 4) != 0){
 305                         memmove(p+1, e+4, pdest+1-(e+4));
 306                         pdest -= (e+4) - (p+1);
 307                 }else
 308                         s = e+1;
 309         }
 310
 311         /* 6f: remove a trailing "seg/..", where seg isn't ".."  */
 312         if(pdest-3 > dest && memcmp(pdest-3, "/..", 3)==0){
 313                 p = pdest-3 - 1;
 314                 while(p >= dest && *p != '/')
 315                         p--;
 316                 if(memcmp(p, "/../", 4) != 0){
 317                         pdest = p+1;
 318                         *pdest = '\0';
 319                 }
 320         }
 321
 322         /* 6g: leading ".." segments are errors -- we'll just blat them out. */
 323         if(RemoveExtraRelDotDots){
 324                 p = dest;
 325                 if (p[0] == '/')
 326                         p++;
 327                 s = p;
 328                 while(s[0]=='.' && s[1]=='.' && (s[2]==0 || s[2]=='/'))
 329                         s += 3;
 330                 if(s > p){
 331                         memmove(p, s, pdest+1-s);
 332                         pdest -= s-p;
 333                 }
 334         }
 335         USED(pdest);
 336
 337         if(urldebug)
 338                 fprint(2, "merge_relative_path: '%s' + '%.*s' -> '%s'\n", base, rel_len,
 339                         rel_st, dest);
 340 }
 341
 342 /*
 343  * See RFC2396 sec 5.2 for info on resolving relative URIs to absolute form.
 344  *
 345  * If successful, this just ends up freeing and replacing "u->url".
 346  */
 347 static int
 348 resolve_relative(SplitUrl *su, Url *base, Url *u)
 349 {
 350         char *url, *path;
 351         char *purl, *ppath;
 352         int currentdoc, ulen, plen;
 353
 354         if(base == nil){
 355                 werrstr("relative URI given without base");
 356                 return -1;
 357         }
 358         if(base->scheme == nil){
 359                 werrstr("relative URI given with no scheme");
 360                 return -1;
 361         }
 362         if(base->ischeme == USunknown){
 363                 werrstr("relative URI given with unknown scheme");
 364                 return -1;
 365         }
 366         if(base->ischeme == UScurrent){
 367                 werrstr("relative URI given with incomplete base");
 368                 return -1;
 369         }
 370         assert(su->scheme.s == nil);
 371
 372         /* Sec 5.2 step 2 */
 373         currentdoc = 0;
 374         if(su->path.s==nil && su->scheme.s==nil && su->authority.s==nil && su->query.s==nil){
 375                 /* Reference is to current document */
 376                 if(urldebug)
 377                         fprint(2, "url %s is relative to current document\n", u->url);
 378                 u->ischeme = UScurrent;
 379                 if(!ExpandCurrentDocUrls)
 380                         return 0;
 381                 currentdoc = 1;
 382         }
 383
 384         /* Over-estimate the maximum lengths, for allocation purposes */
 385         /* (constants are for separators) */
 386         plen = 1;
 387         if(base->path)
 388                 plen += strlen(base->path);
 389         if(su->path.s)
 390                 plen += 1 + (su->path.e - su->path.s);
 391
 392         ulen = 0;
 393         ulen += strlen(base->scheme) + 1;
 394         if(su->authority.s)
 395                 ulen += 2 + (su->authority.e - su->authority.s);
 396         else
 397                 ulen += 2 + ((base->authority) ? strlen(base->authority) : 0);
 398         ulen += plen;
 399         if(su->query.s)
 400                 ulen += 1 + (su->query.e - su->query.s);
 401         else if(currentdoc && base->query)
 402                 ulen += 1 + strlen(base->query);
 403         if(su->fragment.s)
 404                 ulen += 1 + (su->fragment.e - su->fragment.s);
 405         else if(currentdoc && base->fragment)
 406                 ulen += 1 + strlen(base->fragment);
 407         url = emalloc(ulen+1);
 408         path = emalloc(plen+1);
 409
 410         url[0] = '\0';
 411         purl = url;
 412         path[0] = '\0';
 413         ppath = path;
 414
 415         if(su->authority.s || (su->path.s && (su->path.s[0] == '/'))){
 416                 /* Is a "network-path" or "absolute-path"; don't merge with base path */
 417                 /* Sec 5.2 steps 4,5 */
 418                 if(su->path.s){
 419                         memmove(ppath, su->path.s, su->path.e - su->path.s);
 420                         ppath += su->path.e - su->path.s;
 421                         *ppath = '\0';
 422                 }
 423         }else if(currentdoc){
 424                 /* Is a current-doc reference; just copy the path from the base URL */
 425                 if(base->path){
 426                         strcpy(ppath, base->path);
 427                         ppath += strlen(ppath);
 428                 }
 429                 USED(ppath);
 430         }else{
 431                 /* Is a relative-path reference; we have to merge it */
 432                 /* Sec 5.2 step 6 */
 433                 merge_relative_path(base->path,
 434                         su->path.s, su->path.e - su->path.s, ppath);
 435         }
 436
 437         /* Build new URL from pieces, inheriting from base where needed */
 438         strcpy(purl, base->scheme);
 439         purl += strlen(purl);
 440         *purl++ = ':';
 441         if(su->authority.s){
 442                 strcpy(purl, "//");
 443                 purl += strlen(purl);
 444                 memmove(purl, su->authority.s, su->authority.e - su->authority.s);
 445                 purl += su->authority.e - su->authority.s;
 446         }else if(base->authority){
 447                 strcpy(purl, "//");
 448                 purl += strlen(purl);
 449                 strcpy(purl, base->authority);
 450                 purl += strlen(purl);
 451         }
 452         assert((path[0] == '\0') || (path[0] == '/'));
 453         strcpy(purl, path);
 454         purl += strlen(purl);
 455
 456         /*
 457          * The query and fragment are not inherited from the base,
 458          * except in case of "current document" URLs, which inherit any query
 459          * and may inherit the fragment.
 460          */
 461         if(su->query.s){
 462                 *purl++ = '?';
 463                 memmove(purl, su->query.s, su->query.e - su->query.s);
 464                 purl += su->query.e - su->query.s;
 465         }else if(currentdoc && base->query){
 466                 *purl++ = '?';
 467                 strcpy(purl, base->query);
 468                 purl += strlen(purl);
 469         }
 470
 471         if(su->fragment.s){
 472                 *purl++ = '#';
 473                 memmove(purl, su->query.s, su->query.e - su->query.s);
 474                 purl += su->fragment.e - su->fragment.s;
 475         }else if(currentdoc && base->fragment){
 476                 *purl++ = '#';
 477                 strcpy(purl, base->fragment);
 478                 purl += strlen(purl);
 479         }
 480         USED(purl);
 481
 482         if(urldebug)
 483                 fprint(2, "resolve_relative: '%s' + '%s' -> '%s'\n", base->url, u->url, url);
 484         free(u->url);
 485         u->url = url;
 486         free(path);
 487         return 0;
 488 }
 489
 490 int
 491 regx(Reprog *prog, char *s, Resub *m, int nm)
 492 {
 493         int i;
 494
 495         if(s == nil)
 496                 s = m[0].sp;    /* why is this necessary? */
 497
 498         i = regexec(prog, s, m, nm);
 499 /*
 500         if(i >= 0)
 501                 for(j=0; j<nm; j++)
 502                         fprint(2, "match%d: %.*s\n", j, utfnlen(m[j].sp, m[j].ep-m[j].sp), m[j].sp);
 503 */
 504         return i;
 505 }
 506
 507 static int
 508 ismatch(int i, char *s, char *desc)
 509 {
 510         Resub m[1];
 511
 512         m[0].sp = m[0].ep = nil;
 513         if(!regx(retab[i].prog, s, m, 1)){
 514                 werrstr("malformed %s: %q", desc, s);
 515                 return 0;
 516         }
 517         return 1;
 518 }
 519
 520 static int
 521 spliturl(char *url, SplitUrl *su)
 522 {
 523         Resub m[MaxResub];
 524         Retab *t;
 525
 526         /*
 527          * Newlines are not valid in a URI, but regexp(2) treats them specially
 528          * so it's best to make sure there are none before proceeding.
 529          */
 530         if(strchr(url, '\n')){
 531                 werrstr("newline in URI");
 532                 return -1;
 533         }
 534
 535         /*
 536          * Because we use NUL-terminated strings, as do many client and server
 537          * implementations, an escaped NUL ("%00") will quite likely cause problems
 538          * when unescaped.  We can check for such a sequence once before examining
 539          * the components because, per RFC2396 sec. 2.4.1 - 2.4.2, '%' is reserved
 540          * in URIs to _always_ indicate escape sequences.  Something like "%2500"
 541          * will still get by, but that's legitimate, and if it ends up causing
 542          * a NUL then someone is unescaping too many times.
 543          */
 544         if(strstr(url, "%00")){
 545                 werrstr("escaped NUL in URI");
 546                 return -1;
 547         }
 548
 549         m[0].sp = m[0].ep = nil;
 550         t = &retab[REsplit];
 551         if(!regx(t->prog, url, m, t->size)){
 552                 werrstr("malformed URI: %q", url);
 553                 return -1;
 554         }
 555
 556         su->url.s = m[0].sp;
 557         su->url.e = m[0].ep;
 558         su->scheme.s = m[t->ind[0]].sp;
 559         su->scheme.e = m[t->ind[0]].ep;
 560         su->authority.s = m[t->ind[1]].sp;
 561         su->authority.e = m[t->ind[1]].ep;
 562         su->path.s = m[t->ind[2]].sp;
 563         su->path.e = m[t->ind[2]].ep;
 564         su->query.s = m[t->ind[3]].sp;
 565         su->query.e = m[t->ind[3]].ep;
 566         su->fragment.s = m[t->ind[4]].sp;
 567         su->fragment.e = m[t->ind[4]].ep;
 568
 569         if(urldebug)
 570                 fprint(2, "split url %s into %.*q %.*q %.*q %.*q %.*q %.*q\n",
 571                         url,
 572                         su->url.s ? utfnlen(su->url.s, su->url.e-su->url.s) : 10, su->url.s ? su->url.s : "",
 573                         su->scheme.s ? utfnlen(su->scheme.s, su->scheme.e-su->scheme.s) : 10, su->scheme.s ? su->scheme.s : "",
 574                         su->authority.s ? utfnlen(su->authority.s, su->authority.e-su->authority.s) : 10, su->authority.s ? su->authority.s : "",
 575                         su->path.s ? utfnlen(su->path.s, su->path.e-su->path.s) : 10, su->path.s ? su->path.s : "",
 576                         su->query.s ? utfnlen(su->query.s, su->query.e-su->query.s) : 10, su->query.s ? su->query.s : "",
 577                         su->fragment.s ? utfnlen(su->fragment.s, su->fragment.e-su->fragment.s) : 10, su->fragment.s ? su->fragment.s : "");
 578
 579         return 0;
 580 }
 581
 582 static int
 583 parse_scheme(SplitUrl *su, Url *u)
 584 {
 585         if(su->scheme.s == nil){
 586                 werrstr("missing scheme");
 587                 return -1;
 588         }
 589         u->scheme = estredup(su->scheme.s, su->scheme.e);
 590         strlower(u->scheme);
 591
 592         if(!ismatch(REscheme, u->scheme, "scheme"))
 593                 return -1;
 594
 595         u->ischeme = ischeme(u->scheme);
 596         if(urldebug)
 597                 fprint(2, "parse_scheme %s => %d\n", u->scheme, u->ischeme);
 598         return 0;
 599 }
 600
 601 static int
 602 parse_unknown_part(SplitUrl *su, Url *u)
 603 {
 604         char *s, *e;
 605
 606         assert(u->ischeme == USunknown);
 607         assert(su->scheme.e[0] == ':');
 608
 609         s = su->scheme.e+1;
 610         if(su->fragment.s){
 611                 e = su->fragment.s-1;
 612                 assert(*e == '#');
 613         }else
 614                 e = s+strlen(s);
 615
 616         u->schemedata = estredup(s, e);
 617         if(!ismatch(REunknowndata, u->schemedata, "unknown scheme data"))
 618                 return -1;
 619         return 0;
 620 }
 621
 622 static int
 623 parse_userinfo(char *s, char *e, Url *u)
 624 {
 625         Resub m[MaxResub];
 626         Retab *t;
 627
 628         m[0].sp = s;
 629         m[0].ep = e;
 630         t = &retab[REuserinfo];
 631         if(!regx(t->prog, nil, m, t->size)){
 632                 werrstr("malformed userinfo: %.*q", utfnlen(s, e-s), s);
 633                 return -1;
 634         }
 635         if(m[t->ind[0]].sp)
 636                 u->user = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
 637         if(m[t->ind[1]].sp)
 638                 u->user = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
 639         return 0;
 640 }
 641
 642 static int
 643 parse_host(char *s, char *e, Url *u)
 644 {
 645         Resub m[MaxResub];
 646         Retab *t;
 647
 648         m[0].sp = s;
 649         m[0].ep = e;
 650         t = &retab[REhost];
 651         if(!regx(t->prog, nil, m, t->size)){
 652                 werrstr("malformed host: %.*q", utfnlen(s, e-s), s);
 653                 return -1;
 654         }
 655
 656         assert(m[t->ind[0]].sp || m[t->ind[1]].sp);
 657
 658         if(m[t->ind[0]].sp)     /* regular */
 659                 u->host = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
 660         else
 661                 u->host = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
 662         return 0;
 663 }
 664
 665 static int
 666 parse_authority(SplitUrl *su, Url *u)
 667 {
 668         Resub m[MaxResub];
 669         Retab *t;
 670         char *host;
 671         char *userinfo;
 672
 673         if(su->authority.s == nil)
 674                 return 0;
 675
 676         u->authority = estredup(su->authority.s, su->authority.e);
 677         m[0].sp = m[0].ep = nil;
 678         t = &retab[REauthority];
 679         if(!regx(t->prog, u->authority, m, t->size)){
 680                 werrstr("malformed authority: %q", u->authority);
 681                 return -1;
 682         }
 683
 684         if(m[t->ind[0]].sp)
 685                 if(parse_userinfo(m[t->ind[0]].sp, m[t->ind[0]].ep, u) < 0)
 686                         return -1;
 687         if(m[t->ind[1]].sp)
 688                 if(parse_host(m[t->ind[1]].sp, m[t->ind[1]].ep, u) < 0)
 689                         return -1;
 690         if(m[t->ind[2]].sp)
 691                 u->port = estredup(m[t->ind[2]].sp, m[t->ind[2]].ep);
 692
 693
 694         if(urldebug > 0){
 695                 userinfo = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
 696                 host = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
 697                 fprint(2, "port: %q, authority %q\n", u->port, u->authority);
 698                 fprint(2, "host %q, userinfo %q\n", host, userinfo);
 699                 free(host);
 700                 free(userinfo);
 701         }
 702         return 0;
 703 }
 704
 705 static int
 706 parse_abspath(SplitUrl *su, Url *u)
 707 {
 708         if(su->path.s == nil)
 709                 return 0;
 710         u->path = estredup(su->path.s, su->path.e);
 711         if(!ismatch(REabspath, u->path, "absolute path"))
 712                 return -1;
 713         return 0;
 714 }
 715
 716 static int
 717 parse_query(SplitUrl *su, Url *u)
 718 {
 719         if(su->query.s == nil)
 720                 return 0;
 721         u->query = estredup(su->query.s, su->query.e);
 722         if(!ismatch(REquery, u->query, "query"))
 723                 return -1;
 724         return 0;
 725 }
 726
 727 static int
 728 parse_fragment(SplitUrl *su, Url *u)
 729 {
 730         if(su->fragment.s == nil)
 731                 return 0;
 732         u->fragment = estredup(su->fragment.s, su->fragment.e);
 733         if(!ismatch(REfragment, u->fragment, "fragment"))
 734                 return -1;
 735         return 0;
 736 }
 737
 738 static int
 739 postparse_http(Url *u)
 740 {
 741         u->open = httpopen;
 742         u->read = httpread;
 743         u->close = httpclose;
 744
 745         if(u->authority==nil){
 746                 werrstr("missing authority (hostname, port, etc.)");
 747                 return -1;
 748         }
 749         if(u->host == nil){
 750                 werrstr("missing host specification");
 751                 return -1;
 752         }
 753
 754         if(u->path == nil){
 755                 u->http.page_spec = estrdup("/");
 756                 return 0;
 757         }
 758
 759         if(!ismatch(REhttppath, u->path, "http path"))
 760                 return -1;
 761         if(u->query){
 762                 u->http.page_spec = emalloc(strlen(u->path)+1+strlen(u->query)+1);
 763                 strcpy(u->http.page_spec, u->path);
 764                 strcat(u->http.page_spec, "?");
 765                 strcat(u->http.page_spec, u->query);
 766         }else
 767                 u->http.page_spec = estrdup(u->path);
 768
 769         return 0;
 770 }
 771
 772 static int
 773 postparse_ftp(Url *u)
 774 {
 775         Resub m[MaxResub];
 776         Retab *t;
 777
 778         if(u->authority==nil){
 779                 werrstr("missing authority (hostname, port, etc.)");
 780                 return -1;
 781         }
 782         if(u->query){
 783                 werrstr("unexpected \"?query\" in ftp path");
 784                 return -1;
 785         }
 786         if(u->host == nil){
 787                 werrstr("missing host specification");
 788                 return -1;
 789         }
 790
 791         if(u->path == nil){
 792                 u->ftp.path_spec = estrdup("/");
 793                 return 0;
 794         }
 795
 796         m[0].sp = m[0].ep = nil;
 797         t = &retab[REftppath];
 798         if(!regx(t->prog, u->path, m, t->size)){
 799                 werrstr("malformed ftp path: %q", u->path);
 800                 return -1;
 801         }
 802
 803         if(m[t->ind[0]].sp){
 804                 u->ftp.path_spec = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
 805                 if(strchr(u->ftp.path_spec, ';')){
 806                         werrstr("unexpected \";param\" in ftp path");
 807                         return -1;
 808                 }
 809         }else
 810                 u->ftp.path_spec = estrdup("/");
 811
 812         if(m[t->ind[1]].sp){
 813                 u->ftp.type = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
 814                 strlower(u->ftp.type);
 815         }
 816         return 0;
 817 }
 818
 819 static int
 820 postparse_file(Url *u)
 821 {
 822         if(u->user || u->passwd){
 823                 werrstr("user information not valid with file scheme");
 824                 return -1;
 825         }
 826         if(u->query){
 827                 werrstr("unexpected \"?query\" in file path");
 828                 return -1;
 829         }
 830         if(u->port){
 831                 werrstr("port not valid with file scheme");
 832                 return -1;
 833         }
 834         if(u->path == nil){
 835                 werrstr("missing path in file scheme");
 836                 return -1;
 837         }
 838         if(strchr(u->path, ';')){
 839                 werrstr("unexpected \";param\" in file path");
 840                 return -1;
 841         }
 842
 843         if(!ismatch(REfilepath, u->path, "file path"))
 844                 return -1;
 845
 846         /* "localhost" is equivalent to no host spec, we'll chose the latter */
 847         if(u->host && cistrcmp(u->host, "localhost") == 0){
 848                 free(u->host);
 849                 u->host = nil;
 850         }
 851         return 0;
 852 }
 853
 854 static int (*postparse[])(Url*) = {
 855         nil,
 856         postparse_http,
 857         postparse_http,
 858         postparse_ftp,
 859         postparse_file,
 860 };
 861
 862 Url*
 863 parseurl(char *url, Url *base)
 864 {
 865         Url *u;
 866         SplitUrl su;
 867
 868         if(urldebug)
 869                 fprint(2, "parseurl %s with base %s\n", url, base ? base->url : "<none>");
 870
 871         u = emalloc(sizeof(Url));
 872         u->url = estrdup(url);
 873         if(spliturl(u->url, &su) < 0){
 874         Fail:
 875                 freeurl(u);
 876                 return nil;
 877         }
 878
 879         /* RFC2396 sec 3.1 says relative URIs are distinguished by absent scheme */
 880         if(su.scheme.s==nil){
 881                 if(urldebug)
 882                         fprint(2, "parseurl has nil scheme\n");
 883                 if(resolve_relative(&su, base, u) < 0 || spliturl(u->url, &su) < 0)
 884                         goto Fail;
 885                 if(u->ischeme == UScurrent){
 886                         /* 'u.url' refers to current document; set fragment and return */
 887                         if(parse_fragment(&su, u) < 0)
 888                                 goto Fail;
 889                         return u;
 890                 }
 891         }
 892
 893         if(parse_scheme(&su, u) < 0
 894         || parse_fragment(&su, u) < 0)
 895                 goto Fail;
 896
 897         if(u->ischeme == USunknown){
 898                 if(parse_unknown_part(&su, u) < 0)
 899                         goto Fail;
 900                 return u;
 901         }
 902
 903         if(parse_query(&su, u) < 0
 904         || parse_authority(&su, u) < 0
 905         || parse_abspath(&su, u) < 0)
 906                 goto Fail;
 907
 908         if(u->ischeme < nelem(postparse) && postparse[u->ischeme])
 909                 if((*postparse[u->ischeme])(u) < 0)
 910                         goto Fail;
 911
 912         setmalloctag(u, getcallerpc(&url));
 913         return u;
 914 }
 915
 916 void
 917 freeurl(Url *u)
 918 {
 919         if(u == nil)
 920                 return;
 921         free(u->url);
 922         free(u->scheme);
 923         free(u->schemedata);
 924         free(u->authority);
 925         free(u->user);
 926         free(u->passwd);
 927         free(u->host);
 928         free(u->port);
 929         free(u->path);
 930         free(u->query);
 931         free(u->fragment);
 932         switch(u->ischeme){
 933         case UShttp:
 934                 free(u->http.page_spec);
 935                 break;
 936         case USftp:
 937                 free(u->ftp.path_spec);
 938                 free(u->ftp.type);
 939                 break;
 940         }
 941         free(u);
 942 }
 943
 944 void
 945 rewriteurl(Url *u)
 946 {
 947         char *s;
 948
 949         if(u->schemedata)
 950                 s = estrmanydup(u->scheme, ":", u->schemedata, nil);
 951         else
 952                 s = estrmanydup(u->scheme, "://",
 953                         u->user ? u->user : "",
 954                         u->passwd ? ":" : "", u->passwd ? u->passwd : "",
 955                         u->user ? "@" : "", u->host ? u->host : "",
 956                         u->port ? ":" : "", u->port ? u->port : "",
 957                         u->path,
 958                         u->query ? "?" : "", u->query ? u->query : "",
 959                         u->fragment ? "#" : "", u->fragment ? u->fragment : "",
 960                         nil);
 961         free(u->url);
 962         u->url = s;
 963 }
 964
 965 int
 966 seturlquery(Url *u, char *query)
 967 {
 968         if(query == nil){
 969                 free(u->query);
 970                 u->query = nil;
 971                 return 0;
 972         }
 973
 974         if(!ismatch(REquery, query, "query"))
 975                 return -1;
 976
 977         free(u->query);
 978         u->query = estrdup(query);
 979         return 0;
 980 }
 981
 982 static void
 983 dupp(char **p)
 984 {
 985         if(*p)
 986                 *p = estrdup(*p);
 987 }
 988
 989 Url*
 990 copyurl(Url *u)
 991 {
 992         Url *v;
 993
 994         v = emalloc(sizeof(Url));
 995         *v = *u;
 996         dupp(&v->url);
 997         dupp(&v->scheme);
 998         dupp(&v->schemedata);
 999         dupp(&v->authority);
1000         dupp(&v->user);
1001         dupp(&v->passwd);
1002         dupp(&v->host);
1003         dupp(&v->port);
1004         dupp(&v->path);
1005         dupp(&v->query);
1006         dupp(&v->fragment);
1007
1008         switch(v->ischeme){
1009         case UShttp:
1010                 dupp(&v->http.page_spec);
1011                 break;
1012         case USftp:
1013                 dupp(&v->ftp.path_spec);
1014                 dupp(&v->ftp.type);
1015                 break;
1016         }
1017         return v;
1018 }
1019
/*
 * Value of the hexadecimal digit c (either case), in the range 0..15.
 * Any character that is not a hex digit yields 0.
 */
static int
dhex(char c)
{
        int v;

        v = 0;
        if(c >= '0' && c <= '9')
                v = c - '0';
        else if(c >= 'a' && c <= 'f')
                v = 10 + (c - 'a');
        else if(c >= 'A' && c <= 'F')
                v = 10 + (c - 'A');
        return v;
}
1031
1032 char*
1033 escapeurl(char *s, int (*needesc)(int))
1034 {
1035         int n;
1036         char *t, *u;
1037         Rune r;
1038         static char *hex = "0123456789abcdef";
1039
1040         n = 0;
1041         for(t=s; *t; t++)
1042                 if((*needesc)(*t))
1043                         n++;
1044
1045         u = emalloc(strlen(s)+2*n+1);
1046         t = u;
1047         for(; *s; s++){
1048                 s += chartorune(&r, s);
1049                 if(r >= 0xFF){
1050                         werrstr("URLs cannot contain Runes > 0xFF");
1051                         free(t);
1052                         return nil;
1053                 }
1054                 if((*needesc)(r)){
1055                         *u++ = '%';
1056                         *u++ = hex[(r>>4)&0xF];
1057                         *u++ = hex[r&0xF];
1058                 }else
1059                         *u++ = r;
1060         }
1061         *u = '\0';
1062         return t;
1063 }
1064
1065 char*
1066 unescapeurl(char *s)
1067 {
1068         char *r, *w;
1069         Rune rune;
1070
1071         s = estrdup(s);
1072         for(r=w=s; *r; r++){
1073                 if(*r=='%'){
1074                         r++;
1075                         if(!isxdigit(r[0]) || !isxdigit(r[1])){
1076                                 werrstr("bad escape sequence '%.3s' in URL", r);
1077                                 return nil;
1078                         }
1079                         if(r[0]=='0' && r[2]=='0'){
1080                                 werrstr("escaped NUL in URL");
1081                                 return nil;
1082                         }
1083                         rune = (dhex(r[0])<<4)|dhex(r[1]);      /* latin1 */
1084                         w += runetochar(w, &rune);
1085                         r += 2;
1086                 }else
1087                         *w++ = *r;
1088         }
1089         *w = '\0';
1090         return s;
1091 }
1092