sys/src/cmd/webfs/url.c

   1 /*
   2  * This is a URL parser, written to parse "Common Internet Scheme" URL
   3  * syntax as described in RFC1738 and updated by RFC2396.  Only absolute URLs
   4  * are supported, using "server-based" naming authorities in the schemes.
   5  * Support for literal IPv6 addresses is included, per RFC2732.
   6  *
   7  * Current "known" schemes: http, ftp, file.
   8  *
   9  * We can do all the parsing operations without Runes since URLs are
  10  * defined to be composed of US-ASCII printable characters.
  11  * See RFC1738, RFC2396.
  12  */
  13
  14 #include <u.h>
  15 #include <libc.h>
  16 #include <ctype.h>
  17 #include <regexp.h>
  18 #include <plumb.h>
  19 #include <thread.h>
  20 #include <fcall.h>
  21 #include <9p.h>
  22 #include "dat.h"
  23 #include "fns.h"
  24
/* Nonzero turns on the diagnostic traces this file prints to fd 2 */
int urldebug;

/* If set, relative paths with leading ".." segments will have them trimmed */
#define RemoveExtraRelDotDots   0
/*
 * If set, a reference to the current document (no scheme, authority,
 * path or query) is still expanded against the base URL by
 * resolve_relative(); if clear, resolve_relative() returns right after
 * marking the URL as UScurrent.
 */
#define ExpandCurrentDocUrls    1
  30
/*
 * Names of the schemes this parser knows about.  ischeme() returns an
 * entry's index as the scheme id and parseurl() uses that id to index
 * postparse[], so the order of the two tables has to agree.
 * Slot 0 is nil and never matches; presumably it lines up with
 * USunknown -- confirm against the enum in dat.h.
 */
static char*
schemestrtab[] =
{
	nil,
	"http",
	"https",
	"ftp",
	"file",
};
  40
  41 static int
  42 ischeme(char *s)
  43 {
  44         int i;
  45
  46         for(i=0; i<nelem(schemestrtab); i++)
  47                 if(schemestrtab[i] && strcmp(s, schemestrtab[i])==0)
  48                         return i;
  49         return USunknown;
  50 }
  51
  52 /*
  53  * URI splitting regexp is from RFC2396, Appendix B:
  54  *              ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
  55  *               12            3  4          5       6  7        8 9
  56  *
  57  * Example: "http://www.ics.uci.edu/pub/ietf/uri/#Related"
  58  * $2 = scheme                  "http"
  59  * $4 = authority               "www.ics.uci.edu"
  60  * $5 = path                    "/pub/ietf/uri/"
  61  * $7 = query                   <undefined>
  62  * $9 = fragment                "Related"
  63  */
  64
  65 /*
  66  * RFC2396, Sec 3.1, contains:
  67  *
  68  * Scheme names consist of a sequence of characters beginning with a
  69  * lower case letter and followed by any combination of lower case
  70  * letters, digits, plus ("+"), period ("."), or hyphen ("-").  For
  71  * resiliency, programs interpreting URI should treat upper case letters
  72  * as equivalent to lower case in scheme names (e.g., allow "HTTP" as
  73  * well as "http").
  74  */
  75
  76 /*
  77  * For server-based naming authorities (RFC2396 Sec 3.2.2):
  78  *    server        = [ [ userinfo "@" ] hostport ]
  79  *    userinfo      = *( unreserved | escaped |
  80  *                      ";" | ":" | "&" | "=" | "+" | "$" | "," )
  81  *    hostport      = host [ ":" port ]
  82  *    host          = hostname | IPv4address
  83  *    hostname      = *( domainlabel "." ) toplabel [ "." ]
  84  *    domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum
  85  *    toplabel      = alpha | alpha *( alphanum | "-" ) alphanum
  86  *    IPv4address   = 1*digit "." 1*digit "." 1*digit "." 1*digit
  87  *    port          = *digit
  88  *
  89  *  The host is a domain name of a network host, or its IPv4 address as a
  90  *  set of four decimal digit groups separated by ".".  Literal IPv6
  91  *  addresses are not supported.
  92  *
  93  * Note that literal IPv6 address support is outlined in RFC2732:
  94  *    host          = hostname | IPv4address | IPv6reference
  95  *    ipv6reference = "[" IPv6address "]"               (RFC2373)
  96  *
  97  * Since hostnames and numbers will have to be resolved by the OS anyway,
  98  * we don't have to parse them too pedantically (counting '.'s, checking
  99  * for well-formed literal IP addresses, etc.).
 100  *
 * In FTP/file paths, we reject most ";param"s and queries.  In HTTP paths,
 * we just pass them through.
 103  *
 104  * Instead of letting a "path" be 0-or-more characters as RFC2396 suggests,
 105  * we'll say it's 1-or-more characters, 0-or-1 times.  This way, an absent
 106  * path yields a nil substring match, instead of an empty one.
 107  *
 108  * We're more restrictive than RFC2396 indicates with "userinfo" strings,
 109  * insisting they have the form "[user[:password]]".  This may need to
 110  * change at some point, however.
 111  */
 112
/*
 * RE character-class components -- these go in brackets.
 * They are string literals meant to be pasted together by adjacent
 * string concatenation; '-' is written as \- so that it stands for
 * itself instead of forming a range.
 */
#define PUNCT                   "\\-_.!~*'()"
#define ALNUM           "a-zA-Z0-9"
#define HEX                     "0-9a-fA-F"
#define UNRES                   ALNUM PUNCT

/* RE components; _N => has N parenthesized subexpressions when expanded */
/* one userinfo character: an unreserved/listed character or a %XX escape */
#define USERINFO_2              "([" UNRES ";:&=+$,]|(%[" HEX "][" HEX "]))"
 121
/* One row of retab[]: a regular expression plus the groups callers read */
typedef struct Retab Retab;
struct Retab
{
	char    *str;		/* regular expression source text */
	Reprog  *prog;		/* compiled form of str; filled in by initurl() */
	int             size;	/* number of '(' in str plus one; filled in by initurl() */
	int             ind[5];	/* subexpression numbers of the pieces to extract */
};
 130
/* Indexes into retab[], plus the capacity used for Resub arrays */
enum
{
	REsplit = 0,	/* whole URI into scheme, authority, path, query, fragment */
	REscheme,	/* syntax check of a scheme name */
	REauthority,	/* authority into userinfo, host, port */
	REhost,		/* regular host name or bracketed literal */
	REuserinfo,	/* userinfo into user and password */
	REftppath,	/* ftp path with optional ";type=" suffix */

	MaxResub=       20,	/* initurl() checks every retab[].size against this */
};
 142
 143 Retab retab[] = /* view in constant width Font */
 144 {
 145 [REsplit]
 146         "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]+)?(\\?([^#]*))?(#(.*))?$", nil, 0,
 147         /* |-scheme-|      |-auth.-|  |path--|    |query|     |--|frag */
 148         {  2,              4,         5,          7,          9},
 149
 150 [REscheme]
 151         "^[a-z][a-z0-9+-.]*$", nil, 0,
 152         { 0, },
 153
 154 [REauthority]
 155         "^(((" USERINFO_2 "*)@)?(((\\[[^\\]@]+\\])|([^:\\[@]+))(:([0-9]*))?)?)?$", nil, 0,
 156         /* |----user info-----|  |--------host----------------|  |-port-| */
 157         {  3,                    7,                              11, },
 158
 159 [REhost]
 160         "^(([a-zA-Z0-9\\-.]+)|(\\[([a-fA-F0-9.:]+)\\]))$", nil, 0,
 161         /* |--regular host--|     |-IPv6 literal-| */
 162         {  2,                     4, },
 163
 164 [REuserinfo]
 165         "^(([^:]*)(:([^:]*))?)$", nil, 0,
 166         /* |user-|  |pass-| */
 167         {  2,       4, },
 168
 169 [REftppath]
 170         "^(.+)(;[tT][yY][pP][eE]=([aAiIdD]))?$", nil, 0,
 171         /*|--|-path              |ftptype-| */
 172         { 1,                     3, },
 173 };
 174
 175 static int
 176 countleftparen(char *s)
 177 {
 178         int n;
 179
 180         n = 0;
 181         for(; *s; s++)
 182                 if(*s == '(')
 183                         n++;
 184         return n;
 185 }
 186
 187 void
 188 initurl(void)
 189 {
 190         int i, j;
 191
 192         for(i=0; i<nelem(retab); i++){
 193                 retab[i].prog = regcomp(retab[i].str);
 194                 if(retab[i].prog == nil)
 195                         sysfatal("recomp(%s): %r", retab[i].str);
 196                 retab[i].size = countleftparen(retab[i].str)+1;
 197                 for(j=0; j<nelem(retab[i].ind); j++)
 198                         if(retab[i].ind[j] >= retab[i].size)
 199                                 sysfatal("bad index in regexp table: retab[%d].ind[%d] = %d >= %d",
 200                                         i, j, retab[i].ind[j], retab[i].size);
 201                 if(MaxResub < retab[i].size)
 202                         sysfatal("MaxResub too small: %d < %d", MaxResub, retab[i].size);
 203         }
 204 }
 205
/*
 * Result of spliturl(): each member brackets one piece of the string
 * that was split, with s and e taken from the sp and ep of the matching
 * regexp subexpression.  The text is not copied, so the pointers are
 * only good while that string is.  Code in this file treats a nil s as
 * "piece absent".
 */
typedef struct SplitUrl SplitUrl;
struct SplitUrl
{
	struct {
		char *s;	/* start of the piece */
		char *e;	/* end of the piece */
	} url, scheme, authority, path, query, fragment;
};
 214
/*
 * Implements the algorithm in RFC2396 sec 5.2 step 6.
 *
 * base is the base URL's path (nil or empty if it has none) and
 * rel_st/rel_len the relative path, which need not be NUL-terminated.
 * The merged, NUL-terminated path is written to dest; nothing is
 * returned.  dest must hold at least strlen(base)+rel_len+2 bytes
 * (one for the '/' supplied when base is empty, one for the NUL).
 */
static void
merge_relative_path(char *base, char *rel_st, int rel_len, char *dest)
{
	char *s, *p, *e, *pdest;

	/* pdest always points at the end of what has been built in dest */
	pdest = dest;

	/* 6a: start with base, discard last segment */
	if(base && base[0]){
		/* Empty paths don't match in our scheme; 'base' should be nil */
		assert(base[0] == '/');
		e = strrchr(base, '/');
		e++;
		memmove(pdest, base, e-base);
		pdest += e-base;
	}else{
		/* Artistic license on my part */
		*pdest++ = '/';
	}

	/* 6b: append relative component */
	if(rel_st){
		memmove(pdest, rel_st, rel_len);
		pdest += rel_len;
	}

	/* 6c: remove any occurrences of "./" as a complete segment */
	s = dest;
	*pdest = '\0';
	while(e = strstr(s, "./")){
		if((e == dest) || (*(e-1) == '/')){
			memmove(e, e+2, pdest+1-(e+2)); /* +1 for NUL */
			pdest -= 2;
		}else
			s = e+1;	/* part of a longer name; keep looking after it */
	}

	/* 6d: remove a trailing "." as a complete segment */
	if(pdest>dest && *(pdest-1)=='.' &&
	  (pdest==dest+1 || *(pdest-2)=='/'))
		*--pdest = '\0';

	/* 6e: remove occurrences of "seg/../", where seg != "..", left->right */
	s = dest+1;
	while(e = strstr(s, "/../")){
		/* back up to the '/' that starts the segment before "/../" */
		p = e - 1;
		while(p >= dest && *p != '/')
			p--;
		if(memcmp(p, "/../", 4) != 0){
			memmove(p+1, e+4, pdest+1-(e+4));
			pdest -= (e+4) - (p+1);
		}else
			s = e+1;	/* preceding segment is itself ".."; leave both */
	}

	/* 6f: remove a trailing "seg/..", where seg isn't ".."  */
	if(pdest-3 > dest && memcmp(pdest-3, "/..", 3)==0){
		p = pdest-3 - 1;
		while(p >= dest && *p != '/')
			p--;
		if(memcmp(p, "/../", 4) != 0){
			pdest = p+1;
			*pdest = '\0';
		}
	}

	/* 6g: leading ".." segments are errors -- we'll just blat them out. */
	if(RemoveExtraRelDotDots){
		p = dest;
		if (p[0] == '/')
			p++;
		s = p;
		while(s[0]=='.' && s[1]=='.' && (s[2]==0 || s[2]=='/'))
			s += 3;
		if(s > p){
			memmove(p, s, pdest+1-s);
			pdest -= s-p;
		}
	}
	USED(pdest);

	if(urldebug)
		fprint(2, "merge_relative_path: '%s' + '%.*s' -> '%s'\n", base, rel_len,
			rel_st, dest);
}
 305
 306 /*
 307  * See RFC2396 sec 5.2 for info on resolving relative URIs to absolute form.
 308  *
 309  * If successful, this just ends up freeing and replacing "u->url".
 310  */
 311 static int
 312 resolve_relative(SplitUrl *su, Url *base, Url *u)
 313 {
 314         char *url, *path;
 315         char *purl, *ppath;
 316         int currentdoc, ulen, plen;
 317
 318         if(base == nil){
 319                 werrstr("relative URI given without base");
 320                 return -1;
 321         }
 322         if(base->scheme == nil){
 323                 werrstr("relative URI given with no scheme");
 324                 return -1;
 325         }
 326         if(base->ischeme == USunknown){
 327                 werrstr("relative URI given with unknown scheme");
 328                 return -1;
 329         }
 330         if(base->ischeme == UScurrent){
 331                 werrstr("relative URI given with incomplete base");
 332                 return -1;
 333         }
 334         assert(su->scheme.s == nil);
 335
 336         /* Sec 5.2 step 2 */
 337         currentdoc = 0;
 338         if(su->path.s==nil && su->scheme.s==nil && su->authority.s==nil && su->query.s==nil){
 339                 /* Reference is to current document */
 340                 if(urldebug)
 341                         fprint(2, "url %s is relative to current document\n", u->url);
 342                 u->ischeme = UScurrent;
 343                 if(!ExpandCurrentDocUrls)
 344                         return 0;
 345                 currentdoc = 1;
 346         }
 347
 348         /* Over-estimate the maximum lengths, for allocation purposes */
 349         /* (constants are for separators) */
 350         plen = 1;
 351         if(base->path)
 352                 plen += strlen(base->path);
 353         if(su->path.s)
 354                 plen += 1 + (su->path.e - su->path.s);
 355
 356         ulen = 0;
 357         ulen += strlen(base->scheme) + 1;
 358         if(su->authority.s)
 359                 ulen += 2 + (su->authority.e - su->authority.s);
 360         else
 361                 ulen += 2 + ((base->authority) ? strlen(base->authority) : 0);
 362         ulen += plen;
 363         if(su->query.s)
 364                 ulen += 1 + (su->query.e - su->query.s);
 365         else if(currentdoc && base->query)
 366                 ulen += 1 + strlen(base->query);
 367         if(su->fragment.s)
 368                 ulen += 1 + (su->fragment.e - su->fragment.s);
 369         else if(currentdoc && base->fragment)
 370                 ulen += 1 + strlen(base->fragment);
 371         url = emalloc(ulen+1);
 372         path = emalloc(plen+1);
 373
 374         url[0] = '\0';
 375         purl = url;
 376         path[0] = '\0';
 377         ppath = path;
 378
 379         if(su->authority.s || (su->path.s && (su->path.s[0] == '/'))){
 380                 /* Is a "network-path" or "absolute-path"; don't merge with base path */
 381                 /* Sec 5.2 steps 4,5 */
 382                 if(su->path.s){
 383                         memmove(ppath, su->path.s, su->path.e - su->path.s);
 384                         ppath += su->path.e - su->path.s;
 385                         *ppath = '\0';
 386                 }
 387         }else if(currentdoc){
 388                 /* Is a current-doc reference; just copy the path from the base URL */
 389                 if(base->path){
 390                         strcpy(ppath, base->path);
 391                         ppath += strlen(ppath);
 392                 }
 393                 USED(ppath);
 394         }else{
 395                 /* Is a relative-path reference; we have to merge it */
 396                 /* Sec 5.2 step 6 */
 397                 merge_relative_path(base->path,
 398                         su->path.s, su->path.e - su->path.s, ppath);
 399         }
 400
 401         /* Build new URL from pieces, inheriting from base where needed */
 402         strcpy(purl, base->scheme);
 403         purl += strlen(purl);
 404         *purl++ = ':';
 405         if(su->authority.s){
 406                 strcpy(purl, "//");
 407                 purl += strlen(purl);
 408                 memmove(purl, su->authority.s, su->authority.e - su->authority.s);
 409                 purl += su->authority.e - su->authority.s;
 410         }else if(base->authority){
 411                 strcpy(purl, "//");
 412                 purl += strlen(purl);
 413                 strcpy(purl, base->authority);
 414                 purl += strlen(purl);
 415         }
 416         assert((path[0] == '\0') || (path[0] == '/'));
 417         strcpy(purl, path);
 418         purl += strlen(purl);
 419
 420         /*
 421          * The query and fragment are not inherited from the base,
 422          * except in case of "current document" URLs, which inherit any query
 423          * and may inherit the fragment.
 424          */
 425         if(su->query.s){
 426                 *purl++ = '?';
 427                 memmove(purl, su->query.s, su->query.e - su->query.s);
 428                 purl += su->query.e - su->query.s;
 429         }else if(currentdoc && base->query){
 430                 *purl++ = '?';
 431                 strcpy(purl, base->query);
 432                 purl += strlen(purl);
 433         }
 434
 435         if(su->fragment.s){
 436                 *purl++ = '#';
 437                 memmove(purl, su->query.s, su->query.e - su->query.s);
 438                 purl += su->fragment.e - su->fragment.s;
 439         }else if(currentdoc && base->fragment){
 440                 *purl++ = '#';
 441                 strcpy(purl, base->fragment);
 442                 purl += strlen(purl);
 443         }
 444         USED(purl);
 445
 446         if(urldebug)
 447                 fprint(2, "resolve_relative: '%s' + '%s' -> '%s'\n", base->url, u->url, url);
 448         free(u->url);
 449         u->url = url;
 450         free(path);
 451         return 0;
 452 }
 453
 454 int
 455 regx(Reprog *prog, char *s, Resub *m, int nm)
 456 {
 457         int i;
 458
 459         if(s == nil)
 460                 s = m[0].sp;    /* why is this necessary? */
 461
 462         i = regexec(prog, s, m, nm);
 463 /*
 464         if(i >= 0)
 465                 for(j=0; j<nm; j++)
 466                         fprint(2, "match%d: %.*s\n", j, utfnlen(m[j].sp, m[j].ep-m[j].sp), m[j].sp);
 467 */
 468         return i;
 469 }
 470
 471 static int
 472 ismatch(int i, char *s, char *desc)
 473 {
 474         Resub m[1];
 475
 476         m[0].sp = m[0].ep = nil;
 477         if(!regx(retab[i].prog, s, m, 1)){
 478                 werrstr("malformed %s: %q", desc, s);
 479                 return 0;
 480         }
 481         return 1;
 482 }
 483
 484 static int
 485 spliturl(char *url, SplitUrl *su)
 486 {
 487         Resub m[MaxResub];
 488         Retab *t;
 489
 490         /*
 491          * Newlines are not valid in a URI, but regexp(2) treats them specially
 492          * so it's best to make sure there are none before proceeding.
 493          */
 494         if(strchr(url, '\n')){
 495                 werrstr("newline in URI");
 496                 return -1;
 497         }
 498
 499         m[0].sp = m[0].ep = nil;
 500         t = &retab[REsplit];
 501         if(!regx(t->prog, url, m, t->size)){
 502                 werrstr("malformed URI: %q", url);
 503                 return -1;
 504         }
 505
 506         su->url.s = m[0].sp;
 507         su->url.e = m[0].ep;
 508         su->scheme.s = m[t->ind[0]].sp;
 509         su->scheme.e = m[t->ind[0]].ep;
 510         su->authority.s = m[t->ind[1]].sp;
 511         su->authority.e = m[t->ind[1]].ep;
 512         su->path.s = m[t->ind[2]].sp;
 513         su->path.e = m[t->ind[2]].ep;
 514         su->query.s = m[t->ind[3]].sp;
 515         su->query.e = m[t->ind[3]].ep;
 516         su->fragment.s = m[t->ind[4]].sp;
 517         su->fragment.e = m[t->ind[4]].ep;
 518
 519         if(urldebug)
 520                 fprint(2, "split url %s into %.*q %.*q %.*q %.*q %.*q %.*q\n",
 521                         url,
 522                         su->url.s ? utfnlen(su->url.s, su->url.e-su->url.s) : 10, su->url.s ? su->url.s : "",
 523                         su->scheme.s ? utfnlen(su->scheme.s, su->scheme.e-su->scheme.s) : 10, su->scheme.s ? su->scheme.s : "",
 524                         su->authority.s ? utfnlen(su->authority.s, su->authority.e-su->authority.s) : 10, su->authority.s ? su->authority.s : "",
 525                         su->path.s ? utfnlen(su->path.s, su->path.e-su->path.s) : 10, su->path.s ? su->path.s : "",
 526                         su->query.s ? utfnlen(su->query.s, su->query.e-su->query.s) : 10, su->query.s ? su->query.s : "",
 527                         su->fragment.s ? utfnlen(su->fragment.s, su->fragment.e-su->fragment.s) : 10, su->fragment.s ? su->fragment.s : "");
 528
 529         return 0;
 530 }
 531
 532 static int
 533 parse_scheme(SplitUrl *su, Url *u)
 534 {
 535         if(su->scheme.s == nil){
 536                 werrstr("missing scheme");
 537                 return -1;
 538         }
 539         u->scheme = estredup(su->scheme.s, su->scheme.e);
 540         strlower(u->scheme);
 541
 542         if(!ismatch(REscheme, u->scheme, "scheme"))
 543                 return -1;
 544
 545         u->ischeme = ischeme(u->scheme);
 546         if(urldebug)
 547                 fprint(2, "parse_scheme %s => %d\n", u->scheme, u->ischeme);
 548         return 0;
 549 }
 550
 551 static int
 552 parse_unknown_part(SplitUrl *su, Url *u)
 553 {
 554         char *s, *e;
 555
 556         assert(u->ischeme == USunknown);
 557         assert(su->scheme.e[0] == ':');
 558
 559         s = su->scheme.e+1;
 560         if(su->fragment.s){
 561                 e = su->fragment.s-1;
 562                 assert(*e == '#');
 563         }else
 564                 e = s+strlen(s);
 565
 566         u->schemedata = estredup(s, e);
 567         return 0;
 568 }
 569
 570 static int
 571 parse_userinfo(char *s, char *e, Url *u)
 572 {
 573         Resub m[MaxResub];
 574         Retab *t;
 575
 576         m[0].sp = s;
 577         m[0].ep = e;
 578         t = &retab[REuserinfo];
 579         if(!regx(t->prog, nil, m, t->size)){
 580                 werrstr("malformed userinfo: %.*q", utfnlen(s, e-s), s);
 581                 return -1;
 582         }
 583         if(m[t->ind[0]].sp)
 584                 u->user = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
 585         if(m[t->ind[1]].sp)
 586                 u->user = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
 587         return 0;
 588 }
 589
 590 static int
 591 parse_host(char *s, char *e, Url *u)
 592 {
 593         Resub m[MaxResub];
 594         Retab *t;
 595
 596         m[0].sp = s;
 597         m[0].ep = e;
 598         t = &retab[REhost];
 599         if(!regx(t->prog, nil, m, t->size)){
 600                 werrstr("malformed host: %.*q", utfnlen(s, e-s), s);
 601                 return -1;
 602         }
 603
 604         assert(m[t->ind[0]].sp || m[t->ind[1]].sp);
 605
 606         if(m[t->ind[0]].sp)     /* regular */
 607                 u->host = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
 608         else
 609                 u->host = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
 610         return 0;
 611 }
 612
 613 static int
 614 parse_authority(SplitUrl *su, Url *u)
 615 {
 616         Resub m[MaxResub];
 617         Retab *t;
 618         char *host;
 619         char *userinfo;
 620
 621         if(su->authority.s == nil)
 622                 return 0;
 623
 624         u->authority = estredup(su->authority.s, su->authority.e);
 625         m[0].sp = m[0].ep = nil;
 626         t = &retab[REauthority];
 627         if(!regx(t->prog, u->authority, m, t->size)){
 628                 werrstr("malformed authority: %q", u->authority);
 629                 return -1;
 630         }
 631
 632         if(m[t->ind[0]].sp)
 633                 if(parse_userinfo(m[t->ind[0]].sp, m[t->ind[0]].ep, u) < 0)
 634                         return -1;
 635         if(m[t->ind[1]].sp)
 636                 if(parse_host(m[t->ind[1]].sp, m[t->ind[1]].ep, u) < 0)
 637                         return -1;
 638         if(m[t->ind[2]].sp)
 639                 u->port = estredup(m[t->ind[2]].sp, m[t->ind[2]].ep);
 640
 641
 642         if(urldebug > 0){
 643                 userinfo = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
 644                 host = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
 645                 fprint(2, "port: %q, authority %q\n", u->port, u->authority);
 646                 fprint(2, "host %q, userinfo %q\n", host, userinfo);
 647                 free(host);
 648                 free(userinfo);
 649         }
 650         return 0;
 651 }
 652
 653 static int
 654 parse_abspath(SplitUrl *su, Url *u)
 655 {
 656         char *s;
 657
 658         if(su->path.s == nil)
 659                 return 0;
 660         s = estredup(su->path.s, su->path.e);
 661         u->path = unescapeurl(s, "/");
 662         free(s);
 663         return 0;
 664 }
 665
 666 static int
 667 parse_query(SplitUrl *su, Url *u)
 668 {
 669         char *s;
 670
 671         if(su->query.s == nil)
 672                 return 0;
 673         s = estredup(su->query.s, su->query.e);
 674         u->query = unescapeurl(s, "&=");
 675         free(s);
 676         return 0;
 677 }
 678
 679 static int
 680 parse_fragment(SplitUrl *su, Url *u)
 681 {
 682         char *s;
 683
 684         if(su->fragment.s == nil)
 685                 return 0;
 686         s = estredup(su->fragment.s, su->fragment.e);
 687         u->fragment = unescapeurl(s, "");
 688         free(s);
 689         return 0;
 690 }
 691
 692 static int
 693 postparse_http(Url *u)
 694 {
 695         char *p, *q;
 696
 697         u->open = httpopen;
 698         u->read = httpread;
 699         u->close = httpclose;
 700
 701         if(u->authority==nil){
 702                 werrstr("missing authority (hostname, port, etc.)");
 703                 return -1;
 704         }
 705         if(u->host == nil){
 706                 werrstr("missing host specification");
 707                 return -1;
 708         }
 709
 710         if(u->path == nil){
 711                 u->http.page_spec = estrdup("/");
 712                 return 0;
 713         }
 714         p = escapeurl(u->path, " \"<>#%\\");
 715         if(u->query){
 716                 q = escapeurl(u->query, " \"<>#%\\");
 717                 u->http.page_spec = emalloc(strlen(p)+1+strlen(q)+1);
 718                 strcpy(u->http.page_spec, p);
 719                 strcat(u->http.page_spec, "?");
 720                 strcat(u->http.page_spec, q);
 721                 free(q);
 722                 free(p);
 723         }else
 724                 u->http.page_spec = p;
 725         return 0;
 726 }
 727
 728 static int
 729 postparse_ftp(Url *u)
 730 {
 731         Resub m[MaxResub];
 732         Retab *t;
 733
 734         if(u->authority==nil){
 735                 werrstr("missing authority (hostname, port, etc.)");
 736                 return -1;
 737         }
 738         if(u->query){
 739                 werrstr("unexpected \"?query\" in ftp path");
 740                 return -1;
 741         }
 742         if(u->host == nil){
 743                 werrstr("missing host specification");
 744                 return -1;
 745         }
 746
 747         if(u->path == nil){
 748                 u->ftp.path_spec = estrdup("/");
 749                 return 0;
 750         }
 751
 752         m[0].sp = m[0].ep = nil;
 753         t = &retab[REftppath];
 754         if(!regx(t->prog, u->path, m, t->size)){
 755                 werrstr("malformed ftp path: %q", u->path);
 756                 return -1;
 757         }
 758
 759         if(m[t->ind[0]].sp){
 760                 u->ftp.path_spec = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
 761                 if(strchr(u->ftp.path_spec, ';')){
 762                         werrstr("unexpected \";param\" in ftp path");
 763                         return -1;
 764                 }
 765         }else
 766                 u->ftp.path_spec = estrdup("/");
 767
 768         if(m[t->ind[1]].sp){
 769                 u->ftp.type = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
 770                 strlower(u->ftp.type);
 771         }
 772         return 0;
 773 }
 774
 775 static int
 776 postparse_file(Url *u)
 777 {
 778         if(u->user || u->passwd){
 779                 werrstr("user information not valid with file scheme");
 780                 return -1;
 781         }
 782         if(u->query){
 783                 werrstr("unexpected \"?query\" in file path");
 784                 return -1;
 785         }
 786         if(u->port){
 787                 werrstr("port not valid with file scheme");
 788                 return -1;
 789         }
 790         if(u->path == nil){
 791                 werrstr("missing path in file scheme");
 792                 return -1;
 793         }
 794         if(strchr(u->path, ';')){
 795                 werrstr("unexpected \";param\" in file path");
 796                 return -1;
 797         }
 798
 799         /* "localhost" is equivalent to no host spec, we'll chose the latter */
 800         if(u->host && cistrcmp(u->host, "localhost") == 0){
 801                 free(u->host);
 802                 u->host = nil;
 803         }
 804         return 0;
 805 }
 806
/*
 * Scheme-specific checks that parseurl() runs after the generic
 * parsing, indexed by Url.ischeme; a nil entry means nothing to run.
 * The entries follow the order of schemestrtab[]:
 * unused, http, https, ftp, file.
 */
static int (*postparse[])(Url*) = {
	nil,
	postparse_http,
	postparse_http,
	postparse_ftp,
	postparse_file,
};
 814
 815 Url*
 816 parseurl(char *url, Url *base)
 817 {
 818         Url *u;
 819         SplitUrl su;
 820
 821         if(urldebug)
 822                 fprint(2, "parseurl %s with base %s\n", url, base ? base->url : "<none>");
 823
 824         u = emalloc(sizeof(Url));
 825         u->url = estrdup(url);
 826         if(spliturl(u->url, &su) < 0){
 827         Fail:
 828                 freeurl(u);
 829                 return nil;
 830         }
 831
 832         /* RFC2396 sec 3.1 says relative URIs are distinguished by absent scheme */
 833         if(su.scheme.s==nil){
 834                 if(urldebug)
 835                         fprint(2, "parseurl has nil scheme\n");
 836                 if(resolve_relative(&su, base, u) < 0 || spliturl(u->url, &su) < 0)
 837                         goto Fail;
 838                 if(u->ischeme == UScurrent){
 839                         /* 'u.url' refers to current document; set fragment and return */
 840                         if(parse_fragment(&su, u) < 0)
 841                                 goto Fail;
 842                         goto Done;
 843                 }
 844         }
 845
 846         if(parse_scheme(&su, u) < 0
 847         || parse_fragment(&su, u) < 0)
 848                 goto Fail;
 849
 850         if(u->ischeme == USunknown){
 851                 if(parse_unknown_part(&su, u) < 0)
 852                         goto Fail;
 853                 goto Done;
 854         }
 855
 856         if(parse_query(&su, u) < 0
 857         || parse_authority(&su, u) < 0
 858         || parse_abspath(&su, u) < 0)
 859                 goto Fail;
 860
 861         if(u->ischeme < nelem(postparse) && postparse[u->ischeme])
 862                 if((*postparse[u->ischeme])(u) < 0)
 863                         goto Fail;
 864
 865 Done:
 866         setmalloctag(u, getcallerpc(&url));
 867         rewriteurl(u);
 868         return u;
 869 }
 870
 871 void
 872 freeurl(Url *u)
 873 {
 874         if(u == nil)
 875                 return;
 876         free(u->url);
 877         free(u->scheme);
 878         free(u->schemedata);
 879         free(u->authority);
 880         free(u->user);
 881         free(u->passwd);
 882         free(u->host);
 883         free(u->port);
 884         free(u->path);
 885         free(u->query);
 886         free(u->fragment);
 887         switch(u->ischeme){
 888         case UShttp:
 889                 free(u->http.page_spec);
 890                 break;
 891         case USftp:
 892                 free(u->ftp.path_spec);
 893                 free(u->ftp.type);
 894                 break;
 895         }
 896         free(u);
 897 }
 898
 899 void
 900 rewriteurl(Url *u)
 901 {
 902         char *s;
 903
 904         if(u->schemedata)
 905                 s = estrmanydup(u->scheme, ":", u->schemedata, nil);
 906         else
 907                 s = estrmanydup(u->scheme, "://",
 908                         u->user ? u->user : "",
 909                         u->passwd ? ":" : "", u->passwd ? u->passwd : "",
 910                         u->user ? "@" : "", u->host ? u->host : "",
 911                         u->port ? ":" : "", u->port ? u->port : "",
 912                         u->path,
 913                         u->query ? "?" : "", u->query ? u->query : "",
 914                         u->fragment ? "#" : "", u->fragment ? u->fragment : "",
 915                         nil);
 916         free(u->url);
 917         u->url = s;
 918 }
 919
 920 int
 921 seturlquery(Url *u, char *query)
 922 {
 923         if(query == nil){
 924                 free(u->query);
 925                 u->query = nil;
 926                 return 0;
 927         }
 928         free(u->query);
 929         u->query = unescapeurl(query, "&=");
 930         return 0;
 931 }
 932
 933 static void
 934 dupp(char **p)
 935 {
 936         if(*p)
 937                 *p = estrdup(*p);
 938 }
 939
 940 Url*
 941 copyurl(Url *u)
 942 {
 943         Url *v;
 944
 945         v = emalloc(sizeof(Url));
 946         *v = *u;
 947         dupp(&v->url);
 948         dupp(&v->scheme);
 949         dupp(&v->schemedata);
 950         dupp(&v->authority);
 951         dupp(&v->user);
 952         dupp(&v->passwd);
 953         dupp(&v->host);
 954         dupp(&v->port);
 955         dupp(&v->path);
 956         dupp(&v->query);
 957         dupp(&v->fragment);
 958
 959         switch(v->ischeme){
 960         case UShttp:
 961                 dupp(&v->http.page_spec);
 962                 break;
 963         case USftp:
 964                 dupp(&v->ftp.path_spec);
 965                 dupp(&v->ftp.type);
 966                 break;
 967         }
 968         return v;
 969 }
 970
 971 static int
 972 dhex(char c)
 973 {
 974         if('0' <= c && c <= '9')
 975                 return c-'0';
 976         if('a' <= c && c <= 'f')
 977                 return c-'a'+10;
 978         if('A' <= c && c <= 'F')
 979                 return c-'A'+10;
 980         return 0;
 981 }
 982
 983 char*
 984 escapeurl(char *s, char *special)
 985 {
 986         int n;
 987         char *t, *u;
 988         static char *hex = "0123456789abcdef";
 989
 990         n = 0;
 991         for(t=s; *t; t++)
 992                 if(*t <= 0x1F || *t >= 0x7F || strchr(special, *t))
 993                         n++;
 994         u = emalloc(strlen(s)+2*n+1);
 995         t = u;
 996         for(; *s; s++){
 997                 if(s[0] == '%' && isxdigit(s[1]) && isxdigit(s[2]))
 998                         *u++ = *s;
 999                 else if(*s <= 0x1F || *s >= 0x7F || strchr(special, *s)){
1000                         *u++ = '%';
1001                         *u++ = hex[(*s>>4)&0xF];
1002                         *u++ = hex[*s&0xF];
1003                 }else
1004                         *u++ = *s;
1005         }
1006         *u = '\0';
1007         return t;
1008 }
1009
/*
 * Return a newly allocated copy of s with its %XX escapes decoded.
 * An escape is left exactly as written when it would decode to NUL or
 * to a printable ASCII character listed in special.
 * The caller frees the result.
 */
char*
unescapeurl(char *s, char *special)
{
	char *buf, *src, *dst, c;

	buf = estrdup(s);
	/* decode in place: the output never gets ahead of the input */
	src = dst = buf;
	while((c = *src) != '\0'){
		if(c == '%' && isxdigit(src[1]) && isxdigit(src[2])){
			c = (dhex(src[1])<<4)|dhex(src[2]);
			if(c != 0 && !(c > 0x1F && c < 0x7F && strchr(special, c)))
				src += 2;	/* decoded: skip the two digits */
			else
				c = *src;	/* keep the '%'; the digits follow as plain text */
		}
		*dst++ = c;
		src++;
	}
	*dst = '\0';
	return buf;
}
1029