1 /* 2 * This is a URL parser, written to parse "Common Internet Scheme" URL 3 * syntax as described in RFC1738 and updated by RFC2396. Only absolute URLs 4 * are supported, using "server-based" naming authorities in the schemes. 5 * Support for literal IPv6 addresses is included, per RFC2732. 6 * 7 * Current "known" schemes: http, ftp, file. 8 * 9 * We can do all the parsing operations without Runes since URLs are 10 * defined to be composed of US-ASCII printable characters. 11 * See RFC1738, RFC2396. 12 */ 13 14 #include <u.h> 15 #include <libc.h> 16 #include <ctype.h> 17 #include <regexp.h> 18 #include <plumb.h> 19 #include <thread.h> 20 #include <fcall.h> 21 #include <9p.h> 22 #include "dat.h" 23 #include "fns.h" 24 25 int urldebug; 26 27 /* If set, relative paths with leading ".." segments will have them trimmed */ 28 #define RemoveExtraRelDotDots 0 29 #define ExpandCurrentDocUrls 1 30 31 static char* 32 schemestrtab[] = 33 { 34 nil, 35 "http", 36 "https", 37 "ftp", 38 "file", 39 }; 40 41 static int 42 ischeme(char *s) 43 { 44 int i; 45 46 for(i=0; i<nelem(schemestrtab); i++) 47 if(schemestrtab[i] && strcmp(s, schemestrtab[i])==0) 48 return i; 49 return USunknown; 50 } 51 52 /* 53 * URI splitting regexp is from RFC2396, Appendix B: 54 * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? 55 * 12 3 4 5 6 7 8 9 56 * 57 * Example: "http://www.ics.uci.edu/pub/ietf/uri/#Related" 58 * $2 = scheme "http" 59 * $4 = authority "www.ics.uci.edu" 60 * $5 = path "/pub/ietf/uri/" 61 * $7 = query <undefined> 62 * $9 = fragment "Related" 63 */ 64 65 /* 66 * RFC2396, Sec 3.1, contains: 67 * 68 * Scheme names consist of a sequence of characters beginning with a 69 * lower case letter and followed by any combination of lower case 70 * letters, digits, plus ("+"), period ("."), or hyphen ("-"). For 71 * resiliency, programs interpreting URI should treat upper case letters 72 * as equivalent to lower case in scheme names (e.g., allow "HTTP" as 73 * well as "http"). 74 */ 75 76 /* 77 * For server-based naming authorities (RFC2396 Sec 3.2.2): 78 * server = [ [ userinfo "@" ] hostport ] 79 * userinfo = *( unreserved | escaped | 80 * ";" | ":" | "&" | "=" | "+" | "$" | "," ) 81 * hostport = host [ ":" port ] 82 * host = hostname | IPv4address 83 * hostname = *( domainlabel "." ) toplabel [ "." ] 84 * domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum 85 * toplabel = alpha | alpha *( alphanum | "-" ) alphanum 86 * IPv4address = 1*digit "." 1*digit "." 1*digit "." 1*digit 87 * port = *digit 88 * 89 * The host is a domain name of a network host, or its IPv4 address as a 90 * set of four decimal digit groups separated by ".". Literal IPv6 91 * addresses are not supported. 92 * 93 * Note that literal IPv6 address support is outlined in RFC2732: 94 * host = hostname | IPv4address | IPv6reference 95 * ipv6reference = "[" IPv6address "]" (RFC2373) 96 * 97 * Since hostnames and numbers will have to be resolved by the OS anyway, 98 * we don't have to parse them too pedantically (counting '.'s, checking 99 * for well-formed literal IP addresses, etc.). 100 * 101 * In FTP/file paths, we reject most ";param"s and querys. In HTTP paths, 102 * we just pass them through. 103 * 104 * Instead of letting a "path" be 0-or-more characters as RFC2396 suggests, 105 * we'll say it's 1-or-more characters, 0-or-1 times. This way, an absent 106 * path yields a nil substring match, instead of an empty one. 107 * 108 * We're more restrictive than RFC2396 indicates with "userinfo" strings, 109 * insisting they have the form "[user[:password]]". This may need to 110 * change at some point, however. 111 */ 112 113 /* RE character-class components -- these go in brackets */ 114 #define PUNCT "\\-_.!~*'()" 115 #define RES ";/?:@&=+$," 116 #define ALNUM "a-zA-Z0-9" 117 #define HEX "0-9a-fA-F" 118 #define UNRES ALNUM PUNCT 119 120 /* RE components; _N => has N parenthesized subexpressions when expanded */ 121 #define ESCAPED_1 "(%[" HEX "][" HEX "])" 122 #define URIC_2 "([" RES UNRES "]|" ESCAPED_1 ")" 123 #define URICNOSLASH_2 "([" UNRES ";?:@&=+$,]|" ESCAPED_1 ")" 124 #define USERINFO_2 "([" UNRES ";:&=+$,]|" ESCAPED_1 ")" 125 #define PCHAR_2 "([" UNRES ":@&=+$,]|" ESCAPED_1 ")" 126 #define PSEGCHAR_3 "([/;]|" PCHAR_2 ")" 127 128 typedef struct Retab Retab; 129 struct Retab 130 { 131 char *str; 132 Reprog *prog; 133 int size; 134 int ind[5]; 135 }; 136 137 enum 138 { 139 REsplit = 0, 140 REscheme, 141 REunknowndata, 142 REauthority, 143 REhost, 144 REuserinfo, 145 REabspath, 146 REquery, 147 REfragment, 148 REhttppath, 149 REftppath, 150 REfilepath, 151 152 MaxResub= 20, 153 }; 154 155 Retab retab[] = /* view in constant width Font */ 156 { 157 [REsplit] 158 "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]+)?(\\?([^#]*))?(#(.*))?$", nil, 0, 159 /* |-scheme-| |-auth.-| |path--| |query| |--|frag */ 160 { 2, 4, 5, 7, 9}, 161 162 [REscheme] 163 "^[a-z][a-z0-9+-.]*$", nil, 0, 164 { 0, }, 165 166 [REunknowndata] 167 "^" URICNOSLASH_2 URIC_2 "*$", nil, 0, 168 { 0, }, 169 170 [REauthority] 171 "^(((" USERINFO_2 "*)@)?(((\\[[^\\]@]+\\])|([^:\\[@]+))(:([0-9]*))?)?)?$", nil, 0, 172 /* |----user info-----| |--------host----------------| |-port-| */ 173 { 3, 7, 11, }, 174 175 [REhost] 176 "^(([a-zA-Z0-9\\-.]+)|(\\[([a-fA-F0-9.:]+)\\]))$", nil, 0, 177 /* |--regular host--| |-IPv6 literal-| */ 178 { 2, 4, }, 179 180 [REuserinfo] 181 "^(([^:]*)(:([^:]*))?)$", nil, 0, 182 /* |user-| |pass-| */ 183 { 2, 4, }, 184 185 [REabspath] 186 "^/" PSEGCHAR_3 "*$", nil, 0, 187 { 0, }, 188 189 [REquery] 190 "^" URIC_2 "*$", nil, 0, 191 { 0, }, 192 193 [REfragment] 194 "^" URIC_2 "*$", nil, 0, 195 { 0, }, 196 197 [REhttppath] 198 "^.*$", nil, 0, 199 { 0, }, 200 201 [REftppath] 202 "^(.+)(;[tT][yY][pP][eE]=([aAiIdD]))?$", nil, 0, 203 /*|--|-path |ftptype-| */ 204 { 1, 3, }, 205 206 [REfilepath] 207 "^.*$", nil, 0, 208 { 0, }, 209 }; 210 211 static int 212 countleftparen(char *s) 213 { 214 int n; 215 216 n = 0; 217 for(; *s; s++) 218 if(*s == '(') 219 n++; 220 return n; 221 } 222 223 void 224 initurl(void) 225 { 226 int i, j; 227 228 for(i=0; i<nelem(retab); i++){ 229 retab[i].prog = regcomp(retab[i].str); 230 if(retab[i].prog == nil) 231 sysfatal("recomp(%s): %r", retab[i].str); 232 retab[i].size = countleftparen(retab[i].str)+1; 233 for(j=0; j<nelem(retab[i].ind); j++) 234 if(retab[i].ind[j] >= retab[i].size) 235 sysfatal("bad index in regexp table: retab[%d].ind[%d] = %d >= %d", 236 i, j, retab[i].ind[j], retab[i].size); 237 if(MaxResub < retab[i].size) 238 sysfatal("MaxResub too small: %d < %d", MaxResub, retab[i].size); 239 } 240 } 241 242 typedef struct SplitUrl SplitUrl; 243 struct SplitUrl 244 { 245 struct { 246 char *s; 247 char *e; 248 } url, scheme, authority, path, query, fragment; 249 }; 250 251 /* 252 * Implements the algorithm in RFC2396 sec 5.2 step 6. 253 * Returns number of chars written, excluding NUL terminator. 254 * dest is known to be >= strlen(base)+rel_len. 255 */ 256 static void 257 merge_relative_path(char *base, char *rel_st, int rel_len, char *dest) 258 { 259 char *s, *p, *e, *pdest; 260 261 pdest = dest; 262 263 /* 6a: start with base, discard last segment */ 264 if(base){ 265 /* Empty paths don't match in our scheme; 'base' should be nil */ 266 assert(base[0] == '/'); 267 e = strrchr(base, '/'); 268 e++; 269 memmove(pdest, base, e-base); 270 pdest += e-base; 271 }else{ 272 /* Artistic license on my part */ 273 *pdest++ = '/'; 274 } 275 276 /* 6b: append relative component */ 277 if(rel_st){ 278 memmove(pdest, rel_st, rel_len); 279 pdest += rel_len; 280 } 281 282 /* 6c: remove any occurrences of "./" as a complete segment */ 283 s = dest; 284 *pdest = '\0'; 285 while(e = strstr(s, "./")){ 286 if((e == dest) || (*(e-1) == '/')){ 287 memmove(e, e+2, pdest+1-(e+2)); /* +1 for NUL */ 288 pdest -= 2; 289 }else 290 s = e+1; 291 } 292 293 /* 6d: remove a trailing "." as a complete segment */ 294 if(pdest>dest && *(pdest-1)=='.' && 295 (pdest==dest+1 || *(pdest-2)=='/')) 296 *--pdest = '\0'; 297 298 /* 6e: remove occurences of "seg/../", where seg != "..", left->right */ 299 s = dest+1; 300 while(e = strstr(s, "/../")){ 301 p = e - 1; 302 while(p >= dest && *p != '/') 303 p--; 304 if(memcmp(p, "/../", 4) != 0){ 305 memmove(p+1, e+4, pdest+1-(e+4)); 306 pdest -= (e+4) - (p+1); 307 }else 308 s = e+1; 309 } 310 311 /* 6f: remove a trailing "seg/..", where seg isn't ".." */ 312 if(pdest-3 > dest && memcmp(pdest-3, "/..", 3)==0){ 313 p = pdest-3 - 1; 314 while(p >= dest && *p != '/') 315 p--; 316 if(memcmp(p, "/../", 4) != 0){ 317 pdest = p+1; 318 *pdest = '\0'; 319 } 320 } 321 322 /* 6g: leading ".." segments are errors -- we'll just blat them out. */ 323 if(RemoveExtraRelDotDots){ 324 p = dest; 325 if (p[0] == '/') 326 p++; 327 s = p; 328 while(s[0]=='.' && s[1]=='.' && (s[2]==0 || s[2]=='/')) 329 s += 3; 330 if(s > p){ 331 memmove(p, s, pdest+1-s); 332 pdest -= s-p; 333 } 334 } 335 USED(pdest); 336 337 if(urldebug) 338 fprint(2, "merge_relative_path: '%s' + '%.*s' -> '%s'\n", base, rel_len, 339 rel_st, dest); 340 } 341 342 /* 343 * See RFC2396 sec 5.2 for info on resolving relative URIs to absolute form. 344 * 345 * If successful, this just ends up freeing and replacing "u->url". 346 */ 347 static int 348 resolve_relative(SplitUrl *su, Url *base, Url *u) 349 { 350 char *url, *path; 351 char *purl, *ppath; 352 int currentdoc, ulen, plen; 353 354 if(base == nil){ 355 werrstr("relative URI given without base"); 356 return -1; 357 } 358 if(base->scheme == nil){ 359 werrstr("relative URI given with no scheme"); 360 return -1; 361 } 362 if(base->ischeme == USunknown){ 363 werrstr("relative URI given with unknown scheme"); 364 return -1; 365 } 366 if(base->ischeme == UScurrent){ 367 werrstr("relative URI given with incomplete base"); 368 return -1; 369 } 370 assert(su->scheme.s == nil); 371 372 /* Sec 5.2 step 2 */ 373 currentdoc = 0; 374 if(su->path.s==nil && su->scheme.s==nil && su->authority.s==nil && su->query.s==nil){ 375 /* Reference is to current document */ 376 if(urldebug) 377 fprint(2, "url %s is relative to current document\n", u->url); 378 u->ischeme = UScurrent; 379 if(!ExpandCurrentDocUrls) 380 return 0; 381 currentdoc = 1; 382 } 383 384 /* Over-estimate the maximum lengths, for allocation purposes */ 385 /* (constants are for separators) */ 386 plen = 1; 387 if(base->path) 388 plen += strlen(base->path); 389 if(su->path.s) 390 plen += 1 + (su->path.e - su->path.s); 391 392 ulen = 0; 393 ulen += strlen(base->scheme) + 1; 394 if(su->authority.s) 395 ulen += 2 + (su->authority.e - su->authority.s); 396 else 397 ulen += 2 + ((base->authority) ? strlen(base->authority) : 0); 398 ulen += plen; 399 if(su->query.s) 400 ulen += 1 + (su->query.e - su->query.s); 401 else if(currentdoc && base->query) 402 ulen += 1 + strlen(base->query); 403 if(su->fragment.s) 404 ulen += 1 + (su->fragment.e - su->fragment.s); 405 else if(currentdoc && base->fragment) 406 ulen += 1 + strlen(base->fragment); 407 url = emalloc(ulen+1); 408 path = emalloc(plen+1); 409 410 url[0] = '\0'; 411 purl = url; 412 path[0] = '\0'; 413 ppath = path; 414 415 if(su->authority.s || (su->path.s && (su->path.s[0] == '/'))){ 416 /* Is a "network-path" or "absolute-path"; don't merge with base path */ 417 /* Sec 5.2 steps 4,5 */ 418 if(su->path.s){ 419 memmove(ppath, su->path.s, su->path.e - su->path.s); 420 ppath += su->path.e - su->path.s; 421 *ppath = '\0'; 422 } 423 }else if(currentdoc){ 424 /* Is a current-doc reference; just copy the path from the base URL */ 425 if(base->path){ 426 strcpy(ppath, base->path); 427 ppath += strlen(ppath); 428 } 429 USED(ppath); 430 }else{ 431 /* Is a relative-path reference; we have to merge it */ 432 /* Sec 5.2 step 6 */ 433 merge_relative_path(base->path, 434 su->path.s, su->path.e - su->path.s, ppath); 435 } 436 437 /* Build new URL from pieces, inheriting from base where needed */ 438 strcpy(purl, base->scheme); 439 purl += strlen(purl); 440 *purl++ = ':'; 441 if(su->authority.s){ 442 strcpy(purl, "//"); 443 purl += strlen(purl); 444 memmove(purl, su->authority.s, su->authority.e - su->authority.s); 445 purl += su->authority.e - su->authority.s; 446 }else if(base->authority){ 447 strcpy(purl, "//"); 448 purl += strlen(purl); 449 strcpy(purl, base->authority); 450 purl += strlen(purl); 451 } 452 assert((path[0] == '\0') || (path[0] == '/')); 453 strcpy(purl, path); 454 purl += strlen(purl); 455 456 /* 457 * The query and fragment are not inherited from the base, 458 * except in case of "current document" URLs, which inherit any query 459 * and may inherit the fragment. 460 */ 461 if(su->query.s){ 462 *purl++ = '?'; 463 memmove(purl, su->query.s, su->query.e - su->query.s); 464 purl += su->query.e - su->query.s; 465 }else if(currentdoc && base->query){ 466 *purl++ = '?'; 467 strcpy(purl, base->query); 468 purl += strlen(purl); 469 } 470 471 if(su->fragment.s){ 472 *purl++ = '#'; 473 memmove(purl, su->query.s, su->query.e - su->query.s); 474 purl += su->fragment.e - su->fragment.s; 475 }else if(currentdoc && base->fragment){ 476 *purl++ = '#'; 477 strcpy(purl, base->fragment); 478 purl += strlen(purl); 479 } 480 USED(purl); 481 482 if(urldebug) 483 fprint(2, "resolve_relative: '%s' + '%s' -> '%s'\n", base->url, u->url, url); 484 free(u->url); 485 u->url = url; 486 free(path); 487 return 0; 488 } 489 490 int 491 regx(Reprog *prog, char *s, Resub *m, int nm) 492 { 493 int i; 494 495 if(s == nil) 496 s = m[0].sp; /* why is this necessary? */ 497 498 i = regexec(prog, s, m, nm); 499 /* 500 if(i >= 0) 501 for(j=0; j<nm; j++) 502 fprint(2, "match%d: %.*s\n", j, utfnlen(m[j].sp, m[j].ep-m[j].sp), m[j].sp); 503 */ 504 return i; 505 } 506 507 static int 508 ismatch(int i, char *s, char *desc) 509 { 510 Resub m[1]; 511 512 m[0].sp = m[0].ep = nil; 513 if(!regx(retab[i].prog, s, m, 1)){ 514 werrstr("malformed %s: %q", desc, s); 515 return 0; 516 } 517 return 1; 518 } 519 520 static int 521 spliturl(char *url, SplitUrl *su) 522 { 523 Resub m[MaxResub]; 524 Retab *t; 525 526 /* 527 * Newlines are not valid in a URI, but regexp(2) treats them specially 528 * so it's best to make sure there are none before proceeding. 529 */ 530 if(strchr(url, '\n')){ 531 werrstr("newline in URI"); 532 return -1; 533 } 534 535 /* 536 * Because we use NUL-terminated strings, as do many client and server 537 * implementations, an escaped NUL ("%00") will quite likely cause problems 538 * when unescaped. We can check for such a sequence once before examining 539 * the components because, per RFC2396 sec. 2.4.1 - 2.4.2, '%' is reserved 540 * in URIs to _always_ indicate escape sequences. Something like "%2500" 541 * will still get by, but that's legitimate, and if it ends up causing 542 * a NUL then someone is unescaping too many times. 543 */ 544 if(strstr(url, "%00")){ 545 werrstr("escaped NUL in URI"); 546 return -1; 547 } 548 549 m[0].sp = m[0].ep = nil; 550 t = &retab[REsplit]; 551 if(!regx(t->prog, url, m, t->size)){ 552 werrstr("malformed URI: %q", url); 553 return -1; 554 } 555 556 su->url.s = m[0].sp; 557 su->url.e = m[0].ep; 558 su->scheme.s = m[t->ind[0]].sp; 559 su->scheme.e = m[t->ind[0]].ep; 560 su->authority.s = m[t->ind[1]].sp; 561 su->authority.e = m[t->ind[1]].ep; 562 su->path.s = m[t->ind[2]].sp; 563 su->path.e = m[t->ind[2]].ep; 564 su->query.s = m[t->ind[3]].sp; 565 su->query.e = m[t->ind[3]].ep; 566 su->fragment.s = m[t->ind[4]].sp; 567 su->fragment.e = m[t->ind[4]].ep; 568 569 if(urldebug) 570 fprint(2, "split url %s into %.*q %.*q %.*q %.*q %.*q %.*q\n", 571 url, 572 su->url.s ? utfnlen(su->url.s, su->url.e-su->url.s) : 10, su->url.s ? su->url.s : "", 573 su->scheme.s ? utfnlen(su->scheme.s, su->scheme.e-su->scheme.s) : 10, su->scheme.s ? su->scheme.s : "", 574 su->authority.s ? utfnlen(su->authority.s, su->authority.e-su->authority.s) : 10, su->authority.s ? su->authority.s : "", 575 su->path.s ? utfnlen(su->path.s, su->path.e-su->path.s) : 10, su->path.s ? su->path.s : "", 576 su->query.s ? utfnlen(su->query.s, su->query.e-su->query.s) : 10, su->query.s ? su->query.s : "", 577 su->fragment.s ? utfnlen(su->fragment.s, su->fragment.e-su->fragment.s) : 10, su->fragment.s ? su->fragment.s : ""); 578 579 return 0; 580 } 581 582 static int 583 parse_scheme(SplitUrl *su, Url *u) 584 { 585 if(su->scheme.s == nil){ 586 werrstr("missing scheme"); 587 return -1; 588 } 589 u->scheme = estredup(su->scheme.s, su->scheme.e); 590 strlower(u->scheme); 591 592 if(!ismatch(REscheme, u->scheme, "scheme")) 593 return -1; 594 595 u->ischeme = ischeme(u->scheme); 596 if(urldebug) 597 fprint(2, "parse_scheme %s => %d\n", u->scheme, u->ischeme); 598 return 0; 599 } 600 601 static int 602 parse_unknown_part(SplitUrl *su, Url *u) 603 { 604 char *s, *e; 605 606 assert(u->ischeme == USunknown); 607 assert(su->scheme.e[0] == ':'); 608 609 s = su->scheme.e+1; 610 if(su->fragment.s){ 611 e = su->fragment.s-1; 612 assert(*e == '#'); 613 }else 614 e = s+strlen(s); 615 616 u->schemedata = estredup(s, e); 617 if(!ismatch(REunknowndata, u->schemedata, "unknown scheme data")) 618 return -1; 619 return 0; 620 } 621 622 static int 623 parse_userinfo(char *s, char *e, Url *u) 624 { 625 Resub m[MaxResub]; 626 Retab *t; 627 628 m[0].sp = s; 629 m[0].ep = e; 630 t = &retab[REuserinfo]; 631 if(!regx(t->prog, nil, m, t->size)){ 632 werrstr("malformed userinfo: %.*q", utfnlen(s, e-s), s); 633 return -1; 634 } 635 if(m[t->ind[0]].sp) 636 u->user = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep); 637 if(m[t->ind[1]].sp) 638 u->user = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep); 639 return 0; 640 } 641 642 static int 643 parse_host(char *s, char *e, Url *u) 644 { 645 Resub m[MaxResub]; 646 Retab *t; 647 648 m[0].sp = s; 649 m[0].ep = e; 650 t = &retab[REhost]; 651 if(!regx(t->prog, nil, m, t->size)){ 652 werrstr("malformed host: %.*q", utfnlen(s, e-s), s); 653 return -1; 654 } 655 656 assert(m[t->ind[0]].sp || m[t->ind[1]].sp); 657 658 if(m[t->ind[0]].sp) /* regular */ 659 u->host = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep); 660 else 661 u->host = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep); 662 return 0; 663 } 664 665 static int 666 parse_authority(SplitUrl *su, Url *u) 667 { 668 Resub m[MaxResub]; 669 Retab *t; 670 char *host; 671 char *userinfo; 672 673 if(su->authority.s == nil) 674 return 0; 675 676 u->authority = estredup(su->authority.s, su->authority.e); 677 m[0].sp = m[0].ep = nil; 678 t = &retab[REauthority]; 679 if(!regx(t->prog, u->authority, m, t->size)){ 680 werrstr("malformed authority: %q", u->authority); 681 return -1; 682 } 683 684 if(m[t->ind[0]].sp) 685 if(parse_userinfo(m[t->ind[0]].sp, m[t->ind[0]].ep, u) < 0) 686 return -1; 687 if(m[t->ind[1]].sp) 688 if(parse_host(m[t->ind[1]].sp, m[t->ind[1]].ep, u) < 0) 689 return -1; 690 if(m[t->ind[2]].sp) 691 u->port = estredup(m[t->ind[2]].sp, m[t->ind[2]].ep); 692 693 694 if(urldebug > 0){ 695 userinfo = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep); 696 host = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep); 697 fprint(2, "port: %q, authority %q\n", u->port, u->authority); 698 fprint(2, "host %q, userinfo %q\n", host, userinfo); 699 free(host); 700 free(userinfo); 701 } 702 return 0; 703 } 704 705 static int 706 parse_abspath(SplitUrl *su, Url *u) 707 { 708 if(su->path.s == nil) 709 return 0; 710 u->path = estredup(su->path.s, su->path.e); 711 if(!ismatch(REabspath, u->path, "absolute path")) 712 return -1; 713 return 0; 714 } 715 716 static int 717 parse_query(SplitUrl *su, Url *u) 718 { 719 if(su->query.s == nil) 720 return 0; 721 u->query = estredup(su->query.s, su->query.e); 722 if(!ismatch(REquery, u->query, "query")) 723 return -1; 724 return 0; 725 } 726 727 static int 728 parse_fragment(SplitUrl *su, Url *u) 729 { 730 if(su->fragment.s == nil) 731 return 0; 732 u->fragment = estredup(su->fragment.s, su->fragment.e); 733 if(!ismatch(REfragment, u->fragment, "fragment")) 734 return -1; 735 return 0; 736 } 737 738 static int 739 postparse_http(Url *u) 740 { 741 u->open = httpopen; 742 u->read = httpread; 743 u->close = httpclose; 744 745 if(u->authority==nil){ 746 werrstr("missing authority (hostname, port, etc.)"); 747 return -1; 748 } 749 if(u->host == nil){ 750 werrstr("missing host specification"); 751 return -1; 752 } 753 754 if(u->path == nil){ 755 u->http.page_spec = estrdup("/"); 756 return 0; 757 } 758 759 if(!ismatch(REhttppath, u->path, "http path")) 760 return -1; 761 if(u->query){ 762 u->http.page_spec = emalloc(strlen(u->path)+1+strlen(u->query)+1); 763 strcpy(u->http.page_spec, u->path); 764 strcat(u->http.page_spec, "?"); 765 strcat(u->http.page_spec, u->query); 766 }else 767 u->http.page_spec = estrdup(u->path); 768 769 return 0; 770 } 771 772 static int 773 postparse_ftp(Url *u) 774 { 775 Resub m[MaxResub]; 776 Retab *t; 777 778 if(u->authority==nil){ 779 werrstr("missing authority (hostname, port, etc.)"); 780 return -1; 781 } 782 if(u->query){ 783 werrstr("unexpected \"?query\" in ftp path"); 784 return -1; 785 } 786 if(u->host == nil){ 787 werrstr("missing host specification"); 788 return -1; 789 } 790 791 if(u->path == nil){ 792 u->ftp.path_spec = estrdup("/"); 793 return 0; 794 } 795 796 m[0].sp = m[0].ep = nil; 797 t = &retab[REftppath]; 798 if(!regx(t->prog, u->path, m, t->size)){ 799 werrstr("malformed ftp path: %q", u->path); 800 return -1; 801 } 802 803 if(m[t->ind[0]].sp){ 804 u->ftp.path_spec = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep); 805 if(strchr(u->ftp.path_spec, ';')){ 806 werrstr("unexpected \";param\" in ftp path"); 807 return -1; 808 } 809 }else 810 u->ftp.path_spec = estrdup("/"); 811 812 if(m[t->ind[1]].sp){ 813 u->ftp.type = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep); 814 strlower(u->ftp.type); 815 } 816 return 0; 817 } 818 819 static int 820 postparse_file(Url *u) 821 { 822 if(u->user || u->passwd){ 823 werrstr("user information not valid with file scheme"); 824 return -1; 825 } 826 if(u->query){ 827 werrstr("unexpected \"?query\" in file path"); 828 return -1; 829 } 830 if(u->port){ 831 werrstr("port not valid with file scheme"); 832 return -1; 833 } 834 if(u->path == nil){ 835 werrstr("missing path in file scheme"); 836 return -1; 837 } 838 if(strchr(u->path, ';')){ 839 werrstr("unexpected \";param\" in file path"); 840 return -1; 841 } 842 843 if(!ismatch(REfilepath, u->path, "file path")) 844 return -1; 845 846 /* "localhost" is equivalent to no host spec, we'll chose the latter */ 847 if(u->host && cistrcmp(u->host, "localhost") == 0){ 848 free(u->host); 849 u->host = nil; 850 } 851 return 0; 852 } 853 854 static int (*postparse[])(Url*) = { 855 nil, 856 postparse_http, 857 postparse_http, 858 postparse_ftp, 859 postparse_file, 860 }; 861 862 Url* 863 parseurl(char *url, Url *base) 864 { 865 Url *u; 866 SplitUrl su; 867 868 if(urldebug) 869 fprint(2, "parseurl %s with base %s\n", url, base ? base->url : "<none>"); 870 871 u = emalloc(sizeof(Url)); 872 u->url = estrdup(url); 873 if(spliturl(u->url, &su) < 0){ 874 Fail: 875 freeurl(u); 876 return nil; 877 } 878 879 /* RFC2396 sec 3.1 says relative URIs are distinguished by absent scheme */ 880 if(su.scheme.s==nil){ 881 if(urldebug) 882 fprint(2, "parseurl has nil scheme\n"); 883 if(resolve_relative(&su, base, u) < 0 || spliturl(u->url, &su) < 0) 884 goto Fail; 885 if(u->ischeme == UScurrent){ 886 /* 'u.url' refers to current document; set fragment and return */ 887 if(parse_fragment(&su, u) < 0) 888 goto Fail; 889 return u; 890 } 891 } 892 893 if(parse_scheme(&su, u) < 0 894 || parse_fragment(&su, u) < 0) 895 goto Fail; 896 897 if(u->ischeme == USunknown){ 898 if(parse_unknown_part(&su, u) < 0) 899 goto Fail; 900 return u; 901 } 902 903 if(parse_query(&su, u) < 0 904 || parse_authority(&su, u) < 0 905 || parse_abspath(&su, u) < 0) 906 goto Fail; 907 908 if(u->ischeme < nelem(postparse) && postparse[u->ischeme]) 909 if((*postparse[u->ischeme])(u) < 0) 910 goto Fail; 911 912 setmalloctag(u, getcallerpc(&url)); 913 return u; 914 } 915 916 void 917 freeurl(Url *u) 918 { 919 if(u == nil) 920 return; 921 free(u->url); 922 free(u->scheme); 923 free(u->schemedata); 924 free(u->authority); 925 free(u->user); 926 free(u->passwd); 927 free(u->host); 928 free(u->port); 929 free(u->path); 930 free(u->query); 931 free(u->fragment); 932 switch(u->ischeme){ 933 case UShttp: 934 free(u->http.page_spec); 935 break; 936 case USftp: 937 free(u->ftp.path_spec); 938 free(u->ftp.type); 939 break; 940 } 941 free(u); 942 } 943 944 void 945 rewriteurl(Url *u) 946 { 947 char *s; 948 949 if(u->schemedata) 950 s = estrmanydup(u->scheme, ":", u->schemedata, nil); 951 else 952 s = estrmanydup(u->scheme, "://", 953 u->user ? u->user : "", 954 u->passwd ? ":" : "", u->passwd ? u->passwd : "", 955 u->user ? "@" : "", u->host ? u->host : "", 956 u->port ? ":" : "", u->port ? u->port : "", 957 u->path, 958 u->query ? "?" : "", u->query ? u->query : "", 959 u->fragment ? "#" : "", u->fragment ? u->fragment : "", 960 nil); 961 free(u->url); 962 u->url = s; 963 } 964 965 int 966 seturlquery(Url *u, char *query) 967 { 968 if(query == nil){ 969 free(u->query); 970 u->query = nil; 971 return 0; 972 } 973 974 if(!ismatch(REquery, query, "query")) 975 return -1; 976 977 free(u->query); 978 u->query = estrdup(query); 979 return 0; 980 } 981 982 static void 983 dupp(char **p) 984 { 985 if(*p) 986 *p = estrdup(*p); 987 } 988 989 Url* 990 copyurl(Url *u) 991 { 992 Url *v; 993 994 v = emalloc(sizeof(Url)); 995 *v = *u; 996 dupp(&v->url); 997 dupp(&v->scheme); 998 dupp(&v->schemedata); 999 dupp(&v->authority); 1000 dupp(&v->user); 1001 dupp(&v->passwd); 1002 dupp(&v->host); 1003 dupp(&v->port); 1004 dupp(&v->path); 1005 dupp(&v->query); 1006 dupp(&v->fragment); 1007 1008 switch(v->ischeme){ 1009 case UShttp: 1010 dupp(&v->http.page_spec); 1011 break; 1012 case USftp: 1013 dupp(&v->ftp.path_spec); 1014 dupp(&v->ftp.type); 1015 break; 1016 } 1017 return v; 1018 } 1019 1020 static int 1021 dhex(char c) 1022 { 1023 if('0' <= c && c <= '9') 1024 return c-'0'; 1025 if('a' <= c && c <= 'f') 1026 return c-'a'+10; 1027 if('A' <= c && c <= 'F') 1028 return c-'A'+10; 1029 return 0; 1030 } 1031 1032 char* 1033 escapeurl(char *s, int (*needesc)(int)) 1034 { 1035 int n; 1036 char *t, *u; 1037 Rune r; 1038 static char *hex = "0123456789abcdef"; 1039 1040 n = 0; 1041 for(t=s; *t; t++) 1042 if((*needesc)(*t)) 1043 n++; 1044 1045 u = emalloc(strlen(s)+2*n+1); 1046 t = u; 1047 for(; *s; s++){ 1048 s += chartorune(&r, s); 1049 if(r >= 0xFF){ 1050 werrstr("URLs cannot contain Runes > 0xFF"); 1051 free(t); 1052 return nil; 1053 } 1054 if((*needesc)(r)){ 1055 *u++ = '%'; 1056 *u++ = hex[(r>>4)&0xF]; 1057 *u++ = hex[r&0xF]; 1058 }else 1059 *u++ = r; 1060 } 1061 *u = '\0'; 1062 return t; 1063 } 1064 1065 char* 1066 unescapeurl(char *s) 1067 { 1068 char *r, *w; 1069 Rune rune; 1070 1071 s = estrdup(s); 1072 for(r=w=s; *r; r++){ 1073 if(*r=='%'){ 1074 r++; 1075 if(!isxdigit(r[0]) || !isxdigit(r[1])){ 1076 werrstr("bad escape sequence '%.3s' in URL", r); 1077 return nil; 1078 } 1079 if(r[0]=='0' && r[2]=='0'){ 1080 werrstr("escaped NUL in URL"); 1081 return nil; 1082 } 1083 rune = (dhex(r[0])<<4)|dhex(r[1]); /* latin1 */ 1084 w += runetochar(w, &rune); 1085 r += 2; 1086 }else 1087 *w++ = *r; 1088 } 1089 *w = '\0'; 1090 return s; 1091 } 1092 1093