1 /* 2 * This is a URL parser, written to parse "Common Internet Scheme" URL 3 * syntax as described in RFC1738 and updated by RFC2396. Only absolute URLs 4 * are supported, using "server-based" naming authorities in the schemes. 5 * Support for literal IPv6 addresses is included, per RFC2732. 6 * 7 * Current "known" schemes: http, ftp, file. 8 * 9 * We can do all the parsing operations without Runes since URLs are 10 * defined to be composed of US-ASCII printable characters. 11 * See RFC1738, RFC2396. 12 */ 13 14 #include <u.h> 15 #include <libc.h> 16 #include <ctype.h> 17 #include <regexp.h> 18 #include <plumb.h> 19 #include <thread.h> 20 #include <fcall.h> 21 #include <9p.h> 22 #include "dat.h" 23 #include "fns.h" 24 25 int urldebug; 26 27 /* If set, relative paths with leading ".." segments will have them trimmed */ 28 #define RemoveExtraRelDotDots 0 29 #define ExpandCurrentDocUrls 1 30 31 static char* 32 schemestrtab[] = 33 { 34 nil, 35 "http", 36 "https", 37 "ftp", 38 "file", 39 }; 40 41 static int 42 ischeme(char *s) 43 { 44 int i; 45 46 for(i=0; i<nelem(schemestrtab); i++) 47 if(schemestrtab[i] && strcmp(s, schemestrtab[i])==0) 48 return i; 49 return USunknown; 50 } 51 52 /* 53 * URI splitting regexp is from RFC2396, Appendix B: 54 * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? 55 * 12 3 4 5 6 7 8 9 56 * 57 * Example: "http://www.ics.uci.edu/pub/ietf/uri/#Related" 58 * $2 = scheme "http" 59 * $4 = authority "www.ics.uci.edu" 60 * $5 = path "/pub/ietf/uri/" 61 * $7 = query <undefined> 62 * $9 = fragment "Related" 63 */ 64 65 /* 66 * RFC2396, Sec 3.1, contains: 67 * 68 * Scheme names consist of a sequence of characters beginning with a 69 * lower case letter and followed by any combination of lower case 70 * letters, digits, plus ("+"), period ("."), or hyphen ("-"). For 71 * resiliency, programs interpreting URI should treat upper case letters 72 * as equivalent to lower case in scheme names (e.g., allow "HTTP" as 73 * well as "http"). 74 */ 75 76 /* 77 * For server-based naming authorities (RFC2396 Sec 3.2.2): 78 * server = [ [ userinfo "@" ] hostport ] 79 * userinfo = *( unreserved | escaped | 80 * ";" | ":" | "&" | "=" | "+" | "$" | "," ) 81 * hostport = host [ ":" port ] 82 * host = hostname | IPv4address 83 * hostname = *( domainlabel "." ) toplabel [ "." ] 84 * domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum 85 * toplabel = alpha | alpha *( alphanum | "-" ) alphanum 86 * IPv4address = 1*digit "." 1*digit "." 1*digit "." 1*digit 87 * port = *digit 88 * 89 * The host is a domain name of a network host, or its IPv4 address as a 90 * set of four decimal digit groups separated by ".". Literal IPv6 91 * addresses are not supported. 92 * 93 * Note that literal IPv6 address support is outlined in RFC2732: 94 * host = hostname | IPv4address | IPv6reference 95 * ipv6reference = "[" IPv6address "]" (RFC2373) 96 * 97 * Since hostnames and numbers will have to be resolved by the OS anyway, 98 * we don't have to parse them too pedantically (counting '.'s, checking 99 * for well-formed literal IP addresses, etc.). 100 * 101 * In FTP/file paths, we reject most ";param"s and querys. In HTTP paths, 102 * we just pass them through. 103 * 104 * Instead of letting a "path" be 0-or-more characters as RFC2396 suggests, 105 * we'll say it's 1-or-more characters, 0-or-1 times. This way, an absent 106 * path yields a nil substring match, instead of an empty one. 107 * 108 * We're more restrictive than RFC2396 indicates with "userinfo" strings, 109 * insisting they have the form "[user[:password]]". This may need to 110 * change at some point, however. 111 */ 112 113 /* RE character-class components -- these go in brackets */ 114 #define PUNCT "\\-_.!~*'()" 115 #define RES ";/?:@&=+$," 116 #define ALNUM "a-zA-Z0-9" 117 #define HEX "0-9a-fA-F" 118 #define UNRES ALNUM PUNCT 119 120 /* RE components; _N => has N parenthesized subexpressions when expanded */ 121 #define ESCAPED_1 "(%[" HEX "][" HEX "])" 122 #define URIC_2 "([" RES UNRES "]|" ESCAPED_1 ")" 123 #define URICNOSLASH_2 "([" UNRES ";?:@&=+$,]|" ESCAPED_1 ")" 124 #define USERINFO_2 "([" UNRES ";:&=+$,]|" ESCAPED_1 ")" 125 #define PCHAR_2 "([" UNRES ":@&=+$,]|" ESCAPED_1 ")" 126 #define PSEGCHAR_3 "([/;]|" PCHAR_2 ")" 127 128 typedef struct Retab Retab; 129 struct Retab 130 { 131 char *str; 132 Reprog *prog; 133 int size; 134 int ind[5]; 135 }; 136 137 enum 138 { 139 REsplit = 0, 140 REscheme, 141 REunknowndata, 142 REauthority, 143 REhost, 144 REuserinfo, 145 REabspath, 146 REquery, 147 REfragment, 148 REhttppath, 149 REftppath, 150 REfilepath, 151 152 MaxResub= 20, 153 }; 154 155 Retab retab[] = /* view in constant width Font */ 156 { 157 [REsplit] 158 "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]+)?(\\?([^#]*))?(#(.*))?$", nil, 0, 159 /* |-scheme-| |-auth.-| |path--| |query| |--|frag */ 160 { 2, 4, 5, 7, 9}, 161 162 [REscheme] 163 "^[a-z][a-z0-9+-.]*$", nil, 0, 164 { 0, }, 165 166 [REunknowndata] 167 "^" URICNOSLASH_2 URIC_2 "*$", nil, 0, 168 { 0, }, 169 170 [REauthority] 171 "^(((" USERINFO_2 "*)@)?(((\\[[^\\]@]+\\])|([^:\\[@]+))(:([0-9]*))?)?)?$", nil, 0, 172 /* |----user info-----| |--------host----------------| |-port-| */ 173 { 2, 7, 12, }, 174 175 [REhost] 176 "^(([a-zA-Z0-9\\-.]+)|(\\[([a-fA-F0-9.:]+)\\]))$", nil, 0, 177 /* |--regular host--| |-IPv6 literal-| */ 178 { 2, 4, }, 179 180 [REuserinfo] 181 "^(([^:]*)(:([^:]*))?)$", nil, 0, 182 /* |user-| |pass-| */ 183 { 2, 4, }, 184 185 [REabspath] 186 "^/" PSEGCHAR_3 "*$", nil, 0, 187 { 0, }, 188 189 [REquery] 190 "^" URIC_2 "*$", nil, 0, 191 { 0, }, 192 193 [REfragment] 194 "^" URIC_2 "*$", nil, 0, 195 { 0, }, 196 197 [REhttppath] 198 "^.*$", nil, 0, 199 { 0, }, 200 201 [REftppath] 202 "^(.+)(;[tT][yY][pP][eE]=([aAiIdD]))?$", nil, 0, 203 /*|--|-path |ftptype-| */ 204 { 1, 3, }, 205 206 [REfilepath] 207 "^.*$", nil, 0, 208 { 0, }, 209 }; 210 211 static int 212 countleftparen(char *s) 213 { 214 int n; 215 216 n = 0; 217 for(; *s; s++) 218 if(*s == '(') 219 n++; 220 return n; 221 } 222 223 void 224 initurl(void) 225 { 226 int i, j; 227 228 for(i=0; i<nelem(retab); i++){ 229 retab[i].prog = regcomp(retab[i].str); 230 if(retab[i].prog == nil) 231 sysfatal("recomp(%s): %r", retab[i].str); 232 retab[i].size = countleftparen(retab[i].str)+1; 233 for(j=0; j<nelem(retab[i].ind); j++) 234 if(retab[i].ind[j] >= retab[i].size) 235 sysfatal("bad index in regexp table: retab[%d].ind[%d] = %d >= %d", 236 i, j, retab[i].ind[j], retab[i].size); 237 if(MaxResub < retab[i].size) 238 sysfatal("MaxResub too small: %d < %d", MaxResub, retab[i].size); 239 } 240 } 241 242 typedef struct SplitUrl SplitUrl; 243 struct SplitUrl 244 { 245 struct { 246 char *s; 247 char *e; 248 } url, scheme, authority, path, query, fragment; 249 }; 250 251 /* 252 * Implements the algorithm in RFC2396 sec 5.2 step 6. 253 * Returns number of chars written, excluding NUL terminator. 254 * dest is known to be >= strlen(base)+rel_len. 255 */ 256 static void 257 merge_relative_path(char *base, char *rel_st, int rel_len, char *dest) 258 { 259 char *s, *p, *e, *pdest; 260 261 pdest = dest; 262 263 /* 6a: start with base, discard last segment */ 264 if(base){ 265 /* Empty paths don't match in our scheme; 'base' should be nil */ 266 assert(base[0] == '/'); 267 e = strrchr(base, '/'); 268 e++; 269 memmove(pdest, base, e-base); 270 pdest += e-base; 271 }else{ 272 /* Artistic license on my part */ 273 *pdest++ = '/'; 274 } 275 276 /* 6b: append relative component */ 277 if(rel_st){ 278 memmove(pdest, rel_st, rel_len); 279 pdest += rel_len; 280 } 281 282 /* 6c: remove any occurrences of "./" as a complete segment */ 283 s = dest; 284 *pdest = '\0'; 285 while(e = strstr(s, "./")){ 286 if((e == dest) || (*(e-1) == '/')){ 287 memmove(e, e+2, pdest+1-(e+2)); /* +1 for NUL */ 288 pdest -= 2; 289 }else 290 s = e+1; 291 } 292 293 /* 6d: remove a trailing "." as a complete segment */ 294 if(pdest>dest && *(pdest-1)=='.' && 295 (pdest==dest+1 || *(pdest-2)=='/')) 296 *--pdest = '\0'; 297 298 /* 6e: remove occurences of "seg/../", where seg != "..", left->right */ 299 s = dest+1; 300 while(e = strstr(s, "/../")){ 301 p = e - 1; 302 while(p >= dest && *p != '/') 303 p--; 304 if(memcmp(p, "/../", 4) != 0){ 305 memmove(p+1, e+4, pdest+1-(e+4)); 306 pdest -= (e+4) - (p+1); 307 }else 308 s = e+1; 309 } 310 311 /* 6f: remove a trailing "seg/..", where seg isn't ".." */ 312 if(pdest-3 > dest && memcmp(pdest-3, "/..", 3)==0){ 313 p = pdest-3 - 1; 314 while(p >= dest && *p != '/') 315 p--; 316 if(memcmp(p, "/../", 4) != 0){ 317 pdest = p+1; 318 *pdest = '\0'; 319 } 320 } 321 322 /* 6g: leading ".." segments are errors -- we'll just blat them out. */ 323 if(RemoveExtraRelDotDots){ 324 p = dest; 325 if (p[0] == '/') 326 p++; 327 s = p; 328 while(s[0]=='.' && s[1]=='.' && (s[2]==0 || s[2]=='/')) 329 s += 3; 330 if(s > p){ 331 memmove(p, s, pdest+1-s); 332 pdest -= s-p; 333 } 334 } 335 USED(pdest); 336 337 if(urldebug) 338 fprint(2, "merge_relative_path: '%s' + '%.*s' -> '%s'\n", base, rel_len, 339 rel_st, dest); 340 } 341 342 /* 343 * See RFC2396 sec 5.2 for info on resolving relative URIs to absolute form. 344 * 345 * If successful, this just ends up freeing and replacing "u->url". 346 */ 347 static int 348 resolve_relative(SplitUrl *su, Url *base, Url *u) 349 { 350 char *url, *path; 351 char *purl, *ppath; 352 int currentdoc, ulen, plen; 353 354 if(base == nil){ 355 werrstr("relative URI given without base"); 356 return -1; 357 } 358 if(base->scheme == nil){ 359 werrstr("relative URI given with no scheme"); 360 return -1; 361 } 362 if(base->ischeme == USunknown){ 363 werrstr("relative URI given with unknown scheme"); 364 return -1; 365 } 366 if(base->ischeme == UScurrent){ 367 werrstr("relative URI given with incomplete base"); 368 return -1; 369 } 370 assert(su->scheme.s == nil); 371 372 /* Sec 5.2 step 2 */ 373 currentdoc = 0; 374 if(su->path.s==nil && su->scheme.s==nil && su->authority.s==nil && su->query.s==nil){ 375 /* Reference is to current document */ 376 if(urldebug) 377 fprint(2, "url %s is relative to current document\n", u->url); 378 u->ischeme = UScurrent; 379 if(!ExpandCurrentDocUrls) 380 return 0; 381 currentdoc = 1; 382 } 383 384 /* Over-estimate the maximum lengths, for allocation purposes */ 385 /* (constants are for separators) */ 386 plen = 1; 387 if(base->path) 388 plen += strlen(base->path); 389 if(su->path.s) 390 plen += 1 + (su->path.e - su->path.s); 391 392 ulen = 0; 393 ulen += strlen(base->scheme) + 1; 394 if(su->authority.s) 395 ulen += 2 + (su->authority.e - su->authority.s); 396 else 397 ulen += 2 + ((base->authority) ? strlen(base->authority) : 0); 398 ulen += plen; 399 if(su->query.s) 400 ulen += 1 + (su->query.e - su->query.s); 401 else if(currentdoc && base->query) 402 ulen += 1 + strlen(base->query); 403 if(su->fragment.s) 404 ulen += 1 + (su->fragment.e - su->fragment.s); 405 else if(currentdoc && base->fragment) 406 ulen += 1 + strlen(base->fragment); 407 url = emalloc(ulen+1); 408 path = emalloc(plen+1); 409 410 url[0] = '\0'; 411 purl = url; 412 path[0] = '\0'; 413 ppath = path; 414 415 if(su->authority.s || (su->path.s && (su->path.s[0] == '/'))){ 416 /* Is a "network-path" or "absolute-path"; don't merge with base path */ 417 /* Sec 5.2 steps 4,5 */ 418 if(su->path.s){ 419 memmove(ppath, su->path.s, su->path.e - su->path.s); 420 ppath += su->path.e - su->path.s; 421 *ppath = '\0'; 422 } 423 }else if(currentdoc){ 424 /* Is a current-doc reference; just copy the path from the base URL */ 425 if(base->path){ 426 strcpy(ppath, base->path); 427 ppath += strlen(ppath); 428 } 429 USED(ppath); 430 }else{ 431 /* Is a relative-path reference; we have to merge it */ 432 /* Sec 5.2 step 6 */ 433 merge_relative_path(base->path, 434 su->path.s, su->path.e - su->path.s, ppath); 435 } 436 437 /* Build new URL from pieces, inheriting from base where needed */ 438 strcpy(purl, base->scheme); 439 purl += strlen(purl); 440 *purl++ = ':'; 441 if(su->authority.s){ 442 strcpy(purl, "//"); 443 purl += strlen(purl); 444 memmove(purl, su->authority.s, su->authority.e - su->authority.s); 445 purl += su->authority.e - su->authority.s; 446 }else if(base->authority){ 447 strcpy(purl, "//"); 448 purl += strlen(purl); 449 strcpy(purl, base->authority); 450 purl += strlen(purl); 451 } 452 assert((path[0] == '\0') || (path[0] == '/')); 453 strcpy(purl, path); 454 purl += strlen(purl); 455 456 /* 457 * The query and fragment are not inherited from the base, 458 * except in case of "current document" URLs, which inherit any query 459 * and may inherit the fragment. 460 */ 461 if(su->query.s){ 462 *purl++ = '?'; 463 memmove(purl, su->query.s, su->query.e - su->query.s); 464 purl += su->query.e - su->query.s; 465 }else if(currentdoc && base->query){ 466 *purl++ = '?'; 467 strcpy(purl, base->query); 468 purl += strlen(purl); 469 } 470 471 if(su->fragment.s){ 472 *purl++ = '#'; 473 memmove(purl, su->query.s, su->query.e - su->query.s); 474 purl += su->fragment.e - su->fragment.s; 475 }else if(currentdoc && base->fragment){ 476 *purl++ = '#'; 477 strcpy(purl, base->fragment); 478 purl += strlen(purl); 479 } 480 USED(purl); 481 482 if(urldebug) 483 fprint(2, "resolve_relative: '%s' + '%s' -> '%s'\n", base->url, u->url, url); 484 free(u->url); 485 u->url = url; 486 free(path); 487 return 0; 488 } 489 490 int 491 regx(Reprog *prog, char *s, Resub *m, int nm) 492 { 493 int i; 494 495 if(s == nil) 496 s = m[0].sp; /* why is this necessary? */ 497 498 i = regexec(prog, s, m, nm); 499 /* 500 if(i >= 0) 501 for(j=0; j<nm; j++) 502 fprint(2, "match%d: %.*s\n", j, utfnlen(m[j].sp, m[j].ep-m[j].sp), m[j].sp); 503 */ 504 return i; 505 } 506 507 static int 508 ismatch(int i, char *s, char *desc) 509 { 510 Resub m[1]; 511 512 m[0].sp = m[0].ep = nil; 513 if(!regx(retab[i].prog, s, m, 1)){ 514 werrstr("malformed %s: %q", desc, s); 515 return 0; 516 } 517 return 1; 518 } 519 520 static int 521 spliturl(char *url, SplitUrl *su) 522 { 523 Resub m[MaxResub]; 524 Retab *t; 525 526 /* 527 * Newlines are not valid in a URI, but regexp(2) treats them specially 528 * so it's best to make sure there are none before proceeding. 529 */ 530 if(strchr(url, '\n')){ 531 werrstr("newline in URI"); 532 return -1; 533 } 534 535 /* 536 * Because we use NUL-terminated strings, as do many client and server 537 * implementations, an escaped NUL ("%00") will quite likely cause problems 538 * when unescaped. We can check for such a sequence once before examining 539 * the components because, per RFC2396 sec. 2.4.1 - 2.4.2, '%' is reserved 540 * in URIs to _always_ indicate escape sequences. Something like "%2500" 541 * will still get by, but that's legitimate, and if it ends up causing 542 * a NUL then someone is unescaping too many times. 543 */ 544 if(strstr(url, "%00")){ 545 werrstr("escaped NUL in URI"); 546 return -1; 547 } 548 549 m[0].sp = m[0].ep = nil; 550 t = &retab[REsplit]; 551 if(!regx(t->prog, url, m, t->size)){ 552 werrstr("malformed URI: %q", url); 553 return -1; 554 } 555 556 su->url.s = m[0].sp; 557 su->url.e = m[0].ep; 558 su->scheme.s = m[t->ind[0]].sp; 559 su->scheme.e = m[t->ind[0]].ep; 560 su->authority.s = m[t->ind[1]].sp; 561 su->authority.e = m[t->ind[1]].ep; 562 su->path.s = m[t->ind[2]].sp; 563 su->path.e = m[t->ind[2]].ep; 564 su->query.s = m[t->ind[3]].sp; 565 su->query.e = m[t->ind[3]].ep; 566 su->fragment.s = m[t->ind[4]].sp; 567 su->fragment.e = m[t->ind[4]].ep; 568 569 if(urldebug) 570 fprint(2, "split url %s into %.*q %.*q %.*q %.*q %.*q %.*q\n", 571 url, 572 su->url.s ? utfnlen(su->url.s, su->url.e-su->url.s) : 10, su->url.s ? su->url.s : "", 573 su->scheme.s ? utfnlen(su->scheme.s, su->scheme.e-su->scheme.s) : 10, su->scheme.s ? su->scheme.s : "", 574 su->authority.s ? utfnlen(su->authority.s, su->authority.e-su->authority.s) : 10, su->authority.s ? su->authority.s : "", 575 su->path.s ? utfnlen(su->path.s, su->path.e-su->path.s) : 10, su->path.s ? su->path.s : "", 576 su->query.s ? utfnlen(su->query.s, su->query.e-su->query.s) : 10, su->query.s ? su->query.s : "", 577 su->fragment.s ? utfnlen(su->fragment.s, su->fragment.e-su->fragment.s) : 10, su->fragment.s ? su->fragment.s : ""); 578 579 return 0; 580 } 581 582 static int 583 parse_scheme(SplitUrl *su, Url *u) 584 { 585 if(su->scheme.s == nil){ 586 werrstr("missing scheme"); 587 return -1; 588 } 589 u->scheme = estredup(su->scheme.s, su->scheme.e); 590 strlower(u->scheme); 591 592 if(!ismatch(REscheme, u->scheme, "scheme")) 593 return -1; 594 595 u->ischeme = ischeme(u->scheme); 596 if(urldebug) 597 fprint(2, "parse_scheme %s => %d\n", u->scheme, u->ischeme); 598 return 0; 599 } 600 601 static int 602 parse_unknown_part(SplitUrl *su, Url *u) 603 { 604 char *s, *e; 605 606 assert(u->ischeme == USunknown); 607 assert(su->scheme.e[0] == ':'); 608 609 s = su->scheme.e+1; 610 if(su->fragment.s){ 611 e = su->fragment.s-1; 612 assert(*e == '#'); 613 }else 614 e = s+strlen(s); 615 616 u->schemedata = estredup(s, e); 617 if(!ismatch(REunknowndata, u->schemedata, "unknown scheme data")) 618 return -1; 619 return 0; 620 } 621 622 static int 623 parse_userinfo(char *s, char *e, Url *u) 624 { 625 Resub m[MaxResub]; 626 Retab *t; 627 628 m[0].sp = s; 629 m[0].ep = e; 630 t = &retab[REuserinfo]; 631 if(!regx(t->prog, nil, m, t->size)){ 632 werrstr("malformed userinfo: %.*q", utfnlen(s, e-s), s); 633 return -1; 634 } 635 if(m[t->ind[0]].sp) 636 u->user = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep); 637 if(m[t->ind[1]].sp) 638 u->user = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep); 639 return 0; 640 } 641 642 static int 643 parse_host(char *s, char *e, Url *u) 644 { 645 Resub m[MaxResub]; 646 Retab *t; 647 648 m[0].sp = s; 649 m[0].ep = e; 650 t = &retab[REhost]; 651 if(!regx(t->prog, nil, m, t->size)){ 652 werrstr("malformed host: %.*q", utfnlen(s, e-s), s); 653 return -1; 654 } 655 656 assert(m[t->ind[0]].sp || m[t->ind[1]].sp); 657 658 if(m[t->ind[0]].sp) /* regular */ 659 u->host = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep); 660 else 661 u->host = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep); 662 return 0; 663 } 664 665 static int 666 parse_authority(SplitUrl *su, Url *u) 667 { 668 Resub m[MaxResub]; 669 Retab *t; 670 671 if(su->authority.s == nil) 672 return 0; 673 674 u->authority = estredup(su->authority.s, su->authority.e); 675 m[0].sp = m[0].ep = nil; 676 t = &retab[REauthority]; 677 if(!regx(t->prog, u->authority, m, t->size)){ 678 werrstr("malformed authority: %q", u->authority); 679 return -1; 680 } 681 682 if(m[t->ind[0]].sp) 683 if(parse_userinfo(m[t->ind[0]].sp, m[t->ind[0]].ep, u) < 0) 684 return -1; 685 if(m[t->ind[1]].sp) 686 if(parse_host(m[t->ind[1]].sp, m[t->ind[1]].ep, u) < 0) 687 return -1; 688 if(m[t->ind[2]].sp) 689 u->port = estredup(m[t->ind[2]].sp, m[t->ind[2]].ep); 690 691 return 0; 692 } 693 694 static int 695 parse_abspath(SplitUrl *su, Url *u) 696 { 697 if(su->path.s == nil) 698 return 0; 699 u->path = estredup(su->path.s, su->path.e); 700 if(!ismatch(REabspath, u->path, "absolute path")) 701 return -1; 702 return 0; 703 } 704 705 static int 706 parse_query(SplitUrl *su, Url *u) 707 { 708 if(su->query.s == nil) 709 return 0; 710 u->query = estredup(su->query.s, su->query.e); 711 if(!ismatch(REquery, u->query, "query")) 712 return -1; 713 return 0; 714 } 715 716 static int 717 parse_fragment(SplitUrl *su, Url *u) 718 { 719 if(su->fragment.s == nil) 720 return 0; 721 u->fragment = estredup(su->fragment.s, su->fragment.e); 722 if(!ismatch(REfragment, u->fragment, "fragment")) 723 return -1; 724 return 0; 725 } 726 727 static int 728 postparse_http(Url *u) 729 { 730 u->open = httpopen; 731 u->read = httpread; 732 u->close = httpclose; 733 734 if(u->authority==nil){ 735 werrstr("missing authority (hostname, port, etc.)"); 736 return -1; 737 } 738 if(u->user || u->passwd){ 739 werrstr("user information not valid with http"); 740 return -1; 741 } 742 if(u->host == nil){ 743 werrstr("missing host specification"); 744 return -1; 745 } 746 747 if(u->path == nil){ 748 u->http.page_spec = estrdup("/"); 749 return 0; 750 } 751 752 if(!ismatch(REhttppath, u->path, "http path")) 753 return -1; 754 if(u->query){ 755 u->http.page_spec = emalloc(strlen(u->path)+1+strlen(u->query)+1); 756 strcpy(u->http.page_spec, u->path); 757 strcat(u->http.page_spec, "?"); 758 strcat(u->http.page_spec, u->query); 759 }else 760 u->http.page_spec = estrdup(u->path); 761 762 return 0; 763 } 764 765 static int 766 postparse_ftp(Url *u) 767 { 768 Resub m[MaxResub]; 769 Retab *t; 770 771 if(u->authority==nil){ 772 werrstr("missing authority (hostname, port, etc.)"); 773 return -1; 774 } 775 if(u->query){ 776 werrstr("unexpected \"?query\" in ftp path"); 777 return -1; 778 } 779 if(u->host == nil){ 780 werrstr("missing host specification"); 781 return -1; 782 } 783 784 if(u->path == nil){ 785 u->ftp.path_spec = estrdup("/"); 786 return 0; 787 } 788 789 m[0].sp = m[0].ep = nil; 790 t = &retab[REftppath]; 791 if(!regx(t->prog, u->path, m, t->size)){ 792 werrstr("malformed ftp path: %q", u->path); 793 return -1; 794 } 795 796 if(m[t->ind[0]].sp){ 797 u->ftp.path_spec = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep); 798 if(strchr(u->ftp.path_spec, ';')){ 799 werrstr("unexpected \";param\" in ftp path"); 800 return -1; 801 } 802 }else 803 u->ftp.path_spec = estrdup("/"); 804 805 if(m[t->ind[1]].sp){ 806 u->ftp.type = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep); 807 strlower(u->ftp.type); 808 } 809 return 0; 810 } 811 812 static int 813 postparse_file(Url *u) 814 { 815 if(u->user || u->passwd){ 816 werrstr("user information not valid with file scheme"); 817 return -1; 818 } 819 if(u->query){ 820 werrstr("unexpected \"?query\" in file path"); 821 return -1; 822 } 823 if(u->port){ 824 werrstr("port not valid with file scheme"); 825 return -1; 826 } 827 if(u->path == nil){ 828 werrstr("missing path in file scheme"); 829 return -1; 830 } 831 if(strchr(u->path, ';')){ 832 werrstr("unexpected \";param\" in file path"); 833 return -1; 834 } 835 836 if(!ismatch(REfilepath, u->path, "file path")) 837 return -1; 838 839 /* "localhost" is equivalent to no host spec, we'll chose the latter */ 840 if(u->host && cistrcmp(u->host, "localhost") == 0){ 841 free(u->host); 842 u->host = nil; 843 } 844 return 0; 845 } 846 847 static int (*postparse[])(Url*) = { 848 nil, 849 postparse_http, 850 postparse_http, 851 postparse_ftp, 852 postparse_file, 853 }; 854 855 Url* 856 parseurl(char *url, Url *base) 857 { 858 Url *u; 859 SplitUrl su; 860 861 if(urldebug) 862 fprint(2, "parseurl %s with base %s\n", url, base ? base->url : "<none>"); 863 864 u = emalloc(sizeof(Url)); 865 u->url = estrdup(url); 866 if(spliturl(u->url, &su) < 0){ 867 Fail: 868 freeurl(u); 869 return nil; 870 } 871 872 /* RFC2396 sec 3.1 says relative URIs are distinguished by absent scheme */ 873 if(su.scheme.s==nil){ 874 if(urldebug) 875 fprint(2, "parseurl has nil scheme\n"); 876 if(resolve_relative(&su, base, u) < 0 || spliturl(u->url, &su) < 0) 877 goto Fail; 878 if(u->ischeme == UScurrent){ 879 /* 'u.url' refers to current document; set fragment and return */ 880 if(parse_fragment(&su, u) < 0) 881 goto Fail; 882 return u; 883 } 884 } 885 886 if(parse_scheme(&su, u) < 0 887 || parse_fragment(&su, u) < 0) 888 goto Fail; 889 890 if(u->ischeme == USunknown){ 891 if(parse_unknown_part(&su, u) < 0) 892 goto Fail; 893 return u; 894 } 895 896 if(parse_query(&su, u) < 0 897 || parse_authority(&su, u) < 0 898 || parse_abspath(&su, u) < 0) 899 goto Fail; 900 901 if(u->ischeme < nelem(postparse) && postparse[u->ischeme]) 902 if((*postparse[u->ischeme])(u) < 0) 903 goto Fail; 904 905 setmalloctag(u, getcallerpc(&url)); 906 return u; 907 } 908 909 void 910 freeurl(Url *u) 911 { 912 if(u == nil) 913 return; 914 free(u->url); 915 free(u->scheme); 916 free(u->schemedata); 917 free(u->authority); 918 free(u->user); 919 free(u->passwd); 920 free(u->host); 921 free(u->port); 922 free(u->path); 923 free(u->query); 924 free(u->fragment); 925 switch(u->ischeme){ 926 case UShttp: 927 free(u->http.page_spec); 928 break; 929 case USftp: 930 free(u->ftp.path_spec); 931 free(u->ftp.type); 932 break; 933 } 934 free(u); 935 } 936 937 void 938 rewriteurl(Url *u) 939 { 940 char *s; 941 942 if(u->schemedata) 943 s = estrmanydup(u->scheme, ":", u->schemedata, nil); 944 else 945 s = estrmanydup(u->scheme, "://", 946 u->user ? u->user : "", 947 u->passwd ? ":" : "", u->passwd ? u->passwd : "", 948 u->user ? "@" : "", u->host ? u->host : "", 949 u->port ? ":" : "", u->port ? u->port : "", 950 u->path, 951 u->query ? "?" : "", u->query ? u->query : "", 952 u->fragment ? "#" : "", u->fragment ? u->fragment : "", 953 nil); 954 free(u->url); 955 u->url = s; 956 } 957 958 int 959 seturlquery(Url *u, char *query) 960 { 961 if(query == nil){ 962 free(u->query); 963 u->query = nil; 964 return 0; 965 } 966 967 if(!ismatch(REquery, query, "query")) 968 return -1; 969 970 free(u->query); 971 u->query = estrdup(query); 972 return 0; 973 } 974 975 static void 976 dupp(char **p) 977 { 978 if(*p) 979 *p = estrdup(*p); 980 } 981 982 Url* 983 copyurl(Url *u) 984 { 985 Url *v; 986 987 v = emalloc(sizeof(Url)); 988 *v = *u; 989 dupp(&v->url); 990 dupp(&v->scheme); 991 dupp(&v->schemedata); 992 dupp(&v->authority); 993 dupp(&v->user); 994 dupp(&v->passwd); 995 dupp(&v->host); 996 dupp(&v->port); 997 dupp(&v->path); 998 dupp(&v->query); 999 dupp(&v->fragment); 1000 1001 switch(v->ischeme){ 1002 case UShttp: 1003 dupp(&v->http.page_spec); 1004 break; 1005 case USftp: 1006 dupp(&v->ftp.path_spec); 1007 dupp(&v->ftp.type); 1008 break; 1009 } 1010 return v; 1011 } 1012 1013 static int 1014 dhex(char c) 1015 { 1016 if('0' <= c && c <= '9') 1017 return c-'0'; 1018 if('a' <= c && c <= 'f') 1019 return c-'a'+10; 1020 if('A' <= c && c <= 'F') 1021 return c-'A'+10; 1022 return 0; 1023 } 1024 1025 char* 1026 escapeurl(char *s, int (*needesc)(int)) 1027 { 1028 int n; 1029 char *t, *u; 1030 Rune r; 1031 static char *hex = "0123456789abcdef"; 1032 1033 n = 0; 1034 for(t=s; *t; t++) 1035 if((*needesc)(*t)) 1036 n++; 1037 1038 u = emalloc(strlen(s)+2*n+1); 1039 t = u; 1040 for(; *s; s++){ 1041 s += chartorune(&r, s); 1042 if(r >= 0xFF){ 1043 werrstr("URLs cannot contain Runes > 0xFF"); 1044 free(t); 1045 return nil; 1046 } 1047 if((*needesc)(r)){ 1048 *u++ = '%'; 1049 *u++ = hex[(r>>4)&0xF]; 1050 *u++ = hex[r&0xF]; 1051 }else 1052 *u++ = r; 1053 } 1054 *u = '\0'; 1055 return t; 1056 } 1057 1058 char* 1059 unescapeurl(char *s) 1060 { 1061 char *r, *w; 1062 Rune rune; 1063 1064 s = estrdup(s); 1065 for(r=w=s; *r; r++){ 1066 if(*r=='%'){ 1067 r++; 1068 if(!isxdigit(r[0]) || !isxdigit(r[1])){ 1069 werrstr("bad escape sequence '%.3s' in URL", r); 1070 return nil; 1071 } 1072 if(r[0]=='0' && r[2]=='0'){ 1073 werrstr("escaped NUL in URL"); 1074 return nil; 1075 } 1076 rune = (dhex(r[0])<<4)|dhex(r[1]); /* latin1 */ 1077 w += runetochar(w, &rune); 1078 r += 2; 1079 }else 1080 *w++ = *r; 1081 } 1082 *w = '\0'; 1083 return s; 1084 } 1085 1086