xref: /plan9/sys/src/cmd/webfs/url.c (revision b249590635b298a2b629c4458908d752eced8f6f)
1 /*
2  * This is a URL parser, written to parse "Common Internet Scheme" URL
3  * syntax as described in RFC1738 and updated by RFC2396.  Only absolute URLs
4  * are supported, using "server-based" naming authorities in the schemes.
5  * Support for literal IPv6 addresses is included, per RFC2732.
6  *
7  * Current "known" schemes: http, ftp, file.
8  *
9  * We can do all the parsing operations without Runes since URLs are
10  * defined to be composed of US-ASCII printable characters.
11  * See RFC1738, RFC2396.
12  */
13 
14 #include <u.h>
15 #include <libc.h>
16 #include <ctype.h>
17 #include <regexp.h>
18 #include <plumb.h>
19 #include <thread.h>
20 #include <fcall.h>
21 #include <9p.h>
22 #include "dat.h"
23 #include "fns.h"
24 
/* When nonzero, the routines in this file print trace output on fd 2. */
int urldebug;

/*
 * Compile-time switches.
 *
 * RemoveExtraRelDotDots: when nonzero, merge_relative_path strips the
 * leading ".." segments that remain after a relative path is merged.
 *
 * ExpandCurrentDocUrls: when zero, resolve_relative returns as soon as
 * it recognizes a current-document reference instead of rebuilding the
 * URL from the base.
 */
#define RemoveExtraRelDotDots	0
#define ExpandCurrentDocUrls	1
30 
31 static char*
32 schemestrtab[] =
33 {
34 	nil,
35 	"http",
36 	"https",
37 	"ftp",
38 	"file",
39 };
40 
41 static int
ischeme(char * s)42 ischeme(char *s)
43 {
44 	int i;
45 
46 	for(i=0; i<nelem(schemestrtab); i++)
47 		if(schemestrtab[i] && strcmp(s, schemestrtab[i])==0)
48 			return i;
49 	return USunknown;
50 }
51 
52 /*
53  * URI splitting regexp is from RFC2396, Appendix B:
54  *		^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
55  *		 12            3  4          5       6  7        8 9
56  *
57  * Example: "http://www.ics.uci.edu/pub/ietf/uri/#Related"
58  * $2 = scheme			"http"
59  * $4 = authority		"www.ics.uci.edu"
60  * $5 = path			"/pub/ietf/uri/"
61  * $7 = query			<undefined>
62  * $9 = fragment		"Related"
63  */
64 
65 /*
66  * RFC2396, Sec 3.1, contains:
67  *
68  * Scheme names consist of a sequence of characters beginning with a
69  * lower case letter and followed by any combination of lower case
70  * letters, digits, plus ("+"), period ("."), or hyphen ("-").  For
71  * resiliency, programs interpreting URI should treat upper case letters
72  * as equivalent to lower case in scheme names (e.g., allow "HTTP" as
73  * well as "http").
74  */
75 
76 /*
77  * For server-based naming authorities (RFC2396 Sec 3.2.2):
78  *    server        = [ [ userinfo "@" ] hostport ]
79  *    userinfo      = *( unreserved | escaped |
80  *                      ";" | ":" | "&" | "=" | "+" | "$" | "," )
81  *    hostport      = host [ ":" port ]
82  *    host          = hostname | IPv4address
83  *    hostname      = *( domainlabel "." ) toplabel [ "." ]
84  *    domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum
85  *    toplabel      = alpha | alpha *( alphanum | "-" ) alphanum
86  *    IPv4address   = 1*digit "." 1*digit "." 1*digit "." 1*digit
87  *    port          = *digit
88  *
89  *  The host is a domain name of a network host, or its IPv4 address as a
90  *  set of four decimal digit groups separated by ".".  Literal IPv6
91  *  addresses are not supported.
92  *
93  * Note that literal IPv6 address support is outlined in RFC2732:
94  *    host          = hostname | IPv4address | IPv6reference
95  *    ipv6reference = "[" IPv6address "]"		(RFC2373)
96  *
97  * Since hostnames and numbers will have to be resolved by the OS anyway,
98  * we don't have to parse them too pedantically (counting '.'s, checking
99  * for well-formed literal IP addresses, etc.).
100  *
101  * In FTP/file paths, we reject most ";param"s and querys.  In HTTP paths,
102  * we just pass them through.
103  *
104  * Instead of letting a "path" be 0-or-more characters as RFC2396 suggests,
105  * we'll say it's 1-or-more characters, 0-or-1 times.  This way, an absent
106  * path yields a nil substring match, instead of an empty one.
107  *
108  * We're more restrictive than RFC2396 indicates with "userinfo" strings,
109  * insisting they have the form "[user[:password]]".  This may need to
110  * change at some point, however.
111  */
112 
/*
 * Character-class fragments.  Each one expands to text that is meant
 * to be placed between '[' and ']' in a regular expression.
 */
#define ALNUM		"a-zA-Z0-9"
#define HEX		"0-9a-fA-F"
#define PUNCT		"\\-_.!~*'()"
#define RES		";/?:@&=+$,"
#define UNRES		ALNUM PUNCT

/*
 * Complete sub-expressions.  The _N suffix records how many
 * parenthesized groups the expansion contains, which is needed when
 * working out the subexpression numbers stored in retab[].ind.
 */
#define ESCAPED_1	"(%[" HEX "][" HEX "])"
#define URIC_2		"([" RES UNRES "]|" ESCAPED_1 ")"
#define URICNOSLASH_2	"([" UNRES ";?:@&=+$,]|" ESCAPED_1 ")"
#define USERINFO_2	"([" UNRES ";:&=+$,]|" ESCAPED_1 ")"
#define PCHAR_2		"([" UNRES ":@&=+$,]|" ESCAPED_1 ")"
#define PSEGCHAR_3	"([/;]|" PCHAR_2 ")"
127 
128 typedef struct Retab Retab;
129 struct Retab
130 {
131 	char	*str;
132 	Reprog	*prog;
133 	int		size;
134 	int		ind[5];
135 };
136 
/*
 * Indices into retab[], one per expression, followed by the length
 * of the Resub arrays used to receive submatches.
 */
enum
{
	REsplit		= 0,
	REscheme,
	REunknowndata,
	REauthority,
	REhost,
	REuserinfo,
	REabspath,
	REquery,
	REfragment,
	REhttppath,
	REftppath,
	REfilepath,

	MaxResub	= 20,
};
154 
155 Retab retab[] =	/* view in constant width Font */
156 {
157 [REsplit]
158 	"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]+)?(\\?([^#]*))?(#(.*))?$", nil, 0,
159 	/* |-scheme-|      |-auth.-|  |path--|    |query|     |--|frag */
160 	{  2,              4,         5,          7,          9},
161 
162 [REscheme]
163 	"^[a-z][a-z0-9+-.]*$", nil, 0,
164 	{ 0, },
165 
166 [REunknowndata]
167 	"^" URICNOSLASH_2 URIC_2 "*$", nil, 0,
168 	{ 0, },
169 
170 [REauthority]
171 	"^(((" USERINFO_2 "*)@)?(((\\[[^\\]@]+\\])|([^:\\[@]+))(:([0-9]*))?)?)?$", nil, 0,
172 	/* |----user info-----|  |--------host----------------|  |-port-| */
173 	{  3,                    7,                              11, },
174 
175 [REhost]
176 	"^(([a-zA-Z0-9\\-.]+)|(\\[([a-fA-F0-9.:]+)\\]))$", nil, 0,
177 	/* |--regular host--|     |-IPv6 literal-| */
178 	{  2,                     4, },
179 
180 [REuserinfo]
181 	"^(([^:]*)(:([^:]*))?)$", nil, 0,
182 	/* |user-|  |pass-| */
183 	{  2,       4, },
184 
185 [REabspath]
186 	"^/" PSEGCHAR_3 "*$", nil, 0,
187 	{ 0, },
188 
189 [REquery]
190 	"^" URIC_2 "*$", nil, 0,
191 	{ 0, },
192 
193 [REfragment]
194 	"^" URIC_2 "*$", nil, 0,
195 	{ 0, },
196 
197 [REhttppath]
198 	"^.*$", nil, 0,
199 	{ 0, },
200 
201 [REftppath]
202 	"^(.+)(;[tT][yY][pP][eE]=([aAiIdD]))?$", nil, 0,
203 	/*|--|-path              |ftptype-| */
204 	{ 1,                     3, },
205 
206 [REfilepath]
207 	"^.*$", nil, 0,
208 	{ 0, },
209 };
210 
/*
 * Return the number of '(' characters in the NUL-terminated string s.
 * Every '(' is counted, whatever precedes it.
 */
static int
countleftparen(char *s)
{
	int count;
	char c;

	count = 0;
	while((c = *s++) != '\0')
		if(c == '(')
			count++;
	return count;
}
222 
223 void
initurl(void)224 initurl(void)
225 {
226 	int i, j;
227 
228 	for(i=0; i<nelem(retab); i++){
229 		retab[i].prog = regcomp(retab[i].str);
230 		if(retab[i].prog == nil)
231 			sysfatal("recomp(%s): %r", retab[i].str);
232 		retab[i].size = countleftparen(retab[i].str)+1;
233 		for(j=0; j<nelem(retab[i].ind); j++)
234 			if(retab[i].ind[j] >= retab[i].size)
235 				sysfatal("bad index in regexp table: retab[%d].ind[%d] = %d >= %d",
236 					i, j, retab[i].ind[j], retab[i].size);
237 		if(MaxResub < retab[i].size)
238 			sysfatal("MaxResub too small: %d < %d", MaxResub, retab[i].size);
239 	}
240 }
241 
/*
 * Result of splitting a URL string with the REsplit expression.
 * Each member marks a piece of the string as a start pointer s and
 * an end pointer e; spliturl copies them from the sp and ep fields
 * of the corresponding submatch.
 */
typedef struct SplitUrl SplitUrl;
struct SplitUrl
{
	struct {
		char *s;
		char *e;
	}	url,
		scheme,
		authority,
		path,
		query,
		fragment;
};
250 
251 /*
252  * Implements the algorithm in RFC2396 sec 5.2 step 6.
253  * Returns number of chars written, excluding NUL terminator.
254  * dest is known to be >= strlen(base)+rel_len.
255  */
256 static void
merge_relative_path(char * base,char * rel_st,int rel_len,char * dest)257 merge_relative_path(char *base, char *rel_st, int rel_len, char *dest)
258 {
259 	char *s, *p, *e, *pdest;
260 
261 	pdest = dest;
262 
263 	/* 6a: start with base, discard last segment */
264 	if(base && base[0]){
265 		/* Empty paths don't match in our scheme; 'base' should be nil */
266 		assert(base[0] == '/');
267 		e = strrchr(base, '/');
268 		e++;
269 		memmove(pdest, base, e-base);
270 		pdest += e-base;
271 	}else{
272 		/* Artistic license on my part */
273 		*pdest++ = '/';
274 	}
275 
276 	/* 6b: append relative component */
277 	if(rel_st){
278 		memmove(pdest, rel_st, rel_len);
279 		pdest += rel_len;
280 	}
281 
282 	/* 6c: remove any occurrences of "./" as a complete segment */
283 	s = dest;
284 	*pdest = '\0';
285 	while(e = strstr(s, "./")){
286 		if((e == dest) || (*(e-1) == '/')){
287  			memmove(e, e+2, pdest+1-(e+2));	/* +1 for NUL */
288 			pdest -= 2;
289 		}else
290 			s = e+1;
291 	}
292 
293 	/* 6d: remove a trailing "." as a complete segment */
294 	if(pdest>dest && *(pdest-1)=='.' &&
295 	  (pdest==dest+1 || *(pdest-2)=='/'))
296 		*--pdest = '\0';
297 
298 	/* 6e: remove occurences of "seg/../", where seg != "..", left->right */
299 	s = dest+1;
300 	while(e = strstr(s, "/../")){
301 		p = e - 1;
302 		while(p >= dest && *p != '/')
303 			p--;
304 		if(memcmp(p, "/../", 4) != 0){
305 			memmove(p+1, e+4, pdest+1-(e+4));
306 			pdest -= (e+4) - (p+1);
307 		}else
308 			s = e+1;
309 	}
310 
311 	/* 6f: remove a trailing "seg/..", where seg isn't ".."  */
312 	if(pdest-3 > dest && memcmp(pdest-3, "/..", 3)==0){
313 		p = pdest-3 - 1;
314 		while(p >= dest && *p != '/')
315 			p--;
316 		if(memcmp(p, "/../", 4) != 0){
317 			pdest = p+1;
318 			*pdest = '\0';
319 		}
320 	}
321 
322 	/* 6g: leading ".." segments are errors -- we'll just blat them out. */
323 	if(RemoveExtraRelDotDots){
324 		p = dest;
325 		if (p[0] == '/')
326 			p++;
327 		s = p;
328 		while(s[0]=='.' && s[1]=='.' && (s[2]==0 || s[2]=='/'))
329 			s += 3;
330 		if(s > p){
331 			memmove(p, s, pdest+1-s);
332 			pdest -= s-p;
333 		}
334 	}
335 	USED(pdest);
336 
337 	if(urldebug)
338 		fprint(2, "merge_relative_path: '%s' + '%.*s' -> '%s'\n", base, rel_len,
339 			rel_st, dest);
340 }
341 
342 /*
343  * See RFC2396 sec 5.2 for info on resolving relative URIs to absolute form.
344  *
345  * If successful, this just ends up freeing and replacing "u->url".
346  */
347 static int
resolve_relative(SplitUrl * su,Url * base,Url * u)348 resolve_relative(SplitUrl *su, Url *base, Url *u)
349 {
350 	char *url, *path;
351 	char *purl, *ppath;
352 	int currentdoc, ulen, plen;
353 
354 	if(base == nil){
355 		werrstr("relative URI given without base");
356 		return -1;
357 	}
358 	if(base->scheme == nil){
359 		werrstr("relative URI given with no scheme");
360 		return -1;
361 	}
362 	if(base->ischeme == USunknown){
363 		werrstr("relative URI given with unknown scheme");
364 		return -1;
365 	}
366 	if(base->ischeme == UScurrent){
367 		werrstr("relative URI given with incomplete base");
368 		return -1;
369 	}
370 	assert(su->scheme.s == nil);
371 
372 	/* Sec 5.2 step 2 */
373 	currentdoc = 0;
374 	if(su->path.s==nil && su->scheme.s==nil && su->authority.s==nil && su->query.s==nil){
375 		/* Reference is to current document */
376 		if(urldebug)
377 			fprint(2, "url %s is relative to current document\n", u->url);
378 		u->ischeme = UScurrent;
379 		if(!ExpandCurrentDocUrls)
380 			return 0;
381 		currentdoc = 1;
382 	}
383 
384 	/* Over-estimate the maximum lengths, for allocation purposes */
385 	/* (constants are for separators) */
386 	plen = 1;
387 	if(base->path)
388 		plen += strlen(base->path);
389 	if(su->path.s)
390 		plen += 1 + (su->path.e - su->path.s);
391 
392 	ulen = 0;
393 	ulen += strlen(base->scheme) + 1;
394 	if(su->authority.s)
395 		ulen += 2 + (su->authority.e - su->authority.s);
396 	else
397 		ulen += 2 + ((base->authority) ? strlen(base->authority) : 0);
398 	ulen += plen;
399 	if(su->query.s)
400 		ulen += 1 + (su->query.e - su->query.s);
401 	else if(currentdoc && base->query)
402 		ulen += 1 + strlen(base->query);
403 	if(su->fragment.s)
404 		ulen += 1 + (su->fragment.e - su->fragment.s);
405 	else if(currentdoc && base->fragment)
406 		ulen += 1 + strlen(base->fragment);
407 	url = emalloc(ulen+1);
408 	path = emalloc(plen+1);
409 
410 	url[0] = '\0';
411 	purl = url;
412 	path[0] = '\0';
413 	ppath = path;
414 
415 	if(su->authority.s || (su->path.s && (su->path.s[0] == '/'))){
416 		/* Is a "network-path" or "absolute-path"; don't merge with base path */
417 		/* Sec 5.2 steps 4,5 */
418 		if(su->path.s){
419 			memmove(ppath, su->path.s, su->path.e - su->path.s);
420 			ppath += su->path.e - su->path.s;
421 			*ppath = '\0';
422 		}
423 	}else if(currentdoc){
424 		/* Is a current-doc reference; just copy the path from the base URL */
425 		if(base->path){
426 			strcpy(ppath, base->path);
427 			ppath += strlen(ppath);
428 		}
429 		USED(ppath);
430 	}else{
431 		/* Is a relative-path reference; we have to merge it */
432 		/* Sec 5.2 step 6 */
433 		merge_relative_path(base->path,
434 			su->path.s, su->path.e - su->path.s, ppath);
435 	}
436 
437 	/* Build new URL from pieces, inheriting from base where needed */
438 	strcpy(purl, base->scheme);
439 	purl += strlen(purl);
440 	*purl++ = ':';
441 	if(su->authority.s){
442 		strcpy(purl, "//");
443 		purl += strlen(purl);
444 		memmove(purl, su->authority.s, su->authority.e - su->authority.s);
445 		purl += su->authority.e - su->authority.s;
446 	}else if(base->authority){
447 		strcpy(purl, "//");
448 		purl += strlen(purl);
449 		strcpy(purl, base->authority);
450 		purl += strlen(purl);
451 	}
452 	assert((path[0] == '\0') || (path[0] == '/'));
453 	strcpy(purl, path);
454 	purl += strlen(purl);
455 
456 	/*
457 	 * The query and fragment are not inherited from the base,
458 	 * except in case of "current document" URLs, which inherit any query
459 	 * and may inherit the fragment.
460 	 */
461 	if(su->query.s){
462 		*purl++ = '?';
463 		memmove(purl, su->query.s, su->query.e - su->query.s);
464 		purl += su->query.e - su->query.s;
465 	}else if(currentdoc && base->query){
466 		*purl++ = '?';
467 		strcpy(purl, base->query);
468 		purl += strlen(purl);
469 	}
470 
471 	if(su->fragment.s){
472 		*purl++ = '#';
473 		memmove(purl, su->query.s, su->query.e - su->query.s);
474 		purl += su->fragment.e - su->fragment.s;
475 	}else if(currentdoc && base->fragment){
476 		*purl++ = '#';
477 		strcpy(purl, base->fragment);
478 		purl += strlen(purl);
479 	}
480 	USED(purl);
481 
482 	if(urldebug)
483 		fprint(2, "resolve_relative: '%s' + '%s' -> '%s'\n", base->url, u->url, url);
484 	free(u->url);
485 	u->url = url;
486 	free(path);
487 	return 0;
488 }
489 
490 int
regx(Reprog * prog,char * s,Resub * m,int nm)491 regx(Reprog *prog, char *s, Resub *m, int nm)
492 {
493 	int i;
494 
495 	if(s == nil)
496 		s = m[0].sp;	/* why is this necessary? */
497 
498 	i = regexec(prog, s, m, nm);
499 /*
500 	if(i >= 0)
501 		for(j=0; j<nm; j++)
502 			fprint(2, "match%d: %.*s\n", j, utfnlen(m[j].sp, m[j].ep-m[j].sp), m[j].sp);
503 */
504 	return i;
505 }
506 
507 static int
ismatch(int i,char * s,char * desc)508 ismatch(int i, char *s, char *desc)
509 {
510 	Resub m[1];
511 
512 	m[0].sp = m[0].ep = nil;
513 	if(!regx(retab[i].prog, s, m, 1)){
514 		werrstr("malformed %s: %q", desc, s);
515 		return 0;
516 	}
517 	return 1;
518 }
519 
520 static int
spliturl(char * url,SplitUrl * su)521 spliturl(char *url, SplitUrl *su)
522 {
523 	Resub m[MaxResub];
524 	Retab *t;
525 
526 	/*
527 	 * Newlines are not valid in a URI, but regexp(2) treats them specially
528 	 * so it's best to make sure there are none before proceeding.
529 	 */
530 	if(strchr(url, '\n')){
531 		werrstr("newline in URI");
532 		return -1;
533 	}
534 
535 	/*
536 	 * Because we use NUL-terminated strings, as do many client and server
537 	 * implementations, an escaped NUL ("%00") will quite likely cause problems
538 	 * when unescaped.  We can check for such a sequence once before examining
539  	 * the components because, per RFC2396 sec. 2.4.1 - 2.4.2, '%' is reserved
540 	 * in URIs to _always_ indicate escape sequences.  Something like "%2500"
541 	 * will still get by, but that's legitimate, and if it ends up causing
542 	 * a NUL then someone is unescaping too many times.
543 	 */
544 	if(strstr(url, "%00")){
545 		werrstr("escaped NUL in URI");
546 		return -1;
547 	}
548 
549 	m[0].sp = m[0].ep = nil;
550 	t = &retab[REsplit];
551 	if(!regx(t->prog, url, m, t->size)){
552 		werrstr("malformed URI: %q", url);
553 		return -1;
554 	}
555 
556 	su->url.s = m[0].sp;
557 	su->url.e = m[0].ep;
558 	su->scheme.s = m[t->ind[0]].sp;
559 	su->scheme.e = m[t->ind[0]].ep;
560 	su->authority.s = m[t->ind[1]].sp;
561 	su->authority.e = m[t->ind[1]].ep;
562 	su->path.s = m[t->ind[2]].sp;
563 	su->path.e = m[t->ind[2]].ep;
564 	su->query.s = m[t->ind[3]].sp;
565 	su->query.e = m[t->ind[3]].ep;
566 	su->fragment.s = m[t->ind[4]].sp;
567 	su->fragment.e = m[t->ind[4]].ep;
568 
569 	if(urldebug)
570 		fprint(2, "split url %s into %.*q %.*q %.*q %.*q %.*q %.*q\n",
571 			url,
572 			su->url.s ? utfnlen(su->url.s, su->url.e-su->url.s) : 10, su->url.s ? su->url.s : "",
573 			su->scheme.s ? utfnlen(su->scheme.s, su->scheme.e-su->scheme.s) : 10, su->scheme.s ? su->scheme.s : "",
574 			su->authority.s ? utfnlen(su->authority.s, su->authority.e-su->authority.s) : 10, su->authority.s ? su->authority.s : "",
575 			su->path.s ? utfnlen(su->path.s, su->path.e-su->path.s) : 10, su->path.s ? su->path.s : "",
576 			su->query.s ? utfnlen(su->query.s, su->query.e-su->query.s) : 10, su->query.s ? su->query.s : "",
577 			su->fragment.s ? utfnlen(su->fragment.s, su->fragment.e-su->fragment.s) : 10, su->fragment.s ? su->fragment.s : "");
578 
579 	return 0;
580 }
581 
582 static int
parse_scheme(SplitUrl * su,Url * u)583 parse_scheme(SplitUrl *su, Url *u)
584 {
585 	if(su->scheme.s == nil){
586 		werrstr("missing scheme");
587 		return -1;
588 	}
589 	u->scheme = estredup(su->scheme.s, su->scheme.e);
590 	strlower(u->scheme);
591 
592 	if(!ismatch(REscheme, u->scheme, "scheme"))
593 		return -1;
594 
595 	u->ischeme = ischeme(u->scheme);
596 	if(urldebug)
597 		fprint(2, "parse_scheme %s => %d\n", u->scheme, u->ischeme);
598 	return 0;
599 }
600 
601 static int
parse_unknown_part(SplitUrl * su,Url * u)602 parse_unknown_part(SplitUrl *su, Url *u)
603 {
604 	char *s, *e;
605 
606 	assert(u->ischeme == USunknown);
607 	assert(su->scheme.e[0] == ':');
608 
609 	s = su->scheme.e+1;
610 	if(su->fragment.s){
611 		e = su->fragment.s-1;
612 		assert(*e == '#');
613 	}else
614 		e = s+strlen(s);
615 
616 	u->schemedata = estredup(s, e);
617 	if(!ismatch(REunknowndata, u->schemedata, "unknown scheme data"))
618 		return -1;
619 	return 0;
620 }
621 
622 static int
parse_userinfo(char * s,char * e,Url * u)623 parse_userinfo(char *s, char *e, Url *u)
624 {
625 	Resub m[MaxResub];
626 	Retab *t;
627 
628 	m[0].sp = s;
629 	m[0].ep = e;
630 	t = &retab[REuserinfo];
631 	if(!regx(t->prog, nil, m, t->size)){
632 		werrstr("malformed userinfo: %.*q", utfnlen(s, e-s), s);
633 		return -1;
634 	}
635 	if(m[t->ind[0]].sp)
636 		u->user = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
637 	if(m[t->ind[1]].sp)
638 		u->user = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
639 	return 0;
640 }
641 
642 static int
parse_host(char * s,char * e,Url * u)643 parse_host(char *s, char *e, Url *u)
644 {
645 	Resub m[MaxResub];
646 	Retab *t;
647 
648 	m[0].sp = s;
649 	m[0].ep = e;
650 	t = &retab[REhost];
651 	if(!regx(t->prog, nil, m, t->size)){
652 		werrstr("malformed host: %.*q", utfnlen(s, e-s), s);
653 		return -1;
654 	}
655 
656 	assert(m[t->ind[0]].sp || m[t->ind[1]].sp);
657 
658 	if(m[t->ind[0]].sp)	/* regular */
659 		u->host = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
660 	else
661 		u->host = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
662 	return 0;
663 }
664 
665 static int
parse_authority(SplitUrl * su,Url * u)666 parse_authority(SplitUrl *su, Url *u)
667 {
668 	Resub m[MaxResub];
669 	Retab *t;
670 	char *host;
671 	char *userinfo;
672 
673 	if(su->authority.s == nil)
674 		return 0;
675 
676 	u->authority = estredup(su->authority.s, su->authority.e);
677 	m[0].sp = m[0].ep = nil;
678 	t = &retab[REauthority];
679 	if(!regx(t->prog, u->authority, m, t->size)){
680 		werrstr("malformed authority: %q", u->authority);
681 		return -1;
682 	}
683 
684 	if(m[t->ind[0]].sp)
685 		if(parse_userinfo(m[t->ind[0]].sp, m[t->ind[0]].ep, u) < 0)
686 			return -1;
687 	if(m[t->ind[1]].sp)
688 		if(parse_host(m[t->ind[1]].sp, m[t->ind[1]].ep, u) < 0)
689 			return -1;
690 	if(m[t->ind[2]].sp)
691 		u->port = estredup(m[t->ind[2]].sp, m[t->ind[2]].ep);
692 
693 
694 	if(urldebug > 0){
695 		userinfo = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
696 		host = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
697 		fprint(2, "port: %q, authority %q\n", u->port, u->authority);
698 		fprint(2, "host %q, userinfo %q\n", host, userinfo);
699 		free(host);
700 		free(userinfo);
701 	}
702 	return 0;
703 }
704 
705 static int
parse_abspath(SplitUrl * su,Url * u)706 parse_abspath(SplitUrl *su, Url *u)
707 {
708 	if(su->path.s == nil)
709 		return 0;
710 	u->path = estredup(su->path.s, su->path.e);
711 	if(!ismatch(REabspath, u->path, "absolute path"))
712 		return -1;
713 	return 0;
714 }
715 
716 static int
parse_query(SplitUrl * su,Url * u)717 parse_query(SplitUrl *su, Url *u)
718 {
719 	if(su->query.s == nil)
720 		return 0;
721 	u->query = estredup(su->query.s, su->query.e);
722 	if(!ismatch(REquery, u->query, "query"))
723 		return -1;
724 	return 0;
725 }
726 
727 static int
parse_fragment(SplitUrl * su,Url * u)728 parse_fragment(SplitUrl *su, Url *u)
729 {
730 	if(su->fragment.s == nil)
731 		return 0;
732 	u->fragment = estredup(su->fragment.s, su->fragment.e);
733 	if(!ismatch(REfragment, u->fragment, "fragment"))
734 		return -1;
735 	return 0;
736 }
737 
738 static int
postparse_http(Url * u)739 postparse_http(Url *u)
740 {
741 	u->open = httpopen;
742 	u->read = httpread;
743 	u->close = httpclose;
744 
745 	if(u->authority==nil){
746 		werrstr("missing authority (hostname, port, etc.)");
747 		return -1;
748 	}
749 	if(u->host == nil){
750 		werrstr("missing host specification");
751 		return -1;
752 	}
753 
754 	if(u->path == nil){
755 		u->http.page_spec = estrdup("/");
756 		return 0;
757 	}
758 
759 	if(!ismatch(REhttppath, u->path, "http path"))
760 		return -1;
761 	if(u->query){
762 		u->http.page_spec = emalloc(strlen(u->path)+1+strlen(u->query)+1);
763 		strcpy(u->http.page_spec, u->path);
764 		strcat(u->http.page_spec, "?");
765 		strcat(u->http.page_spec, u->query);
766 	}else
767 		u->http.page_spec = estrdup(u->path);
768 
769 	return 0;
770 }
771 
772 static int
postparse_ftp(Url * u)773 postparse_ftp(Url *u)
774 {
775 	Resub m[MaxResub];
776 	Retab *t;
777 
778 	if(u->authority==nil){
779 		werrstr("missing authority (hostname, port, etc.)");
780 		return -1;
781 	}
782 	if(u->query){
783 		werrstr("unexpected \"?query\" in ftp path");
784 		return -1;
785 	}
786 	if(u->host == nil){
787 		werrstr("missing host specification");
788 		return -1;
789 	}
790 
791 	if(u->path == nil){
792 		u->ftp.path_spec = estrdup("/");
793 		return 0;
794 	}
795 
796 	m[0].sp = m[0].ep = nil;
797 	t = &retab[REftppath];
798 	if(!regx(t->prog, u->path, m, t->size)){
799 		werrstr("malformed ftp path: %q", u->path);
800 		return -1;
801 	}
802 
803 	if(m[t->ind[0]].sp){
804 		u->ftp.path_spec = estredup(m[t->ind[0]].sp, m[t->ind[0]].ep);
805 		if(strchr(u->ftp.path_spec, ';')){
806 			werrstr("unexpected \";param\" in ftp path");
807 			return -1;
808 		}
809 	}else
810 		u->ftp.path_spec = estrdup("/");
811 
812 	if(m[t->ind[1]].sp){
813 		u->ftp.type = estredup(m[t->ind[1]].sp, m[t->ind[1]].ep);
814 		strlower(u->ftp.type);
815 	}
816 	return 0;
817 }
818 
819 static int
postparse_file(Url * u)820 postparse_file(Url *u)
821 {
822 	if(u->user || u->passwd){
823 		werrstr("user information not valid with file scheme");
824 		return -1;
825 	}
826 	if(u->query){
827 		werrstr("unexpected \"?query\" in file path");
828 		return -1;
829 	}
830 	if(u->port){
831 		werrstr("port not valid with file scheme");
832 		return -1;
833 	}
834 	if(u->path == nil){
835 		werrstr("missing path in file scheme");
836 		return -1;
837 	}
838 	if(strchr(u->path, ';')){
839 		werrstr("unexpected \";param\" in file path");
840 		return -1;
841 	}
842 
843 	if(!ismatch(REfilepath, u->path, "file path"))
844 		return -1;
845 
846 	/* "localhost" is equivalent to no host spec, we'll chose the latter */
847 	if(u->host && cistrcmp(u->host, "localhost") == 0){
848 		free(u->host);
849 		u->host = nil;
850 	}
851 	return 0;
852 }
853 
854 static int (*postparse[])(Url*) = {
855 	nil,
856 	postparse_http,
857 	postparse_http,
858 	postparse_ftp,
859 	postparse_file,
860 };
861 
862 Url*
parseurl(char * url,Url * base)863 parseurl(char *url, Url *base)
864 {
865 	Url *u;
866 	SplitUrl su;
867 
868 	if(urldebug)
869 		fprint(2, "parseurl %s with base %s\n", url, base ? base->url : "<none>");
870 
871 	u = emalloc(sizeof(Url));
872 	u->url = estrdup(url);
873 	if(spliturl(u->url, &su) < 0){
874 	Fail:
875 		freeurl(u);
876 		return nil;
877 	}
878 
879 	/* RFC2396 sec 3.1 says relative URIs are distinguished by absent scheme */
880 	if(su.scheme.s==nil){
881 		if(urldebug)
882 			fprint(2, "parseurl has nil scheme\n");
883 		if(resolve_relative(&su, base, u) < 0 || spliturl(u->url, &su) < 0)
884 			goto Fail;
885 		if(u->ischeme == UScurrent){
886 			/* 'u.url' refers to current document; set fragment and return */
887 			if(parse_fragment(&su, u) < 0)
888 				goto Fail;
889 			return u;
890 		}
891 	}
892 
893 	if(parse_scheme(&su, u) < 0
894 	|| parse_fragment(&su, u) < 0)
895 		goto Fail;
896 
897 	if(u->ischeme == USunknown){
898 		if(parse_unknown_part(&su, u) < 0)
899 			goto Fail;
900 		return u;
901 	}
902 
903 	if(parse_query(&su, u) < 0
904 	|| parse_authority(&su, u) < 0
905 	|| parse_abspath(&su, u) < 0)
906 		goto Fail;
907 
908 	if(u->ischeme < nelem(postparse) && postparse[u->ischeme])
909 		if((*postparse[u->ischeme])(u) < 0)
910 			goto Fail;
911 
912 	setmalloctag(u, getcallerpc(&url));
913 	return u;
914 }
915 
916 void
freeurl(Url * u)917 freeurl(Url *u)
918 {
919 	if(u == nil)
920 		return;
921 	free(u->url);
922 	free(u->scheme);
923 	free(u->schemedata);
924 	free(u->authority);
925 	free(u->user);
926 	free(u->passwd);
927 	free(u->host);
928 	free(u->port);
929 	free(u->path);
930 	free(u->query);
931 	free(u->fragment);
932 	switch(u->ischeme){
933 	case UShttp:
934 		free(u->http.page_spec);
935 		break;
936 	case USftp:
937 		free(u->ftp.path_spec);
938 		free(u->ftp.type);
939 		break;
940 	}
941 	free(u);
942 }
943 
944 void
rewriteurl(Url * u)945 rewriteurl(Url *u)
946 {
947 	char *s;
948 
949 	if(u->schemedata)
950 		s = estrmanydup(u->scheme, ":", u->schemedata, nil);
951 	else
952 		s = estrmanydup(u->scheme, "://",
953 			u->user ? u->user : "",
954 			u->passwd ? ":" : "", u->passwd ? u->passwd : "",
955 			u->user ? "@" : "", u->host ? u->host : "",
956 			u->port ? ":" : "", u->port ? u->port : "",
957 			u->path,
958 			u->query ? "?" : "", u->query ? u->query : "",
959 			u->fragment ? "#" : "", u->fragment ? u->fragment : "",
960 			nil);
961 	free(u->url);
962 	u->url = s;
963 }
964 
965 int
seturlquery(Url * u,char * query)966 seturlquery(Url *u, char *query)
967 {
968 	if(query == nil){
969 		free(u->query);
970 		u->query = nil;
971 		return 0;
972 	}
973 
974 	if(!ismatch(REquery, query, "query"))
975 		return -1;
976 
977 	free(u->query);
978 	u->query = estrdup(query);
979 	return 0;
980 }
981 
982 static void
dupp(char ** p)983 dupp(char **p)
984 {
985 	if(*p)
986 		*p = estrdup(*p);
987 }
988 
989 Url*
copyurl(Url * u)990 copyurl(Url *u)
991 {
992 	Url *v;
993 
994 	v = emalloc(sizeof(Url));
995 	*v = *u;
996 	dupp(&v->url);
997 	dupp(&v->scheme);
998 	dupp(&v->schemedata);
999 	dupp(&v->authority);
1000 	dupp(&v->user);
1001 	dupp(&v->passwd);
1002 	dupp(&v->host);
1003 	dupp(&v->port);
1004 	dupp(&v->path);
1005 	dupp(&v->query);
1006 	dupp(&v->fragment);
1007 
1008 	switch(v->ischeme){
1009 	case UShttp:
1010 		dupp(&v->http.page_spec);
1011 		break;
1012 	case USftp:
1013 		dupp(&v->ftp.path_spec);
1014 		dupp(&v->ftp.type);
1015 		break;
1016 	}
1017 	return v;
1018 }
1019 
/*
 * Return the value (0-15) of the hexadecimal digit c, accepting both
 * upper and lower case.  Any other character yields 0.
 */
static int
dhex(char c)
{
	int v;

	v = 0;
	if(c >= 'a' && c <= 'f')
		v = 10 + (c - 'a');
	else if(c >= 'A' && c <= 'F')
		v = 10 + (c - 'A');
	else if(c >= '0' && c <= '9')
		v = c - '0';
	return v;
}
1031 
1032 char*
escapeurl(char * s,int (* needesc)(int))1033 escapeurl(char *s, int (*needesc)(int))
1034 {
1035 	int n;
1036 	char *t, *u;
1037 	Rune r;
1038 	static char *hex = "0123456789abcdef";
1039 
1040 	n = 0;
1041 	for(t=s; *t; t++)
1042 		if((*needesc)(*t))
1043 			n++;
1044 
1045 	u = emalloc(strlen(s)+2*n+1);
1046 	t = u;
1047 	for(; *s; s++){
1048 		s += chartorune(&r, s);
1049 		if(r >= 0xFF){
1050 			werrstr("URLs cannot contain Runes > 0xFF");
1051 			free(t);
1052 			return nil;
1053 		}
1054 		if((*needesc)(r)){
1055 			*u++ = '%';
1056 			*u++ = hex[(r>>4)&0xF];
1057 			*u++ = hex[r&0xF];
1058 		}else
1059 			*u++ = r;
1060 	}
1061 	*u = '\0';
1062 	return t;
1063 }
1064 
1065 char*
unescapeurl(char * s)1066 unescapeurl(char *s)
1067 {
1068 	char *r, *w;
1069 	Rune rune;
1070 
1071 	s = estrdup(s);
1072 	for(r=w=s; *r; r++){
1073 		if(*r=='%'){
1074 			r++;
1075 			if(!isxdigit(r[0]) || !isxdigit(r[1])){
1076 				werrstr("bad escape sequence '%.3s' in URL", r);
1077 				return nil;
1078 			}
1079 			if(r[0]=='0' && r[2]=='0'){
1080 				werrstr("escaped NUL in URL");
1081 				return nil;
1082 			}
1083 			rune = (dhex(r[0])<<4)|dhex(r[1]);	/* latin1 */
1084 			w += runetochar(w, &rune);
1085 			r += 2;
1086 		}else
1087 			*w++ = *r;
1088 	}
1089 	*w = '\0';
1090 	return s;
1091 }
1092 
1093