xref: /netbsd-src/external/mpl/dhcp/bind/dist/lib/isc/url.c (revision 4afad4b7fa6d4a0d3dedf41d1587a7250710ae54)
/*	$NetBSD: url.c,v 1.1 2024/02/18 20:57:51 christos Exp $	*/

/*
 * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
 *
 * SPDX-License-Identifier: MPL-2.0 and MIT
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, you can obtain one at https://mozilla.org/MPL/2.0/.
 *
 * See the COPYRIGHT file distributed with this work for additional
 * information regarding copyright ownership.
 */
15 
/*
 * Copyright Joyent, Inc. and other Node contributors. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
37 
#include <ctype.h>
#include <limits.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <isc/url.h>
#include <isc/util.h>
45 
/*
 * BIT_AT(a, i) evaluates to 1 when bit 'i' of the byte-array bitmap 'a'
 * is set and to 0 otherwise.  Bit 'i' lives in byte (i >> 3) at bit
 * position (i & 7), least significant bit first.
 */
#ifndef BIT_AT
#define BIT_AT(a, i)                                    \
	(!!((unsigned int)(a)[(unsigned int)(i) >> 3] & \
	    (1U << ((unsigned int)(i)&7))))
#endif

/*
 * T(v) wraps the table bits that are only accepted in non-strict mode:
 * when HTTP_PARSER_STRICT is non-zero they collapse to 0, otherwise they
 * keep their value.  The macro is local to the table below and is
 * undefined again right after it.
 */
#if HTTP_PARSER_STRICT
#define T(v) 0
#else
#define T(v) v
#endif

/*
 * Bitmap over the 7-bit ASCII range (32 bytes * 8 bits = 256 bits, of
 * which only the first 128 are populated here): a set bit means the
 * character is accepted as an ordinary URL character by IS_URL_CHAR().
 * Each initializer row covers eight consecutive characters; the comment
 * above a row names them in bit order (1, 2, 4, ..., 128).
 * Note that '#' and '?' are cleared: they are handled explicitly as
 * delimiters by the URL state machine.
 */
static const uint8_t normal_url_char[32] = {
	/*   0 nul  1 soh  2 stx  3 etx  4 eot  5 enq  6 ack  7 bel  */
	0 | 0 | 0 | 0 | 0 | 0 | 0 | 0,
	/*   8 bs   9 ht  10 nl  11 vt  12 np  13 cr  14 so  15 si */
	0 | T(2) | 0 | 0 | T(16) | 0 | 0 | 0,
	/*  16 dle 17 dc1 18 dc2 19 dc3 20 dc4 21 nak 22 syn 23 etb */
	0 | 0 | 0 | 0 | 0 | 0 | 0 | 0,
	/*  24 can 25 em  26 sub 27 esc 28 fs  29 gs  30 rs  31 us */
	0 | 0 | 0 | 0 | 0 | 0 | 0 | 0,
	/*  32 sp  33  !  34  "  35  #  36  $  37  %  38  &  39  ' */
	0 | 2 | 4 | 0 | 16 | 32 | 64 | 128,
	/*  40  (  41  )  42  *  43  +  44  ,  45  -  46  .  47  / */
	1 | 2 | 4 | 8 | 16 | 32 | 64 | 128,
	/*  48  0  49  1  50  2  51  3  52  4  53  5  54  6  55  7 */
	1 | 2 | 4 | 8 | 16 | 32 | 64 | 128,
	/*  56  8  57  9  58  :  59  ;  60  <  61  =  62  >  63  ?  */
	1 | 2 | 4 | 8 | 16 | 32 | 64 | 0,
	/*  64  @  65  A  66  B  67  C  68  D  69  E  70  F  71  G */
	1 | 2 | 4 | 8 | 16 | 32 | 64 | 128,
	/*  72  H  73  I  74  J  75  K  76  L  77  M  78  N  79  O */
	1 | 2 | 4 | 8 | 16 | 32 | 64 | 128,
	/*  80  P  81  Q  82  R  83  S  84  T  85  U  86  V  87  W */
	1 | 2 | 4 | 8 | 16 | 32 | 64 | 128,
	/*  88  X  89  Y  90  Z  91  [  92  \  93  ]  94  ^  95  _ */
	1 | 2 | 4 | 8 | 16 | 32 | 64 | 128,
	/*  96  `  97  a  98  b  99  c 100  d 101  e 102  f 103  g */
	1 | 2 | 4 | 8 | 16 | 32 | 64 | 128,
	/* 104  h 105  i 106  j 107  k 108  l 109  m 110  n 111  o */
	1 | 2 | 4 | 8 | 16 | 32 | 64 | 128,
	/* 112  p 113  q 114  r 115  s 116  t 117  u 118  v 119  w */
	1 | 2 | 4 | 8 | 16 | 32 | 64 | 128,
	/* 120  x 121  y 122  z 123  { 124  | 125  } 126  ~ 127 del */
	1 | 2 | 4 | 8 | 16 | 32 | 64 | 0,
};

#undef T
94 
/*
 * Parser states.  Only the s_req_* URL states (s_req_spaces_before_url
 * through s_req_fragment) and s_dead are acted upon by parse_url_char()
 * in this file; the remaining enumerators are kept so that the numeric
 * values stay in the same order as in the full HTTP parser this code was
 * derived from.
 */
typedef enum {
	s_dead = 1, /* important that this is > 0 */

	/* Response status line */
	s_start_req_or_res,
	s_res_or_resp_H,
	s_start_res,
	s_res_H,
	s_res_HT,
	s_res_HTT,
	s_res_HTTP,
	s_res_http_major,
	s_res_http_dot,
	s_res_http_minor,
	s_res_http_end,
	s_res_first_status_code,
	s_res_status_code,
	s_res_status_start,
	s_res_status,
	s_res_line_almost_done,

	s_start_req,

	/* Request line, including the URL states used by this file */
	s_req_method,
	s_req_spaces_before_url,
	s_req_schema,
	s_req_schema_slash,
	s_req_schema_slash_slash,
	s_req_server_start,
	s_req_server,
	s_req_server_with_at,
	s_req_path,
	s_req_query_string_start,
	s_req_query_string,
	s_req_fragment_start,
	s_req_fragment,
	s_req_http_start,
	s_req_http_H,
	s_req_http_HT,
	s_req_http_HTT,
	s_req_http_HTTP,
	s_req_http_I,
	s_req_http_IC,
	s_req_http_major,
	s_req_http_dot,
	s_req_http_minor,
	s_req_http_end,
	s_req_line_almost_done,

	/* Header fields */
	s_header_field_start,
	s_header_field,
	s_header_value_discard_ws,
	s_header_value_discard_ws_almost_done,
	s_header_value_discard_lws,
	s_header_value_start,
	s_header_value,
	s_header_value_lws,

	s_header_almost_done,

	/* Chunked transfer-encoding size line */
	s_chunk_size_start,
	s_chunk_size,
	s_chunk_parameters,
	s_chunk_size_almost_done,

	s_headers_almost_done,
	s_headers_done,

	/*
	 * Important: 's_headers_done' must be the last 'header' state. All
	 * states beyond this must be 'body' states. It is used for overflow
	 * checking by the PARSING_HEADER() macro of the original HTTP
	 * parser (that macro is not defined in this file).
	 */

	s_chunk_data,
	s_chunk_data_almost_done,
	s_chunk_data_done,

	s_body_identity,
	s_body_identity_eof,

	s_message_done
} state_t;
177 
/*
 * States of the authority ("userinfo@host:port") sub-parser implemented
 * by http_parse_host_char() and driven by http_parse_host().
 */
typedef enum {
	s_http_host_dead = 1,	   /* invalid input; parsing failed */
	s_http_userinfo_start,	   /* before the first userinfo character */
	s_http_userinfo,	   /* inside userinfo */
	s_http_host_start,	   /* before the first host character */
	s_http_host_v6_start,	   /* just after '[' */
	s_http_host,		   /* inside a name / IPv4 host */
	s_http_host_v6,		   /* inside a bracketed IPv6 literal */
	s_http_host_v6_end,	   /* just after ']' */
	s_http_host_v6_zone_start, /* just after '%' in an IPv6 literal */
	s_http_host_v6_zone,	   /* inside an IPv6 zone identifier */
	s_http_host_port_start,	   /* just after ':' */
	s_http_host_port	   /* inside the port digits */
} host_state_t;
192 
/*
 * Macros for character classes; IS_URL_CHAR() and IS_HOST_CHAR() depend
 * on strict mode.
 *
 * All of these evaluate their argument more than once, so it must not
 * have side effects.  The argument is fully parenthesized (including
 * inside the casts to unsigned char required by the <ctype.h> functions)
 * so that compound expressions are classified as a whole.
 */

/* "mark" punctuation allowed unescaped in userinfo */
#define IS_MARK(c)                                                             \
	((c) == '-' || (c) == '_' || (c) == '.' || (c) == '!' || (c) == '~' || \
	 (c) == '*' || (c) == '\'' || (c) == '(' || (c) == ')')

/* Characters accepted in the userinfo part (and, loosely, the authority) */
#define IS_USERINFO_CHAR(c)                                                    \
	(isalnum((unsigned char)(c)) || IS_MARK(c) || (c) == '%' ||            \
	 (c) == ';' || (c) == ':' || (c) == '&' || (c) == '=' || (c) == '+' || \
	 (c) == '$' || (c) == ',')

#if HTTP_PARSER_STRICT
/* Strict mode: only characters flagged in normal_url_char[] */
#define IS_URL_CHAR(c) (BIT_AT(normal_url_char, (unsigned char)(c)))
#define IS_HOST_CHAR(c) \
	(isalnum((unsigned char)(c)) || (c) == '.' || (c) == '-')
#else
/* Non-strict mode: additionally accept any byte with the high bit set */
#define IS_URL_CHAR(c) \
	(BIT_AT(normal_url_char, (unsigned char)(c)) || ((c)&0x80))
/* Non-strict mode: additionally accept '_' in host names */
#define IS_HOST_CHAR(c)                                          \
	(isalnum((unsigned char)(c)) || (c) == '.' || (c) == '-' || \
	 (c) == '_')
#endif
210 
211 /*
212  * Our URL parser.
213  *
214  * This is designed to be shared by http_parser_execute() for URL validation,
215  * hence it has a state transition + byte-for-byte interface. In addition, it
216  * is meant to be embedded in http_parser_parse_url(), which does the dirty
217  * work of turning state transitions URL components for its API.
218  *
219  * This function should only be invoked with non-space characters. It is
220  * assumed that the caller cares about (and can detect) the transition between
221  * URL and non-URL states by looking for these.
222  */
223 static state_t
parse_url_char(state_t s,const char ch)224 parse_url_char(state_t s, const char ch) {
225 	if (ch == ' ' || ch == '\r' || ch == '\n') {
226 		return (s_dead);
227 	}
228 
229 #if HTTP_PARSER_STRICT
230 	if (ch == '\t' || ch == '\f') {
231 		return (s_dead);
232 	}
233 #endif
234 
235 	switch (s) {
236 	case s_req_spaces_before_url:
237 		/* Proxied requests are followed by scheme of an absolute URI
238 		 * (alpha). All methods except CONNECT are followed by '/' or
239 		 * '*'.
240 		 */
241 
242 		if (ch == '/' || ch == '*') {
243 			return (s_req_path);
244 		}
245 
246 		if (isalpha((unsigned char)ch)) {
247 			return (s_req_schema);
248 		}
249 
250 		break;
251 
252 	case s_req_schema:
253 		if (isalpha((unsigned char)ch)) {
254 			return (s);
255 		}
256 
257 		if (ch == ':') {
258 			return (s_req_schema_slash);
259 		}
260 
261 		break;
262 
263 	case s_req_schema_slash:
264 		if (ch == '/') {
265 			return (s_req_schema_slash_slash);
266 		}
267 
268 		break;
269 
270 	case s_req_schema_slash_slash:
271 		if (ch == '/') {
272 			return (s_req_server_start);
273 		}
274 
275 		break;
276 
277 	case s_req_server_with_at:
278 		if (ch == '@') {
279 			return (s_dead);
280 		}
281 
282 		FALLTHROUGH;
283 	case s_req_server_start:
284 	case s_req_server:
285 		if (ch == '/') {
286 			return (s_req_path);
287 		}
288 
289 		if (ch == '?') {
290 			return (s_req_query_string_start);
291 		}
292 
293 		if (ch == '@') {
294 			return (s_req_server_with_at);
295 		}
296 
297 		if (IS_USERINFO_CHAR(ch) || ch == '[' || ch == ']') {
298 			return (s_req_server);
299 		}
300 
301 		break;
302 
303 	case s_req_path:
304 		if (IS_URL_CHAR(ch)) {
305 			return (s);
306 		}
307 
308 		switch (ch) {
309 		case '?':
310 			return (s_req_query_string_start);
311 
312 		case '#':
313 			return (s_req_fragment_start);
314 		}
315 
316 		break;
317 
318 	case s_req_query_string_start:
319 	case s_req_query_string:
320 		if (IS_URL_CHAR(ch)) {
321 			return (s_req_query_string);
322 		}
323 
324 		switch (ch) {
325 		case '?':
326 			/* allow extra '?' in query string */
327 			return (s_req_query_string);
328 
329 		case '#':
330 			return (s_req_fragment_start);
331 		}
332 
333 		break;
334 
335 	case s_req_fragment_start:
336 		if (IS_URL_CHAR(ch)) {
337 			return (s_req_fragment);
338 		}
339 
340 		switch (ch) {
341 		case '?':
342 			return (s_req_fragment);
343 
344 		case '#':
345 			return (s);
346 		}
347 
348 		break;
349 
350 	case s_req_fragment:
351 		if (IS_URL_CHAR(ch)) {
352 			return (s);
353 		}
354 
355 		switch (ch) {
356 		case '?':
357 		case '#':
358 			return (s);
359 		}
360 
361 		break;
362 
363 	default:
364 		break;
365 	}
366 
367 	/*
368 	 * We should never fall out of the switch above unless there's an
369 	 * error.
370 	 */
371 	return (s_dead);
372 }
373 
374 static host_state_t
http_parse_host_char(host_state_t s,const char ch)375 http_parse_host_char(host_state_t s, const char ch) {
376 	switch (s) {
377 	case s_http_userinfo:
378 	case s_http_userinfo_start:
379 		if (ch == '@') {
380 			return (s_http_host_start);
381 		}
382 
383 		if (IS_USERINFO_CHAR(ch)) {
384 			return (s_http_userinfo);
385 		}
386 		break;
387 
388 	case s_http_host_start:
389 		if (ch == '[') {
390 			return (s_http_host_v6_start);
391 		}
392 
393 		if (IS_HOST_CHAR(ch)) {
394 			return (s_http_host);
395 		}
396 
397 		break;
398 
399 	case s_http_host:
400 		if (IS_HOST_CHAR(ch)) {
401 			return (s_http_host);
402 		}
403 
404 		FALLTHROUGH;
405 	case s_http_host_v6_end:
406 		if (ch == ':') {
407 			return (s_http_host_port_start);
408 		}
409 
410 		break;
411 
412 	case s_http_host_v6:
413 		if (ch == ']') {
414 			return (s_http_host_v6_end);
415 		}
416 
417 		FALLTHROUGH;
418 	case s_http_host_v6_start:
419 		if (isxdigit((unsigned char)ch) || ch == ':' || ch == '.') {
420 			return (s_http_host_v6);
421 		}
422 
423 		if (s == s_http_host_v6 && ch == '%') {
424 			return (s_http_host_v6_zone_start);
425 		}
426 		break;
427 
428 	case s_http_host_v6_zone:
429 		if (ch == ']') {
430 			return (s_http_host_v6_end);
431 		}
432 
433 		FALLTHROUGH;
434 	case s_http_host_v6_zone_start:
435 		/* RFC 6874 Zone ID consists of 1*( unreserved / pct-encoded) */
436 		if (isalnum((unsigned char)ch) || ch == '%' || ch == '.' ||
437 		    ch == '-' || ch == '_' || ch == '~')
438 		{
439 			return (s_http_host_v6_zone);
440 		}
441 		break;
442 
443 	case s_http_host_port:
444 	case s_http_host_port_start:
445 		if (isdigit((unsigned char)ch)) {
446 			return (s_http_host_port);
447 		}
448 
449 		break;
450 
451 	default:
452 		break;
453 	}
454 
455 	return (s_http_host_dead);
456 }
457 
458 static isc_result_t
http_parse_host(const char * buf,isc_url_parser_t * up,int found_at)459 http_parse_host(const char *buf, isc_url_parser_t *up, int found_at) {
460 	host_state_t s;
461 	const char *p = NULL;
462 	size_t buflen = up->field_data[ISC_UF_HOST].off +
463 			up->field_data[ISC_UF_HOST].len;
464 
465 	REQUIRE((up->field_set & (1 << ISC_UF_HOST)) != 0);
466 
467 	up->field_data[ISC_UF_HOST].len = 0;
468 
469 	s = found_at ? s_http_userinfo_start : s_http_host_start;
470 
471 	for (p = buf + up->field_data[ISC_UF_HOST].off; p < buf + buflen; p++) {
472 		host_state_t new_s = http_parse_host_char(s, *p);
473 
474 		if (new_s == s_http_host_dead) {
475 			return (ISC_R_FAILURE);
476 		}
477 
478 		switch (new_s) {
479 		case s_http_host:
480 			if (s != s_http_host) {
481 				up->field_data[ISC_UF_HOST].off =
482 					(uint16_t)(p - buf);
483 			}
484 			up->field_data[ISC_UF_HOST].len++;
485 			break;
486 
487 		case s_http_host_v6:
488 			if (s != s_http_host_v6) {
489 				up->field_data[ISC_UF_HOST].off =
490 					(uint16_t)(p - buf);
491 			}
492 			up->field_data[ISC_UF_HOST].len++;
493 			break;
494 
495 		case s_http_host_v6_zone_start:
496 		case s_http_host_v6_zone:
497 			up->field_data[ISC_UF_HOST].len++;
498 			break;
499 
500 		case s_http_host_port:
501 			if (s != s_http_host_port) {
502 				up->field_data[ISC_UF_PORT].off =
503 					(uint16_t)(p - buf);
504 				up->field_data[ISC_UF_PORT].len = 0;
505 				up->field_set |= (1 << ISC_UF_PORT);
506 			}
507 			up->field_data[ISC_UF_PORT].len++;
508 			break;
509 
510 		case s_http_userinfo:
511 			if (s != s_http_userinfo) {
512 				up->field_data[ISC_UF_USERINFO].off =
513 					(uint16_t)(p - buf);
514 				up->field_data[ISC_UF_USERINFO].len = 0;
515 				up->field_set |= (1 << ISC_UF_USERINFO);
516 			}
517 			up->field_data[ISC_UF_USERINFO].len++;
518 			break;
519 
520 		default:
521 			break;
522 		}
523 
524 		s = new_s;
525 	}
526 
527 	/* Make sure we don't end somewhere unexpected */
528 	switch (s) {
529 	case s_http_host_start:
530 	case s_http_host_v6_start:
531 	case s_http_host_v6:
532 	case s_http_host_v6_zone_start:
533 	case s_http_host_v6_zone:
534 	case s_http_host_port_start:
535 	case s_http_userinfo:
536 	case s_http_userinfo_start:
537 		return (ISC_R_FAILURE);
538 	default:
539 		break;
540 	}
541 
542 	return (ISC_R_SUCCESS);
543 }
544 
545 isc_result_t
isc_url_parse(const char * buf,size_t buflen,bool is_connect,isc_url_parser_t * up)546 isc_url_parse(const char *buf, size_t buflen, bool is_connect,
547 	      isc_url_parser_t *up) {
548 	state_t s;
549 	isc_url_field_t uf, old_uf;
550 	int found_at = 0;
551 	const char *p = NULL;
552 
553 	if (buflen == 0) {
554 		return (ISC_R_FAILURE);
555 	}
556 
557 	up->port = up->field_set = 0;
558 	s = is_connect ? s_req_server_start : s_req_spaces_before_url;
559 	old_uf = ISC_UF_MAX;
560 
561 	for (p = buf; p < buf + buflen; p++) {
562 		s = parse_url_char(s, *p);
563 
564 		/* Figure out the next field that we're operating on */
565 		switch (s) {
566 		case s_dead:
567 			return (ISC_R_FAILURE);
568 
569 		/* Skip delimiters */
570 		case s_req_schema_slash:
571 		case s_req_schema_slash_slash:
572 		case s_req_server_start:
573 		case s_req_query_string_start:
574 		case s_req_fragment_start:
575 			continue;
576 
577 		case s_req_schema:
578 			uf = ISC_UF_SCHEMA;
579 			break;
580 
581 		case s_req_server_with_at:
582 			found_at = 1;
583 			FALLTHROUGH;
584 		case s_req_server:
585 			uf = ISC_UF_HOST;
586 			break;
587 
588 		case s_req_path:
589 			uf = ISC_UF_PATH;
590 			break;
591 
592 		case s_req_query_string:
593 			uf = ISC_UF_QUERY;
594 			break;
595 
596 		case s_req_fragment:
597 			uf = ISC_UF_FRAGMENT;
598 			break;
599 
600 		default:
601 			UNREACHABLE();
602 		}
603 
604 		/* Nothing's changed; soldier on */
605 		if (uf == old_uf) {
606 			up->field_data[uf].len++;
607 			continue;
608 		}
609 
610 		up->field_data[uf].off = (uint16_t)(p - buf);
611 		up->field_data[uf].len = 1;
612 
613 		up->field_set |= (1 << uf);
614 		old_uf = uf;
615 	}
616 
617 	/* host must be present if there is a schema */
618 	/* parsing http:///toto will fail */
619 	if ((up->field_set & (1 << ISC_UF_SCHEMA)) &&
620 	    (up->field_set & (1 << ISC_UF_HOST)) == 0)
621 	{
622 		return (ISC_R_FAILURE);
623 	}
624 
625 	if (up->field_set & (1 << ISC_UF_HOST)) {
626 		isc_result_t result;
627 
628 		result = http_parse_host(buf, up, found_at);
629 		if (result != ISC_R_SUCCESS) {
630 			return (result);
631 		}
632 	}
633 
634 	/* CONNECT requests can only contain "hostname:port" */
635 	if (is_connect &&
636 	    up->field_set != ((1 << ISC_UF_HOST) | (1 << ISC_UF_PORT)))
637 	{
638 		return (ISC_R_FAILURE);
639 	}
640 
641 	if (up->field_set & (1 << ISC_UF_PORT)) {
642 		uint16_t off;
643 		uint16_t len;
644 		const char *pp = NULL;
645 		const char *end = NULL;
646 		unsigned long v;
647 
648 		off = up->field_data[ISC_UF_PORT].off;
649 		len = up->field_data[ISC_UF_PORT].len;
650 		end = buf + off + len;
651 
652 		/*
653 		 * NOTE: The characters are already validated and are in the
654 		 * [0-9] range
655 		 */
656 		INSIST(off + len <= buflen);
657 
658 		v = 0;
659 		for (pp = buf + off; pp < end; pp++) {
660 			v *= 10;
661 			v += *pp - '0';
662 
663 			/* Ports have a max value of 2^16 */
664 			if (v > 0xffff) {
665 				return (ISC_R_RANGE);
666 			}
667 		}
668 
669 		up->port = (uint16_t)v;
670 	}
671 
672 	return (ISC_R_SUCCESS);
673 }
674