xref: /netbsd-src/external/mpl/bind/dist/lib/isc/url.c (revision bcda20f65a8566e103791ec395f7f499ef322704)
1 /*	$NetBSD: url.c,v 1.6 2025/01/26 16:25:39 christos Exp $	*/
2 
3 /*
4  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
5  *
6  * SPDX-License-Identifier: MPL-2.0 and MIT
7  *
8  * This Source Code Form is subject to the terms of the Mozilla Public
9  * License, v. 2.0. If a copy of the MPL was not distributed with this
10  * file, you can obtain one at https://mozilla.org/MPL/2.0/.
11  *
12  * See the COPYRIGHT file distributed with this work for additional
13  * information regarding copyright ownership.
14  */
15 
16 /*
17  * Copyright Joyent, Inc. and other Node contributors. All rights reserved.
18  *
19  * Permission is hereby granted, free of charge, to any person obtaining a copy
20  * of this software and associated documentation files (the "Software"), to
21  * deal in the Software without restriction, including without limitation the
22  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
23  * sell copies of the Software, and to permit persons to whom the Software is
24  * furnished to do so, subject to the following conditions:
25  *
26  * The above copyright notice and this permission notice shall be included in
27  * all copies or substantial portions of the Software.
28  *
29  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
30  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
31  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
32  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
33  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
34  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
35  * IN THE SOFTWARE.
36  */
37 
#include <ctype.h>
#include <limits.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <isc/url.h>
#include <isc/util.h>
45 
#ifndef BIT_AT
/*
 * Test bit number 'i' in the byte-array bitmap 'a'.  Bit 'i' lives in byte
 * i / 8 at position i % 8, counting from the least significant bit.
 * Evaluates to 1 when the bit is set and 0 otherwise.  'i' is evaluated
 * twice, so it must not have side effects.
 */
#define BIT_AT(a, i)                                       \
	(((((unsigned int)(a)[(unsigned int)(i) >> 3]) >>  \
	   ((unsigned int)(i) & 7)) & 1U) != 0)
#endif
51 
/*
 * URL_LAX is the flag used for the two characters (horizontal tab and form
 * feed) that are only accepted as URL characters when strict parsing is
 * disabled.
 */
#if HTTP_PARSER_STRICT
#define URL_LAX 0
#else
#define URL_LAX 1
#endif

/*
 * Pack eight 0/1 flags, one per consecutive character code, into one byte
 * of the bitmap; the first flag lands in the least significant bit.
 */
#define URL_CHAR_ROW(c0, c1, c2, c3, c4, c5, c6, c7)                      \
	(uint8_t)(((c0) << 0) | ((c1) << 1) | ((c2) << 2) | ((c3) << 3) | \
		  ((c4) << 4) | ((c5) << 5) | ((c6) << 6) | ((c7) << 7))

/*
 * Bitmap, indexed by character code and read with BIT_AT(), of the
 * characters that may appear unescaped in a URL.  Only codes 0..127 are
 * listed; the remaining 16 bytes of the array are implicitly zero.
 */
static const uint8_t normal_url_char[32] = {
	/*   0 nul  1 soh  2 stx  3 etx  4 eot  5 enq  6 ack  7 bel  */
	URL_CHAR_ROW(0, 0, 0, 0, 0, 0, 0, 0),
	/*   8 bs   9 ht  10 nl  11 vt  12 np  13 cr  14 so  15 si */
	URL_CHAR_ROW(0, URL_LAX, 0, 0, URL_LAX, 0, 0, 0),
	/*  16 dle 17 dc1 18 dc2 19 dc3 20 dc4 21 nak 22 syn 23 etb */
	URL_CHAR_ROW(0, 0, 0, 0, 0, 0, 0, 0),
	/*  24 can 25 em  26 sub 27 esc 28 fs  29 gs  30 rs  31 us */
	URL_CHAR_ROW(0, 0, 0, 0, 0, 0, 0, 0),
	/*  32 sp  33  !  34  "  35  #  36  $  37  %  38  &  39  ' */
	URL_CHAR_ROW(0, 1, 1, 0, 1, 1, 1, 1),
	/*  40  (  41  )  42  *  43  +  44  ,  45  -  46  .  47  / */
	URL_CHAR_ROW(1, 1, 1, 1, 1, 1, 1, 1),
	/*  48  0  49  1  50  2  51  3  52  4  53  5  54  6  55  7 */
	URL_CHAR_ROW(1, 1, 1, 1, 1, 1, 1, 1),
	/*  56  8  57  9  58  :  59  ;  60  <  61  =  62  >  63  ?  */
	URL_CHAR_ROW(1, 1, 1, 1, 1, 1, 1, 0),
	/*  64  @  65  A  66  B  67  C  68  D  69  E  70  F  71  G */
	URL_CHAR_ROW(1, 1, 1, 1, 1, 1, 1, 1),
	/*  72  H  73  I  74  J  75  K  76  L  77  M  78  N  79  O */
	URL_CHAR_ROW(1, 1, 1, 1, 1, 1, 1, 1),
	/*  80  P  81  Q  82  R  83  S  84  T  85  U  86  V  87  W */
	URL_CHAR_ROW(1, 1, 1, 1, 1, 1, 1, 1),
	/*  88  X  89  Y  90  Z  91  [  92  \  93  ]  94  ^  95  _ */
	URL_CHAR_ROW(1, 1, 1, 1, 1, 1, 1, 1),
	/*  96  `  97  a  98  b  99  c 100  d 101  e 102  f 103  g */
	URL_CHAR_ROW(1, 1, 1, 1, 1, 1, 1, 1),
	/* 104  h 105  i 106  j 107  k 108  l 109  m 110  n 111  o */
	URL_CHAR_ROW(1, 1, 1, 1, 1, 1, 1, 1),
	/* 112  p 113  q 114  r 115  s 116  t 117  u 118  v 119  w */
	URL_CHAR_ROW(1, 1, 1, 1, 1, 1, 1, 1),
	/* 120  x 121  y 122  z 123  { 124  | 125  } 126  ~ 127 del */
	URL_CHAR_ROW(1, 1, 1, 1, 1, 1, 1, 0),
};

#undef URL_CHAR_ROW
#undef URL_LAX
94 
/*
 * Parser states.
 *
 * Values are assigned implicitly from the declaration order, starting at 1,
 * so enumerators must not be reordered, inserted or removed casually.
 *
 * Of these, the URL state machine visible in this file (parse_url_char()
 * and isc_url_parse()) only references s_dead and the URL-related request
 * states s_req_spaces_before_url .. s_req_fragment.
 * NOTE(review): the remaining response/header/chunk/body states look like
 * they were carried over from the HTTP parser this code derives from --
 * confirm before relying on or removing them.
 */
typedef enum {
	s_dead = 1, /* important that this is > 0 */

	/* Response status-line states (not referenced by the URL parser). */
	s_start_req_or_res,
	s_res_or_resp_H,
	s_start_res,
	s_res_H,
	s_res_HT,
	s_res_HTT,
	s_res_HTTP,
	s_res_http_major,
	s_res_http_dot,
	s_res_http_minor,
	s_res_http_end,
	s_res_first_status_code,
	s_res_status_code,
	s_res_status_start,
	s_res_status,
	s_res_line_almost_done,

	s_start_req,

	s_req_method,
	/*
	 * URL states.  s_req_spaces_before_url is the initial state for a
	 * normal URL and s_req_server_start the initial state for a CONNECT
	 * target (see isc_url_parse()).
	 */
	s_req_spaces_before_url,
	s_req_schema,		  /* inside the scheme */
	s_req_schema_slash,	  /* consumed ':' after the scheme */
	s_req_schema_slash_slash, /* consumed the first '/' after ':' */
	s_req_server_start,	  /* consumed the second '/' */
	s_req_server,		  /* inside userinfo/host/port */
	s_req_server_with_at,	  /* consumed the '@' ending userinfo */
	s_req_path,
	s_req_query_string_start, /* consumed '?' */
	s_req_query_string,
	s_req_fragment_start, /* consumed '#' */
	s_req_fragment,
	/* Request-line tail states (not referenced by the URL parser). */
	s_req_http_start,
	s_req_http_H,
	s_req_http_HT,
	s_req_http_HTT,
	s_req_http_HTTP,
	s_req_http_I,
	s_req_http_IC,
	s_req_http_major,
	s_req_http_dot,
	s_req_http_minor,
	s_req_http_end,
	s_req_line_almost_done,

	s_header_field_start,
	s_header_field,
	s_header_value_discard_ws,
	s_header_value_discard_ws_almost_done,
	s_header_value_discard_lws,
	s_header_value_start,
	s_header_value,
	s_header_value_lws,

	s_header_almost_done,

	s_chunk_size_start,
	s_chunk_size,
	s_chunk_parameters,
	s_chunk_size_almost_done,

	s_headers_almost_done,
	s_headers_done,

	/*
	 * Important: 's_headers_done' must be the last 'header' state. All
	 * states beyond this must be 'body' states. It is used for overflow
	 * checking. See the PARSING_HEADER() macro.
	 */

	s_chunk_data,
	s_chunk_data_almost_done,
	s_chunk_data_done,

	s_body_identity,
	s_body_identity_eof,

	s_message_done
} state_t;
177 
/*
 * States of the authority ("userinfo@host:port") sub-parser driven by
 * http_parse_host() and stepped by http_parse_host_char().  Values follow
 * declaration order, starting at 1.
 */
typedef enum {
	s_http_host_dead = 1,	   /* invalid character for the current state */
	s_http_userinfo_start,	   /* initial state when an '@' was seen */
	s_http_userinfo,	   /* inside userinfo */
	s_http_host_start,	   /* initial state, or just after '@' */
	s_http_host_v6_start,	   /* consumed '[' */
	s_http_host,		   /* inside a name / IPv4 host */
	s_http_host_v6,		   /* inside a bracketed IPv6 literal */
	s_http_host_v6_end,	   /* consumed the closing ']' */
	s_http_host_v6_zone_start, /* consumed '%' inside an IPv6 literal */
	s_http_host_v6_zone,	   /* inside the IPv6 zone identifier */
	s_http_host_port_start,	   /* consumed ':' after the host */
	s_http_host_port	   /* inside the port digits */
} host_state_t;
192 
/*
 * Character-class predicates used by the URL and host state machines.
 *
 * Every macro evaluates its argument more than once, so the argument must
 * be free of side effects.  The argument is fully parenthesised wherever it
 * is used (including inside the unsigned char casts required by the
 * <ctype.h> functions), so an expression argument is classified as a whole.
 *
 * IS_MARK(c)          - RFC 2396 "mark" punctuation: - _ . ! ~ * ' ( )
 * IS_USERINFO_CHAR(c) - characters accepted in the userinfo part (and, by
 *                       parse_url_char(), anywhere in the authority).
 * IS_URL_CHAR(c)      - characters flagged in normal_url_char; when strict
 *                       mode is off, any byte with the high bit set is
 *                       accepted as well.
 * IS_HOST_CHAR(c)     - alphanumerics, '.' and '-'; when strict mode is
 *                       off, '_' is accepted as well.
 */
#define IS_MARK(c)                                                             \
	((c) == '-' || (c) == '_' || (c) == '.' || (c) == '!' || (c) == '~' || \
	 (c) == '*' || (c) == '\'' || (c) == '(' || (c) == ')')
#define IS_USERINFO_CHAR(c)                                                    \
	(isalnum((unsigned char)(c)) || IS_MARK(c) || (c) == '%' ||            \
	 (c) == ';' || (c) == ':' || (c) == '&' || (c) == '=' || (c) == '+' || \
	 (c) == '$' || (c) == ',')

#if HTTP_PARSER_STRICT
#define IS_URL_CHAR(c) (BIT_AT(normal_url_char, (unsigned char)(c)))
#define IS_HOST_CHAR(c) \
	(isalnum((unsigned char)(c)) || (c) == '.' || (c) == '-')
#else
#define IS_URL_CHAR(c) \
	(BIT_AT(normal_url_char, (unsigned char)(c)) || ((c) & 0x80))
#define IS_HOST_CHAR(c)                                           \
	(isalnum((unsigned char)(c)) || (c) == '.' || (c) == '-' || \
	 (c) == '_')
#endif
211 
212 /*
213  * Our URL parser.
214  *
215  * This is designed to be shared by http_parser_execute() for URL validation,
216  * hence it has a state transition + byte-for-byte interface. In addition, it
217  * is meant to be embedded in http_parser_parse_url(), which does the dirty
218  * work of turning state transitions URL components for its API.
219  *
220  * This function should only be invoked with non-space characters. It is
221  * assumed that the caller cares about (and can detect) the transition between
222  * URL and non-URL states by looking for these.
223  */
224 static state_t
225 parse_url_char(state_t s, const char ch) {
226 	if (ch == ' ' || ch == '\r' || ch == '\n') {
227 		return s_dead;
228 	}
229 
230 #if HTTP_PARSER_STRICT
231 	if (ch == '\t' || ch == '\f') {
232 		return s_dead;
233 	}
234 #endif
235 
236 	switch (s) {
237 	case s_req_spaces_before_url:
238 		/* Proxied requests are followed by scheme of an absolute URI
239 		 * (alpha). All methods except CONNECT are followed by '/' or
240 		 * '*'.
241 		 */
242 
243 		if (ch == '/' || ch == '*') {
244 			return s_req_path;
245 		}
246 
247 		if (isalpha((unsigned char)ch)) {
248 			return s_req_schema;
249 		}
250 
251 		break;
252 
253 	case s_req_schema:
254 		if (isalpha((unsigned char)ch)) {
255 			return s;
256 		}
257 
258 		if (ch == ':') {
259 			return s_req_schema_slash;
260 		}
261 
262 		break;
263 
264 	case s_req_schema_slash:
265 		if (ch == '/') {
266 			return s_req_schema_slash_slash;
267 		}
268 
269 		break;
270 
271 	case s_req_schema_slash_slash:
272 		if (ch == '/') {
273 			return s_req_server_start;
274 		}
275 
276 		break;
277 
278 	case s_req_server_with_at:
279 		if (ch == '@') {
280 			return s_dead;
281 		}
282 
283 		FALLTHROUGH;
284 	case s_req_server_start:
285 	case s_req_server:
286 		if (ch == '/') {
287 			return s_req_path;
288 		}
289 
290 		if (ch == '?') {
291 			return s_req_query_string_start;
292 		}
293 
294 		if (ch == '@') {
295 			return s_req_server_with_at;
296 		}
297 
298 		if (IS_USERINFO_CHAR(ch) || ch == '[' || ch == ']') {
299 			return s_req_server;
300 		}
301 
302 		break;
303 
304 	case s_req_path:
305 		if (IS_URL_CHAR(ch)) {
306 			return s;
307 		}
308 
309 		switch (ch) {
310 		case '?':
311 			return s_req_query_string_start;
312 
313 		case '#':
314 			return s_req_fragment_start;
315 		}
316 
317 		break;
318 
319 	case s_req_query_string_start:
320 	case s_req_query_string:
321 		if (IS_URL_CHAR(ch)) {
322 			return s_req_query_string;
323 		}
324 
325 		switch (ch) {
326 		case '?':
327 			/* allow extra '?' in query string */
328 			return s_req_query_string;
329 
330 		case '#':
331 			return s_req_fragment_start;
332 		}
333 
334 		break;
335 
336 	case s_req_fragment_start:
337 		if (IS_URL_CHAR(ch)) {
338 			return s_req_fragment;
339 		}
340 
341 		switch (ch) {
342 		case '?':
343 			return s_req_fragment;
344 
345 		case '#':
346 			return s;
347 		}
348 
349 		break;
350 
351 	case s_req_fragment:
352 		if (IS_URL_CHAR(ch)) {
353 			return s;
354 		}
355 
356 		switch (ch) {
357 		case '?':
358 		case '#':
359 			return s;
360 		}
361 
362 		break;
363 
364 	default:
365 		break;
366 	}
367 
368 	/*
369 	 * We should never fall out of the switch above unless there's an
370 	 * error.
371 	 */
372 	return s_dead;
373 }
374 
375 static host_state_t
376 http_parse_host_char(host_state_t s, const char ch) {
377 	switch (s) {
378 	case s_http_userinfo:
379 	case s_http_userinfo_start:
380 		if (ch == '@') {
381 			return s_http_host_start;
382 		}
383 
384 		if (IS_USERINFO_CHAR(ch)) {
385 			return s_http_userinfo;
386 		}
387 		break;
388 
389 	case s_http_host_start:
390 		if (ch == '[') {
391 			return s_http_host_v6_start;
392 		}
393 
394 		if (IS_HOST_CHAR(ch)) {
395 			return s_http_host;
396 		}
397 
398 		break;
399 
400 	case s_http_host:
401 		if (IS_HOST_CHAR(ch)) {
402 			return s_http_host;
403 		}
404 
405 		FALLTHROUGH;
406 	case s_http_host_v6_end:
407 		if (ch == ':') {
408 			return s_http_host_port_start;
409 		}
410 
411 		break;
412 
413 	case s_http_host_v6:
414 		if (ch == ']') {
415 			return s_http_host_v6_end;
416 		}
417 
418 		FALLTHROUGH;
419 	case s_http_host_v6_start:
420 		if (isxdigit((unsigned char)ch) || ch == ':' || ch == '.') {
421 			return s_http_host_v6;
422 		}
423 
424 		if (s == s_http_host_v6 && ch == '%') {
425 			return s_http_host_v6_zone_start;
426 		}
427 		break;
428 
429 	case s_http_host_v6_zone:
430 		if (ch == ']') {
431 			return s_http_host_v6_end;
432 		}
433 
434 		FALLTHROUGH;
435 	case s_http_host_v6_zone_start:
436 		/* RFC 6874 Zone ID consists of 1*( unreserved / pct-encoded) */
437 		if (isalnum((unsigned char)ch) || ch == '%' || ch == '.' ||
438 		    ch == '-' || ch == '_' || ch == '~')
439 		{
440 			return s_http_host_v6_zone;
441 		}
442 		break;
443 
444 	case s_http_host_port:
445 	case s_http_host_port_start:
446 		if (isdigit((unsigned char)ch)) {
447 			return s_http_host_port;
448 		}
449 
450 		break;
451 
452 	default:
453 		break;
454 	}
455 
456 	return s_http_host_dead;
457 }
458 
459 static isc_result_t
460 http_parse_host(const char *buf, isc_url_parser_t *up, int found_at) {
461 	host_state_t s;
462 	const char *p = NULL;
463 	size_t buflen = up->field_data[ISC_UF_HOST].off +
464 			up->field_data[ISC_UF_HOST].len;
465 
466 	REQUIRE((up->field_set & (1 << ISC_UF_HOST)) != 0);
467 
468 	up->field_data[ISC_UF_HOST].len = 0;
469 
470 	s = found_at ? s_http_userinfo_start : s_http_host_start;
471 
472 	for (p = buf + up->field_data[ISC_UF_HOST].off; p < buf + buflen; p++) {
473 		host_state_t new_s = http_parse_host_char(s, *p);
474 
475 		if (new_s == s_http_host_dead) {
476 			return ISC_R_FAILURE;
477 		}
478 
479 		switch (new_s) {
480 		case s_http_host:
481 			if (s != s_http_host) {
482 				up->field_data[ISC_UF_HOST].off =
483 					(uint16_t)(p - buf);
484 			}
485 			up->field_data[ISC_UF_HOST].len++;
486 			break;
487 
488 		case s_http_host_v6:
489 			if (s != s_http_host_v6) {
490 				up->field_data[ISC_UF_HOST].off =
491 					(uint16_t)(p - buf);
492 			}
493 			up->field_data[ISC_UF_HOST].len++;
494 			break;
495 
496 		case s_http_host_v6_zone_start:
497 		case s_http_host_v6_zone:
498 			up->field_data[ISC_UF_HOST].len++;
499 			break;
500 
501 		case s_http_host_port:
502 			if (s != s_http_host_port) {
503 				up->field_data[ISC_UF_PORT].off =
504 					(uint16_t)(p - buf);
505 				up->field_data[ISC_UF_PORT].len = 0;
506 				up->field_set |= (1 << ISC_UF_PORT);
507 			}
508 			up->field_data[ISC_UF_PORT].len++;
509 			break;
510 
511 		case s_http_userinfo:
512 			if (s != s_http_userinfo) {
513 				up->field_data[ISC_UF_USERINFO].off =
514 					(uint16_t)(p - buf);
515 				up->field_data[ISC_UF_USERINFO].len = 0;
516 				up->field_set |= (1 << ISC_UF_USERINFO);
517 			}
518 			up->field_data[ISC_UF_USERINFO].len++;
519 			break;
520 
521 		default:
522 			break;
523 		}
524 
525 		s = new_s;
526 	}
527 
528 	/* Make sure we don't end somewhere unexpected */
529 	switch (s) {
530 	case s_http_host_start:
531 	case s_http_host_v6_start:
532 	case s_http_host_v6:
533 	case s_http_host_v6_zone_start:
534 	case s_http_host_v6_zone:
535 	case s_http_host_port_start:
536 	case s_http_userinfo:
537 	case s_http_userinfo_start:
538 		return ISC_R_FAILURE;
539 	default:
540 		break;
541 	}
542 
543 	return ISC_R_SUCCESS;
544 }
545 
546 isc_result_t
547 isc_url_parse(const char *buf, size_t buflen, bool is_connect,
548 	      isc_url_parser_t *up) {
549 	state_t s;
550 	isc_url_field_t uf, old_uf;
551 	int found_at = 0;
552 	const char *p = NULL;
553 
554 	if (buflen == 0) {
555 		return ISC_R_FAILURE;
556 	}
557 
558 	up->port = up->field_set = 0;
559 	s = is_connect ? s_req_server_start : s_req_spaces_before_url;
560 	old_uf = ISC_UF_MAX;
561 
562 	for (p = buf; p < buf + buflen; p++) {
563 		s = parse_url_char(s, *p);
564 
565 		/* Figure out the next field that we're operating on */
566 		switch (s) {
567 		case s_dead:
568 			return ISC_R_FAILURE;
569 
570 		/* Skip delimiters */
571 		case s_req_schema_slash:
572 		case s_req_schema_slash_slash:
573 		case s_req_server_start:
574 		case s_req_query_string_start:
575 		case s_req_fragment_start:
576 			continue;
577 
578 		case s_req_schema:
579 			uf = ISC_UF_SCHEMA;
580 			break;
581 
582 		case s_req_server_with_at:
583 			found_at = 1;
584 			FALLTHROUGH;
585 		case s_req_server:
586 			uf = ISC_UF_HOST;
587 			break;
588 
589 		case s_req_path:
590 			uf = ISC_UF_PATH;
591 			break;
592 
593 		case s_req_query_string:
594 			uf = ISC_UF_QUERY;
595 			break;
596 
597 		case s_req_fragment:
598 			uf = ISC_UF_FRAGMENT;
599 			break;
600 
601 		default:
602 			UNREACHABLE();
603 		}
604 
605 		/* Nothing's changed; soldier on */
606 		if (uf == old_uf) {
607 			up->field_data[uf].len++;
608 			continue;
609 		}
610 
611 		up->field_data[uf].off = (uint16_t)(p - buf);
612 		up->field_data[uf].len = 1;
613 
614 		up->field_set |= (1 << uf);
615 		old_uf = uf;
616 	}
617 
618 	/* host must be present if there is a schema */
619 	/* parsing http:///toto will fail */
620 	if ((up->field_set & (1 << ISC_UF_SCHEMA)) &&
621 	    (up->field_set & (1 << ISC_UF_HOST)) == 0)
622 	{
623 		return ISC_R_FAILURE;
624 	}
625 
626 	if (up->field_set & (1 << ISC_UF_HOST)) {
627 		isc_result_t result;
628 
629 		result = http_parse_host(buf, up, found_at);
630 		if (result != ISC_R_SUCCESS) {
631 			return result;
632 		}
633 	}
634 
635 	/* CONNECT requests can only contain "hostname:port" */
636 	if (is_connect &&
637 	    up->field_set != ((1 << ISC_UF_HOST) | (1 << ISC_UF_PORT)))
638 	{
639 		return ISC_R_FAILURE;
640 	}
641 
642 	if (up->field_set & (1 << ISC_UF_PORT)) {
643 		uint16_t off;
644 		uint16_t len;
645 		const char *pp = NULL;
646 		const char *end = NULL;
647 		unsigned long v;
648 
649 		off = up->field_data[ISC_UF_PORT].off;
650 		len = up->field_data[ISC_UF_PORT].len;
651 		end = buf + off + len;
652 
653 		/*
654 		 * NOTE: The characters are already validated and are in the
655 		 * [0-9] range
656 		 */
657 		INSIST(off + len <= buflen);
658 
659 		v = 0;
660 		for (pp = buf + off; pp < end; pp++) {
661 			v *= 10;
662 			v += *pp - '0';
663 
664 			/* Ports have a max value of 2^16 */
665 			if (v > 0xffff) {
666 				return ISC_R_RANGE;
667 			}
668 		}
669 
670 		up->port = (uint16_t)v;
671 	}
672 
673 	return ISC_R_SUCCESS;
674 }
675