1 /* $NetBSD: url.c,v 1.6 2025/01/26 16:25:39 christos Exp $ */ 2 3 /* 4 * Copyright (C) Internet Systems Consortium, Inc. ("ISC") 5 * 6 * SPDX-License-Identifier: MPL-2.0 and MIT 7 * 8 * This Source Code Form is subject to the terms of the Mozilla Public 9 * License, v. 2.0. If a copy of the MPL was not distributed with this 10 * file, you can obtain one at https://mozilla.org/MPL/2.0/. 11 * 12 * See the COPYRIGHT file distributed with this work for additional 13 * information regarding copyright ownership. 14 */ 15 16 /* 17 * Copyright Joyent, Inc. and other Node contributors. All rights reserved. 18 * 19 * Permission is hereby granted, free of charge, to any person obtaining a copy 20 * of this software and associated documentation files (the "Software"), to 21 * deal in the Software without restriction, including without limitation the 22 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 23 * sell copies of the Software, and to permit persons to whom the Software is 24 * furnished to do so, subject to the following conditions: 25 * 26 * The above copyright notice and this permission notice shall be included in 27 * all copies or substantial portions of the Software. 28 * 29 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 30 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 31 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 32 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 33 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 34 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 35 * IN THE SOFTWARE. 36 */ 37 38 #include <ctype.h> 39 #include <limits.h> 40 #include <stddef.h> 41 #include <string.h> 42 43 #include <isc/url.h> 44 #include <isc/util.h> 45 46 #ifndef BIT_AT 47 #define BIT_AT(a, i) \ 48 (!!((unsigned int)(a)[(unsigned int)(i) >> 3] & \ 49 (1 << ((unsigned int)(i) & 7)))) 50 #endif 51 52 #if HTTP_PARSER_STRICT 53 #define T(v) 0 54 #else 55 #define T(v) v 56 #endif 57 58 static const uint8_t normal_url_char[32] = { 59 /* 0 nul 1 soh 2 stx 3 etx 4 eot 5 enq 6 ack 7 bel */ 60 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0, 61 /* 8 bs 9 ht 10 nl 11 vt 12 np 13 cr 14 so 15 si */ 62 0 | T(2) | 0 | 0 | T(16) | 0 | 0 | 0, 63 /* 16 dle 17 dc1 18 dc2 19 dc3 20 dc4 21 nak 22 syn 23 etb */ 64 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0, 65 /* 24 can 25 em 26 sub 27 esc 28 fs 29 gs 30 rs 31 us */ 66 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0, 67 /* 32 sp 33 ! 34 " 35 # 36 $ 37 % 38 & 39 ' */ 68 0 | 2 | 4 | 0 | 16 | 32 | 64 | 128, 69 /* 40 ( 41 ) 42 * 43 + 44 , 45 - 46 . 47 / */ 70 1 | 2 | 4 | 8 | 16 | 32 | 64 | 128, 71 /* 48 0 49 1 50 2 51 3 52 4 53 5 54 6 55 7 */ 72 1 | 2 | 4 | 8 | 16 | 32 | 64 | 128, 73 /* 56 8 57 9 58 : 59 ; 60 < 61 = 62 > 63 ? */ 74 1 | 2 | 4 | 8 | 16 | 32 | 64 | 0, 75 /* 64 @ 65 A 66 B 67 C 68 D 69 E 70 F 71 G */ 76 1 | 2 | 4 | 8 | 16 | 32 | 64 | 128, 77 /* 72 H 73 I 74 J 75 K 76 L 77 M 78 N 79 O */ 78 1 | 2 | 4 | 8 | 16 | 32 | 64 | 128, 79 /* 80 P 81 Q 82 R 83 S 84 T 85 U 86 V 87 W */ 80 1 | 2 | 4 | 8 | 16 | 32 | 64 | 128, 81 /* 88 X 89 Y 90 Z 91 [ 92 \ 93 ] 94 ^ 95 _ */ 82 1 | 2 | 4 | 8 | 16 | 32 | 64 | 128, 83 /* 96 ` 97 a 98 b 99 c 100 d 101 e 102 f 103 g */ 84 1 | 2 | 4 | 8 | 16 | 32 | 64 | 128, 85 /* 104 h 105 i 106 j 107 k 108 l 109 m 110 n 111 o */ 86 1 | 2 | 4 | 8 | 16 | 32 | 64 | 128, 87 /* 112 p 113 q 114 r 115 s 116 t 117 u 118 v 119 w */ 88 1 | 2 | 4 | 8 | 16 | 32 | 64 | 128, 89 /* 120 x 121 y 122 z 123 { 124 | 125 } 126 ~ 127 del */ 90 1 | 2 | 4 | 8 | 16 | 32 | 64 | 0, 91 }; 92 93 #undef T 94 95 typedef enum { 96 s_dead = 1, /* important that this is > 0 */ 97 98 s_start_req_or_res, 99 s_res_or_resp_H, 100 s_start_res, 101 s_res_H, 102 s_res_HT, 103 s_res_HTT, 104 s_res_HTTP, 105 s_res_http_major, 106 s_res_http_dot, 107 s_res_http_minor, 108 s_res_http_end, 109 s_res_first_status_code, 110 s_res_status_code, 111 s_res_status_start, 112 s_res_status, 113 s_res_line_almost_done, 114 115 s_start_req, 116 117 s_req_method, 118 s_req_spaces_before_url, 119 s_req_schema, 120 s_req_schema_slash, 121 s_req_schema_slash_slash, 122 s_req_server_start, 123 s_req_server, 124 s_req_server_with_at, 125 s_req_path, 126 s_req_query_string_start, 127 s_req_query_string, 128 s_req_fragment_start, 129 s_req_fragment, 130 s_req_http_start, 131 s_req_http_H, 132 s_req_http_HT, 133 s_req_http_HTT, 134 s_req_http_HTTP, 135 s_req_http_I, 136 s_req_http_IC, 137 s_req_http_major, 138 s_req_http_dot, 139 s_req_http_minor, 140 s_req_http_end, 141 s_req_line_almost_done, 142 143 s_header_field_start, 144 s_header_field, 145 s_header_value_discard_ws, 146 s_header_value_discard_ws_almost_done, 147 s_header_value_discard_lws, 148 s_header_value_start, 149 s_header_value, 150 s_header_value_lws, 151 152 s_header_almost_done, 153 154 s_chunk_size_start, 155 s_chunk_size, 156 s_chunk_parameters, 157 s_chunk_size_almost_done, 158 159 s_headers_almost_done, 160 s_headers_done, 161 162 /* 163 * Important: 's_headers_done' must be the last 'header' state. All 164 * states beyond this must be 'body' states. It is used for overflow 165 * checking. See the PARSING_HEADER() macro. 166 */ 167 168 s_chunk_data, 169 s_chunk_data_almost_done, 170 s_chunk_data_done, 171 172 s_body_identity, 173 s_body_identity_eof, 174 175 s_message_done 176 } state_t; 177 178 typedef enum { 179 s_http_host_dead = 1, 180 s_http_userinfo_start, 181 s_http_userinfo, 182 s_http_host_start, 183 s_http_host_v6_start, 184 s_http_host, 185 s_http_host_v6, 186 s_http_host_v6_end, 187 s_http_host_v6_zone_start, 188 s_http_host_v6_zone, 189 s_http_host_port_start, 190 s_http_host_port 191 } host_state_t; 192 193 /* Macros for character classes; depends on strict-mode */ 194 #define IS_MARK(c) \ 195 ((c) == '-' || (c) == '_' || (c) == '.' || (c) == '!' || (c) == '~' || \ 196 (c) == '*' || (c) == '\'' || (c) == '(' || (c) == ')') 197 #define IS_USERINFO_CHAR(c) \ 198 (isalnum((unsigned char)c) || IS_MARK(c) || (c) == '%' || \ 199 (c) == ';' || (c) == ':' || (c) == '&' || (c) == '=' || (c) == '+' || \ 200 (c) == '$' || (c) == ',') 201 202 #if HTTP_PARSER_STRICT 203 #define IS_URL_CHAR(c) (BIT_AT(normal_url_char, (unsigned char)c)) 204 #define IS_HOST_CHAR(c) (isalnum((unsigned char)c) || (c) == '.' || (c) == '-') 205 #else 206 #define IS_URL_CHAR(c) \ 207 (BIT_AT(normal_url_char, (unsigned char)c) || ((c) & 0x80)) 208 #define IS_HOST_CHAR(c) \ 209 (isalnum((unsigned char)c) || (c) == '.' || (c) == '-' || (c) == '_') 210 #endif 211 212 /* 213 * Our URL parser. 214 * 215 * This is designed to be shared by http_parser_execute() for URL validation, 216 * hence it has a state transition + byte-for-byte interface. In addition, it 217 * is meant to be embedded in http_parser_parse_url(), which does the dirty 218 * work of turning state transitions URL components for its API. 219 * 220 * This function should only be invoked with non-space characters. It is 221 * assumed that the caller cares about (and can detect) the transition between 222 * URL and non-URL states by looking for these. 223 */ 224 static state_t 225 parse_url_char(state_t s, const char ch) { 226 if (ch == ' ' || ch == '\r' || ch == '\n') { 227 return s_dead; 228 } 229 230 #if HTTP_PARSER_STRICT 231 if (ch == '\t' || ch == '\f') { 232 return s_dead; 233 } 234 #endif 235 236 switch (s) { 237 case s_req_spaces_before_url: 238 /* Proxied requests are followed by scheme of an absolute URI 239 * (alpha). All methods except CONNECT are followed by '/' or 240 * '*'. 241 */ 242 243 if (ch == '/' || ch == '*') { 244 return s_req_path; 245 } 246 247 if (isalpha((unsigned char)ch)) { 248 return s_req_schema; 249 } 250 251 break; 252 253 case s_req_schema: 254 if (isalpha((unsigned char)ch)) { 255 return s; 256 } 257 258 if (ch == ':') { 259 return s_req_schema_slash; 260 } 261 262 break; 263 264 case s_req_schema_slash: 265 if (ch == '/') { 266 return s_req_schema_slash_slash; 267 } 268 269 break; 270 271 case s_req_schema_slash_slash: 272 if (ch == '/') { 273 return s_req_server_start; 274 } 275 276 break; 277 278 case s_req_server_with_at: 279 if (ch == '@') { 280 return s_dead; 281 } 282 283 FALLTHROUGH; 284 case s_req_server_start: 285 case s_req_server: 286 if (ch == '/') { 287 return s_req_path; 288 } 289 290 if (ch == '?') { 291 return s_req_query_string_start; 292 } 293 294 if (ch == '@') { 295 return s_req_server_with_at; 296 } 297 298 if (IS_USERINFO_CHAR(ch) || ch == '[' || ch == ']') { 299 return s_req_server; 300 } 301 302 break; 303 304 case s_req_path: 305 if (IS_URL_CHAR(ch)) { 306 return s; 307 } 308 309 switch (ch) { 310 case '?': 311 return s_req_query_string_start; 312 313 case '#': 314 return s_req_fragment_start; 315 } 316 317 break; 318 319 case s_req_query_string_start: 320 case s_req_query_string: 321 if (IS_URL_CHAR(ch)) { 322 return s_req_query_string; 323 } 324 325 switch (ch) { 326 case '?': 327 /* allow extra '?' in query string */ 328 return s_req_query_string; 329 330 case '#': 331 return s_req_fragment_start; 332 } 333 334 break; 335 336 case s_req_fragment_start: 337 if (IS_URL_CHAR(ch)) { 338 return s_req_fragment; 339 } 340 341 switch (ch) { 342 case '?': 343 return s_req_fragment; 344 345 case '#': 346 return s; 347 } 348 349 break; 350 351 case s_req_fragment: 352 if (IS_URL_CHAR(ch)) { 353 return s; 354 } 355 356 switch (ch) { 357 case '?': 358 case '#': 359 return s; 360 } 361 362 break; 363 364 default: 365 break; 366 } 367 368 /* 369 * We should never fall out of the switch above unless there's an 370 * error. 371 */ 372 return s_dead; 373 } 374 375 static host_state_t 376 http_parse_host_char(host_state_t s, const char ch) { 377 switch (s) { 378 case s_http_userinfo: 379 case s_http_userinfo_start: 380 if (ch == '@') { 381 return s_http_host_start; 382 } 383 384 if (IS_USERINFO_CHAR(ch)) { 385 return s_http_userinfo; 386 } 387 break; 388 389 case s_http_host_start: 390 if (ch == '[') { 391 return s_http_host_v6_start; 392 } 393 394 if (IS_HOST_CHAR(ch)) { 395 return s_http_host; 396 } 397 398 break; 399 400 case s_http_host: 401 if (IS_HOST_CHAR(ch)) { 402 return s_http_host; 403 } 404 405 FALLTHROUGH; 406 case s_http_host_v6_end: 407 if (ch == ':') { 408 return s_http_host_port_start; 409 } 410 411 break; 412 413 case s_http_host_v6: 414 if (ch == ']') { 415 return s_http_host_v6_end; 416 } 417 418 FALLTHROUGH; 419 case s_http_host_v6_start: 420 if (isxdigit((unsigned char)ch) || ch == ':' || ch == '.') { 421 return s_http_host_v6; 422 } 423 424 if (s == s_http_host_v6 && ch == '%') { 425 return s_http_host_v6_zone_start; 426 } 427 break; 428 429 case s_http_host_v6_zone: 430 if (ch == ']') { 431 return s_http_host_v6_end; 432 } 433 434 FALLTHROUGH; 435 case s_http_host_v6_zone_start: 436 /* RFC 6874 Zone ID consists of 1*( unreserved / pct-encoded) */ 437 if (isalnum((unsigned char)ch) || ch == '%' || ch == '.' || 438 ch == '-' || ch == '_' || ch == '~') 439 { 440 return s_http_host_v6_zone; 441 } 442 break; 443 444 case s_http_host_port: 445 case s_http_host_port_start: 446 if (isdigit((unsigned char)ch)) { 447 return s_http_host_port; 448 } 449 450 break; 451 452 default: 453 break; 454 } 455 456 return s_http_host_dead; 457 } 458 459 static isc_result_t 460 http_parse_host(const char *buf, isc_url_parser_t *up, int found_at) { 461 host_state_t s; 462 const char *p = NULL; 463 size_t buflen = up->field_data[ISC_UF_HOST].off + 464 up->field_data[ISC_UF_HOST].len; 465 466 REQUIRE((up->field_set & (1 << ISC_UF_HOST)) != 0); 467 468 up->field_data[ISC_UF_HOST].len = 0; 469 470 s = found_at ? s_http_userinfo_start : s_http_host_start; 471 472 for (p = buf + up->field_data[ISC_UF_HOST].off; p < buf + buflen; p++) { 473 host_state_t new_s = http_parse_host_char(s, *p); 474 475 if (new_s == s_http_host_dead) { 476 return ISC_R_FAILURE; 477 } 478 479 switch (new_s) { 480 case s_http_host: 481 if (s != s_http_host) { 482 up->field_data[ISC_UF_HOST].off = 483 (uint16_t)(p - buf); 484 } 485 up->field_data[ISC_UF_HOST].len++; 486 break; 487 488 case s_http_host_v6: 489 if (s != s_http_host_v6) { 490 up->field_data[ISC_UF_HOST].off = 491 (uint16_t)(p - buf); 492 } 493 up->field_data[ISC_UF_HOST].len++; 494 break; 495 496 case s_http_host_v6_zone_start: 497 case s_http_host_v6_zone: 498 up->field_data[ISC_UF_HOST].len++; 499 break; 500 501 case s_http_host_port: 502 if (s != s_http_host_port) { 503 up->field_data[ISC_UF_PORT].off = 504 (uint16_t)(p - buf); 505 up->field_data[ISC_UF_PORT].len = 0; 506 up->field_set |= (1 << ISC_UF_PORT); 507 } 508 up->field_data[ISC_UF_PORT].len++; 509 break; 510 511 case s_http_userinfo: 512 if (s != s_http_userinfo) { 513 up->field_data[ISC_UF_USERINFO].off = 514 (uint16_t)(p - buf); 515 up->field_data[ISC_UF_USERINFO].len = 0; 516 up->field_set |= (1 << ISC_UF_USERINFO); 517 } 518 up->field_data[ISC_UF_USERINFO].len++; 519 break; 520 521 default: 522 break; 523 } 524 525 s = new_s; 526 } 527 528 /* Make sure we don't end somewhere unexpected */ 529 switch (s) { 530 case s_http_host_start: 531 case s_http_host_v6_start: 532 case s_http_host_v6: 533 case s_http_host_v6_zone_start: 534 case s_http_host_v6_zone: 535 case s_http_host_port_start: 536 case s_http_userinfo: 537 case s_http_userinfo_start: 538 return ISC_R_FAILURE; 539 default: 540 break; 541 } 542 543 return ISC_R_SUCCESS; 544 } 545 546 isc_result_t 547 isc_url_parse(const char *buf, size_t buflen, bool is_connect, 548 isc_url_parser_t *up) { 549 state_t s; 550 isc_url_field_t uf, old_uf; 551 int found_at = 0; 552 const char *p = NULL; 553 554 if (buflen == 0) { 555 return ISC_R_FAILURE; 556 } 557 558 up->port = up->field_set = 0; 559 s = is_connect ? s_req_server_start : s_req_spaces_before_url; 560 old_uf = ISC_UF_MAX; 561 562 for (p = buf; p < buf + buflen; p++) { 563 s = parse_url_char(s, *p); 564 565 /* Figure out the next field that we're operating on */ 566 switch (s) { 567 case s_dead: 568 return ISC_R_FAILURE; 569 570 /* Skip delimiters */ 571 case s_req_schema_slash: 572 case s_req_schema_slash_slash: 573 case s_req_server_start: 574 case s_req_query_string_start: 575 case s_req_fragment_start: 576 continue; 577 578 case s_req_schema: 579 uf = ISC_UF_SCHEMA; 580 break; 581 582 case s_req_server_with_at: 583 found_at = 1; 584 FALLTHROUGH; 585 case s_req_server: 586 uf = ISC_UF_HOST; 587 break; 588 589 case s_req_path: 590 uf = ISC_UF_PATH; 591 break; 592 593 case s_req_query_string: 594 uf = ISC_UF_QUERY; 595 break; 596 597 case s_req_fragment: 598 uf = ISC_UF_FRAGMENT; 599 break; 600 601 default: 602 UNREACHABLE(); 603 } 604 605 /* Nothing's changed; soldier on */ 606 if (uf == old_uf) { 607 up->field_data[uf].len++; 608 continue; 609 } 610 611 up->field_data[uf].off = (uint16_t)(p - buf); 612 up->field_data[uf].len = 1; 613 614 up->field_set |= (1 << uf); 615 old_uf = uf; 616 } 617 618 /* host must be present if there is a schema */ 619 /* parsing http:///toto will fail */ 620 if ((up->field_set & (1 << ISC_UF_SCHEMA)) && 621 (up->field_set & (1 << ISC_UF_HOST)) == 0) 622 { 623 return ISC_R_FAILURE; 624 } 625 626 if (up->field_set & (1 << ISC_UF_HOST)) { 627 isc_result_t result; 628 629 result = http_parse_host(buf, up, found_at); 630 if (result != ISC_R_SUCCESS) { 631 return result; 632 } 633 } 634 635 /* CONNECT requests can only contain "hostname:port" */ 636 if (is_connect && 637 up->field_set != ((1 << ISC_UF_HOST) | (1 << ISC_UF_PORT))) 638 { 639 return ISC_R_FAILURE; 640 } 641 642 if (up->field_set & (1 << ISC_UF_PORT)) { 643 uint16_t off; 644 uint16_t len; 645 const char *pp = NULL; 646 const char *end = NULL; 647 unsigned long v; 648 649 off = up->field_data[ISC_UF_PORT].off; 650 len = up->field_data[ISC_UF_PORT].len; 651 end = buf + off + len; 652 653 /* 654 * NOTE: The characters are already validated and are in the 655 * [0-9] range 656 */ 657 INSIST(off + len <= buflen); 658 659 v = 0; 660 for (pp = buf + off; pp < end; pp++) { 661 v *= 10; 662 v += *pp - '0'; 663 664 /* Ports have a max value of 2^16 */ 665 if (v > 0xffff) { 666 return ISC_R_RANGE; 667 } 668 } 669 670 up->port = (uint16_t)v; 671 } 672 673 return ISC_R_SUCCESS; 674 } 675