1 /* $NetBSD: url.c,v 1.4 2023/01/25 21:43:31 christos Exp $ */ 2 3 /* 4 * Copyright (C) Internet Systems Consortium, Inc. ("ISC") 5 * 6 * SPDX-License-Identifier: MPL-2.0 and MIT 7 * 8 * This Source Code Form is subject to the terms of the Mozilla Public 9 * License, v. 2.0. If a copy of the MPL was not distributed with this 10 * file, you can obtain one at https://mozilla.org/MPL/2.0/. 11 * 12 * See the COPYRIGHT file distributed with this work for additional 13 * information regarding copyright ownership. 14 */ 15 16 /* 17 * Copyright Joyent, Inc. and other Node contributors. All rights reserved. 18 * 19 * Permission is hereby granted, free of charge, to any person obtaining a copy 20 * of this software and associated documentation files (the "Software"), to 21 * deal in the Software without restriction, including without limitation the 22 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 23 * sell copies of the Software, and to permit persons to whom the Software is 24 * furnished to do so, subject to the following conditions: 25 * 26 * The above copyright notice and this permission notice shall be included in 27 * all copies or substantial portions of the Software. 28 * 29 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 30 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 31 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 32 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 33 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 34 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 35 * IN THE SOFTWARE. 36 */ 37 38 #include <ctype.h> 39 #include <limits.h> 40 #include <stddef.h> 41 #include <string.h> 42 43 #include <isc/url.h> 44 #include <isc/util.h> 45 46 #ifndef BIT_AT 47 #define BIT_AT(a, i) \ 48 (!!((unsigned int)(a)[(unsigned int)(i) >> 3] & \ 49 (1 << ((unsigned int)(i)&7)))) 50 #endif 51 52 #if HTTP_PARSER_STRICT 53 #define T(v) 0 54 #else 55 #define T(v) v 56 #endif 57 58 static const uint8_t normal_url_char[32] = { 59 /* 0 nul 1 soh 2 stx 3 etx 4 eot 5 enq 6 ack 7 bel */ 60 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0, 61 /* 8 bs 9 ht 10 nl 11 vt 12 np 13 cr 14 so 15 si */ 62 0 | T(2) | 0 | 0 | T(16) | 0 | 0 | 0, 63 /* 16 dle 17 dc1 18 dc2 19 dc3 20 dc4 21 nak 22 syn 23 etb */ 64 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0, 65 /* 24 can 25 em 26 sub 27 esc 28 fs 29 gs 30 rs 31 us */ 66 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0, 67 /* 32 sp 33 ! 34 " 35 # 36 $ 37 % 38 & 39 ' */ 68 0 | 2 | 4 | 0 | 16 | 32 | 64 | 128, 69 /* 40 ( 41 ) 42 * 43 + 44 , 45 - 46 . 47 / */ 70 1 | 2 | 4 | 8 | 16 | 32 | 64 | 128, 71 /* 48 0 49 1 50 2 51 3 52 4 53 5 54 6 55 7 */ 72 1 | 2 | 4 | 8 | 16 | 32 | 64 | 128, 73 /* 56 8 57 9 58 : 59 ; 60 < 61 = 62 > 63 ? */ 74 1 | 2 | 4 | 8 | 16 | 32 | 64 | 0, 75 /* 64 @ 65 A 66 B 67 C 68 D 69 E 70 F 71 G */ 76 1 | 2 | 4 | 8 | 16 | 32 | 64 | 128, 77 /* 72 H 73 I 74 J 75 K 76 L 77 M 78 N 79 O */ 78 1 | 2 | 4 | 8 | 16 | 32 | 64 | 128, 79 /* 80 P 81 Q 82 R 83 S 84 T 85 U 86 V 87 W */ 80 1 | 2 | 4 | 8 | 16 | 32 | 64 | 128, 81 /* 88 X 89 Y 90 Z 91 [ 92 \ 93 ] 94 ^ 95 _ */ 82 1 | 2 | 4 | 8 | 16 | 32 | 64 | 128, 83 /* 96 ` 97 a 98 b 99 c 100 d 101 e 102 f 103 g */ 84 1 | 2 | 4 | 8 | 16 | 32 | 64 | 128, 85 /* 104 h 105 i 106 j 107 k 108 l 109 m 110 n 111 o */ 86 1 | 2 | 4 | 8 | 16 | 32 | 64 | 128, 87 /* 112 p 113 q 114 r 115 s 116 t 117 u 118 v 119 w */ 88 1 | 2 | 4 | 8 | 16 | 32 | 64 | 128, 89 /* 120 x 121 y 122 z 123 { 124 | 125 } 126 ~ 127 del */ 90 1 | 2 | 4 | 8 | 16 | 32 | 64 | 0, 91 }; 92 93 #undef T 94 95 typedef enum { 96 s_dead = 1, /* important that this is > 0 */ 97 98 s_start_req_or_res, 99 s_res_or_resp_H, 100 s_start_res, 101 s_res_H, 102 s_res_HT, 103 s_res_HTT, 104 s_res_HTTP, 105 s_res_http_major, 106 s_res_http_dot, 107 s_res_http_minor, 108 s_res_http_end, 109 s_res_first_status_code, 110 s_res_status_code, 111 s_res_status_start, 112 s_res_status, 113 s_res_line_almost_done, 114 115 s_start_req, 116 117 s_req_method, 118 s_req_spaces_before_url, 119 s_req_schema, 120 s_req_schema_slash, 121 s_req_schema_slash_slash, 122 s_req_server_start, 123 s_req_server, 124 s_req_server_with_at, 125 s_req_path, 126 s_req_query_string_start, 127 s_req_query_string, 128 s_req_fragment_start, 129 s_req_fragment, 130 s_req_http_start, 131 s_req_http_H, 132 s_req_http_HT, 133 s_req_http_HTT, 134 s_req_http_HTTP, 135 s_req_http_I, 136 s_req_http_IC, 137 s_req_http_major, 138 s_req_http_dot, 139 s_req_http_minor, 140 s_req_http_end, 141 s_req_line_almost_done, 142 143 s_header_field_start, 144 s_header_field, 145 s_header_value_discard_ws, 146 s_header_value_discard_ws_almost_done, 147 s_header_value_discard_lws, 148 s_header_value_start, 149 s_header_value, 150 s_header_value_lws, 151 152 s_header_almost_done, 153 154 s_chunk_size_start, 155 s_chunk_size, 156 s_chunk_parameters, 157 s_chunk_size_almost_done, 158 159 s_headers_almost_done, 160 s_headers_done, 161 162 /* 163 * Important: 's_headers_done' must be the last 'header' state. All 164 * states beyond this must be 'body' states. It is used for overflow 165 * checking. See the PARSING_HEADER() macro. 166 */ 167 168 s_chunk_data, 169 s_chunk_data_almost_done, 170 s_chunk_data_done, 171 172 s_body_identity, 173 s_body_identity_eof, 174 175 s_message_done 176 } state_t; 177 178 typedef enum { 179 s_http_host_dead = 1, 180 s_http_userinfo_start, 181 s_http_userinfo, 182 s_http_host_start, 183 s_http_host_v6_start, 184 s_http_host, 185 s_http_host_v6, 186 s_http_host_v6_end, 187 s_http_host_v6_zone_start, 188 s_http_host_v6_zone, 189 s_http_host_port_start, 190 s_http_host_port 191 } host_state_t; 192 193 /* Macros for character classes; depends on strict-mode */ 194 #define IS_MARK(c) \ 195 ((c) == '-' || (c) == '_' || (c) == '.' || (c) == '!' || (c) == '~' || \ 196 (c) == '*' || (c) == '\'' || (c) == '(' || (c) == ')') 197 #define IS_USERINFO_CHAR(c) \ 198 (isalnum((unsigned char)c) || IS_MARK(c) || (c) == '%' || \ 199 (c) == ';' || (c) == ':' || (c) == '&' || (c) == '=' || (c) == '+' || \ 200 (c) == '$' || (c) == ',') 201 202 #if HTTP_PARSER_STRICT 203 #define IS_URL_CHAR(c) (BIT_AT(normal_url_char, (unsigned char)c)) 204 #define IS_HOST_CHAR(c) (isalnum((unsigned char)c) || (c) == '.' || (c) == '-') 205 #else 206 #define IS_URL_CHAR(c) (BIT_AT(normal_url_char, (unsigned char)c) || ((c)&0x80)) 207 #define IS_HOST_CHAR(c) \ 208 (isalnum((unsigned char)c) || (c) == '.' || (c) == '-' || (c) == '_') 209 #endif 210 211 /* 212 * Our URL parser. 213 * 214 * This is designed to be shared by http_parser_execute() for URL validation, 215 * hence it has a state transition + byte-for-byte interface. In addition, it 216 * is meant to be embedded in http_parser_parse_url(), which does the dirty 217 * work of turning state transitions URL components for its API. 218 * 219 * This function should only be invoked with non-space characters. It is 220 * assumed that the caller cares about (and can detect) the transition between 221 * URL and non-URL states by looking for these. 222 */ 223 static state_t 224 parse_url_char(state_t s, const char ch) { 225 if (ch == ' ' || ch == '\r' || ch == '\n') { 226 return (s_dead); 227 } 228 229 #if HTTP_PARSER_STRICT 230 if (ch == '\t' || ch == '\f') { 231 return (s_dead); 232 } 233 #endif 234 235 switch (s) { 236 case s_req_spaces_before_url: 237 /* Proxied requests are followed by scheme of an absolute URI 238 * (alpha). All methods except CONNECT are followed by '/' or 239 * '*'. 240 */ 241 242 if (ch == '/' || ch == '*') { 243 return (s_req_path); 244 } 245 246 if (isalpha((unsigned char)ch)) { 247 return (s_req_schema); 248 } 249 250 break; 251 252 case s_req_schema: 253 if (isalpha((unsigned char)ch)) { 254 return (s); 255 } 256 257 if (ch == ':') { 258 return (s_req_schema_slash); 259 } 260 261 break; 262 263 case s_req_schema_slash: 264 if (ch == '/') { 265 return (s_req_schema_slash_slash); 266 } 267 268 break; 269 270 case s_req_schema_slash_slash: 271 if (ch == '/') { 272 return (s_req_server_start); 273 } 274 275 break; 276 277 case s_req_server_with_at: 278 if (ch == '@') { 279 return (s_dead); 280 } 281 282 FALLTHROUGH; 283 case s_req_server_start: 284 case s_req_server: 285 if (ch == '/') { 286 return (s_req_path); 287 } 288 289 if (ch == '?') { 290 return (s_req_query_string_start); 291 } 292 293 if (ch == '@') { 294 return (s_req_server_with_at); 295 } 296 297 if (IS_USERINFO_CHAR(ch) || ch == '[' || ch == ']') { 298 return (s_req_server); 299 } 300 301 break; 302 303 case s_req_path: 304 if (IS_URL_CHAR(ch)) { 305 return (s); 306 } 307 308 switch (ch) { 309 case '?': 310 return (s_req_query_string_start); 311 312 case '#': 313 return (s_req_fragment_start); 314 } 315 316 break; 317 318 case s_req_query_string_start: 319 case s_req_query_string: 320 if (IS_URL_CHAR(ch)) { 321 return (s_req_query_string); 322 } 323 324 switch (ch) { 325 case '?': 326 /* allow extra '?' in query string */ 327 return (s_req_query_string); 328 329 case '#': 330 return (s_req_fragment_start); 331 } 332 333 break; 334 335 case s_req_fragment_start: 336 if (IS_URL_CHAR(ch)) { 337 return (s_req_fragment); 338 } 339 340 switch (ch) { 341 case '?': 342 return (s_req_fragment); 343 344 case '#': 345 return (s); 346 } 347 348 break; 349 350 case s_req_fragment: 351 if (IS_URL_CHAR(ch)) { 352 return (s); 353 } 354 355 switch (ch) { 356 case '?': 357 case '#': 358 return (s); 359 } 360 361 break; 362 363 default: 364 break; 365 } 366 367 /* 368 * We should never fall out of the switch above unless there's an 369 * error. 370 */ 371 return (s_dead); 372 } 373 374 static host_state_t 375 http_parse_host_char(host_state_t s, const char ch) { 376 switch (s) { 377 case s_http_userinfo: 378 case s_http_userinfo_start: 379 if (ch == '@') { 380 return (s_http_host_start); 381 } 382 383 if (IS_USERINFO_CHAR(ch)) { 384 return (s_http_userinfo); 385 } 386 break; 387 388 case s_http_host_start: 389 if (ch == '[') { 390 return (s_http_host_v6_start); 391 } 392 393 if (IS_HOST_CHAR(ch)) { 394 return (s_http_host); 395 } 396 397 break; 398 399 case s_http_host: 400 if (IS_HOST_CHAR(ch)) { 401 return (s_http_host); 402 } 403 404 FALLTHROUGH; 405 case s_http_host_v6_end: 406 if (ch == ':') { 407 return (s_http_host_port_start); 408 } 409 410 break; 411 412 case s_http_host_v6: 413 if (ch == ']') { 414 return (s_http_host_v6_end); 415 } 416 417 FALLTHROUGH; 418 case s_http_host_v6_start: 419 if (isxdigit((unsigned char)ch) || ch == ':' || ch == '.') { 420 return (s_http_host_v6); 421 } 422 423 if (s == s_http_host_v6 && ch == '%') { 424 return (s_http_host_v6_zone_start); 425 } 426 break; 427 428 case s_http_host_v6_zone: 429 if (ch == ']') { 430 return (s_http_host_v6_end); 431 } 432 433 FALLTHROUGH; 434 case s_http_host_v6_zone_start: 435 /* RFC 6874 Zone ID consists of 1*( unreserved / pct-encoded) */ 436 if (isalnum((unsigned char)ch) || ch == '%' || ch == '.' || 437 ch == '-' || ch == '_' || ch == '~') 438 { 439 return (s_http_host_v6_zone); 440 } 441 break; 442 443 case s_http_host_port: 444 case s_http_host_port_start: 445 if (isdigit((unsigned char)ch)) { 446 return (s_http_host_port); 447 } 448 449 break; 450 451 default: 452 break; 453 } 454 455 return (s_http_host_dead); 456 } 457 458 static isc_result_t 459 http_parse_host(const char *buf, isc_url_parser_t *up, int found_at) { 460 host_state_t s; 461 const char *p = NULL; 462 size_t buflen = up->field_data[ISC_UF_HOST].off + 463 up->field_data[ISC_UF_HOST].len; 464 465 REQUIRE((up->field_set & (1 << ISC_UF_HOST)) != 0); 466 467 up->field_data[ISC_UF_HOST].len = 0; 468 469 s = found_at ? s_http_userinfo_start : s_http_host_start; 470 471 for (p = buf + up->field_data[ISC_UF_HOST].off; p < buf + buflen; p++) { 472 host_state_t new_s = http_parse_host_char(s, *p); 473 474 if (new_s == s_http_host_dead) { 475 return (ISC_R_FAILURE); 476 } 477 478 switch (new_s) { 479 case s_http_host: 480 if (s != s_http_host) { 481 up->field_data[ISC_UF_HOST].off = 482 (uint16_t)(p - buf); 483 } 484 up->field_data[ISC_UF_HOST].len++; 485 break; 486 487 case s_http_host_v6: 488 if (s != s_http_host_v6) { 489 up->field_data[ISC_UF_HOST].off = 490 (uint16_t)(p - buf); 491 } 492 up->field_data[ISC_UF_HOST].len++; 493 break; 494 495 case s_http_host_v6_zone_start: 496 case s_http_host_v6_zone: 497 up->field_data[ISC_UF_HOST].len++; 498 break; 499 500 case s_http_host_port: 501 if (s != s_http_host_port) { 502 up->field_data[ISC_UF_PORT].off = 503 (uint16_t)(p - buf); 504 up->field_data[ISC_UF_PORT].len = 0; 505 up->field_set |= (1 << ISC_UF_PORT); 506 } 507 up->field_data[ISC_UF_PORT].len++; 508 break; 509 510 case s_http_userinfo: 511 if (s != s_http_userinfo) { 512 up->field_data[ISC_UF_USERINFO].off = 513 (uint16_t)(p - buf); 514 up->field_data[ISC_UF_USERINFO].len = 0; 515 up->field_set |= (1 << ISC_UF_USERINFO); 516 } 517 up->field_data[ISC_UF_USERINFO].len++; 518 break; 519 520 default: 521 break; 522 } 523 524 s = new_s; 525 } 526 527 /* Make sure we don't end somewhere unexpected */ 528 switch (s) { 529 case s_http_host_start: 530 case s_http_host_v6_start: 531 case s_http_host_v6: 532 case s_http_host_v6_zone_start: 533 case s_http_host_v6_zone: 534 case s_http_host_port_start: 535 case s_http_userinfo: 536 case s_http_userinfo_start: 537 return (ISC_R_FAILURE); 538 default: 539 break; 540 } 541 542 return (ISC_R_SUCCESS); 543 } 544 545 isc_result_t 546 isc_url_parse(const char *buf, size_t buflen, bool is_connect, 547 isc_url_parser_t *up) { 548 state_t s; 549 isc_url_field_t uf, old_uf; 550 int found_at = 0; 551 const char *p = NULL; 552 553 if (buflen == 0) { 554 return (ISC_R_FAILURE); 555 } 556 557 up->port = up->field_set = 0; 558 s = is_connect ? s_req_server_start : s_req_spaces_before_url; 559 old_uf = ISC_UF_MAX; 560 561 for (p = buf; p < buf + buflen; p++) { 562 s = parse_url_char(s, *p); 563 564 /* Figure out the next field that we're operating on */ 565 switch (s) { 566 case s_dead: 567 return (ISC_R_FAILURE); 568 569 /* Skip delimiters */ 570 case s_req_schema_slash: 571 case s_req_schema_slash_slash: 572 case s_req_server_start: 573 case s_req_query_string_start: 574 case s_req_fragment_start: 575 continue; 576 577 case s_req_schema: 578 uf = ISC_UF_SCHEMA; 579 break; 580 581 case s_req_server_with_at: 582 found_at = 1; 583 FALLTHROUGH; 584 case s_req_server: 585 uf = ISC_UF_HOST; 586 break; 587 588 case s_req_path: 589 uf = ISC_UF_PATH; 590 break; 591 592 case s_req_query_string: 593 uf = ISC_UF_QUERY; 594 break; 595 596 case s_req_fragment: 597 uf = ISC_UF_FRAGMENT; 598 break; 599 600 default: 601 UNREACHABLE(); 602 } 603 604 /* Nothing's changed; soldier on */ 605 if (uf == old_uf) { 606 up->field_data[uf].len++; 607 continue; 608 } 609 610 up->field_data[uf].off = (uint16_t)(p - buf); 611 up->field_data[uf].len = 1; 612 613 up->field_set |= (1 << uf); 614 old_uf = uf; 615 } 616 617 /* host must be present if there is a schema */ 618 /* parsing http:///toto will fail */ 619 if ((up->field_set & (1 << ISC_UF_SCHEMA)) && 620 (up->field_set & (1 << ISC_UF_HOST)) == 0) 621 { 622 return (ISC_R_FAILURE); 623 } 624 625 if (up->field_set & (1 << ISC_UF_HOST)) { 626 isc_result_t result; 627 628 result = http_parse_host(buf, up, found_at); 629 if (result != ISC_R_SUCCESS) { 630 return (result); 631 } 632 } 633 634 /* CONNECT requests can only contain "hostname:port" */ 635 if (is_connect && 636 up->field_set != ((1 << ISC_UF_HOST) | (1 << ISC_UF_PORT))) 637 { 638 return (ISC_R_FAILURE); 639 } 640 641 if (up->field_set & (1 << ISC_UF_PORT)) { 642 uint16_t off; 643 uint16_t len; 644 const char *pp = NULL; 645 const char *end = NULL; 646 unsigned long v; 647 648 off = up->field_data[ISC_UF_PORT].off; 649 len = up->field_data[ISC_UF_PORT].len; 650 end = buf + off + len; 651 652 /* 653 * NOTE: The characters are already validated and are in the 654 * [0-9] range 655 */ 656 INSIST(off + len <= buflen); 657 658 v = 0; 659 for (pp = buf + off; pp < end; pp++) { 660 v *= 10; 661 v += *pp - '0'; 662 663 /* Ports have a max value of 2^16 */ 664 if (v > 0xffff) { 665 return (ISC_R_RANGE); 666 } 667 } 668 669 up->port = (uint16_t)v; 670 } 671 672 return (ISC_R_SUCCESS); 673 } 674