1 /* $OpenBSD: in_pcb.c,v 1.277 2023/06/24 20:54:46 bluhm Exp $ */ 2 /* $NetBSD: in_pcb.c,v 1.25 1996/02/13 23:41:53 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1991, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 33 * 34 * NRL grants permission for redistribution and use in source and binary 35 * forms, with or without modification, of the software and documentation 36 * created at NRL provided that the following conditions are met: 37 * 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 3. All advertising materials mentioning features or use of this software 44 * must display the following acknowledgements: 45 * This product includes software developed by the University of 46 * California, Berkeley and its contributors. 47 * This product includes software developed at the Information 48 * Technology Division, US Naval Research Laboratory. 49 * 4. Neither the name of the NRL nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 54 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 55 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 56 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 57 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 58 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 59 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 60 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 61 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 62 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 63 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 64 * 65 * The views and conclusions contained in the software and documentation 66 * are those of the authors and should not be interpreted as representing 67 * official policies, either expressed or implied, of the US Naval 68 * Research Laboratory (NRL). 69 */ 70 71 #include "pf.h" 72 73 #include <sys/param.h> 74 #include <sys/systm.h> 75 #include <sys/mbuf.h> 76 #include <sys/protosw.h> 77 #include <sys/socket.h> 78 #include <sys/socketvar.h> 79 #include <sys/domain.h> 80 #include <sys/mount.h> 81 #include <sys/pool.h> 82 #include <sys/proc.h> 83 84 #include <net/if.h> 85 #include <net/if_var.h> 86 #include <net/pfvar.h> 87 #include <net/route.h> 88 89 #include <netinet/in.h> 90 #include <netinet/in_var.h> 91 #include <netinet/ip.h> 92 #include <netinet/ip_var.h> 93 #include <netinet/in_pcb.h> 94 #ifdef IPSEC 95 #include <netinet/ip_esp.h> 96 #endif /* IPSEC */ 97 98 #include "stoeplitz.h" 99 #if NSTOEPLITZ > 0 100 #include <net/toeplitz.h> 101 #endif 102 103 const struct in_addr zeroin_addr; 104 105 union { 106 struct in_addr za_in; 107 struct in6_addr za_in6; 108 } zeroin46_addr; 109 110 /* 111 * These configure the range of local port addresses assigned to 112 * "unspecified" outgoing connections/packets/whatever. 113 */ 114 int ipport_firstauto = IPPORT_RESERVED; 115 int ipport_lastauto = IPPORT_USERRESERVED; 116 int ipport_hifirstauto = IPPORT_HIFIRSTAUTO; 117 int ipport_hilastauto = IPPORT_HILASTAUTO; 118 119 struct baddynamicports baddynamicports; 120 struct baddynamicports rootonlyports; 121 struct pool inpcb_pool; 122 123 void in_pcbhash_insert(struct inpcb *); 124 struct inpcb *in_pcbhash_lookup(struct inpcbtable *, uint64_t, u_int, 125 const struct in_addr *, u_short, const struct in_addr *, u_short); 126 int in_pcbresize(struct inpcbtable *, int); 127 128 #define INPCBHASH_LOADFACTOR(_x) (((_x) * 3) / 4) 129 130 uint64_t in_pcbhash(struct inpcbtable *, u_int, 131 const struct in_addr *, u_short, const struct in_addr *, u_short); 132 uint64_t in_pcblhash(struct inpcbtable *, u_int, u_short); 133 134 /* 135 * in_pcb is used for inet and inet6. in6_pcb only contains special 136 * IPv6 cases. So the internet initializer is used for both domains. 137 */ 138 void 139 in_init(void) 140 { 141 pool_init(&inpcb_pool, sizeof(struct inpcb), 0, 142 IPL_SOFTNET, 0, "inpcb", NULL); 143 } 144 145 uint64_t 146 in_pcbhash(struct inpcbtable *table, u_int rdomain, 147 const struct in_addr *faddr, u_short fport, 148 const struct in_addr *laddr, u_short lport) 149 { 150 SIPHASH_CTX ctx; 151 u_int32_t nrdom = htonl(rdomain); 152 153 SipHash24_Init(&ctx, &table->inpt_key); 154 SipHash24_Update(&ctx, &nrdom, sizeof(nrdom)); 155 SipHash24_Update(&ctx, faddr, sizeof(*faddr)); 156 SipHash24_Update(&ctx, &fport, sizeof(fport)); 157 SipHash24_Update(&ctx, laddr, sizeof(*laddr)); 158 SipHash24_Update(&ctx, &lport, sizeof(lport)); 159 return SipHash24_End(&ctx); 160 } 161 162 uint64_t 163 in_pcblhash(struct inpcbtable *table, u_int rdomain, u_short lport) 164 { 165 SIPHASH_CTX ctx; 166 u_int32_t nrdom = htonl(rdomain); 167 168 SipHash24_Init(&ctx, &table->inpt_lkey); 169 SipHash24_Update(&ctx, &nrdom, sizeof(nrdom)); 170 SipHash24_Update(&ctx, &lport, sizeof(lport)); 171 return SipHash24_End(&ctx); 172 } 173 174 void 175 in_pcbinit(struct inpcbtable *table, int hashsize) 176 { 177 mtx_init(&table->inpt_mtx, IPL_SOFTNET); 178 rw_init(&table->inpt_notify, "inpnotify"); 179 TAILQ_INIT(&table->inpt_queue); 180 table->inpt_hashtbl = hashinit(hashsize, M_PCB, M_WAITOK, 181 &table->inpt_mask); 182 table->inpt_lhashtbl = hashinit(hashsize, M_PCB, M_WAITOK, 183 &table->inpt_lmask); 184 table->inpt_count = 0; 185 table->inpt_size = hashsize; 186 arc4random_buf(&table->inpt_key, sizeof(table->inpt_key)); 187 arc4random_buf(&table->inpt_lkey, sizeof(table->inpt_lkey)); 188 } 189 190 /* 191 * Check if the specified port is invalid for dynamic allocation. 192 */ 193 int 194 in_baddynamic(u_int16_t port, u_int16_t proto) 195 { 196 switch (proto) { 197 case IPPROTO_TCP: 198 return (DP_ISSET(baddynamicports.tcp, port)); 199 case IPPROTO_UDP: 200 #ifdef IPSEC 201 /* Cannot preset this as it is a sysctl */ 202 if (port == udpencap_port) 203 return (1); 204 #endif 205 return (DP_ISSET(baddynamicports.udp, port)); 206 default: 207 return (0); 208 } 209 } 210 211 int 212 in_rootonly(u_int16_t port, u_int16_t proto) 213 { 214 switch (proto) { 215 case IPPROTO_TCP: 216 return (port < IPPORT_RESERVED || 217 DP_ISSET(rootonlyports.tcp, port)); 218 case IPPROTO_UDP: 219 return (port < IPPORT_RESERVED || 220 DP_ISSET(rootonlyports.udp, port)); 221 default: 222 return (0); 223 } 224 } 225 226 int 227 in_pcballoc(struct socket *so, struct inpcbtable *table, int wait) 228 { 229 struct inpcb *inp; 230 231 inp = pool_get(&inpcb_pool, (wait == M_WAIT ? PR_WAITOK : PR_NOWAIT) | 232 PR_ZERO); 233 if (inp == NULL) 234 return (ENOBUFS); 235 inp->inp_table = table; 236 inp->inp_socket = so; 237 refcnt_init_trace(&inp->inp_refcnt, DT_REFCNT_IDX_INPCB); 238 mtx_init(&inp->inp_mtx, IPL_SOFTNET); 239 inp->inp_seclevel[SL_AUTH] = IPSEC_AUTH_LEVEL_DEFAULT; 240 inp->inp_seclevel[SL_ESP_TRANS] = IPSEC_ESP_TRANS_LEVEL_DEFAULT; 241 inp->inp_seclevel[SL_ESP_NETWORK] = IPSEC_ESP_NETWORK_LEVEL_DEFAULT; 242 inp->inp_seclevel[SL_IPCOMP] = IPSEC_IPCOMP_LEVEL_DEFAULT; 243 inp->inp_rtableid = curproc->p_p->ps_rtableid; 244 inp->inp_hops = -1; 245 #ifdef INET6 246 /* 247 * Small change in this function to set the INP_IPV6 flag so routines 248 * outside pcb-specific routines don't need to use sotopf(), and all 249 * of its pointer chasing, later. 250 */ 251 if (sotopf(so) == PF_INET6) 252 inp->inp_flags = INP_IPV6; 253 inp->inp_cksum6 = -1; 254 #endif /* INET6 */ 255 256 mtx_enter(&table->inpt_mtx); 257 if (table->inpt_count++ > INPCBHASH_LOADFACTOR(table->inpt_size)) 258 (void)in_pcbresize(table, table->inpt_size * 2); 259 TAILQ_INSERT_HEAD(&table->inpt_queue, inp, inp_queue); 260 in_pcbhash_insert(inp); 261 mtx_leave(&table->inpt_mtx); 262 263 so->so_pcb = inp; 264 265 return (0); 266 } 267 268 int 269 in_pcbbind(struct inpcb *inp, struct mbuf *nam, struct proc *p) 270 { 271 struct socket *so = inp->inp_socket; 272 u_int16_t lport = 0; 273 int wild = 0; 274 void *laddr = &zeroin46_addr; 275 int error; 276 277 if (inp->inp_lport) 278 return (EINVAL); 279 280 if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0 && 281 ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0 || 282 (so->so_options & SO_ACCEPTCONN) == 0)) 283 wild = INPLOOKUP_WILDCARD; 284 285 switch (sotopf(so)) { 286 #ifdef INET6 287 case PF_INET6: 288 if (!IN6_IS_ADDR_UNSPECIFIED(&inp->inp_laddr6)) 289 return (EINVAL); 290 wild |= INPLOOKUP_IPV6; 291 292 if (nam) { 293 struct sockaddr_in6 *sin6; 294 295 if ((error = in6_nam2sin6(nam, &sin6))) 296 return (error); 297 if ((error = in6_pcbaddrisavail(inp, sin6, wild, p))) 298 return (error); 299 laddr = &sin6->sin6_addr; 300 lport = sin6->sin6_port; 301 } 302 break; 303 #endif 304 case PF_INET: 305 if (inp->inp_laddr.s_addr != INADDR_ANY) 306 return (EINVAL); 307 308 if (nam) { 309 struct sockaddr_in *sin; 310 311 if ((error = in_nam2sin(nam, &sin))) 312 return (error); 313 if ((error = in_pcbaddrisavail(inp, sin, wild, p))) 314 return (error); 315 laddr = &sin->sin_addr; 316 lport = sin->sin_port; 317 } 318 break; 319 default: 320 return (EINVAL); 321 } 322 323 if (lport == 0) { 324 if ((error = in_pcbpickport(&lport, laddr, wild, inp, p))) 325 return (error); 326 } else { 327 if (in_rootonly(ntohs(lport), so->so_proto->pr_protocol) && 328 suser(p) != 0) 329 return (EACCES); 330 } 331 if (nam) { 332 switch (sotopf(so)) { 333 #ifdef INET6 334 case PF_INET6: 335 inp->inp_laddr6 = *(struct in6_addr *)laddr; 336 break; 337 #endif 338 case PF_INET: 339 inp->inp_laddr = *(struct in_addr *)laddr; 340 break; 341 } 342 } 343 inp->inp_lport = lport; 344 in_pcbrehash(inp); 345 return (0); 346 } 347 348 int 349 in_pcbaddrisavail(struct inpcb *inp, struct sockaddr_in *sin, int wild, 350 struct proc *p) 351 { 352 struct socket *so = inp->inp_socket; 353 struct inpcbtable *table = inp->inp_table; 354 u_int16_t lport = sin->sin_port; 355 int reuseport = (so->so_options & SO_REUSEPORT); 356 357 if (IN_MULTICAST(sin->sin_addr.s_addr)) { 358 /* 359 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; 360 * allow complete duplication of binding if 361 * SO_REUSEPORT is set, or if SO_REUSEADDR is set 362 * and a multicast address is bound on both 363 * new and duplicated sockets. 364 */ 365 if (so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) 366 reuseport = SO_REUSEADDR|SO_REUSEPORT; 367 } else if (sin->sin_addr.s_addr != INADDR_ANY) { 368 /* 369 * we must check that we are binding to an address we 370 * own except when: 371 * - SO_BINDANY is set or 372 * - we are binding a UDP socket to 255.255.255.255 or 373 * - we are binding a UDP socket to one of our broadcast 374 * addresses 375 */ 376 if (!ISSET(so->so_options, SO_BINDANY) && 377 !(so->so_type == SOCK_DGRAM && 378 sin->sin_addr.s_addr == INADDR_BROADCAST) && 379 !(so->so_type == SOCK_DGRAM && 380 in_broadcast(sin->sin_addr, inp->inp_rtableid))) { 381 struct ifaddr *ia; 382 383 sin->sin_port = 0; 384 memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); 385 ia = ifa_ifwithaddr(sintosa(sin), inp->inp_rtableid); 386 sin->sin_port = lport; 387 388 if (ia == NULL) 389 return (EADDRNOTAVAIL); 390 } 391 } 392 if (lport) { 393 struct inpcb *t; 394 int error = 0; 395 396 if (so->so_euid && !IN_MULTICAST(sin->sin_addr.s_addr)) { 397 t = in_pcblookup_local(table, &sin->sin_addr, lport, 398 INPLOOKUP_WILDCARD, inp->inp_rtableid); 399 if (t && (so->so_euid != t->inp_socket->so_euid)) 400 error = EADDRINUSE; 401 in_pcbunref(t); 402 if (error) 403 return (error); 404 } 405 t = in_pcblookup_local(table, &sin->sin_addr, lport, 406 wild, inp->inp_rtableid); 407 if (t && (reuseport & t->inp_socket->so_options) == 0) 408 error = EADDRINUSE; 409 in_pcbunref(t); 410 if (error) 411 return (error); 412 } 413 414 return (0); 415 } 416 417 int 418 in_pcbpickport(u_int16_t *lport, void *laddr, int wild, struct inpcb *inp, 419 struct proc *p) 420 { 421 struct socket *so = inp->inp_socket; 422 struct inpcbtable *table = inp->inp_table; 423 struct inpcb *t; 424 u_int16_t first, last, lower, higher, candidate, localport; 425 int count; 426 427 if (inp->inp_flags & INP_HIGHPORT) { 428 first = ipport_hifirstauto; /* sysctl */ 429 last = ipport_hilastauto; 430 } else if (inp->inp_flags & INP_LOWPORT) { 431 if (suser(p)) 432 return (EACCES); 433 first = IPPORT_RESERVED-1; /* 1023 */ 434 last = 600; /* not IPPORT_RESERVED/2 */ 435 } else { 436 first = ipport_firstauto; /* sysctl */ 437 last = ipport_lastauto; 438 } 439 if (first < last) { 440 lower = first; 441 higher = last; 442 } else { 443 lower = last; 444 higher = first; 445 } 446 447 /* 448 * Simple check to ensure all ports are not used up causing 449 * a deadlock here. 450 */ 451 452 count = higher - lower; 453 candidate = lower + arc4random_uniform(count); 454 455 t = NULL; 456 do { 457 in_pcbunref(t); 458 do { 459 if (count-- < 0) /* completely used? */ 460 return (EADDRNOTAVAIL); 461 ++candidate; 462 if (candidate < lower || candidate > higher) 463 candidate = lower; 464 localport = htons(candidate); 465 } while (in_baddynamic(candidate, so->so_proto->pr_protocol)); 466 t = in_pcblookup_local(table, laddr, localport, wild, 467 inp->inp_rtableid); 468 } while (t != NULL); 469 *lport = localport; 470 471 return (0); 472 } 473 474 /* 475 * Connect from a socket to a specified address. 476 * Both address and port must be specified in argument sin. 477 * If don't have a local address for this socket yet, 478 * then pick one. 479 */ 480 int 481 in_pcbconnect(struct inpcb *inp, struct mbuf *nam) 482 { 483 struct in_addr ina; 484 struct sockaddr_in *sin; 485 struct inpcb *t; 486 int error; 487 488 #ifdef INET6 489 if (sotopf(inp->inp_socket) == PF_INET6) 490 return (in6_pcbconnect(inp, nam)); 491 KASSERT((inp->inp_flags & INP_IPV6) == 0); 492 #endif /* INET6 */ 493 494 if ((error = in_nam2sin(nam, &sin))) 495 return (error); 496 if (sin->sin_port == 0) 497 return (EADDRNOTAVAIL); 498 error = in_pcbselsrc(&ina, sin, inp); 499 if (error) 500 return (error); 501 502 t = in_pcblookup(inp->inp_table, sin->sin_addr, sin->sin_port, 503 ina, inp->inp_lport, inp->inp_rtableid); 504 if (t != NULL) { 505 in_pcbunref(t); 506 return (EADDRINUSE); 507 } 508 509 KASSERT(inp->inp_laddr.s_addr == INADDR_ANY || inp->inp_lport); 510 511 if (inp->inp_laddr.s_addr == INADDR_ANY) { 512 if (inp->inp_lport == 0) { 513 error = in_pcbbind(inp, NULL, curproc); 514 if (error) 515 return (error); 516 t = in_pcblookup(inp->inp_table, sin->sin_addr, 517 sin->sin_port, ina, inp->inp_lport, 518 inp->inp_rtableid); 519 if (t != NULL) { 520 inp->inp_lport = 0; 521 in_pcbunref(t); 522 return (EADDRINUSE); 523 } 524 } 525 inp->inp_laddr = ina; 526 } 527 inp->inp_faddr = sin->sin_addr; 528 inp->inp_fport = sin->sin_port; 529 in_pcbrehash(inp); 530 #if NSTOEPLITZ > 0 531 inp->inp_flowid = stoeplitz_ip4port(inp->inp_faddr.s_addr, 532 inp->inp_laddr.s_addr, inp->inp_fport, inp->inp_lport); 533 #endif 534 return (0); 535 } 536 537 void 538 in_pcbdisconnect(struct inpcb *inp) 539 { 540 #if NPF > 0 541 if (inp->inp_pf_sk) { 542 pf_remove_divert_state(inp->inp_pf_sk); 543 /* pf_remove_divert_state() may have detached the state */ 544 pf_inp_unlink(inp); 545 } 546 #endif 547 switch (sotopf(inp->inp_socket)) { 548 #ifdef INET6 549 case PF_INET6: 550 inp->inp_faddr6 = in6addr_any; 551 break; 552 #endif 553 case PF_INET: 554 inp->inp_faddr.s_addr = INADDR_ANY; 555 break; 556 } 557 558 inp->inp_fport = 0; 559 inp->inp_flowid = 0; 560 in_pcbrehash(inp); 561 if (inp->inp_socket->so_state & SS_NOFDREF) 562 in_pcbdetach(inp); 563 } 564 565 void 566 in_pcbdetach(struct inpcb *inp) 567 { 568 struct socket *so = inp->inp_socket; 569 struct inpcbtable *table = inp->inp_table; 570 571 so->so_pcb = NULL; 572 /* 573 * As long as the NET_LOCK() is the default lock for Internet 574 * sockets, do not release it to not introduce new sleeping 575 * points. 576 */ 577 sofree(so, 1); 578 m_freem(inp->inp_options); 579 if (inp->inp_route.ro_rt) { 580 rtfree(inp->inp_route.ro_rt); 581 inp->inp_route.ro_rt = NULL; 582 } 583 #ifdef INET6 584 if (inp->inp_flags & INP_IPV6) { 585 ip6_freepcbopts(inp->inp_outputopts6); 586 ip6_freemoptions(inp->inp_moptions6); 587 } else 588 #endif 589 ip_freemoptions(inp->inp_moptions); 590 #if NPF > 0 591 if (inp->inp_pf_sk) { 592 pf_remove_divert_state(inp->inp_pf_sk); 593 /* pf_remove_divert_state() may have detached the state */ 594 pf_inp_unlink(inp); 595 } 596 #endif 597 mtx_enter(&table->inpt_mtx); 598 LIST_REMOVE(inp, inp_lhash); 599 LIST_REMOVE(inp, inp_hash); 600 TAILQ_REMOVE(&table->inpt_queue, inp, inp_queue); 601 table->inpt_count--; 602 mtx_leave(&table->inpt_mtx); 603 604 in_pcbunref(inp); 605 } 606 607 struct inpcb * 608 in_pcbref(struct inpcb *inp) 609 { 610 if (inp == NULL) 611 return NULL; 612 refcnt_take(&inp->inp_refcnt); 613 return inp; 614 } 615 616 void 617 in_pcbunref(struct inpcb *inp) 618 { 619 if (inp == NULL) 620 return; 621 if (refcnt_rele(&inp->inp_refcnt) == 0) 622 return; 623 KASSERT((LIST_NEXT(inp, inp_hash) == NULL) || 624 (LIST_NEXT(inp, inp_hash) == _Q_INVALID)); 625 KASSERT((LIST_NEXT(inp, inp_lhash) == NULL) || 626 (LIST_NEXT(inp, inp_lhash) == _Q_INVALID)); 627 KASSERT((TAILQ_NEXT(inp, inp_queue) == NULL) || 628 (TAILQ_NEXT(inp, inp_queue) == _Q_INVALID)); 629 pool_put(&inpcb_pool, inp); 630 } 631 632 void 633 in_setsockaddr(struct inpcb *inp, struct mbuf *nam) 634 { 635 struct sockaddr_in *sin; 636 637 nam->m_len = sizeof(*sin); 638 sin = mtod(nam, struct sockaddr_in *); 639 memset(sin, 0, sizeof(*sin)); 640 sin->sin_family = AF_INET; 641 sin->sin_len = sizeof(*sin); 642 sin->sin_port = inp->inp_lport; 643 sin->sin_addr = inp->inp_laddr; 644 } 645 646 void 647 in_setpeeraddr(struct inpcb *inp, struct mbuf *nam) 648 { 649 struct sockaddr_in *sin; 650 651 #ifdef INET6 652 if (sotopf(inp->inp_socket) == PF_INET6) { 653 in6_setpeeraddr(inp, nam); 654 return; 655 } 656 #endif /* INET6 */ 657 658 nam->m_len = sizeof(*sin); 659 sin = mtod(nam, struct sockaddr_in *); 660 memset(sin, 0, sizeof(*sin)); 661 sin->sin_family = AF_INET; 662 sin->sin_len = sizeof(*sin); 663 sin->sin_port = inp->inp_fport; 664 sin->sin_addr = inp->inp_faddr; 665 } 666 667 int 668 in_sockaddr(struct socket *so, struct mbuf *nam) 669 { 670 struct inpcb *inp; 671 672 inp = sotoinpcb(so); 673 in_setsockaddr(inp, nam); 674 675 return (0); 676 } 677 678 int 679 in_peeraddr(struct socket *so, struct mbuf *nam) 680 { 681 struct inpcb *inp; 682 683 inp = sotoinpcb(so); 684 in_setpeeraddr(inp, nam); 685 686 return (0); 687 } 688 689 /* 690 * Pass some notification to all connections of a protocol 691 * associated with address dst. The "usual action" will be 692 * taken, depending on the ctlinput cmd. The caller must filter any 693 * cmds that are uninteresting (e.g., no error in the map). 694 * Call the protocol specific routine (if any) to report 695 * any errors for each matching socket. 696 */ 697 void 698 in_pcbnotifyall(struct inpcbtable *table, struct sockaddr *dst, u_int rtable, 699 int errno, void (*notify)(struct inpcb *, int)) 700 { 701 SIMPLEQ_HEAD(, inpcb) inpcblist; 702 struct inpcb *inp; 703 struct in_addr faddr; 704 u_int rdomain; 705 706 if (dst->sa_family != AF_INET) 707 return; 708 faddr = satosin(dst)->sin_addr; 709 if (faddr.s_addr == INADDR_ANY) 710 return; 711 if (notify == NULL) 712 return; 713 714 /* 715 * Use a temporary notify list protected by rwlock to run over 716 * selected PCB. This is necessary as the list of all PCB is 717 * protected by a mutex. Notify may call ip_output() eventually 718 * which may sleep as pf lock is a rwlock. Also the SRP 719 * implementation of the routing table might sleep. 720 * The same inp_notify list entry and inpt_notify rwlock are 721 * used for UDP multicast and raw IP delivery. 722 */ 723 SIMPLEQ_INIT(&inpcblist); 724 rdomain = rtable_l2(rtable); 725 rw_enter_write(&table->inpt_notify); 726 mtx_enter(&table->inpt_mtx); 727 TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) { 728 #ifdef INET6 729 if (inp->inp_flags & INP_IPV6) 730 continue; 731 #endif 732 if (inp->inp_faddr.s_addr != faddr.s_addr || 733 rtable_l2(inp->inp_rtableid) != rdomain || 734 inp->inp_socket == NULL) { 735 continue; 736 } 737 in_pcbref(inp); 738 SIMPLEQ_INSERT_TAIL(&inpcblist, inp, inp_notify); 739 } 740 mtx_leave(&table->inpt_mtx); 741 742 while ((inp = SIMPLEQ_FIRST(&inpcblist)) != NULL) { 743 SIMPLEQ_REMOVE_HEAD(&inpcblist, inp_notify); 744 (*notify)(inp, errno); 745 in_pcbunref(inp); 746 } 747 rw_exit_write(&table->inpt_notify); 748 } 749 750 /* 751 * Check for alternatives when higher level complains 752 * about service problems. For now, invalidate cached 753 * routing information. If the route was created dynamically 754 * (by a redirect), time to try a default gateway again. 755 */ 756 void 757 in_losing(struct inpcb *inp) 758 { 759 struct rtentry *rt = inp->inp_route.ro_rt; 760 761 if (rt) { 762 inp->inp_route.ro_rt = NULL; 763 764 if (rt->rt_flags & RTF_DYNAMIC) { 765 struct ifnet *ifp; 766 767 ifp = if_get(rt->rt_ifidx); 768 /* 769 * If the interface is gone, all its attached 770 * route entries have been removed from the table, 771 * so we're dealing with a stale cache and have 772 * nothing to do. 773 */ 774 if (ifp != NULL) 775 rtdeletemsg(rt, ifp, inp->inp_rtableid); 776 if_put(ifp); 777 } 778 /* 779 * A new route can be allocated 780 * the next time output is attempted. 781 * rtfree() needs to be called in anycase because the inp 782 * is still holding a reference to rt. 783 */ 784 rtfree(rt); 785 } 786 } 787 788 /* 789 * After a routing change, flush old routing 790 * and allocate a (hopefully) better one. 791 */ 792 void 793 in_rtchange(struct inpcb *inp, int errno) 794 { 795 if (inp->inp_route.ro_rt) { 796 rtfree(inp->inp_route.ro_rt); 797 inp->inp_route.ro_rt = NULL; 798 /* 799 * A new route can be allocated the next time 800 * output is attempted. 801 */ 802 } 803 } 804 805 struct inpcb * 806 in_pcblookup_local(struct inpcbtable *table, void *laddrp, u_int lport_arg, 807 int flags, u_int rtable) 808 { 809 struct inpcb *inp, *match = NULL; 810 int matchwild = 3, wildcard; 811 u_int16_t lport = lport_arg; 812 struct in_addr laddr = *(struct in_addr *)laddrp; 813 #ifdef INET6 814 struct in6_addr *laddr6 = (struct in6_addr *)laddrp; 815 #endif 816 struct inpcbhead *head; 817 uint64_t lhash; 818 u_int rdomain; 819 820 rdomain = rtable_l2(rtable); 821 lhash = in_pcblhash(table, rdomain, lport); 822 823 mtx_enter(&table->inpt_mtx); 824 head = &table->inpt_lhashtbl[lhash & table->inpt_lmask]; 825 LIST_FOREACH(inp, head, inp_lhash) { 826 if (rtable_l2(inp->inp_rtableid) != rdomain) 827 continue; 828 if (inp->inp_lport != lport) 829 continue; 830 wildcard = 0; 831 #ifdef INET6 832 if (ISSET(flags, INPLOOKUP_IPV6)) { 833 if (!ISSET(inp->inp_flags, INP_IPV6)) 834 continue; 835 836 if (!IN6_IS_ADDR_UNSPECIFIED(&inp->inp_faddr6)) 837 wildcard++; 838 839 if (!IN6_ARE_ADDR_EQUAL(&inp->inp_laddr6, laddr6)) { 840 if (IN6_IS_ADDR_UNSPECIFIED(&inp->inp_laddr6) || 841 IN6_IS_ADDR_UNSPECIFIED(laddr6)) 842 wildcard++; 843 else 844 continue; 845 } 846 847 } else 848 #endif /* INET6 */ 849 { 850 #ifdef INET6 851 if (ISSET(inp->inp_flags, INP_IPV6)) 852 continue; 853 #endif /* INET6 */ 854 855 if (inp->inp_faddr.s_addr != INADDR_ANY) 856 wildcard++; 857 858 if (inp->inp_laddr.s_addr != laddr.s_addr) { 859 if (inp->inp_laddr.s_addr == INADDR_ANY || 860 laddr.s_addr == INADDR_ANY) 861 wildcard++; 862 else 863 continue; 864 } 865 866 } 867 if ((!wildcard || (flags & INPLOOKUP_WILDCARD)) && 868 wildcard < matchwild) { 869 match = inp; 870 if ((matchwild = wildcard) == 0) 871 break; 872 } 873 } 874 in_pcbref(match); 875 mtx_leave(&table->inpt_mtx); 876 877 return (match); 878 } 879 880 struct rtentry * 881 in_pcbrtentry(struct inpcb *inp) 882 { 883 struct route *ro; 884 885 ro = &inp->inp_route; 886 887 /* check if route is still valid */ 888 if (!rtisvalid(ro->ro_rt)) { 889 rtfree(ro->ro_rt); 890 ro->ro_rt = NULL; 891 } 892 893 /* 894 * No route yet, so try to acquire one. 895 */ 896 if (ro->ro_rt == NULL) { 897 #ifdef INET6 898 memset(ro, 0, sizeof(struct route_in6)); 899 #else 900 memset(ro, 0, sizeof(struct route)); 901 #endif 902 903 switch(sotopf(inp->inp_socket)) { 904 #ifdef INET6 905 case PF_INET6: 906 if (IN6_IS_ADDR_UNSPECIFIED(&inp->inp_faddr6)) 907 break; 908 ro->ro_dst.sa_family = AF_INET6; 909 ro->ro_dst.sa_len = sizeof(struct sockaddr_in6); 910 satosin6(&ro->ro_dst)->sin6_addr = inp->inp_faddr6; 911 ro->ro_tableid = inp->inp_rtableid; 912 ro->ro_rt = rtalloc_mpath(&ro->ro_dst, 913 &inp->inp_laddr6.s6_addr32[0], ro->ro_tableid); 914 break; 915 #endif /* INET6 */ 916 case PF_INET: 917 if (inp->inp_faddr.s_addr == INADDR_ANY) 918 break; 919 ro->ro_dst.sa_family = AF_INET; 920 ro->ro_dst.sa_len = sizeof(struct sockaddr_in); 921 satosin(&ro->ro_dst)->sin_addr = inp->inp_faddr; 922 ro->ro_tableid = inp->inp_rtableid; 923 ro->ro_rt = rtalloc_mpath(&ro->ro_dst, 924 &inp->inp_laddr.s_addr, ro->ro_tableid); 925 break; 926 } 927 } 928 return (ro->ro_rt); 929 } 930 931 /* 932 * Return an IPv4 address, which is the most appropriate for a given 933 * destination. 934 * If necessary, this function lookups the routing table and returns 935 * an entry to the caller for later use. 936 */ 937 int 938 in_pcbselsrc(struct in_addr *insrc, struct sockaddr_in *sin, 939 struct inpcb *inp) 940 { 941 struct ip_moptions *mopts = inp->inp_moptions; 942 struct route *ro = &inp->inp_route; 943 struct in_addr *laddr = &inp->inp_laddr; 944 u_int rtableid = inp->inp_rtableid; 945 struct sockaddr *ip4_source = NULL; 946 947 struct sockaddr_in *sin2; 948 struct in_ifaddr *ia = NULL; 949 950 /* 951 * If the socket(if any) is already bound, use that bound address 952 * unless it is INADDR_ANY or INADDR_BROADCAST. 953 */ 954 if (laddr->s_addr != INADDR_ANY && 955 laddr->s_addr != INADDR_BROADCAST) { 956 *insrc = *laddr; 957 return (0); 958 } 959 960 /* 961 * If the destination address is multicast or limited 962 * broadcast (255.255.255.255) and an outgoing interface has 963 * been set as a multicast option, use the address of that 964 * interface as our source address. 965 */ 966 if ((IN_MULTICAST(sin->sin_addr.s_addr) || 967 sin->sin_addr.s_addr == INADDR_BROADCAST) && mopts != NULL) { 968 struct ifnet *ifp; 969 970 ifp = if_get(mopts->imo_ifidx); 971 if (ifp != NULL) { 972 if (ifp->if_rdomain == rtable_l2(rtableid)) 973 IFP_TO_IA(ifp, ia); 974 if (ia == NULL) { 975 if_put(ifp); 976 return (EADDRNOTAVAIL); 977 } 978 979 *insrc = ia->ia_addr.sin_addr; 980 if_put(ifp); 981 return (0); 982 } 983 } 984 985 /* 986 * If route is known or can be allocated now, 987 * our src addr is taken from the i/f, else punt. 988 */ 989 if (!rtisvalid(ro->ro_rt) || (ro->ro_tableid != rtableid) || 990 (satosin(&ro->ro_dst)->sin_addr.s_addr != sin->sin_addr.s_addr)) { 991 rtfree(ro->ro_rt); 992 ro->ro_rt = NULL; 993 } 994 if (ro->ro_rt == NULL) { 995 /* No route yet, so try to acquire one */ 996 ro->ro_dst.sa_family = AF_INET; 997 ro->ro_dst.sa_len = sizeof(struct sockaddr_in); 998 satosin(&ro->ro_dst)->sin_addr = sin->sin_addr; 999 ro->ro_tableid = rtableid; 1000 ro->ro_rt = rtalloc_mpath(&ro->ro_dst, NULL, ro->ro_tableid); 1001 1002 /* 1003 * It is important to zero out the rest of the 1004 * struct sockaddr_in when mixing v6 & v4! 1005 */ 1006 sin2 = satosin(&ro->ro_dst); 1007 memset(sin2->sin_zero, 0, sizeof(sin2->sin_zero)); 1008 } 1009 1010 /* 1011 * If we found a route, use the address 1012 * corresponding to the outgoing interface. 1013 */ 1014 if (ro->ro_rt != NULL) 1015 ia = ifatoia(ro->ro_rt->rt_ifa); 1016 1017 /* 1018 * Use preferred source address if : 1019 * - destination is not onlink 1020 * - preferred source address is set 1021 * - output interface is UP 1022 */ 1023 if (ro->ro_rt && !(ro->ro_rt->rt_flags & RTF_LLINFO) && 1024 !(ro->ro_rt->rt_flags & RTF_HOST)) { 1025 ip4_source = rtable_getsource(rtableid, AF_INET); 1026 if (ip4_source != NULL) { 1027 struct ifaddr *ifa; 1028 if ((ifa = ifa_ifwithaddr(ip4_source, rtableid)) != 1029 NULL && ISSET(ifa->ifa_ifp->if_flags, IFF_UP)) { 1030 *insrc = satosin(ip4_source)->sin_addr; 1031 return (0); 1032 } 1033 } 1034 } 1035 1036 if (ia == NULL) 1037 return (EADDRNOTAVAIL); 1038 1039 *insrc = ia->ia_addr.sin_addr; 1040 return (0); 1041 } 1042 1043 void 1044 in_pcbrehash(struct inpcb *inp) 1045 { 1046 struct inpcbtable *table = inp->inp_table; 1047 1048 mtx_enter(&table->inpt_mtx); 1049 LIST_REMOVE(inp, inp_lhash); 1050 LIST_REMOVE(inp, inp_hash); 1051 in_pcbhash_insert(inp); 1052 mtx_leave(&table->inpt_mtx); 1053 } 1054 1055 void 1056 in_pcbhash_insert(struct inpcb *inp) 1057 { 1058 struct inpcbtable *table = inp->inp_table; 1059 struct inpcbhead *head; 1060 uint64_t hash, lhash; 1061 1062 MUTEX_ASSERT_LOCKED(&table->inpt_mtx); 1063 1064 lhash = in_pcblhash(table, inp->inp_rtableid, inp->inp_lport); 1065 head = &table->inpt_lhashtbl[lhash & table->inpt_lmask]; 1066 LIST_INSERT_HEAD(head, inp, inp_lhash); 1067 #ifdef INET6 1068 if (inp->inp_flags & INP_IPV6) 1069 hash = in6_pcbhash(table, rtable_l2(inp->inp_rtableid), 1070 &inp->inp_faddr6, inp->inp_fport, 1071 &inp->inp_laddr6, inp->inp_lport); 1072 else 1073 #endif /* INET6 */ 1074 hash = in_pcbhash(table, rtable_l2(inp->inp_rtableid), 1075 &inp->inp_faddr, inp->inp_fport, 1076 &inp->inp_laddr, inp->inp_lport); 1077 head = &table->inpt_hashtbl[hash & table->inpt_mask]; 1078 LIST_INSERT_HEAD(head, inp, inp_hash); 1079 } 1080 1081 struct inpcb * 1082 in_pcbhash_lookup(struct inpcbtable *table, uint64_t hash, u_int rdomain, 1083 const struct in_addr *faddr, u_short fport, 1084 const struct in_addr *laddr, u_short lport) 1085 { 1086 struct inpcbhead *head; 1087 struct inpcb *inp; 1088 1089 MUTEX_ASSERT_LOCKED(&table->inpt_mtx); 1090 1091 head = &table->inpt_hashtbl[hash & table->inpt_mask]; 1092 LIST_FOREACH(inp, head, inp_hash) { 1093 #ifdef INET6 1094 if (ISSET(inp->inp_flags, INP_IPV6)) 1095 continue; 1096 #endif 1097 if (inp->inp_fport == fport && inp->inp_lport == lport && 1098 inp->inp_faddr.s_addr == faddr->s_addr && 1099 inp->inp_laddr.s_addr == laddr->s_addr && 1100 rtable_l2(inp->inp_rtableid) == rdomain) { 1101 break; 1102 } 1103 } 1104 if (inp != NULL) { 1105 /* 1106 * Move this PCB to the head of hash chain so that 1107 * repeated accesses are quicker. This is analogous to 1108 * the historic single-entry PCB cache. 1109 */ 1110 if (inp != LIST_FIRST(head)) { 1111 LIST_REMOVE(inp, inp_hash); 1112 LIST_INSERT_HEAD(head, inp, inp_hash); 1113 } 1114 } 1115 return (inp); 1116 } 1117 1118 int 1119 in_pcbresize(struct inpcbtable *table, int hashsize) 1120 { 1121 u_long nmask, nlmask; 1122 int osize; 1123 void *nhashtbl, *nlhashtbl, *ohashtbl, *olhashtbl; 1124 struct inpcb *inp; 1125 1126 MUTEX_ASSERT_LOCKED(&table->inpt_mtx); 1127 1128 ohashtbl = table->inpt_hashtbl; 1129 olhashtbl = table->inpt_lhashtbl; 1130 osize = table->inpt_size; 1131 1132 nhashtbl = hashinit(hashsize, M_PCB, M_NOWAIT, &nmask); 1133 if (nhashtbl == NULL) 1134 return ENOBUFS; 1135 nlhashtbl = hashinit(hashsize, M_PCB, M_NOWAIT, &nlmask); 1136 if (nlhashtbl == NULL) { 1137 hashfree(nhashtbl, hashsize, M_PCB); 1138 return ENOBUFS; 1139 } 1140 table->inpt_hashtbl = nhashtbl; 1141 table->inpt_lhashtbl = nlhashtbl; 1142 table->inpt_mask = nmask; 1143 table->inpt_lmask = nlmask; 1144 table->inpt_size = hashsize; 1145 1146 TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) { 1147 LIST_REMOVE(inp, inp_lhash); 1148 LIST_REMOVE(inp, inp_hash); 1149 in_pcbhash_insert(inp); 1150 } 1151 hashfree(ohashtbl, osize, M_PCB); 1152 hashfree(olhashtbl, osize, M_PCB); 1153 1154 return (0); 1155 } 1156 1157 #ifdef DIAGNOSTIC 1158 int in_pcbnotifymiss = 0; 1159 #endif 1160 1161 /* 1162 * The in(6)_pcblookup functions are used to locate connected sockets 1163 * quickly: 1164 * faddr.fport <-> laddr.lport 1165 * No wildcard matching is done so that listening sockets are not found. 1166 * If the functions return NULL in(6)_pcblookup_listen can be used to 1167 * find a listening/bound socket that may accept the connection. 1168 * After those two lookups no other are necessary. 1169 */ 1170 struct inpcb * 1171 in_pcblookup(struct inpcbtable *table, struct in_addr faddr, 1172 u_int fport, struct in_addr laddr, u_int lport, u_int rtable) 1173 { 1174 struct inpcb *inp; 1175 uint64_t hash; 1176 u_int rdomain; 1177 1178 rdomain = rtable_l2(rtable); 1179 hash = in_pcbhash(table, rdomain, &faddr, fport, &laddr, lport); 1180 1181 mtx_enter(&table->inpt_mtx); 1182 inp = in_pcbhash_lookup(table, hash, rdomain, 1183 &faddr, fport, &laddr, lport); 1184 in_pcbref(inp); 1185 mtx_leave(&table->inpt_mtx); 1186 1187 #ifdef DIAGNOSTIC 1188 if (inp == NULL && in_pcbnotifymiss) { 1189 printf("%s: faddr=%08x fport=%d laddr=%08x lport=%d rdom=%u\n", 1190 __func__, ntohl(faddr.s_addr), ntohs(fport), 1191 ntohl(laddr.s_addr), ntohs(lport), rdomain); 1192 } 1193 #endif 1194 return (inp); 1195 } 1196 1197 /* 1198 * The in(6)_pcblookup_listen functions are used to locate listening 1199 * sockets quickly. This are sockets with unspecified foreign address 1200 * and port: 1201 * *.* <-> laddr.lport 1202 * *.* <-> *.lport 1203 */ 1204 struct inpcb * 1205 in_pcblookup_listen(struct inpcbtable *table, struct in_addr laddr, 1206 u_int lport_arg, struct mbuf *m, u_int rtable) 1207 { 1208 const struct in_addr *key1, *key2; 1209 struct inpcb *inp; 1210 uint64_t hash; 1211 u_int16_t lport = lport_arg; 1212 u_int rdomain; 1213 1214 key1 = &laddr; 1215 key2 = &zeroin_addr; 1216 #if NPF > 0 1217 if (m && m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) { 1218 struct pf_divert *divert; 1219 1220 divert = pf_find_divert(m); 1221 KASSERT(divert != NULL); 1222 switch (divert->type) { 1223 case PF_DIVERT_TO: 1224 key1 = key2 = &divert->addr.v4; 1225 lport = divert->port; 1226 break; 1227 case PF_DIVERT_REPLY: 1228 return (NULL); 1229 default: 1230 panic("%s: unknown divert type %d, mbuf %p, divert %p", 1231 __func__, divert->type, m, divert); 1232 } 1233 } else if (m && m->m_pkthdr.pf.flags & PF_TAG_TRANSLATE_LOCALHOST) { 1234 /* 1235 * Redirected connections should not be treated the same 1236 * as connections directed to 127.0.0.0/8 since localhost 1237 * can only be accessed from the host itself. 1238 * For example portmap(8) grants more permissions for 1239 * connections to the socket bound to 127.0.0.1 than 1240 * to the * socket. 1241 */ 1242 key1 = &zeroin_addr; 1243 key2 = &laddr; 1244 } 1245 #endif 1246 1247 rdomain = rtable_l2(rtable); 1248 hash = in_pcbhash(table, rdomain, &zeroin_addr, 0, key1, lport); 1249 1250 mtx_enter(&table->inpt_mtx); 1251 inp = in_pcbhash_lookup(table, hash, rdomain, 1252 &zeroin_addr, 0, key1, lport); 1253 if (inp == NULL && key1->s_addr != key2->s_addr) { 1254 hash = in_pcbhash(table, rdomain, 1255 &zeroin_addr, 0, key2, lport); 1256 inp = in_pcbhash_lookup(table, hash, rdomain, 1257 &zeroin_addr, 0, key2, lport); 1258 } 1259 in_pcbref(inp); 1260 mtx_leave(&table->inpt_mtx); 1261 1262 #ifdef DIAGNOSTIC 1263 if (inp == NULL && in_pcbnotifymiss) { 1264 printf("%s: laddr=%08x lport=%d rdom=%u\n", 1265 __func__, ntohl(laddr.s_addr), ntohs(lport), rdomain); 1266 } 1267 #endif 1268 return (inp); 1269 } 1270