1 /* $OpenBSD: in_pcb.c,v 1.276 2022/10/03 16:43:52 bluhm Exp $ */ 2 /* $NetBSD: in_pcb.c,v 1.25 1996/02/13 23:41:53 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1991, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 33 * 34 * NRL grants permission for redistribution and use in source and binary 35 * forms, with or without modification, of the software and documentation 36 * created at NRL provided that the following conditions are met: 37 * 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 3. All advertising materials mentioning features or use of this software 44 * must display the following acknowledgements: 45 * This product includes software developed by the University of 46 * California, Berkeley and its contributors. 47 * This product includes software developed at the Information 48 * Technology Division, US Naval Research Laboratory. 49 * 4. Neither the name of the NRL nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 54 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 55 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 56 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 57 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 58 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 59 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 60 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 61 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 62 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 63 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 64 * 65 * The views and conclusions contained in the software and documentation 66 * are those of the authors and should not be interpreted as representing 67 * official policies, either expressed or implied, of the US Naval 68 * Research Laboratory (NRL). 69 */ 70 71 #include "pf.h" 72 73 #include <sys/param.h> 74 #include <sys/systm.h> 75 #include <sys/mbuf.h> 76 #include <sys/protosw.h> 77 #include <sys/socket.h> 78 #include <sys/socketvar.h> 79 #include <sys/domain.h> 80 #include <sys/mount.h> 81 #include <sys/pool.h> 82 #include <sys/proc.h> 83 84 #include <net/if.h> 85 #include <net/if_var.h> 86 #include <net/pfvar.h> 87 #include <net/route.h> 88 89 #include <netinet/in.h> 90 #include <netinet/in_var.h> 91 #include <netinet/ip.h> 92 #include <netinet/ip_var.h> 93 #include <netinet/in_pcb.h> 94 #ifdef IPSEC 95 #include <netinet/ip_esp.h> 96 #endif /* IPSEC */ 97 98 #include "stoeplitz.h" 99 #if NSTOEPLITZ > 0 100 #include <net/toeplitz.h> 101 #endif 102 103 const struct in_addr zeroin_addr; 104 105 union { 106 struct in_addr za_in; 107 struct in6_addr za_in6; 108 } zeroin46_addr; 109 110 /* 111 * These configure the range of local port addresses assigned to 112 * "unspecified" outgoing connections/packets/whatever. 113 */ 114 int ipport_firstauto = IPPORT_RESERVED; 115 int ipport_lastauto = IPPORT_USERRESERVED; 116 int ipport_hifirstauto = IPPORT_HIFIRSTAUTO; 117 int ipport_hilastauto = IPPORT_HILASTAUTO; 118 119 struct baddynamicports baddynamicports; 120 struct baddynamicports rootonlyports; 121 struct pool inpcb_pool; 122 123 void in_pcbhash_insert(struct inpcb *); 124 struct inpcb *in_pcbhash_lookup(struct inpcbtable *, u_int, 125 const struct in_addr *, u_short, const struct in_addr *, u_short); 126 int in_pcbresize(struct inpcbtable *, int); 127 128 #define INPCBHASH_LOADFACTOR(_x) (((_x) * 3) / 4) 129 130 struct inpcbhead *in_pcbhash(struct inpcbtable *, u_int, 131 const struct in_addr *, u_short, const struct in_addr *, u_short); 132 struct inpcbhead *in_pcblhash(struct inpcbtable *, u_int, u_short); 133 134 /* 135 * in_pcb is used for inet and inet6. in6_pcb only contains special 136 * IPv6 cases. So the internet initializer is used for both domains. 137 */ 138 void 139 in_init(void) 140 { 141 pool_init(&inpcb_pool, sizeof(struct inpcb), 0, 142 IPL_SOFTNET, 0, "inpcb", NULL); 143 } 144 145 struct inpcbhead * 146 in_pcbhash(struct inpcbtable *table, u_int rdomain, 147 const struct in_addr *faddr, u_short fport, 148 const struct in_addr *laddr, u_short lport) 149 { 150 SIPHASH_CTX ctx; 151 u_int32_t nrdom = htonl(rdomain); 152 153 SipHash24_Init(&ctx, &table->inpt_key); 154 SipHash24_Update(&ctx, &nrdom, sizeof(nrdom)); 155 SipHash24_Update(&ctx, faddr, sizeof(*faddr)); 156 SipHash24_Update(&ctx, &fport, sizeof(fport)); 157 SipHash24_Update(&ctx, laddr, sizeof(*laddr)); 158 SipHash24_Update(&ctx, &lport, sizeof(lport)); 159 160 return (&table->inpt_hashtbl[SipHash24_End(&ctx) & table->inpt_mask]); 161 } 162 163 struct inpcbhead * 164 in_pcblhash(struct inpcbtable *table, u_int rdomain, u_short lport) 165 { 166 SIPHASH_CTX ctx; 167 u_int32_t nrdom = htonl(rdomain); 168 169 SipHash24_Init(&ctx, &table->inpt_lkey); 170 SipHash24_Update(&ctx, &nrdom, sizeof(nrdom)); 171 SipHash24_Update(&ctx, &lport, sizeof(lport)); 172 173 return (&table->inpt_lhashtbl[SipHash24_End(&ctx) & table->inpt_lmask]); 174 } 175 176 void 177 in_pcbinit(struct inpcbtable *table, int hashsize) 178 { 179 mtx_init(&table->inpt_mtx, IPL_SOFTNET); 180 rw_init(&table->inpt_notify, "inpnotify"); 181 TAILQ_INIT(&table->inpt_queue); 182 table->inpt_hashtbl = hashinit(hashsize, M_PCB, M_WAITOK, 183 &table->inpt_mask); 184 table->inpt_lhashtbl = hashinit(hashsize, M_PCB, M_WAITOK, 185 &table->inpt_lmask); 186 table->inpt_count = 0; 187 table->inpt_size = hashsize; 188 arc4random_buf(&table->inpt_key, sizeof(table->inpt_key)); 189 arc4random_buf(&table->inpt_lkey, sizeof(table->inpt_lkey)); 190 } 191 192 /* 193 * Check if the specified port is invalid for dynamic allocation. 194 */ 195 int 196 in_baddynamic(u_int16_t port, u_int16_t proto) 197 { 198 switch (proto) { 199 case IPPROTO_TCP: 200 return (DP_ISSET(baddynamicports.tcp, port)); 201 case IPPROTO_UDP: 202 #ifdef IPSEC 203 /* Cannot preset this as it is a sysctl */ 204 if (port == udpencap_port) 205 return (1); 206 #endif 207 return (DP_ISSET(baddynamicports.udp, port)); 208 default: 209 return (0); 210 } 211 } 212 213 int 214 in_rootonly(u_int16_t port, u_int16_t proto) 215 { 216 switch (proto) { 217 case IPPROTO_TCP: 218 return (port < IPPORT_RESERVED || 219 DP_ISSET(rootonlyports.tcp, port)); 220 case IPPROTO_UDP: 221 return (port < IPPORT_RESERVED || 222 DP_ISSET(rootonlyports.udp, port)); 223 default: 224 return (0); 225 } 226 } 227 228 int 229 in_pcballoc(struct socket *so, struct inpcbtable *table, int wait) 230 { 231 struct inpcb *inp; 232 233 inp = pool_get(&inpcb_pool, (wait == M_WAIT ? PR_WAITOK : PR_NOWAIT) | 234 PR_ZERO); 235 if (inp == NULL) 236 return (ENOBUFS); 237 inp->inp_table = table; 238 inp->inp_socket = so; 239 refcnt_init_trace(&inp->inp_refcnt, DT_REFCNT_IDX_INPCB); 240 mtx_init(&inp->inp_mtx, IPL_SOFTNET); 241 inp->inp_seclevel[SL_AUTH] = IPSEC_AUTH_LEVEL_DEFAULT; 242 inp->inp_seclevel[SL_ESP_TRANS] = IPSEC_ESP_TRANS_LEVEL_DEFAULT; 243 inp->inp_seclevel[SL_ESP_NETWORK] = IPSEC_ESP_NETWORK_LEVEL_DEFAULT; 244 inp->inp_seclevel[SL_IPCOMP] = IPSEC_IPCOMP_LEVEL_DEFAULT; 245 inp->inp_rtableid = curproc->p_p->ps_rtableid; 246 inp->inp_hops = -1; 247 #ifdef INET6 248 /* 249 * Small change in this function to set the INP_IPV6 flag so routines 250 * outside pcb-specific routines don't need to use sotopf(), and all 251 * of its pointer chasing, later. 252 */ 253 if (sotopf(so) == PF_INET6) 254 inp->inp_flags = INP_IPV6; 255 inp->inp_cksum6 = -1; 256 #endif /* INET6 */ 257 258 mtx_enter(&table->inpt_mtx); 259 if (table->inpt_count++ > INPCBHASH_LOADFACTOR(table->inpt_size)) 260 (void)in_pcbresize(table, table->inpt_size * 2); 261 TAILQ_INSERT_HEAD(&table->inpt_queue, inp, inp_queue); 262 in_pcbhash_insert(inp); 263 mtx_leave(&table->inpt_mtx); 264 265 so->so_pcb = inp; 266 267 return (0); 268 } 269 270 int 271 in_pcbbind(struct inpcb *inp, struct mbuf *nam, struct proc *p) 272 { 273 struct socket *so = inp->inp_socket; 274 u_int16_t lport = 0; 275 int wild = 0; 276 void *laddr = &zeroin46_addr; 277 int error; 278 279 if (inp->inp_lport) 280 return (EINVAL); 281 282 if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0 && 283 ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0 || 284 (so->so_options & SO_ACCEPTCONN) == 0)) 285 wild = INPLOOKUP_WILDCARD; 286 287 switch (sotopf(so)) { 288 #ifdef INET6 289 case PF_INET6: 290 if (!IN6_IS_ADDR_UNSPECIFIED(&inp->inp_laddr6)) 291 return (EINVAL); 292 wild |= INPLOOKUP_IPV6; 293 294 if (nam) { 295 struct sockaddr_in6 *sin6; 296 297 if ((error = in6_nam2sin6(nam, &sin6))) 298 return (error); 299 if ((error = in6_pcbaddrisavail(inp, sin6, wild, p))) 300 return (error); 301 laddr = &sin6->sin6_addr; 302 lport = sin6->sin6_port; 303 } 304 break; 305 #endif 306 case PF_INET: 307 if (inp->inp_laddr.s_addr != INADDR_ANY) 308 return (EINVAL); 309 310 if (nam) { 311 struct sockaddr_in *sin; 312 313 if ((error = in_nam2sin(nam, &sin))) 314 return (error); 315 if ((error = in_pcbaddrisavail(inp, sin, wild, p))) 316 return (error); 317 laddr = &sin->sin_addr; 318 lport = sin->sin_port; 319 } 320 break; 321 default: 322 return (EINVAL); 323 } 324 325 if (lport == 0) { 326 if ((error = in_pcbpickport(&lport, laddr, wild, inp, p))) 327 return (error); 328 } else { 329 if (in_rootonly(ntohs(lport), so->so_proto->pr_protocol) && 330 suser(p) != 0) 331 return (EACCES); 332 } 333 if (nam) { 334 switch (sotopf(so)) { 335 #ifdef INET6 336 case PF_INET6: 337 inp->inp_laddr6 = *(struct in6_addr *)laddr; 338 break; 339 #endif 340 case PF_INET: 341 inp->inp_laddr = *(struct in_addr *)laddr; 342 break; 343 } 344 } 345 inp->inp_lport = lport; 346 in_pcbrehash(inp); 347 return (0); 348 } 349 350 int 351 in_pcbaddrisavail(struct inpcb *inp, struct sockaddr_in *sin, int wild, 352 struct proc *p) 353 { 354 struct socket *so = inp->inp_socket; 355 struct inpcbtable *table = inp->inp_table; 356 u_int16_t lport = sin->sin_port; 357 int reuseport = (so->so_options & SO_REUSEPORT); 358 359 if (IN_MULTICAST(sin->sin_addr.s_addr)) { 360 /* 361 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; 362 * allow complete duplication of binding if 363 * SO_REUSEPORT is set, or if SO_REUSEADDR is set 364 * and a multicast address is bound on both 365 * new and duplicated sockets. 366 */ 367 if (so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) 368 reuseport = SO_REUSEADDR|SO_REUSEPORT; 369 } else if (sin->sin_addr.s_addr != INADDR_ANY) { 370 /* 371 * we must check that we are binding to an address we 372 * own except when: 373 * - SO_BINDANY is set or 374 * - we are binding a UDP socket to 255.255.255.255 or 375 * - we are binding a UDP socket to one of our broadcast 376 * addresses 377 */ 378 if (!ISSET(so->so_options, SO_BINDANY) && 379 !(so->so_type == SOCK_DGRAM && 380 sin->sin_addr.s_addr == INADDR_BROADCAST) && 381 !(so->so_type == SOCK_DGRAM && 382 in_broadcast(sin->sin_addr, inp->inp_rtableid))) { 383 struct ifaddr *ia; 384 385 sin->sin_port = 0; 386 memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); 387 ia = ifa_ifwithaddr(sintosa(sin), inp->inp_rtableid); 388 sin->sin_port = lport; 389 390 if (ia == NULL) 391 return (EADDRNOTAVAIL); 392 } 393 } 394 if (lport) { 395 struct inpcb *t; 396 int error = 0; 397 398 if (so->so_euid && !IN_MULTICAST(sin->sin_addr.s_addr)) { 399 t = in_pcblookup_local(table, &sin->sin_addr, lport, 400 INPLOOKUP_WILDCARD, inp->inp_rtableid); 401 if (t && (so->so_euid != t->inp_socket->so_euid)) 402 error = EADDRINUSE; 403 in_pcbunref(t); 404 if (error) 405 return (error); 406 } 407 t = in_pcblookup_local(table, &sin->sin_addr, lport, 408 wild, inp->inp_rtableid); 409 if (t && (reuseport & t->inp_socket->so_options) == 0) 410 error = EADDRINUSE; 411 in_pcbunref(t); 412 if (error) 413 return (error); 414 } 415 416 return (0); 417 } 418 419 int 420 in_pcbpickport(u_int16_t *lport, void *laddr, int wild, struct inpcb *inp, 421 struct proc *p) 422 { 423 struct socket *so = inp->inp_socket; 424 struct inpcbtable *table = inp->inp_table; 425 struct inpcb *t; 426 u_int16_t first, last, lower, higher, candidate, localport; 427 int count; 428 429 if (inp->inp_flags & INP_HIGHPORT) { 430 first = ipport_hifirstauto; /* sysctl */ 431 last = ipport_hilastauto; 432 } else if (inp->inp_flags & INP_LOWPORT) { 433 if (suser(p)) 434 return (EACCES); 435 first = IPPORT_RESERVED-1; /* 1023 */ 436 last = 600; /* not IPPORT_RESERVED/2 */ 437 } else { 438 first = ipport_firstauto; /* sysctl */ 439 last = ipport_lastauto; 440 } 441 if (first < last) { 442 lower = first; 443 higher = last; 444 } else { 445 lower = last; 446 higher = first; 447 } 448 449 /* 450 * Simple check to ensure all ports are not used up causing 451 * a deadlock here. 452 */ 453 454 count = higher - lower; 455 candidate = lower + arc4random_uniform(count); 456 457 t = NULL; 458 do { 459 in_pcbunref(t); 460 do { 461 if (count-- < 0) /* completely used? */ 462 return (EADDRNOTAVAIL); 463 ++candidate; 464 if (candidate < lower || candidate > higher) 465 candidate = lower; 466 localport = htons(candidate); 467 } while (in_baddynamic(candidate, so->so_proto->pr_protocol)); 468 t = in_pcblookup_local(table, laddr, localport, wild, 469 inp->inp_rtableid); 470 } while (t != NULL); 471 *lport = localport; 472 473 return (0); 474 } 475 476 /* 477 * Connect from a socket to a specified address. 478 * Both address and port must be specified in argument sin. 479 * If don't have a local address for this socket yet, 480 * then pick one. 481 */ 482 int 483 in_pcbconnect(struct inpcb *inp, struct mbuf *nam) 484 { 485 struct in_addr ina; 486 struct sockaddr_in *sin; 487 struct inpcb *t; 488 int error; 489 490 #ifdef INET6 491 if (sotopf(inp->inp_socket) == PF_INET6) 492 return (in6_pcbconnect(inp, nam)); 493 KASSERT((inp->inp_flags & INP_IPV6) == 0); 494 #endif /* INET6 */ 495 496 if ((error = in_nam2sin(nam, &sin))) 497 return (error); 498 if (sin->sin_port == 0) 499 return (EADDRNOTAVAIL); 500 error = in_pcbselsrc(&ina, sin, inp); 501 if (error) 502 return (error); 503 504 t = in_pcblookup(inp->inp_table, sin->sin_addr, sin->sin_port, 505 ina, inp->inp_lport, inp->inp_rtableid); 506 if (t != NULL) { 507 in_pcbunref(t); 508 return (EADDRINUSE); 509 } 510 511 KASSERT(inp->inp_laddr.s_addr == INADDR_ANY || inp->inp_lport); 512 513 if (inp->inp_laddr.s_addr == INADDR_ANY) { 514 if (inp->inp_lport == 0) { 515 error = in_pcbbind(inp, NULL, curproc); 516 if (error) 517 return (error); 518 t = in_pcblookup(inp->inp_table, sin->sin_addr, 519 sin->sin_port, ina, inp->inp_lport, 520 inp->inp_rtableid); 521 if (t != NULL) { 522 inp->inp_lport = 0; 523 in_pcbunref(t); 524 return (EADDRINUSE); 525 } 526 } 527 inp->inp_laddr = ina; 528 } 529 inp->inp_faddr = sin->sin_addr; 530 inp->inp_fport = sin->sin_port; 531 in_pcbrehash(inp); 532 #if NSTOEPLITZ > 0 533 inp->inp_flowid = stoeplitz_ip4port(inp->inp_faddr.s_addr, 534 inp->inp_laddr.s_addr, inp->inp_fport, inp->inp_lport); 535 #endif 536 return (0); 537 } 538 539 void 540 in_pcbdisconnect(struct inpcb *inp) 541 { 542 #if NPF > 0 543 if (inp->inp_pf_sk) { 544 pf_remove_divert_state(inp->inp_pf_sk); 545 /* pf_remove_divert_state() may have detached the state */ 546 pf_inp_unlink(inp); 547 } 548 #endif 549 switch (sotopf(inp->inp_socket)) { 550 #ifdef INET6 551 case PF_INET6: 552 inp->inp_faddr6 = in6addr_any; 553 break; 554 #endif 555 case PF_INET: 556 inp->inp_faddr.s_addr = INADDR_ANY; 557 break; 558 } 559 560 inp->inp_fport = 0; 561 inp->inp_flowid = 0; 562 in_pcbrehash(inp); 563 if (inp->inp_socket->so_state & SS_NOFDREF) 564 in_pcbdetach(inp); 565 } 566 567 void 568 in_pcbdetach(struct inpcb *inp) 569 { 570 struct socket *so = inp->inp_socket; 571 struct inpcbtable *table = inp->inp_table; 572 573 so->so_pcb = NULL; 574 /* 575 * As long as the NET_LOCK() is the default lock for Internet 576 * sockets, do not release it to not introduce new sleeping 577 * points. 578 */ 579 sofree(so, 1); 580 m_freem(inp->inp_options); 581 if (inp->inp_route.ro_rt) { 582 rtfree(inp->inp_route.ro_rt); 583 inp->inp_route.ro_rt = NULL; 584 } 585 #ifdef INET6 586 if (inp->inp_flags & INP_IPV6) { 587 ip6_freepcbopts(inp->inp_outputopts6); 588 ip6_freemoptions(inp->inp_moptions6); 589 } else 590 #endif 591 ip_freemoptions(inp->inp_moptions); 592 #if NPF > 0 593 if (inp->inp_pf_sk) { 594 pf_remove_divert_state(inp->inp_pf_sk); 595 /* pf_remove_divert_state() may have detached the state */ 596 pf_inp_unlink(inp); 597 } 598 #endif 599 mtx_enter(&table->inpt_mtx); 600 LIST_REMOVE(inp, inp_lhash); 601 LIST_REMOVE(inp, inp_hash); 602 TAILQ_REMOVE(&table->inpt_queue, inp, inp_queue); 603 table->inpt_count--; 604 mtx_leave(&table->inpt_mtx); 605 606 in_pcbunref(inp); 607 } 608 609 struct inpcb * 610 in_pcbref(struct inpcb *inp) 611 { 612 if (inp == NULL) 613 return NULL; 614 refcnt_take(&inp->inp_refcnt); 615 return inp; 616 } 617 618 void 619 in_pcbunref(struct inpcb *inp) 620 { 621 if (inp == NULL) 622 return; 623 if (refcnt_rele(&inp->inp_refcnt) == 0) 624 return; 625 KASSERT((LIST_NEXT(inp, inp_hash) == NULL) || 626 (LIST_NEXT(inp, inp_hash) == _Q_INVALID)); 627 KASSERT((LIST_NEXT(inp, inp_lhash) == NULL) || 628 (LIST_NEXT(inp, inp_lhash) == _Q_INVALID)); 629 KASSERT((TAILQ_NEXT(inp, inp_queue) == NULL) || 630 (TAILQ_NEXT(inp, inp_queue) == _Q_INVALID)); 631 pool_put(&inpcb_pool, inp); 632 } 633 634 void 635 in_setsockaddr(struct inpcb *inp, struct mbuf *nam) 636 { 637 struct sockaddr_in *sin; 638 639 nam->m_len = sizeof(*sin); 640 sin = mtod(nam, struct sockaddr_in *); 641 memset(sin, 0, sizeof(*sin)); 642 sin->sin_family = AF_INET; 643 sin->sin_len = sizeof(*sin); 644 sin->sin_port = inp->inp_lport; 645 sin->sin_addr = inp->inp_laddr; 646 } 647 648 void 649 in_setpeeraddr(struct inpcb *inp, struct mbuf *nam) 650 { 651 struct sockaddr_in *sin; 652 653 #ifdef INET6 654 if (sotopf(inp->inp_socket) == PF_INET6) { 655 in6_setpeeraddr(inp, nam); 656 return; 657 } 658 #endif /* INET6 */ 659 660 nam->m_len = sizeof(*sin); 661 sin = mtod(nam, struct sockaddr_in *); 662 memset(sin, 0, sizeof(*sin)); 663 sin->sin_family = AF_INET; 664 sin->sin_len = sizeof(*sin); 665 sin->sin_port = inp->inp_fport; 666 sin->sin_addr = inp->inp_faddr; 667 } 668 669 int 670 in_sockaddr(struct socket *so, struct mbuf *nam) 671 { 672 struct inpcb *inp; 673 674 inp = sotoinpcb(so); 675 in_setsockaddr(inp, nam); 676 677 return (0); 678 } 679 680 int 681 in_peeraddr(struct socket *so, struct mbuf *nam) 682 { 683 struct inpcb *inp; 684 685 inp = sotoinpcb(so); 686 in_setpeeraddr(inp, nam); 687 688 return (0); 689 } 690 691 /* 692 * Pass some notification to all connections of a protocol 693 * associated with address dst. The "usual action" will be 694 * taken, depending on the ctlinput cmd. The caller must filter any 695 * cmds that are uninteresting (e.g., no error in the map). 696 * Call the protocol specific routine (if any) to report 697 * any errors for each matching socket. 698 */ 699 void 700 in_pcbnotifyall(struct inpcbtable *table, struct sockaddr *dst, u_int rtable, 701 int errno, void (*notify)(struct inpcb *, int)) 702 { 703 SIMPLEQ_HEAD(, inpcb) inpcblist; 704 struct inpcb *inp; 705 struct in_addr faddr; 706 u_int rdomain; 707 708 if (dst->sa_family != AF_INET) 709 return; 710 faddr = satosin(dst)->sin_addr; 711 if (faddr.s_addr == INADDR_ANY) 712 return; 713 if (notify == NULL) 714 return; 715 716 /* 717 * Use a temporary notify list protected by rwlock to run over 718 * selected PCB. This is necessary as the list of all PCB is 719 * protected by a mutex. Notify may call ip_output() eventually 720 * which may sleep as pf lock is a rwlock. Also the SRP 721 * implementation of the routing table might sleep. 722 * The same inp_notify list entry and inpt_notify rwlock are 723 * used for UDP multicast and raw IP delivery. 724 */ 725 SIMPLEQ_INIT(&inpcblist); 726 rdomain = rtable_l2(rtable); 727 rw_enter_write(&table->inpt_notify); 728 mtx_enter(&table->inpt_mtx); 729 TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) { 730 #ifdef INET6 731 if (inp->inp_flags & INP_IPV6) 732 continue; 733 #endif 734 if (inp->inp_faddr.s_addr != faddr.s_addr || 735 rtable_l2(inp->inp_rtableid) != rdomain || 736 inp->inp_socket == NULL) { 737 continue; 738 } 739 in_pcbref(inp); 740 SIMPLEQ_INSERT_TAIL(&inpcblist, inp, inp_notify); 741 } 742 mtx_leave(&table->inpt_mtx); 743 744 while ((inp = SIMPLEQ_FIRST(&inpcblist)) != NULL) { 745 SIMPLEQ_REMOVE_HEAD(&inpcblist, inp_notify); 746 (*notify)(inp, errno); 747 in_pcbunref(inp); 748 } 749 rw_exit_write(&table->inpt_notify); 750 } 751 752 /* 753 * Check for alternatives when higher level complains 754 * about service problems. For now, invalidate cached 755 * routing information. If the route was created dynamically 756 * (by a redirect), time to try a default gateway again. 757 */ 758 void 759 in_losing(struct inpcb *inp) 760 { 761 struct rtentry *rt = inp->inp_route.ro_rt; 762 763 if (rt) { 764 inp->inp_route.ro_rt = NULL; 765 766 if (rt->rt_flags & RTF_DYNAMIC) { 767 struct ifnet *ifp; 768 769 ifp = if_get(rt->rt_ifidx); 770 /* 771 * If the interface is gone, all its attached 772 * route entries have been removed from the table, 773 * so we're dealing with a stale cache and have 774 * nothing to do. 775 */ 776 if (ifp != NULL) 777 rtdeletemsg(rt, ifp, inp->inp_rtableid); 778 if_put(ifp); 779 } 780 /* 781 * A new route can be allocated 782 * the next time output is attempted. 783 * rtfree() needs to be called in anycase because the inp 784 * is still holding a reference to rt. 785 */ 786 rtfree(rt); 787 } 788 } 789 790 /* 791 * After a routing change, flush old routing 792 * and allocate a (hopefully) better one. 793 */ 794 void 795 in_rtchange(struct inpcb *inp, int errno) 796 { 797 if (inp->inp_route.ro_rt) { 798 rtfree(inp->inp_route.ro_rt); 799 inp->inp_route.ro_rt = NULL; 800 /* 801 * A new route can be allocated the next time 802 * output is attempted. 803 */ 804 } 805 } 806 807 struct inpcb * 808 in_pcblookup_local(struct inpcbtable *table, void *laddrp, u_int lport_arg, 809 int flags, u_int rtable) 810 { 811 struct inpcb *inp, *match = NULL; 812 int matchwild = 3, wildcard; 813 u_int16_t lport = lport_arg; 814 struct in_addr laddr = *(struct in_addr *)laddrp; 815 #ifdef INET6 816 struct in6_addr *laddr6 = (struct in6_addr *)laddrp; 817 #endif 818 struct inpcbhead *head; 819 u_int rdomain; 820 821 rdomain = rtable_l2(rtable); 822 mtx_enter(&table->inpt_mtx); 823 head = in_pcblhash(table, rdomain, lport); 824 LIST_FOREACH(inp, head, inp_lhash) { 825 if (rtable_l2(inp->inp_rtableid) != rdomain) 826 continue; 827 if (inp->inp_lport != lport) 828 continue; 829 wildcard = 0; 830 #ifdef INET6 831 if (ISSET(flags, INPLOOKUP_IPV6)) { 832 if (!ISSET(inp->inp_flags, INP_IPV6)) 833 continue; 834 835 if (!IN6_IS_ADDR_UNSPECIFIED(&inp->inp_faddr6)) 836 wildcard++; 837 838 if (!IN6_ARE_ADDR_EQUAL(&inp->inp_laddr6, laddr6)) { 839 if (IN6_IS_ADDR_UNSPECIFIED(&inp->inp_laddr6) || 840 IN6_IS_ADDR_UNSPECIFIED(laddr6)) 841 wildcard++; 842 else 843 continue; 844 } 845 846 } else 847 #endif /* INET6 */ 848 { 849 #ifdef INET6 850 if (ISSET(inp->inp_flags, INP_IPV6)) 851 continue; 852 #endif /* INET6 */ 853 854 if (inp->inp_faddr.s_addr != INADDR_ANY) 855 wildcard++; 856 857 if (inp->inp_laddr.s_addr != laddr.s_addr) { 858 if (inp->inp_laddr.s_addr == INADDR_ANY || 859 laddr.s_addr == INADDR_ANY) 860 wildcard++; 861 else 862 continue; 863 } 864 865 } 866 if ((!wildcard || (flags & INPLOOKUP_WILDCARD)) && 867 wildcard < matchwild) { 868 match = inp; 869 if ((matchwild = wildcard) == 0) 870 break; 871 } 872 } 873 in_pcbref(match); 874 mtx_leave(&table->inpt_mtx); 875 876 return (match); 877 } 878 879 struct rtentry * 880 in_pcbrtentry(struct inpcb *inp) 881 { 882 struct route *ro; 883 884 ro = &inp->inp_route; 885 886 /* check if route is still valid */ 887 if (!rtisvalid(ro->ro_rt)) { 888 rtfree(ro->ro_rt); 889 ro->ro_rt = NULL; 890 } 891 892 /* 893 * No route yet, so try to acquire one. 894 */ 895 if (ro->ro_rt == NULL) { 896 #ifdef INET6 897 memset(ro, 0, sizeof(struct route_in6)); 898 #else 899 memset(ro, 0, sizeof(struct route)); 900 #endif 901 902 switch(sotopf(inp->inp_socket)) { 903 #ifdef INET6 904 case PF_INET6: 905 if (IN6_IS_ADDR_UNSPECIFIED(&inp->inp_faddr6)) 906 break; 907 ro->ro_dst.sa_family = AF_INET6; 908 ro->ro_dst.sa_len = sizeof(struct sockaddr_in6); 909 satosin6(&ro->ro_dst)->sin6_addr = inp->inp_faddr6; 910 ro->ro_tableid = inp->inp_rtableid; 911 ro->ro_rt = rtalloc_mpath(&ro->ro_dst, 912 &inp->inp_laddr6.s6_addr32[0], ro->ro_tableid); 913 break; 914 #endif /* INET6 */ 915 case PF_INET: 916 if (inp->inp_faddr.s_addr == INADDR_ANY) 917 break; 918 ro->ro_dst.sa_family = AF_INET; 919 ro->ro_dst.sa_len = sizeof(struct sockaddr_in); 920 satosin(&ro->ro_dst)->sin_addr = inp->inp_faddr; 921 ro->ro_tableid = inp->inp_rtableid; 922 ro->ro_rt = rtalloc_mpath(&ro->ro_dst, 923 &inp->inp_laddr.s_addr, ro->ro_tableid); 924 break; 925 } 926 } 927 return (ro->ro_rt); 928 } 929 930 /* 931 * Return an IPv4 address, which is the most appropriate for a given 932 * destination. 933 * If necessary, this function lookups the routing table and returns 934 * an entry to the caller for later use. 935 */ 936 int 937 in_pcbselsrc(struct in_addr *insrc, struct sockaddr_in *sin, 938 struct inpcb *inp) 939 { 940 struct ip_moptions *mopts = inp->inp_moptions; 941 struct route *ro = &inp->inp_route; 942 struct in_addr *laddr = &inp->inp_laddr; 943 u_int rtableid = inp->inp_rtableid; 944 struct sockaddr *ip4_source = NULL; 945 946 struct sockaddr_in *sin2; 947 struct in_ifaddr *ia = NULL; 948 949 /* 950 * If the socket(if any) is already bound, use that bound address 951 * unless it is INADDR_ANY or INADDR_BROADCAST. 952 */ 953 if (laddr->s_addr != INADDR_ANY && 954 laddr->s_addr != INADDR_BROADCAST) { 955 *insrc = *laddr; 956 return (0); 957 } 958 959 /* 960 * If the destination address is multicast or limited 961 * broadcast (255.255.255.255) and an outgoing interface has 962 * been set as a multicast option, use the address of that 963 * interface as our source address. 964 */ 965 if ((IN_MULTICAST(sin->sin_addr.s_addr) || 966 sin->sin_addr.s_addr == INADDR_BROADCAST) && mopts != NULL) { 967 struct ifnet *ifp; 968 969 ifp = if_get(mopts->imo_ifidx); 970 if (ifp != NULL) { 971 if (ifp->if_rdomain == rtable_l2(rtableid)) 972 IFP_TO_IA(ifp, ia); 973 if (ia == NULL) { 974 if_put(ifp); 975 return (EADDRNOTAVAIL); 976 } 977 978 *insrc = ia->ia_addr.sin_addr; 979 if_put(ifp); 980 return (0); 981 } 982 } 983 984 /* 985 * If route is known or can be allocated now, 986 * our src addr is taken from the i/f, else punt. 987 */ 988 if (!rtisvalid(ro->ro_rt) || (ro->ro_tableid != rtableid) || 989 (satosin(&ro->ro_dst)->sin_addr.s_addr != sin->sin_addr.s_addr)) { 990 rtfree(ro->ro_rt); 991 ro->ro_rt = NULL; 992 } 993 if (ro->ro_rt == NULL) { 994 /* No route yet, so try to acquire one */ 995 ro->ro_dst.sa_family = AF_INET; 996 ro->ro_dst.sa_len = sizeof(struct sockaddr_in); 997 satosin(&ro->ro_dst)->sin_addr = sin->sin_addr; 998 ro->ro_tableid = rtableid; 999 ro->ro_rt = rtalloc_mpath(&ro->ro_dst, NULL, ro->ro_tableid); 1000 1001 /* 1002 * It is important to zero out the rest of the 1003 * struct sockaddr_in when mixing v6 & v4! 1004 */ 1005 sin2 = satosin(&ro->ro_dst); 1006 memset(sin2->sin_zero, 0, sizeof(sin2->sin_zero)); 1007 } 1008 1009 /* 1010 * If we found a route, use the address 1011 * corresponding to the outgoing interface. 1012 */ 1013 if (ro->ro_rt != NULL) 1014 ia = ifatoia(ro->ro_rt->rt_ifa); 1015 1016 /* 1017 * Use preferred source address if : 1018 * - destination is not onlink 1019 * - preferred source address is set 1020 * - output interface is UP 1021 */ 1022 if (ro->ro_rt && !(ro->ro_rt->rt_flags & RTF_LLINFO) && 1023 !(ro->ro_rt->rt_flags & RTF_HOST)) { 1024 ip4_source = rtable_getsource(rtableid, AF_INET); 1025 if (ip4_source != NULL) { 1026 struct ifaddr *ifa; 1027 if ((ifa = ifa_ifwithaddr(ip4_source, rtableid)) != 1028 NULL && ISSET(ifa->ifa_ifp->if_flags, IFF_UP)) { 1029 *insrc = satosin(ip4_source)->sin_addr; 1030 return (0); 1031 } 1032 } 1033 } 1034 1035 if (ia == NULL) 1036 return (EADDRNOTAVAIL); 1037 1038 *insrc = ia->ia_addr.sin_addr; 1039 return (0); 1040 } 1041 1042 void 1043 in_pcbrehash(struct inpcb *inp) 1044 { 1045 struct inpcbtable *table = inp->inp_table; 1046 1047 mtx_enter(&table->inpt_mtx); 1048 LIST_REMOVE(inp, inp_lhash); 1049 LIST_REMOVE(inp, inp_hash); 1050 in_pcbhash_insert(inp); 1051 mtx_leave(&table->inpt_mtx); 1052 } 1053 1054 void 1055 in_pcbhash_insert(struct inpcb *inp) 1056 { 1057 struct inpcbtable *table = inp->inp_table; 1058 struct inpcbhead *head; 1059 1060 NET_ASSERT_LOCKED(); 1061 MUTEX_ASSERT_LOCKED(&table->inpt_mtx); 1062 1063 head = in_pcblhash(table, inp->inp_rtableid, inp->inp_lport); 1064 LIST_INSERT_HEAD(head, inp, inp_lhash); 1065 #ifdef INET6 1066 if (inp->inp_flags & INP_IPV6) 1067 head = in6_pcbhash(table, rtable_l2(inp->inp_rtableid), 1068 &inp->inp_faddr6, inp->inp_fport, 1069 &inp->inp_laddr6, inp->inp_lport); 1070 else 1071 #endif /* INET6 */ 1072 head = in_pcbhash(table, rtable_l2(inp->inp_rtableid), 1073 &inp->inp_faddr, inp->inp_fport, 1074 &inp->inp_laddr, inp->inp_lport); 1075 LIST_INSERT_HEAD(head, inp, inp_hash); 1076 } 1077 1078 struct inpcb * 1079 in_pcbhash_lookup(struct inpcbtable *table, u_int rdomain, 1080 const struct in_addr *faddr, u_short fport, 1081 const struct in_addr *laddr, u_short lport) 1082 { 1083 struct inpcbhead *head; 1084 struct inpcb *inp; 1085 1086 NET_ASSERT_LOCKED(); 1087 MUTEX_ASSERT_LOCKED(&table->inpt_mtx); 1088 1089 head = in_pcbhash(table, rdomain, faddr, fport, laddr, lport); 1090 LIST_FOREACH(inp, head, inp_hash) { 1091 #ifdef INET6 1092 if (ISSET(inp->inp_flags, INP_IPV6)) 1093 continue; 1094 #endif 1095 if (inp->inp_fport == fport && inp->inp_lport == lport && 1096 inp->inp_faddr.s_addr == faddr->s_addr && 1097 inp->inp_laddr.s_addr == laddr->s_addr && 1098 rtable_l2(inp->inp_rtableid) == rdomain) { 1099 break; 1100 } 1101 } 1102 if (inp != NULL) { 1103 /* 1104 * Move this PCB to the head of hash chain so that 1105 * repeated accesses are quicker. This is analogous to 1106 * the historic single-entry PCB cache. 1107 */ 1108 if (inp != LIST_FIRST(head)) { 1109 LIST_REMOVE(inp, inp_hash); 1110 LIST_INSERT_HEAD(head, inp, inp_hash); 1111 } 1112 } 1113 return (inp); 1114 } 1115 1116 int 1117 in_pcbresize(struct inpcbtable *table, int hashsize) 1118 { 1119 u_long nmask, nlmask; 1120 int osize; 1121 void *nhashtbl, *nlhashtbl, *ohashtbl, *olhashtbl; 1122 struct inpcb *inp; 1123 1124 MUTEX_ASSERT_LOCKED(&table->inpt_mtx); 1125 1126 ohashtbl = table->inpt_hashtbl; 1127 olhashtbl = table->inpt_lhashtbl; 1128 osize = table->inpt_size; 1129 1130 nhashtbl = hashinit(hashsize, M_PCB, M_NOWAIT, &nmask); 1131 if (nhashtbl == NULL) 1132 return ENOBUFS; 1133 nlhashtbl = hashinit(hashsize, M_PCB, M_NOWAIT, &nlmask); 1134 if (nlhashtbl == NULL) { 1135 hashfree(nhashtbl, hashsize, M_PCB); 1136 return ENOBUFS; 1137 } 1138 table->inpt_hashtbl = nhashtbl; 1139 table->inpt_lhashtbl = nlhashtbl; 1140 table->inpt_mask = nmask; 1141 table->inpt_lmask = nlmask; 1142 table->inpt_size = hashsize; 1143 arc4random_buf(&table->inpt_key, sizeof(table->inpt_key)); 1144 arc4random_buf(&table->inpt_lkey, sizeof(table->inpt_lkey)); 1145 1146 TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) { 1147 LIST_REMOVE(inp, inp_lhash); 1148 LIST_REMOVE(inp, inp_hash); 1149 in_pcbhash_insert(inp); 1150 } 1151 hashfree(ohashtbl, osize, M_PCB); 1152 hashfree(olhashtbl, osize, M_PCB); 1153 1154 return (0); 1155 } 1156 1157 #ifdef DIAGNOSTIC 1158 int in_pcbnotifymiss = 0; 1159 #endif 1160 1161 /* 1162 * The in(6)_pcblookup functions are used to locate connected sockets 1163 * quickly: 1164 * faddr.fport <-> laddr.lport 1165 * No wildcard matching is done so that listening sockets are not found. 1166 * If the functions return NULL in(6)_pcblookup_listen can be used to 1167 * find a listening/bound socket that may accept the connection. 1168 * After those two lookups no other are necessary. 1169 */ 1170 struct inpcb * 1171 in_pcblookup(struct inpcbtable *table, struct in_addr faddr, 1172 u_int fport, struct in_addr laddr, u_int lport, u_int rtable) 1173 { 1174 struct inpcb *inp; 1175 u_int rdomain; 1176 1177 rdomain = rtable_l2(rtable); 1178 mtx_enter(&table->inpt_mtx); 1179 inp = in_pcbhash_lookup(table, rdomain, &faddr, fport, &laddr, lport); 1180 in_pcbref(inp); 1181 mtx_leave(&table->inpt_mtx); 1182 #ifdef DIAGNOSTIC 1183 if (inp == NULL && in_pcbnotifymiss) { 1184 printf("%s: faddr=%08x fport=%d laddr=%08x lport=%d rdom=%u\n", 1185 __func__, ntohl(faddr.s_addr), ntohs(fport), 1186 ntohl(laddr.s_addr), ntohs(lport), rdomain); 1187 } 1188 #endif 1189 return (inp); 1190 } 1191 1192 /* 1193 * The in(6)_pcblookup_listen functions are used to locate listening 1194 * sockets quickly. This are sockets with unspecified foreign address 1195 * and port: 1196 * *.* <-> laddr.lport 1197 * *.* <-> *.lport 1198 */ 1199 struct inpcb * 1200 in_pcblookup_listen(struct inpcbtable *table, struct in_addr laddr, 1201 u_int lport_arg, struct mbuf *m, u_int rtable) 1202 { 1203 const struct in_addr *key1, *key2; 1204 struct inpcb *inp; 1205 u_int16_t lport = lport_arg; 1206 u_int rdomain; 1207 1208 key1 = &laddr; 1209 key2 = &zeroin_addr; 1210 #if NPF > 0 1211 if (m && m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) { 1212 struct pf_divert *divert; 1213 1214 divert = pf_find_divert(m); 1215 KASSERT(divert != NULL); 1216 switch (divert->type) { 1217 case PF_DIVERT_TO: 1218 key1 = key2 = &divert->addr.v4; 1219 lport = divert->port; 1220 break; 1221 case PF_DIVERT_REPLY: 1222 return (NULL); 1223 default: 1224 panic("%s: unknown divert type %d, mbuf %p, divert %p", 1225 __func__, divert->type, m, divert); 1226 } 1227 } else if (m && m->m_pkthdr.pf.flags & PF_TAG_TRANSLATE_LOCALHOST) { 1228 /* 1229 * Redirected connections should not be treated the same 1230 * as connections directed to 127.0.0.0/8 since localhost 1231 * can only be accessed from the host itself. 1232 * For example portmap(8) grants more permissions for 1233 * connections to the socket bound to 127.0.0.1 than 1234 * to the * socket. 1235 */ 1236 key1 = &zeroin_addr; 1237 key2 = &laddr; 1238 } 1239 #endif 1240 1241 rdomain = rtable_l2(rtable); 1242 mtx_enter(&table->inpt_mtx); 1243 inp = in_pcbhash_lookup(table, rdomain, &zeroin_addr, 0, key1, lport); 1244 if (inp == NULL && key1->s_addr != key2->s_addr) { 1245 inp = in_pcbhash_lookup(table, rdomain, 1246 &zeroin_addr, 0, key2, lport); 1247 } 1248 in_pcbref(inp); 1249 mtx_leave(&table->inpt_mtx); 1250 #ifdef DIAGNOSTIC 1251 if (inp == NULL && in_pcbnotifymiss) { 1252 printf("%s: laddr=%08x lport=%d rdom=%u\n", 1253 __func__, ntohl(laddr.s_addr), ntohs(lport), rdomain); 1254 } 1255 #endif 1256 return (inp); 1257 } 1258