1 /* 2 * Copyright (c) 2004 Jeffrey M. Hsu. All rights reserved. 3 * Copyright (c) 2004 The DragonFly Project. All rights reserved. 4 * 5 * This code is derived from software contributed to The DragonFly Project 6 * by Jeffrey M. Hsu. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of The DragonFly Project nor the names of its 17 * contributors may be used to endorse or promote products derived 18 * from this software without specific, prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 23 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 24 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 25 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 26 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 28 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 29 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 30 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 */ 33 34 /* 35 * Copyright (c) 1982, 1986, 1991, 1993, 1995 36 * The Regents of the University of California. All rights reserved. 
37 * 38 * Redistribution and use in source and binary forms, with or without 39 * modification, are permitted provided that the following conditions 40 * are met: 41 * 1. Redistributions of source code must retain the above copyright 42 * notice, this list of conditions and the following disclaimer. 43 * 2. Redistributions in binary form must reproduce the above copyright 44 * notice, this list of conditions and the following disclaimer in the 45 * documentation and/or other materials provided with the distribution. 46 * 3. Neither the name of the University nor the names of its contributors 47 * may be used to endorse or promote products derived from this software 48 * without specific prior written permission. 49 * 50 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 51 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 52 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 53 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 54 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 55 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 56 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 60 * SUCH DAMAGE. 
61 * 62 * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95 63 * $FreeBSD: src/sys/netinet/in_pcb.c,v 1.59.2.27 2004/01/02 04:06:42 ambrisko Exp $ 64 */ 65 66 #include "opt_ipsec.h" 67 #include "opt_inet6.h" 68 69 #include <sys/param.h> 70 #include <sys/systm.h> 71 #include <sys/malloc.h> 72 #include <sys/mbuf.h> 73 #include <sys/domain.h> 74 #include <sys/protosw.h> 75 #include <sys/socket.h> 76 #include <sys/socketvar.h> 77 #include <sys/proc.h> 78 #include <sys/priv.h> 79 #include <sys/jail.h> 80 #include <sys/kernel.h> 81 #include <sys/sysctl.h> 82 83 #include <sys/thread2.h> 84 #include <sys/socketvar2.h> 85 #include <sys/msgport2.h> 86 87 #include <machine/limits.h> 88 89 #include <net/if.h> 90 #include <net/if_types.h> 91 #include <net/route.h> 92 #include <net/netisr2.h> 93 94 #include <netinet/in.h> 95 #include <netinet/in_pcb.h> 96 #include <netinet/in_var.h> 97 #include <netinet/ip_var.h> 98 #ifdef INET6 99 #include <netinet/ip6.h> 100 #include <netinet6/ip6_var.h> 101 #endif /* INET6 */ 102 103 #ifdef IPSEC 104 #include <netinet6/ipsec.h> 105 #include <netproto/key/key.h> 106 #include <netproto/ipsec/esp_var.h> 107 #endif 108 109 #ifdef FAST_IPSEC 110 #if defined(IPSEC) || defined(IPSEC_ESP) 111 #error "Bad idea: don't compile with both IPSEC and FAST_IPSEC!" 112 #endif 113 114 #include <netproto/ipsec/ipsec.h> 115 #include <netproto/ipsec/key.h> 116 #define IPSEC 117 #endif /* FAST_IPSEC */ 118 119 #define INP_LOCALGROUP_SIZMIN 8 120 #define INP_LOCALGROUP_SIZMAX 256 121 122 struct in_addr zeroin_addr; 123 124 /* 125 * These configure the range of local port addresses assigned to 126 * "unspecified" outgoing connections/packets/whatever. 
127 */ 128 int ipport_lowfirstauto = IPPORT_RESERVED - 1; /* 1023 */ 129 int ipport_lowlastauto = IPPORT_RESERVEDSTART; /* 600 */ 130 131 int ipport_firstauto = IPPORT_RESERVED; /* 1024 */ 132 int ipport_lastauto = IPPORT_USERRESERVED; /* 5000 */ 133 134 int ipport_hifirstauto = IPPORT_HIFIRSTAUTO; /* 49152 */ 135 int ipport_hilastauto = IPPORT_HILASTAUTO; /* 65535 */ 136 137 #define RANGECHK(var, min, max) \ 138 if ((var) < (min)) { (var) = (min); } \ 139 else if ((var) > (max)) { (var) = (max); } 140 141 int udpencap_enable = 1; /* enabled by default */ 142 int udpencap_port = 4500; /* triggers decapsulation */ 143 144 /* 145 * Per-netisr inpcb markers. 146 * NOTE: they should only be used in netisrs. 147 */ 148 static struct inpcb *in_pcbmarkers; 149 static struct inpcontainer *in_pcbcontainer_markers; 150 151 static int 152 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) 153 { 154 int error; 155 156 error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); 157 if (!error) { 158 RANGECHK(ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); 159 RANGECHK(ipport_lowlastauto, 1, IPPORT_RESERVED - 1); 160 161 RANGECHK(ipport_firstauto, IPPORT_RESERVED, USHRT_MAX); 162 RANGECHK(ipport_lastauto, IPPORT_RESERVED, USHRT_MAX); 163 164 RANGECHK(ipport_hifirstauto, IPPORT_RESERVED, USHRT_MAX); 165 RANGECHK(ipport_hilastauto, IPPORT_RESERVED, USHRT_MAX); 166 } 167 return (error); 168 } 169 170 #undef RANGECHK 171 172 SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports"); 173 174 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLTYPE_INT|CTLFLAG_RW, 175 &ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", ""); 176 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLTYPE_INT|CTLFLAG_RW, 177 &ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", ""); 178 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLTYPE_INT|CTLFLAG_RW, 179 &ipport_firstauto, 0, &sysctl_net_ipport_check, "I", ""); 180 SYSCTL_PROC(_net_inet_ip_portrange, 
OID_AUTO, last, CTLTYPE_INT|CTLFLAG_RW, 181 &ipport_lastauto, 0, &sysctl_net_ipport_check, "I", ""); 182 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLTYPE_INT|CTLFLAG_RW, 183 &ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", ""); 184 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW, 185 &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", ""); 186 187 /* 188 * in_pcb.c: manage the Protocol Control Blocks. 189 * 190 * NOTE: It is assumed that most of these functions will be called from 191 * a critical section. XXX - There are, unfortunately, a few exceptions 192 * to this rule that should be fixed. 193 * 194 * NOTE: The caller should initialize the cpu field to the cpu running the 195 * protocol stack associated with this inpcbinfo. 196 */ 197 198 void 199 in_pcbinfo_init(struct inpcbinfo *pcbinfo, int cpu, boolean_t shared) 200 { 201 KASSERT(cpu >= 0 && cpu < ncpus, ("invalid cpu%d", cpu)); 202 pcbinfo->cpu = cpu; 203 204 LIST_INIT(&pcbinfo->pcblisthead); 205 pcbinfo->portsave = kmalloc(sizeof(*pcbinfo->portsave), M_PCB, 206 M_WAITOK | M_ZERO); 207 208 if (shared) { 209 pcbinfo->infotoken = kmalloc(sizeof(struct lwkt_token), 210 M_PCB, M_WAITOK); 211 lwkt_token_init(pcbinfo->infotoken, "infotoken"); 212 } else { 213 pcbinfo->infotoken = NULL; 214 } 215 } 216 217 struct baddynamicports baddynamicports; 218 219 /* 220 * Check if the specified port is invalid for dynamic allocation. 
221 */ 222 int 223 in_baddynamic(u_int16_t port, u_int16_t proto) 224 { 225 switch (proto) { 226 case IPPROTO_TCP: 227 return (DP_ISSET(baddynamicports.tcp, port)); 228 case IPPROTO_UDP: 229 #ifdef IPSEC 230 /* Cannot preset this as it is a sysctl */ 231 if (port == udpencap_port) 232 return (1); 233 #endif 234 return (DP_ISSET(baddynamicports.udp, port)); 235 default: 236 return (0); 237 } 238 } 239 240 void 241 in_pcbonlist(struct inpcb *inp) 242 { 243 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 244 245 KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu), 246 ("not in the correct netisr")); 247 KASSERT((inp->inp_flags & INP_ONLIST) == 0, ("already on pcblist")); 248 inp->inp_flags |= INP_ONLIST; 249 250 GET_PCBINFO_TOKEN(pcbinfo); 251 LIST_INSERT_HEAD(&pcbinfo->pcblisthead, inp, inp_list); 252 pcbinfo->ipi_count++; 253 REL_PCBINFO_TOKEN(pcbinfo); 254 } 255 256 void 257 in_pcbofflist(struct inpcb *inp) 258 { 259 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 260 261 KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu), 262 ("not in the correct netisr")); 263 KASSERT(inp->inp_flags & INP_ONLIST, ("not on pcblist")); 264 inp->inp_flags &= ~INP_ONLIST; 265 266 GET_PCBINFO_TOKEN(pcbinfo); 267 LIST_REMOVE(inp, inp_list); 268 KASSERT(pcbinfo->ipi_count > 0, 269 ("invalid inpcb count %d", pcbinfo->ipi_count)); 270 pcbinfo->ipi_count--; 271 REL_PCBINFO_TOKEN(pcbinfo); 272 } 273 274 /* 275 * Allocate a PCB and associate it with the socket. 
276 */ 277 int 278 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) 279 { 280 struct inpcb *inp; 281 #ifdef IPSEC 282 int error; 283 #endif 284 285 inp = kmalloc(pcbinfo->ipi_size, M_PCB, M_WAITOK|M_ZERO|M_NULLOK); 286 if (inp == NULL) 287 return (ENOMEM); 288 inp->inp_gencnt = ++pcbinfo->ipi_gencnt; 289 inp->inp_pcbinfo = pcbinfo; 290 inp->inp_socket = so; 291 #ifdef IPSEC 292 error = ipsec_init_policy(so, &inp->inp_sp); 293 if (error != 0) { 294 kfree(inp, M_PCB); 295 return (error); 296 } 297 #endif 298 #ifdef INET6 299 if (INP_SOCKAF(so) == AF_INET6 && ip6_v6only) 300 inp->inp_flags |= IN6P_IPV6_V6ONLY; 301 if (ip6_auto_flowlabel) 302 inp->inp_flags |= IN6P_AUTOFLOWLABEL; 303 #endif 304 soreference(so); 305 so->so_pcb = inp; 306 307 in_pcbonlist(inp); 308 return (0); 309 } 310 311 /* 312 * Unlink a pcb with the intention of moving it to another cpu with a 313 * different pcbinfo. While unlinked nothing should attempt to dereference 314 * inp_pcbinfo, NULL it out so we assert if it does. 315 */ 316 void 317 in_pcbunlink(struct inpcb *inp, struct inpcbinfo *pcbinfo) 318 { 319 KASSERT(inp->inp_pcbinfo == pcbinfo, ("pcbinfo mismatch")); 320 KASSERT((inp->inp_flags & (INP_WILDCARD | INP_CONNECTED)) == 0, 321 ("already linked")); 322 323 in_pcbofflist(inp); 324 inp->inp_pcbinfo = NULL; 325 } 326 327 /* 328 * Relink a pcb into a new pcbinfo. 
*
 * The pcb must currently have no pcbinfo (inp_pcbinfo == NULL) and
 * must have neither INP_WILDCARD nor INP_CONNECTED set.
 */
void
in_pcblink(struct inpcb *inp, struct inpcbinfo *pcbinfo)
{
	KASSERT(inp->inp_pcbinfo == NULL, ("has pcbinfo"));
	KASSERT((inp->inp_flags & (INP_WILDCARD | INP_CONNECTED)) == 0,
	    ("already linked"));

	inp->inp_pcbinfo = pcbinfo;
	in_pcbonlist(inp);
}

/*
 * Choose an anonymous local port for 'inp' and insert the pcb into the
 * port hash.
 *
 * The pcb is flagged INP_ANONPORT.  The candidate range depends on
 * INP_HIGHPORT / INP_LOWPORT; the low range requires
 * PRIV_NETINET_RESERVEDPORT when 'cred' is non-NULL.  The search starts
 * at the portinfo selected by (mycpuid & portinfo_mask) and, whenever a
 * portinfo yields an error, moves on to the next one until it wraps
 * around to the starting index.  Within one portinfo candidates are
 * visited in steps of (portinfo_mask + 1), with the port token held,
 * until in_pcblookup_local() reports no conflict for
 * (inp_laddr, port, wild, cred).
 *
 * Returns 0 with inp_lport set (network byte order), the error from
 * priv_check_cred(), or EADDRNOTAVAIL once every portinfo has been
 * tried.  On every error return inp_laddr is reset to INADDR_ANY.
 */
static int
in_pcbsetlport(struct inpcb *inp, int wild, struct ucred *cred)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct inpcbportinfo *portinfo;
	u_short first, last, lport, step;
	u_short *lastport;
	int count, error;
	int portinfo_first, portinfo_idx;

	inp->inp_flags |= INP_ANONPORT;

	step = pcbinfo->portinfo_mask + 1;
	portinfo_first = mycpuid & pcbinfo->portinfo_mask;
	portinfo_idx = portinfo_first;
loop:
	portinfo = &pcbinfo->portinfo[portinfo_idx];

	/* Select the range and the per-portinfo "last port used" cursor. */
	if (inp->inp_flags & INP_HIGHPORT) {
		first = ipport_hifirstauto;	/* sysctl */
		last = ipport_hilastauto;
		lastport = &portinfo->lasthi;
	} else if (inp->inp_flags & INP_LOWPORT) {
		if (cred &&
		    (error =
		     priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0))) {
			inp->inp_laddr.s_addr = INADDR_ANY;
			return error;
		}
		first = ipport_lowfirstauto;	/* 1023 */
		last = ipport_lowlastauto;	/* 600 */
		lastport = &portinfo->lastlow;
	} else {
		first = ipport_firstauto;	/* sysctl */
		last = ipport_lastauto;
		lastport = &portinfo->lastport;
	}

	/*
	 * This has to be atomic.  If the porthash is shared across multiple
	 * protocol threads (aka tcp) then the token must be held.
	 */
	GET_PORT_TOKEN(portinfo);

	/*
	 * Simple check to ensure all ports are not used up causing
	 * a deadlock here.
	 *
	 * We split the two cases (up and down) so that the direction
	 * is not being tested on each round of the loop.
	 */
	if (first > last) {
		/*
		 * counting down
		 */
		in_pcbportrange(&first, &last, portinfo->offset, step);
		count = (first - last) / step;

		do {
			if (count-- < 0) {	/* completely used? */
				error = EADDRNOTAVAIL;
				goto done;
			}
			*lastport -= step;
			/* Out of range (or wrapped): restart at 'first'. */
			if (*lastport > first || *lastport < last)
				*lastport = first;
			KKASSERT((*lastport & pcbinfo->portinfo_mask) ==
			    portinfo->offset);
			lport = htons(*lastport);
		} while (in_pcblookup_local(portinfo, inp->inp_laddr, lport,
		    wild, cred));
	} else {
		/*
		 * counting up
		 */
		in_pcbportrange(&last, &first, portinfo->offset, step);
		count = (last - first) / step;

		do {
			if (count-- < 0) {	/* completely used? */
				error = EADDRNOTAVAIL;
				goto done;
			}
			*lastport += step;
			/* Out of range (or wrapped): restart at 'first'. */
			if (*lastport < first || *lastport > last)
				*lastport = first;
			KKASSERT((*lastport & pcbinfo->portinfo_mask) ==
			    portinfo->offset);
			lport = htons(*lastport);
		} while (in_pcblookup_local(portinfo, inp->inp_laddr, lport,
		    wild, cred));
	}
	inp->inp_lport = lport;
	in_pcbinsporthash(portinfo, inp);
	error = 0;
done:
	REL_PORT_TOKEN(portinfo);

	if (error) {
		/* Try next portinfo */
		portinfo_idx++;
		portinfo_idx &= pcbinfo->portinfo_mask;
		if (portinfo_idx != portinfo_first)
			goto loop;
		inp->inp_laddr.s_addr = INADDR_ANY;
	}
	return error;
}

/*
 * Bind 'inp' to a local address and/or port.
 *
 * nam == NULL: the current local address is passed through
 * prison_replace_wildcards() and in_pcbsetlport() picks the port.
 *
 * nam != NULL: sa_len must equal sizeof(struct sockaddr_in).  A
 * non-multicast address other than INADDR_ANY must be found by
 * ifa_ifwithaddr().  A zero port means "auto-select" via
 * in_pcbsetlport().  An explicit port below IPPORT_RESERVED requires
 * PRIV_NETINET_RESERVEDPORT when credentials are available; the port
 * is then checked for conflicts with the port token held and the pcb
 * is inserted into the port hash.
 *
 * NOTE: on the non-multicast, non-wildcard path the caller's sockaddr
 * is modified (sin_port and sin_zero are cleared) and not restored.
 *
 * Returns 0, EADDRNOTAVAIL, EINVAL, EADDRINUSE or the error from
 * priv_check_cred().  Every failure that occurs after inp_laddr has
 * been assigned resets inp_laddr to INADDR_ANY.
 */
int
in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct thread *td)
{
	struct socket *so = inp->inp_socket;
	struct sockaddr_in jsin;
	struct ucred *cred = NULL;
	int wild = 0;

	if (TAILQ_EMPTY(&in_ifaddrheads[mycpuid])) /* XXX broken! */
		return (EADDRNOTAVAIL);
	if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
		return (EINVAL);	/* already bound */

	if (!(so->so_options & (SO_REUSEADDR|SO_REUSEPORT)))
		wild = 1;    /* neither SO_REUSEADDR nor SO_REUSEPORT is set */
	if (td->td_proc)
		cred = td->td_proc->p_ucred;

	if (nam != NULL) {
		struct sockaddr_in *sin = (struct sockaddr_in *)nam;
		struct inpcbinfo *pcbinfo;
		struct inpcbportinfo *portinfo;
		struct inpcb *t;
		u_short lport, lport_ho;
		int reuseport = (so->so_options & SO_REUSEPORT);
		int error;

		if (nam->sa_len != sizeof *sin)
			return (EINVAL);
#ifdef notdef
		/*
		 * We should check the family, but old programs
		 * incorrectly fail to initialize it.
		 */
		if (sin->sin_family != AF_INET)
			return (EAFNOSUPPORT);
#endif
		if (!prison_replace_wildcards(td, nam))
			return (EINVAL);

		/* Save the requested port; sin_port may be cleared below. */
		lport = sin->sin_port;
		if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
			/*
			 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
			 * allow complete duplication of binding if
			 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
			 * and a multicast address is bound on both
			 * new and duplicated sockets.
			 */
			if (so->so_options & SO_REUSEADDR)
				reuseport = SO_REUSEADDR | SO_REUSEPORT;
		} else if (sin->sin_addr.s_addr != INADDR_ANY) {
			sin->sin_port = 0;		/* yech... */
			bzero(&sin->sin_zero, sizeof sin->sin_zero);
			if (ifa_ifwithaddr((struct sockaddr *)sin) == NULL)
				return (EADDRNOTAVAIL);
		}

		inp->inp_laddr = sin->sin_addr;

		/* Give the jail code a chance to rewrite the local address. */
		jsin.sin_family = AF_INET;
		jsin.sin_addr.s_addr = inp->inp_laddr.s_addr;
		if (!prison_replace_wildcards(td, (struct sockaddr *)&jsin)) {
			inp->inp_laddr.s_addr = INADDR_ANY;
			return (EINVAL);
		}
		inp->inp_laddr.s_addr = jsin.sin_addr.s_addr;

		if (lport == 0) {
			/* Auto-select local port */
			return in_pcbsetlport(inp, wild, cred);
		}
		lport_ho = ntohs(lport);

		/* GROSS */
		if (lport_ho < IPPORT_RESERVED && cred &&
		    (error =
		     priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0))) {
			inp->inp_laddr.s_addr = INADDR_ANY;
			return (error);
		}

		/*
		 * Locate the proper portinfo based on lport
		 */
		pcbinfo = inp->inp_pcbinfo;
		portinfo =
		    &pcbinfo->portinfo[lport_ho & pcbinfo->portinfo_mask];
		KKASSERT((lport_ho & pcbinfo->portinfo_mask) ==
		    portinfo->offset);

		/*
		 * This has to be atomic.  If the porthash is shared across
		 * multiple protocol threads (aka tcp) then the token must
		 * be held.
		 */
		GET_PORT_TOKEN(portinfo);

		/*
		 * For sockets whose owner uid is non-zero and a
		 * non-multicast address: refuse the bind if a pcb owned
		 * by a different uid already matches.
		 */
		if (so->so_cred->cr_uid != 0 &&
		    !IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
			t = in_pcblookup_local(portinfo, sin->sin_addr, lport,
			    INPLOOKUP_WILDCARD, cred);
			if (t &&
			    (!in_nullhost(sin->sin_addr) ||
			     !in_nullhost(t->inp_laddr) ||
			     (t->inp_socket->so_options & SO_REUSEPORT) == 0) &&
			    (so->so_cred->cr_uid !=
			     t->inp_socket->so_cred->cr_uid)) {
#ifdef INET6
				if (!in_nullhost(sin->sin_addr) ||
				    !in_nullhost(t->inp_laddr) ||
				    INP_SOCKAF(so) == INP_SOCKAF(t->inp_socket))
#endif
				{
					inp->inp_laddr.s_addr = INADDR_ANY;
					error = EADDRINUSE;
					goto done;
				}
			}
		}
		if (cred && !prison_replace_wildcards(td, nam)) {
			inp->inp_laddr.s_addr = INADDR_ANY;
			error = EADDRNOTAVAIL;
			goto done;
		}
		/*
		 * General conflict check: an existing match is tolerated
		 * only if its socket has one of the 'reuseport' options.
		 */
		t = in_pcblookup_local(portinfo, sin->sin_addr, lport,
		    wild, cred);
		if (t && !(reuseport & t->inp_socket->so_options)) {
#ifdef INET6
			if (!in_nullhost(sin->sin_addr) ||
			    !in_nullhost(t->inp_laddr) ||
			    INP_SOCKAF(so) == INP_SOCKAF(t->inp_socket))
#endif
			{
				inp->inp_laddr.s_addr = INADDR_ANY;
				error = EADDRINUSE;
				goto done;
			}
		}
		inp->inp_lport = lport;
		in_pcbinsporthash(portinfo, inp);
		error = 0;
done:
		REL_PORT_TOKEN(portinfo);
		return (error);
	} else {
		jsin.sin_family = AF_INET;
		jsin.sin_addr.s_addr = inp->inp_laddr.s_addr;
		if (!prison_replace_wildcards(td, (struct sockaddr *)&jsin)) {
			inp->inp_laddr.s_addr = INADDR_ANY;
			return (EINVAL);
		}
		inp->inp_laddr.s_addr = jsin.sin_addr.s_addr;

		return in_pcbsetlport(inp, wild, cred);
	}
}

/*
 * Look on the port hash of 'portinfo' for a pcb bound to 'lport' that
 * matches the given local/remote endpoint.
 *
 * A pcb matches when its local address is INADDR_ANY or equals laddr,
 * its foreign address is INADDR_ANY or equals faddr, its foreign port
 * is 0 or equals fport, and either cred is NULL or the pcb's socket
 * credential shares cred's cr_prison.  With INET6, pcbs without
 * INP_IPV4 are skipped.  Ports are compared as passed in, without
 * byte-order conversion.
 *
 * The port token must be held by the caller.  Returns the first
 * matching pcb or NULL.
 */
static struct inpcb *
in_pcblookup_localremote(struct inpcbportinfo *portinfo, struct in_addr laddr,
    u_short lport, struct in_addr faddr, u_short fport, struct ucred *cred)
{
	struct inpcb *inp;
	struct inpcbporthead *porthash;
	struct inpcbport *phd;
	struct inpcb *match = NULL;

	/*
	 * If the porthashbase is shared across several cpus, it must
	 * have been locked.
	 */
	ASSERT_PORT_TOKEN_HELD(portinfo);

	/*
	 * Best fit PCB lookup.
	 *
	 * First see if this local port is in use by looking on the
	 * port hash list.
	 */
	porthash = &portinfo->porthashbase[
	    INP_PCBPORTHASH(lport, portinfo->porthashmask)];
	LIST_FOREACH(phd, porthash, phd_hash) {
		if (phd->phd_port == lport)
			break;
	}
	if (phd != NULL) {
		LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
#ifdef INET6
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
#endif
			if (inp->inp_laddr.s_addr != INADDR_ANY &&
			    inp->inp_laddr.s_addr != laddr.s_addr)
				continue;

			if (inp->inp_faddr.s_addr != INADDR_ANY &&
			    inp->inp_faddr.s_addr != faddr.s_addr)
				continue;

			if (inp->inp_fport != 0 && inp->inp_fport != fport)
				continue;

			if (cred == NULL ||
			    cred->cr_prison ==
			    inp->inp_socket->so_cred->cr_prison) {
				match = inp;
				break;
			}
		}
	}
	return (match);
}

/*
 * Choose an anonymous local port for a pcb that already has a local
 * address, taking the remote endpoint into account.
 *
 * The search has the same structure as in_pcbsetlport(), except that a
 * candidate port is accepted when in_pcblookup_localremote() finds no
 * pcb matching (inp_laddr, port, remote address, remote port).  If the
 * chosen port makes the local endpoint identical to the remote one, the
 * search is repeated once within the same portinfo before giving up
 * with EADDRNOTAVAIL.
 *
 * Asserts that inp_laddr is not INADDR_ANY and that td has a process.
 * Returns 0 with inp_lport set, EADDRNOTAVAIL, EINVAL (port already
 * bound, or prison_replace_wildcards() rejected the address) or the
 * error from priv_check_cred().  All error returns other than the
 * first two early exits reset inp_laddr to INADDR_ANY.
 */
int
in_pcbbind_remote(struct inpcb *inp, const struct sockaddr *remote,
    struct thread *td)
{
	struct proc *p = td->td_proc;
	unsigned short *lastport;
	const struct sockaddr_in *sin = (const struct sockaddr_in *)remote;
	struct sockaddr_in jsin;
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct inpcbportinfo *portinfo;
	struct ucred *cred = NULL;
	u_short first, last, lport, step;
	int count, error, dup;
	int portinfo_first, portinfo_idx;

	if (TAILQ_EMPTY(&in_ifaddrheads[mycpuid])) /* XXX broken! */
		return (EADDRNOTAVAIL);

	KKASSERT(inp->inp_laddr.s_addr != INADDR_ANY);
	if (inp->inp_lport != 0)
		return (EINVAL);	/* already bound */

	KKASSERT(p);
	cred = p->p_ucred;

	jsin.sin_family = AF_INET;
	jsin.sin_addr.s_addr = inp->inp_laddr.s_addr;
	if (!prison_replace_wildcards(td, (struct sockaddr *)&jsin)) {
		inp->inp_laddr.s_addr = INADDR_ANY;
		return (EINVAL);
	}
	inp->inp_laddr.s_addr = jsin.sin_addr.s_addr;

	inp->inp_flags |= INP_ANONPORT;

	step = pcbinfo->portinfo_mask + 1;
	portinfo_first = mycpuid & pcbinfo->portinfo_mask;
	portinfo_idx = portinfo_first;
loop:
	portinfo = &pcbinfo->portinfo[portinfo_idx];
	dup = 0;	/* one "local == remote" retry allowed per portinfo */

	if (inp->inp_flags & INP_HIGHPORT) {
		first = ipport_hifirstauto;	/* sysctl */
		last = ipport_hilastauto;
		lastport = &portinfo->lasthi;
	} else if (inp->inp_flags & INP_LOWPORT) {
		if (cred &&
		    (error =
		     priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0))) {
			inp->inp_laddr.s_addr = INADDR_ANY;
			return (error);
		}
		first = ipport_lowfirstauto;	/* 1023 */
		last = ipport_lowlastauto;	/* 600 */
		lastport = &portinfo->lastlow;
	} else {
		first = ipport_firstauto;	/* sysctl */
		last = ipport_lastauto;
		lastport = &portinfo->lastport;
	}

	/*
	 * This has to be atomic.  If the porthash is shared across multiple
	 * protocol threads (aka tcp) then the token must be held.
	 */
	GET_PORT_TOKEN(portinfo);

again:
	/*
	 * Simple check to ensure all ports are not used up causing
	 * a deadlock here.
	 *
	 * We split the two cases (up and down) so that the direction
	 * is not being tested on each round of the loop.
	 */
	if (first > last) {
		/*
		 * counting down
		 */
		in_pcbportrange(&first, &last, portinfo->offset, step);
		count = (first - last) / step;

		do {
			if (count-- < 0) {	/* completely used? */
				error = EADDRNOTAVAIL;
				goto done;
			}
			*lastport -= step;
			if (*lastport > first || *lastport < last)
				*lastport = first;
			KKASSERT((*lastport & pcbinfo->portinfo_mask) ==
			    portinfo->offset);
			lport = htons(*lastport);
		} while (in_pcblookup_localremote(portinfo, inp->inp_laddr,
		    lport, sin->sin_addr, sin->sin_port, cred));
	} else {
		/*
		 * counting up
		 */
		in_pcbportrange(&last, &first, portinfo->offset, step);
		count = (last - first) / step;

		do {
			if (count-- < 0) {	/* completely used? */
				error = EADDRNOTAVAIL;
				goto done;
			}
			*lastport += step;
			if (*lastport < first || *lastport > last)
				*lastport = first;
			KKASSERT((*lastport & pcbinfo->portinfo_mask) ==
			    portinfo->offset);
			lport = htons(*lastport);
		} while (in_pcblookup_localremote(portinfo, inp->inp_laddr,
		    lport, sin->sin_addr, sin->sin_port, cred));
	}

	/* This could happen on loopback interface */
	if (sin->sin_port == lport &&
	    sin->sin_addr.s_addr == inp->inp_laddr.s_addr) {
		if (dup) {
			/*
			 * Duplicate again; give up
			 */
			error = EADDRNOTAVAIL;
			goto done;
		}
		dup = 1;
		goto again;
	}
	inp->inp_lport = lport;
	in_pcbinsporthash(portinfo, inp);
	error = 0;
done:
	REL_PORT_TOKEN(portinfo);

	if (error) {
		/* Try next portinfo */
		portinfo_idx++;
		portinfo_idx &= pcbinfo->portinfo_mask;
		if (portinfo_idx != portinfo_first)
			goto loop;
		inp->inp_laddr.s_addr = INADDR_ANY;
	}
	return error;
}

/*
 * Transform old in_pcbconnect() into an inner subroutine for new
 * in_pcbconnect(): Do some validity-checking on the remote
 * address (in mbuf 'nam') and then determine local host address
 * (i.e., which interface) to use to access that remote host.
 *
 * This preserves definition of in_pcbconnect(), while supporting a
 * slightly different version for T/TCP.
* (This is more than
 * a bit of a kludge, but cleaning up the internal interfaces would
 * have forced minor changes in every protocol).
 *
 * nam		remote address.  It is validated (length, AF_INET,
 *		non-zero port) and may be rewritten in place: INADDR_ANY
 *		becomes the first address on this cpu's interface address
 *		list, INADDR_BROADCAST becomes that interface's broadcast
 *		address if the interface has IFF_BROADCAST.
 * plocal_sin	written only when 'find' is non-zero and 0 is returned;
 *		receives the selected local address.
 * td		used only to obtain the credential for the jail checks;
 *		may be NULL.
 * find		when zero, only the validation/rewriting of 'nam' is done.
 *
 * Returns 0, EINVAL, EAFNOSUPPORT or EADDRNOTAVAIL.  When the lookup
 * fails after this call allocated a route, the pcb's cached route is
 * released and zeroed again.
 */
int
in_pcbladdr_find(struct inpcb *inp, struct sockaddr *nam,
    struct sockaddr_in **plocal_sin, struct thread *td, int find)
{
	struct in_ifaddr *ia;
	struct ucred *cred = NULL;
	struct sockaddr_in *sin = (struct sockaddr_in *)nam;
	struct sockaddr *jsin;
	int jailed = 0, alloc_route = 0;

	if (nam->sa_len != sizeof *sin)
		return (EINVAL);
	if (sin->sin_family != AF_INET)
		return (EAFNOSUPPORT);
	if (sin->sin_port == 0)
		return (EADDRNOTAVAIL);
	if (td && td->td_proc && td->td_proc->p_ucred)
		cred = td->td_proc->p_ucred;
	if (cred && cred->cr_prison)
		jailed = 1;
	if (!TAILQ_EMPTY(&in_ifaddrheads[mycpuid])) {
		ia = TAILQ_FIRST(&in_ifaddrheads[mycpuid])->ia;
		/*
		 * If the destination address is INADDR_ANY,
		 * use the primary local address.
		 * If the supplied address is INADDR_BROADCAST,
		 * and the primary interface supports broadcast,
		 * choose the broadcast address for that interface.
		 */
		if (sin->sin_addr.s_addr == INADDR_ANY)
			sin->sin_addr = IA_SIN(ia)->sin_addr;
		else if (sin->sin_addr.s_addr == (u_long)INADDR_BROADCAST &&
		    (ia->ia_ifp->if_flags & IFF_BROADCAST))
			sin->sin_addr = satosin(&ia->ia_broadaddr)->sin_addr;
	}
	if (find) {
		struct route *ro;

		ia = NULL;
		/*
		 * If route is known or can be allocated now,
		 * our src addr is taken from the i/f, else punt.
		 * Note that we should check the address family of the cached
		 * destination, in case of sharing the cache with IPv6.
		 */
		ro = &inp->inp_route;
		/* Drop a cached route that is down, stale or unwanted. */
		if (ro->ro_rt &&
		    (!(ro->ro_rt->rt_flags & RTF_UP) ||
		     ro->ro_dst.sa_family != AF_INET ||
		     satosin(&ro->ro_dst)->sin_addr.s_addr !=
		     sin->sin_addr.s_addr ||
		     inp->inp_socket->so_options & SO_DONTROUTE)) {
			RTFREE(ro->ro_rt);
			ro->ro_rt = NULL;
		}
		if (!(inp->inp_socket->so_options & SO_DONTROUTE) && /*XXX*/
		    (ro->ro_rt == NULL ||
		    ro->ro_rt->rt_ifp == NULL)) {
			/* No route yet, so try to acquire one */
			bzero(&ro->ro_dst, sizeof(struct sockaddr_in));
			ro->ro_dst.sa_family = AF_INET;
			ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
			((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
			    sin->sin_addr;
			rtalloc(ro);
			/* Remember to undo this on the failure path. */
			alloc_route = 1;
		}
		/*
		 * If we found a route, use the address
		 * corresponding to the outgoing interface
		 * unless it is the loopback (in case a route
		 * to our address on another net goes to loopback).
		 */
		if (ro->ro_rt &&
		    !(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK)) {
			if (jailed) {
				if (jailed_ip(cred->cr_prison,
				    ro->ro_rt->rt_ifa->ifa_addr)) {
					ia = ifatoia(ro->ro_rt->rt_ifa);
				}
			} else {
				ia = ifatoia(ro->ro_rt->rt_ifa);
			}
		}
		if (ia == NULL) {
			/*
			 * Fallbacks, tried in order: point-to-point
			 * destination match, network match, first address
			 * on this cpu's list.  For a jailed caller each
			 * candidate must pass jailed_ip().  The port is
			 * zeroed for the address lookups and restored
			 * afterwards.
			 */
			u_short fport = sin->sin_port;

			sin->sin_port = 0;
			ia = ifatoia(ifa_ifwithdstaddr(sintosa(sin)));
			if (ia && jailed && !jailed_ip(cred->cr_prison,
			    sintosa(&ia->ia_addr)))
				ia = NULL;
			if (ia == NULL)
				ia = ifatoia(ifa_ifwithnet(sintosa(sin)));
			if (ia && jailed && !jailed_ip(cred->cr_prison,
			    sintosa(&ia->ia_addr)))
				ia = NULL;
			sin->sin_port = fport;
			if (ia == NULL &&
			    !TAILQ_EMPTY(&in_ifaddrheads[mycpuid]))
				ia = TAILQ_FIRST(&in_ifaddrheads[mycpuid])->ia;
			if (ia && jailed && !jailed_ip(cred->cr_prison,
			    sintosa(&ia->ia_addr)))
				ia = NULL;

			/* A jailed caller gets one more chance below. */
			if (!jailed && ia == NULL)
				goto fail;
		}
		/*
		 * If the destination address is multicast and an outgoing
		 * interface has been set as a multicast option, use the
		 * address of that interface as our source address.
		 */
		if (!jailed && IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
		    inp->inp_moptions != NULL) {
			struct ip_moptions *imo;
			struct ifnet *ifp;

			imo = inp->inp_moptions;
			if (imo->imo_multicast_ifp != NULL) {
				struct in_ifaddr_container *iac;

				ifp = imo->imo_multicast_ifp;
				ia = NULL;
				TAILQ_FOREACH(iac,
				    &in_ifaddrheads[mycpuid], ia_link) {
					if (iac->ia->ia_ifp == ifp) {
						ia = iac->ia;
						break;
					}
				}
				if (ia == NULL)
					goto fail;
			}
		}
		/*
		 * Don't do pcblookup call here; return interface in plocal_sin
		 * and exit to caller, that will do the lookup.
		 */
		if (ia == NULL && jailed) {
			/* Fall back to an address supplied by the prison. */
			if ((jsin = prison_get_nonlocal(
			    cred->cr_prison, AF_INET, NULL)) != NULL ||
			    (jsin = prison_get_local(
			    cred->cr_prison, AF_INET, NULL)) != NULL) {
				*plocal_sin = satosin(jsin);
			} else {
				/* IPv6 only Jail */
				goto fail;
			}
		} else {
			*plocal_sin = &ia->ia_addr;
		}
	}
	return (0);
fail:
	if (alloc_route) {
		struct route *ro = &inp->inp_route;

		if (ro->ro_rt != NULL)
			RTFREE(ro->ro_rt);
		bzero(ro, sizeof(*ro));
	}
	return (EADDRNOTAVAIL);
}

/*
 * Wrapper around in_pcbladdr_find() that requests the local address
 * search only when the pcb has no local address yet (inp_laddr is
 * INADDR_ANY); otherwise *plocal_sin is left untouched.
 */
int
in_pcbladdr(struct inpcb *inp, struct sockaddr *nam,
    struct sockaddr_in **plocal_sin, struct thread *td)
{
	return in_pcbladdr_find(inp, nam, plocal_sin, td,
	    (inp->inp_laddr.s_addr == INADDR_ANY));
}

/*
 * Outer subroutine:
 * Connect from a socket to a specified address.
 * Both address and port must be specified in argument sin.
 * If don't have a local address for this socket yet,
 * then pick one.
1001 */ 1002 int 1003 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct thread *td) 1004 { 1005 struct sockaddr_in *if_sin; 1006 struct sockaddr_in *sin = (struct sockaddr_in *)nam; 1007 int error; 1008 1009 /* Call inner routine to assign local interface address. */ 1010 if ((error = in_pcbladdr(inp, nam, &if_sin, td)) != 0) 1011 return (error); 1012 1013 if (in_pcblookup_hash(inp->inp_pcbinfo, sin->sin_addr, sin->sin_port, 1014 inp->inp_laddr.s_addr ? 1015 inp->inp_laddr : if_sin->sin_addr, 1016 inp->inp_lport, FALSE, NULL) != NULL) { 1017 return (EADDRINUSE); 1018 } 1019 if (inp->inp_laddr.s_addr == INADDR_ANY) { 1020 if (inp->inp_lport == 0) { 1021 error = in_pcbbind(inp, NULL, td); 1022 if (error) 1023 return (error); 1024 } 1025 inp->inp_laddr = if_sin->sin_addr; 1026 } 1027 inp->inp_faddr = sin->sin_addr; 1028 inp->inp_fport = sin->sin_port; 1029 in_pcbinsconnhash(inp); 1030 return (0); 1031 } 1032 1033 void 1034 in_pcbdisconnect(struct inpcb *inp) 1035 { 1036 1037 in_pcbremconnhash(inp); 1038 inp->inp_faddr.s_addr = INADDR_ANY; 1039 inp->inp_fport = 0; 1040 } 1041 1042 void 1043 in_pcbdetach(struct inpcb *inp) 1044 { 1045 struct socket *so = inp->inp_socket; 1046 struct inpcbinfo *ipi = inp->inp_pcbinfo; 1047 1048 #ifdef IPSEC 1049 ipsec4_delete_pcbpolicy(inp); 1050 #endif /*IPSEC*/ 1051 inp->inp_gencnt = ++ipi->ipi_gencnt; 1052 KKASSERT((so->so_state & SS_ASSERTINPROG) == 0); 1053 in_pcbremlists(inp); 1054 so->so_pcb = NULL; 1055 sofree(so); /* remove pcb ref */ 1056 if (inp->inp_options) 1057 m_free(inp->inp_options); 1058 if (inp->inp_route.ro_rt) 1059 rtfree(inp->inp_route.ro_rt); 1060 ip_freemoptions(inp->inp_moptions); 1061 inp->inp_vflag = 0; 1062 kfree(inp, M_PCB); 1063 } 1064 1065 /* 1066 * The calling convention of in_setsockaddr() and in_setpeeraddr() was 1067 * modified to match the pru_sockaddr() and pru_peeraddr() entry points 1068 * in struct pr_usrreqs, so that protocols can just reference then directly 1069 * without the need 
for a wrapper function. The socket must have a valid 1070 * (i.e., non-nil) PCB, but it should be impossible to get an invalid one 1071 * except through a kernel programming error, so it is acceptable to panic 1072 * (or in this case trap) if the PCB is invalid. (Actually, we don't trap 1073 * because there actually /is/ a programming error somewhere... XXX) 1074 */ 1075 int 1076 in_setsockaddr(struct socket *so, struct sockaddr **nam) 1077 { 1078 struct inpcb *inp; 1079 struct sockaddr_in *sin; 1080 1081 /* 1082 * Do the malloc first in case it blocks. 1083 */ 1084 sin = kmalloc(sizeof *sin, M_SONAME, M_WAITOK | M_ZERO); 1085 sin->sin_family = AF_INET; 1086 sin->sin_len = sizeof *sin; 1087 1088 crit_enter(); 1089 inp = so->so_pcb; 1090 if (!inp) { 1091 crit_exit(); 1092 kfree(sin, M_SONAME); 1093 return (ECONNRESET); 1094 } 1095 sin->sin_port = inp->inp_lport; 1096 sin->sin_addr = inp->inp_laddr; 1097 crit_exit(); 1098 1099 *nam = (struct sockaddr *)sin; 1100 return (0); 1101 } 1102 1103 void 1104 in_setsockaddr_dispatch(netmsg_t msg) 1105 { 1106 int error; 1107 1108 error = in_setsockaddr(msg->base.nm_so, msg->peeraddr.nm_nam); 1109 lwkt_replymsg(&msg->lmsg, error); 1110 } 1111 1112 int 1113 in_setpeeraddr(struct socket *so, struct sockaddr **nam) 1114 { 1115 struct inpcb *inp; 1116 struct sockaddr_in *sin; 1117 1118 /* 1119 * Do the malloc first in case it blocks. 
1120 */ 1121 sin = kmalloc(sizeof *sin, M_SONAME, M_WAITOK | M_ZERO); 1122 sin->sin_family = AF_INET; 1123 sin->sin_len = sizeof *sin; 1124 1125 crit_enter(); 1126 inp = so->so_pcb; 1127 if (!inp) { 1128 crit_exit(); 1129 kfree(sin, M_SONAME); 1130 return (ECONNRESET); 1131 } 1132 sin->sin_port = inp->inp_fport; 1133 sin->sin_addr = inp->inp_faddr; 1134 crit_exit(); 1135 1136 *nam = (struct sockaddr *)sin; 1137 return (0); 1138 } 1139 1140 void 1141 in_setpeeraddr_dispatch(netmsg_t msg) 1142 { 1143 int error; 1144 1145 error = in_setpeeraddr(msg->base.nm_so, msg->peeraddr.nm_nam); 1146 lwkt_replymsg(&msg->lmsg, error); 1147 } 1148 1149 void 1150 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int err, 1151 void (*notify)(struct inpcb *, int)) 1152 { 1153 struct inpcb *inp, *marker; 1154 1155 KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu), 1156 ("not in the correct netisr")); 1157 marker = &in_pcbmarkers[mycpuid]; 1158 1159 /* 1160 * NOTE: 1161 * - If INP_PLACEMARKER is set we must ignore the rest of the 1162 * structure and skip it. 1163 * - It is safe to nuke inpcbs here, since we are in their own 1164 * netisr. 1165 */ 1166 GET_PCBINFO_TOKEN(pcbinfo); 1167 1168 LIST_INSERT_HEAD(&pcbinfo->pcblisthead, marker, inp_list); 1169 while ((inp = LIST_NEXT(marker, inp_list)) != NULL) { 1170 LIST_REMOVE(marker, inp_list); 1171 LIST_INSERT_AFTER(inp, marker, inp_list); 1172 1173 if (inp->inp_flags & INP_PLACEMARKER) 1174 continue; 1175 #ifdef INET6 1176 if (!(inp->inp_vflag & INP_IPV4)) 1177 continue; 1178 #endif 1179 if (inp->inp_faddr.s_addr != faddr.s_addr || 1180 inp->inp_socket == NULL) 1181 continue; 1182 (*notify)(inp, err); /* can remove inp from list! 
*/ 1183 } 1184 LIST_REMOVE(marker, inp_list); 1185 1186 REL_PCBINFO_TOKEN(pcbinfo); 1187 } 1188 1189 void 1190 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) 1191 { 1192 struct inpcb *inp, *marker; 1193 1194 /* 1195 * We only need to make sure that we are in netisr0, where all 1196 * multicast operation happen. We could check inpcbinfo which 1197 * does not belong to netisr0 by holding the inpcbinfo's token. 1198 * In this case, the pcbinfo must be able to be shared, i.e. 1199 * pcbinfo->infotoken is not NULL. 1200 */ 1201 KASSERT(&curthread->td_msgport == netisr_cpuport(0), 1202 ("not in netisr0")); 1203 KASSERT(pcbinfo->cpu == 0 || pcbinfo->infotoken != NULL, 1204 ("pcbinfo could not be shared")); 1205 1206 /* 1207 * Get a marker for the current netisr (netisr0). 1208 * 1209 * It is possible that the multicast address deletion blocks, 1210 * which could cause temporary token releasing. So we use 1211 * inpcb marker here to get a coherent view of the inpcb list. 1212 * 1213 * While, on the other hand, moptions are only added and deleted 1214 * in netisr0, so we would not see staled moption or miss moption 1215 * even if the token was released due to the blocking multicast 1216 * address deletion. 1217 */ 1218 marker = &in_pcbmarkers[mycpuid]; 1219 1220 GET_PCBINFO_TOKEN(pcbinfo); 1221 1222 LIST_INSERT_HEAD(&pcbinfo->pcblisthead, marker, inp_list); 1223 while ((inp = LIST_NEXT(marker, inp_list)) != NULL) { 1224 struct ip_moptions *imo; 1225 1226 LIST_REMOVE(marker, inp_list); 1227 LIST_INSERT_AFTER(inp, marker, inp_list); 1228 1229 if (inp->inp_flags & INP_PLACEMARKER) 1230 continue; 1231 imo = inp->inp_moptions; 1232 if ((inp->inp_vflag & INP_IPV4) && imo != NULL) { 1233 int i, gap; 1234 1235 /* 1236 * Unselect the outgoing interface if it is being 1237 * detached. 
1238 */ 1239 if (imo->imo_multicast_ifp == ifp) 1240 imo->imo_multicast_ifp = NULL; 1241 1242 /* 1243 * Drop multicast group membership if we joined 1244 * through the interface being detached. 1245 */ 1246 for (i = 0, gap = 0; i < imo->imo_num_memberships; 1247 i++) { 1248 if (imo->imo_membership[i]->inm_ifp == ifp) { 1249 /* 1250 * NOTE: 1251 * This could block and the pcbinfo 1252 * token could be passively released. 1253 */ 1254 in_delmulti(imo->imo_membership[i]); 1255 gap++; 1256 } else if (gap != 0) 1257 imo->imo_membership[i - gap] = 1258 imo->imo_membership[i]; 1259 } 1260 imo->imo_num_memberships -= gap; 1261 } 1262 } 1263 LIST_REMOVE(marker, inp_list); 1264 1265 REL_PCBINFO_TOKEN(pcbinfo); 1266 } 1267 1268 /* 1269 * Check for alternatives when higher level complains 1270 * about service problems. For now, invalidate cached 1271 * routing information. If the route was created dynamically 1272 * (by a redirect), time to try a default gateway again. 1273 */ 1274 void 1275 in_losing(struct inpcb *inp) 1276 { 1277 struct rtentry *rt; 1278 struct rt_addrinfo rtinfo; 1279 1280 if ((rt = inp->inp_route.ro_rt)) { 1281 bzero(&rtinfo, sizeof(struct rt_addrinfo)); 1282 rtinfo.rti_info[RTAX_DST] = rt_key(rt); 1283 rtinfo.rti_info[RTAX_GATEWAY] = rt->rt_gateway; 1284 rtinfo.rti_info[RTAX_NETMASK] = rt_mask(rt); 1285 rtinfo.rti_flags = rt->rt_flags; 1286 rt_missmsg(RTM_LOSING, &rtinfo, rt->rt_flags, 0); 1287 if (rt->rt_flags & RTF_DYNAMIC) { 1288 rtrequest(RTM_DELETE, rt_key(rt), rt->rt_gateway, 1289 rt_mask(rt), rt->rt_flags, NULL); 1290 } 1291 inp->inp_route.ro_rt = NULL; 1292 rtfree(rt); 1293 /* 1294 * A new route can be allocated 1295 * the next time output is attempted. 1296 */ 1297 } 1298 } 1299 1300 /* 1301 * After a routing change, flush old routing 1302 * and allocate a (hopefully) better one. 
1303 */ 1304 void 1305 in_rtchange(struct inpcb *inp, int err) 1306 { 1307 if (inp->inp_route.ro_rt) { 1308 rtfree(inp->inp_route.ro_rt); 1309 inp->inp_route.ro_rt = NULL; 1310 /* 1311 * A new route can be allocated the next time 1312 * output is attempted. 1313 */ 1314 } 1315 } 1316 1317 /* 1318 * Lookup a PCB based on the local address and port. 1319 */ 1320 struct inpcb * 1321 in_pcblookup_local(struct inpcbportinfo *portinfo, struct in_addr laddr, 1322 u_int lport_arg, int wild_okay, struct ucred *cred) 1323 { 1324 struct inpcb *inp; 1325 int matchwild = 3, wildcard; 1326 u_short lport = lport_arg; 1327 struct inpcbporthead *porthash; 1328 struct inpcbport *phd; 1329 struct inpcb *match = NULL; 1330 1331 /* 1332 * If the porthashbase is shared across several cpus, it must 1333 * have been locked. 1334 */ 1335 ASSERT_PORT_TOKEN_HELD(portinfo); 1336 1337 /* 1338 * Best fit PCB lookup. 1339 * 1340 * First see if this local port is in use by looking on the 1341 * port hash list. 1342 */ 1343 porthash = &portinfo->porthashbase[ 1344 INP_PCBPORTHASH(lport, portinfo->porthashmask)]; 1345 LIST_FOREACH(phd, porthash, phd_hash) { 1346 if (phd->phd_port == lport) 1347 break; 1348 } 1349 if (phd != NULL) { 1350 /* 1351 * Port is in use by one or more PCBs. Look for best 1352 * fit. 
1353 */ 1354 LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { 1355 wildcard = 0; 1356 #ifdef INET6 1357 if ((inp->inp_vflag & INP_IPV4) == 0) 1358 continue; 1359 #endif 1360 if (inp->inp_faddr.s_addr != INADDR_ANY) 1361 wildcard++; 1362 if (inp->inp_laddr.s_addr != INADDR_ANY) { 1363 if (laddr.s_addr == INADDR_ANY) 1364 wildcard++; 1365 else if (inp->inp_laddr.s_addr != laddr.s_addr) 1366 continue; 1367 } else { 1368 if (laddr.s_addr != INADDR_ANY) 1369 wildcard++; 1370 } 1371 if (wildcard && !wild_okay) 1372 continue; 1373 if (wildcard < matchwild && 1374 (cred == NULL || 1375 cred->cr_prison == 1376 inp->inp_socket->so_cred->cr_prison)) { 1377 match = inp; 1378 matchwild = wildcard; 1379 if (matchwild == 0) { 1380 break; 1381 } 1382 } 1383 } 1384 } 1385 return (match); 1386 } 1387 1388 struct inpcb * 1389 in_pcblocalgroup_last(const struct inpcbinfo *pcbinfo, 1390 const struct inpcb *inp) 1391 { 1392 const struct inp_localgrphead *hdr; 1393 const struct inp_localgroup *grp; 1394 int i; 1395 1396 if (pcbinfo->localgrphashbase == NULL) 1397 return NULL; 1398 1399 GET_PCBINFO_TOKEN(pcbinfo); 1400 1401 hdr = &pcbinfo->localgrphashbase[ 1402 INP_PCBLOCALGRPHASH(inp->inp_lport, pcbinfo->localgrphashmask)]; 1403 1404 LIST_FOREACH(grp, hdr, il_list) { 1405 if (grp->il_vflag == inp->inp_vflag && 1406 grp->il_lport == inp->inp_lport && 1407 memcmp(&grp->il_dependladdr, 1408 &inp->inp_inc.inc_ie.ie_dependladdr, 1409 sizeof(grp->il_dependladdr)) == 0) { 1410 break; 1411 } 1412 } 1413 if (grp == NULL || grp->il_inpcnt == 1) { 1414 REL_PCBINFO_TOKEN(pcbinfo); 1415 return NULL; 1416 } 1417 1418 KASSERT(grp->il_inpcnt >= 2, 1419 ("invalid localgroup inp count %d", grp->il_inpcnt)); 1420 for (i = 0; i < grp->il_inpcnt; ++i) { 1421 if (grp->il_inp[i] == inp) { 1422 int last = grp->il_inpcnt - 1; 1423 1424 if (i == last) 1425 last = grp->il_inpcnt - 2; 1426 REL_PCBINFO_TOKEN(pcbinfo); 1427 return grp->il_inp[last]; 1428 } 1429 } 1430 REL_PCBINFO_TOKEN(pcbinfo); 1431 return 
NULL; 1432 } 1433 1434 static struct inpcb * 1435 inp_localgroup_lookup(const struct inpcbinfo *pcbinfo, 1436 struct in_addr laddr, uint16_t lport, uint32_t pkt_hash) 1437 { 1438 struct inpcb *local_wild = NULL; 1439 const struct inp_localgrphead *hdr; 1440 const struct inp_localgroup *grp; 1441 1442 ASSERT_PCBINFO_TOKEN_HELD(pcbinfo); 1443 1444 hdr = &pcbinfo->localgrphashbase[ 1445 INP_PCBLOCALGRPHASH(lport, pcbinfo->localgrphashmask)]; 1446 #ifdef INP_LOCALGROUP_HASHTHR 1447 pkt_hash >>= ncpus2_shift; 1448 #endif 1449 1450 /* 1451 * Order of socket selection: 1452 * 1. non-wild. 1453 * 2. wild. 1454 * 1455 * NOTE: 1456 * - Local group does not contain jailed sockets 1457 * - Local group does not contain IPv4 mapped INET6 wild sockets 1458 */ 1459 LIST_FOREACH(grp, hdr, il_list) { 1460 #ifdef INET6 1461 if (!(grp->il_vflag & INP_IPV4)) 1462 continue; 1463 #endif 1464 if (grp->il_lport == lport) { 1465 int idx; 1466 1467 #ifdef INP_LOCALGROUP_HASHTHR 1468 idx = pkt_hash / grp->il_factor; 1469 KASSERT(idx < grp->il_inpcnt && idx >= 0, 1470 ("invalid hash %04x, cnt %d or fact %d", 1471 pkt_hash, grp->il_inpcnt, grp->il_factor)); 1472 #else 1473 /* 1474 * Modulo-N is used here, which greatly reduces 1475 * completion queue token contention, thus more 1476 * cpu time is saved. 1477 */ 1478 idx = pkt_hash % grp->il_inpcnt; 1479 #endif 1480 1481 if (grp->il_laddr.s_addr == laddr.s_addr) 1482 return grp->il_inp[idx]; 1483 else if (grp->il_laddr.s_addr == INADDR_ANY) 1484 local_wild = grp->il_inp[idx]; 1485 } 1486 } 1487 if (local_wild != NULL) 1488 return local_wild; 1489 return NULL; 1490 } 1491 1492 /* 1493 * Lookup PCB in hash list. 
1494 */ 1495 struct inpcb * 1496 in_pcblookup_pkthash(struct inpcbinfo *pcbinfo, struct in_addr faddr, 1497 u_int fport_arg, struct in_addr laddr, u_int lport_arg, 1498 boolean_t wildcard, struct ifnet *ifp, const struct mbuf *m) 1499 { 1500 struct inpcbhead *head; 1501 struct inpcb *inp, *jinp=NULL; 1502 u_short fport = fport_arg, lport = lport_arg; 1503 1504 /* 1505 * First look for an exact match. 1506 */ 1507 head = &pcbinfo->hashbase[INP_PCBCONNHASH(faddr.s_addr, fport, 1508 laddr.s_addr, lport, pcbinfo->hashmask)]; 1509 LIST_FOREACH(inp, head, inp_hash) { 1510 #ifdef INET6 1511 if (!(inp->inp_vflag & INP_IPV4)) 1512 continue; 1513 #endif 1514 if (in_hosteq(inp->inp_faddr, faddr) && 1515 in_hosteq(inp->inp_laddr, laddr) && 1516 inp->inp_fport == fport && inp->inp_lport == lport) { 1517 /* found */ 1518 if (inp->inp_socket == NULL || 1519 inp->inp_socket->so_cred->cr_prison == NULL) { 1520 return (inp); 1521 } else { 1522 if (jinp == NULL) 1523 jinp = inp; 1524 } 1525 } 1526 } 1527 if (jinp != NULL) 1528 return (jinp); 1529 1530 if (wildcard) { 1531 struct inpcb *local_wild = NULL; 1532 struct inpcb *jinp_wild = NULL; 1533 #ifdef INET6 1534 struct inpcb *local_wild_mapped = NULL; 1535 #endif 1536 struct inpcontainer *ic; 1537 struct inpcontainerhead *chead; 1538 struct sockaddr_in jsin; 1539 struct ucred *cred; 1540 1541 GET_PCBINFO_TOKEN(pcbinfo); 1542 1543 /* 1544 * Check local group first 1545 */ 1546 if (pcbinfo->localgrphashbase != NULL && 1547 m != NULL && (m->m_flags & M_HASH) && 1548 !(ifp && ifp->if_type == IFT_FAITH)) { 1549 inp = inp_localgroup_lookup(pcbinfo, 1550 laddr, lport, m->m_pkthdr.hash); 1551 if (inp != NULL) { 1552 REL_PCBINFO_TOKEN(pcbinfo); 1553 return inp; 1554 } 1555 } 1556 1557 /* 1558 * Order of socket selection: 1559 * 1. non-jailed, non-wild. 1560 * 2. non-jailed, wild. 1561 * 3. jailed, non-wild. 1562 * 4. jailed, wild. 
1563 */ 1564 jsin.sin_family = AF_INET; 1565 chead = &pcbinfo->wildcardhashbase[ 1566 INP_PCBWILDCARDHASH(lport, pcbinfo->wildcardhashmask)]; 1567 LIST_FOREACH(ic, chead, ic_list) { 1568 inp = ic->ic_inp; 1569 if (inp->inp_flags & INP_PLACEMARKER) 1570 continue; 1571 1572 jsin.sin_addr.s_addr = laddr.s_addr; 1573 #ifdef INET6 1574 if (!(inp->inp_vflag & INP_IPV4)) 1575 continue; 1576 #endif 1577 if (inp->inp_socket != NULL) 1578 cred = inp->inp_socket->so_cred; 1579 else 1580 cred = NULL; 1581 if (cred != NULL && jailed(cred)) { 1582 if (jinp != NULL) 1583 continue; 1584 else 1585 if (!jailed_ip(cred->cr_prison, 1586 (struct sockaddr *)&jsin)) 1587 continue; 1588 } 1589 if (inp->inp_lport == lport) { 1590 if (ifp && ifp->if_type == IFT_FAITH && 1591 !(inp->inp_flags & INP_FAITH)) 1592 continue; 1593 if (inp->inp_laddr.s_addr == laddr.s_addr) { 1594 if (cred != NULL && jailed(cred)) { 1595 jinp = inp; 1596 } else { 1597 REL_PCBINFO_TOKEN(pcbinfo); 1598 return (inp); 1599 } 1600 } 1601 if (inp->inp_laddr.s_addr == INADDR_ANY) { 1602 #ifdef INET6 1603 if (INP_CHECK_SOCKAF(inp->inp_socket, 1604 AF_INET6)) 1605 local_wild_mapped = inp; 1606 else 1607 #endif 1608 if (cred != NULL && 1609 jailed(cred)) 1610 jinp_wild = inp; 1611 else 1612 local_wild = inp; 1613 } 1614 } 1615 } 1616 1617 REL_PCBINFO_TOKEN(pcbinfo); 1618 1619 if (local_wild != NULL) 1620 return (local_wild); 1621 #ifdef INET6 1622 if (local_wild_mapped != NULL) 1623 return (local_wild_mapped); 1624 #endif 1625 if (jinp != NULL) 1626 return (jinp); 1627 return (jinp_wild); 1628 } 1629 1630 /* 1631 * Not found. 
1632 */ 1633 return (NULL); 1634 } 1635 1636 struct inpcb * 1637 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, 1638 u_int fport_arg, struct in_addr laddr, u_int lport_arg, 1639 boolean_t wildcard, struct ifnet *ifp) 1640 { 1641 return in_pcblookup_pkthash(pcbinfo, faddr, fport_arg, 1642 laddr, lport_arg, wildcard, ifp, NULL); 1643 } 1644 1645 /* 1646 * Insert PCB into connection hash table. 1647 */ 1648 void 1649 in_pcbinsconnhash(struct inpcb *inp) 1650 { 1651 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 1652 struct inpcbhead *bucket; 1653 u_int32_t hashkey_faddr, hashkey_laddr; 1654 1655 #ifdef INET6 1656 if (inp->inp_vflag & INP_IPV6) { 1657 hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX JH */; 1658 hashkey_laddr = inp->in6p_laddr.s6_addr32[3] /* XXX JH */; 1659 } else { 1660 #endif 1661 hashkey_faddr = inp->inp_faddr.s_addr; 1662 hashkey_laddr = inp->inp_laddr.s_addr; 1663 #ifdef INET6 1664 } 1665 #endif 1666 1667 KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu), 1668 ("not in the correct netisr")); 1669 KASSERT(!(inp->inp_flags & INP_WILDCARD), ("already on wildcardhash")); 1670 KASSERT(!(inp->inp_flags & INP_CONNECTED), ("already on connhash")); 1671 inp->inp_flags |= INP_CONNECTED; 1672 1673 /* 1674 * Insert into the connection hash table. 1675 */ 1676 bucket = &pcbinfo->hashbase[INP_PCBCONNHASH(hashkey_faddr, 1677 inp->inp_fport, hashkey_laddr, inp->inp_lport, pcbinfo->hashmask)]; 1678 LIST_INSERT_HEAD(bucket, inp, inp_hash); 1679 } 1680 1681 /* 1682 * Remove PCB from connection hash table. 
1683 */ 1684 void 1685 in_pcbremconnhash(struct inpcb *inp) 1686 { 1687 struct inpcbinfo *pcbinfo __debugvar = inp->inp_pcbinfo; 1688 1689 KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu), 1690 ("not in the correct netisr")); 1691 KASSERT(inp->inp_flags & INP_CONNECTED, ("inp not connected")); 1692 1693 LIST_REMOVE(inp, inp_hash); 1694 inp->inp_flags &= ~INP_CONNECTED; 1695 } 1696 1697 /* 1698 * Insert PCB into port hash table. 1699 */ 1700 void 1701 in_pcbinsporthash(struct inpcbportinfo *portinfo, struct inpcb *inp) 1702 { 1703 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 1704 struct inpcbporthead *pcbporthash; 1705 struct inpcbport *phd; 1706 1707 /* 1708 * If the porthashbase is shared across several cpus, it must 1709 * have been locked. 1710 */ 1711 ASSERT_PORT_TOKEN_HELD(portinfo); 1712 1713 /* 1714 * Insert into the port hash table. 1715 */ 1716 pcbporthash = &portinfo->porthashbase[ 1717 INP_PCBPORTHASH(inp->inp_lport, portinfo->porthashmask)]; 1718 1719 /* Go through port list and look for a head for this lport. */ 1720 LIST_FOREACH(phd, pcbporthash, phd_hash) { 1721 if (phd->phd_port == inp->inp_lport) 1722 break; 1723 } 1724 1725 /* If none exists, use saved one and tack it on. */ 1726 if (phd == NULL) { 1727 KKASSERT(pcbinfo->portsave != NULL); 1728 phd = pcbinfo->portsave; 1729 pcbinfo->portsave = NULL; 1730 phd->phd_port = inp->inp_lport; 1731 LIST_INIT(&phd->phd_pcblist); 1732 LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); 1733 } 1734 1735 inp->inp_portinfo = portinfo; 1736 inp->inp_phd = phd; 1737 LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); 1738 1739 /* 1740 * Malloc one inpcbport for later use. It is safe to use 1741 * "wait" malloc here (port token would be released, if 1742 * malloc ever blocked), since all changes to the porthash 1743 * are done. 
1744 */ 1745 if (pcbinfo->portsave == NULL) { 1746 pcbinfo->portsave = kmalloc(sizeof(*pcbinfo->portsave), 1747 M_PCB, M_INTWAIT | M_ZERO); 1748 } 1749 } 1750 1751 void 1752 in_pcbinsporthash_lport(struct inpcb *inp) 1753 { 1754 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 1755 struct inpcbportinfo *portinfo; 1756 u_short lport_ho; 1757 1758 /* Locate the proper portinfo based on lport */ 1759 lport_ho = ntohs(inp->inp_lport); 1760 portinfo = &pcbinfo->portinfo[lport_ho & pcbinfo->portinfo_mask]; 1761 KKASSERT((lport_ho & pcbinfo->portinfo_mask) == portinfo->offset); 1762 1763 GET_PORT_TOKEN(portinfo); 1764 in_pcbinsporthash(portinfo, inp); 1765 REL_PORT_TOKEN(portinfo); 1766 } 1767 1768 static struct inp_localgroup * 1769 inp_localgroup_alloc(u_char vflag, 1770 uint16_t port, const union in_dependaddr *addr, int size) 1771 { 1772 struct inp_localgroup *grp; 1773 1774 grp = kmalloc(__offsetof(struct inp_localgroup, il_inp[size]), 1775 M_TEMP, M_INTWAIT | M_ZERO); 1776 grp->il_vflag = vflag; 1777 grp->il_lport = port; 1778 grp->il_dependladdr = *addr; 1779 grp->il_inpsiz = size; 1780 1781 return grp; 1782 } 1783 1784 static void 1785 inp_localgroup_free(struct inp_localgroup *grp) 1786 { 1787 kfree(grp, M_TEMP); 1788 } 1789 1790 static void 1791 inp_localgroup_destroy(struct inp_localgroup *grp) 1792 { 1793 LIST_REMOVE(grp, il_list); 1794 inp_localgroup_free(grp); 1795 } 1796 1797 static void 1798 inp_localgroup_copy(struct inp_localgroup *grp, 1799 const struct inp_localgroup *old_grp) 1800 { 1801 int i; 1802 1803 KASSERT(old_grp->il_inpcnt < grp->il_inpsiz, 1804 ("invalid new local group size %d and old local group count %d", 1805 grp->il_inpsiz, old_grp->il_inpcnt)); 1806 for (i = 0; i < old_grp->il_inpcnt; ++i) 1807 grp->il_inp[i] = old_grp->il_inp[i]; 1808 grp->il_inpcnt = old_grp->il_inpcnt; 1809 grp->il_factor = old_grp->il_factor; 1810 } 1811 1812 static void 1813 inp_localgroup_factor(struct inp_localgroup *grp) 1814 { 1815 grp->il_factor = 1816 
((uint32_t)(0xffff >> ncpus2_shift) / grp->il_inpcnt) + 1; 1817 KASSERT(grp->il_factor != 0, ("invalid local group factor, " 1818 "ncpus2_shift %d, inpcnt %d", ncpus2_shift, grp->il_inpcnt)); 1819 } 1820 1821 static void 1822 in_pcbinslocalgrphash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo) 1823 { 1824 struct inp_localgrphead *hdr; 1825 struct inp_localgroup *grp, *grp_alloc = NULL; 1826 struct ucred *cred; 1827 1828 ASSERT_PCBINFO_TOKEN_HELD(pcbinfo); 1829 1830 if (pcbinfo->localgrphashbase == NULL) 1831 return; 1832 1833 /* 1834 * XXX don't allow jailed socket to join local group 1835 */ 1836 if (inp->inp_socket != NULL) 1837 cred = inp->inp_socket->so_cred; 1838 else 1839 cred = NULL; 1840 if (cred != NULL && jailed(cred)) 1841 return; 1842 1843 #ifdef INET6 1844 /* 1845 * XXX don't allow IPv4 mapped INET6 wild socket 1846 */ 1847 if ((inp->inp_vflag & INP_IPV4) && 1848 inp->inp_laddr.s_addr == INADDR_ANY && 1849 INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) 1850 return; 1851 #endif 1852 1853 hdr = &pcbinfo->localgrphashbase[ 1854 INP_PCBLOCALGRPHASH(inp->inp_lport, pcbinfo->localgrphashmask)]; 1855 1856 again: 1857 LIST_FOREACH(grp, hdr, il_list) { 1858 if (grp->il_vflag == inp->inp_vflag && 1859 grp->il_lport == inp->inp_lport && 1860 memcmp(&grp->il_dependladdr, 1861 &inp->inp_inc.inc_ie.ie_dependladdr, 1862 sizeof(grp->il_dependladdr)) == 0) { 1863 break; 1864 } 1865 } 1866 if (grp == NULL) { 1867 /* 1868 * Create a new local group 1869 */ 1870 if (grp_alloc == NULL) { 1871 grp_alloc = inp_localgroup_alloc(inp->inp_vflag, 1872 inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr, 1873 INP_LOCALGROUP_SIZMIN); 1874 /* 1875 * Local group allocation could block and the 1876 * local group w/ the same property might have 1877 * been added by others when we were blocked; 1878 * check again. 
1879 */ 1880 goto again; 1881 } else { 1882 /* Local group has been allocated; link it */ 1883 grp = grp_alloc; 1884 grp_alloc = NULL; 1885 LIST_INSERT_HEAD(hdr, grp, il_list); 1886 } 1887 } else if (grp->il_inpcnt == grp->il_inpsiz) { 1888 if (grp->il_inpsiz >= INP_LOCALGROUP_SIZMAX) { 1889 static int limit_logged = 0; 1890 1891 if (!limit_logged) { 1892 limit_logged = 1; 1893 kprintf("local group port %d, " 1894 "limit reached\n", ntohs(grp->il_lport)); 1895 } 1896 if (grp_alloc != NULL) { 1897 /* 1898 * This would happen if the local group 1899 * w/ the same property was expanded when 1900 * our local group allocation blocked. 1901 */ 1902 inp_localgroup_free(grp_alloc); 1903 } 1904 return; 1905 } 1906 1907 /* 1908 * Expand this local group 1909 */ 1910 if (grp_alloc == NULL || 1911 grp->il_inpcnt >= grp_alloc->il_inpsiz) { 1912 if (grp_alloc != NULL) 1913 inp_localgroup_free(grp_alloc); 1914 grp_alloc = inp_localgroup_alloc(grp->il_vflag, 1915 grp->il_lport, &grp->il_dependladdr, 1916 grp->il_inpsiz * 2); 1917 /* 1918 * Local group allocation could block and the 1919 * local group w/ the same property might have 1920 * been expanded by others when we were blocked; 1921 * check again. 1922 */ 1923 goto again; 1924 } 1925 1926 /* 1927 * Save the old local group, link the new one, and then 1928 * destroy the old local group 1929 */ 1930 inp_localgroup_copy(grp_alloc, grp); 1931 LIST_INSERT_HEAD(hdr, grp_alloc, il_list); 1932 inp_localgroup_destroy(grp); 1933 1934 grp = grp_alloc; 1935 grp_alloc = NULL; 1936 } else { 1937 /* 1938 * Found the local group 1939 */ 1940 if (grp_alloc != NULL) { 1941 /* 1942 * This would happen if the local group w/ the 1943 * same property was added or expanded when our 1944 * local group allocation blocked. 
1945 */ 1946 inp_localgroup_free(grp_alloc); 1947 grp_alloc = NULL; 1948 } 1949 } 1950 1951 KASSERT(grp->il_inpcnt < grp->il_inpsiz, 1952 ("invalid local group size %d and count %d", 1953 grp->il_inpsiz, grp->il_inpcnt)); 1954 grp->il_inp[grp->il_inpcnt] = inp; 1955 grp->il_inpcnt++; 1956 inp_localgroup_factor(grp); 1957 } 1958 1959 void 1960 in_pcbinswildcardhash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo) 1961 { 1962 struct inpcontainer *ic; 1963 struct inpcontainerhead *bucket; 1964 1965 GET_PCBINFO_TOKEN(pcbinfo); 1966 1967 in_pcbinslocalgrphash_oncpu(inp, pcbinfo); 1968 1969 bucket = &pcbinfo->wildcardhashbase[ 1970 INP_PCBWILDCARDHASH(inp->inp_lport, pcbinfo->wildcardhashmask)]; 1971 1972 ic = kmalloc(sizeof(struct inpcontainer), M_TEMP, M_INTWAIT); 1973 ic->ic_inp = inp; 1974 LIST_INSERT_HEAD(bucket, ic, ic_list); 1975 1976 REL_PCBINFO_TOKEN(pcbinfo); 1977 } 1978 1979 /* 1980 * Insert PCB into wildcard hash table. 1981 */ 1982 void 1983 in_pcbinswildcardhash(struct inpcb *inp) 1984 { 1985 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 1986 1987 KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu), 1988 ("not in correct netisr")); 1989 KASSERT(!(inp->inp_flags & INP_CONNECTED), 1990 ("already on connhash")); 1991 KASSERT(!(inp->inp_flags & INP_WILDCARD), 1992 ("already on wildcardhash")); 1993 inp->inp_flags |= INP_WILDCARD; 1994 1995 in_pcbinswildcardhash_oncpu(inp, pcbinfo); 1996 } 1997 1998 static void 1999 in_pcbremlocalgrphash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo) 2000 { 2001 struct inp_localgrphead *hdr; 2002 struct inp_localgroup *grp; 2003 2004 ASSERT_PCBINFO_TOKEN_HELD(pcbinfo); 2005 2006 if (pcbinfo->localgrphashbase == NULL) 2007 return; 2008 2009 hdr = &pcbinfo->localgrphashbase[ 2010 INP_PCBLOCALGRPHASH(inp->inp_lport, pcbinfo->localgrphashmask)]; 2011 2012 LIST_FOREACH(grp, hdr, il_list) { 2013 int i; 2014 2015 for (i = 0; i < grp->il_inpcnt; ++i) { 2016 if (grp->il_inp[i] != inp) 2017 continue; 2018 2019 if 
(grp->il_inpcnt == 1) { 2020 /* Destroy this local group */ 2021 inp_localgroup_destroy(grp); 2022 } else { 2023 /* Pull up inpcbs */ 2024 for (; i + 1 < grp->il_inpcnt; ++i) 2025 grp->il_inp[i] = grp->il_inp[i + 1]; 2026 grp->il_inpcnt--; 2027 inp_localgroup_factor(grp); 2028 } 2029 return; 2030 } 2031 } 2032 } 2033 2034 void 2035 in_pcbremwildcardhash_oncpu(struct inpcb *inp, struct inpcbinfo *pcbinfo) 2036 { 2037 struct inpcontainer *ic; 2038 struct inpcontainerhead *head; 2039 2040 GET_PCBINFO_TOKEN(pcbinfo); 2041 2042 in_pcbremlocalgrphash_oncpu(inp, pcbinfo); 2043 2044 /* find bucket */ 2045 head = &pcbinfo->wildcardhashbase[ 2046 INP_PCBWILDCARDHASH(inp->inp_lport, pcbinfo->wildcardhashmask)]; 2047 2048 LIST_FOREACH(ic, head, ic_list) { 2049 if (ic->ic_inp == inp) 2050 goto found; 2051 } 2052 REL_PCBINFO_TOKEN(pcbinfo); 2053 return; /* not found! */ 2054 2055 found: 2056 LIST_REMOVE(ic, ic_list); /* remove container from bucket chain */ 2057 REL_PCBINFO_TOKEN(pcbinfo); 2058 kfree(ic, M_TEMP); /* deallocate container */ 2059 } 2060 2061 /* 2062 * Remove PCB from wildcard hash table. 2063 */ 2064 void 2065 in_pcbremwildcardhash(struct inpcb *inp) 2066 { 2067 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 2068 2069 KASSERT(&curthread->td_msgport == netisr_cpuport(pcbinfo->cpu), 2070 ("not in correct netisr")); 2071 KASSERT(inp->inp_flags & INP_WILDCARD, ("inp not wildcard")); 2072 2073 in_pcbremwildcardhash_oncpu(inp, pcbinfo); 2074 inp->inp_flags &= ~INP_WILDCARD; 2075 } 2076 2077 /* 2078 * Remove PCB from various lists. 2079 */ 2080 void 2081 in_pcbremlists(struct inpcb *inp) 2082 { 2083 if (inp->inp_lport) { 2084 struct inpcbportinfo *portinfo; 2085 struct inpcbport *phd; 2086 2087 /* 2088 * NOTE: 2089 * inp->inp_portinfo is _not_ necessary same as 2090 * inp->inp_pcbinfo->portinfo. 
2091 */ 2092 portinfo = inp->inp_portinfo; 2093 GET_PORT_TOKEN(portinfo); 2094 2095 phd = inp->inp_phd; 2096 LIST_REMOVE(inp, inp_portlist); 2097 if (LIST_FIRST(&phd->phd_pcblist) == NULL) { 2098 LIST_REMOVE(phd, phd_hash); 2099 kfree(phd, M_PCB); 2100 } 2101 2102 REL_PORT_TOKEN(portinfo); 2103 } 2104 if (inp->inp_flags & INP_WILDCARD) { 2105 in_pcbremwildcardhash(inp); 2106 } else if (inp->inp_flags & INP_CONNECTED) { 2107 in_pcbremconnhash(inp); 2108 } 2109 2110 if (inp->inp_flags & INP_ONLIST) 2111 in_pcbofflist(inp); 2112 } 2113 2114 int 2115 prison_xinpcb(struct thread *td, struct inpcb *inp) 2116 { 2117 struct ucred *cr; 2118 2119 if (td->td_proc == NULL) 2120 return (0); 2121 cr = td->td_proc->p_ucred; 2122 if (cr->cr_prison == NULL) 2123 return (0); 2124 if (inp->inp_socket && inp->inp_socket->so_cred && 2125 inp->inp_socket->so_cred->cr_prison && 2126 cr->cr_prison == inp->inp_socket->so_cred->cr_prison) 2127 return (0); 2128 return (1); 2129 } 2130 2131 int 2132 in_pcblist_global(SYSCTL_HANDLER_ARGS) 2133 { 2134 struct inpcbinfo *pcbinfo_arr = arg1; 2135 int pcbinfo_arrlen = arg2; 2136 struct inpcb *marker; 2137 int cpu, origcpu; 2138 int error, n; 2139 2140 KASSERT(pcbinfo_arrlen <= ncpus && pcbinfo_arrlen >= 1, 2141 ("invalid pcbinfo count %d", pcbinfo_arrlen)); 2142 2143 /* 2144 * The process of preparing the TCB list is too time-consuming and 2145 * resource-intensive to repeat twice on every request. 2146 */ 2147 n = 0; 2148 if (req->oldptr == NULL) { 2149 for (cpu = 0; cpu < pcbinfo_arrlen; ++cpu) 2150 n += pcbinfo_arr[cpu].ipi_count; 2151 req->oldidx = (n + n/8 + 10) * sizeof(struct xinpcb); 2152 return 0; 2153 } 2154 2155 if (req->newptr != NULL) 2156 return EPERM; 2157 2158 marker = kmalloc(sizeof(struct inpcb), M_TEMP, M_WAITOK|M_ZERO); 2159 marker->inp_flags |= INP_PLACEMARKER; 2160 2161 /* 2162 * OK, now we're committed to doing something. Re-fetch ipi_count 2163 * after obtaining the generation count. 
2164 */ 2165 error = 0; 2166 origcpu = mycpuid; 2167 for (cpu = 0; cpu < pcbinfo_arrlen && error == 0; ++cpu) { 2168 struct inpcbinfo *pcbinfo = &pcbinfo_arr[cpu]; 2169 struct inpcb *inp; 2170 struct xinpcb xi; 2171 int i; 2172 2173 lwkt_migratecpu(cpu); 2174 2175 GET_PCBINFO_TOKEN(pcbinfo); 2176 2177 n = pcbinfo->ipi_count; 2178 2179 LIST_INSERT_HEAD(&pcbinfo->pcblisthead, marker, inp_list); 2180 i = 0; 2181 while ((inp = LIST_NEXT(marker, inp_list)) != NULL && i < n) { 2182 LIST_REMOVE(marker, inp_list); 2183 LIST_INSERT_AFTER(inp, marker, inp_list); 2184 2185 if (inp->inp_flags & INP_PLACEMARKER) 2186 continue; 2187 if (prison_xinpcb(req->td, inp)) 2188 continue; 2189 2190 bzero(&xi, sizeof xi); 2191 xi.xi_len = sizeof xi; 2192 bcopy(inp, &xi.xi_inp, sizeof *inp); 2193 if (inp->inp_socket) 2194 sotoxsocket(inp->inp_socket, &xi.xi_socket); 2195 if ((error = SYSCTL_OUT(req, &xi, sizeof xi)) != 0) 2196 break; 2197 ++i; 2198 } 2199 LIST_REMOVE(marker, inp_list); 2200 2201 REL_PCBINFO_TOKEN(pcbinfo); 2202 2203 if (error == 0 && i < n) { 2204 bzero(&xi, sizeof xi); 2205 xi.xi_len = sizeof xi; 2206 while (i < n) { 2207 error = SYSCTL_OUT(req, &xi, sizeof xi); 2208 if (error) 2209 break; 2210 ++i; 2211 } 2212 } 2213 } 2214 2215 lwkt_migratecpu(origcpu); 2216 kfree(marker, M_TEMP); 2217 return error; 2218 } 2219 2220 int 2221 in_pcblist_global_ncpus2(SYSCTL_HANDLER_ARGS) 2222 { 2223 return in_pcblist_global(oidp, arg1, ncpus2, req); 2224 } 2225 2226 void 2227 in_savefaddr(struct socket *so, const struct sockaddr *faddr) 2228 { 2229 struct sockaddr_in *sin; 2230 2231 KASSERT(faddr->sa_family == AF_INET, 2232 ("not AF_INET faddr %d", faddr->sa_family)); 2233 2234 sin = kmalloc(sizeof(*sin), M_SONAME, M_WAITOK | M_ZERO); 2235 sin->sin_family = AF_INET; 2236 sin->sin_len = sizeof(*sin); 2237 sin->sin_port = ((const struct sockaddr_in *)faddr)->sin_port; 2238 sin->sin_addr = ((const struct sockaddr_in *)faddr)->sin_addr; 2239 2240 so->so_faddr = (struct sockaddr *)sin; 2241 } 
2242 2243 void 2244 in_pcbportinfo_init(struct inpcbportinfo *portinfo, int hashsize, 2245 boolean_t shared, u_short offset) 2246 { 2247 memset(portinfo, 0, sizeof(*portinfo)); 2248 2249 portinfo->offset = offset; 2250 portinfo->lastport = offset; 2251 portinfo->lastlow = offset; 2252 portinfo->lasthi = offset; 2253 2254 portinfo->porthashbase = hashinit(hashsize, M_PCB, 2255 &portinfo->porthashmask); 2256 2257 if (shared) { 2258 portinfo->porttoken = kmalloc(sizeof(struct lwkt_token), 2259 M_PCB, M_WAITOK); 2260 lwkt_token_init(portinfo->porttoken, "porttoken"); 2261 } 2262 } 2263 2264 void 2265 in_pcbportrange(u_short *hi0, u_short *lo0, u_short ofs, u_short step) 2266 { 2267 int hi, lo; 2268 2269 if (step == 1) 2270 return; 2271 2272 hi = *hi0; 2273 lo = *lo0; 2274 2275 hi = rounddown2(hi, step); 2276 hi += ofs; 2277 if (hi > (int)*hi0) 2278 hi -= step; 2279 2280 lo = roundup2(lo, step); 2281 lo -= (step - ofs); 2282 if (lo < (int)*lo0) 2283 lo += step; 2284 2285 *hi0 = hi; 2286 *lo0 = lo; 2287 } 2288 2289 void 2290 in_pcbglobalinit(void) 2291 { 2292 int cpu; 2293 2294 in_pcbmarkers = kmalloc(ncpus * sizeof(struct inpcb), M_PCB, 2295 M_WAITOK | M_ZERO); 2296 in_pcbcontainer_markers = kmalloc(ncpus * sizeof(struct inpcontainer), 2297 M_PCB, M_WAITOK | M_ZERO); 2298 2299 for (cpu = 0; cpu < ncpus; ++cpu) { 2300 struct inpcontainer *ic = &in_pcbcontainer_markers[cpu]; 2301 struct inpcb *marker = &in_pcbmarkers[cpu]; 2302 2303 marker->inp_flags |= INP_PLACEMARKER; 2304 ic->ic_inp = marker; 2305 } 2306 } 2307 2308 struct inpcb * 2309 in_pcbmarker(int cpuid) 2310 { 2311 KASSERT(cpuid >= 0 && cpuid < ncpus, ("invalid cpuid %d", cpuid)); 2312 KASSERT(curthread->td_type == TD_TYPE_NETISR, ("not in netisr")); 2313 2314 return &in_pcbmarkers[cpuid]; 2315 } 2316 2317 struct inpcontainer * 2318 in_pcbcontainer_marker(int cpuid) 2319 { 2320 KASSERT(cpuid >= 0 && cpuid < ncpus, ("invalid cpuid %d", cpuid)); 2321 KASSERT(curthread->td_type == TD_TYPE_NETISR, ("not in netisr")); 
2322 2323 return &in_pcbcontainer_markers[cpuid]; 2324 } 2325