1 /* $OpenBSD: ip_output.c,v 1.402 2025/01/03 21:27:40 bluhm Exp $ */ 2 /* $NetBSD: ip_output.c,v 1.28 1996/02/13 23:43:07 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1990, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 
31 * 32 * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 33 */ 34 35 #include "pf.h" 36 37 #include <sys/param.h> 38 #include <sys/systm.h> 39 #include <sys/mbuf.h> 40 #include <sys/protosw.h> 41 #include <sys/socket.h> 42 #include <sys/socketvar.h> 43 #include <sys/proc.h> 44 #include <sys/kernel.h> 45 46 #include <net/if.h> 47 #include <net/if_var.h> 48 #include <net/if_enc.h> 49 #include <net/route.h> 50 51 #include <netinet/in.h> 52 #include <netinet/ip.h> 53 #include <netinet/in_pcb.h> 54 #include <netinet/in_var.h> 55 #include <netinet/ip_var.h> 56 #include <netinet/ip_icmp.h> 57 #include <netinet/tcp.h> 58 #include <netinet/udp.h> 59 #include <netinet/tcp_timer.h> 60 #include <netinet/tcp_var.h> 61 #include <netinet/udp_var.h> 62 63 #if NPF > 0 64 #include <net/pfvar.h> 65 #endif 66 67 #ifdef IPSEC 68 #ifdef ENCDEBUG 69 #define DPRINTF(fmt, args...) \ 70 do { \ 71 if (encdebug) \ 72 printf("%s: " fmt "\n", __func__, ## args); \ 73 } while (0) 74 #else 75 #define DPRINTF(fmt, args...) \ 76 do { } while (0) 77 #endif 78 #endif /* IPSEC */ 79 80 int ip_pcbopts(struct mbuf **, struct mbuf *); 81 int ip_multicast_if(struct ip_mreqn *, u_int, unsigned int *); 82 int ip_setmoptions(int, struct ip_moptions **, struct mbuf *, u_int); 83 void ip_mloopback(struct ifnet *, struct mbuf *, struct sockaddr_in *); 84 static u_int16_t in_cksum_phdr(u_int32_t, u_int32_t, u_int32_t); 85 void in_delayed_cksum(struct mbuf *); 86 87 int ip_output_ipsec_lookup(struct mbuf *m, int hlen, 88 const struct ipsec_level *seclevel, struct tdb **, int ipsecflowinfo); 89 void ip_output_ipsec_pmtu_update(struct tdb *, struct route *, struct in_addr, 90 int, int); 91 int ip_output_ipsec_send(struct tdb *, struct mbuf *, struct route *, int); 92 93 /* 94 * IP output. The packet in mbuf chain m contains a skeletal IP 95 * header (with len, off, ttl, proto, tos, src, dst). 96 * The mbuf chain containing the packet will be freed. 97 * The mbuf opt, if present, will not be freed. 
98 */ 99 int 100 ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, 101 struct ip_moptions *imo, const struct ipsec_level *seclevel, 102 u_int32_t ipsecflowinfo) 103 { 104 struct ip *ip; 105 struct ifnet *ifp = NULL; 106 struct mbuf_list ml; 107 int hlen = sizeof (struct ip); 108 int error = 0; 109 struct route iproute; 110 struct sockaddr_in *dst; 111 struct tdb *tdb = NULL; 112 u_long mtu; 113 #if NPF > 0 114 u_int orig_rtableid; 115 #endif 116 117 NET_ASSERT_LOCKED(); 118 119 #ifdef DIAGNOSTIC 120 if ((m->m_flags & M_PKTHDR) == 0) 121 panic("ip_output no HDR"); 122 #endif 123 if (opt) 124 m = ip_insertoptions(m, opt, &hlen); 125 126 ip = mtod(m, struct ip *); 127 128 /* 129 * Fill in IP header. 130 */ 131 if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { 132 ip->ip_v = IPVERSION; 133 ip->ip_off &= htons(IP_DF); 134 ip->ip_id = htons(ip_randomid()); 135 ip->ip_hl = hlen >> 2; 136 ipstat_inc(ips_localout); 137 } else { 138 hlen = ip->ip_hl << 2; 139 } 140 141 /* 142 * We should not send traffic to 0/8 say both Stevens and RFCs 143 * 5735 section 3 and 1122 sections 3.2.1.3 and 3.3.6. 144 */ 145 if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == 0) { 146 error = ENETUNREACH; 147 goto bad; 148 } 149 150 #if NPF > 0 151 orig_rtableid = m->m_pkthdr.ph_rtableid; 152 reroute: 153 #endif 154 155 /* 156 * Do a route lookup now in case we need the source address to 157 * do an SPD lookup in IPsec; for most packets, the source address 158 * is set at a higher level protocol. ICMPs and other packets 159 * though (e.g., traceroute) have a source address of zeroes. 160 */ 161 if (ro == NULL) { 162 ro = &iproute; 163 ro->ro_rt = NULL; 164 } 165 166 /* 167 * If there is a cached route, check that it is to the same 168 * destination and is still up. If not, free it and try again. 
169 */ 170 route_cache(ro, &ip->ip_dst, &ip->ip_src, m->m_pkthdr.ph_rtableid); 171 dst = &ro->ro_dstsin; 172 173 if ((IN_MULTICAST(ip->ip_dst.s_addr) || 174 (ip->ip_dst.s_addr == INADDR_BROADCAST)) && 175 imo != NULL && (ifp = if_get(imo->imo_ifidx)) != NULL) { 176 177 mtu = ifp->if_mtu; 178 if (ip->ip_src.s_addr == INADDR_ANY) { 179 struct in_ifaddr *ia; 180 181 IFP_TO_IA(ifp, ia); 182 if (ia != NULL) 183 ip->ip_src = ia->ia_addr.sin_addr; 184 } 185 } else { 186 struct in_ifaddr *ia; 187 188 if (ro->ro_rt == NULL) 189 ro->ro_rt = rtalloc_mpath(&ro->ro_dstsa, 190 &ip->ip_src.s_addr, ro->ro_tableid); 191 192 if (ro->ro_rt == NULL) { 193 ipstat_inc(ips_noroute); 194 error = EHOSTUNREACH; 195 goto bad; 196 } 197 198 ia = ifatoia(ro->ro_rt->rt_ifa); 199 if (ISSET(ro->ro_rt->rt_flags, RTF_LOCAL)) 200 ifp = if_get(rtable_loindex(m->m_pkthdr.ph_rtableid)); 201 else 202 ifp = if_get(ro->ro_rt->rt_ifidx); 203 /* 204 * We aren't using rtisvalid() here because the UP/DOWN state 205 * machine is broken with some Ethernet drivers like em(4). 206 * As a result we might try to use an invalid cached route 207 * entry while an interface is being detached. 208 */ 209 if (ifp == NULL) { 210 ipstat_inc(ips_noroute); 211 error = EHOSTUNREACH; 212 goto bad; 213 } 214 mtu = atomic_load_int(&ro->ro_rt->rt_mtu); 215 if (mtu == 0) 216 mtu = ifp->if_mtu; 217 218 if (ro->ro_rt->rt_flags & RTF_GATEWAY) 219 dst = satosin(ro->ro_rt->rt_gateway); 220 221 /* Set the source IP address */ 222 if (ip->ip_src.s_addr == INADDR_ANY && ia) 223 ip->ip_src = ia->ia_addr.sin_addr; 224 } 225 226 #ifdef IPSEC 227 if (ipsec_in_use || seclevel != NULL) { 228 /* Do we have any pending SAs to apply ? */ 229 error = ip_output_ipsec_lookup(m, hlen, seclevel, &tdb, 230 ipsecflowinfo); 231 if (error) { 232 /* Should silently drop packet */ 233 if (error == -EINVAL) 234 error = 0; 235 goto bad; 236 } 237 if (tdb != NULL) { 238 /* 239 * If it needs TCP/UDP hardware-checksumming, do the 240 * computation now. 
241 */ 242 in_proto_cksum_out(m, NULL); 243 } 244 } 245 #endif /* IPSEC */ 246 247 if (IN_MULTICAST(ip->ip_dst.s_addr) || 248 (ip->ip_dst.s_addr == INADDR_BROADCAST)) { 249 250 m->m_flags |= (ip->ip_dst.s_addr == INADDR_BROADCAST) ? 251 M_BCAST : M_MCAST; 252 253 /* 254 * IP destination address is multicast. Make sure "dst" 255 * still points to the address in "ro". (It may have been 256 * changed to point to a gateway address, above.) 257 */ 258 dst = &ro->ro_dstsin; 259 260 /* 261 * See if the caller provided any multicast options 262 */ 263 if (imo != NULL) 264 ip->ip_ttl = imo->imo_ttl; 265 else 266 ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; 267 268 /* 269 * if we don't know the outgoing ifp yet, we can't generate 270 * output 271 */ 272 if (!ifp) { 273 ipstat_inc(ips_noroute); 274 error = EHOSTUNREACH; 275 goto bad; 276 } 277 278 /* 279 * Confirm that the outgoing interface supports multicast, 280 * but only if the packet actually is going out on that 281 * interface (i.e., no IPsec is applied). 282 */ 283 if ((((m->m_flags & M_MCAST) && 284 (ifp->if_flags & IFF_MULTICAST) == 0) || 285 ((m->m_flags & M_BCAST) && 286 (ifp->if_flags & IFF_BROADCAST) == 0)) && (tdb == NULL)) { 287 ipstat_inc(ips_noroute); 288 error = ENETUNREACH; 289 goto bad; 290 } 291 292 /* 293 * If source address not specified yet, use address 294 * of outgoing interface. 295 */ 296 if (ip->ip_src.s_addr == INADDR_ANY) { 297 struct in_ifaddr *ia; 298 299 IFP_TO_IA(ifp, ia); 300 if (ia != NULL) 301 ip->ip_src = ia->ia_addr.sin_addr; 302 } 303 304 if ((imo == NULL || imo->imo_loop) && 305 in_hasmulti(&ip->ip_dst, ifp)) { 306 /* 307 * If we belong to the destination multicast group 308 * on the outgoing interface, and the caller did not 309 * forbid loopback, loop back a copy. 310 * Can't defer TCP/UDP checksumming, do the 311 * computation now. 
312 */ 313 in_proto_cksum_out(m, NULL); 314 ip_mloopback(ifp, m, dst); 315 } 316 #ifdef MROUTING 317 else { 318 /* 319 * If we are acting as a multicast router, perform 320 * multicast forwarding as if the packet had just 321 * arrived on the interface to which we are about 322 * to send. The multicast forwarding function 323 * recursively calls this function, using the 324 * IP_FORWARDING flag to prevent infinite recursion. 325 * 326 * Multicasts that are looped back by ip_mloopback(), 327 * above, will be forwarded by the ip_input() routine, 328 * if necessary. 329 */ 330 if (ipmforwarding && ip_mrouter[ifp->if_rdomain] && 331 (flags & IP_FORWARDING) == 0) { 332 int rv; 333 334 KERNEL_LOCK(); 335 rv = ip_mforward(m, ifp, flags); 336 KERNEL_UNLOCK(); 337 if (rv != 0) 338 goto bad; 339 } 340 } 341 #endif 342 /* 343 * Multicasts with a time-to-live of zero may be looped- 344 * back, above, but must not be transmitted on a network. 345 * Also, multicasts addressed to the loopback interface 346 * are not sent -- the above call to ip_mloopback() will 347 * loop back a copy if this host actually belongs to the 348 * destination group on the loopback interface. 349 */ 350 if (ip->ip_ttl == 0 || (ifp->if_flags & IFF_LOOPBACK) != 0) 351 goto bad; 352 353 goto sendit; 354 } 355 356 /* 357 * Look for broadcast address and verify user is allowed to send 358 * such a packet; if the packet is going in an IPsec tunnel, skip 359 * this check. 
360 */ 361 if ((tdb == NULL) && ((dst->sin_addr.s_addr == INADDR_BROADCAST) || 362 (ro && ro->ro_rt && ISSET(ro->ro_rt->rt_flags, RTF_BROADCAST)))) { 363 if ((ifp->if_flags & IFF_BROADCAST) == 0) { 364 error = EADDRNOTAVAIL; 365 goto bad; 366 } 367 if ((flags & IP_ALLOWBROADCAST) == 0) { 368 error = EACCES; 369 goto bad; 370 } 371 372 /* Don't allow broadcast messages to be fragmented */ 373 if (ntohs(ip->ip_len) > ifp->if_mtu) { 374 error = EMSGSIZE; 375 goto bad; 376 } 377 m->m_flags |= M_BCAST; 378 } else 379 m->m_flags &= ~M_BCAST; 380 381 sendit: 382 /* 383 * If we're doing Path MTU discovery, we need to set DF unless 384 * the route's MTU is locked. 385 */ 386 if ((flags & IP_MTUDISC) && ro && ro->ro_rt && 387 (ro->ro_rt->rt_locks & RTV_MTU) == 0) 388 ip->ip_off |= htons(IP_DF); 389 390 #ifdef IPSEC 391 /* 392 * Check if the packet needs encapsulation. 393 */ 394 if (tdb != NULL) { 395 /* Callee frees mbuf */ 396 error = ip_output_ipsec_send(tdb, m, ro, 397 (flags & IP_FORWARDING) ? 1 : 0); 398 goto done; 399 } 400 #endif /* IPSEC */ 401 402 /* 403 * Packet filter 404 */ 405 #if NPF > 0 406 if (pf_test(AF_INET, (flags & IP_FORWARDING) ? 
PF_FWD : PF_OUT, 407 ifp, &m) != PF_PASS) { 408 error = EACCES; 409 goto bad; 410 } 411 if (m == NULL) 412 goto done; 413 ip = mtod(m, struct ip *); 414 hlen = ip->ip_hl << 2; 415 if ((m->m_pkthdr.pf.flags & (PF_TAG_REROUTE | PF_TAG_GENERATED)) == 416 (PF_TAG_REROUTE | PF_TAG_GENERATED)) 417 /* already rerun the route lookup, go on */ 418 m->m_pkthdr.pf.flags &= ~(PF_TAG_GENERATED | PF_TAG_REROUTE); 419 else if (m->m_pkthdr.pf.flags & PF_TAG_REROUTE) { 420 /* tag as generated to skip over pf_test on rerun */ 421 m->m_pkthdr.pf.flags |= PF_TAG_GENERATED; 422 if (ro == &iproute) 423 rtfree(ro->ro_rt); 424 ro = NULL; 425 if_put(ifp); /* drop reference since target changed */ 426 ifp = NULL; 427 goto reroute; 428 } 429 #endif 430 431 #ifdef IPSEC 432 if (ISSET(flags, IP_FORWARDING) && ISSET(flags, IP_FORWARDING_IPSEC) && 433 !ISSET(m->m_pkthdr.ph_tagsset, PACKET_TAG_IPSEC_IN_DONE)) { 434 error = EHOSTUNREACH; 435 goto bad; 436 } 437 #endif 438 439 /* 440 * If TSO or small enough for interface, can just send directly. 441 */ 442 error = if_output_tso(ifp, &m, sintosa(dst), ro->ro_rt, mtu); 443 if (error || m == NULL) 444 goto done; 445 446 /* 447 * Too large for interface; fragment if possible. 448 * Must be able to put at least 8 bytes per fragment. 449 */ 450 if (ip->ip_off & htons(IP_DF)) { 451 #ifdef IPSEC 452 if (ip_mtudisc) 453 ipsec_adjust_mtu(m, ifp->if_mtu); 454 #endif 455 error = EMSGSIZE; 456 #if NPF > 0 457 /* pf changed routing table, use orig rtable for path MTU */ 458 if (ro->ro_tableid != orig_rtableid) { 459 rtfree(ro->ro_rt); 460 ro->ro_tableid = orig_rtableid; 461 ro->ro_rt = icmp_mtudisc_clone( 462 ro->ro_dstsin.sin_addr, ro->ro_tableid, 0); 463 } 464 #endif 465 /* 466 * This case can happen if the user changed the MTU 467 * of an interface after enabling IP on it. Because 468 * most netifs don't keep track of routes pointing to 469 * them, there is no way for one to update all its 470 * routes when the MTU is changed. 
471 */ 472 if (rtisvalid(ro->ro_rt) && 473 ISSET(ro->ro_rt->rt_flags, RTF_HOST) && 474 !(ro->ro_rt->rt_locks & RTV_MTU)) { 475 u_int rtmtu; 476 477 rtmtu = atomic_load_int(&ro->ro_rt->rt_mtu); 478 if (rtmtu > ifp->if_mtu) { 479 atomic_cas_uint(&ro->ro_rt->rt_mtu, rtmtu, 480 ifp->if_mtu); 481 } 482 } 483 ipstat_inc(ips_cantfrag); 484 goto bad; 485 } 486 487 if ((error = ip_fragment(m, &ml, ifp, mtu)) || 488 (error = if_output_ml(ifp, &ml, sintosa(dst), ro->ro_rt))) 489 goto done; 490 ipstat_inc(ips_fragmented); 491 492 done: 493 if (ro == &iproute) 494 rtfree(ro->ro_rt); 495 if_put(ifp); 496 #ifdef IPSEC 497 tdb_unref(tdb); 498 #endif /* IPSEC */ 499 return (error); 500 501 bad: 502 m_freem(m); 503 goto done; 504 } 505 506 #ifdef IPSEC 507 int 508 ip_output_ipsec_lookup(struct mbuf *m, int hlen, 509 const struct ipsec_level *seclevel, struct tdb **tdbout, int ipsecflowinfo) 510 { 511 struct m_tag *mtag; 512 struct tdb_ident *tdbi; 513 struct tdb *tdb; 514 struct ipsec_ids *ids = NULL; 515 int error; 516 517 /* Do we have any pending SAs to apply ? 
*/ 518 if (ipsecflowinfo) 519 ids = ipsp_ids_lookup(ipsecflowinfo); 520 error = ipsp_spd_lookup(m, AF_INET, hlen, IPSP_DIRECTION_OUT, 521 NULL, seclevel, &tdb, ids); 522 ipsp_ids_free(ids); 523 if (error || tdb == NULL) { 524 *tdbout = NULL; 525 return error; 526 } 527 /* Loop detection */ 528 for (mtag = m_tag_first(m); mtag != NULL; mtag = m_tag_next(m, mtag)) { 529 if (mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_DONE) 530 continue; 531 tdbi = (struct tdb_ident *)(mtag + 1); 532 if (tdbi->spi == tdb->tdb_spi && 533 tdbi->proto == tdb->tdb_sproto && 534 tdbi->rdomain == tdb->tdb_rdomain && 535 !memcmp(&tdbi->dst, &tdb->tdb_dst, 536 sizeof(union sockaddr_union))) { 537 /* no IPsec needed */ 538 tdb_unref(tdb); 539 *tdbout = NULL; 540 return 0; 541 } 542 } 543 *tdbout = tdb; 544 return 0; 545 } 546 547 void 548 ip_output_ipsec_pmtu_update(struct tdb *tdb, struct route *ro, 549 struct in_addr dst, int rtableid, int transportmode) 550 { 551 struct rtentry *rt = NULL; 552 int rt_mtucloned = 0; 553 554 /* Find a host route to store the mtu in */ 555 if (ro != NULL) 556 rt = ro->ro_rt; 557 /* but don't add a PMTU route for transport mode SAs */ 558 if (transportmode) 559 rt = NULL; 560 else if (rt == NULL || (rt->rt_flags & RTF_HOST) == 0) { 561 rt = icmp_mtudisc_clone(dst, rtableid, 1); 562 rt_mtucloned = 1; 563 } 564 DPRINTF("spi %08x mtu %d rt %p cloned %d", 565 ntohl(tdb->tdb_spi), tdb->tdb_mtu, rt, rt_mtucloned); 566 if (rt != NULL) { 567 atomic_store_int(&rt->rt_mtu, tdb->tdb_mtu); 568 if (ro != NULL && ro->ro_rt != NULL) { 569 rtfree(ro->ro_rt); 570 ro->ro_rt = rtalloc(&ro->ro_dstsa, RT_RESOLVE, 571 rtableid); 572 } 573 if (rt_mtucloned) 574 rtfree(rt); 575 } 576 } 577 578 int 579 ip_output_ipsec_send(struct tdb *tdb, struct mbuf *m, struct route *ro, int fwd) 580 { 581 struct mbuf_list ml; 582 struct ifnet *encif = NULL; 583 struct ip *ip; 584 struct in_addr dst; 585 u_int len; 586 int error, rtableid, tso = 0; 587 588 #if NPF > 0 589 /* 590 * Packet filter 591 */ 592 
if ((encif = enc_getif(tdb->tdb_rdomain, tdb->tdb_tap)) == NULL || 593 pf_test(AF_INET, fwd ? PF_FWD : PF_OUT, encif, &m) != PF_PASS) { 594 m_freem(m); 595 return EACCES; 596 } 597 if (m == NULL) 598 return 0; 599 /* 600 * PF_TAG_REROUTE handling or not... 601 * Packet is entering IPsec so the routing is 602 * already overruled by the IPsec policy. 603 * Until now the change was not reconsidered. 604 * What's the behaviour? 605 */ 606 #endif 607 608 /* Check if we can chop the TCP packet */ 609 ip = mtod(m, struct ip *); 610 if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO) && 611 m->m_pkthdr.ph_mss <= tdb->tdb_mtu) { 612 tso = 1; 613 len = m->m_pkthdr.ph_mss; 614 } else 615 len = ntohs(ip->ip_len); 616 617 /* Check if we are allowed to fragment */ 618 dst = ip->ip_dst; 619 rtableid = m->m_pkthdr.ph_rtableid; 620 if (ip_mtudisc && (ip->ip_off & htons(IP_DF)) && tdb->tdb_mtu && 621 len > tdb->tdb_mtu && tdb->tdb_mtutimeout > gettime()) { 622 int transportmode; 623 624 transportmode = (tdb->tdb_dst.sa.sa_family == AF_INET) && 625 (tdb->tdb_dst.sin.sin_addr.s_addr == dst.s_addr); 626 ip_output_ipsec_pmtu_update(tdb, ro, dst, rtableid, 627 transportmode); 628 ipsec_adjust_mtu(m, tdb->tdb_mtu); 629 m_freem(m); 630 return EMSGSIZE; 631 } 632 /* propagate IP_DF for v4-over-v6 */ 633 if (ip_mtudisc && ip->ip_off & htons(IP_DF)) 634 SET(m->m_pkthdr.csum_flags, M_IPV6_DF_OUT); 635 636 /* 637 * Clear these -- they'll be set in the recursive invocation 638 * as needed. 
639 */ 640 m->m_flags &= ~(M_MCAST | M_BCAST); 641 642 if (tso) { 643 error = tcp_chopper(m, &ml, encif, len); 644 if (error) 645 goto done; 646 } else { 647 CLR(m->m_pkthdr.csum_flags, M_TCP_TSO); 648 in_proto_cksum_out(m, encif); 649 ml_init(&ml); 650 ml_enqueue(&ml, m); 651 } 652 653 KERNEL_LOCK(); 654 while ((m = ml_dequeue(&ml)) != NULL) { 655 /* Callee frees mbuf */ 656 error = ipsp_process_packet(m, tdb, AF_INET, 0); 657 if (error) 658 break; 659 } 660 KERNEL_UNLOCK(); 661 done: 662 if (error) { 663 ml_purge(&ml); 664 ipsecstat_inc(ipsec_odrops); 665 tdbstat_inc(tdb, tdb_odrops); 666 } 667 if (!error && tso) 668 tcpstat_inc(tcps_outswtso); 669 if (ip_mtudisc && error == EMSGSIZE) 670 ip_output_ipsec_pmtu_update(tdb, ro, dst, rtableid, 0); 671 return error; 672 } 673 #endif /* IPSEC */ 674 675 int 676 ip_fragment(struct mbuf *m0, struct mbuf_list *ml, struct ifnet *ifp, 677 u_long mtu) 678 { 679 struct ip *ip; 680 int firstlen, hlen, tlen, len, off; 681 int error; 682 683 ml_init(ml); 684 ml_enqueue(ml, m0); 685 686 ip = mtod(m0, struct ip *); 687 hlen = ip->ip_hl << 2; 688 tlen = m0->m_pkthdr.len; 689 len = (mtu - hlen) &~ 7; 690 if (len < 8) { 691 error = EMSGSIZE; 692 goto bad; 693 } 694 firstlen = len; 695 696 /* 697 * If we are doing fragmentation, we can't defer TCP/UDP 698 * checksumming; compute the checksum and clear the flag. 699 */ 700 in_proto_cksum_out(m0, NULL); 701 702 /* 703 * Loop through length of payload after first fragment, 704 * make new header and copy data of each part and link onto chain. 
705 */ 706 for (off = hlen + firstlen; off < tlen; off += len) { 707 struct mbuf *m; 708 struct ip *mhip; 709 int mhlen; 710 711 MGETHDR(m, M_DONTWAIT, MT_HEADER); 712 if (m == NULL) { 713 error = ENOBUFS; 714 goto bad; 715 } 716 ml_enqueue(ml, m); 717 if ((error = m_dup_pkthdr(m, m0, M_DONTWAIT)) != 0) 718 goto bad; 719 m->m_data += max_linkhdr; 720 mhip = mtod(m, struct ip *); 721 *mhip = *ip; 722 if (hlen > sizeof(struct ip)) { 723 mhlen = ip_optcopy(ip, mhip) + sizeof(struct ip); 724 mhip->ip_hl = mhlen >> 2; 725 } else 726 mhlen = sizeof(struct ip); 727 m->m_len = mhlen; 728 729 mhip->ip_off = ((off - hlen) >> 3) + 730 (ntohs(ip->ip_off) & ~IP_MF); 731 if (ip->ip_off & htons(IP_MF)) 732 mhip->ip_off |= IP_MF; 733 if (off + len >= tlen) 734 len = tlen - off; 735 else 736 mhip->ip_off |= IP_MF; 737 mhip->ip_off = htons(mhip->ip_off); 738 739 m->m_pkthdr.len = mhlen + len; 740 mhip->ip_len = htons(m->m_pkthdr.len); 741 m->m_next = m_copym(m0, off, len, M_NOWAIT); 742 if (m->m_next == NULL) { 743 error = ENOBUFS; 744 goto bad; 745 } 746 747 in_hdr_cksum_out(m, ifp); 748 } 749 750 /* 751 * Update first fragment by trimming what's been copied out 752 * and updating header, then send each fragment (in order). 753 */ 754 if (hlen + firstlen < tlen) { 755 m_adj(m0, hlen + firstlen - tlen); 756 ip->ip_off |= htons(IP_MF); 757 } 758 ip->ip_len = htons(m0->m_pkthdr.len); 759 760 in_hdr_cksum_out(m0, ifp); 761 762 ipstat_add(ips_ofragments, ml_len(ml)); 763 return (0); 764 765 bad: 766 ipstat_inc(ips_odropped); 767 ml_purge(ml); 768 return (error); 769 } 770 771 /* 772 * Insert IP options into preformed packet. 773 * Adjust IP destination as required for IP source routing, 774 * as indicated by a non-zero in_addr at the start of the options. 
775 */ 776 struct mbuf * 777 ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen) 778 { 779 struct ipoption *p = mtod(opt, struct ipoption *); 780 struct mbuf *n; 781 struct ip *ip = mtod(m, struct ip *); 782 unsigned int optlen; 783 784 optlen = opt->m_len - sizeof(p->ipopt_dst); 785 if (optlen + ntohs(ip->ip_len) > IP_MAXPACKET) 786 return (m); /* XXX should fail */ 787 788 /* check if options will fit to IP header */ 789 if ((optlen + sizeof(struct ip)) > (0x0f << 2)) { 790 *phlen = sizeof(struct ip); 791 return (m); 792 } 793 794 if (p->ipopt_dst.s_addr) 795 ip->ip_dst = p->ipopt_dst; 796 if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) { 797 MGETHDR(n, M_DONTWAIT, MT_HEADER); 798 if (n == NULL) 799 return (m); 800 M_MOVE_HDR(n, m); 801 n->m_pkthdr.len += optlen; 802 m->m_len -= sizeof(struct ip); 803 m->m_data += sizeof(struct ip); 804 n->m_next = m; 805 m = n; 806 m->m_len = optlen + sizeof(struct ip); 807 m->m_data += max_linkhdr; 808 memcpy(mtod(m, caddr_t), ip, sizeof(struct ip)); 809 } else { 810 m->m_data -= optlen; 811 m->m_len += optlen; 812 m->m_pkthdr.len += optlen; 813 memmove(mtod(m, caddr_t), (caddr_t)ip, sizeof(struct ip)); 814 } 815 ip = mtod(m, struct ip *); 816 memcpy(ip + 1, p->ipopt_list, optlen); 817 *phlen = sizeof(struct ip) + optlen; 818 ip->ip_len = htons(ntohs(ip->ip_len) + optlen); 819 return (m); 820 } 821 822 /* 823 * Copy options from ip to jp, 824 * omitting those not copied during fragmentation. 825 */ 826 int 827 ip_optcopy(struct ip *ip, struct ip *jp) 828 { 829 u_char *cp, *dp; 830 int opt, optlen, cnt; 831 832 cp = (u_char *)(ip + 1); 833 dp = (u_char *)(jp + 1); 834 cnt = (ip->ip_hl << 2) - sizeof (struct ip); 835 for (; cnt > 0; cnt -= optlen, cp += optlen) { 836 opt = cp[0]; 837 if (opt == IPOPT_EOL) 838 break; 839 if (opt == IPOPT_NOP) { 840 /* Preserve for IP mcast tunnel's LSRR alignment. 
*/ 841 *dp++ = IPOPT_NOP; 842 optlen = 1; 843 continue; 844 } 845 #ifdef DIAGNOSTIC 846 if (cnt < IPOPT_OLEN + sizeof(*cp)) 847 panic("malformed IPv4 option passed to ip_optcopy"); 848 #endif 849 optlen = cp[IPOPT_OLEN]; 850 #ifdef DIAGNOSTIC 851 if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) 852 panic("malformed IPv4 option passed to ip_optcopy"); 853 #endif 854 /* bogus lengths should have been caught by ip_dooptions */ 855 if (optlen > cnt) 856 optlen = cnt; 857 if (IPOPT_COPIED(opt)) { 858 memcpy(dp, cp, optlen); 859 dp += optlen; 860 } 861 } 862 for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++) 863 *dp++ = IPOPT_EOL; 864 return (optlen); 865 } 866 867 /* 868 * IP socket option processing. 869 */ 870 int 871 ip_ctloutput(int op, struct socket *so, int level, int optname, 872 struct mbuf *m) 873 { 874 struct inpcb *inp = sotoinpcb(so); 875 int optval = 0; 876 struct proc *p = curproc; /* XXX */ 877 int error = 0; 878 u_int rtableid, rtid = 0; 879 880 if (level != IPPROTO_IP) 881 return (EINVAL); 882 883 rtableid = p->p_p->ps_rtableid; 884 885 switch (op) { 886 case PRCO_SETOPT: 887 switch (optname) { 888 case IP_OPTIONS: 889 return (ip_pcbopts(&inp->inp_options, m)); 890 891 case IP_TOS: 892 case IP_TTL: 893 case IP_MINTTL: 894 case IP_RECVOPTS: 895 case IP_RECVRETOPTS: 896 case IP_RECVDSTADDR: 897 case IP_RECVIF: 898 case IP_RECVTTL: 899 case IP_RECVDSTPORT: 900 case IP_RECVRTABLE: 901 case IP_IPSECFLOWINFO: 902 if (m == NULL || m->m_len != sizeof(int)) 903 error = EINVAL; 904 else { 905 optval = *mtod(m, int *); 906 switch (optname) { 907 908 case IP_TOS: 909 inp->inp_ip.ip_tos = optval; 910 break; 911 912 case IP_TTL: 913 if (optval > 0 && optval <= MAXTTL) 914 inp->inp_ip.ip_ttl = optval; 915 else if (optval == -1) 916 inp->inp_ip.ip_ttl = ip_defttl; 917 else 918 error = EINVAL; 919 break; 920 921 case IP_MINTTL: 922 if (optval >= 0 && optval <= MAXTTL) 923 inp->inp_ip_minttl = optval; 924 else 925 error = EINVAL; 926 break; 927 #define 
OPTSET(bit) \ 928 if (optval) \ 929 inp->inp_flags |= bit; \ 930 else \ 931 inp->inp_flags &= ~bit; 932 933 case IP_RECVOPTS: 934 OPTSET(INP_RECVOPTS); 935 break; 936 937 case IP_RECVRETOPTS: 938 OPTSET(INP_RECVRETOPTS); 939 break; 940 941 case IP_RECVDSTADDR: 942 OPTSET(INP_RECVDSTADDR); 943 break; 944 case IP_RECVIF: 945 OPTSET(INP_RECVIF); 946 break; 947 case IP_RECVTTL: 948 OPTSET(INP_RECVTTL); 949 break; 950 case IP_RECVDSTPORT: 951 OPTSET(INP_RECVDSTPORT); 952 break; 953 case IP_RECVRTABLE: 954 OPTSET(INP_RECVRTABLE); 955 break; 956 case IP_IPSECFLOWINFO: 957 OPTSET(INP_IPSECFLOWINFO); 958 break; 959 } 960 } 961 break; 962 #undef OPTSET 963 964 case IP_MULTICAST_IF: 965 case IP_MULTICAST_TTL: 966 case IP_MULTICAST_LOOP: 967 case IP_ADD_MEMBERSHIP: 968 case IP_DROP_MEMBERSHIP: 969 error = ip_setmoptions(optname, &inp->inp_moptions, m, 970 inp->inp_rtableid); 971 break; 972 973 case IP_PORTRANGE: 974 if (m == NULL || m->m_len != sizeof(int)) 975 error = EINVAL; 976 else { 977 optval = *mtod(m, int *); 978 979 switch (optval) { 980 981 case IP_PORTRANGE_DEFAULT: 982 inp->inp_flags &= ~(INP_LOWPORT); 983 inp->inp_flags &= ~(INP_HIGHPORT); 984 break; 985 986 case IP_PORTRANGE_HIGH: 987 inp->inp_flags &= ~(INP_LOWPORT); 988 inp->inp_flags |= INP_HIGHPORT; 989 break; 990 991 case IP_PORTRANGE_LOW: 992 inp->inp_flags &= ~(INP_HIGHPORT); 993 inp->inp_flags |= INP_LOWPORT; 994 break; 995 996 default: 997 998 error = EINVAL; 999 break; 1000 } 1001 } 1002 break; 1003 case IP_AUTH_LEVEL: 1004 case IP_ESP_TRANS_LEVEL: 1005 case IP_ESP_NETWORK_LEVEL: 1006 case IP_IPCOMP_LEVEL: 1007 #ifndef IPSEC 1008 error = EOPNOTSUPP; 1009 #else 1010 if (m == NULL || m->m_len != sizeof(int)) { 1011 error = EINVAL; 1012 break; 1013 } 1014 optval = *mtod(m, int *); 1015 1016 if (optval < IPSEC_LEVEL_BYPASS || 1017 optval > IPSEC_LEVEL_UNIQUE) { 1018 error = EINVAL; 1019 break; 1020 } 1021 1022 switch (optname) { 1023 case IP_AUTH_LEVEL: 1024 if (optval < IPSEC_AUTH_LEVEL_DEFAULT && 1025 
suser(p)) { 1026 error = EACCES; 1027 break; 1028 } 1029 inp->inp_seclevel.sl_auth = optval; 1030 break; 1031 1032 case IP_ESP_TRANS_LEVEL: 1033 if (optval < IPSEC_ESP_TRANS_LEVEL_DEFAULT && 1034 suser(p)) { 1035 error = EACCES; 1036 break; 1037 } 1038 inp->inp_seclevel.sl_esp_trans = optval; 1039 break; 1040 1041 case IP_ESP_NETWORK_LEVEL: 1042 if (optval < IPSEC_ESP_NETWORK_LEVEL_DEFAULT && 1043 suser(p)) { 1044 error = EACCES; 1045 break; 1046 } 1047 inp->inp_seclevel.sl_esp_network = optval; 1048 break; 1049 case IP_IPCOMP_LEVEL: 1050 if (optval < IPSEC_IPCOMP_LEVEL_DEFAULT && 1051 suser(p)) { 1052 error = EACCES; 1053 break; 1054 } 1055 inp->inp_seclevel.sl_ipcomp = optval; 1056 break; 1057 } 1058 #endif 1059 break; 1060 1061 case IP_IPSEC_LOCAL_ID: 1062 case IP_IPSEC_REMOTE_ID: 1063 error = EOPNOTSUPP; 1064 break; 1065 case SO_RTABLE: 1066 if (m == NULL || m->m_len < sizeof(u_int)) { 1067 error = EINVAL; 1068 break; 1069 } 1070 rtid = *mtod(m, u_int *); 1071 if (inp->inp_rtableid == rtid) 1072 break; 1073 /* needs privileges to switch when already set */ 1074 if (rtableid != rtid && rtableid != 0 && 1075 (error = suser(p)) != 0) 1076 break; 1077 error = in_pcbset_rtableid(inp, rtid); 1078 break; 1079 case IP_PIPEX: 1080 if (m != NULL && m->m_len == sizeof(int)) 1081 inp->inp_pipex = *mtod(m, int *); 1082 else 1083 error = EINVAL; 1084 break; 1085 1086 default: 1087 error = ENOPROTOOPT; 1088 break; 1089 } 1090 break; 1091 1092 case PRCO_GETOPT: 1093 switch (optname) { 1094 case IP_OPTIONS: 1095 case IP_RETOPTS: 1096 if (inp->inp_options) { 1097 m->m_len = inp->inp_options->m_len; 1098 memcpy(mtod(m, caddr_t), 1099 mtod(inp->inp_options, caddr_t), m->m_len); 1100 } else 1101 m->m_len = 0; 1102 break; 1103 1104 case IP_TOS: 1105 case IP_TTL: 1106 case IP_MINTTL: 1107 case IP_RECVOPTS: 1108 case IP_RECVRETOPTS: 1109 case IP_RECVDSTADDR: 1110 case IP_RECVIF: 1111 case IP_RECVTTL: 1112 case IP_RECVDSTPORT: 1113 case IP_RECVRTABLE: 1114 case IP_IPSECFLOWINFO: 1115 
case IP_IPDEFTTL: 1116 m->m_len = sizeof(int); 1117 switch (optname) { 1118 1119 case IP_TOS: 1120 optval = inp->inp_ip.ip_tos; 1121 break; 1122 1123 case IP_TTL: 1124 optval = inp->inp_ip.ip_ttl; 1125 break; 1126 1127 case IP_MINTTL: 1128 optval = inp->inp_ip_minttl; 1129 break; 1130 1131 case IP_IPDEFTTL: 1132 optval = ip_defttl; 1133 break; 1134 1135 #define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) 1136 1137 case IP_RECVOPTS: 1138 optval = OPTBIT(INP_RECVOPTS); 1139 break; 1140 1141 case IP_RECVRETOPTS: 1142 optval = OPTBIT(INP_RECVRETOPTS); 1143 break; 1144 1145 case IP_RECVDSTADDR: 1146 optval = OPTBIT(INP_RECVDSTADDR); 1147 break; 1148 case IP_RECVIF: 1149 optval = OPTBIT(INP_RECVIF); 1150 break; 1151 case IP_RECVTTL: 1152 optval = OPTBIT(INP_RECVTTL); 1153 break; 1154 case IP_RECVDSTPORT: 1155 optval = OPTBIT(INP_RECVDSTPORT); 1156 break; 1157 case IP_RECVRTABLE: 1158 optval = OPTBIT(INP_RECVRTABLE); 1159 break; 1160 case IP_IPSECFLOWINFO: 1161 optval = OPTBIT(INP_IPSECFLOWINFO); 1162 break; 1163 } 1164 *mtod(m, int *) = optval; 1165 break; 1166 1167 case IP_MULTICAST_IF: 1168 case IP_MULTICAST_TTL: 1169 case IP_MULTICAST_LOOP: 1170 case IP_ADD_MEMBERSHIP: 1171 case IP_DROP_MEMBERSHIP: 1172 error = ip_getmoptions(optname, inp->inp_moptions, m); 1173 break; 1174 1175 case IP_PORTRANGE: 1176 m->m_len = sizeof(int); 1177 1178 if (inp->inp_flags & INP_HIGHPORT) 1179 optval = IP_PORTRANGE_HIGH; 1180 else if (inp->inp_flags & INP_LOWPORT) 1181 optval = IP_PORTRANGE_LOW; 1182 else 1183 optval = 0; 1184 1185 *mtod(m, int *) = optval; 1186 break; 1187 1188 case IP_AUTH_LEVEL: 1189 case IP_ESP_TRANS_LEVEL: 1190 case IP_ESP_NETWORK_LEVEL: 1191 case IP_IPCOMP_LEVEL: 1192 #ifndef IPSEC 1193 m->m_len = sizeof(int); 1194 *mtod(m, int *) = IPSEC_LEVEL_NONE; 1195 #else 1196 m->m_len = sizeof(int); 1197 switch (optname) { 1198 case IP_AUTH_LEVEL: 1199 optval = inp->inp_seclevel.sl_auth; 1200 break; 1201 1202 case IP_ESP_TRANS_LEVEL: 1203 optval = 
inp->inp_seclevel.sl_esp_trans;
				break;

			case IP_ESP_NETWORK_LEVEL:
				optval = inp->inp_seclevel.sl_esp_network;
				break;
			case IP_IPCOMP_LEVEL:
				optval = inp->inp_seclevel.sl_ipcomp;
				break;
			}
			*mtod(m, int *) = optval;
#endif
			break;
		case IP_IPSEC_LOCAL_ID:
		case IP_IPSEC_REMOTE_ID:
			error = EOPNOTSUPP;
			break;
		case SO_RTABLE:
			m->m_len = sizeof(u_int);
			*mtod(m, u_int *) = inp->inp_rtableid;
			break;
		case IP_PIPEX:
			m->m_len = sizeof(int);
			*mtod(m, int *) = inp->inp_pipex;
			break;
		default:
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	return (error);
}

/*
 * Set up IP options in pcb for insertion in output packets.
 * Store in mbuf with pointer in pcbopt, adding pseudo-option
 * with destination address if source routed.
 *
 * pcbopt: in/out.  Whatever mbuf it points to is freed first.  On
 *         success it is set to a newly allocated MT_SOOPTS mbuf, or
 *         left NULL when m is NULL or empty.
 * m:      option bytes to install; only read here, never freed.
 *
 * Returns 0 on success.  Returns EINVAL if m->m_len is not a multiple
 * of sizeof(int32_t), exceeds MAX_IPOPTLEN + sizeof(struct in_addr),
 * an option is malformed, or the converted options do not fit into a
 * struct ipoption.  Returns ENOBUFS if no mbuf could be allocated.
 * On every error return *pcbopt is NULL.
 */
int
ip_pcbopts(struct mbuf **pcbopt, struct mbuf *m)
{
	struct mbuf *n;
	struct ipoption *p;
	int cnt, off, optlen;
	u_char *cp;
	u_char opt;

	/* turn off any old options */
	m_freem(*pcbopt);
	*pcbopt = NULL;
	if (m == NULL || m->m_len == 0) {
		/*
		 * Only turning off any previous options.
		 */
		return (0);
	}

	if (m->m_len % sizeof(int32_t) ||
	    m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr))
		return (EINVAL);

	/* Don't sleep because NET_LOCK() is held. */
	if ((n = m_get(M_NOWAIT, MT_SOOPTS)) == NULL)
		return (ENOBUFS);
	p = mtod(n, struct ipoption *);
	memset(p, 0, sizeof (*p));	/* 0 = IPOPT_EOL, needed for padding */
	/*
	 * Start with room for one address; presumably this covers
	 * p->ipopt_dst, which is filled in for source routes below.
	 */
	n->m_len = sizeof(struct in_addr);

	/*
	 * cp/cnt walk the caller's option bytes, off is the write
	 * position inside p->ipopt_list.
	 */
	off = 0;
	cnt = m->m_len;
	cp = mtod(m, u_char *);

	while (cnt > 0) {
		opt = cp[IPOPT_OPTVAL];

		if (opt == IPOPT_NOP || opt == IPOPT_EOL) {
			/* these two are handled as single byte options */
			optlen = 1;
		} else {
			/* the length byte must exist and be sane */
			if (cnt < IPOPT_OLEN + sizeof(*cp))
				goto bad;
			optlen = cp[IPOPT_OLEN];
			if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt)
				goto bad;
		}
		switch (opt) {
		default:
			/* every other option is copied verbatim */
			memcpy(p->ipopt_list + off, cp, optlen);
			break;

		case IPOPT_LSRR:
		case IPOPT_SSRR:
			/*
			 * user process specifies route as:
			 *	->A->B->C->D
			 * D must be our final destination (but we can't
			 * check that since we may not have connected yet).
			 * A is first hop destination, which doesn't appear in
			 * actual IP option, but is stored before the options.
			 */
			if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr))
				goto bad;

			/*
			 * Optlen is smaller because first address is popped.
			 * Cnt and cp will be adjusted a bit later to reflect
			 * this.
			 */
			optlen -= sizeof(struct in_addr);
			p->ipopt_list[off + IPOPT_OPTVAL] = opt;
			p->ipopt_list[off + IPOPT_OLEN] = optlen;

			/*
			 * Move first hop before start of options.
			 */
			memcpy(&p->ipopt_dst, cp + IPOPT_OFFSET,
			    sizeof(struct in_addr));
			cp += sizeof(struct in_addr);
			cnt -= sizeof(struct in_addr);
			/*
			 * Then copy rest of options
			 */
			memcpy(p->ipopt_list + off + IPOPT_OFFSET,
			    cp + IPOPT_OFFSET, optlen - IPOPT_OFFSET);
			break;
		}
		off += optlen;
		cp += optlen;
		cnt -= optlen;

		/* nothing after an end-of-list option is looked at */
		if (opt == IPOPT_EOL)
			break;
	}
	/* pad options to next word, since p was zeroed just adjust off */
	off = (off + sizeof(int32_t) - 1) & ~(sizeof(int32_t) - 1);
	n->m_len += off;
	if (n->m_len > sizeof(*p)) {
		/* shared error exit: also entered via goto from the loop */
 bad:
		m_freem(n);
		return (EINVAL);
	}

	*pcbopt = n;
	return (0);
}

/*
 * Lookup the interface based on the information in the ip_mreqn struct.
 *
 * mreq:     request holding imr_ifindex, imr_address and imr_multiaddr.
 * rtableid: routing table passed to rtalloc() for the lookups.
 * ifidx:    out; receives the interface index on success.
 *
 * Returns 0 on success and EADDRNOTAVAIL if a route lookup yields no
 * valid route or, for an explicit imr_address, a route without
 * RTF_LOCAL.  An imr_ifindex given by the caller is stored without any
 * check in this function.
 */
int
ip_multicast_if(struct ip_mreqn *mreq, u_int rtableid, unsigned int *ifidx)
{
	struct sockaddr_in sin;
	struct rtentry *rt;

	/*
	 * In case userland provides the imr_ifindex use this as interface.
	 * If no interface address was provided, use the interface of
	 * the route to the given multicast address.
	 */
	if (mreq->imr_ifindex != 0) {
		*ifidx = mreq->imr_ifindex;
	} else if (mreq->imr_address.s_addr == INADDR_ANY) {
		memset(&sin, 0, sizeof(sin));
		sin.sin_len = sizeof(sin);
		sin.sin_family = AF_INET;
		sin.sin_addr = mreq->imr_multiaddr;
		rt = rtalloc(sintosa(&sin), RT_RESOLVE, rtableid);
		if (!rtisvalid(rt)) {
			rtfree(rt);
			return EADDRNOTAVAIL;
		}
		*ifidx = rt->rt_ifidx;
		rtfree(rt);
	} else {
		/* an address was given: it has to resolve to a local route */
		memset(&sin, 0, sizeof(sin));
		sin.sin_len = sizeof(sin);
		sin.sin_family = AF_INET;
		sin.sin_addr = mreq->imr_address;
		rt = rtalloc(sintosa(&sin), 0, rtableid);
		if (!rtisvalid(rt) || !ISSET(rt->rt_flags, RTF_LOCAL)) {
			rtfree(rt);
			return EADDRNOTAVAIL;
		}
		*ifidx = rt->rt_ifidx;
		rtfree(rt);
	}

	return 0;
}

/*
 * Set the IP multicast options in response to user setsockopt().
 *
 * optname:  IP_MULTICAST_IF, IP_MULTICAST_TTL, IP_MULTICAST_LOOP,
 *           IP_ADD_MEMBERSHIP or IP_DROP_MEMBERSHIP; anything else
 *           yields EOPNOTSUPP.
 * imop:     in/out pointer to the multicast options.  A NULL *imop is
 *           replaced by a freshly allocated structure holding default
 *           values.  If all values are at their defaults on exit, the
 *           structure is freed again and *imop is set to NULL.
 * m:        option value supplied by the caller; may be NULL.
 * rtableid: routing table used for interface and address lookups.
 *
 * Returns 0 on success or one of EINVAL, EADDRNOTAVAIL, EADDRINUSE,
 * ENOBUFS, EOPNOTSUPP.
 */
int
ip_setmoptions(int optname, struct ip_moptions **imop, struct mbuf *m,
    u_int rtableid)
{
	struct in_addr addr;
	struct in_ifaddr *ia;
	struct ip_mreqn mreqn;
	struct ifnet *ifp = NULL;
	struct ip_moptions *imo = *imop;
	struct in_multi **immp;
	struct sockaddr_in sin;
	unsigned int ifidx;
	int i, error = 0;
	u_char loop;

	if (imo == NULL) {
		/*
		 * No multicast option buffer attached to the pcb;
		 * allocate one and initialize to default values.
		 */
		imo = malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK|M_ZERO);
		immp = mallocarray(IP_MIN_MEMBERSHIPS, sizeof(*immp), M_IPMOPTS,
		    M_WAITOK|M_ZERO);
		*imop = imo;
		imo->imo_ifidx = 0;
		imo->imo_ttl = IP_DEFAULT_MULTICAST_TTL;
		imo->imo_loop = IP_DEFAULT_MULTICAST_LOOP;
		imo->imo_num_memberships = 0;
		imo->imo_max_memberships = IP_MIN_MEMBERSHIPS;
		imo->imo_membership = immp;
	}

	switch (optname) {

	case IP_MULTICAST_IF:
		/*
		 * Select the interface for outgoing multicast packets.
		 */
		if (m == NULL) {
			error = EINVAL;
			break;
		}
		/* the argument size tells which of three layouts was passed */
		if (m->m_len == sizeof(struct in_addr)) {
			addr = *(mtod(m, struct in_addr *));
		} else if (m->m_len == sizeof(struct ip_mreq) ||
		    m->m_len == sizeof(struct ip_mreqn)) {
			/*
			 * mreqn is zeroed first, so a shorter struct ip_mreq
			 * leaves the trailing bytes 0.  Presumably ip_mreq is
			 * a layout prefix of ip_mreqn and imr_ifindex stays 0
			 * in that case -- TODO confirm against the headers.
			 */
			memset(&mreqn, 0, sizeof(mreqn));
			memcpy(&mreqn, mtod(m, void *), m->m_len);

			/*
			 * If an interface index is given use this
			 * index to set the imo_ifidx but check first
			 * that the interface actually exists.
			 * In the other case just set the addr to
			 * the imr_address and fall through to the
			 * regular code.
			 */
			if (mreqn.imr_ifindex != 0) {
				ifp = if_get(mreqn.imr_ifindex);
				if (ifp == NULL ||
				    ifp->if_rdomain != rtable_l2(rtableid)) {
					error = EADDRNOTAVAIL;
					if_put(ifp);
					break;
				}
				imo->imo_ifidx = ifp->if_index;
				if_put(ifp);
				break;
			} else
				addr = mreqn.imr_address;
		} else {
			error = EINVAL;
			break;
		}
		/*
		 * INADDR_ANY is used to remove a previous selection.
		 * When no interface is selected, a default one is
		 * chosen every time a multicast packet is sent.
		 */
		if (addr.s_addr == INADDR_ANY) {
			imo->imo_ifidx = 0;
			break;
		}
		/*
		 * The selected interface is identified by its local
		 * IP address.  Find the interface and confirm that
		 * it supports multicasting.
		 */
		memset(&sin, 0, sizeof(sin));
		sin.sin_len = sizeof(sin);
		sin.sin_family = AF_INET;
		sin.sin_addr = addr;
		ia = ifatoia(ifa_ifwithaddr(sintosa(&sin), rtableid));
		if (ia == NULL ||
		    (ia->ia_ifp->if_flags & IFF_MULTICAST) == 0) {
			error = EADDRNOTAVAIL;
			break;
		}
		imo->imo_ifidx = ia->ia_ifp->if_index;
		break;

	case IP_MULTICAST_TTL:
		/*
		 * Set the IP time-to-live for outgoing multicast packets.
		 * The value has to be passed as exactly one byte.
		 */
		if (m == NULL || m->m_len != 1) {
			error = EINVAL;
			break;
		}
		imo->imo_ttl = *(mtod(m, u_char *));
		break;

	case IP_MULTICAST_LOOP:
		/*
		 * Set the loopback flag for outgoing multicast packets.
		 * Must be zero or one.
		 */
		if (m == NULL || m->m_len != 1 ||
		    (loop = *(mtod(m, u_char *))) > 1) {
			error = EINVAL;
			break;
		}
		imo->imo_loop = loop;
		break;

	case IP_ADD_MEMBERSHIP:
		/*
		 * Add a multicast group membership.
		 * Group must be a valid IP multicast address.
		 */
		if (m == NULL || !(m->m_len == sizeof(struct ip_mreq) ||
		    m->m_len == sizeof(struct ip_mreqn))) {
			error = EINVAL;
			break;
		}
		memset(&mreqn, 0, sizeof(mreqn));
		memcpy(&mreqn, mtod(m, void *), m->m_len);
		if (!IN_MULTICAST(mreqn.imr_multiaddr.s_addr)) {
			error = EINVAL;
			break;
		}

		error = ip_multicast_if(&mreqn, rtableid, &ifidx);
		if (error)
			break;

		/*
		 * See if we found an interface, and confirm that it
		 * supports multicast.
		 */
		ifp = if_get(ifidx);
		if (ifp == NULL || ifp->if_rdomain != rtable_l2(rtableid) ||
		    (ifp->if_flags & IFF_MULTICAST) == 0) {
			error = EADDRNOTAVAIL;
			if_put(ifp);
			break;
		}

		/*
		 * See if the membership already exists or if all the
		 * membership slots are full.
		 */
		for (i = 0; i < imo->imo_num_memberships; ++i) {
			if (imo->imo_membership[i]->inm_ifidx == ifidx &&
			    imo->imo_membership[i]->inm_addr.s_addr
			    == mreqn.imr_multiaddr.s_addr)
				break;
		}
		if (i < imo->imo_num_memberships) {
			error = EADDRINUSE;
			if_put(ifp);
			break;
		}
		/* from here on i == imo_num_memberships, the next free slot */
		if (imo->imo_num_memberships == imo->imo_max_memberships) {
			struct in_multi **nmships, **omships;
			size_t newmax;
			/*
			 * Resize the vector to next power-of-two minus 1. If
			 * the size would exceed the maximum then we know we've
			 * really run out of entries. Otherwise, we reallocate
			 * the vector.
			 */
			nmships = NULL;
			omships = imo->imo_membership;
			newmax = ((imo->imo_max_memberships + 1) * 2) - 1;
			if (newmax <= IP_MAX_MEMBERSHIPS) {
				nmships = mallocarray(newmax, sizeof(*nmships),
				    M_IPMOPTS, M_NOWAIT|M_ZERO);
				if (nmships != NULL) {
					memcpy(nmships, omships,
					    sizeof(*omships) *
					    imo->imo_max_memberships);
					free(omships, M_IPMOPTS,
					    sizeof(*omships) *
					    imo->imo_max_memberships);
					imo->imo_membership = nmships;
					imo->imo_max_memberships = newmax;
				}
			}
			/* limit reached or allocation failed */
			if (nmships == NULL) {
				error = ENOBUFS;
				if_put(ifp);
				break;
			}
		}
		/*
		 * Everything looks good; add a new record to the multicast
		 * address list for the given interface.
		 */
		if ((imo->imo_membership[i] =
		    in_addmulti(&mreqn.imr_multiaddr, ifp)) == NULL) {
			error = ENOBUFS;
			if_put(ifp);
			break;
		}
		++imo->imo_num_memberships;
		if_put(ifp);
		break;

	case IP_DROP_MEMBERSHIP:
		/*
		 * Drop a multicast group membership.
		 * Group must be a valid IP multicast address.
		 */
		if (m == NULL || !(m->m_len == sizeof(struct ip_mreq) ||
		    m->m_len == sizeof(struct ip_mreqn))) {
			error = EINVAL;
			break;
		}
		memset(&mreqn, 0, sizeof(mreqn));
		memcpy(&mreqn, mtod(m, void *), m->m_len);
		if (!IN_MULTICAST(mreqn.imr_multiaddr.s_addr)) {
			error = EINVAL;
			break;
		}

		/*
		 * Resolve the request to an interface index; only the
		 * index is needed here, no ifnet pointer is taken.
		 */
		error = ip_multicast_if(&mreqn, rtableid, &ifidx);
		if (error)
			break;

		/*
		 * Find the membership in the membership array.
		 * An ifidx of 0 matches the group on any interface.
		 */
		for (i = 0; i < imo->imo_num_memberships; ++i) {
			if ((ifidx == 0 ||
			    imo->imo_membership[i]->inm_ifidx == ifidx) &&
			    imo->imo_membership[i]->inm_addr.s_addr ==
			    mreqn.imr_multiaddr.s_addr)
				break;
		}
		if (i == imo->imo_num_memberships) {
			error = EADDRNOTAVAIL;
			break;
		}
		/*
		 * Give up the multicast address record to which the
		 * membership points.
		 */
		in_delmulti(imo->imo_membership[i]);
		/*
		 * Remove the gap in the membership array.
		 */
		for (++i; i < imo->imo_num_memberships; ++i)
			imo->imo_membership[i-1] = imo->imo_membership[i];
		--imo->imo_num_memberships;
		break;

	default:
		error = EOPNOTSUPP;
		break;
	}

	/*
	 * If all options have default values, no need to keep the data.
	 * This also releases a structure allocated above when the
	 * request failed before changing anything.
	 */
	if (imo->imo_ifidx == 0 &&
	    imo->imo_ttl == IP_DEFAULT_MULTICAST_TTL &&
	    imo->imo_loop == IP_DEFAULT_MULTICAST_LOOP &&
	    imo->imo_num_memberships == 0) {
		free(imo->imo_membership , M_IPMOPTS,
		    imo->imo_max_memberships * sizeof(struct in_multi *));
		free(*imop, M_IPMOPTS, sizeof(**imop));
		*imop = NULL;
	}

	return (error);
}

/*
 * Return the IP multicast options in response to user getsockopt().
1691 */ 1692 int 1693 ip_getmoptions(int optname, struct ip_moptions *imo, struct mbuf *m) 1694 { 1695 u_char *ttl; 1696 u_char *loop; 1697 struct in_addr *addr; 1698 struct in_ifaddr *ia; 1699 struct ifnet *ifp; 1700 1701 switch (optname) { 1702 1703 case IP_MULTICAST_IF: 1704 addr = mtod(m, struct in_addr *); 1705 m->m_len = sizeof(struct in_addr); 1706 if (imo == NULL || (ifp = if_get(imo->imo_ifidx)) == NULL) 1707 addr->s_addr = INADDR_ANY; 1708 else { 1709 IFP_TO_IA(ifp, ia); 1710 addr->s_addr = (ia == NULL) ? INADDR_ANY 1711 : ia->ia_addr.sin_addr.s_addr; 1712 if_put(ifp); 1713 } 1714 return (0); 1715 1716 case IP_MULTICAST_TTL: 1717 ttl = mtod(m, u_char *); 1718 m->m_len = 1; 1719 *ttl = (imo == NULL) ? IP_DEFAULT_MULTICAST_TTL 1720 : imo->imo_ttl; 1721 return (0); 1722 1723 case IP_MULTICAST_LOOP: 1724 loop = mtod(m, u_char *); 1725 m->m_len = 1; 1726 *loop = (imo == NULL) ? IP_DEFAULT_MULTICAST_LOOP 1727 : imo->imo_loop; 1728 return (0); 1729 1730 default: 1731 return (EOPNOTSUPP); 1732 } 1733 } 1734 1735 /* 1736 * Discard the IP multicast options. 1737 */ 1738 void 1739 ip_freemoptions(struct ip_moptions *imo) 1740 { 1741 int i; 1742 1743 if (imo != NULL) { 1744 for (i = 0; i < imo->imo_num_memberships; ++i) 1745 in_delmulti(imo->imo_membership[i]); 1746 free(imo->imo_membership, M_IPMOPTS, 1747 imo->imo_max_memberships * sizeof(struct in_multi *)); 1748 free(imo, M_IPMOPTS, sizeof(*imo)); 1749 } 1750 } 1751 1752 /* 1753 * Routine called from ip_output() to loop back a copy of an IP multicast 1754 * packet to the input queue of a specified interface. 1755 */ 1756 void 1757 ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst) 1758 { 1759 struct mbuf *copym; 1760 1761 copym = m_dup_pkt(m, max_linkhdr, M_DONTWAIT); 1762 if (copym != NULL) { 1763 /* 1764 * We don't bother to fragment if the IP length is greater 1765 * than the interface's MTU. Can this possibly matter? 
1766 */ 1767 in_hdr_cksum_out(copym, NULL); 1768 if_input_local(ifp, copym, dst->sin_family); 1769 } 1770 } 1771 1772 void 1773 in_hdr_cksum_out(struct mbuf *m, struct ifnet *ifp) 1774 { 1775 struct ip *ip = mtod(m, struct ip *); 1776 1777 ip->ip_sum = 0; 1778 if (in_ifcap_cksum(m, ifp, IFCAP_CSUM_IPv4)) { 1779 SET(m->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT); 1780 } else { 1781 ipstat_inc(ips_outswcsum); 1782 ip->ip_sum = in_cksum(m, ip->ip_hl << 2); 1783 CLR(m->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT); 1784 } 1785 } 1786 1787 /* 1788 * Compute significant parts of the IPv4 checksum pseudo-header 1789 * for use in a delayed TCP/UDP checksum calculation. 1790 */ 1791 static u_int16_t 1792 in_cksum_phdr(u_int32_t src, u_int32_t dst, u_int32_t lenproto) 1793 { 1794 u_int32_t sum; 1795 1796 sum = lenproto + 1797 (u_int16_t)(src >> 16) + 1798 (u_int16_t)(src /*& 0xffff*/) + 1799 (u_int16_t)(dst >> 16) + 1800 (u_int16_t)(dst /*& 0xffff*/); 1801 1802 sum = (u_int16_t)(sum >> 16) + (u_int16_t)(sum /*& 0xffff*/); 1803 1804 if (sum > 0xffff) 1805 sum -= 0xffff; 1806 1807 return (sum); 1808 } 1809 1810 /* 1811 * Process a delayed payload checksum calculation. 
1812 */ 1813 void 1814 in_delayed_cksum(struct mbuf *m) 1815 { 1816 struct ip *ip; 1817 u_int16_t csum, offset; 1818 1819 ip = mtod(m, struct ip *); 1820 offset = ip->ip_hl << 2; 1821 csum = in4_cksum(m, 0, offset, m->m_pkthdr.len - offset); 1822 if (csum == 0 && ip->ip_p == IPPROTO_UDP) 1823 csum = 0xffff; 1824 1825 switch (ip->ip_p) { 1826 case IPPROTO_TCP: 1827 offset += offsetof(struct tcphdr, th_sum); 1828 break; 1829 1830 case IPPROTO_UDP: 1831 offset += offsetof(struct udphdr, uh_sum); 1832 break; 1833 1834 case IPPROTO_ICMP: 1835 offset += offsetof(struct icmp, icmp_cksum); 1836 break; 1837 1838 default: 1839 return; 1840 } 1841 1842 if ((offset + sizeof(u_int16_t)) > m->m_len) 1843 m_copyback(m, offset, sizeof(csum), &csum, M_NOWAIT); 1844 else 1845 *(u_int16_t *)(mtod(m, caddr_t) + offset) = csum; 1846 } 1847 1848 void 1849 in_proto_cksum_out(struct mbuf *m, struct ifnet *ifp) 1850 { 1851 struct ip *ip = mtod(m, struct ip *); 1852 1853 /* some hw and in_delayed_cksum need the pseudo header cksum */ 1854 if (m->m_pkthdr.csum_flags & 1855 (M_TCP_CSUM_OUT|M_UDP_CSUM_OUT|M_ICMP_CSUM_OUT)) { 1856 u_int16_t csum = 0, offset; 1857 1858 offset = ip->ip_hl << 2; 1859 if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO) && 1860 in_ifcap_cksum(m, ifp, IFCAP_TSOv4)) { 1861 csum = in_cksum_phdr(ip->ip_src.s_addr, 1862 ip->ip_dst.s_addr, htonl(ip->ip_p)); 1863 } else if (ISSET(m->m_pkthdr.csum_flags, 1864 M_TCP_CSUM_OUT|M_UDP_CSUM_OUT)) { 1865 csum = in_cksum_phdr(ip->ip_src.s_addr, 1866 ip->ip_dst.s_addr, htonl(ntohs(ip->ip_len) - 1867 offset + ip->ip_p)); 1868 } 1869 if (ip->ip_p == IPPROTO_TCP) 1870 offset += offsetof(struct tcphdr, th_sum); 1871 else if (ip->ip_p == IPPROTO_UDP) 1872 offset += offsetof(struct udphdr, uh_sum); 1873 else if (ip->ip_p == IPPROTO_ICMP) 1874 offset += offsetof(struct icmp, icmp_cksum); 1875 if ((offset + sizeof(u_int16_t)) > m->m_len) 1876 m_copyback(m, offset, sizeof(csum), &csum, M_NOWAIT); 1877 else 1878 *(u_int16_t *)(mtod(m, caddr_t) + 
offset) = csum; 1879 } 1880 1881 if (m->m_pkthdr.csum_flags & M_TCP_CSUM_OUT) { 1882 if (!in_ifcap_cksum(m, ifp, IFCAP_CSUM_TCPv4) || 1883 ip->ip_hl != 5) { 1884 tcpstat_inc(tcps_outswcsum); 1885 in_delayed_cksum(m); 1886 m->m_pkthdr.csum_flags &= ~M_TCP_CSUM_OUT; /* Clear */ 1887 } 1888 } else if (m->m_pkthdr.csum_flags & M_UDP_CSUM_OUT) { 1889 if (!in_ifcap_cksum(m, ifp, IFCAP_CSUM_UDPv4) || 1890 ip->ip_hl != 5) { 1891 udpstat_inc(udps_outswcsum); 1892 in_delayed_cksum(m); 1893 m->m_pkthdr.csum_flags &= ~M_UDP_CSUM_OUT; /* Clear */ 1894 } 1895 } else if (m->m_pkthdr.csum_flags & M_ICMP_CSUM_OUT) { 1896 in_delayed_cksum(m); 1897 m->m_pkthdr.csum_flags &= ~M_ICMP_CSUM_OUT; /* Clear */ 1898 } 1899 } 1900 1901 int 1902 in_ifcap_cksum(struct mbuf *m, struct ifnet *ifp, int ifcap) 1903 { 1904 if ((ifp == NULL) || 1905 !ISSET(ifp->if_capabilities, ifcap) || 1906 (ifp->if_bridgeidx != 0)) 1907 return (0); 1908 /* 1909 * Simplex interface sends packet back without hardware cksum. 1910 * Keep this check in sync with the condition where ether_resolve() 1911 * calls if_input_local(). 1912 */ 1913 if (ISSET(m->m_flags, M_BCAST) && 1914 ISSET(ifp->if_flags, IFF_SIMPLEX) && 1915 !m->m_pkthdr.pf.routed) 1916 return (0); 1917 return (1); 1918 } 1919