1 /* $OpenBSD: ip_output.c,v 1.365 2021/02/10 18:28:06 bluhm Exp $ */ 2 /* $NetBSD: ip_output.c,v 1.28 1996/02/13 23:43:07 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1990, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 33 */ 34 35 #include "pf.h" 36 37 #include <sys/param.h> 38 #include <sys/systm.h> 39 #include <sys/mbuf.h> 40 #include <sys/protosw.h> 41 #include <sys/socket.h> 42 #include <sys/socketvar.h> 43 #include <sys/proc.h> 44 #include <sys/kernel.h> 45 46 #include <net/if.h> 47 #include <net/if_var.h> 48 #include <net/if_enc.h> 49 #include <net/route.h> 50 51 #include <netinet/in.h> 52 #include <netinet/ip.h> 53 #include <netinet/in_pcb.h> 54 #include <netinet/in_var.h> 55 #include <netinet/ip_var.h> 56 #include <netinet/ip_icmp.h> 57 #include <netinet/tcp.h> 58 #include <netinet/udp.h> 59 #include <netinet/tcp_timer.h> 60 #include <netinet/tcp_var.h> 61 #include <netinet/udp_var.h> 62 63 #if NPF > 0 64 #include <net/pfvar.h> 65 #endif 66 67 #ifdef IPSEC 68 #ifdef ENCDEBUG 69 #define DPRINTF(x) do { if (encdebug) printf x ; } while (0) 70 #else 71 #define DPRINTF(x) 72 #endif 73 #endif /* IPSEC */ 74 75 int ip_pcbopts(struct mbuf **, struct mbuf *); 76 int ip_multicast_if(struct ip_mreqn *, u_int, unsigned int *); 77 int ip_setmoptions(int, struct ip_moptions **, struct mbuf *, u_int); 78 void ip_mloopback(struct ifnet *, struct mbuf *, struct sockaddr_in *); 79 static __inline u_int16_t __attribute__((__unused__)) 80 in_cksum_phdr(u_int32_t, u_int32_t, u_int32_t); 81 void in_delayed_cksum(struct mbuf *); 82 int in_ifcap_cksum(struct mbuf *, struct ifnet *, int); 83 84 #ifdef IPSEC 85 struct tdb * 86 ip_output_ipsec_lookup(struct mbuf *m, int hlen, int *error, struct inpcb *inp, 87 int ipsecflowinfo); 88 int 89 ip_output_ipsec_send(struct tdb *, struct mbuf *, struct route *, int); 90 #endif /* IPSEC */ 91 92 /* 93 * IP output. The packet in mbuf chain m contains a skeletal IP 94 * header (with len, off, ttl, proto, tos, src, dst). 95 * The mbuf chain containing the packet will be freed. 96 * The mbuf opt, if present, will not be freed. 97 */ 98 int 99 ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro, int flags, 100 struct ip_moptions *imo, struct inpcb *inp, u_int32_t ipsecflowinfo) 101 { 102 struct ip *ip; 103 struct ifnet *ifp = NULL; 104 struct mbuf *m = m0; 105 int hlen = sizeof (struct ip); 106 int error = 0; 107 struct route iproute; 108 struct sockaddr_in *dst; 109 struct tdb *tdb = NULL; 110 u_long mtu; 111 #if NPF > 0 112 u_int orig_rtableid; 113 #endif 114 #ifdef MROUTING 115 int rv; 116 #endif 117 118 NET_ASSERT_LOCKED(); 119 120 #ifdef IPSEC 121 if (inp && (inp->inp_flags & INP_IPV6) != 0) 122 panic("ip_output: IPv6 pcb is passed"); 123 #endif /* IPSEC */ 124 125 #ifdef DIAGNOSTIC 126 if ((m->m_flags & M_PKTHDR) == 0) 127 panic("ip_output no HDR"); 128 #endif 129 if (opt) 130 m = ip_insertoptions(m, opt, &hlen); 131 132 ip = mtod(m, struct ip *); 133 134 /* 135 * Fill in IP header. 136 */ 137 if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { 138 ip->ip_v = IPVERSION; 139 ip->ip_off &= htons(IP_DF); 140 ip->ip_id = htons(ip_randomid()); 141 ip->ip_hl = hlen >> 2; 142 ipstat_inc(ips_localout); 143 } else { 144 hlen = ip->ip_hl << 2; 145 } 146 147 /* 148 * We should not send traffic to 0/8 say both Stevens and RFCs 149 * 5735 section 3 and 1122 sections 3.2.1.3 and 3.3.6. 150 */ 151 if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == 0) { 152 error = ENETUNREACH; 153 goto bad; 154 } 155 156 #if NPF > 0 157 orig_rtableid = m->m_pkthdr.ph_rtableid; 158 reroute: 159 #endif 160 161 /* 162 * Do a route lookup now in case we need the source address to 163 * do an SPD lookup in IPsec; for most packets, the source address 164 * is set at a higher level protocol. ICMPs and other packets 165 * though (e.g., traceroute) have a source address of zeroes. 166 */ 167 if (ro == NULL) { 168 ro = &iproute; 169 memset(ro, 0, sizeof(*ro)); 170 } 171 172 dst = satosin(&ro->ro_dst); 173 174 /* 175 * If there is a cached route, check that it is to the same 176 * destination and is still up. If not, free it and try again. 177 */ 178 if (!rtisvalid(ro->ro_rt) || 179 dst->sin_addr.s_addr != ip->ip_dst.s_addr || 180 ro->ro_tableid != m->m_pkthdr.ph_rtableid) { 181 rtfree(ro->ro_rt); 182 ro->ro_rt = NULL; 183 } 184 185 if (ro->ro_rt == NULL) { 186 dst->sin_family = AF_INET; 187 dst->sin_len = sizeof(*dst); 188 dst->sin_addr = ip->ip_dst; 189 ro->ro_tableid = m->m_pkthdr.ph_rtableid; 190 } 191 192 if ((IN_MULTICAST(ip->ip_dst.s_addr) || 193 (ip->ip_dst.s_addr == INADDR_BROADCAST)) && 194 imo != NULL && (ifp = if_get(imo->imo_ifidx)) != NULL) { 195 196 mtu = ifp->if_mtu; 197 if (ip->ip_src.s_addr == INADDR_ANY) { 198 struct in_ifaddr *ia; 199 200 IFP_TO_IA(ifp, ia); 201 if (ia != NULL) 202 ip->ip_src = ia->ia_addr.sin_addr; 203 } 204 } else { 205 struct in_ifaddr *ia; 206 207 if (ro->ro_rt == NULL) 208 ro->ro_rt = rtalloc_mpath(&ro->ro_dst, 209 &ip->ip_src.s_addr, ro->ro_tableid); 210 211 if (ro->ro_rt == NULL) { 212 ipstat_inc(ips_noroute); 213 error = EHOSTUNREACH; 214 goto bad; 215 } 216 217 ia = ifatoia(ro->ro_rt->rt_ifa); 218 if (ISSET(ro->ro_rt->rt_flags, RTF_LOCAL)) 219 ifp = if_get(rtable_loindex(m->m_pkthdr.ph_rtableid)); 220 else 221 ifp = if_get(ro->ro_rt->rt_ifidx); 222 /* 223 * We aren't using rtisvalid() here because the UP/DOWN state 224 * machine is broken with some Ethernet drivers like em(4). 225 * As a result we might try to use an invalid cached route 226 * entry while an interface is being detached. 227 */ 228 if (ifp == NULL) { 229 ipstat_inc(ips_noroute); 230 error = EHOSTUNREACH; 231 goto bad; 232 } 233 if ((mtu = ro->ro_rt->rt_mtu) == 0) 234 mtu = ifp->if_mtu; 235 236 if (ro->ro_rt->rt_flags & RTF_GATEWAY) 237 dst = satosin(ro->ro_rt->rt_gateway); 238 239 /* Set the source IP address */ 240 if (ip->ip_src.s_addr == INADDR_ANY && ia) 241 ip->ip_src = ia->ia_addr.sin_addr; 242 } 243 244 #ifdef IPSEC 245 if (ipsec_in_use || inp != NULL) { 246 /* Do we have any pending SAs to apply ? */ 247 tdb = ip_output_ipsec_lookup(m, hlen, &error, inp, 248 ipsecflowinfo); 249 if (error != 0) { 250 /* Should silently drop packet */ 251 if (error == -EINVAL) 252 error = 0; 253 m_freem(m); 254 goto done; 255 } 256 if (tdb != NULL) { 257 /* 258 * If it needs TCP/UDP hardware-checksumming, do the 259 * computation now. 260 */ 261 in_proto_cksum_out(m, NULL); 262 } 263 } 264 #endif /* IPSEC */ 265 266 if (IN_MULTICAST(ip->ip_dst.s_addr) || 267 (ip->ip_dst.s_addr == INADDR_BROADCAST)) { 268 269 m->m_flags |= (ip->ip_dst.s_addr == INADDR_BROADCAST) ? 270 M_BCAST : M_MCAST; 271 272 /* 273 * IP destination address is multicast. Make sure "dst" 274 * still points to the address in "ro". (It may have been 275 * changed to point to a gateway address, above.) 276 */ 277 dst = satosin(&ro->ro_dst); 278 279 /* 280 * See if the caller provided any multicast options 281 */ 282 if (imo != NULL) 283 ip->ip_ttl = imo->imo_ttl; 284 else 285 ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; 286 287 /* 288 * if we don't know the outgoing ifp yet, we can't generate 289 * output 290 */ 291 if (!ifp) { 292 ipstat_inc(ips_noroute); 293 error = EHOSTUNREACH; 294 goto bad; 295 } 296 297 /* 298 * Confirm that the outgoing interface supports multicast, 299 * but only if the packet actually is going out on that 300 * interface (i.e., no IPsec is applied). 301 */ 302 if ((((m->m_flags & M_MCAST) && 303 (ifp->if_flags & IFF_MULTICAST) == 0) || 304 ((m->m_flags & M_BCAST) && 305 (ifp->if_flags & IFF_BROADCAST) == 0)) && (tdb == NULL)) { 306 ipstat_inc(ips_noroute); 307 error = ENETUNREACH; 308 goto bad; 309 } 310 311 /* 312 * If source address not specified yet, use address 313 * of outgoing interface. 314 */ 315 if (ip->ip_src.s_addr == INADDR_ANY) { 316 struct in_ifaddr *ia; 317 318 IFP_TO_IA(ifp, ia); 319 if (ia != NULL) 320 ip->ip_src = ia->ia_addr.sin_addr; 321 } 322 323 if ((imo == NULL || imo->imo_loop) && 324 in_hasmulti(&ip->ip_dst, ifp)) { 325 /* 326 * If we belong to the destination multicast group 327 * on the outgoing interface, and the caller did not 328 * forbid loopback, loop back a copy. 329 * Can't defer TCP/UDP checksumming, do the 330 * computation now. 331 */ 332 in_proto_cksum_out(m, NULL); 333 ip_mloopback(ifp, m, dst); 334 } 335 #ifdef MROUTING 336 else { 337 /* 338 * If we are acting as a multicast router, perform 339 * multicast forwarding as if the packet had just 340 * arrived on the interface to which we are about 341 * to send. The multicast forwarding function 342 * recursively calls this function, using the 343 * IP_FORWARDING flag to prevent infinite recursion. 344 * 345 * Multicasts that are looped back by ip_mloopback(), 346 * above, will be forwarded by the ip_input() routine, 347 * if necessary. 348 */ 349 if (ipmforwarding && ip_mrouter[ifp->if_rdomain] && 350 (flags & IP_FORWARDING) == 0) { 351 KERNEL_LOCK(); 352 rv = ip_mforward(m, ifp); 353 KERNEL_UNLOCK(); 354 if (rv != 0) { 355 m_freem(m); 356 goto done; 357 } 358 } 359 } 360 #endif 361 /* 362 * Multicasts with a time-to-live of zero may be looped- 363 * back, above, but must not be transmitted on a network. 364 * Also, multicasts addressed to the loopback interface 365 * are not sent -- the above call to ip_mloopback() will 366 * loop back a copy if this host actually belongs to the 367 * destination group on the loopback interface. 368 */ 369 if (ip->ip_ttl == 0 || (ifp->if_flags & IFF_LOOPBACK) != 0) { 370 m_freem(m); 371 goto done; 372 } 373 374 goto sendit; 375 } 376 377 /* 378 * Look for broadcast address and verify user is allowed to send 379 * such a packet; if the packet is going in an IPsec tunnel, skip 380 * this check. 381 */ 382 if ((tdb == NULL) && ((dst->sin_addr.s_addr == INADDR_BROADCAST) || 383 (ro && ro->ro_rt && ISSET(ro->ro_rt->rt_flags, RTF_BROADCAST)))) { 384 if ((ifp->if_flags & IFF_BROADCAST) == 0) { 385 error = EADDRNOTAVAIL; 386 goto bad; 387 } 388 if ((flags & IP_ALLOWBROADCAST) == 0) { 389 error = EACCES; 390 goto bad; 391 } 392 393 /* Don't allow broadcast messages to be fragmented */ 394 if (ntohs(ip->ip_len) > ifp->if_mtu) { 395 error = EMSGSIZE; 396 goto bad; 397 } 398 m->m_flags |= M_BCAST; 399 } else 400 m->m_flags &= ~M_BCAST; 401 402 sendit: 403 /* 404 * If we're doing Path MTU discovery, we need to set DF unless 405 * the route's MTU is locked. 406 */ 407 if ((flags & IP_MTUDISC) && ro && ro->ro_rt && 408 (ro->ro_rt->rt_locks & RTV_MTU) == 0) 409 ip->ip_off |= htons(IP_DF); 410 411 #ifdef IPSEC 412 /* 413 * Check if the packet needs encapsulation. 414 */ 415 if (tdb != NULL) { 416 /* Callee frees mbuf */ 417 error = ip_output_ipsec_send(tdb, m, ro, 418 (flags & IP_FORWARDING) ? 1 : 0); 419 goto done; 420 } 421 #endif /* IPSEC */ 422 423 /* 424 * Packet filter 425 */ 426 #if NPF > 0 427 if (pf_test(AF_INET, (flags & IP_FORWARDING) ? PF_FWD : PF_OUT, 428 ifp, &m) != PF_PASS) { 429 error = EACCES; 430 m_freem(m); 431 goto done; 432 } 433 if (m == NULL) 434 goto done; 435 ip = mtod(m, struct ip *); 436 hlen = ip->ip_hl << 2; 437 if ((m->m_pkthdr.pf.flags & (PF_TAG_REROUTE | PF_TAG_GENERATED)) == 438 (PF_TAG_REROUTE | PF_TAG_GENERATED)) 439 /* already rerun the route lookup, go on */ 440 m->m_pkthdr.pf.flags &= ~(PF_TAG_GENERATED | PF_TAG_REROUTE); 441 else if (m->m_pkthdr.pf.flags & PF_TAG_REROUTE) { 442 /* tag as generated to skip over pf_test on rerun */ 443 m->m_pkthdr.pf.flags |= PF_TAG_GENERATED; 444 ro = NULL; 445 if_put(ifp); /* drop reference since target changed */ 446 ifp = NULL; 447 goto reroute; 448 } 449 #endif 450 in_proto_cksum_out(m, ifp); 451 452 #ifdef IPSEC 453 if (ipsec_in_use && (flags & IP_FORWARDING) && (ipforwarding == 2) && 454 (m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL) == NULL)) { 455 error = EHOSTUNREACH; 456 m_freem(m); 457 goto done; 458 } 459 #endif 460 461 /* 462 * If small enough for interface, can just send directly. 463 */ 464 if (ntohs(ip->ip_len) <= mtu) { 465 ip->ip_sum = 0; 466 if (in_ifcap_cksum(m, ifp, IFCAP_CSUM_IPv4)) 467 m->m_pkthdr.csum_flags |= M_IPV4_CSUM_OUT; 468 else { 469 ipstat_inc(ips_outswcsum); 470 ip->ip_sum = in_cksum(m, hlen); 471 } 472 473 error = ifp->if_output(ifp, m, sintosa(dst), ro->ro_rt); 474 goto done; 475 } 476 477 /* 478 * Too large for interface; fragment if possible. 479 * Must be able to put at least 8 bytes per fragment. 480 */ 481 if (ip->ip_off & htons(IP_DF)) { 482 #ifdef IPSEC 483 if (ip_mtudisc) 484 ipsec_adjust_mtu(m, ifp->if_mtu); 485 #endif 486 error = EMSGSIZE; 487 #if NPF > 0 488 /* pf changed routing table, use orig rtable for path MTU */ 489 if (ro->ro_tableid != orig_rtableid) { 490 rtfree(ro->ro_rt); 491 ro->ro_tableid = orig_rtableid; 492 ro->ro_rt = icmp_mtudisc_clone( 493 satosin(&ro->ro_dst)->sin_addr, ro->ro_tableid, 0); 494 } 495 #endif 496 /* 497 * This case can happen if the user changed the MTU 498 * of an interface after enabling IP on it. Because 499 * most netifs don't keep track of routes pointing to 500 * them, there is no way for one to update all its 501 * routes when the MTU is changed. 502 */ 503 if (rtisvalid(ro->ro_rt) && 504 ISSET(ro->ro_rt->rt_flags, RTF_HOST) && 505 !(ro->ro_rt->rt_locks & RTV_MTU) && 506 (ro->ro_rt->rt_mtu > ifp->if_mtu)) { 507 ro->ro_rt->rt_mtu = ifp->if_mtu; 508 } 509 ipstat_inc(ips_cantfrag); 510 goto bad; 511 } 512 513 error = ip_fragment(m, ifp, mtu); 514 if (error) { 515 m = m0 = NULL; 516 goto bad; 517 } 518 519 for (; m; m = m0) { 520 m0 = m->m_nextpkt; 521 m->m_nextpkt = 0; 522 if (error == 0) 523 error = ifp->if_output(ifp, m, sintosa(dst), ro->ro_rt); 524 else 525 m_freem(m); 526 } 527 528 if (error == 0) 529 ipstat_inc(ips_fragmented); 530 531 done: 532 if (ro == &iproute && ro->ro_rt) 533 rtfree(ro->ro_rt); 534 if_put(ifp); 535 return (error); 536 bad: 537 m_freem(m0); 538 goto done; 539 } 540 541 #ifdef IPSEC 542 struct tdb * 543 ip_output_ipsec_lookup(struct mbuf *m, int hlen, int *error, struct inpcb *inp, 544 int ipsecflowinfo) 545 { 546 struct m_tag *mtag; 547 struct tdb_ident *tdbi; 548 struct tdb *tdb; 549 550 /* Do we have any pending SAs to apply ? */ 551 tdb = ipsp_spd_lookup(m, AF_INET, hlen, error, IPSP_DIRECTION_OUT, 552 NULL, inp, ipsecflowinfo); 553 if (tdb == NULL) 554 return NULL; 555 /* Loop detection */ 556 for (mtag = m_tag_first(m); mtag != NULL; mtag = m_tag_next(m, mtag)) { 557 if (mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_DONE) 558 continue; 559 tdbi = (struct tdb_ident *)(mtag + 1); 560 if (tdbi->spi == tdb->tdb_spi && 561 tdbi->proto == tdb->tdb_sproto && 562 tdbi->rdomain == tdb->tdb_rdomain && 563 !memcmp(&tdbi->dst, &tdb->tdb_dst, 564 sizeof(union sockaddr_union))) { 565 /* no IPsec needed */ 566 return NULL; 567 } 568 } 569 return tdb; 570 } 571 572 int 573 ip_output_ipsec_send(struct tdb *tdb, struct mbuf *m, struct route *ro, int fwd) 574 { 575 #if NPF > 0 576 struct ifnet *encif; 577 #endif 578 struct ip *ip; 579 int error; 580 581 #if NPF > 0 582 /* 583 * Packet filter 584 */ 585 if ((encif = enc_getif(tdb->tdb_rdomain, tdb->tdb_tap)) == NULL || 586 pf_test(AF_INET, fwd ? PF_FWD : PF_OUT, encif, &m) != PF_PASS) { 587 m_freem(m); 588 return EACCES; 589 } 590 if (m == NULL) 591 return 0; 592 /* 593 * PF_TAG_REROUTE handling or not... 594 * Packet is entering IPsec so the routing is 595 * already overruled by the IPsec policy. 596 * Until now the change was not reconsidered. 597 * What's the behaviour? 598 */ 599 in_proto_cksum_out(m, encif); 600 #endif 601 602 /* Check if we are allowed to fragment */ 603 ip = mtod(m, struct ip *); 604 if (ip_mtudisc && (ip->ip_off & htons(IP_DF)) && tdb->tdb_mtu && 605 ntohs(ip->ip_len) > tdb->tdb_mtu && 606 tdb->tdb_mtutimeout > gettime()) { 607 struct rtentry *rt = NULL; 608 int rt_mtucloned = 0; 609 int transportmode = 0; 610 611 transportmode = (tdb->tdb_dst.sa.sa_family == AF_INET) && 612 (tdb->tdb_dst.sin.sin_addr.s_addr == ip->ip_dst.s_addr); 613 614 /* Find a host route to store the mtu in */ 615 if (ro != NULL) 616 rt = ro->ro_rt; 617 /* but don't add a PMTU route for transport mode SAs */ 618 if (transportmode) 619 rt = NULL; 620 else if (rt == NULL || (rt->rt_flags & RTF_HOST) == 0) { 621 rt = icmp_mtudisc_clone(ip->ip_dst, 622 m->m_pkthdr.ph_rtableid, 1); 623 rt_mtucloned = 1; 624 } 625 DPRINTF(("%s: spi %08x mtu %d rt %p cloned %d\n", __func__, 626 ntohl(tdb->tdb_spi), tdb->tdb_mtu, rt, rt_mtucloned)); 627 if (rt != NULL) { 628 rt->rt_mtu = tdb->tdb_mtu; 629 if (ro != NULL && ro->ro_rt != NULL) { 630 rtfree(ro->ro_rt); 631 ro->ro_rt = rtalloc(&ro->ro_dst, RT_RESOLVE, 632 m->m_pkthdr.ph_rtableid); 633 } 634 if (rt_mtucloned) 635 rtfree(rt); 636 } 637 ipsec_adjust_mtu(m, tdb->tdb_mtu); 638 m_freem(m); 639 return EMSGSIZE; 640 } 641 /* propagate IP_DF for v4-over-v6 */ 642 if (ip_mtudisc && ip->ip_off & htons(IP_DF)) 643 SET(m->m_pkthdr.csum_flags, M_IPV6_DF_OUT); 644 645 /* 646 * Clear these -- they'll be set in the recursive invocation 647 * as needed. 648 */ 649 m->m_flags &= ~(M_MCAST | M_BCAST); 650 651 /* Callee frees mbuf */ 652 error = ipsp_process_packet(m, tdb, AF_INET, 0); 653 if (error) { 654 ipsecstat_inc(ipsec_odrops); 655 tdb->tdb_odrops++; 656 } 657 return error; 658 } 659 #endif /* IPSEC */ 660 661 int 662 ip_fragment(struct mbuf *m, struct ifnet *ifp, u_long mtu) 663 { 664 struct ip *ip, *mhip; 665 struct mbuf *m0; 666 int len, hlen, off; 667 int mhlen, firstlen; 668 struct mbuf **mnext; 669 int fragments = 0; 670 int error = 0; 671 672 ip = mtod(m, struct ip *); 673 hlen = ip->ip_hl << 2; 674 675 len = (mtu - hlen) &~ 7; 676 if (len < 8) { 677 m_freem(m); 678 return (EMSGSIZE); 679 } 680 681 /* 682 * If we are doing fragmentation, we can't defer TCP/UDP 683 * checksumming; compute the checksum and clear the flag. 684 */ 685 in_proto_cksum_out(m, NULL); 686 firstlen = len; 687 mnext = &m->m_nextpkt; 688 689 /* 690 * Loop through length of segment after first fragment, 691 * make new header and copy data of each part and link onto chain. 692 */ 693 m0 = m; 694 mhlen = sizeof (struct ip); 695 for (off = hlen + len; off < ntohs(ip->ip_len); off += len) { 696 MGETHDR(m, M_DONTWAIT, MT_HEADER); 697 if (m == NULL) { 698 ipstat_inc(ips_odropped); 699 error = ENOBUFS; 700 goto sendorfree; 701 } 702 *mnext = m; 703 mnext = &m->m_nextpkt; 704 m->m_data += max_linkhdr; 705 mhip = mtod(m, struct ip *); 706 *mhip = *ip; 707 /* we must inherit MCAST/BCAST flags, routing table and prio */ 708 m->m_flags |= m0->m_flags & (M_MCAST|M_BCAST); 709 m->m_pkthdr.ph_rtableid = m0->m_pkthdr.ph_rtableid; 710 m->m_pkthdr.pf.prio = m0->m_pkthdr.pf.prio; 711 if (hlen > sizeof (struct ip)) { 712 mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); 713 mhip->ip_hl = mhlen >> 2; 714 } 715 m->m_len = mhlen; 716 mhip->ip_off = ((off - hlen) >> 3) + 717 (ntohs(ip->ip_off) & ~IP_MF); 718 if (ip->ip_off & htons(IP_MF)) 719 mhip->ip_off |= IP_MF; 720 if (off + len >= ntohs(ip->ip_len)) 721 len = ntohs(ip->ip_len) - off; 722 else 723 mhip->ip_off |= IP_MF; 724 mhip->ip_len = htons((u_int16_t)(len + mhlen)); 725 m->m_next = m_copym(m0, off, len, M_NOWAIT); 726 if (m->m_next == 0) { 727 ipstat_inc(ips_odropped); 728 error = ENOBUFS; 729 goto sendorfree; 730 } 731 m->m_pkthdr.len = mhlen + len; 732 m->m_pkthdr.ph_ifidx = 0; 733 mhip->ip_off = htons((u_int16_t)mhip->ip_off); 734 mhip->ip_sum = 0; 735 if (in_ifcap_cksum(m, ifp, IFCAP_CSUM_IPv4)) 736 m->m_pkthdr.csum_flags |= M_IPV4_CSUM_OUT; 737 else { 738 ipstat_inc(ips_outswcsum); 739 mhip->ip_sum = in_cksum(m, mhlen); 740 } 741 ipstat_inc(ips_ofragments); 742 fragments++; 743 } 744 /* 745 * Update first fragment by trimming what's been copied out 746 * and updating header, then send each fragment (in order). 747 */ 748 m = m0; 749 m_adj(m, hlen + firstlen - ntohs(ip->ip_len)); 750 m->m_pkthdr.len = hlen + firstlen; 751 ip->ip_len = htons((u_int16_t)m->m_pkthdr.len); 752 ip->ip_off |= htons(IP_MF); 753 ip->ip_sum = 0; 754 if (in_ifcap_cksum(m, ifp, IFCAP_CSUM_IPv4)) 755 m->m_pkthdr.csum_flags |= M_IPV4_CSUM_OUT; 756 else { 757 ipstat_inc(ips_outswcsum); 758 ip->ip_sum = in_cksum(m, hlen); 759 } 760 sendorfree: 761 if (error) { 762 for (m = m0; m; m = m0) { 763 m0 = m->m_nextpkt; 764 m->m_nextpkt = NULL; 765 m_freem(m); 766 } 767 } 768 769 return (error); 770 } 771 772 /* 773 * Insert IP options into preformed packet. 774 * Adjust IP destination as required for IP source routing, 775 * as indicated by a non-zero in_addr at the start of the options. 776 */ 777 struct mbuf * 778 ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen) 779 { 780 struct ipoption *p = mtod(opt, struct ipoption *); 781 struct mbuf *n; 782 struct ip *ip = mtod(m, struct ip *); 783 unsigned int optlen; 784 785 optlen = opt->m_len - sizeof(p->ipopt_dst); 786 if (optlen + ntohs(ip->ip_len) > IP_MAXPACKET) 787 return (m); /* XXX should fail */ 788 if (p->ipopt_dst.s_addr) 789 ip->ip_dst = p->ipopt_dst; 790 if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) { 791 MGETHDR(n, M_DONTWAIT, MT_HEADER); 792 if (n == NULL) 793 return (m); 794 M_MOVE_HDR(n, m); 795 n->m_pkthdr.len += optlen; 796 m->m_len -= sizeof(struct ip); 797 m->m_data += sizeof(struct ip); 798 n->m_next = m; 799 m = n; 800 m->m_len = optlen + sizeof(struct ip); 801 m->m_data += max_linkhdr; 802 memcpy(mtod(m, caddr_t), ip, sizeof(struct ip)); 803 } else { 804 m->m_data -= optlen; 805 m->m_len += optlen; 806 m->m_pkthdr.len += optlen; 807 memmove(mtod(m, caddr_t), (caddr_t)ip, sizeof(struct ip)); 808 } 809 ip = mtod(m, struct ip *); 810 memcpy(ip + 1, p->ipopt_list, optlen); 811 *phlen = sizeof(struct ip) + optlen; 812 ip->ip_len = htons(ntohs(ip->ip_len) + optlen); 813 return (m); 814 } 815 816 /* 817 * Copy options from ip to jp, 818 * omitting those not copied during fragmentation. 819 */ 820 int 821 ip_optcopy(struct ip *ip, struct ip *jp) 822 { 823 u_char *cp, *dp; 824 int opt, optlen, cnt; 825 826 cp = (u_char *)(ip + 1); 827 dp = (u_char *)(jp + 1); 828 cnt = (ip->ip_hl << 2) - sizeof (struct ip); 829 for (; cnt > 0; cnt -= optlen, cp += optlen) { 830 opt = cp[0]; 831 if (opt == IPOPT_EOL) 832 break; 833 if (opt == IPOPT_NOP) { 834 /* Preserve for IP mcast tunnel's LSRR alignment. */ 835 *dp++ = IPOPT_NOP; 836 optlen = 1; 837 continue; 838 } 839 #ifdef DIAGNOSTIC 840 if (cnt < IPOPT_OLEN + sizeof(*cp)) 841 panic("malformed IPv4 option passed to ip_optcopy"); 842 #endif 843 optlen = cp[IPOPT_OLEN]; 844 #ifdef DIAGNOSTIC 845 if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) 846 panic("malformed IPv4 option passed to ip_optcopy"); 847 #endif 848 /* bogus lengths should have been caught by ip_dooptions */ 849 if (optlen > cnt) 850 optlen = cnt; 851 if (IPOPT_COPIED(opt)) { 852 memcpy(dp, cp, optlen); 853 dp += optlen; 854 } 855 } 856 for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++) 857 *dp++ = IPOPT_EOL; 858 return (optlen); 859 } 860 861 /* 862 * IP socket option processing. 863 */ 864 int 865 ip_ctloutput(int op, struct socket *so, int level, int optname, 866 struct mbuf *m) 867 { 868 struct inpcb *inp = sotoinpcb(so); 869 int optval = 0; 870 struct proc *p = curproc; /* XXX */ 871 int error = 0; 872 u_int rtid = 0; 873 874 if (level != IPPROTO_IP) 875 return (EINVAL); 876 877 switch (op) { 878 case PRCO_SETOPT: 879 switch (optname) { 880 case IP_OPTIONS: 881 return (ip_pcbopts(&inp->inp_options, m)); 882 883 case IP_TOS: 884 case IP_TTL: 885 case IP_MINTTL: 886 case IP_RECVOPTS: 887 case IP_RECVRETOPTS: 888 case IP_RECVDSTADDR: 889 case IP_RECVIF: 890 case IP_RECVTTL: 891 case IP_RECVDSTPORT: 892 case IP_RECVRTABLE: 893 case IP_IPSECFLOWINFO: 894 if (m == NULL || m->m_len != sizeof(int)) 895 error = EINVAL; 896 else { 897 optval = *mtod(m, int *); 898 switch (optname) { 899 900 case IP_TOS: 901 inp->inp_ip.ip_tos = optval; 902 break; 903 904 case IP_TTL: 905 if (optval > 0 && optval <= MAXTTL) 906 inp->inp_ip.ip_ttl = optval; 907 else if (optval == -1) 908 inp->inp_ip.ip_ttl = ip_defttl; 909 else 910 error = EINVAL; 911 break; 912 913 case IP_MINTTL: 914 if (optval >= 0 && optval <= MAXTTL) 915 inp->inp_ip_minttl = optval; 916 else 917 error = EINVAL; 918 break; 919 #define OPTSET(bit) \ 920 if (optval) \ 921 inp->inp_flags |= bit; \ 922 else \ 923 inp->inp_flags &= ~bit; 924 925 case IP_RECVOPTS: 926 OPTSET(INP_RECVOPTS); 927 break; 928 929 case IP_RECVRETOPTS: 930 OPTSET(INP_RECVRETOPTS); 931 break; 932 933 case IP_RECVDSTADDR: 934 OPTSET(INP_RECVDSTADDR); 935 break; 936 case IP_RECVIF: 937 OPTSET(INP_RECVIF); 938 break; 939 case IP_RECVTTL: 940 OPTSET(INP_RECVTTL); 941 break; 942 case IP_RECVDSTPORT: 943 OPTSET(INP_RECVDSTPORT); 944 break; 945 case IP_RECVRTABLE: 946 OPTSET(INP_RECVRTABLE); 947 break; 948 case IP_IPSECFLOWINFO: 949 OPTSET(INP_IPSECFLOWINFO); 950 break; 951 } 952 } 953 break; 954 #undef OPTSET 955 956 case IP_MULTICAST_IF: 957 case IP_MULTICAST_TTL: 958 case IP_MULTICAST_LOOP: 959 case IP_ADD_MEMBERSHIP: 960 case IP_DROP_MEMBERSHIP: 961 error = ip_setmoptions(optname, &inp->inp_moptions, m, 962 inp->inp_rtableid); 963 break; 964 965 case IP_PORTRANGE: 966 if (m == NULL || m->m_len != sizeof(int)) 967 error = EINVAL; 968 else { 969 optval = *mtod(m, int *); 970 971 switch (optval) { 972 973 case IP_PORTRANGE_DEFAULT: 974 inp->inp_flags &= ~(INP_LOWPORT); 975 inp->inp_flags &= ~(INP_HIGHPORT); 976 break; 977 978 case IP_PORTRANGE_HIGH: 979 inp->inp_flags &= ~(INP_LOWPORT); 980 inp->inp_flags |= INP_HIGHPORT; 981 break; 982 983 case IP_PORTRANGE_LOW: 984 inp->inp_flags &= ~(INP_HIGHPORT); 985 inp->inp_flags |= INP_LOWPORT; 986 break; 987 988 default: 989 990 error = EINVAL; 991 break; 992 } 993 } 994 break; 995 case IP_AUTH_LEVEL: 996 case IP_ESP_TRANS_LEVEL: 997 case IP_ESP_NETWORK_LEVEL: 998 case IP_IPCOMP_LEVEL: 999 #ifndef IPSEC 1000 error = EOPNOTSUPP; 1001 #else 1002 if (m == NULL || m->m_len != sizeof(int)) { 1003 error = EINVAL; 1004 break; 1005 } 1006 optval = *mtod(m, int *); 1007 1008 if (optval < IPSEC_LEVEL_BYPASS || 1009 optval > IPSEC_LEVEL_UNIQUE) { 1010 error = EINVAL; 1011 break; 1012 } 1013 1014 switch (optname) { 1015 case IP_AUTH_LEVEL: 1016 if (optval < IPSEC_AUTH_LEVEL_DEFAULT && 1017 suser(p)) { 1018 error = EACCES; 1019 break; 1020 } 1021 inp->inp_seclevel[SL_AUTH] = optval; 1022 break; 1023 1024 case IP_ESP_TRANS_LEVEL: 1025 if (optval < IPSEC_ESP_TRANS_LEVEL_DEFAULT && 1026 suser(p)) { 1027 error = EACCES; 1028 break; 1029 } 1030 inp->inp_seclevel[SL_ESP_TRANS] = optval; 1031 break; 1032 1033 case IP_ESP_NETWORK_LEVEL: 1034 if (optval < IPSEC_ESP_NETWORK_LEVEL_DEFAULT && 1035 suser(p)) { 1036 error = EACCES; 1037 break; 1038 } 1039 inp->inp_seclevel[SL_ESP_NETWORK] = optval; 1040 break; 1041 case IP_IPCOMP_LEVEL: 1042 if (optval < IPSEC_IPCOMP_LEVEL_DEFAULT && 1043 suser(p)) { 1044 error = EACCES; 1045 break; 1046 } 1047 inp->inp_seclevel[SL_IPCOMP] = optval; 1048 break; 1049 } 1050 #endif 1051 break; 1052 1053 case IP_IPSEC_LOCAL_ID: 1054 case IP_IPSEC_REMOTE_ID: 1055 error = EOPNOTSUPP; 1056 break; 1057 case SO_RTABLE: 1058 if (m == NULL || m->m_len < sizeof(u_int)) { 1059 error = EINVAL; 1060 break; 1061 } 1062 rtid = *mtod(m, u_int *); 1063 if (inp->inp_rtableid == rtid) 1064 break; 1065 /* needs privileges to switch when already set */ 1066 if (p->p_p->ps_rtableid != rtid && 1067 p->p_p->ps_rtableid != 0 && 1068 (error = suser(p)) != 0) 1069 break; 1070 /* table must exist */ 1071 if (!rtable_exists(rtid)) { 1072 error = EINVAL; 1073 break; 1074 } 1075 if (inp->inp_lport) { 1076 error = EBUSY; 1077 break; 1078 } 1079 inp->inp_rtableid = rtid; 1080 in_pcbrehash(inp); 1081 break; 1082 case IP_PIPEX: 1083 if (m != NULL && m->m_len == sizeof(int)) 1084 inp->inp_pipex = *mtod(m, int *); 1085 else 1086 error = EINVAL; 1087 break; 1088 1089 default: 1090 error = ENOPROTOOPT; 1091 break; 1092 } 1093 break; 1094 1095 case PRCO_GETOPT: 1096 switch (optname) { 1097 case IP_OPTIONS: 1098 case IP_RETOPTS: 1099 if (inp->inp_options) { 1100 m->m_len = inp->inp_options->m_len; 1101 memcpy(mtod(m, caddr_t), 1102 mtod(inp->inp_options, caddr_t), m->m_len); 1103 } else 1104 m->m_len = 0; 1105 break; 1106 1107 case IP_TOS: 1108 case IP_TTL: 1109 case IP_MINTTL: 1110 case IP_RECVOPTS: 1111 case IP_RECVRETOPTS: 1112 case IP_RECVDSTADDR: 1113 case IP_RECVIF: 1114 case IP_RECVTTL: 1115 case IP_RECVDSTPORT: 1116 case IP_RECVRTABLE: 1117 case IP_IPSECFLOWINFO: 1118 case IP_IPDEFTTL: 1119 m->m_len = sizeof(int); 1120 switch (optname) { 1121 1122 case IP_TOS: 1123 optval = inp->inp_ip.ip_tos; 1124 break; 1125 1126 case IP_TTL: 1127 optval = inp->inp_ip.ip_ttl; 1128 break; 1129 1130 case IP_MINTTL: 1131 optval = inp->inp_ip_minttl; 1132 break; 1133 1134 case IP_IPDEFTTL: 1135 optval = ip_defttl; 1136 break; 1137 1138 #define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) 1139 1140 case IP_RECVOPTS: 1141 optval = OPTBIT(INP_RECVOPTS); 1142 break; 1143 1144 case IP_RECVRETOPTS: 1145 optval = OPTBIT(INP_RECVRETOPTS); 1146 break; 1147 1148 case IP_RECVDSTADDR: 1149 optval = OPTBIT(INP_RECVDSTADDR); 1150 break; 1151 case IP_RECVIF: 1152 optval = OPTBIT(INP_RECVIF); 1153 break; 1154 case IP_RECVTTL: 1155 optval = OPTBIT(INP_RECVTTL); 1156 break; 1157 case IP_RECVDSTPORT: 1158 optval = OPTBIT(INP_RECVDSTPORT); 1159 break; 1160 case IP_RECVRTABLE: 1161 optval = OPTBIT(INP_RECVRTABLE); 1162 break; 1163 case IP_IPSECFLOWINFO: 1164 optval = OPTBIT(INP_IPSECFLOWINFO); 1165 break; 1166 } 1167 *mtod(m, int *) = optval; 1168 break; 1169 1170 case IP_MULTICAST_IF: 1171 case IP_MULTICAST_TTL: 1172 case IP_MULTICAST_LOOP: 1173 case IP_ADD_MEMBERSHIP: 1174 case IP_DROP_MEMBERSHIP: 1175 error = ip_getmoptions(optname, inp->inp_moptions, m); 1176 break; 1177 1178 case IP_PORTRANGE: 1179 m->m_len = sizeof(int); 1180 1181 if (inp->inp_flags & INP_HIGHPORT) 1182 optval = IP_PORTRANGE_HIGH; 1183 else if (inp->inp_flags & INP_LOWPORT) 1184 optval = IP_PORTRANGE_LOW; 1185 else 1186 optval = 0; 1187 1188 *mtod(m, int *) = optval; 1189 break; 1190 1191 case IP_AUTH_LEVEL: 1192 case IP_ESP_TRANS_LEVEL: 1193 case IP_ESP_NETWORK_LEVEL: 1194 case IP_IPCOMP_LEVEL: 1195 #ifndef IPSEC 1196 m->m_len = sizeof(int); 1197 *mtod(m, int *) = IPSEC_LEVEL_NONE; 1198 #else 1199 m->m_len = sizeof(int); 1200 switch (optname) { 1201 case IP_AUTH_LEVEL: 1202 optval = inp->inp_seclevel[SL_AUTH]; 1203 break; 1204 1205 case IP_ESP_TRANS_LEVEL: 1206 optval = inp->inp_seclevel[SL_ESP_TRANS]; 1207 break; 1208 1209 case IP_ESP_NETWORK_LEVEL: 1210 optval = inp->inp_seclevel[SL_ESP_NETWORK]; 1211 break; 1212 case IP_IPCOMP_LEVEL: 1213 optval = inp->inp_seclevel[SL_IPCOMP]; 1214 break; 1215 } 1216 *mtod(m, int *) = optval; 1217 #endif 1218 break; 1219 case IP_IPSEC_LOCAL_ID: 1220 case IP_IPSEC_REMOTE_ID: 1221 error = EOPNOTSUPP; 1222 break; 1223 case SO_RTABLE: 1224 m->m_len = sizeof(u_int); 1225 *mtod(m, u_int *) = inp->inp_rtableid; 1226 break; 1227 case IP_PIPEX: 1228 m->m_len = sizeof(int); 1229 *mtod(m, int *) = inp->inp_pipex; 1230 break; 1231 default: 1232 error = ENOPROTOOPT; 1233 break; 1234 } 1235 break; 1236 } 1237 return (error); 1238 } 1239 1240 /* 1241 * Set up IP options in pcb for insertion in output packets. 1242 * Store in mbuf with pointer in pcbopt, adding pseudo-option 1243 * with destination address if source routed. 1244 */ 1245 int 1246 ip_pcbopts(struct mbuf **pcbopt, struct mbuf *m) 1247 { 1248 struct mbuf *n; 1249 struct ipoption *p; 1250 int cnt, off, optlen; 1251 u_char *cp; 1252 u_char opt; 1253 1254 /* turn off any old options */ 1255 m_freem(*pcbopt); 1256 *pcbopt = NULL; 1257 if (m == NULL || m->m_len == 0) { 1258 /* 1259 * Only turning off any previous options. 1260 */ 1261 return (0); 1262 } 1263 1264 if (m->m_len % sizeof(int32_t) || 1265 m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr)) 1266 return (EINVAL); 1267 1268 /* Don't sleep because NET_LOCK() is hold. */ 1269 if ((n = m_get(M_NOWAIT, MT_SOOPTS)) == NULL) 1270 return (ENOBUFS); 1271 p = mtod(n, struct ipoption *); 1272 memset(p, 0, sizeof (*p)); /* 0 = IPOPT_EOL, needed for padding */ 1273 n->m_len = sizeof(struct in_addr); 1274 1275 off = 0; 1276 cnt = m->m_len; 1277 cp = mtod(m, u_char *); 1278 1279 while (cnt > 0) { 1280 opt = cp[IPOPT_OPTVAL]; 1281 1282 if (opt == IPOPT_NOP || opt == IPOPT_EOL) { 1283 optlen = 1; 1284 } else { 1285 if (cnt < IPOPT_OLEN + sizeof(*cp)) 1286 goto bad; 1287 optlen = cp[IPOPT_OLEN]; 1288 if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) 1289 goto bad; 1290 } 1291 switch (opt) { 1292 default: 1293 memcpy(p->ipopt_list + off, cp, optlen); 1294 break; 1295 1296 case IPOPT_LSRR: 1297 case IPOPT_SSRR: 1298 /* 1299 * user process specifies route as: 1300 * ->A->B->C->D 1301 * D must be our final destination (but we can't 1302 * check that since we may not have connected yet). 1303 * A is first hop destination, which doesn't appear in 1304 * actual IP option, but is stored before the options. 1305 */ 1306 if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr)) 1307 goto bad; 1308 1309 /* 1310 * Optlen is smaller because first address is popped. 1311 * Cnt and cp will be adjusted a bit later to reflect 1312 * this. 1313 */ 1314 optlen -= sizeof(struct in_addr); 1315 p->ipopt_list[off + IPOPT_OPTVAL] = opt; 1316 p->ipopt_list[off + IPOPT_OLEN] = optlen; 1317 1318 /* 1319 * Move first hop before start of options. 1320 */ 1321 memcpy(&p->ipopt_dst, cp + IPOPT_OFFSET, 1322 sizeof(struct in_addr)); 1323 cp += sizeof(struct in_addr); 1324 cnt -= sizeof(struct in_addr); 1325 /* 1326 * Then copy rest of options 1327 */ 1328 memcpy(p->ipopt_list + off + IPOPT_OFFSET, 1329 cp + IPOPT_OFFSET, optlen - IPOPT_OFFSET); 1330 break; 1331 } 1332 off += optlen; 1333 cp += optlen; 1334 cnt -= optlen; 1335 1336 if (opt == IPOPT_EOL) 1337 break; 1338 } 1339 /* pad options to next word, since p was zeroed just adjust off */ 1340 off = (off + sizeof(int32_t) - 1) & ~(sizeof(int32_t) - 1); 1341 n->m_len += off; 1342 if (n->m_len > sizeof(*p)) { 1343 bad: 1344 m_freem(n); 1345 return (EINVAL); 1346 } 1347 1348 *pcbopt = n; 1349 return (0); 1350 } 1351 1352 /* 1353 * Lookup the interface based on the information in the ip_mreqn struct. 1354 */ 1355 int 1356 ip_multicast_if(struct ip_mreqn *mreq, u_int rtableid, unsigned int *ifidx) 1357 { 1358 struct sockaddr_in sin; 1359 struct rtentry *rt; 1360 1361 /* 1362 * In case userland provides the imr_ifindex use this as interface. 1363 * If no interface address was provided, use the interface of 1364 * the route to the given multicast address. 1365 */ 1366 if (mreq->imr_ifindex != 0) { 1367 *ifidx = mreq->imr_ifindex; 1368 } else if (mreq->imr_address.s_addr == INADDR_ANY) { 1369 memset(&sin, 0, sizeof(sin)); 1370 sin.sin_len = sizeof(sin); 1371 sin.sin_family = AF_INET; 1372 sin.sin_addr = mreq->imr_multiaddr; 1373 rt = rtalloc(sintosa(&sin), RT_RESOLVE, rtableid); 1374 if (!rtisvalid(rt)) { 1375 rtfree(rt); 1376 return EADDRNOTAVAIL; 1377 } 1378 *ifidx = rt->rt_ifidx; 1379 rtfree(rt); 1380 } else { 1381 memset(&sin, 0, sizeof(sin)); 1382 sin.sin_len = sizeof(sin); 1383 sin.sin_family = AF_INET; 1384 sin.sin_addr = mreq->imr_address; 1385 rt = rtalloc(sintosa(&sin), 0, rtableid); 1386 if (!rtisvalid(rt) || !ISSET(rt->rt_flags, RTF_LOCAL)) { 1387 rtfree(rt); 1388 return EADDRNOTAVAIL; 1389 } 1390 *ifidx = rt->rt_ifidx; 1391 rtfree(rt); 1392 } 1393 1394 return 0; 1395 } 1396 1397 /* 1398 * Set the IP multicast options in response to user setsockopt(). 1399 */ 1400 int 1401 ip_setmoptions(int optname, struct ip_moptions **imop, struct mbuf *m, 1402 u_int rtableid) 1403 { 1404 struct in_addr addr; 1405 struct in_ifaddr *ia; 1406 struct ip_mreqn mreqn; 1407 struct ifnet *ifp = NULL; 1408 struct ip_moptions *imo = *imop; 1409 struct in_multi **immp; 1410 struct sockaddr_in sin; 1411 unsigned int ifidx; 1412 int i, error = 0; 1413 u_char loop; 1414 1415 if (imo == NULL) { 1416 /* 1417 * No multicast option buffer attached to the pcb; 1418 * allocate one and initialize to default values. 1419 */ 1420 imo = malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK|M_ZERO); 1421 immp = mallocarray(IP_MIN_MEMBERSHIPS, sizeof(*immp), M_IPMOPTS, 1422 M_WAITOK|M_ZERO); 1423 *imop = imo; 1424 imo->imo_ifidx = 0; 1425 imo->imo_ttl = IP_DEFAULT_MULTICAST_TTL; 1426 imo->imo_loop = IP_DEFAULT_MULTICAST_LOOP; 1427 imo->imo_num_memberships = 0; 1428 imo->imo_max_memberships = IP_MIN_MEMBERSHIPS; 1429 imo->imo_membership = immp; 1430 } 1431 1432 switch (optname) { 1433 1434 case IP_MULTICAST_IF: 1435 /* 1436 * Select the interface for outgoing multicast packets. 1437 */ 1438 if (m == NULL) { 1439 error = EINVAL; 1440 break; 1441 } 1442 if (m->m_len == sizeof(struct in_addr)) { 1443 addr = *(mtod(m, struct in_addr *)); 1444 } else if (m->m_len == sizeof(struct ip_mreq) || 1445 m->m_len == sizeof(struct ip_mreqn)) { 1446 memset(&mreqn, 0, sizeof(mreqn)); 1447 memcpy(&mreqn, mtod(m, void *), m->m_len); 1448 1449 /* 1450 * If an interface index is given use this 1451 * index to set the imo_ifidx but check first 1452 * that the interface actually exists. 1453 * In the other case just set the addr to 1454 * the imr_address and fall through to the 1455 * regular code. 1456 */ 1457 if (mreqn.imr_ifindex != 0) { 1458 ifp = if_get(mreqn.imr_ifindex); 1459 if (ifp == NULL || 1460 ifp->if_rdomain != rtable_l2(rtableid)) { 1461 error = EADDRNOTAVAIL; 1462 if_put(ifp); 1463 break; 1464 } 1465 imo->imo_ifidx = ifp->if_index; 1466 if_put(ifp); 1467 break; 1468 } else 1469 addr = mreqn.imr_address; 1470 } else { 1471 error = EINVAL; 1472 break; 1473 } 1474 /* 1475 * INADDR_ANY is used to remove a previous selection. 1476 * When no interface is selected, a default one is 1477 * chosen every time a multicast packet is sent. 1478 */ 1479 if (addr.s_addr == INADDR_ANY) { 1480 imo->imo_ifidx = 0; 1481 break; 1482 } 1483 /* 1484 * The selected interface is identified by its local 1485 * IP address. Find the interface and confirm that 1486 * it supports multicasting. 1487 */ 1488 memset(&sin, 0, sizeof(sin)); 1489 sin.sin_len = sizeof(sin); 1490 sin.sin_family = AF_INET; 1491 sin.sin_addr = addr; 1492 ia = ifatoia(ifa_ifwithaddr(sintosa(&sin), rtableid)); 1493 if (ia == NULL || 1494 (ia->ia_ifp->if_flags & IFF_MULTICAST) == 0) { 1495 error = EADDRNOTAVAIL; 1496 break; 1497 } 1498 imo->imo_ifidx = ia->ia_ifp->if_index; 1499 break; 1500 1501 case IP_MULTICAST_TTL: 1502 /* 1503 * Set the IP time-to-live for outgoing multicast packets. 1504 */ 1505 if (m == NULL || m->m_len != 1) { 1506 error = EINVAL; 1507 break; 1508 } 1509 imo->imo_ttl = *(mtod(m, u_char *)); 1510 break; 1511 1512 case IP_MULTICAST_LOOP: 1513 /* 1514 * Set the loopback flag for outgoing multicast packets. 1515 * Must be zero or one. 1516 */ 1517 if (m == NULL || m->m_len != 1 || 1518 (loop = *(mtod(m, u_char *))) > 1) { 1519 error = EINVAL; 1520 break; 1521 } 1522 imo->imo_loop = loop; 1523 break; 1524 1525 case IP_ADD_MEMBERSHIP: 1526 /* 1527 * Add a multicast group membership. 1528 * Group must be a valid IP multicast address. 1529 */ 1530 if (m == NULL || !(m->m_len == sizeof(struct ip_mreq) || 1531 m->m_len == sizeof(struct ip_mreqn))) { 1532 error = EINVAL; 1533 break; 1534 } 1535 memset(&mreqn, 0, sizeof(mreqn)); 1536 memcpy(&mreqn, mtod(m, void *), m->m_len); 1537 if (!IN_MULTICAST(mreqn.imr_multiaddr.s_addr)) { 1538 error = EINVAL; 1539 break; 1540 } 1541 1542 error = ip_multicast_if(&mreqn, rtableid, &ifidx); 1543 if (error) 1544 break; 1545 1546 /* 1547 * See if we found an interface, and confirm that it 1548 * supports multicast. 1549 */ 1550 ifp = if_get(ifidx); 1551 if (ifp == NULL || ifp->if_rdomain != rtable_l2(rtableid) || 1552 (ifp->if_flags & IFF_MULTICAST) == 0) { 1553 error = EADDRNOTAVAIL; 1554 if_put(ifp); 1555 break; 1556 } 1557 1558 /* 1559 * See if the membership already exists or if all the 1560 * membership slots are full. 1561 */ 1562 for (i = 0; i < imo->imo_num_memberships; ++i) { 1563 if (imo->imo_membership[i]->inm_ifidx == ifidx && 1564 imo->imo_membership[i]->inm_addr.s_addr 1565 == mreqn.imr_multiaddr.s_addr) 1566 break; 1567 } 1568 if (i < imo->imo_num_memberships) { 1569 error = EADDRINUSE; 1570 if_put(ifp); 1571 break; 1572 } 1573 if (imo->imo_num_memberships == imo->imo_max_memberships) { 1574 struct in_multi **nmships, **omships; 1575 size_t newmax; 1576 /* 1577 * Resize the vector to next power-of-two minus 1. If 1578 * the size would exceed the maximum then we know we've 1579 * really run out of entries. Otherwise, we reallocate 1580 * the vector. 1581 */ 1582 nmships = NULL; 1583 omships = imo->imo_membership; 1584 newmax = ((imo->imo_max_memberships + 1) * 2) - 1; 1585 if (newmax <= IP_MAX_MEMBERSHIPS) { 1586 nmships = mallocarray(newmax, sizeof(*nmships), 1587 M_IPMOPTS, M_NOWAIT|M_ZERO); 1588 if (nmships != NULL) { 1589 memcpy(nmships, omships, 1590 sizeof(*omships) * 1591 imo->imo_max_memberships); 1592 free(omships, M_IPMOPTS, 1593 sizeof(*omships) * 1594 imo->imo_max_memberships); 1595 imo->imo_membership = nmships; 1596 imo->imo_max_memberships = newmax; 1597 } 1598 } 1599 if (nmships == NULL) { 1600 error = ENOBUFS; 1601 if_put(ifp); 1602 break; 1603 } 1604 } 1605 /* 1606 * Everything looks good; add a new record to the multicast 1607 * address list for the given interface. 1608 */ 1609 if ((imo->imo_membership[i] = 1610 in_addmulti(&mreqn.imr_multiaddr, ifp)) == NULL) { 1611 error = ENOBUFS; 1612 if_put(ifp); 1613 break; 1614 } 1615 ++imo->imo_num_memberships; 1616 if_put(ifp); 1617 break; 1618 1619 case IP_DROP_MEMBERSHIP: 1620 /* 1621 * Drop a multicast group membership. 1622 * Group must be a valid IP multicast address. 1623 */ 1624 if (m == NULL || !(m->m_len == sizeof(struct ip_mreq) || 1625 m->m_len == sizeof(struct ip_mreqn))) { 1626 error = EINVAL; 1627 break; 1628 } 1629 memset(&mreqn, 0, sizeof(mreqn)); 1630 memcpy(&mreqn, mtod(m, void *), m->m_len); 1631 if (!IN_MULTICAST(mreqn.imr_multiaddr.s_addr)) { 1632 error = EINVAL; 1633 break; 1634 } 1635 1636 /* 1637 * If an interface address was specified, get a pointer 1638 * to its ifnet structure. 1639 */ 1640 error = ip_multicast_if(&mreqn, rtableid, &ifidx); 1641 if (error) 1642 break; 1643 1644 /* 1645 * Find the membership in the membership array. 1646 */ 1647 for (i = 0; i < imo->imo_num_memberships; ++i) { 1648 if ((ifidx == 0 || 1649 imo->imo_membership[i]->inm_ifidx == ifidx) && 1650 imo->imo_membership[i]->inm_addr.s_addr == 1651 mreqn.imr_multiaddr.s_addr) 1652 break; 1653 } 1654 if (i == imo->imo_num_memberships) { 1655 error = EADDRNOTAVAIL; 1656 break; 1657 } 1658 /* 1659 * Give up the multicast address record to which the 1660 * membership points. 1661 */ 1662 in_delmulti(imo->imo_membership[i]); 1663 /* 1664 * Remove the gap in the membership array. 1665 */ 1666 for (++i; i < imo->imo_num_memberships; ++i) 1667 imo->imo_membership[i-1] = imo->imo_membership[i]; 1668 --imo->imo_num_memberships; 1669 break; 1670 1671 default: 1672 error = EOPNOTSUPP; 1673 break; 1674 } 1675 1676 /* 1677 * If all options have default values, no need to keep the data. 1678 */ 1679 if (imo->imo_ifidx == 0 && 1680 imo->imo_ttl == IP_DEFAULT_MULTICAST_TTL && 1681 imo->imo_loop == IP_DEFAULT_MULTICAST_LOOP && 1682 imo->imo_num_memberships == 0) { 1683 free(imo->imo_membership , M_IPMOPTS, 1684 imo->imo_max_memberships * sizeof(struct in_multi *)); 1685 free(*imop, M_IPMOPTS, sizeof(**imop)); 1686 *imop = NULL; 1687 } 1688 1689 return (error); 1690 } 1691 1692 /* 1693 * Return the IP multicast options in response to user getsockopt(). 1694 */ 1695 int 1696 ip_getmoptions(int optname, struct ip_moptions *imo, struct mbuf *m) 1697 { 1698 u_char *ttl; 1699 u_char *loop; 1700 struct in_addr *addr; 1701 struct in_ifaddr *ia; 1702 struct ifnet *ifp; 1703 1704 switch (optname) { 1705 1706 case IP_MULTICAST_IF: 1707 addr = mtod(m, struct in_addr *); 1708 m->m_len = sizeof(struct in_addr); 1709 if (imo == NULL || (ifp = if_get(imo->imo_ifidx)) == NULL) 1710 addr->s_addr = INADDR_ANY; 1711 else { 1712 IFP_TO_IA(ifp, ia); 1713 if_put(ifp); 1714 addr->s_addr = (ia == NULL) ? INADDR_ANY 1715 : ia->ia_addr.sin_addr.s_addr; 1716 } 1717 return (0); 1718 1719 case IP_MULTICAST_TTL: 1720 ttl = mtod(m, u_char *); 1721 m->m_len = 1; 1722 *ttl = (imo == NULL) ? IP_DEFAULT_MULTICAST_TTL 1723 : imo->imo_ttl; 1724 return (0); 1725 1726 case IP_MULTICAST_LOOP: 1727 loop = mtod(m, u_char *); 1728 m->m_len = 1; 1729 *loop = (imo == NULL) ? IP_DEFAULT_MULTICAST_LOOP 1730 : imo->imo_loop; 1731 return (0); 1732 1733 default: 1734 return (EOPNOTSUPP); 1735 } 1736 } 1737 1738 /* 1739 * Discard the IP multicast options. 1740 */ 1741 void 1742 ip_freemoptions(struct ip_moptions *imo) 1743 { 1744 int i; 1745 1746 if (imo != NULL) { 1747 for (i = 0; i < imo->imo_num_memberships; ++i) 1748 in_delmulti(imo->imo_membership[i]); 1749 free(imo->imo_membership, M_IPMOPTS, 1750 imo->imo_max_memberships * sizeof(struct in_multi *)); 1751 free(imo, M_IPMOPTS, sizeof(*imo)); 1752 } 1753 } 1754 1755 /* 1756 * Routine called from ip_output() to loop back a copy of an IP multicast 1757 * packet to the input queue of a specified interface. 1758 */ 1759 void 1760 ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst) 1761 { 1762 struct ip *ip; 1763 struct mbuf *copym; 1764 1765 copym = m_dup_pkt(m, max_linkhdr, M_DONTWAIT); 1766 if (copym != NULL) { 1767 /* 1768 * We don't bother to fragment if the IP length is greater 1769 * than the interface's MTU. Can this possibly matter? 1770 */ 1771 ip = mtod(copym, struct ip *); 1772 ip->ip_sum = 0; 1773 ip->ip_sum = in_cksum(copym, ip->ip_hl << 2); 1774 if_input_local(ifp, copym, dst->sin_family); 1775 } 1776 } 1777 1778 /* 1779 * Compute significant parts of the IPv4 checksum pseudo-header 1780 * for use in a delayed TCP/UDP checksum calculation. 1781 */ 1782 static __inline u_int16_t __attribute__((__unused__)) 1783 in_cksum_phdr(u_int32_t src, u_int32_t dst, u_int32_t lenproto) 1784 { 1785 u_int32_t sum; 1786 1787 sum = lenproto + 1788 (u_int16_t)(src >> 16) + 1789 (u_int16_t)(src /*& 0xffff*/) + 1790 (u_int16_t)(dst >> 16) + 1791 (u_int16_t)(dst /*& 0xffff*/); 1792 1793 sum = (u_int16_t)(sum >> 16) + (u_int16_t)(sum /*& 0xffff*/); 1794 1795 if (sum > 0xffff) 1796 sum -= 0xffff; 1797 1798 return (sum); 1799 } 1800 1801 /* 1802 * Process a delayed payload checksum calculation. 1803 */ 1804 void 1805 in_delayed_cksum(struct mbuf *m) 1806 { 1807 struct ip *ip; 1808 u_int16_t csum, offset; 1809 1810 ip = mtod(m, struct ip *); 1811 offset = ip->ip_hl << 2; 1812 csum = in4_cksum(m, 0, offset, m->m_pkthdr.len - offset); 1813 if (csum == 0 && ip->ip_p == IPPROTO_UDP) 1814 csum = 0xffff; 1815 1816 switch (ip->ip_p) { 1817 case IPPROTO_TCP: 1818 offset += offsetof(struct tcphdr, th_sum); 1819 break; 1820 1821 case IPPROTO_UDP: 1822 offset += offsetof(struct udphdr, uh_sum); 1823 break; 1824 1825 case IPPROTO_ICMP: 1826 offset += offsetof(struct icmp, icmp_cksum); 1827 break; 1828 1829 default: 1830 return; 1831 } 1832 1833 if ((offset + sizeof(u_int16_t)) > m->m_len) 1834 m_copyback(m, offset, sizeof(csum), &csum, M_NOWAIT); 1835 else 1836 *(u_int16_t *)(mtod(m, caddr_t) + offset) = csum; 1837 } 1838 1839 void 1840 in_proto_cksum_out(struct mbuf *m, struct ifnet *ifp) 1841 { 1842 struct ip *ip = mtod(m, struct ip *); 1843 1844 /* some hw and in_delayed_cksum need the pseudo header cksum */ 1845 if (m->m_pkthdr.csum_flags & 1846 (M_TCP_CSUM_OUT|M_UDP_CSUM_OUT|M_ICMP_CSUM_OUT)) { 1847 u_int16_t csum = 0, offset; 1848 1849 offset = ip->ip_hl << 2; 1850 if (m->m_pkthdr.csum_flags & (M_TCP_CSUM_OUT|M_UDP_CSUM_OUT)) 1851 csum = in_cksum_phdr(ip->ip_src.s_addr, 1852 ip->ip_dst.s_addr, htonl(ntohs(ip->ip_len) - 1853 offset + ip->ip_p)); 1854 if (ip->ip_p == IPPROTO_TCP) 1855 offset += offsetof(struct tcphdr, th_sum); 1856 else if (ip->ip_p == IPPROTO_UDP) 1857 offset += offsetof(struct udphdr, uh_sum); 1858 else if (ip->ip_p == IPPROTO_ICMP) 1859 offset += offsetof(struct icmp, icmp_cksum); 1860 if ((offset + sizeof(u_int16_t)) > m->m_len) 1861 m_copyback(m, offset, sizeof(csum), &csum, M_NOWAIT); 1862 else 1863 *(u_int16_t *)(mtod(m, caddr_t) + offset) = csum; 1864 } 1865 1866 if (m->m_pkthdr.csum_flags & M_TCP_CSUM_OUT) { 1867 if (!in_ifcap_cksum(m, ifp, IFCAP_CSUM_TCPv4) || 1868 ip->ip_hl != 5) { 1869 tcpstat_inc(tcps_outswcsum); 1870 in_delayed_cksum(m); 1871 m->m_pkthdr.csum_flags &= ~M_TCP_CSUM_OUT; /* Clear */ 1872 } 1873 } else if (m->m_pkthdr.csum_flags & M_UDP_CSUM_OUT) { 1874 if (!in_ifcap_cksum(m, ifp, IFCAP_CSUM_UDPv4) || 1875 ip->ip_hl != 5) { 1876 udpstat_inc(udps_outswcsum); 1877 in_delayed_cksum(m); 1878 m->m_pkthdr.csum_flags &= ~M_UDP_CSUM_OUT; /* Clear */ 1879 } 1880 } else if (m->m_pkthdr.csum_flags & M_ICMP_CSUM_OUT) { 1881 in_delayed_cksum(m); 1882 m->m_pkthdr.csum_flags &= ~M_ICMP_CSUM_OUT; /* Clear */ 1883 } 1884 } 1885 1886 int 1887 in_ifcap_cksum(struct mbuf *m, struct ifnet *ifp, int ifcap) 1888 { 1889 if ((ifp == NULL) || 1890 !ISSET(ifp->if_capabilities, ifcap) || 1891 (ifp->if_bridgeidx != 0)) 1892 return (0); 1893 /* 1894 * Simplex interface sends packet back without hardware cksum. 1895 * Keep this check in sync with the condition where ether_resolve() 1896 * calls if_input_local(). 1897 */ 1898 if (ISSET(m->m_flags, M_BCAST) && 1899 ISSET(ifp->if_flags, IFF_SIMPLEX) && 1900 !m->m_pkthdr.pf.routed) 1901 return (0); 1902 return (1); 1903 } 1904