1 /* $OpenBSD: ip_output.c,v 1.355 2019/06/10 16:32:51 mpi Exp $ */ 2 /* $NetBSD: ip_output.c,v 1.28 1996/02/13 23:43:07 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1990, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 33 */ 34 35 #include "pf.h" 36 37 #include <sys/param.h> 38 #include <sys/systm.h> 39 #include <sys/mbuf.h> 40 #include <sys/protosw.h> 41 #include <sys/socket.h> 42 #include <sys/socketvar.h> 43 #include <sys/proc.h> 44 #include <sys/kernel.h> 45 46 #include <net/if.h> 47 #include <net/if_var.h> 48 #include <net/if_enc.h> 49 #include <net/route.h> 50 51 #include <netinet/in.h> 52 #include <netinet/ip.h> 53 #include <netinet/in_pcb.h> 54 #include <netinet/in_var.h> 55 #include <netinet/ip_var.h> 56 #include <netinet/ip_icmp.h> 57 #include <netinet/tcp.h> 58 #include <netinet/udp.h> 59 #include <netinet/tcp_timer.h> 60 #include <netinet/tcp_var.h> 61 #include <netinet/udp_var.h> 62 63 #if NPF > 0 64 #include <net/pfvar.h> 65 #endif 66 67 #ifdef IPSEC 68 #ifdef ENCDEBUG 69 #define DPRINTF(x) do { if (encdebug) printf x ; } while (0) 70 #else 71 #define DPRINTF(x) 72 #endif 73 #endif /* IPSEC */ 74 75 int ip_pcbopts(struct mbuf **, struct mbuf *); 76 int ip_setmoptions(int, struct ip_moptions **, struct mbuf *, u_int); 77 void ip_mloopback(struct ifnet *, struct mbuf *, struct sockaddr_in *); 78 static __inline u_int16_t __attribute__((__unused__)) 79 in_cksum_phdr(u_int32_t, u_int32_t, u_int32_t); 80 void in_delayed_cksum(struct mbuf *); 81 82 #ifdef IPSEC 83 struct tdb * 84 ip_output_ipsec_lookup(struct mbuf *m, int hlen, int *error, struct inpcb *inp, 85 int ipsecflowinfo); 86 int 87 ip_output_ipsec_send(struct tdb *, struct mbuf *, struct route *, int); 88 #endif /* IPSEC */ 89 90 /* 91 * IP output. The packet in mbuf chain m contains a skeletal IP 92 * header (with len, off, ttl, proto, tos, src, dst). 93 * The mbuf chain containing the packet will be freed. 94 * The mbuf opt, if present, will not be freed. 95 */ 96 int 97 ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro, int flags, 98 struct ip_moptions *imo, struct inpcb *inp, u_int32_t ipsecflowinfo) 99 { 100 struct ip *ip; 101 struct ifnet *ifp = NULL; 102 struct mbuf *m = m0; 103 int hlen = sizeof (struct ip); 104 int len, error = 0; 105 struct route iproute; 106 struct sockaddr_in *dst; 107 struct tdb *tdb = NULL; 108 u_long mtu; 109 #if defined(MROUTING) 110 int rv; 111 #endif 112 113 NET_ASSERT_LOCKED(); 114 115 #ifdef IPSEC 116 if (inp && (inp->inp_flags & INP_IPV6) != 0) 117 panic("ip_output: IPv6 pcb is passed"); 118 #endif /* IPSEC */ 119 120 #ifdef DIAGNOSTIC 121 if ((m->m_flags & M_PKTHDR) == 0) 122 panic("ip_output no HDR"); 123 #endif 124 if (opt) { 125 m = ip_insertoptions(m, opt, &len); 126 hlen = len; 127 } 128 129 ip = mtod(m, struct ip *); 130 131 /* 132 * Fill in IP header. 133 */ 134 if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { 135 ip->ip_v = IPVERSION; 136 ip->ip_off &= htons(IP_DF); 137 ip->ip_id = htons(ip_randomid()); 138 ip->ip_hl = hlen >> 2; 139 ipstat_inc(ips_localout); 140 } else { 141 hlen = ip->ip_hl << 2; 142 } 143 144 /* 145 * We should not send traffic to 0/8 say both Stevens and RFCs 146 * 5735 section 3 and 1122 sections 3.2.1.3 and 3.3.6. 147 */ 148 if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == 0) { 149 error = ENETUNREACH; 150 goto bad; 151 } 152 153 #if NPF > 0 154 reroute: 155 #endif 156 157 /* 158 * Do a route lookup now in case we need the source address to 159 * do an SPD lookup in IPsec; for most packets, the source address 160 * is set at a higher level protocol. ICMPs and other packets 161 * though (e.g., traceroute) have a source address of zeroes. 162 */ 163 if (ro == NULL) { 164 ro = &iproute; 165 memset(ro, 0, sizeof(*ro)); 166 } 167 168 dst = satosin(&ro->ro_dst); 169 170 /* 171 * If there is a cached route, check that it is to the same 172 * destination and is still up. If not, free it and try again. 173 */ 174 if (!rtisvalid(ro->ro_rt) || 175 dst->sin_addr.s_addr != ip->ip_dst.s_addr || 176 ro->ro_tableid != m->m_pkthdr.ph_rtableid) { 177 rtfree(ro->ro_rt); 178 ro->ro_rt = NULL; 179 } 180 181 if (ro->ro_rt == NULL) { 182 dst->sin_family = AF_INET; 183 dst->sin_len = sizeof(*dst); 184 dst->sin_addr = ip->ip_dst; 185 ro->ro_tableid = m->m_pkthdr.ph_rtableid; 186 } 187 188 if ((IN_MULTICAST(ip->ip_dst.s_addr) || 189 (ip->ip_dst.s_addr == INADDR_BROADCAST)) && 190 imo != NULL && (ifp = if_get(imo->imo_ifidx)) != NULL) { 191 192 mtu = ifp->if_mtu; 193 if (ip->ip_src.s_addr == INADDR_ANY) { 194 struct in_ifaddr *ia; 195 196 IFP_TO_IA(ifp, ia); 197 if (ia != NULL) 198 ip->ip_src = ia->ia_addr.sin_addr; 199 } 200 } else { 201 struct in_ifaddr *ia; 202 203 if (ro->ro_rt == NULL) 204 ro->ro_rt = rtalloc_mpath(&ro->ro_dst, 205 &ip->ip_src.s_addr, ro->ro_tableid); 206 207 if (ro->ro_rt == NULL) { 208 ipstat_inc(ips_noroute); 209 error = EHOSTUNREACH; 210 goto bad; 211 } 212 213 ia = ifatoia(ro->ro_rt->rt_ifa); 214 if (ISSET(ro->ro_rt->rt_flags, RTF_LOCAL)) 215 ifp = if_get(rtable_loindex(m->m_pkthdr.ph_rtableid)); 216 else 217 ifp = if_get(ro->ro_rt->rt_ifidx); 218 /* 219 * We aren't using rtisvalid() here because the UP/DOWN state 220 * machine is broken with some Ethernet drivers like em(4). 221 * As a result we might try to use an invalid cached route 222 * entry while an interface is being detached. 223 */ 224 if (ifp == NULL) { 225 ipstat_inc(ips_noroute); 226 error = EHOSTUNREACH; 227 goto bad; 228 } 229 if ((mtu = ro->ro_rt->rt_mtu) == 0) 230 mtu = ifp->if_mtu; 231 232 if (ro->ro_rt->rt_flags & RTF_GATEWAY) 233 dst = satosin(ro->ro_rt->rt_gateway); 234 235 /* Set the source IP address */ 236 if (ip->ip_src.s_addr == INADDR_ANY && ia) 237 ip->ip_src = ia->ia_addr.sin_addr; 238 } 239 240 #ifdef IPSEC 241 if (ipsec_in_use || inp != NULL) { 242 /* Do we have any pending SAs to apply ? */ 243 tdb = ip_output_ipsec_lookup(m, hlen, &error, inp, 244 ipsecflowinfo); 245 if (error != 0) { 246 /* Should silently drop packet */ 247 if (error == -EINVAL) 248 error = 0; 249 m_freem(m); 250 goto done; 251 } 252 if (tdb != NULL) { 253 /* 254 * If it needs TCP/UDP hardware-checksumming, do the 255 * computation now. 256 */ 257 in_proto_cksum_out(m, NULL); 258 } 259 } 260 #endif /* IPSEC */ 261 262 if (IN_MULTICAST(ip->ip_dst.s_addr) || 263 (ip->ip_dst.s_addr == INADDR_BROADCAST)) { 264 265 m->m_flags |= (ip->ip_dst.s_addr == INADDR_BROADCAST) ? 266 M_BCAST : M_MCAST; 267 268 /* 269 * IP destination address is multicast. Make sure "dst" 270 * still points to the address in "ro". (It may have been 271 * changed to point to a gateway address, above.) 272 */ 273 dst = satosin(&ro->ro_dst); 274 275 /* 276 * See if the caller provided any multicast options 277 */ 278 if (imo != NULL) 279 ip->ip_ttl = imo->imo_ttl; 280 else 281 ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; 282 283 /* 284 * if we don't know the outgoing ifp yet, we can't generate 285 * output 286 */ 287 if (!ifp) { 288 ipstat_inc(ips_noroute); 289 error = EHOSTUNREACH; 290 goto bad; 291 } 292 293 /* 294 * Confirm that the outgoing interface supports multicast, 295 * but only if the packet actually is going out on that 296 * interface (i.e., no IPsec is applied). 297 */ 298 if ((((m->m_flags & M_MCAST) && 299 (ifp->if_flags & IFF_MULTICAST) == 0) || 300 ((m->m_flags & M_BCAST) && 301 (ifp->if_flags & IFF_BROADCAST) == 0)) && (tdb == NULL)) { 302 ipstat_inc(ips_noroute); 303 error = ENETUNREACH; 304 goto bad; 305 } 306 307 /* 308 * If source address not specified yet, use address 309 * of outgoing interface. 310 */ 311 if (ip->ip_src.s_addr == INADDR_ANY) { 312 struct in_ifaddr *ia; 313 314 IFP_TO_IA(ifp, ia); 315 if (ia != NULL) 316 ip->ip_src = ia->ia_addr.sin_addr; 317 } 318 319 if ((imo == NULL || imo->imo_loop) && 320 in_hasmulti(&ip->ip_dst, ifp)) { 321 /* 322 * If we belong to the destination multicast group 323 * on the outgoing interface, and the caller did not 324 * forbid loopback, loop back a copy. 325 * Can't defer TCP/UDP checksumming, do the 326 * computation now. 327 */ 328 in_proto_cksum_out(m, NULL); 329 ip_mloopback(ifp, m, dst); 330 } 331 #ifdef MROUTING 332 else { 333 /* 334 * If we are acting as a multicast router, perform 335 * multicast forwarding as if the packet had just 336 * arrived on the interface to which we are about 337 * to send. The multicast forwarding function 338 * recursively calls this function, using the 339 * IP_FORWARDING flag to prevent infinite recursion. 340 * 341 * Multicasts that are looped back by ip_mloopback(), 342 * above, will be forwarded by the ip_input() routine, 343 * if necessary. 344 */ 345 if (ipmforwarding && ip_mrouter[ifp->if_rdomain] && 346 (flags & IP_FORWARDING) == 0) { 347 KERNEL_LOCK(); 348 rv = ip_mforward(m, ifp); 349 KERNEL_UNLOCK(); 350 if (rv != 0) { 351 m_freem(m); 352 goto done; 353 } 354 } 355 } 356 #endif 357 /* 358 * Multicasts with a time-to-live of zero may be looped- 359 * back, above, but must not be transmitted on a network. 360 * Also, multicasts addressed to the loopback interface 361 * are not sent -- the above call to ip_mloopback() will 362 * loop back a copy if this host actually belongs to the 363 * destination group on the loopback interface. 364 */ 365 if (ip->ip_ttl == 0 || (ifp->if_flags & IFF_LOOPBACK) != 0) { 366 m_freem(m); 367 goto done; 368 } 369 370 goto sendit; 371 } 372 373 /* 374 * Look for broadcast address and verify user is allowed to send 375 * such a packet; if the packet is going in an IPsec tunnel, skip 376 * this check. 377 */ 378 if ((tdb == NULL) && ((dst->sin_addr.s_addr == INADDR_BROADCAST) || 379 (ro && ro->ro_rt && ISSET(ro->ro_rt->rt_flags, RTF_BROADCAST)))) { 380 if ((ifp->if_flags & IFF_BROADCAST) == 0) { 381 error = EADDRNOTAVAIL; 382 goto bad; 383 } 384 if ((flags & IP_ALLOWBROADCAST) == 0) { 385 error = EACCES; 386 goto bad; 387 } 388 389 /* Don't allow broadcast messages to be fragmented */ 390 if (ntohs(ip->ip_len) > ifp->if_mtu) { 391 error = EMSGSIZE; 392 goto bad; 393 } 394 m->m_flags |= M_BCAST; 395 } else 396 m->m_flags &= ~M_BCAST; 397 398 sendit: 399 /* 400 * If we're doing Path MTU discovery, we need to set DF unless 401 * the route's MTU is locked. 402 */ 403 if ((flags & IP_MTUDISC) && ro && ro->ro_rt && 404 (ro->ro_rt->rt_locks & RTV_MTU) == 0) 405 ip->ip_off |= htons(IP_DF); 406 407 #ifdef IPSEC 408 /* 409 * Check if the packet needs encapsulation. 410 */ 411 if (tdb != NULL) { 412 /* Callee frees mbuf */ 413 error = ip_output_ipsec_send(tdb, m, ro, 414 (flags & IP_FORWARDING) ? 1 : 0); 415 goto done; 416 } 417 #endif /* IPSEC */ 418 419 /* 420 * Packet filter 421 */ 422 #if NPF > 0 423 if (pf_test(AF_INET, (flags & IP_FORWARDING) ? PF_FWD : PF_OUT, 424 ifp, &m) != PF_PASS) { 425 error = EACCES; 426 m_freem(m); 427 goto done; 428 } 429 if (m == NULL) 430 goto done; 431 ip = mtod(m, struct ip *); 432 hlen = ip->ip_hl << 2; 433 if ((m->m_pkthdr.pf.flags & (PF_TAG_REROUTE | PF_TAG_GENERATED)) == 434 (PF_TAG_REROUTE | PF_TAG_GENERATED)) 435 /* already rerun the route lookup, go on */ 436 m->m_pkthdr.pf.flags &= ~(PF_TAG_GENERATED | PF_TAG_REROUTE); 437 else if (m->m_pkthdr.pf.flags & PF_TAG_REROUTE) { 438 /* tag as generated to skip over pf_test on rerun */ 439 m->m_pkthdr.pf.flags |= PF_TAG_GENERATED; 440 ro = NULL; 441 if_put(ifp); /* drop reference since target changed */ 442 ifp = NULL; 443 goto reroute; 444 } 445 #endif 446 in_proto_cksum_out(m, ifp); 447 448 #ifdef IPSEC 449 if (ipsec_in_use && (flags & IP_FORWARDING) && (ipforwarding == 2) && 450 (m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL) == NULL)) { 451 error = EHOSTUNREACH; 452 m_freem(m); 453 goto done; 454 } 455 #endif 456 457 /* 458 * If small enough for interface, can just send directly. 459 */ 460 if (ntohs(ip->ip_len) <= mtu) { 461 ip->ip_sum = 0; 462 if ((ifp->if_capabilities & IFCAP_CSUM_IPv4) && 463 (ifp->if_bridgeidx == 0)) 464 m->m_pkthdr.csum_flags |= M_IPV4_CSUM_OUT; 465 else { 466 ipstat_inc(ips_outswcsum); 467 ip->ip_sum = in_cksum(m, hlen); 468 } 469 470 error = ifp->if_output(ifp, m, sintosa(dst), ro->ro_rt); 471 goto done; 472 } 473 474 /* 475 * Too large for interface; fragment if possible. 476 * Must be able to put at least 8 bytes per fragment. 477 */ 478 if (ip->ip_off & htons(IP_DF)) { 479 #ifdef IPSEC 480 if (ip_mtudisc) 481 ipsec_adjust_mtu(m, ifp->if_mtu); 482 #endif 483 error = EMSGSIZE; 484 /* 485 * This case can happen if the user changed the MTU 486 * of an interface after enabling IP on it. Because 487 * most netifs don't keep track of routes pointing to 488 * them, there is no way for one to update all its 489 * routes when the MTU is changed. 490 */ 491 if (rtisvalid(ro->ro_rt) && 492 ISSET(ro->ro_rt->rt_flags, RTF_HOST) && 493 !(ro->ro_rt->rt_locks & RTV_MTU) && 494 (ro->ro_rt->rt_mtu > ifp->if_mtu)) { 495 ro->ro_rt->rt_mtu = ifp->if_mtu; 496 } 497 ipstat_inc(ips_cantfrag); 498 goto bad; 499 } 500 501 error = ip_fragment(m, ifp, mtu); 502 if (error) { 503 m = m0 = NULL; 504 goto bad; 505 } 506 507 for (; m; m = m0) { 508 m0 = m->m_nextpkt; 509 m->m_nextpkt = 0; 510 if (error == 0) 511 error = ifp->if_output(ifp, m, sintosa(dst), ro->ro_rt); 512 else 513 m_freem(m); 514 } 515 516 if (error == 0) 517 ipstat_inc(ips_fragmented); 518 519 done: 520 if (ro == &iproute && ro->ro_rt) 521 rtfree(ro->ro_rt); 522 if_put(ifp); 523 return (error); 524 bad: 525 m_freem(m0); 526 goto done; 527 } 528 529 #ifdef IPSEC 530 struct tdb * 531 ip_output_ipsec_lookup(struct mbuf *m, int hlen, int *error, struct inpcb *inp, 532 int ipsecflowinfo) 533 { 534 struct m_tag *mtag; 535 struct tdb_ident *tdbi; 536 struct tdb *tdb; 537 538 /* Do we have any pending SAs to apply ? */ 539 tdb = ipsp_spd_lookup(m, AF_INET, hlen, error, IPSP_DIRECTION_OUT, 540 NULL, inp, ipsecflowinfo); 541 if (tdb == NULL) 542 return NULL; 543 /* Loop detection */ 544 for (mtag = m_tag_first(m); mtag != NULL; mtag = m_tag_next(m, mtag)) { 545 if (mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_DONE) 546 continue; 547 tdbi = (struct tdb_ident *)(mtag + 1); 548 if (tdbi->spi == tdb->tdb_spi && 549 tdbi->proto == tdb->tdb_sproto && 550 tdbi->rdomain == tdb->tdb_rdomain && 551 !memcmp(&tdbi->dst, &tdb->tdb_dst, 552 sizeof(union sockaddr_union))) { 553 /* no IPsec needed */ 554 return NULL; 555 } 556 } 557 return tdb; 558 } 559 560 int 561 ip_output_ipsec_send(struct tdb *tdb, struct mbuf *m, struct route *ro, int fwd) 562 { 563 #if NPF > 0 564 struct ifnet *encif; 565 #endif 566 struct ip *ip; 567 int error; 568 569 #if NPF > 0 570 /* 571 * Packet filter 572 */ 573 if ((encif = enc_getif(tdb->tdb_rdomain, tdb->tdb_tap)) == NULL || 574 pf_test(AF_INET, fwd ? PF_FWD : PF_OUT, encif, &m) != PF_PASS) { 575 m_freem(m); 576 return EACCES; 577 } 578 if (m == NULL) 579 return 0; 580 /* 581 * PF_TAG_REROUTE handling or not... 582 * Packet is entering IPsec so the routing is 583 * already overruled by the IPsec policy. 584 * Until now the change was not reconsidered. 585 * What's the behaviour? 586 */ 587 in_proto_cksum_out(m, encif); 588 #endif 589 590 /* Check if we are allowed to fragment */ 591 ip = mtod(m, struct ip *); 592 if (ip_mtudisc && (ip->ip_off & htons(IP_DF)) && tdb->tdb_mtu && 593 ntohs(ip->ip_len) > tdb->tdb_mtu && 594 tdb->tdb_mtutimeout > time_second) { 595 struct rtentry *rt = NULL; 596 int rt_mtucloned = 0; 597 int transportmode = 0; 598 599 transportmode = (tdb->tdb_dst.sa.sa_family == AF_INET) && 600 (tdb->tdb_dst.sin.sin_addr.s_addr == ip->ip_dst.s_addr); 601 602 /* Find a host route to store the mtu in */ 603 if (ro != NULL) 604 rt = ro->ro_rt; 605 /* but don't add a PMTU route for transport mode SAs */ 606 if (transportmode) 607 rt = NULL; 608 else if (rt == NULL || (rt->rt_flags & RTF_HOST) == 0) { 609 rt = icmp_mtudisc_clone(ip->ip_dst, 610 m->m_pkthdr.ph_rtableid); 611 rt_mtucloned = 1; 612 } 613 DPRINTF(("%s: spi %08x mtu %d rt %p cloned %d\n", __func__, 614 ntohl(tdb->tdb_spi), tdb->tdb_mtu, rt, rt_mtucloned)); 615 if (rt != NULL) { 616 rt->rt_mtu = tdb->tdb_mtu; 617 if (ro && ro->ro_rt != NULL) { 618 rtfree(ro->ro_rt); 619 ro->ro_rt = rtalloc(&ro->ro_dst, RT_RESOLVE, 620 m->m_pkthdr.ph_rtableid); 621 } 622 if (rt_mtucloned) 623 rtfree(rt); 624 } 625 ipsec_adjust_mtu(m, tdb->tdb_mtu); 626 m_freem(m); 627 return EMSGSIZE; 628 } 629 630 /* 631 * Clear these -- they'll be set in the recursive invocation 632 * as needed. 633 */ 634 m->m_flags &= ~(M_MCAST | M_BCAST); 635 636 /* Callee frees mbuf */ 637 error = ipsp_process_packet(m, tdb, AF_INET, 0); 638 if (error) { 639 ipsecstat_inc(ipsec_odrops); 640 tdb->tdb_odrops++; 641 } 642 return error; 643 } 644 #endif /* IPSEC */ 645 646 int 647 ip_fragment(struct mbuf *m, struct ifnet *ifp, u_long mtu) 648 { 649 struct ip *ip, *mhip; 650 struct mbuf *m0; 651 int len, hlen, off; 652 int mhlen, firstlen; 653 struct mbuf **mnext; 654 int fragments = 0; 655 int error = 0; 656 657 ip = mtod(m, struct ip *); 658 hlen = ip->ip_hl << 2; 659 660 len = (mtu - hlen) &~ 7; 661 if (len < 8) { 662 m_freem(m); 663 return (EMSGSIZE); 664 } 665 666 /* 667 * If we are doing fragmentation, we can't defer TCP/UDP 668 * checksumming; compute the checksum and clear the flag. 669 */ 670 in_proto_cksum_out(m, NULL); 671 firstlen = len; 672 mnext = &m->m_nextpkt; 673 674 /* 675 * Loop through length of segment after first fragment, 676 * make new header and copy data of each part and link onto chain. 677 */ 678 m0 = m; 679 mhlen = sizeof (struct ip); 680 for (off = hlen + len; off < ntohs(ip->ip_len); off += len) { 681 MGETHDR(m, M_DONTWAIT, MT_HEADER); 682 if (m == NULL) { 683 ipstat_inc(ips_odropped); 684 error = ENOBUFS; 685 goto sendorfree; 686 } 687 *mnext = m; 688 mnext = &m->m_nextpkt; 689 m->m_data += max_linkhdr; 690 mhip = mtod(m, struct ip *); 691 *mhip = *ip; 692 /* we must inherit MCAST/BCAST flags, routing table and prio */ 693 m->m_flags |= m0->m_flags & (M_MCAST|M_BCAST); 694 m->m_pkthdr.ph_rtableid = m0->m_pkthdr.ph_rtableid; 695 m->m_pkthdr.pf.prio = m0->m_pkthdr.pf.prio; 696 if (hlen > sizeof (struct ip)) { 697 mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); 698 mhip->ip_hl = mhlen >> 2; 699 } 700 m->m_len = mhlen; 701 mhip->ip_off = ((off - hlen) >> 3) + 702 (ntohs(ip->ip_off) & ~IP_MF); 703 if (ip->ip_off & htons(IP_MF)) 704 mhip->ip_off |= IP_MF; 705 if (off + len >= ntohs(ip->ip_len)) 706 len = ntohs(ip->ip_len) - off; 707 else 708 mhip->ip_off |= IP_MF; 709 mhip->ip_len = htons((u_int16_t)(len + mhlen)); 710 m->m_next = m_copym(m0, off, len, M_NOWAIT); 711 if (m->m_next == 0) { 712 ipstat_inc(ips_odropped); 713 error = ENOBUFS; 714 goto sendorfree; 715 } 716 m->m_pkthdr.len = mhlen + len; 717 m->m_pkthdr.ph_ifidx = 0; 718 mhip->ip_off = htons((u_int16_t)mhip->ip_off); 719 mhip->ip_sum = 0; 720 if ((ifp != NULL) && 721 (ifp->if_capabilities & IFCAP_CSUM_IPv4) && 722 (ifp->if_bridgeidx == 0)) 723 m->m_pkthdr.csum_flags |= M_IPV4_CSUM_OUT; 724 else { 725 ipstat_inc(ips_outswcsum); 726 mhip->ip_sum = in_cksum(m, mhlen); 727 } 728 ipstat_inc(ips_ofragments); 729 fragments++; 730 } 731 /* 732 * Update first fragment by trimming what's been copied out 733 * and updating header, then send each fragment (in order). 734 */ 735 m = m0; 736 m_adj(m, hlen + firstlen - ntohs(ip->ip_len)); 737 m->m_pkthdr.len = hlen + firstlen; 738 ip->ip_len = htons((u_int16_t)m->m_pkthdr.len); 739 ip->ip_off |= htons(IP_MF); 740 ip->ip_sum = 0; 741 if ((ifp != NULL) && 742 (ifp->if_capabilities & IFCAP_CSUM_IPv4) && 743 (ifp->if_bridgeidx == 0)) 744 m->m_pkthdr.csum_flags |= M_IPV4_CSUM_OUT; 745 else { 746 ipstat_inc(ips_outswcsum); 747 ip->ip_sum = in_cksum(m, hlen); 748 } 749 sendorfree: 750 if (error) { 751 for (m = m0; m; m = m0) { 752 m0 = m->m_nextpkt; 753 m->m_nextpkt = NULL; 754 m_freem(m); 755 } 756 } 757 758 return (error); 759 } 760 761 /* 762 * Insert IP options into preformed packet. 763 * Adjust IP destination as required for IP source routing, 764 * as indicated by a non-zero in_addr at the start of the options. 765 */ 766 struct mbuf * 767 ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen) 768 { 769 struct ipoption *p = mtod(opt, struct ipoption *); 770 struct mbuf *n; 771 struct ip *ip = mtod(m, struct ip *); 772 unsigned int optlen; 773 774 optlen = opt->m_len - sizeof(p->ipopt_dst); 775 if (optlen + ntohs(ip->ip_len) > IP_MAXPACKET) 776 return (m); /* XXX should fail */ 777 if (p->ipopt_dst.s_addr) 778 ip->ip_dst = p->ipopt_dst; 779 if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) { 780 MGETHDR(n, M_DONTWAIT, MT_HEADER); 781 if (n == NULL) 782 return (m); 783 M_MOVE_HDR(n, m); 784 n->m_pkthdr.len += optlen; 785 m->m_len -= sizeof(struct ip); 786 m->m_data += sizeof(struct ip); 787 n->m_next = m; 788 m = n; 789 m->m_len = optlen + sizeof(struct ip); 790 m->m_data += max_linkhdr; 791 memcpy(mtod(m, caddr_t), ip, sizeof(struct ip)); 792 } else { 793 m->m_data -= optlen; 794 m->m_len += optlen; 795 m->m_pkthdr.len += optlen; 796 memmove(mtod(m, caddr_t), (caddr_t)ip, sizeof(struct ip)); 797 } 798 ip = mtod(m, struct ip *); 799 memcpy(ip + 1, p->ipopt_list, optlen); 800 *phlen = sizeof(struct ip) + optlen; 801 ip->ip_len = htons(ntohs(ip->ip_len) + optlen); 802 return (m); 803 } 804 805 /* 806 * Copy options from ip to jp, 807 * omitting those not copied during fragmentation. 808 */ 809 int 810 ip_optcopy(struct ip *ip, struct ip *jp) 811 { 812 u_char *cp, *dp; 813 int opt, optlen, cnt; 814 815 cp = (u_char *)(ip + 1); 816 dp = (u_char *)(jp + 1); 817 cnt = (ip->ip_hl << 2) - sizeof (struct ip); 818 for (; cnt > 0; cnt -= optlen, cp += optlen) { 819 opt = cp[0]; 820 if (opt == IPOPT_EOL) 821 break; 822 if (opt == IPOPT_NOP) { 823 /* Preserve for IP mcast tunnel's LSRR alignment. */ 824 *dp++ = IPOPT_NOP; 825 optlen = 1; 826 continue; 827 } 828 #ifdef DIAGNOSTIC 829 if (cnt < IPOPT_OLEN + sizeof(*cp)) 830 panic("malformed IPv4 option passed to ip_optcopy"); 831 #endif 832 optlen = cp[IPOPT_OLEN]; 833 #ifdef DIAGNOSTIC 834 if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) 835 panic("malformed IPv4 option passed to ip_optcopy"); 836 #endif 837 /* bogus lengths should have been caught by ip_dooptions */ 838 if (optlen > cnt) 839 optlen = cnt; 840 if (IPOPT_COPIED(opt)) { 841 memcpy(dp, cp, optlen); 842 dp += optlen; 843 } 844 } 845 for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++) 846 *dp++ = IPOPT_EOL; 847 return (optlen); 848 } 849 850 /* 851 * IP socket option processing. 852 */ 853 int 854 ip_ctloutput(int op, struct socket *so, int level, int optname, 855 struct mbuf *m) 856 { 857 struct inpcb *inp = sotoinpcb(so); 858 int optval = 0; 859 struct proc *p = curproc; /* XXX */ 860 int error = 0; 861 u_int rtid = 0; 862 863 if (level != IPPROTO_IP) 864 return (EINVAL); 865 866 switch (op) { 867 case PRCO_SETOPT: 868 switch (optname) { 869 case IP_OPTIONS: 870 return (ip_pcbopts(&inp->inp_options, m)); 871 872 case IP_TOS: 873 case IP_TTL: 874 case IP_MINTTL: 875 case IP_RECVOPTS: 876 case IP_RECVRETOPTS: 877 case IP_RECVDSTADDR: 878 case IP_RECVIF: 879 case IP_RECVTTL: 880 case IP_RECVDSTPORT: 881 case IP_RECVRTABLE: 882 case IP_IPSECFLOWINFO: 883 if (m == NULL || m->m_len != sizeof(int)) 884 error = EINVAL; 885 else { 886 optval = *mtod(m, int *); 887 switch (optname) { 888 889 case IP_TOS: 890 inp->inp_ip.ip_tos = optval; 891 break; 892 893 case IP_TTL: 894 if (optval > 0 && optval <= MAXTTL) 895 inp->inp_ip.ip_ttl = optval; 896 else if (optval == -1) 897 inp->inp_ip.ip_ttl = ip_defttl; 898 else 899 error = EINVAL; 900 break; 901 902 case IP_MINTTL: 903 if (optval >= 0 && optval <= MAXTTL) 904 inp->inp_ip_minttl = optval; 905 else 906 error = EINVAL; 907 break; 908 #define OPTSET(bit) \ 909 if (optval) \ 910 inp->inp_flags |= bit; \ 911 else \ 912 inp->inp_flags &= ~bit; 913 914 case IP_RECVOPTS: 915 OPTSET(INP_RECVOPTS); 916 break; 917 918 case IP_RECVRETOPTS: 919 OPTSET(INP_RECVRETOPTS); 920 break; 921 922 case IP_RECVDSTADDR: 923 OPTSET(INP_RECVDSTADDR); 924 break; 925 case IP_RECVIF: 926 OPTSET(INP_RECVIF); 927 break; 928 case IP_RECVTTL: 929 OPTSET(INP_RECVTTL); 930 break; 931 case IP_RECVDSTPORT: 932 OPTSET(INP_RECVDSTPORT); 933 break; 934 case IP_RECVRTABLE: 935 OPTSET(INP_RECVRTABLE); 936 break; 937 case IP_IPSECFLOWINFO: 938 OPTSET(INP_IPSECFLOWINFO); 939 break; 940 } 941 } 942 break; 943 #undef OPTSET 944 945 case IP_MULTICAST_IF: 946 case IP_MULTICAST_TTL: 947 case IP_MULTICAST_LOOP: 948 case IP_ADD_MEMBERSHIP: 949 case IP_DROP_MEMBERSHIP: 950 error = ip_setmoptions(optname, &inp->inp_moptions, m, 951 inp->inp_rtableid); 952 break; 953 954 case IP_PORTRANGE: 955 if (m == NULL || m->m_len != sizeof(int)) 956 error = EINVAL; 957 else { 958 optval = *mtod(m, int *); 959 960 switch (optval) { 961 962 case IP_PORTRANGE_DEFAULT: 963 inp->inp_flags &= ~(INP_LOWPORT); 964 inp->inp_flags &= ~(INP_HIGHPORT); 965 break; 966 967 case IP_PORTRANGE_HIGH: 968 inp->inp_flags &= ~(INP_LOWPORT); 969 inp->inp_flags |= INP_HIGHPORT; 970 break; 971 972 case IP_PORTRANGE_LOW: 973 inp->inp_flags &= ~(INP_HIGHPORT); 974 inp->inp_flags |= INP_LOWPORT; 975 break; 976 977 default: 978 979 error = EINVAL; 980 break; 981 } 982 } 983 break; 984 case IP_AUTH_LEVEL: 985 case IP_ESP_TRANS_LEVEL: 986 case IP_ESP_NETWORK_LEVEL: 987 case IP_IPCOMP_LEVEL: 988 #ifndef IPSEC 989 error = EOPNOTSUPP; 990 #else 991 if (m == NULL || m->m_len != sizeof(int)) { 992 error = EINVAL; 993 break; 994 } 995 optval = *mtod(m, int *); 996 997 if (optval < IPSEC_LEVEL_BYPASS || 998 optval > IPSEC_LEVEL_UNIQUE) { 999 error = EINVAL; 1000 break; 1001 } 1002 1003 switch (optname) { 1004 case IP_AUTH_LEVEL: 1005 if (optval < IPSEC_AUTH_LEVEL_DEFAULT && 1006 suser(p)) { 1007 error = EACCES; 1008 break; 1009 } 1010 inp->inp_seclevel[SL_AUTH] = optval; 1011 break; 1012 1013 case IP_ESP_TRANS_LEVEL: 1014 if (optval < IPSEC_ESP_TRANS_LEVEL_DEFAULT && 1015 suser(p)) { 1016 error = EACCES; 1017 break; 1018 } 1019 inp->inp_seclevel[SL_ESP_TRANS] = optval; 1020 break; 1021 1022 case IP_ESP_NETWORK_LEVEL: 1023 if (optval < IPSEC_ESP_NETWORK_LEVEL_DEFAULT && 1024 suser(p)) { 1025 error = EACCES; 1026 break; 1027 } 1028 inp->inp_seclevel[SL_ESP_NETWORK] = optval; 1029 break; 1030 case IP_IPCOMP_LEVEL: 1031 if (optval < IPSEC_IPCOMP_LEVEL_DEFAULT && 1032 suser(p)) { 1033 error = EACCES; 1034 break; 1035 } 1036 inp->inp_seclevel[SL_IPCOMP] = optval; 1037 break; 1038 } 1039 #endif 1040 break; 1041 1042 case IP_IPSEC_LOCAL_ID: 1043 case IP_IPSEC_REMOTE_ID: 1044 error = EOPNOTSUPP; 1045 break; 1046 case SO_RTABLE: 1047 if (m == NULL || m->m_len < sizeof(u_int)) { 1048 error = EINVAL; 1049 break; 1050 } 1051 rtid = *mtod(m, u_int *); 1052 if (inp->inp_rtableid == rtid) 1053 break; 1054 /* needs privileges to switch when already set */ 1055 if (p->p_p->ps_rtableid != rtid && 1056 p->p_p->ps_rtableid != 0 && 1057 (error = suser(p)) != 0) 1058 break; 1059 /* table must exist */ 1060 if (!rtable_exists(rtid)) { 1061 error = EINVAL; 1062 break; 1063 } 1064 if (inp->inp_lport) { 1065 error = EBUSY; 1066 break; 1067 } 1068 inp->inp_rtableid = rtid; 1069 in_pcbrehash(inp); 1070 break; 1071 case IP_PIPEX: 1072 if (m != NULL && m->m_len == sizeof(int)) 1073 inp->inp_pipex = *mtod(m, int *); 1074 else 1075 error = EINVAL; 1076 break; 1077 1078 default: 1079 error = ENOPROTOOPT; 1080 break; 1081 } 1082 break; 1083 1084 case PRCO_GETOPT: 1085 switch (optname) { 1086 case IP_OPTIONS: 1087 case IP_RETOPTS: 1088 if (inp->inp_options) { 1089 m->m_len = inp->inp_options->m_len; 1090 memcpy(mtod(m, caddr_t), 1091 mtod(inp->inp_options, caddr_t), m->m_len); 1092 } else 1093 m->m_len = 0; 1094 break; 1095 1096 case IP_TOS: 1097 case IP_TTL: 1098 case IP_MINTTL: 1099 case IP_RECVOPTS: 1100 case IP_RECVRETOPTS: 1101 case IP_RECVDSTADDR: 1102 case IP_RECVIF: 1103 case IP_RECVTTL: 1104 case IP_RECVDSTPORT: 1105 case IP_RECVRTABLE: 1106 case IP_IPSECFLOWINFO: 1107 case IP_IPDEFTTL: 1108 m->m_len = sizeof(int); 1109 switch (optname) { 1110 1111 case IP_TOS: 1112 optval = inp->inp_ip.ip_tos; 1113 break; 1114 1115 case IP_TTL: 1116 optval = inp->inp_ip.ip_ttl; 1117 break; 1118 1119 case IP_MINTTL: 1120 optval = inp->inp_ip_minttl; 1121 break; 1122 1123 case IP_IPDEFTTL: 1124 optval = ip_defttl; 1125 break; 1126 1127 #define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) 1128 1129 case IP_RECVOPTS: 1130 optval = OPTBIT(INP_RECVOPTS); 1131 break; 1132 1133 case IP_RECVRETOPTS: 1134 optval = OPTBIT(INP_RECVRETOPTS); 1135 break; 1136 1137 case IP_RECVDSTADDR: 1138 optval = OPTBIT(INP_RECVDSTADDR); 1139 break; 1140 case IP_RECVIF: 1141 optval = OPTBIT(INP_RECVIF); 1142 break; 1143 case IP_RECVTTL: 1144 optval = OPTBIT(INP_RECVTTL); 1145 break; 1146 case IP_RECVDSTPORT: 1147 optval = OPTBIT(INP_RECVDSTPORT); 1148 break; 1149 case IP_RECVRTABLE: 1150 optval = OPTBIT(INP_RECVRTABLE); 1151 break; 1152 case IP_IPSECFLOWINFO: 1153 optval = OPTBIT(INP_IPSECFLOWINFO); 1154 break; 1155 } 1156 *mtod(m, int *) = optval; 1157 break; 1158 1159 case IP_MULTICAST_IF: 1160 case IP_MULTICAST_TTL: 1161 case IP_MULTICAST_LOOP: 1162 case IP_ADD_MEMBERSHIP: 1163 case IP_DROP_MEMBERSHIP: 1164 error = ip_getmoptions(optname, inp->inp_moptions, m); 1165 break; 1166 1167 case IP_PORTRANGE: 1168 m->m_len = sizeof(int); 1169 1170 if (inp->inp_flags & INP_HIGHPORT) 1171 optval = IP_PORTRANGE_HIGH; 1172 else if (inp->inp_flags & INP_LOWPORT) 1173 optval = IP_PORTRANGE_LOW; 1174 else 1175 optval = 0; 1176 1177 *mtod(m, int *) = optval; 1178 break; 1179 1180 case IP_AUTH_LEVEL: 1181 case IP_ESP_TRANS_LEVEL: 1182 case IP_ESP_NETWORK_LEVEL: 1183 case IP_IPCOMP_LEVEL: 1184 #ifndef IPSEC 1185 m->m_len = sizeof(int); 1186 *mtod(m, int *) = IPSEC_LEVEL_NONE; 1187 #else 1188 m->m_len = sizeof(int); 1189 switch (optname) { 1190 case IP_AUTH_LEVEL: 1191 optval = inp->inp_seclevel[SL_AUTH]; 1192 break; 1193 1194 case IP_ESP_TRANS_LEVEL: 1195 optval = inp->inp_seclevel[SL_ESP_TRANS]; 1196 break; 1197 1198 case IP_ESP_NETWORK_LEVEL: 1199 optval = inp->inp_seclevel[SL_ESP_NETWORK]; 1200 break; 1201 case IP_IPCOMP_LEVEL: 1202 optval = inp->inp_seclevel[SL_IPCOMP]; 1203 break; 1204 } 1205 *mtod(m, int *) = optval; 1206 #endif 1207 break; 1208 case IP_IPSEC_LOCAL_ID: 1209 case IP_IPSEC_REMOTE_ID: 1210 error = EOPNOTSUPP; 1211 break; 1212 case SO_RTABLE: 1213 m->m_len = sizeof(u_int); 1214 *mtod(m, u_int *) = inp->inp_rtableid; 1215 break; 1216 case IP_PIPEX: 1217 m->m_len = sizeof(int); 1218 *mtod(m, int *) = inp->inp_pipex; 1219 break; 1220 default: 1221 error = ENOPROTOOPT; 1222 break; 1223 } 1224 break; 1225 } 1226 return (error); 1227 } 1228 1229 /* 1230 * Set up IP options in pcb for insertion in output packets. 1231 * Store in mbuf with pointer in pcbopt, adding pseudo-option 1232 * with destination address if source routed. 1233 */ 1234 int 1235 ip_pcbopts(struct mbuf **pcbopt, struct mbuf *m) 1236 { 1237 struct mbuf *n; 1238 struct ipoption *p; 1239 int cnt, off, optlen; 1240 u_char *cp; 1241 u_char opt; 1242 1243 /* turn off any old options */ 1244 m_freem(*pcbopt); 1245 *pcbopt = NULL; 1246 if (m == NULL || m->m_len == 0) { 1247 /* 1248 * Only turning off any previous options. 1249 */ 1250 return (0); 1251 } 1252 1253 if (m->m_len % sizeof(int32_t) || 1254 m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr)) 1255 return (EINVAL); 1256 1257 /* Don't sleep because NET_LOCK() is hold. */ 1258 if ((n = m_get(M_NOWAIT, MT_SOOPTS)) == NULL) 1259 return (ENOBUFS); 1260 p = mtod(n, struct ipoption *); 1261 memset(p, 0, sizeof (*p)); /* 0 = IPOPT_EOL, needed for padding */ 1262 n->m_len = sizeof(struct in_addr); 1263 1264 off = 0; 1265 cnt = m->m_len; 1266 cp = mtod(m, u_char *); 1267 1268 while (cnt > 0) { 1269 opt = cp[IPOPT_OPTVAL]; 1270 1271 if (opt == IPOPT_NOP || opt == IPOPT_EOL) { 1272 optlen = 1; 1273 } else { 1274 if (cnt < IPOPT_OLEN + sizeof(*cp)) 1275 goto bad; 1276 optlen = cp[IPOPT_OLEN]; 1277 if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) 1278 goto bad; 1279 } 1280 switch (opt) { 1281 default: 1282 memcpy(p->ipopt_list + off, cp, optlen); 1283 break; 1284 1285 case IPOPT_LSRR: 1286 case IPOPT_SSRR: 1287 /* 1288 * user process specifies route as: 1289 * ->A->B->C->D 1290 * D must be our final destination (but we can't 1291 * check that since we may not have connected yet). 1292 * A is first hop destination, which doesn't appear in 1293 * actual IP option, but is stored before the options. 1294 */ 1295 if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr)) 1296 goto bad; 1297 1298 /* 1299 * Optlen is smaller because first address is popped. 1300 * Cnt and cp will be adjusted a bit later to reflect 1301 * this. 1302 */ 1303 optlen -= sizeof(struct in_addr); 1304 p->ipopt_list[off + IPOPT_OPTVAL] = opt; 1305 p->ipopt_list[off + IPOPT_OLEN] = optlen; 1306 1307 /* 1308 * Move first hop before start of options. 1309 */ 1310 memcpy(&p->ipopt_dst, cp + IPOPT_OFFSET, 1311 sizeof(struct in_addr)); 1312 cp += sizeof(struct in_addr); 1313 cnt -= sizeof(struct in_addr); 1314 /* 1315 * Then copy rest of options 1316 */ 1317 memcpy(p->ipopt_list + off + IPOPT_OFFSET, 1318 cp + IPOPT_OFFSET, optlen - IPOPT_OFFSET); 1319 break; 1320 } 1321 off += optlen; 1322 cp += optlen; 1323 cnt -= optlen; 1324 1325 if (opt == IPOPT_EOL) 1326 break; 1327 } 1328 /* pad options to next word, since p was zeroed just adjust off */ 1329 off = (off + sizeof(int32_t) - 1) & ~(sizeof(int32_t) - 1); 1330 n->m_len += off; 1331 if (n->m_len > sizeof(*p)) { 1332 bad: 1333 m_freem(n); 1334 return (EINVAL); 1335 } 1336 1337 *pcbopt = n; 1338 return (0); 1339 } 1340 1341 /* 1342 * Set the IP multicast options in response to user setsockopt(). 1343 */ 1344 int 1345 ip_setmoptions(int optname, struct ip_moptions **imop, struct mbuf *m, 1346 u_int rtableid) 1347 { 1348 struct in_addr addr; 1349 struct in_ifaddr *ia; 1350 struct ip_mreq *mreq; 1351 struct ifnet *ifp = NULL; 1352 struct ip_moptions *imo = *imop; 1353 struct in_multi **immp; 1354 struct rtentry *rt; 1355 struct sockaddr_in sin; 1356 int i, error = 0; 1357 u_char loop; 1358 1359 if (imo == NULL) { 1360 /* 1361 * No multicast option buffer attached to the pcb; 1362 * allocate one and initialize to default values. 1363 */ 1364 imo = malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK|M_ZERO); 1365 immp = mallocarray(IP_MIN_MEMBERSHIPS, sizeof(*immp), M_IPMOPTS, 1366 M_WAITOK|M_ZERO); 1367 *imop = imo; 1368 imo->imo_ifidx = 0; 1369 imo->imo_ttl = IP_DEFAULT_MULTICAST_TTL; 1370 imo->imo_loop = IP_DEFAULT_MULTICAST_LOOP; 1371 imo->imo_num_memberships = 0; 1372 imo->imo_max_memberships = IP_MIN_MEMBERSHIPS; 1373 imo->imo_membership = immp; 1374 } 1375 1376 switch (optname) { 1377 1378 case IP_MULTICAST_IF: 1379 /* 1380 * Select the interface for outgoing multicast packets. 1381 */ 1382 if (m == NULL || m->m_len != sizeof(struct in_addr)) { 1383 error = EINVAL; 1384 break; 1385 } 1386 addr = *(mtod(m, struct in_addr *)); 1387 /* 1388 * INADDR_ANY is used to remove a previous selection. 1389 * When no interface is selected, a default one is 1390 * chosen every time a multicast packet is sent. 1391 */ 1392 if (addr.s_addr == INADDR_ANY) { 1393 imo->imo_ifidx = 0; 1394 break; 1395 } 1396 /* 1397 * The selected interface is identified by its local 1398 * IP address. Find the interface and confirm that 1399 * it supports multicasting. 1400 */ 1401 memset(&sin, 0, sizeof(sin)); 1402 sin.sin_len = sizeof(sin); 1403 sin.sin_family = AF_INET; 1404 sin.sin_addr = addr; 1405 ia = ifatoia(ifa_ifwithaddr(sintosa(&sin), rtableid)); 1406 if (ia == NULL || 1407 (ia->ia_ifp->if_flags & IFF_MULTICAST) == 0) { 1408 error = EADDRNOTAVAIL; 1409 break; 1410 } 1411 imo->imo_ifidx = ia->ia_ifp->if_index; 1412 break; 1413 1414 case IP_MULTICAST_TTL: 1415 /* 1416 * Set the IP time-to-live for outgoing multicast packets. 1417 */ 1418 if (m == NULL || m->m_len != 1) { 1419 error = EINVAL; 1420 break; 1421 } 1422 imo->imo_ttl = *(mtod(m, u_char *)); 1423 break; 1424 1425 case IP_MULTICAST_LOOP: 1426 /* 1427 * Set the loopback flag for outgoing multicast packets. 1428 * Must be zero or one. 1429 */ 1430 if (m == NULL || m->m_len != 1 || 1431 (loop = *(mtod(m, u_char *))) > 1) { 1432 error = EINVAL; 1433 break; 1434 } 1435 imo->imo_loop = loop; 1436 break; 1437 1438 case IP_ADD_MEMBERSHIP: 1439 /* 1440 * Add a multicast group membership. 1441 * Group must be a valid IP multicast address. 1442 */ 1443 if (m == NULL || m->m_len != sizeof(struct ip_mreq)) { 1444 error = EINVAL; 1445 break; 1446 } 1447 mreq = mtod(m, struct ip_mreq *); 1448 if (!IN_MULTICAST(mreq->imr_multiaddr.s_addr)) { 1449 error = EINVAL; 1450 break; 1451 } 1452 /* 1453 * If no interface address was provided, use the interface of 1454 * the route to the given multicast address. 1455 */ 1456 if (mreq->imr_interface.s_addr == INADDR_ANY) { 1457 memset(&sin, 0, sizeof(sin)); 1458 sin.sin_len = sizeof(sin); 1459 sin.sin_family = AF_INET; 1460 sin.sin_addr = mreq->imr_multiaddr; 1461 rt = rtalloc(sintosa(&sin), RT_RESOLVE, rtableid); 1462 if (!rtisvalid(rt)) { 1463 rtfree(rt); 1464 error = EADDRNOTAVAIL; 1465 break; 1466 } 1467 } else { 1468 memset(&sin, 0, sizeof(sin)); 1469 sin.sin_len = sizeof(sin); 1470 sin.sin_family = AF_INET; 1471 sin.sin_addr = mreq->imr_interface; 1472 rt = rtalloc(sintosa(&sin), 0, rtableid); 1473 if (!rtisvalid(rt) || !ISSET(rt->rt_flags, RTF_LOCAL)) { 1474 rtfree(rt); 1475 error = EADDRNOTAVAIL; 1476 break; 1477 } 1478 } 1479 ifp = if_get(rt->rt_ifidx); 1480 rtfree(rt); 1481 1482 /* 1483 * See if we found an interface, and confirm that it 1484 * supports multicast. 1485 */ 1486 if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { 1487 error = EADDRNOTAVAIL; 1488 if_put(ifp); 1489 break; 1490 } 1491 /* 1492 * See if the membership already exists or if all the 1493 * membership slots are full. 1494 */ 1495 for (i = 0; i < imo->imo_num_memberships; ++i) { 1496 if (imo->imo_membership[i]->inm_ifidx 1497 == ifp->if_index && 1498 imo->imo_membership[i]->inm_addr.s_addr 1499 == mreq->imr_multiaddr.s_addr) 1500 break; 1501 } 1502 if (i < imo->imo_num_memberships) { 1503 error = EADDRINUSE; 1504 if_put(ifp); 1505 break; 1506 } 1507 if (imo->imo_num_memberships == imo->imo_max_memberships) { 1508 struct in_multi **nmships, **omships; 1509 size_t newmax; 1510 /* 1511 * Resize the vector to next power-of-two minus 1. If the 1512 * size would exceed the maximum then we know we've really 1513 * run out of entries. Otherwise, we reallocate the vector. 1514 */ 1515 nmships = NULL; 1516 omships = imo->imo_membership; 1517 newmax = ((imo->imo_max_memberships + 1) * 2) - 1; 1518 if (newmax <= IP_MAX_MEMBERSHIPS) { 1519 nmships = mallocarray(newmax, sizeof(*nmships), 1520 M_IPMOPTS, M_NOWAIT|M_ZERO); 1521 if (nmships != NULL) { 1522 memcpy(nmships, omships, 1523 sizeof(*omships) * 1524 imo->imo_max_memberships); 1525 free(omships, M_IPMOPTS, 1526 sizeof(*omships) * 1527 imo->imo_max_memberships); 1528 imo->imo_membership = nmships; 1529 imo->imo_max_memberships = newmax; 1530 } 1531 } 1532 if (nmships == NULL) { 1533 error = ENOBUFS; 1534 if_put(ifp); 1535 break; 1536 } 1537 } 1538 /* 1539 * Everything looks good; add a new record to the multicast 1540 * address list for the given interface. 1541 */ 1542 if ((imo->imo_membership[i] = 1543 in_addmulti(&mreq->imr_multiaddr, ifp)) == NULL) { 1544 error = ENOBUFS; 1545 if_put(ifp); 1546 break; 1547 } 1548 ++imo->imo_num_memberships; 1549 if_put(ifp); 1550 break; 1551 1552 case IP_DROP_MEMBERSHIP: 1553 /* 1554 * Drop a multicast group membership. 1555 * Group must be a valid IP multicast address. 1556 */ 1557 if (m == NULL || m->m_len != sizeof(struct ip_mreq)) { 1558 error = EINVAL; 1559 break; 1560 } 1561 mreq = mtod(m, struct ip_mreq *); 1562 if (!IN_MULTICAST(mreq->imr_multiaddr.s_addr)) { 1563 error = EINVAL; 1564 break; 1565 } 1566 /* 1567 * If an interface address was specified, get a pointer 1568 * to its ifnet structure. 1569 */ 1570 if (mreq->imr_interface.s_addr == INADDR_ANY) 1571 ifp = NULL; 1572 else { 1573 memset(&sin, 0, sizeof(sin)); 1574 sin.sin_len = sizeof(sin); 1575 sin.sin_family = AF_INET; 1576 sin.sin_addr = mreq->imr_interface; 1577 ia = ifatoia(ifa_ifwithaddr(sintosa(&sin), rtableid)); 1578 if (ia == NULL) { 1579 error = EADDRNOTAVAIL; 1580 break; 1581 } 1582 ifp = ia->ia_ifp; 1583 } 1584 /* 1585 * Find the membership in the membership array. 1586 */ 1587 for (i = 0; i < imo->imo_num_memberships; ++i) { 1588 if ((ifp == NULL || 1589 imo->imo_membership[i]->inm_ifidx == 1590 ifp->if_index) && 1591 imo->imo_membership[i]->inm_addr.s_addr == 1592 mreq->imr_multiaddr.s_addr) 1593 break; 1594 } 1595 if (i == imo->imo_num_memberships) { 1596 error = EADDRNOTAVAIL; 1597 break; 1598 } 1599 /* 1600 * Give up the multicast address record to which the 1601 * membership points. 1602 */ 1603 in_delmulti(imo->imo_membership[i]); 1604 /* 1605 * Remove the gap in the membership array. 1606 */ 1607 for (++i; i < imo->imo_num_memberships; ++i) 1608 imo->imo_membership[i-1] = imo->imo_membership[i]; 1609 --imo->imo_num_memberships; 1610 break; 1611 1612 default: 1613 error = EOPNOTSUPP; 1614 break; 1615 } 1616 1617 /* 1618 * If all options have default values, no need to keep the data. 1619 */ 1620 if (imo->imo_ifidx == 0 && 1621 imo->imo_ttl == IP_DEFAULT_MULTICAST_TTL && 1622 imo->imo_loop == IP_DEFAULT_MULTICAST_LOOP && 1623 imo->imo_num_memberships == 0) { 1624 free(imo->imo_membership , M_IPMOPTS, 1625 imo->imo_max_memberships * sizeof(struct in_multi *)); 1626 free(*imop, M_IPMOPTS, sizeof(**imop)); 1627 *imop = NULL; 1628 } 1629 1630 return (error); 1631 } 1632 1633 /* 1634 * Return the IP multicast options in response to user getsockopt(). 1635 */ 1636 int 1637 ip_getmoptions(int optname, struct ip_moptions *imo, struct mbuf *m) 1638 { 1639 u_char *ttl; 1640 u_char *loop; 1641 struct in_addr *addr; 1642 struct in_ifaddr *ia; 1643 struct ifnet *ifp; 1644 1645 switch (optname) { 1646 1647 case IP_MULTICAST_IF: 1648 addr = mtod(m, struct in_addr *); 1649 m->m_len = sizeof(struct in_addr); 1650 if (imo == NULL || (ifp = if_get(imo->imo_ifidx)) == NULL) 1651 addr->s_addr = INADDR_ANY; 1652 else { 1653 IFP_TO_IA(ifp, ia); 1654 if_put(ifp); 1655 addr->s_addr = (ia == NULL) ? INADDR_ANY 1656 : ia->ia_addr.sin_addr.s_addr; 1657 } 1658 return (0); 1659 1660 case IP_MULTICAST_TTL: 1661 ttl = mtod(m, u_char *); 1662 m->m_len = 1; 1663 *ttl = (imo == NULL) ? IP_DEFAULT_MULTICAST_TTL 1664 : imo->imo_ttl; 1665 return (0); 1666 1667 case IP_MULTICAST_LOOP: 1668 loop = mtod(m, u_char *); 1669 m->m_len = 1; 1670 *loop = (imo == NULL) ? IP_DEFAULT_MULTICAST_LOOP 1671 : imo->imo_loop; 1672 return (0); 1673 1674 default: 1675 return (EOPNOTSUPP); 1676 } 1677 } 1678 1679 /* 1680 * Discard the IP multicast options. 1681 */ 1682 void 1683 ip_freemoptions(struct ip_moptions *imo) 1684 { 1685 int i; 1686 1687 if (imo != NULL) { 1688 for (i = 0; i < imo->imo_num_memberships; ++i) 1689 in_delmulti(imo->imo_membership[i]); 1690 free(imo->imo_membership, M_IPMOPTS, 1691 imo->imo_max_memberships * sizeof(struct in_multi *)); 1692 free(imo, M_IPMOPTS, sizeof(*imo)); 1693 } 1694 } 1695 1696 /* 1697 * Routine called from ip_output() to loop back a copy of an IP multicast 1698 * packet to the input queue of a specified interface. 1699 */ 1700 void 1701 ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst) 1702 { 1703 struct ip *ip; 1704 struct mbuf *copym; 1705 1706 copym = m_dup_pkt(m, max_linkhdr, M_DONTWAIT); 1707 if (copym != NULL) { 1708 /* 1709 * We don't bother to fragment if the IP length is greater 1710 * than the interface's MTU. Can this possibly matter? 1711 */ 1712 ip = mtod(copym, struct ip *); 1713 ip->ip_sum = 0; 1714 ip->ip_sum = in_cksum(copym, ip->ip_hl << 2); 1715 if_input_local(ifp, copym, dst->sin_family); 1716 } 1717 } 1718 1719 /* 1720 * Compute significant parts of the IPv4 checksum pseudo-header 1721 * for use in a delayed TCP/UDP checksum calculation. 1722 */ 1723 static __inline u_int16_t __attribute__((__unused__)) 1724 in_cksum_phdr(u_int32_t src, u_int32_t dst, u_int32_t lenproto) 1725 { 1726 u_int32_t sum; 1727 1728 sum = lenproto + 1729 (u_int16_t)(src >> 16) + 1730 (u_int16_t)(src /*& 0xffff*/) + 1731 (u_int16_t)(dst >> 16) + 1732 (u_int16_t)(dst /*& 0xffff*/); 1733 1734 sum = (u_int16_t)(sum >> 16) + (u_int16_t)(sum /*& 0xffff*/); 1735 1736 if (sum > 0xffff) 1737 sum -= 0xffff; 1738 1739 return (sum); 1740 } 1741 1742 /* 1743 * Process a delayed payload checksum calculation. 1744 */ 1745 void 1746 in_delayed_cksum(struct mbuf *m) 1747 { 1748 struct ip *ip; 1749 u_int16_t csum, offset; 1750 1751 ip = mtod(m, struct ip *); 1752 offset = ip->ip_hl << 2; 1753 csum = in4_cksum(m, 0, offset, m->m_pkthdr.len - offset); 1754 if (csum == 0 && ip->ip_p == IPPROTO_UDP) 1755 csum = 0xffff; 1756 1757 switch (ip->ip_p) { 1758 case IPPROTO_TCP: 1759 offset += offsetof(struct tcphdr, th_sum); 1760 break; 1761 1762 case IPPROTO_UDP: 1763 offset += offsetof(struct udphdr, uh_sum); 1764 break; 1765 1766 case IPPROTO_ICMP: 1767 offset += offsetof(struct icmp, icmp_cksum); 1768 break; 1769 1770 default: 1771 return; 1772 } 1773 1774 if ((offset + sizeof(u_int16_t)) > m->m_len) 1775 m_copyback(m, offset, sizeof(csum), &csum, M_NOWAIT); 1776 else 1777 *(u_int16_t *)(mtod(m, caddr_t) + offset) = csum; 1778 } 1779 1780 void 1781 in_proto_cksum_out(struct mbuf *m, struct ifnet *ifp) 1782 { 1783 struct ip *ip = mtod(m, struct ip *); 1784 1785 /* some hw and in_delayed_cksum need the pseudo header cksum */ 1786 if (m->m_pkthdr.csum_flags & 1787 (M_TCP_CSUM_OUT|M_UDP_CSUM_OUT|M_ICMP_CSUM_OUT)) { 1788 u_int16_t csum = 0, offset; 1789 1790 offset = ip->ip_hl << 2; 1791 if (m->m_pkthdr.csum_flags & (M_TCP_CSUM_OUT|M_UDP_CSUM_OUT)) 1792 csum = in_cksum_phdr(ip->ip_src.s_addr, 1793 ip->ip_dst.s_addr, htonl(ntohs(ip->ip_len) - 1794 offset + ip->ip_p)); 1795 if (ip->ip_p == IPPROTO_TCP) 1796 offset += offsetof(struct tcphdr, th_sum); 1797 else if (ip->ip_p == IPPROTO_UDP) 1798 offset += offsetof(struct udphdr, uh_sum); 1799 else if (ip->ip_p == IPPROTO_ICMP) 1800 offset += offsetof(struct icmp, icmp_cksum); 1801 if ((offset + sizeof(u_int16_t)) > m->m_len) 1802 m_copyback(m, offset, sizeof(csum), &csum, M_NOWAIT); 1803 else 1804 *(u_int16_t *)(mtod(m, caddr_t) + offset) = csum; 1805 } 1806 1807 if (m->m_pkthdr.csum_flags & M_TCP_CSUM_OUT) { 1808 if (!ifp || !(ifp->if_capabilities & IFCAP_CSUM_TCPv4) || 1809 ip->ip_hl != 5 || ifp->if_bridgeidx != 0) { 1810 tcpstat_inc(tcps_outswcsum); 1811 in_delayed_cksum(m); 1812 m->m_pkthdr.csum_flags &= ~M_TCP_CSUM_OUT; /* Clear */ 1813 } 1814 } else if (m->m_pkthdr.csum_flags & M_UDP_CSUM_OUT) { 1815 if (!ifp || !(ifp->if_capabilities & IFCAP_CSUM_UDPv4) || 1816 ip->ip_hl != 5 || ifp->if_bridgeidx != 0) { 1817 udpstat_inc(udps_outswcsum); 1818 in_delayed_cksum(m); 1819 m->m_pkthdr.csum_flags &= ~M_UDP_CSUM_OUT; /* Clear */ 1820 } 1821 } else if (m->m_pkthdr.csum_flags & M_ICMP_CSUM_OUT) { 1822 in_delayed_cksum(m); 1823 m->m_pkthdr.csum_flags &= ~M_ICMP_CSUM_OUT; /* Clear */ 1824 } 1825 } 1826