1 /* $NetBSD: ip_output.c,v 1.260 2016/08/01 03:15:30 ozaki-r Exp $ */ 2 3 /* 4 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. Neither the name of the project nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32 /*- 33 * Copyright (c) 1998 The NetBSD Foundation, Inc. 34 * All rights reserved. 35 * 36 * This code is derived from software contributed to The NetBSD Foundation 37 * by Public Access Networks Corporation ("Panix"). It was developed under 38 * contract to Panix by Eric Haszlakiewicz and Thor Lancelot Simon. 
39 * 40 * Redistribution and use in source and binary forms, with or without 41 * modification, are permitted provided that the following conditions 42 * are met: 43 * 1. Redistributions of source code must retain the above copyright 44 * notice, this list of conditions and the following disclaimer. 45 * 2. Redistributions in binary form must reproduce the above copyright 46 * notice, this list of conditions and the following disclaimer in the 47 * documentation and/or other materials provided with the distribution. 48 * 49 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 50 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 51 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 52 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 53 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 54 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 55 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 56 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 57 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 58 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 59 * POSSIBILITY OF SUCH DAMAGE. 60 */ 61 62 /* 63 * Copyright (c) 1982, 1986, 1988, 1990, 1993 64 * The Regents of the University of California. All rights reserved. 65 * 66 * Redistribution and use in source and binary forms, with or without 67 * modification, are permitted provided that the following conditions 68 * are met: 69 * 1. Redistributions of source code must retain the above copyright 70 * notice, this list of conditions and the following disclaimer. 71 * 2. Redistributions in binary form must reproduce the above copyright 72 * notice, this list of conditions and the following disclaimer in the 73 * documentation and/or other materials provided with the distribution. 
74 * 3. Neither the name of the University nor the names of its contributors 75 * may be used to endorse or promote products derived from this software 76 * without specific prior written permission. 77 * 78 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 79 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 80 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 81 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 82 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 83 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 84 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 85 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 86 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 87 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 88 * SUCH DAMAGE. 89 * 90 * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 91 */ 92 93 #include <sys/cdefs.h> 94 __KERNEL_RCSID(0, "$NetBSD: ip_output.c,v 1.260 2016/08/01 03:15:30 ozaki-r Exp $"); 95 96 #ifdef _KERNEL_OPT 97 #include "opt_inet.h" 98 #include "opt_ipsec.h" 99 #include "opt_mrouting.h" 100 #include "opt_net_mpsafe.h" 101 #include "opt_mpls.h" 102 #endif 103 104 #include <sys/param.h> 105 #include <sys/kmem.h> 106 #include <sys/mbuf.h> 107 #include <sys/protosw.h> 108 #include <sys/socket.h> 109 #include <sys/socketvar.h> 110 #include <sys/kauth.h> 111 #ifdef IPSEC 112 #include <sys/domain.h> 113 #endif 114 #include <sys/systm.h> 115 #include <sys/syslog.h> 116 117 #include <net/if.h> 118 #include <net/if_types.h> 119 #include <net/route.h> 120 #include <net/pfil.h> 121 122 #include <netinet/in.h> 123 #include <netinet/in_systm.h> 124 #include <netinet/ip.h> 125 #include <netinet/in_pcb.h> 126 #include <netinet/in_var.h> 127 #include <netinet/ip_var.h> 128 #include <netinet/ip_private.h> 129 
#include <netinet/in_offload.h>
#include <netinet/portalgo.h>
#include <netinet/udp.h>

#ifdef INET6
#include <netinet6/ip6_var.h>
#endif

#ifdef MROUTING
#include <netinet/ip_mroute.h>
#endif

#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/key.h>
#endif

#ifdef MPLS
#include <netmpls/mpls.h>
#include <netmpls/mpls_var.h>
#endif

static int ip_pcbopts(struct inpcb *, const struct sockopt *);
static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *);
static struct ifnet *ip_multicast_if(struct in_addr *, int *);
static void ip_mloopback(struct ifnet *, struct mbuf *,
    const struct sockaddr_in *);

extern pfil_head_t *inet_pfil_hook;			/* XXX */

int	ip_do_loopback_cksum = 0;

/*
 * Tag an outgoing packet as MPLS when its route asks for it.
 *
 * A PACKET_TAG_MPLS tag is prepended to m only if rt carries an AF_MPLS
 * route tag whose label is not the implicit-null label, the packet is
 * neither multicast nor broadcast, and ifp is an Ethernet interface.
 *
 * Returns 0 when nothing had to be done or the tag was attached, and
 * ENOMEM when the tag could not be allocated.  The mbuf is never freed
 * here.  Without the MPLS option this always returns 0.
 */
static int
ip_mark_mpls(struct ifnet * const ifp, struct mbuf * const m,
    const struct rtentry *rt)
{
#ifdef MPLS
	union mpls_shim msh;
	struct m_tag *mtag;

	/* No route, or the route carries no MPLS tag. */
	if (rt == NULL)
		return 0;
	if (rt_gettag(rt) == NULL || rt_gettag(rt)->sa_family != AF_MPLS)
		return 0;

	/* Multicast/broadcast packets and non-Ethernet links are skipped. */
	if ((m->m_flags & (M_MCAST | M_BCAST)) != 0)
		return 0;
	if (ifp->if_type != IFT_ETHER)
		return 0;

	msh.s_addr = MPLS_GETSADDR(rt);
	if (msh.shim.label == MPLS_LABEL_IMPLNULL)
		return 0;

	/*
	 * XXX tentative solution to tell ether_output
	 * it's MPLS. Need some more efficient solution.
	 */
	mtag = m_tag_get(PACKET_TAG_MPLS, sizeof(int) /* dummy */, M_NOWAIT);
	if (mtag == NULL)
		return ENOMEM;
	m_tag_prepend(m, mtag);
#endif
	return 0;
}

/*
 * Send an IP packet to a host.
195 */ 196 int 197 ip_if_output(struct ifnet * const ifp, struct mbuf * const m, 198 const struct sockaddr * const dst, const struct rtentry *rt) 199 { 200 int error = 0; 201 202 if (rt != NULL) { 203 error = rt_check_reject_route(rt, ifp); 204 if (error != 0) { 205 m_freem(m); 206 return error; 207 } 208 } 209 210 error = ip_mark_mpls(ifp, m, rt); 211 if (error != 0) { 212 m_freem(m); 213 return error; 214 } 215 216 error = if_output_lock(ifp, ifp, m, dst, rt); 217 218 return error; 219 } 220 221 /* 222 * IP output. The packet in mbuf chain m contains a skeletal IP 223 * header (with len, off, ttl, proto, tos, src, dst). 224 * The mbuf chain containing the packet will be freed. 225 * The mbuf opt, if present, will not be freed. 226 */ 227 int 228 ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro, int flags, 229 struct ip_moptions *imo, struct socket *so) 230 { 231 struct rtentry *rt; 232 struct ip *ip; 233 struct ifnet *ifp, *mifp = NULL; 234 struct mbuf *m = m0; 235 int hlen = sizeof (struct ip); 236 int len, error = 0; 237 struct route iproute; 238 const struct sockaddr_in *dst; 239 struct in_ifaddr *ia = NULL; 240 int isbroadcast; 241 int sw_csum; 242 u_long mtu; 243 #ifdef IPSEC 244 struct secpolicy *sp = NULL; 245 #endif 246 bool natt_frag = false; 247 bool rtmtu_nolock; 248 union { 249 struct sockaddr dst; 250 struct sockaddr_in dst4; 251 } u; 252 struct sockaddr *rdst = &u.dst; /* real IP destination, as opposed 253 * to the nexthop 254 */ 255 struct psref psref, psref_ia; 256 int bound; 257 bool bind_need_restore = false; 258 259 len = 0; 260 261 MCLAIM(m, &ip_tx_mowner); 262 263 KASSERT((m->m_flags & M_PKTHDR) != 0); 264 KASSERT((m->m_pkthdr.csum_flags & (M_CSUM_TCPv6|M_CSUM_UDPv6)) == 0); 265 KASSERT((m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) != 266 (M_CSUM_TCPv4|M_CSUM_UDPv4)); 267 268 if (opt) { 269 m = ip_insertoptions(m, opt, &len); 270 if (len >= sizeof(struct ip)) 271 hlen = len; 272 } 273 ip = mtod(m, struct ip *); 274 275 
/* 276 * Fill in IP header. 277 */ 278 if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { 279 ip->ip_v = IPVERSION; 280 ip->ip_off = htons(0); 281 /* ip->ip_id filled in after we find out source ia */ 282 ip->ip_hl = hlen >> 2; 283 IP_STATINC(IP_STAT_LOCALOUT); 284 } else { 285 hlen = ip->ip_hl << 2; 286 } 287 288 /* 289 * Route packet. 290 */ 291 if (ro == NULL) { 292 memset(&iproute, 0, sizeof(iproute)); 293 ro = &iproute; 294 } 295 sockaddr_in_init(&u.dst4, &ip->ip_dst, 0); 296 dst = satocsin(rtcache_getdst(ro)); 297 298 /* 299 * If there is a cached route, check that it is to the same 300 * destination and is still up. If not, free it and try again. 301 * The address family should also be checked in case of sharing 302 * the cache with IPv6. 303 */ 304 if (dst && (dst->sin_family != AF_INET || 305 !in_hosteq(dst->sin_addr, ip->ip_dst))) 306 rtcache_free(ro); 307 308 if ((rt = rtcache_validate(ro)) == NULL && 309 (rt = rtcache_update(ro, 1)) == NULL) { 310 dst = &u.dst4; 311 error = rtcache_setdst(ro, &u.dst); 312 if (error != 0) 313 goto bad; 314 } 315 316 bound = curlwp_bind(); 317 bind_need_restore = true; 318 /* 319 * If routing to interface only, short circuit routing lookup. 
320 */ 321 if (flags & IP_ROUTETOIF) { 322 struct ifaddr *ifa; 323 324 ifa = ifa_ifwithladdr_psref(sintocsa(dst), &psref_ia); 325 if (ifa == NULL) { 326 IP_STATINC(IP_STAT_NOROUTE); 327 error = ENETUNREACH; 328 goto bad; 329 } 330 /* ia is already referenced by psref_ia */ 331 ia = ifatoia(ifa); 332 333 ifp = ia->ia_ifp; 334 mtu = ifp->if_mtu; 335 ip->ip_ttl = 1; 336 isbroadcast = in_broadcast(dst->sin_addr, ifp); 337 } else if ((IN_MULTICAST(ip->ip_dst.s_addr) || 338 ip->ip_dst.s_addr == INADDR_BROADCAST) && 339 imo != NULL && imo->imo_multicast_if_index != 0) { 340 ifp = mifp = if_get_byindex(imo->imo_multicast_if_index, &psref); 341 if (ifp == NULL) { 342 IP_STATINC(IP_STAT_NOROUTE); 343 error = ENETUNREACH; 344 goto bad; 345 } 346 mtu = ifp->if_mtu; 347 ia = in_get_ia_from_ifp_psref(ifp, &psref_ia); 348 if (ia == NULL) { 349 error = EADDRNOTAVAIL; 350 goto bad; 351 } 352 isbroadcast = 0; 353 } else { 354 if (rt == NULL) 355 rt = rtcache_init(ro); 356 if (rt == NULL) { 357 IP_STATINC(IP_STAT_NOROUTE); 358 error = EHOSTUNREACH; 359 goto bad; 360 } 361 /* 362 * XXX NOMPSAFE: depends on accessing rt->rt_ifa isn't racy. 363 * Revisit when working on rtentry MP-ification. 364 */ 365 ifa_acquire(rt->rt_ifa, &psref_ia); 366 ia = ifatoia(rt->rt_ifa); 367 ifp = rt->rt_ifp; 368 if ((mtu = rt->rt_rmx.rmx_mtu) == 0) 369 mtu = ifp->if_mtu; 370 rt->rt_use++; 371 if (rt->rt_flags & RTF_GATEWAY) 372 dst = satosin(rt->rt_gateway); 373 if (rt->rt_flags & RTF_HOST) 374 isbroadcast = rt->rt_flags & RTF_BROADCAST; 375 else 376 isbroadcast = in_broadcast(dst->sin_addr, ifp); 377 } 378 rtmtu_nolock = rt && (rt->rt_rmx.rmx_locks & RTV_MTU) == 0; 379 380 if (IN_MULTICAST(ip->ip_dst.s_addr) || 381 (ip->ip_dst.s_addr == INADDR_BROADCAST)) { 382 bool inmgroup; 383 384 m->m_flags |= (ip->ip_dst.s_addr == INADDR_BROADCAST) ? 
385 M_BCAST : M_MCAST; 386 /* 387 * See if the caller provided any multicast options 388 */ 389 if (imo != NULL) 390 ip->ip_ttl = imo->imo_multicast_ttl; 391 else 392 ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; 393 394 /* 395 * if we don't know the outgoing ifp yet, we can't generate 396 * output 397 */ 398 if (!ifp) { 399 IP_STATINC(IP_STAT_NOROUTE); 400 error = ENETUNREACH; 401 goto bad; 402 } 403 404 /* 405 * If the packet is multicast or broadcast, confirm that 406 * the outgoing interface can transmit it. 407 */ 408 if (((m->m_flags & M_MCAST) && 409 (ifp->if_flags & IFF_MULTICAST) == 0) || 410 ((m->m_flags & M_BCAST) && 411 (ifp->if_flags & (IFF_BROADCAST|IFF_POINTOPOINT)) == 0)) { 412 IP_STATINC(IP_STAT_NOROUTE); 413 error = ENETUNREACH; 414 goto bad; 415 } 416 /* 417 * If source address not specified yet, use an address 418 * of outgoing interface. 419 */ 420 if (in_nullhost(ip->ip_src)) { 421 struct in_ifaddr *xia; 422 struct ifaddr *xifa; 423 struct psref _psref; 424 425 xia = in_get_ia_from_ifp_psref(ifp, &_psref); 426 if (!xia) { 427 error = EADDRNOTAVAIL; 428 goto bad; 429 } 430 xifa = &xia->ia_ifa; 431 if (xifa->ifa_getifa != NULL) { 432 ia4_release(xia, &_psref); 433 /* FIXME NOMPSAFE */ 434 xia = ifatoia((*xifa->ifa_getifa)(xifa, rdst)); 435 if (xia == NULL) { 436 error = EADDRNOTAVAIL; 437 goto bad; 438 } 439 ia4_acquire(xia, &_psref); 440 } 441 ip->ip_src = xia->ia_addr.sin_addr; 442 ia4_release(xia, &_psref); 443 } 444 445 inmgroup = in_multi_group(ip->ip_dst, ifp, flags); 446 if (inmgroup && (imo == NULL || imo->imo_multicast_loop)) { 447 /* 448 * If we belong to the destination multicast group 449 * on the outgoing interface, and the caller did not 450 * forbid loopback, loop back a copy. 451 */ 452 ip_mloopback(ifp, m, &u.dst4); 453 } 454 #ifdef MROUTING 455 else { 456 /* 457 * If we are acting as a multicast router, perform 458 * multicast forwarding as if the packet had just 459 * arrived on the interface to which we are about 460 * to send. 
The multicast forwarding function 461 * recursively calls this function, using the 462 * IP_FORWARDING flag to prevent infinite recursion. 463 * 464 * Multicasts that are looped back by ip_mloopback(), 465 * above, will be forwarded by the ip_input() routine, 466 * if necessary. 467 */ 468 extern struct socket *ip_mrouter; 469 470 if (ip_mrouter && (flags & IP_FORWARDING) == 0) { 471 if (ip_mforward(m, ifp) != 0) { 472 m_freem(m); 473 goto done; 474 } 475 } 476 } 477 #endif 478 /* 479 * Multicasts with a time-to-live of zero may be looped- 480 * back, above, but must not be transmitted on a network. 481 * Also, multicasts addressed to the loopback interface 482 * are not sent -- the above call to ip_mloopback() will 483 * loop back a copy if this host actually belongs to the 484 * destination group on the loopback interface. 485 */ 486 if (ip->ip_ttl == 0 || (ifp->if_flags & IFF_LOOPBACK) != 0) { 487 m_freem(m); 488 goto done; 489 } 490 goto sendit; 491 } 492 493 /* 494 * If source address not specified yet, use address 495 * of outgoing interface. 496 */ 497 if (in_nullhost(ip->ip_src)) { 498 struct ifaddr *xifa; 499 500 xifa = &ia->ia_ifa; 501 if (xifa->ifa_getifa != NULL) { 502 ia4_release(ia, &psref_ia); 503 /* FIXME NOMPSAFE */ 504 ia = ifatoia((*xifa->ifa_getifa)(xifa, rdst)); 505 if (ia == NULL) { 506 error = EADDRNOTAVAIL; 507 goto bad; 508 } 509 ia4_acquire(ia, &psref_ia); 510 } 511 ip->ip_src = ia->ia_addr.sin_addr; 512 } 513 514 /* 515 * packets with Class-D address as source are not valid per 516 * RFC 1112 517 */ 518 if (IN_MULTICAST(ip->ip_src.s_addr)) { 519 IP_STATINC(IP_STAT_ODROPPED); 520 error = EADDRNOTAVAIL; 521 goto bad; 522 } 523 524 /* 525 * Look for broadcast address and and verify user is allowed to 526 * send such a packet. 
527 */ 528 if (isbroadcast) { 529 if ((ifp->if_flags & IFF_BROADCAST) == 0) { 530 error = EADDRNOTAVAIL; 531 goto bad; 532 } 533 if ((flags & IP_ALLOWBROADCAST) == 0) { 534 error = EACCES; 535 goto bad; 536 } 537 /* don't allow broadcast messages to be fragmented */ 538 if (ntohs(ip->ip_len) > ifp->if_mtu) { 539 error = EMSGSIZE; 540 goto bad; 541 } 542 m->m_flags |= M_BCAST; 543 } else 544 m->m_flags &= ~M_BCAST; 545 546 sendit: 547 if ((flags & (IP_FORWARDING|IP_NOIPNEWID)) == 0) { 548 if (m->m_pkthdr.len < IP_MINFRAGSIZE) { 549 ip->ip_id = 0; 550 } else if ((m->m_pkthdr.csum_flags & M_CSUM_TSOv4) == 0) { 551 ip->ip_id = ip_newid(ia); 552 } else { 553 554 /* 555 * TSO capable interfaces (typically?) increment 556 * ip_id for each segment. 557 * "allocate" enough ids here to increase the chance 558 * for them to be unique. 559 * 560 * note that the following calculation is not 561 * needed to be precise. wasting some ip_id is fine. 562 */ 563 564 unsigned int segsz = m->m_pkthdr.segsz; 565 unsigned int datasz = ntohs(ip->ip_len) - hlen; 566 unsigned int num = howmany(datasz, segsz); 567 568 ip->ip_id = ip_newid_range(ia, num); 569 } 570 } 571 if (ia != NULL) { 572 ia4_release(ia, &psref_ia); 573 ia = NULL; 574 } 575 576 /* 577 * If we're doing Path MTU Discovery, we need to set DF unless 578 * the route's MTU is locked. 579 */ 580 if ((flags & IP_MTUDISC) != 0 && rtmtu_nolock) { 581 ip->ip_off |= htons(IP_DF); 582 } 583 584 #ifdef IPSEC 585 if (ipsec_used) { 586 bool ipsec_done = false; 587 588 /* Perform IPsec processing, if any. */ 589 error = ipsec4_output(m, so, flags, &sp, &mtu, &natt_frag, 590 &ipsec_done); 591 if (error || ipsec_done) 592 goto done; 593 } 594 #endif 595 596 /* 597 * Run through list of hooks for output packets. 
598 */ 599 error = pfil_run_hooks(inet_pfil_hook, &m, ifp, PFIL_OUT); 600 if (error) 601 goto done; 602 if (m == NULL) 603 goto done; 604 605 ip = mtod(m, struct ip *); 606 hlen = ip->ip_hl << 2; 607 608 m->m_pkthdr.csum_data |= hlen << 16; 609 610 #if IFA_STATS 611 /* 612 * search for the source address structure to 613 * maintain output statistics. 614 */ 615 KASSERT(ia == NULL); 616 ia = in_get_ia_psref(ip->ip_src, &psref_ia); 617 #endif 618 619 /* Maybe skip checksums on loopback interfaces. */ 620 if (IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4)) { 621 m->m_pkthdr.csum_flags |= M_CSUM_IPv4; 622 } 623 sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_csum_flags_tx; 624 /* 625 * If small enough for mtu of path, or if using TCP segmentation 626 * offload, can just send directly. 627 */ 628 if (ntohs(ip->ip_len) <= mtu || 629 (m->m_pkthdr.csum_flags & M_CSUM_TSOv4) != 0) { 630 const struct sockaddr *sa; 631 632 #if IFA_STATS 633 if (ia) 634 ia->ia_ifa.ifa_data.ifad_outbytes += ntohs(ip->ip_len); 635 #endif 636 /* 637 * Always initialize the sum to 0! Some HW assisted 638 * checksumming requires this. 639 */ 640 ip->ip_sum = 0; 641 642 if ((m->m_pkthdr.csum_flags & M_CSUM_TSOv4) == 0) { 643 /* 644 * Perform any checksums that the hardware can't do 645 * for us. 646 * 647 * XXX Does any hardware require the {th,uh}_sum 648 * XXX fields to be 0? 649 */ 650 if (sw_csum & M_CSUM_IPv4) { 651 KASSERT(IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4)); 652 ip->ip_sum = in_cksum(m, hlen); 653 m->m_pkthdr.csum_flags &= ~M_CSUM_IPv4; 654 } 655 if (sw_csum & (M_CSUM_TCPv4|M_CSUM_UDPv4)) { 656 if (IN_NEED_CHECKSUM(ifp, 657 sw_csum & (M_CSUM_TCPv4|M_CSUM_UDPv4))) { 658 in_delayed_cksum(m); 659 } 660 m->m_pkthdr.csum_flags &= 661 ~(M_CSUM_TCPv4|M_CSUM_UDPv4); 662 } 663 } 664 665 sa = (m->m_flags & M_MCAST) ? 
sintocsa(rdst) : sintocsa(dst); 666 if (__predict_true( 667 (m->m_pkthdr.csum_flags & M_CSUM_TSOv4) == 0 || 668 (ifp->if_capenable & IFCAP_TSOv4) != 0)) { 669 error = ip_if_output(ifp, m, sa, rt); 670 } else { 671 error = ip_tso_output(ifp, m, sa, rt); 672 } 673 goto done; 674 } 675 676 /* 677 * We can't use HW checksumming if we're about to 678 * to fragment the packet. 679 * 680 * XXX Some hardware can do this. 681 */ 682 if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) { 683 if (IN_NEED_CHECKSUM(ifp, 684 m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4))) { 685 in_delayed_cksum(m); 686 } 687 m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4); 688 } 689 690 /* 691 * Too large for interface; fragment if possible. 692 * Must be able to put at least 8 bytes per fragment. 693 */ 694 if (ntohs(ip->ip_off) & IP_DF) { 695 if (flags & IP_RETURNMTU) { 696 struct inpcb *inp; 697 698 KASSERT(so && solocked(so)); 699 inp = sotoinpcb(so); 700 inp->inp_errormtu = mtu; 701 } 702 error = EMSGSIZE; 703 IP_STATINC(IP_STAT_CANTFRAG); 704 goto bad; 705 } 706 707 error = ip_fragment(m, ifp, mtu); 708 if (error) { 709 m = NULL; 710 goto bad; 711 } 712 713 for (; m; m = m0) { 714 m0 = m->m_nextpkt; 715 m->m_nextpkt = 0; 716 if (error) { 717 m_freem(m); 718 continue; 719 } 720 #if IFA_STATS 721 if (ia) 722 ia->ia_ifa.ifa_data.ifad_outbytes += ntohs(ip->ip_len); 723 #endif 724 /* 725 * If we get there, the packet has not been handled by 726 * IPsec whereas it should have. Now that it has been 727 * fragmented, re-inject it in ip_output so that IPsec 728 * processing can occur. 729 */ 730 if (natt_frag) { 731 error = ip_output(m, opt, ro, 732 flags | IP_RAWOUTPUT | IP_NOIPNEWID, 733 imo, so); 734 } else { 735 KASSERT((m->m_pkthdr.csum_flags & 736 (M_CSUM_UDPv4 | M_CSUM_TCPv4)) == 0); 737 error = ip_if_output(ifp, m, 738 (m->m_flags & M_MCAST) ? 
739 sintocsa(rdst) : sintocsa(dst), rt); 740 } 741 } 742 if (error == 0) { 743 IP_STATINC(IP_STAT_FRAGMENTED); 744 } 745 done: 746 ia4_release(ia, &psref_ia); 747 if (ro == &iproute) { 748 rtcache_free(&iproute); 749 } 750 #ifdef IPSEC 751 if (sp) { 752 KEY_FREESP(&sp); 753 } 754 #endif 755 if (mifp != NULL) { 756 if_put(mifp, &psref); 757 } 758 if (bind_need_restore) 759 curlwp_bindx(bound); 760 return error; 761 bad: 762 m_freem(m); 763 goto done; 764 } 765 766 int 767 ip_fragment(struct mbuf *m, struct ifnet *ifp, u_long mtu) 768 { 769 struct ip *ip, *mhip; 770 struct mbuf *m0; 771 int len, hlen, off; 772 int mhlen, firstlen; 773 struct mbuf **mnext; 774 int sw_csum = m->m_pkthdr.csum_flags; 775 int fragments = 0; 776 int s; 777 int error = 0; 778 779 ip = mtod(m, struct ip *); 780 hlen = ip->ip_hl << 2; 781 if (ifp != NULL) 782 sw_csum &= ~ifp->if_csum_flags_tx; 783 784 len = (mtu - hlen) &~ 7; 785 if (len < 8) { 786 m_freem(m); 787 return (EMSGSIZE); 788 } 789 790 firstlen = len; 791 mnext = &m->m_nextpkt; 792 793 /* 794 * Loop through length of segment after first fragment, 795 * make new header and copy data of each part and link onto chain. 
796 */ 797 m0 = m; 798 mhlen = sizeof (struct ip); 799 for (off = hlen + len; off < ntohs(ip->ip_len); off += len) { 800 MGETHDR(m, M_DONTWAIT, MT_HEADER); 801 if (m == 0) { 802 error = ENOBUFS; 803 IP_STATINC(IP_STAT_ODROPPED); 804 goto sendorfree; 805 } 806 MCLAIM(m, m0->m_owner); 807 *mnext = m; 808 mnext = &m->m_nextpkt; 809 m->m_data += max_linkhdr; 810 mhip = mtod(m, struct ip *); 811 *mhip = *ip; 812 /* we must inherit MCAST and BCAST flags */ 813 m->m_flags |= m0->m_flags & (M_MCAST|M_BCAST); 814 if (hlen > sizeof (struct ip)) { 815 mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); 816 mhip->ip_hl = mhlen >> 2; 817 } 818 m->m_len = mhlen; 819 mhip->ip_off = ((off - hlen) >> 3) + 820 (ntohs(ip->ip_off) & ~IP_MF); 821 if (ip->ip_off & htons(IP_MF)) 822 mhip->ip_off |= IP_MF; 823 if (off + len >= ntohs(ip->ip_len)) 824 len = ntohs(ip->ip_len) - off; 825 else 826 mhip->ip_off |= IP_MF; 827 HTONS(mhip->ip_off); 828 mhip->ip_len = htons((u_int16_t)(len + mhlen)); 829 m->m_next = m_copym(m0, off, len, M_DONTWAIT); 830 if (m->m_next == 0) { 831 error = ENOBUFS; /* ??? */ 832 IP_STATINC(IP_STAT_ODROPPED); 833 goto sendorfree; 834 } 835 m->m_pkthdr.len = mhlen + len; 836 m_reset_rcvif(m); 837 mhip->ip_sum = 0; 838 KASSERT((m->m_pkthdr.csum_flags & M_CSUM_IPv4) == 0); 839 if (sw_csum & M_CSUM_IPv4) { 840 mhip->ip_sum = in_cksum(m, mhlen); 841 } else { 842 /* 843 * checksum is hw-offloaded or not necessary. 844 */ 845 m->m_pkthdr.csum_flags |= 846 m0->m_pkthdr.csum_flags & M_CSUM_IPv4; 847 m->m_pkthdr.csum_data |= mhlen << 16; 848 KASSERT(!(ifp != NULL && 849 IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4)) || 850 (m->m_pkthdr.csum_flags & M_CSUM_IPv4) != 0); 851 } 852 IP_STATINC(IP_STAT_OFRAGMENTS); 853 fragments++; 854 } 855 /* 856 * Update first fragment by trimming what's been copied out 857 * and updating header, then send each fragment (in order). 
858 */ 859 m = m0; 860 m_adj(m, hlen + firstlen - ntohs(ip->ip_len)); 861 m->m_pkthdr.len = hlen + firstlen; 862 ip->ip_len = htons((u_int16_t)m->m_pkthdr.len); 863 ip->ip_off |= htons(IP_MF); 864 ip->ip_sum = 0; 865 if (sw_csum & M_CSUM_IPv4) { 866 ip->ip_sum = in_cksum(m, hlen); 867 m->m_pkthdr.csum_flags &= ~M_CSUM_IPv4; 868 } else { 869 /* 870 * checksum is hw-offloaded or not necessary. 871 */ 872 KASSERT(!(ifp != NULL && IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4)) || 873 (m->m_pkthdr.csum_flags & M_CSUM_IPv4) != 0); 874 KASSERT(M_CSUM_DATA_IPv4_IPHL(m->m_pkthdr.csum_data) >= 875 sizeof(struct ip)); 876 } 877 sendorfree: 878 /* 879 * If there is no room for all the fragments, don't queue 880 * any of them. 881 */ 882 if (ifp != NULL) { 883 s = splnet(); 884 if (ifp->if_snd.ifq_maxlen - ifp->if_snd.ifq_len < fragments && 885 error == 0) { 886 error = ENOBUFS; 887 IP_STATINC(IP_STAT_ODROPPED); 888 IFQ_INC_DROPS(&ifp->if_snd); 889 } 890 splx(s); 891 } 892 if (error) { 893 for (m = m0; m; m = m0) { 894 m0 = m->m_nextpkt; 895 m->m_nextpkt = NULL; 896 m_freem(m); 897 } 898 } 899 return (error); 900 } 901 902 /* 903 * Process a delayed payload checksum calculation. 
904 */ 905 void 906 in_delayed_cksum(struct mbuf *m) 907 { 908 struct ip *ip; 909 u_int16_t csum, offset; 910 911 ip = mtod(m, struct ip *); 912 offset = ip->ip_hl << 2; 913 csum = in4_cksum(m, 0, offset, ntohs(ip->ip_len) - offset); 914 if (csum == 0 && (m->m_pkthdr.csum_flags & M_CSUM_UDPv4) != 0) 915 csum = 0xffff; 916 917 offset += M_CSUM_DATA_IPv4_OFFSET(m->m_pkthdr.csum_data); 918 919 if ((offset + sizeof(u_int16_t)) > m->m_len) { 920 /* This happen when ip options were inserted 921 printf("in_delayed_cksum: pullup len %d off %d proto %d\n", 922 m->m_len, offset, ip->ip_p); 923 */ 924 m_copyback(m, offset, sizeof(csum), (void *) &csum); 925 } else 926 *(u_int16_t *)(mtod(m, char *) + offset) = csum; 927 } 928 929 /* 930 * Determine the maximum length of the options to be inserted; 931 * we would far rather allocate too much space rather than too little. 932 */ 933 934 u_int 935 ip_optlen(struct inpcb *inp) 936 { 937 struct mbuf *m = inp->inp_options; 938 939 if (m && m->m_len > offsetof(struct ipoption, ipopt_dst)) { 940 return (m->m_len - offsetof(struct ipoption, ipopt_dst)); 941 } 942 return 0; 943 } 944 945 /* 946 * Insert IP options into preformed packet. 947 * Adjust IP destination as required for IP source routing, 948 * as indicated by a non-zero in_addr at the start of the options. 
949 */ 950 static struct mbuf * 951 ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen) 952 { 953 struct ipoption *p = mtod(opt, struct ipoption *); 954 struct mbuf *n; 955 struct ip *ip = mtod(m, struct ip *); 956 unsigned optlen; 957 958 optlen = opt->m_len - sizeof(p->ipopt_dst); 959 if (optlen + ntohs(ip->ip_len) > IP_MAXPACKET) 960 return (m); /* XXX should fail */ 961 if (!in_nullhost(p->ipopt_dst)) 962 ip->ip_dst = p->ipopt_dst; 963 if (M_READONLY(m) || M_LEADINGSPACE(m) < optlen) { 964 MGETHDR(n, M_DONTWAIT, MT_HEADER); 965 if (n == 0) 966 return (m); 967 MCLAIM(n, m->m_owner); 968 M_MOVE_PKTHDR(n, m); 969 m->m_len -= sizeof(struct ip); 970 m->m_data += sizeof(struct ip); 971 n->m_next = m; 972 m = n; 973 m->m_len = optlen + sizeof(struct ip); 974 m->m_data += max_linkhdr; 975 bcopy((void *)ip, mtod(m, void *), sizeof(struct ip)); 976 } else { 977 m->m_data -= optlen; 978 m->m_len += optlen; 979 memmove(mtod(m, void *), ip, sizeof(struct ip)); 980 } 981 m->m_pkthdr.len += optlen; 982 ip = mtod(m, struct ip *); 983 bcopy((void *)p->ipopt_list, (void *)(ip + 1), (unsigned)optlen); 984 *phlen = sizeof(struct ip) + optlen; 985 ip->ip_len = htons(ntohs(ip->ip_len) + optlen); 986 return (m); 987 } 988 989 /* 990 * Copy options from ip to jp, 991 * omitting those not copied during fragmentation. 992 */ 993 int 994 ip_optcopy(struct ip *ip, struct ip *jp) 995 { 996 u_char *cp, *dp; 997 int opt, optlen, cnt; 998 999 cp = (u_char *)(ip + 1); 1000 dp = (u_char *)(jp + 1); 1001 cnt = (ip->ip_hl << 2) - sizeof (struct ip); 1002 for (; cnt > 0; cnt -= optlen, cp += optlen) { 1003 opt = cp[0]; 1004 if (opt == IPOPT_EOL) 1005 break; 1006 if (opt == IPOPT_NOP) { 1007 /* Preserve for IP mcast tunnel's LSRR alignment. 
*/ 1008 *dp++ = IPOPT_NOP; 1009 optlen = 1; 1010 continue; 1011 } 1012 1013 KASSERT(cnt >= IPOPT_OLEN + sizeof(*cp)); 1014 optlen = cp[IPOPT_OLEN]; 1015 KASSERT(optlen >= IPOPT_OLEN + sizeof(*cp) && optlen < cnt); 1016 1017 /* Invalid lengths should have been caught by ip_dooptions. */ 1018 if (optlen > cnt) 1019 optlen = cnt; 1020 if (IPOPT_COPIED(opt)) { 1021 bcopy((void *)cp, (void *)dp, (unsigned)optlen); 1022 dp += optlen; 1023 } 1024 } 1025 for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++) 1026 *dp++ = IPOPT_EOL; 1027 return (optlen); 1028 } 1029 1030 /* 1031 * IP socket option processing. 1032 */ 1033 int 1034 ip_ctloutput(int op, struct socket *so, struct sockopt *sopt) 1035 { 1036 struct inpcb *inp = sotoinpcb(so); 1037 struct ip *ip = &inp->inp_ip; 1038 int inpflags = inp->inp_flags; 1039 int optval = 0, error = 0; 1040 1041 if (sopt->sopt_level != IPPROTO_IP) { 1042 if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_NOHEADER) 1043 return 0; 1044 return ENOPROTOOPT; 1045 } 1046 1047 switch (op) { 1048 case PRCO_SETOPT: 1049 switch (sopt->sopt_name) { 1050 case IP_OPTIONS: 1051 #ifdef notyet 1052 case IP_RETOPTS: 1053 #endif 1054 error = ip_pcbopts(inp, sopt); 1055 break; 1056 1057 case IP_TOS: 1058 case IP_TTL: 1059 case IP_MINTTL: 1060 case IP_PKTINFO: 1061 case IP_RECVOPTS: 1062 case IP_RECVRETOPTS: 1063 case IP_RECVDSTADDR: 1064 case IP_RECVIF: 1065 case IP_RECVPKTINFO: 1066 case IP_RECVTTL: 1067 error = sockopt_getint(sopt, &optval); 1068 if (error) 1069 break; 1070 1071 switch (sopt->sopt_name) { 1072 case IP_TOS: 1073 ip->ip_tos = optval; 1074 break; 1075 1076 case IP_TTL: 1077 ip->ip_ttl = optval; 1078 break; 1079 1080 case IP_MINTTL: 1081 if (optval > 0 && optval <= MAXTTL) 1082 inp->inp_ip_minttl = optval; 1083 else 1084 error = EINVAL; 1085 break; 1086 #define OPTSET(bit) \ 1087 if (optval) \ 1088 inpflags |= bit; \ 1089 else \ 1090 inpflags &= ~bit; 1091 1092 case IP_PKTINFO: 1093 OPTSET(INP_PKTINFO); 1094 break; 1095 1096 
case IP_RECVOPTS: 1097 OPTSET(INP_RECVOPTS); 1098 break; 1099 1100 case IP_RECVPKTINFO: 1101 OPTSET(INP_RECVPKTINFO); 1102 break; 1103 1104 case IP_RECVRETOPTS: 1105 OPTSET(INP_RECVRETOPTS); 1106 break; 1107 1108 case IP_RECVDSTADDR: 1109 OPTSET(INP_RECVDSTADDR); 1110 break; 1111 1112 case IP_RECVIF: 1113 OPTSET(INP_RECVIF); 1114 break; 1115 1116 case IP_RECVTTL: 1117 OPTSET(INP_RECVTTL); 1118 break; 1119 } 1120 break; 1121 #undef OPTSET 1122 1123 case IP_MULTICAST_IF: 1124 case IP_MULTICAST_TTL: 1125 case IP_MULTICAST_LOOP: 1126 case IP_ADD_MEMBERSHIP: 1127 case IP_DROP_MEMBERSHIP: 1128 error = ip_setmoptions(&inp->inp_moptions, sopt); 1129 break; 1130 1131 case IP_PORTRANGE: 1132 error = sockopt_getint(sopt, &optval); 1133 if (error) 1134 break; 1135 1136 switch (optval) { 1137 case IP_PORTRANGE_DEFAULT: 1138 case IP_PORTRANGE_HIGH: 1139 inpflags &= ~(INP_LOWPORT); 1140 break; 1141 1142 case IP_PORTRANGE_LOW: 1143 inpflags |= INP_LOWPORT; 1144 break; 1145 1146 default: 1147 error = EINVAL; 1148 break; 1149 } 1150 break; 1151 1152 case IP_PORTALGO: 1153 error = sockopt_getint(sopt, &optval); 1154 if (error) 1155 break; 1156 1157 error = portalgo_algo_index_select( 1158 (struct inpcb_hdr *)inp, optval); 1159 break; 1160 1161 #if defined(IPSEC) 1162 case IP_IPSEC_POLICY: 1163 if (ipsec_enabled) { 1164 error = ipsec4_set_policy(inp, sopt->sopt_name, 1165 sopt->sopt_data, sopt->sopt_size, 1166 curlwp->l_cred); 1167 break; 1168 } 1169 /*FALLTHROUGH*/ 1170 #endif /* IPSEC */ 1171 1172 default: 1173 error = ENOPROTOOPT; 1174 break; 1175 } 1176 break; 1177 1178 case PRCO_GETOPT: 1179 switch (sopt->sopt_name) { 1180 case IP_OPTIONS: 1181 case IP_RETOPTS: { 1182 struct mbuf *mopts = inp->inp_options; 1183 1184 if (mopts) { 1185 struct mbuf *m; 1186 1187 m = m_copym(mopts, 0, M_COPYALL, M_DONTWAIT); 1188 if (m == NULL) { 1189 error = ENOBUFS; 1190 break; 1191 } 1192 error = sockopt_setmbuf(sopt, m); 1193 } 1194 break; 1195 } 1196 case IP_PKTINFO: 1197 case IP_TOS: 1198 case 
IP_TTL: 1199 case IP_MINTTL: 1200 case IP_RECVOPTS: 1201 case IP_RECVRETOPTS: 1202 case IP_RECVDSTADDR: 1203 case IP_RECVIF: 1204 case IP_RECVPKTINFO: 1205 case IP_RECVTTL: 1206 case IP_ERRORMTU: 1207 switch (sopt->sopt_name) { 1208 case IP_TOS: 1209 optval = ip->ip_tos; 1210 break; 1211 1212 case IP_TTL: 1213 optval = ip->ip_ttl; 1214 break; 1215 1216 case IP_MINTTL: 1217 optval = inp->inp_ip_minttl; 1218 break; 1219 1220 case IP_ERRORMTU: 1221 optval = inp->inp_errormtu; 1222 break; 1223 1224 #define OPTBIT(bit) (inpflags & bit ? 1 : 0) 1225 1226 case IP_PKTINFO: 1227 optval = OPTBIT(INP_PKTINFO); 1228 break; 1229 1230 case IP_RECVOPTS: 1231 optval = OPTBIT(INP_RECVOPTS); 1232 break; 1233 1234 case IP_RECVPKTINFO: 1235 optval = OPTBIT(INP_RECVPKTINFO); 1236 break; 1237 1238 case IP_RECVRETOPTS: 1239 optval = OPTBIT(INP_RECVRETOPTS); 1240 break; 1241 1242 case IP_RECVDSTADDR: 1243 optval = OPTBIT(INP_RECVDSTADDR); 1244 break; 1245 1246 case IP_RECVIF: 1247 optval = OPTBIT(INP_RECVIF); 1248 break; 1249 1250 case IP_RECVTTL: 1251 optval = OPTBIT(INP_RECVTTL); 1252 break; 1253 } 1254 error = sockopt_setint(sopt, optval); 1255 break; 1256 1257 #if 0 /* defined(IPSEC) */ 1258 case IP_IPSEC_POLICY: 1259 { 1260 struct mbuf *m = NULL; 1261 1262 /* XXX this will return EINVAL as sopt is empty */ 1263 error = ipsec4_get_policy(inp, sopt->sopt_data, 1264 sopt->sopt_size, &m); 1265 if (error == 0) 1266 error = sockopt_setmbuf(sopt, m); 1267 break; 1268 } 1269 #endif /*IPSEC*/ 1270 1271 case IP_MULTICAST_IF: 1272 case IP_MULTICAST_TTL: 1273 case IP_MULTICAST_LOOP: 1274 case IP_ADD_MEMBERSHIP: 1275 case IP_DROP_MEMBERSHIP: 1276 error = ip_getmoptions(inp->inp_moptions, sopt); 1277 break; 1278 1279 case IP_PORTRANGE: 1280 if (inpflags & INP_LOWPORT) 1281 optval = IP_PORTRANGE_LOW; 1282 else 1283 optval = IP_PORTRANGE_DEFAULT; 1284 error = sockopt_setint(sopt, optval); 1285 break; 1286 1287 case IP_PORTALGO: 1288 optval = inp->inp_portalgo; 1289 error = sockopt_setint(sopt, 
optval); 1290 break; 1291 1292 default: 1293 error = ENOPROTOOPT; 1294 break; 1295 } 1296 break; 1297 } 1298 1299 if (!error) { 1300 inp->inp_flags = inpflags; 1301 } 1302 return error; 1303 } 1304 1305 /* 1306 * Set up IP options in pcb for insertion in output packets. 1307 * Store in mbuf with pointer in pcbopt, adding pseudo-option 1308 * with destination address if source routed. 1309 */ 1310 static int 1311 ip_pcbopts(struct inpcb *inp, const struct sockopt *sopt) 1312 { 1313 struct mbuf *m; 1314 const u_char *cp; 1315 u_char *dp; 1316 int cnt; 1317 1318 /* Turn off any old options. */ 1319 if (inp->inp_options) { 1320 m_free(inp->inp_options); 1321 } 1322 inp->inp_options = NULL; 1323 if ((cnt = sopt->sopt_size) == 0) { 1324 /* Only turning off any previous options. */ 1325 return 0; 1326 } 1327 cp = sopt->sopt_data; 1328 1329 #ifndef __vax__ 1330 if (cnt % sizeof(int32_t)) 1331 return (EINVAL); 1332 #endif 1333 1334 m = m_get(M_DONTWAIT, MT_SOOPTS); 1335 if (m == NULL) 1336 return (ENOBUFS); 1337 1338 dp = mtod(m, u_char *); 1339 memset(dp, 0, sizeof(struct in_addr)); 1340 dp += sizeof(struct in_addr); 1341 m->m_len = sizeof(struct in_addr); 1342 1343 /* 1344 * IP option list according to RFC791. Each option is of the form 1345 * 1346 * [optval] [olen] [(olen - 2) data bytes] 1347 * 1348 * We validate the list and copy options to an mbuf for prepending 1349 * to data packets. The IP first-hop destination address will be 1350 * stored before actual options and is zero if unset. 
1351 */ 1352 while (cnt > 0) { 1353 uint8_t optval, olen, offset; 1354 1355 optval = cp[IPOPT_OPTVAL]; 1356 1357 if (optval == IPOPT_EOL || optval == IPOPT_NOP) { 1358 olen = 1; 1359 } else { 1360 if (cnt < IPOPT_OLEN + 1) 1361 goto bad; 1362 1363 olen = cp[IPOPT_OLEN]; 1364 if (olen < IPOPT_OLEN + 1 || olen > cnt) 1365 goto bad; 1366 } 1367 1368 if (optval == IPOPT_LSRR || optval == IPOPT_SSRR) { 1369 /* 1370 * user process specifies route as: 1371 * ->A->B->C->D 1372 * D must be our final destination (but we can't 1373 * check that since we may not have connected yet). 1374 * A is first hop destination, which doesn't appear in 1375 * actual IP option, but is stored before the options. 1376 */ 1377 if (olen < IPOPT_OFFSET + 1 + sizeof(struct in_addr)) 1378 goto bad; 1379 1380 offset = cp[IPOPT_OFFSET]; 1381 memcpy(mtod(m, u_char *), cp + IPOPT_OFFSET + 1, 1382 sizeof(struct in_addr)); 1383 1384 cp += sizeof(struct in_addr); 1385 cnt -= sizeof(struct in_addr); 1386 olen -= sizeof(struct in_addr); 1387 1388 if (m->m_len + olen > MAX_IPOPTLEN + sizeof(struct in_addr)) 1389 goto bad; 1390 1391 memcpy(dp, cp, olen); 1392 dp[IPOPT_OPTVAL] = optval; 1393 dp[IPOPT_OLEN] = olen; 1394 dp[IPOPT_OFFSET] = offset; 1395 break; 1396 } else { 1397 if (m->m_len + olen > MAX_IPOPTLEN + sizeof(struct in_addr)) 1398 goto bad; 1399 1400 memcpy(dp, cp, olen); 1401 break; 1402 } 1403 1404 dp += olen; 1405 m->m_len += olen; 1406 1407 if (optval == IPOPT_EOL) 1408 break; 1409 1410 cp += olen; 1411 cnt -= olen; 1412 } 1413 1414 inp->inp_options = m; 1415 return 0; 1416 bad: 1417 (void)m_free(m); 1418 return EINVAL; 1419 } 1420 1421 /* 1422 * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index. 
1423 */ 1424 static struct ifnet * 1425 ip_multicast_if(struct in_addr *a, int *ifindexp) 1426 { 1427 int ifindex; 1428 struct ifnet *ifp = NULL; 1429 struct in_ifaddr *ia; 1430 1431 if (ifindexp) 1432 *ifindexp = 0; 1433 if (ntohl(a->s_addr) >> 24 == 0) { 1434 ifindex = ntohl(a->s_addr) & 0xffffff; 1435 ifp = if_byindex(ifindex); 1436 if (!ifp) 1437 return NULL; 1438 if (ifindexp) 1439 *ifindexp = ifindex; 1440 } else { 1441 LIST_FOREACH(ia, &IN_IFADDR_HASH(a->s_addr), ia_hash) { 1442 if (in_hosteq(ia->ia_addr.sin_addr, *a) && 1443 (ia->ia_ifp->if_flags & IFF_MULTICAST) != 0) { 1444 ifp = ia->ia_ifp; 1445 break; 1446 } 1447 } 1448 } 1449 return ifp; 1450 } 1451 1452 static int 1453 ip_getoptval(const struct sockopt *sopt, u_int8_t *val, u_int maxval) 1454 { 1455 u_int tval; 1456 u_char cval; 1457 int error; 1458 1459 if (sopt == NULL) 1460 return EINVAL; 1461 1462 switch (sopt->sopt_size) { 1463 case sizeof(u_char): 1464 error = sockopt_get(sopt, &cval, sizeof(u_char)); 1465 tval = cval; 1466 break; 1467 1468 case sizeof(u_int): 1469 error = sockopt_get(sopt, &tval, sizeof(u_int)); 1470 break; 1471 1472 default: 1473 error = EINVAL; 1474 } 1475 1476 if (error) 1477 return error; 1478 1479 if (tval > maxval) 1480 return EINVAL; 1481 1482 *val = tval; 1483 return 0; 1484 } 1485 1486 static int 1487 ip_get_membership(const struct sockopt *sopt, struct ifnet **ifp, 1488 struct in_addr *ia, bool add) 1489 { 1490 int error; 1491 struct ip_mreq mreq; 1492 1493 error = sockopt_get(sopt, &mreq, sizeof(mreq)); 1494 if (error) 1495 return error; 1496 1497 if (!IN_MULTICAST(mreq.imr_multiaddr.s_addr)) 1498 return EINVAL; 1499 1500 memcpy(ia, &mreq.imr_multiaddr, sizeof(*ia)); 1501 1502 if (in_nullhost(mreq.imr_interface)) { 1503 union { 1504 struct sockaddr dst; 1505 struct sockaddr_in dst4; 1506 } u; 1507 struct route ro; 1508 1509 if (!add) { 1510 *ifp = NULL; 1511 return 0; 1512 } 1513 /* 1514 * If no interface address was provided, use the interface of 1515 * the route to 
the given multicast address. 1516 */ 1517 struct rtentry *rt; 1518 memset(&ro, 0, sizeof(ro)); 1519 1520 sockaddr_in_init(&u.dst4, ia, 0); 1521 error = rtcache_setdst(&ro, &u.dst); 1522 if (error != 0) 1523 return error; 1524 *ifp = (rt = rtcache_init(&ro)) != NULL ? rt->rt_ifp : NULL; 1525 rtcache_free(&ro); 1526 } else { 1527 *ifp = ip_multicast_if(&mreq.imr_interface, NULL); 1528 if (!add && *ifp == NULL) 1529 return EADDRNOTAVAIL; 1530 } 1531 return 0; 1532 } 1533 1534 /* 1535 * Add a multicast group membership. 1536 * Group must be a valid IP multicast address. 1537 */ 1538 static int 1539 ip_add_membership(struct ip_moptions *imo, const struct sockopt *sopt) 1540 { 1541 struct ifnet *ifp = NULL; // XXX: gcc [ppc] 1542 struct in_addr ia; 1543 int i, error; 1544 1545 if (sopt->sopt_size == sizeof(struct ip_mreq)) 1546 error = ip_get_membership(sopt, &ifp, &ia, true); 1547 else 1548 #ifdef INET6 1549 error = ip6_get_membership(sopt, &ifp, &ia, sizeof(ia)); 1550 #else 1551 return EINVAL; 1552 #endif 1553 1554 if (error) 1555 return error; 1556 1557 /* 1558 * See if we found an interface, and confirm that it 1559 * supports multicast. 1560 */ 1561 if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) 1562 return EADDRNOTAVAIL; 1563 1564 /* 1565 * See if the membership already exists or if all the 1566 * membership slots are full. 1567 */ 1568 for (i = 0; i < imo->imo_num_memberships; ++i) { 1569 if (imo->imo_membership[i]->inm_ifp == ifp && 1570 in_hosteq(imo->imo_membership[i]->inm_addr, ia)) 1571 break; 1572 } 1573 if (i < imo->imo_num_memberships) 1574 return EADDRINUSE; 1575 1576 if (i == IP_MAX_MEMBERSHIPS) 1577 return ETOOMANYREFS; 1578 1579 /* 1580 * Everything looks good; add a new record to the multicast 1581 * address list for the given interface. 
1582 */ 1583 if ((imo->imo_membership[i] = in_addmulti(&ia, ifp)) == NULL) 1584 return ENOBUFS; 1585 1586 ++imo->imo_num_memberships; 1587 return 0; 1588 } 1589 1590 /* 1591 * Drop a multicast group membership. 1592 * Group must be a valid IP multicast address. 1593 */ 1594 static int 1595 ip_drop_membership(struct ip_moptions *imo, const struct sockopt *sopt) 1596 { 1597 struct in_addr ia = { .s_addr = 0 }; // XXX: gcc [ppc] 1598 struct ifnet *ifp = NULL; // XXX: gcc [ppc] 1599 int i, error; 1600 1601 if (sopt->sopt_size == sizeof(struct ip_mreq)) 1602 error = ip_get_membership(sopt, &ifp, &ia, false); 1603 else 1604 #ifdef INET6 1605 error = ip6_get_membership(sopt, &ifp, &ia, sizeof(ia)); 1606 #else 1607 return EINVAL; 1608 #endif 1609 1610 if (error) 1611 return error; 1612 1613 /* 1614 * Find the membership in the membership array. 1615 */ 1616 for (i = 0; i < imo->imo_num_memberships; ++i) { 1617 if ((ifp == NULL || 1618 imo->imo_membership[i]->inm_ifp == ifp) && 1619 in_hosteq(imo->imo_membership[i]->inm_addr, ia)) 1620 break; 1621 } 1622 if (i == imo->imo_num_memberships) 1623 return EADDRNOTAVAIL; 1624 1625 /* 1626 * Give up the multicast address record to which the 1627 * membership points. 1628 */ 1629 in_delmulti(imo->imo_membership[i]); 1630 1631 /* 1632 * Remove the gap in the membership array. 1633 */ 1634 for (++i; i < imo->imo_num_memberships; ++i) 1635 imo->imo_membership[i-1] = imo->imo_membership[i]; 1636 --imo->imo_num_memberships; 1637 return 0; 1638 } 1639 1640 /* 1641 * Set the IP multicast options in response to user setsockopt(). 1642 */ 1643 int 1644 ip_setmoptions(struct ip_moptions **pimo, const struct sockopt *sopt) 1645 { 1646 struct ip_moptions *imo = *pimo; 1647 struct in_addr addr; 1648 struct ifnet *ifp; 1649 int ifindex, error = 0; 1650 1651 if (!imo) { 1652 /* 1653 * No multicast option buffer attached to the pcb; 1654 * allocate one and initialize to default values. 
1655 */ 1656 imo = kmem_intr_alloc(sizeof(*imo), KM_NOSLEEP); 1657 if (imo == NULL) 1658 return ENOBUFS; 1659 1660 imo->imo_multicast_if_index = 0; 1661 imo->imo_multicast_addr.s_addr = INADDR_ANY; 1662 imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; 1663 imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; 1664 imo->imo_num_memberships = 0; 1665 *pimo = imo; 1666 } 1667 1668 switch (sopt->sopt_name) { 1669 case IP_MULTICAST_IF: 1670 /* 1671 * Select the interface for outgoing multicast packets. 1672 */ 1673 error = sockopt_get(sopt, &addr, sizeof(addr)); 1674 if (error) 1675 break; 1676 1677 /* 1678 * INADDR_ANY is used to remove a previous selection. 1679 * When no interface is selected, a default one is 1680 * chosen every time a multicast packet is sent. 1681 */ 1682 if (in_nullhost(addr)) { 1683 imo->imo_multicast_if_index = 0; 1684 break; 1685 } 1686 /* 1687 * The selected interface is identified by its local 1688 * IP address. Find the interface and confirm that 1689 * it supports multicasting. 1690 */ 1691 ifp = ip_multicast_if(&addr, &ifindex); 1692 if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { 1693 error = EADDRNOTAVAIL; 1694 break; 1695 } 1696 imo->imo_multicast_if_index = ifp->if_index; 1697 if (ifindex) 1698 imo->imo_multicast_addr = addr; 1699 else 1700 imo->imo_multicast_addr.s_addr = INADDR_ANY; 1701 break; 1702 1703 case IP_MULTICAST_TTL: 1704 /* 1705 * Set the IP time-to-live for outgoing multicast packets. 1706 */ 1707 error = ip_getoptval(sopt, &imo->imo_multicast_ttl, MAXTTL); 1708 break; 1709 1710 case IP_MULTICAST_LOOP: 1711 /* 1712 * Set the loopback flag for outgoing multicast packets. 1713 * Must be zero or one. 
1714 */ 1715 error = ip_getoptval(sopt, &imo->imo_multicast_loop, 1); 1716 break; 1717 1718 case IP_ADD_MEMBERSHIP: /* IPV6_JOIN_GROUP */ 1719 error = ip_add_membership(imo, sopt); 1720 break; 1721 1722 case IP_DROP_MEMBERSHIP: /* IPV6_LEAVE_GROUP */ 1723 error = ip_drop_membership(imo, sopt); 1724 break; 1725 1726 default: 1727 error = EOPNOTSUPP; 1728 break; 1729 } 1730 1731 /* 1732 * If all options have default values, no need to keep the mbuf. 1733 */ 1734 if (imo->imo_multicast_if_index == 0 && 1735 imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL && 1736 imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP && 1737 imo->imo_num_memberships == 0) { 1738 kmem_free(imo, sizeof(*imo)); 1739 *pimo = NULL; 1740 } 1741 1742 return error; 1743 } 1744 1745 /* 1746 * Return the IP multicast options in response to user getsockopt(). 1747 */ 1748 int 1749 ip_getmoptions(struct ip_moptions *imo, struct sockopt *sopt) 1750 { 1751 struct in_addr addr; 1752 uint8_t optval; 1753 int error = 0; 1754 1755 switch (sopt->sopt_name) { 1756 case IP_MULTICAST_IF: 1757 if (imo == NULL || imo->imo_multicast_if_index == 0) 1758 addr = zeroin_addr; 1759 else if (imo->imo_multicast_addr.s_addr) { 1760 /* return the value user has set */ 1761 addr = imo->imo_multicast_addr; 1762 } else { 1763 struct ifnet *ifp; 1764 struct in_ifaddr *ia = NULL; 1765 int s = pserialize_read_enter(); 1766 1767 ifp = if_byindex(imo->imo_multicast_if_index); 1768 if (ifp != NULL) { 1769 ia = in_get_ia_from_ifp(ifp); 1770 } 1771 addr = ia ? ia->ia_addr.sin_addr : zeroin_addr; 1772 pserialize_read_exit(s); 1773 } 1774 error = sockopt_set(sopt, &addr, sizeof(addr)); 1775 break; 1776 1777 case IP_MULTICAST_TTL: 1778 optval = imo ? imo->imo_multicast_ttl 1779 : IP_DEFAULT_MULTICAST_TTL; 1780 1781 error = sockopt_set(sopt, &optval, sizeof(optval)); 1782 break; 1783 1784 case IP_MULTICAST_LOOP: 1785 optval = imo ? 
imo->imo_multicast_loop 1786 : IP_DEFAULT_MULTICAST_LOOP; 1787 1788 error = sockopt_set(sopt, &optval, sizeof(optval)); 1789 break; 1790 1791 default: 1792 error = EOPNOTSUPP; 1793 } 1794 1795 return error; 1796 } 1797 1798 /* 1799 * Discard the IP multicast options. 1800 */ 1801 void 1802 ip_freemoptions(struct ip_moptions *imo) 1803 { 1804 int i; 1805 1806 if (imo != NULL) { 1807 for (i = 0; i < imo->imo_num_memberships; ++i) 1808 in_delmulti(imo->imo_membership[i]); 1809 kmem_free(imo, sizeof(*imo)); 1810 } 1811 } 1812 1813 /* 1814 * Routine called from ip_output() to loop back a copy of an IP multicast 1815 * packet to the input queue of a specified interface. Note that this 1816 * calls the output routine of the loopback "driver", but with an interface 1817 * pointer that might NOT be lo0ifp -- easier than replicating that code here. 1818 */ 1819 static void 1820 ip_mloopback(struct ifnet *ifp, struct mbuf *m, const struct sockaddr_in *dst) 1821 { 1822 struct ip *ip; 1823 struct mbuf *copym; 1824 1825 copym = m_copypacket(m, M_DONTWAIT); 1826 if (copym != NULL && 1827 (copym->m_flags & M_EXT || copym->m_len < sizeof(struct ip))) 1828 copym = m_pullup(copym, sizeof(struct ip)); 1829 if (copym == NULL) 1830 return; 1831 /* 1832 * We don't bother to fragment if the IP length is greater 1833 * than the interface's MTU. Can this possibly matter? 1834 */ 1835 ip = mtod(copym, struct ip *); 1836 1837 if (copym->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) { 1838 in_delayed_cksum(copym); 1839 copym->m_pkthdr.csum_flags &= 1840 ~(M_CSUM_TCPv4|M_CSUM_UDPv4); 1841 } 1842 1843 ip->ip_sum = 0; 1844 ip->ip_sum = in_cksum(copym, ip->ip_hl << 2); 1845 #ifndef NET_MPSAFE 1846 KERNEL_LOCK(1, NULL); 1847 #endif 1848 (void)looutput(ifp, copym, sintocsa(dst), NULL); 1849 #ifndef NET_MPSAFE 1850 KERNEL_UNLOCK_ONE(NULL); 1851 #endif 1852 } 1853