1 /* $NetBSD: ip_output.c,v 1.217 2012/06/25 15:28:39 christos Exp $ */ 2 3 /* 4 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. Neither the name of the project nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32 /*- 33 * Copyright (c) 1998 The NetBSD Foundation, Inc. 34 * All rights reserved. 35 * 36 * This code is derived from software contributed to The NetBSD Foundation 37 * by Public Access Networks Corporation ("Panix"). It was developed under 38 * contract to Panix by Eric Haszlakiewicz and Thor Lancelot Simon. 
39 * 40 * Redistribution and use in source and binary forms, with or without 41 * modification, are permitted provided that the following conditions 42 * are met: 43 * 1. Redistributions of source code must retain the above copyright 44 * notice, this list of conditions and the following disclaimer. 45 * 2. Redistributions in binary form must reproduce the above copyright 46 * notice, this list of conditions and the following disclaimer in the 47 * documentation and/or other materials provided with the distribution. 48 * 49 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 50 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 51 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 52 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 53 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 54 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 55 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 56 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 57 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 58 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 59 * POSSIBILITY OF SUCH DAMAGE. 60 */ 61 62 /* 63 * Copyright (c) 1982, 1986, 1988, 1990, 1993 64 * The Regents of the University of California. All rights reserved. 65 * 66 * Redistribution and use in source and binary forms, with or without 67 * modification, are permitted provided that the following conditions 68 * are met: 69 * 1. Redistributions of source code must retain the above copyright 70 * notice, this list of conditions and the following disclaimer. 71 * 2. Redistributions in binary form must reproduce the above copyright 72 * notice, this list of conditions and the following disclaimer in the 73 * documentation and/or other materials provided with the distribution. 
74 * 3. Neither the name of the University nor the names of its contributors 75 * may be used to endorse or promote products derived from this software 76 * without specific prior written permission. 77 * 78 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 79 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 80 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 81 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 82 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 83 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 84 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 85 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 86 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 87 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 88 * SUCH DAMAGE. 89 * 90 * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 91 */ 92 93 #include <sys/cdefs.h> 94 __KERNEL_RCSID(0, "$NetBSD: ip_output.c,v 1.217 2012/06/25 15:28:39 christos Exp $"); 95 96 #include "opt_pfil_hooks.h" 97 #include "opt_inet.h" 98 #include "opt_ipsec.h" 99 #include "opt_mrouting.h" 100 101 #include <sys/param.h> 102 #include <sys/malloc.h> 103 #include <sys/kmem.h> 104 #include <sys/mbuf.h> 105 #include <sys/errno.h> 106 #include <sys/protosw.h> 107 #include <sys/socket.h> 108 #include <sys/socketvar.h> 109 #include <sys/kauth.h> 110 #ifdef FAST_IPSEC 111 #include <sys/domain.h> 112 #endif 113 #include <sys/systm.h> 114 #include <sys/proc.h> 115 116 #include <net/if.h> 117 #include <net/route.h> 118 #include <net/pfil.h> 119 120 #include <netinet/in.h> 121 #include <netinet/in_systm.h> 122 #include <netinet/ip.h> 123 #include <netinet/in_pcb.h> 124 #include <netinet/in_var.h> 125 #include <netinet/ip_var.h> 126 #include <netinet/ip_private.h> 127 #include <netinet/in_offload.h> 128 
#include <netinet/portalgo.h>

#ifdef MROUTING
#include <netinet/ip_mroute.h>
#endif

#ifdef FAST_IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/key.h>
#include <netipsec/xform.h>
#endif	/* FAST_IPSEC*/

#ifdef IPSEC_NAT_T
#include <netinet/udp.h>
#endif

static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *);
static struct ifnet *ip_multicast_if(struct in_addr *, int *);
static void ip_mloopback(struct ifnet *, struct mbuf *,
    const struct sockaddr_in *);

#ifdef PFIL_HOOKS
extern struct pfil_head inet_pfil_hook;			/* XXX */
#endif

int	ip_do_loopback_cksum = 0;

/*
 * IP output.  The packet in mbuf chain m contains a skeletal IP
 * header (with len, off, ttl, proto, tos, src, dst).
 * The mbuf chain containing the packet will be freed.
 * The mbuf opt, if present, will not be freed.
 *
 * The variable arguments are read in this order:
 *	struct mbuf *opt	IP options to insert, or NULL
 *	struct route *ro	route cache to use; a local one is used
 *				when NULL
 *	int flags		IP_* output flags
 *	struct ip_moptions *imo	multicast options, or NULL
 *	struct socket *so	sending socket, or NULL
 *	int *mtu_p		read only when IP_RETURNMTU is set in flags;
 *				receives the MTU when a packet with DF set
 *				does not fit
 *
 * Returns 0 on success, otherwise an error code.
 */
int
ip_output(struct mbuf *m0, ...)
{
	struct rtentry *rt;
	struct ip *ip;
	struct ifnet *ifp;
	struct mbuf *m = m0;
	int hlen = sizeof (struct ip);
	int len, error = 0;
	struct route iproute;
	const struct sockaddr_in *dst;
	struct in_ifaddr *ia;
	struct ifaddr *xifa;
	struct mbuf *opt;
	struct route *ro;
	int flags, sw_csum;
	int *mtu_p;
	u_long mtu;
	struct ip_moptions *imo;
	struct socket *so;
	va_list ap;
#ifdef IPSEC_NAT_T
	int natt_frag = 0;
#endif
#ifdef FAST_IPSEC
	struct inpcb *inp;
	struct secpolicy *sp = NULL;
	int s;
#endif
	u_int16_t ip_len;
	union {
		struct sockaddr		dst;
		struct sockaddr_in	dst4;
	} u;
	struct sockaddr *rdst = &u.dst;	/* real IP destination, as opposed
					 * to the nexthop
					 */

	len = 0;
	va_start(ap, m0);
	opt = va_arg(ap, struct mbuf *);
	ro = va_arg(ap, struct route *);
	flags = va_arg(ap, int);
	imo = va_arg(ap, struct ip_moptions *);
	so = va_arg(ap, struct socket *);
	if (flags & IP_RETURNMTU)
		mtu_p = va_arg(ap, int *);
	else
		mtu_p = NULL;
	va_end(ap);

	MCLAIM(m, &ip_tx_mowner);
#ifdef FAST_IPSEC
	if (so != NULL && so->so_proto->pr_domain->dom_family == AF_INET)
		inp = (struct inpcb *)so->so_pcb;
	else
		inp = NULL;
#endif /* FAST_IPSEC */

#ifdef	DIAGNOSTIC
	if ((m->m_flags & M_PKTHDR) == 0)
		panic("ip_output: no HDR");

	if ((m->m_pkthdr.csum_flags & (M_CSUM_TCPv6|M_CSUM_UDPv6)) != 0) {
		panic("ip_output: IPv6 checksum offload flags: %d",
		    m->m_pkthdr.csum_flags);
	}

	if ((m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) ==
	    (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
		panic("ip_output: conflicting checksum offload flags: %d",
		    m->m_pkthdr.csum_flags);
	}
#endif
	if (opt) {
		/* len stays 0 when the options could not be inserted. */
		m = ip_insertoptions(m, opt, &len);
		if (len >= sizeof(struct ip))
			hlen = len;
	}
	ip = mtod(m, struct ip *);
	/*
	 * Fill in IP header.
	 */
	if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
		ip->ip_v = IPVERSION;
		ip->ip_off = htons(0);
		/* ip->ip_id filled in after we find out source ia */
		ip->ip_hl = hlen >> 2;
		IP_STATINC(IP_STAT_LOCALOUT);
	} else {
		hlen = ip->ip_hl << 2;
	}
	/*
	 * Route packet.
	 */
	memset(&iproute, 0, sizeof(iproute));
	if (ro == NULL)
		ro = &iproute;
	sockaddr_in_init(&u.dst4, &ip->ip_dst, 0);
	dst = satocsin(rtcache_getdst(ro));
	/*
	 * If there is a cached route,
	 * check that it is to the same destination
	 * and is still up.  If not, free it and try again.
	 * The address family should also be checked in case of sharing the
	 * cache with IPv6.
	 */
	if (dst == NULL)
		;
	else if (dst->sin_family != AF_INET ||
		 !in_hosteq(dst->sin_addr, ip->ip_dst))
		rtcache_free(ro);

	if ((rt = rtcache_validate(ro)) == NULL &&
	    (rt = rtcache_update(ro, 1)) == NULL) {
		dst = &u.dst4;
		rtcache_setdst(ro, &u.dst);
	}
	/*
	 * If routing to interface only,
	 * short circuit routing lookup.
	 */
	if (flags & IP_ROUTETOIF) {
		if ((ia = ifatoia(ifa_ifwithladdr(sintocsa(dst)))) == NULL) {
			IP_STATINC(IP_STAT_NOROUTE);
			error = ENETUNREACH;
			goto bad;
		}
		ifp = ia->ia_ifp;
		mtu = ifp->if_mtu;
		ip->ip_ttl = 1;
	} else if ((IN_MULTICAST(ip->ip_dst.s_addr) ||
	    ip->ip_dst.s_addr == INADDR_BROADCAST) &&
	    imo != NULL && imo->imo_multicast_ifp != NULL) {
		ifp = imo->imo_multicast_ifp;
		mtu = ifp->if_mtu;
		IFP_TO_IA(ifp, ia);
	} else {
		if (rt == NULL)
			rt = rtcache_init(ro);
		if (rt == NULL) {
			IP_STATINC(IP_STAT_NOROUTE);
			error = EHOSTUNREACH;
			goto bad;
		}
		ia = ifatoia(rt->rt_ifa);
		ifp = rt->rt_ifp;
		if ((mtu = rt->rt_rmx.rmx_mtu) == 0)
			mtu = ifp->if_mtu;
		rt->rt_use++;
		if (rt->rt_flags & RTF_GATEWAY)
			dst = satosin(rt->rt_gateway);
	}
	if (IN_MULTICAST(ip->ip_dst.s_addr) ||
	    (ip->ip_dst.s_addr == INADDR_BROADCAST)) {
		struct in_multi *inm;

		m->m_flags |= (ip->ip_dst.s_addr == INADDR_BROADCAST) ?
			M_BCAST : M_MCAST;
		/*
		 * See if the caller provided any multicast options
		 */
		if (imo != NULL)
			ip->ip_ttl = imo->imo_multicast_ttl;
		else
			ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;

		/*
		 * if we don't know the outgoing ifp yet, we can't generate
		 * output
		 */
		if (!ifp) {
			IP_STATINC(IP_STAT_NOROUTE);
			error = ENETUNREACH;
			goto bad;
		}

		/*
		 * If the packet is multicast or broadcast, confirm that
		 * the outgoing interface can transmit it.
		 */
		if (((m->m_flags & M_MCAST) &&
		     (ifp->if_flags & IFF_MULTICAST) == 0) ||
		    ((m->m_flags & M_BCAST) &&
		     (ifp->if_flags & (IFF_BROADCAST|IFF_POINTOPOINT)) == 0)) {
			IP_STATINC(IP_STAT_NOROUTE);
			error = ENETUNREACH;
			goto bad;
		}
		/*
		 * If source address not specified yet, use an address
		 * of outgoing interface.
		 */
		if (in_nullhost(ip->ip_src)) {
			struct in_ifaddr *xia;

			IFP_TO_IA(ifp, xia);
			if (!xia) {
				error = EADDRNOTAVAIL;
				goto bad;
			}
			xifa = &xia->ia_ifa;
			if (xifa->ifa_getifa != NULL) {
				xia = ifatoia((*xifa->ifa_getifa)(xifa, rdst));
			}
			ip->ip_src = xia->ia_addr.sin_addr;
		}

		IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm);
		if (inm != NULL &&
		   (imo == NULL || imo->imo_multicast_loop)) {
			/*
			 * If we belong to the destination multicast group
			 * on the outgoing interface, and the caller did not
			 * forbid loopback, loop back a copy.
			 */
			ip_mloopback(ifp, m, &u.dst4);
		}
#ifdef MROUTING
		else {
			/*
			 * If we are acting as a multicast router, perform
			 * multicast forwarding as if the packet had just
			 * arrived on the interface to which we are about
			 * to send.  The multicast forwarding function
			 * recursively calls this function, using the
			 * IP_FORWARDING flag to prevent infinite recursion.
			 *
			 * Multicasts that are looped back by ip_mloopback(),
			 * above, will be forwarded by the ip_input() routine,
			 * if necessary.
			 */
			extern struct socket *ip_mrouter;

			if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
				if (ip_mforward(m, ifp) != 0) {
					m_freem(m);
					goto done;
				}
			}
		}
#endif
		/*
		 * Multicasts with a time-to-live of zero may be looped-
		 * back, above, but must not be transmitted on a network.
		 * Also, multicasts addressed to the loopback interface
		 * are not sent -- the above call to ip_mloopback() will
		 * loop back a copy if this host actually belongs to the
		 * destination group on the loopback interface.
		 */
		if (ip->ip_ttl == 0 || (ifp->if_flags & IFF_LOOPBACK) != 0) {
			m_freem(m);
			goto done;
		}

		goto sendit;
	}
	/*
	 * If source address not specified yet, use address
	 * of outgoing interface.
	 */
	if (in_nullhost(ip->ip_src)) {
		xifa = &ia->ia_ifa;
		if (xifa->ifa_getifa != NULL)
			ia = ifatoia((*xifa->ifa_getifa)(xifa, rdst));
		ip->ip_src = ia->ia_addr.sin_addr;
	}

	/*
	 * packets with Class-D address as source are not valid per
	 * RFC 1112
	 */
	if (IN_MULTICAST(ip->ip_src.s_addr)) {
		IP_STATINC(IP_STAT_ODROPPED);
		error = EADDRNOTAVAIL;
		goto bad;
	}

	/*
	 * Look for broadcast address
	 * and verify user is allowed to send
	 * such a packet.
	 */
	if (in_broadcast(dst->sin_addr, ifp)) {
		if ((ifp->if_flags & IFF_BROADCAST) == 0) {
			error = EADDRNOTAVAIL;
			goto bad;
		}
		if ((flags & IP_ALLOWBROADCAST) == 0) {
			error = EACCES;
			goto bad;
		}
		/* don't allow broadcast messages to be fragmented */
		if (ntohs(ip->ip_len) > ifp->if_mtu) {
			error = EMSGSIZE;
			goto bad;
		}
		m->m_flags |= M_BCAST;
	} else
		m->m_flags &= ~M_BCAST;

sendit:
	if ((flags & (IP_FORWARDING|IP_NOIPNEWID)) == 0) {
		if (m->m_pkthdr.len < IP_MINFRAGSIZE) {
			ip->ip_id = 0;
		} else if ((m->m_pkthdr.csum_flags & M_CSUM_TSOv4) == 0) {
			ip->ip_id = ip_newid(ia);
		} else {

			/*
			 * TSO capable interfaces (typically?) increment
			 * ip_id for each segment.
			 * "allocate" enough ids here to increase the chance
			 * for them to be unique.
			 *
			 * note that the following calculation is not
			 * needed to be precise.  wasting some ip_id is fine.
			 */

			unsigned int segsz = m->m_pkthdr.segsz;
			unsigned int datasz = ntohs(ip->ip_len) - hlen;
			unsigned int num = howmany(datasz, segsz);

			ip->ip_id = ip_newid_range(ia, num);
		}
	}
	/*
	 * If we're doing Path MTU Discovery, we need to set DF unless
	 * the route's MTU is locked.
	 */
	if ((flags & IP_MTUDISC) != 0 && rt != NULL &&
	    (rt->rt_rmx.rmx_locks & RTV_MTU) == 0)
		ip->ip_off |= htons(IP_DF);

	/* Remember the current ip_len */
	ip_len = ntohs(ip->ip_len);

#ifdef FAST_IPSEC
	/*
	 * Check the security policy (SP) for the packet and, if
	 * required, do IPsec-related processing.  There are two
	 * cases here; the first time a packet is sent through
	 * it will be untagged and handled by ipsec4_checkpolicy.
	 * If the packet is resubmitted to ip_output (e.g. after
	 * AH, ESP, etc. processing), there will be a tag to bypass
	 * the lookup and related policy checking.
	 */
	if (!ipsec_outdone(m)) {
		s = splsoftnet();
		if (inp != NULL &&
		    IPSEC_PCB_SKIP_IPSEC(inp->inp_sp, IPSEC_DIR_OUTBOUND)) {
			splx(s);
			goto spd_done;
		}
		sp = ipsec4_checkpolicy(m, IPSEC_DIR_OUTBOUND, flags,
				&error, inp);
		/*
		 * There are four return cases:
		 *    sp != NULL		    apply IPsec policy
		 *    sp == NULL, error == 0	    no IPsec handling needed
		 *    sp == NULL, error == -EINVAL  discard packet w/o error
		 *    sp == NULL, error != 0	    discard packet, report error
		 */
		if (sp != NULL) {
#ifdef IPSEC_NAT_T
			/*
			 * NAT-T ESP fragmentation: don't do IPSec processing now,
			 * we'll do it on each fragmented packet.
			 */
			if (sp->req->sav &&
			    ((sp->req->sav->natt_type & UDP_ENCAP_ESPINUDP) ||
			     (sp->req->sav->natt_type & UDP_ENCAP_ESPINUDP_NON_IKE))) {
				if (ntohs(ip->ip_len) > sp->req->sav->esp_frag) {
					natt_frag = 1;
					mtu = sp->req->sav->esp_frag;
					splx(s);
					goto spd_done;
				}
			}
#endif /* IPSEC_NAT_T */

			/*
			 * Do delayed checksums now because we send before
			 * this is done in the normal processing path.
			 */
			if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
				in_delayed_cksum(m);
				m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
			}

#ifdef __FreeBSD__
			ip->ip_len = htons(ip->ip_len);
			ip->ip_off = htons(ip->ip_off);
#endif

			/* NB: callee frees mbuf */
			error = ipsec4_process_packet(m, sp->req, flags, 0);
			/*
			 * Preserve KAME behaviour: ENOENT can be returned
			 * when an SA acquire is in progress.  Don't propagate
			 * this to user-level; it confuses applications.
			 *
			 * XXX this will go away when the SADB is redone.
			 */
			if (error == ENOENT)
				error = 0;
			splx(s);
			goto done;
		} else {
			splx(s);

			if (error != 0) {
				/*
				 * Hack: -EINVAL is used to signal that a packet
				 * should be silently discarded.  This is typically
				 * because we asked key management for an SA and
				 * it was delayed (e.g. kicked up to IKE).
				 */
				if (error == -EINVAL)
					error = 0;
				goto bad;
			} else {
				/* No IPsec processing for this packet. */
			}
		}
	}
spd_done:
#endif /* FAST_IPSEC */

#ifdef PFIL_HOOKS
	/*
	 * Run through list of hooks for output packets.
	 */
	if ((error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT)) != 0)
		goto done;
	if (m == NULL)
		goto done;

	/* The hooks may have replaced or modified the packet; reload. */
	ip = mtod(m, struct ip *);
	hlen = ip->ip_hl << 2;
	ip_len = ntohs(ip->ip_len);
#endif /* PFIL_HOOKS */

	m->m_pkthdr.csum_data |= hlen << 16;

#if IFA_STATS
	/*
	 * search for the source address structure to
	 * maintain output statistics.
	 */
	INADDR_TO_IA(ip->ip_src, ia);
#endif

	/* Maybe skip checksums on loopback interfaces. */
	if (IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4)) {
		m->m_pkthdr.csum_flags |= M_CSUM_IPv4;
	}
	/* Checksums requested for the packet that the interface won't do. */
	sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_csum_flags_tx;
	/*
	 * If small enough for mtu of path, or if using TCP segmentation
	 * offload, can just send directly.
	 */
	if (ip_len <= mtu ||
	    (m->m_pkthdr.csum_flags & M_CSUM_TSOv4) != 0) {
#if IFA_STATS
		if (ia)
			ia->ia_ifa.ifa_data.ifad_outbytes += ip_len;
#endif
		/*
		 * Always initialize the sum to 0!  Some HW assisted
		 * checksumming requires this.
		 */
		ip->ip_sum = 0;

		if ((m->m_pkthdr.csum_flags & M_CSUM_TSOv4) == 0) {
			/*
			 * Perform any checksums that the hardware can't do
			 * for us.
			 *
			 * XXX Does any hardware require the {th,uh}_sum
			 * XXX fields to be 0?
			 */
			if (sw_csum & M_CSUM_IPv4) {
				KASSERT(IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4));
				ip->ip_sum = in_cksum(m, hlen);
				m->m_pkthdr.csum_flags &= ~M_CSUM_IPv4;
			}
			if (sw_csum & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
				if (IN_NEED_CHECKSUM(ifp,
				    sw_csum & (M_CSUM_TCPv4|M_CSUM_UDPv4))) {
					in_delayed_cksum(m);
				}
				m->m_pkthdr.csum_flags &=
				    ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
			}
		}

		if (__predict_true(
		    (m->m_pkthdr.csum_flags & M_CSUM_TSOv4) == 0 ||
		    (ifp->if_capenable & IFCAP_TSOv4) != 0)) {
			KERNEL_LOCK(1, NULL);
			error =
			    (*ifp->if_output)(ifp, m,
				(m->m_flags & M_MCAST) ?
				    sintocsa(rdst) : sintocsa(dst),
				rt);
			KERNEL_UNLOCK_ONE(NULL);
		} else {
			error =
			    ip_tso_output(ifp, m,
				(m->m_flags & M_MCAST) ?
				    sintocsa(rdst) : sintocsa(dst),
				rt);
		}
		goto done;
	}

	/*
	 * We can't use HW checksumming if we're about
	 * to fragment the packet.
	 *
	 * XXX Some hardware can do this.
	 */
	if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
		if (IN_NEED_CHECKSUM(ifp,
		    m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4))) {
			in_delayed_cksum(m);
		}
		m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
	}

	/*
	 * Too large for interface; fragment if possible.
	 * Must be able to put at least 8 bytes per fragment.
	 */
	if (ntohs(ip->ip_off) & IP_DF) {
		if (flags & IP_RETURNMTU)
			*mtu_p = mtu;
		error = EMSGSIZE;
		IP_STATINC(IP_STAT_CANTFRAG);
		goto bad;
	}

	error = ip_fragment(m, ifp, mtu);
	if (error) {
		/* ip_fragment() already freed the whole chain. */
		m = NULL;
		goto bad;
	}

	for (; m; m = m0) {
		m0 = m->m_nextpkt;
		m->m_nextpkt = 0;
		if (error == 0) {
#if IFA_STATS
			if (ia)
				ia->ia_ifa.ifa_data.ifad_outbytes +=
				    ntohs(ip->ip_len);
#endif
#ifdef IPSEC_NAT_T
			/*
			 * If we get there, the packet has not been handled by
			 * IPSec whereas it should have. Now that it has been
			 * fragmented, re-inject it in ip_output so that IPsec
			 * processing can occur.
			 */
			if (natt_frag) {
				error = ip_output(m, opt,
				    ro, flags | IP_RAWOUTPUT | IP_NOIPNEWID, imo, so, mtu_p);
			} else
#endif /* IPSEC_NAT_T */
			{
				KASSERT((m->m_pkthdr.csum_flags &
				    (M_CSUM_UDPv4 | M_CSUM_TCPv4)) == 0);
				KERNEL_LOCK(1, NULL);
				error = (*ifp->if_output)(ifp, m,
				    (m->m_flags & M_MCAST) ?
					sintocsa(rdst) : sintocsa(dst),
				    rt);
				KERNEL_UNLOCK_ONE(NULL);
			}
		} else
			m_freem(m);	/* an earlier fragment failed */
	}

	if (error == 0)
		IP_STATINC(IP_STAT_FRAGMENTED);
done:
	/* Only the local route cache is released; a caller's ro is kept. */
	rtcache_free(&iproute);

#ifdef FAST_IPSEC
	if (sp != NULL)
		KEY_FREESP(&sp);
#endif /* FAST_IPSEC */

	return (error);
bad:
	m_freem(m);
	goto done;
}

/*
 * Split the IP packet in m into fragments that fit within mtu.
 *
 * On success (return 0) m holds the first fragment and the remaining
 * fragments are linked to it, in order, through m_nextpkt.
 *
 * Returns EMSGSIZE when fewer than 8 bytes of payload would fit in a
 * fragment, and ENOBUFS when an mbuf cannot be allocated or, with a
 * non-NULL ifp, when the interface send queue has no room for all the
 * fragments.  On any error every mbuf, including the original packet,
 * has been freed.
 *
 * ifp may be NULL; it is used only to decide which checksums must be
 * computed in software and to check the send queue.
 */
int
ip_fragment(struct mbuf *m, struct ifnet *ifp, u_long mtu)
{
	struct ip *ip, *mhip;
	struct mbuf *m0;
	int len, hlen, off;
	int mhlen, firstlen;
	struct mbuf **mnext;
	int sw_csum = m->m_pkthdr.csum_flags;
	int fragments = 0;
	int s;
	int error = 0;

	ip = mtod(m, struct ip *);
	hlen = ip->ip_hl << 2;
	if (ifp != NULL)
		sw_csum &= ~ifp->if_csum_flags_tx;

	/* Payload bytes per fragment, rounded down to a multiple of 8. */
	len = (mtu - hlen) &~ 7;
	if (len < 8) {
		m_freem(m);
		return (EMSGSIZE);
	}

	firstlen = len;
	mnext = &m->m_nextpkt;

	/*
	 * Loop through length of segment after first fragment,
	 * make new header and copy data of each part and link onto chain.
	 */
	m0 = m;
	mhlen = sizeof (struct ip);
	for (off = hlen + len; off < ntohs(ip->ip_len); off += len) {
		MGETHDR(m, M_DONTWAIT, MT_HEADER);
		if (m == 0) {
			error = ENOBUFS;
			IP_STATINC(IP_STAT_ODROPPED);
			goto sendorfree;
		}
		MCLAIM(m, m0->m_owner);
		*mnext = m;
		mnext = &m->m_nextpkt;
		m->m_data += max_linkhdr;
		mhip = mtod(m, struct ip *);
		*mhip = *ip;
		/* we must inherit MCAST and BCAST flags */
		m->m_flags |= m0->m_flags & (M_MCAST|M_BCAST);
		if (hlen > sizeof (struct ip)) {
			mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
			mhip->ip_hl = mhlen >> 2;
		}
		m->m_len = mhlen;
		mhip->ip_off = ((off - hlen) >> 3) +
		    (ntohs(ip->ip_off) & ~IP_MF);
		if (ip->ip_off & htons(IP_MF))
			mhip->ip_off |= IP_MF;
		if (off + len >= ntohs(ip->ip_len))
			len = ntohs(ip->ip_len) - off;
		else
			mhip->ip_off |= IP_MF;
		HTONS(mhip->ip_off);
		mhip->ip_len = htons((u_int16_t)(len + mhlen));
		m->m_next = m_copym(m0, off, len, M_DONTWAIT);
		if (m->m_next == 0) {
			error = ENOBUFS;	/* ??? */
			IP_STATINC(IP_STAT_ODROPPED);
			goto sendorfree;
		}
		m->m_pkthdr.len = mhlen + len;
		m->m_pkthdr.rcvif = NULL;
		mhip->ip_sum = 0;
		KASSERT((m->m_pkthdr.csum_flags & M_CSUM_IPv4) == 0);
		if (sw_csum & M_CSUM_IPv4) {
			mhip->ip_sum = in_cksum(m, mhlen);
		} else {
			/*
			 * checksum is hw-offloaded or not necessary.
			 */
			m->m_pkthdr.csum_flags |=
			    m0->m_pkthdr.csum_flags & M_CSUM_IPv4;
			m->m_pkthdr.csum_data |= mhlen << 16;
			KASSERT(!(ifp != NULL &&
			    IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4))
			    || (m->m_pkthdr.csum_flags & M_CSUM_IPv4) != 0);
		}
		IP_STATINC(IP_STAT_OFRAGMENTS);
		fragments++;
	}
	/*
	 * Update first fragment by trimming what's been copied out
	 * and updating header, then send each fragment (in order).
	 */
	m = m0;
	m_adj(m, hlen + firstlen - ntohs(ip->ip_len));
	m->m_pkthdr.len = hlen + firstlen;
	ip->ip_len = htons((u_int16_t)m->m_pkthdr.len);
	ip->ip_off |= htons(IP_MF);
	ip->ip_sum = 0;
	if (sw_csum & M_CSUM_IPv4) {
		ip->ip_sum = in_cksum(m, hlen);
		m->m_pkthdr.csum_flags &= ~M_CSUM_IPv4;
	} else {
		/*
		 * checksum is hw-offloaded or not necessary.
		 */
		KASSERT(!(ifp != NULL && IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4))
		    || (m->m_pkthdr.csum_flags & M_CSUM_IPv4) != 0);
		KASSERT(M_CSUM_DATA_IPv4_IPHL(m->m_pkthdr.csum_data) >=
		    sizeof(struct ip));
	}
sendorfree:
	/*
	 * If there is no room for all the fragments, don't queue
	 * any of them.
	 */
	if (ifp != NULL) {
		s = splnet();
		if (ifp->if_snd.ifq_maxlen - ifp->if_snd.ifq_len < fragments &&
		    error == 0) {
			error = ENOBUFS;
			IP_STATINC(IP_STAT_ODROPPED);
			IFQ_INC_DROPS(&ifp->if_snd);
		}
		splx(s);
	}
	if (error) {
		/* Free the original packet and every fragment built so far. */
		for (m = m0; m; m = m0) {
			m0 = m->m_nextpkt;
			m->m_nextpkt = NULL;
			m_freem(m);
		}
	}
	return (error);
}

/*
 * Process a delayed payload checksum calculation.
899 */ 900 void 901 in_delayed_cksum(struct mbuf *m) 902 { 903 struct ip *ip; 904 u_int16_t csum, offset; 905 906 ip = mtod(m, struct ip *); 907 offset = ip->ip_hl << 2; 908 csum = in4_cksum(m, 0, offset, ntohs(ip->ip_len) - offset); 909 if (csum == 0 && (m->m_pkthdr.csum_flags & M_CSUM_UDPv4) != 0) 910 csum = 0xffff; 911 912 offset += M_CSUM_DATA_IPv4_OFFSET(m->m_pkthdr.csum_data); 913 914 if ((offset + sizeof(u_int16_t)) > m->m_len) { 915 /* This happen when ip options were inserted 916 printf("in_delayed_cksum: pullup len %d off %d proto %d\n", 917 m->m_len, offset, ip->ip_p); 918 */ 919 m_copyback(m, offset, sizeof(csum), (void *) &csum); 920 } else 921 *(u_int16_t *)(mtod(m, char *) + offset) = csum; 922 } 923 924 /* 925 * Determine the maximum length of the options to be inserted; 926 * we would far rather allocate too much space rather than too little. 927 */ 928 929 u_int 930 ip_optlen(struct inpcb *inp) 931 { 932 struct mbuf *m = inp->inp_options; 933 934 if (m && m->m_len > offsetof(struct ipoption, ipopt_dst)) 935 return (m->m_len - offsetof(struct ipoption, ipopt_dst)); 936 else 937 return 0; 938 } 939 940 941 /* 942 * Insert IP options into preformed packet. 943 * Adjust IP destination as required for IP source routing, 944 * as indicated by a non-zero in_addr at the start of the options. 
945 */ 946 static struct mbuf * 947 ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen) 948 { 949 struct ipoption *p = mtod(opt, struct ipoption *); 950 struct mbuf *n; 951 struct ip *ip = mtod(m, struct ip *); 952 unsigned optlen; 953 954 optlen = opt->m_len - sizeof(p->ipopt_dst); 955 if (optlen + ntohs(ip->ip_len) > IP_MAXPACKET) 956 return (m); /* XXX should fail */ 957 if (!in_nullhost(p->ipopt_dst)) 958 ip->ip_dst = p->ipopt_dst; 959 if (M_READONLY(m) || M_LEADINGSPACE(m) < optlen) { 960 MGETHDR(n, M_DONTWAIT, MT_HEADER); 961 if (n == 0) 962 return (m); 963 MCLAIM(n, m->m_owner); 964 M_MOVE_PKTHDR(n, m); 965 m->m_len -= sizeof(struct ip); 966 m->m_data += sizeof(struct ip); 967 n->m_next = m; 968 m = n; 969 m->m_len = optlen + sizeof(struct ip); 970 m->m_data += max_linkhdr; 971 bcopy((void *)ip, mtod(m, void *), sizeof(struct ip)); 972 } else { 973 m->m_data -= optlen; 974 m->m_len += optlen; 975 memmove(mtod(m, void *), ip, sizeof(struct ip)); 976 } 977 m->m_pkthdr.len += optlen; 978 ip = mtod(m, struct ip *); 979 bcopy((void *)p->ipopt_list, (void *)(ip + 1), (unsigned)optlen); 980 *phlen = sizeof(struct ip) + optlen; 981 ip->ip_len = htons(ntohs(ip->ip_len) + optlen); 982 return (m); 983 } 984 985 /* 986 * Copy options from ip to jp, 987 * omitting those not copied during fragmentation. 988 */ 989 int 990 ip_optcopy(struct ip *ip, struct ip *jp) 991 { 992 u_char *cp, *dp; 993 int opt, optlen, cnt; 994 995 cp = (u_char *)(ip + 1); 996 dp = (u_char *)(jp + 1); 997 cnt = (ip->ip_hl << 2) - sizeof (struct ip); 998 for (; cnt > 0; cnt -= optlen, cp += optlen) { 999 opt = cp[0]; 1000 if (opt == IPOPT_EOL) 1001 break; 1002 if (opt == IPOPT_NOP) { 1003 /* Preserve for IP mcast tunnel's LSRR alignment. 
*/ 1004 *dp++ = IPOPT_NOP; 1005 optlen = 1; 1006 continue; 1007 } 1008 #ifdef DIAGNOSTIC 1009 if (cnt < IPOPT_OLEN + sizeof(*cp)) 1010 panic("malformed IPv4 option passed to ip_optcopy"); 1011 #endif 1012 optlen = cp[IPOPT_OLEN]; 1013 #ifdef DIAGNOSTIC 1014 if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) 1015 panic("malformed IPv4 option passed to ip_optcopy"); 1016 #endif 1017 /* bogus lengths should have been caught by ip_dooptions */ 1018 if (optlen > cnt) 1019 optlen = cnt; 1020 if (IPOPT_COPIED(opt)) { 1021 bcopy((void *)cp, (void *)dp, (unsigned)optlen); 1022 dp += optlen; 1023 } 1024 } 1025 for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++) 1026 *dp++ = IPOPT_EOL; 1027 return (optlen); 1028 } 1029 1030 /* 1031 * IP socket option processing. 1032 */ 1033 int 1034 ip_ctloutput(int op, struct socket *so, struct sockopt *sopt) 1035 { 1036 struct inpcb *inp = sotoinpcb(so); 1037 int optval = 0; 1038 int error = 0; 1039 #if defined(FAST_IPSEC) 1040 struct lwp *l = curlwp; /*XXX*/ 1041 #endif 1042 1043 if (sopt->sopt_level != IPPROTO_IP) { 1044 if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_NOHEADER) 1045 return 0; 1046 return ENOPROTOOPT; 1047 } 1048 1049 switch (op) { 1050 case PRCO_SETOPT: 1051 switch (sopt->sopt_name) { 1052 case IP_OPTIONS: 1053 #ifdef notyet 1054 case IP_RETOPTS: 1055 #endif 1056 error = ip_pcbopts(&inp->inp_options, sopt); 1057 break; 1058 1059 case IP_TOS: 1060 case IP_TTL: 1061 case IP_MINTTL: 1062 case IP_RECVOPTS: 1063 case IP_RECVRETOPTS: 1064 case IP_RECVDSTADDR: 1065 case IP_RECVIF: 1066 case IP_RECVTTL: 1067 error = sockopt_getint(sopt, &optval); 1068 if (error) 1069 break; 1070 1071 switch (sopt->sopt_name) { 1072 case IP_TOS: 1073 inp->inp_ip.ip_tos = optval; 1074 break; 1075 1076 case IP_TTL: 1077 inp->inp_ip.ip_ttl = optval; 1078 break; 1079 1080 case IP_MINTTL: 1081 if (optval > 0 && optval <= MAXTTL) 1082 inp->inp_ip_minttl = optval; 1083 else 1084 error = EINVAL; 1085 break; 1086 #define 
OPTSET(bit) \ 1087 if (optval) \ 1088 inp->inp_flags |= bit; \ 1089 else \ 1090 inp->inp_flags &= ~bit; 1091 1092 case IP_RECVOPTS: 1093 OPTSET(INP_RECVOPTS); 1094 break; 1095 1096 case IP_RECVRETOPTS: 1097 OPTSET(INP_RECVRETOPTS); 1098 break; 1099 1100 case IP_RECVDSTADDR: 1101 OPTSET(INP_RECVDSTADDR); 1102 break; 1103 1104 case IP_RECVIF: 1105 OPTSET(INP_RECVIF); 1106 break; 1107 1108 case IP_RECVTTL: 1109 OPTSET(INP_RECVTTL); 1110 break; 1111 } 1112 break; 1113 #undef OPTSET 1114 1115 case IP_MULTICAST_IF: 1116 case IP_MULTICAST_TTL: 1117 case IP_MULTICAST_LOOP: 1118 case IP_ADD_MEMBERSHIP: 1119 case IP_DROP_MEMBERSHIP: 1120 error = ip_setmoptions(&inp->inp_moptions, sopt); 1121 break; 1122 1123 case IP_PORTRANGE: 1124 error = sockopt_getint(sopt, &optval); 1125 if (error) 1126 break; 1127 1128 /* INP_LOCK(inp); */ 1129 switch (optval) { 1130 case IP_PORTRANGE_DEFAULT: 1131 case IP_PORTRANGE_HIGH: 1132 inp->inp_flags &= ~(INP_LOWPORT); 1133 break; 1134 1135 case IP_PORTRANGE_LOW: 1136 inp->inp_flags |= INP_LOWPORT; 1137 break; 1138 1139 default: 1140 error = EINVAL; 1141 break; 1142 } 1143 /* INP_UNLOCK(inp); */ 1144 break; 1145 1146 case IP_PORTALGO: 1147 error = sockopt_getint(sopt, &optval); 1148 if (error) 1149 break; 1150 1151 error = portalgo_algo_index_select( 1152 (struct inpcb_hdr *)inp, optval); 1153 break; 1154 1155 #if defined(FAST_IPSEC) 1156 case IP_IPSEC_POLICY: 1157 error = ipsec4_set_policy(inp, sopt->sopt_name, 1158 sopt->sopt_data, sopt->sopt_size, l->l_cred); 1159 break; 1160 #endif /*IPSEC*/ 1161 1162 default: 1163 error = ENOPROTOOPT; 1164 break; 1165 } 1166 break; 1167 1168 case PRCO_GETOPT: 1169 switch (sopt->sopt_name) { 1170 case IP_OPTIONS: 1171 case IP_RETOPTS: 1172 if (inp->inp_options) { 1173 struct mbuf *m; 1174 1175 m = m_copym(inp->inp_options, 0, M_COPYALL, 1176 M_DONTWAIT); 1177 if (m == NULL) { 1178 error = ENOBUFS; 1179 break; 1180 } 1181 1182 error = sockopt_setmbuf(sopt, m); 1183 } 1184 break; 1185 1186 case IP_TOS: 1187 
case IP_TTL: 1188 case IP_MINTTL: 1189 case IP_RECVOPTS: 1190 case IP_RECVRETOPTS: 1191 case IP_RECVDSTADDR: 1192 case IP_RECVIF: 1193 case IP_RECVTTL: 1194 case IP_ERRORMTU: 1195 switch (sopt->sopt_name) { 1196 case IP_TOS: 1197 optval = inp->inp_ip.ip_tos; 1198 break; 1199 1200 case IP_TTL: 1201 optval = inp->inp_ip.ip_ttl; 1202 break; 1203 1204 case IP_MINTTL: 1205 optval = inp->inp_ip_minttl; 1206 break; 1207 1208 case IP_ERRORMTU: 1209 optval = inp->inp_errormtu; 1210 break; 1211 1212 #define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) 1213 1214 case IP_RECVOPTS: 1215 optval = OPTBIT(INP_RECVOPTS); 1216 break; 1217 1218 case IP_RECVRETOPTS: 1219 optval = OPTBIT(INP_RECVRETOPTS); 1220 break; 1221 1222 case IP_RECVDSTADDR: 1223 optval = OPTBIT(INP_RECVDSTADDR); 1224 break; 1225 1226 case IP_RECVIF: 1227 optval = OPTBIT(INP_RECVIF); 1228 break; 1229 1230 case IP_RECVTTL: 1231 optval = OPTBIT(INP_RECVTTL); 1232 break; 1233 } 1234 error = sockopt_setint(sopt, optval); 1235 break; 1236 1237 #if 0 /* defined(FAST_IPSEC) */ 1238 case IP_IPSEC_POLICY: 1239 { 1240 struct mbuf *m = NULL; 1241 1242 /* XXX this will return EINVAL as sopt is empty */ 1243 error = ipsec4_get_policy(inp, sopt->sopt_data, 1244 sopt->sopt_size, &m); 1245 if (error == 0) 1246 error = sockopt_setmbuf(sopt, m); 1247 break; 1248 } 1249 #endif /*IPSEC*/ 1250 1251 case IP_MULTICAST_IF: 1252 case IP_MULTICAST_TTL: 1253 case IP_MULTICAST_LOOP: 1254 case IP_ADD_MEMBERSHIP: 1255 case IP_DROP_MEMBERSHIP: 1256 error = ip_getmoptions(inp->inp_moptions, sopt); 1257 break; 1258 1259 case IP_PORTRANGE: 1260 if (inp->inp_flags & INP_LOWPORT) 1261 optval = IP_PORTRANGE_LOW; 1262 else 1263 optval = IP_PORTRANGE_DEFAULT; 1264 1265 error = sockopt_setint(sopt, optval); 1266 1267 break; 1268 1269 case IP_PORTALGO: 1270 optval = ((struct inpcb_hdr *)inp)->inph_portalgo; 1271 error = sockopt_setint(sopt, optval); 1272 break; 1273 1274 default: 1275 error = ENOPROTOOPT; 1276 break; 1277 } 1278 break; 1279 } 1280 return 
(error); 1281 } 1282 1283 /* 1284 * Set up IP options in pcb for insertion in output packets. 1285 * Store in mbuf with pointer in pcbopt, adding pseudo-option 1286 * with destination address if source routed. 1287 */ 1288 int 1289 ip_pcbopts(struct mbuf **pcbopt, const struct sockopt *sopt) 1290 { 1291 struct mbuf *m; 1292 const u_char *cp; 1293 u_char *dp; 1294 int cnt; 1295 uint8_t optval, olen, offset; 1296 1297 /* turn off any old options */ 1298 if (*pcbopt) 1299 (void)m_free(*pcbopt); 1300 *pcbopt = NULL; 1301 1302 cp = sopt->sopt_data; 1303 cnt = sopt->sopt_size; 1304 1305 if (cnt == 0) 1306 return (0); /* Only turning off any previous options */ 1307 1308 #ifndef __vax__ 1309 if (cnt % sizeof(int32_t)) 1310 return (EINVAL); 1311 #endif 1312 1313 m = m_get(M_DONTWAIT, MT_SOOPTS); 1314 if (m == NULL) 1315 return (ENOBUFS); 1316 1317 dp = mtod(m, u_char *); 1318 memset(dp, 0, sizeof(struct in_addr)); 1319 dp += sizeof(struct in_addr); 1320 m->m_len = sizeof(struct in_addr); 1321 1322 /* 1323 * IP option list according to RFC791. Each option is of the form 1324 * 1325 * [optval] [olen] [(olen - 2) data bytes] 1326 * 1327 * we validate the list and copy options to an mbuf for prepending 1328 * to data packets. The IP first-hop destination address will be 1329 * stored before actual options and is zero if unset. 1330 */ 1331 while (cnt > 0) { 1332 optval = cp[IPOPT_OPTVAL]; 1333 1334 if (optval == IPOPT_EOL || optval == IPOPT_NOP) { 1335 olen = 1; 1336 } else { 1337 if (cnt < IPOPT_OLEN + 1) 1338 goto bad; 1339 1340 olen = cp[IPOPT_OLEN]; 1341 if (olen < IPOPT_OLEN + 1 || olen > cnt) 1342 goto bad; 1343 } 1344 1345 if (optval == IPOPT_LSRR || optval == IPOPT_SSRR) { 1346 /* 1347 * user process specifies route as: 1348 * ->A->B->C->D 1349 * D must be our final destination (but we can't 1350 * check that since we may not have connected yet). 1351 * A is first hop destination, which doesn't appear in 1352 * actual IP option, but is stored before the options. 
1353 */ 1354 if (olen < IPOPT_OFFSET + 1 + sizeof(struct in_addr)) 1355 goto bad; 1356 1357 offset = cp[IPOPT_OFFSET]; 1358 memcpy(mtod(m, u_char *), cp + IPOPT_OFFSET + 1, 1359 sizeof(struct in_addr)); 1360 1361 cp += sizeof(struct in_addr); 1362 cnt -= sizeof(struct in_addr); 1363 olen -= sizeof(struct in_addr); 1364 1365 if (m->m_len + olen > MAX_IPOPTLEN + sizeof(struct in_addr)) 1366 goto bad; 1367 1368 memcpy(dp, cp, olen); 1369 dp[IPOPT_OPTVAL] = optval; 1370 dp[IPOPT_OLEN] = olen; 1371 dp[IPOPT_OFFSET] = offset; 1372 break; 1373 } else { 1374 if (m->m_len + olen > MAX_IPOPTLEN + sizeof(struct in_addr)) 1375 goto bad; 1376 1377 memcpy(dp, cp, olen); 1378 break; 1379 } 1380 1381 dp += olen; 1382 m->m_len += olen; 1383 1384 if (optval == IPOPT_EOL) 1385 break; 1386 1387 cp += olen; 1388 cnt -= olen; 1389 } 1390 1391 *pcbopt = m; 1392 return (0); 1393 1394 bad: 1395 (void)m_free(m); 1396 return (EINVAL); 1397 } 1398 1399 /* 1400 * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index. 
1401 */ 1402 static struct ifnet * 1403 ip_multicast_if(struct in_addr *a, int *ifindexp) 1404 { 1405 int ifindex; 1406 struct ifnet *ifp = NULL; 1407 struct in_ifaddr *ia; 1408 1409 if (ifindexp) 1410 *ifindexp = 0; 1411 if (ntohl(a->s_addr) >> 24 == 0) { 1412 ifindex = ntohl(a->s_addr) & 0xffffff; 1413 if (ifindex < 0 || if_indexlim <= ifindex) 1414 return NULL; 1415 ifp = ifindex2ifnet[ifindex]; 1416 if (!ifp) 1417 return NULL; 1418 if (ifindexp) 1419 *ifindexp = ifindex; 1420 } else { 1421 LIST_FOREACH(ia, &IN_IFADDR_HASH(a->s_addr), ia_hash) { 1422 if (in_hosteq(ia->ia_addr.sin_addr, *a) && 1423 (ia->ia_ifp->if_flags & IFF_MULTICAST) != 0) { 1424 ifp = ia->ia_ifp; 1425 break; 1426 } 1427 } 1428 } 1429 return ifp; 1430 } 1431 1432 static int 1433 ip_getoptval(const struct sockopt *sopt, u_int8_t *val, u_int maxval) 1434 { 1435 u_int tval; 1436 u_char cval; 1437 int error; 1438 1439 if (sopt == NULL) 1440 return EINVAL; 1441 1442 switch (sopt->sopt_size) { 1443 case sizeof(u_char): 1444 error = sockopt_get(sopt, &cval, sizeof(u_char)); 1445 tval = cval; 1446 break; 1447 1448 case sizeof(u_int): 1449 error = sockopt_get(sopt, &tval, sizeof(u_int)); 1450 break; 1451 1452 default: 1453 error = EINVAL; 1454 } 1455 1456 if (error) 1457 return error; 1458 1459 if (tval > maxval) 1460 return EINVAL; 1461 1462 *val = tval; 1463 return 0; 1464 } 1465 1466 /* 1467 * Set the IP multicast options in response to user setsockopt(). 1468 */ 1469 int 1470 ip_setmoptions(struct ip_moptions **imop, const struct sockopt *sopt) 1471 { 1472 struct in_addr addr; 1473 struct ip_mreq lmreq, *mreq; 1474 struct ifnet *ifp; 1475 struct ip_moptions *imo = *imop; 1476 int i, ifindex, error = 0; 1477 1478 if (imo == NULL) { 1479 /* 1480 * No multicast option buffer attached to the pcb; 1481 * allocate one and initialize to default values. 
1482 */ 1483 imo = kmem_intr_alloc(sizeof(*imo), KM_NOSLEEP); 1484 if (imo == NULL) 1485 return ENOBUFS; 1486 1487 imo->imo_multicast_ifp = NULL; 1488 imo->imo_multicast_addr.s_addr = INADDR_ANY; 1489 imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; 1490 imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; 1491 imo->imo_num_memberships = 0; 1492 *imop = imo; 1493 } 1494 1495 switch (sopt->sopt_name) { 1496 case IP_MULTICAST_IF: 1497 /* 1498 * Select the interface for outgoing multicast packets. 1499 */ 1500 error = sockopt_get(sopt, &addr, sizeof(addr)); 1501 if (error) 1502 break; 1503 1504 /* 1505 * INADDR_ANY is used to remove a previous selection. 1506 * When no interface is selected, a default one is 1507 * chosen every time a multicast packet is sent. 1508 */ 1509 if (in_nullhost(addr)) { 1510 imo->imo_multicast_ifp = NULL; 1511 break; 1512 } 1513 /* 1514 * The selected interface is identified by its local 1515 * IP address. Find the interface and confirm that 1516 * it supports multicasting. 1517 */ 1518 ifp = ip_multicast_if(&addr, &ifindex); 1519 if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { 1520 error = EADDRNOTAVAIL; 1521 break; 1522 } 1523 imo->imo_multicast_ifp = ifp; 1524 if (ifindex) 1525 imo->imo_multicast_addr = addr; 1526 else 1527 imo->imo_multicast_addr.s_addr = INADDR_ANY; 1528 break; 1529 1530 case IP_MULTICAST_TTL: 1531 /* 1532 * Set the IP time-to-live for outgoing multicast packets. 1533 */ 1534 error = ip_getoptval(sopt, &imo->imo_multicast_ttl, MAXTTL); 1535 break; 1536 1537 case IP_MULTICAST_LOOP: 1538 /* 1539 * Set the loopback flag for outgoing multicast packets. 1540 * Must be zero or one. 1541 */ 1542 error = ip_getoptval(sopt, &imo->imo_multicast_loop, 1); 1543 break; 1544 1545 case IP_ADD_MEMBERSHIP: 1546 /* 1547 * Add a multicast group membership. 1548 * Group must be a valid IP multicast address. 
1549 */ 1550 error = sockopt_get(sopt, &lmreq, sizeof(lmreq)); 1551 if (error) 1552 break; 1553 1554 mreq = &lmreq; 1555 1556 if (!IN_MULTICAST(mreq->imr_multiaddr.s_addr)) { 1557 error = EINVAL; 1558 break; 1559 } 1560 /* 1561 * If no interface address was provided, use the interface of 1562 * the route to the given multicast address. 1563 */ 1564 if (in_nullhost(mreq->imr_interface)) { 1565 struct rtentry *rt; 1566 union { 1567 struct sockaddr dst; 1568 struct sockaddr_in dst4; 1569 } u; 1570 struct route ro; 1571 1572 memset(&ro, 0, sizeof(ro)); 1573 1574 sockaddr_in_init(&u.dst4, &mreq->imr_multiaddr, 0); 1575 rtcache_setdst(&ro, &u.dst); 1576 ifp = (rt = rtcache_init(&ro)) != NULL ? rt->rt_ifp 1577 : NULL; 1578 rtcache_free(&ro); 1579 } else { 1580 ifp = ip_multicast_if(&mreq->imr_interface, NULL); 1581 } 1582 /* 1583 * See if we found an interface, and confirm that it 1584 * supports multicast. 1585 */ 1586 if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { 1587 error = EADDRNOTAVAIL; 1588 break; 1589 } 1590 /* 1591 * See if the membership already exists or if all the 1592 * membership slots are full. 1593 */ 1594 for (i = 0; i < imo->imo_num_memberships; ++i) { 1595 if (imo->imo_membership[i]->inm_ifp == ifp && 1596 in_hosteq(imo->imo_membership[i]->inm_addr, 1597 mreq->imr_multiaddr)) 1598 break; 1599 } 1600 if (i < imo->imo_num_memberships) { 1601 error = EADDRINUSE; 1602 break; 1603 } 1604 if (i == IP_MAX_MEMBERSHIPS) { 1605 error = ETOOMANYREFS; 1606 break; 1607 } 1608 /* 1609 * Everything looks good; add a new record to the multicast 1610 * address list for the given interface. 1611 */ 1612 if ((imo->imo_membership[i] = 1613 in_addmulti(&mreq->imr_multiaddr, ifp)) == NULL) { 1614 error = ENOBUFS; 1615 break; 1616 } 1617 ++imo->imo_num_memberships; 1618 break; 1619 1620 case IP_DROP_MEMBERSHIP: 1621 /* 1622 * Drop a multicast group membership. 1623 * Group must be a valid IP multicast address. 
1624 */ 1625 error = sockopt_get(sopt, &lmreq, sizeof(lmreq)); 1626 if (error) 1627 break; 1628 1629 mreq = &lmreq; 1630 1631 if (!IN_MULTICAST(mreq->imr_multiaddr.s_addr)) { 1632 error = EINVAL; 1633 break; 1634 } 1635 /* 1636 * If an interface address was specified, get a pointer 1637 * to its ifnet structure. 1638 */ 1639 if (in_nullhost(mreq->imr_interface)) 1640 ifp = NULL; 1641 else { 1642 ifp = ip_multicast_if(&mreq->imr_interface, NULL); 1643 if (ifp == NULL) { 1644 error = EADDRNOTAVAIL; 1645 break; 1646 } 1647 } 1648 /* 1649 * Find the membership in the membership array. 1650 */ 1651 for (i = 0; i < imo->imo_num_memberships; ++i) { 1652 if ((ifp == NULL || 1653 imo->imo_membership[i]->inm_ifp == ifp) && 1654 in_hosteq(imo->imo_membership[i]->inm_addr, 1655 mreq->imr_multiaddr)) 1656 break; 1657 } 1658 if (i == imo->imo_num_memberships) { 1659 error = EADDRNOTAVAIL; 1660 break; 1661 } 1662 /* 1663 * Give up the multicast address record to which the 1664 * membership points. 1665 */ 1666 in_delmulti(imo->imo_membership[i]); 1667 /* 1668 * Remove the gap in the membership array. 1669 */ 1670 for (++i; i < imo->imo_num_memberships; ++i) 1671 imo->imo_membership[i-1] = imo->imo_membership[i]; 1672 --imo->imo_num_memberships; 1673 break; 1674 1675 default: 1676 error = EOPNOTSUPP; 1677 break; 1678 } 1679 1680 /* 1681 * If all options have default values, no need to keep the mbuf. 1682 */ 1683 if (imo->imo_multicast_ifp == NULL && 1684 imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL && 1685 imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP && 1686 imo->imo_num_memberships == 0) { 1687 kmem_free(imo, sizeof(*imo)); 1688 *imop = NULL; 1689 } 1690 1691 return error; 1692 } 1693 1694 /* 1695 * Return the IP multicast options in response to user getsockopt(). 
1696 */ 1697 int 1698 ip_getmoptions(struct ip_moptions *imo, struct sockopt *sopt) 1699 { 1700 struct in_addr addr; 1701 struct in_ifaddr *ia; 1702 int error; 1703 uint8_t optval; 1704 1705 error = 0; 1706 1707 switch (sopt->sopt_name) { 1708 case IP_MULTICAST_IF: 1709 if (imo == NULL || imo->imo_multicast_ifp == NULL) 1710 addr = zeroin_addr; 1711 else if (imo->imo_multicast_addr.s_addr) { 1712 /* return the value user has set */ 1713 addr = imo->imo_multicast_addr; 1714 } else { 1715 IFP_TO_IA(imo->imo_multicast_ifp, ia); 1716 addr = ia ? ia->ia_addr.sin_addr : zeroin_addr; 1717 } 1718 error = sockopt_set(sopt, &addr, sizeof(addr)); 1719 break; 1720 1721 case IP_MULTICAST_TTL: 1722 optval = imo ? imo->imo_multicast_ttl 1723 : IP_DEFAULT_MULTICAST_TTL; 1724 1725 error = sockopt_set(sopt, &optval, sizeof(optval)); 1726 break; 1727 1728 case IP_MULTICAST_LOOP: 1729 optval = imo ? imo->imo_multicast_loop 1730 : IP_DEFAULT_MULTICAST_LOOP; 1731 1732 error = sockopt_set(sopt, &optval, sizeof(optval)); 1733 break; 1734 1735 default: 1736 error = EOPNOTSUPP; 1737 } 1738 1739 return (error); 1740 } 1741 1742 /* 1743 * Discard the IP multicast options. 1744 */ 1745 void 1746 ip_freemoptions(struct ip_moptions *imo) 1747 { 1748 int i; 1749 1750 if (imo != NULL) { 1751 for (i = 0; i < imo->imo_num_memberships; ++i) 1752 in_delmulti(imo->imo_membership[i]); 1753 kmem_free(imo, sizeof(*imo)); 1754 } 1755 } 1756 1757 /* 1758 * Routine called from ip_output() to loop back a copy of an IP multicast 1759 * packet to the input queue of a specified interface. Note that this 1760 * calls the output routine of the loopback "driver", but with an interface 1761 * pointer that might NOT be lo0ifp -- easier than replicating that code here. 
1762 */ 1763 static void 1764 ip_mloopback(struct ifnet *ifp, struct mbuf *m, const struct sockaddr_in *dst) 1765 { 1766 struct ip *ip; 1767 struct mbuf *copym; 1768 1769 copym = m_copypacket(m, M_DONTWAIT); 1770 if (copym != NULL 1771 && (copym->m_flags & M_EXT || copym->m_len < sizeof(struct ip))) 1772 copym = m_pullup(copym, sizeof(struct ip)); 1773 if (copym == NULL) 1774 return; 1775 /* 1776 * We don't bother to fragment if the IP length is greater 1777 * than the interface's MTU. Can this possibly matter? 1778 */ 1779 ip = mtod(copym, struct ip *); 1780 1781 if (copym->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) { 1782 in_delayed_cksum(copym); 1783 copym->m_pkthdr.csum_flags &= 1784 ~(M_CSUM_TCPv4|M_CSUM_UDPv4); 1785 } 1786 1787 ip->ip_sum = 0; 1788 ip->ip_sum = in_cksum(copym, ip->ip_hl << 2); 1789 (void)looutput(ifp, copym, sintocsa(dst), NULL); 1790 } 1791