1 /* $NetBSD: ip_output.c,v 1.214 2012/03/22 20:34:39 drochner Exp $ */ 2 3 /* 4 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. Neither the name of the project nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32 /*- 33 * Copyright (c) 1998 The NetBSD Foundation, Inc. 34 * All rights reserved. 35 * 36 * This code is derived from software contributed to The NetBSD Foundation 37 * by Public Access Networks Corporation ("Panix"). It was developed under 38 * contract to Panix by Eric Haszlakiewicz and Thor Lancelot Simon. 
39 * 40 * Redistribution and use in source and binary forms, with or without 41 * modification, are permitted provided that the following conditions 42 * are met: 43 * 1. Redistributions of source code must retain the above copyright 44 * notice, this list of conditions and the following disclaimer. 45 * 2. Redistributions in binary form must reproduce the above copyright 46 * notice, this list of conditions and the following disclaimer in the 47 * documentation and/or other materials provided with the distribution. 48 * 49 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 50 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 51 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 52 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 53 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 54 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 55 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 56 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 57 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 58 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 59 * POSSIBILITY OF SUCH DAMAGE. 60 */ 61 62 /* 63 * Copyright (c) 1982, 1986, 1988, 1990, 1993 64 * The Regents of the University of California. All rights reserved. 65 * 66 * Redistribution and use in source and binary forms, with or without 67 * modification, are permitted provided that the following conditions 68 * are met: 69 * 1. Redistributions of source code must retain the above copyright 70 * notice, this list of conditions and the following disclaimer. 71 * 2. Redistributions in binary form must reproduce the above copyright 72 * notice, this list of conditions and the following disclaimer in the 73 * documentation and/or other materials provided with the distribution. 
74 * 3. Neither the name of the University nor the names of its contributors 75 * may be used to endorse or promote products derived from this software 76 * without specific prior written permission. 77 * 78 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 79 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 80 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 81 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 82 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 83 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 84 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 85 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 86 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 87 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 88 * SUCH DAMAGE. 89 * 90 * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 91 */ 92 93 #include <sys/cdefs.h> 94 __KERNEL_RCSID(0, "$NetBSD: ip_output.c,v 1.214 2012/03/22 20:34:39 drochner Exp $"); 95 96 #include "opt_pfil_hooks.h" 97 #include "opt_inet.h" 98 #include "opt_ipsec.h" 99 #include "opt_mrouting.h" 100 101 #include <sys/param.h> 102 #include <sys/malloc.h> 103 #include <sys/mbuf.h> 104 #include <sys/errno.h> 105 #include <sys/protosw.h> 106 #include <sys/socket.h> 107 #include <sys/socketvar.h> 108 #include <sys/kauth.h> 109 #ifdef FAST_IPSEC 110 #include <sys/domain.h> 111 #endif 112 #include <sys/systm.h> 113 #include <sys/proc.h> 114 115 #include <net/if.h> 116 #include <net/route.h> 117 #include <net/pfil.h> 118 119 #include <netinet/in.h> 120 #include <netinet/in_systm.h> 121 #include <netinet/ip.h> 122 #include <netinet/in_pcb.h> 123 #include <netinet/in_var.h> 124 #include <netinet/ip_var.h> 125 #include <netinet/ip_private.h> 126 #include <netinet/in_offload.h> 127 128 #ifdef MROUTING 129 
#include <netinet/ip_mroute.h>
#endif

#ifdef FAST_IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/key.h>
#include <netipsec/xform.h>
#endif	/* FAST_IPSEC */

#ifdef IPSEC_NAT_T
#include <netinet/udp.h>
#endif

static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *);
static struct ifnet *ip_multicast_if(struct in_addr *, int *);
static void ip_mloopback(struct ifnet *, struct mbuf *,
    const struct sockaddr_in *);

#ifdef PFIL_HOOKS
extern struct pfil_head inet_pfil_hook;			/* XXX */
#endif

int	ip_do_loopback_cksum = 0;

/*
 * IP output.  The packet in mbuf chain m contains a skeletal IP
 * header (with len, off, ttl, proto, tos, src, dst).
 * The mbuf chain containing the packet will be freed.
 * The mbuf opt, if present, will not be freed.
 *
 * The variadic arguments are read in this order:
 *	struct mbuf *opt		IP options to insert, or NULL
 *	struct route *ro		route cache to use, or NULL for a
 *					temporary one local to this call
 *	int flags			IP_* output flags
 *	struct ip_moptions *imo		multicast options, or NULL
 *	struct socket *so		sending socket, or NULL
 *	int *mtu_p			read only when IP_RETURNMTU is set in
 *					flags; receives the MTU when the packet
 *					is rejected with EMSGSIZE because DF
 *					is set
 *
 * Returns 0 on success or an errno value.
 */
int
ip_output(struct mbuf *m0, ...)
{
	struct rtentry *rt;
	struct ip *ip;
	struct ifnet *ifp;
	struct mbuf *m = m0;
	int hlen = sizeof (struct ip);
	int len, error = 0;
	struct route iproute;
	const struct sockaddr_in *dst;
	struct in_ifaddr *ia;
	struct ifaddr *xifa;
	struct mbuf *opt;
	struct route *ro;
	int flags, sw_csum;
	int *mtu_p;
	u_long mtu;
	struct ip_moptions *imo;
	struct socket *so;
	va_list ap;
#ifdef IPSEC_NAT_T
	int natt_frag = 0;
#endif
#ifdef FAST_IPSEC
	struct inpcb *inp;
	struct secpolicy *sp = NULL;
	int s;
#endif
	u_int16_t ip_len;
	union {
		struct sockaddr		dst;
		struct sockaddr_in	dst4;
	} u;
	struct sockaddr *rdst = &u.dst;	/* real IP destination, as opposed
					 * to the nexthop
					 */

	len = 0;
	va_start(ap, m0);
	opt = va_arg(ap, struct mbuf *);
	ro = va_arg(ap, struct route *);
	flags = va_arg(ap, int);
	imo = va_arg(ap, struct ip_moptions *);
	so = va_arg(ap, struct socket *);
	if (flags & IP_RETURNMTU)
		mtu_p = va_arg(ap, int *);
	else
		mtu_p = NULL;
	va_end(ap);

	MCLAIM(m, &ip_tx_mowner);
#ifdef FAST_IPSEC
	if (so != NULL && so->so_proto->pr_domain->dom_family == AF_INET)
		inp = (struct inpcb *)so->so_pcb;
	else
		inp = NULL;
#endif /* FAST_IPSEC */

#ifdef DIAGNOSTIC
	if ((m->m_flags & M_PKTHDR) == 0)
		panic("ip_output: no HDR");

	if ((m->m_pkthdr.csum_flags & (M_CSUM_TCPv6|M_CSUM_UDPv6)) != 0) {
		panic("ip_output: IPv6 checksum offload flags: %d",
			m->m_pkthdr.csum_flags);
	}

	if ((m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) ==
	    (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
		panic("ip_output: conflicting checksum offload flags: %d",
			m->m_pkthdr.csum_flags);
	}
#endif
	if (opt) {
		/*
		 * len stays 0 when ip_insertoptions() could not insert
		 * the options; hlen is only updated on success.
		 */
		m = ip_insertoptions(m, opt, &len);
		if (len >= sizeof(struct ip))
			hlen = len;
	}
	ip = mtod(m, struct ip *);
	/*
	 * Fill in IP header.
	 */
	if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
		ip->ip_v = IPVERSION;
		ip->ip_off = htons(0);
		/* ip->ip_id filled in after we find out source ia */
		ip->ip_hl = hlen >> 2;
		IP_STATINC(IP_STAT_LOCALOUT);
	} else {
		hlen = ip->ip_hl << 2;
	}
	/*
	 * Route packet.
	 */
	memset(&iproute, 0, sizeof(iproute));
	if (ro == NULL)
		ro = &iproute;
	sockaddr_in_init(&u.dst4, &ip->ip_dst, 0);
	dst = satocsin(rtcache_getdst(ro));
	/*
	 * If there is a cached route,
	 * check that it is to the same destination
	 * and is still up.  If not, free it and try again.
	 * The address family should also be checked in case of sharing the
	 * cache with IPv6.
	 */
	if (dst == NULL)
		;
	else if (dst->sin_family != AF_INET ||
		 !in_hosteq(dst->sin_addr, ip->ip_dst))
		rtcache_free(ro);

	if ((rt = rtcache_validate(ro)) == NULL &&
	    (rt = rtcache_update(ro, 1)) == NULL) {
		dst = &u.dst4;
		rtcache_setdst(ro, &u.dst);
	}
	/*
	 * If routing to interface only,
	 * short circuit routing lookup.
	 */
	if (flags & IP_ROUTETOIF) {
		if ((ia = ifatoia(ifa_ifwithladdr(sintocsa(dst)))) == NULL) {
			IP_STATINC(IP_STAT_NOROUTE);
			error = ENETUNREACH;
			goto bad;
		}
		ifp = ia->ia_ifp;
		mtu = ifp->if_mtu;
		ip->ip_ttl = 1;
	} else if ((IN_MULTICAST(ip->ip_dst.s_addr) ||
	    ip->ip_dst.s_addr == INADDR_BROADCAST) &&
	    imo != NULL && imo->imo_multicast_ifp != NULL) {
		ifp = imo->imo_multicast_ifp;
		mtu = ifp->if_mtu;
		IFP_TO_IA(ifp, ia);
	} else {
		if (rt == NULL)
			rt = rtcache_init(ro);
		if (rt == NULL) {
			IP_STATINC(IP_STAT_NOROUTE);
			error = EHOSTUNREACH;
			goto bad;
		}
		ia = ifatoia(rt->rt_ifa);
		ifp = rt->rt_ifp;
		if ((mtu = rt->rt_rmx.rmx_mtu) == 0)
			mtu = ifp->if_mtu;
		rt->rt_use++;
		if (rt->rt_flags & RTF_GATEWAY)
			dst = satosin(rt->rt_gateway);
	}
	if (IN_MULTICAST(ip->ip_dst.s_addr) ||
	    (ip->ip_dst.s_addr == INADDR_BROADCAST)) {
		struct in_multi *inm;

		m->m_flags |= (ip->ip_dst.s_addr == INADDR_BROADCAST) ?
			M_BCAST : M_MCAST;
		/*
		 * See if the caller provided any multicast options
		 */
		if (imo != NULL)
			ip->ip_ttl = imo->imo_multicast_ttl;
		else
			ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;

		/*
		 * if we don't know the outgoing ifp yet, we can't generate
		 * output
		 */
		if (!ifp) {
			IP_STATINC(IP_STAT_NOROUTE);
			error = ENETUNREACH;
			goto bad;
		}

		/*
		 * If the packet is multicast or broadcast, confirm that
		 * the outgoing interface can transmit it.
		 */
		if (((m->m_flags & M_MCAST) &&
		     (ifp->if_flags & IFF_MULTICAST) == 0) ||
		    ((m->m_flags & M_BCAST) &&
		     (ifp->if_flags & (IFF_BROADCAST|IFF_POINTOPOINT)) == 0))  {
			IP_STATINC(IP_STAT_NOROUTE);
			error = ENETUNREACH;
			goto bad;
		}
		/*
		 * If source address not specified yet, use an address
		 * of outgoing interface.
		 */
		if (in_nullhost(ip->ip_src)) {
			struct in_ifaddr *xia;

			IFP_TO_IA(ifp, xia);
			if (!xia) {
				error = EADDRNOTAVAIL;
				goto bad;
			}
			xifa = &xia->ia_ifa;
			if (xifa->ifa_getifa != NULL) {
				xia = ifatoia((*xifa->ifa_getifa)(xifa, rdst));
				/*
				 * Do not dereference a failed source
				 * address selection.
				 */
				if (xia == NULL) {
					error = EADDRNOTAVAIL;
					goto bad;
				}
			}
			ip->ip_src = xia->ia_addr.sin_addr;
		}

		IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm);
		if (inm != NULL &&
		   (imo == NULL || imo->imo_multicast_loop)) {
			/*
			 * If we belong to the destination multicast group
			 * on the outgoing interface, and the caller did not
			 * forbid loopback, loop back a copy.
			 */
			ip_mloopback(ifp, m, &u.dst4);
		}
#ifdef MROUTING
		else {
			/*
			 * If we are acting as a multicast router, perform
			 * multicast forwarding as if the packet had just
			 * arrived on the interface to which we are about
			 * to send.  The multicast forwarding function
			 * recursively calls this function, using the
			 * IP_FORWARDING flag to prevent infinite recursion.
			 *
			 * Multicasts that are looped back by ip_mloopback(),
			 * above, will be forwarded by the ip_input() routine,
			 * if necessary.
			 */
			extern struct socket *ip_mrouter;

			if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
				if (ip_mforward(m, ifp) != 0) {
					m_freem(m);
					goto done;
				}
			}
		}
#endif
		/*
		 * Multicasts with a time-to-live of zero may be looped-
		 * back, above, but must not be transmitted on a network.
		 * Also, multicasts addressed to the loopback interface
		 * are not sent -- the above call to ip_mloopback() will
		 * loop back a copy if this host actually belongs to the
		 * destination group on the loopback interface.
		 */
		if (ip->ip_ttl == 0 || (ifp->if_flags & IFF_LOOPBACK) != 0) {
			m_freem(m);
			goto done;
		}

		goto sendit;
	}
	/*
	 * If source address not specified yet, use address
	 * of outgoing interface.
	 */
	if (in_nullhost(ip->ip_src)) {
		xifa = &ia->ia_ifa;
		if (xifa->ifa_getifa != NULL) {
			ia = ifatoia((*xifa->ifa_getifa)(xifa, rdst));
			/* Do not dereference a failed address selection. */
			if (ia == NULL) {
				error = EADDRNOTAVAIL;
				goto bad;
			}
		}
		ip->ip_src = ia->ia_addr.sin_addr;
	}

	/*
	 * packets with Class-D address as source are not valid per
	 * RFC 1112
	 */
	if (IN_MULTICAST(ip->ip_src.s_addr)) {
		IP_STATINC(IP_STAT_ODROPPED);
		error = EADDRNOTAVAIL;
		goto bad;
	}

	/*
	 * Look for broadcast address
	 * and verify user is allowed to send
	 * such a packet.
	 */
	if (in_broadcast(dst->sin_addr, ifp)) {
		if ((ifp->if_flags & IFF_BROADCAST) == 0) {
			error = EADDRNOTAVAIL;
			goto bad;
		}
		if ((flags & IP_ALLOWBROADCAST) == 0) {
			error = EACCES;
			goto bad;
		}
		/* don't allow broadcast messages to be fragmented */
		if (ntohs(ip->ip_len) > ifp->if_mtu) {
			error = EMSGSIZE;
			goto bad;
		}
		m->m_flags |= M_BCAST;
	} else
		m->m_flags &= ~M_BCAST;

sendit:
	if ((flags & (IP_FORWARDING|IP_NOIPNEWID)) == 0) {
		if (m->m_pkthdr.len < IP_MINFRAGSIZE) {
			ip->ip_id = 0;
		} else if ((m->m_pkthdr.csum_flags & M_CSUM_TSOv4) == 0) {
			ip->ip_id = ip_newid(ia);
		} else {

			/*
			 * TSO capable interfaces (typically?) increment
			 * ip_id for each segment.
			 * "allocate" enough ids here to increase the chance
			 * for them to be unique.
			 *
			 * note that the following calculation is not
			 * needed to be precise.  wasting some ip_id is fine.
			 */

			unsigned int segsz = m->m_pkthdr.segsz;
			unsigned int datasz = ntohs(ip->ip_len) - hlen;
			unsigned int num = howmany(datasz, segsz);

			ip->ip_id = ip_newid_range(ia, num);
		}
	}
	/*
	 * If we're doing Path MTU Discovery, we need to set DF unless
	 * the route's MTU is locked.
	 */
	if ((flags & IP_MTUDISC) != 0 && rt != NULL &&
	    (rt->rt_rmx.rmx_locks & RTV_MTU) == 0)
		ip->ip_off |= htons(IP_DF);

	/* Remember the current ip_len */
	ip_len = ntohs(ip->ip_len);

#ifdef FAST_IPSEC
	/*
	 * Check the security policy (SP) for the packet and, if
	 * required, do IPsec-related processing.  There are two
	 * cases here; the first time a packet is sent through
	 * it will be untagged and handled by ipsec4_checkpolicy.
	 * If the packet is resubmitted to ip_output (e.g. after
	 * AH, ESP, etc. processing), there will be a tag to bypass
	 * the lookup and related policy checking.
	 */
	if (!ipsec_outdone(m)) {
		s = splsoftnet();
		if (inp != NULL &&
		    IPSEC_PCB_SKIP_IPSEC(inp->inp_sp, IPSEC_DIR_OUTBOUND)) {
			splx(s);
			goto spd_done;
		}
		sp = ipsec4_checkpolicy(m, IPSEC_DIR_OUTBOUND, flags,
				&error, inp);
		/*
		 * There are four return cases:
		 *    sp != NULL		    apply IPsec policy
		 *    sp == NULL, error == 0	    no IPsec handling needed
		 *    sp == NULL, error == -EINVAL  discard packet w/o error
		 *    sp == NULL, error != 0	    discard packet, report error
		 */
		if (sp != NULL) {
#ifdef IPSEC_NAT_T
			/*
			 * NAT-T ESP fragmentation: don't do IPSec processing
			 * now, we'll do it on each fragmented packet.
			 */
			if (sp->req->sav &&
			    ((sp->req->sav->natt_type & UDP_ENCAP_ESPINUDP) ||
			     (sp->req->sav->natt_type & UDP_ENCAP_ESPINUDP_NON_IKE))) {
				if (ntohs(ip->ip_len) > sp->req->sav->esp_frag) {
					natt_frag = 1;
					mtu = sp->req->sav->esp_frag;
					splx(s);
					goto spd_done;
				}
			}
#endif /* IPSEC_NAT_T */

			/*
			 * Do delayed checksums now because we send before
			 * this is done in the normal processing path.
			 */
			if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
				in_delayed_cksum(m);
				m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
			}

#ifdef __FreeBSD__
			ip->ip_len = htons(ip->ip_len);
			ip->ip_off = htons(ip->ip_off);
#endif

			/* NB: callee frees mbuf */
			error = ipsec4_process_packet(m, sp->req, flags, 0);
			/*
			 * Preserve KAME behaviour: ENOENT can be returned
			 * when an SA acquire is in progress.  Don't propagate
			 * this to user-level; it confuses applications.
			 *
			 * XXX this will go away when the SADB is redone.
			 */
			if (error == ENOENT)
				error = 0;
			splx(s);
			goto done;
		} else {
			splx(s);

			if (error != 0) {
				/*
				 * Hack: -EINVAL is used to signal that a packet
				 * should be silently discarded.  This is typically
				 * because we asked key management for an SA and
				 * it was delayed (e.g. kicked up to IKE).
				 */
				if (error == -EINVAL)
					error = 0;
				goto bad;
			} else {
				/* No IPsec processing for this packet. */
			}
		}
	}
spd_done:
#endif /* FAST_IPSEC */

#ifdef PFIL_HOOKS
	/*
	 * Run through list of hooks for output packets.
	 */
	if ((error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT)) != 0)
		goto done;
	if (m == NULL)
		goto done;

	/* The hooks may have replaced the mbuf; reload header state. */
	ip = mtod(m, struct ip *);
	hlen = ip->ip_hl << 2;
	ip_len = ntohs(ip->ip_len);
#endif /* PFIL_HOOKS */

	m->m_pkthdr.csum_data |= hlen << 16;

#if IFA_STATS
	/*
	 * search for the source address structure to
	 * maintain output statistics.
	 */
	INADDR_TO_IA(ip->ip_src, ia);
#endif

	/* Maybe skip checksums on loopback interfaces. */
	if (IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4)) {
		m->m_pkthdr.csum_flags |= M_CSUM_IPv4;
	}
	sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_csum_flags_tx;
	/*
	 * If small enough for mtu of path, or if using TCP segmentation
	 * offload, can just send directly.
	 */
	if (ip_len <= mtu ||
	    (m->m_pkthdr.csum_flags & M_CSUM_TSOv4) != 0) {
#if IFA_STATS
		if (ia)
			ia->ia_ifa.ifa_data.ifad_outbytes += ip_len;
#endif
		/*
		 * Always initialize the sum to 0!  Some HW assisted
		 * checksumming requires this.
		 */
		ip->ip_sum = 0;

		if ((m->m_pkthdr.csum_flags & M_CSUM_TSOv4) == 0) {
			/*
			 * Perform any checksums that the hardware can't do
			 * for us.
			 *
			 * XXX Does any hardware require the {th,uh}_sum
			 * XXX fields to be 0?
			 */
			if (sw_csum & M_CSUM_IPv4) {
				KASSERT(IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4));
				ip->ip_sum = in_cksum(m, hlen);
				m->m_pkthdr.csum_flags &= ~M_CSUM_IPv4;
			}
			if (sw_csum & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
				if (IN_NEED_CHECKSUM(ifp,
				    sw_csum & (M_CSUM_TCPv4|M_CSUM_UDPv4))) {
					in_delayed_cksum(m);
				}
				m->m_pkthdr.csum_flags &=
				    ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
			}
		}

		if (__predict_true(
		    (m->m_pkthdr.csum_flags & M_CSUM_TSOv4) == 0 ||
		    (ifp->if_capenable & IFCAP_TSOv4) != 0)) {
			KERNEL_LOCK(1, NULL);
			error =
			    (*ifp->if_output)(ifp, m,
				(m->m_flags & M_MCAST) ?
				    sintocsa(rdst) : sintocsa(dst),
				rt);
			KERNEL_UNLOCK_ONE(NULL);
		} else {
			error =
			    ip_tso_output(ifp, m,
				(m->m_flags & M_MCAST) ?
				    sintocsa(rdst) : sintocsa(dst),
				rt);
		}
		goto done;
	}

	/*
	 * We can't use HW checksumming if we're about
	 * to fragment the packet.
	 *
	 * XXX Some hardware can do this.
	 */
	if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
		if (IN_NEED_CHECKSUM(ifp,
		    m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4))) {
			in_delayed_cksum(m);
		}
		m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
	}

	/*
	 * Too large for interface; fragment if possible.
	 * Must be able to put at least 8 bytes per fragment.
	 */
	if (ntohs(ip->ip_off) & IP_DF) {
		/* Tolerate a caller that set IP_RETURNMTU but passed NULL. */
		if ((flags & IP_RETURNMTU) != 0 && mtu_p != NULL)
			*mtu_p = mtu;
		error = EMSGSIZE;
		IP_STATINC(IP_STAT_CANTFRAG);
		goto bad;
	}

	error = ip_fragment(m, ifp, mtu);
	if (error) {
		/* ip_fragment() already freed the chain on failure. */
		m = NULL;
		goto bad;
	}

	for (; m; m = m0) {
		m0 = m->m_nextpkt;
		m->m_nextpkt = 0;
		if (error == 0) {
#if IFA_STATS
			if (ia)
				ia->ia_ifa.ifa_data.ifad_outbytes +=
				    ntohs(ip->ip_len);
#endif
#ifdef IPSEC_NAT_T
			/*
			 * If we get there, the packet has not been handled by
			 * IPSec whereas it should have. Now that it has been
			 * fragmented, re-inject it in ip_output so that IPsec
			 * processing can occur.
			 */
			if (natt_frag) {
				error = ip_output(m, opt,
				    ro, flags | IP_RAWOUTPUT | IP_NOIPNEWID, imo, so, mtu_p);
			} else
#endif /* IPSEC_NAT_T */
			{
				KASSERT((m->m_pkthdr.csum_flags &
				    (M_CSUM_UDPv4 | M_CSUM_TCPv4)) == 0);
				KERNEL_LOCK(1, NULL);
				error = (*ifp->if_output)(ifp, m,
				    (m->m_flags & M_MCAST) ?
					sintocsa(rdst) : sintocsa(dst),
				    rt);
				KERNEL_UNLOCK_ONE(NULL);
			}
		} else
			m_freem(m);	/* an earlier fragment failed */
	}

	if (error == 0)
		IP_STATINC(IP_STAT_FRAGMENTED);
done:
	/* Only the local route cache is released here. */
	rtcache_free(&iproute);

#ifdef FAST_IPSEC
	if (sp != NULL)
		KEY_FREESP(&sp);
#endif /* FAST_IPSEC */

	return (error);
bad:
	m_freem(m);
	goto done;
}

/*
 * Split the packet in m into fragments whose IP length fits in mtu.
 *
 * On success (return value 0) m is trimmed to become the first fragment
 * and the remaining fragments are linked after it through m_nextpkt.
 * On failure EMSGSIZE (less than 8 payload bytes fit per fragment) or
 * ENOBUFS is returned and every mbuf, including m, has been freed.
 *
 * ifp may be NULL; when it is non-NULL its transmit checksum capabilities
 * decide whether header checksums are computed here, and the free space
 * in its send queue is checked against the number of fragments.
 */
int
ip_fragment(struct mbuf *m, struct ifnet *ifp, u_long mtu)
{
	struct ip *ip, *mhip;
	struct mbuf *m0;
	int len, hlen, off;
	int mhlen, firstlen;
	struct mbuf **mnext;
	int sw_csum = m->m_pkthdr.csum_flags;
	int fragments = 0;
	int s;
	int error = 0;

	ip = mtod(m, struct ip *);
	hlen = ip->ip_hl << 2;
	if (ifp != NULL)
		sw_csum &= ~ifp->if_csum_flags_tx;

	/* Payload bytes per fragment, rounded down to a multiple of 8. */
	len = (mtu - hlen) &~ 7;
	if (len < 8) {
		m_freem(m);
		return (EMSGSIZE);
	}

	firstlen = len;
	mnext = &m->m_nextpkt;

	/*
	 * Loop through length of segment after first fragment,
	 * make new header and copy data of each part and link onto chain.
	 */
	m0 = m;
	mhlen = sizeof (struct ip);
	for (off = hlen + len; off < ntohs(ip->ip_len); off += len) {
		MGETHDR(m, M_DONTWAIT, MT_HEADER);
		if (m == 0) {
			error = ENOBUFS;
			IP_STATINC(IP_STAT_ODROPPED);
			goto sendorfree;
		}
		MCLAIM(m, m0->m_owner);
		*mnext = m;
		mnext = &m->m_nextpkt;
		m->m_data += max_linkhdr;
		mhip = mtod(m, struct ip *);
		*mhip = *ip;
		/* we must inherit MCAST and BCAST flags */
		m->m_flags |= m0->m_flags & (M_MCAST|M_BCAST);
		if (hlen > sizeof (struct ip)) {
			mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
			mhip->ip_hl = mhlen >> 2;
		}
		m->m_len = mhlen;
		/* Offset is kept in host order until HTONS() below. */
		mhip->ip_off = ((off - hlen) >> 3) +
		    (ntohs(ip->ip_off) & ~IP_MF);
		if (ip->ip_off & htons(IP_MF))
			mhip->ip_off |= IP_MF;
		if (off + len >= ntohs(ip->ip_len))
			len = ntohs(ip->ip_len) - off;
		else
			mhip->ip_off |= IP_MF;
		HTONS(mhip->ip_off);
		mhip->ip_len = htons((u_int16_t)(len + mhlen));
		m->m_next = m_copym(m0, off, len, M_DONTWAIT);
		if (m->m_next == 0) {
			error = ENOBUFS;	/* ??? */
			IP_STATINC(IP_STAT_ODROPPED);
			goto sendorfree;
		}
		m->m_pkthdr.len = mhlen + len;
		m->m_pkthdr.rcvif = NULL;
		mhip->ip_sum = 0;
		KASSERT((m->m_pkthdr.csum_flags & M_CSUM_IPv4) == 0);
		if (sw_csum & M_CSUM_IPv4) {
			mhip->ip_sum = in_cksum(m, mhlen);
		} else {
			/*
			 * checksum is hw-offloaded or not necessary.
			 */
			m->m_pkthdr.csum_flags |=
			    m0->m_pkthdr.csum_flags & M_CSUM_IPv4;
			m->m_pkthdr.csum_data |= mhlen << 16;
			KASSERT(!(ifp != NULL &&
			    IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4))
			    || (m->m_pkthdr.csum_flags & M_CSUM_IPv4) != 0);
		}
		IP_STATINC(IP_STAT_OFRAGMENTS);
		fragments++;
	}
	/*
	 * Update first fragment by trimming what's been copied out
	 * and updating header, then send each fragment (in order).
	 */
	m = m0;
	m_adj(m, hlen + firstlen - ntohs(ip->ip_len));
	m->m_pkthdr.len = hlen + firstlen;
	ip->ip_len = htons((u_int16_t)m->m_pkthdr.len);
	ip->ip_off |= htons(IP_MF);
	ip->ip_sum = 0;
	if (sw_csum & M_CSUM_IPv4) {
		ip->ip_sum = in_cksum(m, hlen);
		m->m_pkthdr.csum_flags &= ~M_CSUM_IPv4;
	} else {
		/*
		 * checksum is hw-offloaded or not necessary.
		 */
		KASSERT(!(ifp != NULL && IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4))
		   || (m->m_pkthdr.csum_flags & M_CSUM_IPv4) != 0);
		KASSERT(M_CSUM_DATA_IPv4_IPHL(m->m_pkthdr.csum_data) >=
			sizeof(struct ip));
	}
sendorfree:
	/*
	 * If there is no room for all the fragments, don't queue
	 * any of them.
	 */
	if (ifp != NULL) {
		s = splnet();
		if (ifp->if_snd.ifq_maxlen - ifp->if_snd.ifq_len < fragments &&
		    error == 0) {
			error = ENOBUFS;
			IP_STATINC(IP_STAT_ODROPPED);
			IFQ_INC_DROPS(&ifp->if_snd);
		}
		splx(s);
	}
	if (error) {
		for (m = m0; m; m = m0) {
			m0 = m->m_nextpkt;
			m->m_nextpkt = NULL;
			m_freem(m);
		}
	}
	return (error);
}

/*
 * Process a delayed payload checksum calculation.
897 */ 898 void 899 in_delayed_cksum(struct mbuf *m) 900 { 901 struct ip *ip; 902 u_int16_t csum, offset; 903 904 ip = mtod(m, struct ip *); 905 offset = ip->ip_hl << 2; 906 csum = in4_cksum(m, 0, offset, ntohs(ip->ip_len) - offset); 907 if (csum == 0 && (m->m_pkthdr.csum_flags & M_CSUM_UDPv4) != 0) 908 csum = 0xffff; 909 910 offset += M_CSUM_DATA_IPv4_OFFSET(m->m_pkthdr.csum_data); 911 912 if ((offset + sizeof(u_int16_t)) > m->m_len) { 913 /* This happen when ip options were inserted 914 printf("in_delayed_cksum: pullup len %d off %d proto %d\n", 915 m->m_len, offset, ip->ip_p); 916 */ 917 m_copyback(m, offset, sizeof(csum), (void *) &csum); 918 } else 919 *(u_int16_t *)(mtod(m, char *) + offset) = csum; 920 } 921 922 /* 923 * Determine the maximum length of the options to be inserted; 924 * we would far rather allocate too much space rather than too little. 925 */ 926 927 u_int 928 ip_optlen(struct inpcb *inp) 929 { 930 struct mbuf *m = inp->inp_options; 931 932 if (m && m->m_len > offsetof(struct ipoption, ipopt_dst)) 933 return (m->m_len - offsetof(struct ipoption, ipopt_dst)); 934 else 935 return 0; 936 } 937 938 939 /* 940 * Insert IP options into preformed packet. 941 * Adjust IP destination as required for IP source routing, 942 * as indicated by a non-zero in_addr at the start of the options. 
943 */ 944 static struct mbuf * 945 ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen) 946 { 947 struct ipoption *p = mtod(opt, struct ipoption *); 948 struct mbuf *n; 949 struct ip *ip = mtod(m, struct ip *); 950 unsigned optlen; 951 952 optlen = opt->m_len - sizeof(p->ipopt_dst); 953 if (optlen + ntohs(ip->ip_len) > IP_MAXPACKET) 954 return (m); /* XXX should fail */ 955 if (!in_nullhost(p->ipopt_dst)) 956 ip->ip_dst = p->ipopt_dst; 957 if (M_READONLY(m) || M_LEADINGSPACE(m) < optlen) { 958 MGETHDR(n, M_DONTWAIT, MT_HEADER); 959 if (n == 0) 960 return (m); 961 MCLAIM(n, m->m_owner); 962 M_MOVE_PKTHDR(n, m); 963 m->m_len -= sizeof(struct ip); 964 m->m_data += sizeof(struct ip); 965 n->m_next = m; 966 m = n; 967 m->m_len = optlen + sizeof(struct ip); 968 m->m_data += max_linkhdr; 969 bcopy((void *)ip, mtod(m, void *), sizeof(struct ip)); 970 } else { 971 m->m_data -= optlen; 972 m->m_len += optlen; 973 memmove(mtod(m, void *), ip, sizeof(struct ip)); 974 } 975 m->m_pkthdr.len += optlen; 976 ip = mtod(m, struct ip *); 977 bcopy((void *)p->ipopt_list, (void *)(ip + 1), (unsigned)optlen); 978 *phlen = sizeof(struct ip) + optlen; 979 ip->ip_len = htons(ntohs(ip->ip_len) + optlen); 980 return (m); 981 } 982 983 /* 984 * Copy options from ip to jp, 985 * omitting those not copied during fragmentation. 986 */ 987 int 988 ip_optcopy(struct ip *ip, struct ip *jp) 989 { 990 u_char *cp, *dp; 991 int opt, optlen, cnt; 992 993 cp = (u_char *)(ip + 1); 994 dp = (u_char *)(jp + 1); 995 cnt = (ip->ip_hl << 2) - sizeof (struct ip); 996 for (; cnt > 0; cnt -= optlen, cp += optlen) { 997 opt = cp[0]; 998 if (opt == IPOPT_EOL) 999 break; 1000 if (opt == IPOPT_NOP) { 1001 /* Preserve for IP mcast tunnel's LSRR alignment. 
*/ 1002 *dp++ = IPOPT_NOP; 1003 optlen = 1; 1004 continue; 1005 } 1006 #ifdef DIAGNOSTIC 1007 if (cnt < IPOPT_OLEN + sizeof(*cp)) 1008 panic("malformed IPv4 option passed to ip_optcopy"); 1009 #endif 1010 optlen = cp[IPOPT_OLEN]; 1011 #ifdef DIAGNOSTIC 1012 if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) 1013 panic("malformed IPv4 option passed to ip_optcopy"); 1014 #endif 1015 /* bogus lengths should have been caught by ip_dooptions */ 1016 if (optlen > cnt) 1017 optlen = cnt; 1018 if (IPOPT_COPIED(opt)) { 1019 bcopy((void *)cp, (void *)dp, (unsigned)optlen); 1020 dp += optlen; 1021 } 1022 } 1023 for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++) 1024 *dp++ = IPOPT_EOL; 1025 return (optlen); 1026 } 1027 1028 /* 1029 * IP socket option processing. 1030 */ 1031 int 1032 ip_ctloutput(int op, struct socket *so, struct sockopt *sopt) 1033 { 1034 struct inpcb *inp = sotoinpcb(so); 1035 int optval = 0; 1036 int error = 0; 1037 #if defined(FAST_IPSEC) 1038 struct lwp *l = curlwp; /*XXX*/ 1039 #endif 1040 1041 if (sopt->sopt_level != IPPROTO_IP) { 1042 if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_NOHEADER) 1043 return 0; 1044 return ENOPROTOOPT; 1045 } 1046 1047 switch (op) { 1048 case PRCO_SETOPT: 1049 switch (sopt->sopt_name) { 1050 case IP_OPTIONS: 1051 #ifdef notyet 1052 case IP_RETOPTS: 1053 #endif 1054 error = ip_pcbopts(&inp->inp_options, sopt); 1055 break; 1056 1057 case IP_TOS: 1058 case IP_TTL: 1059 case IP_MINTTL: 1060 case IP_RECVOPTS: 1061 case IP_RECVRETOPTS: 1062 case IP_RECVDSTADDR: 1063 case IP_RECVIF: 1064 case IP_RECVTTL: 1065 error = sockopt_getint(sopt, &optval); 1066 if (error) 1067 break; 1068 1069 switch (sopt->sopt_name) { 1070 case IP_TOS: 1071 inp->inp_ip.ip_tos = optval; 1072 break; 1073 1074 case IP_TTL: 1075 inp->inp_ip.ip_ttl = optval; 1076 break; 1077 1078 case IP_MINTTL: 1079 if (optval > 0 && optval <= MAXTTL) 1080 inp->inp_ip_minttl = optval; 1081 else 1082 error = EINVAL; 1083 break; 1084 #define 
OPTSET(bit) \ 1085 if (optval) \ 1086 inp->inp_flags |= bit; \ 1087 else \ 1088 inp->inp_flags &= ~bit; 1089 1090 case IP_RECVOPTS: 1091 OPTSET(INP_RECVOPTS); 1092 break; 1093 1094 case IP_RECVRETOPTS: 1095 OPTSET(INP_RECVRETOPTS); 1096 break; 1097 1098 case IP_RECVDSTADDR: 1099 OPTSET(INP_RECVDSTADDR); 1100 break; 1101 1102 case IP_RECVIF: 1103 OPTSET(INP_RECVIF); 1104 break; 1105 1106 case IP_RECVTTL: 1107 OPTSET(INP_RECVTTL); 1108 break; 1109 } 1110 break; 1111 #undef OPTSET 1112 1113 case IP_MULTICAST_IF: 1114 case IP_MULTICAST_TTL: 1115 case IP_MULTICAST_LOOP: 1116 case IP_ADD_MEMBERSHIP: 1117 case IP_DROP_MEMBERSHIP: 1118 error = ip_setmoptions(&inp->inp_moptions, sopt); 1119 break; 1120 1121 case IP_PORTRANGE: 1122 error = sockopt_getint(sopt, &optval); 1123 if (error) 1124 break; 1125 1126 /* INP_LOCK(inp); */ 1127 switch (optval) { 1128 case IP_PORTRANGE_DEFAULT: 1129 case IP_PORTRANGE_HIGH: 1130 inp->inp_flags &= ~(INP_LOWPORT); 1131 break; 1132 1133 case IP_PORTRANGE_LOW: 1134 inp->inp_flags |= INP_LOWPORT; 1135 break; 1136 1137 default: 1138 error = EINVAL; 1139 break; 1140 } 1141 /* INP_UNLOCK(inp); */ 1142 break; 1143 1144 #if defined(FAST_IPSEC) 1145 case IP_IPSEC_POLICY: 1146 { 1147 error = ipsec4_set_policy(inp, sopt->sopt_name, 1148 sopt->sopt_data, sopt->sopt_size, l->l_cred); 1149 break; 1150 } 1151 #endif /*IPSEC*/ 1152 1153 default: 1154 error = ENOPROTOOPT; 1155 break; 1156 } 1157 break; 1158 1159 case PRCO_GETOPT: 1160 switch (sopt->sopt_name) { 1161 case IP_OPTIONS: 1162 case IP_RETOPTS: 1163 if (inp->inp_options) { 1164 struct mbuf *m; 1165 1166 m = m_copym(inp->inp_options, 0, M_COPYALL, 1167 M_DONTWAIT); 1168 if (m == NULL) { 1169 error = ENOBUFS; 1170 break; 1171 } 1172 1173 error = sockopt_setmbuf(sopt, m); 1174 } 1175 break; 1176 1177 case IP_TOS: 1178 case IP_TTL: 1179 case IP_MINTTL: 1180 case IP_RECVOPTS: 1181 case IP_RECVRETOPTS: 1182 case IP_RECVDSTADDR: 1183 case IP_RECVIF: 1184 case IP_RECVTTL: 1185 case IP_ERRORMTU: 1186 
switch (sopt->sopt_name) { 1187 case IP_TOS: 1188 optval = inp->inp_ip.ip_tos; 1189 break; 1190 1191 case IP_TTL: 1192 optval = inp->inp_ip.ip_ttl; 1193 break; 1194 1195 case IP_MINTTL: 1196 optval = inp->inp_ip_minttl; 1197 break; 1198 1199 case IP_ERRORMTU: 1200 optval = inp->inp_errormtu; 1201 break; 1202 1203 #define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) 1204 1205 case IP_RECVOPTS: 1206 optval = OPTBIT(INP_RECVOPTS); 1207 break; 1208 1209 case IP_RECVRETOPTS: 1210 optval = OPTBIT(INP_RECVRETOPTS); 1211 break; 1212 1213 case IP_RECVDSTADDR: 1214 optval = OPTBIT(INP_RECVDSTADDR); 1215 break; 1216 1217 case IP_RECVIF: 1218 optval = OPTBIT(INP_RECVIF); 1219 break; 1220 1221 case IP_RECVTTL: 1222 optval = OPTBIT(INP_RECVTTL); 1223 break; 1224 } 1225 error = sockopt_setint(sopt, optval); 1226 break; 1227 1228 #if 0 /* defined(FAST_IPSEC) */ 1229 case IP_IPSEC_POLICY: 1230 { 1231 struct mbuf *m = NULL; 1232 1233 /* XXX this will return EINVAL as sopt is empty */ 1234 error = ipsec4_get_policy(inp, sopt->sopt_data, 1235 sopt->sopt_size, &m); 1236 if (error == 0) 1237 error = sockopt_setmbuf(sopt, m); 1238 break; 1239 } 1240 #endif /*IPSEC*/ 1241 1242 case IP_MULTICAST_IF: 1243 case IP_MULTICAST_TTL: 1244 case IP_MULTICAST_LOOP: 1245 case IP_ADD_MEMBERSHIP: 1246 case IP_DROP_MEMBERSHIP: 1247 error = ip_getmoptions(inp->inp_moptions, sopt); 1248 break; 1249 1250 case IP_PORTRANGE: 1251 if (inp->inp_flags & INP_LOWPORT) 1252 optval = IP_PORTRANGE_LOW; 1253 else 1254 optval = IP_PORTRANGE_DEFAULT; 1255 1256 error = sockopt_setint(sopt, optval); 1257 1258 break; 1259 1260 default: 1261 error = ENOPROTOOPT; 1262 break; 1263 } 1264 break; 1265 } 1266 return (error); 1267 } 1268 1269 /* 1270 * Set up IP options in pcb for insertion in output packets. 1271 * Store in mbuf with pointer in pcbopt, adding pseudo-option 1272 * with destination address if source routed. 
1273 */ 1274 int 1275 ip_pcbopts(struct mbuf **pcbopt, const struct sockopt *sopt) 1276 { 1277 struct mbuf *m; 1278 const u_char *cp; 1279 u_char *dp; 1280 int cnt; 1281 uint8_t optval, olen, offset; 1282 1283 /* turn off any old options */ 1284 if (*pcbopt) 1285 (void)m_free(*pcbopt); 1286 *pcbopt = NULL; 1287 1288 cp = sopt->sopt_data; 1289 cnt = sopt->sopt_size; 1290 1291 if (cnt == 0) 1292 return (0); /* Only turning off any previous options */ 1293 1294 #ifndef __vax__ 1295 if (cnt % sizeof(int32_t)) 1296 return (EINVAL); 1297 #endif 1298 1299 m = m_get(M_DONTWAIT, MT_SOOPTS); 1300 if (m == NULL) 1301 return (ENOBUFS); 1302 1303 dp = mtod(m, u_char *); 1304 memset(dp, 0, sizeof(struct in_addr)); 1305 dp += sizeof(struct in_addr); 1306 m->m_len = sizeof(struct in_addr); 1307 1308 /* 1309 * IP option list according to RFC791. Each option is of the form 1310 * 1311 * [optval] [olen] [(olen - 2) data bytes] 1312 * 1313 * we validate the list and copy options to an mbuf for prepending 1314 * to data packets. The IP first-hop destination address will be 1315 * stored before actual options and is zero if unset. 1316 */ 1317 while (cnt > 0) { 1318 optval = cp[IPOPT_OPTVAL]; 1319 1320 if (optval == IPOPT_EOL || optval == IPOPT_NOP) { 1321 olen = 1; 1322 } else { 1323 if (cnt < IPOPT_OLEN + 1) 1324 goto bad; 1325 1326 olen = cp[IPOPT_OLEN]; 1327 if (olen < IPOPT_OLEN + 1 || olen > cnt) 1328 goto bad; 1329 } 1330 1331 if (optval == IPOPT_LSRR || optval == IPOPT_SSRR) { 1332 /* 1333 * user process specifies route as: 1334 * ->A->B->C->D 1335 * D must be our final destination (but we can't 1336 * check that since we may not have connected yet). 1337 * A is first hop destination, which doesn't appear in 1338 * actual IP option, but is stored before the options. 
1339 */ 1340 if (olen < IPOPT_OFFSET + 1 + sizeof(struct in_addr)) 1341 goto bad; 1342 1343 offset = cp[IPOPT_OFFSET]; 1344 memcpy(mtod(m, u_char *), cp + IPOPT_OFFSET + 1, 1345 sizeof(struct in_addr)); 1346 1347 cp += sizeof(struct in_addr); 1348 cnt -= sizeof(struct in_addr); 1349 olen -= sizeof(struct in_addr); 1350 1351 if (m->m_len + olen > MAX_IPOPTLEN + sizeof(struct in_addr)) 1352 goto bad; 1353 1354 memcpy(dp, cp, olen); 1355 dp[IPOPT_OPTVAL] = optval; 1356 dp[IPOPT_OLEN] = olen; 1357 dp[IPOPT_OFFSET] = offset; 1358 break; 1359 } else { 1360 if (m->m_len + olen > MAX_IPOPTLEN + sizeof(struct in_addr)) 1361 goto bad; 1362 1363 memcpy(dp, cp, olen); 1364 break; 1365 } 1366 1367 dp += olen; 1368 m->m_len += olen; 1369 1370 if (optval == IPOPT_EOL) 1371 break; 1372 1373 cp += olen; 1374 cnt -= olen; 1375 } 1376 1377 *pcbopt = m; 1378 return (0); 1379 1380 bad: 1381 (void)m_free(m); 1382 return (EINVAL); 1383 } 1384 1385 /* 1386 * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index. 
1387 */ 1388 static struct ifnet * 1389 ip_multicast_if(struct in_addr *a, int *ifindexp) 1390 { 1391 int ifindex; 1392 struct ifnet *ifp = NULL; 1393 struct in_ifaddr *ia; 1394 1395 if (ifindexp) 1396 *ifindexp = 0; 1397 if (ntohl(a->s_addr) >> 24 == 0) { 1398 ifindex = ntohl(a->s_addr) & 0xffffff; 1399 if (ifindex < 0 || if_indexlim <= ifindex) 1400 return NULL; 1401 ifp = ifindex2ifnet[ifindex]; 1402 if (!ifp) 1403 return NULL; 1404 if (ifindexp) 1405 *ifindexp = ifindex; 1406 } else { 1407 LIST_FOREACH(ia, &IN_IFADDR_HASH(a->s_addr), ia_hash) { 1408 if (in_hosteq(ia->ia_addr.sin_addr, *a) && 1409 (ia->ia_ifp->if_flags & IFF_MULTICAST) != 0) { 1410 ifp = ia->ia_ifp; 1411 break; 1412 } 1413 } 1414 } 1415 return ifp; 1416 } 1417 1418 static int 1419 ip_getoptval(const struct sockopt *sopt, u_int8_t *val, u_int maxval) 1420 { 1421 u_int tval; 1422 u_char cval; 1423 int error; 1424 1425 if (sopt == NULL) 1426 return EINVAL; 1427 1428 switch (sopt->sopt_size) { 1429 case sizeof(u_char): 1430 error = sockopt_get(sopt, &cval, sizeof(u_char)); 1431 tval = cval; 1432 break; 1433 1434 case sizeof(u_int): 1435 error = sockopt_get(sopt, &tval, sizeof(u_int)); 1436 break; 1437 1438 default: 1439 error = EINVAL; 1440 } 1441 1442 if (error) 1443 return error; 1444 1445 if (tval > maxval) 1446 return EINVAL; 1447 1448 *val = tval; 1449 return 0; 1450 } 1451 1452 /* 1453 * Set the IP multicast options in response to user setsockopt(). 1454 */ 1455 int 1456 ip_setmoptions(struct ip_moptions **imop, const struct sockopt *sopt) 1457 { 1458 int error = 0; 1459 int i; 1460 struct in_addr addr; 1461 struct ip_mreq lmreq, *mreq; 1462 struct ifnet *ifp; 1463 struct ip_moptions *imo = *imop; 1464 int ifindex; 1465 1466 if (imo == NULL) { 1467 /* 1468 * No multicast option buffer attached to the pcb; 1469 * allocate one and initialize to default values. 
1470 */ 1471 imo = malloc(sizeof(*imo), M_IPMOPTS, M_NOWAIT); 1472 if (imo == NULL) 1473 return (ENOBUFS); 1474 1475 *imop = imo; 1476 imo->imo_multicast_ifp = NULL; 1477 imo->imo_multicast_addr.s_addr = INADDR_ANY; 1478 imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; 1479 imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; 1480 imo->imo_num_memberships = 0; 1481 } 1482 1483 switch (sopt->sopt_name) { 1484 case IP_MULTICAST_IF: 1485 /* 1486 * Select the interface for outgoing multicast packets. 1487 */ 1488 error = sockopt_get(sopt, &addr, sizeof(addr)); 1489 if (error) 1490 break; 1491 1492 /* 1493 * INADDR_ANY is used to remove a previous selection. 1494 * When no interface is selected, a default one is 1495 * chosen every time a multicast packet is sent. 1496 */ 1497 if (in_nullhost(addr)) { 1498 imo->imo_multicast_ifp = NULL; 1499 break; 1500 } 1501 /* 1502 * The selected interface is identified by its local 1503 * IP address. Find the interface and confirm that 1504 * it supports multicasting. 1505 */ 1506 ifp = ip_multicast_if(&addr, &ifindex); 1507 if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { 1508 error = EADDRNOTAVAIL; 1509 break; 1510 } 1511 imo->imo_multicast_ifp = ifp; 1512 if (ifindex) 1513 imo->imo_multicast_addr = addr; 1514 else 1515 imo->imo_multicast_addr.s_addr = INADDR_ANY; 1516 break; 1517 1518 case IP_MULTICAST_TTL: 1519 /* 1520 * Set the IP time-to-live for outgoing multicast packets. 1521 */ 1522 error = ip_getoptval(sopt, &imo->imo_multicast_ttl, MAXTTL); 1523 break; 1524 1525 case IP_MULTICAST_LOOP: 1526 /* 1527 * Set the loopback flag for outgoing multicast packets. 1528 * Must be zero or one. 1529 */ 1530 error = ip_getoptval(sopt, &imo->imo_multicast_loop, 1); 1531 break; 1532 1533 case IP_ADD_MEMBERSHIP: 1534 /* 1535 * Add a multicast group membership. 1536 * Group must be a valid IP multicast address. 
1537 */ 1538 error = sockopt_get(sopt, &lmreq, sizeof(lmreq)); 1539 if (error) 1540 break; 1541 1542 mreq = &lmreq; 1543 1544 if (!IN_MULTICAST(mreq->imr_multiaddr.s_addr)) { 1545 error = EINVAL; 1546 break; 1547 } 1548 /* 1549 * If no interface address was provided, use the interface of 1550 * the route to the given multicast address. 1551 */ 1552 if (in_nullhost(mreq->imr_interface)) { 1553 struct rtentry *rt; 1554 union { 1555 struct sockaddr dst; 1556 struct sockaddr_in dst4; 1557 } u; 1558 struct route ro; 1559 1560 memset(&ro, 0, sizeof(ro)); 1561 1562 sockaddr_in_init(&u.dst4, &mreq->imr_multiaddr, 0); 1563 rtcache_setdst(&ro, &u.dst); 1564 ifp = (rt = rtcache_init(&ro)) != NULL ? rt->rt_ifp 1565 : NULL; 1566 rtcache_free(&ro); 1567 } else { 1568 ifp = ip_multicast_if(&mreq->imr_interface, NULL); 1569 } 1570 /* 1571 * See if we found an interface, and confirm that it 1572 * supports multicast. 1573 */ 1574 if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { 1575 error = EADDRNOTAVAIL; 1576 break; 1577 } 1578 /* 1579 * See if the membership already exists or if all the 1580 * membership slots are full. 1581 */ 1582 for (i = 0; i < imo->imo_num_memberships; ++i) { 1583 if (imo->imo_membership[i]->inm_ifp == ifp && 1584 in_hosteq(imo->imo_membership[i]->inm_addr, 1585 mreq->imr_multiaddr)) 1586 break; 1587 } 1588 if (i < imo->imo_num_memberships) { 1589 error = EADDRINUSE; 1590 break; 1591 } 1592 if (i == IP_MAX_MEMBERSHIPS) { 1593 error = ETOOMANYREFS; 1594 break; 1595 } 1596 /* 1597 * Everything looks good; add a new record to the multicast 1598 * address list for the given interface. 1599 */ 1600 if ((imo->imo_membership[i] = 1601 in_addmulti(&mreq->imr_multiaddr, ifp)) == NULL) { 1602 error = ENOBUFS; 1603 break; 1604 } 1605 ++imo->imo_num_memberships; 1606 break; 1607 1608 case IP_DROP_MEMBERSHIP: 1609 /* 1610 * Drop a multicast group membership. 1611 * Group must be a valid IP multicast address. 
1612 */ 1613 error = sockopt_get(sopt, &lmreq, sizeof(lmreq)); 1614 if (error) 1615 break; 1616 1617 mreq = &lmreq; 1618 1619 if (!IN_MULTICAST(mreq->imr_multiaddr.s_addr)) { 1620 error = EINVAL; 1621 break; 1622 } 1623 /* 1624 * If an interface address was specified, get a pointer 1625 * to its ifnet structure. 1626 */ 1627 if (in_nullhost(mreq->imr_interface)) 1628 ifp = NULL; 1629 else { 1630 ifp = ip_multicast_if(&mreq->imr_interface, NULL); 1631 if (ifp == NULL) { 1632 error = EADDRNOTAVAIL; 1633 break; 1634 } 1635 } 1636 /* 1637 * Find the membership in the membership array. 1638 */ 1639 for (i = 0; i < imo->imo_num_memberships; ++i) { 1640 if ((ifp == NULL || 1641 imo->imo_membership[i]->inm_ifp == ifp) && 1642 in_hosteq(imo->imo_membership[i]->inm_addr, 1643 mreq->imr_multiaddr)) 1644 break; 1645 } 1646 if (i == imo->imo_num_memberships) { 1647 error = EADDRNOTAVAIL; 1648 break; 1649 } 1650 /* 1651 * Give up the multicast address record to which the 1652 * membership points. 1653 */ 1654 in_delmulti(imo->imo_membership[i]); 1655 /* 1656 * Remove the gap in the membership array. 1657 */ 1658 for (++i; i < imo->imo_num_memberships; ++i) 1659 imo->imo_membership[i-1] = imo->imo_membership[i]; 1660 --imo->imo_num_memberships; 1661 break; 1662 1663 default: 1664 error = EOPNOTSUPP; 1665 break; 1666 } 1667 1668 /* 1669 * If all options have default values, no need to keep the mbuf. 1670 */ 1671 if (imo->imo_multicast_ifp == NULL && 1672 imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL && 1673 imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP && 1674 imo->imo_num_memberships == 0) { 1675 free(*imop, M_IPMOPTS); 1676 *imop = NULL; 1677 } 1678 1679 return (error); 1680 } 1681 1682 /* 1683 * Return the IP multicast options in response to user getsockopt(). 
1684 */ 1685 int 1686 ip_getmoptions(struct ip_moptions *imo, struct sockopt *sopt) 1687 { 1688 struct in_addr addr; 1689 struct in_ifaddr *ia; 1690 int error; 1691 uint8_t optval; 1692 1693 error = 0; 1694 1695 switch (sopt->sopt_name) { 1696 case IP_MULTICAST_IF: 1697 if (imo == NULL || imo->imo_multicast_ifp == NULL) 1698 addr = zeroin_addr; 1699 else if (imo->imo_multicast_addr.s_addr) { 1700 /* return the value user has set */ 1701 addr = imo->imo_multicast_addr; 1702 } else { 1703 IFP_TO_IA(imo->imo_multicast_ifp, ia); 1704 addr = ia ? ia->ia_addr.sin_addr : zeroin_addr; 1705 } 1706 error = sockopt_set(sopt, &addr, sizeof(addr)); 1707 break; 1708 1709 case IP_MULTICAST_TTL: 1710 optval = imo ? imo->imo_multicast_ttl 1711 : IP_DEFAULT_MULTICAST_TTL; 1712 1713 error = sockopt_set(sopt, &optval, sizeof(optval)); 1714 break; 1715 1716 case IP_MULTICAST_LOOP: 1717 optval = imo ? imo->imo_multicast_loop 1718 : IP_DEFAULT_MULTICAST_LOOP; 1719 1720 error = sockopt_set(sopt, &optval, sizeof(optval)); 1721 break; 1722 1723 default: 1724 error = EOPNOTSUPP; 1725 } 1726 1727 return (error); 1728 } 1729 1730 /* 1731 * Discard the IP multicast options. 1732 */ 1733 void 1734 ip_freemoptions(struct ip_moptions *imo) 1735 { 1736 int i; 1737 1738 if (imo != NULL) { 1739 for (i = 0; i < imo->imo_num_memberships; ++i) 1740 in_delmulti(imo->imo_membership[i]); 1741 free(imo, M_IPMOPTS); 1742 } 1743 } 1744 1745 /* 1746 * Routine called from ip_output() to loop back a copy of an IP multicast 1747 * packet to the input queue of a specified interface. Note that this 1748 * calls the output routine of the loopback "driver", but with an interface 1749 * pointer that might NOT be lo0ifp -- easier than replicating that code here. 
1750 */ 1751 static void 1752 ip_mloopback(struct ifnet *ifp, struct mbuf *m, const struct sockaddr_in *dst) 1753 { 1754 struct ip *ip; 1755 struct mbuf *copym; 1756 1757 copym = m_copypacket(m, M_DONTWAIT); 1758 if (copym != NULL 1759 && (copym->m_flags & M_EXT || copym->m_len < sizeof(struct ip))) 1760 copym = m_pullup(copym, sizeof(struct ip)); 1761 if (copym == NULL) 1762 return; 1763 /* 1764 * We don't bother to fragment if the IP length is greater 1765 * than the interface's MTU. Can this possibly matter? 1766 */ 1767 ip = mtod(copym, struct ip *); 1768 1769 if (copym->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) { 1770 in_delayed_cksum(copym); 1771 copym->m_pkthdr.csum_flags &= 1772 ~(M_CSUM_TCPv4|M_CSUM_UDPv4); 1773 } 1774 1775 ip->ip_sum = 0; 1776 ip->ip_sum = in_cksum(copym, ip->ip_hl << 2); 1777 (void)looutput(ifp, copym, sintocsa(dst), NULL); 1778 } 1779