1 /* $OpenBSD: ip_input.c,v 1.372 2022/06/29 09:01:48 mvs Exp $ */ 2 /* $NetBSD: ip_input.c,v 1.30 1996/03/16 23:53:58 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 
 *
 *	@(#)ip_input.c	8.2 (Berkeley) 1/4/94
 */

#include "pf.h"
#include "carp.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/pool.h>
#include <sys/task.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/netisr.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/if_ether.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/ip_icmp.h>
#include <net/if_types.h>

#ifdef INET6
#include <netinet6/ip6_var.h>
#endif

#if NPF > 0
#include <net/pfvar.h>
#endif

#ifdef MROUTING
#include <netinet/ip_mroute.h>
#endif

#ifdef IPSEC
#include <netinet/ip_ipsp.h>
#endif /* IPSEC */

#if NCARP > 0
#include <netinet/ip_carp.h>
#endif

/* values controllable via sysctl (bounds are in ipctl_vars[] below) */
int	ipforwarding = 0;
int	ipmforwarding = 0;
int	ipmultipath = 0;
int	ipsendredirects = 1;
int	ip_dosourceroute = 0;
int	ip_defttl = IPDEFTTL;
int	ip_mtudisc = 1;
int	ip_mtudisc_timeout = IPMTUDISCTIMEOUT;
int	ip_directedbcast = 0;

/* Protects `ipq' and `ip_frags'. */
struct mutex	ipq_mutex = MUTEX_INITIALIZER(IPL_SOFTNET);

/* IP reassembly queue */
LIST_HEAD(, ipq) ipq;

/* Keep track of memory used for reassembly */
int	ip_maxqueue = 300;
int	ip_frags = 0;

/* sysctl bounds for the integer ipctl knobs declared above */
const struct sysctl_bounded_args ipctl_vars[] = {
#ifdef MROUTING
	{ IPCTL_MRTPROTO, &ip_mrtproto, SYSCTL_INT_READONLY },
#endif
	{ IPCTL_FORWARDING, &ipforwarding, 0, 2 },
	{ IPCTL_SENDREDIRECTS, &ipsendredirects, 0, 1 },
	{ IPCTL_DEFTTL, &ip_defttl, 0, 255 },
	{ IPCTL_DIRECTEDBCAST, &ip_directedbcast, 0, 1 },
	{ IPCTL_IPPORT_FIRSTAUTO, &ipport_firstauto, 0, 65535 },
	{ IPCTL_IPPORT_LASTAUTO, &ipport_lastauto, 0, 65535 },
	{ IPCTL_IPPORT_HIFIRSTAUTO, &ipport_hifirstauto, 0, 65535 },
	{ IPCTL_IPPORT_HILASTAUTO, &ipport_hilastauto, 0, 65535 },
	{ IPCTL_IPPORT_MAXQUEUE, &ip_maxqueue, 0, 10000 },
	{ IPCTL_MFORWARDING, &ipmforwarding, 0, 1 },
	{ IPCTL_MULTIPATH, &ipmultipath, 0, 1 },
	{ IPCTL_ARPTIMEOUT, &arpt_keep, 0, INT_MAX },
	{ IPCTL_ARPDOWN, &arpt_down, 0, INT_MAX },
};

/* queue feeding ipintr() for local delivery */
struct niqueue ipintrq = NIQUEUE_INITIALIZER(IPQ_MAXLEN, NETISR_IP);

struct pool ipqent_pool;
struct pool ipq_pool;

struct cpumem *ipcounters;

int ip_sysctl_ipstat(void *, size_t *, void *);

static struct mbuf_queue	ipsend_mq;
static struct mbuf_queue	ipsendraw_mq;

extern struct niqueue		arpinq;

int	ip_ours(struct mbuf **, int *, int, int);
int	ip_local(struct mbuf **, int *, int, int);
int	ip_dooptions(struct mbuf *, struct ifnet *);
int	in_ouraddr(struct mbuf *, struct ifnet *, struct rtentry **);

static void ip_send_dispatch(void *);
static void ip_sendraw_dispatch(void *);
static struct task ipsend_task = TASK_INITIALIZER(ip_send_dispatch, &ipsend_mq);
static struct task ipsendraw_task =
	TASK_INITIALIZER(ip_sendraw_dispatch, &ipsendraw_mq);

/*
 * Used to save the IP options in case a protocol wants to
respond
 * to an incoming packet over the same route if the packet got here
 * using IP source routing.  This allows connection establishment and
 * maintenance when the remote end is on a network that is not known
 * to us.
 */
struct ip_srcrt {
	int		isr_nhops;		/* number of hops */
	struct in_addr	isr_dst;		/* final destination */
	char		isr_nop;		/* one NOP to align */
	char		isr_hdr[IPOPT_OFFSET + 1]; /* OPTVAL, OLEN & OFFSET */
	struct in_addr	isr_routes[MAX_IPOPTLEN/sizeof(struct in_addr)];
};

void save_rte(struct mbuf *, u_char *, struct in_addr);

/*
 * IP initialization: fill in IP protocol switch table.
 * All protocols not implemented in kernel go to raw IP protocol handler.
 * Also initializes the reassembly pools and queue, the restricted-port
 * bitmaps and the send queues.  Panics if the raw protosw is missing.
 */
void
ip_init(void)
{
	const struct protosw *pr;
	int i;
	const u_int16_t defbaddynamicports_tcp[] = DEFBADDYNAMICPORTS_TCP;
	const u_int16_t defbaddynamicports_udp[] = DEFBADDYNAMICPORTS_UDP;
	const u_int16_t defrootonlyports_tcp[] = DEFROOTONLYPORTS_TCP;
	const u_int16_t defrootonlyports_udp[] = DEFROOTONLYPORTS_UDP;

	ipcounters = counters_alloc(ips_ncounters);

	pool_init(&ipqent_pool, sizeof(struct ipqent), 0,
	    IPL_SOFTNET, 0, "ipqe",  NULL);
	pool_init(&ipq_pool, sizeof(struct ipq), 0,
	    IPL_SOFTNET, 0, "ipq", NULL);

	pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
	if (pr == NULL)
		panic("ip_init");
	/* default every protocol to the raw handler ... */
	for (i = 0; i < IPPROTO_MAX; i++)
		ip_protox[i] = pr - inetsw;
	/* ... then let each implemented protocol claim its slot */
	for (pr = inetdomain.dom_protosw;
	    pr < inetdomain.dom_protoswNPROTOSW; pr++)
		if (pr->pr_domain->dom_family == PF_INET &&
		    pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW &&
		    pr->pr_protocol < IPPROTO_MAX)
			ip_protox[pr->pr_protocol] = pr - inetsw;
	LIST_INIT(&ipq);

	/* Fill in list of ports not to allocate dynamically. */
	memset(&baddynamicports, 0, sizeof(baddynamicports));
	for (i = 0; defbaddynamicports_tcp[i] != 0; i++)
		DP_SET(baddynamicports.tcp, defbaddynamicports_tcp[i]);
	for (i = 0; defbaddynamicports_udp[i] != 0; i++)
		DP_SET(baddynamicports.udp, defbaddynamicports_udp[i]);

	/* Fill in list of ports only root can bind to. */
	memset(&rootonlyports, 0, sizeof(rootonlyports));
	for (i = 0; defrootonlyports_tcp[i] != 0; i++)
		DP_SET(rootonlyports.tcp, defrootonlyports_tcp[i]);
	for (i = 0; defrootonlyports_udp[i] != 0; i++)
		DP_SET(rootonlyports.udp, defrootonlyports_udp[i]);

	mq_init(&ipsend_mq, 64, IPL_SOFTNET);
	mq_init(&ipsendraw_mq, 64, IPL_SOFTNET);

	arpinit();
#ifdef IPSEC
	ipsec_init();
#endif
#ifdef MROUTING
	rt_timer_queue_init(&ip_mrouterq, MCAST_EXPIRE_FREQUENCY,
	    &mfc_expire_route);
#endif
}

/*
 * Enqueue packet for local delivery.  Queuing is used as a boundary
 * between the network layer (input/forward path) running with shared
 * NET_RLOCK_IN_SOFTNET() and the transport layer needing it exclusively.
 * On the queued path ownership of *mp passes to ipintrq (*mp is cleared).
 */
int
ip_ours(struct mbuf **mp, int *offp, int nxt, int af)
{
	/* We are already in a IPv4/IPv6 local deliver loop. */
	if (af != AF_UNSPEC)
		return ip_local(mp, offp, nxt, af);

	niq_enqueue(&ipintrq, *mp);
	*mp = NULL;
	return IPPROTO_DONE;
}

/*
 * Dequeue and process locally delivered packets.
 */
void
ipintr(void)
{
	struct mbuf *m;
	int off, nxt;

	while ((m = niq_dequeue(&ipintrq)) != NULL) {
#ifdef DIAGNOSTIC
		if ((m->m_flags & M_PKTHDR) == 0)
			panic("ipintr no HDR");
#endif
		off = 0;
		nxt = ip_local(&m, &off, IPPROTO_IPV4, AF_UNSPEC);
		KASSERT(nxt == IPPROTO_DONE);
	}
}

/*
 * IPv4 input routine.
 *
 * Checksum and byte swap header.  Process options.  Forward or deliver.
 */
void
ipv4_input(struct ifnet *ifp, struct mbuf *m)
{
	int off, nxt;

	off = 0;
	nxt = ip_input_if(&m, &off, IPPROTO_IPV4, AF_UNSPEC, ifp);
	KASSERT(nxt == IPPROTO_DONE);
}

/*
 * Sanity check an incoming IPv4 header: version, header length,
 * checksum (hardware-verified or software), loopback-net addresses
 * and total length; trims excess data.  Returns the (possibly
 * re-pulled-up) mbuf, or NULL after freeing it on error.
 */
struct mbuf *
ipv4_check(struct ifnet *ifp, struct mbuf *m)
{
	struct ip *ip;
	int hlen, len;

	if (m->m_len < sizeof(*ip)) {
		m = m_pullup(m, sizeof(*ip));
		if (m == NULL) {
			ipstat_inc(ips_toosmall);
			return (NULL);
		}
	}

	ip = mtod(m, struct ip *);
	if (ip->ip_v != IPVERSION) {
		ipstat_inc(ips_badvers);
		goto bad;
	}

	hlen = ip->ip_hl << 2;
	if (hlen < sizeof(*ip)) {	/* minimum header length */
		ipstat_inc(ips_badhlen);
		goto bad;
	}
	if (hlen > m->m_len) {
		m = m_pullup(m, hlen);
		if (m == NULL) {
			ipstat_inc(ips_badhlen);
			return (NULL);
		}
		ip = mtod(m, struct ip *);
	}

	/* 127/8 must not appear on wire - RFC1122 */
	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
			ipstat_inc(ips_badaddr);
			goto bad;
		}
	}

	/* trust a hardware-verified checksum; otherwise verify in software */
	if (!ISSET(m->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_OK)) {
		if (ISSET(m->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_BAD)) {
			ipstat_inc(ips_badsum);
			goto bad;
		}

		ipstat_inc(ips_inswcsum);
		if (in_cksum(m, hlen) != 0) {
			ipstat_inc(ips_badsum);
			goto bad;
		}

		SET(m->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_OK);
	}

	/* Retrieve the packet length. */
	len = ntohs(ip->ip_len);

	/*
	 * Convert fields to host representation.
	 */
	if (len < hlen) {
		ipstat_inc(ips_badlen);
		goto bad;
	}

	/*
	 * Check that the amount of data in the buffers
	 * is at least as much as the IP header would have us expect.
	 * Trim mbufs if longer than we expect.
	 * Drop packet if shorter than we expect.
	 */
	if (m->m_pkthdr.len < len) {
		ipstat_inc(ips_tooshort);
		goto bad;
	}
	if (m->m_pkthdr.len > len) {
		if (m->m_len == m->m_pkthdr.len) {
			m->m_len = len;
			m->m_pkthdr.len = len;
		} else
			m_adj(m, len - m->m_pkthdr.len);
	}

	return (m);
 bad:
	m_freem(m);
	return (NULL);
}

int
ip_input_if(struct mbuf **mp, int *offp, int nxt, int af, struct ifnet *ifp)
{
	struct mbuf *m;
	struct rtentry *rt = NULL;
	struct ip *ip;
	int hlen;
	in_addr_t pfrdr = 0;

	KASSERT(*offp == 0);

	ipstat_inc(ips_total);
	m = *mp = ipv4_check(ifp, *mp);
	if (m == NULL)
		goto bad;

	ip = mtod(m, struct ip *);

#if NCARP > 0
	if (carp_lsdrop(ifp, m, AF_INET, &ip->ip_src.s_addr,
	    &ip->ip_dst.s_addr, (ip->ip_p == IPPROTO_ICMP ? 0 : 1)))
		goto bad;
#endif

#if NPF > 0
	/*
	 * Packet filter
	 */
	/* remember the original destination so we can tell if pf rerouted */
	pfrdr = ip->ip_dst.s_addr;
	if (pf_test(AF_INET, PF_IN, ifp, mp) != PF_PASS)
		goto bad;
	m = *mp;
	if (m == NULL)
		goto bad;

	ip = mtod(m, struct ip *);
	pfrdr = (pfrdr != ip->ip_dst.s_addr);
#endif

	hlen = ip->ip_hl << 2;

	/*
	 * Process options and, if not destined for us,
	 * ship it on.  ip_dooptions returns 1 when an
	 * error was detected (causing an icmp message
	 * to be sent and the original packet to be freed).
	 */
	if (hlen > sizeof (struct ip) && ip_dooptions(m, ifp)) {
		m = *mp = NULL;
		goto bad;
	}

	if (ip->ip_dst.s_addr == INADDR_BROADCAST ||
	    ip->ip_dst.s_addr == INADDR_ANY) {
		nxt = ip_ours(mp, offp, nxt, af);
		goto out;
	}

	switch(in_ouraddr(m, ifp, &rt)) {
	case 2:
		goto bad;
	case 1:
		nxt = ip_ours(mp, offp, nxt, af);
		goto out;
	}

	if (IN_MULTICAST(ip->ip_dst.s_addr)) {
		/*
		 * Make sure M_MCAST is set.
		 * It should theoretically
		 * already be there, but let's play safe because upper
		 * layers check for this flag.
		 */
		m->m_flags |= M_MCAST;

#ifdef MROUTING
		if (ipmforwarding && ip_mrouter[ifp->if_rdomain]) {
			int error;

			if (m->m_flags & M_EXT) {
				if ((m = *mp = m_pullup(m, hlen)) == NULL) {
					ipstat_inc(ips_toosmall);
					goto bad;
				}
				ip = mtod(m, struct ip *);
			}
			/*
			 * If we are acting as a multicast router, all
			 * incoming multicast packets are passed to the
			 * kernel-level multicast forwarding function.
			 * The packet is returned (relatively) intact; if
			 * ip_mforward() returns a non-zero value, the packet
			 * must be discarded, else it may be accepted below.
			 *
			 * (The IP ident field is put in the same byte order
			 * as expected when ip_mforward() is called from
			 * ip_output().)
			 */
			/* the multicast forwarder runs under the big lock */
			KERNEL_LOCK();
			error = ip_mforward(m, ifp);
			KERNEL_UNLOCK();
			if (error) {
				ipstat_inc(ips_cantforward);
				goto bad;
			}

			/*
			 * The process-level routing daemon needs to receive
			 * all multicast IGMP packets, whether or not this
			 * host belongs to their destination groups.
			 */
			if (ip->ip_p == IPPROTO_IGMP) {
				nxt = ip_ours(mp, offp, nxt, af);
				goto out;
			}
			ipstat_inc(ips_forward);
		}
#endif
		/*
		 * See if we belong to the destination multicast group on the
		 * arrival interface.
		 */
		if (!in_hasmulti(&ip->ip_dst, ifp)) {
			ipstat_inc(ips_notmember);
			if (!IN_LOCAL_GROUP(ip->ip_dst.s_addr))
				ipstat_inc(ips_cantforward);
			goto bad;
		}
		nxt = ip_ours(mp, offp, nxt, af);
		goto out;
	}

#if NCARP > 0
	if (ip->ip_p == IPPROTO_ICMP &&
	    carp_lsdrop(ifp, m, AF_INET, &ip->ip_src.s_addr,
	    &ip->ip_dst.s_addr, 1))
		goto bad;
#endif
	/*
	 * Not for us; forward if possible and desirable.
	 */
	if (ipforwarding == 0) {
		ipstat_inc(ips_cantforward);
		goto bad;
	}
#ifdef IPSEC
	if (ipsec_in_use) {
		int rv;

		rv = ipsec_forward_check(m, hlen, AF_INET);
		if (rv != 0) {
			ipstat_inc(ips_cantforward);
			goto bad;
		}
		/*
		 * Fall through, forward packet. Outbound IPsec policy
		 * checking will occur in ip_output().
		 */
	}
#endif /* IPSEC */

	/* ip_forward() takes over the mbuf and the route reference */
	ip_forward(m, ifp, rt, pfrdr);
	*mp = NULL;
	return IPPROTO_DONE;
 bad:
	nxt = IPPROTO_DONE;
	m_freemp(mp);
 out:
	rtfree(rt);
	return nxt;
}

/*
 * IPv4 local-delivery routine.
 *
 * If fragmented try to reassemble.  Pass to next level.
 */
int
ip_local(struct mbuf **mp, int *offp, int nxt, int af)
{
	struct mbuf *m = *mp;
	struct ip *ip = mtod(m, struct ip *);
	struct ipq *fp;
	struct ipqent *ipqe;
	int mff, hlen;

	NET_ASSERT_WLOCKED();

	hlen = ip->ip_hl << 2;

	/*
	 * If offset or IP_MF are set, must reassemble.
	 * Otherwise, nothing need be done.
	 * (We could look in the reassembly queue to see
	 * if the packet was previously fragmented,
	 * but it's not worth the time; just let them time out.)
	 */
	if (ip->ip_off &~ htons(IP_DF | IP_RF)) {
		if (m->m_flags & M_EXT) {		/* XXX */
			if ((m = *mp = m_pullup(m, hlen)) == NULL) {
				ipstat_inc(ips_toosmall);
				return IPPROTO_DONE;
			}
			ip = mtod(m, struct ip *);
		}

		mtx_enter(&ipq_mutex);

		/*
		 * Look for queue of fragments
		 * of this datagram.
		 */
		LIST_FOREACH(fp, &ipq, ipq_q) {
			if (ip->ip_id == fp->ipq_id &&
			    ip->ip_src.s_addr == fp->ipq_src.s_addr &&
			    ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
			    ip->ip_p == fp->ipq_p)
				break;
		}

		/*
		 * Adjust ip_len to not reflect header,
		 * set ipqe_mff if more fragments are expected,
		 * convert offset of this to bytes.
		 */
		ip->ip_len = htons(ntohs(ip->ip_len) - hlen);
		mff = (ip->ip_off & htons(IP_MF)) != 0;
		if (mff) {
			/*
			 * Make sure that fragments have a data length
			 * that's a non-zero multiple of 8 bytes.
			 */
			if (ntohs(ip->ip_len) == 0 ||
			    (ntohs(ip->ip_len) & 0x7) != 0) {
				ipstat_inc(ips_badfrags);
				goto bad;
			}
		}
		ip->ip_off = htons(ntohs(ip->ip_off) << 3);

		/*
		 * If datagram marked as having more fragments
		 * or if this is not the first fragment,
		 * attempt reassembly; if it succeeds, proceed.
		 */
		if (mff || ip->ip_off) {
			ipstat_inc(ips_fragments);
			if (ip_frags + 1 > ip_maxqueue) {
				ip_flush();
				ipstat_inc(ips_rcvmemdrop);
				goto bad;
			}

			ipqe = pool_get(&ipqent_pool, PR_NOWAIT);
			if (ipqe == NULL) {
				ipstat_inc(ips_rcvmemdrop);
				goto bad;
			}
			ip_frags++;
			ipqe->ipqe_mff = mff;
			ipqe->ipqe_m = m;
			ipqe->ipqe_ip = ip;
			/* ip_reass() consumes ipqe; NULL means "not complete yet" */
			m = *mp = ip_reass(ipqe, fp);
			if (m == NULL)
				goto bad;
			ipstat_inc(ips_reassembled);
			ip = mtod(m, struct ip *);
			hlen = ip->ip_hl << 2;
			ip->ip_len = htons(ntohs(ip->ip_len) + hlen);
		} else
			if (fp)
				ip_freef(fp);

		mtx_leave(&ipq_mutex);
	}

	*offp = hlen;
	nxt = ip->ip_p;
	/* Check whether we are already in a IPv4/IPv6 local deliver loop. */
	if (af == AF_UNSPEC)
		nxt = ip_deliver(mp, offp, nxt, AF_INET);
	return nxt;
 bad:
	mtx_leave(&ipq_mutex);
	m_freemp(mp);
	return IPPROTO_DONE;
}

/* bump the v4 or v6 counter depending on `af' (INET6 kernels only) */
#ifndef INET6
#define IPSTAT_INC(name)	ipstat_inc(ips_##name)
#else
#define IPSTAT_INC(name)	(af == AF_INET ?			\
	ipstat_inc(ips_##name) : ip6stat_inc(ip6s_##name))
#endif

/*
 * Hand the packet to the transport protocol input routines, walking
 * the protocol header chain until a handler returns IPPROTO_DONE.
 */
int
ip_deliver(struct mbuf **mp, int *offp, int nxt, int af)
{
	const struct protosw *psw;
	int naf = af;
#ifdef INET6
	int nest = 0;
#endif /* INET6 */

	/* pf might have modified stuff, might have to chksum */
	switch (af) {
	case AF_INET:
		in_proto_cksum_out(*mp, NULL);
		break;
#ifdef INET6
	case AF_INET6:
		in6_proto_cksum_out(*mp, NULL);
		break;
#endif /* INET6 */
	}

	/*
	 * Tell launch routine the next header
	 */
	IPSTAT_INC(delivered);

	while (nxt != IPPROTO_DONE) {
#ifdef INET6
		if (af == AF_INET6 &&
		    ip6_hdrnestlimit && (++nest > ip6_hdrnestlimit)) {
			ip6stat_inc(ip6s_toomanyhdr);
			goto bad;
		}
#endif /* INET6 */

		/*
		 * protection against faulty packet - there should be
		 * more sanity checks in header chain processing.
		 */
		if ((*mp)->m_pkthdr.len < *offp) {
			IPSTAT_INC(tooshort);
			goto bad;
		}

#ifdef IPSEC
		if (ipsec_in_use) {
			if (ipsec_local_check(*mp, *offp, nxt, af) != 0) {
				IPSTAT_INC(cantforward);
				goto bad;
			}
		}
		/* Otherwise, just fall through and deliver the packet */
#endif /* IPSEC */

		/* tunnelled packets switch the address family for the next pass */
		switch (nxt) {
		case IPPROTO_IPV4:
			naf = AF_INET;
			ipstat_inc(ips_delivered);
			break;
#ifdef INET6
		case IPPROTO_IPV6:
			naf = AF_INET6;
			ip6stat_inc(ip6s_delivered);
			break;
#endif /* INET6 */
		}
		switch (af) {
		case AF_INET:
			psw = &inetsw[ip_protox[nxt]];
			break;
#ifdef INET6
		case AF_INET6:
			psw = &inet6sw[ip6_protox[nxt]];
			break;
#endif /* INET6 */
		}
		nxt = (*psw->pr_input)(mp, offp, nxt, af);
		af = naf;
	}
	return nxt;
 bad:
	m_freemp(mp);
	return IPPROTO_DONE;
}
#undef IPSTAT_INC

int
in_ouraddr(struct mbuf *m, struct ifnet *ifp, struct rtentry **prt)
{
	struct rtentry *rt;
	struct ip *ip;
	struct sockaddr_in sin;
	/*
	 * match: 0 = not ours, 1 = deliver locally,
	 * 2 = drop (arrived on wrong interface while not forwarding);
	 * the caller switches on this value.
	 */
	int match = 0;

#if NPF > 0
	switch (pf_ouraddr(m)) {
	case 0:
		return (0);
	case 1:
		return (1);
	default:
		/* pf does not know it */
		break;
	}
#endif

	ip = mtod(m, struct ip *);

	memset(&sin, 0, sizeof(sin));
	sin.sin_len = sizeof(sin);
	sin.sin_family = AF_INET;
	sin.sin_addr = ip->ip_dst;
	rt = rtalloc_mpath(sintosa(&sin), &ip->ip_src.s_addr,
	    m->m_pkthdr.ph_rtableid);
	if (rtisvalid(rt)) {
		if (ISSET(rt->rt_flags, RTF_LOCAL))
			match = 1;

		/*
		 * If directedbcast is enabled we only consider it local
		 * if it is received on the interface with that address.
		 */
		if (ISSET(rt->rt_flags, RTF_BROADCAST) &&
		    (!ip_directedbcast || rt->rt_ifidx == ifp->if_index)) {
			match = 1;

			/* Make sure M_BCAST is set */
			m->m_flags |= M_BCAST;
		}
	}
	/* hand the route reference to the caller, who must rtfree() it */
	*prt = rt;

	if (!match) {
		struct ifaddr *ifa;

		/*
		 * No local address or broadcast address found, so check for
		 * ancient classful broadcast addresses.
		 * It must have been broadcast on the link layer, and for an
		 * address on the interface it was received on.
		 */
		if (!ISSET(m->m_flags, M_BCAST) ||
		    !IN_CLASSFULBROADCAST(ip->ip_dst.s_addr, ip->ip_dst.s_addr))
			return (0);

		if (ifp->if_rdomain != rtable_l2(m->m_pkthdr.ph_rtableid))
			return (0);
		/*
		 * The check in the loop assumes you only rx a packet on an UP
		 * interface, and that M_BCAST will only be set on a BROADCAST
		 * interface.
		 */
		NET_ASSERT_LOCKED();
		TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) {
			if (ifa->ifa_addr->sa_family != AF_INET)
				continue;

			if (IN_CLASSFULBROADCAST(ip->ip_dst.s_addr,
			    ifatoia(ifa)->ia_addr.sin_addr.s_addr)) {
				match = 1;
				break;
			}
		}
	} else if (ipforwarding == 0 && rt->rt_ifidx != ifp->if_index &&
	    !((ifp->if_flags & IFF_LOOPBACK) || (ifp->if_type == IFT_ENC) ||
	    (m->m_pkthdr.pf.flags & PF_TAG_TRANSLATE_LOCALHOST))) {
		/* received on wrong interface. */
#if NCARP > 0
		struct ifnet *out_if;

		/*
		 * Virtual IPs on carp interfaces need to be checked also
		 * against the parent interface and other carp interfaces
		 * sharing the same parent.
		 */
		out_if = if_get(rt->rt_ifidx);
		if (!(out_if && carp_strict_addr_chk(out_if, ifp))) {
			ipstat_inc(ips_wrongif);
			match = 2;
		}
		if_put(out_if);
#else
		ipstat_inc(ips_wrongif);
		match = 2;
#endif
	}

	return (match);
}

/*
 * Take incoming datagram fragment and try to
 * reassemble it into whole datagram.  If a chain for
 * reassembly of this datagram already exists, then it
 * is given as fp; otherwise have to make a chain.
 */
struct mbuf *
ip_reass(struct ipqent *ipqe, struct ipq *fp)
{
	struct mbuf *m = ipqe->ipqe_m;
	struct ipqent *nq, *p, *q;
	struct ip *ip;
	struct mbuf *t;
	int hlen = ipqe->ipqe_ip->ip_hl << 2;
	int i, next;
	u_int8_t ecn, ecn0;

	MUTEX_ASSERT_LOCKED(&ipq_mutex);

	/*
	 * Presence of header sizes in mbufs
	 * would confuse code below.
	 */
	m->m_data += hlen;
	m->m_len -= hlen;

	/*
	 * If first fragment to arrive, create a reassembly queue.
	 */
	if (fp == NULL) {
		fp = pool_get(&ipq_pool, PR_NOWAIT);
		if (fp == NULL)
			goto dropfrag;
		LIST_INSERT_HEAD(&ipq, fp, ipq_q);
		fp->ipq_ttl = IPFRAGTTL;
		fp->ipq_p = ipqe->ipqe_ip->ip_p;
		fp->ipq_id = ipqe->ipqe_ip->ip_id;
		LIST_INIT(&fp->ipq_fragq);
		fp->ipq_src = ipqe->ipqe_ip->ip_src;
		fp->ipq_dst = ipqe->ipqe_ip->ip_dst;
		p = NULL;
		goto insert;
	}

	/*
	 * Handle ECN by comparing this segment with the first one;
	 * if CE is set, do not lose CE.
	 * drop if CE and not-ECT are mixed for the same packet.
	 */
	ecn = ipqe->ipqe_ip->ip_tos & IPTOS_ECN_MASK;
	ecn0 = LIST_FIRST(&fp->ipq_fragq)->ipqe_ip->ip_tos & IPTOS_ECN_MASK;
	if (ecn == IPTOS_ECN_CE) {
		if (ecn0 == IPTOS_ECN_NOTECT)
			goto dropfrag;
		if (ecn0 != IPTOS_ECN_CE)
			LIST_FIRST(&fp->ipq_fragq)->ipqe_ip->ip_tos |=
			    IPTOS_ECN_CE;
	}
	if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT)
		goto dropfrag;

	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = LIST_FIRST(&fp->ipq_fragq); q != NULL;
	    p = q, q = LIST_NEXT(q, ipqe_q))
		if (ntohs(q->ipqe_ip->ip_off) > ntohs(ipqe->ipqe_ip->ip_off))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		i = ntohs(p->ipqe_ip->ip_off) + ntohs(p->ipqe_ip->ip_len) -
		    ntohs(ipqe->ipqe_ip->ip_off);
		if (i > 0) {
			if (i >= ntohs(ipqe->ipqe_ip->ip_len))
				goto dropfrag;
			m_adj(ipqe->ipqe_m, i);
			ipqe->ipqe_ip->ip_off =
			    htons(ntohs(ipqe->ipqe_ip->ip_off) + i);
			ipqe->ipqe_ip->ip_len =
			    htons(ntohs(ipqe->ipqe_ip->ip_len) - i);
		}
	}

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL &&
	    ntohs(ipqe->ipqe_ip->ip_off) + ntohs(ipqe->ipqe_ip->ip_len) >
	    ntohs(q->ipqe_ip->ip_off); q = nq) {
		i = (ntohs(ipqe->ipqe_ip->ip_off) +
		    ntohs(ipqe->ipqe_ip->ip_len)) - ntohs(q->ipqe_ip->ip_off);
		if (i < ntohs(q->ipqe_ip->ip_len)) {
			q->ipqe_ip->ip_len =
			    htons(ntohs(q->ipqe_ip->ip_len) - i);
			q->ipqe_ip->ip_off =
			    htons(ntohs(q->ipqe_ip->ip_off) + i);
			m_adj(q->ipqe_m, i);
			break;
		}
		nq = LIST_NEXT(q, ipqe_q);
		m_freem(q->ipqe_m);
		LIST_REMOVE(q, ipqe_q);
		pool_put(&ipqent_pool, q);
		ip_frags--;
	}

 insert:
	/*
	 * Stick new segment in its place;
	 * check for complete reassembly.
	 */
	if (p == NULL) {
		LIST_INSERT_HEAD(&fp->ipq_fragq, ipqe, ipqe_q);
	} else {
		LIST_INSERT_AFTER(p, ipqe, ipqe_q);
	}
	next = 0;
	/* scan for holes; any gap means reassembly is not complete yet */
	for (p = NULL, q = LIST_FIRST(&fp->ipq_fragq); q != NULL;
	    p = q, q = LIST_NEXT(q, ipqe_q)) {
		if (ntohs(q->ipqe_ip->ip_off) != next)
			return (0);
		next += ntohs(q->ipqe_ip->ip_len);
	}
	if (p->ipqe_mff)
		return (0);

	/*
	 * Reassembly is complete.  Check for a bogus message size and
	 * concatenate fragments.
	 */
	q = LIST_FIRST(&fp->ipq_fragq);
	ip = q->ipqe_ip;
	if ((next + (ip->ip_hl << 2)) > IP_MAXPACKET) {
		ipstat_inc(ips_toolong);
		ip_freef(fp);
		return (0);
	}
	m = q->ipqe_m;
	t = m->m_next;
	m->m_next = 0;
	m_cat(m, t);
	nq = LIST_NEXT(q, ipqe_q);
	pool_put(&ipqent_pool, q);
	ip_frags--;
	for (q = nq; q != NULL; q = nq) {
		t = q->ipqe_m;
		nq = LIST_NEXT(q, ipqe_q);
		pool_put(&ipqent_pool, q);
		ip_frags--;
		m_removehdr(t);
		m_cat(m, t);
	}

	/*
	 * Create header for new ip packet by
	 * modifying header of first packet;
	 * dequeue and discard fragment reassembly header.
	 * Make header visible.
	 */
	ip->ip_len = htons(next);
	ip->ip_src = fp->ipq_src;
	ip->ip_dst = fp->ipq_dst;
	LIST_REMOVE(fp, ipq_q);
	pool_put(&ipq_pool, fp);
	m->m_len += (ip->ip_hl << 2);
	m->m_data -= (ip->ip_hl << 2);
	m_calchdrlen(m);
	return (m);

 dropfrag:
	ipstat_inc(ips_fragdropped);
	m_freem(m);
	pool_put(&ipqent_pool, ipqe);
	ip_frags--;
	return (NULL);
}

/*
 * Free a fragment reassembly header and all
 * associated datagrams.
 */
void
ip_freef(struct ipq *fp)
{
	struct ipqent *q;

	MUTEX_ASSERT_LOCKED(&ipq_mutex);

	while ((q = LIST_FIRST(&fp->ipq_fragq)) != NULL) {
		LIST_REMOVE(q, ipqe_q);
		m_freem(q->ipqe_m);
		pool_put(&ipqent_pool, q);
		ip_frags--;
	}
	LIST_REMOVE(fp, ipq_q);
	pool_put(&ipq_pool, fp);
}

/*
 * IP timer processing;
 * if a timer expires on a reassembly queue, discard it.
 */
void
ip_slowtimo(void)
{
	struct ipq *fp, *nfp;

	mtx_enter(&ipq_mutex);
	LIST_FOREACH_SAFE(fp, &ipq, ipq_q, nfp) {
		if (--fp->ipq_ttl == 0) {
			ipstat_inc(ips_fragtimeout);
			ip_freef(fp);
		}
	}
	mtx_leave(&ipq_mutex);
}

/*
 * Flush a bunch of datagram fragments, till we are down to 75%.
 * NOTE(review): frees from the list head, i.e. most recently
 * created queues first (ip_reass() inserts at the head).
 */
void
ip_flush(void)
{
	int max = 50;

	MUTEX_ASSERT_LOCKED(&ipq_mutex);

	while (!LIST_EMPTY(&ipq) && ip_frags > ip_maxqueue * 3 / 4 && --max) {
		ipstat_inc(ips_fragdropped);
		ip_freef(LIST_FIRST(&ipq));
	}
}

/*
 * Do option processing on a datagram,
 * possibly discarding it if bad options are encountered,
 * or forwarding it if source-routed.
 * Returns 1 if packet has been forwarded/freed,
 * 0 if the packet should be processed further.
 */
int
ip_dooptions(struct mbuf *m, struct ifnet *ifp)
{
	struct ip *ip = mtod(m, struct ip *);
	unsigned int rtableid = m->m_pkthdr.ph_rtableid;
	struct rtentry *rt;
	struct sockaddr_in ipaddr;
	u_char *cp;
	struct ip_timestamp ipt;
	struct in_ifaddr *ia;
	int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0;
	struct in_addr sin, dst;
	u_int32_t ntime;

	dst = ip->ip_dst;
	cp = (u_char *)(ip + 1);
	cnt = (ip->ip_hl << 2) - sizeof (struct ip);

	/* option parsing runs under the kernel lock */
	KERNEL_LOCK();
	for (; cnt > 0; cnt -= optlen, cp += optlen) {
		opt = cp[IPOPT_OPTVAL];
		if (opt == IPOPT_EOL)
			break;
		if (opt == IPOPT_NOP)
			optlen = 1;
		else {
			/* validate the option length before trusting it */
			if (cnt < IPOPT_OLEN + sizeof(*cp)) {
				code = &cp[IPOPT_OLEN] - (u_char *)ip;
				goto bad;
			}
			optlen = cp[IPOPT_OLEN];
			if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) {
				code = &cp[IPOPT_OLEN] - (u_char *)ip;
				goto bad;
			}
		}

		switch (opt) {

		default:
			break;

		/*
		 * Source routing with record.
		 * Find interface with current destination address.
		 * If none on this machine then drop if strictly routed,
		 * or do nothing if loosely routed.
		 * Record interface address and bring up next address
		 * component.  If strictly routed make sure next
		 * address is on directly accessible net.
		 */
		case IPOPT_LSRR:
		case IPOPT_SSRR:
			if (!ip_dosourceroute) {
				type = ICMP_UNREACH;
				code = ICMP_UNREACH_SRCFAIL;
				goto bad;
			}
			if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
				code = &cp[IPOPT_OFFSET] - (u_char *)ip;
				goto bad;
			}
			memset(&ipaddr, 0, sizeof(ipaddr));
			ipaddr.sin_family = AF_INET;
			ipaddr.sin_len = sizeof(ipaddr);
			ipaddr.sin_addr = ip->ip_dst;
			ia = ifatoia(ifa_ifwithaddr(sintosa(&ipaddr),
			    m->m_pkthdr.ph_rtableid));
			if (ia == NULL) {
				if (opt == IPOPT_SSRR) {
					type = ICMP_UNREACH;
					code = ICMP_UNREACH_SRCFAIL;
					goto bad;
				}
				/*
				 * Loose routing, and not at next destination
				 * yet; nothing to do except forward.
				 */
				break;
			}
			off--;			/* 0 origin */
			if ((off + sizeof(struct in_addr)) > optlen) {
				/*
				 * End of source route.  Should be for us.
				 */
				save_rte(m, cp, ip->ip_src);
				break;
			}

			/*
			 * locate outgoing interface
			 */
			memset(&ipaddr, 0, sizeof(ipaddr));
			ipaddr.sin_family = AF_INET;
			ipaddr.sin_len = sizeof(ipaddr);
			memcpy(&ipaddr.sin_addr, cp + off,
			    sizeof(ipaddr.sin_addr));
			/* keep packet in the virtual instance */
			rt = rtalloc(sintosa(&ipaddr), RT_RESOLVE, rtableid);
			if (!rtisvalid(rt) || ((opt == IPOPT_SSRR) &&
			    ISSET(rt->rt_flags, RTF_GATEWAY))) {
				type = ICMP_UNREACH;
				code = ICMP_UNREACH_SRCFAIL;
				rtfree(rt);
				goto bad;
			}
			ia = ifatoia(rt->rt_ifa);
			/* record our address and advance to the next hop */
			memcpy(cp + off, &ia->ia_addr.sin_addr,
			    sizeof(struct in_addr));
			rtfree(rt);
			cp[IPOPT_OFFSET] += sizeof(struct in_addr);
			ip->ip_dst = ipaddr.sin_addr;
			/*
			 * Let ip_intr's mcast routing check handle mcast pkts
			 */
			forward = !IN_MULTICAST(ip->ip_dst.s_addr);
			break;

		case IPOPT_RR:
			if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
				code = &cp[IPOPT_OLEN] - (u_char *)ip;
				goto bad;
			}
			if
((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { 1228 code = &cp[IPOPT_OFFSET] - (u_char *)ip; 1229 goto bad; 1230 } 1231 1232 /* 1233 * If no space remains, ignore. 1234 */ 1235 off--; /* 0 origin */ 1236 if ((off + sizeof(struct in_addr)) > optlen) 1237 break; 1238 memset(&ipaddr, 0, sizeof(ipaddr)); 1239 ipaddr.sin_family = AF_INET; 1240 ipaddr.sin_len = sizeof(ipaddr); 1241 ipaddr.sin_addr = ip->ip_dst; 1242 /* 1243 * locate outgoing interface; if we're the destination, 1244 * use the incoming interface (should be same). 1245 * Again keep the packet inside the virtual instance. 1246 */ 1247 rt = rtalloc(sintosa(&ipaddr), RT_RESOLVE, rtableid); 1248 if (!rtisvalid(rt)) { 1249 type = ICMP_UNREACH; 1250 code = ICMP_UNREACH_HOST; 1251 rtfree(rt); 1252 goto bad; 1253 } 1254 ia = ifatoia(rt->rt_ifa); 1255 memcpy(cp + off, &ia->ia_addr.sin_addr, 1256 sizeof(struct in_addr)); 1257 rtfree(rt); 1258 cp[IPOPT_OFFSET] += sizeof(struct in_addr); 1259 break; 1260 1261 case IPOPT_TS: 1262 code = cp - (u_char *)ip; 1263 if (optlen < sizeof(struct ip_timestamp)) 1264 goto bad; 1265 memcpy(&ipt, cp, sizeof(struct ip_timestamp)); 1266 if (ipt.ipt_ptr < 5 || ipt.ipt_len < 5) 1267 goto bad; 1268 if (ipt.ipt_ptr - 1 + sizeof(u_int32_t) > ipt.ipt_len) { 1269 if (++ipt.ipt_oflw == 0) 1270 goto bad; 1271 break; 1272 } 1273 memcpy(&sin, cp + ipt.ipt_ptr - 1, sizeof sin); 1274 switch (ipt.ipt_flg) { 1275 1276 case IPOPT_TS_TSONLY: 1277 break; 1278 1279 case IPOPT_TS_TSANDADDR: 1280 if (ipt.ipt_ptr - 1 + sizeof(u_int32_t) + 1281 sizeof(struct in_addr) > ipt.ipt_len) 1282 goto bad; 1283 memset(&ipaddr, 0, sizeof(ipaddr)); 1284 ipaddr.sin_family = AF_INET; 1285 ipaddr.sin_len = sizeof(ipaddr); 1286 ipaddr.sin_addr = dst; 1287 ia = ifatoia(ifaof_ifpforaddr(sintosa(&ipaddr), 1288 ifp)); 1289 if (ia == NULL) 1290 continue; 1291 memcpy(&sin, &ia->ia_addr.sin_addr, 1292 sizeof(struct in_addr)); 1293 ipt.ipt_ptr += sizeof(struct in_addr); 1294 break; 1295 1296 case IPOPT_TS_PRESPEC: 1297 if 
(ipt.ipt_ptr - 1 + sizeof(u_int32_t) + 1298 sizeof(struct in_addr) > ipt.ipt_len) 1299 goto bad; 1300 memset(&ipaddr, 0, sizeof(ipaddr)); 1301 ipaddr.sin_family = AF_INET; 1302 ipaddr.sin_len = sizeof(ipaddr); 1303 ipaddr.sin_addr = sin; 1304 if (ifa_ifwithaddr(sintosa(&ipaddr), 1305 m->m_pkthdr.ph_rtableid) == NULL) 1306 continue; 1307 ipt.ipt_ptr += sizeof(struct in_addr); 1308 break; 1309 1310 default: 1311 /* XXX can't take &ipt->ipt_flg */ 1312 code = (u_char *)&ipt.ipt_ptr - 1313 (u_char *)ip + 1; 1314 goto bad; 1315 } 1316 ntime = iptime(); 1317 memcpy(cp + ipt.ipt_ptr - 1, &ntime, sizeof(u_int32_t)); 1318 ipt.ipt_ptr += sizeof(u_int32_t); 1319 } 1320 } 1321 KERNEL_UNLOCK(); 1322 if (forward && ipforwarding > 0) { 1323 ip_forward(m, ifp, NULL, 1); 1324 return (1); 1325 } 1326 return (0); 1327 bad: 1328 KERNEL_UNLOCK(); 1329 icmp_error(m, type, code, 0, 0); 1330 ipstat_inc(ips_badoptions); 1331 return (1); 1332 } 1333 1334 /* 1335 * Save incoming source route for use in replies, 1336 * to be picked up later by ip_srcroute if the receiver is interested. 1337 */ 1338 void 1339 save_rte(struct mbuf *m, u_char *option, struct in_addr dst) 1340 { 1341 struct ip_srcrt *isr; 1342 struct m_tag *mtag; 1343 unsigned olen; 1344 1345 olen = option[IPOPT_OLEN]; 1346 if (olen > sizeof(isr->isr_hdr) + sizeof(isr->isr_routes)) 1347 return; 1348 1349 mtag = m_tag_get(PACKET_TAG_SRCROUTE, sizeof(*isr), M_NOWAIT); 1350 if (mtag == NULL) 1351 return; 1352 isr = (struct ip_srcrt *)(mtag + 1); 1353 1354 memcpy(isr->isr_hdr, option, olen); 1355 isr->isr_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr); 1356 isr->isr_dst = dst; 1357 m_tag_prepend(m, mtag); 1358 } 1359 1360 /* 1361 * Retrieve incoming source route for use in replies, 1362 * in the same form used by setsockopt. 1363 * The first hop is placed before the options, will be removed later. 
1364 */ 1365 struct mbuf * 1366 ip_srcroute(struct mbuf *m0) 1367 { 1368 struct in_addr *p, *q; 1369 struct mbuf *m; 1370 struct ip_srcrt *isr; 1371 struct m_tag *mtag; 1372 1373 if (!ip_dosourceroute) 1374 return (NULL); 1375 1376 mtag = m_tag_find(m0, PACKET_TAG_SRCROUTE, NULL); 1377 if (mtag == NULL) 1378 return (NULL); 1379 isr = (struct ip_srcrt *)(mtag + 1); 1380 1381 if (isr->isr_nhops == 0) 1382 return (NULL); 1383 m = m_get(M_DONTWAIT, MT_SOOPTS); 1384 if (m == NULL) 1385 return (NULL); 1386 1387 #define OPTSIZ (sizeof(isr->isr_nop) + sizeof(isr->isr_hdr)) 1388 1389 /* length is (nhops+1)*sizeof(addr) + sizeof(nop + header) */ 1390 m->m_len = (isr->isr_nhops + 1) * sizeof(struct in_addr) + OPTSIZ; 1391 1392 /* 1393 * First save first hop for return route 1394 */ 1395 p = &(isr->isr_routes[isr->isr_nhops - 1]); 1396 *(mtod(m, struct in_addr *)) = *p--; 1397 1398 /* 1399 * Copy option fields and padding (nop) to mbuf. 1400 */ 1401 isr->isr_nop = IPOPT_NOP; 1402 isr->isr_hdr[IPOPT_OFFSET] = IPOPT_MINOFF; 1403 memcpy(mtod(m, caddr_t) + sizeof(struct in_addr), &isr->isr_nop, 1404 OPTSIZ); 1405 q = (struct in_addr *)(mtod(m, caddr_t) + 1406 sizeof(struct in_addr) + OPTSIZ); 1407 #undef OPTSIZ 1408 /* 1409 * Record return path as an IP source route, 1410 * reversing the path (pointers are now aligned). 1411 */ 1412 while (p >= isr->isr_routes) { 1413 *q++ = *p--; 1414 } 1415 /* 1416 * Last hop goes to final destination. 1417 */ 1418 *q = isr->isr_dst; 1419 m_tag_delete(m0, (struct m_tag *)isr); 1420 return (m); 1421 } 1422 1423 /* 1424 * Strip out IP options, at higher level protocol in the kernel. 
 */
/*
 * Remove the IP options from the packet in place: shift the payload down
 * over the option bytes and fix up m_len, the packet-header length, ip_hl
 * and ip_len accordingly.
 */
void
ip_stripoptions(struct mbuf *m)
{
	int i;
	struct ip *ip = mtod(m, struct ip *);
	caddr_t opts;
	int olen;

	olen = (ip->ip_hl<<2) - sizeof (struct ip);
	opts = (caddr_t)(ip + 1);
	/* i is the number of payload bytes following the options */
	i = m->m_len - (sizeof (struct ip) + olen);
	/* regions overlap, hence memmove rather than memcpy */
	memmove(opts, opts + olen, i);
	m->m_len -= olen;
	if (m->m_flags & M_PKTHDR)
		m->m_pkthdr.len -= olen;
	ip->ip_hl = sizeof(struct ip) >> 2;
	/* ip_len is kept in network byte order */
	ip->ip_len = htons(ntohs(ip->ip_len) - olen);
}

/* errno values indexed by PRC_* protocol-control code (PRC_NCMDS entries) */
const u_char inetctlerrmap[PRC_NCMDS] = {
	0,		0,		0,		0,
	0,		EMSGSIZE,	EHOSTDOWN,	EHOSTUNREACH,
	EHOSTUNREACH,	EHOSTUNREACH,	ECONNREFUSED,	ECONNREFUSED,
	EMSGSIZE,	EHOSTUNREACH,	0,		0,
	0,		0,		0,		0,
	ENOPROTOOPT
};

/*
 * Forward a packet.  If some error occurs return the sender
 * an icmp packet.  Note we can't always generate a meaningful
 * icmp message because icmp doesn't have a large enough repertoire
 * of codes and types.
 *
 * If not forwarding, just drop the packet.  This could be confusing
 * if ipforwarding was zero but some routing protocol was advancing
 * us as a gateway to somewhere.  However, we must let the routing
 * protocol deal with that.
 *
 * The srcrt parameter indicates whether the packet is being forwarded
 * via a source route.
 */
void
ip_forward(struct mbuf *m, struct ifnet *ifp, struct rtentry *rt, int srcrt)
{
	struct mbuf mfake, *mcopy = NULL;
	struct ip *ip = mtod(m, struct ip *);
	struct sockaddr_in *sin;
	struct route ro;
	int error = 0, type = 0, code = 0, destmtu = 0, fake = 0, len;
	u_int32_t dest;

	dest = 0;
	/* never forward link-level broadcast/multicast or bad destinations */
	if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) {
		ipstat_inc(ips_cantforward);
		m_freem(m);
		goto freecopy;
	}
	if (ip->ip_ttl <= IPTTLDEC) {
		icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, dest, 0);
		goto freecopy;
	}

	memset(&ro, 0, sizeof(ro));
	sin = satosin(&ro.ro_dst);
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(*sin);
	sin->sin_addr = ip->ip_dst;

	/* look up a route ourselves if the caller did not supply a valid one */
	if (!rtisvalid(rt)) {
		rtfree(rt);
		rt = rtalloc_mpath(sintosa(sin), &ip->ip_src.s_addr,
		    m->m_pkthdr.ph_rtableid);
		if (rt == NULL) {
			ipstat_inc(ips_noroute);
			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest, 0);
			return;
		}
	}

	/*
	 * Save at most 68 bytes of the packet in case
	 * we need to generate an ICMP message to the src.
	 * The data is saved in the mbuf on the stack that
	 * acts as a temporary storage not intended to be
	 * passed down the IP stack or to the mfree.
	 */
	memset(&mfake.m_hdr, 0, sizeof(mfake.m_hdr));
	mfake.m_type = m->m_type;
	if (m_dup_pkthdr(&mfake, m, M_DONTWAIT) == 0) {
		mfake.m_data = mfake.m_pktdat;
		len = min(ntohs(ip->ip_len), 68);
		m_copydata(m, 0, len, mfake.m_pktdat);
		mfake.m_pkthdr.len = mfake.m_len = len;
#if NPF > 0
		pf_pkt_addr_changed(&mfake);
#endif	/* NPF > 0 */
		fake = 1;
	}

	ip->ip_ttl -= IPTTLDEC;

	/*
	 * If forwarding packet using same interface that it came in on,
	 * perhaps should send a redirect to sender to shortcut a hop.
	 * Only send redirect if source is sending directly to us,
	 * and if packet was not source routed (or has any options).
	 * Also, don't send redirect if forwarding using a default route
	 * or a route modified by a redirect.
	 * Don't send redirect if we advertise destination's arp address
	 * as ours (proxy arp).
	 */
	if ((rt->rt_ifidx == ifp->if_index) &&
	    (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 &&
	    satosin(rt_key(rt))->sin_addr.s_addr != 0 &&
	    ipsendredirects && !srcrt &&
	    !arpproxy(satosin(rt_key(rt))->sin_addr, m->m_pkthdr.ph_rtableid)) {
		if ((ip->ip_src.s_addr & ifatoia(rt->rt_ifa)->ia_netmask) ==
		    ifatoia(rt->rt_ifa)->ia_net) {
			if (rt->rt_flags & RTF_GATEWAY)
				dest = satosin(rt->rt_gateway)->sin_addr.s_addr;
			else
				dest = ip->ip_dst.s_addr;
			/* Router requirements says to only send host redirects */
			type = ICMP_REDIRECT;
			code = ICMP_REDIRECT_HOST;
		}
	}

	ro.ro_rt = rt;
	ro.ro_tableid = m->m_pkthdr.ph_rtableid;
	error = ip_output(m, NULL, &ro,
	    (IP_FORWARDING | (ip_directedbcast ? IP_ALLOWBROADCAST : 0)),
	    NULL, NULL, 0);
	/* ip_output() may have replaced the route in ro */
	rt = ro.ro_rt;
	if (error)
		ipstat_inc(ips_cantforward);
	else {
		ipstat_inc(ips_forward);
		if (type)
			ipstat_inc(ips_redirectsent);
		else
			goto freecopy;
	}
	/* from here on we need the saved header copy to build an ICMP error */
	if (!fake)
		goto freecopy;

	switch (error) {
	case 0:				/* forwarded, but need redirect */
		/* type, code set above */
		break;

	case EMSGSIZE:
		type = ICMP_UNREACH;
		code = ICMP_UNREACH_NEEDFRAG;
		/* report the path MTU: route MTU if set, else interface MTU */
		if (rt != NULL) {
			if (rt->rt_mtu) {
				destmtu = rt->rt_mtu;
			} else {
				struct ifnet *destifp;

				destifp = if_get(rt->rt_ifidx);
				if (destifp != NULL)
					destmtu = destifp->if_mtu;
				if_put(destifp);
			}
		}
		ipstat_inc(ips_cantfrag);
		if (destmtu == 0)
			goto freecopy;
		break;

	case EACCES:
		/*
		 * pf(4) blocked the packet. There is no need to send an ICMP
		 * packet back since pf(4) takes care of it.
		 */
		goto freecopy;

	case ENOBUFS:
		/*
		 * a router should not generate ICMP_SOURCEQUENCH as
		 * required in RFC1812 Requirements for IP Version 4 Routers.
		 * source quench could be a big problem under DoS attacks,
		 * or the underlying interface is rate-limited.
		 */
		goto freecopy;

	case ENETUNREACH:		/* shouldn't happen, checked above */
	case EHOSTUNREACH:
	case ENETDOWN:
	case EHOSTDOWN:
	default:
		type = ICMP_UNREACH;
		code = ICMP_UNREACH_HOST;
		break;
	}
	mcopy = m_copym(&mfake, 0, len, M_DONTWAIT);
	if (mcopy)
		icmp_error(mcopy, type, code, dest, destmtu);

freecopy:
	if (fake)
		m_tag_delete_chain(&mfake);
	rtfree(rt);
}

/*
 * Handle the net.inet.ip sysctl MIB tree.
 */
int
ip_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
    size_t newlen)
{
	int error;
#ifdef MROUTING
	extern struct mrtstat mrtstat;
#endif

	/* Almost all sysctl names at this level are terminal.
 */
	if (namelen != 1 && name[0] != IPCTL_IFQUEUE &&
	    name[0] != IPCTL_ARPQUEUE)
		return (ENOTDIR);

	switch (name[0]) {
	case IPCTL_SOURCEROUTE:
		NET_LOCK();
		error = sysctl_securelevel_int(oldp, oldlenp, newp, newlen,
		    &ip_dosourceroute);
		NET_UNLOCK();
		return (error);
	case IPCTL_MTUDISC:
		NET_LOCK();
		error = sysctl_int(oldp, oldlenp, newp, newlen, &ip_mtudisc);
		/* discovery switched off: drop all learned PMTU routes */
		if (ip_mtudisc == 0)
			rt_timer_queue_flush(&ip_mtudisc_timeout_q);
		NET_UNLOCK();
		return error;
	case IPCTL_MTUDISCTIMEOUT:
		NET_LOCK();
		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
		    &ip_mtudisc_timeout, 0, INT_MAX);
		rt_timer_queue_change(&ip_mtudisc_timeout_q,
		    ip_mtudisc_timeout);
		NET_UNLOCK();
		return (error);
#ifdef IPSEC
	case IPCTL_ENCDEBUG:
	case IPCTL_IPSEC_STATS:
	case IPCTL_IPSEC_EXPIRE_ACQUIRE:
	case IPCTL_IPSEC_EMBRYONIC_SA_TIMEOUT:
	case IPCTL_IPSEC_REQUIRE_PFS:
	case IPCTL_IPSEC_SOFT_ALLOCATIONS:
	case IPCTL_IPSEC_ALLOCATIONS:
	case IPCTL_IPSEC_SOFT_BYTES:
	case IPCTL_IPSEC_BYTES:
	case IPCTL_IPSEC_TIMEOUT:
	case IPCTL_IPSEC_SOFT_TIMEOUT:
	case IPCTL_IPSEC_SOFT_FIRSTUSE:
	case IPCTL_IPSEC_FIRSTUSE:
	case IPCTL_IPSEC_ENC_ALGORITHM:
	case IPCTL_IPSEC_AUTH_ALGORITHM:
	case IPCTL_IPSEC_IPCOMP_ALGORITHM:
		/* all IPsec knobs are handled by the IPsec layer */
		return (ipsec_sysctl(name, namelen, oldp, oldlenp, newp,
		    newlen));
#endif
	case IPCTL_IFQUEUE:
		return (sysctl_niq(name + 1, namelen - 1,
		    oldp, oldlenp, newp, newlen, &ipintrq));
	case IPCTL_ARPQUEUE:
		return (sysctl_niq(name + 1, namelen - 1,
		    oldp, oldlenp, newp, newlen, &arpinq));
	case IPCTL_ARPQUEUED:
		return (sysctl_rdint(oldp, oldlenp, newp, la_hold_total));
	case IPCTL_STATS:
		return (ip_sysctl_ipstat(oldp, oldlenp, newp));
#ifdef MROUTING
	case IPCTL_MRTSTATS:
		return (sysctl_rdstruct(oldp, oldlenp, newp,
		    &mrtstat, sizeof(mrtstat)));
	case IPCTL_MRTMFC:
		/* read-only */
		if (newp)
			return (EPERM);
		NET_LOCK();
		error = mrt_sysctl_mfc(oldp, oldlenp);
		NET_UNLOCK();
		return (error);
	case IPCTL_MRTVIF:
		/* read-only */
		if (newp)
			return (EPERM);
		NET_LOCK();
		error = mrt_sysctl_vif(oldp, oldlenp);
		NET_UNLOCK();
		return (error);
#else
	case IPCTL_MRTPROTO:
	case IPCTL_MRTSTATS:
	case IPCTL_MRTMFC:
	case IPCTL_MRTVIF:
		return (EOPNOTSUPP);
#endif
	default:
		NET_LOCK();
		error = sysctl_bounded_arr(ipctl_vars, nitems(ipctl_vars),
		    name, namelen, oldp, oldlenp, newp, newlen);
		NET_UNLOCK();
		return (error);
	}
	/* NOTREACHED */
}

/*
 * Export the per-CPU IP counters as a struct ipstat for IPCTL_STATS.
 */
int
ip_sysctl_ipstat(void *oldp, size_t *oldlenp, void *newp)
{
	uint64_t counters[ips_ncounters];
	struct ipstat ipstat;
	u_long *words = (u_long *)&ipstat;
	int i;

	/* struct ipstat must match the counter array word for word */
	CTASSERT(sizeof(ipstat) == (nitems(counters) * sizeof(u_long)));
	memset(&ipstat, 0, sizeof ipstat);
	counters_read(ipcounters, counters, nitems(counters));

	for (i = 0; i < nitems(counters); i++)
		words[i] = (u_long)counters[i];

	return (sysctl_rdstruct(oldp, oldlenp, newp, &ipstat, sizeof(ipstat)));
}

/*
 * Build the chain of control-message mbufs requested by the socket's
 * options/flags (timestamp, destination address, receiving interface,
 * TTL, routing table) for the received packet m; *mp is advanced past
 * each control mbuf that is successfully created.
 */
void
ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip,
    struct mbuf *m)
{
	if (inp->inp_socket->so_options & SO_TIMESTAMP) {
		struct timeval tv;

		m_microtime(m, &tv);
		*mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv),
		    SCM_TIMESTAMP, SOL_SOCKET);
		if (*mp)
			mp = &(*mp)->m_next;
	}

	if (inp->inp_flags & INP_RECVDSTADDR) {
		*mp = sbcreatecontrol((caddr_t) &ip->ip_dst,
		    sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP);
		if (*mp)
			mp = &(*mp)->m_next;
	}
#ifdef notyet
	/* this code is broken and will probably never be fixed.
 */
	/* options were tossed already */
	if (inp->inp_flags & INP_RECVOPTS) {
		*mp = sbcreatecontrol((caddr_t) opts_deleted_above,
		    sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP);
		if (*mp)
			mp = &(*mp)->m_next;
	}
	/* ip_srcroute doesn't do what we want here, need to fix */
	if (inp->inp_flags & INP_RECVRETOPTS) {
		*mp = sbcreatecontrol((caddr_t) ip_srcroute(m),
		    sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP);
		if (*mp)
			mp = &(*mp)->m_next;
	}
#endif
	if (inp->inp_flags & INP_RECVIF) {
		struct sockaddr_dl sdl;
		struct ifnet *ifp;

		ifp = if_get(m->m_pkthdr.ph_ifidx);
		if (ifp == NULL || ifp->if_sadl == NULL) {
			/*
			 * Interface is gone or has no link-level address:
			 * synthesize an empty AF_LINK sockaddr carrying at
			 * most the interface index.
			 */
			memset(&sdl, 0, sizeof(sdl));
			sdl.sdl_len = offsetof(struct sockaddr_dl, sdl_data[0]);
			sdl.sdl_family = AF_LINK;
			sdl.sdl_index = ifp != NULL ? ifp->if_index : 0;
			sdl.sdl_nlen = sdl.sdl_alen = sdl.sdl_slen = 0;
			*mp = sbcreatecontrol((caddr_t) &sdl, sdl.sdl_len,
			    IP_RECVIF, IPPROTO_IP);
		} else {
			*mp = sbcreatecontrol((caddr_t) ifp->if_sadl,
			    ifp->if_sadl->sdl_len, IP_RECVIF, IPPROTO_IP);
		}
		if (*mp)
			mp = &(*mp)->m_next;
		if_put(ifp);
	}
	if (inp->inp_flags & INP_RECVTTL) {
		*mp = sbcreatecontrol((caddr_t) &ip->ip_ttl,
		    sizeof(u_int8_t), IP_RECVTTL, IPPROTO_IP);
		if (*mp)
			mp = &(*mp)->m_next;
	}
	if (inp->inp_flags & INP_RECVRTABLE) {
		u_int rtableid = inp->inp_rtableid;

#if NPF > 0
		/* report the divert rdomain instead for pf-diverted packets */
		if (m && m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) {
			struct pf_divert *divert;

			divert = pf_find_divert(m);
			KASSERT(divert != NULL);
			rtableid = divert->rdomain;
		}
#endif

		*mp = sbcreatecontrol((caddr_t) &rtableid,
		    sizeof(u_int), IP_RECVRTABLE, IPPROTO_IP);
		if (*mp)
			mp = &(*mp)->m_next;
	}
}

/*
 * Drain the given send queue and hand each packet to ip_output() with
 * the supplied flags, under the net lock.  A PACKET_TAG_IPSEC_FLOWINFO
 * tag, if present, supplies the ipsecflowinfo argument and is consumed.
 */
void
ip_send_do_dispatch(void *xmq, int flags)
{
	struct mbuf_queue *mq = xmq;
	struct mbuf *m;
	struct mbuf_list ml;
	struct m_tag *mtag;

	mq_delist(mq, &ml);
	if (ml_empty(&ml))
		return;

	NET_LOCK();
	while ((m = ml_dequeue(&ml)) != NULL) {
		u_int32_t ipsecflowinfo = 0;

		if ((mtag = m_tag_find(m, PACKET_TAG_IPSEC_FLOWINFO, NULL))
		    != NULL) {
			ipsecflowinfo = *(u_int32_t *)(mtag + 1);
			m_tag_delete(m, mtag);
		}
		ip_output(m, NULL, NULL, flags, NULL, NULL, ipsecflowinfo);
	}
	NET_UNLOCK();
}

/* task callback: flush the raw-output send queue */
void
ip_sendraw_dispatch(void *xmq)
{
	ip_send_do_dispatch(xmq, IP_RAWOUTPUT);
}

/* task callback: flush the normal send queue */
void
ip_send_dispatch(void *xmq)
{
	ip_send_do_dispatch(xmq, 0);
}

/* Queue m for asynchronous transmission via the net taskq. */
void
ip_send(struct mbuf *m)
{
	mq_enqueue(&ipsend_mq, m);
	task_add(net_tq(0), &ipsend_task);
}

/* Queue m for asynchronous raw (IP_RAWOUTPUT) transmission. */
void
ip_send_raw(struct mbuf *m)
{
	mq_enqueue(&ipsendraw_mq, m);
	task_add(net_tq(0), &ipsendraw_task);
}