1 /* $OpenBSD: ip_input.c,v 1.367 2022/04/20 09:38:26 bluhm Exp $ */ 2 /* $NetBSD: ip_input.c,v 1.30 1996/03/16 23:53:58 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 
31 * 32 * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 33 */ 34 35 #include "pf.h" 36 #include "carp.h" 37 38 #include <sys/param.h> 39 #include <sys/systm.h> 40 #include <sys/mbuf.h> 41 #include <sys/domain.h> 42 #include <sys/mutex.h> 43 #include <sys/protosw.h> 44 #include <sys/socket.h> 45 #include <sys/socketvar.h> 46 #include <sys/sysctl.h> 47 #include <sys/pool.h> 48 #include <sys/task.h> 49 50 #include <net/if.h> 51 #include <net/if_var.h> 52 #include <net/if_dl.h> 53 #include <net/route.h> 54 #include <net/netisr.h> 55 56 #include <netinet/in.h> 57 #include <netinet/in_systm.h> 58 #include <netinet/if_ether.h> 59 #include <netinet/ip.h> 60 #include <netinet/in_pcb.h> 61 #include <netinet/in_var.h> 62 #include <netinet/ip_var.h> 63 #include <netinet/ip_icmp.h> 64 #include <net/if_types.h> 65 66 #ifdef INET6 67 #include <netinet6/ip6_var.h> 68 #endif 69 70 #if NPF > 0 71 #include <net/pfvar.h> 72 #endif 73 74 #ifdef MROUTING 75 #include <netinet/ip_mroute.h> 76 #endif 77 78 #ifdef IPSEC 79 #include <netinet/ip_ipsp.h> 80 #endif /* IPSEC */ 81 82 #if NCARP > 0 83 #include <netinet/ip_carp.h> 84 #endif 85 86 /* values controllable via sysctl */ 87 int ipforwarding = 0; 88 int ipmforwarding = 0; 89 int ipmultipath = 0; 90 int ipsendredirects = 1; 91 int ip_dosourceroute = 0; 92 int ip_defttl = IPDEFTTL; 93 int ip_mtudisc = 1; 94 int ip_mtudisc_timeout = IPMTUDISCTIMEOUT; 95 int ip_directedbcast = 0; 96 97 struct rttimer_queue *ip_mtudisc_timeout_q; 98 99 /* Protects `ipq' and `ip_frags'. 
*/
struct mutex	ipq_mutex = MUTEX_INITIALIZER(IPL_SOFTNET);

/* IP reassembly queue */
LIST_HEAD(, ipq) ipq;

/*
 * Keep track of memory used for reassembly.
 * ip_frags counts queued fragment entries; ip_ours() refuses a new
 * fragment once ip_frags would exceed ip_maxqueue.
 */
int	ip_maxqueue = 300;
int	ip_frags = 0;

#ifdef MROUTING
extern int ip_mrtproto;
#endif

/*
 * Integer variables exported through sysctl.  Each entry names the
 * IPCTL_* identifier and the variable behind it; the two trailing values
 * are presumably the accepted minimum and maximum -- confirm against the
 * definition of struct sysctl_bounded_args.
 */
const struct sysctl_bounded_args ipctl_vars[] = {
#ifdef MROUTING
	{ IPCTL_MRTPROTO, &ip_mrtproto, SYSCTL_INT_READONLY },
#endif
	{ IPCTL_FORWARDING, &ipforwarding, 0, 2 },
	{ IPCTL_SENDREDIRECTS, &ipsendredirects, 0, 1 },
	{ IPCTL_DEFTTL, &ip_defttl, 0, 255 },
	{ IPCTL_DIRECTEDBCAST, &ip_directedbcast, 0, 1 },
	{ IPCTL_IPPORT_FIRSTAUTO, &ipport_firstauto, 0, 65535 },
	{ IPCTL_IPPORT_LASTAUTO, &ipport_lastauto, 0, 65535 },
	{ IPCTL_IPPORT_HIFIRSTAUTO, &ipport_hifirstauto, 0, 65535 },
	{ IPCTL_IPPORT_HILASTAUTO, &ipport_hilastauto, 0, 65535 },
	{ IPCTL_IPPORT_MAXQUEUE, &ip_maxqueue, 0, 10000 },
	{ IPCTL_MFORWARDING, &ipmforwarding, 0, 1 },
	{ IPCTL_MULTIPATH, &ipmultipath, 0, 1 },
	{ IPCTL_ARPTIMEOUT, &arpt_keep, 0, INT_MAX },
	{ IPCTL_ARPDOWN, &arpt_down, 0, INT_MAX },
};

/* Pools for struct ipqent and struct ipq; set up by ip_init(). */
struct pool ipqent_pool;
struct pool ipq_pool;

/* IP statistics counters; allocated by ip_init(). */
struct cpumem *ipcounters;

int ip_sysctl_ipstat(void *, size_t *, void *);

/* Packet queues handed to the ip_send*_dispatch tasks below. */
static struct mbuf_queue	ipsend_mq;
static struct mbuf_queue	ipsendraw_mq;

extern struct niqueue		arpinq;

int	ip_ours(struct mbuf **, int *, int, int);
int	ip_dooptions(struct mbuf *, struct ifnet *);
int	in_ouraddr(struct mbuf *, struct ifnet *, struct rtentry **);

static void ip_send_dispatch(void *);
static void ip_sendraw_dispatch(void *);
static struct task ipsend_task = TASK_INITIALIZER(ip_send_dispatch, &ipsend_mq);
static struct task ipsendraw_task =
	TASK_INITIALIZER(ip_sendraw_dispatch, &ipsendraw_mq);

/*
 * Used to save the IP options in case a protocol wants to respond
 * to an incoming packet over the same route if the
packet got here
 * using IP source routing.  This allows connection establishment and
 * maintenance when the remote end is on a network that is not known
 * to us.
 *
 * save_rte() copies the whole option into isr_hdr, so the route
 * addresses that follow the three header bytes land in isr_routes.
 */
struct ip_srcrt {
	int		isr_nhops;		   /* number of hops */
	struct in_addr	isr_dst;		   /* final destination */
	char		isr_nop;		   /* one NOP to align */
	char		isr_hdr[IPOPT_OFFSET + 1]; /* OPTVAL, OLEN & OFFSET */
	struct in_addr	isr_routes[MAX_IPOPTLEN/sizeof(struct in_addr)];
};

void save_rte(struct mbuf *, u_char *, struct in_addr);

/*
 * IP initialization: fill in IP protocol switch table.
 * All protocols not implemented in kernel go to raw IP protocol handler.
 *
 * Also allocates the statistics counters, the reassembly pools and queue,
 * the path MTU discovery timer queue, the default port bitmaps and the
 * two send queues, then calls arpinit() (and ipsec_init() with IPSEC).
 */
void
ip_init(void)
{
	const struct protosw *pr;
	int i;
	const u_int16_t defbaddynamicports_tcp[] = DEFBADDYNAMICPORTS_TCP;
	const u_int16_t defbaddynamicports_udp[] = DEFBADDYNAMICPORTS_UDP;
	const u_int16_t defrootonlyports_tcp[] = DEFROOTONLYPORTS_TCP;
	const u_int16_t defrootonlyports_udp[] = DEFROOTONLYPORTS_UDP;

	ipcounters = counters_alloc(ips_ncounters);

	pool_init(&ipqent_pool, sizeof(struct ipqent), 0,
	    IPL_SOFTNET, 0, "ipqe", NULL);
	pool_init(&ipq_pool, sizeof(struct ipq), 0,
	    IPL_SOFTNET, 0, "ipq", NULL);

	/* Default every protocol number to the raw IP handler... */
	pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
	if (pr == NULL)
		panic("ip_init");
	for (i = 0; i < IPPROTO_MAX; i++)
		ip_protox[i] = pr - inetsw;
	/* ...then override the slots of protocols present in inetsw. */
	for (pr = inetdomain.dom_protosw;
	    pr < inetdomain.dom_protoswNPROTOSW; pr++)
		if (pr->pr_domain->dom_family == PF_INET &&
		    pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW &&
		    pr->pr_protocol < IPPROTO_MAX)
			ip_protox[pr->pr_protocol] = pr - inetsw;
	LIST_INIT(&ipq);
	ip_mtudisc_timeout_q = rt_timer_queue_create(ip_mtudisc_timeout);

	/* Fill in list of ports not to allocate dynamically. */
	memset(&baddynamicports, 0, sizeof(baddynamicports));
	/* The default port arrays are terminated by a 0 entry. */
	for (i = 0; defbaddynamicports_tcp[i] != 0; i++)
		DP_SET(baddynamicports.tcp, defbaddynamicports_tcp[i]);
	for (i = 0; defbaddynamicports_udp[i] != 0; i++)
		DP_SET(baddynamicports.udp, defbaddynamicports_udp[i]);

	/* Fill in list of ports only root can bind to. */
	memset(&rootonlyports, 0, sizeof(rootonlyports));
	for (i = 0; defrootonlyports_tcp[i] != 0; i++)
		DP_SET(rootonlyports.tcp, defrootonlyports_tcp[i]);
	for (i = 0; defrootonlyports_udp[i] != 0; i++)
		DP_SET(rootonlyports.udp, defrootonlyports_udp[i]);

	mq_init(&ipsend_mq, 64, IPL_SOFTNET);
	mq_init(&ipsendraw_mq, 64, IPL_SOFTNET);

	arpinit();
#ifdef IPSEC
	ipsec_init();
#endif
}

/*
 * IPv4 input routine.
 *
 * Checksum and byte swap header.  Process options. Forward or deliver.
 *
 * Thin wrapper: runs ip_input_if() from offset 0 with AF_UNSPEC and
 * asserts that the packet was fully consumed (IPPROTO_DONE).
 */
void
ipv4_input(struct ifnet *ifp, struct mbuf *m)
{
	int off, nxt;

	off = 0;
	nxt = ip_input_if(&m, &off, IPPROTO_IPV4, AF_UNSPEC, ifp);
	KASSERT(nxt == IPPROTO_DONE);
}

/*
 * Sanity-check the IPv4 header at the front of m, received on ifp.
 *
 * Checks, in order: version, header length, loopback addresses arriving
 * on a non-loopback interface, header checksum (skipped when the mbuf is
 * already flagged M_IPV4_CSUM_IN_OK) and total length against the amount
 * of data actually present.  Data beyond ip_len is trimmed.
 *
 * Returns the mbuf to continue with (m_pullup() may have replaced it),
 * or NULL if the packet was rejected; on the "bad" path it is freed here.
 */
struct mbuf *
ipv4_check(struct ifnet *ifp, struct mbuf *m)
{
	struct ip *ip;
	int hlen, len;

	if (m->m_len < sizeof(*ip)) {
		m = m_pullup(m, sizeof(*ip));
		if (m == NULL) {
			ipstat_inc(ips_toosmall);
			return (NULL);
		}
	}

	ip = mtod(m, struct ip *);
	if (ip->ip_v != IPVERSION) {
		ipstat_inc(ips_badvers);
		goto bad;
	}

	hlen = ip->ip_hl << 2;
	if (hlen < sizeof(*ip)) {	/* minimum header length */
		ipstat_inc(ips_badhlen);
		goto bad;
	}
	/* Make the whole header, options included, contiguous. */
	if (hlen > m->m_len) {
		m = m_pullup(m, hlen);
		if (m == NULL) {
			ipstat_inc(ips_badhlen);
			return (NULL);
		}
		ip = mtod(m, struct ip *);
	}

	/* 127/8 must not appear on wire - RFC1122 */
	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
			ipstat_inc(ips_badaddr);
			goto bad;
		}
	}

	if (!ISSET(m->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_OK)) {
		if (ISSET(m->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_BAD)) {
			ipstat_inc(ips_badsum);
			goto bad;
		}

		/* No verdict recorded on the mbuf: verify in software. */
		ipstat_inc(ips_inswcsum);
		if (in_cksum(m, hlen) != 0) {
			ipstat_inc(ips_badsum);
			goto bad;
		}

		SET(m->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_OK);
	}

	/* Retrieve the packet length. */
	len = ntohs(ip->ip_len);

	/*
	 * The total length must cover at least the header.
	 */
	if (len < hlen) {
		ipstat_inc(ips_badlen);
		goto bad;
	}

	/*
	 * Check that the amount of data in the buffers
	 * is at least as much as the IP header would have us expect.
	 * Trim mbufs if longer than we expect.
	 * Drop packet if shorter than we expect.
	 */
	if (m->m_pkthdr.len < len) {
		ipstat_inc(ips_tooshort);
		goto bad;
	}
	if (m->m_pkthdr.len > len) {
		if (m->m_len == m->m_pkthdr.len) {
			/* Single mbuf: just shorten it. */
			m->m_len = len;
			m->m_pkthdr.len = len;
		} else
			/* Negative count, i.e. trim the excess via m_adj(). */
			m_adj(m, len - m->m_pkthdr.len);
	}

	return (m);
bad:
	m_freem(m);
	return (NULL);
}

/*
 * Process one IPv4 packet received on ifp.
 *
 * *mp is the packet and *offp must be 0 on entry.  After the header
 * checks (and carp/pf when compiled in) the packet is either
 *  - passed to ip_ours() for local delivery, whose result is returned,
 *  - passed to ip_forward(), returning IPPROTO_DONE with *mp = NULL, or
 *  - dropped, returning IPPROTO_DONE after m_freemp(mp).
 * The route looked up by in_ouraddr() is released with rtfree() on the
 * "bad"/"out" paths and handed to ip_forward() otherwise.
 */
int
ip_input_if(struct mbuf **mp, int *offp, int nxt, int af, struct ifnet *ifp)
{
	struct mbuf	*m;
	struct rtentry	*rt = NULL;
	struct ip	*ip;
	int hlen;
	in_addr_t pfrdr = 0;

	KASSERT(*offp == 0);

	ipstat_inc(ips_total);
	m = *mp = ipv4_check(ifp, *mp);
	if (m == NULL)
		goto bad;

	ip = mtod(m, struct ip *);

#if NCARP > 0
	if (carp_lsdrop(ifp, m, AF_INET, &ip->ip_src.s_addr,
	    &ip->ip_dst.s_addr, (ip->ip_p == IPPROTO_ICMP ? 0 : 1)))
		goto bad;
#endif

#if NPF > 0
	/*
	 * Packet filter
	 */
	pfrdr = ip->ip_dst.s_addr;
	if (pf_test(AF_INET, PF_IN, ifp, mp) != PF_PASS)
		goto bad;
	m = *mp;
	if (m == NULL)
		goto bad;

	ip = mtod(m, struct ip *);
	/* pfrdr becomes a flag: did pf change the destination address? */
	pfrdr = (pfrdr != ip->ip_dst.s_addr);
#endif

	hlen = ip->ip_hl << 2;

	/*
	 * Process options and, if not destined for us,
	 * ship it on.  ip_dooptions returns 1 when an
	 * error was detected (causing an icmp message
	 * to be sent and the original packet to be freed).
	 */
	if (hlen > sizeof (struct ip) && ip_dooptions(m, ifp)) {
		/* The mbuf is no longer ours; do not free it again. */
		m = *mp = NULL;
		goto bad;
	}

	if (ip->ip_dst.s_addr == INADDR_BROADCAST ||
	    ip->ip_dst.s_addr == INADDR_ANY) {
		nxt = ip_ours(mp, offp, nxt, af);
		goto out;
	}

	/* in_ouraddr(): 1 = deliver locally, 2 = drop, 0 = carry on. */
	switch(in_ouraddr(m, ifp, &rt)) {
	case 2:
		goto bad;
	case 1:
		nxt = ip_ours(mp, offp, nxt, af);
		goto out;
	}

	if (IN_MULTICAST(ip->ip_dst.s_addr)) {
		/*
		 * Make sure M_MCAST is set.  It should theoretically
		 * already be there, but let's play safe because upper
		 * layers check for this flag.
		 */
		m->m_flags |= M_MCAST;

#ifdef MROUTING
		if (ipmforwarding && ip_mrouter[ifp->if_rdomain]) {
			int error;

			if (m->m_flags & M_EXT) {
				if ((m = *mp = m_pullup(m, hlen)) == NULL) {
					ipstat_inc(ips_toosmall);
					goto bad;
				}
				ip = mtod(m, struct ip *);
			}
			/*
			 * If we are acting as a multicast router, all
			 * incoming multicast packets are passed to the
			 * kernel-level multicast forwarding function.
			 * The packet is returned (relatively) intact; if
			 * ip_mforward() returns a non-zero value, the packet
			 * must be discarded, else it may be accepted below.
			 *
			 * (The IP ident field is put in the same byte order
			 * as expected when ip_mforward() is called from
			 * ip_output().)
			 */
			KERNEL_LOCK();
			error = ip_mforward(m, ifp);
			KERNEL_UNLOCK();
			if (error) {
				ipstat_inc(ips_cantforward);
				goto bad;
			}

			/*
			 * The process-level routing daemon needs to receive
			 * all multicast IGMP packets, whether or not this
			 * host belongs to their destination groups.
			 */
			if (ip->ip_p == IPPROTO_IGMP) {
				nxt = ip_ours(mp, offp, nxt, af);
				goto out;
			}
			ipstat_inc(ips_forward);
		}
#endif
		/*
		 * See if we belong to the destination multicast group on the
		 * arrival interface.
		 */
		if (!in_hasmulti(&ip->ip_dst, ifp)) {
			ipstat_inc(ips_notmember);
			if (!IN_LOCAL_GROUP(ip->ip_dst.s_addr))
				ipstat_inc(ips_cantforward);
			goto bad;
		}
		nxt = ip_ours(mp, offp, nxt, af);
		goto out;
	}

#if NCARP > 0
	if (ip->ip_p == IPPROTO_ICMP &&
	    carp_lsdrop(ifp, m, AF_INET, &ip->ip_src.s_addr,
	    &ip->ip_dst.s_addr, 1))
		goto bad;
#endif
	/*
	 * Not for us; forward if possible and desirable.
	 */
	if (ipforwarding == 0) {
		ipstat_inc(ips_cantforward);
		goto bad;
	}
#ifdef IPSEC
	if (ipsec_in_use) {
		int rv;

		rv = ipsec_forward_check(m, hlen, AF_INET);
		if (rv != 0) {
			ipstat_inc(ips_cantforward);
			goto bad;
		}
		/*
		 * Fall through, forward packet. Outbound IPsec policy
		 * checking will occur in ip_output().
		 */
	}
#endif /* IPSEC */

	/* rt is passed on to ip_forward() and not released here. */
	ip_forward(m, ifp, rt, pfrdr);
	*mp = NULL;
	return IPPROTO_DONE;
 bad:
	nxt = IPPROTO_DONE;
	m_freemp(mp);
 out:
	rtfree(rt);
	return nxt;
}

/*
 * IPv4 local-delivery routine.
 *
 * If fragmented try to reassemble.  Pass to next level.
*/
/*
 * mp/offp/nxt/af follow the protocol input convention used by
 * ip_deliver().  On return *offp is the IP header length.  When af is
 * AF_UNSPEC the packet is handed to ip_deliver() and its result is
 * returned; otherwise the IP protocol number is returned to the caller.
 * IPPROTO_DONE is returned when the packet was consumed: queued for
 * reassembly, or dropped.
 */
int
ip_ours(struct mbuf **mp, int *offp, int nxt, int af)
{
	struct mbuf *m = *mp;
	struct ip *ip = mtod(m, struct ip *);
	struct ipq *fp;
	struct ipqent *ipqe;
	int mff, hlen;

	hlen = ip->ip_hl << 2;

	/*
	 * If offset or IP_MF are set, must reassemble.
	 * Otherwise, nothing need be done.
	 * (We could look in the reassembly queue to see
	 * if the packet was previously fragmented,
	 * but it's not worth the time; just let them time out.)
	 */
	if (ip->ip_off &~ htons(IP_DF | IP_RF)) {
		if (m->m_flags & M_EXT) {		/* XXX */
			if ((m = *mp = m_pullup(m, hlen)) == NULL) {
				ipstat_inc(ips_toosmall);
				return IPPROTO_DONE;
			}
			ip = mtod(m, struct ip *);
		}

		mtx_enter(&ipq_mutex);

		/*
		 * Look for queue of fragments
		 * of this datagram.
		 * fp is left NULL when no queue matches.
		 */
		LIST_FOREACH(fp, &ipq, ipq_q) {
			if (ip->ip_id == fp->ipq_id &&
			    ip->ip_src.s_addr == fp->ipq_src.s_addr &&
			    ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
			    ip->ip_p == fp->ipq_p)
				break;
		}

		/*
		 * Adjust ip_len to not reflect header,
		 * set ipqe_mff if more fragments are expected,
		 * convert offset of this to bytes.
		 */
		ip->ip_len = htons(ntohs(ip->ip_len) - hlen);
		mff = (ip->ip_off & htons(IP_MF)) != 0;
		if (mff) {
			/*
			 * Make sure that fragments have a data length
			 * that's a non-zero multiple of 8 bytes.
			 */
			if (ntohs(ip->ip_len) == 0 ||
			    (ntohs(ip->ip_len) & 0x7) != 0) {
				ipstat_inc(ips_badfrags);
				goto bad;
			}
		}
		ip->ip_off = htons(ntohs(ip->ip_off) << 3);

		/*
		 * If datagram marked as having more fragments
		 * or if this is not the first fragment,
		 * attempt reassembly; if it succeeds, proceed.
		 */
		if (mff || ip->ip_off) {
			ipstat_inc(ips_fragments);
			if (ip_frags + 1 > ip_maxqueue) {
				ip_flush();
				ipstat_inc(ips_rcvmemdrop);
				goto bad;
			}

			ipqe = pool_get(&ipqent_pool, PR_NOWAIT);
			if (ipqe == NULL) {
				ipstat_inc(ips_rcvmemdrop);
				goto bad;
			}
			ip_frags++;
			ipqe->ipqe_mff = mff;
			ipqe->ipqe_m = m;
			ipqe->ipqe_ip = ip;
			/*
			 * ip_reass() takes over the fragment; a NULL result
			 * (queued or dropped) also clears *mp, so "bad" has
			 * no mbuf left to free.
			 */
			m = *mp = ip_reass(ipqe, fp);
			if (m == NULL)
				goto bad;
			ipstat_inc(ips_reassembled);
			ip = mtod(m, struct ip *);
			hlen = ip->ip_hl << 2;
			ip->ip_len = htons(ntohs(ip->ip_len) + hlen);
		} else
			if (fp)
				ip_freef(fp);

		mtx_leave(&ipq_mutex);
	}

	*offp = hlen;
	nxt = ip->ip_p;
	/* Check whether we are already in a IPv4/IPv6 local deliver loop. */
	if (af == AF_UNSPEC)
		nxt = ip_deliver(mp, offp, nxt, AF_INET);
	return nxt;
 bad:
	mtx_leave(&ipq_mutex);
	m_freemp(mp);
	return IPPROTO_DONE;
}

/*
 * Bump the counter "name" of the address family currently held in the
 * local variable af of ip_deliver(); IPv4 only without INET6.
 */
#ifndef INET6
#define IPSTAT_INC(name)	ipstat_inc(ips_##name)
#else
#define IPSTAT_INC(name)	(af == AF_INET ?	\
    ipstat_inc(ips_##name) : ip6stat_inc(ip6s_##name))
#endif

/*
 * Run the chain of protocol input handlers for a locally delivered
 * packet.  Starting with protocol nxt at offset *offp, each handler's
 * return value selects the next one until a handler returns
 * IPPROTO_DONE.  af selects the protocol switch table and is changed
 * after a handler for IPPROTO_IPV4 or IPPROTO_IPV6 has run.
 *
 * Always returns IPPROTO_DONE; on the "bad" path the packet is freed.
 */
int
ip_deliver(struct mbuf **mp, int *offp, int nxt, int af)
{
	const struct protosw *psw;
	int naf = af;
#ifdef INET6
	int nest = 0;
#endif /* INET6 */

	/* pf might have modified stuff, might have to chksum */
	switch (af) {
	case AF_INET:
		in_proto_cksum_out(*mp, NULL);
		break;
#ifdef INET6
	case AF_INET6:
		in6_proto_cksum_out(*mp, NULL);
		break;
#endif /* INET6 */
	}

	/*
	 * Tell launch routine the next header
	 */
	IPSTAT_INC(delivered);

	while (nxt != IPPROTO_DONE) {
#ifdef INET6
		if (af == AF_INET6 &&
		    ip6_hdrnestlimit && (++nest > ip6_hdrnestlimit)) {
			ip6stat_inc(ip6s_toomanyhdr);
			goto bad;
		}
#endif /* INET6 */

		/*
		 * protection against faulty packet - there should be
		 * more sanity checks in header chain processing.
		 */
		if ((*mp)->m_pkthdr.len < *offp) {
			IPSTAT_INC(tooshort);
			goto bad;
		}

#ifdef IPSEC
		if (ipsec_in_use) {
			if (ipsec_local_check(*mp, *offp, nxt, af) != 0) {
				IPSTAT_INC(cantforward);
				goto bad;
			}
		}
		/* Otherwise, just fall through and deliver the packet */
#endif /* IPSEC */

		/*
		 * An encapsulated IP header switches the family for the
		 * handlers that follow; the current handler still runs
		 * with the old af.
		 */
		switch (nxt) {
		case IPPROTO_IPV4:
			naf = AF_INET;
			ipstat_inc(ips_delivered);
			break;
#ifdef INET6
		case IPPROTO_IPV6:
			naf = AF_INET6;
			ip6stat_inc(ip6s_delivered);
			break;
#endif /* INET6 */
		}
		/*
		 * NOTE(review): psw stays unset if af is neither AF_INET
		 * nor AF_INET6; callers are presumably expected to pass
		 * only those two -- confirm.
		 */
		switch (af) {
		case AF_INET:
			psw = &inetsw[ip_protox[nxt]];
			break;
#ifdef INET6
		case AF_INET6:
			psw = &inet6sw[ip6_protox[nxt]];
			break;
#endif /* INET6 */
		}
		nxt = (*psw->pr_input)(mp, offp, nxt, af);
		af = naf;
	}
	return nxt;
 bad:
	m_freemp(mp);
	return IPPROTO_DONE;
}
#undef IPSTAT_INC

/*
 * Decide whether the destination address of m, received on ifp, is one
 * of ours.
 *
 * Returns 0 if not ours, 1 if the packet is to be delivered locally and
 * 2 if it is addressed to us but arrived on the wrong interface (the
 * caller drops it).  Unless pf answered first, the route found for the
 * destination is stored in *prt and must be released by the caller.
 */
int
in_ouraddr(struct mbuf *m, struct ifnet *ifp, struct rtentry **prt)
{
	struct rtentry		*rt;
	struct ip		*ip;
	struct sockaddr_in	 sin;
	int			 match = 0;

#if NPF > 0
	switch (pf_ouraddr(m)) {
	case 0:
		return (0);
	case 1:
		return (1);
	default:
		/* pf does not know it */
		break;
	}
#endif

	ip = mtod(m, struct ip *);

	memset(&sin, 0, sizeof(sin));
	sin.sin_len = sizeof(sin);
	sin.sin_family = AF_INET;
	sin.sin_addr = ip->ip_dst;
	rt = rtalloc_mpath(sintosa(&sin), &ip->ip_src.s_addr,
	    m->m_pkthdr.ph_rtableid);
	if (rtisvalid(rt)) {
		if (ISSET(rt->rt_flags, RTF_LOCAL))
			match = 1;

		/*
		 * If directedbcast is enabled we only consider it local
		 * if it is received on the interface with that address.
		 */
		if (ISSET(rt->rt_flags, RTF_BROADCAST) &&
		    (!ip_directedbcast || rt->rt_ifidx == ifp->if_index)) {
			match = 1;

			/* Make sure M_BCAST is set */
			m->m_flags |= M_BCAST;
		}
	}
	*prt = rt;

	if (!match) {
		struct ifaddr *ifa;

		/*
		 * No local address or broadcast address found, so check for
		 * ancient classful broadcast addresses.
		 * It must have been broadcast on the link layer, and for an
		 * address on the interface it was received on.
		 */
		if (!ISSET(m->m_flags, M_BCAST) ||
		    !IN_CLASSFULBROADCAST(ip->ip_dst.s_addr, ip->ip_dst.s_addr))
			return (0);

		if (ifp->if_rdomain != rtable_l2(m->m_pkthdr.ph_rtableid))
			return (0);
		/*
		 * The check in the loop assumes you only rx a packet on an UP
		 * interface, and that M_BCAST will only be set on a BROADCAST
		 * interface.
		 */
		NET_ASSERT_LOCKED();
		TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) {
			if (ifa->ifa_addr->sa_family != AF_INET)
				continue;

			if (IN_CLASSFULBROADCAST(ip->ip_dst.s_addr,
			    ifatoia(ifa)->ia_addr.sin_addr.s_addr)) {
				match = 1;
				break;
			}
		}
	} else if (ipforwarding == 0 && rt->rt_ifidx != ifp->if_index &&
	    !((ifp->if_flags & IFF_LOOPBACK) || (ifp->if_type == IFT_ENC) ||
	    (m->m_pkthdr.pf.flags & PF_TAG_TRANSLATE_LOCALHOST))) {
		/* received on wrong interface. */
#if NCARP > 0
		struct ifnet *out_if;

		/*
		 * Virtual IPs on carp interfaces need to be checked also
		 * against the parent interface and other carp interfaces
		 * sharing the same parent.
		 */
		out_if = if_get(rt->rt_ifidx);
		if (!(out_if && carp_strict_addr_chk(out_if, ifp))) {
			ipstat_inc(ips_wrongif);
			match = 2;
		}
		if_put(out_if);
#else
		ipstat_inc(ips_wrongif);
		match = 2;
#endif
	}

	return (match);
}

/*
 * Take incoming datagram fragment and try to
 * reassemble it into whole datagram.  If a chain for
 * reassembly of this datagram already exists, then it
 * is given as fp; otherwise have to make a chain.
 *
 * Must be called with ipq_mutex held.  ipqe and its mbuf are consumed
 * in every case.  Returns the complete datagram once the last missing
 * fragment has arrived, NULL otherwise (fragment queued or dropped).
 * On entry ip_off and ip_len of the fragment are expected in the form
 * ip_ours() leaves them: byte offset and payload length without header.
 */
struct mbuf *
ip_reass(struct ipqent *ipqe, struct ipq *fp)
{
	struct mbuf *m = ipqe->ipqe_m;
	struct ipqent *nq, *p, *q;
	struct ip *ip;
	struct mbuf *t;
	int hlen = ipqe->ipqe_ip->ip_hl << 2;
	int i, next;
	u_int8_t ecn, ecn0;

	MUTEX_ASSERT_LOCKED(&ipq_mutex);

	/*
	 * Presence of header sizes in mbufs
	 * would confuse code below.
	 */
	m->m_data += hlen;
	m->m_len -= hlen;

	/*
	 * If first fragment to arrive, create a reassembly queue.
	 */
	if (fp == NULL) {
		fp = pool_get(&ipq_pool, PR_NOWAIT);
		if (fp == NULL)
			goto dropfrag;
		LIST_INSERT_HEAD(&ipq, fp, ipq_q);
		fp->ipq_ttl = IPFRAGTTL;
		fp->ipq_p = ipqe->ipqe_ip->ip_p;
		fp->ipq_id = ipqe->ipqe_ip->ip_id;
		LIST_INIT(&fp->ipq_fragq);
		fp->ipq_src = ipqe->ipqe_ip->ip_src;
		fp->ipq_dst = ipqe->ipqe_ip->ip_dst;
		p = NULL;
		goto insert;
	}

	/*
	 * Handle ECN by comparing this segment with the first one;
	 * if CE is set, do not lose CE.
	 * drop if CE and not-ECT are mixed for the same packet.
	 */
	ecn = ipqe->ipqe_ip->ip_tos & IPTOS_ECN_MASK;
	ecn0 = LIST_FIRST(&fp->ipq_fragq)->ipqe_ip->ip_tos & IPTOS_ECN_MASK;
	if (ecn == IPTOS_ECN_CE) {
		if (ecn0 == IPTOS_ECN_NOTECT)
			goto dropfrag;
		if (ecn0 != IPTOS_ECN_CE)
			LIST_FIRST(&fp->ipq_fragq)->ipqe_ip->ip_tos |=
			    IPTOS_ECN_CE;
	}
	if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT)
		goto dropfrag;

	/*
	 * Find a segment which begins after this one does.
	 * Afterwards p is the segment before the insertion point (or NULL)
	 * and q the one after it (or NULL).
	 */
	for (p = NULL, q = LIST_FIRST(&fp->ipq_fragq); q != NULL;
	    p = q, q = LIST_NEXT(q, ipqe_q))
		if (ntohs(q->ipqe_ip->ip_off) > ntohs(ipqe->ipqe_ip->ip_off))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		i = ntohs(p->ipqe_ip->ip_off) + ntohs(p->ipqe_ip->ip_len) -
		    ntohs(ipqe->ipqe_ip->ip_off);
		if (i > 0) {
			if (i >= ntohs(ipqe->ipqe_ip->ip_len))
				goto dropfrag;
			m_adj(ipqe->ipqe_m, i);
			ipqe->ipqe_ip->ip_off =
			    htons(ntohs(ipqe->ipqe_ip->ip_off) + i);
			ipqe->ipqe_ip->ip_len =
			    htons(ntohs(ipqe->ipqe_ip->ip_len) - i);
		}
	}

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL &&
	    ntohs(ipqe->ipqe_ip->ip_off) + ntohs(ipqe->ipqe_ip->ip_len) >
	    ntohs(q->ipqe_ip->ip_off); q = nq) {
		i = (ntohs(ipqe->ipqe_ip->ip_off) +
		    ntohs(ipqe->ipqe_ip->ip_len)) - ntohs(q->ipqe_ip->ip_off);
		if (i < ntohs(q->ipqe_ip->ip_len)) {
			q->ipqe_ip->ip_len =
			    htons(ntohs(q->ipqe_ip->ip_len) - i);
			q->ipqe_ip->ip_off =
			    htons(ntohs(q->ipqe_ip->ip_off) + i);
			m_adj(q->ipqe_m, i);
			break;
		}
		nq = LIST_NEXT(q, ipqe_q);
		m_freem(q->ipqe_m);
		LIST_REMOVE(q, ipqe_q);
		pool_put(&ipqent_pool, q);
		ip_frags--;
	}

insert:
	/*
	 * Stick new segment in its place;
	 * check for complete reassembly.
	 */
	if (p == NULL) {
		LIST_INSERT_HEAD(&fp->ipq_fragq, ipqe, ipqe_q);
	} else {
		LIST_INSERT_AFTER(p, ipqe, ipqe_q);
	}
	/*
	 * Walk the queue: any hole means more fragments are needed.
	 * After the loop p is the last queued segment.
	 */
	next = 0;
	for (p = NULL, q = LIST_FIRST(&fp->ipq_fragq); q != NULL;
	    p = q, q = LIST_NEXT(q, ipqe_q)) {
		if (ntohs(q->ipqe_ip->ip_off) != next)
			return (0);
		next += ntohs(q->ipqe_ip->ip_len);
	}
	if (p->ipqe_mff)
		return (0);

	/*
	 * Reassembly is complete.  Check for a bogus message size and
	 * concatenate fragments.
	 */
	q = LIST_FIRST(&fp->ipq_fragq);
	ip = q->ipqe_ip;
	if ((next + (ip->ip_hl << 2)) > IP_MAXPACKET) {
		ipstat_inc(ips_toolong);
		ip_freef(fp);
		return (0);
	}
	m = q->ipqe_m;
	t = m->m_next;
	m->m_next = 0;
	m_cat(m, t);
	nq = LIST_NEXT(q, ipqe_q);
	pool_put(&ipqent_pool, q);
	ip_frags--;
	for (q = nq; q != NULL; q = nq) {
		t = q->ipqe_m;
		nq = LIST_NEXT(q, ipqe_q);
		pool_put(&ipqent_pool, q);
		ip_frags--;
		m_removehdr(t);
		m_cat(m, t);
	}

	/*
	 * Create header for new ip packet by
	 * modifying header of first packet;
	 * dequeue and discard fragment reassembly header.
	 * Make header visible.
	 */
	ip->ip_len = htons(next);
	ip->ip_src = fp->ipq_src;
	ip->ip_dst = fp->ipq_dst;
	LIST_REMOVE(fp, ipq_q);
	pool_put(&ipq_pool, fp);
	m->m_len += (ip->ip_hl << 2);
	m->m_data -= (ip->ip_hl << 2);
	m_calchdrlen(m);
	return (m);

dropfrag:
	/* Undo the ip_frags++ done by the caller for this entry. */
	ipstat_inc(ips_fragdropped);
	m_freem(m);
	pool_put(&ipqent_pool, ipqe);
	ip_frags--;
	return (NULL);
}

/*
 * Free a fragment reassembly header and all
 * associated datagrams.
 * Must be called with ipq_mutex held; fp is unlinked from ipq.
 */
void
ip_freef(struct ipq *fp)
{
	struct ipqent *q;

	MUTEX_ASSERT_LOCKED(&ipq_mutex);

	while ((q = LIST_FIRST(&fp->ipq_fragq)) != NULL) {
		LIST_REMOVE(q, ipqe_q);
		m_freem(q->ipqe_m);
		pool_put(&ipqent_pool, q);
		ip_frags--;
	}
	LIST_REMOVE(fp, ipq_q);
	pool_put(&ipq_pool, fp);
}

/*
 * IP timer processing;
 * if a timer expires on a reassembly queue, discard it.
 * Each call decrements ipq_ttl of every queue by one.
 */
void
ip_slowtimo(void)
{
	struct ipq *fp, *nfp;

	mtx_enter(&ipq_mutex);
	/* _SAFE variant: ip_freef() unlinks fp while we iterate. */
	LIST_FOREACH_SAFE(fp, &ipq, ipq_q, nfp) {
		if (--fp->ipq_ttl == 0) {
			ipstat_inc(ips_fragtimeout);
			ip_freef(fp);
		}
	}
	mtx_leave(&ipq_mutex);
}

/*
 * Flush a bunch of datagram fragments, till we are down to 75%.
 * Must be called with ipq_mutex held.  Whole queues are freed from the
 * head of ipq; the "max" countdown bounds the work of a single call.
 */
void
ip_flush(void)
{
	int max = 50;

	MUTEX_ASSERT_LOCKED(&ipq_mutex);

	while (!LIST_EMPTY(&ipq) && ip_frags > ip_maxqueue * 3 / 4 && --max) {
		ipstat_inc(ips_fragdropped);
		ip_freef(LIST_FIRST(&ipq));
	}
}

/*
 * Do option processing on a datagram,
 * possibly discarding it if bad options are encountered,
 * or forwarding it if source-routed.
 * Returns 1 if packet has been forwarded/freed,
 * 0 if the packet should be processed further.
1064 */ 1065 int 1066 ip_dooptions(struct mbuf *m, struct ifnet *ifp) 1067 { 1068 struct ip *ip = mtod(m, struct ip *); 1069 unsigned int rtableid = m->m_pkthdr.ph_rtableid; 1070 struct rtentry *rt; 1071 struct sockaddr_in ipaddr; 1072 u_char *cp; 1073 struct ip_timestamp ipt; 1074 struct in_ifaddr *ia; 1075 int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0; 1076 struct in_addr sin, dst; 1077 u_int32_t ntime; 1078 1079 dst = ip->ip_dst; 1080 cp = (u_char *)(ip + 1); 1081 cnt = (ip->ip_hl << 2) - sizeof (struct ip); 1082 1083 KERNEL_LOCK(); 1084 for (; cnt > 0; cnt -= optlen, cp += optlen) { 1085 opt = cp[IPOPT_OPTVAL]; 1086 if (opt == IPOPT_EOL) 1087 break; 1088 if (opt == IPOPT_NOP) 1089 optlen = 1; 1090 else { 1091 if (cnt < IPOPT_OLEN + sizeof(*cp)) { 1092 code = &cp[IPOPT_OLEN] - (u_char *)ip; 1093 goto bad; 1094 } 1095 optlen = cp[IPOPT_OLEN]; 1096 if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) { 1097 code = &cp[IPOPT_OLEN] - (u_char *)ip; 1098 goto bad; 1099 } 1100 } 1101 1102 switch (opt) { 1103 1104 default: 1105 break; 1106 1107 /* 1108 * Source routing with record. 1109 * Find interface with current destination address. 1110 * If none on this machine then drop if strictly routed, 1111 * or do nothing if loosely routed. 1112 * Record interface address and bring up next address 1113 * component. If strictly routed make sure next 1114 * address is on directly accessible net. 
1115 */ 1116 case IPOPT_LSRR: 1117 case IPOPT_SSRR: 1118 if (!ip_dosourceroute) { 1119 type = ICMP_UNREACH; 1120 code = ICMP_UNREACH_SRCFAIL; 1121 goto bad; 1122 } 1123 if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { 1124 code = &cp[IPOPT_OFFSET] - (u_char *)ip; 1125 goto bad; 1126 } 1127 memset(&ipaddr, 0, sizeof(ipaddr)); 1128 ipaddr.sin_family = AF_INET; 1129 ipaddr.sin_len = sizeof(ipaddr); 1130 ipaddr.sin_addr = ip->ip_dst; 1131 ia = ifatoia(ifa_ifwithaddr(sintosa(&ipaddr), 1132 m->m_pkthdr.ph_rtableid)); 1133 if (ia == NULL) { 1134 if (opt == IPOPT_SSRR) { 1135 type = ICMP_UNREACH; 1136 code = ICMP_UNREACH_SRCFAIL; 1137 goto bad; 1138 } 1139 /* 1140 * Loose routing, and not at next destination 1141 * yet; nothing to do except forward. 1142 */ 1143 break; 1144 } 1145 off--; /* 0 origin */ 1146 if ((off + sizeof(struct in_addr)) > optlen) { 1147 /* 1148 * End of source route. Should be for us. 1149 */ 1150 save_rte(m, cp, ip->ip_src); 1151 break; 1152 } 1153 1154 /* 1155 * locate outgoing interface 1156 */ 1157 memset(&ipaddr, 0, sizeof(ipaddr)); 1158 ipaddr.sin_family = AF_INET; 1159 ipaddr.sin_len = sizeof(ipaddr); 1160 memcpy(&ipaddr.sin_addr, cp + off, 1161 sizeof(ipaddr.sin_addr)); 1162 /* keep packet in the virtual instance */ 1163 rt = rtalloc(sintosa(&ipaddr), RT_RESOLVE, rtableid); 1164 if (!rtisvalid(rt) || ((opt == IPOPT_SSRR) && 1165 ISSET(rt->rt_flags, RTF_GATEWAY))) { 1166 type = ICMP_UNREACH; 1167 code = ICMP_UNREACH_SRCFAIL; 1168 rtfree(rt); 1169 goto bad; 1170 } 1171 ia = ifatoia(rt->rt_ifa); 1172 memcpy(cp + off, &ia->ia_addr.sin_addr, 1173 sizeof(struct in_addr)); 1174 rtfree(rt); 1175 cp[IPOPT_OFFSET] += sizeof(struct in_addr); 1176 ip->ip_dst = ipaddr.sin_addr; 1177 /* 1178 * Let ip_intr's mcast routing check handle mcast pkts 1179 */ 1180 forward = !IN_MULTICAST(ip->ip_dst.s_addr); 1181 break; 1182 1183 case IPOPT_RR: 1184 if (optlen < IPOPT_OFFSET + sizeof(*cp)) { 1185 code = &cp[IPOPT_OLEN] - (u_char *)ip; 1186 goto bad; 1187 } 1188 if 
((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { 1189 code = &cp[IPOPT_OFFSET] - (u_char *)ip; 1190 goto bad; 1191 } 1192 1193 /* 1194 * If no space remains, ignore. 1195 */ 1196 off--; /* 0 origin */ 1197 if ((off + sizeof(struct in_addr)) > optlen) 1198 break; 1199 memset(&ipaddr, 0, sizeof(ipaddr)); 1200 ipaddr.sin_family = AF_INET; 1201 ipaddr.sin_len = sizeof(ipaddr); 1202 ipaddr.sin_addr = ip->ip_dst; 1203 /* 1204 * locate outgoing interface; if we're the destination, 1205 * use the incoming interface (should be same). 1206 * Again keep the packet inside the virtual instance. 1207 */ 1208 rt = rtalloc(sintosa(&ipaddr), RT_RESOLVE, rtableid); 1209 if (!rtisvalid(rt)) { 1210 type = ICMP_UNREACH; 1211 code = ICMP_UNREACH_HOST; 1212 rtfree(rt); 1213 goto bad; 1214 } 1215 ia = ifatoia(rt->rt_ifa); 1216 memcpy(cp + off, &ia->ia_addr.sin_addr, 1217 sizeof(struct in_addr)); 1218 rtfree(rt); 1219 cp[IPOPT_OFFSET] += sizeof(struct in_addr); 1220 break; 1221 1222 case IPOPT_TS: 1223 code = cp - (u_char *)ip; 1224 if (optlen < sizeof(struct ip_timestamp)) 1225 goto bad; 1226 memcpy(&ipt, cp, sizeof(struct ip_timestamp)); 1227 if (ipt.ipt_ptr < 5 || ipt.ipt_len < 5) 1228 goto bad; 1229 if (ipt.ipt_ptr - 1 + sizeof(u_int32_t) > ipt.ipt_len) { 1230 if (++ipt.ipt_oflw == 0) 1231 goto bad; 1232 break; 1233 } 1234 memcpy(&sin, cp + ipt.ipt_ptr - 1, sizeof sin); 1235 switch (ipt.ipt_flg) { 1236 1237 case IPOPT_TS_TSONLY: 1238 break; 1239 1240 case IPOPT_TS_TSANDADDR: 1241 if (ipt.ipt_ptr - 1 + sizeof(u_int32_t) + 1242 sizeof(struct in_addr) > ipt.ipt_len) 1243 goto bad; 1244 memset(&ipaddr, 0, sizeof(ipaddr)); 1245 ipaddr.sin_family = AF_INET; 1246 ipaddr.sin_len = sizeof(ipaddr); 1247 ipaddr.sin_addr = dst; 1248 ia = ifatoia(ifaof_ifpforaddr(sintosa(&ipaddr), 1249 ifp)); 1250 if (ia == NULL) 1251 continue; 1252 memcpy(&sin, &ia->ia_addr.sin_addr, 1253 sizeof(struct in_addr)); 1254 ipt.ipt_ptr += sizeof(struct in_addr); 1255 break; 1256 1257 case IPOPT_TS_PRESPEC: 1258 if 
(ipt.ipt_ptr - 1 + sizeof(u_int32_t) + 1259 sizeof(struct in_addr) > ipt.ipt_len) 1260 goto bad; 1261 memset(&ipaddr, 0, sizeof(ipaddr)); 1262 ipaddr.sin_family = AF_INET; 1263 ipaddr.sin_len = sizeof(ipaddr); 1264 ipaddr.sin_addr = sin; 1265 if (ifa_ifwithaddr(sintosa(&ipaddr), 1266 m->m_pkthdr.ph_rtableid) == NULL) 1267 continue; 1268 ipt.ipt_ptr += sizeof(struct in_addr); 1269 break; 1270 1271 default: 1272 /* XXX can't take &ipt->ipt_flg */ 1273 code = (u_char *)&ipt.ipt_ptr - 1274 (u_char *)ip + 1; 1275 goto bad; 1276 } 1277 ntime = iptime(); 1278 memcpy(cp + ipt.ipt_ptr - 1, &ntime, sizeof(u_int32_t)); 1279 ipt.ipt_ptr += sizeof(u_int32_t); 1280 } 1281 } 1282 KERNEL_UNLOCK(); 1283 if (forward && ipforwarding > 0) { 1284 ip_forward(m, ifp, NULL, 1); 1285 return (1); 1286 } 1287 return (0); 1288 bad: 1289 KERNEL_UNLOCK(); 1290 icmp_error(m, type, code, 0, 0); 1291 ipstat_inc(ips_badoptions); 1292 return (1); 1293 } 1294 1295 /* 1296 * Save incoming source route for use in replies, 1297 * to be picked up later by ip_srcroute if the receiver is interested. 1298 */ 1299 void 1300 save_rte(struct mbuf *m, u_char *option, struct in_addr dst) 1301 { 1302 struct ip_srcrt *isr; 1303 struct m_tag *mtag; 1304 unsigned olen; 1305 1306 olen = option[IPOPT_OLEN]; 1307 if (olen > sizeof(isr->isr_hdr) + sizeof(isr->isr_routes)) 1308 return; 1309 1310 mtag = m_tag_get(PACKET_TAG_SRCROUTE, sizeof(*isr), M_NOWAIT); 1311 if (mtag == NULL) 1312 return; 1313 isr = (struct ip_srcrt *)(mtag + 1); 1314 1315 memcpy(isr->isr_hdr, option, olen); 1316 isr->isr_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr); 1317 isr->isr_dst = dst; 1318 m_tag_prepend(m, mtag); 1319 } 1320 1321 /* 1322 * Retrieve incoming source route for use in replies, 1323 * in the same form used by setsockopt. 1324 * The first hop is placed before the options, will be removed later. 
1325 */ 1326 struct mbuf * 1327 ip_srcroute(struct mbuf *m0) 1328 { 1329 struct in_addr *p, *q; 1330 struct mbuf *m; 1331 struct ip_srcrt *isr; 1332 struct m_tag *mtag; 1333 1334 if (!ip_dosourceroute) 1335 return (NULL); 1336 1337 mtag = m_tag_find(m0, PACKET_TAG_SRCROUTE, NULL); 1338 if (mtag == NULL) 1339 return (NULL); 1340 isr = (struct ip_srcrt *)(mtag + 1); 1341 1342 if (isr->isr_nhops == 0) 1343 return (NULL); 1344 m = m_get(M_DONTWAIT, MT_SOOPTS); 1345 if (m == NULL) 1346 return (NULL); 1347 1348 #define OPTSIZ (sizeof(isr->isr_nop) + sizeof(isr->isr_hdr)) 1349 1350 /* length is (nhops+1)*sizeof(addr) + sizeof(nop + header) */ 1351 m->m_len = (isr->isr_nhops + 1) * sizeof(struct in_addr) + OPTSIZ; 1352 1353 /* 1354 * First save first hop for return route 1355 */ 1356 p = &(isr->isr_routes[isr->isr_nhops - 1]); 1357 *(mtod(m, struct in_addr *)) = *p--; 1358 1359 /* 1360 * Copy option fields and padding (nop) to mbuf. 1361 */ 1362 isr->isr_nop = IPOPT_NOP; 1363 isr->isr_hdr[IPOPT_OFFSET] = IPOPT_MINOFF; 1364 memcpy(mtod(m, caddr_t) + sizeof(struct in_addr), &isr->isr_nop, 1365 OPTSIZ); 1366 q = (struct in_addr *)(mtod(m, caddr_t) + 1367 sizeof(struct in_addr) + OPTSIZ); 1368 #undef OPTSIZ 1369 /* 1370 * Record return path as an IP source route, 1371 * reversing the path (pointers are now aligned). 1372 */ 1373 while (p >= isr->isr_routes) { 1374 *q++ = *p--; 1375 } 1376 /* 1377 * Last hop goes to final destination. 1378 */ 1379 *q = isr->isr_dst; 1380 m_tag_delete(m0, (struct m_tag *)isr); 1381 return (m); 1382 } 1383 1384 /* 1385 * Strip out IP options, at higher level protocol in the kernel. 
1386 */ 1387 void 1388 ip_stripoptions(struct mbuf *m) 1389 { 1390 int i; 1391 struct ip *ip = mtod(m, struct ip *); 1392 caddr_t opts; 1393 int olen; 1394 1395 olen = (ip->ip_hl<<2) - sizeof (struct ip); 1396 opts = (caddr_t)(ip + 1); 1397 i = m->m_len - (sizeof (struct ip) + olen); 1398 memmove(opts, opts + olen, i); 1399 m->m_len -= olen; 1400 if (m->m_flags & M_PKTHDR) 1401 m->m_pkthdr.len -= olen; 1402 ip->ip_hl = sizeof(struct ip) >> 2; 1403 ip->ip_len = htons(ntohs(ip->ip_len) - olen); 1404 } 1405 1406 const u_char inetctlerrmap[PRC_NCMDS] = { 1407 0, 0, 0, 0, 1408 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, 1409 EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, 1410 EMSGSIZE, EHOSTUNREACH, 0, 0, 1411 0, 0, 0, 0, 1412 ENOPROTOOPT 1413 }; 1414 1415 /* 1416 * Forward a packet. If some error occurs return the sender 1417 * an icmp packet. Note we can't always generate a meaningful 1418 * icmp message because icmp doesn't have a large enough repertoire 1419 * of codes and types. 1420 * 1421 * If not forwarding, just drop the packet. This could be confusing 1422 * if ipforwarding was zero but some routing protocol was advancing 1423 * us as a gateway to somewhere. However, we must let the routing 1424 * protocol deal with that. 1425 * 1426 * The srcrt parameter indicates whether the packet is being forwarded 1427 * via a source route. 
1428 */ 1429 void 1430 ip_forward(struct mbuf *m, struct ifnet *ifp, struct rtentry *rt, int srcrt) 1431 { 1432 struct mbuf mfake, *mcopy = NULL; 1433 struct ip *ip = mtod(m, struct ip *); 1434 struct sockaddr_in *sin; 1435 struct route ro; 1436 int error = 0, type = 0, code = 0, destmtu = 0, fake = 0, len; 1437 u_int32_t dest; 1438 1439 dest = 0; 1440 if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) { 1441 ipstat_inc(ips_cantforward); 1442 m_freem(m); 1443 goto freecopy; 1444 } 1445 if (ip->ip_ttl <= IPTTLDEC) { 1446 icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, dest, 0); 1447 goto freecopy; 1448 } 1449 1450 memset(&ro, 0, sizeof(ro)); 1451 sin = satosin(&ro.ro_dst); 1452 sin->sin_family = AF_INET; 1453 sin->sin_len = sizeof(*sin); 1454 sin->sin_addr = ip->ip_dst; 1455 1456 if (!rtisvalid(rt)) { 1457 rtfree(rt); 1458 rt = rtalloc_mpath(sintosa(sin), &ip->ip_src.s_addr, 1459 m->m_pkthdr.ph_rtableid); 1460 if (rt == NULL) { 1461 ipstat_inc(ips_noroute); 1462 icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest, 0); 1463 return; 1464 } 1465 } 1466 1467 /* 1468 * Save at most 68 bytes of the packet in case 1469 * we need to generate an ICMP message to the src. 1470 * The data is saved in the mbuf on the stack that 1471 * acts as a temporary storage not intended to be 1472 * passed down the IP stack or to the mfree. 1473 */ 1474 memset(&mfake.m_hdr, 0, sizeof(mfake.m_hdr)); 1475 mfake.m_type = m->m_type; 1476 if (m_dup_pkthdr(&mfake, m, M_DONTWAIT) == 0) { 1477 mfake.m_data = mfake.m_pktdat; 1478 len = min(ntohs(ip->ip_len), 68); 1479 m_copydata(m, 0, len, mfake.m_pktdat); 1480 mfake.m_pkthdr.len = mfake.m_len = len; 1481 #if NPF > 0 1482 pf_pkt_addr_changed(&mfake); 1483 #endif /* NPF > 0 */ 1484 fake = 1; 1485 } 1486 1487 ip->ip_ttl -= IPTTLDEC; 1488 1489 /* 1490 * If forwarding packet using same interface that it came in on, 1491 * perhaps should send a redirect to sender to shortcut a hop. 
1492 * Only send redirect if source is sending directly to us, 1493 * and if packet was not source routed (or has any options). 1494 * Also, don't send redirect if forwarding using a default route 1495 * or a route modified by a redirect. 1496 * Don't send redirect if we advertise destination's arp address 1497 * as ours (proxy arp). 1498 */ 1499 if ((rt->rt_ifidx == ifp->if_index) && 1500 (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 && 1501 satosin(rt_key(rt))->sin_addr.s_addr != 0 && 1502 ipsendredirects && !srcrt && 1503 !arpproxy(satosin(rt_key(rt))->sin_addr, m->m_pkthdr.ph_rtableid)) { 1504 if ((ip->ip_src.s_addr & ifatoia(rt->rt_ifa)->ia_netmask) == 1505 ifatoia(rt->rt_ifa)->ia_net) { 1506 if (rt->rt_flags & RTF_GATEWAY) 1507 dest = satosin(rt->rt_gateway)->sin_addr.s_addr; 1508 else 1509 dest = ip->ip_dst.s_addr; 1510 /* Router requirements says to only send host redirects */ 1511 type = ICMP_REDIRECT; 1512 code = ICMP_REDIRECT_HOST; 1513 } 1514 } 1515 1516 ro.ro_rt = rt; 1517 ro.ro_tableid = m->m_pkthdr.ph_rtableid; 1518 error = ip_output(m, NULL, &ro, 1519 (IP_FORWARDING | (ip_directedbcast ? 
IP_ALLOWBROADCAST : 0)), 1520 NULL, NULL, 0); 1521 rt = ro.ro_rt; 1522 if (error) 1523 ipstat_inc(ips_cantforward); 1524 else { 1525 ipstat_inc(ips_forward); 1526 if (type) 1527 ipstat_inc(ips_redirectsent); 1528 else 1529 goto freecopy; 1530 } 1531 if (!fake) 1532 goto freecopy; 1533 1534 switch (error) { 1535 case 0: /* forwarded, but need redirect */ 1536 /* type, code set above */ 1537 break; 1538 1539 case EMSGSIZE: 1540 type = ICMP_UNREACH; 1541 code = ICMP_UNREACH_NEEDFRAG; 1542 if (rt != NULL) { 1543 if (rt->rt_mtu) { 1544 destmtu = rt->rt_mtu; 1545 } else { 1546 struct ifnet *destifp; 1547 1548 destifp = if_get(rt->rt_ifidx); 1549 if (destifp != NULL) 1550 destmtu = destifp->if_mtu; 1551 if_put(destifp); 1552 } 1553 } 1554 ipstat_inc(ips_cantfrag); 1555 if (destmtu == 0) 1556 goto freecopy; 1557 break; 1558 1559 case EACCES: 1560 /* 1561 * pf(4) blocked the packet. There is no need to send an ICMP 1562 * packet back since pf(4) takes care of it. 1563 */ 1564 goto freecopy; 1565 1566 case ENOBUFS: 1567 /* 1568 * a router should not generate ICMP_SOURCEQUENCH as 1569 * required in RFC1812 Requirements for IP Version 4 Routers. 1570 * source quench could be a big problem under DoS attacks, 1571 * or the underlying interface is rate-limited. 1572 */ 1573 goto freecopy; 1574 1575 case ENETUNREACH: /* shouldn't happen, checked above */ 1576 case EHOSTUNREACH: 1577 case ENETDOWN: 1578 case EHOSTDOWN: 1579 default: 1580 type = ICMP_UNREACH; 1581 code = ICMP_UNREACH_HOST; 1582 break; 1583 } 1584 mcopy = m_copym(&mfake, 0, len, M_DONTWAIT); 1585 if (mcopy) 1586 icmp_error(mcopy, type, code, dest, destmtu); 1587 1588 freecopy: 1589 if (fake) 1590 m_tag_delete_chain(&mfake); 1591 rtfree(rt); 1592 } 1593 1594 int 1595 ip_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, 1596 size_t newlen) 1597 { 1598 int error; 1599 #ifdef MROUTING 1600 extern struct mrtstat mrtstat; 1601 #endif 1602 1603 /* Almost all sysctl names at this level are terminal. 
*/ 1604 if (namelen != 1 && name[0] != IPCTL_IFQUEUE && 1605 name[0] != IPCTL_ARPQUEUE) 1606 return (ENOTDIR); 1607 1608 switch (name[0]) { 1609 case IPCTL_SOURCEROUTE: 1610 NET_LOCK(); 1611 error = sysctl_securelevel_int(oldp, oldlenp, newp, newlen, 1612 &ip_dosourceroute); 1613 NET_UNLOCK(); 1614 return (error); 1615 case IPCTL_MTUDISC: 1616 NET_LOCK(); 1617 error = sysctl_int(oldp, oldlenp, newp, newlen, &ip_mtudisc); 1618 if (ip_mtudisc == 0) { 1619 rt_timer_queue_destroy(ip_mtudisc_timeout_q); 1620 ip_mtudisc_timeout_q = 1621 rt_timer_queue_create(ip_mtudisc_timeout); 1622 } 1623 NET_UNLOCK(); 1624 return error; 1625 case IPCTL_MTUDISCTIMEOUT: 1626 NET_LOCK(); 1627 error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, 1628 &ip_mtudisc_timeout, 0, INT_MAX); 1629 rt_timer_queue_change(ip_mtudisc_timeout_q, 1630 ip_mtudisc_timeout); 1631 NET_UNLOCK(); 1632 return (error); 1633 #ifdef IPSEC 1634 case IPCTL_ENCDEBUG: 1635 case IPCTL_IPSEC_STATS: 1636 case IPCTL_IPSEC_EXPIRE_ACQUIRE: 1637 case IPCTL_IPSEC_EMBRYONIC_SA_TIMEOUT: 1638 case IPCTL_IPSEC_REQUIRE_PFS: 1639 case IPCTL_IPSEC_SOFT_ALLOCATIONS: 1640 case IPCTL_IPSEC_ALLOCATIONS: 1641 case IPCTL_IPSEC_SOFT_BYTES: 1642 case IPCTL_IPSEC_BYTES: 1643 case IPCTL_IPSEC_TIMEOUT: 1644 case IPCTL_IPSEC_SOFT_TIMEOUT: 1645 case IPCTL_IPSEC_SOFT_FIRSTUSE: 1646 case IPCTL_IPSEC_FIRSTUSE: 1647 case IPCTL_IPSEC_ENC_ALGORITHM: 1648 case IPCTL_IPSEC_AUTH_ALGORITHM: 1649 case IPCTL_IPSEC_IPCOMP_ALGORITHM: 1650 return (ipsec_sysctl(name, namelen, oldp, oldlenp, newp, 1651 newlen)); 1652 #endif 1653 case IPCTL_IFQUEUE: 1654 return (EOPNOTSUPP); 1655 case IPCTL_ARPQUEUE: 1656 return (sysctl_niq(name + 1, namelen - 1, 1657 oldp, oldlenp, newp, newlen, &arpinq)); 1658 case IPCTL_ARPQUEUED: 1659 return (sysctl_rdint(oldp, oldlenp, newp, la_hold_total)); 1660 case IPCTL_STATS: 1661 return (ip_sysctl_ipstat(oldp, oldlenp, newp)); 1662 #ifdef MROUTING 1663 case IPCTL_MRTSTATS: 1664 return (sysctl_rdstruct(oldp, oldlenp, newp, 1665 
&mrtstat, sizeof(mrtstat))); 1666 case IPCTL_MRTMFC: 1667 if (newp) 1668 return (EPERM); 1669 NET_LOCK(); 1670 error = mrt_sysctl_mfc(oldp, oldlenp); 1671 NET_UNLOCK(); 1672 return (error); 1673 case IPCTL_MRTVIF: 1674 if (newp) 1675 return (EPERM); 1676 NET_LOCK(); 1677 error = mrt_sysctl_vif(oldp, oldlenp); 1678 NET_UNLOCK(); 1679 return (error); 1680 #else 1681 case IPCTL_MRTPROTO: 1682 case IPCTL_MRTSTATS: 1683 case IPCTL_MRTMFC: 1684 case IPCTL_MRTVIF: 1685 return (EOPNOTSUPP); 1686 #endif 1687 default: 1688 NET_LOCK(); 1689 error = sysctl_bounded_arr(ipctl_vars, nitems(ipctl_vars), 1690 name, namelen, oldp, oldlenp, newp, newlen); 1691 NET_UNLOCK(); 1692 return (error); 1693 } 1694 /* NOTREACHED */ 1695 } 1696 1697 int 1698 ip_sysctl_ipstat(void *oldp, size_t *oldlenp, void *newp) 1699 { 1700 uint64_t counters[ips_ncounters]; 1701 struct ipstat ipstat; 1702 u_long *words = (u_long *)&ipstat; 1703 int i; 1704 1705 CTASSERT(sizeof(ipstat) == (nitems(counters) * sizeof(u_long))); 1706 memset(&ipstat, 0, sizeof ipstat); 1707 counters_read(ipcounters, counters, nitems(counters)); 1708 1709 for (i = 0; i < nitems(counters); i++) 1710 words[i] = (u_long)counters[i]; 1711 1712 return (sysctl_rdstruct(oldp, oldlenp, newp, &ipstat, sizeof(ipstat))); 1713 } 1714 1715 void 1716 ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip, 1717 struct mbuf *m) 1718 { 1719 if (inp->inp_socket->so_options & SO_TIMESTAMP) { 1720 struct timeval tv; 1721 1722 m_microtime(m, &tv); 1723 *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv), 1724 SCM_TIMESTAMP, SOL_SOCKET); 1725 if (*mp) 1726 mp = &(*mp)->m_next; 1727 } 1728 1729 if (inp->inp_flags & INP_RECVDSTADDR) { 1730 *mp = sbcreatecontrol((caddr_t) &ip->ip_dst, 1731 sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP); 1732 if (*mp) 1733 mp = &(*mp)->m_next; 1734 } 1735 #ifdef notyet 1736 /* this code is broken and will probably never be fixed. 
*/ 1737 /* options were tossed already */ 1738 if (inp->inp_flags & INP_RECVOPTS) { 1739 *mp = sbcreatecontrol((caddr_t) opts_deleted_above, 1740 sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP); 1741 if (*mp) 1742 mp = &(*mp)->m_next; 1743 } 1744 /* ip_srcroute doesn't do what we want here, need to fix */ 1745 if (inp->inp_flags & INP_RECVRETOPTS) { 1746 *mp = sbcreatecontrol((caddr_t) ip_srcroute(m), 1747 sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP); 1748 if (*mp) 1749 mp = &(*mp)->m_next; 1750 } 1751 #endif 1752 if (inp->inp_flags & INP_RECVIF) { 1753 struct sockaddr_dl sdl; 1754 struct ifnet *ifp; 1755 1756 ifp = if_get(m->m_pkthdr.ph_ifidx); 1757 if (ifp == NULL || ifp->if_sadl == NULL) { 1758 memset(&sdl, 0, sizeof(sdl)); 1759 sdl.sdl_len = offsetof(struct sockaddr_dl, sdl_data[0]); 1760 sdl.sdl_family = AF_LINK; 1761 sdl.sdl_index = ifp != NULL ? ifp->if_index : 0; 1762 sdl.sdl_nlen = sdl.sdl_alen = sdl.sdl_slen = 0; 1763 *mp = sbcreatecontrol((caddr_t) &sdl, sdl.sdl_len, 1764 IP_RECVIF, IPPROTO_IP); 1765 } else { 1766 *mp = sbcreatecontrol((caddr_t) ifp->if_sadl, 1767 ifp->if_sadl->sdl_len, IP_RECVIF, IPPROTO_IP); 1768 } 1769 if (*mp) 1770 mp = &(*mp)->m_next; 1771 if_put(ifp); 1772 } 1773 if (inp->inp_flags & INP_RECVTTL) { 1774 *mp = sbcreatecontrol((caddr_t) &ip->ip_ttl, 1775 sizeof(u_int8_t), IP_RECVTTL, IPPROTO_IP); 1776 if (*mp) 1777 mp = &(*mp)->m_next; 1778 } 1779 if (inp->inp_flags & INP_RECVRTABLE) { 1780 u_int rtableid = inp->inp_rtableid; 1781 1782 #if NPF > 0 1783 if (m && m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) { 1784 struct pf_divert *divert; 1785 1786 divert = pf_find_divert(m); 1787 KASSERT(divert != NULL); 1788 rtableid = divert->rdomain; 1789 } 1790 #endif 1791 1792 *mp = sbcreatecontrol((caddr_t) &rtableid, 1793 sizeof(u_int), IP_RECVRTABLE, IPPROTO_IP); 1794 if (*mp) 1795 mp = &(*mp)->m_next; 1796 } 1797 } 1798 1799 void 1800 ip_send_do_dispatch(void *xmq, int flags) 1801 { 1802 struct mbuf_queue *mq = xmq; 1803 struct mbuf *m; 
1804 struct mbuf_list ml; 1805 struct m_tag *mtag; 1806 u_int32_t ipsecflowinfo = 0; 1807 1808 mq_delist(mq, &ml); 1809 if (ml_empty(&ml)) 1810 return; 1811 1812 NET_LOCK(); 1813 while ((m = ml_dequeue(&ml)) != NULL) { 1814 if ((mtag = m_tag_find(m, PACKET_TAG_IPSEC_FLOWINFO, NULL)) 1815 != NULL) { 1816 ipsecflowinfo = *(u_int32_t *)(mtag + 1); 1817 m_tag_delete(m, mtag); 1818 } 1819 ip_output(m, NULL, NULL, flags, NULL, NULL, ipsecflowinfo); 1820 } 1821 NET_UNLOCK(); 1822 } 1823 1824 void 1825 ip_sendraw_dispatch(void *xmq) 1826 { 1827 ip_send_do_dispatch(xmq, IP_RAWOUTPUT); 1828 } 1829 1830 void 1831 ip_send_dispatch(void *xmq) 1832 { 1833 ip_send_do_dispatch(xmq, 0); 1834 } 1835 1836 void 1837 ip_send(struct mbuf *m) 1838 { 1839 mq_enqueue(&ipsend_mq, m); 1840 task_add(net_tq(0), &ipsend_task); 1841 } 1842 1843 void 1844 ip_send_raw(struct mbuf *m) 1845 { 1846 mq_enqueue(&ipsendraw_mq, m); 1847 task_add(net_tq(0), &ipsendraw_task); 1848 } 1849