1 /* $OpenBSD: ip_input.c,v 1.363 2021/06/21 22:09:14 jca Exp $ */ 2 /* $NetBSD: ip_input.c,v 1.30 1996/03/16 23:53:58 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 
 *
 *	@(#)ip_input.c	8.2 (Berkeley) 1/4/94
 */

#include "pf.h"
#include "carp.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/pool.h>
#include <sys/task.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/netisr.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/if_ether.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/ip_icmp.h>
#include <net/if_types.h>

#ifdef INET6
#include <netinet6/ip6protosw.h>
#include <netinet6/ip6_var.h>
#endif

#if NPF > 0
#include <net/pfvar.h>
#endif

#ifdef MROUTING
#include <netinet/ip_mroute.h>
#endif

#ifdef IPSEC
#include <netinet/ip_ipsp.h>
#endif /* IPSEC */

#if NCARP > 0
#include <netinet/ip_carp.h>
#endif

/*
 * Values controllable via sysctl.  The initialisers below are the
 * boot-time defaults; most of these are exported through ipctl_vars[].
 */
int	ipforwarding = 0;	/* 0: packets not addressed to us are dropped */
int	ipmforwarding = 0;	/* gates the ip_mforward() call on input */
int	ipmultipath = 0;
int	ipsendredirects = 1;
int	ip_dosourceroute = 0;	/* 0: LSRR/SSRR options are rejected */
int	ip_defttl = IPDEFTTL;
int	ip_mtudisc = 1;		/* non-zero: ip_init() creates the timer queue */
u_int	ip_mtudisc_timeout = IPMTUDISCTIMEOUT;
int	ip_directedbcast = 0;

/* Created by ip_init() when ip_mtudisc is non-zero. */
struct rttimer_queue *ip_mtudisc_timeout_q = NULL;

/* Protects `ipq' and `ip_frags'. */
struct mutex	ipq_mutex = MUTEX_INITIALIZER(IPL_SOFTNET);

/* IP reassembly queue: one struct ipq per datagram being reassembled. */
LIST_HEAD(, ipq) ipq;

/*
 * Keep track of memory used for reassembly.
 * ip_frags counts the fragment entries currently queued; ip_ours()
 * refuses new fragments once ip_frags would exceed ip_maxqueue.
 */
int	ip_maxqueue = 300;
int	ip_frags = 0;

#ifdef MROUTING
extern int ip_mrtproto;
#endif

/*
 * sysctl identifiers paired with the integer each one exposes.
 * NOTE(review): the two trailing numbers are presumably the inclusive
 * minimum and maximum accepted on write -- confirm against the
 * definition of struct sysctl_bounded_args.
 */
const struct sysctl_bounded_args ipctl_vars[] = {
#ifdef MROUTING
	{ IPCTL_MRTPROTO, &ip_mrtproto, SYSCTL_INT_READONLY },
#endif
	{ IPCTL_FORWARDING, &ipforwarding, 0, 2 },
	{ IPCTL_SENDREDIRECTS, &ipsendredirects, 0, 1 },
	{ IPCTL_DEFTTL, &ip_defttl, 0, 255 },
	{ IPCTL_DIRECTEDBCAST, &ip_directedbcast, 0, 1 },
	{ IPCTL_IPPORT_FIRSTAUTO, &ipport_firstauto, 0, 65535 },
	{ IPCTL_IPPORT_LASTAUTO, &ipport_lastauto, 0, 65535 },
	{ IPCTL_IPPORT_HIFIRSTAUTO, &ipport_hifirstauto, 0, 65535 },
	{ IPCTL_IPPORT_HILASTAUTO, &ipport_hilastauto, 0, 65535 },
	{ IPCTL_IPPORT_MAXQUEUE, &ip_maxqueue, 0, 10000 },
	{ IPCTL_MFORWARDING, &ipmforwarding, 0, 1 },
	{ IPCTL_MULTIPATH, &ipmultipath, 0, 1 },
	{ IPCTL_ARPTIMEOUT, &arpt_keep, 0, INT_MAX },
	{ IPCTL_ARPDOWN, &arpt_down, 0, INT_MAX },
};

/* Allocation pools for fragment entries and queue heads; see ip_init(). */
struct pool ipqent_pool;
struct pool ipq_pool;

/* Per-CPU IP statistics, allocated in ip_init(). */
struct cpumem *ipcounters;

int ip_sysctl_ipstat(void *, size_t *, void *);

/* Deferred-send queues, initialised in ip_init(). */
static struct mbuf_queue	ipsend_mq;
static struct mbuf_queue	ipsendraw_mq;

extern struct niqueue		arpinq;

int	ip_ours(struct mbuf **, int *, int, int);
int	ip_dooptions(struct mbuf *, struct ifnet *);
int	in_ouraddr(struct mbuf *, struct ifnet *, struct rtentry **);

/* Tasks draining the two send queues above. */
static void ip_send_dispatch(void *);
static void ip_sendraw_dispatch(void *);
static struct task ipsend_task = TASK_INITIALIZER(ip_send_dispatch, &ipsend_mq);
static struct task ipsendraw_task =
	TASK_INITIALIZER(ip_sendraw_dispatch, &ipsendraw_mq);

/*
 * Used to save the IP options in case a protocol wants to respond
 * to an incoming packet over the same route if the
packet got here 158 * using IP source routing. This allows connection establishment and 159 * maintenance when the remote end is on a network that is not known 160 * to us. 161 */ 162 struct ip_srcrt { 163 int isr_nhops; /* number of hops */ 164 struct in_addr isr_dst; /* final destination */ 165 char isr_nop; /* one NOP to align */ 166 char isr_hdr[IPOPT_OFFSET + 1]; /* OPTVAL, OLEN & OFFSET */ 167 struct in_addr isr_routes[MAX_IPOPTLEN/sizeof(struct in_addr)]; 168 }; 169 170 void save_rte(struct mbuf *, u_char *, struct in_addr); 171 172 /* 173 * IP initialization: fill in IP protocol switch table. 174 * All protocols not implemented in kernel go to raw IP protocol handler. 175 */ 176 void 177 ip_init(void) 178 { 179 const struct protosw *pr; 180 int i; 181 const u_int16_t defbaddynamicports_tcp[] = DEFBADDYNAMICPORTS_TCP; 182 const u_int16_t defbaddynamicports_udp[] = DEFBADDYNAMICPORTS_UDP; 183 const u_int16_t defrootonlyports_tcp[] = DEFROOTONLYPORTS_TCP; 184 const u_int16_t defrootonlyports_udp[] = DEFROOTONLYPORTS_UDP; 185 186 ipcounters = counters_alloc(ips_ncounters); 187 188 pool_init(&ipqent_pool, sizeof(struct ipqent), 0, 189 IPL_SOFTNET, 0, "ipqe", NULL); 190 pool_init(&ipq_pool, sizeof(struct ipq), 0, 191 IPL_SOFTNET, 0, "ipq", NULL); 192 193 pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); 194 if (pr == NULL) 195 panic("ip_init"); 196 for (i = 0; i < IPPROTO_MAX; i++) 197 ip_protox[i] = pr - inetsw; 198 for (pr = inetdomain.dom_protosw; 199 pr < inetdomain.dom_protoswNPROTOSW; pr++) 200 if (pr->pr_domain->dom_family == PF_INET && 201 pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW && 202 pr->pr_protocol < IPPROTO_MAX) 203 ip_protox[pr->pr_protocol] = pr - inetsw; 204 LIST_INIT(&ipq); 205 if (ip_mtudisc != 0) 206 ip_mtudisc_timeout_q = 207 rt_timer_queue_create(ip_mtudisc_timeout); 208 209 /* Fill in list of ports not to allocate dynamically. 
*/ 210 memset(&baddynamicports, 0, sizeof(baddynamicports)); 211 for (i = 0; defbaddynamicports_tcp[i] != 0; i++) 212 DP_SET(baddynamicports.tcp, defbaddynamicports_tcp[i]); 213 for (i = 0; defbaddynamicports_udp[i] != 0; i++) 214 DP_SET(baddynamicports.udp, defbaddynamicports_udp[i]); 215 216 /* Fill in list of ports only root can bind to. */ 217 memset(&rootonlyports, 0, sizeof(rootonlyports)); 218 for (i = 0; defrootonlyports_tcp[i] != 0; i++) 219 DP_SET(rootonlyports.tcp, defrootonlyports_tcp[i]); 220 for (i = 0; defrootonlyports_udp[i] != 0; i++) 221 DP_SET(rootonlyports.udp, defrootonlyports_udp[i]); 222 223 mq_init(&ipsend_mq, 64, IPL_SOFTNET); 224 mq_init(&ipsendraw_mq, 64, IPL_SOFTNET); 225 226 arpinit(); 227 #ifdef IPSEC 228 ipsec_init(); 229 #endif 230 } 231 232 /* 233 * IPv4 input routine. 234 * 235 * Checksum and byte swap header. Process options. Forward or deliver. 236 */ 237 void 238 ipv4_input(struct ifnet *ifp, struct mbuf *m) 239 { 240 int off, nxt; 241 242 off = 0; 243 nxt = ip_input_if(&m, &off, IPPROTO_IPV4, AF_UNSPEC, ifp); 244 KASSERT(nxt == IPPROTO_DONE); 245 } 246 247 struct mbuf * 248 ipv4_check(struct ifnet *ifp, struct mbuf *m) 249 { 250 struct ip *ip; 251 int hlen, len; 252 253 if (m->m_len < sizeof(*ip)) { 254 m = m_pullup(m, sizeof(*ip)); 255 if (m == NULL) { 256 ipstat_inc(ips_toosmall); 257 return (NULL); 258 } 259 } 260 261 ip = mtod(m, struct ip *); 262 if (ip->ip_v != IPVERSION) { 263 ipstat_inc(ips_badvers); 264 goto bad; 265 } 266 267 hlen = ip->ip_hl << 2; 268 if (hlen < sizeof(*ip)) { /* minimum header length */ 269 ipstat_inc(ips_badhlen); 270 goto bad; 271 } 272 if (hlen > m->m_len) { 273 m = m_pullup(m, hlen); 274 if (m == NULL) { 275 ipstat_inc(ips_badhlen); 276 return (NULL); 277 } 278 ip = mtod(m, struct ip *); 279 } 280 281 /* 127/8 must not appear on wire - RFC1122 */ 282 if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || 283 (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { 284 
if ((ifp->if_flags & IFF_LOOPBACK) == 0) { 285 ipstat_inc(ips_badaddr); 286 goto bad; 287 } 288 } 289 290 if (!ISSET(m->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_OK)) { 291 if (ISSET(m->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_BAD)) { 292 ipstat_inc(ips_badsum); 293 goto bad; 294 } 295 296 ipstat_inc(ips_inswcsum); 297 if (in_cksum(m, hlen) != 0) { 298 ipstat_inc(ips_badsum); 299 goto bad; 300 } 301 302 SET(m->m_pkthdr.csum_flags, M_IPV4_CSUM_IN_OK); 303 } 304 305 /* Retrieve the packet length. */ 306 len = ntohs(ip->ip_len); 307 308 /* 309 * Convert fields to host representation. 310 */ 311 if (len < hlen) { 312 ipstat_inc(ips_badlen); 313 goto bad; 314 } 315 316 /* 317 * Check that the amount of data in the buffers 318 * is at least as much as the IP header would have us expect. 319 * Trim mbufs if longer than we expect. 320 * Drop packet if shorter than we expect. 321 */ 322 if (m->m_pkthdr.len < len) { 323 ipstat_inc(ips_tooshort); 324 goto bad; 325 } 326 if (m->m_pkthdr.len > len) { 327 if (m->m_len == m->m_pkthdr.len) { 328 m->m_len = len; 329 m->m_pkthdr.len = len; 330 } else 331 m_adj(m, len - m->m_pkthdr.len); 332 } 333 334 return (m); 335 bad: 336 m_freem(m); 337 return (NULL); 338 } 339 340 int 341 ip_input_if(struct mbuf **mp, int *offp, int nxt, int af, struct ifnet *ifp) 342 { 343 struct mbuf *m; 344 struct rtentry *rt = NULL; 345 struct ip *ip; 346 int hlen; 347 in_addr_t pfrdr = 0; 348 349 KASSERT(*offp == 0); 350 351 ipstat_inc(ips_total); 352 m = *mp = ipv4_check(ifp, *mp); 353 if (m == NULL) 354 goto bad; 355 356 ip = mtod(m, struct ip *); 357 358 #if NCARP > 0 359 if (carp_lsdrop(ifp, m, AF_INET, &ip->ip_src.s_addr, 360 &ip->ip_dst.s_addr, (ip->ip_p == IPPROTO_ICMP ? 
0 : 1))) 361 goto bad; 362 #endif 363 364 #if NPF > 0 365 /* 366 * Packet filter 367 */ 368 pfrdr = ip->ip_dst.s_addr; 369 if (pf_test(AF_INET, PF_IN, ifp, mp) != PF_PASS) 370 goto bad; 371 m = *mp; 372 if (m == NULL) 373 goto bad; 374 375 ip = mtod(m, struct ip *); 376 pfrdr = (pfrdr != ip->ip_dst.s_addr); 377 #endif 378 379 hlen = ip->ip_hl << 2; 380 381 /* 382 * Process options and, if not destined for us, 383 * ship it on. ip_dooptions returns 1 when an 384 * error was detected (causing an icmp message 385 * to be sent and the original packet to be freed). 386 */ 387 if (hlen > sizeof (struct ip) && ip_dooptions(m, ifp)) { 388 m = *mp = NULL; 389 goto bad; 390 } 391 392 if (ip->ip_dst.s_addr == INADDR_BROADCAST || 393 ip->ip_dst.s_addr == INADDR_ANY) { 394 nxt = ip_ours(mp, offp, nxt, af); 395 goto out; 396 } 397 398 switch(in_ouraddr(m, ifp, &rt)) { 399 case 2: 400 goto bad; 401 case 1: 402 nxt = ip_ours(mp, offp, nxt, af); 403 goto out; 404 } 405 406 if (IN_MULTICAST(ip->ip_dst.s_addr)) { 407 /* 408 * Make sure M_MCAST is set. It should theoretically 409 * already be there, but let's play safe because upper 410 * layers check for this flag. 411 */ 412 m->m_flags |= M_MCAST; 413 414 #ifdef MROUTING 415 if (ipmforwarding && ip_mrouter[ifp->if_rdomain]) { 416 int error; 417 418 if (m->m_flags & M_EXT) { 419 if ((m = *mp = m_pullup(m, hlen)) == NULL) { 420 ipstat_inc(ips_toosmall); 421 goto bad; 422 } 423 ip = mtod(m, struct ip *); 424 } 425 /* 426 * If we are acting as a multicast router, all 427 * incoming multicast packets are passed to the 428 * kernel-level multicast forwarding function. 429 * The packet is returned (relatively) intact; if 430 * ip_mforward() returns a non-zero value, the packet 431 * must be discarded, else it may be accepted below. 432 * 433 * (The IP ident field is put in the same byte order 434 * as expected when ip_mforward() is called from 435 * ip_output().) 
436 */ 437 KERNEL_LOCK(); 438 error = ip_mforward(m, ifp); 439 KERNEL_UNLOCK(); 440 if (error) { 441 ipstat_inc(ips_cantforward); 442 goto bad; 443 } 444 445 /* 446 * The process-level routing daemon needs to receive 447 * all multicast IGMP packets, whether or not this 448 * host belongs to their destination groups. 449 */ 450 if (ip->ip_p == IPPROTO_IGMP) { 451 nxt = ip_ours(mp, offp, nxt, af); 452 goto out; 453 } 454 ipstat_inc(ips_forward); 455 } 456 #endif 457 /* 458 * See if we belong to the destination multicast group on the 459 * arrival interface. 460 */ 461 if (!in_hasmulti(&ip->ip_dst, ifp)) { 462 ipstat_inc(ips_notmember); 463 if (!IN_LOCAL_GROUP(ip->ip_dst.s_addr)) 464 ipstat_inc(ips_cantforward); 465 goto bad; 466 } 467 nxt = ip_ours(mp, offp, nxt, af); 468 goto out; 469 } 470 471 #if NCARP > 0 472 if (ip->ip_p == IPPROTO_ICMP && 473 carp_lsdrop(ifp, m, AF_INET, &ip->ip_src.s_addr, 474 &ip->ip_dst.s_addr, 1)) 475 goto bad; 476 #endif 477 /* 478 * Not for us; forward if possible and desirable. 479 */ 480 if (ipforwarding == 0) { 481 ipstat_inc(ips_cantforward); 482 goto bad; 483 } 484 #ifdef IPSEC 485 if (ipsec_in_use) { 486 int rv; 487 488 rv = ipsec_forward_check(m, hlen, AF_INET); 489 if (rv != 0) { 490 ipstat_inc(ips_cantforward); 491 goto bad; 492 } 493 /* 494 * Fall through, forward packet. Outbound IPsec policy 495 * checking will occur in ip_output(). 496 */ 497 } 498 #endif /* IPSEC */ 499 500 ip_forward(m, ifp, rt, pfrdr); 501 *mp = NULL; 502 return IPPROTO_DONE; 503 bad: 504 nxt = IPPROTO_DONE; 505 m_freemp(mp); 506 out: 507 rtfree(rt); 508 return nxt; 509 } 510 511 /* 512 * IPv4 local-delivery routine. 513 * 514 * If fragmented try to reassemble. Pass to next level. 
515 */ 516 int 517 ip_ours(struct mbuf **mp, int *offp, int nxt, int af) 518 { 519 struct mbuf *m = *mp; 520 struct ip *ip = mtod(m, struct ip *); 521 struct ipq *fp; 522 struct ipqent *ipqe; 523 int mff, hlen; 524 525 hlen = ip->ip_hl << 2; 526 527 /* 528 * If offset or IP_MF are set, must reassemble. 529 * Otherwise, nothing need be done. 530 * (We could look in the reassembly queue to see 531 * if the packet was previously fragmented, 532 * but it's not worth the time; just let them time out.) 533 */ 534 if (ip->ip_off &~ htons(IP_DF | IP_RF)) { 535 if (m->m_flags & M_EXT) { /* XXX */ 536 if ((m = *mp = m_pullup(m, hlen)) == NULL) { 537 ipstat_inc(ips_toosmall); 538 return IPPROTO_DONE; 539 } 540 ip = mtod(m, struct ip *); 541 } 542 543 mtx_enter(&ipq_mutex); 544 545 /* 546 * Look for queue of fragments 547 * of this datagram. 548 */ 549 LIST_FOREACH(fp, &ipq, ipq_q) { 550 if (ip->ip_id == fp->ipq_id && 551 ip->ip_src.s_addr == fp->ipq_src.s_addr && 552 ip->ip_dst.s_addr == fp->ipq_dst.s_addr && 553 ip->ip_p == fp->ipq_p) 554 break; 555 } 556 557 /* 558 * Adjust ip_len to not reflect header, 559 * set ipqe_mff if more fragments are expected, 560 * convert offset of this to bytes. 561 */ 562 ip->ip_len = htons(ntohs(ip->ip_len) - hlen); 563 mff = (ip->ip_off & htons(IP_MF)) != 0; 564 if (mff) { 565 /* 566 * Make sure that fragments have a data length 567 * that's a non-zero multiple of 8 bytes. 568 */ 569 if (ntohs(ip->ip_len) == 0 || 570 (ntohs(ip->ip_len) & 0x7) != 0) { 571 ipstat_inc(ips_badfrags); 572 goto bad; 573 } 574 } 575 ip->ip_off = htons(ntohs(ip->ip_off) << 3); 576 577 /* 578 * If datagram marked as having more fragments 579 * or if this is not the first fragment, 580 * attempt reassembly; if it succeeds, proceed. 
581 */ 582 if (mff || ip->ip_off) { 583 ipstat_inc(ips_fragments); 584 if (ip_frags + 1 > ip_maxqueue) { 585 ip_flush(); 586 ipstat_inc(ips_rcvmemdrop); 587 goto bad; 588 } 589 590 ipqe = pool_get(&ipqent_pool, PR_NOWAIT); 591 if (ipqe == NULL) { 592 ipstat_inc(ips_rcvmemdrop); 593 goto bad; 594 } 595 ip_frags++; 596 ipqe->ipqe_mff = mff; 597 ipqe->ipqe_m = m; 598 ipqe->ipqe_ip = ip; 599 m = *mp = ip_reass(ipqe, fp); 600 if (m == NULL) 601 goto bad; 602 ipstat_inc(ips_reassembled); 603 ip = mtod(m, struct ip *); 604 hlen = ip->ip_hl << 2; 605 ip->ip_len = htons(ntohs(ip->ip_len) + hlen); 606 } else 607 if (fp) 608 ip_freef(fp); 609 610 mtx_leave(&ipq_mutex); 611 } 612 613 *offp = hlen; 614 nxt = ip->ip_p; 615 /* Check whether we are already in a IPv4/IPv6 local deliver loop. */ 616 if (af == AF_UNSPEC) 617 nxt = ip_deliver(mp, offp, nxt, AF_INET); 618 return nxt; 619 bad: 620 mtx_leave(&ipq_mutex); 621 m_freemp(mp); 622 return IPPROTO_DONE; 623 } 624 625 #ifndef INET6 626 #define IPSTAT_INC(name) ipstat_inc(ips_##name) 627 #else 628 #define IPSTAT_INC(name) (af == AF_INET ? 
\ 629 ipstat_inc(ips_##name) : ip6stat_inc(ip6s_##name)) 630 #endif 631 632 int 633 ip_deliver(struct mbuf **mp, int *offp, int nxt, int af) 634 { 635 const struct protosw *psw; 636 int naf = af; 637 #ifdef INET6 638 int nest = 0; 639 #endif /* INET6 */ 640 641 /* pf might have modified stuff, might have to chksum */ 642 switch (af) { 643 case AF_INET: 644 in_proto_cksum_out(*mp, NULL); 645 break; 646 #ifdef INET6 647 case AF_INET6: 648 in6_proto_cksum_out(*mp, NULL); 649 break; 650 #endif /* INET6 */ 651 } 652 653 /* 654 * Tell launch routine the next header 655 */ 656 IPSTAT_INC(delivered); 657 658 while (nxt != IPPROTO_DONE) { 659 #ifdef INET6 660 if (af == AF_INET6 && 661 ip6_hdrnestlimit && (++nest > ip6_hdrnestlimit)) { 662 ip6stat_inc(ip6s_toomanyhdr); 663 goto bad; 664 } 665 #endif /* INET6 */ 666 667 /* 668 * protection against faulty packet - there should be 669 * more sanity checks in header chain processing. 670 */ 671 if ((*mp)->m_pkthdr.len < *offp) { 672 IPSTAT_INC(tooshort); 673 goto bad; 674 } 675 676 #ifdef IPSEC 677 if (ipsec_in_use) { 678 if (ipsec_local_check(*mp, *offp, nxt, af) != 0) { 679 IPSTAT_INC(cantforward); 680 goto bad; 681 } 682 } 683 /* Otherwise, just fall through and deliver the packet */ 684 #endif /* IPSEC */ 685 686 switch (nxt) { 687 case IPPROTO_IPV4: 688 naf = AF_INET; 689 ipstat_inc(ips_delivered); 690 break; 691 #ifdef INET6 692 case IPPROTO_IPV6: 693 naf = AF_INET6; 694 ip6stat_inc(ip6s_delivered); 695 break; 696 #endif /* INET6 */ 697 } 698 switch (af) { 699 case AF_INET: 700 psw = &inetsw[ip_protox[nxt]]; 701 break; 702 #ifdef INET6 703 case AF_INET6: 704 psw = &inet6sw[ip6_protox[nxt]]; 705 break; 706 #endif /* INET6 */ 707 } 708 nxt = (*psw->pr_input)(mp, offp, nxt, af); 709 af = naf; 710 } 711 return nxt; 712 bad: 713 m_freemp(mp); 714 return IPPROTO_DONE; 715 } 716 #undef IPSTAT_INC 717 718 int 719 in_ouraddr(struct mbuf *m, struct ifnet *ifp, struct rtentry **prt) 720 { 721 struct rtentry *rt; 722 struct ip *ip; 
723 struct sockaddr_in sin; 724 int match = 0; 725 726 #if NPF > 0 727 switch (pf_ouraddr(m)) { 728 case 0: 729 return (0); 730 case 1: 731 return (1); 732 default: 733 /* pf does not know it */ 734 break; 735 } 736 #endif 737 738 ip = mtod(m, struct ip *); 739 740 memset(&sin, 0, sizeof(sin)); 741 sin.sin_len = sizeof(sin); 742 sin.sin_family = AF_INET; 743 sin.sin_addr = ip->ip_dst; 744 rt = rtalloc_mpath(sintosa(&sin), &ip->ip_src.s_addr, 745 m->m_pkthdr.ph_rtableid); 746 if (rtisvalid(rt)) { 747 if (ISSET(rt->rt_flags, RTF_LOCAL)) 748 match = 1; 749 750 /* 751 * If directedbcast is enabled we only consider it local 752 * if it is received on the interface with that address. 753 */ 754 if (ISSET(rt->rt_flags, RTF_BROADCAST) && 755 (!ip_directedbcast || rt->rt_ifidx == ifp->if_index)) { 756 match = 1; 757 758 /* Make sure M_BCAST is set */ 759 m->m_flags |= M_BCAST; 760 } 761 } 762 *prt = rt; 763 764 if (!match) { 765 struct ifaddr *ifa; 766 767 /* 768 * No local address or broadcast address found, so check for 769 * ancient classful broadcast addresses. 770 * It must have been broadcast on the link layer, and for an 771 * address on the interface it was received on. 772 */ 773 if (!ISSET(m->m_flags, M_BCAST) || 774 !IN_CLASSFULBROADCAST(ip->ip_dst.s_addr, ip->ip_dst.s_addr)) 775 return (0); 776 777 if (ifp->if_rdomain != rtable_l2(m->m_pkthdr.ph_rtableid)) 778 return (0); 779 /* 780 * The check in the loop assumes you only rx a packet on an UP 781 * interface, and that M_BCAST will only be set on a BROADCAST 782 * interface. 
783 */ 784 NET_ASSERT_LOCKED(); 785 TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { 786 if (ifa->ifa_addr->sa_family != AF_INET) 787 continue; 788 789 if (IN_CLASSFULBROADCAST(ip->ip_dst.s_addr, 790 ifatoia(ifa)->ia_addr.sin_addr.s_addr)) { 791 match = 1; 792 break; 793 } 794 } 795 } else if (ipforwarding == 0 && rt->rt_ifidx != ifp->if_index && 796 !((ifp->if_flags & IFF_LOOPBACK) || (ifp->if_type == IFT_ENC) || 797 (m->m_pkthdr.pf.flags & PF_TAG_TRANSLATE_LOCALHOST))) { 798 /* received on wrong interface. */ 799 #if NCARP > 0 800 struct ifnet *out_if; 801 802 /* 803 * Virtual IPs on carp interfaces need to be checked also 804 * against the parent interface and other carp interfaces 805 * sharing the same parent. 806 */ 807 out_if = if_get(rt->rt_ifidx); 808 if (!(out_if && carp_strict_addr_chk(out_if, ifp))) { 809 ipstat_inc(ips_wrongif); 810 match = 2; 811 } 812 if_put(out_if); 813 #else 814 ipstat_inc(ips_wrongif); 815 match = 2; 816 #endif 817 } 818 819 return (match); 820 } 821 822 /* 823 * Take incoming datagram fragment and try to 824 * reassemble it into whole datagram. If a chain for 825 * reassembly of this datagram already exists, then it 826 * is given as fp; otherwise have to make a chain. 827 */ 828 struct mbuf * 829 ip_reass(struct ipqent *ipqe, struct ipq *fp) 830 { 831 struct mbuf *m = ipqe->ipqe_m; 832 struct ipqent *nq, *p, *q; 833 struct ip *ip; 834 struct mbuf *t; 835 int hlen = ipqe->ipqe_ip->ip_hl << 2; 836 int i, next; 837 u_int8_t ecn, ecn0; 838 839 MUTEX_ASSERT_LOCKED(&ipq_mutex); 840 841 /* 842 * Presence of header sizes in mbufs 843 * would confuse code below. 844 */ 845 m->m_data += hlen; 846 m->m_len -= hlen; 847 848 /* 849 * If first fragment to arrive, create a reassembly queue. 
850 */ 851 if (fp == NULL) { 852 fp = pool_get(&ipq_pool, PR_NOWAIT); 853 if (fp == NULL) 854 goto dropfrag; 855 LIST_INSERT_HEAD(&ipq, fp, ipq_q); 856 fp->ipq_ttl = IPFRAGTTL; 857 fp->ipq_p = ipqe->ipqe_ip->ip_p; 858 fp->ipq_id = ipqe->ipqe_ip->ip_id; 859 LIST_INIT(&fp->ipq_fragq); 860 fp->ipq_src = ipqe->ipqe_ip->ip_src; 861 fp->ipq_dst = ipqe->ipqe_ip->ip_dst; 862 p = NULL; 863 goto insert; 864 } 865 866 /* 867 * Handle ECN by comparing this segment with the first one; 868 * if CE is set, do not lose CE. 869 * drop if CE and not-ECT are mixed for the same packet. 870 */ 871 ecn = ipqe->ipqe_ip->ip_tos & IPTOS_ECN_MASK; 872 ecn0 = LIST_FIRST(&fp->ipq_fragq)->ipqe_ip->ip_tos & IPTOS_ECN_MASK; 873 if (ecn == IPTOS_ECN_CE) { 874 if (ecn0 == IPTOS_ECN_NOTECT) 875 goto dropfrag; 876 if (ecn0 != IPTOS_ECN_CE) 877 LIST_FIRST(&fp->ipq_fragq)->ipqe_ip->ip_tos |= 878 IPTOS_ECN_CE; 879 } 880 if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT) 881 goto dropfrag; 882 883 /* 884 * Find a segment which begins after this one does. 885 */ 886 for (p = NULL, q = LIST_FIRST(&fp->ipq_fragq); q != NULL; 887 p = q, q = LIST_NEXT(q, ipqe_q)) 888 if (ntohs(q->ipqe_ip->ip_off) > ntohs(ipqe->ipqe_ip->ip_off)) 889 break; 890 891 /* 892 * If there is a preceding segment, it may provide some of 893 * our data already. If so, drop the data from the incoming 894 * segment. If it provides all of our data, drop us. 895 */ 896 if (p != NULL) { 897 i = ntohs(p->ipqe_ip->ip_off) + ntohs(p->ipqe_ip->ip_len) - 898 ntohs(ipqe->ipqe_ip->ip_off); 899 if (i > 0) { 900 if (i >= ntohs(ipqe->ipqe_ip->ip_len)) 901 goto dropfrag; 902 m_adj(ipqe->ipqe_m, i); 903 ipqe->ipqe_ip->ip_off = 904 htons(ntohs(ipqe->ipqe_ip->ip_off) + i); 905 ipqe->ipqe_ip->ip_len = 906 htons(ntohs(ipqe->ipqe_ip->ip_len) - i); 907 } 908 } 909 910 /* 911 * While we overlap succeeding segments trim them or, 912 * if they are completely covered, dequeue them. 
913 */ 914 for (; q != NULL && 915 ntohs(ipqe->ipqe_ip->ip_off) + ntohs(ipqe->ipqe_ip->ip_len) > 916 ntohs(q->ipqe_ip->ip_off); q = nq) { 917 i = (ntohs(ipqe->ipqe_ip->ip_off) + 918 ntohs(ipqe->ipqe_ip->ip_len)) - ntohs(q->ipqe_ip->ip_off); 919 if (i < ntohs(q->ipqe_ip->ip_len)) { 920 q->ipqe_ip->ip_len = 921 htons(ntohs(q->ipqe_ip->ip_len) - i); 922 q->ipqe_ip->ip_off = 923 htons(ntohs(q->ipqe_ip->ip_off) + i); 924 m_adj(q->ipqe_m, i); 925 break; 926 } 927 nq = LIST_NEXT(q, ipqe_q); 928 m_freem(q->ipqe_m); 929 LIST_REMOVE(q, ipqe_q); 930 pool_put(&ipqent_pool, q); 931 ip_frags--; 932 } 933 934 insert: 935 /* 936 * Stick new segment in its place; 937 * check for complete reassembly. 938 */ 939 if (p == NULL) { 940 LIST_INSERT_HEAD(&fp->ipq_fragq, ipqe, ipqe_q); 941 } else { 942 LIST_INSERT_AFTER(p, ipqe, ipqe_q); 943 } 944 next = 0; 945 for (p = NULL, q = LIST_FIRST(&fp->ipq_fragq); q != NULL; 946 p = q, q = LIST_NEXT(q, ipqe_q)) { 947 if (ntohs(q->ipqe_ip->ip_off) != next) 948 return (0); 949 next += ntohs(q->ipqe_ip->ip_len); 950 } 951 if (p->ipqe_mff) 952 return (0); 953 954 /* 955 * Reassembly is complete. Check for a bogus message size and 956 * concatenate fragments. 957 */ 958 q = LIST_FIRST(&fp->ipq_fragq); 959 ip = q->ipqe_ip; 960 if ((next + (ip->ip_hl << 2)) > IP_MAXPACKET) { 961 ipstat_inc(ips_toolong); 962 ip_freef(fp); 963 return (0); 964 } 965 m = q->ipqe_m; 966 t = m->m_next; 967 m->m_next = 0; 968 m_cat(m, t); 969 nq = LIST_NEXT(q, ipqe_q); 970 pool_put(&ipqent_pool, q); 971 ip_frags--; 972 for (q = nq; q != NULL; q = nq) { 973 t = q->ipqe_m; 974 nq = LIST_NEXT(q, ipqe_q); 975 pool_put(&ipqent_pool, q); 976 ip_frags--; 977 m_removehdr(t); 978 m_cat(m, t); 979 } 980 981 /* 982 * Create header for new ip packet by 983 * modifying header of first packet; 984 * dequeue and discard fragment reassembly header. 985 * Make header visible. 
986 */ 987 ip->ip_len = htons(next); 988 ip->ip_src = fp->ipq_src; 989 ip->ip_dst = fp->ipq_dst; 990 LIST_REMOVE(fp, ipq_q); 991 pool_put(&ipq_pool, fp); 992 m->m_len += (ip->ip_hl << 2); 993 m->m_data -= (ip->ip_hl << 2); 994 m_calchdrlen(m); 995 return (m); 996 997 dropfrag: 998 ipstat_inc(ips_fragdropped); 999 m_freem(m); 1000 pool_put(&ipqent_pool, ipqe); 1001 ip_frags--; 1002 return (NULL); 1003 } 1004 1005 /* 1006 * Free a fragment reassembly header and all 1007 * associated datagrams. 1008 */ 1009 void 1010 ip_freef(struct ipq *fp) 1011 { 1012 struct ipqent *q; 1013 1014 MUTEX_ASSERT_LOCKED(&ipq_mutex); 1015 1016 while ((q = LIST_FIRST(&fp->ipq_fragq)) != NULL) { 1017 LIST_REMOVE(q, ipqe_q); 1018 m_freem(q->ipqe_m); 1019 pool_put(&ipqent_pool, q); 1020 ip_frags--; 1021 } 1022 LIST_REMOVE(fp, ipq_q); 1023 pool_put(&ipq_pool, fp); 1024 } 1025 1026 /* 1027 * IP timer processing; 1028 * if a timer expires on a reassembly queue, discard it. 1029 */ 1030 void 1031 ip_slowtimo(void) 1032 { 1033 struct ipq *fp, *nfp; 1034 1035 mtx_enter(&ipq_mutex); 1036 LIST_FOREACH_SAFE(fp, &ipq, ipq_q, nfp) { 1037 if (--fp->ipq_ttl == 0) { 1038 ipstat_inc(ips_fragtimeout); 1039 ip_freef(fp); 1040 } 1041 } 1042 mtx_leave(&ipq_mutex); 1043 } 1044 1045 /* 1046 * Flush a bunch of datagram fragments, till we are down to 75%. 1047 */ 1048 void 1049 ip_flush(void) 1050 { 1051 int max = 50; 1052 1053 MUTEX_ASSERT_LOCKED(&ipq_mutex); 1054 1055 while (!LIST_EMPTY(&ipq) && ip_frags > ip_maxqueue * 3 / 4 && --max) { 1056 ipstat_inc(ips_fragdropped); 1057 ip_freef(LIST_FIRST(&ipq)); 1058 } 1059 } 1060 1061 /* 1062 * Do option processing on a datagram, 1063 * possibly discarding it if bad options are encountered, 1064 * or forwarding it if source-routed. 1065 * Returns 1 if packet has been forwarded/freed, 1066 * 0 if the packet should be processed further. 
1067 */ 1068 int 1069 ip_dooptions(struct mbuf *m, struct ifnet *ifp) 1070 { 1071 struct ip *ip = mtod(m, struct ip *); 1072 unsigned int rtableid = m->m_pkthdr.ph_rtableid; 1073 struct rtentry *rt; 1074 struct sockaddr_in ipaddr; 1075 u_char *cp; 1076 struct ip_timestamp ipt; 1077 struct in_ifaddr *ia; 1078 int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0; 1079 struct in_addr sin, dst; 1080 u_int32_t ntime; 1081 1082 dst = ip->ip_dst; 1083 cp = (u_char *)(ip + 1); 1084 cnt = (ip->ip_hl << 2) - sizeof (struct ip); 1085 1086 KERNEL_LOCK(); 1087 for (; cnt > 0; cnt -= optlen, cp += optlen) { 1088 opt = cp[IPOPT_OPTVAL]; 1089 if (opt == IPOPT_EOL) 1090 break; 1091 if (opt == IPOPT_NOP) 1092 optlen = 1; 1093 else { 1094 if (cnt < IPOPT_OLEN + sizeof(*cp)) { 1095 code = &cp[IPOPT_OLEN] - (u_char *)ip; 1096 goto bad; 1097 } 1098 optlen = cp[IPOPT_OLEN]; 1099 if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) { 1100 code = &cp[IPOPT_OLEN] - (u_char *)ip; 1101 goto bad; 1102 } 1103 } 1104 1105 switch (opt) { 1106 1107 default: 1108 break; 1109 1110 /* 1111 * Source routing with record. 1112 * Find interface with current destination address. 1113 * If none on this machine then drop if strictly routed, 1114 * or do nothing if loosely routed. 1115 * Record interface address and bring up next address 1116 * component. If strictly routed make sure next 1117 * address is on directly accessible net. 
1118 */ 1119 case IPOPT_LSRR: 1120 case IPOPT_SSRR: 1121 if (!ip_dosourceroute) { 1122 type = ICMP_UNREACH; 1123 code = ICMP_UNREACH_SRCFAIL; 1124 goto bad; 1125 } 1126 if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { 1127 code = &cp[IPOPT_OFFSET] - (u_char *)ip; 1128 goto bad; 1129 } 1130 memset(&ipaddr, 0, sizeof(ipaddr)); 1131 ipaddr.sin_family = AF_INET; 1132 ipaddr.sin_len = sizeof(ipaddr); 1133 ipaddr.sin_addr = ip->ip_dst; 1134 ia = ifatoia(ifa_ifwithaddr(sintosa(&ipaddr), 1135 m->m_pkthdr.ph_rtableid)); 1136 if (ia == NULL) { 1137 if (opt == IPOPT_SSRR) { 1138 type = ICMP_UNREACH; 1139 code = ICMP_UNREACH_SRCFAIL; 1140 goto bad; 1141 } 1142 /* 1143 * Loose routing, and not at next destination 1144 * yet; nothing to do except forward. 1145 */ 1146 break; 1147 } 1148 off--; /* 0 origin */ 1149 if ((off + sizeof(struct in_addr)) > optlen) { 1150 /* 1151 * End of source route. Should be for us. 1152 */ 1153 save_rte(m, cp, ip->ip_src); 1154 break; 1155 } 1156 1157 /* 1158 * locate outgoing interface 1159 */ 1160 memset(&ipaddr, 0, sizeof(ipaddr)); 1161 ipaddr.sin_family = AF_INET; 1162 ipaddr.sin_len = sizeof(ipaddr); 1163 memcpy(&ipaddr.sin_addr, cp + off, 1164 sizeof(ipaddr.sin_addr)); 1165 /* keep packet in the virtual instance */ 1166 rt = rtalloc(sintosa(&ipaddr), RT_RESOLVE, rtableid); 1167 if (!rtisvalid(rt) || ((opt == IPOPT_SSRR) && 1168 ISSET(rt->rt_flags, RTF_GATEWAY))) { 1169 type = ICMP_UNREACH; 1170 code = ICMP_UNREACH_SRCFAIL; 1171 rtfree(rt); 1172 goto bad; 1173 } 1174 ia = ifatoia(rt->rt_ifa); 1175 memcpy(cp + off, &ia->ia_addr.sin_addr, 1176 sizeof(struct in_addr)); 1177 rtfree(rt); 1178 cp[IPOPT_OFFSET] += sizeof(struct in_addr); 1179 ip->ip_dst = ipaddr.sin_addr; 1180 /* 1181 * Let ip_intr's mcast routing check handle mcast pkts 1182 */ 1183 forward = !IN_MULTICAST(ip->ip_dst.s_addr); 1184 break; 1185 1186 case IPOPT_RR: 1187 if (optlen < IPOPT_OFFSET + sizeof(*cp)) { 1188 code = &cp[IPOPT_OLEN] - (u_char *)ip; 1189 goto bad; 1190 } 1191 if 
((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { 1192 code = &cp[IPOPT_OFFSET] - (u_char *)ip; 1193 goto bad; 1194 } 1195 1196 /* 1197 * If no space remains, ignore. 1198 */ 1199 off--; /* 0 origin */ 1200 if ((off + sizeof(struct in_addr)) > optlen) 1201 break; 1202 memset(&ipaddr, 0, sizeof(ipaddr)); 1203 ipaddr.sin_family = AF_INET; 1204 ipaddr.sin_len = sizeof(ipaddr); 1205 ipaddr.sin_addr = ip->ip_dst; 1206 /* 1207 * locate outgoing interface; if we're the destination, 1208 * use the incoming interface (should be same). 1209 * Again keep the packet inside the virtual instance. 1210 */ 1211 rt = rtalloc(sintosa(&ipaddr), RT_RESOLVE, rtableid); 1212 if (!rtisvalid(rt)) { 1213 type = ICMP_UNREACH; 1214 code = ICMP_UNREACH_HOST; 1215 rtfree(rt); 1216 goto bad; 1217 } 1218 ia = ifatoia(rt->rt_ifa); 1219 memcpy(cp + off, &ia->ia_addr.sin_addr, 1220 sizeof(struct in_addr)); 1221 rtfree(rt); 1222 cp[IPOPT_OFFSET] += sizeof(struct in_addr); 1223 break; 1224 1225 case IPOPT_TS: 1226 code = cp - (u_char *)ip; 1227 if (optlen < sizeof(struct ip_timestamp)) 1228 goto bad; 1229 memcpy(&ipt, cp, sizeof(struct ip_timestamp)); 1230 if (ipt.ipt_ptr < 5 || ipt.ipt_len < 5) 1231 goto bad; 1232 if (ipt.ipt_ptr - 1 + sizeof(u_int32_t) > ipt.ipt_len) { 1233 if (++ipt.ipt_oflw == 0) 1234 goto bad; 1235 break; 1236 } 1237 memcpy(&sin, cp + ipt.ipt_ptr - 1, sizeof sin); 1238 switch (ipt.ipt_flg) { 1239 1240 case IPOPT_TS_TSONLY: 1241 break; 1242 1243 case IPOPT_TS_TSANDADDR: 1244 if (ipt.ipt_ptr - 1 + sizeof(u_int32_t) + 1245 sizeof(struct in_addr) > ipt.ipt_len) 1246 goto bad; 1247 memset(&ipaddr, 0, sizeof(ipaddr)); 1248 ipaddr.sin_family = AF_INET; 1249 ipaddr.sin_len = sizeof(ipaddr); 1250 ipaddr.sin_addr = dst; 1251 ia = ifatoia(ifaof_ifpforaddr(sintosa(&ipaddr), 1252 ifp)); 1253 if (ia == NULL) 1254 continue; 1255 memcpy(&sin, &ia->ia_addr.sin_addr, 1256 sizeof(struct in_addr)); 1257 ipt.ipt_ptr += sizeof(struct in_addr); 1258 break; 1259 1260 case IPOPT_TS_PRESPEC: 1261 if 
(ipt.ipt_ptr - 1 + sizeof(u_int32_t) + 1262 sizeof(struct in_addr) > ipt.ipt_len) 1263 goto bad; 1264 memset(&ipaddr, 0, sizeof(ipaddr)); 1265 ipaddr.sin_family = AF_INET; 1266 ipaddr.sin_len = sizeof(ipaddr); 1267 ipaddr.sin_addr = sin; 1268 if (ifa_ifwithaddr(sintosa(&ipaddr), 1269 m->m_pkthdr.ph_rtableid) == NULL) 1270 continue; 1271 ipt.ipt_ptr += sizeof(struct in_addr); 1272 break; 1273 1274 default: 1275 /* XXX can't take &ipt->ipt_flg */ 1276 code = (u_char *)&ipt.ipt_ptr - 1277 (u_char *)ip + 1; 1278 goto bad; 1279 } 1280 ntime = iptime(); 1281 memcpy(cp + ipt.ipt_ptr - 1, &ntime, sizeof(u_int32_t)); 1282 ipt.ipt_ptr += sizeof(u_int32_t); 1283 } 1284 } 1285 KERNEL_UNLOCK(); 1286 if (forward && ipforwarding > 0) { 1287 ip_forward(m, ifp, NULL, 1); 1288 return (1); 1289 } 1290 return (0); 1291 bad: 1292 KERNEL_UNLOCK(); 1293 icmp_error(m, type, code, 0, 0); 1294 ipstat_inc(ips_badoptions); 1295 return (1); 1296 } 1297 1298 /* 1299 * Save incoming source route for use in replies, 1300 * to be picked up later by ip_srcroute if the receiver is interested. 1301 */ 1302 void 1303 save_rte(struct mbuf *m, u_char *option, struct in_addr dst) 1304 { 1305 struct ip_srcrt *isr; 1306 struct m_tag *mtag; 1307 unsigned olen; 1308 1309 olen = option[IPOPT_OLEN]; 1310 if (olen > sizeof(isr->isr_hdr) + sizeof(isr->isr_routes)) 1311 return; 1312 1313 mtag = m_tag_get(PACKET_TAG_SRCROUTE, sizeof(*isr), M_NOWAIT); 1314 if (mtag == NULL) 1315 return; 1316 isr = (struct ip_srcrt *)(mtag + 1); 1317 1318 memcpy(isr->isr_hdr, option, olen); 1319 isr->isr_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr); 1320 isr->isr_dst = dst; 1321 m_tag_prepend(m, mtag); 1322 } 1323 1324 /* 1325 * Retrieve incoming source route for use in replies, 1326 * in the same form used by setsockopt. 1327 * The first hop is placed before the options, will be removed later. 
1328 */ 1329 struct mbuf * 1330 ip_srcroute(struct mbuf *m0) 1331 { 1332 struct in_addr *p, *q; 1333 struct mbuf *m; 1334 struct ip_srcrt *isr; 1335 struct m_tag *mtag; 1336 1337 if (!ip_dosourceroute) 1338 return (NULL); 1339 1340 mtag = m_tag_find(m0, PACKET_TAG_SRCROUTE, NULL); 1341 if (mtag == NULL) 1342 return (NULL); 1343 isr = (struct ip_srcrt *)(mtag + 1); 1344 1345 if (isr->isr_nhops == 0) 1346 return (NULL); 1347 m = m_get(M_DONTWAIT, MT_SOOPTS); 1348 if (m == NULL) 1349 return (NULL); 1350 1351 #define OPTSIZ (sizeof(isr->isr_nop) + sizeof(isr->isr_hdr)) 1352 1353 /* length is (nhops+1)*sizeof(addr) + sizeof(nop + header) */ 1354 m->m_len = (isr->isr_nhops + 1) * sizeof(struct in_addr) + OPTSIZ; 1355 1356 /* 1357 * First save first hop for return route 1358 */ 1359 p = &(isr->isr_routes[isr->isr_nhops - 1]); 1360 *(mtod(m, struct in_addr *)) = *p--; 1361 1362 /* 1363 * Copy option fields and padding (nop) to mbuf. 1364 */ 1365 isr->isr_nop = IPOPT_NOP; 1366 isr->isr_hdr[IPOPT_OFFSET] = IPOPT_MINOFF; 1367 memcpy(mtod(m, caddr_t) + sizeof(struct in_addr), &isr->isr_nop, 1368 OPTSIZ); 1369 q = (struct in_addr *)(mtod(m, caddr_t) + 1370 sizeof(struct in_addr) + OPTSIZ); 1371 #undef OPTSIZ 1372 /* 1373 * Record return path as an IP source route, 1374 * reversing the path (pointers are now aligned). 1375 */ 1376 while (p >= isr->isr_routes) { 1377 *q++ = *p--; 1378 } 1379 /* 1380 * Last hop goes to final destination. 1381 */ 1382 *q = isr->isr_dst; 1383 m_tag_delete(m0, (struct m_tag *)isr); 1384 return (m); 1385 } 1386 1387 /* 1388 * Strip out IP options, at higher level protocol in the kernel. 
*/
void
ip_stripoptions(struct mbuf *m)
{
	struct ip *iph = mtod(m, struct ip *);
	caddr_t optstart = (caddr_t)(iph + 1);
	int optbytes, tail;

	/* Bytes of options: header length in bytes minus the fixed header. */
	optbytes = (iph->ip_hl << 2) - sizeof(struct ip);

	/* Slide what follows the options in this mbuf down over them. */
	tail = m->m_len - (sizeof(struct ip) + optbytes);
	memmove(optstart, optstart + optbytes, tail);

	/* Account for the removed bytes in the mbuf and the IP header. */
	m->m_len -= optbytes;
	if (m->m_flags & M_PKTHDR)
		m->m_pkthdr.len -= optbytes;
	iph->ip_hl = sizeof(struct ip) >> 2;
	iph->ip_len = htons(ntohs(iph->ip_len) - optbytes);
}

/*
 * Table of errno values with PRC_NCMDS entries; a zero entry carries
 * no error.  (Presumably indexed by PRC_* command code -- confirm
 * against the users of this table.)
 */
const u_char inetctlerrmap[PRC_NCMDS] = {
	0,		0,		0,		0,
	0,		EMSGSIZE,	EHOSTDOWN,	EHOSTUNREACH,
	EHOSTUNREACH,	EHOSTUNREACH,	ECONNREFUSED,	ECONNREFUSED,
	EMSGSIZE,	EHOSTUNREACH,	0,		0,
	0,		0,		0,		0,
	ENOPROTOOPT
};

/*
 * Forward a packet.  If some error occurs return the sender
 * an icmp packet.  Note we can't always generate a meaningful
 * icmp message because icmp doesn't have a large enough repertoire
 * of codes and types.
 *
 * If not forwarding, just drop the packet.  This could be confusing
 * if ipforwarding was zero but some routing protocol was advertising
 * us as a gateway to somewhere.  However, we must let the routing
 * protocol deal with that.
 *
 * The srcrt parameter indicates whether the packet is being forwarded
 * via a source route.
1431 */ 1432 void 1433 ip_forward(struct mbuf *m, struct ifnet *ifp, struct rtentry *rt, int srcrt) 1434 { 1435 struct mbuf mfake, *mcopy = NULL; 1436 struct ip *ip = mtod(m, struct ip *); 1437 struct sockaddr_in *sin; 1438 struct route ro; 1439 int error, type = 0, code = 0, destmtu = 0, fake = 0, len; 1440 u_int32_t dest; 1441 1442 dest = 0; 1443 if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) { 1444 ipstat_inc(ips_cantforward); 1445 m_freem(m); 1446 goto freecopy; 1447 } 1448 if (ip->ip_ttl <= IPTTLDEC) { 1449 icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, dest, 0); 1450 goto freecopy; 1451 } 1452 1453 memset(&ro, 0, sizeof(ro)); 1454 sin = satosin(&ro.ro_dst); 1455 sin->sin_family = AF_INET; 1456 sin->sin_len = sizeof(*sin); 1457 sin->sin_addr = ip->ip_dst; 1458 1459 if (!rtisvalid(rt)) { 1460 rtfree(rt); 1461 rt = rtalloc_mpath(sintosa(sin), &ip->ip_src.s_addr, 1462 m->m_pkthdr.ph_rtableid); 1463 if (rt == NULL) { 1464 ipstat_inc(ips_noroute); 1465 icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest, 0); 1466 return; 1467 } 1468 } 1469 1470 /* 1471 * Save at most 68 bytes of the packet in case 1472 * we need to generate an ICMP message to the src. 1473 * The data is saved in the mbuf on the stack that 1474 * acts as a temporary storage not intended to be 1475 * passed down the IP stack or to the mfree. 1476 */ 1477 memset(&mfake.m_hdr, 0, sizeof(mfake.m_hdr)); 1478 mfake.m_type = m->m_type; 1479 if (m_dup_pkthdr(&mfake, m, M_DONTWAIT) == 0) { 1480 mfake.m_data = mfake.m_pktdat; 1481 len = min(ntohs(ip->ip_len), 68); 1482 m_copydata(m, 0, len, mfake.m_pktdat); 1483 mfake.m_pkthdr.len = mfake.m_len = len; 1484 #if NPF > 0 1485 pf_pkt_addr_changed(&mfake); 1486 #endif /* NPF > 0 */ 1487 fake = 1; 1488 } 1489 1490 ip->ip_ttl -= IPTTLDEC; 1491 1492 /* 1493 * If forwarding packet using same interface that it came in on, 1494 * perhaps should send a redirect to sender to shortcut a hop. 
1495 * Only send redirect if source is sending directly to us, 1496 * and if packet was not source routed (or has any options). 1497 * Also, don't send redirect if forwarding using a default route 1498 * or a route modified by a redirect. 1499 * Don't send redirect if we advertise destination's arp address 1500 * as ours (proxy arp). 1501 */ 1502 if ((rt->rt_ifidx == ifp->if_index) && 1503 (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 && 1504 satosin(rt_key(rt))->sin_addr.s_addr != 0 && 1505 ipsendredirects && !srcrt && 1506 !arpproxy(satosin(rt_key(rt))->sin_addr, m->m_pkthdr.ph_rtableid)) { 1507 if ((ip->ip_src.s_addr & ifatoia(rt->rt_ifa)->ia_netmask) == 1508 ifatoia(rt->rt_ifa)->ia_net) { 1509 if (rt->rt_flags & RTF_GATEWAY) 1510 dest = satosin(rt->rt_gateway)->sin_addr.s_addr; 1511 else 1512 dest = ip->ip_dst.s_addr; 1513 /* Router requirements says to only send host redirects */ 1514 type = ICMP_REDIRECT; 1515 code = ICMP_REDIRECT_HOST; 1516 } 1517 } 1518 1519 ro.ro_rt = rt; 1520 ro.ro_tableid = m->m_pkthdr.ph_rtableid; 1521 error = ip_output(m, NULL, &ro, 1522 (IP_FORWARDING | (ip_directedbcast ? 
IP_ALLOWBROADCAST : 0)), 1523 NULL, NULL, 0); 1524 rt = ro.ro_rt; 1525 if (error) 1526 ipstat_inc(ips_cantforward); 1527 else { 1528 ipstat_inc(ips_forward); 1529 if (type) 1530 ipstat_inc(ips_redirectsent); 1531 else 1532 goto freecopy; 1533 } 1534 if (!fake) 1535 goto freecopy; 1536 1537 switch (error) { 1538 1539 case 0: /* forwarded, but need redirect */ 1540 /* type, code set above */ 1541 break; 1542 1543 case ENETUNREACH: /* shouldn't happen, checked above */ 1544 case EHOSTUNREACH: 1545 case ENETDOWN: 1546 case EHOSTDOWN: 1547 default: 1548 type = ICMP_UNREACH; 1549 code = ICMP_UNREACH_HOST; 1550 break; 1551 1552 case EMSGSIZE: 1553 type = ICMP_UNREACH; 1554 code = ICMP_UNREACH_NEEDFRAG; 1555 1556 #ifdef IPSEC 1557 if (rt != NULL) { 1558 if (rt->rt_mtu) 1559 destmtu = rt->rt_mtu; 1560 else { 1561 struct ifnet *destifp; 1562 1563 destifp = if_get(rt->rt_ifidx); 1564 if (destifp != NULL) 1565 destmtu = destifp->if_mtu; 1566 if_put(destifp); 1567 } 1568 } 1569 #endif /*IPSEC*/ 1570 ipstat_inc(ips_cantfrag); 1571 break; 1572 1573 case EACCES: 1574 /* 1575 * pf(4) blocked the packet. There is no need to send an ICMP 1576 * packet back since pf(4) takes care of it. 1577 */ 1578 goto freecopy; 1579 case ENOBUFS: 1580 /* 1581 * a router should not generate ICMP_SOURCEQUENCH as 1582 * required in RFC1812 Requirements for IP Version 4 Routers. 1583 * source quench could be a big problem under DoS attacks, 1584 * or the underlying interface is rate-limited. 
1585 */ 1586 goto freecopy; 1587 } 1588 1589 mcopy = m_copym(&mfake, 0, len, M_DONTWAIT); 1590 if (mcopy) 1591 icmp_error(mcopy, type, code, dest, destmtu); 1592 1593 freecopy: 1594 if (fake) 1595 m_tag_delete_chain(&mfake); 1596 rtfree(rt); 1597 } 1598 1599 int 1600 ip_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, 1601 size_t newlen) 1602 { 1603 int error; 1604 #ifdef MROUTING 1605 extern struct mrtstat mrtstat; 1606 #endif 1607 1608 /* Almost all sysctl names at this level are terminal. */ 1609 if (namelen != 1 && name[0] != IPCTL_IFQUEUE && 1610 name[0] != IPCTL_ARPQUEUE) 1611 return (ENOTDIR); 1612 1613 switch (name[0]) { 1614 case IPCTL_SOURCEROUTE: 1615 /* 1616 * Don't allow this to change in a secure environment. 1617 */ 1618 if (newp && securelevel > 0) 1619 return (EPERM); 1620 NET_LOCK(); 1621 error = sysctl_int(oldp, oldlenp, newp, newlen, 1622 &ip_dosourceroute); 1623 NET_UNLOCK(); 1624 return (error); 1625 case IPCTL_MTUDISC: 1626 NET_LOCK(); 1627 error = sysctl_int(oldp, oldlenp, newp, newlen, 1628 &ip_mtudisc); 1629 if (ip_mtudisc != 0 && ip_mtudisc_timeout_q == NULL) { 1630 ip_mtudisc_timeout_q = 1631 rt_timer_queue_create(ip_mtudisc_timeout); 1632 } else if (ip_mtudisc == 0 && ip_mtudisc_timeout_q != NULL) { 1633 rt_timer_queue_destroy(ip_mtudisc_timeout_q); 1634 ip_mtudisc_timeout_q = NULL; 1635 } 1636 NET_UNLOCK(); 1637 return error; 1638 case IPCTL_MTUDISCTIMEOUT: 1639 NET_LOCK(); 1640 error = sysctl_int(oldp, oldlenp, newp, newlen, 1641 &ip_mtudisc_timeout); 1642 if (ip_mtudisc_timeout_q != NULL) 1643 rt_timer_queue_change(ip_mtudisc_timeout_q, 1644 ip_mtudisc_timeout); 1645 NET_UNLOCK(); 1646 return (error); 1647 #ifdef IPSEC 1648 case IPCTL_ENCDEBUG: 1649 case IPCTL_IPSEC_STATS: 1650 case IPCTL_IPSEC_EXPIRE_ACQUIRE: 1651 case IPCTL_IPSEC_EMBRYONIC_SA_TIMEOUT: 1652 case IPCTL_IPSEC_REQUIRE_PFS: 1653 case IPCTL_IPSEC_SOFT_ALLOCATIONS: 1654 case IPCTL_IPSEC_ALLOCATIONS: 1655 case IPCTL_IPSEC_SOFT_BYTES: 1656 case 
IPCTL_IPSEC_BYTES: 1657 case IPCTL_IPSEC_TIMEOUT: 1658 case IPCTL_IPSEC_SOFT_TIMEOUT: 1659 case IPCTL_IPSEC_SOFT_FIRSTUSE: 1660 case IPCTL_IPSEC_FIRSTUSE: 1661 case IPCTL_IPSEC_ENC_ALGORITHM: 1662 case IPCTL_IPSEC_AUTH_ALGORITHM: 1663 case IPCTL_IPSEC_IPCOMP_ALGORITHM: 1664 return (ipsec_sysctl(name, namelen, oldp, oldlenp, newp, 1665 newlen)); 1666 #endif 1667 case IPCTL_IFQUEUE: 1668 return (EOPNOTSUPP); 1669 case IPCTL_ARPQUEUE: 1670 return (sysctl_niq(name + 1, namelen - 1, 1671 oldp, oldlenp, newp, newlen, &arpinq)); 1672 case IPCTL_ARPQUEUED: 1673 return (sysctl_rdint(oldp, oldlenp, newp, la_hold_total)); 1674 case IPCTL_STATS: 1675 return (ip_sysctl_ipstat(oldp, oldlenp, newp)); 1676 #ifdef MROUTING 1677 case IPCTL_MRTSTATS: 1678 return (sysctl_rdstruct(oldp, oldlenp, newp, 1679 &mrtstat, sizeof(mrtstat))); 1680 case IPCTL_MRTMFC: 1681 if (newp) 1682 return (EPERM); 1683 NET_LOCK(); 1684 error = mrt_sysctl_mfc(oldp, oldlenp); 1685 NET_UNLOCK(); 1686 return (error); 1687 case IPCTL_MRTVIF: 1688 if (newp) 1689 return (EPERM); 1690 NET_LOCK(); 1691 error = mrt_sysctl_vif(oldp, oldlenp); 1692 NET_UNLOCK(); 1693 return (error); 1694 #else 1695 case IPCTL_MRTPROTO: 1696 case IPCTL_MRTSTATS: 1697 case IPCTL_MRTMFC: 1698 case IPCTL_MRTVIF: 1699 return (EOPNOTSUPP); 1700 #endif 1701 default: 1702 NET_LOCK(); 1703 error = sysctl_bounded_arr(ipctl_vars, nitems(ipctl_vars), 1704 name, namelen, oldp, oldlenp, newp, newlen); 1705 NET_UNLOCK(); 1706 return (error); 1707 } 1708 /* NOTREACHED */ 1709 } 1710 1711 int 1712 ip_sysctl_ipstat(void *oldp, size_t *oldlenp, void *newp) 1713 { 1714 uint64_t counters[ips_ncounters]; 1715 struct ipstat ipstat; 1716 u_long *words = (u_long *)&ipstat; 1717 int i; 1718 1719 CTASSERT(sizeof(ipstat) == (nitems(counters) * sizeof(u_long))); 1720 memset(&ipstat, 0, sizeof ipstat); 1721 counters_read(ipcounters, counters, nitems(counters)); 1722 1723 for (i = 0; i < nitems(counters); i++) 1724 words[i] = (u_long)counters[i]; 1725 1726 return 
(sysctl_rdstruct(oldp, oldlenp, newp, &ipstat, sizeof(ipstat))); 1727 } 1728 1729 void 1730 ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip, 1731 struct mbuf *m) 1732 { 1733 if (inp->inp_socket->so_options & SO_TIMESTAMP) { 1734 struct timeval tv; 1735 1736 m_microtime(m, &tv); 1737 *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv), 1738 SCM_TIMESTAMP, SOL_SOCKET); 1739 if (*mp) 1740 mp = &(*mp)->m_next; 1741 } 1742 1743 if (inp->inp_flags & INP_RECVDSTADDR) { 1744 *mp = sbcreatecontrol((caddr_t) &ip->ip_dst, 1745 sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP); 1746 if (*mp) 1747 mp = &(*mp)->m_next; 1748 } 1749 #ifdef notyet 1750 /* this code is broken and will probably never be fixed. */ 1751 /* options were tossed already */ 1752 if (inp->inp_flags & INP_RECVOPTS) { 1753 *mp = sbcreatecontrol((caddr_t) opts_deleted_above, 1754 sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP); 1755 if (*mp) 1756 mp = &(*mp)->m_next; 1757 } 1758 /* ip_srcroute doesn't do what we want here, need to fix */ 1759 if (inp->inp_flags & INP_RECVRETOPTS) { 1760 *mp = sbcreatecontrol((caddr_t) ip_srcroute(m), 1761 sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP); 1762 if (*mp) 1763 mp = &(*mp)->m_next; 1764 } 1765 #endif 1766 if (inp->inp_flags & INP_RECVIF) { 1767 struct sockaddr_dl sdl; 1768 struct ifnet *ifp; 1769 1770 ifp = if_get(m->m_pkthdr.ph_ifidx); 1771 if (ifp == NULL || ifp->if_sadl == NULL) { 1772 memset(&sdl, 0, sizeof(sdl)); 1773 sdl.sdl_len = offsetof(struct sockaddr_dl, sdl_data[0]); 1774 sdl.sdl_family = AF_LINK; 1775 sdl.sdl_index = ifp != NULL ? 
ifp->if_index : 0; 1776 sdl.sdl_nlen = sdl.sdl_alen = sdl.sdl_slen = 0; 1777 *mp = sbcreatecontrol((caddr_t) &sdl, sdl.sdl_len, 1778 IP_RECVIF, IPPROTO_IP); 1779 } else { 1780 *mp = sbcreatecontrol((caddr_t) ifp->if_sadl, 1781 ifp->if_sadl->sdl_len, IP_RECVIF, IPPROTO_IP); 1782 } 1783 if (*mp) 1784 mp = &(*mp)->m_next; 1785 if_put(ifp); 1786 } 1787 if (inp->inp_flags & INP_RECVTTL) { 1788 *mp = sbcreatecontrol((caddr_t) &ip->ip_ttl, 1789 sizeof(u_int8_t), IP_RECVTTL, IPPROTO_IP); 1790 if (*mp) 1791 mp = &(*mp)->m_next; 1792 } 1793 if (inp->inp_flags & INP_RECVRTABLE) { 1794 u_int rtableid = inp->inp_rtableid; 1795 1796 #if NPF > 0 1797 if (m && m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) { 1798 struct pf_divert *divert; 1799 1800 divert = pf_find_divert(m); 1801 KASSERT(divert != NULL); 1802 rtableid = divert->rdomain; 1803 } 1804 #endif 1805 1806 *mp = sbcreatecontrol((caddr_t) &rtableid, 1807 sizeof(u_int), IP_RECVRTABLE, IPPROTO_IP); 1808 if (*mp) 1809 mp = &(*mp)->m_next; 1810 } 1811 } 1812 1813 void 1814 ip_send_do_dispatch(void *xmq, int flags) 1815 { 1816 struct mbuf_queue *mq = xmq; 1817 struct mbuf *m; 1818 struct mbuf_list ml; 1819 struct m_tag *mtag; 1820 u_int32_t ipsecflowinfo = 0; 1821 1822 mq_delist(mq, &ml); 1823 if (ml_empty(&ml)) 1824 return; 1825 1826 NET_LOCK(); 1827 while ((m = ml_dequeue(&ml)) != NULL) { 1828 if ((mtag = m_tag_find(m, PACKET_TAG_IPSEC_FLOWINFO, NULL)) 1829 != NULL) { 1830 ipsecflowinfo = *(u_int32_t *)(mtag + 1); 1831 m_tag_delete(m, mtag); 1832 } 1833 ip_output(m, NULL, NULL, flags, NULL, NULL, ipsecflowinfo); 1834 } 1835 NET_UNLOCK(); 1836 } 1837 1838 void 1839 ip_sendraw_dispatch(void *xmq) 1840 { 1841 ip_send_do_dispatch(xmq, IP_RAWOUTPUT); 1842 } 1843 1844 void 1845 ip_send_dispatch(void *xmq) 1846 { 1847 ip_send_do_dispatch(xmq, 0); 1848 } 1849 1850 void 1851 ip_send(struct mbuf *m) 1852 { 1853 mq_enqueue(&ipsend_mq, m); 1854 task_add(net_tq(0), &ipsend_task); 1855 } 1856 1857 void 1858 ip_send_raw(struct mbuf *m) 1859 { 
1860 mq_enqueue(&ipsendraw_mq, m); 1861 task_add(net_tq(0), &ipsendraw_task); 1862 } 1863