1 /* $OpenBSD: ip_input.c,v 1.360 2021/05/15 08:07:20 yasuoka Exp $ */ 2 /* $NetBSD: ip_input.c,v 1.30 1996/03/16 23:53:58 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 33 */ 34 35 #include "pf.h" 36 #include "carp.h" 37 38 #include <sys/param.h> 39 #include <sys/systm.h> 40 #include <sys/mbuf.h> 41 #include <sys/domain.h> 42 #include <sys/mutex.h> 43 #include <sys/protosw.h> 44 #include <sys/socket.h> 45 #include <sys/socketvar.h> 46 #include <sys/sysctl.h> 47 #include <sys/pool.h> 48 #include <sys/task.h> 49 50 #include <net/if.h> 51 #include <net/if_var.h> 52 #include <net/if_dl.h> 53 #include <net/route.h> 54 #include <net/netisr.h> 55 56 #include <netinet/in.h> 57 #include <netinet/in_systm.h> 58 #include <netinet/if_ether.h> 59 #include <netinet/ip.h> 60 #include <netinet/in_pcb.h> 61 #include <netinet/in_var.h> 62 #include <netinet/ip_var.h> 63 #include <netinet/ip_icmp.h> 64 #include <net/if_types.h> 65 66 #ifdef INET6 67 #include <netinet6/ip6protosw.h> 68 #include <netinet6/ip6_var.h> 69 #endif 70 71 #if NPF > 0 72 #include <net/pfvar.h> 73 #endif 74 75 #ifdef MROUTING 76 #include <netinet/ip_mroute.h> 77 #endif 78 79 #ifdef IPSEC 80 #include <netinet/ip_ipsp.h> 81 #endif /* IPSEC */ 82 83 #if NCARP > 0 84 #include <netinet/ip_carp.h> 85 #endif 86 87 /* values controllable via sysctl */ 88 int ipforwarding = 0; 89 int ipmforwarding = 0; 90 int ipmultipath = 0; 91 int ipsendredirects = 1; 92 int ip_dosourceroute = 0; 93 int ip_defttl = IPDEFTTL; 94 int ip_mtudisc = 1; 95 u_int ip_mtudisc_timeout = IPMTUDISCTIMEOUT; 96 int ip_directedbcast = 0; 97 98 struct rttimer_queue *ip_mtudisc_timeout_q = NULL; 99 100 /* Protects `ipq' and `ip_frags'. */ 101 struct mutex ipq_mutex = MUTEX_INITIALIZER(IPL_SOFTNET); 102 103 /* IP reassembly queue */ 104 LIST_HEAD(, ipq) ipq; 105 106 /* Keep track of memory used for reassembly */ 107 int ip_maxqueue = 300; 108 int ip_frags = 0; 109 110 #ifdef MROUTING 111 extern int ip_mrtproto; 112 #endif 113 114 const struct sysctl_bounded_args ipctl_vars[] = { 115 #ifdef MROUTING 116 { IPCTL_MRTPROTO, &ip_mrtproto, SYSCTL_INT_READONLY }, 117 #endif 118 { IPCTL_FORWARDING, &ipforwarding, 0, 2 }, 119 { IPCTL_SENDREDIRECTS, &ipsendredirects, 0, 1 }, 120 { IPCTL_DEFTTL, &ip_defttl, 0, 255 }, 121 { IPCTL_DIRECTEDBCAST, &ip_directedbcast, 0, 1 }, 122 { IPCTL_IPPORT_FIRSTAUTO, &ipport_firstauto, 0, 65535 }, 123 { IPCTL_IPPORT_LASTAUTO, &ipport_lastauto, 0, 65535 }, 124 { IPCTL_IPPORT_HIFIRSTAUTO, &ipport_hifirstauto, 0, 65535 }, 125 { IPCTL_IPPORT_HILASTAUTO, &ipport_hilastauto, 0, 65535 }, 126 { IPCTL_IPPORT_MAXQUEUE, &ip_maxqueue, 0, 10000 }, 127 { IPCTL_MFORWARDING, &ipmforwarding, 0, 1 }, 128 { IPCTL_MULTIPATH, &ipmultipath, 0, 1 }, 129 { IPCTL_ARPTIMEOUT, &arpt_keep, 0, INT_MAX }, 130 { IPCTL_ARPDOWN, &arpt_down, 0, INT_MAX }, 131 }; 132 133 struct pool ipqent_pool; 134 struct pool ipq_pool; 135 136 struct cpumem *ipcounters; 137 138 int ip_sysctl_ipstat(void *, size_t *, void *); 139 140 static struct mbuf_queue ipsend_mq; 141 static struct mbuf_queue ipsendraw_mq; 142 143 extern struct niqueue arpinq; 144 145 int ip_ours(struct mbuf **, int *, int, int); 146 int ip_dooptions(struct mbuf *, struct ifnet *); 147 int in_ouraddr(struct mbuf *, struct ifnet *, struct rtentry **); 148 149 static void ip_send_dispatch(void *); 150 static void ip_sendraw_dispatch(void *); 151 static struct task ipsend_task = TASK_INITIALIZER(ip_send_dispatch, &ipsend_mq); 152 static struct task ipsendraw_task = 153 TASK_INITIALIZER(ip_sendraw_dispatch, &ipsendraw_mq); 154 155 /* 156 * Used to save the IP options in case a protocol wants to respond 157 * to an incoming packet over the same route if the packet got here 158 * using IP source routing. This allows connection establishment and 159 * maintenance when the remote end is on a network that is not known 160 * to us. 161 */ 162 struct ip_srcrt { 163 int isr_nhops; /* number of hops */ 164 struct in_addr isr_dst; /* final destination */ 165 char isr_nop; /* one NOP to align */ 166 char isr_hdr[IPOPT_OFFSET + 1]; /* OPTVAL, OLEN & OFFSET */ 167 struct in_addr isr_routes[MAX_IPOPTLEN/sizeof(struct in_addr)]; 168 }; 169 170 void save_rte(struct mbuf *, u_char *, struct in_addr); 171 172 /* 173 * IP initialization: fill in IP protocol switch table. 174 * All protocols not implemented in kernel go to raw IP protocol handler. 175 */ 176 void 177 ip_init(void) 178 { 179 const struct protosw *pr; 180 int i; 181 const u_int16_t defbaddynamicports_tcp[] = DEFBADDYNAMICPORTS_TCP; 182 const u_int16_t defbaddynamicports_udp[] = DEFBADDYNAMICPORTS_UDP; 183 const u_int16_t defrootonlyports_tcp[] = DEFROOTONLYPORTS_TCP; 184 const u_int16_t defrootonlyports_udp[] = DEFROOTONLYPORTS_UDP; 185 186 ipcounters = counters_alloc(ips_ncounters); 187 188 pool_init(&ipqent_pool, sizeof(struct ipqent), 0, 189 IPL_SOFTNET, 0, "ipqe", NULL); 190 pool_init(&ipq_pool, sizeof(struct ipq), 0, 191 IPL_SOFTNET, 0, "ipq", NULL); 192 193 pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); 194 if (pr == NULL) 195 panic("ip_init"); 196 for (i = 0; i < IPPROTO_MAX; i++) 197 ip_protox[i] = pr - inetsw; 198 for (pr = inetdomain.dom_protosw; 199 pr < inetdomain.dom_protoswNPROTOSW; pr++) 200 if (pr->pr_domain->dom_family == PF_INET && 201 pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW && 202 pr->pr_protocol < IPPROTO_MAX) 203 ip_protox[pr->pr_protocol] = pr - inetsw; 204 LIST_INIT(&ipq); 205 if (ip_mtudisc != 0) 206 ip_mtudisc_timeout_q = 207 rt_timer_queue_create(ip_mtudisc_timeout); 208 209 /* Fill in list of ports not to allocate dynamically. */ 210 memset(&baddynamicports, 0, sizeof(baddynamicports)); 211 for (i = 0; defbaddynamicports_tcp[i] != 0; i++) 212 DP_SET(baddynamicports.tcp, defbaddynamicports_tcp[i]); 213 for (i = 0; defbaddynamicports_udp[i] != 0; i++) 214 DP_SET(baddynamicports.udp, defbaddynamicports_udp[i]); 215 216 /* Fill in list of ports only root can bind to. */ 217 memset(&rootonlyports, 0, sizeof(rootonlyports)); 218 for (i = 0; defrootonlyports_tcp[i] != 0; i++) 219 DP_SET(rootonlyports.tcp, defrootonlyports_tcp[i]); 220 for (i = 0; defrootonlyports_udp[i] != 0; i++) 221 DP_SET(rootonlyports.udp, defrootonlyports_udp[i]); 222 223 mq_init(&ipsend_mq, 64, IPL_SOFTNET); 224 mq_init(&ipsendraw_mq, 64, IPL_SOFTNET); 225 226 arpinit(); 227 #ifdef IPSEC 228 ipsec_init(); 229 #endif 230 } 231 232 /* 233 * IPv4 input routine. 234 * 235 * Checksum and byte swap header. Process options. Forward or deliver. 236 */ 237 void 238 ipv4_input(struct ifnet *ifp, struct mbuf *m) 239 { 240 int off, nxt; 241 242 off = 0; 243 nxt = ip_input_if(&m, &off, IPPROTO_IPV4, AF_UNSPEC, ifp); 244 KASSERT(nxt == IPPROTO_DONE); 245 } 246 247 int 248 ip_input_if(struct mbuf **mp, int *offp, int nxt, int af, struct ifnet *ifp) 249 { 250 struct mbuf *m = *mp; 251 struct rtentry *rt = NULL; 252 struct ip *ip; 253 int hlen, len; 254 in_addr_t pfrdr = 0; 255 256 KASSERT(*offp == 0); 257 258 ipstat_inc(ips_total); 259 if (m->m_len < sizeof (struct ip) && 260 (m = *mp = m_pullup(m, sizeof (struct ip))) == NULL) { 261 ipstat_inc(ips_toosmall); 262 goto bad; 263 } 264 ip = mtod(m, struct ip *); 265 if (ip->ip_v != IPVERSION) { 266 ipstat_inc(ips_badvers); 267 goto bad; 268 } 269 hlen = ip->ip_hl << 2; 270 if (hlen < sizeof(struct ip)) { /* minimum header length */ 271 ipstat_inc(ips_badhlen); 272 goto bad; 273 } 274 if (hlen > m->m_len) { 275 if ((m = *mp = m_pullup(m, hlen)) == NULL) { 276 ipstat_inc(ips_badhlen); 277 goto bad; 278 } 279 ip = mtod(m, struct ip *); 280 } 281 282 /* 127/8 must not appear on wire - RFC1122 */ 283 if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || 284 (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { 285 if ((ifp->if_flags & IFF_LOOPBACK) == 0) { 286 ipstat_inc(ips_badaddr); 287 goto bad; 288 } 289 } 290 291 if ((m->m_pkthdr.csum_flags & M_IPV4_CSUM_IN_OK) == 0) { 292 if (m->m_pkthdr.csum_flags & M_IPV4_CSUM_IN_BAD) { 293 ipstat_inc(ips_badsum); 294 goto bad; 295 } 296 297 ipstat_inc(ips_inswcsum); 298 if (in_cksum(m, hlen) != 0) { 299 ipstat_inc(ips_badsum); 300 goto bad; 301 } 302 } 303 304 /* Retrieve the packet length. */ 305 len = ntohs(ip->ip_len); 306 307 /* 308 * Convert fields to host representation. 309 */ 310 if (len < hlen) { 311 ipstat_inc(ips_badlen); 312 goto bad; 313 } 314 315 /* 316 * Check that the amount of data in the buffers 317 * is at least as much as the IP header would have us expect. 318 * Trim mbufs if longer than we expect. 319 * Drop packet if shorter than we expect. 320 */ 321 if (m->m_pkthdr.len < len) { 322 ipstat_inc(ips_tooshort); 323 goto bad; 324 } 325 if (m->m_pkthdr.len > len) { 326 if (m->m_len == m->m_pkthdr.len) { 327 m->m_len = len; 328 m->m_pkthdr.len = len; 329 } else 330 m_adj(m, len - m->m_pkthdr.len); 331 } 332 333 #if NCARP > 0 334 if (carp_lsdrop(ifp, m, AF_INET, &ip->ip_src.s_addr, 335 &ip->ip_dst.s_addr, (ip->ip_p == IPPROTO_ICMP ? 0 : 1))) 336 goto bad; 337 #endif 338 339 #if NPF > 0 340 /* 341 * Packet filter 342 */ 343 pfrdr = ip->ip_dst.s_addr; 344 if (pf_test(AF_INET, PF_IN, ifp, mp) != PF_PASS) 345 goto bad; 346 m = *mp; 347 if (m == NULL) 348 goto bad; 349 350 ip = mtod(m, struct ip *); 351 hlen = ip->ip_hl << 2; 352 pfrdr = (pfrdr != ip->ip_dst.s_addr); 353 #endif 354 355 /* 356 * Process options and, if not destined for us, 357 * ship it on. ip_dooptions returns 1 when an 358 * error was detected (causing an icmp message 359 * to be sent and the original packet to be freed). 360 */ 361 if (hlen > sizeof (struct ip) && ip_dooptions(m, ifp)) { 362 m = *mp = NULL; 363 goto bad; 364 } 365 366 if (ip->ip_dst.s_addr == INADDR_BROADCAST || 367 ip->ip_dst.s_addr == INADDR_ANY) { 368 nxt = ip_ours(mp, offp, nxt, af); 369 goto out; 370 } 371 372 switch(in_ouraddr(m, ifp, &rt)) { 373 case 2: 374 goto bad; 375 case 1: 376 nxt = ip_ours(mp, offp, nxt, af); 377 goto out; 378 } 379 380 if (IN_MULTICAST(ip->ip_dst.s_addr)) { 381 /* 382 * Make sure M_MCAST is set. It should theoretically 383 * already be there, but let's play safe because upper 384 * layers check for this flag. 385 */ 386 m->m_flags |= M_MCAST; 387 388 #ifdef MROUTING 389 if (ipmforwarding && ip_mrouter[ifp->if_rdomain]) { 390 int error; 391 392 if (m->m_flags & M_EXT) { 393 if ((m = *mp = m_pullup(m, hlen)) == NULL) { 394 ipstat_inc(ips_toosmall); 395 goto bad; 396 } 397 ip = mtod(m, struct ip *); 398 } 399 /* 400 * If we are acting as a multicast router, all 401 * incoming multicast packets are passed to the 402 * kernel-level multicast forwarding function. 403 * The packet is returned (relatively) intact; if 404 * ip_mforward() returns a non-zero value, the packet 405 * must be discarded, else it may be accepted below. 406 * 407 * (The IP ident field is put in the same byte order 408 * as expected when ip_mforward() is called from 409 * ip_output().) 410 */ 411 KERNEL_LOCK(); 412 error = ip_mforward(m, ifp); 413 KERNEL_UNLOCK(); 414 if (error) { 415 ipstat_inc(ips_cantforward); 416 goto bad; 417 } 418 419 /* 420 * The process-level routing daemon needs to receive 421 * all multicast IGMP packets, whether or not this 422 * host belongs to their destination groups. 423 */ 424 if (ip->ip_p == IPPROTO_IGMP) { 425 nxt = ip_ours(mp, offp, nxt, af); 426 goto out; 427 } 428 ipstat_inc(ips_forward); 429 } 430 #endif 431 /* 432 * See if we belong to the destination multicast group on the 433 * arrival interface. 434 */ 435 if (!in_hasmulti(&ip->ip_dst, ifp)) { 436 ipstat_inc(ips_notmember); 437 if (!IN_LOCAL_GROUP(ip->ip_dst.s_addr)) 438 ipstat_inc(ips_cantforward); 439 goto bad; 440 } 441 nxt = ip_ours(mp, offp, nxt, af); 442 goto out; 443 } 444 445 #if NCARP > 0 446 if (ip->ip_p == IPPROTO_ICMP && 447 carp_lsdrop(ifp, m, AF_INET, &ip->ip_src.s_addr, 448 &ip->ip_dst.s_addr, 1)) 449 goto bad; 450 #endif 451 /* 452 * Not for us; forward if possible and desirable. 453 */ 454 if (ipforwarding == 0) { 455 ipstat_inc(ips_cantforward); 456 goto bad; 457 } 458 #ifdef IPSEC 459 if (ipsec_in_use) { 460 int rv; 461 462 rv = ipsec_forward_check(m, hlen, AF_INET); 463 if (rv != 0) { 464 ipstat_inc(ips_cantforward); 465 goto bad; 466 } 467 /* 468 * Fall through, forward packet. Outbound IPsec policy 469 * checking will occur in ip_output(). 470 */ 471 } 472 #endif /* IPSEC */ 473 474 ip_forward(m, ifp, rt, pfrdr); 475 *mp = NULL; 476 return IPPROTO_DONE; 477 bad: 478 nxt = IPPROTO_DONE; 479 m_freemp(mp); 480 out: 481 rtfree(rt); 482 return nxt; 483 } 484 485 /* 486 * IPv4 local-delivery routine. 487 * 488 * If fragmented try to reassemble. Pass to next level. 489 */ 490 int 491 ip_ours(struct mbuf **mp, int *offp, int nxt, int af) 492 { 493 struct mbuf *m = *mp; 494 struct ip *ip = mtod(m, struct ip *); 495 struct ipq *fp; 496 struct ipqent *ipqe; 497 int mff, hlen; 498 499 hlen = ip->ip_hl << 2; 500 501 /* 502 * If offset or IP_MF are set, must reassemble. 503 * Otherwise, nothing need be done. 504 * (We could look in the reassembly queue to see 505 * if the packet was previously fragmented, 506 * but it's not worth the time; just let them time out.) 507 */ 508 if (ip->ip_off &~ htons(IP_DF | IP_RF)) { 509 if (m->m_flags & M_EXT) { /* XXX */ 510 if ((m = *mp = m_pullup(m, hlen)) == NULL) { 511 ipstat_inc(ips_toosmall); 512 return IPPROTO_DONE; 513 } 514 ip = mtod(m, struct ip *); 515 } 516 517 mtx_enter(&ipq_mutex); 518 519 /* 520 * Look for queue of fragments 521 * of this datagram. 522 */ 523 LIST_FOREACH(fp, &ipq, ipq_q) { 524 if (ip->ip_id == fp->ipq_id && 525 ip->ip_src.s_addr == fp->ipq_src.s_addr && 526 ip->ip_dst.s_addr == fp->ipq_dst.s_addr && 527 ip->ip_p == fp->ipq_p) 528 break; 529 } 530 531 /* 532 * Adjust ip_len to not reflect header, 533 * set ipqe_mff if more fragments are expected, 534 * convert offset of this to bytes. 535 */ 536 ip->ip_len = htons(ntohs(ip->ip_len) - hlen); 537 mff = (ip->ip_off & htons(IP_MF)) != 0; 538 if (mff) { 539 /* 540 * Make sure that fragments have a data length 541 * that's a non-zero multiple of 8 bytes. 542 */ 543 if (ntohs(ip->ip_len) == 0 || 544 (ntohs(ip->ip_len) & 0x7) != 0) { 545 ipstat_inc(ips_badfrags); 546 goto bad; 547 } 548 } 549 ip->ip_off = htons(ntohs(ip->ip_off) << 3); 550 551 /* 552 * If datagram marked as having more fragments 553 * or if this is not the first fragment, 554 * attempt reassembly; if it succeeds, proceed. 555 */ 556 if (mff || ip->ip_off) { 557 ipstat_inc(ips_fragments); 558 if (ip_frags + 1 > ip_maxqueue) { 559 ip_flush(); 560 ipstat_inc(ips_rcvmemdrop); 561 goto bad; 562 } 563 564 ipqe = pool_get(&ipqent_pool, PR_NOWAIT); 565 if (ipqe == NULL) { 566 ipstat_inc(ips_rcvmemdrop); 567 goto bad; 568 } 569 ip_frags++; 570 ipqe->ipqe_mff = mff; 571 ipqe->ipqe_m = m; 572 ipqe->ipqe_ip = ip; 573 m = *mp = ip_reass(ipqe, fp); 574 if (m == NULL) 575 goto bad; 576 ipstat_inc(ips_reassembled); 577 ip = mtod(m, struct ip *); 578 hlen = ip->ip_hl << 2; 579 ip->ip_len = htons(ntohs(ip->ip_len) + hlen); 580 } else 581 if (fp) 582 ip_freef(fp); 583 584 mtx_leave(&ipq_mutex); 585 } 586 587 *offp = hlen; 588 nxt = ip->ip_p; 589 /* Check whether we are already in a IPv4/IPv6 local deliver loop. */ 590 if (af == AF_UNSPEC) 591 nxt = ip_deliver(mp, offp, nxt, AF_INET); 592 return nxt; 593 bad: 594 mtx_leave(&ipq_mutex); 595 m_freemp(mp); 596 return IPPROTO_DONE; 597 } 598 599 #ifndef INET6 600 #define IPSTAT_INC(name) ipstat_inc(ips_##name) 601 #else 602 #define IPSTAT_INC(name) (af == AF_INET ? \ 603 ipstat_inc(ips_##name) : ip6stat_inc(ip6s_##name)) 604 #endif 605 606 int 607 ip_deliver(struct mbuf **mp, int *offp, int nxt, int af) 608 { 609 const struct protosw *psw; 610 int naf = af; 611 #ifdef INET6 612 int nest = 0; 613 #endif /* INET6 */ 614 615 /* pf might have modified stuff, might have to chksum */ 616 switch (af) { 617 case AF_INET: 618 in_proto_cksum_out(*mp, NULL); 619 break; 620 #ifdef INET6 621 case AF_INET6: 622 in6_proto_cksum_out(*mp, NULL); 623 break; 624 #endif /* INET6 */ 625 } 626 627 /* 628 * Tell launch routine the next header 629 */ 630 IPSTAT_INC(delivered); 631 632 while (nxt != IPPROTO_DONE) { 633 #ifdef INET6 634 if (af == AF_INET6 && 635 ip6_hdrnestlimit && (++nest > ip6_hdrnestlimit)) { 636 ip6stat_inc(ip6s_toomanyhdr); 637 goto bad; 638 } 639 #endif /* INET6 */ 640 641 /* 642 * protection against faulty packet - there should be 643 * more sanity checks in header chain processing. 644 */ 645 if ((*mp)->m_pkthdr.len < *offp) { 646 IPSTAT_INC(tooshort); 647 goto bad; 648 } 649 650 #ifdef IPSEC 651 if (ipsec_in_use) { 652 if (ipsec_local_check(*mp, *offp, nxt, af) != 0) { 653 IPSTAT_INC(cantforward); 654 goto bad; 655 } 656 } 657 /* Otherwise, just fall through and deliver the packet */ 658 #endif /* IPSEC */ 659 660 switch (nxt) { 661 case IPPROTO_IPV4: 662 naf = AF_INET; 663 ipstat_inc(ips_delivered); 664 break; 665 #ifdef INET6 666 case IPPROTO_IPV6: 667 naf = AF_INET6; 668 ip6stat_inc(ip6s_delivered); 669 break; 670 #endif /* INET6 */ 671 } 672 switch (af) { 673 case AF_INET: 674 psw = &inetsw[ip_protox[nxt]]; 675 break; 676 #ifdef INET6 677 case AF_INET6: 678 psw = &inet6sw[ip6_protox[nxt]]; 679 break; 680 #endif /* INET6 */ 681 } 682 nxt = (*psw->pr_input)(mp, offp, nxt, af); 683 af = naf; 684 } 685 return nxt; 686 bad: 687 m_freemp(mp); 688 return IPPROTO_DONE; 689 } 690 #undef IPSTAT_INC 691 692 int 693 in_ouraddr(struct mbuf *m, struct ifnet *ifp, struct rtentry **prt) 694 { 695 struct rtentry *rt; 696 struct ip *ip; 697 struct sockaddr_in sin; 698 int match = 0; 699 700 #if NPF > 0 701 switch (pf_ouraddr(m)) { 702 case 0: 703 return (0); 704 case 1: 705 return (1); 706 default: 707 /* pf does not know it */ 708 break; 709 } 710 #endif 711 712 ip = mtod(m, struct ip *); 713 714 memset(&sin, 0, sizeof(sin)); 715 sin.sin_len = sizeof(sin); 716 sin.sin_family = AF_INET; 717 sin.sin_addr = ip->ip_dst; 718 rt = rtalloc_mpath(sintosa(&sin), &ip->ip_src.s_addr, 719 m->m_pkthdr.ph_rtableid); 720 if (rtisvalid(rt)) { 721 if (ISSET(rt->rt_flags, RTF_LOCAL)) 722 match = 1; 723 724 /* 725 * If directedbcast is enabled we only consider it local 726 * if it is received on the interface with that address. 727 */ 728 if (ISSET(rt->rt_flags, RTF_BROADCAST) && 729 (!ip_directedbcast || rt->rt_ifidx == ifp->if_index)) { 730 match = 1; 731 732 /* Make sure M_BCAST is set */ 733 m->m_flags |= M_BCAST; 734 } 735 } 736 *prt = rt; 737 738 if (!match) { 739 struct ifaddr *ifa; 740 741 /* 742 * No local address or broadcast address found, so check for 743 * ancient classful broadcast addresses. 744 * It must have been broadcast on the link layer, and for an 745 * address on the interface it was received on. 746 */ 747 if (!ISSET(m->m_flags, M_BCAST) || 748 !IN_CLASSFULBROADCAST(ip->ip_dst.s_addr, ip->ip_dst.s_addr)) 749 return (0); 750 751 if (ifp->if_rdomain != rtable_l2(m->m_pkthdr.ph_rtableid)) 752 return (0); 753 /* 754 * The check in the loop assumes you only rx a packet on an UP 755 * interface, and that M_BCAST will only be set on a BROADCAST 756 * interface. 757 */ 758 NET_ASSERT_LOCKED(); 759 TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { 760 if (ifa->ifa_addr->sa_family != AF_INET) 761 continue; 762 763 if (IN_CLASSFULBROADCAST(ip->ip_dst.s_addr, 764 ifatoia(ifa)->ia_addr.sin_addr.s_addr)) { 765 match = 1; 766 break; 767 } 768 } 769 } else if (ipforwarding == 0 && rt->rt_ifidx != ifp->if_index && 770 !((ifp->if_flags & IFF_LOOPBACK) || (ifp->if_type == IFT_ENC) || 771 (m->m_pkthdr.pf.flags & PF_TAG_TRANSLATE_LOCALHOST))) { 772 /* received on wrong interface. */ 773 #if NCARP > 0 774 struct ifnet *out_if; 775 776 /* 777 * Virtual IPs on carp interfaces need to be checked also 778 * against the parent interface and other carp interfaces 779 * sharing the same parent. 780 */ 781 out_if = if_get(rt->rt_ifidx); 782 if (!(out_if && carp_strict_addr_chk(out_if, ifp))) { 783 ipstat_inc(ips_wrongif); 784 match = 2; 785 } 786 if_put(out_if); 787 #else 788 ipstat_inc(ips_wrongif); 789 match = 2; 790 #endif 791 } 792 793 return (match); 794 } 795 796 /* 797 * Take incoming datagram fragment and try to 798 * reassemble it into whole datagram. If a chain for 799 * reassembly of this datagram already exists, then it 800 * is given as fp; otherwise have to make a chain. 801 */ 802 struct mbuf * 803 ip_reass(struct ipqent *ipqe, struct ipq *fp) 804 { 805 struct mbuf *m = ipqe->ipqe_m; 806 struct ipqent *nq, *p, *q; 807 struct ip *ip; 808 struct mbuf *t; 809 int hlen = ipqe->ipqe_ip->ip_hl << 2; 810 int i, next; 811 u_int8_t ecn, ecn0; 812 813 MUTEX_ASSERT_LOCKED(&ipq_mutex); 814 815 /* 816 * Presence of header sizes in mbufs 817 * would confuse code below. 818 */ 819 m->m_data += hlen; 820 m->m_len -= hlen; 821 822 /* 823 * If first fragment to arrive, create a reassembly queue. 824 */ 825 if (fp == NULL) { 826 fp = pool_get(&ipq_pool, PR_NOWAIT); 827 if (fp == NULL) 828 goto dropfrag; 829 LIST_INSERT_HEAD(&ipq, fp, ipq_q); 830 fp->ipq_ttl = IPFRAGTTL; 831 fp->ipq_p = ipqe->ipqe_ip->ip_p; 832 fp->ipq_id = ipqe->ipqe_ip->ip_id; 833 LIST_INIT(&fp->ipq_fragq); 834 fp->ipq_src = ipqe->ipqe_ip->ip_src; 835 fp->ipq_dst = ipqe->ipqe_ip->ip_dst; 836 p = NULL; 837 goto insert; 838 } 839 840 /* 841 * Handle ECN by comparing this segment with the first one; 842 * if CE is set, do not lose CE. 843 * drop if CE and not-ECT are mixed for the same packet. 844 */ 845 ecn = ipqe->ipqe_ip->ip_tos & IPTOS_ECN_MASK; 846 ecn0 = LIST_FIRST(&fp->ipq_fragq)->ipqe_ip->ip_tos & IPTOS_ECN_MASK; 847 if (ecn == IPTOS_ECN_CE) { 848 if (ecn0 == IPTOS_ECN_NOTECT) 849 goto dropfrag; 850 if (ecn0 != IPTOS_ECN_CE) 851 LIST_FIRST(&fp->ipq_fragq)->ipqe_ip->ip_tos |= 852 IPTOS_ECN_CE; 853 } 854 if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT) 855 goto dropfrag; 856 857 /* 858 * Find a segment which begins after this one does. 859 */ 860 for (p = NULL, q = LIST_FIRST(&fp->ipq_fragq); q != NULL; 861 p = q, q = LIST_NEXT(q, ipqe_q)) 862 if (ntohs(q->ipqe_ip->ip_off) > ntohs(ipqe->ipqe_ip->ip_off)) 863 break; 864 865 /* 866 * If there is a preceding segment, it may provide some of 867 * our data already. If so, drop the data from the incoming 868 * segment. If it provides all of our data, drop us. 869 */ 870 if (p != NULL) { 871 i = ntohs(p->ipqe_ip->ip_off) + ntohs(p->ipqe_ip->ip_len) - 872 ntohs(ipqe->ipqe_ip->ip_off); 873 if (i > 0) { 874 if (i >= ntohs(ipqe->ipqe_ip->ip_len)) 875 goto dropfrag; 876 m_adj(ipqe->ipqe_m, i); 877 ipqe->ipqe_ip->ip_off = 878 htons(ntohs(ipqe->ipqe_ip->ip_off) + i); 879 ipqe->ipqe_ip->ip_len = 880 htons(ntohs(ipqe->ipqe_ip->ip_len) - i); 881 } 882 } 883 884 /* 885 * While we overlap succeeding segments trim them or, 886 * if they are completely covered, dequeue them. 887 */ 888 for (; q != NULL && 889 ntohs(ipqe->ipqe_ip->ip_off) + ntohs(ipqe->ipqe_ip->ip_len) > 890 ntohs(q->ipqe_ip->ip_off); q = nq) { 891 i = (ntohs(ipqe->ipqe_ip->ip_off) + 892 ntohs(ipqe->ipqe_ip->ip_len)) - ntohs(q->ipqe_ip->ip_off); 893 if (i < ntohs(q->ipqe_ip->ip_len)) { 894 q->ipqe_ip->ip_len = 895 htons(ntohs(q->ipqe_ip->ip_len) - i); 896 q->ipqe_ip->ip_off = 897 htons(ntohs(q->ipqe_ip->ip_off) + i); 898 m_adj(q->ipqe_m, i); 899 break; 900 } 901 nq = LIST_NEXT(q, ipqe_q); 902 m_freem(q->ipqe_m); 903 LIST_REMOVE(q, ipqe_q); 904 pool_put(&ipqent_pool, q); 905 ip_frags--; 906 } 907 908 insert: 909 /* 910 * Stick new segment in its place; 911 * check for complete reassembly. 912 */ 913 if (p == NULL) { 914 LIST_INSERT_HEAD(&fp->ipq_fragq, ipqe, ipqe_q); 915 } else { 916 LIST_INSERT_AFTER(p, ipqe, ipqe_q); 917 } 918 next = 0; 919 for (p = NULL, q = LIST_FIRST(&fp->ipq_fragq); q != NULL; 920 p = q, q = LIST_NEXT(q, ipqe_q)) { 921 if (ntohs(q->ipqe_ip->ip_off) != next) 922 return (0); 923 next += ntohs(q->ipqe_ip->ip_len); 924 } 925 if (p->ipqe_mff) 926 return (0); 927 928 /* 929 * Reassembly is complete. Check for a bogus message size and 930 * concatenate fragments. 931 */ 932 q = LIST_FIRST(&fp->ipq_fragq); 933 ip = q->ipqe_ip; 934 if ((next + (ip->ip_hl << 2)) > IP_MAXPACKET) { 935 ipstat_inc(ips_toolong); 936 ip_freef(fp); 937 return (0); 938 } 939 m = q->ipqe_m; 940 t = m->m_next; 941 m->m_next = 0; 942 m_cat(m, t); 943 nq = LIST_NEXT(q, ipqe_q); 944 pool_put(&ipqent_pool, q); 945 ip_frags--; 946 for (q = nq; q != NULL; q = nq) { 947 t = q->ipqe_m; 948 nq = LIST_NEXT(q, ipqe_q); 949 pool_put(&ipqent_pool, q); 950 ip_frags--; 951 m_removehdr(t); 952 m_cat(m, t); 953 } 954 955 /* 956 * Create header for new ip packet by 957 * modifying header of first packet; 958 * dequeue and discard fragment reassembly header. 959 * Make header visible. 960 */ 961 ip->ip_len = htons(next); 962 ip->ip_src = fp->ipq_src; 963 ip->ip_dst = fp->ipq_dst; 964 LIST_REMOVE(fp, ipq_q); 965 pool_put(&ipq_pool, fp); 966 m->m_len += (ip->ip_hl << 2); 967 m->m_data -= (ip->ip_hl << 2); 968 m_calchdrlen(m); 969 return (m); 970 971 dropfrag: 972 ipstat_inc(ips_fragdropped); 973 m_freem(m); 974 pool_put(&ipqent_pool, ipqe); 975 ip_frags--; 976 return (NULL); 977 } 978 979 /* 980 * Free a fragment reassembly header and all 981 * associated datagrams. 982 */ 983 void 984 ip_freef(struct ipq *fp) 985 { 986 struct ipqent *q; 987 988 MUTEX_ASSERT_LOCKED(&ipq_mutex); 989 990 while ((q = LIST_FIRST(&fp->ipq_fragq)) != NULL) { 991 LIST_REMOVE(q, ipqe_q); 992 m_freem(q->ipqe_m); 993 pool_put(&ipqent_pool, q); 994 ip_frags--; 995 } 996 LIST_REMOVE(fp, ipq_q); 997 pool_put(&ipq_pool, fp); 998 } 999 1000 /* 1001 * IP timer processing; 1002 * if a timer expires on a reassembly queue, discard it. 1003 */ 1004 void 1005 ip_slowtimo(void) 1006 { 1007 struct ipq *fp, *nfp; 1008 1009 mtx_enter(&ipq_mutex); 1010 LIST_FOREACH_SAFE(fp, &ipq, ipq_q, nfp) { 1011 if (--fp->ipq_ttl == 0) { 1012 ipstat_inc(ips_fragtimeout); 1013 ip_freef(fp); 1014 } 1015 } 1016 mtx_leave(&ipq_mutex); 1017 } 1018 1019 /* 1020 * Flush a bunch of datagram fragments, till we are down to 75%. 1021 */ 1022 void 1023 ip_flush(void) 1024 { 1025 int max = 50; 1026 1027 MUTEX_ASSERT_LOCKED(&ipq_mutex); 1028 1029 while (!LIST_EMPTY(&ipq) && ip_frags > ip_maxqueue * 3 / 4 && --max) { 1030 ipstat_inc(ips_fragdropped); 1031 ip_freef(LIST_FIRST(&ipq)); 1032 } 1033 } 1034 1035 /* 1036 * Do option processing on a datagram, 1037 * possibly discarding it if bad options are encountered, 1038 * or forwarding it if source-routed. 1039 * Returns 1 if packet has been forwarded/freed, 1040 * 0 if the packet should be processed further. 1041 */ 1042 int 1043 ip_dooptions(struct mbuf *m, struct ifnet *ifp) 1044 { 1045 struct ip *ip = mtod(m, struct ip *); 1046 unsigned int rtableid = m->m_pkthdr.ph_rtableid; 1047 struct rtentry *rt; 1048 struct sockaddr_in ipaddr; 1049 u_char *cp; 1050 struct ip_timestamp ipt; 1051 struct in_ifaddr *ia; 1052 int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0; 1053 struct in_addr sin, dst; 1054 u_int32_t ntime; 1055 1056 dst = ip->ip_dst; 1057 cp = (u_char *)(ip + 1); 1058 cnt = (ip->ip_hl << 2) - sizeof (struct ip); 1059 1060 KERNEL_LOCK(); 1061 for (; cnt > 0; cnt -= optlen, cp += optlen) { 1062 opt = cp[IPOPT_OPTVAL]; 1063 if (opt == IPOPT_EOL) 1064 break; 1065 if (opt == IPOPT_NOP) 1066 optlen = 1; 1067 else { 1068 if (cnt < IPOPT_OLEN + sizeof(*cp)) { 1069 code = &cp[IPOPT_OLEN] - (u_char *)ip; 1070 goto bad; 1071 } 1072 optlen = cp[IPOPT_OLEN]; 1073 if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) { 1074 code = &cp[IPOPT_OLEN] - (u_char *)ip; 1075 goto bad; 1076 } 1077 } 1078 1079 switch (opt) { 1080 1081 default: 1082 break; 1083 1084 /* 1085 * Source routing with record. 1086 * Find interface with current destination address. 1087 * If none on this machine then drop if strictly routed, 1088 * or do nothing if loosely routed. 1089 * Record interface address and bring up next address 1090 * component. If strictly routed make sure next 1091 * address is on directly accessible net. 1092 */ 1093 case IPOPT_LSRR: 1094 case IPOPT_SSRR: 1095 if (!ip_dosourceroute) { 1096 type = ICMP_UNREACH; 1097 code = ICMP_UNREACH_SRCFAIL; 1098 goto bad; 1099 } 1100 if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { 1101 code = &cp[IPOPT_OFFSET] - (u_char *)ip; 1102 goto bad; 1103 } 1104 memset(&ipaddr, 0, sizeof(ipaddr)); 1105 ipaddr.sin_family = AF_INET; 1106 ipaddr.sin_len = sizeof(ipaddr); 1107 ipaddr.sin_addr = ip->ip_dst; 1108 ia = ifatoia(ifa_ifwithaddr(sintosa(&ipaddr), 1109 m->m_pkthdr.ph_rtableid)); 1110 if (ia == NULL) { 1111 if (opt == IPOPT_SSRR) { 1112 type = ICMP_UNREACH; 1113 code = ICMP_UNREACH_SRCFAIL; 1114 goto bad; 1115 } 1116 /* 1117 * Loose routing, and not at next destination 1118 * yet; nothing to do except forward. 1119 */ 1120 break; 1121 } 1122 off--; /* 0 origin */ 1123 if ((off + sizeof(struct in_addr)) > optlen) { 1124 /* 1125 * End of source route. Should be for us. 1126 */ 1127 save_rte(m, cp, ip->ip_src); 1128 break; 1129 } 1130 1131 /* 1132 * locate outgoing interface 1133 */ 1134 memset(&ipaddr, 0, sizeof(ipaddr)); 1135 ipaddr.sin_family = AF_INET; 1136 ipaddr.sin_len = sizeof(ipaddr); 1137 memcpy(&ipaddr.sin_addr, cp + off, 1138 sizeof(ipaddr.sin_addr)); 1139 /* keep packet in the virtual instance */ 1140 rt = rtalloc(sintosa(&ipaddr), RT_RESOLVE, rtableid); 1141 if (!rtisvalid(rt) || ((opt == IPOPT_SSRR) && 1142 ISSET(rt->rt_flags, RTF_GATEWAY))) { 1143 type = ICMP_UNREACH; 1144 code = ICMP_UNREACH_SRCFAIL; 1145 rtfree(rt); 1146 goto bad; 1147 } 1148 ia = ifatoia(rt->rt_ifa); 1149 memcpy(cp + off, &ia->ia_addr.sin_addr, 1150 sizeof(struct in_addr)); 1151 rtfree(rt); 1152 cp[IPOPT_OFFSET] += sizeof(struct in_addr); 1153 ip->ip_dst = ipaddr.sin_addr; 1154 /* 1155 * Let ip_intr's mcast routing check handle mcast pkts 1156 */ 1157 forward = !IN_MULTICAST(ip->ip_dst.s_addr); 1158 break; 1159 1160 case IPOPT_RR: 1161 if (optlen < IPOPT_OFFSET + sizeof(*cp)) { 1162 code = &cp[IPOPT_OLEN] - (u_char *)ip; 1163 goto bad; 1164 } 1165 if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { 1166 code = &cp[IPOPT_OFFSET] - (u_char *)ip; 1167 goto bad; 1168 } 1169 1170 /* 1171 * If no space remains, ignore. 1172 */ 1173 off--; /* 0 origin */ 1174 if ((off + sizeof(struct in_addr)) > optlen) 1175 break; 1176 memset(&ipaddr, 0, sizeof(ipaddr)); 1177 ipaddr.sin_family = AF_INET; 1178 ipaddr.sin_len = sizeof(ipaddr); 1179 ipaddr.sin_addr = ip->ip_dst; 1180 /* 1181 * locate outgoing interface; if we're the destination, 1182 * use the incoming interface (should be same). 1183 * Again keep the packet inside the virtual instance. 1184 */ 1185 rt = rtalloc(sintosa(&ipaddr), RT_RESOLVE, rtableid); 1186 if (!rtisvalid(rt)) { 1187 type = ICMP_UNREACH; 1188 code = ICMP_UNREACH_HOST; 1189 rtfree(rt); 1190 goto bad; 1191 } 1192 ia = ifatoia(rt->rt_ifa); 1193 memcpy(cp + off, &ia->ia_addr.sin_addr, 1194 sizeof(struct in_addr)); 1195 rtfree(rt); 1196 cp[IPOPT_OFFSET] += sizeof(struct in_addr); 1197 break; 1198 1199 case IPOPT_TS: 1200 code = cp - (u_char *)ip; 1201 if (optlen < sizeof(struct ip_timestamp)) 1202 goto bad; 1203 memcpy(&ipt, cp, sizeof(struct ip_timestamp)); 1204 if (ipt.ipt_ptr < 5 || ipt.ipt_len < 5) 1205 goto bad; 1206 if (ipt.ipt_ptr - 1 + sizeof(u_int32_t) > ipt.ipt_len) { 1207 if (++ipt.ipt_oflw == 0) 1208 goto bad; 1209 break; 1210 } 1211 memcpy(&sin, cp + ipt.ipt_ptr - 1, sizeof sin); 1212 switch (ipt.ipt_flg) { 1213 1214 case IPOPT_TS_TSONLY: 1215 break; 1216 1217 case IPOPT_TS_TSANDADDR: 1218 if (ipt.ipt_ptr - 1 + sizeof(u_int32_t) + 1219 sizeof(struct in_addr) > ipt.ipt_len) 1220 goto bad; 1221 memset(&ipaddr, 0, sizeof(ipaddr)); 1222 ipaddr.sin_family = AF_INET; 1223 ipaddr.sin_len = sizeof(ipaddr); 1224 ipaddr.sin_addr = dst; 1225 ia = ifatoia(ifaof_ifpforaddr(sintosa(&ipaddr), 1226 ifp)); 1227 if (ia == NULL) 1228 continue; 1229 memcpy(&sin, &ia->ia_addr.sin_addr, 1230 sizeof(struct in_addr)); 1231 ipt.ipt_ptr += sizeof(struct in_addr); 1232 break; 1233 1234 case IPOPT_TS_PRESPEC: 1235 if (ipt.ipt_ptr - 1 + sizeof(u_int32_t) + 1236 sizeof(struct in_addr) > ipt.ipt_len) 1237 goto bad; 1238 memset(&ipaddr, 0, sizeof(ipaddr)); 1239 ipaddr.sin_family = AF_INET; 1240 ipaddr.sin_len = sizeof(ipaddr); 1241 ipaddr.sin_addr = sin; 1242 if (ifa_ifwithaddr(sintosa(&ipaddr), 1243 m->m_pkthdr.ph_rtableid) == NULL) 1244 continue; 1245 ipt.ipt_ptr += sizeof(struct in_addr); 1246 break; 1247 1248 default: 1249 /* XXX can't take &ipt->ipt_flg */ 1250 code = (u_char *)&ipt.ipt_ptr - 1251 (u_char *)ip + 1; 1252 goto bad; 1253 } 1254 ntime = iptime(); 1255 memcpy(cp + ipt.ipt_ptr - 1, &ntime, sizeof(u_int32_t)); 1256 ipt.ipt_ptr += sizeof(u_int32_t); 1257 } 1258 } 1259 KERNEL_UNLOCK(); 1260 if (forward && ipforwarding > 0) { 1261 ip_forward(m, ifp, NULL, 1); 1262 return (1); 1263 } 1264 return (0); 1265 bad: 1266 KERNEL_UNLOCK(); 1267 icmp_error(m, type, code, 0, 0); 1268 ipstat_inc(ips_badoptions); 1269 return (1); 1270 } 1271 1272 /* 1273 * Save incoming source route for use in replies, 1274 * to be picked up later by ip_srcroute if the receiver is interested. 1275 */ 1276 void 1277 save_rte(struct mbuf *m, u_char *option, struct in_addr dst) 1278 { 1279 struct ip_srcrt *isr; 1280 struct m_tag *mtag; 1281 unsigned olen; 1282 1283 olen = option[IPOPT_OLEN]; 1284 if (olen > sizeof(isr->isr_hdr) + sizeof(isr->isr_routes)) 1285 return; 1286 1287 mtag = m_tag_get(PACKET_TAG_SRCROUTE, sizeof(*isr), M_NOWAIT); 1288 if (mtag == NULL) 1289 return; 1290 isr = (struct ip_srcrt *)(mtag + 1); 1291 1292 memcpy(isr->isr_hdr, option, olen); 1293 isr->isr_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr); 1294 isr->isr_dst = dst; 1295 m_tag_prepend(m, mtag); 1296 } 1297 1298 /* 1299 * Retrieve incoming source route for use in replies, 1300 * in the same form used by setsockopt. 1301 * The first hop is placed before the options, will be removed later. 1302 */ 1303 struct mbuf * 1304 ip_srcroute(struct mbuf *m0) 1305 { 1306 struct in_addr *p, *q; 1307 struct mbuf *m; 1308 struct ip_srcrt *isr; 1309 struct m_tag *mtag; 1310 1311 if (!ip_dosourceroute) 1312 return (NULL); 1313 1314 mtag = m_tag_find(m0, PACKET_TAG_SRCROUTE, NULL); 1315 if (mtag == NULL) 1316 return (NULL); 1317 isr = (struct ip_srcrt *)(mtag + 1); 1318 1319 if (isr->isr_nhops == 0) 1320 return (NULL); 1321 m = m_get(M_DONTWAIT, MT_SOOPTS); 1322 if (m == NULL) 1323 return (NULL); 1324 1325 #define OPTSIZ (sizeof(isr->isr_nop) + sizeof(isr->isr_hdr)) 1326 1327 /* length is (nhops+1)*sizeof(addr) + sizeof(nop + header) */ 1328 m->m_len = (isr->isr_nhops + 1) * sizeof(struct in_addr) + OPTSIZ; 1329 1330 /* 1331 * First save first hop for return route 1332 */ 1333 p = &(isr->isr_routes[isr->isr_nhops - 1]); 1334 *(mtod(m, struct in_addr *)) = *p--; 1335 1336 /* 1337 * Copy option fields and padding (nop) to mbuf. 1338 */ 1339 isr->isr_nop = IPOPT_NOP; 1340 isr->isr_hdr[IPOPT_OFFSET] = IPOPT_MINOFF; 1341 memcpy(mtod(m, caddr_t) + sizeof(struct in_addr), &isr->isr_nop, 1342 OPTSIZ); 1343 q = (struct in_addr *)(mtod(m, caddr_t) + 1344 sizeof(struct in_addr) + OPTSIZ); 1345 #undef OPTSIZ 1346 /* 1347 * Record return path as an IP source route, 1348 * reversing the path (pointers are now aligned). 1349 */ 1350 while (p >= isr->isr_routes) { 1351 *q++ = *p--; 1352 } 1353 /* 1354 * Last hop goes to final destination. 1355 */ 1356 *q = isr->isr_dst; 1357 m_tag_delete(m0, (struct m_tag *)isr); 1358 return (m); 1359 } 1360 1361 /* 1362 * Strip out IP options, at higher level protocol in the kernel. 1363 */ 1364 void 1365 ip_stripoptions(struct mbuf *m) 1366 { 1367 int i; 1368 struct ip *ip = mtod(m, struct ip *); 1369 caddr_t opts; 1370 int olen; 1371 1372 olen = (ip->ip_hl<<2) - sizeof (struct ip); 1373 opts = (caddr_t)(ip + 1); 1374 i = m->m_len - (sizeof (struct ip) + olen); 1375 memmove(opts, opts + olen, i); 1376 m->m_len -= olen; 1377 if (m->m_flags & M_PKTHDR) 1378 m->m_pkthdr.len -= olen; 1379 ip->ip_hl = sizeof(struct ip) >> 2; 1380 ip->ip_len = htons(ntohs(ip->ip_len) - olen); 1381 } 1382 1383 const u_char inetctlerrmap[PRC_NCMDS] = { 1384 0, 0, 0, 0, 1385 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, 1386 EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, 1387 EMSGSIZE, EHOSTUNREACH, 0, 0, 1388 0, 0, 0, 0, 1389 ENOPROTOOPT 1390 }; 1391 1392 /* 1393 * Forward a packet. If some error occurs return the sender 1394 * an icmp packet. Note we can't always generate a meaningful 1395 * icmp message because icmp doesn't have a large enough repertoire 1396 * of codes and types. 1397 * 1398 * If not forwarding, just drop the packet. This could be confusing 1399 * if ipforwarding was zero but some routing protocol was advancing 1400 * us as a gateway to somewhere. However, we must let the routing 1401 * protocol deal with that. 1402 * 1403 * The srcrt parameter indicates whether the packet is being forwarded 1404 * via a source route. 1405 */ 1406 void 1407 ip_forward(struct mbuf *m, struct ifnet *ifp, struct rtentry *rt, int srcrt) 1408 { 1409 struct mbuf mfake, *mcopy = NULL; 1410 struct ip *ip = mtod(m, struct ip *); 1411 struct sockaddr_in *sin; 1412 struct route ro; 1413 int error, type = 0, code = 0, destmtu = 0, fake = 0, len; 1414 u_int32_t dest; 1415 1416 dest = 0; 1417 if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) { 1418 ipstat_inc(ips_cantforward); 1419 m_freem(m); 1420 goto freecopy; 1421 } 1422 if (ip->ip_ttl <= IPTTLDEC) { 1423 icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, dest, 0); 1424 goto freecopy; 1425 } 1426 1427 memset(&ro, 0, sizeof(ro)); 1428 sin = satosin(&ro.ro_dst); 1429 sin->sin_family = AF_INET; 1430 sin->sin_len = sizeof(*sin); 1431 sin->sin_addr = ip->ip_dst; 1432 1433 if (!rtisvalid(rt)) { 1434 rtfree(rt); 1435 rt = rtalloc_mpath(sintosa(sin), &ip->ip_src.s_addr, 1436 m->m_pkthdr.ph_rtableid); 1437 if (rt == NULL) { 1438 ipstat_inc(ips_noroute); 1439 icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest, 0); 1440 return; 1441 } 1442 } 1443 1444 /* 1445 * Save at most 68 bytes of the packet in case 1446 * we need to generate an ICMP message to the src. 1447 * The data is saved in the mbuf on the stack that 1448 * acts as a temporary storage not intended to be 1449 * passed down the IP stack or to the mfree. 1450 */ 1451 memset(&mfake.m_hdr, 0, sizeof(mfake.m_hdr)); 1452 mfake.m_type = m->m_type; 1453 if (m_dup_pkthdr(&mfake, m, M_DONTWAIT) == 0) { 1454 mfake.m_data = mfake.m_pktdat; 1455 len = min(ntohs(ip->ip_len), 68); 1456 m_copydata(m, 0, len, mfake.m_pktdat); 1457 mfake.m_pkthdr.len = mfake.m_len = len; 1458 #if NPF > 0 1459 pf_pkt_addr_changed(&mfake); 1460 #endif /* NPF > 0 */ 1461 fake = 1; 1462 } 1463 1464 ip->ip_ttl -= IPTTLDEC; 1465 1466 /* 1467 * If forwarding packet using same interface that it came in on, 1468 * perhaps should send a redirect to sender to shortcut a hop. 1469 * Only send redirect if source is sending directly to us, 1470 * and if packet was not source routed (or has any options). 1471 * Also, don't send redirect if forwarding using a default route 1472 * or a route modified by a redirect. 1473 * Don't send redirect if we advertise destination's arp address 1474 * as ours (proxy arp). 1475 */ 1476 if ((rt->rt_ifidx == ifp->if_index) && 1477 (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 && 1478 satosin(rt_key(rt))->sin_addr.s_addr != 0 && 1479 ipsendredirects && !srcrt && 1480 !arpproxy(satosin(rt_key(rt))->sin_addr, m->m_pkthdr.ph_rtableid)) { 1481 if ((ip->ip_src.s_addr & ifatoia(rt->rt_ifa)->ia_netmask) == 1482 ifatoia(rt->rt_ifa)->ia_net) { 1483 if (rt->rt_flags & RTF_GATEWAY) 1484 dest = satosin(rt->rt_gateway)->sin_addr.s_addr; 1485 else 1486 dest = ip->ip_dst.s_addr; 1487 /* Router requirements says to only send host redirects */ 1488 type = ICMP_REDIRECT; 1489 code = ICMP_REDIRECT_HOST; 1490 } 1491 } 1492 1493 ro.ro_rt = rt; 1494 ro.ro_tableid = m->m_pkthdr.ph_rtableid; 1495 error = ip_output(m, NULL, &ro, 1496 (IP_FORWARDING | (ip_directedbcast ? IP_ALLOWBROADCAST : 0)), 1497 NULL, NULL, 0); 1498 rt = ro.ro_rt; 1499 if (error) 1500 ipstat_inc(ips_cantforward); 1501 else { 1502 ipstat_inc(ips_forward); 1503 if (type) 1504 ipstat_inc(ips_redirectsent); 1505 else 1506 goto freecopy; 1507 } 1508 if (!fake) 1509 goto freecopy; 1510 1511 switch (error) { 1512 1513 case 0: /* forwarded, but need redirect */ 1514 /* type, code set above */ 1515 break; 1516 1517 case ENETUNREACH: /* shouldn't happen, checked above */ 1518 case EHOSTUNREACH: 1519 case ENETDOWN: 1520 case EHOSTDOWN: 1521 default: 1522 type = ICMP_UNREACH; 1523 code = ICMP_UNREACH_HOST; 1524 break; 1525 1526 case EMSGSIZE: 1527 type = ICMP_UNREACH; 1528 code = ICMP_UNREACH_NEEDFRAG; 1529 1530 #ifdef IPSEC 1531 if (rt != NULL) { 1532 if (rt->rt_mtu) 1533 destmtu = rt->rt_mtu; 1534 else { 1535 struct ifnet *destifp; 1536 1537 destifp = if_get(rt->rt_ifidx); 1538 if (destifp != NULL) 1539 destmtu = destifp->if_mtu; 1540 if_put(destifp); 1541 } 1542 } 1543 #endif /*IPSEC*/ 1544 ipstat_inc(ips_cantfrag); 1545 break; 1546 1547 case EACCES: 1548 /* 1549 * pf(4) blocked the packet. There is no need to send an ICMP 1550 * packet back since pf(4) takes care of it. 1551 */ 1552 goto freecopy; 1553 case ENOBUFS: 1554 /* 1555 * a router should not generate ICMP_SOURCEQUENCH as 1556 * required in RFC1812 Requirements for IP Version 4 Routers. 1557 * source quench could be a big problem under DoS attacks, 1558 * or the underlying interface is rate-limited. 1559 */ 1560 goto freecopy; 1561 } 1562 1563 mcopy = m_copym(&mfake, 0, len, M_DONTWAIT); 1564 if (mcopy) 1565 icmp_error(mcopy, type, code, dest, destmtu); 1566 1567 freecopy: 1568 if (fake) 1569 m_tag_delete_chain(&mfake); 1570 rtfree(rt); 1571 } 1572 1573 int 1574 ip_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, 1575 size_t newlen) 1576 { 1577 int error; 1578 #ifdef MROUTING 1579 extern struct mrtstat mrtstat; 1580 #endif 1581 1582 /* Almost all sysctl names at this level are terminal. */ 1583 if (namelen != 1 && name[0] != IPCTL_IFQUEUE && 1584 name[0] != IPCTL_ARPQUEUE) 1585 return (ENOTDIR); 1586 1587 switch (name[0]) { 1588 case IPCTL_SOURCEROUTE: 1589 /* 1590 * Don't allow this to change in a secure environment. 1591 */ 1592 if (newp && securelevel > 0) 1593 return (EPERM); 1594 NET_LOCK(); 1595 error = sysctl_int(oldp, oldlenp, newp, newlen, 1596 &ip_dosourceroute); 1597 NET_UNLOCK(); 1598 return (error); 1599 case IPCTL_MTUDISC: 1600 NET_LOCK(); 1601 error = sysctl_int(oldp, oldlenp, newp, newlen, 1602 &ip_mtudisc); 1603 if (ip_mtudisc != 0 && ip_mtudisc_timeout_q == NULL) { 1604 ip_mtudisc_timeout_q = 1605 rt_timer_queue_create(ip_mtudisc_timeout); 1606 } else if (ip_mtudisc == 0 && ip_mtudisc_timeout_q != NULL) { 1607 rt_timer_queue_destroy(ip_mtudisc_timeout_q); 1608 ip_mtudisc_timeout_q = NULL; 1609 } 1610 NET_UNLOCK(); 1611 return error; 1612 case IPCTL_MTUDISCTIMEOUT: 1613 NET_LOCK(); 1614 error = sysctl_int(oldp, oldlenp, newp, newlen, 1615 &ip_mtudisc_timeout); 1616 if (ip_mtudisc_timeout_q != NULL) 1617 rt_timer_queue_change(ip_mtudisc_timeout_q, 1618 ip_mtudisc_timeout); 1619 NET_UNLOCK(); 1620 return (error); 1621 #ifdef IPSEC 1622 case IPCTL_ENCDEBUG: 1623 case IPCTL_IPSEC_STATS: 1624 case IPCTL_IPSEC_EXPIRE_ACQUIRE: 1625 case IPCTL_IPSEC_EMBRYONIC_SA_TIMEOUT: 1626 case IPCTL_IPSEC_REQUIRE_PFS: 1627 case IPCTL_IPSEC_SOFT_ALLOCATIONS: 1628 case IPCTL_IPSEC_ALLOCATIONS: 1629 case IPCTL_IPSEC_SOFT_BYTES: 1630 case IPCTL_IPSEC_BYTES: 1631 case IPCTL_IPSEC_TIMEOUT: 1632 case IPCTL_IPSEC_SOFT_TIMEOUT: 1633 case IPCTL_IPSEC_SOFT_FIRSTUSE: 1634 case IPCTL_IPSEC_FIRSTUSE: 1635 case IPCTL_IPSEC_ENC_ALGORITHM: 1636 case IPCTL_IPSEC_AUTH_ALGORITHM: 1637 case IPCTL_IPSEC_IPCOMP_ALGORITHM: 1638 return (ipsec_sysctl(name, namelen, oldp, oldlenp, newp, 1639 newlen)); 1640 #endif 1641 case IPCTL_IFQUEUE: 1642 return (EOPNOTSUPP); 1643 case IPCTL_ARPQUEUE: 1644 return (sysctl_niq(name + 1, namelen - 1, 1645 oldp, oldlenp, newp, newlen, &arpinq)); 1646 case IPCTL_ARPQUEUED: 1647 return (sysctl_rdint(oldp, oldlenp, newp, la_hold_total)); 1648 case IPCTL_STATS: 1649 return (ip_sysctl_ipstat(oldp, oldlenp, newp)); 1650 #ifdef MROUTING 1651 case IPCTL_MRTSTATS: 1652 return (sysctl_rdstruct(oldp, oldlenp, newp, 1653 &mrtstat, sizeof(mrtstat))); 1654 case IPCTL_MRTMFC: 1655 if (newp) 1656 return (EPERM); 1657 NET_LOCK(); 1658 error = mrt_sysctl_mfc(oldp, oldlenp); 1659 NET_UNLOCK(); 1660 return (error); 1661 case IPCTL_MRTVIF: 1662 if (newp) 1663 return (EPERM); 1664 NET_LOCK(); 1665 error = mrt_sysctl_vif(oldp, oldlenp); 1666 NET_UNLOCK(); 1667 return (error); 1668 #else 1669 case IPCTL_MRTPROTO: 1670 case IPCTL_MRTSTATS: 1671 case IPCTL_MRTMFC: 1672 case IPCTL_MRTVIF: 1673 return (EOPNOTSUPP); 1674 #endif 1675 default: 1676 NET_LOCK(); 1677 error = sysctl_bounded_arr(ipctl_vars, nitems(ipctl_vars), 1678 name, namelen, oldp, oldlenp, newp, newlen); 1679 NET_UNLOCK(); 1680 return (error); 1681 } 1682 /* NOTREACHED */ 1683 } 1684 1685 int 1686 ip_sysctl_ipstat(void *oldp, size_t *oldlenp, void *newp) 1687 { 1688 uint64_t counters[ips_ncounters]; 1689 struct ipstat ipstat; 1690 u_long *words = (u_long *)&ipstat; 1691 int i; 1692 1693 CTASSERT(sizeof(ipstat) == (nitems(counters) * sizeof(u_long))); 1694 memset(&ipstat, 0, sizeof ipstat); 1695 counters_read(ipcounters, counters, nitems(counters)); 1696 1697 for (i = 0; i < nitems(counters); i++) 1698 words[i] = (u_long)counters[i]; 1699 1700 return (sysctl_rdstruct(oldp, oldlenp, newp, &ipstat, sizeof(ipstat))); 1701 } 1702 1703 void 1704 ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip, 1705 struct mbuf *m) 1706 { 1707 if (inp->inp_socket->so_options & SO_TIMESTAMP) { 1708 struct timeval tv; 1709 1710 m_microtime(m, &tv); 1711 *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv), 1712 SCM_TIMESTAMP, SOL_SOCKET); 1713 if (*mp) 1714 mp = &(*mp)->m_next; 1715 } 1716 1717 if (inp->inp_flags & INP_RECVDSTADDR) { 1718 *mp = sbcreatecontrol((caddr_t) &ip->ip_dst, 1719 sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP); 1720 if (*mp) 1721 mp = &(*mp)->m_next; 1722 } 1723 #ifdef notyet 1724 /* this code is broken and will probably never be fixed. */ 1725 /* options were tossed already */ 1726 if (inp->inp_flags & INP_RECVOPTS) { 1727 *mp = sbcreatecontrol((caddr_t) opts_deleted_above, 1728 sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP); 1729 if (*mp) 1730 mp = &(*mp)->m_next; 1731 } 1732 /* ip_srcroute doesn't do what we want here, need to fix */ 1733 if (inp->inp_flags & INP_RECVRETOPTS) { 1734 *mp = sbcreatecontrol((caddr_t) ip_srcroute(m), 1735 sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP); 1736 if (*mp) 1737 mp = &(*mp)->m_next; 1738 } 1739 #endif 1740 if (inp->inp_flags & INP_RECVIF) { 1741 struct sockaddr_dl sdl; 1742 struct ifnet *ifp; 1743 1744 ifp = if_get(m->m_pkthdr.ph_ifidx); 1745 if (ifp == NULL || ifp->if_sadl == NULL) { 1746 memset(&sdl, 0, sizeof(sdl)); 1747 sdl.sdl_len = offsetof(struct sockaddr_dl, sdl_data[0]); 1748 sdl.sdl_family = AF_LINK; 1749 sdl.sdl_index = ifp != NULL ? ifp->if_index : 0; 1750 sdl.sdl_nlen = sdl.sdl_alen = sdl.sdl_slen = 0; 1751 *mp = sbcreatecontrol((caddr_t) &sdl, sdl.sdl_len, 1752 IP_RECVIF, IPPROTO_IP); 1753 } else { 1754 *mp = sbcreatecontrol((caddr_t) ifp->if_sadl, 1755 ifp->if_sadl->sdl_len, IP_RECVIF, IPPROTO_IP); 1756 } 1757 if (*mp) 1758 mp = &(*mp)->m_next; 1759 if_put(ifp); 1760 } 1761 if (inp->inp_flags & INP_RECVTTL) { 1762 *mp = sbcreatecontrol((caddr_t) &ip->ip_ttl, 1763 sizeof(u_int8_t), IP_RECVTTL, IPPROTO_IP); 1764 if (*mp) 1765 mp = &(*mp)->m_next; 1766 } 1767 if (inp->inp_flags & INP_RECVRTABLE) { 1768 u_int rtableid = inp->inp_rtableid; 1769 1770 #if NPF > 0 1771 if (m && m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) { 1772 struct pf_divert *divert; 1773 1774 divert = pf_find_divert(m); 1775 KASSERT(divert != NULL); 1776 rtableid = divert->rdomain; 1777 } 1778 #endif 1779 1780 *mp = sbcreatecontrol((caddr_t) &rtableid, 1781 sizeof(u_int), IP_RECVRTABLE, IPPROTO_IP); 1782 if (*mp) 1783 mp = &(*mp)->m_next; 1784 } 1785 } 1786 1787 void 1788 ip_send_do_dispatch(void *xmq, int flags) 1789 { 1790 struct mbuf_queue *mq = xmq; 1791 struct mbuf *m; 1792 struct mbuf_list ml; 1793 struct m_tag *mtag; 1794 u_int32_t ipsecflowinfo = 0; 1795 1796 mq_delist(mq, &ml); 1797 if (ml_empty(&ml)) 1798 return; 1799 1800 NET_LOCK(); 1801 while ((m = ml_dequeue(&ml)) != NULL) { 1802 if ((mtag = m_tag_find(m, PACKET_TAG_IPSEC_FLOWINFO, NULL)) 1803 != NULL) { 1804 ipsecflowinfo = *(u_int32_t *)(mtag + 1); 1805 m_tag_delete(m, mtag); 1806 } 1807 ip_output(m, NULL, NULL, flags, NULL, NULL, ipsecflowinfo); 1808 } 1809 NET_UNLOCK(); 1810 } 1811 1812 void 1813 ip_sendraw_dispatch(void *xmq) 1814 { 1815 ip_send_do_dispatch(xmq, IP_RAWOUTPUT); 1816 } 1817 1818 void 1819 ip_send_dispatch(void *xmq) 1820 { 1821 ip_send_do_dispatch(xmq, 0); 1822 } 1823 1824 void 1825 ip_send(struct mbuf *m) 1826 { 1827 mq_enqueue(&ipsend_mq, m); 1828 task_add(net_tq(0), &ipsend_task); 1829 } 1830 1831 void 1832 ip_send_raw(struct mbuf *m) 1833 { 1834 mq_enqueue(&ipsendraw_mq, m); 1835 task_add(net_tq(0), &ipsendraw_task); 1836 } 1837