1 /* $OpenBSD: ip_input.c,v 1.343 2019/06/10 23:48:21 dlg Exp $ */ 2 /* $NetBSD: ip_input.c,v 1.30 1996/03/16 23:53:58 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 33 */ 34 35 #include "pf.h" 36 #include "carp.h" 37 38 #include <sys/param.h> 39 #include <sys/systm.h> 40 #include <sys/mbuf.h> 41 #include <sys/domain.h> 42 #include <sys/mutex.h> 43 #include <sys/protosw.h> 44 #include <sys/socket.h> 45 #include <sys/socketvar.h> 46 #include <sys/sysctl.h> 47 #include <sys/pool.h> 48 #include <sys/task.h> 49 50 #include <net/if.h> 51 #include <net/if_var.h> 52 #include <net/if_dl.h> 53 #include <net/route.h> 54 #include <net/netisr.h> 55 56 #include <netinet/in.h> 57 #include <netinet/in_systm.h> 58 #include <netinet/if_ether.h> 59 #include <netinet/ip.h> 60 #include <netinet/in_pcb.h> 61 #include <netinet/in_var.h> 62 #include <netinet/ip_var.h> 63 #include <netinet/ip_icmp.h> 64 65 #ifdef INET6 66 #include <netinet6/ip6protosw.h> 67 #include <netinet6/ip6_var.h> 68 #endif 69 70 #if NPF > 0 71 #include <net/pfvar.h> 72 #endif 73 74 #ifdef MROUTING 75 #include <netinet/ip_mroute.h> 76 #endif 77 78 #ifdef IPSEC 79 #include <netinet/ip_ipsp.h> 80 #endif /* IPSEC */ 81 82 #if NCARP > 0 83 #include <net/if_types.h> 84 #include <netinet/ip_carp.h> 85 #endif 86 87 /* values controllable via sysctl */ 88 int ipforwarding = 0; 89 int ipmforwarding = 0; 90 int ipmultipath = 0; 91 int ipsendredirects = 1; 92 int ip_dosourceroute = 0; 93 int ip_defttl = IPDEFTTL; 94 int ip_mtudisc = 1; 95 u_int ip_mtudisc_timeout = IPMTUDISCTIMEOUT; 96 int ip_directedbcast = 0; 97 98 struct rttimer_queue *ip_mtudisc_timeout_q = NULL; 99 100 /* Protects `ipq' and `ip_frags'. */ 101 struct mutex ipq_mutex = MUTEX_INITIALIZER(IPL_SOFTNET); 102 103 /* IP reassembly queue */ 104 LIST_HEAD(, ipq) ipq; 105 106 /* Keep track of memory used for reassembly */ 107 int ip_maxqueue = 300; 108 int ip_frags = 0; 109 110 int *ipctl_vars[IPCTL_MAXID] = IPCTL_VARS; 111 112 struct niqueue ipintrq = NIQUEUE_INITIALIZER(IPQ_MAXLEN, NETISR_IP); 113 114 struct pool ipqent_pool; 115 struct pool ipq_pool; 116 117 struct cpumem *ipcounters; 118 119 int ip_sysctl_ipstat(void *, size_t *, void *); 120 121 static struct mbuf_queue ipsend_mq; 122 123 extern struct niqueue arpinq; 124 125 int ip_ours(struct mbuf **, int *, int, int); 126 int ip_local(struct mbuf **, int *, int, int); 127 int ip_dooptions(struct mbuf *, struct ifnet *); 128 int in_ouraddr(struct mbuf *, struct ifnet *, struct rtentry **); 129 130 static void ip_send_dispatch(void *); 131 static struct task ipsend_task = TASK_INITIALIZER(ip_send_dispatch, &ipsend_mq); 132 /* 133 * Used to save the IP options in case a protocol wants to respond 134 * to an incoming packet over the same route if the packet got here 135 * using IP source routing. This allows connection establishment and 136 * maintenance when the remote end is on a network that is not known 137 * to us. 138 */ 139 struct ip_srcrt { 140 int isr_nhops; /* number of hops */ 141 struct in_addr isr_dst; /* final destination */ 142 char isr_nop; /* one NOP to align */ 143 char isr_hdr[IPOPT_OFFSET + 1]; /* OPTVAL, OLEN & OFFSET */ 144 struct in_addr isr_routes[MAX_IPOPTLEN/sizeof(struct in_addr)]; 145 }; 146 147 void save_rte(struct mbuf *, u_char *, struct in_addr); 148 149 /* 150 * IP initialization: fill in IP protocol switch table. 151 * All protocols not implemented in kernel go to raw IP protocol handler. 152 */ 153 void 154 ip_init(void) 155 { 156 const struct protosw *pr; 157 int i; 158 const u_int16_t defbaddynamicports_tcp[] = DEFBADDYNAMICPORTS_TCP; 159 const u_int16_t defbaddynamicports_udp[] = DEFBADDYNAMICPORTS_UDP; 160 const u_int16_t defrootonlyports_tcp[] = DEFROOTONLYPORTS_TCP; 161 const u_int16_t defrootonlyports_udp[] = DEFROOTONLYPORTS_UDP; 162 163 ipcounters = counters_alloc(ips_ncounters); 164 165 pool_init(&ipqent_pool, sizeof(struct ipqent), 0, 166 IPL_SOFTNET, 0, "ipqe", NULL); 167 pool_init(&ipq_pool, sizeof(struct ipq), 0, 168 IPL_SOFTNET, 0, "ipq", NULL); 169 170 pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); 171 if (pr == NULL) 172 panic("ip_init"); 173 for (i = 0; i < IPPROTO_MAX; i++) 174 ip_protox[i] = pr - inetsw; 175 for (pr = inetdomain.dom_protosw; 176 pr < inetdomain.dom_protoswNPROTOSW; pr++) 177 if (pr->pr_domain->dom_family == PF_INET && 178 pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW && 179 pr->pr_protocol < IPPROTO_MAX) 180 ip_protox[pr->pr_protocol] = pr - inetsw; 181 LIST_INIT(&ipq); 182 if (ip_mtudisc != 0) 183 ip_mtudisc_timeout_q = 184 rt_timer_queue_create(ip_mtudisc_timeout); 185 186 /* Fill in list of ports not to allocate dynamically. */ 187 memset(&baddynamicports, 0, sizeof(baddynamicports)); 188 for (i = 0; defbaddynamicports_tcp[i] != 0; i++) 189 DP_SET(baddynamicports.tcp, defbaddynamicports_tcp[i]); 190 for (i = 0; defbaddynamicports_udp[i] != 0; i++) 191 DP_SET(baddynamicports.udp, defbaddynamicports_udp[i]); 192 193 /* Fill in list of ports only root can bind to. */ 194 memset(&rootonlyports, 0, sizeof(rootonlyports)); 195 for (i = 0; defrootonlyports_tcp[i] != 0; i++) 196 DP_SET(rootonlyports.tcp, defrootonlyports_tcp[i]); 197 for (i = 0; defrootonlyports_udp[i] != 0; i++) 198 DP_SET(rootonlyports.udp, defrootonlyports_udp[i]); 199 200 mq_init(&ipsend_mq, 64, IPL_SOFTNET); 201 202 #ifdef IPSEC 203 ipsec_init(); 204 #endif 205 } 206 207 /* 208 * Enqueue packet for local delivery. Queuing is used as a boundary 209 * between the network layer (input/forward path) running without 210 * KERNEL_LOCK() and the transport layer still needing it. 211 */ 212 int 213 ip_ours(struct mbuf **mp, int *offp, int nxt, int af) 214 { 215 /* We are already in a IPv4/IPv6 local deliver loop. */ 216 if (af != AF_UNSPEC) 217 return ip_local(mp, offp, nxt, af); 218 219 niq_enqueue(&ipintrq, *mp); 220 *mp = NULL; 221 return IPPROTO_DONE; 222 } 223 224 /* 225 * Dequeue and process locally delivered packets. 226 */ 227 void 228 ipintr(void) 229 { 230 struct mbuf *m; 231 int off, nxt; 232 233 while ((m = niq_dequeue(&ipintrq)) != NULL) { 234 #ifdef DIAGNOSTIC 235 if ((m->m_flags & M_PKTHDR) == 0) 236 panic("ipintr no HDR"); 237 #endif 238 off = 0; 239 nxt = ip_local(&m, &off, IPPROTO_IPV4, AF_UNSPEC); 240 KASSERT(nxt == IPPROTO_DONE); 241 } 242 } 243 244 /* 245 * IPv4 input routine. 246 * 247 * Checksum and byte swap header. Process options. Forward or deliver. 248 */ 249 void 250 ipv4_input(struct ifnet *ifp, struct mbuf *m) 251 { 252 int off, nxt; 253 254 off = 0; 255 nxt = ip_input_if(&m, &off, IPPROTO_IPV4, AF_UNSPEC, ifp); 256 KASSERT(nxt == IPPROTO_DONE); 257 } 258 259 int 260 ip_input_if(struct mbuf **mp, int *offp, int nxt, int af, struct ifnet *ifp) 261 { 262 struct mbuf *m = *mp; 263 struct rtentry *rt = NULL; 264 struct ip *ip; 265 int hlen, len; 266 in_addr_t pfrdr = 0; 267 268 KASSERT(*offp == 0); 269 270 ipstat_inc(ips_total); 271 if (m->m_len < sizeof (struct ip) && 272 (m = *mp = m_pullup(m, sizeof (struct ip))) == NULL) { 273 ipstat_inc(ips_toosmall); 274 goto bad; 275 } 276 ip = mtod(m, struct ip *); 277 if (ip->ip_v != IPVERSION) { 278 ipstat_inc(ips_badvers); 279 goto bad; 280 } 281 hlen = ip->ip_hl << 2; 282 if (hlen < sizeof(struct ip)) { /* minimum header length */ 283 ipstat_inc(ips_badhlen); 284 goto bad; 285 } 286 if (hlen > m->m_len) { 287 if ((m = *mp = m_pullup(m, hlen)) == NULL) { 288 ipstat_inc(ips_badhlen); 289 goto bad; 290 } 291 ip = mtod(m, struct ip *); 292 } 293 294 /* 127/8 must not appear on wire - RFC1122 */ 295 if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || 296 (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { 297 if ((ifp->if_flags & IFF_LOOPBACK) == 0) { 298 ipstat_inc(ips_badaddr); 299 goto bad; 300 } 301 } 302 303 if ((m->m_pkthdr.csum_flags & M_IPV4_CSUM_IN_OK) == 0) { 304 if (m->m_pkthdr.csum_flags & M_IPV4_CSUM_IN_BAD) { 305 ipstat_inc(ips_badsum); 306 goto bad; 307 } 308 309 ipstat_inc(ips_inswcsum); 310 if (in_cksum(m, hlen) != 0) { 311 ipstat_inc(ips_badsum); 312 goto bad; 313 } 314 } 315 316 /* Retrieve the packet length. */ 317 len = ntohs(ip->ip_len); 318 319 /* 320 * Convert fields to host representation. 321 */ 322 if (len < hlen) { 323 ipstat_inc(ips_badlen); 324 goto bad; 325 } 326 327 /* 328 * Check that the amount of data in the buffers 329 * is at least as much as the IP header would have us expect. 330 * Trim mbufs if longer than we expect. 331 * Drop packet if shorter than we expect. 332 */ 333 if (m->m_pkthdr.len < len) { 334 ipstat_inc(ips_tooshort); 335 goto bad; 336 } 337 if (m->m_pkthdr.len > len) { 338 if (m->m_len == m->m_pkthdr.len) { 339 m->m_len = len; 340 m->m_pkthdr.len = len; 341 } else 342 m_adj(m, len - m->m_pkthdr.len); 343 } 344 345 #if NCARP > 0 346 if (carp_lsdrop(ifp, m, AF_INET, &ip->ip_src.s_addr, 347 &ip->ip_dst.s_addr, (ip->ip_p == IPPROTO_ICMP ? 0 : 1))) 348 goto bad; 349 #endif 350 351 #if NPF > 0 352 /* 353 * Packet filter 354 */ 355 pfrdr = ip->ip_dst.s_addr; 356 if (pf_test(AF_INET, PF_IN, ifp, mp) != PF_PASS) 357 goto bad; 358 m = *mp; 359 if (m == NULL) 360 goto bad; 361 362 ip = mtod(m, struct ip *); 363 hlen = ip->ip_hl << 2; 364 pfrdr = (pfrdr != ip->ip_dst.s_addr); 365 #endif 366 367 /* 368 * Process options and, if not destined for us, 369 * ship it on. ip_dooptions returns 1 when an 370 * error was detected (causing an icmp message 371 * to be sent and the original packet to be freed). 372 */ 373 if (hlen > sizeof (struct ip) && ip_dooptions(m, ifp)) { 374 m = *mp = NULL; 375 goto bad; 376 } 377 378 if (ip->ip_dst.s_addr == INADDR_BROADCAST || 379 ip->ip_dst.s_addr == INADDR_ANY) { 380 nxt = ip_ours(mp, offp, nxt, af); 381 goto out; 382 } 383 384 if (in_ouraddr(m, ifp, &rt)) { 385 nxt = ip_ours(mp, offp, nxt, af); 386 goto out; 387 } 388 389 if (IN_MULTICAST(ip->ip_dst.s_addr)) { 390 /* 391 * Make sure M_MCAST is set. It should theoretically 392 * already be there, but let's play safe because upper 393 * layers check for this flag. 394 */ 395 m->m_flags |= M_MCAST; 396 397 #ifdef MROUTING 398 if (ipmforwarding && ip_mrouter[ifp->if_rdomain]) { 399 int error; 400 401 if (m->m_flags & M_EXT) { 402 if ((m = *mp = m_pullup(m, hlen)) == NULL) { 403 ipstat_inc(ips_toosmall); 404 goto bad; 405 } 406 ip = mtod(m, struct ip *); 407 } 408 /* 409 * If we are acting as a multicast router, all 410 * incoming multicast packets are passed to the 411 * kernel-level multicast forwarding function. 412 * The packet is returned (relatively) intact; if 413 * ip_mforward() returns a non-zero value, the packet 414 * must be discarded, else it may be accepted below. 415 * 416 * (The IP ident field is put in the same byte order 417 * as expected when ip_mforward() is called from 418 * ip_output().) 419 */ 420 KERNEL_LOCK(); 421 error = ip_mforward(m, ifp); 422 KERNEL_UNLOCK(); 423 if (error) { 424 ipstat_inc(ips_cantforward); 425 goto bad; 426 } 427 428 /* 429 * The process-level routing daemon needs to receive 430 * all multicast IGMP packets, whether or not this 431 * host belongs to their destination groups. 432 */ 433 if (ip->ip_p == IPPROTO_IGMP) { 434 nxt = ip_ours(mp, offp, nxt, af); 435 goto out; 436 } 437 ipstat_inc(ips_forward); 438 } 439 #endif 440 /* 441 * See if we belong to the destination multicast group on the 442 * arrival interface. 443 */ 444 if (!in_hasmulti(&ip->ip_dst, ifp)) { 445 ipstat_inc(ips_notmember); 446 if (!IN_LOCAL_GROUP(ip->ip_dst.s_addr)) 447 ipstat_inc(ips_cantforward); 448 goto bad; 449 } 450 nxt = ip_ours(mp, offp, nxt, af); 451 goto out; 452 } 453 454 #if NCARP > 0 455 if (ip->ip_p == IPPROTO_ICMP && 456 carp_lsdrop(ifp, m, AF_INET, &ip->ip_src.s_addr, 457 &ip->ip_dst.s_addr, 1)) 458 goto bad; 459 #endif 460 /* 461 * Not for us; forward if possible and desirable. 462 */ 463 if (ipforwarding == 0) { 464 ipstat_inc(ips_cantforward); 465 goto bad; 466 } 467 #ifdef IPSEC 468 if (ipsec_in_use) { 469 int rv; 470 471 rv = ipsec_forward_check(m, hlen, AF_INET); 472 if (rv != 0) { 473 ipstat_inc(ips_cantforward); 474 goto bad; 475 } 476 /* 477 * Fall through, forward packet. Outbound IPsec policy 478 * checking will occur in ip_output(). 479 */ 480 } 481 #endif /* IPSEC */ 482 483 ip_forward(m, ifp, rt, pfrdr); 484 *mp = NULL; 485 return IPPROTO_DONE; 486 bad: 487 nxt = IPPROTO_DONE; 488 m_freemp(mp); 489 out: 490 rtfree(rt); 491 return nxt; 492 } 493 494 /* 495 * IPv4 local-delivery routine. 496 * 497 * If fragmented try to reassemble. Pass to next level. 498 */ 499 int 500 ip_local(struct mbuf **mp, int *offp, int nxt, int af) 501 { 502 struct mbuf *m = *mp; 503 struct ip *ip = mtod(m, struct ip *); 504 struct ipq *fp; 505 struct ipqent *ipqe; 506 int mff, hlen; 507 508 hlen = ip->ip_hl << 2; 509 510 /* 511 * If offset or IP_MF are set, must reassemble. 512 * Otherwise, nothing need be done. 513 * (We could look in the reassembly queue to see 514 * if the packet was previously fragmented, 515 * but it's not worth the time; just let them time out.) 516 */ 517 if (ip->ip_off &~ htons(IP_DF | IP_RF)) { 518 if (m->m_flags & M_EXT) { /* XXX */ 519 if ((m = *mp = m_pullup(m, hlen)) == NULL) { 520 ipstat_inc(ips_toosmall); 521 return IPPROTO_DONE; 522 } 523 ip = mtod(m, struct ip *); 524 } 525 526 mtx_enter(&ipq_mutex); 527 528 /* 529 * Look for queue of fragments 530 * of this datagram. 531 */ 532 LIST_FOREACH(fp, &ipq, ipq_q) { 533 if (ip->ip_id == fp->ipq_id && 534 ip->ip_src.s_addr == fp->ipq_src.s_addr && 535 ip->ip_dst.s_addr == fp->ipq_dst.s_addr && 536 ip->ip_p == fp->ipq_p) 537 break; 538 } 539 540 /* 541 * Adjust ip_len to not reflect header, 542 * set ipqe_mff if more fragments are expected, 543 * convert offset of this to bytes. 544 */ 545 ip->ip_len = htons(ntohs(ip->ip_len) - hlen); 546 mff = (ip->ip_off & htons(IP_MF)) != 0; 547 if (mff) { 548 /* 549 * Make sure that fragments have a data length 550 * that's a non-zero multiple of 8 bytes. 551 */ 552 if (ntohs(ip->ip_len) == 0 || 553 (ntohs(ip->ip_len) & 0x7) != 0) { 554 ipstat_inc(ips_badfrags); 555 goto bad; 556 } 557 } 558 ip->ip_off = htons(ntohs(ip->ip_off) << 3); 559 560 /* 561 * If datagram marked as having more fragments 562 * or if this is not the first fragment, 563 * attempt reassembly; if it succeeds, proceed. 564 */ 565 if (mff || ip->ip_off) { 566 ipstat_inc(ips_fragments); 567 if (ip_frags + 1 > ip_maxqueue) { 568 ip_flush(); 569 ipstat_inc(ips_rcvmemdrop); 570 goto bad; 571 } 572 573 ipqe = pool_get(&ipqent_pool, PR_NOWAIT); 574 if (ipqe == NULL) { 575 ipstat_inc(ips_rcvmemdrop); 576 goto bad; 577 } 578 ip_frags++; 579 ipqe->ipqe_mff = mff; 580 ipqe->ipqe_m = m; 581 ipqe->ipqe_ip = ip; 582 m = *mp = ip_reass(ipqe, fp); 583 if (m == NULL) 584 goto bad; 585 ipstat_inc(ips_reassembled); 586 ip = mtod(m, struct ip *); 587 hlen = ip->ip_hl << 2; 588 ip->ip_len = htons(ntohs(ip->ip_len) + hlen); 589 } else 590 if (fp) 591 ip_freef(fp); 592 593 mtx_leave(&ipq_mutex); 594 } 595 596 *offp = hlen; 597 nxt = ip->ip_p; 598 /* Check wheter we are already in a IPv4/IPv6 local deliver loop. */ 599 if (af == AF_UNSPEC) 600 nxt = ip_deliver(mp, offp, nxt, AF_INET); 601 return nxt; 602 bad: 603 mtx_leave(&ipq_mutex); 604 m_freemp(mp); 605 return IPPROTO_DONE; 606 } 607 608 #ifndef INET6 609 #define IPSTAT_INC(name) ipstat_inc(ips_##name) 610 #else 611 #define IPSTAT_INC(name) (af == AF_INET ? \ 612 ipstat_inc(ips_##name) : ip6stat_inc(ip6s_##name)) 613 #endif 614 615 int 616 ip_deliver(struct mbuf **mp, int *offp, int nxt, int af) 617 { 618 const struct protosw *psw; 619 int naf = af; 620 #ifdef INET6 621 int nest = 0; 622 #endif /* INET6 */ 623 624 /* pf might have modified stuff, might have to chksum */ 625 switch (af) { 626 case AF_INET: 627 in_proto_cksum_out(*mp, NULL); 628 break; 629 #ifdef INET6 630 case AF_INET6: 631 in6_proto_cksum_out(*mp, NULL); 632 break; 633 #endif /* INET6 */ 634 } 635 636 /* 637 * Tell launch routine the next header 638 */ 639 IPSTAT_INC(delivered); 640 641 while (nxt != IPPROTO_DONE) { 642 #ifdef INET6 643 if (af == AF_INET6 && 644 ip6_hdrnestlimit && (++nest > ip6_hdrnestlimit)) { 645 ip6stat_inc(ip6s_toomanyhdr); 646 goto bad; 647 } 648 #endif /* INET6 */ 649 650 /* 651 * protection against faulty packet - there should be 652 * more sanity checks in header chain processing. 653 */ 654 if ((*mp)->m_pkthdr.len < *offp) { 655 IPSTAT_INC(tooshort); 656 goto bad; 657 } 658 659 #ifdef INET6 660 /* draft-itojun-ipv6-tcp-to-anycast */ 661 if (af == AF_INET6 && 662 ISSET((*mp)->m_flags, M_ACAST) && (nxt == IPPROTO_TCP)) { 663 if ((*mp)->m_len >= sizeof(struct ip6_hdr)) { 664 icmp6_error(*mp, ICMP6_DST_UNREACH, 665 ICMP6_DST_UNREACH_ADDR, 666 offsetof(struct ip6_hdr, ip6_dst)); 667 *mp = NULL; 668 } 669 goto bad; 670 } 671 #endif /* INET6 */ 672 673 #ifdef IPSEC 674 if (ipsec_in_use) { 675 if (ipsec_local_check(*mp, *offp, nxt, af) != 0) { 676 IPSTAT_INC(cantforward); 677 goto bad; 678 } 679 } 680 /* Otherwise, just fall through and deliver the packet */ 681 #endif /* IPSEC */ 682 683 switch (nxt) { 684 case IPPROTO_IPV4: 685 naf = AF_INET; 686 ipstat_inc(ips_delivered); 687 break; 688 #ifdef INET6 689 case IPPROTO_IPV6: 690 naf = AF_INET6; 691 ip6stat_inc(ip6s_delivered); 692 break; 693 #endif /* INET6 */ 694 } 695 switch (af) { 696 case AF_INET: 697 psw = &inetsw[ip_protox[nxt]]; 698 break; 699 #ifdef INET6 700 case AF_INET6: 701 psw = &inet6sw[ip6_protox[nxt]]; 702 break; 703 #endif /* INET6 */ 704 } 705 nxt = (*psw->pr_input)(mp, offp, nxt, af); 706 af = naf; 707 } 708 return nxt; 709 bad: 710 m_freemp(mp); 711 return IPPROTO_DONE; 712 } 713 #undef IPSTAT_INC 714 715 int 716 in_ouraddr(struct mbuf *m, struct ifnet *ifp, struct rtentry **prt) 717 { 718 struct rtentry *rt; 719 struct ip *ip; 720 struct sockaddr_in sin; 721 int match = 0; 722 723 #if NPF > 0 724 switch (pf_ouraddr(m)) { 725 case 0: 726 return (0); 727 case 1: 728 return (1); 729 default: 730 /* pf does not know it */ 731 break; 732 } 733 #endif 734 735 ip = mtod(m, struct ip *); 736 737 memset(&sin, 0, sizeof(sin)); 738 sin.sin_len = sizeof(sin); 739 sin.sin_family = AF_INET; 740 sin.sin_addr = ip->ip_dst; 741 rt = rtalloc_mpath(sintosa(&sin), &ip->ip_src.s_addr, 742 m->m_pkthdr.ph_rtableid); 743 if (rtisvalid(rt)) { 744 if (ISSET(rt->rt_flags, RTF_LOCAL)) 745 match = 1; 746 747 /* 748 * If directedbcast is enabled we only consider it local 749 * if it is received on the interface with that address. 750 */ 751 if (ISSET(rt->rt_flags, RTF_BROADCAST) && 752 (!ip_directedbcast || rt->rt_ifidx == ifp->if_index)) { 753 match = 1; 754 755 /* Make sure M_BCAST is set */ 756 m->m_flags |= M_BCAST; 757 } 758 } 759 *prt = rt; 760 761 if (!match) { 762 struct ifaddr *ifa; 763 764 /* 765 * No local address or broadcast address found, so check for 766 * ancient classful broadcast addresses. 767 * It must have been broadcast on the link layer, and for an 768 * address on the interface it was received on. 769 */ 770 if (!ISSET(m->m_flags, M_BCAST) || 771 !IN_CLASSFULBROADCAST(ip->ip_dst.s_addr, ip->ip_dst.s_addr)) 772 return (0); 773 774 if (ifp->if_rdomain != rtable_l2(m->m_pkthdr.ph_rtableid)) 775 return (0); 776 /* 777 * The check in the loop assumes you only rx a packet on an UP 778 * interface, and that M_BCAST will only be set on a BROADCAST 779 * interface. 780 */ 781 NET_ASSERT_LOCKED(); 782 TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { 783 if (ifa->ifa_addr->sa_family != AF_INET) 784 continue; 785 786 if (IN_CLASSFULBROADCAST(ip->ip_dst.s_addr, 787 ifatoia(ifa)->ia_addr.sin_addr.s_addr)) { 788 match = 1; 789 break; 790 } 791 } 792 } 793 794 return (match); 795 } 796 797 /* 798 * Take incoming datagram fragment and try to 799 * reassemble it into whole datagram. If a chain for 800 * reassembly of this datagram already exists, then it 801 * is given as fp; otherwise have to make a chain. 802 */ 803 struct mbuf * 804 ip_reass(struct ipqent *ipqe, struct ipq *fp) 805 { 806 struct mbuf *m = ipqe->ipqe_m; 807 struct ipqent *nq, *p, *q; 808 struct ip *ip; 809 struct mbuf *t; 810 int hlen = ipqe->ipqe_ip->ip_hl << 2; 811 int i, next; 812 u_int8_t ecn, ecn0; 813 814 MUTEX_ASSERT_LOCKED(&ipq_mutex); 815 816 /* 817 * Presence of header sizes in mbufs 818 * would confuse code below. 819 */ 820 m->m_data += hlen; 821 m->m_len -= hlen; 822 823 /* 824 * If first fragment to arrive, create a reassembly queue. 825 */ 826 if (fp == NULL) { 827 fp = pool_get(&ipq_pool, PR_NOWAIT); 828 if (fp == NULL) 829 goto dropfrag; 830 LIST_INSERT_HEAD(&ipq, fp, ipq_q); 831 fp->ipq_ttl = IPFRAGTTL; 832 fp->ipq_p = ipqe->ipqe_ip->ip_p; 833 fp->ipq_id = ipqe->ipqe_ip->ip_id; 834 LIST_INIT(&fp->ipq_fragq); 835 fp->ipq_src = ipqe->ipqe_ip->ip_src; 836 fp->ipq_dst = ipqe->ipqe_ip->ip_dst; 837 p = NULL; 838 goto insert; 839 } 840 841 /* 842 * Handle ECN by comparing this segment with the first one; 843 * if CE is set, do not lose CE. 844 * drop if CE and not-ECT are mixed for the same packet. 845 */ 846 ecn = ipqe->ipqe_ip->ip_tos & IPTOS_ECN_MASK; 847 ecn0 = LIST_FIRST(&fp->ipq_fragq)->ipqe_ip->ip_tos & IPTOS_ECN_MASK; 848 if (ecn == IPTOS_ECN_CE) { 849 if (ecn0 == IPTOS_ECN_NOTECT) 850 goto dropfrag; 851 if (ecn0 != IPTOS_ECN_CE) 852 LIST_FIRST(&fp->ipq_fragq)->ipqe_ip->ip_tos |= 853 IPTOS_ECN_CE; 854 } 855 if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT) 856 goto dropfrag; 857 858 /* 859 * Find a segment which begins after this one does. 860 */ 861 for (p = NULL, q = LIST_FIRST(&fp->ipq_fragq); q != NULL; 862 p = q, q = LIST_NEXT(q, ipqe_q)) 863 if (ntohs(q->ipqe_ip->ip_off) > ntohs(ipqe->ipqe_ip->ip_off)) 864 break; 865 866 /* 867 * If there is a preceding segment, it may provide some of 868 * our data already. If so, drop the data from the incoming 869 * segment. If it provides all of our data, drop us. 870 */ 871 if (p != NULL) { 872 i = ntohs(p->ipqe_ip->ip_off) + ntohs(p->ipqe_ip->ip_len) - 873 ntohs(ipqe->ipqe_ip->ip_off); 874 if (i > 0) { 875 if (i >= ntohs(ipqe->ipqe_ip->ip_len)) 876 goto dropfrag; 877 m_adj(ipqe->ipqe_m, i); 878 ipqe->ipqe_ip->ip_off = 879 htons(ntohs(ipqe->ipqe_ip->ip_off) + i); 880 ipqe->ipqe_ip->ip_len = 881 htons(ntohs(ipqe->ipqe_ip->ip_len) - i); 882 } 883 } 884 885 /* 886 * While we overlap succeeding segments trim them or, 887 * if they are completely covered, dequeue them. 888 */ 889 for (; q != NULL && 890 ntohs(ipqe->ipqe_ip->ip_off) + ntohs(ipqe->ipqe_ip->ip_len) > 891 ntohs(q->ipqe_ip->ip_off); q = nq) { 892 i = (ntohs(ipqe->ipqe_ip->ip_off) + 893 ntohs(ipqe->ipqe_ip->ip_len)) - ntohs(q->ipqe_ip->ip_off); 894 if (i < ntohs(q->ipqe_ip->ip_len)) { 895 q->ipqe_ip->ip_len = 896 htons(ntohs(q->ipqe_ip->ip_len) - i); 897 q->ipqe_ip->ip_off = 898 htons(ntohs(q->ipqe_ip->ip_off) + i); 899 m_adj(q->ipqe_m, i); 900 break; 901 } 902 nq = LIST_NEXT(q, ipqe_q); 903 m_freem(q->ipqe_m); 904 LIST_REMOVE(q, ipqe_q); 905 pool_put(&ipqent_pool, q); 906 ip_frags--; 907 } 908 909 insert: 910 /* 911 * Stick new segment in its place; 912 * check for complete reassembly. 913 */ 914 if (p == NULL) { 915 LIST_INSERT_HEAD(&fp->ipq_fragq, ipqe, ipqe_q); 916 } else { 917 LIST_INSERT_AFTER(p, ipqe, ipqe_q); 918 } 919 next = 0; 920 for (p = NULL, q = LIST_FIRST(&fp->ipq_fragq); q != NULL; 921 p = q, q = LIST_NEXT(q, ipqe_q)) { 922 if (ntohs(q->ipqe_ip->ip_off) != next) 923 return (0); 924 next += ntohs(q->ipqe_ip->ip_len); 925 } 926 if (p->ipqe_mff) 927 return (0); 928 929 /* 930 * Reassembly is complete. Check for a bogus message size and 931 * concatenate fragments. 932 */ 933 q = LIST_FIRST(&fp->ipq_fragq); 934 ip = q->ipqe_ip; 935 if ((next + (ip->ip_hl << 2)) > IP_MAXPACKET) { 936 ipstat_inc(ips_toolong); 937 ip_freef(fp); 938 return (0); 939 } 940 m = q->ipqe_m; 941 t = m->m_next; 942 m->m_next = 0; 943 m_cat(m, t); 944 nq = LIST_NEXT(q, ipqe_q); 945 pool_put(&ipqent_pool, q); 946 ip_frags--; 947 for (q = nq; q != NULL; q = nq) { 948 t = q->ipqe_m; 949 nq = LIST_NEXT(q, ipqe_q); 950 pool_put(&ipqent_pool, q); 951 ip_frags--; 952 m_removehdr(t); 953 m_cat(m, t); 954 } 955 956 /* 957 * Create header for new ip packet by 958 * modifying header of first packet; 959 * dequeue and discard fragment reassembly header. 960 * Make header visible. 961 */ 962 ip->ip_len = htons(next); 963 ip->ip_src = fp->ipq_src; 964 ip->ip_dst = fp->ipq_dst; 965 LIST_REMOVE(fp, ipq_q); 966 pool_put(&ipq_pool, fp); 967 m->m_len += (ip->ip_hl << 2); 968 m->m_data -= (ip->ip_hl << 2); 969 m_calchdrlen(m); 970 return (m); 971 972 dropfrag: 973 ipstat_inc(ips_fragdropped); 974 m_freem(m); 975 pool_put(&ipqent_pool, ipqe); 976 ip_frags--; 977 return (NULL); 978 } 979 980 /* 981 * Free a fragment reassembly header and all 982 * associated datagrams. 983 */ 984 void 985 ip_freef(struct ipq *fp) 986 { 987 struct ipqent *q; 988 989 MUTEX_ASSERT_LOCKED(&ipq_mutex); 990 991 while ((q = LIST_FIRST(&fp->ipq_fragq)) != NULL) { 992 LIST_REMOVE(q, ipqe_q); 993 m_freem(q->ipqe_m); 994 pool_put(&ipqent_pool, q); 995 ip_frags--; 996 } 997 LIST_REMOVE(fp, ipq_q); 998 pool_put(&ipq_pool, fp); 999 } 1000 1001 /* 1002 * IP timer processing; 1003 * if a timer expires on a reassembly queue, discard it. 1004 */ 1005 void 1006 ip_slowtimo(void) 1007 { 1008 struct ipq *fp, *nfp; 1009 1010 mtx_enter(&ipq_mutex); 1011 LIST_FOREACH_SAFE(fp, &ipq, ipq_q, nfp) { 1012 if (--fp->ipq_ttl == 0) { 1013 ipstat_inc(ips_fragtimeout); 1014 ip_freef(fp); 1015 } 1016 } 1017 mtx_leave(&ipq_mutex); 1018 } 1019 1020 /* 1021 * Flush a bunch of datagram fragments, till we are down to 75%. 1022 */ 1023 void 1024 ip_flush(void) 1025 { 1026 int max = 50; 1027 1028 MUTEX_ASSERT_LOCKED(&ipq_mutex); 1029 1030 while (!LIST_EMPTY(&ipq) && ip_frags > ip_maxqueue * 3 / 4 && --max) { 1031 ipstat_inc(ips_fragdropped); 1032 ip_freef(LIST_FIRST(&ipq)); 1033 } 1034 } 1035 1036 /* 1037 * Do option processing on a datagram, 1038 * possibly discarding it if bad options are encountered, 1039 * or forwarding it if source-routed. 1040 * Returns 1 if packet has been forwarded/freed, 1041 * 0 if the packet should be processed further. 1042 */ 1043 int 1044 ip_dooptions(struct mbuf *m, struct ifnet *ifp) 1045 { 1046 struct ip *ip = mtod(m, struct ip *); 1047 unsigned int rtableid = m->m_pkthdr.ph_rtableid; 1048 struct rtentry *rt; 1049 struct sockaddr_in ipaddr; 1050 u_char *cp; 1051 struct ip_timestamp ipt; 1052 struct in_ifaddr *ia; 1053 int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0; 1054 struct in_addr sin, dst; 1055 u_int32_t ntime; 1056 1057 dst = ip->ip_dst; 1058 cp = (u_char *)(ip + 1); 1059 cnt = (ip->ip_hl << 2) - sizeof (struct ip); 1060 1061 KERNEL_LOCK(); 1062 for (; cnt > 0; cnt -= optlen, cp += optlen) { 1063 opt = cp[IPOPT_OPTVAL]; 1064 if (opt == IPOPT_EOL) 1065 break; 1066 if (opt == IPOPT_NOP) 1067 optlen = 1; 1068 else { 1069 if (cnt < IPOPT_OLEN + sizeof(*cp)) { 1070 code = &cp[IPOPT_OLEN] - (u_char *)ip; 1071 goto bad; 1072 } 1073 optlen = cp[IPOPT_OLEN]; 1074 if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) { 1075 code = &cp[IPOPT_OLEN] - (u_char *)ip; 1076 goto bad; 1077 } 1078 } 1079 1080 switch (opt) { 1081 1082 default: 1083 break; 1084 1085 /* 1086 * Source routing with record. 1087 * Find interface with current destination address. 1088 * If none on this machine then drop if strictly routed, 1089 * or do nothing if loosely routed. 1090 * Record interface address and bring up next address 1091 * component. If strictly routed make sure next 1092 * address is on directly accessible net. 1093 */ 1094 case IPOPT_LSRR: 1095 case IPOPT_SSRR: 1096 if (!ip_dosourceroute) { 1097 type = ICMP_UNREACH; 1098 code = ICMP_UNREACH_SRCFAIL; 1099 goto bad; 1100 } 1101 if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { 1102 code = &cp[IPOPT_OFFSET] - (u_char *)ip; 1103 goto bad; 1104 } 1105 memset(&ipaddr, 0, sizeof(ipaddr)); 1106 ipaddr.sin_family = AF_INET; 1107 ipaddr.sin_len = sizeof(ipaddr); 1108 ipaddr.sin_addr = ip->ip_dst; 1109 ia = ifatoia(ifa_ifwithaddr(sintosa(&ipaddr), 1110 m->m_pkthdr.ph_rtableid)); 1111 if (ia == NULL) { 1112 if (opt == IPOPT_SSRR) { 1113 type = ICMP_UNREACH; 1114 code = ICMP_UNREACH_SRCFAIL; 1115 goto bad; 1116 } 1117 /* 1118 * Loose routing, and not at next destination 1119 * yet; nothing to do except forward. 1120 */ 1121 break; 1122 } 1123 off--; /* 0 origin */ 1124 if ((off + sizeof(struct in_addr)) > optlen) { 1125 /* 1126 * End of source route. Should be for us. 1127 */ 1128 save_rte(m, cp, ip->ip_src); 1129 break; 1130 } 1131 1132 /* 1133 * locate outgoing interface 1134 */ 1135 memset(&ipaddr, 0, sizeof(ipaddr)); 1136 ipaddr.sin_family = AF_INET; 1137 ipaddr.sin_len = sizeof(ipaddr); 1138 memcpy(&ipaddr.sin_addr, cp + off, 1139 sizeof(ipaddr.sin_addr)); 1140 /* keep packet in the virtual instance */ 1141 rt = rtalloc(sintosa(&ipaddr), RT_RESOLVE, rtableid); 1142 if (!rtisvalid(rt) || ((opt == IPOPT_SSRR) && 1143 ISSET(rt->rt_flags, RTF_GATEWAY))) { 1144 type = ICMP_UNREACH; 1145 code = ICMP_UNREACH_SRCFAIL; 1146 rtfree(rt); 1147 goto bad; 1148 } 1149 ia = ifatoia(rt->rt_ifa); 1150 memcpy(cp + off, &ia->ia_addr.sin_addr, 1151 sizeof(struct in_addr)); 1152 rtfree(rt); 1153 cp[IPOPT_OFFSET] += sizeof(struct in_addr); 1154 ip->ip_dst = ipaddr.sin_addr; 1155 /* 1156 * Let ip_intr's mcast routing check handle mcast pkts 1157 */ 1158 forward = !IN_MULTICAST(ip->ip_dst.s_addr); 1159 break; 1160 1161 case IPOPT_RR: 1162 if (optlen < IPOPT_OFFSET + sizeof(*cp)) { 1163 code = &cp[IPOPT_OLEN] - (u_char *)ip; 1164 goto bad; 1165 } 1166 if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { 1167 code = &cp[IPOPT_OFFSET] - (u_char *)ip; 1168 goto bad; 1169 } 1170 1171 /* 1172 * If no space remains, ignore. 1173 */ 1174 off--; /* 0 origin */ 1175 if ((off + sizeof(struct in_addr)) > optlen) 1176 break; 1177 memset(&ipaddr, 0, sizeof(ipaddr)); 1178 ipaddr.sin_family = AF_INET; 1179 ipaddr.sin_len = sizeof(ipaddr); 1180 ipaddr.sin_addr = ip->ip_dst; 1181 /* 1182 * locate outgoing interface; if we're the destination, 1183 * use the incoming interface (should be same). 1184 * Again keep the packet inside the virtual instance. 1185 */ 1186 rt = rtalloc(sintosa(&ipaddr), RT_RESOLVE, rtableid); 1187 if (!rtisvalid(rt)) { 1188 type = ICMP_UNREACH; 1189 code = ICMP_UNREACH_HOST; 1190 rtfree(rt); 1191 goto bad; 1192 } 1193 ia = ifatoia(rt->rt_ifa); 1194 memcpy(cp + off, &ia->ia_addr.sin_addr, 1195 sizeof(struct in_addr)); 1196 rtfree(rt); 1197 cp[IPOPT_OFFSET] += sizeof(struct in_addr); 1198 break; 1199 1200 case IPOPT_TS: 1201 code = cp - (u_char *)ip; 1202 if (optlen < sizeof(struct ip_timestamp)) 1203 goto bad; 1204 memcpy(&ipt, cp, sizeof(struct ip_timestamp)); 1205 if (ipt.ipt_ptr < 5 || ipt.ipt_len < 5) 1206 goto bad; 1207 if (ipt.ipt_ptr - 1 + sizeof(u_int32_t) > ipt.ipt_len) { 1208 if (++ipt.ipt_oflw == 0) 1209 goto bad; 1210 break; 1211 } 1212 memcpy(&sin, cp + ipt.ipt_ptr - 1, sizeof sin); 1213 switch (ipt.ipt_flg) { 1214 1215 case IPOPT_TS_TSONLY: 1216 break; 1217 1218 case IPOPT_TS_TSANDADDR: 1219 if (ipt.ipt_ptr - 1 + sizeof(u_int32_t) + 1220 sizeof(struct in_addr) > ipt.ipt_len) 1221 goto bad; 1222 memset(&ipaddr, 0, sizeof(ipaddr)); 1223 ipaddr.sin_family = AF_INET; 1224 ipaddr.sin_len = sizeof(ipaddr); 1225 ipaddr.sin_addr = dst; 1226 ia = ifatoia(ifaof_ifpforaddr(sintosa(&ipaddr), 1227 ifp)); 1228 if (ia == NULL) 1229 continue; 1230 memcpy(&sin, &ia->ia_addr.sin_addr, 1231 sizeof(struct in_addr)); 1232 ipt.ipt_ptr += sizeof(struct in_addr); 1233 break; 1234 1235 case IPOPT_TS_PRESPEC: 1236 if (ipt.ipt_ptr - 1 + sizeof(u_int32_t) + 1237 sizeof(struct in_addr) > ipt.ipt_len) 1238 goto bad; 1239 memset(&ipaddr, 0, sizeof(ipaddr)); 1240 ipaddr.sin_family = AF_INET; 1241 ipaddr.sin_len = sizeof(ipaddr); 1242 ipaddr.sin_addr = sin; 1243 if (ifa_ifwithaddr(sintosa(&ipaddr), 1244 m->m_pkthdr.ph_rtableid) == NULL) 1245 continue; 1246 ipt.ipt_ptr += sizeof(struct in_addr); 1247 break; 1248 1249 default: 1250 /* XXX can't take &ipt->ipt_flg */ 1251 code = (u_char *)&ipt.ipt_ptr - 1252 (u_char *)ip + 1; 1253 goto bad; 1254 } 1255 ntime = iptime(); 1256 memcpy(cp + ipt.ipt_ptr - 1, &ntime, sizeof(u_int32_t)); 1257 ipt.ipt_ptr += sizeof(u_int32_t); 1258 } 1259 } 1260 KERNEL_UNLOCK(); 1261 if (forward && ipforwarding) { 1262 ip_forward(m, ifp, NULL, 1); 1263 return (1); 1264 } 1265 return (0); 1266 bad: 1267 KERNEL_UNLOCK(); 1268 icmp_error(m, type, code, 0, 0); 1269 ipstat_inc(ips_badoptions); 1270 return (1); 1271 } 1272 1273 /* 1274 * Save incoming source route for use in replies, 1275 * to be picked up later by ip_srcroute if the receiver is interested. 1276 */ 1277 void 1278 save_rte(struct mbuf *m, u_char *option, struct in_addr dst) 1279 { 1280 struct ip_srcrt *isr; 1281 struct m_tag *mtag; 1282 unsigned olen; 1283 1284 olen = option[IPOPT_OLEN]; 1285 if (olen > sizeof(isr->isr_hdr) + sizeof(isr->isr_routes)) 1286 return; 1287 1288 mtag = m_tag_get(PACKET_TAG_SRCROUTE, sizeof(*isr), M_NOWAIT); 1289 if (mtag == NULL) 1290 return; 1291 isr = (struct ip_srcrt *)(mtag + 1); 1292 1293 memcpy(isr->isr_hdr, option, olen); 1294 isr->isr_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr); 1295 isr->isr_dst = dst; 1296 m_tag_prepend(m, mtag); 1297 } 1298 1299 /* 1300 * Retrieve incoming source route for use in replies, 1301 * in the same form used by setsockopt. 1302 * The first hop is placed before the options, will be removed later. 1303 */ 1304 struct mbuf * 1305 ip_srcroute(struct mbuf *m0) 1306 { 1307 struct in_addr *p, *q; 1308 struct mbuf *m; 1309 struct ip_srcrt *isr; 1310 struct m_tag *mtag; 1311 1312 if (!ip_dosourceroute) 1313 return (NULL); 1314 1315 mtag = m_tag_find(m0, PACKET_TAG_SRCROUTE, NULL); 1316 if (mtag == NULL) 1317 return (NULL); 1318 isr = (struct ip_srcrt *)(mtag + 1); 1319 1320 if (isr->isr_nhops == 0) 1321 return (NULL); 1322 m = m_get(M_DONTWAIT, MT_SOOPTS); 1323 if (m == NULL) 1324 return (NULL); 1325 1326 #define OPTSIZ (sizeof(isr->isr_nop) + sizeof(isr->isr_hdr)) 1327 1328 /* length is (nhops+1)*sizeof(addr) + sizeof(nop + header) */ 1329 m->m_len = (isr->isr_nhops + 1) * sizeof(struct in_addr) + OPTSIZ; 1330 1331 /* 1332 * First save first hop for return route 1333 */ 1334 p = &(isr->isr_routes[isr->isr_nhops - 1]); 1335 *(mtod(m, struct in_addr *)) = *p--; 1336 1337 /* 1338 * Copy option fields and padding (nop) to mbuf. 1339 */ 1340 isr->isr_nop = IPOPT_NOP; 1341 isr->isr_hdr[IPOPT_OFFSET] = IPOPT_MINOFF; 1342 memcpy(mtod(m, caddr_t) + sizeof(struct in_addr), &isr->isr_nop, 1343 OPTSIZ); 1344 q = (struct in_addr *)(mtod(m, caddr_t) + 1345 sizeof(struct in_addr) + OPTSIZ); 1346 #undef OPTSIZ 1347 /* 1348 * Record return path as an IP source route, 1349 * reversing the path (pointers are now aligned). 1350 */ 1351 while (p >= isr->isr_routes) { 1352 *q++ = *p--; 1353 } 1354 /* 1355 * Last hop goes to final destination. 1356 */ 1357 *q = isr->isr_dst; 1358 m_tag_delete(m0, (struct m_tag *)isr); 1359 return (m); 1360 } 1361 1362 /* 1363 * Strip out IP options, at higher level protocol in the kernel. 1364 */ 1365 void 1366 ip_stripoptions(struct mbuf *m) 1367 { 1368 int i; 1369 struct ip *ip = mtod(m, struct ip *); 1370 caddr_t opts; 1371 int olen; 1372 1373 olen = (ip->ip_hl<<2) - sizeof (struct ip); 1374 opts = (caddr_t)(ip + 1); 1375 i = m->m_len - (sizeof (struct ip) + olen); 1376 memmove(opts, opts + olen, i); 1377 m->m_len -= olen; 1378 if (m->m_flags & M_PKTHDR) 1379 m->m_pkthdr.len -= olen; 1380 ip->ip_hl = sizeof(struct ip) >> 2; 1381 ip->ip_len = htons(ntohs(ip->ip_len) - olen); 1382 } 1383 1384 const u_char inetctlerrmap[PRC_NCMDS] = { 1385 0, 0, 0, 0, 1386 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, 1387 EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, 1388 EMSGSIZE, EHOSTUNREACH, 0, 0, 1389 0, 0, 0, 0, 1390 ENOPROTOOPT 1391 }; 1392 1393 /* 1394 * Forward a packet. If some error occurs return the sender 1395 * an icmp packet. Note we can't always generate a meaningful 1396 * icmp message because icmp doesn't have a large enough repertoire 1397 * of codes and types. 1398 * 1399 * If not forwarding, just drop the packet. This could be confusing 1400 * if ipforwarding was zero but some routing protocol was advancing 1401 * us as a gateway to somewhere. However, we must let the routing 1402 * protocol deal with that. 1403 * 1404 * The srcrt parameter indicates whether the packet is being forwarded 1405 * via a source route. 1406 */ 1407 void 1408 ip_forward(struct mbuf *m, struct ifnet *ifp, struct rtentry *rt, int srcrt) 1409 { 1410 struct mbuf mfake, *mcopy = NULL; 1411 struct ip *ip = mtod(m, struct ip *); 1412 struct sockaddr_in *sin; 1413 struct route ro; 1414 int error, type = 0, code = 0, destmtu = 0, fake = 0, len; 1415 u_int32_t dest; 1416 1417 dest = 0; 1418 if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) { 1419 ipstat_inc(ips_cantforward); 1420 m_freem(m); 1421 goto freecopy; 1422 } 1423 if (ip->ip_ttl <= IPTTLDEC) { 1424 icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, dest, 0); 1425 goto freecopy; 1426 } 1427 1428 sin = satosin(&ro.ro_dst); 1429 memset(sin, 0, sizeof(*sin)); 1430 sin->sin_family = AF_INET; 1431 sin->sin_len = sizeof(*sin); 1432 sin->sin_addr = ip->ip_dst; 1433 1434 if (!rtisvalid(rt)) { 1435 rtfree(rt); 1436 rt = rtalloc_mpath(sintosa(sin), &ip->ip_src.s_addr, 1437 m->m_pkthdr.ph_rtableid); 1438 if (rt == NULL) { 1439 icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest, 0); 1440 return; 1441 } 1442 } 1443 1444 /* 1445 * Save at most 68 bytes of the packet in case 1446 * we need to generate an ICMP message to the src. 1447 * The data is saved in the mbuf on the stack that 1448 * acts as a temporary storage not intended to be 1449 * passed down the IP stack or to the mfree. 1450 */ 1451 memset(&mfake.m_hdr, 0, sizeof(mfake.m_hdr)); 1452 mfake.m_type = m->m_type; 1453 if (m_dup_pkthdr(&mfake, m, M_DONTWAIT) == 0) { 1454 mfake.m_data = mfake.m_pktdat; 1455 len = min(ntohs(ip->ip_len), 68); 1456 m_copydata(m, 0, len, mfake.m_pktdat); 1457 mfake.m_pkthdr.len = mfake.m_len = len; 1458 #if NPF > 0 1459 pf_pkt_addr_changed(&mfake); 1460 #endif /* NPF > 0 */ 1461 fake = 1; 1462 } 1463 1464 ip->ip_ttl -= IPTTLDEC; 1465 1466 /* 1467 * If forwarding packet using same interface that it came in on, 1468 * perhaps should send a redirect to sender to shortcut a hop. 1469 * Only send redirect if source is sending directly to us, 1470 * and if packet was not source routed (or has any options). 1471 * Also, don't send redirect if forwarding using a default route 1472 * or a route modified by a redirect. 1473 * Don't send redirect if we advertise destination's arp address 1474 * as ours (proxy arp). 1475 */ 1476 if ((rt->rt_ifidx == ifp->if_index) && 1477 (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 && 1478 satosin(rt_key(rt))->sin_addr.s_addr != 0 && 1479 ipsendredirects && !srcrt && 1480 !arpproxy(satosin(rt_key(rt))->sin_addr, m->m_pkthdr.ph_rtableid)) { 1481 if ((ip->ip_src.s_addr & ifatoia(rt->rt_ifa)->ia_netmask) == 1482 ifatoia(rt->rt_ifa)->ia_net) { 1483 if (rt->rt_flags & RTF_GATEWAY) 1484 dest = satosin(rt->rt_gateway)->sin_addr.s_addr; 1485 else 1486 dest = ip->ip_dst.s_addr; 1487 /* Router requirements says to only send host redirects */ 1488 type = ICMP_REDIRECT; 1489 code = ICMP_REDIRECT_HOST; 1490 } 1491 } 1492 1493 ro.ro_rt = rt; 1494 ro.ro_tableid = m->m_pkthdr.ph_rtableid; 1495 error = ip_output(m, NULL, &ro, 1496 (IP_FORWARDING | (ip_directedbcast ? IP_ALLOWBROADCAST : 0)), 1497 NULL, NULL, 0); 1498 rt = ro.ro_rt; 1499 if (error) 1500 ipstat_inc(ips_cantforward); 1501 else { 1502 ipstat_inc(ips_forward); 1503 if (type) 1504 ipstat_inc(ips_redirectsent); 1505 else 1506 goto freecopy; 1507 } 1508 if (!fake) 1509 goto freecopy; 1510 1511 switch (error) { 1512 1513 case 0: /* forwarded, but need redirect */ 1514 /* type, code set above */ 1515 break; 1516 1517 case ENETUNREACH: /* shouldn't happen, checked above */ 1518 case EHOSTUNREACH: 1519 case ENETDOWN: 1520 case EHOSTDOWN: 1521 default: 1522 type = ICMP_UNREACH; 1523 code = ICMP_UNREACH_HOST; 1524 break; 1525 1526 case EMSGSIZE: 1527 type = ICMP_UNREACH; 1528 code = ICMP_UNREACH_NEEDFRAG; 1529 1530 #ifdef IPSEC 1531 if (rt != NULL) { 1532 if (rt->rt_mtu) 1533 destmtu = rt->rt_mtu; 1534 else { 1535 struct ifnet *destifp; 1536 1537 destifp = if_get(rt->rt_ifidx); 1538 if (destifp != NULL) 1539 destmtu = destifp->if_mtu; 1540 if_put(destifp); 1541 } 1542 } 1543 #endif /*IPSEC*/ 1544 ipstat_inc(ips_cantfrag); 1545 break; 1546 1547 case EACCES: 1548 /* 1549 * pf(4) blocked the packet. There is no need to send an ICMP 1550 * packet back since pf(4) takes care of it. 1551 */ 1552 goto freecopy; 1553 case ENOBUFS: 1554 /* 1555 * a router should not generate ICMP_SOURCEQUENCH as 1556 * required in RFC1812 Requirements for IP Version 4 Routers. 1557 * source quench could be a big problem under DoS attacks, 1558 * or the underlying interface is rate-limited. 1559 */ 1560 goto freecopy; 1561 } 1562 1563 mcopy = m_copym(&mfake, 0, len, M_DONTWAIT); 1564 if (mcopy) 1565 icmp_error(mcopy, type, code, dest, destmtu); 1566 1567 freecopy: 1568 if (fake) 1569 m_tag_delete_chain(&mfake); 1570 rtfree(rt); 1571 } 1572 1573 int 1574 ip_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, 1575 size_t newlen) 1576 { 1577 int error; 1578 #ifdef MROUTING 1579 extern int ip_mrtproto; 1580 extern struct mrtstat mrtstat; 1581 #endif 1582 1583 /* Almost all sysctl names at this level are terminal. */ 1584 if (namelen != 1 && name[0] != IPCTL_IFQUEUE && 1585 name[0] != IPCTL_ARPQUEUE) 1586 return (ENOTDIR); 1587 1588 switch (name[0]) { 1589 case IPCTL_SOURCEROUTE: 1590 /* 1591 * Don't allow this to change in a secure environment. 1592 */ 1593 if (newp && securelevel > 0) 1594 return (EPERM); 1595 NET_LOCK(); 1596 error = sysctl_int(oldp, oldlenp, newp, newlen, 1597 &ip_dosourceroute); 1598 NET_UNLOCK(); 1599 return (error); 1600 case IPCTL_MTUDISC: 1601 NET_LOCK(); 1602 error = sysctl_int(oldp, oldlenp, newp, newlen, 1603 &ip_mtudisc); 1604 if (ip_mtudisc != 0 && ip_mtudisc_timeout_q == NULL) { 1605 ip_mtudisc_timeout_q = 1606 rt_timer_queue_create(ip_mtudisc_timeout); 1607 } else if (ip_mtudisc == 0 && ip_mtudisc_timeout_q != NULL) { 1608 rt_timer_queue_destroy(ip_mtudisc_timeout_q); 1609 ip_mtudisc_timeout_q = NULL; 1610 } 1611 NET_UNLOCK(); 1612 return error; 1613 case IPCTL_MTUDISCTIMEOUT: 1614 NET_LOCK(); 1615 error = sysctl_int(oldp, oldlenp, newp, newlen, 1616 &ip_mtudisc_timeout); 1617 if (ip_mtudisc_timeout_q != NULL) 1618 rt_timer_queue_change(ip_mtudisc_timeout_q, 1619 ip_mtudisc_timeout); 1620 NET_UNLOCK(); 1621 return (error); 1622 #ifdef IPSEC 1623 case IPCTL_ENCDEBUG: 1624 case IPCTL_IPSEC_STATS: 1625 case IPCTL_IPSEC_EXPIRE_ACQUIRE: 1626 case IPCTL_IPSEC_EMBRYONIC_SA_TIMEOUT: 1627 case IPCTL_IPSEC_REQUIRE_PFS: 1628 case IPCTL_IPSEC_SOFT_ALLOCATIONS: 1629 case IPCTL_IPSEC_ALLOCATIONS: 1630 case IPCTL_IPSEC_SOFT_BYTES: 1631 case IPCTL_IPSEC_BYTES: 1632 case IPCTL_IPSEC_TIMEOUT: 1633 case IPCTL_IPSEC_SOFT_TIMEOUT: 1634 case IPCTL_IPSEC_SOFT_FIRSTUSE: 1635 case IPCTL_IPSEC_FIRSTUSE: 1636 case IPCTL_IPSEC_ENC_ALGORITHM: 1637 case IPCTL_IPSEC_AUTH_ALGORITHM: 1638 case IPCTL_IPSEC_IPCOMP_ALGORITHM: 1639 return (ipsec_sysctl(name, namelen, oldp, oldlenp, newp, 1640 newlen)); 1641 #endif 1642 case IPCTL_IFQUEUE: 1643 return (sysctl_niq(name + 1, namelen - 1, 1644 oldp, oldlenp, newp, newlen, &ipintrq)); 1645 case IPCTL_ARPQUEUE: 1646 return (sysctl_niq(name + 1, namelen - 1, 1647 oldp, oldlenp, newp, newlen, &arpinq)); 1648 case IPCTL_STATS: 1649 return (ip_sysctl_ipstat(oldp, oldlenp, newp)); 1650 #ifdef MROUTING 1651 case IPCTL_MRTSTATS: 1652 return (sysctl_rdstruct(oldp, oldlenp, newp, 1653 &mrtstat, sizeof(mrtstat))); 1654 case IPCTL_MRTPROTO: 1655 return (sysctl_rdint(oldp, oldlenp, newp, ip_mrtproto)); 1656 case IPCTL_MRTMFC: 1657 if (newp) 1658 return (EPERM); 1659 NET_LOCK(); 1660 error = mrt_sysctl_mfc(oldp, oldlenp); 1661 NET_UNLOCK(); 1662 return (error); 1663 case IPCTL_MRTVIF: 1664 if (newp) 1665 return (EPERM); 1666 NET_LOCK(); 1667 error = mrt_sysctl_vif(oldp, oldlenp); 1668 NET_UNLOCK(); 1669 return (error); 1670 #else 1671 case IPCTL_MRTPROTO: 1672 case IPCTL_MRTSTATS: 1673 case IPCTL_MRTMFC: 1674 case IPCTL_MRTVIF: 1675 return (EOPNOTSUPP); 1676 #endif 1677 default: 1678 if (name[0] < IPCTL_MAXID) { 1679 NET_LOCK(); 1680 error = sysctl_int_arr(ipctl_vars, name, namelen, 1681 oldp, oldlenp, newp, newlen); 1682 NET_UNLOCK(); 1683 return (error); 1684 } 1685 return (EOPNOTSUPP); 1686 } 1687 /* NOTREACHED */ 1688 } 1689 1690 int 1691 ip_sysctl_ipstat(void *oldp, size_t *oldlenp, void *newp) 1692 { 1693 uint64_t counters[ips_ncounters]; 1694 struct ipstat ipstat; 1695 u_long *words = (u_long *)&ipstat; 1696 int i; 1697 1698 CTASSERT(sizeof(ipstat) == (nitems(counters) * sizeof(u_long))); 1699 memset(&ipstat, 0, sizeof ipstat); 1700 counters_read(ipcounters, counters, nitems(counters)); 1701 1702 for (i = 0; i < nitems(counters); i++) 1703 words[i] = (u_long)counters[i]; 1704 1705 return (sysctl_rdstruct(oldp, oldlenp, newp, &ipstat, sizeof(ipstat))); 1706 } 1707 1708 void 1709 ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip, 1710 struct mbuf *m) 1711 { 1712 if (inp->inp_socket->so_options & SO_TIMESTAMP) { 1713 struct timeval tv; 1714 1715 m_microtime(m, &tv); 1716 *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv), 1717 SCM_TIMESTAMP, SOL_SOCKET); 1718 if (*mp) 1719 mp = &(*mp)->m_next; 1720 } 1721 1722 if (inp->inp_flags & INP_RECVDSTADDR) { 1723 *mp = sbcreatecontrol((caddr_t) &ip->ip_dst, 1724 sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP); 1725 if (*mp) 1726 mp = &(*mp)->m_next; 1727 } 1728 #ifdef notyet 1729 /* this code is broken and will probably never be fixed. */ 1730 /* options were tossed already */ 1731 if (inp->inp_flags & INP_RECVOPTS) { 1732 *mp = sbcreatecontrol((caddr_t) opts_deleted_above, 1733 sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP); 1734 if (*mp) 1735 mp = &(*mp)->m_next; 1736 } 1737 /* ip_srcroute doesn't do what we want here, need to fix */ 1738 if (inp->inp_flags & INP_RECVRETOPTS) { 1739 *mp = sbcreatecontrol((caddr_t) ip_srcroute(m), 1740 sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP); 1741 if (*mp) 1742 mp = &(*mp)->m_next; 1743 } 1744 #endif 1745 if (inp->inp_flags & INP_RECVIF) { 1746 struct sockaddr_dl sdl; 1747 struct ifnet *ifp; 1748 1749 ifp = if_get(m->m_pkthdr.ph_ifidx); 1750 if (ifp == NULL || ifp->if_sadl == NULL) { 1751 memset(&sdl, 0, sizeof(sdl)); 1752 sdl.sdl_len = offsetof(struct sockaddr_dl, sdl_data[0]); 1753 sdl.sdl_family = AF_LINK; 1754 sdl.sdl_index = ifp != NULL ? ifp->if_index : 0; 1755 sdl.sdl_nlen = sdl.sdl_alen = sdl.sdl_slen = 0; 1756 *mp = sbcreatecontrol((caddr_t) &sdl, sdl.sdl_len, 1757 IP_RECVIF, IPPROTO_IP); 1758 } else { 1759 *mp = sbcreatecontrol((caddr_t) ifp->if_sadl, 1760 ifp->if_sadl->sdl_len, IP_RECVIF, IPPROTO_IP); 1761 } 1762 if (*mp) 1763 mp = &(*mp)->m_next; 1764 if_put(ifp); 1765 } 1766 if (inp->inp_flags & INP_RECVTTL) { 1767 *mp = sbcreatecontrol((caddr_t) &ip->ip_ttl, 1768 sizeof(u_int8_t), IP_RECVTTL, IPPROTO_IP); 1769 if (*mp) 1770 mp = &(*mp)->m_next; 1771 } 1772 if (inp->inp_flags & INP_RECVRTABLE) { 1773 u_int rtableid = inp->inp_rtableid; 1774 1775 #if NPF > 0 1776 if (m && m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) { 1777 struct pf_divert *divert; 1778 1779 divert = pf_find_divert(m); 1780 KASSERT(divert != NULL); 1781 rtableid = divert->rdomain; 1782 } 1783 #endif 1784 1785 *mp = sbcreatecontrol((caddr_t) &rtableid, 1786 sizeof(u_int), IP_RECVRTABLE, IPPROTO_IP); 1787 if (*mp) 1788 mp = &(*mp)->m_next; 1789 } 1790 } 1791 1792 void 1793 ip_send_dispatch(void *xmq) 1794 { 1795 struct mbuf_queue *mq = xmq; 1796 struct mbuf *m; 1797 struct mbuf_list ml; 1798 1799 mq_delist(mq, &ml); 1800 if (ml_empty(&ml)) 1801 return; 1802 1803 NET_RLOCK(); 1804 while ((m = ml_dequeue(&ml)) != NULL) { 1805 ip_output(m, NULL, NULL, 0, NULL, NULL, 0); 1806 } 1807 NET_RUNLOCK(); 1808 } 1809 1810 void 1811 ip_send(struct mbuf *m) 1812 { 1813 mq_enqueue(&ipsend_mq, m); 1814 task_add(net_tq(0), &ipsend_task); 1815 } 1816