xref: /openbsd-src/sys/netinet/ip_input.c (revision 50b7afb2c2c0993b0894d4e34bf857cb13ed9c80)
1 /*	$OpenBSD: ip_input.c,v 1.235 2014/07/13 13:57:56 mpi Exp $	*/
2 /*	$NetBSD: ip_input.c,v 1.30 1996/03/16 23:53:58 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1988, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)ip_input.c	8.2 (Berkeley) 1/4/94
33  */
34 
35 #include "pf.h"
36 #include "carp.h"
37 
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/mbuf.h>
41 #include <sys/domain.h>
42 #include <sys/protosw.h>
43 #include <sys/socket.h>
44 #include <sys/socketvar.h>
45 #include <sys/syslog.h>
46 #include <sys/sysctl.h>
47 #include <sys/pool.h>
48 
49 #include <net/if.h>
50 #include <net/if_dl.h>
51 #include <net/route.h>
52 
53 #include <netinet/in.h>
54 #include <netinet/in_systm.h>
55 #include <netinet/if_ether.h>
56 #include <netinet/ip.h>
57 #include <netinet/in_pcb.h>
58 #include <netinet/in_var.h>
59 #include <netinet/ip_var.h>
60 #include <netinet/ip_icmp.h>
61 
62 #if NPF > 0
63 #include <net/pfvar.h>
64 #endif
65 
66 #ifdef MROUTING
67 #include <netinet/ip_mroute.h>
68 #endif
69 
70 #ifdef IPSEC
71 #include <netinet/ip_ipsp.h>
72 #endif /* IPSEC */
73 
74 #if NCARP > 0
75 #include <net/if_types.h>
76 #include <netinet/ip_carp.h>
77 #endif
78 
/* Head of the list of IPv4 reassembly queues; initialised in ip_init(). */
struct ipqhead ipq;

/*
 * IPsec tunables, each initialised from its compile-time default.
 */
int encdebug = 0;
int ipsec_keep_invalid = IPSEC_DEFAULT_EMBRYONIC_SA_TIMEOUT;
int ipsec_require_pfs = IPSEC_DEFAULT_PFS;
int ipsec_soft_allocations = IPSEC_DEFAULT_SOFT_ALLOCATIONS;
int ipsec_exp_allocations = IPSEC_DEFAULT_EXP_ALLOCATIONS;
int ipsec_soft_bytes = IPSEC_DEFAULT_SOFT_BYTES;
int ipsec_exp_bytes = IPSEC_DEFAULT_EXP_BYTES;
int ipsec_soft_timeout = IPSEC_DEFAULT_SOFT_TIMEOUT;
int ipsec_exp_timeout = IPSEC_DEFAULT_EXP_TIMEOUT;
int ipsec_soft_first_use = IPSEC_DEFAULT_SOFT_FIRST_USE;
int ipsec_exp_first_use = IPSEC_DEFAULT_EXP_FIRST_USE;
int ipsec_expire_acquire = IPSEC_DEFAULT_EXPIRE_ACQUIRE;
/* Default algorithm names; filled in by ip_init() with strlcpy(). */
char ipsec_def_enc[20];
char ipsec_def_auth[20];
char ipsec_def_comp[20];

/* values controllable via sysctl */
int	ipforwarding = 0;
int	ipmforwarding = 0;
int	ipmultipath = 0;
int	ipsendredirects = 1;
int	ip_dosourceroute = 0;
int	ip_defttl = IPDEFTTL;
int	ip_mtudisc = 1;
u_int	ip_mtudisc_timeout = IPMTUDISCTIMEOUT;
int	ip_directedbcast = 0;

/* Created in ip_init() when ip_mtudisc is non-zero; NULL otherwise. */
struct rttimer_queue *ip_mtudisc_timeout_q = NULL;

/* Keep track of memory used for reassembly */
int	ip_maxqueue = 300;	/* upper bound on queued fragments */
int	ip_frags = 0;		/* fragments currently queued */

/* NOTE(review): presumably indexed by IPCTL_* sysctl id -- confirm. */
int *ipctl_vars[IPCTL_MAXID] = IPCTL_VARS;

struct	in_ifaddrhead in_ifaddr;	/* TAILQ initialised in ip_init() */
struct	ifqueue ipintrq;		/* input queue drained by ipintr() */

struct pool ipqent_pool;		/* backs struct ipqent allocations */
struct pool ipq_pool;			/* backs struct ipq allocations */

struct ipstat ipstat;			/* IPv4 statistics counters */

void	ip_ours(struct mbuf *);
int	ip_dooptions(struct mbuf *, struct ifnet *);
int	in_ouraddr(struct mbuf *, struct ifnet *, struct in_addr);
void	ip_forward(struct mbuf *, struct ifnet *, int);
128 
129 /*
130  * Used to save the IP options in case a protocol wants to respond
131  * to an incoming packet over the same route if the packet got here
132  * using IP source routing.  This allows connection establishment and
133  * maintenance when the remote end is on a network that is not known
134  * to us.
135  */
struct ip_srcrt {
	int		isr_nhops;		   /* number of hops */
	struct in_addr	isr_dst;		   /* final destination */
	char		isr_nop;		   /* one NOP to align */
	char		isr_hdr[IPOPT_OFFSET + 1]; /* OPTVAL, OLEN & OFFSET */
	/* room for as many addresses as fit in a maximum-size option area */
	struct in_addr	isr_routes[MAX_IPOPTLEN/sizeof(struct in_addr)];
};

/*
 * Called from ip_dooptions() once a source route has reached its end
 * at this host; the definition is not in this part of the file.
 */
void save_rte(struct mbuf *, u_char *, struct in_addr);
145 
146 /*
147  * IP initialization: fill in IP protocol switch table.
148  * All protocols not implemented in kernel go to raw IP protocol handler.
149  */
150 void
151 ip_init(void)
152 {
153 	struct protosw *pr;
154 	int i;
155 	const u_int16_t defbaddynamicports_tcp[] = DEFBADDYNAMICPORTS_TCP;
156 	const u_int16_t defbaddynamicports_udp[] = DEFBADDYNAMICPORTS_UDP;
157 
158 	pool_init(&ipqent_pool, sizeof(struct ipqent), 0, 0, 0, "ipqepl",
159 	    NULL);
160 	pool_init(&ipq_pool, sizeof(struct ipq), 0, 0, 0, "ipqpl",
161 	    NULL);
162 
163 	pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
164 	if (pr == 0)
165 		panic("ip_init");
166 	for (i = 0; i < IPPROTO_MAX; i++)
167 		ip_protox[i] = pr - inetsw;
168 	for (pr = inetdomain.dom_protosw;
169 	    pr < inetdomain.dom_protoswNPROTOSW; pr++)
170 		if (pr->pr_domain->dom_family == PF_INET &&
171 		    pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW &&
172 		    pr->pr_protocol < IPPROTO_MAX)
173 			ip_protox[pr->pr_protocol] = pr - inetsw;
174 	LIST_INIT(&ipq);
175 	IFQ_SET_MAXLEN(&ipintrq, IFQ_MAXLEN);
176 	TAILQ_INIT(&in_ifaddr);
177 	if (ip_mtudisc != 0)
178 		ip_mtudisc_timeout_q =
179 		    rt_timer_queue_create(ip_mtudisc_timeout);
180 
181 	/* Fill in list of ports not to allocate dynamically. */
182 	memset(&baddynamicports, 0, sizeof(baddynamicports));
183 	for (i = 0; defbaddynamicports_tcp[i] != 0; i++)
184 		DP_SET(baddynamicports.tcp, defbaddynamicports_tcp[i]);
185 	for (i = 0; defbaddynamicports_udp[i] != 0; i++)
186 		DP_SET(baddynamicports.udp, defbaddynamicports_udp[i]);
187 
188 	strlcpy(ipsec_def_enc, IPSEC_DEFAULT_DEF_ENC, sizeof(ipsec_def_enc));
189 	strlcpy(ipsec_def_auth, IPSEC_DEFAULT_DEF_AUTH, sizeof(ipsec_def_auth));
190 	strlcpy(ipsec_def_comp, IPSEC_DEFAULT_DEF_COMP, sizeof(ipsec_def_comp));
191 }
192 
193 struct	route ipforward_rt;
194 
195 void
196 ipintr(void)
197 {
198 	struct mbuf *m;
199 	int s;
200 
201 	for (;;) {
202 		/*
203 		 * Get next datagram off input queue and get IP header
204 		 * in first mbuf.
205 		 */
206 		s = splnet();
207 		IF_DEQUEUE(&ipintrq, m);
208 		splx(s);
209 		if (m == NULL)
210 			return;
211 #ifdef	DIAGNOSTIC
212 		if ((m->m_flags & M_PKTHDR) == 0)
213 			panic("ipintr no HDR");
214 #endif
215 		ipv4_input(m);
216 	}
217 }
218 
/*
 * IPv4 input routine.
 *
 * Validate the header (version, header length, loopback addresses,
 * checksum, total length), run the packet through CARP and pf when they
 * are compiled in, process options, then either deliver it locally via
 * ip_ours() or pass it to ip_forward().  Header fields are left in
 * network byte order and converted at the point of use.  Packets that
 * fail a check are counted in ipstat and freed at the `bad' label.
 */
void
ipv4_input(struct mbuf *m)
{
	struct ifnet *ifp;
	struct ip *ip;
	int hlen, len;
	in_addr_t pfrdr = 0;	/* saved dst first, then "pf changed dst" flag */
#ifdef IPSEC
	int error;
	struct tdb *tdb;
	struct tdb_ident *tdbi;
	struct m_tag *mtag;
#endif /* IPSEC */

	ifp = m->m_pkthdr.rcvif;

	ipstat.ips_total++;
	/* Make the fixed part of the header contiguous in the first mbuf. */
	if (m->m_len < sizeof (struct ip) &&
	    (m = m_pullup(m, sizeof (struct ip))) == NULL) {
		ipstat.ips_toosmall++;
		return;
	}
	ip = mtod(m, struct ip *);
	if (ip->ip_v != IPVERSION) {
		ipstat.ips_badvers++;
		goto bad;
	}
	hlen = ip->ip_hl << 2;
	if (hlen < sizeof(struct ip)) {	/* minimum header length */
		ipstat.ips_badhlen++;
		goto bad;
	}
	/* Header with options must be contiguous as well. */
	if (hlen > m->m_len) {
		if ((m = m_pullup(m, hlen)) == NULL) {
			ipstat.ips_badhlen++;
			return;
		}
		ip = mtod(m, struct ip *);
	}

	/* 127/8 must not appear on wire - RFC1122 */
	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
			ipstat.ips_badaddr++;
			goto bad;
		}
	}

	/*
	 * Verify the header checksum in software unless the packet is
	 * already flagged as checked (good or bad).
	 */
	if ((m->m_pkthdr.csum_flags & M_IPV4_CSUM_IN_OK) == 0) {
		if (m->m_pkthdr.csum_flags & M_IPV4_CSUM_IN_BAD) {
			ipstat.ips_badsum++;
			goto bad;
		}

		ipstat.ips_inswcsum++;
		if (in_cksum(m, hlen) != 0) {
			ipstat.ips_badsum++;
			goto bad;
		}
	}

	/* Retrieve the packet length. */
	len = ntohs(ip->ip_len);

	/*
	 * The total length must cover at least the header.
	 */
	if (len < hlen) {
		ipstat.ips_badlen++;
		goto bad;
	}

	/*
	 * Check that the amount of data in the buffers
	 * is at least as much as the IP header would have us expect.
	 * Trim mbufs if longer than we expect.
	 * Drop packet if shorter than we expect.
	 */
	if (m->m_pkthdr.len < len) {
		ipstat.ips_tooshort++;
		goto bad;
	}
	if (m->m_pkthdr.len > len) {
		if (m->m_len == m->m_pkthdr.len) {
			/* Single mbuf: just shorten it in place. */
			m->m_len = len;
			m->m_pkthdr.len = len;
		} else
			/* Negative count: trim from the tail of the chain. */
			m_adj(m, len - m->m_pkthdr.len);
	}

#if NCARP > 0
	/* Non-ICMP traffic on a carp interface; ICMP is checked later. */
	if (ifp->if_type == IFT_CARP && ip->ip_p != IPPROTO_ICMP &&
	    carp_lsdrop(m, AF_INET, &ip->ip_src.s_addr, &ip->ip_dst.s_addr))
		goto bad;
#endif

#if NPF > 0
	/*
	 * Packet filter
	 */
	pfrdr = ip->ip_dst.s_addr;
	if (pf_test(AF_INET, PF_IN, ifp, &m, NULL) != PF_PASS)
		goto bad;
	if (m == NULL)
		return;

	/* pf may have replaced the mbuf or rewritten the header. */
	ip = mtod(m, struct ip *);
	hlen = ip->ip_hl << 2;
	/* From here on pfrdr is a boolean: did pf change the destination? */
	pfrdr = (pfrdr != ip->ip_dst.s_addr);
#endif

	/*
	 * Process options and, if not destined for us,
	 * ship it on.  ip_dooptions returns 1 when an
	 * error was detected (causing an icmp message
	 * to be sent and the original packet to be freed).
	 */
	if (hlen > sizeof (struct ip) && ip_dooptions(m, ifp)) {
	        return;
	}

	if (in_ouraddr(m, ifp, ip->ip_dst)) {
		ip_ours(m);
		return;
	}

	if (IN_MULTICAST(ip->ip_dst.s_addr)) {
		struct in_multi *inm;
#ifdef MROUTING
		if (ipmforwarding && ip_mrouter) {
			if (m->m_flags & M_EXT) {
				if ((m = m_pullup(m, hlen)) == NULL) {
					ipstat.ips_toosmall++;
					return;
				}
				ip = mtod(m, struct ip *);
			}
			/*
			 * If we are acting as a multicast router, all
			 * incoming multicast packets are passed to the
			 * kernel-level multicast forwarding function.
			 * The packet is returned (relatively) intact; if
			 * ip_mforward() returns a non-zero value, the packet
			 * must be discarded, else it may be accepted below.
			 *
			 * (The IP ident field is put in the same byte order
			 * as expected when ip_mforward() is called from
			 * ip_output().)
			 */
			if (ip_mforward(m, ifp) != 0) {
				ipstat.ips_cantforward++;
				goto bad;
			}

			/*
			 * The process-level routing daemon needs to receive
			 * all multicast IGMP packets, whether or not this
			 * host belongs to their destination groups.
			 */
			if (ip->ip_p == IPPROTO_IGMP) {
				ip_ours(m);
				return;
			}
			ipstat.ips_forward++;
		}
#endif
		/*
		 * See if we belong to the destination multicast group on the
		 * arrival interface.
		 */
		IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm);
		if (inm == NULL) {
			ipstat.ips_notmember++;
			if (!IN_LOCAL_GROUP(ip->ip_dst.s_addr))
				ipstat.ips_cantforward++;
			goto bad;
		}
		ip_ours(m);
		return;
	}

	/* Limited broadcast and the all-zeroes address are always local. */
	if (ip->ip_dst.s_addr == INADDR_BROADCAST ||
	    ip->ip_dst.s_addr == INADDR_ANY) {
		ip_ours(m);
		return;
	}

#if NCARP > 0
	/* ICMP on a carp interface that was not accepted as local above. */
	if (ifp->if_type == IFT_CARP && ip->ip_p == IPPROTO_ICMP &&
	    carp_lsdrop(m, AF_INET, &ip->ip_src.s_addr, &ip->ip_dst.s_addr))
		goto bad;
#endif
	/*
	 * Not for us; forward if possible and desirable.
	 */
	if (ipforwarding == 0) {
		ipstat.ips_cantforward++;
		goto bad;
	}
#ifdef IPSEC
	if (ipsec_in_use) {
	        /*
		 * IPsec policy check for forwarded packets. Look at
		 * inner-most IPsec SA used.
		 */
		mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
		if (mtag != NULL) {
			/* The tdb_ident payload follows the tag header. */
			tdbi = (struct tdb_ident *)(mtag + 1);
			tdb = gettdb(tdbi->rdomain, tdbi->spi,
			    &tdbi->dst, tdbi->proto);
		} else
			tdb = NULL;
	        ipsp_spd_lookup(m, AF_INET, hlen, &error,
		    IPSP_DIRECTION_IN, tdb, NULL, 0);

		/* Error or otherwise drop-packet indication */
		if (error) {
			ipstat.ips_cantforward++;
			goto bad;
		}

		/*
		 * Fall through, forward packet. Outbound IPsec policy
		 * checking will occur in ip_output().
		 */
	}
#endif /* IPSEC */

	ip_forward(m, ifp, pfrdr);
	return;
bad:
	m_freem(m);
}
458 
/*
 * IPv4 local-delivery routine.
 *
 * If the packet is a fragment, queue it for reassembly and continue only
 * once ip_reass() returns a complete datagram.  Then, when IPsec is in
 * use, apply the inbound policy check for protocols that are not checked
 * elsewhere, and finally dispatch to the protocol's pr_input routine
 * through ip_protox[].  Packets rejected here are freed at `bad'.
 */
void
ip_ours(struct mbuf *m)
{
	struct ip *ip = mtod(m, struct ip *);
	struct ipq *fp;
	struct ipqent *ipqe;
	int mff, hlen;
#ifdef IPSEC
	int error;
	struct tdb *tdb;
	struct tdb_ident *tdbi;
	struct m_tag *mtag;
#endif /* IPSEC */

	hlen = ip->ip_hl << 2;

	/* pf might have modified stuff, might have to chksum */
	in_proto_cksum_out(m, NULL);

	/*
	 * If offset or IP_MF are set, must reassemble.
	 * Otherwise, nothing need be done.
	 * (We could look in the reassembly queue to see
	 * if the packet was previously fragmented,
	 * but it's not worth the time; just let them time out.)
	 */
	if (ip->ip_off &~ htons(IP_DF | IP_RF)) {
		if (m->m_flags & M_EXT) {		/* XXX */
			if ((m = m_pullup(m, hlen)) == NULL) {
				ipstat.ips_toosmall++;
				return;
			}
			ip = mtod(m, struct ip *);
		}

		/*
		 * Look for queue of fragments
		 * of this datagram.
		 */
		LIST_FOREACH(fp, &ipq, ipq_q)
			if (ip->ip_id == fp->ipq_id &&
			    ip->ip_src.s_addr == fp->ipq_src.s_addr &&
			    ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
			    ip->ip_p == fp->ipq_p)
				goto found;
		/* No matching queue: ip_reass() will create one. */
		fp = 0;
found:

		/*
		 * Adjust ip_len to not reflect header,
		 * set ipqe_mff if more fragments are expected,
		 * convert offset of this to bytes.
		 */
		ip->ip_len = htons(ntohs(ip->ip_len) - hlen);
		mff = (ip->ip_off & htons(IP_MF)) != 0;
		if (mff) {
			/*
			 * Make sure that fragments have a data length
			 * that's a non-zero multiple of 8 bytes.
			 */
			if (ntohs(ip->ip_len) == 0 ||
			    (ntohs(ip->ip_len) & 0x7) != 0) {
				ipstat.ips_badfrags++;
				goto bad;
			}
		}
		/* Offset is carried in 8-byte units; store it as bytes. */
		ip->ip_off = htons(ntohs(ip->ip_off) << 3);

		/*
		 * If datagram marked as having more fragments
		 * or if this is not the first fragment,
		 * attempt reassembly; if it succeeds, proceed.
		 */
		if (mff || ip->ip_off) {
			ipstat.ips_fragments++;
			/* Enforce the global cap on queued fragments. */
			if (ip_frags + 1 > ip_maxqueue) {
				ip_flush();
				ipstat.ips_rcvmemdrop++;
				goto bad;
			}

			ipqe = pool_get(&ipqent_pool, PR_NOWAIT);
			if (ipqe == NULL) {
				ipstat.ips_rcvmemdrop++;
				goto bad;
			}
			/*
			 * Counted here; ip_reass() decrements again when it
			 * drops the fragment or completes the datagram.
			 */
			ip_frags++;
			ipqe->ipqe_mff = mff;
			ipqe->ipqe_m = m;
			ipqe->ipqe_ip = ip;
			m = ip_reass(ipqe, fp);
			if (m == 0) {
				/* Queued or dropped; nothing to deliver yet. */
				return;
			}
			ipstat.ips_reassembled++;
			ip = mtod(m, struct ip *);
			hlen = ip->ip_hl << 2;
			/* ip_reass() returns ip_len without the header. */
			ip->ip_len = htons(ntohs(ip->ip_len) + hlen);
		} else
			if (fp)
				ip_freef(fp);
	}

#ifdef IPSEC
	if (!ipsec_in_use)
		goto skipipsec;

        /*
         * If it's a protected packet for us, skip the policy check.
         * That's because we really only care about the properties of
         * the protected packet, and not the intermediate versions.
         * While this is not the most paranoid setting, it allows
         * some flexibility in handling nested tunnels (in setting up
	 * the policies).
         */
        if ((ip->ip_p == IPPROTO_ESP) || (ip->ip_p == IPPROTO_AH) ||
	    (ip->ip_p == IPPROTO_IPCOMP))
        	goto skipipsec;

	/*
	 * If the protected packet was tunneled, then we need to
	 * verify the protected packet's information, not the
	 * external headers. Thus, skip the policy lookup for the
	 * external packet, and keep the IPsec information linked on
	 * the packet header (the encapsulation routines know how
	 * to deal with that).
	 */
	if ((ip->ip_p == IPPROTO_IPIP) || (ip->ip_p == IPPROTO_IPV6))
		goto skipipsec;

	/*
	 * If the protected packet is TCP or UDP, we'll do the
	 * policy check in the respective input routine, so we can
	 * check for bypass sockets.
	 */
	if ((ip->ip_p == IPPROTO_TCP) || (ip->ip_p == IPPROTO_UDP))
		goto skipipsec;

	/*
	 * IPsec policy check for local-delivery packets. Look at the
	 * inner-most SA that protected the packet. This is in fact
	 * a bit too restrictive (it could end up causing packets to
	 * be dropped that semantically follow the policy, e.g., in
	 * certain SA-bundle configurations); but the alternative is
	 * very complicated (and requires keeping track of what
	 * kinds of tunneling headers have been seen in-between the
	 * IPsec headers), and I don't think we lose much functionality
	 * that's needed in the real world (who uses bundles anyway ?).
	 */
	mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
	if (mtag) {
		/* The tdb_ident payload follows the tag header. */
		tdbi = (struct tdb_ident *)(mtag + 1);
	        tdb = gettdb(tdbi->rdomain, tdbi->spi, &tdbi->dst,
		    tdbi->proto);
	} else
		tdb = NULL;
	ipsp_spd_lookup(m, AF_INET, hlen, &error, IPSP_DIRECTION_IN,
	    tdb, NULL, 0);

	/* Error or otherwise drop-packet indication. */
	if (error) {
		/* NOTE(review): counted as cantforward although local. */
	        ipstat.ips_cantforward++;
	        goto bad;
	}

 skipipsec:
	/* Otherwise, just fall through and deliver the packet */
#endif /* IPSEC */

	/*
	 * Switch out to protocol's input routine.
	 */
	ipstat.ips_delivered++;
	(*inetsw[ip_protox[ip->ip_p]].pr_input)(m, hlen, NULL, 0);
	return;
bad:
	m_freem(m);
}
642 
643 int
644 in_ouraddr(struct mbuf *m, struct ifnet *ifp, struct in_addr ina)
645 {
646 	struct in_ifaddr	*ia;
647 	struct sockaddr_in	 sin;
648 #if NPF > 0
649 	struct pf_state_key	*key;
650 
651 	if (m->m_pkthdr.pf.flags & PF_TAG_DIVERTED)
652 		return (1);
653 
654 	key = m->m_pkthdr.pf.statekey;
655 	if (key != NULL) {
656 		if (key->inp != NULL)
657 			return (1);
658 
659 		/* If we have linked state keys it is certainly forwarded. */
660 		if (key->reverse != NULL)
661 			return (0);
662 	}
663 #endif
664 
665 	memset(&sin, 0, sizeof(sin));
666 	sin.sin_len = sizeof(sin);
667 	sin.sin_family = AF_INET;
668 	sin.sin_addr = ina;
669 	ia = ifatoia(ifa_ifwithaddr(sintosa(&sin), m->m_pkthdr.ph_rtableid));
670 
671 	if (ia == NULL) {
672 		struct ifaddr *ifa;
673 
674 		/*
675 		 * No local address or broadcast address found, so check for
676 		 * ancient classful broadcast addresses.
677 		 * It must have been broadcast on the link layer, and for an
678 		 * address on the interface it was received on.
679 		 */
680 		if (!ISSET(m->m_flags, M_BCAST) ||
681 		    !IN_CLASSFULBROADCAST(ina.s_addr, ina.s_addr))
682 			return (0);
683 
684 		if (ifp->if_rdomain != rtable_l2(m->m_pkthdr.ph_rtableid))
685 			return (0);
686 		/*
687 		 * The check in the loop assumes you only rx a packet on an UP
688 		 * interface, and that M_BCAST will only be set on a BROADCAST
689 		 * interface.
690 		 */
691 		TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) {
692 			if (ifa->ifa_addr->sa_family != AF_INET)
693 				continue;
694 
695 			if (IN_CLASSFULBROADCAST(ina.s_addr,
696 			    ifatoia(ifa)->ia_addr.sin_addr.s_addr))
697 				return (1);
698 		}
699 
700 		return (0);
701 	}
702 
703 	if (ina.s_addr != ia->ia_addr.sin_addr.s_addr) {
704 		/*
705 		 * This matches a broadcast address on one of our interfaces.
706 		 * If directedbcast is enabled we only consider it local if it
707 		 * is received on the interface with that address.
708 		 */
709 		if (ip_directedbcast && ia->ia_ifp != ifp)
710 			return (0);
711 
712 		/* Make sure M_BCAST is set */
713 		if (m)
714 			m->m_flags |= M_BCAST;
715 	}
716 
717 	return (ISSET(ia->ia_ifp->if_flags, IFF_UP));
718 }
719 
720 struct in_ifaddr *
721 in_iawithaddr(struct in_addr ina, u_int rtableid)
722 {
723 	struct in_ifaddr	*ia;
724 	struct sockaddr_in	 sin;
725 
726 	memset(&sin, 0, sizeof(sin));
727 	sin.sin_len = sizeof(sin);
728 	sin.sin_family = AF_INET;
729 	sin.sin_addr = ina;
730 	ia = ifatoia(ifa_ifwithaddr(sintosa(&sin), rtableid));
731 	if (ia == NULL || ina.s_addr == ia->ia_addr.sin_addr.s_addr)
732 		return (ia);
733 
734 	return (NULL);
735 }
736 
/*
 * Take incoming datagram fragment and try to
 * reassemble it into whole datagram.  If a chain for
 * reassembly of this datagram already exists, then it
 * is given as fp; otherwise have to make a chain.
 *
 * ipqe: entry for the new fragment; the caller has filled in ipqe_m,
 *       ipqe_ip and ipqe_mff, stored ip_off as a byte offset and
 *       removed the header length from ip_len.
 * fp:   existing reassembly queue for this datagram, or NULL.
 *
 * Returns the reassembled packet when this fragment completes the
 * datagram, or 0 when the fragment was queued or dropped.  The caller
 * has already counted the fragment in ip_frags; every path that releases
 * an ipqent decrements it again.
 */
struct mbuf *
ip_reass(struct ipqent *ipqe, struct ipq *fp)
{
	struct mbuf *m = ipqe->ipqe_m;
	struct ipqent *nq, *p, *q;
	struct ip *ip;
	struct mbuf *t;
	int hlen = ipqe->ipqe_ip->ip_hl << 2;
	int i, next;
	u_int8_t ecn, ecn0;

	/*
	 * Presence of header sizes in mbufs
	 * would confuse code below.
	 */
	m->m_data += hlen;
	m->m_len -= hlen;

	/*
	 * If first fragment to arrive, create a reassembly queue.
	 */
	if (fp == NULL) {
		fp = pool_get(&ipq_pool, PR_NOWAIT);
		if (fp == NULL)
			goto dropfrag;
		LIST_INSERT_HEAD(&ipq, fp, ipq_q);
		fp->ipq_ttl = IPFRAGTTL;
		fp->ipq_p = ipqe->ipqe_ip->ip_p;
		fp->ipq_id = ipqe->ipqe_ip->ip_id;
		LIST_INIT(&fp->ipq_fragq);
		fp->ipq_src = ipqe->ipqe_ip->ip_src;
		fp->ipq_dst = ipqe->ipqe_ip->ip_dst;
		/* Empty queue: insert at the head, no overlap to resolve. */
		p = NULL;
		goto insert;
	}

	/*
	 * Handle ECN by comparing this segment with the first one;
	 * if CE is set, do not lose CE.
	 * drop if CE and not-ECT are mixed for the same packet.
	 */
	ecn = ipqe->ipqe_ip->ip_tos & IPTOS_ECN_MASK;
	ecn0 = LIST_FIRST(&fp->ipq_fragq)->ipqe_ip->ip_tos & IPTOS_ECN_MASK;
	if (ecn == IPTOS_ECN_CE) {
		if (ecn0 == IPTOS_ECN_NOTECT)
			goto dropfrag;
		if (ecn0 != IPTOS_ECN_CE)
			LIST_FIRST(&fp->ipq_fragq)->ipqe_ip->ip_tos |= IPTOS_ECN_CE;
	}
	if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT)
		goto dropfrag;

	/*
	 * Find a segment which begins after this one does.
	 * On exit p is the last segment starting at or before us (or NULL)
	 * and q the first one starting after us (or NULL).
	 */
	for (p = NULL, q = LIST_FIRST(&fp->ipq_fragq); q != NULL;
	    p = q, q = LIST_NEXT(q, ipqe_q))
		if (ntohs(q->ipqe_ip->ip_off) > ntohs(ipqe->ipqe_ip->ip_off))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (p != NULL) {
		/* i = number of our leading bytes already covered by p */
		i = ntohs(p->ipqe_ip->ip_off) + ntohs(p->ipqe_ip->ip_len) -
		    ntohs(ipqe->ipqe_ip->ip_off);
		if (i > 0) {
			if (i >= ntohs(ipqe->ipqe_ip->ip_len))
				goto dropfrag;
			m_adj(ipqe->ipqe_m, i);
			ipqe->ipqe_ip->ip_off =
			    htons(ntohs(ipqe->ipqe_ip->ip_off) + i);
			ipqe->ipqe_ip->ip_len =
			    htons(ntohs(ipqe->ipqe_ip->ip_len) - i);
		}
	}

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL &&
	    ntohs(ipqe->ipqe_ip->ip_off) + ntohs(ipqe->ipqe_ip->ip_len) >
	    ntohs(q->ipqe_ip->ip_off); q = nq) {
		/* i = number of q's leading bytes that we cover */
		i = (ntohs(ipqe->ipqe_ip->ip_off) +
		    ntohs(ipqe->ipqe_ip->ip_len)) - ntohs(q->ipqe_ip->ip_off);
		if (i < ntohs(q->ipqe_ip->ip_len)) {
			/* Partial overlap: trim q's front and stop. */
			q->ipqe_ip->ip_len =
			    htons(ntohs(q->ipqe_ip->ip_len) - i);
			q->ipqe_ip->ip_off =
			    htons(ntohs(q->ipqe_ip->ip_off) + i);
			m_adj(q->ipqe_m, i);
			break;
		}
		/* Fully covered: fetch the successor before releasing q. */
		nq = LIST_NEXT(q, ipqe_q);
		m_freem(q->ipqe_m);
		LIST_REMOVE(q, ipqe_q);
		pool_put(&ipqent_pool, q);
		ip_frags--;
	}

insert:
	/*
	 * Stick new segment in its place;
	 * check for complete reassembly.
	 */
	if (p == NULL) {
		LIST_INSERT_HEAD(&fp->ipq_fragq, ipqe, ipqe_q);
	} else {
		LIST_INSERT_AFTER(p, ipqe, ipqe_q);
	}
	/* Walk the queue; any gap means the datagram is still incomplete. */
	next = 0;
	for (p = NULL, q = LIST_FIRST(&fp->ipq_fragq); q != NULL;
	    p = q, q = LIST_NEXT(q, ipqe_q)) {
		if (ntohs(q->ipqe_ip->ip_off) != next)
			return (0);
		next += ntohs(q->ipqe_ip->ip_len);
	}
	/* p is now the last queued segment; more must follow if MF is set. */
	if (p->ipqe_mff)
		return (0);

	/*
	 * Reassembly is complete.  Check for a bogus message size and
	 * concatenate fragments.
	 */
	q = LIST_FIRST(&fp->ipq_fragq);
	ip = q->ipqe_ip;
	if ((next + (ip->ip_hl << 2)) > IP_MAXPACKET) {
		ipstat.ips_toolong++;
		ip_freef(fp);
		return (0);
	}
	m = q->ipqe_m;
	t = m->m_next;
	m->m_next = 0;
	m_cat(m, t);
	nq = LIST_NEXT(q, ipqe_q);
	pool_put(&ipqent_pool, q);
	ip_frags--;
	/* Append the remaining fragments' data, releasing each entry. */
	for (q = nq; q != NULL; q = nq) {
		t = q->ipqe_m;
		nq = LIST_NEXT(q, ipqe_q);
		pool_put(&ipqent_pool, q);
		ip_frags--;
		m_cat(m, t);
	}

	/*
	 * Create header for new ip packet by
	 * modifying header of first packet;
	 * dequeue and discard fragment reassembly header.
	 * Make header visible.
	 */
	ip->ip_len = htons(next);	/* payload only; caller adds hlen */
	ip->ip_src = fp->ipq_src;
	ip->ip_dst = fp->ipq_dst;
	LIST_REMOVE(fp, ipq_q);
	pool_put(&ipq_pool, fp);
	m->m_len += (ip->ip_hl << 2);
	m->m_data -= (ip->ip_hl << 2);
	/* some debugging cruft by sklower, below, will go away soon */
	if (m->m_flags & M_PKTHDR) { /* XXX this should be done elsewhere */
		/* Recompute the packet header length over the whole chain. */
		int plen = 0;
		for (t = m; t; t = t->m_next)
			plen += t->m_len;
		m->m_pkthdr.len = plen;
	}
	return (m);

dropfrag:
	/* Discard the incoming fragment and its queue entry. */
	ipstat.ips_fragdropped++;
	m_freem(m);
	pool_put(&ipqent_pool, ipqe);
	ip_frags--;
	return (0);
}
921 
922 /*
923  * Free a fragment reassembly header and all
924  * associated datagrams.
925  */
926 void
927 ip_freef(struct ipq *fp)
928 {
929 	struct ipqent *q, *p;
930 
931 	for (q = LIST_FIRST(&fp->ipq_fragq); q != NULL; q = p) {
932 		p = LIST_NEXT(q, ipqe_q);
933 		m_freem(q->ipqe_m);
934 		LIST_REMOVE(q, ipqe_q);
935 		pool_put(&ipqent_pool, q);
936 		ip_frags--;
937 	}
938 	LIST_REMOVE(fp, ipq_q);
939 	pool_put(&ipq_pool, fp);
940 }
941 
942 /*
943  * IP timer processing;
944  * if a timer expires on a reassembly queue, discard it.
945  * clear the forwarding cache, there might be a better route.
946  */
947 void
948 ip_slowtimo(void)
949 {
950 	struct ipq *fp, *nfp;
951 	int s = splsoftnet();
952 
953 	for (fp = LIST_FIRST(&ipq); fp != NULL; fp = nfp) {
954 		nfp = LIST_NEXT(fp, ipq_q);
955 		if (--fp->ipq_ttl == 0) {
956 			ipstat.ips_fragtimeout++;
957 			ip_freef(fp);
958 		}
959 	}
960 	if (ipforward_rt.ro_rt) {
961 		RTFREE(ipforward_rt.ro_rt);
962 		ipforward_rt.ro_rt = 0;
963 	}
964 	splx(s);
965 }
966 
967 /*
968  * Drain off all datagram fragments.
969  */
970 void
971 ip_drain(void)
972 {
973 	while (!LIST_EMPTY(&ipq)) {
974 		ipstat.ips_fragdropped++;
975 		ip_freef(LIST_FIRST(&ipq));
976 	}
977 }
978 
979 /*
980  * Flush a bunch of datagram fragments, till we are down to 75%.
981  */
982 void
983 ip_flush(void)
984 {
985 	int max = 50;
986 
987 	/* ipq already locked */
988 	while (!LIST_EMPTY(&ipq) && ip_frags > ip_maxqueue * 3 / 4 && --max) {
989 		ipstat.ips_fragdropped++;
990 		ip_freef(LIST_FIRST(&ipq));
991 	}
992 }
993 
994 /*
995  * Do option processing on a datagram,
996  * possibly discarding it if bad options are encountered,
997  * or forwarding it if source-routed.
998  * Returns 1 if packet has been forwarded/freed,
999  * 0 if the packet should be processed further.
1000  */
1001 int
1002 ip_dooptions(struct mbuf *m, struct ifnet *ifp)
1003 {
1004 	struct ip *ip = mtod(m, struct ip *);
1005 	struct sockaddr_in ipaddr;
1006 	u_char *cp;
1007 	struct ip_timestamp ipt;
1008 	struct in_ifaddr *ia;
1009 	int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0;
1010 	struct in_addr sin, dst;
1011 	u_int32_t ntime;
1012 
1013 	dst = ip->ip_dst;
1014 	cp = (u_char *)(ip + 1);
1015 	cnt = (ip->ip_hl << 2) - sizeof (struct ip);
1016 
1017 	for (; cnt > 0; cnt -= optlen, cp += optlen) {
1018 		opt = cp[IPOPT_OPTVAL];
1019 		if (opt == IPOPT_EOL)
1020 			break;
1021 		if (opt == IPOPT_NOP)
1022 			optlen = 1;
1023 		else {
1024 			if (cnt < IPOPT_OLEN + sizeof(*cp)) {
1025 				code = &cp[IPOPT_OLEN] - (u_char *)ip;
1026 				goto bad;
1027 			}
1028 			optlen = cp[IPOPT_OLEN];
1029 			if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) {
1030 				code = &cp[IPOPT_OLEN] - (u_char *)ip;
1031 				goto bad;
1032 			}
1033 		}
1034 
1035 		switch (opt) {
1036 
1037 		default:
1038 			break;
1039 
1040 		/*
1041 		 * Source routing with record.
1042 		 * Find interface with current destination address.
1043 		 * If none on this machine then drop if strictly routed,
1044 		 * or do nothing if loosely routed.
1045 		 * Record interface address and bring up next address
1046 		 * component.  If strictly routed make sure next
1047 		 * address is on directly accessible net.
1048 		 */
1049 		case IPOPT_LSRR:
1050 		case IPOPT_SSRR:
1051 			if (!ip_dosourceroute) {
1052 				type = ICMP_UNREACH;
1053 				code = ICMP_UNREACH_SRCFAIL;
1054 				goto bad;
1055 			}
1056 			if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
1057 				code = &cp[IPOPT_OFFSET] - (u_char *)ip;
1058 				goto bad;
1059 			}
1060 			memset(&ipaddr, 0, sizeof(ipaddr));
1061 			ipaddr.sin_family = AF_INET;
1062 			ipaddr.sin_len = sizeof(ipaddr);
1063 			ipaddr.sin_addr = ip->ip_dst;
1064 			ia = ifatoia(ifa_ifwithaddr(sintosa(&ipaddr),
1065 			    m->m_pkthdr.ph_rtableid));
1066 			if (ia == 0) {
1067 				if (opt == IPOPT_SSRR) {
1068 					type = ICMP_UNREACH;
1069 					code = ICMP_UNREACH_SRCFAIL;
1070 					goto bad;
1071 				}
1072 				/*
1073 				 * Loose routing, and not at next destination
1074 				 * yet; nothing to do except forward.
1075 				 */
1076 				break;
1077 			}
1078 			off--;			/* 0 origin */
1079 			if ((off + sizeof(struct in_addr)) > optlen) {
1080 				/*
1081 				 * End of source route.  Should be for us.
1082 				 */
1083 				save_rte(m, cp, ip->ip_src);
1084 				break;
1085 			}
1086 
1087 			/*
1088 			 * locate outgoing interface
1089 			 */
1090 			memset(&ipaddr, 0, sizeof(ipaddr));
1091 			ipaddr.sin_family = AF_INET;
1092 			ipaddr.sin_len = sizeof(ipaddr);
1093 			memcpy(&ipaddr.sin_addr, cp + off,
1094 			    sizeof(ipaddr.sin_addr));
1095 			if (opt == IPOPT_SSRR) {
1096 			    if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(&ipaddr),
1097 				m->m_pkthdr.ph_rtableid))) == NULL)
1098 				ia = ifatoia(ifa_ifwithnet(sintosa(&ipaddr),
1099 				    m->m_pkthdr.ph_rtableid));
1100 			} else
1101 				/* keep packet in the virtual instance */
1102 				ia = ip_rtaddr(ipaddr.sin_addr,
1103 				    m->m_pkthdr.ph_rtableid);
1104 			if (ia == 0) {
1105 				type = ICMP_UNREACH;
1106 				code = ICMP_UNREACH_SRCFAIL;
1107 				goto bad;
1108 			}
1109 			ip->ip_dst = ipaddr.sin_addr;
1110 			memcpy(cp + off, &ia->ia_addr.sin_addr,
1111 			    sizeof(struct in_addr));
1112 			cp[IPOPT_OFFSET] += sizeof(struct in_addr);
1113 			/*
1114 			 * Let ip_intr's mcast routing check handle mcast pkts
1115 			 */
1116 			forward = !IN_MULTICAST(ip->ip_dst.s_addr);
1117 			break;
1118 
1119 		case IPOPT_RR:
1120 			if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
1121 				code = &cp[IPOPT_OLEN] - (u_char *)ip;
1122 				goto bad;
1123 			}
1124 			if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
1125 				code = &cp[IPOPT_OFFSET] - (u_char *)ip;
1126 				goto bad;
1127 			}
1128 
1129 			/*
1130 			 * If no space remains, ignore.
1131 			 */
1132 			off--;			/* 0 origin */
1133 			if ((off + sizeof(struct in_addr)) > optlen)
1134 				break;
1135 			memset(&ipaddr, 0, sizeof(ipaddr));
1136 			ipaddr.sin_family = AF_INET;
1137 			ipaddr.sin_len = sizeof(ipaddr);
1138 			ipaddr.sin_addr = ip->ip_dst;
1139 			/*
1140 			 * locate outgoing interface; if we're the destination,
1141 			 * use the incoming interface (should be same).
1142 			 * Again keep the packet inside the virtual instance.
1143 			 */
1144 			if ((ia = ifatoia(ifa_ifwithaddr(sintosa(&ipaddr),
1145 			    m->m_pkthdr.ph_rtableid))) == 0 &&
1146 			    (ia = ip_rtaddr(ipaddr.sin_addr,
1147 			    m->m_pkthdr.ph_rtableid)) == 0) {
1148 				type = ICMP_UNREACH;
1149 				code = ICMP_UNREACH_HOST;
1150 				goto bad;
1151 			}
1152 			memcpy(cp + off, &ia->ia_addr.sin_addr,
1153 			    sizeof(struct in_addr));
1154 			cp[IPOPT_OFFSET] += sizeof(struct in_addr);
1155 			break;
1156 
1157 		case IPOPT_TS:
1158 			code = cp - (u_char *)ip;
1159 			if (optlen < sizeof(struct ip_timestamp))
1160 				goto bad;
1161 			memcpy(&ipt, cp, sizeof(struct ip_timestamp));
1162 			if (ipt.ipt_ptr < 5 || ipt.ipt_len < 5)
1163 				goto bad;
1164 			if (ipt.ipt_ptr - 1 + sizeof(u_int32_t) > ipt.ipt_len) {
1165 				if (++ipt.ipt_oflw == 0)
1166 					goto bad;
1167 				break;
1168 			}
1169 			memcpy(&sin, cp + ipt.ipt_ptr - 1, sizeof sin);
1170 			switch (ipt.ipt_flg) {
1171 
1172 			case IPOPT_TS_TSONLY:
1173 				break;
1174 
1175 			case IPOPT_TS_TSANDADDR:
1176 				if (ipt.ipt_ptr - 1 + sizeof(u_int32_t) +
1177 				    sizeof(struct in_addr) > ipt.ipt_len)
1178 					goto bad;
1179 				memset(&ipaddr, 0, sizeof(ipaddr));
1180 				ipaddr.sin_family = AF_INET;
1181 				ipaddr.sin_len = sizeof(ipaddr);
1182 				ipaddr.sin_addr = dst;
1183 				ia = ifatoia(ifaof_ifpforaddr(sintosa(&ipaddr),
1184 				    ifp));
1185 				if (ia == 0)
1186 					continue;
1187 				memcpy(&sin, &ia->ia_addr.sin_addr,
1188 				    sizeof(struct in_addr));
1189 				ipt.ipt_ptr += sizeof(struct in_addr);
1190 				break;
1191 
1192 			case IPOPT_TS_PRESPEC:
1193 				if (ipt.ipt_ptr - 1 + sizeof(u_int32_t) +
1194 				    sizeof(struct in_addr) > ipt.ipt_len)
1195 					goto bad;
1196 				memset(&ipaddr, 0, sizeof(ipaddr));
1197 				ipaddr.sin_family = AF_INET;
1198 				ipaddr.sin_len = sizeof(ipaddr);
1199 				ipaddr.sin_addr = sin;
1200 				if (ifa_ifwithaddr(sintosa(&ipaddr),
1201 				    m->m_pkthdr.ph_rtableid) == 0)
1202 					continue;
1203 				ipt.ipt_ptr += sizeof(struct in_addr);
1204 				break;
1205 
1206 			default:
1207 				/* XXX can't take &ipt->ipt_flg */
1208 				code = (u_char *)&ipt.ipt_ptr -
1209 				    (u_char *)ip + 1;
1210 				goto bad;
1211 			}
1212 			ntime = iptime();
1213 			memcpy(cp + ipt.ipt_ptr - 1, &ntime, sizeof(u_int32_t));
1214 			ipt.ipt_ptr += sizeof(u_int32_t);
1215 		}
1216 	}
1217 	if (forward && ipforwarding) {
1218 		ip_forward(m, ifp, 1);
1219 		return (1);
1220 	}
1221 	return (0);
1222 bad:
1223 	icmp_error(m, type, code, 0, 0);
1224 	ipstat.ips_badoptions++;
1225 	return (1);
1226 }
1227 
1228 /*
1229  * Given address of next destination (final or next hop),
1230  * return internet address info of interface to be used to get there.
1231  */
1232 struct in_ifaddr *
1233 ip_rtaddr(struct in_addr dst, u_int rtableid)
1234 {
1235 	struct sockaddr_in *sin;
1236 
1237 	sin = satosin(&ipforward_rt.ro_dst);
1238 
1239 	if (ipforward_rt.ro_rt == 0 || dst.s_addr != sin->sin_addr.s_addr) {
1240 		if (ipforward_rt.ro_rt) {
1241 			RTFREE(ipforward_rt.ro_rt);
1242 			ipforward_rt.ro_rt = 0;
1243 		}
1244 		sin->sin_family = AF_INET;
1245 		sin->sin_len = sizeof(*sin);
1246 		sin->sin_addr = dst;
1247 
1248 		ipforward_rt.ro_rt = rtalloc1(&ipforward_rt.ro_dst, RT_REPORT,
1249 		    rtableid);
1250 	}
1251 	if (ipforward_rt.ro_rt == 0)
1252 		return (NULL);
1253 	return (ifatoia(ipforward_rt.ro_rt->rt_ifa));
1254 }
1255 
1256 /*
1257  * Save incoming source route for use in replies,
1258  * to be picked up later by ip_srcroute if the receiver is interested.
1259  */
1260 void
1261 save_rte(struct mbuf *m, u_char *option, struct in_addr dst)
1262 {
1263 	struct ip_srcrt *isr;
1264 	struct m_tag *mtag;
1265 	unsigned olen;
1266 
1267 	olen = option[IPOPT_OLEN];
1268 	if (olen > sizeof(isr->isr_hdr) + sizeof(isr->isr_routes))
1269 		return;
1270 
1271 	mtag = m_tag_get(PACKET_TAG_SRCROUTE, sizeof(*isr), M_NOWAIT);
1272 	if (mtag == NULL)
1273 		return;
1274 	isr = (struct ip_srcrt *)(mtag + 1);
1275 
1276 	memcpy(isr->isr_hdr, option, olen);
1277 	isr->isr_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr);
1278 	isr->isr_dst = dst;
1279 	m_tag_prepend(m, mtag);
1280 }
1281 
1282 /*
1283  * Retrieve incoming source route for use in replies,
1284  * in the same form used by setsockopt.
1285  * The first hop is placed before the options, will be removed later.
1286  */
1287 struct mbuf *
1288 ip_srcroute(struct mbuf *m0)
1289 {
1290 	struct in_addr *p, *q;
1291 	struct mbuf *m;
1292 	struct ip_srcrt *isr;
1293 	struct m_tag *mtag;
1294 
1295 	if (!ip_dosourceroute)
1296 		return (NULL);
1297 
1298 	mtag = m_tag_find(m0, PACKET_TAG_SRCROUTE, NULL);
1299 	if (mtag == NULL)
1300 		return (NULL);
1301 	isr = (struct ip_srcrt *)(mtag + 1);
1302 
1303 	if (isr->isr_nhops == 0)
1304 		return (NULL);
1305 	m = m_get(M_DONTWAIT, MT_SOOPTS);
1306 	if (m == NULL)
1307 		return (NULL);
1308 
1309 #define OPTSIZ	(sizeof(isr->isr_nop) + sizeof(isr->isr_hdr))
1310 
1311 	/* length is (nhops+1)*sizeof(addr) + sizeof(nop + header) */
1312 	m->m_len = (isr->isr_nhops + 1) * sizeof(struct in_addr) + OPTSIZ;
1313 
1314 	/*
1315 	 * First save first hop for return route
1316 	 */
1317 	p = &(isr->isr_routes[isr->isr_nhops - 1]);
1318 	*(mtod(m, struct in_addr *)) = *p--;
1319 
1320 	/*
1321 	 * Copy option fields and padding (nop) to mbuf.
1322 	 */
1323 	isr->isr_nop = IPOPT_NOP;
1324 	isr->isr_hdr[IPOPT_OFFSET] = IPOPT_MINOFF;
1325 	memcpy(mtod(m, caddr_t) + sizeof(struct in_addr), &isr->isr_nop,
1326 	    OPTSIZ);
1327 	q = (struct in_addr *)(mtod(m, caddr_t) +
1328 	    sizeof(struct in_addr) + OPTSIZ);
1329 #undef OPTSIZ
1330 	/*
1331 	 * Record return path as an IP source route,
1332 	 * reversing the path (pointers are now aligned).
1333 	 */
1334 	while (p >= isr->isr_routes) {
1335 		*q++ = *p--;
1336 	}
1337 	/*
1338 	 * Last hop goes to final destination.
1339 	 */
1340 	*q = isr->isr_dst;
1341 	m_tag_delete(m0, (struct m_tag *)isr);
1342 	return (m);
1343 }
1344 
1345 /*
1346  * Strip out IP options, at higher level protocol in the kernel.
1347  */
1348 void
1349 ip_stripoptions(struct mbuf *m)
1350 {
1351 	int i;
1352 	struct ip *ip = mtod(m, struct ip *);
1353 	caddr_t opts;
1354 	int olen;
1355 
1356 	olen = (ip->ip_hl<<2) - sizeof (struct ip);
1357 	opts = (caddr_t)(ip + 1);
1358 	i = m->m_len - (sizeof (struct ip) + olen);
1359 	memmove(opts, opts  + olen, i);
1360 	m->m_len -= olen;
1361 	if (m->m_flags & M_PKTHDR)
1362 		m->m_pkthdr.len -= olen;
1363 	ip->ip_hl = sizeof(struct ip) >> 2;
1364 	ip->ip_len = htons(ntohs(ip->ip_len) - olen);
1365 }
1366 
/*
 * Table of PRC_NCMDS errno values, one per index; the per-row comments
 * give the indices covered by that row.  A value of 0 is stored where no
 * errno is listed.
 * NOTE(review): the index is presumably a PRC_* control command code as
 * handed to the protocols' control-input routines -- confirm against the
 * users of this table, which are outside this block.
 */
int inetctlerrmap[PRC_NCMDS] = {
	/*  0 -  3 */
	0,		0,		0,		0,
	/*  4 -  7 */
	0,		EMSGSIZE,	EHOSTDOWN,	EHOSTUNREACH,
	/*  8 - 11 */
	EHOSTUNREACH,	EHOSTUNREACH,	ECONNREFUSED,	ECONNREFUSED,
	/* 12 - 15 */
	EMSGSIZE,	EHOSTUNREACH,	0,		0,
	/* 16 - 19 */
	0,		0,		0,		0,
	/* 20 */
	ENOPROTOOPT
};
1375 
1376 /*
1377  * Forward a packet.  If some error occurs return the sender
1378  * an icmp packet.  Note we can't always generate a meaningful
1379  * icmp message because icmp doesn't have a large enough repertoire
1380  * of codes and types.
1381  *
1382  * If not forwarding, just drop the packet.  This could be confusing
1383  * if ipforwarding was zero but some routing protocol was advancing
1384  * us as a gateway to somewhere.  However, we must let the routing
1385  * protocol deal with that.
1386  *
1387  * The srcrt parameter indicates whether the packet is being forwarded
1388  * via a source route.
1389  */
1390 void
1391 ip_forward(struct mbuf *m, struct ifnet *ifp, int srcrt)
1392 {
1393 	struct mbuf mfake, *mcopy = NULL;
1394 	struct ip *ip = mtod(m, struct ip *);
1395 	struct sockaddr_in *sin;
1396 	struct rtentry *rt;
1397 	int error, type = 0, code = 0, destmtu = 0, fake = 0, len;
1398 	u_int rtableid = 0;
1399 	u_int32_t dest;
1400 
1401 	dest = 0;
1402 	if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) {
1403 		ipstat.ips_cantforward++;
1404 		m_freem(m);
1405 		return;
1406 	}
1407 	if (ip->ip_ttl <= IPTTLDEC) {
1408 		icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, dest, 0);
1409 		return;
1410 	}
1411 
1412 	rtableid = m->m_pkthdr.ph_rtableid;
1413 
1414 	sin = satosin(&ipforward_rt.ro_dst);
1415 	if ((rt = ipforward_rt.ro_rt) == 0 ||
1416 	    ip->ip_dst.s_addr != sin->sin_addr.s_addr ||
1417 	    rtableid != ipforward_rt.ro_tableid) {
1418 		if (ipforward_rt.ro_rt) {
1419 			RTFREE(ipforward_rt.ro_rt);
1420 			ipforward_rt.ro_rt = 0;
1421 		}
1422 		sin->sin_family = AF_INET;
1423 		sin->sin_len = sizeof(*sin);
1424 		sin->sin_addr = ip->ip_dst;
1425 		ipforward_rt.ro_tableid = rtableid;
1426 
1427 		rtalloc_mpath(&ipforward_rt, &ip->ip_src.s_addr);
1428 		if (ipforward_rt.ro_rt == 0) {
1429 			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest, 0);
1430 			return;
1431 		}
1432 		rt = ipforward_rt.ro_rt;
1433 	}
1434 
1435 	/*
1436 	 * Save at most 68 bytes of the packet in case
1437 	 * we need to generate an ICMP message to the src.
1438 	 * The data is saved in the mbuf on the stack that
1439 	 * acts as a temporary storage not intended to be
1440 	 * passed down the IP stack or to the mfree.
1441 	 */
1442 	memset(&mfake.m_hdr, 0, sizeof(mfake.m_hdr));
1443 	mfake.m_type = m->m_type;
1444 	if (m_dup_pkthdr(&mfake, m, M_DONTWAIT) == 0) {
1445 		mfake.m_data = mfake.m_pktdat;
1446 		len = min(ntohs(ip->ip_len), 68);
1447 		m_copydata(m, 0, len, mfake.m_pktdat);
1448 		mfake.m_pkthdr.len = mfake.m_len = len;
1449 		fake = 1;
1450 	}
1451 
1452 	ip->ip_ttl -= IPTTLDEC;
1453 
1454 	/*
1455 	 * If forwarding packet using same interface that it came in on,
1456 	 * perhaps should send a redirect to sender to shortcut a hop.
1457 	 * Only send redirect if source is sending directly to us,
1458 	 * and if packet was not source routed (or has any options).
1459 	 * Also, don't send redirect if forwarding using a default route
1460 	 * or a route modified by a redirect.
1461 	 * Don't send redirect if we advertise destination's arp address
1462 	 * as ours (proxy arp).
1463 	 */
1464 	if (rt->rt_ifp == ifp &&
1465 	    (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 &&
1466 	    satosin(rt_key(rt))->sin_addr.s_addr != 0 &&
1467 	    ipsendredirects && !srcrt &&
1468 	    !arpproxy(satosin(rt_key(rt))->sin_addr, m->m_pkthdr.ph_rtableid)) {
1469 		if (rt->rt_ifa &&
1470 		    (ip->ip_src.s_addr & ifatoia(rt->rt_ifa)->ia_netmask) ==
1471 		    ifatoia(rt->rt_ifa)->ia_net) {
1472 		    if (rt->rt_flags & RTF_GATEWAY)
1473 			dest = satosin(rt->rt_gateway)->sin_addr.s_addr;
1474 		    else
1475 			dest = ip->ip_dst.s_addr;
1476 		    /* Router requirements says to only send host redirects */
1477 		    type = ICMP_REDIRECT;
1478 		    code = ICMP_REDIRECT_HOST;
1479 		}
1480 	}
1481 
1482 	error = ip_output(m, NULL, &ipforward_rt,
1483 	    (IP_FORWARDING | (ip_directedbcast ? IP_ALLOWBROADCAST : 0)),
1484 	    NULL, NULL, 0);
1485 	if (error)
1486 		ipstat.ips_cantforward++;
1487 	else {
1488 		ipstat.ips_forward++;
1489 		if (type)
1490 			ipstat.ips_redirectsent++;
1491 		else
1492 			goto freecopy;
1493 	}
1494 	if (!fake)
1495 		goto freert;
1496 
1497 	switch (error) {
1498 
1499 	case 0:				/* forwarded, but need redirect */
1500 		/* type, code set above */
1501 		break;
1502 
1503 	case ENETUNREACH:		/* shouldn't happen, checked above */
1504 	case EHOSTUNREACH:
1505 	case ENETDOWN:
1506 	case EHOSTDOWN:
1507 	default:
1508 		type = ICMP_UNREACH;
1509 		code = ICMP_UNREACH_HOST;
1510 		break;
1511 
1512 	case EMSGSIZE:
1513 		type = ICMP_UNREACH;
1514 		code = ICMP_UNREACH_NEEDFRAG;
1515 
1516 #ifdef IPSEC
1517 		if (ipforward_rt.ro_rt) {
1518 			struct rtentry *rt = ipforward_rt.ro_rt;
1519 
1520 			if (rt->rt_rmx.rmx_mtu)
1521 				destmtu = rt->rt_rmx.rmx_mtu;
1522 			else
1523 				destmtu = ipforward_rt.ro_rt->rt_ifp->if_mtu;
1524 		}
1525 #endif /*IPSEC*/
1526 		ipstat.ips_cantfrag++;
1527 		break;
1528 
1529 	case EACCES:
1530 		/*
1531 		 * pf(4) blocked the packet. There is no need to send an ICMP
1532 		 * packet back since pf(4) takes care of it.
1533 		 */
1534 		goto freecopy;
1535 	case ENOBUFS:
1536 		/*
1537 		 * a router should not generate ICMP_SOURCEQUENCH as
1538 		 * required in RFC1812 Requirements for IP Version 4 Routers.
1539 		 * source quench could be a big problem under DoS attacks,
1540 		 * or the underlying interface is rate-limited.
1541 		 */
1542 		goto freecopy;
1543 	}
1544 
1545 	mcopy = m_copym(&mfake, 0, len, M_DONTWAIT);
1546 	if (mcopy)
1547 		icmp_error(mcopy, type, code, dest, destmtu);
1548 
1549  freecopy:
1550 	if (fake)
1551 		m_tag_delete_chain(&mfake);
1552  freert:
1553 #ifndef SMALL_KERNEL
1554 	if (ipmultipath && ipforward_rt.ro_rt &&
1555 	    (ipforward_rt.ro_rt->rt_flags & RTF_MPATH)) {
1556 		RTFREE(ipforward_rt.ro_rt);
1557 		ipforward_rt.ro_rt = 0;
1558 	}
1559 #endif
1560 	return;
1561 }
1562 
1563 int
1564 ip_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
1565     size_t newlen)
1566 {
1567 	int s, error;
1568 #ifdef MROUTING
1569 	extern int ip_mrtproto;
1570 	extern struct mrtstat mrtstat;
1571 #endif
1572 
1573 	/* Almost all sysctl names at this level are terminal. */
1574 	if (namelen != 1 && name[0] != IPCTL_IFQUEUE)
1575 		return (ENOTDIR);
1576 
1577 	switch (name[0]) {
1578 #ifdef notyet
1579 	case IPCTL_DEFMTU:
1580 		return (sysctl_int(oldp, oldlenp, newp, newlen, &ip_mtu));
1581 #endif
1582 	case IPCTL_SOURCEROUTE:
1583 		/*
1584 		 * Don't allow this to change in a secure environment.
1585 		 */
1586 		if (newp && securelevel > 0)
1587 			return (EPERM);
1588 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1589 		    &ip_dosourceroute));
1590 	case IPCTL_MTUDISC:
1591 		error = sysctl_int(oldp, oldlenp, newp, newlen,
1592 		    &ip_mtudisc);
1593 		if (ip_mtudisc != 0 && ip_mtudisc_timeout_q == NULL) {
1594 			ip_mtudisc_timeout_q =
1595 			    rt_timer_queue_create(ip_mtudisc_timeout);
1596 		} else if (ip_mtudisc == 0 && ip_mtudisc_timeout_q != NULL) {
1597 			s = splsoftnet();
1598 			rt_timer_queue_destroy(ip_mtudisc_timeout_q);
1599 			ip_mtudisc_timeout_q = NULL;
1600 			splx(s);
1601 		}
1602 		return error;
1603 	case IPCTL_MTUDISCTIMEOUT:
1604 		error = sysctl_int(oldp, oldlenp, newp, newlen,
1605 		   &ip_mtudisc_timeout);
1606 		if (ip_mtudisc_timeout_q != NULL) {
1607 			s = splsoftnet();
1608 			rt_timer_queue_change(ip_mtudisc_timeout_q,
1609 					      ip_mtudisc_timeout);
1610 			splx(s);
1611 		}
1612 		return (error);
1613 	case IPCTL_IPSEC_ENC_ALGORITHM:
1614 	        return (sysctl_tstring(oldp, oldlenp, newp, newlen,
1615 				       ipsec_def_enc, sizeof(ipsec_def_enc)));
1616 	case IPCTL_IPSEC_AUTH_ALGORITHM:
1617 	        return (sysctl_tstring(oldp, oldlenp, newp, newlen,
1618 				       ipsec_def_auth,
1619 				       sizeof(ipsec_def_auth)));
1620 	case IPCTL_IPSEC_IPCOMP_ALGORITHM:
1621 	        return (sysctl_tstring(oldp, oldlenp, newp, newlen,
1622 				       ipsec_def_comp,
1623 				       sizeof(ipsec_def_comp)));
1624 	case IPCTL_IFQUEUE:
1625 	        return (sysctl_ifq(name + 1, namelen - 1,
1626 		    oldp, oldlenp, newp, newlen, &ipintrq));
1627 	case IPCTL_STATS:
1628 		if (newp != NULL)
1629 			return (EPERM);
1630 		return (sysctl_struct(oldp, oldlenp, newp, newlen,
1631 		    &ipstat, sizeof(ipstat)));
1632 	case IPCTL_MRTSTATS:
1633 #ifdef MROUTING
1634 		if (newp != NULL)
1635 			return (EPERM);
1636 		return (sysctl_struct(oldp, oldlenp, newp, newlen,
1637 		    &mrtstat, sizeof(mrtstat)));
1638 #else
1639 		return (EOPNOTSUPP);
1640 #endif
1641 	case IPCTL_MRTPROTO:
1642 #ifdef MROUTING
1643 		return (sysctl_rdint(oldp, oldlenp, newp, ip_mrtproto));
1644 #else
1645 		return (EOPNOTSUPP);
1646 #endif
1647 	default:
1648 		if (name[0] < IPCTL_MAXID)
1649 			return (sysctl_int_arr(ipctl_vars, name, namelen,
1650 			    oldp, oldlenp, newp, newlen));
1651 		return (EOPNOTSUPP);
1652 	}
1653 	/* NOTREACHED */
1654 }
1655 
1656 void
1657 ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip,
1658     struct mbuf *m)
1659 {
1660 #ifdef SO_TIMESTAMP
1661 	if (inp->inp_socket->so_options & SO_TIMESTAMP) {
1662 		struct timeval tv;
1663 
1664 		microtime(&tv);
1665 		*mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv),
1666 		    SCM_TIMESTAMP, SOL_SOCKET);
1667 		if (*mp)
1668 			mp = &(*mp)->m_next;
1669 	}
1670 #endif
1671 	if (inp->inp_flags & INP_RECVDSTADDR) {
1672 		*mp = sbcreatecontrol((caddr_t) &ip->ip_dst,
1673 		    sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP);
1674 		if (*mp)
1675 			mp = &(*mp)->m_next;
1676 	}
1677 #ifdef notyet
1678 	/* this code is broken and will probably never be fixed. */
1679 	/* options were tossed already */
1680 	if (inp->inp_flags & INP_RECVOPTS) {
1681 		*mp = sbcreatecontrol((caddr_t) opts_deleted_above,
1682 		    sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP);
1683 		if (*mp)
1684 			mp = &(*mp)->m_next;
1685 	}
1686 	/* ip_srcroute doesn't do what we want here, need to fix */
1687 	if (inp->inp_flags & INP_RECVRETOPTS) {
1688 		*mp = sbcreatecontrol((caddr_t) ip_srcroute(m),
1689 		    sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP);
1690 		if (*mp)
1691 			mp = &(*mp)->m_next;
1692 	}
1693 #endif
1694 	if (inp->inp_flags & INP_RECVIF) {
1695 		struct sockaddr_dl sdl;
1696 		struct ifnet *ifp;
1697 
1698 		ifp = m->m_pkthdr.rcvif;
1699 		if (ifp == NULL || ifp->if_sadl == NULL) {
1700 			memset(&sdl, 0, sizeof(sdl));
1701 			sdl.sdl_len = offsetof(struct sockaddr_dl, sdl_data[0]);
1702 			sdl.sdl_family = AF_LINK;
1703 			sdl.sdl_index = ifp != NULL ? ifp->if_index : 0;
1704 			sdl.sdl_nlen = sdl.sdl_alen = sdl.sdl_slen = 0;
1705 			*mp = sbcreatecontrol((caddr_t) &sdl, sdl.sdl_len,
1706 			    IP_RECVIF, IPPROTO_IP);
1707 		} else {
1708 			*mp = sbcreatecontrol((caddr_t) ifp->if_sadl,
1709 			    ifp->if_sadl->sdl_len, IP_RECVIF, IPPROTO_IP);
1710 		}
1711 		if (*mp)
1712 			mp = &(*mp)->m_next;
1713 	}
1714 	if (inp->inp_flags & INP_RECVTTL) {
1715 		*mp = sbcreatecontrol((caddr_t) &ip->ip_ttl,
1716 		    sizeof(u_int8_t), IP_RECVTTL, IPPROTO_IP);
1717 		if (*mp)
1718 			mp = &(*mp)->m_next;
1719 	}
1720 	if (inp->inp_flags & INP_RECVRTABLE) {
1721 		u_int rtableid = inp->inp_rtableid;
1722 #if NPF > 0
1723 		struct pf_divert *divert;
1724 
1725 		if (m && m->m_pkthdr.pf.flags & PF_TAG_DIVERTED &&
1726 		    (divert = pf_find_divert(m)) != NULL)
1727 			rtableid = divert->rdomain;
1728 #endif
1729 
1730 		*mp = sbcreatecontrol((caddr_t) &rtableid,
1731 		    sizeof(u_int), IP_RECVRTABLE, IPPROTO_IP);
1732 		if (*mp)
1733 			mp = &(*mp)->m_next;
1734 	}
1735 }
1736 
1737