xref: /openbsd-src/sys/netinet/ip_input.c (revision 3a3fbb3f2e2521ab7c4a56b7ff7462ebd9095ec5)
1 /*	$OpenBSD: ip_input.c,v 1.96 2001/12/10 12:05:40 ho Exp $	*/
2 /*	$NetBSD: ip_input.c,v 1.30 1996/03/16 23:53:58 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1988, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. All advertising materials mentioning features or use of this software
17  *    must display the following acknowledgement:
18  *	This product includes software developed by the University of
19  *	California, Berkeley and its contributors.
20  * 4. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)ip_input.c	8.2 (Berkeley) 1/4/94
37  */
38 
39 #include "pf.h"
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/mbuf.h>
44 #include <sys/domain.h>
45 #include <sys/protosw.h>
46 #include <sys/socket.h>
47 #include <sys/syslog.h>
48 #include <sys/sysctl.h>
49 
50 #include <net/if.h>
51 #include <net/if_dl.h>
52 #include <net/route.h>
53 
54 #include <netinet/in.h>
55 #include <netinet/in_systm.h>
56 #include <netinet/if_ether.h>
57 #include <netinet/ip.h>
58 #include <netinet/in_pcb.h>
59 #include <netinet/in_var.h>
60 #include <netinet/ip_var.h>
61 #include <netinet/ip_icmp.h>
62 
63 #if NPF > 0
64 #include <net/pfvar.h>
65 #endif
66 
67 #ifdef IPSEC
68 #include <netinet/ip_ipsp.h>
69 #endif /* IPSEC */
70 
71 #ifndef	IPFORWARDING
72 #ifdef GATEWAY
73 #define	IPFORWARDING	1	/* forward IP packets not for us */
74 #else /* GATEWAY */
75 #define	IPFORWARDING	0	/* don't forward IP packets not for us */
76 #endif /* GATEWAY */
77 #endif /* IPFORWARDING */
78 #ifndef	IPSENDREDIRECTS
79 #define	IPSENDREDIRECTS	1
80 #endif
81 
82 #ifndef IPMTUDISC
83 #define IPMTUDISC	1
84 #endif
85 #ifndef IPMTUDISCTIMEOUT
86 #define IPMTUDISCTIMEOUT (10 * 60)	/* as per RFC 1191 */
87 #endif
88 
89 int encdebug = 0;
90 int ipsec_keep_invalid = IPSEC_DEFAULT_EMBRYONIC_SA_TIMEOUT;
91 int ipsec_require_pfs = IPSEC_DEFAULT_PFS;
92 int ipsec_soft_allocations = IPSEC_DEFAULT_SOFT_ALLOCATIONS;
93 int ipsec_exp_allocations = IPSEC_DEFAULT_EXP_ALLOCATIONS;
94 int ipsec_soft_bytes = IPSEC_DEFAULT_SOFT_BYTES;
95 int ipsec_exp_bytes = IPSEC_DEFAULT_EXP_BYTES;
96 int ipsec_soft_timeout = IPSEC_DEFAULT_SOFT_TIMEOUT;
97 int ipsec_exp_timeout = IPSEC_DEFAULT_EXP_TIMEOUT;
98 int ipsec_soft_first_use = IPSEC_DEFAULT_SOFT_FIRST_USE;
99 int ipsec_exp_first_use = IPSEC_DEFAULT_EXP_FIRST_USE;
100 int ipsec_expire_acquire = IPSEC_DEFAULT_EXPIRE_ACQUIRE;
101 char ipsec_def_enc[20];
102 char ipsec_def_auth[20];
103 char ipsec_def_comp[20];
104 
105 /*
106  * Note: DIRECTED_BROADCAST is handled this way so that previous
107  * configuration using this option will Just Work.
108  */
109 #ifndef IPDIRECTEDBCAST
110 #ifdef DIRECTED_BROADCAST
111 #define IPDIRECTEDBCAST	1
112 #else
113 #define	IPDIRECTEDBCAST	0
114 #endif /* DIRECTED_BROADCAST */
115 #endif /* IPDIRECTEDBCAST */
116 int	ipforwarding = IPFORWARDING;
117 int	ipsendredirects = IPSENDREDIRECTS;
118 int	ip_dosourceroute = 0;	/* no src-routing unless sysctl'd to enable */
119 int	ip_defttl = IPDEFTTL;
120 int	ip_mtudisc = IPMTUDISC;
121 u_int	ip_mtudisc_timeout = IPMTUDISCTIMEOUT;
122 int	ip_directedbcast = IPDIRECTEDBCAST;
123 #ifdef DIAGNOSTIC
124 int	ipprintfs = 0;
125 #endif
126 
127 struct rttimer_queue *ip_mtudisc_timeout_q = NULL;
128 
129 int	ipsec_auth_default_level = IPSEC_AUTH_LEVEL_DEFAULT;
130 int	ipsec_esp_trans_default_level = IPSEC_ESP_TRANS_LEVEL_DEFAULT;
131 int	ipsec_esp_network_default_level = IPSEC_ESP_NETWORK_LEVEL_DEFAULT;
132 int	ipsec_ipcomp_default_level = IPSEC_IPCOMP_LEVEL_DEFAULT;
133 
134 /* Keep track of memory used for reassembly */
135 int	ip_maxqueue = 300;
136 int	ip_frags = 0;
137 
138 /* from in_pcb.c */
139 extern int ipport_firstauto;
140 extern int ipport_lastauto;
141 extern int ipport_hifirstauto;
142 extern int ipport_hilastauto;
143 extern struct baddynamicports baddynamicports;
144 
145 extern	struct domain inetdomain;
146 extern	struct protosw inetsw[];
147 u_char	ip_protox[IPPROTO_MAX];
148 int	ipqmaxlen = IFQ_MAXLEN;
149 struct	in_ifaddrhead in_ifaddr;
150 struct	ifqueue ipintrq;
151 
152 int	ipq_locked;
153 static __inline int ipq_lock_try __P((void));
154 static __inline void ipq_unlock __P((void));
155 
156 struct pool ipqent_pool;
157 
158 static __inline int
159 ipq_lock_try()
160 {
161 	int s;
162 
163 	s = splimp();
164 	if (ipq_locked) {
165 		splx(s);
166 		return (0);
167 	}
168 	ipq_locked = 1;
169 	splx(s);
170 	return (1);
171 }
172 
173 #define ipq_lock() ipq_lock_try()
174 
175 static __inline void
176 ipq_unlock()
177 {
178 	int s;
179 
180 	s = splimp();
181 	ipq_locked = 0;
182 	splx(s);
183 }
184 
/*
 * Format an IPv4 address as dotted-decimal text.
 * The bytes of `ina' are printed in memory order.  The result lives in
 * a static buffer that is overwritten by the next call.
 */
char *
inet_ntoa(ina)
	struct in_addr ina;
{
	/* Four "ddd" groups, each followed by '.' or the final NUL. */
	static char dotted[4 * sizeof "123"];
	const unsigned char *octet = (const unsigned char *)&ina;

	sprintf(dotted, "%d.%d.%d.%d", octet[0], octet[1], octet[2],
	    octet[3]);
	return (dotted);
}
196 
197 /*
198  * We need to save the IP options in case a protocol wants to respond
199  * to an incoming packet over the same route if the packet got here
200  * using IP source routing.  This allows connection establishment and
201  * maintenance when the remote end is on a network that is not known
202  * to us.
203  */
204 int	ip_nhops = 0;
205 static	struct ip_srcrt {
206 	struct	in_addr dst;			/* final destination */
207 	char	nop;				/* one NOP to align */
208 	char	srcopt[IPOPT_OFFSET + 1];	/* OPTVAL, OLEN and OFFSET */
209 	struct	in_addr route[MAX_IPOPTLEN/sizeof(struct in_addr)];
210 } ip_srcrt;
211 
212 static void save_rte __P((u_char *, struct in_addr));
213 static int ip_weadvertise(u_int32_t);
214 
215 /*
216  * IP initialization: fill in IP protocol switch table.
217  * All protocols not implemented in kernel go to raw IP protocol handler.
218  */
219 void
220 ip_init()
221 {
222 	register struct protosw *pr;
223 	register int i;
224 	const u_int16_t defbaddynamicports_tcp[] = DEFBADDYNAMICPORTS_TCP;
225 	const u_int16_t defbaddynamicports_udp[] = DEFBADDYNAMICPORTS_UDP;
226 
227 	pool_init(&ipqent_pool, sizeof(struct ipqent), 0, 0, 0, "ipqepl",
228 	    0, NULL, NULL, M_IPQ);
229 
230 	pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
231 	if (pr == 0)
232 		panic("ip_init");
233 	for (i = 0; i < IPPROTO_MAX; i++)
234 		ip_protox[i] = pr - inetsw;
235 	for (pr = inetdomain.dom_protosw;
236 	    pr < inetdomain.dom_protoswNPROTOSW; pr++)
237 		if (pr->pr_domain->dom_family == PF_INET &&
238 		    pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW)
239 			ip_protox[pr->pr_protocol] = pr - inetsw;
240 	LIST_INIT(&ipq);
241 	ipintrq.ifq_maxlen = ipqmaxlen;
242 	TAILQ_INIT(&in_ifaddr);
243 	if (ip_mtudisc != 0)
244 		ip_mtudisc_timeout_q =
245 		    rt_timer_queue_create(ip_mtudisc_timeout);
246 
247 	/* Fill in list of ports not to allocate dynamically. */
248 	bzero((void *)&baddynamicports, sizeof(baddynamicports));
249 	for (i = 0; defbaddynamicports_tcp[i] != 0; i++)
250 		DP_SET(baddynamicports.tcp, defbaddynamicports_tcp[i]);
251 	for (i = 0; defbaddynamicports_udp[i] != 0; i++)
252 		DP_SET(baddynamicports.udp, defbaddynamicports_tcp[i]);
253 
254 	strncpy(ipsec_def_enc, IPSEC_DEFAULT_DEF_ENC, sizeof(ipsec_def_enc));
255 	strncpy(ipsec_def_auth, IPSEC_DEFAULT_DEF_AUTH, sizeof(ipsec_def_auth));
256 	strncpy(ipsec_def_comp, IPSEC_DEFAULT_DEF_COMP, sizeof(ipsec_def_comp));
257 }
258 
259 struct	sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET };
260 struct	route ipforward_rt;
261 
262 void
263 ipintr()
264 {
265 	register struct mbuf *m;
266 	int s;
267 
268 	if (needqueuedrain)
269 		m_reclaim();
270 
271 	while (1) {
272 		/*
273 		 * Get next datagram off input queue and get IP header
274 		 * in first mbuf.
275 		 */
276 		s = splimp();
277 		IF_DEQUEUE(&ipintrq, m);
278 		splx(s);
279 		if (m == 0)
280 			return;
281 #ifdef	DIAGNOSTIC
282 		if ((m->m_flags & M_PKTHDR) == 0)
283 			panic("ipintr no HDR");
284 #endif
285 		ipv4_input(m);
286 	}
287 }
288 
289 /*
290  * Ip input routine.  Checksum and byte swap header.  If fragmented
291  * try to reassemble.  Process options.  Pass to next level.
292  */
293 void
294 ipv4_input(m)
295 	struct mbuf *m;
296 {
297 	register struct ip *ip;
298 	register struct ipq *fp;
299 	struct in_ifaddr *ia;
300 	struct ipqent *ipqe;
301 	int hlen, mff;
302 #ifdef IPSEC
303 	int error, s;
304 	struct tdb *tdb;
305 	struct tdb_ident *tdbi;
306 	struct m_tag *mtag;
307 #endif /* IPSEC */
308 
309 	/*
310 	 * If no IP addresses have been set yet but the interfaces
311 	 * are receiving, can't do anything with incoming packets yet.
312 	 */
313 	if (in_ifaddr.tqh_first == 0)
314 		goto bad;
315 	ipstat.ips_total++;
316 	if (m->m_len < sizeof (struct ip) &&
317 	    (m = m_pullup(m, sizeof (struct ip))) == NULL) {
318 		ipstat.ips_toosmall++;
319 		return;
320 	}
321 	ip = mtod(m, struct ip *);
322 	if (ip->ip_v != IPVERSION) {
323 		ipstat.ips_badvers++;
324 		goto bad;
325 	}
326 	hlen = ip->ip_hl << 2;
327 	if (hlen < sizeof(struct ip)) {	/* minimum header length */
328 		ipstat.ips_badhlen++;
329 		goto bad;
330 	}
331 	if (hlen > m->m_len) {
332 		if ((m = m_pullup(m, hlen)) == NULL) {
333 			ipstat.ips_badhlen++;
334 			return;
335 		}
336 		ip = mtod(m, struct ip *);
337 	}
338 
339 	/* 127/8 must not appear on wire - RFC1122 */
340 	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
341 	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
342 		if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) {
343 			ipstat.ips_badaddr++;
344 			goto bad;
345 		}
346 	}
347 
348 	if ((m->m_pkthdr.csum & M_IPV4_CSUM_IN_OK) == 0) {
349 		if (m->m_pkthdr.csum & M_IPV4_CSUM_IN_BAD) {
350 			ipstat.ips_inhwcsum++;
351 			ipstat.ips_badsum++;
352 			goto bad;
353 		}
354 
355 		if (in_cksum(m, hlen) != 0) {
356 			ipstat.ips_badsum++;
357 			goto bad;
358 		}
359 	} else {
360 		m->m_pkthdr.csum &= ~M_IPV4_CSUM_IN_OK;
361 		ipstat.ips_inhwcsum++;
362 	}
363 
364 	/*
365 	 * Convert fields to host representation.
366 	 */
367 	NTOHS(ip->ip_len);
368 	if (ip->ip_len < hlen) {
369 		ipstat.ips_badlen++;
370 		goto bad;
371 	}
372 	NTOHS(ip->ip_off);
373 
374 	/*
375 	 * Check that the amount of data in the buffers
376 	 * is at least as much as the IP header would have us expect.
377 	 * Trim mbufs if longer than we expect.
378 	 * Drop packet if shorter than we expect.
379 	 */
380 	if (m->m_pkthdr.len < ip->ip_len) {
381 		ipstat.ips_tooshort++;
382 		goto bad;
383 	}
384 	if (m->m_pkthdr.len > ip->ip_len) {
385 		if (m->m_len == m->m_pkthdr.len) {
386 			m->m_len = ip->ip_len;
387 			m->m_pkthdr.len = ip->ip_len;
388 		} else
389 			m_adj(m, ip->ip_len - m->m_pkthdr.len);
390 	}
391 
392 #if NPF > 0
393 	/*
394 	 * Packet filter
395 	 */
396 	if (pf_test(PF_IN, m->m_pkthdr.rcvif, &m) != PF_PASS)
397 		goto bad;
398 	if (m == NULL)
399 		return;
400 
401 	ip = mtod(m, struct ip *);
402 	hlen = ip->ip_hl << 2;
403 #endif
404 
405 #ifdef ALTQ
406 	if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0)
407 		/* packet is dropped by traffic conditioner */
408 		return;
409 #endif
410 
411 	/*
412 	 * Process options and, if not destined for us,
413 	 * ship it on.  ip_dooptions returns 1 when an
414 	 * error was detected (causing an icmp message
415 	 * to be sent and the original packet to be freed).
416 	 */
417 	ip_nhops = 0;		/* for source routed packets */
418 	if (hlen > sizeof (struct ip) && ip_dooptions(m)) {
419 	        return;
420 	}
421 
422 	/*
423 	 * Check our list of addresses, to see if the packet is for us.
424 	 */
425 	if ((ia = in_iawithaddr(ip->ip_dst, m)) != NULL &&
426 	    (ia->ia_ifp->if_flags & IFF_UP))
427 		goto ours;
428 
429 	if (IN_MULTICAST(ip->ip_dst.s_addr)) {
430 		struct in_multi *inm;
431 #ifdef MROUTING
432 		extern struct socket *ip_mrouter;
433 
434 		if (m->m_flags & M_EXT) {
435 			if ((m = m_pullup(m, hlen)) == NULL) {
436 				ipstat.ips_toosmall++;
437 				return;
438 			}
439 			ip = mtod(m, struct ip *);
440 		}
441 
442 		if (ip_mrouter) {
443 			/*
444 			 * If we are acting as a multicast router, all
445 			 * incoming multicast packets are passed to the
446 			 * kernel-level multicast forwarding function.
447 			 * The packet is returned (relatively) intact; if
448 			 * ip_mforward() returns a non-zero value, the packet
449 			 * must be discarded, else it may be accepted below.
450 			 *
451 			 * (The IP ident field is put in the same byte order
452 			 * as expected when ip_mforward() is called from
453 			 * ip_output().)
454 			 */
455 			if (ip_mforward(m, m->m_pkthdr.rcvif) != 0) {
456 				ipstat.ips_cantforward++;
457 				m_freem(m);
458 				return;
459 			}
460 
461 			/*
462 			 * The process-level routing demon needs to receive
463 			 * all multicast IGMP packets, whether or not this
464 			 * host belongs to their destination groups.
465 			 */
466 			if (ip->ip_p == IPPROTO_IGMP)
467 				goto ours;
468 			ipstat.ips_forward++;
469 		}
470 #endif
471 		/*
472 		 * See if we belong to the destination multicast group on the
473 		 * arrival interface.
474 		 */
475 		IN_LOOKUP_MULTI(ip->ip_dst, m->m_pkthdr.rcvif, inm);
476 		if (inm == NULL) {
477 			ipstat.ips_cantforward++;
478 			m_freem(m);
479 			return;
480 		}
481 		goto ours;
482 	}
483 	if (ip->ip_dst.s_addr == INADDR_BROADCAST ||
484 	    ip->ip_dst.s_addr == INADDR_ANY)
485 		goto ours;
486 
487 	/*
488 	 * Not for us; forward if possible and desirable.
489 	 */
490 	if (ipforwarding == 0) {
491 		ipstat.ips_cantforward++;
492 		m_freem(m);
493 	} else {
494 #ifdef IPSEC
495 	        /* IPsec policy check for forwarded packets */
496 		mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
497                 s = splnet();
498 		if (mtag != NULL) {
499 			tdbi = (struct tdb_ident *)(mtag + 1);
500 			tdb = gettdb(tdbi->spi, &tdbi->dst, tdbi->proto);
501 		} else
502 			tdb = NULL;
503 	        ipsp_spd_lookup(m, AF_INET, hlen, &error,
504 		    IPSP_DIRECTION_IN, tdb, NULL);
505                 splx(s);
506 
507 		/* Error or otherwise drop-packet indication */
508 		if (error) {
509 			ipstat.ips_cantforward++;
510 			m_freem(m);
511 			return;
512 		}
513 
514 		/* Fall through, forward packet */
515 #endif /* IPSEC */
516 
517 		ip_forward(m, 0);
518 	}
519 	return;
520 
521 ours:
522 	/*
523 	 * If offset or IP_MF are set, must reassemble.
524 	 * Otherwise, nothing need be done.
525 	 * (We could look in the reassembly queue to see
526 	 * if the packet was previously fragmented,
527 	 * but it's not worth the time; just let them time out.)
528 	 */
529 	if (ip->ip_off &~ (IP_DF | IP_RF)) {
530 		if (m->m_flags & M_EXT) {		/* XXX */
531 			if ((m = m_pullup(m, hlen)) == NULL) {
532 				ipstat.ips_toosmall++;
533 				return;
534 			}
535 			ip = mtod(m, struct ip *);
536 		}
537 
538 		/*
539 		 * Look for queue of fragments
540 		 * of this datagram.
541 		 */
542 		ipq_lock();
543 		for (fp = ipq.lh_first; fp != NULL; fp = fp->ipq_q.le_next)
544 			if (ip->ip_id == fp->ipq_id &&
545 			    ip->ip_src.s_addr == fp->ipq_src.s_addr &&
546 			    ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
547 			    ip->ip_p == fp->ipq_p)
548 				goto found;
549 		fp = 0;
550 found:
551 
552 		/*
553 		 * Adjust ip_len to not reflect header,
554 		 * set ipqe_mff if more fragments are expected,
555 		 * convert offset of this to bytes.
556 		 */
557 		ip->ip_len -= hlen;
558 		mff = (ip->ip_off & IP_MF) != 0;
559 		if (mff) {
560 			/*
561 			 * Make sure that fragments have a data length
562 			 * that's a non-zero multiple of 8 bytes.
563 			 */
564 			if (ip->ip_len == 0 || (ip->ip_len & 0x7) != 0) {
565 				ipstat.ips_badfrags++;
566 				ipq_unlock();
567 				goto bad;
568 			}
569 		}
570 		ip->ip_off <<= 3;
571 
572 		/*
573 		 * If datagram marked as having more fragments
574 		 * or if this is not the first fragment,
575 		 * attempt reassembly; if it succeeds, proceed.
576 		 */
577 		if (mff || ip->ip_off) {
578 			ipstat.ips_fragments++;
579 			if (ip_frags + 1 > ip_maxqueue) {
580 				ip_flush();
581 				ipstat.ips_rcvmemdrop++;
582 				ipq_unlock();
583 				goto bad;
584 			}
585 
586 			ipqe = pool_get(&ipqent_pool, PR_NOWAIT);
587 			if (ipqe == NULL) {
588 				ipstat.ips_rcvmemdrop++;
589 				ipq_unlock();
590 				goto bad;
591 			}
592 			ip_frags++;
593 			ipqe->ipqe_mff = mff;
594 			ipqe->ipqe_m = m;
595 			ipqe->ipqe_ip = ip;
596 			m = ip_reass(ipqe, fp);
597 			if (m == 0) {
598 				ipq_unlock();
599 				return;
600 			}
601 			ipstat.ips_reassembled++;
602 			ip = mtod(m, struct ip *);
603 			hlen = ip->ip_hl << 2;
604 		} else
605 			if (fp)
606 				ip_freef(fp);
607 		ipq_unlock();
608 	} else
609 		ip->ip_len -= hlen;
610 
611 #ifdef IPSEC
612         /*
613          * If it's a protected packet for us, skip the policy check.
614          * That's because we really only care about the properties of
615          * the protected packet, and not the intermediate versions.
616          * While this is not the most paranoid setting, it allows
617          * some flexibility in handling of nested tunnels etc.
618          */
619         if ((ip->ip_p == IPPROTO_ESP) || (ip->ip_p == IPPROTO_AH) ||
620 	    (ip->ip_p == IPPROTO_IPCOMP))
621           goto skipipsec;
622 
623 	/*
624 	 * If the protected packet was tunneled, then we need to
625 	 * verify the protected packet's information, not the
626 	 * external headers. Thus, skip the policy lookup for the
627 	 * external packet, and keep the IPsec information linked on
628 	 * the packet header (the encapsulation routines know how
629 	 * to deal with that).
630 	 */
631 	if ((ip->ip_p == IPPROTO_IPIP) || (ip->ip_p == IPPROTO_IPV6))
632 	  goto skipipsec;
633 
634 	/*
635 	 * If the protected packet is TCP or UDP, we'll do the
636 	 * policy check in the respective input routine, so we can
637 	 * check for bypass sockets.
638 	 */
639 	if ((ip->ip_p == IPPROTO_TCP) || (ip->ip_p == IPPROTO_UDP))
640 	  goto skipipsec;
641 
642 	/* IPsec policy check for local-delivery packets */
643 	mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
644         s = splnet();
645 	if (mtag) {
646 		tdbi = (struct tdb_ident *)(mtag + 1);
647 	        tdb = gettdb(tdbi->spi, &tdbi->dst, tdbi->proto);
648 	} else
649 		tdb = NULL;
650 	ipsp_spd_lookup(m, AF_INET, hlen, &error, IPSP_DIRECTION_IN,
651 	    tdb, NULL);
652         splx(s);
653 
654 	/* Error or otherwise drop-packet indication */
655 	if (error) {
656 	        ipstat.ips_cantforward++;
657 		m_freem(m);
658 		return;
659 	}
660 
661  skipipsec:
662 	/* Otherwise, just fall through and deliver the packet */
663 #endif /* IPSEC */
664 
665 	/*
666 	 * Switch out to protocol's input routine.
667 	 */
668 	ipstat.ips_delivered++;
669 	(*inetsw[ip_protox[ip->ip_p]].pr_input)(m, hlen, NULL, 0);
670 	return;
671 bad:
672 	m_freem(m);
673 }
674 
675 struct in_ifaddr *
676 in_iawithaddr(ina, m)
677 	struct in_addr ina;
678 	register struct mbuf *m;
679 {
680 	register struct in_ifaddr *ia;
681 
682 	for (ia = in_ifaddr.tqh_first; ia; ia = ia->ia_list.tqe_next) {
683 		if ((ina.s_addr == ia->ia_addr.sin_addr.s_addr) ||
684 		    ((ia->ia_ifp->if_flags & (IFF_LOOPBACK|IFF_LINK1)) ==
685 			(IFF_LOOPBACK|IFF_LINK1) &&
686 		     ia->ia_subnet == (ina.s_addr & ia->ia_subnetmask)))
687 			return ia;
688 		if (((ip_directedbcast == 0) || (m && ip_directedbcast &&
689 		    ia->ia_ifp == m->m_pkthdr.rcvif)) &&
690 		    (ia->ia_ifp->if_flags & IFF_BROADCAST)) {
691 			if (ina.s_addr == ia->ia_broadaddr.sin_addr.s_addr ||
692 			    ina.s_addr == ia->ia_netbroadcast.s_addr ||
693 			    /*
694 			     * Look for all-0's host part (old broadcast addr),
695 			     * either for subnet or net.
696 			     */
697 			    ina.s_addr == ia->ia_subnet ||
698 			    ina.s_addr == ia->ia_net) {
699 				/* Make sure M_BCAST is set */
700 				if (m)
701 					m->m_flags |= M_BCAST;
702 				return ia;
703 			    }
704 		}
705 	}
706 
707 	return NULL;
708 }
709 
710 /*
711  * Take incoming datagram fragment and try to
712  * reassemble it into whole datagram.  If a chain for
713  * reassembly of this datagram already exists, then it
714  * is given as fp; otherwise have to make a chain.
715  */
716 struct mbuf *
717 ip_reass(ipqe, fp)
718 	struct ipqent *ipqe;
719 	struct ipq *fp;
720 {
721 	struct mbuf *m = ipqe->ipqe_m;
722 	struct ipqent *nq, *p, *q;
723 	struct ip *ip;
724 	struct mbuf *t;
725 	int hlen = ipqe->ipqe_ip->ip_hl << 2;
726 	int i, next;
727 
728 	/*
729 	 * Presence of header sizes in mbufs
730 	 * would confuse code below.
731 	 */
732 	m->m_data += hlen;
733 	m->m_len -= hlen;
734 
735 	/*
736 	 * If first fragment to arrive, create a reassembly queue.
737 	 */
738 	if (fp == 0) {
739 		MALLOC(fp, struct ipq *, sizeof (struct ipq),
740 		    M_FTABLE, M_NOWAIT);
741 		if (fp == NULL)
742 			goto dropfrag;
743 		LIST_INSERT_HEAD(&ipq, fp, ipq_q);
744 		fp->ipq_ttl = IPFRAGTTL;
745 		fp->ipq_p = ipqe->ipqe_ip->ip_p;
746 		fp->ipq_id = ipqe->ipqe_ip->ip_id;
747 		LIST_INIT(&fp->ipq_fragq);
748 		fp->ipq_src = ipqe->ipqe_ip->ip_src;
749 		fp->ipq_dst = ipqe->ipqe_ip->ip_dst;
750 		p = NULL;
751 		goto insert;
752 	}
753 
754 	/*
755 	 * Find a segment which begins after this one does.
756 	 */
757 	for (p = NULL, q = fp->ipq_fragq.lh_first; q != NULL;
758 	    p = q, q = q->ipqe_q.le_next)
759 		if (q->ipqe_ip->ip_off > ipqe->ipqe_ip->ip_off)
760 			break;
761 
762 	/*
763 	 * If there is a preceding segment, it may provide some of
764 	 * our data already.  If so, drop the data from the incoming
765 	 * segment.  If it provides all of our data, drop us.
766 	 */
767 	if (p != NULL) {
768 		i = p->ipqe_ip->ip_off + p->ipqe_ip->ip_len -
769 		    ipqe->ipqe_ip->ip_off;
770 		if (i > 0) {
771 			if (i >= ipqe->ipqe_ip->ip_len)
772 				goto dropfrag;
773 			m_adj(ipqe->ipqe_m, i);
774 			ipqe->ipqe_ip->ip_off += i;
775 			ipqe->ipqe_ip->ip_len -= i;
776 		}
777 	}
778 
779 	/*
780 	 * While we overlap succeeding segments trim them or,
781 	 * if they are completely covered, dequeue them.
782 	 */
783 	for (; q != NULL && ipqe->ipqe_ip->ip_off + ipqe->ipqe_ip->ip_len >
784 	    q->ipqe_ip->ip_off; q = nq) {
785 		i = (ipqe->ipqe_ip->ip_off + ipqe->ipqe_ip->ip_len) -
786 		    q->ipqe_ip->ip_off;
787 		if (i < q->ipqe_ip->ip_len) {
788 			q->ipqe_ip->ip_len -= i;
789 			q->ipqe_ip->ip_off += i;
790 			m_adj(q->ipqe_m, i);
791 			break;
792 		}
793 		nq = q->ipqe_q.le_next;
794 		m_freem(q->ipqe_m);
795 		LIST_REMOVE(q, ipqe_q);
796 		pool_put(&ipqent_pool, q);
797 		ip_frags--;
798 	}
799 
800 insert:
801 	/*
802 	 * Stick new segment in its place;
803 	 * check for complete reassembly.
804 	 */
805 	if (p == NULL) {
806 		LIST_INSERT_HEAD(&fp->ipq_fragq, ipqe, ipqe_q);
807 	} else {
808 		LIST_INSERT_AFTER(p, ipqe, ipqe_q);
809 	}
810 	next = 0;
811 	for (p = NULL, q = fp->ipq_fragq.lh_first; q != NULL;
812 	    p = q, q = q->ipqe_q.le_next) {
813 		if (q->ipqe_ip->ip_off != next)
814 			return (0);
815 		next += q->ipqe_ip->ip_len;
816 	}
817 	if (p->ipqe_mff)
818 		return (0);
819 
820 	/*
821 	 * Reassembly is complete.  Check for a bogus message size and
822 	 * concatenate fragments.
823 	 */
824 	q = fp->ipq_fragq.lh_first;
825 	ip = q->ipqe_ip;
826 	if ((next + (ip->ip_hl << 2)) > IP_MAXPACKET) {
827 		ipstat.ips_toolong++;
828 		ip_freef(fp);
829 		return (0);
830 	}
831 	m = q->ipqe_m;
832 	t = m->m_next;
833 	m->m_next = 0;
834 	m_cat(m, t);
835 	nq = q->ipqe_q.le_next;
836 	pool_put(&ipqent_pool, q);
837 	ip_frags--;
838 	for (q = nq; q != NULL; q = nq) {
839 		t = q->ipqe_m;
840 		nq = q->ipqe_q.le_next;
841 		pool_put(&ipqent_pool, q);
842 		ip_frags--;
843 		m_cat(m, t);
844 	}
845 
846 	/*
847 	 * Create header for new ip packet by
848 	 * modifying header of first packet;
849 	 * dequeue and discard fragment reassembly header.
850 	 * Make header visible.
851 	 */
852 	ip->ip_len = next;
853 	ip->ip_src = fp->ipq_src;
854 	ip->ip_dst = fp->ipq_dst;
855 	LIST_REMOVE(fp, ipq_q);
856 	FREE(fp, M_FTABLE);
857 	m->m_len += (ip->ip_hl << 2);
858 	m->m_data -= (ip->ip_hl << 2);
859 	/* some debugging cruft by sklower, below, will go away soon */
860 	if (m->m_flags & M_PKTHDR) { /* XXX this should be done elsewhere */
861 		int plen = 0;
862 		for (t = m; t; t = t->m_next)
863 			plen += t->m_len;
864 		m->m_pkthdr.len = plen;
865 	}
866 	return (m);
867 
868 dropfrag:
869 	ipstat.ips_fragdropped++;
870 	m_freem(m);
871 	pool_put(&ipqent_pool, ipqe);
872 	ip_frags--;
873 	return (0);
874 }
875 
876 /*
877  * Free a fragment reassembly header and all
878  * associated datagrams.
879  */
880 void
881 ip_freef(fp)
882 	struct ipq *fp;
883 {
884 	register struct ipqent *q, *p;
885 
886 	for (q = fp->ipq_fragq.lh_first; q != NULL; q = p) {
887 		p = q->ipqe_q.le_next;
888 		m_freem(q->ipqe_m);
889 		LIST_REMOVE(q, ipqe_q);
890 		pool_put(&ipqent_pool, q);
891 		ip_frags--;
892 	}
893 	LIST_REMOVE(fp, ipq_q);
894 	FREE(fp, M_FTABLE);
895 }
896 
897 /*
898  * IP timer processing;
899  * if a timer expires on a reassembly
900  * queue, discard it.
901  */
902 void
903 ip_slowtimo()
904 {
905 	register struct ipq *fp, *nfp;
906 	int s = splsoftnet();
907 
908 	ipq_lock();
909 	for (fp = ipq.lh_first; fp != NULL; fp = nfp) {
910 		nfp = fp->ipq_q.le_next;
911 		if (--fp->ipq_ttl == 0) {
912 			ipstat.ips_fragtimeout++;
913 			ip_freef(fp);
914 		}
915 	}
916 	ipq_unlock();
917 	splx(s);
918 }
919 
920 /*
921  * Drain off all datagram fragments.
922  */
923 void
924 ip_drain()
925 {
926 
927 	if (ipq_lock_try() == 0)
928 		return;
929 	while (ipq.lh_first != NULL) {
930 		ipstat.ips_fragdropped++;
931 		ip_freef(ipq.lh_first);
932 	}
933 	ipq_unlock();
934 }
935 
936 /*
937  * Flush a bunch of datagram fragments, till we are down to 75%.
938  */
939 void
940 ip_flush()
941 {
942 	int max = 50;
943 
944 	/* ipq already locked */
945 	while (ipq.lh_first != NULL && ip_frags > ip_maxqueue * 3 / 4 && --max) {
946 		ipstat.ips_fragdropped++;
947 		ip_freef(ipq.lh_first);
948 	}
949 }
950 
951 /*
952  * Do option processing on a datagram,
953  * possibly discarding it if bad options are encountered,
954  * or forwarding it if source-routed.
955  * Returns 1 if packet has been forwarded/freed,
956  * 0 if the packet should be processed further.
957  */
958 int
959 ip_dooptions(m)
960 	struct mbuf *m;
961 {
962 	register struct ip *ip = mtod(m, struct ip *);
963 	register u_char *cp;
964 	struct ip_timestamp ipt;
965 	register struct in_ifaddr *ia;
966 	int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0;
967 	struct in_addr sin, dst;
968 	n_time ntime;
969 
970 	dst = ip->ip_dst;
971 	cp = (u_char *)(ip + 1);
972 	cnt = (ip->ip_hl << 2) - sizeof (struct ip);
973 
974 	for (; cnt > 0; cnt -= optlen, cp += optlen) {
975 		opt = cp[IPOPT_OPTVAL];
976 		if (opt == IPOPT_EOL)
977 			break;
978 		if (opt == IPOPT_NOP)
979 			optlen = 1;
980 		else {
981 			if (cnt < IPOPT_OLEN + sizeof(*cp)) {
982 				code = &cp[IPOPT_OLEN] - (u_char *)ip;
983 				goto bad;
984 			}
985 			optlen = cp[IPOPT_OLEN];
986 			if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) {
987 				code = &cp[IPOPT_OLEN] - (u_char *)ip;
988 				goto bad;
989 			}
990 		}
991 
992 		switch (opt) {
993 
994 		default:
995 			break;
996 
997 		/*
998 		 * Source routing with record.
999 		 * Find interface with current destination address.
1000 		 * If none on this machine then drop if strictly routed,
1001 		 * or do nothing if loosely routed.
1002 		 * Record interface address and bring up next address
1003 		 * component.  If strictly routed make sure next
1004 		 * address is on directly accessible net.
1005 		 */
1006 		case IPOPT_LSRR:
1007 		case IPOPT_SSRR:
1008 			if (!ip_dosourceroute) {
1009 				char buf[4*sizeof "123"];
1010 
1011 				strcpy(buf, inet_ntoa(ip->ip_dst));
1012 				log(LOG_WARNING,
1013 				    "attempted source route from %s to %s\n",
1014 				    inet_ntoa(ip->ip_src), buf);
1015 				type = ICMP_UNREACH;
1016 				code = ICMP_UNREACH_SRCFAIL;
1017 				goto bad;
1018 			}
1019 			if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
1020 				code = &cp[IPOPT_OFFSET] - (u_char *)ip;
1021 				goto bad;
1022 			}
1023 			ipaddr.sin_addr = ip->ip_dst;
1024 			ia = ifatoia(ifa_ifwithaddr(sintosa(&ipaddr)));
1025 			if (ia == 0) {
1026 				if (opt == IPOPT_SSRR) {
1027 					type = ICMP_UNREACH;
1028 					code = ICMP_UNREACH_SRCFAIL;
1029 					goto bad;
1030 				}
1031 				/*
1032 				 * Loose routing, and not at next destination
1033 				 * yet; nothing to do except forward.
1034 				 */
1035 				break;
1036 			}
1037 			off--;			/* 0 origin */
1038 			if ((off + sizeof(struct in_addr)) > optlen) {
1039 				/*
1040 				 * End of source route.  Should be for us.
1041 				 */
1042 				save_rte(cp, ip->ip_src);
1043 				break;
1044 			}
1045 
1046 			/*
1047 			 * locate outgoing interface
1048 			 */
1049 			bcopy((caddr_t)(cp + off), (caddr_t)&ipaddr.sin_addr,
1050 			    sizeof(ipaddr.sin_addr));
1051 			if (opt == IPOPT_SSRR) {
1052 #define	INA	struct in_ifaddr *
1053 #define	SA	struct sockaddr *
1054 			    if ((ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr)) == 0)
1055 				ia = (INA)ifa_ifwithnet((SA)&ipaddr);
1056 			} else
1057 				ia = ip_rtaddr(ipaddr.sin_addr);
1058 			if (ia == 0) {
1059 				type = ICMP_UNREACH;
1060 				code = ICMP_UNREACH_SRCFAIL;
1061 				goto bad;
1062 			}
1063 			ip->ip_dst = ipaddr.sin_addr;
1064 			bcopy((caddr_t)&ia->ia_addr.sin_addr,
1065 			    (caddr_t)(cp + off), sizeof(struct in_addr));
1066 			cp[IPOPT_OFFSET] += sizeof(struct in_addr);
1067 			/*
1068 			 * Let ip_intr's mcast routing check handle mcast pkts
1069 			 */
1070 			forward = !IN_MULTICAST(ip->ip_dst.s_addr);
1071 			break;
1072 
1073 		case IPOPT_RR:
1074 			if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
1075 				code = &cp[IPOPT_OLEN] - (u_char *)ip;
1076 				goto bad;
1077 			}
1078 			if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
1079 				code = &cp[IPOPT_OFFSET] - (u_char *)ip;
1080 				goto bad;
1081 			}
1082 
1083 			/*
1084 			 * If no space remains, ignore.
1085 			 */
1086 			off--;			/* 0 origin */
1087 			if ((off + sizeof(struct in_addr)) > optlen)
1088 				break;
1089 			bcopy((caddr_t)(&ip->ip_dst), (caddr_t)&ipaddr.sin_addr,
1090 			    sizeof(ipaddr.sin_addr));
1091 			/*
1092 			 * locate outgoing interface; if we're the destination,
1093 			 * use the incoming interface (should be same).
1094 			 */
1095 			if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) == 0 &&
1096 			    (ia = ip_rtaddr(ipaddr.sin_addr)) == 0) {
1097 				type = ICMP_UNREACH;
1098 				code = ICMP_UNREACH_HOST;
1099 				goto bad;
1100 			}
1101 			bcopy((caddr_t)&ia->ia_addr.sin_addr,
1102 			    (caddr_t)(cp + off), sizeof(struct in_addr));
1103 			cp[IPOPT_OFFSET] += sizeof(struct in_addr);
1104 			break;
1105 
1106 		case IPOPT_TS:
1107 			code = cp - (u_char *)ip;
1108 			bcopy(cp, &ipt, sizeof(struct ip_timestamp));
1109 			if (ipt.ipt_ptr < 5 || ipt.ipt_len < 5)
1110 				goto bad;
1111 			if (ipt.ipt_ptr - 1 + sizeof(n_time) > ipt.ipt_len) {
1112 				if (++ipt.ipt_oflw == 0)
1113 					goto bad;
1114 				break;
1115 			}
1116 			bcopy(cp + ipt.ipt_ptr - 1, &sin, sizeof sin);
1117 			switch (ipt.ipt_flg) {
1118 
1119 			case IPOPT_TS_TSONLY:
1120 				break;
1121 
1122 			case IPOPT_TS_TSANDADDR:
1123 				if (ipt.ipt_ptr - 1 + sizeof(n_time) +
1124 				    sizeof(struct in_addr) > ipt.ipt_len)
1125 					goto bad;
1126 				ipaddr.sin_addr = dst;
1127 				ia = (INA)ifaof_ifpforaddr((SA)&ipaddr,
1128 							    m->m_pkthdr.rcvif);
1129 				if (ia == 0)
1130 					continue;
1131 				bcopy((caddr_t)&ia->ia_addr.sin_addr,
1132 				    (caddr_t)&sin, sizeof(struct in_addr));
1133 				ipt.ipt_ptr += sizeof(struct in_addr);
1134 				break;
1135 
1136 			case IPOPT_TS_PRESPEC:
1137 				if (ipt.ipt_ptr - 1 + sizeof(n_time) +
1138 				    sizeof(struct in_addr) > ipt.ipt_len)
1139 					goto bad;
1140 				bcopy((caddr_t)&sin, (caddr_t)&ipaddr.sin_addr,
1141 				    sizeof(struct in_addr));
1142 				if (ifa_ifwithaddr((SA)&ipaddr) == 0)
1143 					continue;
1144 				ipt.ipt_ptr += sizeof(struct in_addr);
1145 				break;
1146 
1147 			default:
1148 				/* XXX can't take &ipt->ipt_flg */
1149 				code = (u_char *)&ipt.ipt_ptr -
1150 				    (u_char *)ip + 1;
1151 				goto bad;
1152 			}
1153 			ntime = iptime();
1154 			bcopy((caddr_t)&ntime, (caddr_t)cp + ipt.ipt_ptr - 1,
1155 			    sizeof(n_time));
1156 			ipt.ipt_ptr += sizeof(n_time);
1157 		}
1158 	}
1159 	if (forward && ipforwarding) {
1160 		ip_forward(m, 1);
1161 		return (1);
1162 	}
1163 	return (0);
1164 bad:
1165 	ip->ip_len -= ip->ip_hl << 2;   /* XXX icmp_error adds in hdr length */
1166 	icmp_error(m, type, code, 0, 0);
1167 	ipstat.ips_badoptions++;
1168 	return (1);
1169 }
1170 
1171 /*
1172  * Given address of next destination (final or next hop),
1173  * return internet address info of interface to be used to get there.
1174  */
1175 struct in_ifaddr *
1176 ip_rtaddr(dst)
1177 	 struct in_addr dst;
1178 {
1179 	register struct sockaddr_in *sin;
1180 
1181 	sin = satosin(&ipforward_rt.ro_dst);
1182 
1183 	if (ipforward_rt.ro_rt == 0 || dst.s_addr != sin->sin_addr.s_addr) {
1184 		if (ipforward_rt.ro_rt) {
1185 			RTFREE(ipforward_rt.ro_rt);
1186 			ipforward_rt.ro_rt = 0;
1187 		}
1188 		sin->sin_family = AF_INET;
1189 		sin->sin_len = sizeof(*sin);
1190 		sin->sin_addr = dst;
1191 
1192 		rtalloc(&ipforward_rt);
1193 	}
1194 	if (ipforward_rt.ro_rt == 0)
1195 		return ((struct in_ifaddr *)0);
1196 	return (ifatoia(ipforward_rt.ro_rt->rt_ifa));
1197 }
1198 
1199 /*
1200  * Save incoming source route for use in replies,
1201  * to be picked up later by ip_srcroute if the receiver is interested.
1202  */
1203 void
1204 save_rte(option, dst)
1205 	u_char *option;
1206 	struct in_addr dst;
1207 {
1208 	unsigned olen;
1209 
1210 	olen = option[IPOPT_OLEN];
1211 #ifdef DIAGNOSTIC
1212 	if (ipprintfs)
1213 		printf("save_rte: olen %d\n", olen);
1214 #endif /* 0 */
1215 	if (olen > sizeof(ip_srcrt) - (1 + sizeof(dst)))
1216 		return;
1217 	bcopy((caddr_t)option, (caddr_t)ip_srcrt.srcopt, olen);
1218 	ip_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr);
1219 	ip_srcrt.dst = dst;
1220 }
1221 
1222 /*
1223  * Check whether we do proxy ARP for this address and we point to ourselves.
1224  * Code shamelessly copied from arplookup().
1225  */
1226 static int
1227 ip_weadvertise(addr)
1228 	u_int32_t addr;
1229 {
1230 	register struct rtentry *rt;
1231 	register struct ifnet *ifp;
1232 	register struct ifaddr *ifa;
1233 	struct sockaddr_inarp sin;
1234 
1235 	sin.sin_len = sizeof(sin);
1236 	sin.sin_family = AF_INET;
1237 	sin.sin_addr.s_addr = addr;
1238 	sin.sin_other = SIN_PROXY;
1239 	rt = rtalloc1(sintosa(&sin), 0);
1240 	if (rt == 0)
1241 		return 0;
1242 
1243 	RTFREE(rt);
1244 
1245 	if ((rt->rt_flags & RTF_GATEWAY) || (rt->rt_flags & RTF_LLINFO) == 0 ||
1246 	    rt->rt_gateway->sa_family != AF_LINK) {
1247 		RTFREE(rt);
1248 		return 0;
1249 	}
1250 
1251 	for (ifp = ifnet.tqh_first; ifp != 0; ifp = ifp->if_list.tqe_next)
1252 		for (ifa = ifp->if_addrlist.tqh_first; ifa != 0;
1253 		    ifa = ifa->ifa_list.tqe_next) {
1254 			if (ifa->ifa_addr->sa_family != rt->rt_gateway->sa_family)
1255 				continue;
1256 
1257 			if (!bcmp(LLADDR((struct sockaddr_dl *)ifa->ifa_addr),
1258 			    LLADDR((struct sockaddr_dl *)rt->rt_gateway),
1259 			    ETHER_ADDR_LEN)) {
1260 				RTFREE(rt);
1261 				return 1;
1262 			}
1263 		}
1264 
1265 	RTFREE(rt);
1266 	return 0;
1267 }
1268 
1269 /*
1270  * Retrieve incoming source route for use in replies,
1271  * in the same form used by setsockopt.
1272  * The first hop is placed before the options, will be removed later.
1273  */
1274 struct mbuf *
1275 ip_srcroute()
1276 {
1277 	register struct in_addr *p, *q;
1278 	register struct mbuf *m;
1279 
1280 	if (ip_nhops == 0)
1281 		return ((struct mbuf *)0);
1282 	m = m_get(M_DONTWAIT, MT_SOOPTS);
1283 	if (m == 0)
1284 		return ((struct mbuf *)0);
1285 
1286 #define OPTSIZ	(sizeof(ip_srcrt.nop) + sizeof(ip_srcrt.srcopt))
1287 
1288 	/* length is (nhops+1)*sizeof(addr) + sizeof(nop + srcrt header) */
1289 	m->m_len = ip_nhops * sizeof(struct in_addr) + sizeof(struct in_addr) +
1290 	    OPTSIZ;
1291 #ifdef DIAGNOSTIC
1292 	if (ipprintfs)
1293 		printf("ip_srcroute: nhops %d mlen %d", ip_nhops, m->m_len);
1294 #endif
1295 
1296 	/*
1297 	 * First save first hop for return route
1298 	 */
1299 	p = &ip_srcrt.route[ip_nhops - 1];
1300 	*(mtod(m, struct in_addr *)) = *p--;
1301 #ifdef DIAGNOSTIC
1302 	if (ipprintfs)
1303 		printf(" hops %x", ntohl(mtod(m, struct in_addr *)->s_addr));
1304 #endif
1305 
1306 	/*
1307 	 * Copy option fields and padding (nop) to mbuf.
1308 	 */
1309 	ip_srcrt.nop = IPOPT_NOP;
1310 	ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF;
1311 	bcopy((caddr_t)&ip_srcrt.nop,
1312 	    mtod(m, caddr_t) + sizeof(struct in_addr), OPTSIZ);
1313 	q = (struct in_addr *)(mtod(m, caddr_t) +
1314 	    sizeof(struct in_addr) + OPTSIZ);
1315 #undef OPTSIZ
1316 	/*
1317 	 * Record return path as an IP source route,
1318 	 * reversing the path (pointers are now aligned).
1319 	 */
1320 	while (p >= ip_srcrt.route) {
1321 #ifdef DIAGNOSTIC
1322 		if (ipprintfs)
1323 			printf(" %x", ntohl(q->s_addr));
1324 #endif
1325 		*q++ = *p--;
1326 	}
1327 	/*
1328 	 * Last hop goes to final destination.
1329 	 */
1330 	*q = ip_srcrt.dst;
1331 #ifdef DIAGNOSTIC
1332 	if (ipprintfs)
1333 		printf(" %x\n", ntohl(q->s_addr));
1334 #endif
1335 	return (m);
1336 }
1337 
1338 /*
1339  * Strip out IP options, at higher
1340  * level protocol in the kernel.
1341  * Second argument is buffer to which options
1342  * will be moved, and return value is their length.
1343  * XXX should be deleted; last arg currently ignored.
1344  */
1345 void
1346 ip_stripoptions(m, mopt)
1347 	register struct mbuf *m;
1348 	struct mbuf *mopt;
1349 {
1350 	register int i;
1351 	struct ip *ip = mtod(m, struct ip *);
1352 	register caddr_t opts;
1353 	int olen;
1354 
1355 	olen = (ip->ip_hl<<2) - sizeof (struct ip);
1356 	opts = (caddr_t)(ip + 1);
1357 	i = m->m_len - (sizeof (struct ip) + olen);
1358 	bcopy(opts  + olen, opts, (unsigned)i);
1359 	m->m_len -= olen;
1360 	if (m->m_flags & M_PKTHDR)
1361 		m->m_pkthdr.len -= olen;
1362 	ip->ip_hl = sizeof(struct ip) >> 2;
1363 }
1364 
/*
 * Table of PRC_NCMDS error numbers; every slot holds either 0 or an
 * errno value.  Twenty-one initializers are listed, four per row; any
 * slots beyond those are implicitly zero.
 * NOTE(review): presumably indexed by PRC_* protocol control command to
 * give the error reported to the affected sockets -- confirm against the
 * users of this table.
 */
int inetctlerrmap[PRC_NCMDS] = {
	0,		0,		0,		0,
	0,		EMSGSIZE,	EHOSTDOWN,	EHOSTUNREACH,
	EHOSTUNREACH,	EHOSTUNREACH,	ECONNREFUSED,	ECONNREFUSED,
	EMSGSIZE,	EHOSTUNREACH,	0,		0,
	0,		0,		0,		0,
	ENOPROTOOPT
};
1373 
1374 /*
1375  * Forward a packet.  If some error occurs return the sender
1376  * an icmp packet.  Note we can't always generate a meaningful
1377  * icmp message because icmp doesn't have a large enough repertoire
1378  * of codes and types.
1379  *
1380  * If not forwarding, just drop the packet.  This could be confusing
1381  * if ipforwarding was zero but some routing protocol was advancing
1382  * us as a gateway to somewhere.  However, we must let the routing
1383  * protocol deal with that.
1384  *
1385  * The srcrt parameter indicates whether the packet is being forwarded
1386  * via a source route.
1387  */
1388 void
1389 ip_forward(m, srcrt)
1390 	struct mbuf *m;
1391 	int srcrt;
1392 {
1393 	register struct ip *ip = mtod(m, struct ip *);
1394 	register struct sockaddr_in *sin;
1395 	register struct rtentry *rt;
1396 	int error, type = 0, code = 0;
1397 	struct mbuf *mcopy;
1398 	n_long dest;
1399 	struct ifnet *destifp;
1400 #ifdef IPSEC
1401 	struct ifnet dummyifp;
1402 #endif
1403 
1404 	dest = 0;
1405 #ifdef DIAGNOSTIC
1406 	if (ipprintfs)
1407 		printf("forward: src %x dst %x ttl %x\n", ip->ip_src.s_addr,
1408 		    ip->ip_dst.s_addr, ip->ip_ttl);
1409 #endif
1410 	if (m->m_flags & M_BCAST || in_canforward(ip->ip_dst) == 0) {
1411 		ipstat.ips_cantforward++;
1412 		m_freem(m);
1413 		return;
1414 	}
1415 	if (ip->ip_ttl <= IPTTLDEC) {
1416 		icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, dest, 0);
1417 		return;
1418 	}
1419 	ip->ip_ttl -= IPTTLDEC;
1420 
1421 	sin = satosin(&ipforward_rt.ro_dst);
1422 	if ((rt = ipforward_rt.ro_rt) == 0 ||
1423 	    ip->ip_dst.s_addr != sin->sin_addr.s_addr) {
1424 		if (ipforward_rt.ro_rt) {
1425 			RTFREE(ipforward_rt.ro_rt);
1426 			ipforward_rt.ro_rt = 0;
1427 		}
1428 		sin->sin_family = AF_INET;
1429 		sin->sin_len = sizeof(*sin);
1430 		sin->sin_addr = ip->ip_dst;
1431 
1432 		rtalloc(&ipforward_rt);
1433 		if (ipforward_rt.ro_rt == 0) {
1434 			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest, 0);
1435 			return;
1436 		}
1437 		rt = ipforward_rt.ro_rt;
1438 	}
1439 
1440 	/*
1441 	 * Save at most 68 bytes of the packet in case
1442 	 * we need to generate an ICMP message to the src.
1443 	 * Pullup to avoid sharing mbuf cluster between m and mcopy.
1444 	 */
1445 	mcopy = m_copym(m, 0, imin((int)ip->ip_len, 68), M_DONTWAIT);
1446 	if (mcopy)
1447 		mcopy = m_pullup(mcopy, ip->ip_hl << 2);
1448 
1449 	/*
1450 	 * If forwarding packet using same interface that it came in on,
1451 	 * perhaps should send a redirect to sender to shortcut a hop.
1452 	 * Only send redirect if source is sending directly to us,
1453 	 * and if packet was not source routed (or has any options).
1454 	 * Also, don't send redirect if forwarding using a default route
1455 	 * or a route modified by a redirect.
1456 	 * Don't send redirect if we advertise destination's arp address
1457 	 * as ours (proxy arp).
1458 	 */
1459 	if (rt->rt_ifp == m->m_pkthdr.rcvif &&
1460 	    (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 &&
1461 	    satosin(rt_key(rt))->sin_addr.s_addr != 0 &&
1462 	    ipsendredirects && !srcrt &&
1463 	    !ip_weadvertise(satosin(rt_key(rt))->sin_addr.s_addr)) {
1464 		if (rt->rt_ifa &&
1465 		    (ip->ip_src.s_addr & ifatoia(rt->rt_ifa)->ia_subnetmask) ==
1466 		    ifatoia(rt->rt_ifa)->ia_subnet) {
1467 		    if (rt->rt_flags & RTF_GATEWAY)
1468 			dest = satosin(rt->rt_gateway)->sin_addr.s_addr;
1469 		    else
1470 			dest = ip->ip_dst.s_addr;
1471 		    /* Router requirements says to only send host redirects */
1472 		    type = ICMP_REDIRECT;
1473 		    code = ICMP_REDIRECT_HOST;
1474 #ifdef DIAGNOSTIC
1475 		    if (ipprintfs)
1476 			printf("redirect (%d) to %x\n", code, (u_int32_t)dest);
1477 #endif
1478 		}
1479 	}
1480 
1481 	error = ip_output(m, (struct mbuf *)0, &ipforward_rt,
1482 	    (IP_FORWARDING | (ip_directedbcast ? IP_ALLOWBROADCAST : 0)),
1483 	    0, NULL, NULL);
1484 	if (error)
1485 		ipstat.ips_cantforward++;
1486 	else {
1487 		ipstat.ips_forward++;
1488 		if (type)
1489 			ipstat.ips_redirectsent++;
1490 		else {
1491 			if (mcopy)
1492 				m_freem(mcopy);
1493 			return;
1494 		}
1495 	}
1496 	if (mcopy == NULL)
1497 		return;
1498 	destifp = NULL;
1499 
1500 	switch (error) {
1501 
1502 	case 0:				/* forwarded, but need redirect */
1503 		/* type, code set above */
1504 		break;
1505 
1506 	case ENETUNREACH:		/* shouldn't happen, checked above */
1507 	case EHOSTUNREACH:
1508 	case ENETDOWN:
1509 	case EHOSTDOWN:
1510 	default:
1511 		type = ICMP_UNREACH;
1512 		code = ICMP_UNREACH_HOST;
1513 		break;
1514 
1515 	case EMSGSIZE:
1516 		type = ICMP_UNREACH;
1517 		code = ICMP_UNREACH_NEEDFRAG;
1518 
1519 #ifdef IPSEC
1520 		if (ipforward_rt.ro_rt) {
1521 			struct rtentry *rt = ipforward_rt.ro_rt;
1522 			destifp = ipforward_rt.ro_rt->rt_ifp;
1523 			/*
1524 			 * XXX BUG ALERT
1525 			 * The "dummyifp" code relies upon the fact
1526 			 * that icmp_error() touches only ifp->if_mtu.
1527 			 */
1528 			if (rt->rt_rmx.rmx_mtu) {
1529 				dummyifp.if_mtu = rt->rt_rmx.rmx_mtu;
1530 				destifp = &dummyifp;
1531 			}
1532 		}
1533 #endif /*IPSEC*/
1534 		ipstat.ips_cantfrag++;
1535 		break;
1536 
1537 	case ENOBUFS:
1538 		type = ICMP_SOURCEQUENCH;
1539 		code = 0;
1540 		break;
1541 	}
1542 
1543 	icmp_error(mcopy, type, code, dest, destifp);
1544 }
1545 
1546 int
1547 ip_sysctl(name, namelen, oldp, oldlenp, newp, newlen)
1548 	int *name;
1549 	u_int namelen;
1550 	void *oldp;
1551 	size_t *oldlenp;
1552 	void *newp;
1553 	size_t newlen;
1554 {
1555 	int error;
1556 
1557 	/* All sysctl names at this level are terminal. */
1558 	if (namelen != 1)
1559 		return (ENOTDIR);
1560 
1561 	switch (name[0]) {
1562 	case IPCTL_FORWARDING:
1563 		return (sysctl_int(oldp, oldlenp, newp, newlen, &ipforwarding));
1564 	case IPCTL_SENDREDIRECTS:
1565 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1566 			&ipsendredirects));
1567 	case IPCTL_DEFTTL:
1568 		return (sysctl_int(oldp, oldlenp, newp, newlen, &ip_defttl));
1569 #ifdef notyet
1570 	case IPCTL_DEFMTU:
1571 		return (sysctl_int(oldp, oldlenp, newp, newlen, &ip_mtu));
1572 #endif
1573 	case IPCTL_SOURCEROUTE:
1574 		/*
1575 		 * Don't allow this to change in a secure environment.
1576 		 */
1577 		if (newp && securelevel > 0)
1578 			return (EPERM);
1579 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1580 		    &ip_dosourceroute));
1581 	case IPCTL_DIRECTEDBCAST:
1582 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1583 		    &ip_directedbcast));
1584 	case IPCTL_MTUDISC:
1585 		error = sysctl_int(oldp, oldlenp, newp, newlen,
1586 		    &ip_mtudisc);
1587 		if (ip_mtudisc != 0 && ip_mtudisc_timeout_q == NULL) {
1588 			ip_mtudisc_timeout_q =
1589 			    rt_timer_queue_create(ip_mtudisc_timeout);
1590 		} else if (ip_mtudisc == 0 && ip_mtudisc_timeout_q != NULL) {
1591 			rt_timer_queue_destroy(ip_mtudisc_timeout_q, TRUE);
1592 			Free(ip_mtudisc_timeout_q);
1593 			ip_mtudisc_timeout_q = NULL;
1594 		}
1595 		return error;
1596 	case IPCTL_MTUDISCTIMEOUT:
1597 		error = sysctl_int(oldp, oldlenp, newp, newlen,
1598 		   &ip_mtudisc_timeout);
1599 		if (ip_mtudisc_timeout_q != NULL)
1600 			rt_timer_queue_change(ip_mtudisc_timeout_q,
1601 					      ip_mtudisc_timeout);
1602 		return (error);
1603 	case IPCTL_IPPORT_FIRSTAUTO:
1604 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1605 		    &ipport_firstauto));
1606 	case IPCTL_IPPORT_LASTAUTO:
1607 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1608 		    &ipport_lastauto));
1609 	case IPCTL_IPPORT_HIFIRSTAUTO:
1610 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1611 		    &ipport_hifirstauto));
1612 	case IPCTL_IPPORT_HILASTAUTO:
1613 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1614 		    &ipport_hilastauto));
1615 	case IPCTL_IPPORT_MAXQUEUE:
1616 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1617 		    &ip_maxqueue));
1618 	case IPCTL_ENCDEBUG:
1619 		return (sysctl_int(oldp, oldlenp, newp, newlen, &encdebug));
1620 	case IPCTL_IPSEC_EMBRYONIC_SA_TIMEOUT:
1621 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1622 				   &ipsec_keep_invalid));
1623 	case IPCTL_IPSEC_REQUIRE_PFS:
1624 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1625 				   &ipsec_require_pfs));
1626 	case IPCTL_IPSEC_SOFT_ALLOCATIONS:
1627 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1628 				   &ipsec_soft_allocations));
1629 	case IPCTL_IPSEC_ALLOCATIONS:
1630 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1631 				   &ipsec_exp_allocations));
1632 	case IPCTL_IPSEC_SOFT_BYTES:
1633 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1634 				   &ipsec_soft_bytes));
1635 	case IPCTL_IPSEC_BYTES:
1636 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1637 				   &ipsec_exp_bytes));
1638 	case IPCTL_IPSEC_TIMEOUT:
1639 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1640 				   &ipsec_exp_timeout));
1641 	case IPCTL_IPSEC_SOFT_TIMEOUT:
1642 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1643 				   &ipsec_soft_timeout));
1644 	case IPCTL_IPSEC_SOFT_FIRSTUSE:
1645 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1646 				   &ipsec_soft_first_use));
1647 	case IPCTL_IPSEC_FIRSTUSE:
1648 		return (sysctl_int(oldp, oldlenp, newp, newlen,
1649 				   &ipsec_exp_first_use));
1650 	case IPCTL_IPSEC_ENC_ALGORITHM:
1651 	        return (sysctl_tstring(oldp, oldlenp, newp, newlen,
1652 				       ipsec_def_enc, sizeof(ipsec_def_enc)));
1653 	case IPCTL_IPSEC_AUTH_ALGORITHM:
1654 	        return (sysctl_tstring(oldp, oldlenp, newp, newlen,
1655 				       ipsec_def_auth,
1656 				       sizeof(ipsec_def_auth)));
1657 	case IPCTL_IPSEC_EXPIRE_ACQUIRE:
1658 	        return (sysctl_int(oldp, oldlenp, newp, newlen,
1659 				   &ipsec_expire_acquire));
1660 	case IPCTL_IPSEC_IPCOMP_ALGORITHM:
1661 	        return (sysctl_tstring(oldp, oldlenp, newp, newlen,
1662 				       ipsec_def_comp,
1663 				       sizeof(ipsec_def_comp)));
1664 	default:
1665 		return (EOPNOTSUPP);
1666 	}
1667 	/* NOTREACHED */
1668 }
1669