xref: /dflybsd-src/sys/netinet/ip_input.c (revision a361ab312536a661d74caf5630c7ae20bcd8e3e4)
1  /*
2   * Copyright (c) 2003, 2004 Jeffrey M. Hsu.  All rights reserved.
3   * Copyright (c) 2003, 2004 The DragonFly Project.  All rights reserved.
4   *
5   * This code is derived from software contributed to The DragonFly Project
6   * by Jeffrey M. Hsu.
7   *
8   * Redistribution and use in source and binary forms, with or without
9   * modification, are permitted provided that the following conditions
10   * are met:
11   * 1. Redistributions of source code must retain the above copyright
12   *    notice, this list of conditions and the following disclaimer.
13   * 2. Redistributions in binary form must reproduce the above copyright
14   *    notice, this list of conditions and the following disclaimer in the
15   *    documentation and/or other materials provided with the distribution.
16   * 3. Neither the name of The DragonFly Project nor the names of its
17   *    contributors may be used to endorse or promote products derived
18   *    from this software without specific, prior written permission.
19   *
20   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21   * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22   * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23   * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
24   * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25   * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
26   * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27   * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
28   * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
30   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31   * SUCH DAMAGE.
32   */
33  
34  /*
35   * Copyright (c) 1982, 1986, 1988, 1993
36   *	The Regents of the University of California.  All rights reserved.
37   *
38   * Redistribution and use in source and binary forms, with or without
39   * modification, are permitted provided that the following conditions
40   * are met:
41   * 1. Redistributions of source code must retain the above copyright
42   *    notice, this list of conditions and the following disclaimer.
43   * 2. Redistributions in binary form must reproduce the above copyright
44   *    notice, this list of conditions and the following disclaimer in the
45   *    documentation and/or other materials provided with the distribution.
46   * 3. Neither the name of the University nor the names of its contributors
47   *    may be used to endorse or promote products derived from this software
48   *    without specific prior written permission.
49   *
50   * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51   * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53   * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54   * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55   * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56   * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57   * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58   * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59   * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60   * SUCH DAMAGE.
61   *
62   *	@(#)ip_input.c	8.2 (Berkeley) 1/4/94
63   * $FreeBSD: src/sys/netinet/ip_input.c,v 1.130.2.52 2003/03/07 07:01:28 silby Exp $
64   */
65  
66  #define	_IP_VHL
67  
68  #include "opt_bootp.h"
69  #include "opt_ipdn.h"
70  #include "opt_ipdivert.h"
71  #include "opt_ipstealth.h"
72  #include "opt_rss.h"
73  
74  #include <sys/param.h>
75  #include <sys/systm.h>
76  #include <sys/mbuf.h>
77  #include <sys/malloc.h>
78  #include <sys/mpipe.h>
79  #include <sys/domain.h>
80  #include <sys/protosw.h>
81  #include <sys/socket.h>
82  #include <sys/time.h>
83  #include <sys/globaldata.h>
84  #include <sys/thread.h>
85  #include <sys/kernel.h>
86  #include <sys/syslog.h>
87  #include <sys/sysctl.h>
88  #include <sys/in_cksum.h>
89  #include <sys/lock.h>
90  
91  #include <sys/mplock2.h>
92  
93  #include <machine/stdarg.h>
94  
95  #include <net/if.h>
96  #include <net/if_types.h>
97  #include <net/if_var.h>
98  #include <net/if_dl.h>
99  #include <net/pfil.h>
100  #include <net/route.h>
101  #include <net/netisr2.h>
102  
103  #include <netinet/in.h>
104  #include <netinet/in_systm.h>
105  #include <netinet/in_var.h>
106  #include <netinet/ip.h>
107  #include <netinet/in_pcb.h>
108  #include <netinet/ip_var.h>
109  #include <netinet/ip_icmp.h>
110  #include <netinet/ip_divert.h>
111  #include <netinet/ip_flow.h>
112  
113  #include <sys/thread2.h>
114  #include <sys/msgport2.h>
115  #include <net/netmsg2.h>
116  
117  #include <sys/socketvar.h>
118  
119  #include <net/ipfw/ip_fw.h>
120  #include <net/dummynet/ip_dummynet.h>
121  
/*
 * RSVP state.  ip_input() tests rsvp_on to claim every IPPROTO_RSVP packet
 * for local delivery.  ip_rsvp_on and ip_rsvpd are presumably maintained by
 * the RSVP socket code -- TODO confirm, they are not used in ip_input().
 */
__read_mostly int rsvp_on = 0;
__read_mostly static int ip_rsvp_on;
struct socket *ip_rsvpd;

/*
 * net.inet.ip.forwarding: when zero, ip_input() drops packets that are
 * not addressed to this host instead of handing them to ip_forward().
 */
__read_mostly int ipforwarding = 0;
SYSCTL_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_RW,
    &ipforwarding, 0, "Enable IP forwarding between interfaces");

__read_mostly static int ipsendredirects = 1; /* XXX */
SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_RW,
    &ipsendredirects, 0, "Enable sending IP redirects");

/*
 * Initialised to IPDEFTTL.
 * NOTE(review): the sysctl description says "Maximum TTL" although the
 * variable name suggests a default TTL -- confirm the intended wording.
 */
__read_mostly int ip_defttl = IPDEFTTL;
SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW,
    &ip_defttl, 0, "Maximum TTL on IP packets");

__read_mostly static int ip_dosourceroute = 0;
SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute, CTLFLAG_RW,
    &ip_dosourceroute, 0, "Enable forwarding source routed IP packets");

__read_mostly static int ip_acceptsourceroute = 0;
SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute,
    CTLFLAG_RW, &ip_acceptsourceroute, 0,
    "Enable accepting source routed IP packets");

/*
 * Fragment reassembly limits.  Both are given their initial values in
 * ip_init(); ip_reass() refuses all fragments when either one is zero.
 */
__read_mostly static int maxnipq;
SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragpackets, CTLFLAG_RW,
    &maxnipq, 0,
    "Maximum number of IPv4 fragment reassembly queue entries");

__read_mostly static int maxfragsperpacket;
SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_RW,
    &maxfragsperpacket, 0,
    "Maximum number of IPv4 fragments allowed per packet");

__read_mostly static int ip_sendsourcequench = 0;
SYSCTL_INT(_net_inet_ip, OID_AUTO, sendsourcequench, CTLFLAG_RW,
    &ip_sendsourcequench, 0,
    "Enable the transmission of source quench packets");

__read_mostly int ip_do_randomid = 1;
SYSCTL_INT(_net_inet_ip, OID_AUTO, random_id, CTLFLAG_RW,
    &ip_do_randomid, 0,
    "Assign random ip_id values");
/*
 * XXX - Setting ip_checkinterface mostly implements the receive side of
 * the Strong ES model described in RFC 1122, but since the routing table
 * and transmit implementation do not implement the Strong ES model,
 * setting this to 1 results in an odd hybrid.
 *
 * XXX - ip_checkinterface currently must be disabled if you use ipnat
 * to translate the destination address to another local interface.
 *
 * XXX - ip_checkinterface must be disabled if you add IP aliases
 * to the loopback interface instead of the interface where the
 * packets for those addresses are received.
 *
 * ip_input() only honours this knob when ipforwarding is off, the packet
 * has a non-loopback receive interface, and no 'ipfw fwd' next hop is set.
 */
__read_mostly static int ip_checkinterface = 0;
SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_RW,
    &ip_checkinterface, 0, "Verify packet arrives on correct interface");
182  
#ifdef RSS_DEBUG
/* Read-only debug counters, bumped with atomic_add_long(). */

/* Incremented by ip_rehashm() each time a packet hash is recomputed. */
static u_long ip_rehash_count = 0;
SYSCTL_ULONG(_net_inet_ip, OID_AUTO, rehash_count, CTLFLAG_RD,
    &ip_rehash_count, 0, "Number of packets rehashed by IP");

/* Incremented by ip_input() when the protocol input runs on this thread. */
static u_long ip_dispatch_fast = 0;
SYSCTL_ULONG(_net_inet_ip, OID_AUTO, dispatch_fast_count, CTLFLAG_RD,
    &ip_dispatch_fast, 0, "Number of packets handled on current CPU");

/* Incremented by ip_transport_redispatch() for every forwarded message. */
static u_long ip_dispatch_slow = 0;
SYSCTL_ULONG(_net_inet_ip, OID_AUTO, dispatch_slow_count, CTLFLAG_RD,
    &ip_dispatch_slow, 0, "Number of packets messaged to another CPU");
#endif

#ifdef DIAGNOSTIC
static int ipprintfs = 0;
#endif

extern	struct domain inetdomain;
extern	struct protosw inetsw[];
/*
 * Maps an IP protocol number to an index into inetsw[].  ip_init() points
 * every slot at the raw IP entry first and then overrides the slots of the
 * protocols found in the inet domain's protocol switch.
 */
u_char	ip_protox[IPPROTO_MAX];
/* Per-cpu address list and hash table, set up by ip_init() for all cpus. */
struct	in_ifaddrhead in_ifaddrheads[MAXCPU];	/* first inet address */
struct	in_ifaddrhashhead *in_ifaddrhashtbls[MAXCPU];
						/* inet addr hash table */
__read_mostly u_long	in_ifaddrhmask;		/* mask for hash table */

/* One mbuf header per netisr cpu, preallocated in ip_init(). */
static struct mbuf *ipforward_mtemp[MAXCPU];

/* Per-cpu IP statistics; exported and updated through sysctl_ipstats(). */
struct ip_stats ipstats_percpu[MAXCPU] __cachealign;
212  
213  static int
214  sysctl_ipstats(SYSCTL_HANDLER_ARGS)
215  {
216  	int cpu, error = 0;
217  
218  	for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
219  		if ((error = SYSCTL_OUT(req, &ipstats_percpu[cpu],
220  					sizeof(struct ip_stats))))
221  			break;
222  		if ((error = SYSCTL_IN(req, &ipstats_percpu[cpu],
223  				       sizeof(struct ip_stats))))
224  			break;
225  	}
226  
227  	return (error);
228  }
229  SYSCTL_PROC(_net_inet_ip, IPCTL_STATS, stats, (CTLTYPE_OPAQUE | CTLFLAG_RW),
230      0, 0, sysctl_ipstats, "S,ip_stats", "IP statistics");
231  
/*
 * Packet reassembly stuff
 *
 * Each per-cpu fragment queue has IPREASS_NHASH (64) hash buckets.
 * IPREASS_HASH(x,y) builds an 8-bit value from bits 0-3 and bits 8-11
 * of x, XORs it with y and masks the result down to a bucket index.
 * ip_reass() calls it with the source address and the IP id.
 */
#define	IPREASS_NHASH_LOG2	6
#define	IPREASS_NHASH		(1 << IPREASS_NHASH_LOG2)
#define	IPREASS_HMASK		(IPREASS_NHASH - 1)
#define	IPREASS_HASH(x,y)						\
    (((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK)

TAILQ_HEAD(ipqhead, ipq);
/*
 * Per-cpu fragment reassembly state.  ip_init() initialises the callout,
 * both netmsgs and every bucket head for each netisr cpu.
 */
struct ipfrag_queue {
	int			nipq;		/* presumably the number of queued
						 * datagrams -- TODO confirm */
	volatile int		draining;
	struct netmsg_base	timeo_netmsg;	/* handled by ipfrag_timeo_dispatch */
	struct callout		timeo_ch;	/* fires ipfrag_timeo() */
	struct netmsg_base	drain_netmsg;	/* handled by ipfrag_drain_dispatch */
	struct ipqhead		ipq[IPREASS_NHASH]; /* hash buckets */
} __cachealign;

static struct ipfrag_queue	ipfrag_queue_pcpu[MAXCPU];
250  
#ifdef IPCTL_DEFMTU
SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW,
    &ip_mtu, 0, "Default MTU");
#endif

/*
 * Without the IPSTEALTH option ipstealth is a constant 0, so the
 * "ipstealth && ..." test in ip_input() can never be true.
 */
#ifdef IPSTEALTH
static int ipstealth = 0;
SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW, &ipstealth, 0, "");
#else
static const int ipstealth = 0;
#endif

/* Divert hook; presumably installed by the divert code -- TODO confirm. */
struct mbuf *(*ip_divert_p)(struct mbuf *, int, int);

/* AF_INET pfil head: registered by ip_init(), run by ip_input(). */
struct pfil_head inet_pfil_hook;
266  
/*
 * struct ip_srcrt_opt is used to store packet state while it travels
 * through the stack.
 *
 * XXX Note that the code even makes assumptions on the size and
 * alignment of fields inside struct ip_srcrt so e.g. adding some
 * fields will break the code.  This needs to be fixed.
 *
 * We need to save the IP options in case a protocol wants to respond
 * to an incoming packet over the same route if the packet got here
 * using IP source routing.  This allows connection establishment and
 * maintenance when the remote end is on a network that is not known
 * to us.
 */
struct ip_srcrt {
	struct	in_addr dst;			/* final destination */
	char	nop;				/* one NOP to align */
	char	srcopt[IPOPT_OFFSET + 1];	/* OPTVAL, OLEN and OFFSET */
	struct	in_addr route[MAX_IPOPTLEN/sizeof(struct in_addr)];
};

/* A saved source route plus a hop count (presumably the valid route[] entries). */
struct ip_srcrt_opt {
	int		ip_nhops;
	struct ip_srcrt	ip_srcrt;
};
292  
/*
 * IPFRAG_MPIPE_MAX is the upper limit handed to mpipe_init() for the
 * struct ipq pipe.  MAXIPFRAG_MIN (32 with the values below) is the
 * floor ip_init() applies to maxnipq.
 */
#define IPFRAG_MPIPE_MAX	4096
#define MAXIPFRAG_MIN		((IPFRAG_MPIPE_MAX * 2) / 256)

/* Delay, in ticks, used when ip_init() arms the per-cpu fragment callout. */
#define IPFRAG_TIMEO		(hz / PR_SLOWHZ)

static MALLOC_DEFINE(M_IPQ, "ipq", "IP Fragment Management");
static struct malloc_pipe ipq_mpipe;

/* Forward declarations for file-local helpers. */
static void		save_rte(struct mbuf *, u_char *, struct in_addr);
static int		ip_dooptions(struct mbuf *m, int, struct sockaddr_in *);
static void		ip_freef(struct ipfrag_queue *, struct ipqhead *,
			    struct ipq *);
static void		ip_input_handler(netmsg_t);
static void		ip_forward_redispatch(struct lwkt_port *port,
			    struct mbuf *m, boolean_t srcrt);

static void		ipfrag_timeo_dispatch(netmsg_t);
static void		ipfrag_timeo(void *);
static void		ipfrag_drain_dispatch(netmsg_t);
312  
313  /*
314   * IP initialization: fill in IP protocol switch table.
315   * All protocols not implemented in kernel go to raw IP protocol handler.
316   */
317  void
318  ip_init(void)
319  {
320  	struct ipfrag_queue *fragq;
321  	struct protosw *pr;
322  	int cpu, i;
323  
324  	/*
325  	 * Make sure we can handle a reasonable number of fragments but
326  	 * cap it at IPFRAG_MPIPE_MAX.
327  	 */
328  	mpipe_init(&ipq_mpipe, M_IPQ, sizeof(struct ipq),
329  	    IFQ_MAXLEN, IPFRAG_MPIPE_MAX, 0, NULL, NULL, NULL);
330  
331  	/*
332  	 * Make in_ifaddrhead and in_ifaddrhashtbl available on all CPUs,
333  	 * since they could be accessed by any threads.
334  	 */
335  	for (cpu = 0; cpu < ncpus; ++cpu) {
336  		TAILQ_INIT(&in_ifaddrheads[cpu]);
337  		in_ifaddrhashtbls[cpu] =
338  		    hashinit(INADDR_NHASH, M_IFADDR, &in_ifaddrhmask);
339  	}
340  
341  	pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
342  	if (pr == NULL)
343  		panic("ip_init");
344  	for (i = 0; i < IPPROTO_MAX; i++)
345  		ip_protox[i] = pr - inetsw;
346  	for (pr = inetdomain.dom_protosw;
347  	     pr < inetdomain.dom_protoswNPROTOSW; pr++) {
348  		if (pr->pr_domain->dom_family == PF_INET && pr->pr_protocol) {
349  			if (pr->pr_protocol != IPPROTO_RAW)
350  				ip_protox[pr->pr_protocol] = pr - inetsw;
351  		}
352  	}
353  
354  	inet_pfil_hook.ph_type = PFIL_TYPE_AF;
355  	inet_pfil_hook.ph_af = AF_INET;
356  	if ((i = pfil_head_register(&inet_pfil_hook)) != 0) {
357  		kprintf("%s: WARNING: unable to register pfil hook, "
358  			"error %d\n", __func__, i);
359  	}
360  
361  	maxnipq = (nmbclusters / 32) / netisr_ncpus;
362  	if (maxnipq < MAXIPFRAG_MIN)
363  		maxnipq = MAXIPFRAG_MIN;
364  	maxfragsperpacket = 16;
365  
366  	ip_id = time_second & 0xffff;	/* time_second survives reboots */
367  
368  	for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
369  		/*
370  		 * Initialize IP statistics counters for each CPU.
371  		 */
372  		bzero(&ipstats_percpu[cpu], sizeof(struct ip_stats));
373  
374  		/*
375  		 * Preallocate mbuf template for forwarding
376  		 */
377  		MGETHDR(ipforward_mtemp[cpu], M_WAITOK, MT_DATA);
378  
379  		/*
380  		 * Initialize per-cpu ip fragments queues
381  		 */
382  		fragq = &ipfrag_queue_pcpu[cpu];
383  		for (i = 0; i < IPREASS_NHASH; i++)
384  			TAILQ_INIT(&fragq->ipq[i]);
385  
386  		callout_init_mp(&fragq->timeo_ch);
387  		netmsg_init(&fragq->timeo_netmsg, NULL, &netisr_adone_rport,
388  			    MSGF_PRIORITY, ipfrag_timeo_dispatch);
389  		netmsg_init(&fragq->drain_netmsg, NULL, &netisr_adone_rport,
390  			    MSGF_PRIORITY, ipfrag_drain_dispatch);
391  	}
392  
393  	netisr_register(NETISR_IP, ip_input_handler, ip_hashfn);
394  	netisr_register_hashcheck(NETISR_IP, ip_hashcheck);
395  
396  	for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
397  		fragq = &ipfrag_queue_pcpu[cpu];
398  		callout_reset_bycpu(&fragq->timeo_ch, IPFRAG_TIMEO,
399  				    ipfrag_timeo, NULL, cpu);
400  	}
401  
402  	ip_porthash_trycount = 2 * netisr_ncpus;
403  }
404  
405  /* Do transport protocol processing. */
406  static void
407  transport_processing_oncpu(struct mbuf *m, int hlen, struct ip *ip)
408  {
409  	const struct protosw *pr = &inetsw[ip_protox[ip->ip_p]];
410  
411  	/*
412  	 * Switch out to protocol's input routine.
413  	 */
414  	PR_GET_MPLOCK(pr);
415  	pr->pr_input(&m, &hlen, ip->ip_p);
416  	PR_REL_MPLOCK(pr);
417  }
418  
419  static void
420  transport_processing_handler(netmsg_t msg)
421  {
422  	struct netmsg_packet *pmsg = &msg->packet;
423  	struct ip *ip;
424  	int hlen;
425  
426  	ip = mtod(pmsg->nm_packet, struct ip *);
427  	hlen = pmsg->base.lmsg.u.ms_result;
428  
429  	transport_processing_oncpu(pmsg->nm_packet, hlen, ip);
430  	/* msg was embedded in the mbuf, do not reply! */
431  }
432  
433  static void
434  ip_input_handler(netmsg_t msg)
435  {
436  	ip_input(msg->packet.nm_packet);
437  	/* msg was embedded in the mbuf, do not reply! */
438  }
439  
440  /*
441   * IP input routine.  Checksum and byte swap header.  If fragmented
442   * try to reassemble.  Process options.  Pass to next level.
443   */
444  void
445  ip_input(struct mbuf *m)
446  {
447  	struct ip *ip;
448  	struct in_ifaddr *ia = NULL;
449  	struct in_ifaddr_container *iac;
450  	int hlen, checkif;
451  	u_short sum;
452  	uint16_t ip_len;
453  	struct in_addr pkt_dst;
454  	boolean_t using_srcrt = FALSE;		/* forward (by PFIL_HOOKS) */
455  	struct in_addr odst;			/* original dst address(NAT) */
456  	struct m_tag *mtag;
457  	struct sockaddr_in *next_hop = NULL;
458  	lwkt_port_t port;
459  
460  	ASSERT_NETISR_NCPUS(mycpuid);
461  	M_ASSERTPKTHDR(m);
462  
463  	if (m->m_len < sizeof(struct ip)) {
464  		kprintf("Issuer to ip_input failed to check IP header atomicy (%d)\n",
465  			m->m_len);
466  		ipstat.ips_badlen++;
467  		goto bad;
468  	}
469  #if 0
470  	/* length checks already done in ip_hashfn() */
471  	KASSERT(m->m_len >= sizeof(struct ip), ("IP header not in one mbuf"));
472  #endif
473  
474  	/*
475  	 * This routine is called from numerous places which may not have
476  	 * characterized the packet.
477  	 */
478  	ip = mtod(m, struct ip *);
479  	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
480  	    (ip->ip_off & htons(IP_MF | IP_OFFMASK)))
481  	{
482  		/*
483  		 * Force hash recalculation for fragments and multicast
484  		 * packets; hardware may not do it correctly.
485  		 * XXX add flag to indicate the hash is from hardware
486  		 */
487  		m->m_flags &= ~M_HASH;
488  	}
489  	if ((m->m_flags & M_HASH) == 0) {
490  		ip_hashfn(&m, 0);
491  		if (m == NULL)
492  			return;
493  		KKASSERT(m->m_flags & M_HASH);
494  
495  		if (&curthread->td_msgport !=
496  		    netisr_hashport(m->m_pkthdr.hash)) {
497  			netisr_queue(NETISR_IP, m);
498  			/* Requeued to other netisr msgport; done */
499  			return;
500  		}
501  
502  		/* mbuf could have been changed */
503  		ip = mtod(m, struct ip *);
504  	}
505  
506  	/*
507  	 * Pull out certain tags
508  	 */
509  	if (m->m_pkthdr.fw_flags & IPFORWARD_MBUF_TAGGED) {
510  		/* Next hop */
511  		mtag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
512  		KKASSERT(mtag != NULL);
513  		next_hop = m_tag_data(mtag);
514  	}
515  
516  	if (m->m_pkthdr.fw_flags &
517  	    (DUMMYNET_MBUF_TAGGED | IPFW_MBUF_CONTINUE)) {
518  		/*
519  		 * - Dummynet already filtered this packet.
520  		 * - This packet was processed by ipfw on another
521  		 *   cpu, and the rest of the ipfw processing should
522  		 *   be carried out on this cpu.
523  		 */
524  		ip = mtod(m, struct ip *);
525  		hlen = IP_VHL_HL(ip->ip_vhl) << 2;
526  		goto iphack;
527  	}
528  
529  	ipstat.ips_total++;
530  
531  	if (IP_VHL_V(ip->ip_vhl) != IPVERSION) {
532  		ipstat.ips_badvers++;
533  		goto bad;
534  	}
535  
536  	hlen = IP_VHL_HL(ip->ip_vhl) << 2;
537  	/* length checks already done in ip_hashfn() */
538  	KASSERT(hlen >= sizeof(struct ip), ("IP header len too small"));
539  	KASSERT(m->m_len >= hlen, ("complete IP header not in one mbuf"));
540  
541  	/* 127/8 must not appear on wire - RFC1122 */
542  	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
543  	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
544  		if (!(m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK)) {
545  			ipstat.ips_badaddr++;
546  			goto bad;
547  		}
548  	}
549  
550  	if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) {
551  		sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID);
552  	} else {
553  		if (hlen == sizeof(struct ip))
554  			sum = in_cksum_hdr(ip);
555  		else
556  			sum = in_cksum(m, hlen);
557  	}
558  	if (sum != 0) {
559  		ipstat.ips_badsum++;
560  		goto bad;
561  	}
562  
563  #ifdef ALTQ
564  	if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0) {
565  		/* packet is dropped by traffic conditioner */
566  		return;
567  	}
568  #endif
569  	/*
570  	 * Convert fields to host representation.
571  	 */
572  	ip_len = ntohs(ip->ip_len);
573  
574  	/* length checks already done in ip_hashfn() */
575  	KASSERT(ip_len >= hlen, ("total length incl header"));
576  	KASSERT(m->m_pkthdr.len >= ip_len, ("mbuf too short"));
577  
578  	/*
579  	 * Trim mbufs if longer than the IP header would have us expect.
580  	 */
581  	if (m->m_pkthdr.len > ip_len) {
582  		if (m->m_len == m->m_pkthdr.len) {
583  			m->m_len = ip_len;
584  			m->m_pkthdr.len = ip_len;
585  		} else {
586  			m_adj(m, ip_len - m->m_pkthdr.len);
587  		}
588  	}
589  
590  	/*
591  	 * IpHack's section.
592  	 * Right now when no processing on packet has done
593  	 * and it is still fresh out of network we do our black
594  	 * deals with it.
595  	 * - Firewall: deny/allow/divert
596  	 * - Xlate: translate packet's addr/port (NAT).
597  	 * - Pipe: pass pkt through dummynet.
598  	 * - Wrap: fake packet's addr/port <unimpl.>
599  	 * - Encapsulate: put it in another IP and send out. <unimp.>
600  	 */
601  
602  iphack:
603  	/*
604  	 * If we've been forwarded from the output side, then
605  	 * skip the firewall a second time
606  	 */
607  	if (next_hop != NULL)
608  		goto ours;
609  
610  	/* No pfil hooks */
611  	if (!pfil_has_hooks(&inet_pfil_hook)) {
612  		if (m->m_pkthdr.fw_flags & DUMMYNET_MBUF_TAGGED) {
613  			/*
614  			 * Strip dummynet tags from stranded packets
615  			 */
616  			mtag = m_tag_find(m, PACKET_TAG_DUMMYNET, NULL);
617  			KKASSERT(mtag != NULL);
618  			m_tag_delete(m, mtag);
619  			m->m_pkthdr.fw_flags &= ~DUMMYNET_MBUF_TAGGED;
620  		}
621  		goto pass;
622  	}
623  
624  	/*
625  	 * Run through list of hooks for input packets.
626  	 *
627  	 * NOTE!  If the packet is rewritten pf/ipfw/whoever must
628  	 *	  clear M_HASH.
629  	 */
630  	odst = ip->ip_dst;
631  	if (pfil_run_hooks(&inet_pfil_hook, &m, m->m_pkthdr.rcvif, PFIL_IN))
632  		return;
633  	if (m == NULL)	/* consumed by filter */
634  		return;
635  	ip = mtod(m, struct ip *);
636  	hlen = IP_VHL_HL(ip->ip_vhl) << 2;
637  	using_srcrt = (odst.s_addr != ip->ip_dst.s_addr);
638  
639  	if (m->m_pkthdr.fw_flags & IPFORWARD_MBUF_TAGGED) {
640  		mtag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
641  		KKASSERT(mtag != NULL);
642  		next_hop = m_tag_data(mtag);
643  	}
644  	if (m->m_pkthdr.fw_flags & DUMMYNET_MBUF_TAGGED) {
645  		ip_dn_queue(m);
646  		return;
647  	}
648  	if (m->m_pkthdr.fw_flags & FW_MBUF_REDISPATCH)
649  		m->m_pkthdr.fw_flags &= ~FW_MBUF_REDISPATCH;
650  	if (m->m_pkthdr.fw_flags & IPFW_MBUF_CONTINUE) {
651  		/* ipfw was disabled/unloaded. */
652  		goto bad;
653  	}
654  pass:
655  	/*
656  	 * Process options and, if not destined for us,
657  	 * ship it on.  ip_dooptions returns 1 when an
658  	 * error was detected (causing an icmp message
659  	 * to be sent and the original packet to be freed).
660  	 */
661  	if (hlen > sizeof(struct ip) && ip_dooptions(m, 0, next_hop))
662  		return;
663  
664  	/* greedy RSVP, snatches any PATH packet of the RSVP protocol and no
665  	 * matter if it is destined to another node, or whether it is
666  	 * a multicast one, RSVP wants it! and prevents it from being forwarded
667  	 * anywhere else. Also checks if the rsvp daemon is running before
668  	 * grabbing the packet.
669  	 */
670  	if (rsvp_on && ip->ip_p == IPPROTO_RSVP)
671  		goto ours;
672  
673  	/*
674  	 * Check our list of addresses, to see if the packet is for us.
675  	 * If we don't have any addresses, assume any unicast packet
676  	 * we receive might be for us (and let the upper layers deal
677  	 * with it).
678  	 */
679  	if (TAILQ_EMPTY(&in_ifaddrheads[mycpuid]) &&
680  	    !(m->m_flags & (M_MCAST | M_BCAST)))
681  	{
682  		goto ours;
683  	}
684  
685  	/*
686  	 * Cache the destination address of the packet; this may be
687  	 * changed by use of 'ipfw fwd'.
688  	 */
689  	pkt_dst = next_hop ? next_hop->sin_addr : ip->ip_dst;
690  
691  	/*
692  	 * Enable a consistency check between the destination address
693  	 * and the arrival interface for a unicast packet (the RFC 1122
694  	 * strong ES model) if IP forwarding is disabled and the packet
695  	 * is not locally generated and the packet is not subject to
696  	 * 'ipfw fwd'.
697  	 *
698  	 * XXX - Checking also should be disabled if the destination
699  	 * address is ipnat'ed to a different interface.
700  	 *
701  	 * XXX - Checking is incompatible with IP aliases added
702  	 * to the loopback interface instead of the interface where
703  	 * the packets are received.
704  	 */
705  	checkif = ip_checkinterface &&
706  		  !ipforwarding &&
707  		  m->m_pkthdr.rcvif != NULL &&
708  		  !(m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) &&
709  		  next_hop == NULL;
710  
711  	/*
712  	 * Check for exact addresses in the hash bucket.
713  	 */
714  	LIST_FOREACH(iac, INADDR_HASH(pkt_dst.s_addr), ia_hash) {
715  		ia = iac->ia;
716  
717  		/*
718  		 * If the address matches, verify that the packet
719  		 * arrived via the correct interface if checking is
720  		 * enabled.
721  		 */
722  		if (IA_SIN(ia)->sin_addr.s_addr == pkt_dst.s_addr &&
723  		    (!checkif || ia->ia_ifp == m->m_pkthdr.rcvif))
724  		{
725  			goto ours;
726  		}
727  	}
728  	ia = NULL;
729  
730  	/*
731  	 * Check for broadcast addresses.
732  	 *
733  	 * Only accept broadcast packets that arrive via the matching
734  	 * interface.  Reception of forwarded directed broadcasts would
735  	 * be handled via ip_forward() and ether_output() with the loopback
736  	 * into the stack for SIMPLEX interfaces handled by ether_output().
737  	 */
738  	if (m->m_pkthdr.rcvif != NULL &&
739  	    m->m_pkthdr.rcvif->if_flags & IFF_BROADCAST) {
740  		struct ifaddr_container *ifac;
741  
742  		TAILQ_FOREACH(ifac, &m->m_pkthdr.rcvif->if_addrheads[mycpuid],
743  			      ifa_link) {
744  			struct ifaddr *ifa = ifac->ifa;
745  
746  			if (ifa->ifa_addr == NULL) /* shutdown/startup race */
747  				continue;
748  			if (ifa->ifa_addr->sa_family != AF_INET)
749  				continue;
750  			ia = ifatoia(ifa);
751  			if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr ==
752  								pkt_dst.s_addr)
753  				goto ours;
754  			if (ia->ia_netbroadcast.s_addr == pkt_dst.s_addr)
755  				goto ours;
756  #ifdef BOOTP_COMPAT
757  			if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY)
758  				goto ours;
759  #endif
760  		}
761  	}
762  	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
763  		struct in_multi *inm;
764  
765  		if (ip_mrouter != NULL) {
766  			/* XXX Multicast routing is not MPSAFE yet */
767  			get_mplock();
768  
769  			/*
770  			 * If we are acting as a multicast router, all
771  			 * incoming multicast packets are passed to the
772  			 * kernel-level multicast forwarding function.
773  			 * The packet is returned (relatively) intact; if
774  			 * ip_mforward() returns a non-zero value, the packet
775  			 * must be discarded, else it may be accepted below.
776  			 */
777  			if (ip_mforward != NULL &&
778  			    ip_mforward(ip, m->m_pkthdr.rcvif, m, NULL) != 0) {
779  				rel_mplock();
780  				ipstat.ips_cantforward++;
781  				m_freem(m);
782  				return;
783  			}
784  
785  			rel_mplock();
786  
787  			/*
788  			 * The process-level routing daemon needs to receive
789  			 * all multicast IGMP packets, whether or not this
790  			 * host belongs to their destination groups.
791  			 */
792  			if (ip->ip_p == IPPROTO_IGMP)
793  				goto ours;
794  			ipstat.ips_forward++;
795  		}
796  		/*
797  		 * See if we belong to the destination multicast group on the
798  		 * arrival interface.
799  		 */
800  		inm = IN_LOOKUP_MULTI(&ip->ip_dst, m->m_pkthdr.rcvif);
801  		if (inm == NULL) {
802  			ipstat.ips_notmember++;
803  			m_freem(m);
804  			return;
805  		}
806  		goto ours;
807  	}
808  	if (ip->ip_dst.s_addr == INADDR_BROADCAST)
809  		goto ours;
810  	if (ip->ip_dst.s_addr == INADDR_ANY)
811  		goto ours;
812  
813  	/*
814  	 * Not for us; forward if possible and desirable.
815  	 */
816  	if (!ipforwarding) {
817  		ipstat.ips_cantforward++;
818  		m_freem(m);
819  	} else {
820  		ip_forward(m, using_srcrt, next_hop);
821  	}
822  	return;
823  
824  ours:
825  
826  	/*
827  	 * IPSTEALTH: Process non-routing options only
828  	 * if the packet is destined for us.
829  	 */
830  	if (ipstealth &&
831  	    hlen > sizeof(struct ip) &&
832  	    ip_dooptions(m, 1, next_hop))
833  	{
834  		return;
835  	}
836  
837  	/* Count the packet in the ip address stats */
838  	if (ia != NULL) {
839  		IFA_STAT_INC(&ia->ia_ifa, ipackets, 1);
840  		IFA_STAT_INC(&ia->ia_ifa, ibytes, m->m_pkthdr.len);
841  	}
842  
843  	/*
844  	 * If offset or IP_MF are set, must reassemble.
845  	 * Otherwise, nothing need be done.
846  	 * (We could look in the reassembly queue to see
847  	 * if the packet was previously fragmented,
848  	 * but it's not worth the time; just let them time out.)
849  	 */
850  	if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) {
851  		/*
852  		 * Attempt reassembly; if it succeeds, proceed.  ip_reass()
853  		 * will return a different mbuf.
854  		 *
855  		 * NOTE: ip_reass() returns m with M_HASH cleared to force
856  		 *	 us to recharacterize the packet.
857  		 */
858  		m = ip_reass(m);
859  		if (m == NULL)
860  			return;
861  		ip = mtod(m, struct ip *);
862  
863  		/* Get the header length of the reassembled packet */
864  		hlen = IP_VHL_HL(ip->ip_vhl) << 2;
865  	}
866  
867  	/*
868  	 * We must forward the packet to the correct protocol thread if
869  	 * we are not already in it.
870  	 *
871  	 * NOTE: ip_len is left in network form.  ip_len is not adjusted
872  	 *	 further for protocol processing, instead we pass hlen
873  	 *	 to the protosw and let it deal with it.
874  	 */
875  	ipstat.ips_delivered++;
876  
877  	if ((m->m_flags & M_HASH) == 0) {
878  		m = ip_rehashm(m);
879  		if (m == NULL)
880  			return;
881  		ip = mtod(m, struct ip *);
882  	}
883  	port = netisr_hashport(m->m_pkthdr.hash);
884  
885  	if (port != &curthread->td_msgport) {
886  		ip_transport_redispatch(port, m, hlen);
887  	} else {
888  #ifdef RSS_DEBUG
889  		atomic_add_long(&ip_dispatch_fast, 1);
890  #endif
891  		transport_processing_oncpu(m, hlen, ip);
892  	}
893  	return;
894  
895  bad:
896  	m_freem(m);
897  }
898  
899  struct mbuf *
900  ip_rehashm(struct mbuf *m)
901  {
902  	struct ip *ip = mtod(m, struct ip *);
903  
904  #ifdef RSS_DEBUG
905  	atomic_add_long(&ip_rehash_count, 1);
906  #endif
907  	ip_hashfn(&m, 0);
908  	if (m == NULL)
909  		return NULL;
910  
911  	/* 'm' might be changed by ip_hashfn(). */
912  	ip = mtod(m, struct ip *);
913  	KASSERT(m->m_flags & M_HASH, ("no hash"));
914  
915  	return (m);
916  }
917  
918  void
919  ip_transport_redispatch(struct lwkt_port *port, struct mbuf *m, int hlen)
920  {
921  	struct netmsg_packet *pmsg;
922  
923  #ifdef RSS_DEBUG
924  	atomic_add_long(&ip_dispatch_slow, 1);
925  #endif
926  
927  	pmsg = &m->m_hdr.mh_netmsg;
928  	netmsg_init(&pmsg->base, NULL, &netisr_apanic_rport,
929  		    0, transport_processing_handler);
930  	pmsg->nm_packet = m;
931  	pmsg->base.lmsg.u.ms_result = hlen;
932  	lwkt_sendmsg(port, &pmsg->base.lmsg);
933  }
934  
935  static void
936  ip_forward_handler(netmsg_t msg)
937  {
938  	struct netmsg_forward *fmsg;
939  	struct mbuf *m;
940  	struct m_tag *mtag;
941  	struct sockaddr_in *next_hop = NULL;
942  
943  	fmsg = &msg->forward;
944  	m = fmsg->nm_packet;
945  
946  	/* Re-extract the next hop if it exists */
947  	if (m->m_pkthdr.fw_flags & IPFORWARD_MBUF_TAGGED) {
948  		/* Next hop */
949  		mtag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
950  		KKASSERT(mtag != NULL);
951  		next_hop = m_tag_data(mtag);
952  	}
953  
954  	ip_forward(m, fmsg->using_srcrt, next_hop);
955  	/* msg was embedded in the mbuf, do not reply! */
956  }
957  
958  static void
959  ip_forward_redispatch(struct lwkt_port *port, struct mbuf *m, boolean_t srcrt)
960  {
961  	struct netmsg_forward *fmsg;
962  
963  	fmsg = &m->m_hdr.mh_fwdmsg;
964  	netmsg_init(&fmsg->base, NULL, &netisr_apanic_rport,
965  		    0, ip_forward_handler);
966  	fmsg->nm_packet = m;
967  	fmsg->using_srcrt = srcrt;
968  	lwkt_sendmsg(port, &fmsg->base.lmsg);
969  }
970  
971  /*
972   * Take incoming datagram fragment and try to reassemble it into
973   * whole datagram.  If a chain for reassembly of this datagram already
974   * exists, then it is given as fp; otherwise have to make a chain.
975   */
976  struct mbuf *
977  ip_reass(struct mbuf *m)
978  {
979  	struct ipfrag_queue *fragq = &ipfrag_queue_pcpu[mycpuid];
980  	struct ip *ip = mtod(m, struct ip *);
981  	struct mbuf *p = NULL, *q, *nq;
982  	struct mbuf *n;
983  	struct ipq *fp = NULL;
984  	struct ipqhead *head;
985  	int hlen = IP_VHL_HL(ip->ip_vhl) << 2;
986  	int i, next;
987  	u_short sum;
988  	uint16_t ip_off;
989  	uint16_t ip_len;
990  
991  	/* If maxnipq or maxfragsperpacket are 0, never accept fragments. */
992  	if (maxnipq == 0 || maxfragsperpacket == 0) {
993  		ipstat.ips_fragments++;
994  		ipstat.ips_fragdropped++;
995  		m_freem(m);
996  		return NULL;
997  	}
998  
999  	sum = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id);
1000  	/*
1001  	 * Look for queue of fragments of this datagram.
1002  	 */
1003  	head = &fragq->ipq[sum];
1004  	TAILQ_FOREACH(fp, head, ipq_list) {
1005  		if (ip->ip_id == fp->ipq_id &&
1006  		    ip->ip_src.s_addr == fp->ipq_src.s_addr &&
1007  		    ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
1008  		    ip->ip_p == fp->ipq_p)
1009  		{
1010  			goto found;
1011  		}
1012  	}
1013  
1014  	fp = NULL;
1015  
1016  	/*
1017  	 * Enforce upper bound on number of fragmented packets
1018  	 * for which we attempt reassembly;
1019  	 * If maxnipq is -1, accept all fragments without limitation.
1020  	 */
1021  	if (fragq->nipq > maxnipq && maxnipq > 0) {
1022  		/*
1023  		 * drop something from the tail of the current queue
1024  		 * before proceeding further
1025  		 */
1026  		struct ipq *q = TAILQ_LAST(head, ipqhead);
1027  		if (q == NULL) {
1028  			/*
1029  			 * The current queue is empty,
1030  			 * so drop from one of the others.
1031  			 */
1032  			for (i = 0; i < IPREASS_NHASH; i++) {
1033  				struct ipq *r;
1034  
1035  				r = TAILQ_LAST(&fragq->ipq[i], ipqhead);
1036  				if (r) {
1037  					ipstat.ips_fragtimeout += r->ipq_nfrags;
1038  					ip_freef(fragq, &fragq->ipq[i], r);
1039  					break;
1040  				}
1041  			}
1042  		} else {
1043  			ipstat.ips_fragtimeout += q->ipq_nfrags;
1044  			ip_freef(fragq, head, q);
1045  		}
1046  	}
1047  found:
1048  	/*
1049  	 * NOTE: ip_len is no longer adjusted to remove the header length.
1050  	 */
1051  	if (ip->ip_off & htons(IP_MF)) {
1052  		/*
1053  		 * Make sure that fragments have a data length
1054  		 * that's a non-zero multiple of 8 bytes.  The
1055  		 * IP header itself might be in multiples of 4
1056  		 * bytes and is discounted.
1057  		 */
1058  		ip_len = ntohs(ip->ip_len) - hlen;
1059  		if (ip_len == 0 || (ip_len & 7) != 0) {
1060  			ipstat.ips_toosmall++; /* XXX */
1061  			m_freem(m);
1062  			goto done;
1063  		}
1064  		m->m_flags |= M_FRAG;
1065  	} else {
1066  		m->m_flags &= ~M_FRAG;
1067  	}
1068  
1069  	ipstat.ips_fragments++;
1070  	m->m_pkthdr.header = ip;
1071  
1072  	/*
1073  	 * If the hardware has not done csum over this fragment
1074  	 * then csum_data is not valid at all.
1075  	 */
1076  	if ((m->m_pkthdr.csum_flags & (CSUM_FRAG_NOT_CHECKED | CSUM_DATA_VALID))
1077  	    == (CSUM_FRAG_NOT_CHECKED | CSUM_DATA_VALID))
1078  	{
1079  		m->m_pkthdr.csum_data = 0;
1080  		m->m_pkthdr.csum_flags &= ~(CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1081  	}
1082  
1083  	/*
1084  	 * Presence of header sizes in mbufs would confuse code below.
1085  	 * Note that ip->ip_len is not modified and retains the header length,
1086  	 * but local ip_len and fp_len variables remove the header length.
1087  	 */
1088  	m->m_data += hlen;
1089  	m->m_len -= hlen;
1090  
1091  	/*
1092  	 * If first fragment to arrive, create a reassembly queue.
1093  	 */
1094  	if (fp == NULL) {
1095  		if ((fp = mpipe_alloc_nowait(&ipq_mpipe)) == NULL)
1096  			goto dropfrag;
1097  		TAILQ_INSERT_HEAD(head, fp, ipq_list);
1098  		fragq->nipq++;
1099  		fp->ipq_nfrags = 1;
1100  		fp->ipq_ttl = IPFRAGTTL;
1101  		fp->ipq_p = ip->ip_p;
1102  		fp->ipq_id = ip->ip_id;
1103  		fp->ipq_src = ip->ip_src;
1104  		fp->ipq_dst = ip->ip_dst;
1105  		fp->ipq_frags = m;
1106  		m->m_nextpkt = NULL;
1107  		goto inserted;
1108  	}
1109  	fp->ipq_nfrags++;
1110  
1111  #define	GETIP(m)	((struct ip*)((m)->m_pkthdr.header))
1112  
1113  	/*
1114  	 * Find a segment which begins after this one does.  We
1115  	 * don't have to fully convert the offset field for this
1116  	 * test.
1117  	 */
1118  	for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
1119  		if ((ntohs(GETIP(q)->ip_off) & IP_OFFMASK) >
1120  		    (ntohs(ip->ip_off) & IP_OFFMASK))
1121  		{
1122  			break;
1123  		}
1124  	}
1125  
1126  	/*
1127  	 * Drop fragment if it overflows the maximum allowed IP
1128  	 * packet size.
1129  	 */
1130  	ip_off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3;
1131  	ip_len = ntohs(ip->ip_len);
1132  
1133  	if (ip_off + ip_len > 65535U)
1134  		goto dropfrag;
1135  
1136  	ip_len -= hlen;
1137  
1138  	/*
1139  	 * If there is a preceding segment, it may provide some of
1140  	 * our data already.  If so, drop the data from the incoming
1141  	 * segment.  If it provides all of our data, drop us, otherwise
1142  	 * stick new segment in the proper place.
1143  	 *
1144  	 * If some of the data is dropped from the the preceding
1145  	 * segment, then it's checksum is invalidated.
1146  	 */
1147  
1148  	if (p) {
1149  		uint16_t fp_off;
1150  		uint16_t fp_len;
1151  
1152  		/*
1153  		 * Calculations in bytes and ip_len/fp_len do not reflect
1154  		 * the header size.
1155  		 */
1156  		fp_off = (ntohs(GETIP(p)->ip_off) & IP_OFFMASK) << 3;
1157  		fp_len = ntohs(GETIP(p)->ip_len) -
1158  			 (IP_VHL_HL(GETIP(p)->ip_vhl) << 2);
1159  
1160  		if (fp_off + fp_len > ip_off) {
1161  			i = fp_off + fp_len - ip_off;
1162  			if (i >= ip_len)
1163  				goto dropfrag;
1164  			m_adj(m, i);
1165  			m->m_pkthdr.csum_flags = 0;
1166  			ip_off = fp_off + fp_len;
1167  			ip_len -= i;
1168  
1169  			/*
1170  			 * Non-optimal modification of packet content, but
1171  			 * in this rare case we don't care.
1172  			 */
1173  			ip->ip_off = htons(ip_off >> 3);
1174  			ip->ip_len = htons(ip_len + hlen);
1175  		}
1176  		m->m_nextpkt = p->m_nextpkt;
1177  		p->m_nextpkt = m;
1178  	} else {
1179  		m->m_nextpkt = fp->ipq_frags;
1180  		fp->ipq_frags = m;
1181  	}
1182  
1183  	/*
1184  	 * Dequeue any later segments that we completely overlap.
1185  	 * While we overlap succeeding segments trim them or,
1186  	 * if they are completely covered, dequeue them.
1187  	 */
1188  	while (q) {
1189  		uint16_t fp_off;
1190  		uint16_t fp_len;
1191  		uint16_t fp_hlen;
1192  
1193  		fp_off = (ntohs(GETIP(q)->ip_off) & IP_OFFMASK) << 3;
1194  		fp_hlen = (IP_VHL_HL(GETIP(q)->ip_vhl) << 2);
1195  		fp_len = ntohs(GETIP(q)->ip_len) - fp_hlen;
1196  		if (ip_off + ip_len <= fp_off)
1197  			break;
1198  		i = ip_off + ip_len - fp_off;	/* bytes overlapped */
1199  
1200  		if (i < fp_len) {
1201  			/*
1202  			 * Non-optimal modification of packet content, but
1203  			 * in this rare case we don't care.
1204  			 */
1205  			GETIP(q)->ip_len = htons(fp_len - i + fp_hlen);
1206  			GETIP(q)->ip_off = htons((fp_off + i) >> 3);
1207  			m_adj(q, i);
1208  			q->m_pkthdr.csum_flags = 0;
1209  			break;
1210  		}
1211  		nq = q->m_nextpkt;
1212  		m->m_nextpkt = nq;
1213  		ipstat.ips_fragdropped++;
1214  		fp->ipq_nfrags--;
1215  		q->m_nextpkt = NULL;
1216  		m_freem(q);
1217  
1218  		q = nq;
1219  	}
1220  
1221  inserted:
1222  	/*
1223  	 * Check for complete reassembly and perform frag per packet
1224  	 * limiting.
1225  	 *
1226  	 * Frag limiting is performed here so that the nth frag has
1227  	 * a chance to complete the packet before we drop the packet.
1228  	 * As a result, n+1 frags are actually allowed per packet, but
1229  	 * only n will ever be stored. (n = maxfragsperpacket.)
1230  	 *
1231  	 */
1232  	next = 0;
1233  	for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
1234  		uint16_t fp_off;
1235  		uint16_t fp_len;
1236  		uint16_t fp_hlen;
1237  
1238  		fp_off = (ntohs(GETIP(q)->ip_off) & IP_OFFMASK) << 3;
1239  		fp_hlen = (IP_VHL_HL(GETIP(q)->ip_vhl) << 2);
1240  		fp_len = ntohs(GETIP(q)->ip_len) - fp_hlen;
1241  		if (fp_off != next) {
1242  			if (fp->ipq_nfrags > maxfragsperpacket) {
1243  				ipstat.ips_fragdropped += fp->ipq_nfrags;
1244  				ip_freef(fragq, head, fp);
1245  			}
1246  			goto done;
1247  		}
1248  		next += fp_len;
1249  	}
1250  	/* Make sure the last packet didn't have the IP_MF flag */
1251  	if (p->m_flags & M_FRAG) {
1252  		if (fp->ipq_nfrags > maxfragsperpacket) {
1253  			ipstat.ips_fragdropped += fp->ipq_nfrags;
1254  			ip_freef(fragq, head, fp);
1255  		}
1256  		goto done;
1257  	}
1258  
1259  	/*
1260  	 * Reassembly is complete.  Make sure the packet is a sane size.
1261  	 */
1262  	q = fp->ipq_frags;
1263  	ip = GETIP(q);
1264  	hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1265  	if (next + (IP_VHL_HL(ip->ip_vhl) << 2) > IP_MAXPACKET) {
1266  		ipstat.ips_toolong++;
1267  		ipstat.ips_fragdropped += fp->ipq_nfrags;
1268  		ip_freef(fragq, head, fp);
1269  		goto done;
1270  	}
1271  
1272  	/*
1273  	 * Concatenate fragments.
1274  	 */
1275  	m = q;
1276  	n = m->m_next;
1277  	m->m_next = NULL;
1278  	m_cat(m, n);
1279  	nq = q->m_nextpkt;
1280  	q->m_nextpkt = NULL;
1281  	for (q = nq; q != NULL; q = nq) {
1282  		nq = q->m_nextpkt;
1283  		q->m_nextpkt = NULL;
1284  		m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags;
1285  		m->m_pkthdr.csum_data += q->m_pkthdr.csum_data;
1286  		m_cat(m, q);
1287  	}
1288  
1289  	/*
1290  	 * Clean up the 1's complement checksum.  Carry over 16 bits must
1291  	 * be added back.  This assumes no more then 65535 packet fragments
1292  	 * were reassembled.  A second carry can also occur (but not a third).
1293  	 */
1294  	m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) +
1295  				(m->m_pkthdr.csum_data >> 16);
1296  	if (m->m_pkthdr.csum_data > 0xFFFF)
1297  		m->m_pkthdr.csum_data -= 0xFFFF;
1298  
1299  	/*
1300  	 * Create header for new ip packet by modifying the header of the
1301  	 * first packet.  Dequeue and discard the fragment reassembly header.
1302  	 * Make the header visible.  Set the offset to 0 and keep only the
1303  	 * DF flag from the first packet's ip_off field.
1304  	 *
1305  	 * Note that ip_len includes the header length.
1306  	 */
1307  	ip->ip_len = htons(next + hlen);
1308  	ip->ip_src = fp->ipq_src;
1309  	ip->ip_dst = fp->ipq_dst;
1310  	ip->ip_off &= htons(IP_DF);
1311  	TAILQ_REMOVE(head, fp, ipq_list);
1312  	fragq->nipq--;
1313  	mpipe_free(&ipq_mpipe, fp);
1314  
1315  	m->m_len += (IP_VHL_HL(ip->ip_vhl) << 2);
1316  	m->m_data -= (IP_VHL_HL(ip->ip_vhl) << 2);
1317  	/* some debugging cruft by sklower, below, will go away soon */
1318  	if (m->m_flags & M_PKTHDR) { /* XXX this should be done elsewhere */
1319  		int plen = 0;
1320  
1321  		for (n = m; n; n = n->m_next)
1322  			plen += n->m_len;
1323  		m->m_pkthdr.len = plen;
1324  	}
1325  
1326  	/*
1327  	 * Reassembly complete, return the next protocol.
1328  	 *
1329  	 * Be sure to clear M_HASH to force the packet
1330  	 * to be re-characterized.
1331  	 *
1332  	 * Clear M_FRAG, we are no longer a fragment.
1333  	 */
1334  	m->m_flags &= ~(M_HASH | M_FRAG);
1335  
1336  	ipstat.ips_reassembled++;
1337  	return (m);
1338  
1339  dropfrag:
1340  	ipstat.ips_fragdropped++;
1341  	if (fp != NULL)
1342  		fp->ipq_nfrags--;
1343  	m_freem(m);
1344  done:
1345  	return (NULL);
1346  
1347  #undef GETIP
1348  }
1349  
1350  /*
1351   * Free a fragment reassembly header and all
1352   * associated datagrams.
1353   */
1354  static void
1355  ip_freef(struct ipfrag_queue *fragq, struct ipqhead *fhp, struct ipq *fp)
1356  {
1357  	struct mbuf *q;
1358  
1359  	/*
1360  	 * Remove first to protect against blocking
1361  	 */
1362  	TAILQ_REMOVE(fhp, fp, ipq_list);
1363  
1364  	/*
1365  	 * Clean out at our leisure
1366  	 */
1367  	while (fp->ipq_frags) {
1368  		q = fp->ipq_frags;
1369  		fp->ipq_frags = q->m_nextpkt;
1370  		q->m_nextpkt = NULL;
1371  		m_freem(q);
1372  	}
1373  	mpipe_free(&ipq_mpipe, fp);
1374  	fragq->nipq--;
1375  }
1376  
1377  /*
1378   * If a timer expires on a reassembly queue, discard it.
1379   */
1380  static void
1381  ipfrag_timeo_dispatch(netmsg_t nmsg)
1382  {
1383  	struct ipfrag_queue *fragq = &ipfrag_queue_pcpu[mycpuid];
1384  	struct ipq *fp, *fp_temp;
1385  	struct ipqhead *head;
1386  	int i;
1387  
1388  	crit_enter();
1389  	netisr_replymsg(&nmsg->base, 0);  /* reply ASAP */
1390  	crit_exit();
1391  
1392  	if (fragq->nipq == 0)
1393  		goto done;
1394  
1395  	for (i = 0; i < IPREASS_NHASH; i++) {
1396  		head = &fragq->ipq[i];
1397  		TAILQ_FOREACH_MUTABLE(fp, head, ipq_list, fp_temp) {
1398  			if (--fp->ipq_ttl == 0) {
1399  				ipstat.ips_fragtimeout += fp->ipq_nfrags;
1400  				ip_freef(fragq, head, fp);
1401  			}
1402  		}
1403  	}
1404  	/*
1405  	 * If we are over the maximum number of fragments
1406  	 * (due to the limit being lowered), drain off
1407  	 * enough to get down to the new limit.
1408  	 */
1409  	if (maxnipq >= 0 && fragq->nipq > maxnipq) {
1410  		for (i = 0; i < IPREASS_NHASH; i++) {
1411  			head = &fragq->ipq[i];
1412  			while (fragq->nipq > maxnipq && !TAILQ_EMPTY(head)) {
1413  				ipstat.ips_fragdropped +=
1414  				    TAILQ_FIRST(head)->ipq_nfrags;
1415  				ip_freef(fragq, head, TAILQ_FIRST(head));
1416  			}
1417  		}
1418  	}
1419  done:
1420  	callout_reset(&fragq->timeo_ch, IPFRAG_TIMEO, ipfrag_timeo, NULL);
1421  }
1422  
1423  static void
1424  ipfrag_timeo(void *dummy __unused)
1425  {
1426  	struct netmsg_base *msg = &ipfrag_queue_pcpu[mycpuid].timeo_netmsg;
1427  
1428  	crit_enter();
1429  	if (msg->lmsg.ms_flags & MSGF_DONE)
1430  		netisr_sendmsg_oncpu(msg);
1431  	crit_exit();
1432  }
1433  
1434  /*
1435   * Drain off all datagram fragments.
1436   */
1437  static void
1438  ipfrag_drain_oncpu(struct ipfrag_queue *fragq)
1439  {
1440  	struct ipqhead *head;
1441  	int i;
1442  
1443  	for (i = 0; i < IPREASS_NHASH; i++) {
1444  		head = &fragq->ipq[i];
1445  		while (!TAILQ_EMPTY(head)) {
1446  			ipstat.ips_fragdropped += TAILQ_FIRST(head)->ipq_nfrags;
1447  			ip_freef(fragq, head, TAILQ_FIRST(head));
1448  		}
1449  	}
1450  }
1451  
1452  static void
1453  ipfrag_drain_dispatch(netmsg_t nmsg)
1454  {
1455  	struct ipfrag_queue *fragq = &ipfrag_queue_pcpu[mycpuid];
1456  
1457  	crit_enter();
1458  	lwkt_replymsg(&nmsg->lmsg, 0);  /* reply ASAP */
1459  	crit_exit();
1460  
1461  	ipfrag_drain_oncpu(fragq);
1462  	fragq->draining = 0;
1463  }
1464  
1465  static void
1466  ipfrag_drain_ipi(void *arg __unused)
1467  {
1468  	int cpu = mycpuid;
1469  	struct lwkt_msg *msg = &ipfrag_queue_pcpu[cpu].drain_netmsg.lmsg;
1470  
1471  	crit_enter();
1472  	if (msg->ms_flags & MSGF_DONE)
1473  		lwkt_sendmsg_oncpu(netisr_cpuport(cpu), msg);
1474  	crit_exit();
1475  }
1476  
1477  static void
1478  ipfrag_drain(void)
1479  {
1480  	cpumask_t mask;
1481  	int cpu;
1482  
1483  	CPUMASK_ASSBMASK(mask, netisr_ncpus);
1484  	CPUMASK_ANDMASK(mask, smp_active_mask);
1485  
1486  	if (IN_NETISR_NCPUS(mycpuid)) {
1487  		ipfrag_drain_oncpu(&ipfrag_queue_pcpu[mycpuid]);
1488  		CPUMASK_NANDBIT(mask, mycpuid);
1489  	}
1490  
1491  	for (cpu = 0; cpu < netisr_ncpus; ++cpu) {
1492  		struct ipfrag_queue *fragq = &ipfrag_queue_pcpu[cpu];
1493  
1494  		if (!CPUMASK_TESTBIT(mask, cpu))
1495  			continue;
1496  
1497  		if (fragq->nipq == 0 || fragq->draining) {
1498  			/* No fragments or is draining; skip this cpu. */
1499  			CPUMASK_NANDBIT(mask, cpu);
1500  			continue;
1501  		}
1502  		fragq->draining = 1;
1503  	}
1504  
1505  	if (CPUMASK_TESTNZERO(mask))
1506  		lwkt_send_ipiq_mask(mask, ipfrag_drain_ipi, NULL);
1507  }
1508  
/*
 * Drain entry point for IP: drain the fragment reassembly queues
 * first, then call in_rtqdrain().
 */
void
ip_drain(void)
{
	ipfrag_drain();		/* fragment reassembly queues */
	in_rtqdrain();
}
1515  
1516  /*
1517   * Do option processing on a datagram,
1518   * possibly discarding it if bad options are encountered,
1519   * or forwarding it if source-routed.
1520   * The pass argument is used when operating in the IPSTEALTH
1521   * mode to tell what options to process:
1522   * [LS]SRR (pass 0) or the others (pass 1).
1523   * The reason for as many as two passes is that when doing IPSTEALTH,
1524   * non-routing options should be processed only if the packet is for us.
1525   * Returns 1 if packet has been forwarded/freed,
1526   * 0 if the packet should be processed further.
1527   */
1528  static int
1529  ip_dooptions(struct mbuf *m, int pass, struct sockaddr_in *next_hop)
1530  {
1531  	struct sockaddr_in ipaddr = { sizeof ipaddr, AF_INET };
1532  	struct ip *ip = mtod(m, struct ip *);
1533  	u_char *cp;
1534  	struct in_ifaddr *ia;
1535  	int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB;
1536  	boolean_t forward = FALSE;
1537  	struct in_addr *sin, dst;
1538  	n_time ntime;
1539  
1540  	dst = ip->ip_dst;
1541  	cp = (u_char *)(ip + 1);
1542  	cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof(struct ip);
1543  	for (; cnt > 0; cnt -= optlen, cp += optlen) {
1544  		opt = cp[IPOPT_OPTVAL];
1545  		if (opt == IPOPT_EOL)
1546  			break;
1547  		if (opt == IPOPT_NOP)
1548  			optlen = 1;
1549  		else {
1550  			if (cnt < IPOPT_OLEN + sizeof(*cp)) {
1551  				code = &cp[IPOPT_OLEN] - (u_char *)ip;
1552  				goto bad;
1553  			}
1554  			optlen = cp[IPOPT_OLEN];
1555  			if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) {
1556  				code = &cp[IPOPT_OLEN] - (u_char *)ip;
1557  				goto bad;
1558  			}
1559  		}
1560  		switch (opt) {
1561  
1562  		default:
1563  			break;
1564  
1565  		/*
1566  		 * Source routing with record.
1567  		 * Find interface with current destination address.
1568  		 * If none on this machine then drop if strictly routed,
1569  		 * or do nothing if loosely routed.
1570  		 * Record interface address and bring up next address
1571  		 * component.  If strictly routed make sure next
1572  		 * address is on directly accessible net.
1573  		 */
1574  		case IPOPT_LSRR:
1575  		case IPOPT_SSRR:
1576  			if (ipstealth && pass > 0)
1577  				break;
1578  			if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
1579  				code = &cp[IPOPT_OLEN] - (u_char *)ip;
1580  				goto bad;
1581  			}
1582  			if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
1583  				code = &cp[IPOPT_OFFSET] - (u_char *)ip;
1584  				goto bad;
1585  			}
1586  			ipaddr.sin_addr = ip->ip_dst;
1587  			ia = (struct in_ifaddr *)
1588  				ifa_ifwithaddr((struct sockaddr *)&ipaddr);
1589  			if (ia == NULL) {
1590  				if (opt == IPOPT_SSRR) {
1591  					type = ICMP_UNREACH;
1592  					code = ICMP_UNREACH_SRCFAIL;
1593  					goto bad;
1594  				}
1595  				if (!ip_dosourceroute)
1596  					goto nosourcerouting;
1597  				/*
1598  				 * Loose routing, and not at next destination
1599  				 * yet; nothing to do except forward.
1600  				 */
1601  				break;
1602  			}
1603  			off--;			/* 0 origin */
1604  			if (off > optlen - (int)sizeof(struct in_addr)) {
1605  				/*
1606  				 * End of source route.  Should be for us.
1607  				 */
1608  				if (!ip_acceptsourceroute)
1609  					goto nosourcerouting;
1610  				save_rte(m, cp, ip->ip_src);
1611  				break;
1612  			}
1613  			if (ipstealth)
1614  				goto dropit;
1615  			if (!ip_dosourceroute) {
1616  				if (ipforwarding) {
1617  					char sbuf[INET_ADDRSTRLEN];
1618  					char dbuf[INET_ADDRSTRLEN];
1619  
1620  					/*
1621  					 * Acting as a router, so generate ICMP
1622  					 */
1623  nosourcerouting:
1624  					log(LOG_WARNING,
1625  					    "attempted source route from %s to %s\n",
1626  					    kinet_ntoa(ip->ip_src, sbuf),
1627  					    kinet_ntoa(ip->ip_dst, dbuf));
1628  					type = ICMP_UNREACH;
1629  					code = ICMP_UNREACH_SRCFAIL;
1630  					goto bad;
1631  				} else {
1632  					/*
1633  					 * Not acting as a router,
1634  					 * so silently drop.
1635  					 */
1636  dropit:
1637  					ipstat.ips_cantforward++;
1638  					m_freem(m);
1639  					return (1);
1640  				}
1641  			}
1642  
1643  			/*
1644  			 * locate outgoing interface
1645  			 */
1646  			memcpy(&ipaddr.sin_addr, cp + off,
1647  			    sizeof ipaddr.sin_addr);
1648  
1649  			if (opt == IPOPT_SSRR) {
1650  #define	INA	struct in_ifaddr *
1651  #define	SA	struct sockaddr *
1652  				if ((ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr))
1653  									== NULL)
1654  					ia = (INA)ifa_ifwithnet((SA)&ipaddr);
1655  			} else {
1656  				ia = ip_rtaddr(ipaddr.sin_addr, NULL);
1657  			}
1658  			if (ia == NULL) {
1659  				type = ICMP_UNREACH;
1660  				code = ICMP_UNREACH_SRCFAIL;
1661  				goto bad;
1662  			}
1663  			ip->ip_dst = ipaddr.sin_addr;
1664  			memcpy(cp + off, &IA_SIN(ia)->sin_addr,
1665  			    sizeof(struct in_addr));
1666  			cp[IPOPT_OFFSET] += sizeof(struct in_addr);
1667  			/*
1668  			 * Let ip_intr's mcast routing check handle mcast pkts
1669  			 */
1670  			forward = !IN_MULTICAST(ntohl(ip->ip_dst.s_addr));
1671  			break;
1672  
1673  		case IPOPT_RR:
1674  			if (ipstealth && pass == 0)
1675  				break;
1676  			if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
1677  				code = &cp[IPOPT_OFFSET] - (u_char *)ip;
1678  				goto bad;
1679  			}
1680  			if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
1681  				code = &cp[IPOPT_OFFSET] - (u_char *)ip;
1682  				goto bad;
1683  			}
1684  			/*
1685  			 * If no space remains, ignore.
1686  			 */
1687  			off--;			/* 0 origin */
1688  			if (off > optlen - (int)sizeof(struct in_addr))
1689  				break;
1690  			memcpy(&ipaddr.sin_addr, &ip->ip_dst,
1691  			    sizeof ipaddr.sin_addr);
1692  			/*
1693  			 * locate outgoing interface; if we're the destination,
1694  			 * use the incoming interface (should be same).
1695  			 */
1696  			if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) == NULL &&
1697  			    (ia = ip_rtaddr(ipaddr.sin_addr, NULL)) == NULL) {
1698  				type = ICMP_UNREACH;
1699  				code = ICMP_UNREACH_HOST;
1700  				goto bad;
1701  			}
1702  			memcpy(cp + off, &IA_SIN(ia)->sin_addr,
1703  			    sizeof(struct in_addr));
1704  			cp[IPOPT_OFFSET] += sizeof(struct in_addr);
1705  			break;
1706  
1707  		case IPOPT_TS:
1708  			if (ipstealth && pass == 0)
1709  				break;
1710  			code = cp - (u_char *)ip;
1711  			if (optlen < 4 || optlen > 40) {
1712  				code = &cp[IPOPT_OLEN] - (u_char *)ip;
1713  				goto bad;
1714  			}
1715  			if ((off = cp[IPOPT_OFFSET]) < 5) {
1716  				code = &cp[IPOPT_OLEN] - (u_char *)ip;
1717  				goto bad;
1718  			}
1719  			if (off > optlen - (int)sizeof(int32_t)) {
1720  				cp[IPOPT_OFFSET + 1] += (1 << 4);
1721  				if ((cp[IPOPT_OFFSET + 1] & 0xf0) == 0) {
1722  					code = &cp[IPOPT_OFFSET] - (u_char *)ip;
1723  					goto bad;
1724  				}
1725  				break;
1726  			}
1727  			off--;				/* 0 origin */
1728  			sin = (struct in_addr *)(cp + off);
1729  			switch (cp[IPOPT_OFFSET + 1] & 0x0f) {
1730  
1731  			case IPOPT_TS_TSONLY:
1732  				break;
1733  
1734  			case IPOPT_TS_TSANDADDR:
1735  				if (off + sizeof(n_time) +
1736  				    sizeof(struct in_addr) > optlen) {
1737  					code = &cp[IPOPT_OFFSET] - (u_char *)ip;
1738  					goto bad;
1739  				}
1740  				ipaddr.sin_addr = dst;
1741  				ia = (INA)ifaof_ifpforaddr((SA)&ipaddr,
1742  							    m->m_pkthdr.rcvif);
1743  				if (ia == NULL)
1744  					continue;
1745  				memcpy(sin, &IA_SIN(ia)->sin_addr,
1746  				    sizeof(struct in_addr));
1747  				cp[IPOPT_OFFSET] += sizeof(struct in_addr);
1748  				off += sizeof(struct in_addr);
1749  				break;
1750  
1751  			case IPOPT_TS_PRESPEC:
1752  				if (off + sizeof(n_time) +
1753  				    sizeof(struct in_addr) > optlen) {
1754  					code = &cp[IPOPT_OFFSET] - (u_char *)ip;
1755  					goto bad;
1756  				}
1757  				memcpy(&ipaddr.sin_addr, sin,
1758  				    sizeof(struct in_addr));
1759  				if (ifa_ifwithaddr((SA)&ipaddr) == NULL)
1760  					continue;
1761  				cp[IPOPT_OFFSET] += sizeof(struct in_addr);
1762  				off += sizeof(struct in_addr);
1763  				break;
1764  
1765  			default:
1766  				code = &cp[IPOPT_OFFSET + 1] - (u_char *)ip;
1767  				goto bad;
1768  			}
1769  			ntime = iptime();
1770  			memcpy(cp + off, &ntime, sizeof(n_time));
1771  			cp[IPOPT_OFFSET] += sizeof(n_time);
1772  		}
1773  	}
1774  	if (forward && ipforwarding) {
1775  		ip_forward(m, TRUE, next_hop);
1776  		return (1);
1777  	}
1778  	return (0);
1779  bad:
1780  	icmp_error(m, type, code, 0, 0);
1781  	ipstat.ips_badoptions++;
1782  	return (1);
1783  }
1784  
1785  /*
1786   * Given address of next destination (final or next hop),
1787   * return internet address info of interface to be used to get there.
1788   */
1789  struct in_ifaddr *
1790  ip_rtaddr(struct in_addr dst, struct route *ro0)
1791  {
1792  	struct route sro, *ro;
1793  	struct sockaddr_in *sin;
1794  	struct in_ifaddr *ia;
1795  
1796  	if (ro0 != NULL) {
1797  		ro = ro0;
1798  	} else {
1799  		bzero(&sro, sizeof(sro));
1800  		ro = &sro;
1801  	}
1802  
1803  	sin = (struct sockaddr_in *)&ro->ro_dst;
1804  
1805  	if (ro->ro_rt == NULL || dst.s_addr != sin->sin_addr.s_addr) {
1806  		if (ro->ro_rt != NULL) {
1807  			RTFREE(ro->ro_rt);
1808  			ro->ro_rt = NULL;
1809  		}
1810  		sin->sin_family = AF_INET;
1811  		sin->sin_len = sizeof *sin;
1812  		sin->sin_addr = dst;
1813  		rtalloc_ign(ro, RTF_PRCLONING);
1814  	}
1815  
1816  	if (ro->ro_rt == NULL)
1817  		return (NULL);
1818  
1819  	ia = ifatoia(ro->ro_rt->rt_ifa);
1820  
1821  	if (ro == &sro)
1822  		RTFREE(ro->ro_rt);
1823  	return ia;
1824  }
1825  
1826  /*
1827   * Save incoming source route for use in replies,
1828   * to be picked up later by ip_srcroute if the receiver is interested.
1829   */
1830  static void
1831  save_rte(struct mbuf *m, u_char *option, struct in_addr dst)
1832  {
1833  	struct m_tag *mtag;
1834  	struct ip_srcrt_opt *opt;
1835  	unsigned olen;
1836  
1837  	mtag = m_tag_get(PACKET_TAG_IPSRCRT, sizeof(*opt), M_NOWAIT);
1838  	if (mtag == NULL)
1839  		return;
1840  	opt = m_tag_data(mtag);
1841  
1842  	olen = option[IPOPT_OLEN];
1843  #ifdef DIAGNOSTIC
1844  	if (ipprintfs)
1845  		kprintf("save_rte: olen %d\n", olen);
1846  #endif
1847  	if (olen > sizeof(opt->ip_srcrt) - (1 + sizeof(dst))) {
1848  		m_tag_free(mtag);
1849  		return;
1850  	}
1851  	bcopy(option, opt->ip_srcrt.srcopt, olen);
1852  	opt->ip_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr);
1853  	opt->ip_srcrt.dst = dst;
1854  	m_tag_prepend(m, mtag);
1855  }
1856  
1857  /*
1858   * Retrieve incoming source route for use in replies,
1859   * in the same form used by setsockopt.
1860   * The first hop is placed before the options, will be removed later.
1861   */
1862  struct mbuf *
1863  ip_srcroute(struct mbuf *m0)
1864  {
1865  	struct in_addr *p, *q;
1866  	struct mbuf *m;
1867  	struct m_tag *mtag;
1868  	struct ip_srcrt_opt *opt;
1869  
1870  	if (m0 == NULL)
1871  		return NULL;
1872  
1873  	mtag = m_tag_find(m0, PACKET_TAG_IPSRCRT, NULL);
1874  	if (mtag == NULL)
1875  		return NULL;
1876  	opt = m_tag_data(mtag);
1877  
1878  	if (opt->ip_nhops == 0)
1879  		return (NULL);
1880  	m = m_get(M_NOWAIT, MT_HEADER);
1881  	if (m == NULL)
1882  		return (NULL);
1883  
1884  #define	OPTSIZ	(sizeof(opt->ip_srcrt.nop) + sizeof(opt->ip_srcrt.srcopt))
1885  
1886  	/* length is (nhops+1)*sizeof(addr) + sizeof(nop + srcrt header) */
1887  	m->m_len = opt->ip_nhops * sizeof(struct in_addr) +
1888  		   sizeof(struct in_addr) + OPTSIZ;
1889  #ifdef DIAGNOSTIC
1890  	if (ipprintfs) {
1891  		kprintf("ip_srcroute: nhops %d mlen %d",
1892  			opt->ip_nhops, m->m_len);
1893  	}
1894  #endif
1895  
1896  	/*
1897  	 * First save first hop for return route
1898  	 */
1899  	p = &opt->ip_srcrt.route[opt->ip_nhops - 1];
1900  	*(mtod(m, struct in_addr *)) = *p--;
1901  #ifdef DIAGNOSTIC
1902  	if (ipprintfs)
1903  		kprintf(" hops %x", ntohl(mtod(m, struct in_addr *)->s_addr));
1904  #endif
1905  
1906  	/*
1907  	 * Copy option fields and padding (nop) to mbuf.
1908  	 */
1909  	opt->ip_srcrt.nop = IPOPT_NOP;
1910  	opt->ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF;
1911  	memcpy(mtod(m, caddr_t) + sizeof(struct in_addr), &opt->ip_srcrt.nop,
1912  	    OPTSIZ);
1913  	q = (struct in_addr *)(mtod(m, caddr_t) +
1914  	    sizeof(struct in_addr) + OPTSIZ);
1915  #undef OPTSIZ
1916  	/*
1917  	 * Record return path as an IP source route,
1918  	 * reversing the path (pointers are now aligned).
1919  	 */
1920  	while (p >= opt->ip_srcrt.route) {
1921  #ifdef DIAGNOSTIC
1922  		if (ipprintfs)
1923  			kprintf(" %x", ntohl(q->s_addr));
1924  #endif
1925  		*q++ = *p--;
1926  	}
1927  	/*
1928  	 * Last hop goes to final destination.
1929  	 */
1930  	*q = opt->ip_srcrt.dst;
1931  	m_tag_delete(m0, mtag);
1932  #ifdef DIAGNOSTIC
1933  	if (ipprintfs)
1934  		kprintf(" %x\n", ntohl(q->s_addr));
1935  #endif
1936  	return (m);
1937  }
1938  
1939  /*
1940   * Strip out IP options.
1941   */
1942  void
1943  ip_stripoptions(struct mbuf *m)
1944  {
1945  	int datalen;
1946  	struct ip *ip = mtod(m, struct ip *);
1947  	caddr_t opts;
1948  	int optlen;
1949  
1950  	optlen = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof(struct ip);
1951  	opts = (caddr_t)(ip + 1);
1952  	datalen = m->m_len - (sizeof(struct ip) + optlen);
1953  	bcopy(opts + optlen, opts, datalen);
1954  	m->m_len -= optlen;
1955  	if (m->m_flags & M_PKTHDR)
1956  		m->m_pkthdr.len -= optlen;
1957  	/* leave ip version intact */
1958  	ip->ip_len = htons(ntohs(ip->ip_len) - optlen);
1959  	ip->ip_vhl = IP_MAKE_VHL(IP_VHL_V(ip->ip_vhl), sizeof(struct ip) >> 2);
1960  }
1961  
1962  u_char inetctlerrmap[PRC_NCMDS] = {
1963  	0,		0,		0,		0,
1964  	0,		EMSGSIZE,	EHOSTDOWN,	EHOSTUNREACH,
1965  	EHOSTUNREACH,	EHOSTUNREACH,	ECONNREFUSED,	ECONNREFUSED,
1966  	EMSGSIZE,	EHOSTUNREACH,	0,		0,
1967  	0,		0,		0,		0,
1968  	ENOPROTOOPT,	ECONNREFUSED
1969  };
1970  
1971  /*
1972   * Forward a packet.  If some error occurs return the sender
1973   * an icmp packet.  Note we can't always generate a meaningful
1974   * icmp message because icmp doesn't have a large enough repertoire
1975   * of codes and types.
1976   *
1977   * If not forwarding, just drop the packet.  This could be confusing
1978   * if ipforwarding was zero but some routing protocol was advancing
1979   * us as a gateway to somewhere.  However, we must let the routing
1980   * protocol deal with that.
1981   *
1982   * The using_srcrt parameter indicates whether the packet is being forwarded
1983   * via a source route.
1984   */
1985  void
1986  ip_forward(struct mbuf *m, boolean_t using_srcrt, struct sockaddr_in *next_hop)
1987  {
1988  	struct ip *ip = mtod(m, struct ip *);
1989  	struct rtentry *rt;
1990  	struct route fwd_ro;
1991  	int error, type = 0, code = 0, destmtu = 0;
1992  	struct mbuf *mcopy, *mtemp = NULL;
1993  	n_long dest;
1994  	struct in_addr pkt_dst;
1995  
1996  	dest = INADDR_ANY;
1997  	/*
1998  	 * Cache the destination address of the packet; this may be
1999  	 * changed by use of 'ipfw fwd'.
2000  	 */
2001  	pkt_dst = (next_hop != NULL) ? next_hop->sin_addr : ip->ip_dst;
2002  
2003  #ifdef DIAGNOSTIC
2004  	if (ipprintfs)
2005  		kprintf("forward: src %x dst %x ttl %x\n",
2006  		       ip->ip_src.s_addr, pkt_dst.s_addr, ip->ip_ttl);
2007  #endif
2008  
2009  	if ((m->m_flags & M_HASH) == 0) {
2010  		lwkt_port_t port;
2011  
2012  		m = ip_rehashm(m);
2013  		if (m == NULL)
2014  			return;
2015  
2016  		port = netisr_hashport(m->m_pkthdr.hash);
2017  
2018  		if (port != &curthread->td_msgport) {
2019  			ip_forward_redispatch(port, m, using_srcrt);
2020  			/* Requeued to other msgport; done */
2021  			return;
2022  		}
2023  	}
2024  	if (m->m_flags & (M_BCAST | M_MCAST) || !in_canforward(pkt_dst)) {
2025  		ipstat.ips_cantforward++;
2026  		m_freem(m);
2027  		return;
2028  	}
2029  	if (!ipstealth && ip->ip_ttl <= IPTTLDEC) {
2030  		icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, dest, 0);
2031  		return;
2032  	}
2033  
2034  	bzero(&fwd_ro, sizeof(fwd_ro));
2035  	ip_rtaddr(pkt_dst, &fwd_ro);
2036  	if (fwd_ro.ro_rt == NULL) {
2037  		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest, 0);
2038  		return;
2039  	}
2040  	rt = fwd_ro.ro_rt;
2041  
2042  	if (curthread->td_type == TD_TYPE_NETISR) {
2043  		/*
2044  		 * Save the IP header and at most 8 bytes of the payload,
2045  		 * in case we need to generate an ICMP message to the src.
2046  		 */
2047  		mtemp = ipforward_mtemp[mycpuid];
2048  		KASSERT((mtemp->m_flags & M_EXT) == 0 &&
2049  		    mtemp->m_data == mtemp->m_pktdat &&
2050  		    m_tag_first(mtemp) == NULL,
2051  		    ("ip_forward invalid mtemp1"));
2052  
2053  		if (!m_dup_pkthdr(mtemp, m, M_NOWAIT)) {
2054  			/*
2055  			 * It's probably ok if the pkthdr dup fails (because
2056  			 * the deep copy of the tag chain failed), but for now
2057  			 * be conservative and just discard the copy since
2058  			 * code below may some day want the tags.
2059  			 */
2060  			mtemp = NULL;
2061  		} else {
2062  			mtemp->m_type = m->m_type;
2063  			mtemp->m_len = imin((IP_VHL_HL(ip->ip_vhl) << 2) + 8,
2064  					    (int)ntohs(ip->ip_len));
2065  			mtemp->m_pkthdr.len = mtemp->m_len;
2066  			m_copydata(m, 0, mtemp->m_len, mtod(mtemp, void *));
2067  		}
2068  	}
2069  
2070  	if (!ipstealth)
2071  		ip->ip_ttl -= IPTTLDEC;
2072  
2073  	/*
2074  	 * If forwarding packet using same interface that it came in on,
2075  	 * perhaps should send a redirect to sender to shortcut a hop.
2076  	 * Only send redirect if source is sending directly to us,
2077  	 * and if packet was not source routed (or has any options).
2078  	 * Also, don't send redirect if forwarding using a default route
2079  	 * or a route modified by a redirect.
2080  	 */
2081  	if (rt->rt_ifp == m->m_pkthdr.rcvif &&
2082  	    !(rt->rt_flags & (RTF_DYNAMIC | RTF_MODIFIED)) &&
2083  	    satosin(rt_key(rt))->sin_addr.s_addr != INADDR_ANY &&
2084  	    ipsendredirects && !using_srcrt && next_hop == NULL) {
2085  		u_long src = ntohl(ip->ip_src.s_addr);
2086  		struct in_ifaddr *rt_ifa = (struct in_ifaddr *)rt->rt_ifa;
2087  
2088  		if (rt_ifa != NULL &&
2089  		    (src & rt_ifa->ia_subnetmask) == rt_ifa->ia_subnet) {
2090  			if (rt->rt_flags & RTF_GATEWAY)
2091  				dest = satosin(rt->rt_gateway)->sin_addr.s_addr;
2092  			else
2093  				dest = pkt_dst.s_addr;
2094  			/*
2095  			 * Router requirements says to only send
2096  			 * host redirects.
2097  			 */
2098  			type = ICMP_REDIRECT;
2099  			code = ICMP_REDIRECT_HOST;
2100  #ifdef DIAGNOSTIC
2101  			if (ipprintfs)
2102  				kprintf("redirect (%d) to %x\n", code, dest);
2103  #endif
2104  		}
2105  	}
2106  
2107  	error = ip_output(m, NULL, &fwd_ro, IP_FORWARDING, NULL, NULL);
2108  	if (error == 0) {
2109  		ipstat.ips_forward++;
2110  		if (type == 0) {
2111  			if (mtemp)
2112  				ipflow_create(&fwd_ro, mtemp);
2113  			goto done;
2114  		}
2115  		ipstat.ips_redirectsent++;
2116  	} else {
2117  		ipstat.ips_cantforward++;
2118  	}
2119  
2120  	if (mtemp == NULL)
2121  		goto done;
2122  
2123  	/*
2124  	 * Errors that do not require generating ICMP message
2125  	 */
2126  	switch (error) {
2127  	case ENOBUFS:
2128  		/*
2129  		 * A router should not generate ICMP_SOURCEQUENCH as
2130  		 * required in RFC1812 Requirements for IP Version 4 Routers.
2131  		 * Source quench could be a big problem under DoS attacks,
2132  		 * or if the underlying interface is rate-limited.
2133  		 * Those who need source quench packets may re-enable them
2134  		 * via the net.inet.ip.sendsourcequench sysctl.
2135  		 */
2136  		if (!ip_sendsourcequench)
2137  			goto done;
2138  		break;
2139  
2140  	case EACCES:			/* ipfw denied packet */
2141  		goto done;
2142  	}
2143  
2144  	KASSERT((mtemp->m_flags & M_EXT) == 0 &&
2145  	    mtemp->m_data == mtemp->m_pktdat,
2146  	    ("ip_forward invalid mtemp2"));
2147  	mcopy = m_copym(mtemp, 0, mtemp->m_len, M_NOWAIT);
2148  	if (mcopy == NULL)
2149  		goto done;
2150  
2151  	/*
2152  	 * Send ICMP message.
2153  	 */
2154  	switch (error) {
2155  	case 0:				/* forwarded, but need redirect */
2156  		/* type, code set above */
2157  		break;
2158  
2159  	case ENETUNREACH:		/* shouldn't happen, checked above */
2160  	case EHOSTUNREACH:
2161  	case ENETDOWN:
2162  	case EHOSTDOWN:
2163  	default:
2164  		type = ICMP_UNREACH;
2165  		code = ICMP_UNREACH_HOST;
2166  		break;
2167  
2168  	case EMSGSIZE:
2169  		type = ICMP_UNREACH;
2170  		code = ICMP_UNREACH_NEEDFRAG;
2171  		if (fwd_ro.ro_rt != NULL)
2172  			destmtu = fwd_ro.ro_rt->rt_ifp->if_mtu;
2173  		ipstat.ips_cantfrag++;
2174  		break;
2175  
2176  	case ENOBUFS:
2177  		type = ICMP_SOURCEQUENCH;
2178  		code = 0;
2179  		break;
2180  
2181  	case EACCES:			/* ipfw denied packet */
2182  		panic("ip_forward EACCES should not reach");
2183  	}
2184  	icmp_error(mcopy, type, code, dest, destmtu);
2185  done:
2186  	if (mtemp != NULL)
2187  		m_tag_delete_chain(mtemp);
2188  	if (fwd_ro.ro_rt != NULL)
2189  		RTFREE(fwd_ro.ro_rt);
2190  }
2191  
2192  void
2193  ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip,
2194  	       struct mbuf *m)
2195  {
2196  	if (inp->inp_socket->so_options & SO_TIMESTAMP) {
2197  		struct timeval tv;
2198  
2199  		microtime(&tv);
2200  		*mp = sbcreatecontrol(&tv, sizeof(tv),
2201  		    SCM_TIMESTAMP, SOL_SOCKET);
2202  		if (*mp)
2203  			mp = &(*mp)->m_next;
2204  	}
2205  	if (inp->inp_flags & INP_RECVDSTADDR) {
2206  		*mp = sbcreatecontrol(&ip->ip_dst, sizeof(struct in_addr),
2207  		    IP_RECVDSTADDR, IPPROTO_IP);
2208  		if (*mp)
2209  			mp = &(*mp)->m_next;
2210  	}
2211  	if (inp->inp_flags & INP_RECVTTL) {
2212  		*mp = sbcreatecontrol(&ip->ip_ttl, sizeof(u_char),
2213  		    IP_RECVTTL, IPPROTO_IP);
2214  		if (*mp)
2215  			mp = &(*mp)->m_next;
2216  	}
2217  	if (inp->inp_flags & INP_RECVTOS) {
2218  		*mp = sbcreatecontrol(&ip->ip_tos, sizeof(u_char),
2219  		    IP_RECVTOS, IPPROTO_IP);
2220  		if (*mp)
2221  			mp = &(*mp)->m_next;
2222  	}
2223  #ifdef notyet
2224  	/* XXX
2225  	 * Moving these out of udp_input() made them even more broken
2226  	 * than they already were.
2227  	 */
2228  	/* options were tossed already */
2229  	if (inp->inp_flags & INP_RECVOPTS) {
2230  		*mp = sbcreatecontrol(opts_deleted_above,
2231  		    sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP);
2232  		if (*mp)
2233  			mp = &(*mp)->m_next;
2234  	}
2235  	/* ip_srcroute doesn't do what we want here, need to fix */
2236  	if (inp->inp_flags & INP_RECVRETOPTS) {
2237  		*mp = sbcreatecontrol(ip_srcroute(m), sizeof(struct in_addr),
2238  		    IP_RECVRETOPTS, IPPROTO_IP);
2239  		if (*mp)
2240  			mp = &(*mp)->m_next;
2241  	}
2242  #endif
2243  	if (inp->inp_flags & INP_RECVIF) {
2244  		struct ifnet *ifp;
2245  		struct sdlbuf {
2246  			struct sockaddr_dl sdl;
2247  			u_char	pad[32];
2248  		} sdlbuf;
2249  		struct sockaddr_dl *sdp;
2250  		struct sockaddr_dl *sdl2 = &sdlbuf.sdl;
2251  
2252  		if (((ifp = m->m_pkthdr.rcvif)) &&
2253  		    ((ifp->if_index != 0) && (ifp->if_index <= if_index))) {
2254  			sdp = IF_LLSOCKADDR(ifp);
2255  			/*
2256  			 * Change our mind and don't try copy.
2257  			 */
2258  			if ((sdp->sdl_family != AF_LINK) ||
2259  			    (sdp->sdl_len > sizeof(sdlbuf))) {
2260  				goto makedummy;
2261  			}
2262  			bcopy(sdp, sdl2, sdp->sdl_len);
2263  		} else {
2264  makedummy:
2265  			sdl2->sdl_len =
2266  			    offsetof(struct sockaddr_dl, sdl_data[0]);
2267  			sdl2->sdl_family = AF_LINK;
2268  			sdl2->sdl_index = 0;
2269  			sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0;
2270  		}
2271  		*mp = sbcreatecontrol(sdl2, sdl2->sdl_len,
2272  		    IP_RECVIF, IPPROTO_IP);
2273  		if (*mp)
2274  			mp = &(*mp)->m_next;
2275  	}
2276  }
2277  
2278  /*
2279   * XXX these routines are called from the upper part of the kernel.
2280   *
2281   * They could also be moved to ip_mroute.c, since all the RSVP
2282   *  handling is done there already.
2283   */
2284  int
2285  ip_rsvp_init(struct socket *so)
2286  {
2287  	if (so->so_type != SOCK_RAW ||
2288  	    so->so_proto->pr_protocol != IPPROTO_RSVP)
2289  		return EOPNOTSUPP;
2290  
2291  	if (ip_rsvpd != NULL)
2292  		return EADDRINUSE;
2293  
2294  	ip_rsvpd = so;
2295  	/*
2296  	 * This may seem silly, but we need to be sure we don't over-increment
2297  	 * the RSVP counter, in case something slips up.
2298  	 */
2299  	if (!ip_rsvp_on) {
2300  		ip_rsvp_on = 1;
2301  		rsvp_on++;
2302  	}
2303  
2304  	return 0;
2305  }
2306  
2307  int
2308  ip_rsvp_done(void)
2309  {
2310  	ip_rsvpd = NULL;
2311  	/*
2312  	 * This may seem silly, but we need to be sure we don't over-decrement
2313  	 * the RSVP counter, in case something slips up.
2314  	 */
2315  	if (ip_rsvp_on) {
2316  		ip_rsvp_on = 0;
2317  		rsvp_on--;
2318  	}
2319  	return 0;
2320  }
2321  
2322  int
2323  rsvp_input(struct mbuf **mp, int *offp, int proto)
2324  {
2325  	struct mbuf *m = *mp;
2326  
2327  	*mp = NULL;
2328  
2329  	if (rsvp_input_p) { /* call the real one if loaded */
2330  		*mp = m;
2331  		rsvp_input_p(mp, offp, proto);
2332  		return(IPPROTO_DONE);
2333  	}
2334  
2335  	/* Can still get packets with rsvp_on = 0 if there is a local member
2336  	 * of the group to which the RSVP packet is addressed.  But in this
2337  	 * case we want to throw the packet away.
2338  	 */
2339  
2340  	if (!rsvp_on) {
2341  		m_freem(m);
2342  		return(IPPROTO_DONE);
2343  	}
2344  
2345  	if (ip_rsvpd != NULL) {
2346  		*mp = m;
2347  		rip_input(mp, offp, proto);
2348  		return(IPPROTO_DONE);
2349  	}
2350  	/* Drop the packet */
2351  	m_freem(m);
2352  	return(IPPROTO_DONE);
2353  }
2354