xref: /openbsd-src/sys/netinet/ip_output.c (revision 8445c53715e7030056b779e8ab40efb7820981f2)
1 /*	$OpenBSD: ip_output.c,v 1.137 2001/08/26 21:12:06 niklas Exp $	*/
2 /*	$NetBSD: ip_output.c,v 1.28 1996/02/13 23:43:07 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1988, 1990, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. All advertising materials mentioning features or use of this software
17  *    must display the following acknowledgement:
18  *	This product includes software developed by the University of
19  *	California, Berkeley and its contributors.
20  * 4. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
37  */
38 
39 #include "pf.h"
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/mbuf.h>
44 #include <sys/protosw.h>
45 #include <sys/socket.h>
46 #include <sys/socketvar.h>
47 #include <sys/proc.h>
48 #include <sys/kernel.h>
49 
50 #include <net/if.h>
51 #include <net/if_enc.h>
52 #include <net/route.h>
53 
54 #include <netinet/in.h>
55 #include <netinet/in_systm.h>
56 #include <netinet/ip.h>
57 #include <netinet/in_pcb.h>
58 #include <netinet/in_var.h>
59 #include <netinet/ip_var.h>
60 #include <netinet/ip_icmp.h>
61 #include <netinet/tcp.h>
62 #include <netinet/udp.h>
63 #include <netinet/tcp_timer.h>
64 #include <netinet/tcp_var.h>
65 #include <netinet/udp_var.h>
66 
67 #if NPF > 0
68 #include <net/pfvar.h>
69 #endif
70 
71 #ifdef vax
72 #include <machine/mtpr.h>
73 #endif
74 
75 #ifdef IPSEC
76 #ifdef ENCDEBUG
77 #define DPRINTF(x)    do { if (encdebug) printf x ; } while (0)
78 #else
79 #define DPRINTF(x)
80 #endif
81 
82 extern u_int8_t get_sa_require  __P((struct inpcb *));
83 
84 extern int ipsec_auth_default_level;
85 extern int ipsec_esp_trans_default_level;
86 extern int ipsec_esp_network_default_level;
87 extern int ipsec_ipcomp_default_level;
88 #endif /* IPSEC */
89 
90 static struct mbuf *ip_insertoptions __P((struct mbuf *, struct mbuf *, int *));
91 static void ip_mloopback
92 	__P((struct ifnet *, struct mbuf *, struct sockaddr_in *));
93 
94 /*
95  * IP output.  The packet in mbuf chain m contains a skeletal IP
96  * header (with len, off, ttl, proto, tos, src, dst).
97  * The mbuf chain containing the packet will be freed.
98  * The mbuf opt, if present, will not be freed.
99  */
100 int
101 #if __STDC__
102 ip_output(struct mbuf *m0, ...)
103 #else
104 ip_output(m0, va_alist)
105 	struct mbuf *m0;
106 	va_dcl
107 #endif
108 {
109 	register struct ip *ip, *mhip;
110 	register struct ifnet *ifp;
111 	struct mbuf *m = m0;
112 	register int hlen = sizeof (struct ip);
113 	int len, off, error = 0;
114 	struct route iproute;
115 	struct sockaddr_in *dst;
116 	struct in_ifaddr *ia;
117 	struct mbuf *opt;
118 	struct route *ro;
119 	int flags;
120 	struct ip_moptions *imo;
121 	va_list ap;
122 	u_int8_t sproto = 0, donerouting = 0;
123 #ifdef IPSEC
124 	u_int32_t icmp_mtu = 0;
125 	union sockaddr_union sdst;
126 	u_int32_t sspi;
127 	struct m_tag *mtag;
128 	struct tdb_ident *tdbi;
129 
130 	struct inpcb *inp;
131 	struct tdb *tdb;
132 	int s;
133 #endif /* IPSEC */
134 
135 	va_start(ap, m0);
136 	opt = va_arg(ap, struct mbuf *);
137 	ro = va_arg(ap, struct route *);
138 	flags = va_arg(ap, int);
139 	imo = va_arg(ap, struct ip_moptions *);
140 #ifdef IPSEC
141 	inp = va_arg(ap, struct inpcb *);
142 	if (inp && (inp->inp_flags & INP_IPV6) != 0)
143 		panic("ip_output: IPv6 pcb is passed");
144 #endif /* IPSEC */
145 	va_end(ap);
146 
147 #ifdef	DIAGNOSTIC
148 	if ((m->m_flags & M_PKTHDR) == 0)
149 		panic("ip_output no HDR");
150 #endif
151 	if (opt) {
152 		m = ip_insertoptions(m, opt, &len);
153 		hlen = len;
154 	}
155 
156 	ip = mtod(m, struct ip *);
157 
158 	/*
159 	 * Fill in IP header.
160 	 */
161 	if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
162 		ip->ip_v = IPVERSION;
163 		ip->ip_off &= IP_DF;
164 		ip->ip_id = htons(ip_randomid());
165 		ip->ip_hl = hlen >> 2;
166 		ipstat.ips_localout++;
167 	} else {
168 		hlen = ip->ip_hl << 2;
169 	}
170 
171 	/*
172 	 * If we're missing the IP source address, do a route lookup. We'll
173 	 * remember this result, in case we don't need to do any IPsec
174 	 * processing on the packet. We need the source address so we can
175 	 * do an SPD lookup in IPsec; for most packets, the source address
176 	 * is set at a higher level protocol. ICMPs and other packets
177 	 * though (e.g., traceroute) have a source address of zeroes.
178 	 */
179 	if (ip->ip_src.s_addr == INADDR_ANY) {
180 	        donerouting = 1;
181 
182 	        if (ro == 0) {
183 		        ro = &iproute;
184 			bzero((caddr_t)ro, sizeof (*ro));
185 		}
186 
187 		dst = satosin(&ro->ro_dst);
188 
189 		/*
190 		 * If there is a cached route, check that it is to the same
191 		 * destination and is still up.  If not, free it and try again.
192 		 */
193 		if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
194 				  dst->sin_addr.s_addr != ip->ip_dst.s_addr)) {
195 		        RTFREE(ro->ro_rt);
196 			ro->ro_rt = (struct rtentry *)0;
197 		}
198 
199 		if (ro->ro_rt == 0) {
200 		        dst->sin_family = AF_INET;
201 			dst->sin_len = sizeof(*dst);
202 			dst->sin_addr = ip->ip_dst;
203 		}
204 
205 		/*
206 		 * If routing to interface only, short-circuit routing lookup.
207 		 */
208 		if (flags & IP_ROUTETOIF) {
209 		        if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == 0 &&
210 			    (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == 0) {
211 			    ipstat.ips_noroute++;
212 			    error = ENETUNREACH;
213 			    goto bad;
214 			}
215 
216 			ifp = ia->ia_ifp;
217 			ip->ip_ttl = 1;
218 		} else {
219 		        if (ro->ro_rt == 0)
220 			        rtalloc(ro);
221 
222 			if (ro->ro_rt == 0) {
223 			        ipstat.ips_noroute++;
224 				error = EHOSTUNREACH;
225 				goto bad;
226 			}
227 
228 			ia = ifatoia(ro->ro_rt->rt_ifa);
229 			ifp = ro->ro_rt->rt_ifp;
230 			ro->ro_rt->rt_use++;
231 
232 			if (ro->ro_rt->rt_flags & RTF_GATEWAY)
233 			        dst = satosin(ro->ro_rt->rt_gateway);
234 		}
235 
236 		/* Set the source IP address */
237                 if (!IN_MULTICAST(ip->ip_dst.s_addr))
238 		        ip->ip_src = ia->ia_addr.sin_addr;
239 	}
240 
241 #ifdef IPSEC
242 	/*
243 	 * splnet is chosen over spltdb because we are not allowed to
244 	 * lower the level, and udp_output calls us in splnet().
245 	 */
246 	s = splnet();
247 
248 	/* Do we have any pending SAs to apply ? */
249 	mtag = m_tag_find(m, PACKET_TAG_IPSEC_PENDING_TDB, NULL);
250 	if (mtag != NULL) {
251 #ifdef DIAGNOSTIC
252 		if (mtag->m_tag_len != sizeof (struct tdb_ident))
253 			panic("ip_output: tag of length %d (should be %d",
254 			    mtag->m_tag_len, sizeof (struct tdb_ident));
255 #endif
256 		tdbi = (struct tdb_ident *)(mtag + 1);
257 		tdb = gettdb(tdbi->spi, &tdbi->dst, tdbi->proto);
258 		if (tdb == NULL)
259 			error = -EINVAL;
260 		m_tag_delete(m, mtag);
261 	}
262 	else
263 		tdb = ipsp_spd_lookup(m, AF_INET, hlen, &error,
264 		    IPSP_DIRECTION_OUT, NULL, inp);
265 
266 	if (tdb == NULL) {
267 	        splx(s);
268 
269 		if (error == 0) {
270 		        /*
271 			 * No IPsec processing required, we'll just send the
272 			 * packet out.
273 			 */
274 		        sproto = 0;
275 
276 			/* Fall through to routing/multicast handling */
277 		} else {
278 		        /*
279 			 * -EINVAL is used to indicate that the packet should
280 			 * be silently dropped, typically because we've asked
281 			 * key management for an SA.
282 			 */
283 		        if (error == -EINVAL) /* Should silently drop packet */
284 			  error = 0;
285 
286 			m_freem(m);
287 			goto done;
288 		}
289 	} else {
290 		/* Loop detection */
291 		for (mtag = m_tag_first(m); mtag != NULL;
292 		    mtag = m_tag_next(m, mtag)) {
293 			if (mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_DONE &&
294 			    mtag->m_tag_id !=
295 			    PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED)
296 				continue;
297 			tdbi = (struct tdb_ident *)(mtag + 1);
298 			if (tdbi->spi == tdb->tdb_spi &&
299 			    tdbi->proto == tdb->tdb_sproto &&
300 			    !bcmp(&tdbi->dst, &tdb->tdb_dst,
301 			    sizeof(union sockaddr_union))) {
302 				splx(s);
303 				sproto = 0; /* mark as no-IPsec-needed */
304 				goto done_spd;
305 			}
306 		}
307 
308 	        /* We need to do IPsec */
309 	        bcopy(&tdb->tdb_dst, &sdst, sizeof(sdst));
310 		sspi = tdb->tdb_spi;
311 		sproto = tdb->tdb_sproto;
312 		splx(s);
313 
314 		/*
315 		 * If it needs TCP/UDP hardware-checksumming, do the
316 		 * computation now.
317 		 */
318 		if (m->m_pkthdr.csum & (M_TCPV4_CSUM_OUT | M_UDPV4_CSUM_OUT)) {
319 			in_delayed_cksum(m);
320 			m->m_pkthdr.csum &=
321 			    ~(M_UDPV4_CSUM_OUT | M_TCPV4_CSUM_OUT);
322 		}
323 
324 		/* If it's not a multicast packet, try to fast-path */
325 		if (!IN_MULTICAST(ip->ip_dst.s_addr)) {
326 			goto sendit;
327 		}
328 	}
329 
330 	/* Fall through to the routing/multicast handling code */
331  done_spd:
332 #endif /* IPSEC */
333 
334 	if (donerouting == 0) {
335 	        if (ro == 0) {
336 		        ro = &iproute;
337 			bzero((caddr_t)ro, sizeof (*ro));
338 		}
339 
340 		dst = satosin(&ro->ro_dst);
341 
342 		/*
343 		 * If there is a cached route, check that it is to the same
344 		 * destination and is still up.  If not, free it and try again.
345 		 */
346 		if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
347 				  dst->sin_addr.s_addr != ip->ip_dst.s_addr)) {
348 		        RTFREE(ro->ro_rt);
349 			ro->ro_rt = (struct rtentry *)0;
350 		}
351 
352 		if (ro->ro_rt == 0) {
353 		        dst->sin_family = AF_INET;
354 			dst->sin_len = sizeof(*dst);
355 			dst->sin_addr = ip->ip_dst;
356 		}
357 
358 		/*
359 		 * If routing to interface only, short-circuit routing lookup.
360 		 */
361 		if (flags & IP_ROUTETOIF) {
362 		        if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == 0 &&
363 			    (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == 0) {
364 			    ipstat.ips_noroute++;
365 			    error = ENETUNREACH;
366 			    goto bad;
367 			}
368 
369 			ifp = ia->ia_ifp;
370 			ip->ip_ttl = 1;
371 		} else {
372 		        if (ro->ro_rt == 0)
373 			        rtalloc(ro);
374 
375 			if (ro->ro_rt == 0) {
376 			        ipstat.ips_noroute++;
377 				error = EHOSTUNREACH;
378 				goto bad;
379 			}
380 
381 			ia = ifatoia(ro->ro_rt->rt_ifa);
382 			ifp = ro->ro_rt->rt_ifp;
383 			ro->ro_rt->rt_use++;
384 
385 			if (ro->ro_rt->rt_flags & RTF_GATEWAY)
386 			        dst = satosin(ro->ro_rt->rt_gateway);
387 		}
388 
389 		/* Set the source IP address */
390 		if (ip->ip_src.s_addr == INADDR_ANY)
391 			ip->ip_src = ia->ia_addr.sin_addr;
392 	}
393 
394 	if (IN_MULTICAST(ip->ip_dst.s_addr) ||
395 	    (ip->ip_dst.s_addr == INADDR_BROADCAST)) {
396 		struct in_multi *inm;
397 
398 		m->m_flags |= (ip->ip_dst.s_addr == INADDR_BROADCAST) ?
399 			M_BCAST : M_MCAST;
400 
401 		/*
402 		 * IP destination address is multicast.  Make sure "dst"
403 		 * still points to the address in "ro".  (It may have been
404 		 * changed to point to a gateway address, above.)
405 		 */
406 		dst = satosin(&ro->ro_dst);
407 
408 		/*
409 		 * See if the caller provided any multicast options
410 		 */
411 		if (imo != NULL) {
412 			ip->ip_ttl = imo->imo_multicast_ttl;
413 			if (imo->imo_multicast_ifp != NULL)
414 				ifp = imo->imo_multicast_ifp;
415 		} else
416 			ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
417 
418 		/*
419 		 * Confirm that the outgoing interface supports multicast,
420 		 * but only if the packet actually is going out on that
421 		 * interface (i.e., no IPsec is applied).
422 		 */
423 		if ((((m->m_flags & M_MCAST) &&
424 		      (ifp->if_flags & IFF_MULTICAST) == 0) ||
425 		     ((m->m_flags & M_BCAST) &&
426 		      (ifp->if_flags & IFF_BROADCAST) == 0)) && (sproto == 0))  {
427 			ipstat.ips_noroute++;
428 			error = ENETUNREACH;
429 			goto bad;
430 		}
431 
432 		/*
433 		 * If source address not specified yet, use address
434 		 * of outgoing interface.
435 		 */
436 		if (ip->ip_src.s_addr == INADDR_ANY) {
437 			register struct in_ifaddr *ia;
438 
439 			for (ia = in_ifaddr.tqh_first;
440 			     ia;
441 			     ia = ia->ia_list.tqe_next)
442 				if (ia->ia_ifp == ifp) {
443 					ip->ip_src = ia->ia_addr.sin_addr;
444 					break;
445 				}
446 		}
447 
448 		IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm);
449 		if (inm != NULL &&
450 		   (imo == NULL || imo->imo_multicast_loop)) {
451 			/*
452 			 * If we belong to the destination multicast group
453 			 * on the outgoing interface, and the caller did not
454 			 * forbid loopback, loop back a copy.
455 			 */
456 			ip_mloopback(ifp, m, dst);
457 		}
458 #ifdef MROUTING
459 		else {
460 			/*
461 			 * If we are acting as a multicast router, perform
462 			 * multicast forwarding as if the packet had just
463 			 * arrived on the interface to which we are about
464 			 * to send.  The multicast forwarding function
465 			 * recursively calls this function, using the
466 			 * IP_FORWARDING flag to prevent infinite recursion.
467 			 *
468 			 * Multicasts that are looped back by ip_mloopback(),
469 			 * above, will be forwarded by the ip_input() routine,
470 			 * if necessary.
471 			 */
472 			extern struct socket *ip_mrouter;
473 
474 			if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
475 				if (ip_mforward(m, ifp) != 0) {
476 					m_freem(m);
477 					goto done;
478 				}
479 			}
480 		}
481 #endif
482 		/*
483 		 * Multicasts with a time-to-live of zero may be looped-
484 		 * back, above, but must not be transmitted on a network.
485 		 * Also, multicasts addressed to the loopback interface
486 		 * are not sent -- the above call to ip_mloopback() will
487 		 * loop back a copy if this host actually belongs to the
488 		 * destination group on the loopback interface.
489 		 */
490 		if (ip->ip_ttl == 0 || (ifp->if_flags & IFF_LOOPBACK) != 0) {
491 			m_freem(m);
492 			goto done;
493 		}
494 
495 		goto sendit;
496 	}
497 
498 	/*
499 	 * Look for broadcast address and and verify user is allowed to send
500 	 * such a packet; if the packet is going in an IPsec tunnel, skip
501 	 * this check.
502 	 */
503 	if ((sproto == 0) && (in_broadcast(dst->sin_addr, ifp))) {
504 		if ((ifp->if_flags & IFF_BROADCAST) == 0) {
505 			error = EADDRNOTAVAIL;
506 			goto bad;
507 		}
508 		if ((flags & IP_ALLOWBROADCAST) == 0) {
509 			error = EACCES;
510 			goto bad;
511 		}
512 
513 		/* Don't allow broadcast messages to be fragmented */
514 		if ((u_int16_t)ip->ip_len > ifp->if_mtu) {
515 			error = EMSGSIZE;
516 			goto bad;
517 		}
518 		m->m_flags |= M_BCAST;
519 	} else
520 		m->m_flags &= ~M_BCAST;
521 
522 sendit:
523         /*
524          * If we're doing Path MTU discovery, we need to set DF unless
525          * the route's MTU is locked.
526 	 */
527 	if ((flags & IP_MTUDISC) && ro && ro->ro_rt &&
528 	    (ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) == 0)
529 		ip->ip_off |= IP_DF;
530 
531 #ifdef IPSEC
532 	/*
533 	 * Check if the packet needs encapsulation.
534 	 */
535 	if (sproto != 0) {
536 	        s = splnet();
537 
538 		/*
539 		 * Packet filter
540 		 */
541 #if NPF > 0
542 
543 		if (pf_test(PF_OUT, &encif[0].sc_if, &m) != PF_PASS) {
544 			error = EHOSTUNREACH;
545 			splx(s);
546 			m_freem(m);
547 			goto done;
548 		}
549 		ip = mtod(m, struct ip *);
550 		hlen = ip->ip_hl << 2;
551 #endif
552 
553 		tdb = gettdb(sspi, &sdst, sproto);
554 		if (tdb == NULL) {
555 			error = EHOSTUNREACH;
556 			splx(s);
557 			m_freem(m);
558 			goto done;
559 		}
560 
561 		/* Latch to PCB */
562 		if (inp)
563 		        tdb_add_inp(tdb, inp, 0);
564 
565 		/* Check if we are allowed to fragment */
566 		if ((ip->ip_off & IP_DF) && tdb->tdb_mtu &&
567 		    (u_int16_t)ip->ip_len > tdb->tdb_mtu &&
568 		    tdb->tdb_mtutimeout > time.tv_sec) {
569 			struct rtentry *rt = NULL;
570 
571 			icmp_mtu = tdb->tdb_mtu;
572 			splx(s);
573 
574 			/* Find a host route to store the mtu in */
575 			if (ro != NULL)
576 				rt = ro->ro_rt;
577 			if (rt == NULL || (rt->rt_flags & RTF_HOST) == 0) {
578 				struct sockaddr_in dst = {
579 					sizeof(struct sockaddr_in), AF_INET};
580 				dst.sin_addr = ip->ip_dst;
581 				rt = icmp_mtudisc_clone((struct sockaddr *)&dst);
582 			}
583 			if (rt != NULL) {
584 				rt->rt_rmx.rmx_mtu = icmp_mtu;
585 				if (ro && ro->ro_rt != NULL) {
586 					RTFREE(ro->ro_rt);
587 					ro->ro_rt = (struct rtentry *) 0;
588 					rtalloc(ro);
589 				}
590 			}
591 			error = EMSGSIZE;
592 			goto bad;
593 		}
594 
595 		/* Massage the IP header for use by the IPsec code */
596 		ip->ip_len = htons((u_short) ip->ip_len);
597 		ip->ip_off = htons((u_short) ip->ip_off);
598 
599 		/*
600 		 * Clear these -- they'll be set in the recursive invocation
601 		 * as needed.
602 		 */
603 		m->m_flags &= ~(M_MCAST | M_BCAST);
604 
605 		/* Callee frees mbuf */
606 		error = ipsp_process_packet(m, tdb, AF_INET, 0);
607 		splx(s);
608 		return error;  /* Nothing more to be done */
609 	}
610 
611 	/*
612 	 * If deferred crypto processing is needed, check that the
613 	 * interface supports it.
614 	 */
615 	if ((mtag = m_tag_find(m, PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED, NULL))
616 	    != NULL && (ifp->if_capabilities & IFCAP_IPSEC) == 0) {
617 		/* Notify IPsec to do its own crypto. */
618 		ipsp_skipcrypto_unmark((struct tdb_ident *)(mtag + 1));
619 		m_freem(m);
620 		error = EHOSTUNREACH;
621 		goto done;
622 	}
623 #endif /* IPSEC */
624 
625 	/* Catch routing changes wrt. hardware checksumming for TCP or UDP. */
626 	if (m->m_pkthdr.csum & M_TCPV4_CSUM_OUT) {
627 		if (!(ifp->if_capabilities & IFCAP_CSUM_TCPv4) ||
628 		    ifp->if_bridge != NULL) {
629 			in_delayed_cksum(m);
630 			m->m_pkthdr.csum &= ~M_TCPV4_CSUM_OUT; /* Clear */
631 		}
632 	} else if (m->m_pkthdr.csum & M_UDPV4_CSUM_OUT) {
633 		if (!(ifp->if_capabilities & IFCAP_CSUM_UDPv4) ||
634 		    ifp->if_bridge != NULL) {
635 			in_delayed_cksum(m);
636 			m->m_pkthdr.csum &= ~M_UDPV4_CSUM_OUT; /* Clear */
637 		}
638 	}
639 
640 	/*
641 	 * Packet filter
642 	 */
643 #if NPF > 0
644 	if (pf_test(PF_OUT, ifp, &m) != PF_PASS) {
645 		error = EHOSTUNREACH;
646 		m_freem(m);
647 		goto done;
648 	}
649 	ip = mtod(m, struct ip *);
650 	hlen = ip->ip_hl << 2;
651 #endif
652 
653 	/*
654 	 * If small enough for interface, can just send directly.
655 	 */
656 	if ((u_int16_t)ip->ip_len <= ifp->if_mtu) {
657 		ip->ip_len = htons((u_int16_t)ip->ip_len);
658 		ip->ip_off = htons((u_int16_t)ip->ip_off);
659 		if ((ifp->if_capabilities & IFCAP_CSUM_IPv4) &&
660 		    ifp->if_bridge == NULL) {
661 			m->m_pkthdr.csum |= M_IPV4_CSUM_OUT;
662 			ipstat.ips_outhwcsum++;
663 		} else {
664 			ip->ip_sum = 0;
665 			ip->ip_sum = in_cksum(m, hlen);
666 		}
667 		/* Update relevant hardware checksum stats for TCP/UDP */
668 		if (m->m_pkthdr.csum & M_TCPV4_CSUM_OUT)
669 			tcpstat.tcps_outhwcsum++;
670 		else if (m->m_pkthdr.csum & M_UDPV4_CSUM_OUT)
671 			udpstat.udps_outhwcsum++;
672 		error = (*ifp->if_output)(ifp, m, sintosa(dst), ro->ro_rt);
673 		goto done;
674 	}
675 
676 	/*
677 	 * Too large for interface; fragment if possible.
678 	 * Must be able to put at least 8 bytes per fragment.
679 	 */
680 	if (ip->ip_off & IP_DF) {
681 #ifdef IPSEC
682 		icmp_mtu = ifp->if_mtu;
683 #endif
684 		error = EMSGSIZE;
685 		/*
686 		 * This case can happen if the user changed the MTU
687 		 * of an interface after enabling IP on it.  Because
688 		 * most netifs don't keep track of routes pointing to
689 		 * them, there is no way for one to update all its
690 		 * routes when the MTU is changed.
691 		 */
692 		if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST))
693 		    && !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU)
694 		    && (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) {
695 			ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu;
696 		}
697 		ipstat.ips_cantfrag++;
698 		goto bad;
699 	}
700 	len = (ifp->if_mtu - hlen) &~ 7;
701 	if (len < 8) {
702 		error = EMSGSIZE;
703 		goto bad;
704 	}
705 
706 	/*
707 	 * If we are doing fragmentation, we can't defer TCP/UDP
708 	 * checksumming; compute the checksum and clear the flag.
709 	 */
710 	if (m->m_pkthdr.csum & (M_TCPV4_CSUM_OUT | M_UDPV4_CSUM_OUT)) {
711 		in_delayed_cksum(m);
712 		m->m_pkthdr.csum &= ~(M_UDPV4_CSUM_OUT | M_TCPV4_CSUM_OUT);
713 	}
714 
715     {
716 	int mhlen, firstlen = len;
717 	struct mbuf **mnext = &m->m_nextpkt;
718 
719 	/*
720 	 * Loop through length of segment after first fragment,
721 	 * make new header and copy data of each part and link onto chain.
722 	 */
723 	m0 = m;
724 	mhlen = sizeof (struct ip);
725 	for (off = hlen + len; off < (u_int16_t)ip->ip_len; off += len) {
726 		MGETHDR(m, M_DONTWAIT, MT_HEADER);
727 		if (m == 0) {
728 			error = ENOBUFS;
729 			ipstat.ips_odropped++;
730 			goto sendorfree;
731 		}
732 		*mnext = m;
733 		mnext = &m->m_nextpkt;
734 		m->m_data += max_linkhdr;
735 		mhip = mtod(m, struct ip *);
736 		*mhip = *ip;
737 		/* we must inherit MCAST and BCAST flags */
738 		m->m_flags |= m0->m_flags & (M_MCAST|M_BCAST);
739 		if (hlen > sizeof (struct ip)) {
740 			mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
741 			mhip->ip_hl = mhlen >> 2;
742 		}
743 		m->m_len = mhlen;
744 		mhip->ip_off = ((off - hlen) >> 3) + (ip->ip_off & ~IP_MF);
745 		if (ip->ip_off & IP_MF)
746 			mhip->ip_off |= IP_MF;
747 		if (off + len >= (u_int16_t)ip->ip_len)
748 			len = (u_int16_t)ip->ip_len - off;
749 		else
750 			mhip->ip_off |= IP_MF;
751 		mhip->ip_len = htons((u_int16_t)(len + mhlen));
752 		m->m_next = m_copy(m0, off, len);
753 		if (m->m_next == 0) {
754 			error = ENOBUFS;	/* ??? */
755 			ipstat.ips_odropped++;
756 			goto sendorfree;
757 		}
758 		m->m_pkthdr.len = mhlen + len;
759 		m->m_pkthdr.rcvif = (struct ifnet *)0;
760 		mhip->ip_off = htons((u_int16_t)mhip->ip_off);
761 		if ((ifp->if_capabilities & IFCAP_CSUM_IPv4) &&
762 		    ifp->if_bridge == NULL) {
763 			m->m_pkthdr.csum |= M_IPV4_CSUM_OUT;
764 			ipstat.ips_outhwcsum++;
765 		} else {
766 			mhip->ip_sum = 0;
767 			mhip->ip_sum = in_cksum(m, mhlen);
768 		}
769 		ipstat.ips_ofragments++;
770 	}
771 	/*
772 	 * Update first fragment by trimming what's been copied out
773 	 * and updating header, then send each fragment (in order).
774 	 */
775 	m = m0;
776 	m_adj(m, hlen + firstlen - (u_int16_t)ip->ip_len);
777 	m->m_pkthdr.len = hlen + firstlen;
778 	ip->ip_len = htons((u_int16_t)m->m_pkthdr.len);
779 	ip->ip_off = htons((u_int16_t)(ip->ip_off | IP_MF));
780 	if ((ifp->if_capabilities & IFCAP_CSUM_IPv4) &&
781 	    ifp->if_bridge == NULL) {
782 		m->m_pkthdr.csum |= M_IPV4_CSUM_OUT;
783 		ipstat.ips_outhwcsum++;
784 	} else {
785 		ip->ip_sum = 0;
786 		ip->ip_sum = in_cksum(m, hlen);
787 	}
788 sendorfree:
789 	for (m = m0; m; m = m0) {
790 		m0 = m->m_nextpkt;
791 		m->m_nextpkt = 0;
792 		if (error == 0)
793 			error = (*ifp->if_output)(ifp, m, sintosa(dst),
794 			    ro->ro_rt);
795 		else
796 			m_freem(m);
797 	}
798 
799 	if (error == 0)
800 		ipstat.ips_fragmented++;
801     }
802 done:
803 	if (ro == &iproute && (flags & IP_ROUTETOIF) == 0 && ro->ro_rt)
804 		RTFREE(ro->ro_rt);
805 	return (error);
806 bad:
807 #ifdef IPSEC
808 	if (error == EMSGSIZE && icmp_mtu != 0)
809 		ipsec_adjust_mtu(m, icmp_mtu);
810 #endif
811 	m_freem(m0);
812 	goto done;
813 }
814 
815 /*
816  * Insert IP options into preformed packet.
817  * Adjust IP destination as required for IP source routing,
818  * as indicated by a non-zero in_addr at the start of the options.
819  */
820 static struct mbuf *
821 ip_insertoptions(m, opt, phlen)
822 	register struct mbuf *m;
823 	struct mbuf *opt;
824 	int *phlen;
825 {
826 	register struct ipoption *p = mtod(opt, struct ipoption *);
827 	struct mbuf *n;
828 	register struct ip *ip = mtod(m, struct ip *);
829 	unsigned optlen;
830 
831 	optlen = opt->m_len - sizeof(p->ipopt_dst);
832 	if (optlen + (u_int16_t)ip->ip_len > IP_MAXPACKET)
833 		return (m);		/* XXX should fail */
834 	if (p->ipopt_dst.s_addr)
835 		ip->ip_dst = p->ipopt_dst;
836 	if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) {
837 		MGETHDR(n, M_DONTWAIT, MT_HEADER);
838 		if (n == 0)
839 			return (m);
840 		M_MOVE_HDR(n, m);
841 		n->m_pkthdr.len += optlen;
842 		m->m_len -= sizeof(struct ip);
843 		m->m_data += sizeof(struct ip);
844 		n->m_next = m;
845 		m = n;
846 		m->m_len = optlen + sizeof(struct ip);
847 		m->m_data += max_linkhdr;
848 		bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
849 	} else {
850 		m->m_data -= optlen;
851 		m->m_len += optlen;
852 		m->m_pkthdr.len += optlen;
853 		ovbcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
854 	}
855 	ip = mtod(m, struct ip *);
856 	bcopy((caddr_t)p->ipopt_list, (caddr_t)(ip + 1), (unsigned)optlen);
857 	*phlen = sizeof(struct ip) + optlen;
858 	ip->ip_len += optlen;
859 	return (m);
860 }
861 
862 /*
863  * Copy options from ip to jp,
864  * omitting those not copied during fragmentation.
865  */
866 int
867 ip_optcopy(ip, jp)
868 	struct ip *ip, *jp;
869 {
870 	register u_char *cp, *dp;
871 	int opt, optlen, cnt;
872 
873 	cp = (u_char *)(ip + 1);
874 	dp = (u_char *)(jp + 1);
875 	cnt = (ip->ip_hl << 2) - sizeof (struct ip);
876 	for (; cnt > 0; cnt -= optlen, cp += optlen) {
877 		opt = cp[0];
878 		if (opt == IPOPT_EOL)
879 			break;
880 		if (opt == IPOPT_NOP) {
881 			/* Preserve for IP mcast tunnel's LSRR alignment. */
882 			*dp++ = IPOPT_NOP;
883 			optlen = 1;
884 			continue;
885 		}
886 #ifdef DIAGNOSTIC
887 		if (cnt < IPOPT_OLEN + sizeof(*cp))
888 			panic("malformed IPv4 option passed to ip_optcopy");
889 #endif
890 		optlen = cp[IPOPT_OLEN];
891 #ifdef DIAGNOSTIC
892 		if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt)
893 			panic("malformed IPv4 option passed to ip_optcopy");
894 #endif
895 		/* bogus lengths should have been caught by ip_dooptions */
896 		if (optlen > cnt)
897 			optlen = cnt;
898 		if (IPOPT_COPIED(opt)) {
899 			bcopy((caddr_t)cp, (caddr_t)dp, (unsigned)optlen);
900 			dp += optlen;
901 		}
902 	}
903 	for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++)
904 		*dp++ = IPOPT_EOL;
905 	return (optlen);
906 }
907 
908 /*
909  * IP socket option processing.
910  */
911 int
912 ip_ctloutput(op, so, level, optname, mp)
913 	int op;
914 	struct socket *so;
915 	int level, optname;
916 	struct mbuf **mp;
917 {
918 	register struct inpcb *inp = sotoinpcb(so);
919 	register struct mbuf *m = *mp;
920 	register int optval = 0;
921 #ifdef IPSEC
922 	struct proc *p = curproc; /* XXX */
923 	struct ipsec_ref *ipr;
924 	u_int16_t opt16val;
925 #endif
926 	int error = 0;
927 
928 	if (level != IPPROTO_IP) {
929 		error = EINVAL;
930 		if (op == PRCO_SETOPT && *mp)
931 			(void) m_free(*mp);
932 	} else switch (op) {
933 	case PRCO_SETOPT:
934 		switch (optname) {
935 		case IP_OPTIONS:
936 #ifdef notyet
937 		case IP_RETOPTS:
938 			return (ip_pcbopts(optname, &inp->inp_options, m));
939 #else
940 			return (ip_pcbopts(&inp->inp_options, m));
941 #endif
942 
943 		case IP_TOS:
944 		case IP_TTL:
945 		case IP_RECVOPTS:
946 		case IP_RECVRETOPTS:
947 		case IP_RECVDSTADDR:
948 			if (m == NULL || m->m_len != sizeof(int))
949 				error = EINVAL;
950 			else {
951 				optval = *mtod(m, int *);
952 				switch (optname) {
953 
954 				case IP_TOS:
955 					inp->inp_ip.ip_tos = optval;
956 					break;
957 
958 				case IP_TTL:
959 					inp->inp_ip.ip_ttl = optval;
960 					break;
961 #define	OPTSET(bit) \
962 	if (optval) \
963 		inp->inp_flags |= bit; \
964 	else \
965 		inp->inp_flags &= ~bit;
966 
967 				case IP_RECVOPTS:
968 					OPTSET(INP_RECVOPTS);
969 					break;
970 
971 				case IP_RECVRETOPTS:
972 					OPTSET(INP_RECVRETOPTS);
973 					break;
974 
975 				case IP_RECVDSTADDR:
976 					OPTSET(INP_RECVDSTADDR);
977 					break;
978 				}
979 			}
980 			break;
981 #undef OPTSET
982 
983 		case IP_MULTICAST_IF:
984 		case IP_MULTICAST_TTL:
985 		case IP_MULTICAST_LOOP:
986 		case IP_ADD_MEMBERSHIP:
987 		case IP_DROP_MEMBERSHIP:
988 			error = ip_setmoptions(optname, &inp->inp_moptions, m);
989 			break;
990 
991 		case IP_PORTRANGE:
992 			if (m == 0 || m->m_len != sizeof(int))
993 				error = EINVAL;
994 			else {
995 				optval = *mtod(m, int *);
996 
997 				switch (optval) {
998 
999 				case IP_PORTRANGE_DEFAULT:
1000 					inp->inp_flags &= ~(INP_LOWPORT);
1001 					inp->inp_flags &= ~(INP_HIGHPORT);
1002 					break;
1003 
1004 				case IP_PORTRANGE_HIGH:
1005 					inp->inp_flags &= ~(INP_LOWPORT);
1006 					inp->inp_flags |= INP_HIGHPORT;
1007 					break;
1008 
1009 				case IP_PORTRANGE_LOW:
1010 					inp->inp_flags &= ~(INP_HIGHPORT);
1011 					inp->inp_flags |= INP_LOWPORT;
1012 					break;
1013 
1014 				default:
1015 
1016 					error = EINVAL;
1017 					break;
1018 				}
1019 			}
1020 			break;
1021 		case IP_AUTH_LEVEL:
1022 		case IP_ESP_TRANS_LEVEL:
1023 		case IP_ESP_NETWORK_LEVEL:
1024 		case IP_IPCOMP_LEVEL:
1025 #ifndef IPSEC
1026 			error = EOPNOTSUPP;
1027 #else
1028 			if (m == 0 || m->m_len != sizeof(int)) {
1029 				error = EINVAL;
1030 				break;
1031 			}
1032 			optval = *mtod(m, int *);
1033 
1034 			if (optval < IPSEC_LEVEL_BYPASS ||
1035 			    optval > IPSEC_LEVEL_UNIQUE) {
1036 				error = EINVAL;
1037 				break;
1038 			}
1039 
1040 			/* Unlink cached output TDB to force a re-search */
1041 			if (inp->inp_tdb_out) {
1042 				int s = spltdb();
1043 				TAILQ_REMOVE(&inp->inp_tdb_out->tdb_inp_out,
1044 				    inp, inp_tdb_out_next);
1045 				splx(s);
1046 			}
1047 
1048 			if (inp->inp_tdb_in) {
1049 				int s = spltdb();
1050 				TAILQ_REMOVE(&inp->inp_tdb_in->tdb_inp_in,
1051 				    inp, inp_tdb_in_next);
1052 				splx(s);
1053 			}
1054 
1055 			switch (optname) {
1056 			case IP_AUTH_LEVEL:
1057 			        if (optval < ipsec_auth_default_level &&
1058 				    suser(p->p_ucred, &p->p_acflag)) {
1059 					error = EACCES;
1060 					break;
1061 				}
1062 				inp->inp_seclevel[SL_AUTH] = optval;
1063 				break;
1064 
1065 			case IP_ESP_TRANS_LEVEL:
1066 			        if (optval < ipsec_esp_trans_default_level &&
1067 				    suser(p->p_ucred, &p->p_acflag)) {
1068 					error = EACCES;
1069 					break;
1070 				}
1071 				inp->inp_seclevel[SL_ESP_TRANS] = optval;
1072 				break;
1073 
1074 			case IP_ESP_NETWORK_LEVEL:
1075 			        if (optval < ipsec_esp_network_default_level &&
1076 				    suser(p->p_ucred, &p->p_acflag)) {
1077 					error = EACCES;
1078 					break;
1079 				}
1080 				inp->inp_seclevel[SL_ESP_NETWORK] = optval;
1081 				break;
1082 			case IP_IPCOMP_LEVEL:
1083 			        if (optval < ipsec_ipcomp_default_level &&
1084 				    suser(p->p_ucred, &p->p_acflag)) {
1085 				        error = EACCES;
1086 					break;
1087 				}
1088 				inp->inp_seclevel[SL_IPCOMP] = optval;
1089 				break;
1090 			}
1091 			if (!error)
1092 				inp->inp_secrequire = get_sa_require(inp);
1093 #endif
1094 			break;
1095 
1096 		case IP_IPSEC_REMOTE_CRED:
1097 		case IP_IPSEC_REMOTE_AUTH:
1098 			/* Can't set the remote credential or key */
1099 			error = EOPNOTSUPP;
1100 			break;
1101 
1102 		case IP_IPSEC_LOCAL_ID:
1103 		case IP_IPSEC_REMOTE_ID:
1104 		case IP_IPSEC_LOCAL_CRED:
1105 		case IP_IPSEC_LOCAL_AUTH:
1106 #ifndef IPSEC
1107 			error = EOPNOTSUPP;
1108 #else
1109 			if (m->m_len < 2) {
1110 				error = EINVAL;
1111 				break;
1112 			}
1113 
1114 			m_copydata(m, 0, 2, (caddr_t) &opt16val);
1115 
1116 			/* If the type is 0, then we cleanup and return */
1117 			if (opt16val == 0) {
1118 				switch (optname) {
1119 				case IP_IPSEC_LOCAL_ID:
1120 					if (inp->inp_ipsec_localid != NULL)
1121 						ipsp_reffree(inp->inp_ipsec_localid);
1122 					inp->inp_ipsec_localid = NULL;
1123 					break;
1124 
1125 				case IP_IPSEC_REMOTE_ID:
1126 					if (inp->inp_ipsec_remoteid != NULL)
1127 						ipsp_reffree(inp->inp_ipsec_remoteid);
1128 					inp->inp_ipsec_remoteid = NULL;
1129 					break;
1130 
1131 				case IP_IPSEC_LOCAL_CRED:
1132 					if (inp->inp_ipsec_localcred != NULL)
1133 						ipsp_reffree(inp->inp_ipsec_localcred);
1134 					inp->inp_ipsec_localcred = NULL;
1135 					break;
1136 
1137 				case IP_IPSEC_LOCAL_AUTH:
1138 					if (inp->inp_ipsec_localauth != NULL)
1139 						ipsp_reffree(inp->inp_ipsec_localauth);
1140 					inp->inp_ipsec_localauth = NULL;
1141 					break;
1142 				}
1143 
1144 				error = 0;
1145 				break;
1146 			}
1147 
1148 			/* Can't have an empty payload */
1149 			if (m->m_len == 2) {
1150 				error = EINVAL;
1151 				break;
1152 			}
1153 
1154 			MALLOC(ipr, struct ipsec_ref *,
1155 			       sizeof(struct ipsec_ref) + m->m_len - 2,
1156 			       M_CREDENTIALS, M_NOWAIT);
1157 			if (ipr == NULL) {
1158 				error = ENOBUFS;
1159 				break;
1160 			}
1161 			ipr->ref_count = 1;
1162 			ipr->ref_malloctype = M_CREDENTIALS;
1163 			ipr->ref_len = m->m_len - 2;
1164 			ipr->ref_type = opt16val;
1165 			m_copydata(m, 2, m->m_len - 2, (caddr_t)(ipr + 1));
1166 
1167 			switch (optname) {
1168 			case IP_IPSEC_LOCAL_ID:
1169 				/* Check valid types and NUL-termination */
1170 				if (ipr->ref_type < IPSP_IDENTITY_PREFIX
1171 				    || ipr->ref_type > IPSP_IDENTITY_CONNECTION
1172 				    || ((char *)(ipr + 1))[ipr->ref_len - 1]) {
1173 					FREE(ipr, M_CREDENTIALS);
1174 					error = EINVAL;
1175 				} else {
1176 					if (inp->inp_ipsec_localid != NULL)
1177 						ipsp_reffree(inp->inp_ipsec_localid);
1178 					inp->inp_ipsec_localid = ipr;
1179 				}
1180 				break;
1181 			case IP_IPSEC_REMOTE_ID:
1182 				/* Check valid types and NUL-termination */
1183 				if (ipr->ref_type < IPSP_IDENTITY_PREFIX
1184 				    || ipr->ref_type > IPSP_IDENTITY_CONNECTION
1185 				    || ((char *)(ipr + 1))[ipr->ref_len - 1]) {
1186 					FREE(ipr, M_CREDENTIALS);
1187 					error = EINVAL;
1188 				} else {
1189 					if (inp->inp_ipsec_remoteid != NULL)
1190 						ipsp_reffree(inp->inp_ipsec_remoteid);
1191 					inp->inp_ipsec_remoteid = ipr;
1192 				}
1193 				break;
1194 			case IP_IPSEC_LOCAL_CRED:
1195 				if (ipr->ref_type < IPSP_CRED_KEYNOTE ||
1196 				    ipr->ref_type > IPSP_CRED_X509) {
1197 					FREE(ipr, M_CREDENTIALS);
1198 					error = EINVAL;
1199 				} else {
1200 					if (inp->inp_ipsec_localcred != NULL)
1201 						ipsp_reffree(inp->inp_ipsec_localcred);
1202 					inp->inp_ipsec_localcred = ipr;
1203 				}
1204 				break;
1205 			case IP_IPSEC_LOCAL_AUTH:
1206 				if (ipr->ref_type < IPSP_AUTH_PASSPHRASE ||
1207 				    ipr->ref_type > IPSP_AUTH_RSA) {
1208 					FREE(ipr, M_CREDENTIALS);
1209 					error = EINVAL;
1210 				} else {
1211 					if (inp->inp_ipsec_localauth != NULL)
1212 						ipsp_reffree(inp->inp_ipsec_localauth);
1213 					inp->inp_ipsec_localauth = ipr;
1214 				}
1215 				break;
1216 			}
1217 
1218 			/* Unlink cached output TDB to force a re-search */
1219 			if (inp->inp_tdb_out) {
1220 				int s = spltdb();
1221 				TAILQ_REMOVE(&inp->inp_tdb_out->tdb_inp_out,
1222 				    inp, inp_tdb_out_next);
1223 				splx(s);
1224 			}
1225 
1226 			if (inp->inp_tdb_in) {
1227 				int s = spltdb();
1228 				TAILQ_REMOVE(&inp->inp_tdb_in->tdb_inp_in,
1229 				    inp, inp_tdb_in_next);
1230 				splx(s);
1231 			}
1232 #endif
1233 			break;
1234 		default:
1235 			error = ENOPROTOOPT;
1236 			break;
1237 		}
1238 		if (m)
1239 			(void)m_free(m);
1240 		break;
1241 
1242 	case PRCO_GETOPT:
1243 		switch (optname) {
1244 		case IP_OPTIONS:
1245 		case IP_RETOPTS:
1246 			*mp = m = m_get(M_WAIT, MT_SOOPTS);
1247 			if (inp->inp_options) {
1248 				m->m_len = inp->inp_options->m_len;
1249 				bcopy(mtod(inp->inp_options, caddr_t),
1250 				    mtod(m, caddr_t), (unsigned)m->m_len);
1251 			} else
1252 				m->m_len = 0;
1253 			break;
1254 
1255 		case IP_TOS:
1256 		case IP_TTL:
1257 		case IP_RECVOPTS:
1258 		case IP_RECVRETOPTS:
1259 		case IP_RECVDSTADDR:
1260 			*mp = m = m_get(M_WAIT, MT_SOOPTS);
1261 			m->m_len = sizeof(int);
1262 			switch (optname) {
1263 
1264 			case IP_TOS:
1265 				optval = inp->inp_ip.ip_tos;
1266 				break;
1267 
1268 			case IP_TTL:
1269 				optval = inp->inp_ip.ip_ttl;
1270 				break;
1271 
1272 #define	OPTBIT(bit)	(inp->inp_flags & bit ? 1 : 0)
1273 
1274 			case IP_RECVOPTS:
1275 				optval = OPTBIT(INP_RECVOPTS);
1276 				break;
1277 
1278 			case IP_RECVRETOPTS:
1279 				optval = OPTBIT(INP_RECVRETOPTS);
1280 				break;
1281 
1282 			case IP_RECVDSTADDR:
1283 				optval = OPTBIT(INP_RECVDSTADDR);
1284 				break;
1285 			}
1286 			*mtod(m, int *) = optval;
1287 			break;
1288 
1289 		case IP_MULTICAST_IF:
1290 		case IP_MULTICAST_TTL:
1291 		case IP_MULTICAST_LOOP:
1292 		case IP_ADD_MEMBERSHIP:
1293 		case IP_DROP_MEMBERSHIP:
1294 			error = ip_getmoptions(optname, inp->inp_moptions, mp);
1295 			break;
1296 
1297 		case IP_PORTRANGE:
1298 			*mp = m = m_get(M_WAIT, MT_SOOPTS);
1299 			m->m_len = sizeof(int);
1300 
1301 			if (inp->inp_flags & INP_HIGHPORT)
1302 				optval = IP_PORTRANGE_HIGH;
1303 			else if (inp->inp_flags & INP_LOWPORT)
1304 				optval = IP_PORTRANGE_LOW;
1305 			else
1306 				optval = 0;
1307 
1308 			*mtod(m, int *) = optval;
1309 			break;
1310 
1311 		case IP_AUTH_LEVEL:
1312 		case IP_ESP_TRANS_LEVEL:
1313 		case IP_ESP_NETWORK_LEVEL:
1314 		case IP_IPCOMP_LEVEL:
1315 #ifndef IPSEC
1316 			m->m_len = sizeof(int);
1317 			*mtod(m, int *) = IPSEC_LEVEL_NONE;
1318 #else
1319 			m->m_len = sizeof(int);
1320 			switch (optname) {
1321 			case IP_AUTH_LEVEL:
1322 				optval = inp->inp_seclevel[SL_AUTH];
1323 				break;
1324 
1325 			case IP_ESP_TRANS_LEVEL:
1326 				optval = inp->inp_seclevel[SL_ESP_TRANS];
1327 				break;
1328 
1329 			case IP_ESP_NETWORK_LEVEL:
1330 				optval = inp->inp_seclevel[SL_ESP_NETWORK];
1331 				break;
1332 			case IP_IPCOMP_LEVEL:
1333 			        optval = inp->inp_seclevel[SL_IPCOMP];
1334 				break;
1335 			}
1336 			*mtod(m, int *) = optval;
1337 #endif
1338 			break;
1339 		case IP_IPSEC_LOCAL_ID:
1340 		case IP_IPSEC_REMOTE_ID:
1341 		case IP_IPSEC_LOCAL_CRED:
1342 		case IP_IPSEC_REMOTE_CRED:
1343 		case IP_IPSEC_LOCAL_AUTH:
1344 		case IP_IPSEC_REMOTE_AUTH:
1345 #ifndef IPSEC
1346 			error = EOPNOTSUPP;
1347 #else
1348 			*mp = m = m_get(M_WAIT, MT_SOOPTS);
1349 			m->m_len = sizeof(u_int16_t);
1350 			switch (optname) {
1351 			case IP_IPSEC_LOCAL_ID:
1352 				ipr = inp->inp_ipsec_localid;
1353 				opt16val = IPSP_IDENTITY_NONE;
1354 				break;
1355 			case IP_IPSEC_REMOTE_ID:
1356 				ipr = inp->inp_ipsec_remoteid;
1357 				opt16val = IPSP_IDENTITY_NONE;
1358 				break;
1359 			case IP_IPSEC_LOCAL_CRED:
1360 				ipr = inp->inp_ipsec_localcred;
1361 				opt16val = IPSP_CRED_NONE;
1362 				break;
1363 			case IP_IPSEC_REMOTE_CRED:
1364 				ipr = inp->inp_ipsec_remotecred;
1365 				opt16val = IPSP_CRED_NONE;
1366 				break;
1367 			case IP_IPSEC_LOCAL_AUTH:
1368 				ipr = inp->inp_ipsec_localauth;
1369 				break;
1370 			case IP_IPSEC_REMOTE_AUTH:
1371 				ipr = inp->inp_ipsec_remoteauth;
1372 				break;
1373 			}
1374 			if (ipr == NULL)
1375 				*mtod(m, u_int16_t *) = opt16val;
1376 			else {
1377 				m->m_len += ipr->ref_len;
1378 				*mtod(m, u_int16_t *) = ipr->ref_type;
1379 				m_copyback(m, sizeof(u_int16_t), ipr->ref_len,
1380 					   (caddr_t)(ipr + 1));
1381 			}
1382 #endif
1383 			break;
1384 		default:
1385 			error = ENOPROTOOPT;
1386 			break;
1387 		}
1388 		break;
1389 	}
1390 	return (error);
1391 }
1392 
1393 /*
1394  * Set up IP options in pcb for insertion in output packets.
1395  * Store in mbuf with pointer in pcbopt, adding pseudo-option
1396  * with destination address if source routed.
1397  */
1398 int
1399 #ifdef notyet
1400 ip_pcbopts(optname, pcbopt, m)
1401 	int optname;
1402 #else
1403 ip_pcbopts(pcbopt, m)
1404 #endif
1405 	struct mbuf **pcbopt;
1406 	register struct mbuf *m;
1407 {
1408 	register int cnt, optlen;
1409 	register u_char *cp;
1410 	u_char opt;
1411 
1412 	/* turn off any old options */
1413 	if (*pcbopt)
1414 		(void)m_free(*pcbopt);
1415 	*pcbopt = 0;
1416 	if (m == (struct mbuf *)0 || m->m_len == 0) {
1417 		/*
1418 		 * Only turning off any previous options.
1419 		 */
1420 		if (m)
1421 			(void)m_free(m);
1422 		return (0);
1423 	}
1424 
1425 #ifndef	vax
1426 	if (m->m_len % sizeof(int32_t))
1427 		goto bad;
1428 #endif
1429 	/*
1430 	 * IP first-hop destination address will be stored before
1431 	 * actual options; move other options back
1432 	 * and clear it when none present.
1433 	 */
1434 	if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN])
1435 		goto bad;
1436 	cnt = m->m_len;
1437 	m->m_len += sizeof(struct in_addr);
1438 	cp = mtod(m, u_char *) + sizeof(struct in_addr);
1439 	ovbcopy(mtod(m, caddr_t), (caddr_t)cp, (unsigned)cnt);
1440 	bzero(mtod(m, caddr_t), sizeof(struct in_addr));
1441 
1442 	for (; cnt > 0; cnt -= optlen, cp += optlen) {
1443 		opt = cp[IPOPT_OPTVAL];
1444 		if (opt == IPOPT_EOL)
1445 			break;
1446 		if (opt == IPOPT_NOP)
1447 			optlen = 1;
1448 		else {
1449 			if (cnt < IPOPT_OLEN + sizeof(*cp))
1450 				goto bad;
1451 			optlen = cp[IPOPT_OLEN];
1452 			if (optlen < IPOPT_OLEN  + sizeof(*cp) || optlen > cnt)
1453 				goto bad;
1454 		}
1455 		switch (opt) {
1456 
1457 		default:
1458 			break;
1459 
1460 		case IPOPT_LSRR:
1461 		case IPOPT_SSRR:
1462 			/*
1463 			 * user process specifies route as:
1464 			 *	->A->B->C->D
1465 			 * D must be our final destination (but we can't
1466 			 * check that since we may not have connected yet).
1467 			 * A is first hop destination, which doesn't appear in
1468 			 * actual IP option, but is stored before the options.
1469 			 */
1470 			if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr))
1471 				goto bad;
1472 			m->m_len -= sizeof(struct in_addr);
1473 			cnt -= sizeof(struct in_addr);
1474 			optlen -= sizeof(struct in_addr);
1475 			cp[IPOPT_OLEN] = optlen;
1476 			/*
1477 			 * Move first hop before start of options.
1478 			 */
1479 			bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t),
1480 			    sizeof(struct in_addr));
1481 			/*
1482 			 * Then copy rest of options back
1483 			 * to close up the deleted entry.
1484 			 */
1485 			ovbcopy((caddr_t)(&cp[IPOPT_OFFSET+1] +
1486 			    sizeof(struct in_addr)),
1487 			    (caddr_t)&cp[IPOPT_OFFSET+1],
1488 			    (unsigned)cnt + sizeof(struct in_addr));
1489 			break;
1490 		}
1491 	}
1492 	if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr))
1493 		goto bad;
1494 	*pcbopt = m;
1495 	return (0);
1496 
1497 bad:
1498 	(void)m_free(m);
1499 	return (EINVAL);
1500 }
1501 
1502 /*
1503  * Set the IP multicast options in response to user setsockopt().
1504  */
1505 int
1506 ip_setmoptions(optname, imop, m)
1507 	int optname;
1508 	struct ip_moptions **imop;
1509 	struct mbuf *m;
1510 {
1511 	register int error = 0;
1512 	u_char loop;
1513 	register int i;
1514 	struct in_addr addr;
1515 	register struct ip_mreq *mreq;
1516 	register struct ifnet *ifp;
1517 	register struct ip_moptions *imo = *imop;
1518 	struct route ro;
1519 	register struct sockaddr_in *dst;
1520 
1521 	if (imo == NULL) {
1522 		/*
1523 		 * No multicast option buffer attached to the pcb;
1524 		 * allocate one and initialize to default values.
1525 		 */
1526 		imo = (struct ip_moptions *)malloc(sizeof(*imo), M_IPMOPTS,
1527 		    M_WAITOK);
1528 
1529 		*imop = imo;
1530 		imo->imo_multicast_ifp = NULL;
1531 		imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1532 		imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
1533 		imo->imo_num_memberships = 0;
1534 	}
1535 
1536 	switch (optname) {
1537 
1538 	case IP_MULTICAST_IF:
1539 		/*
1540 		 * Select the interface for outgoing multicast packets.
1541 		 */
1542 		if (m == NULL || m->m_len != sizeof(struct in_addr)) {
1543 			error = EINVAL;
1544 			break;
1545 		}
1546 		addr = *(mtod(m, struct in_addr *));
1547 		/*
1548 		 * INADDR_ANY is used to remove a previous selection.
1549 		 * When no interface is selected, a default one is
1550 		 * chosen every time a multicast packet is sent.
1551 		 */
1552 		if (addr.s_addr == INADDR_ANY) {
1553 			imo->imo_multicast_ifp = NULL;
1554 			break;
1555 		}
1556 		/*
1557 		 * The selected interface is identified by its local
1558 		 * IP address.  Find the interface and confirm that
1559 		 * it supports multicasting.
1560 		 */
1561 		INADDR_TO_IFP(addr, ifp);
1562 		if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
1563 			error = EADDRNOTAVAIL;
1564 			break;
1565 		}
1566 		imo->imo_multicast_ifp = ifp;
1567 		break;
1568 
1569 	case IP_MULTICAST_TTL:
1570 		/*
1571 		 * Set the IP time-to-live for outgoing multicast packets.
1572 		 */
1573 		if (m == NULL || m->m_len != 1) {
1574 			error = EINVAL;
1575 			break;
1576 		}
1577 		imo->imo_multicast_ttl = *(mtod(m, u_char *));
1578 		break;
1579 
1580 	case IP_MULTICAST_LOOP:
1581 		/*
1582 		 * Set the loopback flag for outgoing multicast packets.
1583 		 * Must be zero or one.
1584 		 */
1585 		if (m == NULL || m->m_len != 1 ||
1586 		   (loop = *(mtod(m, u_char *))) > 1) {
1587 			error = EINVAL;
1588 			break;
1589 		}
1590 		imo->imo_multicast_loop = loop;
1591 		break;
1592 
1593 	case IP_ADD_MEMBERSHIP:
1594 		/*
1595 		 * Add a multicast group membership.
1596 		 * Group must be a valid IP multicast address.
1597 		 */
1598 		if (m == NULL || m->m_len != sizeof(struct ip_mreq)) {
1599 			error = EINVAL;
1600 			break;
1601 		}
1602 		mreq = mtod(m, struct ip_mreq *);
1603 		if (!IN_MULTICAST(mreq->imr_multiaddr.s_addr)) {
1604 			error = EINVAL;
1605 			break;
1606 		}
1607 		/*
1608 		 * If no interface address was provided, use the interface of
1609 		 * the route to the given multicast address.
1610 		 */
1611 		if (mreq->imr_interface.s_addr == INADDR_ANY) {
1612 			ro.ro_rt = NULL;
1613 			dst = satosin(&ro.ro_dst);
1614 			dst->sin_len = sizeof(*dst);
1615 			dst->sin_family = AF_INET;
1616 			dst->sin_addr = mreq->imr_multiaddr;
1617 			rtalloc(&ro);
1618 			if (ro.ro_rt == NULL) {
1619 				error = EADDRNOTAVAIL;
1620 				break;
1621 			}
1622 			ifp = ro.ro_rt->rt_ifp;
1623 			rtfree(ro.ro_rt);
1624 		} else {
1625 			INADDR_TO_IFP(mreq->imr_interface, ifp);
1626 		}
1627 		/*
1628 		 * See if we found an interface, and confirm that it
1629 		 * supports multicast.
1630 		 */
1631 		if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
1632 			error = EADDRNOTAVAIL;
1633 			break;
1634 		}
1635 		/*
1636 		 * See if the membership already exists or if all the
1637 		 * membership slots are full.
1638 		 */
1639 		for (i = 0; i < imo->imo_num_memberships; ++i) {
1640 			if (imo->imo_membership[i]->inm_ifp == ifp &&
1641 			    imo->imo_membership[i]->inm_addr.s_addr
1642 						== mreq->imr_multiaddr.s_addr)
1643 				break;
1644 		}
1645 		if (i < imo->imo_num_memberships) {
1646 			error = EADDRINUSE;
1647 			break;
1648 		}
1649 		if (i == IP_MAX_MEMBERSHIPS) {
1650 			error = ETOOMANYREFS;
1651 			break;
1652 		}
1653 		/*
1654 		 * Everything looks good; add a new record to the multicast
1655 		 * address list for the given interface.
1656 		 */
1657 		if ((imo->imo_membership[i] =
1658 		    in_addmulti(&mreq->imr_multiaddr, ifp)) == NULL) {
1659 			error = ENOBUFS;
1660 			break;
1661 		}
1662 		++imo->imo_num_memberships;
1663 		break;
1664 
1665 	case IP_DROP_MEMBERSHIP:
1666 		/*
1667 		 * Drop a multicast group membership.
1668 		 * Group must be a valid IP multicast address.
1669 		 */
1670 		if (m == NULL || m->m_len != sizeof(struct ip_mreq)) {
1671 			error = EINVAL;
1672 			break;
1673 		}
1674 		mreq = mtod(m, struct ip_mreq *);
1675 		if (!IN_MULTICAST(mreq->imr_multiaddr.s_addr)) {
1676 			error = EINVAL;
1677 			break;
1678 		}
1679 		/*
1680 		 * If an interface address was specified, get a pointer
1681 		 * to its ifnet structure.
1682 		 */
1683 		if (mreq->imr_interface.s_addr == INADDR_ANY)
1684 			ifp = NULL;
1685 		else {
1686 			INADDR_TO_IFP(mreq->imr_interface, ifp);
1687 			if (ifp == NULL) {
1688 				error = EADDRNOTAVAIL;
1689 				break;
1690 			}
1691 		}
1692 		/*
1693 		 * Find the membership in the membership array.
1694 		 */
1695 		for (i = 0; i < imo->imo_num_memberships; ++i) {
1696 			if ((ifp == NULL ||
1697 			     imo->imo_membership[i]->inm_ifp == ifp) &&
1698 			     imo->imo_membership[i]->inm_addr.s_addr ==
1699 			     mreq->imr_multiaddr.s_addr)
1700 				break;
1701 		}
1702 		if (i == imo->imo_num_memberships) {
1703 			error = EADDRNOTAVAIL;
1704 			break;
1705 		}
1706 		/*
1707 		 * Give up the multicast address record to which the
1708 		 * membership points.
1709 		 */
1710 		in_delmulti(imo->imo_membership[i]);
1711 		/*
1712 		 * Remove the gap in the membership array.
1713 		 */
1714 		for (++i; i < imo->imo_num_memberships; ++i)
1715 			imo->imo_membership[i-1] = imo->imo_membership[i];
1716 		--imo->imo_num_memberships;
1717 		break;
1718 
1719 	default:
1720 		error = EOPNOTSUPP;
1721 		break;
1722 	}
1723 
1724 	/*
1725 	 * If all options have default values, no need to keep the mbuf.
1726 	 */
1727 	if (imo->imo_multicast_ifp == NULL &&
1728 	    imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL &&
1729 	    imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP &&
1730 	    imo->imo_num_memberships == 0) {
1731 		free(*imop, M_IPMOPTS);
1732 		*imop = NULL;
1733 	}
1734 
1735 	return (error);
1736 }
1737 
1738 /*
1739  * Return the IP multicast options in response to user getsockopt().
1740  */
1741 int
1742 ip_getmoptions(optname, imo, mp)
1743 	int optname;
1744 	register struct ip_moptions *imo;
1745 	register struct mbuf **mp;
1746 {
1747 	u_char *ttl;
1748 	u_char *loop;
1749 	struct in_addr *addr;
1750 	struct in_ifaddr *ia;
1751 
1752 	*mp = m_get(M_WAIT, MT_SOOPTS);
1753 
1754 	switch (optname) {
1755 
1756 	case IP_MULTICAST_IF:
1757 		addr = mtod(*mp, struct in_addr *);
1758 		(*mp)->m_len = sizeof(struct in_addr);
1759 		if (imo == NULL || imo->imo_multicast_ifp == NULL)
1760 			addr->s_addr = INADDR_ANY;
1761 		else {
1762 			IFP_TO_IA(imo->imo_multicast_ifp, ia);
1763 			addr->s_addr = (ia == NULL) ? INADDR_ANY
1764 					: ia->ia_addr.sin_addr.s_addr;
1765 		}
1766 		return (0);
1767 
1768 	case IP_MULTICAST_TTL:
1769 		ttl = mtod(*mp, u_char *);
1770 		(*mp)->m_len = 1;
1771 		*ttl = (imo == NULL) ? IP_DEFAULT_MULTICAST_TTL
1772 				     : imo->imo_multicast_ttl;
1773 		return (0);
1774 
1775 	case IP_MULTICAST_LOOP:
1776 		loop = mtod(*mp, u_char *);
1777 		(*mp)->m_len = 1;
1778 		*loop = (imo == NULL) ? IP_DEFAULT_MULTICAST_LOOP
1779 				      : imo->imo_multicast_loop;
1780 		return (0);
1781 
1782 	default:
1783 		return (EOPNOTSUPP);
1784 	}
1785 }
1786 
1787 /*
1788  * Discard the IP multicast options.
1789  */
1790 void
1791 ip_freemoptions(imo)
1792 	register struct ip_moptions *imo;
1793 {
1794 	register int i;
1795 
1796 	if (imo != NULL) {
1797 		for (i = 0; i < imo->imo_num_memberships; ++i)
1798 			in_delmulti(imo->imo_membership[i]);
1799 		free(imo, M_IPMOPTS);
1800 	}
1801 }
1802 
1803 /*
1804  * Routine called from ip_output() to loop back a copy of an IP multicast
1805  * packet to the input queue of a specified interface.  Note that this
1806  * calls the output routine of the loopback "driver", but with an interface
1807  * pointer that might NOT be &loif -- easier than replicating that code here.
1808  */
1809 static void
1810 ip_mloopback(ifp, m, dst)
1811 	struct ifnet *ifp;
1812 	register struct mbuf *m;
1813 	register struct sockaddr_in *dst;
1814 {
1815 	register struct ip *ip;
1816 	struct mbuf *copym;
1817 
1818 	copym = m_copym2(m, 0, M_COPYALL, M_DONTWAIT);
1819 	if (copym != NULL) {
1820 		/*
1821 		 * We don't bother to fragment if the IP length is greater
1822 		 * than the interface's MTU.  Can this possibly matter?
1823 		 */
1824 		ip = mtod(copym, struct ip *);
1825 		ip->ip_len = htons((u_int16_t)ip->ip_len);
1826 		ip->ip_off = htons((u_int16_t)ip->ip_off);
1827 		ip->ip_sum = 0;
1828 		ip->ip_sum = in_cksum(copym, ip->ip_hl << 2);
1829 		(void) looutput(ifp, copym, sintosa(dst), NULL);
1830 	}
1831 }
1832 
1833 /*
1834  * Process a delayed payload checksum calculation.
1835  */
1836 void
1837 in_delayed_cksum(struct mbuf *m)
1838 {
1839 	struct ip *ip;
1840 	u_int16_t csum, offset;
1841 
1842 	ip = mtod(m, struct ip *);
1843 	offset = ip->ip_hl << 2;
1844 	csum = in4_cksum(m, 0, offset, m->m_pkthdr.len - offset);
1845 	if (csum == 0 && ip->ip_p == IPPROTO_UDP)
1846 		csum = 0xffff;
1847 
1848 	switch (ip->ip_p) {
1849 	case IPPROTO_TCP:
1850 		offset += offsetof(struct tcphdr, th_sum);
1851 		break;
1852 
1853 	case IPPROTO_UDP:
1854 		offset += offsetof(struct udphdr, uh_sum);
1855 		break;
1856 
1857 	default:
1858 		return;
1859 	}
1860 
1861 	if ((offset + sizeof(u_int16_t)) > m->m_len)
1862 		m_copyback(m, offset, sizeof(csum), (caddr_t) &csum);
1863 	else
1864 		*(u_int16_t *)(mtod(m, caddr_t) + offset) = csum;
1865 }
1866