xref: /openbsd-src/sys/net/rtsock.c (revision f6aab3d83b51b91c24247ad2c2573574de475a82)
1 /*	$OpenBSD: rtsock.c,v 1.370 2023/09/16 09:33:27 mpi Exp $	*/
2 /*	$NetBSD: rtsock.c,v 1.18 1996/03/29 00:32:10 cgd Exp $	*/
3 
4 /*
5  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the project nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 /*
34  * Copyright (c) 1988, 1991, 1993
35  *	The Regents of the University of California.  All rights reserved.
36  *
37  * Redistribution and use in source and binary forms, with or without
38  * modification, are permitted provided that the following conditions
39  * are met:
40  * 1. Redistributions of source code must retain the above copyright
41  *    notice, this list of conditions and the following disclaimer.
42  * 2. Redistributions in binary form must reproduce the above copyright
43  *    notice, this list of conditions and the following disclaimer in the
44  *    documentation and/or other materials provided with the distribution.
45  * 3. Neither the name of the University nor the names of its contributors
46  *    may be used to endorse or promote products derived from this software
47  *    without specific prior written permission.
48  *
49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59  * SUCH DAMAGE.
60  *
61  *	@(#)rtsock.c	8.6 (Berkeley) 2/11/95
62  */
63 
64 #include <sys/param.h>
65 #include <sys/systm.h>
66 #include <sys/proc.h>
67 #include <sys/sysctl.h>
68 #include <sys/mbuf.h>
69 #include <sys/socket.h>
70 #include <sys/socketvar.h>
71 #include <sys/domain.h>
72 #include <sys/pool.h>
73 #include <sys/protosw.h>
74 #include <sys/srp.h>
75 
76 #include <net/if.h>
77 #include <net/if_dl.h>
78 #include <net/if_var.h>
79 #include <net/route.h>
80 
81 #include <netinet/in.h>
82 
83 #ifdef MPLS
84 #include <netmpls/mpls.h>
85 #endif
86 #ifdef IPSEC
87 #include <netinet/ip_ipsp.h>
88 #include <net/if_enc.h>
89 #endif
90 #ifdef BFD
91 #include <net/bfd.h>
92 #endif
93 
94 #include <sys/stdarg.h>
95 #include <sys/kernel.h>
96 #include <sys/timeout.h>
97 
/* Send/receive buffer space reserved for each routing socket on attach. */
#define	ROUTESNDQ	8192
#define	ROUTERCVQ	8192

/*
 * Source address attached to every message queued on a routing socket;
 * also handed out as the (fake) peer address.
 */
const struct sockaddr route_src = { 2, PF_ROUTE };
102 
103 struct walkarg {
104 	int	w_op, w_arg, w_tmemsize;
105 	size_t	w_given, w_needed;
106 	caddr_t	w_where, w_tmem;
107 };
108 
109 void	route_prinit(void);
110 void	rcb_ref(void *, void *);
111 void	rcb_unref(void *, void *);
112 int	route_output(struct mbuf *, struct socket *);
113 int	route_ctloutput(int, struct socket *, int, int, struct mbuf *);
114 int	route_attach(struct socket *, int, int);
115 int	route_detach(struct socket *);
116 int	route_disconnect(struct socket *);
117 int	route_shutdown(struct socket *);
118 void	route_rcvd(struct socket *);
119 int	route_send(struct socket *, struct mbuf *, struct mbuf *,
120 	    struct mbuf *);
121 int	route_sockaddr(struct socket *, struct mbuf *);
122 int	route_peeraddr(struct socket *, struct mbuf *);
123 void	route_input(struct mbuf *m0, struct socket *, sa_family_t);
124 int	route_arp_conflict(struct rtentry *, struct rt_addrinfo *);
125 int	route_cleargateway(struct rtentry *, void *, unsigned int);
126 void	rtm_senddesync_timer(void *);
127 void	rtm_senddesync(struct socket *);
128 int	rtm_sendup(struct socket *, struct mbuf *);
129 
130 int	rtm_getifa(struct rt_addrinfo *, unsigned int);
131 int	rtm_output(struct rt_msghdr *, struct rtentry **, struct rt_addrinfo *,
132 	    uint8_t, unsigned int);
133 struct rt_msghdr *rtm_report(struct rtentry *, u_char, int, int);
134 struct mbuf	*rtm_msg1(int, struct rt_addrinfo *);
135 int		 rtm_msg2(int, int, struct rt_addrinfo *, caddr_t,
136 		     struct walkarg *);
137 int		 rtm_xaddrs(caddr_t, caddr_t, struct rt_addrinfo *);
138 int		 rtm_validate_proposal(struct rt_addrinfo *);
139 void		 rtm_setmetrics(u_long, const struct rt_metrics *,
140 		     struct rt_kmetrics *);
141 void		 rtm_getmetrics(const struct rtentry *,
142 		     struct rt_metrics *);
143 
144 int		 sysctl_iflist(int, struct walkarg *);
145 int		 sysctl_ifnames(struct walkarg *);
146 int		 sysctl_rtable_rtstat(void *, size_t *, void *);
147 
148 int		 rt_setsource(unsigned int, struct sockaddr *);
149 
150 /*
151  * Locks used to protect struct members
152  *       I       immutable after creation
153  *       s       solock
154  */
155 struct rtpcb {
156 	struct socket		*rop_socket;		/* [I] */
157 
158 	SRPL_ENTRY(rtpcb)	rop_list;
159 	struct refcnt		rop_refcnt;
160 	struct timeout		rop_timeout;
161 	unsigned int		rop_msgfilter;		/* [s] */
162 	unsigned int		rop_flagfilter;		/* [s] */
163 	unsigned int		rop_flags;		/* [s] */
164 	u_int			rop_rtableid;		/* [s] */
165 	unsigned short		rop_proto;		/* [I] */
166 	u_char			rop_priority;		/* [s] */
167 };
168 #define	sotortpcb(so)	((struct rtpcb *)(so)->so_pcb)
169 
170 struct rtptable {
171 	SRPL_HEAD(, rtpcb)	rtp_list;
172 	struct srpl_rc		rtp_rc;
173 	struct rwlock		rtp_lk;
174 	unsigned int		rtp_count;
175 };
176 
177 struct pool rtpcb_pool;
178 struct rtptable rtptable;
179 
/*
 * Overflow handling state kept in rtpcb.rop_flags.  When a routing
 * socket overflows and messages are lost, userland is told so with a
 * RTM_DESYNC message; if that message cannot be queued either, it is
 * retried from a timeout.
 */

/* Route socket ran out of memory, RTM_DESYNC still has to be sent. */
#define ROUTECB_FLAG_DESYNC	0x1
/* Do not queue more packets until the socket has been emptied. */
#define ROUTECB_FLAG_FLUSH	0x2

/* Delay between attempts to queue the RTM_DESYNC message, in ms. */
#define ROUTE_DESYNC_RESEND_TIMEOUT	200
190 
191 void
192 route_prinit(void)
193 {
194 	srpl_rc_init(&rtptable.rtp_rc, rcb_ref, rcb_unref, NULL);
195 	rw_init(&rtptable.rtp_lk, "rtsock");
196 	SRPL_INIT(&rtptable.rtp_list);
197 	pool_init(&rtpcb_pool, sizeof(struct rtpcb), 0,
198 	    IPL_SOFTNET, PR_WAITOK, "rtpcb", NULL);
199 }
200 
201 void
202 rcb_ref(void *null, void *v)
203 {
204 	struct rtpcb *rop = v;
205 
206 	refcnt_take(&rop->rop_refcnt);
207 }
208 
209 void
210 rcb_unref(void *null, void *v)
211 {
212 	struct rtpcb *rop = v;
213 
214 	refcnt_rele_wake(&rop->rop_refcnt);
215 }
216 
217 int
218 route_attach(struct socket *so, int proto, int wait)
219 {
220 	struct rtpcb	*rop;
221 	int		 error;
222 
223 	error = soreserve(so, ROUTESNDQ, ROUTERCVQ);
224 	if (error)
225 		return (error);
226 	/*
227 	 * use the rawcb but allocate a rtpcb, this
228 	 * code does not care about the additional fields
229 	 * and works directly on the raw socket.
230 	 */
231 	rop = pool_get(&rtpcb_pool, (wait == M_WAIT ? PR_WAITOK : PR_NOWAIT) |
232 	    PR_ZERO);
233 	if (rop == NULL)
234 		return (ENOBUFS);
235 	so->so_pcb = rop;
236 	/* Init the timeout structure */
237 	timeout_set_proc(&rop->rop_timeout, rtm_senddesync_timer, so);
238 	refcnt_init(&rop->rop_refcnt);
239 
240 	rop->rop_socket = so;
241 	rop->rop_proto = proto;
242 
243 	rop->rop_rtableid = curproc->p_p->ps_rtableid;
244 
245 	soisconnected(so);
246 	so->so_options |= SO_USELOOPBACK;
247 
248 	rw_enter(&rtptable.rtp_lk, RW_WRITE);
249 	SRPL_INSERT_HEAD_LOCKED(&rtptable.rtp_rc, &rtptable.rtp_list, rop,
250 	    rop_list);
251 	rtptable.rtp_count++;
252 	rw_exit(&rtptable.rtp_lk);
253 
254 	return (0);
255 }
256 
257 int
258 route_detach(struct socket *so)
259 {
260 	struct rtpcb	*rop;
261 
262 	soassertlocked(so);
263 
264 	rop = sotortpcb(so);
265 	if (rop == NULL)
266 		return (EINVAL);
267 
268 	rw_enter(&rtptable.rtp_lk, RW_WRITE);
269 
270 	rtptable.rtp_count--;
271 	SRPL_REMOVE_LOCKED(&rtptable.rtp_rc, &rtptable.rtp_list, rop, rtpcb,
272 	    rop_list);
273 	rw_exit(&rtptable.rtp_lk);
274 
275 	sounlock(so);
276 
277 	/* wait for all references to drop */
278 	refcnt_finalize(&rop->rop_refcnt, "rtsockrefs");
279 	timeout_del_barrier(&rop->rop_timeout);
280 
281 	solock(so);
282 
283 	so->so_pcb = NULL;
284 	KASSERT((so->so_state & SS_NOFDREF) == 0);
285 	pool_put(&rtpcb_pool, rop);
286 
287 	return (0);
288 }
289 
/*
 * Disconnect request: mark the socket as disconnected.
 * Always returns 0.
 */
int
route_disconnect(struct socket *so)
{
	soisdisconnected(so);

	return (0);
}
296 
/*
 * Shutdown request: mark the socket as unable to send any more.
 * Always returns 0.
 */
int
route_shutdown(struct socket *so)
{
	socantsendmore(so);

	return (0);
}
303 
304 void
305 route_rcvd(struct socket *so)
306 {
307 	struct rtpcb *rop = sotortpcb(so);
308 
309 	soassertlocked(so);
310 
311 	/*
312 	 * If we are in a FLUSH state, check if the buffer is
313 	 * empty so that we can clear the flag.
314 	 */
315 	if (((rop->rop_flags & ROUTECB_FLAG_FLUSH) != 0) &&
316 	    ((sbspace(rop->rop_socket, &rop->rop_socket->so_rcv) ==
317 	    rop->rop_socket->so_rcv.sb_hiwat)))
318 		rop->rop_flags &= ~ROUTECB_FLAG_FLUSH;
319 }
320 
321 int
322 route_send(struct socket *so, struct mbuf *m, struct mbuf *nam,
323     struct mbuf *control)
324 {
325 	int error;
326 
327 	soassertlocked(so);
328 
329 	if (control && control->m_len) {
330 		error = EOPNOTSUPP;
331 		goto out;
332 	}
333 
334 	if (nam) {
335 		error = EISCONN;
336 		goto out;
337 	}
338 
339 	error = route_output(m, so);
340 	m = NULL;
341 
342 out:
343 	m_freem(control);
344 	m_freem(m);
345 
346 	return (error);
347 }
348 
349 int
350 route_sockaddr(struct socket *so, struct mbuf *nam)
351 {
352 	return (EINVAL);
353 }
354 
355 int
356 route_peeraddr(struct socket *so, struct mbuf *nam)
357 {
358 	/* minimal support, just implement a fake peer address */
359 	bcopy(&route_src, mtod(nam, caddr_t), route_src.sa_len);
360 	nam->m_len = route_src.sa_len;
361 	return (0);
362 }
363 
364 int
365 route_ctloutput(int op, struct socket *so, int level, int optname,
366     struct mbuf *m)
367 {
368 	struct rtpcb *rop = sotortpcb(so);
369 	int error = 0;
370 	unsigned int tid, prio;
371 
372 	if (level != AF_ROUTE)
373 		return (EINVAL);
374 
375 	switch (op) {
376 	case PRCO_SETOPT:
377 		switch (optname) {
378 		case ROUTE_MSGFILTER:
379 			if (m == NULL || m->m_len != sizeof(unsigned int))
380 				error = EINVAL;
381 			else
382 				rop->rop_msgfilter = *mtod(m, unsigned int *);
383 			break;
384 		case ROUTE_TABLEFILTER:
385 			if (m == NULL || m->m_len != sizeof(unsigned int)) {
386 				error = EINVAL;
387 				break;
388 			}
389 			tid = *mtod(m, unsigned int *);
390 			if (tid != RTABLE_ANY && !rtable_exists(tid))
391 				error = ENOENT;
392 			else
393 				rop->rop_rtableid = tid;
394 			break;
395 		case ROUTE_PRIOFILTER:
396 			if (m == NULL || m->m_len != sizeof(unsigned int)) {
397 				error = EINVAL;
398 				break;
399 			}
400 			prio = *mtod(m, unsigned int *);
401 			if (prio > RTP_MAX)
402 				error = EINVAL;
403 			else
404 				rop->rop_priority = prio;
405 			break;
406 		case ROUTE_FLAGFILTER:
407 			if (m == NULL || m->m_len != sizeof(unsigned int))
408 				error = EINVAL;
409 			else
410 				rop->rop_flagfilter = *mtod(m, unsigned int *);
411 			break;
412 		default:
413 			error = ENOPROTOOPT;
414 			break;
415 		}
416 		break;
417 	case PRCO_GETOPT:
418 		switch (optname) {
419 		case ROUTE_MSGFILTER:
420 			m->m_len = sizeof(unsigned int);
421 			*mtod(m, unsigned int *) = rop->rop_msgfilter;
422 			break;
423 		case ROUTE_TABLEFILTER:
424 			m->m_len = sizeof(unsigned int);
425 			*mtod(m, unsigned int *) = rop->rop_rtableid;
426 			break;
427 		case ROUTE_PRIOFILTER:
428 			m->m_len = sizeof(unsigned int);
429 			*mtod(m, unsigned int *) = rop->rop_priority;
430 			break;
431 		case ROUTE_FLAGFILTER:
432 			m->m_len = sizeof(unsigned int);
433 			*mtod(m, unsigned int *) = rop->rop_flagfilter;
434 			break;
435 		default:
436 			error = ENOPROTOOPT;
437 			break;
438 		}
439 	}
440 	return (error);
441 }
442 
/*
 * Timeout handler armed by rtm_senddesync(): retry queueing the
 * RTM_DESYNC message with the socket locked.  `xso' is the socket
 * registered with the timeout in route_attach().
 */
void
rtm_senddesync_timer(void *xso)
{
	struct socket	*so = (struct socket *)xso;

	solock(so);
	rtm_senddesync(so);
	sounlock(so);
}
452 
453 void
454 rtm_senddesync(struct socket *so)
455 {
456 	struct rtpcb	*rop = sotortpcb(so);
457 	struct mbuf	*desync_mbuf;
458 
459 	soassertlocked(so);
460 
461 	/*
462 	 * Dying socket is disconnected by upper layer and there is
463 	 * no reason to send packet. Also we shouldn't reschedule
464 	 * timeout(9), otherwise timeout_del_barrier(9) can't help us.
465 	 */
466 	if ((so->so_state & SS_ISCONNECTED) == 0 ||
467 	    (so->so_rcv.sb_state & SS_CANTRCVMORE))
468 		return;
469 
470 	/* If we are in a DESYNC state, try to send a RTM_DESYNC packet */
471 	if ((rop->rop_flags & ROUTECB_FLAG_DESYNC) == 0)
472 		return;
473 
474 	/*
475 	 * If we fail to alloc memory or if sbappendaddr()
476 	 * fails, re-add timeout and try again.
477 	 */
478 	desync_mbuf = rtm_msg1(RTM_DESYNC, NULL);
479 	if (desync_mbuf != NULL) {
480 		if (sbappendaddr(so, &so->so_rcv, &route_src,
481 		    desync_mbuf, NULL) != 0) {
482 			rop->rop_flags &= ~ROUTECB_FLAG_DESYNC;
483 			sorwakeup(rop->rop_socket);
484 			return;
485 		}
486 		m_freem(desync_mbuf);
487 	}
488 	/* Re-add timeout to try sending msg again */
489 	timeout_add_msec(&rop->rop_timeout, ROUTE_DESYNC_RESEND_TIMEOUT);
490 }
491 
492 void
493 route_input(struct mbuf *m0, struct socket *so0, sa_family_t sa_family)
494 {
495 	struct socket *so;
496 	struct rtpcb *rop;
497 	struct rt_msghdr *rtm;
498 	struct mbuf *m = m0;
499 	struct srp_ref sr;
500 
501 	/* ensure that we can access the rtm_type via mtod() */
502 	if (m->m_len < offsetof(struct rt_msghdr, rtm_type) + 1) {
503 		m_freem(m);
504 		return;
505 	}
506 
507 	SRPL_FOREACH(rop, &sr, &rtptable.rtp_list, rop_list) {
508 		/*
509 		 * If route socket is bound to an address family only send
510 		 * messages that match the address family. Address family
511 		 * agnostic messages are always sent.
512 		 */
513 		if (sa_family != AF_UNSPEC && rop->rop_proto != AF_UNSPEC &&
514 		    rop->rop_proto != sa_family)
515 			continue;
516 
517 
518 		so = rop->rop_socket;
519 		solock(so);
520 
521 		/*
522 		 * Check to see if we don't want our own messages and
523 		 * if we can receive anything.
524 		 */
525 		if ((so0 == so && !(so0->so_options & SO_USELOOPBACK)) ||
526 		    !(so->so_state & SS_ISCONNECTED) ||
527 		    (so->so_rcv.sb_state & SS_CANTRCVMORE))
528 			goto next;
529 
530 		/* filter messages that the process does not want */
531 		rtm = mtod(m, struct rt_msghdr *);
532 		/* but RTM_DESYNC can't be filtered */
533 		if (rtm->rtm_type != RTM_DESYNC) {
534 			if (rop->rop_msgfilter != 0 &&
535 			    !(rop->rop_msgfilter & (1U << rtm->rtm_type)))
536 				goto next;
537 			if (ISSET(rop->rop_flagfilter, rtm->rtm_flags))
538 				goto next;
539 		}
540 		switch (rtm->rtm_type) {
541 		case RTM_IFANNOUNCE:
542 		case RTM_DESYNC:
543 			/* no tableid */
544 			break;
545 		case RTM_RESOLVE:
546 		case RTM_NEWADDR:
547 		case RTM_DELADDR:
548 		case RTM_IFINFO:
549 		case RTM_80211INFO:
550 		case RTM_BFD:
551 			/* check against rdomain id */
552 			if (rop->rop_rtableid != RTABLE_ANY &&
553 			    rtable_l2(rop->rop_rtableid) != rtm->rtm_tableid)
554 				goto next;
555 			break;
556 		default:
557 			if (rop->rop_priority != 0 &&
558 			    rop->rop_priority < rtm->rtm_priority)
559 				goto next;
560 			/* check against rtable id */
561 			if (rop->rop_rtableid != RTABLE_ANY &&
562 			    rop->rop_rtableid != rtm->rtm_tableid)
563 				goto next;
564 			break;
565 		}
566 
567 		/*
568 		 * Check to see if the flush flag is set. If so, don't queue
569 		 * any more messages until the flag is cleared.
570 		 */
571 		if ((rop->rop_flags & ROUTECB_FLAG_FLUSH) != 0)
572 			goto next;
573 
574 		rtm_sendup(so, m);
575 next:
576 		sounlock(so);
577 	}
578 	SRPL_LEAVE(&sr);
579 
580 	m_freem(m);
581 }
582 
583 int
584 rtm_sendup(struct socket *so, struct mbuf *m0)
585 {
586 	struct rtpcb *rop = sotortpcb(so);
587 	struct mbuf *m;
588 
589 	soassertlocked(so);
590 
591 	m = m_copym(m0, 0, M_COPYALL, M_NOWAIT);
592 	if (m == NULL)
593 		return (ENOMEM);
594 
595 	if (sbspace(so, &so->so_rcv) < (2 * MSIZE) ||
596 	    sbappendaddr(so, &so->so_rcv, &route_src, m, NULL) == 0) {
597 		/* Flag socket as desync'ed and flush required */
598 		rop->rop_flags |= ROUTECB_FLAG_DESYNC | ROUTECB_FLAG_FLUSH;
599 		rtm_senddesync(so);
600 		m_freem(m);
601 		return (ENOBUFS);
602 	}
603 
604 	sorwakeup(so);
605 	return (0);
606 }
607 
608 struct rt_msghdr *
609 rtm_report(struct rtentry *rt, u_char type, int seq, int tableid)
610 {
611 	struct rt_msghdr	*rtm;
612 	struct rt_addrinfo	 info;
613 	struct sockaddr_rtlabel	 sa_rl;
614 	struct sockaddr_in6	 sa_mask;
615 #ifdef BFD
616 	struct sockaddr_bfd	 sa_bfd;
617 #endif
618 	struct ifnet		*ifp = NULL;
619 	int			 len;
620 
621 	bzero(&info, sizeof(info));
622 	info.rti_info[RTAX_DST] = rt_key(rt);
623 	info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
624 	info.rti_info[RTAX_NETMASK] = rt_plen2mask(rt, &sa_mask);
625 	info.rti_info[RTAX_LABEL] = rtlabel_id2sa(rt->rt_labelid, &sa_rl);
626 #ifdef BFD
627 	if (rt->rt_flags & RTF_BFD) {
628 		KERNEL_LOCK();
629 		info.rti_info[RTAX_BFD] = bfd2sa(rt, &sa_bfd);
630 		KERNEL_UNLOCK();
631 	}
632 #endif
633 #ifdef MPLS
634 	if (rt->rt_flags & RTF_MPLS) {
635 		struct sockaddr_mpls	 sa_mpls;
636 
637 		bzero(&sa_mpls, sizeof(sa_mpls));
638 		sa_mpls.smpls_family = AF_MPLS;
639 		sa_mpls.smpls_len = sizeof(sa_mpls);
640 		sa_mpls.smpls_label = ((struct rt_mpls *)
641 		    rt->rt_llinfo)->mpls_label;
642 		info.rti_info[RTAX_SRC] = (struct sockaddr *)&sa_mpls;
643 		info.rti_mpls = ((struct rt_mpls *)
644 		    rt->rt_llinfo)->mpls_operation;
645 	}
646 #endif
647 	ifp = if_get(rt->rt_ifidx);
648 	if (ifp != NULL) {
649 		info.rti_info[RTAX_IFP] = sdltosa(ifp->if_sadl);
650 		info.rti_info[RTAX_IFA] = rtable_getsource(tableid,
651 		    info.rti_info[RTAX_DST]->sa_family);
652 		if (info.rti_info[RTAX_IFA] == NULL)
653 			info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
654 		if (ifp->if_flags & IFF_POINTOPOINT)
655 			info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr;
656 	}
657 	if_put(ifp);
658 	/* RTAX_GENMASK, RTAX_AUTHOR, RTAX_SRCMASK ignored */
659 
660 	/* build new route message */
661 	len = rtm_msg2(type, RTM_VERSION, &info, NULL, NULL);
662 	rtm = malloc(len, M_RTABLE, M_WAITOK | M_ZERO);
663 
664 	rtm_msg2(type, RTM_VERSION, &info, (caddr_t)rtm, NULL);
665 	rtm->rtm_type = type;
666 	rtm->rtm_index = rt->rt_ifidx;
667 	rtm->rtm_tableid = tableid;
668 	rtm->rtm_priority = rt->rt_priority & RTP_MASK;
669 	rtm->rtm_flags = rt->rt_flags;
670 	rtm->rtm_pid = curproc->p_p->ps_pid;
671 	rtm->rtm_seq = seq;
672 	rtm_getmetrics(rt, &rtm->rtm_rmx);
673 	rtm->rtm_addrs = info.rti_addrs;
674 #ifdef MPLS
675 	rtm->rtm_mpls = info.rti_mpls;
676 #endif
677 	return rtm;
678 }
679 
680 int
681 route_output(struct mbuf *m, struct socket *so)
682 {
683 	struct rt_msghdr	*rtm = NULL;
684 	struct rtentry		*rt = NULL;
685 	struct rt_addrinfo	 info;
686 	struct ifnet		*ifp;
687 	int			 len, seq, useloopback, error = 0;
688 	u_int			 tableid;
689 	u_int8_t		 prio;
690 	u_char			 vers, type;
691 
692 	if (m == NULL || ((m->m_len < sizeof(int32_t)) &&
693 	    (m = m_pullup(m, sizeof(int32_t))) == NULL))
694 		return (ENOBUFS);
695 	if ((m->m_flags & M_PKTHDR) == 0)
696 		panic("route_output");
697 
698 	useloopback = so->so_options & SO_USELOOPBACK;
699 
700 	/*
701 	 * The socket can't be closed concurrently because the file
702 	 * descriptor reference is still held.
703 	 */
704 
705 	sounlock(so);
706 
707 	len = m->m_pkthdr.len;
708 	if (len < offsetof(struct rt_msghdr, rtm_hdrlen) +
709 	    sizeof(rtm->rtm_hdrlen) ||
710 	    len != mtod(m, struct rt_msghdr *)->rtm_msglen) {
711 		error = EINVAL;
712 		goto fail;
713 	}
714 	vers = mtod(m, struct rt_msghdr *)->rtm_version;
715 	switch (vers) {
716 	case RTM_VERSION:
717 		if (len < sizeof(struct rt_msghdr)) {
718 			error = EINVAL;
719 			goto fail;
720 		}
721 		if (len > RTM_MAXSIZE) {
722 			error = EMSGSIZE;
723 			goto fail;
724 		}
725 		rtm = malloc(len, M_RTABLE, M_WAITOK);
726 		m_copydata(m, 0, len, rtm);
727 		break;
728 	default:
729 		error = EPROTONOSUPPORT;
730 		goto fail;
731 	}
732 
733 	/* Verify that the caller is sending an appropriate message early */
734 	switch (rtm->rtm_type) {
735 	case RTM_ADD:
736 	case RTM_DELETE:
737 	case RTM_GET:
738 	case RTM_CHANGE:
739 	case RTM_PROPOSAL:
740 	case RTM_SOURCE:
741 		break;
742 	default:
743 		error = EOPNOTSUPP;
744 		goto fail;
745 	}
746 	/*
747 	 * Verify that the header length is valid.
748 	 * All messages from userland start with a struct rt_msghdr.
749 	 */
750 	if (rtm->rtm_hdrlen == 0)	/* old client */
751 		rtm->rtm_hdrlen = sizeof(struct rt_msghdr);
752 	if (rtm->rtm_hdrlen < sizeof(struct rt_msghdr) ||
753 	    len < rtm->rtm_hdrlen) {
754 		error = EINVAL;
755 		goto fail;
756 	}
757 
758 	rtm->rtm_pid = curproc->p_p->ps_pid;
759 
760 	/*
761 	 * Verify that the caller has the appropriate privilege; RTM_GET
762 	 * is the only operation the non-superuser is allowed.
763 	 */
764 	if (rtm->rtm_type != RTM_GET && suser(curproc) != 0) {
765 		error = EACCES;
766 		goto fail;
767 	}
768 	tableid = rtm->rtm_tableid;
769 	if (!rtable_exists(tableid)) {
770 		if (rtm->rtm_type == RTM_ADD) {
771 			if ((error = rtable_add(tableid)) != 0)
772 				goto fail;
773 		} else {
774 			error = EINVAL;
775 			goto fail;
776 		}
777 	}
778 
779 	/* Do not let userland play with kernel-only flags. */
780 	if ((rtm->rtm_flags & (RTF_LOCAL|RTF_BROADCAST)) != 0) {
781 		error = EINVAL;
782 		goto fail;
783 	}
784 
785 	/* make sure that kernel-only bits are not set */
786 	rtm->rtm_priority &= RTP_MASK;
787 	rtm->rtm_flags &= ~(RTF_DONE|RTF_CLONED|RTF_CACHED);
788 	rtm->rtm_fmask &= RTF_FMASK;
789 
790 	if (rtm->rtm_priority != 0) {
791 		if (rtm->rtm_priority > RTP_MAX ||
792 		    rtm->rtm_priority == RTP_LOCAL) {
793 			error = EINVAL;
794 			goto fail;
795 		}
796 		prio = rtm->rtm_priority;
797 	} else if (rtm->rtm_type != RTM_ADD)
798 		prio = RTP_ANY;
799 	else if (rtm->rtm_flags & RTF_STATIC)
800 		prio = 0;
801 	else
802 		prio = RTP_DEFAULT;
803 
804 	bzero(&info, sizeof(info));
805 	info.rti_addrs = rtm->rtm_addrs;
806 	if ((error = rtm_xaddrs(rtm->rtm_hdrlen + (caddr_t)rtm,
807 	    len + (caddr_t)rtm, &info)) != 0)
808 		goto fail;
809 
810 	info.rti_flags = rtm->rtm_flags;
811 
812 	if (rtm->rtm_type != RTM_SOURCE &&
813 	    rtm->rtm_type != RTM_PROPOSAL &&
814 	    (info.rti_info[RTAX_DST] == NULL ||
815 	    info.rti_info[RTAX_DST]->sa_family >= AF_MAX ||
816 	    (info.rti_info[RTAX_GATEWAY] != NULL &&
817 	    info.rti_info[RTAX_GATEWAY]->sa_family >= AF_MAX) ||
818 	    info.rti_info[RTAX_GENMASK] != NULL)) {
819 		error = EINVAL;
820 		goto fail;
821 	}
822 #ifdef MPLS
823 	info.rti_mpls = rtm->rtm_mpls;
824 #endif
825 
826 	if (info.rti_info[RTAX_GATEWAY] != NULL &&
827 	    info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK &&
828 	    (info.rti_flags & RTF_CLONING) == 0) {
829 		info.rti_flags |= RTF_LLINFO;
830 	}
831 
832 	/*
833 	 * Validate RTM_PROPOSAL and pass it along or error out.
834 	 */
835 	if (rtm->rtm_type == RTM_PROPOSAL) {
836 		if (rtm_validate_proposal(&info) == -1) {
837 			error = EINVAL;
838 			goto fail;
839 		}
840 		/*
841 		 * If this is a solicitation proposal forward request to
842 		 * all interfaces. Most handlers will ignore it but at least
843 		 * umb(4) will send a response to this event.
844 		 */
845 		if (rtm->rtm_priority == RTP_PROPOSAL_SOLICIT) {
846 			NET_LOCK();
847 			TAILQ_FOREACH(ifp, &ifnetlist, if_list) {
848 				ifp->if_rtrequest(ifp, RTM_PROPOSAL, NULL);
849 			}
850 			NET_UNLOCK();
851 		}
852 	} else if (rtm->rtm_type == RTM_SOURCE) {
853 		if (info.rti_info[RTAX_IFA] == NULL) {
854 			error = EINVAL;
855 			goto fail;
856 		}
857 		NET_LOCK();
858 		error = rt_setsource(tableid, info.rti_info[RTAX_IFA]);
859 		NET_UNLOCK();
860 		if (error)
861 			goto fail;
862 	} else {
863 		error = rtm_output(rtm, &rt, &info, prio, tableid);
864 		if (!error) {
865 			type = rtm->rtm_type;
866 			seq = rtm->rtm_seq;
867 			free(rtm, M_RTABLE, len);
868 			NET_LOCK_SHARED();
869 			rtm = rtm_report(rt, type, seq, tableid);
870 			NET_UNLOCK_SHARED();
871 			len = rtm->rtm_msglen;
872 		}
873 	}
874 
875 	rtfree(rt);
876 	if (error) {
877 		rtm->rtm_errno = error;
878 	} else {
879 		rtm->rtm_flags |= RTF_DONE;
880 	}
881 
882 	/*
883 	 * Check to see if we don't want our own messages.
884 	 */
885 	if (!useloopback) {
886 		if (rtptable.rtp_count == 0) {
887 			/* no other listener and no loopback of messages */
888 			goto fail;
889 		}
890 	}
891 	if (m_copyback(m, 0, len, rtm, M_NOWAIT)) {
892 		m_freem(m);
893 		m = NULL;
894 	} else if (m->m_pkthdr.len > len)
895 		m_adj(m, len - m->m_pkthdr.len);
896 	free(rtm, M_RTABLE, len);
897 	if (m)
898 		route_input(m, so, info.rti_info[RTAX_DST] ?
899 		    info.rti_info[RTAX_DST]->sa_family : AF_UNSPEC);
900 	solock(so);
901 
902 	return (error);
903 fail:
904 	free(rtm, M_RTABLE, len);
905 	m_freem(m);
906 	solock(so);
907 
908 	return (error);
909 }
910 
911 int
912 rtm_output(struct rt_msghdr *rtm, struct rtentry **prt,
913     struct rt_addrinfo *info, uint8_t prio, unsigned int tableid)
914 {
915 	struct rtentry		*rt = *prt;
916 	struct ifnet		*ifp = NULL;
917 	int			 plen, newgate = 0, error = 0;
918 
919 	switch (rtm->rtm_type) {
920 	case RTM_ADD:
921 		if (info->rti_info[RTAX_GATEWAY] == NULL) {
922 			error = EINVAL;
923 			break;
924 		}
925 
926 		rt = rtable_match(tableid, info->rti_info[RTAX_DST], NULL);
927 		if ((error = route_arp_conflict(rt, info))) {
928 			rtfree(rt);
929 			rt = NULL;
930 			break;
931 		}
932 
933 		/*
934 		 * We cannot go through a delete/create/insert cycle for
935 		 * cached route because this can lead to races in the
936 		 * receive path.  Instead we update the L2 cache.
937 		 */
938 		if ((rt != NULL) && ISSET(rt->rt_flags, RTF_CACHED)) {
939 			ifp = if_get(rt->rt_ifidx);
940 			if (ifp == NULL) {
941 				rtfree(rt);
942 				rt = NULL;
943 				error = ESRCH;
944 				break;
945 			}
946 
947 			goto change;
948 		}
949 
950 		rtfree(rt);
951 		rt = NULL;
952 
953 		NET_LOCK();
954 		if ((error = rtm_getifa(info, tableid)) != 0) {
955 			NET_UNLOCK();
956 			break;
957 		}
958 		error = rtrequest(RTM_ADD, info, prio, &rt, tableid);
959 		NET_UNLOCK();
960 		if (error == 0)
961 			rtm_setmetrics(rtm->rtm_inits, &rtm->rtm_rmx,
962 			    &rt->rt_rmx);
963 		break;
964 	case RTM_DELETE:
965 		rt = rtable_lookup(tableid, info->rti_info[RTAX_DST],
966 		    info->rti_info[RTAX_NETMASK], info->rti_info[RTAX_GATEWAY],
967 		    prio);
968 		if (rt == NULL) {
969 			error = ESRCH;
970 			break;
971 		}
972 
973 		/*
974 		 * If we got multipath routes, we require users to specify
975 		 * a matching gateway.
976 		 */
977 		if (ISSET(rt->rt_flags, RTF_MPATH) &&
978 		    info->rti_info[RTAX_GATEWAY] == NULL) {
979 			error = ESRCH;
980 			break;
981 		}
982 
983 		ifp = if_get(rt->rt_ifidx);
984 		if (ifp == NULL) {
985 			rtfree(rt);
986 			rt = NULL;
987 			error = ESRCH;
988 			break;
989 		}
990 
991 		/*
992 		 * Invalidate the cache of automagically created and
993 		 * referenced L2 entries to make sure that ``rt_gwroute''
994 		 * pointer stays valid for other CPUs.
995 		 */
996 		if ((ISSET(rt->rt_flags, RTF_CACHED))) {
997 			NET_LOCK();
998 			ifp->if_rtrequest(ifp, RTM_INVALIDATE, rt);
999 			/* Reset the MTU of the gateway route. */
1000 			rtable_walk(tableid, rt_key(rt)->sa_family, NULL,
1001 			    route_cleargateway, rt);
1002 			NET_UNLOCK();
1003 			break;
1004 		}
1005 
1006 		/*
1007 		 * Make sure that local routes are only modified by the
1008 		 * kernel.
1009 		 */
1010 		if (ISSET(rt->rt_flags, RTF_LOCAL|RTF_BROADCAST)) {
1011 			error = EINVAL;
1012 			break;
1013 		}
1014 
1015 		rtfree(rt);
1016 		rt = NULL;
1017 
1018 		NET_LOCK();
1019 		error = rtrequest_delete(info, prio, ifp, &rt, tableid);
1020 		NET_UNLOCK();
1021 		break;
1022 	case RTM_CHANGE:
1023 		rt = rtable_lookup(tableid, info->rti_info[RTAX_DST],
1024 		    info->rti_info[RTAX_NETMASK], info->rti_info[RTAX_GATEWAY],
1025 		    prio);
1026 		/*
1027 		 * If we got multipath routes, we require users to specify
1028 		 * a matching gateway.
1029 		 */
1030 		if ((rt != NULL) && ISSET(rt->rt_flags, RTF_MPATH) &&
1031 		    (info->rti_info[RTAX_GATEWAY] == NULL)) {
1032 			rtfree(rt);
1033 			rt = NULL;
1034 		}
1035 
1036 		/*
1037 		 * If RTAX_GATEWAY is the argument we're trying to
1038 		 * change, try to find a compatible route.
1039 		 */
1040 		if ((rt == NULL) && (info->rti_info[RTAX_GATEWAY] != NULL)) {
1041 			rt = rtable_lookup(tableid, info->rti_info[RTAX_DST],
1042 			    info->rti_info[RTAX_NETMASK], NULL, prio);
1043 			/* Ensure we don't pick a multipath one. */
1044 			if ((rt != NULL) && ISSET(rt->rt_flags, RTF_MPATH)) {
1045 				rtfree(rt);
1046 				rt = NULL;
1047 			}
1048 		}
1049 
1050 		if (rt == NULL) {
1051 			error = ESRCH;
1052 			break;
1053 		}
1054 
1055 		/*
1056 		 * Make sure that local routes are only modified by the
1057 		 * kernel.
1058 		 */
1059 		if (ISSET(rt->rt_flags, RTF_LOCAL|RTF_BROADCAST)) {
1060 			error = EINVAL;
1061 			break;
1062 		}
1063 
1064 		ifp = if_get(rt->rt_ifidx);
1065 		if (ifp == NULL) {
1066 			rtfree(rt);
1067 			rt = NULL;
1068 			error = ESRCH;
1069 			break;
1070 		}
1071 
1072 		/*
1073 		 * RTM_CHANGE needs a perfect match.
1074 		 */
1075 		plen = rtable_satoplen(info->rti_info[RTAX_DST]->sa_family,
1076 		    info->rti_info[RTAX_NETMASK]);
1077 		if (rt_plen(rt) != plen) {
1078 			error = ESRCH;
1079 			break;
1080 		}
1081 
1082 		if (info->rti_info[RTAX_GATEWAY] != NULL)
1083 			if (rt->rt_gateway == NULL ||
1084 			    bcmp(rt->rt_gateway,
1085 			    info->rti_info[RTAX_GATEWAY],
1086 			    info->rti_info[RTAX_GATEWAY]->sa_len)) {
1087 				newgate = 1;
1088 			}
1089 		/*
1090 		 * Check reachable gateway before changing the route.
1091 		 * New gateway could require new ifaddr, ifp;
1092 		 * flags may also be different; ifp may be specified
1093 		 * by ll sockaddr when protocol address is ambiguous.
1094 		 */
1095 		if (newgate || info->rti_info[RTAX_IFP] != NULL ||
1096 		    info->rti_info[RTAX_IFA] != NULL) {
1097 			struct ifaddr	*ifa = NULL;
1098 
1099 			NET_LOCK();
1100 			if ((error = rtm_getifa(info, tableid)) != 0) {
1101 				NET_UNLOCK();
1102 				break;
1103 			}
1104 			ifa = info->rti_ifa;
1105 			if (rt->rt_ifa != ifa) {
1106 				ifp->if_rtrequest(ifp, RTM_DELETE, rt);
1107 				ifafree(rt->rt_ifa);
1108 
1109 				rt->rt_ifa = ifaref(ifa);
1110 				rt->rt_ifidx = ifa->ifa_ifp->if_index;
1111 				/* recheck link state after ifp change */
1112 				rt_if_linkstate_change(rt, ifa->ifa_ifp,
1113 				    tableid);
1114 			}
1115 			NET_UNLOCK();
1116 		}
1117 change:
1118 		if (info->rti_info[RTAX_GATEWAY] != NULL) {
1119 			/* When updating the gateway, make sure it is valid. */
1120 			if (!newgate && rt->rt_gateway->sa_family !=
1121 			    info->rti_info[RTAX_GATEWAY]->sa_family) {
1122 				error = EINVAL;
1123 				break;
1124 			}
1125 
1126 			NET_LOCK();
1127 			error = rt_setgate(rt,
1128 			    info->rti_info[RTAX_GATEWAY], tableid);
1129 			NET_UNLOCK();
1130 			if (error)
1131 				break;
1132 		}
1133 #ifdef MPLS
1134 		if (rtm->rtm_flags & RTF_MPLS) {
1135 			NET_LOCK();
1136 			error = rt_mpls_set(rt,
1137 			    info->rti_info[RTAX_SRC], info->rti_mpls);
1138 			NET_UNLOCK();
1139 			if (error)
1140 				break;
1141 		} else if (newgate || (rtm->rtm_fmask & RTF_MPLS)) {
1142 			NET_LOCK();
1143 			/* if gateway changed remove MPLS information */
1144 			rt_mpls_clear(rt);
1145 			NET_UNLOCK();
1146 		}
1147 #endif
1148 
1149 #ifdef BFD
1150 		if (ISSET(rtm->rtm_flags, RTF_BFD)) {
1151 			KERNEL_LOCK();
1152 			error = bfdset(rt);
1153 			KERNEL_UNLOCK();
1154 			if (error)
1155 				break;
1156 		} else if (!ISSET(rtm->rtm_flags, RTF_BFD) &&
1157 		    ISSET(rtm->rtm_fmask, RTF_BFD)) {
1158 			KERNEL_LOCK();
1159 			bfdclear(rt);
1160 			KERNEL_UNLOCK();
1161 		}
1162 #endif
1163 
1164 		NET_LOCK();
1165 		/* Hack to allow some flags to be toggled */
1166 		if (rtm->rtm_fmask) {
1167 			/* MPLS flag it is set by rt_mpls_set() */
1168 			rtm->rtm_fmask &= ~RTF_MPLS;
1169 			rtm->rtm_flags &= ~RTF_MPLS;
1170 			rt->rt_flags =
1171 			    (rt->rt_flags & ~rtm->rtm_fmask) |
1172 			    (rtm->rtm_flags & rtm->rtm_fmask);
1173 		}
1174 		rtm_setmetrics(rtm->rtm_inits, &rtm->rtm_rmx, &rt->rt_rmx);
1175 
1176 		ifp->if_rtrequest(ifp, RTM_ADD, rt);
1177 
1178 		if (info->rti_info[RTAX_LABEL] != NULL) {
1179 			char *rtlabel = ((struct sockaddr_rtlabel *)
1180 			    info->rti_info[RTAX_LABEL])->sr_label;
1181 			rtlabel_unref(rt->rt_labelid);
1182 			rt->rt_labelid = rtlabel_name2id(rtlabel);
1183 		}
1184 		if_group_routechange(info->rti_info[RTAX_DST],
1185 		    info->rti_info[RTAX_NETMASK]);
1186 		rt->rt_locks &= ~(rtm->rtm_inits);
1187 		rt->rt_locks |= (rtm->rtm_inits & rtm->rtm_rmx.rmx_locks);
1188 		NET_UNLOCK();
1189 		break;
1190 	case RTM_GET:
1191 		rt = rtable_lookup(tableid, info->rti_info[RTAX_DST],
1192 		    info->rti_info[RTAX_NETMASK], info->rti_info[RTAX_GATEWAY],
1193 		    prio);
1194 		if (rt == NULL)
1195 			error = ESRCH;
1196 		break;
1197 	}
1198 
1199 	if_put(ifp);
1200 	*prt = rt;
1201 	return (error);
1202 }
1203 
1204 struct ifaddr *
1205 ifa_ifwithroute(int flags, struct sockaddr *dst, struct sockaddr *gateway,
1206     unsigned int rtableid)
1207 {
1208 	struct ifaddr	*ifa;
1209 
1210 	if ((flags & RTF_GATEWAY) == 0) {
1211 		/*
1212 		 * If we are adding a route to an interface,
1213 		 * and the interface is a pt to pt link
1214 		 * we should search for the destination
1215 		 * as our clue to the interface.  Otherwise
1216 		 * we can use the local address.
1217 		 */
1218 		ifa = NULL;
1219 		if (flags & RTF_HOST)
1220 			ifa = ifa_ifwithdstaddr(dst, rtableid);
1221 		if (ifa == NULL)
1222 			ifa = ifa_ifwithaddr(gateway, rtableid);
1223 	} else {
1224 		/*
1225 		 * If we are adding a route to a remote net
1226 		 * or host, the gateway may still be on the
1227 		 * other end of a pt to pt link.
1228 		 */
1229 		ifa = ifa_ifwithdstaddr(gateway, rtableid);
1230 	}
1231 	if (ifa == NULL) {
1232 		if (gateway->sa_family == AF_LINK) {
1233 			struct sockaddr_dl *sdl = satosdl(gateway);
1234 			struct ifnet *ifp = if_get(sdl->sdl_index);
1235 
1236 			if (ifp != NULL)
1237 				ifa = ifaof_ifpforaddr(dst, ifp);
1238 			if_put(ifp);
1239 		} else {
1240 			struct rtentry *rt;
1241 
1242 			rt = rtalloc(gateway, RT_RESOLVE, rtable_l2(rtableid));
1243 			if (rt != NULL)
1244 				ifa = rt->rt_ifa;
1245 			rtfree(rt);
1246 		}
1247 	}
1248 	if (ifa == NULL)
1249 		return (NULL);
1250 	if (ifa->ifa_addr->sa_family != dst->sa_family) {
1251 		struct ifaddr	*oifa = ifa;
1252 		ifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp);
1253 		if (ifa == NULL)
1254 			ifa = oifa;
1255 	}
1256 	return (ifa);
1257 }
1258 
1259 int
1260 rtm_getifa(struct rt_addrinfo *info, unsigned int rtid)
1261 {
1262 	struct ifnet	*ifp = NULL;
1263 
1264 	/*
1265 	 * The "returned" `ifa' is guaranteed to be alive only if
1266 	 * the NET_LOCK() is held.
1267 	 */
1268 	NET_ASSERT_LOCKED();
1269 
1270 	/*
1271 	 * ifp may be specified by sockaddr_dl when protocol address
1272 	 * is ambiguous
1273 	 */
1274 	if (info->rti_info[RTAX_IFP] != NULL) {
1275 		struct sockaddr_dl *sdl;
1276 
1277 		sdl = satosdl(info->rti_info[RTAX_IFP]);
1278 		ifp = if_get(sdl->sdl_index);
1279 	}
1280 
1281 #ifdef IPSEC
1282 	/*
1283 	 * If the destination is a PF_KEY address, we'll look
1284 	 * for the existence of a encap interface number or address
1285 	 * in the options list of the gateway. By default, we'll return
1286 	 * enc0.
1287 	 */
1288 	if (info->rti_info[RTAX_DST] &&
1289 	    info->rti_info[RTAX_DST]->sa_family == PF_KEY)
1290 		info->rti_ifa = enc_getifa(rtid, 0);
1291 #endif
1292 
1293 	if (info->rti_ifa == NULL && info->rti_info[RTAX_IFA] != NULL)
1294 		info->rti_ifa = ifa_ifwithaddr(info->rti_info[RTAX_IFA], rtid);
1295 
1296 	if (info->rti_ifa == NULL) {
1297 		struct sockaddr	*sa;
1298 
1299 		if ((sa = info->rti_info[RTAX_IFA]) == NULL)
1300 			if ((sa = info->rti_info[RTAX_GATEWAY]) == NULL)
1301 				sa = info->rti_info[RTAX_DST];
1302 
1303 		if (sa != NULL && ifp != NULL)
1304 			info->rti_ifa = ifaof_ifpforaddr(sa, ifp);
1305 		else if (info->rti_info[RTAX_DST] != NULL &&
1306 		    info->rti_info[RTAX_GATEWAY] != NULL)
1307 			info->rti_ifa = ifa_ifwithroute(info->rti_flags,
1308 			    info->rti_info[RTAX_DST],
1309 			    info->rti_info[RTAX_GATEWAY],
1310 			    rtid);
1311 		else if (sa != NULL)
1312 			info->rti_ifa = ifa_ifwithroute(info->rti_flags,
1313 			    sa, sa, rtid);
1314 	}
1315 
1316 	if_put(ifp);
1317 
1318 	if (info->rti_ifa == NULL)
1319 		return (ENETUNREACH);
1320 
1321 	return (0);
1322 }
1323 
1324 int
1325 route_cleargateway(struct rtentry *rt, void *arg, unsigned int rtableid)
1326 {
1327 	struct rtentry *nhrt = arg;
1328 
1329 	if (ISSET(rt->rt_flags, RTF_GATEWAY) && rt->rt_gwroute == nhrt &&
1330 	    !ISSET(rt->rt_locks, RTV_MTU))
1331 		rt->rt_mtu = 0;
1332 
1333 	return (0);
1334 }
1335 
1336 /*
1337  * Check if the user request to insert an ARP entry does not conflict
1338  * with existing ones.
1339  *
1340  * Only two entries are allowed for a given IP address: a private one
1341  * (priv) and a public one (pub).
1342  */
1343 int
1344 route_arp_conflict(struct rtentry *rt, struct rt_addrinfo *info)
1345 {
1346 	int		 proxy = (info->rti_flags & RTF_ANNOUNCE);
1347 
1348 	if ((info->rti_flags & RTF_LLINFO) == 0 ||
1349 	    (info->rti_info[RTAX_DST]->sa_family != AF_INET))
1350 		return (0);
1351 
1352 	if (rt == NULL || !ISSET(rt->rt_flags, RTF_LLINFO))
1353 		return (0);
1354 
1355 	/* If the entry is cached, it can be updated. */
1356 	if (ISSET(rt->rt_flags, RTF_CACHED))
1357 		return (0);
1358 
1359 	/*
1360 	 * Same destination, not cached and both "priv" or "pub" conflict.
1361 	 * If a second entry exists, it always conflict.
1362 	 */
1363 	if ((ISSET(rt->rt_flags, RTF_ANNOUNCE) == proxy) ||
1364 	    ISSET(rt->rt_flags, RTF_MPATH))
1365 		return (EEXIST);
1366 
1367 	/* No conflict but an entry exist so we need to force mpath. */
1368 	info->rti_flags |= RTF_MPATH;
1369 	return (0);
1370 }
1371 
1372 void
1373 rtm_setmetrics(u_long which, const struct rt_metrics *in,
1374     struct rt_kmetrics *out)
1375 {
1376 	int64_t expire;
1377 
1378 	if (which & RTV_MTU)
1379 		out->rmx_mtu = in->rmx_mtu;
1380 	if (which & RTV_EXPIRE) {
1381 		expire = in->rmx_expire;
1382 		if (expire != 0) {
1383 			expire -= gettime();
1384 			expire += getuptime();
1385 		}
1386 
1387 		out->rmx_expire = expire;
1388 	}
1389 }
1390 
1391 void
1392 rtm_getmetrics(const struct rtentry *rt, struct rt_metrics *out)
1393 {
1394 	const struct rt_kmetrics *in = &rt->rt_rmx;
1395 	int64_t expire;
1396 
1397 	expire = in->rmx_expire;
1398 	if (expire == 0)
1399 		expire = rt_timer_get_expire(rt);
1400 	if (expire != 0) {
1401 		expire -= getuptime();
1402 		expire += gettime();
1403 	}
1404 
1405 	bzero(out, sizeof(*out));
1406 	out->rmx_locks = in->rmx_locks;
1407 	out->rmx_mtu = in->rmx_mtu;
1408 	out->rmx_expire = expire;
1409 	out->rmx_pksent = in->rmx_pksent;
1410 }
1411 
/*
 * ROUNDUP(a): round the length `a' up to the next multiple of
 * sizeof(long); a length of 0 (or less) still occupies one long.
 */
#define ROUNDUP(a) \
	((a) > 0 ? (1 + (((a) - 1) | (sizeof(long) - 1))) : sizeof(long))
/*
 * ADVANCE(x, n): move the pointer `x' past the sockaddr `n', including
 * its padding.  `x' is parenthesized so that any lvalue expression can
 * be passed safely.
 */
#define ADVANCE(x, n) ((x) += ROUNDUP((n)->sa_len))
1415 
1416 int
1417 rtm_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo)
1418 {
1419 	struct sockaddr	*sa;
1420 	int		 i;
1421 
1422 	/*
1423 	 * Parse address bits, split address storage in chunks, and
1424 	 * set info pointers.  Use sa_len for traversing the memory
1425 	 * and check that we stay within in the limit.
1426 	 */
1427 	bzero(rtinfo->rti_info, sizeof(rtinfo->rti_info));
1428 	for (i = 0; i < sizeof(rtinfo->rti_addrs) * 8; i++) {
1429 		if ((rtinfo->rti_addrs & (1U << i)) == 0)
1430 			continue;
1431 		if (i >= RTAX_MAX || cp + sizeof(socklen_t) > cplim)
1432 			return (EINVAL);
1433 		sa = (struct sockaddr *)cp;
1434 		if (cp + sa->sa_len > cplim)
1435 			return (EINVAL);
1436 		rtinfo->rti_info[i] = sa;
1437 		ADVANCE(cp, sa);
1438 	}
1439 	/*
1440 	 * Check that the address family is suitable for the route address
1441 	 * type.  Check that each address has a size that fits its family
1442 	 * and its length is within the size.  Strings within addresses must
1443 	 * be NUL terminated.
1444 	 */
1445 	for (i = 0; i < RTAX_MAX; i++) {
1446 		size_t len, maxlen, size;
1447 
1448 		sa = rtinfo->rti_info[i];
1449 		if (sa == NULL)
1450 			continue;
1451 		maxlen = size = 0;
1452 		switch (i) {
1453 		case RTAX_DST:
1454 		case RTAX_GATEWAY:
1455 		case RTAX_SRC:
1456 			switch (sa->sa_family) {
1457 			case AF_INET:
1458 				size = sizeof(struct sockaddr_in);
1459 				break;
1460 			case AF_LINK:
1461 				size = sizeof(struct sockaddr_dl);
1462 				break;
1463 #ifdef INET6
1464 			case AF_INET6:
1465 				size = sizeof(struct sockaddr_in6);
1466 				break;
1467 #endif
1468 #ifdef MPLS
1469 			case AF_MPLS:
1470 				size = sizeof(struct sockaddr_mpls);
1471 				break;
1472 #endif
1473 			}
1474 			break;
1475 		case RTAX_IFP:
1476 			if (sa->sa_family != AF_LINK)
1477 				return (EAFNOSUPPORT);
1478 			/*
1479 			 * XXX Should be sizeof(struct sockaddr_dl), but
1480 			 * route(8) has a bug and provides less memory.
1481 			 * arp(8) has another bug and uses sizeof pointer.
1482 			 */
1483 			size = 4;
1484 			break;
1485 		case RTAX_IFA:
1486 			switch (sa->sa_family) {
1487 			case AF_INET:
1488 				size = sizeof(struct sockaddr_in);
1489 				break;
1490 #ifdef INET6
1491 			case AF_INET6:
1492 				size = sizeof(struct sockaddr_in6);
1493 				break;
1494 #endif
1495 			default:
1496 				return (EAFNOSUPPORT);
1497 			}
1498 			break;
1499 		case RTAX_LABEL:
1500 			sa->sa_family = AF_UNSPEC;
1501 			maxlen = RTLABEL_LEN;
1502 			size = sizeof(struct sockaddr_rtlabel);
1503 			break;
1504 #ifdef BFD
1505 		case RTAX_BFD:
1506 			sa->sa_family = AF_UNSPEC;
1507 			size = sizeof(struct sockaddr_bfd);
1508 			break;
1509 #endif
1510 		case RTAX_DNS:
1511 			/* more validation in rtm_validate_proposal */
1512 			if (sa->sa_len > sizeof(struct sockaddr_rtdns))
1513 				return (EINVAL);
1514 			if (sa->sa_len < offsetof(struct sockaddr_rtdns,
1515 			    sr_dns))
1516 				return (EINVAL);
1517 			switch (sa->sa_family) {
1518 			case AF_INET:
1519 #ifdef INET6
1520 			case AF_INET6:
1521 #endif
1522 				break;
1523 			default:
1524 				return (EAFNOSUPPORT);
1525 			}
1526 			break;
1527 		case RTAX_STATIC:
1528 			sa->sa_family = AF_UNSPEC;
1529 			maxlen = RTSTATIC_LEN;
1530 			size = sizeof(struct sockaddr_rtstatic);
1531 			break;
1532 		case RTAX_SEARCH:
1533 			sa->sa_family = AF_UNSPEC;
1534 			maxlen = RTSEARCH_LEN;
1535 			size = sizeof(struct sockaddr_rtsearch);
1536 			break;
1537 		}
1538 		if (size) {
1539 			/* memory for the full struct must be provided */
1540 			if (sa->sa_len < size)
1541 				return (EINVAL);
1542 		}
1543 		if (maxlen) {
1544 			/* this should not happen */
1545 			if (2 + maxlen > size)
1546 				return (EINVAL);
1547 			/* strings must be NUL terminated within the struct */
1548 			len = strnlen(sa->sa_data, maxlen);
1549 			if (len >= maxlen || 2 + len >= sa->sa_len)
1550 				return (EINVAL);
1551 			break;
1552 		}
1553 	}
1554 	return (0);
1555 }
1556 
1557 struct mbuf *
1558 rtm_msg1(int type, struct rt_addrinfo *rtinfo)
1559 {
1560 	struct rt_msghdr	*rtm;
1561 	struct mbuf		*m;
1562 	int			 i;
1563 	struct sockaddr		*sa;
1564 	int			 len, dlen, hlen;
1565 
1566 	switch (type) {
1567 	case RTM_DELADDR:
1568 	case RTM_NEWADDR:
1569 		hlen = sizeof(struct ifa_msghdr);
1570 		break;
1571 	case RTM_IFINFO:
1572 		hlen = sizeof(struct if_msghdr);
1573 		break;
1574 	case RTM_IFANNOUNCE:
1575 		hlen = sizeof(struct if_announcemsghdr);
1576 		break;
1577 #ifdef BFD
1578 	case RTM_BFD:
1579 		hlen = sizeof(struct bfd_msghdr);
1580 		break;
1581 #endif
1582 	case RTM_80211INFO:
1583 		hlen = sizeof(struct if_ieee80211_msghdr);
1584 		break;
1585 	default:
1586 		hlen = sizeof(struct rt_msghdr);
1587 		break;
1588 	}
1589 	len = hlen;
1590 	for (i = 0; i < RTAX_MAX; i++) {
1591 		if (rtinfo == NULL || (sa = rtinfo->rti_info[i]) == NULL)
1592 			continue;
1593 		len += ROUNDUP(sa->sa_len);
1594 	}
1595 	if (len > MCLBYTES)
1596 		panic("rtm_msg1");
1597 	m = m_gethdr(M_DONTWAIT, MT_DATA);
1598 	if (m && len > MHLEN) {
1599 		MCLGET(m, M_DONTWAIT);
1600 		if ((m->m_flags & M_EXT) == 0) {
1601 			m_free(m);
1602 			m = NULL;
1603 		}
1604 	}
1605 	if (m == NULL)
1606 		return (m);
1607 	m->m_pkthdr.len = m->m_len = len;
1608 	m->m_pkthdr.ph_ifidx = 0;
1609 	rtm = mtod(m, struct rt_msghdr *);
1610 	bzero(rtm, len);
1611 	len = hlen;
1612 	for (i = 0; i < RTAX_MAX; i++) {
1613 		if (rtinfo == NULL || (sa = rtinfo->rti_info[i]) == NULL)
1614 			continue;
1615 		rtinfo->rti_addrs |= (1U << i);
1616 		dlen = ROUNDUP(sa->sa_len);
1617 		if (m_copyback(m, len, sa->sa_len, sa, M_NOWAIT)) {
1618 			m_freem(m);
1619 			return (NULL);
1620 		}
1621 		len += dlen;
1622 	}
1623 	rtm->rtm_msglen = len;
1624 	rtm->rtm_hdrlen = hlen;
1625 	rtm->rtm_version = RTM_VERSION;
1626 	rtm->rtm_type = type;
1627 	return (m);
1628 }
1629 
/*
 * Compute the size of a routing message of the given `type' carrying the
 * addresses of `rtinfo' and, when a buffer is available, build it.
 *
 * rtinfo->rti_addrs is rebuilt from the non-NULL entries of rti_info[].
 *
 * If `cp' is non-NULL the message is written there.  If `cp' is NULL and
 * a walkarg `w' is given, the length is added to w->w_needed; as long as
 * that still fits in w->w_given and w->w_where is set, the message is
 * built in the scratch buffer w->w_tmem, which is grown as needed.  If
 * growing it fails, w->w_where is cleared.
 *
 * Only the length, header length, version and type fields of the header
 * are set here.  `vers' is not used.
 *
 * Returns the ALIGN()ed message length.
 */
int
rtm_msg2(int type, int vers, struct rt_addrinfo *rtinfo, caddr_t cp,
    struct walkarg *w)
{
	int		i;
	int		len, dlen, hlen, second_time = 0;
	caddr_t		cp0;

	rtinfo->rti_addrs = 0;
again:
	/* Reached a second time once a scratch buffer has been set up. */
	switch (type) {
	case RTM_DELADDR:
	case RTM_NEWADDR:
		len = sizeof(struct ifa_msghdr);
		break;
	case RTM_IFINFO:
		len = sizeof(struct if_msghdr);
		break;
	default:
		len = sizeof(struct rt_msghdr);
		break;
	}
	hlen = len;
	/* cp0 remembers the start of the message, cp skips the header. */
	if ((cp0 = cp) != NULL)
		cp += len;
	for (i = 0; i < RTAX_MAX; i++) {
		struct sockaddr *sa;

		if ((sa = rtinfo->rti_info[i]) == NULL)
			continue;
		rtinfo->rti_addrs |= (1U << i);
		dlen = ROUNDUP(sa->sa_len);
		if (cp) {
			/* Copy the address and zero its padding. */
			bcopy(sa, cp, sa->sa_len);
			bzero(cp + sa->sa_len, dlen - sa->sa_len);
			cp += dlen;
		}
		len += dlen;
	}
	/* align message length to the next natural boundary */
	len = ALIGN(len);
	if (cp == 0 && w != NULL && !second_time) {
		/* Sizing pass of a sysctl walk: account for this message. */
		w->w_needed += len;
		if (w->w_needed <= w->w_given && w->w_where) {
			/* Grow the scratch buffer if it is too small. */
			if (w->w_tmemsize < len) {
				free(w->w_tmem, M_RTABLE, w->w_tmemsize);
				w->w_tmem = malloc(len, M_RTABLE,
				    M_NOWAIT | M_ZERO);
				if (w->w_tmem)
					w->w_tmemsize = len;
			}
			if (w->w_tmem) {
				/* Run again, this time filling w_tmem. */
				cp = w->w_tmem;
				second_time = 1;
				goto again;
			} else
				/* No memory: give up producing output. */
				w->w_where = 0;
		}
	}
	/* The scratch buffer is reused, so wipe any previous header. */
	if (cp && w)		/* clear the message header */
		bzero(cp0, hlen);

	if (cp) {
		struct rt_msghdr *rtm = (struct rt_msghdr *)cp0;

		rtm->rtm_version = RTM_VERSION;
		rtm->rtm_type = type;
		rtm->rtm_msglen = len;
		rtm->rtm_hdrlen = hlen;
	}
	return (len);
}
1702 
1703 void
1704 rtm_send(struct rtentry *rt, int cmd, int error, unsigned int rtableid)
1705 {
1706 	struct rt_addrinfo	 info;
1707 	struct ifnet		*ifp;
1708 	struct sockaddr_rtlabel	 sa_rl;
1709 	struct sockaddr_in6	 sa_mask;
1710 
1711 	memset(&info, 0, sizeof(info));
1712 	info.rti_info[RTAX_DST] = rt_key(rt);
1713 	info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
1714 	if (!ISSET(rt->rt_flags, RTF_HOST))
1715 		info.rti_info[RTAX_NETMASK] = rt_plen2mask(rt, &sa_mask);
1716 	info.rti_info[RTAX_LABEL] = rtlabel_id2sa(rt->rt_labelid, &sa_rl);
1717 	ifp = if_get(rt->rt_ifidx);
1718 	if (ifp != NULL) {
1719 		info.rti_info[RTAX_IFP] = sdltosa(ifp->if_sadl);
1720 		info.rti_info[RTAX_IFA] = rtable_getsource(rtableid,
1721 		    info.rti_info[RTAX_DST]->sa_family);
1722 		if (info.rti_info[RTAX_IFA] == NULL)
1723 			info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
1724 	}
1725 
1726 	rtm_miss(cmd, &info, rt->rt_flags, rt->rt_priority, rt->rt_ifidx, error,
1727 	    rtableid);
1728 	if_put(ifp);
1729 }
1730 
1731 /*
1732  * This routine is called to generate a message from the routing
1733  * socket indicating that a redirect has occurred, a routing lookup
1734  * has failed, or that a protocol has detected timeouts to a particular
1735  * destination.
1736  */
1737 void
1738 rtm_miss(int type, struct rt_addrinfo *rtinfo, int flags, uint8_t prio,
1739     u_int ifidx, int error, u_int tableid)
1740 {
1741 	struct rt_msghdr	*rtm;
1742 	struct mbuf		*m;
1743 	struct sockaddr		*sa = rtinfo->rti_info[RTAX_DST];
1744 
1745 	if (rtptable.rtp_count == 0)
1746 		return;
1747 	m = rtm_msg1(type, rtinfo);
1748 	if (m == NULL)
1749 		return;
1750 	rtm = mtod(m, struct rt_msghdr *);
1751 	rtm->rtm_flags = RTF_DONE | flags;
1752 	rtm->rtm_priority = prio;
1753 	rtm->rtm_errno = error;
1754 	rtm->rtm_tableid = tableid;
1755 	rtm->rtm_addrs = rtinfo->rti_addrs;
1756 	rtm->rtm_index = ifidx;
1757 	route_input(m, NULL, sa ? sa->sa_family : AF_UNSPEC);
1758 }
1759 
1760 /*
1761  * This routine is called to generate a message from the routing
1762  * socket indicating that the status of a network interface has changed.
1763  */
1764 void
1765 rtm_ifchg(struct ifnet *ifp)
1766 {
1767 	struct rt_addrinfo	 info;
1768 	struct if_msghdr	*ifm;
1769 	struct mbuf		*m;
1770 
1771 	if (rtptable.rtp_count == 0)
1772 		return;
1773 	memset(&info, 0, sizeof(info));
1774 	info.rti_info[RTAX_IFP] = sdltosa(ifp->if_sadl);
1775 	m = rtm_msg1(RTM_IFINFO, &info);
1776 	if (m == NULL)
1777 		return;
1778 	ifm = mtod(m, struct if_msghdr *);
1779 	ifm->ifm_index = ifp->if_index;
1780 	ifm->ifm_tableid = ifp->if_rdomain;
1781 	ifm->ifm_flags = ifp->if_flags;
1782 	ifm->ifm_xflags = ifp->if_xflags;
1783 	if_getdata(ifp, &ifm->ifm_data);
1784 	ifm->ifm_addrs = info.rti_addrs;
1785 	route_input(m, NULL, AF_UNSPEC);
1786 }
1787 
1788 /*
1789  * This is called to generate messages from the routing socket
1790  * indicating a network interface has had addresses associated with it.
1791  * if we ever reverse the logic and replace messages TO the routing
1792  * socket indicate a request to configure interfaces, then it will
1793  * be unnecessary as the routing socket will automatically generate
1794  * copies of it.
1795  */
1796 void
1797 rtm_addr(int cmd, struct ifaddr *ifa)
1798 {
1799 	struct ifnet		*ifp = ifa->ifa_ifp;
1800 	struct mbuf		*m;
1801 	struct rt_addrinfo	 info;
1802 	struct ifa_msghdr	*ifam;
1803 
1804 	if (rtptable.rtp_count == 0)
1805 		return;
1806 
1807 	memset(&info, 0, sizeof(info));
1808 	info.rti_info[RTAX_IFA] = ifa->ifa_addr;
1809 	info.rti_info[RTAX_IFP] = sdltosa(ifp->if_sadl);
1810 	info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask;
1811 	info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
1812 	if ((m = rtm_msg1(cmd, &info)) == NULL)
1813 		return;
1814 	ifam = mtod(m, struct ifa_msghdr *);
1815 	ifam->ifam_index = ifp->if_index;
1816 	ifam->ifam_metric = ifa->ifa_metric;
1817 	ifam->ifam_flags = ifa->ifa_flags;
1818 	ifam->ifam_addrs = info.rti_addrs;
1819 	ifam->ifam_tableid = ifp->if_rdomain;
1820 
1821 	route_input(m, NULL,
1822 	    ifa->ifa_addr ? ifa->ifa_addr->sa_family : AF_UNSPEC);
1823 }
1824 
1825 /*
1826  * This is called to generate routing socket messages indicating
1827  * network interface arrival and departure.
1828  */
1829 void
1830 rtm_ifannounce(struct ifnet *ifp, int what)
1831 {
1832 	struct if_announcemsghdr	*ifan;
1833 	struct mbuf			*m;
1834 
1835 	if (rtptable.rtp_count == 0)
1836 		return;
1837 	m = rtm_msg1(RTM_IFANNOUNCE, NULL);
1838 	if (m == NULL)
1839 		return;
1840 	ifan = mtod(m, struct if_announcemsghdr *);
1841 	ifan->ifan_index = ifp->if_index;
1842 	strlcpy(ifan->ifan_name, ifp->if_xname, sizeof(ifan->ifan_name));
1843 	ifan->ifan_what = what;
1844 	route_input(m, NULL, AF_UNSPEC);
1845 }
1846 
#ifdef BFD
/*
 * Generate an RTM_BFD routing socket message indicating the state of a
 * BFD session.
 *
 * The message carries the destination of the route the session is
 * attached to and the address configured on that route, followed by the
 * session description produced by bfd2sa().  The kernel lock must be
 * held by the time the session is converted.
 */
void
rtm_bfd(struct bfd_config *bfd)
{
	struct rt_addrinfo	 ai;
	struct sockaddr_bfd	 state;
	struct bfd_msghdr	*hdr;
	struct mbuf		*m;

	if (rtptable.rtp_count == 0)
		return;

	memset(&ai, 0, sizeof(ai));
	ai.rti_info[RTAX_DST] = rt_key(bfd->bc_rt);
	ai.rti_info[RTAX_IFA] = bfd->bc_rt->rt_ifa->ifa_addr;

	if ((m = rtm_msg1(RTM_BFD, &ai)) == NULL)
		return;

	hdr = mtod(m, struct bfd_msghdr *);
	hdr->bm_addrs = ai.rti_addrs;

	KERNEL_ASSERT_LOCKED();
	bfd2sa(bfd->bc_rt, &state);
	memcpy(&hdr->bm_sa, &state, sizeof(state));

	route_input(m, NULL, ai.rti_info[RTAX_DST]->sa_family);
}
#endif /* BFD */
1879 
1880 /*
1881  * This is used to generate routing socket messages indicating
1882  * the state of an ieee80211 interface.
1883  */
1884 void
1885 rtm_80211info(struct ifnet *ifp, struct if_ieee80211_data *ifie)
1886 {
1887 	struct if_ieee80211_msghdr	*ifim;
1888 	struct mbuf			*m;
1889 
1890 	if (rtptable.rtp_count == 0)
1891 		return;
1892 	m = rtm_msg1(RTM_80211INFO, NULL);
1893 	if (m == NULL)
1894 		return;
1895 	ifim = mtod(m, struct if_ieee80211_msghdr *);
1896 	ifim->ifim_index = ifp->if_index;
1897 	ifim->ifim_tableid = ifp->if_rdomain;
1898 
1899 	memcpy(&ifim->ifim_ifie, ifie, sizeof(ifim->ifim_ifie));
1900 	route_input(m, NULL, AF_UNSPEC);
1901 }
1902 
1903 /*
1904  * This is used to generate routing socket messages indicating
1905  * the address selection proposal from an interface.
1906  */
1907 void
1908 rtm_proposal(struct ifnet *ifp, struct rt_addrinfo *rtinfo, int flags,
1909     uint8_t prio)
1910 {
1911 	struct rt_msghdr	*rtm;
1912 	struct mbuf		*m;
1913 
1914 	m = rtm_msg1(RTM_PROPOSAL, rtinfo);
1915 	if (m == NULL)
1916 		return;
1917 	rtm = mtod(m, struct rt_msghdr *);
1918 	rtm->rtm_flags = RTF_DONE | flags;
1919 	rtm->rtm_priority = prio;
1920 	rtm->rtm_tableid = ifp->if_rdomain;
1921 	rtm->rtm_index = ifp->if_index;
1922 	rtm->rtm_addrs = rtinfo->rti_addrs;
1923 
1924 	route_input(m, NULL, rtinfo->rti_info[RTAX_DNS]->sa_family);
1925 }
1926 
1927 /*
1928  * This is used in dumping the kernel table via sysctl().
1929  */
1930 int
1931 sysctl_dumpentry(struct rtentry *rt, void *v, unsigned int id)
1932 {
1933 	struct walkarg		*w = v;
1934 	int			 error = 0, size;
1935 	struct rt_addrinfo	 info;
1936 	struct ifnet		*ifp;
1937 #ifdef BFD
1938 	struct sockaddr_bfd	 sa_bfd;
1939 #endif
1940 	struct sockaddr_rtlabel	 sa_rl;
1941 	struct sockaddr_in6	 sa_mask;
1942 
1943 	if (w->w_op == NET_RT_FLAGS && !(rt->rt_flags & w->w_arg))
1944 		return 0;
1945 	if (w->w_op == NET_RT_DUMP && w->w_arg) {
1946 		u_int8_t prio = w->w_arg & RTP_MASK;
1947 		if (w->w_arg < 0) {
1948 			prio = (-w->w_arg) & RTP_MASK;
1949 			/* Show all routes that are not this priority */
1950 			if (prio == (rt->rt_priority & RTP_MASK))
1951 				return 0;
1952 		} else {
1953 			if (prio != (rt->rt_priority & RTP_MASK) &&
1954 			    prio != RTP_ANY)
1955 				return 0;
1956 		}
1957 	}
1958 	bzero(&info, sizeof(info));
1959 	info.rti_info[RTAX_DST] = rt_key(rt);
1960 	info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
1961 	info.rti_info[RTAX_NETMASK] = rt_plen2mask(rt, &sa_mask);
1962 	ifp = if_get(rt->rt_ifidx);
1963 	if (ifp != NULL) {
1964 		info.rti_info[RTAX_IFP] = sdltosa(ifp->if_sadl);
1965 		info.rti_info[RTAX_IFA] =
1966 		    rtable_getsource(id, info.rti_info[RTAX_DST]->sa_family);
1967 		if (info.rti_info[RTAX_IFA] == NULL)
1968 			info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
1969 		if (ifp->if_flags & IFF_POINTOPOINT)
1970 			info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr;
1971 	}
1972 	if_put(ifp);
1973 	info.rti_info[RTAX_LABEL] = rtlabel_id2sa(rt->rt_labelid, &sa_rl);
1974 #ifdef BFD
1975 	if (rt->rt_flags & RTF_BFD) {
1976 		KERNEL_ASSERT_LOCKED();
1977 		info.rti_info[RTAX_BFD] = bfd2sa(rt, &sa_bfd);
1978 	}
1979 #endif
1980 #ifdef MPLS
1981 	if (rt->rt_flags & RTF_MPLS) {
1982 		struct sockaddr_mpls	 sa_mpls;
1983 
1984 		bzero(&sa_mpls, sizeof(sa_mpls));
1985 		sa_mpls.smpls_family = AF_MPLS;
1986 		sa_mpls.smpls_len = sizeof(sa_mpls);
1987 		sa_mpls.smpls_label = ((struct rt_mpls *)
1988 		    rt->rt_llinfo)->mpls_label;
1989 		info.rti_info[RTAX_SRC] = (struct sockaddr *)&sa_mpls;
1990 		info.rti_mpls = ((struct rt_mpls *)
1991 		    rt->rt_llinfo)->mpls_operation;
1992 	}
1993 #endif
1994 
1995 	size = rtm_msg2(RTM_GET, RTM_VERSION, &info, NULL, w);
1996 	if (w->w_where && w->w_tmem && w->w_needed <= w->w_given) {
1997 		struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem;
1998 
1999 		rtm->rtm_pid = curproc->p_p->ps_pid;
2000 		rtm->rtm_flags = RTF_DONE | rt->rt_flags;
2001 		rtm->rtm_priority = rt->rt_priority & RTP_MASK;
2002 		rtm_getmetrics(rt, &rtm->rtm_rmx);
2003 		/* Do not account the routing table's reference. */
2004 		rtm->rtm_rmx.rmx_refcnt = refcnt_read(&rt->rt_refcnt) - 1;
2005 		rtm->rtm_index = rt->rt_ifidx;
2006 		rtm->rtm_addrs = info.rti_addrs;
2007 		rtm->rtm_tableid = id;
2008 #ifdef MPLS
2009 		rtm->rtm_mpls = info.rti_mpls;
2010 #endif
2011 		if ((error = copyout(rtm, w->w_where, size)) != 0)
2012 			w->w_where = NULL;
2013 		else
2014 			w->w_where += size;
2015 	}
2016 	return (error);
2017 }
2018 
2019 int
2020 sysctl_iflist(int af, struct walkarg *w)
2021 {
2022 	struct ifnet		*ifp;
2023 	struct ifaddr		*ifa;
2024 	struct rt_addrinfo	 info;
2025 	int			 len, error = 0;
2026 
2027 	bzero(&info, sizeof(info));
2028 	TAILQ_FOREACH(ifp, &ifnetlist, if_list) {
2029 		if (w->w_arg && w->w_arg != ifp->if_index)
2030 			continue;
2031 		/* Copy the link-layer address first */
2032 		info.rti_info[RTAX_IFP] = sdltosa(ifp->if_sadl);
2033 		len = rtm_msg2(RTM_IFINFO, RTM_VERSION, &info, 0, w);
2034 		if (w->w_where && w->w_tmem && w->w_needed <= w->w_given) {
2035 			struct if_msghdr *ifm;
2036 
2037 			ifm = (struct if_msghdr *)w->w_tmem;
2038 			ifm->ifm_index = ifp->if_index;
2039 			ifm->ifm_tableid = ifp->if_rdomain;
2040 			ifm->ifm_flags = ifp->if_flags;
2041 			if_getdata(ifp, &ifm->ifm_data);
2042 			ifm->ifm_addrs = info.rti_addrs;
2043 			error = copyout(ifm, w->w_where, len);
2044 			if (error)
2045 				return (error);
2046 			w->w_where += len;
2047 		}
2048 		info.rti_info[RTAX_IFP] = NULL;
2049 		TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) {
2050 			KASSERT(ifa->ifa_addr->sa_family != AF_LINK);
2051 			if (af && af != ifa->ifa_addr->sa_family)
2052 				continue;
2053 			info.rti_info[RTAX_IFA] = ifa->ifa_addr;
2054 			info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask;
2055 			info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
2056 			len = rtm_msg2(RTM_NEWADDR, RTM_VERSION, &info, 0, w);
2057 			if (w->w_where && w->w_tmem &&
2058 			    w->w_needed <= w->w_given) {
2059 				struct ifa_msghdr *ifam;
2060 
2061 				ifam = (struct ifa_msghdr *)w->w_tmem;
2062 				ifam->ifam_index = ifa->ifa_ifp->if_index;
2063 				ifam->ifam_flags = ifa->ifa_flags;
2064 				ifam->ifam_metric = ifa->ifa_metric;
2065 				ifam->ifam_addrs = info.rti_addrs;
2066 				error = copyout(w->w_tmem, w->w_where, len);
2067 				if (error)
2068 					return (error);
2069 				w->w_where += len;
2070 			}
2071 		}
2072 		info.rti_info[RTAX_IFA] = info.rti_info[RTAX_NETMASK] =
2073 		    info.rti_info[RTAX_BRD] = NULL;
2074 	}
2075 	return (0);
2076 }
2077 
2078 int
2079 sysctl_ifnames(struct walkarg *w)
2080 {
2081 	struct if_nameindex_msg ifn;
2082 	struct ifnet *ifp;
2083 	int error = 0;
2084 
2085 	/* XXX ignore tableid for now */
2086 	TAILQ_FOREACH(ifp, &ifnetlist, if_list) {
2087 		if (w->w_arg && w->w_arg != ifp->if_index)
2088 			continue;
2089 		w->w_needed += sizeof(ifn);
2090 		if (w->w_where && w->w_needed <= w->w_given) {
2091 
2092 			memset(&ifn, 0, sizeof(ifn));
2093 			ifn.if_index = ifp->if_index;
2094 			strlcpy(ifn.if_name, ifp->if_xname,
2095 			    sizeof(ifn.if_name));
2096 			error = copyout(&ifn, w->w_where, sizeof(ifn));
2097 			if (error)
2098 				return (error);
2099 			w->w_where += sizeof(ifn);
2100 		}
2101 	}
2102 
2103 	return (0);
2104 }
2105 
2106 int
2107 sysctl_source(int af, u_int tableid, struct walkarg *w)
2108 {
2109 	struct sockaddr	*sa;
2110 	int		 size, error = 0;
2111 
2112 	sa = rtable_getsource(tableid, af);
2113 	if (sa) {
2114 		switch (sa->sa_family) {
2115 		case AF_INET:
2116 			size = sizeof(struct sockaddr_in);
2117 			break;
2118 #ifdef INET6
2119 		case AF_INET6:
2120 			size = sizeof(struct sockaddr_in6);
2121 			break;
2122 #endif
2123 		default:
2124 			return (0);
2125 		}
2126 		w->w_needed += size;
2127 		if (w->w_where && w->w_needed <= w->w_given) {
2128 			if ((error = copyout(sa, w->w_where, size)))
2129 				return (error);
2130 			w->w_where += size;
2131 		}
2132 	}
2133 	return (0);
2134 }
2135 
2136 int
2137 sysctl_rtable(int *name, u_int namelen, void *where, size_t *given, void *new,
2138     size_t newlen)
2139 {
2140 	int			 i, error = EINVAL;
2141 	u_char			 af;
2142 	struct walkarg		 w;
2143 	struct rt_tableinfo	 tableinfo;
2144 	u_int			 tableid = 0;
2145 
2146 	if (new)
2147 		return (EPERM);
2148 	if (namelen < 3 || namelen > 4)
2149 		return (EINVAL);
2150 	af = name[0];
2151 	bzero(&w, sizeof(w));
2152 	w.w_where = where;
2153 	w.w_given = *given;
2154 	w.w_op = name[1];
2155 	w.w_arg = name[2];
2156 
2157 	if (namelen == 4) {
2158 		tableid = name[3];
2159 		if (!rtable_exists(tableid))
2160 			return (ENOENT);
2161 	} else
2162 		tableid = curproc->p_p->ps_rtableid;
2163 
2164 	switch (w.w_op) {
2165 	case NET_RT_DUMP:
2166 	case NET_RT_FLAGS:
2167 		NET_LOCK_SHARED();
2168 		for (i = 1; i <= AF_MAX; i++) {
2169 			if (af != 0 && af != i)
2170 				continue;
2171 
2172 			error = rtable_walk(tableid, i, NULL, sysctl_dumpentry,
2173 			    &w);
2174 			if (error == EAFNOSUPPORT)
2175 				error = 0;
2176 			if (error)
2177 				break;
2178 		}
2179 		NET_UNLOCK_SHARED();
2180 		break;
2181 
2182 	case NET_RT_IFLIST:
2183 		NET_LOCK_SHARED();
2184 		error = sysctl_iflist(af, &w);
2185 		NET_UNLOCK_SHARED();
2186 		break;
2187 
2188 	case NET_RT_STATS:
2189 		return (sysctl_rtable_rtstat(where, given, new));
2190 	case NET_RT_TABLE:
2191 		tableid = w.w_arg;
2192 		if (!rtable_exists(tableid))
2193 			return (ENOENT);
2194 		memset(&tableinfo, 0, sizeof tableinfo);
2195 		tableinfo.rti_tableid = tableid;
2196 		tableinfo.rti_domainid = rtable_l2(tableid);
2197 		error = sysctl_rdstruct(where, given, new,
2198 		    &tableinfo, sizeof(tableinfo));
2199 		return (error);
2200 	case NET_RT_IFNAMES:
2201 		NET_LOCK_SHARED();
2202 		error = sysctl_ifnames(&w);
2203 		NET_UNLOCK_SHARED();
2204 		break;
2205 	case NET_RT_SOURCE:
2206 		tableid = w.w_arg;
2207 		if (!rtable_exists(tableid))
2208 			return (ENOENT);
2209 		NET_LOCK_SHARED();
2210 		for (i = 1; i <= AF_MAX; i++) {
2211 			if (af != 0 && af != i)
2212 				continue;
2213 
2214 			error = sysctl_source(i, tableid, &w);
2215 			if (error == EAFNOSUPPORT)
2216 				error = 0;
2217 			if (error)
2218 				break;
2219 		}
2220 		NET_UNLOCK_SHARED();
2221 		break;
2222 	}
2223 	free(w.w_tmem, M_RTABLE, w.w_tmemsize);
2224 	if (where) {
2225 		*given = w.w_where - (caddr_t)where;
2226 		if (w.w_needed > w.w_given)
2227 			return (ENOMEM);
2228 	} else if (w.w_needed == 0) {
2229 		*given = 0;
2230 	} else {
2231 		*given = roundup(w.w_needed + MAX(w.w_needed / 10, 1024),
2232 		    PAGE_SIZE);
2233 	}
2234 	return (error);
2235 }
2236 
2237 int
2238 sysctl_rtable_rtstat(void *oldp, size_t *oldlenp, void *newp)
2239 {
2240 	extern struct cpumem *rtcounters;
2241 	uint64_t counters[rts_ncounters];
2242 	struct rtstat rtstat;
2243 	uint32_t *words = (uint32_t *)&rtstat;
2244 	int i;
2245 
2246 	CTASSERT(sizeof(rtstat) == (nitems(counters) * sizeof(uint32_t)));
2247 	memset(&rtstat, 0, sizeof rtstat);
2248 	counters_read(rtcounters, counters, nitems(counters), NULL);
2249 
2250 	for (i = 0; i < nitems(counters); i++)
2251 		words[i] = (uint32_t)counters[i];
2252 
2253 	return (sysctl_rdstruct(oldp, oldlenp, newp, &rtstat, sizeof(rtstat)));
2254 }
2255 
2256 int
2257 rtm_validate_proposal(struct rt_addrinfo *info)
2258 {
2259 	if (info->rti_addrs & ~(RTA_NETMASK | RTA_IFA | RTA_DNS | RTA_STATIC |
2260 	    RTA_SEARCH)) {
2261 		return -1;
2262 	}
2263 
2264 	if (ISSET(info->rti_addrs, RTA_NETMASK)) {
2265 		struct sockaddr *sa = info->rti_info[RTAX_NETMASK];
2266 		if (sa == NULL)
2267 			return -1;
2268 		switch (sa->sa_family) {
2269 		case AF_INET:
2270 			if (sa->sa_len != sizeof(struct sockaddr_in))
2271 				return -1;
2272 			break;
2273 		case AF_INET6:
2274 			if (sa->sa_len != sizeof(struct sockaddr_in6))
2275 				return -1;
2276 			break;
2277 		default:
2278 			return -1;
2279 		}
2280 	}
2281 
2282 	if (ISSET(info->rti_addrs, RTA_IFA)) {
2283 		struct sockaddr *sa = info->rti_info[RTAX_IFA];
2284 		if (sa == NULL)
2285 			return -1;
2286 		switch (sa->sa_family) {
2287 		case AF_INET:
2288 			if (sa->sa_len != sizeof(struct sockaddr_in))
2289 				return -1;
2290 			break;
2291 		case AF_INET6:
2292 			if (sa->sa_len != sizeof(struct sockaddr_in6))
2293 				return -1;
2294 			break;
2295 		default:
2296 			return -1;
2297 		}
2298 	}
2299 
2300 	if (ISSET(info->rti_addrs, RTA_DNS)) {
2301 		struct sockaddr_rtdns *rtdns =
2302 		    (struct sockaddr_rtdns *)info->rti_info[RTAX_DNS];
2303 		if (rtdns == NULL)
2304 			return -1;
2305 		if (rtdns->sr_len > sizeof(*rtdns))
2306 			return -1;
2307 		if (rtdns->sr_len < offsetof(struct sockaddr_rtdns, sr_dns))
2308 			return -1;
2309 		switch (rtdns->sr_family) {
2310 		case AF_INET:
2311 			if ((rtdns->sr_len - offsetof(struct sockaddr_rtdns,
2312 			    sr_dns)) % sizeof(struct in_addr) != 0)
2313 				return -1;
2314 			break;
2315 #ifdef INET6
2316 		case AF_INET6:
2317 			if ((rtdns->sr_len - offsetof(struct sockaddr_rtdns,
2318 			    sr_dns)) % sizeof(struct in6_addr) != 0)
2319 				return -1;
2320 			break;
2321 #endif
2322 		default:
2323 			return -1;
2324 		}
2325 	}
2326 
2327 	if (ISSET(info->rti_addrs, RTA_STATIC)) {
2328 		struct sockaddr_rtstatic *rtstatic =
2329 		    (struct sockaddr_rtstatic *)info->rti_info[RTAX_STATIC];
2330 		if (rtstatic == NULL)
2331 			return -1;
2332 		if (rtstatic->sr_len > sizeof(*rtstatic))
2333 			return -1;
2334 		if (rtstatic->sr_len <=
2335 		    offsetof(struct sockaddr_rtstatic, sr_static))
2336 			return -1;
2337 	}
2338 
2339 	if (ISSET(info->rti_addrs, RTA_SEARCH)) {
2340 		struct sockaddr_rtsearch *rtsearch =
2341 		    (struct sockaddr_rtsearch *)info->rti_info[RTAX_SEARCH];
2342 		if (rtsearch == NULL)
2343 			return -1;
2344 		if (rtsearch->sr_len > sizeof(*rtsearch))
2345 			return -1;
2346 		if (rtsearch->sr_len <=
2347 		    offsetof(struct sockaddr_rtsearch, sr_search))
2348 			return -1;
2349 	}
2350 
2351 	return 0;
2352 }
2353 
2354 int
2355 rt_setsource(unsigned int rtableid, struct sockaddr *src)
2356 {
2357 	struct ifaddr	*ifa;
2358 	/*
2359 	 * If source address is 0.0.0.0 or ::
2360 	 * use automatic source selection
2361 	 */
2362 	switch(src->sa_family) {
2363 	case AF_INET:
2364 		if(satosin(src)->sin_addr.s_addr == INADDR_ANY) {
2365 			rtable_setsource(rtableid, AF_INET, NULL);
2366 			return (0);
2367 		}
2368 		break;
2369 #ifdef INET6
2370 	case AF_INET6:
2371 		if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(src)->sin6_addr)) {
2372 			rtable_setsource(rtableid, AF_INET6, NULL);
2373 			return (0);
2374 		}
2375 		break;
2376 #endif
2377 	default:
2378 		return (EAFNOSUPPORT);
2379 	}
2380 
2381 	/*
2382 	 * Check if source address is assigned to an interface in the
2383 	 * same rdomain
2384 	 */
2385 	if ((ifa = ifa_ifwithaddr(src, rtableid)) == NULL)
2386 		return (EINVAL);
2387 
2388 	return rtable_setsource(rtableid, src->sa_family, ifa->ifa_addr);
2389 }
2390 
2391 /*
2392  * Definitions of protocols supported in the ROUTE domain.
2393  */
2394 
2395 const struct pr_usrreqs route_usrreqs = {
2396 	.pru_attach	= route_attach,
2397 	.pru_detach	= route_detach,
2398 	.pru_disconnect	= route_disconnect,
2399 	.pru_shutdown	= route_shutdown,
2400 	.pru_rcvd	= route_rcvd,
2401 	.pru_send	= route_send,
2402 	.pru_sockaddr	= route_sockaddr,
2403 	.pru_peeraddr	= route_peeraddr,
2404 };
2405 
2406 const struct protosw routesw[] = {
2407 {
2408   .pr_type	= SOCK_RAW,
2409   .pr_domain	= &routedomain,
2410   .pr_flags	= PR_ATOMIC|PR_ADDR|PR_WANTRCVD,
2411   .pr_ctloutput	= route_ctloutput,
2412   .pr_usrreqs	= &route_usrreqs,
2413   .pr_init	= route_prinit,
2414   .pr_sysctl	= sysctl_rtable
2415 }
2416 };
2417 
2418 const struct domain routedomain = {
2419   .dom_family = PF_ROUTE,
2420   .dom_name = "route",
2421   .dom_init = route_init,
2422   .dom_protosw = routesw,
2423   .dom_protoswNPROTOSW = &routesw[nitems(routesw)]
2424 };
2425