xref: /openbsd-src/sys/net/rtsock.c (revision 25c4e8bd056e974b28f4a0ffd39d76c190a56013)
1 /*	$OpenBSD: rtsock.c,v 1.334 2022/06/28 10:01:13 bluhm Exp $	*/
2 /*	$NetBSD: rtsock.c,v 1.18 1996/03/29 00:32:10 cgd Exp $	*/
3 
4 /*
5  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the project nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 /*
34  * Copyright (c) 1988, 1991, 1993
35  *	The Regents of the University of California.  All rights reserved.
36  *
37  * Redistribution and use in source and binary forms, with or without
38  * modification, are permitted provided that the following conditions
39  * are met:
40  * 1. Redistributions of source code must retain the above copyright
41  *    notice, this list of conditions and the following disclaimer.
42  * 2. Redistributions in binary form must reproduce the above copyright
43  *    notice, this list of conditions and the following disclaimer in the
44  *    documentation and/or other materials provided with the distribution.
45  * 3. Neither the name of the University nor the names of its contributors
46  *    may be used to endorse or promote products derived from this software
47  *    without specific prior written permission.
48  *
49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59  * SUCH DAMAGE.
60  *
61  *	@(#)rtsock.c	8.6 (Berkeley) 2/11/95
62  */
63 
64 #include <sys/param.h>
65 #include <sys/systm.h>
66 #include <sys/proc.h>
67 #include <sys/sysctl.h>
68 #include <sys/mbuf.h>
69 #include <sys/socket.h>
70 #include <sys/socketvar.h>
71 #include <sys/domain.h>
72 #include <sys/pool.h>
73 #include <sys/protosw.h>
74 #include <sys/srp.h>
75 
76 #include <net/if.h>
77 #include <net/if_dl.h>
78 #include <net/if_var.h>
79 #include <net/route.h>
80 
81 #include <netinet/in.h>
82 
83 #ifdef MPLS
84 #include <netmpls/mpls.h>
85 #endif
86 #ifdef IPSEC
87 #include <netinet/ip_ipsp.h>
88 #include <net/if_enc.h>
89 #endif
90 #ifdef BFD
91 #include <net/bfd.h>
92 #endif
93 
94 #include <sys/stdarg.h>
95 #include <sys/kernel.h>
96 #include <sys/timeout.h>
97 
/*
 * Send and receive buffer sizes requested from soreserve() when a
 * routing socket is attached.
 */
#define	ROUTESNDQ	8192
#define	ROUTERCVQ	8192

/*
 * Source address attached to every message appended to a routing
 * socket's receive buffer, and the address reported for PRU_PEERADDR.
 * NOTE(review): the initializer is positional; it presumably sets
 * sa_len = 2 and sa_family = PF_ROUTE - confirm against struct sockaddr.
 */
const struct sockaddr route_src = { 2, PF_ROUTE, };
102 
103 struct walkarg {
104 	int	w_op, w_arg, w_tmemsize;
105 	size_t	w_given, w_needed;
106 	caddr_t	w_where, w_tmem;
107 };
108 
/* Routing socket setup, PCB reference callbacks and socket handlers. */
void	route_prinit(void);
void	rcb_ref(void *, void *);
void	rcb_unref(void *, void *);
int	route_output(struct mbuf *, struct socket *, struct sockaddr *,
	    struct mbuf *);
int	route_ctloutput(int, struct socket *, int, int, struct mbuf *);
int	route_usrreq(struct socket *, int, struct mbuf *, struct mbuf *,
	    struct mbuf *, struct proc *);
void	route_input(struct mbuf *m0, struct socket *, sa_family_t);
int	route_arp_conflict(struct rtentry *, struct rt_addrinfo *);
int	route_cleargateway(struct rtentry *, void *, unsigned int);
/* Delivery to a single socket and RTM_DESYNC overflow signalling. */
void	rtm_senddesync_timer(void *);
void	rtm_senddesync(struct socket *);
int	rtm_sendup(struct socket *, struct mbuf *);

/* Routing message processing, construction and parsing. */
int	rtm_getifa(struct rt_addrinfo *, unsigned int);
int	rtm_output(struct rt_msghdr *, struct rtentry **, struct rt_addrinfo *,
	    uint8_t, unsigned int);
struct rt_msghdr *rtm_report(struct rtentry *, u_char, int, int);
struct mbuf	*rtm_msg1(int, struct rt_addrinfo *);
int		 rtm_msg2(int, int, struct rt_addrinfo *, caddr_t,
		     struct walkarg *);
int		 rtm_xaddrs(caddr_t, caddr_t, struct rt_addrinfo *);
int		 rtm_validate_proposal(struct rt_addrinfo *);
void		 rtm_setmetrics(u_long, const struct rt_metrics *,
		     struct rt_kmetrics *);
void		 rtm_getmetrics(const struct rtentry *,
		     struct rt_metrics *);

/* sysctl helpers. */
int		 sysctl_iflist(int, struct walkarg *);
int		 sysctl_ifnames(struct walkarg *);
int		 sysctl_rtable_rtstat(void *, size_t *, void *);

/* Used by route_output() to handle RTM_SOURCE. */
int		 rt_setsource(unsigned int, struct sockaddr *);
143 
144 /*
145  * Locks used to protect struct members
146  *       I       immutable after creation
147  *       s       solock
148  */
/*
 * Per-socket control block of a routing socket.  It is allocated in
 * route_attach(), stored in so->so_pcb and released in route_detach().
 */
struct rtpcb {
	/* owning socket, set once in route_attach() */
	struct socket		*rop_socket;		/* [I] */

	/* linkage on rtptable.rtp_list */
	SRPL_ENTRY(rtpcb)	rop_list;
	/* taken/released via rcb_ref()/rcb_unref(), drained in route_detach() */
	struct refcnt		rop_refcnt;
	/* runs rtm_senddesync_timer() to retry sending RTM_DESYNC */
	struct timeout		rop_timeout;
	/* ROUTE_MSGFILTER: bitmask of wanted message types, 0 = all */
	unsigned int		rop_msgfilter;		/* [s] */
	/* ROUTE_FLAGFILTER: messages with any of these rtm_flags are dropped */
	unsigned int		rop_flagfilter;		/* [s] */
	/* ROUTECB_FLAG_* state bits */
	unsigned int		rop_flags;		/* [s] */
	/* ROUTE_TABLEFILTER: routing table of interest, or RTABLE_ANY */
	u_int			rop_rtableid;		/* [s] */
	/* protocol passed to route_attach(), compared with address families */
	unsigned short		rop_proto;		/* [I] */
	/* ROUTE_PRIOFILTER: highest priority value delivered, 0 = no filter */
	u_char			rop_priority;		/* [s] */
};
/* Convert a socket pointer into its routing control block. */
#define	sotortpcb(so)	((struct rtpcb *)(so)->so_pcb)
163 
/*
 * Table of all attached routing sockets.
 */
struct rtptable {
	/* list of attached rtpcbs, walked by route_input() */
	SRPL_HEAD(, rtpcb)	rtp_list;
	/* list reference callbacks, initialized with rcb_ref()/rcb_unref() */
	struct srpl_rc		rtp_rc;
	/* held for writing while rtp_list and rtp_count are modified */
	struct rwlock		rtp_lk;
	/* number of attached routing sockets */
	unsigned int		rtp_count;
};

/* Pool from which struct rtpcb is allocated. */
struct pool rtpcb_pool;
/* The routing socket table instance. */
struct rtptable rtptable;
173 
/*
 * These flags and timeout are used for indicating to userland (via a
 * RTM_DESYNC msg) when the route socket has overflowed and messages
 * have been lost.
 *
 * Both flags are set by rtm_sendup() when a message cannot be queued.
 * DESYNC is cleared by rtm_senddesync() once the RTM_DESYNC message has
 * been appended to the receive buffer.  FLUSH is cleared on PRU_RCVD
 * once the receive buffer is empty again; while it is set, route_input()
 * does not queue messages on the socket.
 */
#define ROUTECB_FLAG_DESYNC	0x1	/* Route socket out of memory */
#define ROUTECB_FLAG_FLUSH	0x2	/* Wait until socket is empty before
					   queueing more packets */

/* Delay before rtm_senddesync() retries queueing RTM_DESYNC. */
#define ROUTE_DESYNC_RESEND_TIMEOUT	200	/* In ms */
184 
185 void
186 route_prinit(void)
187 {
188 	srpl_rc_init(&rtptable.rtp_rc, rcb_ref, rcb_unref, NULL);
189 	rw_init(&rtptable.rtp_lk, "rtsock");
190 	SRPL_INIT(&rtptable.rtp_list);
191 	pool_init(&rtpcb_pool, sizeof(struct rtpcb), 0,
192 	    IPL_SOFTNET, PR_WAITOK, "rtpcb", NULL);
193 }
194 
195 void
196 rcb_ref(void *null, void *v)
197 {
198 	struct rtpcb *rop = v;
199 
200 	refcnt_take(&rop->rop_refcnt);
201 }
202 
203 void
204 rcb_unref(void *null, void *v)
205 {
206 	struct rtpcb *rop = v;
207 
208 	refcnt_rele_wake(&rop->rop_refcnt);
209 }
210 
211 int
212 route_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
213     struct mbuf *control, struct proc *p)
214 {
215 	struct rtpcb	*rop;
216 	int		 error = 0;
217 
218 	if (req == PRU_CONTROL)
219 		return (EOPNOTSUPP);
220 
221 	soassertlocked(so);
222 
223 	if (control && control->m_len) {
224 		error = EOPNOTSUPP;
225 		goto release;
226 	}
227 
228 	rop = sotortpcb(so);
229 	if (rop == NULL) {
230 		error = EINVAL;
231 		goto release;
232 	}
233 
234 	switch (req) {
235 	/* no connect, bind, accept. Socket is connected from the start */
236 	case PRU_CONNECT:
237 	case PRU_BIND:
238 	case PRU_CONNECT2:
239 	case PRU_LISTEN:
240 	case PRU_ACCEPT:
241 		error = EOPNOTSUPP;
242 		break;
243 
244 	case PRU_DISCONNECT:
245 	case PRU_ABORT:
246 		soisdisconnected(so);
247 		break;
248 	case PRU_SHUTDOWN:
249 		socantsendmore(so);
250 		break;
251 	case PRU_SENSE:
252 		/* stat: don't bother with a blocksize. */
253 		break;
254 
255 	/* minimal support, just implement a fake peer address */
256 	case PRU_SOCKADDR:
257 		error = EINVAL;
258 		break;
259 	case PRU_PEERADDR:
260 		bcopy(&route_src, mtod(nam, caddr_t), route_src.sa_len);
261 		nam->m_len = route_src.sa_len;
262 		break;
263 
264 	case PRU_RCVD:
265 		/*
266 		 * If we are in a FLUSH state, check if the buffer is
267 		 * empty so that we can clear the flag.
268 		 */
269 		if (((rop->rop_flags & ROUTECB_FLAG_FLUSH) != 0) &&
270 		    ((sbspace(rop->rop_socket, &rop->rop_socket->so_rcv) ==
271 		    rop->rop_socket->so_rcv.sb_hiwat)))
272 			rop->rop_flags &= ~ROUTECB_FLAG_FLUSH;
273 		break;
274 
275 	case PRU_RCVOOB:
276 	case PRU_SENDOOB:
277 		error = EOPNOTSUPP;
278 		break;
279 	case PRU_SEND:
280 		if (nam) {
281 			error = EISCONN;
282 			break;
283 		}
284 		error = (*so->so_proto->pr_output)(m, so, NULL, NULL);
285 		m = NULL;
286 		break;
287 	default:
288 		panic("route_usrreq");
289 	}
290 
291  release:
292 	if (req != PRU_RCVD && req != PRU_RCVOOB && req != PRU_SENSE) {
293 		m_freem(control);
294 		m_freem(m);
295 	}
296 	return (error);
297 }
298 
299 int
300 route_attach(struct socket *so, int proto)
301 {
302 	struct rtpcb	*rop;
303 	int		 error;
304 
305 	error = soreserve(so, ROUTESNDQ, ROUTERCVQ);
306 	if (error)
307 		return (error);
308 	/*
309 	 * use the rawcb but allocate a rtpcb, this
310 	 * code does not care about the additional fields
311 	 * and works directly on the raw socket.
312 	 */
313 	rop = pool_get(&rtpcb_pool, PR_WAITOK|PR_ZERO);
314 	so->so_pcb = rop;
315 	/* Init the timeout structure */
316 	timeout_set_proc(&rop->rop_timeout, rtm_senddesync_timer, so);
317 	refcnt_init(&rop->rop_refcnt);
318 
319 	rop->rop_socket = so;
320 	rop->rop_proto = proto;
321 
322 	rop->rop_rtableid = curproc->p_p->ps_rtableid;
323 
324 	soisconnected(so);
325 	so->so_options |= SO_USELOOPBACK;
326 
327 	rw_enter(&rtptable.rtp_lk, RW_WRITE);
328 	SRPL_INSERT_HEAD_LOCKED(&rtptable.rtp_rc, &rtptable.rtp_list, rop,
329 	    rop_list);
330 	rtptable.rtp_count++;
331 	rw_exit(&rtptable.rtp_lk);
332 
333 	return (0);
334 }
335 
336 int
337 route_detach(struct socket *so)
338 {
339 	struct rtpcb	*rop;
340 
341 	soassertlocked(so);
342 
343 	rop = sotortpcb(so);
344 	if (rop == NULL)
345 		return (EINVAL);
346 
347 	rw_enter(&rtptable.rtp_lk, RW_WRITE);
348 
349 	rtptable.rtp_count--;
350 	SRPL_REMOVE_LOCKED(&rtptable.rtp_rc, &rtptable.rtp_list, rop, rtpcb,
351 	    rop_list);
352 	rw_exit(&rtptable.rtp_lk);
353 
354 	sounlock(so);
355 
356 	/* wait for all references to drop */
357 	refcnt_finalize(&rop->rop_refcnt, "rtsockrefs");
358 	timeout_del_barrier(&rop->rop_timeout);
359 
360 	solock(so);
361 
362 	so->so_pcb = NULL;
363 	KASSERT((so->so_state & SS_NOFDREF) == 0);
364 	pool_put(&rtpcb_pool, rop);
365 
366 	return (0);
367 }
368 
369 int
370 route_ctloutput(int op, struct socket *so, int level, int optname,
371     struct mbuf *m)
372 {
373 	struct rtpcb *rop = sotortpcb(so);
374 	int error = 0;
375 	unsigned int tid, prio;
376 
377 	if (level != AF_ROUTE)
378 		return (EINVAL);
379 
380 	switch (op) {
381 	case PRCO_SETOPT:
382 		switch (optname) {
383 		case ROUTE_MSGFILTER:
384 			if (m == NULL || m->m_len != sizeof(unsigned int))
385 				error = EINVAL;
386 			else
387 				rop->rop_msgfilter = *mtod(m, unsigned int *);
388 			break;
389 		case ROUTE_TABLEFILTER:
390 			if (m == NULL || m->m_len != sizeof(unsigned int)) {
391 				error = EINVAL;
392 				break;
393 			}
394 			tid = *mtod(m, unsigned int *);
395 			if (tid != RTABLE_ANY && !rtable_exists(tid))
396 				error = ENOENT;
397 			else
398 				rop->rop_rtableid = tid;
399 			break;
400 		case ROUTE_PRIOFILTER:
401 			if (m == NULL || m->m_len != sizeof(unsigned int)) {
402 				error = EINVAL;
403 				break;
404 			}
405 			prio = *mtod(m, unsigned int *);
406 			if (prio > RTP_MAX)
407 				error = EINVAL;
408 			else
409 				rop->rop_priority = prio;
410 			break;
411 		case ROUTE_FLAGFILTER:
412 			if (m == NULL || m->m_len != sizeof(unsigned int))
413 				error = EINVAL;
414 			else
415 				rop->rop_flagfilter = *mtod(m, unsigned int *);
416 			break;
417 		default:
418 			error = ENOPROTOOPT;
419 			break;
420 		}
421 		break;
422 	case PRCO_GETOPT:
423 		switch (optname) {
424 		case ROUTE_MSGFILTER:
425 			m->m_len = sizeof(unsigned int);
426 			*mtod(m, unsigned int *) = rop->rop_msgfilter;
427 			break;
428 		case ROUTE_TABLEFILTER:
429 			m->m_len = sizeof(unsigned int);
430 			*mtod(m, unsigned int *) = rop->rop_rtableid;
431 			break;
432 		case ROUTE_PRIOFILTER:
433 			m->m_len = sizeof(unsigned int);
434 			*mtod(m, unsigned int *) = rop->rop_priority;
435 			break;
436 		case ROUTE_FLAGFILTER:
437 			m->m_len = sizeof(unsigned int);
438 			*mtod(m, unsigned int *) = rop->rop_flagfilter;
439 			break;
440 		default:
441 			error = ENOPROTOOPT;
442 			break;
443 		}
444 	}
445 	return (error);
446 }
447 
/*
 * Timeout handler armed by rtm_senddesync(): retry the RTM_DESYNC
 * delivery for the socket passed as ``xso'' with the socket locked.
 */
void
rtm_senddesync_timer(void *xso)
{
	struct socket	*sock = (struct socket *)xso;

	solock(sock);
	rtm_senddesync(sock);
	sounlock(sock);
}
457 
458 void
459 rtm_senddesync(struct socket *so)
460 {
461 	struct rtpcb	*rop = sotortpcb(so);
462 	struct mbuf	*desync_mbuf;
463 
464 	soassertlocked(so);
465 
466 	/*
467 	 * Dying socket is disconnected by upper layer and there is
468 	 * no reason to send packet. Also we shouldn't reschedule
469 	 * timeout(9), otherwise timeout_del_barrier(9) can't help us.
470 	 */
471 	if ((so->so_state & SS_ISCONNECTED) == 0 ||
472 	    (so->so_state & SS_CANTRCVMORE))
473 		return;
474 
475 	/* If we are in a DESYNC state, try to send a RTM_DESYNC packet */
476 	if ((rop->rop_flags & ROUTECB_FLAG_DESYNC) == 0)
477 		return;
478 
479 	/*
480 	 * If we fail to alloc memory or if sbappendaddr()
481 	 * fails, re-add timeout and try again.
482 	 */
483 	desync_mbuf = rtm_msg1(RTM_DESYNC, NULL);
484 	if (desync_mbuf != NULL) {
485 		if (sbappendaddr(so, &so->so_rcv, &route_src,
486 		    desync_mbuf, NULL) != 0) {
487 			rop->rop_flags &= ~ROUTECB_FLAG_DESYNC;
488 			sorwakeup(rop->rop_socket);
489 			return;
490 		}
491 		m_freem(desync_mbuf);
492 	}
493 	/* Re-add timeout to try sending msg again */
494 	timeout_add_msec(&rop->rop_timeout, ROUTE_DESYNC_RESEND_TIMEOUT);
495 }
496 
497 void
498 route_input(struct mbuf *m0, struct socket *so0, sa_family_t sa_family)
499 {
500 	struct socket *so;
501 	struct rtpcb *rop;
502 	struct rt_msghdr *rtm;
503 	struct mbuf *m = m0;
504 	struct srp_ref sr;
505 
506 	/* ensure that we can access the rtm_type via mtod() */
507 	if (m->m_len < offsetof(struct rt_msghdr, rtm_type) + 1) {
508 		m_freem(m);
509 		return;
510 	}
511 
512 	SRPL_FOREACH(rop, &sr, &rtptable.rtp_list, rop_list) {
513 		/*
514 		 * If route socket is bound to an address family only send
515 		 * messages that match the address family. Address family
516 		 * agnostic messages are always sent.
517 		 */
518 		if (sa_family != AF_UNSPEC && rop->rop_proto != AF_UNSPEC &&
519 		    rop->rop_proto != sa_family)
520 			continue;
521 
522 
523 		so = rop->rop_socket;
524 		solock(so);
525 
526 		/*
527 		 * Check to see if we don't want our own messages and
528 		 * if we can receive anything.
529 		 */
530 		if ((so0 == so && !(so0->so_options & SO_USELOOPBACK)) ||
531 		    !(so->so_state & SS_ISCONNECTED) ||
532 		    (so->so_state & SS_CANTRCVMORE))
533 			goto next;
534 
535 		/* filter messages that the process does not want */
536 		rtm = mtod(m, struct rt_msghdr *);
537 		/* but RTM_DESYNC can't be filtered */
538 		if (rtm->rtm_type != RTM_DESYNC) {
539 			if (rop->rop_msgfilter != 0 &&
540 			    !(rop->rop_msgfilter & (1U << rtm->rtm_type)))
541 				goto next;
542 			if (ISSET(rop->rop_flagfilter, rtm->rtm_flags))
543 				goto next;
544 		}
545 		switch (rtm->rtm_type) {
546 		case RTM_IFANNOUNCE:
547 		case RTM_DESYNC:
548 			/* no tableid */
549 			break;
550 		case RTM_RESOLVE:
551 		case RTM_NEWADDR:
552 		case RTM_DELADDR:
553 		case RTM_IFINFO:
554 		case RTM_80211INFO:
555 		case RTM_BFD:
556 			/* check against rdomain id */
557 			if (rop->rop_rtableid != RTABLE_ANY &&
558 			    rtable_l2(rop->rop_rtableid) != rtm->rtm_tableid)
559 				goto next;
560 			break;
561 		default:
562 			if (rop->rop_priority != 0 &&
563 			    rop->rop_priority < rtm->rtm_priority)
564 				goto next;
565 			/* check against rtable id */
566 			if (rop->rop_rtableid != RTABLE_ANY &&
567 			    rop->rop_rtableid != rtm->rtm_tableid)
568 				goto next;
569 			break;
570 		}
571 
572 		/*
573 		 * Check to see if the flush flag is set. If so, don't queue
574 		 * any more messages until the flag is cleared.
575 		 */
576 		if ((rop->rop_flags & ROUTECB_FLAG_FLUSH) != 0)
577 			goto next;
578 
579 		rtm_sendup(so, m);
580 next:
581 		sounlock(so);
582 	}
583 	SRPL_LEAVE(&sr);
584 
585 	m_freem(m);
586 }
587 
588 int
589 rtm_sendup(struct socket *so, struct mbuf *m0)
590 {
591 	struct rtpcb *rop = sotortpcb(so);
592 	struct mbuf *m;
593 
594 	soassertlocked(so);
595 
596 	m = m_copym(m0, 0, M_COPYALL, M_NOWAIT);
597 	if (m == NULL)
598 		return (ENOMEM);
599 
600 	if (sbspace(so, &so->so_rcv) < (2 * MSIZE) ||
601 	    sbappendaddr(so, &so->so_rcv, &route_src, m, NULL) == 0) {
602 		/* Flag socket as desync'ed and flush required */
603 		rop->rop_flags |= ROUTECB_FLAG_DESYNC | ROUTECB_FLAG_FLUSH;
604 		rtm_senddesync(so);
605 		m_freem(m);
606 		return (ENOBUFS);
607 	}
608 
609 	sorwakeup(so);
610 	return (0);
611 }
612 
613 struct rt_msghdr *
614 rtm_report(struct rtentry *rt, u_char type, int seq, int tableid)
615 {
616 	struct rt_msghdr	*rtm;
617 	struct rt_addrinfo	 info;
618 	struct sockaddr_rtlabel	 sa_rl;
619 	struct sockaddr_in6	 sa_mask;
620 #ifdef BFD
621 	struct sockaddr_bfd	 sa_bfd;
622 #endif
623 	struct ifnet		*ifp = NULL;
624 	int			 len;
625 
626 	bzero(&info, sizeof(info));
627 	info.rti_info[RTAX_DST] = rt_key(rt);
628 	info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
629 	info.rti_info[RTAX_NETMASK] = rt_plen2mask(rt, &sa_mask);
630 	info.rti_info[RTAX_LABEL] = rtlabel_id2sa(rt->rt_labelid, &sa_rl);
631 #ifdef BFD
632 	if (rt->rt_flags & RTF_BFD) {
633 		KERNEL_LOCK();
634 		info.rti_info[RTAX_BFD] = bfd2sa(rt, &sa_bfd);
635 		KERNEL_UNLOCK();
636 	}
637 #endif
638 #ifdef MPLS
639 	if (rt->rt_flags & RTF_MPLS) {
640 		struct sockaddr_mpls	 sa_mpls;
641 
642 		bzero(&sa_mpls, sizeof(sa_mpls));
643 		sa_mpls.smpls_family = AF_MPLS;
644 		sa_mpls.smpls_len = sizeof(sa_mpls);
645 		sa_mpls.smpls_label = ((struct rt_mpls *)
646 		    rt->rt_llinfo)->mpls_label;
647 		info.rti_info[RTAX_SRC] = (struct sockaddr *)&sa_mpls;
648 		info.rti_mpls = ((struct rt_mpls *)
649 		    rt->rt_llinfo)->mpls_operation;
650 	}
651 #endif
652 	ifp = if_get(rt->rt_ifidx);
653 	if (ifp != NULL) {
654 		info.rti_info[RTAX_IFP] = sdltosa(ifp->if_sadl);
655 		info.rti_info[RTAX_IFA] = rtable_getsource(tableid,
656 		    info.rti_info[RTAX_DST]->sa_family);
657 		if (info.rti_info[RTAX_IFA] == NULL)
658 			info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
659 		if (ifp->if_flags & IFF_POINTOPOINT)
660 			info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr;
661 	}
662 	if_put(ifp);
663 	/* RTAX_GENMASK, RTAX_AUTHOR, RTAX_SRCMASK ignored */
664 
665 	/* build new route message */
666 	len = rtm_msg2(type, RTM_VERSION, &info, NULL, NULL);
667 	rtm = malloc(len, M_RTABLE, M_WAITOK | M_ZERO);
668 
669 	rtm_msg2(type, RTM_VERSION, &info, (caddr_t)rtm, NULL);
670 	rtm->rtm_type = type;
671 	rtm->rtm_index = rt->rt_ifidx;
672 	rtm->rtm_tableid = tableid;
673 	rtm->rtm_priority = rt->rt_priority & RTP_MASK;
674 	rtm->rtm_flags = rt->rt_flags;
675 	rtm->rtm_pid = curproc->p_p->ps_pid;
676 	rtm->rtm_seq = seq;
677 	rtm_getmetrics(rt, &rtm->rtm_rmx);
678 	rtm->rtm_addrs = info.rti_addrs;
679 #ifdef MPLS
680 	rtm->rtm_mpls = info.rti_mpls;
681 #endif
682 	return rtm;
683 }
684 
/*
 * Process one routing message written to a routing socket.
 *
 * The message in ``m'' is validated, copied into a private buffer,
 * acted upon according to its rtm_type (RTM_ADD, RTM_DELETE, RTM_GET,
 * RTM_CHANGE, RTM_PROPOSAL or RTM_SOURCE; anything else is rejected)
 * and the resulting message is passed to route_input() for delivery to
 * the attached routing sockets.
 *
 * ``dstaddr'' and ``control'' are not used.  The socket lock is released
 * while the message is processed and taken again before returning.
 * Past the initial ENOBUFS check ``m'' is always disposed of: it is
 * freed on the fail path or when m_copyback() fails, and handed to
 * route_input() otherwise.
 *
 * Returns 0 or an errno value.  An error from rtm_output() does not take
 * the fail path: the request is still passed on with rtm_errno set.
 */
int
route_output(struct mbuf *m, struct socket *so, struct sockaddr *dstaddr,
    struct mbuf *control)
{
	struct rt_msghdr	*rtm = NULL;
	struct rtentry		*rt = NULL;
	struct rt_addrinfo	 info;
	struct ifnet		*ifp;
	int			 len, seq, useloopback, error = 0;
	u_int			 tableid;
	u_int8_t		 prio;
	u_char			 vers, type;

	if (m == NULL || ((m->m_len < sizeof(int32_t)) &&
	    (m = m_pullup(m, sizeof(int32_t))) == 0))
		return (ENOBUFS);
	if ((m->m_flags & M_PKTHDR) == 0)
		panic("route_output");

	/* sample the option while the socket is still locked */
	useloopback = so->so_options & SO_USELOOPBACK;

	/*
	 * The socket can't be closed concurrently because the file
	 * descriptor reference is still held.
	 */

	sounlock(so);

	/* the length in the header must match the amount of data written */
	len = m->m_pkthdr.len;
	if (len < offsetof(struct rt_msghdr, rtm_hdrlen) + 1 ||
	    len != mtod(m, struct rt_msghdr *)->rtm_msglen) {
		error = EINVAL;
		goto fail;
	}
	vers = mtod(m, struct rt_msghdr *)->rtm_version;
	switch (vers) {
	case RTM_VERSION:
		if (len < sizeof(struct rt_msghdr)) {
			error = EINVAL;
			goto fail;
		}
		if (len > RTM_MAXSIZE) {
			error = EMSGSIZE;
			goto fail;
		}
		/* work on a contiguous private copy of the message */
		rtm = malloc(len, M_RTABLE, M_WAITOK);
		m_copydata(m, 0, len, rtm);
		break;
	default:
		error = EPROTONOSUPPORT;
		goto fail;
	}

	/* Verify that the caller is sending an appropriate message early */
	switch (rtm->rtm_type) {
	case RTM_ADD:
	case RTM_DELETE:
	case RTM_GET:
	case RTM_CHANGE:
	case RTM_PROPOSAL:
	case RTM_SOURCE:
		break;
	default:
		error = EOPNOTSUPP;
		goto fail;
	}
	/*
	 * Verify that the header length is valid.
	 * All messages from userland start with a struct rt_msghdr.
	 */
	if (rtm->rtm_hdrlen == 0)	/* old client */
		rtm->rtm_hdrlen = sizeof(struct rt_msghdr);
	if (rtm->rtm_hdrlen < sizeof(struct rt_msghdr) ||
	    len < rtm->rtm_hdrlen) {
		error = EINVAL;
		goto fail;
	}

	/* the pid is always that of the sending process */
	rtm->rtm_pid = curproc->p_p->ps_pid;

	/*
	 * Verify that the caller has the appropriate privilege; RTM_GET
	 * is the only operation the non-superuser is allowed.
	 */
	if (rtm->rtm_type != RTM_GET && suser(curproc) != 0) {
		error = EACCES;
		goto fail;
	}
	/* only RTM_ADD may create a routing table that does not exist yet */
	tableid = rtm->rtm_tableid;
	if (!rtable_exists(tableid)) {
		if (rtm->rtm_type == RTM_ADD) {
			if ((error = rtable_add(tableid)) != 0)
				goto fail;
		} else {
			error = EINVAL;
			goto fail;
		}
	}

	/* Do not let userland play with kernel-only flags. */
	if ((rtm->rtm_flags & (RTF_LOCAL|RTF_BROADCAST)) != 0) {
		error = EINVAL;
		goto fail;
	}

	/* make sure that kernel-only bits are not set */
	rtm->rtm_priority &= RTP_MASK;
	rtm->rtm_flags &= ~(RTF_DONE|RTF_CLONED|RTF_CACHED);
	rtm->rtm_fmask &= RTF_FMASK;

	/*
	 * Pick the priority: an explicit one is validated and used as
	 * is; without one, anything but RTM_ADD matches any priority,
	 * and RTM_ADD uses 0 for static routes and RTP_DEFAULT otherwise.
	 */
	if (rtm->rtm_priority != 0) {
		if (rtm->rtm_priority > RTP_MAX ||
		    rtm->rtm_priority == RTP_LOCAL) {
			error = EINVAL;
			goto fail;
		}
		prio = rtm->rtm_priority;
	} else if (rtm->rtm_type != RTM_ADD)
		prio = RTP_ANY;
	else if (rtm->rtm_flags & RTF_STATIC)
		prio = 0;
	else
		prio = RTP_DEFAULT;

	/* locate the sockaddrs that follow the header */
	bzero(&info, sizeof(info));
	info.rti_addrs = rtm->rtm_addrs;
	if ((error = rtm_xaddrs(rtm->rtm_hdrlen + (caddr_t)rtm,
	    len + (caddr_t)rtm, &info)) != 0)
		goto fail;

	info.rti_flags = rtm->rtm_flags;

	/*
	 * Except for RTM_SOURCE and RTM_PROPOSAL a destination with a
	 * known address family is mandatory, a gateway must have a known
	 * family if present, and a genmask is not accepted.
	 */
	if (rtm->rtm_type != RTM_SOURCE &&
	    rtm->rtm_type != RTM_PROPOSAL &&
	    (info.rti_info[RTAX_DST] == NULL ||
	    info.rti_info[RTAX_DST]->sa_family >= AF_MAX ||
	    (info.rti_info[RTAX_GATEWAY] != NULL &&
	    info.rti_info[RTAX_GATEWAY]->sa_family >= AF_MAX) ||
	    info.rti_info[RTAX_GENMASK] != NULL)) {
		error = EINVAL;
		goto fail;
	}
#ifdef MPLS
	info.rti_mpls = rtm->rtm_mpls;
#endif

	/* a link-level gateway on a non-cloning route implies RTF_LLINFO */
	if (info.rti_info[RTAX_GATEWAY] != NULL &&
	    info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK &&
	    (info.rti_flags & RTF_CLONING) == 0) {
		info.rti_flags |= RTF_LLINFO;
	}

	/*
	 * Validate RTM_PROPOSAL and pass it along or error out.
	 */
	if (rtm->rtm_type == RTM_PROPOSAL) {
		if (rtm_validate_proposal(&info) == -1) {
			error = EINVAL;
			goto fail;
		}
		/*
		 * If this is a solicitation proposal forward request to
		 * all interfaces. Most handlers will ignore it but at least
		 * umb(4) will send a response to this event.
		 */
		if (rtm->rtm_priority == RTP_PROPOSAL_SOLICIT) {
			NET_LOCK();
			TAILQ_FOREACH(ifp, &ifnet, if_list) {
				ifp->if_rtrequest(ifp, RTM_PROPOSAL, NULL);
			}
			NET_UNLOCK();
		}
	} else if (rtm->rtm_type == RTM_SOURCE) {
		if (info.rti_info[RTAX_IFA] == NULL) {
			error = EINVAL;
			goto fail;
		}
		if ((error =
		    rt_setsource(tableid, info.rti_info[RTAX_IFA])) != 0)
			goto fail;
	} else {
		error = rtm_output(rtm, &rt, &info, prio, tableid);
		if (!error) {
			/*
			 * Replace the request by a report built from the
			 * resulting route.  ``len'' is updated as well so
			 * that the free() calls below use the size of the
			 * new buffer.
			 */
			type = rtm->rtm_type;
			seq = rtm->rtm_seq;
			free(rtm, M_RTABLE, len);
			rtm = rtm_report(rt, type, seq, tableid);
			len = rtm->rtm_msglen;
		}
	}

	rtfree(rt);
	if (error) {
		rtm->rtm_errno = error;
	} else {
		rtm->rtm_flags |= RTF_DONE;
	}

	/*
	 * Check to see if we don't want our own messages.
	 */
	if (!useloopback) {
		if (rtptable.rtp_count == 0) {
			/* no other listener and no loopback of messages */
			goto fail;
		}
	}
	/*
	 * Copy the reply over the request mbuf; if the reply is shorter
	 * the mbuf is adjusted by the (negative) difference.
	 */
	if (m_copyback(m, 0, len, rtm, M_NOWAIT)) {
		m_freem(m);
		m = NULL;
	} else if (m->m_pkthdr.len > len)
		m_adj(m, len - m->m_pkthdr.len);
	free(rtm, M_RTABLE, len);
	/* route_input() takes care of freeing the mbuf */
	if (m)
		route_input(m, so, info.rti_info[RTAX_DST] ?
		    info.rti_info[RTAX_DST]->sa_family : AF_UNSPEC);
	solock(so);

	return (error);
fail:
	/* rtm may still be NULL here if the failure happened early */
	free(rtm, M_RTABLE, len);
	m_freem(m);
	solock(so);

	return (error);
}
911 
912 int
913 rtm_output(struct rt_msghdr *rtm, struct rtentry **prt,
914     struct rt_addrinfo *info, uint8_t prio, unsigned int tableid)
915 {
916 	struct rtentry		*rt = *prt;
917 	struct ifnet		*ifp = NULL;
918 	int			 plen, newgate = 0, error = 0;
919 
920 	switch (rtm->rtm_type) {
921 	case RTM_ADD:
922 		if (info->rti_info[RTAX_GATEWAY] == NULL) {
923 			error = EINVAL;
924 			break;
925 		}
926 
927 		rt = rtable_match(tableid, info->rti_info[RTAX_DST], NULL);
928 		if ((error = route_arp_conflict(rt, info))) {
929 			rtfree(rt);
930 			rt = NULL;
931 			break;
932 		}
933 
934 		/*
935 		 * We cannot go through a delete/create/insert cycle for
936 		 * cached route because this can lead to races in the
937 		 * receive path.  Instead we update the L2 cache.
938 		 */
939 		if ((rt != NULL) && ISSET(rt->rt_flags, RTF_CACHED)) {
940 			ifp = if_get(rt->rt_ifidx);
941 			if (ifp == NULL) {
942 				rtfree(rt);
943 				rt = NULL;
944 				error = ESRCH;
945 				break;
946 			}
947 
948 			goto change;
949 		}
950 
951 		rtfree(rt);
952 		rt = NULL;
953 
954 		NET_LOCK();
955 		if ((error = rtm_getifa(info, tableid)) != 0) {
956 			NET_UNLOCK();
957 			break;
958 		}
959 		error = rtrequest(RTM_ADD, info, prio, &rt, tableid);
960 		NET_UNLOCK();
961 		if (error == 0)
962 			rtm_setmetrics(rtm->rtm_inits, &rtm->rtm_rmx,
963 			    &rt->rt_rmx);
964 		break;
965 	case RTM_DELETE:
966 		rt = rtable_lookup(tableid, info->rti_info[RTAX_DST],
967 		    info->rti_info[RTAX_NETMASK], info->rti_info[RTAX_GATEWAY],
968 		    prio);
969 		if (rt == NULL) {
970 			error = ESRCH;
971 			break;
972 		}
973 
974 		/*
975 		 * If we got multipath routes, we require users to specify
976 		 * a matching gateway.
977 		 */
978 		if (ISSET(rt->rt_flags, RTF_MPATH) &&
979 		    info->rti_info[RTAX_GATEWAY] == NULL) {
980 			error = ESRCH;
981 			break;
982 		}
983 
984 		ifp = if_get(rt->rt_ifidx);
985 		if (ifp == NULL) {
986 			rtfree(rt);
987 			rt = NULL;
988 			error = ESRCH;
989 			break;
990 		}
991 
992 		/*
993 		 * Invalidate the cache of automagically created and
994 		 * referenced L2 entries to make sure that ``rt_gwroute''
995 		 * pointer stays valid for other CPUs.
996 		 */
997 		if ((ISSET(rt->rt_flags, RTF_CACHED))) {
998 			NET_LOCK();
999 			ifp->if_rtrequest(ifp, RTM_INVALIDATE, rt);
1000 			/* Reset the MTU of the gateway route. */
1001 			rtable_walk(tableid, rt_key(rt)->sa_family, NULL,
1002 			    route_cleargateway, rt);
1003 			NET_UNLOCK();
1004 			break;
1005 		}
1006 
1007 		/*
1008 		 * Make sure that local routes are only modified by the
1009 		 * kernel.
1010 		 */
1011 		if (ISSET(rt->rt_flags, RTF_LOCAL|RTF_BROADCAST)) {
1012 			error = EINVAL;
1013 			break;
1014 		}
1015 
1016 		rtfree(rt);
1017 		rt = NULL;
1018 
1019 		NET_LOCK();
1020 		error = rtrequest_delete(info, prio, ifp, &rt, tableid);
1021 		NET_UNLOCK();
1022 		break;
1023 	case RTM_CHANGE:
1024 		rt = rtable_lookup(tableid, info->rti_info[RTAX_DST],
1025 		    info->rti_info[RTAX_NETMASK], info->rti_info[RTAX_GATEWAY],
1026 		    prio);
1027 		/*
1028 		 * If we got multipath routes, we require users to specify
1029 		 * a matching gateway.
1030 		 */
1031 		if ((rt != NULL) && ISSET(rt->rt_flags, RTF_MPATH) &&
1032 		    (info->rti_info[RTAX_GATEWAY] == NULL)) {
1033 			rtfree(rt);
1034 			rt = NULL;
1035 		}
1036 
1037 		/*
1038 		 * If RTAX_GATEWAY is the argument we're trying to
1039 		 * change, try to find a compatible route.
1040 		 */
1041 		if ((rt == NULL) && (info->rti_info[RTAX_GATEWAY] != NULL)) {
1042 			rt = rtable_lookup(tableid, info->rti_info[RTAX_DST],
1043 			    info->rti_info[RTAX_NETMASK], NULL, prio);
1044 			/* Ensure we don't pick a multipath one. */
1045 			if ((rt != NULL) && ISSET(rt->rt_flags, RTF_MPATH)) {
1046 				rtfree(rt);
1047 				rt = NULL;
1048 			}
1049 		}
1050 
1051 		if (rt == NULL) {
1052 			error = ESRCH;
1053 			break;
1054 		}
1055 
1056 		/*
1057 		 * Make sure that local routes are only modified by the
1058 		 * kernel.
1059 		 */
1060 		if (ISSET(rt->rt_flags, RTF_LOCAL|RTF_BROADCAST)) {
1061 			error = EINVAL;
1062 			break;
1063 		}
1064 
1065 		ifp = if_get(rt->rt_ifidx);
1066 		if (ifp == NULL) {
1067 			rtfree(rt);
1068 			rt = NULL;
1069 			error = ESRCH;
1070 			break;
1071 		}
1072 
1073 		/*
1074 		 * RTM_CHANGE needs a perfect match.
1075 		 */
1076 		plen = rtable_satoplen(info->rti_info[RTAX_DST]->sa_family,
1077 		    info->rti_info[RTAX_NETMASK]);
1078 		if (rt_plen(rt) != plen) {
1079 			error = ESRCH;
1080 			break;
1081 		}
1082 
1083 		if (info->rti_info[RTAX_GATEWAY] != NULL)
1084 			if (rt->rt_gateway == NULL ||
1085 			    bcmp(rt->rt_gateway,
1086 			    info->rti_info[RTAX_GATEWAY],
1087 			    info->rti_info[RTAX_GATEWAY]->sa_len)) {
1088 				newgate = 1;
1089 			}
1090 		/*
1091 		 * Check reachable gateway before changing the route.
1092 		 * New gateway could require new ifaddr, ifp;
1093 		 * flags may also be different; ifp may be specified
1094 		 * by ll sockaddr when protocol address is ambiguous.
1095 		 */
1096 		if (newgate || info->rti_info[RTAX_IFP] != NULL ||
1097 		    info->rti_info[RTAX_IFA] != NULL) {
1098 			struct ifaddr	*ifa = NULL;
1099 
1100 			NET_LOCK();
1101 			if ((error = rtm_getifa(info, tableid)) != 0) {
1102 				NET_UNLOCK();
1103 				break;
1104 			}
1105 			ifa = info->rti_ifa;
1106 			if (rt->rt_ifa != ifa) {
1107 				ifp->if_rtrequest(ifp, RTM_DELETE, rt);
1108 				ifafree(rt->rt_ifa);
1109 
1110 				ifa->ifa_refcnt++;
1111 				rt->rt_ifa = ifa;
1112 				rt->rt_ifidx = ifa->ifa_ifp->if_index;
1113 				/* recheck link state after ifp change */
1114 				rt_if_linkstate_change(rt, ifa->ifa_ifp,
1115 				    tableid);
1116 			}
1117 			NET_UNLOCK();
1118 		}
1119 change:
1120 		if (info->rti_info[RTAX_GATEWAY] != NULL) {
1121 			/* When updating the gateway, make sure it is valid. */
1122 			if (!newgate && rt->rt_gateway->sa_family !=
1123 			    info->rti_info[RTAX_GATEWAY]->sa_family) {
1124 				error = EINVAL;
1125 				break;
1126 			}
1127 
1128 			NET_LOCK();
1129 			error = rt_setgate(rt,
1130 			    info->rti_info[RTAX_GATEWAY], tableid);
1131 			NET_UNLOCK();
1132 			if (error)
1133 				break;
1134 		}
1135 #ifdef MPLS
1136 		if (rtm->rtm_flags & RTF_MPLS) {
1137 			NET_LOCK();
1138 			error = rt_mpls_set(rt,
1139 			    info->rti_info[RTAX_SRC], info->rti_mpls);
1140 			NET_UNLOCK();
1141 			if (error)
1142 				break;
1143 		} else if (newgate || (rtm->rtm_fmask & RTF_MPLS)) {
1144 			NET_LOCK();
1145 			/* if gateway changed remove MPLS information */
1146 			rt_mpls_clear(rt);
1147 			NET_UNLOCK();
1148 		}
1149 #endif
1150 
1151 #ifdef BFD
1152 		if (ISSET(rtm->rtm_flags, RTF_BFD)) {
1153 			KERNEL_LOCK();
1154 			error = bfdset(rt);
1155 			KERNEL_UNLOCK();
1156 			if (error)
1157 				break;
1158 		} else if (!ISSET(rtm->rtm_flags, RTF_BFD) &&
1159 		    ISSET(rtm->rtm_fmask, RTF_BFD)) {
1160 			KERNEL_LOCK();
1161 			bfdclear(rt);
1162 			KERNEL_UNLOCK();
1163 		}
1164 #endif
1165 
1166 		NET_LOCK();
1167 		/* Hack to allow some flags to be toggled */
1168 		if (rtm->rtm_fmask) {
1169 			/* MPLS flag it is set by rt_mpls_set() */
1170 			rtm->rtm_fmask &= ~RTF_MPLS;
1171 			rtm->rtm_flags &= ~RTF_MPLS;
1172 			rt->rt_flags =
1173 			    (rt->rt_flags & ~rtm->rtm_fmask) |
1174 			    (rtm->rtm_flags & rtm->rtm_fmask);
1175 		}
1176 		rtm_setmetrics(rtm->rtm_inits, &rtm->rtm_rmx, &rt->rt_rmx);
1177 
1178 		ifp->if_rtrequest(ifp, RTM_ADD, rt);
1179 
1180 		if (info->rti_info[RTAX_LABEL] != NULL) {
1181 			char *rtlabel = ((struct sockaddr_rtlabel *)
1182 			    info->rti_info[RTAX_LABEL])->sr_label;
1183 			rtlabel_unref(rt->rt_labelid);
1184 			rt->rt_labelid = rtlabel_name2id(rtlabel);
1185 		}
1186 		if_group_routechange(info->rti_info[RTAX_DST],
1187 		    info->rti_info[RTAX_NETMASK]);
1188 		rt->rt_locks &= ~(rtm->rtm_inits);
1189 		rt->rt_locks |= (rtm->rtm_inits & rtm->rtm_rmx.rmx_locks);
1190 		NET_UNLOCK();
1191 		break;
1192 	case RTM_GET:
1193 		rt = rtable_lookup(tableid, info->rti_info[RTAX_DST],
1194 		    info->rti_info[RTAX_NETMASK], info->rti_info[RTAX_GATEWAY],
1195 		    prio);
1196 		if (rt == NULL)
1197 			error = ESRCH;
1198 		break;
1199 	}
1200 
1201 	if_put(ifp);
1202 	*prt = rt;
1203 	return (error);
1204 }
1205 
1206 struct ifaddr *
1207 ifa_ifwithroute(int flags, struct sockaddr *dst, struct sockaddr *gateway,
1208     unsigned int rtableid)
1209 {
1210 	struct ifaddr	*ifa;
1211 
1212 	if ((flags & RTF_GATEWAY) == 0) {
1213 		/*
1214 		 * If we are adding a route to an interface,
1215 		 * and the interface is a pt to pt link
1216 		 * we should search for the destination
1217 		 * as our clue to the interface.  Otherwise
1218 		 * we can use the local address.
1219 		 */
1220 		ifa = NULL;
1221 		if (flags & RTF_HOST)
1222 			ifa = ifa_ifwithdstaddr(dst, rtableid);
1223 		if (ifa == NULL)
1224 			ifa = ifa_ifwithaddr(gateway, rtableid);
1225 	} else {
1226 		/*
1227 		 * If we are adding a route to a remote net
1228 		 * or host, the gateway may still be on the
1229 		 * other end of a pt to pt link.
1230 		 */
1231 		ifa = ifa_ifwithdstaddr(gateway, rtableid);
1232 	}
1233 	if (ifa == NULL) {
1234 		if (gateway->sa_family == AF_LINK) {
1235 			struct sockaddr_dl *sdl = satosdl(gateway);
1236 			struct ifnet *ifp = if_get(sdl->sdl_index);
1237 
1238 			if (ifp != NULL)
1239 				ifa = ifaof_ifpforaddr(dst, ifp);
1240 			if_put(ifp);
1241 		} else {
1242 			struct rtentry *rt;
1243 
1244 			rt = rtalloc(gateway, RT_RESOLVE, rtable_l2(rtableid));
1245 			if (rt != NULL)
1246 				ifa = rt->rt_ifa;
1247 			rtfree(rt);
1248 		}
1249 	}
1250 	if (ifa == NULL)
1251 		return (NULL);
1252 	if (ifa->ifa_addr->sa_family != dst->sa_family) {
1253 		struct ifaddr	*oifa = ifa;
1254 		ifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp);
1255 		if (ifa == NULL)
1256 			ifa = oifa;
1257 	}
1258 	return (ifa);
1259 }
1260 
1261 int
1262 rtm_getifa(struct rt_addrinfo *info, unsigned int rtid)
1263 {
1264 	struct ifnet	*ifp = NULL;
1265 
1266 	/*
1267 	 * The "returned" `ifa' is guaranteed to be alive only if
1268 	 * the NET_LOCK() is held.
1269 	 */
1270 	NET_ASSERT_LOCKED();
1271 
1272 	/*
1273 	 * ifp may be specified by sockaddr_dl when protocol address
1274 	 * is ambiguous
1275 	 */
1276 	if (info->rti_info[RTAX_IFP] != NULL) {
1277 		struct sockaddr_dl *sdl;
1278 
1279 		sdl = satosdl(info->rti_info[RTAX_IFP]);
1280 		ifp = if_get(sdl->sdl_index);
1281 	}
1282 
1283 #ifdef IPSEC
1284 	/*
1285 	 * If the destination is a PF_KEY address, we'll look
1286 	 * for the existence of a encap interface number or address
1287 	 * in the options list of the gateway. By default, we'll return
1288 	 * enc0.
1289 	 */
1290 	if (info->rti_info[RTAX_DST] &&
1291 	    info->rti_info[RTAX_DST]->sa_family == PF_KEY)
1292 		info->rti_ifa = enc_getifa(rtid, 0);
1293 #endif
1294 
1295 	if (info->rti_ifa == NULL && info->rti_info[RTAX_IFA] != NULL)
1296 		info->rti_ifa = ifa_ifwithaddr(info->rti_info[RTAX_IFA], rtid);
1297 
1298 	if (info->rti_ifa == NULL) {
1299 		struct sockaddr	*sa;
1300 
1301 		if ((sa = info->rti_info[RTAX_IFA]) == NULL)
1302 			if ((sa = info->rti_info[RTAX_GATEWAY]) == NULL)
1303 				sa = info->rti_info[RTAX_DST];
1304 
1305 		if (sa != NULL && ifp != NULL)
1306 			info->rti_ifa = ifaof_ifpforaddr(sa, ifp);
1307 		else if (info->rti_info[RTAX_DST] != NULL &&
1308 		    info->rti_info[RTAX_GATEWAY] != NULL)
1309 			info->rti_ifa = ifa_ifwithroute(info->rti_flags,
1310 			    info->rti_info[RTAX_DST],
1311 			    info->rti_info[RTAX_GATEWAY],
1312 			    rtid);
1313 		else if (sa != NULL)
1314 			info->rti_ifa = ifa_ifwithroute(info->rti_flags,
1315 			    sa, sa, rtid);
1316 	}
1317 
1318 	if_put(ifp);
1319 
1320 	if (info->rti_ifa == NULL)
1321 		return (ENETUNREACH);
1322 
1323 	return (0);
1324 }
1325 
1326 int
1327 route_cleargateway(struct rtentry *rt, void *arg, unsigned int rtableid)
1328 {
1329 	struct rtentry *nhrt = arg;
1330 
1331 	if (ISSET(rt->rt_flags, RTF_GATEWAY) && rt->rt_gwroute == nhrt &&
1332 	    !ISSET(rt->rt_locks, RTV_MTU))
1333 		rt->rt_mtu = 0;
1334 
1335 	return (0);
1336 }
1337 
1338 /*
1339  * Check if the user request to insert an ARP entry does not conflict
1340  * with existing ones.
1341  *
1342  * Only two entries are allowed for a given IP address: a private one
1343  * (priv) and a public one (pub).
1344  */
1345 int
1346 route_arp_conflict(struct rtentry *rt, struct rt_addrinfo *info)
1347 {
1348 	int		 proxy = (info->rti_flags & RTF_ANNOUNCE);
1349 
1350 	if ((info->rti_flags & RTF_LLINFO) == 0 ||
1351 	    (info->rti_info[RTAX_DST]->sa_family != AF_INET))
1352 		return (0);
1353 
1354 	if (rt == NULL || !ISSET(rt->rt_flags, RTF_LLINFO))
1355 		return (0);
1356 
1357 	/* If the entry is cached, it can be updated. */
1358 	if (ISSET(rt->rt_flags, RTF_CACHED))
1359 		return (0);
1360 
1361 	/*
1362 	 * Same destination, not cached and both "priv" or "pub" conflict.
1363 	 * If a second entry exists, it always conflict.
1364 	 */
1365 	if ((ISSET(rt->rt_flags, RTF_ANNOUNCE) == proxy) ||
1366 	    ISSET(rt->rt_flags, RTF_MPATH))
1367 		return (EEXIST);
1368 
1369 	/* No conflict but an entry exist so we need to force mpath. */
1370 	info->rti_flags |= RTF_MPATH;
1371 	return (0);
1372 }
1373 
1374 void
1375 rtm_setmetrics(u_long which, const struct rt_metrics *in,
1376     struct rt_kmetrics *out)
1377 {
1378 	int64_t expire;
1379 
1380 	if (which & RTV_MTU)
1381 		out->rmx_mtu = in->rmx_mtu;
1382 	if (which & RTV_EXPIRE) {
1383 		expire = in->rmx_expire;
1384 		if (expire != 0) {
1385 			expire -= gettime();
1386 			expire += getuptime();
1387 		}
1388 
1389 		out->rmx_expire = expire;
1390 	}
1391 }
1392 
1393 void
1394 rtm_getmetrics(const struct rtentry *rt, struct rt_metrics *out)
1395 {
1396 	const struct rt_kmetrics *in = &rt->rt_rmx;
1397 	int64_t expire;
1398 
1399 	expire = in->rmx_expire;
1400 	if (expire == 0)
1401 		expire = rt_timer_get_expire(rt);
1402 	if (expire != 0) {
1403 		expire -= getuptime();
1404 		expire += gettime();
1405 	}
1406 
1407 	bzero(out, sizeof(*out));
1408 	out->rmx_locks = in->rmx_locks;
1409 	out->rmx_mtu = in->rmx_mtu;
1410 	out->rmx_expire = expire;
1411 	out->rmx_pksent = in->rmx_pksent;
1412 }
1413 
/*
 * ROUNDUP(a): round a positive length up to a multiple of sizeof(long);
 * anything not greater than zero yields sizeof(long).
 * ADVANCE(x, n): move `x' past the sockaddr `n', padding included.
 */
#define ROUNDUP(a)							\
	((a) > 0 ?							\
	    (((a) + (sizeof(long) - 1)) & ~(sizeof(long) - 1)) :	\
	    sizeof(long))
#define ADVANCE(x, n) ((x) += ROUNDUP((n)->sa_len))
1417 
1418 int
1419 rtm_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo)
1420 {
1421 	struct sockaddr	*sa;
1422 	int		 i;
1423 
1424 	/*
1425 	 * Parse address bits, split address storage in chunks, and
1426 	 * set info pointers.  Use sa_len for traversing the memory
1427 	 * and check that we stay within in the limit.
1428 	 */
1429 	bzero(rtinfo->rti_info, sizeof(rtinfo->rti_info));
1430 	for (i = 0; i < sizeof(rtinfo->rti_addrs) * 8; i++) {
1431 		if ((rtinfo->rti_addrs & (1U << i)) == 0)
1432 			continue;
1433 		if (i >= RTAX_MAX || cp + sizeof(socklen_t) > cplim)
1434 			return (EINVAL);
1435 		sa = (struct sockaddr *)cp;
1436 		if (cp + sa->sa_len > cplim)
1437 			return (EINVAL);
1438 		rtinfo->rti_info[i] = sa;
1439 		ADVANCE(cp, sa);
1440 	}
1441 	/*
1442 	 * Check that the address family is suitable for the route address
1443 	 * type.  Check that each address has a size that fits its family
1444 	 * and its length is within the size.  Strings within addresses must
1445 	 * be NUL terminated.
1446 	 */
1447 	for (i = 0; i < RTAX_MAX; i++) {
1448 		size_t len, maxlen, size;
1449 
1450 		sa = rtinfo->rti_info[i];
1451 		if (sa == NULL)
1452 			continue;
1453 		maxlen = size = 0;
1454 		switch (i) {
1455 		case RTAX_DST:
1456 		case RTAX_GATEWAY:
1457 		case RTAX_SRC:
1458 			switch (sa->sa_family) {
1459 			case AF_INET:
1460 				size = sizeof(struct sockaddr_in);
1461 				break;
1462 			case AF_LINK:
1463 				size = sizeof(struct sockaddr_dl);
1464 				break;
1465 #ifdef INET6
1466 			case AF_INET6:
1467 				size = sizeof(struct sockaddr_in6);
1468 				break;
1469 #endif
1470 #ifdef MPLS
1471 			case AF_MPLS:
1472 				size = sizeof(struct sockaddr_mpls);
1473 				break;
1474 #endif
1475 			}
1476 			break;
1477 		case RTAX_IFP:
1478 			if (sa->sa_family != AF_LINK)
1479 				return (EAFNOSUPPORT);
1480 			/*
1481 			 * XXX Should be sizeof(struct sockaddr_dl), but
1482 			 * route(8) has a bug and provides less memory.
1483 			 * arp(8) has another bug and uses sizeof pointer.
1484 			 */
1485 			size = 4;
1486 			break;
1487 		case RTAX_IFA:
1488 			switch (sa->sa_family) {
1489 			case AF_INET:
1490 				size = sizeof(struct sockaddr_in);
1491 				break;
1492 #ifdef INET6
1493 			case AF_INET6:
1494 				size = sizeof(struct sockaddr_in6);
1495 				break;
1496 #endif
1497 			default:
1498 				return (EAFNOSUPPORT);
1499 			}
1500 			break;
1501 		case RTAX_LABEL:
1502 			sa->sa_family = AF_UNSPEC;
1503 			maxlen = RTLABEL_LEN;
1504 			size = sizeof(struct sockaddr_rtlabel);
1505 			break;
1506 #ifdef BFD
1507 		case RTAX_BFD:
1508 			sa->sa_family = AF_UNSPEC;
1509 			size = sizeof(struct sockaddr_bfd);
1510 			break;
1511 #endif
1512 		case RTAX_DNS:
1513 			/* more validation in rtm_validate_proposal */
1514 			if (sa->sa_len > sizeof(struct sockaddr_rtdns))
1515 				return (EINVAL);
1516 			if (sa->sa_len < offsetof(struct sockaddr_rtdns,
1517 			    sr_dns))
1518 				return (EINVAL);
1519 			switch (sa->sa_family) {
1520 			case AF_INET:
1521 #ifdef INET6
1522 			case AF_INET6:
1523 #endif
1524 				break;
1525 			default:
1526 				return (EAFNOSUPPORT);
1527 			}
1528 			break;
1529 		case RTAX_STATIC:
1530 			sa->sa_family = AF_UNSPEC;
1531 			maxlen = RTSTATIC_LEN;
1532 			size = sizeof(struct sockaddr_rtstatic);
1533 			break;
1534 		case RTAX_SEARCH:
1535 			sa->sa_family = AF_UNSPEC;
1536 			maxlen = RTSEARCH_LEN;
1537 			size = sizeof(struct sockaddr_rtsearch);
1538 			break;
1539 		}
1540 		if (size) {
1541 			/* memory for the full struct must be provided */
1542 			if (sa->sa_len < size)
1543 				return (EINVAL);
1544 		}
1545 		if (maxlen) {
1546 			/* this should not happen */
1547 			if (2 + maxlen > size)
1548 				return (EINVAL);
1549 			/* strings must be NUL terminated within the struct */
1550 			len = strnlen(sa->sa_data, maxlen);
1551 			if (len >= maxlen || 2 + len >= sa->sa_len)
1552 				return (EINVAL);
1553 			break;
1554 		}
1555 	}
1556 	return (0);
1557 }
1558 
1559 struct mbuf *
1560 rtm_msg1(int type, struct rt_addrinfo *rtinfo)
1561 {
1562 	struct rt_msghdr	*rtm;
1563 	struct mbuf		*m;
1564 	int			 i;
1565 	struct sockaddr		*sa;
1566 	int			 len, dlen, hlen;
1567 
1568 	switch (type) {
1569 	case RTM_DELADDR:
1570 	case RTM_NEWADDR:
1571 		hlen = sizeof(struct ifa_msghdr);
1572 		break;
1573 	case RTM_IFINFO:
1574 		hlen = sizeof(struct if_msghdr);
1575 		break;
1576 	case RTM_IFANNOUNCE:
1577 		hlen = sizeof(struct if_announcemsghdr);
1578 		break;
1579 #ifdef BFD
1580 	case RTM_BFD:
1581 		hlen = sizeof(struct bfd_msghdr);
1582 		break;
1583 #endif
1584 	case RTM_80211INFO:
1585 		hlen = sizeof(struct if_ieee80211_msghdr);
1586 		break;
1587 	default:
1588 		hlen = sizeof(struct rt_msghdr);
1589 		break;
1590 	}
1591 	len = hlen;
1592 	for (i = 0; i < RTAX_MAX; i++) {
1593 		if (rtinfo == NULL || (sa = rtinfo->rti_info[i]) == NULL)
1594 			continue;
1595 		len += ROUNDUP(sa->sa_len);
1596 	}
1597 	if (len > MCLBYTES)
1598 		panic("rtm_msg1");
1599 	m = m_gethdr(M_DONTWAIT, MT_DATA);
1600 	if (m && len > MHLEN) {
1601 		MCLGET(m, M_DONTWAIT);
1602 		if ((m->m_flags & M_EXT) == 0) {
1603 			m_free(m);
1604 			m = NULL;
1605 		}
1606 	}
1607 	if (m == NULL)
1608 		return (m);
1609 	m->m_pkthdr.len = m->m_len = len;
1610 	m->m_pkthdr.ph_ifidx = 0;
1611 	rtm = mtod(m, struct rt_msghdr *);
1612 	bzero(rtm, len);
1613 	len = hlen;
1614 	for (i = 0; i < RTAX_MAX; i++) {
1615 		if (rtinfo == NULL || (sa = rtinfo->rti_info[i]) == NULL)
1616 			continue;
1617 		rtinfo->rti_addrs |= (1U << i);
1618 		dlen = ROUNDUP(sa->sa_len);
1619 		if (m_copyback(m, len, sa->sa_len, sa, M_NOWAIT)) {
1620 			m_freem(m);
1621 			return (NULL);
1622 		}
1623 		len += dlen;
1624 	}
1625 	rtm->rtm_msglen = len;
1626 	rtm->rtm_hdrlen = hlen;
1627 	rtm->rtm_version = RTM_VERSION;
1628 	rtm->rtm_type = type;
1629 	return (m);
1630 }
1631 
1632 int
1633 rtm_msg2(int type, int vers, struct rt_addrinfo *rtinfo, caddr_t cp,
1634     struct walkarg *w)
1635 {
1636 	int		i;
1637 	int		len, dlen, hlen, second_time = 0;
1638 	caddr_t		cp0;
1639 
1640 	rtinfo->rti_addrs = 0;
1641 again:
1642 	switch (type) {
1643 	case RTM_DELADDR:
1644 	case RTM_NEWADDR:
1645 		len = sizeof(struct ifa_msghdr);
1646 		break;
1647 	case RTM_IFINFO:
1648 		len = sizeof(struct if_msghdr);
1649 		break;
1650 	default:
1651 		len = sizeof(struct rt_msghdr);
1652 		break;
1653 	}
1654 	hlen = len;
1655 	if ((cp0 = cp) != NULL)
1656 		cp += len;
1657 	for (i = 0; i < RTAX_MAX; i++) {
1658 		struct sockaddr *sa;
1659 
1660 		if ((sa = rtinfo->rti_info[i]) == NULL)
1661 			continue;
1662 		rtinfo->rti_addrs |= (1U << i);
1663 		dlen = ROUNDUP(sa->sa_len);
1664 		if (cp) {
1665 			bcopy(sa, cp, sa->sa_len);
1666 			bzero(cp + sa->sa_len, dlen - sa->sa_len);
1667 			cp += dlen;
1668 		}
1669 		len += dlen;
1670 	}
1671 	/* align message length to the next natural boundary */
1672 	len = ALIGN(len);
1673 	if (cp == 0 && w != NULL && !second_time) {
1674 		w->w_needed += len;
1675 		if (w->w_needed <= w->w_given && w->w_where) {
1676 			if (w->w_tmemsize < len) {
1677 				free(w->w_tmem, M_RTABLE, w->w_tmemsize);
1678 				w->w_tmem = malloc(len, M_RTABLE,
1679 				    M_NOWAIT | M_ZERO);
1680 				if (w->w_tmem)
1681 					w->w_tmemsize = len;
1682 			}
1683 			if (w->w_tmem) {
1684 				cp = w->w_tmem;
1685 				second_time = 1;
1686 				goto again;
1687 			} else
1688 				w->w_where = 0;
1689 		}
1690 	}
1691 	if (cp && w)		/* clear the message header */
1692 		bzero(cp0, hlen);
1693 
1694 	if (cp) {
1695 		struct rt_msghdr *rtm = (struct rt_msghdr *)cp0;
1696 
1697 		rtm->rtm_version = RTM_VERSION;
1698 		rtm->rtm_type = type;
1699 		rtm->rtm_msglen = len;
1700 		rtm->rtm_hdrlen = hlen;
1701 	}
1702 	return (len);
1703 }
1704 
1705 void
1706 rtm_send(struct rtentry *rt, int cmd, int error, unsigned int rtableid)
1707 {
1708 	struct rt_addrinfo	 info;
1709 	struct ifnet		*ifp;
1710 	struct sockaddr_rtlabel	 sa_rl;
1711 	struct sockaddr_in6	 sa_mask;
1712 
1713 	memset(&info, 0, sizeof(info));
1714 	info.rti_info[RTAX_DST] = rt_key(rt);
1715 	info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
1716 	if (!ISSET(rt->rt_flags, RTF_HOST))
1717 		info.rti_info[RTAX_NETMASK] = rt_plen2mask(rt, &sa_mask);
1718 	info.rti_info[RTAX_LABEL] = rtlabel_id2sa(rt->rt_labelid, &sa_rl);
1719 	ifp = if_get(rt->rt_ifidx);
1720 	if (ifp != NULL) {
1721 		info.rti_info[RTAX_IFP] = sdltosa(ifp->if_sadl);
1722 		info.rti_info[RTAX_IFA] = rtable_getsource(rtableid,
1723 		    info.rti_info[RTAX_DST]->sa_family);
1724 		if (info.rti_info[RTAX_IFA] == NULL)
1725 			info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
1726 	}
1727 
1728 	rtm_miss(cmd, &info, rt->rt_flags, rt->rt_priority, rt->rt_ifidx, error,
1729 	    rtableid);
1730 	if_put(ifp);
1731 }
1732 
1733 /*
1734  * This routine is called to generate a message from the routing
1735  * socket indicating that a redirect has occurred, a routing lookup
1736  * has failed, or that a protocol has detected timeouts to a particular
1737  * destination.
1738  */
1739 void
1740 rtm_miss(int type, struct rt_addrinfo *rtinfo, int flags, uint8_t prio,
1741     u_int ifidx, int error, u_int tableid)
1742 {
1743 	struct rt_msghdr	*rtm;
1744 	struct mbuf		*m;
1745 	struct sockaddr		*sa = rtinfo->rti_info[RTAX_DST];
1746 
1747 	if (rtptable.rtp_count == 0)
1748 		return;
1749 	m = rtm_msg1(type, rtinfo);
1750 	if (m == NULL)
1751 		return;
1752 	rtm = mtod(m, struct rt_msghdr *);
1753 	rtm->rtm_flags = RTF_DONE | flags;
1754 	rtm->rtm_priority = prio;
1755 	rtm->rtm_errno = error;
1756 	rtm->rtm_tableid = tableid;
1757 	rtm->rtm_addrs = rtinfo->rti_addrs;
1758 	rtm->rtm_index = ifidx;
1759 	route_input(m, NULL, sa ? sa->sa_family : AF_UNSPEC);
1760 }
1761 
1762 /*
1763  * This routine is called to generate a message from the routing
1764  * socket indicating that the status of a network interface has changed.
1765  */
1766 void
1767 rtm_ifchg(struct ifnet *ifp)
1768 {
1769 	struct rt_addrinfo	 info;
1770 	struct if_msghdr	*ifm;
1771 	struct mbuf		*m;
1772 
1773 	if (rtptable.rtp_count == 0)
1774 		return;
1775 	memset(&info, 0, sizeof(info));
1776 	info.rti_info[RTAX_IFP] = sdltosa(ifp->if_sadl);
1777 	m = rtm_msg1(RTM_IFINFO, &info);
1778 	if (m == NULL)
1779 		return;
1780 	ifm = mtod(m, struct if_msghdr *);
1781 	ifm->ifm_index = ifp->if_index;
1782 	ifm->ifm_tableid = ifp->if_rdomain;
1783 	ifm->ifm_flags = ifp->if_flags;
1784 	ifm->ifm_xflags = ifp->if_xflags;
1785 	if_getdata(ifp, &ifm->ifm_data);
1786 	ifm->ifm_addrs = info.rti_addrs;
1787 	route_input(m, NULL, AF_UNSPEC);
1788 }
1789 
1790 /*
1791  * This is called to generate messages from the routing socket
1792  * indicating a network interface has had addresses associated with it.
1793  * if we ever reverse the logic and replace messages TO the routing
1794  * socket indicate a request to configure interfaces, then it will
1795  * be unnecessary as the routing socket will automatically generate
1796  * copies of it.
1797  */
1798 void
1799 rtm_addr(int cmd, struct ifaddr *ifa)
1800 {
1801 	struct ifnet		*ifp = ifa->ifa_ifp;
1802 	struct mbuf		*m;
1803 	struct rt_addrinfo	 info;
1804 	struct ifa_msghdr	*ifam;
1805 
1806 	if (rtptable.rtp_count == 0)
1807 		return;
1808 
1809 	memset(&info, 0, sizeof(info));
1810 	info.rti_info[RTAX_IFA] = ifa->ifa_addr;
1811 	info.rti_info[RTAX_IFP] = sdltosa(ifp->if_sadl);
1812 	info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask;
1813 	info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
1814 	if ((m = rtm_msg1(cmd, &info)) == NULL)
1815 		return;
1816 	ifam = mtod(m, struct ifa_msghdr *);
1817 	ifam->ifam_index = ifp->if_index;
1818 	ifam->ifam_metric = ifa->ifa_metric;
1819 	ifam->ifam_flags = ifa->ifa_flags;
1820 	ifam->ifam_addrs = info.rti_addrs;
1821 	ifam->ifam_tableid = ifp->if_rdomain;
1822 
1823 	route_input(m, NULL,
1824 	    ifa->ifa_addr ? ifa->ifa_addr->sa_family : AF_UNSPEC);
1825 }
1826 
1827 /*
1828  * This is called to generate routing socket messages indicating
1829  * network interface arrival and departure.
1830  */
1831 void
1832 rtm_ifannounce(struct ifnet *ifp, int what)
1833 {
1834 	struct if_announcemsghdr	*ifan;
1835 	struct mbuf			*m;
1836 
1837 	if (rtptable.rtp_count == 0)
1838 		return;
1839 	m = rtm_msg1(RTM_IFANNOUNCE, NULL);
1840 	if (m == NULL)
1841 		return;
1842 	ifan = mtod(m, struct if_announcemsghdr *);
1843 	ifan->ifan_index = ifp->if_index;
1844 	strlcpy(ifan->ifan_name, ifp->if_xname, sizeof(ifan->ifan_name));
1845 	ifan->ifan_what = what;
1846 	route_input(m, NULL, AF_UNSPEC);
1847 }
1848 
#ifdef BFD
/*
 * Generate an RTM_BFD routing socket message indicating the state of
 * the BFD session `bfd'.  The message carries the destination and the
 * interface address of the session's route.
 */
void
rtm_bfd(struct bfd_config *bfd)
{
	struct rtentry		*rt = bfd->bc_rt;
	struct rt_addrinfo	 info;
	struct sockaddr_bfd	 sa_bfd;
	struct bfd_msghdr	*bfdm;
	struct mbuf		*m;

	if (rtptable.rtp_count == 0)
		return;

	memset(&info, 0, sizeof(info));
	info.rti_info[RTAX_DST] = rt_key(rt);
	info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;

	if ((m = rtm_msg1(RTM_BFD, &info)) == NULL)
		return;
	bfdm = mtod(m, struct bfd_msghdr *);
	bfdm->bm_addrs = info.rti_addrs;

	/* bfd2sa() is called with the kernel lock held */
	KERNEL_ASSERT_LOCKED();
	bfd2sa(rt, &sa_bfd);
	memcpy(&bfdm->bm_sa, &sa_bfd, sizeof(sa_bfd));

	route_input(m, NULL, info.rti_info[RTAX_DST]->sa_family);
}
#endif /* BFD */
1881 
1882 /*
1883  * This is used to generate routing socket messages indicating
1884  * the state of an ieee80211 interface.
1885  */
1886 void
1887 rtm_80211info(struct ifnet *ifp, struct if_ieee80211_data *ifie)
1888 {
1889 	struct if_ieee80211_msghdr	*ifim;
1890 	struct mbuf			*m;
1891 
1892 	if (rtptable.rtp_count == 0)
1893 		return;
1894 	m = rtm_msg1(RTM_80211INFO, NULL);
1895 	if (m == NULL)
1896 		return;
1897 	ifim = mtod(m, struct if_ieee80211_msghdr *);
1898 	ifim->ifim_index = ifp->if_index;
1899 	ifim->ifim_tableid = ifp->if_rdomain;
1900 
1901 	memcpy(&ifim->ifim_ifie, ifie, sizeof(ifim->ifim_ifie));
1902 	route_input(m, NULL, AF_UNSPEC);
1903 }
1904 
1905 /*
1906  * This is used to generate routing socket messages indicating
1907  * the address selection proposal from an interface.
1908  */
1909 void
1910 rtm_proposal(struct ifnet *ifp, struct rt_addrinfo *rtinfo, int flags,
1911     uint8_t prio)
1912 {
1913 	struct rt_msghdr	*rtm;
1914 	struct mbuf		*m;
1915 
1916 	m = rtm_msg1(RTM_PROPOSAL, rtinfo);
1917 	if (m == NULL)
1918 		return;
1919 	rtm = mtod(m, struct rt_msghdr *);
1920 	rtm->rtm_flags = RTF_DONE | flags;
1921 	rtm->rtm_priority = prio;
1922 	rtm->rtm_tableid = ifp->if_rdomain;
1923 	rtm->rtm_index = ifp->if_index;
1924 	rtm->rtm_addrs = rtinfo->rti_addrs;
1925 
1926 	route_input(m, NULL, rtinfo->rti_info[RTAX_DNS]->sa_family);
1927 }
1928 
1929 /*
1930  * This is used in dumping the kernel table via sysctl().
1931  */
1932 int
1933 sysctl_dumpentry(struct rtentry *rt, void *v, unsigned int id)
1934 {
1935 	struct walkarg		*w = v;
1936 	int			 error = 0, size;
1937 	struct rt_addrinfo	 info;
1938 	struct ifnet		*ifp;
1939 #ifdef BFD
1940 	struct sockaddr_bfd	 sa_bfd;
1941 #endif
1942 	struct sockaddr_rtlabel	 sa_rl;
1943 	struct sockaddr_in6	 sa_mask;
1944 
1945 	if (w->w_op == NET_RT_FLAGS && !(rt->rt_flags & w->w_arg))
1946 		return 0;
1947 	if (w->w_op == NET_RT_DUMP && w->w_arg) {
1948 		u_int8_t prio = w->w_arg & RTP_MASK;
1949 		if (w->w_arg < 0) {
1950 			prio = (-w->w_arg) & RTP_MASK;
1951 			/* Show all routes that are not this priority */
1952 			if (prio == (rt->rt_priority & RTP_MASK))
1953 				return 0;
1954 		} else {
1955 			if (prio != (rt->rt_priority & RTP_MASK) &&
1956 			    prio != RTP_ANY)
1957 				return 0;
1958 		}
1959 	}
1960 	bzero(&info, sizeof(info));
1961 	info.rti_info[RTAX_DST] = rt_key(rt);
1962 	info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
1963 	info.rti_info[RTAX_NETMASK] = rt_plen2mask(rt, &sa_mask);
1964 	ifp = if_get(rt->rt_ifidx);
1965 	if (ifp != NULL) {
1966 		info.rti_info[RTAX_IFP] = sdltosa(ifp->if_sadl);
1967 		info.rti_info[RTAX_IFA] =
1968 		    rtable_getsource(id, info.rti_info[RTAX_DST]->sa_family);
1969 		if (info.rti_info[RTAX_IFA] == NULL)
1970 			info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
1971 		if (ifp->if_flags & IFF_POINTOPOINT)
1972 			info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr;
1973 	}
1974 	if_put(ifp);
1975 	info.rti_info[RTAX_LABEL] = rtlabel_id2sa(rt->rt_labelid, &sa_rl);
1976 #ifdef BFD
1977 	if (rt->rt_flags & RTF_BFD) {
1978 		KERNEL_ASSERT_LOCKED();
1979 		info.rti_info[RTAX_BFD] = bfd2sa(rt, &sa_bfd);
1980 	}
1981 #endif
1982 #ifdef MPLS
1983 	if (rt->rt_flags & RTF_MPLS) {
1984 		struct sockaddr_mpls	 sa_mpls;
1985 
1986 		bzero(&sa_mpls, sizeof(sa_mpls));
1987 		sa_mpls.smpls_family = AF_MPLS;
1988 		sa_mpls.smpls_len = sizeof(sa_mpls);
1989 		sa_mpls.smpls_label = ((struct rt_mpls *)
1990 		    rt->rt_llinfo)->mpls_label;
1991 		info.rti_info[RTAX_SRC] = (struct sockaddr *)&sa_mpls;
1992 		info.rti_mpls = ((struct rt_mpls *)
1993 		    rt->rt_llinfo)->mpls_operation;
1994 	}
1995 #endif
1996 
1997 	size = rtm_msg2(RTM_GET, RTM_VERSION, &info, NULL, w);
1998 	if (w->w_where && w->w_tmem && w->w_needed <= w->w_given) {
1999 		struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem;
2000 
2001 		rtm->rtm_pid = curproc->p_p->ps_pid;
2002 		rtm->rtm_flags = RTF_DONE | rt->rt_flags;
2003 		rtm->rtm_priority = rt->rt_priority & RTP_MASK;
2004 		rtm_getmetrics(rt, &rtm->rtm_rmx);
2005 		/* Do not account the routing table's reference. */
2006 		rtm->rtm_rmx.rmx_refcnt = refcnt_read(&rt->rt_refcnt) - 1;
2007 		rtm->rtm_index = rt->rt_ifidx;
2008 		rtm->rtm_addrs = info.rti_addrs;
2009 		rtm->rtm_tableid = id;
2010 #ifdef MPLS
2011 		rtm->rtm_mpls = info.rti_mpls;
2012 #endif
2013 		if ((error = copyout(rtm, w->w_where, size)) != 0)
2014 			w->w_where = NULL;
2015 		else
2016 			w->w_where += size;
2017 	}
2018 	return (error);
2019 }
2020 
2021 int
2022 sysctl_iflist(int af, struct walkarg *w)
2023 {
2024 	struct ifnet		*ifp;
2025 	struct ifaddr		*ifa;
2026 	struct rt_addrinfo	 info;
2027 	int			 len, error = 0;
2028 
2029 	bzero(&info, sizeof(info));
2030 	TAILQ_FOREACH(ifp, &ifnet, if_list) {
2031 		if (w->w_arg && w->w_arg != ifp->if_index)
2032 			continue;
2033 		/* Copy the link-layer address first */
2034 		info.rti_info[RTAX_IFP] = sdltosa(ifp->if_sadl);
2035 		len = rtm_msg2(RTM_IFINFO, RTM_VERSION, &info, 0, w);
2036 		if (w->w_where && w->w_tmem && w->w_needed <= w->w_given) {
2037 			struct if_msghdr *ifm;
2038 
2039 			ifm = (struct if_msghdr *)w->w_tmem;
2040 			ifm->ifm_index = ifp->if_index;
2041 			ifm->ifm_tableid = ifp->if_rdomain;
2042 			ifm->ifm_flags = ifp->if_flags;
2043 			if_getdata(ifp, &ifm->ifm_data);
2044 			ifm->ifm_addrs = info.rti_addrs;
2045 			error = copyout(ifm, w->w_where, len);
2046 			if (error)
2047 				return (error);
2048 			w->w_where += len;
2049 		}
2050 		info.rti_info[RTAX_IFP] = NULL;
2051 		TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) {
2052 			KASSERT(ifa->ifa_addr->sa_family != AF_LINK);
2053 			if (af && af != ifa->ifa_addr->sa_family)
2054 				continue;
2055 			info.rti_info[RTAX_IFA] = ifa->ifa_addr;
2056 			info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask;
2057 			info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
2058 			len = rtm_msg2(RTM_NEWADDR, RTM_VERSION, &info, 0, w);
2059 			if (w->w_where && w->w_tmem &&
2060 			    w->w_needed <= w->w_given) {
2061 				struct ifa_msghdr *ifam;
2062 
2063 				ifam = (struct ifa_msghdr *)w->w_tmem;
2064 				ifam->ifam_index = ifa->ifa_ifp->if_index;
2065 				ifam->ifam_flags = ifa->ifa_flags;
2066 				ifam->ifam_metric = ifa->ifa_metric;
2067 				ifam->ifam_addrs = info.rti_addrs;
2068 				error = copyout(w->w_tmem, w->w_where, len);
2069 				if (error)
2070 					return (error);
2071 				w->w_where += len;
2072 			}
2073 		}
2074 		info.rti_info[RTAX_IFA] = info.rti_info[RTAX_NETMASK] =
2075 		    info.rti_info[RTAX_BRD] = NULL;
2076 	}
2077 	return (0);
2078 }
2079 
2080 int
2081 sysctl_ifnames(struct walkarg *w)
2082 {
2083 	struct if_nameindex_msg ifn;
2084 	struct ifnet *ifp;
2085 	int error = 0;
2086 
2087 	/* XXX ignore tableid for now */
2088 	TAILQ_FOREACH(ifp, &ifnet, if_list) {
2089 		if (w->w_arg && w->w_arg != ifp->if_index)
2090 			continue;
2091 		w->w_needed += sizeof(ifn);
2092 		if (w->w_where && w->w_needed <= w->w_given) {
2093 
2094 			memset(&ifn, 0, sizeof(ifn));
2095 			ifn.if_index = ifp->if_index;
2096 			strlcpy(ifn.if_name, ifp->if_xname,
2097 			    sizeof(ifn.if_name));
2098 			error = copyout(&ifn, w->w_where, sizeof(ifn));
2099 			if (error)
2100 				return (error);
2101 			w->w_where += sizeof(ifn);
2102 		}
2103 	}
2104 
2105 	return (0);
2106 }
2107 
2108 int
2109 sysctl_source(int af, u_int tableid, struct walkarg *w)
2110 {
2111 	struct sockaddr	*sa;
2112 	int		 size, error = 0;
2113 
2114 	sa = rtable_getsource(tableid, af);
2115 	if (sa) {
2116 		switch (sa->sa_family) {
2117 		case AF_INET:
2118 			size = sizeof(struct sockaddr_in);
2119 			break;
2120 #ifdef INET6
2121 		case AF_INET6:
2122 			size = sizeof(struct sockaddr_in6);
2123 			break;
2124 #endif
2125 		default:
2126 			return (0);
2127 		}
2128 		w->w_needed += size;
2129 		if (w->w_where && w->w_needed <= w->w_given) {
2130 			if ((error = copyout(sa, w->w_where, size)))
2131 				return (error);
2132 			w->w_where += size;
2133 		}
2134 	}
2135 	return (0);
2136 }
2137 
2138 int
2139 sysctl_rtable(int *name, u_int namelen, void *where, size_t *given, void *new,
2140     size_t newlen)
2141 {
2142 	int			 i, error = EINVAL;
2143 	u_char			 af;
2144 	struct walkarg		 w;
2145 	struct rt_tableinfo	 tableinfo;
2146 	u_int			 tableid = 0;
2147 
2148 	if (new)
2149 		return (EPERM);
2150 	if (namelen < 3 || namelen > 4)
2151 		return (EINVAL);
2152 	af = name[0];
2153 	bzero(&w, sizeof(w));
2154 	w.w_where = where;
2155 	w.w_given = *given;
2156 	w.w_op = name[1];
2157 	w.w_arg = name[2];
2158 
2159 	if (namelen == 4) {
2160 		tableid = name[3];
2161 		if (!rtable_exists(tableid))
2162 			return (ENOENT);
2163 	} else
2164 		tableid = curproc->p_p->ps_rtableid;
2165 
2166 	switch (w.w_op) {
2167 	case NET_RT_DUMP:
2168 	case NET_RT_FLAGS:
2169 		NET_LOCK();
2170 		for (i = 1; i <= AF_MAX; i++) {
2171 			if (af != 0 && af != i)
2172 				continue;
2173 
2174 			error = rtable_walk(tableid, i, NULL, sysctl_dumpentry,
2175 			    &w);
2176 			if (error == EAFNOSUPPORT)
2177 				error = 0;
2178 			if (error)
2179 				break;
2180 		}
2181 		NET_UNLOCK();
2182 		break;
2183 
2184 	case NET_RT_IFLIST:
2185 		NET_LOCK();
2186 		error = sysctl_iflist(af, &w);
2187 		NET_UNLOCK();
2188 		break;
2189 
2190 	case NET_RT_STATS:
2191 		return (sysctl_rtable_rtstat(where, given, new));
2192 	case NET_RT_TABLE:
2193 		tableid = w.w_arg;
2194 		if (!rtable_exists(tableid))
2195 			return (ENOENT);
2196 		memset(&tableinfo, 0, sizeof tableinfo);
2197 		tableinfo.rti_tableid = tableid;
2198 		tableinfo.rti_domainid = rtable_l2(tableid);
2199 		error = sysctl_rdstruct(where, given, new,
2200 		    &tableinfo, sizeof(tableinfo));
2201 		return (error);
2202 	case NET_RT_IFNAMES:
2203 		NET_LOCK();
2204 		error = sysctl_ifnames(&w);
2205 		NET_UNLOCK();
2206 		break;
2207 	case NET_RT_SOURCE:
2208 		tableid = w.w_arg;
2209 		if (!rtable_exists(tableid))
2210 			return (ENOENT);
2211 		NET_LOCK();
2212 		for (i = 1; i <= AF_MAX; i++) {
2213 			if (af != 0 && af != i)
2214 				continue;
2215 
2216 			error = sysctl_source(i, tableid, &w);
2217 			if (error == EAFNOSUPPORT)
2218 				error = 0;
2219 			if (error)
2220 				break;
2221 		}
2222 		NET_UNLOCK();
2223 		break;
2224 	}
2225 	free(w.w_tmem, M_RTABLE, w.w_tmemsize);
2226 	if (where) {
2227 		*given = w.w_where - (caddr_t)where;
2228 		if (w.w_needed > w.w_given)
2229 			return (ENOMEM);
2230 	} else if (w.w_needed == 0) {
2231 		*given = 0;
2232 	} else {
2233 		*given = roundup(w.w_needed + MAX(w.w_needed / 10, 1024),
2234 		    PAGE_SIZE);
2235 	}
2236 	return (error);
2237 }
2238 
2239 int
2240 sysctl_rtable_rtstat(void *oldp, size_t *oldlenp, void *newp)
2241 {
2242 	extern struct cpumem *rtcounters;
2243 	uint64_t counters[rts_ncounters];
2244 	struct rtstat rtstat;
2245 	uint32_t *words = (uint32_t *)&rtstat;
2246 	int i;
2247 
2248 	CTASSERT(sizeof(rtstat) == (nitems(counters) * sizeof(uint32_t)));
2249 	memset(&rtstat, 0, sizeof rtstat);
2250 	counters_read(rtcounters, counters, nitems(counters));
2251 
2252 	for (i = 0; i < nitems(counters); i++)
2253 		words[i] = (uint32_t)counters[i];
2254 
2255 	return (sysctl_rdstruct(oldp, oldlenp, newp, &rtstat, sizeof(rtstat)));
2256 }
2257 
2258 int
2259 rtm_validate_proposal(struct rt_addrinfo *info)
2260 {
2261 	if (info->rti_addrs & ~(RTA_NETMASK | RTA_IFA | RTA_DNS | RTA_STATIC |
2262 	    RTA_SEARCH)) {
2263 		return -1;
2264 	}
2265 
2266 	if (ISSET(info->rti_addrs, RTA_NETMASK)) {
2267 		struct sockaddr *sa = info->rti_info[RTAX_NETMASK];
2268 		if (sa == NULL)
2269 			return -1;
2270 		switch (sa->sa_family) {
2271 		case AF_INET:
2272 			if (sa->sa_len != sizeof(struct sockaddr_in))
2273 				return -1;
2274 			break;
2275 		case AF_INET6:
2276 			if (sa->sa_len != sizeof(struct sockaddr_in6))
2277 				return -1;
2278 			break;
2279 		default:
2280 			return -1;
2281 		}
2282 	}
2283 
2284 	if (ISSET(info->rti_addrs, RTA_IFA)) {
2285 		struct sockaddr *sa = info->rti_info[RTAX_IFA];
2286 		if (sa == NULL)
2287 			return -1;
2288 		switch (sa->sa_family) {
2289 		case AF_INET:
2290 			if (sa->sa_len != sizeof(struct sockaddr_in))
2291 				return -1;
2292 			break;
2293 		case AF_INET6:
2294 			if (sa->sa_len != sizeof(struct sockaddr_in6))
2295 				return -1;
2296 			break;
2297 		default:
2298 			return -1;
2299 		}
2300 	}
2301 
2302 	if (ISSET(info->rti_addrs, RTA_DNS)) {
2303 		struct sockaddr_rtdns *rtdns =
2304 		    (struct sockaddr_rtdns *)info->rti_info[RTAX_DNS];
2305 		if (rtdns == NULL)
2306 			return -1;
2307 		if (rtdns->sr_len > sizeof(*rtdns))
2308 			return -1;
2309 		if (rtdns->sr_len < offsetof(struct sockaddr_rtdns, sr_dns))
2310 			return -1;
2311 		switch (rtdns->sr_family) {
2312 		case AF_INET:
2313 			if ((rtdns->sr_len - offsetof(struct sockaddr_rtdns,
2314 			    sr_dns)) % sizeof(struct in_addr) != 0)
2315 				return -1;
2316 			break;
2317 #ifdef INET6
2318 		case AF_INET6:
2319 			if ((rtdns->sr_len - offsetof(struct sockaddr_rtdns,
2320 			    sr_dns)) % sizeof(struct in6_addr) != 0)
2321 				return -1;
2322 			break;
2323 #endif
2324 		default:
2325 			return -1;
2326 		}
2327 	}
2328 
2329 	if (ISSET(info->rti_addrs, RTA_STATIC)) {
2330 		struct sockaddr_rtstatic *rtstatic =
2331 		    (struct sockaddr_rtstatic *)info->rti_info[RTAX_STATIC];
2332 		if (rtstatic == NULL)
2333 			return -1;
2334 		if (rtstatic->sr_len > sizeof(*rtstatic))
2335 			return -1;
2336 		if (rtstatic->sr_len <=
2337 		    offsetof(struct sockaddr_rtstatic, sr_static))
2338 			return -1;
2339 	}
2340 
2341 	if (ISSET(info->rti_addrs, RTA_SEARCH)) {
2342 		struct sockaddr_rtsearch *rtsearch =
2343 		    (struct sockaddr_rtsearch *)info->rti_info[RTAX_SEARCH];
2344 		if (rtsearch == NULL)
2345 			return -1;
2346 		if (rtsearch->sr_len > sizeof(*rtsearch))
2347 			return -1;
2348 		if (rtsearch->sr_len <=
2349 		    offsetof(struct sockaddr_rtsearch, sr_search))
2350 			return -1;
2351 	}
2352 
2353 	return 0;
2354 }
2355 
2356 int
2357 rt_setsource(unsigned int rtableid, struct sockaddr *src)
2358 {
2359 	struct ifaddr	*ifa;
2360 	int		error;
2361 	/*
2362 	 * If source address is 0.0.0.0 or ::
2363 	 * use automatic source selection
2364 	 */
2365 	switch(src->sa_family) {
2366 	case AF_INET:
2367 		if(satosin(src)->sin_addr.s_addr == INADDR_ANY) {
2368 			rtable_setsource(rtableid, AF_INET, NULL);
2369 			return (0);
2370 		}
2371 		break;
2372 #ifdef INET6
2373 	case AF_INET6:
2374 		if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(src)->sin6_addr)) {
2375 			rtable_setsource(rtableid, AF_INET6, NULL);
2376 			return (0);
2377 		}
2378 		break;
2379 #endif
2380 	default:
2381 		return (EAFNOSUPPORT);
2382 	}
2383 
2384 	KERNEL_LOCK();
2385 	/*
2386 	 * Check if source address is assigned to an interface in the
2387 	 * same rdomain
2388 	 */
2389 	if ((ifa = ifa_ifwithaddr(src, rtableid)) == NULL) {
2390 		KERNEL_UNLOCK();
2391 		return (EINVAL);
2392 	}
2393 
2394 	error = rtable_setsource(rtableid, src->sa_family, ifa->ifa_addr);
2395 	KERNEL_UNLOCK();
2396 
2397 	return (error);
2398 }
2399 
2400 /*
2401  * Definitions of protocols supported in the ROUTE domain.
2402  */
2403 
/*
 * Protocol switch of the route domain.  It has a single entry: a raw
 * socket protocol whose handlers are the route_*() functions, with
 * sysctl_rtable() serving its sysctl requests.
 */
const struct protosw routesw[] = {
{
  .pr_type	= SOCK_RAW,
  .pr_domain	= &routedomain,
  .pr_flags	= PR_ATOMIC|PR_ADDR|PR_WANTRCVD,
  .pr_output	= route_output,
  .pr_ctloutput	= route_ctloutput,
  .pr_usrreq	= route_usrreq,
  .pr_attach	= route_attach,
  .pr_detach	= route_detach,
  .pr_init	= route_prinit,
  .pr_sysctl	= sysctl_rtable
}
};
2418 
/*
 * Descriptor of the PF_ROUTE domain.  dom_protosw points at the first
 * entry of routesw and dom_protoswNPROTOSW one past its last entry.
 */
const struct domain routedomain = {
  .dom_family = PF_ROUTE,
  .dom_name = "route",
  .dom_init = route_init,
  .dom_protosw = routesw,
  .dom_protoswNPROTOSW = &routesw[nitems(routesw)]
};
2426