xref: /openbsd-src/sys/net/rtsock.c (revision b8851fcc53cbe24fd20b090f26dd149e353f6174)
1 /*	$OpenBSD: rtsock.c,v 1.222 2017/02/01 20:59:47 dhill Exp $	*/
2 /*	$NetBSD: rtsock.c,v 1.18 1996/03/29 00:32:10 cgd Exp $	*/
3 
4 /*
5  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the project nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 /*
34  * Copyright (c) 1988, 1991, 1993
35  *	The Regents of the University of California.  All rights reserved.
36  *
37  * Redistribution and use in source and binary forms, with or without
38  * modification, are permitted provided that the following conditions
39  * are met:
40  * 1. Redistributions of source code must retain the above copyright
41  *    notice, this list of conditions and the following disclaimer.
42  * 2. Redistributions in binary form must reproduce the above copyright
43  *    notice, this list of conditions and the following disclaimer in the
44  *    documentation and/or other materials provided with the distribution.
45  * 3. Neither the name of the University nor the names of its contributors
46  *    may be used to endorse or promote products derived from this software
47  *    without specific prior written permission.
48  *
49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59  * SUCH DAMAGE.
60  *
61  *	@(#)rtsock.c	8.6 (Berkeley) 2/11/95
62  */
63 
64 #include <sys/param.h>
65 #include <sys/systm.h>
66 #include <sys/proc.h>
67 #include <sys/sysctl.h>
68 #include <sys/mbuf.h>
69 #include <sys/socket.h>
70 #include <sys/socketvar.h>
71 #include <sys/domain.h>
72 #include <sys/protosw.h>
73 
74 #include <net/if.h>
75 #include <net/if_dl.h>
76 #include <net/if_var.h>
77 #include <net/route.h>
78 #include <net/raw_cb.h>
79 
80 #include <netinet/in.h>
81 
82 #ifdef MPLS
83 #include <netmpls/mpls.h>
84 #endif
85 #ifdef BFD
86 #include <net/bfd.h>
87 #endif
88 
89 #include <sys/stdarg.h>
90 #include <sys/kernel.h>
91 #include <sys/timeout.h>
92 
/*
 * Fixed pseudo-addresses for routing-socket traffic; both are initialized
 * with 2 and PF_ROUTE as their first two members.  route_src is installed
 * as the foreign address of every routing socket on attach and is passed
 * as the sender address when messages are queued on a receive buffer;
 * route_dst is what a socket's local address (if any) is compared against
 * in route_input().
 */
struct sockaddr		route_dst = { 2, PF_ROUTE, };
struct sockaddr		route_src = { 2, PF_ROUTE, };
95 
/*
 * Scratch state handed to rt_msg2() and to the sysctl_iflist() /
 * sysctl_ifnames() helpers declared below.
 *
 * NOTE(review): the code that reads and writes these fields is outside
 * this part of the file.  The names suggest a request description
 * (w_op, w_arg), space accounting (w_given, w_needed) and output /
 * temporary buffers (w_where, w_tmem, w_tmemsize) -- confirm against
 * the users before relying on that.
 */
struct walkarg {
	int	w_op, w_arg, w_given, w_needed, w_tmemsize;
	caddr_t	w_where, w_tmem;
};
100 
/*
 * Forward declarations.  Definitions that do not appear near the top of
 * the file are presumably further down.
 */

/* Socket-option handling, message fan-out and route_output() helpers. */
int	route_ctloutput(int, struct socket *, int, int, struct mbuf *);
void	route_input(struct mbuf *m0, sa_family_t);
int	route_arp_conflict(struct rtentry *, struct rt_addrinfo *);
int	route_cleargateway(struct rtentry *, void *, unsigned int);

/* Routing message construction and parsing. */
struct mbuf	*rt_msg1(int, struct rt_addrinfo *);
int		 rt_msg2(int, int, struct rt_addrinfo *, caddr_t,
		     struct walkarg *);
void		 rt_xaddrs(caddr_t, caddr_t, struct rt_addrinfo *);

/* sysctl back ends. */
int		 sysctl_iflist(int, struct walkarg *);
int		 sysctl_ifnames(struct walkarg *);
int		 sysctl_rtable_rtstat(void *, size_t *, void *);
114 
/*
 * Per-socket control block of a routing socket.  The generic rawcb is the
 * first member, and the code in this file casts between struct rawcb *
 * and struct routecb * on that basis.
 */
struct routecb {
	struct rawcb	rcb;		/* generic raw control block; keep first */
	struct timeout	timeout;	/* re-runs rt_senddesync() */
	unsigned int	msgfilter;	/* (1 << rtm_type) bits wanted; 0 = all */
	unsigned int	flags;		/* ROUTECB_FLAG_* */
	u_int		rtableid;	/* table to report on, or RTABLE_ANY */
};
/* Fetch the routecb stored in a socket's so_pcb. */
#define	sotoroutecb(so)	((struct routecb *)(so)->so_pcb)
123 
/*
 * Number of attached routing sockets: any_count counts every socket, the
 * other members count the sockets whose protocol is the matching address
 * family.  Maintained by route_usrreq() on PRU_ATTACH and PRU_DETACH.
 */
struct route_cb {
	int		ip_count;	/* protocol AF_INET */
	int		ip6_count;	/* protocol AF_INET6 */
	int		mpls_count;	/* protocol AF_MPLS (MPLS kernels only) */
	int		any_count;	/* all routing sockets */
};

/* The single global instance of the counters above. */
struct route_cb route_cb;
132 
/*
 * These flags and the timeout are used to tell userland (via an
 * RTM_DESYNC message) that the route socket has overflowed and that
 * messages have been lost.
 */
#define ROUTECB_FLAG_DESYNC	0x1	/* Route socket out of memory */
#define ROUTECB_FLAG_FLUSH	0x2	/* Wait until socket is empty before
					   queueing more packets */

/* Delay, as passed to timeout_add(), before RTM_DESYNC is retried. */
#define ROUTE_DESYNC_RESEND_TIMEOUT	(hz / 5)	/* one fifth of hz */

/* Timeout handler; also called directly from route_input(). */
void	rt_senddesync(void *);
145 
146 int
147 route_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
148     struct mbuf *control, struct proc *p)
149 {
150 	struct rawcb	*rp;
151 	struct routecb	*rop;
152 	int		 af;
153 	int		 error = 0;
154 
155 	NET_ASSERT_LOCKED();
156 
157 	rp = sotorawcb(so);
158 
159 	switch (req) {
160 	case PRU_ATTACH:
161 		/*
162 		 * use the rawcb but allocate a routecb, this
163 		 * code does not care about the additional fields
164 		 * and works directly on the raw socket.
165 		 */
166 		rop = malloc(sizeof(struct routecb), M_PCB, M_WAITOK|M_ZERO);
167 		rp = &rop->rcb;
168 		so->so_pcb = rp;
169 		/* Init the timeout structure */
170 		timeout_set(&((struct routecb *)rp)->timeout, rt_senddesync, rp);
171 		/*
172 		 * Don't call raw_usrreq() in the attach case, because
173 		 * we want to allow non-privileged processes to listen
174 		 * on and send "safe" commands to the routing socket.
175 		 */
176 		if (curproc == 0)
177 			error = EACCES;
178 		else
179 			error = raw_attach(so, (int)(long)nam);
180 		if (error) {
181 			free(rop, M_PCB, sizeof(struct routecb));
182 			return (error);
183 		}
184 		rop->rtableid = curproc->p_p->ps_rtableid;
185 		af = rp->rcb_proto.sp_protocol;
186 		if (af == AF_INET)
187 			route_cb.ip_count++;
188 		else if (af == AF_INET6)
189 			route_cb.ip6_count++;
190 #ifdef MPLS
191 		else if (af == AF_MPLS)
192 			route_cb.mpls_count++;
193 #endif
194 		rp->rcb_faddr = &route_src;
195 		route_cb.any_count++;
196 		soisconnected(so);
197 		so->so_options |= SO_USELOOPBACK;
198 		break;
199 
200 	case PRU_RCVD:
201 		rop = (struct routecb *)rp;
202 
203 		/*
204 		 * If we are in a FLUSH state, check if the buffer is
205 		 * empty so that we can clear the flag.
206 		 */
207 		if (((rop->flags & ROUTECB_FLAG_FLUSH) != 0) &&
208 		    ((sbspace(&rp->rcb_socket->so_rcv) ==
209 		    rp->rcb_socket->so_rcv.sb_hiwat)))
210 			rop->flags &= ~ROUTECB_FLAG_FLUSH;
211 		break;
212 
213 	case PRU_DETACH:
214 		if (rp) {
215 			timeout_del(&((struct routecb *)rp)->timeout);
216 			af = rp->rcb_proto.sp_protocol;
217 			if (af == AF_INET)
218 				route_cb.ip_count--;
219 			else if (af == AF_INET6)
220 				route_cb.ip6_count--;
221 #ifdef MPLS
222 			else if (af == AF_MPLS)
223 				route_cb.mpls_count--;
224 #endif
225 			route_cb.any_count--;
226 		}
227 		/* FALLTHROUGH */
228 	default:
229 		error = raw_usrreq(so, req, m, nam, control, p);
230 	}
231 
232 	return (error);
233 }
234 
235 int
236 route_ctloutput(int op, struct socket *so, int level, int optname,
237     struct mbuf *m)
238 {
239 	struct routecb *rop = sotoroutecb(so);
240 	int error = 0;
241 	unsigned int tid;
242 
243 	if (level != AF_ROUTE) {
244 		error = EINVAL;
245 		if (op == PRCO_SETOPT && m)
246 			m_free(m);
247 		return (error);
248 	}
249 
250 	switch (op) {
251 	case PRCO_SETOPT:
252 		switch (optname) {
253 		case ROUTE_MSGFILTER:
254 			if (m == NULL || m->m_len != sizeof(unsigned int))
255 				error = EINVAL;
256 			else
257 				rop->msgfilter = *mtod(m, unsigned int *);
258 			break;
259 		case ROUTE_TABLEFILTER:
260 			if (m == NULL || m->m_len != sizeof(unsigned int)) {
261 				error = EINVAL;
262 				break;
263 			}
264 			tid = *mtod(m, unsigned int *);
265 			if (tid != RTABLE_ANY && !rtable_exists(tid))
266 				error = ENOENT;
267 			else
268 				rop->rtableid = tid;
269 			break;
270 		default:
271 			error = ENOPROTOOPT;
272 			break;
273 		}
274 		m_free(m);
275 		break;
276 	case PRCO_GETOPT:
277 		switch (optname) {
278 		case ROUTE_MSGFILTER:
279 			m->m_len = sizeof(unsigned int);
280 			*mtod(m, unsigned int *) = rop->msgfilter;
281 			break;
282 		case ROUTE_TABLEFILTER:
283 			m->m_len = sizeof(unsigned int);
284 			*mtod(m, unsigned int *) = rop->rtableid;
285 			break;
286 		default:
287 			error = ENOPROTOOPT;
288 			break;
289 		}
290 	}
291 	return (error);
292 }
293 
294 void
295 rt_senddesync(void *data)
296 {
297 	struct rawcb	*rp;
298 	struct routecb	*rop;
299 	struct mbuf	*desync_mbuf;
300 	int		 s;
301 
302 	rp = (struct rawcb *)data;
303 	rop = (struct routecb *)rp;
304 
305 	/* If we are in a DESYNC state, try to send a RTM_DESYNC packet */
306 	if ((rop->flags & ROUTECB_FLAG_DESYNC) == 0)
307 		return;
308 
309 	/*
310 	 * If we fail to alloc memory or if sbappendaddr()
311 	 * fails, re-add timeout and try again.
312 	 */
313 	desync_mbuf = rt_msg1(RTM_DESYNC, NULL);
314 	if (desync_mbuf != NULL) {
315 		s = splsoftnet();
316 		if (sbappendaddr(&rp->rcb_socket->so_rcv, &route_src,
317 		    desync_mbuf, NULL) != 0) {
318 			rop->flags &= ~ROUTECB_FLAG_DESYNC;
319 			sorwakeup(rp->rcb_socket);
320 			splx(s);
321 			return;
322 		}
323 		splx(s);
324 		m_freem(desync_mbuf);
325 	}
326 	/* Re-add timeout to try sending msg again */
327 	timeout_add(&rop->timeout, ROUTE_DESYNC_RESEND_TIMEOUT);
328 }
329 
330 void
331 route_input(struct mbuf *m0, sa_family_t sa_family)
332 {
333 	struct rawcb *rp;
334 	struct routecb *rop;
335 	struct rt_msghdr *rtm;
336 	struct mbuf *m = m0;
337 	int s, sockets = 0;
338 	struct socket *last = NULL;
339 	struct sockaddr *sosrc, *sodst;
340 
341 	sosrc = &route_src;
342 	sodst = &route_dst;
343 
344 	/* ensure that we can access the rtm_type via mtod() */
345 	if (m->m_len < offsetof(struct rt_msghdr, rtm_type) + 1) {
346 		m_freem(m);
347 		return;
348 	}
349 
350 	LIST_FOREACH(rp, &rawcb, rcb_list) {
351 		if (rp->rcb_socket->so_state & SS_CANTRCVMORE)
352 			continue;
353 		if (rp->rcb_proto.sp_family != PF_ROUTE)
354 			continue;
355 		/*
356 		 * If route socket is bound to an address family only send
357 		 * messages that match the address family. Address family
358 		 * agnostic messages are always send.
359 		 */
360 		if (rp->rcb_proto.sp_protocol != AF_UNSPEC &&
361 		    sa_family != AF_UNSPEC &&
362 		    rp->rcb_proto.sp_protocol != sa_family)
363 			continue;
364 		/*
365 		 * We assume the lower level routines have
366 		 * placed the address in a canonical format
367 		 * suitable for a structure comparison.
368 		 *
369 		 * Note that if the lengths are not the same
370 		 * the comparison will fail at the first byte.
371 		 */
372 #define	equal(a1, a2) \
373   (bcmp((caddr_t)(a1), (caddr_t)(a2), a1->sa_len) == 0)
374 		if (rp->rcb_laddr && !equal(rp->rcb_laddr, sodst))
375 			continue;
376 		if (rp->rcb_faddr && !equal(rp->rcb_faddr, sosrc))
377 			continue;
378 
379 		/* filter messages that the process does not want */
380 		rop = (struct routecb *)rp;
381 		rtm = mtod(m, struct rt_msghdr *);
382 		/* but RTM_DESYNC can't be filtered */
383 		if (rtm->rtm_type != RTM_DESYNC && rop->msgfilter != 0 &&
384 		    !(rop->msgfilter & (1 << rtm->rtm_type)))
385 			continue;
386 		switch (rtm->rtm_type) {
387 		case RTM_IFANNOUNCE:
388 		case RTM_DESYNC:
389 			/* no tableid */
390 			break;
391 		case RTM_RESOLVE:
392 		case RTM_NEWADDR:
393 		case RTM_DELADDR:
394 		case RTM_IFINFO:
395 			/* check against rdomain id */
396 			if (rop->rtableid != RTABLE_ANY &&
397 			    rtable_l2(rop->rtableid) != rtm->rtm_tableid)
398 				continue;
399 			break;
400 		default:
401 			/* check against rtable id */
402 			if (rop->rtableid != RTABLE_ANY &&
403 			    rop->rtableid != rtm->rtm_tableid)
404 				continue;
405 			break;
406 		}
407 
408 		/*
409 		 * Check to see if the flush flag is set. If so, don't queue
410 		 * any more messages until the flag is cleared.
411 		 */
412 		if ((rop->flags & ROUTECB_FLAG_FLUSH) != 0)
413 			continue;
414 
415 		if (last) {
416 			struct mbuf *n;
417 			if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) {
418 				s = splsoftnet();
419 				if (sbspace(&last->so_rcv) < (2 * MSIZE) ||
420 				    sbappendaddr(&last->so_rcv, sosrc,
421 				    n, (struct mbuf *)NULL) == 0) {
422 					/*
423 					 * Flag socket as desync'ed and
424 					 * flush required
425 					 */
426 					sotoroutecb(last)->flags |=
427 					    ROUTECB_FLAG_DESYNC |
428 					    ROUTECB_FLAG_FLUSH;
429 					rt_senddesync((void *) sotorawcb(last));
430 					m_freem(n);
431 				} else {
432 					sorwakeup(last);
433 					sockets++;
434 				}
435 				splx(s);
436 			}
437 		}
438 		last = rp->rcb_socket;
439 	}
440 	if (last) {
441 		s = splsoftnet();
442 		if (sbspace(&last->so_rcv) < (2 * MSIZE) ||
443 		    sbappendaddr(&last->so_rcv, sosrc,
444 		    m, (struct mbuf *)NULL) == 0) {
445 			/* Flag socket as desync'ed and flush required */
446 			sotoroutecb(last)->flags |=
447 			    ROUTECB_FLAG_DESYNC | ROUTECB_FLAG_FLUSH;
448 			rt_senddesync((void *) sotorawcb(last));
449 			m_freem(m);
450 		} else {
451 			sorwakeup(last);
452 			sockets++;
453 		}
454 		splx(s);
455 	} else
456 		m_freem(m);
457 }
458 
459 struct rt_msghdr *
460 rt_report(struct rtentry *rt, u_char type, int seq, int tableid)
461 {
462 	struct rt_msghdr	*rtm;
463 	struct rt_addrinfo	 info;
464 	struct sockaddr_rtlabel	 sa_rl;
465 	struct sockaddr_in6	 sa_mask;
466 #ifdef BFD
467 	struct sockaddr_bfd	 sa_bfd;
468 #endif
469 #ifdef MPLS
470 	struct sockaddr_mpls	 sa_mpls;
471 #endif
472 	struct ifnet		*ifp = NULL;
473 	int			 len;
474 
475 	bzero(&info, sizeof(info));
476 	info.rti_info[RTAX_DST] = rt_key(rt);
477 	info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
478 	info.rti_info[RTAX_NETMASK] = rt_plen2mask(rt, &sa_mask);
479 	info.rti_info[RTAX_LABEL] = rtlabel_id2sa(rt->rt_labelid, &sa_rl);
480 #ifdef BFD
481 	if (rt->rt_flags & RTF_BFD)
482 		info.rti_info[RTAX_BFD] = bfd2sa(rt, &sa_bfd);
483 #endif
484 #ifdef MPLS
485 	if (rt->rt_flags & RTF_MPLS) {
486 		bzero(&sa_mpls, sizeof(sa_mpls));
487 		sa_mpls.smpls_family = AF_MPLS;
488 		sa_mpls.smpls_len = sizeof(sa_mpls);
489 		sa_mpls.smpls_label = ((struct rt_mpls *)
490 		    rt->rt_llinfo)->mpls_label;
491 		info.rti_info[RTAX_SRC] = (struct sockaddr *)&sa_mpls;
492 		info.rti_mpls = ((struct rt_mpls *)
493 		    rt->rt_llinfo)->mpls_operation;
494 	}
495 #endif
496 	ifp = if_get(rt->rt_ifidx);
497 	if (ifp != NULL) {
498 		info.rti_info[RTAX_IFP] = sdltosa(ifp->if_sadl);
499 		info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
500 		if (ifp->if_flags & IFF_POINTOPOINT)
501 			info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr;
502 	}
503 	if_put(ifp);
504 	/* RTAX_GENMASK, RTAX_AUTHOR, RTAX_SRCMASK ignored */
505 
506 	/* build new route message */
507 	len = rt_msg2(type, RTM_VERSION, &info, NULL, NULL);
508 	/* XXX why can't we wait? Should be process context... */
509 	rtm = malloc(len, M_RTABLE, M_NOWAIT | M_ZERO);
510 	if (rtm == NULL)
511 		return NULL;
512 
513 	rt_msg2(type, RTM_VERSION, &info, (caddr_t)rtm, NULL);
514 	rtm->rtm_type = type;
515 	rtm->rtm_index = rt->rt_ifidx;
516 	rtm->rtm_tableid = tableid;
517 	rtm->rtm_priority = rt->rt_priority & RTP_MASK;
518 	rtm->rtm_flags = rt->rt_flags;
519 	rtm->rtm_pid = curproc->p_p->ps_pid;
520 	rtm->rtm_seq = seq;
521 	rt_getmetrics(&rt->rt_rmx, &rtm->rtm_rmx);
522 	rtm->rtm_addrs = info.rti_addrs;
523 #ifdef MPLS
524 	rtm->rtm_mpls = info.rti_mpls;
525 #endif
526 	return rtm;
527 }
528 
529 int
530 route_output(struct mbuf *m, ...)
531 {
532 	struct rt_msghdr	*rtm = NULL;
533 	struct rtentry		*rt = NULL;
534 	struct rt_addrinfo	 info;
535 	int			 plen, len, seq, newgate = 0, error = 0;
536 	struct ifnet		*ifp = NULL;
537 	struct ifaddr		*ifa = NULL;
538 	struct socket		*so;
539 	struct rawcb		*rp = NULL;
540 #ifdef MPLS
541 	struct sockaddr_mpls	*psa_mpls;
542 #endif
543 	va_list			 ap;
544 	u_int			 tableid;
545 	u_int8_t		 prio;
546 	u_char			 vers, type;
547 
548 	va_start(ap, m);
549 	so = va_arg(ap, struct socket *);
550 	va_end(ap);
551 
552 	if (m == NULL || ((m->m_len < sizeof(int32_t)) &&
553 	    (m = m_pullup(m, sizeof(int32_t))) == 0))
554 		return (ENOBUFS);
555 	if ((m->m_flags & M_PKTHDR) == 0)
556 		panic("route_output");
557 	len = m->m_pkthdr.len;
558 	if (len < offsetof(struct rt_msghdr, rtm_type) + 1 ||
559 	    len != mtod(m, struct rt_msghdr *)->rtm_msglen) {
560 		error = EINVAL;
561 		goto fail;
562 	}
563 	vers = mtod(m, struct rt_msghdr *)->rtm_version;
564 	switch (vers) {
565 	case RTM_VERSION:
566 		if (len < sizeof(struct rt_msghdr)) {
567 			error = EINVAL;
568 			goto fail;
569 		}
570 		if (len > RTM_MAXSIZE) {
571 			error = EMSGSIZE;
572 			goto fail;
573 		}
574 		rtm = malloc(len, M_RTABLE, M_NOWAIT);
575 		if (rtm == NULL) {
576 			error = ENOBUFS;
577 			goto fail;
578 		}
579 		m_copydata(m, 0, len, (caddr_t)rtm);
580 		break;
581 	default:
582 		error = EPROTONOSUPPORT;
583 		goto fail;
584 	}
585 	rtm->rtm_pid = curproc->p_p->ps_pid;
586 	if (rtm->rtm_hdrlen == 0)	/* old client */
587 		rtm->rtm_hdrlen = sizeof(struct rt_msghdr);
588 	if (len < rtm->rtm_hdrlen) {
589 		error = EINVAL;
590 		goto fail;
591 	}
592 
593 	/* Verify that the caller is sending an appropriate message early */
594 	switch (rtm->rtm_type) {
595 	case RTM_ADD:
596 	case RTM_DELETE:
597 	case RTM_GET:
598 	case RTM_CHANGE:
599 	case RTM_LOCK:
600 		break;
601 	default:
602 		error = EOPNOTSUPP;
603 		goto fail;
604 	}
605 
606 	/*
607 	 * Verify that the caller has the appropriate privilege; RTM_GET
608 	 * is the only operation the non-superuser is allowed.
609 	 */
610 	if (rtm->rtm_type != RTM_GET && suser(curproc, 0) != 0) {
611 		error = EACCES;
612 		goto fail;
613 	}
614 	tableid = rtm->rtm_tableid;
615 	if (!rtable_exists(tableid)) {
616 		if (rtm->rtm_type == RTM_ADD) {
617 			if ((error = rtable_add(tableid)) != 0)
618 				goto fail;
619 		} else {
620 			error = EINVAL;
621 			goto fail;
622 		}
623 	}
624 
625 
626 	/* Do not let userland play with kernel-only flags. */
627 	if ((rtm->rtm_flags & (RTF_LOCAL|RTF_BROADCAST)) != 0) {
628 		error = EINVAL;
629 		goto fail;
630 	}
631 
632 	/* make sure that kernel-only bits are not set */
633 	rtm->rtm_priority &= RTP_MASK;
634 	rtm->rtm_flags &= ~(RTF_DONE|RTF_CLONED|RTF_CACHED);
635 	rtm->rtm_fmask &= RTF_FMASK;
636 
637 	if (rtm->rtm_priority != 0) {
638 		if (rtm->rtm_priority > RTP_MAX ||
639 		    rtm->rtm_priority == RTP_LOCAL) {
640 			error = EINVAL;
641 			goto fail;
642 		}
643 		prio = rtm->rtm_priority;
644 	} else if (rtm->rtm_type != RTM_ADD)
645 		prio = RTP_ANY;
646 	else if (rtm->rtm_flags & RTF_STATIC)
647 		prio = 0;
648 	else
649 		prio = RTP_DEFAULT;
650 
651 	bzero(&info, sizeof(info));
652 	info.rti_addrs = rtm->rtm_addrs;
653 	rt_xaddrs(rtm->rtm_hdrlen + (caddr_t)rtm, len + (caddr_t)rtm, &info);
654 	info.rti_flags = rtm->rtm_flags;
655 	if (info.rti_info[RTAX_DST] == NULL ||
656 	    info.rti_info[RTAX_DST]->sa_family >= AF_MAX ||
657 	    (info.rti_info[RTAX_GATEWAY] != NULL &&
658 	    info.rti_info[RTAX_GATEWAY]->sa_family >= AF_MAX) ||
659 	    info.rti_info[RTAX_GENMASK] != NULL) {
660 		error = EINVAL;
661 		goto fail;
662 	}
663 #ifdef MPLS
664 	info.rti_mpls = rtm->rtm_mpls;
665 #endif
666 
667 	if (info.rti_info[RTAX_GATEWAY] != NULL &&
668 	    info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK &&
669 	    (info.rti_flags & RTF_CLONING) == 0) {
670 		info.rti_flags |= RTF_LLINFO;
671 	}
672 
673 	/*
674 	 * Do not use goto flush before this point since the message itself
675 	 * may be not consistent and could cause unexpected behaviour in other
676 	 * userland clients. Use goto fail instead.
677 	 */
678 	switch (rtm->rtm_type) {
679 	case RTM_ADD:
680 		if (info.rti_info[RTAX_GATEWAY] == NULL) {
681 			error = EINVAL;
682 			goto flush;
683 		}
684 
685 		rt = rtable_match(tableid, info.rti_info[RTAX_DST], NULL);
686 		if ((error = route_arp_conflict(rt, &info))) {
687 			rtfree(rt);
688 			rt = NULL;
689 			goto flush;
690 		}
691 
692 		/*
693 		 * We cannot go through a delete/create/insert cycle for
694 		 * cached route because this can lead to races in the
695 		 * receive path.  Instead we upade the L2 cache.
696 		 */
697 		if ((rt != NULL) && ISSET(rt->rt_flags, RTF_CACHED))
698 			goto change;
699 
700 		rtfree(rt);
701 		rt = NULL;
702 
703 		error = rtrequest(RTM_ADD, &info, prio, &rt, tableid);
704 		if (error == 0)
705 			rt_setmetrics(rtm->rtm_inits, &rtm->rtm_rmx,
706 			    &rt->rt_rmx);
707 		else
708 			goto flush;
709 		break;
710 	case RTM_DELETE:
711 		rt = rtable_lookup(tableid, info.rti_info[RTAX_DST],
712 		    info.rti_info[RTAX_NETMASK], info.rti_info[RTAX_GATEWAY],
713 		    prio);
714 
715 		/*
716 		 * Invalidate the cache of automagically created and
717 		 * referenced L2 entries to make sure that ``rt_gwroute''
718 		 * pointer stays valid for other CPUs.
719 		 */
720 		if ((rt != NULL) && (ISSET(rt->rt_flags, RTF_CACHED))) {
721 			ifp = if_get(rt->rt_ifidx);
722 			KASSERT(ifp != NULL);
723 			ifp->if_rtrequest(ifp, RTM_INVALIDATE, rt);
724 			if_put(ifp);
725 			/* Reset the MTU of the gateway route. */
726 			rtable_walk(tableid, rt_key(rt)->sa_family,
727 			    route_cleargateway, rt);
728 			break;
729 		}
730 
731 		/*
732 		 * Make sure that local routes are only modified by the
733 		 * kernel.
734 		 */
735 		if ((rt != NULL) &&
736 		    ISSET(rt->rt_flags, RTF_LOCAL|RTF_BROADCAST)) {
737 			error = EINVAL;
738 			break;
739 		}
740 
741 		rtfree(rt);
742 		rt = NULL;
743 
744 		error = rtrequest(RTM_DELETE, &info, prio, &rt, tableid);
745 		if (error != 0)
746 			goto flush;
747 		break;
748 	case RTM_CHANGE:
749 	case RTM_LOCK:
750 		rt = rtable_lookup(tableid, info.rti_info[RTAX_DST],
751 		    info.rti_info[RTAX_NETMASK], info.rti_info[RTAX_GATEWAY],
752 		    prio);
753 #ifndef SMALL_KERNEL
754 		/*
755 		 * If we got multipath routes, we require users to specify
756 		 * a matching gateway.
757 		 */
758 		if ((rt != NULL) && ISSET(rt->rt_flags, RTF_MPATH) &&
759 		    (info.rti_info[RTAX_GATEWAY] == NULL)) {
760 			rtfree(rt);
761 			rt = NULL;
762 		}
763 #endif
764 		/*
765 		 * If RTAX_GATEWAY is the argument we're trying to
766 		 * change, try to find a compatible route.
767 		 */
768 		if ((rt == NULL) && (info.rti_info[RTAX_GATEWAY] != NULL) &&
769 		    (rtm->rtm_type == RTM_CHANGE)) {
770 			rt = rtable_lookup(tableid, info.rti_info[RTAX_DST],
771 			    info.rti_info[RTAX_NETMASK], NULL, prio);
772 #ifndef SMALL_KERNEL
773 			/* Ensure we don't pick a multipath one. */
774 			if ((rt != NULL) && ISSET(rt->rt_flags, RTF_MPATH)) {
775 				rtfree(rt);
776 				rt = NULL;
777 			}
778 #endif
779 		}
780 
781 		if (rt == NULL) {
782 			error = ESRCH;
783 			goto flush;
784 		}
785 
786 		/*
787 		 * RTM_CHANGE/LOCK need a perfect match.
788 		 */
789 		plen = rtable_satoplen(info.rti_info[RTAX_DST]->sa_family,
790 		    info.rti_info[RTAX_NETMASK]);
791 		if (rt_plen(rt) != plen ) {
792 			error = ESRCH;
793 			goto flush;
794 		}
795 
796 		switch (rtm->rtm_type) {
797 		case RTM_CHANGE:
798 			if (info.rti_info[RTAX_GATEWAY] != NULL)
799 				if (rt->rt_gateway == NULL ||
800 				    bcmp(rt->rt_gateway,
801 				    info.rti_info[RTAX_GATEWAY],
802 				    info.rti_info[RTAX_GATEWAY]->sa_len)) {
803 					newgate = 1;
804 				}
805 			/*
806 			 * Check reachable gateway before changing the route.
807 			 * New gateway could require new ifaddr, ifp;
808 			 * flags may also be different; ifp may be specified
809 			 * by ll sockaddr when protocol address is ambiguous.
810 			 */
811 			if (newgate || info.rti_info[RTAX_IFP] != NULL ||
812 			    info.rti_info[RTAX_IFA] != NULL) {
813 				if ((error = rt_getifa(&info, tableid)) != 0)
814 					goto flush;
815 				ifa = info.rti_ifa;
816 				if (rt->rt_ifa != ifa) {
817 					ifp = if_get(rt->rt_ifidx);
818 					KASSERT(ifp != NULL);
819 					ifp->if_rtrequest(ifp, RTM_DELETE, rt);
820 					ifafree(rt->rt_ifa);
821 					if_put(ifp);
822 
823 					ifa->ifa_refcnt++;
824 					rt->rt_ifa = ifa;
825 					rt->rt_ifidx = ifa->ifa_ifp->if_index;
826 #ifndef SMALL_KERNEL
827 					/* recheck link state after ifp change*/
828 					rt_if_linkstate_change(rt, ifa->ifa_ifp,
829 					    tableid);
830 #endif
831 				}
832 			}
833 change:
834 			if (info.rti_info[RTAX_GATEWAY] != NULL && (error =
835 			    rt_setgate(rt, info.rti_info[RTAX_GATEWAY],
836 			    tableid)))
837 				goto flush;
838 #ifdef MPLS
839 			if ((rtm->rtm_flags & RTF_MPLS) &&
840 			    info.rti_info[RTAX_SRC] != NULL) {
841 				struct rt_mpls *rt_mpls;
842 
843 				psa_mpls = (struct sockaddr_mpls *)
844 				    info.rti_info[RTAX_SRC];
845 
846 				if (rt->rt_llinfo == NULL) {
847 					rt->rt_llinfo =
848 					    malloc(sizeof(struct rt_mpls),
849 					    M_TEMP, M_NOWAIT|M_ZERO);
850 				}
851 				if (rt->rt_llinfo == NULL) {
852 					error = ENOMEM;
853 					goto flush;
854 				}
855 
856 				rt_mpls = (struct rt_mpls *)rt->rt_llinfo;
857 
858 				if (psa_mpls != NULL) {
859 					rt_mpls->mpls_label =
860 					    psa_mpls->smpls_label;
861 				}
862 
863 				rt_mpls->mpls_operation = info.rti_mpls;
864 
865 				/* XXX: set experimental bits */
866 
867 				rt->rt_flags |= RTF_MPLS;
868 			} else if (newgate || ((rtm->rtm_fmask & RTF_MPLS) &&
869 			    !(rtm->rtm_flags & RTF_MPLS))) {
870 				/* if gateway changed remove MPLS information */
871 				if (rt->rt_llinfo != NULL &&
872 				    rt->rt_flags & RTF_MPLS) {
873 					free(rt->rt_llinfo, M_TEMP, 0);
874 					rt->rt_llinfo = NULL;
875 					rt->rt_flags &= ~RTF_MPLS;
876 				}
877 			}
878 #endif
879 
880 #ifdef BFD
881 			if (ISSET(rtm->rtm_flags, RTF_BFD)) {
882 				if ((error = bfdset(rt)))
883 					goto flush;
884 			} else if (!ISSET(rtm->rtm_flags, RTF_BFD) &&
885 			    ISSET(rtm->rtm_fmask, RTF_BFD)) {
886 				bfdclear(rt);
887 			}
888 #endif
889 
890 			/* Hack to allow some flags to be toggled */
891 			if (rtm->rtm_fmask)
892 				rt->rt_flags =
893 				    (rt->rt_flags & ~rtm->rtm_fmask) |
894 				    (rtm->rtm_flags & rtm->rtm_fmask);
895 
896 			rt_setmetrics(rtm->rtm_inits, &rtm->rtm_rmx,
897 			    &rt->rt_rmx);
898 
899 			ifp = if_get(rt->rt_ifidx);
900 			KASSERT(ifp != NULL);
901 			ifp->if_rtrequest(ifp, RTM_ADD, rt);
902 			if_put(ifp);
903 
904 			if (info.rti_info[RTAX_LABEL] != NULL) {
905 				char *rtlabel = ((struct sockaddr_rtlabel *)
906 				    info.rti_info[RTAX_LABEL])->sr_label;
907 				rtlabel_unref(rt->rt_labelid);
908 				rt->rt_labelid = rtlabel_name2id(rtlabel);
909 			}
910 			if_group_routechange(info.rti_info[RTAX_DST],
911 			    info.rti_info[RTAX_NETMASK]);
912 			/* FALLTHROUGH */
913 		case RTM_LOCK:
914 			rt->rt_rmx.rmx_locks &= ~(rtm->rtm_inits);
915 			rt->rt_rmx.rmx_locks |=
916 			    (rtm->rtm_inits & rtm->rtm_rmx.rmx_locks);
917 			break;
918 		}
919 		break;
920 	case RTM_GET:
921 		rt = rtable_lookup(tableid, info.rti_info[RTAX_DST],
922 		    info.rti_info[RTAX_NETMASK], info.rti_info[RTAX_GATEWAY],
923 		    prio);
924 		if (rt == NULL) {
925 			error = ESRCH;
926 			goto flush;
927 		}
928 		break;
929 	}
930 
931 	/*
932 	 * From here on these vars need to be valid
933 	 * rt, rtm, error, so, m, tableid, sa_family
934 	 *
935 	 * Other notes:
936 	 * - to end up here previous calls passed OK, error is most probably 0
937 	 * - error cases take the flush route or in bad cases fail
938 	 * - fail does not report the message back but just fails the call
939 	 *   if the message is not valid then fail should be used
940 	 */
941 
942 	type = rtm->rtm_type;
943 	seq = rtm->rtm_seq;
944 	free(rtm, M_RTABLE, 0);
945 	rtm = rt_report(rt, type, seq, tableid);
946 	if (rtm == NULL) {
947 		error = ENOBUFS;
948 		goto fail;
949 	}
950 
951 flush:
952 	if (rt)
953 		rtfree(rt);
954 	if (rtm) {
955 		if (error)
956 			rtm->rtm_errno = error;
957 		else {
958 			rtm->rtm_flags |= RTF_DONE;
959 		}
960 	}
961 
962 	/*
963 	 * Check to see if we don't want our own messages.
964 	 */
965 	if (!(so->so_options & SO_USELOOPBACK)) {
966 		if (route_cb.any_count <= 1) {
967 			/* no other listener and no loopback of messages */
968 fail:
969 			free(rtm, M_RTABLE, 0);
970 			m_freem(m);
971 			return (error);
972 		}
973 		/* There is another listener, so construct message */
974 		rp = sotorawcb(so);
975 		rp->rcb_proto.sp_family = 0; /* Avoid us */
976 	}
977 	if (rtm) {
978 		if (m_copyback(m, 0, rtm->rtm_msglen, rtm, M_NOWAIT)) {
979 			m_freem(m);
980 			m = NULL;
981 		} else if (m->m_pkthdr.len > rtm->rtm_msglen)
982 			m_adj(m, rtm->rtm_msglen - m->m_pkthdr.len);
983 		free(rtm, M_RTABLE, 0);
984 	}
985 	if (m)
986 		route_input(m, info.rti_info[RTAX_DST] ?
987 		    info.rti_info[RTAX_DST]->sa_family : AF_UNSPEC);
988 	if (rp)
989 		rp->rcb_proto.sp_family = PF_ROUTE; /* Readd us */
990 
991 	return (error);
992 }
993 
994 int
995 route_cleargateway(struct rtentry *rt, void *arg, unsigned int rtableid)
996 {
997 	struct rtentry *nhrt = arg;
998 
999 	if (ISSET(rt->rt_flags, RTF_GATEWAY) && rt->rt_gwroute == nhrt &&
1000 	    !ISSET(rt->rt_locks, RTV_MTU))
1001                 rt->rt_mtu = 0;
1002 
1003 	return (0);
1004 }
1005 
/*
 * Decide whether the ARP entry a user asks to insert (described by
 * info) may coexist with rt, the route already found for the same
 * destination.
 *
 * A given IP address may have at most two entries: a private one
 * (priv) and a public one (pub).
 *
 * Returns EEXIST on conflict and 0 otherwise.  When the new entry is
 * allowed next to an existing one, RTF_MPATH is added to
 * info->rti_flags.  Without ART the answer is always 0.
 */
int
route_arp_conflict(struct rtentry *rt, struct rt_addrinfo *info)
{
#ifdef ART
	int		 want_pub;

	/* Only link-layer requests for an AF_INET destination matter. */
	if (!(info->rti_flags & RTF_LLINFO))
		return (0);
	if (info->rti_info[RTAX_DST]->sa_family != AF_INET)
		return (0);

	/*
	 * Nothing to clash with: no route, not a link-layer entry, or a
	 * cached entry, which can simply be updated.
	 */
	if (rt == NULL || !ISSET(rt->rt_flags, RTF_LLINFO) ||
	    ISSET(rt->rt_flags, RTF_CACHED))
		return (0);

	/*
	 * Same destination, not cached: two "priv" or two "pub" entries
	 * conflict, and so does any addition once a second entry exists.
	 */
	want_pub = (info->rti_flags & RTF_ANNOUNCE);
	if (ISSET(rt->rt_flags, RTF_MPATH) ||
	    (ISSET(rt->rt_flags, RTF_ANNOUNCE) == want_pub))
		return (EEXIST);

	/* No conflict, but an entry exists, so multipath must be forced. */
	info->rti_flags |= RTF_MPATH;
#endif /* ART */
	return (0);
}
1043 
1044 void
1045 rt_setmetrics(u_long which, const struct rt_metrics *in,
1046     struct rt_kmetrics *out)
1047 {
1048 	int64_t expire;
1049 
1050 	if (which & RTV_MTU)
1051 		out->rmx_mtu = in->rmx_mtu;
1052 	if (which & RTV_EXPIRE) {
1053 		expire = in->rmx_expire;
1054 		if (expire != 0) {
1055 			expire -= time_second;
1056 			expire += time_uptime;
1057 		}
1058 
1059 		out->rmx_expire = expire;
1060 	}
1061 }
1062 
1063 void
1064 rt_getmetrics(const struct rt_kmetrics *in, struct rt_metrics *out)
1065 {
1066 	int64_t expire;
1067 
1068 	expire = in->rmx_expire;
1069 	if (expire != 0) {
1070 		expire -= time_uptime;
1071 		expire += time_second;
1072 	}
1073 
1074 	bzero(out, sizeof(*out));
1075 	out->rmx_locks = in->rmx_locks;
1076 	out->rmx_mtu = in->rmx_mtu;
1077 	out->rmx_expire = expire;
1078 	out->rmx_pksent = in->rmx_pksent;
1079 }
1080 
/*
 * ROUNDUP(a): round `a' up to the next multiple of sizeof(long); values
 * that are not greater than zero yield sizeof(long).
 *
 * ADVANCE(x, n): move pointer `x' forward by the rounded-up sa_len of the
 * sockaddr `n' points to.  Both arguments are parenthesized so that the
 * macro can safely be handed arbitrary lvalue/pointer expressions.
 */
#define ROUNDUP(a) \
	((a) > 0 ? (1 + (((a) - 1) | (sizeof(long) - 1))) : sizeof(long))
#define ADVANCE(x, n) ((x) += ROUNDUP((n)->sa_len))
1084 
1085 void
1086 rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo)
1087 {
1088 	struct sockaddr	*sa;
1089 	int		 i;
1090 
1091 	bzero(rtinfo->rti_info, sizeof(rtinfo->rti_info));
1092 	for (i = 0; (i < RTAX_MAX) && (cp < cplim); i++) {
1093 		if ((rtinfo->rti_addrs & (1 << i)) == 0)
1094 			continue;
1095 		rtinfo->rti_info[i] = sa = (struct sockaddr *)cp;
1096 		ADVANCE(cp, sa);
1097 	}
1098 }
1099 
1100 struct mbuf *
1101 rt_msg1(int type, struct rt_addrinfo *rtinfo)
1102 {
1103 	struct rt_msghdr	*rtm;
1104 	struct mbuf		*m;
1105 	int			 i;
1106 	struct sockaddr		*sa;
1107 	int			 len, dlen, hlen;
1108 
1109 	switch (type) {
1110 	case RTM_DELADDR:
1111 	case RTM_NEWADDR:
1112 		len = sizeof(struct ifa_msghdr);
1113 		break;
1114 	case RTM_IFINFO:
1115 		len = sizeof(struct if_msghdr);
1116 		break;
1117 	case RTM_IFANNOUNCE:
1118 		len = sizeof(struct if_announcemsghdr);
1119 		break;
1120 #ifdef BFD
1121 	case RTM_BFD:
1122 		len = sizeof(struct bfd_msghdr);
1123 		break;
1124 #endif
1125 	default:
1126 		len = sizeof(struct rt_msghdr);
1127 		break;
1128 	}
1129 	if (len > MCLBYTES)
1130 		panic("rt_msg1");
1131 	m = m_gethdr(M_DONTWAIT, MT_DATA);
1132 	if (m && len > MHLEN) {
1133 		MCLGET(m, M_DONTWAIT);
1134 		if ((m->m_flags & M_EXT) == 0) {
1135 			m_free(m);
1136 			m = NULL;
1137 		}
1138 	}
1139 	if (m == NULL)
1140 		return (m);
1141 	m->m_pkthdr.len = m->m_len = hlen = len;
1142 	m->m_pkthdr.ph_ifidx = 0;
1143 	rtm = mtod(m, struct rt_msghdr *);
1144 	bzero(rtm, len);
1145 	for (i = 0; i < RTAX_MAX; i++) {
1146 		if (rtinfo == NULL || (sa = rtinfo->rti_info[i]) == NULL)
1147 			continue;
1148 		rtinfo->rti_addrs |= (1 << i);
1149 		dlen = ROUNDUP(sa->sa_len);
1150 		if (m_copyback(m, len, dlen, sa, M_NOWAIT)) {
1151 			m_freem(m);
1152 			return (NULL);
1153 		}
1154 		len += dlen;
1155 	}
1156 	rtm->rtm_msglen = len;
1157 	rtm->rtm_hdrlen = hlen;
1158 	rtm->rtm_version = RTM_VERSION;
1159 	rtm->rtm_type = type;
1160 	return (m);
1161 }
1162 
1163 int
1164 rt_msg2(int type, int vers, struct rt_addrinfo *rtinfo, caddr_t cp,
1165     struct walkarg *w)
1166 {
1167 	int		i;
1168 	int		len, dlen, hlen, second_time = 0;
1169 	caddr_t		cp0;
1170 
1171 	rtinfo->rti_addrs = 0;
1172 again:
1173 	switch (type) {
1174 	case RTM_DELADDR:
1175 	case RTM_NEWADDR:
1176 		len = sizeof(struct ifa_msghdr);
1177 		break;
1178 	case RTM_IFINFO:
1179 		len = sizeof(struct if_msghdr);
1180 		break;
1181 	default:
1182 		len = sizeof(struct rt_msghdr);
1183 		break;
1184 	}
1185 	hlen = len;
1186 	if ((cp0 = cp) != NULL)
1187 		cp += len;
1188 	for (i = 0; i < RTAX_MAX; i++) {
1189 		struct sockaddr *sa;
1190 
1191 		if ((sa = rtinfo->rti_info[i]) == NULL)
1192 			continue;
1193 		rtinfo->rti_addrs |= (1 << i);
1194 		dlen = ROUNDUP(sa->sa_len);
1195 		if (cp) {
1196 			bcopy(sa, cp, (size_t)dlen);
1197 			cp += dlen;
1198 		}
1199 		len += dlen;
1200 	}
1201 	/* align message length to the next natural boundary */
1202 	len = ALIGN(len);
1203 	if (cp == 0 && w != NULL && !second_time) {
1204 		struct walkarg *rw = w;
1205 
1206 		rw->w_needed += len;
1207 		if (rw->w_needed <= 0 && rw->w_where) {
1208 			if (rw->w_tmemsize < len) {
1209 				free(rw->w_tmem, M_RTABLE, 0);
1210 				rw->w_tmem = malloc(len, M_RTABLE, M_NOWAIT);
1211 				if (rw->w_tmem)
1212 					rw->w_tmemsize = len;
1213 			}
1214 			if (rw->w_tmem) {
1215 				cp = rw->w_tmem;
1216 				second_time = 1;
1217 				goto again;
1218 			} else
1219 				rw->w_where = 0;
1220 		}
1221 	}
1222 	if (cp && w)		/* clear the message header */
1223 		bzero(cp0, hlen);
1224 
1225 	if (cp) {
1226 		struct rt_msghdr *rtm = (struct rt_msghdr *)cp0;
1227 
1228 		rtm->rtm_version = RTM_VERSION;
1229 		rtm->rtm_type = type;
1230 		rtm->rtm_msglen = len;
1231 		rtm->rtm_hdrlen = hlen;
1232 	}
1233 	return (len);
1234 }
1235 
1236 /*
1237  * This routine is called to generate a message from the routing
1238  * socket indicating that a redirect has occurred, a routing lookup
1239  * has failed, or that a protocol has detected timeouts to a particular
1240  * destination.
1241  */
1242 void
1243 rt_missmsg(int type, struct rt_addrinfo *rtinfo, int flags, uint8_t prio,
1244     u_int ifidx, int error, u_int tableid)
1245 {
1246 	struct rt_msghdr	*rtm;
1247 	struct mbuf		*m;
1248 	struct sockaddr		*sa = rtinfo->rti_info[RTAX_DST];
1249 
1250 	if (route_cb.any_count == 0)
1251 		return;
1252 	m = rt_msg1(type, rtinfo);
1253 	if (m == NULL)
1254 		return;
1255 	rtm = mtod(m, struct rt_msghdr *);
1256 	rtm->rtm_flags = RTF_DONE | flags;
1257 	rtm->rtm_priority = prio;
1258 	rtm->rtm_errno = error;
1259 	rtm->rtm_tableid = tableid;
1260 	rtm->rtm_addrs = rtinfo->rti_addrs;
1261 	rtm->rtm_index = ifidx;
1262 	route_input(m, sa ? sa->sa_family : AF_UNSPEC);
1263 }
1264 
1265 /*
1266  * This routine is called to generate a message from the routing
1267  * socket indicating that the status of a network interface has changed.
1268  */
1269 void
1270 rt_ifmsg(struct ifnet *ifp)
1271 {
1272 	struct if_msghdr	*ifm;
1273 	struct mbuf		*m;
1274 
1275 	if (route_cb.any_count == 0)
1276 		return;
1277 	m = rt_msg1(RTM_IFINFO, NULL);
1278 	if (m == NULL)
1279 		return;
1280 	ifm = mtod(m, struct if_msghdr *);
1281 	ifm->ifm_index = ifp->if_index;
1282 	ifm->ifm_tableid = ifp->if_rdomain;
1283 	ifm->ifm_flags = ifp->if_flags;
1284 	ifm->ifm_xflags = ifp->if_xflags;
1285 	if_getdata(ifp, &ifm->ifm_data);
1286 	ifm->ifm_addrs = 0;
1287 	route_input(m, AF_UNSPEC);
1288 }
1289 
1290 /*
1291  * This is called to generate messages from the routing socket
1292  * indicating a network interface has had addresses associated with it.
1293  * if we ever reverse the logic and replace messages TO the routing
1294  * socket indicate a request to configure interfaces, then it will
1295  * be unnecessary as the routing socket will automatically generate
1296  * copies of it.
1297  */
1298 void
1299 rt_sendaddrmsg(struct rtentry *rt, int cmd, struct ifaddr *ifa)
1300 {
1301 	struct ifnet		*ifp = ifa->ifa_ifp;
1302 	struct mbuf		*m = NULL;
1303 	struct rt_addrinfo	 info;
1304 	struct ifa_msghdr	*ifam;
1305 
1306 	if (route_cb.any_count == 0)
1307 		return;
1308 
1309 	memset(&info, 0, sizeof(info));
1310 	info.rti_info[RTAX_IFA] = ifa->ifa_addr;
1311 	info.rti_info[RTAX_IFP] = sdltosa(ifp->if_sadl);
1312 	info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask;
1313 	info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
1314 	if ((m = rt_msg1(cmd, &info)) == NULL)
1315 		return;
1316 	ifam = mtod(m, struct ifa_msghdr *);
1317 	ifam->ifam_index = ifp->if_index;
1318 	ifam->ifam_metric = ifa->ifa_metric;
1319 	ifam->ifam_flags = ifa->ifa_flags;
1320 	ifam->ifam_addrs = info.rti_addrs;
1321 	ifam->ifam_tableid = ifp->if_rdomain;
1322 
1323 	route_input(m, ifa->ifa_addr ? ifa->ifa_addr->sa_family : AF_UNSPEC);
1324 }
1325 
1326 /*
1327  * This is called to generate routing socket messages indicating
1328  * network interface arrival and departure.
1329  */
1330 void
1331 rt_ifannouncemsg(struct ifnet *ifp, int what)
1332 {
1333 	struct if_announcemsghdr	*ifan;
1334 	struct mbuf			*m;
1335 
1336 	if (route_cb.any_count == 0)
1337 		return;
1338 	m = rt_msg1(RTM_IFANNOUNCE, NULL);
1339 	if (m == NULL)
1340 		return;
1341 	ifan = mtod(m, struct if_announcemsghdr *);
1342 	ifan->ifan_index = ifp->if_index;
1343 	strlcpy(ifan->ifan_name, ifp->if_xname, sizeof(ifan->ifan_name));
1344 	ifan->ifan_what = what;
1345 	route_input(m, AF_UNSPEC);
1346 }
1347 
1348 #ifdef BFD
1349 /*
1350  * This is used to generate routing socket messages indicating
1351  * the state of a BFD session.
1352  */
1353 void
1354 rt_bfdmsg(struct bfd_config *bfd)
1355 {
1356 	struct bfd_msghdr	*bfdm;
1357 	struct sockaddr_bfd	 sa_bfd;
1358 	struct mbuf		*m;
1359 	struct rt_addrinfo	 info;
1360 
1361 	if (route_cb.any_count == 0)
1362 		return;
1363 	memset(&info, 0, sizeof(info));
1364 	info.rti_info[RTAX_DST] = rt_key(bfd->bc_rt);
1365 	info.rti_info[RTAX_IFA] = bfd->bc_rt->rt_ifa->ifa_addr;
1366 
1367 	m = rt_msg1(RTM_BFD, &info);
1368 	if (m == NULL)
1369 		return;
1370 	bfdm = mtod(m, struct bfd_msghdr *);
1371 	bfdm->bm_addrs = info.rti_addrs;
1372 
1373 	bfd2sa(bfd->bc_rt, &sa_bfd);
1374 	memcpy(&bfdm->bm_sa, &sa_bfd, sizeof(sa_bfd));
1375 
1376 	route_input(m, info.rti_info[RTAX_DST]->sa_family);
1377 }
1378 #endif /* BFD */
1379 
1380 /*
1381  * This is used in dumping the kernel table via sysctl().
1382  */
1383 int
1384 sysctl_dumpentry(struct rtentry *rt, void *v, unsigned int id)
1385 {
1386 	struct walkarg		*w = v;
1387 	int			 error = 0, size;
1388 	struct rt_addrinfo	 info;
1389 	struct ifnet		*ifp;
1390 #ifdef BFD
1391 	struct sockaddr_bfd	 sa_bfd;
1392 #endif
1393 #ifdef MPLS
1394 	struct sockaddr_mpls	 sa_mpls;
1395 #endif
1396 	struct sockaddr_rtlabel	 sa_rl;
1397 	struct sockaddr_in6	 sa_mask;
1398 
1399 	if (w->w_op == NET_RT_FLAGS && !(rt->rt_flags & w->w_arg))
1400 		return 0;
1401 	if (w->w_op == NET_RT_DUMP && w->w_arg) {
1402 		u_int8_t prio = w->w_arg & RTP_MASK;
1403 		if (w->w_arg < 0) {
1404 			prio = (-w->w_arg) & RTP_MASK;
1405 			/* Show all routes that are not this priority */
1406 			if (prio == (rt->rt_priority & RTP_MASK))
1407 				return 0;
1408 		} else {
1409 			if (prio != (rt->rt_priority & RTP_MASK) &&
1410 			    prio != RTP_ANY)
1411 				return 0;
1412 		}
1413 	}
1414 	bzero(&info, sizeof(info));
1415 	info.rti_info[RTAX_DST] = rt_key(rt);
1416 	info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
1417 	info.rti_info[RTAX_NETMASK] = rt_plen2mask(rt, &sa_mask);
1418 	ifp = if_get(rt->rt_ifidx);
1419 	if (ifp != NULL) {
1420 		info.rti_info[RTAX_IFP] = sdltosa(ifp->if_sadl);
1421 		info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
1422 		if (ifp->if_flags & IFF_POINTOPOINT)
1423 			info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr;
1424 	}
1425 	if_put(ifp);
1426 	info.rti_info[RTAX_LABEL] = rtlabel_id2sa(rt->rt_labelid, &sa_rl);
1427 #ifdef BFD
1428 	if (rt->rt_flags & RTF_BFD)
1429 		info.rti_info[RTAX_BFD] = bfd2sa(rt, &sa_bfd);
1430 #endif
1431 #ifdef MPLS
1432 	if (rt->rt_flags & RTF_MPLS) {
1433 		bzero(&sa_mpls, sizeof(sa_mpls));
1434 		sa_mpls.smpls_family = AF_MPLS;
1435 		sa_mpls.smpls_len = sizeof(sa_mpls);
1436 		sa_mpls.smpls_label = ((struct rt_mpls *)
1437 		    rt->rt_llinfo)->mpls_label;
1438 		info.rti_info[RTAX_SRC] = (struct sockaddr *)&sa_mpls;
1439 		info.rti_mpls = ((struct rt_mpls *)
1440 		    rt->rt_llinfo)->mpls_operation;
1441 	}
1442 #endif
1443 
1444 	size = rt_msg2(RTM_GET, RTM_VERSION, &info, NULL, w);
1445 	if (w->w_where && w->w_tmem && w->w_needed <= 0) {
1446 		struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem;
1447 
1448 		rtm->rtm_pid = curproc->p_p->ps_pid;
1449 		rtm->rtm_flags = rt->rt_flags;
1450 		rtm->rtm_priority = rt->rt_priority & RTP_MASK;
1451 		rt_getmetrics(&rt->rt_rmx, &rtm->rtm_rmx);
1452 		/* Do not account the routing table's reference. */
1453 		rtm->rtm_rmx.rmx_refcnt = rt->rt_refcnt - 1;
1454 		rtm->rtm_index = rt->rt_ifidx;
1455 		rtm->rtm_addrs = info.rti_addrs;
1456 		rtm->rtm_tableid = id;
1457 #ifdef MPLS
1458 		rtm->rtm_mpls = info.rti_mpls;
1459 #endif
1460 		if ((error = copyout(rtm, w->w_where, size)) != 0)
1461 			w->w_where = NULL;
1462 		else
1463 			w->w_where += size;
1464 	}
1465 	return (error);
1466 }
1467 
1468 int
1469 sysctl_iflist(int af, struct walkarg *w)
1470 {
1471 	struct ifnet		*ifp;
1472 	struct ifaddr		*ifa;
1473 	struct rt_addrinfo	 info;
1474 	int			 len, error = 0;
1475 
1476 	bzero(&info, sizeof(info));
1477 	TAILQ_FOREACH(ifp, &ifnet, if_list) {
1478 		if (w->w_arg && w->w_arg != ifp->if_index)
1479 			continue;
1480 		/* Copy the link-layer address first */
1481 		info.rti_info[RTAX_IFP] = sdltosa(ifp->if_sadl);
1482 		len = rt_msg2(RTM_IFINFO, RTM_VERSION, &info, 0, w);
1483 		if (w->w_where && w->w_tmem && w->w_needed <= 0) {
1484 			struct if_msghdr *ifm;
1485 
1486 			ifm = (struct if_msghdr *)w->w_tmem;
1487 			ifm->ifm_index = ifp->if_index;
1488 			ifm->ifm_tableid = ifp->if_rdomain;
1489 			ifm->ifm_flags = ifp->if_flags;
1490 			if_getdata(ifp, &ifm->ifm_data);
1491 			ifm->ifm_addrs = info.rti_addrs;
1492 			error = copyout(ifm, w->w_where, len);
1493 			if (error)
1494 				return (error);
1495 			w->w_where += len;
1496 		}
1497 		info.rti_info[RTAX_IFP] = NULL;
1498 		TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) {
1499 			KASSERT(ifa->ifa_addr->sa_family != AF_LINK);
1500 			if (af && af != ifa->ifa_addr->sa_family)
1501 				continue;
1502 			info.rti_info[RTAX_IFA] = ifa->ifa_addr;
1503 			info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask;
1504 			info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
1505 			len = rt_msg2(RTM_NEWADDR, RTM_VERSION, &info, 0, w);
1506 			if (w->w_where && w->w_tmem && w->w_needed <= 0) {
1507 				struct ifa_msghdr *ifam;
1508 
1509 				ifam = (struct ifa_msghdr *)w->w_tmem;
1510 				ifam->ifam_index = ifa->ifa_ifp->if_index;
1511 				ifam->ifam_flags = ifa->ifa_flags;
1512 				ifam->ifam_metric = ifa->ifa_metric;
1513 				ifam->ifam_addrs = info.rti_addrs;
1514 				error = copyout(w->w_tmem, w->w_where, len);
1515 				if (error)
1516 					return (error);
1517 				w->w_where += len;
1518 			}
1519 		}
1520 		info.rti_info[RTAX_IFA] = info.rti_info[RTAX_NETMASK] =
1521 		    info.rti_info[RTAX_BRD] = NULL;
1522 	}
1523 	return (0);
1524 }
1525 
1526 int
1527 sysctl_ifnames(struct walkarg *w)
1528 {
1529 	struct if_nameindex_msg ifn;
1530 	struct ifnet *ifp;
1531 	int error = 0;
1532 
1533 	/* XXX ignore tableid for now */
1534 	TAILQ_FOREACH(ifp, &ifnet, if_list) {
1535 		if (w->w_arg && w->w_arg != ifp->if_index)
1536 			continue;
1537 		w->w_needed += sizeof(ifn);
1538 		if (w->w_where && w->w_needed <= 0) {
1539 
1540 			memset(&ifn, 0, sizeof(ifn));
1541 			ifn.if_index = ifp->if_index;
1542 			strlcpy(ifn.if_name, ifp->if_xname,
1543 			    sizeof(ifn.if_name));
1544 			error = copyout(&ifn, w->w_where, sizeof(ifn));
1545 			if (error)
1546 				return (error);
1547 			w->w_where += sizeof(ifn);
1548 		}
1549 	}
1550 
1551 	return (0);
1552 }
1553 
1554 int
1555 sysctl_rtable(int *name, u_int namelen, void *where, size_t *given, void *new,
1556     size_t newlen)
1557 {
1558 	int			 i, error = EINVAL;
1559 	u_char			 af;
1560 	struct walkarg		 w;
1561 	struct rt_tableinfo	 tableinfo;
1562 	u_int			 tableid = 0;
1563 
1564 	NET_ASSERT_LOCKED();
1565 
1566 	if (new)
1567 		return (EPERM);
1568 	if (namelen < 3 || namelen > 4)
1569 		return (EINVAL);
1570 	af = name[0];
1571 	bzero(&w, sizeof(w));
1572 	w.w_where = where;
1573 	w.w_given = *given;
1574 	w.w_needed = 0 - w.w_given;
1575 	w.w_op = name[1];
1576 	w.w_arg = name[2];
1577 
1578 	if (namelen == 4) {
1579 		tableid = name[3];
1580 		if (!rtable_exists(tableid))
1581 			return (ENOENT);
1582 	} else
1583 		tableid = curproc->p_p->ps_rtableid;
1584 
1585 	switch (w.w_op) {
1586 	case NET_RT_DUMP:
1587 	case NET_RT_FLAGS:
1588 		for (i = 1; i <= AF_MAX; i++) {
1589 			if (af != 0 && af != i)
1590 				continue;
1591 
1592 			error = rtable_walk(tableid, i, sysctl_dumpentry, &w);
1593 			if (error == EAFNOSUPPORT)
1594 				error = 0;
1595 			if (error)
1596 				break;
1597 		}
1598 		break;
1599 
1600 	case NET_RT_IFLIST:
1601 		error = sysctl_iflist(af, &w);
1602 		break;
1603 
1604 	case NET_RT_STATS:
1605 		return (sysctl_rtable_rtstat(where, given, new));
1606 	case NET_RT_TABLE:
1607 		tableid = w.w_arg;
1608 		if (!rtable_exists(tableid))
1609 			return (ENOENT);
1610 		tableinfo.rti_tableid = tableid;
1611 		tableinfo.rti_domainid = rtable_l2(tableid);
1612 		error = sysctl_rdstruct(where, given, new,
1613 		    &tableinfo, sizeof(tableinfo));
1614 		return (error);
1615 	case NET_RT_IFNAMES:
1616 		error = sysctl_ifnames(&w);
1617 		break;
1618 	}
1619 	free(w.w_tmem, M_RTABLE, 0);
1620 	w.w_needed += w.w_given;
1621 	if (where) {
1622 		*given = w.w_where - (caddr_t)where;
1623 		if (*given < w.w_needed)
1624 			return (ENOMEM);
1625 	} else
1626 		*given = (11 * w.w_needed) / 10;
1627 
1628 	return (error);
1629 }
1630 
1631 int
1632 sysctl_rtable_rtstat(void *oldp, size_t *oldlenp, void *newp)
1633 {
1634 	extern struct cpumem *rtcounters;
1635 	uint64_t counters[rts_ncounters];
1636 	struct rtstat rtstat;
1637 	uint32_t *words = (uint32_t *)&rtstat;
1638 	int i;
1639 
1640 	CTASSERT(sizeof(rtstat) == (nitems(counters) * sizeof(uint32_t)));
1641 
1642 	counters_read(rtcounters, counters, nitems(counters));
1643 
1644 	for (i = 0; i < nitems(counters); i++)
1645 		words[i] = (uint32_t)counters[i];
1646 
1647 	return (sysctl_rdstruct(oldp, oldlenp, newp, &rtstat, sizeof(rtstat)));
1648 }
1649 
1650 /*
1651  * Definitions of protocols supported in the ROUTE domain.
1652  */
1653 
1654 extern	struct domain routedomain;		/* or at least forward */
1655 
1656 struct protosw routesw[] = {
1657 { SOCK_RAW,	&routedomain,	0,		PR_ATOMIC|PR_ADDR|PR_WANTRCVD,
1658   0,		route_output,	0,		route_ctloutput,
1659   route_usrreq,
1660   raw_init,	0,		0,		0,
1661   sysctl_rtable,
1662 }
1663 };
1664 
1665 struct domain routedomain =
1666     { PF_ROUTE, "route", route_init, 0, 0,
1667       routesw, &routesw[nitems(routesw)] };
1668