xref: /openbsd-src/sys/net/rtsock.c (revision d59bb9942320b767f2a19aaa7690c8c6e30b724c)
1 /*	$OpenBSD: rtsock.c,v 1.228 2017/03/03 15:48:02 bluhm Exp $	*/
2 /*	$NetBSD: rtsock.c,v 1.18 1996/03/29 00:32:10 cgd Exp $	*/
3 
4 /*
5  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the project nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 /*
34  * Copyright (c) 1988, 1991, 1993
35  *	The Regents of the University of California.  All rights reserved.
36  *
37  * Redistribution and use in source and binary forms, with or without
38  * modification, are permitted provided that the following conditions
39  * are met:
40  * 1. Redistributions of source code must retain the above copyright
41  *    notice, this list of conditions and the following disclaimer.
42  * 2. Redistributions in binary form must reproduce the above copyright
43  *    notice, this list of conditions and the following disclaimer in the
44  *    documentation and/or other materials provided with the distribution.
45  * 3. Neither the name of the University nor the names of its contributors
46  *    may be used to endorse or promote products derived from this software
47  *    without specific prior written permission.
48  *
49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59  * SUCH DAMAGE.
60  *
61  *	@(#)rtsock.c	8.6 (Berkeley) 2/11/95
62  */
63 
64 #include <sys/param.h>
65 #include <sys/systm.h>
66 #include <sys/proc.h>
67 #include <sys/sysctl.h>
68 #include <sys/mbuf.h>
69 #include <sys/socket.h>
70 #include <sys/socketvar.h>
71 #include <sys/domain.h>
72 #include <sys/protosw.h>
73 
74 #include <net/if.h>
75 #include <net/if_dl.h>
76 #include <net/if_var.h>
77 #include <net/route.h>
78 #include <net/raw_cb.h>
79 
80 #include <netinet/in.h>
81 
82 #ifdef MPLS
83 #include <netmpls/mpls.h>
84 #endif
85 #ifdef BFD
86 #include <net/bfd.h>
87 #endif
88 
89 #include <sys/stdarg.h>
90 #include <sys/kernel.h>
91 #include <sys/timeout.h>
92 
/*
 * Fixed PF_ROUTE socket addresses.  route_src is installed as the foreign
 * address (rcb_faddr) of every routing socket on attach and is passed as the
 * sender address to sbappendaddr(); route_dst is what route_input() compares
 * a socket's local address (rcb_laddr) against.
 */
struct sockaddr		route_dst = { 2, PF_ROUTE, };
struct sockaddr		route_src = { 2, PF_ROUTE, };
95 
/*
 * Argument block handed to rt_msg2() and the sysctl_iflist()/sysctl_ifnames()
 * helpers (see the prototypes below).
 * NOTE(review): field meanings are inferred from their names only -- w_op and
 * w_arg look like the sysctl operation and its argument, w_where/w_given the
 * user buffer and its size, w_needed the space required, w_tmem/w_tmemsize a
 * scratch buffer; confirm against the sysctl code.
 */
struct walkarg {
	int	w_op, w_arg, w_given, w_needed, w_tmemsize;
	caddr_t	w_where, w_tmem;
};
100 
/* Routing socket entry points and helpers for route_output(). */
int	route_ctloutput(int, struct socket *, int, int, struct mbuf *);
void	route_input(struct mbuf *m0, sa_family_t);
int	route_arp_conflict(struct rtentry *, struct rt_addrinfo *);
int	route_cleargateway(struct rtentry *, void *, unsigned int);

/* Routing message construction (rt_msg1/rt_msg2) and address parsing. */
struct mbuf	*rt_msg1(int, struct rt_addrinfo *);
int		 rt_msg2(int, int, struct rt_addrinfo *, caddr_t,
		     struct walkarg *);
void		 rt_xaddrs(caddr_t, caddr_t, struct rt_addrinfo *);

void		 rt_proposalmsg(struct rt_msghdr *, struct rt_addrinfo *);

/* sysctl helpers; all of them take or fill a struct walkarg or user buffer. */
int		 sysctl_iflist(int, struct walkarg *);
int		 sysctl_ifnames(struct walkarg *);
int		 sysctl_rtable_rtstat(void *, size_t *, void *);

/* Used by route_output(): returns -1 when an RTM_PROPOSAL is rejected. */
int		 validate_proposal(struct rt_addrinfo *);
118 
/*
 * Per-socket control block of a routing socket.  The embedded rawcb must
 * stay the first member: the code in this file casts freely between
 * struct rawcb * and struct routecb *.
 */
struct routecb {
	struct rawcb	rcb;		/* generic raw socket state */
	struct timeout	timeout;	/* retries RTM_DESYNC delivery */
	unsigned int	msgfilter;	/* bitmask of (1 << rtm_type); 0 = all */
	unsigned int	flags;		/* ROUTECB_FLAG_* */
	u_int		rtableid;	/* table filter, RTABLE_ANY = no filter */
};
/* Fetch the routecb hanging off a socket's so_pcb pointer. */
#define	sotoroutecb(so)	((struct routecb *)(so)->so_pcb)
127 
/*
 * Counters of attached routing sockets, maintained by route_usrreq() on
 * PRU_ATTACH/PRU_DETACH: one per protocol (address family) the socket was
 * opened with, plus any_count for the total.
 */
struct route_cb {
	int		ip_count;	/* sockets opened for AF_INET */
	int		ip6_count;	/* sockets opened for AF_INET6 */
	int		mpls_count;	/* sockets opened for AF_MPLS */
	int		any_count;	/* all routing sockets */
};

struct route_cb route_cb;
136 
/*
 * These flags and timeout are used for indicating to userland (via a
 * RTM_DESYNC msg) when the route socket has overflowed and messages
 * have been lost.
 *
 * DESYNC is set by route_input() when a message could not be queued and is
 * cleared by rt_senddesync() once the RTM_DESYNC notice has been queued.
 * FLUSH is set together with DESYNC and cleared on PRU_RCVD once the
 * receive buffer has been drained completely.
 */
#define ROUTECB_FLAG_DESYNC	0x1	/* Route socket out of memory */
#define ROUTECB_FLAG_FLUSH	0x2	/* Wait until socket is empty before
					   queueing more packets */

/* Delay before rt_senddesync() retries a failed RTM_DESYNC delivery. */
#define ROUTE_DESYNC_RESEND_TIMEOUT	(hz / 5)	/* In hz */

/* Timeout callback; the argument is the socket's struct rawcb pointer. */
void	rt_senddesync(void *);
149 
150 int
151 route_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
152     struct mbuf *control, struct proc *p)
153 {
154 	struct rawcb	*rp;
155 	struct routecb	*rop;
156 	int		 af;
157 	int		 error = 0;
158 
159 	NET_ASSERT_LOCKED();
160 
161 	rp = sotorawcb(so);
162 
163 	switch (req) {
164 	case PRU_ATTACH:
165 		/*
166 		 * use the rawcb but allocate a routecb, this
167 		 * code does not care about the additional fields
168 		 * and works directly on the raw socket.
169 		 */
170 		rop = malloc(sizeof(struct routecb), M_PCB, M_WAITOK|M_ZERO);
171 		rp = &rop->rcb;
172 		so->so_pcb = rp;
173 		/* Init the timeout structure */
174 		timeout_set(&((struct routecb *)rp)->timeout, rt_senddesync, rp);
175 		/*
176 		 * Don't call raw_usrreq() in the attach case, because
177 		 * we want to allow non-privileged processes to listen
178 		 * on and send "safe" commands to the routing socket.
179 		 */
180 		if (curproc == 0)
181 			error = EACCES;
182 		else
183 			error = raw_attach(so, (int)(long)nam);
184 		if (error) {
185 			free(rop, M_PCB, sizeof(struct routecb));
186 			return (error);
187 		}
188 		rop->rtableid = curproc->p_p->ps_rtableid;
189 		af = rp->rcb_proto.sp_protocol;
190 		if (af == AF_INET)
191 			route_cb.ip_count++;
192 		else if (af == AF_INET6)
193 			route_cb.ip6_count++;
194 #ifdef MPLS
195 		else if (af == AF_MPLS)
196 			route_cb.mpls_count++;
197 #endif
198 		rp->rcb_faddr = &route_src;
199 		route_cb.any_count++;
200 		soisconnected(so);
201 		so->so_options |= SO_USELOOPBACK;
202 		break;
203 
204 	case PRU_RCVD:
205 		rop = (struct routecb *)rp;
206 
207 		/*
208 		 * If we are in a FLUSH state, check if the buffer is
209 		 * empty so that we can clear the flag.
210 		 */
211 		if (((rop->flags & ROUTECB_FLAG_FLUSH) != 0) &&
212 		    ((sbspace(&rp->rcb_socket->so_rcv) ==
213 		    rp->rcb_socket->so_rcv.sb_hiwat)))
214 			rop->flags &= ~ROUTECB_FLAG_FLUSH;
215 		break;
216 
217 	case PRU_DETACH:
218 		if (rp) {
219 			timeout_del(&((struct routecb *)rp)->timeout);
220 			af = rp->rcb_proto.sp_protocol;
221 			if (af == AF_INET)
222 				route_cb.ip_count--;
223 			else if (af == AF_INET6)
224 				route_cb.ip6_count--;
225 #ifdef MPLS
226 			else if (af == AF_MPLS)
227 				route_cb.mpls_count--;
228 #endif
229 			route_cb.any_count--;
230 		}
231 		/* FALLTHROUGH */
232 	default:
233 		error = raw_usrreq(so, req, m, nam, control, p);
234 	}
235 
236 	return (error);
237 }
238 
239 int
240 route_ctloutput(int op, struct socket *so, int level, int optname,
241     struct mbuf *m)
242 {
243 	struct routecb *rop = sotoroutecb(so);
244 	int error = 0;
245 	unsigned int tid;
246 
247 	if (level != AF_ROUTE) {
248 		error = EINVAL;
249 		if (op == PRCO_SETOPT && m)
250 			m_free(m);
251 		return (error);
252 	}
253 
254 	switch (op) {
255 	case PRCO_SETOPT:
256 		switch (optname) {
257 		case ROUTE_MSGFILTER:
258 			if (m == NULL || m->m_len != sizeof(unsigned int))
259 				error = EINVAL;
260 			else
261 				rop->msgfilter = *mtod(m, unsigned int *);
262 			break;
263 		case ROUTE_TABLEFILTER:
264 			if (m == NULL || m->m_len != sizeof(unsigned int)) {
265 				error = EINVAL;
266 				break;
267 			}
268 			tid = *mtod(m, unsigned int *);
269 			if (tid != RTABLE_ANY && !rtable_exists(tid))
270 				error = ENOENT;
271 			else
272 				rop->rtableid = tid;
273 			break;
274 		default:
275 			error = ENOPROTOOPT;
276 			break;
277 		}
278 		m_free(m);
279 		break;
280 	case PRCO_GETOPT:
281 		switch (optname) {
282 		case ROUTE_MSGFILTER:
283 			m->m_len = sizeof(unsigned int);
284 			*mtod(m, unsigned int *) = rop->msgfilter;
285 			break;
286 		case ROUTE_TABLEFILTER:
287 			m->m_len = sizeof(unsigned int);
288 			*mtod(m, unsigned int *) = rop->rtableid;
289 			break;
290 		default:
291 			error = ENOPROTOOPT;
292 			break;
293 		}
294 	}
295 	return (error);
296 }
297 
298 void
299 rt_senddesync(void *data)
300 {
301 	struct rawcb	*rp;
302 	struct routecb	*rop;
303 	struct mbuf	*desync_mbuf;
304 	int		 s;
305 
306 	rp = (struct rawcb *)data;
307 	rop = (struct routecb *)rp;
308 
309 	/* If we are in a DESYNC state, try to send a RTM_DESYNC packet */
310 	if ((rop->flags & ROUTECB_FLAG_DESYNC) == 0)
311 		return;
312 
313 	/*
314 	 * If we fail to alloc memory or if sbappendaddr()
315 	 * fails, re-add timeout and try again.
316 	 */
317 	desync_mbuf = rt_msg1(RTM_DESYNC, NULL);
318 	if (desync_mbuf != NULL) {
319 		s = splsoftnet();
320 		if (sbappendaddr(&rp->rcb_socket->so_rcv, &route_src,
321 		    desync_mbuf, NULL) != 0) {
322 			rop->flags &= ~ROUTECB_FLAG_DESYNC;
323 			sorwakeup(rp->rcb_socket);
324 			splx(s);
325 			return;
326 		}
327 		splx(s);
328 		m_freem(desync_mbuf);
329 	}
330 	/* Re-add timeout to try sending msg again */
331 	timeout_add(&rop->timeout, ROUTE_DESYNC_RESEND_TIMEOUT);
332 }
333 
334 void
335 route_input(struct mbuf *m0, sa_family_t sa_family)
336 {
337 	struct rawcb *rp;
338 	struct routecb *rop;
339 	struct rt_msghdr *rtm;
340 	struct mbuf *m = m0;
341 	int s, sockets = 0;
342 	struct socket *last = NULL;
343 	struct sockaddr *sosrc, *sodst;
344 
345 	sosrc = &route_src;
346 	sodst = &route_dst;
347 
348 	/* ensure that we can access the rtm_type via mtod() */
349 	if (m->m_len < offsetof(struct rt_msghdr, rtm_type) + 1) {
350 		m_freem(m);
351 		return;
352 	}
353 
354 	LIST_FOREACH(rp, &rawcb, rcb_list) {
355 		if (rp->rcb_socket->so_state & SS_CANTRCVMORE)
356 			continue;
357 		if (rp->rcb_proto.sp_family != PF_ROUTE)
358 			continue;
359 		/*
360 		 * If route socket is bound to an address family only send
361 		 * messages that match the address family. Address family
362 		 * agnostic messages are always send.
363 		 */
364 		if (rp->rcb_proto.sp_protocol != AF_UNSPEC &&
365 		    sa_family != AF_UNSPEC &&
366 		    rp->rcb_proto.sp_protocol != sa_family)
367 			continue;
368 		/*
369 		 * We assume the lower level routines have
370 		 * placed the address in a canonical format
371 		 * suitable for a structure comparison.
372 		 *
373 		 * Note that if the lengths are not the same
374 		 * the comparison will fail at the first byte.
375 		 */
376 #define	equal(a1, a2) \
377   (bcmp((caddr_t)(a1), (caddr_t)(a2), a1->sa_len) == 0)
378 		if (rp->rcb_laddr && !equal(rp->rcb_laddr, sodst))
379 			continue;
380 		if (rp->rcb_faddr && !equal(rp->rcb_faddr, sosrc))
381 			continue;
382 
383 		/* filter messages that the process does not want */
384 		rop = (struct routecb *)rp;
385 		rtm = mtod(m, struct rt_msghdr *);
386 		/* but RTM_DESYNC can't be filtered */
387 		if (rtm->rtm_type != RTM_DESYNC && rop->msgfilter != 0 &&
388 		    !(rop->msgfilter & (1 << rtm->rtm_type)))
389 			continue;
390 		switch (rtm->rtm_type) {
391 		case RTM_IFANNOUNCE:
392 		case RTM_DESYNC:
393 			/* no tableid */
394 			break;
395 		case RTM_RESOLVE:
396 		case RTM_NEWADDR:
397 		case RTM_DELADDR:
398 		case RTM_IFINFO:
399 			/* check against rdomain id */
400 			if (rop->rtableid != RTABLE_ANY &&
401 			    rtable_l2(rop->rtableid) != rtm->rtm_tableid)
402 				continue;
403 			break;
404 		default:
405 			/* check against rtable id */
406 			if (rop->rtableid != RTABLE_ANY &&
407 			    rop->rtableid != rtm->rtm_tableid)
408 				continue;
409 			break;
410 		}
411 
412 		/*
413 		 * Check to see if the flush flag is set. If so, don't queue
414 		 * any more messages until the flag is cleared.
415 		 */
416 		if ((rop->flags & ROUTECB_FLAG_FLUSH) != 0)
417 			continue;
418 
419 		if (last) {
420 			struct mbuf *n;
421 			if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) {
422 				s = splsoftnet();
423 				if (sbspace(&last->so_rcv) < (2 * MSIZE) ||
424 				    sbappendaddr(&last->so_rcv, sosrc,
425 				    n, (struct mbuf *)NULL) == 0) {
426 					/*
427 					 * Flag socket as desync'ed and
428 					 * flush required
429 					 */
430 					sotoroutecb(last)->flags |=
431 					    ROUTECB_FLAG_DESYNC |
432 					    ROUTECB_FLAG_FLUSH;
433 					rt_senddesync((void *) sotorawcb(last));
434 					m_freem(n);
435 				} else {
436 					sorwakeup(last);
437 					sockets++;
438 				}
439 				splx(s);
440 			}
441 		}
442 		last = rp->rcb_socket;
443 	}
444 	if (last) {
445 		s = splsoftnet();
446 		if (sbspace(&last->so_rcv) < (2 * MSIZE) ||
447 		    sbappendaddr(&last->so_rcv, sosrc,
448 		    m, (struct mbuf *)NULL) == 0) {
449 			/* Flag socket as desync'ed and flush required */
450 			sotoroutecb(last)->flags |=
451 			    ROUTECB_FLAG_DESYNC | ROUTECB_FLAG_FLUSH;
452 			rt_senddesync((void *) sotorawcb(last));
453 			m_freem(m);
454 		} else {
455 			sorwakeup(last);
456 			sockets++;
457 		}
458 		splx(s);
459 	} else
460 		m_freem(m);
461 }
462 
463 struct rt_msghdr *
464 rt_report(struct rtentry *rt, u_char type, int seq, int tableid)
465 {
466 	struct rt_msghdr	*rtm;
467 	struct rt_addrinfo	 info;
468 	struct sockaddr_rtlabel	 sa_rl;
469 	struct sockaddr_in6	 sa_mask;
470 #ifdef BFD
471 	struct sockaddr_bfd	 sa_bfd;
472 #endif
473 #ifdef MPLS
474 	struct sockaddr_mpls	 sa_mpls;
475 #endif
476 	struct ifnet		*ifp = NULL;
477 	int			 len;
478 
479 	bzero(&info, sizeof(info));
480 	info.rti_info[RTAX_DST] = rt_key(rt);
481 	info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
482 	info.rti_info[RTAX_NETMASK] = rt_plen2mask(rt, &sa_mask);
483 	info.rti_info[RTAX_LABEL] = rtlabel_id2sa(rt->rt_labelid, &sa_rl);
484 #ifdef BFD
485 	if (rt->rt_flags & RTF_BFD)
486 		info.rti_info[RTAX_BFD] = bfd2sa(rt, &sa_bfd);
487 #endif
488 #ifdef MPLS
489 	if (rt->rt_flags & RTF_MPLS) {
490 		bzero(&sa_mpls, sizeof(sa_mpls));
491 		sa_mpls.smpls_family = AF_MPLS;
492 		sa_mpls.smpls_len = sizeof(sa_mpls);
493 		sa_mpls.smpls_label = ((struct rt_mpls *)
494 		    rt->rt_llinfo)->mpls_label;
495 		info.rti_info[RTAX_SRC] = (struct sockaddr *)&sa_mpls;
496 		info.rti_mpls = ((struct rt_mpls *)
497 		    rt->rt_llinfo)->mpls_operation;
498 	}
499 #endif
500 	ifp = if_get(rt->rt_ifidx);
501 	if (ifp != NULL) {
502 		info.rti_info[RTAX_IFP] = sdltosa(ifp->if_sadl);
503 		info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
504 		if (ifp->if_flags & IFF_POINTOPOINT)
505 			info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr;
506 	}
507 	if_put(ifp);
508 	/* RTAX_GENMASK, RTAX_AUTHOR, RTAX_SRCMASK ignored */
509 
510 	/* build new route message */
511 	len = rt_msg2(type, RTM_VERSION, &info, NULL, NULL);
512 	rtm = malloc(len, M_RTABLE, M_WAITOK | M_ZERO);
513 
514 	rt_msg2(type, RTM_VERSION, &info, (caddr_t)rtm, NULL);
515 	rtm->rtm_type = type;
516 	rtm->rtm_index = rt->rt_ifidx;
517 	rtm->rtm_tableid = tableid;
518 	rtm->rtm_priority = rt->rt_priority & RTP_MASK;
519 	rtm->rtm_flags = rt->rt_flags;
520 	rtm->rtm_pid = curproc->p_p->ps_pid;
521 	rtm->rtm_seq = seq;
522 	rt_getmetrics(&rt->rt_rmx, &rtm->rtm_rmx);
523 	rtm->rtm_addrs = info.rti_addrs;
524 #ifdef MPLS
525 	rtm->rtm_mpls = info.rti_mpls;
526 #endif
527 	return rtm;
528 }
529 
530 int
531 route_output(struct mbuf *m, struct socket *so, struct sockaddr *dstaddr,
532     struct mbuf *control)
533 {
534 	struct rt_msghdr	*rtm = NULL;
535 	struct rtentry		*rt = NULL;
536 	struct rt_addrinfo	 info;
537 	int			 plen, len, seq, newgate = 0, error = 0;
538 	struct ifnet		*ifp = NULL;
539 	struct ifaddr		*ifa = NULL;
540 	struct rawcb		*rp = NULL;
541 #ifdef MPLS
542 	struct sockaddr_mpls	*psa_mpls;
543 #endif
544 	u_int			 tableid;
545 	u_int8_t		 prio;
546 	u_char			 vers, type;
547 
548 	if (m == NULL || ((m->m_len < sizeof(int32_t)) &&
549 	    (m = m_pullup(m, sizeof(int32_t))) == 0))
550 		return (ENOBUFS);
551 	if ((m->m_flags & M_PKTHDR) == 0)
552 		panic("route_output");
553 	len = m->m_pkthdr.len;
554 	if (len < offsetof(struct rt_msghdr, rtm_type) + 1 ||
555 	    len != mtod(m, struct rt_msghdr *)->rtm_msglen) {
556 		error = EINVAL;
557 		goto fail;
558 	}
559 	vers = mtod(m, struct rt_msghdr *)->rtm_version;
560 	switch (vers) {
561 	case RTM_VERSION:
562 		if (len < sizeof(struct rt_msghdr)) {
563 			error = EINVAL;
564 			goto fail;
565 		}
566 		if (len > RTM_MAXSIZE) {
567 			error = EMSGSIZE;
568 			goto fail;
569 		}
570 		rtm = malloc(len, M_RTABLE, M_WAITOK);
571 		m_copydata(m, 0, len, (caddr_t)rtm);
572 		break;
573 	default:
574 		error = EPROTONOSUPPORT;
575 		goto fail;
576 	}
577 	rtm->rtm_pid = curproc->p_p->ps_pid;
578 	if (rtm->rtm_hdrlen == 0)	/* old client */
579 		rtm->rtm_hdrlen = sizeof(struct rt_msghdr);
580 	if (len < rtm->rtm_hdrlen) {
581 		error = EINVAL;
582 		goto fail;
583 	}
584 
585 	/* Verify that the caller is sending an appropriate message early */
586 	switch (rtm->rtm_type) {
587 	case RTM_ADD:
588 	case RTM_DELETE:
589 	case RTM_GET:
590 	case RTM_CHANGE:
591 	case RTM_LOCK:
592 	case RTM_PROPOSAL:
593 		break;
594 	default:
595 		error = EOPNOTSUPP;
596 		goto fail;
597 	}
598 
599 	/*
600 	 * Verify that the caller has the appropriate privilege; RTM_GET
601 	 * is the only operation the non-superuser is allowed.
602 	 */
603 	if (rtm->rtm_type != RTM_GET && suser(curproc, 0) != 0) {
604 		error = EACCES;
605 		goto fail;
606 	}
607 	tableid = rtm->rtm_tableid;
608 	if (!rtable_exists(tableid)) {
609 		if (rtm->rtm_type == RTM_ADD) {
610 			if ((error = rtable_add(tableid)) != 0)
611 				goto fail;
612 		} else {
613 			error = EINVAL;
614 			goto fail;
615 		}
616 	}
617 
618 
619 	/* Do not let userland play with kernel-only flags. */
620 	if ((rtm->rtm_flags & (RTF_LOCAL|RTF_BROADCAST)) != 0) {
621 		error = EINVAL;
622 		goto fail;
623 	}
624 
625 	/* make sure that kernel-only bits are not set */
626 	rtm->rtm_priority &= RTP_MASK;
627 	rtm->rtm_flags &= ~(RTF_DONE|RTF_CLONED|RTF_CACHED);
628 	rtm->rtm_fmask &= RTF_FMASK;
629 
630 	if (rtm->rtm_priority != 0) {
631 		if (rtm->rtm_priority > RTP_MAX ||
632 		    rtm->rtm_priority == RTP_LOCAL) {
633 			error = EINVAL;
634 			goto fail;
635 		}
636 		prio = rtm->rtm_priority;
637 	} else if (rtm->rtm_type != RTM_ADD)
638 		prio = RTP_ANY;
639 	else if (rtm->rtm_flags & RTF_STATIC)
640 		prio = 0;
641 	else
642 		prio = RTP_DEFAULT;
643 
644 	bzero(&info, sizeof(info));
645 	info.rti_addrs = rtm->rtm_addrs;
646 	rt_xaddrs(rtm->rtm_hdrlen + (caddr_t)rtm, len + (caddr_t)rtm, &info);
647 	info.rti_flags = rtm->rtm_flags;
648 	if (rtm->rtm_type != RTM_PROPOSAL &&
649 	   (info.rti_info[RTAX_DST] == NULL ||
650 	    info.rti_info[RTAX_DST]->sa_family >= AF_MAX ||
651 	    (info.rti_info[RTAX_GATEWAY] != NULL &&
652 	    info.rti_info[RTAX_GATEWAY]->sa_family >= AF_MAX) ||
653 	    info.rti_info[RTAX_GENMASK] != NULL)) {
654 		error = EINVAL;
655 		goto fail;
656 	}
657 #ifdef MPLS
658 	info.rti_mpls = rtm->rtm_mpls;
659 #endif
660 
661 	if (info.rti_info[RTAX_GATEWAY] != NULL &&
662 	    info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK &&
663 	    (info.rti_flags & RTF_CLONING) == 0) {
664 		info.rti_flags |= RTF_LLINFO;
665 	}
666 
667 	/*
668 	 * Do not use goto flush before this point since the message itself
669 	 * may be not consistent and could cause unexpected behaviour in other
670 	 * userland clients. Use goto fail instead.
671 	 */
672 
673 	/*
674 	 * Validate RTM_PROPOSAL and pass it along or error out.
675 	 */
676 	if (rtm->rtm_type == RTM_PROPOSAL) {
677 	       if (validate_proposal(&info) == -1) {
678 			error = EINVAL;
679 			goto fail;
680 	       }
681 	       goto flush;
682 	}
683 
684 	switch (rtm->rtm_type) {
685 	case RTM_ADD:
686 		if (info.rti_info[RTAX_GATEWAY] == NULL) {
687 			error = EINVAL;
688 			goto flush;
689 		}
690 
691 		rt = rtable_match(tableid, info.rti_info[RTAX_DST], NULL);
692 		if ((error = route_arp_conflict(rt, &info))) {
693 			rtfree(rt);
694 			rt = NULL;
695 			goto flush;
696 		}
697 
698 		/*
699 		 * We cannot go through a delete/create/insert cycle for
700 		 * cached route because this can lead to races in the
701 		 * receive path.  Instead we upade the L2 cache.
702 		 */
703 		if ((rt != NULL) && ISSET(rt->rt_flags, RTF_CACHED))
704 			goto change;
705 
706 		rtfree(rt);
707 		rt = NULL;
708 
709 		error = rtrequest(RTM_ADD, &info, prio, &rt, tableid);
710 		if (error == 0)
711 			rt_setmetrics(rtm->rtm_inits, &rtm->rtm_rmx,
712 			    &rt->rt_rmx);
713 		else
714 			goto flush;
715 		break;
716 	case RTM_DELETE:
717 		rt = rtable_lookup(tableid, info.rti_info[RTAX_DST],
718 		    info.rti_info[RTAX_NETMASK], info.rti_info[RTAX_GATEWAY],
719 		    prio);
720 
721 		/*
722 		 * Invalidate the cache of automagically created and
723 		 * referenced L2 entries to make sure that ``rt_gwroute''
724 		 * pointer stays valid for other CPUs.
725 		 */
726 		if ((rt != NULL) && (ISSET(rt->rt_flags, RTF_CACHED))) {
727 			ifp = if_get(rt->rt_ifidx);
728 			KASSERT(ifp != NULL);
729 			ifp->if_rtrequest(ifp, RTM_INVALIDATE, rt);
730 			if_put(ifp);
731 			/* Reset the MTU of the gateway route. */
732 			rtable_walk(tableid, rt_key(rt)->sa_family,
733 			    route_cleargateway, rt);
734 			break;
735 		}
736 
737 		/*
738 		 * Make sure that local routes are only modified by the
739 		 * kernel.
740 		 */
741 		if ((rt != NULL) &&
742 		    ISSET(rt->rt_flags, RTF_LOCAL|RTF_BROADCAST)) {
743 			error = EINVAL;
744 			break;
745 		}
746 
747 		rtfree(rt);
748 		rt = NULL;
749 
750 		error = rtrequest(RTM_DELETE, &info, prio, &rt, tableid);
751 		if (error != 0)
752 			goto flush;
753 		break;
754 	case RTM_CHANGE:
755 	case RTM_LOCK:
756 		rt = rtable_lookup(tableid, info.rti_info[RTAX_DST],
757 		    info.rti_info[RTAX_NETMASK], info.rti_info[RTAX_GATEWAY],
758 		    prio);
759 #ifndef SMALL_KERNEL
760 		/*
761 		 * If we got multipath routes, we require users to specify
762 		 * a matching gateway.
763 		 */
764 		if ((rt != NULL) && ISSET(rt->rt_flags, RTF_MPATH) &&
765 		    (info.rti_info[RTAX_GATEWAY] == NULL)) {
766 			rtfree(rt);
767 			rt = NULL;
768 		}
769 #endif
770 		/*
771 		 * If RTAX_GATEWAY is the argument we're trying to
772 		 * change, try to find a compatible route.
773 		 */
774 		if ((rt == NULL) && (info.rti_info[RTAX_GATEWAY] != NULL) &&
775 		    (rtm->rtm_type == RTM_CHANGE)) {
776 			rt = rtable_lookup(tableid, info.rti_info[RTAX_DST],
777 			    info.rti_info[RTAX_NETMASK], NULL, prio);
778 #ifndef SMALL_KERNEL
779 			/* Ensure we don't pick a multipath one. */
780 			if ((rt != NULL) && ISSET(rt->rt_flags, RTF_MPATH)) {
781 				rtfree(rt);
782 				rt = NULL;
783 			}
784 #endif
785 		}
786 
787 		if (rt == NULL) {
788 			error = ESRCH;
789 			goto flush;
790 		}
791 
792 		/*
793 		 * RTM_CHANGE/LOCK need a perfect match.
794 		 */
795 		plen = rtable_satoplen(info.rti_info[RTAX_DST]->sa_family,
796 		    info.rti_info[RTAX_NETMASK]);
797 		if (rt_plen(rt) != plen ) {
798 			error = ESRCH;
799 			goto flush;
800 		}
801 
802 		switch (rtm->rtm_type) {
803 		case RTM_CHANGE:
804 			if (info.rti_info[RTAX_GATEWAY] != NULL)
805 				if (rt->rt_gateway == NULL ||
806 				    bcmp(rt->rt_gateway,
807 				    info.rti_info[RTAX_GATEWAY],
808 				    info.rti_info[RTAX_GATEWAY]->sa_len)) {
809 					newgate = 1;
810 				}
811 			/*
812 			 * Check reachable gateway before changing the route.
813 			 * New gateway could require new ifaddr, ifp;
814 			 * flags may also be different; ifp may be specified
815 			 * by ll sockaddr when protocol address is ambiguous.
816 			 */
817 			if (newgate || info.rti_info[RTAX_IFP] != NULL ||
818 			    info.rti_info[RTAX_IFA] != NULL) {
819 				if ((error = rt_getifa(&info, tableid)) != 0)
820 					goto flush;
821 				ifa = info.rti_ifa;
822 				if (rt->rt_ifa != ifa) {
823 					ifp = if_get(rt->rt_ifidx);
824 					KASSERT(ifp != NULL);
825 					ifp->if_rtrequest(ifp, RTM_DELETE, rt);
826 					ifafree(rt->rt_ifa);
827 					if_put(ifp);
828 
829 					ifa->ifa_refcnt++;
830 					rt->rt_ifa = ifa;
831 					rt->rt_ifidx = ifa->ifa_ifp->if_index;
832 #ifndef SMALL_KERNEL
833 					/* recheck link state after ifp change*/
834 					rt_if_linkstate_change(rt, ifa->ifa_ifp,
835 					    tableid);
836 #endif
837 				}
838 			}
839 change:
840 			if (info.rti_info[RTAX_GATEWAY] != NULL && (error =
841 			    rt_setgate(rt, info.rti_info[RTAX_GATEWAY],
842 			    tableid)))
843 				goto flush;
844 #ifdef MPLS
845 			if ((rtm->rtm_flags & RTF_MPLS) &&
846 			    info.rti_info[RTAX_SRC] != NULL) {
847 				struct rt_mpls *rt_mpls;
848 
849 				psa_mpls = (struct sockaddr_mpls *)
850 				    info.rti_info[RTAX_SRC];
851 
852 				if (rt->rt_llinfo == NULL) {
853 					rt->rt_llinfo =
854 					    malloc(sizeof(struct rt_mpls),
855 					    M_TEMP, M_WAITOK | M_ZERO);
856 				}
857 
858 				rt_mpls = (struct rt_mpls *)rt->rt_llinfo;
859 
860 				if (psa_mpls != NULL) {
861 					rt_mpls->mpls_label =
862 					    psa_mpls->smpls_label;
863 				}
864 
865 				rt_mpls->mpls_operation = info.rti_mpls;
866 
867 				/* XXX: set experimental bits */
868 
869 				rt->rt_flags |= RTF_MPLS;
870 			} else if (newgate || ((rtm->rtm_fmask & RTF_MPLS) &&
871 			    !(rtm->rtm_flags & RTF_MPLS))) {
872 				/* if gateway changed remove MPLS information */
873 				if (rt->rt_llinfo != NULL &&
874 				    rt->rt_flags & RTF_MPLS) {
875 					free(rt->rt_llinfo, M_TEMP, 0);
876 					rt->rt_llinfo = NULL;
877 					rt->rt_flags &= ~RTF_MPLS;
878 				}
879 			}
880 #endif
881 
882 #ifdef BFD
883 			if (ISSET(rtm->rtm_flags, RTF_BFD)) {
884 				if ((error = bfdset(rt)))
885 					goto flush;
886 			} else if (!ISSET(rtm->rtm_flags, RTF_BFD) &&
887 			    ISSET(rtm->rtm_fmask, RTF_BFD)) {
888 				bfdclear(rt);
889 			}
890 #endif
891 
892 			/* Hack to allow some flags to be toggled */
893 			if (rtm->rtm_fmask)
894 				rt->rt_flags =
895 				    (rt->rt_flags & ~rtm->rtm_fmask) |
896 				    (rtm->rtm_flags & rtm->rtm_fmask);
897 
898 			rt_setmetrics(rtm->rtm_inits, &rtm->rtm_rmx,
899 			    &rt->rt_rmx);
900 
901 			ifp = if_get(rt->rt_ifidx);
902 			KASSERT(ifp != NULL);
903 			ifp->if_rtrequest(ifp, RTM_ADD, rt);
904 			if_put(ifp);
905 
906 			if (info.rti_info[RTAX_LABEL] != NULL) {
907 				char *rtlabel = ((struct sockaddr_rtlabel *)
908 				    info.rti_info[RTAX_LABEL])->sr_label;
909 				rtlabel_unref(rt->rt_labelid);
910 				rt->rt_labelid = rtlabel_name2id(rtlabel);
911 			}
912 			if_group_routechange(info.rti_info[RTAX_DST],
913 			    info.rti_info[RTAX_NETMASK]);
914 			/* FALLTHROUGH */
915 		case RTM_LOCK:
916 			rt->rt_rmx.rmx_locks &= ~(rtm->rtm_inits);
917 			rt->rt_rmx.rmx_locks |=
918 			    (rtm->rtm_inits & rtm->rtm_rmx.rmx_locks);
919 			break;
920 		}
921 		break;
922 	case RTM_GET:
923 		rt = rtable_lookup(tableid, info.rti_info[RTAX_DST],
924 		    info.rti_info[RTAX_NETMASK], info.rti_info[RTAX_GATEWAY],
925 		    prio);
926 		if (rt == NULL) {
927 			error = ESRCH;
928 			goto flush;
929 		}
930 		break;
931 	}
932 
933 	/*
934 	 * From here on these vars need to be valid
935 	 * rt, rtm, error, so, m, tableid, sa_family
936 	 *
937 	 * Other notes:
938 	 * - to end up here previous calls passed OK, error is most probably 0
939 	 * - error cases take the flush route or in bad cases fail
940 	 * - fail does not report the message back but just fails the call
941 	 *   if the message is not valid then fail should be used
942 	 */
943 
944 	type = rtm->rtm_type;
945 	seq = rtm->rtm_seq;
946 	free(rtm, M_RTABLE, 0);
947 	rtm = rt_report(rt, type, seq, tableid);
948 flush:
949 	rtfree(rt);
950 	if (error) {
951 		rtm->rtm_errno = error;
952 	} else {
953 		rtm->rtm_flags |= RTF_DONE;
954 	}
955 
956 	/*
957 	 * Check to see if we don't want our own messages.
958 	 */
959 	if (!(so->so_options & SO_USELOOPBACK)) {
960 		if (route_cb.any_count <= 1) {
961 			/* no other listener and no loopback of messages */
962 fail:
963 			free(rtm, M_RTABLE, 0);
964 			m_freem(m);
965 			return (error);
966 		}
967 		/* There is another listener, so construct message */
968 		rp = sotorawcb(so);
969 		rp->rcb_proto.sp_family = 0; /* Avoid us */
970 	}
971 	if (rtm) {
972 		if (m_copyback(m, 0, rtm->rtm_msglen, rtm, M_NOWAIT)) {
973 			m_freem(m);
974 			m = NULL;
975 		} else if (m->m_pkthdr.len > rtm->rtm_msglen)
976 			m_adj(m, rtm->rtm_msglen - m->m_pkthdr.len);
977 		free(rtm, M_RTABLE, 0);
978 	}
979 	if (m)
980 		route_input(m, info.rti_info[RTAX_DST] ?
981 		    info.rti_info[RTAX_DST]->sa_family : AF_UNSPEC);
982 	if (rp)
983 		rp->rcb_proto.sp_family = PF_ROUTE; /* Readd us */
984 
985 	return (error);
986 }
987 
988 int
989 route_cleargateway(struct rtentry *rt, void *arg, unsigned int rtableid)
990 {
991 	struct rtentry *nhrt = arg;
992 
993 	if (ISSET(rt->rt_flags, RTF_GATEWAY) && rt->rt_gwroute == nhrt &&
994 	    !ISSET(rt->rt_locks, RTV_MTU))
995                 rt->rt_mtu = 0;
996 
997 	return (0);
998 }
999 
/*
 * Decide whether a user request to add an ARP entry clashes with the
 * existing entry `rt' for the same IP address.
 *
 * At most two entries may exist per address: one private ("priv") and one
 * published ("pub", RTF_ANNOUNCE).  Returns EEXIST on conflict and 0
 * otherwise; when a second, non-conflicting entry is about to be added,
 * RTF_MPATH is forced in info->rti_flags.  Without ART this is a no-op.
 */
int
route_arp_conflict(struct rtentry *rt, struct rt_addrinfo *info)
{
#ifdef ART
	int		 want_pub = (info->rti_flags & RTF_ANNOUNCE);

	/* only AF_INET link-layer (ARP) requests are of interest */
	if (!(info->rti_flags & RTF_LLINFO) ||
	    info->rti_info[RTAX_DST]->sa_family != AF_INET)
		return (0);

	/*
	 * Nothing to clash with: no entry, not an L2 entry, or a cached
	 * entry, which can simply be updated.
	 */
	if (rt == NULL || !ISSET(rt->rt_flags, RTF_LLINFO) ||
	    ISSET(rt->rt_flags, RTF_CACHED))
		return (0);

	/*
	 * A second entry already exists (multipath), or the existing one
	 * is of the same kind ("priv" vs "pub") as the requested one.
	 */
	if (ISSET(rt->rt_flags, RTF_MPATH) ||
	    (ISSET(rt->rt_flags, RTF_ANNOUNCE) == want_pub))
		return (EEXIST);

	/* No conflict but an entry exists so we need to force mpath. */
	info->rti_flags |= RTF_MPATH;
#endif /* ART */
	return (0);
}
1037 
1038 void
1039 rt_setmetrics(u_long which, const struct rt_metrics *in,
1040     struct rt_kmetrics *out)
1041 {
1042 	int64_t expire;
1043 
1044 	if (which & RTV_MTU)
1045 		out->rmx_mtu = in->rmx_mtu;
1046 	if (which & RTV_EXPIRE) {
1047 		expire = in->rmx_expire;
1048 		if (expire != 0) {
1049 			expire -= time_second;
1050 			expire += time_uptime;
1051 		}
1052 
1053 		out->rmx_expire = expire;
1054 	}
1055 }
1056 
1057 void
1058 rt_getmetrics(const struct rt_kmetrics *in, struct rt_metrics *out)
1059 {
1060 	int64_t expire;
1061 
1062 	expire = in->rmx_expire;
1063 	if (expire != 0) {
1064 		expire -= time_uptime;
1065 		expire += time_second;
1066 	}
1067 
1068 	bzero(out, sizeof(*out));
1069 	out->rmx_locks = in->rmx_locks;
1070 	out->rmx_mtu = in->rmx_mtu;
1071 	out->rmx_expire = expire;
1072 	out->rmx_pksent = in->rmx_pksent;
1073 }
1074 
/*
 * ROUNDUP(a): round the length `a' up to the next multiple of
 * sizeof(long); a length of zero or less yields sizeof(long).  The
 * bit trick relies on sizeof(long) being a power of two.  `a' is
 * evaluated more than once, so it must not have side effects.
 *
 * ADVANCE(x, n): move pointer `x' past the sockaddr `n', padding
 * included.  The expansion is fully parenthesized so the macro can be
 * used inside a larger expression.
 */
#define ROUNDUP(a) \
	((a) > 0 ? (1 + (((a) - 1) | (sizeof(long) - 1))) : sizeof(long))
#define ADVANCE(x, n) ((x) += ROUNDUP((n)->sa_len))
1078 
1079 void
1080 rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo)
1081 {
1082 	struct sockaddr	*sa;
1083 	int		 i;
1084 
1085 	bzero(rtinfo->rti_info, sizeof(rtinfo->rti_info));
1086 	for (i = 0; (i < RTAX_MAX) && (cp < cplim); i++) {
1087 		if ((rtinfo->rti_addrs & (1 << i)) == 0)
1088 			continue;
1089 		rtinfo->rti_info[i] = sa = (struct sockaddr *)cp;
1090 		ADVANCE(cp, sa);
1091 	}
1092 }
1093 
1094 struct mbuf *
1095 rt_msg1(int type, struct rt_addrinfo *rtinfo)
1096 {
1097 	struct rt_msghdr	*rtm;
1098 	struct mbuf		*m;
1099 	int			 i;
1100 	struct sockaddr		*sa;
1101 	int			 len, dlen, hlen;
1102 
1103 	switch (type) {
1104 	case RTM_DELADDR:
1105 	case RTM_NEWADDR:
1106 		len = sizeof(struct ifa_msghdr);
1107 		break;
1108 	case RTM_IFINFO:
1109 		len = sizeof(struct if_msghdr);
1110 		break;
1111 	case RTM_IFANNOUNCE:
1112 		len = sizeof(struct if_announcemsghdr);
1113 		break;
1114 #ifdef BFD
1115 	case RTM_BFD:
1116 		len = sizeof(struct bfd_msghdr);
1117 		break;
1118 #endif
1119 	default:
1120 		len = sizeof(struct rt_msghdr);
1121 		break;
1122 	}
1123 	if (len > MCLBYTES)
1124 		panic("rt_msg1");
1125 	m = m_gethdr(M_DONTWAIT, MT_DATA);
1126 	if (m && len > MHLEN) {
1127 		MCLGET(m, M_DONTWAIT);
1128 		if ((m->m_flags & M_EXT) == 0) {
1129 			m_free(m);
1130 			m = NULL;
1131 		}
1132 	}
1133 	if (m == NULL)
1134 		return (m);
1135 	m->m_pkthdr.len = m->m_len = hlen = len;
1136 	m->m_pkthdr.ph_ifidx = 0;
1137 	rtm = mtod(m, struct rt_msghdr *);
1138 	bzero(rtm, len);
1139 	for (i = 0; i < RTAX_MAX; i++) {
1140 		if (rtinfo == NULL || (sa = rtinfo->rti_info[i]) == NULL)
1141 			continue;
1142 		rtinfo->rti_addrs |= (1 << i);
1143 		dlen = ROUNDUP(sa->sa_len);
1144 		if (m_copyback(m, len, dlen, sa, M_NOWAIT)) {
1145 			m_freem(m);
1146 			return (NULL);
1147 		}
1148 		len += dlen;
1149 	}
1150 	rtm->rtm_msglen = len;
1151 	rtm->rtm_hdrlen = hlen;
1152 	rtm->rtm_version = RTM_VERSION;
1153 	rtm->rtm_type = type;
1154 	return (m);
1155 }
1156 
1157 int
1158 rt_msg2(int type, int vers, struct rt_addrinfo *rtinfo, caddr_t cp,
1159     struct walkarg *w)
1160 {
1161 	int		i;
1162 	int		len, dlen, hlen, second_time = 0;
1163 	caddr_t		cp0;
1164 
1165 	rtinfo->rti_addrs = 0;
1166 again:
1167 	switch (type) {
1168 	case RTM_DELADDR:
1169 	case RTM_NEWADDR:
1170 		len = sizeof(struct ifa_msghdr);
1171 		break;
1172 	case RTM_IFINFO:
1173 		len = sizeof(struct if_msghdr);
1174 		break;
1175 	default:
1176 		len = sizeof(struct rt_msghdr);
1177 		break;
1178 	}
1179 	hlen = len;
1180 	if ((cp0 = cp) != NULL)
1181 		cp += len;
1182 	for (i = 0; i < RTAX_MAX; i++) {
1183 		struct sockaddr *sa;
1184 
1185 		if ((sa = rtinfo->rti_info[i]) == NULL)
1186 			continue;
1187 		rtinfo->rti_addrs |= (1 << i);
1188 		dlen = ROUNDUP(sa->sa_len);
1189 		if (cp) {
1190 			bcopy(sa, cp, (size_t)dlen);
1191 			cp += dlen;
1192 		}
1193 		len += dlen;
1194 	}
1195 	/* align message length to the next natural boundary */
1196 	len = ALIGN(len);
1197 	if (cp == 0 && w != NULL && !second_time) {
1198 		struct walkarg *rw = w;
1199 
1200 		rw->w_needed += len;
1201 		if (rw->w_needed <= 0 && rw->w_where) {
1202 			if (rw->w_tmemsize < len) {
1203 				free(rw->w_tmem, M_RTABLE, 0);
1204 				rw->w_tmem = malloc(len, M_RTABLE, M_NOWAIT);
1205 				if (rw->w_tmem)
1206 					rw->w_tmemsize = len;
1207 			}
1208 			if (rw->w_tmem) {
1209 				cp = rw->w_tmem;
1210 				second_time = 1;
1211 				goto again;
1212 			} else
1213 				rw->w_where = 0;
1214 		}
1215 	}
1216 	if (cp && w)		/* clear the message header */
1217 		bzero(cp0, hlen);
1218 
1219 	if (cp) {
1220 		struct rt_msghdr *rtm = (struct rt_msghdr *)cp0;
1221 
1222 		rtm->rtm_version = RTM_VERSION;
1223 		rtm->rtm_type = type;
1224 		rtm->rtm_msglen = len;
1225 		rtm->rtm_hdrlen = hlen;
1226 	}
1227 	return (len);
1228 }
1229 
1230 /*
1231  * This routine is called to generate a message from the routing
1232  * socket indicating that a redirect has occurred, a routing lookup
1233  * has failed, or that a protocol has detected timeouts to a particular
1234  * destination.
1235  */
1236 void
1237 rt_missmsg(int type, struct rt_addrinfo *rtinfo, int flags, uint8_t prio,
1238     u_int ifidx, int error, u_int tableid)
1239 {
1240 	struct rt_msghdr	*rtm;
1241 	struct mbuf		*m;
1242 	struct sockaddr		*sa = rtinfo->rti_info[RTAX_DST];
1243 
1244 	if (route_cb.any_count == 0)
1245 		return;
1246 	m = rt_msg1(type, rtinfo);
1247 	if (m == NULL)
1248 		return;
1249 	rtm = mtod(m, struct rt_msghdr *);
1250 	rtm->rtm_flags = RTF_DONE | flags;
1251 	rtm->rtm_priority = prio;
1252 	rtm->rtm_errno = error;
1253 	rtm->rtm_tableid = tableid;
1254 	rtm->rtm_addrs = rtinfo->rti_addrs;
1255 	rtm->rtm_index = ifidx;
1256 	route_input(m, sa ? sa->sa_family : AF_UNSPEC);
1257 }
1258 
1259 /*
1260  * This routine is called to generate a message from the routing
1261  * socket indicating that the status of a network interface has changed.
1262  */
1263 void
1264 rt_ifmsg(struct ifnet *ifp)
1265 {
1266 	struct if_msghdr	*ifm;
1267 	struct mbuf		*m;
1268 
1269 	if (route_cb.any_count == 0)
1270 		return;
1271 	m = rt_msg1(RTM_IFINFO, NULL);
1272 	if (m == NULL)
1273 		return;
1274 	ifm = mtod(m, struct if_msghdr *);
1275 	ifm->ifm_index = ifp->if_index;
1276 	ifm->ifm_tableid = ifp->if_rdomain;
1277 	ifm->ifm_flags = ifp->if_flags;
1278 	ifm->ifm_xflags = ifp->if_xflags;
1279 	if_getdata(ifp, &ifm->ifm_data);
1280 	ifm->ifm_addrs = 0;
1281 	route_input(m, AF_UNSPEC);
1282 }
1283 
1284 /*
1285  * This is called to generate messages from the routing socket
1286  * indicating a network interface has had addresses associated with it.
1287  * if we ever reverse the logic and replace messages TO the routing
1288  * socket indicate a request to configure interfaces, then it will
1289  * be unnecessary as the routing socket will automatically generate
1290  * copies of it.
1291  */
1292 void
1293 rt_sendaddrmsg(struct rtentry *rt, int cmd, struct ifaddr *ifa)
1294 {
1295 	struct ifnet		*ifp = ifa->ifa_ifp;
1296 	struct mbuf		*m = NULL;
1297 	struct rt_addrinfo	 info;
1298 	struct ifa_msghdr	*ifam;
1299 
1300 	if (route_cb.any_count == 0)
1301 		return;
1302 
1303 	memset(&info, 0, sizeof(info));
1304 	info.rti_info[RTAX_IFA] = ifa->ifa_addr;
1305 	info.rti_info[RTAX_IFP] = sdltosa(ifp->if_sadl);
1306 	info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask;
1307 	info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
1308 	if ((m = rt_msg1(cmd, &info)) == NULL)
1309 		return;
1310 	ifam = mtod(m, struct ifa_msghdr *);
1311 	ifam->ifam_index = ifp->if_index;
1312 	ifam->ifam_metric = ifa->ifa_metric;
1313 	ifam->ifam_flags = ifa->ifa_flags;
1314 	ifam->ifam_addrs = info.rti_addrs;
1315 	ifam->ifam_tableid = ifp->if_rdomain;
1316 
1317 	route_input(m, ifa->ifa_addr ? ifa->ifa_addr->sa_family : AF_UNSPEC);
1318 }
1319 
1320 /*
1321  * This is called to generate routing socket messages indicating
1322  * network interface arrival and departure.
1323  */
1324 void
1325 rt_ifannouncemsg(struct ifnet *ifp, int what)
1326 {
1327 	struct if_announcemsghdr	*ifan;
1328 	struct mbuf			*m;
1329 
1330 	if (route_cb.any_count == 0)
1331 		return;
1332 	m = rt_msg1(RTM_IFANNOUNCE, NULL);
1333 	if (m == NULL)
1334 		return;
1335 	ifan = mtod(m, struct if_announcemsghdr *);
1336 	ifan->ifan_index = ifp->if_index;
1337 	strlcpy(ifan->ifan_name, ifp->if_xname, sizeof(ifan->ifan_name));
1338 	ifan->ifan_what = what;
1339 	route_input(m, AF_UNSPEC);
1340 }
1341 
#ifdef BFD
/*
 * Generate a routing socket message (RTM_BFD) describing the state of
 * a BFD session.
 *
 * The message carries the key of the session's route as RTAX_DST and
 * the address of that route's ifa as RTAX_IFA; the sockaddr_bfd filled
 * in by bfd2sa() is copied into the header.  Nothing is sent when
 * route_cb.any_count is zero or when the message cannot be allocated.
 */
void
rt_bfdmsg(struct bfd_config *bfd)
{
	struct rt_addrinfo	 info;
	struct sockaddr_bfd	 state;
	struct bfd_msghdr	*hdr;
	struct mbuf		*m;

	if (route_cb.any_count == 0)
		return;

	memset(&info, 0, sizeof(info));
	info.rti_info[RTAX_DST] = rt_key(bfd->bc_rt);
	info.rti_info[RTAX_IFA] = bfd->bc_rt->rt_ifa->ifa_addr;

	if ((m = rt_msg1(RTM_BFD, &info)) == NULL)
		return;

	hdr = mtod(m, struct bfd_msghdr *);
	hdr->bm_addrs = info.rti_addrs;

	bfd2sa(bfd->bc_rt, &state);
	memcpy(&hdr->bm_sa, &state, sizeof(state));

	route_input(m, info.rti_info[RTAX_DST]->sa_family);
}
#endif /* BFD */
1373 
1374 /*
1375  * This is used in dumping the kernel table via sysctl().
1376  */
1377 int
1378 sysctl_dumpentry(struct rtentry *rt, void *v, unsigned int id)
1379 {
1380 	struct walkarg		*w = v;
1381 	int			 error = 0, size;
1382 	struct rt_addrinfo	 info;
1383 	struct ifnet		*ifp;
1384 #ifdef BFD
1385 	struct sockaddr_bfd	 sa_bfd;
1386 #endif
1387 #ifdef MPLS
1388 	struct sockaddr_mpls	 sa_mpls;
1389 #endif
1390 	struct sockaddr_rtlabel	 sa_rl;
1391 	struct sockaddr_in6	 sa_mask;
1392 
1393 	if (w->w_op == NET_RT_FLAGS && !(rt->rt_flags & w->w_arg))
1394 		return 0;
1395 	if (w->w_op == NET_RT_DUMP && w->w_arg) {
1396 		u_int8_t prio = w->w_arg & RTP_MASK;
1397 		if (w->w_arg < 0) {
1398 			prio = (-w->w_arg) & RTP_MASK;
1399 			/* Show all routes that are not this priority */
1400 			if (prio == (rt->rt_priority & RTP_MASK))
1401 				return 0;
1402 		} else {
1403 			if (prio != (rt->rt_priority & RTP_MASK) &&
1404 			    prio != RTP_ANY)
1405 				return 0;
1406 		}
1407 	}
1408 	bzero(&info, sizeof(info));
1409 	info.rti_info[RTAX_DST] = rt_key(rt);
1410 	info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
1411 	info.rti_info[RTAX_NETMASK] = rt_plen2mask(rt, &sa_mask);
1412 	ifp = if_get(rt->rt_ifidx);
1413 	if (ifp != NULL) {
1414 		info.rti_info[RTAX_IFP] = sdltosa(ifp->if_sadl);
1415 		info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
1416 		if (ifp->if_flags & IFF_POINTOPOINT)
1417 			info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr;
1418 	}
1419 	if_put(ifp);
1420 	info.rti_info[RTAX_LABEL] = rtlabel_id2sa(rt->rt_labelid, &sa_rl);
1421 #ifdef BFD
1422 	if (rt->rt_flags & RTF_BFD)
1423 		info.rti_info[RTAX_BFD] = bfd2sa(rt, &sa_bfd);
1424 #endif
1425 #ifdef MPLS
1426 	if (rt->rt_flags & RTF_MPLS) {
1427 		bzero(&sa_mpls, sizeof(sa_mpls));
1428 		sa_mpls.smpls_family = AF_MPLS;
1429 		sa_mpls.smpls_len = sizeof(sa_mpls);
1430 		sa_mpls.smpls_label = ((struct rt_mpls *)
1431 		    rt->rt_llinfo)->mpls_label;
1432 		info.rti_info[RTAX_SRC] = (struct sockaddr *)&sa_mpls;
1433 		info.rti_mpls = ((struct rt_mpls *)
1434 		    rt->rt_llinfo)->mpls_operation;
1435 	}
1436 #endif
1437 
1438 	size = rt_msg2(RTM_GET, RTM_VERSION, &info, NULL, w);
1439 	if (w->w_where && w->w_tmem && w->w_needed <= 0) {
1440 		struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem;
1441 
1442 		rtm->rtm_pid = curproc->p_p->ps_pid;
1443 		rtm->rtm_flags = rt->rt_flags;
1444 		rtm->rtm_priority = rt->rt_priority & RTP_MASK;
1445 		rt_getmetrics(&rt->rt_rmx, &rtm->rtm_rmx);
1446 		/* Do not account the routing table's reference. */
1447 		rtm->rtm_rmx.rmx_refcnt = rt->rt_refcnt - 1;
1448 		rtm->rtm_index = rt->rt_ifidx;
1449 		rtm->rtm_addrs = info.rti_addrs;
1450 		rtm->rtm_tableid = id;
1451 #ifdef MPLS
1452 		rtm->rtm_mpls = info.rti_mpls;
1453 #endif
1454 		if ((error = copyout(rtm, w->w_where, size)) != 0)
1455 			w->w_where = NULL;
1456 		else
1457 			w->w_where += size;
1458 	}
1459 	return (error);
1460 }
1461 
1462 int
1463 sysctl_iflist(int af, struct walkarg *w)
1464 {
1465 	struct ifnet		*ifp;
1466 	struct ifaddr		*ifa;
1467 	struct rt_addrinfo	 info;
1468 	int			 len, error = 0;
1469 
1470 	bzero(&info, sizeof(info));
1471 	TAILQ_FOREACH(ifp, &ifnet, if_list) {
1472 		if (w->w_arg && w->w_arg != ifp->if_index)
1473 			continue;
1474 		/* Copy the link-layer address first */
1475 		info.rti_info[RTAX_IFP] = sdltosa(ifp->if_sadl);
1476 		len = rt_msg2(RTM_IFINFO, RTM_VERSION, &info, 0, w);
1477 		if (w->w_where && w->w_tmem && w->w_needed <= 0) {
1478 			struct if_msghdr *ifm;
1479 
1480 			ifm = (struct if_msghdr *)w->w_tmem;
1481 			ifm->ifm_index = ifp->if_index;
1482 			ifm->ifm_tableid = ifp->if_rdomain;
1483 			ifm->ifm_flags = ifp->if_flags;
1484 			if_getdata(ifp, &ifm->ifm_data);
1485 			ifm->ifm_addrs = info.rti_addrs;
1486 			error = copyout(ifm, w->w_where, len);
1487 			if (error)
1488 				return (error);
1489 			w->w_where += len;
1490 		}
1491 		info.rti_info[RTAX_IFP] = NULL;
1492 		TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) {
1493 			KASSERT(ifa->ifa_addr->sa_family != AF_LINK);
1494 			if (af && af != ifa->ifa_addr->sa_family)
1495 				continue;
1496 			info.rti_info[RTAX_IFA] = ifa->ifa_addr;
1497 			info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask;
1498 			info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
1499 			len = rt_msg2(RTM_NEWADDR, RTM_VERSION, &info, 0, w);
1500 			if (w->w_where && w->w_tmem && w->w_needed <= 0) {
1501 				struct ifa_msghdr *ifam;
1502 
1503 				ifam = (struct ifa_msghdr *)w->w_tmem;
1504 				ifam->ifam_index = ifa->ifa_ifp->if_index;
1505 				ifam->ifam_flags = ifa->ifa_flags;
1506 				ifam->ifam_metric = ifa->ifa_metric;
1507 				ifam->ifam_addrs = info.rti_addrs;
1508 				error = copyout(w->w_tmem, w->w_where, len);
1509 				if (error)
1510 					return (error);
1511 				w->w_where += len;
1512 			}
1513 		}
1514 		info.rti_info[RTAX_IFA] = info.rti_info[RTAX_NETMASK] =
1515 		    info.rti_info[RTAX_BRD] = NULL;
1516 	}
1517 	return (0);
1518 }
1519 
1520 int
1521 sysctl_ifnames(struct walkarg *w)
1522 {
1523 	struct if_nameindex_msg ifn;
1524 	struct ifnet *ifp;
1525 	int error = 0;
1526 
1527 	/* XXX ignore tableid for now */
1528 	TAILQ_FOREACH(ifp, &ifnet, if_list) {
1529 		if (w->w_arg && w->w_arg != ifp->if_index)
1530 			continue;
1531 		w->w_needed += sizeof(ifn);
1532 		if (w->w_where && w->w_needed <= 0) {
1533 
1534 			memset(&ifn, 0, sizeof(ifn));
1535 			ifn.if_index = ifp->if_index;
1536 			strlcpy(ifn.if_name, ifp->if_xname,
1537 			    sizeof(ifn.if_name));
1538 			error = copyout(&ifn, w->w_where, sizeof(ifn));
1539 			if (error)
1540 				return (error);
1541 			w->w_where += sizeof(ifn);
1542 		}
1543 	}
1544 
1545 	return (0);
1546 }
1547 
1548 int
1549 sysctl_rtable(int *name, u_int namelen, void *where, size_t *given, void *new,
1550     size_t newlen)
1551 {
1552 	int			 i, error = EINVAL;
1553 	u_char			 af;
1554 	struct walkarg		 w;
1555 	struct rt_tableinfo	 tableinfo;
1556 	u_int			 tableid = 0;
1557 
1558 	NET_ASSERT_LOCKED();
1559 
1560 	if (new)
1561 		return (EPERM);
1562 	if (namelen < 3 || namelen > 4)
1563 		return (EINVAL);
1564 	af = name[0];
1565 	bzero(&w, sizeof(w));
1566 	w.w_where = where;
1567 	w.w_given = *given;
1568 	w.w_needed = 0 - w.w_given;
1569 	w.w_op = name[1];
1570 	w.w_arg = name[2];
1571 
1572 	if (namelen == 4) {
1573 		tableid = name[3];
1574 		if (!rtable_exists(tableid))
1575 			return (ENOENT);
1576 	} else
1577 		tableid = curproc->p_p->ps_rtableid;
1578 
1579 	switch (w.w_op) {
1580 	case NET_RT_DUMP:
1581 	case NET_RT_FLAGS:
1582 		for (i = 1; i <= AF_MAX; i++) {
1583 			if (af != 0 && af != i)
1584 				continue;
1585 
1586 			error = rtable_walk(tableid, i, sysctl_dumpentry, &w);
1587 			if (error == EAFNOSUPPORT)
1588 				error = 0;
1589 			if (error)
1590 				break;
1591 		}
1592 		break;
1593 
1594 	case NET_RT_IFLIST:
1595 		error = sysctl_iflist(af, &w);
1596 		break;
1597 
1598 	case NET_RT_STATS:
1599 		return (sysctl_rtable_rtstat(where, given, new));
1600 	case NET_RT_TABLE:
1601 		tableid = w.w_arg;
1602 		if (!rtable_exists(tableid))
1603 			return (ENOENT);
1604 		tableinfo.rti_tableid = tableid;
1605 		tableinfo.rti_domainid = rtable_l2(tableid);
1606 		error = sysctl_rdstruct(where, given, new,
1607 		    &tableinfo, sizeof(tableinfo));
1608 		return (error);
1609 	case NET_RT_IFNAMES:
1610 		error = sysctl_ifnames(&w);
1611 		break;
1612 	}
1613 	free(w.w_tmem, M_RTABLE, 0);
1614 	w.w_needed += w.w_given;
1615 	if (where) {
1616 		*given = w.w_where - (caddr_t)where;
1617 		if (*given < w.w_needed)
1618 			return (ENOMEM);
1619 	} else
1620 		*given = (11 * w.w_needed) / 10;
1621 
1622 	return (error);
1623 }
1624 
1625 int
1626 sysctl_rtable_rtstat(void *oldp, size_t *oldlenp, void *newp)
1627 {
1628 	extern struct cpumem *rtcounters;
1629 	uint64_t counters[rts_ncounters];
1630 	struct rtstat rtstat;
1631 	uint32_t *words = (uint32_t *)&rtstat;
1632 	int i;
1633 
1634 	CTASSERT(sizeof(rtstat) == (nitems(counters) * sizeof(uint32_t)));
1635 
1636 	counters_read(rtcounters, counters, nitems(counters));
1637 
1638 	for (i = 0; i < nitems(counters); i++)
1639 		words[i] = (uint32_t)counters[i];
1640 
1641 	return (sysctl_rdstruct(oldp, oldlenp, newp, &rtstat, sizeof(rtstat)));
1642 }
1643 
1644 int
1645 validate_proposal(struct rt_addrinfo *info)
1646 {
1647 	if (info->rti_addrs & ~(RTA_NETMASK | RTA_IFA | RTA_DNS | RTA_STATIC |
1648 	    RTA_SEARCH)) {
1649 		return -1;
1650 	}
1651 
1652 	if (ISSET(info->rti_addrs, RTA_NETMASK)) {
1653 		struct sockaddr *sa = info->rti_info[RTAX_NETMASK];
1654 		if (sa == NULL)
1655 			return -1;
1656 		switch (sa->sa_family) {
1657 		case AF_INET:
1658 			if (sa->sa_len != sizeof(struct sockaddr_in))
1659 				return -1;
1660 			break;
1661 		case AF_INET6:
1662 			if (sa->sa_len != sizeof(struct sockaddr_in6))
1663 				return -1;
1664 			break;
1665 		default:
1666 			return -1;
1667 		}
1668 	}
1669 
1670 	if (ISSET(info->rti_addrs, RTA_IFA)) {
1671 		struct sockaddr *sa = info->rti_info[RTAX_IFA];
1672 		if (sa == NULL)
1673 			return -1;
1674 		switch (sa->sa_family) {
1675 		case AF_INET:
1676 			if (sa->sa_len != sizeof(struct sockaddr_in))
1677 				return -1;
1678 			break;
1679 		case AF_INET6:
1680 			if (sa->sa_len != sizeof(struct sockaddr_in6))
1681 				return -1;
1682 			break;
1683 		default:
1684 			return -1;
1685 		}
1686 	}
1687 
1688 	if (ISSET(info->rti_addrs, RTA_DNS)) {
1689 		struct sockaddr_rtdns *rtdns =
1690 		    (struct sockaddr_rtdns *)info->rti_info[RTAX_DNS];
1691 		if (rtdns == NULL)
1692 			return -1;
1693 		if (rtdns->sr_len > sizeof(*rtdns))
1694 			return -1;
1695 		if (rtdns->sr_len <=
1696 		    offsetof(struct sockaddr_rtdns, sr_dns))
1697 			return -1;
1698 	}
1699 
1700 	if (ISSET(info->rti_addrs, RTA_STATIC)) {
1701 		struct sockaddr_rtstatic *rtstatic =
1702 		    (struct sockaddr_rtstatic *)info->rti_info[RTAX_STATIC];
1703 		if (rtstatic == NULL)
1704 			return -1;
1705 		if (rtstatic->sr_len > sizeof(*rtstatic))
1706 			return -1;
1707 		if (rtstatic->sr_len <=
1708 		    offsetof(struct sockaddr_rtstatic, sr_static))
1709 			return -1;
1710 	}
1711 
1712 	if (ISSET(info->rti_addrs, RTA_SEARCH)) {
1713 		struct sockaddr_rtsearch *rtsearch =
1714 		    (struct sockaddr_rtsearch *)info->rti_info[RTAX_SEARCH];
1715 		if (rtsearch == NULL)
1716 			return -1;
1717 		if (rtsearch->sr_len > sizeof(*rtsearch))
1718 			return -1;
1719 		if (rtsearch->sr_len <=
1720 		    offsetof(struct sockaddr_rtsearch, sr_search))
1721 			return -1;
1722 	}
1723 
1724 	return 0;
1725 }
1726 
1727 /*
1728  * Definitions of protocols supported in the ROUTE domain.
1729  */
1730 
1731 extern	struct domain routedomain;		/* or at least forward */
1732 
1733 struct protosw routesw[] = {
1734 {
1735   .pr_type	= SOCK_RAW,
1736   .pr_domain	= &routedomain,
1737   .pr_flags	= PR_ATOMIC|PR_ADDR|PR_WANTRCVD,
1738   .pr_output	= route_output,
1739   .pr_ctloutput	= route_ctloutput,
1740   .pr_usrreq	= route_usrreq,
1741   .pr_init	= raw_init,
1742   .pr_sysctl	= sysctl_rtable
1743 }
1744 };
1745 
1746 struct domain routedomain = {
1747   .dom_family = PF_ROUTE,
1748   .dom_name = "route",
1749   .dom_init = route_init,
1750   .dom_protosw = routesw,
1751   .dom_protoswNPROTOSW = &routesw[nitems(routesw)]
1752 };
1753