xref: /openbsd-src/sys/net/route.c (revision 50b7afb2c2c0993b0894d4e34bf857cb13ed9c80)
1 /*	$OpenBSD: route.c,v 1.174 2014/07/12 18:44:22 tedu Exp $	*/
2 /*	$NetBSD: route.c,v 1.14 1996/02/13 22:00:46 christos Exp $	*/
3 
4 /*
5  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the project nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 /*
34  * Copyright (c) 1980, 1986, 1991, 1993
35  *	The Regents of the University of California.  All rights reserved.
36  *
37  * Redistribution and use in source and binary forms, with or without
38  * modification, are permitted provided that the following conditions
39  * are met:
40  * 1. Redistributions of source code must retain the above copyright
41  *    notice, this list of conditions and the following disclaimer.
42  * 2. Redistributions in binary form must reproduce the above copyright
43  *    notice, this list of conditions and the following disclaimer in the
44  *    documentation and/or other materials provided with the distribution.
45  * 3. Neither the name of the University nor the names of its contributors
46  *    may be used to endorse or promote products derived from this software
47  *    without specific prior written permission.
48  *
49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59  * SUCH DAMAGE.
60  *
61  *	@(#)route.c	8.2 (Berkeley) 11/15/93
62  */
63 
64 /*
65  *	@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
66  *
67  * NRL grants permission for redistribution and use in source and binary
68  * forms, with or without modification, of the software and documentation
69  * created at NRL provided that the following conditions are met:
70  *
71  * 1. Redistributions of source code must retain the above copyright
72  *    notice, this list of conditions and the following disclaimer.
73  * 2. Redistributions in binary form must reproduce the above copyright
74  *    notice, this list of conditions and the following disclaimer in the
75  *    documentation and/or other materials provided with the distribution.
76  * 3. All advertising materials mentioning features or use of this software
77  *    must display the following acknowledgements:
78  * 	This product includes software developed by the University of
79  * 	California, Berkeley and its contributors.
80  * 	This product includes software developed at the Information
81  * 	Technology Division, US Naval Research Laboratory.
82  * 4. Neither the name of the NRL nor the names of its contributors
83  *    may be used to endorse or promote products derived from this software
84  *    without specific prior written permission.
85  *
86  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
87  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
88  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
89  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
90  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
91  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
92  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
93  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
94  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
95  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
96  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
97  *
98  * The views and conclusions contained in the software and documentation
99  * are those of the authors and should not be interpreted as representing
100  * official policies, either expressed or implied, of the US Naval
101  * Research Laboratory (NRL).
102  */
103 
104 #include <sys/param.h>
105 #include <sys/systm.h>
106 #include <sys/mbuf.h>
107 #include <sys/socket.h>
108 #include <sys/socketvar.h>
109 #include <sys/timeout.h>
110 #include <sys/domain.h>
111 #include <sys/protosw.h>
112 #include <sys/ioctl.h>
113 #include <sys/kernel.h>
114 #include <sys/queue.h>
115 #include <sys/pool.h>
116 
117 #include <net/if.h>
118 #include <net/if_dl.h>
119 #include <net/route.h>
120 #include <net/raw_cb.h>
121 
122 #include <netinet/in.h>
123 
124 #ifdef MPLS
125 #include <netmpls/mpls.h>
126 #endif
127 
128 #ifdef IPSEC
129 #include <netinet/ip_ipsp.h>
130 #include <net/if_enc.h>
131 
132 struct ifaddr	*encap_findgwifa(struct sockaddr *, u_int);
133 #endif
134 
struct	route_cb	   route_cb;	/* routing-socket listener counts */
struct	rtstat		   rtstat;	/* routing statistics */
struct	radix_node_head	***rt_tables;	/* per-table array of per-AF heads */
u_int8_t		   af2rtafidx[AF_MAX+1];	/* AF -> rt_tables[] index */
u_int8_t		   rtafidx_max;	/* slots per table (index 0 unused) */
u_int			   rtbl_id_max = 0;	/* highest allocated table id */
u_int			  *rt_tab2dom;	/* rt table to domain lookup table */

int			rttrash;	/* routes not in table but not freed */

struct pool		rtentry_pool;	/* pool for rtentry structures */
struct pool		rttimer_pool;	/* pool for rttimer structures */

void	rt_timer_init(void);
int	rtable_init(struct radix_node_head ***, u_int);
int	rtflushclone1(struct radix_node *, void *, u_int);
void	rtflushclone(struct radix_node_head *, struct rtentry *);
int	rt_if_remove_rtdelete(struct radix_node *, void *, u_int);

struct	ifaddr *ifa_ifwithroute(int, struct sockaddr *, struct sockaddr *,
		    u_int);

/* upper bound on distinct route-label ids handed out by rtlabel_name2id() */
#define	LABELID_MAX	50000

/* reference-counted route label: maps a name to a small numeric id */
struct rt_label {
	TAILQ_ENTRY(rt_label)	rtl_entry;
	char			rtl_name[RTLABEL_LEN];
	u_int16_t		rtl_id;
	int			rtl_ref;
};

TAILQ_HEAD(rt_labels, rt_label)	rt_labels = TAILQ_HEAD_INITIALIZER(rt_labels);
167 
168 #ifdef IPSEC
169 struct ifaddr *
170 encap_findgwifa(struct sockaddr *gw, u_int rdomain)
171 {
172 	struct ifnet	*encif;
173 
174 	if ((encif = enc_getif(rdomain, 0)) == NULL)
175 		return (NULL);
176 
177 	/*
178 	 * This is not a real link-layer address, it is an empty ifa of
179 	 * type AF_LINK.
180 	 * It is used when adding an encap route entry because RTM_ADD
181 	 * and rt_getifa() want an ifa to find an ifp to associate it to
182 	 * the route.
183 	 */
184 	return (encif->if_lladdr);
185 }
186 #endif
187 
/*
 * Allocate the per-AF radix tree head array for routing table `id' and
 * let every domain attach its head.  Returns 0 on success or ENOMEM.
 */
int
rtable_init(struct radix_node_head ***table, u_int id)
{
	void		**p;
	struct domain	 *dom;
	u_int8_t	  i;

	/* one slot per attached AF; slot 0 deliberately stays NULL */
	if ((p = malloc(sizeof(void *) * (rtafidx_max + 1), M_RTABLE,
	    M_NOWAIT|M_ZERO)) == NULL)
		return (ENOMEM);

	/* 2nd pass: attach (1st pass, the counting, is in route_init()) */
	for (dom = domains; dom != NULL; dom = dom->dom_next)
		if (dom->dom_rtattach)
			dom->dom_rtattach(&p[af2rtafidx[dom->dom_family]],
			    dom->dom_rtoffset);

	*table = (struct radix_node_head **)p;

	/* tag every attached head with the table id it belongs to */
	for (i = 0; i < rtafidx_max; i++) {
		if ((*table)[i] != NULL)
			(*table)[i]->rnh_rtableid = id;
	}

	return (0);
}
214 
/*
 * Bootstrap the routing subsystem: set up the rtentry pool and the
 * radix code, count the address families that want routing tables,
 * and create routing table 0.
 */
void
route_init(void)
{
	struct domain	 *dom;

	pool_init(&rtentry_pool, sizeof(struct rtentry), 0, 0, 0, "rtentpl",
	    NULL);
	rn_init();	/* initialize all zeroes, all ones, mask table */

	bzero(af2rtafidx, sizeof(af2rtafidx));
	rtafidx_max = 1;	/* must have NULL at index 0, so start at 1 */

	/* find out how many tables to allocate */
	for (dom = domains; dom != NULL; dom = dom->dom_next)
		if (dom->dom_rtattach)
			af2rtafidx[dom->dom_family] = rtafidx_max++;

	/* table 0 always exists; failing to create it is fatal */
	if (rtable_add(0) != 0)
		panic("route_init rtable_add");
}
235 
/*
 * Create routing table `id', growing the rt_tables/rt_tab2dom arrays
 * if needed.  Returns 0 on success, EINVAL for an out-of-range id,
 * EEXIST if the table is already there, or ENOMEM.
 */
int
rtable_add(u_int id)
{
	void	*p, *q;

	splsoftassert(IPL_SOFTNET);

	if (id > RT_TABLEID_MAX)
		return (EINVAL);

	/* id == 0 also grows: it is the very first allocation at boot */
	if (id == 0 || id > rtbl_id_max) {
		size_t	newlen = sizeof(void *) * (id+1);
		size_t	newlen2 = sizeof(u_int) * (id+1);

		/* allocate both new arrays before touching the old ones */
		if ((p = malloc(newlen, M_RTABLE, M_NOWAIT|M_ZERO)) == NULL)
			return (ENOMEM);
		if ((q = malloc(newlen2, M_RTABLE, M_NOWAIT|M_ZERO)) == NULL) {
			free(p, M_RTABLE, 0);
			return (ENOMEM);
		}
		if (rt_tables) {
			bcopy(rt_tables, p, sizeof(void *) * (rtbl_id_max+1));
			bcopy(rt_tab2dom, q, sizeof(u_int) * (rtbl_id_max+1));
			free(rt_tables, M_RTABLE, 0);
			free(rt_tab2dom, M_RTABLE, 0);
		}
		rt_tables = p;
		rt_tab2dom = q;
		rtbl_id_max = id;
	}

	if (rt_tables[id] != NULL)	/* already exists */
		return (EEXIST);

	rt_tab2dom[id] = 0;	/* use main table/domain by default */
	return (rtable_init(&rt_tables[id], id));
}
273 
274 struct radix_node_head *
275 rtable_get(u_int id, sa_family_t af)
276 {
277 	if (id > rtbl_id_max)
278 		return (NULL);
279 	return (rt_tables[id] ? rt_tables[id][af2rtafidx[af]] : NULL);
280 }
281 
282 u_int
283 rtable_l2(u_int id)
284 {
285 	if (id > rtbl_id_max)
286 		return (0);
287 	return (rt_tab2dom[id]);
288 }
289 
290 void
291 rtable_l2set(u_int id, u_int parent)
292 {
293 	splsoftassert(IPL_SOFTNET);
294 
295 	if (!rtable_exists(id) || !rtable_exists(parent))
296 		return;
297 	rt_tab2dom[id] = parent;
298 }
299 
300 int
301 rtable_exists(u_int id)	/* verify table with that ID exists */
302 {
303 	if (id > RT_TABLEID_MAX)
304 		return (0);
305 
306 	if (id > rtbl_id_max)
307 		return (0);
308 
309 	if (rt_tables[id] == NULL)
310 		return (0);
311 
312 	return (1);
313 }
314 
315 /*
316  * Packet routing routines.
317  */
318 void
319 rtalloc_noclone(struct route *ro)
320 {
321 	if (ro->ro_rt && ro->ro_rt->rt_ifp && (ro->ro_rt->rt_flags & RTF_UP))
322 		return;		/* cached route is still valid */
323 	ro->ro_rt = rtalloc1(&ro->ro_dst, RT_REPORT | RT_NOCLONING,
324 	    ro->ro_tableid);
325 }
326 
327 void
328 rtalloc(struct route *ro)
329 {
330 	if (ro->ro_rt && ro->ro_rt->rt_ifp && (ro->ro_rt->rt_flags & RTF_UP))
331 		return;		/* cached route is still valid */
332 	ro->ro_rt = rtalloc1(&ro->ro_dst, RT_REPORT, ro->ro_tableid);
333 }
334 
/*
 * Look up `dst' in routing table `tableid' and return a referenced
 * route, cloning a matching RTF_CLONING route unless RT_NOCLONING is
 * set.  When RT_REPORT is set, RTM_MISS/RTM_RESOLVE messages are sent
 * to routing-socket listeners on failure.  Returns NULL on no match.
 */
struct rtentry *
rtalloc1(struct sockaddr *dst, int flags, u_int tableid)
{
	struct radix_node_head	*rnh;
	struct rtentry		*rt;
	struct radix_node	*rn;
	struct rtentry		*newrt = 0;
	struct rt_addrinfo	 info;
	int			 s = splsoftnet(), err = 0, msgtype = RTM_MISS;

	bzero(&info, sizeof(info));
	info.rti_info[RTAX_DST] = dst;

	rnh = rtable_get(tableid, dst->sa_family);
	/* a hit on an RNF_ROOT node means "no real route matched" */
	if (rnh && (rn = rnh->rnh_matchaddr((caddr_t)dst, rnh)) &&
	    ((rn->rn_flags & RNF_ROOT) == 0)) {
		newrt = rt = (struct rtentry *)rn;
		/* clone only when RT_REPORT is set and RT_NOCLONING is not */
		if ((rt->rt_flags & RTF_CLONING) &&
		    ISSET(flags,  RT_REPORT | RT_NOCLONING) == RT_REPORT) {
			err = rtrequest1(RTM_RESOLVE, &info, RTP_DEFAULT,
			    &newrt, tableid);
			if (err) {
				/* cloning failed: hand back the parent */
				newrt = rt;
				rt->rt_refcnt++;
				goto miss;
			}
			if ((rt = newrt) && (rt->rt_flags & RTF_XRESOLVE)) {
				/* an external daemon must finish resolution */
				msgtype = RTM_RESOLVE;
				goto miss;
			}
			/* Inform listeners of the new route */
			rt_sendmsg(rt, RTM_ADD, tableid);
		} else
			rt->rt_refcnt++;
	} else {
		if (dst->sa_family != PF_KEY)
			rtstat.rts_unreach++;
	/*
	 * IP encapsulation does lots of lookups where we don't need nor want
	 * the RTM_MISSes that would be generated.  It causes RTM_MISS storms
	 * sent upward breaking user-level routing queries.
	 */
miss:
		if (ISSET(flags, RT_REPORT) && dst->sa_family != PF_KEY) {
			bzero((caddr_t)&info, sizeof(info));
			info.rti_info[RTAX_DST] = dst;
			rt_missmsg(msgtype, &info, 0, NULL, err, tableid);
		}
	}
	splx(s);
	return (newrt);
}
387 
/*
 * Drop one reference to `rt'; free the entry once the refcount reaches
 * zero and the route has been taken down (removed from its table).
 */
void
rtfree(struct rtentry *rt)
{
	struct ifaddr	*ifa;

	if (rt == NULL)
		panic("rtfree");

	rt->rt_refcnt--;

	if (rt->rt_refcnt <= 0 && (rt->rt_flags & RTF_UP) == 0) {
		if (rt->rt_refcnt == 0 && (rt->rt_nodes->rn_flags & RNF_ACTIVE))
			return; /* route still active but currently down */
		if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT))
			panic("rtfree 2");
		/* route leaves the "not in table but not freed" set */
		rttrash--;
		if (rt->rt_refcnt < 0) {
			printf("rtfree: %p not freed (neg refs)\n", rt);
			return;
		}
		rt_timer_remove_all(rt);
		ifa = rt->rt_ifa;
		if (ifa)
			ifafree(ifa);
		rtlabel_unref(rt->rt_labelid);
#ifdef MPLS
		/* rt_llinfo carries a struct rt_mpls for MPLS routes */
		if (rt->rt_flags & RTF_MPLS)
			free(rt->rt_llinfo, M_TEMP, 0);
#endif
		/* rt_key() and the gateway share one allocation */
		free(rt_key(rt), M_RTABLE, 0);
		pool_put(&rtentry_pool, rt);
	}
}
421 
422 void
423 rt_sendmsg(struct rtentry *rt, int cmd, u_int rtableid)
424 {
425 	struct rt_addrinfo info;
426 
427 	bzero(&info, sizeof(info));
428 	info.rti_info[RTAX_DST] = rt_key(rt);
429 	info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
430 	info.rti_info[RTAX_NETMASK] = rt_mask(rt);
431 	if (rt->rt_ifp != NULL) {
432 		info.rti_info[RTAX_IFP] =(struct sockaddr *)rt->rt_ifp->if_sadl;
433 		info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
434 	}
435 
436 	rt_missmsg(cmd, &info, rt->rt_flags, rt->rt_ifp, 0, rtableid);
437 }
438 
439 void
440 ifafree(struct ifaddr *ifa)
441 {
442 	if (ifa == NULL)
443 		panic("ifafree");
444 	if (ifa->ifa_refcnt == 0)
445 		free(ifa, M_IFADDR, 0);
446 	else
447 		ifa->ifa_refcnt--;
448 }
449 
/*
 * Force a routing table entry to the specified
 * destination to go through the given gateway.
 * Normally called as a result of a routing redirect
 * message from the network layer.
 *
 * N.B.: must be called at splsoftnet
 */
void
rtredirect(struct sockaddr *dst, struct sockaddr *gateway,
    struct sockaddr *netmask, int flags, struct sockaddr *src,
    struct rtentry **rtp, u_int rdomain)
{
	struct rtentry		*rt;
	int			 error = 0;
	u_int32_t		*stat = NULL;
	struct rt_addrinfo	 info;
	struct ifaddr		*ifa;
	struct ifnet		*ifp = NULL;

	splsoftassert(IPL_SOFTNET);

	/* verify the gateway is directly reachable */
	if ((ifa = ifa_ifwithnet(gateway, rdomain)) == NULL) {
		error = ENETUNREACH;
		goto out;
	}
	ifp = ifa->ifa_ifp;
	rt = rtalloc1(dst, 0, rdomain);
	/*
	 * If the redirect isn't from our current router for this dst,
	 * it's either old or wrong.  If it redirects us to ourselves,
	 * we have a routing loop, perhaps as a result of an interface
	 * going down recently.
	 */
#define	equal(a1, a2) \
	((a1)->sa_len == (a2)->sa_len && \
	 bcmp((caddr_t)(a1), (caddr_t)(a2), (a1)->sa_len) == 0)
	if (!(flags & RTF_DONE) && rt &&
	     (!equal(src, rt->rt_gateway) || rt->rt_ifa != ifa))
		error = EINVAL;
	else if (ifa_ifwithaddr(gateway, rdomain) != NULL)
		error = EHOSTUNREACH;
	if (error)
		goto done;
	/*
	 * Create a new entry if we just got back a wildcard entry
	 * or the lookup failed.  This is necessary for hosts
	 * which use routing redirects generated by smart gateways
	 * to dynamically build the routing tables.
	 */
	if ((rt == NULL) || (rt_mask(rt) && rt_mask(rt)->sa_len < 2))
		goto create;
	/*
	 * Don't listen to the redirect if it's
	 * for a route to an interface.
	 */
	if (rt->rt_flags & RTF_GATEWAY) {
		if (((rt->rt_flags & RTF_HOST) == 0) && (flags & RTF_HOST)) {
			/*
			 * Changing from route to net => route to host.
			 * Create new route, rather than smashing route to net.
			 */
create:
			if (rt)
				rtfree(rt);
			flags |= RTF_GATEWAY | RTF_DYNAMIC;
			bzero(&info, sizeof(info));
			info.rti_info[RTAX_DST] = dst;
			info.rti_info[RTAX_GATEWAY] = gateway;
			info.rti_info[RTAX_NETMASK] = netmask;
			info.rti_ifa = ifa;
			info.rti_flags = flags;
			rt = NULL;
			error = rtrequest1(RTM_ADD, &info, RTP_DEFAULT, &rt,
			    rdomain);
			if (rt != NULL)
				flags = rt->rt_flags;
			stat = &rtstat.rts_dynamic;
		} else {
			/*
			 * Smash the current notion of the gateway to
			 * this destination.  Should check about netmask!!!
			 */
			rt->rt_flags |= RTF_MODIFIED;
			flags |= RTF_MODIFIED;
			stat = &rtstat.rts_newgateway;
			rt_setgate(rt, rt_key(rt), gateway, rdomain);
		}
	} else
		error = EHOSTUNREACH;
done:
	/* hand the reference to the caller on success, drop it otherwise */
	if (rt) {
		if (rtp && !error)
			*rtp = rt;
		else
			rtfree(rt);
	}
out:
	if (error)
		rtstat.rts_badredirect++;
	else if (stat != NULL)
		(*stat)++;
	/* always tell listeners what happened, including the author */
	bzero((caddr_t)&info, sizeof(info));
	info.rti_info[RTAX_DST] = dst;
	info.rti_info[RTAX_GATEWAY] = gateway;
	info.rti_info[RTAX_NETMASK] = netmask;
	info.rti_info[RTAX_AUTHOR] = src;
	rt_missmsg(RTM_REDIRECT, &info, flags, ifp, error, rdomain);
}
560 
/*
 * Delete a route and generate a message.
 * Returns the error from rtrequest1(); the route itself is released
 * here unless a reference was already held elsewhere.
 */
int
rtdeletemsg(struct rtentry *rt, u_int tableid)
{
	int			error;
	struct rt_addrinfo	info;
	struct ifnet		*ifp;

	/*
	 * Request the new route so that the entry is not actually
	 * deleted.  That will allow the information being reported to
	 * be accurate (and consistent with route_output()).
	 */
	bzero((caddr_t)&info, sizeof(info));
	info.rti_info[RTAX_DST] = rt_key(rt);
	info.rti_info[RTAX_NETMASK] = rt_mask(rt);
	info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
	info.rti_flags = rt->rt_flags;
	/* remember the ifp now: rt may be unusable after the delete */
	ifp = rt->rt_ifp;
	error = rtrequest1(RTM_DELETE, &info, rt->rt_priority, &rt, tableid);

	rt_missmsg(RTM_DELETE, &info, info.rti_flags, ifp, error, tableid);

	/* Adjust the refcount */
	if (error == 0 && rt->rt_refcnt <= 0) {
		rt->rt_refcnt++;
		rtfree(rt);
	}
	return (error);
}
593 
594 int
595 rtflushclone1(struct radix_node *rn, void *arg, u_int id)
596 {
597 	struct rtentry	*rt, *parent;
598 
599 	rt = (struct rtentry *)rn;
600 	parent = (struct rtentry *)arg;
601 	if ((rt->rt_flags & RTF_CLONED) != 0 && rt->rt_parent == parent)
602 		rtdeletemsg(rt, id);
603 	return 0;
604 }
605 
606 void
607 rtflushclone(struct radix_node_head *rnh, struct rtentry *parent)
608 {
609 
610 #ifdef DIAGNOSTIC
611 	if (!parent || (parent->rt_flags & RTF_CLONING) == 0)
612 		panic("rtflushclone: called with a non-cloning route");
613 	if (!rnh->rnh_walktree)
614 		panic("rtflushclone: no rnh_walktree");
615 #endif
616 	rnh->rnh_walktree(rnh, rtflushclone1, (void *)parent);
617 }
618 
/*
 * Route ioctls are not supported; the routing socket interface is
 * used instead.
 */
int
rtioctl(u_long req, caddr_t data, struct proc *p)
{
	return (EOPNOTSUPP);
}
624 
/*
 * Find the interface address to associate with a route towards `dst'
 * via `gateway' in table `rtableid'.  Tries, in order: destination /
 * gateway address matches, an AF_LINK gateway's interface, a network
 * match, and finally a recursive route lookup.  Returns NULL when no
 * suitable ifa exists.
 */
struct ifaddr *
ifa_ifwithroute(int flags, struct sockaddr *dst, struct sockaddr *gateway,
    u_int rtableid)
{
	struct ifaddr	*ifa;

#ifdef IPSEC
	/*
	 * If the destination is a PF_KEY address, we'll look
	 * for the existence of a encap interface number or address
	 * in the options list of the gateway. By default, we'll return
	 * enc0.
	 */
	if (dst && (dst->sa_family == PF_KEY))
		return (encap_findgwifa(gateway, rtableid));
#endif

	if ((flags & RTF_GATEWAY) == 0) {
		/*
		 * If we are adding a route to an interface,
		 * and the interface is a pt to pt link
		 * we should search for the destination
		 * as our clue to the interface.  Otherwise
		 * we can use the local address.
		 */
		ifa = NULL;
		if (flags & RTF_HOST)
			ifa = ifa_ifwithdstaddr(dst, rtableid);
		if (ifa == NULL)
			ifa = ifa_ifwithaddr(gateway, rtableid);
	} else {
		/*
		 * If we are adding a route to a remote net
		 * or host, the gateway may still be on the
		 * other end of a pt to pt link.
		 */
		ifa = ifa_ifwithdstaddr(gateway, rtableid);
	}
	if (ifa == NULL) {
		if (gateway->sa_family == AF_LINK) {
			/* resolve the gateway's ifp by index, then by name */
			struct sockaddr_dl *sdl = (struct sockaddr_dl *)gateway;
			struct ifnet *ifp = if_get(sdl->sdl_index);

			if (ifp == NULL)
				ifp = ifunit(sdl->sdl_data);
			if (ifp != NULL)
				ifa = ifp->if_lladdr;
		} else {
			ifa = ifa_ifwithnet(gateway, rtableid);
		}
	}
	if (ifa == NULL) {
		/* last resort: route towards the gateway itself */
		struct rtentry	*rt = rtalloc1(gateway, 0, rtable_l2(rtableid));
		if (rt == NULL)
			return (NULL);
		/* borrow the ifa; drop the route reference right away */
		rt->rt_refcnt--;
		/* The gateway must be local if the same address family. */
		if ((rt->rt_flags & RTF_GATEWAY) &&
		    rt_key(rt)->sa_family == dst->sa_family)
			return (NULL);
		if ((ifa = rt->rt_ifa) == NULL)
			return (NULL);
	}
	/* prefer an ifa on the same interface matching dst's family */
	if (ifa->ifa_addr->sa_family != dst->sa_family) {
		struct ifaddr	*oifa = ifa;
		ifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp);
		if (ifa == NULL)
			ifa = oifa;
	}
	return (ifa);
}
696 
/* Round a sockaddr length up to a multiple of sizeof(long); 0 maps to one long. */
#define ROUNDUP(a) (a>0 ? (1 + (((a) - 1) | (sizeof(long) - 1))) : sizeof(long))

/*
 * Fill in info->rti_ifa for a route request in table `rtid' from the
 * addresses supplied in `info'.  Returns 0 on success or ENETUNREACH
 * when no usable interface address can be found.
 */
int
rt_getifa(struct rt_addrinfo *info, u_int rtid)
{
	struct ifaddr	*ifa;
	struct ifnet	*ifp = NULL;

	/*
	 * ifp may be specified by sockaddr_dl when protocol address
	 * is ambiguous
	 */
	if (info->rti_info[RTAX_IFP] != NULL) {
		struct sockaddr_dl *sdl;

		sdl = (struct sockaddr_dl *)info->rti_info[RTAX_IFP];
		ifp = if_get(sdl->sdl_index);
		if (ifp == NULL)
			ifp = ifunit(sdl->sdl_data);
	}

	/* an explicit interface address takes precedence */
	if (info->rti_ifa == NULL && info->rti_info[RTAX_IFA] != NULL)
		info->rti_ifa = ifa_ifwithaddr(info->rti_info[RTAX_IFA], rtid);

	if (info->rti_ifa == NULL) {
		struct sockaddr	*sa;

		/* pick the most specific address available as the clue */
		if ((sa = info->rti_info[RTAX_IFA]) == NULL)
			if ((sa = info->rti_info[RTAX_GATEWAY]) == NULL)
				sa = info->rti_info[RTAX_DST];

		if (sa != NULL && ifp != NULL)
			info->rti_ifa = ifaof_ifpforaddr(sa, ifp);
		else if (info->rti_info[RTAX_DST] != NULL &&
		    info->rti_info[RTAX_GATEWAY] != NULL)
			info->rti_ifa = ifa_ifwithroute(info->rti_flags,
			    info->rti_info[RTAX_DST],
			    info->rti_info[RTAX_GATEWAY],
			    rtid);
		else if (sa != NULL)
			info->rti_ifa = ifa_ifwithroute(info->rti_flags,
			    sa, sa, rtid);
	}

	if ((ifa = info->rti_ifa) == NULL)
		return (ENETUNREACH);

	return (0);
}
746 
/*
 * Common back-end for routing table changes: RTM_ADD, RTM_DELETE and
 * RTM_RESOLVE (cloning) on table `tableid' with priority `prio'.
 * On success, when `ret_nrt' is non-NULL it receives a referenced
 * pointer to the affected route.  Runs at splsoftnet.
 */
int
rtrequest1(int req, struct rt_addrinfo *info, u_int8_t prio,
    struct rtentry **ret_nrt, u_int tableid)
{
	int			 s = splsoftnet(); int error = 0;
	struct rtentry		*rt, *crt;
	struct radix_node	*rn;
	struct radix_node_head	*rnh;
	struct ifaddr		*ifa;
	struct sockaddr		*ndst;
	struct sockaddr_rtlabel	*sa_rl, sa_rl2;
#ifdef MPLS
	struct sockaddr_mpls	*sa_mpls;
#endif
#define senderr(x) { error = x ; goto bad; }

	if ((rnh = rtable_get(tableid, info->rti_info[RTAX_DST]->sa_family)) ==
	    NULL)
		senderr(EAFNOSUPPORT);
	/* host routes never carry a netmask */
	if (info->rti_flags & RTF_HOST)
		info->rti_info[RTAX_NETMASK] = NULL;
	switch (req) {
	case RTM_DELETE:
		if ((rn = rnh->rnh_lookup(info->rti_info[RTAX_DST],
		    info->rti_info[RTAX_NETMASK], rnh)) == NULL)
			senderr(ESRCH);
		rt = (struct rtentry *)rn;
#ifndef SMALL_KERNEL
		/*
		 * if we got multipath routes, we require users to specify
		 * a matching RTAX_GATEWAY.
		 */
		if (rn_mpath_capable(rnh)) {
			rt = rt_mpath_matchgate(rt,
			    info->rti_info[RTAX_GATEWAY], prio);
			rn = (struct radix_node *)rt;
			if (!rt ||
			    (!info->rti_info[RTAX_GATEWAY] &&
			    rt->rt_flags & RTF_MPATH))
				senderr(ESRCH);
		}
#endif

		/*
		 * Since RTP_LOCAL cannot be set by userland, make
		 * sure that local routes are only modified by the
		 * kernel.
		 */
		if (rt->rt_flags & RTF_LOCAL && prio != RTP_LOCAL)
			senderr(EINVAL);

		if ((rn = rnh->rnh_deladdr(info->rti_info[RTAX_DST],
		    info->rti_info[RTAX_NETMASK], rnh, rn)) == NULL)
			senderr(ESRCH);
		rt = (struct rtentry *)rn;

		/* clean up any cloned children */
		if ((rt->rt_flags & RTF_CLONING) != 0)
			rtflushclone(rnh, rt);

		if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT))
			panic ("rtrequest delete");

		/* drop the cached reference to the gateway route */
		if (rt->rt_gwroute) {
			rt = rt->rt_gwroute; RTFREE(rt);
			(rt = (struct rtentry *)rn)->rt_gwroute = NULL;
		}

		if (rt->rt_parent) {
			rt->rt_parent->rt_refcnt--;
			rt->rt_parent = NULL;
		}

		rt->rt_flags &= ~RTF_UP;
		if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest)
			ifa->ifa_rtrequest(RTM_DELETE, rt);
		rttrash++;

		/* either hand the route to the caller or free it */
		if (ret_nrt)
			*ret_nrt = rt;
		else if (rt->rt_refcnt <= 0) {
			rt->rt_refcnt++;
			rtfree(rt);
		}
		break;

	case RTM_RESOLVE:
		/* *ret_nrt is the cloning (parent) route here */
		if (ret_nrt == NULL || (rt = *ret_nrt) == NULL)
			senderr(EINVAL);
		if ((rt->rt_flags & RTF_CLONING) == 0)
			senderr(EINVAL);
		if (rt->rt_ifa->ifa_ifp) {
			info->rti_ifa = rt->rt_ifa;
		} else {
			/*
			 * The address of the cloning route is not longer
			 * configured on an interface, but its descriptor
			 * is still there because of reference counting.
			 *
			 * Try to find a similar active address and use
			 * it for the cloned route.  The cloning route
			 * will get the new address and interface later.
			 */
			info->rti_ifa = NULL;
			info->rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
		}
		/* the clone is a host route, inheriting most parent flags */
		info->rti_flags = rt->rt_flags & ~(RTF_CLONING | RTF_STATIC);
		info->rti_flags |= RTF_CLONED;
		info->rti_info[RTAX_GATEWAY] = rt->rt_gateway;
		info->rti_flags |= RTF_HOST;
		info->rti_info[RTAX_LABEL] =
		    rtlabel_id2sa(rt->rt_labelid, &sa_rl2);
		/* FALLTHROUGH */

	case RTM_ADD:
		if (info->rti_ifa == NULL && (error = rt_getifa(info, tableid)))
			senderr(error);
		ifa = info->rti_ifa;
		rt = pool_get(&rtentry_pool, PR_NOWAIT | PR_ZERO);
		if (rt == NULL)
			senderr(ENOBUFS);

		rt->rt_flags = info->rti_flags;

		if (prio == 0)
			prio = ifa->ifa_ifp->if_priority + RTP_STATIC;
		rt->rt_priority = prio;	/* init routing priority */
		LIST_INIT(&rt->rt_timer);
		/* rt_setgate() allocates the dst+gateway key storage */
		if (rt_setgate(rt, info->rti_info[RTAX_DST],
		    info->rti_info[RTAX_GATEWAY], tableid)) {
			pool_put(&rtentry_pool, rt);
			senderr(ENOBUFS);
		}
		ndst = rt_key(rt);
		if (info->rti_info[RTAX_NETMASK] != NULL) {
			rt_maskedcopy(info->rti_info[RTAX_DST], ndst,
			    info->rti_info[RTAX_NETMASK]);
		} else
			memcpy(ndst, info->rti_info[RTAX_DST],
			    info->rti_info[RTAX_DST]->sa_len);
#ifndef SMALL_KERNEL
		if (rn_mpath_capable(rnh)) {
			/* do not permit exactly the same dst/mask/gw pair */
			if (rt_mpath_conflict(rnh, rt,
			    info->rti_info[RTAX_NETMASK],
			    info->rti_flags & RTF_MPATH)) {
				if (rt->rt_gwroute)
					rtfree(rt->rt_gwroute);
				free(rt_key(rt), M_RTABLE, 0);
				pool_put(&rtentry_pool, rt);
				senderr(EEXIST);
			}
			/* check the link state since the table supports it */
			if (LINK_STATE_IS_UP(ifa->ifa_ifp->if_link_state) &&
			    ifa->ifa_ifp->if_flags & IFF_UP)
				rt->rt_flags |= RTF_UP;
			else {
				rt->rt_flags &= ~RTF_UP;
				rt->rt_priority |= RTP_DOWN;
			}
		}
#endif

		if (info->rti_info[RTAX_LABEL] != NULL) {
			sa_rl = (struct sockaddr_rtlabel *)
			    info->rti_info[RTAX_LABEL];
			rt->rt_labelid = rtlabel_name2id(sa_rl->sr_label);
		}

#ifdef MPLS
		/* We have to allocate additional space for MPLS infos */
		if (info->rti_flags & RTF_MPLS &&
		    (info->rti_info[RTAX_SRC] != NULL ||
		    info->rti_info[RTAX_DST]->sa_family == AF_MPLS)) {
			struct rt_mpls *rt_mpls;

			sa_mpls = (struct sockaddr_mpls *)
			    info->rti_info[RTAX_SRC];

			rt->rt_llinfo = malloc(sizeof(struct rt_mpls),
			    M_TEMP, M_NOWAIT|M_ZERO);

			if (rt->rt_llinfo == NULL) {
				if (rt->rt_gwroute)
					rtfree(rt->rt_gwroute);
				free(rt_key(rt), M_RTABLE, 0);
				pool_put(&rtentry_pool, rt);
				senderr(ENOMEM);
			}

			rt_mpls = (struct rt_mpls *)rt->rt_llinfo;

			if (sa_mpls != NULL)
				rt_mpls->mpls_label = sa_mpls->smpls_label;

			rt_mpls->mpls_operation = info->rti_mpls;

			/* XXX: set experimental bits */

			rt->rt_flags |= RTF_MPLS;
		} else
			rt->rt_flags &= ~RTF_MPLS;
#endif

		ifa->ifa_refcnt++;
		rt->rt_ifa = ifa;
		rt->rt_ifp = ifa->ifa_ifp;
		if (req == RTM_RESOLVE) {
			/*
			 * If the ifa of the cloning route was stale, a
			 * successful lookup for an ifa with the same address
			 * has been made.  Use this ifa also for the cloning
			 * route.
			 */
			if ((*ret_nrt)->rt_ifa->ifa_ifp == NULL) {
				printf("rtrequest1 RTM_RESOLVE: wrong ifa (%p) "
				    "was (%p)\n", ifa, (*ret_nrt)->rt_ifa);
				if ((*ret_nrt)->rt_ifa->ifa_rtrequest)
					(*ret_nrt)->rt_ifa->ifa_rtrequest(
					    RTM_DELETE, *ret_nrt);
				ifafree((*ret_nrt)->rt_ifa);
				(*ret_nrt)->rt_ifa = ifa;
				(*ret_nrt)->rt_ifp = ifa->ifa_ifp;
				ifa->ifa_refcnt++;
				if (ifa->ifa_rtrequest)
					ifa->ifa_rtrequest(RTM_ADD, *ret_nrt);
			}
			/*
			 * Copy both metrics and a back pointer to the cloned
			 * route's parent.
			 */
			rt->rt_rmx = (*ret_nrt)->rt_rmx; /* copy metrics */
			rt->rt_priority = (*ret_nrt)->rt_priority;
			rt->rt_parent = *ret_nrt;	 /* Back ptr. to parent. */
			rt->rt_parent->rt_refcnt++;
		}
		rn = rnh->rnh_addaddr((caddr_t)ndst,
		    (caddr_t)info->rti_info[RTAX_NETMASK], rnh, rt->rt_nodes,
		    rt->rt_priority);
		if (rn == NULL && (crt = rtalloc1(ndst, 0, tableid)) != NULL) {
			/* overwrite cloned route */
			if ((crt->rt_flags & RTF_CLONED) != 0) {
				rtdeletemsg(crt, tableid);
				rn = rnh->rnh_addaddr((caddr_t)ndst,
				    (caddr_t)info->rti_info[RTAX_NETMASK],
				    rnh, rt->rt_nodes, rt->rt_priority);
			}
			RTFREE(crt);
		}
		/* insertion failed: undo everything allocated above */
		if (rn == 0) {
			ifafree(ifa);
			if ((rt->rt_flags & RTF_CLONED) != 0 && rt->rt_parent)
				rtfree(rt->rt_parent);
			if (rt->rt_gwroute)
				rtfree(rt->rt_gwroute);
			free(rt_key(rt), M_RTABLE, 0);
			pool_put(&rtentry_pool, rt);
			senderr(EEXIST);
		}

		if (ifa->ifa_rtrequest)
			ifa->ifa_rtrequest(req, rt);
		if (ret_nrt) {
			*ret_nrt = rt;
			rt->rt_refcnt++;
		}
		if ((rt->rt_flags & RTF_CLONING) != 0) {
			/* clean up any cloned children */
			rtflushclone(rnh, rt);
		}

		if_group_routechange(info->rti_info[RTAX_DST],
			info->rti_info[RTAX_NETMASK]);
		break;
	}
bad:
	splx(s);
	return (error);
}
1026 
/*
 * Set the gateway of a route and (re)allocate the buffer that holds
 * the destination key and the gateway address.  rt_key() and
 * rt_gateway share one contiguous allocation laid out as
 * [dst (dlen bytes) | gateway (glen bytes)], both ROUNDUP()-aligned.
 * Returns 0 on success, 1 on allocation failure.
 */
int
rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate,
    u_int tableid)
{
	caddr_t	new, old;
	int	dlen = ROUNDUP(dst->sa_len), glen = ROUNDUP(gate->sa_len);

	/*
	 * Allocate a fresh combined buffer when there is no gateway yet
	 * or the new gateway does not fit into the existing slot.
	 */
	if (rt->rt_gateway == NULL || glen > ROUNDUP(rt->rt_gateway->sa_len)) {
		old = (caddr_t)rt_key(rt);
		new = malloc(dlen + glen, M_RTABLE, M_NOWAIT);
		if (new == NULL)
			return 1;
		/* Repoint the radix node's key at the new storage. */
		rt->rt_nodes->rn_key = new;
	} else {
		/* New gateway fits in place; keep the current buffer. */
		new = rt->rt_nodes->rn_key;
		old = NULL;
	}
	/* The gateway lives right behind the rounded-up destination. */
	rt->rt_gateway = (struct sockaddr *)(new + dlen);
	memmove(rt->rt_gateway, gate, glen);
	if (old) {
		/* Copy dst into the new buffer, then drop the old one. */
		memmove(new, dst, dlen);
		free(old, M_RTABLE, 0);
	}
	/* Any cached route to the previous gateway is now stale. */
	if (rt->rt_gwroute != NULL) {
		RTFREE(rt->rt_gwroute);
		rt->rt_gwroute = NULL;
	}
	if (rt->rt_flags & RTF_GATEWAY) {
		/* XXX is this actually valid to cross tables here? */
		rt->rt_gwroute = rtalloc1(gate, RT_REPORT, rtable_l2(tableid));
		/*
		 * If we switched gateways, grab the MTU from the new
		 * gateway route if the current MTU is 0 or greater
		 * than the MTU of gateway.
		 * Note that, if the MTU of gateway is 0, we will reset the
		 * MTU of the route to run PMTUD again from scratch. XXX
		 */
		if (rt->rt_gwroute && !(rt->rt_rmx.rmx_locks & RTV_MTU) &&
		    rt->rt_rmx.rmx_mtu &&
		    rt->rt_rmx.rmx_mtu > rt->rt_gwroute->rt_rmx.rmx_mtu) {
			rt->rt_rmx.rmx_mtu = rt->rt_gwroute->rt_rmx.rmx_mtu;
		}
	}
	return (0);
}
1072 
/*
 * Copy `src' to `dst' while AND-ing each byte with the corresponding
 * byte of `netmask'.  The first two bytes (sa_len and sa_family) are
 * copied verbatim.  The copy covers min(netmask->sa_len, src->sa_len)
 * bytes; anything beyond that, up to src->sa_len, is zeroed.
 */
void
rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst,
    struct sockaddr *netmask)
{
	unsigned char	*s = (unsigned char *)src;
	unsigned char	*d = (unsigned char *)dst;
	unsigned char	*m = (unsigned char *)netmask;
	unsigned char	*mend = d + *m;		/* bounded by netmask len */
	unsigned char	*send = d + *s;		/* bounded by src len */

	if (mend > send)
		mend = send;
	/* sa_len and sa_family are not masked. */
	*d++ = *s++;
	*d++ = *s++;
	m += 2;
	while (d < mend)
		*d++ = *s++ & *m++;
	/* Zero the tail so dst is fully defined up to src's length. */
	if (d < send)
		memset(d, 0, (size_t)(send - d));
}
1092 
1093 int
1094 rt_ifa_add(struct ifaddr *ifa, int flags, struct sockaddr *dst)
1095 {
1096 	struct rtentry		*rt, *nrt = NULL;
1097 	struct sockaddr_rtlabel	 sa_rl;
1098 	struct rt_addrinfo	 info;
1099 	u_short			 rtableid = ifa->ifa_ifp->if_rdomain;
1100 	u_int8_t		 prio = RTP_CONNECTED;
1101 	int			 error;
1102 
1103 	memset(&info, 0, sizeof(info));
1104 	info.rti_ifa = ifa;
1105 	info.rti_flags = flags;
1106 	info.rti_info[RTAX_DST] = dst;
1107 	info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr;
1108 	info.rti_info[RTAX_LABEL] =
1109 	    rtlabel_id2sa(ifa->ifa_ifp->if_rtlabelid, &sa_rl);
1110 
1111 	if ((flags & RTF_HOST) == 0)
1112 		info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask;
1113 
1114 	if (flags & RTF_LOCAL)
1115 		prio = RTP_LOCAL;
1116 
1117 	error = rtrequest1(RTM_ADD, &info, prio, &nrt, rtableid);
1118 	if (error == 0 && (rt = nrt) != NULL) {
1119 		rt->rt_refcnt--;
1120 		if (rt->rt_ifa != ifa) {
1121 			printf("%s: wrong ifa (%p) was (%p)\n", __func__,
1122 			    ifa, rt->rt_ifa);
1123 			if (rt->rt_ifa->ifa_rtrequest)
1124 				rt->rt_ifa->ifa_rtrequest(RTM_DELETE, rt);
1125 			ifafree(rt->rt_ifa);
1126 			rt->rt_ifa = ifa;
1127 			rt->rt_ifp = ifa->ifa_ifp;
1128 			ifa->ifa_refcnt++;
1129 			if (ifa->ifa_rtrequest)
1130 				ifa->ifa_rtrequest(RTM_ADD, rt);
1131 		}
1132 		if (flags & RTF_LOCAL)
1133 			rt_newaddrmsg(RTM_ADD, ifa, error, nrt);
1134 	}
1135 	return (error);
1136 }
1137 
/*
 * Remove the connected (or, with RTF_LOCAL, local) route for address
 * `dst' of `ifa' from the interface's routing domain.
 * Returns 0 or an errno.
 */
int
rt_ifa_del(struct ifaddr *ifa, int flags, struct sockaddr *dst)
{
	struct rtentry		*rt, *nrt = NULL;
	struct mbuf		*m = NULL;
	struct sockaddr		*deldst;
	struct rt_addrinfo	 info;
	struct sockaddr_rtlabel	 sa_rl;
	u_short			 rtableid = ifa->ifa_ifp->if_rdomain;
	u_int8_t		 prio = RTP_CONNECTED;
	int			 error;

	/*
	 * For a network route the key to delete is the masked
	 * destination; build it in a temporary mbuf.
	 */
	if ((flags & RTF_HOST) == 0 && ifa->ifa_netmask) {
		m = m_get(M_DONTWAIT, MT_SONAME);
		if (m == NULL)
			return (ENOBUFS);
		deldst = mtod(m, struct sockaddr *);
		rt_maskedcopy(dst, deldst, ifa->ifa_netmask);
		dst = deldst;
	}
	if ((rt = rtalloc1(dst, 0, rtableid)) != NULL) {
		/* Drop the reference rtalloc1() took. */
		rt->rt_refcnt--;
		/* try to find the right route */
		while (rt && rt->rt_ifa != ifa)
			rt = (struct rtentry *)
			    ((struct radix_node *)rt)->rn_dupedkey;
		if (!rt) {
			/* No route on this ifa; report unreachable. */
			if (m != NULL)
				(void) m_free(m);
			return (flags & RTF_HOST ? EHOSTUNREACH
						: ENETUNREACH);
		}
	}

	memset(&info, 0, sizeof(info));
	info.rti_ifa = ifa;
	info.rti_flags = flags;
	info.rti_info[RTAX_DST] = dst;
	info.rti_info[RTAX_LABEL] =
	    rtlabel_id2sa(ifa->ifa_ifp->if_rtlabelid, &sa_rl);

	if ((flags & RTF_HOST) == 0)
		info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask;

	if (flags & RTF_LOCAL)
		prio = RTP_LOCAL;

	error = rtrequest1(RTM_DELETE, &info, prio, &nrt, rtableid);
	if (error == 0 && (rt = nrt) != NULL) {
		if (flags & RTF_LOCAL)
			rt_newaddrmsg(RTM_DELETE, ifa, error, nrt);
		/* Release the detached route once no one holds it. */
		if (rt->rt_refcnt <= 0) {
			rt->rt_refcnt++;
			rtfree(rt);
		}
	}
	if (m != NULL)
		m_free(m);

	return (error);
}
1199 
1200 /*
1201  * Add ifa's address as a loopback rtentry.
1202  */
1203 void
1204 rt_ifa_addloop(struct ifaddr *ifa)
1205 {
1206 	struct rtentry *rt;
1207 
1208 	/*
1209 	 * If the configured address correspond to the magical "any"
1210 	 * address do not add a local route entry because that might
1211 	 * corrupt the routing tree which uses this value for the
1212 	 * default routes.
1213 	 */
1214 	switch (ifa->ifa_addr->sa_family) {
1215 	case AF_INET:
1216 		if (satosin(ifa->ifa_addr)->sin_addr.s_addr == INADDR_ANY)
1217 			return;
1218 		break;
1219 #ifdef INET6
1220 	case AF_INET6:
1221 		if (IN6_ARE_ADDR_EQUAL(&satosin6(ifa->ifa_addr)->sin6_addr,
1222 		    &in6addr_any))
1223 			return;
1224 		break;
1225 #endif
1226 	default:
1227 		break;
1228 	}
1229 
1230 	/* If there is no loopback entry, allocate one. */
1231 	rt = rtalloc1(ifa->ifa_addr, 0, ifa->ifa_ifp->if_rdomain);
1232 	if (rt == NULL || (rt->rt_flags & RTF_HOST) == 0 ||
1233 	    (rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0)
1234 		rt_ifa_add(ifa, RTF_UP| RTF_HOST | RTF_LLINFO | RTF_LOCAL,
1235 		    ifa->ifa_addr);
1236 	if (rt)
1237 		rt->rt_refcnt--;
1238 }
1239 
1240 /*
1241  * Remove loopback rtentry of ifa's addresss if it exists.
1242  */
1243 void
1244 rt_ifa_delloop(struct ifaddr *ifa)
1245 {
1246 	struct rtentry *rt;
1247 
1248 	/*
1249 	 * We do not add local routes for such address, so do not bother
1250 	 * removing them.
1251 	 */
1252 	switch (ifa->ifa_addr->sa_family) {
1253 	case AF_INET:
1254 		if (satosin(ifa->ifa_addr)->sin_addr.s_addr == INADDR_ANY)
1255 			return;
1256 		break;
1257 #ifdef INET6
1258 	case AF_INET6:
1259 		if (IN6_ARE_ADDR_EQUAL(&satosin6(ifa->ifa_addr)->sin6_addr,
1260 		    &in6addr_any))
1261 			return;
1262 		break;
1263 #endif
1264 	default:
1265 		break;
1266 	}
1267 
1268 	/*
1269 	 * Before deleting, check if a corresponding loopbacked host
1270 	 * route surely exists.  With this check, we can avoid to
1271 	 * delete an interface direct route whose destination is same
1272 	 * as the address being removed.  This can happen when removing
1273 	 * a subnet-router anycast address on an interface attached
1274 	 * to a shared medium.
1275 	 */
1276 	rt = rtalloc1(ifa->ifa_addr, 0, ifa->ifa_ifp->if_rdomain);
1277 	if (rt != NULL && (rt->rt_flags & RTF_HOST) != 0 &&
1278 	    (rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0)
1279 		rt_ifa_del(ifa,  RTF_HOST | RTF_LLINFO | RTF_LOCAL,
1280 		    ifa->ifa_addr);
1281 	if (rt)
1282 		rt->rt_refcnt--;
1283 }
1284 
1285 /*
 * Route timer routines.  These routines allow functions to be called
1287  * for various routes at any time.  This is useful in supporting
1288  * path MTU discovery and redirect route deletion.
1289  *
1290  * This is similar to some BSDI internal functions, but it provides
1291  * for multiple queues for efficiency's sake...
1292  */
1293 
/* All active rttimer queues; scanned once a second by rt_timer_timer(). */
LIST_HEAD(, rttimer_queue)	rttimer_queue_head;
/* Set by rt_timer_init(); guards against double initialization. */
static int			rt_init_done = 0;

/*
 * Run the action of an expired rttimer `r': invoke its callback when it
 * has one, otherwise delete the route the timer is attached to.
 */
#define RTTIMER_CALLOUT(r)	{				\
	if (r->rtt_func != NULL) {				\
		(*r->rtt_func)(r->rtt_rt, r);			\
	} else {						\
		struct rt_addrinfo info;			\
		bzero(&info, sizeof(info));			\
		info.rti_info[RTAX_DST] = rt_key(r->rtt_rt);	\
		rtrequest1(RTM_DELETE, &info,			\
		    r->rtt_rt->rt_priority, NULL, r->rtt_tableid);	\
	}							\
}
1308 
1309 /*
1310  * Some subtle order problems with domain initialization mean that
1311  * we cannot count on this being run from rt_init before various
1312  * protocol initializations are done.  Therefore, we make sure
1313  * that this is run when the first queue is added...
1314  */
1315 
1316 void
1317 rt_timer_init()
1318 {
1319 	static struct timeout	rt_timer_timeout;
1320 
1321 	if (rt_init_done)
1322 		panic("rt_timer_init: already initialized");
1323 
1324 	pool_init(&rttimer_pool, sizeof(struct rttimer), 0, 0, 0, "rttmrpl",
1325 	    NULL);
1326 
1327 	LIST_INIT(&rttimer_queue_head);
1328 	timeout_set(&rt_timer_timeout, rt_timer_timer, &rt_timer_timeout);
1329 	timeout_add_sec(&rt_timer_timeout, 1);
1330 	rt_init_done = 1;
1331 }
1332 
1333 struct rttimer_queue *
1334 rt_timer_queue_create(u_int timeout)
1335 {
1336 	struct rttimer_queue	*rtq;
1337 
1338 	if (rt_init_done == 0)
1339 		rt_timer_init();
1340 
1341 	if ((rtq = malloc(sizeof(*rtq), M_RTABLE, M_NOWAIT|M_ZERO)) == NULL)
1342 		return (NULL);
1343 
1344 	rtq->rtq_timeout = timeout;
1345 	rtq->rtq_count = 0;
1346 	TAILQ_INIT(&rtq->rtq_head);
1347 	LIST_INSERT_HEAD(&rttimer_queue_head, rtq, rtq_link);
1348 
1349 	return (rtq);
1350 }
1351 
/*
 * Change the expiry delay (seconds) of `rtq'.  Entries already queued
 * keep their insertion time but are checked against the new timeout on
 * the next rt_timer_timer() pass.
 */
void
rt_timer_queue_change(struct rttimer_queue *rtq, long timeout)
{
	rtq->rtq_timeout = timeout;
}
1357 
1358 void
1359 rt_timer_queue_destroy(struct rttimer_queue *rtq)
1360 {
1361 	struct rttimer	*r;
1362 
1363 	while ((r = TAILQ_FIRST(&rtq->rtq_head)) != NULL) {
1364 		LIST_REMOVE(r, rtt_link);
1365 		TAILQ_REMOVE(&rtq->rtq_head, r, rtt_next);
1366 		RTTIMER_CALLOUT(r);
1367 		pool_put(&rttimer_pool, r);
1368 		if (rtq->rtq_count > 0)
1369 			rtq->rtq_count--;
1370 		else
1371 			printf("rt_timer_queue_destroy: rtq_count reached 0\n");
1372 	}
1373 
1374 	LIST_REMOVE(rtq, rtq_link);
1375 	free(rtq, M_RTABLE, 0);
1376 }
1377 
/*
 * Return the number of entries currently pending on `rtq'.
 */
unsigned long
rt_timer_queue_count(struct rttimer_queue *rtq)
{
	return (rtq->rtq_count);
}
1383 
1384 void
1385 rt_timer_remove_all(struct rtentry *rt)
1386 {
1387 	struct rttimer	*r;
1388 
1389 	while ((r = LIST_FIRST(&rt->rt_timer)) != NULL) {
1390 		LIST_REMOVE(r, rtt_link);
1391 		TAILQ_REMOVE(&r->rtt_queue->rtq_head, r, rtt_next);
1392 		if (r->rtt_queue->rtq_count > 0)
1393 			r->rtt_queue->rtq_count--;
1394 		else
1395 			printf("rt_timer_remove_all: rtq_count reached 0\n");
1396 		pool_put(&rttimer_pool, r);
1397 	}
1398 }
1399 
1400 int
1401 rt_timer_add(struct rtentry *rt, void (*func)(struct rtentry *,
1402     struct rttimer *), struct rttimer_queue *queue, u_int rtableid)
1403 {
1404 	struct rttimer	*r;
1405 	long		 current_time;
1406 
1407 	current_time = time_uptime;
1408 	rt->rt_rmx.rmx_expire = time_second + queue->rtq_timeout;
1409 
1410 	/*
1411 	 * If there's already a timer with this action, destroy it before
1412 	 * we add a new one.
1413 	 */
1414 	for (r = LIST_FIRST(&rt->rt_timer); r != NULL;
1415 	     r = LIST_NEXT(r, rtt_link)) {
1416 		if (r->rtt_func == func) {
1417 			LIST_REMOVE(r, rtt_link);
1418 			TAILQ_REMOVE(&r->rtt_queue->rtq_head, r, rtt_next);
1419 			if (r->rtt_queue->rtq_count > 0)
1420 				r->rtt_queue->rtq_count--;
1421 			else
1422 				printf("rt_timer_add: rtq_count reached 0\n");
1423 			pool_put(&rttimer_pool, r);
1424 			break;  /* only one per list, so we can quit... */
1425 		}
1426 	}
1427 
1428 	r = pool_get(&rttimer_pool, PR_NOWAIT | PR_ZERO);
1429 	if (r == NULL)
1430 		return (ENOBUFS);
1431 
1432 	r->rtt_rt = rt;
1433 	r->rtt_time = current_time;
1434 	r->rtt_func = func;
1435 	r->rtt_queue = queue;
1436 	r->rtt_tableid = rtableid;
1437 	LIST_INSERT_HEAD(&rt->rt_timer, r, rtt_link);
1438 	TAILQ_INSERT_TAIL(&queue->rtq_head, r, rtt_next);
1439 	r->rtt_queue->rtq_count++;
1440 
1441 	return (0);
1442 }
1443 
1444 struct rtentry *
1445 rt_lookup(struct sockaddr *dst, struct sockaddr *mask, u_int tableid)
1446 {
1447 	struct radix_node_head	*rnh;
1448 
1449 	if ((rnh = rtable_get(tableid, dst->sa_family)) == NULL)
1450 		return (NULL);
1451 
1452 	return ((struct rtentry *)rnh->rnh_lookup(dst, mask, rnh));
1453 }
1454 
1455 /* ARGSUSED */
1456 void
1457 rt_timer_timer(void *arg)
1458 {
1459 	struct timeout		*to = (struct timeout *)arg;
1460 	struct rttimer_queue	*rtq;
1461 	struct rttimer		*r;
1462 	long			 current_time;
1463 	int			 s;
1464 
1465 	current_time = time_uptime;
1466 
1467 	s = splsoftnet();
1468 	for (rtq = LIST_FIRST(&rttimer_queue_head); rtq != NULL;
1469 	     rtq = LIST_NEXT(rtq, rtq_link)) {
1470 		while ((r = TAILQ_FIRST(&rtq->rtq_head)) != NULL &&
1471 		    (r->rtt_time + rtq->rtq_timeout) < current_time) {
1472 			LIST_REMOVE(r, rtt_link);
1473 			TAILQ_REMOVE(&rtq->rtq_head, r, rtt_next);
1474 			RTTIMER_CALLOUT(r);
1475 			pool_put(&rttimer_pool, r);
1476 			if (rtq->rtq_count > 0)
1477 				rtq->rtq_count--;
1478 			else
1479 				printf("rt_timer_timer: rtq_count reached 0\n");
1480 		}
1481 	}
1482 	splx(s);
1483 
1484 	timeout_add_sec(to, 1);
1485 }
1486 
/*
 * Look up (or create) the numeric id of route label `name' and take a
 * reference on it.  Returns 0 for the empty name, on allocation
 * failure, or when all ids up to LABELID_MAX are taken.
 */
u_int16_t
rtlabel_name2id(char *name)
{
	struct rt_label		*label, *p = NULL;
	u_int16_t		 new_id = 1;

	if (!name[0])
		return (0);

	TAILQ_FOREACH(label, &rt_labels, rtl_entry)
		if (strcmp(name, label->rtl_name) == 0) {
			label->rtl_ref++;
			return (label->rtl_id);
		}

	/*
	 * to avoid fragmentation, we do a linear search from the beginning
	 * and take the first free slot we find. if there is none or the list
	 * is empty, append a new entry at the end.
	 */

	/*
	 * The list is kept sorted by id; walk the consecutive ids from 1
	 * and stop at the first gap (p then points past the gap).
	 */
	if (!TAILQ_EMPTY(&rt_labels))
		for (p = TAILQ_FIRST(&rt_labels); p != NULL &&
		    p->rtl_id == new_id; p = TAILQ_NEXT(p, rtl_entry))
			new_id = p->rtl_id + 1;

	if (new_id > LABELID_MAX)
		return (0);

	label = malloc(sizeof(*label), M_TEMP, M_NOWAIT|M_ZERO);
	if (label == NULL)
		return (0);
	strlcpy(label->rtl_name, name, sizeof(label->rtl_name));
	label->rtl_id = new_id;
	label->rtl_ref++;

	if (p != NULL)	/* insert new entry before p */
		TAILQ_INSERT_BEFORE(p, label, rtl_entry);
	else		/* either list empty or no free slot in between */
		TAILQ_INSERT_TAIL(&rt_labels, label, rtl_entry);

	return (label->rtl_id);
}
1530 
1531 const char *
1532 rtlabel_id2name(u_int16_t id)
1533 {
1534 	struct rt_label	*label;
1535 
1536 	TAILQ_FOREACH(label, &rt_labels, rtl_entry)
1537 		if (label->rtl_id == id)
1538 			return (label->rtl_name);
1539 
1540 	return (NULL);
1541 }
1542 
1543 struct sockaddr *
1544 rtlabel_id2sa(u_int16_t labelid, struct sockaddr_rtlabel *sa_rl)
1545 {
1546 	const char	*label;
1547 
1548 	if (labelid == 0 || (label = rtlabel_id2name(labelid)) == NULL)
1549 		return (NULL);
1550 
1551 	bzero(sa_rl, sizeof(*sa_rl));
1552 	sa_rl->sr_len = sizeof(*sa_rl);
1553 	sa_rl->sr_family = AF_UNSPEC;
1554 	strlcpy(sa_rl->sr_label, label, sizeof(sa_rl->sr_label));
1555 
1556 	return ((struct sockaddr *)sa_rl);
1557 }
1558 
1559 void
1560 rtlabel_unref(u_int16_t id)
1561 {
1562 	struct rt_label	*p, *next;
1563 
1564 	if (id == 0)
1565 		return;
1566 
1567 	for (p = TAILQ_FIRST(&rt_labels); p != NULL; p = next) {
1568 		next = TAILQ_NEXT(p, rtl_entry);
1569 		if (id == p->rtl_id) {
1570 			if (--p->rtl_ref == 0) {
1571 				TAILQ_REMOVE(&rt_labels, p, rtl_entry);
1572 				free(p, M_TEMP, 0);
1573 			}
1574 			break;
1575 		}
1576 	}
1577 }
1578 
1579 void
1580 rt_if_remove(struct ifnet *ifp)
1581 {
1582 	int			 i;
1583 	u_int			 tid;
1584 	struct radix_node_head	*rnh;
1585 
1586 	for (tid = 0; tid <= rtbl_id_max; tid++) {
1587 		for (i = 1; i <= AF_MAX; i++) {
1588 			if ((rnh = rtable_get(tid, i)) != NULL)
1589 				while ((*rnh->rnh_walktree)(rnh,
1590 				    rt_if_remove_rtdelete, ifp) == EAGAIN)
1591 					;	/* nothing */
1592 		}
1593 	}
1594 }
1595 
1596 /*
1597  * Note that deleting a RTF_CLONING route can trigger the
1598  * deletion of more entries, so we need to cancel the walk
1599  * and return EAGAIN.  The caller should restart the walk
1600  * as long as EAGAIN is returned.
1601  */
1602 int
1603 rt_if_remove_rtdelete(struct radix_node *rn, void *vifp, u_int id)
1604 {
1605 	struct ifnet	*ifp = vifp;
1606 	struct rtentry	*rt = (struct rtentry *)rn;
1607 
1608 	if (rt->rt_ifp == ifp) {
1609 		int	cloning = (rt->rt_flags & RTF_CLONING);
1610 
1611 		if (rtdeletemsg(rt, id) == 0 && cloning)
1612 			return (EAGAIN);
1613 	}
1614 
1615 	/*
1616 	 * XXX There should be no need to check for rt_ifa belonging to this
1617 	 * interface, because then rt_ifp is set, right?
1618 	 */
1619 
1620 	return (0);
1621 }
1622 
1623 #ifndef SMALL_KERNEL
1624 void
1625 rt_if_track(struct ifnet *ifp)
1626 {
1627 	struct radix_node_head *rnh;
1628 	int i;
1629 	u_int tid;
1630 
1631 	if (rt_tables == NULL)
1632 		return;
1633 
1634 	for (tid = 0; tid <= rtbl_id_max; tid++) {
1635 		for (i = 1; i <= AF_MAX; i++) {
1636 			if ((rnh = rtable_get(tid, i)) != NULL) {
1637 				if (!rn_mpath_capable(rnh))
1638 					continue;
1639 				while ((*rnh->rnh_walktree)(rnh,
1640 				    rt_if_linkstate_change, ifp) == EAGAIN)
1641 					;	/* nothing */
1642 			}
1643 		}
1644 	}
1645 }
1646 
1647 int
1648 rt_if_linkstate_change(struct radix_node *rn, void *arg, u_int id)
1649 {
1650 	struct ifnet *ifp = arg;
1651 	struct rtentry *rt = (struct rtentry *)rn;
1652 
1653 	if (rt->rt_ifp == ifp) {
1654 		if (LINK_STATE_IS_UP(ifp->if_link_state) &&
1655 		    ifp->if_flags & IFF_UP) {
1656 			if (!(rt->rt_flags & RTF_UP)) {
1657 				/* bring route up */
1658 				rt->rt_flags |= RTF_UP;
1659 				rn_mpath_reprio(rn, rt->rt_priority & RTP_MASK);
1660 			}
1661 		} else {
1662 			if (rt->rt_flags & RTF_UP) {
1663 				/* take route down */
1664 				rt->rt_flags &= ~RTF_UP;
1665 				rn_mpath_reprio(rn, rt->rt_priority | RTP_DOWN);
1666 			}
1667 		}
1668 		if_group_routechange(rt_key(rt), rt_mask(rt));
1669 	}
1670 
1671 	return (0);
1672 }
1673 #endif
1674