xref: /netbsd-src/sys/net/route.c (revision f3cfa6f6ce31685c6c4a758bc430e69eb99f50a4)
1 /*	$NetBSD: route.c,v 1.219 2019/05/17 03:34:26 ozaki-r Exp $	*/
2 
3 /*-
4  * Copyright (c) 1998, 2008 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Kevin M. Lahey of the Numerical Aerospace Simulation Facility,
9  * NASA Ames Research Center.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 /*
34  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
35  * All rights reserved.
36  *
37  * Redistribution and use in source and binary forms, with or without
38  * modification, are permitted provided that the following conditions
39  * are met:
40  * 1. Redistributions of source code must retain the above copyright
41  *    notice, this list of conditions and the following disclaimer.
42  * 2. Redistributions in binary form must reproduce the above copyright
43  *    notice, this list of conditions and the following disclaimer in the
44  *    documentation and/or other materials provided with the distribution.
45  * 3. Neither the name of the project nor the names of its contributors
46  *    may be used to endorse or promote products derived from this software
47  *    without specific prior written permission.
48  *
49  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59  * SUCH DAMAGE.
60  */
61 
62 /*
63  * Copyright (c) 1980, 1986, 1991, 1993
64  *	The Regents of the University of California.  All rights reserved.
65  *
66  * Redistribution and use in source and binary forms, with or without
67  * modification, are permitted provided that the following conditions
68  * are met:
69  * 1. Redistributions of source code must retain the above copyright
70  *    notice, this list of conditions and the following disclaimer.
71  * 2. Redistributions in binary form must reproduce the above copyright
72  *    notice, this list of conditions and the following disclaimer in the
73  *    documentation and/or other materials provided with the distribution.
74  * 3. Neither the name of the University nor the names of its contributors
75  *    may be used to endorse or promote products derived from this software
76  *    without specific prior written permission.
77  *
78  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
79  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
80  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
81  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
82  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
83  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
84  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
85  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
86  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
87  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
88  * SUCH DAMAGE.
89  *
90  *	@(#)route.c	8.3 (Berkeley) 1/9/95
91  */
92 
93 #ifdef _KERNEL_OPT
94 #include "opt_inet.h"
95 #include "opt_route.h"
96 #include "opt_net_mpsafe.h"
97 #endif
98 
99 #include <sys/cdefs.h>
100 __KERNEL_RCSID(0, "$NetBSD: route.c,v 1.219 2019/05/17 03:34:26 ozaki-r Exp $");
101 
102 #include <sys/param.h>
103 #ifdef RTFLUSH_DEBUG
104 #include <sys/sysctl.h>
105 #endif
106 #include <sys/systm.h>
107 #include <sys/callout.h>
108 #include <sys/proc.h>
109 #include <sys/mbuf.h>
110 #include <sys/socket.h>
111 #include <sys/socketvar.h>
112 #include <sys/domain.h>
113 #include <sys/kernel.h>
114 #include <sys/ioctl.h>
115 #include <sys/pool.h>
116 #include <sys/kauth.h>
117 #include <sys/workqueue.h>
118 #include <sys/syslog.h>
119 #include <sys/rwlock.h>
120 #include <sys/mutex.h>
121 #include <sys/cpu.h>
122 
123 #include <net/if.h>
124 #include <net/if_dl.h>
125 #include <net/route.h>
126 #if defined(INET) || defined(INET6)
127 #include <net/if_llatbl.h>
128 #endif
129 
130 #include <netinet/in.h>
131 #include <netinet/in_var.h>
132 
/* Route flags preserved across an RTM_CHANGE-style update. */
#define	PRESERVED_RTF	(RTF_UP | RTF_GATEWAY | RTF_HOST | RTF_DONE | RTF_MASK)

/* rtcache_debug() gates the rtcache printf diagnostics below. */
#ifdef RTFLUSH_DEBUG
#define	rtcache_debug() __predict_false(_rtcache_debug)
#else /* RTFLUSH_DEBUG */
#define	rtcache_debug() 0
#endif /* RTFLUSH_DEBUG */

/* Trace a rtentry's refcnt at the call site (RT_DEBUG kernels only). */
#ifdef RT_DEBUG
#define RT_REFCNT_TRACE(rt)	printf("%s:%d: rt=%p refcnt=%d\n", \
				    __func__, __LINE__, (rt), (rt)->rt_refcnt)
#else
#define RT_REFCNT_TRACE(rt)	do {} while (0)
#endif

/* Debug logging that compiles away entirely on non-RT_DEBUG kernels. */
#ifdef RT_DEBUG
#define dlog(level, fmt, args...)	log(level, fmt, ##args)
#else
#define dlog(level, fmt, args...)	do {} while (0)
#endif
153 
/* Global routing statistics (rts_unreach, rts_dynamic, ...). */
struct rtstat		rtstat;

static int		rttrash;	/* routes not in table but not freed */

/* Backing pools for rtentry and rttimer allocations (see rt_init()). */
static struct pool	rtentry_pool;
static struct pool	rttimer_pool;

static struct callout	rt_timer_ch; /* callout for rt_timer_timer() */
static struct workqueue	*rt_timer_wq;
static struct work	rt_timer_wk;

/* Route-timer machinery; definitions are further down in this file. */
static void	rt_timer_init(void);
static void	rt_timer_queue_remove_all(struct rttimer_queue *);
static void	rt_timer_remove_all(struct rtentry *);
static void	rt_timer_timer(void *);
169 
170 /*
171  * Locking notes:
172  * - The routing table is protected by a global rwlock
173  *   - API: RT_RLOCK and friends
174  * - rtcaches are NOT protected by the framework
175  *   - Callers must guarantee a rtcache isn't accessed simultaneously
 *     - How the constraint is guaranteed in the wild
177  *     - Protect a rtcache by a mutex (e.g., inp_route)
178  *     - Make rtcache per-CPU and allow only accesses from softint
179  *       (e.g., ipforward_rt_percpu)
 * - References to a rtentry are managed by reference counting and psref
 *   - Reference counting is used for a temporary reference when a rtentry
 *     is fetched from the routing table
 *   - psref is used for a temporary reference when a rtentry is fetched
 *     from a rtcache
185  *     - struct route (rtcache) has struct psref, so we cannot obtain
186  *       a reference twice on the same struct route
 *   - Before destroying or updating a rtentry, we have to wait for
 *     all remaining references to be released (see below for details)
189  *   - APIs
190  *     - An obtained rtentry via rtalloc1 or rtrequest* must be
191  *       unreferenced by rt_unref
192  *     - An obtained rtentry via rtcache_* must be unreferenced by
193  *       rtcache_unref
194  *   - TODO: once we get a lockless routing table, we should use only
195  *           psref for rtentries
196  * - rtentry destruction
197  *   - A rtentry is destroyed (freed) only when we call rtrequest(RTM_DELETE)
198  *   - If a caller of rtrequest grabs a reference of a rtentry, the caller
199  *     has a responsibility to destroy the rtentry by itself by calling
200  *     rt_free
201  *     - If not, rtrequest itself does that
202  *   - If rt_free is called in softint, the actual destruction routine is
203  *     deferred to a workqueue
204  * - rtentry update
205  *   - When updating a rtentry, RTF_UPDATING flag is set
206  *   - If a rtentry is set RTF_UPDATING, fetching the rtentry from
207  *     the routing table or a rtcache results in either of the following
208  *     cases:
209  *     - if the caller runs in softint, the caller fails to fetch
210  *     - otherwise, the caller waits for the update completed and retries
211  *       to fetch (probably succeed to fetch for the second time)
212  * - rtcache invalidation
213  *   - There is a global generation counter that is incremented when
214  *     any routes have been added or deleted
215  *   - When a rtcache caches a rtentry into itself, it also stores
216  *     a snapshot of the generation counter
217  *   - If the snapshot equals to the global counter, the cache is valid,
218  *     otherwise the cache is invalidated
219  */
220 
/*
 * Global lock for the routing table.
 */
static krwlock_t		rt_lock __cacheline_aligned;
#ifdef NET_MPSAFE
#define RT_RLOCK()		rw_enter(&rt_lock, RW_READER)
#define RT_WLOCK()		rw_enter(&rt_lock, RW_WRITER)
#define RT_UNLOCK()		rw_exit(&rt_lock)
#define RT_WLOCKED()		rw_write_held(&rt_lock)
#define	RT_ASSERT_WLOCK()	KASSERT(rw_write_held(&rt_lock))
#else
/* Non-NET_MPSAFE kernels rely on softnet serialization; no-op the lock. */
#define RT_RLOCK()		do {} while (0)
#define RT_WLOCK()		do {} while (0)
#define RT_UNLOCK()		do {} while (0)
#define RT_WLOCKED()		true
#define	RT_ASSERT_WLOCK()	do {} while (0)
#endif

/* Bumped by rtcache_invalidate(); rtcaches compare their snapshot to it. */
static uint64_t rtcache_generation;

/*
 * mutex and cv that are used to wait for references to a rtentry left
 * before updating the rtentry.
 */
static struct {
	kmutex_t		lock;
	kcondvar_t		cv;
	bool			ongoing;	/* an update is in progress */
	const struct lwp	*lwp;	/* updater; lets it skip rt_update_wait */
} rt_update_global __cacheline_aligned;

/*
 * A workqueue and stuff that are used to defer the destruction routine
 * of rtentries.
 */
static struct {
	struct workqueue	*wq;
	struct work		wk;
	kmutex_t		lock;
	SLIST_HEAD(, rtentry)	queue;	/* rtentries pending _rt_free() */
	bool			enqueued;	/* wk already on wq */
} rt_free_global __cacheline_aligned;
263 
/* psref for rtentry */
static struct psref_class *rt_psref_class __read_mostly;

#ifdef RTFLUSH_DEBUG
static int _rtcache_debug = 0;	/* toggled via net.rtcache.debug sysctl */
#endif /* RTFLUSH_DEBUG */

/* kauth listener that authorizes routing socket requests. */
static kauth_listener_t route_listener;

static int rtdeletemsg(struct rtentry *);

static void rt_maskedcopy(const struct sockaddr *,
    struct sockaddr *, const struct sockaddr *);

static void rtcache_invalidate(void);

static void rt_ref(struct rtentry *);

static struct rtentry *
    rtalloc1_locked(const struct sockaddr *, int, bool, bool);

static struct ifaddr *rt_getifa(struct rt_addrinfo *, struct psref *);
static struct ifnet *rt_getifp(struct rt_addrinfo *, struct psref *);
static struct ifaddr *ifa_ifwithroute_psref(int, const struct sockaddr *,
    const struct sockaddr *, struct psref *);

static void rtcache_ref(struct rtentry *, struct route *);

#ifdef NET_MPSAFE
static void rt_update_wait(void);
#endif

static bool rt_wait_ok(void);
static void rt_wait_refcnt(const char *, struct rtentry *, int);
static void rt_wait_psref(struct rtentry *);

#ifdef DDB
static void db_print_sa(const struct sockaddr *);
static void db_print_ifa(struct ifaddr *);
static int db_show_rtentry(struct rtentry *, void *);
#endif
305 
#ifdef RTFLUSH_DEBUG
static void sysctl_net_rtcache_setup(struct sysctllog **);
/*
 * Create the net.rtcache sysctl subtree and its "debug" knob, which
 * toggles _rtcache_debug at run time.  Errors are silently ignored;
 * the kernel simply runs without the knob.
 */
static void
sysctl_net_rtcache_setup(struct sysctllog **clog)
{
	const struct sysctlnode *node = NULL;
	int rc;

	rc = sysctl_createv(clog, 0, NULL, &node, CTLFLAG_PERMANENT,
	    CTLTYPE_NODE,
	    "rtcache", SYSCTL_DESCR("Route cache related settings"),
	    NULL, 0, NULL, 0, CTL_NET, CTL_CREATE, CTL_EOL);
	if (rc != 0)
		return;
	(void)sysctl_createv(clog, 0, &node, &node,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT,
	    "debug", SYSCTL_DESCR("Debug route caches"),
	    NULL, 0, &_rtcache_debug, 0, CTL_CREATE, CTL_EOL);
}
#endif /* RTFLUSH_DEBUG */
325 
326 static inline void
327 rt_destroy(struct rtentry *rt)
328 {
329 	if (rt->_rt_key != NULL)
330 		sockaddr_free(rt->_rt_key);
331 	if (rt->rt_gateway != NULL)
332 		sockaddr_free(rt->rt_gateway);
333 	if (rt_gettag(rt) != NULL)
334 		sockaddr_free(rt_gettag(rt));
335 	rt->_rt_key = rt->rt_gateway = rt->rt_tag = NULL;
336 }
337 
338 static inline const struct sockaddr *
339 rt_setkey(struct rtentry *rt, const struct sockaddr *key, int flags)
340 {
341 	if (rt->_rt_key == key)
342 		goto out;
343 
344 	if (rt->_rt_key != NULL)
345 		sockaddr_free(rt->_rt_key);
346 	rt->_rt_key = sockaddr_dup(key, flags);
347 out:
348 	rt->rt_nodes->rn_key = (const char *)rt->_rt_key;
349 	return rt->_rt_key;
350 }
351 
352 struct ifaddr *
353 rt_get_ifa(struct rtentry *rt)
354 {
355 	struct ifaddr *ifa;
356 
357 	if ((ifa = rt->rt_ifa) == NULL)
358 		return ifa;
359 	else if (ifa->ifa_getifa == NULL)
360 		return ifa;
361 #if 0
362 	else if (ifa->ifa_seqno != NULL && *ifa->ifa_seqno == rt->rt_ifa_seqno)
363 		return ifa;
364 #endif
365 	else {
366 		ifa = (*ifa->ifa_getifa)(ifa, rt_getkey(rt));
367 		if (ifa == NULL)
368 			return NULL;
369 		rt_replace_ifa(rt, ifa);
370 		return ifa;
371 	}
372 }
373 
/*
 * Point the route at the given ifaddr and record the ifaddr's sequence
 * number (if the family maintains one) for later staleness checks.
 * The caller is responsible for the ifaddr reference (see rt_set_ifa).
 */
static void
rt_set_ifa1(struct rtentry *rt, struct ifaddr *ifa)
{
	rt->rt_ifa = ifa;
	if (ifa->ifa_seqno != NULL)
		rt->rt_ifa_seqno = *ifa->ifa_seqno;
}
381 
382 /*
383  * Is this route the connected route for the ifa?
384  */
385 static int
386 rt_ifa_connected(const struct rtentry *rt, const struct ifaddr *ifa)
387 {
388 	const struct sockaddr *key, *dst, *odst;
389 	struct sockaddr_storage maskeddst;
390 
391 	key = rt_getkey(rt);
392 	dst = rt->rt_flags & RTF_HOST ? ifa->ifa_dstaddr : ifa->ifa_addr;
393 	if (dst == NULL ||
394 	    dst->sa_family != key->sa_family ||
395 	    dst->sa_len != key->sa_len)
396 		return 0;
397 	if ((rt->rt_flags & RTF_HOST) == 0 && ifa->ifa_netmask) {
398 		odst = dst;
399 		dst = (struct sockaddr *)&maskeddst;
400 		rt_maskedcopy(odst, (struct sockaddr *)&maskeddst,
401 		    ifa->ifa_netmask);
402 	}
403 	return (memcmp(dst, key, dst->sa_len) == 0);
404 }
405 
406 void
407 rt_replace_ifa(struct rtentry *rt, struct ifaddr *ifa)
408 {
409 	struct ifaddr *old;
410 
411 	if (rt->rt_ifa == ifa)
412 		return;
413 
414 	if (rt->rt_ifa &&
415 	    rt->rt_ifa != ifa &&
416 	    rt->rt_ifa->ifa_flags & IFA_ROUTE &&
417 	    rt_ifa_connected(rt, rt->rt_ifa))
418 	{
419 		RT_DPRINTF("rt->_rt_key = %p, ifa = %p, "
420 		    "replace deleted IFA_ROUTE\n",
421 		    (void *)rt->_rt_key, (void *)rt->rt_ifa);
422 		rt->rt_ifa->ifa_flags &= ~IFA_ROUTE;
423 		if (rt_ifa_connected(rt, ifa)) {
424 			RT_DPRINTF("rt->_rt_key = %p, ifa = %p, "
425 			    "replace added IFA_ROUTE\n",
426 			    (void *)rt->_rt_key, (void *)ifa);
427 			ifa->ifa_flags |= IFA_ROUTE;
428 		}
429 	}
430 
431 	ifaref(ifa);
432 	old = rt->rt_ifa;
433 	rt_set_ifa1(rt, ifa);
434 	ifafree(old);
435 }
436 
/*
 * Attach an ifaddr to the route, taking a reference on it first.
 */
static void
rt_set_ifa(struct rtentry *rt, struct ifaddr *ifa)
{
	ifaref(ifa);
	rt_set_ifa1(rt, ifa);
}
443 
444 static int
445 route_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
446     void *arg0, void *arg1, void *arg2, void *arg3)
447 {
448 	struct rt_msghdr *rtm;
449 	int result;
450 
451 	result = KAUTH_RESULT_DEFER;
452 	rtm = arg1;
453 
454 	if (action != KAUTH_NETWORK_ROUTE)
455 		return result;
456 
457 	if (rtm->rtm_type == RTM_GET)
458 		result = KAUTH_RESULT_ALLOW;
459 
460 	return result;
461 }
462 
static void rt_free_work(struct work *, void *);

/*
 * One-time initialization of the routing subsystem: deferred-free
 * machinery, psref class, update-wait cv, backing pools, the radix
 * tree, and the kauth listener for routing requests.
 */
void
rt_init(void)
{
	int error;

#ifdef RTFLUSH_DEBUG
	sysctl_net_rtcache_setup(NULL);
#endif

	/* Deferred destruction of rtentries freed from softint context. */
	mutex_init(&rt_free_global.lock, MUTEX_DEFAULT, IPL_SOFTNET);
	SLIST_INIT(&rt_free_global.queue);
	rt_free_global.enqueued = false;

	rt_psref_class = psref_class_create("rtentry", IPL_SOFTNET);

	error = workqueue_create(&rt_free_global.wq, "rt_free",
	    rt_free_work, NULL, PRI_SOFTNET, IPL_SOFTNET, WQ_MPSAFE);
	if (error)
		panic("%s: workqueue_create failed (%d)\n", __func__, error);

	/* Serializes rtentry updates (see rt_update_prepare/finish). */
	mutex_init(&rt_update_global.lock, MUTEX_DEFAULT, IPL_SOFTNET);
	cv_init(&rt_update_global.cv, "rt_update");

	pool_init(&rtentry_pool, sizeof(struct rtentry), 0, 0, 0, "rtentpl",
	    NULL, IPL_SOFTNET);
	pool_init(&rttimer_pool, sizeof(struct rttimer), 0, 0, 0, "rttmrpl",
	    NULL, IPL_SOFTNET);

	rn_init();	/* initialize all zeroes, all ones, mask table */
	rtbl_init();

	route_listener = kauth_listen_scope(KAUTH_SCOPE_NETWORK,
	    route_listener_cb, NULL);
}
499 
/*
 * Invalidate all rtcaches by bumping the global generation counter;
 * caches holding an older snapshot will miss on their next lookup.
 * Must be called with the routing table write-locked.
 */
static void
rtcache_invalidate(void)
{

	RT_ASSERT_WLOCK();

	if (rtcache_debug())
		printf("%s: enter\n", __func__);

	rtcache_generation++;
}
511 
#ifdef RT_DEBUG
/*
 * Log a one-line summary of a rtentry (destination, gateway, flags,
 * interface) at LOG_DEBUG for RT_DEBUG kernels.
 */
static void
dump_rt(const struct rtentry *rt)
{
	char buf[512];

	log(LOG_DEBUG, "rt: ");
	log(LOG_DEBUG, "p=%p ", rt);
	if (rt->_rt_key == NULL) {
		log(LOG_DEBUG, "dst=(NULL) ");
	} else {
		sockaddr_format(rt->_rt_key, buf, sizeof(buf));
		log(LOG_DEBUG, "dst=%s ", buf);
	}
	if (rt->rt_gateway == NULL) {
		log(LOG_DEBUG, "gw=(NULL) ");
	} else {
		/*
		 * Fix a copy-paste bug: format the gateway sockaddr here,
		 * not _rt_key, so "gw=" no longer echoes the destination.
		 */
		sockaddr_format(rt->rt_gateway, buf, sizeof(buf));
		log(LOG_DEBUG, "gw=%s ", buf);
	}
	log(LOG_DEBUG, "flags=%x ", rt->rt_flags);
	if (rt->rt_ifp == NULL) {
		log(LOG_DEBUG, "if=(NULL) ");
	} else {
		log(LOG_DEBUG, "if=%s ", rt->rt_ifp->if_xname);
	}
	log(LOG_DEBUG, "\n");
}
#endif /* RT_DEBUG */
541 
/*
 * Packet routing routines. If success, refcnt of a returned rtentry
 * will be incremented. The caller has to rtfree it by itself.
 *
 * Look up the route matching dst with the routing table lock already
 * held (reader or writer per 'wlock').  'report' sends an RTM_MISS
 * routing-socket message on failure; 'wait_ok' allows sleeping for a
 * concurrent rtentry update to finish on NET_MPSAFE kernels.
 */
struct rtentry *
rtalloc1_locked(const struct sockaddr *dst, int report, bool wait_ok,
    bool wlock)
{
	rtbl_t *rtbl;
	struct rtentry *rt;
	int s;

#ifdef NET_MPSAFE
retry:
#endif
	s = splsoftnet();
	rtbl = rt_gettable(dst->sa_family);
	if (rtbl == NULL)
		goto miss;

	rt = rt_matchaddr(rtbl, dst);
	if (rt == NULL)
		goto miss;

	/* A downed route must not be handed out. */
	if (!ISSET(rt->rt_flags, RTF_UP))
		goto miss;

#ifdef NET_MPSAFE
	/*
	 * The entry is being updated by someone else.  Unless we are the
	 * updater ourselves, drop the lock, wait for the update to finish,
	 * then retake the lock in the mode the caller asked for and retry.
	 * Softint callers (or wait_ok == false) cannot sleep and just miss.
	 */
	if (ISSET(rt->rt_flags, RTF_UPDATING) &&
	    /* XXX updater should be always able to acquire */
	    curlwp != rt_update_global.lwp) {
		if (!wait_ok || !rt_wait_ok())
			goto miss;
		RT_UNLOCK();
		splx(s);

		/* We can wait until the update is complete */
		rt_update_wait();

		if (wlock)
			RT_WLOCK();
		else
			RT_RLOCK();
		goto retry;
	}
#endif /* NET_MPSAFE */

	rt_ref(rt);
	RT_REFCNT_TRACE(rt);

	splx(s);
	return rt;
miss:
	rtstat.rts_unreach++;
	if (report) {
		struct rt_addrinfo info;

		memset(&info, 0, sizeof(info));
		info.rti_info[RTAX_DST] = dst;
		rt_missmsg(RTM_MISS, &info, 0, 0);
	}
	splx(s);
	return NULL;
}
606 
607 struct rtentry *
608 rtalloc1(const struct sockaddr *dst, int report)
609 {
610 	struct rtentry *rt;
611 
612 	RT_RLOCK();
613 	rt = rtalloc1_locked(dst, report, true, false);
614 	RT_UNLOCK();
615 
616 	return rt;
617 }
618 
/*
 * Take a reference on a rtentry.  Pairs with rt_unref().
 */
static void
rt_ref(struct rtentry *rt)
{

	KASSERT(rt->rt_refcnt >= 0);
	atomic_inc_uint(&rt->rt_refcnt);
}
626 
/*
 * Drop a reference on a rtentry.  If the entry is being torn down
 * (!RTF_UP) or updated (RTF_UPDATING), wake up anyone blocked in
 * rt_wait_refcnt() waiting for references to drain.
 */
void
rt_unref(struct rtentry *rt)
{

	KASSERT(rt != NULL);
	KASSERTMSG(rt->rt_refcnt > 0, "refcnt=%d", rt->rt_refcnt);

	atomic_dec_uint(&rt->rt_refcnt);
	if (!ISSET(rt->rt_flags, RTF_UP) || ISSET(rt->rt_flags, RTF_UPDATING)) {
		/* The lock orders the wakeup against the waiter's cv_wait. */
		mutex_enter(&rt_free_global.lock);
		cv_broadcast(&rt->rt_cv);
		mutex_exit(&rt_free_global.lock);
	}
}
641 
642 static bool
643 rt_wait_ok(void)
644 {
645 
646 	KASSERT(!cpu_intr_p());
647 	return !cpu_softintr_p();
648 }
649 
/*
 * Block until the rtentry's reference count drops to 'cnt'.
 * Woken by the cv_broadcast in rt_unref(); 'title' tags the debug logs.
 */
void
rt_wait_refcnt(const char *title, struct rtentry *rt, int cnt)
{
	mutex_enter(&rt_free_global.lock);
	while (rt->rt_refcnt > cnt) {
		dlog(LOG_DEBUG, "%s: %s waiting (refcnt=%d)\n",
		    __func__, title, rt->rt_refcnt);
		cv_wait(&rt->rt_cv, &rt_free_global.lock);
		dlog(LOG_DEBUG, "%s: %s waited (refcnt=%d)\n",
		    __func__, title, rt->rt_refcnt);
	}
	mutex_exit(&rt_free_global.lock);
}
663 
/*
 * Wait for all psref holders of the rtentry to drain, then re-arm the
 * psref target so the entry can be handed out again after an update.
 */
void
rt_wait_psref(struct rtentry *rt)
{

	psref_target_destroy(&rt->rt_psref, rt_psref_class);
	psref_target_init(&rt->rt_psref, rt_psref_class);
}
671 
/*
 * Actually destroy a rtentry: wait out any remaining references and
 * psref holders, detach the ifaddr/ifp, and return the entry to the
 * pool.  Must be called from a context that may sleep.
 */
static void
_rt_free(struct rtentry *rt)
{
	struct ifaddr *ifa;

	/*
	 * Need to avoid a deadlock on rt_wait_refcnt of update
	 * and a conflict on psref_target_destroy of update.
	 */
#ifdef NET_MPSAFE
	rt_update_wait();
#endif

	RT_REFCNT_TRACE(rt);
	KASSERTMSG(rt->rt_refcnt >= 0, "refcnt=%d", rt->rt_refcnt);
	rt_wait_refcnt("free", rt, 0);
#ifdef NET_MPSAFE
	psref_target_destroy(&rt->rt_psref, rt_psref_class);
#endif

	rt_assert_inactive(rt);
	rttrash--;		/* it is no longer an off-table zombie */
	ifa = rt->rt_ifa;
	rt->rt_ifa = NULL;
	ifafree(ifa);
	rt->rt_ifp = NULL;
	cv_destroy(&rt->rt_cv);
	rt_destroy(rt);
	pool_put(&rtentry_pool, rt);
}
702 
/*
 * Workqueue handler that drains rt_free_global.queue, destroying each
 * deferred rtentry.  The reference that rt_free() left in place is
 * dropped here, just before _rt_free().
 */
static void
rt_free_work(struct work *wk, void *arg)
{

	for (;;) {
		struct rtentry *rt;

		mutex_enter(&rt_free_global.lock);
		if ((rt = SLIST_FIRST(&rt_free_global.queue)) == NULL) {
			/* Queue drained; allow a new enqueue of the work. */
			rt_free_global.enqueued = false;
			mutex_exit(&rt_free_global.lock);
			return;
		}
		SLIST_REMOVE_HEAD(&rt_free_global.queue, rt_free);
		mutex_exit(&rt_free_global.lock);
		atomic_dec_uint(&rt->rt_refcnt);
		_rt_free(rt);
	}
}
722 
/*
 * Release the caller's reference and destroy the rtentry.  If the
 * current context cannot sleep (softint), the destruction is deferred
 * to the rt_free workqueue instead, keeping the reference alive until
 * rt_free_work() drops it.
 */
void
rt_free(struct rtentry *rt)
{

	KASSERT(rt->rt_refcnt > 0);
	if (rt_wait_ok()) {
		atomic_dec_uint(&rt->rt_refcnt);
		_rt_free(rt);
		return;
	}

	mutex_enter(&rt_free_global.lock);
	/* No need to add a reference here. */
	SLIST_INSERT_HEAD(&rt_free_global.queue, rt, rt_free);
	if (!rt_free_global.enqueued) {
		workqueue_enqueue(rt_free_global.wq, &rt_free_global.wk, NULL);
		rt_free_global.enqueued = true;
	}
	mutex_exit(&rt_free_global.lock);
}
743 
#ifdef NET_MPSAFE
/*
 * Sleep until no rtentry update is in progress.  Woken by the
 * cv_broadcast in rt_update_finish().
 */
static void
rt_update_wait(void)
{

	mutex_enter(&rt_update_global.lock);
	while (rt_update_global.ongoing) {
		dlog(LOG_DEBUG, "%s: waiting lwp=%p\n", __func__, curlwp);
		cv_wait(&rt_update_global.cv, &rt_update_global.lock);
		dlog(LOG_DEBUG, "%s: waited lwp=%p\n", __func__, curlwp);
	}
	mutex_exit(&rt_update_global.lock);
}
#endif
758 
/*
 * Begin updating a rtentry: mark it RTF_UPDATING so new lookups stall
 * or miss (see rtalloc1_locked), become the single global updater, and
 * drain existing references/psrefs down to the caller's own.
 * Returns ESRCH if the entry is already being destroyed (!RTF_UP).
 * Must be paired with rt_update_finish().
 */
int
rt_update_prepare(struct rtentry *rt)
{

	dlog(LOG_DEBUG, "%s: updating rt=%p lwp=%p\n", __func__, rt, curlwp);

	RT_WLOCK();
	/* If the entry is being destroyed, don't proceed the update. */
	if (!ISSET(rt->rt_flags, RTF_UP)) {
		RT_UNLOCK();
		return ESRCH;
	}
	rt->rt_flags |= RTF_UPDATING;
	RT_UNLOCK();

	/* Only one update may be in flight at a time, globally. */
	mutex_enter(&rt_update_global.lock);
	while (rt_update_global.ongoing) {
		dlog(LOG_DEBUG, "%s: waiting ongoing updating rt=%p lwp=%p\n",
		    __func__, rt, curlwp);
		cv_wait(&rt_update_global.cv, &rt_update_global.lock);
		dlog(LOG_DEBUG, "%s: waited ongoing updating rt=%p lwp=%p\n",
		    __func__, rt, curlwp);
	}
	rt_update_global.ongoing = true;
	/* XXX need it to avoid rt_update_wait by updater itself. */
	rt_update_global.lwp = curlwp;
	mutex_exit(&rt_update_global.lock);

	/* Wait down to 1: the caller keeps its own reference. */
	rt_wait_refcnt("update", rt, 1);
	rt_wait_psref(rt);

	return 0;
}
792 
/*
 * End an update started by rt_update_prepare(): clear RTF_UPDATING,
 * release the global updater slot and wake all waiters (both stalled
 * lookups and other would-be updaters).
 */
void
rt_update_finish(struct rtentry *rt)
{

	RT_WLOCK();
	rt->rt_flags &= ~RTF_UPDATING;
	RT_UNLOCK();

	mutex_enter(&rt_update_global.lock);
	rt_update_global.ongoing = false;
	rt_update_global.lwp = NULL;
	cv_broadcast(&rt_update_global.cv);
	mutex_exit(&rt_update_global.lock);

	dlog(LOG_DEBUG, "%s: updated rt=%p lwp=%p\n", __func__, rt, curlwp);
}
809 
/*
 * Force a routing table entry to the specified
 * destination to go through the given gateway.
 * Normally called as a result of a routing redirect
 * message from the network layer.
 *
 * On success, either a new RTF_DYNAMIC host route is created or the
 * existing route's gateway is rewritten (RTF_MODIFIED).  An
 * RTM_REDIRECT routing-socket message is always emitted.  If rtp is
 * non-NULL and no error occurred, the (referenced) route is returned
 * through it and the caller must release it.
 *
 * N.B.: must be called at splsoftnet
 */
void
rtredirect(const struct sockaddr *dst, const struct sockaddr *gateway,
	const struct sockaddr *netmask, int flags, const struct sockaddr *src,
	struct rtentry **rtp)
{
	struct rtentry *rt;
	int error = 0;
	uint64_t *stat = NULL;
	struct rt_addrinfo info;
	struct ifaddr *ifa;
	struct psref psref;

	/* verify the gateway is directly reachable */
	if ((ifa = ifa_ifwithnet_psref(gateway, &psref)) == NULL) {
		error = ENETUNREACH;
		goto out;
	}
	rt = rtalloc1(dst, 0);
	/*
	 * If the redirect isn't from our current router for this dst,
	 * it's either old or wrong.  If it redirects us to ourselves,
	 * we have a routing loop, perhaps as a result of an interface
	 * going down recently.
	 */
	if (!(flags & RTF_DONE) && rt &&
	     (sockaddr_cmp(src, rt->rt_gateway) != 0 || rt->rt_ifa != ifa))
		error = EINVAL;
	else {
		/* Redirecting to one of our own addresses is a loop. */
		int s = pserialize_read_enter();
		struct ifaddr *_ifa;

		_ifa = ifa_ifwithaddr(gateway);
		if (_ifa != NULL)
			error = EHOSTUNREACH;
		pserialize_read_exit(s);
	}
	if (error)
		goto done;
	/*
	 * Create a new entry if we just got back a wildcard entry
	 * or the lookup failed.  This is necessary for hosts
	 * which use routing redirects generated by smart gateways
	 * to dynamically build the routing tables.
	 */
	if (rt == NULL || (rt_mask(rt) && rt_mask(rt)->sa_len < 2))
		goto create;
	/*
	 * Don't listen to the redirect if it's
	 * for a route to an interface.
	 */
	if (rt->rt_flags & RTF_GATEWAY) {
		if (((rt->rt_flags & RTF_HOST) == 0) && (flags & RTF_HOST)) {
			/*
			 * Changing from route to net => route to host.
			 * Create new route, rather than smashing route to net.
			 */
		create:
			if (rt != NULL)
				rt_unref(rt);
			flags |=  RTF_GATEWAY | RTF_DYNAMIC;
			memset(&info, 0, sizeof(info));
			info.rti_info[RTAX_DST] = dst;
			info.rti_info[RTAX_GATEWAY] = gateway;
			info.rti_info[RTAX_NETMASK] = netmask;
			info.rti_ifa = ifa;
			info.rti_flags = flags;
			rt = NULL;
			error = rtrequest1(RTM_ADD, &info, &rt);
			if (rt != NULL)
				flags = rt->rt_flags;
			stat = &rtstat.rts_dynamic;
		} else {
			/*
			 * Smash the current notion of the gateway to
			 * this destination.  Should check about netmask!!!
			 */
#ifdef NET_MPSAFE
			KASSERT(!cpu_softintr_p());

			error = rt_update_prepare(rt);
			if (error == 0) {
#endif
				RT_WLOCK();
				error = rt_setgate(rt, gateway);
				if (error == 0) {
					rt->rt_flags |= RTF_MODIFIED;
					flags |= RTF_MODIFIED;
				}
				RT_UNLOCK();
#ifdef NET_MPSAFE
				rt_update_finish(rt);
			} else {
				/*
				 * If error != 0, the rtentry is being
				 * destroyed, so doing nothing doesn't
				 * matter.
				 */
			}
#endif
			stat = &rtstat.rts_newgateway;
		}
	} else
		error = EHOSTUNREACH;
done:
	if (rt) {
		if (rtp != NULL && !error)
			*rtp = rt;	/* hand the reference to the caller */
		else
			rt_unref(rt);
	}
out:
	if (error)
		rtstat.rts_badredirect++;
	else if (stat != NULL)
		(*stat)++;
	/* Always report the redirect on the routing socket. */
	memset(&info, 0, sizeof(info));
	info.rti_info[RTAX_DST] = dst;
	info.rti_info[RTAX_GATEWAY] = gateway;
	info.rti_info[RTAX_NETMASK] = netmask;
	info.rti_info[RTAX_AUTHOR] = src;
	rt_missmsg(RTM_REDIRECT, &info, flags, error);
	ifa_release(ifa, &psref);
}
941 
942 /*
943  * Delete a route and generate a message.
944  * It doesn't free a passed rt.
945  */
946 static int
947 rtdeletemsg(struct rtentry *rt)
948 {
949 	int error;
950 	struct rt_addrinfo info;
951 	struct rtentry *retrt;
952 
953 	/*
954 	 * Request the new route so that the entry is not actually
955 	 * deleted.  That will allow the information being reported to
956 	 * be accurate (and consistent with route_output()).
957 	 */
958 	memset(&info, 0, sizeof(info));
959 	info.rti_info[RTAX_DST] = rt_getkey(rt);
960 	info.rti_info[RTAX_NETMASK] = rt_mask(rt);
961 	info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
962 	info.rti_flags = rt->rt_flags;
963 	error = rtrequest1(RTM_DELETE, &info, &retrt);
964 
965 	rt_missmsg(RTM_DELETE, &info, info.rti_flags, error);
966 
967 	return error;
968 }
969 
/*
 * Find the ifaddr to attach to a route with the given flags, dst and
 * gateway, taking a psref on the result.  Falls back to a recursive
 * route lookup on the gateway, and finally prefers an address of dst's
 * family on the chosen interface.  Returns NULL if nothing suitable.
 */
static struct ifaddr *
ifa_ifwithroute_psref(int flags, const struct sockaddr *dst,
    const struct sockaddr *gateway, struct psref *psref)
{
	struct ifaddr *ifa = NULL;

	if ((flags & RTF_GATEWAY) == 0) {
		/*
		 * If we are adding a route to an interface,
		 * and the interface is a pt to pt link
		 * we should search for the destination
		 * as our clue to the interface.  Otherwise
		 * we can use the local address.
		 */
		if ((flags & RTF_HOST) && gateway->sa_family != AF_LINK)
			ifa = ifa_ifwithdstaddr_psref(dst, psref);
		if (ifa == NULL)
			ifa = ifa_ifwithaddr_psref(gateway, psref);
	} else {
		/*
		 * If we are adding a route to a remote net
		 * or host, the gateway may still be on the
		 * other end of a pt to pt link.
		 */
		ifa = ifa_ifwithdstaddr_psref(gateway, psref);
	}
	if (ifa == NULL)
		ifa = ifa_ifwithnet_psref(gateway, psref);
	if (ifa == NULL) {
		/* Last resort: route to the gateway and use its ifaddr. */
		int s;
		struct rtentry *rt;

		rt = rtalloc1_locked(gateway, 0, true, true);
		if (rt == NULL)
			return NULL;
		/* A gateway reachable only via a gateway is unusable. */
		if (rt->rt_flags & RTF_GATEWAY) {
			rt_unref(rt);
			return NULL;
		}
		/*
		 * Just in case. May not need to do this workaround.
		 * Revisit when working on rtentry MP-ification.
		 */
		s = pserialize_read_enter();
		IFADDR_READER_FOREACH(ifa, rt->rt_ifp) {
			if (ifa == rt->rt_ifa)
				break;
		}
		if (ifa != NULL)
			ifa_acquire(ifa, psref);
		pserialize_read_exit(s);
		rt_unref(rt);
		if (ifa == NULL)
			return NULL;
	}
	if (ifa->ifa_addr->sa_family != dst->sa_family) {
		/* Prefer an address of the destination's family. */
		struct ifaddr *nifa;
		int s;

		s = pserialize_read_enter();
		nifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp);
		if (nifa != NULL) {
			ifa_release(ifa, psref);
			ifa_acquire(nifa, psref);
			ifa = nifa;
		}
		pserialize_read_exit(s);
	}
	return ifa;
}
1040 
1041 /*
1042  * If it suceeds and ret_nrt isn't NULL, refcnt of ret_nrt is incremented.
1043  * The caller has to rtfree it by itself.
1044  */
1045 int
1046 rtrequest(int req, const struct sockaddr *dst, const struct sockaddr *gateway,
1047 	const struct sockaddr *netmask, int flags, struct rtentry **ret_nrt)
1048 {
1049 	struct rt_addrinfo info;
1050 
1051 	memset(&info, 0, sizeof(info));
1052 	info.rti_flags = flags;
1053 	info.rti_info[RTAX_DST] = dst;
1054 	info.rti_info[RTAX_GATEWAY] = gateway;
1055 	info.rti_info[RTAX_NETMASK] = netmask;
1056 	return rtrequest1(req, &info, ret_nrt);
1057 }
1058 
1059 /*
1060  * It's a utility function to add/remove a route to/from the routing table
1061  * and tell user processes the addition/removal on success.
1062  */
1063 int
1064 rtrequest_newmsg(const int req, const struct sockaddr *dst,
1065 	const struct sockaddr *gateway, const struct sockaddr *netmask,
1066 	const int flags)
1067 {
1068 	int error;
1069 	struct rtentry *ret_nrt = NULL;
1070 
1071 	KASSERT(req == RTM_ADD || req == RTM_DELETE);
1072 
1073 	error = rtrequest(req, dst, gateway, netmask, flags, &ret_nrt);
1074 	if (error != 0)
1075 		return error;
1076 
1077 	KASSERT(ret_nrt != NULL);
1078 
1079 	rt_newmsg(req, ret_nrt); /* tell user process */
1080 	if (req == RTM_DELETE)
1081 		rt_free(ret_nrt);
1082 	else
1083 		rt_unref(ret_nrt);
1084 
1085 	return 0;
1086 }
1087 
1088 static struct ifnet *
1089 rt_getifp(struct rt_addrinfo *info, struct psref *psref)
1090 {
1091 	const struct sockaddr *ifpaddr = info->rti_info[RTAX_IFP];
1092 
1093 	if (info->rti_ifp != NULL)
1094 		return NULL;
1095 	/*
1096 	 * ifp may be specified by sockaddr_dl when protocol address
1097 	 * is ambiguous
1098 	 */
1099 	if (ifpaddr != NULL && ifpaddr->sa_family == AF_LINK) {
1100 		struct ifaddr *ifa;
1101 		int s = pserialize_read_enter();
1102 
1103 		ifa = ifa_ifwithnet(ifpaddr);
1104 		if (ifa != NULL)
1105 			info->rti_ifp = if_get_byindex(ifa->ifa_ifp->if_index,
1106 			    psref);
1107 		pserialize_read_exit(s);
1108 	}
1109 
1110 	return info->rti_ifp;
1111 }
1112 
/*
 * Determine the interface address for the route described by "info".
 * On success info->rti_ifa (and rti_ifp, if not already set) are filled
 * in and the ifaddr is returned, referenced through "psref".
 * Returns NULL when no usable interface address can be found.
 */
static struct ifaddr *
rt_getifa(struct rt_addrinfo *info, struct psref *psref)
{
	struct ifaddr *ifa = NULL;
	const struct sockaddr *dst = info->rti_info[RTAX_DST];
	const struct sockaddr *gateway = info->rti_info[RTAX_GATEWAY];
	const struct sockaddr *ifaaddr = info->rti_info[RTAX_IFA];
	int flags = info->rti_flags;
	const struct sockaddr *sa;

	/* An explicit RTAX_IFA address wins if it matches a local address. */
	if (info->rti_ifa == NULL && ifaaddr != NULL) {
		ifa = ifa_ifwithaddr_psref(ifaaddr, psref);
		if (ifa != NULL)
			goto got;
	}

	/* Fall back to the most specific address available: ifa > gw > dst. */
	sa = ifaaddr != NULL ? ifaaddr :
	    (gateway != NULL ? gateway : dst);
	if (sa != NULL && info->rti_ifp != NULL)
		ifa = ifaof_ifpforaddr_psref(sa, info->rti_ifp, psref);
	else if (dst != NULL && gateway != NULL)
		ifa = ifa_ifwithroute_psref(flags, dst, gateway, psref);
	else if (sa != NULL)
		ifa = ifa_ifwithroute_psref(flags, sa, sa, psref);
	if (ifa == NULL)
		return NULL;
got:
	if (ifa->ifa_getifa != NULL) {
		/* FIXME ifa_getifa is NOMPSAFE */
		/*
		 * NOTE(review): when a substitute ifa is returned here, the
		 * reference on the previous ifa does not appear to be
		 * released before "psref" is reused — confirm against the
		 * psref(9) semantics.
		 */
		ifa = (*ifa->ifa_getifa)(ifa, dst);
		if (ifa == NULL)
			return NULL;
		ifa_acquire(ifa, psref);
	}
	info->rti_ifa = ifa;
	if (info->rti_ifp == NULL)
		info->rti_ifp = ifa->ifa_ifp;
	return ifa;
}
1152 
1153 /*
1154  * If it suceeds and ret_nrt isn't NULL, refcnt of ret_nrt is incremented.
1155  * The caller has to rtfree it by itself.
1156  */
int
rtrequest1(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt)
{
	int s = splsoftnet(), ss;
	int error = 0, rc;
	struct rtentry *rt;
	rtbl_t *rtbl;
	struct ifaddr *ifa = NULL;
	struct sockaddr_storage maskeddst;
	const struct sockaddr *dst = info->rti_info[RTAX_DST];
	const struct sockaddr *gateway = info->rti_info[RTAX_GATEWAY];
	const struct sockaddr *netmask = info->rti_info[RTAX_NETMASK];
	int flags = info->rti_flags;
	struct psref psref_ifp, psref_ifa;
	int bound = 0;
	struct ifnet *ifp = NULL;
	bool need_to_release_ifa = true;
	bool need_unlock = true;
#define senderr(x) { error = x ; goto bad; }

	RT_WLOCK();

	bound = curlwp_bind();
	if ((rtbl = rt_gettable(dst->sa_family)) == NULL)
		senderr(ESRCH);
	/* A host route never carries a netmask. */
	if (flags & RTF_HOST)
		netmask = NULL;
	switch (req) {
	case RTM_DELETE:
		/* Mask the destination so the key matches the table entry. */
		if (netmask) {
			rt_maskedcopy(dst, (struct sockaddr *)&maskeddst,
			    netmask);
			dst = (struct sockaddr *)&maskeddst;
		}
		if ((rt = rt_lookup(rtbl, dst, netmask)) == NULL)
			senderr(ESRCH);
		if ((rt = rt_deladdr(rtbl, dst, netmask)) == NULL)
			senderr(ESRCH);
		rt->rt_flags &= ~RTF_UP;
		if ((ifa = rt->rt_ifa)) {
			if (ifa->ifa_flags & IFA_ROUTE &&
			    rt_ifa_connected(rt, ifa)) {
				RT_DPRINTF("rt->_rt_key = %p, ifa = %p, "
				    "deleted IFA_ROUTE\n",
				    (void *)rt->_rt_key, (void *)ifa);
				ifa->ifa_flags &= ~IFA_ROUTE;
			}
			if (ifa->ifa_rtrequest)
				ifa->ifa_rtrequest(RTM_DELETE, rt, info);
			ifa = NULL;
		}
		/* Entry is out of the table but may still be referenced. */
		rttrash++;
		if (ret_nrt) {
			*ret_nrt = rt;
			rt_ref(rt);
			RT_REFCNT_TRACE(rt);
		}
		rtcache_invalidate();
		RT_UNLOCK();
		need_unlock = false;
		/* rt_timer_remove_all takes RT_WLOCK itself; call unlocked. */
		rt_timer_remove_all(rt);
#if defined(INET) || defined(INET6)
		if (netmask != NULL)
			lltable_prefix_free(dst->sa_family, dst, netmask, 0);
#endif
		if (ret_nrt == NULL) {
			/* Adjust the refcount */
			rt_ref(rt);
			RT_REFCNT_TRACE(rt);
			rt_free(rt);
		}
		break;

	case RTM_ADD:
		if (info->rti_ifa == NULL) {
			ifp = rt_getifp(info, &psref_ifp);
			ifa = rt_getifa(info, &psref_ifa);
			if (ifa == NULL)
				senderr(ENETUNREACH);
		} else {
			/* Caller should have a reference of ifa */
			ifa = info->rti_ifa;
			need_to_release_ifa = false;
		}
		rt = pool_get(&rtentry_pool, PR_NOWAIT);
		if (rt == NULL)
			senderr(ENOBUFS);
		memset(rt, 0, sizeof(*rt));
		rt->rt_flags = RTF_UP | (flags & ~RTF_DONTCHANGEIFA);
		LIST_INIT(&rt->rt_timer);

		RT_DPRINTF("rt->_rt_key = %p\n", (void *)rt->_rt_key);
		if (netmask) {
			rt_maskedcopy(dst, (struct sockaddr *)&maskeddst,
			    netmask);
			rt_setkey(rt, (struct sockaddr *)&maskeddst, M_NOWAIT);
		} else {
			rt_setkey(rt, dst, M_NOWAIT);
		}
		RT_DPRINTF("rt->_rt_key = %p\n", (void *)rt->_rt_key);
		if (rt_getkey(rt) == NULL ||
		    rt_setgate(rt, gateway) != 0) {
			pool_put(&rtentry_pool, rt);
			senderr(ENOBUFS);
		}

		rt_set_ifa(rt, ifa);
		if (info->rti_info[RTAX_TAG] != NULL) {
			const struct sockaddr *tag;
			tag = rt_settag(rt, info->rti_info[RTAX_TAG]);
			if (tag == NULL)
				senderr(ENOBUFS);
		}
		RT_DPRINTF("rt->_rt_key = %p\n", (void *)rt->_rt_key);

		/* Resolve the outgoing interface, honouring explicit RTAX_IFP. */
		ss = pserialize_read_enter();
		if (info->rti_info[RTAX_IFP] != NULL) {
			struct ifaddr *ifa2;
			ifa2 = ifa_ifwithnet(info->rti_info[RTAX_IFP]);
			if (ifa2 != NULL)
				rt->rt_ifp = ifa2->ifa_ifp;
			else
				rt->rt_ifp = ifa->ifa_ifp;
		} else
			rt->rt_ifp = ifa->ifa_ifp;
		pserialize_read_exit(ss);
		cv_init(&rt->rt_cv, "rtentry");
		psref_target_init(&rt->rt_psref, rt_psref_class);

		RT_DPRINTF("rt->_rt_key = %p\n", (void *)rt->_rt_key);
		rc = rt_addaddr(rtbl, rt, netmask);
		RT_DPRINTF("rt->_rt_key = %p\n", (void *)rt->_rt_key);
		if (rc != 0) {
			ifafree(ifa); /* for rt_set_ifa above */
			cv_destroy(&rt->rt_cv);
			rt_destroy(rt);
			pool_put(&rtentry_pool, rt);
			senderr(rc);
		}
		RT_DPRINTF("rt->_rt_key = %p\n", (void *)rt->_rt_key);
		if (ifa->ifa_rtrequest)
			ifa->ifa_rtrequest(req, rt, info);
		if (need_to_release_ifa)
			ifa_release(ifa, &psref_ifa);
		ifa = NULL;
		if_put(ifp, &psref_ifp);
		ifp = NULL;
		RT_DPRINTF("rt->_rt_key = %p\n", (void *)rt->_rt_key);
		if (ret_nrt) {
			*ret_nrt = rt;
			rt_ref(rt);
			RT_REFCNT_TRACE(rt);
		}
		rtcache_invalidate();
		RT_UNLOCK();
		need_unlock = false;
		break;
	case RTM_GET:
		/* Mask the destination so the key matches the table entry. */
		if (netmask != NULL) {
			rt_maskedcopy(dst, (struct sockaddr *)&maskeddst,
			    netmask);
			dst = (struct sockaddr *)&maskeddst;
		}
		if ((rt = rt_lookup(rtbl, dst, netmask)) == NULL)
			senderr(ESRCH);
		if (ret_nrt != NULL) {
			*ret_nrt = rt;
			rt_ref(rt);
			RT_REFCNT_TRACE(rt);
		}
		break;
	}
bad:
	if (need_to_release_ifa)
		ifa_release(ifa, &psref_ifa);
	if_put(ifp, &psref_ifp);
	curlwp_bindx(bound);
	if (need_unlock)
		RT_UNLOCK();
	splx(s);
	return error;
}
1339 
/*
 * Set the gateway of "rt" to a private copy of "gate".  Must be called
 * with the routing table write-locked.  Returns 0 on success or ENOMEM
 * if the sockaddr copy cannot be allocated.
 */
int
rt_setgate(struct rtentry *rt, const struct sockaddr *gate)
{
	struct sockaddr *new, *old;

	KASSERT(RT_WLOCKED());
	KASSERT(rt->_rt_key != NULL);
	RT_DPRINTF("rt->_rt_key = %p\n", (void *)rt->_rt_key);

	new = sockaddr_dup(gate, M_ZERO | M_NOWAIT);
	if (new == NULL)
		return ENOMEM;

	/* Swap in the copy before freeing the old gateway sockaddr. */
	old = rt->rt_gateway;
	rt->rt_gateway = new;
	if (old != NULL)
		sockaddr_free(old);

	KASSERT(rt->_rt_key != NULL);
	RT_DPRINTF("rt->_rt_key = %p\n", (void *)rt->_rt_key);

	if (rt->rt_flags & RTF_GATEWAY) {
		struct rtentry *gwrt;

		gwrt = rtalloc1_locked(gate, 1, false, true);
		/*
		 * If we switched gateways, inherit the MTU of the new
		 * gateway route when the current MTU is greater than
		 * the MTU of the gateway.
		 * Note that, if the MTU of gateway is 0, we will reset the
		 * MTU of the route to run PMTUD again from scratch. XXX
		 */
		if (gwrt != NULL) {
			KASSERT(gwrt->_rt_key != NULL);
			RT_DPRINTF("gwrt->_rt_key = %p\n", gwrt->_rt_key);
			if ((rt->rt_rmx.rmx_locks & RTV_MTU) == 0 &&
			    rt->rt_rmx.rmx_mtu &&
			    rt->rt_rmx.rmx_mtu > gwrt->rt_rmx.rmx_mtu) {
				rt->rt_rmx.rmx_mtu = gwrt->rt_rmx.rmx_mtu;
			}
			rt_unref(gwrt);
		}
	}
	KASSERT(rt->_rt_key != NULL);
	RT_DPRINTF("rt->_rt_key = %p\n", (void *)rt->_rt_key);
	return 0;
}
1387 
/*
 * Select the interface address (and interface) that a route change
 * request should move "rt" to, based on the RTAX_IFP / RTAX_IFA /
 * RTAX_GATEWAY hints in "info".  On success the ifaddr is returned
 * referenced through "psref" and *ifp is set and referenced through
 * "psref_ifp"; on failure both are NULL (the out: block keeps the two
 * in sync).
 */
static struct ifaddr *
rt_update_get_ifa(const struct rt_addrinfo info, const struct rtentry *rt,
    struct ifnet **ifp, struct psref *psref_ifp, struct psref *psref)
{
	struct ifaddr *ifa = NULL;

	*ifp = NULL;
	if (info.rti_info[RTAX_IFP] != NULL) {
		ifa = ifa_ifwithnet_psref(info.rti_info[RTAX_IFP], psref);
		if (ifa == NULL)
			goto next;
		*ifp = ifa->ifa_ifp;
		if_acquire(*ifp, psref_ifp);
		if (info.rti_info[RTAX_IFA] == NULL &&
		    info.rti_info[RTAX_GATEWAY] == NULL)
			goto next;
		ifa_release(ifa, psref);
		if (info.rti_info[RTAX_IFA] == NULL) {
			/* route change <dst> <gw> -ifp <if> */
			ifa = ifaof_ifpforaddr_psref(info.rti_info[RTAX_GATEWAY],
			    *ifp, psref);
		} else {
			/* route change <dst> -ifp <if> -ifa <addr> */
			ifa = ifa_ifwithaddr_psref(info.rti_info[RTAX_IFA], psref);
			if (ifa != NULL)
				goto out;
			ifa = ifaof_ifpforaddr_psref(info.rti_info[RTAX_IFA],
			    *ifp, psref);
		}
		goto out;
	}
next:
	if (info.rti_info[RTAX_IFA] != NULL) {
		/* route change <dst> <gw> -ifa <addr> */
		ifa = ifa_ifwithaddr_psref(info.rti_info[RTAX_IFA], psref);
		if (ifa != NULL)
			goto out;
	}
	if (info.rti_info[RTAX_GATEWAY] != NULL) {
		/* route change <dst> <gw> */
		ifa = ifa_ifwithroute_psref(rt->rt_flags, rt_getkey(rt),
		    info.rti_info[RTAX_GATEWAY], psref);
	}
out:
	/* Either return both ifa and *ifp referenced, or neither. */
	if (ifa != NULL && *ifp == NULL) {
		*ifp = ifa->ifa_ifp;
		if_acquire(*ifp, psref_ifp);
	}
	if (ifa == NULL && *ifp != NULL) {
		if_put(*ifp, psref_ifp);
		*ifp = NULL;
	}
	return ifa;
}
1442 
/*
 * Apply a route change request (gateway, tag, interface, metrics and
 * flags from "info"/"rtm") to an existing rtentry "rt".  Runs with the
 * routing table write-locked.  Returns 0 or an errno.
 */
int
rt_update(struct rtentry *rt, struct rt_addrinfo *info, void *rtm)
{
	int error = 0;
	struct ifnet *ifp = NULL, *new_ifp = NULL;
	struct ifaddr *ifa = NULL, *new_ifa;
	struct psref psref_ifa, psref_new_ifa, psref_ifp, psref_new_ifp;
	bool newgw, ifp_changed = false;

	RT_WLOCK();
	/*
	 * New gateway could require new ifaddr, ifp;
	 * flags may also be different; ifp may be specified
	 * by ll sockaddr when protocol address is ambiguous
	 */
	newgw = info->rti_info[RTAX_GATEWAY] != NULL &&
	    sockaddr_cmp(info->rti_info[RTAX_GATEWAY], rt->rt_gateway) != 0;

	if (newgw || info->rti_info[RTAX_IFP] != NULL ||
	    info->rti_info[RTAX_IFA] != NULL) {
		ifp = rt_getifp(info, &psref_ifp);
		/* info refers ifp so we need to keep a reference */
		ifa = rt_getifa(info, &psref_ifa);
		if (ifa == NULL) {
			error = ENETUNREACH;
			goto out;
		}
	}
	if (newgw) {
		error = rt_setgate(rt, info->rti_info[RTAX_GATEWAY]);
		if (error != 0)
			goto out;
	}
	if (info->rti_info[RTAX_TAG]) {
		const struct sockaddr *tag;
		tag = rt_settag(rt, info->rti_info[RTAX_TAG]);
		if (tag == NULL) {
			error = ENOBUFS;
			goto out;
		}
	}
	/*
	 * New gateway could require new ifaddr, ifp;
	 * flags may also be different; ifp may be specified
	 * by ll sockaddr when protocol address is ambiguous
	 */
	new_ifa = rt_update_get_ifa(*info, rt, &new_ifp, &psref_new_ifp,
	    &psref_new_ifa);
	if (new_ifa != NULL) {
		ifa_release(ifa, &psref_ifa);
		ifa = new_ifa;
	}
	if (ifa) {
		struct ifaddr *oifa = rt->rt_ifa;
		/* Only switch ifaddr/ifp if the targets are still alive. */
		if (oifa != ifa && !ifa_is_destroying(ifa) &&
		    new_ifp != NULL && !if_is_deactivated(new_ifp)) {
			if (oifa && oifa->ifa_rtrequest)
				oifa->ifa_rtrequest(RTM_DELETE, rt, info);
			rt_replace_ifa(rt, ifa);
			rt->rt_ifp = new_ifp;
			ifp_changed = true;
		}
		if (new_ifa == NULL)
			ifa_release(ifa, &psref_ifa);
		/* To avoid ifa_release below */
		ifa = NULL;
	}
	ifa_release(new_ifa, &psref_new_ifa);
	if (new_ifp && rt->rt_ifp != new_ifp && !if_is_deactivated(new_ifp)) {
		rt->rt_ifp = new_ifp;
		ifp_changed = true;
	}
	rt_setmetrics(rtm, rt);
	/* Update flags, but keep the bits in PRESERVED_RTF untouched. */
	if (rt->rt_flags != info->rti_flags) {
		rt->rt_flags = (info->rti_flags & ~PRESERVED_RTF) |
		    (rt->rt_flags & PRESERVED_RTF);
	}
	if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest)
		rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, info);
#if defined(INET) || defined(INET6)
	if (ifp_changed && rt_mask(rt) != NULL)
		lltable_prefix_free(rt_getkey(rt)->sa_family, rt_getkey(rt),
		    rt_mask(rt), 0);
#else
	(void)ifp_changed; /* XXX gcc */
#endif
out:
	ifa_release(ifa, &psref_ifa);
	if_put(new_ifp, &psref_new_ifp);
	if_put(ifp, &psref_ifp);

	RT_UNLOCK();

	return error;
}
1538 
1539 static void
1540 rt_maskedcopy(const struct sockaddr *src, struct sockaddr *dst,
1541 	const struct sockaddr *netmask)
1542 {
1543 	const char *netmaskp = &netmask->sa_data[0],
1544 	           *srcp = &src->sa_data[0];
1545 	char *dstp = &dst->sa_data[0];
1546 	const char *maskend = (char *)dst + MIN(netmask->sa_len, src->sa_len);
1547 	const char *srcend = (char *)dst + src->sa_len;
1548 
1549 	dst->sa_len = src->sa_len;
1550 	dst->sa_family = src->sa_family;
1551 
1552 	while (dstp < maskend)
1553 		*dstp++ = *srcp++ & *netmaskp++;
1554 	if (dstp < srcend)
1555 		memset(dstp, 0, (size_t)(srcend - dstp));
1556 }
1557 
1558 /*
1559  * Inform the routing socket of a route change.
1560  */
1561 void
1562 rt_newmsg(const int cmd, const struct rtentry *rt)
1563 {
1564 	struct rt_addrinfo info;
1565 
1566 	memset((void *)&info, 0, sizeof(info));
1567 	info.rti_info[RTAX_DST] = rt_getkey(rt);
1568 	info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
1569 	info.rti_info[RTAX_NETMASK] = rt_mask(rt);
1570 	if (rt->rt_ifp) {
1571 		info.rti_info[RTAX_IFP] = rt->rt_ifp->if_dl->ifa_addr;
1572 		info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
1573 	}
1574 
1575 	rt_missmsg(cmd, &info, rt->rt_flags, 0);
1576 }
1577 
1578 /*
1579  * Set up or tear down a routing table entry, normally
1580  * for an interface.
1581  */
1582 int
1583 rtinit(struct ifaddr *ifa, int cmd, int flags)
1584 {
1585 	struct rtentry *rt;
1586 	struct sockaddr *dst, *odst;
1587 	struct sockaddr_storage maskeddst;
1588 	struct rtentry *nrt = NULL;
1589 	int error;
1590 	struct rt_addrinfo info;
1591 
1592 	dst = flags & RTF_HOST ? ifa->ifa_dstaddr : ifa->ifa_addr;
1593 	if (cmd == RTM_DELETE) {
1594 		if ((flags & RTF_HOST) == 0 && ifa->ifa_netmask) {
1595 			/* Delete subnet route for this interface */
1596 			odst = dst;
1597 			dst = (struct sockaddr *)&maskeddst;
1598 			rt_maskedcopy(odst, dst, ifa->ifa_netmask);
1599 		}
1600 		if ((rt = rtalloc1(dst, 0)) != NULL) {
1601 			if (rt->rt_ifa != ifa) {
1602 				rt_unref(rt);
1603 				return (flags & RTF_HOST) ? EHOSTUNREACH
1604 							: ENETUNREACH;
1605 			}
1606 			rt_unref(rt);
1607 		}
1608 	}
1609 	memset(&info, 0, sizeof(info));
1610 	info.rti_ifa = ifa;
1611 	info.rti_flags = flags | ifa->ifa_flags | RTF_DONTCHANGEIFA;
1612 	info.rti_info[RTAX_DST] = dst;
1613 	info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr;
1614 
1615 	/*
1616 	 * XXX here, it seems that we are assuming that ifa_netmask is NULL
1617 	 * for RTF_HOST.  bsdi4 passes NULL explicitly (via intermediate
1618 	 * variable) when RTF_HOST is 1.  still not sure if i can safely
1619 	 * change it to meet bsdi4 behavior.
1620 	 */
1621 	if (cmd != RTM_LLINFO_UPD)
1622 		info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask;
1623 	error = rtrequest1((cmd == RTM_LLINFO_UPD) ? RTM_GET : cmd, &info,
1624 	    &nrt);
1625 	if (error != 0)
1626 		return error;
1627 
1628 	rt = nrt;
1629 	RT_REFCNT_TRACE(rt);
1630 	switch (cmd) {
1631 	case RTM_DELETE:
1632 		rt_newmsg(cmd, rt);
1633 		rt_free(rt);
1634 		break;
1635 	case RTM_LLINFO_UPD:
1636 		if (cmd == RTM_LLINFO_UPD && ifa->ifa_rtrequest != NULL)
1637 			ifa->ifa_rtrequest(RTM_LLINFO_UPD, rt, &info);
1638 		rt_newmsg(RTM_CHANGE, rt);
1639 		rt_unref(rt);
1640 		break;
1641 	case RTM_ADD:
1642 		KASSERT(rt->rt_ifa == ifa);
1643 		rt_newmsg(cmd, rt);
1644 		rt_unref(rt);
1645 		RT_REFCNT_TRACE(rt);
1646 		break;
1647 	}
1648 	return error;
1649 }
1650 
1651 /*
1652  * Create a local route entry for the address.
1653  * Announce the addition of the address and the route to the routing socket.
1654  */
1655 int
1656 rt_ifa_addlocal(struct ifaddr *ifa)
1657 {
1658 	struct rtentry *rt;
1659 	int e;
1660 
1661 	/* If there is no loopback entry, allocate one. */
1662 	rt = rtalloc1(ifa->ifa_addr, 0);
1663 #ifdef RT_DEBUG
1664 	if (rt != NULL)
1665 		dump_rt(rt);
1666 #endif
1667 	if (rt == NULL || (rt->rt_flags & RTF_HOST) == 0 ||
1668 	    (rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0)
1669 	{
1670 		struct rt_addrinfo info;
1671 		struct rtentry *nrt;
1672 
1673 		memset(&info, 0, sizeof(info));
1674 		info.rti_flags = RTF_HOST | RTF_LOCAL | RTF_DONTCHANGEIFA;
1675 		info.rti_info[RTAX_DST] = ifa->ifa_addr;
1676 		info.rti_info[RTAX_GATEWAY] =
1677 		    (const struct sockaddr *)ifa->ifa_ifp->if_sadl;
1678 		info.rti_ifa = ifa;
1679 		nrt = NULL;
1680 		e = rtrequest1(RTM_ADD, &info, &nrt);
1681 		rt_addrmsg_rt(RTM_ADD, ifa, e, nrt);
1682 		if (nrt != NULL) {
1683 			KASSERT(nrt->rt_ifa == ifa);
1684 #ifdef RT_DEBUG
1685 			dump_rt(nrt);
1686 #endif
1687 			rt_unref(nrt);
1688 			RT_REFCNT_TRACE(nrt);
1689 		}
1690 	} else {
1691 		e = 0;
1692 		rt_addrmsg(RTM_NEWADDR, ifa);
1693 	}
1694 	if (rt != NULL)
1695 		rt_unref(rt);
1696 	return e;
1697 }
1698 
1699 /*
1700  * Remove the local route entry for the address.
1701  * Announce the removal of the address and the route to the routing socket.
1702  */
1703 int
1704 rt_ifa_remlocal(struct ifaddr *ifa, struct ifaddr *alt_ifa)
1705 {
1706 	struct rtentry *rt;
1707 	int e = 0;
1708 
1709 	rt = rtalloc1(ifa->ifa_addr, 0);
1710 
1711 	/*
1712 	 * Before deleting, check if a corresponding loopbacked
1713 	 * host route surely exists.  With this check, we can avoid
1714 	 * deleting an interface direct route whose destination is
1715 	 * the same as the address being removed.  This can happen
1716 	 * when removing a subnet-router anycast address on an
1717 	 * interface attached to a shared medium.
1718 	 */
1719 	if (rt != NULL &&
1720 	    (rt->rt_flags & RTF_HOST) &&
1721 	    (rt->rt_ifp->if_flags & IFF_LOOPBACK))
1722 	{
1723 		/* If we cannot replace the route's ifaddr with the equivalent
1724 		 * ifaddr of another interface, I believe it is safest to
1725 		 * delete the route.
1726 		 */
1727 		if (alt_ifa == NULL) {
1728 			e = rtdeletemsg(rt);
1729 			if (e == 0) {
1730 				rt_unref(rt);
1731 				rt_free(rt);
1732 				rt = NULL;
1733 			}
1734 			rt_addrmsg(RTM_DELADDR, ifa);
1735 		} else {
1736 #ifdef NET_MPSAFE
1737 			int error = rt_update_prepare(rt);
1738 			if (error == 0) {
1739 				rt_replace_ifa(rt, alt_ifa);
1740 				rt_update_finish(rt);
1741 			} else {
1742 				/*
1743 				 * If error != 0, the rtentry is being
1744 				 * destroyed, so doing nothing doesn't
1745 				 * matter.
1746 				 */
1747 			}
1748 #else
1749 			rt_replace_ifa(rt, alt_ifa);
1750 #endif
1751 			rt_newmsg(RTM_CHANGE, rt);
1752 		}
1753 	} else
1754 		rt_addrmsg(RTM_DELADDR, ifa);
1755 	if (rt != NULL)
1756 		rt_unref(rt);
1757 	return e;
1758 }
1759 
1760 /*
1761  * Route timer routines.  These routes allow functions to be called
1762  * for various routes at any time.  This is useful in supporting
1763  * path MTU discovery and redirect route deletion.
1764  *
1765  * This is similar to some BSDI internal functions, but it provides
1766  * for multiple queues for efficiency's sake...
1767  */
1768 
/* All active rttimer queues; inserted/scanned under RT_WLOCK. */
LIST_HEAD(, rttimer_queue) rttimer_queue_head;
static int rt_init_done = 0;	/* set once rt_timer_init() has run */
1771 
1772 /*
1773  * Some subtle order problems with domain initialization mean that
1774  * we cannot count on this being run from rt_init before various
1775  * protocol initializations are done.  Therefore, we make sure
1776  * that this is run when the first queue is added...
1777  */
1778 
1779 static void rt_timer_work(struct work *, void *);
1780 
/*
 * One-time initialization of the route timer machinery: the routing
 * lock, the queue list, the periodic callout and the workqueue that
 * services it.  Called lazily from rt_timer_queue_create() because of
 * the domain initialization ordering issue described above.
 */
static void
rt_timer_init(void)
{
	int error;

	assert(rt_init_done == 0);

	/* XXX should be in rt_init */
	rw_init(&rt_lock);

	LIST_INIT(&rttimer_queue_head);
	callout_init(&rt_timer_ch, CALLOUT_MPSAFE);
	error = workqueue_create(&rt_timer_wq, "rt_timer",
	    rt_timer_work, NULL, PRI_SOFTNET, IPL_SOFTNET, WQ_MPSAFE);
	if (error)
		panic("%s: workqueue_create failed (%d)\n", __func__, error);
	/* Kick off the periodic scan; re-armed from rt_timer_work(). */
	callout_reset(&rt_timer_ch, hz, rt_timer_timer, NULL);
	rt_init_done = 1;
}
1800 
1801 struct rttimer_queue *
1802 rt_timer_queue_create(u_int timeout)
1803 {
1804 	struct rttimer_queue *rtq;
1805 
1806 	if (rt_init_done == 0)
1807 		rt_timer_init();
1808 
1809 	R_Malloc(rtq, struct rttimer_queue *, sizeof *rtq);
1810 	if (rtq == NULL)
1811 		return NULL;
1812 	memset(rtq, 0, sizeof(*rtq));
1813 
1814 	rtq->rtq_timeout = timeout;
1815 	TAILQ_INIT(&rtq->rtq_head);
1816 	RT_WLOCK();
1817 	LIST_INSERT_HEAD(&rttimer_queue_head, rtq, rtq_link);
1818 	RT_UNLOCK();
1819 
1820 	return rtq;
1821 }
1822 
/*
 * Change the timeout of an existing queue.  Existing timers are not
 * rescheduled; the new value takes effect the next time the queue is
 * scanned by rt_timer_work().
 */
void
rt_timer_queue_change(struct rttimer_queue *rtq, long timeout)
{

	rtq->rtq_timeout = timeout;
}
1829 
/*
 * Fire and free every timer on "rtq".  Called with RT_WLOCK held; the
 * lock is dropped around each callback invocation.  A reference on the
 * rtentry is taken before the callback, which is expected to drop it
 * (cf. the comment in rt_timer_work()).
 */
static void
rt_timer_queue_remove_all(struct rttimer_queue *rtq)
{
	struct rttimer *r;

	RT_ASSERT_WLOCK();

	while ((r = TAILQ_FIRST(&rtq->rtq_head)) != NULL) {
		LIST_REMOVE(r, rtt_link);
		TAILQ_REMOVE(&rtq->rtq_head, r, rtt_next);
		rt_ref(r->rtt_rt); /* XXX */
		RT_REFCNT_TRACE(r->rtt_rt);
		RT_UNLOCK();
		(*r->rtt_func)(r->rtt_rt, r);
		pool_put(&rttimer_pool, r);
		RT_WLOCK();
		if (rtq->rtq_count > 0)
			rtq->rtq_count--;
		else
			printf("rt_timer_queue_remove_all: "
			    "rtq_count reached 0\n");
	}
}
1853 
/*
 * Fire all remaining timers on "rtq" and unlink it from the global
 * queue list.  The structure itself is not freed here (see below).
 */
void
rt_timer_queue_destroy(struct rttimer_queue *rtq)
{

	RT_WLOCK();
	rt_timer_queue_remove_all(rtq);
	LIST_REMOVE(rtq, rtq_link);
	RT_UNLOCK();

	/*
	 * Caller is responsible for freeing the rttimer_queue structure.
	 */
}
1867 
/* Return the number of timers currently on "rtq" (unlocked read). */
unsigned long
rt_timer_count(struct rttimer_queue *rtq)
{
	return rtq->rtq_count;
}
1873 
/*
 * Detach and free all timers belonging to "rt" without invoking their
 * callbacks.  Used when the rtentry itself is being removed.
 */
static void
rt_timer_remove_all(struct rtentry *rt)
{
	struct rttimer *r;

	RT_WLOCK();
	while ((r = LIST_FIRST(&rt->rt_timer)) != NULL) {
		LIST_REMOVE(r, rtt_link);
		TAILQ_REMOVE(&r->rtt_queue->rtq_head, r, rtt_next);
		if (r->rtt_queue->rtq_count > 0)
			r->rtt_queue->rtq_count--;
		else
			printf("rt_timer_remove_all: rtq_count reached 0\n");
		pool_put(&rttimer_pool, r);
	}
	RT_UNLOCK();
}
1891 
/*
 * Arm a timer on "rt" that invokes "func" once the timeout of "queue"
 * has elapsed.  Only one timer per callback is kept: an existing timer
 * with the same function is recycled rather than duplicated.
 * Returns 0, or ENOBUFS if no rttimer can be allocated.
 */
int
rt_timer_add(struct rtentry *rt,
	void (*func)(struct rtentry *, struct rttimer *),
	struct rttimer_queue *queue)
{
	struct rttimer *r;

	KASSERT(func != NULL);
	RT_WLOCK();
	/*
	 * If there's already a timer with this action, destroy it before
	 * we add a new one.
	 */
	LIST_FOREACH(r, &rt->rt_timer, rtt_link) {
		if (r->rtt_func == func)
			break;
	}
	if (r != NULL) {
		LIST_REMOVE(r, rtt_link);
		TAILQ_REMOVE(&r->rtt_queue->rtq_head, r, rtt_next);
		if (r->rtt_queue->rtq_count > 0)
			r->rtt_queue->rtq_count--;
		else
			printf("rt_timer_add: rtq_count reached 0\n");
	} else {
		r = pool_get(&rttimer_pool, PR_NOWAIT);
		if (r == NULL) {
			RT_UNLOCK();
			return ENOBUFS;
		}
	}

	memset(r, 0, sizeof(*r));

	/* rtt_time is the uptime at which the timer was armed. */
	r->rtt_rt = rt;
	r->rtt_time = time_uptime;
	r->rtt_func = func;
	r->rtt_queue = queue;
	LIST_INSERT_HEAD(&rt->rt_timer, r, rtt_link);
	TAILQ_INSERT_TAIL(&queue->rtq_head, r, rtt_next);
	r->rtt_queue->rtq_count++;

	RT_UNLOCK();

	return 0;
}
1938 
1939 static void
1940 rt_timer_work(struct work *wk, void *arg)
1941 {
1942 	struct rttimer_queue *rtq;
1943 	struct rttimer *r;
1944 
1945 	RT_WLOCK();
1946 	LIST_FOREACH(rtq, &rttimer_queue_head, rtq_link) {
1947 		while ((r = TAILQ_FIRST(&rtq->rtq_head)) != NULL &&
1948 		    (r->rtt_time + rtq->rtq_timeout) < time_uptime) {
1949 			LIST_REMOVE(r, rtt_link);
1950 			TAILQ_REMOVE(&rtq->rtq_head, r, rtt_next);
1951 			/*
1952 			 * Take a reference to avoid the rtentry is freed
1953 			 * accidentally after RT_UNLOCK.  The callback
1954 			 * (rtt_func) must rt_unref it by itself.
1955 			 */
1956 			rt_ref(r->rtt_rt);
1957 			RT_REFCNT_TRACE(r->rtt_rt);
1958 			RT_UNLOCK();
1959 			(*r->rtt_func)(r->rtt_rt, r);
1960 			pool_put(&rttimer_pool, r);
1961 			RT_WLOCK();
1962 			if (rtq->rtq_count > 0)
1963 				rtq->rtq_count--;
1964 			else
1965 				printf("rt_timer_timer: rtq_count reached 0\n");
1966 		}
1967 	}
1968 	RT_UNLOCK();
1969 
1970 	callout_reset(&rt_timer_ch, hz, rt_timer_timer, NULL);
1971 }
1972 
/*
 * Callout handler: defer the actual timer scan to the workqueue so it
 * can run in thread context (rt_timer_work).
 */
static void
rt_timer_timer(void *arg)
{

	workqueue_enqueue(rt_timer_wq, &rt_timer_wk, NULL);
}
1979 
/*
 * Look up the cache's destination in the routing table and, if a route
 * marked RTF_UP is found, store it in the cache together with the
 * current cache generation.  "flag" is passed through to rtalloc1().
 * Returns the cached route or NULL.
 */
static struct rtentry *
_rtcache_init(struct route *ro, int flag)
{
	struct rtentry *rt;

	rtcache_invariants(ro);
	KASSERT(ro->_ro_rt == NULL);

	if (rtcache_getdst(ro) == NULL)
		return NULL;
	rt = rtalloc1(rtcache_getdst(ro), flag);
	if (rt != NULL) {
		RT_RLOCK();
		if (ISSET(rt->rt_flags, RTF_UP)) {
			ro->_ro_rt = rt;
			ro->ro_rtcache_generation = rtcache_generation;
			rtcache_ref(rt, ro);
		}
		RT_UNLOCK();
		rt_unref(rt);
	}

	rtcache_invariants(ro);
	return ro->_ro_rt;
}
2005 
/* Initialize the route cache, passing flag 1 to rtalloc1() via _rtcache_init. */
struct rtentry *
rtcache_init(struct route *ro)
{

	return _rtcache_init(ro, 1);
}
2012 
/*
 * Like rtcache_init(), but passes flag 0 to rtalloc1() — historically
 * the "no clone" variant of the lookup.
 */
struct rtentry *
rtcache_init_noclone(struct route *ro)
{

	return _rtcache_init(ro, 0);
}
2019 
/* Forget the cached route and redo the lookup with the given flag. */
struct rtentry *
rtcache_update(struct route *ro, int clone)
{

	ro->_ro_rt = NULL;
	return _rtcache_init(ro, clone);
}
2027 
/*
 * Copy the destination and the (validated) cached route of "old_ro"
 * into "new_ro".  If the destination cannot be copied the new cache is
 * left untouched.
 */
void
rtcache_copy(struct route *new_ro, struct route *old_ro)
{
	struct rtentry *rt;
	int ret;

	KASSERT(new_ro != old_ro);
	rtcache_invariants(new_ro);
	rtcache_invariants(old_ro);

	/* Validate (and reference) the old cache's route first. */
	rt = rtcache_validate(old_ro);

	if (rtcache_getdst(old_ro) == NULL)
		goto out;
	ret = rtcache_setdst(new_ro, rtcache_getdst(old_ro));
	if (ret != 0)
		goto out;

	RT_RLOCK();
	new_ro->_ro_rt = rt;
	new_ro->ro_rtcache_generation = rtcache_generation;
	RT_UNLOCK();
	rtcache_invariants(new_ro);
out:
	rtcache_unref(rt, old_ro);
	return;
}
2055 
#if defined(RT_DEBUG) && defined(NET_MPSAFE)
/*
 * Debug helper: log which CPU/LWP takes or drops a psref on a cached
 * route, together with the cache's destination address.
 */
static void
rtcache_trace(const char *func, struct rtentry *rt, struct route *ro)
{
	char dst[64];

	sockaddr_format(ro->ro_sa, dst, 64);
	printf("trace: %s:\tdst=%s cpu=%d lwp=%p psref=%p target=%p\n", func, dst,
	    cpu_index(curcpu()), curlwp, &ro->ro_psref, &rt->rt_psref);
}
#define RTCACHE_PSREF_TRACE(rt, ro)	rtcache_trace(__func__, (rt), (ro))
#else
/* No-op unless both RT_DEBUG and NET_MPSAFE are enabled. */
#define RTCACHE_PSREF_TRACE(rt, ro)	do {} while (0)
#endif
2070 
/*
 * Take a passive reference on "rt" on behalf of the route cache "ro".
 * Under NET_MPSAFE the calling LWP is additionally bound to its CPU for
 * the lifetime of the reference (undone in rtcache_unref()).
 */
static void
rtcache_ref(struct rtentry *rt, struct route *ro)
{

	KASSERT(rt != NULL);

#ifdef NET_MPSAFE
	RTCACHE_PSREF_TRACE(rt, ro);
	ro->ro_bound = curlwp_bind();
	/* XXX Use a real caller's address */
	PSREF_DEBUG_FILL_RETURN_ADDRESS(&ro->ro_psref);
	psref_acquire(&ro->ro_psref, &rt->rt_psref, rt_psref_class);
#endif
}
2085 
/*
 * Release the passive reference taken by rtcache_ref() and unbind the
 * LWP.  Safe to call with rt == NULL (no-op).
 */
void
rtcache_unref(struct rtentry *rt, struct route *ro)
{

	if (rt == NULL)
		return;

#ifdef NET_MPSAFE
	psref_release(&ro->ro_psref, &rt->rt_psref, rt_psref_class);
	curlwp_bindx(ro->ro_bound);
	RTCACHE_PSREF_TRACE(rt, ro);
#endif
}
2099 
/*
 * Return the cached route of "ro" if it is still valid, or NULL if the
 * cache generation has changed or the route is down.  A non-NULL result
 * is referenced (rtcache_ref) and must be released with rtcache_unref().
 * Under NET_MPSAFE, if the route is being updated and sleeping is
 * allowed, wait for the update to finish and retry.
 */
struct rtentry *
rtcache_validate(struct route *ro)
{
	struct rtentry *rt = NULL;

#ifdef NET_MPSAFE
retry:
#endif
	rtcache_invariants(ro);
	RT_RLOCK();
	if (ro->ro_rtcache_generation != rtcache_generation) {
		/* The cache is invalidated */
		rt = NULL;
		goto out;
	}

	rt = ro->_ro_rt;
	if (rt == NULL)
		goto out;

	if ((rt->rt_flags & RTF_UP) == 0) {
		rt = NULL;
		goto out;
	}
#ifdef NET_MPSAFE
	if (ISSET(rt->rt_flags, RTF_UPDATING)) {
		if (rt_wait_ok()) {
			RT_UNLOCK();

			/* We can wait until the update is complete */
			rt_update_wait();
			goto retry;
		} else {
			rt = NULL;
		}
	} else
#endif
		rtcache_ref(rt, ro);
out:
	RT_UNLOCK();
	return rt;
}
2142 
/*
 * Look up dst in the route cache ro, falling back to re-initializing
 * the cache on a miss.  If hitp is not NULL, *hitp is set to 1 on a
 * cache hit and 0 on a miss.  On a miss, the cached destination is set
 * to dst and _rtcache_init() performs a fresh lookup (with `clone'
 * passed through).  Returns the resulting rtentry or NULL.
 */
struct rtentry *
rtcache_lookup2(struct route *ro, const struct sockaddr *dst,
    int clone, int *hitp)
{
	const struct sockaddr *odst;
	struct rtentry *rt = NULL;

	odst = rtcache_getdst(ro);
	if (odst == NULL)
		goto miss;

	/* A different destination means the cached route is useless. */
	if (sockaddr_cmp(odst, dst) != 0) {
		rtcache_free(ro);
		goto miss;
	}

	rt = rtcache_validate(ro);
	if (rt == NULL) {
		ro->_ro_rt = NULL;
		goto miss;
	}

	rtcache_invariants(ro);

	if (hitp != NULL)
		*hitp = 1;
	return rt;
miss:
	if (hitp != NULL)
		*hitp = 0;
	/* Only attempt a fresh lookup if the new destination sticks. */
	if (rtcache_setdst(ro, dst) == 0)
		rt = _rtcache_init(ro, clone);

	rtcache_invariants(ro);

	return rt;
}
2180 
2181 void
2182 rtcache_free(struct route *ro)
2183 {
2184 
2185 	ro->_ro_rt = NULL;
2186 	if (ro->ro_sa != NULL) {
2187 		sockaddr_free(ro->ro_sa);
2188 		ro->ro_sa = NULL;
2189 	}
2190 	rtcache_invariants(ro);
2191 }
2192 
2193 int
2194 rtcache_setdst(struct route *ro, const struct sockaddr *sa)
2195 {
2196 	KASSERT(sa != NULL);
2197 
2198 	rtcache_invariants(ro);
2199 	if (ro->ro_sa != NULL) {
2200 		if (ro->ro_sa->sa_family == sa->sa_family) {
2201 			ro->_ro_rt = NULL;
2202 			sockaddr_copy(ro->ro_sa, ro->ro_sa->sa_len, sa);
2203 			rtcache_invariants(ro);
2204 			return 0;
2205 		}
2206 		/* free ro_sa, wrong family */
2207 		rtcache_free(ro);
2208 	}
2209 
2210 	KASSERT(ro->_ro_rt == NULL);
2211 
2212 	if ((ro->ro_sa = sockaddr_dup(sa, M_ZERO | M_NOWAIT)) == NULL) {
2213 		rtcache_invariants(ro);
2214 		return ENOMEM;
2215 	}
2216 	rtcache_invariants(ro);
2217 	return 0;
2218 }
2219 
2220 const struct sockaddr *
2221 rt_settag(struct rtentry *rt, const struct sockaddr *tag)
2222 {
2223 	if (rt->rt_tag != tag) {
2224 		if (rt->rt_tag != NULL)
2225 			sockaddr_free(rt->rt_tag);
2226 		rt->rt_tag = sockaddr_dup(tag, M_ZERO | M_NOWAIT);
2227 	}
2228 	return rt->rt_tag;
2229 }
2230 
2231 struct sockaddr *
2232 rt_gettag(const struct rtentry *rt)
2233 {
2234 	return rt->rt_tag;
2235 }
2236 
2237 int
2238 rt_check_reject_route(const struct rtentry *rt, const struct ifnet *ifp)
2239 {
2240 
2241 	if ((rt->rt_flags & RTF_REJECT) != 0) {
2242 		/* Mimic looutput */
2243 		if (ifp->if_flags & IFF_LOOPBACK)
2244 			return (rt->rt_flags & RTF_HOST) ?
2245 			    EHOSTUNREACH : ENETUNREACH;
2246 		else if (rt->rt_rmx.rmx_expire == 0 ||
2247 		    time_uptime < rt->rt_rmx.rmx_expire)
2248 			return (rt->rt_flags & RTF_GATEWAY) ?
2249 			    EHOSTUNREACH : EHOSTDOWN;
2250 	}
2251 
2252 	return 0;
2253 }
2254 
/*
 * Delete every route of the given address family for which f(rt, v)
 * returns a match.  Entries are found and deleted one at a time so
 * that the routing table lock is never held across rtrequest(); each
 * candidate is referenced before the lock is dropped to keep it alive
 * for the delete.
 */
void
rt_delete_matched_entries(sa_family_t family, int (*f)(struct rtentry *, void *),
    void *v)
{

	for (;;) {
		int s;
		int error;
		struct rtentry *rt, *retrt = NULL;

		RT_RLOCK();
		s = splsoftnet();
		rt = rtbl_search_matched_entry(family, f, v);
		if (rt == NULL) {
			/* No more matches: done. */
			splx(s);
			RT_UNLOCK();
			return;
		}
		/* Hold a reference across the unlocked rtrequest() call. */
		rt_ref(rt);
		splx(s);
		RT_UNLOCK();

		error = rtrequest(RTM_DELETE, rt_getkey(rt), rt->rt_gateway,
		    rt_mask(rt), rt->rt_flags, &retrt);
		if (error == 0) {
			KASSERT(retrt == rt);
			KASSERT((retrt->rt_flags & RTF_UP) == 0);
			retrt->rt_ifp = NULL;
			rt_unref(rt);
			rt_free(retrt);
		} else if (error == ESRCH) {
			/* Someone deleted the entry already. */
			rt_unref(rt);
		} else {
			log(LOG_ERR, "%s: unable to delete rtentry @ %p, "
			    "error = %d\n", rt->rt_ifp->if_xname, rt, error);
			/* XXX how to treat this case? */
		}
	}
}
2295 
2296 static int
2297 rt_walktree_locked(sa_family_t family, int (*f)(struct rtentry *, void *),
2298     void *v)
2299 {
2300 
2301 	return rtbl_walktree(family, f, v);
2302 }
2303 
2304 int
2305 rt_walktree(sa_family_t family, int (*f)(struct rtentry *, void *), void *v)
2306 {
2307 	int error;
2308 
2309 	RT_RLOCK();
2310 	error = rt_walktree_locked(family, f, v);
2311 	RT_UNLOCK();
2312 
2313 	return error;
2314 }
2315 
2316 #ifdef DDB
2317 
2318 #include <machine/db_machdep.h>
2319 #include <ddb/db_interface.h>
2320 #include <ddb/db_output.h>
2321 
2322 #define	rt_expire rt_rmx.rmx_expire
2323 
2324 static void
2325 db_print_sa(const struct sockaddr *sa)
2326 {
2327 	int len;
2328 	const u_char *p;
2329 
2330 	if (sa == NULL) {
2331 		db_printf("[NULL]");
2332 		return;
2333 	}
2334 
2335 	p = (const u_char *)sa;
2336 	len = sa->sa_len;
2337 	db_printf("[");
2338 	while (len > 0) {
2339 		db_printf("%d", *p);
2340 		p++; len--;
2341 		if (len) db_printf(",");
2342 	}
2343 	db_printf("]\n");
2344 }
2345 
/*
 * DDB helper: dump an ifaddr's address, destination, netmask, and
 * flag/refcount/metric fields.  A NULL ifa prints nothing.
 */
static void
db_print_ifa(struct ifaddr *ifa)
{
	if (ifa == NULL)
		return;
	db_printf("  ifa_addr=");
	db_print_sa(ifa->ifa_addr);
	db_printf("  ifa_dsta=");
	db_print_sa(ifa->ifa_dstaddr);
	db_printf("  ifa_mask=");
	db_print_sa(ifa->ifa_netmask);
	db_printf("  flags=0x%x,refcnt=%d,metric=%d\n",
			  ifa->ifa_flags,
			  ifa->ifa_refcnt,
			  ifa->ifa_metric);
}
2362 
2363 /*
2364  * Function to pass to rt_walktree().
2365  * Return non-zero error to abort walk.
2366  */
2367 static int
2368 db_show_rtentry(struct rtentry *rt, void *w)
2369 {
2370 	db_printf("rtentry=%p", rt);
2371 
2372 	db_printf(" flags=0x%x refcnt=%d use=%"PRId64" expire=%"PRId64"\n",
2373 			  rt->rt_flags, rt->rt_refcnt,
2374 			  rt->rt_use, (uint64_t)rt->rt_expire);
2375 
2376 	db_printf(" key="); db_print_sa(rt_getkey(rt));
2377 	db_printf(" mask="); db_print_sa(rt_mask(rt));
2378 	db_printf(" gw="); db_print_sa(rt->rt_gateway);
2379 
2380 	db_printf(" ifp=%p ", rt->rt_ifp);
2381 	if (rt->rt_ifp)
2382 		db_printf("(%s)", rt->rt_ifp->if_xname);
2383 	else
2384 		db_printf("(NULL)");
2385 
2386 	db_printf(" ifa=%p\n", rt->rt_ifa);
2387 	db_print_ifa(rt->rt_ifa);
2388 
2389 	db_printf(" gwroute=%p llinfo=%p\n",
2390 			  rt->rt_gwroute, rt->rt_llinfo);
2391 
2392 	return 0;
2393 }
2394 
2395 /*
2396  * Function to print all the route trees.
2397  * Use this from ddb:  "show routes"
2398  */
2399 void
2400 db_show_routes(db_expr_t addr, bool have_addr,
2401     db_expr_t count, const char *modif)
2402 {
2403 
2404 	/* Taking RT_LOCK will fail if LOCKDEBUG is enabled. */
2405 	rt_walktree_locked(AF_INET, db_show_rtentry, NULL);
2406 }
2407 #endif
2408