1 /*	$OpenBSD: rtable.c,v 1.69 2019/06/21 17:11:42 mpi Exp $ */
2 
3 /*
4  * Copyright (c) 2014-2016 Martin Pieuchot
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #ifndef _KERNEL
20 #include "kern_compat.h"
21 #else
22 #include <sys/param.h>
23 #include <sys/systm.h>
24 #include <sys/socket.h>
25 #include <sys/malloc.h>
26 #include <sys/queue.h>
27 #include <sys/domain.h>
28 #include <sys/srp.h>
29 #endif
30 
31 #include <net/rtable.h>
32 #include <net/route.h>
33 
34 /*
35  * Structures used by rtable_get() to retrieve the corresponding
36  * routing table for a given pair of ``af'' and ``rtableid''.
37  *
38  * Note that, once allocated, routing table heads are never freed.
39  * This way we do not need to reference count them.
40  *
41  *	afmap		    rtmap/dommp
42  *   -----------          ---------     -----
43  *   |   0     |--------> | 0 | 0 | ... | 0 |	Array mapping rtableid (=index)
44  *   -----------          ---------     -----   to rdomain/loopback (=value).
45  *   | AF_INET |.
46  *   ----------- `.       .---------.     .---------.
47  *       ...	   `----> | rtable0 | ... | rtableN |	Array of pointers for
48  *   -----------          '---------'     '---------'	IPv4 routing tables
49  *   | AF_MPLS |					indexed by ``rtableid''.
50  *   -----------
51  */
52 struct srp	  *afmap;
53 uint8_t		   af2idx[AF_MAX+1];	/* To only allocate supported AF */
54 uint8_t		   af2idx_max;
55 
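/*
 * Illustrative sketch (not part of the original file): on a kernel
 * compiled with inet, inet6 and mpls routing domains, rtable_init()
 * below could end up with
 *
 *	af2idx[AF_INET]  == 1
 *	af2idx[AF_INET6] == 2
 *	af2idx[AF_MPLS]  == 3
 *	af2idx_max       == 4
 *
 * while slot 0 of ``afmap'' always holds the rtableid -> rdomain map.
 * The exact values depend on the order of the compiled-in domains[].
 */
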
56 /* Array of routing table pointers. */
57 struct rtmap {
58 	unsigned int	   limit;
59 	void		 **tbl;
60 };
61 
62 /*
63  * Array of rtableid -> rdomain mapping.
64  *
65  * Only used for the first index as described above.
66  */
67 struct dommp {
68 	unsigned int	   limit;
69 	/*
70 	 * Array to get the routing domain and loopback interface related to
71 	 * a routing table. Format:
72 	 *
73 	 * 8 unused bits | 16 bits for loopback index | 8 bits for rdomain
74 	 */
75 	unsigned int	  *value;
76 };
77 
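/*
 * Worked example of the layout above (illustrative only): a table
 * living in rdomain 2 whose loopback interface has ifindex 5 would be
 * stored as
 *
 *	value = (2 & RT_TABLEID_MASK) | (5 << RT_TABLEID_BITS);
 *
 * i.e. 0x0502 assuming the 8-bit rdomain field above, and is decoded
 * again by rtable_l2() and rtable_loindex() below.
 */
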
78 unsigned int	   rtmap_limit = 0;
79 
80 void		   rtmap_init(void);
81 void		   rtmap_grow(unsigned int, sa_family_t);
82 void		   rtmap_dtor(void *, void *);
83 
84 struct srp_gc	   rtmap_gc = SRP_GC_INITIALIZER(rtmap_dtor, NULL);
85 
86 void		   rtable_init_backend(void);
87 void		  *rtable_alloc(unsigned int, unsigned int, unsigned int);
88 void		  *rtable_get(unsigned int, sa_family_t);
89 
90 void
91 rtmap_init(void)
92 {
93 	struct domain	*dp;
94 	int		 i;
95 
96 	/* Start with a single table for every domain that requires it. */
97 	for (i = 0; (dp = domains[i]) != NULL; i++) {
98 		if (dp->dom_rtoffset == 0)
99 			continue;
100 
101 		rtmap_grow(1, dp->dom_family);
102 	}
103 
104 	/* Initialize the rtableid->rdomain mapping table. */
105 	rtmap_grow(1, 0);
106 
107 	rtmap_limit = 1;
108 }
109 
110 /*
111  * Grow the size of the array of routing tables for AF ``af'' to ``nlimit''.
112  */
113 void
114 rtmap_grow(unsigned int nlimit, sa_family_t af)
115 {
116 	struct rtmap	*map, *nmap;
117 	int		 i;
118 
119 	KERNEL_ASSERT_LOCKED();
120 
121 	KASSERT(nlimit > rtmap_limit);
122 
123 	nmap = malloc(sizeof(*nmap), M_RTABLE, M_WAITOK);
124 	nmap->limit = nlimit;
125 	nmap->tbl = mallocarray(nlimit, sizeof(*nmap[0].tbl), M_RTABLE,
126 	    M_WAITOK|M_ZERO);
127 
128 	map = srp_get_locked(&afmap[af2idx[af]]);
129 	if (map != NULL) {
130 		KASSERT(map->limit == rtmap_limit);
131 
132 		for (i = 0; i < map->limit; i++)
133 			nmap->tbl[i] = map->tbl[i];
134 	}
135 
136 	srp_update_locked(&rtmap_gc, &afmap[af2idx[af]], nmap);
137 }
138 
139 void
140 rtmap_dtor(void *null, void *xmap)
141 {
142 	struct rtmap	*map = xmap;
143 
144 	/*
145 	 * Doesn't need to be serialized since this is the last reference
146 	 * to this map; there's nothing to race against.
147 	 */
148 	free(map->tbl, M_RTABLE, map->limit * sizeof(*map[0].tbl));
149 	free(map, M_RTABLE, sizeof(*map));
150 }
151 
152 void
153 rtable_init(void)
154 {
155 	struct domain	*dp;
156 	int		 i;
157 
158 	KASSERT(sizeof(struct rtmap) == sizeof(struct dommp));
159 
160 	/* We use index 0 for the rtable/rdomain map. */
161 	af2idx_max = 1;
162 	memset(af2idx, 0, sizeof(af2idx));
163 
164 	/*
165 	 * Compute the maximum supported key length in case the routing
166 	 * table backend needs it.
167 	 */
168 	for (i = 0; (dp = domains[i]) != NULL; i++) {
169 		if (dp->dom_rtoffset == 0)
170 			continue;
171 
172 		af2idx[dp->dom_family] = af2idx_max++;
173 	}
174 	rtable_init_backend();
175 
176 	/*
177 	 * Allocate the AF-to-id table now that we know how many AFs this
178 	 * kernel supports.
179 	 */
180 	afmap = mallocarray(af2idx_max + 1, sizeof(*afmap), M_RTABLE,
181 	    M_WAITOK|M_ZERO);
182 
183 	rtmap_init();
184 
185 	if (rtable_add(0) != 0)
186 		panic("unable to create default routing table");
187 }
188 
189 int
190 rtable_add(unsigned int id)
191 {
192 	struct domain	*dp;
193 	void		*tbl;
194 	struct rtmap	*map;
195 	struct dommp	*dmm;
196 	sa_family_t	 af;
197 	unsigned int	 off, alen;
198 	int		 i;
199 
200 	KERNEL_ASSERT_LOCKED();
201 
202 	if (id > RT_TABLEID_MAX)
203 		return (EINVAL);
204 
205 	if (rtable_exists(id))
206 		return (EEXIST);
207 
208 	for (i = 0; (dp = domains[i]) != NULL; i++) {
209 		if (dp->dom_rtoffset == 0)
210 			continue;
211 
212 		af = dp->dom_family;
213 		off = dp->dom_rtoffset;
214 		alen = dp->dom_maxplen;
215 
216 		if (id >= rtmap_limit)
217 			rtmap_grow(id + 1, af);
218 
219 		tbl = rtable_alloc(id, alen, off);
220 		if (tbl == NULL)
221 			return (ENOMEM);
222 
223 		map = srp_get_locked(&afmap[af2idx[af]]);
224 		map->tbl[id] = tbl;
225 	}
226 
227 	/* Reflect possible growth. */
228 	if (id >= rtmap_limit) {
229 		rtmap_grow(id + 1, 0);
230 		rtmap_limit = id + 1;
231 	}
232 
233 	/* Use main rtable/rdomain by default. */
234 	dmm = srp_get_locked(&afmap[0]);
235 	dmm->value[id] = 0;
236 
237 	return (0);
238 }
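
/*
 * Usage sketch (hypothetical caller, for illustration only; the
 * identifiers below are not from this file): creating table 5 and
 * binding it to rdomain 5 could look roughly like
 *
 *	int error;
 *
 *	if ((error = rtable_add(5)) != 0)
 *		return (error);
 *	rtable_l2set(5, 5, loifidx);
 *
 * where ``loifidx'' is the ifindex of the loopback interface created
 * for that routing domain.  Until rtable_l2set() is called the new
 * table stays attached to rdomain 0, as set above.
 */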
239 
240 void *
241 rtable_get(unsigned int rtableid, sa_family_t af)
242 {
243 	struct rtmap	*map;
244 	void		*tbl = NULL;
245 	struct srp_ref	 sr;
246 
247 	if (af >= nitems(af2idx) || af2idx[af] == 0)
248 		return (NULL);
249 
250 	map = srp_enter(&sr, &afmap[af2idx[af]]);
251 	if (rtableid < map->limit)
252 		tbl = map->tbl[rtableid];
253 	srp_leave(&sr);
254 
255 	return (tbl);
256 }
257 
258 int
259 rtable_exists(unsigned int rtableid)
260 {
261 	struct domain	*dp;
262 	void		*tbl;
263 	int		 i;
264 
265 	for (i = 0; (dp = domains[i]) != NULL; i++) {
266 		if (dp->dom_rtoffset == 0)
267 			continue;
268 
269 		tbl = rtable_get(rtableid, dp->dom_family);
270 		if (tbl != NULL)
271 			return (1);
272 	}
273 
274 	return (0);
275 }
276 
277 int
278 rtable_empty(unsigned int rtableid)
279 {
280 	struct domain	*dp;
281 	int		 i;
282 	struct art_root	*tbl;
283 
284 	for (i = 0; (dp = domains[i]) != NULL; i++) {
285 		if (dp->dom_rtoffset == 0)
286 			continue;
287 
288 		tbl = rtable_get(rtableid, dp->dom_family);
289 		if (tbl == NULL)
290 			continue;
291 		if (tbl->ar_root.ref != NULL)
292 			return (0);
293 	}
294 
295 	return (1);
296 }
297 
298 unsigned int
299 rtable_l2(unsigned int rtableid)
300 {
301 	struct dommp	*dmm;
302 	unsigned int	 rdomain = 0;
303 	struct srp_ref	 sr;
304 
305 	dmm = srp_enter(&sr, &afmap[0]);
306 	if (rtableid < dmm->limit)
307 		rdomain = (dmm->value[rtableid] & RT_TABLEID_MASK);
308 	srp_leave(&sr);
309 
310 	return (rdomain);
311 }
312 
313 unsigned int
314 rtable_loindex(unsigned int rtableid)
315 {
316 	struct dommp	*dmm;
317 	unsigned int	 loifidx = 0;
318 	struct srp_ref	 sr;
319 
320 	dmm = srp_enter(&sr, &afmap[0]);
321 	if (rtableid < dmm->limit)
322 		loifidx = (dmm->value[rtableid] >> RT_TABLEID_BITS);
323 	srp_leave(&sr);
324 
325 	return (loifidx);
326 }
327 
328 void
329 rtable_l2set(unsigned int rtableid, unsigned int rdomain, unsigned int loifidx)
330 {
331 	struct dommp	*dmm;
332 	unsigned int	 value;
333 
334 	KERNEL_ASSERT_LOCKED();
335 
336 	if (!rtable_exists(rtableid) || !rtable_exists(rdomain))
337 		return;
338 
339 	value = (rdomain & RT_TABLEID_MASK) | (loifidx << RT_TABLEID_BITS);
340 
341 	dmm = srp_get_locked(&afmap[0]);
342 	dmm->value[rtableid] = value;
343 }
344 
345 
346 static inline uint8_t	*satoaddr(struct art_root *, struct sockaddr *);
347 
348 int	an_match(struct art_node *, struct sockaddr *, int);
349 void	rtentry_ref(void *, void *);
350 void	rtentry_unref(void *, void *);
351 
352 void	rtable_mpath_insert(struct art_node *, struct rtentry *);
353 
354 struct srpl_rc rt_rc = SRPL_RC_INITIALIZER(rtentry_ref, rtentry_unref, NULL);
355 
356 void
357 rtable_init_backend(void)
358 {
359 	art_init();
360 }
361 
362 void *
363 rtable_alloc(unsigned int rtableid, unsigned int alen, unsigned int off)
364 {
365 	return (art_alloc(rtableid, alen, off));
366 }
367 
368 struct rtentry *
369 rtable_lookup(unsigned int rtableid, struct sockaddr *dst,
370     struct sockaddr *mask, struct sockaddr *gateway, uint8_t prio)
371 {
372 	struct art_root			*ar;
373 	struct art_node			*an;
374 	struct rtentry			*rt = NULL;
375 	struct srp_ref			 sr, nsr;
376 	uint8_t				*addr;
377 	int				 plen;
378 
379 	ar = rtable_get(rtableid, dst->sa_family);
380 	if (ar == NULL)
381 		return (NULL);
382 
383 	addr = satoaddr(ar, dst);
384 
385 	/* No need for a perfect match. */
386 	if (mask == NULL) {
387 		an = art_match(ar, addr, &nsr);
388 		if (an == NULL)
389 			goto out;
390 	} else {
391 		plen = rtable_satoplen(dst->sa_family, mask);
392 		if (plen == -1)
393 			return (NULL);
394 
395 		an = art_lookup(ar, addr, plen, &nsr);
396 
397 		/* Make sure we've got a perfect match. */
398 		if (!an_match(an, dst, plen))
399 			goto out;
400 	}
401 
402 	SRPL_FOREACH(rt, &sr, &an->an_rtlist, rt_next) {
403 		if (prio != RTP_ANY &&
404 		    (rt->rt_priority & RTP_MASK) != (prio & RTP_MASK))
405 			continue;
406 
407 		if (gateway == NULL)
408 			break;
409 
410 		if (rt->rt_gateway->sa_len == gateway->sa_len &&
411 		    memcmp(rt->rt_gateway, gateway, gateway->sa_len) == 0)
412 			break;
413 	}
414 	if (rt != NULL)
415 		rtref(rt);
416 
417 	SRPL_LEAVE(&sr);
418 out:
419 	srp_leave(&nsr);
420 
421 	return (rt);
422 }
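
/*
 * Usage sketch (illustration only, not a real caller): looking up a
 * route for ``dst'' in the main IPv4 table could look like
 *
 *	struct rtentry *rt;
 *
 *	rt = rtable_lookup(0, sintosa(&dst), NULL, NULL, RTP_ANY);
 *	if (rt != NULL)
 *		rtfree(rt);
 *
 * With a NULL mask the backend's best match (art_match()) is accepted;
 * with a mask, an_match() enforces a perfect dst/plen match, and a
 * non-NULL gateway further narrows the choice between multipath
 * entries.  The returned route carries a reference that the caller
 * must drop with rtfree().
 */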
423 
424 struct rtentry *
425 rtable_match(unsigned int rtableid, struct sockaddr *dst, uint32_t *src)
426 {
427 	struct art_root			*ar;
428 	struct art_node			*an;
429 	struct rtentry			*rt = NULL;
430 	struct srp_ref			 sr, nsr;
431 	uint8_t				*addr;
432 	int				 hash;
433 
434 	ar = rtable_get(rtableid, dst->sa_family);
435 	if (ar == NULL)
436 		return (NULL);
437 
438 	addr = satoaddr(ar, dst);
439 
440 	an = art_match(ar, addr, &nsr);
441 	if (an == NULL)
442 		goto out;
443 
444 	rt = SRPL_FIRST(&sr, &an->an_rtlist);
445 	rtref(rt);
446 	SRPL_LEAVE(&sr);
447 
448 	/* Gateway selection by Hash-Threshold (RFC 2992) */
449 	if ((hash = rt_hash(rt, dst, src)) != -1) {
450 		struct rtentry		*mrt;
451 		int			 threshold, npaths = 0;
452 
453 		KASSERT(hash <= 0xffff);
454 
455 		SRPL_FOREACH(mrt, &sr, &an->an_rtlist, rt_next) {
456 			/* Only count nexthops with the same priority. */
457 			if (mrt->rt_priority == rt->rt_priority)
458 				npaths++;
459 		}
460 		SRPL_LEAVE(&sr);
461 
462 		threshold = (0xffff / npaths) + 1;
463 
464 		/*
465 		 * we have no protection against concurrent modification of the
466 		 * route list attached to the node, so we won't necessarily
467 		 * have the same number of routes.  for most modifications,
468 		 * we'll pick a route that we wouldn't have if we only saw the
469 		 * list before or after the change.  if we were going to use
470 		 * the last available route, but it got removed, we'll hit
471 		 * the end of the list and then pick the first route.
472 		 */
473 
474 		mrt = SRPL_FIRST(&sr, &an->an_rtlist);
475 		while (hash > threshold && mrt != NULL) {
476 			if (mrt->rt_priority == rt->rt_priority)
477 				hash -= threshold;
478 			mrt = SRPL_FOLLOW(&sr, mrt, rt_next);
479 		}
480 
481 		if (mrt != NULL) {
482 			rtref(mrt);
483 			rtfree(rt);
484 			rt = mrt;
485 		}
486 		SRPL_LEAVE(&sr);
487 	}
488 out:
489 	srp_leave(&nsr);
490 	return (rt);
491 }
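
/*
 * Worked example of the threshold arithmetic above (illustrative
 * only): with three routes of equal priority, threshold is
 * 0xffff / 3 + 1 == 21846, so a hash of 0-21846 keeps the first
 * route, 21847-43692 selects the second one and 43693-65535 the
 * third one, modulo concurrent changes to the list as noted above.
 */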
492 
493 int
494 rtable_insert(unsigned int rtableid, struct sockaddr *dst,
495     struct sockaddr *mask, struct sockaddr *gateway, uint8_t prio,
496     struct rtentry *rt)
497 {
498 	struct rtentry			*mrt;
499 	struct srp_ref			 sr;
500 	struct art_root			*ar;
501 	struct art_node			*an, *prev;
502 	uint8_t				*addr;
503 	int				 plen;
504 	unsigned int			 rt_flags;
505 	int				 error = 0;
506 
507 	ar = rtable_get(rtableid, dst->sa_family);
508 	if (ar == NULL)
509 		return (EAFNOSUPPORT);
510 
511 	addr = satoaddr(ar, dst);
512 	plen = rtable_satoplen(dst->sa_family, mask);
513 	if (plen == -1)
514 		return (EINVAL);
515 
516 	rtref(rt); /* guarantee rtfree won't do anything during insert */
517 	rw_enter_write(&ar->ar_lock);
518 
519 	/* Do not permit exactly the same dst/mask/gw pair. */
520 	an = art_lookup(ar, addr, plen, &sr);
521 	srp_leave(&sr); /* an can't go away while we have the lock */
522 	if (an_match(an, dst, plen)) {
523 		struct rtentry  *mrt;
524 		int		 mpathok = ISSET(rt->rt_flags, RTF_MPATH);
525 
526 		SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next) {
527 			if (prio != RTP_ANY &&
528 			    (mrt->rt_priority & RTP_MASK) != (prio & RTP_MASK))
529 				continue;
530 
531 			if (!mpathok ||
532 			    (mrt->rt_gateway->sa_len == gateway->sa_len &&
533 			    !memcmp(mrt->rt_gateway, gateway, gateway->sa_len))){
534 				error = EEXIST;
535 				goto leave;
536 			}
537 		}
538 	}
539 
540 	an = art_get(dst, plen);
541 	if (an == NULL) {
542 		error = ENOBUFS;
543 		goto leave;
544 	}
545 
546 	/* prepare for immediate operation if insert succeeds */
547 	rt_flags = rt->rt_flags;
548 	rt->rt_flags &= ~RTF_MPATH;
549 	rt->rt_dest = dst;
550 	rt->rt_plen = plen;
551 	SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);
552 
553 	prev = art_insert(ar, an, addr, plen);
554 	if (prev != an) {
555 		SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry,
556 		    rt_next);
557 		rt->rt_flags = rt_flags;
558 		art_put(an);
559 
560 		if (prev == NULL) {
561 			error = ESRCH;
562 			goto leave;
563 		}
564 
565 		an = prev;
566 
567 		mrt = SRPL_FIRST_LOCKED(&an->an_rtlist);
568 		KASSERT(mrt != NULL);
569 		KASSERT((rt->rt_flags & RTF_MPATH) || mrt->rt_priority != prio);
570 
571 		/*
572 		 * An ART node with the same destination/netmask already
573 		 * exists; the MPATH conflict has already been checked above.
574 		 */
575 		if (rt->rt_flags & RTF_MPATH) {
576 			/*
577 			 * Only keep the RTF_MPATH flag if another route
578 			 * with the same priority is also present.
579 			 */
580 			rt->rt_flags &= ~RTF_MPATH;
581 			SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next) {
582 				if (mrt->rt_priority == prio) {
583 					mrt->rt_flags |= RTF_MPATH;
584 					rt->rt_flags |= RTF_MPATH;
585 				}
586 			}
587 		}
588 
589 		/* Put newly inserted entry at the right place. */
590 		rtable_mpath_insert(an, rt);
591 	}
592 leave:
593 	rw_exit_write(&ar->ar_lock);
594 	rtfree(rt);
595 	return (error);
596 }
597 
598 int
599 rtable_delete(unsigned int rtableid, struct sockaddr *dst,
600     struct sockaddr *mask, struct rtentry *rt)
601 {
602 	struct art_root			*ar;
603 	struct art_node			*an;
604 	struct srp_ref			 sr;
605 	uint8_t				*addr;
606 	int				 plen;
607 	struct rtentry			*mrt;
608 	int				 npaths = 0;
609 	int				 error = 0;
610 
611 	ar = rtable_get(rtableid, dst->sa_family);
612 	if (ar == NULL)
613 		return (EAFNOSUPPORT);
614 
615 	addr = satoaddr(ar, dst);
616 	plen = rtable_satoplen(dst->sa_family, mask);
617 	if (plen == -1)
618 		return (EINVAL);
619 
620 	rtref(rt); /* guarantee rtfree won't do anything under ar_lock */
621 	rw_enter_write(&ar->ar_lock);
622 	an = art_lookup(ar, addr, plen, &sr);
623 	srp_leave(&sr); /* an can't go away while we have the lock */
624 
625 	/* Make sure we've got a perfect match. */
626 	if (!an_match(an, dst, plen)) {
627 		error = ESRCH;
628 		goto leave;
629 	}
630 
631 	/*
632 	 * If other multipath route entries are still attached to
633 	 * this ART node we only have to unlink it.
634 	 */
635 	SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next)
636 		npaths++;
637 
638 	if (npaths > 1) {
639 		KASSERT(rt->rt_refcnt >= 1);
640 		SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry,
641 		    rt_next);
642 
643 		mrt = SRPL_FIRST_LOCKED(&an->an_rtlist);
644 		if (npaths == 2)
645 			mrt->rt_flags &= ~RTF_MPATH;
646 
647 		goto leave;
648 	}
649 
650 	if (art_delete(ar, an, addr, plen) == NULL)
651 		panic("art_delete failed to find node %p", an);
652 
653 	KASSERT(rt->rt_refcnt >= 1);
654 	SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry, rt_next);
655 	art_put(an);
656 
657 leave:
658 	rw_exit_write(&ar->ar_lock);
659 	rtfree(rt);
660 
661 	return (error);
662 }
663 
664 struct rtable_walk_cookie {
665 	int		(*rwc_func)(struct rtentry *, void *, unsigned int);
666 	void		 *rwc_arg;
667 	struct rtentry	**rwc_prt;
668 	unsigned int	  rwc_rid;
669 };
670 
671 /*
672  * Helper for rtable_walk to keep the ART code free from any "struct rtentry".
673  */
674 int
675 rtable_walk_helper(struct art_node *an, void *xrwc)
676 {
677 	struct srp_ref			 sr;
678 	struct rtable_walk_cookie	*rwc = xrwc;
679 	struct rtentry			*rt;
680 	int				 error = 0;
681 
682 	SRPL_FOREACH(rt, &sr, &an->an_rtlist, rt_next) {
683 		error = (*rwc->rwc_func)(rt, rwc->rwc_arg, rwc->rwc_rid);
684 		if (error != 0)
685 			break;
686 	}
687 	if (rwc->rwc_prt != NULL && rt != NULL) {
688 		rtref(rt);
689 		*rwc->rwc_prt = rt;
690 	}
691 	SRPL_LEAVE(&sr);
692 
693 	return (error);
694 }
695 
696 int
697 rtable_walk(unsigned int rtableid, sa_family_t af, struct rtentry **prt,
698     int (*func)(struct rtentry *, void *, unsigned int), void *arg)
699 {
700 	struct art_root			*ar;
701 	struct rtable_walk_cookie	 rwc;
702 	int				 error;
703 
704 	ar = rtable_get(rtableid, af);
705 	if (ar == NULL)
706 		return (EAFNOSUPPORT);
707 
708 	rwc.rwc_func = func;
709 	rwc.rwc_arg = arg;
710 	rwc.rwc_prt = prt;
711 	rwc.rwc_rid = rtableid;
712 
713 	error = art_walk(ar, rtable_walk_helper, &rwc);
714 
715 	return (error);
716 }
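
/*
 * Usage sketch (hypothetical callback, illustration only): counting
 * the IPv4 routes of table 0 with rtable_walk() could look like
 *
 *	int
 *	count_rt(struct rtentry *rt, void *arg, unsigned int rtableid)
 *	{
 *		(*(unsigned int *)arg)++;
 *		return (0);
 *	}
 *
 *	unsigned int n = 0;
 *	rtable_walk(0, AF_INET, NULL, count_rt, &n);
 *
 * A non-zero return from the callback stops the walk and is passed
 * back to the caller; if ``prt'' is non-NULL the route that stopped
 * the walk is returned there with a reference held.
 */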
717 
718 struct rtentry *
719 rtable_iterate(struct rtentry *rt0)
720 {
721 	struct rtentry *rt = NULL;
722 	struct srp_ref sr;
723 
724 	rt = SRPL_NEXT(&sr, rt0, rt_next);
725 	if (rt != NULL)
726 		rtref(rt);
727 	SRPL_LEAVE(&sr);
728 	rtfree(rt0);
729 	return (rt);
730 }
731 
732 int
733 rtable_mpath_capable(unsigned int rtableid, sa_family_t af)
734 {
735 	return (1);
736 }
737 
738 int
739 rtable_mpath_reprio(unsigned int rtableid, struct sockaddr *dst,
740     int plen, uint8_t prio, struct rtentry *rt)
741 {
742 	struct art_root			*ar;
743 	struct art_node			*an;
744 	struct srp_ref			 sr;
745 	uint8_t				*addr;
746 	int				 error = 0;
747 
748 	ar = rtable_get(rtableid, dst->sa_family);
749 	if (ar == NULL)
750 		return (EAFNOSUPPORT);
751 
752 	addr = satoaddr(ar, dst);
753 
754 	rw_enter_write(&ar->ar_lock);
755 	an = art_lookup(ar, addr, plen, &sr);
756 	srp_leave(&sr); /* an can't go away while we have the lock */
757 
758 	/* Make sure we've got a perfect match. */
759 	if (!an_match(an, dst, plen)) {
760 		error = ESRCH;
761 	} else if (SRPL_FIRST_LOCKED(&an->an_rtlist) == rt &&
762 		SRPL_NEXT_LOCKED(rt, rt_next) == NULL) {
763 		/*
764 		 * If there's only one entry on the list do not go
765 		 * through an insert/remove cycle.  This is done to
766 		 * guarantee that ``an->an_rtlist''  is never empty
767 		 * when a node is in the tree.
768 		 */
769 		rt->rt_priority = prio;
770 	} else {
771 		rtref(rt); /* keep rt alive in between remove and insert */
772 		SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist,
773 		    rt, rtentry, rt_next);
774 		rt->rt_priority = prio;
775 		rtable_mpath_insert(an, rt);
776 		rtfree(rt);
777 		error = EAGAIN;
778 	}
779 	rw_exit_write(&ar->ar_lock);
780 
781 	return (error);
782 }
783 
784 void
785 rtable_mpath_insert(struct art_node *an, struct rtentry *rt)
786 {
787 	struct rtentry			*mrt, *prt = NULL;
788 	uint8_t				 prio = rt->rt_priority;
789 
790 	if ((mrt = SRPL_FIRST_LOCKED(&an->an_rtlist)) == NULL) {
791 		SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);
792 		return;
793 	}
794 
795 	/* Iterate until we find the route to be placed after ``rt''. */
796 	while (mrt->rt_priority <= prio && SRPL_NEXT_LOCKED(mrt, rt_next)) {
797 		prt = mrt;
798 		mrt = SRPL_NEXT_LOCKED(mrt, rt_next);
799 	}
800 
801 	if (mrt->rt_priority <= prio) {
802 		SRPL_INSERT_AFTER_LOCKED(&rt_rc, mrt, rt, rt_next);
803 	} else if (prt != NULL) {
804 		SRPL_INSERT_AFTER_LOCKED(&rt_rc, prt, rt, rt_next);
805 	} else {
806 		SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);
807 	}
808 }
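
/*
 * Worked example of the loop above (illustrative only): inserting a
 * route of priority 16 into an ``an_rtlist'' holding priorities
 * 8, 16, 32 places it after the existing 16 (8, 16, new, 32), while a
 * priority of 4 puts it at the head.  New entries thus land after
 * existing entries of equal priority, keeping the list sorted by
 * ascending priority.
 */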
809 
810 /*
811  * Returns 1 if ``an'' perfectly matches (``dst'', ``plen''), 0 otherwise.
812  */
813 int
814 an_match(struct art_node *an, struct sockaddr *dst, int plen)
815 {
816 	struct rtentry			*rt;
817 	struct srp_ref			 sr;
818 	int				 match;
819 
820 	if (an == NULL || an->an_plen != plen)
821 		return (0);
822 
823 	rt = SRPL_FIRST(&sr, &an->an_rtlist);
824 	match = (memcmp(rt->rt_dest, dst, dst->sa_len) == 0);
825 	SRPL_LEAVE(&sr);
826 
827 	return (match);
828 }
829 
830 void
831 rtentry_ref(void *null, void *xrt)
832 {
833 	struct rtentry *rt = xrt;
834 
835 	rtref(rt);
836 }
837 
838 void
839 rtentry_unref(void *null, void *xrt)
840 {
841 	struct rtentry *rt = xrt;
842 
843 	rtfree(rt);
844 }
845 
846 /*
847  * Return a pointer to the address (key).  This is a legacy of the
848  * BSD radix tree, needed to skip the non-address fields of the flavor
849  * of "struct sockaddr" used by this routing table.
850  */
851 static inline uint8_t *
852 satoaddr(struct art_root *at, struct sockaddr *sa)
853 {
854 	return (((uint8_t *)sa) + at->ar_off);
855 }
856 
857 /*
858  * Return the prefix length of a mask.
859  */
860 int
861 rtable_satoplen(sa_family_t af, struct sockaddr *mask)
862 {
863 	struct domain	*dp;
864 	uint8_t		*ap, *ep;
865 	int		 mlen, plen = 0;
866 	int		 i;
867 
868 	for (i = 0; (dp = domains[i]) != NULL; i++) {
869 		if (dp->dom_rtoffset == 0)
870 			continue;
871 
872 		if (af == dp->dom_family)
873 			break;
874 	}
875 	if (dp == NULL)
876 		return (-1);
877 
878 	/* Host route */
879 	if (mask == NULL)
880 		return (dp->dom_maxplen);
881 
882 	mlen = mask->sa_len;
883 
884 	/* Default route */
885 	if (mlen == 0)
886 		return (0);
887 
888 	ap = (uint8_t *)((uint8_t *)mask) + dp->dom_rtoffset;
889 	ep = (uint8_t *)((uint8_t *)mask) + mlen;
890 	if (ap > ep)
891 		return (-1);
892 
893 	/* Trim trailing zeroes. */
894 	while (ap < ep && ep[-1] == 0)
895 		ep--;
896 
897 	if (ap == ep)
898 		return (0);
899 
900 	/* "Beauty" adapted from sbin/route/show.c ... */
901 	while (ap < ep) {
902 		switch (*ap++) {
903 		case 0xff:
904 			plen += 8;
905 			break;
906 		case 0xfe:
907 			plen += 7;
908 			goto out;
909 		case 0xfc:
910 			plen += 6;
911 			goto out;
912 		case 0xf8:
913 			plen += 5;
914 			goto out;
915 		case 0xf0:
916 			plen += 4;
917 			goto out;
918 		case 0xe0:
919 			plen += 3;
920 			goto out;
921 		case 0xc0:
922 			plen += 2;
923 			goto out;
924 		case 0x80:
925 			plen += 1;
926 			goto out;
927 		default:
928 			/* Non-contiguous mask. */
929 			return (-1);
930 		}
931 	}
932 
933 out:
934 	if (plen > dp->dom_maxplen || ap != ep)
935 		return -1;
936 
937 	return (plen);
938 }
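
/*
 * Worked examples (illustrative only): for AF_INET, a netmask of
 * 255.255.255.0 yields a prefix length of 24 (three 0xff bytes, the
 * trailing zero byte is trimmed); 255.255.192.0 yields 18
 * (0xff 0xff 0xc0); an sa_len of 0 yields the default route (0); and
 * a mask such as 0xff 0x0f is rejected as non-contiguous (-1).
 */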
939