1 /*	$OpenBSD: rtable.c,v 1.64 2018/09/09 10:07:38 henning Exp $ */
2 
3 /*
4  * Copyright (c) 2014-2016 Martin Pieuchot
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #ifndef _KERNEL
20 #include "kern_compat.h"
21 #else
22 #include <sys/param.h>
23 #include <sys/systm.h>
24 #include <sys/socket.h>
25 #include <sys/malloc.h>
26 #include <sys/queue.h>
27 #include <sys/domain.h>
28 #include <sys/srp.h>
29 #endif
30 
31 #include <net/rtable.h>
32 #include <net/route.h>
33 
34 /*
35  * Structures used by rtable_get() to retrieve the corresponding
36  * routing table for a given pair of ``af'' and ``rtableid''.
37  *
38  * Note that, once allocated, routing table heads are never freed.
39  * This way we do not need to reference count them.
40  *
41  *	afmap		    rtmap/dommp
42  *   -----------          ---------     -----
43  *   |   0     |--------> | 0 | 0 | ... | 0 |	Array mapping rtableid (=index)
44  *   -----------          ---------     -----   to rdomain/loopback (=value).
45  *   | AF_INET |.
46  *   ----------- `.       .---------.     .---------.
47  *       ...	   `----> | rtable0 | ... | rtableN |	Array of pointers for
48  *   -----------          '---------'     '---------'	IPv4 routing tables
49  *   | AF_MPLS |					indexed by ``rtableid''.
50  *   -----------
51  */
52 struct srp	  *afmap;
53 uint8_t		   af2idx[AF_MAX+1];	/* To only allocate supported AF */
54 uint8_t		   af2idx_max;
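
/*
 * Illustrative walk of the two levels above (a sketch, assuming
 * af2idx[AF_INET] == 1 and a table already created for rtableid 5);
 * the concrete code is rtable_get() further down:
 *
 *	map = srp_enter(&sr, &afmap[af2idx[AF_INET]]);	(struct rtmap)
 *	tbl = (5 < map->limit) ? map->tbl[5] : NULL;	(struct art_root)
 *	srp_leave(&sr);
 *
 * afmap[0] instead holds the "struct dommp" consumed by rtable_l2()
 * and rtable_loindex().
 */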
55 
56 /* Array of routing table pointers. */
57 struct rtmap {
58 	unsigned int	   limit;
59 	void		 **tbl;
60 };
61 
62 /*
63  * Array of rtableid -> rdomain mapping.
64  *
65  * Only used for the first index as described above.
66  */
67 struct dommp {
68 	unsigned int	   limit;
69 	/*
70 	 * Array to get the routing domain and loopback interface related to
71 	 * a routing table. Format:
72 	 *
73 	 * 8 unused bits | 16 bits for loopback index | 8 bits for rdomain
74 	 */
75 	unsigned int	  *value;
76 };
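
/*
 * Minimal sketch of how the packed ``value'' word above is taken
 * apart, mirroring rtable_l2() and rtable_loindex() below; the helper
 * names are illustrative only and assume the RT_TABLEID_MASK and
 * RT_TABLEID_BITS definitions those functions rely on.
 */
static inline unsigned int
dommp_rdomain(unsigned int value)
{
	return (value & RT_TABLEID_MASK);	/* low 8 bits */
}

static inline unsigned int
dommp_loifidx(unsigned int value)
{
	return (value >> RT_TABLEID_BITS);	/* next 16 bits */
}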
77 
78 unsigned int	   rtmap_limit = 0;
79 
80 void		   rtmap_init(void);
81 void		   rtmap_grow(unsigned int, sa_family_t);
82 void		   rtmap_dtor(void *, void *);
83 
84 struct srp_gc	   rtmap_gc = SRP_GC_INITIALIZER(rtmap_dtor, NULL);
85 
86 void		   rtable_init_backend(unsigned int);
87 void		  *rtable_alloc(unsigned int, unsigned int, unsigned int);
88 void		  *rtable_get(unsigned int, sa_family_t);
89 
90 void
91 rtmap_init(void)
92 {
93 	struct domain	*dp;
94 	int		 i;
95 
96 	/* Start with a single table for every domain that requires it. */
97 	for (i = 0; (dp = domains[i]) != NULL; i++) {
98 		if (dp->dom_rtoffset == 0)
99 			continue;
100 
101 		rtmap_grow(1, dp->dom_family);
102 	}
103 
104 	/* Initialize the rtableid->rdomain mapping table. */
105 	rtmap_grow(1, 0);
106 
107 	rtmap_limit = 1;
108 }
109 
110 /*
111  * Grow the size of the array of routing tables for AF ``af'' to ``nlimit''.
112  */
113 void
114 rtmap_grow(unsigned int nlimit, sa_family_t af)
115 {
116 	struct rtmap	*map, *nmap;
117 	int		 i;
118 
119 	KERNEL_ASSERT_LOCKED();
120 
121 	KASSERT(nlimit > rtmap_limit);
122 
123 	nmap = malloc(sizeof(*nmap), M_RTABLE, M_WAITOK);
124 	nmap->limit = nlimit;
125 	nmap->tbl = mallocarray(nlimit, sizeof(*nmap[0].tbl), M_RTABLE,
126 	    M_WAITOK|M_ZERO);
127 
128 	map = srp_get_locked(&afmap[af2idx[af]]);
129 	if (map != NULL) {
130 		KASSERT(map->limit == rtmap_limit);
131 
132 		for (i = 0; i < map->limit; i++)
133 			nmap->tbl[i] = map->tbl[i];
134 	}
135 
136 	srp_update_locked(&rtmap_gc, &afmap[af2idx[af]], nmap);
137 }
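
/*
 * The old map handed to srp_update_locked() above is not freed right
 * away: the SRP garbage collector invokes rtmap_dtor() below once no
 * CPU can still be dereferencing it, which is what allows rtable_get()
 * to walk afmap without taking a lock.
 */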
138 
139 void
140 rtmap_dtor(void *null, void *xmap)
141 {
142 	struct rtmap	*map = xmap;
143 
144 	/*
145 	 * Doesn't need to be serialized since this is the last reference
146 	 * to this map; there's nothing to race against.
147 	 */
148 	free(map->tbl, M_RTABLE, map->limit * sizeof(*map[0].tbl));
149 	free(map, M_RTABLE, sizeof(*map));
150 }
151 
152 void
153 rtable_init(void)
154 {
155 	struct domain	*dp;
156 	unsigned int	 keylen = 0;
157 	int		 i;
158 
159 	KASSERT(sizeof(struct rtmap) == sizeof(struct dommp));
160 
161 	/* We use index 0 for the rtable/rdomain map. */
162 	af2idx_max = 1;
163 	memset(af2idx, 0, sizeof(af2idx));
164 
165 	/*
166 	 * Compute the maximum supported key length in case the routing
167 	 * table backend needs it.
168 	 */
169 	for (i = 0; (dp = domains[i]) != NULL; i++) {
170 		if (dp->dom_rtoffset == 0)
171 			continue;
172 
173 		af2idx[dp->dom_family] = af2idx_max++;
174 		if (dp->dom_rtkeylen > keylen)
175 			keylen = dp->dom_rtkeylen;
176 
177 	}
178 	rtable_init_backend(keylen);
179 
180 	/*
181 	 * Allocate AF-to-id table now that we know how many AFs this
182 	 * kernel supports.
183 	 */
184 	afmap = mallocarray(af2idx_max + 1, sizeof(*afmap), M_RTABLE,
185 	    M_WAITOK|M_ZERO);
186 
187 	rtmap_init();
188 
189 	if (rtable_add(0) != 0)
190 		panic("unable to create default routing table");
191 }
192 
193 int
194 rtable_add(unsigned int id)
195 {
196 	struct domain	*dp;
197 	void		*tbl;
198 	struct rtmap	*map;
199 	struct dommp	*dmm;
200 	sa_family_t	 af;
201 	unsigned int	 off, alen;
202 	int		 i;
203 
204 	KERNEL_ASSERT_LOCKED();
205 
206 	if (id > RT_TABLEID_MAX)
207 		return (EINVAL);
208 
209 	if (rtable_exists(id))
210 		return (EEXIST);
211 
212 	for (i = 0; (dp = domains[i]) != NULL; i++) {
213 		if (dp->dom_rtoffset == 0)
214 			continue;
215 
216 		af = dp->dom_family;
217 		off = dp->dom_rtoffset;
218 		alen = dp->dom_maxplen;
219 
220 		if (id >= rtmap_limit)
221 			rtmap_grow(id + 1, af);
222 
223 		tbl = rtable_alloc(id, alen, off);
224 		if (tbl == NULL)
225 			return (ENOMEM);
226 
227 		map = srp_get_locked(&afmap[af2idx[af]]);
228 		map->tbl[id] = tbl;
229 	}
230 
231 	/* Reflect possible growth. */
232 	if (id >= rtmap_limit) {
233 		rtmap_grow(id + 1, 0);
234 		rtmap_limit = id + 1;
235 	}
236 
237 	/* Use main rtable/rdomain by default. */
238 	dmm = srp_get_locked(&afmap[0]);
239 	dmm->value[id] = 0;
240 
241 	return (0);
242 }
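
/*
 * Sketch of the resulting behaviour (illustration only): a first
 * rtable_add(5) grows every per-AF rtmap to at least 6 slots,
 * allocates an ART head per domain with a dom_rtoffset and attaches
 * the new table to rdomain 0; a second rtable_add(5) returns EEXIST,
 * an id above RT_TABLEID_MAX returns EINVAL, and a failed ART
 * allocation returns ENOMEM.
 */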
243 
244 void *
245 rtable_get(unsigned int rtableid, sa_family_t af)
246 {
247 	struct rtmap	*map;
248 	void		*tbl = NULL;
249 	struct srp_ref	 sr;
250 
251 	if (af >= nitems(af2idx) || af2idx[af] == 0)
252 		return (NULL);
253 
254 	map = srp_enter(&sr, &afmap[af2idx[af]]);
255 	if (rtableid < map->limit)
256 		tbl = map->tbl[rtableid];
257 	srp_leave(&sr);
258 
259 	return (tbl);
260 }
261 
262 int
263 rtable_exists(unsigned int rtableid)
264 {
265 	struct domain	*dp;
266 	void		*tbl;
267 	int		 i;
268 
269 	for (i = 0; (dp = domains[i]) != NULL; i++) {
270 		if (dp->dom_rtoffset == 0)
271 			continue;
272 
273 		tbl = rtable_get(rtableid, dp->dom_family);
274 		if (tbl != NULL)
275 			return (1);
276 	}
277 
278 	return (0);
279 }
280 
281 int
282 rtable_empty(unsigned int rtableid)
283 {
284 	struct domain	*dp;
285 	int		 i;
286 	struct art_root	*tbl;
287 
288 	for (i = 0; (dp = domains[i]) != NULL; i++) {
289 		if (dp->dom_rtoffset == 0)
290 			continue;
291 
292 		tbl = rtable_get(rtableid, dp->dom_family);
293 		if (tbl == NULL)
294 			continue;
295 		if (tbl->ar_root.ref != NULL)
296 			return (0);
297 	}
298 
299 	return (1);
300 }
301 
302 unsigned int
303 rtable_l2(unsigned int rtableid)
304 {
305 	struct dommp	*dmm;
306 	unsigned int	 rdomain = 0;
307 	struct srp_ref	 sr;
308 
309 	dmm = srp_enter(&sr, &afmap[0]);
310 	if (rtableid < dmm->limit)
311 		rdomain = (dmm->value[rtableid] & RT_TABLEID_MASK);
312 	srp_leave(&sr);
313 
314 	return (rdomain);
315 }
316 
317 unsigned int
318 rtable_loindex(unsigned int rtableid)
319 {
320 	struct dommp	*dmm;
321 	unsigned int	 loifidx = 0;
322 	struct srp_ref	 sr;
323 
324 	dmm = srp_enter(&sr, &afmap[0]);
325 	if (rtableid < dmm->limit)
326 		loifidx = (dmm->value[rtableid] >> RT_TABLEID_BITS);
327 	srp_leave(&sr);
328 
329 	return (loifidx);
330 }
331 
332 void
333 rtable_l2set(unsigned int rtableid, unsigned int rdomain, unsigned int loifidx)
334 {
335 	struct dommp	*dmm;
336 	unsigned int	 value;
337 
338 	KERNEL_ASSERT_LOCKED();
339 
340 	if (!rtable_exists(rtableid) || !rtable_exists(rdomain))
341 		return;
342 
343 	value = (rdomain & RT_TABLEID_MASK) | (loifidx << RT_TABLEID_BITS);
344 
345 	dmm = srp_get_locked(&afmap[0]);
346 	dmm->value[rtableid] = value;
347 }
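
/*
 * Usage sketch, assuming rtables 5 and 2 were created with
 * rtable_add() and the rdomain's loopback interface has ifindex 3:
 *
 *	rtable_l2set(5, 2, 3);
 *	rtable_l2(5)	  == 2	(rdomain, low RT_TABLEID_BITS bits)
 *	rtable_loindex(5) == 3	(loopback ifindex, next 16 bits)
 *
 * rtable_l2set() silently returns when either the rtable or the
 * rdomain does not exist.
 */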
348 
349 
350 static inline uint8_t	*satoaddr(struct art_root *, struct sockaddr *);
351 
352 int	an_match(struct art_node *, struct sockaddr *, int);
353 void	rtentry_ref(void *, void *);
354 void	rtentry_unref(void *, void *);
355 
356 void	rtable_mpath_insert(struct art_node *, struct rtentry *);
357 
358 struct srpl_rc rt_rc = SRPL_RC_INITIALIZER(rtentry_ref, rtentry_unref, NULL);
359 
360 void
361 rtable_init_backend(unsigned int keylen)
362 {
363 	art_init();
364 }
365 
366 void *
367 rtable_alloc(unsigned int rtableid, unsigned int alen, unsigned int off)
368 {
369 	return (art_alloc(rtableid, alen, off));
370 }
371 
372 struct rtentry *
373 rtable_lookup(unsigned int rtableid, struct sockaddr *dst,
374     struct sockaddr *mask, struct sockaddr *gateway, uint8_t prio)
375 {
376 	struct art_root			*ar;
377 	struct art_node			*an;
378 	struct rtentry			*rt = NULL;
379 	struct srp_ref			 sr, nsr;
380 	uint8_t				*addr;
381 	int				 plen;
382 
383 	ar = rtable_get(rtableid, dst->sa_family);
384 	if (ar == NULL)
385 		return (NULL);
386 
387 	addr = satoaddr(ar, dst);
388 
389 	/* No need for a perfect match. */
390 	if (mask == NULL) {
391 		an = art_match(ar, addr, &nsr);
392 		if (an == NULL)
393 			goto out;
394 	} else {
395 		plen = rtable_satoplen(dst->sa_family, mask);
396 		if (plen == -1)
397 			return (NULL);
398 
399 		an = art_lookup(ar, addr, plen, &nsr);
400 
401 		/* Make sure we've got a perfect match. */
402 		if (!an_match(an, dst, plen))
403 			goto out;
404 	}
405 
406 	SRPL_FOREACH(rt, &sr, &an->an_rtlist, rt_next) {
407 		if (prio != RTP_ANY &&
408 		    (rt->rt_priority & RTP_MASK) != (prio & RTP_MASK))
409 			continue;
410 
411 		if (gateway == NULL)
412 			break;
413 
414 		if (rt->rt_gateway->sa_len == gateway->sa_len &&
415 		    memcmp(rt->rt_gateway, gateway, gateway->sa_len) == 0)
416 			break;
417 	}
418 	if (rt != NULL)
419 		rtref(rt);
420 
421 	SRPL_LEAVE(&sr);
422 out:
423 	srp_leave(&nsr);
424 
425 	return (rt);
426 }
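
/*
 * Contrast with rtable_match() below: rtable_lookup() asks the ART for
 * one specific prefix, so dst/mask (and optionally gateway and prio)
 * must match an existing route exactly, while rtable_match() does the
 * longest-prefix match used on the forwarding path and then picks a
 * nexthop by hash-threshold when several routes share the node.
 */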
427 
428 struct rtentry *
429 rtable_match(unsigned int rtableid, struct sockaddr *dst, uint32_t *src)
430 {
431 	struct art_root			*ar;
432 	struct art_node			*an;
433 	struct rtentry			*rt = NULL;
434 	struct srp_ref			 sr, nsr;
435 	uint8_t				*addr;
436 	int				 hash;
437 
438 	ar = rtable_get(rtableid, dst->sa_family);
439 	if (ar == NULL)
440 		return (NULL);
441 
442 	addr = satoaddr(ar, dst);
443 
444 	an = art_match(ar, addr, &nsr);
445 	if (an == NULL)
446 		goto out;
447 
448 	rt = SRPL_FIRST(&sr, &an->an_rtlist);
449 	rtref(rt);
450 	SRPL_LEAVE(&sr);
451 
452 	/* Gateway selection by Hash-Threshold (RFC 2992) */
453 	if ((hash = rt_hash(rt, dst, src)) != -1) {
454 		struct rtentry		*mrt;
455 		int			 threshold, npaths = 0;
456 
457 		KASSERT(hash <= 0xffff);
458 
459 		SRPL_FOREACH(mrt, &sr, &an->an_rtlist, rt_next) {
460 			/* Only count nexthops with the same priority. */
461 			if (mrt->rt_priority == rt->rt_priority)
462 				npaths++;
463 		}
464 		SRPL_LEAVE(&sr);
465 
466 		threshold = (0xffff / npaths) + 1;
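		/*
		 * Worked example (illustration): with npaths == 4 the
		 * threshold is 0xffff / 4 + 1 == 16384, so a 16 bit
		 * hash of 0..16384 keeps the first route, 16385..32768
		 * steps one entry down the list, and so on, giving
		 * each nexthop roughly a quarter of the flows.
		 */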
467 
468 		/*
469 		 * we have no protection against concurrent modification of the
470 		 * route list attached to the node, so we won't necessarily
471 		 * have the same number of routes.  for most modifications,
472 		 * we'll pick a route that we wouldn't have if we only saw the
473 		 * list before or after the change.  if we were going to use
474 		 * the last available route, but it got removed, we'll hit
475 		 * the end of the list and then pick the first route.
476 		 */
477 
478 		mrt = SRPL_FIRST(&sr, &an->an_rtlist);
479 		while (hash > threshold && mrt != NULL) {
480 			if (mrt->rt_priority == rt->rt_priority)
481 				hash -= threshold;
482 			mrt = SRPL_FOLLOW(&sr, mrt, rt_next);
483 		}
484 
485 		if (mrt != NULL) {
486 			rtref(mrt);
487 			rtfree(rt);
488 			rt = mrt;
489 		}
490 		SRPL_LEAVE(&sr);
491 	}
492 out:
493 	srp_leave(&nsr);
494 	return (rt);
495 }
496 
497 int
498 rtable_insert(unsigned int rtableid, struct sockaddr *dst,
499     struct sockaddr *mask, struct sockaddr *gateway, uint8_t prio,
500     struct rtentry *rt)
501 {
502 	struct rtentry			*mrt;
503 	struct srp_ref			 sr;
504 	struct art_root			*ar;
505 	struct art_node			*an, *prev;
506 	uint8_t				*addr;
507 	int				 plen;
508 	unsigned int			 rt_flags;
509 	int				 error = 0;
510 
511 	ar = rtable_get(rtableid, dst->sa_family);
512 	if (ar == NULL)
513 		return (EAFNOSUPPORT);
514 
515 	addr = satoaddr(ar, dst);
516 	plen = rtable_satoplen(dst->sa_family, mask);
517 	if (plen == -1)
518 		return (EINVAL);
519 
520 	rtref(rt); /* guarantee rtfree won't do anything during insert */
521 	rw_enter_write(&ar->ar_lock);
522 
523 	/* Do not permit exactly the same dst/mask/gw pair. */
524 	an = art_lookup(ar, addr, plen, &sr);
525 	srp_leave(&sr); /* an can't go away while we have the lock */
526 	if (an_match(an, dst, plen)) {
527 		struct rtentry  *mrt;
528 		int		 mpathok = ISSET(rt->rt_flags, RTF_MPATH);
529 
530 		SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next) {
531 			if (prio != RTP_ANY &&
532 			    (mrt->rt_priority & RTP_MASK) != (prio & RTP_MASK))
533 				continue;
534 
535 			if (!mpathok ||
536 			    (mrt->rt_gateway->sa_len == gateway->sa_len &&
537 			    !memcmp(mrt->rt_gateway, gateway, gateway->sa_len))){
538 				error = EEXIST;
539 				goto leave;
540 			}
541 		}
542 	}
543 
544 	an = art_get(dst, plen);
545 	if (an == NULL) {
546 		error = ENOBUFS;
547 		goto leave;
548 	}
549 
550 	/* prepare for immediate operation if insert succeeds */
551 	rt_flags = rt->rt_flags;
552 	rt->rt_flags &= ~RTF_MPATH;
553 	rt->rt_dest = dst;
554 	rt->rt_plen = plen;
555 	SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);
556 
557 	prev = art_insert(ar, an, addr, plen);
558 	if (prev != an) {
559 		SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry,
560 		    rt_next);
561 		rt->rt_flags = rt_flags;
562 		art_put(an);
563 
564 		if (prev == NULL) {
565 			error = ESRCH;
566 			goto leave;
567 		}
568 
569 		an = prev;
570 
571 		mrt = SRPL_FIRST_LOCKED(&an->an_rtlist);
572 		KASSERT(mrt != NULL);
573 		KASSERT((rt->rt_flags & RTF_MPATH) || mrt->rt_priority != prio);
574 
575 		/*
576 		 * An ART node with the same destination/netmask already
577 		 * exists; any MPATH conflict must already have been checked.
578 		 */
579 		if (rt->rt_flags & RTF_MPATH) {
580 			/*
581 			 * Only keep the RTF_MPATH flag if two routes have
582 			 * the same gateway.
583 			 */
584 			rt->rt_flags &= ~RTF_MPATH;
585 			SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next) {
586 				if (mrt->rt_priority == prio) {
587 					mrt->rt_flags |= RTF_MPATH;
588 					rt->rt_flags |= RTF_MPATH;
589 				}
590 			}
591 		}
592 
593 		/* Put newly inserted entry at the right place. */
594 		rtable_mpath_insert(an, rt);
595 	}
596 leave:
597 	rw_exit_write(&ar->ar_lock);
598 	rtfree(rt);
599 	return (error);
600 }
601 
602 int
603 rtable_delete(unsigned int rtableid, struct sockaddr *dst,
604     struct sockaddr *mask, struct rtentry *rt)
605 {
606 	struct art_root			*ar;
607 	struct art_node			*an;
608 	struct srp_ref			 sr;
609 	uint8_t				*addr;
610 	int				 plen;
611 	struct rtentry			*mrt;
612 	int				 npaths = 0;
613 	int				 error = 0;
614 
615 	ar = rtable_get(rtableid, dst->sa_family);
616 	if (ar == NULL)
617 		return (EAFNOSUPPORT);
618 
619 	addr = satoaddr(ar, dst);
620 	plen = rtable_satoplen(dst->sa_family, mask);
621 
622 	rtref(rt); /* guarantee rtfree won't do anything under ar_lock */
623 	rw_enter_write(&ar->ar_lock);
624 	an = art_lookup(ar, addr, plen, &sr);
625 	srp_leave(&sr); /* an can't go away while we have the lock */
626 
627 	/* Make sure we've got a perfect match. */
628 	if (!an_match(an, dst, plen)) {
629 		error = ESRCH;
630 		goto leave;
631 	}
632 
633 	/*
634 	 * If other multipath route entries are still attached to
635 	 * this ART node we only have to unlink it.
636 	 */
637 	SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next)
638 		npaths++;
639 
640 	if (npaths > 1) {
641 		KASSERT(rt->rt_refcnt >= 1);
642 		SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry,
643 		    rt_next);
644 
645 		mrt = SRPL_FIRST_LOCKED(&an->an_rtlist);
646 		if (npaths == 2)
647 			mrt->rt_flags &= ~RTF_MPATH;
648 
649 		goto leave;
650 	}
651 
652 	if (art_delete(ar, an, addr, plen) == NULL)
653 		panic("art_delete failed to find node %p", an);
654 
655 	KASSERT(rt->rt_refcnt >= 1);
656 	SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry, rt_next);
657 	art_put(an);
658 
659 leave:
660 	rw_exit_write(&ar->ar_lock);
661 	rtfree(rt);
662 
663 	return (error);
664 }
665 
666 struct rtable_walk_cookie {
667 	int		(*rwc_func)(struct rtentry *, void *, unsigned int);
668 	void		 *rwc_arg;
669 	unsigned int	  rwc_rid;
670 };
671 
672 /*
673  * Helper for rtable_walk to keep the ART code free from any "struct rtentry".
674  */
675 int
676 rtable_walk_helper(struct art_node *an, void *xrwc)
677 {
678 	struct srp_ref			 sr;
679 	struct rtable_walk_cookie	*rwc = xrwc;
680 	struct rtentry			*rt;
681 	int				 error = 0;
682 
683 	SRPL_FOREACH(rt, &sr, &an->an_rtlist, rt_next) {
684 		if ((error = (*rwc->rwc_func)(rt, rwc->rwc_arg, rwc->rwc_rid)))
685 			break;
686 	}
687 	SRPL_LEAVE(&sr);
688 
689 	return (error);
690 }
691 
692 int
693 rtable_walk(unsigned int rtableid, sa_family_t af,
694     int (*func)(struct rtentry *, void *, unsigned int), void *arg)
695 {
696 	struct art_root			*ar;
697 	struct rtable_walk_cookie	 rwc;
698 	int				 error;
699 
700 	ar = rtable_get(rtableid, af);
701 	if (ar == NULL)
702 		return (EAFNOSUPPORT);
703 
704 	rwc.rwc_func = func;
705 	rwc.rwc_arg = arg;
706 	rwc.rwc_rid = rtableid;
707 
708 	while ((error = art_walk(ar, rtable_walk_helper, &rwc)) == EAGAIN)
709 		continue;
710 
711 	return (error);
712 }
713 
714 struct rtentry *
715 rtable_iterate(struct rtentry *rt0)
716 {
717 	struct rtentry *rt = NULL;
718 	struct srp_ref sr;
719 
720 	rt = SRPL_NEXT(&sr, rt0, rt_next);
721 	if (rt != NULL)
722 		rtref(rt);
723 	SRPL_LEAVE(&sr);
724 	rtfree(rt0);
725 	return (rt);
726 }
727 
728 int
729 rtable_mpath_capable(unsigned int rtableid, sa_family_t af)
730 {
731 	return (1);
732 }
733 
734 int
735 rtable_mpath_reprio(unsigned int rtableid, struct sockaddr *dst,
736     struct sockaddr *mask, uint8_t prio, struct rtentry *rt)
737 {
738 	struct art_root			*ar;
739 	struct art_node			*an;
740 	struct srp_ref			 sr;
741 	uint8_t				*addr;
742 	int				 plen;
743 	int				 error = 0;
744 
745 	ar = rtable_get(rtableid, dst->sa_family);
746 	if (ar == NULL)
747 		return (EAFNOSUPPORT);
748 
749 	addr = satoaddr(ar, dst);
750 	plen = rtable_satoplen(dst->sa_family, mask);
751 
752 	rw_enter_write(&ar->ar_lock);
753 	an = art_lookup(ar, addr, plen, &sr);
754 	srp_leave(&sr); /* an can't go away while we have the lock */
755 
756 	/* Make sure we've got a perfect match. */
757 	if (!an_match(an, dst, plen)) {
758 		error = ESRCH;
759 	} else if (SRPL_FIRST_LOCKED(&an->an_rtlist) == rt &&
760 		SRPL_NEXT_LOCKED(rt, rt_next) == NULL) {
761 		/*
762 		 * If there's only one entry on the list do not go
763 		 * through an insert/remove cycle.  This is done to
764 		 * guarantee that ``an->an_rtlist'' is never empty
765 		 * when a node is in the tree.
766 		 */
767 		rt->rt_priority = prio;
768 	} else {
769 		rtref(rt); /* keep rt alive in between remove and insert */
770 		SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist,
771 		    rt, rtentry, rt_next);
772 		rt->rt_priority = prio;
773 		rtable_mpath_insert(an, rt);
774 		rtfree(rt);
775 		error = EAGAIN;
776 	}
777 	rw_exit_write(&ar->ar_lock);
778 
779 	return (error);
780 }
781 
782 void
783 rtable_mpath_insert(struct art_node *an, struct rtentry *rt)
784 {
785 	struct rtentry			*mrt, *prt = NULL;
786 	uint8_t				 prio = rt->rt_priority;
787 
788 	if ((mrt = SRPL_FIRST_LOCKED(&an->an_rtlist)) == NULL) {
789 		SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);
790 		return;
791 	}
792 
793 	/* Iterate until we find the route to be placed after ``rt''. */
794 	while (mrt->rt_priority <= prio && SRPL_NEXT_LOCKED(mrt, rt_next)) {
795 		prt = mrt;
796 		mrt = SRPL_NEXT_LOCKED(mrt, rt_next);
797 	}
798 
799 	if (mrt->rt_priority <= prio) {
800 		SRPL_INSERT_AFTER_LOCKED(&rt_rc, mrt, rt, rt_next);
801 	} else if (prt != NULL) {
802 		SRPL_INSERT_AFTER_LOCKED(&rt_rc, prt, rt, rt_next);
803 	} else {
804 		SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);
805 	}
806 }
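
/*
 * Illustrative ordering example: with an an_rtlist holding priorities
 * 8, 8 and 32, inserting another priority 8 route above places it
 * after the two existing 8s and before the 32, i.e. the list stays
 * sorted by ascending rt_priority and equal priorities keep their
 * insertion order.
 */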
807 
808 /*
809  * Returns 1 if ``an'' perfectly matches (``dst'', ``plen''), 0 otherwise.
810  */
811 int
812 an_match(struct art_node *an, struct sockaddr *dst, int plen)
813 {
814 	struct rtentry			*rt;
815 	struct srp_ref			 sr;
816 	int				 match;
817 
818 	if (an == NULL || an->an_plen != plen)
819 		return (0);
820 
821 	rt = SRPL_FIRST(&sr, &an->an_rtlist);
822 	match = (memcmp(rt->rt_dest, dst, dst->sa_len) == 0);
823 	SRPL_LEAVE(&sr);
824 
825 	return (match);
826 }
827 
828 void
829 rtentry_ref(void *null, void *xrt)
830 {
831 	struct rtentry *rt = xrt;
832 
833 	rtref(rt);
834 }
835 
836 void
837 rtentry_unref(void *null, void *xrt)
838 {
839 	struct rtentry *rt = xrt;
840 
841 	rtfree(rt);
842 }
843 
844 /*
845  * Return a pointer to the address (key).  This is an heritage from the
846  * Return a pointer to the address (key).  This is a holdover from the
847  * of "struct sockaddr" used by this routing table.
848  */
849 static inline uint8_t *
850 satoaddr(struct art_root *at, struct sockaddr *sa)
851 {
852 	return (((uint8_t *)sa) + at->ar_off);
853 }
854 
855 /*
856  * Return the prefix length of a mask.
857  */
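/*
 * Worked example, assuming AF_INET: a 255.255.252.0 netmask carries
 * the bytes 0xff 0xff 0xfc 0x00 after dom_rtoffset, so the loop below
 * counts 8 + 8 + 6 == 22 and returns a /22 prefix length.
 */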
858 int
859 rtable_satoplen(sa_family_t af, struct sockaddr *mask)
860 {
861 	struct domain	*dp;
862 	uint8_t		*ap, *ep;
863 	int		 mlen, plen = 0;
864 	int		 i;
865 
866 	for (i = 0; (dp = domains[i]) != NULL; i++) {
867 		if (dp->dom_rtoffset == 0)
868 			continue;
869 
870 		if (af == dp->dom_family)
871 			break;
872 	}
873 	if (dp == NULL)
874 		return (-1);
875 
876 	/* Host route */
877 	if (mask == NULL)
878 		return (dp->dom_maxplen);
879 
880 	mlen = mask->sa_len;
881 
882 	/* Default route */
883 	if (mlen == 0)
884 		return (0);
885 
886 	ap = (uint8_t *)((uint8_t *)mask) + dp->dom_rtoffset;
887 	ep = (uint8_t *)((uint8_t *)mask) + mlen;
888 	if (ap > ep)
889 		return (-1);
890 
891 	if (ap == ep)
892 		return (0);
893 
894 	/* "Beauty" adapted from sbin/route/show.c ... */
895 	while (ap < ep) {
896 		switch (*ap) {
897 		case 0xff:
898 			plen += 8;
899 			ap++;
900 			break;
901 		case 0xfe:
902 			plen += 7;
903 			ap++;
904 			goto out;
905 		case 0xfc:
906 			plen += 6;
907 			ap++;
908 			goto out;
909 		case 0xf8:
910 			plen += 5;
911 			ap++;
912 			goto out;
913 		case 0xf0:
914 			plen += 4;
915 			ap++;
916 			goto out;
917 		case 0xe0:
918 			plen += 3;
919 			ap++;
920 			goto out;
921 		case 0xc0:
922 			plen += 2;
923 			ap++;
924 			goto out;
925 		case 0x80:
926 			plen += 1;
927 			ap++;
928 			goto out;
929 		case 0x00:
930 			goto out;
931 		default:
932 			/* Non-contiguous mask. */
933 			return (-1);
934 		}
935 
936 	}
937 
938 out:
939 #ifdef DIAGNOSTIC
940 	for (; ap < ep; ap++) {
941 		if (*ap != 0x00)
942 			return (-1);
943 	}
944 #endif /* DIAGNOSTIC */
945 
946 	return (plen);
947 }
948