xref: /openbsd-src/sys/net/rtable.c (revision 46035553bfdd96e63c94e32da0210227ec2e3cf1)
1 /*	$OpenBSD: rtable.c,v 1.72 2020/11/07 09:51:40 denis Exp $ */
2 
3 /*
4  * Copyright (c) 2014-2016 Martin Pieuchot
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #ifndef _KERNEL
20 #include "kern_compat.h"
21 #else
22 #include <sys/param.h>
23 #include <sys/systm.h>
24 #include <sys/socket.h>
25 #include <sys/malloc.h>
26 #include <sys/queue.h>
27 #include <sys/domain.h>
28 #include <sys/srp.h>
29 #endif
30 
31 #include <net/rtable.h>
32 #include <net/route.h>
33 
34 /*
35  * Structures used by rtable_get() to retrieve the corresponding
36  * routing table for a given pair of ``af'' and ``rtableid''.
37  *
38  * Note that once allocated, routing table heads are never freed.  This
39  * way we do not need to reference-count them.
40  *
41  *	afmap		    rtmap/dommp
42  *   -----------          ---------     -----
43  *   |   0     |--------> | 0 | 0 | ... | 0 |	Array mapping rtableid (=index)
44  *   -----------          ---------     -----   to rdomain/loopback (=value).
45  *   | AF_INET |.
46  *   ----------- `.       .---------.     .---------.
47  *       ...	   `----> | rtable0 | ... | rtableN |	Array of pointers for
48  *   -----------          '---------'     '---------'	IPv4 routing tables
49  *   | AF_MPLS |					indexed by ``rtableid''.
50  *   -----------
51  */
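/*
 * Sketch of the lookup path implied by the picture above (this mirrors
 * rtable_get() below; ``sr'' is the SRP reference held while reading):
 *
 *	idx = af2idx[af];			0 means AF not supported
 *	map = srp_enter(&sr, &afmap[idx]);
 *	tbl = (rtableid < map->limit) ? map->tbl[rtableid] : NULL;
 *	srp_leave(&sr);
 */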
52 struct srp	  *afmap;
53 uint8_t		   af2idx[AF_MAX+1];	/* To only allocate supported AF */
54 uint8_t		   af2idx_max;
55 
56 /* Array of routing table pointers. */
57 struct rtmap {
58 	unsigned int	   limit;
59 	void		 **tbl;
60 };
61 
62 /*
63  * Array of rtableid -> rdomain mapping.
64  *
65  * Only used for the first index as described above.
66  */
67 struct dommp {
68 	unsigned int	   limit;
69 	/*
70 	 * Array to get the routing domain and loopback interface related to
71 	 * a routing table. Format:
72 	 *
73 	 * 8 unused bits | 16 bits for loopback index | 8 bits for rdomain
74 	 */
75 	unsigned int	  *value;
76 };
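/*
 * Packing and unpacking of ``value'', as done by rtable_l2set(),
 * rtable_l2() and rtable_loindex() below:
 *
 *	value   = (rdomain & RT_TABLEID_MASK) | (loifidx << RT_TABLEID_BITS);
 *	rdomain =  value & RT_TABLEID_MASK;
 *	loifidx =  value >> RT_TABLEID_BITS;
 */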
77 
78 unsigned int	   rtmap_limit = 0;
79 
80 void		   rtmap_init(void);
81 void		   rtmap_grow(unsigned int, sa_family_t);
82 void		   rtmap_dtor(void *, void *);
83 
84 struct srp_gc	   rtmap_gc = SRP_GC_INITIALIZER(rtmap_dtor, NULL);
85 
86 void		   rtable_init_backend(void);
87 void		  *rtable_alloc(unsigned int, unsigned int, unsigned int);
88 void		  *rtable_get(unsigned int, sa_family_t);
89 
90 void
91 rtmap_init(void)
92 {
93 	struct domain	*dp;
94 	int		 i;
95 
96 	/* Start with a single table for every domain that requires it. */
97 	for (i = 0; (dp = domains[i]) != NULL; i++) {
98 		if (dp->dom_rtoffset == 0)
99 			continue;
100 
101 		rtmap_grow(1, dp->dom_family);
102 	}
103 
104 	/* Initialize the rtableid->rdomain mapping table. */
105 	rtmap_grow(1, 0);
106 
107 	rtmap_limit = 1;
108 }
109 
110 /*
111  * Grow the array of routing tables for AF ``af'' to ``nlimit'' entries.
112  */
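/*
 * The map is grown copy-on-write style: a new array is allocated, the old
 * pointers are copied over and the new map is published with
 * srp_update_locked(), so readers entering through srp_enter() always see
 * either the complete old map or the complete new one.  The old map is
 * reclaimed by rtmap_dtor() once its last reader is gone.
 */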
113 void
114 rtmap_grow(unsigned int nlimit, sa_family_t af)
115 {
116 	struct rtmap	*map, *nmap;
117 	int		 i;
118 
119 	KERNEL_ASSERT_LOCKED();
120 
121 	KASSERT(nlimit > rtmap_limit);
122 
123 	nmap = malloc(sizeof(*nmap), M_RTABLE, M_WAITOK);
124 	nmap->limit = nlimit;
125 	nmap->tbl = mallocarray(nlimit, sizeof(*nmap[0].tbl), M_RTABLE,
126 	    M_WAITOK|M_ZERO);
127 
128 	map = srp_get_locked(&afmap[af2idx[af]]);
129 	if (map != NULL) {
130 		KASSERT(map->limit == rtmap_limit);
131 
132 		for (i = 0; i < map->limit; i++)
133 			nmap->tbl[i] = map->tbl[i];
134 	}
135 
136 	srp_update_locked(&rtmap_gc, &afmap[af2idx[af]], nmap);
137 }
138 
139 void
140 rtmap_dtor(void *null, void *xmap)
141 {
142 	struct rtmap	*map = xmap;
143 
144 	/*
145 	 * Doesn't need to be serialized since this is the last reference
146 	 * to this map; there's nothing to race against.
147 	 */
148 	free(map->tbl, M_RTABLE, map->limit * sizeof(*map[0].tbl));
149 	free(map, M_RTABLE, sizeof(*map));
150 }
151 
152 void
153 rtable_init(void)
154 {
155 	struct domain	*dp;
156 	int		 i;
157 
158 	KASSERT(sizeof(struct rtmap) == sizeof(struct dommp));
159 
160 	/* We use index 0 for the rtable/rdomain map. */
161 	af2idx_max = 1;
162 	memset(af2idx, 0, sizeof(af2idx));
163 
164 	/*
165 	 * Compute the maximum supported key length in case the routing
166 	 * table backend needs it.
167 	 */
168 	for (i = 0; (dp = domains[i]) != NULL; i++) {
169 		if (dp->dom_rtoffset == 0)
170 			continue;
171 
172 		af2idx[dp->dom_family] = af2idx_max++;
173 	}
174 	rtable_init_backend();
175 
176 	/*
177 	 * Allocate the AF-to-id table now that we know how many AFs this
178 	 * kernel supports.
179 	 */
180 	afmap = mallocarray(af2idx_max + 1, sizeof(*afmap), M_RTABLE,
181 	    M_WAITOK|M_ZERO);
182 
183 	rtmap_init();
184 
185 	if (rtable_add(0) != 0)
186 		panic("unable to create default routing table");
187 }
188 
189 int
190 rtable_add(unsigned int id)
191 {
192 	struct domain	*dp;
193 	void		*tbl;
194 	struct rtmap	*map;
195 	struct dommp	*dmm;
196 	sa_family_t	 af;
197 	unsigned int	 off, alen;
198 	int		 i;
199 
200 	KERNEL_ASSERT_LOCKED();
201 
202 	if (id > RT_TABLEID_MAX)
203 		return (EINVAL);
204 
205 	if (rtable_exists(id))
206 		return (EEXIST);
207 
208 	for (i = 0; (dp = domains[i]) != NULL; i++) {
209 		if (dp->dom_rtoffset == 0)
210 			continue;
211 
212 		af = dp->dom_family;
213 		off = dp->dom_rtoffset;
214 		alen = dp->dom_maxplen;
215 
216 		if (id >= rtmap_limit)
217 			rtmap_grow(id + 1, af);
218 
219 		tbl = rtable_alloc(id, alen, off);
220 		if (tbl == NULL)
221 			return (ENOMEM);
222 
223 		map = srp_get_locked(&afmap[af2idx[af]]);
224 		map->tbl[id] = tbl;
225 	}
226 
227 	/* Reflect possible growth. */
228 	if (id >= rtmap_limit) {
229 		rtmap_grow(id + 1, 0);
230 		rtmap_limit = id + 1;
231 	}
232 
233 	/* Use main rtable/rdomain by default. */
234 	dmm = srp_get_locked(&afmap[0]);
235 	dmm->value[id] = 0;
236 
237 	return (0);
238 }
239 
240 void *
241 rtable_get(unsigned int rtableid, sa_family_t af)
242 {
243 	struct rtmap	*map;
244 	void		*tbl = NULL;
245 	struct srp_ref	 sr;
246 
247 	if (af >= nitems(af2idx) || af2idx[af] == 0)
248 		return (NULL);
249 
250 	map = srp_enter(&sr, &afmap[af2idx[af]]);
251 	if (rtableid < map->limit)
252 		tbl = map->tbl[rtableid];
253 	srp_leave(&sr);
254 
255 	return (tbl);
256 }
257 
258 int
259 rtable_exists(unsigned int rtableid)
260 {
261 	struct domain	*dp;
262 	void		*tbl;
263 	int		 i;
264 
265 	for (i = 0; (dp = domains[i]) != NULL; i++) {
266 		if (dp->dom_rtoffset == 0)
267 			continue;
268 
269 		tbl = rtable_get(rtableid, dp->dom_family);
270 		if (tbl != NULL)
271 			return (1);
272 	}
273 
274 	return (0);
275 }
276 
277 int
278 rtable_empty(unsigned int rtableid)
279 {
280 	struct domain	*dp;
281 	int		 i;
282 	struct art_root	*tbl;
283 
284 	for (i = 0; (dp = domains[i]) != NULL; i++) {
285 		if (dp->dom_rtoffset == 0)
286 			continue;
287 
288 		tbl = rtable_get(rtableid, dp->dom_family);
289 		if (tbl == NULL)
290 			continue;
291 		if (tbl->ar_root.ref != NULL)
292 			return (0);
293 	}
294 
295 	return (1);
296 }
297 
298 unsigned int
299 rtable_l2(unsigned int rtableid)
300 {
301 	struct dommp	*dmm;
302 	unsigned int	 rdomain = 0;
303 	struct srp_ref	 sr;
304 
305 	dmm = srp_enter(&sr, &afmap[0]);
306 	if (rtableid < dmm->limit)
307 		rdomain = (dmm->value[rtableid] & RT_TABLEID_MASK);
308 	srp_leave(&sr);
309 
310 	return (rdomain);
311 }
312 
313 unsigned int
314 rtable_loindex(unsigned int rtableid)
315 {
316 	struct dommp	*dmm;
317 	unsigned int	 loifidx = 0;
318 	struct srp_ref	 sr;
319 
320 	dmm = srp_enter(&sr, &afmap[0]);
321 	if (rtableid < dmm->limit)
322 		loifidx = (dmm->value[rtableid] >> RT_TABLEID_BITS);
323 	srp_leave(&sr);
324 
325 	return (loifidx);
326 }
327 
328 void
329 rtable_l2set(unsigned int rtableid, unsigned int rdomain, unsigned int loifidx)
330 {
331 	struct dommp	*dmm;
332 	unsigned int	 value;
333 
334 	KERNEL_ASSERT_LOCKED();
335 
336 	if (!rtable_exists(rtableid) || !rtable_exists(rdomain))
337 		return;
338 
339 	value = (rdomain & RT_TABLEID_MASK) | (loifidx << RT_TABLEID_BITS);
340 
341 	dmm = srp_get_locked(&afmap[0]);
342 	dmm->value[rtableid] = value;
343 }
344 
345 
346 static inline uint8_t	*satoaddr(struct art_root *, struct sockaddr *);
347 
348 int	an_match(struct art_node *, struct sockaddr *, int);
349 void	rtentry_ref(void *, void *);
350 void	rtentry_unref(void *, void *);
351 
352 void	rtable_mpath_insert(struct art_node *, struct rtentry *);
353 
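/*
 * SRP list reference callbacks: the list holds a reference on every route
 * linked into ``an_rtlist'', and rtentry_unref() only runs once a removed
 * route can no longer be reached by any reader.
 */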
354 struct srpl_rc rt_rc = SRPL_RC_INITIALIZER(rtentry_ref, rtentry_unref, NULL);
355 
356 void
357 rtable_init_backend(void)
358 {
359 	art_init();
360 }
361 
362 void *
363 rtable_alloc(unsigned int rtableid, unsigned int alen, unsigned int off)
364 {
365 	return (art_alloc(rtableid, alen, off));
366 }
367 
368 int
369 rtable_setsource(unsigned int rtableid, int af, struct sockaddr *src)
370 {
371 	struct art_root		*ar;
372 
373 	if ((ar = rtable_get(rtableid, af)) == NULL)
374 		return (EAFNOSUPPORT);
375 
376 	ar->source = src;
377 
378 	return (0);
379 }
380 
381 struct sockaddr *
382 rtable_getsource(unsigned int rtableid, int af)
383 {
384 	struct art_root		*ar;
385 
386 	ar = rtable_get(rtableid, af);
387 	if (ar == NULL)
388 		return (NULL);
389 
390 	return (ar->source);
391 }
392 
393 void
394 rtable_clearsource(unsigned int rtableid, struct sockaddr *src)
395 {
396 	struct sockaddr	*addr;
397 
398 	addr = rtable_getsource(rtableid, src->sa_family);
399 	if (addr && (addr->sa_len == src->sa_len)) {
400 		if (memcmp(src, addr, addr->sa_len) == 0) {
401 			rtable_setsource(rtableid, src->sa_family, NULL);
402 		}
403 	}
404 }
405 
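/*
 * Return a referenced route for ``dst''.  With a netmask an exact prefix
 * match is required; without one the best matching prefix is accepted.
 * ``gateway'' and ``prio'' optionally narrow the choice between multipath
 * routes.
 */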
406 struct rtentry *
407 rtable_lookup(unsigned int rtableid, struct sockaddr *dst,
408     struct sockaddr *mask, struct sockaddr *gateway, uint8_t prio)
409 {
410 	struct art_root			*ar;
411 	struct art_node			*an;
412 	struct rtentry			*rt = NULL;
413 	struct srp_ref			 sr, nsr;
414 	uint8_t				*addr;
415 	int				 plen;
416 
417 	ar = rtable_get(rtableid, dst->sa_family);
418 	if (ar == NULL)
419 		return (NULL);
420 
421 	addr = satoaddr(ar, dst);
422 
423 	/* No need for a perfect match. */
424 	if (mask == NULL) {
425 		an = art_match(ar, addr, &nsr);
426 		if (an == NULL)
427 			goto out;
428 	} else {
429 		plen = rtable_satoplen(dst->sa_family, mask);
430 		if (plen == -1)
431 			return (NULL);
432 
433 		an = art_lookup(ar, addr, plen, &nsr);
434 
435 		/* Make sure we've got a perfect match. */
436 		if (!an_match(an, dst, plen))
437 			goto out;
438 	}
439 
440 	SRPL_FOREACH(rt, &sr, &an->an_rtlist, rt_next) {
441 		if (prio != RTP_ANY &&
442 		    (rt->rt_priority & RTP_MASK) != (prio & RTP_MASK))
443 			continue;
444 
445 		if (gateway == NULL)
446 			break;
447 
448 		if (rt->rt_gateway->sa_len == gateway->sa_len &&
449 		    memcmp(rt->rt_gateway, gateway, gateway->sa_len) == 0)
450 			break;
451 	}
452 	if (rt != NULL)
453 		rtref(rt);
454 
455 	SRPL_LEAVE(&sr);
456 out:
457 	srp_leave(&nsr);
458 
459 	return (rt);
460 }
461 
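/*
 * Longest-prefix match for ``dst'', as used by the forwarding path.
 * When several routes of equal priority share the matching prefix and
 * rt_hash() yields a valid hash, one of them is picked using the
 * hash-threshold scheme below.  The returned route is referenced.
 */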
462 struct rtentry *
463 rtable_match(unsigned int rtableid, struct sockaddr *dst, uint32_t *src)
464 {
465 	struct art_root			*ar;
466 	struct art_node			*an;
467 	struct rtentry			*rt = NULL;
468 	struct srp_ref			 sr, nsr;
469 	uint8_t				*addr;
470 	int				 hash;
471 
472 	ar = rtable_get(rtableid, dst->sa_family);
473 	if (ar == NULL)
474 		return (NULL);
475 
476 	addr = satoaddr(ar, dst);
477 
478 	an = art_match(ar, addr, &nsr);
479 	if (an == NULL)
480 		goto out;
481 
482 	rt = SRPL_FIRST(&sr, &an->an_rtlist);
483 	rtref(rt);
484 	SRPL_LEAVE(&sr);
485 
486 	/* Gateway selection by Hash-Threshold (RFC 2992) */
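	/*
	 * Worked example: with 4 equal-priority paths the threshold is
	 * 0xffff / 4 + 1 = 0x4000, so a 16-bit hash of 0x9000 skips two
	 * regions (0x9000 -> 0x5000 -> 0x1000) and selects the third path.
	 */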
487 	if ((hash = rt_hash(rt, dst, src)) != -1) {
488 		struct rtentry		*mrt;
489 		int			 threshold, npaths = 0;
490 
491 		KASSERT(hash <= 0xffff);
492 
493 		SRPL_FOREACH(mrt, &sr, &an->an_rtlist, rt_next) {
494 			/* Only count nexthops with the same priority. */
495 			if (mrt->rt_priority == rt->rt_priority)
496 				npaths++;
497 		}
498 		SRPL_LEAVE(&sr);
499 
500 		threshold = (0xffff / npaths) + 1;
501 
502 		/*
503 		 * we have no protection against concurrent modification of the
504 		 * route list attached to the node, so we won't necessarily
505 		 * have the same number of routes.  for most modifications,
506 		 * we'll pick a route that we wouldn't have if we only saw the
507 		 * list before or after the change.  if we were going to use
508 		 * the last available route, but it got removed, we'll hit
509 		 * the end of the list and then pick the first route.
510 		 */
511 
512 		mrt = SRPL_FIRST(&sr, &an->an_rtlist);
513 		while (hash > threshold && mrt != NULL) {
514 			if (mrt->rt_priority == rt->rt_priority)
515 				hash -= threshold;
516 			mrt = SRPL_FOLLOW(&sr, mrt, rt_next);
517 		}
518 
519 		if (mrt != NULL) {
520 			rtref(mrt);
521 			rtfree(rt);
522 			rt = mrt;
523 		}
524 		SRPL_LEAVE(&sr);
525 	}
526 out:
527 	srp_leave(&nsr);
528 	return (rt);
529 }
530 
531 int
532 rtable_insert(unsigned int rtableid, struct sockaddr *dst,
533     struct sockaddr *mask, struct sockaddr *gateway, uint8_t prio,
534     struct rtentry *rt)
535 {
536 	struct rtentry			*mrt;
537 	struct srp_ref			 sr;
538 	struct art_root			*ar;
539 	struct art_node			*an, *prev;
540 	uint8_t				*addr;
541 	int				 plen;
542 	unsigned int			 rt_flags;
543 	int				 error = 0;
544 
545 	ar = rtable_get(rtableid, dst->sa_family);
546 	if (ar == NULL)
547 		return (EAFNOSUPPORT);
548 
549 	addr = satoaddr(ar, dst);
550 	plen = rtable_satoplen(dst->sa_family, mask);
551 	if (plen == -1)
552 		return (EINVAL);
553 
554 	rtref(rt); /* guarantee rtfree won't do anything during insert */
555 	rw_enter_write(&ar->ar_lock);
556 
557 	/* Do not permit exactly the same dst/mask/gw pair. */
558 	an = art_lookup(ar, addr, plen, &sr);
559 	srp_leave(&sr); /* an can't go away while we have the lock */
560 	if (an_match(an, dst, plen)) {
561 		struct rtentry  *mrt;
562 		int		 mpathok = ISSET(rt->rt_flags, RTF_MPATH);
563 
564 		SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next) {
565 			if (prio != RTP_ANY &&
566 			    (mrt->rt_priority & RTP_MASK) != (prio & RTP_MASK))
567 				continue;
568 
569 			if (!mpathok ||
570 			    (mrt->rt_gateway->sa_len == gateway->sa_len &&
571 			    !memcmp(mrt->rt_gateway, gateway, gateway->sa_len))){
572 				error = EEXIST;
573 				goto leave;
574 			}
575 		}
576 	}
577 
578 	an = art_get(dst, plen);
579 	if (an == NULL) {
580 		error = ENOBUFS;
581 		goto leave;
582 	}
583 
584 	/* prepare for immediate operation if insert succeeds */
585 	rt_flags = rt->rt_flags;
586 	rt->rt_flags &= ~RTF_MPATH;
587 	rt->rt_dest = dst;
588 	rt->rt_plen = plen;
589 	SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);
590 
591 	prev = art_insert(ar, an, addr, plen);
592 	if (prev != an) {
593 		SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry,
594 		    rt_next);
595 		rt->rt_flags = rt_flags;
596 		art_put(an);
597 
598 		if (prev == NULL) {
599 			error = ESRCH;
600 			goto leave;
601 		}
602 
603 		an = prev;
604 
605 		mrt = SRPL_FIRST_LOCKED(&an->an_rtlist);
606 		KASSERT(mrt != NULL);
607 		KASSERT((rt->rt_flags & RTF_MPATH) || mrt->rt_priority != prio);
608 
609 		/*
610 		 * An ART node with the same destination/netmask already
611 		 * exists, MPATH conflict must have been already checked.
612 		 */
613 		if (rt->rt_flags & RTF_MPATH) {
614 			/*
615 			 * Only keep the RTF_MPATH flag if two routes have
616 			 * the same gateway.
617 			 */
618 			rt->rt_flags &= ~RTF_MPATH;
619 			SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next) {
620 				if (mrt->rt_priority == prio) {
621 					mrt->rt_flags |= RTF_MPATH;
622 					rt->rt_flags |= RTF_MPATH;
623 				}
624 			}
625 		}
626 
627 		/* Put newly inserted entry at the right place. */
628 		rtable_mpath_insert(an, rt);
629 	}
630 leave:
631 	rw_exit_write(&ar->ar_lock);
632 	rtfree(rt);
633 	return (error);
634 }
635 
636 int
637 rtable_delete(unsigned int rtableid, struct sockaddr *dst,
638     struct sockaddr *mask, struct rtentry *rt)
639 {
640 	struct art_root			*ar;
641 	struct art_node			*an;
642 	struct srp_ref			 sr;
643 	uint8_t				*addr;
644 	int				 plen;
645 	struct rtentry			*mrt;
646 	int				 npaths = 0;
647 	int				 error = 0;
648 
649 	ar = rtable_get(rtableid, dst->sa_family);
650 	if (ar == NULL)
651 		return (EAFNOSUPPORT);
652 
653 	addr = satoaddr(ar, dst);
654 	plen = rtable_satoplen(dst->sa_family, mask);
655 	if (plen == -1)
656 		return (EINVAL);
657 
658 	rtref(rt); /* guarantee rtfree won't do anything under ar_lock */
659 	rw_enter_write(&ar->ar_lock);
660 	an = art_lookup(ar, addr, plen, &sr);
661 	srp_leave(&sr); /* an can't go away while we have the lock */
662 
663 	/* Make sure we've got a perfect match. */
664 	if (!an_match(an, dst, plen)) {
665 		error = ESRCH;
666 		goto leave;
667 	}
668 
669 	/*
670 	 * If other multipath route entries are still attached to
671 	 * this ART node we only have to unlink it.
672 	 */
673 	SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next)
674 		npaths++;
675 
676 	if (npaths > 1) {
677 		KASSERT(rt->rt_refcnt >= 1);
678 		SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry,
679 		    rt_next);
680 
681 		mrt = SRPL_FIRST_LOCKED(&an->an_rtlist);
682 		if (npaths == 2)
683 			mrt->rt_flags &= ~RTF_MPATH;
684 
685 		goto leave;
686 	}
687 
688 	if (art_delete(ar, an, addr, plen) == NULL)
689 		panic("art_delete failed to find node %p", an);
690 
691 	KASSERT(rt->rt_refcnt >= 1);
692 	SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry, rt_next);
693 	art_put(an);
694 
695 leave:
696 	rw_exit_write(&ar->ar_lock);
697 	rtfree(rt);
698 
699 	return (error);
700 }
701 
702 struct rtable_walk_cookie {
703 	int		(*rwc_func)(struct rtentry *, void *, unsigned int);
704 	void		 *rwc_arg;
705 	struct rtentry	**rwc_prt;
706 	unsigned int	  rwc_rid;
707 };
708 
709 /*
710  * Helper for rtable_walk to keep the ART code free from any "struct rtentry".
711  */
712 int
713 rtable_walk_helper(struct art_node *an, void *xrwc)
714 {
715 	struct srp_ref			 sr;
716 	struct rtable_walk_cookie	*rwc = xrwc;
717 	struct rtentry			*rt;
718 	int				 error = 0;
719 
720 	SRPL_FOREACH(rt, &sr, &an->an_rtlist, rt_next) {
721 		error = (*rwc->rwc_func)(rt, rwc->rwc_arg, rwc->rwc_rid);
722 		if (error != 0)
723 			break;
724 	}
725 	if (rwc->rwc_prt != NULL && rt != NULL) {
726 		rtref(rt);
727 		*rwc->rwc_prt = rt;
728 	}
729 	SRPL_LEAVE(&sr);
730 
731 	return (error);
732 }
733 
734 int
735 rtable_walk(unsigned int rtableid, sa_family_t af, struct rtentry **prt,
736     int (*func)(struct rtentry *, void *, unsigned int), void *arg)
737 {
738 	struct art_root			*ar;
739 	struct rtable_walk_cookie	 rwc;
740 	int				 error;
741 
742 	ar = rtable_get(rtableid, af);
743 	if (ar == NULL)
744 		return (EAFNOSUPPORT);
745 
746 	rwc.rwc_func = func;
747 	rwc.rwc_arg = arg;
748 	rwc.rwc_prt = prt;
749 	rwc.rwc_rid = rtableid;
750 
751 	error = art_walk(ar, rtable_walk_helper, &rwc);
752 
753 	return (error);
754 }
755 
756 struct rtentry *
757 rtable_iterate(struct rtentry *rt0)
758 {
759 	struct rtentry *rt = NULL;
760 	struct srp_ref sr;
761 
762 	rt = SRPL_NEXT(&sr, rt0, rt_next);
763 	if (rt != NULL)
764 		rtref(rt);
765 	SRPL_LEAVE(&sr);
766 	rtfree(rt0);
767 	return (rt);
768 }
769 
770 int
771 rtable_mpath_capable(unsigned int rtableid, sa_family_t af)
772 {
773 	return (1);
774 }
775 
776 int
777 rtable_mpath_reprio(unsigned int rtableid, struct sockaddr *dst,
778     int plen, uint8_t prio, struct rtentry *rt)
779 {
780 	struct art_root			*ar;
781 	struct art_node			*an;
782 	struct srp_ref			 sr;
783 	uint8_t				*addr;
784 	int				 error = 0;
785 
786 	ar = rtable_get(rtableid, dst->sa_family);
787 	if (ar == NULL)
788 		return (EAFNOSUPPORT);
789 
790 	addr = satoaddr(ar, dst);
791 
792 	rw_enter_write(&ar->ar_lock);
793 	an = art_lookup(ar, addr, plen, &sr);
794 	srp_leave(&sr); /* an can't go away while we have the lock */
795 
796 	/* Make sure we've got a perfect match. */
797 	if (!an_match(an, dst, plen)) {
798 		error = ESRCH;
799 	} else if (SRPL_FIRST_LOCKED(&an->an_rtlist) == rt &&
800 		SRPL_NEXT_LOCKED(rt, rt_next) == NULL) {
801 		/*
802 		 * If there's only one entry on the list do not go
803 		 * through an insert/remove cycle.  This is done to
804 		 * guarantee that ``an->an_rtlist'' is never empty
805 		 * when a node is in the tree.
806 		 */
807 		rt->rt_priority = prio;
808 	} else {
809 		rtref(rt); /* keep rt alive in between remove and insert */
810 		SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist,
811 		    rt, rtentry, rt_next);
812 		rt->rt_priority = prio;
813 		rtable_mpath_insert(an, rt);
814 		rtfree(rt);
815 		error = EAGAIN;
816 	}
817 	rw_exit_write(&ar->ar_lock);
818 
819 	return (error);
820 }
821 
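/*
 * Keep ``an_rtlist'' sorted by ascending priority: the new route is
 * inserted after the last existing route with a priority lower than or
 * equal to its own, so the most preferred (lowest value) routes stay at
 * the head of the list.
 */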
822 void
823 rtable_mpath_insert(struct art_node *an, struct rtentry *rt)
824 {
825 	struct rtentry			*mrt, *prt = NULL;
826 	uint8_t				 prio = rt->rt_priority;
827 
828 	if ((mrt = SRPL_FIRST_LOCKED(&an->an_rtlist)) == NULL) {
829 		SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);
830 		return;
831 	}
832 
833 	/* Iterate until we find the route to be placed after ``rt''. */
834 	while (mrt->rt_priority <= prio && SRPL_NEXT_LOCKED(mrt, rt_next)) {
835 		prt = mrt;
836 		mrt = SRPL_NEXT_LOCKED(mrt, rt_next);
837 	}
838 
839 	if (mrt->rt_priority <= prio) {
840 		SRPL_INSERT_AFTER_LOCKED(&rt_rc, mrt, rt, rt_next);
841 	} else if (prt != NULL) {
842 		SRPL_INSERT_AFTER_LOCKED(&rt_rc, prt, rt, rt_next);
843 	} else {
844 		SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);
845 	}
846 }
847 
848 /*
849  * Returns 1 if ``an'' perfectly matches (``dst'', ``plen''), 0 otherwise.
850  */
851 int
852 an_match(struct art_node *an, struct sockaddr *dst, int plen)
853 {
854 	struct rtentry			*rt;
855 	struct srp_ref			 sr;
856 	int				 match;
857 
858 	if (an == NULL || an->an_plen != plen)
859 		return (0);
860 
861 	rt = SRPL_FIRST(&sr, &an->an_rtlist);
862 	match = (memcmp(rt->rt_dest, dst, dst->sa_len) == 0);
863 	SRPL_LEAVE(&sr);
864 
865 	return (match);
866 }
867 
868 void
869 rtentry_ref(void *null, void *xrt)
870 {
871 	struct rtentry *rt = xrt;
872 
873 	rtref(rt);
874 }
875 
876 void
877 rtentry_unref(void *null, void *xrt)
878 {
879 	struct rtentry *rt = xrt;
880 
881 	rtfree(rt);
882 }
883 
884 /*
885  * Return a pointer to the address (key).  This is a legacy of the BSD
886  * radix tree, needed to skip the non-address fields of the flavor of
887  * "struct sockaddr" used by this routing table.
888  */
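/*
 * For AF_INET, for instance, ``ar_off'' is the offset of sin_addr within
 * struct sockaddr_in, so the returned pointer addresses the raw 4-byte
 * IPv4 address.
 */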
889 static inline uint8_t *
890 satoaddr(struct art_root *at, struct sockaddr *sa)
891 {
892 	return (((uint8_t *)sa) + at->ar_off);
893 }
894 
895 /*
896  * Return the prefix length of a mask.
897  */
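/*
 * For example, an AF_INET mask of ff ff ff 00 yields 24, a zero-length
 * mask yields 0 (default route) and a mask with a hole such as
 * ff 00 ff 00 is rejected with -1.
 */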
898 int
899 rtable_satoplen(sa_family_t af, struct sockaddr *mask)
900 {
901 	struct domain	*dp;
902 	uint8_t		*ap, *ep;
903 	int		 mlen, plen = 0;
904 	int		 i;
905 
906 	for (i = 0; (dp = domains[i]) != NULL; i++) {
907 		if (dp->dom_rtoffset == 0)
908 			continue;
909 
910 		if (af == dp->dom_family)
911 			break;
912 	}
913 	if (dp == NULL)
914 		return (-1);
915 
916 	/* Host route */
917 	if (mask == NULL)
918 		return (dp->dom_maxplen);
919 
920 	mlen = mask->sa_len;
921 
922 	/* Default route */
923 	if (mlen == 0)
924 		return (0);
925 
926 	ap = (uint8_t *)mask + dp->dom_rtoffset;
927 	ep = (uint8_t *)mask + mlen;
928 	if (ap > ep)
929 		return (-1);
930 
931 	/* Trim trailing zeroes. */
932 	while (ap < ep && ep[-1] == 0)
933 		ep--;
934 
935 	if (ap == ep)
936 		return (0);
937 
938 	/* "Beauty" adapted from sbin/route/show.c ... */
939 	while (ap < ep) {
940 		switch (*ap++) {
941 		case 0xff:
942 			plen += 8;
943 			break;
944 		case 0xfe:
945 			plen += 7;
946 			goto out;
947 		case 0xfc:
948 			plen += 6;
949 			goto out;
950 		case 0xf8:
951 			plen += 5;
952 			goto out;
953 		case 0xf0:
954 			plen += 4;
955 			goto out;
956 		case 0xe0:
957 			plen += 3;
958 			goto out;
959 		case 0xc0:
960 			plen += 2;
961 			goto out;
962 		case 0x80:
963 			plen += 1;
964 			goto out;
965 		default:
966 			/* Non-contiguous mask. */
967 			return (-1);
968 		}
969 	}
970 
971 out:
972 	if (plen > dp->dom_maxplen || ap != ep)
973 		return (-1);
974 
975 	return (plen);
976 }
977