1 /*	$OpenBSD: rtable.c,v 1.75 2021/05/25 22:45:09 bluhm Exp $ */
2 
3 /*
4  * Copyright (c) 2014-2016 Martin Pieuchot
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #ifndef _KERNEL
20 #include "kern_compat.h"
21 #else
22 #include <sys/param.h>
23 #include <sys/systm.h>
24 #include <sys/socket.h>
25 #include <sys/malloc.h>
26 #include <sys/queue.h>
27 #include <sys/domain.h>
28 #include <sys/srp.h>
29 #endif
30 
31 #include <net/rtable.h>
32 #include <net/route.h>
33 
34 /*
35  * Structures used by rtable_get() to retrieve the corresponding
36  * routing table for a given pair of ``af'' and ``rtableid''.
37  *
38  * Note that, once allocated, routing table heads are never freed.
39  * This way we do not need to reference count them.
40  *
41  *	afmap		    rtmap/dommp
42  *   -----------          ---------     -----
43  *   |   0     |--------> | 0 | 0 | ... | 0 |	Array mapping rtableid (=index)
44  *   -----------          ---------     -----   to rdomain/loopback (=value).
45  *   | AF_INET |.
46  *   ----------- `.       .---------.     .---------.
47  *       ...	   `----> | rtable0 | ... | rtableN |	Array of pointers for
48  *   -----------          '---------'     '---------'	IPv4 routing tables
49  *   | AF_MPLS |					indexed by ``rtableid''.
50  *   -----------
51  */
52 struct srp	  *afmap;
53 uint8_t		   af2idx[AF_MAX+1];	/* afmap index per AF; 0 = unsupported */
54 uint8_t		   af2idx_max;
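
/*
 * Illustrative sketch of how a (``af'', ``rtableid'') pair resolves
 * through the two levels pictured above; this mirrors rtable_get()
 * below, with ``map'' and ``tbl'' as local names used only for the
 * example.
 *
 *	struct srp_ref	 sr;
 *	struct rtmap	*map;
 *	void		*tbl = NULL;
 *
 *	map = srp_enter(&sr, &afmap[af2idx[af]]);
 *	if (rtableid < map->limit)
 *		tbl = map->tbl[rtableid];
 *	srp_leave(&sr);
 */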
55 
56 /* Array of routing table pointers. */
57 struct rtmap {
58 	unsigned int	   limit;
59 	void		 **tbl;
60 };
61 
62 /*
63  * Array of rtableid -> rdomain mapping.
64  *
65  * Only used at index 0 of ``afmap'' as described above.
66  */
67 struct dommp {
68 	unsigned int	   limit;
69 	/*
70 	 * Array to get the routing domain and loopback interface related to
71 	 * a routing table. Format:
72 	 *
73 	 * 8 unused bits | 16 bits for loopback index | 8 bits for rdomain
74 	 */
75 	unsigned int	  *value;
76 };
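
/*
 * Worked example of the packing above, assuming the usual
 * RT_TABLEID_BITS (8) and RT_TABLEID_MASK (0xff) definitions from
 * <net/route.h>: a table bound to rdomain 2 whose loopback interface
 * has index 5 stores
 *
 *	value = (2 & RT_TABLEID_MASK) | (5 << RT_TABLEID_BITS) = 0x502
 *
 * rtable_l2() recovers 2 via ``value & RT_TABLEID_MASK'' and
 * rtable_loindex() recovers 5 via ``value >> RT_TABLEID_BITS''.
 */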
77 
78 unsigned int	   rtmap_limit = 0;
79 
80 void		   rtmap_init(void);
81 void		   rtmap_grow(unsigned int, sa_family_t);
82 void		   rtmap_dtor(void *, void *);
83 
84 struct srp_gc	   rtmap_gc = SRP_GC_INITIALIZER(rtmap_dtor, NULL);
85 
86 void		   rtable_init_backend(void);
87 void		  *rtable_alloc(unsigned int, unsigned int, unsigned int);
88 void		  *rtable_get(unsigned int, sa_family_t);
89 
90 void
91 rtmap_init(void)
92 {
93 	const struct domain	*dp;
94 	int			 i;
95 
96 	/* Start with a single table for every domain that requires it. */
97 	for (i = 0; (dp = domains[i]) != NULL; i++) {
98 		if (dp->dom_rtoffset == 0)
99 			continue;
100 
101 		rtmap_grow(1, dp->dom_family);
102 	}
103 
104 	/* Initialize the rtableid->rdomain mapping table. */
105 	rtmap_grow(1, 0);
106 
107 	rtmap_limit = 1;
108 }
109 
110 /*
111  * Grow the array of routing tables for AF ``af'' to ``nlimit'' entries.
112  */
113 void
114 rtmap_grow(unsigned int nlimit, sa_family_t af)
115 {
116 	struct rtmap	*map, *nmap;
117 	int		 i;
118 
119 	KERNEL_ASSERT_LOCKED();
120 
121 	KASSERT(nlimit > rtmap_limit);
122 
123 	nmap = malloc(sizeof(*nmap), M_RTABLE, M_WAITOK);
124 	nmap->limit = nlimit;
125 	nmap->tbl = mallocarray(nlimit, sizeof(*nmap[0].tbl), M_RTABLE,
126 	    M_WAITOK|M_ZERO);
127 
128 	map = srp_get_locked(&afmap[af2idx[af]]);
129 	if (map != NULL) {
130 		KASSERT(map->limit == rtmap_limit);
131 
132 		for (i = 0; i < map->limit; i++)
133 			nmap->tbl[i] = map->tbl[i];
134 	}
135 
136 	srp_update_locked(&rtmap_gc, &afmap[af2idx[af]], nmap);
137 }
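
/*
 * Note that rtmap_grow() never resizes in place: it allocates a larger
 * array, copies the existing pointers and publishes the new map with
 * srp_update_locked().  Readers entering through srp_enter() therefore
 * see either the old or the new map, and the old one is only released
 * by rtmap_dtor() once its last SRP reference is gone.
 */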
138 
139 void
140 rtmap_dtor(void *null, void *xmap)
141 {
142 	struct rtmap	*map = xmap;
143 
144 	/*
145 	 * doesn't need to be serialized since this is the last reference
146 	 * to this map. there's nothing to race against.
147 	 */
148 	free(map->tbl, M_RTABLE, map->limit * sizeof(*map[0].tbl));
149 	free(map, M_RTABLE, sizeof(*map));
150 }
151 
152 void
153 rtable_init(void)
154 {
155 	const struct domain	*dp;
156 	int			 i;
157 
158 	KASSERT(sizeof(struct rtmap) == sizeof(struct dommp));
159 
160 	/* We use index 0 for the rtable/rdomain map. */
161 	af2idx_max = 1;
162 	memset(af2idx, 0, sizeof(af2idx));
163 
164 	/*
165 	 * Assign an ``af2idx'' slot to every address family that has a
166 	 * routing table; the backend is initialized right after.
167 	 */
168 	for (i = 0; (dp = domains[i]) != NULL; i++) {
169 		if (dp->dom_rtoffset == 0)
170 			continue;
171 
172 		af2idx[dp->dom_family] = af2idx_max++;
173 	}
174 	rtable_init_backend();
175 
176 	/*
177 	 * Allocate AF-to-id table now that we know how many AFs this
178 	 * kernel supports.
179 	 */
180 	afmap = mallocarray(af2idx_max + 1, sizeof(*afmap), M_RTABLE,
181 	    M_WAITOK|M_ZERO);
182 
183 	rtmap_init();
184 
185 	if (rtable_add(0) != 0)
186 		panic("unable to create default routing table");
187 }
188 
189 int
190 rtable_add(unsigned int id)
191 {
192 	const struct domain	*dp;
193 	void			*tbl;
194 	struct rtmap		*map;
195 	struct dommp		*dmm;
196 	sa_family_t		 af;
197 	unsigned int		 off, alen;
198 	int			 i, error = 0;
199 
200 	if (id > RT_TABLEID_MAX)
201 		return (EINVAL);
202 
203 	KERNEL_LOCK();
204 
205 	if (rtable_exists(id))
206 		goto out;
207 
208 	for (i = 0; (dp = domains[i]) != NULL; i++) {
209 		if (dp->dom_rtoffset == 0)
210 			continue;
211 
212 		af = dp->dom_family;
213 		off = dp->dom_rtoffset;
214 		alen = dp->dom_maxplen;
215 
216 		if (id >= rtmap_limit)
217 			rtmap_grow(id + 1, af);
218 
219 		tbl = rtable_alloc(id, alen, off);
220 		if (tbl == NULL) {
221 			error = ENOMEM;
222 			goto out;
223 		}
224 
225 		map = srp_get_locked(&afmap[af2idx[af]]);
226 		map->tbl[id] = tbl;
227 	}
228 
229 	/* Reflect possible growth. */
230 	if (id >= rtmap_limit) {
231 		rtmap_grow(id + 1, 0);
232 		rtmap_limit = id + 1;
233 	}
234 
235 	/* Use main rtable/rdomain by default. */
236 	dmm = srp_get_locked(&afmap[0]);
237 	dmm->value[id] = 0;
238 out:
239 	KERNEL_UNLOCK();
240 
241 	return (error);
242 }
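
/*
 * Hypothetical caller sketch: a freshly added table is attached to
 * rdomain 0 (``dmm->value[id] = 0'' above) until rtable_l2set()
 * rebinds it.  The table id 5 and ``loifidx'' are made-up example
 * values; rtable_l2set() must be called with the kernel lock held.
 *
 *	if (rtable_add(5) == 0)
 *		rtable_l2set(5, 0, loifidx);
 */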
243 
244 void *
245 rtable_get(unsigned int rtableid, sa_family_t af)
246 {
247 	struct rtmap	*map;
248 	void		*tbl = NULL;
249 	struct srp_ref	 sr;
250 
251 	if (af >= nitems(af2idx) || af2idx[af] == 0)
252 		return (NULL);
253 
254 	map = srp_enter(&sr, &afmap[af2idx[af]]);
255 	if (rtableid < map->limit)
256 		tbl = map->tbl[rtableid];
257 	srp_leave(&sr);
258 
259 	return (tbl);
260 }
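
/*
 * A NULL return from rtable_get() means the AF is unsupported or
 * ``rtableid'' has no table for it; callers in this file typically map
 * that to EAFNOSUPPORT, e.g.:
 *
 *	struct art_root *ar;
 *
 *	if ((ar = rtable_get(rtableid, AF_INET)) == NULL)
 *		return (EAFNOSUPPORT);
 */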
261 
262 int
263 rtable_exists(unsigned int rtableid)
264 {
265 	const struct domain	*dp;
266 	void			*tbl;
267 	int			 i;
268 
269 	for (i = 0; (dp = domains[i]) != NULL; i++) {
270 		if (dp->dom_rtoffset == 0)
271 			continue;
272 
273 		tbl = rtable_get(rtableid, dp->dom_family);
274 		if (tbl != NULL)
275 			return (1);
276 	}
277 
278 	return (0);
279 }
280 
281 int
282 rtable_empty(unsigned int rtableid)
283 {
284 	const struct domain	*dp;
285 	int			 i;
286 	struct art_root		*tbl;
287 
288 	for (i = 0; (dp = domains[i]) != NULL; i++) {
289 		if (dp->dom_rtoffset == 0)
290 			continue;
291 
292 		tbl = rtable_get(rtableid, dp->dom_family);
293 		if (tbl == NULL)
294 			continue;
295 		if (tbl->ar_root.ref != NULL)
296 			return (0);
297 	}
298 
299 	return (1);
300 }
301 
302 unsigned int
303 rtable_l2(unsigned int rtableid)
304 {
305 	struct dommp	*dmm;
306 	unsigned int	 rdomain = 0;
307 	struct srp_ref	 sr;
308 
309 	dmm = srp_enter(&sr, &afmap[0]);
310 	if (rtableid < dmm->limit)
311 		rdomain = (dmm->value[rtableid] & RT_TABLEID_MASK);
312 	srp_leave(&sr);
313 
314 	return (rdomain);
315 }
316 
317 unsigned int
318 rtable_loindex(unsigned int rtableid)
319 {
320 	struct dommp	*dmm;
321 	unsigned int	 loifidx = 0;
322 	struct srp_ref	 sr;
323 
324 	dmm = srp_enter(&sr, &afmap[0]);
325 	if (rtableid < dmm->limit)
326 		loifidx = (dmm->value[rtableid] >> RT_TABLEID_BITS);
327 	srp_leave(&sr);
328 
329 	return (loifidx);
330 }
331 
332 void
333 rtable_l2set(unsigned int rtableid, unsigned int rdomain, unsigned int loifidx)
334 {
335 	struct dommp	*dmm;
336 	unsigned int	 value;
337 
338 	KERNEL_ASSERT_LOCKED();
339 
340 	if (!rtable_exists(rtableid) || !rtable_exists(rdomain))
341 		return;
342 
343 	value = (rdomain & RT_TABLEID_MASK) | (loifidx << RT_TABLEID_BITS);
344 
345 	dmm = srp_get_locked(&afmap[0]);
346 	dmm->value[rtableid] = value;
347 }
348 
349 
350 static inline uint8_t	*satoaddr(struct art_root *, struct sockaddr *);
351 
352 int	an_match(struct art_node *, struct sockaddr *, int);
353 void	rtentry_ref(void *, void *);
354 void	rtentry_unref(void *, void *);
355 
356 void	rtable_mpath_insert(struct art_node *, struct rtentry *);
357 
358 struct srpl_rc rt_rc = SRPL_RC_INITIALIZER(rtentry_ref, rtentry_unref, NULL);
359 
360 void
361 rtable_init_backend(void)
362 {
363 	art_init();
364 }
365 
366 void *
367 rtable_alloc(unsigned int rtableid, unsigned int alen, unsigned int off)
368 {
369 	return (art_alloc(rtableid, alen, off));
370 }
371 
372 int
373 rtable_setsource(unsigned int rtableid, int af, struct sockaddr *src)
374 {
375 	struct art_root		*ar;
376 
377 	if ((ar = rtable_get(rtableid, af)) == NULL)
378 		return (EAFNOSUPPORT);
379 
380 	ar->source = src;
381 
382 	return (0);
383 }
384 
385 struct sockaddr *
386 rtable_getsource(unsigned int rtableid, int af)
387 {
388 	struct art_root		*ar;
389 
390 	ar = rtable_get(rtableid, af);
391 	if (ar == NULL)
392 		return (NULL);
393 
394 	return (ar->source);
395 }
396 
397 void
398 rtable_clearsource(unsigned int rtableid, struct sockaddr *src)
399 {
400 	struct sockaddr	*addr;
401 
402 	addr = rtable_getsource(rtableid, src->sa_family);
403 	if (addr && (addr->sa_len == src->sa_len)) {
404 		if (memcmp(src, addr, addr->sa_len) == 0) {
405 			rtable_setsource(rtableid, src->sa_family, NULL);
406 		}
407 	}
408 }
409 
410 struct rtentry *
411 rtable_lookup(unsigned int rtableid, struct sockaddr *dst,
412     struct sockaddr *mask, struct sockaddr *gateway, uint8_t prio)
413 {
414 	struct art_root			*ar;
415 	struct art_node			*an;
416 	struct rtentry			*rt = NULL;
417 	struct srp_ref			 sr, nsr;
418 	uint8_t				*addr;
419 	int				 plen;
420 
421 	ar = rtable_get(rtableid, dst->sa_family);
422 	if (ar == NULL)
423 		return (NULL);
424 
425 	addr = satoaddr(ar, dst);
426 
427 	/* No need for a perfect match. */
428 	if (mask == NULL) {
429 		an = art_match(ar, addr, &nsr);
430 		if (an == NULL)
431 			goto out;
432 	} else {
433 		plen = rtable_satoplen(dst->sa_family, mask);
434 		if (plen == -1)
435 			return (NULL);
436 
437 		an = art_lookup(ar, addr, plen, &nsr);
438 
439 		/* Make sure we've got a perfect match. */
440 		if (!an_match(an, dst, plen))
441 			goto out;
442 	}
443 
444 	SRPL_FOREACH(rt, &sr, &an->an_rtlist, rt_next) {
445 		if (prio != RTP_ANY &&
446 		    (rt->rt_priority & RTP_MASK) != (prio & RTP_MASK))
447 			continue;
448 
449 		if (gateway == NULL)
450 			break;
451 
452 		if (rt->rt_gateway->sa_len == gateway->sa_len &&
453 		    memcmp(rt->rt_gateway, gateway, gateway->sa_len) == 0)
454 			break;
455 	}
456 	if (rt != NULL)
457 		rtref(rt);
458 
459 	SRPL_LEAVE(&sr);
460 out:
461 	srp_leave(&nsr);
462 
463 	return (rt);
464 }
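
/*
 * Illustrative call, with ``dst'' and ``mask'' standing for caller
 * provided sockaddrs: fetch the route that exactly matches dst/mask in
 * table 0, accepting any priority and any gateway.  A NULL ``mask''
 * instead asks for the longest-prefix match of the host address.  The
 * returned route holds a reference that the caller must drop with
 * rtfree().
 *
 *	struct rtentry *rt;
 *
 *	rt = rtable_lookup(0, dst, mask, NULL, RTP_ANY);
 *	if (rt != NULL) {
 *		...
 *		rtfree(rt);
 *	}
 */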
465 
466 struct rtentry *
467 rtable_match(unsigned int rtableid, struct sockaddr *dst, uint32_t *src)
468 {
469 	struct art_root			*ar;
470 	struct art_node			*an;
471 	struct rtentry			*rt = NULL;
472 	struct srp_ref			 sr, nsr;
473 	uint8_t				*addr;
474 	int				 hash;
475 
476 	ar = rtable_get(rtableid, dst->sa_family);
477 	if (ar == NULL)
478 		return (NULL);
479 
480 	addr = satoaddr(ar, dst);
481 
482 	an = art_match(ar, addr, &nsr);
483 	if (an == NULL)
484 		goto out;
485 
486 	rt = SRPL_FIRST(&sr, &an->an_rtlist);
487 	rtref(rt);
488 	SRPL_LEAVE(&sr);
489 
490 	/* Gateway selection by Hash-Threshold (RFC 2992) */
491 	if ((hash = rt_hash(rt, dst, src)) != -1) {
492 		struct rtentry		*mrt;
493 		int			 threshold, npaths = 0;
494 
495 		KASSERT(hash <= 0xffff);
496 
497 		SRPL_FOREACH(mrt, &sr, &an->an_rtlist, rt_next) {
498 			/* Only count nexthops with the same priority. */
499 			if (mrt->rt_priority == rt->rt_priority)
500 				npaths++;
501 		}
502 		SRPL_LEAVE(&sr);
503 
504 		threshold = (0xffff / npaths) + 1;
505 
506 		/*
507 		 * we have no protection against concurrent modification of the
508 		 * route list attached to the node, so we won't necessarily
509 		 * have the same number of routes.  for most modifications,
510 		 * we'll pick a route that we wouldn't have if we only saw the
511 		 * list before or after the change.  if we were going to use
512 		 * the last available route, but it got removed, we'll hit
513 		 * the end of the list and then pick the first route.
514 		 */
515 
516 		mrt = SRPL_FIRST(&sr, &an->an_rtlist);
517 		while (hash > threshold && mrt != NULL) {
518 			if (mrt->rt_priority == rt->rt_priority)
519 				hash -= threshold;
520 			mrt = SRPL_FOLLOW(&sr, mrt, rt_next);
521 		}
522 
523 		if (mrt != NULL) {
524 			rtref(mrt);
525 			rtfree(rt);
526 			rt = mrt;
527 		}
528 		SRPL_LEAVE(&sr);
529 	}
530 out:
531 	srp_leave(&nsr);
532 	return (rt);
533 }
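
/*
 * Worked example of the hash-threshold selection above: with three
 * next hops of equal priority, threshold = (0xffff / 3) + 1 = 21846.
 * A hash of 21846 or less keeps the first route, a hash of 30000 is
 * reduced once (30000 - 21846 = 8154) and settles on the second, and
 * anything above 2 * 21846 walks on to the third.
 */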
534 
535 int
536 rtable_insert(unsigned int rtableid, struct sockaddr *dst,
537     struct sockaddr *mask, struct sockaddr *gateway, uint8_t prio,
538     struct rtentry *rt)
539 {
540 	struct rtentry			*mrt;
541 	struct srp_ref			 sr;
542 	struct art_root			*ar;
543 	struct art_node			*an, *prev;
544 	uint8_t				*addr;
545 	int				 plen;
546 	unsigned int			 rt_flags;
547 	int				 error = 0;
548 
549 	ar = rtable_get(rtableid, dst->sa_family);
550 	if (ar == NULL)
551 		return (EAFNOSUPPORT);
552 
553 	addr = satoaddr(ar, dst);
554 	plen = rtable_satoplen(dst->sa_family, mask);
555 	if (plen == -1)
556 		return (EINVAL);
557 
558 	rtref(rt); /* guarantee rtfree won't do anything during insert */
559 	rw_enter_write(&ar->ar_lock);
560 
561 	/* Do not permit exactly the same dst/mask/gw pair. */
562 	an = art_lookup(ar, addr, plen, &sr);
563 	srp_leave(&sr); /* an can't go away while we have the lock */
564 	if (an_match(an, dst, plen)) {
565 		struct rtentry  *mrt;
566 		int		 mpathok = ISSET(rt->rt_flags, RTF_MPATH);
567 
568 		SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next) {
569 			if (prio != RTP_ANY &&
570 			    (mrt->rt_priority & RTP_MASK) != (prio & RTP_MASK))
571 				continue;
572 
573 			if (!mpathok ||
574 			    (mrt->rt_gateway->sa_len == gateway->sa_len &&
575 			    !memcmp(mrt->rt_gateway, gateway, gateway->sa_len))){
576 				error = EEXIST;
577 				goto leave;
578 			}
579 		}
580 	}
581 
582 	an = art_get(dst, plen);
583 	if (an == NULL) {
584 		error = ENOBUFS;
585 		goto leave;
586 	}
587 
588 	/* prepare for immediate operation if insert succeeds */
589 	rt_flags = rt->rt_flags;
590 	rt->rt_flags &= ~RTF_MPATH;
591 	rt->rt_dest = dst;
592 	rt->rt_plen = plen;
593 	SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);
594 
595 	prev = art_insert(ar, an, addr, plen);
596 	if (prev != an) {
597 		SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry,
598 		    rt_next);
599 		rt->rt_flags = rt_flags;
600 		art_put(an);
601 
602 		if (prev == NULL) {
603 			error = ESRCH;
604 			goto leave;
605 		}
606 
607 		an = prev;
608 
609 		mrt = SRPL_FIRST_LOCKED(&an->an_rtlist);
610 		KASSERT(mrt != NULL);
611 		KASSERT((rt->rt_flags & RTF_MPATH) || mrt->rt_priority != prio);
612 
613 		/*
614 		 * An ART node with the same destination/netmask already
615 		 * exists; the MPATH conflict must already have been checked.
616 		 */
617 		if (rt->rt_flags & RTF_MPATH) {
618 			/*
619 			 * Only keep the RTF_MPATH flag if two routes have
620 			 * the same gateway.
621 			 */
622 			rt->rt_flags &= ~RTF_MPATH;
623 			SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next) {
624 				if (mrt->rt_priority == prio) {
625 					mrt->rt_flags |= RTF_MPATH;
626 					rt->rt_flags |= RTF_MPATH;
627 				}
628 			}
629 		}
630 
631 		/* Put newly inserted entry at the right place. */
632 		rtable_mpath_insert(an, rt);
633 	}
634 leave:
635 	rw_exit_write(&ar->ar_lock);
636 	rtfree(rt);
637 	return (error);
638 }
639 
640 int
641 rtable_delete(unsigned int rtableid, struct sockaddr *dst,
642     struct sockaddr *mask, struct rtentry *rt)
643 {
644 	struct art_root			*ar;
645 	struct art_node			*an;
646 	struct srp_ref			 sr;
647 	uint8_t				*addr;
648 	int				 plen;
649 	struct rtentry			*mrt;
650 	int				 npaths = 0;
651 	int				 error = 0;
652 
653 	ar = rtable_get(rtableid, dst->sa_family);
654 	if (ar == NULL)
655 		return (EAFNOSUPPORT);
656 
657 	addr = satoaddr(ar, dst);
658 	plen = rtable_satoplen(dst->sa_family, mask);
659 	if (plen == -1)
660 		return (EINVAL);
661 
662 	rtref(rt); /* guarantee rtfree won't do anything under ar_lock */
663 	rw_enter_write(&ar->ar_lock);
664 	an = art_lookup(ar, addr, plen, &sr);
665 	srp_leave(&sr); /* an can't go away while we have the lock */
666 
667 	/* Make sure we've got a perfect match. */
668 	if (!an_match(an, dst, plen)) {
669 		error = ESRCH;
670 		goto leave;
671 	}
672 
673 	/*
674 	 * If other multipath route entries are still attached to
675 	 * this ART node we only have to unlink it.
676 	 */
677 	SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next)
678 		npaths++;
679 
680 	if (npaths > 1) {
681 		KASSERT(rt->rt_refcnt >= 1);
682 		SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry,
683 		    rt_next);
684 
685 		mrt = SRPL_FIRST_LOCKED(&an->an_rtlist);
686 		if (npaths == 2)
687 			mrt->rt_flags &= ~RTF_MPATH;
688 
689 		goto leave;
690 	}
691 
692 	if (art_delete(ar, an, addr, plen) == NULL)
693 		panic("art_delete failed to find node %p", an);
694 
695 	KASSERT(rt->rt_refcnt >= 1);
696 	SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry, rt_next);
697 	art_put(an);
698 
699 leave:
700 	rw_exit_write(&ar->ar_lock);
701 	rtfree(rt);
702 
703 	return (error);
704 }
705 
706 struct rtable_walk_cookie {
707 	int		(*rwc_func)(struct rtentry *, void *, unsigned int);
708 	void		 *rwc_arg;
709 	struct rtentry	**rwc_prt;
710 	unsigned int	  rwc_rid;
711 };
712 
713 /*
714  * Helper for rtable_walk to keep the ART code free from any "struct rtentry".
715  */
716 int
717 rtable_walk_helper(struct art_node *an, void *xrwc)
718 {
719 	struct srp_ref			 sr;
720 	struct rtable_walk_cookie	*rwc = xrwc;
721 	struct rtentry			*rt;
722 	int				 error = 0;
723 
724 	SRPL_FOREACH(rt, &sr, &an->an_rtlist, rt_next) {
725 		error = (*rwc->rwc_func)(rt, rwc->rwc_arg, rwc->rwc_rid);
726 		if (error != 0)
727 			break;
728 	}
729 	if (rwc->rwc_prt != NULL && rt != NULL) {
730 		rtref(rt);
731 		*rwc->rwc_prt = rt;
732 	}
733 	SRPL_LEAVE(&sr);
734 
735 	return (error);
736 }
737 
738 int
739 rtable_walk(unsigned int rtableid, sa_family_t af, struct rtentry **prt,
740     int (*func)(struct rtentry *, void *, unsigned int), void *arg)
741 {
742 	struct art_root			*ar;
743 	struct rtable_walk_cookie	 rwc;
744 	int				 error;
745 
746 	ar = rtable_get(rtableid, af);
747 	if (ar == NULL)
748 		return (EAFNOSUPPORT);
749 
750 	rwc.rwc_func = func;
751 	rwc.rwc_arg = arg;
752 	rwc.rwc_prt = prt;
753 	rwc.rwc_rid = rtableid;
754 
755 	error = art_walk(ar, rtable_walk_helper, &rwc);
756 
757 	return (error);
758 }
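
/*
 * Illustrative use of rtable_walk(): count the IPv4 routes of table 0
 * with a minimal callback.  ``count_one'' and ``count'' are made-up
 * names; returning non-zero from the callback stops the walk and is
 * passed back to the caller.
 *
 *	int
 *	count_one(struct rtentry *rt, void *arg, unsigned int rtableid)
 *	{
 *		(*(unsigned int *)arg)++;
 *		return (0);
 *	}
 *
 *	unsigned int count = 0;
 *	rtable_walk(0, AF_INET, NULL, count_one, &count);
 */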
759 
760 struct rtentry *
761 rtable_iterate(struct rtentry *rt0)
762 {
763 	struct rtentry *rt = NULL;
764 	struct srp_ref sr;
765 
766 	rt = SRPL_NEXT(&sr, rt0, rt_next);
767 	if (rt != NULL)
768 		rtref(rt);
769 	SRPL_LEAVE(&sr);
770 	rtfree(rt0);
771 	return (rt);
772 }
773 
774 int
775 rtable_mpath_capable(unsigned int rtableid, sa_family_t af)
776 {
777 	return (1);
778 }
779 
780 int
781 rtable_mpath_reprio(unsigned int rtableid, struct sockaddr *dst,
782     int plen, uint8_t prio, struct rtentry *rt)
783 {
784 	struct art_root			*ar;
785 	struct art_node			*an;
786 	struct srp_ref			 sr;
787 	uint8_t				*addr;
788 	int				 error = 0;
789 
790 	ar = rtable_get(rtableid, dst->sa_family);
791 	if (ar == NULL)
792 		return (EAFNOSUPPORT);
793 
794 	addr = satoaddr(ar, dst);
795 
796 	rw_enter_write(&ar->ar_lock);
797 	an = art_lookup(ar, addr, plen, &sr);
798 	srp_leave(&sr); /* an can't go away while we have the lock */
799 
800 	/* Make sure we've got a perfect match. */
801 	if (!an_match(an, dst, plen)) {
802 		error = ESRCH;
803 	} else if (SRPL_FIRST_LOCKED(&an->an_rtlist) == rt &&
804 		SRPL_NEXT_LOCKED(rt, rt_next) == NULL) {
805 		/*
806 		 * If there's only one entry on the list, do not go
807 		 * through an insert/remove cycle.  This is done to
808 		 * guarantee that ``an->an_rtlist'' is never empty
809 		 * when a node is in the tree.
810 		 */
811 		rt->rt_priority = prio;
812 	} else {
813 		rtref(rt); /* keep rt alive in between remove and insert */
814 		SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist,
815 		    rt, rtentry, rt_next);
816 		rt->rt_priority = prio;
817 		rtable_mpath_insert(an, rt);
818 		rtfree(rt);
819 		error = EAGAIN;
820 	}
821 	rw_exit_write(&ar->ar_lock);
822 
823 	return (error);
824 }
825 
826 void
827 rtable_mpath_insert(struct art_node *an, struct rtentry *rt)
828 {
829 	struct rtentry			*mrt, *prt = NULL;
830 	uint8_t				 prio = rt->rt_priority;
831 
832 	if ((mrt = SRPL_FIRST_LOCKED(&an->an_rtlist)) == NULL) {
833 		SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);
834 		return;
835 	}
836 
837 	/* Iterate until we find the route to be placed after ``rt''. */
838 	while (mrt->rt_priority <= prio && SRPL_NEXT_LOCKED(mrt, rt_next)) {
839 		prt = mrt;
840 		mrt = SRPL_NEXT_LOCKED(mrt, rt_next);
841 	}
842 
843 	if (mrt->rt_priority <= prio) {
844 		SRPL_INSERT_AFTER_LOCKED(&rt_rc, mrt, rt, rt_next);
845 	} else if (prt != NULL) {
846 		SRPL_INSERT_AFTER_LOCKED(&rt_rc, prt, rt, rt_next);
847 	} else {
848 		SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);
849 	}
850 }
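
/*
 * Ordering note for rtable_mpath_insert() above: the list stays sorted
 * by ascending rt_priority and a new route lands after any existing
 * routes of equal priority.  Starting from priorities { 8, 16 },
 * inserting another priority-8 route yields { 8, 8, 16 } with the new
 * entry in the middle.
 */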
851 
852 /*
853  * Returns 1 if ``an'' perfectly matches (``dst'', ``plen''), 0 otherwise.
854  */
855 int
856 an_match(struct art_node *an, struct sockaddr *dst, int plen)
857 {
858 	struct rtentry			*rt;
859 	struct srp_ref			 sr;
860 	int				 match;
861 
862 	if (an == NULL || an->an_plen != plen)
863 		return (0);
864 
865 	rt = SRPL_FIRST(&sr, &an->an_rtlist);
866 	match = (memcmp(rt->rt_dest, dst, dst->sa_len) == 0);
867 	SRPL_LEAVE(&sr);
868 
869 	return (match);
870 }
871 
872 void
873 rtentry_ref(void *null, void *xrt)
874 {
875 	struct rtentry *rt = xrt;
876 
877 	rtref(rt);
878 }
879 
880 void
881 rtentry_unref(void *null, void *xrt)
882 {
883 	struct rtentry *rt = xrt;
884 
885 	rtfree(rt);
886 }
887 
888 /*
889  * Return a pointer to the address (key).  This is a holdover from the
890  * BSD radix tree, needed to skip the non-address fields of the flavor
891  * of "struct sockaddr" used by this routing table.
892  */
893 static inline uint8_t *
894 satoaddr(struct art_root *at, struct sockaddr *sa)
895 {
896 	return (((uint8_t *)sa) + at->ar_off);
897 }
898 
899 /*
900  * Return the prefix length of a mask.
901  */
902 int
903 rtable_satoplen(sa_family_t af, struct sockaddr *mask)
904 {
905 	const struct domain	*dp;
906 	uint8_t			*ap, *ep;
907 	int			 mlen, plen = 0;
908 	int			 i;
909 
910 	for (i = 0; (dp = domains[i]) != NULL; i++) {
911 		if (dp->dom_rtoffset == 0)
912 			continue;
913 
914 		if (af == dp->dom_family)
915 			break;
916 	}
917 	if (dp == NULL)
918 		return (-1);
919 
920 	/* Host route */
921 	if (mask == NULL)
922 		return (dp->dom_maxplen);
923 
924 	mlen = mask->sa_len;
925 
926 	/* Default route */
927 	if (mlen == 0)
928 		return (0);
929 
930 	ap = ((uint8_t *)mask) + dp->dom_rtoffset;
931 	ep = ((uint8_t *)mask) + mlen;
932 	if (ap > ep)
933 		return (-1);
934 
935 	/* Trim trailing zeroes. */
936 	while (ap < ep && ep[-1] == 0)
937 		ep--;
938 
939 	if (ap == ep)
940 		return (0);
941 
942 	/* "Beauty" adapted from sbin/route/show.c ... */
943 	while (ap < ep) {
944 		switch (*ap++) {
945 		case 0xff:
946 			plen += 8;
947 			break;
948 		case 0xfe:
949 			plen += 7;
950 			goto out;
951 		case 0xfc:
952 			plen += 6;
953 			goto out;
954 		case 0xf8:
955 			plen += 5;
956 			goto out;
957 		case 0xf0:
958 			plen += 4;
959 			goto out;
960 		case 0xe0:
961 			plen += 3;
962 			goto out;
963 		case 0xc0:
964 			plen += 2;
965 			goto out;
966 		case 0x80:
967 			plen += 1;
968 			goto out;
969 		default:
970 			/* Non-contiguous mask. */
971 			return (-1);
972 		}
973 	}
974 
975 out:
976 	if (plen > dp->dom_maxplen || ap != ep)
977 		return (-1);
978 
979 	return (plen);
980 }
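
/*
 * Worked example: for an AF_INET mask of 255.255.252.0 the loop above
 * consumes two 0xff bytes (plen = 16), then 0xfc adds 6 and jumps to
 * ``out'', giving a prefix length of 22.  A non-contiguous mask such as
 * 255.0.255.0 hits the default case and returns -1.
 */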
981