/*	$OpenBSD: rtable.c,v 1.82 2023/04/19 17:42:47 bluhm Exp $ */

/*
 * Copyright (c) 2014-2016 Martin Pieuchot
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#ifndef _KERNEL
#include "kern_compat.h"
#else
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/socket.h>
#include <sys/malloc.h>
#include <sys/queue.h>
#include <sys/domain.h>
#include <sys/srp.h>
#endif

#include <net/rtable.h>
#include <net/route.h>

/*
 * Structures used by rtable_get() to retrieve the corresponding
 * routing table for a given pair of ``af'' and ``rtableid''.
 *
 * Note that once allocated, routing table heads are never freed.
 * This way we do not need to reference count them.
 *
 *	afmap		    rtmap/dommp
 *   -----------          ---------     -----
 *   |   0     |--------> | 0 | 0 | ... | 0 |	Array mapping rtableid (=index)
 *   -----------          ---------     -----   to rdomain/loopback (=value).
 *   | AF_INET |.
 *   ----------- `.       .---------.     .---------.
 *       ...	   `----> | rtable0 | ... | rtableN |	Array of pointers for
 *   -----------          '---------'     '---------'	IPv4 routing tables
 *   | AF_MPLS |					indexed by ``rtableid''.
 *   -----------
 */
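
/*
 * For example, finding the IPv4 table for rtableid 5 dereferences
 * afmap[af2idx[AF_INET]] to reach the AF_INET rtmap and then reads
 * rtmap->tbl[5]; rtable_get() below implements exactly this walk.
 */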
struct srp	  *afmap;
uint8_t		   af2idx[AF_MAX+1];	/* To only allocate supported AF */
uint8_t		   af2idx_max;

/* Array of routing table pointers. */
struct rtmap {
	unsigned int	   limit;
	void		 **tbl;
};

/*
 * Array of rtableid -> rdomain mapping.
 *
 * Only used at index 0 of ``afmap'', as described above.
 */
struct dommp {
	unsigned int	   limit;
	/*
	 * Array to get the routing domain and loopback interface related to
	 * a routing table. Format:
	 *
	 * 8 unused bits | 16 bits for loopback index | 8 bits for rdomain
	 */
	unsigned int	  *value;
};
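
/*
 * For instance, assuming RT_TABLEID_BITS is 8 (matching the layout
 * above), rtable_l2set(1, 0, 3) stores value = (3 << 8) | 0 for
 * rtableid 1, so rtable_l2(1) returns rdomain 0 and rtable_loindex(1)
 * returns loopback ifindex 3.
 */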

unsigned int	   rtmap_limit = 0;

void		   rtmap_init(void);
void		   rtmap_grow(unsigned int, sa_family_t);
void		   rtmap_dtor(void *, void *);

struct srp_gc	   rtmap_gc = SRP_GC_INITIALIZER(rtmap_dtor, NULL);

void		   rtable_init_backend(void);
void		  *rtable_alloc(unsigned int, unsigned int, unsigned int);
void		  *rtable_get(unsigned int, sa_family_t);

void
rtmap_init(void)
{
	const struct domain	*dp;
	int			 i;

	/* Start with a single table for every domain that requires it. */
	for (i = 0; (dp = domains[i]) != NULL; i++) {
		if (dp->dom_rtoffset == 0)
			continue;

		rtmap_grow(1, dp->dom_family);
	}

	/* Initialize the rtableid->rdomain mapping table. */
	rtmap_grow(1, 0);

	rtmap_limit = 1;
}

/*
 * Grow the size of the array of routing tables for AF ``af'' to ``nlimit''.
 */
void
rtmap_grow(unsigned int nlimit, sa_family_t af)
{
	struct rtmap	*map, *nmap;
	int		 i;

	KERNEL_ASSERT_LOCKED();

	KASSERT(nlimit > rtmap_limit);

	nmap = malloc(sizeof(*nmap), M_RTABLE, M_WAITOK);
	nmap->limit = nlimit;
	nmap->tbl = mallocarray(nlimit, sizeof(*nmap[0].tbl), M_RTABLE,
	    M_WAITOK|M_ZERO);

	map = srp_get_locked(&afmap[af2idx[af]]);
	if (map != NULL) {
		KASSERT(map->limit == rtmap_limit);

		for (i = 0; i < map->limit; i++)
			nmap->tbl[i] = map->tbl[i];
	}

	srp_update_locked(&rtmap_gc, &afmap[af2idx[af]], nmap);
}

void
rtmap_dtor(void *null, void *xmap)
{
	struct rtmap	*map = xmap;

	/*
	 * doesn't need to be serialized since this is the last reference
	 * to this map. there's nothing to race against.
	 */
	free(map->tbl, M_RTABLE, map->limit * sizeof(*map[0].tbl));
	free(map, M_RTABLE, sizeof(*map));
}

void
rtable_init(void)
{
	const struct domain	*dp;
	int			 i;

	KASSERT(sizeof(struct rtmap) == sizeof(struct dommp));

	/* We use index 0 for the rtable/rdomain map. */
	af2idx_max = 1;
	memset(af2idx, 0, sizeof(af2idx));

	/*
	 * Build the AF-to-index mapping for every address family that
	 * needs a routing table.
	 */
	for (i = 0; (dp = domains[i]) != NULL; i++) {
		if (dp->dom_rtoffset == 0)
			continue;

		af2idx[dp->dom_family] = af2idx_max++;
	}
	rtable_init_backend();

	/*
	 * Allocate the per-AF array of routing table pointers now that
	 * we know how many AFs this kernel supports.
	 */
	afmap = mallocarray(af2idx_max + 1, sizeof(*afmap), M_RTABLE,
	    M_WAITOK|M_ZERO);

	rtmap_init();

	if (rtable_add(0) != 0)
		panic("unable to create default routing table");

	rt_timer_init();
}

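/*
 * Create a routing table for ``id'' in every address family with
 * routing support, growing the per-AF arrays if needed.  Returns 0 if
 * the tables already exist or were created, an errno otherwise.
 */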
int
rtable_add(unsigned int id)
{
	const struct domain	*dp;
	void			*tbl;
	struct rtmap		*map;
	struct dommp		*dmm;
	sa_family_t		 af;
	unsigned int		 off, alen;
	int			 i, error = 0;

	if (id > RT_TABLEID_MAX)
		return (EINVAL);

	KERNEL_LOCK();

	if (rtable_exists(id))
		goto out;

	for (i = 0; (dp = domains[i]) != NULL; i++) {
		if (dp->dom_rtoffset == 0)
			continue;

		af = dp->dom_family;
		off = dp->dom_rtoffset;
		alen = dp->dom_maxplen;

		if (id >= rtmap_limit)
			rtmap_grow(id + 1, af);

		tbl = rtable_alloc(id, alen, off);
		if (tbl == NULL) {
			error = ENOMEM;
			goto out;
		}

		map = srp_get_locked(&afmap[af2idx[af]]);
		map->tbl[id] = tbl;
	}

	/* Reflect possible growth. */
	if (id >= rtmap_limit) {
		rtmap_grow(id + 1, 0);
		rtmap_limit = id + 1;
	}

	/* Use main rtable/rdomain by default. */
	dmm = srp_get_locked(&afmap[0]);
	dmm->value[id] = 0;
out:
	KERNEL_UNLOCK();

	return (error);
}

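/*
 * Return the routing table of address family ``af'' for ``rtableid'',
 * or NULL if the AF is not supported or the table does not exist.
 */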
void *
rtable_get(unsigned int rtableid, sa_family_t af)
{
	struct rtmap	*map;
	void		*tbl = NULL;
	struct srp_ref	 sr;

	if (af >= nitems(af2idx) || af2idx[af] == 0)
		return (NULL);

	map = srp_enter(&sr, &afmap[af2idx[af]]);
	if (rtableid < map->limit)
		tbl = map->tbl[rtableid];
	srp_leave(&sr);

	return (tbl);
}

int
rtable_exists(unsigned int rtableid)
{
	const struct domain	*dp;
	void			*tbl;
	int			 i;

	for (i = 0; (dp = domains[i]) != NULL; i++) {
		if (dp->dom_rtoffset == 0)
			continue;

		tbl = rtable_get(rtableid, dp->dom_family);
		if (tbl != NULL)
			return (1);
	}

	return (0);
}

int
rtable_empty(unsigned int rtableid)
{
	const struct domain	*dp;
	int			 i;
	struct art_root		*tbl;

	for (i = 0; (dp = domains[i]) != NULL; i++) {
		if (dp->dom_rtoffset == 0)
			continue;

		tbl = rtable_get(rtableid, dp->dom_family);
		if (tbl == NULL)
			continue;
		if (tbl->ar_root.ref != NULL)
			return (0);
	}

	return (1);
}

unsigned int
rtable_l2(unsigned int rtableid)
{
	struct dommp	*dmm;
	unsigned int	 rdomain = 0;
	struct srp_ref	 sr;

	dmm = srp_enter(&sr, &afmap[0]);
	if (rtableid < dmm->limit)
		rdomain = (dmm->value[rtableid] & RT_TABLEID_MASK);
	srp_leave(&sr);

	return (rdomain);
}

unsigned int
rtable_loindex(unsigned int rtableid)
{
	struct dommp	*dmm;
	unsigned int	 loifidx = 0;
	struct srp_ref	 sr;

	dmm = srp_enter(&sr, &afmap[0]);
	if (rtableid < dmm->limit)
		loifidx = (dmm->value[rtableid] >> RT_TABLEID_BITS);
	srp_leave(&sr);

	return (loifidx);
}

void
rtable_l2set(unsigned int rtableid, unsigned int rdomain, unsigned int loifidx)
{
	struct dommp	*dmm;
	unsigned int	 value;

	KERNEL_ASSERT_LOCKED();

	if (!rtable_exists(rtableid) || !rtable_exists(rdomain))
		return;

	value = (rdomain & RT_TABLEID_MASK) | (loifidx << RT_TABLEID_BITS);

	dmm = srp_get_locked(&afmap[0]);
	dmm->value[rtableid] = value;
}


static inline uint8_t	*satoaddr(struct art_root *, struct sockaddr *);

int	an_match(struct art_node *, struct sockaddr *, int);
void	rtentry_ref(void *, void *);
void	rtentry_unref(void *, void *);

void	rtable_mpath_insert(struct art_node *, struct rtentry *);

struct srpl_rc rt_rc = SRPL_RC_INITIALIZER(rtentry_ref, rtentry_unref, NULL);

void
rtable_init_backend(void)
{
	art_init();
}

void *
rtable_alloc(unsigned int rtableid, unsigned int alen, unsigned int off)
{
	return (art_alloc(rtableid, alen, off));
}

int
rtable_setsource(unsigned int rtableid, int af, struct sockaddr *src)
{
	struct art_root		*ar;

	NET_ASSERT_LOCKED_EXCLUSIVE();

	if ((ar = rtable_get(rtableid, af)) == NULL)
		return (EAFNOSUPPORT);

	ar->ar_source = src;

	return (0);
}

struct sockaddr *
rtable_getsource(unsigned int rtableid, int af)
{
	struct art_root		*ar;

	NET_ASSERT_LOCKED();

	ar = rtable_get(rtableid, af);
	if (ar == NULL)
		return (NULL);

	return (ar->ar_source);
}

void
rtable_clearsource(unsigned int rtableid, struct sockaddr *src)
{
	struct sockaddr	*addr;

	addr = rtable_getsource(rtableid, src->sa_family);
	if (addr && (addr->sa_len == src->sa_len)) {
		if (memcmp(src, addr, addr->sa_len) == 0) {
			rtable_setsource(rtableid, src->sa_family, NULL);
		}
	}
}

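/*
 * Return a referenced route entry for ``dst'': when ``mask'' is NULL
 * the longest-prefix match is used, otherwise the entry must match
 * ``dst''/``mask'' exactly.  ``gateway'' and ``prio'' select among
 * multipath entries.  Returns NULL if no such route exists.
 */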
struct rtentry *
rtable_lookup(unsigned int rtableid, struct sockaddr *dst,
    struct sockaddr *mask, struct sockaddr *gateway, uint8_t prio)
{
	struct art_root			*ar;
	struct art_node			*an;
	struct rtentry			*rt = NULL;
	struct srp_ref			 sr, nsr;
	uint8_t				*addr;
	int				 plen;

	ar = rtable_get(rtableid, dst->sa_family);
	if (ar == NULL)
		return (NULL);

	addr = satoaddr(ar, dst);

	/* No need for a perfect match. */
	if (mask == NULL) {
		an = art_match(ar, addr, &nsr);
		if (an == NULL)
			goto out;
	} else {
		plen = rtable_satoplen(dst->sa_family, mask);
		if (plen == -1)
			return (NULL);

		an = art_lookup(ar, addr, plen, &nsr);

		/* Make sure we've got a perfect match. */
		if (!an_match(an, dst, plen))
			goto out;
	}

	SRPL_FOREACH(rt, &sr, &an->an_rtlist, rt_next) {
		if (prio != RTP_ANY &&
		    (rt->rt_priority & RTP_MASK) != (prio & RTP_MASK))
			continue;

		if (gateway == NULL)
			break;

		if (rt->rt_gateway->sa_len == gateway->sa_len &&
		    memcmp(rt->rt_gateway, gateway, gateway->sa_len) == 0)
			break;
	}
	if (rt != NULL)
		rtref(rt);

	SRPL_LEAVE(&sr);
out:
	srp_leave(&nsr);

	return (rt);
}

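/*
 * Return a referenced route entry matching ``dst'' with the longest
 * prefix; among multipath routes of equal priority one is selected
 * with the hash-threshold algorithm when ``src'' provides a hash key.
 */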
struct rtentry *
rtable_match(unsigned int rtableid, struct sockaddr *dst, uint32_t *src)
{
	struct art_root			*ar;
	struct art_node			*an;
	struct rtentry			*rt = NULL;
	struct srp_ref			 sr, nsr;
	uint8_t				*addr;
	int				 hash;

	ar = rtable_get(rtableid, dst->sa_family);
	if (ar == NULL)
		return (NULL);

	addr = satoaddr(ar, dst);

	an = art_match(ar, addr, &nsr);
	if (an == NULL)
		goto out;

	rt = SRPL_FIRST(&sr, &an->an_rtlist);
	if (rt == NULL) {
		SRPL_LEAVE(&sr);
		goto out;
	}
	rtref(rt);
	SRPL_LEAVE(&sr);

	/* Gateway selection by Hash-Threshold (RFC 2992) */
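	/*
	 * For example, with three paths sharing the same priority the
	 * threshold is 0xffff / 3 + 1 == 21846: hash values 0-21846
	 * keep the first route, 21847-43692 select the second one and
	 * larger values the third one.
	 */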
	if ((hash = rt_hash(rt, dst, src)) != -1) {
		struct rtentry		*mrt;
		int			 threshold, npaths = 0;

		KASSERT(hash <= 0xffff);

		SRPL_FOREACH(mrt, &sr, &an->an_rtlist, rt_next) {
			/* Only count nexthops with the same priority. */
			if (mrt->rt_priority == rt->rt_priority)
				npaths++;
		}
		SRPL_LEAVE(&sr);

		threshold = (0xffff / npaths) + 1;

		/*
		 * we have no protection against concurrent modification of the
		 * route list attached to the node, so we won't necessarily
		 * have the same number of routes.  for most modifications,
		 * we'll pick a route that we wouldn't have if we only saw the
		 * list before or after the change.  if we were going to use
		 * the last available route, but it got removed, we'll hit
		 * the end of the list and then pick the first route.
		 */

		mrt = SRPL_FIRST(&sr, &an->an_rtlist);
		while (hash > threshold && mrt != NULL) {
			if (mrt->rt_priority == rt->rt_priority)
				hash -= threshold;
			mrt = SRPL_FOLLOW(&sr, mrt, rt_next);
		}

		if (mrt != NULL) {
			rtref(mrt);
			rtfree(rt);
			rt = mrt;
		}
		SRPL_LEAVE(&sr);
	}
out:
	srp_leave(&nsr);
	return (rt);
}

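/*
 * Insert ``rt'' for ``dst''/``mask'' with priority ``prio''.  Returns
 * EEXIST if an equivalent route is already present and multipath is
 * not allowed, ENOBUFS or ESRCH if the ART node cannot be allocated or
 * inserted, 0 on success.
 */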
int
rtable_insert(unsigned int rtableid, struct sockaddr *dst,
    struct sockaddr *mask, struct sockaddr *gateway, uint8_t prio,
    struct rtentry *rt)
{
	struct rtentry			*mrt;
	struct srp_ref			 sr;
	struct art_root			*ar;
	struct art_node			*an, *prev;
	uint8_t				*addr;
	int				 plen;
	unsigned int			 rt_flags;
	int				 error = 0;

	ar = rtable_get(rtableid, dst->sa_family);
	if (ar == NULL)
		return (EAFNOSUPPORT);

	addr = satoaddr(ar, dst);
	plen = rtable_satoplen(dst->sa_family, mask);
	if (plen == -1)
		return (EINVAL);

	rtref(rt); /* guarantee rtfree won't do anything during insert */
	rw_enter_write(&ar->ar_lock);

	/* Do not permit exactly the same dst/mask/gw pair. */
	an = art_lookup(ar, addr, plen, &sr);
	srp_leave(&sr); /* an can't go away while we have the lock */
	if (an_match(an, dst, plen)) {
		struct rtentry  *mrt;
		int		 mpathok = ISSET(rt->rt_flags, RTF_MPATH);

		SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next) {
			if (prio != RTP_ANY &&
			    (mrt->rt_priority & RTP_MASK) != (prio & RTP_MASK))
				continue;

			if (!mpathok ||
			    (mrt->rt_gateway->sa_len == gateway->sa_len &&
			    memcmp(mrt->rt_gateway, gateway,
			    gateway->sa_len) == 0)) {
				error = EEXIST;
				goto leave;
			}
		}
	}

	an = art_get(dst, plen);
	if (an == NULL) {
		error = ENOBUFS;
		goto leave;
	}

	/* prepare for immediate operation if insert succeeds */
	rt_flags = rt->rt_flags;
	rt->rt_flags &= ~RTF_MPATH;
	rt->rt_dest = dst;
	rt->rt_plen = plen;
	SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);

	prev = art_insert(ar, an, addr, plen);
	if (prev != an) {
		SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry,
		    rt_next);
		rt->rt_flags = rt_flags;
		art_put(an);

		if (prev == NULL) {
			error = ESRCH;
			goto leave;
		}

		an = prev;

		mrt = SRPL_FIRST_LOCKED(&an->an_rtlist);
		KASSERT(mrt != NULL);
		KASSERT((rt->rt_flags & RTF_MPATH) || mrt->rt_priority != prio);

		/*
		 * An ART node with the same destination/netmask already
		 * exists; the MPATH conflict has already been checked above.
		 */
		if (rt->rt_flags & RTF_MPATH) {
			/*
			 * Only keep the RTF_MPATH flag if another route
			 * with the same priority is present.
			 */
			rt->rt_flags &= ~RTF_MPATH;
			SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next) {
				if (mrt->rt_priority == prio) {
					mrt->rt_flags |= RTF_MPATH;
					rt->rt_flags |= RTF_MPATH;
				}
			}
		}

		/* Put newly inserted entry at the right place. */
		rtable_mpath_insert(an, rt);
	}
leave:
	rw_exit_write(&ar->ar_lock);
	rtfree(rt);
	return (error);
}

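/*
 * Remove ``rt'' from the table for ``dst''/``mask''.  The ART node is
 * only deleted once its last multipath entry is gone.
 */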
int
rtable_delete(unsigned int rtableid, struct sockaddr *dst,
    struct sockaddr *mask, struct rtentry *rt)
{
	struct art_root			*ar;
	struct art_node			*an;
	struct srp_ref			 sr;
	uint8_t				*addr;
	int				 plen;
	struct rtentry			*mrt;
	int				 npaths = 0;
	int				 error = 0;

	ar = rtable_get(rtableid, dst->sa_family);
	if (ar == NULL)
		return (EAFNOSUPPORT);

	addr = satoaddr(ar, dst);
	plen = rtable_satoplen(dst->sa_family, mask);
	if (plen == -1)
		return (EINVAL);

	rtref(rt); /* guarantee rtfree won't do anything under ar_lock */
	rw_enter_write(&ar->ar_lock);
	an = art_lookup(ar, addr, plen, &sr);
	srp_leave(&sr); /* an can't go away while we have the lock */

	/* Make sure we've got a perfect match. */
	if (!an_match(an, dst, plen)) {
		error = ESRCH;
		goto leave;
	}

	/*
	 * If other multipath route entries are still attached to
	 * this ART node we only have to unlink it.
	 */
	SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next)
		npaths++;

	if (npaths > 1) {
		KASSERT(refcnt_read(&rt->rt_refcnt) >= 1);
		SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry,
		    rt_next);

		mrt = SRPL_FIRST_LOCKED(&an->an_rtlist);
		if (npaths == 2)
			mrt->rt_flags &= ~RTF_MPATH;

		goto leave;
	}

	if (art_delete(ar, an, addr, plen) == NULL)
		panic("art_delete failed to find node %p", an);

	KASSERT(refcnt_read(&rt->rt_refcnt) >= 1);
	SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry, rt_next);
	art_put(an);

leave:
	rw_exit_write(&ar->ar_lock);
	rtfree(rt);

	return (error);
}

struct rtable_walk_cookie {
	int		(*rwc_func)(struct rtentry *, void *, unsigned int);
	void		 *rwc_arg;
	struct rtentry	**rwc_prt;
	unsigned int	  rwc_rid;
};

/*
 * Helper for rtable_walk to keep the ART code free from any "struct rtentry".
 */
int
rtable_walk_helper(struct art_node *an, void *xrwc)
{
	struct srp_ref			 sr;
	struct rtable_walk_cookie	*rwc = xrwc;
	struct rtentry			*rt;
	int				 error = 0;

	SRPL_FOREACH(rt, &sr, &an->an_rtlist, rt_next) {
		error = (*rwc->rwc_func)(rt, rwc->rwc_arg, rwc->rwc_rid);
		if (error != 0)
			break;
	}
	if (rwc->rwc_prt != NULL && rt != NULL) {
		rtref(rt);
		*rwc->rwc_prt = rt;
	}
	SRPL_LEAVE(&sr);

	return (error);
}

int
rtable_walk(unsigned int rtableid, sa_family_t af, struct rtentry **prt,
    int (*func)(struct rtentry *, void *, unsigned int), void *arg)
{
	struct art_root			*ar;
	struct rtable_walk_cookie	 rwc;
	int				 error;

	ar = rtable_get(rtableid, af);
	if (ar == NULL)
		return (EAFNOSUPPORT);

	rwc.rwc_func = func;
	rwc.rwc_arg = arg;
	rwc.rwc_prt = prt;
	rwc.rwc_rid = rtableid;

	error = art_walk(ar, rtable_walk_helper, &rwc);

	return (error);
}

struct rtentry *
rtable_iterate(struct rtentry *rt0)
{
	struct rtentry *rt = NULL;
	struct srp_ref sr;

	rt = SRPL_NEXT(&sr, rt0, rt_next);
	if (rt != NULL)
		rtref(rt);
	SRPL_LEAVE(&sr);
	rtfree(rt0);
	return (rt);
}

int
rtable_mpath_capable(unsigned int rtableid, sa_family_t af)
{
	return (1);
}

int
rtable_mpath_reprio(unsigned int rtableid, struct sockaddr *dst,
    int plen, uint8_t prio, struct rtentry *rt)
{
	struct art_root			*ar;
	struct art_node			*an;
	struct srp_ref			 sr;
	uint8_t				*addr;
	int				 error = 0;

	ar = rtable_get(rtableid, dst->sa_family);
	if (ar == NULL)
		return (EAFNOSUPPORT);

	addr = satoaddr(ar, dst);

	rw_enter_write(&ar->ar_lock);
	an = art_lookup(ar, addr, plen, &sr);
	srp_leave(&sr); /* an can't go away while we have the lock */

	/* Make sure we've got a perfect match. */
	if (!an_match(an, dst, plen)) {
		error = ESRCH;
	} else if (SRPL_FIRST_LOCKED(&an->an_rtlist) == rt &&
		SRPL_NEXT_LOCKED(rt, rt_next) == NULL) {
		/*
		 * If there's only one entry on the list do not go
		 * through an insert/remove cycle.  This is done to
		 * guarantee that ``an->an_rtlist'' is never empty
		 * when a node is in the tree.
		 */
		rt->rt_priority = prio;
	} else {
		rtref(rt); /* keep rt alive in between remove and insert */
		SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist,
		    rt, rtentry, rt_next);
		rt->rt_priority = prio;
		rtable_mpath_insert(an, rt);
		rtfree(rt);
		error = EAGAIN;
	}
	rw_exit_write(&ar->ar_lock);

	return (error);
}

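/*
 * Insert ``rt'' into ``an->an_rtlist'' keeping the list sorted by
 * ascending ``rt_priority''.
 */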
void
rtable_mpath_insert(struct art_node *an, struct rtentry *rt)
{
	struct rtentry			*mrt, *prt = NULL;
	uint8_t				 prio = rt->rt_priority;

	if ((mrt = SRPL_FIRST_LOCKED(&an->an_rtlist)) == NULL) {
		SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);
		return;
	}

	/* Iterate until we find the route to be placed after ``rt''. */
	while (mrt->rt_priority <= prio && SRPL_NEXT_LOCKED(mrt, rt_next)) {
		prt = mrt;
		mrt = SRPL_NEXT_LOCKED(mrt, rt_next);
	}

	if (mrt->rt_priority <= prio) {
		SRPL_INSERT_AFTER_LOCKED(&rt_rc, mrt, rt, rt_next);
	} else if (prt != NULL) {
		SRPL_INSERT_AFTER_LOCKED(&rt_rc, prt, rt, rt_next);
	} else {
		SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);
	}
}

/*
 * Returns 1 if ``an'' perfectly matches (``dst'', ``plen''), 0 otherwise.
 */
int
an_match(struct art_node *an, struct sockaddr *dst, int plen)
{
	struct rtentry			*rt;
	struct srp_ref			 sr;
	int				 match;

	if (an == NULL || an->an_plen != plen)
		return (0);

	rt = SRPL_FIRST(&sr, &an->an_rtlist);
	match = (memcmp(rt->rt_dest, dst, dst->sa_len) == 0);
	SRPL_LEAVE(&sr);

	return (match);
}

void
rtentry_ref(void *null, void *xrt)
{
	struct rtentry *rt = xrt;

	rtref(rt);
}

void
rtentry_unref(void *null, void *xrt)
{
	struct rtentry *rt = xrt;

	rtfree(rt);
}

/*
 * Return a pointer to the address (key).  This is a heritage from the
 * BSD radix tree needed to skip the non-address fields from the flavor
 * of "struct sockaddr" used by this routing table.
 */
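/*
 * For AF_INET the domain's ``dom_rtoffset'' points at ``sin_addr''
 * inside struct sockaddr_in, so the returned pointer covers the four
 * address bytes used as the ART key.
 */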
static inline uint8_t *
satoaddr(struct art_root *at, struct sockaddr *sa)
{
	return (((uint8_t *)sa) + at->ar_off);
}

/*
 * Return the prefix length of a mask.
 */
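/*
 * E.g. an AF_INET mask of 255.255.255.0 (bytes 0xff 0xff 0xff 0x00
 * after ``dom_rtoffset'') yields 24, a NULL mask yields the host route
 * length ``dom_maxplen'' and an empty mask yields 0.
 */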
int
rtable_satoplen(sa_family_t af, struct sockaddr *mask)
{
	const struct domain	*dp;
	uint8_t			*ap, *ep;
	int			 mlen, plen = 0;
	int			 i;

	for (i = 0; (dp = domains[i]) != NULL; i++) {
		if (dp->dom_rtoffset == 0)
			continue;

		if (af == dp->dom_family)
			break;
	}
	if (dp == NULL)
		return (-1);

	/* Host route */
	if (mask == NULL)
		return (dp->dom_maxplen);

	mlen = mask->sa_len;

	/* Default route */
	if (mlen == 0)
		return (0);

	ap = (uint8_t *)((uint8_t *)mask) + dp->dom_rtoffset;
	ep = (uint8_t *)((uint8_t *)mask) + mlen;
	if (ap > ep)
		return (-1);

	/* Trim trailing zeroes. */
	while (ap < ep && ep[-1] == 0)
		ep--;

	if (ap == ep)
		return (0);

	/* "Beauty" adapted from sbin/route/show.c ... */
	while (ap < ep) {
		switch (*ap++) {
		case 0xff:
			plen += 8;
			break;
		case 0xfe:
			plen += 7;
			goto out;
		case 0xfc:
			plen += 6;
			goto out;
		case 0xf8:
			plen += 5;
			goto out;
		case 0xf0:
			plen += 4;
			goto out;
		case 0xe0:
			plen += 3;
			goto out;
		case 0xc0:
			plen += 2;
			goto out;
		case 0x80:
			plen += 1;
			goto out;
		default:
			/* Non-contiguous mask. */
			return (-1);
		}
	}

out:
	if (plen > dp->dom_maxplen || ap != ep)
		return (-1);

	return (plen);
}
992