1 /*	$OpenBSD: rtable.c,v 1.68 2019/03/05 19:07:56 anton Exp $ */
2 
3 /*
4  * Copyright (c) 2014-2016 Martin Pieuchot
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #ifndef _KERNEL
20 #include "kern_compat.h"
21 #else
22 #include <sys/param.h>
23 #include <sys/systm.h>
24 #include <sys/socket.h>
25 #include <sys/malloc.h>
26 #include <sys/queue.h>
27 #include <sys/domain.h>
28 #include <sys/srp.h>
29 #endif
30 
31 #include <net/rtable.h>
32 #include <net/route.h>
33 
34 /*
35  * Structures used by rtable_get() to retrieve the corresponding
36  * routing table for a given pair of ``af'' and ``rtableid''.
37  *
38  * Note that, once allocated, routing table heads are never freed.
39  * This way we do not need to reference count them.
40  *
41  *	afmap		    rtmap/dommp
42  *   -----------          ---------     -----
43  *   |   0     |--------> | 0 | 0 | ... | 0 |	Array mapping rtableid (=index)
44  *   -----------          ---------     -----   to rdomain/loopback (=value).
45  *   | AF_INET |.
46  *   ----------- `.       .---------.     .---------.
47  *       ...	   `----> | rtable0 | ... | rtableN |	Array of pointers for
48  *   -----------          '---------'     '---------'	IPv4 routing tables
49  *   | AF_MPLS |					indexed by ``rtableid''.
50  *   -----------
51  */
52 struct srp	  *afmap;
53 uint8_t		   af2idx[AF_MAX+1];	/* To only allocate supported AF */
54 uint8_t		   af2idx_max;
55 
56 /* Array of routing table pointers. */
57 struct rtmap {
58 	unsigned int	   limit;
59 	void		 **tbl;
60 };
61 
62 /*
63  * Array of rtableid -> rdomain mapping.
64  *
65  * Only used for the first index as described above.
66  */
67 struct dommp {
68 	unsigned int	   limit;
69 	/*
70 	 * Array to get the routing domain and loopback interface related to
71 	 * a routing table. Format:
72 	 *
73 	 * 8 unused bits | 16 bits for loopback index | 8 bits for rdomain
74 	 */
75 	unsigned int	  *value;
76 };
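
/*
 * Illustrative sketch, not part of the original source: how the packed
 * ``value'' above is built and taken apart.  It mirrors rtable_l2set(),
 * rtable_l2() and rtable_loindex() below; the helper names are hypothetical.
 */
#if 0
static inline unsigned int
dommp_pack(unsigned int rdomain, unsigned int loifidx)
{
	/* The low 8 bits carry the rdomain, the next 16 the loopback ifindex. */
	return ((rdomain & RT_TABLEID_MASK) | (loifidx << RT_TABLEID_BITS));
}

static inline unsigned int
dommp_rdomain(unsigned int value)
{
	return (value & RT_TABLEID_MASK);
}

static inline unsigned int
dommp_loifidx(unsigned int value)
{
	return (value >> RT_TABLEID_BITS);
}
#endif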
77 
78 unsigned int	   rtmap_limit = 0;
79 
80 void		   rtmap_init(void);
81 void		   rtmap_grow(unsigned int, sa_family_t);
82 void		   rtmap_dtor(void *, void *);
83 
84 struct srp_gc	   rtmap_gc = SRP_GC_INITIALIZER(rtmap_dtor, NULL);
85 
86 void		   rtable_init_backend(void);
87 void		  *rtable_alloc(unsigned int, unsigned int, unsigned int);
88 void		  *rtable_get(unsigned int, sa_family_t);
89 
90 void
91 rtmap_init(void)
92 {
93 	struct domain	*dp;
94 	int		 i;
95 
96 	/* Start with a single table for every domain that requires it. */
97 	for (i = 0; (dp = domains[i]) != NULL; i++) {
98 		if (dp->dom_rtoffset == 0)
99 			continue;
100 
101 		rtmap_grow(1, dp->dom_family);
102 	}
103 
104 	/* Initialize the rtableid->rdomain mapping table. */
105 	rtmap_grow(1, 0);
106 
107 	rtmap_limit = 1;
108 }
109 
110 /*
111  * Grow the size of the array of routing tables for AF ``af'' to ``nlimit''.
112  */
113 void
114 rtmap_grow(unsigned int nlimit, sa_family_t af)
115 {
116 	struct rtmap	*map, *nmap;
117 	int		 i;
118 
119 	KERNEL_ASSERT_LOCKED();
120 
121 	KASSERT(nlimit > rtmap_limit);
122 
123 	nmap = malloc(sizeof(*nmap), M_RTABLE, M_WAITOK);
124 	nmap->limit = nlimit;
125 	nmap->tbl = mallocarray(nlimit, sizeof(*nmap[0].tbl), M_RTABLE,
126 	    M_WAITOK|M_ZERO);
127 
128 	map = srp_get_locked(&afmap[af2idx[af]]);
129 	if (map != NULL) {
130 		KASSERT(map->limit == rtmap_limit);
131 
132 		for (i = 0; i < map->limit; i++)
133 			nmap->tbl[i] = map->tbl[i];
134 	}
135 
136 	srp_update_locked(&rtmap_gc, &afmap[af2idx[af]], nmap);
137 }
138 
139 void
140 rtmap_dtor(void *null, void *xmap)
141 {
142 	struct rtmap	*map = xmap;
143 
144 	/*
145 	 * Doesn't need to be serialized since this is the last reference
146 	 * to this map; there's nothing to race against.
147 	 */
148 	free(map->tbl, M_RTABLE, map->limit * sizeof(*map[0].tbl));
149 	free(map, M_RTABLE, sizeof(*map));
150 }
151 
152 void
153 rtable_init(void)
154 {
155 	struct domain	*dp;
156 	int		 i;
157 
158 	KASSERT(sizeof(struct rtmap) == sizeof(struct dommp));
159 
160 	/* We use index 0 for the rtable/rdomain map. */
161 	af2idx_max = 1;
162 	memset(af2idx, 0, sizeof(af2idx));
163 
164 	/*
165 	 * Assign a compact index to every address family that owns a
166 	 * routing table; index 0 stays reserved for the rtable/rdomain map.
167 	 */
168 	for (i = 0; (dp = domains[i]) != NULL; i++) {
169 		if (dp->dom_rtoffset == 0)
170 			continue;
171 
172 		af2idx[dp->dom_family] = af2idx_max++;
173 	}
174 	rtable_init_backend();
175 
176 	/*
177 	 * Allocate the per-AF array of map pointers now that we know how many AFs this
178 	 * kernel supports.
179 	 */
180 	afmap = mallocarray(af2idx_max + 1, sizeof(*afmap), M_RTABLE,
181 	    M_WAITOK|M_ZERO);
182 
183 	rtmap_init();
184 
185 	if (rtable_add(0) != 0)
186 		panic("unable to create default routing table");
187 }
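
/*
 * Illustrative usage sketch (assumption, not from the original file):
 * once rtable_init() has run, the default IPv4 table of rtableid 0 can be
 * retrieved through the afmap/af2idx indirection set up above (assuming
 * an INET kernel).
 */
#if 0
static void
rtable_init_example(void)
{
	struct art_root *ar;

	/* rtableid 0 was created by the rtable_add(0) call in rtable_init(). */
	ar = rtable_get(0, AF_INET);
	KASSERT(ar != NULL);
}
#endif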
188 
189 int
190 rtable_add(unsigned int id)
191 {
192 	struct domain	*dp;
193 	void		*tbl;
194 	struct rtmap	*map;
195 	struct dommp	*dmm;
196 	sa_family_t	 af;
197 	unsigned int	 off, alen;
198 	int		 i;
199 
200 	KERNEL_ASSERT_LOCKED();
201 
202 	if (id > RT_TABLEID_MAX)
203 		return (EINVAL);
204 
205 	if (rtable_exists(id))
206 		return (EEXIST);
207 
208 	for (i = 0; (dp = domains[i]) != NULL; i++) {
209 		if (dp->dom_rtoffset == 0)
210 			continue;
211 
212 		af = dp->dom_family;
213 		off = dp->dom_rtoffset;
214 		alen = dp->dom_maxplen;
215 
216 		if (id >= rtmap_limit)
217 			rtmap_grow(id + 1, af);
218 
219 		tbl = rtable_alloc(id, alen, off);
220 		if (tbl == NULL)
221 			return (ENOMEM);
222 
223 		map = srp_get_locked(&afmap[af2idx[af]]);
224 		map->tbl[id] = tbl;
225 	}
226 
227 	/* Reflect possible growth. */
228 	if (id >= rtmap_limit) {
229 		rtmap_grow(id + 1, 0);
230 		rtmap_limit = id + 1;
231 	}
232 
233 	/* Use main rtable/rdomain by default. */
234 	dmm = srp_get_locked(&afmap[0]);
235 	dmm->value[id] = 0;
236 
237 	return (0);
238 }
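
/*
 * Illustrative usage sketch (assumption): a hypothetical caller creating
 * an additional routing table and binding it to rdomain 0 with a loopback
 * interface of index 1.  The real callers live elsewhere in the network
 * stack.
 */
#if 0
static int
rtable_add_example(void)
{
	int error;

	if ((error = rtable_add(1)) != 0)	/* new rtableid 1 */
		return (error);
	rtable_l2set(1, 0, 1);	/* rdomain 0, loopback ifindex 1 */
	return (0);
}
#endif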
239 
240 void *
241 rtable_get(unsigned int rtableid, sa_family_t af)
242 {
243 	struct rtmap	*map;
244 	void		*tbl = NULL;
245 	struct srp_ref	 sr;
246 
247 	if (af >= nitems(af2idx) || af2idx[af] == 0)
248 		return (NULL);
249 
250 	map = srp_enter(&sr, &afmap[af2idx[af]]);
251 	if (rtableid < map->limit)
252 		tbl = map->tbl[rtableid];
253 	srp_leave(&sr);
254 
255 	return (tbl);
256 }
257 
258 int
259 rtable_exists(unsigned int rtableid)
260 {
261 	struct domain	*dp;
262 	void		*tbl;
263 	int		 i;
264 
265 	for (i = 0; (dp = domains[i]) != NULL; i++) {
266 		if (dp->dom_rtoffset == 0)
267 			continue;
268 
269 		tbl = rtable_get(rtableid, dp->dom_family);
270 		if (tbl != NULL)
271 			return (1);
272 	}
273 
274 	return (0);
275 }
276 
277 int
278 rtable_empty(unsigned int rtableid)
279 {
280 	struct domain	*dp;
281 	int		 i;
282 	struct art_root	*tbl;
283 
284 	for (i = 0; (dp = domains[i]) != NULL; i++) {
285 		if (dp->dom_rtoffset == 0)
286 			continue;
287 
288 		tbl = rtable_get(rtableid, dp->dom_family);
289 		if (tbl == NULL)
290 			continue;
291 		if (tbl->ar_root.ref != NULL)
292 			return (0);
293 	}
294 
295 	return (1);
296 }
297 
298 unsigned int
299 rtable_l2(unsigned int rtableid)
300 {
301 	struct dommp	*dmm;
302 	unsigned int	 rdomain = 0;
303 	struct srp_ref	 sr;
304 
305 	dmm = srp_enter(&sr, &afmap[0]);
306 	if (rtableid < dmm->limit)
307 		rdomain = (dmm->value[rtableid] & RT_TABLEID_MASK);
308 	srp_leave(&sr);
309 
310 	return (rdomain);
311 }
312 
313 unsigned int
314 rtable_loindex(unsigned int rtableid)
315 {
316 	struct dommp	*dmm;
317 	unsigned int	 loifidx = 0;
318 	struct srp_ref	 sr;
319 
320 	dmm = srp_enter(&sr, &afmap[0]);
321 	if (rtableid < dmm->limit)
322 		loifidx = (dmm->value[rtableid] >> RT_TABLEID_BITS);
323 	srp_leave(&sr);
324 
325 	return (loifidx);
326 }
327 
328 void
329 rtable_l2set(unsigned int rtableid, unsigned int rdomain, unsigned int loifidx)
330 {
331 	struct dommp	*dmm;
332 	unsigned int	 value;
333 
334 	KERNEL_ASSERT_LOCKED();
335 
336 	if (!rtable_exists(rtableid) || !rtable_exists(rdomain))
337 		return;
338 
339 	value = (rdomain & RT_TABLEID_MASK) | (loifidx << RT_TABLEID_BITS);
340 
341 	dmm = srp_get_locked(&afmap[0]);
342 	dmm->value[rtableid] = value;
343 }
344 
345 
346 static inline uint8_t	*satoaddr(struct art_root *, struct sockaddr *);
347 
348 int	an_match(struct art_node *, struct sockaddr *, int);
349 void	rtentry_ref(void *, void *);
350 void	rtentry_unref(void *, void *);
351 
352 void	rtable_mpath_insert(struct art_node *, struct rtentry *);
353 
354 struct srpl_rc rt_rc = SRPL_RC_INITIALIZER(rtentry_ref, rtentry_unref, NULL);
355 
356 void
357 rtable_init_backend(void)
358 {
359 	art_init();
360 }
361 
362 void *
363 rtable_alloc(unsigned int rtableid, unsigned int alen, unsigned int off)
364 {
365 	return (art_alloc(rtableid, alen, off));
366 }
367 
368 struct rtentry *
369 rtable_lookup(unsigned int rtableid, struct sockaddr *dst,
370     struct sockaddr *mask, struct sockaddr *gateway, uint8_t prio)
371 {
372 	struct art_root			*ar;
373 	struct art_node			*an;
374 	struct rtentry			*rt = NULL;
375 	struct srp_ref			 sr, nsr;
376 	uint8_t				*addr;
377 	int				 plen;
378 
379 	ar = rtable_get(rtableid, dst->sa_family);
380 	if (ar == NULL)
381 		return (NULL);
382 
383 	addr = satoaddr(ar, dst);
384 
385 	/* No need for a perfect match. */
386 	if (mask == NULL) {
387 		an = art_match(ar, addr, &nsr);
388 		if (an == NULL)
389 			goto out;
390 	} else {
391 		plen = rtable_satoplen(dst->sa_family, mask);
392 		if (plen == -1)
393 			return (NULL);
394 
395 		an = art_lookup(ar, addr, plen, &nsr);
396 
397 		/* Make sure we've got a perfect match. */
398 		if (!an_match(an, dst, plen))
399 			goto out;
400 	}
401 
402 	SRPL_FOREACH(rt, &sr, &an->an_rtlist, rt_next) {
403 		if (prio != RTP_ANY &&
404 		    (rt->rt_priority & RTP_MASK) != (prio & RTP_MASK))
405 			continue;
406 
407 		if (gateway == NULL)
408 			break;
409 
410 		if (rt->rt_gateway->sa_len == gateway->sa_len &&
411 		    memcmp(rt->rt_gateway, gateway, gateway->sa_len) == 0)
412 			break;
413 	}
414 	if (rt != NULL)
415 		rtref(rt);
416 
417 	SRPL_LEAVE(&sr);
418 out:
419 	srp_leave(&nsr);
420 
421 	return (rt);
422 }
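
/*
 * Illustrative usage sketch (assumption): with a non-NULL ``mask'',
 * rtable_lookup() requires an exact dst/mask match, optionally filtered
 * by gateway and priority, while rtable_match() below performs a
 * longest-prefix lookup on a destination only.  A route returned by
 * either function is referenced and must be released with rtfree().
 */
#if 0
static void
rtable_lookup_example(struct sockaddr *dst, struct sockaddr *mask)
{
	struct rtentry *rt;

	rt = rtable_lookup(0, dst, mask, NULL, RTP_ANY);
	if (rt != NULL)
		rtfree(rt);
}
#endif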
423 
424 struct rtentry *
425 rtable_match(unsigned int rtableid, struct sockaddr *dst, uint32_t *src)
426 {
427 	struct art_root			*ar;
428 	struct art_node			*an;
429 	struct rtentry			*rt = NULL;
430 	struct srp_ref			 sr, nsr;
431 	uint8_t				*addr;
432 	int				 hash;
433 
434 	ar = rtable_get(rtableid, dst->sa_family);
435 	if (ar == NULL)
436 		return (NULL);
437 
438 	addr = satoaddr(ar, dst);
439 
440 	an = art_match(ar, addr, &nsr);
441 	if (an == NULL)
442 		goto out;
443 
444 	rt = SRPL_FIRST(&sr, &an->an_rtlist);
445 	rtref(rt);
446 	SRPL_LEAVE(&sr);
447 
448 	/* Gateway selection by Hash-Threshold (RFC 2992) */
449 	if ((hash = rt_hash(rt, dst, src)) != -1) {
450 		struct rtentry		*mrt;
451 		int			 threshold, npaths = 0;
452 
453 		KASSERT(hash <= 0xffff);
454 
455 		SRPL_FOREACH(mrt, &sr, &an->an_rtlist, rt_next) {
456 			/* Only count nexthops with the same priority. */
457 			if (mrt->rt_priority == rt->rt_priority)
458 				npaths++;
459 		}
460 		SRPL_LEAVE(&sr);
461 
462 		threshold = (0xffff / npaths) + 1;
463 
464 		/*
465 		 * we have no protection against concurrent modification of the
466 		 * route list attached to the node, so we won't necessarily
467 		 * have the same number of routes.  for most modifications,
468 		 * we'll pick a route that we wouldn't have if we only saw the
469 		 * list before or after the change.  if we were going to use
470 		 * the last available route, but it got removed, we'll hit
471 		 * the end of the list and then pick the first route.
472 		 */
473 
474 		mrt = SRPL_FIRST(&sr, &an->an_rtlist);
475 		while (hash > threshold && mrt != NULL) {
476 			if (mrt->rt_priority == rt->rt_priority)
477 				hash -= threshold;
478 			mrt = SRPL_FOLLOW(&sr, mrt, rt_next);
479 		}
480 
481 		if (mrt != NULL) {
482 			rtref(mrt);
483 			rtfree(rt);
484 			rt = mrt;
485 		}
486 		SRPL_LEAVE(&sr);
487 	}
488 out:
489 	srp_leave(&nsr);
490 	return (rt);
491 }
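
/*
 * Illustrative sketch (assumption, not part of the original source) of the
 * hash-threshold selection above, without the SRP list handling: each of
 * ``npaths'' equal-priority nexthops owns a slice of roughly 0xffff/npaths
 * of the hash space, and the slice containing ``hash'' picks the nexthop.
 * The real loop above additionally skips entries of a different priority
 * and falls back to the first route if the list shrank underneath it.
 */
#if 0
static struct rtentry *
hash_threshold_pick(struct rtentry **paths, int npaths, int hash)
{
	int threshold = (0xffff / npaths) + 1;
	int i = 0;

	while (hash > threshold && i < npaths - 1) {
		hash -= threshold;
		i++;
	}
	return (paths[i]);
}
#endif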
492 
493 int
494 rtable_insert(unsigned int rtableid, struct sockaddr *dst,
495     struct sockaddr *mask, struct sockaddr *gateway, uint8_t prio,
496     struct rtentry *rt)
497 {
498 	struct rtentry			*mrt;
499 	struct srp_ref			 sr;
500 	struct art_root			*ar;
501 	struct art_node			*an, *prev;
502 	uint8_t				*addr;
503 	int				 plen;
504 	unsigned int			 rt_flags;
505 	int				 error = 0;
506 
507 	ar = rtable_get(rtableid, dst->sa_family);
508 	if (ar == NULL)
509 		return (EAFNOSUPPORT);
510 
511 	addr = satoaddr(ar, dst);
512 	plen = rtable_satoplen(dst->sa_family, mask);
513 	if (plen == -1)
514 		return (EINVAL);
515 
516 	rtref(rt); /* guarantee rtfree won't do anything during insert */
517 	rw_enter_write(&ar->ar_lock);
518 
519 	/* Do not permit exactly the same dst/mask/gw pair. */
520 	an = art_lookup(ar, addr, plen, &sr);
521 	srp_leave(&sr); /* an can't go away while we have the lock */
522 	if (an_match(an, dst, plen)) {
523 		struct rtentry  *mrt;
524 		int		 mpathok = ISSET(rt->rt_flags, RTF_MPATH);
525 
526 		SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next) {
527 			if (prio != RTP_ANY &&
528 			    (mrt->rt_priority & RTP_MASK) != (prio & RTP_MASK))
529 				continue;
530 
531 			if (!mpathok ||
532 			    (mrt->rt_gateway->sa_len == gateway->sa_len &&
533 			    !memcmp(mrt->rt_gateway, gateway, gateway->sa_len))){
534 				error = EEXIST;
535 				goto leave;
536 			}
537 		}
538 	}
539 
540 	an = art_get(dst, plen);
541 	if (an == NULL) {
542 		error = ENOBUFS;
543 		goto leave;
544 	}
545 
546 	/* prepare for immediate operation if insert succeeds */
547 	rt_flags = rt->rt_flags;
548 	rt->rt_flags &= ~RTF_MPATH;
549 	rt->rt_dest = dst;
550 	rt->rt_plen = plen;
551 	SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);
552 
553 	prev = art_insert(ar, an, addr, plen);
554 	if (prev != an) {
555 		SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry,
556 		    rt_next);
557 		rt->rt_flags = rt_flags;
558 		art_put(an);
559 
560 		if (prev == NULL) {
561 			error = ESRCH;
562 			goto leave;
563 		}
564 
565 		an = prev;
566 
567 		mrt = SRPL_FIRST_LOCKED(&an->an_rtlist);
568 		KASSERT(mrt != NULL);
569 		KASSERT((rt->rt_flags & RTF_MPATH) || mrt->rt_priority != prio);
570 
571 		/*
572 		 * An ART node with the same destination/netmask already
573 		 * exists; the MPATH conflict has already been checked above.
574 		 */
575 		if (rt->rt_flags & RTF_MPATH) {
576 			/*
577 			 * Only keep the RTF_MPATH flag if another route with
578 			 * the same priority (and thus a different gateway) exists.
579 			 */
580 			rt->rt_flags &= ~RTF_MPATH;
581 			SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next) {
582 				if (mrt->rt_priority == prio) {
583 					mrt->rt_flags |= RTF_MPATH;
584 					rt->rt_flags |= RTF_MPATH;
585 				}
586 			}
587 		}
588 
589 		/* Put newly inserted entry at the right place. */
590 		rtable_mpath_insert(an, rt);
591 	}
592 leave:
593 	rw_exit_write(&ar->ar_lock);
594 	rtfree(rt);
595 	return (error);
596 }
597 
598 int
599 rtable_delete(unsigned int rtableid, struct sockaddr *dst,
600     struct sockaddr *mask, struct rtentry *rt)
601 {
602 	struct art_root			*ar;
603 	struct art_node			*an;
604 	struct srp_ref			 sr;
605 	uint8_t				*addr;
606 	int				 plen;
607 	struct rtentry			*mrt;
608 	int				 npaths = 0;
609 	int				 error = 0;
610 
611 	ar = rtable_get(rtableid, dst->sa_family);
612 	if (ar == NULL)
613 		return (EAFNOSUPPORT);
614 
615 	addr = satoaddr(ar, dst);
616 	plen = rtable_satoplen(dst->sa_family, mask);
617 	if (plen == -1)
618 		return (EINVAL);
619 
620 	rtref(rt); /* guarantee rtfree won't do anything under ar_lock */
621 	rw_enter_write(&ar->ar_lock);
622 	an = art_lookup(ar, addr, plen, &sr);
623 	srp_leave(&sr); /* an can't go away while we have the lock */
624 
625 	/* Make sure we've got a perfect match. */
626 	if (!an_match(an, dst, plen)) {
627 		error = ESRCH;
628 		goto leave;
629 	}
630 
631 	/*
632 	 * If other multipath route entries are still attached to
633 	 * this ART node, we only have to unlink ``rt'' from the list.
634 	 */
635 	SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next)
636 		npaths++;
637 
638 	if (npaths > 1) {
639 		KASSERT(rt->rt_refcnt >= 1);
640 		SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry,
641 		    rt_next);
642 
643 		mrt = SRPL_FIRST_LOCKED(&an->an_rtlist);
644 		if (npaths == 2)
645 			mrt->rt_flags &= ~RTF_MPATH;
646 
647 		goto leave;
648 	}
649 
650 	if (art_delete(ar, an, addr, plen) == NULL)
651 		panic("art_delete failed to find node %p", an);
652 
653 	KASSERT(rt->rt_refcnt >= 1);
654 	SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry, rt_next);
655 	art_put(an);
656 
657 leave:
658 	rw_exit_write(&ar->ar_lock);
659 	rtfree(rt);
660 
661 	return (error);
662 }
663 
664 struct rtable_walk_cookie {
665 	int		(*rwc_func)(struct rtentry *, void *, unsigned int);
666 	void		 *rwc_arg;
667 	unsigned int	  rwc_rid;
668 };
669 
670 /*
671  * Helper for rtable_walk to keep the ART code free from any "struct rtentry".
672  */
673 int
674 rtable_walk_helper(struct art_node *an, void *xrwc)
675 {
676 	struct srp_ref			 sr;
677 	struct rtable_walk_cookie	*rwc = xrwc;
678 	struct rtentry			*rt;
679 	int				 error = 0;
680 
681 	SRPL_FOREACH(rt, &sr, &an->an_rtlist, rt_next) {
682 		if ((error = (*rwc->rwc_func)(rt, rwc->rwc_arg, rwc->rwc_rid)))
683 			break;
684 	}
685 	SRPL_LEAVE(&sr);
686 
687 	return (error);
688 }
689 
690 int
691 rtable_walk(unsigned int rtableid, sa_family_t af,
692     int (*func)(struct rtentry *, void *, unsigned int), void *arg)
693 {
694 	struct art_root			*ar;
695 	struct rtable_walk_cookie	 rwc;
696 	int				 error;
697 
698 	ar = rtable_get(rtableid, af);
699 	if (ar == NULL)
700 		return (EAFNOSUPPORT);
701 
702 	rwc.rwc_func = func;
703 	rwc.rwc_arg = arg;
704 	rwc.rwc_rid = rtableid;
705 
706 	while ((error = art_walk(ar, rtable_walk_helper, &rwc)) == EAGAIN)
707 		continue;
708 
709 	return (error);
710 }
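
/*
 * Illustrative usage sketch (assumption): walking every IPv4 route of
 * rtableid 0.  The callback name is hypothetical; returning non-zero stops
 * the walk and returning EAGAIN makes rtable_walk() restart it.
 */
#if 0
static int
walk_print_one(struct rtentry *rt, void *arg, unsigned int rtableid)
{
	/* Inspect ``rt'' here. */
	return (0);	/* keep walking */
}

static int
rtable_walk_example(void)
{
	return (rtable_walk(0, AF_INET, walk_print_one, NULL));
}
#endif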
711 
712 struct rtentry *
713 rtable_iterate(struct rtentry *rt0)
714 {
715 	struct rtentry *rt = NULL;
716 	struct srp_ref sr;
717 
718 	rt = SRPL_NEXT(&sr, rt0, rt_next);
719 	if (rt != NULL)
720 		rtref(rt);
721 	SRPL_LEAVE(&sr);
722 	rtfree(rt0);
723 	return (rt);
724 }
725 
726 int
727 rtable_mpath_capable(unsigned int rtableid, sa_family_t af)
728 {
729 	return (1);
730 }
731 
732 int
733 rtable_mpath_reprio(unsigned int rtableid, struct sockaddr *dst,
734     int plen, uint8_t prio, struct rtentry *rt)
735 {
736 	struct art_root			*ar;
737 	struct art_node			*an;
738 	struct srp_ref			 sr;
739 	uint8_t				*addr;
740 	int				 error = 0;
741 
742 	ar = rtable_get(rtableid, dst->sa_family);
743 	if (ar == NULL)
744 		return (EAFNOSUPPORT);
745 
746 	addr = satoaddr(ar, dst);
747 
748 	rw_enter_write(&ar->ar_lock);
749 	an = art_lookup(ar, addr, plen, &sr);
750 	srp_leave(&sr); /* an can't go away while we have the lock */
751 
752 	/* Make sure we've got a perfect match. */
753 	if (!an_match(an, dst, plen)) {
754 		error = ESRCH;
755 	} else if (SRPL_FIRST_LOCKED(&an->an_rtlist) == rt &&
756 		SRPL_NEXT_LOCKED(rt, rt_next) == NULL) {
757 		/*
758 		 * If there's only one entry on the list, do not go
759 		 * through an insert/remove cycle.  This is done to
760 		 * guarantee that ``an->an_rtlist'' is never empty
761 		 * when a node is in the tree.
762 		 */
763 		rt->rt_priority = prio;
764 	} else {
765 		rtref(rt); /* keep rt alive in between remove and insert */
766 		SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist,
767 		    rt, rtentry, rt_next);
768 		rt->rt_priority = prio;
769 		rtable_mpath_insert(an, rt);
770 		rtfree(rt);
771 		error = EAGAIN;
772 	}
773 	rw_exit_write(&ar->ar_lock);
774 
775 	return (error);
776 }
777 
778 void
779 rtable_mpath_insert(struct art_node *an, struct rtentry *rt)
780 {
781 	struct rtentry			*mrt, *prt = NULL;
782 	uint8_t				 prio = rt->rt_priority;
783 
784 	if ((mrt = SRPL_FIRST_LOCKED(&an->an_rtlist)) == NULL) {
785 		SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);
786 		return;
787 	}
788 
789 	/* Iterate until we find the entry after which ``rt'' has to be inserted. */
790 	while (mrt->rt_priority <= prio && SRPL_NEXT_LOCKED(mrt, rt_next)) {
791 		prt = mrt;
792 		mrt = SRPL_NEXT_LOCKED(mrt, rt_next);
793 	}
794 
795 	if (mrt->rt_priority <= prio) {
796 		SRPL_INSERT_AFTER_LOCKED(&rt_rc, mrt, rt, rt_next);
797 	} else if (prt != NULL) {
798 		SRPL_INSERT_AFTER_LOCKED(&rt_rc, prt, rt, rt_next);
799 	} else {
800 		SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);
801 	}
802 }
803 
804 /*
805  * Returns 1 if ``an'' perfectly matches (``dst'', ``plen''), 0 otherwise.
806  */
807 int
808 an_match(struct art_node *an, struct sockaddr *dst, int plen)
809 {
810 	struct rtentry			*rt;
811 	struct srp_ref			 sr;
812 	int				 match;
813 
814 	if (an == NULL || an->an_plen != plen)
815 		return (0);
816 
817 	rt = SRPL_FIRST(&sr, &an->an_rtlist);
818 	match = (memcmp(rt->rt_dest, dst, dst->sa_len) == 0);
819 	SRPL_LEAVE(&sr);
820 
821 	return (match);
822 }
823 
824 void
825 rtentry_ref(void *null, void *xrt)
826 {
827 	struct rtentry *rt = xrt;
828 
829 	rtref(rt);
830 }
831 
832 void
833 rtentry_unref(void *null, void *xrt)
834 {
835 	struct rtentry *rt = xrt;
836 
837 	rtfree(rt);
838 }
839 
840 /*
841  * Return a pointer to the address (key).  This is a legacy of the
842  * BSD radix tree, needed to skip the non-address fields of the flavor
843  * of "struct sockaddr" used by this routing table.
844  */
845 static inline uint8_t *
846 satoaddr(struct art_root *at, struct sockaddr *sa)
847 {
848 	return (((uint8_t *)sa) + at->ar_off);
849 }
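
/*
 * Illustrative sketch (assumption, not from the original file): for AF_INET,
 * ``ar_off'' is the domain's dom_rtoffset, i.e. the offset of sin_addr
 * within struct sockaddr_in, so satoaddr() yields a pointer to the raw
 * address bytes used as the ART key.
 */
#if 0
static void
satoaddr_example(struct art_root *ar, struct sockaddr_in *sin)
{
	uint8_t *key = satoaddr(ar, (struct sockaddr *)sin);

	KASSERT(key == (uint8_t *)&sin->sin_addr);
}
#endif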
850 
851 /*
852  * Return the prefix length of a mask.
853  */
854 int
855 rtable_satoplen(sa_family_t af, struct sockaddr *mask)
856 {
857 	struct domain	*dp;
858 	uint8_t		*ap, *ep;
859 	int		 mlen, plen = 0;
860 	int		 i;
861 
862 	for (i = 0; (dp = domains[i]) != NULL; i++) {
863 		if (dp->dom_rtoffset == 0)
864 			continue;
865 
866 		if (af == dp->dom_family)
867 			break;
868 	}
869 	if (dp == NULL)
870 		return (-1);
871 
872 	/* Host route */
873 	if (mask == NULL)
874 		return (dp->dom_maxplen);
875 
876 	mlen = mask->sa_len;
877 
878 	/* Default route */
879 	if (mlen == 0)
880 		return (0);
881 
882 	ap = (uint8_t *)((uint8_t *)mask) + dp->dom_rtoffset;
883 	ep = (uint8_t *)((uint8_t *)mask) + mlen;
884 	if (ap > ep)
885 		return (-1);
886 
887 	/* Trim trailing zeroes. */
888 	while (ap < ep && ep[-1] == 0)
889 		ep--;
890 
891 	if (ap == ep)
892 		return (0);
893 
894 	/* "Beauty" adapted from sbin/route/show.c ... */
895 	while (ap < ep) {
896 		switch (*ap++) {
897 		case 0xff:
898 			plen += 8;
899 			break;
900 		case 0xfe:
901 			plen += 7;
902 			goto out;
903 		case 0xfc:
904 			plen += 6;
905 			goto out;
906 		case 0xf8:
907 			plen += 5;
908 			goto out;
909 		case 0xf0:
910 			plen += 4;
911 			goto out;
912 		case 0xe0:
913 			plen += 3;
914 			goto out;
915 		case 0xc0:
916 			plen += 2;
917 			goto out;
918 		case 0x80:
919 			plen += 1;
920 			goto out;
921 		default:
922 			/* Non-contiguous mask. */
923 			return (-1);
924 		}
925 	}
926 
927 out:
928 	if (plen > dp->dom_maxplen || ap != ep)
929 		return (-1);
930 
931 	return (plen);
932 }
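
/*
 * Illustrative sketch (assumption, not part of the original source):
 * converting a classic IPv4 netmask into a prefix length with the function
 * above.  A NULL mask means a host route (dom_maxplen) and a zero-length
 * mask means the default route (prefix length 0).
 */
#if 0
static void
satoplen_example(void)
{
	struct sockaddr_in mask;

	memset(&mask, 0, sizeof(mask));
	mask.sin_len = sizeof(mask);
	mask.sin_family = AF_INET;
	mask.sin_addr.s_addr = htonl(0xffffff00);	/* 255.255.255.0 */

	KASSERT(rtable_satoplen(AF_INET, (struct sockaddr *)&mask) == 24);
	KASSERT(rtable_satoplen(AF_INET, NULL) == 32);	/* host route */
}
#endif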
933