xref: /openbsd-src/sys/net/rtable.c (revision f2da64fbbbf1b03f09f390ab01267c93dfd77c4c)
1 /*	$OpenBSD: rtable.c,v 1.52 2016/09/07 09:36:49 mpi Exp $ */
2 
3 /*
4  * Copyright (c) 2014-2015 Martin Pieuchot
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #ifndef _KERNEL
20 #include "kern_compat.h"
21 #else
22 #include <sys/param.h>
23 #include <sys/systm.h>
24 #include <sys/socket.h>
25 #include <sys/malloc.h>
26 #include <sys/queue.h>
27 #include <sys/domain.h>
28 #include <sys/srp.h>
29 #endif
30 
31 #include <net/rtable.h>
32 #include <net/route.h>
33 
34 /*
35  * Structures used by rtable_get() to retrieve the corresponding
36  * routing table for a given pair of ``af'' and ``rtableid''.
37  *
38  * Note that once allocated routing table heads are never freed.
39  * This way we do not need to reference count them.
40  *
41  *	afmap		    rtmap/dommp
42  *   -----------          ---------     -----
43  *   |   0     |--------> | 0 | 0 | ... | 0 |	Array mapping rtableid (=index)
44  *   -----------          ---------     -----   to rdomain (=value).
45  *   | AF_INET |.
46  *   ----------- `.       .---------.     .---------.
47  *       ...	   `----> | rtable0 | ... | rtableN |	Array of pointers for
48  *   -----------          '---------'     '---------'	IPv4 routing tables
49  *   | AF_MPLS |            			 	indexed by ``rtableid''.
50  *   -----------
51  */
/* Per-AF SRP pointers; slot 0 holds the rtableid->rdomain map. */
struct srp	  *afmap;
uint8_t		   af2idx[AF_MAX+1];	/* To only allocate supported AF */
uint8_t		   af2idx_max;		/* number of slots in ``afmap'' - 1 */

/* Array of routing table pointers. */
struct rtmap {
	unsigned int	   limit;	/* number of entries in ``tbl'' */
	void		 **tbl;		/* backend table per rtableid */
};

/* Array of rtableid -> rdomain mapping. */
struct dommp {
	unsigned int	   limit;	/* number of entries in ``dom'' */
	unsigned int	  *dom;		/* rdomain (value) per rtableid (index) */
};

/* Highest rtableid + 1 currently mapped, common to all AFs. */
unsigned int	   rtmap_limit = 0;

void		   rtmap_init(void);
void		   rtmap_grow(unsigned int, sa_family_t);
void		   rtmap_dtor(void *, void *);

/* SRP garbage collector context used to free replaced rtmap arrays. */
struct srp_gc	   rtmap_gc = SRP_GC_INITIALIZER(rtmap_dtor, NULL);

void		   rtable_init_backend(unsigned int);
void		  *rtable_alloc(unsigned int, unsigned int, unsigned int);
void		  *rtable_get(unsigned int, sa_family_t);
79 
80 void
81 rtmap_init(void)
82 {
83 	struct domain	*dp;
84 	int		 i;
85 
86 	/* Start with a single table for every domain that requires it. */
87 	for (i = 0; (dp = domains[i]) != NULL; i++) {
88 		if (dp->dom_rtoffset == 0)
89 			continue;
90 
91 		rtmap_grow(1, dp->dom_family);
92 	}
93 
94 	/* Initialize the rtableid->rdomain mapping table. */
95 	rtmap_grow(1, 0);
96 
97 	rtmap_limit = 1;
98 }
99 
/*
 * Grow the size of the array of routing table for AF ``af'' to ``nlimit''.
 *
 * A bigger array is allocated, existing entries are copied over, and
 * the SRP pointer is atomically switched.  Readers still iterating on
 * the old array keep a valid view until rtmap_dtor() reclaims it.
 */
void
rtmap_grow(unsigned int nlimit, sa_family_t af)
{
	struct rtmap	*map, *nmap;
	int		 i;

	KERNEL_ASSERT_LOCKED();

	KASSERT(nlimit > rtmap_limit);

	nmap = malloc(sizeof(*nmap), M_RTABLE, M_WAITOK);
	nmap->limit = nlimit;
	nmap->tbl = mallocarray(nlimit, sizeof(*nmap[0].tbl), M_RTABLE,
	    M_WAITOK|M_ZERO);

	/* Carry over the table pointers already allocated, if any. */
	map = srp_get_locked(&afmap[af2idx[af]]);
	if (map != NULL) {
		KASSERT(map->limit == rtmap_limit);

		for (i = 0; i < map->limit; i++)
			nmap->tbl[i] = map->tbl[i];
	}

	/* Publish the new array; the old one is freed by rtmap_dtor(). */
	srp_update_locked(&rtmap_gc, &afmap[af2idx[af]], nmap);
}
128 
129 void
130 rtmap_dtor(void *null, void *xmap)
131 {
132 	struct rtmap	*map = xmap;
133 
134 	/*
135 	 * doesnt need to be serialized since this is the last reference
136 	 * to this map. there's nothing to race against.
137 	 */
138 	free(map->tbl, M_RTABLE, map->limit * sizeof(*map[0].tbl));
139 	free(map, M_RTABLE, sizeof(*map));
140 }
141 
/*
 * Compute the AF-to-index mapping, initialize the backend and
 * allocate the per-AF SRP array.  Called once at boot.
 */
void
rtable_init(void)
{
	struct domain	*dp;
	unsigned int	 keylen = 0;
	int		 i;

	/* We use index 0 for the rtable/rdomain map. */
	af2idx_max = 1;
	memset(af2idx, 0, sizeof(af2idx));

	/*
	 * Compute the maximum supported key length in case the routing
	 * table backend needs it.
	 */
	for (i = 0; (dp = domains[i]) != NULL; i++) {
		if (dp->dom_rtoffset == 0)
			continue;

		af2idx[dp->dom_family] = af2idx_max++;
		if (dp->dom_rtkeylen > keylen)
			keylen = dp->dom_rtkeylen;

	}
	rtable_init_backend(keylen);

	/*
	 * Allocate AF-to-id table now that we know how many AFs this
	 * kernel supports.
	 */
	afmap = mallocarray(af2idx_max + 1, sizeof(*afmap), M_RTABLE,
	    M_WAITOK|M_ZERO);

	rtmap_init();
}
177 
/*
 * Create routing table ``id'' for every domain that does routing and
 * map it to the main rdomain (0) by default.
 *
 * Returns 0 on success, EINVAL for an out-of-range id, EEXIST if the
 * table is already present, ENOMEM on allocation failure.
 *
 * NOTE(review): if rtable_alloc() fails after some AFs have already
 * been populated, the tables allocated so far are left in place and
 * rtmap_limit is not updated — confirm callers tolerate this partial
 * state.
 */
int
rtable_add(unsigned int id)
{
	struct domain	*dp;
	void		*tbl;
	struct rtmap	*map;
	struct dommp	*dmm;
	sa_family_t	 af;
	unsigned int	 off, alen;
	int		 i;

	KERNEL_ASSERT_LOCKED();

	if (id > RT_TABLEID_MAX)
		return (EINVAL);

	if (rtable_exists(id))
		return (EEXIST);

	for (i = 0; (dp = domains[i]) != NULL; i++) {
		if (dp->dom_rtoffset == 0)
			continue;

		af = dp->dom_family;
		off = dp->dom_rtoffset;
		alen = dp->dom_maxplen;

		/* Grow this AF's pointer array if ``id'' does not fit. */
		if (id >= rtmap_limit)
			rtmap_grow(id + 1, af);

		tbl = rtable_alloc(id, alen, off);
		if (tbl == NULL)
			return (ENOMEM);

		map = srp_get_locked(&afmap[af2idx[af]]);
		map->tbl[id] = tbl;
	}

	/* Reflect possible growth. */
	if (id >= rtmap_limit) {
		rtmap_grow(id + 1, 0);
		rtmap_limit = id + 1;
	}

	/* Use main rtable/rdomain by default. */
	dmm = srp_get_locked(&afmap[0]);
	dmm->dom[id] = 0;

	return (0);
}
228 
229 void *
230 rtable_get(unsigned int rtableid, sa_family_t af)
231 {
232 	struct rtmap	*map;
233 	void		*tbl = NULL;
234 	struct srp_ref	 sr;
235 
236 	if (af >= nitems(af2idx) || af2idx[af] == 0)
237 		return (NULL);
238 
239 	map = srp_enter(&sr, &afmap[af2idx[af]]);
240 	if (rtableid < map->limit)
241 		tbl = map->tbl[rtableid];
242 	srp_leave(&sr);
243 
244 	return (tbl);
245 }
246 
247 int
248 rtable_exists(unsigned int rtableid)
249 {
250 	struct domain	*dp;
251 	void		*tbl;
252 	int		 i;
253 
254 	for (i = 0; (dp = domains[i]) != NULL; i++) {
255 		if (dp->dom_rtoffset == 0)
256 			continue;
257 
258 		tbl = rtable_get(rtableid, dp->dom_family);
259 		if (tbl != NULL)
260 			return (1);
261 	}
262 
263 	return (0);
264 }
265 
266 unsigned int
267 rtable_l2(unsigned int rtableid)
268 {
269 	struct dommp	*dmm;
270 	unsigned int	 rdomain = 0;
271 	struct srp_ref	 sr;
272 
273 	dmm = srp_enter(&sr, &afmap[0]);
274 	if (rtableid < dmm->limit)
275 		rdomain = dmm->dom[rtableid];
276 	srp_leave(&sr);
277 
278 	return (rdomain);
279 }
280 
281 void
282 rtable_l2set(unsigned int rtableid, unsigned int rdomain)
283 {
284 	struct dommp	*dmm;
285 
286 	KERNEL_ASSERT_LOCKED();
287 
288 	if (!rtable_exists(rtableid) || !rtable_exists(rdomain))
289 		return;
290 
291 	dmm = srp_get_locked(&afmap[0]);
292 	dmm->dom[rtableid] = rdomain;
293 }
294 
295 #ifndef ART
/* Initialize the BSD radix-trie backend. */
void
rtable_init_backend(unsigned int keylen)
{
	rn_init(keylen); /* initialize all zeroes, all ones, mask table */
}
301 
/*
 * Allocate a radix table head for ``rtableid''.  ``alen'' is unused
 * by this backend.  Returns NULL if rn_inithead() fails.
 */
void *
rtable_alloc(unsigned int rtableid, unsigned int alen, unsigned int off)
{
	struct radix_node_head *rnh = NULL;

	if (rn_inithead((void **)&rnh, off)) {
#ifndef SMALL_KERNEL
		rnh->rnh_multipath = 1;
#endif /* SMALL_KERNEL */
		rnh->rnh_rtableid = rtableid;
	}

	return (rnh);
}
316 
/*
 * Return a referenced route entry matching ``dst''/``mask'' (and, on
 * multipath kernels, ``gateway''/``prio'') or NULL.  The caller must
 * rtfree() the result.
 */
struct rtentry *
rtable_lookup(unsigned int rtableid, struct sockaddr *dst,
    struct sockaddr *mask, struct sockaddr *gateway, uint8_t prio)
{
	struct radix_node_head	*rnh;
	struct radix_node	*rn;
	struct rtentry		*rt;

	rnh = rtable_get(rtableid, dst->sa_family);
	if (rnh == NULL)
		return (NULL);

	rn = rn_lookup(dst, mask, rnh);
	/* Tree-internal (RNF_ROOT) nodes are not real routes. */
	if (rn == NULL || (rn->rn_flags & RNF_ROOT) != 0)
		return (NULL);

	rt = ((struct rtentry *)rn);

#ifndef SMALL_KERNEL
	if (rnh->rnh_multipath) {
		/* Narrow down to the entry with matching gateway/prio. */
		rt = rt_mpath_matchgate(rt, gateway, prio);
		if (rt == NULL)
			return (NULL);
	}
#endif /* !SMALL_KERNEL */

	rtref(rt);
	return (rt);
}
346 
/*
 * Return a referenced route entry giving the best match for ``dst''
 * or NULL.  On multipath kernels one of the equal-cost next hops is
 * selected by Hash-Threshold (RFC 2992) keyed on ``src''.
 *
 * NOTE(review): the reference accounting around rtable_iterate() —
 * which consumes one reference per step — looks imbalanced in some
 * paths (e.g. when ``hash'' is already <= ``threshold'' the entry
 * seems returned with one reference too few); verify against the
 * rtref/rtfree semantics before relying on it.
 */
struct rtentry *
rtable_match(unsigned int rtableid, struct sockaddr *dst, uint32_t *src)
{
	struct radix_node_head	*rnh;
	struct radix_node	*rn;
	struct rtentry		*rt = NULL;
#ifndef SMALL_KERNEL
	int			 hash;
#endif /* SMALL_KERNEL */

	rnh = rtable_get(rtableid, dst->sa_family);
	if (rnh == NULL)
		return (NULL);

	KERNEL_LOCK();
	rn = rn_match(dst, rnh);
	if (rn == NULL || (rn->rn_flags & RNF_ROOT) != 0)
		goto out;

	rt = ((struct rtentry *)rn);
	rtref(rt);

#ifndef SMALL_KERNEL
	/* Gateway selection by Hash-Threshold (RFC 2992) */
	if ((hash = rt_hash(rt, dst, src)) != -1) {
		struct rtentry		*mrt = rt;
		int			 threshold, npaths = 1;

		KASSERT(hash <= 0xffff);

		/* Extra reference consumed while counting the paths. */
		rtref(rt);

		while ((mrt = rtable_iterate(mrt)) != NULL)
			npaths++;

		threshold = (0xffff / npaths) + 1;

		mrt = rt;
		while (hash > threshold && mrt != NULL) {
			/* stay within the multipath routes */
			mrt = rtable_iterate(mrt);
			hash -= threshold;
		}

		/* if gw selection fails, use the first match (default) */
		if (mrt != NULL) {
			rtfree(rt);
			rt = mrt;
		}
	}
#endif /* SMALL_KERNEL */
out:
	KERNEL_UNLOCK();
	return (rt);
}
402 
/*
 * Link ``rt'' into the table for ``dst''/``mask''.  On multipath
 * kernels an entry with identical dst/mask/gateway is refused with
 * EEXIST.  On success a reference is taken on ``rt''.
 */
int
rtable_insert(unsigned int rtableid, struct sockaddr *dst,
    struct sockaddr *mask, struct sockaddr *gateway, uint8_t prio,
    struct rtentry *rt)
{
	struct radix_node_head	*rnh;
	struct radix_node	*rn = (struct radix_node *)rt;

	rnh = rtable_get(rtableid, dst->sa_family);
	if (rnh == NULL)
		return (EAFNOSUPPORT);

#ifndef SMALL_KERNEL
	if (rnh->rnh_multipath) {
		/* Do not permit exactly the same dst/mask/gw pair. */
		if (rt_mpath_conflict(rnh, dst, mask, gateway, prio,
		    ISSET(rt->rt_flags, RTF_MPATH))) {
			return (EEXIST);
		}
	}
#endif /* SMALL_KERNEL */

	rn = rn_addroute(dst, mask, rnh, rn, prio);
	if (rn == NULL)
		return (ESRCH);

	rt = ((struct rtentry *)rn);
	rtref(rt);

	return (0);
}
434 
/*
 * Unlink ``rt'' from the table and drop its table reference.
 * Returns ESRCH if no matching node is found.
 */
int
rtable_delete(unsigned int rtableid, struct sockaddr *dst,
    struct sockaddr *mask, struct rtentry *rt)
{
	struct radix_node_head	*rnh;
	struct radix_node	*rn = (struct radix_node *)rt;

	rnh = rtable_get(rtableid, dst->sa_family);
	if (rnh == NULL)
		return (EAFNOSUPPORT);

	rn = rn_delete(dst, mask, rnh, rn);
	if (rn == NULL)
		return (ESRCH);

	/* A removed node must be inactive and not a tree-internal root. */
	if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT))
		panic("active node flags=%x", rn->rn_flags);

	rt = ((struct rtentry *)rn);
	rtfree(rt);

	return (0);
}
458 
/*
 * Invoke ``func'' on every entry of address family ``af''.  The walk
 * restarts while the backend reports EAGAIN.
 *
 * NOTE(review): the cast of ``func'' to a radix-node callback relies
 * on struct rtentry starting with its radix nodes — confirm layout.
 */
int
rtable_walk(unsigned int rtableid, sa_family_t af,
    int (*func)(struct rtentry *, void *, unsigned int), void *arg)
{
	struct radix_node_head	*rnh;
	int (*f)(struct radix_node *, void *, unsigned int) = (void *)func;
	int error;

	rnh = rtable_get(rtableid, af);
	if (rnh == NULL)
		return (EAFNOSUPPORT);

	while ((error = rn_walktree(rnh, f, arg)) == EAGAIN)
		continue;

	return (error);
}
476 
/*
 * Return the next route of the multipath chain, referenced, or NULL
 * at the end of the chain.  Always consumes the caller's reference
 * on ``rt0''.
 */
struct rtentry *
rtable_iterate(struct rtentry *rt0)
{
#ifndef SMALL_KERNEL
	struct radix_node *rn = (struct radix_node *)rt0;
	struct rtentry *rt;

	rt = (struct rtentry *)rn_mpath_next(rn, RMP_MODE_ACTIVE);
	if (rt != NULL)
		rtref(rt);
	rtfree(rt0);

	return (rt);
#else
	rtfree(rt0);
	return (NULL);
#endif /* SMALL_KERNEL */
}
495 
496 #ifndef SMALL_KERNEL
497 int
498 rtable_mpath_capable(unsigned int rtableid, sa_family_t af)
499 {
500 	struct radix_node_head	*rnh;
501 	int mpath;
502 
503 	rnh = rtable_get(rtableid, af);
504 	if (rnh == NULL)
505 		return (0);
506 
507 	mpath = rnh->rnh_multipath;
508 	return (mpath);
509 }
510 
/*
 * Change the priority of ``rt'' within its multipath chain.  The
 * radix backend reorders the dupedkey chain in place, so the
 * rtableid/dst/mask arguments are unused here.
 */
int
rtable_mpath_reprio(unsigned int rtableid, struct sockaddr *dst,
    struct sockaddr *mask, uint8_t prio, struct rtentry *rt)
{
	struct radix_node	*rn = (struct radix_node *)rt;

	rn_mpath_reprio(rn, prio);

	return (0);
}
521 #endif /* SMALL_KERNEL */
522 
523 #else /* ART */
524 
static inline uint8_t	*satoaddr(struct art_root *, struct sockaddr *);

void	rtentry_ref(void *, void *);
void	rtentry_unref(void *, void *);

#ifndef SMALL_KERNEL
void	rtable_mpath_insert(struct art_node *, struct rtentry *);
#endif

/* SRP-list callbacks keeping route refcounts in sync with list links. */
struct srpl_rc rt_rc = SRPL_RC_INITIALIZER(rtentry_ref, rtentry_unref, NULL);
535 
/* Initialize the ART backend; ``keylen'' is not needed by ART. */
void
rtable_init_backend(unsigned int keylen)
{
	art_init();
}
541 
/*
 * Allocate an ART root for ``rtableid'' with address length ``alen''
 * bits and key offset ``off'' bytes.
 */
void *
rtable_alloc(unsigned int rtableid, unsigned int alen, unsigned int off)
{
	return (art_alloc(rtableid, alen, off));
}
547 
/*
 * Return a referenced route entry matching ``dst'' or NULL.  With a
 * NULL ``mask'' the longest-prefix match is returned; otherwise only
 * a perfect dst/mask match qualifies.  On multipath kernels the
 * entry is further narrowed down by ``gateway'' and ``prio''.  The
 * caller must rtfree() the result.
 */
struct rtentry *
rtable_lookup(unsigned int rtableid, struct sockaddr *dst,
    struct sockaddr *mask, struct sockaddr *gateway, uint8_t prio)
{
	struct art_root			*ar;
	struct art_node			*an;
	struct rtentry			*rt = NULL;
	struct srp_ref			 sr, nsr;
	uint8_t				*addr;
	int				 plen;

	ar = rtable_get(rtableid, dst->sa_family);
	if (ar == NULL)
		return (NULL);

	addr = satoaddr(ar, dst);

	/* No need for a perfect match. */
	if (mask == NULL) {
		an = art_match(ar, addr, &nsr);
		if (an == NULL)
			goto out;
	} else {
		plen = rtable_satoplen(dst->sa_family, mask);
		if (plen == -1)
			return (NULL);

		an = art_lookup(ar, addr, plen, &nsr);

		/* Make sure we've got a perfect match. */
		if (an == NULL || an->an_plen != plen ||
		    memcmp(an->an_dst, dst, dst->sa_len))
			goto out;
	}

#ifdef SMALL_KERNEL
	rt = SRPL_ENTER(&sr, &an->an_rtlist);
#else
	/* Pick the entry matching the requested priority and gateway. */
	SRPL_FOREACH(rt, &sr, &an->an_rtlist, rt_next) {
		if (prio != RTP_ANY &&
		    (rt->rt_priority & RTP_MASK) != (prio & RTP_MASK))
			continue;

		if (gateway == NULL)
			break;

		if (rt->rt_gateway->sa_len == gateway->sa_len &&
		    memcmp(rt->rt_gateway, gateway, gateway->sa_len) == 0)
			break;
	}
#endif /* SMALL_KERNEL */
	if (rt != NULL)
		rtref(rt);

	SRPL_LEAVE(&sr);
out:
	srp_leave(&nsr);

	return (rt);
}
608 
/*
 * Return a referenced route entry giving the best match for ``dst''
 * or NULL.  On multipath kernels one of the equal-priority next hops
 * is selected by Hash-Threshold (RFC 2992) keyed on ``src''.  The
 * caller must rtfree() the result.
 */
struct rtentry *
rtable_match(unsigned int rtableid, struct sockaddr *dst, uint32_t *src)
{
	struct art_root			*ar;
	struct art_node			*an;
	struct rtentry			*rt = NULL;
	struct srp_ref			 sr, nsr;
	uint8_t				*addr;
#ifndef SMALL_KERNEL
	int				 hash;
#endif /* SMALL_KERNEL */

	ar = rtable_get(rtableid, dst->sa_family);
	if (ar == NULL)
		return (NULL);

	addr = satoaddr(ar, dst);

	an = art_match(ar, addr, &nsr);
	if (an == NULL)
		goto out;

	/* assumes an_rtlist is never empty for a node in the table */
	rt = SRPL_ENTER(&sr, &an->an_rtlist);
	rtref(rt);
	SRPL_LEAVE(&sr);

#ifndef SMALL_KERNEL
	/* Gateway selection by Hash-Threshold (RFC 2992) */
	if ((hash = rt_hash(rt, dst, src)) != -1) {
		struct rtentry		*mrt;
		int			 threshold, npaths = 0;

		KASSERT(hash <= 0xffff);

		SRPL_FOREACH(mrt, &sr, &an->an_rtlist, rt_next) {
			/* Only count nexthops with the same priority. */
			if (mrt->rt_priority == rt->rt_priority)
				npaths++;
		}
		SRPL_LEAVE(&sr);

		threshold = (0xffff / npaths) + 1;

		/*
		 * we have no protection against concurrent modification of the
		 * route list attached to the node, so we won't necessarily
		 * have the same number of routes.  for most modifications,
		 * we'll pick a route that we wouldn't have if we only saw the
		 * list before or after the change.  if we were going to use
		 * the last available route, but it got removed, we'll hit
		 * the end of the list and then pick the first route.
		 */

		mrt = SRPL_ENTER(&sr, &an->an_rtlist);
		while (hash > threshold && mrt != NULL) {
			if (mrt->rt_priority == rt->rt_priority)
				hash -= threshold;
			mrt = SRPL_NEXT(&sr, mrt, rt_next);
		}

		/* if gw selection fails, keep the first match (default) */
		if (mrt != NULL) {
			rtref(mrt);
			rtfree(rt);
			rt = mrt;
		}
		SRPL_LEAVE(&sr);
	}
#endif /* SMALL_KERNEL */
out:
	srp_leave(&nsr);
	return (rt);
}
681 
/*
 * Link ``rt'' into the table for ``dst''/``mask''.  On multipath
 * kernels an entry with identical dst/mask/gateway is refused with
 * EEXIST.  If a node with the same prefix already exists, ``rt'' is
 * inserted into that node's route list instead, ordered by priority.
 * On success the table holds a reference on ``rt''.
 */
int
rtable_insert(unsigned int rtableid, struct sockaddr *dst,
    struct sockaddr *mask, struct sockaddr *gateway, uint8_t prio,
    struct rtentry *rt)
{
#ifndef SMALL_KERNEL
	struct rtentry			*mrt;
	struct srp_ref			 sr;
#endif /* SMALL_KERNEL */
	struct art_root			*ar;
	struct art_node			*an, *prev;
	uint8_t				*addr;
	int				 plen;
	unsigned int			 rt_flags;
	int				 error = 0;

	ar = rtable_get(rtableid, dst->sa_family);
	if (ar == NULL)
		return (EAFNOSUPPORT);

	addr = satoaddr(ar, dst);
	plen = rtable_satoplen(dst->sa_family, mask);
	if (plen == -1)
		return (EINVAL);

	rtref(rt); /* guarantee rtfree won't do anything during insert */
	rw_enter_write(&ar->ar_lock);

#ifndef SMALL_KERNEL
	/* Do not permit exactly the same dst/mask/gw pair. */
	an = art_lookup(ar, addr, plen, &sr);
	srp_leave(&sr); /* an can't go away while we have the lock */
	if (an != NULL && an->an_plen == plen &&
	    !memcmp(an->an_dst, dst, dst->sa_len)) {
		struct rtentry  *mrt;
		int		 mpathok = ISSET(rt->rt_flags, RTF_MPATH);

		SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next) {
			if (prio != RTP_ANY &&
			    (mrt->rt_priority & RTP_MASK) != (prio & RTP_MASK))
				continue;

			if (!mpathok ||
			    (mrt->rt_gateway->sa_len == gateway->sa_len &&
			    !memcmp(mrt->rt_gateway, gateway, gateway->sa_len))){
				error = EEXIST;
				goto leave;
			}
		}
	}
#endif /* SMALL_KERNEL */

	an = art_get(dst, plen);
	if (an == NULL) {
		error = ENOBUFS;
		goto leave;
	}

	/* prepare for immediate operation if insert succeeds */
	rt_flags = rt->rt_flags;
	rt->rt_flags &= ~RTF_MPATH;
	rt->rt_dest = dst;
	rt->rt_plen = plen;
	SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);

	prev = art_insert(ar, an, addr, plen);
	if (prev != an) {
		/* Insert did not use our node: undo the speculative setup. */
		SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry,
		    rt_next);
		rt->rt_flags = rt_flags;
		art_put(an);

		if (prev == NULL) {
			error = ESRCH;
			goto leave;
		}

#ifndef SMALL_KERNEL
		an = prev;

		mrt = SRPL_FIRST_LOCKED(&an->an_rtlist);
		KASSERT(mrt != NULL);
		KASSERT((rt->rt_flags & RTF_MPATH) || mrt->rt_priority != prio);

		/*
		 * An ART node with the same destination/netmask already
		 * exists, MPATH conflict must have been already checked.
		 */
		if (rt->rt_flags & RTF_MPATH) {
			/*
			 * Only keep the RTF_MPATH flag if two routes have
			 * the same gateway.
			 */
			rt->rt_flags &= ~RTF_MPATH;
			SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next) {
				if (mrt->rt_priority == prio) {
					mrt->rt_flags |= RTF_MPATH;
					rt->rt_flags |= RTF_MPATH;
				}
			}
		}

		/* Put newly inserted entry at the right place. */
		rtable_mpath_insert(an, rt);
#else
		error = EEXIST;
#endif /* SMALL_KERNEL */
	}
leave:
	rw_exit_write(&ar->ar_lock);
	rtfree(rt);
	return (error);
}
795 
796 int
797 rtable_delete(unsigned int rtableid, struct sockaddr *dst,
798     struct sockaddr *mask, struct rtentry *rt)
799 {
800 	struct art_root			*ar;
801 	struct art_node			*an;
802 	struct srp_ref			 sr;
803 	uint8_t				*addr;
804 	int				 plen;
805 #ifndef SMALL_KERNEL
806 	struct rtentry			*mrt;
807 	int				 npaths = 0;
808 #endif /* SMALL_KERNEL */
809 	int				 error = 0;
810 
811 	ar = rtable_get(rtableid, dst->sa_family);
812 	if (ar == NULL)
813 		return (EAFNOSUPPORT);
814 
815 	addr = satoaddr(ar, dst);
816 	plen = rtable_satoplen(dst->sa_family, mask);
817 
818 	rtref(rt); /* guarantee rtfree won't do anything under ar_lock */
819 	rw_enter_write(&ar->ar_lock);
820 	an = art_lookup(ar, addr, plen, &sr);
821 	srp_leave(&sr); /* an can't go away while we have the lock */
822 
823 	/* Make sure we've got a perfect match. */
824 	if (an == NULL || an->an_plen != plen ||
825 	    memcmp(an->an_dst, dst, dst->sa_len)) {
826 		error = ESRCH;
827 		goto leave;
828 	}
829 
830 #ifndef SMALL_KERNEL
831 	/*
832 	 * If other multipath route entries are still attached to
833 	 * this ART node we only have to unlink it.
834 	 */
835 	SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next)
836 		npaths++;
837 
838 	if (npaths > 1) {
839 		KASSERT(rt->rt_refcnt >= 1);
840 		SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry,
841 		    rt_next);
842 
843 		mrt = SRPL_FIRST_LOCKED(&an->an_rtlist);
844 		an->an_dst = mrt->rt_dest;
845 		if (npaths == 2)
846 			mrt->rt_flags &= ~RTF_MPATH;
847 
848 		goto leave;
849 	}
850 #endif /* SMALL_KERNEL */
851 
852 	if (art_delete(ar, an, addr, plen) == NULL)
853 		panic("art_delete failed to find node %p", an);
854 
855 	KASSERT(rt->rt_refcnt >= 1);
856 	SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry, rt_next);
857 	art_put(an);
858 
859 leave:
860 	rw_exit_write(&ar->ar_lock);
861 	rtfree(rt);
862 
863 	return (error);
864 }
865 
/* State threaded through art_walk() to the per-route callback. */
struct rtable_walk_cookie {
	int		(*rwc_func)(struct rtentry *, void *, unsigned int);
	void		 *rwc_arg;	/* opaque callback argument */
	unsigned int	  rwc_rid;	/* rtableid being walked */
};

/*
 * Helper for rtable_walk to keep the ART code free from any "struct rtentry".
 *
 * Invokes the user callback on every route of the node's list and
 * stops at the first non-zero return value.
 */
int
rtable_walk_helper(struct art_node *an, void *xrwc)
{
	struct srp_ref			 sr;
	struct rtable_walk_cookie	*rwc = xrwc;
	struct rtentry			*rt;
	int				 error = 0;

	SRPL_FOREACH(rt, &sr, &an->an_rtlist, rt_next) {
		if ((error = (*rwc->rwc_func)(rt, rwc->rwc_arg, rwc->rwc_rid)))
			break;
	}
	SRPL_LEAVE(&sr);

	return (error);
}
891 
892 int
893 rtable_walk(unsigned int rtableid, sa_family_t af,
894     int (*func)(struct rtentry *, void *, unsigned int), void *arg)
895 {
896 	struct art_root			*ar;
897 	struct rtable_walk_cookie	 rwc;
898 	int				 error;
899 
900 	ar = rtable_get(rtableid, af);
901 	if (ar == NULL)
902 		return (EAFNOSUPPORT);
903 
904 	rwc.rwc_func = func;
905 	rwc.rwc_arg = arg;
906 	rwc.rwc_rid = rtableid;
907 
908 	while ((error = art_walk(ar, rtable_walk_helper, &rwc)) == EAGAIN)
909 		continue;
910 
911 	return (error);
912 }
913 
/*
 * Return the next multipath route after ``rt0'', referenced, or NULL
 * at the end of the list.  Always consumes the caller's reference on
 * ``rt0''.
 */
struct rtentry *
rtable_iterate(struct rtentry *rt0)
{
#ifndef SMALL_KERNEL
	struct rtentry *rt;

	KERNEL_ASSERT_LOCKED();

	rt = SRPL_NEXT_LOCKED(rt0, rt_next);
	if (rt != NULL)
		rtref(rt);
	rtfree(rt0);

	return (rt);
#else
	rtfree(rt0);
	return (NULL);
#endif /* SMALL_KERNEL */
}
933 
934 #ifndef SMALL_KERNEL
/* ART tables always support multipath routes. */
int
rtable_mpath_capable(unsigned int rtableid, sa_family_t af)
{
	return (1);
}
940 
941 int
942 rtable_mpath_reprio(unsigned int rtableid, struct sockaddr *dst,
943     struct sockaddr *mask, uint8_t prio, struct rtentry *rt)
944 {
945 	struct art_root			*ar;
946 	struct art_node			*an;
947 	struct srp_ref			 sr;
948 	uint8_t				*addr;
949 	int				 plen;
950 	int				 error = 0;
951 
952 	ar = rtable_get(rtableid, dst->sa_family);
953 	if (ar == NULL)
954 		return (EAFNOSUPPORT);
955 
956 	addr = satoaddr(ar, dst);
957 	plen = rtable_satoplen(dst->sa_family, mask);
958 
959 	rw_enter_write(&ar->ar_lock);
960 	an = art_lookup(ar, addr, plen, &sr);
961 	srp_leave(&sr); /* an can't go away while we have the lock */
962 
963 	/* Make sure we've got a perfect match. */
964 	if (an == NULL || an->an_plen != plen ||
965 	    memcmp(an->an_dst, dst, dst->sa_len))
966 		error = ESRCH;
967 	else {
968 		rtref(rt); /* keep rt alive in between remove and insert */
969 		SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist,
970 		    rt, rtentry, rt_next);
971 		rt->rt_priority = prio;
972 		rtable_mpath_insert(an, rt);
973 		rtfree(rt);
974 	}
975 	rw_exit_write(&ar->ar_lock);
976 
977 	return (error);
978 }
979 
/*
 * Insert ``rt'' into ``an''s route list, keeping the list sorted by
 * ascending rt_priority (highest priority, i.e. smallest value,
 * first).  Among equal priorities the new entry goes last.
 */
void
rtable_mpath_insert(struct art_node *an, struct rtentry *rt)
{
	struct rtentry			*mrt, *prt = NULL;
	uint8_t				 prio = rt->rt_priority;

	if ((mrt = SRPL_FIRST_LOCKED(&an->an_rtlist)) != NULL) {
		/*
		 * Select the order of the MPATH routes.
		 */
		while (SRPL_NEXT_LOCKED(mrt, rt_next) != NULL) {
			if (mrt->rt_priority > prio)
				break;
			prt = mrt;
			mrt = SRPL_NEXT_LOCKED(mrt, rt_next);
		}

		if (mrt->rt_priority > prio) {
			/*
			 * ``rt'' has a higher (smaller) priority than
			 * ``mrt'' so put it before in the list.
			 */
			if (prt != NULL) {
				SRPL_INSERT_AFTER_LOCKED(&rt_rc, prt, rt,
				    rt_next);
			} else {
				SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist,
				    rt, rt_next);
			}
		} else {
			SRPL_INSERT_AFTER_LOCKED(&rt_rc, mrt, rt, rt_next);
		}
	} else {
		SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);
	}
}
1016 #endif /* SMALL_KERNEL */
1017 
/* SRP-list callback: take a reference when a route is linked. */
void
rtentry_ref(void *null, void *xrt)
{
	struct rtentry *rt = xrt;

	rtref(rt);
}
1025 
/* SRP-list callback: drop the reference when a route is unlinked. */
void
rtentry_unref(void *null, void *xrt)
{
	struct rtentry *rt = xrt;

	rtfree(rt);
}
1033 
/*
 * Return a pointer to the address (key).  This is an heritage from the
 * BSD radix tree needed to skip the non-address fields from the flavor
 * of "struct sockaddr" used by this routing table.
 */
static inline uint8_t *
satoaddr(struct art_root *at, struct sockaddr *sa)
{
	return (((uint8_t *)sa) + at->ar_off);
}
1044 #endif /* ART */
1045 
1046 /*
1047  * Return the prefix length of a mask.
1048  */
1049 int
1050 rtable_satoplen(sa_family_t af, struct sockaddr *mask)
1051 {
1052 	struct domain	*dp;
1053 	uint8_t		*ap, *ep;
1054 	int		 mlen, plen = 0;
1055 	int		 i;
1056 
1057 	for (i = 0; (dp = domains[i]) != NULL; i++) {
1058 		if (dp->dom_rtoffset == 0)
1059 			continue;
1060 
1061 		if (af == dp->dom_family)
1062 			break;
1063 	}
1064 	if (dp == NULL)
1065 		return (-1);
1066 
1067 	/* Host route */
1068 	if (mask == NULL)
1069 		return (dp->dom_maxplen);
1070 
1071 	mlen = mask->sa_len;
1072 
1073 	/* Default route */
1074 	if (mlen == 0)
1075 		return (0);
1076 
1077 	ap = (uint8_t *)((uint8_t *)mask) + dp->dom_rtoffset;
1078 	ep = (uint8_t *)((uint8_t *)mask) + mlen;
1079 	if (ap > ep)
1080 		return (-1);
1081 
1082 	if (ap == ep)
1083 		return (0);
1084 
1085 	/* "Beauty" adapted from sbin/route/show.c ... */
1086 	while (ap < ep) {
1087 		switch (*ap) {
1088 		case 0xff:
1089 			plen += 8;
1090 			ap++;
1091 			break;
1092 		case 0xfe:
1093 			plen += 7;
1094 			ap++;
1095 			goto out;
1096 		case 0xfc:
1097 			plen += 6;
1098 			ap++;
1099 			goto out;
1100 		case 0xf8:
1101 			plen += 5;
1102 			ap++;
1103 			goto out;
1104 		case 0xf0:
1105 			plen += 4;
1106 			ap++;
1107 			goto out;
1108 		case 0xe0:
1109 			plen += 3;
1110 			ap++;
1111 			goto out;
1112 		case 0xc0:
1113 			plen += 2;
1114 			ap++;
1115 			goto out;
1116 		case 0x80:
1117 			plen += 1;
1118 			ap++;
1119 			goto out;
1120 		case 0x00:
1121 			goto out;
1122 		default:
1123 			/* Non contiguous mask. */
1124 			return (-1);
1125 		}
1126 
1127 	}
1128 
1129 out:
1130 #ifdef DIAGNOSTIC
1131 	for (; ap < ep; ap++) {
1132 		if (*ap != 0x00)
1133 			return (-1);
1134 	}
1135 #endif /* DIAGNOSTIC */
1136 
1137 	return (plen);
1138 }
1139