xref: /openbsd-src/sys/net/rtable.c (revision 0b7734b3d77bb9b21afec6f4621cae6c805dbd45)
1 /*	$OpenBSD: rtable.c,v 1.50 2016/07/19 10:51:44 mpi Exp $ */
2 
3 /*
4  * Copyright (c) 2014-2015 Martin Pieuchot
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #ifndef _KERNEL
20 #include "kern_compat.h"
21 #else
22 #include <sys/param.h>
23 #include <sys/systm.h>
24 #include <sys/socket.h>
25 #include <sys/malloc.h>
26 #include <sys/queue.h>
27 #include <sys/domain.h>
28 #include <sys/srp.h>
29 #endif
30 
31 #include <net/rtable.h>
32 #include <net/route.h>
33 
34 /*
35  * Structures used by rtable_get() to retrieve the corresponding
36  * routing table for a given pair of ``af'' and ``rtableid''.
37  *
38  * Note that once allocated routing table heads are never freed.
39  * This way we do not need to reference count them.
40  *
41  *	afmap		    rtmap/dommp
42  *   -----------          ---------     -----
43  *   |   0     |--------> | 0 | 0 | ... | 0 |	Array mapping rtableid (=index)
44  *   -----------          ---------     -----   to rdomain (=value).
45  *   | AF_INET |.
46  *   ----------- `.       .---------.     .---------.
47  *       ...	   `----> | rtable0 | ... | rtableN |	Array of pointers for
48  *   -----------          '---------'     '---------'	IPv4 routing tables
49  *   | AF_MPLS |            			 	indexed by ``rtableid''.
50  *   -----------
51  */
/*
 * Per-AF array of SRP-protected pointers; slot 0 holds the
 * rtableid->rdomain map (struct dommp), the other slots hold the
 * per-AF arrays of routing-table pointers (struct rtmap).
 */
struct srp	  *afmap;
uint8_t		   af2idx[AF_MAX+1];	/* To only allocate supported AF */
uint8_t		   af2idx_max;		/* highest slot index in use + 1 */

/* Array of routing table pointers. */
struct rtmap {
	unsigned int	   limit;	/* number of entries in ``tbl'' */
	void		 **tbl;		/* one backend table per rtableid */
};

/* Array of rtableid -> rdomain mapping. */
struct dommp {
	unsigned int	   limit;	/* number of entries in ``dom'' */
	unsigned int	  *dom;		/* rdomain of each rtableid */
};

/* Current size of every rtmap/dommp array; they all grow in lockstep. */
unsigned int	   rtmap_limit = 0;

void		   rtmap_init(void);
void		   rtmap_grow(unsigned int, sa_family_t);
void		   rtmap_dtor(void *, void *);

/* Garbage collector context used when replacing a grown rtmap. */
struct srp_gc	   rtmap_gc = SRP_GC_INITIALIZER(rtmap_dtor, NULL);

void		   rtable_init_backend(unsigned int);
void		  *rtable_alloc(unsigned int, unsigned int, unsigned int);
void		  *rtable_get(unsigned int, sa_family_t);
79 
80 void
81 rtmap_init(void)
82 {
83 	struct domain	*dp;
84 	int		 i;
85 
86 	/* Start with a single table for every domain that requires it. */
87 	for (i = 0; (dp = domains[i]) != NULL; i++) {
88 		if (dp->dom_rtoffset == 0)
89 			continue;
90 
91 		rtmap_grow(1, dp->dom_family);
92 	}
93 
94 	/* Initialize the rtableid->rdomain mapping table. */
95 	rtmap_grow(1, 0);
96 
97 	rtmap_limit = 1;
98 }
99 
100 /*
101  * Grow the size of the array of routing table for AF ``af'' to ``nlimit''.
102  */
103 void
104 rtmap_grow(unsigned int nlimit, sa_family_t af)
105 {
106 	struct rtmap	*map, *nmap;
107 	int		 i;
108 
109 	KERNEL_ASSERT_LOCKED();
110 
111 	KASSERT(nlimit > rtmap_limit);
112 
113 	nmap = malloc(sizeof(*nmap), M_RTABLE, M_WAITOK);
114 	nmap->limit = nlimit;
115 	nmap->tbl = mallocarray(nlimit, sizeof(*nmap[0].tbl), M_RTABLE,
116 	    M_WAITOK|M_ZERO);
117 
118 	map = srp_get_locked(&afmap[af2idx[af]]);
119 	if (map != NULL) {
120 		KASSERT(map->limit == rtmap_limit);
121 
122 		for (i = 0; i < map->limit; i++)
123 			nmap->tbl[i] = map->tbl[i];
124 	}
125 
126 	srp_update_locked(&rtmap_gc, &afmap[af2idx[af]], nmap);
127 }
128 
129 void
130 rtmap_dtor(void *null, void *xmap)
131 {
132 	struct rtmap	*map = xmap;
133 
134 	/*
135 	 * doesnt need to be serialized since this is the last reference
136 	 * to this map. there's nothing to race against.
137 	 */
138 	free(map->tbl, M_RTABLE, map->limit * sizeof(*map[0].tbl));
139 	free(map, M_RTABLE, sizeof(*map));
140 }
141 
/*
 * One-time initialization of the rtable layer: build the AF-to-slot
 * mapping, initialize the backend and allocate the top-level afmap.
 */
void
rtable_init(void)
{
	struct domain	*dp;
	unsigned int	 keylen = 0;
	int		 i;

	/* We use index 0 for the rtable/rdomain map. */
	af2idx_max = 1;
	memset(af2idx, 0, sizeof(af2idx));

	/*
	 * Compute the maximum supported key length in case the routing
	 * table backend needs it.
	 */
	for (i = 0; (dp = domains[i]) != NULL; i++) {
		if (dp->dom_rtoffset == 0)
			continue;

		/* Assign the next free afmap slot to this AF. */
		af2idx[dp->dom_family] = af2idx_max++;
		if (dp->dom_rtkeylen > keylen)
			keylen = dp->dom_rtkeylen;

	}
	rtable_init_backend(keylen);

	/*
	 * Allocate AF-to-id table now that we know how many AFs this
	 * kernel supports.
	 */
	afmap = mallocarray(af2idx_max + 1, sizeof(*afmap), M_RTABLE,
	    M_WAITOK|M_ZERO);

	rtmap_init();
}
177 
/*
 * Create routing table ``id'' for every AF that needs one and record
 * that it belongs to rdomain 0 by default, growing the per-AF arrays
 * if ``id'' does not fit yet.
 *
 * Returns 0 on success, EINVAL for an out-of-range id, EEXIST if the
 * table already exists and ENOMEM on allocation failure.
 */
int
rtable_add(unsigned int id)
{
	struct domain	*dp;
	void		*tbl;
	struct rtmap	*map;
	struct dommp	*dmm;
	sa_family_t	 af;
	unsigned int	 off, alen;
	int		 i;

	KERNEL_ASSERT_LOCKED();

	if (id > RT_TABLEID_MAX)
		return (EINVAL);

	if (rtable_exists(id))
		return (EEXIST);

	for (i = 0; (dp = domains[i]) != NULL; i++) {
		if (dp->dom_rtoffset == 0)
			continue;

		af = dp->dom_family;
		off = dp->dom_rtoffset;
		alen = dp->dom_maxplen;

		/* Make room for slot ``id'' in this AF's array. */
		if (id >= rtmap_limit)
			rtmap_grow(id + 1, af);

		/*
		 * NOTE(review): on ENOMEM, tables already created for
		 * previous AFs during this call are left in place —
		 * confirm callers tolerate the partial state.
		 */
		tbl = rtable_alloc(id, alen, off);
		if (tbl == NULL)
			return (ENOMEM);

		map = srp_get_locked(&afmap[af2idx[af]]);
		map->tbl[id] = tbl;
	}

	/* Reflect possible growth. */
	if (id >= rtmap_limit) {
		rtmap_grow(id + 1, 0);
		rtmap_limit = id + 1;
	}

	/* Use main rtable/rdomain by default. */
	dmm = srp_get_locked(&afmap[0]);
	dmm->dom[id] = 0;

	return (0);
}
228 
229 void *
230 rtable_get(unsigned int rtableid, sa_family_t af)
231 {
232 	struct rtmap	*map;
233 	void		*tbl = NULL;
234 	struct srp_ref	 sr;
235 
236 	if (af >= nitems(af2idx) || af2idx[af] == 0)
237 		return (NULL);
238 
239 	map = srp_enter(&sr, &afmap[af2idx[af]]);
240 	if (rtableid < map->limit)
241 		tbl = map->tbl[rtableid];
242 	srp_leave(&sr);
243 
244 	return (tbl);
245 }
246 
247 int
248 rtable_exists(unsigned int rtableid)
249 {
250 	struct domain	*dp;
251 	void		*tbl;
252 	int		 i;
253 
254 	for (i = 0; (dp = domains[i]) != NULL; i++) {
255 		if (dp->dom_rtoffset == 0)
256 			continue;
257 
258 		tbl = rtable_get(rtableid, dp->dom_family);
259 		if (tbl != NULL)
260 			return (1);
261 	}
262 
263 	return (0);
264 }
265 
266 unsigned int
267 rtable_l2(unsigned int rtableid)
268 {
269 	struct dommp	*dmm;
270 	unsigned int	 rdomain = 0;
271 	struct srp_ref	 sr;
272 
273 	dmm = srp_enter(&sr, &afmap[0]);
274 	if (rtableid < dmm->limit)
275 		rdomain = dmm->dom[rtableid];
276 	srp_leave(&sr);
277 
278 	return (rdomain);
279 }
280 
281 void
282 rtable_l2set(unsigned int rtableid, unsigned int rdomain)
283 {
284 	struct dommp	*dmm;
285 
286 	KERNEL_ASSERT_LOCKED();
287 
288 	if (!rtable_exists(rtableid) || !rtable_exists(rdomain))
289 		return;
290 
291 	dmm = srp_get_locked(&afmap[0]);
292 	dmm->dom[rtableid] = rdomain;
293 }
294 
295 #ifndef ART
/*
 * Radix backend setup; ``keylen'' is the longest key any AF will use.
 */
void
rtable_init_backend(unsigned int keylen)
{
	rn_init(keylen); /* initialize all zeroes, all ones, mask table */
}
301 
/*
 * Allocate a radix tree head for table ``rtableid''.  ``off'' is the
 * offset of the key within the sockaddr; ``alen'' is unused by this
 * backend.  Returns NULL on failure.
 */
void *
rtable_alloc(unsigned int rtableid, unsigned int alen, unsigned int off)
{
	struct radix_node_head *rnh = NULL;

	if (rn_inithead((void **)&rnh, off)) {
#ifndef SMALL_KERNEL
		rnh->rnh_multipath = 1;
#endif /* SMALL_KERNEL */
		rnh->rnh_rtableid = rtableid;
	}

	return (rnh);
}
316 
/*
 * Return a referenced route matching ``dst''/``mask'' exactly.  With
 * multipath enabled, ``gateway'' and ``prio'' select among entries
 * sharing that destination.  Returns NULL if nothing matches.
 */
struct rtentry *
rtable_lookup(unsigned int rtableid, struct sockaddr *dst,
    struct sockaddr *mask, struct sockaddr *gateway, uint8_t prio)
{
	struct radix_node_head	*rnh;
	struct radix_node	*rn;
	struct rtentry		*rt;

	rnh = rtable_get(rtableid, dst->sa_family);
	if (rnh == NULL)
		return (NULL);

	rn = rn_lookup(dst, mask, rnh);
	/* Internal (ROOT) nodes are not real routes. */
	if (rn == NULL || (rn->rn_flags & RNF_ROOT) != 0)
		return (NULL);

	rt = ((struct rtentry *)rn);

#ifndef SMALL_KERNEL
	if (rnh->rnh_multipath) {
		rt = rt_mpath_matchgate(rt, gateway, prio);
		if (rt == NULL)
			return (NULL);
	}
#endif /* !SMALL_KERNEL */

	/* The reference is handed to the caller. */
	rtref(rt);
	return (rt);
}
346 
/*
 * Return a referenced route giving the best match for ``dst''.
 * ``src'' feeds the Hash-Threshold (RFC 2992) gateway selection
 * among multipath routes.
 */
struct rtentry *
rtable_match(unsigned int rtableid, struct sockaddr *dst, uint32_t *src)
{
	struct radix_node_head	*rnh;
	struct radix_node	*rn;
	struct rtentry		*rt = NULL;
#ifndef SMALL_KERNEL
	int			 hash;
#endif /* SMALL_KERNEL */

	rnh = rtable_get(rtableid, dst->sa_family);
	if (rnh == NULL)
		return (NULL);

	/* The radix backend is not MP-safe: serialize with the big lock. */
	KERNEL_LOCK();
	rn = rn_match(dst, rnh);
	if (rn == NULL || (rn->rn_flags & RNF_ROOT) != 0)
		goto out;

	rt = ((struct rtentry *)rn);
	rtref(rt);

#ifndef SMALL_KERNEL
	/* Gateway selection by Hash-Threshold (RFC 2992) */
	if ((hash = rt_hash(rt, dst, src)) != -1) {
		struct rtentry		*mrt = rt;
		int			 threshold, npaths = 1;

		KASSERT(hash <= 0xffff);

		/* Count the multipath routes for this destination. */
		while ((mrt = rtable_mpath_next(mrt)) != NULL)
			npaths++;

		threshold = (0xffff / npaths) + 1;

		/* Walk ``hash / threshold'' entries into the list. */
		mrt = rt;
		while (hash > threshold && mrt != NULL) {
			/* stay within the multipath routes */
			mrt = rtable_mpath_next(mrt);
			hash -= threshold;
		}

		/* if gw selection fails, use the first match (default) */
		if (mrt != NULL) {
			rtref(mrt);
			rtfree(rt);
			rt = mrt;
		}
	}
#endif /* SMALL_KERNEL */
out:
	KERNEL_UNLOCK();
	return (rt);
}
401 
/*
 * Link ``rt'' into the table.  Returns 0 on success, EAFNOSUPPORT if
 * the AF has no table, EEXIST on a multipath conflict and ESRCH if the
 * radix code refuses the entry.
 */
int
rtable_insert(unsigned int rtableid, struct sockaddr *dst,
    struct sockaddr *mask, struct sockaddr *gateway, uint8_t prio,
    struct rtentry *rt)
{
	struct radix_node_head	*rnh;
	struct radix_node	*rn = (struct radix_node *)rt;

	rnh = rtable_get(rtableid, dst->sa_family);
	if (rnh == NULL)
		return (EAFNOSUPPORT);

#ifndef SMALL_KERNEL
	if (rnh->rnh_multipath) {
		/* Do not permit exactly the same dst/mask/gw pair. */
		if (rt_mpath_conflict(rnh, dst, mask, gateway, prio,
		    ISSET(rt->rt_flags, RTF_MPATH))) {
			return (EEXIST);
		}
	}
#endif /* SMALL_KERNEL */

	rn = rn_addroute(dst, mask, rnh, rn, prio);
	if (rn == NULL)
		return (ESRCH);

	/* The table now holds its own reference on the route. */
	rt = ((struct rtentry *)rn);
	rtref(rt);

	return (0);
}
433 
/*
 * Remove ``rt'' from the radix tree and drop the table's reference.
 * Returns 0 on success, EAFNOSUPPORT or ESRCH on failure.
 */
int
rtable_delete(unsigned int rtableid, struct sockaddr *dst,
    struct sockaddr *mask, struct rtentry *rt)
{
	struct radix_node_head	*rnh;
	struct radix_node	*rn = (struct radix_node *)rt;

	rnh = rtable_get(rtableid, dst->sa_family);
	if (rnh == NULL)
		return (EAFNOSUPPORT);

	rn = rn_delete(dst, mask, rnh, rn);
	if (rn == NULL)
		return (ESRCH);

	/* The node must be fully detached from the tree by now. */
	if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT))
		panic("active node flags=%x", rn->rn_flags);

	rt = ((struct rtentry *)rn);
	rtfree(rt);

	return (0);
}
457 
/*
 * Call ``func'' on every route of the table, restarting the walk as
 * long as the tree is modified underneath it (EAGAIN).
 */
int
rtable_walk(unsigned int rtableid, sa_family_t af,
    int (*func)(struct rtentry *, void *, unsigned int), void *arg)
{
	struct radix_node_head	*rnh;
	/*
	 * NOTE(review): calling through this cast relies on rtentry
	 * embedding its radix nodes first; strictly it is undefined
	 * behavior in ISO C — confirm against struct rtentry's layout.
	 */
	int (*f)(struct radix_node *, void *, unsigned int) = (void *)func;
	int error;

	rnh = rtable_get(rtableid, af);
	if (rnh == NULL)
		return (EAFNOSUPPORT);

	while ((error = rn_walktree(rnh, f, arg)) == EAGAIN)
		continue;

	return (error);
}
475 
476 #ifndef SMALL_KERNEL
477 int
478 rtable_mpath_capable(unsigned int rtableid, sa_family_t af)
479 {
480 	struct radix_node_head	*rnh;
481 	int mpath;
482 
483 	rnh = rtable_get(rtableid, af);
484 	if (rnh == NULL)
485 		return (0);
486 
487 	mpath = rnh->rnh_multipath;
488 	return (mpath);
489 }
490 
491 int
492 rtable_mpath_reprio(unsigned int rtableid, struct sockaddr *dst,
493     struct sockaddr *mask, uint8_t prio, struct rtentry *rt)
494 {
495 	struct radix_node	*rn = (struct radix_node *)rt;
496 
497 	rn_mpath_reprio(rn, prio);
498 
499 	return (0);
500 }
501 
502 struct rtentry *
503 rtable_mpath_next(struct rtentry *rt)
504 {
505 	struct radix_node *rn = (struct radix_node *)rt;
506 
507 	return ((struct rtentry *)rn_mpath_next(rn, RMP_MODE_ACTIVE));
508 }
509 #endif /* SMALL_KERNEL */
510 
511 #else /* ART */
512 
static inline uint8_t	*satoaddr(struct art_root *, struct sockaddr *);

void	rtentry_ref(void *, void *);
void	rtentry_unref(void *, void *);

/* Refcount callbacks for rtentries linked in SRP lists on ART nodes. */
struct srpl_rc rt_rc = SRPL_RC_INITIALIZER(rtentry_ref, rtentry_unref, NULL);
519 
/*
 * ART backend setup; the backend sizes keys per table, so the
 * precomputed ``keylen'' is not needed here.
 */
void
rtable_init_backend(unsigned int keylen)
{
	art_init();
}
525 
/*
 * Allocate an ART table for ``rtableid'' with address length ``alen''
 * and key offset ``off'' into the sockaddr.  Returns NULL on failure.
 */
void *
rtable_alloc(unsigned int rtableid, unsigned int alen, unsigned int off)
{
	return (art_alloc(rtableid, alen, off));
}
531 
/*
 * Return a referenced route whose destination/netmask is exactly
 * ``dst''/``mask''; a NULL mask takes the best-matching ART node
 * instead.  ``gateway'' and ``prio'' filter among multipath entries.
 * Returns NULL if no such route exists.
 */
struct rtentry *
rtable_lookup(unsigned int rtableid, struct sockaddr *dst,
    struct sockaddr *mask, struct sockaddr *gateway, uint8_t prio)
{
	struct art_root			*ar;
	struct art_node			*an;
	struct rtentry			*rt = NULL;
	struct srp_ref			 sr, nsr;
	uint8_t				*addr;
	int				 plen;

	ar = rtable_get(rtableid, dst->sa_family);
	if (ar == NULL)
		return (NULL);

	addr = satoaddr(ar, dst);

	/* No need for a perfect match. */
	if (mask == NULL) {
		an = art_match(ar, addr, &nsr);
		if (an == NULL)
			goto out;
	} else {
		plen = rtable_satoplen(dst->sa_family, mask);
		if (plen == -1)
			return (NULL);

		an = art_lookup(ar, addr, plen, &nsr);

		/* Make sure we've got a perfect match. */
		if (an == NULL || an->an_plen != plen ||
		    memcmp(an->an_dst, dst, dst->sa_len))
			goto out;
	}

#ifdef SMALL_KERNEL
	/* No multipath support: take the node's only route. */
	rt = SRPL_ENTER(&sr, &an->an_rtlist);
#else
	/* Select by priority class and, when given, by gateway. */
	SRPL_FOREACH(rt, &sr, &an->an_rtlist, rt_next) {
		if (prio != RTP_ANY &&
		    (rt->rt_priority & RTP_MASK) != (prio & RTP_MASK))
			continue;

		if (gateway == NULL)
			break;

		if (rt->rt_gateway->sa_len == gateway->sa_len &&
		    memcmp(rt->rt_gateway, gateway, gateway->sa_len) == 0)
			break;
	}
#endif /* SMALL_KERNEL */
	if (rt != NULL)
		rtref(rt);

	SRPL_LEAVE(&sr);
out:
	srp_leave(&nsr);

	return (rt);
}
592 
/*
 * Return a referenced route giving the longest-prefix match for
 * ``dst''.  ``src'' feeds the Hash-Threshold (RFC 2992) selection
 * among multipath gateways.
 */
struct rtentry *
rtable_match(unsigned int rtableid, struct sockaddr *dst, uint32_t *src)
{
	struct art_root			*ar;
	struct art_node			*an;
	struct rtentry			*rt = NULL;
	struct srp_ref			 sr, nsr;
	uint8_t				*addr;
#ifndef SMALL_KERNEL
	int				 hash;
#endif /* SMALL_KERNEL */

	ar = rtable_get(rtableid, dst->sa_family);
	if (ar == NULL)
		return (NULL);

	addr = satoaddr(ar, dst);

	an = art_match(ar, addr, &nsr);
	if (an == NULL)
		goto out;

	/*
	 * Take the first route of the matching node.
	 * NOTE(review): assumes an_rtlist is never empty for a node
	 * still in the tree — confirm before relying on it.
	 */
	rt = SRPL_ENTER(&sr, &an->an_rtlist);
	rtref(rt);
	SRPL_LEAVE(&sr);

#ifndef SMALL_KERNEL
	/* Gateway selection by Hash-Threshold (RFC 2992) */
	if ((hash = rt_hash(rt, dst, src)) != -1) {
		struct rtentry		*mrt;
		int			 threshold, npaths = 0;

		KASSERT(hash <= 0xffff);

		SRPL_FOREACH(mrt, &sr, &an->an_rtlist, rt_next) {
			/* Only count nexthops with the same priority. */
			if (mrt->rt_priority == rt->rt_priority)
				npaths++;
		}
		SRPL_LEAVE(&sr);

		threshold = (0xffff / npaths) + 1;

		/*
		 * we have no protection against concurrent modification of the
		 * route list attached to the node, so we won't necessarily
		 * have the same number of routes.  for most modifications,
		 * we'll pick a route that we wouldn't have if we only saw the
		 * list before or after the change.  if we were going to use
		 * the last available route, but it got removed, we'll hit
		 * the end of the list and then pick the first route.
		 */

		mrt = SRPL_ENTER(&sr, &an->an_rtlist);
		while (hash > threshold && mrt != NULL) {
			if (mrt->rt_priority == rt->rt_priority)
				hash -= threshold;
			mrt = SRPL_NEXT(&sr, mrt, rt_next);
		}

		/* Keep the selected route, drop the provisional one. */
		if (mrt != NULL) {
			rtref(mrt);
			rtfree(rt);
			rt = mrt;
		}
		SRPL_LEAVE(&sr);
	}
#endif /* SMALL_KERNEL */
out:
	srp_leave(&nsr);
	return (rt);
}
665 
/*
 * Link ``rt'' (dst/mask/gateway/prio) into the table.  Returns 0 on
 * success, EAFNOSUPPORT/EINVAL for a bad AF or mask, EEXIST on a
 * multipath conflict, ENOBUFS when no ART node can be allocated and
 * ESRCH if the ART insertion itself fails.
 */
int
rtable_insert(unsigned int rtableid, struct sockaddr *dst,
    struct sockaddr *mask, struct sockaddr *gateway, uint8_t prio,
    struct rtentry *rt)
{
#ifndef SMALL_KERNEL
	struct rtentry			*mrt;
	struct srp_ref			 sr;
#endif /* SMALL_KERNEL */
	struct art_root			*ar;
	struct art_node			*an, *prev;
	uint8_t				*addr;
	int				 plen;
	unsigned int			 rt_flags;
	int				 error = 0;

	KERNEL_ASSERT_LOCKED();

	ar = rtable_get(rtableid, dst->sa_family);
	if (ar == NULL)
		return (EAFNOSUPPORT);

	addr = satoaddr(ar, dst);
	plen = rtable_satoplen(dst->sa_family, mask);
	if (plen == -1)
		return (EINVAL);

	rtref(rt); /* guarantee rtfree won't do anything during insert */

#ifndef SMALL_KERNEL
	/* Do not permit exactly the same dst/mask/gw pair. */
	an = art_lookup(ar, addr, plen, &sr);
	srp_leave(&sr); /* an can't go away while we have the lock */
	if (an != NULL && an->an_plen == plen &&
	    !memcmp(an->an_dst, dst, dst->sa_len)) {
		struct rtentry  *mrt;
		int		 mpathok = ISSET(rt->rt_flags, RTF_MPATH);

		SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next) {
			if (prio != RTP_ANY &&
			    (mrt->rt_priority & RTP_MASK) != (prio & RTP_MASK))
				continue;

			/* Without RTF_MPATH, any same-priority entry wins. */
			if (!mpathok ||
			    (mrt->rt_gateway->sa_len == gateway->sa_len &&
			    !memcmp(mrt->rt_gateway, gateway, gateway->sa_len))){
				error = EEXIST;
				goto leave;
			}
		}
	}
#endif /* SMALL_KERNEL */

	an = art_get(dst, plen);
	if (an == NULL) {
		error = ENOBUFS;
		goto leave;
	}

	/* prepare for immediate operation if insert succeeds */
	rt_flags = rt->rt_flags;
	rt->rt_flags &= ~RTF_MPATH;
	rt->rt_dest = dst;
	rt->rt_plen = plen;
	SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);

	prev = art_insert(ar, an, addr, plen);
	if (prev != an) {
		/* ``an'' was not inserted: undo the speculative setup. */
		SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry,
		    rt_next);
		rt->rt_flags = rt_flags;
		art_put(an);

		if (prev == NULL) {
			error = ESRCH;
			goto leave;
		}

#ifndef SMALL_KERNEL
		/* A node with the same dst/plen already exists: share it. */
		an = prev;

		mrt = SRPL_FIRST_LOCKED(&an->an_rtlist);
		KASSERT(mrt != NULL);
		KASSERT((rt->rt_flags & RTF_MPATH) || mrt->rt_priority != prio);

		/*
		 * An ART node with the same destination/netmask already
		 * exists, MPATH conflict must have been already checked.
		 */
		if (rt->rt_flags & RTF_MPATH) {
			/*
			 * Only keep the RTF_MPATH flag if two routes have
			 * the same gateway.
			 */
			rt->rt_flags &= ~RTF_MPATH;
			SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next) {
				if (mrt->rt_priority == prio) {
					mrt->rt_flags |= RTF_MPATH;
					rt->rt_flags |= RTF_MPATH;
				}
			}
		}

		SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);

		/* Put newly inserted entry at the right place. */
		rtable_mpath_reprio(rtableid, dst, mask, rt->rt_priority, rt);
#else
		error = EEXIST;
#endif /* SMALL_KERNEL */
	}
leave:
	rtfree(rt);
	return (error);
}
781 
/*
 * Unlink ``rt'' from the table.  Returns 0 on success, EAFNOSUPPORT
 * if the AF has no table and ESRCH if the route is not there.
 */
int
rtable_delete(unsigned int rtableid, struct sockaddr *dst,
    struct sockaddr *mask, struct rtentry *rt)
{
	struct art_root			*ar;
	struct art_node			*an;
	struct srp_ref			 sr;
	uint8_t				*addr;
	int				 plen;
#ifndef SMALL_KERNEL
	struct rtentry			*mrt;
	int				 npaths = 0;
#endif /* SMALL_KERNEL */

	ar = rtable_get(rtableid, dst->sa_family);
	if (ar == NULL)
		return (EAFNOSUPPORT);

	addr = satoaddr(ar, dst);
	/* NOTE(review): a -1 (invalid mask) result is not checked here. */
	plen = rtable_satoplen(dst->sa_family, mask);

	KERNEL_ASSERT_LOCKED();

	an = art_lookup(ar, addr, plen, &sr);
	srp_leave(&sr); /* an can't go away while we have the lock */

	/* Make sure we've got a perfect match. */
	if (an == NULL || an->an_plen != plen ||
	    memcmp(an->an_dst, dst, dst->sa_len))
		return (ESRCH);

#ifndef SMALL_KERNEL
	/*
	 * If other multipath route entries are still attached to
	 * this ART node we only have to unlink it.
	 */
	SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next)
		npaths++;

	if (npaths > 1) {
		KASSERT(rt->rt_refcnt >= 1);
		SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry,
		    rt_next);

		/* Repoint the node's key at a route that remains. */
		mrt = SRPL_FIRST_LOCKED(&an->an_rtlist);
		an->an_dst = mrt->rt_dest;
		if (npaths == 2)
			mrt->rt_flags &= ~RTF_MPATH;
		return (0);
	}
#endif /* SMALL_KERNEL */

	if (art_delete(ar, an, addr, plen) == NULL)
		return (ESRCH);

	KASSERT(rt->rt_refcnt >= 1);
	SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry, rt_next);

	art_put(an);
	return (0);
}
843 
/* Per-walk context handed to rtable_walk_helper(). */
struct rtable_walk_cookie {
	int		(*rwc_func)(struct rtentry *, void *, unsigned int);
	void		 *rwc_arg;	/* opaque argument for rwc_func */
	unsigned int	  rwc_rid;	/* rtableid being walked */
};
849 
/*
 * Helper for rtable_walk to keep the ART code free from any "struct rtentry".
 */
int
rtable_walk_helper(struct art_node *an, void *xrwc)
{
	struct srp_ref			 sr;
	struct rtable_walk_cookie	*rwc = xrwc;
	struct rtentry			*rt;
	int				 error = 0;

	/* Visit every route attached to this ART node; stop on error. */
	SRPL_FOREACH(rt, &sr, &an->an_rtlist, rt_next) {
		if ((error = (*rwc->rwc_func)(rt, rwc->rwc_arg, rwc->rwc_rid)))
			break;
	}
	SRPL_LEAVE(&sr);

	return (error);
}
869 
870 int
871 rtable_walk(unsigned int rtableid, sa_family_t af,
872     int (*func)(struct rtentry *, void *, unsigned int), void *arg)
873 {
874 	struct art_root			*ar;
875 	struct rtable_walk_cookie	 rwc;
876 	int				 error;
877 
878 	ar = rtable_get(rtableid, af);
879 	if (ar == NULL)
880 		return (EAFNOSUPPORT);
881 
882 	rwc.rwc_func = func;
883 	rwc.rwc_arg = arg;
884 	rwc.rwc_rid = rtableid;
885 
886 	while ((error = art_walk(ar, rtable_walk_helper, &rwc)) == EAGAIN)
887 		continue;
888 
889 	return (error);
890 }
891 
892 #ifndef SMALL_KERNEL
/*
 * The ART backend always supports multipath routes.
 */
int
rtable_mpath_capable(unsigned int rtableid, sa_family_t af)
{
	return (1);
}
898 
/*
 * Change the priority of ``rt'' and re-insert it so the route list of
 * its ART node stays sorted by increasing priority value.
 */
int
rtable_mpath_reprio(unsigned int rtableid, struct sockaddr *dst,
    struct sockaddr *mask, uint8_t prio, struct rtentry *rt)
{
	struct art_root			*ar;
	struct art_node			*an;
	struct srp_ref			 sr;
	uint8_t				*addr;
	int				 plen;
	struct rtentry			*mrt, *prt = NULL;

	ar = rtable_get(rtableid, dst->sa_family);
	if (ar == NULL)
		return (EAFNOSUPPORT);

	addr = satoaddr(ar, dst);
	plen = rtable_satoplen(dst->sa_family, mask);

	KERNEL_ASSERT_LOCKED();

	an = art_lookup(ar, addr, plen, &sr);
	srp_leave(&sr); /* an can't go away while we have the lock */

	/* Make sure we've got a perfect match. */
	if (an == NULL || an->an_plen != plen ||
	    memcmp(an->an_dst, dst, dst->sa_len))
		return (ESRCH);

	rtref(rt); /* keep rt alive in between remove and add */
	SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry, rt_next);
	rt->rt_priority = prio;

	if ((mrt = SRPL_FIRST_LOCKED(&an->an_rtlist)) != NULL) {
		/*
		 * Select the order of the MPATH routes.
		 */
		while (SRPL_NEXT_LOCKED(mrt, rt_next) != NULL) {
			if (mrt->rt_priority > prio)
				break;
			prt = mrt;
			mrt = SRPL_NEXT_LOCKED(mrt, rt_next);
		}

		if (mrt->rt_priority > prio) {
			/*
			 * ``rt'' has a higher (smaller) priority than
			 * ``mrt'' so put it before in the list.
			 */
			if (prt != NULL) {
				SRPL_INSERT_AFTER_LOCKED(&rt_rc, prt, rt,
				    rt_next);
			} else {
				SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist,
				    rt, rt_next);
			}
		} else {
			SRPL_INSERT_AFTER_LOCKED(&rt_rc, mrt, rt, rt_next);
		}
	} else {
		SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);
	}
	rtfree(rt);

	return (0);
}
964 
/*
 * Return the next route sharing the same ART node as ``rt'', or NULL.
 */
struct rtentry *
rtable_mpath_next(struct rtentry *rt)
{
	KERNEL_ASSERT_LOCKED();
	return (SRPL_NEXT_LOCKED(rt, rt_next));
}
971 #endif /* SMALL_KERNEL */
972 
/*
 * SRP list callback: take a reference on a route entering a list.
 */
void
rtentry_ref(void *null, void *xrt)
{
	rtref((struct rtentry *)xrt);
}
980 
/*
 * SRP list callback: drop the reference of a route leaving a list.
 */
void
rtentry_unref(void *null, void *xrt)
{
	rtfree((struct rtentry *)xrt);
}
988 
/*
 * Return a pointer to the address (key).  This is a heritage from the
 * BSD radix tree needed to skip the non-address fields from the flavor
 * of "struct sockaddr" used by this routing table.
 */
static inline uint8_t *
satoaddr(struct art_root *at, struct sockaddr *sa)
{
	return (((uint8_t *)sa) + at->ar_off);
}
999 #endif /* ART */
1000 
1001 /*
1002  * Return the prefix length of a mask.
1003  */
1004 int
1005 rtable_satoplen(sa_family_t af, struct sockaddr *mask)
1006 {
1007 	struct domain	*dp;
1008 	uint8_t		*ap, *ep;
1009 	int		 mlen, plen = 0;
1010 	int		 i;
1011 
1012 	for (i = 0; (dp = domains[i]) != NULL; i++) {
1013 		if (dp->dom_rtoffset == 0)
1014 			continue;
1015 
1016 		if (af == dp->dom_family)
1017 			break;
1018 	}
1019 	if (dp == NULL)
1020 		return (-1);
1021 
1022 	/* Host route */
1023 	if (mask == NULL)
1024 		return (dp->dom_maxplen);
1025 
1026 	mlen = mask->sa_len;
1027 
1028 	/* Default route */
1029 	if (mlen == 0)
1030 		return (0);
1031 
1032 	ap = (uint8_t *)((uint8_t *)mask) + dp->dom_rtoffset;
1033 	ep = (uint8_t *)((uint8_t *)mask) + mlen;
1034 	if (ap > ep)
1035 		return (-1);
1036 
1037 	if (ap == ep)
1038 		return (0);
1039 
1040 	/* "Beauty" adapted from sbin/route/show.c ... */
1041 	while (ap < ep) {
1042 		switch (*ap) {
1043 		case 0xff:
1044 			plen += 8;
1045 			ap++;
1046 			break;
1047 		case 0xfe:
1048 			plen += 7;
1049 			ap++;
1050 			goto out;
1051 		case 0xfc:
1052 			plen += 6;
1053 			ap++;
1054 			goto out;
1055 		case 0xf8:
1056 			plen += 5;
1057 			ap++;
1058 			goto out;
1059 		case 0xf0:
1060 			plen += 4;
1061 			ap++;
1062 			goto out;
1063 		case 0xe0:
1064 			plen += 3;
1065 			ap++;
1066 			goto out;
1067 		case 0xc0:
1068 			plen += 2;
1069 			ap++;
1070 			goto out;
1071 		case 0x80:
1072 			plen += 1;
1073 			ap++;
1074 			goto out;
1075 		case 0x00:
1076 			goto out;
1077 		default:
1078 			/* Non contiguous mask. */
1079 			return (-1);
1080 		}
1081 
1082 	}
1083 
1084 out:
1085 #ifdef DIAGNOSTIC
1086 	for (; ap < ep; ap++) {
1087 		if (*ap != 0x00)
1088 			return (-1);
1089 	}
1090 #endif /* DIAGNOSTIC */
1091 
1092 	return (plen);
1093 }
1094