xref: /openbsd-src/sys/net/rtable.c (revision d59bb9942320b767f2a19aaa7690c8c6e30b724c)
1 /*	$OpenBSD: rtable.c,v 1.58 2017/02/28 09:50:13 mpi Exp $ */
2 
3 /*
4  * Copyright (c) 2014-2016 Martin Pieuchot
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #ifndef _KERNEL
20 #include "kern_compat.h"
21 #else
22 #include <sys/param.h>
23 #include <sys/systm.h>
24 #include <sys/socket.h>
25 #include <sys/malloc.h>
26 #include <sys/queue.h>
27 #include <sys/domain.h>
28 #include <sys/srp.h>
29 #endif
30 
31 #include <net/rtable.h>
32 #include <net/route.h>
33 
34 /*
35  * Structures used by rtable_get() to retrieve the corresponding
36  * routing table for a given pair of ``af'' and ``rtableid''.
37  *
38  * Note that once allocated routing table heads are never freed.
39  * This way we do not need to reference count them.
40  *
41  *	afmap		    rtmap/dommp
42  *   -----------          ---------     -----
43  *   |   0     |--------> | 0 | 0 | ... | 0 |	Array mapping rtableid (=index)
44  *   -----------          ---------     -----   to rdomain/loopback (=value).
45  *   | AF_INET |.
46  *   ----------- `.       .---------.     .---------.
47  *       ...	   `----> | rtable0 | ... | rtableN |	Array of pointers for
48  *   -----------          '---------'     '---------'	IPv4 routing tables
49  *   | AF_MPLS |					indexed by ``rtableid''.
50  *   -----------
51  */
/*
 * Per-AF array of SRP pointers.  Slot 0 points to a "struct dommp",
 * the slots selected via af2idx[] point to a "struct rtmap".
 */
struct srp	  *afmap;
uint8_t		   af2idx[AF_MAX+1];	/* To only allocate supported AF */
uint8_t		   af2idx_max;

/* Array of routing table pointers. */
struct rtmap {
	unsigned int	   limit;	/* number of entries in ``tbl'' */
	void		 **tbl;		/* one backend pointer per rtableid */
};

/*
 * Array of rtableid -> rdomain mapping.
 *
 * Only used for the first index as described above.
 */
struct dommp {
	unsigned int	   limit;	/* number of entries in ``value'' */
	/*
	 * Array to get the routing domain and loopback interface related to
	 * a routing table. Format:
	 *
	 * 8 unused bits | 16 bits for loopback index | 8 bits for rdomain
	 */
	unsigned int	  *value;
};

/* Number of rtableids currently usable in every rtmap/dommp. */
unsigned int	   rtmap_limit = 0;

void		   rtmap_init(void);
void		   rtmap_grow(unsigned int, sa_family_t);
void		   rtmap_dtor(void *, void *);

/* Garbage collector context freeing replaced rtmaps, see rtmap_dtor(). */
struct srp_gc	   rtmap_gc = SRP_GC_INITIALIZER(rtmap_dtor, NULL);

void		   rtable_init_backend(unsigned int);
void		  *rtable_alloc(unsigned int, unsigned int, unsigned int);
void		  *rtable_get(unsigned int, sa_family_t);
89 
90 void
91 rtmap_init(void)
92 {
93 	struct domain	*dp;
94 	int		 i;
95 
96 	/* Start with a single table for every domain that requires it. */
97 	for (i = 0; (dp = domains[i]) != NULL; i++) {
98 		if (dp->dom_rtoffset == 0)
99 			continue;
100 
101 		rtmap_grow(1, dp->dom_family);
102 	}
103 
104 	/* Initialize the rtableid->rdomain mapping table. */
105 	rtmap_grow(1, 0);
106 
107 	rtmap_limit = 1;
108 }
109 
/*
 * Grow the size of the array of routing table for AF ``af'' to ``nlimit''.
 */
void
rtmap_grow(unsigned int nlimit, sa_family_t af)
{
	struct rtmap	*map, *nmap;
	int		 i;

	KERNEL_ASSERT_LOCKED();

	KASSERT(nlimit > rtmap_limit);

	/* Allocate the bigger map first... */
	nmap = malloc(sizeof(*nmap), M_RTABLE, M_WAITOK);
	nmap->limit = nlimit;
	nmap->tbl = mallocarray(nlimit, sizeof(*nmap[0].tbl), M_RTABLE,
	    M_WAITOK|M_ZERO);

	/* ...copy the existing backend pointers over... */
	map = srp_get_locked(&afmap[af2idx[af]]);
	if (map != NULL) {
		KASSERT(map->limit == rtmap_limit);

		for (i = 0; i < map->limit; i++)
			nmap->tbl[i] = map->tbl[i];
	}

	/* ...then publish it; rtmap_gc frees the old map once unused. */
	srp_update_locked(&rtmap_gc, &afmap[af2idx[af]], nmap);
}
138 
/*
 * SRP garbage collector callback: free a replaced rtmap once the last
 * reader is gone.
 */
void
rtmap_dtor(void *null, void *xmap)
{
	struct rtmap	*map = xmap;

	/*
	 * doesn't need to be serialized since this is the last reference
	 * to this map. there's nothing to race against.
	 */
	free(map->tbl, M_RTABLE, map->limit * sizeof(*map[0].tbl));
	free(map, M_RTABLE, sizeof(*map));
}
151 
/*
 * Initialize the routing table layer: compute the set of AFs doing
 * routing, set up the backend and create the default table (id 0).
 */
void
rtable_init(void)
{
	struct domain	*dp;
	unsigned int	 keylen = 0;
	int		 i;

	/* Both map flavors share the slots of ``afmap'', see above. */
	KASSERT(sizeof(struct rtmap) == sizeof(struct dommp));

	/* We use index 0 for the rtable/rdomain map. */
	af2idx_max = 1;
	memset(af2idx, 0, sizeof(af2idx));

	/*
	 * Compute the maximum supported key length in case the routing
	 * table backend needs it.
	 */
	for (i = 0; (dp = domains[i]) != NULL; i++) {
		if (dp->dom_rtoffset == 0)
			continue;

		/* Assign a compact index to every AF doing routing. */
		af2idx[dp->dom_family] = af2idx_max++;
		if (dp->dom_rtkeylen > keylen)
			keylen = dp->dom_rtkeylen;

	}
	rtable_init_backend(keylen);

	/*
	 * Allocate AF-to-id table now that we know how many AFs this
	 * kernel supports.
	 */
	afmap = mallocarray(af2idx_max + 1, sizeof(*afmap), M_RTABLE,
	    M_WAITOK|M_ZERO);

	rtmap_init();

	if (rtable_add(0) != 0)
		panic("unable to create default routing table");
}
192 
/*
 * Create routing table ``id'': allocate one backend per AF doing
 * routing and attach the table to rdomain 0 by default (see
 * rtable_l2set()).  Returns 0 or an errno.
 */
int
rtable_add(unsigned int id)
{
	struct domain	*dp;
	void		*tbl;
	struct rtmap	*map;
	struct dommp	*dmm;
	sa_family_t	 af;
	unsigned int	 off, alen;
	int		 i;

	KERNEL_ASSERT_LOCKED();

	if (id > RT_TABLEID_MAX)
		return (EINVAL);

	if (rtable_exists(id))
		return (EEXIST);

	for (i = 0; (dp = domains[i]) != NULL; i++) {
		if (dp->dom_rtoffset == 0)
			continue;

		af = dp->dom_family;
		off = dp->dom_rtoffset;
		alen = dp->dom_maxplen;

		/* Make sure ``id'' fits in this AF's map. */
		if (id >= rtmap_limit)
			rtmap_grow(id + 1, af);

		tbl = rtable_alloc(id, alen, off);
		if (tbl == NULL)
			return (ENOMEM);

		map = srp_get_locked(&afmap[af2idx[af]]);
		map->tbl[id] = tbl;
	}

	/* Reflect possible growth. */
	if (id >= rtmap_limit) {
		rtmap_grow(id + 1, 0);
		rtmap_limit = id + 1;
	}

	/* Use main rtable/rdomain by default. */
	dmm = srp_get_locked(&afmap[0]);
	dmm->value[id] = 0;

	return (0);
}
243 
/*
 * Return the routing table backend for (``rtableid'', ``af'') or NULL
 * if it does not exist.  Uses SRP, so it is usable without the kernel
 * lock.
 */
void *
rtable_get(unsigned int rtableid, sa_family_t af)
{
	struct rtmap	*map;
	void		*tbl = NULL;
	struct srp_ref	 sr;

	/* AFs that do not route keep af2idx[] == 0. */
	if (af >= nitems(af2idx) || af2idx[af] == 0)
		return (NULL);

	map = srp_enter(&sr, &afmap[af2idx[af]]);
	if (rtableid < map->limit)
		tbl = map->tbl[rtableid];
	srp_leave(&sr);

	return (tbl);
}
261 
262 int
263 rtable_exists(unsigned int rtableid)
264 {
265 	struct domain	*dp;
266 	void		*tbl;
267 	int		 i;
268 
269 	for (i = 0; (dp = domains[i]) != NULL; i++) {
270 		if (dp->dom_rtoffset == 0)
271 			continue;
272 
273 		tbl = rtable_get(rtableid, dp->dom_family);
274 		if (tbl != NULL)
275 			return (1);
276 	}
277 
278 	return (0);
279 }
280 
/*
 * Return the routing domain that table ``rtableid'' is attached to, or
 * 0 if the table does not exist.
 */
unsigned int
rtable_l2(unsigned int rtableid)
{
	struct dommp	*dmm;
	unsigned int	 rdomain = 0;
	struct srp_ref	 sr;

	dmm = srp_enter(&sr, &afmap[0]);
	if (rtableid < dmm->limit)
		rdomain = (dmm->value[rtableid] & RT_TABLEID_MASK);
	srp_leave(&sr);

	return (rdomain);
}
295 
/*
 * Return the loopback interface index recorded for table ``rtableid'',
 * or 0 if the table does not exist.  See "struct dommp" for the packed
 * value layout.
 */
unsigned int
rtable_loindex(unsigned int rtableid)
{
	struct dommp	*dmm;
	unsigned int	 loifidx = 0;
	struct srp_ref	 sr;

	dmm = srp_enter(&sr, &afmap[0]);
	if (rtableid < dmm->limit)
		loifidx = (dmm->value[rtableid] >> RT_TABLEID_BITS);
	srp_leave(&sr);

	return (loifidx);
}
310 
/*
 * Attach table ``rtableid'' to routing domain ``rdomain'' and record
 * the loopback interface index ``loifidx''.  Both values are packed in
 * a single word, see "struct dommp".  Silently ignores nonexistent
 * table/domain ids.
 */
void
rtable_l2set(unsigned int rtableid, unsigned int rdomain, unsigned int loifidx)
{
	struct dommp	*dmm;
	unsigned int	 value;

	KERNEL_ASSERT_LOCKED();

	if (!rtable_exists(rtableid) || !rtable_exists(rdomain))
		return;

	value = (rdomain & RT_TABLEID_MASK) | (loifidx << RT_TABLEID_BITS);

	dmm = srp_get_locked(&afmap[0]);
	dmm->value[rtableid] = value;
}
327 
328 #ifndef ART
/*
 * Radix backend initialization; ``keylen'' is the longest key of all
 * the routing AFs, computed by rtable_init().
 */
void
rtable_init_backend(unsigned int keylen)
{
	rn_init(keylen); /* initialize all zeroes, all ones, mask table */
}
334 
/*
 * Allocate a radix tree head for table ``rtableid''.  ``off'' is the
 * offset of the key inside the sockaddr; ``alen'' is unused by this
 * backend.  Returns NULL on allocation failure.
 */
void *
rtable_alloc(unsigned int rtableid, unsigned int alen, unsigned int off)
{
	struct radix_node_head *rnh = NULL;

	if (rn_inithead((void **)&rnh, off)) {
		rnh->rnh_rtableid = rtableid;
	}

	return (rnh);
}
346 
/*
 * Return an extra reference on the route matching ``dst''/``mask''
 * exactly, or NULL.  ``gateway'' and ``prio'' are unused by this
 * backend (no multipath support).
 */
struct rtentry *
rtable_lookup(unsigned int rtableid, struct sockaddr *dst,
    struct sockaddr *mask, struct sockaddr *gateway, uint8_t prio)
{
	struct radix_node_head	*rnh;
	struct radix_node	*rn;
	struct rtentry		*rt;

	rnh = rtable_get(rtableid, dst->sa_family);
	if (rnh == NULL)
		return (NULL);

	/* RNF_ROOT nodes are internal to the tree, not real routes. */
	rn = rn_lookup(dst, mask, rnh);
	if (rn == NULL || (rn->rn_flags & RNF_ROOT) != 0)
		return (NULL);

	rt = ((struct rtentry *)rn);

	rtref(rt);
	return (rt);
}
368 
/*
 * Return an extra reference on the best route to ``dst'', or NULL.
 * ``src'' is unused by this backend (no multipath selection).
 */
struct rtentry *
rtable_match(unsigned int rtableid, struct sockaddr *dst, uint32_t *src)
{
	struct radix_node_head	*rnh;
	struct radix_node	*rn;
	struct rtentry		*rt = NULL;

	rnh = rtable_get(rtableid, dst->sa_family);
	if (rnh == NULL)
		return (NULL);

	/* Serialize the tree walk with concurrent modifications. */
	KERNEL_LOCK();
	rn = rn_match(dst, rnh);
	if (rn == NULL || (rn->rn_flags & RNF_ROOT) != 0)
		goto out;

	rt = ((struct rtentry *)rn);
	rtref(rt);
out:
	KERNEL_UNLOCK();
	return (rt);
}
391 
/*
 * Insert ``rt'' with destination ``dst''/``mask'' and priority ``prio''.
 * Takes an extra reference on ``rt'' on success.  ``gateway'' is unused
 * by this backend.
 */
int
rtable_insert(unsigned int rtableid, struct sockaddr *dst,
    struct sockaddr *mask, struct sockaddr *gateway, uint8_t prio,
    struct rtentry *rt)
{
	struct radix_node_head	*rnh;
	/* A route embeds its radix nodes, cast and insert in place. */
	struct radix_node	*rn = (struct radix_node *)rt;

	rnh = rtable_get(rtableid, dst->sa_family);
	if (rnh == NULL)
		return (EAFNOSUPPORT);

	rn = rn_addroute(dst, mask, rnh, rn, prio);
	if (rn == NULL)
		return (ESRCH);

	rt = ((struct rtentry *)rn);
	rtref(rt);

	return (0);
}
413 
/*
 * Remove ``rt'' with destination ``dst''/``mask'' from the table and
 * drop the table's reference to it.
 */
int
rtable_delete(unsigned int rtableid, struct sockaddr *dst,
    struct sockaddr *mask, struct rtentry *rt)
{
	struct radix_node_head	*rnh;
	struct radix_node	*rn = (struct radix_node *)rt;

	rnh = rtable_get(rtableid, dst->sa_family);
	if (rnh == NULL)
		return (EAFNOSUPPORT);

	rn = rn_delete(dst, mask, rnh, rn);
	if (rn == NULL)
		return (ESRCH);

	/* A removed node must be fully detached from the tree. */
	if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT))
		panic("active node flags=%x", rn->rn_flags);

	rt = ((struct rtentry *)rn);
	rtfree(rt);

	return (0);
}
437 
/*
 * Call ``func'' on every route of AF ``af'' in table ``rtableid''.
 * Stops at the first non-zero return value; restarts the walk from
 * scratch whenever the callback returns EAGAIN.
 */
int
rtable_walk(unsigned int rtableid, sa_family_t af,
    int (*func)(struct rtentry *, void *, unsigned int), void *arg)
{
	struct radix_node_head	*rnh;
	/*
	 * NOTE(review): relies on "struct rtentry" starting with its
	 * radix nodes so the callback pointer can be re-typed — confirm.
	 */
	int (*f)(struct radix_node *, void *, unsigned int) = (void *)func;
	int error;

	rnh = rtable_get(rtableid, af);
	if (rnh == NULL)
		return (EAFNOSUPPORT);

	while ((error = rn_walktree(rnh, f, arg)) == EAGAIN)
		continue;

	return (error);
}
455 
/*
 * No multipath support: a destination never has a sibling route, so
 * release the caller's reference on ``rt0'' and end the iteration.
 */
struct rtentry *
rtable_iterate(struct rtentry *rt0)
{
	rtfree(rt0);
	return (NULL);
}
462 
463 #ifndef SMALL_KERNEL
/*
 * The radix backend never supports multipath routes.
 */
int
rtable_mpath_capable(unsigned int rtableid, sa_family_t af)
{
	return (0);
}
469 
470 int
471 rtable_mpath_reprio(unsigned int rtableid, struct sockaddr *dst,
472     struct sockaddr *mask, uint8_t prio, struct rtentry *rt)
473 {
474 	return (0);
475 }
476 #endif /* SMALL_KERNEL */
477 
478 #else /* ART */
479 
static inline uint8_t	*satoaddr(struct art_root *, struct sockaddr *);

int	an_match(struct art_node *, struct sockaddr *, int);
void	rtentry_ref(void *, void *);
void	rtentry_unref(void *, void *);

#ifndef SMALL_KERNEL
void	rtable_mpath_insert(struct art_node *, struct rtentry *);
#endif

/* Refcount callbacks for the SRP list of routes hanging off an ART node. */
struct srpl_rc rt_rc = SRPL_RC_INITIALIZER(rtentry_ref, rtentry_unref, NULL);
491 
/* ART backend initialization; ``keylen'' is not needed by ART. */
void
rtable_init_backend(unsigned int keylen)
{
	art_init();
}
497 
/*
 * Allocate an ART root for table ``rtableid''.  ``alen'' is the AF's
 * maximum prefix length (dom_maxplen) and ``off'' the offset of the
 * key inside the sockaddr.
 */
void *
rtable_alloc(unsigned int rtableid, unsigned int alen, unsigned int off)
{
	return (art_alloc(rtableid, alen, off));
}
503 
/*
 * Return an extra reference on the route with destination ``dst'' and
 * netmask ``mask'' (a NULL mask selects the best match instead of a
 * perfect one).  ``gateway'' and ``prio'' narrow the selection among
 * multipath routes.  Returns NULL if no route matches.
 */
struct rtentry *
rtable_lookup(unsigned int rtableid, struct sockaddr *dst,
    struct sockaddr *mask, struct sockaddr *gateway, uint8_t prio)
{
	struct art_root			*ar;
	struct art_node			*an;
	struct rtentry			*rt = NULL;
	struct srp_ref			 sr, nsr;
	uint8_t				*addr;
	int				 plen;

	ar = rtable_get(rtableid, dst->sa_family);
	if (ar == NULL)
		return (NULL);

	addr = satoaddr(ar, dst);

	/* No need for a perfect match. */
	if (mask == NULL) {
		an = art_match(ar, addr, &nsr);
		if (an == NULL)
			goto out;
	} else {
		plen = rtable_satoplen(dst->sa_family, mask);
		if (plen == -1)
			return (NULL);

		an = art_lookup(ar, addr, plen, &nsr);

		/* Make sure we've got a perfect match. */
		if (!an_match(an, dst, plen))
			goto out;
	}

	/* ``nsr'' keeps ``an'' alive; ``sr'' protects its route list. */
#ifdef SMALL_KERNEL
	rt = SRPL_FIRST(&sr, &an->an_rtlist);
#else
	/* Pick the route matching the requested priority and gateway. */
	SRPL_FOREACH(rt, &sr, &an->an_rtlist, rt_next) {
		if (prio != RTP_ANY &&
		    (rt->rt_priority & RTP_MASK) != (prio & RTP_MASK))
			continue;

		if (gateway == NULL)
			break;

		if (rt->rt_gateway->sa_len == gateway->sa_len &&
		    memcmp(rt->rt_gateway, gateway, gateway->sa_len) == 0)
			break;
	}
#endif /* SMALL_KERNEL */
	if (rt != NULL)
		rtref(rt);

	SRPL_LEAVE(&sr);
out:
	srp_leave(&nsr);

	return (rt);
}
563 
/*
 * Return an extra reference on the best route to ``dst'', or NULL.
 * When several routes of equal priority share the destination, ``src''
 * seeds the hash used to pick one of them (RFC 2992).
 */
struct rtentry *
rtable_match(unsigned int rtableid, struct sockaddr *dst, uint32_t *src)
{
	struct art_root			*ar;
	struct art_node			*an;
	struct rtentry			*rt = NULL;
	struct srp_ref			 sr, nsr;
	uint8_t				*addr;
#ifndef SMALL_KERNEL
	int				 hash;
#endif /* SMALL_KERNEL */

	ar = rtable_get(rtableid, dst->sa_family);
	if (ar == NULL)
		return (NULL);

	addr = satoaddr(ar, dst);

	an = art_match(ar, addr, &nsr);
	if (an == NULL)
		goto out;

	/*
	 * Take the first route of the matching node.
	 * NOTE(review): assumes an_rtlist is never empty — confirm.
	 */
	rt = SRPL_FIRST(&sr, &an->an_rtlist);
	rtref(rt);
	SRPL_LEAVE(&sr);

#ifndef SMALL_KERNEL
	/* Gateway selection by Hash-Threshold (RFC 2992) */
	if ((hash = rt_hash(rt, dst, src)) != -1) {
		struct rtentry		*mrt;
		int			 threshold, npaths = 0;

		KASSERT(hash <= 0xffff);

		SRPL_FOREACH(mrt, &sr, &an->an_rtlist, rt_next) {
			/* Only count nexthops with the same priority. */
			if (mrt->rt_priority == rt->rt_priority)
				npaths++;
		}
		SRPL_LEAVE(&sr);

		threshold = (0xffff / npaths) + 1;

		/*
		 * we have no protection against concurrent modification of the
		 * route list attached to the node, so we won't necessarily
		 * have the same number of routes.  for most modifications,
		 * we'll pick a route that we wouldn't have if we only saw the
		 * list before or after the change.  if we were going to use
		 * the last available route, but it got removed, we'll hit
		 * the end of the list and then pick the first route.
		 */

		mrt = SRPL_FIRST(&sr, &an->an_rtlist);
		while (hash > threshold && mrt != NULL) {
			if (mrt->rt_priority == rt->rt_priority)
				hash -= threshold;
			mrt = SRPL_FOLLOW(&sr, mrt, rt_next);
		}

		/* Swap the reference to the route picked by the hash. */
		if (mrt != NULL) {
			rtref(mrt);
			rtfree(rt);
			rt = mrt;
		}
		SRPL_LEAVE(&sr);
	}
#endif /* SMALL_KERNEL */
out:
	srp_leave(&nsr);
	return (rt);
}
636 
637 int
638 rtable_insert(unsigned int rtableid, struct sockaddr *dst,
639     struct sockaddr *mask, struct sockaddr *gateway, uint8_t prio,
640     struct rtentry *rt)
641 {
642 #ifndef SMALL_KERNEL
643 	struct rtentry			*mrt;
644 	struct srp_ref			 sr;
645 #endif /* SMALL_KERNEL */
646 	struct art_root			*ar;
647 	struct art_node			*an, *prev;
648 	uint8_t				*addr;
649 	int				 plen;
650 	unsigned int			 rt_flags;
651 	int				 error = 0;
652 
653 	ar = rtable_get(rtableid, dst->sa_family);
654 	if (ar == NULL)
655 		return (EAFNOSUPPORT);
656 
657 	addr = satoaddr(ar, dst);
658 	plen = rtable_satoplen(dst->sa_family, mask);
659 	if (plen == -1)
660 		return (EINVAL);
661 
662 	rtref(rt); /* guarantee rtfree won't do anything during insert */
663 	rw_enter_write(&ar->ar_lock);
664 
665 #ifndef SMALL_KERNEL
666 	/* Do not permit exactly the same dst/mask/gw pair. */
667 	an = art_lookup(ar, addr, plen, &sr);
668 	srp_leave(&sr); /* an can't go away while we have the lock */
669 	if (an_match(an, dst, plen)) {
670 		struct rtentry  *mrt;
671 		int		 mpathok = ISSET(rt->rt_flags, RTF_MPATH);
672 
673 		SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next) {
674 			if (prio != RTP_ANY &&
675 			    (mrt->rt_priority & RTP_MASK) != (prio & RTP_MASK))
676 				continue;
677 
678 			if (!mpathok ||
679 			    (mrt->rt_gateway->sa_len == gateway->sa_len &&
680 			    !memcmp(mrt->rt_gateway, gateway, gateway->sa_len))){
681 				error = EEXIST;
682 				goto leave;
683 			}
684 		}
685 	}
686 #endif /* SMALL_KERNEL */
687 
688 	an = art_get(dst, plen);
689 	if (an == NULL) {
690 		error = ENOBUFS;
691 		goto leave;
692 	}
693 
694 	/* prepare for immediate operation if insert succeeds */
695 	rt_flags = rt->rt_flags;
696 	rt->rt_flags &= ~RTF_MPATH;
697 	rt->rt_dest = dst;
698 	rt->rt_plen = plen;
699 	SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);
700 
701 	prev = art_insert(ar, an, addr, plen);
702 	if (prev != an) {
703 		SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry,
704 		    rt_next);
705 		rt->rt_flags = rt_flags;
706 		art_put(an);
707 
708 		if (prev == NULL) {
709 			error = ESRCH;
710 			goto leave;
711 		}
712 
713 #ifndef SMALL_KERNEL
714 		an = prev;
715 
716 		mrt = SRPL_FIRST_LOCKED(&an->an_rtlist);
717 		KASSERT(mrt != NULL);
718 		KASSERT((rt->rt_flags & RTF_MPATH) || mrt->rt_priority != prio);
719 
720 		/*
721 		 * An ART node with the same destination/netmask already
722 		 * exists, MPATH conflict must have been already checked.
723 		 */
724 		if (rt->rt_flags & RTF_MPATH) {
725 			/*
726 			 * Only keep the RTF_MPATH flag if two routes have
727 			 * the same gateway.
728 			 */
729 			rt->rt_flags &= ~RTF_MPATH;
730 			SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next) {
731 				if (mrt->rt_priority == prio) {
732 					mrt->rt_flags |= RTF_MPATH;
733 					rt->rt_flags |= RTF_MPATH;
734 				}
735 			}
736 		}
737 
738 		/* Put newly inserted entry at the right place. */
739 		rtable_mpath_insert(an, rt);
740 #else
741 		error = EEXIST;
742 #endif /* SMALL_KERNEL */
743 	}
744 leave:
745 	rw_exit_write(&ar->ar_lock);
746 	rtfree(rt);
747 	return (error);
748 }
749 
/*
 * Remove ``rt'' with destination ``dst''/``mask'' from table
 * ``rtableid'' and drop the list's reference to it.
 */
int
rtable_delete(unsigned int rtableid, struct sockaddr *dst,
    struct sockaddr *mask, struct rtentry *rt)
{
	struct art_root			*ar;
	struct art_node			*an;
	struct srp_ref			 sr;
	uint8_t				*addr;
	int				 plen;
#ifndef SMALL_KERNEL
	struct rtentry			*mrt;
	int				 npaths = 0;
#endif /* SMALL_KERNEL */
	int				 error = 0;

	ar = rtable_get(rtableid, dst->sa_family);
	if (ar == NULL)
		return (EAFNOSUPPORT);

	addr = satoaddr(ar, dst);
	plen = rtable_satoplen(dst->sa_family, mask);

	rtref(rt); /* guarantee rtfree won't do anything under ar_lock */
	rw_enter_write(&ar->ar_lock);
	an = art_lookup(ar, addr, plen, &sr);
	srp_leave(&sr); /* an can't go away while we have the lock */

	/* Make sure we've got a perfect match. */
	if (!an_match(an, dst, plen)) {
		error = ESRCH;
		goto leave;
	}

#ifndef SMALL_KERNEL
	/*
	 * If other multipath route entries are still attached to
	 * this ART node we only have to unlink it.
	 */
	SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next)
		npaths++;

	if (npaths > 1) {
		KASSERT(rt->rt_refcnt >= 1);
		SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry,
		    rt_next);

		/* A single remaining sibling is no longer multipath. */
		mrt = SRPL_FIRST_LOCKED(&an->an_rtlist);
		if (npaths == 2)
			mrt->rt_flags &= ~RTF_MPATH;

		goto leave;
	}
#endif /* SMALL_KERNEL */

	/* Last route for this destination: remove the ART node too. */
	if (art_delete(ar, an, addr, plen) == NULL)
		panic("art_delete failed to find node %p", an);

	KASSERT(rt->rt_refcnt >= 1);
	SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry, rt_next);
	art_put(an);

leave:
	rw_exit_write(&ar->ar_lock);
	rtfree(rt);

	return (error);
}
817 
/* Arguments passed through art_walk() down to rtable_walk_helper(). */
struct rtable_walk_cookie {
	int		(*rwc_func)(struct rtentry *, void *, unsigned int);
	void		 *rwc_arg;
	unsigned int	  rwc_rid;	/* rtableid handed to the callback */
};
823 
/*
 * Helper for rtable_walk to keep the ART code free from any "struct rtentry".
 */
int
rtable_walk_helper(struct art_node *an, void *xrwc)
{
	struct srp_ref			 sr;
	struct rtable_walk_cookie	*rwc = xrwc;
	struct rtentry			*rt;
	int				 error = 0;

	/* Apply the callback to every route attached to this node. */
	SRPL_FOREACH(rt, &sr, &an->an_rtlist, rt_next) {
		if ((error = (*rwc->rwc_func)(rt, rwc->rwc_arg, rwc->rwc_rid)))
			break;
	}
	SRPL_LEAVE(&sr);

	return (error);
}
843 
844 int
845 rtable_walk(unsigned int rtableid, sa_family_t af,
846     int (*func)(struct rtentry *, void *, unsigned int), void *arg)
847 {
848 	struct art_root			*ar;
849 	struct rtable_walk_cookie	 rwc;
850 	int				 error;
851 
852 	ar = rtable_get(rtableid, af);
853 	if (ar == NULL)
854 		return (EAFNOSUPPORT);
855 
856 	rwc.rwc_func = func;
857 	rwc.rwc_arg = arg;
858 	rwc.rwc_rid = rtableid;
859 
860 	while ((error = art_walk(ar, rtable_walk_helper, &rwc)) == EAGAIN)
861 		continue;
862 
863 	return (error);
864 }
865 
/*
 * Return the next multipath route after ``rt0'', transferring the
 * caller's reference from ``rt0'' to the returned route.  Returns
 * NULL at the end of the list.
 */
struct rtentry *
rtable_iterate(struct rtentry *rt0)
{
	struct rtentry *rt = NULL;
#ifndef SMALL_KERNEL
	struct srp_ref sr;

	rt = SRPL_NEXT(&sr, rt0, rt_next);
	if (rt != NULL)
		rtref(rt);
	SRPL_LEAVE(&sr);
#endif /* SMALL_KERNEL */
	rtfree(rt0);
	return (rt);
}
881 
882 #ifndef SMALL_KERNEL
/*
 * Every ART routing table supports multipath routes.
 */
int
rtable_mpath_capable(unsigned int rtableid, sa_family_t af)
{
	return (1);
}
888 
/*
 * Change the priority of ``rt'' (destination ``dst''/``mask'') to
 * ``prio'' and re-sort it among the multipath routes sharing its
 * destination.  Returns ESRCH if no matching node is found.
 */
int
rtable_mpath_reprio(unsigned int rtableid, struct sockaddr *dst,
    struct sockaddr *mask, uint8_t prio, struct rtentry *rt)
{
	struct art_root			*ar;
	struct art_node			*an;
	struct srp_ref			 sr;
	uint8_t				*addr;
	int				 plen;
	int				 error = 0;

	ar = rtable_get(rtableid, dst->sa_family);
	if (ar == NULL)
		return (EAFNOSUPPORT);

	addr = satoaddr(ar, dst);
	plen = rtable_satoplen(dst->sa_family, mask);

	rw_enter_write(&ar->ar_lock);
	an = art_lookup(ar, addr, plen, &sr);
	srp_leave(&sr); /* an can't go away while we have the lock */

	/* Make sure we've got a perfect match. */
	if (!an_match(an, dst, plen))
		error = ESRCH;
	else {
		rtref(rt); /* keep rt alive in between remove and insert */
		SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist,
		    rt, rtentry, rt_next);
		rt->rt_priority = prio;
		rtable_mpath_insert(an, rt);
		rtfree(rt);
	}
	rw_exit_write(&ar->ar_lock);

	return (error);
}
926 
/*
 * Insert ``rt'' at its priority-sorted position in the list of routes
 * attached to ``an''.  Uses the *_LOCKED SRP list primitives, so the
 * table's write lock must be held.
 */
void
rtable_mpath_insert(struct art_node *an, struct rtentry *rt)
{
	struct rtentry			*mrt, *prt = NULL;
	uint8_t				 prio = rt->rt_priority;

	if ((mrt = SRPL_FIRST_LOCKED(&an->an_rtlist)) != NULL) {
		/*
		 * Select the order of the MPATH routes.
		 */
		while (SRPL_NEXT_LOCKED(mrt, rt_next) != NULL) {
			if (mrt->rt_priority > prio)
				break;
			prt = mrt;
			mrt = SRPL_NEXT_LOCKED(mrt, rt_next);
		}

		if (mrt->rt_priority > prio) {
			/*
			 * ``rt'' has a higher (smaller) priority than
			 * ``mrt'' so put it before in the list.
			 */
			if (prt != NULL) {
				SRPL_INSERT_AFTER_LOCKED(&rt_rc, prt, rt,
				    rt_next);
			} else {
				SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist,
				    rt, rt_next);
			}
		} else {
			SRPL_INSERT_AFTER_LOCKED(&rt_rc, mrt, rt, rt_next);
		}
	} else {
		SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);
	}
}
963 #endif /* SMALL_KERNEL */
964 
/*
 * Returns 1 if ``an'' perfectly matches (``dst'', ``plen''), 0 otherwise.
 */
int
an_match(struct art_node *an, struct sockaddr *dst, int plen)
{
	struct rtentry			*rt;
	struct srp_ref			 sr;
	int				 match;

	if (an == NULL || an->an_plen != plen)
		return (0);

	/*
	 * All routes of a node share the destination, compare the first.
	 * NOTE(review): assumes an_rtlist is never empty — confirm.
	 */
	rt = SRPL_FIRST(&sr, &an->an_rtlist);
	match = (memcmp(rt->rt_dest, dst, dst->sa_len) == 0);
	SRPL_LEAVE(&sr);

	return (match);
}
984 
/* SRP list callback: account for the list's reference on insertion. */
void
rtentry_ref(void *null, void *xrt)
{
	struct rtentry *rt = xrt;

	rtref(rt);
}
992 
/* SRP list callback: drop the list's reference once readers are gone. */
void
rtentry_unref(void *null, void *xrt)
{
	struct rtentry *rt = xrt;

	rtfree(rt);
}
1000 
/*
 * Return a pointer to the address (key).  This is an heritage from the
 * BSD radix tree needed to skip the non-address fields from the flavor
 * of "struct sockaddr" used by this routing table.
 */
static inline uint8_t *
satoaddr(struct art_root *at, struct sockaddr *sa)
{
	return (((uint8_t *)sa) + at->ar_off);
}
1011 #endif /* ART */
1012 
/*
 * Return the prefix length of a mask.
 *
 * A NULL ``mask'' means a host route (dom_maxplen bits) and a
 * zero-length mask means the default route (0 bits).  Returns -1 for
 * an unknown AF or a non-contiguous mask.
 */
int
rtable_satoplen(sa_family_t af, struct sockaddr *mask)
{
	struct domain	*dp;
	uint8_t		*ap, *ep;
	int		 mlen, plen = 0;
	int		 i;

	/* Find the domain of ``af'' to learn where the key starts. */
	for (i = 0; (dp = domains[i]) != NULL; i++) {
		if (dp->dom_rtoffset == 0)
			continue;

		if (af == dp->dom_family)
			break;
	}
	if (dp == NULL)
		return (-1);

	/* Host route */
	if (mask == NULL)
		return (dp->dom_maxplen);

	mlen = mask->sa_len;

	/* Default route */
	if (mlen == 0)
		return (0);

	/* Only inspect the key bytes actually present in the sockaddr. */
	ap = (uint8_t *)((uint8_t *)mask) + dp->dom_rtoffset;
	ep = (uint8_t *)((uint8_t *)mask) + mlen;
	if (ap > ep)
		return (-1);

	if (ap == ep)
		return (0);

	/* "Beauty" adapted from sbin/route/show.c ... */
	while (ap < ep) {
		switch (*ap) {
		case 0xff:
			plen += 8;
			ap++;
			break;
		case 0xfe:
			plen += 7;
			ap++;
			goto out;
		case 0xfc:
			plen += 6;
			ap++;
			goto out;
		case 0xf8:
			plen += 5;
			ap++;
			goto out;
		case 0xf0:
			plen += 4;
			ap++;
			goto out;
		case 0xe0:
			plen += 3;
			ap++;
			goto out;
		case 0xc0:
			plen += 2;
			ap++;
			goto out;
		case 0x80:
			plen += 1;
			ap++;
			goto out;
		case 0x00:
			goto out;
		default:
			/* Non contiguous mask. */
			return (-1);
		}

	}

out:
#ifdef DIAGNOSTIC
	/* Every byte after the first partial one must be zero. */
	for (; ap < ep; ap++) {
		if (*ap != 0x00)
			return (-1);
	}
#endif /* DIAGNOSTIC */

	return (plen);
}
1106