/*	$OpenBSD: rde_rib.c,v 1.240 2022/07/08 10:01:52 claudio Exp $ */

/*
 * Copyright (c) 2003, 2004 Claudio Jeker <claudio@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/types.h>
#include <sys/queue.h>

#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <siphash.h>
#include <time.h>

#include "bgpd.h"
#include "rde.h"
#include "log.h"
/*
 * BGP RIB -- Routing Information Base
 *
 * The RIB is built with one aspect in mind: speed -- actually, update speed.
 * Therefore one thing needs to be absolutely avoided: long table walks.
 * This is achieved by heavily linking the different parts together.
 */
uint16_t rib_size;
struct rib **ribs;

struct rib_entry *rib_add(struct rib *, struct bgpd_addr *, int);
static inline int rib_compare(const struct rib_entry *,
			const struct rib_entry *);
void rib_remove(struct rib_entry *);
int rib_empty(struct rib_entry *);
static void rib_dump_abort(uint16_t);

RB_PROTOTYPE(rib_tree, rib_entry, rib_e, rib_compare);
RB_GENERATE(rib_tree, rib_entry, rib_e, rib_compare);

struct rib_context {
	LIST_ENTRY(rib_context)		 entry;
	struct rib_entry		*ctx_re;
	struct prefix			*ctx_p;
	uint32_t			 ctx_id;
	void		(*ctx_rib_call)(struct rib_entry *, void *);
	void		(*ctx_prefix_call)(struct prefix *, void *);
	void		(*ctx_done)(void *, uint8_t);
	int		(*ctx_throttle)(void *);
	void				*ctx_arg;
	unsigned int			 ctx_count;
	uint8_t				 ctx_aid;
};
LIST_HEAD(, rib_context) rib_dumps = LIST_HEAD_INITIALIZER(rib_dumps);

static void	prefix_dump_r(struct rib_context *);

static inline struct rib_entry *
re_lock(struct rib_entry *re)
{
	if (re->lock != 0)
		log_warnx("%s: entry already locked", __func__);
	re->lock = 1;
	return re;
}

static inline struct rib_entry *
re_unlock(struct rib_entry *re)
{
	if (re->lock == 0)
		log_warnx("%s: entry already unlocked", __func__);
	re->lock = 0;
	return re;
}

static inline int
re_is_locked(struct rib_entry *re)
{
	return (re->lock != 0);
}

static inline struct prefix *
prefix_lock(struct prefix *p)
{
	if (p->flags & PREFIX_FLAG_LOCKED)
		fatalx("%s: locking locked prefix", __func__);
	p->flags |= PREFIX_FLAG_LOCKED;
	return p;
}

static inline struct prefix *
prefix_unlock(struct prefix *p)
{
	if ((p->flags & PREFIX_FLAG_LOCKED) == 0)
		fatalx("%s: unlocking unlocked prefix", __func__);
	p->flags &= ~PREFIX_FLAG_LOCKED;
	return p;
}

static inline int
prefix_is_locked(struct prefix *p)
{
	return (p->flags & PREFIX_FLAG_LOCKED) != 0;
}

static inline int
prefix_is_dead(struct prefix *p)
{
	return (p->flags & PREFIX_FLAG_DEAD) != 0;
}
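
/*
 * A sketch of how the lock helpers above are used by the dump code
 * below (not a new API): a long running walk locks the element it
 * stopped at so the element survives updates between two partial runs.
 *
 *	ctx->ctx_re = re_lock(re);	suspend walk, keep re alive
 *	...				other RDE work may run here
 *	re = re_unlock(ctx->ctx_re);	resume walk at re
 *
 * prefix_lock()/prefix_unlock() follow the same pattern for
 * Adj-RIB-Out walks; prefix_is_dead() marks entries that were
 * destroyed while locked (see prefix_adjout_destroy()).
 */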

static inline struct rib_tree *
rib_tree(struct rib *rib)
{
	return (&rib->tree);
}

static inline int
rib_compare(const struct rib_entry *a, const struct rib_entry *b)
{
	return (pt_prefix_cmp(a->prefix, b->prefix));
}

/* RIB specific functions */
struct rib *
rib_new(char *name, u_int rtableid, uint16_t flags)
{
	struct rib *new;
	uint16_t id;

	for (id = 0; id < rib_size; id++) {
		if (ribs[id] == NULL)
			break;
	}

	if (id >= rib_size) {
		if ((ribs = recallocarray(ribs, id, id + 8,
		    sizeof(struct rib))) == NULL)
			fatal(NULL);
		rib_size = id + 8;
	}

	if ((new = calloc(1, sizeof(*new))) == NULL)
		fatal(NULL);

	strlcpy(new->name, name, sizeof(new->name));
	RB_INIT(rib_tree(new));
	new->state = RECONF_REINIT;
	new->id = id;
	new->flags = flags;
	new->rtableid = rtableid;

	new->in_rules = calloc(1, sizeof(struct filter_head));
	if (new->in_rules == NULL)
		fatal(NULL);
	TAILQ_INIT(new->in_rules);

	ribs[id] = new;

	log_debug("%s: %s -> %u", __func__, name, id);
	return (new);
}

/*
 * This function is only called when the FIB information of a RIB changed.
 * It will flush the FIB if there was one previously and change the fibstate
 * from RECONF_NONE (nothing to do) to either RECONF_RELOAD (reload the FIB)
 * or RECONF_REINIT (rerun the route decision process for every element)
 * depending on the new flags.
 */
int
rib_update(struct rib *rib)
{
	/* flush fib first if there was one */
	if ((rib->flags & (F_RIB_NOFIB | F_RIB_NOEVALUATE)) == 0)
		rde_send_kroute_flush(rib);

	/* if the no-evaluate flag changed then a full reinit is needed */
	if ((rib->flags & F_RIB_NOEVALUATE) !=
	    (rib->flags_tmp & F_RIB_NOEVALUATE))
		rib->fibstate = RECONF_REINIT;

	rib->flags = rib->flags_tmp;
	rib->rtableid = rib->rtableid_tmp;

	/* reload fib if there is no reinit pending and there will be a fib */
	if (rib->fibstate != RECONF_REINIT &&
	    (rib->flags & (F_RIB_NOFIB | F_RIB_NOEVALUATE)) == 0)
		rib->fibstate = RECONF_RELOAD;

	return (rib->fibstate == RECONF_REINIT);
}

struct rib *
rib_byid(uint16_t id)
{
	if (id == RIB_NOTFOUND || id >= rib_size || ribs[id] == NULL)
		return NULL;
	return ribs[id];
}

uint16_t
rib_find(char *name)
{
	uint16_t id;

	/* no name returns the first Loc-RIB */
	if (name == NULL || *name == '\0')
		return RIB_LOC_START;

	for (id = 0; id < rib_size; id++) {
		if (ribs[id] != NULL && !strcmp(ribs[id]->name, name))
			return id;
	}

	return RIB_NOTFOUND;
}

void
rib_free(struct rib *rib)
{
	struct rib_entry *re, *xre;
	struct prefix *p;

	rib_dump_abort(rib->id);

	/*
	 * Flush the rib. Route evaluation and FIB sync are disabled to
	 * speed up the prefix removal; nothing depends on this data
	 * anymore.
	 */
	if ((rib->flags & (F_RIB_NOFIB | F_RIB_NOEVALUATE)) == 0)
		rde_send_kroute_flush(rib);
	rib->flags |= F_RIB_NOEVALUATE | F_RIB_NOFIB;

	for (re = RB_MIN(rib_tree, rib_tree(rib)); re != NULL; re = xre) {
		xre = RB_NEXT(rib_tree, rib_tree(rib), re);

		/*
		 * Removing the prefixes is tricky because the last one
		 * removes the rib_entry as well, and since prefix_destroy()
		 * does an empty check it is not possible to use the usual
		 * for loop here.
		 */
		while ((p = TAILQ_FIRST(&re->prefix_h))) {
			struct rde_aspath *asp = prefix_aspath(p);
			if (asp && asp->pftableid)
				rde_pftable_del(asp->pftableid, p);
			prefix_destroy(p);
		}
	}
	if (rib->id <= RIB_LOC_START)
		return; /* never remove the default ribs */
	filterlist_free(rib->in_rules_tmp);
	filterlist_free(rib->in_rules);
	ribs[rib->id] = NULL;
	free(rib);
}

void
rib_shutdown(void)
{
	struct rib *rib;
	uint16_t id;

	for (id = 0; id < rib_size; id++) {
		rib = rib_byid(id);
		if (rib == NULL)
			continue;
		if (!RB_EMPTY(rib_tree(ribs[id]))) {
			log_warnx("%s: rib %s is not empty", __func__,
			    ribs[id]->name);
		}
		rib_free(ribs[id]);
	}
	for (id = 0; id <= RIB_LOC_START; id++) {
		rib = rib_byid(id);
		if (rib == NULL)
			continue;
		filterlist_free(rib->in_rules_tmp);
		filterlist_free(rib->in_rules);
		ribs[id] = NULL;
		free(rib);
	}
	free(ribs);
}

struct rib_entry *
rib_get(struct rib *rib, struct bgpd_addr *prefix, int prefixlen)
{
	struct rib_entry xre, *re;

	memset(&xre, 0, sizeof(xre));
	xre.prefix = pt_fill(prefix, prefixlen);

	re = RB_FIND(rib_tree, rib_tree(rib), &xre);
	if (re && re->rib_id != rib->id)
		fatalx("%s: Unexpected RIB %u != %u.", __func__,
		    re->rib_id, rib->id);
	return re;
}

struct rib_entry *
rib_match(struct rib *rib, struct bgpd_addr *addr)
{
	struct rib_entry *re;
	int		 i;

	switch (addr->aid) {
	case AID_INET:
	case AID_VPN_IPv4:
		for (i = 32; i >= 0; i--) {
			re = rib_get(rib, addr, i);
			if (re != NULL)
				return (re);
		}
		break;
	case AID_INET6:
	case AID_VPN_IPv6:
		for (i = 128; i >= 0; i--) {
			re = rib_get(rib, addr, i);
			if (re != NULL)
				return (re);
		}
		break;
	default:
		fatalx("%s: unknown af", __func__);
	}
	return (NULL);
}
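
/*
 * Note: rib_match() above is a simple longest-prefix-match. It probes
 * rib_get() from the most specific prefixlen down to 0 and returns the
 * first entry found, so for e.g. 192.0.2.1 a /24 entry wins over a /16
 * entry and over the default route.
 */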

struct rib_entry *
rib_add(struct rib *rib, struct bgpd_addr *prefix, int prefixlen)
{
	struct pt_entry	*pte;
	struct rib_entry *re;

	pte = pt_get(prefix, prefixlen);
	if (pte == NULL)
		pte = pt_add(prefix, prefixlen);

	if ((re = calloc(1, sizeof(*re))) == NULL)
		fatal("rib_add");

	TAILQ_INIT(&re->prefix_h);
	re->prefix = pt_ref(pte);
	re->rib_id = rib->id;

	if (RB_INSERT(rib_tree, rib_tree(rib), re) != NULL) {
		log_warnx("rib_add: insert failed");
		free(re);
		return (NULL);
	}

	rdemem.rib_cnt++;

	return (re);
}

void
rib_remove(struct rib_entry *re)
{
	if (!rib_empty(re))
		fatalx("rib_remove: entry not empty");

	if (re_is_locked(re))
		/* entry is locked, don't free it. */
		return;

	pt_unref(re->prefix);

	if (RB_REMOVE(rib_tree, rib_tree(re_rib(re)), re) == NULL)
		log_warnx("rib_remove: remove failed.");

	free(re);
	rdemem.rib_cnt--;
}

int
rib_empty(struct rib_entry *re)
{
	return TAILQ_EMPTY(&re->prefix_h);
}

static struct rib_entry *
rib_restart(struct rib_context *ctx)
{
	struct rib_entry *re;

	re = re_unlock(ctx->ctx_re);

	/* find first non-empty element */
	while (re && rib_empty(re))
		re = RB_NEXT(rib_tree, unused, re);

	/* free the previously locked rib element if empty */
	if (rib_empty(ctx->ctx_re))
		rib_remove(ctx->ctx_re);
	ctx->ctx_re = NULL;
	return (re);
}

static void
rib_dump_r(struct rib_context *ctx)
{
	struct rib_entry	*re, *next;
	struct rib		*rib;
	unsigned int		 i;

	rib = rib_byid(ctx->ctx_id);
	if (rib == NULL)
		fatalx("%s: rib id %u gone", __func__, ctx->ctx_id);

	if (ctx->ctx_re == NULL)
		re = RB_MIN(rib_tree, rib_tree(rib));
	else
		re = rib_restart(ctx);

	for (i = 0; re != NULL; re = next) {
		next = RB_NEXT(rib_tree, unused, re);
		if (re->rib_id != ctx->ctx_id)
			fatalx("%s: Unexpected RIB %u != %u.", __func__,
			    re->rib_id, ctx->ctx_id);
		if (ctx->ctx_aid != AID_UNSPEC &&
		    ctx->ctx_aid != re->prefix->aid)
			continue;
		if (ctx->ctx_count && i++ >= ctx->ctx_count &&
		    !re_is_locked(re)) {
			/* store and lock last element */
			ctx->ctx_re = re_lock(re);
			return;
		}
		ctx->ctx_rib_call(re, ctx->ctx_arg);
	}

	if (ctx->ctx_done)
		ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid);
	LIST_REMOVE(ctx, entry);
	free(ctx);
}

int
rib_dump_pending(void)
{
	struct rib_context *ctx;

	/* return true if at least one context is not throttled */
	LIST_FOREACH(ctx, &rib_dumps, entry) {
		if (ctx->ctx_throttle && ctx->ctx_throttle(ctx->ctx_arg))
			continue;
		return 1;
	}
	return 0;
}

void
rib_dump_runner(void)
{
	struct rib_context *ctx, *next;

	LIST_FOREACH_SAFE(ctx, &rib_dumps, entry, next) {
		if (ctx->ctx_throttle && ctx->ctx_throttle(ctx->ctx_arg))
			continue;
		if (ctx->ctx_rib_call != NULL)
			rib_dump_r(ctx);
		else
			prefix_dump_r(ctx);
	}
}

static void
rib_dump_abort(uint16_t id)
{
	struct rib_context *ctx, *next;

	LIST_FOREACH_SAFE(ctx, &rib_dumps, entry, next) {
		if (id != ctx->ctx_id)
			continue;
		if (ctx->ctx_done)
			ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid);
		if (ctx->ctx_re && rib_empty(re_unlock(ctx->ctx_re)))
			rib_remove(ctx->ctx_re);
		if (ctx->ctx_p && prefix_is_dead(prefix_unlock(ctx->ctx_p)))
			prefix_adjout_destroy(ctx->ctx_p);
		LIST_REMOVE(ctx, entry);
		free(ctx);
	}
}

void
rib_dump_terminate(void *arg)
{
	struct rib_context *ctx, *next;

	LIST_FOREACH_SAFE(ctx, &rib_dumps, entry, next) {
		if (ctx->ctx_arg != arg)
			continue;
		if (ctx->ctx_done)
			ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid);
		if (ctx->ctx_re && rib_empty(re_unlock(ctx->ctx_re)))
			rib_remove(ctx->ctx_re);
		if (ctx->ctx_p && prefix_is_dead(prefix_unlock(ctx->ctx_p)))
			prefix_adjout_destroy(ctx->ctx_p);
		LIST_REMOVE(ctx, entry);
		free(ctx);
	}
}

int
rib_dump_new(uint16_t id, uint8_t aid, unsigned int count, void *arg,
    void (*upcall)(struct rib_entry *, void *), void (*done)(void *, uint8_t),
    int (*throttle)(void *))
{
	struct rib_context *ctx;

	if ((ctx = calloc(1, sizeof(*ctx))) == NULL)
		return -1;
	ctx->ctx_id = id;
	ctx->ctx_aid = aid;
	ctx->ctx_count = count;
	ctx->ctx_arg = arg;
	ctx->ctx_rib_call = upcall;
	ctx->ctx_done = done;
	ctx->ctx_throttle = throttle;

	LIST_INSERT_HEAD(&rib_dumps, ctx, entry);

	/* requested a sync traversal */
	if (count == 0)
		rib_dump_r(ctx);

	return 0;
}
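
/*
 * Example usage of rib_dump_new() (a sketch, the callback names are
 * made up):
 *
 *	static void show_re(struct rib_entry *re, void *arg);
 *	static void show_done(void *arg, uint8_t aid);
 *	static int show_throttled(void *arg);
 *
 *	rib_dump_new(RIB_LOC_START, AID_UNSPEC, 100, req,
 *	    show_re, show_done, show_throttled);
 *
 * With count == 100 at most 100 entries are visited per
 * rib_dump_runner() pass and the walk is suspended via re_lock();
 * count == 0 requests a full synchronous traversal.
 */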

/* path specific functions */

static struct rde_aspath *path_lookup(struct rde_aspath *);
static uint64_t path_hash(struct rde_aspath *);
static void path_link(struct rde_aspath *);
static void path_unlink(struct rde_aspath *);

struct path_table {
	struct aspath_head	*path_hashtbl;
	uint64_t		 path_hashmask;
} pathtable;

SIPHASH_KEY pathtablekey;

#define	PATH_HASH(x)	&pathtable.path_hashtbl[x & pathtable.path_hashmask]

static inline struct rde_aspath *
path_ref(struct rde_aspath *asp)
{
	if ((asp->flags & F_ATTR_LINKED) == 0)
		fatalx("%s: unlinked object", __func__);
	asp->refcnt++;
	rdemem.path_refs++;

	return asp;
}

static inline void
path_unref(struct rde_aspath *asp)
{
	if (asp == NULL)
		return;
	if ((asp->flags & F_ATTR_LINKED) == 0)
		fatalx("%s: unlinked object", __func__);
	asp->refcnt--;
	rdemem.path_refs--;
	if (asp->refcnt <= 0)
		path_unlink(asp);
}

void
path_init(uint32_t hashsize)
{
	uint32_t	hs, i;

	for (hs = 1; hs < hashsize; hs <<= 1)
		;
	pathtable.path_hashtbl = calloc(hs, sizeof(*pathtable.path_hashtbl));
	if (pathtable.path_hashtbl == NULL)
		fatal("path_init");

	for (i = 0; i < hs; i++)
		LIST_INIT(&pathtable.path_hashtbl[i]);

	pathtable.path_hashmask = hs - 1;
	arc4random_buf(&pathtablekey, sizeof(pathtablekey));
}

void
path_shutdown(void)
{
	uint32_t	i;

	for (i = 0; i <= pathtable.path_hashmask; i++)
		if (!LIST_EMPTY(&pathtable.path_hashtbl[i]))
			log_warnx("%s: free non-free table", __func__);

	free(pathtable.path_hashtbl);
}

void
path_hash_stats(struct rde_hashstats *hs)
{
	struct rde_aspath	*a;
	uint32_t		i;
	int64_t			n;

	memset(hs, 0, sizeof(*hs));
	strlcpy(hs->name, "path hash", sizeof(hs->name));
	hs->min = LLONG_MAX;
	hs->num = pathtable.path_hashmask + 1;

	for (i = 0; i <= pathtable.path_hashmask; i++) {
		n = 0;
		LIST_FOREACH(a, &pathtable.path_hashtbl[i], path_l)
			n++;
		if (n < hs->min)
			hs->min = n;
		if (n > hs->max)
			hs->max = n;
		hs->sum += n;
		hs->sumq += n * n;
	}
}

int
path_compare(struct rde_aspath *a, struct rde_aspath *b)
{
	int		 r;

	if (a == NULL && b == NULL)
		return (0);
	else if (b == NULL)
		return (1);
	else if (a == NULL)
		return (-1);
	if ((a->flags & ~F_ATTR_LINKED) > (b->flags & ~F_ATTR_LINKED))
		return (1);
	if ((a->flags & ~F_ATTR_LINKED) < (b->flags & ~F_ATTR_LINKED))
		return (-1);
	if (a->origin > b->origin)
		return (1);
	if (a->origin < b->origin)
		return (-1);
	if (a->med > b->med)
		return (1);
	if (a->med < b->med)
		return (-1);
	if (a->lpref > b->lpref)
		return (1);
	if (a->lpref < b->lpref)
		return (-1);
	if (a->weight > b->weight)
		return (1);
	if (a->weight < b->weight)
		return (-1);
	if (a->rtlabelid > b->rtlabelid)
		return (1);
	if (a->rtlabelid < b->rtlabelid)
		return (-1);
	if (a->pftableid > b->pftableid)
		return (1);
	if (a->pftableid < b->pftableid)
		return (-1);

	r = aspath_compare(a->aspath, b->aspath);
	if (r > 0)
		return (1);
	if (r < 0)
		return (-1);

	return (attr_compare(a, b));
}

static uint64_t
path_hash(struct rde_aspath *asp)
{
	SIPHASH_CTX	ctx;
	uint64_t	hash;

	SipHash24_Init(&ctx, &pathtablekey);
	SipHash24_Update(&ctx, &asp->aspath_hashstart,
	    (char *)&asp->aspath_hashend - (char *)&asp->aspath_hashstart);

	if (asp->aspath)
		SipHash24_Update(&ctx, asp->aspath->data, asp->aspath->len);

	hash = attr_hash(asp);
	SipHash24_Update(&ctx, &hash, sizeof(hash));

	return (SipHash24_End(&ctx));
}

static struct rde_aspath *
path_lookup(struct rde_aspath *aspath)
{
	struct aspath_head	*head;
	struct rde_aspath	*asp;
	uint64_t		 hash;

	hash = path_hash(aspath);
	head = PATH_HASH(hash);

	LIST_FOREACH(asp, head, path_l) {
		if (asp->hash == hash && path_compare(aspath, asp) == 0)
			return (asp);
	}
	return (NULL);
}

/*
 * Link this aspath into the global hash table.
 * The asp must have been allocated with path_get().
 */
static void
path_link(struct rde_aspath *asp)
{
	struct aspath_head	*head;

	asp->hash = path_hash(asp);
	head = PATH_HASH(asp->hash);

	LIST_INSERT_HEAD(head, asp, path_l);
	asp->flags |= F_ATTR_LINKED;
}

/*
 * This function may only be called after all prefixes have been removed;
 * normally this happens directly out of the prefix removal functions.
 */
static void
path_unlink(struct rde_aspath *asp)
{
	if (asp == NULL)
		return;

	/* make sure no reference is held for this rde_aspath */
	if (asp->refcnt != 0)
		fatalx("%s: still holds references", __func__);

	LIST_REMOVE(asp, path_l);
	asp->flags &= ~F_ATTR_LINKED;

	path_put(asp);
}

/*
 * Copy asp to a new UNLINKED aspath.
 * Either path_get() or path_prep() must have been called on dst beforehand.
 */
struct rde_aspath *
path_copy(struct rde_aspath *dst, const struct rde_aspath *src)
{
	dst->aspath = src->aspath;
	if (dst->aspath != NULL) {
		dst->aspath->refcnt++;
		rdemem.aspath_refs++;
	}
	dst->hash = 0;		/* not linked so no hash and no refcnt */
	dst->refcnt = 0;
	dst->flags = src->flags & ~F_ATTR_LINKED;

	dst->med = src->med;
	dst->lpref = src->lpref;
	dst->weight = src->weight;
	dst->rtlabelid = rtlabel_ref(src->rtlabelid);
	dst->pftableid = pftable_ref(src->pftableid);
	dst->origin = src->origin;

	attr_copy(dst, src);

	return (dst);
}

/* initialize or prepare an aspath for use */
struct rde_aspath *
path_prep(struct rde_aspath *asp)
{
	memset(asp, 0, sizeof(*asp));
	asp->origin = ORIGIN_INCOMPLETE;
	asp->lpref = DEFAULT_LPREF;

	return (asp);
}

/* alloc and initialize new entry. May not fail. */
struct rde_aspath *
path_get(void)
{
	struct rde_aspath *asp;

	asp = malloc(sizeof(*asp));
	if (asp == NULL)
		fatal("path_get");
	rdemem.path_cnt++;

	return (path_prep(asp));
}

/* clean up an asp after use (frees all references to sub-objects) */
void
path_clean(struct rde_aspath *asp)
{
	if (asp->flags & F_ATTR_LINKED)
		fatalx("%s: linked object", __func__);

	rtlabel_unref(asp->rtlabelid);
	pftable_unref(asp->pftableid);
	aspath_put(asp->aspath);
	attr_freeall(asp);
}

/* free an unlinked element */
void
path_put(struct rde_aspath *asp)
{
	if (asp == NULL)
		return;

	path_clean(asp);

	rdemem.path_cnt--;
	free(asp);
}
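
/*
 * Typical rde_aspath life cycle, as used by prefix_update() below
 * (shown as a sketch):
 *
 *	if ((asp = path_lookup(nasp)) == NULL) {
 *		asp = path_copy(path_get(), nasp);
 *		path_link(asp);
 *	}
 *	... prefixes hold the object via path_ref()/path_unref() ...
 *
 * The final path_unref() drops the refcnt to 0, which calls
 * path_unlink() and in turn frees the object with path_put().
 */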

/* prefix specific functions */

static int	prefix_add(struct bgpd_addr *, int, struct rib *,
		    struct rde_peer *, uint32_t, uint32_t, struct rde_aspath *,
		    struct rde_community *, struct nexthop *,
		    uint8_t, uint8_t);
static int	prefix_move(struct prefix *, struct rde_peer *,
		    struct rde_aspath *, struct rde_community *,
		    struct nexthop *, uint8_t, uint8_t);

static void	prefix_link(struct prefix *, struct rib_entry *,
		     struct pt_entry *, struct rde_peer *, uint32_t, uint32_t,
		     struct rde_aspath *, struct rde_community *,
		     struct nexthop *, uint8_t, uint8_t);
static void	prefix_unlink(struct prefix *);

static struct prefix	*prefix_alloc(void);
static void		 prefix_free(struct prefix *);

/* RB tree comparison function */
static inline int
prefix_index_cmp(struct prefix *a, struct prefix *b)
{
	int r;
	r = pt_prefix_cmp(a->pt, b->pt);
	if (r != 0)
		return r;

	if (a->path_id_tx > b->path_id_tx)
		return 1;
	if (a->path_id_tx < b->path_id_tx)
		return -1;
	return 0;
}

static inline int
prefix_cmp(struct prefix *a, struct prefix *b)
{
	if ((a->flags & PREFIX_FLAG_EOR) != (b->flags & PREFIX_FLAG_EOR))
		return (a->flags & PREFIX_FLAG_EOR) ? 1 : -1;
	/* if EOR marker no need to check the rest */
	if (a->flags & PREFIX_FLAG_EOR)
		return 0;

	if (a->aspath != b->aspath)
		return (a->aspath > b->aspath ? 1 : -1);
	if (a->communities != b->communities)
		return (a->communities > b->communities ? 1 : -1);
	if (a->nexthop != b->nexthop)
		return (a->nexthop > b->nexthop ? 1 : -1);
	if (a->nhflags != b->nhflags)
		return (a->nhflags > b->nhflags ? 1 : -1);
	return prefix_index_cmp(a, b);
}

RB_GENERATE(prefix_tree, prefix, entry.tree.update, prefix_cmp)
RB_GENERATE_STATIC(prefix_index, prefix, entry.tree.index, prefix_index_cmp)
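
/*
 * Note on the two trees generated above: prefix_index_cmp orders the
 * per-peer Adj-RIB-Out index by prefix and path_id_tx only, while
 * prefix_cmp orders the update/withdraw queues so that prefixes with
 * identical attributes (aspath, communities, nexthop, nhflags) sort
 * next to each other, which lets the update engine batch them into a
 * single UPDATE message.
 */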

/*
 * Search for specified prefix of a peer. Returns NULL if not found.
 */
struct prefix *
prefix_get(struct rib *rib, struct rde_peer *peer, uint32_t path_id,
    struct bgpd_addr *prefix, int prefixlen)
{
	struct rib_entry	*re;

	re = rib_get(rib, prefix, prefixlen);
	if (re == NULL)
		return (NULL);
	return (prefix_bypeer(re, peer, path_id));
}

/*
 * Search for specified prefix in the peer prefix_index.
 * Returns NULL if not found.
 */
struct prefix *
prefix_adjout_get(struct rde_peer *peer, uint32_t path_id_tx,
    struct bgpd_addr *prefix, int prefixlen)
{
	struct prefix xp;

	memset(&xp, 0, sizeof(xp));
	xp.pt = pt_fill(prefix, prefixlen);
	xp.path_id_tx = path_id_tx;

	return RB_FIND(prefix_index, &peer->adj_rib_out, &xp);
}

/*
 * Lookup a prefix without considering path_id in the peer prefix_index.
 * Returns NULL if not found.
 */
struct prefix *
prefix_adjout_lookup(struct rde_peer *peer, struct bgpd_addr *prefix,
    int prefixlen)
{
	struct prefix xp, *np;

	memset(&xp, 0, sizeof(xp));
	xp.pt = pt_fill(prefix, prefixlen);

	np = RB_NFIND(prefix_index, &peer->adj_rib_out, &xp);
	if (np == NULL || pt_prefix_cmp(np->pt, xp.pt) != 0)
		return NULL;
	return np;
}

/*
 * Return next prefix after a lookup that is actually an update.
 */
struct prefix *
prefix_adjout_next(struct rde_peer *peer, struct prefix *p)
{
	struct prefix *np;

	np = RB_NEXT(prefix_index, &peer->adj_rib_out, p);
	if (np == NULL || np->pt != p->pt)
		return NULL;
	return np;
}

/*
 * Lookup addr in the peer prefix_index. Returns first match.
 * Returns NULL if not found.
 */
struct prefix *
prefix_adjout_match(struct rde_peer *peer, struct bgpd_addr *addr)
{
	struct prefix *p;
	int i;

	switch (addr->aid) {
	case AID_INET:
	case AID_VPN_IPv4:
		for (i = 32; i >= 0; i--) {
			p = prefix_adjout_lookup(peer, addr, i);
			if (p != NULL)
				return p;
		}
		break;
	case AID_INET6:
	case AID_VPN_IPv6:
		for (i = 128; i >= 0; i--) {
			p = prefix_adjout_lookup(peer, addr, i);
			if (p != NULL)
				return p;
		}
		break;
	default:
		fatalx("%s: unknown af", __func__);
	}
	return NULL;
}

/*
 * Update a prefix.
 * Return 1 if prefix was newly added, 0 if it was just changed.
 */
int
prefix_update(struct rib *rib, struct rde_peer *peer, uint32_t path_id,
    uint32_t path_id_tx, struct filterstate *state, struct bgpd_addr *prefix,
    int prefixlen, uint8_t vstate)
{
	struct rde_aspath	*asp, *nasp = &state->aspath;
	struct rde_community	*comm, *ncomm = &state->communities;
	struct prefix		*p;

	/*
	 * First try to find a prefix in the specified RIB.
	 */
	if ((p = prefix_get(rib, peer, path_id, prefix, prefixlen)) != NULL) {
		if (path_id_tx != p->path_id_tx)
			fatalx("path_id mismatch");
		if (prefix_nexthop(p) == state->nexthop &&
		    prefix_nhflags(p) == state->nhflags &&
		    communities_equal(ncomm, prefix_communities(p)) &&
		    path_compare(nasp, prefix_aspath(p)) == 0) {
			/* no change, update last change */
			p->lastchange = getmonotime();
			p->validation_state = vstate;
			return (0);
		}
	}

	/*
	 * Either the prefix does not exist or the path changed.
	 * In both cases lookup the new aspath to make sure it is not
	 * already in the RIB.
	 */
	if ((asp = path_lookup(nasp)) == NULL) {
		/* Path not available, create and link a new one. */
		asp = path_copy(path_get(), nasp);
		path_link(asp);
	}

	if ((comm = communities_lookup(ncomm)) == NULL) {
		/* Communities not available, create and link a new one. */
		comm = communities_link(ncomm);
	}

	/* If the prefix was found move it, else add it to the RIB. */
	if (p != NULL)
		return (prefix_move(p, peer, asp, comm, state->nexthop,
		    state->nhflags, vstate));
	else
		return (prefix_add(prefix, prefixlen, rib, peer, path_id,
		    path_id_tx, asp, comm, state->nexthop, state->nhflags,
		    vstate));
}

/*
 * Adds or updates a prefix.
 */
static int
prefix_add(struct bgpd_addr *prefix, int prefixlen, struct rib *rib,
    struct rde_peer *peer, uint32_t path_id, uint32_t path_id_tx,
    struct rde_aspath *asp, struct rde_community *comm,
    struct nexthop *nexthop, uint8_t nhflags, uint8_t vstate)
{
	struct prefix		*p;
	struct rib_entry	*re;

	re = rib_get(rib, prefix, prefixlen);
	if (re == NULL)
		re = rib_add(rib, prefix, prefixlen);

	p = prefix_alloc();
	prefix_link(p, re, re->prefix, peer, path_id, path_id_tx, asp, comm,
	    nexthop, nhflags, vstate);

	/* add possible pftable reference from aspath */
	if (asp && asp->pftableid)
		rde_pftable_add(asp->pftableid, p);
	/* make route decision */
	prefix_evaluate(re, p, NULL);
	return (1);
}

/*
 * Move the prefix to the specified aspath, removing the old asp if needed.
 */
static int
prefix_move(struct prefix *p, struct rde_peer *peer,
    struct rde_aspath *asp, struct rde_community *comm,
    struct nexthop *nexthop, uint8_t nhflags, uint8_t vstate)
{
	struct prefix		*np;

	if (p->flags & PREFIX_FLAG_ADJOUT)
		fatalx("%s: prefix with PREFIX_FLAG_ADJOUT hit", __func__);

	if (peer != prefix_peer(p))
		fatalx("prefix_move: cross peer move");

	/* add new prefix node */
	np = prefix_alloc();
	prefix_link(np, prefix_re(p), p->pt, peer, p->path_id, p->path_id_tx,
	    asp, comm, nexthop, nhflags, vstate);

	/* add possible pftable reference from new aspath */
	if (asp && asp->pftableid)
		rde_pftable_add(asp->pftableid, np);

	/*
	 * no need to update the peer prefix count because we are only moving
	 * the prefix without changing the peer.
	 */
	prefix_evaluate(prefix_re(np), np, p);

	/* remove possible pftable reference from old path */
	if (p->aspath && p->aspath->pftableid)
		rde_pftable_del(p->aspath->pftableid, p);

	/* remove old prefix node */
	prefix_unlink(p);
	prefix_free(p);

	return (0);
}

/*
 * Removes a prefix from the specified RIB. If the parent objects --
 * rib_entry or pt_entry -- become empty, remove them too.
 */
int
prefix_withdraw(struct rib *rib, struct rde_peer *peer, uint32_t path_id,
    struct bgpd_addr *prefix, int prefixlen)
{
	struct prefix		*p;
	struct rde_aspath	*asp;

	p = prefix_get(rib, peer, path_id, prefix, prefixlen);
	if (p == NULL)		/* Got a dummy withdraw request. */
		return (0);

	if (p->flags & PREFIX_FLAG_ADJOUT)
		fatalx("%s: prefix with PREFIX_FLAG_ADJOUT hit", __func__);
	asp = prefix_aspath(p);
	if (asp && asp->pftableid)
		/* only prefixes in the local RIB were pushed into pf */
		rde_pftable_del(asp->pftableid, p);

	prefix_destroy(p);

	return (1);
}

/*
 * Insert an End-of-RIB marker into the update queue.
 */
void
prefix_add_eor(struct rde_peer *peer, uint8_t aid)
{
	struct prefix *p;

	p = prefix_alloc();
	p->flags = PREFIX_FLAG_ADJOUT | PREFIX_FLAG_UPDATE | PREFIX_FLAG_EOR;
	if (RB_INSERT(prefix_tree, &peer->updates[aid], p) != NULL)
		/* no need to add if EoR marker already present */
		prefix_free(p);
	/* EOR marker is not inserted into the adj_rib_out index */
}
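
/*
 * Overview of the Adj-RIB-Out prefix states used below: a freshly
 * updated prefix is linked and queued on the peer update tree
 * (PREFIX_FLAG_UPDATE); a withdrawn prefix is unlinked but kept on the
 * withdraw tree (PREFIX_FLAG_WITHDRAW); PREFIX_FLAG_DEAD marks a
 * destroyed but still locked prefix that a suspended dump cleans up in
 * prefix_restart(); PREFIX_FLAG_STALE marks prefixes awaiting
 * revalidation and is cleared once the prefix is re-announced or
 * withdrawn again.
 */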

/*
 * Put a prefix from the Adj-RIB-Out onto the update queue.
 */
void
prefix_adjout_update(struct prefix *p, struct rde_peer *peer,
    struct filterstate *state, struct bgpd_addr *prefix, int prefixlen,
    uint32_t path_id_tx, uint8_t vstate)
{
	struct rde_aspath *asp;
	struct rde_community *comm;

	if (p == NULL) {
		p = prefix_alloc();
		/* initially mark DEAD so code below is skipped */
		p->flags |= PREFIX_FLAG_ADJOUT | PREFIX_FLAG_DEAD;

		p->pt = pt_get(prefix, prefixlen);
		if (p->pt == NULL)
			p->pt = pt_add(prefix, prefixlen);
		pt_ref(p->pt);
		p->peer = peer;
		p->path_id_tx = path_id_tx;

		if (RB_INSERT(prefix_index, &peer->adj_rib_out, p) != NULL)
			fatalx("%s: RB index invariant violated", __func__);
	}

	if ((p->flags & PREFIX_FLAG_ADJOUT) == 0)
		fatalx("%s: prefix without PREFIX_FLAG_ADJOUT hit", __func__);
	if ((p->flags & (PREFIX_FLAG_WITHDRAW | PREFIX_FLAG_DEAD)) == 0) {
		/*
		 * XXX for now treat a different path_id_tx like different
		 * attributes and force out an update. It is unclear how
		 * common it is to have equivalent updates from alternative
		 * paths.
		 */
		if (p->path_id_tx == path_id_tx &&
		    prefix_nhflags(p) == state->nhflags &&
		    prefix_nexthop(p) == state->nexthop &&
		    communities_equal(&state->communities,
		    prefix_communities(p)) &&
		    path_compare(&state->aspath, prefix_aspath(p)) == 0) {
			/* nothing changed */
			p->validation_state = vstate;
			p->lastchange = getmonotime();
			p->flags &= ~PREFIX_FLAG_STALE;
			return;
		}

		/* if pending update unhook it before it is unlinked */
		if (p->flags & PREFIX_FLAG_UPDATE) {
			RB_REMOVE(prefix_tree, &peer->updates[prefix->aid], p);
			peer->up_nlricnt--;
		}

		/* unlink prefix so it can be relinked below */
		prefix_unlink(p);
		peer->prefix_out_cnt--;
	}
	if (p->flags & PREFIX_FLAG_WITHDRAW) {
		RB_REMOVE(prefix_tree, &peer->withdraws[prefix->aid], p);
		peer->up_wcnt--;
	}

	/* nothing needs to be done for PREFIX_FLAG_DEAD and STALE */
	p->flags &= ~PREFIX_FLAG_MASK;

	/* update path_id_tx now that the prefix is unlinked */
	if (p->path_id_tx != path_id_tx) {
		/* path_id_tx is part of the index so remove and re-insert p */
		RB_REMOVE(prefix_index, &peer->adj_rib_out, p);
		p->path_id_tx = path_id_tx;
		if (RB_INSERT(prefix_index, &peer->adj_rib_out, p) != NULL)
			fatalx("%s: RB index invariant violated", __func__);
	}

	if ((asp = path_lookup(&state->aspath)) == NULL) {
		/* Path not available, create and link a new one. */
		asp = path_copy(path_get(), &state->aspath);
		path_link(asp);
	}

	if ((comm = communities_lookup(&state->communities)) == NULL) {
		/* Communities not available, create and link a new one. */
		comm = communities_link(&state->communities);
	}

	prefix_link(p, NULL, p->pt, peer, 0, p->path_id_tx, asp, comm,
	    state->nexthop, state->nhflags, vstate);
	peer->prefix_out_cnt++;

	if (p->flags & PREFIX_FLAG_MASK)
		fatalx("%s: bad flags %x", __func__, p->flags);
	p->flags |= PREFIX_FLAG_UPDATE;
	if (RB_INSERT(prefix_tree, &peer->updates[prefix->aid], p) != NULL)
		fatalx("%s: RB tree invariant violated", __func__);
	peer->up_nlricnt++;
}

/*
 * Withdraw a prefix from the Adj-RIB-Out, this unlinks the aspath but leaves
 * the prefix in the RIB linked to the peer withdraw list.
 */
void
prefix_adjout_withdraw(struct prefix *p)
{
	struct rde_peer *peer = prefix_peer(p);

	if ((p->flags & PREFIX_FLAG_ADJOUT) == 0)
		fatalx("%s: prefix without PREFIX_FLAG_ADJOUT hit", __func__);

	/* already a withdraw, shortcut */
	if (p->flags & PREFIX_FLAG_WITHDRAW) {
		p->lastchange = getmonotime();
		p->flags &= ~PREFIX_FLAG_STALE;
		return;
	}
	/* pending update just got withdrawn */
	if (p->flags & PREFIX_FLAG_UPDATE) {
		RB_REMOVE(prefix_tree, &peer->updates[p->pt->aid], p);
		peer->up_nlricnt--;
	}
	/* unlink prefix if it was linked (not a withdraw or dead) */
	if ((p->flags & (PREFIX_FLAG_WITHDRAW | PREFIX_FLAG_DEAD)) == 0) {
		prefix_unlink(p);
		peer->prefix_out_cnt--;
	}

	/* nothing needs to be done for PREFIX_FLAG_DEAD and STALE */
	p->flags &= ~PREFIX_FLAG_MASK;
	p->lastchange = getmonotime();

	p->flags |= PREFIX_FLAG_WITHDRAW;
	if (RB_INSERT(prefix_tree, &peer->withdraws[p->pt->aid], p) != NULL)
		fatalx("%s: RB tree invariant violated", __func__);
	peer->up_wcnt++;
}

void
prefix_adjout_destroy(struct prefix *p)
{
	struct rde_peer *peer = prefix_peer(p);

	if ((p->flags & PREFIX_FLAG_ADJOUT) == 0)
		fatalx("%s: prefix without PREFIX_FLAG_ADJOUT hit", __func__);

	if (p->flags & PREFIX_FLAG_EOR) {
		/* EOR marker is not linked in the index */
		prefix_free(p);
		return;
	}

	if (p->flags & PREFIX_FLAG_WITHDRAW) {
		RB_REMOVE(prefix_tree, &peer->withdraws[p->pt->aid], p);
		peer->up_wcnt--;
	}
	if (p->flags & PREFIX_FLAG_UPDATE) {
		RB_REMOVE(prefix_tree, &peer->updates[p->pt->aid], p);
		peer->up_nlricnt--;
	}
	/* unlink prefix if it was linked (not a withdraw or dead) */
	if ((p->flags & (PREFIX_FLAG_WITHDRAW | PREFIX_FLAG_DEAD)) == 0) {
		prefix_unlink(p);
		peer->prefix_out_cnt--;
	}

	/* nothing needs to be done for PREFIX_FLAG_DEAD and STALE */
	p->flags &= ~PREFIX_FLAG_MASK;

	if (prefix_is_locked(p)) {
		/* mark prefix dead but leave it for prefix_restart */
		p->flags |= PREFIX_FLAG_DEAD;
	} else {
		RB_REMOVE(prefix_index, &peer->adj_rib_out, p);
		/* remove the last prefix reference before free */
		pt_unref(p->pt);
		prefix_free(p);
	}
}

static struct prefix *
prefix_restart(struct rib_context *ctx)
{
	struct prefix *p;

	p = prefix_unlock(ctx->ctx_p);

	if (prefix_is_dead(p)) {
		struct prefix *next;

		next = RB_NEXT(prefix_index, unused, p);
		prefix_adjout_destroy(p);
		p = next;
	}
	ctx->ctx_p = NULL;
	return p;
}

static void
prefix_dump_r(struct rib_context *ctx)
{
	struct prefix *p, *next;
	struct rde_peer *peer;
	unsigned int i;

	if ((peer = peer_get(ctx->ctx_id)) == NULL)
		goto done;

	if (ctx->ctx_p == NULL)
		p = RB_MIN(prefix_index, &peer->adj_rib_out);
	else
		p = prefix_restart(ctx);

	for (i = 0; p != NULL; p = next) {
		next = RB_NEXT(prefix_index, unused, p);
		if (prefix_is_dead(p))
			continue;
		if (ctx->ctx_aid != AID_UNSPEC &&
		    ctx->ctx_aid != p->pt->aid)
			continue;
		if (ctx->ctx_count && i++ >= ctx->ctx_count &&
		    !prefix_is_locked(p)) {
			/* store and lock last element */
			ctx->ctx_p = prefix_lock(p);
			return;
		}
		ctx->ctx_prefix_call(p, ctx->ctx_arg);
	}

done:
	if (ctx->ctx_done)
		ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid);
	LIST_REMOVE(ctx, entry);
	free(ctx);
}

int
prefix_dump_new(struct rde_peer *peer, uint8_t aid, unsigned int count,
    void *arg, void (*upcall)(struct prefix *, void *),
    void (*done)(void *, uint8_t), int (*throttle)(void *))
{
	struct rib_context *ctx;

	if ((ctx = calloc(1, sizeof(*ctx))) == NULL)
		return -1;
	ctx->ctx_id = peer->conf.id;
	ctx->ctx_aid = aid;
	ctx->ctx_count = count;
	ctx->ctx_arg = arg;
	ctx->ctx_prefix_call = upcall;
	ctx->ctx_done = done;
	ctx->ctx_throttle = throttle;

	LIST_INSERT_HEAD(&rib_dumps, ctx, entry);

	/* requested a sync traversal */
	if (count == 0)
		prefix_dump_r(ctx);

	return 0;
}

/* dump a prefix into specified buffer */
int
prefix_write(u_char *buf, int len, struct bgpd_addr *prefix, uint8_t plen,
    int withdraw)
{
	int	totlen, psize;

	switch (prefix->aid) {
	case AID_INET:
	case AID_INET6:
		totlen = PREFIX_SIZE(plen);

		if (totlen > len)
			return (-1);
		*buf++ = plen;
		memcpy(buf, &prefix->ba, totlen - 1);
		return (totlen);
	case AID_VPN_IPv4:
	case AID_VPN_IPv6:
		totlen = PREFIX_SIZE(plen) + sizeof(prefix->rd);
		psize = PREFIX_SIZE(plen) - 1;
		plen += sizeof(prefix->rd) * 8;
		if (withdraw) {
			/* withdraws have one compat label as a placeholder */
			totlen += 3;
			plen += 3 * 8;
		} else {
			totlen += prefix->labellen;
			plen += prefix->labellen * 8;
		}

		if (totlen > len)
			return (-1);
		*buf++ = plen;
		if (withdraw) {
			/* magic compatibility label as per rfc8277 */
			*buf++ = 0x80;
			*buf++ = 0x0;
			*buf++ = 0x0;
		} else {
			memcpy(buf, &prefix->labelstack,
			    prefix->labellen);
			buf += prefix->labellen;
		}
		memcpy(buf, &prefix->rd, sizeof(prefix->rd));
		buf += sizeof(prefix->rd);
		memcpy(buf, &prefix->ba, psize);
		return (totlen);
	default:
		return (-1);
	}
}
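
/*
 * Example: for AID_INET the encoding produced by prefix_write() is the
 * plain RFC 4271 NLRI form <plen, prefix>, e.g. 192.0.2.0/24 becomes
 * the four bytes 0x18 0xc0 0x00 0x02 and 10.0.0.0/8 becomes 0x08 0x0a.
 */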

int
prefix_writebuf(struct ibuf *buf, struct bgpd_addr *prefix, uint8_t plen)
{
	int	 totlen;
	void	*bptr;

	switch (prefix->aid) {
	case AID_INET:
	case AID_INET6:
		totlen = PREFIX_SIZE(plen);
		break;
	case AID_VPN_IPv4:
	case AID_VPN_IPv6:
		totlen = PREFIX_SIZE(plen) + sizeof(prefix->rd) +
		    prefix->labellen;
		break;
	default:
		return (-1);
	}

	if ((bptr = ibuf_reserve(buf, totlen)) == NULL)
		return (-1);
	if (prefix_write(bptr, totlen, prefix, plen, 0) == -1)
		return (-1);
	return (0);
}

/*
 * Searches the prefix list of the specified rib_entry for a prefix entry
 * belonging to the given peer. Returns NULL if no match is found.
 */
struct prefix *
prefix_bypeer(struct rib_entry *re, struct rde_peer *peer, uint32_t path_id)
{
	struct prefix	*p;

	TAILQ_FOREACH(p, &re->prefix_h, entry.list.rib)
		if (prefix_peer(p) == peer && p->path_id == path_id)
			return (p);
	return (NULL);
}

static void
prefix_evaluate_all(struct prefix *p, enum nexthop_state state,
    enum nexthop_state oldstate)
{
	struct rib_entry *re = prefix_re(p);

	/* Skip non-local RIBs or RIBs that are flagged as noeval. */
	if (re_rib(re)->flags & F_RIB_NOEVALUATE) {
		log_warnx("%s: prefix with F_RIB_NOEVALUATE hit", __func__);
		return;
	}

	if (oldstate == state) {
		/*
		 * The state of the nexthop did not change. The only
		 * thing that may have changed is the true_nexthop
		 * or other internal infos. This will not change
		 * the routing decision so shortcut here.
		 */
		if (state == NEXTHOP_REACH) {
			if ((re_rib(re)->flags & F_RIB_NOFIB) == 0 &&
			    p == prefix_best(re))
				rde_send_kroute(re_rib(re), p, NULL);
		}
		return;
	}

	/* redo the route decision */
	prefix_evaluate(prefix_re(p), p, p);
}

/* kill a prefix. */
void
prefix_destroy(struct prefix *p)
{
	/* make route decision */
	prefix_evaluate(prefix_re(p), NULL, p);

	prefix_unlink(p);
	prefix_free(p);
}

/*
 * Link a prefix into the different parent objects.
 */
static void
prefix_link(struct prefix *p, struct rib_entry *re, struct pt_entry *pt,
    struct rde_peer *peer, uint32_t path_id, uint32_t path_id_tx,
    struct rde_aspath *asp, struct rde_community *comm,
    struct nexthop *nexthop, uint8_t nhflags, uint8_t vstate)
{
	if (re)
		p->entry.list.re = re;
	p->aspath = path_ref(asp);
	p->communities = communities_ref(comm);
	p->peer = peer;
	p->pt = pt_ref(pt);
	p->path_id = path_id;
	p->path_id_tx = path_id_tx;
	p->validation_state = vstate;
	p->nhflags = nhflags;
	p->nexthop = nexthop_ref(nexthop);
	nexthop_link(p);
	p->lastchange = getmonotime();
}

/*
 * Unlink a prefix from the different parent objects.
 */
static void
prefix_unlink(struct prefix *p)
{
	struct rib_entry	*re = prefix_re(p);

	/* destroy all references to other objects */
	/* remove nexthop ref ... */
	nexthop_unlink(p);
	nexthop_unref(p->nexthop);
	p->nexthop = NULL;
	p->nhflags = 0;
	/* ... communities ... */
	communities_unref(p->communities);
	p->communities = NULL;
	/* and unlink from aspath */
	path_unref(p->aspath);
	p->aspath = NULL;

	if (re && rib_empty(re))
		rib_remove(re);

	pt_unref(p->pt);
}

/* alloc and zero new entry. May not fail. */
static struct prefix *
prefix_alloc(void)
{
	struct prefix *p;

	p = calloc(1, sizeof(*p));
	if (p == NULL)
		fatal("prefix_alloc");
	rdemem.prefix_cnt++;
	return p;
}

/* free an unlinked entry */
static void
prefix_free(struct prefix *p)
{
	rdemem.prefix_cnt--;
	free(p);
}

/*
 * nexthop functions
 */
struct nexthop_head	*nexthop_hash(struct bgpd_addr *);
struct nexthop		*nexthop_lookup(struct bgpd_addr *);

/*
 * In BGP there exist two nexthops: the exit nexthop, which was announced
 * via BGP, and the true nexthop, which is used in the FIB -- the forward
 * information base, a.k.a. the kernel routing table. When sending updates
 * it is even more confusing. In IBGP we pass the unmodified exit nexthop
 * to the neighbors while in EBGP normally the address of the router is
 * sent. The exit nexthop may be passed to the external neighbor if the
 * neighbor and the exit nexthop reside in the same subnet -- directly
 * connected.
 */
struct nexthop_table {
	LIST_HEAD(nexthop_head, nexthop)	*nexthop_hashtbl;
	uint32_t				 nexthop_hashmask;
} nexthoptable;

SIPHASH_KEY nexthoptablekey;

TAILQ_HEAD(nexthop_queue, nexthop)	nexthop_runners;

void
nexthop_init(uint32_t hashsize)
{
	uint32_t	 hs, i;

	for (hs = 1; hs < hashsize; hs <<= 1)
		;
	nexthoptable.nexthop_hashtbl = calloc(hs, sizeof(struct nexthop_head));
	if (nexthoptable.nexthop_hashtbl == NULL)
		fatal("nexthop_init");

	TAILQ_INIT(&nexthop_runners);
	for (i = 0; i < hs; i++)
		LIST_INIT(&nexthoptable.nexthop_hashtbl[i]);
	arc4random_buf(&nexthoptablekey, sizeof(nexthoptablekey));

	nexthoptable.nexthop_hashmask = hs - 1;
}

void
nexthop_shutdown(void)
{
	uint32_t		 i;
	struct nexthop		*nh, *nnh;

	for (i = 0; i <= nexthoptable.nexthop_hashmask; i++) {
		for (nh = LIST_FIRST(&nexthoptable.nexthop_hashtbl[i]);
		    nh != NULL; nh = nnh) {
			nnh = LIST_NEXT(nh, nexthop_l);
			nh->state = NEXTHOP_UNREACH;
			nexthop_unref(nh);
		}
		if (!LIST_EMPTY(&nexthoptable.nexthop_hashtbl[i])) {
			nh = LIST_FIRST(&nexthoptable.nexthop_hashtbl[i]);
			log_warnx("nexthop_shutdown: non-free table, "
			    "nexthop %s refcnt %d",
			    log_addr(&nh->exit_nexthop), nh->refcnt);
		}
	}

	free(nexthoptable.nexthop_hashtbl);
}

int
nexthop_pending(void)
{
	return !TAILQ_EMPTY(&nexthop_runners);
}

void
nexthop_runner(void)
{
	struct nexthop *nh;
	struct prefix *p;
	uint32_t j;

	nh = TAILQ_FIRST(&nexthop_runners);
	if (nh == NULL)
		return;

	/* remove from runner queue */
	TAILQ_REMOVE(&nexthop_runners, nh, runner_l);

	p = nh->next_prefix;
	for (j = 0; p != NULL && j < RDE_RUNNER_ROUNDS; j++) {
		prefix_evaluate_all(p, nh->state, nh->oldstate);
		p = LIST_NEXT(p, entry.list.nexthop);
	}

	/* prep for next run; if not finished re-add to tail of queue */
	nh->next_prefix = p;
	if (p != NULL)
		TAILQ_INSERT_TAIL(&nexthop_runners, nh, runner_l);
	else
		log_debug("nexthop %s update finished",
		    log_addr(&nh->exit_nexthop));
}

void
nexthop_update(struct kroute_nexthop *msg)
{
	struct nexthop		*nh;

	nh = nexthop_lookup(&msg->nexthop);
	if (nh == NULL) {
		log_warnx("nexthop_update: non-existent nexthop %s",
		    log_addr(&msg->nexthop));
		return;
	}

	nh->oldstate = nh->state;
	if (msg->valid)
		nh->state = NEXTHOP_REACH;
	else
		nh->state = NEXTHOP_UNREACH;

	if (nh->oldstate == NEXTHOP_LOOKUP)
		/* drop the reference which was held during the lookup */
		if (nexthop_unref(nh))
			return;		/* nh lost last ref, no work left */

	if (nh->next_prefix) {
		/*
		 * If nexthop_runner() is not finished with this nexthop
		 * then ensure that all prefixes are updated by setting
		 * the oldstate to NEXTHOP_FLAPPED.
		 */
		nh->oldstate = NEXTHOP_FLAPPED;
		TAILQ_REMOVE(&nexthop_runners, nh, runner_l);
	}

	if (msg->connected) {
		nh->flags |= NEXTHOP_CONNECTED;
		memcpy(&nh->true_nexthop, &nh->exit_nexthop,
		    sizeof(nh->true_nexthop));
	} else
		memcpy(&nh->true_nexthop, &msg->gateway,
		    sizeof(nh->true_nexthop));

	memcpy(&nh->nexthop_net, &msg->net,
	    sizeof(nh->nexthop_net));
	nh->nexthop_netlen = msg->netlen;

	nh->next_prefix = LIST_FIRST(&nh->prefix_h);
	if (nh->next_prefix != NULL) {
		TAILQ_INSERT_HEAD(&nexthop_runners, nh, runner_l);
		log_debug("nexthop %s update starting",
		    log_addr(&nh->exit_nexthop));
	}
}

void
nexthop_modify(struct nexthop *setnh, enum action_types type, uint8_t aid,
    struct nexthop **nexthop, uint8_t *flags)
{
	switch (type) {
	case ACTION_SET_NEXTHOP_REJECT:
		*flags = NEXTHOP_REJECT;
		break;
	case ACTION_SET_NEXTHOP_BLACKHOLE:
		*flags = NEXTHOP_BLACKHOLE;
		break;
	case ACTION_SET_NEXTHOP_NOMODIFY:
		*flags = NEXTHOP_NOMODIFY;
		break;
	case ACTION_SET_NEXTHOP_SELF:
		*flags = NEXTHOP_SELF;
		break;
	case ACTION_SET_NEXTHOP_REF:
		/*
		 * It is possible that a prefix matches but has the wrong
		 * address family for the set nexthop. In this case ignore it.
		 */
		if (aid != setnh->exit_nexthop.aid)
			break;
		nexthop_unref(*nexthop);
		*nexthop = nexthop_ref(setnh);
		*flags = 0;
		break;
	default:
		break;
	}
}

void
nexthop_link(struct prefix *p)
{
	if (p->nexthop == NULL)
		return;
	if (p->flags & PREFIX_FLAG_ADJOUT)
		return;

	/* no need to link prefixes in RIBs that have no decision process */
	if (re_rib(prefix_re(p))->flags & F_RIB_NOEVALUATE)
		return;

	p->flags |= PREFIX_NEXTHOP_LINKED;
	LIST_INSERT_HEAD(&p->nexthop->prefix_h, p, entry.list.nexthop);
}

void
nexthop_unlink(struct prefix *p)
{
	if (p->nexthop == NULL || (p->flags & PREFIX_NEXTHOP_LINKED) == 0)
		return;

	if (p == p->nexthop->next_prefix) {
		p->nexthop->next_prefix = LIST_NEXT(p, entry.list.nexthop);
		/* remove nexthop from list if no prefixes left to update */
		if (p->nexthop->next_prefix == NULL) {
			TAILQ_REMOVE(&nexthop_runners, p->nexthop, runner_l);
			log_debug("nexthop %s update finished",
			    log_addr(&p->nexthop->exit_nexthop));
		}
	}

	p->flags &= ~PREFIX_NEXTHOP_LINKED;
	LIST_REMOVE(p, entry.list.nexthop);
}

struct nexthop *
nexthop_get(struct bgpd_addr *nexthop)
{
	struct nexthop	*nh;

	nh = nexthop_lookup(nexthop);
	if (nh == NULL) {
		nh = calloc(1, sizeof(*nh));
		if (nh == NULL)
			fatal("nexthop_alloc");
		rdemem.nexthop_cnt++;

		LIST_INIT(&nh->prefix_h);
		nh->state = NEXTHOP_LOOKUP;
		nexthop_ref(nh);	/* take reference for lookup */
		nh->exit_nexthop = *nexthop;
		LIST_INSERT_HEAD(nexthop_hash(nexthop), nh,
		    nexthop_l);

		rde_send_nexthop(&nh->exit_nexthop, 1);
	}

	return nexthop_ref(nh);
}
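
/*
 * nexthop_get() returns a referenced nexthop; the caller owns one
 * reference and must drop it with nexthop_unref() when done, e.g.:
 *
 *	nh = nexthop_get(&addr);
 *	...
 *	nexthop_unref(nh);
 *
 * A freshly created nexthop starts in NEXTHOP_LOOKUP state and holds
 * an extra reference until nexthop_update() delivers the answer from
 * the kernel.
 */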

struct nexthop *
nexthop_ref(struct nexthop *nexthop)
{
	if (nexthop)
		nexthop->refcnt++;
	return (nexthop);
}

int
nexthop_unref(struct nexthop *nh)
{
	if (nh == NULL)
		return (0);
	if (--nh->refcnt > 0)
		return (0);

	/* sanity check */
	if (!LIST_EMPTY(&nh->prefix_h) || nh->state == NEXTHOP_LOOKUP)
		fatalx("%s: refcnt error", __func__);

	/* a nexthop update still running here would be a refcnt error */
	if (nh->next_prefix)
		fatalx("%s: next_prefix not NULL", __func__);

	LIST_REMOVE(nh, nexthop_l);
	rde_send_nexthop(&nh->exit_nexthop, 0);

	rdemem.nexthop_cnt--;
	free(nh);
	return (1);
}

int
nexthop_compare(struct nexthop *na, struct nexthop *nb)
{
	struct bgpd_addr	*a, *b;

	if (na == nb)
		return (0);
	if (na == NULL)
		return (-1);
	if (nb == NULL)
		return (1);

	a = &na->exit_nexthop;
	b = &nb->exit_nexthop;

	if (a->aid != b->aid)
		return (a->aid - b->aid);

	switch (a->aid) {
	case AID_INET:
		if (ntohl(a->v4.s_addr) > ntohl(b->v4.s_addr))
			return (1);
		if (ntohl(a->v4.s_addr) < ntohl(b->v4.s_addr))
			return (-1);
		return (0);
	case AID_INET6:
		return (memcmp(&a->v6, &b->v6, sizeof(struct in6_addr)));
	default:
		fatalx("nexthop_cmp: unknown af");
	}
	return (-1);
}

struct nexthop *
nexthop_lookup(struct bgpd_addr *nexthop)
{
	struct nexthop	*nh;

	LIST_FOREACH(nh, nexthop_hash(nexthop), nexthop_l) {
		if (memcmp(&nh->exit_nexthop, nexthop,
		    sizeof(struct bgpd_addr)) == 0)
			return (nh);
	}
	return (NULL);
}

struct nexthop_head *
nexthop_hash(struct bgpd_addr *nexthop)
{
	uint32_t	 h = 0;

	switch (nexthop->aid) {
	case AID_INET:
		h = SipHash24(&nexthoptablekey, &nexthop->v4.s_addr,
		    sizeof(nexthop->v4.s_addr));
		break;
	case AID_INET6:
		h = SipHash24(&nexthoptablekey, &nexthop->v6,
		    sizeof(struct in6_addr));
		break;
	default:
		fatalx("nexthop_hash: unsupported AID %d", nexthop->aid);
	}
	return (&nexthoptable.nexthop_hashtbl[h & nexthoptable.nexthop_hashmask]);
}
1984