/*	$OpenBSD: rde_rib.c,v 1.128 2011/01/14 20:07:00 henning Exp $ */

/*
 * Copyright (c) 2003, 2004 Claudio Jeker <claudio@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/types.h>
#include <sys/queue.h>
#include <sys/hash.h>

#include <stdlib.h>
#include <string.h>

#include "bgpd.h"
#include "rde.h"

/*
 * BGP RIB -- Routing Information Base
 *
 * The RIB is built with one aspect in mind: speed -- update speed, to be
 * precise. Therefore one thing must be avoided at all costs: long table
 * walks. This is achieved by heavily linking the different parts together.
 */
u_int16_t rib_size;
struct rib *ribs;

LIST_HEAD(, rib_context) rib_dump_h = LIST_HEAD_INITIALIZER(rib_dump_h);

struct rib_entry *rib_add(struct rib *, struct bgpd_addr *, int);
int rib_compare(const struct rib_entry *, const struct rib_entry *);
void rib_remove(struct rib_entry *);
int rib_empty(struct rib_entry *);
struct rib_entry *rib_restart(struct rib_context *);

RB_PROTOTYPE(rib_tree, rib_entry, rib_e, rib_compare);
RB_GENERATE(rib_tree, rib_entry, rib_e, rib_compare);

/* RIB specific functions */
u_int16_t
rib_new(char *name, u_int rtableid, u_int16_t flags)
{
	struct rib	*xribs;
	size_t		newsize;
	u_int16_t	id;

	for (id = 0; id < rib_size; id++) {
		if (*ribs[id].name == '\0')
			break;
	}

	if (id == RIB_FAILED)
		fatalx("rib_new: trying to use reserved id");

	if (id >= rib_size) {
		newsize = sizeof(struct rib) * (id + 1);
		if ((xribs = realloc(ribs, newsize)) == NULL) {
			/* XXX this is not clever */
			fatal("rib_new");
		}
		ribs = xribs;
		rib_size = id + 1;
	}

	bzero(&ribs[id], sizeof(struct rib));
	strlcpy(ribs[id].name, name, sizeof(ribs[id].name));
	RB_INIT(&ribs[id].rib);
	ribs[id].state = RECONF_REINIT;
	ribs[id].id = id;
	ribs[id].flags = flags;
	ribs[id].rtableid = rtableid;

	return (id);
}
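
/*
 * Usage sketch (editor's illustration, not part of the original source):
 * rib_new() hands out the first free slot and rib_find() maps the name
 * back to that id.  The name and rtableid below are invented.
 */
#if 0
	u_int16_t id;

	id = rib_new("Loc-RIB-example", 0, F_RIB_NOFIB);
	if (rib_find("Loc-RIB-example") != id)
		fatalx("example: rib id mismatch");
#endif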

u_int16_t
rib_find(char *name)
{
	u_int16_t id;

	if (name == NULL || *name == '\0')
		return (1);	/* XXX */

	for (id = 0; id < rib_size; id++) {
		if (!strcmp(ribs[id].name, name))
			return (id);
	}

	return (RIB_FAILED);
}

void
rib_free(struct rib *rib)
{
	struct rib_context *ctx, *next;
	struct rib_entry *re, *xre;
	struct prefix *p, *np;

	for (ctx = LIST_FIRST(&rib_dump_h); ctx != NULL; ctx = next) {
		next = LIST_NEXT(ctx, entry);
		if (ctx->ctx_rib == rib) {
			re = ctx->ctx_re;
			re->flags &= ~F_RIB_ENTRYLOCK;
			LIST_REMOVE(ctx, entry);
			if (ctx->ctx_done)
				ctx->ctx_done(ctx->ctx_arg);
			else
				free(ctx);
		}
	}

	for (re = RB_MIN(rib_tree, &rib->rib); re != NULL; re = xre) {
		xre = RB_NEXT(rib_tree, &rib->rib, re);

		/*
		 * Removing the prefixes is tricky because the last one
		 * will remove the rib_entry as well and because we do
		 * an empty check in prefix_destroy() it is not possible
		 * to use the default for loop.
		 */
		while ((p = LIST_FIRST(&re->prefix_h))) {
			np = LIST_NEXT(p, rib_l);
			if (p->aspath->pftableid) {
				struct bgpd_addr addr;

				pt_getaddr(p->prefix, &addr);
				/* Commit is done in peer_down() */
				rde_send_pftable(p->aspath->pftableid, &addr,
				    p->prefix->prefixlen, 1);
			}
			prefix_destroy(p);
			if (np == NULL)
				break;
		}
	}
	bzero(rib, sizeof(struct rib));
}

int
rib_compare(const struct rib_entry *a, const struct rib_entry *b)
{
	return (pt_prefix_cmp(a->prefix, b->prefix));
}

struct rib_entry *
rib_get(struct rib *rib, struct bgpd_addr *prefix, int prefixlen)
{
	struct rib_entry xre;
	struct pt_entry	*pte;

	pte = pt_fill(prefix, prefixlen);
	bzero(&xre, sizeof(xre));
	xre.prefix = pte;

	return (RB_FIND(rib_tree, &rib->rib, &xre));
}

struct rib_entry *
rib_lookup(struct rib *rib, struct bgpd_addr *addr)
{
	struct rib_entry *re;
	int		 i;

	switch (addr->aid) {
	case AID_INET:
	case AID_VPN_IPv4:
		for (i = 32; i >= 0; i--) {
			re = rib_get(rib, addr, i);
			if (re != NULL)
				return (re);
		}
		break;
	case AID_INET6:
		for (i = 128; i >= 0; i--) {
			re = rib_get(rib, addr, i);
			if (re != NULL)
				return (re);
		}
		break;
	default:
		fatalx("rib_lookup: unknown af");
	}
	return (NULL);
}
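
/*
 * Usage sketch (editor's illustration, not part of the original source):
 * rib_lookup() performs a longest-prefix match by probing rib_get()
 * from the most specific prefixlen downwards.  The address is invented
 * and ribs[1] is assumed to be a valid RIB here.
 */
#if 0
	struct bgpd_addr addr;
	struct rib_entry *re;

	bzero(&addr, sizeof(addr));
	addr.aid = AID_INET;
	addr.v4.s_addr = htonl(0xc0000201);	/* 192.0.2.1 */
	re = rib_lookup(&ribs[1], &addr);	/* e.g. matches 192.0.2.0/24 */
#endif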


struct rib_entry *
rib_add(struct rib *rib, struct bgpd_addr *prefix, int prefixlen)
{
	struct pt_entry	*pte;
	struct rib_entry *re;

	pte = pt_get(prefix, prefixlen);
	if (pte == NULL)
		pte = pt_add(prefix, prefixlen);

	if ((re = calloc(1, sizeof(*re))) == NULL)
		fatal("rib_add");

	LIST_INIT(&re->prefix_h);
	re->prefix = pte;
	re->flags = rib->flags;
	re->ribid = rib->id;

	if (RB_INSERT(rib_tree, &rib->rib, re) != NULL) {
		log_warnx("rib_add: insert failed");
		free(re);
		return (NULL);
	}

	pt_ref(pte);

	rdemem.rib_cnt++;

	return (re);
}

void
rib_remove(struct rib_entry *re)
{
	if (!rib_empty(re))
		fatalx("rib_remove: entry not empty");

	if (re->flags & F_RIB_ENTRYLOCK)
		/* entry is locked, don't free it. */
		return;

	pt_unref(re->prefix);
	if (pt_empty(re->prefix))
		pt_remove(re->prefix);

	if (RB_REMOVE(rib_tree, &ribs[re->ribid].rib, re) == NULL)
		log_warnx("rib_remove: remove failed.");

	free(re);
	rdemem.rib_cnt--;
}

int
rib_empty(struct rib_entry *re)
{
	return LIST_EMPTY(&re->prefix_h);
}

void
rib_dump(struct rib *rib, void (*upcall)(struct rib_entry *, void *),
    void *arg, u_int8_t aid)
{
	struct rib_context	*ctx;

	if ((ctx = calloc(1, sizeof(*ctx))) == NULL)
		fatal("rib_dump");
	ctx->ctx_rib = rib;
	ctx->ctx_upcall = upcall;
	ctx->ctx_arg = arg;
	ctx->ctx_aid = aid;
	rib_dump_r(ctx);
}

void
rib_dump_r(struct rib_context *ctx)
{
	struct rib_entry	*re;
	unsigned int		 i;

	if (ctx->ctx_re == NULL) {
		re = RB_MIN(rib_tree, &ctx->ctx_rib->rib);
		LIST_INSERT_HEAD(&rib_dump_h, ctx, entry);
	} else
		re = rib_restart(ctx);

	for (i = 0; re != NULL; re = RB_NEXT(rib_tree, unused, re)) {
		if (ctx->ctx_aid != AID_UNSPEC &&
		    ctx->ctx_aid != re->prefix->aid)
			continue;
		if (ctx->ctx_count && i++ >= ctx->ctx_count &&
		    (re->flags & F_RIB_ENTRYLOCK) == 0) {
			/* store and lock last element */
			ctx->ctx_re = re;
			re->flags |= F_RIB_ENTRYLOCK;
			return;
		}
		ctx->ctx_upcall(re, ctx->ctx_arg);
	}

	LIST_REMOVE(ctx, entry);
	if (ctx->ctx_done)
		ctx->ctx_done(ctx->ctx_arg);
	else
		free(ctx);
}
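
/*
 * Usage sketch (editor's illustration, not part of the original source):
 * a minimal upcall for rib_dump().  Both the callback and its argument
 * are invented.  With ctx_count == 0, as rib_dump() sets it up, the
 * walk completes in one pass; a context with a nonzero count parks
 * itself on rib_dump_h and is resumed later by rib_dump_runner().
 */
#if 0
static void
rib_count_upcall(struct rib_entry *re, void *arg)
{
	u_int32_t	*cnt = arg;

	(*cnt)++;
}

	/* somewhere in the RDE: */
	u_int32_t cnt = 0;

	rib_dump(&ribs[1], rib_count_upcall, &cnt, AID_UNSPEC);
#endif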

struct rib_entry *
rib_restart(struct rib_context *ctx)
{
	struct rib_entry *re;

	re = ctx->ctx_re;
	re->flags &= ~F_RIB_ENTRYLOCK;

	/* find the first non-empty element */
	while (re && rib_empty(re))
		re = RB_NEXT(rib_tree, unused, re);

	/* free the previously locked rib element if empty */
	if (rib_empty(ctx->ctx_re))
		rib_remove(ctx->ctx_re);
	ctx->ctx_re = NULL;
	return (re);
}

void
rib_dump_runner(void)
{
	struct rib_context	*ctx, *next;

	for (ctx = LIST_FIRST(&rib_dump_h); ctx != NULL; ctx = next) {
		next = LIST_NEXT(ctx, entry);
		rib_dump_r(ctx);
	}
}

int
rib_dump_pending(void)
{
	return (!LIST_EMPTY(&rib_dump_h));
}

/* used to bump correct prefix counters */
#define PREFIX_COUNT(x, op)			\
	do {					\
		(x)->prefix_cnt += (op);	\
	} while (0)

/* path specific functions */

static void	path_link(struct rde_aspath *, struct rde_peer *);

struct path_table pathtable;

/* XXX the hash should also include communities and the other attrs */
#define PATH_HASH(x)				\
	&pathtable.path_hashtbl[hash32_buf((x)->data, (x)->len, HASHINIT) & \
	    pathtable.path_hashmask]

void
path_init(u_int32_t hashsize)
{
	u_int32_t	hs, i;

	for (hs = 1; hs < hashsize; hs <<= 1)
		;
	pathtable.path_hashtbl = calloc(hs, sizeof(struct aspath_head));
	if (pathtable.path_hashtbl == NULL)
		fatal("path_init");

	for (i = 0; i < hs; i++)
		LIST_INIT(&pathtable.path_hashtbl[i]);

	pathtable.path_hashmask = hs - 1;
}
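
/*
 * Editor's note (not part of the original source): the table size is
 * rounded up to a power of two so the hash value can be reduced with
 * the bitmask in PATH_HASH() instead of a modulo, e.g. hashsize 1000
 * yields hs 1024 and path_hashmask 1023.
 */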

void
path_shutdown(void)
{
	u_int32_t	i;

	for (i = 0; i <= pathtable.path_hashmask; i++)
		if (!LIST_EMPTY(&pathtable.path_hashtbl[i]))
			log_warnx("path_shutdown: free non-free table");

	free(pathtable.path_hashtbl);
}

int
path_update(struct rib *rib, struct rde_peer *peer, struct rde_aspath *nasp,
    struct bgpd_addr *prefix, int prefixlen)
{
	struct rde_aspath	*asp;
	struct prefix		*p;

	if (nasp->pftableid) {
		rde_send_pftable(nasp->pftableid, prefix, prefixlen, 0);
		rde_send_pftable_commit();
	}

	/*
	 * First try to find a prefix in the specified RIB.
	 */
	if ((p = prefix_get(rib, peer, prefix, prefixlen, 0)) != NULL) {
		if (path_compare(nasp, p->aspath) == 0) {
			/* no change, update last change */
			p->lastchange = time(NULL);
			return (0);
		}
	}

	/*
	 * Either the prefix does not exist or the path changed.
	 * In both cases lookup the new aspath to make sure it is not
	 * already in the RIB.
	 */
	if ((asp = path_lookup(nasp, peer)) == NULL) {
		/* Path not available, create and link a new one. */
		asp = path_copy(nasp);
		path_link(asp, peer);
	}

	/* If the prefix was found move it else add it to the aspath. */
	if (p != NULL)
		prefix_move(asp, p);
	else
		return (prefix_add(rib, asp, prefix, prefixlen));
	return (0);
}

int
path_compare(struct rde_aspath *a, struct rde_aspath *b)
{
	int		 r;

	if (a->origin > b->origin)
		return (1);
	if (a->origin < b->origin)
		return (-1);
	if ((a->flags & ~F_ATTR_LINKED) > (b->flags & ~F_ATTR_LINKED))
		return (1);
	if ((a->flags & ~F_ATTR_LINKED) < (b->flags & ~F_ATTR_LINKED))
		return (-1);
	if (a->med > b->med)
		return (1);
	if (a->med < b->med)
		return (-1);
	if (a->lpref > b->lpref)
		return (1);
	if (a->lpref < b->lpref)
		return (-1);
	if (a->weight > b->weight)
		return (1);
	if (a->weight < b->weight)
		return (-1);
	if (a->rtlabelid > b->rtlabelid)
		return (1);
	if (a->rtlabelid < b->rtlabelid)
		return (-1);
	if (a->pftableid > b->pftableid)
		return (1);
	if (a->pftableid < b->pftableid)
		return (-1);

	r = aspath_compare(a->aspath, b->aspath);
	if (r == 0)
		r = nexthop_compare(a->nexthop, b->nexthop);
	if (r > 0)
		return (1);
	if (r < 0)
		return (-1);

	return (attr_compare(a, b));
}
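
/*
 * Editor's note (not part of the original source): F_ATTR_LINKED is
 * masked out of the flags comparison above so that an unlinked copy
 * made by path_copy() still compares equal to its linked original;
 * this is what lets path_update() detect an unchanged path.
 */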

struct rde_aspath *
path_lookup(struct rde_aspath *aspath, struct rde_peer *peer)
{
	struct aspath_head	*head;
	struct rde_aspath	*asp;

	head = PATH_HASH(aspath->aspath);

	LIST_FOREACH(asp, head, path_l) {
		if (peer == asp->peer && path_compare(aspath, asp) == 0)
			return (asp);
	}
	return (NULL);
}

void
path_remove(struct rde_aspath *asp)
{
	struct prefix	*p, *np;

	for (p = LIST_FIRST(&asp->prefix_h); p != NULL; p = np) {
		np = LIST_NEXT(p, path_l);
		if (asp->pftableid) {
			struct bgpd_addr addr;

			pt_getaddr(p->prefix, &addr);
			/* Commit is done in peer_down() */
			rde_send_pftable(p->aspath->pftableid, &addr,
			    p->prefix->prefixlen, 1);
		}
		prefix_destroy(p);
	}
}

/* this function is only called by prefix_remove and path_remove */
void
path_destroy(struct rde_aspath *asp)
{
	/* path_destroy can only unlink and free empty rde_aspath */
	if (asp->prefix_cnt != 0 || asp->active_cnt != 0)
		log_warnx("path_destroy: prefix count out of sync");

	nexthop_unlink(asp);
	LIST_REMOVE(asp, path_l);
	LIST_REMOVE(asp, peer_l);
	asp->peer = NULL;
	asp->nexthop = NULL;
	asp->flags &= ~F_ATTR_LINKED;

	path_put(asp);
}

int
path_empty(struct rde_aspath *asp)
{
	return LIST_EMPTY(&asp->prefix_h);
}

/*
 * The path object is linked into multiple lists for fast access.
 * These are peer_l, path_l and nexthop_l.
 * peer_l: list of all aspaths that belong to that peer
 * path_l: hash list to find paths quickly
 * nexthop_l: list of all aspaths with an equal exit nexthop
 */
static void
path_link(struct rde_aspath *asp, struct rde_peer *peer)
{
	struct aspath_head	*head;

	head = PATH_HASH(asp->aspath);

	LIST_INSERT_HEAD(head, asp, path_l);
	LIST_INSERT_HEAD(&peer->path_h, asp, peer_l);
	asp->peer = peer;
	nexthop_link(asp);
	asp->flags |= F_ATTR_LINKED;
}

/*
 * Copy asp to a new UNLINKED aspath, mainly for filtering.
 */
struct rde_aspath *
path_copy(struct rde_aspath *asp)
{
	struct rde_aspath *nasp;

	nasp = path_get();
	nasp->aspath = asp->aspath;
	if (nasp->aspath != NULL) {
		nasp->aspath->refcnt++;
		rdemem.aspath_refs++;
	}
	nasp->nexthop = asp->nexthop;
	nasp->med = asp->med;
	nasp->lpref = asp->lpref;
	nasp->weight = asp->weight;
	nasp->origin = asp->origin;
	nasp->rtlabelid = asp->rtlabelid;
	rtlabel_ref(nasp->rtlabelid);
	nasp->pftableid = asp->pftableid;
	pftable_ref(nasp->pftableid);

	nasp->flags = asp->flags & ~F_ATTR_LINKED;
	attr_copy(nasp, asp);

	return (nasp);
}
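
/*
 * Editor's note (not part of the original source): the copy takes
 * references on the shared aspath, rtlabel and pftable but only points
 * at the nexthop; the copy is entered on the nexthop's list later,
 * when it is linked via path_link().
 */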

/* alloc and initialize new entry. May not fail. */
struct rde_aspath *
path_get(void)
{
	struct rde_aspath *asp;

	asp = calloc(1, sizeof(*asp));
	if (asp == NULL)
		fatal("path_get");
	rdemem.path_cnt++;

	LIST_INIT(&asp->prefix_h);
	asp->origin = ORIGIN_INCOMPLETE;
	asp->lpref = DEFAULT_LPREF;
	/* med = 0 */
	/* weight = 0 */
	/* rtlabel = 0 */

	return (asp);
}

/* free an unlinked element */
void
path_put(struct rde_aspath *asp)
{
	if (asp == NULL)
		return;

	if (asp->flags & F_ATTR_LINKED)
		fatalx("path_put: linked object");

	rtlabel_unref(asp->rtlabelid);
	pftable_unref(asp->pftableid);
	aspath_put(asp->aspath);
	attr_freeall(asp);
	rdemem.path_cnt--;
	free(asp);
}

/* prefix specific functions */

static struct prefix	*prefix_alloc(void);
static void		 prefix_free(struct prefix *);
static void		 prefix_link(struct prefix *, struct rib_entry *,
			     struct rde_aspath *);
static void		 prefix_unlink(struct prefix *);

int
prefix_compare(const struct bgpd_addr *a, const struct bgpd_addr *b,
    int prefixlen)
{
	in_addr_t	mask, aa, ba;
	int		i;
	u_int8_t	m;

	if (a->aid != b->aid)
		return (a->aid - b->aid);

	switch (a->aid) {
	case AID_INET:
		if (prefixlen > 32)
			fatalx("prefix_compare: bad IPv4 prefixlen");
		mask = htonl(prefixlen2mask(prefixlen));
		aa = ntohl(a->v4.s_addr & mask);
		ba = ntohl(b->v4.s_addr & mask);
		if (aa != ba)
			return (aa - ba);
		return (0);
	case AID_INET6:
		if (prefixlen > 128)
			fatalx("prefix_compare: bad IPv6 prefixlen");
		for (i = 0; i < prefixlen / 8; i++)
			if (a->v6.s6_addr[i] != b->v6.s6_addr[i])
				return (a->v6.s6_addr[i] - b->v6.s6_addr[i]);
		i = prefixlen % 8;
		if (i) {
			/* truncated to u_int8_t this masks the top i bits */
			m = 0xff00 >> i;
			if ((a->v6.s6_addr[prefixlen / 8] & m) !=
			    (b->v6.s6_addr[prefixlen / 8] & m))
				return ((a->v6.s6_addr[prefixlen / 8] & m) -
				    (b->v6.s6_addr[prefixlen / 8] & m));
		}
		return (0);
	case AID_VPN_IPv4:
		if (prefixlen > 32)
			fatalx("prefix_compare: bad IPv4 VPN prefixlen");
		if (betoh64(a->vpn4.rd) > betoh64(b->vpn4.rd))
			return (1);
		if (betoh64(a->vpn4.rd) < betoh64(b->vpn4.rd))
			return (-1);
		mask = htonl(prefixlen2mask(prefixlen));
		aa = ntohl(a->vpn4.addr.s_addr & mask);
		ba = ntohl(b->vpn4.addr.s_addr & mask);
		if (aa != ba)
			return (aa - ba);
		if (a->vpn4.labellen > b->vpn4.labellen)
			return (1);
		if (a->vpn4.labellen < b->vpn4.labellen)
			return (-1);
		return (memcmp(a->vpn4.labelstack, b->vpn4.labelstack,
		    a->vpn4.labellen));
	default:
		fatalx("prefix_compare: unknown af");
	}
	return (-1);
}
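
/*
 * Usage sketch (editor's illustration, not part of the original source):
 * only the bits covered by prefixlen take part in the comparison, so
 * two addresses that differ beyond the prefix length compare equal.
 */
#if 0
	struct bgpd_addr a, b;

	bzero(&a, sizeof(a));
	bzero(&b, sizeof(b));
	a.aid = b.aid = AID_INET;
	a.v4.s_addr = htonl(0x0a000001);	/* 10.0.0.1 */
	b.v4.s_addr = htonl(0x0a000002);	/* 10.0.0.2 */
	/* prefix_compare(&a, &b, 24) == 0; prefix_compare(&a, &b, 32) != 0 */
#endif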

/*
 * Search for the specified prefix of a peer. Returns NULL if not found.
 */
struct prefix *
prefix_get(struct rib *rib, struct rde_peer *peer, struct bgpd_addr *prefix,
    int prefixlen, u_int32_t flags)
{
	struct rib_entry	*re;

	re = rib_get(rib, prefix, prefixlen);
	if (re == NULL)
		return (NULL);
	return (prefix_bypeer(re, peer, flags));
}

/*
 * Adds or updates a prefix.
 */
int
prefix_add(struct rib *rib, struct rde_aspath *asp, struct bgpd_addr *prefix,
    int prefixlen)
{
	struct prefix		*p;
	struct rib_entry	*re;

	re = rib_get(rib, prefix, prefixlen);
	if (re == NULL)
		re = rib_add(rib, prefix, prefixlen);

	p = prefix_bypeer(re, asp->peer, asp->flags);
	if (p == NULL) {
		p = prefix_alloc();
		prefix_link(p, re, asp);
		return (1);
	} else {
		if (p->aspath != asp) {
			/* prefix belongs to a different aspath so move */
			prefix_move(asp, p);
		} else
			p->lastchange = time(NULL);
		return (0);
	}
}

/*
 * Move the prefix to the specified aspath, removing the old aspath if it
 * becomes empty.
 */
void
prefix_move(struct rde_aspath *asp, struct prefix *p)
{
	struct prefix		*np;
	struct rde_aspath	*oasp;

	if (asp->peer != p->aspath->peer)
		fatalx("prefix_move: cross peer move");

	/* create new prefix node */
	np = prefix_alloc();
	np->aspath = asp;
	/* peer and prefix pointers are still equal */
	np->prefix = p->prefix;
	np->rib = p->rib;
	np->lastchange = time(NULL);

	/* add to new as path */
	LIST_INSERT_HEAD(&asp->prefix_h, np, path_l);
	PREFIX_COUNT(asp, 1);
	/*
	 * no need to update the peer prefix count because we are only moving
	 * the prefix without changing the peer.
	 */

	/*
	 * First kick the old prefix node out of the prefix list,
	 * afterwards run the route decision for the new prefix node.
	 * Because of this only one update is generated if the prefix
	 * was active.
	 * This is safe because we create a new prefix and so the change
	 * is noticed by prefix_evaluate().
	 */
	LIST_REMOVE(p, rib_l);
	prefix_evaluate(np, np->rib);

	/* remove old prefix node */
	oasp = p->aspath;
	LIST_REMOVE(p, path_l);
	PREFIX_COUNT(oasp, -1);
	/* as before, the peer count needs no update because of the move */

	/* destroy all references to other objects and free the old prefix */
	p->aspath = NULL;
	p->prefix = NULL;
	p->rib = NULL;
	prefix_free(p);

	/* destroy old path if empty */
	if (path_empty(oasp))
		path_destroy(oasp);
}

/*
 * Removes a prefix from all lists. If the parent objects -- path or
 * pt_entry -- become empty remove them too.
 */
int
prefix_remove(struct rib *rib, struct rde_peer *peer, struct bgpd_addr *prefix,
    int prefixlen, u_int32_t flags)
{
	struct prefix		*p;
	struct rib_entry	*re;
	struct rde_aspath	*asp;

	re = rib_get(rib, prefix, prefixlen);
	if (re == NULL)		/* Got a dummy withdraw request. */
		return (0);

	p = prefix_bypeer(re, peer, flags);
	if (p == NULL)		/* Got a dummy withdraw request. */
		return (0);

	asp = p->aspath;

	if (asp->pftableid) {
		/* only prefixes in the local RIB were pushed into pf */
		rde_send_pftable(asp->pftableid, prefix, prefixlen, 1);
		rde_send_pftable_commit();
	}

	prefix_destroy(p);

	return (1);
}

/* dump a prefix into the specified buffer */
int
prefix_write(u_char *buf, int len, struct bgpd_addr *prefix, u_int8_t plen)
{
	int	totlen, psize;

	switch (prefix->aid) {
	case AID_INET:
	case AID_INET6:
		totlen = PREFIX_SIZE(plen);

		if (totlen > len)
			return (-1);
		*buf++ = plen;
		memcpy(buf, &prefix->ba, totlen - 1);
		return (totlen);
	case AID_VPN_IPv4:
		totlen = PREFIX_SIZE(plen) + sizeof(prefix->vpn4.rd) +
		    prefix->vpn4.labellen;
		/* remember the address byte count before plen is adjusted */
		psize = PREFIX_SIZE(plen) - 1;
		plen += (sizeof(prefix->vpn4.rd) + prefix->vpn4.labellen) * 8;

		if (totlen > len)
			return (-1);
		*buf++ = plen;
		memcpy(buf, &prefix->vpn4.labelstack, prefix->vpn4.labellen);
		buf += prefix->vpn4.labellen;
		memcpy(buf, &prefix->vpn4.rd, sizeof(prefix->vpn4.rd));
		buf += sizeof(prefix->vpn4.rd);
		memcpy(buf, &prefix->vpn4.addr, psize);
		return (totlen);
	default:
		return (-1);
	}
}
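
/*
 * Editor's note (not part of the original source): for AID_INET this
 * produces the usual BGP NLRI encoding, one prefixlen octet followed
 * by the minimum number of address octets; 192.0.2.0/24 becomes the
 * four octets 0x18 0xc0 0x00 0x02 (PREFIX_SIZE(24) == 4).
 */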

/*
 * Searches the prefix list of the specified rib_entry for a prefix entry
 * belonging to the given peer. Returns NULL if no match is found.
 */
struct prefix *
prefix_bypeer(struct rib_entry *re, struct rde_peer *peer, u_int32_t flags)
{
	struct prefix	*p;

	LIST_FOREACH(p, &re->prefix_h, rib_l) {
		if (p->aspath->peer != peer)
			continue;
		if (p->aspath->flags & flags &&
		    (flags & F_ANN_DYNAMIC) !=
		    (p->aspath->flags & F_ANN_DYNAMIC))
			continue;
		return (p);
	}
	return (NULL);
}

void
prefix_updateall(struct rde_aspath *asp, enum nexthop_state state,
    enum nexthop_state oldstate)
{
	struct prefix	*p;

	LIST_FOREACH(p, &asp->prefix_h, path_l) {
		/*
		 * Skip non-local RIBs and RIBs that are flagged as noeval.
		 */
		if (p->rib->flags & F_RIB_NOEVALUATE)
			continue;

		if (oldstate == state && state == NEXTHOP_REACH) {
			/*
			 * The state of the nexthop did not change. The only
			 * thing that may have changed is the true_nexthop
			 * or other internal info. This will not change
			 * the routing decision so shortcut here.
			 */
			if ((p->rib->flags & F_RIB_NOFIB) == 0 &&
			    p == p->rib->active)
				rde_send_kroute(p, NULL, p->rib->ribid);
			continue;
		}

		/* redo the route decision */
		LIST_REMOVE(p, rib_l);
		/*
		 * If the prefix is the active one remove it first,
		 * this has to be done because we can not detect when
		 * the active prefix changes its state. In this case
		 * we know that this is a withdrawal and so the second
		 * prefix_evaluate() will generate no update because
		 * the nexthop is unreachable or ineligible.
		 */
		if (p == p->rib->active)
			prefix_evaluate(NULL, p->rib);
		prefix_evaluate(p, p->rib);
	}
}

/* kill a prefix. */
void
prefix_destroy(struct prefix *p)
{
	struct rde_aspath	*asp;

	asp = p->aspath;
	prefix_unlink(p);
	prefix_free(p);

	if (path_empty(asp))
		path_destroy(asp);
}

/*
 * helper function to clean up the connected networks after a reload
 */
void
prefix_network_clean(struct rde_peer *peer, time_t reloadtime, u_int32_t flags)
{
	struct rde_aspath	*asp, *xasp;
	struct prefix		*p, *xp;

	for (asp = LIST_FIRST(&peer->path_h); asp != NULL; asp = xasp) {
		xasp = LIST_NEXT(asp, peer_l);
		if ((asp->flags & F_ANN_DYNAMIC) == flags)
			continue;
		for (p = LIST_FIRST(&asp->prefix_h); p != NULL; p = xp) {
			xp = LIST_NEXT(p, path_l);
			if (reloadtime > p->lastchange) {
				prefix_unlink(p);
				prefix_free(p);
			}
		}
		if (path_empty(asp))
			path_destroy(asp);
	}
}

/*
 * Link a prefix into the different parent objects.
 */
static void
prefix_link(struct prefix *pref, struct rib_entry *re, struct rde_aspath *asp)
{
	LIST_INSERT_HEAD(&asp->prefix_h, pref, path_l);
	PREFIX_COUNT(asp, 1);

	pref->aspath = asp;
	pref->rib = re;
	pref->prefix = re->prefix;
	pt_ref(pref->prefix);
	pref->lastchange = time(NULL);

	/* make route decision */
	prefix_evaluate(pref, re);
}

/*
 * Unlink a prefix from the different parent objects.
 */
static void
prefix_unlink(struct prefix *pref)
{
	struct rib_entry	*re = pref->rib;

	/* make route decision */
	LIST_REMOVE(pref, rib_l);
	prefix_evaluate(NULL, re);

	LIST_REMOVE(pref, path_l);
	PREFIX_COUNT(pref->aspath, -1);

	pt_unref(pref->prefix);
	if (pt_empty(pref->prefix))
		pt_remove(pref->prefix);
	if (rib_empty(re))
		rib_remove(re);

	/* destroy all references to other objects */
	pref->aspath = NULL;
	pref->prefix = NULL;
	pref->rib = NULL;

	/*
	 * It's the caller's duty to remove empty aspath structures.
	 * Also freeing the unlinked prefix is the caller's duty.
	 */
}

/* alloc and bzero new entry. May not fail. */
static struct prefix *
prefix_alloc(void)
{
	struct prefix *p;

	p = calloc(1, sizeof(*p));
	if (p == NULL)
		fatal("prefix_alloc");
	rdemem.prefix_cnt++;
	return (p);
}

/* free an unlinked entry */
static void
prefix_free(struct prefix *pref)
{
	rdemem.prefix_cnt--;
	free(pref);
}

/*
 * nexthop functions
 */
struct nexthop_head	*nexthop_hash(struct bgpd_addr *);
struct nexthop		*nexthop_lookup(struct bgpd_addr *);

/*
 * In BGP there exist two nexthops: the exit nexthop which was announced
 * via BGP and the true nexthop which is used in the FIB -- the forwarding
 * information base, a.k.a. the kernel routing table. When sending updates
 * it is even more confusing. In IBGP we pass the unmodified exit nexthop
 * to the neighbors while in EBGP normally the address of the router is
 * sent. The exit nexthop may be passed to the external neighbor if the
 * neighbor and the exit nexthop reside in the same subnet -- directly
 * connected.
 */
struct nexthop_table {
	LIST_HEAD(nexthop_head, nexthop)	*nexthop_hashtbl;
	u_int32_t				 nexthop_hashmask;
} nexthoptable;
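
/*
 * Editor's illustration (not part of the original source): a route
 * announced with exit nexthop 203.0.113.7 that the kernel reaches via
 * gateway 192.0.2.1 ends up with true_nexthop 192.0.2.1; if the exit
 * nexthop is directly connected, true_nexthop equals the exit nexthop
 * (see nexthop_update() below).  Addresses are invented.
 */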

void
nexthop_init(u_int32_t hashsize)
{
	u_int32_t	 hs, i;

	for (hs = 1; hs < hashsize; hs <<= 1)
		;
	nexthoptable.nexthop_hashtbl = calloc(hs, sizeof(struct nexthop_head));
	if (nexthoptable.nexthop_hashtbl == NULL)
		fatal("nexthop_init");

	for (i = 0; i < hs; i++)
		LIST_INIT(&nexthoptable.nexthop_hashtbl[i]);

	nexthoptable.nexthop_hashmask = hs - 1;
}

void
nexthop_shutdown(void)
{
	u_int32_t		 i;
	struct nexthop		*nh, *nnh;

	for (i = 0; i <= nexthoptable.nexthop_hashmask; i++) {
		for (nh = LIST_FIRST(&nexthoptable.nexthop_hashtbl[i]);
		    nh != NULL; nh = nnh) {
			nnh = LIST_NEXT(nh, nexthop_l);
			nh->state = NEXTHOP_UNREACH;
			(void)nexthop_delete(nh);
		}
		if (!LIST_EMPTY(&nexthoptable.nexthop_hashtbl[i]))
			log_warnx("nexthop_shutdown: non-free table");
	}

	free(nexthoptable.nexthop_hashtbl);
}

void
nexthop_update(struct kroute_nexthop *msg)
{
	struct nexthop		*nh;
	struct rde_aspath	*asp;
	enum nexthop_state	 oldstate;

	nh = nexthop_lookup(&msg->nexthop);
	if (nh == NULL) {
		log_warnx("nexthop_update: non-existent nexthop %s",
		    log_addr(&msg->nexthop));
		return;
	}

	oldstate = nh->state;
	if (msg->valid)
		nh->state = NEXTHOP_REACH;
	else
		nh->state = NEXTHOP_UNREACH;

	if (msg->connected) {
		nh->flags |= NEXTHOP_CONNECTED;
		memcpy(&nh->true_nexthop, &nh->exit_nexthop,
		    sizeof(nh->true_nexthop));
	} else
		memcpy(&nh->true_nexthop, &msg->gateway,
		    sizeof(nh->true_nexthop));

	memcpy(&nh->nexthop_net, &msg->net,
	    sizeof(nh->nexthop_net));
	nh->nexthop_netlen = msg->netlen;

	if (nexthop_delete(nh))
		/* nexthop no longer used */
		return;

	if (rde_noevaluate())
		/*
		 * if the decision process is turned off there is no need
		 * for the aspath list walk.
		 */
		return;

	LIST_FOREACH(asp, &nh->path_h, nexthop_l) {
		prefix_updateall(asp, nh->state, oldstate);
	}
}

void
nexthop_modify(struct rde_aspath *asp, struct bgpd_addr *nexthop,
    enum action_types type, u_int8_t aid)
{
	struct nexthop	*nh;

	if (type == ACTION_SET_NEXTHOP_REJECT) {
		asp->flags |= F_NEXTHOP_REJECT;
		return;
	}
	if (type == ACTION_SET_NEXTHOP_BLACKHOLE) {
		asp->flags |= F_NEXTHOP_BLACKHOLE;
		return;
	}
	if (type == ACTION_SET_NEXTHOP_NOMODIFY) {
		asp->flags |= F_NEXTHOP_NOMODIFY;
		return;
	}
	if (type == ACTION_SET_NEXTHOP_SELF) {
		asp->flags |= F_NEXTHOP_SELF;
		return;
	}
	if (aid != nexthop->aid)
		return;

	nh = nexthop_get(nexthop);
	if (asp->flags & F_ATTR_LINKED)
		nexthop_unlink(asp);
	asp->nexthop = nh;
	if (asp->flags & F_ATTR_LINKED)
		nexthop_link(asp);
}

void
nexthop_link(struct rde_aspath *asp)
{
	if (asp->nexthop == NULL)
		return;

	LIST_INSERT_HEAD(&asp->nexthop->path_h, asp, nexthop_l);
}

void
nexthop_unlink(struct rde_aspath *asp)
{
	struct nexthop	*nh;

	if (asp->nexthop == NULL)
		return;

	LIST_REMOVE(asp, nexthop_l);

	/* see if list is empty */
	nh = asp->nexthop;
	asp->nexthop = NULL;

	(void)nexthop_delete(nh);
}

int
nexthop_delete(struct nexthop *nh)
{
	/* nexthop still used by some other aspath */
	if (!LIST_EMPTY(&nh->path_h))
		return (0);

	/* either pinned or in a state where it may not be deleted */
	if (nh->refcnt > 0 || nh->state == NEXTHOP_LOOKUP)
		return (0);

	LIST_REMOVE(nh, nexthop_l);
	rde_send_nexthop(&nh->exit_nexthop, 0);

	rdemem.nexthop_cnt--;
	free(nh);
	return (1);
}

struct nexthop *
nexthop_get(struct bgpd_addr *nexthop)
{
	struct nexthop	*nh;

	nh = nexthop_lookup(nexthop);
	if (nh == NULL) {
		nh = calloc(1, sizeof(*nh));
		if (nh == NULL)
			fatal("nexthop_get");
		rdemem.nexthop_cnt++;

		LIST_INIT(&nh->path_h);
		nh->state = NEXTHOP_LOOKUP;
		nh->exit_nexthop = *nexthop;
		LIST_INSERT_HEAD(nexthop_hash(nexthop), nh,
		    nexthop_l);

		rde_send_nexthop(&nh->exit_nexthop, 1);
	}

	return (nh);
}
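
/*
 * Editor's note (not part of the original source): the first
 * nexthop_get() for an address allocates the nexthop in state
 * NEXTHOP_LOOKUP and asks the kernel via rde_send_nexthop(); the
 * verdict arrives asynchronously through nexthop_update().  While in
 * NEXTHOP_LOOKUP the nexthop is protected from nexthop_delete().
 */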

int
nexthop_compare(struct nexthop *na, struct nexthop *nb)
{
	struct bgpd_addr	*a, *b;

	if (na == nb)
		return (0);
	if (na == NULL)
		return (-1);
	if (nb == NULL)
		return (1);

	a = &na->exit_nexthop;
	b = &nb->exit_nexthop;

	if (a->aid != b->aid)
		return (a->aid - b->aid);

	switch (a->aid) {
	case AID_INET:
		if (ntohl(a->v4.s_addr) > ntohl(b->v4.s_addr))
			return (1);
		if (ntohl(a->v4.s_addr) < ntohl(b->v4.s_addr))
			return (-1);
		return (0);
	case AID_INET6:
		return (memcmp(&a->v6, &b->v6, sizeof(struct in6_addr)));
	default:
		fatalx("nexthop_compare: unknown af");
	}
	return (-1);
}

struct nexthop *
nexthop_lookup(struct bgpd_addr *nexthop)
{
	struct nexthop	*nh;

	LIST_FOREACH(nh, nexthop_hash(nexthop), nexthop_l) {
		if (memcmp(&nh->exit_nexthop, nexthop,
		    sizeof(struct bgpd_addr)) == 0)
			return (nh);
	}
	return (NULL);
}

struct nexthop_head *
nexthop_hash(struct bgpd_addr *nexthop)
{
	u_int32_t	 h = 0;

	switch (nexthop->aid) {
	case AID_INET:
		h = (AF_INET ^ ntohl(nexthop->v4.s_addr) ^
		    ntohl(nexthop->v4.s_addr) >> 13) &
		    nexthoptable.nexthop_hashmask;
		break;
	case AID_INET6:
		h = hash32_buf(&nexthop->v6, sizeof(struct in6_addr),
		    HASHINIT) & nexthoptable.nexthop_hashmask;
		break;
	default:
		fatalx("nexthop_hash: unsupported AF");
	}
	return (&nexthoptable.nexthop_hashtbl[h]);
}
1307