/*	$OpenBSD: rde_rib.c,v 1.133 2012/07/01 11:55:13 sthen Exp $ */

/*
 * Copyright (c) 2003, 2004 Claudio Jeker <claudio@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/types.h>
#include <sys/queue.h>
#include <sys/hash.h>

#include <stdlib.h>
#include <string.h>

#include "bgpd.h"
#include "rde.h"

/*
 * BGP RIB -- Routing Information Base
 *
 * The RIB is built with one aspect in mind: speed -- more precisely,
 * update speed.  Therefore one thing needs to be absolutely avoided:
 * long table walks.  This is achieved by heavily linking the different
 * parts together.
 */
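
/*
 * A sketch of that linking, derived from the link/unlink functions
 * below: every prefix sits on two lists at once, the rib_l list of its
 * rib_entry and the path_l list of its rde_aspath; every rde_aspath in
 * turn sits on the path hash, on its peer's path_h list and on its
 * nexthop's path_h list.  Thus all prefixes of a path, all paths of a
 * peer and all paths of a nexthop are reachable in a single list walk,
 * without ever scanning a whole table.
 */
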
u_int16_t rib_size;
struct rib *ribs;

LIST_HEAD(, rib_context) rib_dump_h = LIST_HEAD_INITIALIZER(rib_dump_h);

struct rib_entry *rib_add(struct rib *, struct bgpd_addr *, int);
int rib_compare(const struct rib_entry *, const struct rib_entry *);
void rib_remove(struct rib_entry *);
int rib_empty(struct rib_entry *);
struct rib_entry *rib_restart(struct rib_context *);

RB_PROTOTYPE(rib_tree, rib_entry, rib_e, rib_compare);
RB_GENERATE(rib_tree, rib_entry, rib_e, rib_compare);


/* RIB specific functions */
u_int16_t
rib_new(char *name, u_int rtableid, u_int16_t flags)
{
	struct rib	*xribs;
	size_t		newsize;
	u_int16_t	id;

	for (id = 0; id < rib_size; id++) {
		if (*ribs[id].name == '\0')
			break;
	}

	if (id == RIB_FAILED)
		fatalx("rib_new: trying to use reserved id");

	if (id >= rib_size) {
		newsize = sizeof(struct rib) * (id + 1);
		if ((xribs = realloc(ribs, newsize)) == NULL) {
			/* XXX this is not clever */
			fatal("rib_new");
		}
		ribs = xribs;
		rib_size = id + 1;
	}

	bzero(&ribs[id], sizeof(struct rib));
	strlcpy(ribs[id].name, name, sizeof(ribs[id].name));
	RB_INIT(&ribs[id].rib);
	ribs[id].state = RECONF_REINIT;
	ribs[id].id = id;
	ribs[id].flags = flags;
	ribs[id].rtableid = rtableid;

	return (id);
}

u_int16_t
rib_find(char *name)
{
	u_int16_t id;

	if (name == NULL || *name == '\0')
		return (1);	/* XXX */

	for (id = 0; id < rib_size; id++) {
		if (!strcmp(ribs[id].name, name))
			return (id);
	}

	return (RIB_FAILED);
}
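
/*
 * Example usage (illustrative only; the name, rtableid and flags are
 * made up, not taken from a real config):
 *
 *	u_int16_t id;
 *
 *	id = rib_new("Loc-RIB", 0, 0);
 *	if (rib_find("Loc-RIB") != id)
 *		fatalx("example: RIB vanished");
 *
 * Note the asymmetry: rib_find(NULL) and rib_find("") return 1 (the
 * XXX above), an unknown name returns RIB_FAILED, and rib_new() never
 * hands out RIB_FAILED as an id.
 */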

void
rib_free(struct rib *rib)
{
	struct rib_context *ctx, *next;
	struct rib_entry *re, *xre;
	struct prefix *p, *np;

	for (ctx = LIST_FIRST(&rib_dump_h); ctx != NULL; ctx = next) {
		next = LIST_NEXT(ctx, entry);
		if (ctx->ctx_rib == rib) {
			re = ctx->ctx_re;
			re->flags &= ~F_RIB_ENTRYLOCK;
			LIST_REMOVE(ctx, entry);
			if (ctx->ctx_done)
				ctx->ctx_done(ctx->ctx_arg);
			else
				free(ctx);
		}
	}

	for (re = RB_MIN(rib_tree, &rib->rib); re != NULL; re = xre) {
		xre = RB_NEXT(rib_tree, &rib->rib, re);

		/*
		 * Removing the prefixes is tricky because the last one
		 * will remove the rib_entry as well, and because
		 * prefix_destroy() does an empty check, the default
		 * for loop cannot be used.
		 */
		while ((p = LIST_FIRST(&re->prefix_h))) {
			np = LIST_NEXT(p, rib_l);
			if (p->aspath->pftableid) {
				struct bgpd_addr addr;

				pt_getaddr(p->prefix, &addr);
				/* Commit is done in peer_down() */
				rde_send_pftable(p->aspath->pftableid, &addr,
				    p->prefix->prefixlen, 1);
			}
			prefix_destroy(p);
			if (np == NULL)
				break;
		}
	}
	bzero(rib, sizeof(struct rib));
}

int
rib_compare(const struct rib_entry *a, const struct rib_entry *b)
{
	return (pt_prefix_cmp(a->prefix, b->prefix));
}

struct rib_entry *
rib_get(struct rib *rib, struct bgpd_addr *prefix, int prefixlen)
{
	struct rib_entry xre;
	struct pt_entry	*pte;

	pte = pt_fill(prefix, prefixlen);
	bzero(&xre, sizeof(xre));
	xre.prefix = pte;

	return (RB_FIND(rib_tree, &rib->rib, &xre));
}

struct rib_entry *
rib_lookup(struct rib *rib, struct bgpd_addr *addr)
{
	struct rib_entry *re;
	int		 i;

	switch (addr->aid) {
	case AID_INET:
	case AID_VPN_IPv4:
		for (i = 32; i >= 0; i--) {
			re = rib_get(rib, addr, i);
			if (re != NULL)
				return (re);
		}
		break;
	case AID_INET6:
		for (i = 128; i >= 0; i--) {
			re = rib_get(rib, addr, i);
			if (re != NULL)
				return (re);
		}
		break;
	default:
		fatalx("rib_lookup: unknown af");
	}
	return (NULL);
}
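
/*
 * Worked example for rib_lookup(): assume 10.0.0.0/8 and 10.1.0.0/16
 * are in the RIB.  For the address 10.1.2.3 the loop calls rib_get()
 * with prefixlen 32, 31, ... down to 16, where pt_fill() masks the
 * address to 10.1.0.0/16 and the RB_FIND() hits -- i.e. the most
 * specific covering prefix wins.  This assumes pt_fill() clears the
 * host bits, which is how the prefix table is used throughout the RDE.
 */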


struct rib_entry *
rib_add(struct rib *rib, struct bgpd_addr *prefix, int prefixlen)
{
	struct pt_entry	*pte;
	struct rib_entry *re;

	pte = pt_get(prefix, prefixlen);
	if (pte == NULL)
		pte = pt_add(prefix, prefixlen);

	if ((re = calloc(1, sizeof(*re))) == NULL)
		fatal("rib_add");

	LIST_INIT(&re->prefix_h);
	re->prefix = pte;
	re->flags = rib->flags;
	re->ribid = rib->id;

	if (RB_INSERT(rib_tree, &rib->rib, re) != NULL) {
		log_warnx("rib_add: insert failed");
		free(re);
		return (NULL);
	}

	pt_ref(pte);

	rdemem.rib_cnt++;

	return (re);
}

void
rib_remove(struct rib_entry *re)
{
	if (!rib_empty(re))
		fatalx("rib_remove: entry not empty");

	if (re->flags & F_RIB_ENTRYLOCK)
		/* entry is locked, don't free it. */
		return;

	pt_unref(re->prefix);
	if (pt_empty(re->prefix))
		pt_remove(re->prefix);

	if (RB_REMOVE(rib_tree, &ribs[re->ribid].rib, re) == NULL)
		log_warnx("rib_remove: remove failed.");

	free(re);
	rdemem.rib_cnt--;
}

int
rib_empty(struct rib_entry *re)
{
	return LIST_EMPTY(&re->prefix_h);
}

void
rib_dump(struct rib *rib, void (*upcall)(struct rib_entry *, void *),
    void *arg, u_int8_t aid)
{
	struct rib_context	*ctx;

	if ((ctx = calloc(1, sizeof(*ctx))) == NULL)
		fatal("rib_dump");
	ctx->ctx_rib = rib;
	ctx->ctx_upcall = upcall;
	ctx->ctx_arg = arg;
	ctx->ctx_aid = aid;
	rib_dump_r(ctx);
}

void
rib_dump_r(struct rib_context *ctx)
{
	struct rib_entry	*re;
	unsigned int		 i;

	if (ctx->ctx_re == NULL) {
		re = RB_MIN(rib_tree, &ctx->ctx_rib->rib);
		LIST_INSERT_HEAD(&rib_dump_h, ctx, entry);
	} else
		re = rib_restart(ctx);

	for (i = 0; re != NULL; re = RB_NEXT(rib_tree, unused, re)) {
		if (ctx->ctx_aid != AID_UNSPEC &&
		    ctx->ctx_aid != re->prefix->aid)
			continue;
		if (ctx->ctx_count && i++ >= ctx->ctx_count &&
		    (re->flags & F_RIB_ENTRYLOCK) == 0) {
			/* store and lock last element */
			ctx->ctx_re = re;
			re->flags |= F_RIB_ENTRYLOCK;
			return;
		}
		ctx->ctx_upcall(re, ctx->ctx_arg);
	}

	LIST_REMOVE(ctx, entry);
	if (ctx->ctx_done)
		ctx->ctx_done(ctx->ctx_arg);
	else
		free(ctx);
}
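
/*
 * Sketch of an incremental dump; the rib index, the count and the
 * callback are made up for illustration.  A non-zero ctx_count makes
 * rib_dump_r() return after that many matching entries, leaving its
 * position locked with F_RIB_ENTRYLOCK; rib_dump_runner() later
 * resumes every pending context via rib_restart():
 *
 *	struct rib_context *ctx;
 *
 *	if ((ctx = calloc(1, sizeof(*ctx))) == NULL)
 *		fatal("dump example");
 *	ctx->ctx_rib = &ribs[1];
 *	ctx->ctx_upcall = example_upcall;
 *	ctx->ctx_aid = AID_UNSPEC;
 *	ctx->ctx_count = 100;
 *	rib_dump_r(ctx);
 *	while (rib_dump_pending())
 *		rib_dump_runner();
 */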

struct rib_entry *
rib_restart(struct rib_context *ctx)
{
	struct rib_entry *re;

	re = ctx->ctx_re;
	re->flags &= ~F_RIB_ENTRYLOCK;

	/* find first non-empty element */
	while (re && rib_empty(re))
		re = RB_NEXT(rib_tree, unused, re);

	/* free the previously locked rib element if empty */
	if (rib_empty(ctx->ctx_re))
		rib_remove(ctx->ctx_re);
	ctx->ctx_re = NULL;
	return (re);
}

void
rib_dump_runner(void)
{
	struct rib_context	*ctx, *next;

	for (ctx = LIST_FIRST(&rib_dump_h); ctx != NULL; ctx = next) {
		next = LIST_NEXT(ctx, entry);
		rib_dump_r(ctx);
	}
}

int
rib_dump_pending(void)
{
	return (!LIST_EMPTY(&rib_dump_h));
}

/* used to bump correct prefix counters */
#define PREFIX_COUNT(x, op)			\
	do {					\
		(x)->prefix_cnt += (op);	\
	} while (0)

/* path specific functions */

static void	path_link(struct rde_aspath *, struct rde_peer *);

struct path_table pathtable;

/* XXX the hash should also include communities and the other attrs */
#define PATH_HASH(x)				\
	&pathtable.path_hashtbl[hash32_buf((x)->data, (x)->len, HASHINIT) & \
	    pathtable.path_hashmask]

void
path_init(u_int32_t hashsize)
{
	u_int32_t	hs, i;

	for (hs = 1; hs < hashsize; hs <<= 1)
		;
	pathtable.path_hashtbl = calloc(hs, sizeof(struct aspath_head));
	if (pathtable.path_hashtbl == NULL)
		fatal("path_init");

	for (i = 0; i < hs; i++)
		LIST_INIT(&pathtable.path_hashtbl[i]);

	pathtable.path_hashmask = hs - 1;
}
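
/*
 * The loop above rounds hashsize up to the next power of two, e.g. a
 * requested size of 1000 yields hs = 1024 and path_hashmask = 0x3ff.
 * PATH_HASH() can then pick a bucket with a cheap bitwise AND instead
 * of a modulo operation.
 */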

void
path_shutdown(void)
{
	u_int32_t	i;

	for (i = 0; i <= pathtable.path_hashmask; i++)
		if (!LIST_EMPTY(&pathtable.path_hashtbl[i]))
			log_warnx("path_shutdown: non-free table");

	free(pathtable.path_hashtbl);
}

int
path_update(struct rib *rib, struct rde_peer *peer, struct rde_aspath *nasp,
    struct bgpd_addr *prefix, int prefixlen)
{
	struct rde_aspath	*asp;
	struct prefix		*p;

	if (nasp->pftableid) {
		rde_send_pftable(nasp->pftableid, prefix, prefixlen, 0);
		rde_send_pftable_commit();
	}

	/*
	 * First try to find a prefix in the specified RIB.
	 */
	if ((p = prefix_get(rib, peer, prefix, prefixlen, 0)) != NULL) {
		if (path_compare(nasp, p->aspath) == 0) {
			/* no change, update last change */
			p->lastchange = time(NULL);
			return (0);
		}
	}

	/*
	 * Either the prefix does not exist or the path changed.
	 * In both cases lookup the new aspath to make sure it is not
	 * already in the RIB.
	 */
	if ((asp = path_lookup(nasp, peer)) == NULL) {
		/* Path not available, create and link a new one. */
		asp = path_copy(nasp);
		path_link(asp, peer);
	}

	/* If the prefix was found, move it; otherwise add it to the aspath. */
	if (p != NULL)
		prefix_move(asp, p);
	else
		return (prefix_add(rib, asp, prefix, prefixlen));
	return (0);
}
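
/*
 * In other words, path_update() distinguishes three cases:
 * 1. the peer already announced the prefix with identical attributes:
 *    only lastchange is refreshed;
 * 2. the attributes changed but an identical rde_aspath already exists
 *    (common when many prefixes share one path): prefix_move() relinks
 *    the prefix and no new path object is created;
 * 3. otherwise a fresh copy of nasp is linked in and the prefix is
 *    added to it.
 */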

int
path_compare(struct rde_aspath *a, struct rde_aspath *b)
{
	int		 r;

	if (a->origin > b->origin)
		return (1);
	if (a->origin < b->origin)
		return (-1);
	if ((a->flags & ~F_ATTR_LINKED) > (b->flags & ~F_ATTR_LINKED))
		return (1);
	if ((a->flags & ~F_ATTR_LINKED) < (b->flags & ~F_ATTR_LINKED))
		return (-1);
	if (a->med > b->med)
		return (1);
	if (a->med < b->med)
		return (-1);
	if (a->lpref > b->lpref)
		return (1);
	if (a->lpref < b->lpref)
		return (-1);
	if (a->weight > b->weight)
		return (1);
	if (a->weight < b->weight)
		return (-1);
	if (a->rtlabelid > b->rtlabelid)
		return (1);
	if (a->rtlabelid < b->rtlabelid)
		return (-1);
	if (a->pftableid > b->pftableid)
		return (1);
	if (a->pftableid < b->pftableid)
		return (-1);

	r = aspath_compare(a->aspath, b->aspath);
	if (r == 0)
		r = nexthop_compare(a->nexthop, b->nexthop);
	if (r > 0)
		return (1);
	if (r < 0)
		return (-1);

	return (attr_compare(a, b));
}

struct rde_aspath *
path_lookup(struct rde_aspath *aspath, struct rde_peer *peer)
{
	struct aspath_head	*head;
	struct rde_aspath	*asp;

	head = PATH_HASH(aspath->aspath);

	LIST_FOREACH(asp, head, path_l) {
		if (peer == asp->peer && path_compare(aspath, asp) == 0)
			return (asp);
	}
	return (NULL);
}

void
path_remove(struct rde_aspath *asp)
{
	struct prefix	*p, *np;

	for (p = LIST_FIRST(&asp->prefix_h); p != NULL; p = np) {
		np = LIST_NEXT(p, path_l);
		if (asp->pftableid) {
			struct bgpd_addr addr;

			pt_getaddr(p->prefix, &addr);
			/* Commit is done in peer_down() */
			rde_send_pftable(p->aspath->pftableid, &addr,
			    p->prefix->prefixlen, 1);
		}
		prefix_destroy(p);
	}
}

/*
 * Unlink and free an rde_aspath that holds no more prefixes; called
 * e.g. from prefix_destroy() and prefix_move() once a path lost its
 * last prefix.
 */
void
path_destroy(struct rde_aspath *asp)
{
	/* path_destroy can only unlink and free empty rde_aspath */
	if (asp->prefix_cnt != 0 || asp->active_cnt != 0)
		log_warnx("path_destroy: prefix count out of sync");

	nexthop_unlink(asp);
	LIST_REMOVE(asp, path_l);
	LIST_REMOVE(asp, peer_l);
	asp->peer = NULL;
	asp->nexthop = NULL;
	asp->flags &= ~F_ATTR_LINKED;

	path_put(asp);
}

int
path_empty(struct rde_aspath *asp)
{
	return LIST_EMPTY(&asp->prefix_h);
}

/*
 * The path object is linked into multiple lists for fast access.
 * These are peer_l, path_l and nexthop_l.
 * peer_l: list of all aspaths that belong to that peer
 * path_l: hash list to find paths quickly
 * nexthop_l: list of all aspaths with an equal exit nexthop
 */
static void
path_link(struct rde_aspath *asp, struct rde_peer *peer)
{
	struct aspath_head	*head;

	head = PATH_HASH(asp->aspath);

	LIST_INSERT_HEAD(head, asp, path_l);
	LIST_INSERT_HEAD(&peer->path_h, asp, peer_l);
	asp->peer = peer;
	nexthop_link(asp);
	asp->flags |= F_ATTR_LINKED;
}

/*
 * copy asp to a new UNLINKED one mainly for filtering
 */
struct rde_aspath *
path_copy(struct rde_aspath *asp)
{
	struct rde_aspath *nasp;

	nasp = path_get();
	nasp->aspath = asp->aspath;
	if (nasp->aspath != NULL) {
		nasp->aspath->refcnt++;
		rdemem.aspath_refs++;
	}
	nasp->nexthop = asp->nexthop;
	nasp->med = asp->med;
	nasp->lpref = asp->lpref;
	nasp->weight = asp->weight;
	nasp->origin = asp->origin;
	nasp->rtlabelid = asp->rtlabelid;
	rtlabel_ref(nasp->rtlabelid);
	nasp->pftableid = asp->pftableid;
	pftable_ref(nasp->pftableid);

	nasp->flags = asp->flags & ~F_ATTR_LINKED;
	attr_copy(nasp, asp);

	return (nasp);
}
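
/*
 * The copy is cheap: the aspath is shared by reference (its refcnt is
 * bumped, as are the rtlabel and pftable references) and the nexthop
 * pointer is copied as-is -- the nexthop itself only gets linked once
 * path_link() runs.  path_update() above relies on this: it only
 * materializes a copy via path_copy() plus path_link() when no
 * identical path exists yet.
 */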

/* alloc and initialize new entry. May not fail. */
struct rde_aspath *
path_get(void)
{
	struct rde_aspath *asp;

	asp = calloc(1, sizeof(*asp));
	if (asp == NULL)
		fatal("path_alloc");
	rdemem.path_cnt++;

	LIST_INIT(&asp->prefix_h);
	asp->origin = ORIGIN_INCOMPLETE;
	asp->lpref = DEFAULT_LPREF;
	/* med = 0 */
	/* weight = 0 */
	/* rtlabel = 0 */

	return (asp);
}

/* free an unlinked element */
void
path_put(struct rde_aspath *asp)
{
	if (asp == NULL)
		return;

	if (asp->flags & F_ATTR_LINKED)
		fatalx("path_put: linked object");

	rtlabel_unref(asp->rtlabelid);
	pftable_unref(asp->pftableid);
	aspath_put(asp->aspath);
	attr_freeall(asp);
	rdemem.path_cnt--;
	free(asp);
}

/* prefix specific functions */

static struct prefix	*prefix_alloc(void);
static void		 prefix_free(struct prefix *);
static void		 prefix_link(struct prefix *, struct rib_entry *,
			     struct rde_aspath *);
static void		 prefix_unlink(struct prefix *);

/*
 * search for specified prefix of a peer. Returns NULL if not found.
 */
struct prefix *
prefix_get(struct rib *rib, struct rde_peer *peer, struct bgpd_addr *prefix,
    int prefixlen, u_int32_t flags)
{
	struct rib_entry	*re;

	re = rib_get(rib, prefix, prefixlen);
	if (re == NULL)
		return (NULL);
	return (prefix_bypeer(re, peer, flags));
}

/*
 * Adds or updates a prefix.
 */
int
prefix_add(struct rib *rib, struct rde_aspath *asp, struct bgpd_addr *prefix,
    int prefixlen)
{
	struct prefix		*p;
	struct rib_entry	*re;

	re = rib_get(rib, prefix, prefixlen);
	if (re == NULL)
		re = rib_add(rib, prefix, prefixlen);

	p = prefix_bypeer(re, asp->peer, asp->flags);
	if (p == NULL) {
		p = prefix_alloc();
		prefix_link(p, re, asp);
		return (1);
	} else {
		if (p->aspath != asp) {
			/* prefix belongs to a different aspath so move */
			prefix_move(asp, p);
		} else
			p->lastchange = time(NULL);
		return (0);
	}
}

/*
 * Move the prefix to the specified as path, removes the old asp if needed.
 */
void
prefix_move(struct rde_aspath *asp, struct prefix *p)
{
	struct prefix		*np;
	struct rde_aspath	*oasp;

	if (asp->peer != p->aspath->peer)
		fatalx("prefix_move: cross peer move");

	/* create new prefix node */
	np = prefix_alloc();
	np->aspath = asp;
	/* peer and prefix pointers are still equal */
	np->prefix = p->prefix;
	np->rib = p->rib;
	np->lastchange = time(NULL);

	/* add to new as path */
	LIST_INSERT_HEAD(&asp->prefix_h, np, path_l);
	PREFIX_COUNT(asp, 1);
	/*
	 * no need to update the peer prefix count because we are only moving
	 * the prefix without changing the peer.
	 */

	/*
	 * First kick the old prefix node out of the prefix list,
	 * afterwards run the route decision for the new prefix node.
	 * Because of this only one update is generated if the prefix
	 * was active.
	 * This is safe because we create a new prefix and so the change
	 * is noticed by prefix_evaluate().
	 */
	LIST_REMOVE(p, rib_l);
	prefix_evaluate(np, np->rib);

	/* remove old prefix node */
	oasp = p->aspath;
	LIST_REMOVE(p, path_l);
	PREFIX_COUNT(oasp, -1);
	/* as before, the peer count needs no update because of the move */

	/* destroy all references to other objects and free the old prefix */
	p->aspath = NULL;
	p->prefix = NULL;
	p->rib = NULL;
	prefix_free(p);

	/* destroy old path if empty */
	if (path_empty(oasp))
		path_destroy(oasp);
}

/*
 * Removes a prefix from all lists. If the parent objects -- path or
 * pt_entry -- become empty remove them too.
 */
int
prefix_remove(struct rib *rib, struct rde_peer *peer, struct bgpd_addr *prefix,
    int prefixlen, u_int32_t flags)
{
	struct prefix		*p;
	struct rib_entry	*re;
	struct rde_aspath	*asp;

	re = rib_get(rib, prefix, prefixlen);
	if (re == NULL)		/* Got a dummy withdraw request. */
		return (0);

	p = prefix_bypeer(re, peer, flags);
	if (p == NULL)		/* Got a dummy withdraw request. */
		return (0);

	asp = p->aspath;

	if (asp->pftableid) {
		/* only prefixes in the local RIB were pushed into pf */
		rde_send_pftable(asp->pftableid, prefix, prefixlen, 1);
		rde_send_pftable_commit();
	}

	prefix_destroy(p);

	return (1);
}

/* dump a prefix into specified buffer */
int
prefix_write(u_char *buf, int len, struct bgpd_addr *prefix, u_int8_t plen)
{
	int	totlen, psize;

	switch (prefix->aid) {
	case AID_INET:
	case AID_INET6:
		totlen = PREFIX_SIZE(plen);

		if (totlen > len)
			return (-1);
		*buf++ = plen;
		memcpy(buf, &prefix->ba, totlen - 1);
		return (totlen);
	case AID_VPN_IPv4:
		psize = PREFIX_SIZE(plen) - 1;
		totlen = psize + 1 + sizeof(prefix->vpn4.rd) +
		    prefix->vpn4.labellen;
		/* the NLRI bit length covers the label and the RD as well */
		plen += (sizeof(prefix->vpn4.rd) + prefix->vpn4.labellen) * 8;

		if (totlen > len)
			return (-1);
		*buf++ = plen;
		memcpy(buf, &prefix->vpn4.labelstack, prefix->vpn4.labellen);
		buf += prefix->vpn4.labellen;
		memcpy(buf, &prefix->vpn4.rd, sizeof(prefix->vpn4.rd));
		buf += sizeof(prefix->vpn4.rd);
		/* copy only the address bytes; plen was bumped above */
		memcpy(buf, &prefix->vpn4.addr, psize);
		return (totlen);
	default:
		return (-1);
	}
}
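
/*
 * Worked example for the plain AID_INET case: 192.168.0.0/17 is
 * written as four bytes, 0x11 0xc0 0xa8 0x00 -- the prefix length in
 * bits followed by just enough octets to hold the prefix, which is the
 * NLRI encoding of RFC 4271.  PREFIX_SIZE(17) accounts for the three
 * address octets plus the length octet.
 */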

int
prefix_writebuf(struct ibuf *buf, struct bgpd_addr *prefix, u_int8_t plen)
{
	int	 totlen;
	void	*bptr;

	switch (prefix->aid) {
	case AID_INET:
	case AID_INET6:
		totlen = PREFIX_SIZE(plen);
		break;
	case AID_VPN_IPv4:
		totlen = PREFIX_SIZE(plen) + sizeof(prefix->vpn4.rd) +
		    prefix->vpn4.labellen;
		break;
	default:
		return (-1);
	}

	if ((bptr = ibuf_reserve(buf, totlen)) == NULL)
		return (-1);
	if (prefix_write(bptr, totlen, prefix, plen) == -1)
		return (-1);
	return (0);
}

/*
 * Searches the prefix list of the specified rib_entry for a prefix
 * entry belonging to the given peer. Returns NULL if no match is found.
 */
struct prefix *
prefix_bypeer(struct rib_entry *re, struct rde_peer *peer, u_int32_t flags)
{
	struct prefix	*p;

	LIST_FOREACH(p, &re->prefix_h, rib_l) {
		if (p->aspath->peer != peer)
			continue;
		if (p->aspath->flags & flags &&
		    (flags & F_ANN_DYNAMIC) !=
		    (p->aspath->flags & F_ANN_DYNAMIC))
			continue;
		return (p);
	}
	return (NULL);
}

void
prefix_updateall(struct rde_aspath *asp, enum nexthop_state state,
    enum nexthop_state oldstate)
{
	struct prefix	*p;

	LIST_FOREACH(p, &asp->prefix_h, path_l) {
		/*
		 * skip non local-RIBs or RIBs that are flagged as noeval.
		 */
		if (p->rib->flags & F_RIB_NOEVALUATE)
			continue;

		if (oldstate == state && state == NEXTHOP_REACH) {
			/*
			 * The state of the nexthop did not change. The only
			 * thing that may have changed is the true_nexthop
			 * or other internal infos. This will not change
			 * the routing decision so shortcut here.
			 */
			if ((p->rib->flags & F_RIB_NOFIB) == 0 &&
			    p == p->rib->active)
				rde_send_kroute(p, NULL, p->rib->ribid);
			continue;
		}

		/* redo the route decision */
		LIST_REMOVE(p, rib_l);
		/*
		 * If the prefix is the active one remove it first,
		 * this has to be done because we can not detect when
		 * the active prefix changes its state. In this case
		 * we know that this is a withdrawal and so the second
		 * prefix_evaluate() will generate no update because
		 * the nexthop is unreachable or ineligible.
		 */
		if (p == p->rib->active)
			prefix_evaluate(NULL, p->rib);
		prefix_evaluate(p, p->rib);
	}
}

/* kill a prefix. */
void
prefix_destroy(struct prefix *p)
{
	struct rde_aspath	*asp;

	asp = p->aspath;
	prefix_unlink(p);
	prefix_free(p);

	if (path_empty(asp))
		path_destroy(asp);
}

/*
 * helper function to clean up the connected networks after a reload
 */
void
prefix_network_clean(struct rde_peer *peer, time_t reloadtime, u_int32_t flags)
{
	struct rde_aspath	*asp, *xasp;
	struct prefix		*p, *xp;

	for (asp = LIST_FIRST(&peer->path_h); asp != NULL; asp = xasp) {
		xasp = LIST_NEXT(asp, peer_l);
		if ((asp->flags & F_ANN_DYNAMIC) != flags)
			continue;
		for (p = LIST_FIRST(&asp->prefix_h); p != NULL; p = xp) {
			xp = LIST_NEXT(p, path_l);
			if (reloadtime > p->lastchange) {
				prefix_unlink(p);
				prefix_free(p);
			}
		}
		if (path_empty(asp))
			path_destroy(asp);
	}
}

/*
 * Link a prefix into the different parent objects.
 */
static void
prefix_link(struct prefix *pref, struct rib_entry *re, struct rde_aspath *asp)
{
	LIST_INSERT_HEAD(&asp->prefix_h, pref, path_l);
	PREFIX_COUNT(asp, 1);

	pref->aspath = asp;
	pref->rib = re;
	pref->prefix = re->prefix;
	pt_ref(pref->prefix);
	pref->lastchange = time(NULL);

	/* make route decision */
	prefix_evaluate(pref, re);
}

/*
 * Unlink a prefix from the different parent objects.
 */
static void
prefix_unlink(struct prefix *pref)
{
	struct rib_entry	*re = pref->rib;

	/* make route decision */
	LIST_REMOVE(pref, rib_l);
	prefix_evaluate(NULL, re);

	LIST_REMOVE(pref, path_l);
	PREFIX_COUNT(pref->aspath, -1);

	pt_unref(pref->prefix);
	if (pt_empty(pref->prefix))
		pt_remove(pref->prefix);
	if (rib_empty(re))
		rib_remove(re);

	/* destroy all references to other objects */
	pref->aspath = NULL;
	pref->prefix = NULL;
	pref->rib = NULL;

	/*
	 * It's the caller's duty to remove empty aspath structures.
	 * Also freeing the unlinked prefix is the caller's duty.
	 */
}

/* alloc and bzero new entry. May not fail. */
static struct prefix *
prefix_alloc(void)
{
	struct prefix *p;

	p = calloc(1, sizeof(*p));
	if (p == NULL)
		fatal("prefix_alloc");
	rdemem.prefix_cnt++;
	return (p);
}

/* free an unlinked entry */
static void
prefix_free(struct prefix *pref)
{
	rdemem.prefix_cnt--;
	free(pref);
}

/*
 * nexthop functions
 */
struct nexthop_head	*nexthop_hash(struct bgpd_addr *);
struct nexthop		*nexthop_lookup(struct bgpd_addr *);

/*
 * In BGP there exist two nexthops: the exit nexthop, which was announced
 * via BGP, and the true nexthop, which is used in the FIB -- the
 * forwarding information base, a.k.a. the kernel routing table. When
 * sending updates it is even more confusing. In IBGP we pass the
 * unmodified exit nexthop to the neighbors while in EBGP normally the
 * address of the router is sent. The exit nexthop may be passed to the
 * external neighbor if the neighbor and the exit nexthop reside in the
 * same subnet -- directly connected.
 */
struct nexthop_table {
	LIST_HEAD(nexthop_head, nexthop)	*nexthop_hashtbl;
	u_int32_t				 nexthop_hashmask;
} nexthoptable;
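
/*
 * Example (addresses are illustrative): an IBGP update may carry the
 * exit nexthop 192.0.2.1 while the kernel reaches that address via the
 * gateway 10.0.0.1.  The RDE then stores exit_nexthop = 192.0.2.1
 * (what BGP announced) and true_nexthop = 10.0.0.1 (what goes into the
 * FIB); for a directly connected nexthop both are the same, see
 * nexthop_update() below.
 */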

void
nexthop_init(u_int32_t hashsize)
{
	u_int32_t	 hs, i;

	for (hs = 1; hs < hashsize; hs <<= 1)
		;
	nexthoptable.nexthop_hashtbl = calloc(hs, sizeof(struct nexthop_head));
	if (nexthoptable.nexthop_hashtbl == NULL)
		fatal("nexthop_init");

	for (i = 0; i < hs; i++)
		LIST_INIT(&nexthoptable.nexthop_hashtbl[i]);

	nexthoptable.nexthop_hashmask = hs - 1;
}

void
nexthop_shutdown(void)
{
	u_int32_t		 i;
	struct nexthop		*nh, *nnh;

	for (i = 0; i <= nexthoptable.nexthop_hashmask; i++) {
		for (nh = LIST_FIRST(&nexthoptable.nexthop_hashtbl[i]);
		    nh != NULL; nh = nnh) {
			nnh = LIST_NEXT(nh, nexthop_l);
			nh->state = NEXTHOP_UNREACH;
			(void)nexthop_delete(nh);
		}
		if (!LIST_EMPTY(&nexthoptable.nexthop_hashtbl[i]))
			log_warnx("nexthop_shutdown: non-free table");
	}

	free(nexthoptable.nexthop_hashtbl);
}

void
nexthop_update(struct kroute_nexthop *msg)
{
	struct nexthop		*nh;
	struct rde_aspath	*asp;
	enum nexthop_state	 oldstate;

	nh = nexthop_lookup(&msg->nexthop);
	if (nh == NULL) {
		log_warnx("nexthop_update: non-existent nexthop %s",
		    log_addr(&msg->nexthop));
		return;
	}

	oldstate = nh->state;
	if (msg->valid)
		nh->state = NEXTHOP_REACH;
	else
		nh->state = NEXTHOP_UNREACH;

	if (msg->connected) {
		nh->flags |= NEXTHOP_CONNECTED;
		memcpy(&nh->true_nexthop, &nh->exit_nexthop,
		    sizeof(nh->true_nexthop));
	} else
		memcpy(&nh->true_nexthop, &msg->gateway,
		    sizeof(nh->true_nexthop));

	memcpy(&nh->nexthop_net, &msg->net,
	    sizeof(nh->nexthop_net));
	nh->nexthop_netlen = msg->netlen;

	if (nexthop_delete(nh))
		/* nexthop no longer used */
		return;

	if (rde_noevaluate())
		/*
		 * if the decision process is turned off there is no need
		 * for the aspath list walk.
		 */
		return;

	LIST_FOREACH(asp, &nh->path_h, nexthop_l) {
		prefix_updateall(asp, nh->state, oldstate);
	}
}

void
nexthop_modify(struct rde_aspath *asp, struct bgpd_addr *nexthop,
    enum action_types type, u_int8_t aid)
{
	struct nexthop	*nh;

	if (type == ACTION_SET_NEXTHOP && aid != nexthop->aid)
		return;

	asp->flags &= ~F_NEXTHOP_MASK;
	switch (type) {
	case ACTION_SET_NEXTHOP_REJECT:
		asp->flags |= F_NEXTHOP_REJECT;
		break;
	case ACTION_SET_NEXTHOP_BLACKHOLE:
		asp->flags |= F_NEXTHOP_BLACKHOLE;
		break;
	case ACTION_SET_NEXTHOP_NOMODIFY:
		asp->flags |= F_NEXTHOP_NOMODIFY;
		break;
	case ACTION_SET_NEXTHOP_SELF:
		asp->flags |= F_NEXTHOP_SELF;
		break;
	case ACTION_SET_NEXTHOP:
		nh = nexthop_get(nexthop);
		if (asp->flags & F_ATTR_LINKED)
			nexthop_unlink(asp);
		asp->nexthop = nh;
		if (asp->flags & F_ATTR_LINKED)
			nexthop_link(asp);
		break;
	default:
		break;
	}
}

void
nexthop_link(struct rde_aspath *asp)
{
	if (asp->nexthop == NULL)
		return;

	LIST_INSERT_HEAD(&asp->nexthop->path_h, asp, nexthop_l);
}

void
nexthop_unlink(struct rde_aspath *asp)
{
	struct nexthop	*nh;

	if (asp->nexthop == NULL)
		return;

	LIST_REMOVE(asp, nexthop_l);

	/* see if list is empty */
	nh = asp->nexthop;
	asp->nexthop = NULL;

	(void)nexthop_delete(nh);
}

int
nexthop_delete(struct nexthop *nh)
{
	/* nexthop still used by some other aspath */
	if (!LIST_EMPTY(&nh->path_h))
		return (0);

	/* either pinned or in a state where it may not be deleted */
	if (nh->refcnt > 0 || nh->state == NEXTHOP_LOOKUP)
		return (0);

	LIST_REMOVE(nh, nexthop_l);
	rde_send_nexthop(&nh->exit_nexthop, 0);

	rdemem.nexthop_cnt--;
	free(nh);
	return (1);
}

struct nexthop *
nexthop_get(struct bgpd_addr *nexthop)
{
	struct nexthop	*nh;

	nh = nexthop_lookup(nexthop);
	if (nh == NULL) {
		nh = calloc(1, sizeof(*nh));
		if (nh == NULL)
			fatal("nexthop_alloc");
		rdemem.nexthop_cnt++;

		LIST_INIT(&nh->path_h);
		nh->state = NEXTHOP_LOOKUP;
		nh->exit_nexthop = *nexthop;
		LIST_INSERT_HEAD(nexthop_hash(nexthop), nh,
		    nexthop_l);

		rde_send_nexthop(&nh->exit_nexthop, 1);
	}

	return (nh);
}
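
/*
 * Note: nexthop_get() is the only place nexthops are created.  A miss
 * allocates the entry in NEXTHOP_LOOKUP state and requests resolution
 * via rde_send_nexthop(..., 1); the verdict arrives asynchronously
 * through nexthop_update().  Until then the entry must survive, which
 * is why nexthop_delete() above refuses to free nexthops still in
 * NEXTHOP_LOOKUP state.
 */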

int
nexthop_compare(struct nexthop *na, struct nexthop *nb)
{
	struct bgpd_addr	*a, *b;

	if (na == nb)
		return (0);
	if (na == NULL)
		return (-1);
	if (nb == NULL)
		return (1);

	a = &na->exit_nexthop;
	b = &nb->exit_nexthop;

	if (a->aid != b->aid)
		return (a->aid - b->aid);

	switch (a->aid) {
	case AID_INET:
		if (ntohl(a->v4.s_addr) > ntohl(b->v4.s_addr))
			return (1);
		if (ntohl(a->v4.s_addr) < ntohl(b->v4.s_addr))
			return (-1);
		return (0);
	case AID_INET6:
		return (memcmp(&a->v6, &b->v6, sizeof(struct in6_addr)));
	default:
		fatalx("nexthop_cmp: unknown af");
	}
	return (-1);
}

struct nexthop *
nexthop_lookup(struct bgpd_addr *nexthop)
{
	struct nexthop	*nh;

	LIST_FOREACH(nh, nexthop_hash(nexthop), nexthop_l) {
		if (memcmp(&nh->exit_nexthop, nexthop,
		    sizeof(struct bgpd_addr)) == 0)
			return (nh);
	}
	return (NULL);
}

struct nexthop_head *
nexthop_hash(struct bgpd_addr *nexthop)
{
	u_int32_t	 h = 0;

	switch (nexthop->aid) {
	case AID_INET:
		h = (AF_INET ^ ntohl(nexthop->v4.s_addr) ^
		    ntohl(nexthop->v4.s_addr) >> 13) &
		    nexthoptable.nexthop_hashmask;
		break;
	case AID_INET6:
		h = hash32_buf(&nexthop->v6, sizeof(struct in6_addr),
		    HASHINIT) & nexthoptable.nexthop_hashmask;
		break;
	default:
		fatalx("nexthop_hash: unsupported AF");
	}
	return (&nexthoptable.nexthop_hashtbl[h]);
}