/*	$OpenBSD: rde_rib.c,v 1.138 2013/08/14 20:34:27 claudio Exp $ */

/*
 * Copyright (c) 2003, 2004 Claudio Jeker <claudio@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/types.h>
#include <sys/queue.h>
#include <sys/hash.h>

#include <stdlib.h>
#include <string.h>

#include "bgpd.h"
#include "rde.h"

/*
 * BGP RIB -- Routing Information Base
 *
 * The RIB is built with one aspect in mind: speed -- more precisely, update
 * speed. Therefore one thing needs to be absolutely avoided: long table
 * walks. This is achieved by heavily linking the different parts together.
 */
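
/*
 * Illustrative sketch (compiled out, not part of the original source):
 * thanks to the heavy linking, every object is reachable from a rib_entry
 * in a constant number of pointer hops -- no table walk is needed to get
 * from a prefix to its aspath, peer or nexthop. Field names follow the
 * structures used throughout this file.
 */
#if 0
static void
example_show_links(struct rib_entry *re)
{
	struct prefix	*p;

	LIST_FOREACH(p, &re->prefix_h, rib_l) {
		/* each prefix links directly to its path attributes ... */
		struct rde_aspath	*asp = p->aspath;

		/* ... which in turn link to the owning peer and nexthop */
		log_debug("prefix of peer %u, nexthop state %d",
		    asp->peer->conf.id, asp->nexthop ?
		    asp->nexthop->state : NEXTHOP_UNREACH);
	}
}
#endif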
u_int16_t rib_size;
struct rib *ribs;

LIST_HEAD(, rib_context) rib_dump_h = LIST_HEAD_INITIALIZER(rib_dump_h);

struct rib_entry *rib_add(struct rib *, struct bgpd_addr *, int);
int rib_compare(const struct rib_entry *, const struct rib_entry *);
void rib_remove(struct rib_entry *);
int rib_empty(struct rib_entry *);
struct rib_entry *rib_restart(struct rib_context *);

RB_PROTOTYPE(rib_tree, rib_entry, rib_e, rib_compare);
RB_GENERATE(rib_tree, rib_entry, rib_e, rib_compare);


/* RIB specific functions */
u_int16_t
rib_new(char *name, u_int rtableid, u_int16_t flags)
{
	struct rib	*xribs;
	size_t		newsize;
	u_int16_t	id;

	for (id = 0; id < rib_size; id++) {
		if (*ribs[id].name == '\0')
			break;
	}

	if (id == RIB_FAILED)
		fatalx("rib_new: trying to use reserved id");

	if (id >= rib_size) {
		newsize = sizeof(struct rib) * (id + 1);
		if ((xribs = realloc(ribs, newsize)) == NULL) {
			/* XXX this is not clever */
			fatal("rib_new");
		}
		ribs = xribs;
		rib_size = id + 1;
	}

	bzero(&ribs[id], sizeof(struct rib));
	strlcpy(ribs[id].name, name, sizeof(ribs[id].name));
	RB_INIT(&ribs[id].rib);
	ribs[id].state = RECONF_REINIT;
	ribs[id].id = id;
	ribs[id].flags = flags;
	ribs[id].rtableid = rtableid;

	ribs[id].in_rules = calloc(1, sizeof(struct filter_head));
	if (ribs[id].in_rules == NULL)
		fatal(NULL);
	TAILQ_INIT(ribs[id].in_rules);

	return (id);
}

u_int16_t
rib_find(char *name)
{
	u_int16_t id;

	if (name == NULL || *name == '\0')
		return (1);	/* no name returns the Loc-RIB */

	for (id = 0; id < rib_size; id++) {
		if (!strcmp(ribs[id].name, name))
			return (id);
	}

	return (RIB_FAILED);
}
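
/*
 * Usage sketch (compiled out; an assumption, not in the original source):
 * RIBs are created once and later found by name. rib_find() falls back to
 * the Loc-RIB (id 1) when no name is given and returns RIB_FAILED for
 * unknown names. The RIB name below is hypothetical.
 */
#if 0
static void
example_rib_setup(void)
{
	u_int16_t	 id;

	id = rib_new("Adj-RIB-Test", 0, F_RIB_NOFIB);
	if (rib_find("Adj-RIB-Test") != id)
		fatalx("example_rib_setup: lookup mismatch");
	if (rib_find(NULL) != 1)	/* empty name selects the Loc-RIB */
		fatalx("example_rib_setup: Loc-RIB expected");
}
#endif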

void
rib_free(struct rib *rib)
{
	struct rib_context *ctx, *next;
	struct rib_entry *re, *xre;
	struct prefix *p, *np;

	/* abort pending rib_dumps */
	for (ctx = LIST_FIRST(&rib_dump_h); ctx != NULL; ctx = next) {
		next = LIST_NEXT(ctx, entry);
		if (ctx->ctx_rib == rib) {
			re = ctx->ctx_re;
			re->flags &= ~F_RIB_ENTRYLOCK;
			LIST_REMOVE(ctx, entry);
			if (ctx->ctx_done)
				ctx->ctx_done(ctx->ctx_arg);
			else
				free(ctx);
		}
	}

	for (re = RB_MIN(rib_tree, &rib->rib); re != NULL; re = xre) {
		xre = RB_NEXT(rib_tree, &rib->rib, re);

		/*
		 * Removing the prefixes is tricky because the last one
		 * will remove the rib_entry as well and because we do
		 * an empty check in prefix_destroy() it is not possible to
		 * use the default for loop.
		 */
		while ((p = LIST_FIRST(&re->prefix_h))) {
			np = LIST_NEXT(p, rib_l);
			if (p->aspath->pftableid) {
				struct bgpd_addr addr;

				pt_getaddr(p->prefix, &addr);
				/* Commit is done in peer_down() */
				rde_send_pftable(p->aspath->pftableid, &addr,
				    p->prefix->prefixlen, 1);
			}
			prefix_destroy(p);
			if (np == NULL)
				break;
		}
	}
	rde_free_filter(rib->in_rules_tmp);
	rde_free_filter(rib->in_rules);
	bzero(rib, sizeof(struct rib));
}

int
rib_compare(const struct rib_entry *a, const struct rib_entry *b)
{
	return (pt_prefix_cmp(a->prefix, b->prefix));
}

struct rib_entry *
rib_get(struct rib *rib, struct bgpd_addr *prefix, int prefixlen)
{
	struct rib_entry xre;
	struct pt_entry	*pte;

	pte = pt_fill(prefix, prefixlen);
	bzero(&xre, sizeof(xre));
	xre.prefix = pte;

	return (RB_FIND(rib_tree, &rib->rib, &xre));
}

struct rib_entry *
rib_lookup(struct rib *rib, struct bgpd_addr *addr)
{
	struct rib_entry *re;
	int		 i;

	switch (addr->aid) {
	case AID_INET:
	case AID_VPN_IPv4:
		for (i = 32; i >= 0; i--) {
			re = rib_get(rib, addr, i);
			if (re != NULL)
				return (re);
		}
		break;
	case AID_INET6:
		for (i = 128; i >= 0; i--) {
			re = rib_get(rib, addr, i);
			if (re != NULL)
				return (re);
		}
		break;
	default:
		fatalx("rib_lookup: unknown af");
	}
	return (NULL);
}


struct rib_entry *
rib_add(struct rib *rib, struct bgpd_addr *prefix, int prefixlen)
{
	struct pt_entry	*pte;
	struct rib_entry *re;

	pte = pt_get(prefix, prefixlen);
	if (pte == NULL)
		pte = pt_add(prefix, prefixlen);

	if ((re = calloc(1, sizeof(*re))) == NULL)
		fatal("rib_add");

	LIST_INIT(&re->prefix_h);
	re->prefix = pte;
	re->flags = rib->flags;
	re->ribid = rib->id;

	if (RB_INSERT(rib_tree, &rib->rib, re) != NULL) {
		log_warnx("rib_add: insert failed");
		free(re);
		return (NULL);
	}

	pt_ref(pte);

	rdemem.rib_cnt++;

	return (re);
}

void
rib_remove(struct rib_entry *re)
{
	if (!rib_empty(re))
		fatalx("rib_remove: entry not empty");

	if (re->flags & F_RIB_ENTRYLOCK)
		/* entry is locked, don't free it. */
		return;

	pt_unref(re->prefix);
	if (pt_empty(re->prefix))
		pt_remove(re->prefix);

	if (RB_REMOVE(rib_tree, &ribs[re->ribid].rib, re) == NULL)
		log_warnx("rib_remove: remove failed.");

	free(re);
	rdemem.rib_cnt--;
}

int
rib_empty(struct rib_entry *re)
{
	return LIST_EMPTY(&re->prefix_h);
}

void
rib_dump(struct rib *rib, void (*upcall)(struct rib_entry *, void *),
    void *arg, u_int8_t aid)
{
	struct rib_context	*ctx;

	if ((ctx = calloc(1, sizeof(*ctx))) == NULL)
		fatal("rib_dump");
	ctx->ctx_rib = rib;
	ctx->ctx_upcall = upcall;
	ctx->ctx_arg = arg;
	ctx->ctx_aid = aid;
	rib_dump_r(ctx);
}

void
rib_dump_r(struct rib_context *ctx)
{
	struct rib_entry	*re;
	unsigned int		 i;

	if (ctx->ctx_re == NULL) {
		re = RB_MIN(rib_tree, &ctx->ctx_rib->rib);
		LIST_INSERT_HEAD(&rib_dump_h, ctx, entry);
	} else
		re = rib_restart(ctx);

	for (i = 0; re != NULL; re = RB_NEXT(rib_tree, unused, re)) {
		if (ctx->ctx_aid != AID_UNSPEC &&
		    ctx->ctx_aid != re->prefix->aid)
			continue;
		if (ctx->ctx_count && i++ >= ctx->ctx_count &&
		    (re->flags & F_RIB_ENTRYLOCK) == 0) {
			/* store and lock last element */
			ctx->ctx_re = re;
			re->flags |= F_RIB_ENTRYLOCK;
			return;
		}
		ctx->ctx_upcall(re, ctx->ctx_arg);
	}

	LIST_REMOVE(ctx, entry);
	if (ctx->ctx_done)
		ctx->ctx_done(ctx->ctx_arg);
	else
		free(ctx);
}
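
/*
 * Usage sketch (compiled out; an assumption, not in the original source):
 * a caller-supplied context with ctx_count set yields a restartable dump.
 * rib_dump_r() stops after ctx_count matching entries, parks its position
 * under F_RIB_ENTRYLOCK and is later resumed by rib_dump_runner() from the
 * main event loop.
 */
#if 0
static void
example_count_upcall(struct rib_entry *re, void *arg)
{
	(*(u_int32_t *)arg)++;
}

static void
example_bounded_dump(struct rib *rib, u_int32_t *counter)
{
	struct rib_context	*ctx;

	if ((ctx = calloc(1, sizeof(*ctx))) == NULL)
		fatal("example_bounded_dump");
	ctx->ctx_rib = rib;
	ctx->ctx_upcall = example_count_upcall;
	ctx->ctx_arg = counter;
	ctx->ctx_aid = AID_UNSPEC;
	ctx->ctx_count = 64;	/* process at most 64 entries per run */
	rib_dump_r(ctx);	/* rib_dump_runner() finishes the rest */
}
#endif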

struct rib_entry *
rib_restart(struct rib_context *ctx)
{
	struct rib_entry *re;

	re = ctx->ctx_re;
	re->flags &= ~F_RIB_ENTRYLOCK;

	/* find first non empty element */
	while (re && rib_empty(re))
		re = RB_NEXT(rib_tree, unused, re);

	/* free the previously locked rib element if empty */
	if (rib_empty(ctx->ctx_re))
		rib_remove(ctx->ctx_re);
	ctx->ctx_re = NULL;
	return (re);
}

void
rib_dump_runner(void)
{
	struct rib_context	*ctx, *next;

	for (ctx = LIST_FIRST(&rib_dump_h); ctx != NULL; ctx = next) {
		next = LIST_NEXT(ctx, entry);
		rib_dump_r(ctx);
	}
}

int
rib_dump_pending(void)
{
	return (!LIST_EMPTY(&rib_dump_h));
}

/* used to bump correct prefix counters */
#define PREFIX_COUNT(x, op)			\
	do {					\
		(x)->prefix_cnt += (op);	\
	} while (0)

/* path specific functions */

static void	path_link(struct rde_aspath *, struct rde_peer *);

struct path_table pathtable;

/* XXX the hash should also include communities and the other attrs */
#define PATH_HASH(x)				\
	&pathtable.path_hashtbl[hash32_buf((x)->data, (x)->len, HASHINIT) & \
	    pathtable.path_hashmask]

void
path_init(u_int32_t hashsize)
{
	u_int32_t	hs, i;

	for (hs = 1; hs < hashsize; hs <<= 1)
		;
	pathtable.path_hashtbl = calloc(hs, sizeof(struct aspath_head));
	if (pathtable.path_hashtbl == NULL)
		fatal("path_init");

	for (i = 0; i < hs; i++)
		LIST_INIT(&pathtable.path_hashtbl[i]);

	pathtable.path_hashmask = hs - 1;
}
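
/*
 * Worked example (compiled out, illustrative only): the loop above rounds
 * the requested hash size up to the next power of two so that "hash & mask"
 * can replace a modulo. E.g. a hashsize of 1000 yields hs 1024 and a
 * path_hashmask of 0x3ff.
 */
#if 0
static u_int32_t
example_roundup_pow2(u_int32_t hashsize)
{
	u_int32_t	hs;

	for (hs = 1; hs < hashsize; hs <<= 1)
		;
	return (hs);	/* example_roundup_pow2(1000) == 1024 */
}
#endif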

void
path_shutdown(void)
{
	u_int32_t	i;

	for (i = 0; i <= pathtable.path_hashmask; i++)
		if (!LIST_EMPTY(&pathtable.path_hashtbl[i]))
			log_warnx("path_shutdown: free non-free table");

	free(pathtable.path_hashtbl);
}

int
path_update(struct rib *rib, struct rde_peer *peer, struct rde_aspath *nasp,
    struct bgpd_addr *prefix, int prefixlen)
{
	struct rde_aspath	*asp;
	struct prefix		*p;

	if (nasp->pftableid) {
		rde_send_pftable(nasp->pftableid, prefix, prefixlen, 0);
		rde_send_pftable_commit();
	}

	/*
	 * First try to find a prefix in the specified RIB.
	 */
	if ((p = prefix_get(rib, peer, prefix, prefixlen, 0)) != NULL) {
		if (path_compare(nasp, p->aspath) == 0) {
			/* no change, update last change */
			p->lastchange = time(NULL);
			return (0);
		}
	}

	/*
	 * Either the prefix does not exist or the path changed.
	 * In both cases lookup the new aspath to make sure it is not
	 * already in the RIB.
	 */
	if ((asp = path_lookup(nasp, peer)) == NULL) {
		/* Path not available, create and link a new one. */
		asp = path_copy(nasp);
		path_link(asp, peer);
	}

	/* If the prefix was found move it else add it to the aspath. */
	if (p != NULL)
		prefix_move(asp, p);
	else
		return (prefix_add(rib, asp, prefix, prefixlen));
	return (0);
}
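
/*
 * Caller sketch (compiled out; an assumption mirroring how the RDE drives
 * this function, names are illustrative): for every NLRI in an UPDATE,
 * path_update() is called once per target RIB with the filtered path
 * attributes; a return value of 1 signals a freshly added prefix and is
 * used for per-peer prefix accounting.
 */
#if 0
static void
example_handle_nlri(struct rde_peer *peer, struct rde_aspath *fasp,
    struct bgpd_addr *prefix, int prefixlen)
{
	/* ribs[0] is the Adj-RIB-In in this file's terms, see ribid == 0 */
	if (path_update(&ribs[0], peer, fasp, prefix, prefixlen))
		peer->prefix_cnt++;	/* hypothetical accounting */
}
#endif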

int
path_compare(struct rde_aspath *a, struct rde_aspath *b)
{
	int		 r;

	if (a->origin > b->origin)
		return (1);
	if (a->origin < b->origin)
		return (-1);
	if ((a->flags & ~F_ATTR_LINKED) > (b->flags & ~F_ATTR_LINKED))
		return (1);
	if ((a->flags & ~F_ATTR_LINKED) < (b->flags & ~F_ATTR_LINKED))
		return (-1);
	if (a->med > b->med)
		return (1);
	if (a->med < b->med)
		return (-1);
	if (a->lpref > b->lpref)
		return (1);
	if (a->lpref < b->lpref)
		return (-1);
	if (a->weight > b->weight)
		return (1);
	if (a->weight < b->weight)
		return (-1);
	if (a->rtlabelid > b->rtlabelid)
		return (1);
	if (a->rtlabelid < b->rtlabelid)
		return (-1);
	if (a->pftableid > b->pftableid)
		return (1);
	if (a->pftableid < b->pftableid)
		return (-1);

	r = aspath_compare(a->aspath, b->aspath);
	if (r == 0)
		r = nexthop_compare(a->nexthop, b->nexthop);
	if (r > 0)
		return (1);
	if (r < 0)
		return (-1);

	return (attr_compare(a, b));
}

struct rde_aspath *
path_lookup(struct rde_aspath *aspath, struct rde_peer *peer)
{
	struct aspath_head	*head;
	struct rde_aspath	*asp;

	head = PATH_HASH(aspath->aspath);

	LIST_FOREACH(asp, head, path_l) {
		if (peer == asp->peer && path_compare(aspath, asp) == 0)
			return (asp);
	}
	return (NULL);
}

void
path_remove(struct rde_aspath *asp)
{
	struct prefix	*p, *np;

	for (p = LIST_FIRST(&asp->prefix_h); p != NULL; p = np) {
		np = LIST_NEXT(p, path_l);
		if (asp->pftableid) {
			struct bgpd_addr addr;

			pt_getaddr(p->prefix, &addr);
			/* Commit is done in peer_down() */
			rde_send_pftable(p->aspath->pftableid, &addr,
			    p->prefix->prefixlen, 1);
		}
		prefix_destroy(p);
	}
}

/*
 * Remove all stale routes, or if staletime is 0, remove all routes for
 * the specified AID.
 */
u_int32_t
path_remove_stale(struct rde_aspath *asp, u_int8_t aid)
{
	struct prefix	*p, *np;
	time_t		 staletime;
	u_int32_t	 rprefixes;

	rprefixes = 0;
	staletime = asp->peer->staletime[aid];
	for (p = LIST_FIRST(&asp->prefix_h); p != NULL; p = np) {
		np = LIST_NEXT(p, path_l);
		if (p->prefix->aid != aid)
			continue;

		if (staletime && p->lastchange > staletime)
			continue;

		if (asp->pftableid) {
			struct bgpd_addr addr;

			pt_getaddr(p->prefix, &addr);
			/* Commit is done in peer_flush() */
			rde_send_pftable(p->aspath->pftableid, &addr,
			    p->prefix->prefixlen, 1);
		}

		/* only count Adj-RIB-In */
		if (p->rib->ribid == 0)
			rprefixes++;

		prefix_destroy(p);
	}
	return (rprefixes);
}


/* this function is only called by prefix_remove and path_remove */
void
path_destroy(struct rde_aspath *asp)
{
	/* path_destroy can only unlink and free empty rde_aspath */
	if (asp->prefix_cnt != 0 || asp->active_cnt != 0)
		log_warnx("path_destroy: prefix count out of sync");

	nexthop_unlink(asp);
	LIST_REMOVE(asp, path_l);
	LIST_REMOVE(asp, peer_l);
	asp->peer = NULL;
	asp->nexthop = NULL;
	asp->flags &= ~F_ATTR_LINKED;

	path_put(asp);
}

int
path_empty(struct rde_aspath *asp)
{
	return LIST_EMPTY(&asp->prefix_h);
}

/*
 * the path object is linked into multiple lists for fast access.
 * These are peer_l, path_l and nexthop_l.
 * peer_l: list of all aspaths that belong to that peer
 * path_l: hash list to find paths quickly
 * nexthop_l: list of all aspaths with an equal exit nexthop
 */
static void
path_link(struct rde_aspath *asp, struct rde_peer *peer)
{
	struct aspath_head	*head;

	head = PATH_HASH(asp->aspath);

	LIST_INSERT_HEAD(head, asp, path_l);
	LIST_INSERT_HEAD(&peer->path_h, asp, peer_l);
	asp->peer = peer;
	nexthop_link(asp);
	asp->flags |= F_ATTR_LINKED;
}
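
/*
 * Illustrative sketch (compiled out, not part of the original source): the
 * three lists give three access paths to the same rde_aspath objects.
 * Walking a peer's paths, for example, needs no hash lookup at all.
 */
#if 0
static u_int32_t
example_count_peer_paths(struct rde_peer *peer)
{
	struct rde_aspath	*asp;
	u_int32_t		 cnt = 0;

	LIST_FOREACH(asp, &peer->path_h, peer_l)
		cnt++;
	return (cnt);
}
#endif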

/*
 * copy asp to a new UNLINKED one mainly for filtering
 */
struct rde_aspath *
path_copy(struct rde_aspath *asp)
{
	struct rde_aspath *nasp;

	nasp = path_get();
	nasp->aspath = asp->aspath;
	if (nasp->aspath != NULL) {
		nasp->aspath->refcnt++;
		rdemem.aspath_refs++;
	}
	nasp->nexthop = asp->nexthop;
	nasp->med = asp->med;
	nasp->lpref = asp->lpref;
	nasp->weight = asp->weight;
	nasp->origin = asp->origin;
	nasp->rtlabelid = asp->rtlabelid;
	rtlabel_ref(nasp->rtlabelid);
	nasp->pftableid = asp->pftableid;
	pftable_ref(nasp->pftableid);

	nasp->flags = asp->flags & ~F_ATTR_LINKED;
	attr_copy(nasp, asp);

	return (nasp);
}

/* alloc and initialize new entry. May not fail. */
struct rde_aspath *
path_get(void)
{
	struct rde_aspath *asp;

	asp = calloc(1, sizeof(*asp));
	if (asp == NULL)
		fatal("path_alloc");
	rdemem.path_cnt++;

	LIST_INIT(&asp->prefix_h);
	asp->origin = ORIGIN_INCOMPLETE;
	asp->lpref = DEFAULT_LPREF;
	/* med = 0 */
	/* weight = 0 */
	/* rtlabel = 0 */

	return (asp);
}

/* free an unlinked element */
void
path_put(struct rde_aspath *asp)
{
	if (asp == NULL)
		return;

	if (asp->flags & F_ATTR_LINKED)
		fatalx("path_put: linked object");

	rtlabel_unref(asp->rtlabelid);
	pftable_unref(asp->pftableid);
	aspath_put(asp->aspath);
	attr_freeall(asp);
	rdemem.path_cnt--;
	free(asp);
}

/* prefix specific functions */

static struct prefix	*prefix_alloc(void);
static void		 prefix_free(struct prefix *);
static void		 prefix_link(struct prefix *, struct rib_entry *,
			     struct rde_aspath *);
static void		 prefix_unlink(struct prefix *);

/*
 * Search for the specified prefix of a peer. Returns NULL if not found.
 */
struct prefix *
prefix_get(struct rib *rib, struct rde_peer *peer, struct bgpd_addr *prefix,
    int prefixlen, u_int32_t flags)
{
	struct rib_entry	*re;

	re = rib_get(rib, prefix, prefixlen);
	if (re == NULL)
		return (NULL);
	return (prefix_bypeer(re, peer, flags));
}

/*
 * Adds or updates a prefix.
 */
int
prefix_add(struct rib *rib, struct rde_aspath *asp, struct bgpd_addr *prefix,
    int prefixlen)
{
	struct prefix		*p;
	struct rib_entry	*re;

	re = rib_get(rib, prefix, prefixlen);
	if (re == NULL)
		re = rib_add(rib, prefix, prefixlen);

	p = prefix_bypeer(re, asp->peer, asp->flags);
	if (p == NULL) {
		p = prefix_alloc();
		prefix_link(p, re, asp);
		return (1);
	} else {
		if (p->aspath != asp) {
			/* prefix belongs to a different aspath so move */
			prefix_move(asp, p);
		} else
			p->lastchange = time(NULL);
		return (0);
	}
}

/*
 * Move the prefix to the specified aspath; remove the old asp if it
 * becomes empty.
 */
void
prefix_move(struct rde_aspath *asp, struct prefix *p)
{
	struct prefix		*np;
	struct rde_aspath	*oasp;

	if (asp->peer != p->aspath->peer)
		fatalx("prefix_move: cross peer move");

	/* create new prefix node */
	np = prefix_alloc();
	np->aspath = asp;
	/* peer and prefix pointers are still equal */
	np->prefix = p->prefix;
	np->rib = p->rib;
	np->lastchange = time(NULL);

	/* add to new aspath */
	LIST_INSERT_HEAD(&asp->prefix_h, np, path_l);
	PREFIX_COUNT(asp, 1);
	/*
	 * no need to update the peer prefix count because we are only moving
	 * the prefix without changing the peer.
	 */

	/*
	 * First kick the old prefix node out of the prefix list,
	 * afterwards run the route decision for the new prefix node.
	 * Because of this only one update is generated if the prefix
	 * was active.
	 * This is safe because we create a new prefix and so the change
	 * is noticed by prefix_evaluate().
	 */
	LIST_REMOVE(p, rib_l);
	prefix_evaluate(np, np->rib);

	/* remove old prefix node */
	oasp = p->aspath;
	LIST_REMOVE(p, path_l);
	PREFIX_COUNT(oasp, -1);
	/* as before, the peer count needs no update because of the move */

	/* destroy all references to other objects and free the old prefix */
	p->aspath = NULL;
	p->prefix = NULL;
	p->rib = NULL;
	prefix_free(p);

	/* destroy old path if empty */
	if (path_empty(oasp))
		path_destroy(oasp);
}

/*
 * Removes a prefix from all lists. If the parent objects -- path or
 * pt_entry -- become empty remove them too.
 */
int
prefix_remove(struct rib *rib, struct rde_peer *peer, struct bgpd_addr *prefix,
    int prefixlen, u_int32_t flags)
{
	struct prefix		*p;
	struct rib_entry	*re;
	struct rde_aspath	*asp;

	re = rib_get(rib, prefix, prefixlen);
	if (re == NULL)	/* Got a dummy withdrawn request */
		return (0);

	p = prefix_bypeer(re, peer, flags);
	if (p == NULL)		/* Got a dummy withdrawn request. */
		return (0);

	asp = p->aspath;

	if (asp->pftableid) {
		/* only prefixes in the local RIB were pushed into pf */
		rde_send_pftable(asp->pftableid, prefix, prefixlen, 1);
		rde_send_pftable_commit();
	}

	prefix_destroy(p);

	return (1);
}

/* dump a prefix into specified buffer */
int
prefix_write(u_char *buf, int len, struct bgpd_addr *prefix, u_int8_t plen)
{
	int	totlen;

	switch (prefix->aid) {
	case AID_INET:
	case AID_INET6:
		totlen = PREFIX_SIZE(plen);

		if (totlen > len)
			return (-1);
		*buf++ = plen;
		memcpy(buf, &prefix->ba, totlen - 1);
		return (totlen);
	case AID_VPN_IPv4:
		totlen = PREFIX_SIZE(plen) + sizeof(prefix->vpn4.rd) +
		    prefix->vpn4.labellen;
		plen += (sizeof(prefix->vpn4.rd) + prefix->vpn4.labellen) * 8;

		if (totlen > len)
			return (-1);
		*buf++ = plen;
		memcpy(buf, &prefix->vpn4.labelstack, prefix->vpn4.labellen);
		buf += prefix->vpn4.labellen;
		memcpy(buf, &prefix->vpn4.rd, sizeof(prefix->vpn4.rd));
		buf += sizeof(prefix->vpn4.rd);
		memcpy(buf, &prefix->vpn4.addr, PREFIX_SIZE(plen) - 1);
		return (totlen);
	default:
		return (-1);
	}
}
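
/*
 * Worked example (compiled out, illustrative only): for an IPv4 /24,
 * PREFIX_SIZE(24) yields 4, so the wire form is one length octet followed
 * by three address octets, e.g. 192.0.2.0/24 encodes as 18 c0 00 02.
 * Using inet_pton() here would additionally require <arpa/inet.h>.
 */
#if 0
static void
example_encode_prefix(void)
{
	struct bgpd_addr	 addr;
	u_char			 wire[5];

	bzero(&addr, sizeof(addr));
	addr.aid = AID_INET;
	inet_pton(AF_INET, "192.0.2.0", &addr.v4);
	if (prefix_write(wire, sizeof(wire), &addr, 24) != 4)
		fatalx("example_encode_prefix: unexpected length");
}
#endif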

int
prefix_writebuf(struct ibuf *buf, struct bgpd_addr *prefix, u_int8_t plen)
{
	int	 totlen;
	void	*bptr;

	switch (prefix->aid) {
	case AID_INET:
	case AID_INET6:
		totlen = PREFIX_SIZE(plen);
		break;
	case AID_VPN_IPv4:
		totlen = PREFIX_SIZE(plen) + sizeof(prefix->vpn4.rd) +
		    prefix->vpn4.labellen;
		break;
	default:
		return (-1);
	}

	if ((bptr = ibuf_reserve(buf, totlen)) == NULL)
		return (-1);
	if (prefix_write(bptr, totlen, prefix, plen) == -1)
		return (-1);
	return (0);
}

/*
 * Searches the prefix list of the specified rib_entry for a prefix entry
 * belonging to the given peer. Returns NULL if no match is found.
 */
struct prefix *
prefix_bypeer(struct rib_entry *re, struct rde_peer *peer, u_int32_t flags)
{
	struct prefix	*p;

	LIST_FOREACH(p, &re->prefix_h, rib_l) {
		if (p->aspath->peer != peer)
			continue;
		if (p->aspath->flags & flags &&
		    (flags & F_ANN_DYNAMIC) !=
		    (p->aspath->flags & F_ANN_DYNAMIC))
			continue;
		return (p);
	}
	return (NULL);
}

void
prefix_updateall(struct rde_aspath *asp, enum nexthop_state state,
    enum nexthop_state oldstate)
{
	struct prefix	*p;

	LIST_FOREACH(p, &asp->prefix_h, path_l) {
		/*
		 * Skip RIBs that are flagged as noevaluate, e.g. the
		 * non-local Adj-RIB-In.
		 */
		if (p->rib->flags & F_RIB_NOEVALUATE)
			continue;

		if (oldstate == state && state == NEXTHOP_REACH) {
			/*
			 * The state of the nexthop did not change. The only
			 * thing that may have changed is the true_nexthop
			 * or other internal infos. This will not change
			 * the routing decision so shortcut here.
			 */
			if ((p->rib->flags & F_RIB_NOFIB) == 0 &&
			    p == p->rib->active)
				rde_send_kroute(p, NULL, p->rib->ribid);
			continue;
		}

		/* redo the route decision */
		LIST_REMOVE(p, rib_l);
		/*
		 * If the prefix is the active one remove it first,
		 * this has to be done because we can not detect when
		 * the active prefix changes its state. In this case
		 * we know that this is a withdrawal and so the second
		 * prefix_evaluate() will generate no update because
		 * the nexthop is unreachable or ineligible.
		 */
		if (p == p->rib->active)
			prefix_evaluate(NULL, p->rib);
		prefix_evaluate(p, p->rib);
	}
}

/* kill a prefix. */
void
prefix_destroy(struct prefix *p)
{
	struct rde_aspath	*asp;

	asp = p->aspath;
	prefix_unlink(p);
	prefix_free(p);

	if (path_empty(asp))
		path_destroy(asp);
}

/*
 * helper function to clean up the connected networks after a reload
 */
void
prefix_network_clean(struct rde_peer *peer, time_t reloadtime, u_int32_t flags)
{
	struct rde_aspath	*asp, *xasp;
	struct prefix		*p, *xp;

	for (asp = LIST_FIRST(&peer->path_h); asp != NULL; asp = xasp) {
		xasp = LIST_NEXT(asp, peer_l);
		if ((asp->flags & F_ANN_DYNAMIC) != flags)
			continue;
		for (p = LIST_FIRST(&asp->prefix_h); p != NULL; p = xp) {
			xp = LIST_NEXT(p, path_l);
			if (reloadtime > p->lastchange) {
				prefix_unlink(p);
				prefix_free(p);
			}
		}
		if (path_empty(asp))
			path_destroy(asp);
	}
}

/*
 * Link a prefix into the different parent objects.
 */
static void
prefix_link(struct prefix *pref, struct rib_entry *re, struct rde_aspath *asp)
{
	LIST_INSERT_HEAD(&asp->prefix_h, pref, path_l);
	PREFIX_COUNT(asp, 1);

	pref->aspath = asp;
	pref->rib = re;
	pref->prefix = re->prefix;
	pt_ref(pref->prefix);
	pref->lastchange = time(NULL);

	/* make route decision */
	prefix_evaluate(pref, re);
}

/*
 * Unlink a prefix from the different parent objects.
 */
static void
prefix_unlink(struct prefix *pref)
{
	struct rib_entry	*re = pref->rib;

	/* make route decision */
	LIST_REMOVE(pref, rib_l);
	prefix_evaluate(NULL, re);

	LIST_REMOVE(pref, path_l);
	PREFIX_COUNT(pref->aspath, -1);

	pt_unref(pref->prefix);
	if (pt_empty(pref->prefix))
		pt_remove(pref->prefix);
	if (rib_empty(re))
		rib_remove(re);

	/* destroy all references to other objects */
	pref->aspath = NULL;
	pref->prefix = NULL;
	pref->rib = NULL;

	/*
	 * It's the caller's duty to remove empty aspath structures.
	 * Also freeing the unlinked prefix is the caller's duty.
	 */
}

/* alloc and bzero new entry. May not fail. */
static struct prefix *
prefix_alloc(void)
{
	struct prefix *p;

	p = calloc(1, sizeof(*p));
	if (p == NULL)
		fatal("prefix_alloc");
	rdemem.prefix_cnt++;
	return (p);
}

/* free an unlinked entry */
static void
prefix_free(struct prefix *pref)
{
	rdemem.prefix_cnt--;
	free(pref);
}

/*
 * nexthop functions
 */
struct nexthop_head	*nexthop_hash(struct bgpd_addr *);
struct nexthop		*nexthop_lookup(struct bgpd_addr *);

/*
 * In BGP there exist two nexthops: the exit nexthop, which was announced
 * via BGP, and the true nexthop, which is used in the FIB -- the forwarding
 * information base, a.k.a. the kernel routing table. When sending updates
 * it is even more confusing: in IBGP we pass the unmodified exit nexthop to
 * the neighbors, while in EBGP normally the address of the router itself is
 * sent. The exit nexthop may be passed to an external neighbor if the
 * neighbor and the exit nexthop reside in the same subnet -- directly
 * connected.
 */
struct nexthop_table {
	LIST_HEAD(nexthop_head, nexthop)	*nexthop_hashtbl;
	u_int32_t				 nexthop_hashmask;
} nexthoptable;
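
/*
 * Illustrative sketch (compiled out; an assumption, not in the original
 * source): how the two nexthops relate once the kernel has answered a
 * lookup. For a directly connected exit nexthop (NEXTHOP_CONNECTED) both
 * addresses are identical; otherwise the true nexthop is the gateway
 * learned from the FIB, see nexthop_update() below.
 */
#if 0
static void
example_show_nexthops(struct nexthop *nh)
{
	/* exit_nexthop: as announced in the BGP UPDATE */
	log_debug("exit nexthop %s", log_addr(&nh->exit_nexthop));
	/* true_nexthop: what actually goes into the kernel routing table */
	log_debug("true nexthop %s", log_addr(&nh->true_nexthop));
}
#endif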

void
nexthop_init(u_int32_t hashsize)
{
	u_int32_t	 hs, i;

	for (hs = 1; hs < hashsize; hs <<= 1)
		;
	nexthoptable.nexthop_hashtbl = calloc(hs, sizeof(struct nexthop_head));
	if (nexthoptable.nexthop_hashtbl == NULL)
		fatal("nexthop_init");

	for (i = 0; i < hs; i++)
		LIST_INIT(&nexthoptable.nexthop_hashtbl[i]);

	nexthoptable.nexthop_hashmask = hs - 1;
}

void
nexthop_shutdown(void)
{
	u_int32_t		 i;
	struct nexthop		*nh, *nnh;

	for (i = 0; i <= nexthoptable.nexthop_hashmask; i++) {
		for (nh = LIST_FIRST(&nexthoptable.nexthop_hashtbl[i]);
		    nh != NULL; nh = nnh) {
			nnh = LIST_NEXT(nh, nexthop_l);
			nh->state = NEXTHOP_UNREACH;
			(void)nexthop_delete(nh);
		}
		if (!LIST_EMPTY(&nexthoptable.nexthop_hashtbl[i]))
			log_warnx("nexthop_shutdown: non-free table");
	}

	free(nexthoptable.nexthop_hashtbl);
}

void
nexthop_update(struct kroute_nexthop *msg)
{
	struct nexthop		*nh;
	struct rde_aspath	*asp;
	enum nexthop_state	 oldstate;

	nh = nexthop_lookup(&msg->nexthop);
	if (nh == NULL) {
		log_warnx("nexthop_update: non-existent nexthop %s",
		    log_addr(&msg->nexthop));
		return;
	}

	oldstate = nh->state;
	if (msg->valid)
		nh->state = NEXTHOP_REACH;
	else
		nh->state = NEXTHOP_UNREACH;

	if (msg->connected) {
		nh->flags |= NEXTHOP_CONNECTED;
		memcpy(&nh->true_nexthop, &nh->exit_nexthop,
		    sizeof(nh->true_nexthop));
	} else
		memcpy(&nh->true_nexthop, &msg->gateway,
		    sizeof(nh->true_nexthop));

	memcpy(&nh->nexthop_net, &msg->net,
	    sizeof(nh->nexthop_net));
	nh->nexthop_netlen = msg->netlen;

	if (nexthop_delete(nh))
		/* nexthop no longer used */
		return;

	if (rde_noevaluate())
		/*
		 * if the decision process is turned off there is no need
		 * for the aspath list walk.
		 */
		return;

	LIST_FOREACH(asp, &nh->path_h, nexthop_l) {
		prefix_updateall(asp, nh->state, oldstate);
	}
}

void
nexthop_modify(struct rde_aspath *asp, struct bgpd_addr *nexthop,
    enum action_types type, u_int8_t aid)
{
	struct nexthop	*nh;

	if (type == ACTION_SET_NEXTHOP && aid != nexthop->aid)
		return;

	asp->flags &= ~F_NEXTHOP_MASK;
	switch (type) {
	case ACTION_SET_NEXTHOP_REJECT:
		asp->flags |= F_NEXTHOP_REJECT;
		break;
	case ACTION_SET_NEXTHOP_BLACKHOLE:
		asp->flags |= F_NEXTHOP_BLACKHOLE;
		break;
	case ACTION_SET_NEXTHOP_NOMODIFY:
		asp->flags |= F_NEXTHOP_NOMODIFY;
		break;
	case ACTION_SET_NEXTHOP_SELF:
		asp->flags |= F_NEXTHOP_SELF;
		break;
	case ACTION_SET_NEXTHOP:
		nh = nexthop_get(nexthop);
		if (asp->flags & F_ATTR_LINKED)
			nexthop_unlink(asp);
		asp->nexthop = nh;
		if (asp->flags & F_ATTR_LINKED)
			nexthop_link(asp);
		break;
	default:
		break;
	}
}

void
nexthop_link(struct rde_aspath *asp)
{
	if (asp->nexthop == NULL)
		return;

	LIST_INSERT_HEAD(&asp->nexthop->path_h, asp, nexthop_l);
}

void
nexthop_unlink(struct rde_aspath *asp)
{
	struct nexthop	*nh;

	if (asp->nexthop == NULL)
		return;

	LIST_REMOVE(asp, nexthop_l);

	/* see if list is empty */
	nh = asp->nexthop;
	asp->nexthop = NULL;

	(void)nexthop_delete(nh);
}

int
nexthop_delete(struct nexthop *nh)
{
	/* nexthop still used by some other aspath */
	if (!LIST_EMPTY(&nh->path_h))
		return (0);

	/* either pinned or in a state where it may not be deleted */
	if (nh->refcnt > 0 || nh->state == NEXTHOP_LOOKUP)
		return (0);

	LIST_REMOVE(nh, nexthop_l);
	rde_send_nexthop(&nh->exit_nexthop, 0);

	rdemem.nexthop_cnt--;
	free(nh);
	return (1);
}

struct nexthop *
nexthop_get(struct bgpd_addr *nexthop)
{
	struct nexthop	*nh;

	nh = nexthop_lookup(nexthop);
	if (nh == NULL) {
		nh = calloc(1, sizeof(*nh));
		if (nh == NULL)
			fatal("nexthop_alloc");
		rdemem.nexthop_cnt++;

		LIST_INIT(&nh->path_h);
		nh->state = NEXTHOP_LOOKUP;
		nh->exit_nexthop = *nexthop;
		LIST_INSERT_HEAD(nexthop_hash(nexthop), nh,
		    nexthop_l);

		rde_send_nexthop(&nh->exit_nexthop, 1);
	}

	return (nh);
}

int
nexthop_compare(struct nexthop *na, struct nexthop *nb)
{
	struct bgpd_addr	*a, *b;

	if (na == nb)
		return (0);
	if (na == NULL)
		return (-1);
	if (nb == NULL)
		return (1);

	a = &na->exit_nexthop;
	b = &nb->exit_nexthop;

	if (a->aid != b->aid)
		return (a->aid - b->aid);

	switch (a->aid) {
	case AID_INET:
		if (ntohl(a->v4.s_addr) > ntohl(b->v4.s_addr))
			return (1);
		if (ntohl(a->v4.s_addr) < ntohl(b->v4.s_addr))
			return (-1);
		return (0);
	case AID_INET6:
		return (memcmp(&a->v6, &b->v6, sizeof(struct in6_addr)));
	default:
		fatalx("nexthop_cmp: unknown af");
	}
	return (-1);
}

struct nexthop *
nexthop_lookup(struct bgpd_addr *nexthop)
{
	struct nexthop	*nh;

	LIST_FOREACH(nh, nexthop_hash(nexthop), nexthop_l) {
		if (memcmp(&nh->exit_nexthop, nexthop,
		    sizeof(struct bgpd_addr)) == 0)
			return (nh);
	}
	return (NULL);
}

struct nexthop_head *
nexthop_hash(struct bgpd_addr *nexthop)
{
	u_int32_t	 h = 0;

	switch (nexthop->aid) {
	case AID_INET:
		h = (AF_INET ^ ntohl(nexthop->v4.s_addr) ^
		    ntohl(nexthop->v4.s_addr) >> 13) &
		    nexthoptable.nexthop_hashmask;
		break;
	case AID_INET6:
		h = hash32_buf(&nexthop->v6, sizeof(struct in6_addr),
		    HASHINIT) & nexthoptable.nexthop_hashmask;
		break;
	default:
		fatalx("nexthop_hash: unsupported AF");
	}
	return (&nexthoptable.nexthop_hashtbl[h]);
}
1322