/*	$OpenBSD: rde_rib.c,v 1.97 2008/11/21 17:41:22 claudio Exp $ */

/*
 * Copyright (c) 2003, 2004 Claudio Jeker <claudio@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/types.h>
#include <sys/queue.h>
#include <sys/hash.h>

#include <stdlib.h>
#include <string.h>
#include <time.h>	/* time(3) is used for the lastchange timestamps */

#include "bgpd.h"
#include "rde.h"

/*
 * BGP RIB -- Routing Information Base
 *
 * The RIB is built with one aspect in mind: speed -- update speed, actually.
 * Therefore one thing needs to be absolutely avoided: long table walks.
 * This is achieved by heavily linking the different parts together.
 */

/* used to bump the correct prefix counters */
#define PREFIX_COUNT(x, f, op)				\
	do {						\
		if (f & F_LOCAL)			\
			(x)->prefix_cnt += (op);	\
		if (f & F_ORIGINAL)			\
			(x)->adjrib_cnt += (op);	\
	} while (0)
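
/*
 * PREFIX_COUNT works on anything carrying prefix_cnt and adjrib_cnt
 * members, i.e. both struct rde_aspath and struct rde_peer. Illustrative
 * use: PREFIX_COUNT(peer, F_LOCAL, 1) bumps only peer->prefix_cnt, while
 * flags == (F_LOCAL|F_ORIGINAL) bumps both counters at once.
 */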

/* path specific functions */

static void	path_link(struct rde_aspath *, struct rde_peer *);

struct path_table pathtable;

/* XXX the hash should also include communities and the other attrs */
#define PATH_HASH(x)				\
	&pathtable.path_hashtbl[hash32_buf((x)->data, (x)->len, HASHINIT) & \
	    pathtable.path_hashmask]
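
/*
 * hash32_buf() runs over the raw wire-format AS path bytes. path_hashmask
 * is always 2^n - 1 because path_init() rounds the table size up to a
 * power of two, so the bitwise AND is a cheap modulo.
 */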

void
path_init(u_int32_t hashsize)
{
	u_int32_t	hs, i;

	for (hs = 1; hs < hashsize; hs <<= 1)
		;
	pathtable.path_hashtbl = calloc(hs, sizeof(struct aspath_head));
	if (pathtable.path_hashtbl == NULL)
		fatal("path_init");

	for (i = 0; i < hs; i++)
		LIST_INIT(&pathtable.path_hashtbl[i]);

	pathtable.path_hashmask = hs - 1;
}

void
path_shutdown(void)
{
	u_int32_t	i;

	for (i = 0; i <= pathtable.path_hashmask; i++)
		if (!LIST_EMPTY(&pathtable.path_hashtbl[i]))
			log_warnx("path_shutdown: free non-free table");

	free(pathtable.path_hashtbl);
}

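/*
 * Insert or update a prefix with the given path attributes. Rough flow:
 * look the prefix up in the requested RIB (and the Adj-RIB-In), reuse the
 * existing path if the attributes are unchanged, otherwise look up or
 * create the new rde_aspath and move or add the prefix to it.
 */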
void
path_update(struct rde_peer *peer, struct rde_aspath *nasp,
    struct bgpd_addr *prefix, int prefixlen, u_int32_t flags)
{
	struct rde_aspath	*asp;
	struct prefix		*p, *oldp = NULL;

	if (flags & F_LOCAL) {
		rde_send_pftable(nasp->pftableid, prefix, prefixlen, 0);
		rde_send_pftable_commit();
	}

	/*
	 * First try to find a prefix in the specified RIB or in the
	 * Adj-RIB-In. This works because the Local-RIB has precedence over
	 * the Adj-RIB-In. In the end this saves us some additional lookups.
	 */
	if ((p = prefix_get(peer, prefix, prefixlen, flags | F_ORIGINAL)) !=
	    NULL) {
		do {
			if (path_compare(nasp, p->aspath) == 0) {
				if ((p->flags & flags) == 0) {
					if (oldp != NULL) {
						asp = oldp->aspath;
						prefix_destroy(oldp);
						if (path_empty(asp))
							path_destroy(asp);
					}
					p->flags |= flags;
					PREFIX_COUNT(p->aspath, flags, 1);
					PREFIX_COUNT(peer, flags, 1);

					/* re-evaluate prefix */
					LIST_REMOVE(p, prefix_l);
					prefix_evaluate(p, p->prefix);
				}
				/* update last change */
				p->lastchange = time(NULL);
				return;
			}
			/*
			 * If the prefix is not already part of the Adj-RIB-In
			 * do a lookup in there. But keep the original prefix
			 * around so that it can be removed later.
			 */
			if (p->flags & F_ORIGINAL)
				break;
			oldp = p;
			p = prefix_get(peer, prefix, prefixlen, F_ORIGINAL);
		} while (p != NULL);
	}

	/* Do not try to move a prefix that is in the wrong RIB. */
	if (p == NULL || (p->flags & flags) == 0)
		p = oldp;

	/*
	 * Either the prefix does not exist or the path changed.
	 * In both cases lookup the new aspath to make sure it is not
	 * already in the RIB.
	 */
	if ((asp = path_lookup(nasp, peer)) == NULL) {
		/* Path not available, create and link a new one. */
		asp = path_copy(nasp);
		path_link(asp, peer);
	}

	/* If the prefix was found, move it; else add it to the aspath. */
	if (p != NULL)
		prefix_move(asp, p, flags);
	else
		prefix_add(asp, prefix, prefixlen, flags);
}

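/*
 * Compare two sets of path attributes. Returns -1, 0 or 1. The ordering is
 * arbitrary but total, which is all path_lookup() needs to find an
 * identical path in a hash chain.
 */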
int
path_compare(struct rde_aspath *a, struct rde_aspath *b)
{
	int		 r;

	if (a->origin > b->origin)
		return (1);
	if (a->origin < b->origin)
		return (-1);
	if ((a->flags & ~F_ATTR_LINKED) > (b->flags & ~F_ATTR_LINKED))
		return (1);
	if ((a->flags & ~F_ATTR_LINKED) < (b->flags & ~F_ATTR_LINKED))
		return (-1);
	if (a->med > b->med)
		return (1);
	if (a->med < b->med)
		return (-1);
	if (a->lpref > b->lpref)
		return (1);
	if (a->lpref < b->lpref)
		return (-1);
	if (a->weight > b->weight)
		return (1);
	if (a->weight < b->weight)
		return (-1);
	if (a->rtlabelid > b->rtlabelid)
		return (1);
	if (a->rtlabelid < b->rtlabelid)
		return (-1);
	if (a->pftableid > b->pftableid)
		return (1);
	if (a->pftableid < b->pftableid)
		return (-1);

	r = aspath_compare(a->aspath, b->aspath);
	if (r == 0)
		r = nexthop_compare(a->nexthop, b->nexthop);
	if (r > 0)
		return (1);
	if (r < 0)
		return (-1);

	return (attr_compare(a, b));
}

struct rde_aspath *
path_lookup(struct rde_aspath *aspath, struct rde_peer *peer)
{
	struct aspath_head	*head;
	struct rde_aspath	*asp;

	head = PATH_HASH(aspath->aspath);

	LIST_FOREACH(asp, head, path_l) {
		if (peer == asp->peer && path_compare(aspath, asp) == 0)
			return (asp);
	}
	return (NULL);
}

void
path_remove(struct rde_aspath *asp)
{
	struct prefix	*p;
	struct bgpd_addr addr;

	while ((p = LIST_FIRST(&asp->prefix_h)) != NULL) {
		/* Commit is done in peer_down() */
		pt_getaddr(p->prefix, &addr);
		if (p->flags & F_LOCAL)
			rde_send_pftable(p->aspath->pftableid, &addr,
			    p->prefix->prefixlen, 1);

		prefix_destroy(p);
	}
	path_destroy(asp);
}

/* unlink and free an rde_aspath; only legal once the last prefix is gone */
void
path_destroy(struct rde_aspath *asp)
{
	/* path_destroy can only unlink and free empty rde_aspath */
	if (asp->prefix_cnt != 0 || asp->active_cnt != 0 ||
	    asp->adjrib_cnt != 0)
		log_warnx("path_destroy: prefix count out of sync");

	nexthop_unlink(asp);
	LIST_REMOVE(asp, path_l);
	LIST_REMOVE(asp, peer_l);
	asp->peer = NULL;
	asp->nexthop = NULL;
	asp->flags &= ~F_ATTR_LINKED;

	path_put(asp);
}

int
path_empty(struct rde_aspath *asp)
{
	return (LIST_EMPTY(&asp->prefix_h));
}

/*
 * The path object is linked into multiple lists for fast access.
 * These are peer_l, path_l and nexthop_l.
 * peer_l: list of all aspaths that belong to that peer
 * path_l: hash list to find paths quickly
 * nexthop_l: list of all aspaths with an equal exit nexthop
 */
static void
path_link(struct rde_aspath *asp, struct rde_peer *peer)
{
	struct aspath_head	*head;

	head = PATH_HASH(asp->aspath);

	LIST_INSERT_HEAD(head, asp, path_l);
	LIST_INSERT_HEAD(&peer->path_h, asp, peer_l);
	asp->peer = peer;
	nexthop_link(asp);
	asp->flags |= F_ATTR_LINKED;
}

/*
 * copy asp to a new UNLINKED one mainly for filtering
 */
struct rde_aspath *
path_copy(struct rde_aspath *asp)
{
	struct rde_aspath *nasp;

	nasp = path_get();
	nasp->aspath = asp->aspath;
	if (nasp->aspath != NULL) {
		nasp->aspath->refcnt++;
		rdemem.aspath_refs++;
	}
	nasp->nexthop = asp->nexthop;
	nasp->med = asp->med;
	nasp->lpref = asp->lpref;
	nasp->weight = asp->weight;
	nasp->origin = asp->origin;
	nasp->rtlabelid = asp->rtlabelid;
	rtlabel_ref(nasp->rtlabelid);
	nasp->pftableid = asp->pftableid;
	pftable_ref(nasp->pftableid);

	nasp->flags = asp->flags & ~F_ATTR_LINKED;
	attr_copy(nasp, asp);

	return (nasp);
}

/* alloc and initialize a new entry. May not fail. */
struct rde_aspath *
path_get(void)
{
	struct rde_aspath *asp;

	asp = calloc(1, sizeof(*asp));
	if (asp == NULL)
		fatal("path_get");
	rdemem.path_cnt++;

	LIST_INIT(&asp->prefix_h);
	asp->origin = ORIGIN_INCOMPLETE;
	asp->lpref = DEFAULT_LPREF;
	/* med = 0 */
	/* weight = 0 */
	/* rtlabel = 0 */

	return (asp);
}

/* free an unlinked element */
void
path_put(struct rde_aspath *asp)
{
	if (asp == NULL)
		return;

	if (asp->flags & F_ATTR_LINKED)
		fatalx("path_put: linked object");

	rtlabel_unref(asp->rtlabelid);
	pftable_unref(asp->pftableid);
	aspath_put(asp->aspath);
	attr_freeall(asp);
	rdemem.path_cnt--;
	free(asp);
}

/* prefix specific functions */

static struct prefix	*prefix_alloc(void);
static void		 prefix_free(struct prefix *);
static void		 prefix_link(struct prefix *, struct pt_entry *,
			     struct rde_aspath *, u_int32_t);
static void		 prefix_unlink(struct prefix *);

int
prefix_compare(const struct bgpd_addr *a, const struct bgpd_addr *b,
    int prefixlen)
{
	in_addr_t	mask, aa, ba;
	int		i;
	u_int8_t	m;

	if (a->af != b->af)
		return (a->af - b->af);

	switch (a->af) {
	case AF_INET:
		if (prefixlen > 32)
			fatalx("prefix_cmp: bad IPv4 prefixlen");
		mask = htonl(prefixlen2mask(prefixlen));
		aa = ntohl(a->v4.s_addr & mask);
		ba = ntohl(b->v4.s_addr & mask);
		/* compare explicitly: aa - ba would wrap as unsigned */
		if (aa > ba)
			return (1);
		if (aa < ba)
			return (-1);
		return (0);
	case AF_INET6:
		if (prefixlen > 128)
			fatalx("prefix_cmp: bad IPv6 prefixlen");
		for (i = 0; i < prefixlen / 8; i++)
			if (a->v6.s6_addr[i] != b->v6.s6_addr[i])
				return (a->v6.s6_addr[i] - b->v6.s6_addr[i]);
		i = prefixlen % 8;
		if (i) {
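			/*
			 * 0xff00 >> i truncated to u_int8_t keeps only the
			 * top i bits set (e.g. i == 3 yields 0xe0), so just
			 * the covered bits of the last byte are compared.
			 */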
			m = 0xff00 >> i;
			if ((a->v6.s6_addr[prefixlen / 8] & m) !=
			    (b->v6.s6_addr[prefixlen / 8] & m))
				return ((a->v6.s6_addr[prefixlen / 8] & m) -
				    (b->v6.s6_addr[prefixlen / 8] & m));
		}
		return (0);
	default:
		fatalx("prefix_cmp: unknown af");
	}
	return (-1);
}

/*
 * Search for the specified prefix of a peer. Returns NULL if not found.
 */
struct prefix *
prefix_get(struct rde_peer *peer, struct bgpd_addr *prefix, int prefixlen,
    u_int32_t flags)
{
	struct pt_entry	*pte;

	pte = pt_get(prefix, prefixlen);
	if (pte == NULL)
		return (NULL);
	return (prefix_bypeer(pte, peer, flags));
}

/*
 * Adds or updates a prefix.
 */
struct pt_entry *
prefix_add(struct rde_aspath *asp, struct bgpd_addr *prefix, int prefixlen,
    u_int32_t flags)
{
	struct prefix	*p;
	struct pt_entry	*pte;

	pte = pt_get(prefix, prefixlen);
	if (pte == NULL)
		pte = pt_add(prefix, prefixlen);

	p = prefix_bypeer(pte, asp->peer, flags);
	if (p == NULL) {
		p = prefix_alloc();
		prefix_link(p, pte, asp, flags);
	} else {
		if (p->aspath != asp)
			/* prefix belongs to a different aspath so move it */
			return (prefix_move(asp, p, flags));
		p->lastchange = time(NULL);
	}

	return (pte);
}

/*
 * Move the prefix to the specified aspath; destroy the old aspath if it
 * becomes empty.
 */
struct pt_entry *
prefix_move(struct rde_aspath *asp, struct prefix *p, u_int32_t flags)
{
	struct prefix		*np;
	struct rde_aspath	*oasp;

	if (asp->peer != p->aspath->peer)
		fatalx("prefix_move: cross peer move");

	/* create new prefix node */
	np = prefix_alloc();
	np->aspath = asp;
	/* peer and prefix pointers are still equal */
	np->prefix = p->prefix;
	np->lastchange = time(NULL);
	np->flags = flags;

	/* add to new as path */
	LIST_INSERT_HEAD(&asp->prefix_h, np, path_l);
	PREFIX_COUNT(asp, flags, 1);
	/*
	 * no need to update the peer prefix count because we are only moving
	 * the prefix without changing the peer.
	 */

	/*
	 * Fiddle around with the flags. If p->flags is not equal to flags,
	 * the old prefix p must not be removed; instead p->flags needs to
	 * be adjusted.
	 */
	if (p->flags != flags) {
		if ((p->flags & flags) == 0)
			fatalx("prefix_move: "
			    "prefix is not part of desired RIB");

		p->flags &= ~flags;
		PREFIX_COUNT(p->aspath, flags, -1);
		/* as before, the peer count needs no update due to the move */

		/* redo the route decision for p */
		LIST_REMOVE(p, prefix_l);
		/* If the prefix is the active one remove it first. */
		if (p == p->prefix->active)
			prefix_evaluate(NULL, p->prefix);
		prefix_evaluate(p, p->prefix);

		/* and now for np */
		prefix_evaluate(np, np->prefix);

		return (np->prefix);
	}

	/*
	 * First kick the old prefix node out of the prefix list, afterwards
	 * run the route decision for the new prefix node. Because of this
	 * only one update is generated if the prefix was active.
	 * This is safe because we create a new prefix and so the change
	 * is noticed by prefix_evaluate().
	 */
	LIST_REMOVE(p, prefix_l);
	prefix_evaluate(np, np->prefix);

	/* remove the old prefix node */
	oasp = p->aspath;
	LIST_REMOVE(p, path_l);
	PREFIX_COUNT(oasp, flags, -1);
	/* as before, the peer count needs no update due to the move */

	/* destroy all references to other objects and free the old prefix */
	p->aspath = NULL;
	p->prefix = NULL;
	prefix_free(p);

	/* destroy the old path if empty */
	if (path_empty(oasp))
		path_destroy(oasp);

	return (np->prefix);
}

/*
 * Removes a prefix from all lists. If the parent objects -- path or
 * pt_entry -- become empty remove them too.
 */
void
prefix_remove(struct rde_peer *peer, struct bgpd_addr *prefix, int prefixlen,
    u_int32_t flags)
{
	struct prefix		*p;
	struct pt_entry		*pte;
	struct rde_aspath	*asp;

	pte = pt_get(prefix, prefixlen);
	if (pte == NULL)	/* Got a dummy withdraw request. */
		return;

	p = prefix_bypeer(pte, peer, flags);
	if (p == NULL)		/* Got a dummy withdraw request. */
		return;

	asp = p->aspath;

	if (p->flags & F_LOCAL) {
		/* only prefixes in the local RIB were pushed into pf */
		rde_send_pftable(asp->pftableid, prefix, prefixlen, 1);
		rde_send_pftable_commit();
	}

	/* if the prefix belongs to more than one RIB, remove one instance */
	if (p->flags != flags) {
		p->flags &= ~flags;

		PREFIX_COUNT(p->aspath, flags, -1);
		PREFIX_COUNT(peer, flags, -1);

		/* redo the route decision for p */
		LIST_REMOVE(p, prefix_l);
		/* If the prefix is the active one remove it first. */
		if (p == p->prefix->active)
			prefix_evaluate(NULL, p->prefix);
		prefix_evaluate(p, p->prefix);
		return;
	}

	prefix_unlink(p);
	prefix_free(p);

	if (pt_empty(pte))
		pt_remove(pte);
	if (path_empty(asp))
		path_destroy(asp);
}

/* dump a prefix into the specified buffer */
int
prefix_write(u_char *buf, int len, struct bgpd_addr *prefix, u_int8_t plen)
{
	int	totlen;

	if (prefix->af != AF_INET && prefix->af != AF_INET6)
		return (-1);

	totlen = PREFIX_SIZE(plen);

	if (totlen > len)
		return (-1);
	*buf++ = plen;
	memcpy(buf, &prefix->ba, totlen - 1);
	return (totlen);
}
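
/*
 * The output is the RFC 4271 NLRI encoding: one length octet followed by
 * the minimal number of prefix octets, e.g. 192.0.2.0/24 becomes the four
 * bytes 0x18 0xc0 0x00 0x02.
 */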

/*
 * Searches the prefix list of the specified pt_entry for a prefix entry
 * belonging to the given peer. Returns NULL if no match is found.
 */
struct prefix *
prefix_bypeer(struct pt_entry *pte, struct rde_peer *peer, u_int32_t flags)
{
	struct prefix	*p;

	LIST_FOREACH(p, &pte->prefix_h, prefix_l) {
		if (p->aspath->peer == peer && p->flags & flags)
			return (p);
	}
	return (NULL);
}

void
prefix_updateall(struct rde_aspath *asp, enum nexthop_state state,
    enum nexthop_state oldstate)
{
	struct prefix	*p;

	if (rde_noevaluate())
		/* if the decision process is turned off this is a no-op */
		return;

	LIST_FOREACH(p, &asp->prefix_h, path_l) {
		/*
		 * Skip non-local-RIB nodes; only local-RIB prefixes are
		 * eligible. Both F_LOCAL and F_ORIGINAL may be set.
		 */
		if (!(p->flags & F_LOCAL))
			continue;

		if (oldstate == state && state == NEXTHOP_REACH) {
			/*
			 * The state of the nexthop did not change. The only
			 * thing that may have changed is the true_nexthop
			 * or other internal info. This will not change
			 * the routing decision so shortcut here.
			 */
			if (p == p->prefix->active)
				rde_send_kroute(p, NULL);
			continue;
		}

		/* redo the route decision */
		LIST_REMOVE(p, prefix_l);
		/*
		 * If the prefix is the active one remove it first.
		 * This has to be done because we cannot detect when
		 * the active prefix changes its state. In this case
		 * we know that this is a withdrawal and so the second
		 * prefix_evaluate() will generate no update because
		 * the nexthop is unreachable or ineligible.
		 */
		if (p == p->prefix->active)
			prefix_evaluate(NULL, p->prefix);
		prefix_evaluate(p, p->prefix);
	}
}

/* kill a prefix. Only called by path_remove and path_update. */
void
prefix_destroy(struct prefix *p)
{
	struct pt_entry		*pte;

	pte = p->prefix;
	prefix_unlink(p);
	prefix_free(p);

	if (pt_empty(pte))
		pt_remove(pte);
}

/*
 * helper function to clean up the connected networks after a reload
 */
void
prefix_network_clean(struct rde_peer *peer, time_t reloadtime)
{
	struct rde_aspath	*asp, *xasp;
	struct prefix		*p, *xp;
	struct pt_entry		*pte;

	for (asp = LIST_FIRST(&peer->path_h); asp != NULL; asp = xasp) {
		xasp = LIST_NEXT(asp, peer_l);
		for (p = LIST_FIRST(&asp->prefix_h); p != NULL; p = xp) {
			xp = LIST_NEXT(p, path_l);
			if (reloadtime > p->lastchange) {
				pte = p->prefix;
				prefix_unlink(p);
				prefix_free(p);

				if (pt_empty(pte))
					pt_remove(pte);
			}
		}
		if (path_empty(asp))
			path_destroy(asp);
	}
}

/*
 * Link a prefix into the different parent objects.
 */
static void
prefix_link(struct prefix *pref, struct pt_entry *pte, struct rde_aspath *asp,
    u_int32_t flags)
{
	LIST_INSERT_HEAD(&asp->prefix_h, pref, path_l);
	PREFIX_COUNT(asp, flags, 1);
	PREFIX_COUNT(asp->peer, flags, 1);

	pref->aspath = asp;
	pref->prefix = pte;
	pref->lastchange = time(NULL);
	pref->flags = flags;

	/* make route decision */
	prefix_evaluate(pref, pte);
}

/*
 * Unlink a prefix from the different parent objects.
 */
static void
prefix_unlink(struct prefix *pref)
{
	/* make route decision */
	LIST_REMOVE(pref, prefix_l);
	prefix_evaluate(NULL, pref->prefix);

	LIST_REMOVE(pref, path_l);
	PREFIX_COUNT(pref->aspath, pref->flags, -1);
	PREFIX_COUNT(pref->aspath->peer, pref->flags, -1);

	/* destroy all references to other objects */
	pref->aspath = NULL;
	pref->prefix = NULL;

	/*
	 * It's the caller's duty to remove empty rde_aspath and pt_entry
	 * structures. Freeing the unlinked prefix is the caller's duty too.
	 */
}

/* alloc and zero a new entry. May not fail. */
static struct prefix *
prefix_alloc(void)
{
	struct prefix *p;

	p = calloc(1, sizeof(*p));
	if (p == NULL)
		fatal("prefix_alloc");
	rdemem.prefix_cnt++;
	return (p);
}

/* free an unlinked entry */
static void
prefix_free(struct prefix *pref)
{
	rdemem.prefix_cnt--;
	free(pref);
}

/*
 * nexthop functions
 */
struct nexthop_head	*nexthop_hash(struct bgpd_addr *);
struct nexthop		*nexthop_lookup(struct bgpd_addr *);

/*
 * In BGP there exist two nexthops: the exit nexthop, which was announced
 * via BGP, and the true nexthop, which is used in the FIB -- the forwarding
 * information base, a.k.a. the kernel routing table. When sending updates
 * it is even more confusing. In IBGP we pass the unmodified exit nexthop
 * to the neighbors while in EBGP normally the address of the router is
 * sent. The exit nexthop may be passed to the external neighbor if the
 * neighbor and the exit nexthop reside in the same subnet -- directly
 * connected.
 */
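
/*
 * Illustrative example (addresses made up): a peer announces an exit
 * nexthop of 192.0.2.1; if the kernel resolves that via the gateway
 * 10.0.0.1, then 10.0.0.1 is the true nexthop installed in the FIB.
 */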

struct nexthop_table {
	LIST_HEAD(nexthop_head, nexthop)	*nexthop_hashtbl;
	u_int32_t				 nexthop_hashmask;
} nexthoptable;

void
nexthop_init(u_int32_t hashsize)
{
	u_int32_t	 hs, i;

	for (hs = 1; hs < hashsize; hs <<= 1)
		;
	nexthoptable.nexthop_hashtbl = calloc(hs, sizeof(struct nexthop_head));
	if (nexthoptable.nexthop_hashtbl == NULL)
		fatal("nexthop_init");

	for (i = 0; i < hs; i++)
		LIST_INIT(&nexthoptable.nexthop_hashtbl[i]);

	nexthoptable.nexthop_hashmask = hs - 1;
}

void
nexthop_shutdown(void)
{
	u_int32_t		 i;
	struct nexthop		*nh, *nnh;

	for (i = 0; i <= nexthoptable.nexthop_hashmask; i++) {
		for (nh = LIST_FIRST(&nexthoptable.nexthop_hashtbl[i]);
		    nh != NULL; nh = nnh) {
			nnh = LIST_NEXT(nh, nexthop_l);
			nh->state = NEXTHOP_UNREACH;
			(void)nexthop_delete(nh);
		}
		if (!LIST_EMPTY(&nexthoptable.nexthop_hashtbl[i]))
			log_warnx("nexthop_shutdown: non-free table");
	}

	free(nexthoptable.nexthop_hashtbl);
}

void
nexthop_update(struct kroute_nexthop *msg)
{
	struct nexthop		*nh;
	struct rde_aspath	*asp;
	enum nexthop_state	 oldstate;

	nh = nexthop_lookup(&msg->nexthop);
	if (nh == NULL) {
		log_warnx("nexthop_update: non-existent nexthop %s",
		    log_addr(&msg->nexthop));
		return;
	}

	if (nexthop_delete(nh))
		/* nexthop no longer used */
		return;

	oldstate = nh->state;
	if (msg->valid)
		nh->state = NEXTHOP_REACH;
	else
		nh->state = NEXTHOP_UNREACH;

	if (msg->connected) {
		nh->flags |= NEXTHOP_CONNECTED;
		memcpy(&nh->true_nexthop, &nh->exit_nexthop,
		    sizeof(nh->true_nexthop));
	} else
		memcpy(&nh->true_nexthop, &msg->gateway,
		    sizeof(nh->true_nexthop));

	switch (msg->nexthop.af) {
	case AF_INET:
		nh->nexthop_netlen = msg->kr.kr4.prefixlen;
		nh->nexthop_net.af = AF_INET;
		nh->nexthop_net.v4.s_addr = msg->kr.kr4.prefix.s_addr;
		break;
	case AF_INET6:
		nh->nexthop_netlen = msg->kr.kr6.prefixlen;
		nh->nexthop_net.af = AF_INET6;
		memcpy(&nh->nexthop_net.v6, &msg->kr.kr6.prefix,
		    sizeof(struct in6_addr));
		break;
	default:
		fatalx("nexthop_update: unknown af");
	}

	if (rde_noevaluate())
		/*
		 * if the decision process is turned off there is no need
		 * for the aspath list walk.
		 */
		return;

	LIST_FOREACH(asp, &nh->path_h, nexthop_l) {
		prefix_updateall(asp, nh->state, oldstate);
	}
}

void
nexthop_modify(struct rde_aspath *asp, struct bgpd_addr *nexthop,
    enum action_types type, sa_family_t af)
{
	struct nexthop	*nh;

	if (type == ACTION_SET_NEXTHOP_REJECT) {
		asp->flags |= F_NEXTHOP_REJECT;
		return;
	}
	if (type == ACTION_SET_NEXTHOP_BLACKHOLE) {
		asp->flags |= F_NEXTHOP_BLACKHOLE;
		return;
	}
	if (type == ACTION_SET_NEXTHOP_NOMODIFY) {
		asp->flags |= F_NEXTHOP_NOMODIFY;
		return;
	}
	if (type == ACTION_SET_NEXTHOP_SELF) {
		asp->flags |= F_NEXTHOP_SELF;
		return;
	}
	if (af != nexthop->af)
		return;

	nh = nexthop_get(nexthop);
	if (asp->flags & F_ATTR_LINKED)
		nexthop_unlink(asp);
	asp->nexthop = nh;
	if (asp->flags & F_ATTR_LINKED)
		nexthop_link(asp);
}

void
nexthop_link(struct rde_aspath *asp)
{
	if (asp->nexthop == NULL)
		return;

	LIST_INSERT_HEAD(&asp->nexthop->path_h, asp, nexthop_l);
}

void
nexthop_unlink(struct rde_aspath *asp)
{
	struct nexthop	*nh;

	if (asp->nexthop == NULL)
		return;

	LIST_REMOVE(asp, nexthop_l);

	/* see if list is empty */
	nh = asp->nexthop;
	asp->nexthop = NULL;

	(void)nexthop_delete(nh);
}

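/*
 * Free the nexthop if it is no longer referenced; returns 1 if it was
 * deleted and 0 if it is still in use or must be kept around.
 */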
948 nexthop_delete(struct nexthop *nh)
949 {
950 	/* nexthop still used by some other aspath */
951 	if (!LIST_EMPTY(&nh->path_h))
952 		return (0);
953 
954 	/* either pinned or in a state where it may not be deleted */
955 	if (nh->refcnt > 0 || nh->state == NEXTHOP_LOOKUP)
956 		return (0);
957 
958 	LIST_REMOVE(nh, nexthop_l);
959 	rde_send_nexthop(&nh->exit_nexthop, 0);
960 
961 	rdemem.nexthop_cnt--;
962 	free(nh);
963 	return (1);
964 }
965 
966 struct nexthop *
967 nexthop_get(struct bgpd_addr *nexthop)
968 {
969 	struct nexthop	*nh;
970 
971 	nh = nexthop_lookup(nexthop);
972 	if (nh == NULL) {
973 		nh = calloc(1, sizeof(*nh));
974 		if (nh == NULL)
975 			fatal("nexthop_alloc");
976 		rdemem.nexthop_cnt++;
977 
978 		LIST_INIT(&nh->path_h);
979 		nh->state = NEXTHOP_LOOKUP;
980 		nh->exit_nexthop = *nexthop;
981 		LIST_INSERT_HEAD(nexthop_hash(nexthop), nh,
982 		    nexthop_l);
983 
984 		rde_send_nexthop(&nh->exit_nexthop, 1);
985 	}
986 
987 	return (nh);
988 }
989 
990 int
991 nexthop_compare(struct nexthop *na, struct nexthop *nb)
992 {
993 	struct bgpd_addr	*a, *b;
994 
995 	if (na == nb)
996 		return (0);
997 	if (na == NULL)
998 		return (-1);
999 	if (nb == NULL)
1000 		return (1);
1001 
1002 	a = &na->exit_nexthop;
1003 	b = &nb->exit_nexthop;
1004 
1005 	if (a->af != b->af)
1006 		return (a->af - b->af);
1007 
1008 	switch (a->af) {
1009 	case AF_INET:
1010 		if (ntohl(a->v4.s_addr) > ntohl(b->v4.s_addr))
1011 			return (1);
1012 		if (ntohl(a->v4.s_addr) < ntohl(b->v4.s_addr))
1013 			return (-1);
1014 		return (0);
1015 	case AF_INET6:
1016 		return (memcmp(&a->v6, &b->v6, sizeof(struct in6_addr)));
1017 	default:
1018 		fatalx("nexthop_cmp: unknown af");
1019 	}
1020 	return (-1);
1021 }
1022 
1023 struct nexthop *
1024 nexthop_lookup(struct bgpd_addr *nexthop)
1025 {
1026 	struct nexthop	*nh;
1027 
1028 	LIST_FOREACH(nh, nexthop_hash(nexthop), nexthop_l) {
1029 		if (memcmp(&nh->exit_nexthop, nexthop,
1030 		    sizeof(struct bgpd_addr)) == 0)
1031 			return (nh);
1032 	}
1033 	return (NULL);
1034 }
1035 
1036 struct nexthop_head *
1037 nexthop_hash(struct bgpd_addr *nexthop)
1038 {
1039 	u_int32_t	 h = 0;
1040 
1041 	switch (nexthop->af) {
1042 	case AF_INET:
1043 		h = (AF_INET ^ ntohl(nexthop->v4.s_addr) ^
1044 		    ntohl(nexthop->v4.s_addr) >> 13) &
1045 		    nexthoptable.nexthop_hashmask;
1046 		break;
1047 	case AF_INET6:
1048 		h = hash32_buf(nexthop->v6.s6_addr, sizeof(struct in6_addr),
1049 		    HASHINIT) & nexthoptable.nexthop_hashmask;
1050 		break;
1051 	default:
1052 		fatalx("nexthop_hash: unsupported AF");
1053 	}
1054 	return (&nexthoptable.nexthop_hashtbl[h]);
1055 }
1056 
1057