xref: /openbsd-src/usr.sbin/bgpd/rde_decide.c (revision 89ee02f7f35d33f55336ac46ffa1bfe24de4cc54)
1*89ee02f7Sclaudio /*	$OpenBSD: rde_decide.c,v 1.103 2024/08/14 19:09:51 claudio Exp $ */
2a16c0992Shenning 
3a16c0992Shenning /*
4e854144bSclaudio  * Copyright (c) 2003, 2004 Claudio Jeker <claudio@openbsd.org>
5050527e1Shenning  * Copyright (c) 2003, 2004 Henning Brauer <henning@openbsd.org>
6a16c0992Shenning  *
7a16c0992Shenning  * Permission to use, copy, modify, and distribute this software for any
8a16c0992Shenning  * purpose with or without fee is hereby granted, provided that the above
9a16c0992Shenning  * copyright notice and this permission notice appear in all copies.
10a16c0992Shenning  *
11a16c0992Shenning  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12a16c0992Shenning  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13a16c0992Shenning  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14a16c0992Shenning  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15a16c0992Shenning  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16a16c0992Shenning  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17a16c0992Shenning  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18a16c0992Shenning  */
19a16c0992Shenning 
20a16c0992Shenning #include <sys/types.h>
21a16c0992Shenning #include <sys/queue.h>
22a16c0992Shenning 
237f9525e9Sclaudio #include <string.h>
247f9525e9Sclaudio 
25a16c0992Shenning #include "bgpd.h"
26a16c0992Shenning #include "rde.h"
275e3f6f95Sbenno #include "log.h"
287f9525e9Sclaudio 
2942f9861bSclaudio int	prefix_cmp(struct prefix *, struct prefix *, int *);
30b1dddf40Sclaudio void	prefix_set_dmetric(struct prefix *, struct prefix *);
3142f9861bSclaudio void	prefix_insert(struct prefix *, struct prefix *, struct rib_entry *);
3242f9861bSclaudio void	prefix_remove(struct prefix *, struct rib_entry *);
33a16c0992Shenning /*
34a16c0992Shenning  * Decision Engine RFC implementation:
35a16c0992Shenning  *  Phase 1:
36a16c0992Shenning  *   - calculate LOCAL_PREF if needed -- EBGP or IGP learnt routes
37a16c0992Shenning  *   - IBGP routes may either use LOCAL_PREF or the local system computes
38a16c0992Shenning  *     the degree of preference
39a16c0992Shenning  *   - If the route is ineligible, the route MAY NOT serve as an input to
40a16c0992Shenning  *     the next phase of route selection
41a16c0992Shenning  *   - if the route is eligible the computed value MUST be used as the
42a16c0992Shenning  *     LOCAL_PREF value in any IBGP readvertisement
43a16c0992Shenning  *
44a16c0992Shenning  *  Phase 2:
45a16c0992Shenning  *   - If the NEXT_HOP attribute of a BGP route depicts an address that is
46a16c0992Shenning  *     not resolvable the BGP route MUST be excluded from the Phase 2 decision
47a16c0992Shenning  *     function.
48a16c0992Shenning  *   - If the AS_PATH attribute of a BGP route contains an AS loop, the BGP
49a16c0992Shenning  *     route should be excluded from the Phase 2 decision function.
50a16c0992Shenning  *   - The local BGP speaker identifies the route that has:
51a16c0992Shenning  *     a) the highest degree of preference of any route to the same set
52a16c0992Shenning  *        of destinations
53a16c0992Shenning  *     b) is the only route to that destination
54a16c0992Shenning  *     c) is selected as a result of the Phase 2 tie breaking rules
55a16c0992Shenning  *   - The local speaker MUST determine the immediate next-hop address from
56a16c0992Shenning  *     the NEXT_HOP attribute of the selected route.
57a16c0992Shenning  *   - If either the immediate next hop or the IGP cost to the NEXT_HOP changes,
58a16c0992Shenning  *     Phase 2 Route Selection MUST be performed again.
59a16c0992Shenning  *
60a16c0992Shenning  *  Route Resolvability Condition
61a16c0992Shenning  *   - A route Rte1, referencing only the intermediate network address, is
62a16c0992Shenning  *     considered resolvable if the Routing Table contains at least one
63a16c0992Shenning  *     resolvable route Rte2 that matches Rte1's intermediate network address
64a16c0992Shenning  *     and is not recursively resolved through Rte1.
65a16c0992Shenning  *   - Routes referencing interfaces are considered resolvable if the state of
66a16c0992Shenning  *     the referenced interface is up and IP processing is enabled.
67a16c0992Shenning  *
68a16c0992Shenning  *  Breaking Ties (Phase 2)
69a16c0992Shenning  *   1. Remove from consideration all routes which are not tied for having the
70a16c0992Shenning  *      smallest number of AS numbers present in their AS_PATH attributes.
71a16c0992Shenning  *      Note, that when counting this number, an AS_SET counts as 1
72a16c0992Shenning  *   2. Remove from consideration all routes which are not tied for having the
73a16c0992Shenning  *      lowest Origin number in their Origin attribute.
74a16c0992Shenning  *   3. Remove from consideration routes with less-preferred MULTI_EXIT_DISC
75a16c0992Shenning  *      attributes. MULTI_EXIT_DISC is only comparable between routes learned
76a16c0992Shenning  *      from the same neighboring AS.
77a16c0992Shenning  *   4. If at least one of the candidate routes was received via EBGP,
78a16c0992Shenning  *      remove from consideration all routes which were received via IBGP.
79a16c0992Shenning  *   5. Remove from consideration any routes with less-preferred interior cost.
80a16c0992Shenning  *      If the NEXT_HOP hop for a route is reachable, but no cost can be
81a16c0992Shenning  *      determined, then this step should be skipped.
82a16c0992Shenning  *   6. Remove from consideration all routes other than the route that was
83a16c0992Shenning  *      advertised by the BGP speaker whose BGP Identifier has the lowest value.
84a16c0992Shenning  *   7. Prefer the route received from the lowest peer address.
85a16c0992Shenning  *
86a16c0992Shenning  * Phase 3: Route Dissemination
87a16c0992Shenning  *   - All routes in the Loc-RIB are processed into Adj-RIBs-Out according
88a16c0992Shenning  *     to configured policy. A route SHALL NOT be installed in the Adj-Rib-Out
89a16c0992Shenning  *     unless the destination and NEXT_HOP described by this route may be
90a16c0992Shenning  *     forwarded appropriately by the Routing Table.
91a16c0992Shenning  */
92a16c0992Shenning 
93a16c0992Shenning /*
94a16c0992Shenning  * Decision Engine OUR implementation:
9505ebbbf6Sclaudio  * The filtering is done first. The filtering calculates the preference and
9605ebbbf6Sclaudio  * stores it in LOCAL_PREF (Phase 1).
9726904cb3Sclaudio  * Ineligible routes are flagged as ineligible via nexthop_add().
9826904cb3Sclaudio  * Phase 3 is done together with Phase 2.
99a16c0992Shenning  * In following cases a prefix needs to be reevaluated:
100227b4aedSclaudio  *  - update of a prefix (prefix_update)
101227b4aedSclaudio  *  - withdraw of a prefix (prefix_withdraw)
102a16c0992Shenning  *  - state change of the nexthop (nexthop-{in}validate)
103a16c0992Shenning  *  - state change of session (session down)
104a16c0992Shenning  */
105a16c0992Shenning 
106a16c0992Shenning /*
107655dc4c7Sclaudio  * Compare two prefixes with equal pt_entry. Returns an integer greater than or
10804f2231bShenning  * less than 0, according to whether the prefix p1 is more or less preferred
109a16c0992Shenning  * than the prefix p2. p1 should be used for the new prefix and p2 for a
110b1dddf40Sclaudio  * already added prefix. The absolute value returned specifies the similarity
111b1dddf40Sclaudio  * of the prefixes.
112b1dddf40Sclaudio  *   1: prefixes differ because of validity
113b1dddf40Sclaudio  *   2: prefixes don't belong in any multipath set
114b1dddf40Sclaudio  *   3: prefixes belong only in the as-wide multipath set
115b1dddf40Sclaudio  *   4: prefixes belong in both the ecmp and as-wide multipath set
116b1dddf40Sclaudio  *   TODO: maybe we also need a strict ecmp set that requires
117b1dddf40Sclaudio  *   prefixes to e.g. equal ASPATH or equal neighbor-as (like for MED).
118a16c0992Shenning  */
1197f9525e9Sclaudio int
12042f9861bSclaudio prefix_cmp(struct prefix *p1, struct prefix *p2, int *testall)
121a16c0992Shenning {
122a16c0992Shenning 	struct rde_aspath	*asp1, *asp2;
1232b6d2161Sclaudio 	struct rde_peer		*peer1, *peer2;
124e1febd93Sclaudio 	struct attr		*a;
12539386878Sclaudio 	uint32_t		 p1id, p2id;
126652909eeSclaudio 	int			 p1cnt, p2cnt, i;
127b1dddf40Sclaudio 	int			 rv = 1;
128a16c0992Shenning 
12942f9861bSclaudio 	/*
13042f9861bSclaudio 	 * If a match happens before the MED check then the list is
13142f9861bSclaudio 	 * correctly sorted. If a match happens after MED then further
13242f9861bSclaudio 	 * elements may need to be checked to ensure that all paths
13342f9861bSclaudio 	 * which could affect this path were considered. This only
13442f9861bSclaudio 	 * matters for strict MED evaluation and in that case testall
13542f9861bSclaudio 	 * is set to 1. If the check happens to be on the MED check
13642f9861bSclaudio 	 * itself testall is set to 2.
13742f9861bSclaudio 	 */
13842f9861bSclaudio 	*testall = 0;
13942f9861bSclaudio 
140ba0c695bSclaudio 	if (p1 == NULL)
141b1dddf40Sclaudio 		return -rv;
142a16c0992Shenning 	if (p2 == NULL)
143b1dddf40Sclaudio 		return rv;
144a16c0992Shenning 
1452b6d2161Sclaudio 	asp1 = prefix_aspath(p1);
1462b6d2161Sclaudio 	asp2 = prefix_aspath(p2);
1476f9c354eSclaudio 	peer1 = prefix_peer(p1);
1486f9c354eSclaudio 	peer2 = prefix_peer(p2);
14926904cb3Sclaudio 
15013d31ce9Sclaudio 	/* 1. check if prefix is eligible a.k.a reachable */
15113d31ce9Sclaudio 	if (!prefix_eligible(p2))
152b1dddf40Sclaudio 		return rv;
15313d31ce9Sclaudio 	if (!prefix_eligible(p1))
154b1dddf40Sclaudio 		return -rv;
155b1dddf40Sclaudio 
156b1dddf40Sclaudio 	/* bump rv, from here on prefix is considered valid */
157b1dddf40Sclaudio 	rv++;
158a16c0992Shenning 
1595424e8d9Sclaudio 	/* 2. local preference of prefix, bigger is better */
160652909eeSclaudio 	if (asp1->lpref > asp2->lpref)
161b1dddf40Sclaudio 		return rv;
162652909eeSclaudio 	if (asp1->lpref < asp2->lpref)
163b1dddf40Sclaudio 		return -rv;
164a16c0992Shenning 
165a16c0992Shenning 	/* 3. aspath count, the shorter the better */
166b1dddf40Sclaudio 	if (asp1->aspath->ascnt < asp2->aspath->ascnt)
167b1dddf40Sclaudio 		return rv;
168b1dddf40Sclaudio 	if (asp1->aspath->ascnt > asp2->aspath->ascnt)
169b1dddf40Sclaudio 		return -rv;
170a16c0992Shenning 
171a16c0992Shenning 	/* 4. origin, the lower the better */
172b1dddf40Sclaudio 	if (asp1->origin < asp2->origin)
173b1dddf40Sclaudio 		return rv;
174b1dddf40Sclaudio 	if (asp1->origin > asp2->origin)
175b1dddf40Sclaudio 		return -rv;
176a16c0992Shenning 
177652909eeSclaudio 	/*
178652909eeSclaudio 	 * 5. MED decision
179652909eeSclaudio 	 * Only comparable between the same neighboring AS or if
18042f9861bSclaudio 	 * 'rde med compare always' is set. In the first case
18142f9861bSclaudio 	 * set the testall flag since further elements need to be
18242f9861bSclaudio 	 * evaluated as well.
183652909eeSclaudio 	 */
184652909eeSclaudio 	if ((rde_decisionflags() & BGPD_FLAG_DECISION_MED_ALWAYS) ||
185652909eeSclaudio 	    aspath_neighbor(asp1->aspath) == aspath_neighbor(asp2->aspath)) {
18642f9861bSclaudio 		if (!(rde_decisionflags() & BGPD_FLAG_DECISION_MED_ALWAYS))
18742f9861bSclaudio 			*testall = 2;
188dc26f558Sclaudio 		/* lowest value wins */
189652909eeSclaudio 		if (asp1->med < asp2->med)
190b1dddf40Sclaudio 			return rv;
191652909eeSclaudio 		if (asp1->med > asp2->med)
192b1dddf40Sclaudio 			return -rv;
193652909eeSclaudio 	}
194a16c0992Shenning 
19542f9861bSclaudio 	if (!(rde_decisionflags() & BGPD_FLAG_DECISION_MED_ALWAYS))
19642f9861bSclaudio 		*testall = 1;
19742f9861bSclaudio 
198a16c0992Shenning 	/*
199655dc4c7Sclaudio 	 * 6. EBGP is cooler than IBGP
20004f2231bShenning 	 * It is absolutely important that the ebgp value in peer_config.ebgp
201a16c0992Shenning 	 * is bigger than all other ones (IBGP, confederations)
202a16c0992Shenning 	 */
2036f9c354eSclaudio 	if (peer1->conf.ebgp != peer2->conf.ebgp) {
2046f9c354eSclaudio 		if (peer1->conf.ebgp) /* peer1 is EBGP other is lower */
205b1dddf40Sclaudio 			return rv;
2066f9c354eSclaudio 		else if (peer2->conf.ebgp) /* peer2 is EBGP */
207b1dddf40Sclaudio 			return -rv;
208a16c0992Shenning 	}
209a16c0992Shenning 
210b1dddf40Sclaudio 	/* bump rv, as-wide multipath */
211b1dddf40Sclaudio 	rv++;
212b1dddf40Sclaudio 
213aa5d92feSclaudio 	/*
214aa5d92feSclaudio 	 * 7. local tie-breaker, this weight is here to tip equal long AS
215c5508ee4Sclaudio 	 * paths in one or the other direction. It happens more and more
216c5508ee4Sclaudio 	 * that AS paths are equally long and so traffic engineering needs
217aa5d92feSclaudio 	 * a metric that weights a prefix at a very late stage in the
218aa5d92feSclaudio 	 * decision process.
219aa5d92feSclaudio 	 */
220652909eeSclaudio 	if (asp1->weight > asp2->weight)
221b1dddf40Sclaudio 		return rv;
222652909eeSclaudio 	if (asp1->weight < asp2->weight)
223b1dddf40Sclaudio 		return -rv;
224aa5d92feSclaudio 
225aa5d92feSclaudio 	/* 8. nexthop costs. NOT YET -> IGNORE */
226a16c0992Shenning 
227b1dddf40Sclaudio 	/* bump rv, equal cost multipath */
228b1dddf40Sclaudio 	rv++;
229b1dddf40Sclaudio 
23088ca7a8dSclaudio 	/*
231aa5d92feSclaudio 	 * 9. older route (more stable) wins but only if route-age
23288ca7a8dSclaudio 	 * evaluation is enabled.
23388ca7a8dSclaudio 	 */
234652909eeSclaudio 	if (rde_decisionflags() & BGPD_FLAG_DECISION_ROUTEAGE) {
235652909eeSclaudio 		if (p1->lastchange < p2->lastchange) /* p1 is older */
236b1dddf40Sclaudio 			return rv;
237652909eeSclaudio 		if (p1->lastchange > p2->lastchange)
238b1dddf40Sclaudio 			return -rv;
239652909eeSclaudio 	}
240a16c0992Shenning 
241e1febd93Sclaudio 	/* 10. lowest BGP Id wins, use ORIGINATOR_ID if present */
242e1febd93Sclaudio 	if ((a = attr_optget(asp1, ATTR_ORIGINATOR_ID)) != NULL) {
243e1febd93Sclaudio 		memcpy(&p1id, a->data, sizeof(p1id));
244e1febd93Sclaudio 		p1id = ntohl(p1id);
245e1febd93Sclaudio 	} else
2466f9c354eSclaudio 		p1id = peer1->remote_bgpid;
247e1febd93Sclaudio 	if ((a = attr_optget(asp2, ATTR_ORIGINATOR_ID)) != NULL) {
248e1febd93Sclaudio 		memcpy(&p2id, a->data, sizeof(p2id));
249e1febd93Sclaudio 		p2id = ntohl(p2id);
250e1febd93Sclaudio 	} else
2516f9c354eSclaudio 		p2id = peer2->remote_bgpid;
252652909eeSclaudio 	if (p1id < p2id)
253b1dddf40Sclaudio 		return rv;
254652909eeSclaudio 	if (p1id > p2id)
255b1dddf40Sclaudio 		return -rv;
256933012f8Shenning 
257e1febd93Sclaudio 	/* 11. compare CLUSTER_LIST length, shorter is better */
258e1febd93Sclaudio 	p1cnt = p2cnt = 0;
259e1febd93Sclaudio 	if ((a = attr_optget(asp1, ATTR_CLUSTER_LIST)) != NULL)
26039386878Sclaudio 		p1cnt = a->len / sizeof(uint32_t);
261e1febd93Sclaudio 	if ((a = attr_optget(asp2, ATTR_CLUSTER_LIST)) != NULL)
26239386878Sclaudio 		p2cnt = a->len / sizeof(uint32_t);
263b1dddf40Sclaudio 	if (p1cnt < p2cnt)
264b1dddf40Sclaudio 		return rv;
265b1dddf40Sclaudio 	if (p1cnt > p2cnt)
266b1dddf40Sclaudio 		return -rv;
267e1febd93Sclaudio 
268e1febd93Sclaudio 	/* 12. lowest peer address wins (IPv4 is better than IPv6) */
269652909eeSclaudio 	if (peer1->remote_addr.aid < peer2->remote_addr.aid)
270b1dddf40Sclaudio 		return rv;
271652909eeSclaudio 	if (peer1->remote_addr.aid > peer2->remote_addr.aid)
272b1dddf40Sclaudio 		return -rv;
273652909eeSclaudio 	switch (peer1->remote_addr.aid) {
274652909eeSclaudio 	case AID_INET:
275652909eeSclaudio 		i = memcmp(&peer1->remote_addr.v4, &peer2->remote_addr.v4,
276652909eeSclaudio 		    sizeof(struct in_addr));
277652909eeSclaudio 		break;
278652909eeSclaudio 	case AID_INET6:
279652909eeSclaudio 		i = memcmp(&peer1->remote_addr.v6, &peer2->remote_addr.v6,
280652909eeSclaudio 		    sizeof(struct in6_addr));
281652909eeSclaudio 		break;
282652909eeSclaudio 	default:
283652909eeSclaudio 		fatalx("%s: unknown af", __func__);
284652909eeSclaudio 	}
285652909eeSclaudio 	if (i < 0)
286b1dddf40Sclaudio 		return rv;
287652909eeSclaudio 	if (i > 0)
288b1dddf40Sclaudio 		return -rv;
289a16c0992Shenning 
290591142c4Sclaudio 	/* RFC7911 does not specify this but something like this is needed. */
29129b527fbSclaudio 	/* 13. lowest path identifier wins */
29229b527fbSclaudio 	if (p1->path_id < p2->path_id)
293b1dddf40Sclaudio 		return rv;
29429b527fbSclaudio 	if (p1->path_id > p2->path_id)
295b1dddf40Sclaudio 		return -rv;
29629b527fbSclaudio 
297f87a5020Shenning 	fatalx("Uh, oh a politician in the decision process");
298a16c0992Shenning }
299a16c0992Shenning 
30005ebbbf6Sclaudio /*
301b1dddf40Sclaudio  * set the dmetric value of np based on the return value of
302b1dddf40Sclaudio  * prefix_evaluate(pp, np) or set it to either PREFIX_DMETRIC_BEST
303b1dddf40Sclaudio  * or PREFIX_DMETRIC_INVALID for the first element.
304b1dddf40Sclaudio  */
305b1dddf40Sclaudio void
306b1dddf40Sclaudio prefix_set_dmetric(struct prefix *pp, struct prefix *np)
307b1dddf40Sclaudio {
308b1dddf40Sclaudio 	int testall;
309b1dddf40Sclaudio 
310b1dddf40Sclaudio 	if (np != NULL) {
311b1dddf40Sclaudio 		if (pp == NULL)
312b1dddf40Sclaudio 			np->dmetric = prefix_eligible(np) ?
313b1dddf40Sclaudio 			    PREFIX_DMETRIC_BEST : PREFIX_DMETRIC_INVALID;
314b1dddf40Sclaudio 		else
315b1dddf40Sclaudio 			np->dmetric = prefix_cmp(pp, np, &testall);
316a485c7fbSclaudio 		if (np->dmetric < 0) {
317a485c7fbSclaudio 			struct bgpd_addr addr;
318a485c7fbSclaudio 			pt_getaddr(np->pt, &addr);
319a485c7fbSclaudio 			log_debug("bad dmetric in decision process: %s/%u",
320a485c7fbSclaudio 			    log_addr(&addr), np->pt->prefixlen);
321a485c7fbSclaudio 		}
322b1dddf40Sclaudio 	}
323b1dddf40Sclaudio }
324b1dddf40Sclaudio 
325b1dddf40Sclaudio /*
32605ebbbf6Sclaudio  * Insert a prefix keeping the total order of the list. For routes
32705ebbbf6Sclaudio  * that may depend on a MED selection the set is scanned until the
32805ebbbf6Sclaudio  * condition is cleared. If a MED inversion is detected the respective
32905ebbbf6Sclaudio  * prefix is taken of the rib list and put onto a redo queue. All
33005ebbbf6Sclaudio  * prefixes on the redo queue are re-inserted at the end.
33105ebbbf6Sclaudio  */
33242f9861bSclaudio void
33342f9861bSclaudio prefix_insert(struct prefix *new, struct prefix *ep, struct rib_entry *re)
33442f9861bSclaudio {
335df08e9a0Sclaudio 	struct prefix_queue redo = TAILQ_HEAD_INITIALIZER(redo);
336df08e9a0Sclaudio 	struct prefix *xp, *np, *insertp = ep;
3371d5ff589Sclaudio 	int testall, preferred, selected = 0, removed = 0;
33842f9861bSclaudio 
339591142c4Sclaudio 	/* start scan at the entry point (ep) or the head if ep == NULL */
34042f9861bSclaudio 	if (ep == NULL)
341df08e9a0Sclaudio 		ep = TAILQ_FIRST(&re->prefix_h);
34242f9861bSclaudio 
34342f9861bSclaudio 	for (xp = ep; xp != NULL; xp = np) {
344df08e9a0Sclaudio 		np = TAILQ_NEXT(xp, entry.list.rib);
34542f9861bSclaudio 
3461d5ff589Sclaudio 		if ((preferred = (prefix_cmp(new, xp, &testall) > 0))) {
34742f9861bSclaudio 			/* new is preferred over xp */
3481d5ff589Sclaudio 			if (testall == 2) {
34942f9861bSclaudio 				/*
35042f9861bSclaudio 				 * MED inversion, take out prefix and
35142f9861bSclaudio 				 * put it onto redo queue.
35242f9861bSclaudio 				 */
353df08e9a0Sclaudio 				TAILQ_REMOVE(&re->prefix_h, xp, entry.list.rib);
354df08e9a0Sclaudio 				TAILQ_INSERT_TAIL(&redo, xp, entry.list.rib);
3551d5ff589Sclaudio 				removed = 1;
3561d5ff589Sclaudio 				continue;
3571d5ff589Sclaudio 			}
3581d5ff589Sclaudio 
3591d5ff589Sclaudio 			if (testall == 1) {
36042f9861bSclaudio 				/*
36142f9861bSclaudio 				 * lock insertion point and
36242f9861bSclaudio 				 * continue on with scan
36342f9861bSclaudio 				 */
36442f9861bSclaudio 				selected = 1;
36542f9861bSclaudio 			}
36642f9861bSclaudio 		} else {
36742f9861bSclaudio 			/*
368591142c4Sclaudio 			 * xp is preferred over new.
369591142c4Sclaudio 			 * Remember insertion point for later unless the
370591142c4Sclaudio 			 * traverse is just looking for a possible MED
371591142c4Sclaudio 			 * inversion (selected == 1).
372591142c4Sclaudio 			 * If the last comparison's tie-breaker was the MED
373591142c4Sclaudio 			 * check reset selected and with it insertp since
374591142c4Sclaudio 			 * this was an actual MED priority inversion.
37542f9861bSclaudio 			 */
37642f9861bSclaudio 			if (testall == 2)
37742f9861bSclaudio 				selected = 0;
37842f9861bSclaudio 			if (!selected)
37943888d92Sclaudio 				insertp = xp;
38042f9861bSclaudio 		}
3811d5ff589Sclaudio 
3821d5ff589Sclaudio 		/*
3831d5ff589Sclaudio 		 * If previous element(s) got removed, fixup the
3841d5ff589Sclaudio 		 * dmetric, now that it is clear that this element
3851d5ff589Sclaudio 		 * is on the list.
3861d5ff589Sclaudio 		 */
3871d5ff589Sclaudio 		if (removed) {
3881d5ff589Sclaudio 			prefix_set_dmetric(TAILQ_PREV(xp, prefix_queue,
3891d5ff589Sclaudio 			    entry.list.rib), xp);
3901d5ff589Sclaudio 			removed = 0;
3911d5ff589Sclaudio 		}
3921d5ff589Sclaudio 
3931d5ff589Sclaudio 		if (preferred && testall == 0)
3941d5ff589Sclaudio 			break;			/* we're done */
39542f9861bSclaudio 	}
39642f9861bSclaudio 
397b1dddf40Sclaudio 	if (insertp == NULL) {
398df08e9a0Sclaudio 		TAILQ_INSERT_HEAD(&re->prefix_h, new, entry.list.rib);
399b1dddf40Sclaudio 	} else {
400df08e9a0Sclaudio 		TAILQ_INSERT_AFTER(&re->prefix_h, insertp, new, entry.list.rib);
401b1dddf40Sclaudio 	}
402b1dddf40Sclaudio 
403b1dddf40Sclaudio 	prefix_set_dmetric(insertp, new);
404b1dddf40Sclaudio 	prefix_set_dmetric(new, TAILQ_NEXT(new, entry.list.rib));
40542f9861bSclaudio 
40642f9861bSclaudio 	/* Fixup MED order again. All elements are < new */
407df08e9a0Sclaudio 	while (!TAILQ_EMPTY(&redo)) {
408df08e9a0Sclaudio 		xp = TAILQ_FIRST(&redo);
409df08e9a0Sclaudio 		TAILQ_REMOVE(&redo, xp, entry.list.rib);
41042f9861bSclaudio 
41142f9861bSclaudio 		prefix_insert(xp, new, re);
41242f9861bSclaudio 	}
41342f9861bSclaudio }
41442f9861bSclaudio 
41505ebbbf6Sclaudio /*
41605ebbbf6Sclaudio  * Remove a prefix from the RIB list ensuring that the total order of the
41705ebbbf6Sclaudio  * list remains intact. All routes that differ in the MED are taken of the
41805ebbbf6Sclaudio  * list and put on the redo list. To figure out if a route could cause a
41905ebbbf6Sclaudio  * resort because of a MED check the next prefix of the to-remove prefix
42005ebbbf6Sclaudio  * is compared with the old prefix. A full scan is only done if the next
42105ebbbf6Sclaudio  * route differs because of the MED or later checks.
42205ebbbf6Sclaudio  * Again at the end all routes on the redo queue are reinserted.
42305ebbbf6Sclaudio  */
42442f9861bSclaudio void
42542f9861bSclaudio prefix_remove(struct prefix *old, struct rib_entry *re)
42642f9861bSclaudio {
427df08e9a0Sclaudio 	struct prefix_queue redo = TAILQ_HEAD_INITIALIZER(redo);
4281d5ff589Sclaudio 	struct prefix *xp, *np, *pp;
4291d5ff589Sclaudio 	int testall, removed = 0;
43042f9861bSclaudio 
431df08e9a0Sclaudio 	xp = TAILQ_NEXT(old, entry.list.rib);
4321d5ff589Sclaudio 	pp = TAILQ_PREV(old, prefix_queue, entry.list.rib);
433df08e9a0Sclaudio 	TAILQ_REMOVE(&re->prefix_h, old, entry.list.rib);
434b1dddf40Sclaudio 
43542f9861bSclaudio 	/* check if a MED inversion could be possible */
43642f9861bSclaudio 	prefix_cmp(old, xp, &testall);
43742f9861bSclaudio 	if (testall > 0) {
43842f9861bSclaudio 		/* maybe MED route, scan tail for other possible routes */
43942f9861bSclaudio 		for (; xp != NULL; xp = np) {
440df08e9a0Sclaudio 			np = TAILQ_NEXT(xp, entry.list.rib);
44142f9861bSclaudio 
44242f9861bSclaudio 			/* only interested in the testall result */
44342f9861bSclaudio 			prefix_cmp(old, xp, &testall);
4441d5ff589Sclaudio 			if (testall == 2) {
44542f9861bSclaudio 				/*
44642f9861bSclaudio 				 * possible MED inversion, take out prefix and
44742f9861bSclaudio 				 * put it onto redo queue.
44842f9861bSclaudio 				 */
449df08e9a0Sclaudio 				TAILQ_REMOVE(&re->prefix_h, xp, entry.list.rib);
450df08e9a0Sclaudio 				TAILQ_INSERT_TAIL(&redo, xp, entry.list.rib);
4511d5ff589Sclaudio 				removed = 1;
4521d5ff589Sclaudio 				continue;
4531d5ff589Sclaudio 			}
4541d5ff589Sclaudio 			/*
4551d5ff589Sclaudio 			 * If previous element(s) got removed, fixup the
4561d5ff589Sclaudio 			 * dmetric, now that it is clear that this element
4571d5ff589Sclaudio 			 * is on the list.
4581d5ff589Sclaudio 			 */
4591d5ff589Sclaudio 			if (removed) {
4601d5ff589Sclaudio 				prefix_set_dmetric(TAILQ_PREV(xp, prefix_queue,
4611d5ff589Sclaudio 				    entry.list.rib), xp);
4621d5ff589Sclaudio 				removed = 0;
4631d5ff589Sclaudio 			}
4641d5ff589Sclaudio 			if (testall == 0)
4651d5ff589Sclaudio 				break;		/* we're done */
46642f9861bSclaudio 		}
46742f9861bSclaudio 	}
4681d5ff589Sclaudio 
4691d5ff589Sclaudio 	if (pp)
4701d5ff589Sclaudio 		prefix_set_dmetric(pp, TAILQ_NEXT(pp, entry.list.rib));
4711d5ff589Sclaudio 	else
4721d5ff589Sclaudio 		prefix_set_dmetric(NULL, TAILQ_FIRST(&re->prefix_h));
47342f9861bSclaudio 
47442f9861bSclaudio 	/* Fixup MED order again, reinsert prefixes from the start */
475df08e9a0Sclaudio 	while (!TAILQ_EMPTY(&redo)) {
476df08e9a0Sclaudio 		xp = TAILQ_FIRST(&redo);
477df08e9a0Sclaudio 		TAILQ_REMOVE(&redo, xp, entry.list.rib);
47842f9861bSclaudio 
47942f9861bSclaudio 		prefix_insert(xp, NULL, re);
48042f9861bSclaudio 	}
48142f9861bSclaudio }
48242f9861bSclaudio 
48305ebbbf6Sclaudio /* helper function to check if a prefix is valid to be selected */
48405ebbbf6Sclaudio int
48505ebbbf6Sclaudio prefix_eligible(struct prefix *p)
48605ebbbf6Sclaudio {
48705ebbbf6Sclaudio 	struct rde_aspath *asp = prefix_aspath(p);
48805ebbbf6Sclaudio 
489*89ee02f7Sclaudio 	/* prefix itself is marked ineligible */
490*89ee02f7Sclaudio 	if (prefix_filtered(p))
491*89ee02f7Sclaudio 		return 0;
492*89ee02f7Sclaudio 
49305ebbbf6Sclaudio 	/* The aspath needs to be loop and error free */
494372bb3aaSclaudio 	if (asp == NULL ||
495f337fe2fSclaudio 	    asp->flags & (F_ATTR_LOOP|F_ATTR_OTC_LEAK|F_ATTR_PARSE_ERR))
49605ebbbf6Sclaudio 		return 0;
49713d31ce9Sclaudio 
49813d31ce9Sclaudio 	/* The nexthop must be valid. */
49913d31ce9Sclaudio 	if (!prefix_nhvalid(p))
50005ebbbf6Sclaudio 		return 0;
50105ebbbf6Sclaudio 
50205ebbbf6Sclaudio 	return 1;
50305ebbbf6Sclaudio }
50405ebbbf6Sclaudio 
5058bf5540bSclaudio struct prefix *
5068bf5540bSclaudio prefix_best(struct rib_entry *re)
5078bf5540bSclaudio {
5088bf5540bSclaudio 	struct prefix	*xp;
5098bf5540bSclaudio 	struct rib	*rib;
5108bf5540bSclaudio 
5118bf5540bSclaudio 	rib = re_rib(re);
5128bf5540bSclaudio 	if (rib->flags & F_RIB_NOEVALUATE)
5138bf5540bSclaudio 		/* decision process is turned off */
5148bf5540bSclaudio 		return NULL;
5158bf5540bSclaudio 
516df08e9a0Sclaudio 	xp = TAILQ_FIRST(&re->prefix_h);
5178bf5540bSclaudio 	if (xp != NULL && !prefix_eligible(xp))
5188bf5540bSclaudio 		xp = NULL;
5198bf5540bSclaudio 	return xp;
5208bf5540bSclaudio }
5218bf5540bSclaudio 
522a16c0992Shenning /*
523a16c0992Shenning  * Find the correct place to insert the prefix in the prefix list.
5249f1086bfSclaudio  * If the active prefix has changed we need to send an update also special
5259f1086bfSclaudio  * treatment is needed if 'rde evaluate all' is used on some peers.
5269f1086bfSclaudio  * To re-evaluate a prefix just call prefix_evaluate with old and new pointing
5279f1086bfSclaudio  * to the same prefix.
528a16c0992Shenning  */
529a16c0992Shenning void
530d884cecfSclaudio prefix_evaluate(struct rib_entry *re, struct prefix *new, struct prefix *old)
531a16c0992Shenning {
532f6e85e58Sclaudio 	struct prefix	*newbest, *oldbest;
533c042b633Sclaudio 	struct rib	*rib;
534a16c0992Shenning 
535c042b633Sclaudio 	rib = re_rib(re);
536c042b633Sclaudio 	if (rib->flags & F_RIB_NOEVALUATE) {
537e4020a54Sclaudio 		/* decision process is turned off */
538d884cecfSclaudio 		if (old != NULL)
539df08e9a0Sclaudio 			TAILQ_REMOVE(&re->prefix_h, old, entry.list.rib);
540b1dddf40Sclaudio 		if (new != NULL) {
541df08e9a0Sclaudio 			TAILQ_INSERT_HEAD(&re->prefix_h, new, entry.list.rib);
542b1dddf40Sclaudio 			new->dmetric = PREFIX_DMETRIC_INVALID;
543b1dddf40Sclaudio 		}
544e4020a54Sclaudio 		return;
545e4020a54Sclaudio 	}
546e4020a54Sclaudio 
547f6e85e58Sclaudio 	oldbest = prefix_best(re);
548d884cecfSclaudio 	if (old != NULL)
54942f9861bSclaudio 		prefix_remove(old, re);
55042f9861bSclaudio 	if (new != NULL)
55142f9861bSclaudio 		prefix_insert(new, NULL, re);
55213d31ce9Sclaudio 	newbest = prefix_best(re);
553d70d731bSclaudio 
554a16c0992Shenning 	/*
555d884cecfSclaudio 	 * If the active prefix changed or the active prefix was removed
556d884cecfSclaudio 	 * and added again then generate an update.
557d884cecfSclaudio 	 */
558f6e85e58Sclaudio 	if (oldbest != newbest || (old != NULL && newbest == old)) {
559d884cecfSclaudio 		/*
560f6e85e58Sclaudio 		 * Send update withdrawing oldbest and adding newbest
561f6e85e58Sclaudio 		 * but remember that newbest may be NULL aka ineligible.
5621207958aSclaudio 		 * Additional decision may be made by the called functions.
563a16c0992Shenning 		 */
564c042b633Sclaudio 		if ((rib->flags & F_RIB_NOFIB) == 0)
565f6e85e58Sclaudio 			rde_send_kroute(rib, newbest, oldbest);
566988ba0baSclaudio 		rde_generate_updates(re, new, old, EVAL_DEFAULT);
56705ebbbf6Sclaudio 		return;
568a16c0992Shenning 	}
56905ebbbf6Sclaudio 
57005ebbbf6Sclaudio 	/*
57105ebbbf6Sclaudio 	 * If there are peers with 'rde evaluate all' every update needs
57205ebbbf6Sclaudio 	 * to be passed on (not only a change of the best prefix).
57305ebbbf6Sclaudio 	 * rde_generate_updates() will then take care of distribution.
57405ebbbf6Sclaudio 	 */
575bcdda550Sclaudio 	if (rde_evaluate_all()) {
576bcdda550Sclaudio 		if (new != NULL && !prefix_eligible(new))
577bcdda550Sclaudio 			new = NULL;
578bcdda550Sclaudio 		if (new != NULL || old != NULL)
579988ba0baSclaudio 			rde_generate_updates(re, new, old, EVAL_ALL);
580a16c0992Shenning 	}
581bcdda550Sclaudio }
58213d31ce9Sclaudio 
58313d31ce9Sclaudio void
58413d31ce9Sclaudio prefix_evaluate_nexthop(struct prefix *p, enum nexthop_state state,
58513d31ce9Sclaudio     enum nexthop_state oldstate)
58613d31ce9Sclaudio {
58713d31ce9Sclaudio 	struct rib_entry *re = prefix_re(p);
588bcdda550Sclaudio 	struct prefix	*newbest, *oldbest, *new, *old;
58913d31ce9Sclaudio 	struct rib	*rib;
59013d31ce9Sclaudio 
59113d31ce9Sclaudio 	/* Skip non local-RIBs or RIBs that are flagged as noeval. */
59213d31ce9Sclaudio 	rib = re_rib(re);
59313d31ce9Sclaudio 	if (rib->flags & F_RIB_NOEVALUATE) {
59413d31ce9Sclaudio 		log_warnx("%s: prefix with F_RIB_NOEVALUATE hit", __func__);
59513d31ce9Sclaudio 		return;
59613d31ce9Sclaudio 	}
59713d31ce9Sclaudio 
59813d31ce9Sclaudio 	if (oldstate == state) {
59913d31ce9Sclaudio 		/*
60013d31ce9Sclaudio 		 * The state of the nexthop did not change. The only
60113d31ce9Sclaudio 		 * thing that may have changed is the true_nexthop
60213d31ce9Sclaudio 		 * or other internal infos. This will not change
60313d31ce9Sclaudio 		 * the routing decision so shortcut here.
60413d31ce9Sclaudio 		 * XXX needs to be changed for ECMP
60513d31ce9Sclaudio 		 */
60613d31ce9Sclaudio 		if (state == NEXTHOP_REACH) {
60713d31ce9Sclaudio 			if ((rib->flags & F_RIB_NOFIB) == 0 &&
60813d31ce9Sclaudio 			    p == prefix_best(re))
60913d31ce9Sclaudio 				rde_send_kroute(rib, p, NULL);
61013d31ce9Sclaudio 		}
61113d31ce9Sclaudio 		return;
61213d31ce9Sclaudio 	}
61313d31ce9Sclaudio 
61413d31ce9Sclaudio 	/*
61513d31ce9Sclaudio 	 * Re-evaluate the prefix by removing the prefix then updating the
61613d31ce9Sclaudio 	 * nexthop state and reinserting the prefix again.
61713d31ce9Sclaudio 	 */
618bcdda550Sclaudio 	old = p;
61913d31ce9Sclaudio 	oldbest = prefix_best(re);
62013d31ce9Sclaudio 	prefix_remove(p, re);
62113d31ce9Sclaudio 
62213d31ce9Sclaudio 	if (state == NEXTHOP_REACH)
62313d31ce9Sclaudio 		p->nhflags |= NEXTHOP_VALID;
62413d31ce9Sclaudio 	else
62513d31ce9Sclaudio 		p->nhflags &= ~NEXTHOP_VALID;
62613d31ce9Sclaudio 
62713d31ce9Sclaudio 	prefix_insert(p, NULL, re);
62813d31ce9Sclaudio 	newbest = prefix_best(re);
629bcdda550Sclaudio 	new = p;
630bcdda550Sclaudio 	if (!prefix_eligible(new))
631bcdda550Sclaudio 		new = NULL;
63213d31ce9Sclaudio 
63313d31ce9Sclaudio 	/*
63413d31ce9Sclaudio 	 * If the active prefix changed or the active prefix was removed
63513d31ce9Sclaudio 	 * and added again then generate an update.
63613d31ce9Sclaudio 	 */
63713d31ce9Sclaudio 	if (oldbest != newbest || newbest == p) {
63813d31ce9Sclaudio 		/*
63913d31ce9Sclaudio 		 * Send update withdrawing oldbest and adding newbest
64013d31ce9Sclaudio 		 * but remember that newbest may be NULL aka ineligible.
64113d31ce9Sclaudio 		 * Additional decision may be made by the called functions.
64213d31ce9Sclaudio 		 */
64313d31ce9Sclaudio 		if ((rib->flags & F_RIB_NOFIB) == 0)
64413d31ce9Sclaudio 			rde_send_kroute(rib, newbest, oldbest);
645bcdda550Sclaudio 		rde_generate_updates(re, new, old, EVAL_DEFAULT);
64613d31ce9Sclaudio 		return;
64713d31ce9Sclaudio 	}
64813d31ce9Sclaudio 
64913d31ce9Sclaudio 	/*
65013d31ce9Sclaudio 	 * If there are peers with 'rde evaluate all' every update needs
65113d31ce9Sclaudio 	 * to be passed on (not only a change of the best prefix).
65213d31ce9Sclaudio 	 * rde_generate_updates() will then take care of distribution.
65313d31ce9Sclaudio 	 */
65413d31ce9Sclaudio 	if (rde_evaluate_all())
655bcdda550Sclaudio 		rde_generate_updates(re, new, old, EVAL_ALL);
65613d31ce9Sclaudio }
657