xref: /openbsd-src/usr.sbin/bgpd/rde_decide.c (revision a28daedfc357b214be5c701aa8ba8adb29a7f1c2)
1 /*	$OpenBSD: rde_decide.c,v 1.52 2009/04/23 19:23:27 claudio Exp $ */
2 
3 /*
4  * Copyright (c) 2003, 2004 Claudio Jeker <claudio@openbsd.org>
5  * Copyright (c) 2003, 2004 Henning Brauer <henning@openbsd.org>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  */
19 
20 #include <sys/types.h>
21 #include <sys/queue.h>
22 
23 #include <string.h>
24 
25 #include "bgpd.h"
26 #include "rde.h"
27 
28 int	prefix_cmp(struct prefix *, struct prefix *);
29 /*
30  * Decision Engine RFC implementation:
31  *  Phase 1:
32  *   - calculate LOCAL_PREF if needed -- EBGP or IGP learnt routes
33  *   - IBGP routes may either use LOCAL_PREF or the local system computes
34  *     the degree of preference
35  *   - If the route is ineligible, the route MAY NOT serve as an input to
36  *     the next phase of route selection
37  *   - if the route is eligible the computed value MUST be used as the
38  *     LOCAL_PREF value in any IBGP readvertisement
39  *
40  *  Phase 2:
41  *   - If the NEXT_HOP attribute of a BGP route depicts an address that is
42  *     not resolvable the BGP route MUST be excluded from the Phase 2 decision
43  *     function.
44  *   - If the AS_PATH attribute of a BGP route contains an AS loop, the BGP
45  *     route should be excluded from the Phase 2 decision function.
46  *   - The local BGP speaker identifies the route that has:
47  *     a) the highest degree of preference of any route to the same set
48  *        of destinations
49  *     b) is the only route to that destination
50  *     c) is selected as a result of the Phase 2 tie breaking rules
51  *   - The local speaker MUST determine the immediate next-hop address from
52  *     the NEXT_HOP attribute of the selected route.
53  *   - If either the immediate next hop or the IGP cost to the NEXT_HOP changes,
54  *     Phase 2 Route Selection MUST be performed again.
55  *
56  *  Route Resolvability Condition
57  *   - A route Rte1, referencing only the intermediate network address, is
58  *     considered resolvable if the Routing Table contains at least one
59  *     resolvable route Rte2 that matches Rte1's intermediate network address
60  *     and is not recursively resolved through Rte1.
61  *   - Routes referencing interfaces are considered resolvable if the state of
62  *     the referenced interface is up and IP processing is enabled.
63  *
64  *  Breaking Ties (Phase 2)
65  *   1. Remove from consideration all routes which are not tied for having the
66  *      smallest number of AS numbers present in their AS_PATH attributes.
67  *      Note, that when counting this number, an AS_SET counts as 1
68  *   2. Remove from consideration all routes which are not tied for having the
69  *      lowest Origin number in their Origin attribute.
70  *   3. Remove from consideration routes with less-preferred MULTI_EXIT_DISC
71  *      attributes. MULTI_EXIT_DISC is only comparable between routes learned
72  *      from the same neighboring AS.
73  *   4. If at least one of the candidate routes was received via EBGP,
74  *      remove from consideration all routes which were received via IBGP.
75  *   5. Remove from consideration any routes with less-preferred interior cost.
76  *      If the NEXT_HOP hop for a route is reachable, but no cost can be
77  *      determined, then this step should be skipped.
78  *   6. Remove from consideration all routes other than the route that was
79  *      advertised by the BGP speaker whose BGP Identifier has the lowest value.
80  *   7. Prefer the route received from the lowest peer address.
81  *
82  * Phase 3: Route Dissemination
83  *   - All routes in the Loc-RIB are processed into Adj-RIBs-Out according
84  *     to configured policy. A route SHALL NOT be installed in the Adj-Rib-Out
85  *     unless the destination and NEXT_HOP described by this route may be
86  *     forwarded appropriately by the Routing Table.
87  */
88 
89 /*
90  * Decision Engine OUR implementation:
91  * Our implementation has only one RIB. The filtering is done first. The
92  * filtering calculates the preference and stores it in LOCAL_PREF (Phase 1).
93  * Ineligible routes are flagged as ineligible via nexthop_add().
94  * Phase 3 is done together with Phase 2.
95  * In following cases a prefix needs to be reevaluated:
96  *  - update of a prefix (path_update)
97  *  - withdraw of a prefix (prefix_remove)
98  *  - state change of the nexthop (nexthop-{in}validate)
99  *  - state change of session (session down)
100  */
101 
102 /*
103  * Compare two prefixes with equal pt_entry. Returns an integer greater than or
104  * less than 0, according to whether the prefix p1 is more or less preferred
105  * than the prefix p2. p1 should be used for the new prefix and p2 for a
106  * already added prefix.
107  */
108 int
109 prefix_cmp(struct prefix *p1, struct prefix *p2)
110 {
111 	struct rde_aspath	*asp1, *asp2;
112 
113 	if (p1 == NULL)
114 		return (-1);
115 	if (p2 == NULL)
116 		return (1);
117 
118 	/* only prefixes in the Local-RIB are eligible */
119 	if (!(p1->flags & F_LOCAL))
120 		return (-1);
121 	if (!(p2->flags & F_LOCAL))
122 		return (1);
123 
124 	asp1 = p1->aspath;
125 	asp2 = p2->aspath;
126 
127 	/* only loop free pathes are eligible */
128 	if (asp1->flags & F_ATTR_LOOP)
129 		return (-1);
130 	if (asp2->flags & F_ATTR_LOOP)
131 		return (1);
132 
133 	/* 1. check if prefix is eligible a.k.a reachable */
134 	if (asp2->nexthop != NULL && asp2->nexthop->state != NEXTHOP_REACH)
135 		return (1);
136 	if (asp1->nexthop != NULL && asp1->nexthop->state != NEXTHOP_REACH)
137 		return (-1);
138 
139 	/* 2. preference of prefix, bigger is better */
140 	if ((asp1->lpref - asp2->lpref) != 0)
141 		return (asp1->lpref - asp2->lpref);
142 
143 	/* 3. aspath count, the shorter the better */
144 	if ((asp2->aspath->ascnt - asp1->aspath->ascnt) != 0)
145 		return (asp2->aspath->ascnt - asp1->aspath->ascnt);
146 
147 	/* 4. origin, the lower the better */
148 	if ((asp2->origin - asp1->origin) != 0)
149 		return (asp2->origin - asp1->origin);
150 
151 	/* 5. MED decision, only comparable between the same neighboring AS */
152 	if (rde_decisionflags() & BGPD_FLAG_DECISION_MED_ALWAYS ||
153 	    aspath_neighbor(asp1->aspath) == aspath_neighbor(asp2->aspath))
154 		/* lowest value wins */
155 		if ((asp2->med - asp1->med) != 0)
156 			return (asp2->med - asp1->med);
157 
158 	/*
159 	 * 6. EBGP is cooler than IBGP
160 	 * It is absolutely important that the ebgp value in peer_config.ebgp
161 	 * is bigger than all other ones (IBGP, confederations)
162 	 */
163 	if ((asp1->peer->conf.ebgp - asp2->peer->conf.ebgp) != 0) {
164 		if (asp1->peer->conf.ebgp == 1) /* p1 is EBGP other is lower */
165 			return 1;
166 		else if (asp2->peer->conf.ebgp == 1) /* p2 is EBGP */
167 			return -1;
168 	}
169 
170 	/*
171 	 * 7. local tie-breaker, this weight is here to tip equal long AS
172 	 * paths in one or the other direction. It happens more and more
173 	 * that AS paths are equally long and so traffic engineering needs
174 	 * a metric that weights a prefix at a very late stage in the
175 	 * decision process.
176 	 */
177 	if ((asp1->weight - asp2->weight) != 0)
178 		return (asp1->weight - asp2->weight);
179 
180 	/* 8. nexthop costs. NOT YET -> IGNORE */
181 
182 	/*
183 	 * 9. older route (more stable) wins but only if route-age
184 	 * evaluation is enabled.
185 	 */
186 	if (rde_decisionflags() & BGPD_FLAG_DECISION_ROUTEAGE)
187 		if ((p2->lastchange - p1->lastchange) != 0)
188 			return (p2->lastchange - p1->lastchange);
189 
190 	/* 10. lowest BGP Id wins */
191 	if ((p2->aspath->peer->remote_bgpid -
192 	    p1->aspath->peer->remote_bgpid) != 0)
193 		return (p2->aspath->peer->remote_bgpid -
194 		    p1->aspath->peer->remote_bgpid);
195 
196 	/* 11. lowest peer address wins (IPv4 is better than IPv6) */
197 	if (memcmp(&p1->aspath->peer->remote_addr,
198 	    &p2->aspath->peer->remote_addr,
199 	    sizeof(p1->aspath->peer->remote_addr)) != 0)
200 		return (-memcmp(&p1->aspath->peer->remote_addr,
201 		    &p2->aspath->peer->remote_addr,
202 		    sizeof(p1->aspath->peer->remote_addr)));
203 
204 	/* 12. for announced prefixes prefer dynamic routes */
205 	if ((p1->flags & F_ANN_DYNAMIC) != (p2->flags & F_ANN_DYNAMIC)) {
206 		if (p1->flags & F_ANN_DYNAMIC)
207 			return (1);
208 		else
209 			return (-1);
210 	}
211 
212 	fatalx("Uh, oh a politician in the decision process");
213 	/* NOTREACHED */
214 }
215 
216 /*
217  * Find the correct place to insert the prefix in the prefix list.
218  * If the active prefix has changed we need to send an update.
219  * The to evaluate prefix must not be in the prefix list.
220  */
221 void
222 prefix_evaluate(struct prefix *p, struct pt_entry *pte)
223 {
224 	struct prefix	*xp;
225 
226 	if (rde_noevaluate()) {
227 		/* decision process is turned off */
228 		if (p != NULL)
229 			LIST_INSERT_HEAD(&pte->prefix_h, p, prefix_l);
230 		if (pte->active != NULL) {
231 			pte->active->aspath->active_cnt--;
232 			pte->active = NULL;
233 		}
234 		return;
235 	}
236 
237 	if (p != NULL) {
238 		if (LIST_EMPTY(&pte->prefix_h))
239 			LIST_INSERT_HEAD(&pte->prefix_h, p, prefix_l);
240 		else {
241 			LIST_FOREACH(xp, &pte->prefix_h, prefix_l)
242 				if (prefix_cmp(p, xp) > 0) {
243 					LIST_INSERT_BEFORE(xp, p, prefix_l);
244 					break;
245 				} else if (LIST_NEXT(xp, prefix_l) == NULL) {
246 					/* if xp last element ... */
247 					LIST_INSERT_AFTER(xp, p, prefix_l);
248 					break;
249 				}
250 		}
251 	}
252 
253 	xp = LIST_FIRST(&pte->prefix_h);
254 	if (xp == NULL || !(xp->flags & F_LOCAL) ||
255 	    xp->aspath->flags & F_ATTR_LOOP ||
256 	    (xp->aspath->nexthop != NULL &&
257 	    xp->aspath->nexthop->state != NEXTHOP_REACH))
258 		/* xp is ineligible */
259 		xp = NULL;
260 
261 	if (pte->active != xp) {
262 		/* need to generate an update */
263 		if (pte->active != NULL)
264 			pte->active->aspath->active_cnt--;
265 
266 		/*
267 		 * Send update with remove for pte->active and add for xp
268 		 * but remember that xp may be NULL aka ineligible.
269 		 * Additional decision may be made by the called functions.
270 		 */
271 		rde_generate_updates(xp, pte->active);
272 		rde_send_kroute(xp, pte->active);
273 
274 		pte->active = xp;
275 		if (xp != NULL)
276 			xp->aspath->active_cnt++;
277 	}
278 }
279