1*89ee02f7Sclaudio /* $OpenBSD: rde_decide.c,v 1.103 2024/08/14 19:09:51 claudio Exp $ */ 2a16c0992Shenning 3a16c0992Shenning /* 4e854144bSclaudio * Copyright (c) 2003, 2004 Claudio Jeker <claudio@openbsd.org> 5050527e1Shenning * Copyright (c) 2003, 2004 Henning Brauer <henning@openbsd.org> 6a16c0992Shenning * 7a16c0992Shenning * Permission to use, copy, modify, and distribute this software for any 8a16c0992Shenning * purpose with or without fee is hereby granted, provided that the above 9a16c0992Shenning * copyright notice and this permission notice appear in all copies. 10a16c0992Shenning * 11a16c0992Shenning * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 12a16c0992Shenning * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 13a16c0992Shenning * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 14a16c0992Shenning * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 15a16c0992Shenning * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 16a16c0992Shenning * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 17a16c0992Shenning * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 18a16c0992Shenning */ 19a16c0992Shenning 20a16c0992Shenning #include <sys/types.h> 21a16c0992Shenning #include <sys/queue.h> 22a16c0992Shenning 237f9525e9Sclaudio #include <string.h> 247f9525e9Sclaudio 25a16c0992Shenning #include "bgpd.h" 26a16c0992Shenning #include "rde.h" 275e3f6f95Sbenno #include "log.h" 287f9525e9Sclaudio 2942f9861bSclaudio int prefix_cmp(struct prefix *, struct prefix *, int *); 30b1dddf40Sclaudio void prefix_set_dmetric(struct prefix *, struct prefix *); 3142f9861bSclaudio void prefix_insert(struct prefix *, struct prefix *, struct rib_entry *); 3242f9861bSclaudio void prefix_remove(struct prefix *, struct rib_entry *); 33a16c0992Shenning /* 34a16c0992Shenning * Decision Engine RFC implementation: 35a16c0992Shenning * Phase 1: 36a16c0992Shenning * - calculate LOCAL_PREF if needed -- EBGP or IGP learnt routes 37a16c0992Shenning * - IBGP routes may either use LOCAL_PREF or the local system computes 38a16c0992Shenning * the degree of preference 39a16c0992Shenning * - If the route is ineligible, the route MAY NOT serve as an input to 40a16c0992Shenning * the next phase of route selection 41a16c0992Shenning * - if the route is eligible the computed value MUST be used as the 42a16c0992Shenning * LOCAL_PREF value in any IBGP readvertisement 43a16c0992Shenning * 44a16c0992Shenning * Phase 2: 45a16c0992Shenning * - If the NEXT_HOP attribute of a BGP route depicts an address that is 46a16c0992Shenning * not resolvable the BGP route MUST be excluded from the Phase 2 decision 47a16c0992Shenning * function. 48a16c0992Shenning * - If the AS_PATH attribute of a BGP route contains an AS loop, the BGP 49a16c0992Shenning * route should be excluded from the Phase 2 decision function. 50a16c0992Shenning * - The local BGP speaker identifies the route that has: 51a16c0992Shenning * a) the highest degree of preference of any route to the same set 52a16c0992Shenning * of destinations 53a16c0992Shenning * b) is the only route to that destination 54a16c0992Shenning * c) is selected as a result of the Phase 2 tie breaking rules 55a16c0992Shenning * - The local speaker MUST determine the immediate next-hop address from 56a16c0992Shenning * the NEXT_HOP attribute of the selected route. 57a16c0992Shenning * - If either the immediate next hop or the IGP cost to the NEXT_HOP changes, 58a16c0992Shenning * Phase 2 Route Selection MUST be performed again. 59a16c0992Shenning * 60a16c0992Shenning * Route Resolvability Condition 61a16c0992Shenning * - A route Rte1, referencing only the intermediate network address, is 62a16c0992Shenning * considered resolvable if the Routing Table contains at least one 63a16c0992Shenning * resolvable route Rte2 that matches Rte1's intermediate network address 64a16c0992Shenning * and is not recursively resolved through Rte1. 65a16c0992Shenning * - Routes referencing interfaces are considered resolvable if the state of 66a16c0992Shenning * the referenced interface is up and IP processing is enabled. 67a16c0992Shenning * 68a16c0992Shenning * Breaking Ties (Phase 2) 69a16c0992Shenning * 1. Remove from consideration all routes which are not tied for having the 70a16c0992Shenning * smallest number of AS numbers present in their AS_PATH attributes. 71a16c0992Shenning * Note, that when counting this number, an AS_SET counts as 1 72a16c0992Shenning * 2. Remove from consideration all routes which are not tied for having the 73a16c0992Shenning * lowest Origin number in their Origin attribute. 74a16c0992Shenning * 3. Remove from consideration routes with less-preferred MULTI_EXIT_DISC 75a16c0992Shenning * attributes. MULTI_EXIT_DISC is only comparable between routes learned 76a16c0992Shenning * from the same neighboring AS. 77a16c0992Shenning * 4. If at least one of the candidate routes was received via EBGP, 78a16c0992Shenning * remove from consideration all routes which were received via IBGP. 79a16c0992Shenning * 5. Remove from consideration any routes with less-preferred interior cost. 80a16c0992Shenning * If the NEXT_HOP hop for a route is reachable, but no cost can be 81a16c0992Shenning * determined, then this step should be skipped. 82a16c0992Shenning * 6. Remove from consideration all routes other than the route that was 83a16c0992Shenning * advertised by the BGP speaker whose BGP Identifier has the lowest value. 84a16c0992Shenning * 7. Prefer the route received from the lowest peer address. 85a16c0992Shenning * 86a16c0992Shenning * Phase 3: Route Dissemination 87a16c0992Shenning * - All routes in the Loc-RIB are processed into Adj-RIBs-Out according 88a16c0992Shenning * to configured policy. A route SHALL NOT be installed in the Adj-Rib-Out 89a16c0992Shenning * unless the destination and NEXT_HOP described by this route may be 90a16c0992Shenning * forwarded appropriately by the Routing Table. 91a16c0992Shenning */ 92a16c0992Shenning 93a16c0992Shenning /* 94a16c0992Shenning * Decision Engine OUR implementation: 9505ebbbf6Sclaudio * The filtering is done first. The filtering calculates the preference and 9605ebbbf6Sclaudio * stores it in LOCAL_PREF (Phase 1). 9726904cb3Sclaudio * Ineligible routes are flagged as ineligible via nexthop_add(). 9826904cb3Sclaudio * Phase 3 is done together with Phase 2. 99a16c0992Shenning * In following cases a prefix needs to be reevaluated: 100227b4aedSclaudio * - update of a prefix (prefix_update) 101227b4aedSclaudio * - withdraw of a prefix (prefix_withdraw) 102a16c0992Shenning * - state change of the nexthop (nexthop-{in}validate) 103a16c0992Shenning * - state change of session (session down) 104a16c0992Shenning */ 105a16c0992Shenning 106a16c0992Shenning /* 107655dc4c7Sclaudio * Compare two prefixes with equal pt_entry. Returns an integer greater than or 10804f2231bShenning * less than 0, according to whether the prefix p1 is more or less preferred 109a16c0992Shenning * than the prefix p2. p1 should be used for the new prefix and p2 for a 110b1dddf40Sclaudio * already added prefix. The absolute value returned specifies the similarity 111b1dddf40Sclaudio * of the prefixes. 112b1dddf40Sclaudio * 1: prefixes differ because of validity 113b1dddf40Sclaudio * 2: prefixes don't belong in any multipath set 114b1dddf40Sclaudio * 3: prefixes belong only in the as-wide multipath set 115b1dddf40Sclaudio * 4: prefixes belong in both the ecmp and as-wide multipath set 116b1dddf40Sclaudio * TODO: maybe we also need a strict ecmp set that requires 117b1dddf40Sclaudio * prefixes to e.g. equal ASPATH or equal neighbor-as (like for MED). 118a16c0992Shenning */ 1197f9525e9Sclaudio int 12042f9861bSclaudio prefix_cmp(struct prefix *p1, struct prefix *p2, int *testall) 121a16c0992Shenning { 122a16c0992Shenning struct rde_aspath *asp1, *asp2; 1232b6d2161Sclaudio struct rde_peer *peer1, *peer2; 124e1febd93Sclaudio struct attr *a; 12539386878Sclaudio uint32_t p1id, p2id; 126652909eeSclaudio int p1cnt, p2cnt, i; 127b1dddf40Sclaudio int rv = 1; 128a16c0992Shenning 12942f9861bSclaudio /* 13042f9861bSclaudio * If a match happens before the MED check then the list is 13142f9861bSclaudio * correctly sorted. If a match happens after MED then further 13242f9861bSclaudio * elements may need to be checked to ensure that all paths 13342f9861bSclaudio * which could affect this path were considered. This only 13442f9861bSclaudio * matters for strict MED evaluation and in that case testall 13542f9861bSclaudio * is set to 1. If the check happens to be on the MED check 13642f9861bSclaudio * itself testall is set to 2. 13742f9861bSclaudio */ 13842f9861bSclaudio *testall = 0; 13942f9861bSclaudio 140ba0c695bSclaudio if (p1 == NULL) 141b1dddf40Sclaudio return -rv; 142a16c0992Shenning if (p2 == NULL) 143b1dddf40Sclaudio return rv; 144a16c0992Shenning 1452b6d2161Sclaudio asp1 = prefix_aspath(p1); 1462b6d2161Sclaudio asp2 = prefix_aspath(p2); 1476f9c354eSclaudio peer1 = prefix_peer(p1); 1486f9c354eSclaudio peer2 = prefix_peer(p2); 14926904cb3Sclaudio 15013d31ce9Sclaudio /* 1. check if prefix is eligible a.k.a reachable */ 15113d31ce9Sclaudio if (!prefix_eligible(p2)) 152b1dddf40Sclaudio return rv; 15313d31ce9Sclaudio if (!prefix_eligible(p1)) 154b1dddf40Sclaudio return -rv; 155b1dddf40Sclaudio 156b1dddf40Sclaudio /* bump rv, from here on prefix is considered valid */ 157b1dddf40Sclaudio rv++; 158a16c0992Shenning 1595424e8d9Sclaudio /* 2. local preference of prefix, bigger is better */ 160652909eeSclaudio if (asp1->lpref > asp2->lpref) 161b1dddf40Sclaudio return rv; 162652909eeSclaudio if (asp1->lpref < asp2->lpref) 163b1dddf40Sclaudio return -rv; 164a16c0992Shenning 165a16c0992Shenning /* 3. aspath count, the shorter the better */ 166b1dddf40Sclaudio if (asp1->aspath->ascnt < asp2->aspath->ascnt) 167b1dddf40Sclaudio return rv; 168b1dddf40Sclaudio if (asp1->aspath->ascnt > asp2->aspath->ascnt) 169b1dddf40Sclaudio return -rv; 170a16c0992Shenning 171a16c0992Shenning /* 4. origin, the lower the better */ 172b1dddf40Sclaudio if (asp1->origin < asp2->origin) 173b1dddf40Sclaudio return rv; 174b1dddf40Sclaudio if (asp1->origin > asp2->origin) 175b1dddf40Sclaudio return -rv; 176a16c0992Shenning 177652909eeSclaudio /* 178652909eeSclaudio * 5. MED decision 179652909eeSclaudio * Only comparable between the same neighboring AS or if 18042f9861bSclaudio * 'rde med compare always' is set. In the first case 18142f9861bSclaudio * set the testall flag since further elements need to be 18242f9861bSclaudio * evaluated as well. 183652909eeSclaudio */ 184652909eeSclaudio if ((rde_decisionflags() & BGPD_FLAG_DECISION_MED_ALWAYS) || 185652909eeSclaudio aspath_neighbor(asp1->aspath) == aspath_neighbor(asp2->aspath)) { 18642f9861bSclaudio if (!(rde_decisionflags() & BGPD_FLAG_DECISION_MED_ALWAYS)) 18742f9861bSclaudio *testall = 2; 188dc26f558Sclaudio /* lowest value wins */ 189652909eeSclaudio if (asp1->med < asp2->med) 190b1dddf40Sclaudio return rv; 191652909eeSclaudio if (asp1->med > asp2->med) 192b1dddf40Sclaudio return -rv; 193652909eeSclaudio } 194a16c0992Shenning 19542f9861bSclaudio if (!(rde_decisionflags() & BGPD_FLAG_DECISION_MED_ALWAYS)) 19642f9861bSclaudio *testall = 1; 19742f9861bSclaudio 198a16c0992Shenning /* 199655dc4c7Sclaudio * 6. EBGP is cooler than IBGP 20004f2231bShenning * It is absolutely important that the ebgp value in peer_config.ebgp 201a16c0992Shenning * is bigger than all other ones (IBGP, confederations) 202a16c0992Shenning */ 2036f9c354eSclaudio if (peer1->conf.ebgp != peer2->conf.ebgp) { 2046f9c354eSclaudio if (peer1->conf.ebgp) /* peer1 is EBGP other is lower */ 205b1dddf40Sclaudio return rv; 2066f9c354eSclaudio else if (peer2->conf.ebgp) /* peer2 is EBGP */ 207b1dddf40Sclaudio return -rv; 208a16c0992Shenning } 209a16c0992Shenning 210b1dddf40Sclaudio /* bump rv, as-wide multipath */ 211b1dddf40Sclaudio rv++; 212b1dddf40Sclaudio 213aa5d92feSclaudio /* 214aa5d92feSclaudio * 7. local tie-breaker, this weight is here to tip equal long AS 215c5508ee4Sclaudio * paths in one or the other direction. It happens more and more 216c5508ee4Sclaudio * that AS paths are equally long and so traffic engineering needs 217aa5d92feSclaudio * a metric that weights a prefix at a very late stage in the 218aa5d92feSclaudio * decision process. 219aa5d92feSclaudio */ 220652909eeSclaudio if (asp1->weight > asp2->weight) 221b1dddf40Sclaudio return rv; 222652909eeSclaudio if (asp1->weight < asp2->weight) 223b1dddf40Sclaudio return -rv; 224aa5d92feSclaudio 225aa5d92feSclaudio /* 8. nexthop costs. NOT YET -> IGNORE */ 226a16c0992Shenning 227b1dddf40Sclaudio /* bump rv, equal cost multipath */ 228b1dddf40Sclaudio rv++; 229b1dddf40Sclaudio 23088ca7a8dSclaudio /* 231aa5d92feSclaudio * 9. older route (more stable) wins but only if route-age 23288ca7a8dSclaudio * evaluation is enabled. 23388ca7a8dSclaudio */ 234652909eeSclaudio if (rde_decisionflags() & BGPD_FLAG_DECISION_ROUTEAGE) { 235652909eeSclaudio if (p1->lastchange < p2->lastchange) /* p1 is older */ 236b1dddf40Sclaudio return rv; 237652909eeSclaudio if (p1->lastchange > p2->lastchange) 238b1dddf40Sclaudio return -rv; 239652909eeSclaudio } 240a16c0992Shenning 241e1febd93Sclaudio /* 10. lowest BGP Id wins, use ORIGINATOR_ID if present */ 242e1febd93Sclaudio if ((a = attr_optget(asp1, ATTR_ORIGINATOR_ID)) != NULL) { 243e1febd93Sclaudio memcpy(&p1id, a->data, sizeof(p1id)); 244e1febd93Sclaudio p1id = ntohl(p1id); 245e1febd93Sclaudio } else 2466f9c354eSclaudio p1id = peer1->remote_bgpid; 247e1febd93Sclaudio if ((a = attr_optget(asp2, ATTR_ORIGINATOR_ID)) != NULL) { 248e1febd93Sclaudio memcpy(&p2id, a->data, sizeof(p2id)); 249e1febd93Sclaudio p2id = ntohl(p2id); 250e1febd93Sclaudio } else 2516f9c354eSclaudio p2id = peer2->remote_bgpid; 252652909eeSclaudio if (p1id < p2id) 253b1dddf40Sclaudio return rv; 254652909eeSclaudio if (p1id > p2id) 255b1dddf40Sclaudio return -rv; 256933012f8Shenning 257e1febd93Sclaudio /* 11. compare CLUSTER_LIST length, shorter is better */ 258e1febd93Sclaudio p1cnt = p2cnt = 0; 259e1febd93Sclaudio if ((a = attr_optget(asp1, ATTR_CLUSTER_LIST)) != NULL) 26039386878Sclaudio p1cnt = a->len / sizeof(uint32_t); 261e1febd93Sclaudio if ((a = attr_optget(asp2, ATTR_CLUSTER_LIST)) != NULL) 26239386878Sclaudio p2cnt = a->len / sizeof(uint32_t); 263b1dddf40Sclaudio if (p1cnt < p2cnt) 264b1dddf40Sclaudio return rv; 265b1dddf40Sclaudio if (p1cnt > p2cnt) 266b1dddf40Sclaudio return -rv; 267e1febd93Sclaudio 268e1febd93Sclaudio /* 12. lowest peer address wins (IPv4 is better than IPv6) */ 269652909eeSclaudio if (peer1->remote_addr.aid < peer2->remote_addr.aid) 270b1dddf40Sclaudio return rv; 271652909eeSclaudio if (peer1->remote_addr.aid > peer2->remote_addr.aid) 272b1dddf40Sclaudio return -rv; 273652909eeSclaudio switch (peer1->remote_addr.aid) { 274652909eeSclaudio case AID_INET: 275652909eeSclaudio i = memcmp(&peer1->remote_addr.v4, &peer2->remote_addr.v4, 276652909eeSclaudio sizeof(struct in_addr)); 277652909eeSclaudio break; 278652909eeSclaudio case AID_INET6: 279652909eeSclaudio i = memcmp(&peer1->remote_addr.v6, &peer2->remote_addr.v6, 280652909eeSclaudio sizeof(struct in6_addr)); 281652909eeSclaudio break; 282652909eeSclaudio default: 283652909eeSclaudio fatalx("%s: unknown af", __func__); 284652909eeSclaudio } 285652909eeSclaudio if (i < 0) 286b1dddf40Sclaudio return rv; 287652909eeSclaudio if (i > 0) 288b1dddf40Sclaudio return -rv; 289a16c0992Shenning 290591142c4Sclaudio /* RFC7911 does not specify this but something like this is needed. */ 29129b527fbSclaudio /* 13. lowest path identifier wins */ 29229b527fbSclaudio if (p1->path_id < p2->path_id) 293b1dddf40Sclaudio return rv; 29429b527fbSclaudio if (p1->path_id > p2->path_id) 295b1dddf40Sclaudio return -rv; 29629b527fbSclaudio 297f87a5020Shenning fatalx("Uh, oh a politician in the decision process"); 298a16c0992Shenning } 299a16c0992Shenning 30005ebbbf6Sclaudio /* 301b1dddf40Sclaudio * set the dmetric value of np based on the return value of 302b1dddf40Sclaudio * prefix_evaluate(pp, np) or set it to either PREFIX_DMETRIC_BEST 303b1dddf40Sclaudio * or PREFIX_DMETRIC_INVALID for the first element. 304b1dddf40Sclaudio */ 305b1dddf40Sclaudio void 306b1dddf40Sclaudio prefix_set_dmetric(struct prefix *pp, struct prefix *np) 307b1dddf40Sclaudio { 308b1dddf40Sclaudio int testall; 309b1dddf40Sclaudio 310b1dddf40Sclaudio if (np != NULL) { 311b1dddf40Sclaudio if (pp == NULL) 312b1dddf40Sclaudio np->dmetric = prefix_eligible(np) ? 313b1dddf40Sclaudio PREFIX_DMETRIC_BEST : PREFIX_DMETRIC_INVALID; 314b1dddf40Sclaudio else 315b1dddf40Sclaudio np->dmetric = prefix_cmp(pp, np, &testall); 316a485c7fbSclaudio if (np->dmetric < 0) { 317a485c7fbSclaudio struct bgpd_addr addr; 318a485c7fbSclaudio pt_getaddr(np->pt, &addr); 319a485c7fbSclaudio log_debug("bad dmetric in decision process: %s/%u", 320a485c7fbSclaudio log_addr(&addr), np->pt->prefixlen); 321a485c7fbSclaudio } 322b1dddf40Sclaudio } 323b1dddf40Sclaudio } 324b1dddf40Sclaudio 325b1dddf40Sclaudio /* 32605ebbbf6Sclaudio * Insert a prefix keeping the total order of the list. For routes 32705ebbbf6Sclaudio * that may depend on a MED selection the set is scanned until the 32805ebbbf6Sclaudio * condition is cleared. If a MED inversion is detected the respective 32905ebbbf6Sclaudio * prefix is taken of the rib list and put onto a redo queue. All 33005ebbbf6Sclaudio * prefixes on the redo queue are re-inserted at the end. 33105ebbbf6Sclaudio */ 33242f9861bSclaudio void 33342f9861bSclaudio prefix_insert(struct prefix *new, struct prefix *ep, struct rib_entry *re) 33442f9861bSclaudio { 335df08e9a0Sclaudio struct prefix_queue redo = TAILQ_HEAD_INITIALIZER(redo); 336df08e9a0Sclaudio struct prefix *xp, *np, *insertp = ep; 3371d5ff589Sclaudio int testall, preferred, selected = 0, removed = 0; 33842f9861bSclaudio 339591142c4Sclaudio /* start scan at the entry point (ep) or the head if ep == NULL */ 34042f9861bSclaudio if (ep == NULL) 341df08e9a0Sclaudio ep = TAILQ_FIRST(&re->prefix_h); 34242f9861bSclaudio 34342f9861bSclaudio for (xp = ep; xp != NULL; xp = np) { 344df08e9a0Sclaudio np = TAILQ_NEXT(xp, entry.list.rib); 34542f9861bSclaudio 3461d5ff589Sclaudio if ((preferred = (prefix_cmp(new, xp, &testall) > 0))) { 34742f9861bSclaudio /* new is preferred over xp */ 3481d5ff589Sclaudio if (testall == 2) { 34942f9861bSclaudio /* 35042f9861bSclaudio * MED inversion, take out prefix and 35142f9861bSclaudio * put it onto redo queue. 35242f9861bSclaudio */ 353df08e9a0Sclaudio TAILQ_REMOVE(&re->prefix_h, xp, entry.list.rib); 354df08e9a0Sclaudio TAILQ_INSERT_TAIL(&redo, xp, entry.list.rib); 3551d5ff589Sclaudio removed = 1; 3561d5ff589Sclaudio continue; 3571d5ff589Sclaudio } 3581d5ff589Sclaudio 3591d5ff589Sclaudio if (testall == 1) { 36042f9861bSclaudio /* 36142f9861bSclaudio * lock insertion point and 36242f9861bSclaudio * continue on with scan 36342f9861bSclaudio */ 36442f9861bSclaudio selected = 1; 36542f9861bSclaudio } 36642f9861bSclaudio } else { 36742f9861bSclaudio /* 368591142c4Sclaudio * xp is preferred over new. 369591142c4Sclaudio * Remember insertion point for later unless the 370591142c4Sclaudio * traverse is just looking for a possible MED 371591142c4Sclaudio * inversion (selected == 1). 372591142c4Sclaudio * If the last comparison's tie-breaker was the MED 373591142c4Sclaudio * check reset selected and with it insertp since 374591142c4Sclaudio * this was an actual MED priority inversion. 37542f9861bSclaudio */ 37642f9861bSclaudio if (testall == 2) 37742f9861bSclaudio selected = 0; 37842f9861bSclaudio if (!selected) 37943888d92Sclaudio insertp = xp; 38042f9861bSclaudio } 3811d5ff589Sclaudio 3821d5ff589Sclaudio /* 3831d5ff589Sclaudio * If previous element(s) got removed, fixup the 3841d5ff589Sclaudio * dmetric, now that it is clear that this element 3851d5ff589Sclaudio * is on the list. 3861d5ff589Sclaudio */ 3871d5ff589Sclaudio if (removed) { 3881d5ff589Sclaudio prefix_set_dmetric(TAILQ_PREV(xp, prefix_queue, 3891d5ff589Sclaudio entry.list.rib), xp); 3901d5ff589Sclaudio removed = 0; 3911d5ff589Sclaudio } 3921d5ff589Sclaudio 3931d5ff589Sclaudio if (preferred && testall == 0) 3941d5ff589Sclaudio break; /* we're done */ 39542f9861bSclaudio } 39642f9861bSclaudio 397b1dddf40Sclaudio if (insertp == NULL) { 398df08e9a0Sclaudio TAILQ_INSERT_HEAD(&re->prefix_h, new, entry.list.rib); 399b1dddf40Sclaudio } else { 400df08e9a0Sclaudio TAILQ_INSERT_AFTER(&re->prefix_h, insertp, new, entry.list.rib); 401b1dddf40Sclaudio } 402b1dddf40Sclaudio 403b1dddf40Sclaudio prefix_set_dmetric(insertp, new); 404b1dddf40Sclaudio prefix_set_dmetric(new, TAILQ_NEXT(new, entry.list.rib)); 40542f9861bSclaudio 40642f9861bSclaudio /* Fixup MED order again. All elements are < new */ 407df08e9a0Sclaudio while (!TAILQ_EMPTY(&redo)) { 408df08e9a0Sclaudio xp = TAILQ_FIRST(&redo); 409df08e9a0Sclaudio TAILQ_REMOVE(&redo, xp, entry.list.rib); 41042f9861bSclaudio 41142f9861bSclaudio prefix_insert(xp, new, re); 41242f9861bSclaudio } 41342f9861bSclaudio } 41442f9861bSclaudio 41505ebbbf6Sclaudio /* 41605ebbbf6Sclaudio * Remove a prefix from the RIB list ensuring that the total order of the 41705ebbbf6Sclaudio * list remains intact. All routes that differ in the MED are taken of the 41805ebbbf6Sclaudio * list and put on the redo list. To figure out if a route could cause a 41905ebbbf6Sclaudio * resort because of a MED check the next prefix of the to-remove prefix 42005ebbbf6Sclaudio * is compared with the old prefix. A full scan is only done if the next 42105ebbbf6Sclaudio * route differs because of the MED or later checks. 42205ebbbf6Sclaudio * Again at the end all routes on the redo queue are reinserted. 42305ebbbf6Sclaudio */ 42442f9861bSclaudio void 42542f9861bSclaudio prefix_remove(struct prefix *old, struct rib_entry *re) 42642f9861bSclaudio { 427df08e9a0Sclaudio struct prefix_queue redo = TAILQ_HEAD_INITIALIZER(redo); 4281d5ff589Sclaudio struct prefix *xp, *np, *pp; 4291d5ff589Sclaudio int testall, removed = 0; 43042f9861bSclaudio 431df08e9a0Sclaudio xp = TAILQ_NEXT(old, entry.list.rib); 4321d5ff589Sclaudio pp = TAILQ_PREV(old, prefix_queue, entry.list.rib); 433df08e9a0Sclaudio TAILQ_REMOVE(&re->prefix_h, old, entry.list.rib); 434b1dddf40Sclaudio 43542f9861bSclaudio /* check if a MED inversion could be possible */ 43642f9861bSclaudio prefix_cmp(old, xp, &testall); 43742f9861bSclaudio if (testall > 0) { 43842f9861bSclaudio /* maybe MED route, scan tail for other possible routes */ 43942f9861bSclaudio for (; xp != NULL; xp = np) { 440df08e9a0Sclaudio np = TAILQ_NEXT(xp, entry.list.rib); 44142f9861bSclaudio 44242f9861bSclaudio /* only interested in the testall result */ 44342f9861bSclaudio prefix_cmp(old, xp, &testall); 4441d5ff589Sclaudio if (testall == 2) { 44542f9861bSclaudio /* 44642f9861bSclaudio * possible MED inversion, take out prefix and 44742f9861bSclaudio * put it onto redo queue. 44842f9861bSclaudio */ 449df08e9a0Sclaudio TAILQ_REMOVE(&re->prefix_h, xp, entry.list.rib); 450df08e9a0Sclaudio TAILQ_INSERT_TAIL(&redo, xp, entry.list.rib); 4511d5ff589Sclaudio removed = 1; 4521d5ff589Sclaudio continue; 4531d5ff589Sclaudio } 4541d5ff589Sclaudio /* 4551d5ff589Sclaudio * If previous element(s) got removed, fixup the 4561d5ff589Sclaudio * dmetric, now that it is clear that this element 4571d5ff589Sclaudio * is on the list. 4581d5ff589Sclaudio */ 4591d5ff589Sclaudio if (removed) { 4601d5ff589Sclaudio prefix_set_dmetric(TAILQ_PREV(xp, prefix_queue, 4611d5ff589Sclaudio entry.list.rib), xp); 4621d5ff589Sclaudio removed = 0; 4631d5ff589Sclaudio } 4641d5ff589Sclaudio if (testall == 0) 4651d5ff589Sclaudio break; /* we're done */ 46642f9861bSclaudio } 46742f9861bSclaudio } 4681d5ff589Sclaudio 4691d5ff589Sclaudio if (pp) 4701d5ff589Sclaudio prefix_set_dmetric(pp, TAILQ_NEXT(pp, entry.list.rib)); 4711d5ff589Sclaudio else 4721d5ff589Sclaudio prefix_set_dmetric(NULL, TAILQ_FIRST(&re->prefix_h)); 47342f9861bSclaudio 47442f9861bSclaudio /* Fixup MED order again, reinsert prefixes from the start */ 475df08e9a0Sclaudio while (!TAILQ_EMPTY(&redo)) { 476df08e9a0Sclaudio xp = TAILQ_FIRST(&redo); 477df08e9a0Sclaudio TAILQ_REMOVE(&redo, xp, entry.list.rib); 47842f9861bSclaudio 47942f9861bSclaudio prefix_insert(xp, NULL, re); 48042f9861bSclaudio } 48142f9861bSclaudio } 48242f9861bSclaudio 48305ebbbf6Sclaudio /* helper function to check if a prefix is valid to be selected */ 48405ebbbf6Sclaudio int 48505ebbbf6Sclaudio prefix_eligible(struct prefix *p) 48605ebbbf6Sclaudio { 48705ebbbf6Sclaudio struct rde_aspath *asp = prefix_aspath(p); 48805ebbbf6Sclaudio 489*89ee02f7Sclaudio /* prefix itself is marked ineligible */ 490*89ee02f7Sclaudio if (prefix_filtered(p)) 491*89ee02f7Sclaudio return 0; 492*89ee02f7Sclaudio 49305ebbbf6Sclaudio /* The aspath needs to be loop and error free */ 494372bb3aaSclaudio if (asp == NULL || 495f337fe2fSclaudio asp->flags & (F_ATTR_LOOP|F_ATTR_OTC_LEAK|F_ATTR_PARSE_ERR)) 49605ebbbf6Sclaudio return 0; 49713d31ce9Sclaudio 49813d31ce9Sclaudio /* The nexthop must be valid. */ 49913d31ce9Sclaudio if (!prefix_nhvalid(p)) 50005ebbbf6Sclaudio return 0; 50105ebbbf6Sclaudio 50205ebbbf6Sclaudio return 1; 50305ebbbf6Sclaudio } 50405ebbbf6Sclaudio 5058bf5540bSclaudio struct prefix * 5068bf5540bSclaudio prefix_best(struct rib_entry *re) 5078bf5540bSclaudio { 5088bf5540bSclaudio struct prefix *xp; 5098bf5540bSclaudio struct rib *rib; 5108bf5540bSclaudio 5118bf5540bSclaudio rib = re_rib(re); 5128bf5540bSclaudio if (rib->flags & F_RIB_NOEVALUATE) 5138bf5540bSclaudio /* decision process is turned off */ 5148bf5540bSclaudio return NULL; 5158bf5540bSclaudio 516df08e9a0Sclaudio xp = TAILQ_FIRST(&re->prefix_h); 5178bf5540bSclaudio if (xp != NULL && !prefix_eligible(xp)) 5188bf5540bSclaudio xp = NULL; 5198bf5540bSclaudio return xp; 5208bf5540bSclaudio } 5218bf5540bSclaudio 522a16c0992Shenning /* 523a16c0992Shenning * Find the correct place to insert the prefix in the prefix list. 5249f1086bfSclaudio * If the active prefix has changed we need to send an update also special 5259f1086bfSclaudio * treatment is needed if 'rde evaluate all' is used on some peers. 5269f1086bfSclaudio * To re-evaluate a prefix just call prefix_evaluate with old and new pointing 5279f1086bfSclaudio * to the same prefix. 528a16c0992Shenning */ 529a16c0992Shenning void 530d884cecfSclaudio prefix_evaluate(struct rib_entry *re, struct prefix *new, struct prefix *old) 531a16c0992Shenning { 532f6e85e58Sclaudio struct prefix *newbest, *oldbest; 533c042b633Sclaudio struct rib *rib; 534a16c0992Shenning 535c042b633Sclaudio rib = re_rib(re); 536c042b633Sclaudio if (rib->flags & F_RIB_NOEVALUATE) { 537e4020a54Sclaudio /* decision process is turned off */ 538d884cecfSclaudio if (old != NULL) 539df08e9a0Sclaudio TAILQ_REMOVE(&re->prefix_h, old, entry.list.rib); 540b1dddf40Sclaudio if (new != NULL) { 541df08e9a0Sclaudio TAILQ_INSERT_HEAD(&re->prefix_h, new, entry.list.rib); 542b1dddf40Sclaudio new->dmetric = PREFIX_DMETRIC_INVALID; 543b1dddf40Sclaudio } 544e4020a54Sclaudio return; 545e4020a54Sclaudio } 546e4020a54Sclaudio 547f6e85e58Sclaudio oldbest = prefix_best(re); 548d884cecfSclaudio if (old != NULL) 54942f9861bSclaudio prefix_remove(old, re); 55042f9861bSclaudio if (new != NULL) 55142f9861bSclaudio prefix_insert(new, NULL, re); 55213d31ce9Sclaudio newbest = prefix_best(re); 553d70d731bSclaudio 554a16c0992Shenning /* 555d884cecfSclaudio * If the active prefix changed or the active prefix was removed 556d884cecfSclaudio * and added again then generate an update. 557d884cecfSclaudio */ 558f6e85e58Sclaudio if (oldbest != newbest || (old != NULL && newbest == old)) { 559d884cecfSclaudio /* 560f6e85e58Sclaudio * Send update withdrawing oldbest and adding newbest 561f6e85e58Sclaudio * but remember that newbest may be NULL aka ineligible. 5621207958aSclaudio * Additional decision may be made by the called functions. 563a16c0992Shenning */ 564c042b633Sclaudio if ((rib->flags & F_RIB_NOFIB) == 0) 565f6e85e58Sclaudio rde_send_kroute(rib, newbest, oldbest); 566988ba0baSclaudio rde_generate_updates(re, new, old, EVAL_DEFAULT); 56705ebbbf6Sclaudio return; 568a16c0992Shenning } 56905ebbbf6Sclaudio 57005ebbbf6Sclaudio /* 57105ebbbf6Sclaudio * If there are peers with 'rde evaluate all' every update needs 57205ebbbf6Sclaudio * to be passed on (not only a change of the best prefix). 57305ebbbf6Sclaudio * rde_generate_updates() will then take care of distribution. 57405ebbbf6Sclaudio */ 575bcdda550Sclaudio if (rde_evaluate_all()) { 576bcdda550Sclaudio if (new != NULL && !prefix_eligible(new)) 577bcdda550Sclaudio new = NULL; 578bcdda550Sclaudio if (new != NULL || old != NULL) 579988ba0baSclaudio rde_generate_updates(re, new, old, EVAL_ALL); 580a16c0992Shenning } 581bcdda550Sclaudio } 58213d31ce9Sclaudio 58313d31ce9Sclaudio void 58413d31ce9Sclaudio prefix_evaluate_nexthop(struct prefix *p, enum nexthop_state state, 58513d31ce9Sclaudio enum nexthop_state oldstate) 58613d31ce9Sclaudio { 58713d31ce9Sclaudio struct rib_entry *re = prefix_re(p); 588bcdda550Sclaudio struct prefix *newbest, *oldbest, *new, *old; 58913d31ce9Sclaudio struct rib *rib; 59013d31ce9Sclaudio 59113d31ce9Sclaudio /* Skip non local-RIBs or RIBs that are flagged as noeval. */ 59213d31ce9Sclaudio rib = re_rib(re); 59313d31ce9Sclaudio if (rib->flags & F_RIB_NOEVALUATE) { 59413d31ce9Sclaudio log_warnx("%s: prefix with F_RIB_NOEVALUATE hit", __func__); 59513d31ce9Sclaudio return; 59613d31ce9Sclaudio } 59713d31ce9Sclaudio 59813d31ce9Sclaudio if (oldstate == state) { 59913d31ce9Sclaudio /* 60013d31ce9Sclaudio * The state of the nexthop did not change. The only 60113d31ce9Sclaudio * thing that may have changed is the true_nexthop 60213d31ce9Sclaudio * or other internal infos. This will not change 60313d31ce9Sclaudio * the routing decision so shortcut here. 60413d31ce9Sclaudio * XXX needs to be changed for ECMP 60513d31ce9Sclaudio */ 60613d31ce9Sclaudio if (state == NEXTHOP_REACH) { 60713d31ce9Sclaudio if ((rib->flags & F_RIB_NOFIB) == 0 && 60813d31ce9Sclaudio p == prefix_best(re)) 60913d31ce9Sclaudio rde_send_kroute(rib, p, NULL); 61013d31ce9Sclaudio } 61113d31ce9Sclaudio return; 61213d31ce9Sclaudio } 61313d31ce9Sclaudio 61413d31ce9Sclaudio /* 61513d31ce9Sclaudio * Re-evaluate the prefix by removing the prefix then updating the 61613d31ce9Sclaudio * nexthop state and reinserting the prefix again. 61713d31ce9Sclaudio */ 618bcdda550Sclaudio old = p; 61913d31ce9Sclaudio oldbest = prefix_best(re); 62013d31ce9Sclaudio prefix_remove(p, re); 62113d31ce9Sclaudio 62213d31ce9Sclaudio if (state == NEXTHOP_REACH) 62313d31ce9Sclaudio p->nhflags |= NEXTHOP_VALID; 62413d31ce9Sclaudio else 62513d31ce9Sclaudio p->nhflags &= ~NEXTHOP_VALID; 62613d31ce9Sclaudio 62713d31ce9Sclaudio prefix_insert(p, NULL, re); 62813d31ce9Sclaudio newbest = prefix_best(re); 629bcdda550Sclaudio new = p; 630bcdda550Sclaudio if (!prefix_eligible(new)) 631bcdda550Sclaudio new = NULL; 63213d31ce9Sclaudio 63313d31ce9Sclaudio /* 63413d31ce9Sclaudio * If the active prefix changed or the active prefix was removed 63513d31ce9Sclaudio * and added again then generate an update. 63613d31ce9Sclaudio */ 63713d31ce9Sclaudio if (oldbest != newbest || newbest == p) { 63813d31ce9Sclaudio /* 63913d31ce9Sclaudio * Send update withdrawing oldbest and adding newbest 64013d31ce9Sclaudio * but remember that newbest may be NULL aka ineligible. 64113d31ce9Sclaudio * Additional decision may be made by the called functions. 64213d31ce9Sclaudio */ 64313d31ce9Sclaudio if ((rib->flags & F_RIB_NOFIB) == 0) 64413d31ce9Sclaudio rde_send_kroute(rib, newbest, oldbest); 645bcdda550Sclaudio rde_generate_updates(re, new, old, EVAL_DEFAULT); 64613d31ce9Sclaudio return; 64713d31ce9Sclaudio } 64813d31ce9Sclaudio 64913d31ce9Sclaudio /* 65013d31ce9Sclaudio * If there are peers with 'rde evaluate all' every update needs 65113d31ce9Sclaudio * to be passed on (not only a change of the best prefix). 65213d31ce9Sclaudio * rde_generate_updates() will then take care of distribution. 65313d31ce9Sclaudio */ 65413d31ce9Sclaudio if (rde_evaluate_all()) 655bcdda550Sclaudio rde_generate_updates(re, new, old, EVAL_ALL); 65613d31ce9Sclaudio } 657