/*	$OpenBSD: rde_rib.c,v 1.240 2022/07/08 10:01:52 claudio Exp $ */

/*
 * Copyright (c) 2003, 2004 Claudio Jeker <claudio@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/types.h>
#include <sys/queue.h>

#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <siphash.h>
#include <time.h>

#include "bgpd.h"
#include "rde.h"
#include "log.h"

/*
 * BGP RIB -- Routing Information Base
 *
 * The RIB is built with one aspect in mind: speed -- actually, update speed.
 * Therefore one thing needs to be absolutely avoided: long table walks.
 * This is achieved by heavily linking the different parts together.
 */
uint16_t rib_size;
struct rib **ribs;

struct rib_entry *rib_add(struct rib *, struct bgpd_addr *, int);
static inline int rib_compare(const struct rib_entry *,
		    const struct rib_entry *);
void rib_remove(struct rib_entry *);
int rib_empty(struct rib_entry *);
static void rib_dump_abort(uint16_t);

RB_PROTOTYPE(rib_tree, rib_entry, rib_e, rib_compare);
RB_GENERATE(rib_tree, rib_entry, rib_e, rib_compare);

struct rib_context {
	LIST_ENTRY(rib_context)	 entry;
	struct rib_entry	*ctx_re;
	struct prefix		*ctx_p;
	uint32_t		 ctx_id;
	void		(*ctx_rib_call)(struct rib_entry *, void *);
	void		(*ctx_prefix_call)(struct prefix *, void *);
	void		(*ctx_done)(void *, uint8_t);
	int		(*ctx_throttle)(void *);
	void			*ctx_arg;
	unsigned int		 ctx_count;
	uint8_t			 ctx_aid;
};
LIST_HEAD(, rib_context) rib_dumps = LIST_HEAD_INITIALIZER(rib_dumps);

static void prefix_dump_r(struct rib_context *);

static inline struct rib_entry *
re_lock(struct rib_entry *re)
{
	if (re->lock != 0)
		log_warnx("%s: entry already locked", __func__);
	re->lock = 1;
	return re;
}

static inline struct rib_entry *
re_unlock(struct rib_entry *re)
{
	if (re->lock == 0)
		log_warnx("%s: entry already unlocked", __func__);
	re->lock = 0;
	return re;
}

static inline int
re_is_locked(struct rib_entry *re)
{
	return (re->lock != 0);
}

static inline struct prefix *
prefix_lock(struct prefix *p)
{
	if (p->flags & PREFIX_FLAG_LOCKED)
		fatalx("%s: locking locked prefix", __func__);
	p->flags |= PREFIX_FLAG_LOCKED;
	return p;
}

static inline struct prefix *
prefix_unlock(struct prefix *p)
{
	if ((p->flags & PREFIX_FLAG_LOCKED) == 0)
		fatalx("%s: unlocking unlocked prefix", __func__);
	p->flags &= ~PREFIX_FLAG_LOCKED;
	return p;
}

static inline int
prefix_is_locked(struct prefix *p)
{
	return (p->flags & PREFIX_FLAG_LOCKED) != 0;
}

static inline int
prefix_is_dead(struct prefix *p)
{
	return (p->flags & PREFIX_FLAG_DEAD) != 0;
}

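/*
 * Note (editorial): the lock helpers above are single-owner markers, not
 * thread locks.  A suspended dump (rib_dump_r()/prefix_dump_r() below) locks
 * the element it stopped at so that rib_remove()/prefix_adjout_destroy()
 * will not free it; the element is cleaned up when the dump restarts or is
 * aborted.
 */
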
static inline struct rib_tree *
rib_tree(struct rib *rib)
{
	return (&rib->tree);
}

static inline int
rib_compare(const struct rib_entry *a, const struct rib_entry *b)
{
	return (pt_prefix_cmp(a->prefix, b->prefix));
}

/* RIB specific functions */
struct rib *
rib_new(char *name, u_int rtableid, uint16_t flags)
{
	struct rib *new;
	uint16_t id;

	for (id = 0; id < rib_size; id++) {
		if (ribs[id] == NULL)
			break;
	}

	if (id >= rib_size) {
		if ((ribs = recallocarray(ribs, id, id + 8,
		    sizeof(struct rib))) == NULL)
			fatal(NULL);
		rib_size = id + 8;
	}

	if ((new = calloc(1, sizeof(*new))) == NULL)
		fatal(NULL);

	strlcpy(new->name, name, sizeof(new->name));
	RB_INIT(rib_tree(new));
	new->state = RECONF_REINIT;
	new->id = id;
	new->flags = flags;
	new->rtableid = rtableid;

	new->in_rules = calloc(1, sizeof(struct filter_head));
	if (new->in_rules == NULL)
		fatal(NULL);
	TAILQ_INIT(new->in_rules);

	ribs[id] = new;

	log_debug("%s: %s -> %u", __func__, name, id);
	return (new);
}

/*
 * This function is only called when the FIB information of a RIB changed.
 * It will flush the FIB if there was one previously and change the fibstate
 * from RECONF_NONE (nothing to do) to either RECONF_RELOAD (reload the FIB)
 * or RECONF_REINIT (rerun the route decision process for every element)
 * depending on the new flags.
 */
int
rib_update(struct rib *rib)
{
	/* flush fib first if there was one */
	if ((rib->flags & (F_RIB_NOFIB | F_RIB_NOEVALUATE)) == 0)
		rde_send_kroute_flush(rib);

	/* if the noevaluate state changed then a full reinit is needed */
	if ((rib->flags & F_RIB_NOEVALUATE) !=
	    (rib->flags_tmp & F_RIB_NOEVALUATE))
		rib->fibstate = RECONF_REINIT;

	rib->flags = rib->flags_tmp;
	rib->rtableid = rib->rtableid_tmp;

	/* reload fib if there is no reinit pending and there will be a fib */
	if (rib->fibstate != RECONF_REINIT &&
	    (rib->flags & (F_RIB_NOFIB | F_RIB_NOEVALUATE)) == 0)
		rib->fibstate = RECONF_RELOAD;

	return (rib->fibstate == RECONF_REINIT);
}

struct rib *
rib_byid(uint16_t id)
{
	if (id == RIB_NOTFOUND || id >= rib_size || ribs[id] == NULL)
		return NULL;
	return ribs[id];
}

uint16_t
rib_find(char *name)
{
	uint16_t id;

	/* no name returns the first Loc-RIB */
	if (name == NULL || *name == '\0')
		return RIB_LOC_START;

	for (id = 0; id < rib_size; id++) {
		if (ribs[id] != NULL && !strcmp(ribs[id]->name, name))
			return id;
	}

	return RIB_NOTFOUND;
}

void
rib_free(struct rib *rib)
{
	struct rib_entry *re, *xre;
	struct prefix *p;

	rib_dump_abort(rib->id);

	/*
	 * flush the rib, disable route evaluation and fib sync to speed up
	 * the prefix removal. Nothing depends on this data anymore.
	 */
	if ((rib->flags & (F_RIB_NOFIB | F_RIB_NOEVALUATE)) == 0)
		rde_send_kroute_flush(rib);
	rib->flags |= F_RIB_NOEVALUATE | F_RIB_NOFIB;

	for (re = RB_MIN(rib_tree, rib_tree(rib)); re != NULL; re = xre) {
		xre = RB_NEXT(rib_tree, rib_tree(rib), re);

		/*
		 * Removing the prefixes is tricky because the last one
		 * will remove the rib_entry as well and because we do
		 * an empty check in prefix_destroy() it is not possible to
		 * use the default for loop.
		 */
		while ((p = TAILQ_FIRST(&re->prefix_h))) {
			struct rde_aspath *asp = prefix_aspath(p);
			if (asp && asp->pftableid)
				rde_pftable_del(asp->pftableid, p);
			prefix_destroy(p);
		}
	}
	if (rib->id <= RIB_LOC_START)
		return; /* never remove the default ribs */
	filterlist_free(rib->in_rules_tmp);
	filterlist_free(rib->in_rules);
	ribs[rib->id] = NULL;
	free(rib);
}

void
rib_shutdown(void)
{
	struct rib *rib;
	uint16_t id;

	for (id = 0; id < rib_size; id++) {
		rib = rib_byid(id);
		if (rib == NULL)
			continue;
		if (!RB_EMPTY(rib_tree(ribs[id]))) {
			log_warnx("%s: rib %s is not empty", __func__,
			    ribs[id]->name);
		}
		rib_free(ribs[id]);
	}
	for (id = 0; id <= RIB_LOC_START; id++) {
		rib = rib_byid(id);
		if (rib == NULL)
			continue;
		filterlist_free(rib->in_rules_tmp);
		filterlist_free(rib->in_rules);
		ribs[id] = NULL;
		free(rib);
	}
	free(ribs);
}

struct rib_entry *
rib_get(struct rib *rib, struct bgpd_addr *prefix, int prefixlen)
{
	struct rib_entry xre, *re;

	memset(&xre, 0, sizeof(xre));
	xre.prefix = pt_fill(prefix, prefixlen);

	re = RB_FIND(rib_tree, rib_tree(rib), &xre);
	if (re && re->rib_id != rib->id)
		fatalx("%s: Unexpected RIB %u != %u.", __func__,
		    re->rib_id, rib->id);
	return re;
}

struct rib_entry *
rib_match(struct rib *rib, struct bgpd_addr *addr)
{
	struct rib_entry *re;
	int i;

	switch (addr->aid) {
	case AID_INET:
	case AID_VPN_IPv4:
		for (i = 32; i >= 0; i--) {
			re = rib_get(rib, addr, i);
			if (re != NULL)
				return (re);
		}
		break;
	case AID_INET6:
	case AID_VPN_IPv6:
		for (i = 128; i >= 0; i--) {
			re = rib_get(rib, addr, i);
			if (re != NULL)
				return (re);
		}
		break;
	default:
		fatalx("%s: unknown af", __func__);
	}
	return (NULL);
}

struct rib_entry *
rib_add(struct rib *rib, struct bgpd_addr *prefix, int prefixlen)
{
	struct pt_entry *pte;
	struct rib_entry *re;

	pte = pt_get(prefix, prefixlen);
	if (pte == NULL)
		pte = pt_add(prefix, prefixlen);

	if ((re = calloc(1, sizeof(*re))) == NULL)
		fatal("rib_add");

	TAILQ_INIT(&re->prefix_h);
	re->prefix = pt_ref(pte);
	re->rib_id = rib->id;

	if (RB_INSERT(rib_tree, rib_tree(rib), re) != NULL) {
		log_warnx("rib_add: insert failed");
		free(re);
		return (NULL);
	}

	rdemem.rib_cnt++;

	return (re);
}

void
rib_remove(struct rib_entry *re)
{
	if (!rib_empty(re))
		fatalx("rib_remove: entry not empty");

	if (re_is_locked(re))
		/* entry is locked, don't free it. */
		return;

	pt_unref(re->prefix);

	if (RB_REMOVE(rib_tree, rib_tree(re_rib(re)), re) == NULL)
		log_warnx("rib_remove: remove failed.");

	free(re);
	rdemem.rib_cnt--;
}

int
rib_empty(struct rib_entry *re)
{
	return TAILQ_EMPTY(&re->prefix_h);
}

static struct rib_entry *
rib_restart(struct rib_context *ctx)
{
	struct rib_entry *re;

	re = re_unlock(ctx->ctx_re);

	/* find first non empty element */
	while (re && rib_empty(re))
		re = RB_NEXT(rib_tree, unused, re);

	/* free the previously locked rib element if empty */
	if (rib_empty(ctx->ctx_re))
		rib_remove(ctx->ctx_re);
	ctx->ctx_re = NULL;
	return (re);
}

static void
rib_dump_r(struct rib_context *ctx)
{
	struct rib_entry *re, *next;
	struct rib *rib;
	unsigned int i;

	rib = rib_byid(ctx->ctx_id);
	if (rib == NULL)
		fatalx("%s: rib id %u gone", __func__, ctx->ctx_id);

	if (ctx->ctx_re == NULL)
		re = RB_MIN(rib_tree, rib_tree(rib));
	else
		re = rib_restart(ctx);

	for (i = 0; re != NULL; re = next) {
		next = RB_NEXT(rib_tree, unused, re);
		if (re->rib_id != ctx->ctx_id)
			fatalx("%s: Unexpected RIB %u != %u.", __func__,
			    re->rib_id, ctx->ctx_id);
		if (ctx->ctx_aid != AID_UNSPEC &&
		    ctx->ctx_aid != re->prefix->aid)
			continue;
		if (ctx->ctx_count && i++ >= ctx->ctx_count &&
		    !re_is_locked(re)) {
			/* store and lock last element */
			ctx->ctx_re = re_lock(re);
			return;
		}
		ctx->ctx_rib_call(re, ctx->ctx_arg);
	}

	if (ctx->ctx_done)
		ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid);
	LIST_REMOVE(ctx, entry);
	free(ctx);
}

int
rib_dump_pending(void)
{
	struct rib_context *ctx;

	/* return true if at least one context is not throttled */
	LIST_FOREACH(ctx, &rib_dumps, entry) {
		if (ctx->ctx_throttle && ctx->ctx_throttle(ctx->ctx_arg))
			continue;
		return 1;
	}
	return 0;
}

void
rib_dump_runner(void)
{
	struct rib_context *ctx, *next;

	LIST_FOREACH_SAFE(ctx, &rib_dumps, entry, next) {
		if (ctx->ctx_throttle && ctx->ctx_throttle(ctx->ctx_arg))
			continue;
		if (ctx->ctx_rib_call != NULL)
			rib_dump_r(ctx);
		else
			prefix_dump_r(ctx);
	}
}

static void
rib_dump_abort(uint16_t id)
{
	struct rib_context *ctx, *next;

	LIST_FOREACH_SAFE(ctx, &rib_dumps, entry, next) {
		if (id != ctx->ctx_id)
			continue;
		if (ctx->ctx_done)
			ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid);
		if (ctx->ctx_re && rib_empty(re_unlock(ctx->ctx_re)))
			rib_remove(ctx->ctx_re);
		if (ctx->ctx_p && prefix_is_dead(prefix_unlock(ctx->ctx_p)))
			prefix_adjout_destroy(ctx->ctx_p);
		LIST_REMOVE(ctx, entry);
		free(ctx);
	}
}

void
rib_dump_terminate(void *arg)
{
	struct rib_context *ctx, *next;

	LIST_FOREACH_SAFE(ctx, &rib_dumps, entry, next) {
		if (ctx->ctx_arg != arg)
			continue;
		if (ctx->ctx_done)
			ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid);
		if (ctx->ctx_re && rib_empty(re_unlock(ctx->ctx_re)))
			rib_remove(ctx->ctx_re);
		if (ctx->ctx_p && prefix_is_dead(prefix_unlock(ctx->ctx_p)))
			prefix_adjout_destroy(ctx->ctx_p);
		LIST_REMOVE(ctx, entry);
		free(ctx);
	}
}

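/*
 * Illustrative use of the dump interface below (editorial sketch; the
 * callback names and the chunk size are made up): a caller registers a
 * context with rib_dump_new() and the main RDE event loop drains it via
 * rib_dump_pending() and rib_dump_runner().
 *
 *	if (rib_dump_new(RIB_LOC_START, AID_UNSPEC, 64, req,
 *	    show_upcall, show_done, show_throttle) == -1)
 *		log_warn("rib_dump_new");
 *
 * With count == 0 the whole walk happens synchronously inside
 * rib_dump_new(); otherwise it proceeds in count-sized chunks, locking the
 * entry it stopped at between runs.
 */
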
int
rib_dump_new(uint16_t id, uint8_t aid, unsigned int count, void *arg,
    void (*upcall)(struct rib_entry *, void *), void (*done)(void *, uint8_t),
    int (*throttle)(void *))
{
	struct rib_context *ctx;

	if ((ctx = calloc(1, sizeof(*ctx))) == NULL)
		return -1;
	ctx->ctx_id = id;
	ctx->ctx_aid = aid;
	ctx->ctx_count = count;
	ctx->ctx_arg = arg;
	ctx->ctx_rib_call = upcall;
	ctx->ctx_done = done;
	ctx->ctx_throttle = throttle;

	LIST_INSERT_HEAD(&rib_dumps, ctx, entry);

	/* requested a sync traversal */
	if (count == 0)
		rib_dump_r(ctx);

	return 0;
}

/* path specific functions */

static struct rde_aspath *path_lookup(struct rde_aspath *);
static uint64_t path_hash(struct rde_aspath *);
static void path_link(struct rde_aspath *);
static void path_unlink(struct rde_aspath *);

struct path_table {
	struct aspath_head	*path_hashtbl;
	uint64_t		 path_hashmask;
} pathtable;

SIPHASH_KEY pathtablekey;

#define	PATH_HASH(x)	&pathtable.path_hashtbl[x & pathtable.path_hashmask]

static inline struct rde_aspath *
path_ref(struct rde_aspath *asp)
{
	if ((asp->flags & F_ATTR_LINKED) == 0)
		fatalx("%s: unlinked object", __func__);
	asp->refcnt++;
	rdemem.path_refs++;

	return asp;
}

static inline void
path_unref(struct rde_aspath *asp)
{
	if (asp == NULL)
		return;
	if ((asp->flags & F_ATTR_LINKED) == 0)
		fatalx("%s: unlinked object", __func__);
	asp->refcnt--;
	rdemem.path_refs--;
	if (asp->refcnt <= 0)
		path_unlink(asp);
}

void
path_init(uint32_t hashsize)
{
	uint32_t hs, i;

	for (hs = 1; hs < hashsize; hs <<= 1)
		;
	pathtable.path_hashtbl = calloc(hs, sizeof(*pathtable.path_hashtbl));
	if (pathtable.path_hashtbl == NULL)
		fatal("path_init");

	for (i = 0; i < hs; i++)
		LIST_INIT(&pathtable.path_hashtbl[i]);

	pathtable.path_hashmask = hs - 1;
	arc4random_buf(&pathtablekey, sizeof(pathtablekey));
}

void
path_shutdown(void)
{
	uint32_t i;

	for (i = 0; i <= pathtable.path_hashmask; i++)
		if (!LIST_EMPTY(&pathtable.path_hashtbl[i]))
			log_warnx("path_free: free non-free table");

	free(pathtable.path_hashtbl);
}

void
path_hash_stats(struct rde_hashstats *hs)
{
	struct rde_aspath *a;
	uint32_t i;
	int64_t n;

	memset(hs, 0, sizeof(*hs));
	strlcpy(hs->name, "path hash", sizeof(hs->name));
	hs->min = LLONG_MAX;
	hs->num = pathtable.path_hashmask + 1;

	for (i = 0; i <= pathtable.path_hashmask; i++) {
		n = 0;
		LIST_FOREACH(a, &pathtable.path_hashtbl[i], path_l)
			n++;
		if (n < hs->min)
			hs->min = n;
		if (n > hs->max)
			hs->max = n;
		hs->sum += n;
		hs->sumq += n * n;
	}
}

int
path_compare(struct rde_aspath *a, struct rde_aspath *b)
{
	int r;

	if (a == NULL && b == NULL)
		return (0);
	else if (b == NULL)
		return (1);
	else if (a == NULL)
		return (-1);
	if ((a->flags & ~F_ATTR_LINKED) > (b->flags & ~F_ATTR_LINKED))
		return (1);
	if ((a->flags & ~F_ATTR_LINKED) < (b->flags & ~F_ATTR_LINKED))
		return (-1);
	if (a->origin > b->origin)
		return (1);
	if (a->origin < b->origin)
		return (-1);
	if (a->med > b->med)
		return (1);
	if (a->med < b->med)
		return (-1);
	if (a->lpref > b->lpref)
		return (1);
	if (a->lpref < b->lpref)
		return (-1);
	if (a->weight > b->weight)
		return (1);
	if (a->weight < b->weight)
		return (-1);
	if (a->rtlabelid > b->rtlabelid)
		return (1);
	if (a->rtlabelid < b->rtlabelid)
		return (-1);
	if (a->pftableid > b->pftableid)
		return (1);
	if (a->pftableid < b->pftableid)
		return (-1);

	r = aspath_compare(a->aspath, b->aspath);
	if (r > 0)
		return (1);
	if (r < 0)
		return (-1);

	return (attr_compare(a, b));
}

static uint64_t
path_hash(struct rde_aspath *asp)
{
	SIPHASH_CTX ctx;
	uint64_t hash;

	SipHash24_Init(&ctx, &pathtablekey);
	SipHash24_Update(&ctx, &asp->aspath_hashstart,
	    (char *)&asp->aspath_hashend - (char *)&asp->aspath_hashstart);

	if (asp->aspath)
		SipHash24_Update(&ctx, asp->aspath->data, asp->aspath->len);

	hash = attr_hash(asp);
	SipHash24_Update(&ctx, &hash, sizeof(hash));

	return (SipHash24_End(&ctx));
}

static struct rde_aspath *
path_lookup(struct rde_aspath *aspath)
{
	struct aspath_head *head;
	struct rde_aspath *asp;
	uint64_t hash;

	hash = path_hash(aspath);
	head = PATH_HASH(hash);

	LIST_FOREACH(asp, head, path_l) {
		if (asp->hash == hash && path_compare(aspath, asp) == 0)
			return (asp);
	}
	return (NULL);
}

/*
 * Link this aspath into the global hash table.
 * The asp had to be allocated with path_get.
 */
static void
path_link(struct rde_aspath *asp)
{
	struct aspath_head *head;

	asp->hash = path_hash(asp);
	head = PATH_HASH(asp->hash);

	LIST_INSERT_HEAD(head, asp, path_l);
	asp->flags |= F_ATTR_LINKED;
}

/*
 * This function can only be called when all prefixes have been removed first.
 * Normally this happens directly out of the prefix removal functions.
 */
static void
path_unlink(struct rde_aspath *asp)
{
	if (asp == NULL)
		return;

	/* make sure no reference is held for this rde_aspath */
	if (asp->refcnt != 0)
		fatalx("%s: still holds references", __func__);

	LIST_REMOVE(asp, path_l);
	asp->flags &= ~F_ATTR_LINKED;

	path_put(asp);
}

/*
 * Copy asp to a new UNLINKED aspath.
 * On dst either path_get() or path_prep() had to be called before.
 */
struct rde_aspath *
path_copy(struct rde_aspath *dst, const struct rde_aspath *src)
{
	dst->aspath = src->aspath;
	if (dst->aspath != NULL) {
		dst->aspath->refcnt++;
		rdemem.aspath_refs++;
	}
	dst->hash = 0;	/* not linked so no hash and no refcnt */
	dst->refcnt = 0;
	dst->flags = src->flags & ~F_ATTR_LINKED;

	dst->med = src->med;
	dst->lpref = src->lpref;
	dst->weight = src->weight;
	dst->rtlabelid = rtlabel_ref(src->rtlabelid);
	dst->pftableid = pftable_ref(src->pftableid);
	dst->origin = src->origin;

	attr_copy(dst, src);

	return (dst);
}

/* initialize or prepare an aspath for use */
struct rde_aspath *
path_prep(struct rde_aspath *asp)
{
	memset(asp, 0, sizeof(*asp));
	asp->origin = ORIGIN_INCOMPLETE;
	asp->lpref = DEFAULT_LPREF;

	return (asp);
}

/* alloc and initialize new entry. May not fail. */
struct rde_aspath *
path_get(void)
{
	struct rde_aspath *asp;

	asp = malloc(sizeof(*asp));
	if (asp == NULL)
		fatal("path_get");
	rdemem.path_cnt++;

	return (path_prep(asp));
}

/* clean up an asp after use (frees all references to sub-objects) */
void
path_clean(struct rde_aspath *asp)
{
	if (asp->flags & F_ATTR_LINKED)
		fatalx("%s: linked object", __func__);

	rtlabel_unref(asp->rtlabelid);
	pftable_unref(asp->pftableid);
	aspath_put(asp->aspath);
	attr_freeall(asp);
}

/* free an unlinked element */
void
path_put(struct rde_aspath *asp)
{
	if (asp == NULL)
		return;

	path_clean(asp);

	rdemem.path_cnt--;
	free(asp);
}

/* prefix specific functions */

static int	prefix_add(struct bgpd_addr *, int, struct rib *,
		    struct rde_peer *, uint32_t, uint32_t, struct rde_aspath *,
		    struct rde_community *, struct nexthop *,
		    uint8_t, uint8_t);
static int	prefix_move(struct prefix *, struct rde_peer *,
		    struct rde_aspath *, struct rde_community *,
		    struct nexthop *, uint8_t, uint8_t);

static void	prefix_link(struct prefix *, struct rib_entry *,
		    struct pt_entry *, struct rde_peer *, uint32_t, uint32_t,
		    struct rde_aspath *, struct rde_community *,
		    struct nexthop *, uint8_t, uint8_t);
static void	prefix_unlink(struct prefix *);

static struct prefix	*prefix_alloc(void);
static void		 prefix_free(struct prefix *);

/* RB tree comparison function */
static inline int
prefix_index_cmp(struct prefix *a, struct prefix *b)
{
	int r;
	r = pt_prefix_cmp(a->pt, b->pt);
	if (r != 0)
		return r;

	if (a->path_id_tx > b->path_id_tx)
		return 1;
	if (a->path_id_tx < b->path_id_tx)
		return -1;
	return 0;
}

static inline int
prefix_cmp(struct prefix *a, struct prefix *b)
{
	if ((a->flags & PREFIX_FLAG_EOR) != (b->flags & PREFIX_FLAG_EOR))
		return (a->flags & PREFIX_FLAG_EOR) ? 1 : -1;
	/* if EOR marker no need to check the rest */
	if (a->flags & PREFIX_FLAG_EOR)
		return 0;

	if (a->aspath != b->aspath)
		return (a->aspath > b->aspath ? 1 : -1);
	if (a->communities != b->communities)
		return (a->communities > b->communities ? 1 : -1);
	if (a->nexthop != b->nexthop)
		return (a->nexthop > b->nexthop ? 1 : -1);
	if (a->nhflags != b->nhflags)
		return (a->nhflags > b->nhflags ? 1 : -1);
	return prefix_index_cmp(a, b);
}

RB_GENERATE(prefix_tree, prefix, entry.tree.update, prefix_cmp)
RB_GENERATE_STATIC(prefix_index, prefix, entry.tree.index, prefix_index_cmp)

/*
 * Search for specified prefix of a peer. Returns NULL if not found.
 */
struct prefix *
prefix_get(struct rib *rib, struct rde_peer *peer, uint32_t path_id,
    struct bgpd_addr *prefix, int prefixlen)
{
	struct rib_entry *re;

	re = rib_get(rib, prefix, prefixlen);
	if (re == NULL)
		return (NULL);
	return (prefix_bypeer(re, peer, path_id));
}

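/*
 * Adj-RIB-Out handling (editorial note): each peer keeps its announced
 * prefixes in the peer->adj_rib_out RB index, keyed by pt_entry and
 * path_id_tx (see prefix_index_cmp() above).  Prefixes with pending
 * messages are in addition linked into the per-AID peer->updates[] and
 * peer->withdraws[] trees from which the update generation code drains
 * them.
 */
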
/*
 * Search for specified prefix in the peer prefix_index.
 * Returns NULL if not found.
 */
struct prefix *
prefix_adjout_get(struct rde_peer *peer, uint32_t path_id_tx,
    struct bgpd_addr *prefix, int prefixlen)
{
	struct prefix xp;

	memset(&xp, 0, sizeof(xp));
	xp.pt = pt_fill(prefix, prefixlen);
	xp.path_id_tx = path_id_tx;

	return RB_FIND(prefix_index, &peer->adj_rib_out, &xp);
}

/*
 * Lookup a prefix without considering path_id in the peer prefix_index.
 * Returns NULL if not found.
 */
struct prefix *
prefix_adjout_lookup(struct rde_peer *peer, struct bgpd_addr *prefix,
    int prefixlen)
{
	struct prefix xp, *np;

	memset(&xp, 0, sizeof(xp));
	xp.pt = pt_fill(prefix, prefixlen);

	np = RB_NFIND(prefix_index, &peer->adj_rib_out, &xp);
	if (np == NULL || pt_prefix_cmp(np->pt, xp.pt) != 0)
		return NULL;
	return np;
}

/*
 * Return next prefix after a lookup that is actually an update.
 */
struct prefix *
prefix_adjout_next(struct rde_peer *peer, struct prefix *p)
{
	struct prefix *np;

	np = RB_NEXT(prefix_index, &peer->adj_rib_out, p);
	if (np == NULL || np->pt != p->pt)
		return NULL;
	return np;
}

/*
 * Lookup addr in the peer prefix_index. Returns first match.
 * Returns NULL if not found.
 */
struct prefix *
prefix_adjout_match(struct rde_peer *peer, struct bgpd_addr *addr)
{
	struct prefix *p;
	int i;

	switch (addr->aid) {
	case AID_INET:
	case AID_VPN_IPv4:
		for (i = 32; i >= 0; i--) {
			p = prefix_adjout_lookup(peer, addr, i);
			if (p != NULL)
				return p;
		}
		break;
	case AID_INET6:
	case AID_VPN_IPv6:
		for (i = 128; i >= 0; i--) {
			p = prefix_adjout_lookup(peer, addr, i);
			if (p != NULL)
				return p;
		}
		break;
	default:
		fatalx("%s: unknown af", __func__);
	}
	return NULL;
}

/*
 * Update a prefix.
 * Return 1 if prefix was newly added, 0 if it was just changed.
 */
int
prefix_update(struct rib *rib, struct rde_peer *peer, uint32_t path_id,
    uint32_t path_id_tx, struct filterstate *state, struct bgpd_addr *prefix,
    int prefixlen, uint8_t vstate)
{
	struct rde_aspath *asp, *nasp = &state->aspath;
	struct rde_community *comm, *ncomm = &state->communities;
	struct prefix *p;

	/*
	 * First try to find a prefix in the specified RIB.
	 */
	if ((p = prefix_get(rib, peer, path_id, prefix, prefixlen)) != NULL) {
		if (path_id_tx != p->path_id_tx)
			fatalx("path_id mismatch");
		if (prefix_nexthop(p) == state->nexthop &&
		    prefix_nhflags(p) == state->nhflags &&
		    communities_equal(ncomm, prefix_communities(p)) &&
		    path_compare(nasp, prefix_aspath(p)) == 0) {
			/* no change, update last change */
			p->lastchange = getmonotime();
			p->validation_state = vstate;
			return (0);
		}
	}

	/*
	 * Either the prefix does not exist or the path changed.
	 * In both cases lookup the new aspath to make sure it is not
	 * already in the RIB.
	 */
	if ((asp = path_lookup(nasp)) == NULL) {
		/* Path not available, create and link a new one. */
		asp = path_copy(path_get(), nasp);
		path_link(asp);
	}

	if ((comm = communities_lookup(ncomm)) == NULL) {
		/* Communities not available, create and link a new one. */
		comm = communities_link(ncomm);
	}

	/* If the prefix was found move it else add it to the RIB. */
	if (p != NULL)
		return (prefix_move(p, peer, asp, comm, state->nexthop,
		    state->nhflags, vstate));
	else
		return (prefix_add(prefix, prefixlen, rib, peer, path_id,
		    path_id_tx, asp, comm, state->nexthop, state->nhflags,
		    vstate));
}

/*
 * Adds or updates a prefix.
 */
static int
prefix_add(struct bgpd_addr *prefix, int prefixlen, struct rib *rib,
    struct rde_peer *peer, uint32_t path_id, uint32_t path_id_tx,
    struct rde_aspath *asp, struct rde_community *comm,
    struct nexthop *nexthop, uint8_t nhflags, uint8_t vstate)
{
	struct prefix *p;
	struct rib_entry *re;

	re = rib_get(rib, prefix, prefixlen);
	if (re == NULL)
		re = rib_add(rib, prefix, prefixlen);

	p = prefix_alloc();
	prefix_link(p, re, re->prefix, peer, path_id, path_id_tx, asp, comm,
	    nexthop, nhflags, vstate);

	/* add possible pftable reference from aspath */
	if (asp && asp->pftableid)
		rde_pftable_add(asp->pftableid, p);
	/* make route decision */
	prefix_evaluate(re, p, NULL);
	return (1);
}

/*
 * Move the prefix to the specified aspath, remove the old asp if needed.
 */
static int
prefix_move(struct prefix *p, struct rde_peer *peer,
    struct rde_aspath *asp, struct rde_community *comm,
    struct nexthop *nexthop, uint8_t nhflags, uint8_t vstate)
{
	struct prefix *np;

	if (p->flags & PREFIX_FLAG_ADJOUT)
		fatalx("%s: prefix with PREFIX_FLAG_ADJOUT hit", __func__);

	if (peer != prefix_peer(p))
		fatalx("prefix_move: cross peer move");

	/* add new prefix node */
	np = prefix_alloc();
	prefix_link(np, prefix_re(p), p->pt, peer, p->path_id, p->path_id_tx,
	    asp, comm, nexthop, nhflags, vstate);

	/* add possible pftable reference from new aspath */
	if (asp && asp->pftableid)
		rde_pftable_add(asp->pftableid, np);

	/*
	 * no need to update the peer prefix count because we are only moving
	 * the prefix without changing the peer.
	 */
	prefix_evaluate(prefix_re(np), np, p);

	/* remove possible pftable reference from old path */
	if (p->aspath && p->aspath->pftableid)
		rde_pftable_del(p->aspath->pftableid, p);

	/* remove old prefix node */
	prefix_unlink(p);
	prefix_free(p);

	return (0);
}

/*
 * Removes a prefix from the specified RIB. If the parent objects -- rib_entry
 * or pt_entry -- become empty remove them too.
 */
int
prefix_withdraw(struct rib *rib, struct rde_peer *peer, uint32_t path_id,
    struct bgpd_addr *prefix, int prefixlen)
{
	struct prefix *p;
	struct rde_aspath *asp;

	p = prefix_get(rib, peer, path_id, prefix, prefixlen);
	if (p == NULL)		/* got a dummy withdraw request */
		return (0);

	if (p->flags & PREFIX_FLAG_ADJOUT)
		fatalx("%s: prefix with PREFIX_FLAG_ADJOUT hit", __func__);
	asp = prefix_aspath(p);
	if (asp && asp->pftableid)
		/* only prefixes in the local RIB were pushed into pf */
		rde_pftable_del(asp->pftableid, p);

	prefix_destroy(p);

	return (1);
}

/*
 * Insert an End-of-RIB marker into the update queue.
 */
void
prefix_add_eor(struct rde_peer *peer, uint8_t aid)
{
	struct prefix *p;

	p = prefix_alloc();
	p->flags = PREFIX_FLAG_ADJOUT | PREFIX_FLAG_UPDATE | PREFIX_FLAG_EOR;
	if (RB_INSERT(prefix_tree, &peer->updates[aid], p) != NULL)
		/* no need to add if EoR marker already present */
		prefix_free(p);
	/* EOR marker is not inserted into the adj_rib_out index */
}

/*
 * Put a prefix from the Adj-RIB-Out onto the update queue.
 */
void
prefix_adjout_update(struct prefix *p, struct rde_peer *peer,
    struct filterstate *state, struct bgpd_addr *prefix, int prefixlen,
    uint32_t path_id_tx, uint8_t vstate)
{
	struct rde_aspath *asp;
	struct rde_community *comm;

	if (p == NULL) {
		p = prefix_alloc();
		/* initially mark DEAD so code below is skipped */
		p->flags |= PREFIX_FLAG_ADJOUT | PREFIX_FLAG_DEAD;

		p->pt = pt_get(prefix, prefixlen);
		if (p->pt == NULL)
			p->pt = pt_add(prefix, prefixlen);
		pt_ref(p->pt);
		p->peer = peer;
		p->path_id_tx = path_id_tx;

		if (RB_INSERT(prefix_index, &peer->adj_rib_out, p) != NULL)
			fatalx("%s: RB index invariant violated", __func__);
	}

	if ((p->flags & PREFIX_FLAG_ADJOUT) == 0)
		fatalx("%s: prefix without PREFIX_FLAG_ADJOUT hit", __func__);
	if ((p->flags & (PREFIX_FLAG_WITHDRAW | PREFIX_FLAG_DEAD)) == 0) {
		/*
		 * XXX for now treat a different path_id_tx like different
		 * attributes and force out an update. It is unclear how
		 * common it is to have equivalent updates from alternative
		 * paths.
		 */
		if (p->path_id_tx == path_id_tx &&
		    prefix_nhflags(p) == state->nhflags &&
		    prefix_nexthop(p) == state->nexthop &&
		    communities_equal(&state->communities,
		    prefix_communities(p)) &&
		    path_compare(&state->aspath, prefix_aspath(p)) == 0) {
			/* nothing changed */
			p->validation_state = vstate;
			p->lastchange = getmonotime();
			p->flags &= ~PREFIX_FLAG_STALE;
			return;
		}

		/* if pending update unhook it before it is unlinked */
		if (p->flags & PREFIX_FLAG_UPDATE) {
			RB_REMOVE(prefix_tree, &peer->updates[prefix->aid], p);
			peer->up_nlricnt--;
		}

		/* unlink prefix so it can be relinked below */
		prefix_unlink(p);
		peer->prefix_out_cnt--;
	}
	if (p->flags & PREFIX_FLAG_WITHDRAW) {
		RB_REMOVE(prefix_tree, &peer->withdraws[prefix->aid], p);
		peer->up_wcnt--;
	}

	/* nothing needs to be done for PREFIX_FLAG_DEAD and STALE */
	p->flags &= ~PREFIX_FLAG_MASK;

	/* update path_id_tx now that the prefix is unlinked */
	if (p->path_id_tx != path_id_tx) {
		/* path_id_tx is part of the index so remove and re-insert p */
		RB_REMOVE(prefix_index, &peer->adj_rib_out, p);
		p->path_id_tx = path_id_tx;
		if (RB_INSERT(prefix_index, &peer->adj_rib_out, p) != NULL)
			fatalx("%s: RB index invariant violated", __func__);
	}

	if ((asp = path_lookup(&state->aspath)) == NULL) {
		/* Path not available, create and link a new one. */
		asp = path_copy(path_get(), &state->aspath);
		path_link(asp);
	}

	if ((comm = communities_lookup(&state->communities)) == NULL) {
		/* Communities not available, create and link a new one. */
		comm = communities_link(&state->communities);
	}

	prefix_link(p, NULL, p->pt, peer, 0, p->path_id_tx, asp, comm,
	    state->nexthop, state->nhflags, vstate);
	peer->prefix_out_cnt++;

	if (p->flags & PREFIX_FLAG_MASK)
		fatalx("%s: bad flags %x", __func__, p->flags);
	p->flags |= PREFIX_FLAG_UPDATE;
	if (RB_INSERT(prefix_tree, &peer->updates[prefix->aid], p) != NULL)
		fatalx("%s: RB tree invariant violated", __func__);
	peer->up_nlricnt++;
}

/*
 * Withdraw a prefix from the Adj-RIB-Out, this unlinks the aspath but leaves
 * the prefix in the RIB linked to the peer withdraw list.
 */
void
prefix_adjout_withdraw(struct prefix *p)
{
	struct rde_peer *peer = prefix_peer(p);

	if ((p->flags & PREFIX_FLAG_ADJOUT) == 0)
		fatalx("%s: prefix without PREFIX_FLAG_ADJOUT hit", __func__);

	/* already a withdraw, shortcut */
	if (p->flags & PREFIX_FLAG_WITHDRAW) {
		p->lastchange = getmonotime();
		p->flags &= ~PREFIX_FLAG_STALE;
		return;
	}
	/* pending update just got withdrawn */
	if (p->flags & PREFIX_FLAG_UPDATE) {
		RB_REMOVE(prefix_tree, &peer->updates[p->pt->aid], p);
		peer->up_nlricnt--;
	}
	/* unlink prefix if it was linked (not a withdraw or dead) */
	if ((p->flags & (PREFIX_FLAG_WITHDRAW | PREFIX_FLAG_DEAD)) == 0) {
		prefix_unlink(p);
		peer->prefix_out_cnt--;
	}

	/* nothing needs to be done for PREFIX_FLAG_DEAD and STALE */
	p->flags &= ~PREFIX_FLAG_MASK;
	p->lastchange = getmonotime();

	p->flags |= PREFIX_FLAG_WITHDRAW;
	if (RB_INSERT(prefix_tree, &peer->withdraws[p->pt->aid], p) != NULL)
		fatalx("%s: RB tree invariant violated", __func__);
	peer->up_wcnt++;
}

void
prefix_adjout_destroy(struct prefix *p)
{
	struct rde_peer *peer = prefix_peer(p);

	if ((p->flags & PREFIX_FLAG_ADJOUT) == 0)
		fatalx("%s: prefix without PREFIX_FLAG_ADJOUT hit", __func__);

	if (p->flags & PREFIX_FLAG_EOR) {
		/* EOR marker is not linked in the index */
		prefix_free(p);
		return;
	}

	if (p->flags & PREFIX_FLAG_WITHDRAW) {
		RB_REMOVE(prefix_tree, &peer->withdraws[p->pt->aid], p);
		peer->up_wcnt--;
	}
	if (p->flags & PREFIX_FLAG_UPDATE) {
		RB_REMOVE(prefix_tree, &peer->updates[p->pt->aid], p);
		peer->up_nlricnt--;
	}
	/* unlink prefix if it was linked (not a withdraw or dead) */
	if ((p->flags & (PREFIX_FLAG_WITHDRAW | PREFIX_FLAG_DEAD)) == 0) {
		prefix_unlink(p);
		peer->prefix_out_cnt--;
	}

	/* nothing needs to be done for PREFIX_FLAG_DEAD and STALE */
	p->flags &= ~PREFIX_FLAG_MASK;

	if (prefix_is_locked(p)) {
		/* mark prefix dead but leave it for prefix_restart */
		p->flags |= PREFIX_FLAG_DEAD;
	} else {
		RB_REMOVE(prefix_index, &peer->adj_rib_out, p);
		/* remove the last prefix reference before free */
		pt_unref(p->pt);
		prefix_free(p);
	}
}

static struct prefix *
prefix_restart(struct rib_context *ctx)
{
	struct prefix *p;

	p = prefix_unlock(ctx->ctx_p);

	if (prefix_is_dead(p)) {
		struct prefix *next;

		next = RB_NEXT(prefix_index, unused, p);
		prefix_adjout_destroy(p);
		p = next;
	}
	ctx->ctx_p = NULL;
	return p;
}

static void
prefix_dump_r(struct rib_context *ctx)
{
	struct prefix *p, *next;
	struct rde_peer *peer;
	unsigned int i;

	if ((peer = peer_get(ctx->ctx_id)) == NULL)
		goto done;

	if (ctx->ctx_p == NULL)
		p = RB_MIN(prefix_index, &peer->adj_rib_out);
	else
		p = prefix_restart(ctx);

	for (i = 0; p != NULL; p = next) {
		next = RB_NEXT(prefix_index, unused, p);
		if (prefix_is_dead(p))
			continue;
		if (ctx->ctx_aid != AID_UNSPEC &&
		    ctx->ctx_aid != p->pt->aid)
			continue;
		if (ctx->ctx_count && i++ >= ctx->ctx_count &&
		    !prefix_is_locked(p)) {
			/* store and lock last element */
			ctx->ctx_p = prefix_lock(p);
			return;
		}
		ctx->ctx_prefix_call(p, ctx->ctx_arg);
	}

done:
	if (ctx->ctx_done)
		ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid);
	LIST_REMOVE(ctx, entry);
	free(ctx);
}

int
prefix_dump_new(struct rde_peer *peer, uint8_t aid, unsigned int count,
    void *arg, void (*upcall)(struct prefix *, void *),
    void (*done)(void *, uint8_t), int (*throttle)(void *))
{
	struct rib_context *ctx;

	if ((ctx = calloc(1, sizeof(*ctx))) == NULL)
		return -1;
	ctx->ctx_id = peer->conf.id;
	ctx->ctx_aid = aid;
	ctx->ctx_count = count;
	ctx->ctx_arg = arg;
	ctx->ctx_prefix_call = upcall;
	ctx->ctx_done = done;
	ctx->ctx_throttle = throttle;

	LIST_INSERT_HEAD(&rib_dumps, ctx, entry);

	/* requested a sync traversal */
	if (count == 0)
		prefix_dump_r(ctx);

	return 0;
}

/* dump a prefix into specified buffer */
int
prefix_write(u_char *buf, int len, struct bgpd_addr *prefix, uint8_t plen,
    int withdraw)
{
	int totlen, psize;

	switch (prefix->aid) {
	case AID_INET:
	case AID_INET6:
		totlen = PREFIX_SIZE(plen);

		if (totlen > len)
			return (-1);
		*buf++ = plen;
		memcpy(buf, &prefix->ba, totlen - 1);
		return (totlen);
	case AID_VPN_IPv4:
	case AID_VPN_IPv6:
		totlen = PREFIX_SIZE(plen) + sizeof(prefix->rd);
		psize = PREFIX_SIZE(plen) - 1;
		plen += sizeof(prefix->rd) * 8;
		if (withdraw) {
			/* withdraws carry one compat label as placeholder */
			totlen += 3;
			plen += 3 * 8;
		} else {
			totlen += prefix->labellen;
			plen += prefix->labellen * 8;
		}

		if (totlen > len)
			return (-1);
		*buf++ = plen;
		if (withdraw) {
			/* magic compatibility label as per rfc8277 */
			*buf++ = 0x80;
			*buf++ = 0x0;
			*buf++ = 0x0;
		} else {
			memcpy(buf, &prefix->labelstack,
			    prefix->labellen);
			buf += prefix->labellen;
		}
		memcpy(buf, &prefix->rd, sizeof(prefix->rd));
		buf += sizeof(prefix->rd);
		memcpy(buf, &prefix->ba, psize);
		return (totlen);
	default:
		return (-1);
	}
}

int
prefix_writebuf(struct ibuf *buf, struct bgpd_addr *prefix, uint8_t plen)
{
	int totlen;
	void *bptr;

	switch (prefix->aid) {
	case AID_INET:
	case AID_INET6:
		totlen = PREFIX_SIZE(plen);
		break;
	case AID_VPN_IPv4:
	case AID_VPN_IPv6:
		totlen = PREFIX_SIZE(plen) + sizeof(prefix->rd) +
		    prefix->labellen;
		break;
	default:
		return (-1);
	}

	if ((bptr = ibuf_reserve(buf, totlen)) == NULL)
		return (-1);
	if (prefix_write(bptr, totlen, prefix, plen, 0) == -1)
		return (-1);
	return (0);
}

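/*
 * Layout note (editorial) for the VPN AIDs written by prefix_write() above
 * (RFC 8277): one length octet counting bits, followed by the label stack
 * (3 bytes per label, or the 0x800000 compatibility label for withdraws),
 * the 8-byte route distinguisher and finally the prefix bytes themselves.
 */
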
/*
 * Searches in the prefix list of specified rib_entry for a prefix entry
 * belonging to the given peer. Returns NULL if no match found.
 */
struct prefix *
prefix_bypeer(struct rib_entry *re, struct rde_peer *peer, uint32_t path_id)
{
	struct prefix *p;

	TAILQ_FOREACH(p, &re->prefix_h, entry.list.rib)
		if (prefix_peer(p) == peer && p->path_id == path_id)
			return (p);
	return (NULL);
}

static void
prefix_evaluate_all(struct prefix *p, enum nexthop_state state,
    enum nexthop_state oldstate)
{
	struct rib_entry *re = prefix_re(p);

	/* Skip non local-RIBs or RIBs that are flagged as noeval. */
	if (re_rib(re)->flags & F_RIB_NOEVALUATE) {
		log_warnx("%s: prefix with F_RIB_NOEVALUATE hit", __func__);
		return;
	}

	if (oldstate == state) {
		/*
		 * The state of the nexthop did not change. The only
		 * thing that may have changed is the true_nexthop
		 * or other internal info. This will not change
		 * the routing decision so shortcut here.
		 */
		if (state == NEXTHOP_REACH) {
			if ((re_rib(re)->flags & F_RIB_NOFIB) == 0 &&
			    p == prefix_best(re))
				rde_send_kroute(re_rib(re), p, NULL);
		}
		return;
	}

	/* redo the route decision */
	prefix_evaluate(prefix_re(p), p, p);
}

/* kill a prefix. */
void
prefix_destroy(struct prefix *p)
{
	/* make route decision */
	prefix_evaluate(prefix_re(p), NULL, p);

	prefix_unlink(p);
	prefix_free(p);
}

/*
 * Link a prefix into the different parent objects.
 */
static void
prefix_link(struct prefix *p, struct rib_entry *re, struct pt_entry *pt,
    struct rde_peer *peer, uint32_t path_id, uint32_t path_id_tx,
    struct rde_aspath *asp, struct rde_community *comm,
    struct nexthop *nexthop, uint8_t nhflags, uint8_t vstate)
{
	if (re)
		p->entry.list.re = re;
	p->aspath = path_ref(asp);
	p->communities = communities_ref(comm);
	p->peer = peer;
	p->pt = pt_ref(pt);
	p->path_id = path_id;
	p->path_id_tx = path_id_tx;
	p->validation_state = vstate;
	p->nhflags = nhflags;
	p->nexthop = nexthop_ref(nexthop);
	nexthop_link(p);
	p->lastchange = getmonotime();
}

/*
 * Unlink a prefix from the different parent objects.
 */
static void
prefix_unlink(struct prefix *p)
{
	struct rib_entry *re = prefix_re(p);

	/* destroy all references to other objects */
	/* remove nexthop ref ... */
	nexthop_unlink(p);
	nexthop_unref(p->nexthop);
	p->nexthop = NULL;
	p->nhflags = 0;
	/* ... communities ... */
	communities_unref(p->communities);
	p->communities = NULL;
	/* and unlink from aspath */
	path_unref(p->aspath);
	p->aspath = NULL;

	if (re && rib_empty(re))
		rib_remove(re);

	pt_unref(p->pt);
}

/* alloc and zero new entry. May not fail. */
static struct prefix *
prefix_alloc(void)
{
	struct prefix *p;

	p = calloc(1, sizeof(*p));
	if (p == NULL)
		fatal("prefix_alloc");
	rdemem.prefix_cnt++;
	return p;
}

/* free an unlinked entry */
static void
prefix_free(struct prefix *p)
{
	rdemem.prefix_cnt--;
	free(p);
}

/*
 * nexthop functions
 */
struct nexthop_head	*nexthop_hash(struct bgpd_addr *);
struct nexthop		*nexthop_lookup(struct bgpd_addr *);

/*
 * In BGP there exist two nexthops: the exit nexthop which was announced via
 * BGP and the true nexthop which is used in the FIB -- forward information
 * base a.k.a kernel routing table. When sending updates it is even more
 * confusing. In IBGP we pass the unmodified exit nexthop to the neighbors
 * while in EBGP normally the address of the router is sent. The exit nexthop
 * may be passed to the external neighbor if the neighbor and the exit nexthop
 * reside in the same subnet -- directly connected.
 */
struct nexthop_table {
	LIST_HEAD(nexthop_head, nexthop)	*nexthop_hashtbl;
	uint32_t				 nexthop_hashmask;
} nexthoptable;

SIPHASH_KEY nexthoptablekey;

TAILQ_HEAD(nexthop_queue, nexthop)	nexthop_runners;

void
nexthop_init(uint32_t hashsize)
{
	uint32_t hs, i;

	for (hs = 1; hs < hashsize; hs <<= 1)
		;
	nexthoptable.nexthop_hashtbl = calloc(hs, sizeof(struct nexthop_head));
	if (nexthoptable.nexthop_hashtbl == NULL)
		fatal("nexthop_init");

	TAILQ_INIT(&nexthop_runners);
	for (i = 0; i < hs; i++)
		LIST_INIT(&nexthoptable.nexthop_hashtbl[i]);
	arc4random_buf(&nexthoptablekey, sizeof(nexthoptablekey));

	nexthoptable.nexthop_hashmask = hs - 1;
}

void
nexthop_shutdown(void)
{
	uint32_t i;
	struct nexthop *nh, *nnh;

	for (i = 0; i <= nexthoptable.nexthop_hashmask; i++) {
		for (nh = LIST_FIRST(&nexthoptable.nexthop_hashtbl[i]);
		    nh != NULL; nh = nnh) {
			nnh = LIST_NEXT(nh, nexthop_l);
			nh->state = NEXTHOP_UNREACH;
			nexthop_unref(nh);
		}
		if (!LIST_EMPTY(&nexthoptable.nexthop_hashtbl[i])) {
			nh = LIST_FIRST(&nexthoptable.nexthop_hashtbl[i]);
			log_warnx("nexthop_shutdown: non-free table, "
			    "nexthop %s refcnt %d",
			    log_addr(&nh->exit_nexthop), nh->refcnt);
		}
	}

	free(nexthoptable.nexthop_hashtbl);
}

int
nexthop_pending(void)
{
	return !TAILQ_EMPTY(&nexthop_runners);
}

void
nexthop_runner(void)
{
	struct nexthop *nh;
	struct prefix *p;
	uint32_t j;

	nh = TAILQ_FIRST(&nexthop_runners);
	if (nh == NULL)
		return;

	/* remove from runner queue */
	TAILQ_REMOVE(&nexthop_runners, nh, runner_l);

	p = nh->next_prefix;
	for (j = 0; p != NULL && j < RDE_RUNNER_ROUNDS; j++) {
		prefix_evaluate_all(p, nh->state, nh->oldstate);
		p = LIST_NEXT(p, entry.list.nexthop);
	}

	/* prep for next run, if not finished re-add to tail of queue */
	nh->next_prefix = p;
	if (p != NULL)
		TAILQ_INSERT_TAIL(&nexthop_runners, nh, runner_l);
	else
		log_debug("nexthop %s update finished",
		    log_addr(&nh->exit_nexthop));
}

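/*
 * Editorial note: nexthop_update() below is driven by kroute_nexthop
 * answers from the parent process.  nexthop_get() requests a lookup via
 * rde_send_nexthop() and the result lands here, where all prefixes using
 * this nexthop are queued for re-evaluation through the runner above.
 */
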
log_warnx("nexthop_update: non-existent nexthop %s", 1748 log_addr(&msg->nexthop)); 1749 return; 1750 } 1751 1752 nh->oldstate = nh->state; 1753 if (msg->valid) 1754 nh->state = NEXTHOP_REACH; 1755 else 1756 nh->state = NEXTHOP_UNREACH; 1757 1758 if (nh->oldstate == NEXTHOP_LOOKUP) 1759 /* drop reference which was hold during the lookup */ 1760 if (nexthop_unref(nh)) 1761 return; /* nh lost last ref, no work left */ 1762 1763 if (nh->next_prefix) { 1764 /* 1765 * If nexthop_runner() is not finished with this nexthop 1766 * then ensure that all prefixes are updated by setting 1767 * the oldstate to NEXTHOP_FLAPPED. 1768 */ 1769 nh->oldstate = NEXTHOP_FLAPPED; 1770 TAILQ_REMOVE(&nexthop_runners, nh, runner_l); 1771 } 1772 1773 if (msg->connected) { 1774 nh->flags |= NEXTHOP_CONNECTED; 1775 memcpy(&nh->true_nexthop, &nh->exit_nexthop, 1776 sizeof(nh->true_nexthop)); 1777 } else 1778 memcpy(&nh->true_nexthop, &msg->gateway, 1779 sizeof(nh->true_nexthop)); 1780 1781 memcpy(&nh->nexthop_net, &msg->net, 1782 sizeof(nh->nexthop_net)); 1783 nh->nexthop_netlen = msg->netlen; 1784 1785 nh->next_prefix = LIST_FIRST(&nh->prefix_h); 1786 if (nh->next_prefix != NULL) { 1787 TAILQ_INSERT_HEAD(&nexthop_runners, nh, runner_l); 1788 log_debug("nexthop %s update starting", 1789 log_addr(&nh->exit_nexthop)); 1790 } 1791 } 1792 1793 void 1794 nexthop_modify(struct nexthop *setnh, enum action_types type, uint8_t aid, 1795 struct nexthop **nexthop, uint8_t *flags) 1796 { 1797 switch (type) { 1798 case ACTION_SET_NEXTHOP_REJECT: 1799 *flags = NEXTHOP_REJECT; 1800 break; 1801 case ACTION_SET_NEXTHOP_BLACKHOLE: 1802 *flags = NEXTHOP_BLACKHOLE; 1803 break; 1804 case ACTION_SET_NEXTHOP_NOMODIFY: 1805 *flags = NEXTHOP_NOMODIFY; 1806 break; 1807 case ACTION_SET_NEXTHOP_SELF: 1808 *flags = NEXTHOP_SELF; 1809 break; 1810 case ACTION_SET_NEXTHOP_REF: 1811 /* 1812 * it is possible that a prefix matches but has the wrong 1813 * address family for the set nexthop. In this case ignore it. 
		 */
		if (aid != setnh->exit_nexthop.aid)
			break;
		nexthop_unref(*nexthop);
		*nexthop = nexthop_ref(setnh);
		*flags = 0;
		break;
	default:
		break;
	}
}

void
nexthop_link(struct prefix *p)
{
	if (p->nexthop == NULL)
		return;
	if (p->flags & PREFIX_FLAG_ADJOUT)
		return;

	/* no need to link prefixes in RIBs that have no decision process */
	if (re_rib(prefix_re(p))->flags & F_RIB_NOEVALUATE)
		return;

	p->flags |= PREFIX_NEXTHOP_LINKED;
	LIST_INSERT_HEAD(&p->nexthop->prefix_h, p, entry.list.nexthop);
}

void
nexthop_unlink(struct prefix *p)
{
	if (p->nexthop == NULL || (p->flags & PREFIX_NEXTHOP_LINKED) == 0)
		return;

	if (p == p->nexthop->next_prefix) {
		p->nexthop->next_prefix = LIST_NEXT(p, entry.list.nexthop);
		/* remove nexthop from list if no prefixes left to update */
		if (p->nexthop->next_prefix == NULL) {
			TAILQ_REMOVE(&nexthop_runners, p->nexthop, runner_l);
			log_debug("nexthop %s update finished",
			    log_addr(&p->nexthop->exit_nexthop));
		}
	}

	p->flags &= ~PREFIX_NEXTHOP_LINKED;
	LIST_REMOVE(p, entry.list.nexthop);
}

struct nexthop *
nexthop_get(struct bgpd_addr *nexthop)
{
	struct nexthop *nh;

	nh = nexthop_lookup(nexthop);
	if (nh == NULL) {
		nh = calloc(1, sizeof(*nh));
		if (nh == NULL)
			fatal("nexthop_alloc");
		rdemem.nexthop_cnt++;

		LIST_INIT(&nh->prefix_h);
		nh->state = NEXTHOP_LOOKUP;
		nexthop_ref(nh);	/* take reference for lookup */
		nh->exit_nexthop = *nexthop;
		LIST_INSERT_HEAD(nexthop_hash(nexthop), nh,
		    nexthop_l);

		rde_send_nexthop(&nh->exit_nexthop, 1);
	}

	return nexthop_ref(nh);
}

struct nexthop *
nexthop_ref(struct nexthop *nexthop)
{
	if (nexthop)
		nexthop->refcnt++;
	return (nexthop);
}

int
nexthop_unref(struct nexthop *nh)
{
	if (nh == NULL)
		return (0);
	if (--nh->refcnt > 0)
		return (0);

	/* sanity check */
	if (!LIST_EMPTY(&nh->prefix_h) || nh->state == NEXTHOP_LOOKUP)
		fatalx("%s: refcnt error", __func__);

	/* is nexthop update running? impossible, that is a refcnt error */
	if (nh->next_prefix)
		fatalx("%s: next_prefix not NULL", __func__);

	LIST_REMOVE(nh, nexthop_l);
	rde_send_nexthop(&nh->exit_nexthop, 0);

	rdemem.nexthop_cnt--;
	free(nh);
	return (1);
}

int
nexthop_compare(struct nexthop *na, struct nexthop *nb)
{
	struct bgpd_addr *a, *b;

	if (na == nb)
		return (0);
	if (na == NULL)
		return (-1);
	if (nb == NULL)
		return (1);

	a = &na->exit_nexthop;
	b = &nb->exit_nexthop;

	if (a->aid != b->aid)
		return (a->aid - b->aid);

	switch (a->aid) {
	case AID_INET:
		if (ntohl(a->v4.s_addr) > ntohl(b->v4.s_addr))
			return (1);
		if (ntohl(a->v4.s_addr) < ntohl(b->v4.s_addr))
			return (-1);
		return (0);
	case AID_INET6:
		return (memcmp(&a->v6, &b->v6, sizeof(struct in6_addr)));
	default:
		fatalx("nexthop_cmp: unknown af");
	}
	return (-1);
}

struct nexthop *
nexthop_lookup(struct bgpd_addr *nexthop)
{
	struct nexthop *nh;

	LIST_FOREACH(nh, nexthop_hash(nexthop), nexthop_l) {
		if (memcmp(&nh->exit_nexthop, nexthop,
		    sizeof(struct bgpd_addr)) == 0)
			return (nh);
	}
	return (NULL);
}

struct nexthop_head *
nexthop_hash(struct bgpd_addr *nexthop)
{
	uint32_t h = 0;

	switch (nexthop->aid) {
	case AID_INET:
		h = SipHash24(&nexthoptablekey, &nexthop->v4.s_addr,
		    sizeof(nexthop->v4.s_addr));
		break;
	case AID_INET6:
		h = SipHash24(&nexthoptablekey, &nexthop->v6,
		    sizeof(struct in6_addr));
		break;
	default:
		fatalx("nexthop_hash: unsupported AID %d", nexthop->aid);
	}
	return (&nexthoptable.nexthop_hashtbl[h & nexthoptable.nexthop_hashmask]);
}