/* $OpenBSD: rde_rib.c,v 1.221 2021/05/04 09:27:09 claudio Exp $ */

/*
 * Copyright (c) 2003, 2004 Claudio Jeker <claudio@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/types.h>
#include <sys/queue.h>

#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <siphash.h>
#include <time.h>

#include "bgpd.h"
#include "rde.h"
#include "log.h"

/*
 * BGP RIB -- Routing Information Base
 *
 * The RIB is built with one aspect in mind: speed -- actually update speed.
 * Therefore one thing needs to be absolutely avoided: long table walks.
 * This is achieved by heavily linking the different parts together.
 */
u_int16_t rib_size;
struct rib **ribs;

struct rib_entry *rib_add(struct rib *, struct bgpd_addr *, int);
static inline int rib_compare(const struct rib_entry *,
			const struct rib_entry *);
void rib_remove(struct rib_entry *);
int rib_empty(struct rib_entry *);
static void rib_dump_abort(u_int16_t);

RB_PROTOTYPE(rib_tree, rib_entry, rib_e, rib_compare);
RB_GENERATE(rib_tree, rib_entry, rib_e, rib_compare);

struct rib_context {
	LIST_ENTRY(rib_context)		 entry;
	struct rib_entry		*ctx_re;
	struct prefix			*ctx_p;
	u_int32_t			 ctx_id;
	void		(*ctx_rib_call)(struct rib_entry *, void *);
	void		(*ctx_prefix_call)(struct prefix *, void *);
	void		(*ctx_done)(void *, u_int8_t);
	int		(*ctx_throttle)(void *);
	void				*ctx_arg;
	unsigned int			 ctx_count;
	u_int8_t			 ctx_aid;
};
LIST_HEAD(, rib_context) rib_dumps = LIST_HEAD_INITIALIZER(rib_dumps);

static void	prefix_dump_r(struct rib_context *);

static inline struct rib_entry *
re_lock(struct rib_entry *re)
{
	if (re->lock != 0)
		log_warnx("%s: entry already locked", __func__);
	re->lock = 1;
	return re;
}

static inline struct rib_entry *
re_unlock(struct rib_entry *re)
{
	if (re->lock == 0)
		log_warnx("%s: entry already unlocked", __func__);
	re->lock = 0;
	return re;
}

static inline int
re_is_locked(struct rib_entry *re)
{
	return (re->lock != 0);
}

static inline struct prefix *
prefix_lock(struct prefix *p)
{
	if (p->flags & PREFIX_FLAG_LOCKED)
		fatalx("%s: locking locked prefix", __func__);
	p->flags |= PREFIX_FLAG_LOCKED;
	return p;
}

static inline struct prefix *
prefix_unlock(struct prefix *p)
{
	if ((p->flags & PREFIX_FLAG_LOCKED) == 0)
		fatalx("%s: unlocking unlocked prefix", __func__);
	p->flags &= ~PREFIX_FLAG_LOCKED;
	return p;
}

static inline int
prefix_is_locked(struct prefix *p)
{
	return (p->flags & PREFIX_FLAG_LOCKED) != 0;
}

static inline int
prefix_is_dead(struct prefix *p)
{
	return (p->flags & PREFIX_FLAG_DEAD) != 0;
}
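
/*
 * The helpers above guard the restart points of the asynchronous table
 * walks implemented by the rib_context code below: a rib_entry that a dump
 * is parked on is marked with re_lock() so rib_remove() will not free it,
 * and an Adj-RIB-Out prefix is marked with prefix_lock(). A locked prefix
 * that is withdrawn in the meantime is only flagged PREFIX_FLAG_DEAD by
 * prefix_adjout_destroy() and reaped once the dump moves past it.
 *
 * A rough sketch of how the dump code uses them (illustrative only):
 *
 *	ctx->ctx_re = re_lock(re);	<- park the walk on this entry
 *	...				<- return to the event loop
 *	re = re_unlock(ctx->ctx_re);	<- later: resume from this entry
 */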

static inline struct rib_tree *
rib_tree(struct rib *rib)
{
	return (&rib->tree);
}

static inline int
rib_compare(const struct rib_entry *a, const struct rib_entry *b)
{
	return (pt_prefix_cmp(a->prefix, b->prefix));
}

/* RIB specific functions */
struct rib *
rib_new(char *name, u_int rtableid, u_int16_t flags)
{
	struct rib	*new;
	u_int16_t	 id;

	for (id = 0; id < rib_size; id++) {
		if (ribs[id] == NULL)
			break;
	}

	if (id >= rib_size) {
		if ((ribs = recallocarray(ribs, id, id + 8,
		    sizeof(struct rib *))) == NULL)
			fatal(NULL);
		rib_size = id + 8;
	}

	if ((new = calloc(1, sizeof(*new))) == NULL)
		fatal(NULL);

	strlcpy(new->name, name, sizeof(new->name));
	RB_INIT(rib_tree(new));
	new->state = RECONF_REINIT;
	new->id = id;
	new->flags = flags;
	new->rtableid = rtableid;

	new->in_rules = calloc(1, sizeof(struct filter_head));
	if (new->in_rules == NULL)
		fatal(NULL);
	TAILQ_INIT(new->in_rules);

	ribs[id] = new;

	log_debug("%s: %s -> %u", __func__, name, id);
	return (new);
}

/*
 * This function is only called when the FIB information of a RIB changed.
 * It will flush the FIB if there was one previously and change the fibstate
 * from RECONF_NONE (nothing to do) to either RECONF_RELOAD (reload the FIB)
 * or RECONF_REINIT (rerun the route decision process for every element)
 * depending on the new flags.
 */
void
rib_update(struct rib *rib)
{
	/* flush fib first if there was one */
	if ((rib->flags & (F_RIB_NOFIB | F_RIB_NOEVALUATE)) == 0)
		rde_send_kroute_flush(rib);

	/* if the noevaluate flag changed then a full reinit is needed */
	if ((rib->flags & F_RIB_NOEVALUATE) !=
	    (rib->flags_tmp & F_RIB_NOEVALUATE))
		rib->fibstate = RECONF_REINIT;

	rib->flags = rib->flags_tmp;
	rib->rtableid = rib->rtableid_tmp;

	/* reload fib if there is no reinit pending and there will be a fib */
	if (rib->fibstate != RECONF_REINIT &&
	    (rib->flags & (F_RIB_NOFIB | F_RIB_NOEVALUATE)) == 0)
		rib->fibstate = RECONF_RELOAD;
}

struct rib *
rib_byid(u_int16_t id)
{
	if (id == RIB_NOTFOUND || id >= rib_size || ribs[id] == NULL)
		return NULL;
	return ribs[id];
}

u_int16_t
rib_find(char *name)
{
	u_int16_t id;

	/* no name returns the first Loc-RIB */
	if (name == NULL || *name == '\0')
		return RIB_LOC_START;

	for (id = 0; id < rib_size; id++) {
		if (ribs[id] != NULL && !strcmp(ribs[id]->name, name))
			return id;
	}

	return RIB_NOTFOUND;
}

void
rib_free(struct rib *rib)
{
	struct rib_entry *re, *xre;
	struct prefix *p;

	rib_dump_abort(rib->id);

	/*
	 * flush the rib, disable route evaluation and fib sync to speed up
	 * the prefix removal. Nothing depends on this data anymore.
	 */
	if ((rib->flags & (F_RIB_NOFIB | F_RIB_NOEVALUATE)) == 0)
		rde_send_kroute_flush(rib);
	rib->flags |= F_RIB_NOEVALUATE | F_RIB_NOFIB;

	for (re = RB_MIN(rib_tree, rib_tree(rib)); re != NULL; re = xre) {
		xre = RB_NEXT(rib_tree, rib_tree(rib), re);

		/*
		 * Removing the prefixes is tricky because the last one
		 * will remove the rib_entry as well and because we do
		 * an empty check in prefix_destroy() it is not possible to
		 * use the default for loop.
252 */ 253 while ((p = LIST_FIRST(&re->prefix_h))) { 254 struct rde_aspath *asp = prefix_aspath(p); 255 if (asp && asp->pftableid) 256 rde_pftable_del(asp->pftableid, p); 257 prefix_destroy(p); 258 } 259 } 260 if (rib->id <= RIB_LOC_START) 261 return; /* never remove the default ribs */ 262 filterlist_free(rib->in_rules_tmp); 263 filterlist_free(rib->in_rules); 264 ribs[rib->id] = NULL; 265 free(rib); 266 } 267 268 void 269 rib_shutdown(void) 270 { 271 struct rib *rib; 272 u_int16_t id; 273 274 for (id = 0; id < rib_size; id++) { 275 rib = rib_byid(id); 276 if (rib == NULL) 277 continue; 278 if (!RB_EMPTY(rib_tree(ribs[id]))) { 279 log_warnx("%s: rib %s is not empty", __func__, 280 ribs[id]->name); 281 } 282 rib_free(ribs[id]); 283 } 284 for (id = 0; id <= RIB_LOC_START; id++) { 285 rib = rib_byid(id); 286 if (rib == NULL) 287 continue; 288 filterlist_free(rib->in_rules_tmp); 289 filterlist_free(rib->in_rules); 290 ribs[id] = NULL; 291 free(rib); 292 } 293 free(ribs); 294 } 295 296 struct rib_entry * 297 rib_get(struct rib *rib, struct bgpd_addr *prefix, int prefixlen) 298 { 299 struct rib_entry xre, *re; 300 struct pt_entry *pte; 301 302 pte = pt_fill(prefix, prefixlen); 303 memset(&xre, 0, sizeof(xre)); 304 xre.prefix = pte; 305 306 re = RB_FIND(rib_tree, rib_tree(rib), &xre); 307 if (re && re->rib_id != rib->id) 308 fatalx("%s: Unexpected RIB %u != %u.", __func__, 309 re->rib_id, rib->id); 310 return re; 311 } 312 313 struct rib_entry * 314 rib_match(struct rib *rib, struct bgpd_addr *addr) 315 { 316 struct rib_entry *re; 317 int i; 318 319 switch (addr->aid) { 320 case AID_INET: 321 case AID_VPN_IPv4: 322 for (i = 32; i >= 0; i--) { 323 re = rib_get(rib, addr, i); 324 if (re != NULL) 325 return (re); 326 } 327 break; 328 case AID_INET6: 329 case AID_VPN_IPv6: 330 for (i = 128; i >= 0; i--) { 331 re = rib_get(rib, addr, i); 332 if (re != NULL) 333 return (re); 334 } 335 break; 336 default: 337 fatalx("%s: unknown af", __func__); 338 } 339 return (NULL); 340 } 341 342 343 struct rib_entry * 344 rib_add(struct rib *rib, struct bgpd_addr *prefix, int prefixlen) 345 { 346 struct pt_entry *pte; 347 struct rib_entry *re; 348 349 pte = pt_get(prefix, prefixlen); 350 if (pte == NULL) 351 pte = pt_add(prefix, prefixlen); 352 353 if ((re = calloc(1, sizeof(*re))) == NULL) 354 fatal("rib_add"); 355 356 LIST_INIT(&re->prefix_h); 357 re->prefix = pt_ref(pte); 358 re->rib_id = rib->id; 359 360 if (RB_INSERT(rib_tree, rib_tree(rib), re) != NULL) { 361 log_warnx("rib_add: insert failed"); 362 free(re); 363 return (NULL); 364 } 365 366 367 rdemem.rib_cnt++; 368 369 return (re); 370 } 371 372 void 373 rib_remove(struct rib_entry *re) 374 { 375 if (!rib_empty(re)) 376 fatalx("rib_remove: entry not empty"); 377 378 if (re_is_locked(re)) 379 /* entry is locked, don't free it. 
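		 * A table walk is parked on this entry; rib_restart() or
		 * rib_dump_abort() will remove it once the walk continues
		 * or is torn down.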
*/ 380 return; 381 382 pt_unref(re->prefix); 383 384 if (RB_REMOVE(rib_tree, rib_tree(re_rib(re)), re) == NULL) 385 log_warnx("rib_remove: remove failed."); 386 387 free(re); 388 rdemem.rib_cnt--; 389 } 390 391 int 392 rib_empty(struct rib_entry *re) 393 { 394 return LIST_EMPTY(&re->prefix_h); 395 } 396 397 static struct rib_entry * 398 rib_restart(struct rib_context *ctx) 399 { 400 struct rib_entry *re; 401 402 re = re_unlock(ctx->ctx_re); 403 404 /* find first non empty element */ 405 while (re && rib_empty(re)) 406 re = RB_NEXT(rib_tree, unused, re); 407 408 /* free the previously locked rib element if empty */ 409 if (rib_empty(ctx->ctx_re)) 410 rib_remove(ctx->ctx_re); 411 ctx->ctx_re = NULL; 412 return (re); 413 } 414 415 static void 416 rib_dump_r(struct rib_context *ctx) 417 { 418 struct rib_entry *re, *next; 419 struct rib *rib; 420 unsigned int i; 421 422 rib = rib_byid(ctx->ctx_id); 423 if (rib == NULL) 424 fatalx("%s: rib id %u gone", __func__, ctx->ctx_id); 425 426 if (ctx->ctx_re == NULL) 427 re = RB_MIN(rib_tree, rib_tree(rib)); 428 else 429 re = rib_restart(ctx); 430 431 for (i = 0; re != NULL; re = next) { 432 next = RB_NEXT(rib_tree, unused, re); 433 if (re->rib_id != ctx->ctx_id) 434 fatalx("%s: Unexpected RIB %u != %u.", __func__, 435 re->rib_id, ctx->ctx_id); 436 if (ctx->ctx_aid != AID_UNSPEC && 437 ctx->ctx_aid != re->prefix->aid) 438 continue; 439 if (ctx->ctx_count && i++ >= ctx->ctx_count && 440 !re_is_locked(re)) { 441 /* store and lock last element */ 442 ctx->ctx_re = re_lock(re); 443 return; 444 } 445 ctx->ctx_rib_call(re, ctx->ctx_arg); 446 } 447 448 if (ctx->ctx_done) 449 ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid); 450 LIST_REMOVE(ctx, entry); 451 free(ctx); 452 } 453 454 int 455 rib_dump_pending(void) 456 { 457 struct rib_context *ctx; 458 459 /* return true if at least one context is not throttled */ 460 LIST_FOREACH(ctx, &rib_dumps, entry) { 461 if (ctx->ctx_throttle && ctx->ctx_throttle(ctx->ctx_arg)) 462 continue; 463 return 1; 464 } 465 return 0; 466 } 467 468 void 469 rib_dump_runner(void) 470 { 471 struct rib_context *ctx, *next; 472 473 LIST_FOREACH_SAFE(ctx, &rib_dumps, entry, next) { 474 if (ctx->ctx_throttle && ctx->ctx_throttle(ctx->ctx_arg)) 475 continue; 476 if (ctx->ctx_rib_call != NULL) 477 rib_dump_r(ctx); 478 else 479 prefix_dump_r(ctx); 480 } 481 } 482 483 static void 484 rib_dump_abort(u_int16_t id) 485 { 486 struct rib_context *ctx, *next; 487 488 LIST_FOREACH_SAFE(ctx, &rib_dumps, entry, next) { 489 if (id != ctx->ctx_id) 490 continue; 491 if (ctx->ctx_done) 492 ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid); 493 if (ctx->ctx_re && rib_empty(re_unlock(ctx->ctx_re))) 494 rib_remove(ctx->ctx_re); 495 if (ctx->ctx_p && prefix_is_dead(prefix_unlock(ctx->ctx_p))) 496 prefix_adjout_destroy(ctx->ctx_p); 497 LIST_REMOVE(ctx, entry); 498 free(ctx); 499 } 500 } 501 502 void 503 rib_dump_terminate(void *arg) 504 { 505 struct rib_context *ctx, *next; 506 507 LIST_FOREACH_SAFE(ctx, &rib_dumps, entry, next) { 508 if (ctx->ctx_arg != arg) 509 continue; 510 if (ctx->ctx_done) 511 ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid); 512 if (ctx->ctx_re && rib_empty(re_unlock(ctx->ctx_re))) 513 rib_remove(ctx->ctx_re); 514 if (ctx->ctx_p && prefix_is_dead(prefix_unlock(ctx->ctx_p))) 515 prefix_adjout_destroy(ctx->ctx_p); 516 LIST_REMOVE(ctx, entry); 517 free(ctx); 518 } 519 } 520 521 int 522 rib_dump_new(u_int16_t id, u_int8_t aid, unsigned int count, void *arg, 523 void (*upcall)(struct rib_entry *, void *), void (*done)(void *, u_int8_t), 524 int (*throttle)(void *)) 
525 { 526 struct rib_context *ctx; 527 528 if ((ctx = calloc(1, sizeof(*ctx))) == NULL) 529 return -1; 530 ctx->ctx_id = id; 531 ctx->ctx_aid = aid; 532 ctx->ctx_count = count; 533 ctx->ctx_arg = arg; 534 ctx->ctx_rib_call = upcall; 535 ctx->ctx_done = done; 536 ctx->ctx_throttle = throttle; 537 538 LIST_INSERT_HEAD(&rib_dumps, ctx, entry); 539 540 /* requested a sync traversal */ 541 if (count == 0) 542 rib_dump_r(ctx); 543 544 return 0; 545 } 546 547 /* path specific functions */ 548 549 static struct rde_aspath *path_lookup(struct rde_aspath *); 550 static u_int64_t path_hash(struct rde_aspath *); 551 static void path_link(struct rde_aspath *); 552 static void path_unlink(struct rde_aspath *); 553 554 struct path_table { 555 struct aspath_head *path_hashtbl; 556 u_int64_t path_hashmask; 557 } pathtable; 558 559 SIPHASH_KEY pathtablekey; 560 561 #define PATH_HASH(x) &pathtable.path_hashtbl[x & pathtable.path_hashmask] 562 563 static inline struct rde_aspath * 564 path_ref(struct rde_aspath *asp) 565 { 566 if ((asp->flags & F_ATTR_LINKED) == 0) 567 fatalx("%s: unlinked object", __func__); 568 asp->refcnt++; 569 rdemem.path_refs++; 570 571 return asp; 572 } 573 574 static inline void 575 path_unref(struct rde_aspath *asp) 576 { 577 if (asp == NULL) 578 return; 579 if ((asp->flags & F_ATTR_LINKED) == 0) 580 fatalx("%s: unlinked object", __func__); 581 asp->refcnt--; 582 rdemem.path_refs--; 583 if (asp->refcnt <= 0) 584 path_unlink(asp); 585 } 586 587 void 588 path_init(u_int32_t hashsize) 589 { 590 u_int32_t hs, i; 591 592 for (hs = 1; hs < hashsize; hs <<= 1) 593 ; 594 pathtable.path_hashtbl = calloc(hs, sizeof(*pathtable.path_hashtbl)); 595 if (pathtable.path_hashtbl == NULL) 596 fatal("path_init"); 597 598 for (i = 0; i < hs; i++) 599 LIST_INIT(&pathtable.path_hashtbl[i]); 600 601 pathtable.path_hashmask = hs - 1; 602 arc4random_buf(&pathtablekey, sizeof(pathtablekey)); 603 } 604 605 void 606 path_shutdown(void) 607 { 608 u_int32_t i; 609 610 for (i = 0; i <= pathtable.path_hashmask; i++) 611 if (!LIST_EMPTY(&pathtable.path_hashtbl[i])) 612 log_warnx("path_free: free non-free table"); 613 614 free(pathtable.path_hashtbl); 615 } 616 617 void 618 path_hash_stats(struct rde_hashstats *hs) 619 { 620 struct rde_aspath *a; 621 u_int32_t i; 622 int64_t n; 623 624 memset(hs, 0, sizeof(*hs)); 625 strlcpy(hs->name, "path hash", sizeof(hs->name)); 626 hs->min = LLONG_MAX; 627 hs->num = pathtable.path_hashmask + 1; 628 629 for (i = 0; i <= pathtable.path_hashmask; i++) { 630 n = 0; 631 LIST_FOREACH(a, &pathtable.path_hashtbl[i], path_l) 632 n++; 633 if (n < hs->min) 634 hs->min = n; 635 if (n > hs->max) 636 hs->max = n; 637 hs->sum += n; 638 hs->sumq += n * n; 639 } 640 } 641 642 int 643 path_compare(struct rde_aspath *a, struct rde_aspath *b) 644 { 645 int r; 646 647 if (a == NULL && b == NULL) 648 return (0); 649 else if (b == NULL) 650 return (1); 651 else if (a == NULL) 652 return (-1); 653 if ((a->flags & ~F_ATTR_LINKED) > (b->flags & ~F_ATTR_LINKED)) 654 return (1); 655 if ((a->flags & ~F_ATTR_LINKED) < (b->flags & ~F_ATTR_LINKED)) 656 return (-1); 657 if (a->origin > b->origin) 658 return (1); 659 if (a->origin < b->origin) 660 return (-1); 661 if (a->med > b->med) 662 return (1); 663 if (a->med < b->med) 664 return (-1); 665 if (a->lpref > b->lpref) 666 return (1); 667 if (a->lpref < b->lpref) 668 return (-1); 669 if (a->weight > b->weight) 670 return (1); 671 if (a->weight < b->weight) 672 return (-1); 673 if (a->rtlabelid > b->rtlabelid) 674 return (1); 675 if (a->rtlabelid < b->rtlabelid) 
		return (-1);
	if (a->pftableid > b->pftableid)
		return (1);
	if (a->pftableid < b->pftableid)
		return (-1);

	r = aspath_compare(a->aspath, b->aspath);
	if (r > 0)
		return (1);
	if (r < 0)
		return (-1);

	return (attr_compare(a, b));
}

static u_int64_t
path_hash(struct rde_aspath *asp)
{
	SIPHASH_CTX	ctx;
	u_int64_t	hash;

	SipHash24_Init(&ctx, &pathtablekey);
	SipHash24_Update(&ctx, &asp->aspath_hashstart,
	    (char *)&asp->aspath_hashend - (char *)&asp->aspath_hashstart);

	if (asp->aspath)
		SipHash24_Update(&ctx, asp->aspath->data, asp->aspath->len);

	hash = attr_hash(asp);
	SipHash24_Update(&ctx, &hash, sizeof(hash));

	return (SipHash24_End(&ctx));
}

static struct rde_aspath *
path_lookup(struct rde_aspath *aspath)
{
	struct aspath_head	*head;
	struct rde_aspath	*asp;
	u_int64_t		 hash;

	hash = path_hash(aspath);
	head = PATH_HASH(hash);

	LIST_FOREACH(asp, head, path_l) {
		if (asp->hash == hash && path_compare(aspath, asp) == 0)
			return (asp);
	}
	return (NULL);
}

/*
 * Link this aspath into the global hash table.
 * The asp must have been allocated with path_get.
 */
static void
path_link(struct rde_aspath *asp)
{
	struct aspath_head	*head;

	asp->hash = path_hash(asp);
	head = PATH_HASH(asp->hash);

	LIST_INSERT_HEAD(head, asp, path_l);
	asp->flags |= F_ATTR_LINKED;
}

/*
 * This function can only be called when all prefixes have been removed first.
 * Normally this happens directly out of the prefix removal functions.
 */
static void
path_unlink(struct rde_aspath *asp)
{
	if (asp == NULL)
		return;

	/* make sure no reference is held for this rde_aspath */
	if (asp->refcnt != 0)
		fatalx("%s: still holds references", __func__);

	LIST_REMOVE(asp, path_l);
	asp->flags &= ~F_ATTR_LINKED;

	path_put(asp);
}

/*
 * Copy asp to a new UNLINKED aspath.
 * On dst either path_get() or path_prep() must have been called before.
 */
struct rde_aspath *
path_copy(struct rde_aspath *dst, const struct rde_aspath *src)
{
	dst->aspath = src->aspath;
	if (dst->aspath != NULL) {
		dst->aspath->refcnt++;
		rdemem.aspath_refs++;
	}
	dst->hash = 0;		/* not linked so no hash and no refcnt */
	dst->refcnt = 0;
	dst->flags = src->flags & ~F_ATTR_LINKED;

	dst->med = src->med;
	dst->lpref = src->lpref;
	dst->weight = src->weight;
	dst->rtlabelid = rtlabel_ref(src->rtlabelid);
	dst->pftableid = pftable_ref(src->pftableid);
	dst->origin = src->origin;

	attr_copy(dst, src);

	return (dst);
}

/* initialize or prepare an aspath for use */
struct rde_aspath *
path_prep(struct rde_aspath *asp)
{
	memset(asp, 0, sizeof(*asp));
	asp->origin = ORIGIN_INCOMPLETE;
	asp->lpref = DEFAULT_LPREF;

	return (asp);
}

/* alloc and initialize new entry. May not fail.
*/ 803 struct rde_aspath * 804 path_get(void) 805 { 806 struct rde_aspath *asp; 807 808 asp = malloc(sizeof(*asp)); 809 if (asp == NULL) 810 fatal("path_get"); 811 rdemem.path_cnt++; 812 813 return (path_prep(asp)); 814 } 815 816 /* clean up an asp after use (frees all references to sub-objects) */ 817 void 818 path_clean(struct rde_aspath *asp) 819 { 820 if (asp->flags & F_ATTR_LINKED) 821 fatalx("%s: linked object", __func__); 822 823 rtlabel_unref(asp->rtlabelid); 824 pftable_unref(asp->pftableid); 825 aspath_put(asp->aspath); 826 attr_freeall(asp); 827 } 828 829 /* free an unlinked element */ 830 void 831 path_put(struct rde_aspath *asp) 832 { 833 if (asp == NULL) 834 return; 835 836 path_clean(asp); 837 838 rdemem.path_cnt--; 839 free(asp); 840 } 841 842 /* prefix specific functions */ 843 844 static int prefix_add(struct bgpd_addr *, int, struct rib *, 845 struct rde_peer *, struct rde_aspath *, 846 struct rde_community *, struct nexthop *, 847 u_int8_t, u_int8_t); 848 static int prefix_move(struct prefix *, struct rde_peer *, 849 struct rde_aspath *, struct rde_community *, 850 struct nexthop *, u_int8_t, u_int8_t); 851 852 static void prefix_link(struct prefix *, struct rib_entry *, 853 struct rde_peer *, struct rde_aspath *, 854 struct rde_community *, struct nexthop *, 855 u_int8_t, u_int8_t); 856 static void prefix_unlink(struct prefix *); 857 858 static struct prefix *prefix_alloc(void); 859 static void prefix_free(struct prefix *); 860 861 /* RB tree comparison function */ 862 static inline int 863 prefix_cmp(struct prefix *a, struct prefix *b) 864 { 865 if (a->eor != b->eor) 866 return a->eor - b->eor; 867 /* if EOR marker no need to check the rest also a->eor == b->eor */ 868 if (a->eor) 869 return 0; 870 871 if (a->aspath != b->aspath) 872 return (a->aspath > b->aspath ? 1 : -1); 873 if (a->communities != b->communities) 874 return (a->communities > b->communities ? 1 : -1); 875 if (a->nexthop != b->nexthop) 876 return (a->nexthop > b->nexthop ? 1 : -1); 877 if (a->nhflags != b->nhflags) 878 return (a->nhflags > b->nhflags ? 1 : -1); 879 return pt_prefix_cmp(a->pt, b->pt); 880 } 881 882 static inline int 883 prefix_index_cmp(struct prefix *a, struct prefix *b) 884 { 885 return pt_prefix_cmp(a->pt, b->pt); 886 } 887 888 RB_GENERATE(prefix_tree, prefix, entry.tree.update, prefix_cmp) 889 RB_GENERATE_STATIC(prefix_index, prefix, entry.tree.index, prefix_index_cmp) 890 891 /* 892 * search for specified prefix of a peer. Returns NULL if not found. 893 */ 894 struct prefix * 895 prefix_get(struct rib *rib, struct rde_peer *peer, struct bgpd_addr *prefix, 896 int prefixlen) 897 { 898 struct rib_entry *re; 899 900 re = rib_get(rib, prefix, prefixlen); 901 if (re == NULL) 902 return (NULL); 903 return (prefix_bypeer(re, peer)); 904 } 905 906 /* 907 * lookup prefix in the peer prefix_index. Returns NULL if not found. 
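 * The prefix_index is the per-peer Adj-RIB-Out RB tree; unlike prefix_cmp()
 * it is keyed on the pt_entry alone (see prefix_index_cmp()), so a peer can
 * hold at most one Adj-RIB-Out entry per prefix.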
908 */ 909 struct prefix * 910 prefix_lookup(struct rde_peer *peer, struct bgpd_addr *prefix, 911 int prefixlen) 912 { 913 struct prefix xp; 914 struct pt_entry *pte; 915 916 memset(&xp, 0, sizeof(xp)); 917 pte = pt_fill(prefix, prefixlen); 918 xp.pt = pte; 919 920 return RB_FIND(prefix_index, &peer->adj_rib_out, &xp); 921 } 922 923 struct prefix * 924 prefix_match(struct rde_peer *peer, struct bgpd_addr *addr) 925 { 926 struct prefix *p; 927 int i; 928 929 switch (addr->aid) { 930 case AID_INET: 931 case AID_VPN_IPv4: 932 for (i = 32; i >= 0; i--) { 933 p = prefix_lookup(peer, addr, i); 934 if (p != NULL) 935 return p; 936 } 937 break; 938 case AID_INET6: 939 case AID_VPN_IPv6: 940 for (i = 128; i >= 0; i--) { 941 p = prefix_lookup(peer, addr, i); 942 if (p != NULL) 943 return p; 944 } 945 break; 946 default: 947 fatalx("%s: unknown af", __func__); 948 } 949 return NULL; 950 } 951 952 /* 953 * Update a prefix. 954 * Return 1 if prefix was newly added, 0 if it was just changed. 955 */ 956 int 957 prefix_update(struct rib *rib, struct rde_peer *peer, struct filterstate *state, 958 struct bgpd_addr *prefix, int prefixlen, u_int8_t vstate) 959 { 960 struct rde_aspath *asp, *nasp = &state->aspath; 961 struct rde_community *comm, *ncomm = &state->communities; 962 struct prefix *p; 963 964 /* 965 * First try to find a prefix in the specified RIB. 966 */ 967 if ((p = prefix_get(rib, peer, prefix, prefixlen)) != NULL) { 968 if (prefix_nexthop(p) == state->nexthop && 969 prefix_nhflags(p) == state->nhflags && 970 communities_equal(ncomm, prefix_communities(p)) && 971 path_compare(nasp, prefix_aspath(p)) == 0) { 972 /* no change, update last change */ 973 p->lastchange = getmonotime(); 974 p->validation_state = vstate; 975 return (0); 976 } 977 } 978 979 /* 980 * Either the prefix does not exist or the path changed. 981 * In both cases lookup the new aspath to make sure it is not 982 * already in the RIB. 983 */ 984 if ((asp = path_lookup(nasp)) == NULL) { 985 /* Path not available, create and link a new one. */ 986 asp = path_copy(path_get(), nasp); 987 path_link(asp); 988 } 989 990 if ((comm = communities_lookup(ncomm)) == NULL) { 991 /* Communities not available, create and link a new one. */ 992 comm = communities_link(ncomm); 993 } 994 995 /* If the prefix was found move it else add it to the RIB. */ 996 if (p != NULL) 997 return (prefix_move(p, peer, asp, comm, state->nexthop, 998 state->nhflags, vstate)); 999 else 1000 return (prefix_add(prefix, prefixlen, rib, peer, asp, comm, 1001 state->nexthop, state->nhflags, vstate)); 1002 } 1003 1004 /* 1005 * Adds or updates a prefix. 1006 */ 1007 static int 1008 prefix_add(struct bgpd_addr *prefix, int prefixlen, struct rib *rib, 1009 struct rde_peer *peer, struct rde_aspath *asp, struct rde_community *comm, 1010 struct nexthop *nexthop, u_int8_t nhflags, u_int8_t vstate) 1011 { 1012 struct prefix *p; 1013 struct rib_entry *re; 1014 1015 re = rib_get(rib, prefix, prefixlen); 1016 if (re == NULL) 1017 re = rib_add(rib, prefix, prefixlen); 1018 1019 p = prefix_alloc(); 1020 prefix_link(p, re, peer, asp, comm, nexthop, nhflags, vstate); 1021 return (1); 1022 } 1023 1024 /* 1025 * Move the prefix to the specified as path, removes the old asp if needed. 
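 * The move is done by linking a new prefix node with the new attributes,
 * letting prefix_evaluate() swap it in as the replacement, and then
 * unlinking and freeing the old node. Peer and pt_entry stay the same, so
 * no per-peer prefix counters need to be touched.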
1026 */ 1027 static int 1028 prefix_move(struct prefix *p, struct rde_peer *peer, 1029 struct rde_aspath *asp, struct rde_community *comm, 1030 struct nexthop *nexthop, u_int8_t nhflags, u_int8_t vstate) 1031 { 1032 struct prefix *np; 1033 1034 if (peer != prefix_peer(p)) 1035 fatalx("prefix_move: cross peer move"); 1036 1037 /* add new prefix node */ 1038 np = prefix_alloc(); 1039 /* add reference to new AS path and communities */ 1040 np->aspath = path_ref(asp); 1041 np->communities = communities_ref(comm); 1042 np->peer = peer; 1043 np->re = p->re; 1044 np->pt = p->pt; /* skip refcnt update since ref is moved */ 1045 np->validation_state = vstate; 1046 np->nhflags = nhflags; 1047 np->nexthop = nexthop_ref(nexthop); 1048 nexthop_link(np); 1049 np->lastchange = getmonotime(); 1050 1051 /* add possible pftable reference from new aspath */ 1052 if (asp && asp->pftableid) 1053 rde_pftable_add(asp->pftableid, np); 1054 1055 /* 1056 * no need to update the peer prefix count because we are only moving 1057 * the prefix without changing the peer. 1058 */ 1059 prefix_evaluate(np->re, np, p); 1060 1061 /* remove possible pftable reference from old path first */ 1062 if (p->aspath && p->aspath->pftableid) 1063 rde_pftable_del(p->aspath->pftableid, p); 1064 1065 /* remove old prefix node */ 1066 nexthop_unlink(p); 1067 nexthop_unref(p->nexthop); 1068 communities_unref(p->communities); 1069 path_unref(p->aspath); 1070 p->communities = NULL; 1071 p->nexthop = NULL; 1072 p->aspath = NULL; 1073 p->peer = NULL; 1074 p->re = NULL; 1075 p->pt = NULL; 1076 prefix_free(p); 1077 1078 return (0); 1079 } 1080 1081 /* 1082 * Removes a prefix from the specified RIB. If the parent objects -- rib_entry 1083 * or pt_entry -- become empty remove them too. 1084 */ 1085 int 1086 prefix_withdraw(struct rib *rib, struct rde_peer *peer, 1087 struct bgpd_addr *prefix, int prefixlen) 1088 { 1089 struct prefix *p; 1090 struct rde_aspath *asp; 1091 1092 p = prefix_get(rib, peer, prefix, prefixlen); 1093 if (p == NULL) /* Got a dummy withdrawn request. */ 1094 return (0); 1095 1096 asp = prefix_aspath(p); 1097 if (asp && asp->pftableid) 1098 /* only prefixes in the local RIB were pushed into pf */ 1099 rde_pftable_del(asp->pftableid, p); 1100 1101 prefix_destroy(p); 1102 1103 return (1); 1104 } 1105 1106 /* 1107 * Insert an End-of-RIB marker into the update queue. 1108 */ 1109 void 1110 prefix_add_eor(struct rde_peer *peer, u_int8_t aid) 1111 { 1112 struct prefix *p; 1113 1114 p = prefix_alloc(); 1115 p->eor = 1; 1116 p->flags = PREFIX_FLAG_UPDATE; 1117 if (RB_INSERT(prefix_tree, &peer->updates[aid], p) != NULL) 1118 /* no need to add if EoR marker already present */ 1119 prefix_free(p); 1120 /* EOR marker is not inserted into the adj_rib_out index */ 1121 } 1122 1123 /* 1124 * Put a prefix from the Adj-RIB-Out onto the update queue. 
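 * A prefix sitting on the update queue carries PREFIX_FLAG_UPDATE and is
 * linked into peer->updates[aid]; a pending withdraw carries
 * PREFIX_FLAG_WITHDRAW and sits in peer->withdraws[aid]. The function below
 * moves the prefix between those trees as needed before requeueing it.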
1125 */ 1126 int 1127 prefix_adjout_update(struct rde_peer *peer, struct filterstate *state, 1128 struct bgpd_addr *prefix, int prefixlen, u_int8_t vstate) 1129 { 1130 struct prefix_tree *prefix_head = NULL; 1131 struct rde_aspath *asp; 1132 struct rde_community *comm; 1133 struct prefix *p; 1134 int created = 0; 1135 1136 if ((p = prefix_lookup(peer, prefix, prefixlen)) != NULL) { 1137 /* prefix is already in the Adj-RIB-Out */ 1138 if (p->flags & PREFIX_FLAG_WITHDRAW) { 1139 created = 1; /* consider this a new entry */ 1140 peer->up_wcnt--; 1141 prefix_head = &peer->withdraws[prefix->aid]; 1142 RB_REMOVE(prefix_tree, prefix_head, p); 1143 } else if (p->flags & PREFIX_FLAG_DEAD) { 1144 created = 1; /* consider this a new entry */ 1145 } else { 1146 if (prefix_nhflags(p) == state->nhflags && 1147 prefix_nexthop(p) == state->nexthop && 1148 communities_equal(&state->communities, 1149 prefix_communities(p)) && 1150 path_compare(&state->aspath, prefix_aspath(p)) == 1151 0) { 1152 /* nothing changed */ 1153 p->validation_state = vstate; 1154 p->lastchange = getmonotime(); 1155 p->flags &= ~PREFIX_FLAG_STALE; 1156 return 0; 1157 } 1158 1159 if (p->flags & PREFIX_FLAG_UPDATE) { 1160 /* created = 0 so up_nlricnt is not increased */ 1161 prefix_head = &peer->updates[prefix->aid]; 1162 RB_REMOVE(prefix_tree, prefix_head, p); 1163 } 1164 } 1165 /* unlink from aspath and remove nexthop ref */ 1166 nexthop_unref(p->nexthop); 1167 communities_unref(p->communities); 1168 path_unref(p->aspath); 1169 p->flags &= ~PREFIX_FLAG_MASK; 1170 1171 /* peer and pt remain */ 1172 } else { 1173 p = prefix_alloc(); 1174 created = 1; 1175 1176 p->pt = pt_get(prefix, prefixlen); 1177 if (p->pt == NULL) 1178 p->pt = pt_add(prefix, prefixlen); 1179 pt_ref(p->pt); 1180 p->peer = peer; 1181 1182 if (RB_INSERT(prefix_index, &peer->adj_rib_out, p) != NULL) 1183 fatalx("%s: RB index invariant violated", __func__); 1184 } 1185 1186 if ((asp = path_lookup(&state->aspath)) == NULL) { 1187 /* Path not available, create and link a new one. */ 1188 asp = path_copy(path_get(), &state->aspath); 1189 path_link(asp); 1190 } 1191 1192 if ((comm = communities_lookup(&state->communities)) == NULL) { 1193 /* Communities not available, create and link a new one. */ 1194 comm = communities_link(&state->communities); 1195 } 1196 1197 p->aspath = path_ref(asp); 1198 p->communities = communities_ref(comm); 1199 p->nexthop = nexthop_ref(state->nexthop); 1200 p->nhflags = state->nhflags; 1201 1202 p->validation_state = vstate; 1203 p->lastchange = getmonotime(); 1204 1205 if (p->flags & PREFIX_FLAG_MASK) 1206 fatalx("%s: bad flags %x", __func__, p->flags); 1207 p->flags |= PREFIX_FLAG_UPDATE; 1208 if (RB_INSERT(prefix_tree, &peer->updates[prefix->aid], p) != NULL) 1209 fatalx("%s: RB tree invariant violated", __func__); 1210 1211 return created; 1212 } 1213 1214 /* 1215 * Withdraw a prefix from the Adj-RIB-Out, this unlinks the aspath but leaves 1216 * the prefix in the RIB linked to the peer withdraw list. 1217 */ 1218 int 1219 prefix_adjout_withdraw(struct rde_peer *peer, struct bgpd_addr *prefix, 1220 int prefixlen) 1221 { 1222 struct prefix *p; 1223 1224 p = prefix_lookup(peer, prefix, prefixlen); 1225 if (p == NULL) /* Got a dummy withdrawn request. 
*/
		return (0);

	/* already a withdraw, shortcut */
	if (p->flags & PREFIX_FLAG_WITHDRAW) {
		p->lastchange = getmonotime();
		p->flags &= ~PREFIX_FLAG_STALE;
		return (0);
	}
	/* pending update just got withdrawn */
	if (p->flags & PREFIX_FLAG_UPDATE)
		RB_REMOVE(prefix_tree, &peer->updates[p->pt->aid], p);
	/* nothing needs to be done for PREFIX_FLAG_DEAD and STALE */
	p->flags &= ~PREFIX_FLAG_MASK;

	/* remove nexthop ref ... */
	nexthop_unref(p->nexthop);
	p->nexthop = NULL;
	p->nhflags = 0;

	/* ... unlink from aspath ... */
	path_unref(p->aspath);
	p->aspath = NULL;

	/* ... and communities */
	communities_unref(p->communities);
	p->communities = NULL;
	/* re already NULL */

	p->lastchange = getmonotime();

	p->flags |= PREFIX_FLAG_WITHDRAW;
	if (RB_INSERT(prefix_tree, &peer->withdraws[prefix->aid], p) != NULL)
		fatalx("%s: RB tree invariant violated", __func__);
	return (1);
}

static struct prefix *
prefix_restart(struct rib_context *ctx)
{
	struct prefix *p;

	p = prefix_unlock(ctx->ctx_p);

	if (prefix_is_dead(p)) {
		struct prefix *next;

		next = RB_NEXT(prefix_index, unused, p);
		prefix_adjout_destroy(p);
		p = next;
	}
	ctx->ctx_p = NULL;
	return p;
}

void
prefix_adjout_destroy(struct prefix *p)
{
	struct rde_peer *peer = prefix_peer(p);

	if (p->eor) {
		/* EOR marker is not linked in the index */
		prefix_free(p);
		return;
	}

	if (p->flags & PREFIX_FLAG_WITHDRAW)
		RB_REMOVE(prefix_tree, &peer->withdraws[p->pt->aid], p);
	else if (p->flags & PREFIX_FLAG_UPDATE)
		RB_REMOVE(prefix_tree, &peer->updates[p->pt->aid], p);
	/* nothing needs to be done for PREFIX_FLAG_DEAD and STALE */
	p->flags &= ~PREFIX_FLAG_MASK;

	if (prefix_is_locked(p)) {
		/* remove nexthop ref ... */
		nexthop_unref(p->nexthop);
		p->nexthop = NULL;
		/* ... communities ...
*/ 1307 communities_unref(p->communities); 1308 p->communities = NULL; 1309 /* and unlink from aspath */ 1310 path_unref(p->aspath); 1311 p->aspath = NULL; 1312 p->nhflags = 0; 1313 /* re already NULL */ 1314 1315 /* finally mark prefix dead */ 1316 p->flags |= PREFIX_FLAG_DEAD; 1317 return; 1318 } 1319 1320 RB_REMOVE(prefix_index, &peer->adj_rib_out, p); 1321 1322 prefix_unlink(p); 1323 prefix_free(p); 1324 } 1325 1326 static void 1327 prefix_dump_r(struct rib_context *ctx) 1328 { 1329 struct prefix *p, *next; 1330 struct rde_peer *peer; 1331 unsigned int i; 1332 1333 if ((peer = peer_get(ctx->ctx_id)) == NULL) 1334 goto done; 1335 1336 if (ctx->ctx_p == NULL) 1337 p = RB_MIN(prefix_index, &peer->adj_rib_out); 1338 else 1339 p = prefix_restart(ctx); 1340 1341 for (i = 0; p != NULL; p = next) { 1342 next = RB_NEXT(prefix_index, unused, p); 1343 if (prefix_is_dead(p)) 1344 continue; 1345 if (ctx->ctx_aid != AID_UNSPEC && 1346 ctx->ctx_aid != p->pt->aid) 1347 continue; 1348 if (ctx->ctx_count && i++ >= ctx->ctx_count && 1349 !prefix_is_locked(p)) { 1350 /* store and lock last element */ 1351 ctx->ctx_p = prefix_lock(p); 1352 return; 1353 } 1354 ctx->ctx_prefix_call(p, ctx->ctx_arg); 1355 } 1356 1357 done: 1358 if (ctx->ctx_done) 1359 ctx->ctx_done(ctx->ctx_arg, ctx->ctx_aid); 1360 LIST_REMOVE(ctx, entry); 1361 free(ctx); 1362 } 1363 1364 int 1365 prefix_dump_new(struct rde_peer *peer, u_int8_t aid, unsigned int count, 1366 void *arg, void (*upcall)(struct prefix *, void *), 1367 void (*done)(void *, u_int8_t), int (*throttle)(void *)) 1368 { 1369 struct rib_context *ctx; 1370 1371 if ((ctx = calloc(1, sizeof(*ctx))) == NULL) 1372 return -1; 1373 ctx->ctx_id = peer->conf.id; 1374 ctx->ctx_aid = aid; 1375 ctx->ctx_count = count; 1376 ctx->ctx_arg = arg; 1377 ctx->ctx_prefix_call = upcall; 1378 ctx->ctx_done = done; 1379 ctx->ctx_throttle = throttle; 1380 1381 LIST_INSERT_HEAD(&rib_dumps, ctx, entry); 1382 1383 /* requested a sync traversal */ 1384 if (count == 0) 1385 prefix_dump_r(ctx); 1386 1387 return 0; 1388 } 1389 1390 /* dump a prefix into specified buffer */ 1391 int 1392 prefix_write(u_char *buf, int len, struct bgpd_addr *prefix, u_int8_t plen, 1393 int withdraw) 1394 { 1395 int totlen, psize; 1396 1397 switch (prefix->aid) { 1398 case AID_INET: 1399 case AID_INET6: 1400 totlen = PREFIX_SIZE(plen); 1401 1402 if (totlen > len) 1403 return (-1); 1404 *buf++ = plen; 1405 memcpy(buf, &prefix->ba, totlen - 1); 1406 return (totlen); 1407 case AID_VPN_IPv4: 1408 case AID_VPN_IPv6: 1409 totlen = PREFIX_SIZE(plen) + sizeof(prefix->rd); 1410 psize = PREFIX_SIZE(plen) - 1; 1411 plen += sizeof(prefix->rd) * 8; 1412 if (withdraw) { 1413 /* withdraw have one compat label as placeholder */ 1414 totlen += 3; 1415 plen += 3 * 8; 1416 } else { 1417 totlen += prefix->labellen; 1418 plen += prefix->labellen * 8; 1419 } 1420 1421 if (totlen > len) 1422 return (-1); 1423 *buf++ = plen; 1424 if (withdraw) { 1425 /* magic compatibility label as per rfc8277 */ 1426 *buf++ = 0x80; 1427 *buf++ = 0x0; 1428 *buf++ = 0x0; 1429 } else { 1430 memcpy(buf, &prefix->labelstack, 1431 prefix->labellen); 1432 buf += prefix->labellen; 1433 } 1434 memcpy(buf, &prefix->rd, sizeof(prefix->rd)); 1435 buf += sizeof(prefix->rd); 1436 memcpy(buf, &prefix->ba, psize); 1437 return (totlen); 1438 default: 1439 return (-1); 1440 } 1441 } 1442 1443 int 1444 prefix_writebuf(struct ibuf *buf, struct bgpd_addr *prefix, u_int8_t plen) 1445 { 1446 int totlen; 1447 void *bptr; 1448 1449 switch (prefix->aid) { 1450 case AID_INET: 1451 case 
AID_INET6: 1452 totlen = PREFIX_SIZE(plen); 1453 break; 1454 case AID_VPN_IPv4: 1455 case AID_VPN_IPv6: 1456 totlen = PREFIX_SIZE(plen) + sizeof(prefix->rd) + 1457 prefix->labellen; 1458 break; 1459 default: 1460 return (-1); 1461 } 1462 1463 if ((bptr = ibuf_reserve(buf, totlen)) == NULL) 1464 return (-1); 1465 if (prefix_write(bptr, totlen, prefix, plen, 0) == -1) 1466 return (-1); 1467 return (0); 1468 } 1469 1470 /* 1471 * Searches in the prefix list of specified rib_entry for a prefix entry 1472 * belonging to the peer peer. Returns NULL if no match found. 1473 */ 1474 struct prefix * 1475 prefix_bypeer(struct rib_entry *re, struct rde_peer *peer) 1476 { 1477 struct prefix *p; 1478 1479 LIST_FOREACH(p, &re->prefix_h, entry.list.rib) 1480 if (prefix_peer(p) == peer) 1481 return (p); 1482 return (NULL); 1483 } 1484 1485 static void 1486 prefix_evaluate_all(struct prefix *p, enum nexthop_state state, 1487 enum nexthop_state oldstate) 1488 { 1489 /* Skip non local-RIBs or RIBs that are flagged as noeval. */ 1490 if (re_rib(p->re)->flags & F_RIB_NOEVALUATE) { 1491 log_warnx("%s: prefix with F_RIB_NOEVALUATE hit", __func__); 1492 return; 1493 } 1494 1495 if (oldstate == state) { 1496 /* 1497 * The state of the nexthop did not change. The only 1498 * thing that may have changed is the true_nexthop 1499 * or other internal infos. This will not change 1500 * the routing decision so shortcut here. 1501 */ 1502 if (state == NEXTHOP_REACH) { 1503 if ((re_rib(p->re)->flags & F_RIB_NOFIB) == 0 && 1504 p == p->re->active) 1505 rde_send_kroute(re_rib(p->re), p, NULL); 1506 } 1507 return; 1508 } 1509 1510 /* redo the route decision */ 1511 prefix_evaluate(p->re, p, p); 1512 } 1513 1514 /* kill a prefix. */ 1515 void 1516 prefix_destroy(struct prefix *p) 1517 { 1518 /* make route decision */ 1519 prefix_evaluate(p->re, NULL, p); 1520 1521 prefix_unlink(p); 1522 prefix_free(p); 1523 } 1524 1525 /* 1526 * Link a prefix into the different parent objects. 1527 */ 1528 static void 1529 prefix_link(struct prefix *p, struct rib_entry *re, struct rde_peer *peer, 1530 struct rde_aspath *asp, struct rde_community *comm, 1531 struct nexthop *nexthop, u_int8_t nhflags, u_int8_t vstate) 1532 { 1533 p->aspath = path_ref(asp); 1534 p->communities = communities_ref(comm); 1535 p->peer = peer; 1536 p->pt = pt_ref(re->prefix); 1537 p->re = re; 1538 p->validation_state = vstate; 1539 p->nhflags = nhflags; 1540 p->nexthop = nexthop_ref(nexthop); 1541 nexthop_link(p); 1542 p->lastchange = getmonotime(); 1543 1544 /* add possible pftable reference from aspath */ 1545 if (asp && asp->pftableid) 1546 rde_pftable_add(asp->pftableid, p); 1547 1548 /* make route decision */ 1549 prefix_evaluate(re, p, NULL); 1550 } 1551 1552 /* 1553 * Unlink a prefix from the different parent objects. 1554 */ 1555 static void 1556 prefix_unlink(struct prefix *p) 1557 { 1558 struct rib_entry *re = p->re; 1559 1560 /* destroy all references to other objects */ 1561 nexthop_unlink(p); 1562 nexthop_unref(p->nexthop); 1563 communities_unref(p->communities); 1564 path_unref(p->aspath); 1565 pt_unref(p->pt); 1566 p->communities = NULL; 1567 p->nexthop = NULL; 1568 p->aspath = NULL; 1569 p->peer = NULL; 1570 p->re = NULL; 1571 p->pt = NULL; 1572 1573 if (re && rib_empty(re)) 1574 rib_remove(re); 1575 1576 /* 1577 * It's the caller's duty to do accounting and remove empty aspath 1578 * structures. Also freeing the unlinked prefix is the caller's duty. 1579 */ 1580 } 1581 1582 /* alloc and zero new entry. May not fail. 
*/ 1583 static struct prefix * 1584 prefix_alloc(void) 1585 { 1586 struct prefix *p; 1587 1588 p = calloc(1, sizeof(*p)); 1589 if (p == NULL) 1590 fatal("prefix_alloc"); 1591 rdemem.prefix_cnt++; 1592 return p; 1593 } 1594 1595 /* free a unlinked entry */ 1596 static void 1597 prefix_free(struct prefix *p) 1598 { 1599 rdemem.prefix_cnt--; 1600 free(p); 1601 } 1602 1603 /* 1604 * nexthop functions 1605 */ 1606 struct nexthop_head *nexthop_hash(struct bgpd_addr *); 1607 struct nexthop *nexthop_lookup(struct bgpd_addr *); 1608 1609 /* 1610 * In BGP there exist two nexthops: the exit nexthop which was announced via 1611 * BGP and the true nexthop which is used in the FIB -- forward information 1612 * base a.k.a kernel routing table. When sending updates it is even more 1613 * confusing. In IBGP we pass the unmodified exit nexthop to the neighbors 1614 * while in EBGP normally the address of the router is sent. The exit nexthop 1615 * may be passed to the external neighbor if the neighbor and the exit nexthop 1616 * reside in the same subnet -- directly connected. 1617 */ 1618 struct nexthop_table { 1619 LIST_HEAD(nexthop_head, nexthop) *nexthop_hashtbl; 1620 u_int32_t nexthop_hashmask; 1621 } nexthoptable; 1622 1623 SIPHASH_KEY nexthoptablekey; 1624 1625 TAILQ_HEAD(nexthop_queue, nexthop) nexthop_runners; 1626 1627 void 1628 nexthop_init(u_int32_t hashsize) 1629 { 1630 u_int32_t hs, i; 1631 1632 for (hs = 1; hs < hashsize; hs <<= 1) 1633 ; 1634 nexthoptable.nexthop_hashtbl = calloc(hs, sizeof(struct nexthop_head)); 1635 if (nexthoptable.nexthop_hashtbl == NULL) 1636 fatal("nextop_init"); 1637 1638 TAILQ_INIT(&nexthop_runners); 1639 for (i = 0; i < hs; i++) 1640 LIST_INIT(&nexthoptable.nexthop_hashtbl[i]); 1641 arc4random_buf(&nexthoptablekey, sizeof(nexthoptablekey)); 1642 1643 nexthoptable.nexthop_hashmask = hs - 1; 1644 } 1645 1646 void 1647 nexthop_shutdown(void) 1648 { 1649 u_int32_t i; 1650 struct nexthop *nh, *nnh; 1651 1652 for (i = 0; i <= nexthoptable.nexthop_hashmask; i++) { 1653 for (nh = LIST_FIRST(&nexthoptable.nexthop_hashtbl[i]); 1654 nh != NULL; nh = nnh) { 1655 nnh = LIST_NEXT(nh, nexthop_l); 1656 nh->state = NEXTHOP_UNREACH; 1657 nexthop_unref(nh); 1658 } 1659 if (!LIST_EMPTY(&nexthoptable.nexthop_hashtbl[i])) { 1660 nh = LIST_FIRST(&nexthoptable.nexthop_hashtbl[i]); 1661 log_warnx("nexthop_shutdown: non-free table, " 1662 "nexthop %s refcnt %d", 1663 log_addr(&nh->exit_nexthop), nh->refcnt); 1664 } 1665 } 1666 1667 free(nexthoptable.nexthop_hashtbl); 1668 } 1669 1670 int 1671 nexthop_pending(void) 1672 { 1673 return !TAILQ_EMPTY(&nexthop_runners); 1674 } 1675 1676 void 1677 nexthop_runner(void) 1678 { 1679 struct nexthop *nh; 1680 struct prefix *p; 1681 u_int32_t j; 1682 1683 nh = TAILQ_FIRST(&nexthop_runners); 1684 if (nh == NULL) 1685 return; 1686 1687 /* remove from runnner queue */ 1688 TAILQ_REMOVE(&nexthop_runners, nh, runner_l); 1689 1690 p = nh->next_prefix; 1691 for (j = 0; p != NULL && j < RDE_RUNNER_ROUNDS; j++) { 1692 prefix_evaluate_all(p, nh->state, nh->oldstate); 1693 p = LIST_NEXT(p, entry.list.nexthop); 1694 } 1695 1696 /* prep for next run, if not finished readd to tail of queue */ 1697 nh->next_prefix = p; 1698 if (p != NULL) 1699 TAILQ_INSERT_TAIL(&nexthop_runners, nh, runner_l); 1700 else 1701 log_debug("nexthop %s update finished", 1702 log_addr(&nh->exit_nexthop)); 1703 } 1704 1705 void 1706 nexthop_update(struct kroute_nexthop *msg) 1707 { 1708 struct nexthop *nh; 1709 1710 nh = nexthop_lookup(&msg->nexthop); 1711 if (nh == NULL) { 1712 
		log_warnx("nexthop_update: non-existent nexthop %s",
		    log_addr(&msg->nexthop));
		return;
	}

	nh->oldstate = nh->state;
	if (msg->valid)
		nh->state = NEXTHOP_REACH;
	else
		nh->state = NEXTHOP_UNREACH;

	if (nh->oldstate == NEXTHOP_LOOKUP)
		/* drop reference which was held during the lookup */
		if (nexthop_unref(nh))
			return;		/* nh lost last ref, no work left */

	if (nh->next_prefix) {
		/*
		 * If nexthop_runner() is not finished with this nexthop
		 * then ensure that all prefixes are updated by setting
		 * the oldstate to NEXTHOP_FLAPPED.
		 */
		nh->oldstate = NEXTHOP_FLAPPED;
		TAILQ_REMOVE(&nexthop_runners, nh, runner_l);
	}

	if (msg->connected) {
		nh->flags |= NEXTHOP_CONNECTED;
		memcpy(&nh->true_nexthop, &nh->exit_nexthop,
		    sizeof(nh->true_nexthop));
	} else
		memcpy(&nh->true_nexthop, &msg->gateway,
		    sizeof(nh->true_nexthop));

	memcpy(&nh->nexthop_net, &msg->net,
	    sizeof(nh->nexthop_net));
	nh->nexthop_netlen = msg->netlen;

	nh->next_prefix = LIST_FIRST(&nh->prefix_h);
	if (nh->next_prefix != NULL) {
		TAILQ_INSERT_HEAD(&nexthop_runners, nh, runner_l);
		log_debug("nexthop %s update starting",
		    log_addr(&nh->exit_nexthop));
	}
}

void
nexthop_modify(struct nexthop *setnh, enum action_types type, u_int8_t aid,
    struct nexthop **nexthop, u_int8_t *flags)
{
	switch (type) {
	case ACTION_SET_NEXTHOP_REJECT:
		*flags = NEXTHOP_REJECT;
		break;
	case ACTION_SET_NEXTHOP_BLACKHOLE:
		*flags = NEXTHOP_BLACKHOLE;
		break;
	case ACTION_SET_NEXTHOP_NOMODIFY:
		*flags = NEXTHOP_NOMODIFY;
		break;
	case ACTION_SET_NEXTHOP_SELF:
		*flags = NEXTHOP_SELF;
		break;
	case ACTION_SET_NEXTHOP_REF:
		/*
		 * it is possible that a prefix matches but has the wrong
		 * address family for the set nexthop. In this case ignore it.
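		 * For example, a "set nexthop 192.0.2.1" filter rule may be
		 * hit by an IPv6 prefix; the nexthop is then left untouched.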
1779 */ 1780 if (aid != setnh->exit_nexthop.aid) 1781 break; 1782 nexthop_unref(*nexthop); 1783 *nexthop = nexthop_ref(setnh); 1784 *flags = 0; 1785 break; 1786 default: 1787 break; 1788 } 1789 } 1790 1791 void 1792 nexthop_link(struct prefix *p) 1793 { 1794 if (p->nexthop == NULL) 1795 return; 1796 1797 /* no need to link prefixes in RIBs that have no decision process */ 1798 if (re_rib(p->re)->flags & F_RIB_NOEVALUATE) 1799 return; 1800 1801 p->flags |= PREFIX_NEXTHOP_LINKED; 1802 LIST_INSERT_HEAD(&p->nexthop->prefix_h, p, entry.list.nexthop); 1803 } 1804 1805 void 1806 nexthop_unlink(struct prefix *p) 1807 { 1808 if (p->nexthop == NULL || (p->flags & PREFIX_NEXTHOP_LINKED) == 0) 1809 return; 1810 1811 if (p == p->nexthop->next_prefix) { 1812 p->nexthop->next_prefix = LIST_NEXT(p, entry.list.nexthop); 1813 /* remove nexthop from list if no prefixes left to update */ 1814 if (p->nexthop->next_prefix == NULL) { 1815 TAILQ_REMOVE(&nexthop_runners, p->nexthop, runner_l); 1816 log_debug("nexthop %s update finished", 1817 log_addr(&p->nexthop->exit_nexthop)); 1818 } 1819 } 1820 1821 p->flags &= ~PREFIX_NEXTHOP_LINKED; 1822 LIST_REMOVE(p, entry.list.nexthop); 1823 } 1824 1825 struct nexthop * 1826 nexthop_get(struct bgpd_addr *nexthop) 1827 { 1828 struct nexthop *nh; 1829 1830 nh = nexthop_lookup(nexthop); 1831 if (nh == NULL) { 1832 nh = calloc(1, sizeof(*nh)); 1833 if (nh == NULL) 1834 fatal("nexthop_alloc"); 1835 rdemem.nexthop_cnt++; 1836 1837 LIST_INIT(&nh->prefix_h); 1838 nh->state = NEXTHOP_LOOKUP; 1839 nexthop_ref(nh); /* take reference for lookup */ 1840 nh->exit_nexthop = *nexthop; 1841 LIST_INSERT_HEAD(nexthop_hash(nexthop), nh, 1842 nexthop_l); 1843 1844 rde_send_nexthop(&nh->exit_nexthop, 1); 1845 } 1846 1847 return nexthop_ref(nh); 1848 } 1849 1850 struct nexthop * 1851 nexthop_ref(struct nexthop *nexthop) 1852 { 1853 if (nexthop) 1854 nexthop->refcnt++; 1855 return (nexthop); 1856 } 1857 1858 int 1859 nexthop_unref(struct nexthop *nh) 1860 { 1861 if (nh == NULL) 1862 return (0); 1863 if (--nh->refcnt > 0) 1864 return (0); 1865 1866 /* sanity check */ 1867 if (!LIST_EMPTY(&nh->prefix_h) || nh->state == NEXTHOP_LOOKUP) 1868 fatalx("%s: refcnt error", __func__); 1869 1870 /* is nexthop update running? 
impossible, that is a refcnt error */ 1871 if (nh->next_prefix) 1872 fatalx("%s: next_prefix not NULL", __func__); 1873 1874 LIST_REMOVE(nh, nexthop_l); 1875 rde_send_nexthop(&nh->exit_nexthop, 0); 1876 1877 rdemem.nexthop_cnt--; 1878 free(nh); 1879 return (1); 1880 } 1881 1882 int 1883 nexthop_compare(struct nexthop *na, struct nexthop *nb) 1884 { 1885 struct bgpd_addr *a, *b; 1886 1887 if (na == nb) 1888 return (0); 1889 if (na == NULL) 1890 return (-1); 1891 if (nb == NULL) 1892 return (1); 1893 1894 a = &na->exit_nexthop; 1895 b = &nb->exit_nexthop; 1896 1897 if (a->aid != b->aid) 1898 return (a->aid - b->aid); 1899 1900 switch (a->aid) { 1901 case AID_INET: 1902 if (ntohl(a->v4.s_addr) > ntohl(b->v4.s_addr)) 1903 return (1); 1904 if (ntohl(a->v4.s_addr) < ntohl(b->v4.s_addr)) 1905 return (-1); 1906 return (0); 1907 case AID_INET6: 1908 return (memcmp(&a->v6, &b->v6, sizeof(struct in6_addr))); 1909 default: 1910 fatalx("nexthop_cmp: unknown af"); 1911 } 1912 return (-1); 1913 } 1914 1915 struct nexthop * 1916 nexthop_lookup(struct bgpd_addr *nexthop) 1917 { 1918 struct nexthop *nh; 1919 1920 LIST_FOREACH(nh, nexthop_hash(nexthop), nexthop_l) { 1921 if (memcmp(&nh->exit_nexthop, nexthop, 1922 sizeof(struct bgpd_addr)) == 0) 1923 return (nh); 1924 } 1925 return (NULL); 1926 } 1927 1928 struct nexthop_head * 1929 nexthop_hash(struct bgpd_addr *nexthop) 1930 { 1931 u_int32_t h = 0; 1932 1933 switch (nexthop->aid) { 1934 case AID_INET: 1935 h = SipHash24(&nexthoptablekey, &nexthop->v4.s_addr, 1936 sizeof(nexthop->v4.s_addr)); 1937 break; 1938 case AID_INET6: 1939 h = SipHash24(&nexthoptablekey, &nexthop->v6, 1940 sizeof(struct in6_addr)); 1941 break; 1942 default: 1943 fatalx("nexthop_hash: unsupported AF"); 1944 } 1945 return (&nexthoptable.nexthop_hashtbl[h & nexthoptable.nexthop_hashmask]); 1946 } 1947
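
/*
 * How the dump contexts defined earlier in this file are typically driven
 * (an illustrative sketch, not a new interface; upcall, done, throttle and
 * arg stand for whatever the caller supplies): a walk is registered with
 * rib_dump_new() or prefix_dump_new() using a non-zero count, and the RDE
 * event loop then calls rib_dump_runner() whenever rib_dump_pending()
 * reports unthrottled contexts, e.g.
 *
 *	rib_dump_new(RIB_LOC_START, AID_UNSPEC, 128, arg,
 *	    upcall, done, throttle);
 *	...
 *	if (rib_dump_pending())
 *		rib_dump_runner();
 *
 * With count == 0 the whole walk is instead performed synchronously before
 * rib_dump_new() returns.
 */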