/*	$NetBSD: optimize.c,v 1.13 2024/09/02 15:33:37 christos Exp $	*/

/*
 * Copyright (c) 1988, 1989, 1990, 1991, 1993, 1994, 1995, 1996
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that: (1) source code distributions
 * retain the above copyright notice and this paragraph in its entirety, (2)
 * distributions including binary code include the above copyright notice and
 * this paragraph in its entirety in the documentation or other materials
 * provided with the distribution, and (3) all advertising materials mentioning
 * features or use of this software display the following acknowledgement:
 * ``This product includes software developed by the University of California,
 * Lawrence Berkeley Laboratory and its contributors.'' Neither the name of
 * the University nor the names of its contributors may be used to endorse
 * or promote products derived from this software without specific prior
 * written permission.
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
 *
 * Optimization module for BPF code intermediate representation.
 */

#include <sys/cdefs.h>
__RCSID("$NetBSD: optimize.c,v 1.13 2024/09/02 15:33:37 christos Exp $");

#include <config.h>

#include <pcap-types.h>

#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include <setjmp.h>
#include <string.h>
#include <limits.h>	/* for SIZE_MAX */
#include <errno.h>

#include "pcap-int.h"

#include "gencode.h"
#include "optimize.h"
#include "diag-control.h"

#ifdef HAVE_OS_PROTO_H
#include "os-proto.h"
#endif

#ifdef BDEBUG
/*
 * The internal "debug printout" flag for the filter expression optimizer.
 * The code to print that stuff is present only if BDEBUG is defined, so
 * the flag, and the routine to set it, are defined only if BDEBUG is
 * defined.
 */
static int pcap_optimizer_debug;

/*
 * Routine to set that flag.
 *
 * This is intended for libpcap developers, not for general use.
 * If you want to set these in a program, you'll have to declare this
 * routine yourself, with the appropriate DLL import attribute on Windows;
 * it's not declared in any header file, and won't be declared in any
 * header file provided by libpcap.
 */
PCAP_API void pcap_set_optimizer_debug(int value);

PCAP_API_DEF void
pcap_set_optimizer_debug(int value)
{
	pcap_optimizer_debug = value;
}

/*
 * The internal "print dot graph" flag for the filter expression optimizer.
 * The code to print that stuff is present only if BDEBUG is defined, so
 * the flag, and the routine to set it, are defined only if BDEBUG is
 * defined.
 */
static int pcap_print_dot_graph;

/*
 * Routine to set that flag.
 *
 * This is intended for libpcap developers, not for general use.
 * If you want to set these in a program, you'll have to declare this
 * routine yourself, with the appropriate DLL import attribute on Windows;
 * it's not declared in any header file, and won't be declared in any
 * header file provided by libpcap.
 */
PCAP_API void pcap_set_print_dot_graph(int value);

PCAP_API_DEF void
pcap_set_print_dot_graph(int value)
{
	pcap_print_dot_graph = value;
}

#endif

/*
 * lowest_set_bit().
 *
 * Takes a 32-bit integer as an argument.
 *
 * If handed a non-zero value, returns the index of the lowest set bit,
 * counting upwards from zero.
 *
 * If handed zero, the results are platform- and compiler-dependent.
 * Keep it out of the light, don't give it any water, don't feed it
 * after midnight, and don't pass zero to it.
 *
 * This is the same as the count of trailing zeroes in the word.
 */
#if PCAP_IS_AT_LEAST_GNUC_VERSION(3,4)
/*
 * GCC 3.4 and later; we have __builtin_ctz().
 */
#define lowest_set_bit(mask) ((u_int)__builtin_ctz(mask))
#elif defined(_MSC_VER)
/*
 * Visual Studio; we support only 2005 and later, so use
 * _BitScanForward().
 */
#include <intrin.h>

#ifndef __clang__
#pragma intrinsic(_BitScanForward)
#endif

static __forceinline u_int
lowest_set_bit(int mask)
{
	unsigned long bit;

	/*
	 * Don't sign-extend mask if long is longer than int.
	 * (It's currently not, in MSVC, even on 64-bit platforms, but....)
	 */
	if (_BitScanForward(&bit, (unsigned int)mask) == 0)
		abort();	/* mask is zero */
	return (u_int)bit;
}
#elif defined(MSDOS) && defined(__DJGPP__)
/*
 * MS-DOS with DJGPP, which declares ffs() in <string.h>, which
 * we've already included.
 */
#define lowest_set_bit(mask)	((u_int)(ffs((mask)) - 1))
#elif (defined(MSDOS) && defined(__WATCOMC__)) || defined(STRINGS_H_DECLARES_FFS)
/*
 * MS-DOS with Watcom C, which has <strings.h> and declares ffs() there,
 * or some other platform (UN*X conforming to a sufficiently recent version
 * of the Single UNIX Specification).
 */
#include <strings.h>
#define lowest_set_bit(mask)	(u_int)((ffs((mask)) - 1))
#else
/*
 * None of the above.
 * Use a perfect-hash-function-based function.
 */
static u_int
lowest_set_bit(int mask)
{
	unsigned int v = (unsigned int)mask;

	static const u_int MultiplyDeBruijnBitPosition[32] = {
		0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
		31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
	};

	/*
	 * We strip off all but the lowermost set bit (v & -v),
	 * and perform a minimal perfect hash on it to look up the
	 * number of low-order zero bits in a table.
	 *
	 * See:
	 *
	 *	http://7ooo.mooo.com/text/ComputingTrailingZerosHOWTO.pdf
	 *
	 *	http://supertech.csail.mit.edu/papers/debruijn.pdf
	 */
	return (MultiplyDeBruijnBitPosition[((v & -v) * 0x077CB531U) >> 27]);
}
#endif
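
/*
 * Illustrative worked example (added commentary, not part of the original
 * NetBSD/libpcap source): for mask = 0x28 (binary 101000), the fallback
 * lowest_set_bit() above computes
 *
 *	v & -v             = 0x00000008   (isolate the lowest set bit)
 *	0x8 * 0x077CB531U  = 0x3BE5A988
 *	0x3BE5A988 >> 27   = 7
 *	MultiplyDeBruijnBitPosition[7] = 3
 *
 * i.e. the lowest set bit of 0x28 is bit 3, matching __builtin_ctz(0x28).
 */
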
/*
 * Represents a deleted instruction.
 */
#define NOP -1

/*
 * Register numbers for use-def values.
 * 0 through BPF_MEMWORDS-1 represent the corresponding scratch memory
 * location.  A_ATOM is the accumulator and X_ATOM is the index
 * register.
 */
#define A_ATOM BPF_MEMWORDS
#define X_ATOM (BPF_MEMWORDS+1)

/*
 * This define is used to represent *both* the accumulator and
 * x register in use-def computations.
 * Currently, the use-def code assumes only one definition per instruction.
 */
#define AX_ATOM N_ATOMS

/*
 * These data structures are used in a Cocke and Schwartz style
 * value numbering scheme.  Since the flowgraph is acyclic,
 * exit values can be propagated from a node's predecessors
 * provided it is uniquely defined.
 */
struct valnode {
	int code;
	bpf_u_int32 v0, v1;
	int val;		/* the value number */
	struct valnode *next;
};

/* Integer constants mapped with the load immediate opcode. */
#define K(i) F(opt_state, BPF_LD|BPF_IMM|BPF_W, i, 0U)

struct vmapinfo {
	int is_const;
	bpf_u_int32 const_val;
};

typedef struct {
	/*
	 * Place to longjmp to on an error.
	 */
	jmp_buf top_ctx;

	/*
	 * The buffer into which to put error messages.
	 */
	char *errbuf;

	/*
	 * A flag to indicate that further optimization is needed.
	 * Iterative passes are continued until a given pass yields no
	 * code simplification or branch movement.
	 */
	int done;

	/*
	 * XXX - detect loops that do nothing but repeated AND/OR pullups
	 * and edge moves.
	 * If 100 passes in a row do nothing but that, treat that as a
	 * sign that we're in a cycle in which each pass just shuffles
	 * the code and we eventually get back to the original
	 * configuration.
	 *
	 * XXX - we need a non-heuristic way of detecting, or preventing,
	 * such a cycle.
	 */
	int non_branch_movement_performed;

	u_int n_blocks;		/* number of blocks in the CFG; guaranteed to be > 0, as it's a RET instruction at a minimum */
	struct block **blocks;
	u_int n_edges;		/* twice n_blocks, so guaranteed to be > 0 */
	struct edge **edges;

	/*
	 * A bit vector set representation of the dominators.
	 * We round up the set size to the next power of two.
	 */
	u_int nodewords;	/* number of 32-bit words for a bit vector of "number of nodes" bits; guaranteed to be > 0 */
	u_int edgewords;	/* number of 32-bit words for a bit vector of "number of edges" bits; guaranteed to be > 0 */
	struct block **levels;
	bpf_u_int32 *space;

#define BITS_PER_WORD (8*sizeof(bpf_u_int32))
/*
 * True if 'a' is in uset {p}
 */
#define SET_MEMBER(p, a) \
((p)[(unsigned)(a) / BITS_PER_WORD] & ((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD)))

/*
 * Add 'a' to uset p.
 */
#define SET_INSERT(p, a) \
(p)[(unsigned)(a) / BITS_PER_WORD] |= ((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD))

/*
 * Delete 'a' from uset p.
 */
#define SET_DELETE(p, a) \
(p)[(unsigned)(a) / BITS_PER_WORD] &= ~((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD))

/*
 * a := a intersect b
 * n must be guaranteed to be > 0
 */
#define SET_INTERSECT(a, b, n)\
{\
	register bpf_u_int32 *_x = a, *_y = b;\
	register u_int _n = n;\
	do *_x++ &= *_y++; while (--_n != 0);\
}

/*
 * a := a - b
 * n must be guaranteed to be > 0
 */
#define SET_SUBTRACT(a, b, n)\
{\
	register bpf_u_int32 *_x = a, *_y = b;\
	register u_int _n = n;\
	do *_x++ &=~ *_y++; while (--_n != 0);\
}

/*
 * a := a union b
 * n must be guaranteed to be > 0
 */
#define SET_UNION(a, b, n)\
{\
	register bpf_u_int32 *_x = a, *_y = b;\
	register u_int _n = n;\
	do *_x++ |= *_y++; while (--_n != 0);\
}

	uset all_dom_sets;
	uset all_closure_sets;
	uset all_edge_sets;

#define MODULUS 213
	struct valnode *hashtbl[MODULUS];
	bpf_u_int32 curval;
	bpf_u_int32 maxval;

	struct vmapinfo *vmap;
	struct valnode *vnode_base;
	struct valnode *next_vnode;
} opt_state_t;

typedef struct {
	/*
	 * Place to longjmp to on an error.
	 */
	jmp_buf top_ctx;

	/*
	 * The buffer into which to put error messages.
	 */
	char *errbuf;

	/*
	 * Some pointers used to convert the basic block form of the code
	 * into the array form that BPF requires.  'fstart' will point to
	 * the malloc'd array while 'ftail' is used during the recursive
	 * traversal.
	 */
	struct bpf_insn *fstart;
	struct bpf_insn *ftail;
} conv_state_t;

static void opt_init(opt_state_t *, struct icode *);
static void opt_cleanup(opt_state_t *);
static void PCAP_NORETURN opt_error(opt_state_t *, const char *, ...)
    PCAP_PRINTFLIKE(2, 3);

static void intern_blocks(opt_state_t *, struct icode *);

static void find_inedges(opt_state_t *, struct block *);
#ifdef BDEBUG
static void opt_dump(opt_state_t *, struct icode *);
#endif

#ifndef MAX
#define MAX(a,b) ((a)>(b)?(a):(b))
#endif

static void
find_levels_r(opt_state_t *opt_state, struct icode *ic, struct block *b)
{
	int level;

	if (isMarked(ic, b))
		return;

	Mark(ic, b);
	b->link = 0;

	if (JT(b)) {
		find_levels_r(opt_state, ic, JT(b));
		find_levels_r(opt_state, ic, JF(b));
		level = MAX(JT(b)->level, JF(b)->level) + 1;
	} else
		level = 0;
	b->level = level;
	b->link = opt_state->levels[level];
	opt_state->levels[level] = b;
}

/*
 * Level graph.  The levels go from 0 at the leaves to
 * N_LEVELS at the root.  The opt_state->levels[] array points to the
 * first node of the level list, whose elements are linked
 * with the 'link' field of the struct block.
 */
static void
find_levels(opt_state_t *opt_state, struct icode *ic)
{
	memset((char *)opt_state->levels, 0, opt_state->n_blocks * sizeof(*opt_state->levels));
	unMarkAll(ic);
	find_levels_r(opt_state, ic, ic->root);
}
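
/*
 * Illustrative sketch (added commentary, not part of the original source):
 * how the uset macros defined above are used by find_dom() and friends
 * below.  The 40-node graph size here is hypothetical; it just forces the
 * set to span two 32-bit words.
 */
#if 0
static void
uset_example(void)
{
	bpf_u_int32 a[2] = { 0, 0 };	/* 2 words cover node IDs 0..63 */
	bpf_u_int32 b[2] = { 0, 0 };

	SET_INSERT(a, 3);		/* sets bit 3 of a[0] */
	SET_INSERT(a, 35);		/* sets bit 3 of a[1] */
	SET_INSERT(b, 35);

	SET_INTERSECT(a, b, 2);		/* a now contains only node 35 */
	if (!SET_MEMBER(a, 3) && SET_MEMBER(a, 35)) {
		/* expected outcome of the intersection */
	}
	SET_DELETE(a, 35);		/* a is now empty again */
}
#endif
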
/*
 * Find dominator relationships.
 * Assumes graph has been leveled.
 */
static void
find_dom(opt_state_t *opt_state, struct block *root)
{
	u_int i;
	int level;
	struct block *b;
	bpf_u_int32 *x;

	/*
	 * Initialize sets to contain all nodes.
	 */
	x = opt_state->all_dom_sets;
	/*
	 * In opt_init(), we've made sure the product doesn't overflow.
	 */
	i = opt_state->n_blocks * opt_state->nodewords;
	while (i != 0) {
		--i;
		*x++ = 0xFFFFFFFFU;
	}
	/* Root starts off empty. */
	for (i = opt_state->nodewords; i != 0;) {
		--i;
		root->dom[i] = 0;
	}

	/* root->level is the highest level number found. */
	for (level = root->level; level >= 0; --level) {
		for (b = opt_state->levels[level]; b; b = b->link) {
			SET_INSERT(b->dom, b->id);
			if (JT(b) == 0)
				continue;
			SET_INTERSECT(JT(b)->dom, b->dom, opt_state->nodewords);
			SET_INTERSECT(JF(b)->dom, b->dom, opt_state->nodewords);
		}
	}
}

static void
propedom(opt_state_t *opt_state, struct edge *ep)
{
	SET_INSERT(ep->edom, ep->id);
	if (ep->succ) {
		SET_INTERSECT(ep->succ->et.edom, ep->edom, opt_state->edgewords);
		SET_INTERSECT(ep->succ->ef.edom, ep->edom, opt_state->edgewords);
	}
}

/*
 * Compute edge dominators.
 * Assumes graph has been leveled and predecessors established.
 */
static void
find_edom(opt_state_t *opt_state, struct block *root)
{
	u_int i;
	uset x;
	int level;
	struct block *b;

	x = opt_state->all_edge_sets;
	/*
	 * In opt_init(), we've made sure the product doesn't overflow.
	 */
	for (i = opt_state->n_edges * opt_state->edgewords; i != 0; ) {
		--i;
		x[i] = 0xFFFFFFFFU;
	}

	/* root->level is the highest level number found. */
	memset(root->et.edom, 0, opt_state->edgewords * sizeof(*(uset)0));
	memset(root->ef.edom, 0, opt_state->edgewords * sizeof(*(uset)0));
	for (level = root->level; level >= 0; --level) {
		for (b = opt_state->levels[level]; b != 0; b = b->link) {
			propedom(opt_state, &b->et);
			propedom(opt_state, &b->ef);
		}
	}
}

/*
 * Find the backwards transitive closure of the flow graph.  These sets
 * are backwards in the sense that we find the set of nodes that reach
 * a given node, not the set of nodes that can be reached by a node.
 *
 * Assumes graph has been leveled.
 */
static void
find_closure(opt_state_t *opt_state, struct block *root)
{
	int level;
	struct block *b;

	/*
	 * Initialize sets to contain no nodes.
	 */
	memset((char *)opt_state->all_closure_sets, 0,
	    opt_state->n_blocks * opt_state->nodewords * sizeof(*opt_state->all_closure_sets));

	/* root->level is the highest level number found. */
	for (level = root->level; level >= 0; --level) {
		for (b = opt_state->levels[level]; b; b = b->link) {
			SET_INSERT(b->closure, b->id);
			if (JT(b) == 0)
				continue;
			SET_UNION(JT(b)->closure, b->closure, opt_state->nodewords);
			SET_UNION(JF(b)->closure, b->closure, opt_state->nodewords);
		}
	}
}
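
/*
 * Illustrative example (added commentary, not part of the original source):
 * for a hypothetical diamond-shaped flowgraph
 *
 *	    R
 *	 t / \ f
 *	  A   B
 *	   \ /
 *	    X
 *
 * find_closure() computes closure(R) = {R}, closure(A) = {R, A},
 * closure(B) = {R, B} and closure(X) = {R, A, B, X}, i.e. the set of
 * nodes from which each node can be reached.  By contrast, find_dom()
 * computes dom(X) = {R, X}, since X can be reached without going
 * through A and without going through B.
 */
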
/*
 * Return the register number that is used by s.
 *
 * Returns A_ATOM if A is used, X_ATOM if X is used, AX_ATOM if both A and X
 * are used, the scratch memory location's number if a scratch memory
 * location is used (e.g., 0 for M[0]), or -1 if none of those are used.
 *
 * The implementation should probably change to an array access.
 */
static int
atomuse(struct stmt *s)
{
	register int c = s->code;

	if (c == NOP)
		return -1;

	switch (BPF_CLASS(c)) {

	case BPF_RET:
		return (BPF_RVAL(c) == BPF_A) ? A_ATOM :
			(BPF_RVAL(c) == BPF_X) ? X_ATOM : -1;

	case BPF_LD:
	case BPF_LDX:
		/*
		 * As there are fewer than 2^31 memory locations,
		 * s->k should be convertible to int without problems.
		 */
		return (BPF_MODE(c) == BPF_IND) ? X_ATOM :
			(BPF_MODE(c) == BPF_MEM) ? (int)s->k : -1;

	case BPF_ST:
		return A_ATOM;

	case BPF_STX:
		return X_ATOM;

	case BPF_JMP:
	case BPF_ALU:
		if (BPF_SRC(c) == BPF_X)
			return AX_ATOM;
		return A_ATOM;

	case BPF_MISC:
		return BPF_MISCOP(c) == BPF_TXA ? X_ATOM : A_ATOM;
	}
	abort();
	/* NOTREACHED */
}

/*
 * Return the register number that is defined by 's'.  We assume that
 * a single stmt cannot define more than one register.  If no register
 * is defined, return -1.
 *
 * The implementation should probably change to an array access.
 */
static int
atomdef(struct stmt *s)
{
	if (s->code == NOP)
		return -1;

	switch (BPF_CLASS(s->code)) {

	case BPF_LD:
	case BPF_ALU:
		return A_ATOM;

	case BPF_LDX:
		return X_ATOM;

	case BPF_ST:
	case BPF_STX:
		return s->k;

	case BPF_MISC:
		return BPF_MISCOP(s->code) == BPF_TAX ? X_ATOM : A_ATOM;
	}
	return -1;
}

/*
 * Compute the sets of registers used, defined, and killed by 'b'.
 *
 * "Used" means that a statement in 'b' uses the register before any
 * statement in 'b' defines it, i.e. it uses the value left in
 * that register by a predecessor block of this block.
 * "Defined" means that a statement in 'b' defines it.
 * "Killed" means that a statement in 'b' defines it before any
 * statement in 'b' uses it, i.e. it kills the value left in that
 * register by a predecessor block of this block.
 */
static void
compute_local_ud(struct block *b)
{
	struct slist *s;
	atomset def = 0, use = 0, killed = 0;
	int atom;

	for (s = b->stmts; s; s = s->next) {
		if (s->s.code == NOP)
			continue;
		atom = atomuse(&s->s);
		if (atom >= 0) {
			if (atom == AX_ATOM) {
				if (!ATOMELEM(def, X_ATOM))
					use |= ATOMMASK(X_ATOM);
				if (!ATOMELEM(def, A_ATOM))
					use |= ATOMMASK(A_ATOM);
			}
			else if (atom < N_ATOMS) {
				if (!ATOMELEM(def, atom))
					use |= ATOMMASK(atom);
			}
			else
				abort();
		}
		atom = atomdef(&s->s);
		if (atom >= 0) {
			if (!ATOMELEM(use, atom))
				killed |= ATOMMASK(atom);
			def |= ATOMMASK(atom);
		}
	}
	if (BPF_CLASS(b->s.code) == BPF_JMP) {
		/*
		 * XXX - what about RET?
		 */
		atom = atomuse(&b->s);
		if (atom >= 0) {
			if (atom == AX_ATOM) {
				if (!ATOMELEM(def, X_ATOM))
					use |= ATOMMASK(X_ATOM);
				if (!ATOMELEM(def, A_ATOM))
					use |= ATOMMASK(A_ATOM);
			}
			else if (atom < N_ATOMS) {
				if (!ATOMELEM(def, atom))
					use |= ATOMMASK(atom);
			}
			else
				abort();
		}
	}

	b->def = def;
	b->kill = killed;
	b->in_use = use;
}
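
/*
 * Illustrative sketch (added commentary, not part of the original source):
 * what atomuse() and atomdef() report for a few concrete statements, and
 * hence how compute_local_ud() classifies them.  The initializer values
 * here are hypothetical test data.
 */
#if 0
static void
use_def_example(void)
{
	struct stmt ld_mem = { .code = BPF_LD|BPF_MEM, .k = 2 };	/* A <- M[2] */
	struct stmt tax = { .code = BPF_MISC|BPF_TAX, .k = 0 };		/* X <- A */

	/* "ld M[2]" uses scratch location 2 and defines the accumulator. */
	if (atomuse(&ld_mem) == 2 && atomdef(&ld_mem) == A_ATOM) {
		/* so a block starting with it has M[2] in b->in_use */
	}

	/* "tax" uses A and defines X. */
	if (atomuse(&tax) == A_ATOM && atomdef(&tax) == X_ATOM) {
		/* so X ends up in b->kill unless X was used earlier */
	}
}
#endif
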
/*
 * Assume graph is already leveled.
 */
static void
find_ud(opt_state_t *opt_state, struct block *root)
{
	int i, maxlevel;
	struct block *p;

	/*
	 * root->level is the highest level number found;
	 * count down from there.
	 */
	maxlevel = root->level;
	for (i = maxlevel; i >= 0; --i)
		for (p = opt_state->levels[i]; p; p = p->link) {
			compute_local_ud(p);
			p->out_use = 0;
		}

	for (i = 1; i <= maxlevel; ++i) {
		for (p = opt_state->levels[i]; p; p = p->link) {
			p->out_use |= JT(p)->in_use | JF(p)->in_use;
			p->in_use |= p->out_use &~ p->kill;
		}
	}
}

static void
init_val(opt_state_t *opt_state)
{
	opt_state->curval = 0;
	opt_state->next_vnode = opt_state->vnode_base;
	memset((char *)opt_state->vmap, 0, opt_state->maxval * sizeof(*opt_state->vmap));
	memset((char *)opt_state->hashtbl, 0, sizeof opt_state->hashtbl);
}

/*
 * Because we really don't have an IR, this stuff is a little messy.
 *
 * This routine looks in the table of existing value numbers for a value
 * generated from an operation with the specified opcode and the specified
 * operand values.  If it finds it, it returns its value number, otherwise
 * it makes a new entry in the table and returns the value number of that
 * entry.
 */
static bpf_u_int32
F(opt_state_t *opt_state, int code, bpf_u_int32 v0, bpf_u_int32 v1)
{
	u_int hash;
	bpf_u_int32 val;
	struct valnode *p;

	hash = (u_int)code ^ (v0 << 4) ^ (v1 << 8);
	hash %= MODULUS;

	for (p = opt_state->hashtbl[hash]; p; p = p->next)
		if (p->code == code && p->v0 == v0 && p->v1 == v1)
			return p->val;

	/*
	 * Not found.  Allocate a new value, and assign it a new
	 * value number.
	 *
	 * opt_state->curval starts out as 0, which means VAL_UNKNOWN; we
	 * increment it before using it as the new value number, which
	 * means we never assign VAL_UNKNOWN.
	 *
	 * XXX - unless we overflow, but we probably won't have 2^32-1
	 * values; we treat 32 bits as effectively infinite.
	 */
	val = ++opt_state->curval;
	if (BPF_MODE(code) == BPF_IMM &&
	    (BPF_CLASS(code) == BPF_LD || BPF_CLASS(code) == BPF_LDX)) {
		opt_state->vmap[val].const_val = v0;
		opt_state->vmap[val].is_const = 1;
	}
	p = opt_state->next_vnode++;
	p->val = val;
	p->code = code;
	p->v0 = v0;
	p->v1 = v1;
	p->next = opt_state->hashtbl[hash];
	opt_state->hashtbl[hash] = p;

	return val;
}

static inline void
vstore(struct stmt *s, bpf_u_int32 *valp, bpf_u_int32 newval, int alter)
{
	if (alter && newval != VAL_UNKNOWN && *valp == newval)
		s->code = NOP;
	else
		*valp = newval;
}
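
/*
 * Illustrative sketch (added commentary, not part of the original source):
 * how F() gives identical expressions identical value numbers.  The
 * function name and the constants are hypothetical.
 */
#if 0
static void
value_numbering_example(opt_state_t *opt_state)
{
	bpf_u_int32 va, vb, vc;

	va = K(14);		/* "load immediate 14" */
	vb = K(14);		/* same opcode and operand... */
	/*
	 * ...so the same value number comes back: va == vb, and
	 * opt_state->vmap[va].is_const is 1 with const_val 14.
	 */

	vc = F(opt_state, BPF_LD|BPF_ABS|BPF_H, 12, 0U);
	/*
	 * A different expression (a packet load) gets a fresh value
	 * number, and it is not marked as a constant.
	 */
	(void)va; (void)vb; (void)vc;
}
#endif
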
/*
 * Do constant-folding on binary operators.
 * (Unary operators are handled elsewhere.)
 */
static void
fold_op(opt_state_t *opt_state, struct stmt *s, bpf_u_int32 v0, bpf_u_int32 v1)
{
	bpf_u_int32 a, b;

	a = opt_state->vmap[v0].const_val;
	b = opt_state->vmap[v1].const_val;

	switch (BPF_OP(s->code)) {
	case BPF_ADD:
		a += b;
		break;

	case BPF_SUB:
		a -= b;
		break;

	case BPF_MUL:
		a *= b;
		break;

	case BPF_DIV:
		if (b == 0)
			opt_error(opt_state, "division by zero");
		a /= b;
		break;

	case BPF_MOD:
		if (b == 0)
			opt_error(opt_state, "modulus by zero");
		a %= b;
		break;

	case BPF_AND:
		a &= b;
		break;

	case BPF_OR:
		a |= b;
		break;

	case BPF_XOR:
		a ^= b;
		break;

	case BPF_LSH:
		/*
		 * A left shift of more than the width of the type
		 * is undefined in C; we'll just treat it as shifting
		 * all the bits out.
		 *
		 * XXX - the BPF interpreter doesn't check for this,
		 * so its behavior is dependent on the behavior of
		 * the processor on which it's running.  There are
		 * processors on which it shifts all the bits out
		 * and processors on which it does no shift.
		 */
		if (b < 32)
			a <<= b;
		else
			a = 0;
		break;

	case BPF_RSH:
		/*
		 * A right shift of more than the width of the type
		 * is undefined in C; we'll just treat it as shifting
		 * all the bits out.
		 *
		 * XXX - the BPF interpreter doesn't check for this,
		 * so its behavior is dependent on the behavior of
		 * the processor on which it's running.  There are
		 * processors on which it shifts all the bits out
		 * and processors on which it does no shift.
		 */
		if (b < 32)
			a >>= b;
		else
			a = 0;
		break;

	default:
		abort();
	}
	s->k = a;
	s->code = BPF_LD|BPF_IMM;
	/*
	 * XXX - optimizer loop detection.
	 */
	opt_state->non_branch_movement_performed = 1;
	opt_state->done = 0;
}

static inline struct slist *
this_op(struct slist *s)
{
	while (s != 0 && s->s.code == NOP)
		s = s->next;
	return s;
}

static void
opt_not(struct block *b)
{
	struct block *tmp = JT(b);

	JT(b) = JF(b);
	JF(b) = tmp;
}

static void
opt_peep(opt_state_t *opt_state, struct block *b)
{
	struct slist *s;
	struct slist *next, *last;
	bpf_u_int32 val;

	s = b->stmts;
	if (s == 0)
		return;

	last = s;
	for (/*empty*/; /*empty*/; s = next) {
		/*
		 * Skip over nops.
		 */
		s = this_op(s);
		if (s == 0)
			break;	/* nothing left in the block */

		/*
		 * Find the next real instruction after that one
		 * (skipping nops).
		 */
		next = this_op(s->next);
		if (next == 0)
			break;	/* no next instruction */
		last = next;

		/*
		 * st  M[k]	-->	st  M[k]
		 * ldx M[k]		tax
		 */
		if (s->s.code == BPF_ST &&
		    next->s.code == (BPF_LDX|BPF_MEM) &&
		    s->s.k == next->s.k) {
			/*
			 * XXX - optimizer loop detection.
			 */
			opt_state->non_branch_movement_performed = 1;
			opt_state->done = 0;
			next->s.code = BPF_MISC|BPF_TAX;
		}
		/*
		 * ld  #k	-->	ldx #k
		 * tax			txa
		 */
		if (s->s.code == (BPF_LD|BPF_IMM) &&
		    next->s.code == (BPF_MISC|BPF_TAX)) {
			s->s.code = BPF_LDX|BPF_IMM;
			next->s.code = BPF_MISC|BPF_TXA;
			/*
			 * XXX - optimizer loop detection.
			 */
			opt_state->non_branch_movement_performed = 1;
			opt_state->done = 0;
		}
		/*
		 * This is an ugly special case, but it happens
		 * when you say tcp[k] or udp[k] where k is a constant.
		 */
		if (s->s.code == (BPF_LD|BPF_IMM)) {
			struct slist *add, *tax, *ild;

			/*
			 * Check that X isn't used on exit from this
			 * block (which the optimizer might cause).
			 * We know the code generator won't generate
			 * any local dependencies.
			 */
			if (ATOMELEM(b->out_use, X_ATOM))
				continue;

			/*
			 * Check that the instruction following the ldi
			 * is an addx, or it's an ldxms with an addx
			 * following it (with 0 or more nops between the
			 * ldxms and addx).
			 */
			if (next->s.code != (BPF_LDX|BPF_MSH|BPF_B))
				add = next;
			else
				add = this_op(next->next);
			if (add == 0 || add->s.code != (BPF_ALU|BPF_ADD|BPF_X))
				continue;

			/*
			 * Check that a tax follows that (with 0 or more
			 * nops between them).
			 */
			tax = this_op(add->next);
			if (tax == 0 || tax->s.code != (BPF_MISC|BPF_TAX))
				continue;

			/*
			 * Check that an ild follows that (with 0 or more
			 * nops between them).
			 */
			ild = this_op(tax->next);
			if (ild == 0 || BPF_CLASS(ild->s.code) != BPF_LD ||
			    BPF_MODE(ild->s.code) != BPF_IND)
				continue;
			/*
			 * We want to turn this sequence:
			 *
			 * (004) ldi     #0x2		{s}
			 * (005) ldxms   [14]		{next}  --  optional
			 * (006) addx			{add}
			 * (007) tax			{tax}
			 * (008) ild     [x+0]		{ild}
			 *
			 * into this sequence:
			 *
			 * (004) nop
			 * (005) ldxms   [14]
			 * (006) nop
			 * (007) nop
			 * (008) ild     [x+2]
			 *
			 * XXX We need to check that X is not
			 * subsequently used, because we want to change
			 * what'll be in it after this sequence.
			 *
			 * We know we can eliminate the accumulator
			 * modifications earlier in the sequence since
			 * it is defined by the last stmt of this sequence
			 * (i.e., the last statement of the sequence loads
			 * a value into the accumulator, so we can eliminate
			 * earlier operations on the accumulator).
			 */
			ild->s.k += s->s.k;
			s->s.code = NOP;
			add->s.code = NOP;
			tax->s.code = NOP;
			/*
			 * XXX - optimizer loop detection.
			 */
			opt_state->non_branch_movement_performed = 1;
			opt_state->done = 0;
		}
	}
	/*
	 * If the comparison at the end of a block is an equality
	 * comparison against a constant, and nobody uses the value
	 * we leave in the A register at the end of a block, and
	 * the operation preceding the comparison is an arithmetic
	 * operation, we can sometimes optimize it away.
	 */
	if (b->s.code == (BPF_JMP|BPF_JEQ|BPF_K) &&
	    !ATOMELEM(b->out_use, A_ATOM)) {
		/*
		 * We can optimize away certain subtractions of the
		 * X register.
		 */
		if (last->s.code == (BPF_ALU|BPF_SUB|BPF_X)) {
			val = b->val[X_ATOM];
			if (opt_state->vmap[val].is_const) {
				/*
				 * If we have a subtract to do a comparison,
				 * and the X register is a known constant,
				 * we can merge this value into the
				 * comparison:
				 *
				 * sub x	->	nop
				 * jeq #y		jeq #(x+y)
				 */
				b->s.k += opt_state->vmap[val].const_val;
				last->s.code = NOP;
				/*
				 * XXX - optimizer loop detection.
				 */
				opt_state->non_branch_movement_performed = 1;
				opt_state->done = 0;
			} else if (b->s.k == 0) {
				/*
				 * If the X register isn't a constant,
				 * and the comparison in the test is
				 * against 0, we can compare with the
				 * X register, instead:
				 *
				 * sub x	->	nop
				 * jeq #0		jeq x
				 */
				last->s.code = NOP;
				b->s.code = BPF_JMP|BPF_JEQ|BPF_X;
				/*
				 * XXX - optimizer loop detection.
				 */
				opt_state->non_branch_movement_performed = 1;
				opt_state->done = 0;
			}
		}
		/*
		 * Likewise, a constant subtract can be simplified:
		 *
		 * sub #x	->	nop
		 * jeq #y	->	jeq #(x+y)
		 */
		else if (last->s.code == (BPF_ALU|BPF_SUB|BPF_K)) {
			last->s.code = NOP;
			b->s.k += last->s.k;
			/*
			 * XXX - optimizer loop detection.
			 */
			opt_state->non_branch_movement_performed = 1;
			opt_state->done = 0;
		}
		/*
		 * And, similarly, a constant AND can be simplified
		 * if we're testing against 0, i.e.:
		 *
		 * and #k	nop
		 * jeq #0  ->	jset #k
		 */
		else if (last->s.code == (BPF_ALU|BPF_AND|BPF_K) &&
		    b->s.k == 0) {
			b->s.k = last->s.k;
			b->s.code = BPF_JMP|BPF_K|BPF_JSET;
			last->s.code = NOP;
			/*
			 * XXX - optimizer loop detection.
			 */
			opt_state->non_branch_movement_performed = 1;
			opt_state->done = 0;
			opt_not(b);
		}
	}
	/*
	 * jset #0        ->   never
	 * jset #ffffffff ->   always
	 */
	if (b->s.code == (BPF_JMP|BPF_K|BPF_JSET)) {
		if (b->s.k == 0)
			JT(b) = JF(b);
		if (b->s.k == 0xffffffffU)
			JF(b) = JT(b);
	}
	/*
	 * If we're comparing against the index register, and the index
	 * register is a known constant, we can just compare against that
	 * constant.
	 */
	val = b->val[X_ATOM];
	if (opt_state->vmap[val].is_const && BPF_SRC(b->s.code) == BPF_X) {
		bpf_u_int32 v = opt_state->vmap[val].const_val;
		b->s.code &= ~BPF_X;
		b->s.k = v;
	}
	/*
	 * If the accumulator is a known constant, we can compute the
	 * comparison result.
	 */
	val = b->val[A_ATOM];
	if (opt_state->vmap[val].is_const && BPF_SRC(b->s.code) == BPF_K) {
		bpf_u_int32 v = opt_state->vmap[val].const_val;
		switch (BPF_OP(b->s.code)) {

		case BPF_JEQ:
			v = v == b->s.k;
			break;

		case BPF_JGT:
			v = v > b->s.k;
			break;

		case BPF_JGE:
			v = v >= b->s.k;
			break;

		case BPF_JSET:
			v &= b->s.k;
			break;

		default:
			abort();
		}
		if (JF(b) != JT(b)) {
			/*
			 * XXX - optimizer loop detection.
			 */
			opt_state->non_branch_movement_performed = 1;
			opt_state->done = 0;
		}
		if (v)
			JF(b) = JT(b);
		else
			JT(b) = JF(b);
	}
}
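
/*
 * Illustrative example (added commentary, not part of the original source):
 * the kind of rewrite opt_stmt() below performs when the index register
 * holds a known constant.  Given
 *
 *	ldx #14
 *	ld  [x + 4]		(BPF_LD|BPF_IND)
 *
 * the indirect load is rewritten as the absolute load "ld [18]", and if
 * both operands of an ALU instruction are known constants, fold_op()
 * above replaces the instruction with a "ld #k" of the folded result.
 */
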
/*
 * Compute the symbolic value of the expression of 's', and update
 * anything it defines in the value table 'val'.  If 'alter' is true,
 * do various optimizations.  This code would be cleaner if symbolic
 * evaluation and code transformations weren't folded together.
 */
static void
opt_stmt(opt_state_t *opt_state, struct stmt *s, bpf_u_int32 val[], int alter)
{
	int op;
	bpf_u_int32 v;

	switch (s->code) {

	case BPF_LD|BPF_ABS|BPF_W:
	case BPF_LD|BPF_ABS|BPF_H:
	case BPF_LD|BPF_ABS|BPF_B:
		v = F(opt_state, s->code, s->k, 0L);
		vstore(s, &val[A_ATOM], v, alter);
		break;

	case BPF_LD|BPF_IND|BPF_W:
	case BPF_LD|BPF_IND|BPF_H:
	case BPF_LD|BPF_IND|BPF_B:
		v = val[X_ATOM];
		if (alter && opt_state->vmap[v].is_const) {
			s->code = BPF_LD|BPF_ABS|BPF_SIZE(s->code);
			s->k += opt_state->vmap[v].const_val;
			v = F(opt_state, s->code, s->k, 0L);
			/*
			 * XXX - optimizer loop detection.
			 */
			opt_state->non_branch_movement_performed = 1;
			opt_state->done = 0;
		}
		else
			v = F(opt_state, s->code, s->k, v);
		vstore(s, &val[A_ATOM], v, alter);
		break;

	case BPF_LD|BPF_LEN:
		v = F(opt_state, s->code, 0L, 0L);
		vstore(s, &val[A_ATOM], v, alter);
		break;

	case BPF_LD|BPF_IMM:
		v = K(s->k);
		vstore(s, &val[A_ATOM], v, alter);
		break;

	case BPF_LDX|BPF_IMM:
		v = K(s->k);
		vstore(s, &val[X_ATOM], v, alter);
		break;

	case BPF_LDX|BPF_MSH|BPF_B:
		v = F(opt_state, s->code, s->k, 0L);
		vstore(s, &val[X_ATOM], v, alter);
		break;

	case BPF_ALU|BPF_NEG:
		if (alter && opt_state->vmap[val[A_ATOM]].is_const) {
			s->code = BPF_LD|BPF_IMM;
			/*
			 * Do this negation as unsigned arithmetic; that's
			 * what modern BPF engines do, and it guarantees
			 * that all possible values can be negated.  (Yeah,
			 * negating 0x80000000, the minimum signed 32-bit
			 * two's-complement value, results in 0x80000000,
			 * so it's still negative, but we *should* be doing
			 * all unsigned arithmetic here, to match what
			 * modern BPF engines do.)
			 *
			 * Express it as 0U - (unsigned value) so that we
			 * don't get compiler warnings about negating an
			 * unsigned value and don't get UBSan warnings
			 * about the result of negating 0x80000000 being
			 * undefined.
			 */
			s->k = 0U - opt_state->vmap[val[A_ATOM]].const_val;
			val[A_ATOM] = K(s->k);
		}
		else
			val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], 0L);
		break;

	case BPF_ALU|BPF_ADD|BPF_K:
	case BPF_ALU|BPF_SUB|BPF_K:
	case BPF_ALU|BPF_MUL|BPF_K:
	case BPF_ALU|BPF_DIV|BPF_K:
	case BPF_ALU|BPF_MOD|BPF_K:
	case BPF_ALU|BPF_AND|BPF_K:
	case BPF_ALU|BPF_OR|BPF_K:
	case BPF_ALU|BPF_XOR|BPF_K:
	case BPF_ALU|BPF_LSH|BPF_K:
	case BPF_ALU|BPF_RSH|BPF_K:
		op = BPF_OP(s->code);
		if (alter) {
			if (s->k == 0) {
				/*
				 * Optimize operations where the constant
				 * is zero.
				 *
				 * Don't optimize away "sub #0"
				 * as it may be needed later to
				 * fixup the generated math code.
				 *
				 * Fail if we're dividing by zero or taking
				 * a modulus by zero.
				 */
				if (op == BPF_ADD ||
				    op == BPF_LSH || op == BPF_RSH ||
				    op == BPF_OR || op == BPF_XOR) {
					s->code = NOP;
					break;
				}
				if (op == BPF_MUL || op == BPF_AND) {
					s->code = BPF_LD|BPF_IMM;
					val[A_ATOM] = K(s->k);
					break;
				}
				if (op == BPF_DIV)
					opt_error(opt_state,
					    "division by zero");
				if (op == BPF_MOD)
					opt_error(opt_state,
					    "modulus by zero");
			}
			if (opt_state->vmap[val[A_ATOM]].is_const) {
				fold_op(opt_state, s, val[A_ATOM], K(s->k));
				val[A_ATOM] = K(s->k);
				break;
			}
		}
		val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], K(s->k));
		break;

	case BPF_ALU|BPF_ADD|BPF_X:
	case BPF_ALU|BPF_SUB|BPF_X:
	case BPF_ALU|BPF_MUL|BPF_X:
	case BPF_ALU|BPF_DIV|BPF_X:
	case BPF_ALU|BPF_MOD|BPF_X:
	case BPF_ALU|BPF_AND|BPF_X:
	case BPF_ALU|BPF_OR|BPF_X:
	case BPF_ALU|BPF_XOR|BPF_X:
	case BPF_ALU|BPF_LSH|BPF_X:
	case BPF_ALU|BPF_RSH|BPF_X:
		op = BPF_OP(s->code);
		if (alter && opt_state->vmap[val[X_ATOM]].is_const) {
			if (opt_state->vmap[val[A_ATOM]].is_const) {
				fold_op(opt_state, s, val[A_ATOM], val[X_ATOM]);
				val[A_ATOM] = K(s->k);
			}
			else {
				s->code = BPF_ALU|BPF_K|op;
				s->k = opt_state->vmap[val[X_ATOM]].const_val;
				if ((op == BPF_LSH || op == BPF_RSH) &&
				    s->k > 31)
					opt_error(opt_state,
					    "shift by more than 31 bits");
				/*
				 * XXX - optimizer loop detection.
				 */
				opt_state->non_branch_movement_performed = 1;
				opt_state->done = 0;
				val[A_ATOM] =
					F(opt_state, s->code, val[A_ATOM], K(s->k));
			}
			break;
		}
		/*
		 * Check if we're doing something to an accumulator
		 * that is 0, and simplify.  This may not seem like
		 * much of a simplification but it could open up further
		 * optimizations.
		 * XXX We could also check for mul by 1, etc.
		 */
		if (alter && opt_state->vmap[val[A_ATOM]].is_const
		    && opt_state->vmap[val[A_ATOM]].const_val == 0) {
			if (op == BPF_ADD || op == BPF_OR || op == BPF_XOR) {
				s->code = BPF_MISC|BPF_TXA;
				vstore(s, &val[A_ATOM], val[X_ATOM], alter);
				break;
			}
			else if (op == BPF_MUL || op == BPF_DIV || op == BPF_MOD ||
				 op == BPF_AND || op == BPF_LSH || op == BPF_RSH) {
				s->code = BPF_LD|BPF_IMM;
				s->k = 0;
				vstore(s, &val[A_ATOM], K(s->k), alter);
				break;
			}
			else if (op == BPF_NEG) {
				s->code = NOP;
				break;
			}
		}
		val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], val[X_ATOM]);
		break;

	case BPF_MISC|BPF_TXA:
		vstore(s, &val[A_ATOM], val[X_ATOM], alter);
		break;

	case BPF_LD|BPF_MEM:
		v = val[s->k];
		if (alter && opt_state->vmap[v].is_const) {
			s->code = BPF_LD|BPF_IMM;
			s->k = opt_state->vmap[v].const_val;
			/*
			 * XXX - optimizer loop detection.
			 */
			opt_state->non_branch_movement_performed = 1;
			opt_state->done = 0;
		}
		vstore(s, &val[A_ATOM], v, alter);
		break;

	case BPF_MISC|BPF_TAX:
		vstore(s, &val[X_ATOM], val[A_ATOM], alter);
		break;

	case BPF_LDX|BPF_MEM:
		v = val[s->k];
		if (alter && opt_state->vmap[v].is_const) {
			s->code = BPF_LDX|BPF_IMM;
			s->k = opt_state->vmap[v].const_val;
			/*
			 * XXX - optimizer loop detection.
			 */
			opt_state->non_branch_movement_performed = 1;
			opt_state->done = 0;
		}
		vstore(s, &val[X_ATOM], v, alter);
		break;

	case BPF_ST:
		vstore(s, &val[s->k], val[A_ATOM], alter);
		break;

	case BPF_STX:
		vstore(s, &val[s->k], val[X_ATOM], alter);
		break;
	}
}

static void
deadstmt(opt_state_t *opt_state, register struct stmt *s, register struct stmt *last[])
{
	register int atom;

	atom = atomuse(s);
	if (atom >= 0) {
		if (atom == AX_ATOM) {
			last[X_ATOM] = 0;
			last[A_ATOM] = 0;
		}
		else
			last[atom] = 0;
	}
	atom = atomdef(s);
	if (atom >= 0) {
		if (last[atom]) {
			/*
			 * XXX - optimizer loop detection.
			 */
			opt_state->non_branch_movement_performed = 1;
			opt_state->done = 0;
			last[atom]->code = NOP;
		}
		last[atom] = s;
	}
}

static void
opt_deadstores(opt_state_t *opt_state, register struct block *b)
{
	register struct slist *s;
	register int atom;
	struct stmt *last[N_ATOMS];

	memset((char *)last, 0, sizeof last);

	for (s = b->stmts; s != 0; s = s->next)
		deadstmt(opt_state, &s->s, last);
	deadstmt(opt_state, &b->s, last);

	for (atom = 0; atom < N_ATOMS; ++atom)
		if (last[atom] && !ATOMELEM(b->out_use, atom)) {
			last[atom]->code = NOP;
			/*
			 * The store was removed as it's dead,
			 * so the value stored into now has
			 * an unknown value.
			 */
			vstore(0, &b->val[atom], VAL_UNKNOWN, 0);
			/*
			 * XXX - optimizer loop detection.
			 */
			opt_state->non_branch_movement_performed = 1;
			opt_state->done = 0;
		}
}
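
/*
 * Illustrative example (added commentary, not part of the original source):
 * within a single block, deadstmt() and opt_deadstores() turn
 *
 *	st M[3]			; value of M[3] never read before...
 *	ld [14]
 *	st M[3]			; ...it is overwritten here
 *
 * into "nop / ld [14] / st M[3]", and the remaining store is itself
 * turned into a nop if M[3] is not in b->out_use, i.e. no successor
 * block reads it.
 */
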
static void
opt_blk(opt_state_t *opt_state, struct block *b, int do_stmts)
{
	struct slist *s;
	struct edge *p;
	int i;
	bpf_u_int32 aval, xval;

#if 0
	for (s = b->stmts; s && s->next; s = s->next)
		if (BPF_CLASS(s->s.code) == BPF_JMP) {
			do_stmts = 0;
			break;
		}
#endif

	/*
	 * Initialize the atom values.
	 */
	p = b->in_edges;
	if (p == 0) {
		/*
		 * We have no predecessors, so everything is undefined
		 * upon entry to this block.
		 */
		memset((char *)b->val, 0, sizeof(b->val));
	} else {
		/*
		 * Inherit values from our predecessors.
		 *
		 * First, get the values from the predecessor along the
		 * first edge leading to this node.
		 */
		memcpy((char *)b->val, (char *)p->pred->val, sizeof(b->val));
		/*
		 * Now look at all the other nodes leading to this node.
		 * If, for the predecessor along that edge, a register
		 * has a different value from the one we have (i.e.,
		 * control paths are merging, and the merging paths
		 * assign different values to that register), give the
		 * register the undefined value of 0.
		 */
		while ((p = p->next) != NULL) {
			for (i = 0; i < N_ATOMS; ++i)
				if (b->val[i] != p->pred->val[i])
					b->val[i] = 0;
		}
	}
	aval = b->val[A_ATOM];
	xval = b->val[X_ATOM];
	for (s = b->stmts; s; s = s->next)
		opt_stmt(opt_state, &s->s, b->val, do_stmts);

	/*
	 * This is a special case: if we don't use anything from this
	 * block, and we load the accumulator or index register with a
	 * value that is already there, or if this block is a return,
	 * eliminate all the statements.
	 *
	 * XXX - what if it does a store?  Presumably that falls under
	 * the heading of "if we don't use anything from this block",
	 * i.e., if we use any memory location set to a different
	 * value by this block, then we use something from this block.
	 *
	 * XXX - why does it matter whether we use anything from this
	 * block?  If the accumulator or index register doesn't change
	 * its value, isn't that OK even if we use that value?
	 *
	 * XXX - if we load the accumulator with a different value,
	 * and the block ends with a conditional branch, we obviously
	 * can't eliminate it, as the branch depends on that value.
	 * For the index register, the conditional branch only depends
	 * on the index register value if the test is against the index
	 * register value rather than a constant; if nothing uses the
	 * value we put into the index register, and we're not testing
	 * against the index register's value, and there aren't any
	 * other problems that would keep us from eliminating this
	 * block, can we eliminate it?
	 */
	if (do_stmts &&
	    ((b->out_use == 0 &&
	      aval != VAL_UNKNOWN && b->val[A_ATOM] == aval &&
	      xval != VAL_UNKNOWN && b->val[X_ATOM] == xval) ||
	     BPF_CLASS(b->s.code) == BPF_RET)) {
		if (b->stmts != 0) {
			b->stmts = 0;
			/*
			 * XXX - optimizer loop detection.
			 */
			opt_state->non_branch_movement_performed = 1;
			opt_state->done = 0;
		}
	} else {
		opt_peep(opt_state, b);
		opt_deadstores(opt_state, b);
	}
	/*
	 * Set up values for branch optimizer.
	 */
	if (BPF_SRC(b->s.code) == BPF_K)
		b->oval = K(b->s.k);
	else
		b->oval = b->val[X_ATOM];
	b->et.code = b->s.code;
	b->ef.code = -b->s.code;
}

/*
 * Return true if any register that is used on exit from 'succ' has
 * an exit value that is different from the corresponding exit value
 * from 'b'.
 */
static int
use_conflict(struct block *b, struct block *succ)
{
	int atom;
	atomset use = succ->out_use;

	if (use == 0)
		return 0;

	for (atom = 0; atom < N_ATOMS; ++atom)
		if (ATOMELEM(use, atom))
			if (b->val[atom] != succ->val[atom])
				return 1;
	return 0;
}

/*
 * Given a block that is the successor of an edge, and an edge that
 * dominates that edge, return either a pointer to a child of that
 * block (a block to which that block jumps) if that block is a
 * candidate to replace the successor of the latter edge or NULL
 * if neither of the children of the first block are candidates.
 */
static struct block *
fold_edge(struct block *child, struct edge *ep)
{
	int sense;
	bpf_u_int32 aval0, aval1, oval0, oval1;
	int code = ep->code;

	if (code < 0) {
		/*
		 * This edge is a "branch if false" edge.
		 */
		code = -code;
		sense = 0;
	} else {
		/*
		 * This edge is a "branch if true" edge.
		 */
		sense = 1;
	}

	/*
	 * If the opcode for the branch at the end of the block we
	 * were handed isn't the same as the opcode for the branch
	 * to which the edge we were handed corresponds, the tests
	 * for those branches aren't testing the same conditions,
	 * so the blocks to which the first block branches aren't
	 * candidates to replace the successor of the edge.
	 */
	if (child->s.code != code)
		return 0;

	aval0 = child->val[A_ATOM];
	oval0 = child->oval;
	aval1 = ep->pred->val[A_ATOM];
	oval1 = ep->pred->oval;

	/*
	 * If the A register value on exit from the successor block
	 * isn't the same as the A register value on exit from the
	 * predecessor of the edge, the blocks to which the first
	 * block branches aren't candidates to replace the successor
	 * of the edge.
	 */
	if (aval0 != aval1)
		return 0;

	if (oval0 == oval1)
		/*
		 * The operands of the branch instructions are
		 * identical, so the branches are testing the
		 * same condition, and the result is true if a true
		 * branch was taken to get here, otherwise false.
		 */
		return sense ? JT(child) : JF(child);

	if (sense && code == (BPF_JMP|BPF_JEQ|BPF_K))
		/*
		 * At this point, we only know the comparison if we
		 * came down the true branch, and it was an equality
		 * comparison with a constant.
		 *
		 * I.e., if we came down the true branch, and the branch
		 * was an equality comparison with a constant, we know the
		 * accumulator contains that constant.  If we came down
		 * the false branch, or the comparison wasn't with a
		 * constant, we don't know what was in the accumulator.
		 *
		 * We rely on the fact that distinct constants have distinct
		 * value numbers.
		 */
		return JF(child);

	return 0;
}
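
/*
 * Illustrative example (added commentary, not part of the original source):
 * suppose an edge under optimization leads to a block C ending in
 * "jeq #0x800", and that edge is dominated by the true edge of an earlier
 * block P that performs the same "jeq #0x800" test on the same accumulator
 * value.  fold_edge() then reports JT(C) as a candidate, and opt_j() below
 * retargets the edge directly at JT(C), provided use_conflict() finds no
 * data dependency, since C's test must also succeed on that path.  The
 * converse redirection to JF(C) happens when the dominating edge is the
 * false branch.
 */
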
/*
 * If we can make this edge go directly to a child of the edge's current
 * successor, do so.
 */
static void
opt_j(opt_state_t *opt_state, struct edge *ep)
{
	register u_int i, k;
	register struct block *target;

	/*
	 * Does this edge go to a block where, if the test
	 * at the end of it succeeds, it goes to a block
	 * that's a leaf node of the DAG, i.e. a return
	 * statement?
	 * If so, there's nothing to optimize.
	 */
	if (JT(ep->succ) == 0)
		return;

	/*
	 * Does this edge go to a block that goes, in turn, to
	 * the same block regardless of whether the test at the
	 * end succeeds or fails?
	 */
	if (JT(ep->succ) == JF(ep->succ)) {
		/*
		 * Common branch targets can be eliminated, provided
		 * there is no data dependency.
		 *
		 * Check whether any register used on exit from the
		 * block to which the successor of this edge goes
		 * has a value at that point that's different from
		 * the value it has on exit from the predecessor of
		 * this edge.  If not, the predecessor of this edge
		 * can just go to the block to which the successor
		 * of this edge goes, bypassing the successor of this
		 * edge, as the successor of this edge isn't doing
		 * any calculations whose results are different
		 * from what the blocks before it did and isn't
		 * doing any tests the results of which matter.
		 */
		if (!use_conflict(ep->pred, JT(ep->succ))) {
			/*
			 * No, there isn't.
			 * Make this edge go to the block to
			 * which the successor of that edge
			 * goes.
			 *
			 * XXX - optimizer loop detection.
			 */
			opt_state->non_branch_movement_performed = 1;
			opt_state->done = 0;
			ep->succ = JT(ep->succ);
		}
	}
	/*
	 * For each edge dominator that matches the successor of this
	 * edge, promote the edge successor to its grandchild.
	 *
	 * XXX We violate the set abstraction here in favor of a
	 * reasonably efficient loop.
	 */
 top:
	for (i = 0; i < opt_state->edgewords; ++i) {
		/* i'th word in the bitset of dominators */
		register bpf_u_int32 x = ep->edom[i];

		while (x != 0) {
			/* Find the next dominator in that word and mark it as found */
			k = lowest_set_bit(x);
			x &=~ ((bpf_u_int32)1 << k);
			k += i * BITS_PER_WORD;

			target = fold_edge(ep->succ, opt_state->edges[k]);
			/*
			 * We have a candidate to replace the successor
			 * of ep.
			 *
			 * Check that there is no data dependency between
			 * nodes that will be violated if we move the edge;
			 * i.e., if any register used on exit from the
			 * candidate has a value at that point different
			 * from the value it has when we exit the
			 * predecessor of that edge, there's a data
			 * dependency that will be violated.
			 */
			if (target != 0 && !use_conflict(ep->pred, target)) {
				/*
				 * It's safe to replace the successor of
				 * ep; do so, and note that we've made
				 * at least one change.
				 *
				 * XXX - this is one of the operations that
				 * happens when the optimizer gets into
				 * one of those infinite loops.
				 */
				opt_state->done = 0;
				ep->succ = target;
				if (JT(target) != 0)
					/*
					 * Start over unless we hit a leaf.
					 */
					goto top;
				return;
			}
		}
	}
}

/*
 * XXX - is this, and and_pullup(), what's described in section 6.1.2
 * "Predicate Assertion Propagation" in the BPF+ paper?
 *
 * Note that this looks at block dominators, not edge dominators.
 * Don't think so.
 *
 * "A or B" compiles into
 *
 *          A
 *       t / \ f
 *        /   B
 *       / t / \ f
 *      \   /
 *       \ /
 *        X
 *
 *
 */
static void
or_pullup(opt_state_t *opt_state, struct block *b, struct block *root)
{
	bpf_u_int32 val;
	int at_top;
	struct block *pull;
	struct block **diffp, **samep;
	struct edge *ep;

	ep = b->in_edges;
	if (ep == 0)
		return;

	/*
	 * Make sure each predecessor loads the same value.
	 * XXX why?
	 */
	val = ep->pred->val[A_ATOM];
	for (ep = ep->next; ep != 0; ep = ep->next)
		if (val != ep->pred->val[A_ATOM])
			return;

	/*
	 * For the first edge in the list of edges coming into this block,
	 * see whether the predecessor of that edge comes here via a true
	 * branch or a false branch.
	 */
	if (JT(b->in_edges->pred) == b)
		diffp = &JT(b->in_edges->pred);	/* jt */
	else
		diffp = &JF(b->in_edges->pred);	/* jf */

	/*
	 * diffp is a pointer to a pointer to the block.
	 *
	 * Go down the false chain looking as far as you can,
	 * making sure that each jump-compare is doing the
	 * same as the original block.
	 *
	 * If you reach the bottom before you reach a
	 * different jump-compare, just exit.  There's nothing
	 * to do here.  XXX - no, this version is checking for
	 * the value leaving the block; that's from the BPF+
	 * pullup routine.
	 */
	at_top = 1;
	for (;;) {
		/*
		 * Done if that's not going anywhere XXX
		 */
		if (*diffp == 0)
			return;

		/*
		 * Done if that predecessor blah blah blah isn't
		 * going the same place we're going XXX
		 *
		 * Does the true edge of this block point to the same
		 * location as the true edge of b?
		 */
		if (JT(*diffp) != JT(b))
			return;

		/*
		 * Done if this node isn't a dominator of that
		 * node blah blah blah XXX
		 *
		 * Does b dominate diffp?
		 */
		if (!SET_MEMBER((*diffp)->dom, b->id))
			return;

		/*
		 * Break out of the loop if that node's value of A
		 * isn't the value of A above XXX
		 */
		if ((*diffp)->val[A_ATOM] != val)
			break;

		/*
		 * Get the JF for that node XXX
		 * Go down the false path.
		 */
		diffp = &JF(*diffp);
		at_top = 0;
	}

	/*
	 * Now that we've found a different jump-compare in a chain
	 * below b, search further down until we find another
	 * jump-compare that looks at the original value.  This
	 * jump-compare should get pulled up.  XXX again we're
	 * comparing values not jump-compares.
	 */
	samep = &JF(*diffp);
	for (;;) {
		/*
		 * Done if that's not going anywhere XXX
		 */
		if (*samep == 0)
			return;

		/*
		 * Done if that predecessor blah blah blah isn't
		 * going the same place we're going XXX
		 */
		if (JT(*samep) != JT(b))
			return;

		/*
		 * Done if this node isn't a dominator of that
		 * node blah blah blah XXX
		 *
		 * Does b dominate samep?
		 */
		if (!SET_MEMBER((*samep)->dom, b->id))
			return;

		/*
		 * Break out of the loop if that node's value of A
		 * is the value of A above XXX
		 */
		if ((*samep)->val[A_ATOM] == val)
			break;

		/* XXX Need to check that there are no data dependencies
		   between dp0 and dp1.  Currently, the code generator
		   will not produce such dependencies. */
		samep = &JF(*samep);
	}
#ifdef notdef
	/* XXX This doesn't cover everything. */
	for (i = 0; i < N_ATOMS; ++i)
		if ((*samep)->val[i] != pred->val[i])
			return;
#endif
	/* Pull up the node. */
	pull = *samep;
	*samep = JF(pull);
	JF(pull) = *diffp;

	/*
	 * At the top of the chain, each predecessor needs to point at the
	 * pulled up node.  Inside the chain, there is only one predecessor
	 * to worry about.
	 */
	if (at_top) {
		for (ep = b->in_edges; ep != 0; ep = ep->next) {
			if (JT(ep->pred) == b)
				JT(ep->pred) = pull;
			else
				JF(ep->pred) = pull;
		}
	}
	else
		*diffp = pull;

	/*
	 * XXX - this is one of the operations that happens when the
	 * optimizer gets into one of those infinite loops.
	 */
	opt_state->done = 0;

	/*
	 * Recompute dominator sets as control flow graph has changed.
	 */
	find_dom(opt_state, root);
}

static void
and_pullup(opt_state_t *opt_state, struct block *b, struct block *root)
{
	bpf_u_int32 val;
	int at_top;
	struct block *pull;
	struct block **diffp, **samep;
	struct edge *ep;

	ep = b->in_edges;
	if (ep == 0)
		return;

	/*
	 * Make sure each predecessor loads the same value.
	 */
	val = ep->pred->val[A_ATOM];
	for (ep = ep->next; ep != 0; ep = ep->next)
		if (val != ep->pred->val[A_ATOM])
			return;

	if (JT(b->in_edges->pred) == b)
		diffp = &JT(b->in_edges->pred);
	else
		diffp = &JF(b->in_edges->pred);

	at_top = 1;
	for (;;) {
		if (*diffp == 0)
			return;

		if (JF(*diffp) != JF(b))
			return;

		if (!SET_MEMBER((*diffp)->dom, b->id))
			return;

		if ((*diffp)->val[A_ATOM] != val)
			break;

		diffp = &JT(*diffp);
		at_top = 0;
	}
	samep = &JT(*diffp);
	for (;;) {
		if (*samep == 0)
			return;

		if (JF(*samep) != JF(b))
			return;

		if (!SET_MEMBER((*samep)->dom, b->id))
			return;

		if ((*samep)->val[A_ATOM] == val)
			break;

		/* XXX Need to check that there are no data dependencies
		   between diffp and samep.  Currently, the code generator
		   will not produce such dependencies. */
		samep = &JT(*samep);
	}
#ifdef notdef
	/* XXX This doesn't cover everything. */
	for (i = 0; i < N_ATOMS; ++i)
		if ((*samep)->val[i] != pred->val[i])
			return;
#endif
	/* Pull up the node. */
	pull = *samep;
	*samep = JT(pull);
	JT(pull) = *diffp;

	/*
	 * At the top of the chain, each predecessor needs to point at the
	 * pulled up node.  Inside the chain, there is only one predecessor
	 * to worry about.
	 */
	if (at_top) {
		for (ep = b->in_edges; ep != 0; ep = ep->next) {
			if (JT(ep->pred) == b)
				JT(ep->pred) = pull;
			else
				JF(ep->pred) = pull;
		}
	}
	else
		*diffp = pull;

	/*
	 * XXX - this is one of the operations that happens when the
	 * optimizer gets into one of those infinite loops.
	 */
	opt_state->done = 0;

	/*
	 * Recompute dominator sets as control flow graph has changed.
	 */
	find_dom(opt_state, root);
}

static void
opt_blks(opt_state_t *opt_state, struct icode *ic, int do_stmts)
{
	int i, maxlevel;
	struct block *p;

	init_val(opt_state);
	maxlevel = ic->root->level;

	find_inedges(opt_state, ic->root);
	for (i = maxlevel; i >= 0; --i)
		for (p = opt_state->levels[i]; p; p = p->link)
			opt_blk(opt_state, p, do_stmts);

	if (do_stmts)
		/*
		 * No point trying to move branches; it can't possibly
		 * make a difference at this point.
		 *
		 * XXX - this might be after we detect a loop where
		 * we were just looping infinitely moving branches
		 * in such a fashion that we went through two or more
		 * versions of the machine code, eventually returning
		 * to the first version.  (We're really not doing a
		 * full loop detection, we're just testing for two
		 * passes in a row where we do nothing but
		 * move branches.)
		 */
		return;

	/*
	 * Is this what the BPF+ paper describes in sections 6.1.1,
	 * 6.1.2, and 6.1.3?
2129 */ 2130 for (i = 1; i <= maxlevel; ++i) { 2131 for (p = opt_state->levels[i]; p; p = p->link) { 2132 opt_j(opt_state, &p->et); 2133 opt_j(opt_state, &p->ef); 2134 } 2135 } 2136 2137 find_inedges(opt_state, ic->root); 2138 for (i = 1; i <= maxlevel; ++i) { 2139 for (p = opt_state->levels[i]; p; p = p->link) { 2140 or_pullup(opt_state, p, ic->root); 2141 and_pullup(opt_state, p, ic->root); 2142 } 2143 } 2144 } 2145 2146 static inline void 2147 link_inedge(struct edge *parent, struct block *child) 2148 { 2149 parent->next = child->in_edges; 2150 child->in_edges = parent; 2151 } 2152 2153 static void 2154 find_inedges(opt_state_t *opt_state, struct block *root) 2155 { 2156 u_int i; 2157 int level; 2158 struct block *b; 2159 2160 for (i = 0; i < opt_state->n_blocks; ++i) 2161 opt_state->blocks[i]->in_edges = 0; 2162 2163 /* 2164 * Traverse the graph, adding each edge to the predecessor 2165 * list of its successors. Skip the leaves (i.e. level 0). 2166 */ 2167 for (level = root->level; level > 0; --level) { 2168 for (b = opt_state->levels[level]; b != 0; b = b->link) { 2169 link_inedge(&b->et, JT(b)); 2170 link_inedge(&b->ef, JF(b)); 2171 } 2172 } 2173 } 2174 2175 static void 2176 opt_root(struct block **b) 2177 { 2178 struct slist *tmp, *s; 2179 2180 s = (*b)->stmts; 2181 (*b)->stmts = 0; 2182 while (BPF_CLASS((*b)->s.code) == BPF_JMP && JT(*b) == JF(*b)) 2183 *b = JT(*b); 2184 2185 tmp = (*b)->stmts; 2186 if (tmp != 0) 2187 sappend(s, tmp); 2188 (*b)->stmts = s; 2189 2190 /* 2191 * If the root node is a return, then there is no 2192 * point executing any statements (since the bpf machine 2193 * has no side effects). 2194 */ 2195 if (BPF_CLASS((*b)->s.code) == BPF_RET) 2196 (*b)->stmts = 0; 2197 } 2198 2199 static void 2200 opt_loop(opt_state_t *opt_state, struct icode *ic, int do_stmts) 2201 { 2202 2203 #ifdef BDEBUG 2204 if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) { 2205 printf("opt_loop(root, %d) begin\n", do_stmts); 2206 opt_dump(opt_state, ic); 2207 } 2208 #endif 2209 2210 /* 2211 * XXX - optimizer loop detection. 2212 */ 2213 int loop_count = 0; 2214 for (;;) { 2215 opt_state->done = 1; 2216 /* 2217 * XXX - optimizer loop detection. 2218 */ 2219 opt_state->non_branch_movement_performed = 0; 2220 find_levels(opt_state, ic); 2221 find_dom(opt_state, ic->root); 2222 find_closure(opt_state, ic->root); 2223 find_ud(opt_state, ic->root); 2224 find_edom(opt_state, ic->root); 2225 opt_blks(opt_state, ic, do_stmts); 2226 #ifdef BDEBUG 2227 if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) { 2228 printf("opt_loop(root, %d) bottom, done=%d\n", do_stmts, opt_state->done); 2229 opt_dump(opt_state, ic); 2230 } 2231 #endif 2232 2233 /* 2234 * Was anything done in this optimizer pass? 2235 */ 2236 if (opt_state->done) { 2237 /* 2238 * No, so we've reached a fixed point. 2239 * We're done. 2240 */ 2241 break; 2242 } 2243 2244 /* 2245 * XXX - was anything done other than branch movement 2246 * in this pass? 2247 */ 2248 if (opt_state->non_branch_movement_performed) { 2249 /* 2250 * Yes. Clear any loop-detection counter; 2251 * we're making some form of progress (assuming 2252 * we can't get into a cycle doing *other* 2253 * optimizations...). 2254 */ 2255 loop_count = 0; 2256 } else { 2257 /* 2258 * No - increment the counter, and quit if 2259 * it's up to 100. 2260 */ 2261 loop_count++; 2262 if (loop_count >= 100) { 2263 /* 2264 * We've done nothing but branch movement 2265 * for 100 passes; we're probably 2266 * in a cycle and will never reach a 2267 * fixed point. 
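 * Declare the optimization finished and stop,
 * keeping whatever improvements the earlier
 * passes made.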
2268 * 2269 * XXX - yes, we really need a non- 2270 * heuristic way of detecting a cycle. 2271 */ 2272 opt_state->done = 1; 2273 break; 2274 } 2275 } 2276 } 2277 } 2278 2279 /* 2280 * Optimize the filter code in its dag representation. 2281 * Return 0 on success, -1 on error. 2282 */ 2283 int 2284 bpf_optimize(struct icode *ic, char *errbuf) 2285 { 2286 opt_state_t opt_state; 2287 2288 memset(&opt_state, 0, sizeof(opt_state)); 2289 opt_state.errbuf = errbuf; 2290 opt_state.non_branch_movement_performed = 0; 2291 if (setjmp(opt_state.top_ctx)) { 2292 opt_cleanup(&opt_state); 2293 return -1; 2294 } 2295 opt_init(&opt_state, ic); 2296 opt_loop(&opt_state, ic, 0); 2297 opt_loop(&opt_state, ic, 1); 2298 intern_blocks(&opt_state, ic); 2299 #ifdef BDEBUG 2300 if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) { 2301 printf("after intern_blocks()\n"); 2302 opt_dump(&opt_state, ic); 2303 } 2304 #endif 2305 opt_root(&ic->root); 2306 #ifdef BDEBUG 2307 if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) { 2308 printf("after opt_root()\n"); 2309 opt_dump(&opt_state, ic); 2310 } 2311 #endif 2312 opt_cleanup(&opt_state); 2313 return 0; 2314 } 2315 2316 static void 2317 make_marks(struct icode *ic, struct block *p) 2318 { 2319 if (!isMarked(ic, p)) { 2320 Mark(ic, p); 2321 if (BPF_CLASS(p->s.code) != BPF_RET) { 2322 make_marks(ic, JT(p)); 2323 make_marks(ic, JF(p)); 2324 } 2325 } 2326 } 2327 2328 /* 2329 * Mark code array such that isMarked(ic->cur_mark, i) is true 2330 * only for nodes that are alive. 2331 */ 2332 static void 2333 mark_code(struct icode *ic) 2334 { 2335 ic->cur_mark += 1; 2336 make_marks(ic, ic->root); 2337 } 2338 2339 /* 2340 * True iff the two stmt lists load the same value from the packet into 2341 * the accumulator. 2342 */ 2343 static int 2344 eq_slist(struct slist *x, struct slist *y) 2345 { 2346 for (;;) { 2347 while (x && x->s.code == NOP) 2348 x = x->next; 2349 while (y && y->s.code == NOP) 2350 y = y->next; 2351 if (x == 0) 2352 return y == 0; 2353 if (y == 0) 2354 return x == 0; 2355 if (x->s.code != y->s.code || x->s.k != y->s.k) 2356 return 0; 2357 x = x->next; 2358 y = y->next; 2359 } 2360 } 2361 2362 static inline int 2363 eq_blk(struct block *b0, struct block *b1) 2364 { 2365 if (b0->s.code == b1->s.code && 2366 b0->s.k == b1->s.k && 2367 b0->et.succ == b1->et.succ && 2368 b0->ef.succ == b1->ef.succ) 2369 return eq_slist(b0->stmts, b1->stmts); 2370 return 0; 2371 } 2372 2373 static void 2374 intern_blocks(opt_state_t *opt_state, struct icode *ic) 2375 { 2376 struct block *p; 2377 u_int i, j; 2378 int done1; /* don't shadow global */ 2379 top: 2380 done1 = 1; 2381 for (i = 0; i < opt_state->n_blocks; ++i) 2382 opt_state->blocks[i]->link = 0; 2383 2384 mark_code(ic); 2385 2386 for (i = opt_state->n_blocks - 1; i != 0; ) { 2387 --i; 2388 if (!isMarked(ic, opt_state->blocks[i])) 2389 continue; 2390 for (j = i + 1; j < opt_state->n_blocks; ++j) { 2391 if (!isMarked(ic, opt_state->blocks[j])) 2392 continue; 2393 if (eq_blk(opt_state->blocks[i], opt_state->blocks[j])) { 2394 opt_state->blocks[i]->link = opt_state->blocks[j]->link ? 
2395 opt_state->blocks[j]->link : opt_state->blocks[j]; 2396 break; 2397 } 2398 } 2399 } 2400 for (i = 0; i < opt_state->n_blocks; ++i) { 2401 p = opt_state->blocks[i]; 2402 if (JT(p) == 0) 2403 continue; 2404 if (JT(p)->link) { 2405 done1 = 0; 2406 JT(p) = JT(p)->link; 2407 } 2408 if (JF(p)->link) { 2409 done1 = 0; 2410 JF(p) = JF(p)->link; 2411 } 2412 } 2413 if (!done1) 2414 goto top; 2415 } 2416 2417 static void 2418 opt_cleanup(opt_state_t *opt_state) 2419 { 2420 free((void *)opt_state->vnode_base); 2421 free((void *)opt_state->vmap); 2422 free((void *)opt_state->edges); 2423 free((void *)opt_state->space); 2424 free((void *)opt_state->levels); 2425 free((void *)opt_state->blocks); 2426 } 2427 2428 /* 2429 * For optimizer errors. 2430 */ 2431 static void PCAP_NORETURN 2432 opt_error(opt_state_t *opt_state, const char *fmt, ...) 2433 { 2434 va_list ap; 2435 2436 if (opt_state->errbuf != NULL) { 2437 va_start(ap, fmt); 2438 (void)vsnprintf(opt_state->errbuf, 2439 PCAP_ERRBUF_SIZE, fmt, ap); 2440 va_end(ap); 2441 } 2442 longjmp(opt_state->top_ctx, 1); 2443 /* NOTREACHED */ 2444 #ifdef _AIX 2445 PCAP_UNREACHABLE 2446 #endif /* _AIX */ 2447 } 2448 2449 /* 2450 * Return the number of stmts in 's'. 2451 */ 2452 static u_int 2453 slength(struct slist *s) 2454 { 2455 u_int n = 0; 2456 2457 for (; s; s = s->next) 2458 if (s->s.code != NOP) 2459 ++n; 2460 return n; 2461 } 2462 2463 /* 2464 * Return the number of nodes reachable by 'p'. 2465 * All nodes should be initially unmarked. 2466 */ 2467 static int 2468 count_blocks(struct icode *ic, struct block *p) 2469 { 2470 if (p == 0 || isMarked(ic, p)) 2471 return 0; 2472 Mark(ic, p); 2473 return count_blocks(ic, JT(p)) + count_blocks(ic, JF(p)) + 1; 2474 } 2475 2476 /* 2477 * Do a depth first search on the flow graph, numbering the 2478 * the basic blocks, and entering them into the 'blocks' array.` 2479 */ 2480 static void 2481 number_blks_r(opt_state_t *opt_state, struct icode *ic, struct block *p) 2482 { 2483 u_int n; 2484 2485 if (p == 0 || isMarked(ic, p)) 2486 return; 2487 2488 Mark(ic, p); 2489 n = opt_state->n_blocks++; 2490 if (opt_state->n_blocks == 0) { 2491 /* 2492 * Overflow. 2493 */ 2494 opt_error(opt_state, "filter is too complex to optimize"); 2495 } 2496 p->id = n; 2497 opt_state->blocks[n] = p; 2498 2499 number_blks_r(opt_state, ic, JT(p)); 2500 number_blks_r(opt_state, ic, JF(p)); 2501 } 2502 2503 /* 2504 * Return the number of stmts in the flowgraph reachable by 'p'. 2505 * The nodes should be unmarked before calling. 2506 * 2507 * Note that "stmts" means "instructions", and that this includes 2508 * 2509 * side-effect statements in 'p' (slength(p->stmts)); 2510 * 2511 * statements in the true branch from 'p' (count_stmts(JT(p))); 2512 * 2513 * statements in the false branch from 'p' (count_stmts(JF(p))); 2514 * 2515 * the conditional jump itself (1); 2516 * 2517 * an extra long jump if the true branch requires it (p->longjt); 2518 * 2519 * an extra long jump if the false branch requires it (p->longjf). 2520 */ 2521 static u_int 2522 count_stmts(struct icode *ic, struct block *p) 2523 { 2524 u_int n; 2525 2526 if (p == 0 || isMarked(ic, p)) 2527 return 0; 2528 Mark(ic, p); 2529 n = count_stmts(ic, JT(p)) + count_stmts(ic, JF(p)); 2530 return slength(p->stmts) + n + 1 + p->longjt + p->longjf; 2531 } 2532 2533 /* 2534 * Allocate memory. All allocation is done before optimization 2535 * is begun. A linear bound on the size of all data structures is computed 2536 * from the total number of blocks and/or statements. 
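 *
 * Roughly, the single "space" allocation made below is carved up as
 * follows (a sketch based on the sizes computed in this function):
 *
 *	dominator sets:      n_blocks sets of nodewords words each
 *	closure sets:        n_blocks sets of nodewords words each
 *	edge-dominator sets: n_edges sets of edgewords words each
 *	                     (two per block, one per outgoing edge)
 *
 * The value map and the value nodes are allocated separately, sized
 * from the total statement count (at most 3 values per statement).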
2537 */ 2538 static void 2539 opt_init(opt_state_t *opt_state, struct icode *ic) 2540 { 2541 bpf_u_int32 *p; 2542 int i, n, max_stmts; 2543 u_int product; 2544 size_t block_memsize, edge_memsize; 2545 2546 /* 2547 * First, count the blocks, so we can malloc an array to map 2548 * block number to block. Then, put the blocks into the array. 2549 */ 2550 unMarkAll(ic); 2551 n = count_blocks(ic, ic->root); 2552 opt_state->blocks = (struct block **)calloc(n, sizeof(*opt_state->blocks)); 2553 if (opt_state->blocks == NULL) 2554 opt_error(opt_state, "malloc"); 2555 unMarkAll(ic); 2556 opt_state->n_blocks = 0; 2557 number_blks_r(opt_state, ic, ic->root); 2558 2559 /* 2560 * This "should not happen". 2561 */ 2562 if (opt_state->n_blocks == 0) 2563 opt_error(opt_state, "filter has no instructions; please report this as a libpcap issue"); 2564 2565 opt_state->n_edges = 2 * opt_state->n_blocks; 2566 if ((opt_state->n_edges / 2) != opt_state->n_blocks) { 2567 /* 2568 * Overflow. 2569 */ 2570 opt_error(opt_state, "filter is too complex to optimize"); 2571 } 2572 opt_state->edges = (struct edge **)calloc(opt_state->n_edges, sizeof(*opt_state->edges)); 2573 if (opt_state->edges == NULL) { 2574 opt_error(opt_state, "malloc"); 2575 } 2576 2577 /* 2578 * The number of levels is bounded by the number of nodes. 2579 */ 2580 opt_state->levels = (struct block **)calloc(opt_state->n_blocks, sizeof(*opt_state->levels)); 2581 if (opt_state->levels == NULL) { 2582 opt_error(opt_state, "malloc"); 2583 } 2584 2585 opt_state->edgewords = opt_state->n_edges / BITS_PER_WORD + 1; 2586 opt_state->nodewords = opt_state->n_blocks / BITS_PER_WORD + 1; 2587 2588 /* 2589 * Make sure opt_state->n_blocks * opt_state->nodewords fits 2590 * in a u_int; we use it as a u_int number-of-iterations 2591 * value. 2592 */ 2593 product = opt_state->n_blocks * opt_state->nodewords; 2594 if ((product / opt_state->n_blocks) != opt_state->nodewords) { 2595 /* 2596 * XXX - just punt and don't try to optimize? 2597 * In practice, this is unlikely to happen with 2598 * a normal filter. 2599 */ 2600 opt_error(opt_state, "filter is too complex to optimize"); 2601 } 2602 2603 /* 2604 * Make sure the total memory required for that doesn't 2605 * overflow. 2606 */ 2607 block_memsize = (size_t)2 * product * sizeof(*opt_state->space); 2608 if ((block_memsize / product) != 2 * sizeof(*opt_state->space)) { 2609 opt_error(opt_state, "filter is too complex to optimize"); 2610 } 2611 2612 /* 2613 * Make sure opt_state->n_edges * opt_state->edgewords fits 2614 * in a u_int; we use it as a u_int number-of-iterations 2615 * value. 2616 */ 2617 product = opt_state->n_edges * opt_state->edgewords; 2618 if ((product / opt_state->n_edges) != opt_state->edgewords) { 2619 opt_error(opt_state, "filter is too complex to optimize"); 2620 } 2621 2622 /* 2623 * Make sure the total memory required for that doesn't 2624 * overflow. 2625 */ 2626 edge_memsize = (size_t)product * sizeof(*opt_state->space); 2627 if (edge_memsize / product != sizeof(*opt_state->space)) { 2628 opt_error(opt_state, "filter is too complex to optimize"); 2629 } 2630 2631 /* 2632 * Make sure the total memory required for both of them doesn't 2633 * overflow. 
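 *
 * (The multiplication checks above all use the same idiom: compute
 * the product, then divide it back by one factor; if the quotient
 * isn't the other factor, the multiplication wrapped around.  A
 * minimal sketch of the pattern, with hypothetical factors a and b
 * and a hypothetical error() handler:
 *
 *	product = a * b;
 *	if (product / a != b)
 *		error("overflow");
 *
 * The addition below is instead checked against SIZE_MAX directly.)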
2634 */ 2635 if (block_memsize > SIZE_MAX - edge_memsize) { 2636 opt_error(opt_state, "filter is too complex to optimize"); 2637 } 2638 2639 /* XXX */ 2640 opt_state->space = (bpf_u_int32 *)malloc(block_memsize + edge_memsize); 2641 if (opt_state->space == NULL) { 2642 opt_error(opt_state, "malloc"); 2643 } 2644 p = opt_state->space; 2645 opt_state->all_dom_sets = p; 2646 for (i = 0; i < n; ++i) { 2647 opt_state->blocks[i]->dom = p; 2648 p += opt_state->nodewords; 2649 } 2650 opt_state->all_closure_sets = p; 2651 for (i = 0; i < n; ++i) { 2652 opt_state->blocks[i]->closure = p; 2653 p += opt_state->nodewords; 2654 } 2655 opt_state->all_edge_sets = p; 2656 for (i = 0; i < n; ++i) { 2657 register struct block *b = opt_state->blocks[i]; 2658 2659 b->et.edom = p; 2660 p += opt_state->edgewords; 2661 b->ef.edom = p; 2662 p += opt_state->edgewords; 2663 b->et.id = i; 2664 opt_state->edges[i] = &b->et; 2665 b->ef.id = opt_state->n_blocks + i; 2666 opt_state->edges[opt_state->n_blocks + i] = &b->ef; 2667 b->et.pred = b; 2668 b->ef.pred = b; 2669 } 2670 max_stmts = 0; 2671 for (i = 0; i < n; ++i) 2672 max_stmts += slength(opt_state->blocks[i]->stmts) + 1; 2673 /* 2674 * We allocate at most 3 value numbers per statement, 2675 * so this is an upper bound on the number of valnodes 2676 * we'll need. 2677 */ 2678 opt_state->maxval = 3 * max_stmts; 2679 opt_state->vmap = (struct vmapinfo *)calloc(opt_state->maxval, sizeof(*opt_state->vmap)); 2680 if (opt_state->vmap == NULL) { 2681 opt_error(opt_state, "malloc"); 2682 } 2683 opt_state->vnode_base = (struct valnode *)calloc(opt_state->maxval, sizeof(*opt_state->vnode_base)); 2684 if (opt_state->vnode_base == NULL) { 2685 opt_error(opt_state, "malloc"); 2686 } 2687 } 2688 2689 /* 2690 * This is only used when supporting optimizer debugging. It is 2691 * global state, so do *not* do more than one compile in parallel 2692 * and expect it to provide meaningful information. 2693 */ 2694 #ifdef BDEBUG 2695 int bids[NBIDS]; 2696 #endif 2697 2698 static void PCAP_NORETURN conv_error(conv_state_t *, const char *, ...) 2699 PCAP_PRINTFLIKE(2, 3); 2700 2701 /* 2702 * Returns true if successful. Returns false if a branch has 2703 * an offset that is too large. If so, we have marked that 2704 * branch so that on a subsequent iteration, it will be treated 2705 * properly. 
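 *
 * "Treated properly" means the branch gets an extra unconditional
 * jump on the next pass (see the longjt/longjf handling below);
 * the caller, icode_to_fcode(), just frees the partial result and
 * calls this routine again until it succeeds.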
2706 */ 2707 static int 2708 convert_code_r(conv_state_t *conv_state, struct icode *ic, struct block *p) 2709 { 2710 struct bpf_insn *dst; 2711 struct slist *src; 2712 u_int slen; 2713 u_int off; 2714 struct slist **offset = NULL; 2715 2716 if (p == 0 || isMarked(ic, p)) 2717 return (1); 2718 Mark(ic, p); 2719 2720 if (convert_code_r(conv_state, ic, JF(p)) == 0) 2721 return (0); 2722 if (convert_code_r(conv_state, ic, JT(p)) == 0) 2723 return (0); 2724 2725 slen = slength(p->stmts); 2726 dst = conv_state->ftail -= (slen + 1 + p->longjt + p->longjf); 2727 /* inflate length by any extra jumps */ 2728 2729 p->offset = (int)(dst - conv_state->fstart); 2730 2731 /* generate offset[] for convenience */ 2732 if (slen) { 2733 offset = (struct slist **)calloc(slen, sizeof(struct slist *)); 2734 if (!offset) { 2735 conv_error(conv_state, "not enough core"); 2736 /*NOTREACHED*/ 2737 } 2738 } 2739 src = p->stmts; 2740 for (off = 0; off < slen && src; off++) { 2741 #if 0 2742 printf("off=%d src=%x\n", off, src); 2743 #endif 2744 offset[off] = src; 2745 src = src->next; 2746 } 2747 2748 off = 0; 2749 for (src = p->stmts; src; src = src->next) { 2750 if (src->s.code == NOP) 2751 continue; 2752 dst->code = (u_short)src->s.code; 2753 dst->k = src->s.k; 2754 2755 /* fill block-local relative jump */ 2756 if (BPF_CLASS(src->s.code) != BPF_JMP || src->s.code == (BPF_JMP|BPF_JA)) { 2757 #if 0 2758 if (src->s.jt || src->s.jf) { 2759 free(offset); 2760 conv_error(conv_state, "illegal jmp destination"); 2761 /*NOTREACHED*/ 2762 } 2763 #endif 2764 goto filled; 2765 } 2766 if (off == slen - 2) /*???*/ 2767 goto filled; 2768 2769 { 2770 u_int i; 2771 int jt, jf; 2772 const char ljerr[] = "%s for block-local relative jump: off=%d"; 2773 2774 #if 0 2775 printf("code=%x off=%d %x %x\n", src->s.code, 2776 off, src->s.jt, src->s.jf); 2777 #endif 2778 2779 if (!src->s.jt || !src->s.jf) { 2780 free(offset); 2781 conv_error(conv_state, ljerr, "no jmp destination", off); 2782 /*NOTREACHED*/ 2783 } 2784 2785 jt = jf = 0; 2786 for (i = 0; i < slen; i++) { 2787 if (offset[i] == src->s.jt) { 2788 if (jt) { 2789 free(offset); 2790 conv_error(conv_state, ljerr, "multiple matches", off); 2791 /*NOTREACHED*/ 2792 } 2793 2794 if (i - off - 1 >= 256) { 2795 free(offset); 2796 conv_error(conv_state, ljerr, "out-of-range jump", off); 2797 /*NOTREACHED*/ 2798 } 2799 dst->jt = (u_char)(i - off - 1); 2800 jt++; 2801 } 2802 if (offset[i] == src->s.jf) { 2803 if (jf) { 2804 free(offset); 2805 conv_error(conv_state, ljerr, "multiple matches", off); 2806 /*NOTREACHED*/ 2807 } 2808 if (i - off - 1 >= 256) { 2809 free(offset); 2810 conv_error(conv_state, ljerr, "out-of-range jump", off); 2811 /*NOTREACHED*/ 2812 } 2813 dst->jf = (u_char)(i - off - 1); 2814 jf++; 2815 } 2816 } 2817 if (!jt || !jf) { 2818 free(offset); 2819 conv_error(conv_state, ljerr, "no destination found", off); 2820 /*NOTREACHED*/ 2821 } 2822 } 2823 filled: 2824 ++dst; 2825 ++off; 2826 } 2827 if (offset) 2828 free(offset); 2829 2830 #ifdef BDEBUG 2831 if (dst - conv_state->fstart < NBIDS) 2832 bids[dst - conv_state->fstart] = p->id + 1; 2833 #endif 2834 dst->code = (u_short)p->s.code; 2835 dst->k = p->s.k; 2836 if (JT(p)) { 2837 /* number of extra jumps inserted */ 2838 u_char extrajmps = 0; 2839 off = JT(p)->offset - (p->offset + slen) - 1; 2840 if (off >= 256) { 2841 /* offset too large for branch, must add a jump */ 2842 if (p->longjt == 0) { 2843 /* mark this instruction and retry */ 2844 p->longjt++; 2845 return(0); 2846 } 2847 dst->jt = extrajmps; 2848 extrajmps++; 2849 
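			/*
			 * Emit the extra unconditional jump that carries
			 * the too-large true-branch offset; the offset is
			 * adjusted for the jump's own position after the
			 * conditional branch.
			 */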
dst[extrajmps].code = BPF_JMP|BPF_JA; 2850 dst[extrajmps].k = off - extrajmps; 2851 } 2852 else 2853 dst->jt = (u_char)off; 2854 off = JF(p)->offset - (p->offset + slen) - 1; 2855 if (off >= 256) { 2856 /* offset too large for branch, must add a jump */ 2857 if (p->longjf == 0) { 2858 /* mark this instruction and retry */ 2859 p->longjf++; 2860 return(0); 2861 } 2862 /* branch if F to following jump */ 2863 /* if two jumps are inserted, F goes to second one */ 2864 dst->jf = extrajmps; 2865 extrajmps++; 2866 dst[extrajmps].code = BPF_JMP|BPF_JA; 2867 dst[extrajmps].k = off - extrajmps; 2868 } 2869 else 2870 dst->jf = (u_char)off; 2871 } 2872 return (1); 2873 } 2874 2875 2876 /* 2877 * Convert flowgraph intermediate representation to the 2878 * BPF array representation. Set *lenp to the number of instructions. 2879 * 2880 * This routine does *NOT* leak the memory pointed to by fp. It *must 2881 * not* do free(fp) before returning fp; doing so would make no sense, 2882 * as the BPF array pointed to by the return value of icode_to_fcode() 2883 * must be valid - it's being returned for use in a bpf_program structure. 2884 * 2885 * If it appears that icode_to_fcode() is leaking, the problem is that 2886 * the program using pcap_compile() is failing to free the memory in 2887 * the BPF program when it's done - the leak is in the program, not in 2888 * the routine that happens to be allocating the memory. (By analogy, if 2889 * a program calls fopen() without ever calling fclose() on the FILE *, 2890 * it will leak the FILE structure; the leak is not in fopen(), it's in 2891 * the program.) Change the program to use pcap_freecode() when it's 2892 * done with the filter program. See the pcap man page. 2893 */ 2894 struct bpf_insn * 2895 icode_to_fcode(struct icode *ic, struct block *root, u_int *lenp, 2896 char *errbuf) 2897 { 2898 u_int n; 2899 struct bpf_insn *fp; 2900 conv_state_t conv_state; 2901 2902 conv_state.fstart = NULL; 2903 conv_state.errbuf = errbuf; 2904 if (setjmp(conv_state.top_ctx) != 0) { 2905 free(conv_state.fstart); 2906 return NULL; 2907 } 2908 2909 /* 2910 * Loop doing convert_code_r() until no branches remain 2911 * with too-large offsets. 2912 */ 2913 for (;;) { 2914 unMarkAll(ic); 2915 n = *lenp = count_stmts(ic, root); 2916 2917 fp = (struct bpf_insn *)malloc(sizeof(*fp) * n); 2918 if (fp == NULL) { 2919 (void)snprintf(errbuf, PCAP_ERRBUF_SIZE, 2920 "malloc"); 2921 return NULL; 2922 } 2923 memset((char *)fp, 0, sizeof(*fp) * n); 2924 conv_state.fstart = fp; 2925 conv_state.ftail = fp + n; 2926 2927 unMarkAll(ic); 2928 if (convert_code_r(&conv_state, ic, root)) 2929 break; 2930 free(fp); 2931 } 2932 2933 return fp; 2934 } 2935 2936 /* 2937 * For iconv_to_fconv() errors. 2938 */ 2939 static void PCAP_NORETURN 2940 conv_error(conv_state_t *conv_state, const char *fmt, ...) 2941 { 2942 va_list ap; 2943 2944 va_start(ap, fmt); 2945 (void)vsnprintf(conv_state->errbuf, 2946 PCAP_ERRBUF_SIZE, fmt, ap); 2947 va_end(ap); 2948 longjmp(conv_state->top_ctx, 1); 2949 /* NOTREACHED */ 2950 #ifdef _AIX 2951 PCAP_UNREACHABLE 2952 #endif /* _AIX */ 2953 } 2954 2955 /* 2956 * Make a copy of a BPF program and put it in the "fcode" member of 2957 * a "pcap_t". 2958 * 2959 * If we fail to allocate memory for the copy, fill in the "errbuf" 2960 * member of the "pcap_t" with an error message, and return -1; 2961 * otherwise, return 0. 2962 */ 2963 int 2964 pcapint_install_bpf_program(pcap_t *p, struct bpf_program *fp) 2965 { 2966 size_t prog_size; 2967 2968 /* 2969 * Validate the program. 
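 * If validation fails (for example, if a jump would land outside
 * the program), reject it rather than installing a filter that the
 * interpreter cannot safely run.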
2970  */
2971 	if (!pcapint_validate_filter(fp->bf_insns, fp->bf_len)) {
2972 		snprintf(p->errbuf, sizeof(p->errbuf),
2973 		    "BPF program is not valid");
2974 		return (-1);
2975 	}
2976
2977 	/*
2978 	 * Free up any already installed program.
2979 	 */
2980 	pcap_freecode(&p->fcode);
2981
2982 	prog_size = sizeof(*fp->bf_insns) * fp->bf_len;
2983 	p->fcode.bf_len = fp->bf_len;
2984 	p->fcode.bf_insns = (struct bpf_insn *)malloc(prog_size);
2985 	if (p->fcode.bf_insns == NULL) {
2986 		pcapint_fmt_errmsg_for_errno(p->errbuf, sizeof(p->errbuf),
2987 		    errno, "malloc");
2988 		return (-1);
2989 	}
2990 	memcpy(p->fcode.bf_insns, fp->bf_insns, prog_size);
2991 	return (0);
2992 }
2993
2994 #ifdef BDEBUG
2995 static void
2996 dot_dump_node(struct icode *ic, struct block *block, struct bpf_program *prog,
2997     FILE *out)
2998 {
2999 	int icount, noffset;
3000 	int i;
3001
3002 	if (block == NULL || isMarked(ic, block))
3003 		return;
3004 	Mark(ic, block);
3005
3006 	icount = slength(block->stmts) + 1 + block->longjt + block->longjf;
3007 	noffset = min(block->offset + icount, (int)prog->bf_len);
3008
3009 	fprintf(out, "\tblock%u [shape=ellipse, id=\"block-%u\" label=\"BLOCK%u\\n", block->id, block->id, block->id);
3010 	for (i = block->offset; i < noffset; i++) {
3011 		fprintf(out, "\\n%s", bpf_image(prog->bf_insns + i, i));
3012 	}
3013 	fprintf(out, "\" tooltip=\"");
3014 	for (i = 0; i < BPF_MEMWORDS; i++)
3015 		if (block->val[i] != VAL_UNKNOWN)
3016 			fprintf(out, "val[%d]=%d ", i, block->val[i]);
3017 	fprintf(out, "val[A]=%d ", block->val[A_ATOM]);
3018 	fprintf(out, "val[X]=%d", block->val[X_ATOM]);
3019 	fprintf(out, "\"");
3020 	if (JT(block) == NULL)
3021 		fprintf(out, ", peripheries=2");
3022 	fprintf(out, "];\n");
3023
3024 	dot_dump_node(ic, JT(block), prog, out);
3025 	dot_dump_node(ic, JF(block), prog, out);
3026 }
3027
3028 static void
3029 dot_dump_edge(struct icode *ic, struct block *block, FILE *out)
3030 {
3031 	if (block == NULL || isMarked(ic, block))
3032 		return;
3033 	Mark(ic, block);
3034
3035 	if (JT(block)) {
3036 		fprintf(out, "\t\"block%u\":se -> \"block%u\":n [label=\"T\"]; \n",
3037 		    block->id, JT(block)->id);
3038 		fprintf(out, "\t\"block%u\":sw -> \"block%u\":n [label=\"F\"]; \n",
3039 		    block->id, JF(block)->id);
3040 	}
3041 	dot_dump_edge(ic, JT(block), out);
3042 	dot_dump_edge(ic, JF(block), out);
3043 }
3044
3045 /* Output the block CFG using the graphviz/DOT language.
3046  * In the CFG, each block's code, the value index of each register at exit,
3047  * and the jump relationships are shown.
3048  *
3049  * Example DOT output for the BPF filter `ip src host 1.1.1.1' is:
3050 digraph BPF {
3051 block0 [shape=ellipse, id="block-0" label="BLOCK0\n\n(000) ldh [12]\n(001) jeq #0x800 jt 2 jf 5" tooltip="val[A]=0 val[X]=0"];
3052 block1 [shape=ellipse, id="block-1" label="BLOCK1\n\n(002) ld [26]\n(003) jeq #0x1010101 jt 4 jf 5" tooltip="val[A]=0 val[X]=0"];
3053 block2 [shape=ellipse, id="block-2" label="BLOCK2\n\n(004) ret #68" tooltip="val[A]=0 val[X]=0", peripheries=2];
3054 block3 [shape=ellipse, id="block-3" label="BLOCK3\n\n(005) ret #0" tooltip="val[A]=0 val[X]=0", peripheries=2];
3055 "block0":se -> "block1":n [label="T"];
3056 "block0":sw -> "block3":n [label="F"];
3057 "block1":se -> "block2":n [label="T"];
3058 "block1":sw -> "block3":n [label="F"];
3059 }
3060  *
3061  * After installing graphviz from https://www.graphviz.org/, save it as bpf.dot
3062  * and run `dot -Tpng -O bpf.dot' to draw the graph.
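 *
 * (A sketch of how a libpcap developer might produce the DOT text in
 * the first place, assuming a build with BDEBUG defined, a program
 * that declares the debug setter itself, and p/prog being a pcap_t
 * handle and a struct bpf_program:
 *
 *	pcap_set_print_dot_graph(1);
 *	if (pcap_compile(p, &prog, "ip src host 1.1.1.1", 1,
 *	    PCAP_NETMASK_UNKNOWN) == -1)
 *		...
 *
 * The graph for each optimizer pass is written to the standard
 * output.)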
3063 */ 3064 static int 3065 dot_dump(struct icode *ic, char *errbuf) 3066 { 3067 struct bpf_program f; 3068 FILE *out = stdout; 3069 3070 memset(bids, 0, sizeof bids); 3071 f.bf_insns = icode_to_fcode(ic, ic->root, &f.bf_len, errbuf); 3072 if (f.bf_insns == NULL) 3073 return -1; 3074 3075 fprintf(out, "digraph BPF {\n"); 3076 unMarkAll(ic); 3077 dot_dump_node(ic, ic->root, &f, out); 3078 unMarkAll(ic); 3079 dot_dump_edge(ic, ic->root, out); 3080 fprintf(out, "}\n"); 3081 3082 free((char *)f.bf_insns); 3083 return 0; 3084 } 3085 3086 static int 3087 plain_dump(struct icode *ic, char *errbuf) 3088 { 3089 struct bpf_program f; 3090 3091 memset(bids, 0, sizeof bids); 3092 f.bf_insns = icode_to_fcode(ic, ic->root, &f.bf_len, errbuf); 3093 if (f.bf_insns == NULL) 3094 return -1; 3095 bpf_dump(&f, 1); 3096 putchar('\n'); 3097 free((char *)f.bf_insns); 3098 return 0; 3099 } 3100 3101 static void 3102 opt_dump(opt_state_t *opt_state, struct icode *ic) 3103 { 3104 int status; 3105 char errbuf[PCAP_ERRBUF_SIZE]; 3106 3107 /* 3108 * If the CFG, in DOT format, is requested, output it rather than 3109 * the code that would be generated from that graph. 3110 */ 3111 if (pcap_print_dot_graph) 3112 status = dot_dump(ic, errbuf); 3113 else 3114 status = plain_dump(ic, errbuf); 3115 if (status == -1) 3116 opt_error(opt_state, "opt_dump: icode_to_fcode failed: %s", errbuf); 3117 } 3118 #endif 3119