/*	$OpenBSD: optimize.c,v 1.23 2024/04/08 02:51:14 jsg Exp $	*/

/*
 * Copyright (c) 1988, 1989, 1990, 1991, 1993, 1994, 1995, 1996
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that: (1) source code distributions
 * retain the above copyright notice and this paragraph in its entirety, (2)
 * distributions including binary code include the above copyright notice and
 * this paragraph in its entirety in the documentation or other materials
 * provided with the distribution, and (3) all advertising materials mentioning
 * features or use of this software display the following acknowledgement:
 * ``This product includes software developed by the University of California,
 * Lawrence Berkeley Laboratory and its contributors.'' Neither the name of
 * the University nor the names of its contributors may be used to endorse
 * or promote products derived from this software without specific prior
 * written permission.
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
 *
 * Optimization module for tcpdump intermediate representation.
 */

#include <sys/types.h>
#include <sys/time.h>

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>

#include "pcap-int.h"

#include "gencode.h"

#ifdef HAVE_OS_PROTO_H
#include "os-proto.h"
#endif

#ifdef BDEBUG
extern int dflag;
#endif

#define A_ATOM BPF_MEMWORDS
#define X_ATOM (BPF_MEMWORDS+1)

#define NOP -1

/*
 * This define is used to represent *both* the accumulator and
 * x register in use-def computations.
 * Currently, the use-def code assumes only one definition per instruction.
 */
#define AX_ATOM N_ATOMS

/*
 * A flag to indicate that further optimization is needed.
 * Iterative passes are continued until a given pass yields no
 * branch movement.
 */
static int done;

/*
 * A block is marked if and only if its mark equals the current mark.
 * Rather than traverse the code array, marking each item, 'cur_mark' is
 * incremented.  This automatically makes each element unmarked.
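 *
 * Illustrative sketch of how the traversals below use this idiom:
 *
 *	unMarkAll();
 *	...
 *	if (isMarked(b))	(already visited on this pass)
 *		return;
 *	Mark(b);		(then recurse on JT(b) and JF(b))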
69 */ 70 static int cur_mark; 71 #define isMarked(p) ((p)->mark == cur_mark) 72 #define unMarkAll() cur_mark += 1 73 #define Mark(p) ((p)->mark = cur_mark) 74 75 static void opt_init(struct block *); 76 static void opt_cleanup(void); 77 78 static void make_marks(struct block *); 79 static void mark_code(struct block *); 80 81 static void intern_blocks(struct block *); 82 83 static int eq_slist(struct slist *, struct slist *); 84 85 static void find_levels_r(struct block *); 86 87 static void find_levels(struct block *); 88 static void find_dom(struct block *); 89 static void propedom(struct edge *); 90 static void find_edom(struct block *); 91 static void find_closure(struct block *); 92 static int atomuse(struct stmt *); 93 static int atomdef(struct stmt *); 94 static void compute_local_ud(struct block *); 95 static void find_ud(struct block *); 96 static void init_val(void); 97 static int F(int, int, int); 98 static __inline void vstore(struct stmt *, int *, int, int); 99 static void opt_blk(struct block *, int); 100 static int use_conflict(struct block *, struct block *); 101 static void opt_j(struct edge *); 102 static void or_pullup(struct block *); 103 static void and_pullup(struct block *); 104 static void opt_blks(struct block *, int); 105 static __inline void link_inedge(struct edge *, struct block *); 106 static void find_inedges(struct block *); 107 static void opt_root(struct block **); 108 static void opt_loop(struct block *, int); 109 static void fold_op(struct stmt *, int, int); 110 static __inline struct slist *this_op(struct slist *); 111 static void opt_not(struct block *); 112 static void opt_peep(struct block *); 113 static void opt_stmt(struct stmt *, int[], int); 114 static void deadstmt(struct stmt *, struct stmt *[]); 115 static void opt_deadstores(struct block *); 116 static void opt_blk(struct block *, int); 117 static int use_conflict(struct block *, struct block *); 118 static void opt_j(struct edge *); 119 static struct block *fold_edge(struct block *, struct edge *); 120 static __inline int eq_blk(struct block *, struct block *); 121 static int slength(struct slist *); 122 static int count_blocks(struct block *); 123 static void number_blks_r(struct block *); 124 static int count_stmts(struct block *); 125 static int convert_code_r(struct block *); 126 #ifdef BDEBUG 127 static void opt_dump(struct block *); 128 #endif 129 130 static int n_blocks; 131 struct block **blocks; 132 static int n_edges; 133 struct edge **edges; 134 135 /* 136 * A bit vector set representation of the dominators. 137 * We round up the set size to the next power of two. 138 */ 139 static int nodewords; 140 static int edgewords; 141 struct block **levels; 142 bpf_u_int32 *space1; 143 bpf_u_int32 *space2; 144 #define BITS_PER_WORD (8*sizeof(bpf_u_int32)) 145 /* 146 * True if a is in uset {p} 147 */ 148 #define SET_MEMBER(p, a) \ 149 ((p)[(unsigned)(a) / BITS_PER_WORD] & (1 << ((unsigned)(a) % BITS_PER_WORD))) 150 151 /* 152 * Add 'a' to uset p. 153 */ 154 #define SET_INSERT(p, a) \ 155 (p)[(unsigned)(a) / BITS_PER_WORD] |= (1 << ((unsigned)(a) % BITS_PER_WORD)) 156 157 /* 158 * Delete 'a' from uset p. 
159 */ 160 #define SET_DELETE(p, a) \ 161 (p)[(unsigned)(a) / BITS_PER_WORD] &= ~(1 << ((unsigned)(a) % BITS_PER_WORD)) 162 163 /* 164 * a := a intersect b 165 */ 166 #define SET_INTERSECT(a, b, n)\ 167 {\ 168 bpf_u_int32 *_x = a, *_y = b;\ 169 int _n = n;\ 170 while (--_n >= 0) *_x++ &= *_y++;\ 171 } 172 173 /* 174 * a := a - b 175 */ 176 #define SET_SUBTRACT(a, b, n)\ 177 {\ 178 bpf_u_int32 *_x = a, *_y = b;\ 179 int _n = n;\ 180 while (--_n >= 0) *_x++ &=~ *_y++;\ 181 } 182 183 /* 184 * a := a union b 185 */ 186 #define SET_UNION(a, b, n)\ 187 {\ 188 bpf_u_int32 *_x = a, *_y = b;\ 189 int _n = n;\ 190 while (--_n >= 0) *_x++ |= *_y++;\ 191 } 192 193 static uset all_dom_sets; 194 static uset all_closure_sets; 195 static uset all_edge_sets; 196 197 #ifndef MAX 198 #define MAX(a,b) ((a)>(b)?(a):(b)) 199 #endif 200 201 static void 202 find_levels_r(struct block *b) 203 { 204 int level; 205 206 if (isMarked(b)) 207 return; 208 209 Mark(b); 210 b->link = 0; 211 212 if (JT(b)) { 213 find_levels_r(JT(b)); 214 find_levels_r(JF(b)); 215 level = MAX(JT(b)->level, JF(b)->level) + 1; 216 } else 217 level = 0; 218 b->level = level; 219 b->link = levels[level]; 220 levels[level] = b; 221 } 222 223 /* 224 * Level graph. The levels go from 0 at the leaves to 225 * N_LEVELS at the root. The levels[] array points to the 226 * first node of the level list, whose elements are linked 227 * with the 'link' field of the struct block. 228 */ 229 static void 230 find_levels(struct block *root) 231 { 232 memset((char *)levels, 0, n_blocks * sizeof(*levels)); 233 unMarkAll(); 234 find_levels_r(root); 235 } 236 237 /* 238 * Find dominator relationships. 239 * Assumes graph has been leveled. 240 */ 241 static void 242 find_dom(struct block *root) 243 { 244 int i; 245 struct block *b; 246 bpf_u_int32 *x; 247 248 /* 249 * Initialize sets to contain all nodes. 250 */ 251 x = all_dom_sets; 252 i = n_blocks * nodewords; 253 while (--i >= 0) 254 *x++ = ~0; 255 /* Root starts off empty. */ 256 for (i = nodewords; --i >= 0;) 257 root->dom[i] = 0; 258 259 /* root->level is the highest level no found. */ 260 for (i = root->level; i >= 0; --i) { 261 for (b = levels[i]; b; b = b->link) { 262 SET_INSERT(b->dom, b->id); 263 if (JT(b) == 0) 264 continue; 265 SET_INTERSECT(JT(b)->dom, b->dom, nodewords); 266 SET_INTERSECT(JF(b)->dom, b->dom, nodewords); 267 } 268 } 269 } 270 271 static void 272 propedom(struct edge *ep) 273 { 274 SET_INSERT(ep->edom, ep->id); 275 if (ep->succ) { 276 SET_INTERSECT(ep->succ->et.edom, ep->edom, edgewords); 277 SET_INTERSECT(ep->succ->ef.edom, ep->edom, edgewords); 278 } 279 } 280 281 /* 282 * Compute edge dominators. 283 * Assumes graph has been leveled and predecessors established. 284 */ 285 static void 286 find_edom(struct block *root) 287 { 288 int i; 289 uset x; 290 struct block *b; 291 292 x = all_edge_sets; 293 for (i = n_edges * edgewords; --i >= 0; ) 294 x[i] = ~0; 295 296 /* root->level is the highest level no found. */ 297 memset(root->et.edom, 0, edgewords * sizeof(*(uset)0)); 298 memset(root->ef.edom, 0, edgewords * sizeof(*(uset)0)); 299 for (i = root->level; i >= 0; --i) { 300 for (b = levels[i]; b != 0; b = b->link) { 301 propedom(&b->et); 302 propedom(&b->ef); 303 } 304 } 305 } 306 307 /* 308 * Find the backwards transitive closure of the flow graph. These sets 309 * are backwards in the sense that we find the set of nodes that reach 310 * a given node, not the set of nodes that can be reached by a node. 311 * 312 * Assumes graph has been leveled. 
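 *
 * For example, if the root branches to blocks a and b, and both a and b
 * branch to a common block c, then after this pass c->closure contains
 * the root, a, b and c itself, since each of those nodes can reach c.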
313 */ 314 static void 315 find_closure(struct block *root) 316 { 317 int i; 318 struct block *b; 319 320 /* 321 * Initialize sets to contain no nodes. 322 */ 323 memset((char *)all_closure_sets, 0, 324 n_blocks * nodewords * sizeof(*all_closure_sets)); 325 326 /* root->level is the highest level no found. */ 327 for (i = root->level; i >= 0; --i) { 328 for (b = levels[i]; b; b = b->link) { 329 SET_INSERT(b->closure, b->id); 330 if (JT(b) == 0) 331 continue; 332 SET_UNION(JT(b)->closure, b->closure, nodewords); 333 SET_UNION(JF(b)->closure, b->closure, nodewords); 334 } 335 } 336 } 337 338 /* 339 * Return the register number that is used by s. If A and X are both 340 * used, return AX_ATOM. If no register is used, return -1. 341 * 342 * The implementation should probably change to an array access. 343 */ 344 static int 345 atomuse(struct stmt *s) 346 { 347 int c = s->code; 348 349 if (c == NOP) 350 return -1; 351 352 switch (BPF_CLASS(c)) { 353 354 case BPF_RET: 355 return (BPF_RVAL(c) == BPF_A) ? A_ATOM : 356 (BPF_RVAL(c) == BPF_X) ? X_ATOM : -1; 357 358 case BPF_LD: 359 case BPF_LDX: 360 return (BPF_MODE(c) == BPF_IND) ? X_ATOM : 361 (BPF_MODE(c) == BPF_MEM) ? s->k : -1; 362 363 case BPF_ST: 364 return A_ATOM; 365 366 case BPF_STX: 367 return X_ATOM; 368 369 case BPF_JMP: 370 case BPF_ALU: 371 if (BPF_SRC(c) == BPF_X) 372 return AX_ATOM; 373 return A_ATOM; 374 375 case BPF_MISC: 376 return BPF_MISCOP(c) == BPF_TXA ? X_ATOM : A_ATOM; 377 } 378 abort(); 379 /* NOTREACHED */ 380 } 381 382 /* 383 * Return the register number that is defined by 's'. We assume that 384 * a single stmt cannot define more than one register. If no register 385 * is defined, return -1. 386 * 387 * The implementation should probably change to an array access. 388 */ 389 static int 390 atomdef(struct stmt *s) 391 { 392 if (s->code == NOP) 393 return -1; 394 395 switch (BPF_CLASS(s->code)) { 396 397 case BPF_LD: 398 case BPF_ALU: 399 return A_ATOM; 400 401 case BPF_LDX: 402 return X_ATOM; 403 404 case BPF_ST: 405 case BPF_STX: 406 return s->k; 407 408 case BPF_MISC: 409 return BPF_MISCOP(s->code) == BPF_TAX ? X_ATOM : A_ATOM; 410 } 411 return -1; 412 } 413 414 static void 415 compute_local_ud(struct block *b) 416 { 417 struct slist *s; 418 atomset def = 0, use = 0, kill = 0; 419 int atom; 420 421 for (s = b->stmts; s; s = s->next) { 422 if (s->s.code == NOP) 423 continue; 424 atom = atomuse(&s->s); 425 if (atom >= 0) { 426 if (atom == AX_ATOM) { 427 if (!ATOMELEM(def, X_ATOM)) 428 use |= ATOMMASK(X_ATOM); 429 if (!ATOMELEM(def, A_ATOM)) 430 use |= ATOMMASK(A_ATOM); 431 } 432 else if (atom < N_ATOMS) { 433 if (!ATOMELEM(def, atom)) 434 use |= ATOMMASK(atom); 435 } 436 else 437 abort(); 438 } 439 atom = atomdef(&s->s); 440 if (atom >= 0) { 441 if (!ATOMELEM(use, atom)) 442 kill |= ATOMMASK(atom); 443 def |= ATOMMASK(atom); 444 } 445 } 446 if (!ATOMELEM(def, A_ATOM) && BPF_CLASS(b->s.code) == BPF_JMP) 447 use |= ATOMMASK(A_ATOM); 448 449 b->def = def; 450 b->kill = kill; 451 b->in_use = use; 452 } 453 454 /* 455 * Assume graph is already leveled. 456 */ 457 static void 458 find_ud(struct block *root) 459 { 460 int i, maxlevel; 461 struct block *p; 462 463 /* 464 * root->level is the highest level no found; 465 * count down from there. 
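	 *
	 * Roughly, the second loop below implements the usual backward
	 * liveness equations on the DAG, evaluated from the leaves
	 * (level 0) up toward the root:
	 *
	 *	out_use(p) |= in_use(JT(p)) | in_use(JF(p))
	 *	in_use(p)  |= out_use(p) & ~kill(p)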
466 */ 467 maxlevel = root->level; 468 for (i = maxlevel; i >= 0; --i) 469 for (p = levels[i]; p; p = p->link) { 470 compute_local_ud(p); 471 p->out_use = 0; 472 } 473 474 for (i = 1; i <= maxlevel; ++i) { 475 for (p = levels[i]; p; p = p->link) { 476 p->out_use |= JT(p)->in_use | JF(p)->in_use; 477 p->in_use |= p->out_use &~ p->kill; 478 } 479 } 480 } 481 482 /* 483 * These data structures are used in a Cocke and Shwarz style 484 * value numbering scheme. Since the flowgraph is acyclic, 485 * exit values can be propagated from a node's predecessors 486 * provided it is uniquely defined. 487 */ 488 struct valnode { 489 int code; 490 int v0, v1; 491 int val; 492 struct valnode *next; 493 }; 494 495 #define MODULUS 213 496 static struct valnode *hashtbl[MODULUS]; 497 static int curval; 498 static int maxval; 499 500 /* Integer constants mapped with the load immediate opcode. */ 501 #define K(i) F(BPF_LD|BPF_IMM|BPF_W, i, 0L) 502 503 struct vmapinfo { 504 int is_const; 505 bpf_int32 const_val; 506 }; 507 508 struct vmapinfo *vmap; 509 struct valnode *vnode_base; 510 struct valnode *next_vnode; 511 512 static void 513 init_val(void) 514 { 515 curval = 0; 516 next_vnode = vnode_base; 517 memset((char *)vmap, 0, maxval * sizeof(*vmap)); 518 memset((char *)hashtbl, 0, sizeof hashtbl); 519 } 520 521 /* Because we really don't have an IR, this stuff is a little messy. */ 522 static int 523 F(int code, int v0, int v1) 524 { 525 u_int hash; 526 int val; 527 struct valnode *p; 528 529 hash = (u_int)code ^ (v0 << 4) ^ (v1 << 8); 530 hash %= MODULUS; 531 532 for (p = hashtbl[hash]; p; p = p->next) 533 if (p->code == code && p->v0 == v0 && p->v1 == v1) 534 return p->val; 535 536 val = ++curval; 537 if (BPF_MODE(code) == BPF_IMM && 538 (BPF_CLASS(code) == BPF_LD || BPF_CLASS(code) == BPF_LDX)) { 539 vmap[val].const_val = v0; 540 vmap[val].is_const = 1; 541 } 542 p = next_vnode++; 543 p->val = val; 544 p->code = code; 545 p->v0 = v0; 546 p->v1 = v1; 547 p->next = hashtbl[hash]; 548 hashtbl[hash] = p; 549 550 return val; 551 } 552 553 static __inline void 554 vstore(struct stmt *s, int *valp, int newval, int alter) 555 { 556 if (alter && *valp == newval) 557 s->code = NOP; 558 else 559 *valp = newval; 560 } 561 562 static void 563 fold_op(struct stmt *s, int v0, int v1) 564 { 565 bpf_int32 a, b; 566 567 a = vmap[v0].const_val; 568 b = vmap[v1].const_val; 569 570 switch (BPF_OP(s->code)) { 571 case BPF_ADD: 572 a += b; 573 break; 574 575 case BPF_SUB: 576 a -= b; 577 break; 578 579 case BPF_MUL: 580 a *= b; 581 break; 582 583 case BPF_DIV: 584 if (b == 0) 585 bpf_error("division by zero"); 586 a /= b; 587 break; 588 589 case BPF_AND: 590 a &= b; 591 break; 592 593 case BPF_OR: 594 a |= b; 595 break; 596 597 case BPF_LSH: 598 a <<= b; 599 break; 600 601 case BPF_RSH: 602 a >>= b; 603 break; 604 605 case BPF_NEG: 606 a = -a; 607 break; 608 609 default: 610 abort(); 611 } 612 s->k = a; 613 s->code = BPF_LD|BPF_IMM; 614 done = 0; 615 } 616 617 static __inline struct slist * 618 this_op(struct slist *s) 619 { 620 while (s != 0 && s->s.code == NOP) 621 s = s->next; 622 return s; 623 } 624 625 static void 626 opt_not(struct block *b) 627 { 628 struct block *tmp = JT(b); 629 630 JT(b) = JF(b); 631 JF(b) = tmp; 632 } 633 634 static void 635 opt_peep(struct block *b) 636 { 637 struct slist *s; 638 struct slist *next, *last; 639 int val; 640 641 s = b->stmts; 642 if (s == 0) 643 return; 644 645 last = s; 646 while (1) { 647 s = this_op(s); 648 if (s == 0) 649 break; 650 next = this_op(s->next); 651 if (next == 0) 652 
break; 653 last = next; 654 655 /* 656 * st M[k] --> st M[k] 657 * ldx M[k] tax 658 */ 659 if (s->s.code == BPF_ST && 660 next->s.code == (BPF_LDX|BPF_MEM) && 661 s->s.k == next->s.k) { 662 done = 0; 663 next->s.code = BPF_MISC|BPF_TAX; 664 } 665 /* 666 * ld #k --> ldx #k 667 * tax txa 668 */ 669 if (s->s.code == (BPF_LD|BPF_IMM) && 670 next->s.code == (BPF_MISC|BPF_TAX)) { 671 s->s.code = BPF_LDX|BPF_IMM; 672 next->s.code = BPF_MISC|BPF_TXA; 673 done = 0; 674 } 675 /* 676 * This is an ugly special case, but it happens 677 * when you say tcp[k] or udp[k] where k is a constant. 678 */ 679 if (s->s.code == (BPF_LD|BPF_IMM)) { 680 struct slist *add, *tax, *ild; 681 682 /* 683 * Check that X isn't used on exit from this 684 * block (which the optimizer might cause). 685 * We know the code generator won't generate 686 * any local dependencies. 687 */ 688 if (ATOMELEM(b->out_use, X_ATOM)) 689 break; 690 691 if (next->s.code != (BPF_LDX|BPF_MSH|BPF_B)) 692 add = next; 693 else 694 add = this_op(next->next); 695 if (add == 0 || add->s.code != (BPF_ALU|BPF_ADD|BPF_X)) 696 break; 697 698 tax = this_op(add->next); 699 if (tax == 0 || tax->s.code != (BPF_MISC|BPF_TAX)) 700 break; 701 702 ild = this_op(tax->next); 703 if (ild == 0 || BPF_CLASS(ild->s.code) != BPF_LD || 704 BPF_MODE(ild->s.code) != BPF_IND) 705 break; 706 /* 707 * XXX We need to check that X is not 708 * subsequently used. We know we can eliminate the 709 * accumulator modifications since it is defined 710 * by the last stmt of this sequence. 711 * 712 * We want to turn this sequence: 713 * 714 * (004) ldi #0x2 {s} 715 * (005) ldxms [14] {next} -- optional 716 * (006) addx {add} 717 * (007) tax {tax} 718 * (008) ild [x+0] {ild} 719 * 720 * into this sequence: 721 * 722 * (004) nop 723 * (005) ldxms [14] 724 * (006) nop 725 * (007) nop 726 * (008) ild [x+2] 727 * 728 */ 729 ild->s.k += s->s.k; 730 s->s.code = NOP; 731 add->s.code = NOP; 732 tax->s.code = NOP; 733 done = 0; 734 } 735 s = next; 736 } 737 /* 738 * If we have a subtract to do a comparison, and the X register 739 * is a known constant, we can merge this value into the 740 * comparison. 741 */ 742 if (last->s.code == (BPF_ALU|BPF_SUB|BPF_X) && 743 !ATOMELEM(b->out_use, A_ATOM)) { 744 val = b->val[X_ATOM]; 745 if (vmap[val].is_const) { 746 int op; 747 748 b->s.k += vmap[val].const_val; 749 op = BPF_OP(b->s.code); 750 if (op == BPF_JGT || op == BPF_JGE) { 751 struct block *t = JT(b); 752 JT(b) = JF(b); 753 JF(b) = t; 754 b->s.k += 0x80000000; 755 } 756 last->s.code = NOP; 757 done = 0; 758 } else if (b->s.k == 0) { 759 /* 760 * sub x -> nop 761 * j #0 j x 762 */ 763 last->s.code = NOP; 764 b->s.code = BPF_CLASS(b->s.code) | BPF_OP(b->s.code) | 765 BPF_X; 766 done = 0; 767 } 768 } 769 /* 770 * Likewise, a constant subtract can be simplified. 771 */ 772 else if (last->s.code == (BPF_ALU|BPF_SUB|BPF_K) && 773 !ATOMELEM(b->out_use, A_ATOM)) { 774 int op; 775 776 b->s.k += last->s.k; 777 last->s.code = NOP; 778 op = BPF_OP(b->s.code); 779 if (op == BPF_JGT || op == BPF_JGE) { 780 struct block *t = JT(b); 781 JT(b) = JF(b); 782 JF(b) = t; 783 b->s.k += 0x80000000; 784 } 785 done = 0; 786 } 787 /* 788 * and #k nop 789 * jeq #0 -> jset #k 790 */ 791 if (last->s.code == (BPF_ALU|BPF_AND|BPF_K) && 792 !ATOMELEM(b->out_use, A_ATOM) && b->s.k == 0) { 793 b->s.k = last->s.k; 794 b->s.code = BPF_JMP|BPF_K|BPF_JSET; 795 last->s.code = NOP; 796 done = 0; 797 opt_not(b); 798 } 799 /* 800 * If the accumulator is a known constant, we can compute the 801 * comparison result. 
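	 *
	 * For example, if the accumulator is known to hold 2 and the branch
	 * is 'jeq #2', the test must succeed, so the false edge is redirected
	 * to the true successor; if it cannot succeed, the true edge is
	 * redirected to the false successor.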
802 */ 803 val = b->val[A_ATOM]; 804 if (vmap[val].is_const && BPF_SRC(b->s.code) == BPF_K) { 805 bpf_int32 v = vmap[val].const_val; 806 switch (BPF_OP(b->s.code)) { 807 808 case BPF_JEQ: 809 v = v == b->s.k; 810 break; 811 812 case BPF_JGT: 813 v = (unsigned)v > b->s.k; 814 break; 815 816 case BPF_JGE: 817 v = (unsigned)v >= b->s.k; 818 break; 819 820 case BPF_JSET: 821 v &= b->s.k; 822 break; 823 824 default: 825 abort(); 826 } 827 if (JF(b) != JT(b)) 828 done = 0; 829 if (v) 830 JF(b) = JT(b); 831 else 832 JT(b) = JF(b); 833 } 834 } 835 836 /* 837 * Compute the symbolic value of expression of 's', and update 838 * anything it defines in the value table 'val'. If 'alter' is true, 839 * do various optimizations. This code would be cleaner if symbolic 840 * evaluation and code transformations weren't folded together. 841 */ 842 static void 843 opt_stmt(struct stmt *s, int val[], int alter) 844 { 845 int op; 846 int v; 847 848 switch (s->code) { 849 850 case BPF_LD|BPF_ABS|BPF_W: 851 case BPF_LD|BPF_ABS|BPF_H: 852 case BPF_LD|BPF_ABS|BPF_B: 853 v = F(s->code, s->k, 0L); 854 vstore(s, &val[A_ATOM], v, alter); 855 break; 856 857 case BPF_LD|BPF_IND|BPF_W: 858 case BPF_LD|BPF_IND|BPF_H: 859 case BPF_LD|BPF_IND|BPF_B: 860 v = val[X_ATOM]; 861 if (alter && vmap[v].is_const) { 862 s->code = BPF_LD|BPF_ABS|BPF_SIZE(s->code); 863 s->k += vmap[v].const_val; 864 v = F(s->code, s->k, 0L); 865 done = 0; 866 } 867 else 868 v = F(s->code, s->k, v); 869 vstore(s, &val[A_ATOM], v, alter); 870 break; 871 872 case BPF_LD|BPF_LEN: 873 case BPF_LD|BPF_RND: 874 v = F(s->code, 0L, 0L); 875 vstore(s, &val[A_ATOM], v, alter); 876 break; 877 878 case BPF_LD|BPF_IMM: 879 v = K(s->k); 880 vstore(s, &val[A_ATOM], v, alter); 881 break; 882 883 case BPF_LDX|BPF_IMM: 884 v = K(s->k); 885 vstore(s, &val[X_ATOM], v, alter); 886 break; 887 888 case BPF_LDX|BPF_MSH|BPF_B: 889 v = F(s->code, s->k, 0L); 890 vstore(s, &val[X_ATOM], v, alter); 891 break; 892 893 case BPF_ALU|BPF_NEG: 894 if (alter && vmap[val[A_ATOM]].is_const) { 895 s->code = BPF_LD|BPF_IMM; 896 s->k = -vmap[val[A_ATOM]].const_val; 897 val[A_ATOM] = K(s->k); 898 } 899 else 900 val[A_ATOM] = F(s->code, val[A_ATOM], 0L); 901 break; 902 903 case BPF_ALU|BPF_ADD|BPF_K: 904 case BPF_ALU|BPF_SUB|BPF_K: 905 case BPF_ALU|BPF_MUL|BPF_K: 906 case BPF_ALU|BPF_DIV|BPF_K: 907 case BPF_ALU|BPF_AND|BPF_K: 908 case BPF_ALU|BPF_OR|BPF_K: 909 case BPF_ALU|BPF_LSH|BPF_K: 910 case BPF_ALU|BPF_RSH|BPF_K: 911 op = BPF_OP(s->code); 912 if (alter) { 913 if (s->k == 0) { 914 if (op == BPF_ADD || op == BPF_SUB || 915 op == BPF_LSH || op == BPF_RSH || 916 op == BPF_OR) { 917 s->code = NOP; 918 break; 919 } 920 if (op == BPF_MUL || op == BPF_AND) { 921 s->code = BPF_LD|BPF_IMM; 922 val[A_ATOM] = K(s->k); 923 break; 924 } 925 } 926 if (vmap[val[A_ATOM]].is_const) { 927 fold_op(s, val[A_ATOM], K(s->k)); 928 val[A_ATOM] = K(s->k); 929 break; 930 } 931 } 932 val[A_ATOM] = F(s->code, val[A_ATOM], K(s->k)); 933 break; 934 935 case BPF_ALU|BPF_ADD|BPF_X: 936 case BPF_ALU|BPF_SUB|BPF_X: 937 case BPF_ALU|BPF_MUL|BPF_X: 938 case BPF_ALU|BPF_DIV|BPF_X: 939 case BPF_ALU|BPF_AND|BPF_X: 940 case BPF_ALU|BPF_OR|BPF_X: 941 case BPF_ALU|BPF_LSH|BPF_X: 942 case BPF_ALU|BPF_RSH|BPF_X: 943 op = BPF_OP(s->code); 944 if (alter && vmap[val[X_ATOM]].is_const) { 945 if (vmap[val[A_ATOM]].is_const) { 946 fold_op(s, val[A_ATOM], val[X_ATOM]); 947 val[A_ATOM] = K(s->k); 948 } 949 else { 950 s->code = BPF_ALU|BPF_K|op; 951 s->k = vmap[val[X_ATOM]].const_val; 952 done = 0; 953 val[A_ATOM] = 954 F(s->code, val[A_ATOM], 
				    K(s->k));
			}
			break;
		}
		/*
		 * Check if we're doing something to an accumulator
		 * that is 0, and simplify.  This may not seem like
		 * much of a simplification but it could open up further
		 * optimizations.
		 * XXX We could also check for mul by 1, and -1, etc.
		 */
		if (alter && vmap[val[A_ATOM]].is_const
		    && vmap[val[A_ATOM]].const_val == 0) {
			if (op == BPF_ADD || op == BPF_OR ||
			    op == BPF_LSH || op == BPF_RSH || op == BPF_SUB) {
				s->code = BPF_MISC|BPF_TXA;
				vstore(s, &val[A_ATOM], val[X_ATOM], alter);
				break;
			}
			else if (op == BPF_MUL || op == BPF_DIV ||
			    op == BPF_AND) {
				s->code = BPF_LD|BPF_IMM;
				s->k = 0;
				vstore(s, &val[A_ATOM], K(s->k), alter);
				break;
			}
			else if (op == BPF_NEG) {
				s->code = NOP;
				break;
			}
		}
		val[A_ATOM] = F(s->code, val[A_ATOM], val[X_ATOM]);
		break;

	case BPF_MISC|BPF_TXA:
		vstore(s, &val[A_ATOM], val[X_ATOM], alter);
		break;

	case BPF_LD|BPF_MEM:
		v = val[s->k];
		if (alter && vmap[v].is_const) {
			s->code = BPF_LD|BPF_IMM;
			s->k = vmap[v].const_val;
			done = 0;
		}
		vstore(s, &val[A_ATOM], v, alter);
		break;

	case BPF_MISC|BPF_TAX:
		vstore(s, &val[X_ATOM], val[A_ATOM], alter);
		break;

	case BPF_LDX|BPF_MEM:
		v = val[s->k];
		if (alter && vmap[v].is_const) {
			s->code = BPF_LDX|BPF_IMM;
			s->k = vmap[v].const_val;
			done = 0;
		}
		vstore(s, &val[X_ATOM], v, alter);
		break;

	case BPF_ST:
		vstore(s, &val[s->k], val[A_ATOM], alter);
		break;

	case BPF_STX:
		vstore(s, &val[s->k], val[X_ATOM], alter);
		break;
	}
}

static void
deadstmt(struct stmt *s, struct stmt *last[])
{
	int atom;

	atom = atomuse(s);
	if (atom >= 0) {
		if (atom == AX_ATOM) {
			last[X_ATOM] = 0;
			last[A_ATOM] = 0;
		}
		else
			last[atom] = 0;
	}
	atom = atomdef(s);
	if (atom >= 0) {
		if (last[atom]) {
			done = 0;
			last[atom]->code = NOP;
		}
		last[atom] = s;
	}
}

static void
opt_deadstores(struct block *b)
{
	struct slist *s;
	int atom;
	struct stmt *last[N_ATOMS];

	memset((char *)last, 0, sizeof last);

	for (s = b->stmts; s != 0; s = s->next)
		deadstmt(&s->s, last);
	deadstmt(&b->s, last);

	for (atom = 0; atom < N_ATOMS; ++atom)
		if (last[atom] && !ATOMELEM(b->out_use, atom)) {
			last[atom]->code = NOP;
			done = 0;
		}
}

static void
opt_blk(struct block *b, int do_stmts)
{
	struct slist *s;
	struct edge *p;
	int i;
	bpf_int32 aval;

#if 0
	for (s = b->stmts; s && s->next; s = s->next)
		if (BPF_CLASS(s->s.code) == BPF_JMP) {
			do_stmts = 0;
			break;
		}
#endif

	/*
	 * Initialize the atom values.
	 * If we have no predecessors, everything is undefined.
	 * Otherwise, we inherit our values from our predecessors.
	 * If any register has an ambiguous value (i.e. control paths are
	 * merging) give it the undefined value of 0.
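	 *
	 * For example, if one predecessor exits with value number 3 in the
	 * accumulator and another exits with value number 5, the accumulator
	 * starts out here with the unknown value 0.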
	 */
	p = b->in_edges;
	if (p == 0)
		memset((char *)b->val, 0, sizeof(b->val));
	else {
		memcpy((char *)b->val, (char *)p->pred->val, sizeof(b->val));
		while ((p = p->next) != NULL) {
			for (i = 0; i < N_ATOMS; ++i)
				if (b->val[i] != p->pred->val[i])
					b->val[i] = 0;
		}
	}
	aval = b->val[A_ATOM];
	for (s = b->stmts; s; s = s->next)
		opt_stmt(&s->s, b->val, do_stmts);

	/*
	 * This is a special case: if we don't use anything from this
	 * block, and we load the accumulator with a value that is
	 * already there, or if this block is a return,
	 * eliminate all the statements.
	 */
	if (do_stmts &&
	    ((b->out_use == 0 && aval != 0 && b->val[A_ATOM] == aval) ||
	     BPF_CLASS(b->s.code) == BPF_RET)) {
		if (b->stmts != 0) {
			b->stmts = 0;
			done = 0;
		}
	} else {
		opt_peep(b);
		opt_deadstores(b);
	}
	/*
	 * Set up values for branch optimizer.
	 */
	if (BPF_SRC(b->s.code) == BPF_K)
		b->oval = K(b->s.k);
	else
		b->oval = b->val[X_ATOM];
	b->et.code = b->s.code;
	b->ef.code = -b->s.code;
}

/*
 * Return true if any register that is used on exit from 'succ' has
 * an exit value that is different from the corresponding exit value
 * from 'b'.
 */
static int
use_conflict(struct block *b, struct block *succ)
{
	int atom;
	atomset use = succ->out_use;

	if (use == 0)
		return 0;

	for (atom = 0; atom < N_ATOMS; ++atom)
		if (ATOMELEM(use, atom))
			if (b->val[atom] != succ->val[atom])
				return 1;
	return 0;
}

static struct block *
fold_edge(struct block *child, struct edge *ep)
{
	int sense;
	int aval0, aval1, oval0, oval1;
	int code = ep->code;

	if (code < 0) {
		code = -code;
		sense = 0;
	} else
		sense = 1;

	if (child->s.code != code)
		return 0;

	aval0 = child->val[A_ATOM];
	oval0 = child->oval;
	aval1 = ep->pred->val[A_ATOM];
	oval1 = ep->pred->oval;

	if (aval0 != aval1)
		return 0;

	if (oval0 == oval1)
		/*
		 * The operands are identical, so the
		 * result is true if a true branch was
		 * taken to get here, otherwise false.
		 */
		return sense ? JT(child) : JF(child);

	if (sense && code == (BPF_JMP|BPF_JEQ|BPF_K))
		/*
		 * At this point, we only know the comparison if we
		 * came down the true branch, and it was an equality
		 * comparison with a constant.  We rely on the fact that
		 * distinct constants have distinct value numbers.
		 */
		return JF(child);

	return 0;
}

static void
opt_j(struct edge *ep)
{
	int i, k;
	struct block *target;

	if (JT(ep->succ) == 0)
		return;

	if (JT(ep->succ) == JF(ep->succ)) {
		/*
		 * Common branch targets can be eliminated, provided
		 * there is no data dependency.
		 */
		if (!use_conflict(ep->pred, ep->succ->et.succ)) {
			done = 0;
			ep->succ = JT(ep->succ);
		}
	}
	/*
	 * For each edge dominator that matches the successor of this
	 * edge, promote the edge successor to its grandchild.
	 *
	 * XXX We violate the set abstraction here in favor of a reasonably
	 * efficient loop.
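	 *
	 * For example, if a dominating edge was taken only after a test
	 * identical to the one in our successor (same opcode, same operand
	 * value number, same accumulator value number), fold_edge() tells us
	 * which way that successor must branch, and we jump there directly.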
1226 */ 1227 top: 1228 for (i = 0; i < edgewords; ++i) { 1229 bpf_u_int32 x = ep->edom[i]; 1230 1231 while (x != 0) { 1232 k = ffs(x) - 1; 1233 x &=~ (1 << k); 1234 k += i * BITS_PER_WORD; 1235 1236 target = fold_edge(ep->succ, edges[k]); 1237 /* 1238 * Check that there is no data dependency between 1239 * nodes that will be violated if we move the edge. 1240 */ 1241 if (target != 0 && !use_conflict(ep->pred, target)) { 1242 done = 0; 1243 ep->succ = target; 1244 if (JT(target) != 0) 1245 /* 1246 * Start over unless we hit a leaf. 1247 */ 1248 goto top; 1249 return; 1250 } 1251 } 1252 } 1253 } 1254 1255 1256 static void 1257 or_pullup(struct block *b) 1258 { 1259 int val, at_top; 1260 struct block *pull; 1261 struct block **diffp, **samep; 1262 struct edge *ep; 1263 1264 ep = b->in_edges; 1265 if (ep == 0) 1266 return; 1267 1268 /* 1269 * Make sure each predecessor loads the same value. 1270 * XXX why? 1271 */ 1272 val = ep->pred->val[A_ATOM]; 1273 for (ep = ep->next; ep != 0; ep = ep->next) 1274 if (val != ep->pred->val[A_ATOM]) 1275 return; 1276 1277 if (JT(b->in_edges->pred) == b) 1278 diffp = &JT(b->in_edges->pred); 1279 else 1280 diffp = &JF(b->in_edges->pred); 1281 1282 at_top = 1; 1283 while (1) { 1284 if (*diffp == 0) 1285 return; 1286 1287 if (JT(*diffp) != JT(b)) 1288 return; 1289 1290 if (!SET_MEMBER((*diffp)->dom, b->id)) 1291 return; 1292 1293 if ((*diffp)->val[A_ATOM] != val) 1294 break; 1295 1296 diffp = &JF(*diffp); 1297 at_top = 0; 1298 } 1299 samep = &JF(*diffp); 1300 while (1) { 1301 if (*samep == 0) 1302 return; 1303 1304 if (JT(*samep) != JT(b)) 1305 return; 1306 1307 if (!SET_MEMBER((*samep)->dom, b->id)) 1308 return; 1309 1310 if ((*samep)->val[A_ATOM] == val) 1311 break; 1312 1313 /* XXX Need to check that there are no data dependencies 1314 between dp0 and dp1. Currently, the code generator 1315 will not produce such dependencies. */ 1316 samep = &JF(*samep); 1317 } 1318 #ifdef notdef 1319 /* XXX This doesn't cover everything. */ 1320 for (i = 0; i < N_ATOMS; ++i) 1321 if ((*samep)->val[i] != pred->val[i]) 1322 return; 1323 #endif 1324 /* Pull up the node. */ 1325 pull = *samep; 1326 *samep = JF(pull); 1327 JF(pull) = *diffp; 1328 1329 /* 1330 * At the top of the chain, each predecessor needs to point at the 1331 * pulled up node. Inside the chain, there is only one predecessor 1332 * to worry about. 1333 */ 1334 if (at_top) { 1335 for (ep = b->in_edges; ep != 0; ep = ep->next) { 1336 if (JT(ep->pred) == b) 1337 JT(ep->pred) = pull; 1338 else 1339 JF(ep->pred) = pull; 1340 } 1341 } 1342 else 1343 *diffp = pull; 1344 1345 done = 0; 1346 } 1347 1348 static void 1349 and_pullup(struct block *b) 1350 { 1351 int val, at_top; 1352 struct block *pull; 1353 struct block **diffp, **samep; 1354 struct edge *ep; 1355 1356 ep = b->in_edges; 1357 if (ep == 0) 1358 return; 1359 1360 /* 1361 * Make sure each predecessor loads the same value. 
1362 */ 1363 val = ep->pred->val[A_ATOM]; 1364 for (ep = ep->next; ep != 0; ep = ep->next) 1365 if (val != ep->pred->val[A_ATOM]) 1366 return; 1367 1368 if (JT(b->in_edges->pred) == b) 1369 diffp = &JT(b->in_edges->pred); 1370 else 1371 diffp = &JF(b->in_edges->pred); 1372 1373 at_top = 1; 1374 while (1) { 1375 if (*diffp == 0) 1376 return; 1377 1378 if (JF(*diffp) != JF(b)) 1379 return; 1380 1381 if (!SET_MEMBER((*diffp)->dom, b->id)) 1382 return; 1383 1384 if ((*diffp)->val[A_ATOM] != val) 1385 break; 1386 1387 diffp = &JT(*diffp); 1388 at_top = 0; 1389 } 1390 samep = &JT(*diffp); 1391 while (1) { 1392 if (*samep == 0) 1393 return; 1394 1395 if (JF(*samep) != JF(b)) 1396 return; 1397 1398 if (!SET_MEMBER((*samep)->dom, b->id)) 1399 return; 1400 1401 if ((*samep)->val[A_ATOM] == val) 1402 break; 1403 1404 /* XXX Need to check that there are no data dependencies 1405 between diffp and samep. Currently, the code generator 1406 will not produce such dependencies. */ 1407 samep = &JT(*samep); 1408 } 1409 #ifdef notdef 1410 /* XXX This doesn't cover everything. */ 1411 for (i = 0; i < N_ATOMS; ++i) 1412 if ((*samep)->val[i] != pred->val[i]) 1413 return; 1414 #endif 1415 /* Pull up the node. */ 1416 pull = *samep; 1417 *samep = JT(pull); 1418 JT(pull) = *diffp; 1419 1420 /* 1421 * At the top of the chain, each predecessor needs to point at the 1422 * pulled up node. Inside the chain, there is only one predecessor 1423 * to worry about. 1424 */ 1425 if (at_top) { 1426 for (ep = b->in_edges; ep != 0; ep = ep->next) { 1427 if (JT(ep->pred) == b) 1428 JT(ep->pred) = pull; 1429 else 1430 JF(ep->pred) = pull; 1431 } 1432 } 1433 else 1434 *diffp = pull; 1435 1436 done = 0; 1437 } 1438 1439 static void 1440 opt_blks(struct block *root, int do_stmts) 1441 { 1442 int i, maxlevel; 1443 struct block *p; 1444 1445 init_val(); 1446 maxlevel = root->level; 1447 for (i = maxlevel; i >= 0; --i) 1448 for (p = levels[i]; p; p = p->link) 1449 opt_blk(p, do_stmts); 1450 1451 if (do_stmts) 1452 /* 1453 * No point trying to move branches; it can't possibly 1454 * make a difference at this point. 1455 */ 1456 return; 1457 1458 for (i = 1; i <= maxlevel; ++i) { 1459 for (p = levels[i]; p; p = p->link) { 1460 opt_j(&p->et); 1461 opt_j(&p->ef); 1462 } 1463 } 1464 for (i = 1; i <= maxlevel; ++i) { 1465 for (p = levels[i]; p; p = p->link) { 1466 or_pullup(p); 1467 and_pullup(p); 1468 } 1469 } 1470 } 1471 1472 static __inline void 1473 link_inedge(struct edge *parent, struct block *child) 1474 { 1475 parent->next = child->in_edges; 1476 child->in_edges = parent; 1477 } 1478 1479 static void 1480 find_inedges(struct block *root) 1481 { 1482 int i; 1483 struct block *b; 1484 1485 for (i = 0; i < n_blocks; ++i) 1486 blocks[i]->in_edges = 0; 1487 1488 /* 1489 * Traverse the graph, adding each edge to the predecessor 1490 * list of its successors. Skip the leaves (i.e. level 0). 1491 */ 1492 for (i = root->level; i > 0; --i) { 1493 for (b = levels[i]; b != 0; b = b->link) { 1494 link_inedge(&b->et, JT(b)); 1495 link_inedge(&b->ef, JF(b)); 1496 } 1497 } 1498 } 1499 1500 static void 1501 opt_root(struct block **b) 1502 { 1503 struct slist *tmp, *s; 1504 1505 s = (*b)->stmts; 1506 (*b)->stmts = 0; 1507 while (BPF_CLASS((*b)->s.code) == BPF_JMP && JT(*b) == JF(*b)) 1508 *b = JT(*b); 1509 1510 tmp = (*b)->stmts; 1511 if (tmp != 0) 1512 sappend(s, tmp); 1513 (*b)->stmts = s; 1514 1515 /* 1516 * If the root node is a return, then there is no 1517 * point executing any statements (since the bpf machine 1518 * has no side effects). 
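	 *
	 * For example, a filter that the optimizer has reduced to a constant
	 * decision ends up here as a lone 'ret' block, and any loads still
	 * sitting in front of it are simply dropped.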
1519 */ 1520 if (BPF_CLASS((*b)->s.code) == BPF_RET) 1521 (*b)->stmts = 0; 1522 } 1523 1524 static void 1525 opt_loop(struct block *root, int do_stmts) 1526 { 1527 1528 #ifdef BDEBUG 1529 if (dflag > 1) 1530 opt_dump(root); 1531 #endif 1532 do { 1533 done = 1; 1534 find_levels(root); 1535 find_dom(root); 1536 find_closure(root); 1537 find_inedges(root); 1538 find_ud(root); 1539 find_edom(root); 1540 opt_blks(root, do_stmts); 1541 #ifdef BDEBUG 1542 if (dflag > 1) 1543 opt_dump(root); 1544 #endif 1545 } while (!done); 1546 } 1547 1548 /* 1549 * Optimize the filter code in its dag representation. 1550 */ 1551 void 1552 bpf_optimize(struct block **rootp) 1553 { 1554 struct block *root; 1555 1556 root = *rootp; 1557 1558 opt_init(root); 1559 opt_loop(root, 0); 1560 opt_loop(root, 1); 1561 intern_blocks(root); 1562 opt_root(rootp); 1563 opt_cleanup(); 1564 } 1565 1566 static void 1567 make_marks(struct block *p) 1568 { 1569 if (!isMarked(p)) { 1570 Mark(p); 1571 if (BPF_CLASS(p->s.code) != BPF_RET) { 1572 make_marks(JT(p)); 1573 make_marks(JF(p)); 1574 } 1575 } 1576 } 1577 1578 /* 1579 * Mark code array such that isMarked(i) is true 1580 * only for nodes that are alive. 1581 */ 1582 static void 1583 mark_code(struct block *p) 1584 { 1585 cur_mark += 1; 1586 make_marks(p); 1587 } 1588 1589 /* 1590 * True iff the two stmt lists load the same value from the packet into 1591 * the accumulator. 1592 */ 1593 static int 1594 eq_slist(struct slist *x, struct slist *y) 1595 { 1596 while (1) { 1597 while (x && x->s.code == NOP) 1598 x = x->next; 1599 while (y && y->s.code == NOP) 1600 y = y->next; 1601 if (x == 0) 1602 return y == 0; 1603 if (y == 0) 1604 return x == 0; 1605 if (x->s.code != y->s.code || x->s.k != y->s.k) 1606 return 0; 1607 x = x->next; 1608 y = y->next; 1609 } 1610 } 1611 1612 static __inline int 1613 eq_blk(struct block *b0, struct block *b1) 1614 { 1615 if (b0->s.code == b1->s.code && 1616 b0->s.k == b1->s.k && 1617 b0->et.succ == b1->et.succ && 1618 b0->ef.succ == b1->ef.succ) 1619 return eq_slist(b0->stmts, b1->stmts); 1620 return 0; 1621 } 1622 1623 static void 1624 intern_blocks(struct block *root) 1625 { 1626 struct block *p; 1627 int i, j; 1628 int done; 1629 top: 1630 done = 1; 1631 for (i = 0; i < n_blocks; ++i) 1632 blocks[i]->link = 0; 1633 1634 mark_code(root); 1635 1636 for (i = n_blocks - 1; --i >= 0; ) { 1637 if (!isMarked(blocks[i])) 1638 continue; 1639 for (j = i + 1; j < n_blocks; ++j) { 1640 if (!isMarked(blocks[j])) 1641 continue; 1642 if (eq_blk(blocks[i], blocks[j])) { 1643 blocks[i]->link = blocks[j]->link ? 1644 blocks[j]->link : blocks[j]; 1645 break; 1646 } 1647 } 1648 } 1649 for (i = 0; i < n_blocks; ++i) { 1650 p = blocks[i]; 1651 if (JT(p) == 0) 1652 continue; 1653 if (JT(p)->link) { 1654 done = 0; 1655 JT(p) = JT(p)->link; 1656 } 1657 if (JF(p)->link) { 1658 done = 0; 1659 JF(p) = JF(p)->link; 1660 } 1661 } 1662 if (!done) 1663 goto top; 1664 } 1665 1666 static void 1667 opt_cleanup(void) 1668 { 1669 free((void *)vnode_base); 1670 free((void *)vmap); 1671 free((void *)edges); 1672 free((void *)space1); 1673 free((void *)space2); 1674 free((void *)levels); 1675 free((void *)blocks); 1676 } 1677 1678 /* 1679 * Return the number of stmts in 's'. 1680 */ 1681 static int 1682 slength(struct slist *s) 1683 { 1684 int n = 0; 1685 1686 for (; s; s = s->next) 1687 if (s->s.code != NOP) 1688 ++n; 1689 return n; 1690 } 1691 1692 /* 1693 * Return the number of nodes reachable by 'p'. 1694 * All nodes should be initially unmarked. 
 */
static int
count_blocks(struct block *p)
{
	if (p == 0 || isMarked(p))
		return 0;
	Mark(p);
	return count_blocks(JT(p)) + count_blocks(JF(p)) + 1;
}

/*
 * Do a depth first search on the flow graph, numbering the
 * basic blocks and entering them into the 'blocks' array.
 */
static void
number_blks_r(struct block *p)
{
	int n;

	if (p == 0 || isMarked(p))
		return;

	Mark(p);
	n = n_blocks++;
	p->id = n;
	blocks[n] = p;

	number_blks_r(JT(p));
	number_blks_r(JF(p));
}

/*
 * Return the number of stmts in the flowgraph reachable by 'p'.
 * The nodes should be unmarked before calling.
 */
static int
count_stmts(struct block *p)
{
	int n;

	if (p == 0 || isMarked(p))
		return 0;
	Mark(p);
	n = count_stmts(JT(p)) + count_stmts(JF(p));
	return slength(p->stmts) + n + 1 + p->longjt + p->longjf;
}

/*
 * Allocate memory.  All allocation is done before optimization
 * is begun.  A linear bound on the size of all data structures is computed
 * from the total number of blocks and/or statements.
 */
static void
opt_init(struct block *root)
{
	bpf_u_int32 *p;
	int i, n, max_stmts;
	size_t size1, size2;

	/*
	 * First, count the blocks, so we can malloc an array to map
	 * block number to block.  Then, put the blocks into the array.
	 */
	unMarkAll();
	n = count_blocks(root);
	blocks = reallocarray(NULL, n, sizeof(*blocks));
	if (blocks == NULL)
		bpf_error("malloc");

	unMarkAll();
	n_blocks = 0;
	number_blks_r(root);

	n_edges = 2 * n_blocks;
	edges = reallocarray(NULL, n_edges, sizeof(*edges));
	if (edges == NULL)
		bpf_error("malloc");

	/*
	 * The number of levels is bounded by the number of nodes.
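	 * (Each level in use contains at least one distinct block, so a
	 * graph with n_blocks blocks has at most n_blocks levels.)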
1775 */ 1776 levels = reallocarray(NULL, n_blocks, sizeof(*levels)); 1777 if (levels == NULL) 1778 bpf_error("malloc"); 1779 1780 edgewords = n_edges / (8 * sizeof(bpf_u_int32)) + 1; 1781 nodewords = n_blocks / (8 * sizeof(bpf_u_int32)) + 1; 1782 1783 size1 = 2; 1784 if (n_blocks > SIZE_MAX / size1) 1785 goto fail1; 1786 size1 *= n_blocks; 1787 if (nodewords > SIZE_MAX / size1) 1788 goto fail1; 1789 size1 *= nodewords; 1790 if (sizeof(*space1) > SIZE_MAX / size1) 1791 goto fail1; 1792 size1 *= sizeof(*space1); 1793 1794 space1 = (bpf_u_int32 *)malloc(size1); 1795 if (space1 == NULL) { 1796 fail1: 1797 bpf_error("malloc"); 1798 } 1799 1800 size2 = n_edges; 1801 if (edgewords > SIZE_MAX / size2) 1802 goto fail2; 1803 size2 *= edgewords; 1804 if (sizeof(*space2) > SIZE_MAX / size2) 1805 goto fail2; 1806 size2 *= sizeof(*space2); 1807 1808 space2 = (bpf_u_int32 *)malloc(size2); 1809 if (space2 == NULL) { 1810 fail2: 1811 free(space1); 1812 bpf_error("malloc"); 1813 } 1814 1815 p = space1; 1816 all_dom_sets = p; 1817 for (i = 0; i < n; ++i) { 1818 blocks[i]->dom = p; 1819 p += nodewords; 1820 } 1821 all_closure_sets = p; 1822 for (i = 0; i < n; ++i) { 1823 blocks[i]->closure = p; 1824 p += nodewords; 1825 } 1826 p = space2; 1827 all_edge_sets = p; 1828 for (i = 0; i < n; ++i) { 1829 struct block *b = blocks[i]; 1830 1831 b->et.edom = p; 1832 p += edgewords; 1833 b->ef.edom = p; 1834 p += edgewords; 1835 b->et.id = i; 1836 edges[i] = &b->et; 1837 b->ef.id = n_blocks + i; 1838 edges[n_blocks + i] = &b->ef; 1839 b->et.pred = b; 1840 b->ef.pred = b; 1841 } 1842 max_stmts = 0; 1843 for (i = 0; i < n; ++i) 1844 max_stmts += slength(blocks[i]->stmts) + 1; 1845 /* 1846 * We allocate at most 3 value numbers per statement, 1847 * so this is an upper bound on the number of valnodes 1848 * we'll need. 1849 */ 1850 maxval = 3 * max_stmts; 1851 vmap = reallocarray(NULL, maxval, sizeof(*vmap)); 1852 vnode_base = reallocarray(NULL, maxval, sizeof(*vnode_base)); 1853 if (vmap == NULL || vnode_base == NULL) 1854 bpf_error("malloc"); 1855 } 1856 1857 /* 1858 * Some pointers used to convert the basic block form of the code, 1859 * into the array form that BPF requires. 'fstart' will point to 1860 * the malloc'd array while 'ftail' is used during the recursive traversal. 1861 */ 1862 static struct bpf_insn *fstart; 1863 static struct bpf_insn *ftail; 1864 1865 #ifdef BDEBUG 1866 int bids[1000]; 1867 #endif 1868 1869 /* 1870 * Returns true if successful. Returns false if a branch has 1871 * an offset that is too large. If so, we have marked that 1872 * branch so that on a subsequent iteration, it will be treated 1873 * properly. 
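 *
 * Concretely (see icode_to_fcode() below): when a conditional branch
 * would need a jump offset of 256 or more, the block's longjt/longjf
 * count is bumped and the conversion is abandoned; the next pass then
 * leaves room for an extra unconditional BPF_JA that the short
 * conditional branch targets instead.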
1874 */ 1875 static int 1876 convert_code_r(struct block *p) 1877 { 1878 struct bpf_insn *dst; 1879 struct slist *src; 1880 int slen; 1881 u_int off; 1882 int extrajmps; /* number of extra jumps inserted */ 1883 struct slist **offset = NULL; 1884 1885 if (p == 0 || isMarked(p)) 1886 return (1); 1887 Mark(p); 1888 1889 if (convert_code_r(JF(p)) == 0) 1890 return (0); 1891 if (convert_code_r(JT(p)) == 0) 1892 return (0); 1893 1894 slen = slength(p->stmts); 1895 dst = ftail -= (slen + 1 + p->longjt + p->longjf); 1896 /* inflate length by any extra jumps */ 1897 1898 p->offset = dst - fstart; 1899 1900 /* generate offset[] for convenience */ 1901 if (slen) { 1902 offset = calloc(slen, sizeof(struct slist *)); 1903 if (!offset) { 1904 bpf_error("not enough core"); 1905 /*NOTREACHED*/ 1906 } 1907 } 1908 src = p->stmts; 1909 for (off = 0; off < slen && src; off++) { 1910 #if 0 1911 printf("off=%d src=%x\n", off, src); 1912 #endif 1913 offset[off] = src; 1914 src = src->next; 1915 } 1916 1917 off = 0; 1918 for (src = p->stmts; src; src = src->next) { 1919 if (src->s.code == NOP) 1920 continue; 1921 dst->code = (u_short)src->s.code; 1922 dst->k = src->s.k; 1923 1924 /* fill block-local relative jump */ 1925 if (BPF_CLASS(src->s.code) != BPF_JMP || src->s.code == (BPF_JMP|BPF_JA)) { 1926 #if 0 1927 if (src->s.jt || src->s.jf) { 1928 bpf_error("illegal jmp destination"); 1929 /*NOTREACHED*/ 1930 } 1931 #endif 1932 goto filled; 1933 } 1934 if (off == slen - 2) /*???*/ 1935 goto filled; 1936 1937 { 1938 int i; 1939 int jt, jf; 1940 static const char ljerr[] = 1941 "%s for block-local relative jump: off=%d"; 1942 1943 #if 0 1944 printf("code=%x off=%d %x %x\n", src->s.code, 1945 off, src->s.jt, src->s.jf); 1946 #endif 1947 1948 if (!src->s.jt || !src->s.jf) { 1949 bpf_error(ljerr, "no jmp destination", off); 1950 /*NOTREACHED*/ 1951 } 1952 1953 jt = jf = 0; 1954 for (i = 0; i < slen; i++) { 1955 if (offset[i] == src->s.jt) { 1956 if (jt) { 1957 bpf_error(ljerr, "multiple matches", off); 1958 /*NOTREACHED*/ 1959 } 1960 1961 dst->jt = i - off - 1; 1962 jt++; 1963 } 1964 if (offset[i] == src->s.jf) { 1965 if (jf) { 1966 bpf_error(ljerr, "multiple matches", off); 1967 /*NOTREACHED*/ 1968 } 1969 dst->jf = i - off - 1; 1970 jf++; 1971 } 1972 } 1973 if (!jt || !jf) { 1974 bpf_error(ljerr, "no destination found", off); 1975 /*NOTREACHED*/ 1976 } 1977 } 1978 filled: 1979 ++dst; 1980 ++off; 1981 } 1982 free(offset); 1983 1984 #ifdef BDEBUG 1985 bids[dst - fstart] = p->id + 1; 1986 #endif 1987 dst->code = (u_short)p->s.code; 1988 dst->k = p->s.k; 1989 if (JT(p)) { 1990 extrajmps = 0; 1991 off = JT(p)->offset - (p->offset + slen) - 1; 1992 if (off >= 256) { 1993 /* offset too large for branch, must add a jump */ 1994 if (p->longjt == 0) { 1995 /* mark this instruction and retry */ 1996 p->longjt++; 1997 return(0); 1998 } 1999 /* branch if T to following jump */ 2000 dst->jt = extrajmps; 2001 extrajmps++; 2002 dst[extrajmps].code = BPF_JMP|BPF_JA; 2003 dst[extrajmps].k = off - extrajmps; 2004 } 2005 else 2006 dst->jt = off; 2007 off = JF(p)->offset - (p->offset + slen) - 1; 2008 if (off >= 256) { 2009 /* offset too large for branch, must add a jump */ 2010 if (p->longjf == 0) { 2011 /* mark this instruction and retry */ 2012 p->longjf++; 2013 return(0); 2014 } 2015 /* branch if F to following jump */ 2016 /* if two jumps are inserted, F goes to second one */ 2017 dst->jf = extrajmps; 2018 extrajmps++; 2019 dst[extrajmps].code = BPF_JMP|BPF_JA; 2020 dst[extrajmps].k = off - extrajmps; 2021 } 2022 else 2023 dst->jf = 
			    off;
	}
	return (1);
}


/*
 * Convert flowgraph intermediate representation to the
 * BPF array representation.  Set *lenp to the number of instructions.
 */
struct bpf_insn *
icode_to_fcode(struct block *root, int *lenp)
{
	int n;
	struct bpf_insn *fp;

	/*
	 * Loop doing convert_code_r() until no branches remain
	 * with too-large offsets.
	 */
	while (1) {
		unMarkAll();
		n = *lenp = count_stmts(root);

		fp = calloc(n, sizeof(*fp));
		if (fp == NULL)
			bpf_error("calloc");

		fstart = fp;
		ftail = fp + n;

		unMarkAll();
		if (convert_code_r(root))
			break;
		free(fp);
	}

	return fp;
}

#ifdef BDEBUG
static void
opt_dump(struct block *root)
{
	struct bpf_program f;

	memset(bids, 0, sizeof bids);
	f.bf_insns = icode_to_fcode(root, &f.bf_len);
	bpf_dump(&f, 1);
	putchar('\n');
	free((char *)f.bf_insns);
}
#endif
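
#if 0
/*
 * Illustrative usage sketch, kept under #if 0 and never compiled: roughly
 * how the two entry points in this file are driven.  The function name
 * "example_optimize_and_emit" is made up for illustration; the caller owns
 * the returned instruction array and frees it when done.
 */
static void
example_optimize_and_emit(struct block **rootp)
{
	struct bpf_program prog;
	int len;

	bpf_optimize(rootp);		/* optimize the block DAG in place */
	prog.bf_insns = icode_to_fcode(*rootp, &len);
	prog.bf_len = len;
	/* ... install or dump prog here ... */
	free((char *)prog.bf_insns);
}
#endif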