xref: /openbsd-src/lib/libpcap/optimize.c (revision ce7e0fc6a9d74d25b78fb6ad846387717f5172b6)
1 /*	$OpenBSD: optimize.c,v 1.7 2000/04/26 21:25:53 jakob Exp $	*/
2 
3 /*
4  * Copyright (c) 1988, 1989, 1990, 1991, 1993, 1994, 1995, 1996
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that: (1) source code distributions
9  * retain the above copyright notice and this paragraph in its entirety, (2)
10  * distributions including binary code include the above copyright notice and
11  * this paragraph in its entirety in the documentation or other materials
12  * provided with the distribution, and (3) all advertising materials mentioning
13  * features or use of this software display the following acknowledgement:
14  * ``This product includes software developed by the University of California,
15  * Lawrence Berkeley Laboratory and its contributors.'' Neither the name of
16  * the University nor the names of its contributors may be used to endorse
17  * or promote products derived from this software without specific prior
18  * written permission.
19  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
20  * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
21  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
22  *
23  *  Optimization module for tcpdump intermediate representation.
24  */
25 #ifndef lint
26 static const char rcsid[] =
27     "@(#) $Header: /home/cvs/src/lib/libpcap/optimize.c,v 1.7 2000/04/26 21:25:53 jakob Exp $ (LBL)";
28 #endif
29 
30 #include <sys/types.h>
31 #include <sys/time.h>
32 
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <memory.h>
36 
37 #include "pcap-int.h"
38 
39 #include "gencode.h"
40 
41 #ifdef HAVE_OS_PROTO_H
42 #include "os-proto.h"
43 #endif
44 
45 #ifdef BDEBUG
46 extern int dflag;
47 #endif
48 
49 #define A_ATOM BPF_MEMWORDS
50 #define X_ATOM (BPF_MEMWORDS+1)
51 
52 #define NOP -1
53 
54 /*
55  * This define is used to represent *both* the accumulator and
56  * x register in use-def computations.
57  * Currently, the use-def code assumes only one definition per instruction.
58  */
59 #define AX_ATOM N_ATOMS
60 
61 /*
62  * A flag to indicate that further optimization is needed.
63  * Iterative passes are continued until a given pass yields no
64  * branch movement.
65  */
66 static int done;
67 
68 /*
69  * A block is marked if only if its mark equals the current mark.
70  * Rather than traverse the code array, marking each item, 'cur_mark' is
71  * incremented.  This automatically makes each element unmarked.
72  */
73 static int cur_mark;
74 #define isMarked(p) ((p)->mark == cur_mark)
75 #define unMarkAll() cur_mark += 1
76 #define Mark(p) ((p)->mark = cur_mark)
77 
78 static void opt_init(struct block *);
79 static void opt_cleanup(void);
80 
81 static void make_marks(struct block *);
82 static void mark_code(struct block *);
83 
84 static void intern_blocks(struct block *);
85 
86 static int eq_slist(struct slist *, struct slist *);
87 
88 static void find_levels_r(struct block *);
89 
90 static void find_levels(struct block *);
91 static void find_dom(struct block *);
92 static void propedom(struct edge *);
93 static void find_edom(struct block *);
94 static void find_closure(struct block *);
95 static int atomuse(struct stmt *);
96 static int atomdef(struct stmt *);
97 static void compute_local_ud(struct block *);
98 static void find_ud(struct block *);
99 static void init_val(void);
100 static int F(int, int, int);
101 static __inline void vstore(struct stmt *, int *, int, int);
102 static void opt_blk(struct block *, int);
103 static int use_conflict(struct block *, struct block *);
104 static void opt_j(struct edge *);
105 static void or_pullup(struct block *);
106 static void and_pullup(struct block *);
107 static void opt_blks(struct block *, int);
108 static __inline void link_inedge(struct edge *, struct block *);
109 static void find_inedges(struct block *);
110 static void opt_root(struct block **);
111 static void opt_loop(struct block *, int);
112 static void fold_op(struct stmt *, int, int);
113 static __inline struct slist *this_op(struct slist *);
114 static void opt_not(struct block *);
115 static void opt_peep(struct block *);
116 static void opt_stmt(struct stmt *, int[], int);
117 static void deadstmt(struct stmt *, struct stmt *[]);
118 static void opt_deadstores(struct block *);
119 static void opt_blk(struct block *, int);
120 static int use_conflict(struct block *, struct block *);
121 static void opt_j(struct edge *);
122 static struct block *fold_edge(struct block *, struct edge *);
123 static __inline int eq_blk(struct block *, struct block *);
124 static int slength(struct slist *);
125 static int count_blocks(struct block *);
126 static void number_blks_r(struct block *);
127 static int count_stmts(struct block *);
128 static int convert_code_r(struct block *);
129 #ifdef BDEBUG
130 static void opt_dump(struct block *);
131 #endif
132 
133 static int n_blocks;
134 struct block **blocks;
135 static int n_edges;
136 struct edge **edges;
137 
138 /*
139  * A bit vector set representation of the dominators.
140  * We round up the set size to the next power of two.
141  */
142 static int nodewords;
143 static int edgewords;
144 struct block **levels;
145 bpf_u_int32 *space;
146 #define BITS_PER_WORD (8*sizeof(bpf_u_int32))
147 /*
148  * True if a is in uset {p}
149  */
150 #define SET_MEMBER(p, a) \
151 ((p)[(unsigned)(a) / BITS_PER_WORD] & (1 << ((unsigned)(a) % BITS_PER_WORD)))
152 
153 /*
154  * Add 'a' to uset p.
155  */
156 #define SET_INSERT(p, a) \
157 (p)[(unsigned)(a) / BITS_PER_WORD] |= (1 << ((unsigned)(a) % BITS_PER_WORD))
158 
159 /*
160  * Delete 'a' from uset p.
161  */
162 #define SET_DELETE(p, a) \
163 (p)[(unsigned)(a) / BITS_PER_WORD] &= ~(1 << ((unsigned)(a) % BITS_PER_WORD))
164 
165 /*
166  * a := a intersect b
167  */
168 #define SET_INTERSECT(a, b, n)\
169 {\
170 	register bpf_u_int32 *_x = a, *_y = b;\
171 	register int _n = n;\
172 	while (--_n >= 0) *_x++ &= *_y++;\
173 }
174 
175 /*
176  * a := a - b
177  */
178 #define SET_SUBTRACT(a, b, n)\
179 {\
180 	register bpf_u_int32 *_x = a, *_y = b;\
181 	register int _n = n;\
182 	while (--_n >= 0) *_x++ &=~ *_y++;\
183 }
184 
185 /*
186  * a := a union b
187  */
188 #define SET_UNION(a, b, n)\
189 {\
190 	register bpf_u_int32 *_x = a, *_y = b;\
191 	register int _n = n;\
192 	while (--_n >= 0) *_x++ |= *_y++;\
193 }
194 
195 static uset all_dom_sets;
196 static uset all_closure_sets;
197 static uset all_edge_sets;
198 
199 #ifndef MAX
200 #define MAX(a,b) ((a)>(b)?(a):(b))
201 #endif
202 
203 static void
204 find_levels_r(b)
205 	struct block *b;
206 {
207 	int level;
208 
209 	if (isMarked(b))
210 		return;
211 
212 	Mark(b);
213 	b->link = 0;
214 
215 	if (JT(b)) {
216 		find_levels_r(JT(b));
217 		find_levels_r(JF(b));
218 		level = MAX(JT(b)->level, JF(b)->level) + 1;
219 	} else
220 		level = 0;
221 	b->level = level;
222 	b->link = levels[level];
223 	levels[level] = b;
224 }
225 
226 /*
227  * Level graph.  The levels go from 0 at the leaves to
228  * N_LEVELS at the root.  The levels[] array points to the
229  * first node of the level list, whose elements are linked
230  * with the 'link' field of the struct block.
231  */
232 static void
233 find_levels(root)
234 	struct block *root;
235 {
236 	memset((char *)levels, 0, n_blocks * sizeof(*levels));
237 	unMarkAll();
238 	find_levels_r(root);
239 }
240 
241 /*
242  * Find dominator relationships.
243  * Assumes graph has been leveled.
244  */
245 static void
246 find_dom(root)
247 	struct block *root;
248 {
249 	int i;
250 	struct block *b;
251 	bpf_u_int32 *x;
252 
253 	/*
254 	 * Initialize sets to contain all nodes.
255 	 */
256 	x = all_dom_sets;
257 	i = n_blocks * nodewords;
258 	while (--i >= 0)
259 		*x++ = ~0;
260 	/* Root starts off empty. */
261 	for (i = nodewords; --i >= 0;)
262 		root->dom[i] = 0;
263 
264 	/* root->level is the highest level no found. */
265 	for (i = root->level; i >= 0; --i) {
266 		for (b = levels[i]; b; b = b->link) {
267 			SET_INSERT(b->dom, b->id);
268 			if (JT(b) == 0)
269 				continue;
270 			SET_INTERSECT(JT(b)->dom, b->dom, nodewords);
271 			SET_INTERSECT(JF(b)->dom, b->dom, nodewords);
272 		}
273 	}
274 }
275 
276 static void
277 propedom(ep)
278 	struct edge *ep;
279 {
280 	SET_INSERT(ep->edom, ep->id);
281 	if (ep->succ) {
282 		SET_INTERSECT(ep->succ->et.edom, ep->edom, edgewords);
283 		SET_INTERSECT(ep->succ->ef.edom, ep->edom, edgewords);
284 	}
285 }
286 
287 /*
288  * Compute edge dominators.
289  * Assumes graph has been leveled and predecessors established.
290  */
291 static void
292 find_edom(root)
293 	struct block *root;
294 {
295 	int i;
296 	uset x;
297 	struct block *b;
298 
299 	x = all_edge_sets;
300 	for (i = n_edges * edgewords; --i >= 0; )
301 		x[i] = ~0;
302 
303 	/* root->level is the highest level no found. */
304 	memset(root->et.edom, 0, edgewords * sizeof(*(uset)0));
305 	memset(root->ef.edom, 0, edgewords * sizeof(*(uset)0));
306 	for (i = root->level; i >= 0; --i) {
307 		for (b = levels[i]; b != 0; b = b->link) {
308 			propedom(&b->et);
309 			propedom(&b->ef);
310 		}
311 	}
312 }
313 
314 /*
315  * Find the backwards transitive closure of the flow graph.  These sets
316  * are backwards in the sense that we find the set of nodes that reach
317  * a given node, not the set of nodes that can be reached by a node.
318  *
319  * Assumes graph has been leveled.
320  */
321 static void
322 find_closure(root)
323 	struct block *root;
324 {
325 	int i;
326 	struct block *b;
327 
328 	/*
329 	 * Initialize sets to contain no nodes.
330 	 */
331 	memset((char *)all_closure_sets, 0,
332 	      n_blocks * nodewords * sizeof(*all_closure_sets));
333 
334 	/* root->level is the highest level no found. */
335 	for (i = root->level; i >= 0; --i) {
336 		for (b = levels[i]; b; b = b->link) {
337 			SET_INSERT(b->closure, b->id);
338 			if (JT(b) == 0)
339 				continue;
340 			SET_UNION(JT(b)->closure, b->closure, nodewords);
341 			SET_UNION(JF(b)->closure, b->closure, nodewords);
342 		}
343 	}
344 }
345 
346 /*
347  * Return the register number that is used by s.  If A and X are both
348  * used, return AX_ATOM.  If no register is used, return -1.
349  *
350  * The implementation should probably change to an array access.
351  */
352 static int
353 atomuse(s)
354 	struct stmt *s;
355 {
356 	register int c = s->code;
357 
358 	if (c == NOP)
359 		return -1;
360 
361 	switch (BPF_CLASS(c)) {
362 
363 	case BPF_RET:
364 		return (BPF_RVAL(c) == BPF_A) ? A_ATOM :
365 			(BPF_RVAL(c) == BPF_X) ? X_ATOM : -1;
366 
367 	case BPF_LD:
368 	case BPF_LDX:
369 		return (BPF_MODE(c) == BPF_IND) ? X_ATOM :
370 			(BPF_MODE(c) == BPF_MEM) ? s->k : -1;
371 
372 	case BPF_ST:
373 		return A_ATOM;
374 
375 	case BPF_STX:
376 		return X_ATOM;
377 
378 	case BPF_JMP:
379 	case BPF_ALU:
380 		if (BPF_SRC(c) == BPF_X)
381 			return AX_ATOM;
382 		return A_ATOM;
383 
384 	case BPF_MISC:
385 		return BPF_MISCOP(c) == BPF_TXA ? X_ATOM : A_ATOM;
386 	}
387 	abort();
388 	/* NOTREACHED */
389 }
390 
391 /*
392  * Return the register number that is defined by 's'.  We assume that
393  * a single stmt cannot define more than one register.  If no register
394  * is defined, return -1.
395  *
396  * The implementation should probably change to an array access.
397  */
398 static int
399 atomdef(s)
400 	struct stmt *s;
401 {
402 	if (s->code == NOP)
403 		return -1;
404 
405 	switch (BPF_CLASS(s->code)) {
406 
407 	case BPF_LD:
408 	case BPF_ALU:
409 		return A_ATOM;
410 
411 	case BPF_LDX:
412 		return X_ATOM;
413 
414 	case BPF_ST:
415 	case BPF_STX:
416 		return s->k;
417 
418 	case BPF_MISC:
419 		return BPF_MISCOP(s->code) == BPF_TAX ? X_ATOM : A_ATOM;
420 	}
421 	return -1;
422 }
423 
424 static void
425 compute_local_ud(b)
426 	struct block *b;
427 {
428 	struct slist *s;
429 	atomset def = 0, use = 0, kill = 0;
430 	int atom;
431 
432 	for (s = b->stmts; s; s = s->next) {
433 		if (s->s.code == NOP)
434 			continue;
435 		atom = atomuse(&s->s);
436 		if (atom >= 0) {
437 			if (atom == AX_ATOM) {
438 				if (!ATOMELEM(def, X_ATOM))
439 					use |= ATOMMASK(X_ATOM);
440 				if (!ATOMELEM(def, A_ATOM))
441 					use |= ATOMMASK(A_ATOM);
442 			}
443 			else if (atom < N_ATOMS) {
444 				if (!ATOMELEM(def, atom))
445 					use |= ATOMMASK(atom);
446 			}
447 			else
448 				abort();
449 		}
450 		atom = atomdef(&s->s);
451 		if (atom >= 0) {
452 			if (!ATOMELEM(use, atom))
453 				kill |= ATOMMASK(atom);
454 			def |= ATOMMASK(atom);
455 		}
456 	}
457 	if (!ATOMELEM(def, A_ATOM) && BPF_CLASS(b->s.code) == BPF_JMP)
458 		use |= ATOMMASK(A_ATOM);
459 
460 	b->def = def;
461 	b->kill = kill;
462 	b->in_use = use;
463 }
464 
465 /*
466  * Assume graph is already leveled.
467  */
468 static void
469 find_ud(root)
470 	struct block *root;
471 {
472 	int i, maxlevel;
473 	struct block *p;
474 
475 	/*
476 	 * root->level is the highest level no found;
477 	 * count down from there.
478 	 */
479 	maxlevel = root->level;
480 	for (i = maxlevel; i >= 0; --i)
481 		for (p = levels[i]; p; p = p->link) {
482 			compute_local_ud(p);
483 			p->out_use = 0;
484 		}
485 
486 	for (i = 1; i <= maxlevel; ++i) {
487 		for (p = levels[i]; p; p = p->link) {
488 			p->out_use |= JT(p)->in_use | JF(p)->in_use;
489 			p->in_use |= p->out_use &~ p->kill;
490 		}
491 	}
492 }
493 
494 /*
495  * These data structures are used in a Cocke and Shwarz style
496  * value numbering scheme.  Since the flowgraph is acyclic,
497  * exit values can be propagated from a node's predecessors
498  * provided it is uniquely defined.
499  */
500 struct valnode {
501 	int code;
502 	int v0, v1;
503 	int val;
504 	struct valnode *next;
505 };
506 
507 #define MODULUS 213
508 static struct valnode *hashtbl[MODULUS];
509 static int curval;
510 static int maxval;
511 
512 /* Integer constants mapped with the load immediate opcode. */
513 #define K(i) F(BPF_LD|BPF_IMM|BPF_W, i, 0L)
514 
515 struct vmapinfo {
516 	int is_const;
517 	bpf_int32 const_val;
518 };
519 
520 struct vmapinfo *vmap;
521 struct valnode *vnode_base;
522 struct valnode *next_vnode;
523 
524 static void
525 init_val()
526 {
527 	curval = 0;
528 	next_vnode = vnode_base;
529 	memset((char *)vmap, 0, maxval * sizeof(*vmap));
530 	memset((char *)hashtbl, 0, sizeof hashtbl);
531 }
532 
533 /* Because we really don't have an IR, this stuff is a little messy. */
534 static int
535 F(code, v0, v1)
536 	int code;
537 	int v0, v1;
538 {
539 	u_int hash;
540 	int val;
541 	struct valnode *p;
542 
543 	hash = (u_int)code ^ (v0 << 4) ^ (v1 << 8);
544 	hash %= MODULUS;
545 
546 	for (p = hashtbl[hash]; p; p = p->next)
547 		if (p->code == code && p->v0 == v0 && p->v1 == v1)
548 			return p->val;
549 
550 	val = ++curval;
551 	if (BPF_MODE(code) == BPF_IMM &&
552 	    (BPF_CLASS(code) == BPF_LD || BPF_CLASS(code) == BPF_LDX)) {
553 		vmap[val].const_val = v0;
554 		vmap[val].is_const = 1;
555 	}
556 	p = next_vnode++;
557 	p->val = val;
558 	p->code = code;
559 	p->v0 = v0;
560 	p->v1 = v1;
561 	p->next = hashtbl[hash];
562 	hashtbl[hash] = p;
563 
564 	return val;
565 }
566 
567 static __inline void
568 vstore(s, valp, newval, alter)
569 	struct stmt *s;
570 	int *valp;
571 	int newval;
572 	int alter;
573 {
574 	if (alter && *valp == newval)
575 		s->code = NOP;
576 	else
577 		*valp = newval;
578 }
579 
580 static void
581 fold_op(s, v0, v1)
582 	struct stmt *s;
583 	int v0, v1;
584 {
585 	bpf_int32 a, b;
586 
587 	a = vmap[v0].const_val;
588 	b = vmap[v1].const_val;
589 
590 	switch (BPF_OP(s->code)) {
591 	case BPF_ADD:
592 		a += b;
593 		break;
594 
595 	case BPF_SUB:
596 		a -= b;
597 		break;
598 
599 	case BPF_MUL:
600 		a *= b;
601 		break;
602 
603 	case BPF_DIV:
604 		if (b == 0)
605 			bpf_error("division by zero");
606 		a /= b;
607 		break;
608 
609 	case BPF_AND:
610 		a &= b;
611 		break;
612 
613 	case BPF_OR:
614 		a |= b;
615 		break;
616 
617 	case BPF_LSH:
618 		a <<= b;
619 		break;
620 
621 	case BPF_RSH:
622 		a >>= b;
623 		break;
624 
625 	case BPF_NEG:
626 		a = -a;
627 		break;
628 
629 	default:
630 		abort();
631 	}
632 	s->k = a;
633 	s->code = BPF_LD|BPF_IMM;
634 	done = 0;
635 }
636 
637 static __inline struct slist *
638 this_op(s)
639 	struct slist *s;
640 {
641 	while (s != 0 && s->s.code == NOP)
642 		s = s->next;
643 	return s;
644 }
645 
646 static void
647 opt_not(b)
648 	struct block *b;
649 {
650 	struct block *tmp = JT(b);
651 
652 	JT(b) = JF(b);
653 	JF(b) = tmp;
654 }
655 
656 static void
657 opt_peep(b)
658 	struct block *b;
659 {
660 	struct slist *s;
661 	struct slist *next, *last;
662 	int val;
663 
664 	s = b->stmts;
665 	if (s == 0)
666 		return;
667 
668 	last = s;
669 	while (1) {
670 		s = this_op(s);
671 		if (s == 0)
672 			break;
673 		next = this_op(s->next);
674 		if (next == 0)
675 			break;
676 		last = next;
677 
678 		/*
679 		 * st  M[k]	-->	st  M[k]
680 		 * ldx M[k]		tax
681 		 */
682 		if (s->s.code == BPF_ST &&
683 		    next->s.code == (BPF_LDX|BPF_MEM) &&
684 		    s->s.k == next->s.k) {
685 			done = 0;
686 			next->s.code = BPF_MISC|BPF_TAX;
687 		}
688 		/*
689 		 * ld  #k	-->	ldx  #k
690 		 * tax			txa
691 		 */
692 		if (s->s.code == (BPF_LD|BPF_IMM) &&
693 		    next->s.code == (BPF_MISC|BPF_TAX)) {
694 			s->s.code = BPF_LDX|BPF_IMM;
695 			next->s.code = BPF_MISC|BPF_TXA;
696 			done = 0;
697 		}
698 		/*
699 		 * This is an ugly special case, but it happens
700 		 * when you say tcp[k] or udp[k] where k is a constant.
701 		 */
702 		if (s->s.code == (BPF_LD|BPF_IMM)) {
703 			struct slist *add, *tax, *ild;
704 
705 			/*
706 			 * Check that X isn't used on exit from this
707 			 * block (which the optimizer might cause).
708 			 * We know the code generator won't generate
709 			 * any local dependencies.
710 			 */
711 			if (ATOMELEM(b->out_use, X_ATOM))
712 				break;
713 
714 			if (next->s.code != (BPF_LDX|BPF_MSH|BPF_B))
715 				add = next;
716 			else
717 				add = this_op(next->next);
718 			if (add == 0 || add->s.code != (BPF_ALU|BPF_ADD|BPF_X))
719 				break;
720 
721 			tax = this_op(add->next);
722 			if (tax == 0 || tax->s.code != (BPF_MISC|BPF_TAX))
723 				break;
724 
725 			ild = this_op(tax->next);
726 			if (ild == 0 || BPF_CLASS(ild->s.code) != BPF_LD ||
727 			    BPF_MODE(ild->s.code) != BPF_IND)
728 				break;
729 			/*
730 			 * XXX We need to check that X is not
731 			 * subsequently used.  We know we can eliminate the
732 			 * accumulator modifications since it is defined
733 			 * by the last stmt of this sequence.
734 			 *
735 			 * We want to turn this sequence:
736 			 *
737 			 * (004) ldi     #0x2		{s}
738 			 * (005) ldxms   [14]		{next}  -- optional
739 			 * (006) addx			{add}
740 			 * (007) tax			{tax}
741 			 * (008) ild     [x+0]		{ild}
742 			 *
743 			 * into this sequence:
744 			 *
745 			 * (004) nop
746 			 * (005) ldxms   [14]
747 			 * (006) nop
748 			 * (007) nop
749 			 * (008) ild     [x+2]
750 			 *
751 			 */
752 			ild->s.k += s->s.k;
753 			s->s.code = NOP;
754 			add->s.code = NOP;
755 			tax->s.code = NOP;
756 			done = 0;
757 		}
758 		s = next;
759 	}
760 	/*
761 	 * If we have a subtract to do a comparison, and the X register
762 	 * is a known constant, we can merge this value into the
763 	 * comparison.
764 	 */
765 	if (last->s.code == (BPF_ALU|BPF_SUB|BPF_X) &&
766 	    !ATOMELEM(b->out_use, A_ATOM)) {
767 		val = b->val[X_ATOM];
768 		if (vmap[val].is_const) {
769 			int op;
770 
771 			b->s.k += vmap[val].const_val;
772 			op = BPF_OP(b->s.code);
773 			if (op == BPF_JGT || op == BPF_JGE) {
774 				struct block *t = JT(b);
775 				JT(b) = JF(b);
776 				JF(b) = t;
777 				b->s.k += 0x80000000;
778 			}
779 			last->s.code = NOP;
780 			done = 0;
781 		} else if (b->s.k == 0) {
782 			/*
783 			 * sub x  ->	nop
784 			 * j  #0	j  x
785 			 */
786 			last->s.code = NOP;
787 			b->s.code = BPF_CLASS(b->s.code) | BPF_OP(b->s.code) |
788 				BPF_X;
789 			done = 0;
790 		}
791 	}
792 	/*
793 	 * Likewise, a constant subtract can be simplified.
794 	 */
795 	else if (last->s.code == (BPF_ALU|BPF_SUB|BPF_K) &&
796 		 !ATOMELEM(b->out_use, A_ATOM)) {
797 		int op;
798 
799 		b->s.k += last->s.k;
800 		last->s.code = NOP;
801 		op = BPF_OP(b->s.code);
802 		if (op == BPF_JGT || op == BPF_JGE) {
803 			struct block *t = JT(b);
804 			JT(b) = JF(b);
805 			JF(b) = t;
806 			b->s.k += 0x80000000;
807 		}
808 		done = 0;
809 	}
810 	/*
811 	 * and #k	nop
812 	 * jeq #0  ->	jset #k
813 	 */
814 	if (last->s.code == (BPF_ALU|BPF_AND|BPF_K) &&
815 	    !ATOMELEM(b->out_use, A_ATOM) && b->s.k == 0) {
816 		b->s.k = last->s.k;
817 		b->s.code = BPF_JMP|BPF_K|BPF_JSET;
818 		last->s.code = NOP;
819 		done = 0;
820 		opt_not(b);
821 	}
822 	/*
823 	 * If the accumulator is a known constant, we can compute the
824 	 * comparison result.
825 	 */
826 	val = b->val[A_ATOM];
827 	if (vmap[val].is_const && BPF_SRC(b->s.code) == BPF_K) {
828 		bpf_int32 v = vmap[val].const_val;
829 		switch (BPF_OP(b->s.code)) {
830 
831 		case BPF_JEQ:
832 			v = v == b->s.k;
833 			break;
834 
835 		case BPF_JGT:
836 			v = (unsigned)v > b->s.k;
837 			break;
838 
839 		case BPF_JGE:
840 			v = (unsigned)v >= b->s.k;
841 			break;
842 
843 		case BPF_JSET:
844 			v &= b->s.k;
845 			break;
846 
847 		default:
848 			abort();
849 		}
850 		if (JF(b) != JT(b))
851 			done = 0;
852 		if (v)
853 			JF(b) = JT(b);
854 		else
855 			JT(b) = JF(b);
856 	}
857 }
858 
859 /*
860  * Compute the symbolic value of expression of 's', and update
861  * anything it defines in the value table 'val'.  If 'alter' is true,
862  * do various optimizations.  This code would be cleaner if symbolic
863  * evaluation and code transformations weren't folded together.
864  */
865 static void
866 opt_stmt(s, val, alter)
867 	struct stmt *s;
868 	int val[];
869 	int alter;
870 {
871 	int op;
872 	int v;
873 
874 	switch (s->code) {
875 
876 	case BPF_LD|BPF_ABS|BPF_W:
877 	case BPF_LD|BPF_ABS|BPF_H:
878 	case BPF_LD|BPF_ABS|BPF_B:
879 		v = F(s->code, s->k, 0L);
880 		vstore(s, &val[A_ATOM], v, alter);
881 		break;
882 
883 	case BPF_LD|BPF_IND|BPF_W:
884 	case BPF_LD|BPF_IND|BPF_H:
885 	case BPF_LD|BPF_IND|BPF_B:
886 		v = val[X_ATOM];
887 		if (alter && vmap[v].is_const) {
888 			s->code = BPF_LD|BPF_ABS|BPF_SIZE(s->code);
889 			s->k += vmap[v].const_val;
890 			v = F(s->code, s->k, 0L);
891 			done = 0;
892 		}
893 		else
894 			v = F(s->code, s->k, v);
895 		vstore(s, &val[A_ATOM], v, alter);
896 		break;
897 
898 	case BPF_LD|BPF_LEN:
899 		v = F(s->code, 0L, 0L);
900 		vstore(s, &val[A_ATOM], v, alter);
901 		break;
902 
903 	case BPF_LD|BPF_IMM:
904 		v = K(s->k);
905 		vstore(s, &val[A_ATOM], v, alter);
906 		break;
907 
908 	case BPF_LDX|BPF_IMM:
909 		v = K(s->k);
910 		vstore(s, &val[X_ATOM], v, alter);
911 		break;
912 
913 	case BPF_LDX|BPF_MSH|BPF_B:
914 		v = F(s->code, s->k, 0L);
915 		vstore(s, &val[X_ATOM], v, alter);
916 		break;
917 
918 	case BPF_ALU|BPF_NEG:
919 		if (alter && vmap[val[A_ATOM]].is_const) {
920 			s->code = BPF_LD|BPF_IMM;
921 			s->k = -vmap[val[A_ATOM]].const_val;
922 			val[A_ATOM] = K(s->k);
923 		}
924 		else
925 			val[A_ATOM] = F(s->code, val[A_ATOM], 0L);
926 		break;
927 
928 	case BPF_ALU|BPF_ADD|BPF_K:
929 	case BPF_ALU|BPF_SUB|BPF_K:
930 	case BPF_ALU|BPF_MUL|BPF_K:
931 	case BPF_ALU|BPF_DIV|BPF_K:
932 	case BPF_ALU|BPF_AND|BPF_K:
933 	case BPF_ALU|BPF_OR|BPF_K:
934 	case BPF_ALU|BPF_LSH|BPF_K:
935 	case BPF_ALU|BPF_RSH|BPF_K:
936 		op = BPF_OP(s->code);
937 		if (alter) {
938 			if (s->k == 0) {
939 				if (op == BPF_ADD || op == BPF_SUB ||
940 				    op == BPF_LSH || op == BPF_RSH ||
941 				    op == BPF_OR) {
942 					s->code = NOP;
943 					break;
944 				}
945 				if (op == BPF_MUL || op == BPF_AND) {
946 					s->code = BPF_LD|BPF_IMM;
947 					val[A_ATOM] = K(s->k);
948 					break;
949 				}
950 			}
951 			if (vmap[val[A_ATOM]].is_const) {
952 				fold_op(s, val[A_ATOM], K(s->k));
953 				val[A_ATOM] = K(s->k);
954 				break;
955 			}
956 		}
957 		val[A_ATOM] = F(s->code, val[A_ATOM], K(s->k));
958 		break;
959 
960 	case BPF_ALU|BPF_ADD|BPF_X:
961 	case BPF_ALU|BPF_SUB|BPF_X:
962 	case BPF_ALU|BPF_MUL|BPF_X:
963 	case BPF_ALU|BPF_DIV|BPF_X:
964 	case BPF_ALU|BPF_AND|BPF_X:
965 	case BPF_ALU|BPF_OR|BPF_X:
966 	case BPF_ALU|BPF_LSH|BPF_X:
967 	case BPF_ALU|BPF_RSH|BPF_X:
968 		op = BPF_OP(s->code);
969 		if (alter && vmap[val[X_ATOM]].is_const) {
970 			if (vmap[val[A_ATOM]].is_const) {
971 				fold_op(s, val[A_ATOM], val[X_ATOM]);
972 				val[A_ATOM] = K(s->k);
973 			}
974 			else {
975 				s->code = BPF_ALU|BPF_K|op;
976 				s->k = vmap[val[X_ATOM]].const_val;
977 				done = 0;
978 				val[A_ATOM] =
979 					F(s->code, val[A_ATOM], K(s->k));
980 			}
981 			break;
982 		}
983 		/*
984 		 * Check if we're doing something to an accumulator
985 		 * that is 0, and simplify.  This may not seem like
986 		 * much of a simplification but it could open up further
987 		 * optimizations.
988 		 * XXX We could also check for mul by 1, and -1, etc.
989 		 */
990 		if (alter && vmap[val[A_ATOM]].is_const
991 		    && vmap[val[A_ATOM]].const_val == 0) {
992 			if (op == BPF_ADD || op == BPF_OR ||
993 			    op == BPF_LSH || op == BPF_RSH || op == BPF_SUB) {
994 				s->code = BPF_MISC|BPF_TXA;
995 				vstore(s, &val[A_ATOM], val[X_ATOM], alter);
996 				break;
997 			}
998 			else if (op == BPF_MUL || op == BPF_DIV ||
999 				 op == BPF_AND) {
1000 				s->code = BPF_LD|BPF_IMM;
1001 				s->k = 0;
1002 				vstore(s, &val[A_ATOM], K(s->k), alter);
1003 				break;
1004 			}
1005 			else if (op == BPF_NEG) {
1006 				s->code = NOP;
1007 				break;
1008 			}
1009 		}
1010 		val[A_ATOM] = F(s->code, val[A_ATOM], val[X_ATOM]);
1011 		break;
1012 
1013 	case BPF_MISC|BPF_TXA:
1014 		vstore(s, &val[A_ATOM], val[X_ATOM], alter);
1015 		break;
1016 
1017 	case BPF_LD|BPF_MEM:
1018 		v = val[s->k];
1019 		if (alter && vmap[v].is_const) {
1020 			s->code = BPF_LD|BPF_IMM;
1021 			s->k = vmap[v].const_val;
1022 			done = 0;
1023 		}
1024 		vstore(s, &val[A_ATOM], v, alter);
1025 		break;
1026 
1027 	case BPF_MISC|BPF_TAX:
1028 		vstore(s, &val[X_ATOM], val[A_ATOM], alter);
1029 		break;
1030 
1031 	case BPF_LDX|BPF_MEM:
1032 		v = val[s->k];
1033 		if (alter && vmap[v].is_const) {
1034 			s->code = BPF_LDX|BPF_IMM;
1035 			s->k = vmap[v].const_val;
1036 			done = 0;
1037 		}
1038 		vstore(s, &val[X_ATOM], v, alter);
1039 		break;
1040 
1041 	case BPF_ST:
1042 		vstore(s, &val[s->k], val[A_ATOM], alter);
1043 		break;
1044 
1045 	case BPF_STX:
1046 		vstore(s, &val[s->k], val[X_ATOM], alter);
1047 		break;
1048 	}
1049 }
1050 
1051 static void
1052 deadstmt(s, last)
1053 	register struct stmt *s;
1054 	register struct stmt *last[];
1055 {
1056 	register int atom;
1057 
1058 	atom = atomuse(s);
1059 	if (atom >= 0) {
1060 		if (atom == AX_ATOM) {
1061 			last[X_ATOM] = 0;
1062 			last[A_ATOM] = 0;
1063 		}
1064 		else
1065 			last[atom] = 0;
1066 	}
1067 	atom = atomdef(s);
1068 	if (atom >= 0) {
1069 		if (last[atom]) {
1070 			done = 0;
1071 			last[atom]->code = NOP;
1072 		}
1073 		last[atom] = s;
1074 	}
1075 }
1076 
1077 static void
1078 opt_deadstores(b)
1079 	register struct block *b;
1080 {
1081 	register struct slist *s;
1082 	register int atom;
1083 	struct stmt *last[N_ATOMS];
1084 
1085 	memset((char *)last, 0, sizeof last);
1086 
1087 	for (s = b->stmts; s != 0; s = s->next)
1088 		deadstmt(&s->s, last);
1089 	deadstmt(&b->s, last);
1090 
1091 	for (atom = 0; atom < N_ATOMS; ++atom)
1092 		if (last[atom] && !ATOMELEM(b->out_use, atom)) {
1093 			last[atom]->code = NOP;
1094 			done = 0;
1095 		}
1096 }
1097 
1098 static void
1099 opt_blk(b, do_stmts)
1100 	struct block *b;
1101 	int do_stmts;
1102 {
1103 	struct slist *s;
1104 	struct edge *p;
1105 	int i;
1106 	bpf_int32 aval;
1107 
1108 #if 0
1109 	for (s = b->stmts; s && s->next; s = s->next)
1110 		if (BPF_CLASS(s->s.code) == BPF_JMP) {
1111 			do_stmts = 0;
1112 			break;
1113 		}
1114 #endif
1115 
1116 	/*
1117 	 * Initialize the atom values.
1118 	 * If we have no predecessors, everything is undefined.
1119 	 * Otherwise, we inherent our values from our predecessors.
1120 	 * If any register has an ambiguous value (i.e. control paths are
1121 	 * merging) give it the undefined value of 0.
1122 	 */
1123 	p = b->in_edges;
1124 	if (p == 0)
1125 		memset((char *)b->val, 0, sizeof(b->val));
1126 	else {
1127 		memcpy((char *)b->val, (char *)p->pred->val, sizeof(b->val));
1128 		while ((p = p->next) != NULL) {
1129 			for (i = 0; i < N_ATOMS; ++i)
1130 				if (b->val[i] != p->pred->val[i])
1131 					b->val[i] = 0;
1132 		}
1133 	}
1134 	aval = b->val[A_ATOM];
1135 	for (s = b->stmts; s; s = s->next)
1136 		opt_stmt(&s->s, b->val, do_stmts);
1137 
1138 	/*
1139 	 * This is a special case: if we don't use anything from this
1140 	 * block, and we load the accumulator with value that is
1141 	 * already there, or if this block is a return,
1142 	 * eliminate all the statements.
1143 	 */
1144 	if (do_stmts &&
1145 	    ((b->out_use == 0 && aval != 0 &&b->val[A_ATOM] == aval) ||
1146 	     BPF_CLASS(b->s.code) == BPF_RET)) {
1147 		if (b->stmts != 0) {
1148 			b->stmts = 0;
1149 			done = 0;
1150 		}
1151 	} else {
1152 		opt_peep(b);
1153 		opt_deadstores(b);
1154 	}
1155 	/*
1156 	 * Set up values for branch optimizer.
1157 	 */
1158 	if (BPF_SRC(b->s.code) == BPF_K)
1159 		b->oval = K(b->s.k);
1160 	else
1161 		b->oval = b->val[X_ATOM];
1162 	b->et.code = b->s.code;
1163 	b->ef.code = -b->s.code;
1164 }
1165 
1166 /*
1167  * Return true if any register that is used on exit from 'succ', has
1168  * an exit value that is different from the corresponding exit value
1169  * from 'b'.
1170  */
1171 static int
1172 use_conflict(b, succ)
1173 	struct block *b, *succ;
1174 {
1175 	int atom;
1176 	atomset use = succ->out_use;
1177 
1178 	if (use == 0)
1179 		return 0;
1180 
1181 	for (atom = 0; atom < N_ATOMS; ++atom)
1182 		if (ATOMELEM(use, atom))
1183 			if (b->val[atom] != succ->val[atom])
1184 				return 1;
1185 	return 0;
1186 }
1187 
1188 static struct block *
1189 fold_edge(child, ep)
1190 	struct block *child;
1191 	struct edge *ep;
1192 {
1193 	int sense;
1194 	int aval0, aval1, oval0, oval1;
1195 	int code = ep->code;
1196 
1197 	if (code < 0) {
1198 		code = -code;
1199 		sense = 0;
1200 	} else
1201 		sense = 1;
1202 
1203 	if (child->s.code != code)
1204 		return 0;
1205 
1206 	aval0 = child->val[A_ATOM];
1207 	oval0 = child->oval;
1208 	aval1 = ep->pred->val[A_ATOM];
1209 	oval1 = ep->pred->oval;
1210 
1211 	if (aval0 != aval1)
1212 		return 0;
1213 
1214 	if (oval0 == oval1)
1215 		/*
1216 		 * The operands are identical, so the
1217 		 * result is true if a true branch was
1218 		 * taken to get here, otherwise false.
1219 		 */
1220 		return sense ? JT(child) : JF(child);
1221 
1222 	if (sense && code == (BPF_JMP|BPF_JEQ|BPF_K))
1223 		/*
1224 		 * At this point, we only know the comparison if we
1225 		 * came down the true branch, and it was an equality
1226 		 * comparison with a constant.  We rely on the fact that
1227 		 * distinct constants have distinct value numbers.
1228 		 */
1229 		return JF(child);
1230 
1231 	return 0;
1232 }
1233 
1234 static void
1235 opt_j(ep)
1236 	struct edge *ep;
1237 {
1238 	register int i, k;
1239 	register struct block *target;
1240 
1241 	if (JT(ep->succ) == 0)
1242 		return;
1243 
1244 	if (JT(ep->succ) == JF(ep->succ)) {
1245 		/*
1246 		 * Common branch targets can be eliminated, provided
1247 		 * there is no data dependency.
1248 		 */
1249 		if (!use_conflict(ep->pred, ep->succ->et.succ)) {
1250 			done = 0;
1251 			ep->succ = JT(ep->succ);
1252 		}
1253 	}
1254 	/*
1255 	 * For each edge dominator that matches the successor of this
1256 	 * edge, promote the edge successor to the its grandchild.
1257 	 *
1258 	 * XXX We violate the set abstraction here in favor a reasonably
1259 	 * efficient loop.
1260 	 */
1261  top:
1262 	for (i = 0; i < edgewords; ++i) {
1263 		register bpf_u_int32 x = ep->edom[i];
1264 
1265 		while (x != 0) {
1266 			k = ffs(x) - 1;
1267 			x &=~ (1 << k);
1268 			k += i * BITS_PER_WORD;
1269 
1270 			target = fold_edge(ep->succ, edges[k]);
1271 			/*
1272 			 * Check that there is no data dependency between
1273 			 * nodes that will be violated if we move the edge.
1274 			 */
1275 			if (target != 0 && !use_conflict(ep->pred, target)) {
1276 				done = 0;
1277 				ep->succ = target;
1278 				if (JT(target) != 0)
1279 					/*
1280 					 * Start over unless we hit a leaf.
1281 					 */
1282 					goto top;
1283 				return;
1284 			}
1285 		}
1286 	}
1287 }
1288 
1289 
1290 static void
1291 or_pullup(b)
1292 	struct block *b;
1293 {
1294 	int val, at_top;
1295 	struct block *pull;
1296 	struct block **diffp, **samep;
1297 	struct edge *ep;
1298 
1299 	ep = b->in_edges;
1300 	if (ep == 0)
1301 		return;
1302 
1303 	/*
1304 	 * Make sure each predecessor loads the same value.
1305 	 * XXX why?
1306 	 */
1307 	val = ep->pred->val[A_ATOM];
1308 	for (ep = ep->next; ep != 0; ep = ep->next)
1309 		if (val != ep->pred->val[A_ATOM])
1310 			return;
1311 
1312 	if (JT(b->in_edges->pred) == b)
1313 		diffp = &JT(b->in_edges->pred);
1314 	else
1315 		diffp = &JF(b->in_edges->pred);
1316 
1317 	at_top = 1;
1318 	while (1) {
1319 		if (*diffp == 0)
1320 			return;
1321 
1322 		if (JT(*diffp) != JT(b))
1323 			return;
1324 
1325 		if (!SET_MEMBER((*diffp)->dom, b->id))
1326 			return;
1327 
1328 		if ((*diffp)->val[A_ATOM] != val)
1329 			break;
1330 
1331 		diffp = &JF(*diffp);
1332 		at_top = 0;
1333 	}
1334 	samep = &JF(*diffp);
1335 	while (1) {
1336 		if (*samep == 0)
1337 			return;
1338 
1339 		if (JT(*samep) != JT(b))
1340 			return;
1341 
1342 		if (!SET_MEMBER((*samep)->dom, b->id))
1343 			return;
1344 
1345 		if ((*samep)->val[A_ATOM] == val)
1346 			break;
1347 
1348 		/* XXX Need to check that there are no data dependencies
1349 		   between dp0 and dp1.  Currently, the code generator
1350 		   will not produce such dependencies. */
1351 		samep = &JF(*samep);
1352 	}
1353 #ifdef notdef
1354 	/* XXX This doesn't cover everything. */
1355 	for (i = 0; i < N_ATOMS; ++i)
1356 		if ((*samep)->val[i] != pred->val[i])
1357 			return;
1358 #endif
1359 	/* Pull up the node. */
1360 	pull = *samep;
1361 	*samep = JF(pull);
1362 	JF(pull) = *diffp;
1363 
1364 	/*
1365 	 * At the top of the chain, each predecessor needs to point at the
1366 	 * pulled up node.  Inside the chain, there is only one predecessor
1367 	 * to worry about.
1368 	 */
1369 	if (at_top) {
1370 		for (ep = b->in_edges; ep != 0; ep = ep->next) {
1371 			if (JT(ep->pred) == b)
1372 				JT(ep->pred) = pull;
1373 			else
1374 				JF(ep->pred) = pull;
1375 		}
1376 	}
1377 	else
1378 		*diffp = pull;
1379 
1380 	done = 0;
1381 }
1382 
1383 static void
1384 and_pullup(b)
1385 	struct block *b;
1386 {
1387 	int val, at_top;
1388 	struct block *pull;
1389 	struct block **diffp, **samep;
1390 	struct edge *ep;
1391 
1392 	ep = b->in_edges;
1393 	if (ep == 0)
1394 		return;
1395 
1396 	/*
1397 	 * Make sure each predecessor loads the same value.
1398 	 */
1399 	val = ep->pred->val[A_ATOM];
1400 	for (ep = ep->next; ep != 0; ep = ep->next)
1401 		if (val != ep->pred->val[A_ATOM])
1402 			return;
1403 
1404 	if (JT(b->in_edges->pred) == b)
1405 		diffp = &JT(b->in_edges->pred);
1406 	else
1407 		diffp = &JF(b->in_edges->pred);
1408 
1409 	at_top = 1;
1410 	while (1) {
1411 		if (*diffp == 0)
1412 			return;
1413 
1414 		if (JF(*diffp) != JF(b))
1415 			return;
1416 
1417 		if (!SET_MEMBER((*diffp)->dom, b->id))
1418 			return;
1419 
1420 		if ((*diffp)->val[A_ATOM] != val)
1421 			break;
1422 
1423 		diffp = &JT(*diffp);
1424 		at_top = 0;
1425 	}
1426 	samep = &JT(*diffp);
1427 	while (1) {
1428 		if (*samep == 0)
1429 			return;
1430 
1431 		if (JF(*samep) != JF(b))
1432 			return;
1433 
1434 		if (!SET_MEMBER((*samep)->dom, b->id))
1435 			return;
1436 
1437 		if ((*samep)->val[A_ATOM] == val)
1438 			break;
1439 
1440 		/* XXX Need to check that there are no data dependencies
1441 		   between diffp and samep.  Currently, the code generator
1442 		   will not produce such dependencies. */
1443 		samep = &JT(*samep);
1444 	}
1445 #ifdef notdef
1446 	/* XXX This doesn't cover everything. */
1447 	for (i = 0; i < N_ATOMS; ++i)
1448 		if ((*samep)->val[i] != pred->val[i])
1449 			return;
1450 #endif
1451 	/* Pull up the node. */
1452 	pull = *samep;
1453 	*samep = JT(pull);
1454 	JT(pull) = *diffp;
1455 
1456 	/*
1457 	 * At the top of the chain, each predecessor needs to point at the
1458 	 * pulled up node.  Inside the chain, there is only one predecessor
1459 	 * to worry about.
1460 	 */
1461 	if (at_top) {
1462 		for (ep = b->in_edges; ep != 0; ep = ep->next) {
1463 			if (JT(ep->pred) == b)
1464 				JT(ep->pred) = pull;
1465 			else
1466 				JF(ep->pred) = pull;
1467 		}
1468 	}
1469 	else
1470 		*diffp = pull;
1471 
1472 	done = 0;
1473 }
1474 
1475 static void
1476 opt_blks(root, do_stmts)
1477 	struct block *root;
1478 	int do_stmts;
1479 {
1480 	int i, maxlevel;
1481 	struct block *p;
1482 
1483 	init_val();
1484 	maxlevel = root->level;
1485 	for (i = maxlevel; i >= 0; --i)
1486 		for (p = levels[i]; p; p = p->link)
1487 			opt_blk(p, do_stmts);
1488 
1489 	if (do_stmts)
1490 		/*
1491 		 * No point trying to move branches; it can't possibly
1492 		 * make a difference at this point.
1493 		 */
1494 		return;
1495 
1496 	for (i = 1; i <= maxlevel; ++i) {
1497 		for (p = levels[i]; p; p = p->link) {
1498 			opt_j(&p->et);
1499 			opt_j(&p->ef);
1500 		}
1501 	}
1502 	for (i = 1; i <= maxlevel; ++i) {
1503 		for (p = levels[i]; p; p = p->link) {
1504 			or_pullup(p);
1505 			and_pullup(p);
1506 		}
1507 	}
1508 }
1509 
1510 static __inline void
1511 link_inedge(parent, child)
1512 	struct edge *parent;
1513 	struct block *child;
1514 {
1515 	parent->next = child->in_edges;
1516 	child->in_edges = parent;
1517 }
1518 
1519 static void
1520 find_inedges(root)
1521 	struct block *root;
1522 {
1523 	int i;
1524 	struct block *b;
1525 
1526 	for (i = 0; i < n_blocks; ++i)
1527 		blocks[i]->in_edges = 0;
1528 
1529 	/*
1530 	 * Traverse the graph, adding each edge to the predecessor
1531 	 * list of its successors.  Skip the leaves (i.e. level 0).
1532 	 */
1533 	for (i = root->level; i > 0; --i) {
1534 		for (b = levels[i]; b != 0; b = b->link) {
1535 			link_inedge(&b->et, JT(b));
1536 			link_inedge(&b->ef, JF(b));
1537 		}
1538 	}
1539 }
1540 
1541 static void
1542 opt_root(b)
1543 	struct block **b;
1544 {
1545 	struct slist *tmp, *s;
1546 
1547 	s = (*b)->stmts;
1548 	(*b)->stmts = 0;
1549 	while (BPF_CLASS((*b)->s.code) == BPF_JMP && JT(*b) == JF(*b))
1550 		*b = JT(*b);
1551 
1552 	tmp = (*b)->stmts;
1553 	if (tmp != 0)
1554 		sappend(s, tmp);
1555 	(*b)->stmts = s;
1556 
1557 	/*
1558 	 * If the root node is a return, then there is no
1559 	 * point executing any statements (since the bpf machine
1560 	 * has no side effects).
1561 	 */
1562 	if (BPF_CLASS((*b)->s.code) == BPF_RET)
1563 		(*b)->stmts = 0;
1564 }
1565 
1566 static void
1567 opt_loop(root, do_stmts)
1568 	struct block *root;
1569 	int do_stmts;
1570 {
1571 
1572 #ifdef BDEBUG
1573 	if (dflag > 1)
1574 		opt_dump(root);
1575 #endif
1576 	do {
1577 		done = 1;
1578 		find_levels(root);
1579 		find_dom(root);
1580 		find_closure(root);
1581 		find_inedges(root);
1582 		find_ud(root);
1583 		find_edom(root);
1584 		opt_blks(root, do_stmts);
1585 #ifdef BDEBUG
1586 		if (dflag > 1)
1587 			opt_dump(root);
1588 #endif
1589 	} while (!done);
1590 }
1591 
1592 /*
1593  * Optimize the filter code in its dag representation.
1594  */
1595 void
1596 bpf_optimize(rootp)
1597 	struct block **rootp;
1598 {
1599 	struct block *root;
1600 
1601 	root = *rootp;
1602 
1603 	opt_init(root);
1604 	opt_loop(root, 0);
1605 	opt_loop(root, 1);
1606 	intern_blocks(root);
1607 	opt_root(rootp);
1608 	opt_cleanup();
1609 }
1610 
1611 static void
1612 make_marks(p)
1613 	struct block *p;
1614 {
1615 	if (!isMarked(p)) {
1616 		Mark(p);
1617 		if (BPF_CLASS(p->s.code) != BPF_RET) {
1618 			make_marks(JT(p));
1619 			make_marks(JF(p));
1620 		}
1621 	}
1622 }
1623 
1624 /*
1625  * Mark code array such that isMarked(i) is true
1626  * only for nodes that are alive.
1627  */
1628 static void
1629 mark_code(p)
1630 	struct block *p;
1631 {
1632 	cur_mark += 1;
1633 	make_marks(p);
1634 }
1635 
1636 /*
1637  * True iff the two stmt lists load the same value from the packet into
1638  * the accumulator.
1639  */
1640 static int
1641 eq_slist(x, y)
1642 	struct slist *x, *y;
1643 {
1644 	while (1) {
1645 		while (x && x->s.code == NOP)
1646 			x = x->next;
1647 		while (y && y->s.code == NOP)
1648 			y = y->next;
1649 		if (x == 0)
1650 			return y == 0;
1651 		if (y == 0)
1652 			return x == 0;
1653 		if (x->s.code != y->s.code || x->s.k != y->s.k)
1654 			return 0;
1655 		x = x->next;
1656 		y = y->next;
1657 	}
1658 }
1659 
1660 static __inline int
1661 eq_blk(b0, b1)
1662 	struct block *b0, *b1;
1663 {
1664 	if (b0->s.code == b1->s.code &&
1665 	    b0->s.k == b1->s.k &&
1666 	    b0->et.succ == b1->et.succ &&
1667 	    b0->ef.succ == b1->ef.succ)
1668 		return eq_slist(b0->stmts, b1->stmts);
1669 	return 0;
1670 }
1671 
1672 static void
1673 intern_blocks(root)
1674 	struct block *root;
1675 {
1676 	struct block *p;
1677 	int i, j;
1678 	int done;
1679  top:
1680 	done = 1;
1681 	for (i = 0; i < n_blocks; ++i)
1682 		blocks[i]->link = 0;
1683 
1684 	mark_code(root);
1685 
1686 	for (i = n_blocks - 1; --i >= 0; ) {
1687 		if (!isMarked(blocks[i]))
1688 			continue;
1689 		for (j = i + 1; j < n_blocks; ++j) {
1690 			if (!isMarked(blocks[j]))
1691 				continue;
1692 			if (eq_blk(blocks[i], blocks[j])) {
1693 				blocks[i]->link = blocks[j]->link ?
1694 					blocks[j]->link : blocks[j];
1695 				break;
1696 			}
1697 		}
1698 	}
1699 	for (i = 0; i < n_blocks; ++i) {
1700 		p = blocks[i];
1701 		if (JT(p) == 0)
1702 			continue;
1703 		if (JT(p)->link) {
1704 			done = 0;
1705 			JT(p) = JT(p)->link;
1706 		}
1707 		if (JF(p)->link) {
1708 			done = 0;
1709 			JF(p) = JF(p)->link;
1710 		}
1711 	}
1712 	if (!done)
1713 		goto top;
1714 }
1715 
1716 static void
1717 opt_cleanup()
1718 {
1719 	free((void *)vnode_base);
1720 	free((void *)vmap);
1721 	free((void *)edges);
1722 	free((void *)space);
1723 	free((void *)levels);
1724 	free((void *)blocks);
1725 }
1726 
1727 /*
1728  * Return the number of stmts in 's'.
1729  */
1730 static int
1731 slength(s)
1732 	struct slist *s;
1733 {
1734 	int n = 0;
1735 
1736 	for (; s; s = s->next)
1737 		if (s->s.code != NOP)
1738 			++n;
1739 	return n;
1740 }
1741 
1742 /*
1743  * Return the number of nodes reachable by 'p'.
1744  * All nodes should be initially unmarked.
1745  */
1746 static int
1747 count_blocks(p)
1748 	struct block *p;
1749 {
1750 	if (p == 0 || isMarked(p))
1751 		return 0;
1752 	Mark(p);
1753 	return count_blocks(JT(p)) + count_blocks(JF(p)) + 1;
1754 }
1755 
1756 /*
1757  * Do a depth first search on the flow graph, numbering the
1758  * the basic blocks, and entering them into the 'blocks' array.`
1759  */
1760 static void
1761 number_blks_r(p)
1762 	struct block *p;
1763 {
1764 	int n;
1765 
1766 	if (p == 0 || isMarked(p))
1767 		return;
1768 
1769 	Mark(p);
1770 	n = n_blocks++;
1771 	p->id = n;
1772 	blocks[n] = p;
1773 
1774 	number_blks_r(JT(p));
1775 	number_blks_r(JF(p));
1776 }
1777 
1778 /*
1779  * Return the number of stmts in the flowgraph reachable by 'p'.
1780  * The nodes should be unmarked before calling.
1781  */
1782 static int
1783 count_stmts(p)
1784 	struct block *p;
1785 {
1786 	int n;
1787 
1788 	if (p == 0 || isMarked(p))
1789 		return 0;
1790 	Mark(p);
1791 	n = count_stmts(JT(p)) + count_stmts(JF(p));
1792 	return slength(p->stmts) + n + 1;
1793 }
1794 
1795 /*
1796  * Allocate memory.  All allocation is done before optimization
1797  * is begun.  A linear bound on the size of all data structures is computed
1798  * from the total number of blocks and/or statements.
1799  */
1800 static void
1801 opt_init(root)
1802 	struct block *root;
1803 {
1804 	bpf_u_int32 *p;
1805 	int i, n, max_stmts;
1806 
1807 	/*
1808 	 * First, count the blocks, so we can malloc an array to map
1809 	 * block number to block.  Then, put the blocks into the array.
1810 	 */
1811 	unMarkAll();
1812 	n = count_blocks(root);
1813 	blocks = (struct block **)malloc(n * sizeof(*blocks));
1814 	unMarkAll();
1815 	n_blocks = 0;
1816 	number_blks_r(root);
1817 
1818 	n_edges = 2 * n_blocks;
1819 	edges = (struct edge **)malloc(n_edges * sizeof(*edges));
1820 
1821 	/*
1822 	 * The number of levels is bounded by the number of nodes.
1823 	 */
1824 	levels = (struct block **)malloc(n_blocks * sizeof(*levels));
1825 
1826 	edgewords = n_edges / (8 * sizeof(bpf_u_int32)) + 1;
1827 	nodewords = n_blocks / (8 * sizeof(bpf_u_int32)) + 1;
1828 
1829 	/* XXX */
1830 	space = (bpf_u_int32 *)malloc(2 * n_blocks * nodewords * sizeof(*space)
1831 				 + n_edges * edgewords * sizeof(*space));
1832 	p = space;
1833 	all_dom_sets = p;
1834 	for (i = 0; i < n; ++i) {
1835 		blocks[i]->dom = p;
1836 		p += nodewords;
1837 	}
1838 	all_closure_sets = p;
1839 	for (i = 0; i < n; ++i) {
1840 		blocks[i]->closure = p;
1841 		p += nodewords;
1842 	}
1843 	all_edge_sets = p;
1844 	for (i = 0; i < n; ++i) {
1845 		register struct block *b = blocks[i];
1846 
1847 		b->et.edom = p;
1848 		p += edgewords;
1849 		b->ef.edom = p;
1850 		p += edgewords;
1851 		b->et.id = i;
1852 		edges[i] = &b->et;
1853 		b->ef.id = n_blocks + i;
1854 		edges[n_blocks + i] = &b->ef;
1855 		b->et.pred = b;
1856 		b->ef.pred = b;
1857 	}
1858 	max_stmts = 0;
1859 	for (i = 0; i < n; ++i)
1860 		max_stmts += slength(blocks[i]->stmts) + 1;
1861 	/*
1862 	 * We allocate at most 3 value numbers per statement,
1863 	 * so this is an upper bound on the number of valnodes
1864 	 * we'll need.
1865 	 */
1866 	maxval = 3 * max_stmts;
1867 	vmap = (struct vmapinfo *)malloc(maxval * sizeof(*vmap));
1868 	vnode_base = (struct valnode *)malloc(maxval * sizeof(*vmap));
1869 }
1870 
1871 /*
1872  * Some pointers used to convert the basic block form of the code,
1873  * into the array form that BPF requires.  'fstart' will point to
1874  * the malloc'd array while 'ftail' is used during the recursive traversal.
1875  */
1876 static struct bpf_insn *fstart;
1877 static struct bpf_insn *ftail;
1878 
1879 #ifdef BDEBUG
1880 int bids[1000];
1881 #endif
1882 
1883 /*
1884  * Returns true if successful.  Returns false if a branch has
1885  * an offset that is too large.  If so, we have marked that
1886  * branch so that on a subsequent iteration, it will be treated
1887  * properly.
1888  */
1889 static int
1890 convert_code_r(p)
1891 	struct block *p;
1892 {
1893 	struct bpf_insn *dst;
1894 	struct slist *src;
1895 	int slen;
1896 	u_int off;
1897 	int extrajmps;		/* number of extra jumps inserted */
1898 	struct slist **offset = NULL;
1899 
1900 	if (p == 0 || isMarked(p))
1901 		return (1);
1902 	Mark(p);
1903 
1904 	if (convert_code_r(JF(p)) == 0)
1905 		return (0);
1906 	if (convert_code_r(JT(p)) == 0)
1907 		return (0);
1908 
1909 	slen = slength(p->stmts);
1910 	dst = ftail -= (slen + 1 + p->longjt + p->longjf);
1911 		/* inflate length by any extra jumps */
1912 
1913 	p->offset = dst - fstart;
1914 
1915 	/* generate offset[] for convenience  */
1916 	if (slen) {
1917 		offset = (struct slist **)calloc(sizeof(struct slist *), slen);
1918 		if (!offset) {
1919 			bpf_error("not enough core");
1920 			/*NOTREACHED*/
1921 		}
1922 	}
1923 	src = p->stmts;
1924 	for (off = 0; off < slen && src; off++) {
1925 #if 0
1926 		printf("off=%d src=%x\n", off, src);
1927 #endif
1928 		offset[off] = src;
1929 		src = src->next;
1930 	}
1931 
1932 	off = 0;
1933 	for (src = p->stmts; src; src = src->next) {
1934 		if (src->s.code == NOP)
1935 			continue;
1936 		dst->code = (u_short)src->s.code;
1937 		dst->k = src->s.k;
1938 
1939 		/* fill block-local relative jump */
1940 		if (BPF_CLASS(src->s.code) != BPF_JMP || src->s.code == (BPF_JMP|BPF_JA)) {
1941 #if 0
1942 			if (src->s.jt || src->s.jf) {
1943 				bpf_error("illegal jmp destination");
1944 				/*NOTREACHED*/
1945 			}
1946 #endif
1947 			goto filled;
1948 		}
1949 		if (off == slen - 2)	/*???*/
1950 			goto filled;
1951 
1952 	    {
1953 		int i;
1954 		int jt, jf;
1955 		char *ljerr = "%s for block-local relative jump: off=%d";
1956 
1957 #if 0
1958 		printf("code=%x off=%d %x %x\n", src->s.code,
1959 			off, src->s.jt, src->s.jf);
1960 #endif
1961 
1962 		if (!src->s.jt || !src->s.jf) {
1963 			bpf_error(ljerr, "no jmp destination", off);
1964 			/*NOTREACHED*/
1965 		}
1966 
1967 		jt = jf = 0;
1968 		for (i = 0; i < slen; i++) {
1969 			if (offset[i] == src->s.jt) {
1970 				if (jt) {
1971 					bpf_error(ljerr, "multiple matches", off);
1972 					/*NOTREACHED*/
1973 				}
1974 
1975 				dst->jt = i - off - 1;
1976 				jt++;
1977 			}
1978 			if (offset[i] == src->s.jf) {
1979 				if (jf) {
1980 					bpf_error(ljerr, "multiple matches", off);
1981 					/*NOTREACHED*/
1982 				}
1983 				dst->jf = i - off - 1;
1984 				jf++;
1985 			}
1986 		}
1987 		if (!jt || !jf) {
1988 			bpf_error(ljerr, "no destination found", off);
1989 			/*NOTREACHED*/
1990 		}
1991 	    }
1992 filled:
1993 		++dst;
1994 		++off;
1995 	}
1996 	if (offset)
1997 		free(offset);
1998 
1999 #ifdef BDEBUG
2000 	bids[dst - fstart] = p->id + 1;
2001 #endif
2002 	dst->code = (u_short)p->s.code;
2003 	dst->k = p->s.k;
2004 	if (JT(p)) {
2005 		extrajmps = 0;
2006 		off = JT(p)->offset - (p->offset + slen) - 1;
2007 		if (off >= 256) {
2008 		    /* offset too large for branch, must add a jump */
2009 		    if (p->longjt == 0) {
2010 		    	/* mark this instruction and retry */
2011 			p->longjt++;
2012 			return(0);
2013 		    }
2014 		    /* branch if T to following jump */
2015 		    dst->jt = extrajmps;
2016 		    extrajmps++;
2017 		    dst[extrajmps].code = BPF_JMP|BPF_JA;
2018 		    dst[extrajmps].k = off - extrajmps;
2019 		}
2020 		else
2021 		    dst->jt = off;
2022 		off = JF(p)->offset - (p->offset + slen) - 1;
2023 		if (off >= 256) {
2024 		    /* offset too large for branch, must add a jump */
2025 		    if (p->longjf == 0) {
2026 		    	/* mark this instruction and retry */
2027 			p->longjf++;
2028 			return(0);
2029 		    }
2030 		    /* branch if F to following jump */
2031 		    /* if two jumps are inserted, F goes to second one */
2032 		    dst->jf = extrajmps;
2033 		    extrajmps++;
2034 		    dst[extrajmps].code = BPF_JMP|BPF_JA;
2035 		    dst[extrajmps].k = off - extrajmps;
2036 		}
2037 		else
2038 		    dst->jf = off;
2039 	}
2040 	return (1);
2041 }
2042 
2043 
2044 /*
2045  * Convert flowgraph intermediate representation to the
2046  * BPF array representation.  Set *lenp to the number of instructions.
2047  */
2048 struct bpf_insn *
2049 icode_to_fcode(root, lenp)
2050 	struct block *root;
2051 	int *lenp;
2052 {
2053 	int n;
2054 	struct bpf_insn *fp;
2055 
2056 	/*
2057 	 * Loop doing convert_codr_r() until no branches remain
2058 	 * with too-large offsets.
2059 	 */
2060 	while (1) {
2061 	    unMarkAll();
2062 	    n = *lenp = count_stmts(root);
2063 
2064 	    fp = (struct bpf_insn *)malloc(sizeof(*fp) * n);
2065 	    memset((char *)fp, 0, sizeof(*fp) * n);
2066 	    fstart = fp;
2067 	    ftail = fp + n;
2068 
2069 	    unMarkAll();
2070 	    if (convert_code_r(root))
2071 		break;
2072 	    free(fp);
2073 	}
2074 
2075 	return fp;
2076 }
2077 
2078 #ifdef BDEBUG
2079 static void
2080 opt_dump(root)
2081 	struct block *root;
2082 {
2083 	struct bpf_program f;
2084 
2085 	memset(bids, 0, sizeof bids);
2086 	f.bf_insns = icode_to_fcode(root, &f.bf_len);
2087 	bpf_dump(&f, 1);
2088 	putchar('\n');
2089 	free((char *)f.bf_insns);
2090 }
2091 #endif
2092