xref: /openbsd-src/lib/libpcap/optimize.c (revision 9b113833b7123ecd504a0b56703dd9d7af6223a6)
1 /*	$OpenBSD: optimize.c,v 1.4 1996/07/12 13:19:10 mickey Exp $	*/
2 
3 /*
4  * Copyright (c) 1988, 1989, 1990, 1991, 1993, 1994, 1995, 1996
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that: (1) source code distributions
9  * retain the above copyright notice and this paragraph in its entirety, (2)
10  * distributions including binary code include the above copyright notice and
11  * this paragraph in its entirety in the documentation or other materials
12  * provided with the distribution, and (3) all advertising materials mentioning
13  * features or use of this software display the following acknowledgement:
14  * ``This product includes software developed by the University of California,
15  * Lawrence Berkeley Laboratory and its contributors.'' Neither the name of
16  * the University nor the names of its contributors may be used to endorse
17  * or promote products derived from this software without specific prior
18  * written permission.
19  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
20  * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
21  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
22  *
23  *  Optimization module for tcpdump intermediate representation.
24  */
25 #ifndef lint
26 static char rcsid[] =
27     "@(#) Header: optimize.c,v 1.58 96/06/16 22:36:59 leres Exp (LBL)";
28 #endif
29 
30 #include <sys/types.h>
31 #include <sys/time.h>
32 
33 #include <net/bpf.h>
34 
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <memory.h>
38 
39 #ifdef HAVE_OS_PROTO_H
40 #include "os-proto.h"
41 #endif
42 
43 #include "pcap-int.h"
44 #include "gencode.h"
45 
46 #ifdef BDEBUG
47 extern int dflag;
48 #endif
49 
50 #define A_ATOM BPF_MEMWORDS
51 #define X_ATOM (BPF_MEMWORDS+1)
52 
53 #define NOP -1
54 
55 /*
56  * This define is used to represent *both* the accumulator and
57  * x register in use-def computations.
58  * Currently, the use-def code assumes only one definition per instruction.
59  */
60 #define AX_ATOM N_ATOMS
61 
62 /*
63  * A flag to indicate that further optimization is needed.
64  * Iterative passes are continued until a given pass yields no
65  * branch movement.
66  */
67 static int done;
68 
69 /*
70  * A block is marked if and only if its mark equals the current mark.
71  * Rather than traverse the code array, marking each item, 'cur_mark' is
72  * incremented.  This automatically makes each element unmarked.
73  */
74 static int cur_mark;
75 #define isMarked(p) ((p)->mark == cur_mark)
76 #define unMarkAll() cur_mark += 1
77 #define Mark(p) ((p)->mark = cur_mark)
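
/*
 * Illustrative sketch (not part of the original source): because a block
 * counts as marked only while p->mark == cur_mark, bumping cur_mark
 * unmarks every block in O(1):
 *
 *	unMarkAll();		cur_mark becomes, say, 5
 *	Mark(b);		b->mark = 5, so isMarked(b) is true
 *	unMarkAll();		cur_mark becomes 6; isMarked(b) is now
 *				false without touching b->mark
 */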
78 
79 static void opt_init(struct block *);
80 static void opt_cleanup(void);
81 
82 static void make_marks(struct block *);
83 static void mark_code(struct block *);
84 
85 static void intern_blocks(struct block *);
86 
87 static int eq_slist(struct slist *, struct slist *);
88 
89 static void find_levels_r(struct block *);
90 
91 static void find_levels(struct block *);
92 static void find_dom(struct block *);
93 static void propedom(struct edge *);
94 static void find_edom(struct block *);
95 static void find_closure(struct block *);
96 static int atomuse(struct stmt *);
97 static int atomdef(struct stmt *);
98 static void compute_local_ud(struct block *);
99 static void find_ud(struct block *);
100 static void init_val(void);
101 static int F(int, int, int);
102 static inline void vstore(struct stmt *, int *, int, int);
103 static void opt_blk(struct block *, int);
104 static int use_conflict(struct block *, struct block *);
105 static void opt_j(struct edge *);
106 static void or_pullup(struct block *);
107 static void and_pullup(struct block *);
108 static void opt_blks(struct block *, int);
109 static inline void link_inedge(struct edge *, struct block *);
110 static void find_inedges(struct block *);
111 static void opt_root(struct block **);
112 static void opt_loop(struct block *, int);
113 static void fold_op(struct stmt *, int, int);
114 static inline struct slist *this_op(struct slist *);
115 static void opt_not(struct block *);
116 static void opt_peep(struct block *);
117 static void opt_stmt(struct stmt *, int[], int);
118 static void deadstmt(struct stmt *, struct stmt *[]);
119 static void opt_deadstores(struct block *);
120 static void opt_blk(struct block *, int);
121 static int use_conflict(struct block *, struct block *);
122 static void opt_j(struct edge *);
123 static struct block *fold_edge(struct block *, struct edge *);
124 static inline int eq_blk(struct block *, struct block *);
125 static int slength(struct slist *);
126 static int count_blocks(struct block *);
127 static void number_blks_r(struct block *);
128 static int count_stmts(struct block *);
129 static int convert_code_r(struct block *);
130 #ifdef BDEBUG
131 static void opt_dump(struct block *);
132 #endif
133 
134 static int n_blocks;
135 struct block **blocks;
136 static int n_edges;
137 struct edge **edges;
138 
139 /*
140  * A bit vector set representation of the dominators.
141  * We round up the set size to the next power of two.
142  */
143 static int nodewords;
144 static int edgewords;
145 struct block **levels;
146 bpf_u_int32 *space;
147 #define BITS_PER_WORD (8*sizeof(bpf_u_int32))
148 /*
149  * True if 'a' is in uset 'p'.
150  */
151 #define SET_MEMBER(p, a) \
152 ((p)[(unsigned)(a) / BITS_PER_WORD] & (1 << ((unsigned)(a) % BITS_PER_WORD)))
153 
154 /*
155  * Add 'a' to uset p.
156  */
157 #define SET_INSERT(p, a) \
158 (p)[(unsigned)(a) / BITS_PER_WORD] |= (1 << ((unsigned)(a) % BITS_PER_WORD))
159 
160 /*
161  * Delete 'a' from uset p.
162  */
163 #define SET_DELETE(p, a) \
164 (p)[(unsigned)(a) / BITS_PER_WORD] &= ~(1 << ((unsigned)(a) % BITS_PER_WORD))
165 
166 /*
167  * a := a intersect b
168  */
169 #define SET_INTERSECT(a, b, n)\
170 {\
171 	register bpf_u_int32 *_x = a, *_y = b;\
172 	register int _n = n;\
173 	while (--_n >= 0) *_x++ &= *_y++;\
174 }
175 
176 /*
177  * a := a - b
178  */
179 #define SET_SUBTRACT(a, b, n)\
180 {\
181 	register bpf_u_int32 *_x = a, *_y = b;\
182 	register int _n = n;\
183 	while (--_n >= 0) *_x++ &=~ *_y++;\
184 }
185 
186 /*
187  * a := a union b
188  */
189 #define SET_UNION(a, b, n)\
190 {\
191 	register bpf_u_int32 *_x = a, *_y = b;\
192 	register int _n = n;\
193 	while (--_n >= 0) *_x++ |= *_y++;\
194 }
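
/*
 * Illustrative sketch (not part of the original source): a uset is just an
 * array of bpf_u_int32 words, so membership tests and updates on block ids
 * look like this (assuming 'b' and 'pred' are struct block pointers whose
 * dom sets are 'nodewords' words long):
 *
 *	SET_INSERT(b->dom, b->id);			set one bit
 *	if (SET_MEMBER(b->dom, pred->id))		test one bit
 *		...;
 *	SET_INTERSECT(b->dom, pred->dom, nodewords);	word-wise a &= b
 */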
195 
196 static uset all_dom_sets;
197 static uset all_closure_sets;
198 static uset all_edge_sets;
199 
200 #ifndef MAX
201 #define MAX(a,b) ((a)>(b)?(a):(b))
202 #endif
203 
204 static void
205 find_levels_r(b)
206 	struct block *b;
207 {
208 	int level;
209 
210 	if (isMarked(b))
211 		return;
212 
213 	Mark(b);
214 	b->link = 0;
215 
216 	if (JT(b)) {
217 		find_levels_r(JT(b));
218 		find_levels_r(JF(b));
219 		level = MAX(JT(b)->level, JF(b)->level) + 1;
220 	} else
221 		level = 0;
222 	b->level = level;
223 	b->link = levels[level];
224 	levels[level] = b;
225 }
226 
227 /*
228  * Level graph.  The levels go from 0 at the leaves to
229  * N_LEVELS at the root.  The levels[] array points to the
230  * first node of the level list, whose elements are linked
231  * with the 'link' field of the struct block.
232  */
233 static void
234 find_levels(root)
235 	struct block *root;
236 {
237 	memset((char *)levels, 0, n_blocks * sizeof(*levels));
238 	unMarkAll();
239 	find_levels_r(root);
240 }
241 
242 /*
243  * Find dominator relationships.
244  * Assumes graph has been leveled.
245  */
246 static void
247 find_dom(root)
248 	struct block *root;
249 {
250 	int i;
251 	struct block *b;
252 	bpf_u_int32 *x;
253 
254 	/*
255 	 * Initialize sets to contain all nodes.
256 	 */
257 	x = all_dom_sets;
258 	i = n_blocks * nodewords;
259 	while (--i >= 0)
260 		*x++ = ~0;
261 	/* Root starts off empty. */
262 	for (i = nodewords; --i >= 0;)
263 		root->dom[i] = 0;
264 
265 	/* root->level is the highest level number found. */
266 	for (i = root->level; i >= 0; --i) {
267 		for (b = levels[i]; b; b = b->link) {
268 			SET_INSERT(b->dom, b->id);
269 			if (JT(b) == 0)
270 				continue;
271 			SET_INTERSECT(JT(b)->dom, b->dom, nodewords);
272 			SET_INTERSECT(JF(b)->dom, b->dom, nodewords);
273 		}
274 	}
275 }
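
/*
 * Illustrative note (not part of the original source): the loop above is
 * the usual dominator dataflow computation
 *
 *	dom(root) = { root }
 *	dom(n)    = { n } union ( intersection of dom(p) over predecessors p )
 *
 * evaluated from the root down (decreasing level).  One pass suffices
 * because the flow graph is acyclic, so every predecessor is finished
 * before its successors are visited.
 */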
276 
277 static void
278 propedom(ep)
279 	struct edge *ep;
280 {
281 	SET_INSERT(ep->edom, ep->id);
282 	if (ep->succ) {
283 		SET_INTERSECT(ep->succ->et.edom, ep->edom, edgewords);
284 		SET_INTERSECT(ep->succ->ef.edom, ep->edom, edgewords);
285 	}
286 }
287 
288 /*
289  * Compute edge dominators.
290  * Assumes graph has been leveled and predecessors established.
291  */
292 static void
293 find_edom(root)
294 	struct block *root;
295 {
296 	int i;
297 	uset x;
298 	struct block *b;
299 
300 	x = all_edge_sets;
301 	for (i = n_edges * edgewords; --i >= 0; )
302 		x[i] = ~0;
303 
304 	/* root->level is the highest level number found. */
305 	memset(root->et.edom, 0, edgewords * sizeof(*(uset)0));
306 	memset(root->ef.edom, 0, edgewords * sizeof(*(uset)0));
307 	for (i = root->level; i >= 0; --i) {
308 		for (b = levels[i]; b != 0; b = b->link) {
309 			propedom(&b->et);
310 			propedom(&b->ef);
311 		}
312 	}
313 }
314 
315 /*
316  * Find the backwards transitive closure of the flow graph.  These sets
317  * are backwards in the sense that we find the set of nodes that reach
318  * a given node, not the set of nodes that can be reached by a node.
319  *
320  * Assumes graph has been leveled.
321  */
322 static void
323 find_closure(root)
324 	struct block *root;
325 {
326 	int i;
327 	struct block *b;
328 
329 	/*
330 	 * Initialize sets to contain no nodes.
331 	 */
332 	memset((char *)all_closure_sets, 0,
333 	      n_blocks * nodewords * sizeof(*all_closure_sets));
334 
335 	/* root->level is the highest level number found. */
336 	for (i = root->level; i >= 0; --i) {
337 		for (b = levels[i]; b; b = b->link) {
338 			SET_INSERT(b->closure, b->id);
339 			if (JT(b) == 0)
340 				continue;
341 			SET_UNION(JT(b)->closure, b->closure, nodewords);
342 			SET_UNION(JF(b)->closure, b->closure, nodewords);
343 		}
344 	}
345 }
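
/*
 * Illustrative note (not part of the original source): the corresponding
 * equation here is
 *
 *	closure(n) = { n } union ( union of closure(p) over predecessors p )
 *
 * i.e. closure(n) is the set of nodes from which 'n' can be reached,
 * again computed in one pass because the graph is acyclic.
 */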
346 
347 /*
348  * Return the register number that is used by s.  If A and X are both
349  * used, return AX_ATOM.  If no register is used, return -1.
350  *
351  * The implementation should probably change to an array access.
352  */
353 static int
354 atomuse(s)
355 	struct stmt *s;
356 {
357 	register int c = s->code;
358 
359 	if (c == NOP)
360 		return -1;
361 
362 	switch (BPF_CLASS(c)) {
363 
364 	case BPF_RET:
365 		return (BPF_RVAL(c) == BPF_A) ? A_ATOM :
366 			(BPF_RVAL(c) == BPF_X) ? X_ATOM : -1;
367 
368 	case BPF_LD:
369 	case BPF_LDX:
370 		return (BPF_MODE(c) == BPF_IND) ? X_ATOM :
371 			(BPF_MODE(c) == BPF_MEM) ? s->k : -1;
372 
373 	case BPF_ST:
374 		return A_ATOM;
375 
376 	case BPF_STX:
377 		return X_ATOM;
378 
379 	case BPF_JMP:
380 	case BPF_ALU:
381 		if (BPF_SRC(c) == BPF_X)
382 			return AX_ATOM;
383 		return A_ATOM;
384 
385 	case BPF_MISC:
386 		return BPF_MISCOP(c) == BPF_TXA ? X_ATOM : A_ATOM;
387 	}
388 	abort();
389 	/* NOTREACHED */
390 }
391 
392 /*
393  * Return the register number that is defined by 's'.  We assume that
394  * a single stmt cannot define more than one register.  If no register
395  * is defined, return -1.
396  *
397  * The implementation should probably change to an array access.
398  */
399 static int
400 atomdef(s)
401 	struct stmt *s;
402 {
403 	if (s->code == NOP)
404 		return -1;
405 
406 	switch (BPF_CLASS(s->code)) {
407 
408 	case BPF_LD:
409 	case BPF_ALU:
410 		return A_ATOM;
411 
412 	case BPF_LDX:
413 		return X_ATOM;
414 
415 	case BPF_ST:
416 	case BPF_STX:
417 		return s->k;
418 
419 	case BPF_MISC:
420 		return BPF_MISCOP(s->code) == BPF_TAX ? X_ATOM : A_ATOM;
421 	}
422 	return -1;
423 }
424 
425 static void
426 compute_local_ud(b)
427 	struct block *b;
428 {
429 	struct slist *s;
430 	atomset def = 0, use = 0, kill = 0;
431 	int atom;
432 
433 	for (s = b->stmts; s; s = s->next) {
434 		if (s->s.code == NOP)
435 			continue;
436 		atom = atomuse(&s->s);
437 		if (atom >= 0) {
438 			if (atom == AX_ATOM) {
439 				if (!ATOMELEM(def, X_ATOM))
440 					use |= ATOMMASK(X_ATOM);
441 				if (!ATOMELEM(def, A_ATOM))
442 					use |= ATOMMASK(A_ATOM);
443 			}
444 			else if (atom < N_ATOMS) {
445 				if (!ATOMELEM(def, atom))
446 					use |= ATOMMASK(atom);
447 			}
448 			else
449 				abort();
450 		}
451 		atom = atomdef(&s->s);
452 		if (atom >= 0) {
453 			if (!ATOMELEM(use, atom))
454 				kill |= ATOMMASK(atom);
455 			def |= ATOMMASK(atom);
456 		}
457 	}
458 	if (!ATOMELEM(def, A_ATOM) && BPF_CLASS(b->s.code) == BPF_JMP)
459 		use |= ATOMMASK(A_ATOM);
460 
461 	b->def = def;
462 	b->kill = kill;
463 	b->in_use = use;
464 }
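
/*
 * Illustrative example (not part of the original source): for a block whose
 * statements are
 *
 *	ldx M[2]	reads atom 2 before anything defines it
 *	ld  #1		defines A before A has been read
 *	st  M[3]	reads A (already defined here), defines atom 3
 *
 * compute_local_ud() produces use = {2}, kill = {X, A, 3} and
 * def = {X, A, 3}; X is killed because the ldx writes it without
 * reading it first.
 */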
465 
466 /*
467  * Assume graph is already leveled.
468  */
469 static void
470 find_ud(root)
471 	struct block *root;
472 {
473 	int i, maxlevel;
474 	struct block *p;
475 
476 	/*
477 	 * root->level is the highest level number found;
478 	 * count down from there.
479 	 */
480 	maxlevel = root->level;
481 	for (i = maxlevel; i >= 0; --i)
482 		for (p = levels[i]; p; p = p->link) {
483 			compute_local_ud(p);
484 			p->out_use = 0;
485 		}
486 
487 	for (i = 1; i <= maxlevel; ++i) {
488 		for (p = levels[i]; p; p = p->link) {
489 			p->out_use |= JT(p)->in_use | JF(p)->in_use;
490 			p->in_use |= p->out_use &~ p->kill;
491 		}
492 	}
493 }
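
/*
 * Illustrative note (not part of the original source): the second loop is a
 * single backward liveness pass over the acyclic graph, from the leaves
 * (level 0) up:
 *
 *	out_use(b) = in_use(JT(b)) | in_use(JF(b))
 *	in_use(b) |= out_use(b) & ~kill(b)
 *
 * An atom is live on exit from 'b' if some successor reads it, and live on
 * entry unless 'b' overwrites it before using it.
 */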
494 
495 /*
496  * These data structures are used in a Cocke and Schwartz style
497  * value numbering scheme.  Since the flowgraph is acyclic,
498  * exit values can be propagated from a node's predecessors
499  * provided it is uniquely defined.
500  */
501 struct valnode {
502 	int code;
503 	int v0, v1;
504 	int val;
505 	struct valnode *next;
506 };
507 
508 #define MODULUS 213
509 static struct valnode *hashtbl[MODULUS];
510 static int curval;
511 static int maxval;
512 
513 /* Integer constants mapped with the load immediate opcode. */
514 #define K(i) F(BPF_LD|BPF_IMM|BPF_W, i, 0L)
515 
516 struct vmapinfo {
517 	int is_const;
518 	bpf_int32 const_val;
519 };
520 
521 struct vmapinfo *vmap;
522 struct valnode *vnode_base;
523 struct valnode *next_vnode;
524 
525 static void
526 init_val()
527 {
528 	curval = 0;
529 	next_vnode = vnode_base;
530 	memset((char *)vmap, 0, maxval * sizeof(*vmap));
531 	memset((char *)hashtbl, 0, sizeof hashtbl);
532 }
533 
534 /* Because we really don't have an IR, this stuff is a little messy. */
535 static int
536 F(code, v0, v1)
537 	int code;
538 	int v0, v1;
539 {
540 	u_int hash;
541 	int val;
542 	struct valnode *p;
543 
544 	hash = (u_int)code ^ (v0 << 4) ^ (v1 << 8);
545 	hash %= MODULUS;
546 
547 	for (p = hashtbl[hash]; p; p = p->next)
548 		if (p->code == code && p->v0 == v0 && p->v1 == v1)
549 			return p->val;
550 
551 	val = ++curval;
552 	if (BPF_MODE(code) == BPF_IMM &&
553 	    (BPF_CLASS(code) == BPF_LD || BPF_CLASS(code) == BPF_LDX)) {
554 		vmap[val].const_val = v0;
555 		vmap[val].is_const = 1;
556 	}
557 	p = next_vnode++;
558 	p->val = val;
559 	p->code = code;
560 	p->v0 = v0;
561 	p->v1 = v1;
562 	p->next = hashtbl[hash];
563 	hashtbl[hash] = p;
564 
565 	return val;
566 }
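
/*
 * Illustrative example (not part of the original source): F() hands out one
 * value number per distinct (code, v0, v1) triple, so identical computations
 * get identical numbers:
 *
 *	v1 = F(BPF_LD|BPF_ABS|BPF_B, 14, 0L);	new entry in hashtbl
 *	v2 = F(BPF_LD|BPF_ABS|BPF_B, 14, 0L);	found, so v2 == v1
 *	v3 = K(2);				immediate; vmap[v3].is_const
 *						is set for later folding
 *
 * opt_stmt() relies on this to detect redundant loads and constant operands.
 */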
567 
568 static inline void
569 vstore(s, valp, newval, alter)
570 	struct stmt *s;
571 	int *valp;
572 	int newval;
573 	int alter;
574 {
575 	if (alter && *valp == newval)
576 		s->code = NOP;
577 	else
578 		*valp = newval;
579 }
580 
581 static void
582 fold_op(s, v0, v1)
583 	struct stmt *s;
584 	int v0, v1;
585 {
586 	bpf_int32 a, b;
587 
588 	a = vmap[v0].const_val;
589 	b = vmap[v1].const_val;
590 
591 	switch (BPF_OP(s->code)) {
592 	case BPF_ADD:
593 		a += b;
594 		break;
595 
596 	case BPF_SUB:
597 		a -= b;
598 		break;
599 
600 	case BPF_MUL:
601 		a *= b;
602 		break;
603 
604 	case BPF_DIV:
605 		if (b == 0)
606 			bpf_error("division by zero");
607 		a /= b;
608 		break;
609 
610 	case BPF_AND:
611 		a &= b;
612 		break;
613 
614 	case BPF_OR:
615 		a |= b;
616 		break;
617 
618 	case BPF_LSH:
619 		a <<= b;
620 		break;
621 
622 	case BPF_RSH:
623 		a >>= b;
624 		break;
625 
626 	case BPF_NEG:
627 		a = -a;
628 		break;
629 
630 	default:
631 		abort();
632 	}
633 	s->k = a;
634 	s->code = BPF_LD|BPF_IMM;
635 	done = 0;
636 }
637 
638 static inline struct slist *
639 this_op(s)
640 	struct slist *s;
641 {
642 	while (s != 0 && s->s.code == NOP)
643 		s = s->next;
644 	return s;
645 }
646 
647 static void
648 opt_not(b)
649 	struct block *b;
650 {
651 	struct block *tmp = JT(b);
652 
653 	JT(b) = JF(b);
654 	JF(b) = tmp;
655 }
656 
657 static void
658 opt_peep(b)
659 	struct block *b;
660 {
661 	struct slist *s;
662 	struct slist *next, *last;
663 	int val;
664 
665 	s = b->stmts;
666 	if (s == 0)
667 		return;
668 
669 	last = s;
670 	while (1) {
671 		s = this_op(s);
672 		if (s == 0)
673 			break;
674 		next = this_op(s->next);
675 		if (next == 0)
676 			break;
677 		last = next;
678 
679 		/*
680 		 * st  M[k]	-->	st  M[k]
681 		 * ldx M[k]		tax
682 		 */
683 		if (s->s.code == BPF_ST &&
684 		    next->s.code == (BPF_LDX|BPF_MEM) &&
685 		    s->s.k == next->s.k) {
686 			done = 0;
687 			next->s.code = BPF_MISC|BPF_TAX;
688 		}
689 		/*
690 		 * ld  #k	-->	ldx  #k
691 		 * tax			txa
692 		 */
693 		if (s->s.code == (BPF_LD|BPF_IMM) &&
694 		    next->s.code == (BPF_MISC|BPF_TAX)) {
695 			s->s.code = BPF_LDX|BPF_IMM;
696 			next->s.code = BPF_MISC|BPF_TXA;
697 			done = 0;
698 		}
699 		/*
700 		 * This is an ugly special case, but it happens
701 		 * when you say tcp[k] or udp[k] where k is a constant.
702 		 */
703 		if (s->s.code == (BPF_LD|BPF_IMM)) {
704 			struct slist *add, *tax, *ild;
705 
706 			/*
707 			 * Check that X isn't used on exit from this
708 			 * block (which the optimizer might cause).
709 			 * We know the code generator won't generate
710 			 * any local dependencies.
711 			 */
712 			if (ATOMELEM(b->out_use, X_ATOM))
713 				break;
714 
715 			if (next->s.code != (BPF_LDX|BPF_MSH|BPF_B))
716 				add = next;
717 			else
718 				add = this_op(next->next);
719 			if (add == 0 || add->s.code != (BPF_ALU|BPF_ADD|BPF_X))
720 				break;
721 
722 			tax = this_op(add->next);
723 			if (tax == 0 || tax->s.code != (BPF_MISC|BPF_TAX))
724 				break;
725 
726 			ild = this_op(tax->next);
727 			if (ild == 0 || BPF_CLASS(ild->s.code) != BPF_LD ||
728 			    BPF_MODE(ild->s.code) != BPF_IND)
729 				break;
730 			/*
731 			 * XXX We need to check that X is not
732 			 * subsequently used.  We know we can eliminate the
733 			 * accumulator modifications since it is defined
734 			 * by the last stmt of this sequence.
735 			 *
736 			 * We want to turn this sequence:
737 			 *
738 			 * (004) ldi     #0x2		{s}
739 			 * (005) ldxms   [14]		{next}  -- optional
740 			 * (006) addx			{add}
741 			 * (007) tax			{tax}
742 			 * (008) ild     [x+0]		{ild}
743 			 *
744 			 * into this sequence:
745 			 *
746 			 * (004) nop
747 			 * (005) ldxms   [14]
748 			 * (006) nop
749 			 * (007) nop
750 			 * (008) ild     [x+2]
751 			 *
752 			 */
753 			ild->s.k += s->s.k;
754 			s->s.code = NOP;
755 			add->s.code = NOP;
756 			tax->s.code = NOP;
757 			done = 0;
758 		}
759 		s = next;
760 	}
761 	/*
762 	 * If we have a subtract to do a comparison, and the X register
763 	 * is a known constant, we can merge this value into the
764 	 * comparison.
765 	 */
766 	if (last->s.code == (BPF_ALU|BPF_SUB|BPF_X) &&
767 	    !ATOMELEM(b->out_use, A_ATOM)) {
768 		val = b->val[X_ATOM];
769 		if (vmap[val].is_const) {
770 			int op;
771 
772 			b->s.k += vmap[val].const_val;
773 			op = BPF_OP(b->s.code);
774 			if (op == BPF_JGT || op == BPF_JGE) {
775 				struct block *t = JT(b);
776 				JT(b) = JF(b);
777 				JF(b) = t;
778 				b->s.k += 0x80000000;
779 			}
780 			last->s.code = NOP;
781 			done = 0;
782 		} else if (b->s.k == 0) {
783 			/*
784 			 * sub x  ->	nop
785 			 * j  #0	j  x
786 			 */
787 			last->s.code = NOP;
788 			b->s.code = BPF_CLASS(b->s.code) | BPF_OP(b->s.code) |
789 				BPF_X;
790 			done = 0;
791 		}
792 	}
793 	/*
794 	 * Likewise, a constant subtract can be simplified.
795 	 */
796 	else if (last->s.code == (BPF_ALU|BPF_SUB|BPF_K) &&
797 		 !ATOMELEM(b->out_use, A_ATOM)) {
798 		int op;
799 
800 		b->s.k += last->s.k;
801 		last->s.code = NOP;
802 		op = BPF_OP(b->s.code);
803 		if (op == BPF_JGT || op == BPF_JGE) {
804 			struct block *t = JT(b);
805 			JT(b) = JF(b);
806 			JF(b) = t;
807 			b->s.k += 0x80000000;
808 		}
809 		done = 0;
810 	}
811 	/*
812 	 * and #k	nop
813 	 * jeq #0  ->	jset #k
814 	 */
815 	if (last->s.code == (BPF_ALU|BPF_AND|BPF_K) &&
816 	    !ATOMELEM(b->out_use, A_ATOM) && b->s.k == 0) {
817 		b->s.k = last->s.k;
818 		b->s.code = BPF_JMP|BPF_K|BPF_JSET;
819 		last->s.code = NOP;
820 		done = 0;
821 		opt_not(b);
822 	}
823 	/*
824 	 * If the accumulator is a known constant, we can compute the
825 	 * comparison result.
826 	 */
827 	val = b->val[A_ATOM];
828 	if (vmap[val].is_const && BPF_SRC(b->s.code) == BPF_K) {
829 		bpf_int32 v = vmap[val].const_val;
830 		switch (BPF_OP(b->s.code)) {
831 
832 		case BPF_JEQ:
833 			v = v == b->s.k;
834 			break;
835 
836 		case BPF_JGT:
837 			v = (unsigned)v > b->s.k;
838 			break;
839 
840 		case BPF_JGE:
841 			v = (unsigned)v >= b->s.k;
842 			break;
843 
844 		case BPF_JSET:
845 			v &= b->s.k;
846 			break;
847 
848 		default:
849 			abort();
850 		}
851 		if (JF(b) != JT(b))
852 			done = 0;
853 		if (v)
854 			JF(b) = JT(b);
855 		else
856 			JT(b) = JF(b);
857 	}
858 }
859 
860 /*
861  * Compute the symbolic value of the expression in 's', and update
862  * anything it defines in the value table 'val'.  If 'alter' is true,
863  * do various optimizations.  This code would be cleaner if symbolic
864  * evaluation and code transformations weren't folded together.
865  */
866 static void
867 opt_stmt(s, val, alter)
868 	struct stmt *s;
869 	int val[];
870 	int alter;
871 {
872 	int op;
873 	int v;
874 
875 	switch (s->code) {
876 
877 	case BPF_LD|BPF_ABS|BPF_W:
878 	case BPF_LD|BPF_ABS|BPF_H:
879 	case BPF_LD|BPF_ABS|BPF_B:
880 		v = F(s->code, s->k, 0L);
881 		vstore(s, &val[A_ATOM], v, alter);
882 		break;
883 
884 	case BPF_LD|BPF_IND|BPF_W:
885 	case BPF_LD|BPF_IND|BPF_H:
886 	case BPF_LD|BPF_IND|BPF_B:
887 		v = val[X_ATOM];
888 		if (alter && vmap[v].is_const) {
889 			s->code = BPF_LD|BPF_ABS|BPF_SIZE(s->code);
890 			s->k += vmap[v].const_val;
891 			v = F(s->code, s->k, 0L);
892 			done = 0;
893 		}
894 		else
895 			v = F(s->code, s->k, v);
896 		vstore(s, &val[A_ATOM], v, alter);
897 		break;
898 
899 	case BPF_LD|BPF_LEN:
900 		v = F(s->code, 0L, 0L);
901 		vstore(s, &val[A_ATOM], v, alter);
902 		break;
903 
904 	case BPF_LD|BPF_IMM:
905 		v = K(s->k);
906 		vstore(s, &val[A_ATOM], v, alter);
907 		break;
908 
909 	case BPF_LDX|BPF_IMM:
910 		v = K(s->k);
911 		vstore(s, &val[X_ATOM], v, alter);
912 		break;
913 
914 	case BPF_LDX|BPF_MSH|BPF_B:
915 		v = F(s->code, s->k, 0L);
916 		vstore(s, &val[X_ATOM], v, alter);
917 		break;
918 
919 	case BPF_ALU|BPF_NEG:
920 		if (alter && vmap[val[A_ATOM]].is_const) {
921 			s->code = BPF_LD|BPF_IMM;
922 			s->k = -vmap[val[A_ATOM]].const_val;
923 			val[A_ATOM] = K(s->k);
924 		}
925 		else
926 			val[A_ATOM] = F(s->code, val[A_ATOM], 0L);
927 		break;
928 
929 	case BPF_ALU|BPF_ADD|BPF_K:
930 	case BPF_ALU|BPF_SUB|BPF_K:
931 	case BPF_ALU|BPF_MUL|BPF_K:
932 	case BPF_ALU|BPF_DIV|BPF_K:
933 	case BPF_ALU|BPF_AND|BPF_K:
934 	case BPF_ALU|BPF_OR|BPF_K:
935 	case BPF_ALU|BPF_LSH|BPF_K:
936 	case BPF_ALU|BPF_RSH|BPF_K:
937 		op = BPF_OP(s->code);
938 		if (alter) {
939 			if (s->k == 0) {
940 				if (op == BPF_ADD || op == BPF_SUB ||
941 				    op == BPF_LSH || op == BPF_RSH ||
942 				    op == BPF_OR) {
943 					s->code = NOP;
944 					break;
945 				}
946 				if (op == BPF_MUL || op == BPF_AND) {
947 					s->code = BPF_LD|BPF_IMM;
948 					val[A_ATOM] = K(s->k);
949 					break;
950 				}
951 			}
952 			if (vmap[val[A_ATOM]].is_const) {
953 				fold_op(s, val[A_ATOM], K(s->k));
954 				val[A_ATOM] = K(s->k);
955 				break;
956 			}
957 		}
958 		val[A_ATOM] = F(s->code, val[A_ATOM], K(s->k));
959 		break;
960 
961 	case BPF_ALU|BPF_ADD|BPF_X:
962 	case BPF_ALU|BPF_SUB|BPF_X:
963 	case BPF_ALU|BPF_MUL|BPF_X:
964 	case BPF_ALU|BPF_DIV|BPF_X:
965 	case BPF_ALU|BPF_AND|BPF_X:
966 	case BPF_ALU|BPF_OR|BPF_X:
967 	case BPF_ALU|BPF_LSH|BPF_X:
968 	case BPF_ALU|BPF_RSH|BPF_X:
969 		op = BPF_OP(s->code);
970 		if (alter && vmap[val[X_ATOM]].is_const) {
971 			if (vmap[val[A_ATOM]].is_const) {
972 				fold_op(s, val[A_ATOM], val[X_ATOM]);
973 				val[A_ATOM] = K(s->k);
974 			}
975 			else {
976 				s->code = BPF_ALU|BPF_K|op;
977 				s->k = vmap[val[X_ATOM]].const_val;
978 				done = 0;
979 				val[A_ATOM] =
980 					F(s->code, val[A_ATOM], K(s->k));
981 			}
982 			break;
983 		}
984 		/*
985 		 * Check if we're doing something to an accumulator
986 		 * that is 0, and simplify.  This may not seem like
987 		 * much of a simplification but it could open up further
988 		 * optimizations.
989 		 * XXX We could also check for mul by 1, and -1, etc.
990 		 */
991 		if (alter && vmap[val[A_ATOM]].is_const
992 		    && vmap[val[A_ATOM]].const_val == 0) {
993 			if (op == BPF_ADD || op == BPF_OR ||
994 			    op == BPF_LSH || op == BPF_RSH || op == BPF_SUB) {
995 				s->code = BPF_MISC|BPF_TXA;
996 				vstore(s, &val[A_ATOM], val[X_ATOM], alter);
997 				break;
998 			}
999 			else if (op == BPF_MUL || op == BPF_DIV ||
1000 				 op == BPF_AND) {
1001 				s->code = BPF_LD|BPF_IMM;
1002 				s->k = 0;
1003 				vstore(s, &val[A_ATOM], K(s->k), alter);
1004 				break;
1005 			}
1006 			else if (op == BPF_NEG) {
1007 				s->code = NOP;
1008 				break;
1009 			}
1010 		}
1011 		val[A_ATOM] = F(s->code, val[A_ATOM], val[X_ATOM]);
1012 		break;
1013 
1014 	case BPF_MISC|BPF_TXA:
1015 		vstore(s, &val[A_ATOM], val[X_ATOM], alter);
1016 		break;
1017 
1018 	case BPF_LD|BPF_MEM:
1019 		v = val[s->k];
1020 		if (alter && vmap[v].is_const) {
1021 			s->code = BPF_LD|BPF_IMM;
1022 			s->k = vmap[v].const_val;
1023 			done = 0;
1024 		}
1025 		vstore(s, &val[A_ATOM], v, alter);
1026 		break;
1027 
1028 	case BPF_MISC|BPF_TAX:
1029 		vstore(s, &val[X_ATOM], val[A_ATOM], alter);
1030 		break;
1031 
1032 	case BPF_LDX|BPF_MEM:
1033 		v = val[s->k];
1034 		if (alter && vmap[v].is_const) {
1035 			s->code = BPF_LDX|BPF_IMM;
1036 			s->k = vmap[v].const_val;
1037 			done = 0;
1038 		}
1039 		vstore(s, &val[X_ATOM], v, alter);
1040 		break;
1041 
1042 	case BPF_ST:
1043 		vstore(s, &val[s->k], val[A_ATOM], alter);
1044 		break;
1045 
1046 	case BPF_STX:
1047 		vstore(s, &val[s->k], val[X_ATOM], alter);
1048 		break;
1049 	}
1050 }
1051 
1052 static void
1053 deadstmt(s, last)
1054 	register struct stmt *s;
1055 	register struct stmt *last[];
1056 {
1057 	register int atom;
1058 
1059 	atom = atomuse(s);
1060 	if (atom >= 0) {
1061 		if (atom == AX_ATOM) {
1062 			last[X_ATOM] = 0;
1063 			last[A_ATOM] = 0;
1064 		}
1065 		else
1066 			last[atom] = 0;
1067 	}
1068 	atom = atomdef(s);
1069 	if (atom >= 0) {
1070 		if (last[atom]) {
1071 			done = 0;
1072 			last[atom]->code = NOP;
1073 		}
1074 		last[atom] = s;
1075 	}
1076 }
1077 
1078 static void
1079 opt_deadstores(b)
1080 	register struct block *b;
1081 {
1082 	register struct slist *s;
1083 	register int atom;
1084 	struct stmt *last[N_ATOMS];
1085 
1086 	memset((char *)last, 0, sizeof last);
1087 
1088 	for (s = b->stmts; s != 0; s = s->next)
1089 		deadstmt(&s->s, last);
1090 	deadstmt(&b->s, last);
1091 
1092 	for (atom = 0; atom < N_ATOMS; ++atom)
1093 		if (last[atom] && !ATOMELEM(b->out_use, atom)) {
1094 			last[atom]->code = NOP;
1095 			done = 0;
1096 		}
1097 }
1098 
1099 static void
1100 opt_blk(b, do_stmts)
1101 	struct block *b;
1102 	int do_stmts;
1103 {
1104 	struct slist *s;
1105 	struct edge *p;
1106 	int i;
1107 	bpf_int32 aval;
1108 
1109 	/*
1110 	 * Initialize the atom values.
1111 	 * If we have no predecessors, everything is undefined.
1112 	 * Otherwise, we inherit our values from our predecessors.
1113 	 * If any register has an ambiguous value (i.e. control paths are
1114 	 * merging), give it the undefined value of 0.
1115 	 */
1116 	p = b->in_edges;
1117 	if (p == 0)
1118 		memset((char *)b->val, 0, sizeof(b->val));
1119 	else {
1120 		memcpy((char *)b->val, (char *)p->pred->val, sizeof(b->val));
1121 		while ((p = p->next) != NULL) {
1122 			for (i = 0; i < N_ATOMS; ++i)
1123 				if (b->val[i] != p->pred->val[i])
1124 					b->val[i] = 0;
1125 		}
1126 	}
1127 	aval = b->val[A_ATOM];
1128 	for (s = b->stmts; s; s = s->next)
1129 		opt_stmt(&s->s, b->val, do_stmts);
1130 
1131 	/*
1132 	 * This is a special case: if we don't use anything from this
1133 	 * block, and we load the accumulator with a value that is
1134 	 * already there, or if this block is a return,
1135 	 * eliminate all the statements.
1136 	 */
1137 	if (do_stmts &&
1138 	    ((b->out_use == 0 && aval != 0 && b->val[A_ATOM] == aval) ||
1139 	     BPF_CLASS(b->s.code) == BPF_RET)) {
1140 		if (b->stmts != 0) {
1141 			b->stmts = 0;
1142 			done = 0;
1143 		}
1144 	} else {
1145 		opt_peep(b);
1146 		opt_deadstores(b);
1147 	}
1148 	/*
1149 	 * Set up values for branch optimizer.
1150 	 */
1151 	if (BPF_SRC(b->s.code) == BPF_K)
1152 		b->oval = K(b->s.k);
1153 	else
1154 		b->oval = b->val[X_ATOM];
1155 	b->et.code = b->s.code;
1156 	b->ef.code = -b->s.code;
1157 }
1158 
1159 /*
1160  * Return true if any register that is used on exit from 'succ' has
1161  * an exit value that is different from the corresponding exit value
1162  * from 'b'.
1163  */
1164 static int
1165 use_conflict(b, succ)
1166 	struct block *b, *succ;
1167 {
1168 	int atom;
1169 	atomset use = succ->out_use;
1170 
1171 	if (use == 0)
1172 		return 0;
1173 
1174 	for (atom = 0; atom < N_ATOMS; ++atom)
1175 		if (ATOMELEM(use, atom))
1176 			if (b->val[atom] != succ->val[atom])
1177 				return 1;
1178 	return 0;
1179 }
1180 
1181 static struct block *
1182 fold_edge(child, ep)
1183 	struct block *child;
1184 	struct edge *ep;
1185 {
1186 	int sense;
1187 	int aval0, aval1, oval0, oval1;
1188 	int code = ep->code;
1189 
1190 	if (code < 0) {
1191 		code = -code;
1192 		sense = 0;
1193 	} else
1194 		sense = 1;
1195 
1196 	if (child->s.code != code)
1197 		return 0;
1198 
1199 	aval0 = child->val[A_ATOM];
1200 	oval0 = child->oval;
1201 	aval1 = ep->pred->val[A_ATOM];
1202 	oval1 = ep->pred->oval;
1203 
1204 	if (aval0 != aval1)
1205 		return 0;
1206 
1207 	if (oval0 == oval1)
1208 		/*
1209 		 * The operands are identical, so the
1210 		 * result is true if a true branch was
1211 		 * taken to get here, otherwise false.
1212 		 */
1213 		return sense ? JT(child) : JF(child);
1214 
1215 	if (sense && code == (BPF_JMP|BPF_JEQ|BPF_K))
1216 		/*
1217 		 * At this point, we only know the comparison if we
1218 		 * came down the true branch, and it was an equality
1219 		 * comparison with a constant.  We rely on the fact that
1220 		 * distinct constants have distinct value numbers.
1221 		 */
1222 		return JF(child);
1223 
1224 	return 0;
1225 }
1226 
1227 static void
1228 opt_j(ep)
1229 	struct edge *ep;
1230 {
1231 	register int i, k;
1232 	register struct block *target;
1233 
1234 	if (JT(ep->succ) == 0)
1235 		return;
1236 
1237 	if (JT(ep->succ) == JF(ep->succ)) {
1238 		/*
1239 		 * Common branch targets can be eliminated, provided
1240 		 * there is no data dependency.
1241 		 */
1242 		if (!use_conflict(ep->pred, ep->succ->et.succ)) {
1243 			done = 0;
1244 			ep->succ = JT(ep->succ);
1245 		}
1246 	}
1247 	/*
1248 	 * For each edge dominator that matches the successor of this
1249 	 * edge, promote the edge successor to its grandchild.
1250 	 *
1251 	 * XXX We violate the set abstraction here in favor of a reasonably
1252 	 * efficient loop.
1253 	 */
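	/*
	 * Illustrative note (not part of the original source): the loop
	 * below walks the edge-dominator bit vector one word at a time and
	 * uses ffs() to peel off each set bit:
	 *
	 *	k = ffs(x) - 1;		index of the lowest set bit
	 *	x &= ~(1 << k);		clear it so the inner loop advances
	 *	k += i * BITS_PER_WORD;	convert to a global edge index
	 *
	 * so edges[k] ranges over exactly the edges that dominate 'ep'.
	 */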
1254  top:
1255 	for (i = 0; i < edgewords; ++i) {
1256 		register bpf_u_int32 x = ep->edom[i];
1257 
1258 		while (x != 0) {
1259 			k = ffs(x) - 1;
1260 			x &=~ (1 << k);
1261 			k += i * BITS_PER_WORD;
1262 
1263 			target = fold_edge(ep->succ, edges[k]);
1264 			/*
1265 			 * Check that there is no data dependency between
1266 			 * Check that moving the edge would not violate any
1267 			 * data dependency between the nodes.
1268 			if (target != 0 && !use_conflict(ep->pred, target)) {
1269 				done = 0;
1270 				ep->succ = target;
1271 				if (JT(target) != 0)
1272 					/*
1273 					 * Start over unless we hit a leaf.
1274 					 */
1275 					goto top;
1276 				return;
1277 			}
1278 		}
1279 	}
1280 }
1281 
1282 
1283 static void
1284 or_pullup(b)
1285 	struct block *b;
1286 {
1287 	int val, at_top;
1288 	struct block *pull;
1289 	struct block **diffp, **samep;
1290 	struct edge *ep;
1291 
1292 	ep = b->in_edges;
1293 	if (ep == 0)
1294 		return;
1295 
1296 	/*
1297 	 * Make sure each predecessor loads the same value.
1298 	 * XXX why?
1299 	 */
1300 	val = ep->pred->val[A_ATOM];
1301 	for (ep = ep->next; ep != 0; ep = ep->next)
1302 		if (val != ep->pred->val[A_ATOM])
1303 			return;
1304 
1305 	if (JT(b->in_edges->pred) == b)
1306 		diffp = &JT(b->in_edges->pred);
1307 	else
1308 		diffp = &JF(b->in_edges->pred);
1309 
1310 	at_top = 1;
1311 	while (1) {
1312 		if (*diffp == 0)
1313 			return;
1314 
1315 		if (JT(*diffp) != JT(b))
1316 			return;
1317 
1318 		if (!SET_MEMBER((*diffp)->dom, b->id))
1319 			return;
1320 
1321 		if ((*diffp)->val[A_ATOM] != val)
1322 			break;
1323 
1324 		diffp = &JF(*diffp);
1325 		at_top = 0;
1326 	}
1327 	samep = &JF(*diffp);
1328 	while (1) {
1329 		if (*samep == 0)
1330 			return;
1331 
1332 		if (JT(*samep) != JT(b))
1333 			return;
1334 
1335 		if (!SET_MEMBER((*samep)->dom, b->id))
1336 			return;
1337 
1338 		if ((*samep)->val[A_ATOM] == val)
1339 			break;
1340 
1341 		/* XXX Need to check that there are no data dependencies
1342 		   between diffp and samep.  Currently, the code generator
1343 		   will not produce such dependencies. */
1344 		samep = &JF(*samep);
1345 	}
1346 #ifdef notdef
1347 	/* XXX This doesn't cover everything. */
1348 	for (i = 0; i < N_ATOMS; ++i)
1349 		if ((*samep)->val[i] != pred->val[i])
1350 			return;
1351 #endif
1352 	/* Pull up the node. */
1353 	pull = *samep;
1354 	*samep = JF(pull);
1355 	JF(pull) = *diffp;
1356 
1357 	/*
1358 	 * At the top of the chain, each predecessor needs to point at the
1359 	 * pulled up node.  Inside the chain, there is only one predecessor
1360 	 * to worry about.
1361 	 */
1362 	if (at_top) {
1363 		for (ep = b->in_edges; ep != 0; ep = ep->next) {
1364 			if (JT(ep->pred) == b)
1365 				JT(ep->pred) = pull;
1366 			else
1367 				JF(ep->pred) = pull;
1368 		}
1369 	}
1370 	else
1371 		*diffp = pull;
1372 
1373 	done = 0;
1374 }
1375 
1376 static void
1377 and_pullup(b)
1378 	struct block *b;
1379 {
1380 	int val, at_top;
1381 	struct block *pull;
1382 	struct block **diffp, **samep;
1383 	struct edge *ep;
1384 
1385 	ep = b->in_edges;
1386 	if (ep == 0)
1387 		return;
1388 
1389 	/*
1390 	 * Make sure each predecessor loads the same value.
1391 	 */
1392 	val = ep->pred->val[A_ATOM];
1393 	for (ep = ep->next; ep != 0; ep = ep->next)
1394 		if (val != ep->pred->val[A_ATOM])
1395 			return;
1396 
1397 	if (JT(b->in_edges->pred) == b)
1398 		diffp = &JT(b->in_edges->pred);
1399 	else
1400 		diffp = &JF(b->in_edges->pred);
1401 
1402 	at_top = 1;
1403 	while (1) {
1404 		if (*diffp == 0)
1405 			return;
1406 
1407 		if (JF(*diffp) != JF(b))
1408 			return;
1409 
1410 		if (!SET_MEMBER((*diffp)->dom, b->id))
1411 			return;
1412 
1413 		if ((*diffp)->val[A_ATOM] != val)
1414 			break;
1415 
1416 		diffp = &JT(*diffp);
1417 		at_top = 0;
1418 	}
1419 	samep = &JT(*diffp);
1420 	while (1) {
1421 		if (*samep == 0)
1422 			return;
1423 
1424 		if (JF(*samep) != JF(b))
1425 			return;
1426 
1427 		if (!SET_MEMBER((*samep)->dom, b->id))
1428 			return;
1429 
1430 		if ((*samep)->val[A_ATOM] == val)
1431 			break;
1432 
1433 		/* XXX Need to check that there are no data dependencies
1434 		   between diffp and samep.  Currently, the code generator
1435 		   will not produce such dependencies. */
1436 		samep = &JT(*samep);
1437 	}
1438 #ifdef notdef
1439 	/* XXX This doesn't cover everything. */
1440 	for (i = 0; i < N_ATOMS; ++i)
1441 		if ((*samep)->val[i] != pred->val[i])
1442 			return;
1443 #endif
1444 	/* Pull up the node. */
1445 	pull = *samep;
1446 	*samep = JT(pull);
1447 	JT(pull) = *diffp;
1448 
1449 	/*
1450 	 * At the top of the chain, each predecessor needs to point at the
1451 	 * pulled up node.  Inside the chain, there is only one predecessor
1452 	 * to worry about.
1453 	 */
1454 	if (at_top) {
1455 		for (ep = b->in_edges; ep != 0; ep = ep->next) {
1456 			if (JT(ep->pred) == b)
1457 				JT(ep->pred) = pull;
1458 			else
1459 				JF(ep->pred) = pull;
1460 		}
1461 	}
1462 	else
1463 		*diffp = pull;
1464 
1465 	done = 0;
1466 }
1467 
1468 static void
1469 opt_blks(root, do_stmts)
1470 	struct block *root;
1471 	int do_stmts;
1472 {
1473 	int i, maxlevel;
1474 	struct block *p;
1475 
1476 	init_val();
1477 	maxlevel = root->level;
1478 	for (i = maxlevel; i >= 0; --i)
1479 		for (p = levels[i]; p; p = p->link)
1480 			opt_blk(p, do_stmts);
1481 
1482 	if (do_stmts)
1483 		/*
1484 		 * No point trying to move branches; it can't possibly
1485 		 * make a difference at this point.
1486 		 */
1487 		return;
1488 
1489 	for (i = 1; i <= maxlevel; ++i) {
1490 		for (p = levels[i]; p; p = p->link) {
1491 			opt_j(&p->et);
1492 			opt_j(&p->ef);
1493 		}
1494 	}
1495 	for (i = 1; i <= maxlevel; ++i) {
1496 		for (p = levels[i]; p; p = p->link) {
1497 			or_pullup(p);
1498 			and_pullup(p);
1499 		}
1500 	}
1501 }
1502 
1503 static inline void
1504 link_inedge(parent, child)
1505 	struct edge *parent;
1506 	struct block *child;
1507 {
1508 	parent->next = child->in_edges;
1509 	child->in_edges = parent;
1510 }
1511 
1512 static void
1513 find_inedges(root)
1514 	struct block *root;
1515 {
1516 	int i;
1517 	struct block *b;
1518 
1519 	for (i = 0; i < n_blocks; ++i)
1520 		blocks[i]->in_edges = 0;
1521 
1522 	/*
1523 	 * Traverse the graph, adding each edge to the predecessor
1524 	 * list of its successors.  Skip the leaves (i.e. level 0).
1525 	 */
1526 	for (i = root->level; i > 0; --i) {
1527 		for (b = levels[i]; b != 0; b = b->link) {
1528 			link_inedge(&b->et, JT(b));
1529 			link_inedge(&b->ef, JF(b));
1530 		}
1531 	}
1532 }
1533 
1534 static void
1535 opt_root(b)
1536 	struct block **b;
1537 {
1538 	struct slist *tmp, *s;
1539 
1540 	s = (*b)->stmts;
1541 	(*b)->stmts = 0;
1542 	while (BPF_CLASS((*b)->s.code) == BPF_JMP && JT(*b) == JF(*b))
1543 		*b = JT(*b);
1544 
1545 	tmp = (*b)->stmts;
1546 	if (tmp != 0)
1547 		sappend(s, tmp);
1548 	(*b)->stmts = s;
1549 
1550 	/*
1551 	 * If the root node is a return, then there is no
1552 	 * point executing any statements (since the bpf machine
1553 	 * has no side effects).
1554 	 */
1555 	if (BPF_CLASS((*b)->s.code) == BPF_RET)
1556 		(*b)->stmts = 0;
1557 }
1558 
1559 static void
1560 opt_loop(root, do_stmts)
1561 	struct block *root;
1562 	int do_stmts;
1563 {
1564 
1565 #ifdef BDEBUG
1566 	if (dflag > 1)
1567 		opt_dump(root);
1568 #endif
1569 	do {
1570 		done = 1;
1571 		find_levels(root);
1572 		find_dom(root);
1573 		find_closure(root);
1574 		find_inedges(root);
1575 		find_ud(root);
1576 		find_edom(root);
1577 		opt_blks(root, do_stmts);
1578 #ifdef BDEBUG
1579 		if (dflag > 1)
1580 			opt_dump(root);
1581 #endif
1582 	} while (!done);
1583 }
1584 
1585 /*
1586  * Optimize the filter code in its dag representation.
1587  */
1588 void
1589 bpf_optimize(rootp)
1590 	struct block **rootp;
1591 {
1592 	struct block *root;
1593 
1594 	root = *rootp;
1595 
1596 	opt_init(root);
1597 	opt_loop(root, 0);
1598 	opt_loop(root, 1);
1599 	intern_blocks(root);
1600 	opt_root(rootp);
1601 	opt_cleanup();
1602 }
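
/*
 * Illustrative usage sketch (not part of the original source): a caller such
 * as pcap_compile() typically runs the block graph produced by the code
 * generator through bpf_optimize() and then flattens it with
 * icode_to_fcode():
 *
 *	struct block *root = ...;	built by gencode.c
 *	struct bpf_insn *insns;
 *	int len;
 *
 *	bpf_optimize(&root);
 *	insns = icode_to_fcode(root, &len);
 */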
1603 
1604 static void
1605 make_marks(p)
1606 	struct block *p;
1607 {
1608 	if (!isMarked(p)) {
1609 		Mark(p);
1610 		if (BPF_CLASS(p->s.code) != BPF_RET) {
1611 			make_marks(JT(p));
1612 			make_marks(JF(p));
1613 		}
1614 	}
1615 }
1616 
1617 /*
1618  * Mark code array such that isMarked(i) is true
1619  * only for nodes that are alive.
1620  */
1621 static void
1622 mark_code(p)
1623 	struct block *p;
1624 {
1625 	cur_mark += 1;
1626 	make_marks(p);
1627 }
1628 
1629 /*
1630  * True iff the two stmt lists load the same value from the packet into
1631  * the accumulator.
1632  */
1633 static int
1634 eq_slist(x, y)
1635 	struct slist *x, *y;
1636 {
1637 	while (1) {
1638 		while (x && x->s.code == NOP)
1639 			x = x->next;
1640 		while (y && y->s.code == NOP)
1641 			y = y->next;
1642 		if (x == 0)
1643 			return y == 0;
1644 		if (y == 0)
1645 			return x == 0;
1646 		if (x->s.code != y->s.code || x->s.k != y->s.k)
1647 			return 0;
1648 		x = x->next;
1649 		y = y->next;
1650 	}
1651 }
1652 
1653 static inline int
1654 eq_blk(b0, b1)
1655 	struct block *b0, *b1;
1656 {
1657 	if (b0->s.code == b1->s.code &&
1658 	    b0->s.k == b1->s.k &&
1659 	    b0->et.succ == b1->et.succ &&
1660 	    b0->ef.succ == b1->ef.succ)
1661 		return eq_slist(b0->stmts, b1->stmts);
1662 	return 0;
1663 }
1664 
1665 static void
1666 intern_blocks(root)
1667 	struct block *root;
1668 {
1669 	struct block *p;
1670 	int i, j;
1671 	int done;
1672  top:
1673 	done = 1;
1674 	for (i = 0; i < n_blocks; ++i)
1675 		blocks[i]->link = 0;
1676 
1677 	mark_code(root);
1678 
1679 	for (i = n_blocks - 1; --i >= 0; ) {
1680 		if (!isMarked(blocks[i]))
1681 			continue;
1682 		for (j = i + 1; j < n_blocks; ++j) {
1683 			if (!isMarked(blocks[j]))
1684 				continue;
1685 			if (eq_blk(blocks[i], blocks[j])) {
1686 				blocks[i]->link = blocks[j]->link ?
1687 					blocks[j]->link : blocks[j];
1688 				break;
1689 			}
1690 		}
1691 	}
1692 	for (i = 0; i < n_blocks; ++i) {
1693 		p = blocks[i];
1694 		if (JT(p) == 0)
1695 			continue;
1696 		if (JT(p)->link) {
1697 			done = 0;
1698 			JT(p) = JT(p)->link;
1699 		}
1700 		if (JF(p)->link) {
1701 			done = 0;
1702 			JF(p) = JF(p)->link;
1703 		}
1704 	}
1705 	if (!done)
1706 		goto top;
1707 }
1708 
1709 static void
1710 opt_cleanup()
1711 {
1712 	free((void *)vnode_base);
1713 	free((void *)vmap);
1714 	free((void *)edges);
1715 	free((void *)space);
1716 	free((void *)levels);
1717 	free((void *)blocks);
1718 }
1719 
1720 /*
1721  * Return the number of stmts in 's'.
1722  */
1723 static int
1724 slength(s)
1725 	struct slist *s;
1726 {
1727 	int n = 0;
1728 
1729 	for (; s; s = s->next)
1730 		if (s->s.code != NOP)
1731 			++n;
1732 	return n;
1733 }
1734 
1735 /*
1736  * Return the number of nodes reachable by 'p'.
1737  * All nodes should be initially unmarked.
1738  */
1739 static int
1740 count_blocks(p)
1741 	struct block *p;
1742 {
1743 	if (p == 0 || isMarked(p))
1744 		return 0;
1745 	Mark(p);
1746 	return count_blocks(JT(p)) + count_blocks(JF(p)) + 1;
1747 }
1748 
1749 /*
1750  * Do a depth first search on the flow graph, numbering the
1751  * basic blocks, and entering them into the 'blocks' array.
1752  */
1753 static void
1754 number_blks_r(p)
1755 	struct block *p;
1756 {
1757 	int n;
1758 
1759 	if (p == 0 || isMarked(p))
1760 		return;
1761 
1762 	Mark(p);
1763 	n = n_blocks++;
1764 	p->id = n;
1765 	blocks[n] = p;
1766 
1767 	number_blks_r(JT(p));
1768 	number_blks_r(JF(p));
1769 }
1770 
1771 /*
1772  * Return the number of stmts in the flowgraph reachable by 'p'.
1773  * The nodes should be unmarked before calling.
1774  */
1775 static int
1776 count_stmts(p)
1777 	struct block *p;
1778 {
1779 	int n;
1780 
1781 	if (p == 0 || isMarked(p))
1782 		return 0;
1783 	Mark(p);
1784 	n = count_stmts(JT(p)) + count_stmts(JF(p));
1785 	return slength(p->stmts) + n + 1;
1786 }
1787 
1788 /*
1789  * Allocate memory.  All allocation is done before optimization
1790  * is begun.  A linear bound on the size of all data structures is computed
1791  * from the total number of blocks and/or statements.
1792  */
1793 static void
1794 opt_init(root)
1795 	struct block *root;
1796 {
1797 	bpf_u_int32 *p;
1798 	int i, n, max_stmts;
1799 
1800 	/*
1801 	 * First, count the blocks, so we can malloc an array to map
1802 	 * block number to block.  Then, put the blocks into the array.
1803 	 */
1804 	unMarkAll();
1805 	n = count_blocks(root);
1806 	blocks = (struct block **)malloc(n * sizeof(*blocks));
1807 	unMarkAll();
1808 	n_blocks = 0;
1809 	number_blks_r(root);
1810 
1811 	n_edges = 2 * n_blocks;
1812 	edges = (struct edge **)malloc(n_edges * sizeof(*edges));
1813 
1814 	/*
1815 	 * The number of levels is bounded by the number of nodes.
1816 	 */
1817 	levels = (struct block **)malloc(n_blocks * sizeof(*levels));
1818 
1819 	edgewords = n_edges / (8 * sizeof(bpf_u_int32)) + 1;
1820 	nodewords = n_blocks / (8 * sizeof(bpf_u_int32)) + 1;
1821 
1822 	/* XXX */
1823 	space = (bpf_u_int32 *)malloc(2 * n_blocks * nodewords * sizeof(*space)
1824 				 + n_edges * edgewords * sizeof(*space));
1825 	p = space;
1826 	all_dom_sets = p;
1827 	for (i = 0; i < n; ++i) {
1828 		blocks[i]->dom = p;
1829 		p += nodewords;
1830 	}
1831 	all_closure_sets = p;
1832 	for (i = 0; i < n; ++i) {
1833 		blocks[i]->closure = p;
1834 		p += nodewords;
1835 	}
1836 	all_edge_sets = p;
1837 	for (i = 0; i < n; ++i) {
1838 		register struct block *b = blocks[i];
1839 
1840 		b->et.edom = p;
1841 		p += edgewords;
1842 		b->ef.edom = p;
1843 		p += edgewords;
1844 		b->et.id = i;
1845 		edges[i] = &b->et;
1846 		b->ef.id = n_blocks + i;
1847 		edges[n_blocks + i] = &b->ef;
1848 		b->et.pred = b;
1849 		b->ef.pred = b;
1850 	}
1851 	max_stmts = 0;
1852 	for (i = 0; i < n; ++i)
1853 		max_stmts += slength(blocks[i]->stmts) + 1;
1854 	/*
1855 	 * We allocate at most 3 value numbers per statement,
1856 	 * so this is an upper bound on the number of valnodes
1857 	 * we'll need.
1858 	 */
1859 	maxval = 3 * max_stmts;
1860 	vmap = (struct vmapinfo *)malloc(maxval * sizeof(*vmap));
1861 	vnode_base = (struct valnode *)malloc(maxval * sizeof(*vnode_base));
1862 }
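
/*
 * Illustrative note (not part of the original source): opt_init() carves the
 * single 'space' allocation into consecutive bit-vector regions:
 *
 *	space:	[ dom sets      ]  n_blocks * nodewords words
 *		[ closure sets  ]  n_blocks * nodewords words
 *		[ edge dom sets ]  n_edges  * edgewords words (et and ef)
 *
 * which matches the malloc above of
 * 2 * n_blocks * nodewords + n_edges * edgewords words.
 */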
1863 
1864 /*
1865  * Some pointers used to convert the basic block form of the code,
1866  * into the array form that BPF requires.  'fstart' will point to
1867  * the malloc'd array while 'ftail' is used during the recursive traversal.
1868  */
1869 static struct bpf_insn *fstart;
1870 static struct bpf_insn *ftail;
1871 
1872 #ifdef BDEBUG
1873 int bids[1000];
1874 #endif
1875 
1876 /*
1877  * Returns true if successful.  Returns false if a branch has
1878  * an offset that is too large.  If so, we have marked that
1879  * branch so that on a subsequent iteration, it will be treated
1880  * properly.
1881  */
1882 static int
1883 convert_code_r(p)
1884 	struct block *p;
1885 {
1886 	struct bpf_insn *dst;
1887 	struct slist *src;
1888 	int slen;
1889 	u_int off;
1890 	int extrajmps;		/* number of extra jumps inserted */
1891 
1892 	if (p == 0 || isMarked(p))
1893 		return (1);
1894 	Mark(p);
1895 
1896 	if (convert_code_r(JF(p)) == 0)
1897 		return (0);
1898 	if (convert_code_r(JT(p)) == 0)
1899 		return (0);
1900 
1901 	slen = slength(p->stmts);
1902 	dst = ftail -= (slen + 1 + p->longjt + p->longjf);
1903 		/* inflate length by any extra jumps */
1904 
1905 	p->offset = dst - fstart;
1906 
1907 	for (src = p->stmts; src; src = src->next) {
1908 		if (src->s.code == NOP)
1909 			continue;
1910 		dst->code = (u_short)src->s.code;
1911 		dst->k = src->s.k;
1912 		++dst;
1913 	}
1914 #ifdef BDEBUG
1915 	bids[dst - fstart] = p->id + 1;
1916 #endif
1917 	dst->code = (u_short)p->s.code;
1918 	dst->k = p->s.k;
1919 	if (JT(p)) {
1920 		extrajmps = 0;
1921 		off = JT(p)->offset - (p->offset + slen) - 1;
1922 		if (off >= 256) {
1923 		    /* offset too large for branch, must add a jump */
1924 		    if (p->longjt == 0) {
1925 		    	/* mark this instruction and retry */
1926 			p->longjt++;
1927 			return(0);
1928 		    }
1929 		    /* branch if T to following jump */
1930 		    dst->jt = extrajmps;
1931 		    extrajmps++;
1932 		    dst[extrajmps].code = BPF_JMP|BPF_JA;
1933 		    dst[extrajmps].k = off - extrajmps;
1934 		}
1935 		else
1936 		    dst->jt = off;
1937 		off = JF(p)->offset - (p->offset + slen) - 1;
1938 		if (off >= 256) {
1939 		    /* offset too large for branch, must add a jump */
1940 		    if (p->longjf == 0) {
1941 		    	/* mark this instruction and retry */
1942 			p->longjf++;
1943 			return(0);
1944 		    }
1945 		    /* branch if F to following jump */
1946 		    /* if two jumps are inserted, F goes to second one */
1947 		    dst->jf = extrajmps;
1948 		    extrajmps++;
1949 		    dst[extrajmps].code = BPF_JMP|BPF_JA;
1950 		    dst[extrajmps].k = off - extrajmps;
1951 		}
1952 		else
1953 		    dst->jf = off;
1954 	}
1955 	return (1);
1956 }
1957 
1958 
1959 /*
1960  * Convert flowgraph intermediate representation to the
1961  * BPF array representation.  Set *lenp to the number of instructions.
1962  */
1963 struct bpf_insn *
1964 icode_to_fcode(root, lenp)
1965 	struct block *root;
1966 	int *lenp;
1967 {
1968 	int n;
1969 	struct bpf_insn *fp;
1970 
1971 	/*
1972 	 * Loop doing convert_code_r() until no branches remain
1973 	 * with too-large offsets.
1974 	 */
1975 	while (1) {
1976 	    unMarkAll();
1977 	    n = *lenp = count_stmts(root);
1978 
1979 	    fp = (struct bpf_insn *)malloc(sizeof(*fp) * n);
1980 	    memset((char *)fp, 0, sizeof(*fp) * n);
1981 	    fstart = fp;
1982 	    ftail = fp + n;
1983 
1984 	    unMarkAll();
1985 	    if (convert_code_r(root))
1986 		break;
1987 	    free(fp);
1988 	}
1989 
1990 	return fp;
1991 }
1992 
1993 #ifdef BDEBUG
1994 static void
1995 opt_dump(root)
1996 	struct block *root;
1997 {
1998 	struct bpf_program f;
1999 
2000 	memset(bids, 0, sizeof bids);
2001 	f.bf_insns = icode_to_fcode(root, &f.bf_len);
2002 	bpf_dump(&f, 1);
2003 	putchar('\n');
2004 	free((char *)f.bf_insns);
2005 }
2006 #endif
2007