1 /*	$OpenBSD: optimize.c,v 1.2 1996/03/04 15:47:24 mickey Exp $	*/
2 /*	$NetBSD: optimize.c,v 1.3 1995/04/29 05:42:28 cgd Exp $	*/
3 
4 /*
5  * Copyright (c) 1988, 1989, 1990, 1991, 1993, 1994
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that: (1) source code distributions
10  * retain the above copyright notice and this paragraph in its entirety, (2)
11  * distributions including binary code include the above copyright notice and
12  * this paragraph in its entirety in the documentation or other materials
13  * provided with the distribution, and (3) all advertising materials mentioning
14  * features or use of this software display the following acknowledgement:
15  * ``This product includes software developed by the University of California,
16  * Lawrence Berkeley Laboratory and its contributors.'' Neither the name of
17  * the University nor the names of its contributors may be used to endorse
18  * or promote products derived from this software without specific prior
19  * written permission.
20  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
21  * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
22  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
23  *
24  *  Optimization module for tcpdump intermediate representation.
25  */
26 #ifndef lint
27 static char rcsid[] =
28     "@(#) Header: optimize.c,v 1.45 94/06/20 19:07:55 leres Exp (LBL)";
29 #endif
30 
31 #include <sys/types.h>
32 #include <sys/time.h>
33 
34 #include <net/bpf.h>
35 
36 #include <stdio.h>
37 #ifdef __osf__
38 #include <stdlib.h>
39 #include <malloc.h>
40 #endif
41 #ifdef __NetBSD__
42 #include <stdlib.h>
43 #endif
44 #include <memory.h>
45 
46 #include "gencode.h"
47 
48 #ifndef __GNUC__
49 #define inline
50 #endif
51 
52 #define A_ATOM BPF_MEMWORDS
53 #define X_ATOM (BPF_MEMWORDS+1)
54 
55 #define NOP -1
56 
57 /*
58  * This define is used to represent *both* the accumulator and
59  * x register in use-def computations.
60  * Currently, the use-def code assumes only one definition per instruction.
61  */
62 #define AX_ATOM N_ATOMS
63 
64 /*
65  * A flag to indicate that further optimization is needed.
66  * Iterative passes are continued until a given pass yields no
67  * branch movement.
68  */
69 static int done;
70 
71 /*
72  * A block is marked if and only if its mark equals the current mark.
73  * Rather than traverse the code array, marking each item, 'cur_mark' is
74  * incremented.  This automatically makes each element unmarked.
75  */
76 static int cur_mark;
77 #define isMarked(p) ((p)->mark == cur_mark)
78 #define unMarkAll() cur_mark += 1
79 #define Mark(p) ((p)->mark = cur_mark)
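
/*
 * For illustration: unMarkAll() only bumps cur_mark, so every block
 * whose ->mark still holds the previous value instantly tests as
 * unmarked, and Mark(p) re-stamps a block with the current value.
 * "Clear all marks" is therefore O(1) instead of a sweep over the
 * whole code array.
 */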
80 
81 static void opt_init(struct block *);
82 static void opt_cleanup(void);
83 
84 static void make_marks(struct block *);
85 static void mark_code(struct block *);
86 
87 static void intern_blocks(struct block *);
88 
89 static int eq_slist(struct slist *, struct slist *);
90 
91 static void find_levels_r(struct block *);
92 
93 static void find_levels(struct block *);
94 static void find_dom(struct block *);
95 static void propedom(struct edge *);
96 static void find_edom(struct block *);
97 static void find_closure(struct block *);
98 static int atomuse(struct stmt *);
99 static int atomdef(struct stmt *);
100 static void compute_local_ud(struct block *);
101 static void find_ud(struct block *);
102 static void init_val(void);
103 static long F(int, long, long);
104 static inline void vstore(struct stmt *, long *, long, int);
105 static void opt_blk(struct block *, int);
106 static int use_conflict(struct block *, struct block *);
107 static void opt_j(struct edge *);
108 static void or_pullup(struct block *);
109 static void and_pullup(struct block *);
110 static void opt_blks(struct block *, int);
111 static inline void link_inedge(struct edge *, struct block *);
112 static void find_inedges(struct block *);
113 static void opt_root(struct block **);
114 static void opt_loop(struct block *, int);
115 static void fold_op(struct stmt *, long, long);
116 static inline struct slist *this_op(struct slist *);
117 static void opt_not(struct block *);
118 static void opt_peep(struct block *);
119 static void opt_stmt(struct stmt *, long[], int);
120 static void deadstmt(struct stmt *, struct stmt *[]);
121 static void opt_deadstores(struct block *);
122 static void opt_blk(struct block *, int);
123 static int use_conflict(struct block *, struct block *);
124 static void opt_j(struct edge *);
125 static struct block *fold_edge(struct block *, struct edge *);
126 static inline int eq_blk(struct block *, struct block *);
127 static int slength(struct slist *);
128 static int count_blocks(struct block *);
129 static void number_blks_r(struct block *);
130 static int count_stmts(struct block *);
131 static void convert_code_r(struct block *);
132 
133 static int n_blocks;
134 struct block **blocks;
135 static int n_edges;
136 struct edge **edges;
137 
138 /*
139  * A bit vector set representation of the dominators.
140  * We round up the set size to the next power of two.
141  */
142 static int nodewords;
143 static int edgewords;
144 struct block **levels;
145 u_long *space;
146 #define BITS_PER_WORD (8*sizeof(u_long))
147 /*
148  * True if 'a' is in uset 'p'.
149  */
150 #define SET_MEMBER(p, a) \
151 ((p)[(unsigned)(a) / BITS_PER_WORD] & ((u_long)1 << ((unsigned)(a) % BITS_PER_WORD)))
152 
153 /*
154  * Add 'a' to uset p.
155  */
156 #define SET_INSERT(p, a) \
157 (p)[(unsigned)(a) / BITS_PER_WORD] |= ((u_long)1 << ((unsigned)(a) % BITS_PER_WORD))
158 
159 /*
160  * Delete 'a' from uset p.
161  */
162 #define SET_DELETE(p, a) \
163 (p)[(unsigned)(a) / BITS_PER_WORD] &= ~((u_long)1 << ((unsigned)(a) % BITS_PER_WORD))
164 
165 /*
166  * a := a intersect b
167  */
168 #define SET_INTERSECT(a, b, n)\
169 {\
170 	register u_long *_x = a, *_y = b;\
171 	register int _n = n;\
172 	while (--_n >= 0) *_x++ &= *_y++;\
173 }
174 
175 /*
176  * a := a - b
177  */
178 #define SET_SUBTRACT(a, b, n)\
179 {\
180 	register u_long *_x = a, *_y = b;\
181 	register int _n = n;\
182 	while (--_n >= 0) *_x++ &=~ *_y++;\
183 }
184 
185 /*
186  * a := a union b
187  */
188 #define SET_UNION(a, b, n)\
189 {\
190 	register u_long *_x = a, *_y = b;\
191 	register int _n = n;\
192 	while (--_n >= 0) *_x++ |= *_y++;\
193 }
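
/*
 * A worked example of the uset layout (illustrative, assuming 32-bit
 * u_long so BITS_PER_WORD is 32): block id 37 lives in word 37/32 == 1
 * at bit 37%32 == 5.  SET_INSERT(p, 37) sets that bit, SET_MEMBER(p, 37)
 * tests it, SET_DELETE(p, 37) clears it, and SET_INTERSECT, SET_SUBTRACT
 * and SET_UNION just walk the 'n' words of two sets combining them with
 * &, &~ and |.
 */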
194 
195 static uset all_dom_sets;
196 static uset all_closure_sets;
197 static uset all_edge_sets;
198 
199 #ifndef MAX
200 #define MAX(a,b) ((a)>(b)?(a):(b))
201 #endif
202 
203 static void
204 find_levels_r(b)
205 	struct block *b;
206 {
207 	int level;
208 
209 	if (isMarked(b))
210 		return;
211 
212 	Mark(b);
213 	b->link = 0;
214 
215 	if (JT(b)) {
216 		find_levels_r(JT(b));
217 		find_levels_r(JF(b));
218 		level = MAX(JT(b)->level, JF(b)->level) + 1;
219 	} else
220 		level = 0;
221 	b->level = level;
222 	b->link = levels[level];
223 	levels[level] = b;
224 }
225 
226 /*
227  * Level graph.  The levels go from 0 at the leaves to
228  * N_LEVELS at the root.  The levels[] array points to the
229  * first node of the level list, whose elements are linked
230  * with the 'link' field of the struct block.
231  */
232 static void
233 find_levels(root)
234 	struct block *root;
235 {
236 	memset((char *)levels, 0, n_blocks * sizeof(*levels));
237 	unMarkAll();
238 	find_levels_r(root);
239 }
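
/*
 * For example (hypothetical graph): if the root tests one condition,
 * each of its two successors tests another, and every path ends in a
 * BPF_RET leaf, then the leaves get level 0, the middle tests level 1
 * and the root level 2.  levels[2] then points at the root, and blocks
 * sharing a level are chained through their 'link' fields.
 */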
240 
241 /*
242  * Find dominator relationships.
243  * Assumes graph has been leveled.
244  */
245 static void
246 find_dom(root)
247 	struct block *root;
248 {
249 	int i;
250 	struct block *b;
251 	u_long *x;
252 
253 	/*
254 	 * Initialize sets to contain all nodes.
255 	 */
256 	x = all_dom_sets;
257 	i = n_blocks * nodewords;
258 	while (--i >= 0)
259 		*x++ = ~0;
260 	/* Root starts off empty. */
261 	for (i = nodewords; --i >= 0;)
262 		root->dom[i] = 0;
263 
264 	/* root->level is the highest level number found. */
265 	for (i = root->level; i >= 0; --i) {
266 		for (b = levels[i]; b; b = b->link) {
267 			SET_INSERT(b->dom, b->id);
268 			if (JT(b) == 0)
269 				continue;
270 			SET_INTERSECT(JT(b)->dom, b->dom, nodewords);
271 			SET_INTERSECT(JF(b)->dom, b->dom, nodewords);
272 		}
273 	}
274 }
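
/*
 * Sketch of the computation above: every dominator set starts out as
 * "all blocks" and the root's set is cleared, so after inserting its
 * own id the root dominates only itself.  Walking the levels from the
 * root downward, each block adds its own id and intersects its set
 * into both successors, so a block reachable along several paths keeps
 * only the dominators common to all of them (plus itself).
 */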
275 
276 static void
277 propedom(ep)
278 	struct edge *ep;
279 {
280 	SET_INSERT(ep->edom, ep->id);
281 	if (ep->succ) {
282 		SET_INTERSECT(ep->succ->et.edom, ep->edom, edgewords);
283 		SET_INTERSECT(ep->succ->ef.edom, ep->edom, edgewords);
284 	}
285 }
286 
287 /*
288  * Compute edge dominators.
289  * Assumes graph has been leveled and predecessors established.
290  */
291 static void
292 find_edom(root)
293 	struct block *root;
294 {
295 	int i;
296 	uset x;
297 	struct block *b;
298 
299 	x = all_edge_sets;
300 	for (i = n_edges * edgewords; --i >= 0; )
301 		x[i] = ~0;
302 
303 	/* root->level is the highest level number found. */
304 	memset(root->et.edom, 0, edgewords * sizeof(*(uset)0));
305 	memset(root->ef.edom, 0, edgewords * sizeof(*(uset)0));
306 	for (i = root->level; i >= 0; --i) {
307 		for (b = levels[i]; b != 0; b = b->link) {
308 			propedom(&b->et);
309 			propedom(&b->ef);
310 		}
311 	}
312 }
313 
314 /*
315  * Find the backwards transitive closure of the flow graph.  These sets
316  * are backwards in the sense that we find the set of nodes that reach
317  * a given node, not the set of nodes that can be reached by a node.
318  *
319  * Assumes graph has been leveled.
320  */
321 static void
322 find_closure(root)
323 	struct block *root;
324 {
325 	int i;
326 	struct block *b;
327 
328 	/*
329 	 * Initialize sets to contain no nodes.
330 	 */
331 	memset((char *)all_closure_sets, 0,
332 	      n_blocks * nodewords * sizeof(*all_closure_sets));
333 
334 	/* root->level is the highest level number found. */
335 	for (i = root->level; i >= 0; --i) {
336 		for (b = levels[i]; b; b = b->link) {
337 			SET_INSERT(b->closure, b->id);
338 			if (JT(b) == 0)
339 				continue;
340 			SET_UNION(JT(b)->closure, b->closure, nodewords);
341 			SET_UNION(JF(b)->closure, b->closure, nodewords);
342 		}
343 	}
344 }
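
/*
 * Unlike find_dom(), the sets here grow by SET_UNION, so b->closure
 * ends up holding b itself plus every block from which b can be
 * reached -- the "backwards" closure described above.
 */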
345 
346 /*
347  * Return the register number that is used by s.  If A and X are both
348  * used, return AX_ATOM.  If no register is used, return -1.
349  *
350  * The implementation should probably change to an array access.
351  */
352 static int
353 atomuse(s)
354 	struct stmt *s;
355 {
356 	register int c = s->code;
357 
358 	if (c == NOP)
359 		return -1;
360 
361 	switch (BPF_CLASS(c)) {
362 
363 	case BPF_RET:
364 		return (BPF_RVAL(c) == BPF_A) ? A_ATOM :
365 			(BPF_RVAL(c) == BPF_X) ? X_ATOM : -1;
366 
367 	case BPF_LD:
368 	case BPF_LDX:
369 		return (BPF_MODE(c) == BPF_IND) ? X_ATOM :
370 			(BPF_MODE(c) == BPF_MEM) ? s->k : -1;
371 
372 	case BPF_ST:
373 		return A_ATOM;
374 
375 	case BPF_STX:
376 		return X_ATOM;
377 
378 	case BPF_JMP:
379 	case BPF_ALU:
380 		if (BPF_SRC(c) == BPF_X)
381 			return AX_ATOM;
382 		return A_ATOM;
383 
384 	case BPF_MISC:
385 		return BPF_MISCOP(c) == BPF_TXA ? X_ATOM : A_ATOM;
386 	}
387 	abort();
388 	/* NOTREACHED */
389 }
390 
391 /*
392  * Return the register number that is defined by 's'.  We assume that
393  * a single stmt cannot define more than one register.  If no register
394  * is defined, return -1.
395  *
396  * The implementation should probably change to an array access.
397  */
398 static int
399 atomdef(s)
400 	struct stmt *s;
401 {
402 	if (s->code == NOP)
403 		return -1;
404 
405 	switch (BPF_CLASS(s->code)) {
406 
407 	case BPF_LD:
408 	case BPF_ALU:
409 		return A_ATOM;
410 
411 	case BPF_LDX:
412 		return X_ATOM;
413 
414 	case BPF_ST:
415 	case BPF_STX:
416 		return s->k;
417 
418 	case BPF_MISC:
419 		return BPF_MISCOP(s->code) == BPF_TAX ? X_ATOM : A_ATOM;
420 	}
421 	return -1;
422 }
423 
424 static void
425 compute_local_ud(b)
426 	struct block *b;
427 {
428 	struct slist *s;
429 	atomset def = 0, use = 0, kill = 0;
430 	int atom;
431 
432 	for (s = b->stmts; s; s = s->next) {
433 		if (s->s.code == NOP)
434 			continue;
435 		atom = atomuse(&s->s);
436 		if (atom >= 0) {
437 			if (atom == AX_ATOM) {
438 				if (!ATOMELEM(def, X_ATOM))
439 					use |= ATOMMASK(X_ATOM);
440 				if (!ATOMELEM(def, A_ATOM))
441 					use |= ATOMMASK(A_ATOM);
442 			}
443 			else if (atom < N_ATOMS) {
444 				if (!ATOMELEM(def, atom))
445 					use |= ATOMMASK(atom);
446 			}
447 			else
448 				abort();
449 		}
450 		atom = atomdef(&s->s);
451 		if (atom >= 0) {
452 			if (!ATOMELEM(use, atom))
453 				kill |= ATOMMASK(atom);
454 			def |= ATOMMASK(atom);
455 		}
456 	}
457 	if (!ATOMELEM(def, A_ATOM) && BPF_CLASS(b->s.code) == BPF_JMP)
458 		use |= ATOMMASK(A_ATOM);
459 
460 	b->def = def;
461 	b->kill = kill;
462 	b->in_use = use;
463 }
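
/*
 * A small worked example (hypothetical block):
 *
 *	ldx M[3]	uses scratch atom 3, defines X
 *	ld  [x+10]	uses X (already defined above), defines A
 *	st  M[5]	uses A (already defined above), defines atom 5
 *
 * gives in_use = {3} and def = kill = {X, A, 5}: only M[3] must be
 * live on entry, and X, A and M[5] are overwritten before being read,
 * so successors' needs for them do not propagate past this block.
 * The final test in compute_local_ud() also adds A to the use set when
 * the block ends in a branch reading an accumulator it never defined.
 */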
464 
465 /*
466  * Assume graph is already leveled.
467  */
468 static void
469 find_ud(root)
470 	struct block *root;
471 {
472 	int i, maxlevel;
473 	struct block *p;
474 
475 	/*
476 	 * root->level is the highest level number found;
477 	 * count down from there.
478 	 */
479 	maxlevel = root->level;
480 	for (i = maxlevel; i >= 0; --i)
481 		for (p = levels[i]; p; p = p->link) {
482 			compute_local_ud(p);
483 			p->out_use = 0;
484 		}
485 
486 	for (i = 1; i <= maxlevel; ++i) {
487 		for (p = levels[i]; p; p = p->link) {
488 			p->out_use |= JT(p)->in_use | JF(p)->in_use;
489 			p->in_use |= p->out_use &~ p->kill;
490 		}
491 	}
492 }
493 
494 /*
495  * These data structures are used in a Cocke and Schwartz style
496  * value numbering scheme.  Since the flowgraph is acyclic,
497  * exit values can be propagated from a node's predecessors
498  * provided it is uniquely defined.
499  */
500 struct valnode {
501 	int code;
502 	long v0, v1;
503 	long val;
504 	struct valnode *next;
505 };
506 
507 #define MODULUS 213
508 static struct valnode *hashtbl[MODULUS];
509 static int curval;
510 static int maxval;
511 
512 /* Integer constants mapped with the load immediate opcode. */
513 #define K(i) F(BPF_LD|BPF_IMM|BPF_W, i, 0L)
514 
515 struct vmapinfo {
516 	int is_const;
517 	long const_val;
518 };
519 
520 struct vmapinfo *vmap;
521 struct valnode *vnode_base;
522 struct valnode *next_vnode;
523 
524 static void
525 init_val()
526 {
527 	curval = 0;
528 	next_vnode = vnode_base;
529 	memset((char *)vmap, 0, maxval * sizeof(*vmap));
530 	memset((char *)hashtbl, 0, sizeof hashtbl);
531 }
532 
533 /* Because we really don't have an IR, this stuff is a little messy. */
534 static long
535 F(code, v0, v1)
536 	int code;
537 	long v0, v1;
538 {
539 	u_int hash;
540 	int val;
541 	struct valnode *p;
542 
543 	hash = (u_int)code ^ (v0 << 4) ^ (v1 << 8);
544 	hash %= MODULUS;
545 
546 	for (p = hashtbl[hash]; p; p = p->next)
547 		if (p->code == code && p->v0 == v0 && p->v1 == v1)
548 			return p->val;
549 
550 	val = ++curval;
551 	if (BPF_MODE(code) == BPF_IMM &&
552 	    (BPF_CLASS(code) == BPF_LD || BPF_CLASS(code) == BPF_LDX)) {
553 		vmap[val].const_val = v0;
554 		vmap[val].is_const = 1;
555 	}
556 	p = next_vnode++;
557 	p->val = val;
558 	p->code = code;
559 	p->v0 = v0;
560 	p->v1 = v1;
561 	p->next = hashtbl[hash];
562 	hashtbl[hash] = p;
563 
564 	return val;
565 }
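
/*
 * For example (operand values are illustrative), two occurrences of
 * "ld [26]" both call F(BPF_LD|BPF_ABS|BPF_W, 26, 0) and get back the
 * same value number; if the accumulator still carries that value when
 * the second load is seen and 'alter' is set, vstore() below turns the
 * redundant load into a NOP.  Constants loaded with BPF_IMM are also
 * remembered in vmap[val] as is_const/const_val, which is what the
 * constant folding later keys off of.
 */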
566 
567 static inline void
568 vstore(s, valp, newval, alter)
569 	struct stmt *s;
570 	long *valp;
571 	long newval;
572 	int alter;
573 {
574 	if (alter && *valp == newval)
575 		s->code = NOP;
576 	else
577 		*valp = newval;
578 }
579 
580 static void
581 fold_op(s, v0, v1)
582 	struct stmt *s;
583 	long v0, v1;
584 {
585 	long a, b;
586 
587 	a = vmap[v0].const_val;
588 	b = vmap[v1].const_val;
589 
590 	switch (BPF_OP(s->code)) {
591 	case BPF_ADD:
592 		a += b;
593 		break;
594 
595 	case BPF_SUB:
596 		a -= b;
597 		break;
598 
599 	case BPF_MUL:
600 		a *= b;
601 		break;
602 
603 	case BPF_DIV:
604 		if (b == 0)
605 			bpf_error("division by zero");
606 		a /= b;
607 		break;
608 
609 	case BPF_AND:
610 		a &= b;
611 		break;
612 
613 	case BPF_OR:
614 		a |= b;
615 		break;
616 
617 	case BPF_LSH:
618 		a <<= b;
619 		break;
620 
621 	case BPF_RSH:
622 		a >>= b;
623 		break;
624 
625 	case BPF_NEG:
626 		a = -a;
627 		break;
628 
629 	default:
630 		abort();
631 	}
632 	s->k = a;
633 	s->code = BPF_LD|BPF_IMM;
634 	done = 0;
635 }
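
/*
 * For instance (constants are illustrative): if the accumulator's
 * value number maps to the constant 6 and the statement is
 * BPF_ALU|BPF_ADD|BPF_K with constant operand 2, fold_op() rewrites
 * the statement into "ld #8" (BPF_LD|BPF_IMM with k = 8) and clears
 * 'done' so that another optimization pass runs over the result.
 */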
636 
637 static inline struct slist *
638 this_op(s)
639 	struct slist *s;
640 {
641 	while (s != 0 && s->s.code == NOP)
642 		s = s->next;
643 	return s;
644 }
645 
646 static void
647 opt_not(b)
648 	struct block *b;
649 {
650 	struct block *tmp = JT(b);
651 
652 	JT(b) = JF(b);
653 	JF(b) = tmp;
654 }
655 
656 static void
657 opt_peep(b)
658 	struct block *b;
659 {
660 	struct slist *s;
661 	struct slist *next, *last;
662 	int val;
663 	long v;
664 
665 	s = b->stmts;
666 	if (s == 0)
667 		return;
668 
669 	last = s;
670 	while (1) {
671 		s = this_op(s);
672 		if (s == 0)
673 			break;
674 		next = this_op(s->next);
675 		if (next == 0)
676 			break;
677 		last = next;
678 
679 		/*
680 		 * st  M[k]	-->	st  M[k]
681 		 * ldx M[k]		tax
682 		 */
683 		if (s->s.code == BPF_ST &&
684 		    next->s.code == (BPF_LDX|BPF_MEM) &&
685 		    s->s.k == next->s.k) {
686 			done = 0;
687 			next->s.code = BPF_MISC|BPF_TAX;
688 		}
689 		/*
690 		 * ld  #k	-->	ldx  #k
691 		 * tax			txa
692 		 */
693 		if (s->s.code == (BPF_LD|BPF_IMM) &&
694 		    next->s.code == (BPF_MISC|BPF_TAX)) {
695 			s->s.code = BPF_LDX|BPF_IMM;
696 			next->s.code = BPF_MISC|BPF_TXA;
697 			done = 0;
698 		}
699 		/*
700 		 * This is an ugly special case, but it happens
701 		 * when you say tcp[k] or udp[k] where k is a constant.
702 		 */
703 		if (s->s.code == (BPF_LD|BPF_IMM)) {
704 			struct slist *add, *tax, *ild;
705 
706 			/*
707 			 * Check that X isn't used on exit from this
708 			 * block (which the optimizer might cause).
709 			 * We know the code generator won't generate
710 			 * any local dependencies.
711 			 */
712 			if (ATOMELEM(b->out_use, X_ATOM))
713 				break;
714 
715 			if (next->s.code != (BPF_LDX|BPF_MSH|BPF_B))
716 				add = next;
717 			else
718 				add = this_op(next->next);
719 			if (add == 0 || add->s.code != (BPF_ALU|BPF_ADD|BPF_X))
720 				break;
721 
722 			tax = this_op(add->next);
723 			if (tax == 0 || tax->s.code != (BPF_MISC|BPF_TAX))
724 				break;
725 
726 			ild = this_op(tax->next);
727 			if (ild == 0 || BPF_CLASS(ild->s.code) != BPF_LD ||
728 			    BPF_MODE(ild->s.code) != BPF_IND)
729 				break;
730 			/*
731 			 * XXX We need to check that X is not
732 			 * subsequently used.  We know we can eliminate the
733 			 * accumulator modifications since it is defined
734 			 * by the last stmt of this sequence.
735 			 *
736 			 * We want to turn this sequence:
737 			 *
738 			 * (004) ldi     #0x2		{s}
739 			 * (005) ldxms   [14]		{next}  -- optional
740 			 * (006) addx			{add}
741 			 * (007) tax			{tax}
742 			 * (008) ild     [x+0]		{ild}
743 			 *
744 			 * into this sequence:
745 			 *
746 			 * (004) nop
747 			 * (005) ldxms   [14]
748 			 * (006) nop
749 			 * (007) nop
750 			 * (008) ild     [x+2]
751 			 *
752 			 */
753 			ild->s.k += s->s.k;
754 			s->s.code = NOP;
755 			add->s.code = NOP;
756 			tax->s.code = NOP;
757 			done = 0;
758 		}
759 		s = next;
760 	}
761 	/*
762 	 * If we have a subtract to do a comparison, and the X register
763 	 * is a known constant, we can merge this value into the
764 	 * comparison.
765 	 */
766 	if (last->s.code == (BPF_ALU|BPF_SUB|BPF_X) &&
767 	    !ATOMELEM(b->out_use, A_ATOM)) {
768 		val = b->val[X_ATOM];
769 		if (vmap[val].is_const) {
770 			b->s.k += vmap[val].const_val;
771 			last->s.code = NOP;
772 			done = 0;
773 		} else if (b->s.k == 0) {
774 			/*
775 			 * sub x  ->	nop
776 			 * j  #0	j  x
777 			 */
778 			last->s.code = NOP;
779 			b->s.code = BPF_CLASS(b->s.code) | BPF_OP(b->s.code) |
780 				BPF_X;
781 			done = 0;
782 		}
783 	}
784 	/*
785 	 * Likewise, a constant subtract can be simplified.
786 	 */
787 	else if (last->s.code == (BPF_ALU|BPF_SUB|BPF_K) &&
788 		 !ATOMELEM(b->out_use, A_ATOM)) {
789 		b->s.k += last->s.k;
790 		last->s.code = NOP;
791 		done = 0;
792 	}
793 	/*
794 	 * and #k	nop
795 	 * jeq #0  ->	jset #k
796 	 */
797 	if (last->s.code == (BPF_ALU|BPF_AND|BPF_K) &&
798 	    !ATOMELEM(b->out_use, A_ATOM) && b->s.k == 0) {
799 		b->s.k = last->s.k;
800 		b->s.code = BPF_JMP|BPF_K|BPF_JSET;
801 		last->s.code = NOP;
802 		done = 0;
803 		opt_not(b);
804 	}
805 	/*
806 	 * If the accumulator is a known constant, we can compute the
807 	 * comparison result.
808 	 */
809 	val = b->val[A_ATOM];
810 	if (vmap[val].is_const && BPF_SRC(b->s.code) == BPF_K) {
811 		v = vmap[val].const_val;
812 		switch (BPF_OP(b->s.code)) {
813 
814 		case BPF_JEQ:
815 			v = v == b->s.k;
816 			break;
817 
818 		case BPF_JGT:
819 			v = v > b->s.k;
820 			break;
821 
822 		case BPF_JGE:
823 			v = v >= b->s.k;
824 			break;
825 
826 		case BPF_JSET:
827 			v &= b->s.k;
828 			break;
829 
830 		default:
831 			abort();
832 		}
833 		if (JF(b) != JT(b))
834 			done = 0;
835 		if (v)
836 			JF(b) = JT(b);
837 		else
838 			JT(b) = JF(b);
839 	}
840 }
841 
842 /*
843  * Compute the symbolic value of expression of 's', and update
844  * anything it defines in the value table 'val'.  If 'alter' is true,
845  * do various optimizations.  This code would be cleaner if symbolic
846  * evaluation and code transformations weren't folded together.
847  */
848 static void
849 opt_stmt(s, val, alter)
850 	struct stmt *s;
851 	long val[];
852 	int alter;
853 {
854 	int op;
855 	long v;
856 
857 	switch (s->code) {
858 
859 	case BPF_LD|BPF_ABS|BPF_W:
860 	case BPF_LD|BPF_ABS|BPF_H:
861 	case BPF_LD|BPF_ABS|BPF_B:
862 		v = F(s->code, s->k, 0L);
863 		vstore(s, &val[A_ATOM], v, alter);
864 		break;
865 
866 	case BPF_LD|BPF_IND|BPF_W:
867 	case BPF_LD|BPF_IND|BPF_H:
868 	case BPF_LD|BPF_IND|BPF_B:
869 		v = val[X_ATOM];
870 		if (alter && vmap[v].is_const) {
871 			s->code = BPF_LD|BPF_ABS|BPF_SIZE(s->code);
872 			s->k += vmap[v].const_val;
873 			v = F(s->code, s->k, 0L);
874 			done = 0;
875 		}
876 		else
877 			v = F(s->code, s->k, v);
878 		vstore(s, &val[A_ATOM], v, alter);
879 		break;
880 
881 	case BPF_LD|BPF_LEN:
882 		v = F(s->code, 0L, 0L);
883 		vstore(s, &val[A_ATOM], v, alter);
884 		break;
885 
886 	case BPF_LD|BPF_IMM:
887 		v = K(s->k);
888 		vstore(s, &val[A_ATOM], v, alter);
889 		break;
890 
891 	case BPF_LDX|BPF_IMM:
892 		v = K(s->k);
893 		vstore(s, &val[X_ATOM], v, alter);
894 		break;
895 
896 	case BPF_LDX|BPF_MSH|BPF_B:
897 		v = F(s->code, s->k, 0L);
898 		vstore(s, &val[X_ATOM], v, alter);
899 		break;
900 
901 	case BPF_ALU|BPF_NEG:
902 		if (alter && vmap[val[A_ATOM]].is_const) {
903 			s->code = BPF_LD|BPF_IMM;
904 			s->k = -vmap[val[A_ATOM]].const_val;
905 			val[A_ATOM] = K(s->k);
906 		}
907 		else
908 			val[A_ATOM] = F(s->code, val[A_ATOM], 0L);
909 		break;
910 
911 	case BPF_ALU|BPF_ADD|BPF_K:
912 	case BPF_ALU|BPF_SUB|BPF_K:
913 	case BPF_ALU|BPF_MUL|BPF_K:
914 	case BPF_ALU|BPF_DIV|BPF_K:
915 	case BPF_ALU|BPF_AND|BPF_K:
916 	case BPF_ALU|BPF_OR|BPF_K:
917 	case BPF_ALU|BPF_LSH|BPF_K:
918 	case BPF_ALU|BPF_RSH|BPF_K:
919 		op = BPF_OP(s->code);
920 		if (alter) {
921 			if (s->k == 0) {
922 				if (op == BPF_ADD || op == BPF_SUB ||
923 				    op == BPF_LSH || op == BPF_RSH ||
924 				    op == BPF_OR) {
925 					s->code = NOP;
926 					break;
927 				}
928 				if (op == BPF_MUL || op == BPF_AND) {
929 					s->code = BPF_LD|BPF_IMM;
930 					val[A_ATOM] = K(s->k);
931 					break;
932 				}
933 			}
934 			if (vmap[val[A_ATOM]].is_const) {
935 				fold_op(s, val[A_ATOM], K(s->k));
936 				val[A_ATOM] = K(s->k);
937 				break;
938 			}
939 		}
940 		val[A_ATOM] = F(s->code, val[A_ATOM], K(s->k));
941 		break;
942 
943 	case BPF_ALU|BPF_ADD|BPF_X:
944 	case BPF_ALU|BPF_SUB|BPF_X:
945 	case BPF_ALU|BPF_MUL|BPF_X:
946 	case BPF_ALU|BPF_DIV|BPF_X:
947 	case BPF_ALU|BPF_AND|BPF_X:
948 	case BPF_ALU|BPF_OR|BPF_X:
949 	case BPF_ALU|BPF_LSH|BPF_X:
950 	case BPF_ALU|BPF_RSH|BPF_X:
951 		op = BPF_OP(s->code);
952 		if (alter && vmap[val[X_ATOM]].is_const) {
953 			if (vmap[val[A_ATOM]].is_const) {
954 				fold_op(s, val[A_ATOM], val[X_ATOM]);
955 				val[A_ATOM] = K(s->k);
956 			}
957 			else {
958 				s->code = BPF_ALU|BPF_K|op;
959 				s->k = vmap[val[X_ATOM]].const_val;
960 				done = 0;
961 				val[A_ATOM] =
962 					F(s->code, val[A_ATOM], K(s->k));
963 			}
964 			break;
965 		}
966 		/*
967 		 * Check if we're doing something to an accumulator
968 		 * that is 0, and simplify.  This may not seem like
969 		 * much of a simplification but it could open up further
970 		 * optimizations.
971 		 * XXX We could also check for mul by 1, and -1, etc.
972 		 */
973 		if (alter && vmap[val[A_ATOM]].is_const
974 		    && vmap[val[A_ATOM]].const_val == 0) {
975 			if (op == BPF_ADD || op == BPF_OR ||
976 			    op == BPF_LSH || op == BPF_RSH || op == BPF_SUB) {
977 				s->code = BPF_MISC|BPF_TXA;
978 				vstore(s, &val[A_ATOM], val[X_ATOM], alter);
979 				break;
980 			}
981 			else if (op == BPF_MUL || op == BPF_DIV ||
982 				 op == BPF_AND) {
983 				s->code = BPF_LD|BPF_IMM;
984 				s->k = 0;
985 				vstore(s, &val[A_ATOM], K(s->k), alter);
986 				break;
987 			}
988 			else if (op == BPF_NEG) {
989 				s->code = NOP;
990 				break;
991 			}
992 		}
993 		val[A_ATOM] = F(s->code, val[A_ATOM], val[X_ATOM]);
994 		break;
995 
996 	case BPF_MISC|BPF_TXA:
997 		vstore(s, &val[A_ATOM], val[X_ATOM], alter);
998 		break;
999 
1000 	case BPF_LD|BPF_MEM:
1001 		v = val[s->k];
1002 		if (alter && vmap[v].is_const) {
1003 			s->code = BPF_LD|BPF_IMM;
1004 			s->k = vmap[v].const_val;
1005 			done = 0;
1006 		}
1007 		vstore(s, &val[A_ATOM], v, alter);
1008 		break;
1009 
1010 	case BPF_MISC|BPF_TAX:
1011 		vstore(s, &val[X_ATOM], val[A_ATOM], alter);
1012 		break;
1013 
1014 	case BPF_LDX|BPF_MEM:
1015 		v = val[s->k];
1016 		if (alter && vmap[v].is_const) {
1017 			s->code = BPF_LDX|BPF_IMM;
1018 			s->k = vmap[v].const_val;
1019 			done = 0;
1020 		}
1021 		vstore(s, &val[X_ATOM], v, alter);
1022 		break;
1023 
1024 	case BPF_ST:
1025 		vstore(s, &val[s->k], val[A_ATOM], alter);
1026 		break;
1027 
1028 	case BPF_STX:
1029 		vstore(s, &val[s->k], val[X_ATOM], alter);
1030 		break;
1031 	}
1032 }
1033 
1034 static void
1035 deadstmt(s, last)
1036 	register struct stmt *s;
1037 	register struct stmt *last[];
1038 {
1039 	register int atom;
1040 
1041 	atom = atomuse(s);
1042 	if (atom >= 0) {
1043 		if (atom == AX_ATOM) {
1044 			last[X_ATOM] = 0;
1045 			last[A_ATOM] = 0;
1046 		}
1047 		else
1048 			last[atom] = 0;
1049 	}
1050 	atom = atomdef(s);
1051 	if (atom >= 0) {
1052 		if (last[atom]) {
1053 			done = 0;
1054 			last[atom]->code = NOP;
1055 		}
1056 		last[atom] = s;
1057 	}
1058 }
1059 
1060 static void
1061 opt_deadstores(b)
1062 	register struct block *b;
1063 {
1064 	register struct slist *s;
1065 	register int atom;
1066 	struct stmt *last[N_ATOMS];
1067 
1068 	memset((char *)last, 0, sizeof last);
1069 
1070 	for (s = b->stmts; s != 0; s = s->next)
1071 		deadstmt(&s->s, last);
1072 	deadstmt(&b->s, last);
1073 
1074 	for (atom = 0; atom < N_ATOMS; ++atom)
1075 		if (last[atom] && !ATOMELEM(b->out_use, atom)) {
1076 			last[atom]->code = NOP;
1077 			done = 0;
1078 		}
1079 }
1080 
1081 static void
1082 opt_blk(b, do_stmts)
1083 	struct block *b;
1084 	int do_stmts;
1085 {
1086 	struct slist *s;
1087 	struct edge *p;
1088 	int i;
1089 	long aval;
1090 
1091 	/*
1092 	 * Initialize the atom values.
1093 	 * If we have no predecessors, everything is undefined.
1094  * Otherwise, we inherit our values from our predecessors.
1095 	 * If any register has an ambiguous value (i.e. control paths are
1096 	 * merging) give it the undefined value of 0.
1097 	 */
1098 	p = b->in_edges;
1099 	if (p == 0)
1100 		memset((char *)b->val, 0, sizeof(b->val));
1101 	else {
1102 		memcpy((char *)b->val, (char *)p->pred->val, sizeof(b->val));
1103 		while ((p = p->next) != NULL) {
1104 			for (i = 0; i < N_ATOMS; ++i)
1105 				if (b->val[i] != p->pred->val[i])
1106 					b->val[i] = 0;
1107 		}
1108 	}
1109 	aval = b->val[A_ATOM];
1110 	for (s = b->stmts; s; s = s->next)
1111 		opt_stmt(&s->s, b->val, do_stmts);
1112 
1113 	/*
1114 	 * This is a special case: if we don't use anything from this
1115  * block, and we load the accumulator with a value that is
1116 	 * already there, eliminate all the statements.
1117 	 */
1118 	if (do_stmts && b->out_use == 0 && aval != 0 &&
1119 	    b->val[A_ATOM] == aval)
1120 		b->stmts = 0;
1121 	else {
1122 		opt_peep(b);
1123 		opt_deadstores(b);
1124 	}
1125 	/*
1126 	 * Set up values for branch optimizer.
1127 	 */
1128 	if (BPF_SRC(b->s.code) == BPF_K)
1129 		b->oval = K(b->s.k);
1130 	else
1131 		b->oval = b->val[X_ATOM];
1132 	b->et.code = b->s.code;
1133 	b->ef.code = -b->s.code;
1134 }
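
/*
 * A note on the branch-optimizer setup at the end of opt_blk(): 'oval'
 * is the value number of the comparison operand (the constant k, or
 * whatever X holds), the true edge carries the branch opcode and the
 * false edge its negation.  fold_edge() below uses that sign to tell
 * which way a dominating edge went.
 */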
1135 
1136 /*
1137  * Return true if any register that is used on exit from 'succ' has
1138  * an exit value that is different from the corresponding exit value
1139  * from 'b'.
1140  */
1141 static int
1142 use_conflict(b, succ)
1143 	struct block *b, *succ;
1144 {
1145 	int atom;
1146 	atomset use = succ->out_use;
1147 
1148 	if (use == 0)
1149 		return 0;
1150 
1151 	for (atom = 0; atom < N_ATOMS; ++atom)
1152 		if (ATOMELEM(use, atom))
1153 			if (b->val[atom] != succ->val[atom])
1154 				return 1;
1155 	return 0;
1156 }
1157 
1158 static struct block *
1159 fold_edge(child, ep)
1160 	struct block *child;
1161 	struct edge *ep;
1162 {
1163 	int sense;
1164 	int aval0, aval1, oval0, oval1;
1165 	int code = ep->code;
1166 
1167 	if (code < 0) {
1168 		code = -code;
1169 		sense = 0;
1170 	} else
1171 		sense = 1;
1172 
1173 	if (child->s.code != code)
1174 		return 0;
1175 
1176 	aval0 = child->val[A_ATOM];
1177 	oval0 = child->oval;
1178 	aval1 = ep->pred->val[A_ATOM];
1179 	oval1 = ep->pred->oval;
1180 
1181 	if (aval0 != aval1)
1182 		return 0;
1183 
1184 	if (oval0 == oval1)
1185 		/*
1186 		 * The operands are identical, so the
1187 		 * result is true if a true branch was
1188 		 * taken to get here, otherwise false.
1189 		 */
1190 		return sense ? JT(child) : JF(child);
1191 
1192 	if (sense && code == (BPF_JMP|BPF_JEQ|BPF_K))
1193 		/*
1194 		 * At this point, we only know the comparison if we
1195 		 * came down the true branch, and it was an equality
1196 		 * comparison with a constant.  We rely on the fact that
1197 		 * distinct constants have distinct value numbers.
1198 		 */
1199 		return JF(child);
1200 
1201 	return 0;
1202 }
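
/*
 * Two concrete cases of the folding above (constants illustrative):
 * if a dominating edge was the true branch of "jeq #2" and the child
 * makes the identical test on the same accumulator value, the child
 * must also go true, so JT(child) is returned.  If the child instead
 * tests "jeq #5" on the same accumulator value, the earlier true
 * "jeq #2" guarantees this test is false (distinct constants get
 * distinct value numbers), so JF(child) is returned.
 */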
1203 
1204 static void
1205 opt_j(ep)
1206 	struct edge *ep;
1207 {
1208 	register int i, k;
1209 	register struct block *target;
1210 
1211 	if (JT(ep->succ) == 0)
1212 		return;
1213 
1214 	if (JT(ep->succ) == JF(ep->succ)) {
1215 		/*
1216 		 * Common branch targets can be eliminated, provided
1217 		 * there is no data dependency.
1218 		 */
1219 		if (!use_conflict(ep->pred, ep->succ->et.succ)) {
1220 			done = 0;
1221 			ep->succ = JT(ep->succ);
1222 		}
1223 	}
1224 	/*
1225 	 * For each edge dominator that matches the successor of this
1226  * edge, promote the edge successor to its grandchild.
1227 	 *
1228  * XXX We violate the set abstraction here in favor of a reasonably
1229 	 * efficient loop.
1230 	 */
1231  top:
1232 	for (i = 0; i < edgewords; ++i) {
1233 		register u_long x = ep->edom[i];
1234 
1235 		while (x != 0) {
1236 			k = ffs(x) - 1;
1237 			x &=~ (1 << k);
1238 			k += i * BITS_PER_WORD;
1239 
1240 			target = fold_edge(ep->succ, edges[k]);
1241 			/*
1242 			 * Check that there is no data dependency between
1243 			 * nodes that will be violated if we move the edge.
1244 			 */
1245 			if (target != 0 && !use_conflict(ep->pred, target)) {
1246 				done = 0;
1247 				ep->succ = target;
1248 				if (JT(target) != 0)
1249 					/*
1250 					 * Start over unless we hit a leaf.
1251 					 */
1252 					goto top;
1253 				return;
1254 			}
1255 		}
1256 	}
1257 }
1258 
1259 
1260 static void
1261 or_pullup(b)
1262 	struct block *b;
1263 {
1264 	int val, at_top;
1265 	struct block *pull;
1266 	struct block **diffp, **samep;
1267 	struct edge *ep;
1268 
1269 	ep = b->in_edges;
1270 	if (ep == 0)
1271 		return;
1272 
1273 	/*
1274 	 * Make sure each predecessor loads the same value.
1275 	 * XXX why?
1276 	 */
1277 	val = ep->pred->val[A_ATOM];
1278 	for (ep = ep->next; ep != 0; ep = ep->next)
1279 		if (val != ep->pred->val[A_ATOM])
1280 			return;
1281 
1282 	if (JT(b->in_edges->pred) == b)
1283 		diffp = &JT(b->in_edges->pred);
1284 	else
1285 		diffp = &JF(b->in_edges->pred);
1286 
1287 	at_top = 1;
1288 	while (1) {
1289 		if (*diffp == 0)
1290 			return;
1291 
1292 		if (JT(*diffp) != JT(b))
1293 			return;
1294 
1295 		if (!SET_MEMBER((*diffp)->dom, b->id))
1296 			return;
1297 
1298 		if ((*diffp)->val[A_ATOM] != val)
1299 			break;
1300 
1301 		diffp = &JF(*diffp);
1302 		at_top = 0;
1303 	}
1304 	samep = &JF(*diffp);
1305 	while (1) {
1306 		if (*samep == 0)
1307 			return;
1308 
1309 		if (JT(*samep) != JT(b))
1310 			return;
1311 
1312 		if (!SET_MEMBER((*samep)->dom, b->id))
1313 			return;
1314 
1315 		if ((*samep)->val[A_ATOM] == val)
1316 			break;
1317 
1318 		/* XXX Need to check that there are no data dependencies
1319 		   between diffp and samep.  Currently, the code generator
1320 		   will not produce such dependencies. */
1321 		samep = &JF(*samep);
1322 	}
1323 #ifdef notdef
1324 	/* XXX This doesn't cover everything. */
1325 	for (i = 0; i < N_ATOMS; ++i)
1326 		if ((*samep)->val[i] != pred->val[i])
1327 			return;
1328 #endif
1329 	/* Pull up the node. */
1330 	pull = *samep;
1331 	*samep = JF(pull);
1332 	JF(pull) = *diffp;
1333 
1334 	/*
1335 	 * At the top of the chain, each predecessor needs to point at the
1336 	 * pulled up node.  Inside the chain, there is only one predecessor
1337 	 * to worry about.
1338 	 */
1339 	if (at_top) {
1340 		for (ep = b->in_edges; ep != 0; ep = ep->next) {
1341 			if (JT(ep->pred) == b)
1342 				JT(ep->pred) = pull;
1343 			else
1344 				JF(ep->pred) = pull;
1345 		}
1346 	}
1347 	else
1348 		*diffp = pull;
1349 
1350 	done = 0;
1351 }
1352 
1353 static void
1354 and_pullup(b)
1355 	struct block *b;
1356 {
1357 	int val, at_top;
1358 	struct block *pull;
1359 	struct block **diffp, **samep;
1360 	struct edge *ep;
1361 
1362 	ep = b->in_edges;
1363 	if (ep == 0)
1364 		return;
1365 
1366 	/*
1367 	 * Make sure each predecessor loads the same value.
1368 	 */
1369 	val = ep->pred->val[A_ATOM];
1370 	for (ep = ep->next; ep != 0; ep = ep->next)
1371 		if (val != ep->pred->val[A_ATOM])
1372 			return;
1373 
1374 	if (JT(b->in_edges->pred) == b)
1375 		diffp = &JT(b->in_edges->pred);
1376 	else
1377 		diffp = &JF(b->in_edges->pred);
1378 
1379 	at_top = 1;
1380 	while (1) {
1381 		if (*diffp == 0)
1382 			return;
1383 
1384 		if (JF(*diffp) != JF(b))
1385 			return;
1386 
1387 		if (!SET_MEMBER((*diffp)->dom, b->id))
1388 			return;
1389 
1390 		if ((*diffp)->val[A_ATOM] != val)
1391 			break;
1392 
1393 		diffp = &JT(*diffp);
1394 		at_top = 0;
1395 	}
1396 	samep = &JT(*diffp);
1397 	while (1) {
1398 		if (*samep == 0)
1399 			return;
1400 
1401 		if (JF(*samep) != JF(b))
1402 			return;
1403 
1404 		if (!SET_MEMBER((*samep)->dom, b->id))
1405 			return;
1406 
1407 		if ((*samep)->val[A_ATOM] == val)
1408 			break;
1409 
1410 		/* XXX Need to check that there are no data dependencies
1411 		   between diffp and samep.  Currently, the code generator
1412 		   will not produce such dependencies. */
1413 		samep = &JT(*samep);
1414 	}
1415 #ifdef notdef
1416 	/* XXX This doesn't cover everything. */
1417 	for (i = 0; i < N_ATOMS; ++i)
1418 		if ((*samep)->val[i] != pred->val[i])
1419 			return;
1420 #endif
1421 	/* Pull up the node. */
1422 	pull = *samep;
1423 	*samep = JT(pull);
1424 	JT(pull) = *diffp;
1425 
1426 	/*
1427 	 * At the top of the chain, each predecessor needs to point at the
1428 	 * pulled up node.  Inside the chain, there is only one predecessor
1429 	 * to worry about.
1430 	 */
1431 	if (at_top) {
1432 		for (ep = b->in_edges; ep != 0; ep = ep->next) {
1433 			if (JT(ep->pred) == b)
1434 				JT(ep->pred) = pull;
1435 			else
1436 				JF(ep->pred) = pull;
1437 		}
1438 	}
1439 	else
1440 		*diffp = pull;
1441 
1442 	done = 0;
1443 }
1444 
1445 static void
1446 opt_blks(root, do_stmts)
1447 	struct block *root;
1448 	int do_stmts;
1449 {
1450 	int i, maxlevel;
1451 	struct block *p;
1452 
1453 	init_val();
1454 	maxlevel = root->level;
1455 	for (i = maxlevel; i >= 0; --i)
1456 		for (p = levels[i]; p; p = p->link)
1457 			opt_blk(p, do_stmts);
1458 
1459 	if (do_stmts)
1460 		/*
1461 		 * No point trying to move branches; it can't possibly
1462 		 * make a difference at this point.
1463 		 */
1464 		return;
1465 
1466 	for (i = 1; i <= maxlevel; ++i) {
1467 		for (p = levels[i]; p; p = p->link) {
1468 			opt_j(&p->et);
1469 			opt_j(&p->ef);
1470 		}
1471 	}
1472 	for (i = 1; i <= maxlevel; ++i) {
1473 		for (p = levels[i]; p; p = p->link) {
1474 			or_pullup(p);
1475 			and_pullup(p);
1476 		}
1477 	}
1478 }
1479 
1480 static inline void
1481 link_inedge(parent, child)
1482 	struct edge *parent;
1483 	struct block *child;
1484 {
1485 	parent->next = child->in_edges;
1486 	child->in_edges = parent;
1487 }
1488 
1489 static void
1490 find_inedges(root)
1491 	struct block *root;
1492 {
1493 	int i;
1494 	struct block *b;
1495 
1496 	for (i = 0; i < n_blocks; ++i)
1497 		blocks[i]->in_edges = 0;
1498 
1499 	/*
1500 	 * Traverse the graph, adding each edge to the predecessor
1501 	 * list of its successors.  Skip the leaves (i.e. level 0).
1502 	 */
1503 	for (i = root->level; i > 0; --i) {
1504 		for (b = levels[i]; b != 0; b = b->link) {
1505 			link_inedge(&b->et, JT(b));
1506 			link_inedge(&b->ef, JF(b));
1507 		}
1508 	}
1509 }
1510 
1511 static void
1512 opt_root(b)
1513 	struct block **b;
1514 {
1515 	struct slist *tmp, *s;
1516 
1517 	s = (*b)->stmts;
1518 	(*b)->stmts = 0;
1519 	while (BPF_CLASS((*b)->s.code) == BPF_JMP && JT(*b) == JF(*b))
1520 		*b = JT(*b);
1521 
1522 	tmp = (*b)->stmts;
1523 	if (tmp != 0)
1524 		sappend(s, tmp);
1525 	(*b)->stmts = s;
1526 }
1527 
1528 static void
1529 opt_loop(root, do_stmts)
1530 	struct block *root;
1531 	int do_stmts;
1532 {
1533 
1534 #ifdef BDEBUG
1535 	if (dflag > 1)
1536 		opt_dump(root);
1537 #endif
1538 	do {
1539 		done = 1;
1540 		find_levels(root);
1541 		find_dom(root);
1542 		find_closure(root);
1543 		find_inedges(root);
1544 		find_ud(root);
1545 		find_edom(root);
1546 		opt_blks(root, do_stmts);
1547 #ifdef BDEBUG
1548 		if (dflag > 1)
1549 			opt_dump(root);
1550 #endif
1551 	} while (!done);
1552 }
1553 
1554 /*
1555  * Optimize the filter code in its dag representation.
1556  */
1557 void
1558 bpf_optimize(rootp)
1559 	struct block **rootp;
1560 {
1561 	struct block *root;
1562 
1563 	root = *rootp;
1564 
1565 	opt_init(root);
1566 	opt_loop(root, 0);
1567 	opt_loop(root, 1);
1568 	intern_blocks(root);
1569 	opt_root(rootp);
1570 	opt_cleanup();
1571 }
1572 
1573 static void
1574 make_marks(p)
1575 	struct block *p;
1576 {
1577 	if (!isMarked(p)) {
1578 		Mark(p);
1579 		if (BPF_CLASS(p->s.code) != BPF_RET) {
1580 			make_marks(JT(p));
1581 			make_marks(JF(p));
1582 		}
1583 	}
1584 }
1585 
1586 /*
1587  * Mark code array such that isMarked(i) is true
1588  * only for nodes that are alive.
1589  */
1590 static void
1591 mark_code(p)
1592 	struct block *p;
1593 {
1594 	cur_mark += 1;
1595 	make_marks(p);
1596 }
1597 
1598 /*
1599  * True iff the two stmt lists load the same value from the packet into
1600  * the accumulator.
1601  */
1602 static int
1603 eq_slist(x, y)
1604 	struct slist *x, *y;
1605 {
1606 	while (1) {
1607 		while (x && x->s.code == NOP)
1608 			x = x->next;
1609 		while (y && y->s.code == NOP)
1610 			y = y->next;
1611 		if (x == 0)
1612 			return y == 0;
1613 		if (y == 0)
1614 			return x == 0;
1615 		if (x->s.code != y->s.code || x->s.k != y->s.k)
1616 			return 0;
1617 		x = x->next;
1618 		y = y->next;
1619 	}
1620 }
1621 
1622 static inline int
1623 eq_blk(b0, b1)
1624 	struct block *b0, *b1;
1625 {
1626 	if (b0->s.code == b1->s.code &&
1627 	    b0->s.k == b1->s.k &&
1628 	    b0->et.succ == b1->et.succ &&
1629 	    b0->ef.succ == b1->ef.succ)
1630 		return eq_slist(b0->stmts, b1->stmts);
1631 	return 0;
1632 }
1633 
1634 static void
1635 intern_blocks(root)
1636 	struct block *root;
1637 {
1638 	struct block *p;
1639 	int i, j;
1640 	int done;
1641  top:
1642 	done = 1;
1643 	for (i = 0; i < n_blocks; ++i)
1644 		blocks[i]->link = 0;
1645 
1646 	mark_code(root);
1647 
1648 	for (i = n_blocks - 1; --i >= 0; ) {
1649 		if (!isMarked(blocks[i]))
1650 			continue;
1651 		for (j = i + 1; j < n_blocks; ++j) {
1652 			if (!isMarked(blocks[j]))
1653 				continue;
1654 			if (eq_blk(blocks[i], blocks[j])) {
1655 				blocks[i]->link = blocks[j]->link ?
1656 					blocks[j]->link : blocks[j];
1657 				break;
1658 			}
1659 		}
1660 	}
1661 	for (i = 0; i < n_blocks; ++i) {
1662 		p = blocks[i];
1663 		if (JT(p) == 0)
1664 			continue;
1665 		if (JT(p)->link) {
1666 			done = 0;
1667 			JT(p) = JT(p)->link;
1668 		}
1669 		if (JF(p)->link) {
1670 			done = 0;
1671 			JF(p) = JF(p)->link;
1672 		}
1673 	}
1674 	if (!done)
1675 		goto top;
1676 }
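
/*
 * In short, intern_blocks() coalesces structurally identical blocks:
 * eq_blk() requires the same branch opcode and operand, the same two
 * successors and equivalent (NOP-ignoring) statement lists, so
 * duplicate subtrees produced by the code generator collapse into one
 * copy; the whole pass repeats until no edge had to be redirected.
 */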
1677 
1678 static void
1679 opt_cleanup()
1680 {
1681 	free((void *)vnode_base);
1682 	free((void *)vmap);
1683 	free((void *)edges);
1684 	free((void *)space);
1685 	free((void *)levels);
1686 	free((void *)blocks);
1687 }
1688 
1689 /*
1690  * Return the number of stmts in 's'.
1691  */
1692 static int
1693 slength(s)
1694 	struct slist *s;
1695 {
1696 	int n = 0;
1697 
1698 	for (; s; s = s->next)
1699 		if (s->s.code != NOP)
1700 			++n;
1701 	return n;
1702 }
1703 
1704 /*
1705  * Return the number of nodes reachable by 'p'.
1706  * All nodes should be initially unmarked.
1707  */
1708 static int
1709 count_blocks(p)
1710 	struct block *p;
1711 {
1712 	if (p == 0 || isMarked(p))
1713 		return 0;
1714 	Mark(p);
1715 	return count_blocks(JT(p)) + count_blocks(JF(p)) + 1;
1716 }
1717 
1718 /*
1719  * Do a depth first search on the flow graph, numbering the
1720  * basic blocks, and entering them into the 'blocks' array.
1721  */
1722 static void
1723 number_blks_r(p)
1724 	struct block *p;
1725 {
1726 	int n;
1727 
1728 	if (p == 0 || isMarked(p))
1729 		return;
1730 
1731 	Mark(p);
1732 	n = n_blocks++;
1733 	p->id = n;
1734 	blocks[n] = p;
1735 
1736 	number_blks_r(JT(p));
1737 	number_blks_r(JF(p));
1738 }
1739 
1740 /*
1741  * Return the number of stmts in the flowgraph reachable by 'p'.
1742  * The nodes should be unmarked before calling.
1743  */
1744 static int
1745 count_stmts(p)
1746 	struct block *p;
1747 {
1748 	int n;
1749 
1750 	if (p == 0 || isMarked(p))
1751 		return 0;
1752 	Mark(p);
1753 	n = count_stmts(JT(p)) + count_stmts(JF(p));
1754 	return slength(p->stmts) + n + 1;
1755 }
1756 
1757 /*
1758  * Allocate memory.  All allocation is done before optimization
1759  * is begun.  A linear bound on the size of all data structures is computed
1760  * from the total number of blocks and/or statements.
1761  */
1762 static void
1763 opt_init(root)
1764 	struct block *root;
1765 {
1766 	u_long *p;
1767 	int i, n, max_stmts;
1768 
1769 	/*
1770 	 * First, count the blocks, so we can malloc an array to map
1771 	 * block number to block.  Then, put the blocks into the array.
1772 	 */
1773 	unMarkAll();
1774 	n = count_blocks(root);
1775 	blocks = (struct block **)malloc(n * sizeof(*blocks));
1776 	unMarkAll();
1777 	n_blocks = 0;
1778 	number_blks_r(root);
1779 
1780 	n_edges = 2 * n_blocks;
1781 	edges = (struct edge **)malloc(n_edges * sizeof(*edges));
1782 
1783 	/*
1784 	 * The number of levels is bounded by the number of nodes.
1785 	 */
1786 	levels = (struct block **)malloc(n_blocks * sizeof(*levels));
1787 
1788 	edgewords = n_edges / (8 * sizeof(u_long)) + 1;
1789 	nodewords = n_blocks / (8 * sizeof(u_long)) + 1;
1790 
1791 	/* XXX */
1792 	space = (u_long *)malloc(2 * n_blocks * nodewords * sizeof(*space)
1793 				 + n_edges * edgewords * sizeof(*space));
1794 	p = space;
1795 	all_dom_sets = p;
1796 	for (i = 0; i < n; ++i) {
1797 		blocks[i]->dom = p;
1798 		p += nodewords;
1799 	}
1800 	all_closure_sets = p;
1801 	for (i = 0; i < n; ++i) {
1802 		blocks[i]->closure = p;
1803 		p += nodewords;
1804 	}
1805 	all_edge_sets = p;
1806 	for (i = 0; i < n; ++i) {
1807 		register struct block *b = blocks[i];
1808 
1809 		b->et.edom = p;
1810 		p += edgewords;
1811 		b->ef.edom = p;
1812 		p += edgewords;
1813 		b->et.id = i;
1814 		edges[i] = &b->et;
1815 		b->ef.id = n_blocks + i;
1816 		edges[n_blocks + i] = &b->ef;
1817 		b->et.pred = b;
1818 		b->ef.pred = b;
1819 	}
1820 	max_stmts = 0;
1821 	for (i = 0; i < n; ++i)
1822 		max_stmts += slength(blocks[i]->stmts) + 1;
1823 	/*
1824 	 * We allocate at most 3 value numbers per statement,
1825 	 * so this is an upper bound on the number of valnodes
1826 	 * we'll need.
1827 	 */
1828 	maxval = 3 * max_stmts;
1829 	vmap = (struct vmapinfo *)malloc(maxval * sizeof(*vmap));
1830 	vnode_base = (struct valnode *)malloc(maxval * sizeof(*vnode_base));
1831 }
1832 
1833 /*
1834  * Some pointers used to convert the basic block form of the code
1835  * into the array form that BPF requires.  'fstart' will point to
1836  * the malloc'd array while 'ftail' is used during the recursive traversal.
1837  */
1838 static struct bpf_insn *fstart;
1839 static struct bpf_insn *ftail;
1840 
1841 #ifdef BDEBUG
1842 int bids[1000];
1843 #endif
1844 
1845 static void
1846 convert_code_r(p)
1847 	struct block *p;
1848 {
1849 	struct bpf_insn *dst;
1850 	struct slist *src;
1851 	int slen;
1852 	u_int off;
1853 
1854 	if (p == 0 || isMarked(p))
1855 		return;
1856 	Mark(p);
1857 
1858 	convert_code_r(JF(p));
1859 	convert_code_r(JT(p));
1860 
1861 	slen = slength(p->stmts);
1862 	dst = ftail -= slen + 1;
1863 
1864 	p->offset = dst - fstart;
1865 
1866 	for (src = p->stmts; src; src = src->next) {
1867 		if (src->s.code == NOP)
1868 			continue;
1869 		dst->code = (u_short)src->s.code;
1870 		dst->k = src->s.k;
1871 		++dst;
1872 	}
1873 #ifdef BDEBUG
1874 	bids[dst - fstart] = p->id + 1;
1875 #endif
1876 	dst->code = (u_short)p->s.code;
1877 	dst->k = p->s.k;
1878 	if (JT(p)) {
1879 		off = JT(p)->offset - (p->offset + slen) - 1;
1880 		if (off >= 256)
1881 			bpf_error("long jumps not supported");
1882 		dst->jt = off;
1883 		off = JF(p)->offset - (p->offset + slen) - 1;
1884 		if (off >= 256)
1885 			bpf_error("long jumps not supported");
1886 		dst->jf = off;
1887 	}
1888 }
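
/*
 * The conversion above fills the array back to front: 'ftail' is moved
 * down by the block's length before the block is copied in, and
 * p->offset records the block's index from 'fstart'.  Branch targets
 * become the usual BPF relative displacements; e.g. (offsets
 * illustrative) a block at offset 10 with three statements has its
 * branch at index 13, so a true target at offset 14 gets jt = 0 (fall
 * through) and one at offset 20 gets jt = 6, and anything that does
 * not fit in eight bits is rejected as a long jump.
 */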
1889 
1890 
1891 /*
1892  * Convert flowgraph intermediate representation to the
1893  * BPF array representation.  Set *lenp to the number of instructions.
1894  */
1895 struct bpf_insn *
1896 icode_to_fcode(root, lenp)
1897 	struct block *root;
1898 	int *lenp;
1899 {
1900 	int n;
1901 	struct bpf_insn *fp;
1902 
1903 	unMarkAll();
1904 	n = *lenp = count_stmts(root);
1905 
1906 	fp = (struct bpf_insn *)malloc(sizeof(*fp) * n);
1907 	memset((char *)fp, 0, sizeof(*fp) * n);
1908 	fstart = fp;
1909 	ftail = fp + n;
1910 
1911 	unMarkAll();
1912 	convert_code_r(root);
1913 
1914 	return fp;
1915 }
1916 
1917 #ifdef BDEBUG
1918 opt_dump(root)
1919 	struct block *root;
1920 {
1921 	struct bpf_program f;
1922 
1923 	memset(bids, 0, sizeof bids);
1924 	f.bf_insns = icode_to_fcode(root, &f.bf_len);
1925 	bpf_dump(&f, 1);
1926 	putchar('\n');
1927 	free((char *)f.bf_insns);
1928 }
1929 #endif
1930