xref: /netbsd-src/external/bsd/libpcap/dist/optimize.c (revision f3cfa6f6ce31685c6c4a758bc430e69eb99f50a4)
1 /*	$NetBSD: optimize.c,v 1.10 2018/09/03 15:26:43 christos Exp $	*/
2 
3 /*
4  * Copyright (c) 1988, 1989, 1990, 1991, 1993, 1994, 1995, 1996
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that: (1) source code distributions
9  * retain the above copyright notice and this paragraph in its entirety, (2)
10  * distributions including binary code include the above copyright notice and
11  * this paragraph in its entirety in the documentation or other materials
12  * provided with the distribution, and (3) all advertising materials mentioning
13  * features or use of this software display the following acknowledgement:
14  * ``This product includes software developed by the University of California,
15  * Lawrence Berkeley Laboratory and its contributors.'' Neither the name of
16  * the University nor the names of its contributors may be used to endorse
17  * or promote products derived from this software without specific prior
18  * written permission.
19  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
20  * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
21  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
22  *
23  *  Optimization module for BPF code intermediate representation.
24  */
25 
26 #include <sys/cdefs.h>
27 __RCSID("$NetBSD: optimize.c,v 1.10 2018/09/03 15:26:43 christos Exp $");
28 
29 #ifdef HAVE_CONFIG_H
30 #include <config.h>
31 #endif
32 
33 #include <pcap-types.h>
34 
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <memory.h>
38 #include <string.h>
39 
40 #include <errno.h>
41 
42 #include "pcap-int.h"
43 
44 #include "gencode.h"
45 #include "optimize.h"
46 
47 #ifdef HAVE_OS_PROTO_H
48 #include "os-proto.h"
49 #endif
50 
51 #ifdef BDEBUG
52 /*
53  * The internal "debug printout" flag for the filter expression optimizer.
54  * The code to print that stuff is present only if BDEBUG is defined, so
55  * the flag, and the routine to set it, are defined only if BDEBUG is
56  * defined.
57  */
58 static int pcap_optimizer_debug;
59 
60 /*
61  * Routine to set that flag.
62  *
63  * This is intended for libpcap developers, not for general use.
64  * If you want to set this in a program, you'll have to declare this
65  * routine yourself, with the appropriate DLL import attribute on Windows;
66  * it's not declared in any header file, and won't be declared in any
67  * header file provided by libpcap.
68  */
69 PCAP_API void pcap_set_optimizer_debug(int value);
70 
71 PCAP_API_DEF void
72 pcap_set_optimizer_debug(int value)
73 {
74 	pcap_optimizer_debug = value;
75 }
76 
77 /*
78  * The internal "print dot graph" flag for the filter expression optimizer.
79  * The code to print that stuff is present only if BDEBUG is defined, so
80  * the flag, and the routine to set it, are defined only if BDEBUG is
81  * defined.
82  */
83 static int pcap_print_dot_graph;
84 
85 /*
86  * Routine to set that flag.
87  *
88  * This is intended for libpcap developers, not for general use.
89  * If you want to set this in a program, you'll have to declare this
90  * routine yourself, with the appropriate DLL import attribute on Windows;
91  * it's not declared in any header file, and won't be declared in any
92  * header file provided by libpcap.
93  */
94 PCAP_API void pcap_set_print_dot_graph(int value);
95 
96 PCAP_API_DEF void
97 pcap_set_print_dot_graph(int value)
98 {
99 	pcap_print_dot_graph = value;
100 }
101 
102 #endif
103 
104 /*
105  * lowest_set_bit().
106  *
107  * Takes a 32-bit integer as an argument.
108  *
109  * If handed a non-zero value, returns the index of the lowest set bit,
110  * counting upwards from zero.
111  *
112  * If handed zero, the results are platform- and compiler-dependent.
113  * Keep it out of the light, don't give it any water, don't feed it
114  * after midnight, and don't pass zero to it.
115  *
116  * This is the same as the count of trailing zeroes in the word.
117  */
118 #if PCAP_IS_AT_LEAST_GNUC_VERSION(3,4)
119   /*
120    * GCC 3.4 and later; we have __builtin_ctz().
121    */
122   #define lowest_set_bit(mask) __builtin_ctz(mask)
123 #elif defined(_MSC_VER)
124   /*
125    * Visual Studio; we support only 2005 and later, so use
126    * _BitScanForward().
127    */
128 #include <intrin.h>
129 
130 #ifndef __clang__
131 #pragma intrinsic(_BitScanForward)
132 #endif
133 
134 static __forceinline int
135 lowest_set_bit(int mask)
136 {
137 	unsigned long bit;
138 
139 	/*
140 	 * Don't sign-extend mask if long is longer than int.
141 	 * (It's currently not, in MSVC, even on 64-bit platforms, but....)
142 	 */
143 	if (_BitScanForward(&bit, (unsigned int)mask) == 0)
144 		return -1;	/* mask is zero */
145 	return (int)bit;
146 }
147 #elif defined(MSDOS) && defined(__DJGPP__)
148   /*
149    * MS-DOS with DJGPP, which declares ffs() in <string.h>, which
150    * we've already included.
151    */
152   #define lowest_set_bit(mask)	(ffs((mask)) - 1)
153 #elif (defined(MSDOS) && defined(__WATCOMC__)) || defined(STRINGS_H_DECLARES_FFS)
154   /*
155    * MS-DOS with Watcom C, which has <strings.h> and declares ffs() there,
156  * or some other platform (UN*X conforming to a sufficiently recent version
157    * of the Single UNIX Specification).
158    */
159   #include <strings.h>
160   #define lowest_set_bit(mask)	(ffs((mask)) - 1)
161 #else
162 /*
163  * None of the above.
164  * Use a perfect-hash-function-based function.
165  */
166 static int
167 lowest_set_bit(int mask)
168 {
169 	unsigned int v = (unsigned int)mask;
170 
171 	static const int MultiplyDeBruijnBitPosition[32] = {
172 		0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
173 		31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
174 	};
175 
176 	/*
177 	 * We strip off all but the lowermost set bit (v & -v),
178 	 * and perform a minimal perfect hash on it to look up the
179 	 * number of low-order zero bits in a table.
180 	 *
181 	 * See:
182 	 *
183 	 *	http://7ooo.mooo.com/text/ComputingTrailingZerosHOWTO.pdf
184 	 *
185 	 *	http://supertech.csail.mit.edu/papers/debruijn.pdf
186 	 */
187 	return (MultiplyDeBruijnBitPosition[((v & -v) * 0x077CB531U) >> 27]);
188 }
189 #endif
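/*
 * For illustration, a worked example of the de Bruijn fallback above,
 * assuming 32-bit unsigned arithmetic.  For mask = 0x14 (binary 10100):
 *
 *	v & -v              = 0x04          (isolate the lowest set bit)
 *	0x04 * 0x077CB531U  = 0x1DF2D4C4
 *	0x1DF2D4C4 >> 27    = 3
 *	MultiplyDeBruijnBitPosition[3] = 2
 *
 * so lowest_set_bit(0x14) is 2, the same answer __builtin_ctz(0x14)
 * would give.
 */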
190 
191 /*
192  * Represents a deleted instruction.
193  */
194 #define NOP -1
195 
196 /*
197  * Register numbers for use-def values.
198  * 0 through BPF_MEMWORDS-1 represent the corresponding scratch memory
199  * location.  A_ATOM is the accumulator and X_ATOM is the index
200  * register.
201  */
202 #define A_ATOM BPF_MEMWORDS
203 #define X_ATOM (BPF_MEMWORDS+1)
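/*
 * For illustration: with the usual BPF_MEMWORDS of 16, the scratch
 * memory slots are atoms 0 through 15, A_ATOM is 16 and X_ATOM is 17,
 * so an array of N_ATOMS entries indexed by atom number covers every
 * register the use-def code tracks.
 */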
204 
205 /*
206  * This define is used to represent *both* the accumulator and
207  * x register in use-def computations.
208  * Currently, the use-def code assumes only one definition per instruction.
209  */
210 #define AX_ATOM N_ATOMS
211 
212 /*
213  * These data structures are used in a Cocke and Schwartz style
214  * value numbering scheme.  Since the flowgraph is acyclic,
215  * exit values can be propagated from a node's predecessors
216  * provided it is uniquely defined.
217  */
218 struct valnode {
219 	int code;
220 	int v0, v1;
221 	int val;
222 	struct valnode *next;
223 };
224 
225 /* Integer constants mapped with the load immediate opcode. */
226 #define K(i) F(opt_state, BPF_LD|BPF_IMM|BPF_W, i, 0L)
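/*
 * For illustration: K(2) interns the constant 2 as if it were loaded by
 * "ld #2"; F() records it in opt_state->vmap with is_const set, which is
 * how fold_op() and the peephole code later recover the constant's value.
 */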
227 
228 struct vmapinfo {
229 	int is_const;
230 	bpf_int32 const_val;
231 };
232 
233 typedef struct {
234 	/*
235 	 * A flag to indicate that further optimization is needed.
236 	 * Iterative passes are continued until a given pass yields no
237 	 * branch movement.
238 	 */
239 	int done;
240 
241 	int n_blocks;
242 	struct block **blocks;
243 	int n_edges;
244 	struct edge **edges;
245 
246 	/*
247 	 * A bit vector set representation of the dominators.
248 	 * We round up the set size to the next power of two.
249 	 */
250 	int nodewords;
251 	int edgewords;
252 	struct block **levels;
253 	bpf_u_int32 *space;
254 
255 #define BITS_PER_WORD (8*sizeof(bpf_u_int32))
256 /*
257  * True if 'a' is in uset 'p'.
258  */
259 #define SET_MEMBER(p, a) \
260 ((p)[(unsigned)(a) / BITS_PER_WORD] & (1 << ((unsigned)(a) % BITS_PER_WORD)))
261 
262 /*
263  * Add 'a' to uset p.
264  */
265 #define SET_INSERT(p, a) \
266 (p)[(unsigned)(a) / BITS_PER_WORD] |= (1 << ((unsigned)(a) % BITS_PER_WORD))
267 
268 /*
269  * Delete 'a' from uset p.
270  */
271 #define SET_DELETE(p, a) \
272 (p)[(unsigned)(a) / BITS_PER_WORD] &= ~(1 << ((unsigned)(a) % BITS_PER_WORD))
273 
274 /*
275  * a := a intersect b
276  */
277 #define SET_INTERSECT(a, b, n)\
278 {\
279 	register bpf_u_int32 *_x = a, *_y = b;\
280 	register int _n = n;\
281 	while (--_n >= 0) *_x++ &= *_y++;\
282 }
283 
284 /*
285  * a := a - b
286  */
287 #define SET_SUBTRACT(a, b, n)\
288 {\
289 	register bpf_u_int32 *_x = a, *_y = b;\
290 	register int _n = n;\
291 	while (--_n >= 0) *_x++ &=~ *_y++;\
292 }
293 
294 /*
295  * a := a union b
296  */
297 #define SET_UNION(a, b, n)\
298 {\
299 	register bpf_u_int32 *_x = a, *_y = b;\
300 	register int _n = n;\
301 	while (--_n >= 0) *_x++ |= *_y++;\
302 }
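/*
 * For illustration: a uset is just an array of bpf_u_int32 words, so with
 * 32-bit words SET_INSERT(p, 37) sets bit 5 of p[1], SET_MEMBER(p, 37)
 * tests that same bit, and the INTERSECT/SUBTRACT/UNION macros combine
 * two such arrays word by word over their first 'n' words.
 */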
303 
304 	uset all_dom_sets;
305 	uset all_closure_sets;
306 	uset all_edge_sets;
307 
308 #define MODULUS 213
309 	struct valnode *hashtbl[MODULUS];
310 	int curval;
311 	int maxval;
312 
313 	struct vmapinfo *vmap;
314 	struct valnode *vnode_base;
315 	struct valnode *next_vnode;
316 } opt_state_t;
317 
318 typedef struct {
319 	/*
320 	 * Some pointers used to convert the basic block form of the code,
321 	 * Some pointers used to convert the basic block form of the code
322 	 * the malloc'd array while 'ftail' is used during the recursive
323 	 * traversal.
324 	 */
325 	struct bpf_insn *fstart;
326 	struct bpf_insn *ftail;
327 } conv_state_t;
328 
329 static void opt_init(compiler_state_t *, opt_state_t *, struct icode *);
330 static void opt_cleanup(opt_state_t *);
331 
332 static void intern_blocks(opt_state_t *, struct icode *);
333 
334 static void find_inedges(opt_state_t *, struct block *);
335 #ifdef BDEBUG
336 static void opt_dump(compiler_state_t *, struct icode *);
337 #endif
338 
339 #ifndef MAX
340 #define MAX(a,b) ((a)>(b)?(a):(b))
341 #endif
342 
343 static void
344 find_levels_r(opt_state_t *opt_state, struct icode *ic, struct block *b)
345 {
346 	int level;
347 
348 	if (isMarked(ic, b))
349 		return;
350 
351 	Mark(ic, b);
352 	b->link = 0;
353 
354 	if (JT(b)) {
355 		find_levels_r(opt_state, ic, JT(b));
356 		find_levels_r(opt_state, ic, JF(b));
357 		level = MAX(JT(b)->level, JF(b)->level) + 1;
358 	} else
359 		level = 0;
360 	b->level = level;
361 	b->link = opt_state->levels[level];
362 	opt_state->levels[level] = b;
363 }
364 
365 /*
366  * Level graph.  The levels go from 0 at the leaves to
367  * N_LEVELS at the root.  The opt_state->levels[] array points to the
368  * first node of the level list, whose elements are linked
369  * with the 'link' field of the struct block.
370  */
371 static void
372 find_levels(opt_state_t *opt_state, struct icode *ic)
373 {
374 	memset((char *)opt_state->levels, 0, opt_state->n_blocks * sizeof(*opt_state->levels));
375 	unMarkAll(ic);
376 	find_levels_r(opt_state, ic, ic->root);
377 }
378 
379 /*
380  * Find dominator relationships.
381  * Assumes graph has been leveled.
382  */
383 static void
384 find_dom(opt_state_t *opt_state, struct block *root)
385 {
386 	int i;
387 	struct block *b;
388 	bpf_u_int32 *x;
389 
390 	/*
391 	 * Initialize sets to contain all nodes.
392 	 */
393 	x = opt_state->all_dom_sets;
394 	i = opt_state->n_blocks * opt_state->nodewords;
395 	while (--i >= 0)
396 		*x++ = 0xFFFFFFFFU;
397 	/* Root starts off empty. */
398 	for (i = opt_state->nodewords; --i >= 0;)
399 		root->dom[i] = 0;
400 
401 	/* root->level is the highest level number found. */
402 	for (i = root->level; i >= 0; --i) {
403 		for (b = opt_state->levels[i]; b; b = b->link) {
404 			SET_INSERT(b->dom, b->id);
405 			if (JT(b) == 0)
406 				continue;
407 			SET_INTERSECT(JT(b)->dom, b->dom, opt_state->nodewords);
408 			SET_INTERSECT(JF(b)->dom, b->dom, opt_state->nodewords);
409 		}
410 	}
411 }
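/*
 * For illustration: in a diamond-shaped graph root -> {b1, b2} -> b3,
 * every set starts full, root's set is emptied, and the repeated
 * intersections leave dom(b1) = {root, b1}, dom(b2) = {root, b2} and
 * dom(b3) = {root, b3}, since neither b1 nor b2 alone dominates b3.
 */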
412 
413 static void
414 propedom(opt_state_t *opt_state, struct edge *ep)
415 {
416 	SET_INSERT(ep->edom, ep->id);
417 	if (ep->succ) {
418 		SET_INTERSECT(ep->succ->et.edom, ep->edom, opt_state->edgewords);
419 		SET_INTERSECT(ep->succ->ef.edom, ep->edom, opt_state->edgewords);
420 	}
421 }
422 
423 /*
424  * Compute edge dominators.
425  * Assumes graph has been leveled and predecessors established.
426  */
427 static void
428 find_edom(opt_state_t *opt_state, struct block *root)
429 {
430 	int i;
431 	uset x;
432 	struct block *b;
433 
434 	x = opt_state->all_edge_sets;
435 	for (i = opt_state->n_edges * opt_state->edgewords; --i >= 0; )
436 		x[i] = 0xFFFFFFFFU;
437 
438 	/* root->level is the highest level number found. */
439 	memset(root->et.edom, 0, opt_state->edgewords * sizeof(*(uset)0));
440 	memset(root->ef.edom, 0, opt_state->edgewords * sizeof(*(uset)0));
441 	for (i = root->level; i >= 0; --i) {
442 		for (b = opt_state->levels[i]; b != 0; b = b->link) {
443 			propedom(opt_state, &b->et);
444 			propedom(opt_state, &b->ef);
445 		}
446 	}
447 }
448 
449 /*
450  * Find the backwards transitive closure of the flow graph.  These sets
451  * are backwards in the sense that we find the set of nodes that reach
452  * a given node, not the set of nodes that can be reached by a node.
453  *
454  * Assumes graph has been leveled.
455  */
456 static void
457 find_closure(opt_state_t *opt_state, struct block *root)
458 {
459 	int i;
460 	struct block *b;
461 
462 	/*
463 	 * Initialize sets to contain no nodes.
464 	 */
465 	memset((char *)opt_state->all_closure_sets, 0,
466 	      opt_state->n_blocks * opt_state->nodewords * sizeof(*opt_state->all_closure_sets));
467 
468 	/* root->level is the highest level number found. */
469 	for (i = root->level; i >= 0; --i) {
470 		for (b = opt_state->levels[i]; b; b = b->link) {
471 			SET_INSERT(b->closure, b->id);
472 			if (JT(b) == 0)
473 				continue;
474 			SET_UNION(JT(b)->closure, b->closure, opt_state->nodewords);
475 			SET_UNION(JF(b)->closure, b->closure, opt_state->nodewords);
476 		}
477 	}
478 }
479 
480 /*
481  * Return the register number that is used by s.  If A and X are both
482  * used, return AX_ATOM.  If no register is used, return -1.
483  *
484  * The implementation should probably change to an array access.
485  */
486 static int
487 atomuse(struct stmt *s)
488 {
489 	register int c = s->code;
490 
491 	if (c == NOP)
492 		return -1;
493 
494 	switch (BPF_CLASS(c)) {
495 
496 	case BPF_RET:
497 		return (BPF_RVAL(c) == BPF_A) ? A_ATOM :
498 			(BPF_RVAL(c) == BPF_X) ? X_ATOM : -1;
499 
500 	case BPF_LD:
501 	case BPF_LDX:
502 		return (BPF_MODE(c) == BPF_IND) ? X_ATOM :
503 			(BPF_MODE(c) == BPF_MEM) ? s->k : -1;
504 
505 	case BPF_ST:
506 		return A_ATOM;
507 
508 	case BPF_STX:
509 		return X_ATOM;
510 
511 	case BPF_JMP:
512 	case BPF_ALU:
513 		if (BPF_SRC(c) == BPF_X)
514 			return AX_ATOM;
515 		return A_ATOM;
516 
517 	case BPF_MISC:
518 		return BPF_MISCOP(c) == BPF_TXA ? X_ATOM : A_ATOM;
519 	}
520 	abort();
521 	/* NOTREACHED */
522 }
523 
524 /*
525  * Return the register number that is defined by 's'.  We assume that
526  * a single stmt cannot define more than one register.  If no register
527  * is defined, return -1.
528  *
529  * The implementation should probably change to an array access.
530  */
531 static int
532 atomdef(struct stmt *s)
533 {
534 	if (s->code == NOP)
535 		return -1;
536 
537 	switch (BPF_CLASS(s->code)) {
538 
539 	case BPF_LD:
540 	case BPF_ALU:
541 		return A_ATOM;
542 
543 	case BPF_LDX:
544 		return X_ATOM;
545 
546 	case BPF_ST:
547 	case BPF_STX:
548 		return s->k;
549 
550 	case BPF_MISC:
551 		return BPF_MISCOP(s->code) == BPF_TAX ? X_ATOM : A_ATOM;
552 	}
553 	return -1;
554 }
555 
556 /*
557  * Compute the sets of registers used, defined, and killed by 'b'.
558  *
559  * "Used" means that a statement in 'b' uses the register before any
560  * statement in 'b' defines it, i.e. it uses the value left in
561  * that register by a predecessor block of this block.
562  * "Defined" means that a statement in 'b' defines it.
563  * "Killed" means that a statement in 'b' defines it before any
564  * statement in 'b' uses it, i.e. it kills the value left in that
565  * register by a predecessor block of this block.
566  */
567 static void
568 compute_local_ud(struct block *b)
569 {
570 	struct slist *s;
571 	atomset def = 0, use = 0, killed = 0;
572 	int atom;
573 
574 	for (s = b->stmts; s; s = s->next) {
575 		if (s->s.code == NOP)
576 			continue;
577 		atom = atomuse(&s->s);
578 		if (atom >= 0) {
579 			if (atom == AX_ATOM) {
580 				if (!ATOMELEM(def, X_ATOM))
581 					use |= ATOMMASK(X_ATOM);
582 				if (!ATOMELEM(def, A_ATOM))
583 					use |= ATOMMASK(A_ATOM);
584 			}
585 			else if (atom < N_ATOMS) {
586 				if (!ATOMELEM(def, atom))
587 					use |= ATOMMASK(atom);
588 			}
589 			else
590 				abort();
591 		}
592 		atom = atomdef(&s->s);
593 		if (atom >= 0) {
594 			if (!ATOMELEM(use, atom))
595 				killed |= ATOMMASK(atom);
596 			def |= ATOMMASK(atom);
597 		}
598 	}
599 	if (BPF_CLASS(b->s.code) == BPF_JMP) {
600 		/*
601 		 * XXX - what about RET?
602 		 */
603 		atom = atomuse(&b->s);
604 		if (atom >= 0) {
605 			if (atom == AX_ATOM) {
606 				if (!ATOMELEM(def, X_ATOM))
607 					use |= ATOMMASK(X_ATOM);
608 				if (!ATOMELEM(def, A_ATOM))
609 					use |= ATOMMASK(A_ATOM);
610 			}
611 			else if (atom < N_ATOMS) {
612 				if (!ATOMELEM(def, atom))
613 					use |= ATOMMASK(atom);
614 			}
615 			else
616 				abort();
617 		}
618 	}
619 
620 	b->def = def;
621 	b->kill = killed;
622 	b->in_use = use;
623 }
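/*
 * For illustration, consider a block whose statements are
 *
 *	ldx M[0]	(uses scratch slot 0, defines X)
 *	st  M[1]	(uses A, defines scratch slot 1)
 *	tax		(uses A, defines X)
 *
 * Ignoring the block's terminating branch, compute_local_ud() ends up
 * with in_use = {M[0], A} (values the block expects from a predecessor)
 * and def = kill = {X, M[1]} (values it overwrites before using them).
 */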
624 
625 /*
626  * Assume graph is already leveled.
627  */
628 static void
629 find_ud(opt_state_t *opt_state, struct block *root)
630 {
631 	int i, maxlevel;
632 	struct block *p;
633 
634 	/*
635 	 * root->level is the highest level number found;
636 	 * count down from there.
637 	 */
638 	maxlevel = root->level;
639 	for (i = maxlevel; i >= 0; --i)
640 		for (p = opt_state->levels[i]; p; p = p->link) {
641 			compute_local_ud(p);
642 			p->out_use = 0;
643 		}
644 
645 	for (i = 1; i <= maxlevel; ++i) {
646 		for (p = opt_state->levels[i]; p; p = p->link) {
647 			p->out_use |= JT(p)->in_use | JF(p)->in_use;
648 			p->in_use |= p->out_use &~ p->kill;
649 		}
650 	}
651 }
652 static void
653 init_val(opt_state_t *opt_state)
654 {
655 	opt_state->curval = 0;
656 	opt_state->next_vnode = opt_state->vnode_base;
657 	memset((char *)opt_state->vmap, 0, opt_state->maxval * sizeof(*opt_state->vmap));
658 	memset((char *)opt_state->hashtbl, 0, sizeof opt_state->hashtbl);
659 }
660 
661 /* Because we really don't have an IR, this stuff is a little messy. */
662 static int
663 F(opt_state_t *opt_state, int code, int v0, int v1)
664 {
665 	u_int hash;
666 	int val;
667 	struct valnode *p;
668 
669 	hash = (u_int)code ^ ((u_int)v0 << 4) ^ ((u_int)v1 << 8);
670 	hash %= MODULUS;
671 
672 	for (p = opt_state->hashtbl[hash]; p; p = p->next)
673 		if (p->code == code && p->v0 == v0 && p->v1 == v1)
674 			return p->val;
675 
676 	val = ++opt_state->curval;
677 	if (BPF_MODE(code) == BPF_IMM &&
678 	    (BPF_CLASS(code) == BPF_LD || BPF_CLASS(code) == BPF_LDX)) {
679 		opt_state->vmap[val].const_val = v0;
680 		opt_state->vmap[val].is_const = 1;
681 	}
682 	p = opt_state->next_vnode++;
683 	p->val = val;
684 	p->code = code;
685 	p->v0 = v0;
686 	p->v1 = v1;
687 	p->next = opt_state->hashtbl[hash];
688 	opt_state->hashtbl[hash] = p;
689 
690 	return val;
691 }
692 
693 static inline void
694 vstore(struct stmt *s, int *valp, int newval, int alter)
695 {
696 	if (alter && newval != VAL_UNKNOWN && *valp == newval)
697 		s->code = NOP;
698 	else
699 		*valp = newval;
700 }
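/*
 * For illustration of how F() and vstore() work together: if a block
 * loads the same packet word twice, e.g.
 *
 *	ld  [14]
 *	...		(nothing redefining the accumulator in between)
 *	ld  [14]
 *
 * both loads map to the same value number, so when 'alter' is set the
 * second vstore() sees *valp == newval and rewrites the redundant load
 * as a NOP.
 */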
701 
702 /*
703  * Do constant-folding on binary operators.
704  * (Unary operators are handled elsewhere.)
705  */
706 static void
707 fold_op(compiler_state_t *cstate, opt_state_t *opt_state,
708     struct stmt *s, int v0, int v1)
709 {
710 	bpf_u_int32 a, b;
711 
712 	a = opt_state->vmap[v0].const_val;
713 	b = opt_state->vmap[v1].const_val;
714 
715 	switch (BPF_OP(s->code)) {
716 	case BPF_ADD:
717 		a += b;
718 		break;
719 
720 	case BPF_SUB:
721 		a -= b;
722 		break;
723 
724 	case BPF_MUL:
725 		a *= b;
726 		break;
727 
728 	case BPF_DIV:
729 		if (b == 0)
730 			bpf_error(cstate, "division by zero");
731 		a /= b;
732 		break;
733 
734 	case BPF_MOD:
735 		if (b == 0)
736 			bpf_error(cstate, "modulus by zero");
737 		a %= b;
738 		break;
739 
740 	case BPF_AND:
741 		a &= b;
742 		break;
743 
744 	case BPF_OR:
745 		a |= b;
746 		break;
747 
748 	case BPF_XOR:
749 		a ^= b;
750 		break;
751 
752 	case BPF_LSH:
753 		a <<= b;
754 		break;
755 
756 	case BPF_RSH:
757 		a >>= b;
758 		break;
759 
760 	default:
761 		abort();
762 	}
763 	s->k = a;
764 	s->code = BPF_LD|BPF_IMM;
765 	opt_state->done = 0;
766 }
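/*
 * For illustration: if the accumulator's value is known to be the
 * constant 0x1234 and the statement is "and #0xff", fold_op() computes
 * 0x1234 & 0xff = 0x34 and rewrites the statement in place as "ld #0x34",
 * clearing opt_state->done so another pass runs over the result.
 */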
767 
768 static inline struct slist *
769 this_op(struct slist *s)
770 {
771 	while (s != 0 && s->s.code == NOP)
772 		s = s->next;
773 	return s;
774 }
775 
776 static void
777 opt_not(struct block *b)
778 {
779 	struct block *tmp = JT(b);
780 
781 	JT(b) = JF(b);
782 	JF(b) = tmp;
783 }
784 
785 static void
786 opt_peep(opt_state_t *opt_state, struct block *b)
787 {
788 	struct slist *s;
789 	struct slist *next, *last;
790 	int val;
791 
792 	s = b->stmts;
793 	if (s == 0)
794 		return;
795 
796 	last = s;
797 	for (/*empty*/; /*empty*/; s = next) {
798 		/*
799 		 * Skip over nops.
800 		 */
801 		s = this_op(s);
802 		if (s == 0)
803 			break;	/* nothing left in the block */
804 
805 		/*
806 		 * Find the next real instruction after that one
807 		 * (skipping nops).
808 		 */
809 		next = this_op(s->next);
810 		if (next == 0)
811 			break;	/* no next instruction */
812 		last = next;
813 
814 		/*
815 		 * st  M[k]	-->	st  M[k]
816 		 * ldx M[k]		tax
817 		 */
818 		if (s->s.code == BPF_ST &&
819 		    next->s.code == (BPF_LDX|BPF_MEM) &&
820 		    s->s.k == next->s.k) {
821 			opt_state->done = 0;
822 			next->s.code = BPF_MISC|BPF_TAX;
823 		}
824 		/*
825 		 * ld  #k	-->	ldx  #k
826 		 * tax			txa
827 		 */
828 		if (s->s.code == (BPF_LD|BPF_IMM) &&
829 		    next->s.code == (BPF_MISC|BPF_TAX)) {
830 			s->s.code = BPF_LDX|BPF_IMM;
831 			next->s.code = BPF_MISC|BPF_TXA;
832 			opt_state->done = 0;
833 		}
834 		/*
835 		 * This is an ugly special case, but it happens
836 		 * when you say tcp[k] or udp[k] where k is a constant.
837 		 */
838 		if (s->s.code == (BPF_LD|BPF_IMM)) {
839 			struct slist *add, *tax, *ild;
840 
841 			/*
842 			 * Check that X isn't used on exit from this
843 			 * block (which the optimizer might cause).
844 			 * We know the code generator won't generate
845 			 * any local dependencies.
846 			 */
847 			if (ATOMELEM(b->out_use, X_ATOM))
848 				continue;
849 
850 			/*
851 			 * Check that the instruction following the ldi
852 			 * is an addx, or it's an ldxms with an addx
853 			 * following it (with 0 or more nops between the
854 			 * ldxms and addx).
855 			 */
856 			if (next->s.code != (BPF_LDX|BPF_MSH|BPF_B))
857 				add = next;
858 			else
859 				add = this_op(next->next);
860 			if (add == 0 || add->s.code != (BPF_ALU|BPF_ADD|BPF_X))
861 				continue;
862 
863 			/*
864 			 * Check that a tax follows that (with 0 or more
865 			 * nops between them).
866 			 */
867 			tax = this_op(add->next);
868 			if (tax == 0 || tax->s.code != (BPF_MISC|BPF_TAX))
869 				continue;
870 
871 			/*
872 			 * Check that an ild follows that (with 0 or more
873 			 * nops between them).
874 			 */
875 			ild = this_op(tax->next);
876 			if (ild == 0 || BPF_CLASS(ild->s.code) != BPF_LD ||
877 			    BPF_MODE(ild->s.code) != BPF_IND)
878 				continue;
879 			/*
880 			 * We want to turn this sequence:
881 			 *
882 			 * (004) ldi     #0x2		{s}
883 			 * (005) ldxms   [14]		{next}  -- optional
884 			 * (006) addx			{add}
885 			 * (007) tax			{tax}
886 			 * (008) ild     [x+0]		{ild}
887 			 *
888 			 * into this sequence:
889 			 *
890 			 * (004) nop
891 			 * (005) ldxms   [14]
892 			 * (006) nop
893 			 * (007) nop
894 			 * (008) ild     [x+2]
895 			 *
896 			 * XXX We need to check that X is not
897 			 * subsequently used, because we want to change
898 			 * what'll be in it after this sequence.
899 			 *
900 			 * We know we can eliminate the accumulator
901 			 * modifications earlier in the sequence since
902 			 * it is defined by the last stmt of this sequence
903 			 * (i.e., the last statement of the sequence loads
904 			 * a value into the accumulator, so we can eliminate
905 			 * earlier operations on the accumulator).
906 			 */
907 			ild->s.k += s->s.k;
908 			s->s.code = NOP;
909 			add->s.code = NOP;
910 			tax->s.code = NOP;
911 			opt_state->done = 0;
912 		}
913 	}
914 	/*
915 	 * If the comparison at the end of a block is an equality
916 	 * comparison against a constant, and nobody uses the value
917 	 * we leave in the A register at the end of a block, and
918 	 * the operation preceding the comparison is an arithmetic
919 	 * operation, we can sometimes optimize it away.
920 	 */
921 	if (b->s.code == (BPF_JMP|BPF_JEQ|BPF_K) &&
922 	    !ATOMELEM(b->out_use, A_ATOM)) {
923 	    	/*
924 	    	 * We can optimize away certain subtractions of the
925 	    	 * X register.
926 	    	 */
927 		if (last->s.code == (BPF_ALU|BPF_SUB|BPF_X)) {
928 			val = b->val[X_ATOM];
929 			if (opt_state->vmap[val].is_const) {
930 				/*
931 				 * If we have a subtract to do a comparison,
932 				 * and the X register is a known constant,
933 				 * we can merge this value into the
934 				 * comparison:
935 				 *
936 				 * sub x  ->	nop
937 				 * jeq #y	jeq #(x+y)
938 				 */
939 				b->s.k += opt_state->vmap[val].const_val;
940 				last->s.code = NOP;
941 				opt_state->done = 0;
942 			} else if (b->s.k == 0) {
943 				/*
944 				 * If the X register isn't a constant,
945 				 * and the comparison in the test is
946 				 * against 0, we can compare with the
947 				 * X register, instead:
948 				 *
949 				 * sub x  ->	nop
950 				 * jeq #0	jeq x
951 				 */
952 				last->s.code = NOP;
953 				b->s.code = BPF_JMP|BPF_JEQ|BPF_X;
954 				opt_state->done = 0;
955 			}
956 		}
957 		/*
958 		 * Likewise, a constant subtract can be simplified:
959 		 *
960 		 * sub #x ->	nop
961 		 * jeq #y ->	jeq #(x+y)
962 		 */
963 		else if (last->s.code == (BPF_ALU|BPF_SUB|BPF_K)) {
964 			last->s.code = NOP;
965 			b->s.k += last->s.k;
966 			opt_state->done = 0;
967 		}
968 		/*
969 		 * And, similarly, a constant AND can be simplified
970 		 * if we're testing against 0, i.e.:
971 		 *
972 		 * and #k	nop
973 		 * jeq #0  ->	jset #k
974 		 */
975 		else if (last->s.code == (BPF_ALU|BPF_AND|BPF_K) &&
976 		    b->s.k == 0) {
977 			b->s.k = last->s.k;
978 			b->s.code = BPF_JMP|BPF_K|BPF_JSET;
979 			last->s.code = NOP;
980 			opt_state->done = 0;
981 			opt_not(b);
982 		}
983 	}
984 	/*
985 	 * jset #0        ->   never
986 	 * jset #ffffffff ->   always
987 	 */
988 	if (b->s.code == (BPF_JMP|BPF_K|BPF_JSET)) {
989 		if (b->s.k == 0)
990 			JT(b) = JF(b);
991 		if ((u_int)b->s.k == 0xffffffffU)
992 			JF(b) = JT(b);
993 	}
994 	/*
995 	 * If we're comparing against the index register, and the index
996 	 * register is a known constant, we can just compare against that
997 	 * constant.
998 	 */
999 	val = b->val[X_ATOM];
1000 	if (opt_state->vmap[val].is_const && BPF_SRC(b->s.code) == BPF_X) {
1001 		bpf_int32 v = opt_state->vmap[val].const_val;
1002 		b->s.code &= ~BPF_X;
1003 		b->s.k = v;
1004 	}
1005 	/*
1006 	 * If the accumulator is a known constant, we can compute the
1007 	 * comparison result.
1008 	 */
1009 	val = b->val[A_ATOM];
1010 	if (opt_state->vmap[val].is_const && BPF_SRC(b->s.code) == BPF_K) {
1011 		bpf_int32 v = opt_state->vmap[val].const_val;
1012 		switch (BPF_OP(b->s.code)) {
1013 
1014 		case BPF_JEQ:
1015 			v = v == b->s.k;
1016 			break;
1017 
1018 		case BPF_JGT:
1019 			v = (unsigned)v > (unsigned)b->s.k;
1020 			break;
1021 
1022 		case BPF_JGE:
1023 			v = (unsigned)v >= (unsigned)b->s.k;
1024 			break;
1025 
1026 		case BPF_JSET:
1027 			v &= b->s.k;
1028 			break;
1029 
1030 		default:
1031 			abort();
1032 		}
1033 		if (JF(b) != JT(b))
1034 			opt_state->done = 0;
1035 		if (v)
1036 			JF(b) = JT(b);
1037 		else
1038 			JT(b) = JF(b);
1039 	}
1040 }
1041 
1042 /*
1043  * Compute the symbolic value of the expression 's', and update
1044  * anything it defines in the value table 'val'.  If 'alter' is true,
1045  * do various optimizations.  This code would be cleaner if symbolic
1046  * evaluation and code transformations weren't folded together.
1047  */
1048 static void
1049 opt_stmt(compiler_state_t *cstate, opt_state_t *opt_state,
1050     struct stmt *s, int val[], int alter)
1051 {
1052 	int op;
1053 	int v;
1054 
1055 	switch (s->code) {
1056 
1057 	case BPF_LD|BPF_ABS|BPF_W:
1058 	case BPF_LD|BPF_ABS|BPF_H:
1059 	case BPF_LD|BPF_ABS|BPF_B:
1060 		v = F(opt_state, s->code, s->k, 0L);
1061 		vstore(s, &val[A_ATOM], v, alter);
1062 		break;
1063 
1064 	case BPF_LD|BPF_IND|BPF_W:
1065 	case BPF_LD|BPF_IND|BPF_H:
1066 	case BPF_LD|BPF_IND|BPF_B:
1067 		v = val[X_ATOM];
1068 		if (alter && opt_state->vmap[v].is_const) {
1069 			s->code = BPF_LD|BPF_ABS|BPF_SIZE(s->code);
1070 			s->k += opt_state->vmap[v].const_val;
1071 			v = F(opt_state, s->code, s->k, 0L);
1072 			opt_state->done = 0;
1073 		}
1074 		else
1075 			v = F(opt_state, s->code, s->k, v);
1076 		vstore(s, &val[A_ATOM], v, alter);
1077 		break;
1078 
1079 	case BPF_LD|BPF_LEN:
1080 		v = F(opt_state, s->code, 0L, 0L);
1081 		vstore(s, &val[A_ATOM], v, alter);
1082 		break;
1083 
1084 	case BPF_LD|BPF_IMM:
1085 		v = K(s->k);
1086 		vstore(s, &val[A_ATOM], v, alter);
1087 		break;
1088 
1089 	case BPF_LDX|BPF_IMM:
1090 		v = K(s->k);
1091 		vstore(s, &val[X_ATOM], v, alter);
1092 		break;
1093 
1094 	case BPF_LDX|BPF_MSH|BPF_B:
1095 		v = F(opt_state, s->code, s->k, 0L);
1096 		vstore(s, &val[X_ATOM], v, alter);
1097 		break;
1098 
1099 	case BPF_ALU|BPF_NEG:
1100 		if (alter && opt_state->vmap[val[A_ATOM]].is_const) {
1101 			s->code = BPF_LD|BPF_IMM;
1102 			s->k = -opt_state->vmap[val[A_ATOM]].const_val;
1103 			val[A_ATOM] = K(s->k);
1104 		}
1105 		else
1106 			val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], 0L);
1107 		break;
1108 
1109 	case BPF_ALU|BPF_ADD|BPF_K:
1110 	case BPF_ALU|BPF_SUB|BPF_K:
1111 	case BPF_ALU|BPF_MUL|BPF_K:
1112 	case BPF_ALU|BPF_DIV|BPF_K:
1113 	case BPF_ALU|BPF_MOD|BPF_K:
1114 	case BPF_ALU|BPF_AND|BPF_K:
1115 	case BPF_ALU|BPF_OR|BPF_K:
1116 	case BPF_ALU|BPF_XOR|BPF_K:
1117 	case BPF_ALU|BPF_LSH|BPF_K:
1118 	case BPF_ALU|BPF_RSH|BPF_K:
1119 		op = BPF_OP(s->code);
1120 		if (alter) {
1121 			if (s->k == 0) {
1122 				/* don't optimize away "sub #0"
1123 				 * as it may be needed later to
1124 				 * fixup the generated math code */
1125 				if (op == BPF_ADD ||
1126 				    op == BPF_LSH || op == BPF_RSH ||
1127 				    op == BPF_OR || op == BPF_XOR) {
1128 					s->code = NOP;
1129 					break;
1130 				}
1131 				if (op == BPF_MUL || op == BPF_AND) {
1132 					s->code = BPF_LD|BPF_IMM;
1133 					val[A_ATOM] = K(s->k);
1134 					break;
1135 				}
1136 			}
1137 			if (opt_state->vmap[val[A_ATOM]].is_const) {
1138 				fold_op(cstate, opt_state, s, val[A_ATOM], K(s->k));
1139 				val[A_ATOM] = K(s->k);
1140 				break;
1141 			}
1142 		}
1143 		val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], K(s->k));
1144 		break;
1145 
1146 	case BPF_ALU|BPF_ADD|BPF_X:
1147 	case BPF_ALU|BPF_SUB|BPF_X:
1148 	case BPF_ALU|BPF_MUL|BPF_X:
1149 	case BPF_ALU|BPF_DIV|BPF_X:
1150 	case BPF_ALU|BPF_MOD|BPF_X:
1151 	case BPF_ALU|BPF_AND|BPF_X:
1152 	case BPF_ALU|BPF_OR|BPF_X:
1153 	case BPF_ALU|BPF_XOR|BPF_X:
1154 	case BPF_ALU|BPF_LSH|BPF_X:
1155 	case BPF_ALU|BPF_RSH|BPF_X:
1156 		op = BPF_OP(s->code);
1157 		if (alter && opt_state->vmap[val[X_ATOM]].is_const) {
1158 			if (opt_state->vmap[val[A_ATOM]].is_const) {
1159 				fold_op(cstate, opt_state, s, val[A_ATOM], val[X_ATOM]);
1160 				val[A_ATOM] = K(s->k);
1161 			}
1162 			else {
1163 				s->code = BPF_ALU|BPF_K|op;
1164 				s->k = opt_state->vmap[val[X_ATOM]].const_val;
1165 				opt_state->done = 0;
1166 				val[A_ATOM] =
1167 					F(opt_state, s->code, val[A_ATOM], K(s->k));
1168 			}
1169 			break;
1170 		}
1171 		/*
1172 		 * Check if we're doing something to an accumulator
1173 		 * that is 0, and simplify.  This may not seem like
1174 		 * much of a simplification but it could open up further
1175 		 * optimizations.
1176 		 * XXX We could also check for mul by 1, etc.
1177 		 */
1178 		if (alter && opt_state->vmap[val[A_ATOM]].is_const
1179 		    && opt_state->vmap[val[A_ATOM]].const_val == 0) {
1180 			if (op == BPF_ADD || op == BPF_OR || op == BPF_XOR) {
1181 				s->code = BPF_MISC|BPF_TXA;
1182 				vstore(s, &val[A_ATOM], val[X_ATOM], alter);
1183 				break;
1184 			}
1185 			else if (op == BPF_MUL || op == BPF_DIV || op == BPF_MOD ||
1186 				 op == BPF_AND || op == BPF_LSH || op == BPF_RSH) {
1187 				s->code = BPF_LD|BPF_IMM;
1188 				s->k = 0;
1189 				vstore(s, &val[A_ATOM], K(s->k), alter);
1190 				break;
1191 			}
1192 			else if (op == BPF_NEG) {
1193 				s->code = NOP;
1194 				break;
1195 			}
1196 		}
1197 		val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], val[X_ATOM]);
1198 		break;
1199 
1200 	case BPF_MISC|BPF_TXA:
1201 		vstore(s, &val[A_ATOM], val[X_ATOM], alter);
1202 		break;
1203 
1204 	case BPF_LD|BPF_MEM:
1205 		v = val[s->k];
1206 		if (alter && opt_state->vmap[v].is_const) {
1207 			s->code = BPF_LD|BPF_IMM;
1208 			s->k = opt_state->vmap[v].const_val;
1209 			opt_state->done = 0;
1210 		}
1211 		vstore(s, &val[A_ATOM], v, alter);
1212 		break;
1213 
1214 	case BPF_MISC|BPF_TAX:
1215 		vstore(s, &val[X_ATOM], val[A_ATOM], alter);
1216 		break;
1217 
1218 	case BPF_LDX|BPF_MEM:
1219 		v = val[s->k];
1220 		if (alter && opt_state->vmap[v].is_const) {
1221 			s->code = BPF_LDX|BPF_IMM;
1222 			s->k = opt_state->vmap[v].const_val;
1223 			opt_state->done = 0;
1224 		}
1225 		vstore(s, &val[X_ATOM], v, alter);
1226 		break;
1227 
1228 	case BPF_ST:
1229 		vstore(s, &val[s->k], val[A_ATOM], alter);
1230 		break;
1231 
1232 	case BPF_STX:
1233 		vstore(s, &val[s->k], val[X_ATOM], alter);
1234 		break;
1235 	}
1236 }
1237 
1238 static void
1239 deadstmt(opt_state_t *opt_state, register struct stmt *s, register struct stmt *last[])
1240 {
1241 	register int atom;
1242 
1243 	atom = atomuse(s);
1244 	if (atom >= 0) {
1245 		if (atom == AX_ATOM) {
1246 			last[X_ATOM] = 0;
1247 			last[A_ATOM] = 0;
1248 		}
1249 		else
1250 			last[atom] = 0;
1251 	}
1252 	atom = atomdef(s);
1253 	if (atom >= 0) {
1254 		if (last[atom]) {
1255 			opt_state->done = 0;
1256 			last[atom]->code = NOP;
1257 		}
1258 		last[atom] = s;
1259 	}
1260 }
1261 
1262 static void
1263 opt_deadstores(opt_state_t *opt_state, register struct block *b)
1264 {
1265 	register struct slist *s;
1266 	register int atom;
1267 	struct stmt *last[N_ATOMS];
1268 
1269 	memset((char *)last, 0, sizeof last);
1270 
1271 	for (s = b->stmts; s != 0; s = s->next)
1272 		deadstmt(opt_state, &s->s, last);
1273 	deadstmt(opt_state, &b->s, last);
1274 
1275 	for (atom = 0; atom < N_ATOMS; ++atom)
1276 		if (last[atom] && !ATOMELEM(b->out_use, atom)) {
1277 			last[atom]->code = NOP;
1278 			opt_state->done = 0;
1279 		}
1280 }
1281 
1282 static void
1283 opt_blk(compiler_state_t *cstate, opt_state_t *opt_state,
1284     struct block *b, int do_stmts)
1285 {
1286 	struct slist *s;
1287 	struct edge *p;
1288 	int i;
1289 	bpf_int32 aval, xval;
1290 
1291 #if 0
1292 	for (s = b->stmts; s && s->next; s = s->next)
1293 		if (BPF_CLASS(s->s.code) == BPF_JMP) {
1294 			do_stmts = 0;
1295 			break;
1296 		}
1297 #endif
1298 
1299 	/*
1300 	 * Initialize the atom values.
1301 	 */
1302 	p = b->in_edges;
1303 	if (p == 0) {
1304 		/*
1305 		 * We have no predecessors, so everything is undefined
1306 		 * upon entry to this block.
1307 		 */
1308 		memset((char *)b->val, 0, sizeof(b->val));
1309 	} else {
1310 		/*
1311 		 * Inherit values from our predecessors.
1312 		 *
1313 		 * First, get the values from the predecessor along the
1314 		 * first edge leading to this node.
1315 		 */
1316 		memcpy((char *)b->val, (char *)p->pred->val, sizeof(b->val));
1317 		/*
1318 		 * Now look at all the other nodes leading to this node.
1319 		 * If, for the predecessor along that edge, a register
1320 		 * has a different value from the one we have (i.e.,
1321 		 * control paths are merging, and the merging paths
1322 		 * assign different values to that register), give the
1323 		 * register the undefined value of 0.
1324 		 */
1325 		while ((p = p->next) != NULL) {
1326 			for (i = 0; i < N_ATOMS; ++i)
1327 				if (b->val[i] != p->pred->val[i])
1328 					b->val[i] = 0;
1329 		}
1330 	}
1331 	aval = b->val[A_ATOM];
1332 	xval = b->val[X_ATOM];
1333 	for (s = b->stmts; s; s = s->next)
1334 		opt_stmt(cstate, opt_state, &s->s, b->val, do_stmts);
1335 
1336 	/*
1337 	 * This is a special case: if we don't use anything from this
1338 	 * block, and we load the accumulator or index register with a
1339 	 * value that is already there, or if this block is a return,
1340 	 * eliminate all the statements.
1341 	 *
1342 	 * XXX - what if it does a store?
1343 	 *
1344 	 * XXX - why does it matter whether we use anything from this
1345 	 * block?  If the accumulator or index register doesn't change
1346 	 * its value, isn't that OK even if we use that value?
1347 	 *
1348 	 * XXX - if we load the accumulator with a different value,
1349 	 * and the block ends with a conditional branch, we obviously
1350 	 * can't eliminate it, as the branch depends on that value.
1351 	 * For the index register, the conditional branch only depends
1352 	 * on the index register value if the test is against the index
1353 	 * register value rather than a constant; if nothing uses the
1354 	 * value we put into the index register, and we're not testing
1355 	 * against the index register's value, and there aren't any
1356 	 * other problems that would keep us from eliminating this
1357 	 * block, can we eliminate it?
1358 	 */
1359 	if (do_stmts &&
1360 	    ((b->out_use == 0 &&
1361 	      aval != VAL_UNKNOWN && b->val[A_ATOM] == aval &&
1362 	      xval != VAL_UNKNOWN && b->val[X_ATOM] == xval) ||
1363 	     BPF_CLASS(b->s.code) == BPF_RET)) {
1364 		if (b->stmts != 0) {
1365 			b->stmts = 0;
1366 			opt_state->done = 0;
1367 		}
1368 	} else {
1369 		opt_peep(opt_state, b);
1370 		opt_deadstores(opt_state, b);
1371 	}
1372 	/*
1373 	 * Set up values for branch optimizer.
1374 	 */
1375 	if (BPF_SRC(b->s.code) == BPF_K)
1376 		b->oval = K(b->s.k);
1377 	else
1378 		b->oval = b->val[X_ATOM];
1379 	b->et.code = b->s.code;
1380 	b->ef.code = -b->s.code;
1381 }
1382 
1383 /*
1384  * Return true if any register that is used on exit from 'succ' has
1385  * an exit value that is different from the corresponding exit value
1386  * from 'b'.
1387  */
1388 static int
1389 use_conflict(struct block *b, struct block *succ)
1390 {
1391 	int atom;
1392 	atomset use = succ->out_use;
1393 
1394 	if (use == 0)
1395 		return 0;
1396 
1397 	for (atom = 0; atom < N_ATOMS; ++atom)
1398 		if (ATOMELEM(use, atom))
1399 			if (b->val[atom] != succ->val[atom])
1400 				return 1;
1401 	return 0;
1402 }
1403 
1404 static struct block *
1405 fold_edge(struct block *child, struct edge *ep)
1406 {
1407 	int sense;
1408 	int aval0, aval1, oval0, oval1;
1409 	int code = ep->code;
1410 
1411 	if (code < 0) {
1412 		code = -code;
1413 		sense = 0;
1414 	} else
1415 		sense = 1;
1416 
1417 	if (child->s.code != code)
1418 		return 0;
1419 
1420 	aval0 = child->val[A_ATOM];
1421 	oval0 = child->oval;
1422 	aval1 = ep->pred->val[A_ATOM];
1423 	oval1 = ep->pred->oval;
1424 
1425 	if (aval0 != aval1)
1426 		return 0;
1427 
1428 	if (oval0 == oval1)
1429 		/*
1430 		 * The operands of the branch instructions are
1431 		 * identical, so the result is true if a true
1432 		 * branch was taken to get here, otherwise false.
1433 		 */
1434 		return sense ? JT(child) : JF(child);
1435 
1436 	if (sense && code == (BPF_JMP|BPF_JEQ|BPF_K))
1437 		/*
1438 		 * At this point, we only know the comparison if we
1439 		 * came down the true branch, and it was an equality
1440 		 * comparison with a constant.
1441 		 *
1442 		 * I.e., if we came down the true branch, and the branch
1443 		 * was an equality comparison with a constant, we know the
1444 		 * accumulator contains that constant.  If we came down
1445 		 * the false branch, or the comparison wasn't with a
1446 		 * constant, we don't know what was in the accumulator.
1447 		 *
1448 		 * We rely on the fact that distinct constants have distinct
1449 		 * value numbers.
1450 		 */
1451 		return JF(child);
1452 
1453 	return 0;
1454 }
1455 
1456 static void
1457 opt_j(opt_state_t *opt_state, struct edge *ep)
1458 {
1459 	register int i, k;
1460 	register struct block *target;
1461 
1462 	if (JT(ep->succ) == 0)
1463 		return;
1464 
1465 	if (JT(ep->succ) == JF(ep->succ)) {
1466 		/*
1467 		 * Common branch targets can be eliminated, provided
1468 		 * there is no data dependency.
1469 		 */
1470 		if (!use_conflict(ep->pred, ep->succ->et.succ)) {
1471 			opt_state->done = 0;
1472 			ep->succ = JT(ep->succ);
1473 		}
1474 	}
1475 	/*
1476 	 * For each edge dominator that matches the successor of this
1477 	 * edge, promote the edge successor to its grandchild.
1478 	 *
1479 	 * XXX We violate the set abstraction here in favor of a reasonably
1480 	 * efficient loop.
1481 	 */
1482  top:
1483 	for (i = 0; i < opt_state->edgewords; ++i) {
1484 		register bpf_u_int32 x = ep->edom[i];
1485 
1486 		while (x != 0) {
1487 			k = lowest_set_bit(x);
1488 			x &=~ (1 << k);
1489 			k += i * BITS_PER_WORD;
1490 
1491 			target = fold_edge(ep->succ, opt_state->edges[k]);
1492 			/*
1493 			 * Check that there is no data dependency between
1494 			 * nodes that will be violated if we move the edge.
1495 			 */
1496 			if (target != 0 && !use_conflict(ep->pred, target)) {
1497 				opt_state->done = 0;
1498 				ep->succ = target;
1499 				if (JT(target) != 0)
1500 					/*
1501 					 * Start over unless we hit a leaf.
1502 					 */
1503 					goto top;
1504 				return;
1505 			}
1506 		}
1507 	}
1508 }
1509 
1510 
1511 static void
1512 or_pullup(opt_state_t *opt_state, struct block *b)
1513 {
1514 	int val, at_top;
1515 	struct block *pull;
1516 	struct block **diffp, **samep;
1517 	struct edge *ep;
1518 
1519 	ep = b->in_edges;
1520 	if (ep == 0)
1521 		return;
1522 
1523 	/*
1524 	 * Make sure each predecessor loads the same value.
1525 	 * XXX why?
1526 	 */
1527 	val = ep->pred->val[A_ATOM];
1528 	for (ep = ep->next; ep != 0; ep = ep->next)
1529 		if (val != ep->pred->val[A_ATOM])
1530 			return;
1531 
1532 	if (JT(b->in_edges->pred) == b)
1533 		diffp = &JT(b->in_edges->pred);
1534 	else
1535 		diffp = &JF(b->in_edges->pred);
1536 
1537 	at_top = 1;
1538 	for (;;) {
1539 		if (*diffp == 0)
1540 			return;
1541 
1542 		if (JT(*diffp) != JT(b))
1543 			return;
1544 
1545 		if (!SET_MEMBER((*diffp)->dom, b->id))
1546 			return;
1547 
1548 		if ((*diffp)->val[A_ATOM] != val)
1549 			break;
1550 
1551 		diffp = &JF(*diffp);
1552 		at_top = 0;
1553 	}
1554 	samep = &JF(*diffp);
1555 	for (;;) {
1556 		if (*samep == 0)
1557 			return;
1558 
1559 		if (JT(*samep) != JT(b))
1560 			return;
1561 
1562 		if (!SET_MEMBER((*samep)->dom, b->id))
1563 			return;
1564 
1565 		if ((*samep)->val[A_ATOM] == val)
1566 			break;
1567 
1568 		/* XXX Need to check that there are no data dependencies
1569 		   between diffp and samep.  Currently, the code generator
1570 		   will not produce such dependencies. */
1571 		samep = &JF(*samep);
1572 	}
1573 #ifdef notdef
1574 	/* XXX This doesn't cover everything. */
1575 	for (i = 0; i < N_ATOMS; ++i)
1576 		if ((*samep)->val[i] != pred->val[i])
1577 			return;
1578 #endif
1579 	/* Pull up the node. */
1580 	pull = *samep;
1581 	*samep = JF(pull);
1582 	JF(pull) = *diffp;
1583 
1584 	/*
1585 	 * At the top of the chain, each predecessor needs to point at the
1586 	 * pulled up node.  Inside the chain, there is only one predecessor
1587 	 * to worry about.
1588 	 */
1589 	if (at_top) {
1590 		for (ep = b->in_edges; ep != 0; ep = ep->next) {
1591 			if (JT(ep->pred) == b)
1592 				JT(ep->pred) = pull;
1593 			else
1594 				JF(ep->pred) = pull;
1595 		}
1596 	}
1597 	else
1598 		*diffp = pull;
1599 
1600 	opt_state->done = 0;
1601 }
1602 
1603 static void
1604 and_pullup(opt_state_t *opt_state, struct block *b)
1605 {
1606 	int val, at_top;
1607 	struct block *pull;
1608 	struct block **diffp, **samep;
1609 	struct edge *ep;
1610 
1611 	ep = b->in_edges;
1612 	if (ep == 0)
1613 		return;
1614 
1615 	/*
1616 	 * Make sure each predecessor loads the same value.
1617 	 */
1618 	val = ep->pred->val[A_ATOM];
1619 	for (ep = ep->next; ep != 0; ep = ep->next)
1620 		if (val != ep->pred->val[A_ATOM])
1621 			return;
1622 
1623 	if (JT(b->in_edges->pred) == b)
1624 		diffp = &JT(b->in_edges->pred);
1625 	else
1626 		diffp = &JF(b->in_edges->pred);
1627 
1628 	at_top = 1;
1629 	for (;;) {
1630 		if (*diffp == 0)
1631 			return;
1632 
1633 		if (JF(*diffp) != JF(b))
1634 			return;
1635 
1636 		if (!SET_MEMBER((*diffp)->dom, b->id))
1637 			return;
1638 
1639 		if ((*diffp)->val[A_ATOM] != val)
1640 			break;
1641 
1642 		diffp = &JT(*diffp);
1643 		at_top = 0;
1644 	}
1645 	samep = &JT(*diffp);
1646 	for (;;) {
1647 		if (*samep == 0)
1648 			return;
1649 
1650 		if (JF(*samep) != JF(b))
1651 			return;
1652 
1653 		if (!SET_MEMBER((*samep)->dom, b->id))
1654 			return;
1655 
1656 		if ((*samep)->val[A_ATOM] == val)
1657 			break;
1658 
1659 		/* XXX Need to check that there are no data dependencies
1660 		   between diffp and samep.  Currently, the code generator
1661 		   will not produce such dependencies. */
1662 		samep = &JT(*samep);
1663 	}
1664 #ifdef notdef
1665 	/* XXX This doesn't cover everything. */
1666 	for (i = 0; i < N_ATOMS; ++i)
1667 		if ((*samep)->val[i] != pred->val[i])
1668 			return;
1669 #endif
1670 	/* Pull up the node. */
1671 	pull = *samep;
1672 	*samep = JT(pull);
1673 	JT(pull) = *diffp;
1674 
1675 	/*
1676 	 * At the top of the chain, each predecessor needs to point at the
1677 	 * pulled up node.  Inside the chain, there is only one predecessor
1678 	 * to worry about.
1679 	 */
1680 	if (at_top) {
1681 		for (ep = b->in_edges; ep != 0; ep = ep->next) {
1682 			if (JT(ep->pred) == b)
1683 				JT(ep->pred) = pull;
1684 			else
1685 				JF(ep->pred) = pull;
1686 		}
1687 	}
1688 	else
1689 		*diffp = pull;
1690 
1691 	opt_state->done = 0;
1692 }
1693 
1694 static void
1695 opt_blks(compiler_state_t *cstate, opt_state_t *opt_state, struct icode *ic,
1696     int do_stmts)
1697 {
1698 	int i, maxlevel;
1699 	struct block *p;
1700 
1701 	init_val(opt_state);
1702 	maxlevel = ic->root->level;
1703 
1704 	find_inedges(opt_state, ic->root);
1705 	for (i = maxlevel; i >= 0; --i)
1706 		for (p = opt_state->levels[i]; p; p = p->link)
1707 			opt_blk(cstate, opt_state, p, do_stmts);
1708 
1709 	if (do_stmts)
1710 		/*
1711 		 * No point trying to move branches; it can't possibly
1712 		 * make a difference at this point.
1713 		 */
1714 		return;
1715 
1716 	for (i = 1; i <= maxlevel; ++i) {
1717 		for (p = opt_state->levels[i]; p; p = p->link) {
1718 			opt_j(opt_state, &p->et);
1719 			opt_j(opt_state, &p->ef);
1720 		}
1721 	}
1722 
1723 	find_inedges(opt_state, ic->root);
1724 	for (i = 1; i <= maxlevel; ++i) {
1725 		for (p = opt_state->levels[i]; p; p = p->link) {
1726 			or_pullup(opt_state, p);
1727 			and_pullup(opt_state, p);
1728 		}
1729 	}
1730 }
1731 
1732 static inline void
1733 link_inedge(struct edge *parent, struct block *child)
1734 {
1735 	parent->next = child->in_edges;
1736 	child->in_edges = parent;
1737 }
1738 
1739 static void
1740 find_inedges(opt_state_t *opt_state, struct block *root)
1741 {
1742 	int i;
1743 	struct block *b;
1744 
1745 	for (i = 0; i < opt_state->n_blocks; ++i)
1746 		opt_state->blocks[i]->in_edges = 0;
1747 
1748 	/*
1749 	 * Traverse the graph, adding each edge to the predecessor
1750 	 * list of its successors.  Skip the leaves (i.e. level 0).
1751 	 */
1752 	for (i = root->level; i > 0; --i) {
1753 		for (b = opt_state->levels[i]; b != 0; b = b->link) {
1754 			link_inedge(&b->et, JT(b));
1755 			link_inedge(&b->ef, JF(b));
1756 		}
1757 	}
1758 }
1759 
1760 static void
1761 opt_root(struct block **b)
1762 {
1763 	struct slist *tmp, *s;
1764 
1765 	s = (*b)->stmts;
1766 	(*b)->stmts = 0;
1767 	while (BPF_CLASS((*b)->s.code) == BPF_JMP && JT(*b) == JF(*b))
1768 		*b = JT(*b);
1769 
1770 	tmp = (*b)->stmts;
1771 	if (tmp != 0)
1772 		sappend(s, tmp);
1773 	(*b)->stmts = s;
1774 
1775 	/*
1776 	 * If the root node is a return, then there is no
1777 	 * point executing any statements (since the bpf machine
1778 	 * has no side effects).
1779 	 */
1780 	if (BPF_CLASS((*b)->s.code) == BPF_RET)
1781 		(*b)->stmts = 0;
1782 }
1783 
1784 static void
1785 opt_loop(compiler_state_t *cstate, opt_state_t *opt_state, struct icode *ic,
1786     int do_stmts)
1787 {
1788 
1789 #ifdef BDEBUG
1790 	if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
1791 		printf("opt_loop(root, %d) begin\n", do_stmts);
1792 		opt_dump(cstate, ic);
1793 	}
1794 #endif
1795 	do {
1796 		opt_state->done = 1;
1797 		find_levels(opt_state, ic);
1798 		find_dom(opt_state, ic->root);
1799 		find_closure(opt_state, ic->root);
1800 		find_ud(opt_state, ic->root);
1801 		find_edom(opt_state, ic->root);
1802 		opt_blks(cstate, opt_state, ic, do_stmts);
1803 #ifdef BDEBUG
1804 		if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
1805 			printf("opt_loop(root, %d) bottom, done=%d\n", do_stmts, opt_state->done);
1806 			opt_dump(cstate, ic);
1807 		}
1808 #endif
1809 	} while (!opt_state->done);
1810 }
1811 
1812 /*
1813  * Optimize the filter code in its dag representation.
1814  */
1815 void
1816 bpf_optimize(compiler_state_t *cstate, struct icode *ic)
1817 {
1818 	opt_state_t opt_state;
1819 
1820 	opt_init(cstate, &opt_state, ic);
1821 	opt_loop(cstate, &opt_state, ic, 0);
1822 	opt_loop(cstate, &opt_state, ic, 1);
1823 	intern_blocks(&opt_state, ic);
1824 #ifdef BDEBUG
1825 	if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
1826 		printf("after intern_blocks()\n");
1827 		opt_dump(cstate, ic);
1828 	}
1829 #endif
1830 	opt_root(&ic->root);
1831 #ifdef BDEBUG
1832 	if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
1833 		printf("after opt_root()\n");
1834 		opt_dump(cstate, ic);
1835 	}
1836 #endif
1837 	opt_cleanup(&opt_state);
1838 }
1839 
1840 static void
1841 make_marks(struct icode *ic, struct block *p)
1842 {
1843 	if (!isMarked(ic, p)) {
1844 		Mark(ic, p);
1845 		if (BPF_CLASS(p->s.code) != BPF_RET) {
1846 			make_marks(ic, JT(p));
1847 			make_marks(ic, JF(p));
1848 		}
1849 	}
1850 }
1851 
1852 /*
1853  * Mark code array such that isMarked(ic, p) is true
1854  * only for nodes that are alive.
1855  */
1856 static void
1857 mark_code(struct icode *ic)
1858 {
1859 	ic->cur_mark += 1;
1860 	make_marks(ic, ic->root);
1861 }
1862 
1863 /*
1864  * True iff the two stmt lists load the same value from the packet into
1865  * the accumulator.
1866  */
1867 static int
1868 eq_slist(struct slist *x, struct slist *y)
1869 {
1870 	for (;;) {
1871 		while (x && x->s.code == NOP)
1872 			x = x->next;
1873 		while (y && y->s.code == NOP)
1874 			y = y->next;
1875 		if (x == 0)
1876 			return y == 0;
1877 		if (y == 0)
1878 			return x == 0;
1879 		if (x->s.code != y->s.code || x->s.k != y->s.k)
1880 			return 0;
1881 		x = x->next;
1882 		y = y->next;
1883 	}
1884 }
1885 
1886 static inline int
1887 eq_blk(struct block *b0, struct block *b1)
1888 {
1889 	if (b0->s.code == b1->s.code &&
1890 	    b0->s.k == b1->s.k &&
1891 	    b0->et.succ == b1->et.succ &&
1892 	    b0->ef.succ == b1->ef.succ)
1893 		return eq_slist(b0->stmts, b1->stmts);
1894 	return 0;
1895 }
1896 
1897 static void
1898 intern_blocks(opt_state_t *opt_state, struct icode *ic)
1899 {
1900 	struct block *p;
1901 	int i, j;
1902 	int done1; /* don't shadow global */
1903  top:
1904 	done1 = 1;
1905 	for (i = 0; i < opt_state->n_blocks; ++i)
1906 		opt_state->blocks[i]->link = 0;
1907 
1908 	mark_code(ic);
1909 
1910 	for (i = opt_state->n_blocks - 1; --i >= 0; ) {
1911 		if (!isMarked(ic, opt_state->blocks[i]))
1912 			continue;
1913 		for (j = i + 1; j < opt_state->n_blocks; ++j) {
1914 			if (!isMarked(ic, opt_state->blocks[j]))
1915 				continue;
1916 			if (eq_blk(opt_state->blocks[i], opt_state->blocks[j])) {
1917 				opt_state->blocks[i]->link = opt_state->blocks[j]->link ?
1918 					opt_state->blocks[j]->link : opt_state->blocks[j];
1919 				break;
1920 			}
1921 		}
1922 	}
1923 	for (i = 0; i < opt_state->n_blocks; ++i) {
1924 		p = opt_state->blocks[i];
1925 		if (JT(p) == 0)
1926 			continue;
1927 		if (JT(p)->link) {
1928 			done1 = 0;
1929 			JT(p) = JT(p)->link;
1930 		}
1931 		if (JF(p)->link) {
1932 			done1 = 0;
1933 			JF(p) = JF(p)->link;
1934 		}
1935 	}
1936 	if (!done1)
1937 		goto top;
1938 }
1939 
1940 static void
1941 opt_cleanup(opt_state_t *opt_state)
1942 {
1943 	free((void *)opt_state->vnode_base);
1944 	free((void *)opt_state->vmap);
1945 	free((void *)opt_state->edges);
1946 	free((void *)opt_state->space);
1947 	free((void *)opt_state->levels);
1948 	free((void *)opt_state->blocks);
1949 }
1950 
1951 /*
1952  * Return the number of stmts in 's'.
1953  */
1954 static u_int
1955 slength(struct slist *s)
1956 {
1957 	u_int n = 0;
1958 
1959 	for (; s; s = s->next)
1960 		if (s->s.code != NOP)
1961 			++n;
1962 	return n;
1963 }
1964 
1965 /*
1966  * Return the number of nodes reachable by 'p'.
1967  * All nodes should be initially unmarked.
1968  */
1969 static int
1970 count_blocks(struct icode *ic, struct block *p)
1971 {
1972 	if (p == 0 || isMarked(ic, p))
1973 		return 0;
1974 	Mark(ic, p);
1975 	return count_blocks(ic, JT(p)) + count_blocks(ic, JF(p)) + 1;
1976 }
1977 
1978 /*
1979  * Do a depth first search on the flow graph, numbering the
1980  * basic blocks, and entering them into the 'blocks' array.
1981  */
1982 static void
1983 number_blks_r(opt_state_t *opt_state, struct icode *ic, struct block *p)
1984 {
1985 	int n;
1986 
1987 	if (p == 0 || isMarked(ic, p))
1988 		return;
1989 
1990 	Mark(ic, p);
1991 	n = opt_state->n_blocks++;
1992 	p->id = n;
1993 	opt_state->blocks[n] = p;
1994 
1995 	number_blks_r(opt_state, ic, JT(p));
1996 	number_blks_r(opt_state, ic, JF(p));
1997 }
1998 
1999 /*
2000  * Return the number of stmts in the flowgraph reachable by 'p'.
2001  * The nodes should be unmarked before calling.
2002  *
2003  * Note that "stmts" means "instructions", and that this includes
2004  *
2005  *	side-effect statements in 'p' (slength(p->stmts));
2006  *
2007  *	statements in the true branch from 'p' (count_stmts(JT(p)));
2008  *
2009  *	statements in the false branch from 'p' (count_stmts(JF(p)));
2010  *
2011  *	the conditional jump itself (1);
2012  *
2013  *	an extra long jump if the true branch requires it (p->longjt);
2014  *
2015  *	an extra long jump if the false branch requires it (p->longjf).
2016  */
2017 static u_int
2018 count_stmts(struct icode *ic, struct block *p)
2019 {
2020 	u_int n;
2021 
2022 	if (p == 0 || isMarked(ic, p))
2023 		return 0;
2024 	Mark(ic, p);
2025 	n = count_stmts(ic, JT(p)) + count_stmts(ic, JF(p));
2026 	return slength(p->stmts) + n + 1 + p->longjt + p->longjf;
2027 }
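/*
 * For illustration: a conditional block with three non-NOP statements
 * and no long jumps contributes 3 + 1 = 4 to the total, plus whatever
 * the blocks reachable through its branches contribute; blocks shared
 * by both branches are counted only once because of the marking.
 */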
2028 
2029 /*
2030  * Allocate memory.  All allocation is done before optimization
2031  * is begun.  A linear bound on the size of all data structures is computed
2032  * from the total number of blocks and/or statements.
2033  */
2034 static void
2035 opt_init(compiler_state_t *cstate, opt_state_t *opt_state, struct icode *ic)
2036 {
2037 	bpf_u_int32 *p;
2038 	int i, n, max_stmts;
2039 
2040 	/*
2041 	 * First, count the blocks, so we can malloc an array to map
2042 	 * block number to block.  Then, put the blocks into the array.
2043 	 */
2044 	unMarkAll(ic);
2045 	n = count_blocks(ic, ic->root);
2046 	opt_state->blocks = (struct block **)calloc(n, sizeof(*opt_state->blocks));
2047 	if (opt_state->blocks == NULL)
2048 		bpf_error(cstate, "malloc");
2049 	unMarkAll(ic);
2050 	opt_state->n_blocks = 0;
2051 	number_blks_r(opt_state, ic, ic->root);
2052 
2053 	opt_state->n_edges = 2 * opt_state->n_blocks;
2054 	opt_state->edges = (struct edge **)calloc(opt_state->n_edges, sizeof(*opt_state->edges));
2055 	if (opt_state->edges == NULL)
2056 		bpf_error(cstate, "malloc");
2057 
2058 	/*
2059 	 * The number of levels is bounded by the number of nodes.
2060 	 */
2061 	opt_state->levels = (struct block **)calloc(opt_state->n_blocks, sizeof(*opt_state->levels));
2062 	if (opt_state->levels == NULL)
2063 		bpf_error(cstate, "malloc");
2064 
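	/*
	 * Number of bpf_u_int32 words needed for a bit vector with one bit
	 * per edge (edgewords) or one bit per node (nodewords); the dominator
	 * and closure sets carved out below are bit vectors of these sizes.
	 */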
2065 	opt_state->edgewords = opt_state->n_edges / (8 * sizeof(bpf_u_int32)) + 1;
2066 	opt_state->nodewords = opt_state->n_blocks / (8 * sizeof(bpf_u_int32)) + 1;
2067 
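	/*
	 * 'space' is one arena, carved up below into:
	 *   n_blocks * nodewords words of per-block dominator sets,
	 *   n_blocks * nodewords words of per-block closure sets, and
	 *   n_edges * edgewords words of per-edge dominator sets
	 *   (one et and one ef edge per block).
	 */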
2068 	/* XXX */
2069 	opt_state->space = (bpf_u_int32 *)malloc(2 * opt_state->n_blocks * opt_state->nodewords * sizeof(*opt_state->space)
2070 				 + opt_state->n_edges * opt_state->edgewords * sizeof(*opt_state->space));
2071 	if (opt_state->space == NULL)
2072 		bpf_error(cstate, "malloc");
2073 	p = opt_state->space;
2074 	opt_state->all_dom_sets = p;
2075 	for (i = 0; i < n; ++i) {
2076 		opt_state->blocks[i]->dom = p;
2077 		p += opt_state->nodewords;
2078 	}
2079 	opt_state->all_closure_sets = p;
2080 	for (i = 0; i < n; ++i) {
2081 		opt_state->blocks[i]->closure = p;
2082 		p += opt_state->nodewords;
2083 	}
2084 	opt_state->all_edge_sets = p;
2085 	for (i = 0; i < n; ++i) {
2086 		register struct block *b = opt_state->blocks[i];
2087 
2088 		b->et.edom = p;
2089 		p += opt_state->edgewords;
2090 		b->ef.edom = p;
2091 		p += opt_state->edgewords;
2092 		b->et.id = i;
2093 		opt_state->edges[i] = &b->et;
2094 		b->ef.id = opt_state->n_blocks + i;
2095 		opt_state->edges[opt_state->n_blocks + i] = &b->ef;
2096 		b->et.pred = b;
2097 		b->ef.pred = b;
2098 	}
2099 	max_stmts = 0;
2100 	for (i = 0; i < n; ++i)
2101 		max_stmts += slength(opt_state->blocks[i]->stmts) + 1;
2102 	/*
2103 	 * We allocate at most 3 value numbers per statement,
2104 	 * so this is an upper bound on the number of valnodes
2105 	 * we'll need.
2106 	 */
2107 	opt_state->maxval = 3 * max_stmts;
2108 	opt_state->vmap = (struct vmapinfo *)calloc(opt_state->maxval, sizeof(*opt_state->vmap));
2109 	opt_state->vnode_base = (struct valnode *)calloc(opt_state->maxval, sizeof(*opt_state->vnode_base));
2110 	if (opt_state->vmap == NULL || opt_state->vnode_base == NULL)
2111 		bpf_error(cstate, "malloc");
2112 }
2113 
2114 /*
2115  * This is only used when supporting optimizer debugging.  It is
2116  * global state, so do *not* do more than one compile in parallel
2117  * and expect it to provide meaningful information.
2118  */
2119 #ifdef BDEBUG
2120 int bids[NBIDS];
2121 #endif
2122 
2123 /*
2124  * Returns true if successful.  Returns false if a branch has
2125  * an offset that is too large.  If so, we have marked that
2126  * branch so that on a subsequent iteration, it will be treated
2127  * properly.
2128  */
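/*
 * Code is emitted backwards: conv_state->ftail starts just past the end of
 * the output array, both successors are converted first, and this block's
 * instructions are then written at ftail -= (its length), so every block
 * lands at a lower address than the blocks it jumps to.
 */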
2129 static int
2130 convert_code_r(compiler_state_t *cstate, conv_state_t *conv_state,
2131     struct icode *ic, struct block *p)
2132 {
2133 	struct bpf_insn *dst;
2134 	struct slist *src;
2135 	u_int slen;
2136 	u_int off;
2137 	u_int extrajmps;	/* number of extra jumps inserted */
2138 	struct slist **offset = NULL;
2139 
2140 	if (p == 0 || isMarked(ic, p))
2141 		return (1);
2142 	Mark(ic, p);
2143 
2144 	if (convert_code_r(cstate, conv_state, ic, JF(p)) == 0)
2145 		return (0);
2146 	if (convert_code_r(cstate, conv_state, ic, JT(p)) == 0)
2147 		return (0);
2148 
2149 	slen = slength(p->stmts);
2150 	dst = conv_state->ftail -= (slen + 1 + p->longjt + p->longjf);
2151 		/* inflate length by any extra jumps */
2152 
2153 	p->offset = (int)(dst - conv_state->fstart);
2154 
2155 	/* Build offset[] so block-local jump targets can be located by index below. */
2156 	if (slen) {
2157 		offset = (struct slist **)calloc(slen, sizeof(struct slist *));
2158 		if (!offset) {
2159 			bpf_error(cstate, "not enough core");
2160 			/*NOTREACHED*/
2161 		}
2162 	}
2163 	src = p->stmts;
2164 	for (off = 0; off < slen && src; off++) {
2165 #if 0
2166 		printf("off=%d src=%x\n", off, src);
2167 #endif
2168 		offset[off] = src;
2169 		src = src->next;
2170 	}
2171 
2172 	off = 0;
2173 	for (src = p->stmts; src; src = src->next) {
2174 		if (src->s.code == NOP)
2175 			continue;
2176 		dst->code = (u_short)src->s.code;
2177 		dst->k = src->s.k;
2178 
2179 		/* fill block-local relative jump */
2180 		if (BPF_CLASS(src->s.code) != BPF_JMP || src->s.code == (BPF_JMP|BPF_JA)) {
2181 #if 0
2182 			if (src->s.jt || src->s.jf) {
2183 				bpf_error(cstate, "illegal jmp destination");
2184 				/*NOTREACHED*/
2185 			}
2186 #endif
2187 			goto filled;
2188 		}
2189 		if (off == slen - 2)	/*???*/
2190 			goto filled;
2191 
2192 	    {
2193 		u_int i;
2194 		int jt, jf;
2195 		const char ljerr[] = "%s for block-local relative jump: off=%d";
2196 
2197 #if 0
2198 		printf("code=%x off=%d %x %x\n", src->s.code,
2199 			off, src->s.jt, src->s.jf);
2200 #endif
2201 
2202 		if (!src->s.jt || !src->s.jf) {
2203 			bpf_error(cstate, ljerr, "no jmp destination", off);
2204 			/*NOTREACHED*/
2205 		}
2206 
2207 		jt = jf = 0;
2208 		for (i = 0; i < slen; i++) {
2209 			if (offset[i] == src->s.jt) {
2210 				if (jt) {
2211 					bpf_error(cstate, ljerr, "multiple matches", off);
2212 					/*NOTREACHED*/
2213 				}
2214 
2215 				if (i - off - 1 >= 256) {
2216 					bpf_error(cstate, ljerr, "out-of-range jump", off);
2217 					/*NOTREACHED*/
2218 				}
2219 				dst->jt = (u_char)(i - off - 1);
2220 				jt++;
2221 			}
2222 			if (offset[i] == src->s.jf) {
2223 				if (jf) {
2224 					bpf_error(cstate, ljerr, "multiple matches", off);
2225 					/*NOTREACHED*/
2226 				}
2227 				if (i - off - 1 >= 256) {
2228 					bpf_error(cstate, ljerr, "out-of-range jump", off);
2229 					/*NOTREACHED*/
2230 				}
2231 				dst->jf = (u_char)(i - off - 1);
2232 				jf++;
2233 			}
2234 		}
2235 		if (!jt || !jf) {
2236 			bpf_error(cstate, ljerr, "no destination found", off);
2237 			/*NOTREACHED*/
2238 		}
2239 	    }
2240 filled:
2241 		++dst;
2242 		++off;
2243 	}
2244 	if (offset)
2245 		free(offset);
2246 
2247 #ifdef BDEBUG
2248 	if (dst - conv_state->fstart < NBIDS)
2249 		bids[dst - conv_state->fstart] = p->id + 1;
2250 #endif
2251 	dst->code = (u_short)p->s.code;
2252 	dst->k = p->s.k;
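	/*
	 * jt/jf are 8-bit offsets.  If a target is 256 or more instructions
	 * away, aim the branch at an extra BPF_JA appended right after it;
	 * the first time that happens, bump p->longjt/p->longjf and return 0
	 * so the caller redoes the conversion with space reserved for it.
	 */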
2253 	if (JT(p)) {
2254 		extrajmps = 0;
2255 		off = JT(p)->offset - (p->offset + slen) - 1;
2256 		if (off >= 256) {
2257 		    /* offset too large for branch, must add a jump */
2258 		    if (p->longjt == 0) {
2259 		    	/* mark this instruction and retry */
2260 			p->longjt++;
2261 			return(0);
2262 		    }
2263 		    /* branch if T to following jump */
2264 		    if (extrajmps >= 256) {
2265 			bpf_error(cstate, "too many extra jumps");
2266 			/*NOTREACHED*/
2267 		    }
2268 		    dst->jt = (u_char)extrajmps;
2269 		    extrajmps++;
2270 		    dst[extrajmps].code = BPF_JMP|BPF_JA;
2271 		    dst[extrajmps].k = off - extrajmps;
2272 		}
2273 		else
2274 		    dst->jt = (u_char)off;
2275 		off = JF(p)->offset - (p->offset + slen) - 1;
2276 		if (off >= 256) {
2277 		    /* offset too large for branch, must add a jump */
2278 		    if (p->longjf == 0) {
2279 		    	/* mark this instruction and retry */
2280 			p->longjf++;
2281 			return(0);
2282 		    }
2283 		    /* branch if F to following jump */
2284 		    /* if two jumps are inserted, F goes to second one */
2285 		    if (extrajmps >= 256) {
2286 			bpf_error(cstate, "too many extra jumps");
2287 			/*NOTREACHED*/
2288 		    }
2289 		    dst->jf = (u_char)extrajmps;
2290 		    extrajmps++;
2291 		    dst[extrajmps].code = BPF_JMP|BPF_JA;
2292 		    dst[extrajmps].k = off - extrajmps;
2293 		}
2294 		else
2295 		    dst->jf = (u_char)off;
2296 	}
2297 	return (1);
2298 }
2299 
2300 
2301 /*
2302  * Convert flowgraph intermediate representation to the
2303  * BPF array representation.  Set *lenp to the number of instructions.
2304  *
2305  * This routine does *NOT* leak the memory pointed to by fp.  It *must
2306  * not* do free(fp) before returning fp; doing so would make no sense,
2307  * as the BPF array pointed to by the return value of icode_to_fcode()
2308  * must be valid - it's being returned for use in a bpf_program structure.
2309  *
2310  * If it appears that icode_to_fcode() is leaking, the problem is that
2311  * the program using pcap_compile() is failing to free the memory in
2312  * the BPF program when it's done - the leak is in the program, not in
2313  * the routine that happens to be allocating the memory.  (By analogy, if
2314  * a program calls fopen() without ever calling fclose() on the FILE *,
2315  * it will leak the FILE structure; the leak is not in fopen(), it's in
2316  * the program.)  Change the program to use pcap_freecode() when it's
2317  * done with the filter program.  See the pcap man page.
2318  */
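/*
 * Typical use, mirroring dot_dump()/plain_dump() below: fill in a
 * struct bpf_program and release the array when done, e.g.
 *
 *	struct bpf_program f;
 *	f.bf_insns = icode_to_fcode(cstate, ic, ic->root, &f.bf_len);
 *	...
 *	free((char *)f.bf_insns);
 */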
2319 struct bpf_insn *
2320 icode_to_fcode(compiler_state_t *cstate, struct icode *ic,
2321     struct block *root, u_int *lenp)
2322 {
2323 	u_int n;
2324 	struct bpf_insn *fp;
2325 	conv_state_t conv_state;
2326 
2327 	/*
2328 	 * Loop doing convert_code_r() until no branches remain
2329 	 * with too-large offsets.
2330 	 */
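	/*
	 * Each failed pass bumps some block's longjt/longjf, which raises
	 * count_stmts(), so the array is freed and reallocated at the new
	 * size before the next attempt.
	 */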
2331 	for (;;) {
2332 	    unMarkAll(ic);
2333 	    n = *lenp = count_stmts(ic, root);
2334 
2335 	    fp = (struct bpf_insn *)malloc(sizeof(*fp) * n);
2336 	    if (fp == NULL)
2337 		    bpf_error(cstate, "malloc");
2338 	    memset((char *)fp, 0, sizeof(*fp) * n);
2339 	    conv_state.fstart = fp;
2340 	    conv_state.ftail = fp + n;
2341 
2342 	    unMarkAll(ic);
2343 	    if (convert_code_r(cstate, &conv_state, ic, root))
2344 		break;
2345 	    free(fp);
2346 	}
2347 
2348 	return fp;
2349 }
2350 
2351 /*
2352  * Make a copy of a BPF program and put it in the "fcode" member of
2353  * a "pcap_t".
2354  *
2355  * If we fail to allocate memory for the copy, fill in the "errbuf"
2356  * member of the "pcap_t" with an error message, and return -1;
2357  * otherwise, return 0.
2358  */
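/*
 * Illustrative sketch only (my_setfilter is hypothetical, not part of
 * libpcap): a capture module that filters in userland might call this
 * from its setfilter routine roughly as
 *
 *	static int
 *	my_setfilter(pcap_t *p, struct bpf_program *fp)
 *	{
 *		return (install_bpf_program(p, fp));
 *	}
 *
 * leaving a private copy of the program in p->fcode.
 */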
2359 int
2360 install_bpf_program(pcap_t *p, struct bpf_program *fp)
2361 {
2362 	size_t prog_size;
2363 
2364 	/*
2365 	 * Validate the program.
2366 	 */
2367 	if (!bpf_validate(fp->bf_insns, fp->bf_len)) {
2368 		pcap_snprintf(p->errbuf, sizeof(p->errbuf),
2369 			"BPF program is not valid");
2370 		return (-1);
2371 	}
2372 
2373 	/*
2374 	 * Free up any already installed program.
2375 	 */
2376 	pcap_freecode(&p->fcode);
2377 
2378 	prog_size = sizeof(*fp->bf_insns) * fp->bf_len;
2379 	p->fcode.bf_len = fp->bf_len;
2380 	p->fcode.bf_insns = (struct bpf_insn *)malloc(prog_size);
2381 	if (p->fcode.bf_insns == NULL) {
2382 		pcap_fmt_errmsg_for_errno(p->errbuf, sizeof(p->errbuf),
2383 		    errno, "malloc");
2384 		return (-1);
2385 	}
2386 	memcpy(p->fcode.bf_insns, fp->bf_insns, prog_size);
2387 	return (0);
2388 }
2389 
2390 #ifdef BDEBUG
2391 static void
2392 dot_dump_node(struct icode *ic, struct block *block, struct bpf_program *prog,
2393     FILE *out)
2394 {
2395 	int icount, noffset;
2396 	int i;
2397 
2398 	if (block == NULL || isMarked(ic, block))
2399 		return;
2400 	Mark(ic, block);
2401 
2402 	icount = slength(block->stmts) + 1 + block->longjt + block->longjf;
2403 	noffset = min(block->offset + icount, (int)prog->bf_len);
2404 
2405 	fprintf(out, "\tblock%d [shape=ellipse, id=\"block-%d\" label=\"BLOCK%d\\n", block->id, block->id, block->id);
2406 	for (i = block->offset; i < noffset; i++) {
2407 		fprintf(out, "\\n%s", bpf_image(prog->bf_insns + i, i));
2408 	}
2409 	fprintf(out, "\" tooltip=\"");
2410 	for (i = 0; i < BPF_MEMWORDS; i++)
2411 		if (block->val[i] != VAL_UNKNOWN)
2412 			fprintf(out, "val[%d]=%d ", i, block->val[i]);
2413 	fprintf(out, "val[A]=%d ", block->val[A_ATOM]);
2414 	fprintf(out, "val[X]=%d", block->val[X_ATOM]);
2415 	fprintf(out, "\"");
2416 	if (JT(block) == NULL)
2417 		fprintf(out, ", peripheries=2");
2418 	fprintf(out, "];\n");
2419 
2420 	dot_dump_node(ic, JT(block), prog, out);
2421 	dot_dump_node(ic, JF(block), prog, out);
2422 }
2423 
2424 static void
2425 dot_dump_edge(struct icode *ic, struct block *block, FILE *out)
2426 {
2427 	if (block == NULL || isMarked(ic, block))
2428 		return;
2429 	Mark(ic, block);
2430 
2431 	if (JT(block)) {
2432 		fprintf(out, "\t\"block%d\":se -> \"block%d\":n [label=\"T\"]; \n",
2433 				block->id, JT(block)->id);
2434 		fprintf(out, "\t\"block%d\":sw -> \"block%d\":n [label=\"F\"]; \n",
2435 			   block->id, JF(block)->id);
2436 	}
2437 	dot_dump_edge(ic, JT(block), out);
2438 	dot_dump_edge(ic, JF(block), out);
2439 }
2440 
2441 /* Output the block CFG using the graphviz/DOT language.
2442  * In the CFG, each block's code, the value index for each register at EXIT,
2443  * and the jump relationships are shown.
2444  *
2445  * example DOT for BPF `ip src host 1.1.1.1' is:
2446     digraph BPF {
2447     	block0 [shape=ellipse, id="block-0" label="BLOCK0\n\n(000) ldh      [12]\n(001) jeq      #0x800           jt 2	jf 5" tooltip="val[A]=0 val[X]=0"];
2448     	block1 [shape=ellipse, id="block-1" label="BLOCK1\n\n(002) ld       [26]\n(003) jeq      #0x1010101       jt 4	jf 5" tooltip="val[A]=0 val[X]=0"];
2449     	block2 [shape=ellipse, id="block-2" label="BLOCK2\n\n(004) ret      #68" tooltip="val[A]=0 val[X]=0", peripheries=2];
2450     	block3 [shape=ellipse, id="block-3" label="BLOCK3\n\n(005) ret      #0" tooltip="val[A]=0 val[X]=0", peripheries=2];
2451     	"block0":se -> "block1":n [label="T"];
2452     	"block0":sw -> "block3":n [label="F"];
2453     	"block1":se -> "block2":n [label="T"];
2454     	"block1":sw -> "block3":n [label="F"];
2455     }
2456  *
2457  *  After installing graphviz from http://www.graphviz.org/, save it as bpf.dot
2458  *  and run `dot -Tpng -O bpf.dot' to draw the graph.
2459  */
2460 static void
2461 dot_dump(compiler_state_t *cstate, struct icode *ic)
2462 {
2463 	struct bpf_program f;
2464 	FILE *out = stdout;
2465 
2466 	memset(bids, 0, sizeof bids);
2467 	f.bf_insns = icode_to_fcode(cstate, ic, ic->root, &f.bf_len);
2468 
2469 	fprintf(out, "digraph BPF {\n");
2470 	unMarkAll(ic);
2471 	dot_dump_node(ic, ic->root, &f, out);
2472 	unMarkAll(ic);
2473 	dot_dump_edge(ic, ic->root, out);
2474 	fprintf(out, "}\n");
2475 
2476 	free((char *)f.bf_insns);
2477 }
2478 
2479 static void
2480 plain_dump(compiler_state_t *cstate, struct icode *ic)
2481 {
2482 	struct bpf_program f;
2483 
2484 	memset(bids, 0, sizeof bids);
2485 	f.bf_insns = icode_to_fcode(cstate, ic, ic->root, &f.bf_len);
2486 	bpf_dump(&f, 1);
2487 	putchar('\n');
2488 	free((char *)f.bf_insns);
2489 }
2490 
2491 static void
2492 opt_dump(compiler_state_t *cstate, struct icode *ic)
2493 {
2494 	/*
2495 	 * If the CFG, in DOT format, is requested, output it rather than
2496 	 * the code that would be generated from that graph.
2497 	 */
2498 	if (pcap_print_dot_graph)
2499 		dot_dump(cstate, ic);
2500 	else
2501 		plain_dump(cstate, ic);
2502 }
2503 #endif
2504