xref: /netbsd-src/external/bsd/libpcap/dist/optimize.c (revision 53b02e147d4ed531c0d2a5ca9b3e8026ba3e99b5)
1 /*	$NetBSD: optimize.c,v 1.11 2019/10/01 16:02:11 christos Exp $	*/
2 
3 /*
4  * Copyright (c) 1988, 1989, 1990, 1991, 1993, 1994, 1995, 1996
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that: (1) source code distributions
9  * retain the above copyright notice and this paragraph in its entirety, (2)
10  * distributions including binary code include the above copyright notice and
11  * this paragraph in its entirety in the documentation or other materials
12  * provided with the distribution, and (3) all advertising materials mentioning
13  * features or use of this software display the following acknowledgement:
14  * ``This product includes software developed by the University of California,
15  * Lawrence Berkeley Laboratory and its contributors.'' Neither the name of
16  * the University nor the names of its contributors may be used to endorse
17  * or promote products derived from this software without specific prior
18  * written permission.
19  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
20  * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
21  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
22  *
23  *  Optimization module for BPF code intermediate representation.
24  */
25 
26 #include <sys/cdefs.h>
27 __RCSID("$NetBSD: optimize.c,v 1.11 2019/10/01 16:02:11 christos Exp $");
28 
29 #ifdef HAVE_CONFIG_H
30 #include <config.h>
31 #endif
32 
33 #include <pcap-types.h>
34 
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <memory.h>
38 #include <setjmp.h>
39 #include <string.h>
40 
41 #include <errno.h>
42 
43 #include "pcap-int.h"
44 
45 #include "gencode.h"
46 #include "optimize.h"
47 
48 #ifdef HAVE_OS_PROTO_H
49 #include "os-proto.h"
50 #endif
51 
52 #ifdef BDEBUG
53 /*
54  * The internal "debug printout" flag for the filter expression optimizer.
55  * The code to print that stuff is present only if BDEBUG is defined, so
56  * the flag, and the routine to set it, are defined only if BDEBUG is
57  * defined.
58  */
59 static int pcap_optimizer_debug;
60 
61 /*
62  * Routine to set that flag.
63  *
64  * This is intended for libpcap developers, not for general use.
65  * If you want to set these in a program, you'll have to declare this
66  * routine yourself, with the appropriate DLL import attribute on Windows;
67  * it's not declared in any header file, and won't be declared in any
68  * header file provided by libpcap.
69  */
70 PCAP_API void pcap_set_optimizer_debug(int value);
71 
72 PCAP_API_DEF void
73 pcap_set_optimizer_debug(int value)
74 {
75 	pcap_optimizer_debug = value;
76 }
77 
78 /*
79  * The internal "print dot graph" flag for the filter expression optimizer.
80  * The code to print that stuff is present only if BDEBUG is defined, so
81  * the flag, and the routine to set it, are defined only if BDEBUG is
82  * defined.
83  */
84 static int pcap_print_dot_graph;
85 
86 /*
87  * Routine to set that flag.
88  *
89  * This is intended for libpcap developers, not for general use.
90  * If you want to set these in a program, you'll have to declare this
91  * routine yourself, with the appropriate DLL import attribute on Windows;
92  * it's not declared in any header file, and won't be declared in any
93  * header file provided by libpcap.
94  */
95 PCAP_API void pcap_set_print_dot_graph(int value);
96 
97 PCAP_API_DEF void
98 pcap_set_print_dot_graph(int value)
99 {
100 	pcap_print_dot_graph = value;
101 }
102 
103 #endif
104 
105 /*
106  * lowest_set_bit().
107  *
108  * Takes a 32-bit integer as an argument.
109  *
110  * If handed a non-zero value, returns the index of the lowest set bit,
111  * counting upwards from zero.
112  *
113  * If handed zero, the results are platform- and compiler-dependent.
114  * Keep it out of the light, don't give it any water, don't feed it
115  * after midnight, and don't pass zero to it.
116  *
117  * This is the same as the count of trailing zeroes in the word.
118  */
119 #if PCAP_IS_AT_LEAST_GNUC_VERSION(3,4)
120   /*
121    * GCC 3.4 and later; we have __builtin_ctz().
122    */
123   #define lowest_set_bit(mask) __builtin_ctz(mask)
124 #elif defined(_MSC_VER)
125   /*
126    * Visual Studio; we support only 2005 and later, so use
127    * _BitScanForward().
128    */
129 #include <intrin.h>
130 
131 #ifndef __clang__
132 #pragma intrinsic(_BitScanForward)
133 #endif
134 
135 static __forceinline int
136 lowest_set_bit(int mask)
137 {
138 	unsigned long bit;
139 
140 	/*
141 	 * Don't sign-extend mask if long is longer than int.
142 	 * (It's currently not, in MSVC, even on 64-bit platforms, but....)
143 	 */
144 	if (_BitScanForward(&bit, (unsigned int)mask) == 0)
145 		return -1;	/* mask is zero */
146 	return (int)bit;
147 }
148 #elif defined(MSDOS) && defined(__DJGPP__)
149   /*
150    * MS-DOS with DJGPP, which declares ffs() in <string.h>, which
151    * we've already included.
152    */
153   #define lowest_set_bit(mask)	(ffs((mask)) - 1)
154 #elif (defined(MSDOS) && defined(__WATCOMC__)) || defined(STRINGS_H_DECLARES_FFS)
155   /*
156    * MS-DOS with Watcom C, which has <strings.h> and declares ffs() there,
157    * or some other platform (UN*X conforming to a sufficient recent version
158    * of the Single UNIX Specification).
159    */
160   #include <strings.h>
161   #define lowest_set_bit(mask)	(ffs((mask)) - 1)
162 #else
163 /*
164  * None of the above.
165  * Use a perfect-hash-function-based function.
166  */
167 static int
168 lowest_set_bit(int mask)
169 {
170 	unsigned int v = (unsigned int)mask;
171 
172 	static const int MultiplyDeBruijnBitPosition[32] = {
173 		0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
174 		31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
175 	};
176 
177 	/*
178 	 * We strip off all but the lowermost set bit (v & ~v),
179 	 * and perform a minimal perfect hash on it to look up the
180 	 * number of low-order zero bits in a table.
181 	 *
182 	 * See:
183 	 *
184 	 *	http://7ooo.mooo.com/text/ComputingTrailingZerosHOWTO.pdf
185 	 *
186 	 *	http://supertech.csail.mit.edu/papers/debruijn.pdf
187 	 */
188 	return (MultiplyDeBruijnBitPosition[((v & -v) * 0x077CB531U) >> 27]);
189 }
190 #endif
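/*
 * Illustrative sketch only (not part of the original source): a worked
 * example of the De Bruijn fallback above.  For mask = 12 (binary 1100),
 * v & -v isolates the lowest set bit, 4; multiplying by 0x077CB531U and
 * shifting right by 27 gives table index 3, and
 * MultiplyDeBruijnBitPosition[3] == 2, the index of the lowest set bit.
 * The disabled self-check below assumes only whichever lowest_set_bit()
 * definition was selected above.
 */
#if 0
static void
lowest_set_bit_selfcheck(void)
{
	/* 12 == binary 1100: the lowest set bit is bit 2. */
	if (lowest_set_bit(12) != 2)
		abort();
	/* 0x80000000: only the high bit is set, so the answer is 31. */
	if (lowest_set_bit((int)0x80000000U) != 31)
		abort();
}
#endif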
191 
192 /*
193  * Represents a deleted instruction.
194  */
195 #define NOP -1
196 
197 /*
198  * Register numbers for use-def values.
199  * 0 through BPF_MEMWORDS-1 represent the corresponding scratch memory
200  * location.  A_ATOM is the accumulator and X_ATOM is the index
201  * register.
202  */
203 #define A_ATOM BPF_MEMWORDS
204 #define X_ATOM (BPF_MEMWORDS+1)
205 
206 /*
207  * This define is used to represent *both* the accumulator and
208  * x register in use-def computations.
209  * Currently, the use-def code assumes only one definition per instruction.
210  */
211 #define AX_ATOM N_ATOMS
212 
213 /*
214  * These data structures are used in a Cocke and Schwartz style
215  * value numbering scheme.  Since the flowgraph is acyclic,
216  * exit values can be propagated from a node's predecessors
217  * provided it is uniquely defined.
218  */
219 struct valnode {
220 	int code;
221 	int v0, v1;
222 	int val;
223 	struct valnode *next;
224 };
225 
226 /* Integer constants mapped with the load immediate opcode. */
227 #define K(i) F(opt_state, BPF_LD|BPF_IMM|BPF_W, i, 0L)
228 
229 struct vmapinfo {
230 	int is_const;
231 	bpf_int32 const_val;
232 };
233 
234 typedef struct {
235 	/*
236 	 * Place to longjmp to on an error.
237 	 */
238 	jmp_buf top_ctx;
239 
240 	/*
241 	 * The buffer into which to put an error message.
242 	 */
243 	char *errbuf;
244 
245 	/*
246 	 * A flag to indicate that further optimization is needed.
247 	 * Iterative passes are continued until a given pass yields no
248 	 * branch movement.
249 	 */
250 	int done;
251 
252 	int n_blocks;
253 	struct block **blocks;
254 	int n_edges;
255 	struct edge **edges;
256 
257 	/*
258 	 * A bit vector set representation of the dominators.
259 	 * We round up the set size to the next power of two.
260 	 */
261 	int nodewords;
262 	int edgewords;
263 	struct block **levels;
264 	bpf_u_int32 *space;
265 
266 #define BITS_PER_WORD (8*sizeof(bpf_u_int32))
267 /*
268  * True if a is in uset {p}
269  */
270 #define SET_MEMBER(p, a) \
271 ((p)[(unsigned)(a) / BITS_PER_WORD] & ((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD)))
272 
273 /*
274  * Add 'a' to uset p.
275  */
276 #define SET_INSERT(p, a) \
277 (p)[(unsigned)(a) / BITS_PER_WORD] |= ((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD))
278 
279 /*
280  * Delete 'a' from uset p.
281  */
282 #define SET_DELETE(p, a) \
283 (p)[(unsigned)(a) / BITS_PER_WORD] &= ~((bpf_u_int32)1 << ((unsigned)(a) % BITS_PER_WORD))
284 
285 /*
286  * a := a intersect b
287  */
288 #define SET_INTERSECT(a, b, n)\
289 {\
290 	register bpf_u_int32 *_x = a, *_y = b;\
291 	register int _n = n;\
292 	while (--_n >= 0) *_x++ &= *_y++;\
293 }
294 
295 /*
296  * a := a - b
297  */
298 #define SET_SUBTRACT(a, b, n)\
299 {\
300 	register bpf_u_int32 *_x = a, *_y = b;\
301 	register int _n = n;\
302 	while (--_n >= 0) *_x++ &=~ *_y++;\
303 }
304 
305 /*
306  * a := a union b
307  */
308 #define SET_UNION(a, b, n)\
309 {\
310 	register bpf_u_int32 *_x = a, *_y = b;\
311 	register int _n = n;\
312 	while (--_n >= 0) *_x++ |= *_y++;\
313 }
314 
315 	uset all_dom_sets;
316 	uset all_closure_sets;
317 	uset all_edge_sets;
318 
319 #define MODULUS 213
320 	struct valnode *hashtbl[MODULUS];
321 	int curval;
322 	int maxval;
323 
324 	struct vmapinfo *vmap;
325 	struct valnode *vnode_base;
326 	struct valnode *next_vnode;
327 } opt_state_t;
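/*
 * Illustrative sketch only (not part of the original source): how the
 * uset macros above manipulate a bit-vector set.  Two 32-bit words hold
 * 64 members; the member numbers below are arbitrary examples.
 */
#if 0
static void
uset_example(void)
{
	bpf_u_int32 set[2] = { 0, 0 };	/* empty 64-member set */

	SET_INSERT(set, 3);		/* sets bit 3 of word 0 */
	SET_INSERT(set, 40);		/* sets bit 8 of word 1 */
	if (!SET_MEMBER(set, 40))
		abort();
	SET_DELETE(set, 3);
	if (SET_MEMBER(set, 3))
		abort();
}
#endif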
328 
329 typedef struct {
330 	/*
331 	 * Place to longjmp to on an error.
332 	 */
333 	jmp_buf top_ctx;
334 
335 	/*
336 	 * The buffer into which to put error message.
337 	 * The buffer into which to put an error message.
338 	char *errbuf;
339 
340 	/*
341 	 * Some pointers used to convert the basic block form of the code,
342 	 * into the array form that BPF requires.  'fstart' will point to
343 	 * the malloc'd array while 'ftail' is used during the recursive
344 	 * traversal.
345 	 */
346 	struct bpf_insn *fstart;
347 	struct bpf_insn *ftail;
348 } conv_state_t;
349 
350 static void opt_init(opt_state_t *, struct icode *);
351 static void opt_cleanup(opt_state_t *);
352 static void PCAP_NORETURN opt_error(opt_state_t *, const char *, ...)
353     PCAP_PRINTFLIKE(2, 3);
354 
355 static void intern_blocks(opt_state_t *, struct icode *);
356 
357 static void find_inedges(opt_state_t *, struct block *);
358 #ifdef BDEBUG
359 static void opt_dump(opt_state_t *, struct icode *);
360 #endif
361 
362 #ifndef MAX
363 #define MAX(a,b) ((a)>(b)?(a):(b))
364 #endif
365 
366 static void
367 find_levels_r(opt_state_t *opt_state, struct icode *ic, struct block *b)
368 {
369 	int level;
370 
371 	if (isMarked(ic, b))
372 		return;
373 
374 	Mark(ic, b);
375 	b->link = 0;
376 
377 	if (JT(b)) {
378 		find_levels_r(opt_state, ic, JT(b));
379 		find_levels_r(opt_state, ic, JF(b));
380 		level = MAX(JT(b)->level, JF(b)->level) + 1;
381 	} else
382 		level = 0;
383 	b->level = level;
384 	b->link = opt_state->levels[level];
385 	opt_state->levels[level] = b;
386 }
387 
388 /*
389  * Level graph.  The levels go from 0 at the leaves to
390  * N_LEVELS at the root.  The opt_state->levels[] array points to the
391  * first node of the level list, whose elements are linked
392  * with the 'link' field of the struct block.
393  */
394 static void
395 find_levels(opt_state_t *opt_state, struct icode *ic)
396 {
397 	memset((char *)opt_state->levels, 0, opt_state->n_blocks * sizeof(*opt_state->levels));
398 	unMarkAll(ic);
399 	find_levels_r(opt_state, ic, ic->root);
400 }
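/*
 * Illustrative sketch only (not part of the original source): once
 * find_levels() has run, the later passes visit every block from the
 * root's level down to the leaves with a loop of this shape (see
 * find_dom() and find_ud() below), following the per-level 'link'
 * chains built above.
 */
#if 0
	for (i = root->level; i >= 0; --i)
		for (b = opt_state->levels[i]; b != 0; b = b->link)
			visit(b);	/* hypothetical per-block work */
#endif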
401 
402 /*
403  * Find dominator relationships.
404  * Assumes graph has been leveled.
405  */
406 static void
407 find_dom(opt_state_t *opt_state, struct block *root)
408 {
409 	int i;
410 	struct block *b;
411 	bpf_u_int32 *x;
412 
413 	/*
414 	 * Initialize sets to contain all nodes.
415 	 */
416 	x = opt_state->all_dom_sets;
417 	i = opt_state->n_blocks * opt_state->nodewords;
418 	while (--i >= 0)
419 		*x++ = 0xFFFFFFFFU;
420 	/* Root starts off empty. */
421 	for (i = opt_state->nodewords; --i >= 0;)
422 		root->dom[i] = 0;
423 
424 	/* root->level is the highest level node found. */
425 	for (i = root->level; i >= 0; --i) {
426 		for (b = opt_state->levels[i]; b; b = b->link) {
427 			SET_INSERT(b->dom, b->id);
428 			if (JT(b) == 0)
429 				continue;
430 			SET_INTERSECT(JT(b)->dom, b->dom, opt_state->nodewords);
431 			SET_INTERSECT(JF(b)->dom, b->dom, opt_state->nodewords);
432 		}
433 	}
434 }
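/*
 * In dataflow terms, the pass above computes, for every block n,
 *
 *	Dom(n) = {n} union ( intersection over p in pred(n) of Dom(p) )
 *
 * Starting from all-ones sets makes the intersection well-defined, and
 * because the flow graph is acyclic and already leveled, one top-down
 * sweep (highest level first) reaches the fixed point with no iteration.
 */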
435 
436 static void
437 propedom(opt_state_t *opt_state, struct edge *ep)
438 {
439 	SET_INSERT(ep->edom, ep->id);
440 	if (ep->succ) {
441 		SET_INTERSECT(ep->succ->et.edom, ep->edom, opt_state->edgewords);
442 		SET_INTERSECT(ep->succ->ef.edom, ep->edom, opt_state->edgewords);
443 	}
444 }
445 
446 /*
447  * Compute edge dominators.
448  * Assumes graph has been leveled and predecessors established.
449  */
450 static void
451 find_edom(opt_state_t *opt_state, struct block *root)
452 {
453 	int i;
454 	uset x;
455 	struct block *b;
456 
457 	x = opt_state->all_edge_sets;
458 	for (i = opt_state->n_edges * opt_state->edgewords; --i >= 0; )
459 		x[i] = 0xFFFFFFFFU;
460 
461 	/* root->level is the highest level node found. */
462 	memset(root->et.edom, 0, opt_state->edgewords * sizeof(*(uset)0));
463 	memset(root->ef.edom, 0, opt_state->edgewords * sizeof(*(uset)0));
464 	for (i = root->level; i >= 0; --i) {
465 		for (b = opt_state->levels[i]; b != 0; b = b->link) {
466 			propedom(opt_state, &b->et);
467 			propedom(opt_state, &b->ef);
468 		}
469 	}
470 }
471 
472 /*
473  * Find the backwards transitive closure of the flow graph.  These sets
474  * are backwards in the sense that we find the set of nodes that reach
475  * a given node, not the set of nodes that can be reached by a node.
476  *
477  * Assumes graph has been leveled.
478  */
479 static void
480 find_closure(opt_state_t *opt_state, struct block *root)
481 {
482 	int i;
483 	struct block *b;
484 
485 	/*
486 	 * Initialize sets to contain no nodes.
487 	 */
488 	memset((char *)opt_state->all_closure_sets, 0,
489 	      opt_state->n_blocks * opt_state->nodewords * sizeof(*opt_state->all_closure_sets));
490 
491 	/* root->level is the highest level node found. */
492 	for (i = root->level; i >= 0; --i) {
493 		for (b = opt_state->levels[i]; b; b = b->link) {
494 			SET_INSERT(b->closure, b->id);
495 			if (JT(b) == 0)
496 				continue;
497 			SET_UNION(JT(b)->closure, b->closure, opt_state->nodewords);
498 			SET_UNION(JF(b)->closure, b->closure, opt_state->nodewords);
499 		}
500 	}
501 }
502 
503 /*
504  * Return the register number that is used by s.  If A and X are both
505  * used, return AX_ATOM.  If no register is used, return -1.
506  *
507  * The implementation should probably change to an array access.
508  */
509 static int
510 atomuse(struct stmt *s)
511 {
512 	register int c = s->code;
513 
514 	if (c == NOP)
515 		return -1;
516 
517 	switch (BPF_CLASS(c)) {
518 
519 	case BPF_RET:
520 		return (BPF_RVAL(c) == BPF_A) ? A_ATOM :
521 			(BPF_RVAL(c) == BPF_X) ? X_ATOM : -1;
522 
523 	case BPF_LD:
524 	case BPF_LDX:
525 		return (BPF_MODE(c) == BPF_IND) ? X_ATOM :
526 			(BPF_MODE(c) == BPF_MEM) ? s->k : -1;
527 
528 	case BPF_ST:
529 		return A_ATOM;
530 
531 	case BPF_STX:
532 		return X_ATOM;
533 
534 	case BPF_JMP:
535 	case BPF_ALU:
536 		if (BPF_SRC(c) == BPF_X)
537 			return AX_ATOM;
538 		return A_ATOM;
539 
540 	case BPF_MISC:
541 		return BPF_MISCOP(c) == BPF_TXA ? X_ATOM : A_ATOM;
542 	}
543 	abort();
544 	/* NOTREACHED */
545 }
546 
547 /*
548  * Return the register number that is defined by 's'.  We assume that
549  * a single stmt cannot define more than one register.  If no register
550  * is defined, return -1.
551  *
552  * The implementation should probably change to an array access.
553  */
554 static int
555 atomdef(struct stmt *s)
556 {
557 	if (s->code == NOP)
558 		return -1;
559 
560 	switch (BPF_CLASS(s->code)) {
561 
562 	case BPF_LD:
563 	case BPF_ALU:
564 		return A_ATOM;
565 
566 	case BPF_LDX:
567 		return X_ATOM;
568 
569 	case BPF_ST:
570 	case BPF_STX:
571 		return s->k;
572 
573 	case BPF_MISC:
574 		return BPF_MISCOP(s->code) == BPF_TAX ? X_ATOM : A_ATOM;
575 	}
576 	return -1;
577 }
578 
579 /*
580  * Compute the sets of registers used, defined, and killed by 'b'.
581  *
582  * "Used" means that a statement in 'b' uses the register before any
583  * statement in 'b' defines it, i.e. it uses the value left in
584  * that register by a predecessor block of this block.
585  * "Defined" means that a statement in 'b' defines it.
586  * "Killed" means that a statement in 'b' defines it before any
587  * statement in 'b' uses it, i.e. it kills the value left in that
588  * register by a predecessor block of this block.
589  */
590 static void
591 compute_local_ud(struct block *b)
592 {
593 	struct slist *s;
594 	atomset def = 0, use = 0, killed = 0;
595 	int atom;
596 
597 	for (s = b->stmts; s; s = s->next) {
598 		if (s->s.code == NOP)
599 			continue;
600 		atom = atomuse(&s->s);
601 		if (atom >= 0) {
602 			if (atom == AX_ATOM) {
603 				if (!ATOMELEM(def, X_ATOM))
604 					use |= ATOMMASK(X_ATOM);
605 				if (!ATOMELEM(def, A_ATOM))
606 					use |= ATOMMASK(A_ATOM);
607 			}
608 			else if (atom < N_ATOMS) {
609 				if (!ATOMELEM(def, atom))
610 					use |= ATOMMASK(atom);
611 			}
612 			else
613 				abort();
614 		}
615 		atom = atomdef(&s->s);
616 		if (atom >= 0) {
617 			if (!ATOMELEM(use, atom))
618 				killed |= ATOMMASK(atom);
619 			def |= ATOMMASK(atom);
620 		}
621 	}
622 	if (BPF_CLASS(b->s.code) == BPF_JMP) {
623 		/*
624 		 * XXX - what about RET?
625 		 */
626 		atom = atomuse(&b->s);
627 		if (atom >= 0) {
628 			if (atom == AX_ATOM) {
629 				if (!ATOMELEM(def, X_ATOM))
630 					use |= ATOMMASK(X_ATOM);
631 				if (!ATOMELEM(def, A_ATOM))
632 					use |= ATOMMASK(A_ATOM);
633 			}
634 			else if (atom < N_ATOMS) {
635 				if (!ATOMELEM(def, atom))
636 					use |= ATOMMASK(atom);
637 			}
638 			else
639 				abort();
640 		}
641 	}
642 
643 	b->def = def;
644 	b->kill = killed;
645 	b->in_use = use;
646 }
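/*
 * Illustrative example (not part of the original source): for a block
 * whose statements are
 *
 *	ld  M[3]	reads scratch slot 3, defines A
 *	st  M[4]	reads A, defines scratch slot 4
 *
 * and ignoring the block's final branch, the result is in_use = {3},
 * def = {A, 4} and kill = {A, 4}: slot 3 is read before the block
 * defines it, while A and slot 4 are written before being read, so they
 * kill whatever values the predecessors left in them.
 */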
647 
648 /*
649  * Assume graph is already leveled.
650  */
651 static void
652 find_ud(opt_state_t *opt_state, struct block *root)
653 {
654 	int i, maxlevel;
655 	struct block *p;
656 
657 	/*
658 	 * root->level is the highest level node found;
659 	 * count down from there.
660 	 */
661 	maxlevel = root->level;
662 	for (i = maxlevel; i >= 0; --i)
663 		for (p = opt_state->levels[i]; p; p = p->link) {
664 			compute_local_ud(p);
665 			p->out_use = 0;
666 		}
667 
668 	for (i = 1; i <= maxlevel; ++i) {
669 		for (p = opt_state->levels[i]; p; p = p->link) {
670 			p->out_use |= JT(p)->in_use | JF(p)->in_use;
671 			p->in_use |= p->out_use &~ p->kill;
672 		}
673 	}
674 }
675 static void
676 init_val(opt_state_t *opt_state)
677 {
678 	opt_state->curval = 0;
679 	opt_state->next_vnode = opt_state->vnode_base;
680 	memset((char *)opt_state->vmap, 0, opt_state->maxval * sizeof(*opt_state->vmap));
681 	memset((char *)opt_state->hashtbl, 0, sizeof opt_state->hashtbl);
682 }
683 
684 /* Because we really don't have an IR, this stuff is a little messy. */
685 static int
686 F(opt_state_t *opt_state, int code, int v0, int v1)
687 {
688 	u_int hash;
689 	int val;
690 	struct valnode *p;
691 
692 	hash = (u_int)code ^ ((u_int)v0 << 4) ^ ((u_int)v1 << 8);
693 	hash %= MODULUS;
694 
695 	for (p = opt_state->hashtbl[hash]; p; p = p->next)
696 		if (p->code == code && p->v0 == v0 && p->v1 == v1)
697 			return p->val;
698 
699 	val = ++opt_state->curval;
700 	if (BPF_MODE(code) == BPF_IMM &&
701 	    (BPF_CLASS(code) == BPF_LD || BPF_CLASS(code) == BPF_LDX)) {
702 		opt_state->vmap[val].const_val = v0;
703 		opt_state->vmap[val].is_const = 1;
704 	}
705 	p = opt_state->next_vnode++;
706 	p->val = val;
707 	p->code = code;
708 	p->v0 = v0;
709 	p->v1 = v1;
710 	p->next = opt_state->hashtbl[hash];
711 	opt_state->hashtbl[hash] = p;
712 
713 	return val;
714 }
715 
716 static inline void
717 vstore(struct stmt *s, int *valp, int newval, int alter)
718 {
719 	if (alter && newval != VAL_UNKNOWN && *valp == newval)
720 		s->code = NOP;
721 	else
722 		*valp = newval;
723 }
724 
725 /*
726  * Do constant-folding on binary operators.
727  * (Unary operators are handled elsewhere.)
728  */
729 static void
730 fold_op(opt_state_t *opt_state, struct stmt *s, int v0, int v1)
731 {
732 	bpf_u_int32 a, b;
733 
734 	a = opt_state->vmap[v0].const_val;
735 	b = opt_state->vmap[v1].const_val;
736 
737 	switch (BPF_OP(s->code)) {
738 	case BPF_ADD:
739 		a += b;
740 		break;
741 
742 	case BPF_SUB:
743 		a -= b;
744 		break;
745 
746 	case BPF_MUL:
747 		a *= b;
748 		break;
749 
750 	case BPF_DIV:
751 		if (b == 0)
752 			opt_error(opt_state, "division by zero");
753 		a /= b;
754 		break;
755 
756 	case BPF_MOD:
757 		if (b == 0)
758 			opt_error(opt_state, "modulus by zero");
759 		a %= b;
760 		break;
761 
762 	case BPF_AND:
763 		a &= b;
764 		break;
765 
766 	case BPF_OR:
767 		a |= b;
768 		break;
769 
770 	case BPF_XOR:
771 		a ^= b;
772 		break;
773 
774 	case BPF_LSH:
775 		/*
776 		 * A left shift of more than the width of the type
777 		 * is undefined in C; we'll just treat it as shifting
778 		 * all the bits out.
779 		 *
780 		 * XXX - the BPF interpreter doesn't check for this,
781 		 * so its behavior is dependent on the behavior of
782 		 * the processor on which it's running.  There are
783 		 * processors on which it shifts all the bits out
784 		 * and processors on which it does no shift.
785 		 */
786 		if (b < 32)
787 			a <<= b;
788 		else
789 			a = 0;
790 		break;
791 
792 	case BPF_RSH:
793 		/*
794 		 * A right shift of more than the width of the type
795 		 * is undefined in C; we'll just treat it as shifting
796 		 * all the bits out.
797 		 *
798 		 * XXX - the BPF interpreter doesn't check for this,
799 		 * so its behavior is dependent on the behavior of
800 		 * the processor on which it's running.  There are
801 		 * processors on which it shifts all the bits out
802 		 * and processors on which it does no shift.
803 		 */
804 		if (b < 32)
805 			a >>= b;
806 		else
807 			a = 0;
808 		break;
809 
810 	default:
811 		abort();
812 	}
813 	s->k = a;
814 	s->code = BPF_LD|BPF_IMM;
815 	opt_state->done = 0;
816 }
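/*
 * Illustrative example (not part of the original source): if the
 * accumulator's value number maps to the constant 10 and the statement
 * is "add #4", both operands are constant, so fold_op() rewrites the
 * statement in place as "ld #14" (s->code = BPF_LD|BPF_IMM, s->k = 14)
 * and clears opt_state->done to request another optimization pass.
 */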
817 
818 static inline struct slist *
819 this_op(struct slist *s)
820 {
821 	while (s != 0 && s->s.code == NOP)
822 		s = s->next;
823 	return s;
824 }
825 
826 static void
827 opt_not(struct block *b)
828 {
829 	struct block *tmp = JT(b);
830 
831 	JT(b) = JF(b);
832 	JF(b) = tmp;
833 }
834 
835 static void
836 opt_peep(opt_state_t *opt_state, struct block *b)
837 {
838 	struct slist *s;
839 	struct slist *next, *last;
840 	int val;
841 
842 	s = b->stmts;
843 	if (s == 0)
844 		return;
845 
846 	last = s;
847 	for (/*empty*/; /*empty*/; s = next) {
848 		/*
849 		 * Skip over nops.
850 		 */
851 		s = this_op(s);
852 		if (s == 0)
853 			break;	/* nothing left in the block */
854 
855 		/*
856 		 * Find the next real instruction after that one
857 		 * (skipping nops).
858 		 */
859 		next = this_op(s->next);
860 		if (next == 0)
861 			break;	/* no next instruction */
862 		last = next;
863 
864 		/*
865 		 * st  M[k]	-->	st  M[k]
866 		 * ldx M[k]		tax
867 		 */
868 		if (s->s.code == BPF_ST &&
869 		    next->s.code == (BPF_LDX|BPF_MEM) &&
870 		    s->s.k == next->s.k) {
871 			opt_state->done = 0;
872 			next->s.code = BPF_MISC|BPF_TAX;
873 		}
874 		/*
875 		 * ld  #k	-->	ldx  #k
876 		 * tax			txa
877 		 */
878 		if (s->s.code == (BPF_LD|BPF_IMM) &&
879 		    next->s.code == (BPF_MISC|BPF_TAX)) {
880 			s->s.code = BPF_LDX|BPF_IMM;
881 			next->s.code = BPF_MISC|BPF_TXA;
882 			opt_state->done = 0;
883 		}
884 		/*
885 		 * This is an ugly special case, but it happens
886 		 * when you say tcp[k] or udp[k] where k is a constant.
887 		 */
888 		if (s->s.code == (BPF_LD|BPF_IMM)) {
889 			struct slist *add, *tax, *ild;
890 
891 			/*
892 			 * Check that X isn't used on exit from this
893 			 * block (which the optimizer might cause).
894 			 * We know the code generator won't generate
895 			 * any local dependencies.
896 			 */
897 			if (ATOMELEM(b->out_use, X_ATOM))
898 				continue;
899 
900 			/*
901 			 * Check that the instruction following the ldi
902 			 * is an addx, or it's an ldxms with an addx
903 			 * following it (with 0 or more nops between the
904 			 * ldxms and addx).
905 			 */
906 			if (next->s.code != (BPF_LDX|BPF_MSH|BPF_B))
907 				add = next;
908 			else
909 				add = this_op(next->next);
910 			if (add == 0 || add->s.code != (BPF_ALU|BPF_ADD|BPF_X))
911 				continue;
912 
913 			/*
914 			 * Check that a tax follows that (with 0 or more
915 			 * nops between them).
916 			 */
917 			tax = this_op(add->next);
918 			if (tax == 0 || tax->s.code != (BPF_MISC|BPF_TAX))
919 				continue;
920 
921 			/*
922 			 * Check that an ild follows that (with 0 or more
923 			 * nops between them).
924 			 */
925 			ild = this_op(tax->next);
926 			if (ild == 0 || BPF_CLASS(ild->s.code) != BPF_LD ||
927 			    BPF_MODE(ild->s.code) != BPF_IND)
928 				continue;
929 			/*
930 			 * We want to turn this sequence:
931 			 *
932 			 * (004) ldi     #0x2		{s}
933 			 * (005) ldxms   [14]		{next}  -- optional
934 			 * (006) addx			{add}
935 			 * (007) tax			{tax}
936 			 * (008) ild     [x+0]		{ild}
937 			 *
938 			 * into this sequence:
939 			 *
940 			 * (004) nop
941 			 * (005) ldxms   [14]
942 			 * (006) nop
943 			 * (007) nop
944 			 * (008) ild     [x+2]
945 			 *
946 			 * XXX We need to check that X is not
947 			 * subsequently used, because we want to change
948 			 * what'll be in it after this sequence.
949 			 *
950 			 * We know we can eliminate the accumulator
951 			 * modifications earlier in the sequence since
952 			 * it is defined by the last stmt of this sequence
953 			 * (i.e., the last statement of the sequence loads
954 			 * a value into the accumulator, so we can eliminate
955 			 * earlier operations on the accumulator).
956 			 */
957 			ild->s.k += s->s.k;
958 			s->s.code = NOP;
959 			add->s.code = NOP;
960 			tax->s.code = NOP;
961 			opt_state->done = 0;
962 		}
963 	}
964 	/*
965 	 * If the comparison at the end of a block is an equality
966 	 * comparison against a constant, and nobody uses the value
967 	 * we leave in the A register at the end of a block, and
968 	 * the operation preceding the comparison is an arithmetic
969  * operation, we can sometimes optimize it away.
970 	 */
971 	if (b->s.code == (BPF_JMP|BPF_JEQ|BPF_K) &&
972 	    !ATOMELEM(b->out_use, A_ATOM)) {
973 	    	/*
974 	    	 * We can optimize away certain subtractions of the
975 	    	 * X register.
976 	    	 */
977 		if (last->s.code == (BPF_ALU|BPF_SUB|BPF_X)) {
978 			val = b->val[X_ATOM];
979 			if (opt_state->vmap[val].is_const) {
980 				/*
981 				 * If we have a subtract to do a comparison,
982 				 * and the X register is a known constant,
983 				 * we can merge this value into the
984 				 * comparison:
985 				 *
986 				 * sub x  ->	nop
987 				 * jeq #y	jeq #(x+y)
988 				 */
989 				b->s.k += opt_state->vmap[val].const_val;
990 				last->s.code = NOP;
991 				opt_state->done = 0;
992 			} else if (b->s.k == 0) {
993 				/*
994 				 * If the X register isn't a constant,
995 				 * and the comparison in the test is
996 				 * against 0, we can compare with the
997 				 * X register, instead:
998 				 *
999 				 * sub x  ->	nop
1000 				 * jeq #0	jeq x
1001 				 */
1002 				last->s.code = NOP;
1003 				b->s.code = BPF_JMP|BPF_JEQ|BPF_X;
1004 				opt_state->done = 0;
1005 			}
1006 		}
1007 		/*
1008 		 * Likewise, a constant subtract can be simplified:
1009 		 *
1010 		 * sub #x ->	nop
1011 		 * jeq #y ->	jeq #(x+y)
1012 		 */
1013 		else if (last->s.code == (BPF_ALU|BPF_SUB|BPF_K)) {
1014 			last->s.code = NOP;
1015 			b->s.k += last->s.k;
1016 			opt_state->done = 0;
1017 		}
1018 		/*
1019 		 * And, similarly, a constant AND can be simplified
1020 		 * if we're testing against 0, i.e.:
1021 		 *
1022 		 * and #k	nop
1023 		 * jeq #0  ->	jset #k
1024 		 */
1025 		else if (last->s.code == (BPF_ALU|BPF_AND|BPF_K) &&
1026 		    b->s.k == 0) {
1027 			b->s.k = last->s.k;
1028 			b->s.code = BPF_JMP|BPF_K|BPF_JSET;
1029 			last->s.code = NOP;
1030 			opt_state->done = 0;
1031 			opt_not(b);
1032 		}
1033 	}
1034 	/*
1035 	 * jset #0        ->   never
1036 	 * jset #ffffffff ->   always
1037 	 */
1038 	if (b->s.code == (BPF_JMP|BPF_K|BPF_JSET)) {
1039 		if (b->s.k == 0)
1040 			JT(b) = JF(b);
1041 		if ((u_int)b->s.k == 0xffffffffU)
1042 			JF(b) = JT(b);
1043 	}
1044 	/*
1045 	 * If we're comparing against the index register, and the index
1046 	 * register is a known constant, we can just compare against that
1047 	 * constant.
1048 	 */
1049 	val = b->val[X_ATOM];
1050 	if (opt_state->vmap[val].is_const && BPF_SRC(b->s.code) == BPF_X) {
1051 		bpf_int32 v = opt_state->vmap[val].const_val;
1052 		b->s.code &= ~BPF_X;
1053 		b->s.k = v;
1054 	}
1055 	/*
1056 	 * If the accumulator is a known constant, we can compute the
1057 	 * comparison result.
1058 	 */
1059 	val = b->val[A_ATOM];
1060 	if (opt_state->vmap[val].is_const && BPF_SRC(b->s.code) == BPF_K) {
1061 		bpf_int32 v = opt_state->vmap[val].const_val;
1062 		switch (BPF_OP(b->s.code)) {
1063 
1064 		case BPF_JEQ:
1065 			v = v == b->s.k;
1066 			break;
1067 
1068 		case BPF_JGT:
1069 			v = (unsigned)v > (unsigned)b->s.k;
1070 			break;
1071 
1072 		case BPF_JGE:
1073 			v = (unsigned)v >= (unsigned)b->s.k;
1074 			break;
1075 
1076 		case BPF_JSET:
1077 			v &= b->s.k;
1078 			break;
1079 
1080 		default:
1081 			abort();
1082 		}
1083 		if (JF(b) != JT(b))
1084 			opt_state->done = 0;
1085 		if (v)
1086 			JF(b) = JT(b);
1087 		else
1088 			JT(b) = JF(b);
1089 	}
1090 }
1091 
1092 /*
1093  * Compute the symbolic value of the expression in 's', and update
1094  * anything it defines in the value table 'val'.  If 'alter' is true,
1095  * do various optimizations.  This code would be cleaner if symbolic
1096  * evaluation and code transformations weren't folded together.
1097  */
1098 static void
1099 opt_stmt(opt_state_t *opt_state, struct stmt *s, int val[], int alter)
1100 {
1101 	int op;
1102 	int v;
1103 
1104 	switch (s->code) {
1105 
1106 	case BPF_LD|BPF_ABS|BPF_W:
1107 	case BPF_LD|BPF_ABS|BPF_H:
1108 	case BPF_LD|BPF_ABS|BPF_B:
1109 		v = F(opt_state, s->code, s->k, 0L);
1110 		vstore(s, &val[A_ATOM], v, alter);
1111 		break;
1112 
1113 	case BPF_LD|BPF_IND|BPF_W:
1114 	case BPF_LD|BPF_IND|BPF_H:
1115 	case BPF_LD|BPF_IND|BPF_B:
1116 		v = val[X_ATOM];
1117 		if (alter && opt_state->vmap[v].is_const) {
1118 			s->code = BPF_LD|BPF_ABS|BPF_SIZE(s->code);
1119 			s->k += opt_state->vmap[v].const_val;
1120 			v = F(opt_state, s->code, s->k, 0L);
1121 			opt_state->done = 0;
1122 		}
1123 		else
1124 			v = F(opt_state, s->code, s->k, v);
1125 		vstore(s, &val[A_ATOM], v, alter);
1126 		break;
1127 
1128 	case BPF_LD|BPF_LEN:
1129 		v = F(opt_state, s->code, 0L, 0L);
1130 		vstore(s, &val[A_ATOM], v, alter);
1131 		break;
1132 
1133 	case BPF_LD|BPF_IMM:
1134 		v = K(s->k);
1135 		vstore(s, &val[A_ATOM], v, alter);
1136 		break;
1137 
1138 	case BPF_LDX|BPF_IMM:
1139 		v = K(s->k);
1140 		vstore(s, &val[X_ATOM], v, alter);
1141 		break;
1142 
1143 	case BPF_LDX|BPF_MSH|BPF_B:
1144 		v = F(opt_state, s->code, s->k, 0L);
1145 		vstore(s, &val[X_ATOM], v, alter);
1146 		break;
1147 
1148 	case BPF_ALU|BPF_NEG:
1149 		if (alter && opt_state->vmap[val[A_ATOM]].is_const) {
1150 			s->code = BPF_LD|BPF_IMM;
1151 			/*
1152 			 * Do this negation as unsigned arithmetic; that's
1153 			 * what modern BPF engines do, and it guarantees
1154 			 * that all possible values can be negated.  (Yeah,
1155 			 * negating 0x80000000, the minimum signed 32-bit
1156 			 * two's-complement value, results in 0x80000000,
1157 			 * so it's still negative, but we *should* be doing
1158 			 * all unsigned arithmetic here, to match what
1159 			 * modern BPF engines do.)
1160 			 *
1161 			 * Express it as 0U - (unsigned value) so that we
1162 			 * don't get compiler warnings about negating an
1163 			 * unsigned value and don't get UBSan warnings
1164 			 * about the result of negating 0x80000000 being
1165 			 * undefined.
1166 			 */
1167 			s->k = 0U - (bpf_u_int32)(opt_state->vmap[val[A_ATOM]].const_val);
1168 			val[A_ATOM] = K(s->k);
1169 		}
1170 		else
1171 			val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], 0L);
1172 		break;
1173 
1174 	case BPF_ALU|BPF_ADD|BPF_K:
1175 	case BPF_ALU|BPF_SUB|BPF_K:
1176 	case BPF_ALU|BPF_MUL|BPF_K:
1177 	case BPF_ALU|BPF_DIV|BPF_K:
1178 	case BPF_ALU|BPF_MOD|BPF_K:
1179 	case BPF_ALU|BPF_AND|BPF_K:
1180 	case BPF_ALU|BPF_OR|BPF_K:
1181 	case BPF_ALU|BPF_XOR|BPF_K:
1182 	case BPF_ALU|BPF_LSH|BPF_K:
1183 	case BPF_ALU|BPF_RSH|BPF_K:
1184 		op = BPF_OP(s->code);
1185 		if (alter) {
1186 			if (s->k == 0) {
1187 				/*
1188 				 * Optimize operations where the constant
1189 				 * is zero.
1190 				 *
1191 				 * Don't optimize away "sub #0"
1192 				 * as it may be needed later to
1193 				 * fixup the generated math code.
1194 				 *
1195 				 * Fail if we're dividing by zero or taking
1196 				 * a modulus by zero.
1197 				 */
1198 				if (op == BPF_ADD ||
1199 				    op == BPF_LSH || op == BPF_RSH ||
1200 				    op == BPF_OR || op == BPF_XOR) {
1201 					s->code = NOP;
1202 					break;
1203 				}
1204 				if (op == BPF_MUL || op == BPF_AND) {
1205 					s->code = BPF_LD|BPF_IMM;
1206 					val[A_ATOM] = K(s->k);
1207 					break;
1208 				}
1209 				if (op == BPF_DIV)
1210 					opt_error(opt_state,
1211 					    "division by zero");
1212 				if (op == BPF_MOD)
1213 					opt_error(opt_state,
1214 					    "modulus by zero");
1215 			}
1216 			if (opt_state->vmap[val[A_ATOM]].is_const) {
1217 				fold_op(opt_state, s, val[A_ATOM], K(s->k));
1218 				val[A_ATOM] = K(s->k);
1219 				break;
1220 			}
1221 		}
1222 		val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], K(s->k));
1223 		break;
1224 
1225 	case BPF_ALU|BPF_ADD|BPF_X:
1226 	case BPF_ALU|BPF_SUB|BPF_X:
1227 	case BPF_ALU|BPF_MUL|BPF_X:
1228 	case BPF_ALU|BPF_DIV|BPF_X:
1229 	case BPF_ALU|BPF_MOD|BPF_X:
1230 	case BPF_ALU|BPF_AND|BPF_X:
1231 	case BPF_ALU|BPF_OR|BPF_X:
1232 	case BPF_ALU|BPF_XOR|BPF_X:
1233 	case BPF_ALU|BPF_LSH|BPF_X:
1234 	case BPF_ALU|BPF_RSH|BPF_X:
1235 		op = BPF_OP(s->code);
1236 		if (alter && opt_state->vmap[val[X_ATOM]].is_const) {
1237 			if (opt_state->vmap[val[A_ATOM]].is_const) {
1238 				fold_op(opt_state, s, val[A_ATOM], val[X_ATOM]);
1239 				val[A_ATOM] = K(s->k);
1240 			}
1241 			else {
1242 				s->code = BPF_ALU|BPF_K|op;
1243 				s->k = opt_state->vmap[val[X_ATOM]].const_val;
1244 				/*
1245 				 * XXX - we need to make up our minds
1246 				 * as to what integers are signed and
1247 				 * what integers are unsigned in BPF
1248 				 * programs and in our IR.
1249 				 */
1250 				if ((op == BPF_LSH || op == BPF_RSH) &&
1251 				    (s->k < 0 || s->k > 31))
1252 					opt_error(opt_state,
1253 					    "shift by more than 31 bits");
1254 				opt_state->done = 0;
1255 				val[A_ATOM] =
1256 					F(opt_state, s->code, val[A_ATOM], K(s->k));
1257 			}
1258 			break;
1259 		}
1260 		/*
1261 		 * Check if we're doing something to an accumulator
1262 		 * that is 0, and simplify.  This may not seem like
1263 		 * much of a simplification but it could open up further
1264 		 * optimizations.
1265 		 * XXX We could also check for mul by 1, etc.
1266 		 */
1267 		if (alter && opt_state->vmap[val[A_ATOM]].is_const
1268 		    && opt_state->vmap[val[A_ATOM]].const_val == 0) {
1269 			if (op == BPF_ADD || op == BPF_OR || op == BPF_XOR) {
1270 				s->code = BPF_MISC|BPF_TXA;
1271 				vstore(s, &val[A_ATOM], val[X_ATOM], alter);
1272 				break;
1273 			}
1274 			else if (op == BPF_MUL || op == BPF_DIV || op == BPF_MOD ||
1275 				 op == BPF_AND || op == BPF_LSH || op == BPF_RSH) {
1276 				s->code = BPF_LD|BPF_IMM;
1277 				s->k = 0;
1278 				vstore(s, &val[A_ATOM], K(s->k), alter);
1279 				break;
1280 			}
1281 			else if (op == BPF_NEG) {
1282 				s->code = NOP;
1283 				break;
1284 			}
1285 		}
1286 		val[A_ATOM] = F(opt_state, s->code, val[A_ATOM], val[X_ATOM]);
1287 		break;
1288 
1289 	case BPF_MISC|BPF_TXA:
1290 		vstore(s, &val[A_ATOM], val[X_ATOM], alter);
1291 		break;
1292 
1293 	case BPF_LD|BPF_MEM:
1294 		v = val[s->k];
1295 		if (alter && opt_state->vmap[v].is_const) {
1296 			s->code = BPF_LD|BPF_IMM;
1297 			s->k = opt_state->vmap[v].const_val;
1298 			opt_state->done = 0;
1299 		}
1300 		vstore(s, &val[A_ATOM], v, alter);
1301 		break;
1302 
1303 	case BPF_MISC|BPF_TAX:
1304 		vstore(s, &val[X_ATOM], val[A_ATOM], alter);
1305 		break;
1306 
1307 	case BPF_LDX|BPF_MEM:
1308 		v = val[s->k];
1309 		if (alter && opt_state->vmap[v].is_const) {
1310 			s->code = BPF_LDX|BPF_IMM;
1311 			s->k = opt_state->vmap[v].const_val;
1312 			opt_state->done = 0;
1313 		}
1314 		vstore(s, &val[X_ATOM], v, alter);
1315 		break;
1316 
1317 	case BPF_ST:
1318 		vstore(s, &val[s->k], val[A_ATOM], alter);
1319 		break;
1320 
1321 	case BPF_STX:
1322 		vstore(s, &val[s->k], val[X_ATOM], alter);
1323 		break;
1324 	}
1325 }
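/*
 * Illustrative example (not part of the original source): one concrete
 * effect of the BPF_IND cases above is that, with 'alter' set, the pair
 *
 *	ldx #2
 *	ld  [x + 14]
 *
 * leaves the load rewritten as "ld [16]": the known-constant index
 * register is folded into the offset and the load switches from BPF_IND
 * to BPF_ABS addressing.
 */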
1326 
1327 static void
1328 deadstmt(opt_state_t *opt_state, register struct stmt *s, register struct stmt *last[])
1329 {
1330 	register int atom;
1331 
1332 	atom = atomuse(s);
1333 	if (atom >= 0) {
1334 		if (atom == AX_ATOM) {
1335 			last[X_ATOM] = 0;
1336 			last[A_ATOM] = 0;
1337 		}
1338 		else
1339 			last[atom] = 0;
1340 	}
1341 	atom = atomdef(s);
1342 	if (atom >= 0) {
1343 		if (last[atom]) {
1344 			opt_state->done = 0;
1345 			last[atom]->code = NOP;
1346 		}
1347 		last[atom] = s;
1348 	}
1349 }
1350 
1351 static void
1352 opt_deadstores(opt_state_t *opt_state, register struct block *b)
1353 {
1354 	register struct slist *s;
1355 	register int atom;
1356 	struct stmt *last[N_ATOMS];
1357 
1358 	memset((char *)last, 0, sizeof last);
1359 
1360 	for (s = b->stmts; s != 0; s = s->next)
1361 		deadstmt(opt_state, &s->s, last);
1362 	deadstmt(opt_state, &b->s, last);
1363 
1364 	for (atom = 0; atom < N_ATOMS; ++atom)
1365 		if (last[atom] && !ATOMELEM(b->out_use, atom)) {
1366 			last[atom]->code = NOP;
1367 			opt_state->done = 0;
1368 		}
1369 }
1370 
1371 static void
1372 opt_blk(opt_state_t *opt_state, struct block *b, int do_stmts)
1373 {
1374 	struct slist *s;
1375 	struct edge *p;
1376 	int i;
1377 	bpf_int32 aval, xval;
1378 
1379 #if 0
1380 	for (s = b->stmts; s && s->next; s = s->next)
1381 		if (BPF_CLASS(s->s.code) == BPF_JMP) {
1382 			do_stmts = 0;
1383 			break;
1384 		}
1385 #endif
1386 
1387 	/*
1388 	 * Initialize the atom values.
1389 	 */
1390 	p = b->in_edges;
1391 	if (p == 0) {
1392 		/*
1393 		 * We have no predecessors, so everything is undefined
1394 		 * upon entry to this block.
1395 		 */
1396 		memset((char *)b->val, 0, sizeof(b->val));
1397 	} else {
1398 		/*
1399 		 * Inherit values from our predecessors.
1400 		 *
1401 		 * First, get the values from the predecessor along the
1402 		 * first edge leading to this node.
1403 		 */
1404 		memcpy((char *)b->val, (char *)p->pred->val, sizeof(b->val));
1405 		/*
1406 		 * Now look at all the other nodes leading to this node.
1407 		 * If, for the predecessor along that edge, a register
1408 		 * has a different value from the one we have (i.e.,
1409 		 * control paths are merging, and the merging paths
1410 		 * assign different values to that register), give the
1411 		 * register the undefined value of 0.
1412 		 */
1413 		while ((p = p->next) != NULL) {
1414 			for (i = 0; i < N_ATOMS; ++i)
1415 				if (b->val[i] != p->pred->val[i])
1416 					b->val[i] = 0;
1417 		}
1418 	}
1419 	aval = b->val[A_ATOM];
1420 	xval = b->val[X_ATOM];
1421 	for (s = b->stmts; s; s = s->next)
1422 		opt_stmt(opt_state, &s->s, b->val, do_stmts);
1423 
1424 	/*
1425 	 * This is a special case: if we don't use anything from this
1426 	 * block, and we load the accumulator or index register with a
1427 	 * value that is already there, or if this block is a return,
1428 	 * eliminate all the statements.
1429 	 *
1430 	 * XXX - what if it does a store?
1431 	 *
1432 	 * XXX - why does it matter whether we use anything from this
1433 	 * block?  If the accumulator or index register doesn't change
1434 	 * its value, isn't that OK even if we use that value?
1435 	 *
1436 	 * XXX - if we load the accumulator with a different value,
1437 	 * and the block ends with a conditional branch, we obviously
1438 	 * can't eliminate it, as the branch depends on that value.
1439 	 * For the index register, the conditional branch only depends
1440 	 * on the index register value if the test is against the index
1441 	 * register value rather than a constant; if nothing uses the
1442 	 * value we put into the index register, and we're not testing
1443 	 * against the index register's value, and there aren't any
1444 	 * other problems that would keep us from eliminating this
1445 	 * block, can we eliminate it?
1446 	 */
1447 	if (do_stmts &&
1448 	    ((b->out_use == 0 &&
1449 	      aval != VAL_UNKNOWN && b->val[A_ATOM] == aval &&
1450 	      xval != VAL_UNKNOWN && b->val[X_ATOM] == xval) ||
1451 	     BPF_CLASS(b->s.code) == BPF_RET)) {
1452 		if (b->stmts != 0) {
1453 			b->stmts = 0;
1454 			opt_state->done = 0;
1455 		}
1456 	} else {
1457 		opt_peep(opt_state, b);
1458 		opt_deadstores(opt_state, b);
1459 	}
1460 	/*
1461 	 * Set up values for branch optimizer.
1462 	 */
1463 	if (BPF_SRC(b->s.code) == BPF_K)
1464 		b->oval = K(b->s.k);
1465 	else
1466 		b->oval = b->val[X_ATOM];
1467 	b->et.code = b->s.code;
1468 	b->ef.code = -b->s.code;
1469 }
1470 
1471 /*
1472  * Return true if any register that is used on exit from 'succ' has
1473  * an exit value that is different from the corresponding exit value
1474  * from 'b'.
1475  */
1476 static int
1477 use_conflict(struct block *b, struct block *succ)
1478 {
1479 	int atom;
1480 	atomset use = succ->out_use;
1481 
1482 	if (use == 0)
1483 		return 0;
1484 
1485 	for (atom = 0; atom < N_ATOMS; ++atom)
1486 		if (ATOMELEM(use, atom))
1487 			if (b->val[atom] != succ->val[atom])
1488 				return 1;
1489 	return 0;
1490 }
1491 
1492 static struct block *
1493 fold_edge(struct block *child, struct edge *ep)
1494 {
1495 	int sense;
1496 	int aval0, aval1, oval0, oval1;
1497 	int code = ep->code;
1498 
1499 	if (code < 0) {
1500 		code = -code;
1501 		sense = 0;
1502 	} else
1503 		sense = 1;
1504 
1505 	if (child->s.code != code)
1506 		return 0;
1507 
1508 	aval0 = child->val[A_ATOM];
1509 	oval0 = child->oval;
1510 	aval1 = ep->pred->val[A_ATOM];
1511 	oval1 = ep->pred->oval;
1512 
1513 	if (aval0 != aval1)
1514 		return 0;
1515 
1516 	if (oval0 == oval1)
1517 		/*
1518 		 * The operands of the branch instructions are
1519 		 * identical, so the result is true if a true
1520 		 * branch was taken to get here, otherwise false.
1521 		 */
1522 		return sense ? JT(child) : JF(child);
1523 
1524 	if (sense && code == (BPF_JMP|BPF_JEQ|BPF_K))
1525 		/*
1526 		 * At this point, we only know the comparison if we
1527 		 * came down the true branch, and it was an equality
1528 		 * comparison with a constant.
1529 		 *
1530 		 * I.e., if we came down the true branch, and the branch
1531 		 * was an equality comparison with a constant, we know the
1532 		 * accumulator contains that constant.  If we came down
1533 		 * the false branch, or the comparison wasn't with a
1534 		 * constant, we don't know what was in the accumulator.
1535 		 *
1536 		 * We rely on the fact that distinct constants have distinct
1537 		 * value numbers.
1538 		 */
1539 		return JF(child);
1540 
1541 	return 0;
1542 }
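/*
 * Illustrative example (not part of the original source): suppose the
 * edge being optimized was reached by taking the true branch of
 * "jeq #0x800" and the candidate block performs the same "jeq #0x800"
 * test on the same accumulator value number.  The operands match, so
 * fold_edge() returns that block's true branch and opt_j() can route
 * the edge straight there.  If the candidate instead tests
 * "jeq #0x86dd" against the same accumulator value, the accumulator is
 * known to hold a different constant, so the false branch is returned.
 */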
1543 
1544 static void
1545 opt_j(opt_state_t *opt_state, struct edge *ep)
1546 {
1547 	register int i, k;
1548 	register struct block *target;
1549 
1550 	if (JT(ep->succ) == 0)
1551 		return;
1552 
1553 	if (JT(ep->succ) == JF(ep->succ)) {
1554 		/*
1555 		 * Common branch targets can be eliminated, provided
1556 		 * there is no data dependency.
1557 		 */
1558 		if (!use_conflict(ep->pred, ep->succ->et.succ)) {
1559 			opt_state->done = 0;
1560 			ep->succ = JT(ep->succ);
1561 		}
1562 	}
1563 	/*
1564 	 * For each edge dominator that matches the successor of this
1565 	 * edge, promote the edge successor to its grandchild.
1566 	 *
1567 	 * XXX We violate the set abstraction here in favor of a reasonably
1568 	 * efficient loop.
1569 	 */
1570  top:
1571 	for (i = 0; i < opt_state->edgewords; ++i) {
1572 		register bpf_u_int32 x = ep->edom[i];
1573 
1574 		while (x != 0) {
1575 			k = lowest_set_bit(x);
1576 			x &=~ ((bpf_u_int32)1 << k);
1577 			k += i * BITS_PER_WORD;
1578 
1579 			target = fold_edge(ep->succ, opt_state->edges[k]);
1580 			/*
1581 			 * Check that there is no data dependency between
1582 			 * nodes that will be violated if we move the edge.
1583 			 */
1584 			if (target != 0 && !use_conflict(ep->pred, target)) {
1585 				opt_state->done = 0;
1586 				ep->succ = target;
1587 				if (JT(target) != 0)
1588 					/*
1589 					 * Start over unless we hit a leaf.
1590 					 */
1591 					goto top;
1592 				return;
1593 			}
1594 		}
1595 	}
1596 }
1597 
1598 
1599 static void
1600 or_pullup(opt_state_t *opt_state, struct block *b)
1601 {
1602 	int val, at_top;
1603 	struct block *pull;
1604 	struct block **diffp, **samep;
1605 	struct edge *ep;
1606 
1607 	ep = b->in_edges;
1608 	if (ep == 0)
1609 		return;
1610 
1611 	/*
1612 	 * Make sure each predecessor loads the same value.
1613 	 * XXX why?
1614 	 */
1615 	val = ep->pred->val[A_ATOM];
1616 	for (ep = ep->next; ep != 0; ep = ep->next)
1617 		if (val != ep->pred->val[A_ATOM])
1618 			return;
1619 
1620 	if (JT(b->in_edges->pred) == b)
1621 		diffp = &JT(b->in_edges->pred);
1622 	else
1623 		diffp = &JF(b->in_edges->pred);
1624 
1625 	at_top = 1;
1626 	for (;;) {
1627 		if (*diffp == 0)
1628 			return;
1629 
1630 		if (JT(*diffp) != JT(b))
1631 			return;
1632 
1633 		if (!SET_MEMBER((*diffp)->dom, b->id))
1634 			return;
1635 
1636 		if ((*diffp)->val[A_ATOM] != val)
1637 			break;
1638 
1639 		diffp = &JF(*diffp);
1640 		at_top = 0;
1641 	}
1642 	samep = &JF(*diffp);
1643 	for (;;) {
1644 		if (*samep == 0)
1645 			return;
1646 
1647 		if (JT(*samep) != JT(b))
1648 			return;
1649 
1650 		if (!SET_MEMBER((*samep)->dom, b->id))
1651 			return;
1652 
1653 		if ((*samep)->val[A_ATOM] == val)
1654 			break;
1655 
1656 		/* XXX Need to check that there are no data dependencies
1657 		   between diffp and samep.  Currently, the code generator
1658 		   will not produce such dependencies. */
1659 		samep = &JF(*samep);
1660 	}
1661 #ifdef notdef
1662 	/* XXX This doesn't cover everything. */
1663 	for (i = 0; i < N_ATOMS; ++i)
1664 		if ((*samep)->val[i] != pred->val[i])
1665 			return;
1666 #endif
1667 	/* Pull up the node. */
1668 	pull = *samep;
1669 	*samep = JF(pull);
1670 	JF(pull) = *diffp;
1671 
1672 	/*
1673 	 * At the top of the chain, each predecessor needs to point at the
1674 	 * pulled up node.  Inside the chain, there is only one predecessor
1675 	 * to worry about.
1676 	 */
1677 	if (at_top) {
1678 		for (ep = b->in_edges; ep != 0; ep = ep->next) {
1679 			if (JT(ep->pred) == b)
1680 				JT(ep->pred) = pull;
1681 			else
1682 				JF(ep->pred) = pull;
1683 		}
1684 	}
1685 	else
1686 		*diffp = pull;
1687 
1688 	opt_state->done = 0;
1689 }
1690 
1691 static void
1692 and_pullup(opt_state_t *opt_state, struct block *b)
1693 {
1694 	int val, at_top;
1695 	struct block *pull;
1696 	struct block **diffp, **samep;
1697 	struct edge *ep;
1698 
1699 	ep = b->in_edges;
1700 	if (ep == 0)
1701 		return;
1702 
1703 	/*
1704 	 * Make sure each predecessor loads the same value.
1705 	 */
1706 	val = ep->pred->val[A_ATOM];
1707 	for (ep = ep->next; ep != 0; ep = ep->next)
1708 		if (val != ep->pred->val[A_ATOM])
1709 			return;
1710 
1711 	if (JT(b->in_edges->pred) == b)
1712 		diffp = &JT(b->in_edges->pred);
1713 	else
1714 		diffp = &JF(b->in_edges->pred);
1715 
1716 	at_top = 1;
1717 	for (;;) {
1718 		if (*diffp == 0)
1719 			return;
1720 
1721 		if (JF(*diffp) != JF(b))
1722 			return;
1723 
1724 		if (!SET_MEMBER((*diffp)->dom, b->id))
1725 			return;
1726 
1727 		if ((*diffp)->val[A_ATOM] != val)
1728 			break;
1729 
1730 		diffp = &JT(*diffp);
1731 		at_top = 0;
1732 	}
1733 	samep = &JT(*diffp);
1734 	for (;;) {
1735 		if (*samep == 0)
1736 			return;
1737 
1738 		if (JF(*samep) != JF(b))
1739 			return;
1740 
1741 		if (!SET_MEMBER((*samep)->dom, b->id))
1742 			return;
1743 
1744 		if ((*samep)->val[A_ATOM] == val)
1745 			break;
1746 
1747 		/* XXX Need to check that there are no data dependencies
1748 		   between diffp and samep.  Currently, the code generator
1749 		   will not produce such dependencies. */
1750 		samep = &JT(*samep);
1751 	}
1752 #ifdef notdef
1753 	/* XXX This doesn't cover everything. */
1754 	for (i = 0; i < N_ATOMS; ++i)
1755 		if ((*samep)->val[i] != pred->val[i])
1756 			return;
1757 #endif
1758 	/* Pull up the node. */
1759 	pull = *samep;
1760 	*samep = JT(pull);
1761 	JT(pull) = *diffp;
1762 
1763 	/*
1764 	 * At the top of the chain, each predecessor needs to point at the
1765 	 * pulled up node.  Inside the chain, there is only one predecessor
1766 	 * to worry about.
1767 	 */
1768 	if (at_top) {
1769 		for (ep = b->in_edges; ep != 0; ep = ep->next) {
1770 			if (JT(ep->pred) == b)
1771 				JT(ep->pred) = pull;
1772 			else
1773 				JF(ep->pred) = pull;
1774 		}
1775 	}
1776 	else
1777 		*diffp = pull;
1778 
1779 	opt_state->done = 0;
1780 }
1781 
1782 static void
1783 opt_blks(opt_state_t *opt_state, struct icode *ic, int do_stmts)
1784 {
1785 	int i, maxlevel;
1786 	struct block *p;
1787 
1788 	init_val(opt_state);
1789 	maxlevel = ic->root->level;
1790 
1791 	find_inedges(opt_state, ic->root);
1792 	for (i = maxlevel; i >= 0; --i)
1793 		for (p = opt_state->levels[i]; p; p = p->link)
1794 			opt_blk(opt_state, p, do_stmts);
1795 
1796 	if (do_stmts)
1797 		/*
1798 		 * No point trying to move branches; it can't possibly
1799 		 * make a difference at this point.
1800 		 */
1801 		return;
1802 
1803 	for (i = 1; i <= maxlevel; ++i) {
1804 		for (p = opt_state->levels[i]; p; p = p->link) {
1805 			opt_j(opt_state, &p->et);
1806 			opt_j(opt_state, &p->ef);
1807 		}
1808 	}
1809 
1810 	find_inedges(opt_state, ic->root);
1811 	for (i = 1; i <= maxlevel; ++i) {
1812 		for (p = opt_state->levels[i]; p; p = p->link) {
1813 			or_pullup(opt_state, p);
1814 			and_pullup(opt_state, p);
1815 		}
1816 	}
1817 }
1818 
1819 static inline void
1820 link_inedge(struct edge *parent, struct block *child)
1821 {
1822 	parent->next = child->in_edges;
1823 	child->in_edges = parent;
1824 }
1825 
1826 static void
1827 find_inedges(opt_state_t *opt_state, struct block *root)
1828 {
1829 	int i;
1830 	struct block *b;
1831 
1832 	for (i = 0; i < opt_state->n_blocks; ++i)
1833 		opt_state->blocks[i]->in_edges = 0;
1834 
1835 	/*
1836 	 * Traverse the graph, adding each edge to the predecessor
1837 	 * list of its successors.  Skip the leaves (i.e. level 0).
1838 	 */
1839 	for (i = root->level; i > 0; --i) {
1840 		for (b = opt_state->levels[i]; b != 0; b = b->link) {
1841 			link_inedge(&b->et, JT(b));
1842 			link_inedge(&b->ef, JF(b));
1843 		}
1844 	}
1845 }
1846 
1847 static void
1848 opt_root(struct block **b)
1849 {
1850 	struct slist *tmp, *s;
1851 
1852 	s = (*b)->stmts;
1853 	(*b)->stmts = 0;
1854 	while (BPF_CLASS((*b)->s.code) == BPF_JMP && JT(*b) == JF(*b))
1855 		*b = JT(*b);
1856 
1857 	tmp = (*b)->stmts;
1858 	if (tmp != 0)
1859 		sappend(s, tmp);
1860 	(*b)->stmts = s;
1861 
1862 	/*
1863 	 * If the root node is a return, then there is no
1864 	 * point executing any statements (since the bpf machine
1865 	 * has no side effects).
1866 	 */
1867 	if (BPF_CLASS((*b)->s.code) == BPF_RET)
1868 		(*b)->stmts = 0;
1869 }
1870 
1871 static void
1872 opt_loop(opt_state_t *opt_state, struct icode *ic, int do_stmts)
1873 {
1874 
1875 #ifdef BDEBUG
1876 	if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
1877 		printf("opt_loop(root, %d) begin\n", do_stmts);
1878 		opt_dump(opt_state, ic);
1879 	}
1880 #endif
1881 	do {
1882 		opt_state->done = 1;
1883 		find_levels(opt_state, ic);
1884 		find_dom(opt_state, ic->root);
1885 		find_closure(opt_state, ic->root);
1886 		find_ud(opt_state, ic->root);
1887 		find_edom(opt_state, ic->root);
1888 		opt_blks(opt_state, ic, do_stmts);
1889 #ifdef BDEBUG
1890 		if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
1891 			printf("opt_loop(root, %d) bottom, done=%d\n", do_stmts, opt_state->done);
1892 			opt_dump(opt_state, ic);
1893 		}
1894 #endif
1895 	} while (!opt_state->done);
1896 }
1897 
1898 /*
1899  * Optimize the filter code in its dag representation.
1900  * Return 0 on success, -1 on error.
1901  */
1902 int
1903 bpf_optimize(struct icode *ic, char *errbuf)
1904 {
1905 	opt_state_t opt_state;
1906 
1907 	memset(&opt_state, 0, sizeof(opt_state));
1908 	opt_state.errbuf = errbuf;
1909 	if (setjmp(opt_state.top_ctx)) {
1910 		opt_cleanup(&opt_state);
1911 		return -1;
1912 	}
1913 	opt_init(&opt_state, ic);
1914 	opt_loop(&opt_state, ic, 0);
1915 	opt_loop(&opt_state, ic, 1);
1916 	intern_blocks(&opt_state, ic);
1917 #ifdef BDEBUG
1918 	if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
1919 		printf("after intern_blocks()\n");
1920 		opt_dump(&opt_state, ic);
1921 	}
1922 #endif
1923 	opt_root(&ic->root);
1924 #ifdef BDEBUG
1925 	if (pcap_optimizer_debug > 1 || pcap_print_dot_graph) {
1926 		printf("after opt_root()\n");
1927 		opt_dump(&opt_state, ic);
1928 	}
1929 #endif
1930 	opt_cleanup(&opt_state);
1931 	return 0;
1932 }
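/*
 * Illustrative sketch only (not part of the original source): this
 * optimizer is not normally called directly.  It runs during filter
 * compilation when the caller requests optimization, roughly as in the
 * following use of the public libpcap API.
 */
#if 0
#include <pcap/pcap.h>

static int
compile_optimized_filter(void)
{
	pcap_t *p;
	struct bpf_program prog;

	p = pcap_open_dead(DLT_EN10MB, 65535);
	if (p == NULL)
		return -1;
	/* The fourth argument (1) requests optimization, which ends up
	   running bpf_optimize() on the generated intermediate code. */
	if (pcap_compile(p, &prog, "tcp port 80", 1,
	    PCAP_NETMASK_UNKNOWN) == -1) {
		pcap_close(p);
		return -1;
	}
	pcap_freecode(&prog);
	pcap_close(p);
	return 0;
}
#endif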
1933 
1934 static void
1935 make_marks(struct icode *ic, struct block *p)
1936 {
1937 	if (!isMarked(ic, p)) {
1938 		Mark(ic, p);
1939 		if (BPF_CLASS(p->s.code) != BPF_RET) {
1940 			make_marks(ic, JT(p));
1941 			make_marks(ic, JF(p));
1942 		}
1943 	}
1944 }
1945 
1946 /*
1947  * Mark code array such that isMarked(ic->cur_mark, i) is true
1948  * only for nodes that are alive.
1949  */
1950 static void
1951 mark_code(struct icode *ic)
1952 {
1953 	ic->cur_mark += 1;
1954 	make_marks(ic, ic->root);
1955 }
1956 
1957 /*
1958  * True iff the two stmt lists load the same value from the packet into
1959  * the accumulator.
1960  */
1961 static int
1962 eq_slist(struct slist *x, struct slist *y)
1963 {
1964 	for (;;) {
1965 		while (x && x->s.code == NOP)
1966 			x = x->next;
1967 		while (y && y->s.code == NOP)
1968 			y = y->next;
1969 		if (x == 0)
1970 			return y == 0;
1971 		if (y == 0)
1972 			return x == 0;
1973 		if (x->s.code != y->s.code || x->s.k != y->s.k)
1974 			return 0;
1975 		x = x->next;
1976 		y = y->next;
1977 	}
1978 }
1979 
1980 static inline int
1981 eq_blk(struct block *b0, struct block *b1)
1982 {
1983 	if (b0->s.code == b1->s.code &&
1984 	    b0->s.k == b1->s.k &&
1985 	    b0->et.succ == b1->et.succ &&
1986 	    b0->ef.succ == b1->ef.succ)
1987 		return eq_slist(b0->stmts, b1->stmts);
1988 	return 0;
1989 }
1990 
1991 static void
1992 intern_blocks(opt_state_t *opt_state, struct icode *ic)
1993 {
1994 	struct block *p;
1995 	int i, j;
1996 	int done1; /* don't shadow global */
1997  top:
1998 	done1 = 1;
1999 	for (i = 0; i < opt_state->n_blocks; ++i)
2000 		opt_state->blocks[i]->link = 0;
2001 
2002 	mark_code(ic);
2003 
2004 	for (i = opt_state->n_blocks - 1; --i >= 0; ) {
2005 		if (!isMarked(ic, opt_state->blocks[i]))
2006 			continue;
2007 		for (j = i + 1; j < opt_state->n_blocks; ++j) {
2008 			if (!isMarked(ic, opt_state->blocks[j]))
2009 				continue;
2010 			if (eq_blk(opt_state->blocks[i], opt_state->blocks[j])) {
2011 				opt_state->blocks[i]->link = opt_state->blocks[j]->link ?
2012 					opt_state->blocks[j]->link : opt_state->blocks[j];
2013 				break;
2014 			}
2015 		}
2016 	}
2017 	for (i = 0; i < opt_state->n_blocks; ++i) {
2018 		p = opt_state->blocks[i];
2019 		if (JT(p) == 0)
2020 			continue;
2021 		if (JT(p)->link) {
2022 			done1 = 0;
2023 			JT(p) = JT(p)->link;
2024 		}
2025 		if (JF(p)->link) {
2026 			done1 = 0;
2027 			JF(p) = JF(p)->link;
2028 		}
2029 	}
2030 	if (!done1)
2031 		goto top;
2032 }
2033 
2034 static void
2035 opt_cleanup(opt_state_t *opt_state)
2036 {
2037 	free((void *)opt_state->vnode_base);
2038 	free((void *)opt_state->vmap);
2039 	free((void *)opt_state->edges);
2040 	free((void *)opt_state->space);
2041 	free((void *)opt_state->levels);
2042 	free((void *)opt_state->blocks);
2043 }
2044 
2045 /*
2046  * For optimizer errors.
2047  */
2048 static void PCAP_NORETURN
2049 opt_error(opt_state_t *opt_state, const char *fmt, ...)
2050 {
2051 	va_list ap;
2052 
2053 	if (opt_state->errbuf != NULL) {
2054 		va_start(ap, fmt);
2055 		(void)pcap_vsnprintf(opt_state->errbuf,
2056 		    PCAP_ERRBUF_SIZE, fmt, ap);
2057 		va_end(ap);
2058 	}
2059 	longjmp(opt_state->top_ctx, 1);
2060 	/* NOTREACHED */
2061 }
2062 
2063 /*
2064  * Return the number of non-NOP stmts in 's'.
2065  */
2066 static u_int
2067 slength(struct slist *s)
2068 {
2069 	u_int n = 0;
2070 
2071 	for (; s; s = s->next)
2072 		if (s->s.code != NOP)
2073 			++n;
2074 	return n;
2075 }
2076 
2077 /*
2078  * Return the number of nodes reachable from 'p'.
2079  * All nodes should be initially unmarked.
2080  */
2081 static int
2082 count_blocks(struct icode *ic, struct block *p)
2083 {
2084 	if (p == 0 || isMarked(ic, p))
2085 		return 0;
2086 	Mark(ic, p);
2087 	return count_blocks(ic, JT(p)) + count_blocks(ic, JF(p)) + 1;
2088 }
2089 
2090 /*
2091  * Do a depth first search on the flow graph, numbering the
2092  * the basic blocks, and entering them into the 'blocks' array.`
2093  */
2094 static void
2095 number_blks_r(opt_state_t *opt_state, struct icode *ic, struct block *p)
2096 {
2097 	int n;
2098 
2099 	if (p == 0 || isMarked(ic, p))
2100 		return;
2101 
2102 	Mark(ic, p);
2103 	n = opt_state->n_blocks++;
2104 	p->id = n;
2105 	opt_state->blocks[n] = p;
2106 
2107 	number_blks_r(opt_state, ic, JT(p));
2108 	number_blks_r(opt_state, ic, JF(p));
2109 }
2110 
2111 /*
2112  * Return the number of stmts in the flowgraph reachable from 'p'.
2113  * The nodes should be unmarked before calling.
2114  *
2115  * Note that "stmts" means "instructions", and that this includes
2116  *
2117  *	side-effect statements in 'p' (slength(p->stmts));
2118  *
2119  *	statements in the true branch from 'p' (count_stmts(JT(p)));
2120  *
2121  *	statements in the false branch from 'p' (count_stmts(JF(p)));
2122  *
2123  *	the conditional jump itself (1);
2124  *
2125  *	an extra long jump if the true branch requires it (p->longjt);
2126  *
2127  *	an extra long jump if the false branch requires it (p->longjf).
2128  */
2129 static u_int
2130 count_stmts(struct icode *ic, struct block *p)
2131 {
2132 	u_int n;
2133 
2134 	if (p == 0 || isMarked(ic, p))
2135 		return 0;
2136 	Mark(ic, p);
2137 	n = count_stmts(ic, JT(p)) + count_stmts(ic, JF(p));
2138 	return slength(p->stmts) + n + 1 + p->longjt + p->longjf;
2139 }
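
/*
 * Worked example (illustrative figures only): a root block with two
 * non-NOP statements, longjt == longjf == 0, and two distinct RET blocks
 * as successors counts as
 *
 *	root:	slength() == 2, plus 1 for the conditional jump	-> 3
 *	JT, JF:	one instruction each (the RET itself)		-> 2
 *
 * for a total of 5, which is exactly the number of bpf_insn slots that
 * icode_to_fcode() allocates before converting the graph.
 */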
2140 
2141 /*
2142  * Allocate memory.  All allocation is done before optimization
2143  * begins.  A bound on the size of all data structures is computed
2144  * from the total number of blocks and/or statements.
2145  */
2146 static void
2147 opt_init(opt_state_t *opt_state, struct icode *ic)
2148 {
2149 	bpf_u_int32 *p;
2150 	int i, n, max_stmts;
2151 
2152 	/*
2153 	 * First, count the blocks, so we can malloc an array to map
2154 	 * block number to block.  Then, put the blocks into the array.
2155 	 */
2156 	unMarkAll(ic);
2157 	n = count_blocks(ic, ic->root);
2158 	opt_state->blocks = (struct block **)calloc(n, sizeof(*opt_state->blocks));
2159 	if (opt_state->blocks == NULL)
2160 		opt_error(opt_state, "malloc");
2161 	unMarkAll(ic);
2162 	opt_state->n_blocks = 0;
2163 	number_blks_r(opt_state, ic, ic->root);
2164 
2165 	opt_state->n_edges = 2 * opt_state->n_blocks;
2166 	opt_state->edges = (struct edge **)calloc(opt_state->n_edges, sizeof(*opt_state->edges));
2167 	if (opt_state->edges == NULL) {
2168 		opt_error(opt_state, "malloc");
2169 	}
2170 
2171 	/*
2172 	 * The number of levels is bounded by the number of nodes.
2173 	 */
2174 	opt_state->levels = (struct block **)calloc(opt_state->n_blocks, sizeof(*opt_state->levels));
2175 	if (opt_state->levels == NULL) {
2176 		opt_error(opt_state, "malloc");
2177 	}
2178 
2179 	opt_state->edgewords = opt_state->n_edges / (8 * sizeof(bpf_u_int32)) + 1;
2180 	opt_state->nodewords = opt_state->n_blocks / (8 * sizeof(bpf_u_int32)) + 1;
2181 
2182 	/* XXX */
2183 	opt_state->space = (bpf_u_int32 *)malloc(2 * opt_state->n_blocks * opt_state->nodewords * sizeof(*opt_state->space)
2184 				 + opt_state->n_edges * opt_state->edgewords * sizeof(*opt_state->space));
2185 	if (opt_state->space == NULL) {
2186 		opt_error(opt_state, "malloc");
2187 	}
2188 	p = opt_state->space;
2189 	opt_state->all_dom_sets = p;
2190 	for (i = 0; i < n; ++i) {
2191 		opt_state->blocks[i]->dom = p;
2192 		p += opt_state->nodewords;
2193 	}
2194 	opt_state->all_closure_sets = p;
2195 	for (i = 0; i < n; ++i) {
2196 		opt_state->blocks[i]->closure = p;
2197 		p += opt_state->nodewords;
2198 	}
2199 	opt_state->all_edge_sets = p;
2200 	for (i = 0; i < n; ++i) {
2201 		register struct block *b = opt_state->blocks[i];
2202 
2203 		b->et.edom = p;
2204 		p += opt_state->edgewords;
2205 		b->ef.edom = p;
2206 		p += opt_state->edgewords;
2207 		b->et.id = i;
2208 		opt_state->edges[i] = &b->et;
2209 		b->ef.id = opt_state->n_blocks + i;
2210 		opt_state->edges[opt_state->n_blocks + i] = &b->ef;
2211 		b->et.pred = b;
2212 		b->ef.pred = b;
2213 	}
2214 	max_stmts = 0;
2215 	for (i = 0; i < n; ++i)
2216 		max_stmts += slength(opt_state->blocks[i]->stmts) + 1;
2217 	/*
2218 	 * We allocate at most 3 value numbers per statement,
2219 	 * so this is an upper bound on the number of valnodes
2220 	 * we'll need.
2221 	 */
2222 	opt_state->maxval = 3 * max_stmts;
2223 	opt_state->vmap = (struct vmapinfo *)calloc(opt_state->maxval, sizeof(*opt_state->vmap));
2224 	if (opt_state->vmap == NULL) {
2225 		opt_error(opt_state, "malloc");
2226 	}
2227 	opt_state->vnode_base = (struct valnode *)calloc(opt_state->maxval, sizeof(*opt_state->vnode_base));
2228 	if (opt_state->vnode_base == NULL) {
2229 		opt_error(opt_state, "malloc");
2230 	}
2231 }
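
/*
 * Sizing sketch with made-up figures: for a filter whose flow graph has
 * 10 blocks, opt_init() above ends up with
 *
 *	n_edges   = 2 * 10 = 20
 *	nodewords = 10 / 32 + 1 = 1	(bpf_u_int32 is 32 bits wide)
 *	edgewords = 20 / 32 + 1 = 1
 *
 * so 'space' holds 2 * 10 * 1 words for the dominator and closure bit
 * sets plus 20 * 1 words for the edge-dominator bit sets, and the value
 * tables hold maxval = 3 * max_stmts entries each.
 */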
2232 
2233 /*
2234  * This is only used when supporting optimizer debugging.  It is
2235  * global state, so do *not* do more than one compile in parallel
2236  * and expect it to provide meaningful information.
2237  */
2238 #ifdef BDEBUG
2239 int bids[NBIDS];
2240 #endif
2241 
2242 static void PCAP_NORETURN conv_error(conv_state_t *, const char *, ...)
2243     PCAP_PRINTFLIKE(2, 3);
2244 
2245 /*
2246  * Returns true if successful.  Returns false if a branch has
2247  * an offset that is too large.  If so, we have marked that
2248  * branch so that on a subsequent iteration, it will be treated
2249  * properly.
2250  */
2251 static int
2252 convert_code_r(conv_state_t *conv_state, struct icode *ic, struct block *p)
2253 {
2254 	struct bpf_insn *dst;
2255 	struct slist *src;
2256 	u_int slen;
2257 	u_int off;
2258 	u_int extrajmps;	/* number of extra jumps inserted */
2259 	struct slist **offset = NULL;
2260 
2261 	if (p == 0 || isMarked(ic, p))
2262 		return (1);
2263 	Mark(ic, p);
2264 
2265 	if (convert_code_r(conv_state, ic, JF(p)) == 0)
2266 		return (0);
2267 	if (convert_code_r(conv_state, ic, JT(p)) == 0)
2268 		return (0);
2269 
2270 	slen = slength(p->stmts);
2271 	dst = conv_state->ftail -= (slen + 1 + p->longjt + p->longjf);
2272 		/* inflate length by any extra jumps */
2273 
2274 	p->offset = (int)(dst - conv_state->fstart);
2275 
2276 	/* generate offset[] for convenience  */
2277 	if (slen) {
2278 		offset = (struct slist **)calloc(slen, sizeof(struct slist *));
2279 		if (!offset) {
2280 			conv_error(conv_state, "not enough core");
2281 			/*NOTREACHED*/
2282 		}
2283 	}
2284 	src = p->stmts;
2285 	for (off = 0; off < slen && src; off++) {
2286 #if 0
2287 		printf("off=%d src=%x\n", off, src);
2288 #endif
2289 		offset[off] = src;
2290 		src = src->next;
2291 	}
2292 
2293 	off = 0;
2294 	for (src = p->stmts; src; src = src->next) {
2295 		if (src->s.code == NOP)
2296 			continue;
2297 		dst->code = (u_short)src->s.code;
2298 		dst->k = src->s.k;
2299 
2300 		/* fill block-local relative jump */
2301 		if (BPF_CLASS(src->s.code) != BPF_JMP || src->s.code == (BPF_JMP|BPF_JA)) {
2302 #if 0
2303 			if (src->s.jt || src->s.jf) {
2304 				free(offset);
2305 				conv_error(conv_state, "illegal jmp destination");
2306 				/*NOTREACHED*/
2307 			}
2308 #endif
2309 			goto filled;
2310 		}
2311 		if (off == slen - 2)	/*???*/
2312 			goto filled;
2313 
2314 	    {
2315 		u_int i;
2316 		int jt, jf;
2317 		const char ljerr[] = "%s for block-local relative jump: off=%d";
2318 
2319 #if 0
2320 		printf("code=%x off=%d %x %x\n", src->s.code,
2321 			off, src->s.jt, src->s.jf);
2322 #endif
2323 
2324 		if (!src->s.jt || !src->s.jf) {
2325 			free(offset);
2326 			conv_error(conv_state, ljerr, "no jmp destination", off);
2327 			/*NOTREACHED*/
2328 		}
2329 
2330 		jt = jf = 0;
2331 		for (i = 0; i < slen; i++) {
2332 			if (offset[i] == src->s.jt) {
2333 				if (jt) {
2334 					free(offset);
2335 					conv_error(conv_state, ljerr, "multiple matches", off);
2336 					/*NOTREACHED*/
2337 				}
2338 
2339 				if (i - off - 1 >= 256) {
2340 					free(offset);
2341 					conv_error(conv_state, ljerr, "out-of-range jump", off);
2342 					/*NOTREACHED*/
2343 				}
2344 				dst->jt = (u_char)(i - off - 1);
2345 				jt++;
2346 			}
2347 			if (offset[i] == src->s.jf) {
2348 				if (jf) {
2349 					free(offset);
2350 					conv_error(conv_state, ljerr, "multiple matches", off);
2351 					/*NOTREACHED*/
2352 				}
2353 				if (i - off - 1 >= 256) {
2354 					free(offset);
2355 					conv_error(conv_state, ljerr, "out-of-range jump", off);
2356 					/*NOTREACHED*/
2357 				}
2358 				dst->jf = (u_char)(i - off - 1);
2359 				jf++;
2360 			}
2361 		}
2362 		if (!jt || !jf) {
2363 			free(offset);
2364 			conv_error(conv_state, ljerr, "no destination found", off);
2365 			/*NOTREACHED*/
2366 		}
2367 	    }
2368 filled:
2369 		++dst;
2370 		++off;
2371 	}
2372 	if (offset)
2373 		free(offset);
2374 
2375 #ifdef BDEBUG
2376 	if (dst - conv_state->fstart < NBIDS)
2377 		bids[dst - conv_state->fstart] = p->id + 1;
2378 #endif
2379 	dst->code = (u_short)p->s.code;
2380 	dst->k = p->s.k;
2381 	if (JT(p)) {
2382 		extrajmps = 0;
2383 		off = JT(p)->offset - (p->offset + slen) - 1;
2384 		if (off >= 256) {
2385 		    /* offset too large for branch, must add a jump */
2386 		    if (p->longjt == 0) {
2387 		    	/* mark this instruction and retry */
2388 			p->longjt++;
2389 			return(0);
2390 		    }
2391 		    /* branch if T to following jump */
2392 		    if (extrajmps >= 256) {
2393 			conv_error(conv_state, "too many extra jumps");
2394 			/*NOTREACHED*/
2395 		    }
2396 		    dst->jt = (u_char)extrajmps;
2397 		    extrajmps++;
2398 		    dst[extrajmps].code = BPF_JMP|BPF_JA;
2399 		    dst[extrajmps].k = off - extrajmps;
2400 		}
2401 		else
2402 		    dst->jt = (u_char)off;
2403 		off = JF(p)->offset - (p->offset + slen) - 1;
2404 		if (off >= 256) {
2405 		    /* offset too large for branch, must add a jump */
2406 		    if (p->longjf == 0) {
2407 		    	/* mark this instruction and retry */
2408 			p->longjf++;
2409 			return(0);
2410 		    }
2411 		    /* branch if F to following jump */
2412 		    /* if two jumps are inserted, F goes to second one */
2413 		    if (extrajmps >= 256) {
2414 			conv_error(conv_state, "too many extra jumps");
2415 			/*NOTREACHED*/
2416 		    }
2417 		    dst->jf = (u_char)extrajmps;
2418 		    extrajmps++;
2419 		    dst[extrajmps].code = BPF_JMP|BPF_JA;
2420 		    dst[extrajmps].k = off - extrajmps;
2421 		}
2422 		else
2423 		    dst->jf = (u_char)off;
2424 	}
2425 	return (1);
2426 }
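
/*
 * Illustrative note on the long-jump fixup above: a conditional BPF jump
 * stores its targets in the 8-bit jt/jf fields, so it can only skip
 * forward up to 255 instructions.  If, say, the true branch would need an
 * offset of 300, the code above sets p->longjt and returns 0; the retry
 * loop in icode_to_fcode() then recounts the program (count_stmts() now
 * includes the extra slot via p->longjt) and converts again, this time
 * pointing the conditional at an adjacent BPF_JMP|BPF_JA instruction
 * whose 32-bit 'k' field carries the full displacement.
 */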
2427 
2428 
2429 /*
2430  * Convert flowgraph intermediate representation to the
2431  * BPF array representation.  Set *lenp to the number of instructions.
2432  *
2433  * This routine does *NOT* leak the memory pointed to by fp.  It *must
2434  * not* do free(fp) before returning fp; doing so would make no sense,
2435  * as the BPF array pointed to by the return value of icode_to_fcode()
2436  * must be valid - it's being returned for use in a bpf_program structure.
2437  *
2438  * If it appears that icode_to_fcode() is leaking, the problem is that
2439  * the program using pcap_compile() is failing to free the memory in
2440  * the BPF program when it's done - the leak is in the program, not in
2441  * the routine that happens to be allocating the memory.  (By analogy, if
2442  * a program calls fopen() without ever calling fclose() on the FILE *,
2443  * it will leak the FILE structure; the leak is not in fopen(), it's in
2444  * the program.)  Change the program to use pcap_freecode() when it's
2445  * done with the filter program.  See the pcap man page.
2446  */
2447 struct bpf_insn *
2448 icode_to_fcode(struct icode *ic, struct block *root, u_int *lenp,
2449     char *errbuf)
2450 {
2451 	u_int n;
2452 	struct bpf_insn *fp;
2453 	conv_state_t conv_state;
2454 
2455 	conv_state.fstart = NULL;
2456 	conv_state.errbuf = errbuf;
2457 	if (setjmp(conv_state.top_ctx) != 0) {
2458 		free(conv_state.fstart);
2459 		return NULL;
2460 	}
2461 
2462 	/*
2463 	 * Loop doing convert_code_r() until no branches remain
2464 	 * with too-large offsets.
2465 	 */
2466 	for (;;) {
2467 	    unMarkAll(ic);
2468 	    n = *lenp = count_stmts(ic, root);
2469 
2470 	    fp = (struct bpf_insn *)malloc(sizeof(*fp) * n);
2471 	    if (fp == NULL) {
2472 		(void)pcap_snprintf(errbuf, PCAP_ERRBUF_SIZE,
2473 		    "malloc");
2474 		free(fp);
2475 		return NULL;
2476 	    }
2477 	    memset((char *)fp, 0, sizeof(*fp) * n);
2478 	    conv_state.fstart = fp;
2479 	    conv_state.ftail = fp + n;
2480 
2481 	    unMarkAll(ic);
2482 	    if (convert_code_r(&conv_state, ic, root))
2483 		break;
2484 	    free(fp);
2485 	}
2486 
2487 	return fp;
2488 }
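
/*
 * A minimal caller-side sketch (application code, not part of this file)
 * of the ownership rule described above, i.e. that the instructions
 * handed back through the bpf_program must eventually be released with
 * pcap_freecode():
 *
 *	struct bpf_program prog;
 *
 *	if (pcap_compile(p, &prog, "ip src host 1.1.1.1", 1,
 *	    PCAP_NETMASK_UNKNOWN) == -1)
 *		return (-1);
 *	if (pcap_setfilter(p, &prog) == -1) {
 *		pcap_freecode(&prog);
 *		return (-1);
 *	}
 *	pcap_freecode(&prog);
 *
 * The copy installed by pcap_setfilter() is independent, so freeing the
 * caller's program immediately afterwards is safe.
 */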
2489 
2490 /*
2491  * For icode_to_fcode() errors.
2492  */
2493 static void PCAP_NORETURN
2494 conv_error(conv_state_t *conv_state, const char *fmt, ...)
2495 {
2496 	va_list ap;
2497 
2498 	va_start(ap, fmt);
2499 	(void)pcap_vsnprintf(conv_state->errbuf,
2500 	    PCAP_ERRBUF_SIZE, fmt, ap);
2501 	va_end(ap);
2502 	longjmp(conv_state->top_ctx, 1);
2503 	/* NOTREACHED */
2504 }
2505 
2506 /*
2507  * Make a copy of a BPF program and put it in the "fcode" member of
2508  * a "pcap_t".
2509  *
2510  * If we fail to allocate memory for the copy, fill in the "errbuf"
2511  * member of the "pcap_t" with an error message, and return -1;
2512  * otherwise, return 0.
2513  */
2514 int
2515 install_bpf_program(pcap_t *p, struct bpf_program *fp)
2516 {
2517 	size_t prog_size;
2518 
2519 	/*
2520 	 * Validate the program.
2521 	 */
2522 	if (!bpf_validate(fp->bf_insns, fp->bf_len)) {
2523 		pcap_snprintf(p->errbuf, sizeof(p->errbuf),
2524 			"BPF program is not valid");
2525 		return (-1);
2526 	}
2527 
2528 	/*
2529 	 * Free up any already installed program.
2530 	 */
2531 	pcap_freecode(&p->fcode);
2532 
2533 	prog_size = sizeof(*fp->bf_insns) * fp->bf_len;
2534 	p->fcode.bf_len = fp->bf_len;
2535 	p->fcode.bf_insns = (struct bpf_insn *)malloc(prog_size);
2536 	if (p->fcode.bf_insns == NULL) {
2537 		pcap_fmt_errmsg_for_errno(p->errbuf, sizeof(p->errbuf),
2538 		    errno, "malloc");
2539 		return (-1);
2540 	}
2541 	memcpy(p->fcode.bf_insns, fp->bf_insns, prog_size);
2542 	return (0);
2543 }
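
/*
 * Usage sketch (illustrative; the function name is made up): a capture
 * module that filters packets in userland can implement its setfilter
 * operation as little more than
 *
 *	static int
 *	example_setfilter(pcap_t *p, struct bpf_program *fp)
 *	{
 *		return (install_bpf_program(p, fp));
 *	}
 *
 * after which its read loop can run p->fcode.bf_insns over each received
 * packet with bpf_filter() and drop the packets that yield 0.
 */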
2544 
2545 #ifdef BDEBUG
2546 static void
2547 dot_dump_node(struct icode *ic, struct block *block, struct bpf_program *prog,
2548     FILE *out)
2549 {
2550 	int icount, noffset;
2551 	int i;
2552 
2553 	if (block == NULL || isMarked(ic, block))
2554 		return;
2555 	Mark(ic, block);
2556 
2557 	icount = slength(block->stmts) + 1 + block->longjt + block->longjf;
2558 	noffset = min(block->offset + icount, (int)prog->bf_len);
2559 
2560 	fprintf(out, "\tblock%d [shape=ellipse, id=\"block-%d\" label=\"BLOCK%d\\n", block->id, block->id, block->id);
2561 	for (i = block->offset; i < noffset; i++) {
2562 		fprintf(out, "\\n%s", bpf_image(prog->bf_insns + i, i));
2563 	}
2564 	fprintf(out, "\" tooltip=\"");
2565 	for (i = 0; i < BPF_MEMWORDS; i++)
2566 		if (block->val[i] != VAL_UNKNOWN)
2567 			fprintf(out, "val[%d]=%d ", i, block->val[i]);
2568 	fprintf(out, "val[A]=%d ", block->val[A_ATOM]);
2569 	fprintf(out, "val[X]=%d", block->val[X_ATOM]);
2570 	fprintf(out, "\"");
2571 	if (JT(block) == NULL)
2572 		fprintf(out, ", peripheries=2");
2573 	fprintf(out, "];\n");
2574 
2575 	dot_dump_node(ic, JT(block), prog, out);
2576 	dot_dump_node(ic, JF(block), prog, out);
2577 }
2578 
2579 static void
2580 dot_dump_edge(struct icode *ic, struct block *block, FILE *out)
2581 {
2582 	if (block == NULL || isMarked(ic, block))
2583 		return;
2584 	Mark(ic, block);
2585 
2586 	if (JT(block)) {
2587 		fprintf(out, "\t\"block%d\":se -> \"block%d\":n [label=\"T\"]; \n",
2588 				block->id, JT(block)->id);
2589 		fprintf(out, "\t\"block%d\":sw -> \"block%d\":n [label=\"F\"]; \n",
2590 			   block->id, JF(block)->id);
2591 	}
2592 	dot_dump_edge(ic, JT(block), out);
2593 	dot_dump_edge(ic, JF(block), out);
2594 }
2595 
2596 /* Output the block CFG in the graphviz/DOT language.
2597  * The CFG shows each block's code, the value number of each register at
2598  * EXIT, and the jump relationships between blocks.
2599  *
2600  * Example DOT output for the BPF expression `ip src host 1.1.1.1':
2601     digraph BPF {
2602     	block0 [shape=ellipse, id="block-0" label="BLOCK0\n\n(000) ldh      [12]\n(001) jeq      #0x800           jt 2	jf 5" tooltip="val[A]=0 val[X]=0"];
2603     	block1 [shape=ellipse, id="block-1" label="BLOCK1\n\n(002) ld       [26]\n(003) jeq      #0x1010101       jt 4	jf 5" tooltip="val[A]=0 val[X]=0"];
2604     	block2 [shape=ellipse, id="block-2" label="BLOCK2\n\n(004) ret      #68" tooltip="val[A]=0 val[X]=0", peripheries=2];
2605     	block3 [shape=ellipse, id="block-3" label="BLOCK3\n\n(005) ret      #0" tooltip="val[A]=0 val[X]=0", peripheries=2];
2606     	"block0":se -> "block1":n [label="T"];
2607     	"block0":sw -> "block3":n [label="F"];
2608     	"block1":se -> "block2":n [label="T"];
2609     	"block1":sw -> "block3":n [label="F"];
2610     }
2611  *
2612  *  After installing graphviz from http://www.graphviz.org/, save the output as bpf.dot
2613  *  and run `dot -Tpng -O bpf.dot' to draw the graph.
2614  */
2615 static int
2616 dot_dump(struct icode *ic, char *errbuf)
2617 {
2618 	struct bpf_program f;
2619 	FILE *out = stdout;
2620 
2621 	memset(bids, 0, sizeof bids);
2622 	f.bf_insns = icode_to_fcode(ic, ic->root, &f.bf_len, errbuf);
2623 	if (f.bf_insns == NULL)
2624 		return -1;
2625 
2626 	fprintf(out, "digraph BPF {\n");
2627 	unMarkAll(ic);
2628 	dot_dump_node(ic, ic->root, &f, out);
2629 	unMarkAll(ic);
2630 	dot_dump_edge(ic, ic->root, out);
2631 	fprintf(out, "}\n");
2632 
2633 	free((char *)f.bf_insns);
2634 	return 0;
2635 }
2636 
2637 static int
2638 plain_dump(struct icode *ic, char *errbuf)
2639 {
2640 	struct bpf_program f;
2641 
2642 	memset(bids, 0, sizeof bids);
2643 	f.bf_insns = icode_to_fcode(ic, ic->root, &f.bf_len, errbuf);
2644 	if (f.bf_insns == NULL)
2645 		return -1;
2646 	bpf_dump(&f, 1);
2647 	putchar('\n');
2648 	free((char *)f.bf_insns);
2649 	return 0;
2650 }
2651 
2652 static void
2653 opt_dump(opt_state_t *opt_state, struct icode *ic)
2654 {
2655 	int status;
2656 	char errbuf[PCAP_ERRBUF_SIZE];
2657 
2658 	/*
2659 	 * If the CFG, in DOT format, is requested, output it rather than
2660 	 * the code that would be generated from that graph.
2661 	 */
2662 	if (pcap_print_dot_graph)
2663 		status = dot_dump(ic, errbuf);
2664 	else
2665 		status = plain_dump(ic, errbuf);
2666 	if (status == -1)
2667 		opt_error(opt_state, "opt_dump: icode_to_fcode failed: %s", errbuf);
2668 }
2669 #endif
2670