/*-
 * Copyright (c) 2010-2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This material is based upon work partially supported by The
 * NetBSD Foundation under a contract with Mindaugas Rasiukevicius.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * BPF byte-code generation for NPF rules.
 *
 * Overview
 *
 *	Each NPF rule is compiled into a BPF micro-program.  There is a
 *	BPF byte-code fragment for each piece of higher-level filtering
 *	logic, e.g. matching the L4 protocol, IP/mask, etc.  The generation
 *	process combines multiple BPF byte-code fragments into one program.
 *
 * Basic case
 *
 *	Consider a basic case where all filters should match.  They
 *	are expressed as a logical conjunction, e.g.:
 *
 *		A and B and C and D
 *
 *	Each test (filter) criterion can be evaluated to true (match) or
 *	false (no match) and the logic is as follows:
 *
 *	- If the value is true, then jump to the "next" test (offset 0).
 *
 *	- If the value is false, then jump to the JUMP_MAGIC value (0xff).
 *	This "magic" value indicates that the jump will have to be
 *	patched at a later stage.
 *
 *	Once all byte-code fragments are combined into one program, there
 *	are two additional steps:
 *
 *	- Two instructions are appended at the end of the program: "return
 *	success" followed by "return failure".
 *
 *	- All jumps with the JUMP_MAGIC value are patched to point to the
 *	"return failure" instruction.
 *
 *	Therefore, if all filter criteria match, then the "return success"
 *	instruction is reached, indicating a successful match of the
 *	rule.  Otherwise, if any of the criteria does not match, the
 *	failure path is taken and the rule does not match.
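 *
 *	As an illustrative sketch (not produced verbatim by this file),
 *	a two-criterion rule could compile to:
 *
 *		0: LD  mem[BPF_MW_IPVER]
 *		1: JEQ #4,	jt 0, jf 0xff	; criterion A
 *		2: LD  mem[BPF_MW_L4PROTO]
 *		3: JEQ #6,	jt 0, jf 0xff	; criterion B
 *		4: RET #-1			; return success
 *		5: RET #0			; return failure
 *
 *	after which the 0xff jumps are patched: instruction 1 gets
 *	jf = 3 and instruction 3 gets jf = 1, both landing on the
 *	"return failure" instruction at index 5.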
 *
 * Grouping
 *
 *	Filters can have groups, which have the effect of a logical
 *	disjunction, e.g.:
 *
 *		A and B and (C or D)
 *
 *	In such a case, the logic inside the group has to be inverted,
 *	i.e. the jump values swapped.  If the test value is true, then
 *	jump out of the group; if false, then jump to the "next" test.
 *	At the end of the group, an additional failure path is appended
 *	and the JUMP_MAGIC uses within the group are patched to jump
 *	past the said path.
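 *
 *	To illustrate (again a sketch, with each alternative reduced to
 *	a single load/compare pair), the group (C or D) becomes, after
 *	patching:
 *
 *		0: LD  ...			; C
 *		1: JEQ ...,	jt 3, jf 0
 *		2: LD  ...			; D
 *		3: JEQ ...,	jt 1, jf 0
 *		4: RET #0			; group failure path
 *		5: ...				; first instruction after the group
 *
 *	A match in C or D jumps to 5, i.e. out of the group, while a
 *	mismatch falls through to the next alternative and, eventually,
 *	into the group's own failure return.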
 *
 *	For multi-word comparisons (IPv6 addresses), there is another
 *	layer of grouping:
 *
 *		A and B and ((C and D) or (E and F))
 *
 *	This strains the simple-minded JUMP_MAGIC logic, so for now,
 *	when generating the jump-if-false targets for (C and D), we
 *	simply count the number of instructions left to skip over.
 *
 *	A better architecture might be to create asm-type labels for
 *	the jt and jf continuations in the first pass, and then, once
 *	their offsets are determined, go back and fill them in during
 *	the second pass.  This would simplify the logic (no need to
 *	compute exactly how many instructions we're about to generate
 *	in a chain of conditionals) and eliminate redundant RET #0
 *	instructions which are currently generated after some groups.
 */
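
/*
 * Typical usage of this module (an illustrative sketch only; the real
 * caller is the npfctl rule-building code, and the exact sequence of
 * fragment calls depends on the rule being compiled):
 *
 *	npf_bpf_t *bc = npfctl_bpf_create();
 *
 *	npfctl_bpf_proto(bc, IPPROTO_TCP);
 *	npfctl_bpf_group_enter(bc, false);
 *	npfctl_bpf_ports(bc, MATCH_DST, htons(80), htons(80));
 *	npfctl_bpf_ports(bc, MATCH_DST, htons(443), htons(443));
 *	npfctl_bpf_group_exit(bc);
 *
 *	struct bpf_program *bp = npfctl_bpf_complete(bc);
 *	if (bp != NULL) {
 *		size_t mlen;
 *		const void *marks = npfctl_bpf_bmarks(bc, &mlen);
 *		... load bp->bf_insns and the marks into the rule ...
 *	}
 *	npfctl_bpf_destroy(bc);
 */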

#include <sys/cdefs.h>
__RCSID("$NetBSD: npf_bpf_comp.c,v 1.17 2024/10/30 11:19:38 riastradh Exp $");

#include <stdlib.h>
#include <stdbool.h>
#include <stddef.h>
#include <string.h>
#include <inttypes.h>
#include <err.h>
#include <assert.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#define	__FAVOR_BSD
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/udp.h>
#include <netinet/tcp.h>
#include <netinet/ip_icmp.h>
#include <netinet/icmp6.h>

#include <net/bpf.h>

#include "npfctl.h"

/*
 * Note: clear X_EQ_L4OFF when register X is invalidated, i.e. when it
 * stores something other than the L4 header offset.  Generally, this
 * happens when BPF_LDX is used.
 */
#define	FETCHED_L3		0x01
#define	CHECKED_L4_PROTO	0x02
#define	X_EQ_L4OFF		0x04

struct npf_bpf {
	/*
	 * BPF program code, the allocated length (in bytes), the number
	 * of logical blocks and the flags.
	 */
	struct bpf_program	prog;
	size_t			alen;
	unsigned		nblocks;
	sa_family_t		af;
	uint32_t		flags;

	/*
	 * Indicators of whether we are inside a group and whether this
	 * group implements inverted logic.
	 *
	 * The current group offset (counted in BPF instructions)
	 * and the block number at the start of the group.
	 */
	unsigned		ingroup;
	bool			invert;
	bool			multiword;
	unsigned		goff;
	unsigned		gblock;

	/* Track which negation (exclusion) marks have been emitted. */
	uint32_t		invflags;

	/* BPF marks, the allocated length and the real length. */
	uint32_t *		marks;
	size_t			malen;
	size_t			mlen;
};

/*
 * NPF success and failure values to be returned from BPF.
 */
#define	NPF_BPF_SUCCESS		((u_int)-1)
#define	NPF_BPF_FAILURE		0

/*
 * Magic value to indicate the failure path, which is fixed up on completion.
 * Note: this is the longest jump offset in BPF, since the offset is one byte.
 */
#define	JUMP_MAGIC		0xff

/* Reduce re-allocations by expanding in 64-byte blocks. */
#define	ALLOC_MASK		(64 - 1)
#define	ALLOC_ROUND(x)		(((x) + ALLOC_MASK) & ~ALLOC_MASK)

#ifndef IPV6_VERSION
#define	IPV6_VERSION		0x60
#endif

npf_bpf_t *
npfctl_bpf_create(void)
{
	return ecalloc(1, sizeof(npf_bpf_t));
}

static void
fixup_jumps(npf_bpf_t *ctx, u_int start, u_int end, bool swap)
{
	struct bpf_program *bp = &ctx->prog;

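	/*
	 * A conditional jump at index i that is patched to the offset
	 * (end - i) transfers control to index i + 1 + (end - i), i.e.
	 * one instruction past the end of the range being fixed up.
	 */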
	for (u_int i = start; i < end; i++) {
		struct bpf_insn *insn = &bp->bf_insns[i];
		const u_int fail_off = end - i;
		bool seen_magic = false;

		if (fail_off >= JUMP_MAGIC) {
			errx(EXIT_FAILURE, "BPF generation error: "
			    "the number of instructions is over the limit");
		}
		if (BPF_CLASS(insn->code) != BPF_JMP) {
			continue;
		}
		if (BPF_OP(insn->code) == BPF_JA) {
			/*
			 * BPF_JA can be used to jump to the failure path.
			 * If we are swapping, i.e. inside a group, then
			 * jump to the "next" instruction; groups have a
			 * failure path appended at their end.
			 */
			if (insn->k == JUMP_MAGIC) {
				insn->k = swap ? 0 : fail_off;
			}
			continue;
		}

		/*
		 * Fixup the "magic" value.  Swap only the "magic" jumps.
		 */

		if (insn->jt == JUMP_MAGIC) {
			insn->jt = fail_off;
			seen_magic = true;
		}
		if (insn->jf == JUMP_MAGIC) {
			insn->jf = fail_off;
			seen_magic = true;
		}

		if (seen_magic && swap) {
			uint8_t jt = insn->jt;
			insn->jt = insn->jf;
			insn->jf = jt;
		}
	}
}

static void
add_insns(npf_bpf_t *ctx, struct bpf_insn *insns, size_t count)
{
	struct bpf_program *bp = &ctx->prog;
	size_t offset, len, reqlen;

	/* Note: bf_len is the count of instructions. */
	offset = bp->bf_len * sizeof(struct bpf_insn);
	len = count * sizeof(struct bpf_insn);

	/* Ensure the memory buffer for the program. */
	reqlen = ALLOC_ROUND(offset + len);
	if (reqlen > ctx->alen) {
		bp->bf_insns = erealloc(bp->bf_insns, reqlen);
		ctx->alen = reqlen;
	}

	/* Add the code block. */
	memcpy((uint8_t *)bp->bf_insns + offset, insns, len);
	bp->bf_len += count;
}

static void
add_bmarks(npf_bpf_t *ctx, const uint32_t *m, size_t len)
{
	size_t reqlen, nargs = m[1];

	if ((len / sizeof(uint32_t) - 2) != nargs) {
		errx(EXIT_FAILURE, "invalid BPF block description");
	}
	reqlen = ALLOC_ROUND(ctx->mlen + len);
	if (reqlen > ctx->malen) {
		ctx->marks = erealloc(ctx->marks, reqlen);
		ctx->malen = reqlen;
	}
	memcpy((uint8_t *)ctx->marks + ctx->mlen, m, len);
	ctx->mlen += len;
}

static void
done_block(npf_bpf_t *ctx, const uint32_t *m, size_t len)
{
	add_bmarks(ctx, m, len);
	ctx->nblocks++;
}

struct bpf_program *
npfctl_bpf_complete(npf_bpf_t *ctx)
{
	struct bpf_program *bp = &ctx->prog;
	const u_int retoff = bp->bf_len;

	/* No instructions (optimised out). */
	if (!bp->bf_len)
		return NULL;

	/* Add the return fragment (success and failure paths). */
	struct bpf_insn insns_ret[] = {
		BPF_STMT(BPF_RET+BPF_K, NPF_BPF_SUCCESS),
		BPF_STMT(BPF_RET+BPF_K, NPF_BPF_FAILURE),
	};
	add_insns(ctx, insns_ret, __arraycount(insns_ret));

	/* Fixup all jumps to the main failure path. */
	fixup_jumps(ctx, 0, retoff, false);

	return &ctx->prog;
}

const void *
npfctl_bpf_bmarks(npf_bpf_t *ctx, size_t *len)
{
	*len = ctx->mlen;
	return ctx->marks;
}

void
npfctl_bpf_destroy(npf_bpf_t *ctx)
{
	free(ctx->prog.bf_insns);
	free(ctx->marks);
	free(ctx);
}

/*
 * npfctl_bpf_group_enter: begin a logical group.  Comparisons within
 * the group are joined by logical disjunction (OR).
 */
void
npfctl_bpf_group_enter(npf_bpf_t *ctx, bool invert)
{
	struct bpf_program *bp = &ctx->prog;

	assert(ctx->goff == 0);
	assert(ctx->gblock == 0);

	ctx->goff = bp->bf_len;
	ctx->gblock = ctx->nblocks;
	ctx->invert = invert;
	ctx->multiword = false;
	ctx->ingroup++;
}

void
npfctl_bpf_group_exit(npf_bpf_t *ctx)
{
	struct bpf_program *bp = &ctx->prog;
	const size_t curoff = bp->bf_len;

	assert(ctx->ingroup);
	ctx->ingroup--;

	/*
	 * If we are not inverting, there were only zero or one options
	 * and the last comparison was not a multi-word comparison
	 * requiring a fall-through failure -- nothing to do.
	 */
	if (!ctx->invert &&
	    (ctx->nblocks - ctx->gblock) <= 1 &&
	    !ctx->multiword) {
		ctx->goff = ctx->gblock = 0;
		return;
	}

	/*
	 * If inverting, then prepend a jump over the failure statement
	 * below: it is taken only when nothing in the group matched.
	 * A match within the group jumps directly to the failure path.
	 */
	if (ctx->invert) {
		struct bpf_insn insns_ret[] = {
			BPF_STMT(BPF_JMP+BPF_JA, 1),
		};
		add_insns(ctx, insns_ret, __arraycount(insns_ret));
	}

	/*
	 * Append a failure return as a fall-through, i.e. taken if there
	 * is no match within the group.
	 */
	struct bpf_insn insns_ret[] = {
		BPF_STMT(BPF_RET+BPF_K, NPF_BPF_FAILURE),
	};
	add_insns(ctx, insns_ret, __arraycount(insns_ret));

	/*
	 * Adjust the jump offsets: on match, jump outside the group,
	 * i.e. to the current offset.  Otherwise, jump to the next
	 * instruction, which leads to the fall-through code above if
	 * none of the alternatives match.
	 */
	fixup_jumps(ctx, ctx->goff, curoff, true);
	ctx->goff = ctx->gblock = 0;
}

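/*
 * fetch_l3: emit a check of the IP version (from the BPF_MW_IPVER word)
 * for the given address family, unless an equivalent check has already
 * been emitted, and record the address family.  If X_EQ_L4OFF is
 * requested, also load the L4 header offset into the X register.
 */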
static void
fetch_l3(npf_bpf_t *ctx, sa_family_t af, unsigned flags)
{
	unsigned ver;

	switch (af) {
	case AF_INET:
		ver = IPVERSION;
		break;
	case AF_INET6:
		ver = IPV6_VERSION >> 4;
		break;
	case AF_UNSPEC:
		ver = 0;
		break;
	default:
		abort();
	}

	/*
	 * The memory store is populated with:
	 * - BPF_MW_IPVER: IP version (4 or 6).
	 * - BPF_MW_L4OFF: L4 header offset.
	 * - BPF_MW_L4PROTO: L4 protocol.
	 */
	if ((ctx->flags & FETCHED_L3) == 0 || (af && ctx->af == 0)) {
		const uint8_t jt = ver ? 0 : JUMP_MAGIC;
		const uint8_t jf = ver ? JUMP_MAGIC : 0;
		const bool ingroup = ctx->ingroup != 0;
		const bool invert = ctx->invert;

		/*
		 * The L3 block cannot be inserted in the middle of a group.
		 * In fact, it never is.  Exit the group, emit the check and
		 * re-enter the group afterwards.
		 */
		if (ingroup) {
			assert(ctx->nblocks == ctx->gblock);
			npfctl_bpf_group_exit(ctx);
		}

		/*
		 * A <- IP version; A == expected-version?
		 * If no particular version is specified, check for non-zero.
		 */
		struct bpf_insn insns_af[] = {
			BPF_STMT(BPF_LD+BPF_W+BPF_MEM, BPF_MW_IPVER),
			BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, ver, jt, jf),
		};
		add_insns(ctx, insns_af, __arraycount(insns_af));
		ctx->flags |= FETCHED_L3;
		ctx->af = af;

		if (af) {
			uint32_t mwords[] = { BM_IPVER, 1, af };
			add_bmarks(ctx, mwords, sizeof(mwords));
		}
		if (ingroup) {
			npfctl_bpf_group_enter(ctx, invert);
		}

	} else if (af && af != ctx->af) {
		errx(EXIT_FAILURE, "address family mismatch");
	}

	if ((flags & X_EQ_L4OFF) != 0 && (ctx->flags & X_EQ_L4OFF) == 0) {
		/* X <- IP header length */
		struct bpf_insn insns_hlen[] = {
			BPF_STMT(BPF_LDX+BPF_MEM, BPF_MW_L4OFF),
		};
		add_insns(ctx, insns_hlen, __arraycount(insns_hlen));
		ctx->flags |= X_EQ_L4OFF;
	}
}

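/*
 * bm_invert_checkpoint: when inside an inverted group, emit a BM_SRC_NEG
 * or BM_DST_NEG mark the first time a source or destination match is
 * negated, so that each mark is produced only once per direction.
 */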
static void
bm_invert_checkpoint(npf_bpf_t *ctx, const unsigned opts)
{
	uint32_t bm = 0;

	if (ctx->ingroup && ctx->invert) {
		const unsigned seen = ctx->invflags;

		if ((opts & MATCH_SRC) != 0 && (seen & MATCH_SRC) == 0) {
			bm = BM_SRC_NEG;
		}
		if ((opts & MATCH_DST) != 0 && (seen & MATCH_DST) == 0) {
			bm = BM_DST_NEG;
		}
		ctx->invflags |= opts & (MATCH_SRC | MATCH_DST);
	}
	if (bm) {
		uint32_t mwords[] = { bm, 0 };
		add_bmarks(ctx, mwords, sizeof(mwords));
	}
}

/*
 * npfctl_bpf_ipver: match the IP version.
 */
void
npfctl_bpf_ipver(npf_bpf_t *ctx, sa_family_t af)
{
	fetch_l3(ctx, af, 0);
}

/*
 * npfctl_bpf_proto: code block to match IP version and L4 protocol.
 */
void
npfctl_bpf_proto(npf_bpf_t *ctx, unsigned proto)
{
	struct bpf_insn insns_proto[] = {
		/* A <- L4 protocol; A == expected-protocol? */
		BPF_STMT(BPF_LD+BPF_W+BPF_MEM, BPF_MW_L4PROTO),
		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, proto, 0, JUMP_MAGIC),
	};
	add_insns(ctx, insns_proto, __arraycount(insns_proto));

	uint32_t mwords[] = { BM_PROTO, 1, proto };
	done_block(ctx, mwords, sizeof(mwords));
	ctx->flags |= CHECKED_L4_PROTO;
}

/*
 * npfctl_bpf_cidr: code block to match IPv4 or IPv6 CIDR.
 *
 * => IP address shall be in the network byte order.
 */
void
npfctl_bpf_cidr(npf_bpf_t *ctx, unsigned opts, sa_family_t af,
    const npf_addr_t *addr, const npf_netmask_t mask)
{
	const uint32_t *awords = (const uint32_t *)addr;
	unsigned nwords, origlength, length, maxmask, off;

	assert(((opts & MATCH_SRC) != 0) ^ ((opts & MATCH_DST) != 0));
	assert((mask && mask <= NPF_MAX_NETMASK) || mask == NPF_NO_NETMASK);

	switch (af) {
	case AF_INET:
		maxmask = 32;
		off = (opts & MATCH_SRC) ?
		    offsetof(struct ip, ip_src) :
		    offsetof(struct ip, ip_dst);
		nwords = sizeof(struct in_addr) / sizeof(uint32_t);
		break;
	case AF_INET6:
		maxmask = 128;
		off = (opts & MATCH_SRC) ?
		    offsetof(struct ip6_hdr, ip6_src) :
		    offsetof(struct ip6_hdr, ip6_dst);
		nwords = sizeof(struct in6_addr) / sizeof(uint32_t);
		break;
	default:
		abort();
	}

	/* Ensure address family. */
	fetch_l3(ctx, af, 0);

	length = origlength = (mask == NPF_NO_NETMASK) ? maxmask : mask;

	/* CAUTION: BPF operates in host byte-order. */
	for (unsigned i = 0; i < nwords; i++) {
		const unsigned woff = i * sizeof(uint32_t);
		uint32_t word = ntohl(awords[i]);
		uint32_t wordmask;

		if (length >= 32) {
			/* The mask is a full word - do not apply it. */
			wordmask = 0;
			length -= 32;
		} else if (length) {
			wordmask = 0xffffffff << (32 - length);
			length = 0;
		} else {
			/* The mask became zero - skip the rest. */
			break;
		}

		/* A <- IP address (or one word of it) */
		struct bpf_insn insns_ip[] = {
			BPF_STMT(BPF_LD+BPF_W+BPF_ABS, off + woff),
		};
		add_insns(ctx, insns_ip, __arraycount(insns_ip));

		/* A <- (A & MASK) */
		if (wordmask) {
			struct bpf_insn insns_mask[] = {
				BPF_STMT(BPF_ALU+BPF_AND+BPF_K, wordmask),
			};
			add_insns(ctx, insns_mask, __arraycount(insns_mask));
		}

		/*
		 * Determine how many instructions we have to jump
		 * ahead if the match fails.
		 *
		 * - If this is the last word, we jump to the final
		 *   failure, JUMP_MAGIC.
		 *
		 * - If this is not the last word, we jump past the
		 *   remaining instructions to match this sequence.
		 *   Each 32-bit word in the sequence takes two
		 *   instructions (BPF_LD and BPF_JMP).  If there is a
		 *   partial-word mask ahead, there will be one
		 *   additional instruction (BPF_ALU).
		 */
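		/*
		 * A worked example (illustrative): for an IPv6 /120 prefix
		 * there are four words to compare and the last one uses a
		 * partial mask.  At i = 0 the remaining words take
		 * 2 + 2 + 3 instructions, so jf = 2*(4 - 0 - 1) + 1 = 7;
		 * at i = 3 (the last word), jf is JUMP_MAGIC.
		 */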
		uint8_t jf;
		if (i + 1 == (origlength + 31)/32) {
			jf = JUMP_MAGIC;
		} else {
			jf = 2*((origlength + 31)/32 - i - 1);
			if (origlength % 32 != 0 && wordmask == 0)
				jf += 1;
		}

		/* A == expected-IP-word ? */
		struct bpf_insn insns_cmp[] = {
			BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, word, 0, jf),
		};
		add_insns(ctx, insns_cmp, __arraycount(insns_cmp));
	}

	/*
	 * If we checked a chain of words in sequence, mark this as a
	 * multi-word comparison so that, if this is in a group, there
	 * will be a fall-through case.
	 *
	 * XXX This is a little silly; the compiler should really just
	 * record holes where conditional jumps need success/failure
	 * continuations, and go back to fill in the holes when the
	 * locations of the continuations are determined later.  But
	 * that requires restructuring this code a little more.
	 */
	ctx->multiword = (origlength + 31)/32 > 1;

	uint32_t mwords[] = {
		(opts & MATCH_SRC) ? BM_SRC_CIDR: BM_DST_CIDR, 6,
		af, mask, awords[0], awords[1], awords[2], awords[3],
	};
	bm_invert_checkpoint(ctx, opts);
	done_block(ctx, mwords, sizeof(mwords));
}

/*
 * npfctl_bpf_ports: code block to match TCP/UDP port range.
 *
 * => Port numbers shall be in the network byte order.
 */
void
npfctl_bpf_ports(npf_bpf_t *ctx, unsigned opts, in_port_t from, in_port_t to)
{
	const unsigned sport_off = offsetof(struct udphdr, uh_sport);
	const unsigned dport_off = offsetof(struct udphdr, uh_dport);
	unsigned off;

	/* TCP and UDP port offsets are the same. */
	assert(sport_off == offsetof(struct tcphdr, th_sport));
	assert(dport_off == offsetof(struct tcphdr, th_dport));
	assert(ctx->flags & CHECKED_L4_PROTO);

	assert(((opts & MATCH_SRC) != 0) ^ ((opts & MATCH_DST) != 0));
	off = (opts & MATCH_SRC) ? sport_off : dport_off;

	/* X <- IP header length */
	fetch_l3(ctx, AF_UNSPEC, X_EQ_L4OFF);

	struct bpf_insn insns_fetch[] = {
		/* A <- port */
		BPF_STMT(BPF_LD+BPF_H+BPF_IND, off),
	};
	add_insns(ctx, insns_fetch, __arraycount(insns_fetch));

	/* CAUTION: BPF operates in host byte-order. */
	from = ntohs(from);
	to = ntohs(to);

	if (from == to) {
		/* Single port case. */
		struct bpf_insn insns_port[] = {
			BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, from, 0, JUMP_MAGIC),
		};
		add_insns(ctx, insns_port, __arraycount(insns_port));
	} else {
		/* Port range case. */
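		/*
		 * A < from -> fail; A > to -> fail; otherwise skip over
		 * the unconditional jump to the failure path below.
		 */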
		struct bpf_insn insns_range[] = {
			BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, from, 0, 1),
			BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, to, 0, 1),
			BPF_STMT(BPF_JMP+BPF_JA, JUMP_MAGIC),
		};
		add_insns(ctx, insns_range, __arraycount(insns_range));
	}

	uint32_t mwords[] = {
		(opts & MATCH_SRC) ? BM_SRC_PORTS : BM_DST_PORTS, 2, from, to
	};
	done_block(ctx, mwords, sizeof(mwords));
}

/*
 * npfctl_bpf_tcpfl: code block to match TCP flags.
 */
void
npfctl_bpf_tcpfl(npf_bpf_t *ctx, uint8_t tf, uint8_t tf_mask)
{
	const unsigned tcpfl_off = offsetof(struct tcphdr, th_flags);
	const bool usingmask = tf_mask != tf;

	/* X <- IP header length */
	fetch_l3(ctx, AF_UNSPEC, X_EQ_L4OFF);

	if ((ctx->flags & CHECKED_L4_PROTO) == 0) {
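		/*
		 * On a protocol mismatch, jump over the flag-matching
		 * code emitted below: the load, the optional mask and
		 * the final comparison.
		 */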
		const unsigned jf = usingmask ? 3 : 2;
		assert(ctx->ingroup == 0);

		/*
		 * A <- L4 protocol; A == TCP?  If not, jump out.
		 *
		 * Note: the TCP flag matching might be done without
		 * 'proto tcp' when using a plain 'stateful' rule.  In
		 * such a case it also handles other protocols, hence
		 * there is no strict TCP check.
		 */
		struct bpf_insn insns_tcp[] = {
			BPF_STMT(BPF_LD+BPF_W+BPF_MEM, BPF_MW_L4PROTO),
			BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, IPPROTO_TCP, 0, jf),
		};
		add_insns(ctx, insns_tcp, __arraycount(insns_tcp));
	}

	struct bpf_insn insns_tf[] = {
		/* A <- TCP flags */
		BPF_STMT(BPF_LD+BPF_B+BPF_IND, tcpfl_off),
	};
	add_insns(ctx, insns_tf, __arraycount(insns_tf));

	if (usingmask) {
		/* A <- (A & mask) */
		struct bpf_insn insns_mask[] = {
			BPF_STMT(BPF_ALU+BPF_AND+BPF_K, tf_mask),
		};
		add_insns(ctx, insns_mask, __arraycount(insns_mask));
	}

	struct bpf_insn insns_cmp[] = {
		/* A == expected-TCP-flags? */
		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, tf, 0, JUMP_MAGIC),
	};
	add_insns(ctx, insns_cmp, __arraycount(insns_cmp));

	uint32_t mwords[] = { BM_TCPFL, 2, tf, tf_mask };
	done_block(ctx, mwords, sizeof(mwords));
}

/*
 * npfctl_bpf_icmp: code block to match the ICMP type and/or code.
 * Note: suitable for both ICMPv4 and ICMPv6.
 */
void
npfctl_bpf_icmp(npf_bpf_t *ctx, int type, int code)
{
	const u_int type_off = offsetof(struct icmp, icmp_type);
	const u_int code_off = offsetof(struct icmp, icmp_code);

	assert(ctx->flags & CHECKED_L4_PROTO);
	assert(offsetof(struct icmp6_hdr, icmp6_type) == type_off);
	assert(offsetof(struct icmp6_hdr, icmp6_code) == code_off);
	assert(type != -1 || code != -1);

	/* X <- IP header length */
	fetch_l3(ctx, AF_UNSPEC, X_EQ_L4OFF);

	if (type != -1) {
		struct bpf_insn insns_type[] = {
			BPF_STMT(BPF_LD+BPF_B+BPF_IND, type_off),
			BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, type, 0, JUMP_MAGIC),
		};
		add_insns(ctx, insns_type, __arraycount(insns_type));

		uint32_t mwords[] = { BM_ICMP_TYPE, 1, type };
		done_block(ctx, mwords, sizeof(mwords));
	}

	if (code != -1) {
		struct bpf_insn insns_code[] = {
			BPF_STMT(BPF_LD+BPF_B+BPF_IND, code_off),
			BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, code, 0, JUMP_MAGIC),
		};
		add_insns(ctx, insns_code, __arraycount(insns_code));

		uint32_t mwords[] = { BM_ICMP_CODE, 1, code };
		done_block(ctx, mwords, sizeof(mwords));
	}
}

#define	SRC_FLAG_BIT	(1U << 31)
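/* OR'ed into the table ID passed to NPF_COP_TABLE for a source-address lookup. */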

/*
 * npfctl_bpf_table: code block to match the source/destination IP address
 * against the NPF table specified by ID.
 */
void
npfctl_bpf_table(npf_bpf_t *ctx, unsigned opts, unsigned tid)
{
	const bool src = (opts & MATCH_SRC) != 0;

	struct bpf_insn insns_table[] = {
		BPF_STMT(BPF_LD+BPF_IMM, (src ? SRC_FLAG_BIT : 0) | tid),
		BPF_STMT(BPF_MISC+BPF_COP, NPF_COP_TABLE),
		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, 0, JUMP_MAGIC, 0),
	};
	add_insns(ctx, insns_table, __arraycount(insns_table));

	uint32_t mwords[] = { src ? BM_SRC_TABLE: BM_DST_TABLE, 1, tid };
	bm_invert_checkpoint(ctx, opts);
	done_block(ctx, mwords, sizeof(mwords));
}
814