/*-
 * Copyright (c) 2010-2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This material is based upon work partially supported by The
 * NetBSD Foundation under a contract with Mindaugas Rasiukevicius.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * BPF byte-code generation for NPF rules.
 *
 * Overview
 *
 *      Each NPF rule is compiled into a BPF micro-program.  There is a
 *      BPF byte-code fragment for each higher-level filtering criterion,
 *      e.g. to match the L4 protocol, an IP/mask, etc.  The generation
 *      process combines multiple BPF byte-code fragments into one program.
 *
 * Basic case
 *
 *      Consider the basic case where all filters should match.  They
 *      are expressed as a logical conjunction, e.g.:
 *
 *              A and B and C and D
 *
 *      Each test (filter) criterion evaluates to either true (match) or
 *      false (no match) and the logic is as follows:
 *
 *      - If the value is true, then jump to the "next" test (offset 0).
 *
 *      - If the value is false, then jump to the JUMP_MAGIC value (0xff).
 *        This "magic" value indicates a jump that will have to be
 *        patched at a later stage.
 *
 *      Once all byte-code fragments are combined into one program, two
 *      additional steps are taken:
 *
 *      - Two instructions are appended at the end of the program:
 *        "return success" followed by "return failure".
 *
 *      - All jumps with the JUMP_MAGIC value are patched to point to the
 *        "return failure" instruction.
 *
 *      Therefore, if all filter criteria match, the "return success"
 *      instruction is reached, indicating a successful match of the
 *      rule.  Otherwise, if any criterion does not match, the failure
 *      path is taken and the rule does not match.
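 *
 *      To illustrate -- a schematic sketch, not literal compiler output,
 *      with each test shown as a single instruction (real fragments are
 *      load/compare pairs) -- "A and B" becomes the following once the
 *      return fragment is appended and the magic jumps are patched:
 *
 *              (0) test A      jt=0 (next), jf=2 (to 3)
 *              (1) test B      jt=0 (next), jf=1 (to 3)
 *              (2) ret #-1     ; NPF_BPF_SUCCESS
 *              (3) ret #0      ; NPF_BPF_FAILURE
 *
 *      BPF conditional jump offsets are relative to the following
 *      instruction, hence the patched jf values are "end - i", as
 *      computed by fixup_jumps() below.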
 *
 * Grouping
 *
 *      Filters can have groups, which have the effect of logical
 *      disjunction, e.g.:
 *
 *              A and B and (C or D)
 *
 *      In such a case, the logic inside the group has to be inverted,
 *      i.e. the jump values swapped.  If the test value is true, then
 *      jump out of the group; if false, then jump "next".  At the end
 *      of the group, an additional failure path is appended and the
 *      JUMP_MAGIC uses within the group are patched to jump past the
 *      said path.
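 *
 *      To illustrate (again a schematic sketch), the "(C or D)" group
 *      compiles to the following, once npfctl_bpf_group_exit() has
 *      appended the failure path and swapped the magic jumps:
 *
 *              (0) test C      jt=2 (out of the group, to 3), jf=0 (next)
 *              (1) test D      jt=1 (out of the group, to 3), jf=0 (to 2)
 *              (2) ret #0      ; group failure path (fall-through)
 *              (3) ...         ; the rest of the program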
 *
 *      For multi-word comparisons (IPv6 addresses), there is another
 *      layer of grouping:
 *
 *              A and B and ((C and D) or (E and F))
 *
 *      This strains the simple-minded JUMP_MAGIC logic, so for now,
 *      when generating the jump-if-false targets for (C and D), we
 *      simply count the number of instructions left to skip over.
 *
 *      A better architecture might be to create asm-type labels for
 *      the jt and jf continuations in the first pass, and then, once
 *      their offsets are determined, go back and fill them in in the
 *      second pass.  This would simplify the logic (no need to compute
 *      exactly how many instructions we are about to generate in a
 *      chain of conditionals) and eliminate the redundant RET #0
 *      instructions which are currently generated after some groups.
 */

#include <sys/cdefs.h>
__RCSID("$NetBSD: npf_bpf_comp.c,v 1.17 2024/10/30 11:19:38 riastradh Exp $");

#include <stdlib.h>
#include <stdbool.h>
#include <stddef.h>
#include <string.h>
#include <inttypes.h>
#include <err.h>
#include <assert.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#define __FAVOR_BSD
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/udp.h>
#include <netinet/tcp.h>
#include <netinet/ip_icmp.h>
#include <netinet/icmp6.h>

#include <net/bpf.h>

#include "npfctl.h"

/*
 * Note: clear X_EQ_L4OFF when register X is invalidated, i.e. when it
 * stores something other than the L4 header offset.  Generally, this is
 * whenever BPF_LDX is used.
 */
#define FETCHED_L3              0x01
#define CHECKED_L4_PROTO        0x02
#define X_EQ_L4OFF              0x04

struct npf_bpf {
        /*
         * BPF program code, the allocated length (in bytes), the number
         * of logical blocks and the flags.
         */
        struct bpf_program      prog;
        size_t                  alen;
        unsigned                nblocks;
        sa_family_t             af;
        uint32_t                flags;

        /*
         * Indicators of whether we are inside a group and whether this
         * group is implementing inverted logic.
         *
         * Also the current group offset (counted in BPF instructions)
         * and the block number at the start of the group.
         */
        unsigned                ingroup;
        bool                    invert;
        bool                    multiword;
        unsigned                goff;
        unsigned                gblock;

        /* Track inversion (excl. mark). */
        uint32_t                invflags;

        /* BPF marks, the allocated length and the real length. */
        uint32_t *              marks;
        size_t                  malen;
        size_t                  mlen;
};

/*
 * NPF success and failure values to be returned from BPF.
 */
#define NPF_BPF_SUCCESS         ((u_int)-1)
#define NPF_BPF_FAILURE         0

/*
 * Magic value to indicate the failure path, which is fixed up on completion.
 * Note: this is the longest jump offset in BPF, since the offset is one byte.
 */
#define JUMP_MAGIC              0xff

/* Reduce re-allocations by expanding in 64-byte blocks. */
#define ALLOC_MASK              (64 - 1)
#define ALLOC_ROUND(x)          (((x) + ALLOC_MASK) & ~ALLOC_MASK)

#ifndef IPV6_VERSION
#define IPV6_VERSION            0x60
#endif

npf_bpf_t *
npfctl_bpf_create(void)
{
        return ecalloc(1, sizeof(npf_bpf_t));
}

static void
fixup_jumps(npf_bpf_t *ctx, u_int start, u_int end, bool swap)
{
        struct bpf_program *bp = &ctx->prog;

        for (u_int i = start; i < end; i++) {
                struct bpf_insn *insn = &bp->bf_insns[i];
                const u_int fail_off = end - i;
                bool seen_magic = false;

                if (fail_off >= JUMP_MAGIC) {
                        errx(EXIT_FAILURE, "BPF generation error: "
                            "the number of instructions is over the limit");
                }
                if (BPF_CLASS(insn->code) != BPF_JMP) {
                        continue;
                }
                if (BPF_OP(insn->code) == BPF_JA) {
                        /*
                         * BPF_JA can be used to jump to the failure path.
                         * If we are swapping, i.e. inside a group, then
                         * jump "next"; groups have a failure path appended
                         * at their end.
                         */
                        if (insn->k == JUMP_MAGIC) {
                                insn->k = swap ? 0 : fail_off;
                        }
                        continue;
                }

                /*
                 * Fixup the "magic" value.  Swap only the "magic" jumps.
                 */

                if (insn->jt == JUMP_MAGIC) {
                        insn->jt = fail_off;
                        seen_magic = true;
                }
                if (insn->jf == JUMP_MAGIC) {
                        insn->jf = fail_off;
                        seen_magic = true;
                }

                if (seen_magic && swap) {
                        uint8_t jt = insn->jt;
                        insn->jt = insn->jf;
                        insn->jf = jt;
                }
        }
}

static void
add_insns(npf_bpf_t *ctx, struct bpf_insn *insns, size_t count)
{
        struct bpf_program *bp = &ctx->prog;
        size_t offset, len, reqlen;

        /* Note: bf_len is the count of instructions. */
        offset = bp->bf_len * sizeof(struct bpf_insn);
        len = count * sizeof(struct bpf_insn);

        /* Ensure the memory buffer is large enough for the program. */
        reqlen = ALLOC_ROUND(offset + len);
        if (reqlen > ctx->alen) {
                bp->bf_insns = erealloc(bp->bf_insns, reqlen);
                ctx->alen = reqlen;
        }

        /* Add the code block. */
        memcpy((uint8_t *)bp->bf_insns + offset, insns, len);
        bp->bf_len += count;
}

static void
add_bmarks(npf_bpf_t *ctx, const uint32_t *m, size_t len)
{
        size_t reqlen, nargs = m[1];

        if ((len / sizeof(uint32_t) - 2) != nargs) {
                errx(EXIT_FAILURE, "invalid BPF block description");
        }
        reqlen = ALLOC_ROUND(ctx->mlen + len);
        if (reqlen > ctx->malen) {
                ctx->marks = erealloc(ctx->marks, reqlen);
                ctx->malen = reqlen;
        }
        memcpy((uint8_t *)ctx->marks + ctx->mlen, m, len);
        ctx->mlen += len;
}

static void
done_block(npf_bpf_t *ctx, const uint32_t *m, size_t len)
{
        add_bmarks(ctx, m, len);
        ctx->nblocks++;
}
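
/*
 * Aside on the mark ("bmark") stream layout: each block description is
 * a sequence of 32-bit words -- a key, the number of arguments and then
 * the arguments themselves.  For example, the block emitted by
 * npfctl_bpf_proto() below for TCP records:
 *
 *      { BM_PROTO, 1, IPPROTO_TCP }
 *
 * add_bmarks() validates that the given byte length agrees with the
 * advertised argument count.
 */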

struct bpf_program *
npfctl_bpf_complete(npf_bpf_t *ctx)
{
        struct bpf_program *bp = &ctx->prog;
        const u_int retoff = bp->bf_len;

        /* No instructions (optimised out). */
        if (!bp->bf_len)
                return NULL;

        /* Add the return fragment (success and failure paths). */
        struct bpf_insn insns_ret[] = {
                BPF_STMT(BPF_RET+BPF_K, NPF_BPF_SUCCESS),
                BPF_STMT(BPF_RET+BPF_K, NPF_BPF_FAILURE),
        };
        add_insns(ctx, insns_ret, __arraycount(insns_ret));

        /* Fixup all jumps to the main failure path. */
        fixup_jumps(ctx, 0, retoff, false);

        return &ctx->prog;
}

const void *
npfctl_bpf_bmarks(npf_bpf_t *ctx, size_t *len)
{
        *len = ctx->mlen;
        return ctx->marks;
}

void
npfctl_bpf_destroy(npf_bpf_t *ctx)
{
        free(ctx->prog.bf_insns);
        free(ctx->marks);
        free(ctx);
}
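
/*
 * For orientation, a typical caller drives this module roughly as
 * follows (an illustrative sketch, not code lifted from npfctl;
 * error handling is omitted):
 *
 *      npf_bpf_t *bc = npfctl_bpf_create();
 *
 *      npfctl_bpf_proto(bc, IPPROTO_TCP);
 *      npfctl_bpf_ports(bc, MATCH_DST, htons(80), htons(80));
 *
 *      struct bpf_program *prog = npfctl_bpf_complete(bc);
 *      ...     // use prog and npfctl_bpf_bmarks(), then:
 *      npfctl_bpf_destroy(bc);
 */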

/*
 * npfctl_bpf_group_enter: begin a logical group.  Comparisons within
 * the group are combined by logical disjunction (OR).
 */
void
npfctl_bpf_group_enter(npf_bpf_t *ctx, bool invert)
{
        struct bpf_program *bp = &ctx->prog;

        assert(ctx->goff == 0);
        assert(ctx->gblock == 0);

        ctx->goff = bp->bf_len;
        ctx->gblock = ctx->nblocks;
        ctx->invert = invert;
        ctx->multiword = false;
        ctx->ingroup++;
}

void
npfctl_bpf_group_exit(npf_bpf_t *ctx)
{
        struct bpf_program *bp = &ctx->prog;
        const size_t curoff = bp->bf_len;

        assert(ctx->ingroup);
        ctx->ingroup--;

        /*
         * If we are not inverting, there were only zero or one options,
         * and the last comparison was not a multi-word comparison
         * requiring a fall-through failure -- nothing to do.
         */
        if (!ctx->invert &&
            (ctx->nblocks - ctx->gblock) <= 1 &&
            !ctx->multiword) {
                ctx->goff = ctx->gblock = 0;
                return;
        }

        /*
         * If inverting, then prepend a jump over the failure return
         * below.  It is reached on no match (the swapped jumps fall
         * through to it) and hops over the failure path, while a match
         * jumps past it onto the failure return.
         */
        if (ctx->invert) {
                struct bpf_insn insns_ret[] = {
                        BPF_STMT(BPF_JMP+BPF_JA, 1),
                };
                add_insns(ctx, insns_ret, __arraycount(insns_ret));
        }

        /*
         * Append a failure return as a fall-through, i.e. for the case
         * when there is no match within the group.
         */
        struct bpf_insn insns_ret[] = {
                BPF_STMT(BPF_RET+BPF_K, NPF_BPF_FAILURE),
        };
        add_insns(ctx, insns_ret, __arraycount(insns_ret));

        /*
         * Adjust the jump offsets: on match, jump outside the group,
         * i.e. past the current offset.  Otherwise, jump to the next
         * instruction, eventually reaching the fall-through code above
         * if nothing matches.
         */
        fixup_jumps(ctx, ctx->goff, curoff, true);
        ctx->goff = ctx->gblock = 0;
}
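
/*
 * To illustrate the inverted case (a schematic sketch): a group with a
 * single test C and invert set exits as follows:
 *
 *      (0) test C      jt=1 (to 2, the failure return), jf=0 (next)
 *      (1) ja +1       ; no match: hop over the failure return
 *      (2) ret #0      ; match: the group is negated, so fail
 *      (3) ...         ; the rest of the program
 */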

static void
fetch_l3(npf_bpf_t *ctx, sa_family_t af, unsigned flags)
{
        unsigned ver;

        switch (af) {
        case AF_INET:
                ver = IPVERSION;
                break;
        case AF_INET6:
                ver = IPV6_VERSION >> 4;
                break;
        case AF_UNSPEC:
                ver = 0;
                break;
        default:
                abort();
        }

        /*
         * The memory store is populated with:
         * - BPF_MW_IPVER: the IP version (4 or 6).
         * - BPF_MW_L4OFF: the L4 header offset.
         * - BPF_MW_L4PROTO: the L4 protocol.
         */
        if ((ctx->flags & FETCHED_L3) == 0 || (af && ctx->af == 0)) {
                const uint8_t jt = ver ? 0 : JUMP_MAGIC;
                const uint8_t jf = ver ? JUMP_MAGIC : 0;
                const bool ingroup = ctx->ingroup != 0;
                const bool invert = ctx->invert;

                /*
                 * The L3 block cannot be inserted in the middle of a
                 * group.  In fact, it never is.  Exit the group, emit
                 * the check and re-enter the group afterwards.
                 */
                if (ingroup) {
                        assert(ctx->nblocks == ctx->gblock);
                        npfctl_bpf_group_exit(ctx);
                }

                /*
                 * A <- IP version; A == expected-version?
                 * If no particular version is specified, check for non-zero.
                 */
                struct bpf_insn insns_af[] = {
                        BPF_STMT(BPF_LD+BPF_W+BPF_MEM, BPF_MW_IPVER),
                        BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, ver, jt, jf),
                };
                add_insns(ctx, insns_af, __arraycount(insns_af));
                ctx->flags |= FETCHED_L3;
                ctx->af = af;

                if (af) {
                        uint32_t mwords[] = { BM_IPVER, 1, af };
                        add_bmarks(ctx, mwords, sizeof(mwords));
                }
                if (ingroup) {
                        npfctl_bpf_group_enter(ctx, invert);
                }

        } else if (af && af != ctx->af) {
                errx(EXIT_FAILURE, "address family mismatch");
        }

        if ((flags & X_EQ_L4OFF) != 0 && (ctx->flags & X_EQ_L4OFF) == 0) {
                /* X <- IP header length */
                struct bpf_insn insns_hlen[] = {
                        BPF_STMT(BPF_LDX+BPF_MEM, BPF_MW_L4OFF),
                };
                add_insns(ctx, insns_hlen, __arraycount(insns_hlen));
                ctx->flags |= X_EQ_L4OFF;
        }
}
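
/*
 * For example (a sketch of the fragment emitted by fetch_l3() above),
 * ensuring AF_INET6 loads the stored IP version and compares it
 * against 6 (IPV6_VERSION >> 4):
 *
 *      (0) ld M[BPF_MW_IPVER]  ; A <- IP version
 *      (1) jeq #6              jt=0 (next), jf=JUMP_MAGIC
 */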

static void
bm_invert_checkpoint(npf_bpf_t *ctx, const unsigned opts)
{
        uint32_t bm = 0;

        if (ctx->ingroup && ctx->invert) {
                const unsigned seen = ctx->invflags;

                if ((opts & MATCH_SRC) != 0 && (seen & MATCH_SRC) == 0) {
                        bm = BM_SRC_NEG;
                }
                if ((opts & MATCH_DST) != 0 && (seen & MATCH_DST) == 0) {
                        bm = BM_DST_NEG;
                }
                ctx->invflags |= opts & (MATCH_SRC | MATCH_DST);
        }
        if (bm) {
                uint32_t mwords[] = { bm, 0 };
                add_bmarks(ctx, mwords, sizeof(mwords));
        }
}
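
/*
 * For example, negating the source address criteria within an inverted
 * group records a zero-argument { BM_SRC_NEG, 0 } mark ahead of the
 * corresponding BM_SRC_CIDR or BM_SRC_TABLE mark -- and does so only
 * once per direction within the group.
 */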

/*
 * npfctl_bpf_ipver: code block to match the IP version.
 */
void
npfctl_bpf_ipver(npf_bpf_t *ctx, sa_family_t af)
{
        fetch_l3(ctx, af, 0);
}

/*
 * npfctl_bpf_proto: code block to match the L4 protocol.
 */
void
npfctl_bpf_proto(npf_bpf_t *ctx, unsigned proto)
{
        struct bpf_insn insns_proto[] = {
                /* A <- L4 protocol; A == expected-protocol? */
                BPF_STMT(BPF_LD+BPF_W+BPF_MEM, BPF_MW_L4PROTO),
                BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, proto, 0, JUMP_MAGIC),
        };
        add_insns(ctx, insns_proto, __arraycount(insns_proto));

        uint32_t mwords[] = { BM_PROTO, 1, proto };
        done_block(ctx, mwords, sizeof(mwords));
        ctx->flags |= CHECKED_L4_PROTO;
}

/*
 * npfctl_bpf_cidr: code block to match an IPv4 or IPv6 CIDR.
 *
 * => The IP address shall be in the network byte order.
 */
void
npfctl_bpf_cidr(npf_bpf_t *ctx, unsigned opts, sa_family_t af,
    const npf_addr_t *addr, const npf_netmask_t mask)
{
        const uint32_t *awords = (const uint32_t *)addr;
        unsigned nwords, origlength, length, maxmask, off;

        assert(((opts & MATCH_SRC) != 0) ^ ((opts & MATCH_DST) != 0));
        assert((mask && mask <= NPF_MAX_NETMASK) || mask == NPF_NO_NETMASK);

        switch (af) {
        case AF_INET:
                maxmask = 32;
                off = (opts & MATCH_SRC) ?
                    offsetof(struct ip, ip_src) :
                    offsetof(struct ip, ip_dst);
                nwords = sizeof(struct in_addr) / sizeof(uint32_t);
                break;
        case AF_INET6:
                maxmask = 128;
                off = (opts & MATCH_SRC) ?
                    offsetof(struct ip6_hdr, ip6_src) :
                    offsetof(struct ip6_hdr, ip6_dst);
                nwords = sizeof(struct in6_addr) / sizeof(uint32_t);
                break;
        default:
                abort();
        }

        /* Ensure the address family. */
        fetch_l3(ctx, af, 0);

        length = origlength = (mask == NPF_NO_NETMASK) ? maxmask : mask;

        /* CAUTION: BPF operates in the host byte order. */
        for (unsigned i = 0; i < nwords; i++) {
                const unsigned woff = i * sizeof(uint32_t);
                uint32_t word = ntohl(awords[i]);
                uint32_t wordmask;

                if (length >= 32) {
                        /* The mask is a full word - do not apply it. */
                        wordmask = 0;
                        length -= 32;
                } else if (length) {
                        wordmask = 0xffffffff << (32 - length);
                        length = 0;
                } else {
                        /* The mask became zero - skip the rest. */
                        break;
                }

                /* A <- IP address (or one word of it) */
                struct bpf_insn insns_ip[] = {
                        BPF_STMT(BPF_LD+BPF_W+BPF_ABS, off + woff),
                };
                add_insns(ctx, insns_ip, __arraycount(insns_ip));

                /* A <- (A & MASK) */
                if (wordmask) {
                        struct bpf_insn insns_mask[] = {
                                BPF_STMT(BPF_ALU+BPF_AND+BPF_K, wordmask),
                        };
                        add_insns(ctx, insns_mask, __arraycount(insns_mask));
                }

                /*
                 * Determine how many instructions we have to jump
                 * ahead if the match fails.
                 *
                 * - If this is the last word, we jump to the final
                 *   failure, JUMP_MAGIC.
                 *
                 * - If this is not the last word, we jump past the
                 *   remaining instructions matching this sequence.
                 *   Each 32-bit word in the sequence takes two
                 *   instructions (BPF_LD and BPF_JMP).  If there is a
                 *   partial-word mask ahead, there will be one
                 *   additional instruction (BPF_ALU).
                 */
                uint8_t jf;
                if (i + 1 == (origlength + 31) / 32) {
                        jf = JUMP_MAGIC;
                } else {
                        jf = 2 * ((origlength + 31) / 32 - i - 1);
                        if (origlength % 32 != 0 && wordmask == 0)
                                jf += 1;
                }

                /* A == expected-IP-word? */
                struct bpf_insn insns_cmp[] = {
                        BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, word, 0, jf),
                };
                add_insns(ctx, insns_cmp, __arraycount(insns_cmp));
        }

        /*
         * If we checked a chain of words in sequence, mark this as a
         * multi-word comparison, so that if this is in a group, there
         * will be a fall-through case.
         *
         * XXX This is a little silly; the compiler should really just
         * record holes where conditional jumps need success/failure
         * continuations, and go back to fill in the holes when the
         * locations of the continuations are determined later.  But
         * that requires restructuring this code a little more.
         */
        ctx->multiword = (origlength + 31) / 32 > 1;

        uint32_t mwords[] = {
                (opts & MATCH_SRC) ? BM_SRC_CIDR : BM_DST_CIDR, 6,
                af, mask, awords[0], awords[1], awords[2], awords[3],
        };
        bm_invert_checkpoint(ctx, opts);
        done_block(ctx, mwords, sizeof(mwords));
}
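
/*
 * A worked example of the jump-if-false computation above: an IPv6
 * address with a /68 mask gives origlength = 68, i.e. three 32-bit
 * words, with the last one partially masked (w0..w2 denote the
 * host-byte-order address words):
 *
 *      (0) ld [off+0]                  ; word 0 (full)
 *      (1) jeq #w0     jt=0, jf=5      ; skip ld/jeq + ld/and/jeq
 *      (2) ld [off+4]                  ; word 1 (full)
 *      (3) jeq #w1     jt=0, jf=3      ; skip ld/and/jeq
 *      (4) ld [off+8]                  ; word 2 (partial)
 *      (5) and #0xf0000000             ; the remaining 4 mask bits
 *      (6) jeq #w2     jt=0, jf=JUMP_MAGIC
 *
 * The fourth address word is not checked at all: the mask ran out, so
 * the loop breaks early.
 */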

/*
 * npfctl_bpf_ports: code block to match the TCP/UDP port range.
 *
 * => The port numbers shall be in the network byte order.
 */
void
npfctl_bpf_ports(npf_bpf_t *ctx, unsigned opts, in_port_t from, in_port_t to)
{
        const unsigned sport_off = offsetof(struct udphdr, uh_sport);
        const unsigned dport_off = offsetof(struct udphdr, uh_dport);
        unsigned off;

        /* The TCP and UDP port offsets are the same. */
        assert(sport_off == offsetof(struct tcphdr, th_sport));
        assert(dport_off == offsetof(struct tcphdr, th_dport));
        assert(ctx->flags & CHECKED_L4_PROTO);

        assert(((opts & MATCH_SRC) != 0) ^ ((opts & MATCH_DST) != 0));
        off = (opts & MATCH_SRC) ? sport_off : dport_off;

        /* X <- IP header length */
        fetch_l3(ctx, AF_UNSPEC, X_EQ_L4OFF);

        struct bpf_insn insns_fetch[] = {
                /* A <- port */
                BPF_STMT(BPF_LD+BPF_H+BPF_IND, off),
        };
        add_insns(ctx, insns_fetch, __arraycount(insns_fetch));

        /* CAUTION: BPF operates in the host byte order. */
        from = ntohs(from);
        to = ntohs(to);

        if (from == to) {
                /* Single port case. */
                struct bpf_insn insns_port[] = {
                        BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, from, 0, JUMP_MAGIC),
                };
                add_insns(ctx, insns_port, __arraycount(insns_port));
        } else {
                /* Port range case. */
                struct bpf_insn insns_range[] = {
                        BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, from, 0, 1),
                        BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, to, 0, 1),
                        BPF_STMT(BPF_JMP+BPF_JA, JUMP_MAGIC),
                };
                add_insns(ctx, insns_range, __arraycount(insns_range));
        }

        uint32_t mwords[] = {
                (opts & MATCH_SRC) ? BM_SRC_PORTS : BM_DST_PORTS, 2, from, to
        };
        done_block(ctx, mwords, sizeof(mwords));
}
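
/*
 * For illustration (a schematic sketch with arbitrary port numbers),
 * the range case for ports 6000-6010 emits, with A holding the port:
 *
 *      (0) jge #6000   jt=0, jf=1      ; A < 6000: go to (2), fail
 *      (1) jgt #6010   jt=0, jf=1      ; A <= 6010: hop over (2), match
 *      (2) ja JUMP_MAGIC               ; patched to the failure path
 */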

/*
 * npfctl_bpf_tcpfl: code block to match the TCP flags.
 */
void
npfctl_bpf_tcpfl(npf_bpf_t *ctx, uint8_t tf, uint8_t tf_mask)
{
        const unsigned tcpfl_off = offsetof(struct tcphdr, th_flags);
        const bool usingmask = tf_mask != tf;

        /* X <- IP header length */
        fetch_l3(ctx, AF_UNSPEC, X_EQ_L4OFF);

        if ((ctx->flags & CHECKED_L4_PROTO) == 0) {
                const unsigned jf = usingmask ? 3 : 2;
                assert(ctx->ingroup == 0);

                /*
                 * A <- L4 protocol; A == TCP?  If not, jump out.
                 *
                 * Note: the TCP flag matching might be without 'proto tcp'
                 * when using a plain 'stateful' rule.  In such a case it
                 * also handles other protocols, hence no strict TCP check.
                 */
                struct bpf_insn insns_tcp[] = {
                        BPF_STMT(BPF_LD+BPF_W+BPF_MEM, BPF_MW_L4PROTO),
                        BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, IPPROTO_TCP, 0, jf),
                };
                add_insns(ctx, insns_tcp, __arraycount(insns_tcp));
        }

        struct bpf_insn insns_tf[] = {
                /* A <- TCP flags */
                BPF_STMT(BPF_LD+BPF_B+BPF_IND, tcpfl_off),
        };
        add_insns(ctx, insns_tf, __arraycount(insns_tf));

        if (usingmask) {
                /* A <- (A & mask) */
                struct bpf_insn insns_mask[] = {
                        BPF_STMT(BPF_ALU+BPF_AND+BPF_K, tf_mask),
                };
                add_insns(ctx, insns_mask, __arraycount(insns_mask));
        }

        struct bpf_insn insns_cmp[] = {
                /* A == expected-TCP-flags? */
                BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, tf, 0, JUMP_MAGIC),
        };
        add_insns(ctx, insns_cmp, __arraycount(insns_cmp));

        uint32_t mwords[] = { BM_TCPFL, 2, tf, tf_mask };
        done_block(ctx, mwords, sizeof(mwords));
}
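
/*
 * For example (a sketch), matching SYN with both SYN and ACK examined
 * gives tf = TH_SYN and tf_mask = (TH_SYN | TH_ACK):
 *
 *      (0) ldb [x + 13]        ; A <- TCP flags (th_flags offset)
 *      (1) and #0x12           ; A & (TH_SYN | TH_ACK)
 *      (2) jeq #0x02   jt=0, jf=JUMP_MAGIC
 */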

/*
 * npfctl_bpf_icmp: code block to match the ICMP type and/or code.
 * Note: suitable for both ICMPv4 and ICMPv6.
 */
void
npfctl_bpf_icmp(npf_bpf_t *ctx, int type, int code)
{
        const u_int type_off = offsetof(struct icmp, icmp_type);
        const u_int code_off = offsetof(struct icmp, icmp_code);

        assert(ctx->flags & CHECKED_L4_PROTO);
        assert(offsetof(struct icmp6_hdr, icmp6_type) == type_off);
        assert(offsetof(struct icmp6_hdr, icmp6_code) == code_off);
        assert(type != -1 || code != -1);

        /* X <- IP header length */
        fetch_l3(ctx, AF_UNSPEC, X_EQ_L4OFF);

        if (type != -1) {
                struct bpf_insn insns_type[] = {
                        BPF_STMT(BPF_LD+BPF_B+BPF_IND, type_off),
                        BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, type, 0, JUMP_MAGIC),
                };
                add_insns(ctx, insns_type, __arraycount(insns_type));

                uint32_t mwords[] = { BM_ICMP_TYPE, 1, type };
                done_block(ctx, mwords, sizeof(mwords));
        }

        if (code != -1) {
                struct bpf_insn insns_code[] = {
                        BPF_STMT(BPF_LD+BPF_B+BPF_IND, code_off),
                        BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, code, 0, JUMP_MAGIC),
                };
                add_insns(ctx, insns_code, __arraycount(insns_code));

                uint32_t mwords[] = { BM_ICMP_CODE, 1, code };
                done_block(ctx, mwords, sizeof(mwords));
        }
}

#define SRC_FLAG_BIT    (1U << 31)

/*
 * npfctl_bpf_table: code block to match the source/destination address
 * against the NPF table specified by ID.
 */
void
npfctl_bpf_table(npf_bpf_t *ctx, unsigned opts, unsigned tid)
{
        const bool src = (opts & MATCH_SRC) != 0;

        struct bpf_insn insns_table[] = {
                BPF_STMT(BPF_LD+BPF_IMM, (src ? SRC_FLAG_BIT : 0) | tid),
                BPF_STMT(BPF_MISC+BPF_COP, NPF_COP_TABLE),
                BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, 0, JUMP_MAGIC, 0),
        };
        add_insns(ctx, insns_table, __arraycount(insns_table));

        uint32_t mwords[] = { src ? BM_SRC_TABLE : BM_DST_TABLE, 1, tid };
        bm_invert_checkpoint(ctx, opts);
        done_block(ctx, mwords, sizeof(mwords));
}
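
/*
 * For illustration (a sketch), matching the source address against
 * table ID 2 emits:
 *
 *      (0) ld #(SRC_FLAG_BIT | 2)      ; table ID with the source flag
 *      (1) cop NPF_COP_TABLE           ; coprocessor table lookup
 *      (2) jeq #0      jt=JUMP_MAGIC, jf=0
 *
 * A zero result from the coprocessor, i.e. no table hit, takes the
 * failure path.
 */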