xref: /netbsd-src/sys/net/npf/npf_conn.c (revision edfe75c0f8e53392739757e6a4a14b4e565d0dce)
1 /*-
2  * Copyright (c) 2014-2020 Mindaugas Rasiukevicius <rmind at noxt eu>
3  * Copyright (c) 2010-2014 The NetBSD Foundation, Inc.
4  * All rights reserved.
5  *
6  * This material is based upon work partially supported by The
7  * NetBSD Foundation under a contract with Mindaugas Rasiukevicius.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
19  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
20  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
22  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 /*
32  * NPF connection tracking for stateful filtering and translation.
33  *
34  * Overview
35  *
36  *	Packets can be incoming or outgoing with respect to an interface.
37  *	Connection direction is identified by the direction of its first
38  *	packet.  The meaning of incoming/outgoing packet in the context of
39  *	connection direction can be confusing.  Therefore, we will use the
40  *	terms "forwards stream" and "backwards stream", where packets in
41  *	the forwards stream mean the packets travelling in the direction
42  *	as the connection direction.
43  *
44  *	All connections have two keys and thus two entries:
45  *
46  *	- npf_conn_getforwkey(con)        -- for the forwards stream;
47  *	- npf_conn_getbackkey(con, alen)  -- for the backwards stream.
48  *
49  *	Note: the keys are stored in npf_conn_t::c_keys[], which is used
50  *	to allocate variable-length npf_conn_t structures based on whether
51  *	the IPv4 or IPv6 addresses are used.
52  *
53  *	The key is an n-tuple used to identify the connection flow: see the
54  *	npf_connkey.c source file for the description of the key layouts.
55  *	The key may be formed using translated values in a case of NAT.
56  *
57  *	Connections can serve two purposes: for the implicit passing and/or
58  *	to accommodate the dynamic NAT.  Connections for the former purpose
59  *	are created by the rules with "stateful" attribute and are used for
60  *	stateful filtering.  Such connections indicate that the packet of
61  *	the backwards stream should be passed without inspection of the
62  *	ruleset.  The other purpose is to associate a dynamic NAT mechanism
63  *	with a connection.  Such connections are created by the NAT policies
64  *	and they have a relationship with NAT translation structure via
65  *	npf_conn_t::c_nat.  A single connection can serve both purposes,
66  *	which is a common case.
67  *
68  * Connection life-cycle
69  *
70  *	Connections are established when a packet matches said rule or
71  *	NAT policy.  Both keys of the established connection are inserted
72  *	into the connection database.  A garbage collection thread
73  *	periodically scans all connections and depending on connection
74  *	properties (e.g. last activity time, protocol) removes connection
75  *	entries and expires the actual connections.
76  *
77  *	Each connection has a reference count.  The reference is acquired
78  *	on lookup and should be released by the caller.  It guarantees that
79  *	the connection will not be destroyed, although it may be expired.
80  *
81  * Synchronization
82  *
83  *	Connection database is accessed in a lock-free manner by the main
84  *	routines: npf_conn_inspect() and npf_conn_establish().  Since they
85  *	are always called from a software interrupt, the database is
86  *	protected using EBR.  The main place which can destroy a connection
87  *	is npf_conn_worker().  The database itself can be replaced and
88  *	destroyed in npf_conn_reload().
89  *
90  * ALG support
91  *
92  *	Application-level gateways (ALGs) can override generic connection
93  *	inspection (npf_alg_conn() call in npf_conn_inspect() function) by
94  *	performing their own lookup using different key.  Recursive call
95  *	to npf_conn_inspect() is not allowed.  The ALGs ought to use the
96  *	npf_conn_lookup() function for this purpose.
97  *
98  * Lock order
99  *
100  *	npf_t::config_lock ->
101  *		conn_lock ->
102  *			npf_conn_t::c_lock
103  */
104 
105 #ifdef _KERNEL
106 #include <sys/cdefs.h>
107 __KERNEL_RCSID(0, "$NetBSD: npf_conn.c,v 1.35 2023/01/22 18:39:35 riastradh Exp $");
108 
109 #include <sys/param.h>
110 #include <sys/types.h>
111 
112 #include <netinet/in.h>
113 #include <netinet/tcp.h>
114 
115 #include <sys/atomic.h>
116 #include <sys/kmem.h>
117 #include <sys/mutex.h>
118 #include <net/pfil.h>
119 #include <sys/pool.h>
120 #include <sys/queue.h>
121 #include <sys/systm.h>
122 #endif
123 
124 #define __NPF_CONN_PRIVATE
125 #include "npf_conn.h"
126 #include "npf_impl.h"
127 
128 /* A helper to select the IPv4 or IPv6 connection cache. */
129 #define	NPF_CONNCACHE(alen)	(((alen) >> 4) & 0x1)
130 
131 /*
132  * Connection flags: PFIL_IN and PFIL_OUT values are reserved for direction.
133  */
134 CTASSERT(PFIL_ALL == (0x001 | 0x002));
135 #define	CONN_ACTIVE	0x004	/* visible on inspection */
136 #define	CONN_PASS	0x008	/* perform implicit passing */
137 #define	CONN_EXPIRE	0x010	/* explicitly expire */
138 #define	CONN_REMOVED	0x020	/* "forw/back" entries removed */
139 
140 enum { CONN_TRACKING_OFF, CONN_TRACKING_ON };
141 
142 static int	npf_conn_export(npf_t *, npf_conn_t *, nvlist_t *);
143 
144 /*
145  * npf_conn_sys{init,fini}: initialize/destroy connection tracking.
146  */
147 
void
npf_conn_init(npf_t *npf)
{
	/* Register the dynamically tunable connection-key parameters. */
	npf_conn_params_t *params = npf_param_allocgroup(npf,
	    NPF_PARAMS_CONN, sizeof(npf_conn_params_t));
	npf_param_t param_map[] = {
		{
			"state.key.interface",
			&params->connkey_interface,
			.default_val = 1, // true
			.min = 0, .max = 1
		},
		{
			"state.key.direction",
			&params->connkey_direction,
			.default_val = 1, // true
			.min = 0, .max = 1
		},
	};
	npf_param_register(npf, param_map, __arraycount(param_map));

	/*
	 * Two pool caches, since npf_conn_t is variable-length: the object
	 * ends with c_keys[] and is sized for two IPv4 or two IPv6 keys
	 * (the forwards and backwards keys; see NPF_CONNCACHE()).
	 */
	npf->conn_cache[0] = pool_cache_init(
	    offsetof(npf_conn_t, c_keys[NPF_CONNKEY_V4WORDS * 2]),
	    0, 0, 0, "npfcn4pl", NULL, IPL_NET, NULL, NULL, NULL);
	npf->conn_cache[1] = pool_cache_init(
	    offsetof(npf_conn_t, c_keys[NPF_CONNKEY_V6WORDS * 2]),
	    0, 0, 0, "npfcn6pl", NULL, IPL_NET, NULL, NULL, NULL);

	mutex_init(&npf->conn_lock, MUTEX_DEFAULT, IPL_NONE);
	/* Tracking starts off; npf_conn_load()/npf_conn_tracking() enable it. */
	atomic_store_relaxed(&npf->conn_tracking, CONN_TRACKING_OFF);
	npf->conn_db = npf_conndb_create();
	npf_conndb_sysinit(npf);

	/* Register the G/C handler to be invoked by the worker thread. */
	npf_worker_addfunc(npf, npf_conn_worker);
}
183 
void
npf_conn_fini(npf_t *npf)
{
	const size_t len = sizeof(npf_conn_params_t);

	/* Note: the caller should have flushed the connections. */
	KASSERT(atomic_load_relaxed(&npf->conn_tracking) == CONN_TRACKING_OFF);

	/* Destroy the database first, then the caches backing its entries. */
	npf_conndb_destroy(npf->conn_db);
	pool_cache_destroy(npf->conn_cache[0]);
	pool_cache_destroy(npf->conn_cache[1]);
	mutex_destroy(&npf->conn_lock);

	/* Release the parameter group registered in npf_conn_init(). */
	npf_param_freegroup(npf, NPF_PARAMS_CONN, len);
	npf_conndb_sysfini(npf);
}
200 
201 /*
202  * npf_conn_load: perform the load by flushing the current connection
203  * database and replacing it with the new one or just destroying.
204  *
205  * => The caller must disable the connection tracking and ensure that
206  *    there are no connection database lookups or references in-flight.
207  */
void
npf_conn_load(npf_t *npf, npf_conndb_t *ndb, bool track)
{
	npf_conndb_t *odb = NULL;

	KASSERT(npf_config_locked_p(npf));

	/*
	 * The connection database is in the quiescent state.
	 * Prevent G/C thread from running and install a new database.
	 */
	mutex_enter(&npf->conn_lock);
	if (ndb) {
		/* A new database may only be installed while tracking is off. */
		KASSERT(atomic_load_relaxed(&npf->conn_tracking)
		    == CONN_TRACKING_OFF);
		odb = atomic_load_relaxed(&npf->conn_db);
		/* Release-store pairs with atomic_load_consume() in readers. */
		atomic_store_release(&npf->conn_db, ndb);
	}
	if (track) {
		/* After this point lookups start flying in. */
		membar_producer();
		atomic_store_relaxed(&npf->conn_tracking, CONN_TRACKING_ON);
	}
	mutex_exit(&npf->conn_lock);

	if (odb) {
		/*
		 * Flush all, no sync since the caller did it for us.
		 * Also, release the pool cache memory.
		 */
		npf_conndb_gc(npf, odb, true, false);
		npf_conndb_destroy(odb);
		pool_cache_invalidate(npf->conn_cache[0]);
		pool_cache_invalidate(npf->conn_cache[1]);
	}
}
244 
245 /*
246  * npf_conn_tracking: enable/disable connection tracking.
247  */
248 void
npf_conn_tracking(npf_t * npf,bool track)249 npf_conn_tracking(npf_t *npf, bool track)
250 {
251 	KASSERT(npf_config_locked_p(npf));
252 	atomic_store_relaxed(&npf->conn_tracking,
253 	    track ? CONN_TRACKING_ON : CONN_TRACKING_OFF);
254 }
255 
256 static inline bool
npf_conn_trackable_p(const npf_cache_t * npc)257 npf_conn_trackable_p(const npf_cache_t *npc)
258 {
259 	const npf_t *npf = npc->npc_ctx;
260 
261 	/*
262 	 * Check if connection tracking is on.  Also, if layer 3 and 4 are
263 	 * not cached - protocol is not supported or packet is invalid.
264 	 */
265 	if (atomic_load_relaxed(&npf->conn_tracking) != CONN_TRACKING_ON) {
266 		return false;
267 	}
268 	if (!npf_iscached(npc, NPC_IP46) || !npf_iscached(npc, NPC_LAYER4)) {
269 		return false;
270 	}
271 	return true;
272 }
273 
274 static inline void
conn_update_atime(npf_conn_t * con)275 conn_update_atime(npf_conn_t *con)
276 {
277 	struct timespec tsnow;
278 
279 	getnanouptime(&tsnow);
280 	atomic_store_relaxed(&con->c_atime, tsnow.tv_sec);
281 }
282 
283 /*
284  * npf_conn_check: check that:
285  *
286  *	- the connection is active;
287  *
288  *	- the packet is travelling in the right direction with the respect
289  *	  to the connection direction (if interface-id is not zero);
290  *
291  *	- the packet is travelling on the same interface as the
292  *	  connection interface (if interface-id is not zero).
293  */
294 static bool
npf_conn_check(const npf_conn_t * con,const nbuf_t * nbuf,const unsigned di,const npf_flow_t flow)295 npf_conn_check(const npf_conn_t *con, const nbuf_t *nbuf,
296     const unsigned di, const npf_flow_t flow)
297 {
298 	const uint32_t flags = atomic_load_relaxed(&con->c_flags);
299 	const unsigned ifid = atomic_load_relaxed(&con->c_ifid);
300 	bool active;
301 
302 	active = (flags & (CONN_ACTIVE | CONN_EXPIRE)) == CONN_ACTIVE;
303 	if (__predict_false(!active)) {
304 		return false;
305 	}
306 	if (ifid && nbuf) {
307 		const bool match = (flags & PFIL_ALL) == di;
308 		npf_flow_t pflow = match ? NPF_FLOW_FORW : NPF_FLOW_BACK;
309 
310 		if (__predict_false(flow != pflow)) {
311 			return false;
312 		}
313 		if (__predict_false(ifid != nbuf->nb_ifid)) {
314 			return false;
315 		}
316 	}
317 	return true;
318 }
319 
320 /*
321  * npf_conn_lookup: lookup if there is an established connection.
322  *
323  * => If found, we will hold a reference for the caller.
324  */
npf_conn_t *
npf_conn_lookup(const npf_cache_t *npc, const unsigned di, npf_flow_t *flow)
{
	npf_t *npf = npc->npc_ctx;
	const nbuf_t *nbuf = npc->npc_nbuf;
	npf_conn_t *con;
	npf_connkey_t key;

	/* Construct a key and lookup for a connection in the store. */
	if (!npf_conn_conkey(npc, &key, di, NPF_FLOW_FORW)) {
		return NULL;
	}
	/* On success, npf_conndb_lookup() holds a reference on 'con'. */
	con = npf_conndb_lookup(npf, &key, flow);
	if (con == NULL) {
		return NULL;
	}
	KASSERT(npc->npc_proto == atomic_load_relaxed(&con->c_proto));

	/* Extra checks for the connection and packet. */
	if (!npf_conn_check(con, nbuf, di, *flow)) {
		/* Drop the reference acquired by the lookup. */
		atomic_dec_uint(&con->c_refcnt);
		return NULL;
	}

	/* Update the last activity time. */
	conn_update_atime(con);
	return con;
}
353 
354 /*
 * npf_conn_inspect: look up a connection and inspect the protocol data.
356  *
357  * => If found, we will hold a reference for the caller.
358  */
npf_conn_t *
npf_conn_inspect(npf_cache_t *npc, const unsigned di, int *error)
{
	nbuf_t *nbuf = npc->npc_nbuf;
	npf_flow_t flow;
	npf_conn_t *con;
	bool ok;

	KASSERT(!nbuf_flag_p(nbuf, NBUF_DATAREF_RESET));
	if (!npf_conn_trackable_p(npc)) {
		return NULL;
	}

	/* Query ALG which may lookup connection for us. */
	if ((con = npf_alg_conn(npc, di)) != NULL) {
		/* Note: reference is held. */
		return con;
	}
	if (nbuf_head_mbuf(nbuf) == NULL) {
		*error = ENOMEM;
		return NULL;
	}
	KASSERT(!nbuf_flag_p(nbuf, NBUF_DATAREF_RESET));

	/* The main lookup of the connection (acquires a reference). */
	if ((con = npf_conn_lookup(npc, di, &flow)) == NULL) {
		return NULL;
	}

	/* Inspect the protocol data and handle state changes. */
	mutex_enter(&con->c_lock);
	ok = npf_state_inspect(npc, &con->c_state, flow);
	mutex_exit(&con->c_lock);

	/* If invalid state: let the rules deal with it. */
	if (__predict_false(!ok)) {
		npf_conn_release(con);
		npf_stats_inc(npc->npc_ctx, NPF_STAT_INVALID_STATE);
		return NULL;
	}
#if 0
	/*
	 * TODO -- determine when this might be wanted/used.
	 *
	 * Note: skipping the connection lookup and ruleset inspection
	 * on other interfaces will also bypass dynamic NAT.
	 */
	if (atomic_load_relaxed(&con->c_flags) & CONN_GPASS) {
		/*
		 * Note: if tagging fails, then give this packet a chance
		 * to go through a regular ruleset.
		 */
		(void)nbuf_add_tag(nbuf, NPF_NTAG_PASS);
	}
#endif
	return con;
}
416 
417 /*
418  * npf_conn_establish: create a new connection, insert into the global list.
419  *
420  * => Connection is created with the reference held for the caller.
421  * => Connection will be activated on the first reference release.
422  */
npf_conn_t *
npf_conn_establish(npf_cache_t *npc, const unsigned di, bool global)
{
	npf_t *npf = npc->npc_ctx;
	const unsigned alen = npc->npc_alen;
	const unsigned idx = NPF_CONNCACHE(alen);
	const nbuf_t *nbuf = npc->npc_nbuf;
	npf_connkey_t *fw, *bk;
	npf_conndb_t *conn_db;
	npf_conn_t *con;
	int error = 0;

	KASSERT(!nbuf_flag_p(nbuf, NBUF_DATAREF_RESET));

	if (!npf_conn_trackable_p(npc)) {
		return NULL;
	}

	/* Allocate and initialize the new connection. */
	con = pool_cache_get(npf->conn_cache[idx], PR_NOWAIT);
	if (__predict_false(!con)) {
		/* Out of memory: kick the G/C worker to reclaim connections. */
		npf_worker_signal(npf);
		return NULL;
	}
	NPF_PRINTF(("NPF: create conn %p\n", con));
	npf_stats_inc(npf, NPF_STAT_CONN_CREATE);

	/* The direction bits (PFIL_IN/PFIL_OUT) are stored in c_flags. */
	mutex_init(&con->c_lock, MUTEX_DEFAULT, IPL_SOFTNET);
	atomic_store_relaxed(&con->c_flags, di & PFIL_ALL);
	atomic_store_relaxed(&con->c_refcnt, 0);
	con->c_rproc = NULL;
	con->c_nat = NULL;

	con->c_proto = npc->npc_proto;
	CTASSERT(sizeof(con->c_proto) >= sizeof(npc->npc_proto));
	con->c_alen = alen;

	/* Initialize the protocol state. */
	if (!npf_state_init(npc, &con->c_state)) {
		npf_conn_destroy(npf, con);
		return NULL;
	}
	KASSERT(npf_iscached(npc, NPC_IP46));

	fw = npf_conn_getforwkey(con);
	bk = npf_conn_getbackkey(con, alen);

	/*
	 * Construct "forwards" and "backwards" keys.  Also, set the
	 * interface ID for this connection (unless it is global).
	 */
	if (!npf_conn_conkey(npc, fw, di, NPF_FLOW_FORW) ||
	    !npf_conn_conkey(npc, bk, di ^ PFIL_ALL, NPF_FLOW_BACK)) {
		npf_conn_destroy(npf, con);
		return NULL;
	}
	con->c_ifid = global ? nbuf->nb_ifid : 0;

	/*
	 * Set last activity time for a new connection and acquire
	 * a reference for the caller before we make it visible.
	 */
	conn_update_atime(con);
	atomic_store_relaxed(&con->c_refcnt, 1);

	/*
	 * Insert both keys (entries representing directions) of the
	 * connection.  At this point it becomes visible, but we activate
	 * the connection later.
	 */
	mutex_enter(&con->c_lock);
	conn_db = atomic_load_consume(&npf->conn_db);
	if (!npf_conndb_insert(conn_db, fw, con, NPF_FLOW_FORW)) {
		error = EISCONN;
		goto err;
	}
	if (!npf_conndb_insert(conn_db, bk, con, NPF_FLOW_BACK)) {
		/* Back off: undo the "forwards" insertion. */
		npf_conn_t *ret __diagused;
		ret = npf_conndb_remove(conn_db, fw);
		KASSERT(ret == con);
		error = EISCONN;
		goto err;
	}
err:
	/*
	 * Note: the success path also falls through to this label with
	 * error == 0, so the enqueue below runs in both cases.
	 *
	 * If we have hit the duplicate: mark the connection as expired
	 * and let the G/C thread to take care of it.  We cannot do it
	 * here since there might be references acquired already.
	 */
	if (error) {
		atomic_or_uint(&con->c_flags, CONN_REMOVED | CONN_EXPIRE);
		atomic_dec_uint(&con->c_refcnt);
		npf_stats_inc(npf, NPF_STAT_RACE_CONN);
	} else {
		NPF_PRINTF(("NPF: establish conn %p\n", con));
	}

	/* Finally, insert into the connection list. */
	npf_conndb_enqueue(conn_db, con);
	mutex_exit(&con->c_lock);

	return error ? NULL : con;
}
526 
/*
 * npf_conn_destroy: release all resources associated with the connection
 * and return it to the appropriate (IPv4 or IPv6) pool cache.
 *
 * => The connection must have no references and its entries must have
 *    already been removed from the database.
 */
void
npf_conn_destroy(npf_t *npf, npf_conn_t *con)
{
	const unsigned idx __unused = NPF_CONNCACHE(con->c_alen);

	KASSERT(atomic_load_relaxed(&con->c_refcnt) == 0);

	if (con->c_nat) {
		/* Release any NAT structures. */
		npf_nat_destroy(con, con->c_nat);
	}
	if (con->c_rproc) {
		/* Release the rule procedure. */
		npf_rproc_release(con->c_rproc);
	}

	/* Destroy the state. */
	npf_state_destroy(&con->c_state);
	mutex_destroy(&con->c_lock);

	/* Free the structure, increase the counter. */
	pool_cache_put(npf->conn_cache[idx], con);
	npf_stats_inc(npf, NPF_STAT_CONN_DESTROY);
	NPF_PRINTF(("NPF: conn %p destroyed\n", con));
}
552 
553 /*
554  * npf_conn_setnat: associate NAT entry with the connection, update and
555  * re-insert connection entry using the translation values.
556  *
557  * => The caller must be holding a reference.
558  */
int
npf_conn_setnat(const npf_cache_t *npc, npf_conn_t *con,
    npf_nat_t *nt, unsigned ntype)
{
	static const unsigned nat_type_which[] = {
		/* See the description in npf_nat_which(). */
		[NPF_NATOUT] = NPF_DST,
		[NPF_NATIN] = NPF_SRC,
	};
	npf_t *npf = npc->npc_ctx;
	npf_conn_t *ret __diagused;
	npf_conndb_t *conn_db;
	npf_connkey_t *bk;
	npf_addr_t *taddr;
	in_port_t tport;
	uint32_t flags;

	KASSERT(atomic_load_relaxed(&con->c_refcnt) > 0);

	/* Fetch the translated address and port from the NAT entry. */
	npf_nat_gettrans(nt, &taddr, &tport);
	KASSERT(ntype == NPF_NATOUT || ntype == NPF_NATIN);

	/* Acquire the lock and check for the races. */
	mutex_enter(&con->c_lock);
	flags = atomic_load_relaxed(&con->c_flags);
	if (__predict_false(flags & CONN_EXPIRE)) {
		/* The connection got expired. */
		mutex_exit(&con->c_lock);
		return EINVAL;
	}
	KASSERT((flags & CONN_REMOVED) == 0);

	if (__predict_false(con->c_nat != NULL)) {
		/* Race with a duplicate packet. */
		mutex_exit(&con->c_lock);
		npf_stats_inc(npc->npc_ctx, NPF_STAT_RACE_NAT);
		return EISCONN;
	}

	/* Remove the "backwards" key. */
	conn_db = atomic_load_consume(&npf->conn_db);
	bk = npf_conn_getbackkey(con, con->c_alen);
	ret = npf_conndb_remove(conn_db, bk);
	KASSERT(ret == con);

	/* Set the source/destination IDs to the translation values. */
	npf_conn_adjkey(bk, taddr, tport, nat_type_which[ntype]);

	/* Finally, re-insert the "backwards" key. */
	if (!npf_conndb_insert(conn_db, bk, con, NPF_FLOW_BACK)) {
		/*
		 * Race: we have hit the duplicate, remove the "forwards"
		 * key and expire our connection; it is no longer valid.
		 */
		npf_connkey_t *fw = npf_conn_getforwkey(con);
		ret = npf_conndb_remove(conn_db, fw);
		KASSERT(ret == con);

		atomic_or_uint(&con->c_flags, CONN_REMOVED | CONN_EXPIRE);
		mutex_exit(&con->c_lock);

		npf_stats_inc(npc->npc_ctx, NPF_STAT_RACE_NAT);
		return EISCONN;
	}

	/* Associate the NAT entry and release the lock. */
	con->c_nat = nt;
	mutex_exit(&con->c_lock);
	return 0;
}
629 
630 /*
631  * npf_conn_expire: explicitly mark connection as expired.
632  *
633  * => Must be called with: a) reference held  b) the relevant lock held.
634  *    The relevant lock should prevent from connection destruction, e.g.
635  *    npf_t::conn_lock or npf_natpolicy_t::n_lock.
636  */
void
npf_conn_expire(npf_conn_t *con)
{
	/* The G/C will observe the flag and remove/destroy the connection. */
	atomic_or_uint(&con->c_flags, CONN_EXPIRE);
}
642 
643 /*
644  * npf_conn_pass: return true if connection is "pass" one, otherwise false.
645  */
646 bool
npf_conn_pass(const npf_conn_t * con,npf_match_info_t * mi,npf_rproc_t ** rp)647 npf_conn_pass(const npf_conn_t *con, npf_match_info_t *mi, npf_rproc_t **rp)
648 {
649 	KASSERT(atomic_load_relaxed(&con->c_refcnt) > 0);
650 	if (__predict_true(atomic_load_relaxed(&con->c_flags) & CONN_PASS)) {
651 		mi->mi_retfl = atomic_load_relaxed(&con->c_retfl);
652 		mi->mi_rid = con->c_rid;
653 		*rp = con->c_rproc;
654 		return true;
655 	}
656 	return false;
657 }
658 
659 /*
660  * npf_conn_setpass: mark connection as a "pass" one and associate the
661  * rule procedure with it.
662  */
void
npf_conn_setpass(npf_conn_t *con, const npf_match_info_t *mi, npf_rproc_t *rp)
{
	/* Only valid before the connection has been activated. */
	KASSERT((atomic_load_relaxed(&con->c_flags) & CONN_ACTIVE) == 0);
	KASSERT(atomic_load_relaxed(&con->c_refcnt) > 0);
	KASSERT(con->c_rproc == NULL);

	/*
	 * No need for atomic since the connection is not yet active.
	 * If rproc is set, the caller transfers its reference to us,
	 * which will be released on npf_conn_destroy().
	 */
	atomic_or_uint(&con->c_flags, CONN_PASS);
	con->c_rproc = rp;
	if (rp) {
		/* Save the rule ID and return flags for npf_conn_pass(). */
		con->c_rid = mi->mi_rid;
		con->c_retfl = mi->mi_retfl;
	}
}
682 
683 /*
684  * npf_conn_release: release a reference, which might allow G/C thread
685  * to destroy this connection.
686  */
void
npf_conn_release(npf_conn_t *con)
{
	const unsigned flags = atomic_load_relaxed(&con->c_flags);

	if ((flags & (CONN_ACTIVE | CONN_EXPIRE)) == 0) {
		/* Activate: after this, connection is globally visible. */
		atomic_or_uint(&con->c_flags, CONN_ACTIVE);
	}
	KASSERT(atomic_load_relaxed(&con->c_refcnt) > 0);
	/* Drop the reference; the G/C may now destroy the connection. */
	atomic_dec_uint(&con->c_refcnt);
}
699 
700 /*
701  * npf_conn_getnat: return the associated NAT entry, if any.
702  */
703 npf_nat_t *
npf_conn_getnat(const npf_conn_t * con)704 npf_conn_getnat(const npf_conn_t *con)
705 {
706 	return con->c_nat;
707 }
708 
709 /*
710  * npf_conn_expired: criterion to check if connection is expired.
711  */
bool
npf_conn_expired(npf_t *npf, const npf_conn_t *con, uint64_t tsnow)
{
	const unsigned flags = atomic_load_relaxed(&con->c_flags);
	/* Protocol/state-dependent expiration timeout, in seconds. */
	const int etime = npf_state_etime(npf, &con->c_state, con->c_proto);
	int elapsed;

	if (__predict_false(flags & CONN_EXPIRE)) {
		/* Explicitly marked to be expired. */
		return true;
	}

	/*
	 * Note: another thread may update 'atime' and it might
	 * become greater than 'now'.
	 */
	elapsed = (int64_t)tsnow - atomic_load_relaxed(&con->c_atime);
	return elapsed > etime;
}
731 
732 /*
733  * npf_conn_remove: unlink the connection and mark as expired.
734  */
void
npf_conn_remove(npf_conndb_t *cd, npf_conn_t *con)
{
	/* Remove both entries of the connection. */
	mutex_enter(&con->c_lock);
	if ((atomic_load_relaxed(&con->c_flags) & CONN_REMOVED) == 0) {
		npf_connkey_t *fw, *bk;
		npf_conn_t *ret __diagused;

		fw = npf_conn_getforwkey(con);
		ret = npf_conndb_remove(cd, fw);
		KASSERT(ret == con);

		/* The backwards key follows; its offset depends on alen. */
		bk = npf_conn_getbackkey(con, NPF_CONNKEY_ALEN(fw));
		ret = npf_conndb_remove(cd, bk);
		KASSERT(ret == con);
	}

	/* Flag the removal and expiration. */
	atomic_or_uint(&con->c_flags, CONN_REMOVED | CONN_EXPIRE);
	mutex_exit(&con->c_lock);
}
757 
758 /*
759  * npf_conn_worker: G/C to run from a worker thread or via npfk_gc().
760  */
761 void
npf_conn_worker(npf_t * npf)762 npf_conn_worker(npf_t *npf)
763 {
764 	npf_conndb_t *conn_db = atomic_load_consume(&npf->conn_db);
765 	npf_conndb_gc(npf, conn_db, false, true);
766 }
767 
768 /*
769  * npf_conndb_export: construct a list of connections prepared for saving.
770  * Note: this is expected to be an expensive operation.
771  */
int
npf_conndb_export(npf_t *npf, nvlist_t *nvl)
{
	npf_conn_t *head, *con;
	npf_conndb_t *conn_db;

	/*
	 * Note: acquire conn_lock to prevent from the database
	 * destruction and G/C thread.
	 */
	mutex_enter(&npf->conn_lock);
	if (atomic_load_relaxed(&npf->conn_tracking) != CONN_TRACKING_ON) {
		/* Tracking is off -- nothing to export. */
		mutex_exit(&npf->conn_lock);
		return 0;
	}
	conn_db = atomic_load_relaxed(&npf->conn_db);
	head = npf_conndb_getlist(conn_db);
	con = head;
	while (con) {
		nvlist_t *con_nvl;

		/* Skip connections npf_conn_export() rejects (non-zero return). */
		con_nvl = nvlist_create(0);
		if (npf_conn_export(npf, con, con_nvl) == 0) {
			nvlist_append_nvlist_array(nvl, "conn-list", con_nvl);
		}
		nvlist_destroy(con_nvl);

		/* The list is circular: stop when we wrap around to the head. */
		if ((con = npf_conndb_getnext(conn_db, con)) == head) {
			break;
		}
	}
	mutex_exit(&npf->conn_lock);
	return 0;
}
806 
807 /*
808  * npf_conn_export: serialize a single connection.
809  */
static int
npf_conn_export(npf_t *npf, npf_conn_t *con, nvlist_t *nvl)
{
	nvlist_t *knvl;
	npf_connkey_t *fw, *bk;
	unsigned flags, alen;

	flags = atomic_load_relaxed(&con->c_flags);
	if ((flags & (CONN_ACTIVE|CONN_EXPIRE)) != CONN_ACTIVE) {
		/* Only active, non-expiring connections are exported. */
		return ESRCH;
	}
	nvlist_add_number(nvl, "flags", flags);
	nvlist_add_number(nvl, "proto", con->c_proto);
	if (con->c_ifid) {
		/* Export the interface by name, not by the volatile ID. */
		char ifname[IFNAMSIZ];
		npf_ifmap_copyname(npf, con->c_ifid, ifname, sizeof(ifname));
		nvlist_add_string(nvl, "ifname", ifname);
	}
	nvlist_add_binary(nvl, "state", &con->c_state, sizeof(npf_state_t));

	fw = npf_conn_getforwkey(con);
	alen = NPF_CONNKEY_ALEN(fw);
	KASSERT(alen == con->c_alen);
	bk = npf_conn_getbackkey(con, alen);

	knvl = npf_connkey_export(npf, fw);
	nvlist_move_nvlist(nvl, "forw-key", knvl);

	knvl = npf_connkey_export(npf, bk);
	nvlist_move_nvlist(nvl, "back-key", knvl);

	/* Let the address length be based on the first key. */
	nvlist_add_number(nvl, "alen", alen);

	if (con->c_nat) {
		npf_nat_export(npf, con->c_nat, nvl);
	}
	return 0;
}
849 
850 /*
851  * npf_conn_import: fully reconstruct a single connection from a
852  * nvlist and insert into the given database.
853  */
int
npf_conn_import(npf_t *npf, npf_conndb_t *cd, const nvlist_t *cdict,
    npf_ruleset_t *natlist)
{
	npf_conn_t *con;
	npf_connkey_t *fw, *bk;
	const nvlist_t *nat, *conkey;
	unsigned flags, alen, idx;
	const char *ifname;
	const void *state;
	size_t len;

	/*
	 * To determine the length of the connection, which depends
	 * on the address length in the connection keys.
	 */
	alen = dnvlist_get_number(cdict, "alen", 0);
	idx = NPF_CONNCACHE(alen);

	/*
	 * Allocate a connection and initialize it (clear first).
	 * Note: only the fixed-size head of the structure is zeroed;
	 * the variable-length c_keys[] tail is filled in below.
	 */
	con = pool_cache_get(npf->conn_cache[idx], PR_WAITOK);
	memset(con, 0, sizeof(npf_conn_t));
	mutex_init(&con->c_lock, MUTEX_DEFAULT, IPL_SOFTNET);
	npf_stats_inc(npf, NPF_STAT_CONN_CREATE);

	con->c_proto = dnvlist_get_number(cdict, "proto", 0);
	flags = dnvlist_get_number(cdict, "flags", 0);
	/* Accept only the direction bits and a safe subset of the flags. */
	flags &= PFIL_ALL | CONN_ACTIVE | CONN_PASS;
	atomic_store_relaxed(&con->c_flags, flags);
	conn_update_atime(con);

	ifname = dnvlist_get_string(cdict, "ifname", NULL);
	if (ifname && (con->c_ifid = npf_ifmap_register(npf, ifname)) == 0) {
		goto err;
	}

	state = dnvlist_get_binary(cdict, "state", &len, NULL, 0);
	if (!state || len != sizeof(npf_state_t)) {
		goto err;
	}
	memcpy(&con->c_state, state, sizeof(npf_state_t));

	/* Reconstruct NAT association, if any. */
	if ((nat = dnvlist_get_nvlist(cdict, "nat", NULL)) != NULL &&
	    (con->c_nat = npf_nat_import(npf, nat, natlist, con)) == NULL) {
		goto err;
	}

	/*
	 * Fetch and copy the keys for each direction.
	 */
	fw = npf_conn_getforwkey(con);
	conkey = dnvlist_get_nvlist(cdict, "forw-key", NULL);
	if (conkey == NULL || !npf_connkey_import(npf, conkey, fw)) {
		goto err;
	}
	bk = npf_conn_getbackkey(con, NPF_CONNKEY_ALEN(fw));
	conkey = dnvlist_get_nvlist(cdict, "back-key", NULL);
	if (conkey == NULL || !npf_connkey_import(npf, conkey, bk)) {
		goto err;
	}

	/* Guard against the contradicting address lengths. */
	if (NPF_CONNKEY_ALEN(fw) != alen || NPF_CONNKEY_ALEN(bk) != alen) {
		goto err;
	}

	/* Insert the entries and the connection itself. */
	if (!npf_conndb_insert(cd, fw, con, NPF_FLOW_FORW)) {
		goto err;
	}
	if (!npf_conndb_insert(cd, bk, con, NPF_FLOW_BACK)) {
		/* Undo the "forwards" insertion on failure. */
		npf_conndb_remove(cd, fw);
		goto err;
	}

	NPF_PRINTF(("NPF: imported conn %p\n", con));
	npf_conndb_enqueue(cd, con);
	return 0;
err:
	/* Safe: refcnt is zero and the entries were never (or no longer) inserted. */
	npf_conn_destroy(npf, con);
	return EINVAL;
}
937 
938 /*
939  * npf_conn_find: lookup a connection in the list of connections
940  */
int
npf_conn_find(npf_t *npf, const nvlist_t *req, nvlist_t *resp)
{
	const nvlist_t *key_nv;
	npf_conn_t *con;
	npf_connkey_t key;
	npf_flow_t flow;
	int error;

	/* The request must carry an importable connection key. */
	key_nv = dnvlist_get_nvlist(req, "key", NULL);
	if (!key_nv || !npf_connkey_import(npf, key_nv, &key)) {
		return EINVAL;
	}
	/* On success, a reference on 'con' is held for us. */
	con = npf_conndb_lookup(npf, &key, &flow);
	if (con == NULL) {
		return ESRCH;
	}
	/* No packet here: only the active/expired check applies (nbuf == NULL). */
	if (!npf_conn_check(con, NULL, 0, NPF_FLOW_FORW)) {
		atomic_dec_uint(&con->c_refcnt);
		return ESRCH;
	}
	error = npf_conn_export(npf, con, resp);
	nvlist_add_number(resp, "flow", flow);
	/* Drop the lookup reference. */
	atomic_dec_uint(&con->c_refcnt);
	return error;
}
967 
968 #if defined(DDB) || defined(_NPF_TESTING)
969 
/*
 * npf_conn_print: dump a connection to the console -- debugging aid
 * for DDB and the NPF test harness.
 */
void
npf_conn_print(npf_conn_t *con)
{
	const npf_connkey_t *fw = npf_conn_getforwkey(con);
	const npf_connkey_t *bk = npf_conn_getbackkey(con, NPF_CONNKEY_ALEN(fw));
	const unsigned flags = atomic_load_relaxed(&con->c_flags);
	const unsigned proto = con->c_proto;
	struct timespec tspnow;

	getnanouptime(&tspnow);
	/* tsdiff: seconds since the last activity; etime: expiry timeout. */
	printf("%p:\n\tproto %d flags 0x%x tsdiff %ld etime %d\n", con,
	    proto, flags, (long)(tspnow.tv_sec - con->c_atime),
	    npf_state_etime(npf_getkernctx(), &con->c_state, proto));
	npf_connkey_print(fw);
	npf_connkey_print(bk);
	npf_state_dump(&con->c_state);
	if (con->c_nat) {
		npf_nat_dump(con->c_nat);
	}
}
990 
991 #endif
992