xref: /netbsd-src/sys/net/npf/npf_conn.c (revision e6c7e151de239c49d2e38720a061ed9d1fa99309)
1 /*-
2  * Copyright (c) 2014-2018 Mindaugas Rasiukevicius <rmind at netbsd org>
3  * Copyright (c) 2010-2014 The NetBSD Foundation, Inc.
4  * All rights reserved.
5  *
6  * This material is based upon work partially supported by The
7  * NetBSD Foundation under a contract with Mindaugas Rasiukevicius.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
19  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
20  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
22  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 /*
32  * NPF connection tracking for stateful filtering and translation.
33  *
34  * Overview
35  *
36  *	Packets can be incoming or outgoing with respect to an interface.
37  *	Connection direction is identified by the direction of its first
38  *	packet.  The meaning of incoming/outgoing packet in the context of
39  *	connection direction can be confusing.  Therefore, we will use the
40  *	terms "forwards stream" and "backwards stream", where packets in
 *	the forwards stream mean the packets travelling in the same
 *	direction as the connection direction.
43  *
44  *	All connections have two keys and thus two entries:
45  *
46  *	- npf_conn_getforwkey(con)        -- for the forwards stream;
47  *	- npf_conn_getbackkey(con, alen)  -- for the backwards stream.
48  *
49  *	Note: the keys are stored in npf_conn_t::c_keys[], which is used
50  *	to allocate variable-length npf_conn_t structures based on whether
51  *	the IPv4 or IPv6 addresses are used.  See the npf_connkey.c source
52  *	file for the description of the key layouts.
53  *
54  *	The keys are formed from the 5-tuple (source/destination address,
55  *	source/destination port and the protocol).  Additional matching
56  *	is performed for the interface (a common behaviour is equivalent
57  *	to the 6-tuple lookup including the interface ID).  Note that the
58  *	key may be formed using translated values in a case of NAT.
59  *
60  *	Connections can serve two purposes: for the implicit passing or
61  *	to accommodate the dynamic NAT.  Connections for the former purpose
62  *	are created by the rules with "stateful" attribute and are used for
63  *	stateful filtering.  Such connections indicate that the packet of
64  *	the backwards stream should be passed without inspection of the
65  *	ruleset.  The other purpose is to associate a dynamic NAT mechanism
66  *	with a connection.  Such connections are created by the NAT policies
67  *	and they have a relationship with NAT translation structure via
68  *	npf_conn_t::c_nat.  A single connection can serve both purposes,
69  *	which is a common case.
70  *
71  * Connection life-cycle
72  *
73  *	Connections are established when a packet matches said rule or
74  *	NAT policy.  Both keys of the established connection are inserted
75  *	into the connection database.  A garbage collection thread
76  *	periodically scans all connections and depending on connection
77  *	properties (e.g. last activity time, protocol) removes connection
78  *	entries and expires the actual connections.
79  *
80  *	Each connection has a reference count.  The reference is acquired
81  *	on lookup and should be released by the caller.  It guarantees that
82  *	the connection will not be destroyed, although it may be expired.
83  *
84  * Synchronisation
85  *
86  *	Connection database is accessed in a lock-less manner by the main
87  *	routines: npf_conn_inspect() and npf_conn_establish().  Since they
88  *	are always called from a software interrupt, the database is
89  *	protected using passive serialisation.  The main place which can
90  *	destroy a connection is npf_conn_worker().  The database itself
 *	can be replaced and destroyed in npf_conn_load().
92  *
93  * ALG support
94  *
95  *	Application-level gateways (ALGs) can override generic connection
96  *	inspection (npf_alg_conn() call in npf_conn_inspect() function) by
97  *	performing their own lookup using different key.  Recursive call
98  *	to npf_conn_inspect() is not allowed.  The ALGs ought to use the
99  *	npf_conn_lookup() function for this purpose.
100  *
101  * Lock order
102  *
103  *	npf_config_lock ->
104  *		conn_lock ->
105  *			npf_conn_t::c_lock
106  */
107 
108 #ifdef _KERNEL
109 #include <sys/cdefs.h>
110 __KERNEL_RCSID(0, "$NetBSD: npf_conn.c,v 1.30 2019/09/29 17:00:29 rmind Exp $");
111 
112 #include <sys/param.h>
113 #include <sys/types.h>
114 
115 #include <netinet/in.h>
116 #include <netinet/tcp.h>
117 
118 #include <sys/atomic.h>
119 #include <sys/kmem.h>
120 #include <sys/mutex.h>
121 #include <net/pfil.h>
122 #include <sys/pool.h>
123 #include <sys/queue.h>
124 #include <sys/systm.h>
125 #endif
126 
127 #define __NPF_CONN_PRIVATE
128 #include "npf_conn.h"
129 #include "npf_impl.h"
130 
/*
 * Pick the connection cache index from the address length: bit 4 of the
 * length is clear for 4-byte (IPv4) and set for 16-byte (IPv6) addresses.
 */
#define	NPF_CONNCACHE(alen)	(((alen) & 0x10) >> 4)
133 
134 /*
135  * Connection flags: PFIL_IN and PFIL_OUT values are reserved for direction.
136  */
137 CTASSERT(PFIL_ALL == (0x001 | 0x002));
138 #define	CONN_ACTIVE	0x004	/* visible on inspection */
139 #define	CONN_PASS	0x008	/* perform implicit passing */
140 #define	CONN_EXPIRE	0x010	/* explicitly expire */
141 #define	CONN_REMOVED	0x020	/* "forw/back" entries removed */
142 
143 enum { CONN_TRACKING_OFF, CONN_TRACKING_ON };
144 
145 static nvlist_t *npf_conn_export(npf_t *, npf_conn_t *);
146 
147 /*
148  * npf_conn_sys{init,fini}: initialise/destroy connection tracking.
149  */
150 
151 void
152 npf_conn_init(npf_t *npf)
153 {
154 	npf->conn_cache[0] = pool_cache_init(
155 	    offsetof(npf_conn_t, c_keys[NPF_CONNKEY_V4WORDS * 2]),
156 	    0, 0, 0, "npfcn4pl", NULL, IPL_NET, NULL, NULL, NULL);
157 	npf->conn_cache[1] = pool_cache_init(
158 	    offsetof(npf_conn_t, c_keys[NPF_CONNKEY_V6WORDS * 2]),
159 	    0, 0, 0, "npfcn6pl", NULL, IPL_NET, NULL, NULL, NULL);
160 
161 	mutex_init(&npf->conn_lock, MUTEX_DEFAULT, IPL_NONE);
162 	npf->conn_tracking = CONN_TRACKING_OFF;
163 	npf->conn_db = npf_conndb_create();
164 	npf_conndb_sysinit(npf);
165 }
166 
167 void
168 npf_conn_fini(npf_t *npf)
169 {
170 	npf_conndb_sysfini(npf);
171 
172 	/* Note: the caller should have flushed the connections. */
173 	KASSERT(npf->conn_tracking == CONN_TRACKING_OFF);
174 	npf_worker_unregister(npf, npf_conn_worker);
175 
176 	npf_conndb_destroy(npf->conn_db);
177 	pool_cache_destroy(npf->conn_cache[0]);
178 	pool_cache_destroy(npf->conn_cache[1]);
179 	mutex_destroy(&npf->conn_lock);
180 }
181 
182 /*
183  * npf_conn_load: perform the load by flushing the current connection
184  * database and replacing it with the new one or just destroying.
185  *
186  * => The caller must disable the connection tracking and ensure that
187  *    there are no connection database lookups or references in-flight.
188  */
189 void
190 npf_conn_load(npf_t *npf, npf_conndb_t *ndb, bool track)
191 {
192 	npf_conndb_t *odb = NULL;
193 
194 	KASSERT(npf_config_locked_p(npf));
195 
196 	/*
197 	 * The connection database is in the quiescent state.
198 	 * Prevent G/C thread from running and install a new database.
199 	 */
200 	mutex_enter(&npf->conn_lock);
201 	if (ndb) {
202 		KASSERT(npf->conn_tracking == CONN_TRACKING_OFF);
203 		odb = npf->conn_db;
204 		npf->conn_db = ndb;
205 		membar_sync();
206 	}
207 	if (track) {
208 		/* After this point lookups start flying in. */
209 		npf->conn_tracking = CONN_TRACKING_ON;
210 	}
211 	mutex_exit(&npf->conn_lock);
212 
213 	if (odb) {
214 		/*
215 		 * Flush all, no sync since the caller did it for us.
216 		 * Also, release the pool cache memory.
217 		 */
218 		npf_conndb_gc(npf, odb, true, false);
219 		npf_conndb_destroy(odb);
220 		pool_cache_invalidate(npf->conn_cache[0]);
221 		pool_cache_invalidate(npf->conn_cache[1]);
222 	}
223 }
224 
225 /*
226  * npf_conn_tracking: enable/disable connection tracking.
227  */
228 void
229 npf_conn_tracking(npf_t *npf, bool track)
230 {
231 	KASSERT(npf_config_locked_p(npf));
232 	npf->conn_tracking = track ? CONN_TRACKING_ON : CONN_TRACKING_OFF;
233 }
234 
235 static inline bool
236 npf_conn_trackable_p(const npf_cache_t *npc)
237 {
238 	const npf_t *npf = npc->npc_ctx;
239 
240 	/*
241 	 * Check if connection tracking is on.  Also, if layer 3 and 4 are
242 	 * not cached - protocol is not supported or packet is invalid.
243 	 */
244 	if (npf->conn_tracking != CONN_TRACKING_ON) {
245 		return false;
246 	}
247 	if (!npf_iscached(npc, NPC_IP46) || !npf_iscached(npc, NPC_LAYER4)) {
248 		return false;
249 	}
250 	return true;
251 }
252 
253 static inline void
254 conn_update_atime(npf_conn_t *con)
255 {
256 	struct timespec tsnow;
257 
258 	getnanouptime(&tsnow);
259 	con->c_atime = tsnow.tv_sec;
260 }
261 
262 /*
263  * npf_conn_check: check that:
264  *
265  *	- the connection is active;
266  *
267  *	- the packet is travelling in the right direction with the respect
268  *	  to the connection direction (if interface-id is not zero);
269  *
270  *	- the packet is travelling on the same interface as the
271  *	  connection interface (if interface-id is not zero).
272  */
273 static bool
274 npf_conn_check(const npf_conn_t *con, const nbuf_t *nbuf,
275     const unsigned di, const bool forw)
276 {
277 	const uint32_t flags = con->c_flags;
278 	const unsigned ifid = con->c_ifid;
279 	bool active, pforw;
280 
281 	active = (flags & (CONN_ACTIVE | CONN_EXPIRE)) == CONN_ACTIVE;
282 	if (__predict_false(!active)) {
283 		return false;
284 	}
285 	if (ifid && nbuf) {
286 		pforw = (flags & PFIL_ALL) == (unsigned)di;
287 		if (__predict_false(forw != pforw)) {
288 			return false;
289 		}
290 		if (__predict_false(ifid != nbuf->nb_ifid)) {
291 			return false;
292 		}
293 	}
294 	return true;
295 }
296 
297 /*
298  * npf_conn_lookup: lookup if there is an established connection.
299  *
300  * => If found, we will hold a reference for the caller.
301  */
302 npf_conn_t *
303 npf_conn_lookup(const npf_cache_t *npc, const int di, bool *forw)
304 {
305 	npf_t *npf = npc->npc_ctx;
306 	const nbuf_t *nbuf = npc->npc_nbuf;
307 	npf_conn_t *con;
308 	npf_connkey_t key;
309 
310 	/* Construct a key and lookup for a connection in the store. */
311 	if (!npf_conn_conkey(npc, &key, true)) {
312 		return NULL;
313 	}
314 	con = npf_conndb_lookup(npf->conn_db, &key, forw);
315 	if (con == NULL) {
316 		return NULL;
317 	}
318 	KASSERT(npc->npc_proto == con->c_proto);
319 
320 	/* Extra checks for the connection and packet. */
321 	if (!npf_conn_check(con, nbuf, di, *forw)) {
322 		atomic_dec_uint(&con->c_refcnt);
323 		return NULL;
324 	}
325 
326 	/* Update the last activity time. */
327 	conn_update_atime(con);
328 	return con;
329 }
330 
331 /*
332  * npf_conn_inspect: lookup a connection and inspecting the protocol data.
333  *
334  * => If found, we will hold a reference for the caller.
335  */
336 npf_conn_t *
337 npf_conn_inspect(npf_cache_t *npc, const int di, int *error)
338 {
339 	nbuf_t *nbuf = npc->npc_nbuf;
340 	npf_conn_t *con;
341 	bool forw, ok;
342 
343 	KASSERT(!nbuf_flag_p(nbuf, NBUF_DATAREF_RESET));
344 	if (!npf_conn_trackable_p(npc)) {
345 		return NULL;
346 	}
347 
348 	/* Query ALG which may lookup connection for us. */
349 	if ((con = npf_alg_conn(npc, di)) != NULL) {
350 		/* Note: reference is held. */
351 		return con;
352 	}
353 	if (nbuf_head_mbuf(nbuf) == NULL) {
354 		*error = ENOMEM;
355 		return NULL;
356 	}
357 	KASSERT(!nbuf_flag_p(nbuf, NBUF_DATAREF_RESET));
358 
359 	/* Main lookup of the connection. */
360 	if ((con = npf_conn_lookup(npc, di, &forw)) == NULL) {
361 		return NULL;
362 	}
363 
364 	/* Inspect the protocol data and handle state changes. */
365 	mutex_enter(&con->c_lock);
366 	ok = npf_state_inspect(npc, &con->c_state, forw);
367 	mutex_exit(&con->c_lock);
368 
369 	/* If invalid state: let the rules deal with it. */
370 	if (__predict_false(!ok)) {
371 		npf_conn_release(con);
372 		npf_stats_inc(npc->npc_ctx, NPF_STAT_INVALID_STATE);
373 		return NULL;
374 	}
375 
376 	/*
377 	 * If this is multi-end state, then specially tag the packet
378 	 * so it will be just passed-through on other interfaces.
379 	 */
380 	if (con->c_ifid == 0 && nbuf_add_tag(nbuf, NPF_NTAG_PASS) != 0) {
381 		npf_conn_release(con);
382 		*error = ENOMEM;
383 		return NULL;
384 	}
385 	return con;
386 }
387 
388 /*
389  * npf_conn_establish: create a new connection, insert into the global list.
390  *
391  * => Connection is created with the reference held for the caller.
392  * => Connection will be activated on the first reference release.
393  */
394 npf_conn_t *
395 npf_conn_establish(npf_cache_t *npc, int di, bool global)
396 {
397 	npf_t *npf = npc->npc_ctx;
398 	const unsigned alen = npc->npc_alen;
399 	const unsigned idx = NPF_CONNCACHE(alen);
400 	const nbuf_t *nbuf = npc->npc_nbuf;
401 	npf_connkey_t *fw, *bk;
402 	npf_conn_t *con;
403 	int error = 0;
404 
405 	KASSERT(!nbuf_flag_p(nbuf, NBUF_DATAREF_RESET));
406 
407 	if (!npf_conn_trackable_p(npc)) {
408 		return NULL;
409 	}
410 
411 	/* Allocate and initialise the new connection. */
412 	con = pool_cache_get(npf->conn_cache[idx], PR_NOWAIT);
413 	if (__predict_false(!con)) {
414 		npf_worker_signal(npf);
415 		return NULL;
416 	}
417 	NPF_PRINTF(("NPF: create conn %p\n", con));
418 	npf_stats_inc(npf, NPF_STAT_CONN_CREATE);
419 
420 	mutex_init(&con->c_lock, MUTEX_DEFAULT, IPL_SOFTNET);
421 	con->c_flags = (di & PFIL_ALL);
422 	con->c_refcnt = 0;
423 	con->c_rproc = NULL;
424 	con->c_nat = NULL;
425 
426 	con->c_proto = npc->npc_proto;
427 	CTASSERT(sizeof(con->c_proto) >= sizeof(npc->npc_proto));
428 	con->c_alen = alen;
429 
430 	/* Initialize the protocol state. */
431 	if (!npf_state_init(npc, &con->c_state)) {
432 		npf_conn_destroy(npf, con);
433 		return NULL;
434 	}
435 	KASSERT(npf_iscached(npc, NPC_IP46));
436 
437 	fw = npf_conn_getforwkey(con);
438 	bk = npf_conn_getbackkey(con, alen);
439 
440 	/*
441 	 * Construct "forwards" and "backwards" keys.  Also, set the
442 	 * interface ID for this connection (unless it is global).
443 	 */
444 	if (!npf_conn_conkey(npc, fw, true) ||
445 	    !npf_conn_conkey(npc, bk, false)) {
446 		npf_conn_destroy(npf, con);
447 		return NULL;
448 	}
449 	con->c_ifid = global ? nbuf->nb_ifid : 0;
450 
451 	/*
452 	 * Set last activity time for a new connection and acquire
453 	 * a reference for the caller before we make it visible.
454 	 */
455 	conn_update_atime(con);
456 	con->c_refcnt = 1;
457 
458 	/*
459 	 * Insert both keys (entries representing directions) of the
460 	 * connection.  At this point it becomes visible, but we activate
461 	 * the connection later.
462 	 */
463 	mutex_enter(&con->c_lock);
464 	if (!npf_conndb_insert(npf->conn_db, fw, con, true)) {
465 		error = EISCONN;
466 		goto err;
467 	}
468 	if (!npf_conndb_insert(npf->conn_db, bk, con, false)) {
469 		npf_conn_t *ret __diagused;
470 		ret = npf_conndb_remove(npf->conn_db, fw);
471 		KASSERT(ret == con);
472 		error = EISCONN;
473 		goto err;
474 	}
475 err:
476 	/*
477 	 * If we have hit the duplicate: mark the connection as expired
478 	 * and let the G/C thread to take care of it.  We cannot do it
479 	 * here since there might be references acquired already.
480 	 */
481 	if (error) {
482 		atomic_or_uint(&con->c_flags, CONN_REMOVED | CONN_EXPIRE);
483 		atomic_dec_uint(&con->c_refcnt);
484 		npf_stats_inc(npf, NPF_STAT_RACE_CONN);
485 	} else {
486 		NPF_PRINTF(("NPF: establish conn %p\n", con));
487 	}
488 
489 	/* Finally, insert into the connection list. */
490 	npf_conndb_enqueue(npf->conn_db, con);
491 	mutex_exit(&con->c_lock);
492 
493 	return error ? NULL : con;
494 }
495 
496 void
497 npf_conn_destroy(npf_t *npf, npf_conn_t *con)
498 {
499 	const unsigned idx __unused = NPF_CONNCACHE(con->c_alen);
500 
501 	KASSERT(con->c_refcnt == 0);
502 
503 	if (con->c_nat) {
504 		/* Release any NAT structures. */
505 		npf_nat_destroy(con->c_nat);
506 	}
507 	if (con->c_rproc) {
508 		/* Release the rule procedure. */
509 		npf_rproc_release(con->c_rproc);
510 	}
511 
512 	/* Destroy the state. */
513 	npf_state_destroy(&con->c_state);
514 	mutex_destroy(&con->c_lock);
515 
516 	/* Free the structure, increase the counter. */
517 	pool_cache_put(npf->conn_cache[idx], con);
518 	npf_stats_inc(npf, NPF_STAT_CONN_DESTROY);
519 	NPF_PRINTF(("NPF: conn %p destroyed\n", con));
520 }
521 
522 /*
523  * npf_conn_setnat: associate NAT entry with the connection, update and
524  * re-insert connection entry using the translation values.
525  *
526  * => The caller must be holding a reference.
527  */
528 int
529 npf_conn_setnat(const npf_cache_t *npc, npf_conn_t *con,
530     npf_nat_t *nt, unsigned ntype)
531 {
532 	static const u_int nat_type_dimap[] = {
533 		[NPF_NATOUT] = NPF_DST,
534 		[NPF_NATIN] = NPF_SRC,
535 	};
536 	npf_t *npf = npc->npc_ctx;
537 	npf_connkey_t key, *fw, *bk;
538 	npf_conn_t *ret __diagused;
539 	npf_addr_t *taddr;
540 	in_port_t tport;
541 
542 	KASSERT(con->c_refcnt > 0);
543 
544 	npf_nat_gettrans(nt, &taddr, &tport);
545 	KASSERT(ntype == NPF_NATOUT || ntype == NPF_NATIN);
546 
547 	/* Construct a "backwards" key. */
548 	if (!npf_conn_conkey(npc, &key, false)) {
549 		return EINVAL;
550 	}
551 
552 	/* Acquire the lock and check for the races. */
553 	mutex_enter(&con->c_lock);
554 	if (__predict_false(con->c_flags & CONN_EXPIRE)) {
555 		/* The connection got expired. */
556 		mutex_exit(&con->c_lock);
557 		return EINVAL;
558 	}
559 	KASSERT((con->c_flags & CONN_REMOVED) == 0);
560 
561 	if (__predict_false(con->c_nat != NULL)) {
562 		/* Race with a duplicate packet. */
563 		mutex_exit(&con->c_lock);
564 		npf_stats_inc(npc->npc_ctx, NPF_STAT_RACE_NAT);
565 		return EISCONN;
566 	}
567 
568 	/* Remove the "backwards" key. */
569 	fw = npf_conn_getforwkey(con);
570 	bk = npf_conn_getbackkey(con, NPF_CONNKEY_ALEN(fw));
571 	ret = npf_conndb_remove(npf->conn_db, bk);
572 	KASSERT(ret == con);
573 
574 	/* Set the source/destination IDs to the translation values. */
575 	npf_conn_adjkey(bk, taddr, tport, nat_type_dimap[ntype]);
576 
577 	/* Finally, re-insert the "backwards" key. */
578 	if (!npf_conndb_insert(npf->conn_db, bk, con, false)) {
579 		/*
580 		 * Race: we have hit the duplicate, remove the "forwards"
581 		 * key and expire our connection; it is no longer valid.
582 		 */
583 		ret = npf_conndb_remove(npf->conn_db, fw);
584 		KASSERT(ret == con);
585 
586 		atomic_or_uint(&con->c_flags, CONN_REMOVED | CONN_EXPIRE);
587 		mutex_exit(&con->c_lock);
588 
589 		npf_stats_inc(npc->npc_ctx, NPF_STAT_RACE_NAT);
590 		return EISCONN;
591 	}
592 
593 	/* Associate the NAT entry and release the lock. */
594 	con->c_nat = nt;
595 	mutex_exit(&con->c_lock);
596 	return 0;
597 }
598 
599 /*
600  * npf_conn_expire: explicitly mark connection as expired.
601  */
602 void
603 npf_conn_expire(npf_conn_t *con)
604 {
605 	/* KASSERT(con->c_refcnt > 0); XXX: npf_nat_freepolicy() */
606 	atomic_or_uint(&con->c_flags, CONN_EXPIRE);
607 }
608 
609 /*
610  * npf_conn_pass: return true if connection is "pass" one, otherwise false.
611  */
612 bool
613 npf_conn_pass(const npf_conn_t *con, npf_match_info_t *mi, npf_rproc_t **rp)
614 {
615 	KASSERT(con->c_refcnt > 0);
616 	if (__predict_true(con->c_flags & CONN_PASS)) {
617 		mi->mi_rid = con->c_rid;
618 		mi->mi_retfl = con->c_retfl;
619 		*rp = con->c_rproc;
620 		return true;
621 	}
622 	return false;
623 }
624 
625 /*
626  * npf_conn_setpass: mark connection as a "pass" one and associate the
627  * rule procedure with it.
628  */
629 void
630 npf_conn_setpass(npf_conn_t *con, const npf_match_info_t *mi, npf_rproc_t *rp)
631 {
632 	KASSERT((con->c_flags & CONN_ACTIVE) == 0);
633 	KASSERT(con->c_refcnt > 0);
634 	KASSERT(con->c_rproc == NULL);
635 
636 	/*
637 	 * No need for atomic since the connection is not yet active.
638 	 * If rproc is set, the caller transfers its reference to us,
639 	 * which will be released on npf_conn_destroy().
640 	 */
641 	atomic_or_uint(&con->c_flags, CONN_PASS);
642 	con->c_rproc = rp;
643 	if (rp) {
644 		con->c_rid = mi->mi_rid;
645 		con->c_retfl = mi->mi_retfl;
646 	}
647 }
648 
649 /*
650  * npf_conn_release: release a reference, which might allow G/C thread
651  * to destroy this connection.
652  */
653 void
654 npf_conn_release(npf_conn_t *con)
655 {
656 	if ((con->c_flags & (CONN_ACTIVE | CONN_EXPIRE)) == 0) {
657 		/* Activate: after this, connection is globally visible. */
658 		atomic_or_uint(&con->c_flags, CONN_ACTIVE);
659 	}
660 	KASSERT(con->c_refcnt > 0);
661 	atomic_dec_uint(&con->c_refcnt);
662 }
663 
664 /*
665  * npf_conn_getnat: return associated NAT data entry and indicate
666  * whether it is a "forwards" or "backwards" stream.
667  */
668 npf_nat_t *
669 npf_conn_getnat(npf_conn_t *con, const int di, bool *forw)
670 {
671 	KASSERT(con->c_refcnt > 0);
672 	*forw = (con->c_flags & PFIL_ALL) == (u_int)di;
673 	return con->c_nat;
674 }
675 
676 /*
677  * npf_conn_expired: criterion to check if connection is expired.
678  */
679 bool
680 npf_conn_expired(npf_t *npf, const npf_conn_t *con, uint64_t tsnow)
681 {
682 	const int etime = npf_state_etime(npf, &con->c_state, con->c_proto);
683 	int elapsed;
684 
685 	if (__predict_false(con->c_flags & CONN_EXPIRE)) {
686 		/* Explicitly marked to be expired. */
687 		return true;
688 	}
689 
690 	/*
691 	 * Note: another thread may update 'atime' and it might
692 	 * become greater than 'now'.
693 	 */
694 	elapsed = (int64_t)tsnow - con->c_atime;
695 	return elapsed > etime;
696 }
697 
698 /*
699  * npf_conn_remove: unlink the connection and mark as expired.
700  */
701 void
702 npf_conn_remove(npf_conndb_t *cd, npf_conn_t *con)
703 {
704 	/* Remove both entries of the connection. */
705 	mutex_enter(&con->c_lock);
706 	if ((con->c_flags & CONN_REMOVED) == 0) {
707 		npf_connkey_t *fw, *bk;
708 		npf_conn_t *ret __diagused;
709 
710 		fw = npf_conn_getforwkey(con);
711 		ret = npf_conndb_remove(cd, fw);
712 		KASSERT(ret == con);
713 
714 		bk = npf_conn_getbackkey(con, NPF_CONNKEY_ALEN(fw));
715 		ret = npf_conndb_remove(cd, bk);
716 		KASSERT(ret == con);
717 	}
718 
719 	/* Flag the removal and expiration. */
720 	atomic_or_uint(&con->c_flags, CONN_REMOVED | CONN_EXPIRE);
721 	mutex_exit(&con->c_lock);
722 }
723 
724 /*
725  * npf_conn_worker: G/C to run from a worker thread.
726  */
727 void
728 npf_conn_worker(npf_t *npf)
729 {
730 	npf_conndb_gc(npf, npf->conn_db, false, true);
731 }
732 
733 /*
734  * npf_conndb_export: construct a list of connections prepared for saving.
735  * Note: this is expected to be an expensive operation.
736  */
737 int
738 npf_conndb_export(npf_t *npf, nvlist_t *npf_dict)
739 {
740 	npf_conn_t *head, *con;
741 
742 	/*
743 	 * Note: acquire conn_lock to prevent from the database
744 	 * destruction and G/C thread.
745 	 */
746 	mutex_enter(&npf->conn_lock);
747 	if (npf->conn_tracking != CONN_TRACKING_ON) {
748 		mutex_exit(&npf->conn_lock);
749 		return 0;
750 	}
751 	head = npf_conndb_getlist(npf->conn_db);
752 	con = head;
753 	while (con) {
754 		nvlist_t *cdict;
755 
756 		if ((cdict = npf_conn_export(npf, con)) != NULL) {
757 			nvlist_append_nvlist_array(npf_dict, "conn-list", cdict);
758 			nvlist_destroy(cdict);
759 		}
760 		if ((con = npf_conndb_getnext(npf->conn_db, con)) == head) {
761 			break;
762 		}
763 	}
764 	mutex_exit(&npf->conn_lock);
765 	return 0;
766 }
767 
768 /*
769  * npf_conn_export: serialise a single connection.
770  */
771 static nvlist_t *
772 npf_conn_export(npf_t *npf, npf_conn_t *con)
773 {
774 	nvlist_t *cdict, *kdict;
775 	npf_connkey_t *fw, *bk;
776 	unsigned alen;
777 
778 	if ((con->c_flags & (CONN_ACTIVE|CONN_EXPIRE)) != CONN_ACTIVE) {
779 		return NULL;
780 	}
781 	cdict = nvlist_create(0);
782 	nvlist_add_number(cdict, "flags", con->c_flags);
783 	nvlist_add_number(cdict, "proto", con->c_proto);
784 	if (con->c_ifid) {
785 		char ifname[IFNAMSIZ];
786 		npf_ifmap_copyname(npf, con->c_ifid, ifname, sizeof(ifname));
787 		nvlist_add_string(cdict, "ifname", ifname);
788 	}
789 	nvlist_add_binary(cdict, "state", &con->c_state, sizeof(npf_state_t));
790 
791 	fw = npf_conn_getforwkey(con);
792 	alen = NPF_CONNKEY_ALEN(fw);
793 	KASSERT(alen == con->c_alen);
794 	bk = npf_conn_getbackkey(con, alen);
795 
796 	kdict = npf_connkey_export(fw);
797 	nvlist_move_nvlist(cdict, "forw-key", kdict);
798 
799 	kdict = npf_connkey_export(bk);
800 	nvlist_move_nvlist(cdict, "back-key", kdict);
801 
802 	/* Let the address length be based on on first key. */
803 	nvlist_add_number(cdict, "alen", alen);
804 
805 	if (con->c_nat) {
806 		npf_nat_export(cdict, con->c_nat);
807 	}
808 	return cdict;
809 }
810 
811 /*
812  * npf_conn_import: fully reconstruct a single connection from a
813  * nvlist and insert into the given database.
814  */
815 int
816 npf_conn_import(npf_t *npf, npf_conndb_t *cd, const nvlist_t *cdict,
817     npf_ruleset_t *natlist)
818 {
819 	npf_conn_t *con;
820 	npf_connkey_t *fw, *bk;
821 	const nvlist_t *nat, *conkey;
822 	const char *ifname;
823 	const void *state;
824 	unsigned alen, idx;
825 	size_t len;
826 
827 	/*
828 	 * To determine the length of the connection, which depends
829 	 * on the address length in the connection keys.
830 	 */
831 	alen = dnvlist_get_number(cdict, "alen", 0);
832 	idx = NPF_CONNCACHE(alen);
833 
834 	/* Allocate a connection and initialise it (clear first). */
835 	con = pool_cache_get(npf->conn_cache[idx], PR_WAITOK);
836 	memset(con, 0, sizeof(npf_conn_t));
837 	mutex_init(&con->c_lock, MUTEX_DEFAULT, IPL_SOFTNET);
838 	npf_stats_inc(npf, NPF_STAT_CONN_CREATE);
839 
840 	con->c_proto = dnvlist_get_number(cdict, "proto", 0);
841 	con->c_flags = dnvlist_get_number(cdict, "flags", 0);
842 	con->c_flags &= PFIL_ALL | CONN_ACTIVE | CONN_PASS;
843 	conn_update_atime(con);
844 
845 	ifname = dnvlist_get_string(cdict, "ifname", NULL);
846 	if (ifname && (con->c_ifid = npf_ifmap_register(npf, ifname)) == 0) {
847 		goto err;
848 	}
849 
850 	state = dnvlist_get_binary(cdict, "state", &len, NULL, 0);
851 	if (!state || len != sizeof(npf_state_t)) {
852 		goto err;
853 	}
854 	memcpy(&con->c_state, state, sizeof(npf_state_t));
855 
856 	/* Reconstruct NAT association, if any. */
857 	if ((nat = dnvlist_get_nvlist(cdict, "nat", NULL)) != NULL &&
858 	    (con->c_nat = npf_nat_import(npf, nat, natlist, con)) == NULL) {
859 		goto err;
860 	}
861 
862 	/*
863 	 * Fetch and copy the keys for each direction.
864 	 */
865 	fw = npf_conn_getforwkey(con);
866 	conkey = dnvlist_get_nvlist(cdict, "forw-key", NULL);
867 	if (conkey == NULL || !npf_connkey_import(conkey, fw)) {
868 		goto err;
869 	}
870 	bk = npf_conn_getbackkey(con, NPF_CONNKEY_ALEN(fw));
871 	conkey = dnvlist_get_nvlist(cdict, "back-key", NULL);
872 	if (conkey == NULL || !npf_connkey_import(conkey, bk)) {
873 		goto err;
874 	}
875 
876 	/* Guard against the contradicting address lengths. */
877 	if (NPF_CONNKEY_ALEN(fw) != alen || NPF_CONNKEY_ALEN(bk) != alen) {
878 		goto err;
879 	}
880 
881 	/* Insert the entries and the connection itself. */
882 	if (!npf_conndb_insert(cd, fw, con, true)) {
883 		goto err;
884 	}
885 	if (!npf_conndb_insert(cd, bk, con, false)) {
886 		npf_conndb_remove(cd, fw);
887 		goto err;
888 	}
889 
890 	NPF_PRINTF(("NPF: imported conn %p\n", con));
891 	npf_conndb_enqueue(cd, con);
892 	return 0;
893 err:
894 	npf_conn_destroy(npf, con);
895 	return EINVAL;
896 }
897 
898 int
899 npf_conn_find(npf_t *npf, const nvlist_t *idict, nvlist_t **odict)
900 {
901 	const nvlist_t *kdict;
902 	npf_connkey_t key;
903 	npf_conn_t *con;
904 	uint16_t dir;
905 	bool forw;
906 
907 	kdict = dnvlist_get_nvlist(idict, "key", NULL);
908 	if (!kdict || !npf_connkey_import(kdict, &key)) {
909 		return EINVAL;
910 	}
911 	con = npf_conndb_lookup(npf->conn_db, &key, &forw);
912 	if (con == NULL) {
913 		return ESRCH;
914 	}
915 	dir = dnvlist_get_number(idict, "direction", 0);
916 	if (!npf_conn_check(con, NULL, dir, true)) {
917 		atomic_dec_uint(&con->c_refcnt);
918 		return ESRCH;
919 	}
920 	*odict = npf_conn_export(npf, con);
921 	atomic_dec_uint(&con->c_refcnt);
922 	return *odict ? 0 : ENOSPC;
923 }
924 
#if defined(DDB) || defined(_NPF_TESTING)

/*
 * npf_conn_print: print the connection for debugging: its protocol,
 * flags, the time since the last activity, the expiration time, both
 * keys, the state and the NAT entry (if any).
 */
void
npf_conn_print(npf_conn_t *con)
{
	const npf_connkey_t *forw_key = npf_conn_getforwkey(con);
	const npf_connkey_t *back_key =
	    npf_conn_getbackkey(con, NPF_CONNKEY_ALEN(forw_key));
	const unsigned proto = con->c_proto;
	struct timespec uptime;

	getnanouptime(&uptime);
	printf("%p:\n\tproto %d flags 0x%x tsdiff %ld etime %d\n", con,
	    proto, con->c_flags, (long)(uptime.tv_sec - con->c_atime),
	    npf_state_etime(npf_getkernctx(), &con->c_state, proto));

	/* The keys first, then the state and the optional NAT entry. */
	npf_connkey_print(forw_key);
	npf_connkey_print(back_key);
	npf_state_dump(&con->c_state);
	if (con->c_nat != NULL) {
		npf_nat_dump(con->c_nat);
	}
}

#endif
948