xref: /netbsd-src/sys/net/npf/npf_conn.c (revision f3cfa6f6ce31685c6c4a758bc430e69eb99f50a4)
1 /*-
2  * Copyright (c) 2014-2018 Mindaugas Rasiukevicius <rmind at netbsd org>
3  * Copyright (c) 2010-2014 The NetBSD Foundation, Inc.
4  * All rights reserved.
5  *
6  * This material is based upon work partially supported by The
7  * NetBSD Foundation under a contract with Mindaugas Rasiukevicius.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
19  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
20  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
22  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 /*
32  * NPF connection tracking for stateful filtering and translation.
33  *
34  * Overview
35  *
36  *	Packets can be incoming or outgoing with respect to an interface.
37  *	Connection direction is identified by the direction of its first
38  *	packet.  The meaning of incoming/outgoing packet in the context of
39  *	connection direction can be confusing.  Therefore, we will use the
40  *	terms "forwards stream" and "backwards stream", where packets in
41  *	the forwards stream mean the packets travelling in the same
42  *	direction as the connection direction.
43  *
44  *	All connections have two keys and thus two entries:
45  *
46  *		npf_conn_t::c_forw_entry for the forwards stream and
47  *		npf_conn_t::c_back_entry for the backwards stream.
48  *
49  *	The keys are formed from the 5-tuple (source/destination address,
50  *	source/destination port and the protocol).  Additional matching
51  *	is performed for the interface (a common behaviour is equivalent
52  *	to the 6-tuple lookup including the interface ID).  Note that the
53  *	key may be formed using translated values in a case of NAT.
54  *
55  *	Connections can serve two purposes: for the implicit passing or
56  *	to accommodate the dynamic NAT.  Connections for the former purpose
57  *	are created by the rules with "stateful" attribute and are used for
58  *	stateful filtering.  Such connections indicate that the packet of
59  *	the backwards stream should be passed without inspection of the
60  *	ruleset.  The other purpose is to associate a dynamic NAT mechanism
61  *	with a connection.  Such connections are created by the NAT policies
62  *	and they have a relationship with NAT translation structure via
63  *	npf_conn_t::c_nat.  A single connection can serve both purposes,
64  *	which is a common case.
65  *
66  * Connection life-cycle
67  *
68  *	Connections are established when a packet matches said rule or
69  *	NAT policy.  Both keys of the established connection are inserted
70  *	into the connection database.  A garbage collection thread
71  *	periodically scans all connections and depending on connection
72  *	properties (e.g. last activity time, protocol) removes connection
73  *	entries and expires the actual connections.
74  *
75  *	Each connection has a reference count.  The reference is acquired
76  *	on lookup and should be released by the caller.  It guarantees that
77  *	the connection will not be destroyed, although it may be expired.
78  *
79  * Synchronisation
80  *
81  *	Connection database is accessed in a lock-less manner by the main
82  *	routines: npf_conn_inspect() and npf_conn_establish().  Since they
83  *	are always called from a software interrupt, the database is
84  *	protected using passive serialisation.  The main place which can
85  *	destroy a connection is npf_conn_worker().  The database itself
86  *	can be replaced and destroyed in npf_conn_load().
87  *
88  * ALG support
89  *
90  *	Application-level gateways (ALGs) can override generic connection
91  *	inspection (npf_alg_conn() call in npf_conn_inspect() function) by
92  *	performing their own lookup using different key.  Recursive call
93  *	to npf_conn_inspect() is not allowed.  The ALGs ought to use the
94  *	npf_conn_lookup() function for this purpose.
95  *
96  * Lock order
97  *
98  *	npf_config_lock ->
99  *		conn_lock ->
100  *			npf_conn_t::c_lock
101  */
102 
103 #ifdef _KERNEL
104 #include <sys/cdefs.h>
105 __KERNEL_RCSID(0, "$NetBSD: npf_conn.c,v 1.26 2019/01/19 21:19:31 rmind Exp $");
106 
107 #include <sys/param.h>
108 #include <sys/types.h>
109 
110 #include <netinet/in.h>
111 #include <netinet/tcp.h>
112 
113 #include <sys/atomic.h>
114 #include <sys/condvar.h>
115 #include <sys/kmem.h>
116 #include <sys/kthread.h>
117 #include <sys/mutex.h>
118 #include <net/pfil.h>
119 #include <sys/pool.h>
120 #include <sys/queue.h>
121 #include <sys/systm.h>
122 #endif
123 
124 #define __NPF_CONN_PRIVATE
125 #include "npf_conn.h"
126 #include "npf_impl.h"
127 
128 /*
129  * Connection flags: PFIL_IN and PFIL_OUT values are reserved for direction.
 *
 * The direction bits (PFIL_ALL) and the CONN_* bits below share the
 * npf_conn_t::c_flags word.  The CTASSERT pins PFIL_ALL to the two lowest
 * bits, so that the CONN_* values (starting at 0x004) cannot overlap them.
130  */
131 CTASSERT(PFIL_ALL == (0x001 | 0x002));
132 #define	CONN_ACTIVE	0x004	/* visible on inspection */
133 #define	CONN_PASS	0x008	/* perform implicit passing */
134 #define	CONN_EXPIRE	0x010	/* explicitly expire */
135 #define	CONN_REMOVED	0x020	/* "forw/back" entries removed */
136 
/* Values stored in npf_t::conn_tracking. */
137 enum { CONN_TRACKING_OFF, CONN_TRACKING_ON };
138 
/* Forward declaration: defined further below, used by npf_conndb_export(). */
139 static nvlist_t *npf_conn_export(npf_t *, const npf_conn_t *);
140 
141 /*
142  * npf_conn_{init,fini}: initialise/destroy connection tracking.
143  */
144 
145 void
146 npf_conn_init(npf_t *npf, int flags)
147 {
148 	npf->conn_cache = pool_cache_init(sizeof(npf_conn_t), coherency_unit,
149 	    0, 0, "npfconpl", NULL, IPL_NET, NULL, NULL, NULL);
150 	mutex_init(&npf->conn_lock, MUTEX_DEFAULT, IPL_NONE);
151 	npf->conn_tracking = CONN_TRACKING_OFF;
152 	npf->conn_db = npf_conndb_create();
153 
154 	if ((flags & NPF_NO_GC) == 0) {
155 		npf_worker_register(npf, npf_conn_worker);
156 	}
157 }
158 
159 void
160 npf_conn_fini(npf_t *npf)
161 {
162 	/* Note: the caller should have flushed the connections. */
163 	KASSERT(npf->conn_tracking == CONN_TRACKING_OFF);
164 	npf_worker_unregister(npf, npf_conn_worker);
165 
166 	npf_conndb_destroy(npf->conn_db);
167 	pool_cache_destroy(npf->conn_cache);
168 	mutex_destroy(&npf->conn_lock);
169 }
170 
171 /*
172  * npf_conn_load: perform the load by flushing the current connection
173  * database and replacing it with the new one or just destroying.
174  *
175  * => The caller must disable the connection tracking and ensure that
176  *    there are no connection database lookups or references in-flight.
177  */
178 void
179 npf_conn_load(npf_t *npf, npf_conndb_t *ndb, bool track)
180 {
181 	npf_conndb_t *odb = NULL;
182 
183 	KASSERT(npf_config_locked_p(npf));
184 
185 	/*
186 	 * The connection database is in the quiescent state.
187 	 * Prevent G/C thread from running and install a new database.
188 	 */
189 	mutex_enter(&npf->conn_lock);
190 	if (ndb) {
191 		KASSERT(npf->conn_tracking == CONN_TRACKING_OFF);
192 		odb = npf->conn_db;
193 		npf->conn_db = ndb;
194 		membar_sync();
195 	}
196 	if (track) {
197 		/* After this point lookups start flying in. */
198 		npf->conn_tracking = CONN_TRACKING_ON;
199 	}
200 	mutex_exit(&npf->conn_lock);
201 
202 	if (odb) {
203 		/*
204 		 * Flush all, no sync since the caller did it for us.
205 		 * Also, release the pool cache memory.
206 		 */
207 		npf_conndb_gc(npf, odb, true, false);
208 		npf_conndb_destroy(odb);
209 		pool_cache_invalidate(npf->conn_cache);
210 	}
211 }
212 
213 /*
214  * npf_conn_tracking: enable/disable connection tracking.
215  */
216 void
217 npf_conn_tracking(npf_t *npf, bool track)
218 {
219 	KASSERT(npf_config_locked_p(npf));
220 	npf->conn_tracking = track ? CONN_TRACKING_ON : CONN_TRACKING_OFF;
221 }
222 
223 static inline bool
224 npf_conn_trackable_p(const npf_cache_t *npc)
225 {
226 	const npf_t *npf = npc->npc_ctx;
227 
228 	/*
229 	 * Check if connection tracking is on.  Also, if layer 3 and 4 are
230 	 * not cached - protocol is not supported or packet is invalid.
231 	 */
232 	if (npf->conn_tracking != CONN_TRACKING_ON) {
233 		return false;
234 	}
235 	if (!npf_iscached(npc, NPC_IP46) || !npf_iscached(npc, NPC_LAYER4)) {
236 		return false;
237 	}
238 	return true;
239 }
240 
241 static uint32_t
242 connkey_setkey(npf_connkey_t *key, uint16_t proto, const void *ipv,
243     const uint16_t *id, unsigned alen, bool forw)
244 {
245 	uint32_t isrc, idst, *k = key->ck_key;
246 	const npf_addr_t * const *ips = ipv;
247 
248 	if (__predict_true(forw)) {
249 		isrc = NPF_SRC, idst = NPF_DST;
250 	} else {
251 		isrc = NPF_DST, idst = NPF_SRC;
252 	}
253 
254 	/*
255 	 * Construct a key formed out of 32-bit integers.  The key layout:
256 	 *
257 	 * Field: | proto  |  alen  | src-id | dst-id | src-addr | dst-addr |
258 	 *        +--------+--------+--------+--------+----------+----------+
259 	 * Bits:  |   16   |   16   |   16   |   16   |  32-128  |  32-128  |
260 	 *
261 	 * The source and destination are inverted if they key is for the
262 	 * backwards stream (forw == false).  The address length depends
263 	 * on the 'alen' field; it is a length in bytes, either 4 or 16.
264 	 */
265 
266 	k[0] = ((uint32_t)proto << 16) | (alen & 0xffff);
267 	k[1] = ((uint32_t)id[isrc] << 16) | id[idst];
268 
269 	if (__predict_true(alen == sizeof(in_addr_t))) {
270 		k[2] = ips[isrc]->word32[0];
271 		k[3] = ips[idst]->word32[0];
272 		return 4 * sizeof(uint32_t);
273 	} else {
274 		const u_int nwords = alen >> 2;
275 		memcpy(&k[2], ips[isrc], alen);
276 		memcpy(&k[2 + nwords], ips[idst], alen);
277 		return (2 + (nwords * 2)) * sizeof(uint32_t);
278 	}
279 }
280 
281 static void
282 connkey_getkey(const npf_connkey_t *key, uint16_t *proto, npf_addr_t *ips,
283     uint16_t *id, uint16_t *alen)
284 {
285 	const uint32_t *k = key->ck_key;
286 
287 	*proto = k[0] >> 16;
288 	*alen = k[0] & 0xffff;
289 	id[NPF_SRC] = k[1] >> 16;
290 	id[NPF_DST] = k[1] & 0xffff;
291 
292 	switch (*alen) {
293 	case sizeof(struct in6_addr):
294 	case sizeof(struct in_addr):
295 		memcpy(&ips[NPF_SRC], &k[2], *alen);
296 		memcpy(&ips[NPF_DST], &k[2 + ((unsigned)*alen >> 2)], *alen);
297 		return;
298 	default:
299 		KASSERT(0);
300 	}
301 }
302 
303 /*
304  * npf_conn_conkey: construct a key for the connection lookup.
305  *
306  * => Returns the key length in bytes or zero on failure.
307  */
308 unsigned
309 npf_conn_conkey(const npf_cache_t *npc, npf_connkey_t *key, const bool forw)
310 {
311 	const u_int proto = npc->npc_proto;
312 	const u_int alen = npc->npc_alen;
313 	const struct tcphdr *th;
314 	const struct udphdr *uh;
315 	uint16_t id[2];
316 
317 	switch (proto) {
318 	case IPPROTO_TCP:
319 		KASSERT(npf_iscached(npc, NPC_TCP));
320 		th = npc->npc_l4.tcp;
321 		id[NPF_SRC] = th->th_sport;
322 		id[NPF_DST] = th->th_dport;
323 		break;
324 	case IPPROTO_UDP:
325 		KASSERT(npf_iscached(npc, NPC_UDP));
326 		uh = npc->npc_l4.udp;
327 		id[NPF_SRC] = uh->uh_sport;
328 		id[NPF_DST] = uh->uh_dport;
329 		break;
330 	case IPPROTO_ICMP:
331 		if (npf_iscached(npc, NPC_ICMP_ID)) {
332 			const struct icmp *ic = npc->npc_l4.icmp;
333 			id[NPF_SRC] = ic->icmp_id;
334 			id[NPF_DST] = ic->icmp_id;
335 			break;
336 		}
337 		return 0;
338 	case IPPROTO_ICMPV6:
339 		if (npf_iscached(npc, NPC_ICMP_ID)) {
340 			const struct icmp6_hdr *ic6 = npc->npc_l4.icmp6;
341 			id[NPF_SRC] = ic6->icmp6_id;
342 			id[NPF_DST] = ic6->icmp6_id;
343 			break;
344 		}
345 		return 0;
346 	default:
347 		/* Unsupported protocol. */
348 		return 0;
349 	}
350 	return connkey_setkey(key, proto, npc->npc_ips, id, alen, forw);
351 }
352 
353 static __inline void
354 connkey_set_addr(npf_connkey_t *key, const npf_addr_t *naddr, const int di)
355 {
356 	const u_int alen = key->ck_key[0] & 0xffff;
357 	uint32_t *addr = &key->ck_key[2 + ((alen >> 2) * di)];
358 
359 	KASSERT(alen > 0);
360 	memcpy(addr, naddr, alen);
361 }
362 
363 static __inline void
364 connkey_set_id(npf_connkey_t *key, const uint16_t id, const int di)
365 {
366 	const uint32_t oid = key->ck_key[1];
367 	const u_int shift = 16 * !di;
368 	const uint32_t mask = 0xffff0000 >> shift;
369 
370 	key->ck_key[1] = ((uint32_t)id << shift) | (oid & mask);
371 }
372 
373 static inline void
374 conn_update_atime(npf_conn_t *con)
375 {
376 	struct timespec tsnow;
377 
378 	getnanouptime(&tsnow);
379 	con->c_atime = tsnow.tv_sec;
380 }
381 
382 /*
383  * npf_conn_ok: check if the connection is active and has the right direction.
384  */
385 static bool
386 npf_conn_ok(const npf_conn_t *con, const int di, bool forw)
387 {
388 	const uint32_t flags = con->c_flags;
389 
390 	/* Check if connection is active and not expired. */
391 	bool ok = (flags & (CONN_ACTIVE | CONN_EXPIRE)) == CONN_ACTIVE;
392 	if (__predict_false(!ok)) {
393 		return false;
394 	}
395 
396 	/* Check if the direction is consistent */
397 	bool pforw = (flags & PFIL_ALL) == (unsigned)di;
398 	if (__predict_false(forw != pforw)) {
399 		return false;
400 	}
401 	return true;
402 }
403 
404 /*
405  * npf_conn_lookup: lookup if there is an established connection.
406  *
407  * => If found, we will hold a reference for the caller.
408  */
409 npf_conn_t *
410 npf_conn_lookup(const npf_cache_t *npc, const int di, bool *forw)
411 {
412 	npf_t *npf = npc->npc_ctx;
413 	const nbuf_t *nbuf = npc->npc_nbuf;
414 	npf_conn_t *con;
415 	npf_connkey_t key;
416 	u_int cifid;
417 
418 	/* Construct a key and lookup for a connection in the store. */
419 	if (!npf_conn_conkey(npc, &key, true)) {
420 		return NULL;
421 	}
422 	con = npf_conndb_lookup(npf->conn_db, &key, forw);
423 	if (con == NULL) {
424 		return NULL;
425 	}
426 	KASSERT(npc->npc_proto == con->c_proto);
427 
428 	/* Check if connection is active and not expired. */
429 	if (!npf_conn_ok(con, di, *forw)) {
430 		atomic_dec_uint(&con->c_refcnt);
431 		return NULL;
432 	}
433 
434 	/*
435 	 * Match the interface and the direction of the connection entry
436 	 * and the packet.
437 	 */
438 	cifid = con->c_ifid;
439 	if (__predict_false(cifid && cifid != nbuf->nb_ifid)) {
440 		atomic_dec_uint(&con->c_refcnt);
441 		return NULL;
442 	}
443 
444 	/* Update the last activity time. */
445 	conn_update_atime(con);
446 	return con;
447 }
448 
449 /*
450  * npf_conn_inspect: lookup a connection and inspecting the protocol data.
451  *
452  * => If found, we will hold a reference for the caller.
453  */
454 npf_conn_t *
455 npf_conn_inspect(npf_cache_t *npc, const int di, int *error)
456 {
457 	nbuf_t *nbuf = npc->npc_nbuf;
458 	npf_conn_t *con;
459 	bool forw, ok;
460 
461 	KASSERT(!nbuf_flag_p(nbuf, NBUF_DATAREF_RESET));
462 	if (!npf_conn_trackable_p(npc)) {
463 		return NULL;
464 	}
465 
466 	/* Query ALG which may lookup connection for us. */
467 	if ((con = npf_alg_conn(npc, di)) != NULL) {
468 		/* Note: reference is held. */
469 		return con;
470 	}
471 	if (nbuf_head_mbuf(nbuf) == NULL) {
472 		*error = ENOMEM;
473 		return NULL;
474 	}
475 	KASSERT(!nbuf_flag_p(nbuf, NBUF_DATAREF_RESET));
476 
477 	/* Main lookup of the connection. */
478 	if ((con = npf_conn_lookup(npc, di, &forw)) == NULL) {
479 		return NULL;
480 	}
481 
482 	/* Inspect the protocol data and handle state changes. */
483 	mutex_enter(&con->c_lock);
484 	ok = npf_state_inspect(npc, &con->c_state, forw);
485 	mutex_exit(&con->c_lock);
486 
487 	/* If invalid state: let the rules deal with it. */
488 	if (__predict_false(!ok)) {
489 		npf_conn_release(con);
490 		npf_stats_inc(npc->npc_ctx, NPF_STAT_INVALID_STATE);
491 		return NULL;
492 	}
493 
494 	/*
495 	 * If this is multi-end state, then specially tag the packet
496 	 * so it will be just passed-through on other interfaces.
497 	 */
498 	if (con->c_ifid == 0 && nbuf_add_tag(nbuf, NPF_NTAG_PASS) != 0) {
499 		npf_conn_release(con);
500 		*error = ENOMEM;
501 		return NULL;
502 	}
503 	return con;
504 }
505 
506 /*
507  * npf_conn_establish: create a new connection, insert into the global list.
508  *
509  * => Connection is created with the reference held for the caller.
510  * => Connection will be activated on the first reference release.
511  */
512 npf_conn_t *
513 npf_conn_establish(npf_cache_t *npc, int di, bool per_if)
514 {
515 	npf_t *npf = npc->npc_ctx;
516 	const nbuf_t *nbuf = npc->npc_nbuf;
517 	npf_conn_t *con;
518 	int error = 0;
519 
520 	KASSERT(!nbuf_flag_p(nbuf, NBUF_DATAREF_RESET));
521 
522 	if (!npf_conn_trackable_p(npc)) {
523 		return NULL;
524 	}
525 
526 	/* Allocate and initialise the new connection. */
527 	con = pool_cache_get(npf->conn_cache, PR_NOWAIT);
528 	if (__predict_false(!con)) {
529 		npf_worker_signal(npf);
530 		return NULL;
531 	}
532 	NPF_PRINTF(("NPF: create conn %p\n", con));
533 	npf_stats_inc(npf, NPF_STAT_CONN_CREATE);
534 
535 	mutex_init(&con->c_lock, MUTEX_DEFAULT, IPL_SOFTNET);
536 	con->c_flags = (di & PFIL_ALL);
537 	con->c_refcnt = 0;
538 	con->c_rproc = NULL;
539 	con->c_nat = NULL;
540 
541 	/* Initialize the protocol state. */
542 	if (!npf_state_init(npc, &con->c_state)) {
543 		npf_conn_destroy(npf, con);
544 		return NULL;
545 	}
546 
547 	KASSERT(npf_iscached(npc, NPC_IP46));
548 	npf_connkey_t *fw = &con->c_forw_entry;
549 	npf_connkey_t *bk = &con->c_back_entry;
550 
551 	/*
552 	 * Construct "forwards" and "backwards" keys.  Also, set the
553 	 * interface ID for this connection (unless it is global).
554 	 */
555 	if (!npf_conn_conkey(npc, fw, true) ||
556 	    !npf_conn_conkey(npc, bk, false)) {
557 		npf_conn_destroy(npf, con);
558 		return NULL;
559 	}
560 	fw->ck_backptr = bk->ck_backptr = con;
561 	con->c_ifid = per_if ? nbuf->nb_ifid : 0;
562 	con->c_proto = npc->npc_proto;
563 
564 	/*
565 	 * Set last activity time for a new connection and acquire
566 	 * a reference for the caller before we make it visible.
567 	 */
568 	conn_update_atime(con);
569 	con->c_refcnt = 1;
570 
571 	/*
572 	 * Insert both keys (entries representing directions) of the
573 	 * connection.  At this point it becomes visible, but we activate
574 	 * the connection later.
575 	 */
576 	mutex_enter(&con->c_lock);
577 	if (!npf_conndb_insert(npf->conn_db, fw)) {
578 		error = EISCONN;
579 		goto err;
580 	}
581 	if (!npf_conndb_insert(npf->conn_db, bk)) {
582 		npf_conn_t *ret __diagused;
583 		ret = npf_conndb_remove(npf->conn_db, fw);
584 		KASSERT(ret == con);
585 		error = EISCONN;
586 		goto err;
587 	}
588 err:
589 	/*
590 	 * If we have hit the duplicate: mark the connection as expired
591 	 * and let the G/C thread to take care of it.  We cannot do it
592 	 * here since there might be references acquired already.
593 	 */
594 	if (error) {
595 		atomic_or_uint(&con->c_flags, CONN_REMOVED | CONN_EXPIRE);
596 		atomic_dec_uint(&con->c_refcnt);
597 		npf_stats_inc(npf, NPF_STAT_RACE_CONN);
598 	} else {
599 		NPF_PRINTF(("NPF: establish conn %p\n", con));
600 	}
601 
602 	/* Finally, insert into the connection list. */
603 	npf_conndb_enqueue(npf->conn_db, con);
604 	mutex_exit(&con->c_lock);
605 
606 	return error ? NULL : con;
607 }
608 
609 void
610 npf_conn_destroy(npf_t *npf, npf_conn_t *con)
611 {
612 	KASSERT(con->c_refcnt == 0);
613 
614 	if (con->c_nat) {
615 		/* Release any NAT structures. */
616 		npf_nat_destroy(con->c_nat);
617 	}
618 	if (con->c_rproc) {
619 		/* Release the rule procedure. */
620 		npf_rproc_release(con->c_rproc);
621 	}
622 
623 	/* Destroy the state. */
624 	npf_state_destroy(&con->c_state);
625 	mutex_destroy(&con->c_lock);
626 
627 	/* Free the structure, increase the counter. */
628 	pool_cache_put(npf->conn_cache, con);
629 	npf_stats_inc(npf, NPF_STAT_CONN_DESTROY);
630 	NPF_PRINTF(("NPF: conn %p destroyed\n", con));
631 }
632 
633 /*
634  * npf_conn_setnat: associate NAT entry with the connection, update and
635  * re-insert connection entry using the translation values.
636  *
637  * => The caller must be holding a reference.
638  */
639 int
640 npf_conn_setnat(const npf_cache_t *npc, npf_conn_t *con,
641     npf_nat_t *nt, u_int ntype)
642 {
643 	static const u_int nat_type_dimap[] = {
644 		[NPF_NATOUT] = NPF_DST,
645 		[NPF_NATIN] = NPF_SRC,
646 	};
647 	npf_t *npf = npc->npc_ctx;
648 	npf_connkey_t key, *bk;
649 	npf_conn_t *ret __diagused;
650 	npf_addr_t *taddr;
651 	in_port_t tport;
652 	u_int tidx;
653 
654 	KASSERT(con->c_refcnt > 0);
655 
656 	npf_nat_gettrans(nt, &taddr, &tport);
657 	KASSERT(ntype == NPF_NATOUT || ntype == NPF_NATIN);
658 	tidx = nat_type_dimap[ntype];
659 
660 	/* Construct a "backwards" key. */
661 	if (!npf_conn_conkey(npc, &key, false)) {
662 		return EINVAL;
663 	}
664 
665 	/* Acquire the lock and check for the races. */
666 	mutex_enter(&con->c_lock);
667 	if (__predict_false(con->c_flags & CONN_EXPIRE)) {
668 		/* The connection got expired. */
669 		mutex_exit(&con->c_lock);
670 		return EINVAL;
671 	}
672 	KASSERT((con->c_flags & CONN_REMOVED) == 0);
673 
674 	if (__predict_false(con->c_nat != NULL)) {
675 		/* Race with a duplicate packet. */
676 		mutex_exit(&con->c_lock);
677 		npf_stats_inc(npc->npc_ctx, NPF_STAT_RACE_NAT);
678 		return EISCONN;
679 	}
680 
681 	/* Remove the "backwards" entry. */
682 	ret = npf_conndb_remove(npf->conn_db, &con->c_back_entry);
683 	KASSERT(ret == con);
684 
685 	/* Set the source/destination IDs to the translation values. */
686 	bk = &con->c_back_entry;
687 	connkey_set_addr(bk, taddr, tidx);
688 	if (tport) {
689 		connkey_set_id(bk, tport, tidx);
690 	}
691 
692 	/* Finally, re-insert the "backwards" entry. */
693 	if (!npf_conndb_insert(npf->conn_db, bk)) {
694 		/*
695 		 * Race: we have hit the duplicate, remove the "forwards"
696 		 * entry and expire our connection; it is no longer valid.
697 		 */
698 		ret = npf_conndb_remove(npf->conn_db, &con->c_forw_entry);
699 		KASSERT(ret == con);
700 
701 		atomic_or_uint(&con->c_flags, CONN_REMOVED | CONN_EXPIRE);
702 		mutex_exit(&con->c_lock);
703 
704 		npf_stats_inc(npc->npc_ctx, NPF_STAT_RACE_NAT);
705 		return EISCONN;
706 	}
707 
708 	/* Associate the NAT entry and release the lock. */
709 	con->c_nat = nt;
710 	mutex_exit(&con->c_lock);
711 	return 0;
712 }
713 
714 /*
715  * npf_conn_expire: explicitly mark connection as expired.
716  */
717 void
718 npf_conn_expire(npf_conn_t *con)
719 {
720 	/* KASSERT(con->c_refcnt > 0); XXX: npf_nat_freepolicy() */
721 	atomic_or_uint(&con->c_flags, CONN_EXPIRE);
722 }
723 
724 /*
725  * npf_conn_pass: return true if connection is "pass" one, otherwise false.
726  */
727 bool
728 npf_conn_pass(const npf_conn_t *con, npf_match_info_t *mi, npf_rproc_t **rp)
729 {
730 	KASSERT(con->c_refcnt > 0);
731 	if (__predict_true(con->c_flags & CONN_PASS)) {
732 		mi->mi_rid = con->c_rid;
733 		mi->mi_retfl = con->c_retfl;
734 		*rp = con->c_rproc;
735 		return true;
736 	}
737 	return false;
738 }
739 
740 /*
741  * npf_conn_setpass: mark connection as a "pass" one and associate the
742  * rule procedure with it.
743  */
744 void
745 npf_conn_setpass(npf_conn_t *con, const npf_match_info_t *mi, npf_rproc_t *rp)
746 {
747 	KASSERT((con->c_flags & CONN_ACTIVE) == 0);
748 	KASSERT(con->c_refcnt > 0);
749 	KASSERT(con->c_rproc == NULL);
750 
751 	/*
752 	 * No need for atomic since the connection is not yet active.
753 	 * If rproc is set, the caller transfers its reference to us,
754 	 * which will be released on npf_conn_destroy().
755 	 */
756 	atomic_or_uint(&con->c_flags, CONN_PASS);
757 	con->c_rproc = rp;
758 	if (rp) {
759 		con->c_rid = mi->mi_rid;
760 		con->c_retfl = mi->mi_retfl;
761 	}
762 }
763 
764 /*
765  * npf_conn_release: release a reference, which might allow G/C thread
766  * to destroy this connection.
767  */
768 void
769 npf_conn_release(npf_conn_t *con)
770 {
771 	if ((con->c_flags & (CONN_ACTIVE | CONN_EXPIRE)) == 0) {
772 		/* Activate: after this, connection is globally visible. */
773 		atomic_or_uint(&con->c_flags, CONN_ACTIVE);
774 	}
775 	KASSERT(con->c_refcnt > 0);
776 	atomic_dec_uint(&con->c_refcnt);
777 }
778 
779 /*
780  * npf_conn_getnat: return associated NAT data entry and indicate
781  * whether it is a "forwards" or "backwards" stream.
782  */
783 npf_nat_t *
784 npf_conn_getnat(npf_conn_t *con, const int di, bool *forw)
785 {
786 	KASSERT(con->c_refcnt > 0);
787 	*forw = (con->c_flags & PFIL_ALL) == (u_int)di;
788 	return con->c_nat;
789 }
790 
791 /*
792  * npf_conn_expired: criterion to check if connection is expired.
793  */
794 bool
795 npf_conn_expired(const npf_conn_t *con, uint64_t tsnow)
796 {
797 	const int etime = npf_state_etime(&con->c_state, con->c_proto);
798 	int elapsed;
799 
800 	if (__predict_false(con->c_flags & CONN_EXPIRE)) {
801 		/* Explicitly marked to be expired. */
802 		return true;
803 	}
804 
805 	/*
806 	 * Note: another thread may update 'atime' and it might
807 	 * become greater than 'now'.
808 	 */
809 	elapsed = (int64_t)tsnow - con->c_atime;
810 	return elapsed > etime;
811 }
812 
813 /*
814  * npf_conn_remove: unlink the connection and mark as expired.
815  */
816 void
817 npf_conn_remove(npf_conndb_t *cd, npf_conn_t *con)
818 {
819 	/* Remove both entries of the connection. */
820 	mutex_enter(&con->c_lock);
821 	if ((con->c_flags & CONN_REMOVED) == 0) {
822 		npf_conn_t *ret __diagused;
823 
824 		ret = npf_conndb_remove(cd, &con->c_forw_entry);
825 		KASSERT(ret == con);
826 		ret = npf_conndb_remove(cd, &con->c_back_entry);
827 		KASSERT(ret == con);
828 	}
829 
830 	/* Flag the removal and expiration. */
831 	atomic_or_uint(&con->c_flags, CONN_REMOVED | CONN_EXPIRE);
832 	mutex_exit(&con->c_lock);
833 }
834 
835 /*
836  * npf_conn_worker: G/C to run from a worker thread.
837  */
838 void
839 npf_conn_worker(npf_t *npf)
840 {
841 	npf_conndb_gc(npf, npf->conn_db, false, true);
842 }
843 
844 /*
845  * npf_conndb_export: construct a list of connections prepared for saving.
846  * Note: this is expected to be an expensive operation.
847  */
848 int
849 npf_conndb_export(npf_t *npf, nvlist_t *npf_dict)
850 {
851 	npf_conn_t *head, *con;
852 
853 	/*
854 	 * Note: acquire conn_lock to prevent from the database
855 	 * destruction and G/C thread.
856 	 */
857 	mutex_enter(&npf->conn_lock);
858 	if (npf->conn_tracking != CONN_TRACKING_ON) {
859 		mutex_exit(&npf->conn_lock);
860 		return 0;
861 	}
862 	head = npf_conndb_getlist(npf->conn_db);
863 	con = head;
864 	while (con) {
865 		nvlist_t *cdict;
866 
867 		if ((cdict = npf_conn_export(npf, con)) != NULL) {
868 			nvlist_append_nvlist_array(npf_dict, "conn-list", cdict);
869 			nvlist_destroy(cdict);
870 		}
871 		if ((con = npf_conndb_getnext(npf->conn_db, con)) == head) {
872 			break;
873 		}
874 	}
875 	mutex_exit(&npf->conn_lock);
876 	return 0;
877 }
878 
879 static nvlist_t *
880 npf_connkey_export(const npf_connkey_t *key)
881 {
882 	uint16_t id[2], alen, proto;
883 	npf_addr_t ips[2];
884 	nvlist_t *kdict;
885 
886 	kdict = nvlist_create(0);
887 	connkey_getkey(key, &proto, ips, id, &alen);
888 	nvlist_add_number(kdict, "proto", proto);
889 	nvlist_add_number(kdict, "sport", id[NPF_SRC]);
890 	nvlist_add_number(kdict, "dport", id[NPF_DST]);
891 	nvlist_add_binary(kdict, "saddr", &ips[NPF_SRC], alen);
892 	nvlist_add_binary(kdict, "daddr", &ips[NPF_DST], alen);
893 	return kdict;
894 }
895 
896 /*
897  * npf_conn_export: serialise a single connection.
898  */
899 static nvlist_t *
900 npf_conn_export(npf_t *npf, const npf_conn_t *con)
901 {
902 	nvlist_t *cdict, *kdict;
903 
904 	if ((con->c_flags & (CONN_ACTIVE|CONN_EXPIRE)) != CONN_ACTIVE) {
905 		return NULL;
906 	}
907 	cdict = nvlist_create(0);
908 	nvlist_add_number(cdict, "flags", con->c_flags);
909 	nvlist_add_number(cdict, "proto", con->c_proto);
910 	if (con->c_ifid) {
911 		const char *ifname = npf_ifmap_getname(npf, con->c_ifid);
912 		nvlist_add_string(cdict, "ifname", ifname);
913 	}
914 	nvlist_add_binary(cdict, "state", &con->c_state, sizeof(npf_state_t));
915 
916 	kdict = npf_connkey_export(&con->c_forw_entry);
917 	nvlist_move_nvlist(cdict, "forw-key", kdict);
918 
919 	kdict = npf_connkey_export(&con->c_back_entry);
920 	nvlist_move_nvlist(cdict, "back-key", kdict);
921 
922 	if (con->c_nat) {
923 		npf_nat_export(cdict, con->c_nat);
924 	}
925 	return cdict;
926 }
927 
928 static uint32_t
929 npf_connkey_import(const nvlist_t *kdict, npf_connkey_t *key)
930 {
931 	npf_addr_t const * ips[2];
932 	uint16_t proto, id[2];
933 	size_t alen1, alen2;
934 
935 	proto = dnvlist_get_number(kdict, "proto", 0);
936 	id[NPF_SRC] = dnvlist_get_number(kdict, "sport", 0);
937 	id[NPF_DST] = dnvlist_get_number(kdict, "dport", 0);
938 	ips[NPF_SRC] = dnvlist_get_binary(kdict, "saddr", &alen1, NULL, 0);
939 	ips[NPF_DST] = dnvlist_get_binary(kdict, "daddr", &alen2, NULL, 0);
940 	if (__predict_false(alen1 == 0 || alen1 != alen2)) {
941 		return 0;
942 	}
943 	return connkey_setkey(key, proto, ips, id, alen1, true);
944 }
945 
946 /*
947  * npf_conn_import: fully reconstruct a single connection from a
948  * nvlist and insert into the given database.
949  */
950 int
951 npf_conn_import(npf_t *npf, npf_conndb_t *cd, const nvlist_t *cdict,
952     npf_ruleset_t *natlist)
953 {
954 	npf_conn_t *con;
955 	npf_connkey_t *fw, *bk;
956 	const nvlist_t *nat, *conkey;
957 	const char *ifname;
958 	const void *state;
959 	size_t len;
960 
961 	/* Allocate a connection and initialise it (clear first). */
962 	con = pool_cache_get(npf->conn_cache, PR_WAITOK);
963 	memset(con, 0, sizeof(npf_conn_t));
964 	mutex_init(&con->c_lock, MUTEX_DEFAULT, IPL_SOFTNET);
965 	npf_stats_inc(npf, NPF_STAT_CONN_CREATE);
966 
967 	con->c_proto = dnvlist_get_number(cdict, "proto", 0);
968 	con->c_flags = dnvlist_get_number(cdict, "flags", 0);
969 	con->c_flags &= PFIL_ALL | CONN_ACTIVE | CONN_PASS;
970 	conn_update_atime(con);
971 
972 	ifname = dnvlist_get_string(cdict, "ifname", NULL);
973 	if (ifname && (con->c_ifid = npf_ifmap_register(npf, ifname)) == 0) {
974 		goto err;
975 	}
976 
977 	state = dnvlist_get_binary(cdict, "state", &len, NULL, 0);
978 	if (!state || len != sizeof(npf_state_t)) {
979 		goto err;
980 	}
981 	memcpy(&con->c_state, state, sizeof(npf_state_t));
982 
983 	/* Reconstruct NAT association, if any. */
984 	if ((nat = dnvlist_get_nvlist(cdict, "nat", NULL)) != NULL &&
985 	    (con->c_nat = npf_nat_import(npf, nat, natlist, con)) == NULL) {
986 		goto err;
987 	}
988 
989 	/*
990 	 * Fetch and copy the keys for each direction.
991 	 */
992 	conkey = dnvlist_get_nvlist(cdict, "forw-key", NULL);
993 	fw = &con->c_forw_entry;
994 	if (conkey == NULL || !npf_connkey_import(conkey, fw)) {
995 		goto err;
996 	}
997 	conkey = dnvlist_get_nvlist(cdict, "back-key", NULL);
998 	bk = &con->c_back_entry;
999 	if (conkey == NULL || !npf_connkey_import(conkey, bk)) {
1000 		goto err;
1001 	}
1002 	fw->ck_backptr = bk->ck_backptr = con;
1003 
1004 	/* Insert the entries and the connection itself. */
1005 	if (!npf_conndb_insert(cd, fw)) {
1006 		goto err;
1007 	}
1008 	if (!npf_conndb_insert(cd, bk)) {
1009 		npf_conndb_remove(cd, fw);
1010 		goto err;
1011 	}
1012 
1013 	NPF_PRINTF(("NPF: imported conn %p\n", con));
1014 	npf_conndb_enqueue(cd, con);
1015 	return 0;
1016 err:
1017 	npf_conn_destroy(npf, con);
1018 	return EINVAL;
1019 }
1020 
1021 int
1022 npf_conn_find(npf_t *npf, const nvlist_t *idict, nvlist_t **odict)
1023 {
1024 	const nvlist_t *kdict;
1025 	npf_connkey_t key;
1026 	npf_conn_t *con;
1027 	uint16_t dir;
1028 	bool forw;
1029 
1030 	kdict = dnvlist_get_nvlist(idict, "key", NULL);
1031 	if (!kdict || !npf_connkey_import(kdict, &key)) {
1032 		return EINVAL;
1033 	}
1034 	dir = dnvlist_get_number(idict, "direction", 0);
1035 	con = npf_conndb_lookup(npf->conn_db, &key, &forw);
1036 	if (con == NULL) {
1037 		return ESRCH;
1038 	}
1039 	if (!npf_conn_ok(con, dir, true)) {
1040 		atomic_dec_uint(&con->c_refcnt);
1041 		return ESRCH;
1042 	}
1043 	*odict = npf_conn_export(npf, con);
1044 	atomic_dec_uint(&con->c_refcnt);
1045 	return *odict ? 0 : ENOSPC;
1046 }
1047 
1048 #if defined(DDB) || defined(_NPF_TESTING)
1049 
1050 void
1051 npf_conn_print(const npf_conn_t *con)
1052 {
1053 	const u_int alen = NPF_CONN_GETALEN(&con->c_forw_entry);
1054 	const uint32_t *fkey = con->c_forw_entry.ck_key;
1055 	const uint32_t *bkey = con->c_back_entry.ck_key;
1056 	const u_int proto = con->c_proto;
1057 	struct timespec tspnow;
1058 	const void *src, *dst;
1059 	int etime;
1060 
1061 	getnanouptime(&tspnow);
1062 	etime = npf_state_etime(&con->c_state, proto);
1063 
1064 	printf("%p:\n\tproto %d flags 0x%x tsdiff %ld etime %d\n", con,
1065 	    proto, con->c_flags, (long)(tspnow.tv_sec - con->c_atime), etime);
1066 
1067 	src = &fkey[2], dst = &fkey[2 + (alen >> 2)];
1068 	printf("\tforw %s:%d", npf_addr_dump(src, alen), ntohs(fkey[1] >> 16));
1069 	printf("-> %s:%d\n", npf_addr_dump(dst, alen), ntohs(fkey[1] & 0xffff));
1070 
1071 	src = &bkey[2], dst = &bkey[2 + (alen >> 2)];
1072 	printf("\tback %s:%d", npf_addr_dump(src, alen), ntohs(bkey[1] >> 16));
1073 	printf("-> %s:%d\n", npf_addr_dump(dst, alen), ntohs(bkey[1] & 0xffff));
1074 
1075 	npf_state_dump(&con->c_state);
1076 	if (con->c_nat) {
1077 		npf_nat_dump(con->c_nat);
1078 	}
1079 }
1080 
1081 #endif
1082