xref: /openbsd-src/sys/net/if_pfsync.c (revision 5a38ef86d0b61900239c7913d24a05e7b88a58f0)
1 /*	$OpenBSD: if_pfsync.c,v 1.299 2021/11/25 13:46:02 bluhm Exp $	*/
2 
3 /*
4  * Copyright (c) 2002 Michael Shalayeff
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
20  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22  * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
24  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
25  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26  * THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*
30  * Copyright (c) 2009 David Gwynne <dlg@openbsd.org>
31  *
32  * Permission to use, copy, modify, and distribute this software for any
33  * purpose with or without fee is hereby granted, provided that the above
34  * copyright notice and this permission notice appear in all copies.
35  *
36  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
37  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
38  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
39  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
40  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
41  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
42  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
43  */
44 
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/time.h>
48 #include <sys/malloc.h>
49 #include <sys/mbuf.h>
50 #include <sys/socket.h>
51 #include <sys/ioctl.h>
52 #include <sys/timeout.h>
53 #include <sys/kernel.h>
54 #include <sys/sysctl.h>
55 #include <sys/pool.h>
56 #include <sys/syslog.h>
57 
58 #include <net/if.h>
59 #include <net/if_types.h>
60 #include <net/bpf.h>
61 #include <net/netisr.h>
62 
63 #include <netinet/in.h>
64 #include <netinet/if_ether.h>
65 #include <netinet/ip.h>
66 #include <netinet/in_var.h>
67 #include <netinet/ip_var.h>
68 #include <netinet/ip_ipsp.h>
69 #include <netinet/ip_icmp.h>
70 #include <netinet/icmp6.h>
71 #include <netinet/tcp.h>
72 #include <netinet/tcp_seq.h>
73 #include <netinet/tcp_fsm.h>
74 #include <netinet/udp.h>
75 
76 #ifdef INET6
77 #include <netinet6/in6_var.h>
78 #include <netinet/ip6.h>
79 #include <netinet6/ip6_var.h>
80 #include <netinet6/nd6.h>
81 #endif /* INET6 */
82 
83 #include "carp.h"
84 #if NCARP > 0
85 #include <netinet/ip_carp.h>
86 #endif
87 
88 #define PF_DEBUGNAME	"pfsync: "
89 #include <net/pfvar.h>
90 #include <net/pfvar_priv.h>
91 #include <net/if_pfsync.h>
92 
93 #include "bpfilter.h"
94 #include "pfsync.h"
95 
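/* maximum time (20ms) a deferred packet is held waiting for the peer's ack */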
96 #define PFSYNC_DEFER_NSEC 20000000ULL
97 
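/*
 * Size of an "empty" pfsync packet: just an IP header and a pfsync
 * header.  sc_len sits at this value whenever nothing is queued.
 */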
98 #define PFSYNC_MINPKT ( \
99 	sizeof(struct ip) + \
100 	sizeof(struct pfsync_header))
101 
102 int	pfsync_upd_tcp(struct pf_state *, struct pfsync_state_peer *,
103 	    struct pfsync_state_peer *);
104 
105 int	pfsync_in_clr(caddr_t, int, int, int);
106 int	pfsync_in_iack(caddr_t, int, int, int);
107 int	pfsync_in_upd_c(caddr_t, int, int, int);
108 int	pfsync_in_ureq(caddr_t, int, int, int);
109 int	pfsync_in_del(caddr_t, int, int, int);
110 int	pfsync_in_del_c(caddr_t, int, int, int);
111 int	pfsync_in_bus(caddr_t, int, int, int);
112 int	pfsync_in_tdb(caddr_t, int, int, int);
113 int	pfsync_in_ins(caddr_t, int, int, int);
114 int	pfsync_in_upd(caddr_t, int, int, int);
115 int	pfsync_in_eof(caddr_t, int, int, int);
116 
117 int	pfsync_in_error(caddr_t, int, int, int);
118 
119 void	pfsync_update_state_locked(struct pf_state *);
120 
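/*
 * Input dispatch table, indexed by the subheader action code.  "len"
 * is the minimum size of a single message for that action; actions
 * wired to pfsync_in_error() are ones this implementation rejects
 * (apparently message formats left over from older protocol versions).
 */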
121 struct {
122 	int	(*in)(caddr_t, int, int, int);
123 	size_t	len;
124 } pfsync_acts[] = {
125 	/* PFSYNC_ACT_CLR */
126 	{ pfsync_in_clr,	sizeof(struct pfsync_clr) },
127 	/* PFSYNC_ACT_OINS */
128 	{ pfsync_in_error,	0 },
129 	/* PFSYNC_ACT_INS_ACK */
130 	{ pfsync_in_iack,	sizeof(struct pfsync_ins_ack) },
131 	/* PFSYNC_ACT_OUPD */
132 	{ pfsync_in_error,	0 },
133 	/* PFSYNC_ACT_UPD_C */
134 	{ pfsync_in_upd_c,	sizeof(struct pfsync_upd_c) },
135 	/* PFSYNC_ACT_UPD_REQ */
136 	{ pfsync_in_ureq,	sizeof(struct pfsync_upd_req) },
137 	/* PFSYNC_ACT_DEL */
138 	{ pfsync_in_del,	sizeof(struct pfsync_state) },
139 	/* PFSYNC_ACT_DEL_C */
140 	{ pfsync_in_del_c,	sizeof(struct pfsync_del_c) },
141 	/* PFSYNC_ACT_INS_F */
142 	{ pfsync_in_error,	0 },
143 	/* PFSYNC_ACT_DEL_F */
144 	{ pfsync_in_error,	0 },
145 	/* PFSYNC_ACT_BUS */
146 	{ pfsync_in_bus,	sizeof(struct pfsync_bus) },
147 	/* PFSYNC_ACT_OTDB */
148 	{ pfsync_in_error,	0 },
149 	/* PFSYNC_ACT_EOF */
150 	{ pfsync_in_error,	0 },
151 	/* PFSYNC_ACT_INS */
152 	{ pfsync_in_ins,	sizeof(struct pfsync_state) },
153 	/* PFSYNC_ACT_UPD */
154 	{ pfsync_in_upd,	sizeof(struct pfsync_state) },
155 	/* PFSYNC_ACT_TDB */
156 	{ pfsync_in_tdb,	sizeof(struct pfsync_tdb) },
157 };
158 
159 struct pfsync_q {
160 	void		(*write)(struct pf_state *, void *);
161 	size_t		len;
162 	u_int8_t	action;
163 };
164 
165 /* we have one of these for every PFSYNC_S_ */
166 void	pfsync_out_state(struct pf_state *, void *);
167 void	pfsync_out_iack(struct pf_state *, void *);
168 void	pfsync_out_upd_c(struct pf_state *, void *);
169 void	pfsync_out_del(struct pf_state *, void *);
170 
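/*
 * Output message descriptors, one per PFSYNC_S_* queue.  The order of
 * the entries must match the PFSYNC_S_* indices used for sc_qs[]
 * (IACK, UPD_C, DEL, INS, UPD); pfsync_sendout() takes both the
 * writer and the wire action straight from this table.
 */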
171 struct pfsync_q pfsync_qs[] = {
172 	{ pfsync_out_iack,  sizeof(struct pfsync_ins_ack), PFSYNC_ACT_INS_ACK },
173 	{ pfsync_out_upd_c, sizeof(struct pfsync_upd_c),   PFSYNC_ACT_UPD_C },
174 	{ pfsync_out_del,   sizeof(struct pfsync_del_c),   PFSYNC_ACT_DEL_C },
175 	{ pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_INS },
176 	{ pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_UPD }
177 };
178 
179 void	pfsync_q_ins(struct pf_state *, int);
180 void	pfsync_q_del(struct pf_state *);
181 
182 struct pfsync_upd_req_item {
183 	TAILQ_ENTRY(pfsync_upd_req_item)	ur_entry;
184 	struct pfsync_upd_req			ur_msg;
185 };
186 TAILQ_HEAD(pfsync_upd_reqs, pfsync_upd_req_item);
187 
188 struct pfsync_deferral {
189 	TAILQ_ENTRY(pfsync_deferral)		 pd_entry;
190 	struct pf_state				*pd_st;
191 	struct mbuf				*pd_m;
192 	uint64_t				 pd_deadline;
193 };
194 TAILQ_HEAD(pfsync_deferrals, pfsync_deferral);
195 
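/*
 * sc_pool backs both update request items and deferrals, so size the
 * pool items for the larger of the two structures.
 */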
196 #define PFSYNC_PLSIZE	MAX(sizeof(struct pfsync_upd_req_item), \
197 			    sizeof(struct pfsync_deferral))
198 
199 void	pfsync_out_tdb(struct tdb *, void *);
200 
201 struct pfsync_softc {
202 	struct ifnet		 sc_if;
203 	unsigned int		 sc_sync_ifidx;
204 
205 	struct pool		 sc_pool;
206 
207 	struct ip_moptions	 sc_imo;
208 
209 	struct in_addr		 sc_sync_peer;
210 	u_int8_t		 sc_maxupdates;
211 
212 	struct ip		 sc_template;
213 
214 	struct pf_state_queue	 sc_qs[PFSYNC_S_COUNT];
215 	struct mutex		 sc_mtx[PFSYNC_S_COUNT];
216 	size_t			 sc_len;
217 
218 	struct pfsync_upd_reqs	 sc_upd_req_list;
219 	struct mutex		 sc_upd_req_mtx;
220 
221 	int			 sc_initial_bulk;
222 	int			 sc_link_demoted;
223 
224 	int			 sc_defer;
225 	struct pfsync_deferrals	 sc_deferrals;
226 	u_int			 sc_deferred;
227 	struct mutex		 sc_deferrals_mtx;
228 	struct timeout		 sc_deferrals_tmo;
229 
230 	void			*sc_plus;
231 	size_t			 sc_pluslen;
232 
233 	u_int32_t		 sc_ureq_sent;
234 	int			 sc_bulk_tries;
235 	struct timeout		 sc_bulkfail_tmo;
236 
237 	u_int32_t		 sc_ureq_received;
238 	struct pf_state		*sc_bulk_next;
239 	struct pf_state		*sc_bulk_last;
240 	struct timeout		 sc_bulk_tmo;
241 
242 	TAILQ_HEAD(, tdb)	 sc_tdb_q;
243 	struct mutex		 sc_tdb_mtx;
244 
245 	struct task		 sc_ltask;
246 	struct task		 sc_dtask;
247 
248 	struct timeout		 sc_tmo;
249 };
250 
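/*
 * pfsync_grab_snapshot() moves all pending work (queued states, update
 * requests, tdbs and the "plus" region) from the softc into one of
 * these, so that pfsync_sendout() can serialize it without holding the
 * queue mutexes.
 */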
251 struct pfsync_snapshot {
252 	struct pfsync_softc	*sn_sc;
253 	struct pf_state_queue	 sn_qs[PFSYNC_S_COUNT];
254 	struct pfsync_upd_reqs	 sn_upd_req_list;
255 	TAILQ_HEAD(, tdb)	 sn_tdb_q;
256 	size_t			 sn_len;
257 	void			*sn_plus;
258 	size_t			 sn_pluslen;
259 };
260 
261 struct pfsync_softc	*pfsyncif = NULL;
262 struct cpumem		*pfsynccounters;
263 
264 void	pfsyncattach(int);
265 int	pfsync_clone_create(struct if_clone *, int);
266 int	pfsync_clone_destroy(struct ifnet *);
267 int	pfsync_alloc_scrub_memory(struct pfsync_state_peer *,
268 	    struct pf_state_peer *);
269 void	pfsync_update_net_tdb(struct pfsync_tdb *);
270 int	pfsyncoutput(struct ifnet *, struct mbuf *, struct sockaddr *,
271 	    struct rtentry *);
272 int	pfsyncioctl(struct ifnet *, u_long, caddr_t);
273 void	pfsyncstart(struct ifqueue *);
274 void	pfsync_syncdev_state(void *);
275 void	pfsync_ifdetach(void *);
276 
277 void	pfsync_deferred(struct pf_state *, int);
278 void	pfsync_undefer(struct pfsync_deferral *, int);
279 void	pfsync_deferrals_tmo(void *);
280 
281 void	pfsync_cancel_full_update(struct pfsync_softc *);
282 void	pfsync_request_full_update(struct pfsync_softc *);
283 void	pfsync_request_update(u_int32_t, u_int64_t);
284 void	pfsync_update_state_req(struct pf_state *);
285 
286 void	pfsync_drop(struct pfsync_softc *);
287 void	pfsync_sendout(void);
288 void	pfsync_send_plus(void *, size_t);
289 void	pfsync_timeout(void *);
290 void	pfsync_tdb_timeout(void *);
291 
292 void	pfsync_bulk_start(void);
293 void	pfsync_bulk_status(u_int8_t);
294 void	pfsync_bulk_update(void *);
295 void	pfsync_bulk_fail(void *);
296 
297 void	pfsync_grab_snapshot(struct pfsync_snapshot *, struct pfsync_softc *);
298 void	pfsync_drop_snapshot(struct pfsync_snapshot *);
299 
300 void	pfsync_send_dispatch(void *);
301 void	pfsync_send_pkt(struct mbuf *);
302 
303 static struct mbuf_queue	pfsync_mq;
304 static struct task	pfsync_task =
305     TASK_INITIALIZER(pfsync_send_dispatch, &pfsync_mq);
306 
307 #define PFSYNC_MAX_BULKTRIES	12
308 int	pfsync_sync_ok;
309 
310 struct if_clone	pfsync_cloner =
311     IF_CLONE_INITIALIZER("pfsync", pfsync_clone_create, pfsync_clone_destroy);
312 
313 void
314 pfsyncattach(int npfsync)
315 {
316 	if_clone_attach(&pfsync_cloner);
317 	pfsynccounters = counters_alloc(pfsyncs_ncounters);
318 	mq_init(&pfsync_mq, 4096, IPL_SOFTNET);
319 }
320 
321 int
322 pfsync_clone_create(struct if_clone *ifc, int unit)
323 {
324 	struct pfsync_softc *sc;
325 	struct ifnet *ifp;
326 	int q;
327 	static const char *mtx_names[] = {
328 		"iack_mtx",
329 		"upd_c_mtx",
330 		"del_mtx",
331 		"ins_mtx",
332 		"upd_mtx",
333 		"" };
334 
335 	if (unit != 0)
336 		return (EINVAL);
337 
338 	pfsync_sync_ok = 1;
339 
340 	sc = malloc(sizeof(*pfsyncif), M_DEVBUF, M_WAITOK|M_ZERO);
341 	for (q = 0; q < PFSYNC_S_COUNT; q++) {
342 		TAILQ_INIT(&sc->sc_qs[q]);
343 		mtx_init_flags(&sc->sc_mtx[q], IPL_SOFTNET, mtx_names[q], 0);
344 	}
345 
346 	pool_init(&sc->sc_pool, PFSYNC_PLSIZE, 0, IPL_SOFTNET, 0, "pfsync",
347 	    NULL);
348 	TAILQ_INIT(&sc->sc_upd_req_list);
349 	mtx_init(&sc->sc_upd_req_mtx, IPL_SOFTNET);
350 	TAILQ_INIT(&sc->sc_deferrals);
351 	mtx_init(&sc->sc_deferrals_mtx, IPL_SOFTNET);
352 	timeout_set_proc(&sc->sc_deferrals_tmo, pfsync_deferrals_tmo, sc);
353 	task_set(&sc->sc_ltask, pfsync_syncdev_state, sc);
354 	task_set(&sc->sc_dtask, pfsync_ifdetach, sc);
355 	sc->sc_deferred = 0;
356 
357 	TAILQ_INIT(&sc->sc_tdb_q);
358 	mtx_init(&sc->sc_tdb_mtx, IPL_SOFTNET);
359 
360 	sc->sc_len = PFSYNC_MINPKT;
361 	sc->sc_maxupdates = 128;
362 
363 	sc->sc_imo.imo_membership = mallocarray(IP_MIN_MEMBERSHIPS,
364 	    sizeof(struct in_multi *), M_IPMOPTS, M_WAITOK|M_ZERO);
365 	sc->sc_imo.imo_max_memberships = IP_MIN_MEMBERSHIPS;
366 
367 	ifp = &sc->sc_if;
368 	snprintf(ifp->if_xname, sizeof ifp->if_xname, "pfsync%d", unit);
369 	ifp->if_softc = sc;
370 	ifp->if_ioctl = pfsyncioctl;
371 	ifp->if_output = pfsyncoutput;
372 	ifp->if_qstart = pfsyncstart;
373 	ifp->if_type = IFT_PFSYNC;
374 	ifp->if_hdrlen = sizeof(struct pfsync_header);
375 	ifp->if_mtu = ETHERMTU;
376 	ifp->if_xflags = IFXF_CLONED | IFXF_MPSAFE;
377 	timeout_set_proc(&sc->sc_tmo, pfsync_timeout, NULL);
378 	timeout_set_proc(&sc->sc_bulk_tmo, pfsync_bulk_update, NULL);
379 	timeout_set_proc(&sc->sc_bulkfail_tmo, pfsync_bulk_fail, NULL);
380 
381 	if_attach(ifp);
382 	if_alloc_sadl(ifp);
383 
384 #if NCARP > 0
385 	if_addgroup(ifp, "carp");
386 #endif
387 
388 #if NBPFILTER > 0
389 	bpfattach(&sc->sc_if.if_bpf, ifp, DLT_PFSYNC, PFSYNC_HDRLEN);
390 #endif
391 
392 	pfsyncif = sc;
393 
394 	return (0);
395 }
396 
397 int
398 pfsync_clone_destroy(struct ifnet *ifp)
399 {
400 	struct pfsync_softc *sc = ifp->if_softc;
401 	struct ifnet *ifp0;
402 	struct pfsync_deferral *pd;
403 	struct pfsync_deferrals	 deferrals;
404 
405 	NET_LOCK();
406 
407 #if NCARP > 0
408 	if (!pfsync_sync_ok)
409 		carp_group_demote_adj(&sc->sc_if, -1, "pfsync destroy");
410 	if (sc->sc_link_demoted)
411 		carp_group_demote_adj(&sc->sc_if, -1, "pfsync destroy");
412 #endif
413 	if ((ifp0 = if_get(sc->sc_sync_ifidx)) != NULL) {
414 		if_linkstatehook_del(ifp0, &sc->sc_ltask);
415 		if_detachhook_del(ifp0, &sc->sc_dtask);
416 	}
417 	if_put(ifp0);
418 
419 	/* XXXSMP breaks atomicity */
420 	NET_UNLOCK();
421 	if_detach(ifp);
422 	NET_LOCK();
423 
424 	pfsync_drop(sc);
425 
426 	if (sc->sc_deferred > 0) {
427 		TAILQ_INIT(&deferrals);
428 		mtx_enter(&sc->sc_deferrals_mtx);
429 		TAILQ_CONCAT(&deferrals, &sc->sc_deferrals, pd_entry);
430 		sc->sc_deferred = 0;
431 		mtx_leave(&sc->sc_deferrals_mtx);
432 
433 		while (!TAILQ_EMPTY(&deferrals)) {
434 			pd = TAILQ_FIRST(&deferrals);
435 			TAILQ_REMOVE(&deferrals, pd, pd_entry);
436 			pfsync_undefer(pd, 0);
437 		}
438 	}
439 
440 	pfsyncif = NULL;
441 	timeout_del(&sc->sc_bulkfail_tmo);
442 	timeout_del(&sc->sc_bulk_tmo);
443 	timeout_del(&sc->sc_tmo);
444 
445 	NET_UNLOCK();
446 
447 	pool_destroy(&sc->sc_pool);
448 	free(sc->sc_imo.imo_membership, M_IPMOPTS,
449 	    sc->sc_imo.imo_max_memberships * sizeof(struct in_multi *));
450 	free(sc, M_DEVBUF, sizeof(*sc));
451 
452 	return (0);
453 }
454 
455 /*
456  * Start output on the pfsync interface.
457  */
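/*
 * pfsync builds and transmits its packets itself (pfsync_sendout() via
 * pfsync_send_pkt()), so anything placed on the interface send queue
 * is simply discarded.
 */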
458 void
459 pfsyncstart(struct ifqueue *ifq)
460 {
461 	ifq_purge(ifq);
462 }
463 
464 void
465 pfsync_syncdev_state(void *arg)
466 {
467 	struct pfsync_softc *sc = arg;
468 	struct ifnet *ifp;
469 
470 	if ((sc->sc_if.if_flags & IFF_UP) == 0)
471 		return;
472 	if ((ifp = if_get(sc->sc_sync_ifidx)) == NULL)
473 		return;
474 
475 	if (ifp->if_link_state == LINK_STATE_DOWN) {
476 		sc->sc_if.if_flags &= ~IFF_RUNNING;
477 		if (!sc->sc_link_demoted) {
478 #if NCARP > 0
479 			carp_group_demote_adj(&sc->sc_if, 1,
480 			    "pfsync link state down");
481 #endif
482 			sc->sc_link_demoted = 1;
483 		}
484 
485 		/* drop everything */
486 		timeout_del(&sc->sc_tmo);
487 		pfsync_drop(sc);
488 
489 		pfsync_cancel_full_update(sc);
490 	} else if (sc->sc_link_demoted) {
491 		sc->sc_if.if_flags |= IFF_RUNNING;
492 
493 		pfsync_request_full_update(sc);
494 	}
495 
496 	if_put(ifp);
497 }
498 
499 void
500 pfsync_ifdetach(void *arg)
501 {
502 	struct pfsync_softc *sc = arg;
503 	struct ifnet *ifp;
504 
505 	if ((ifp = if_get(sc->sc_sync_ifidx)) != NULL) {
506 		if_linkstatehook_del(ifp, &sc->sc_ltask);
507 		if_detachhook_del(ifp, &sc->sc_dtask);
508 	}
509 	if_put(ifp);
510 
511 	sc->sc_sync_ifidx = 0;
512 }
513 
514 int
515 pfsync_alloc_scrub_memory(struct pfsync_state_peer *s,
516     struct pf_state_peer *d)
517 {
518 	if (s->scrub.scrub_flag && d->scrub == NULL) {
519 		d->scrub = pool_get(&pf_state_scrub_pl, PR_NOWAIT | PR_ZERO);
520 		if (d->scrub == NULL)
521 			return (ENOMEM);
522 	}
523 
524 	return (0);
525 }
526 
527 void
528 pfsync_state_export(struct pfsync_state *sp, struct pf_state *st)
529 {
530 	pf_state_export(sp, st);
531 }
532 
533 int
534 pfsync_state_import(struct pfsync_state *sp, int flags)
535 {
536 	struct pf_state	*st = NULL;
537 	struct pf_state_key *skw = NULL, *sks = NULL;
538 	struct pf_rule *r = NULL;
539 	struct pfi_kif	*kif;
540 	int pool_flags;
541 	int error = ENOMEM;
542 	int n = 0;
543 
544 	if (sp->creatorid == 0) {
545 		DPFPRINTF(LOG_NOTICE, "pfsync_state_import: "
546 		    "invalid creator id: %08x", ntohl(sp->creatorid));
547 		return (EINVAL);
548 	}
549 
550 	if ((kif = pfi_kif_get(sp->ifname, NULL)) == NULL) {
551 		DPFPRINTF(LOG_NOTICE, "pfsync_state_import: "
552 		    "unknown interface: %s", sp->ifname);
553 		if (flags & PFSYNC_SI_IOCTL)
554 			return (EINVAL);
555 		return (0);	/* skip this state */
556 	}
557 
558 	if (sp->af == 0)
559 		return (0);	/* skip this state */
560 
561 	/*
562 	 * If the ruleset checksums match or the state is coming from the ioctl,
563 	 * it's safe to associate the state with the rule of that number.
564 	 */
565 	if (sp->rule != htonl(-1) && sp->anchor == htonl(-1) &&
566 	    (flags & (PFSYNC_SI_IOCTL | PFSYNC_SI_CKSUM)) && ntohl(sp->rule) <
567 	    pf_main_ruleset.rules.active.rcount) {
568 		TAILQ_FOREACH(r, pf_main_ruleset.rules.active.ptr, entries)
569 			if (ntohl(sp->rule) == n++)
570 				break;
571 	} else
572 		r = &pf_default_rule;
573 
574 	if ((r->max_states && r->states_cur >= r->max_states))
575 		goto cleanup;
576 
577 	if (flags & PFSYNC_SI_IOCTL)
578 		pool_flags = PR_WAITOK | PR_LIMITFAIL | PR_ZERO;
579 	else
580 		pool_flags = PR_NOWAIT | PR_LIMITFAIL | PR_ZERO;
581 
582 	if ((st = pool_get(&pf_state_pl, pool_flags)) == NULL)
583 		goto cleanup;
584 
585 	if ((skw = pf_alloc_state_key(pool_flags)) == NULL)
586 		goto cleanup;
587 
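	/*
	 * If the wire and stack keys differ (address family, addresses,
	 * ports or routing domain), e.g. because the state was created
	 * by a translation rule, a separate stack key is needed;
	 * otherwise both sides share a single key.
	 */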
588 	if ((sp->key[PF_SK_WIRE].af &&
589 	    (sp->key[PF_SK_WIRE].af != sp->key[PF_SK_STACK].af)) ||
590 	    PF_ANEQ(&sp->key[PF_SK_WIRE].addr[0],
591 	    &sp->key[PF_SK_STACK].addr[0], sp->af) ||
592 	    PF_ANEQ(&sp->key[PF_SK_WIRE].addr[1],
593 	    &sp->key[PF_SK_STACK].addr[1], sp->af) ||
594 	    sp->key[PF_SK_WIRE].port[0] != sp->key[PF_SK_STACK].port[0] ||
595 	    sp->key[PF_SK_WIRE].port[1] != sp->key[PF_SK_STACK].port[1] ||
596 	    sp->key[PF_SK_WIRE].rdomain != sp->key[PF_SK_STACK].rdomain) {
597 		if ((sks = pf_alloc_state_key(pool_flags)) == NULL)
598 			goto cleanup;
599 	} else
600 		sks = skw;
601 
602 	/* allocate memory for scrub info */
603 	if (pfsync_alloc_scrub_memory(&sp->src, &st->src) ||
604 	    pfsync_alloc_scrub_memory(&sp->dst, &st->dst))
605 		goto cleanup;
606 
607 	/* copy to state key(s) */
608 	skw->addr[0] = sp->key[PF_SK_WIRE].addr[0];
609 	skw->addr[1] = sp->key[PF_SK_WIRE].addr[1];
610 	skw->port[0] = sp->key[PF_SK_WIRE].port[0];
611 	skw->port[1] = sp->key[PF_SK_WIRE].port[1];
612 	skw->rdomain = ntohs(sp->key[PF_SK_WIRE].rdomain);
613 	PF_REF_INIT(skw->refcnt);
614 	skw->proto = sp->proto;
615 	if (!(skw->af = sp->key[PF_SK_WIRE].af))
616 		skw->af = sp->af;
617 	if (sks != skw) {
618 		sks->addr[0] = sp->key[PF_SK_STACK].addr[0];
619 		sks->addr[1] = sp->key[PF_SK_STACK].addr[1];
620 		sks->port[0] = sp->key[PF_SK_STACK].port[0];
621 		sks->port[1] = sp->key[PF_SK_STACK].port[1];
622 		sks->rdomain = ntohs(sp->key[PF_SK_STACK].rdomain);
623 		PF_REF_INIT(sks->refcnt);
624 		if (!(sks->af = sp->key[PF_SK_STACK].af))
625 			sks->af = sp->af;
626 		if (sks->af != skw->af) {
627 			switch (sp->proto) {
628 			case IPPROTO_ICMP:
629 				sks->proto = IPPROTO_ICMPV6;
630 				break;
631 			case IPPROTO_ICMPV6:
632 				sks->proto = IPPROTO_ICMP;
633 				break;
634 			default:
635 				sks->proto = sp->proto;
636 			}
637 		} else
638 			sks->proto = sp->proto;
639 
640 		if (((sks->af != AF_INET) && (sks->af != AF_INET6)) ||
641 		    ((skw->af != AF_INET) && (skw->af != AF_INET6))) {
642 			error = EINVAL;
643 			goto cleanup;
644 		}
645 
646 	} else if ((sks->af != AF_INET) && (sks->af != AF_INET6)) {
647 		error = EINVAL;
648 		goto cleanup;
649 	}
650 	st->rtableid[PF_SK_WIRE] = ntohl(sp->rtableid[PF_SK_WIRE]);
651 	st->rtableid[PF_SK_STACK] = ntohl(sp->rtableid[PF_SK_STACK]);
652 
653 	/* copy to state */
654 	st->rt_addr = sp->rt_addr;
655 	st->rt = sp->rt;
656 	st->creation = getuptime() - ntohl(sp->creation);
657 	st->expire = getuptime();
658 	if (ntohl(sp->expire)) {
659 		u_int32_t timeout;
660 
661 		timeout = r->timeout[sp->timeout];
662 		if (!timeout)
663 			timeout = pf_default_rule.timeout[sp->timeout];
664 
665 		/* sp->expire may have been adaptively scaled by export. */
666 		st->expire -= timeout - ntohl(sp->expire);
667 	}
668 
669 	st->direction = sp->direction;
670 	st->log = sp->log;
671 	st->timeout = sp->timeout;
672 	st->state_flags = ntohs(sp->state_flags);
673 	st->max_mss = ntohs(sp->max_mss);
674 	st->min_ttl = sp->min_ttl;
675 	st->set_tos = sp->set_tos;
676 	st->set_prio[0] = sp->set_prio[0];
677 	st->set_prio[1] = sp->set_prio[1];
678 
679 	st->id = sp->id;
680 	st->creatorid = sp->creatorid;
681 	pf_state_peer_ntoh(&sp->src, &st->src);
682 	pf_state_peer_ntoh(&sp->dst, &st->dst);
683 
684 	st->rule.ptr = r;
685 	st->anchor.ptr = NULL;
686 
687 	st->pfsync_time = getuptime();
688 	st->sync_state = PFSYNC_S_NONE;
689 
690 	refcnt_init(&st->refcnt);
691 
692 	/* XXX when we have anchors, use STATE_INC_COUNTERS */
693 	r->states_cur++;
694 	r->states_tot++;
695 
696 	if (!ISSET(flags, PFSYNC_SI_IOCTL))
697 		SET(st->state_flags, PFSTATE_NOSYNC);
698 
699 	/*
700 	 * We just set the PFSTATE_NOSYNC bit, which prevents
701 	 * pfsync_insert_state() from inserting this state into pfsync.
702 	 */
703 	if (pf_state_insert(kif, &skw, &sks, st) != 0) {
704 		/* XXX when we have anchors, use STATE_DEC_COUNTERS */
705 		r->states_cur--;
706 		error = EEXIST;
707 		goto cleanup_state;
708 	}
709 
710 	if (!ISSET(flags, PFSYNC_SI_IOCTL)) {
711 		CLR(st->state_flags, PFSTATE_NOSYNC);
712 		if (ISSET(st->state_flags, PFSTATE_ACK)) {
713 			pfsync_q_ins(st, PFSYNC_S_IACK);
714 			schednetisr(NETISR_PFSYNC);
715 		}
716 	}
717 	CLR(st->state_flags, PFSTATE_ACK);
718 
719 	return (0);
720 
721  cleanup:
722 	if (skw == sks)
723 		sks = NULL;
724 	if (skw != NULL)
725 		pool_put(&pf_state_key_pl, skw);
726 	if (sks != NULL)
727 		pool_put(&pf_state_key_pl, sks);
728 
729  cleanup_state:	/* pf_state_insert frees the state keys */
730 	if (st) {
731 		if (st->dst.scrub)
732 			pool_put(&pf_state_scrub_pl, st->dst.scrub);
733 		if (st->src.scrub)
734 			pool_put(&pf_state_scrub_pl, st->src.scrub);
735 		pool_put(&pf_state_pl, st);
736 	}
737 	return (error);
738 }
739 
740 int
741 pfsync_input(struct mbuf **mp, int *offp, int proto, int af)
742 {
743 	struct mbuf *n, *m = *mp;
744 	struct pfsync_softc *sc = pfsyncif;
745 	struct ip *ip = mtod(m, struct ip *);
746 	struct pfsync_header *ph;
747 	struct pfsync_subheader subh;
748 	int offset, noff, len, count, mlen, flags = 0;
749 	int e;
750 
751 	NET_ASSERT_LOCKED();
752 
753 	pfsyncstat_inc(pfsyncs_ipackets);
754 
755 	/* verify that we have a sync interface configured */
756 	if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING) ||
757 	    sc->sc_sync_ifidx == 0 || !pf_status.running)
758 		goto done;
759 
760 	/* verify that the packet came in on the right interface */
761 	if (sc->sc_sync_ifidx != m->m_pkthdr.ph_ifidx) {
762 		pfsyncstat_inc(pfsyncs_badif);
763 		goto done;
764 	}
765 
766 	sc->sc_if.if_ipackets++;
767 	sc->sc_if.if_ibytes += m->m_pkthdr.len;
768 
769 	/* verify that the IP TTL is 255. */
770 	if (ip->ip_ttl != PFSYNC_DFLTTL) {
771 		pfsyncstat_inc(pfsyncs_badttl);
772 		goto done;
773 	}
774 
775 	offset = ip->ip_hl << 2;
776 	n = m_pulldown(m, offset, sizeof(*ph), &noff);
777 	if (n == NULL) {
778 		pfsyncstat_inc(pfsyncs_hdrops);
779 		return IPPROTO_DONE;
780 	}
781 	ph = (struct pfsync_header *)(n->m_data + noff);
782 
783 	/* verify the version */
784 	if (ph->version != PFSYNC_VERSION) {
785 		pfsyncstat_inc(pfsyncs_badver);
786 		goto done;
787 	}
788 	len = ntohs(ph->len) + offset;
789 	if (m->m_pkthdr.len < len) {
790 		pfsyncstat_inc(pfsyncs_badlen);
791 		goto done;
792 	}
793 
794 	if (!bcmp(&ph->pfcksum, &pf_status.pf_chksum, PF_MD5_DIGEST_LENGTH))
795 		flags = PFSYNC_SI_CKSUM;
796 
797 	offset += sizeof(*ph);
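	/*
	 * The rest of the packet is a sequence of subheaders, each
	 * followed by "count" fixed-size messages of the type named by
	 * the subheader's action field.
	 */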
798 	while (offset <= len - sizeof(subh)) {
799 		m_copydata(m, offset, sizeof(subh), &subh);
800 		offset += sizeof(subh);
801 
802 		mlen = subh.len << 2;
803 		count = ntohs(subh.count);
804 
805 		if (subh.action >= PFSYNC_ACT_MAX ||
806 		    subh.action >= nitems(pfsync_acts) ||
807 		    mlen < pfsync_acts[subh.action].len) {
808 			/*
809 			 * subheaders are always followed by at least one
810 			 * message, so if the peer is new enough to tell
811 			 * us how big its messages are then we know enough
812 			 * to skip them.
813 			 */
814 			if (count > 0 && mlen > 0) {
815 				offset += count * mlen;
816 				continue;
817 			}
818 			pfsyncstat_inc(pfsyncs_badact);
819 			goto done;
820 		}
821 
822 		n = m_pulldown(m, offset, mlen * count, &noff);
823 		if (n == NULL) {
824 			pfsyncstat_inc(pfsyncs_badlen);
825 			return IPPROTO_DONE;
826 		}
827 
828 		e = pfsync_acts[subh.action].in(n->m_data + noff, mlen, count,
829 		    flags);
830 		if (e != 0)
831 			goto done;
832 
833 		offset += mlen * count;
834 	}
835 
836 done:
837 	m_freem(m);
838 	return IPPROTO_DONE;
839 }
840 
841 int
842 pfsync_in_clr(caddr_t buf, int len, int count, int flags)
843 {
844 	struct pfsync_clr *clr;
845 	struct pf_state *st, *nexts;
846 	struct pfi_kif *kif;
847 	u_int32_t creatorid;
848 	int i;
849 
850 	PF_LOCK();
851 	for (i = 0; i < count; i++) {
852 		clr = (struct pfsync_clr *)(buf + len * i);
853 		kif = NULL;
854 		creatorid = clr->creatorid;
855 		if (strlen(clr->ifname) &&
856 		    (kif = pfi_kif_find(clr->ifname)) == NULL)
857 			continue;
858 
859 		PF_STATE_ENTER_WRITE();
860 		for (st = RB_MIN(pf_state_tree_id, &tree_id); st; st = nexts) {
861 			nexts = RB_NEXT(pf_state_tree_id, &tree_id, st);
862 			if (st->creatorid == creatorid &&
863 			    ((kif && st->kif == kif) || !kif)) {
864 				SET(st->state_flags, PFSTATE_NOSYNC);
865 				pf_remove_state(st);
866 			}
867 		}
868 		PF_STATE_EXIT_WRITE();
869 	}
870 	PF_UNLOCK();
871 
872 	return (0);
873 }
874 
875 int
876 pfsync_in_ins(caddr_t buf, int len, int count, int flags)
877 {
878 	struct pfsync_state *sp;
879 	sa_family_t af1, af2;
880 	int i;
881 
882 	PF_LOCK();
883 	for (i = 0; i < count; i++) {
884 		sp = (struct pfsync_state *)(buf + len * i);
885 		af1 = sp->key[0].af;
886 		af2 = sp->key[1].af;
887 
888 		/* check for invalid values */
889 		if (sp->timeout >= PFTM_MAX ||
890 		    sp->src.state > PF_TCPS_PROXY_DST ||
891 		    sp->dst.state > PF_TCPS_PROXY_DST ||
892 		    sp->direction > PF_OUT ||
893 		    (((af1 || af2) &&
894 		     ((af1 != AF_INET && af1 != AF_INET6) ||
895 		      (af2 != AF_INET && af2 != AF_INET6))) ||
896 		    (sp->af != AF_INET && sp->af != AF_INET6))) {
897 			DPFPRINTF(LOG_NOTICE,
898 			    "pfsync_input: PFSYNC5_ACT_INS: invalid value");
899 			pfsyncstat_inc(pfsyncs_badval);
900 			continue;
901 		}
902 
903 		if (pfsync_state_import(sp, flags) == ENOMEM) {
904 			/* drop out, but process the rest of the actions */
905 			break;
906 		}
907 	}
908 	PF_UNLOCK();
909 
910 	return (0);
911 }
912 
913 int
914 pfsync_in_iack(caddr_t buf, int len, int count, int flags)
915 {
916 	struct pfsync_ins_ack *ia;
917 	struct pf_state_cmp id_key;
918 	struct pf_state *st;
919 	int i;
920 
921 	for (i = 0; i < count; i++) {
922 		ia = (struct pfsync_ins_ack *)(buf + len * i);
923 
924 		id_key.id = ia->id;
925 		id_key.creatorid = ia->creatorid;
926 
927 		PF_STATE_ENTER_READ();
928 		st = pf_find_state_byid(&id_key);
929 		pf_state_ref(st);
930 		PF_STATE_EXIT_READ();
931 		if (st == NULL)
932 			continue;
933 
934 		if (ISSET(st->state_flags, PFSTATE_ACK))
935 			pfsync_deferred(st, 0);
936 
937 		pf_state_unref(st);
938 	}
939 
940 	return (0);
941 }
942 
943 int
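/*
 * Merge a peer's update into a TCP state.  Returns the number of peers
 * (0-2) whose update was refused because it would move the state or
 * the sequence window backwards; callers treat a non-zero return as a
 * stale update (and send our newer copy back) and a return below 2 as
 * fresh enough to still take the destination peer data and refresh the
 * expiry.
 */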
944 pfsync_upd_tcp(struct pf_state *st, struct pfsync_state_peer *src,
945     struct pfsync_state_peer *dst)
946 {
947 	int sync = 0;
948 
949 	/*
950 	 * The state should never go backwards except
951 	 * for syn-proxy states.  Neither should the
952 	 * sequence window slide backwards.
953 	 */
954 	if ((st->src.state > src->state &&
955 	    (st->src.state < PF_TCPS_PROXY_SRC ||
956 	    src->state >= PF_TCPS_PROXY_SRC)) ||
957 
958 	    (st->src.state == src->state &&
959 	    SEQ_GT(st->src.seqlo, ntohl(src->seqlo))))
960 		sync++;
961 	else
962 		pf_state_peer_ntoh(src, &st->src);
963 
964 	if ((st->dst.state > dst->state) ||
965 
966 	    (st->dst.state >= TCPS_SYN_SENT &&
967 	    SEQ_GT(st->dst.seqlo, ntohl(dst->seqlo))))
968 		sync++;
969 	else
970 		pf_state_peer_ntoh(dst, &st->dst);
971 
972 	return (sync);
973 }
974 
975 int
976 pfsync_in_upd(caddr_t buf, int len, int count, int flags)
977 {
978 	struct pfsync_state *sp;
979 	struct pf_state_cmp id_key;
980 	struct pf_state *st;
981 	int sync, error;
982 	int i;
983 
984 	for (i = 0; i < count; i++) {
985 		sp = (struct pfsync_state *)(buf + len * i);
986 
987 		/* check for invalid values */
988 		if (sp->timeout >= PFTM_MAX ||
989 		    sp->src.state > PF_TCPS_PROXY_DST ||
990 		    sp->dst.state > PF_TCPS_PROXY_DST) {
991 			DPFPRINTF(LOG_NOTICE,
992 			    "pfsync_input: PFSYNC_ACT_UPD: invalid value");
993 			pfsyncstat_inc(pfsyncs_badval);
994 			continue;
995 		}
996 
997 		id_key.id = sp->id;
998 		id_key.creatorid = sp->creatorid;
999 
1000 		PF_STATE_ENTER_READ();
1001 		st = pf_find_state_byid(&id_key);
1002 		pf_state_ref(st);
1003 		PF_STATE_EXIT_READ();
1004 		if (st == NULL) {
1005 			/* insert the update */
1006 			PF_LOCK();
1007 			error = pfsync_state_import(sp, flags);
1008 			if (error)
1009 				pfsyncstat_inc(pfsyncs_badstate);
1010 			PF_UNLOCK();
1011 			continue;
1012 		}
1013 
1014 		if (ISSET(st->state_flags, PFSTATE_ACK))
1015 			pfsync_deferred(st, 1);
1016 
1017 		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
1018 			sync = pfsync_upd_tcp(st, &sp->src, &sp->dst);
1019 		else {
1020 			sync = 0;
1021 
1022 			/*
1023 			 * Non-TCP protocol state machines always go
1024 			 * forward
1025 			 */
1026 			if (st->src.state > sp->src.state)
1027 				sync++;
1028 			else
1029 				pf_state_peer_ntoh(&sp->src, &st->src);
1030 
1031 			if (st->dst.state > sp->dst.state)
1032 				sync++;
1033 			else
1034 				pf_state_peer_ntoh(&sp->dst, &st->dst);
1035 		}
1036 
1037 		if (sync < 2) {
1038 			pfsync_alloc_scrub_memory(&sp->dst, &st->dst);
1039 			pf_state_peer_ntoh(&sp->dst, &st->dst);
1040 			st->expire = getuptime();
1041 			st->timeout = sp->timeout;
1042 		}
1043 		st->pfsync_time = getuptime();
1044 
1045 		if (sync) {
1046 			pfsyncstat_inc(pfsyncs_stale);
1047 
1048 			pfsync_update_state(st);
1049 			schednetisr(NETISR_PFSYNC);
1050 		}
1051 
1052 		pf_state_unref(st);
1053 	}
1054 
1055 	return (0);
1056 }
1057 
1058 int
1059 pfsync_in_upd_c(caddr_t buf, int len, int count, int flags)
1060 {
1061 	struct pfsync_upd_c *up;
1062 	struct pf_state_cmp id_key;
1063 	struct pf_state *st;
1064 
1065 	int sync;
1066 
1067 	int i;
1068 
1069 	for (i = 0; i < count; i++) {
1070 		up = (struct pfsync_upd_c *)(buf + len * i);
1071 
1072 		/* check for invalid values */
1073 		if (up->timeout >= PFTM_MAX ||
1074 		    up->src.state > PF_TCPS_PROXY_DST ||
1075 		    up->dst.state > PF_TCPS_PROXY_DST) {
1076 			DPFPRINTF(LOG_NOTICE,
1077 			    "pfsync_input: PFSYNC_ACT_UPD_C: invalid value");
1078 			pfsyncstat_inc(pfsyncs_badval);
1079 			continue;
1080 		}
1081 
1082 		id_key.id = up->id;
1083 		id_key.creatorid = up->creatorid;
1084 
1085 		PF_STATE_ENTER_READ();
1086 		st = pf_find_state_byid(&id_key);
1087 		pf_state_ref(st);
1088 		PF_STATE_EXIT_READ();
1089 		if (st == NULL) {
1090 			/* We don't have this state. Ask for it. */
1091 			pfsync_request_update(id_key.creatorid, id_key.id);
1092 			continue;
1093 		}
1094 
1095 		if (ISSET(st->state_flags, PFSTATE_ACK))
1096 			pfsync_deferred(st, 1);
1097 
1098 		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
1099 			sync = pfsync_upd_tcp(st, &up->src, &up->dst);
1100 		else {
1101 			sync = 0;
1102 			/*
1103 			 * Non-TCP protocol state machines always go
1104 			 * forward
1105 			 */
1106 			if (st->src.state > up->src.state)
1107 				sync++;
1108 			else
1109 				pf_state_peer_ntoh(&up->src, &st->src);
1110 
1111 			if (st->dst.state > up->dst.state)
1112 				sync++;
1113 			else
1114 				pf_state_peer_ntoh(&up->dst, &st->dst);
1115 		}
1116 		if (sync < 2) {
1117 			pfsync_alloc_scrub_memory(&up->dst, &st->dst);
1118 			pf_state_peer_ntoh(&up->dst, &st->dst);
1119 			st->expire = getuptime();
1120 			st->timeout = up->timeout;
1121 		}
1122 		st->pfsync_time = getuptime();
1123 
1124 		if (sync) {
1125 			pfsyncstat_inc(pfsyncs_stale);
1126 
1127 			pfsync_update_state(st);
1128 			schednetisr(NETISR_PFSYNC);
1129 		}
1130 
1131 		pf_state_unref(st);
1132 	}
1133 
1134 	return (0);
1135 }
1136 
1137 int
1138 pfsync_in_ureq(caddr_t buf, int len, int count, int flags)
1139 {
1140 	struct pfsync_upd_req *ur;
1141 	int i;
1142 
1143 	struct pf_state_cmp id_key;
1144 	struct pf_state *st;
1145 
1146 	for (i = 0; i < count; i++) {
1147 		ur = (struct pfsync_upd_req *)(buf + len * i);
1148 
1149 		id_key.id = ur->id;
1150 		id_key.creatorid = ur->creatorid;
1151 
1152 		if (id_key.id == 0 && id_key.creatorid == 0)
1153 			pfsync_bulk_start();
1154 		else {
1155 			PF_STATE_ENTER_READ();
1156 			st = pf_find_state_byid(&id_key);
1157 			pf_state_ref(st);
1158 			PF_STATE_EXIT_READ();
1159 			if (st == NULL) {
1160 				pfsyncstat_inc(pfsyncs_badstate);
1161 				continue;
1162 			}
1163 			if (ISSET(st->state_flags, PFSTATE_NOSYNC)) {
1164 				pf_state_unref(st);
1165 				continue;
1166 			}
1167 
1168 			pfsync_update_state_req(st);
1169 			pf_state_unref(st);
1170 		}
1171 	}
1172 
1173 	return (0);
1174 }
1175 
1176 int
1177 pfsync_in_del(caddr_t buf, int len, int count, int flags)
1178 {
1179 	struct pfsync_state *sp;
1180 	struct pf_state_cmp id_key;
1181 	struct pf_state *st;
1182 	int i;
1183 
1184 	PF_STATE_ENTER_WRITE();
1185 	for (i = 0; i < count; i++) {
1186 		sp = (struct pfsync_state *)(buf + len * i);
1187 
1188 		id_key.id = sp->id;
1189 		id_key.creatorid = sp->creatorid;
1190 
1191 		st = pf_find_state_byid(&id_key);
1192 		if (st == NULL) {
1193 			pfsyncstat_inc(pfsyncs_badstate);
1194 			continue;
1195 		}
1196 		SET(st->state_flags, PFSTATE_NOSYNC);
1197 		pf_remove_state(st);
1198 	}
1199 	PF_STATE_EXIT_WRITE();
1200 
1201 	return (0);
1202 }
1203 
1204 int
1205 pfsync_in_del_c(caddr_t buf, int len, int count, int flags)
1206 {
1207 	struct pfsync_del_c *sp;
1208 	struct pf_state_cmp id_key;
1209 	struct pf_state *st;
1210 	int i;
1211 
1212 	PF_LOCK();
1213 	PF_STATE_ENTER_WRITE();
1214 	for (i = 0; i < count; i++) {
1215 		sp = (struct pfsync_del_c *)(buf + len * i);
1216 
1217 		id_key.id = sp->id;
1218 		id_key.creatorid = sp->creatorid;
1219 
1220 		st = pf_find_state_byid(&id_key);
1221 		if (st == NULL) {
1222 			pfsyncstat_inc(pfsyncs_badstate);
1223 			continue;
1224 		}
1225 
1226 		SET(st->state_flags, PFSTATE_NOSYNC);
1227 		pf_remove_state(st);
1228 	}
1229 	PF_STATE_EXIT_WRITE();
1230 	PF_UNLOCK();
1231 
1232 	return (0);
1233 }
1234 
1235 int
1236 pfsync_in_bus(caddr_t buf, int len, int count, int flags)
1237 {
1238 	struct pfsync_softc *sc = pfsyncif;
1239 	struct pfsync_bus *bus;
1240 
1241 	/* If we're not waiting for a bulk update, who cares. */
1242 	if (sc->sc_ureq_sent == 0)
1243 		return (0);
1244 
1245 	bus = (struct pfsync_bus *)buf;
1246 
1247 	switch (bus->status) {
1248 	case PFSYNC_BUS_START:
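		/*
		 * Allow 4 seconds plus roughly one tick per packet worth
		 * of states the peer may have to send before declaring
		 * the bulk update failed.
		 */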
1249 		timeout_add(&sc->sc_bulkfail_tmo, 4 * hz +
1250 		    pf_pool_limits[PF_LIMIT_STATES].limit /
1251 		    ((sc->sc_if.if_mtu - PFSYNC_MINPKT) /
1252 		    sizeof(struct pfsync_state)));
1253 		DPFPRINTF(LOG_INFO, "received bulk update start");
1254 		break;
1255 
1256 	case PFSYNC_BUS_END:
1257 		if (getuptime() - ntohl(bus->endtime) >=
1258 		    sc->sc_ureq_sent) {
1259 			/* that's it, we're happy */
1260 			sc->sc_ureq_sent = 0;
1261 			sc->sc_bulk_tries = 0;
1262 			timeout_del(&sc->sc_bulkfail_tmo);
1263 #if NCARP > 0
1264 			if (!pfsync_sync_ok)
1265 				carp_group_demote_adj(&sc->sc_if, -1,
1266 				    sc->sc_link_demoted ?
1267 				    "pfsync link state up" :
1268 				    "pfsync bulk done");
1269 			if (sc->sc_initial_bulk) {
1270 				carp_group_demote_adj(&sc->sc_if, -32,
1271 				    "pfsync init");
1272 				sc->sc_initial_bulk = 0;
1273 			}
1274 #endif
1275 			pfsync_sync_ok = 1;
1276 			sc->sc_link_demoted = 0;
1277 			DPFPRINTF(LOG_INFO, "received valid bulk update end");
1278 		} else {
1279 			DPFPRINTF(LOG_WARNING, "received invalid "
1280 			    "bulk update end: bad timestamp");
1281 		}
1282 		break;
1283 	}
1284 
1285 	return (0);
1286 }
1287 
1288 int
1289 pfsync_in_tdb(caddr_t buf, int len, int count, int flags)
1290 {
1291 #if defined(IPSEC)
1292 	struct pfsync_tdb *tp;
1293 	int i;
1294 
1295 	for (i = 0; i < count; i++) {
1296 		tp = (struct pfsync_tdb *)(buf + len * i);
1297 		pfsync_update_net_tdb(tp);
1298 	}
1299 #endif
1300 
1301 	return (0);
1302 }
1303 
1304 #if defined(IPSEC)
1305 /* Update an in-kernel tdb. Silently fail if no tdb is found. */
1306 void
1307 pfsync_update_net_tdb(struct pfsync_tdb *pt)
1308 {
1309 	struct tdb		*tdb;
1310 
1311 	NET_ASSERT_LOCKED();
1312 
1313 	/* check for invalid values */
1314 	if (ntohl(pt->spi) <= SPI_RESERVED_MAX ||
1315 	    (pt->dst.sa.sa_family != AF_INET &&
1316 	     pt->dst.sa.sa_family != AF_INET6))
1317 		goto bad;
1318 
1319 	tdb = gettdb(ntohs(pt->rdomain), pt->spi,
1320 	    (union sockaddr_union *)&pt->dst, pt->sproto);
1321 	if (tdb) {
1322 		pt->rpl = betoh64(pt->rpl);
1323 		pt->cur_bytes = betoh64(pt->cur_bytes);
1324 
1325 		/* Neither replay nor byte counter should ever decrease. */
1326 		if (pt->rpl < tdb->tdb_rpl ||
1327 		    pt->cur_bytes < tdb->tdb_cur_bytes) {
1328 			tdb_unref(tdb);
1329 			goto bad;
1330 		}
1331 
1332 		tdb->tdb_rpl = pt->rpl;
1333 		tdb->tdb_cur_bytes = pt->cur_bytes;
1334 		tdb_unref(tdb);
1335 	}
1336 	return;
1337 
1338  bad:
1339 	DPFPRINTF(LOG_WARNING, "pfsync_insert: PFSYNC_ACT_TDB_UPD: "
1340 	    "invalid value");
1341 	pfsyncstat_inc(pfsyncs_badstate);
1342 	return;
1343 }
1344 #endif
1345 
1346 
1347 int
1348 pfsync_in_eof(caddr_t buf, int len, int count, int flags)
1349 {
1350 	if (len > 0 || count > 0)
1351 		pfsyncstat_inc(pfsyncs_badact);
1352 
1353 	/* we're done. let the caller return */
1354 	return (1);
1355 }
1356 
1357 int
1358 pfsync_in_error(caddr_t buf, int len, int count, int flags)
1359 {
1360 	pfsyncstat_inc(pfsyncs_badact);
1361 	return (-1);
1362 }
1363 
1364 int
1365 pfsyncoutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
1366 	struct rtentry *rt)
1367 {
1368 	m_freem(m);	/* drop packet */
1369 	return (EAFNOSUPPORT);
1370 }
1371 
1372 int
1373 pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
1374 {
1375 	struct proc *p = curproc;
1376 	struct pfsync_softc *sc = ifp->if_softc;
1377 	struct ifreq *ifr = (struct ifreq *)data;
1378 	struct ip_moptions *imo = &sc->sc_imo;
1379 	struct pfsyncreq pfsyncr;
1380 	struct ifnet *ifp0, *sifp;
1381 	struct ip *ip;
1382 	int error;
1383 
1384 	switch (cmd) {
1385 	case SIOCSIFFLAGS:
1386 		if ((ifp->if_flags & IFF_RUNNING) == 0 &&
1387 		    (ifp->if_flags & IFF_UP)) {
1388 			ifp->if_flags |= IFF_RUNNING;
1389 
1390 #if NCARP > 0
1391 			sc->sc_initial_bulk = 1;
1392 			carp_group_demote_adj(&sc->sc_if, 32, "pfsync init");
1393 #endif
1394 
1395 			pfsync_request_full_update(sc);
1396 		}
1397 		if ((ifp->if_flags & IFF_RUNNING) &&
1398 		    (ifp->if_flags & IFF_UP) == 0) {
1399 			ifp->if_flags &= ~IFF_RUNNING;
1400 
1401 			/* drop everything */
1402 			timeout_del(&sc->sc_tmo);
1403 			pfsync_drop(sc);
1404 
1405 			pfsync_cancel_full_update(sc);
1406 		}
1407 		break;
1408 	case SIOCSIFMTU:
1409 		if ((ifp0 = if_get(sc->sc_sync_ifidx)) == NULL)
1410 			return (EINVAL);
1411 		error = 0;
1412 		if (ifr->ifr_mtu <= PFSYNC_MINPKT ||
1413 		    ifr->ifr_mtu > ifp0->if_mtu) {
1414 			error = EINVAL;
1415 		}
1416 		if_put(ifp0);
1417 		if (error)
1418 			return error;
1419 		if (ifr->ifr_mtu < ifp->if_mtu)
1420 			pfsync_sendout();
1421 		ifp->if_mtu = ifr->ifr_mtu;
1422 		break;
1423 	case SIOCGETPFSYNC:
1424 		bzero(&pfsyncr, sizeof(pfsyncr));
1425 		if ((ifp0 = if_get(sc->sc_sync_ifidx)) != NULL) {
1426 			strlcpy(pfsyncr.pfsyncr_syncdev,
1427 			    ifp0->if_xname, IFNAMSIZ);
1428 		}
1429 		if_put(ifp0);
1430 		pfsyncr.pfsyncr_syncpeer = sc->sc_sync_peer;
1431 		pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates;
1432 		pfsyncr.pfsyncr_defer = sc->sc_defer;
1433 		return (copyout(&pfsyncr, ifr->ifr_data, sizeof(pfsyncr)));
1434 
1435 	case SIOCSETPFSYNC:
1436 		if ((error = suser(p)) != 0)
1437 			return (error);
1438 		if ((error = copyin(ifr->ifr_data, &pfsyncr, sizeof(pfsyncr))))
1439 			return (error);
1440 
1441 		if (pfsyncr.pfsyncr_syncpeer.s_addr == 0)
1442 			sc->sc_sync_peer.s_addr = INADDR_PFSYNC_GROUP;
1443 		else
1444 			sc->sc_sync_peer.s_addr =
1445 			    pfsyncr.pfsyncr_syncpeer.s_addr;
1446 
1447 		if (pfsyncr.pfsyncr_maxupdates > 255)
1448 			return (EINVAL);
1449 		sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates;
1450 
1451 		sc->sc_defer = pfsyncr.pfsyncr_defer;
1452 
1453 		if (pfsyncr.pfsyncr_syncdev[0] == 0) {
1454 			if ((ifp0 = if_get(sc->sc_sync_ifidx)) != NULL) {
1455 				if_linkstatehook_del(ifp0, &sc->sc_ltask);
1456 				if_detachhook_del(ifp0, &sc->sc_dtask);
1457 			}
1458 			if_put(ifp0);
1459 			sc->sc_sync_ifidx = 0;
1460 			if (imo->imo_num_memberships > 0) {
1461 				in_delmulti(imo->imo_membership[
1462 				    --imo->imo_num_memberships]);
1463 				imo->imo_ifidx = 0;
1464 			}
1465 			break;
1466 		}
1467 
1468 		if ((sifp = if_unit(pfsyncr.pfsyncr_syncdev)) == NULL)
1469 			return (EINVAL);
1470 
1471 		ifp0 = if_get(sc->sc_sync_ifidx);
1472 
1473 		if (sifp->if_mtu < sc->sc_if.if_mtu || (ifp0 != NULL &&
1474 		    sifp->if_mtu < ifp0->if_mtu) ||
1475 		    sifp->if_mtu < MCLBYTES - sizeof(struct ip))
1476 			pfsync_sendout();
1477 
1478 		if (ifp0) {
1479 			if_linkstatehook_del(ifp0, &sc->sc_ltask);
1480 			if_detachhook_del(ifp0, &sc->sc_dtask);
1481 		}
1482 		if_put(ifp0);
1483 		sc->sc_sync_ifidx = sifp->if_index;
1484 
1485 		if (imo->imo_num_memberships > 0) {
1486 			in_delmulti(imo->imo_membership[--imo->imo_num_memberships]);
1487 			imo->imo_ifidx = 0;
1488 		}
1489 
1490 		if (sc->sc_sync_peer.s_addr == INADDR_PFSYNC_GROUP) {
1491 			struct in_addr addr;
1492 
1493 			if (!(sifp->if_flags & IFF_MULTICAST)) {
1494 				sc->sc_sync_ifidx = 0;
1495 				if_put(sifp);
1496 				return (EADDRNOTAVAIL);
1497 			}
1498 
1499 			addr.s_addr = INADDR_PFSYNC_GROUP;
1500 
1501 			if ((imo->imo_membership[0] =
1502 			    in_addmulti(&addr, sifp)) == NULL) {
1503 				sc->sc_sync_ifidx = 0;
1504 				if_put(sifp);
1505 				return (ENOBUFS);
1506 			}
1507 			imo->imo_num_memberships++;
1508 			imo->imo_ifidx = sc->sc_sync_ifidx;
1509 			imo->imo_ttl = PFSYNC_DFLTTL;
1510 			imo->imo_loop = 0;
1511 		}
1512 
1513 		ip = &sc->sc_template;
1514 		bzero(ip, sizeof(*ip));
1515 		ip->ip_v = IPVERSION;
1516 		ip->ip_hl = sizeof(sc->sc_template) >> 2;
1517 		ip->ip_tos = IPTOS_LOWDELAY;
1518 		/* len and id are set later */
1519 		ip->ip_off = htons(IP_DF);
1520 		ip->ip_ttl = PFSYNC_DFLTTL;
1521 		ip->ip_p = IPPROTO_PFSYNC;
1522 		ip->ip_src.s_addr = INADDR_ANY;
1523 		ip->ip_dst.s_addr = sc->sc_sync_peer.s_addr;
1524 
1525 		if_linkstatehook_add(sifp, &sc->sc_ltask);
1526 		if_detachhook_add(sifp, &sc->sc_dtask);
1527 		if_put(sifp);
1528 
1529 		pfsync_request_full_update(sc);
1530 
1531 		break;
1532 
1533 	default:
1534 		return (ENOTTY);
1535 	}
1536 
1537 	return (0);
1538 }
1539 
1540 void
1541 pfsync_out_state(struct pf_state *st, void *buf)
1542 {
1543 	struct pfsync_state *sp = buf;
1544 
1545 	pfsync_state_export(sp, st);
1546 }
1547 
1548 void
1549 pfsync_out_iack(struct pf_state *st, void *buf)
1550 {
1551 	struct pfsync_ins_ack *iack = buf;
1552 
1553 	iack->id = st->id;
1554 	iack->creatorid = st->creatorid;
1555 }
1556 
1557 void
1558 pfsync_out_upd_c(struct pf_state *st, void *buf)
1559 {
1560 	struct pfsync_upd_c *up = buf;
1561 
1562 	bzero(up, sizeof(*up));
1563 	up->id = st->id;
1564 	pf_state_peer_hton(&st->src, &up->src);
1565 	pf_state_peer_hton(&st->dst, &up->dst);
1566 	up->creatorid = st->creatorid;
1567 	up->timeout = st->timeout;
1568 }
1569 
1570 void
1571 pfsync_out_del(struct pf_state *st, void *buf)
1572 {
1573 	struct pfsync_del_c *dp = buf;
1574 
1575 	dp->id = st->id;
1576 	dp->creatorid = st->creatorid;
1577 
1578 	SET(st->state_flags, PFSTATE_NOSYNC);
1579 }
1580 
1581 void
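/*
 * Atomically detach all pending work from the softc.  The per-queue
 * mutexes, the update request mutex and the tdb mutex are always taken
 * in this same order (and released in reverse), so concurrent snapshot
 * takers cannot deadlock against one another.
 */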
1582 pfsync_grab_snapshot(struct pfsync_snapshot *sn, struct pfsync_softc *sc)
1583 {
1584 	int q;
1585 
1586 	sn->sn_sc = sc;
1587 
1588 	for (q = 0; q < PFSYNC_S_COUNT; q++)
1589 		mtx_enter(&sc->sc_mtx[q]);
1590 
1591 	mtx_enter(&sc->sc_upd_req_mtx);
1592 	mtx_enter(&sc->sc_tdb_mtx);
1593 
1594 	for (q = 0; q < PFSYNC_S_COUNT; q++) {
1595 		TAILQ_INIT(&sn->sn_qs[q]);
1596 		TAILQ_CONCAT(&sn->sn_qs[q], &sc->sc_qs[q], sync_list);
1597 	}
1598 
1599 	TAILQ_INIT(&sn->sn_upd_req_list);
1600 	TAILQ_CONCAT(&sn->sn_upd_req_list, &sc->sc_upd_req_list, ur_entry);
1601 
1602 	TAILQ_INIT(&sn->sn_tdb_q);
1603 	TAILQ_CONCAT(&sn->sn_tdb_q, &sc->sc_tdb_q, tdb_sync_entry);
1604 
1605 	sn->sn_len = sc->sc_len;
1606 	sc->sc_len = PFSYNC_MINPKT;
1607 
1608 	sn->sn_plus = sc->sc_plus;
1609 	sc->sc_plus = NULL;
1610 	sn->sn_pluslen = sc->sc_pluslen;
1611 	sc->sc_pluslen = 0;
1612 
1613 	mtx_leave(&sc->sc_tdb_mtx);
1614 	mtx_leave(&sc->sc_upd_req_mtx);
1615 
1616 	for (q = (PFSYNC_S_COUNT - 1); q >= 0; q--)
1617 		mtx_leave(&sc->sc_mtx[q]);
1618 }
1619 
1620 void
1621 pfsync_drop_snapshot(struct pfsync_snapshot *sn)
1622 {
1623 	struct pf_state *st;
1624 	struct pfsync_upd_req_item *ur;
1625 	struct tdb *t;
1626 	int q;
1627 
1628 
1629 	for (q = 0; q < PFSYNC_S_COUNT; q++) {
1630 		if (TAILQ_EMPTY(&sn->sn_qs[q]))
1631 			continue;
1632 
1633 		while ((st = TAILQ_FIRST(&sn->sn_qs[q])) != NULL) {
1634 			TAILQ_REMOVE(&sn->sn_qs[q], st, sync_list);
1635 #ifdef PFSYNC_DEBUG
1636 			KASSERT(st->sync_state == q);
1637 #endif
1638 			st->sync_state = PFSYNC_S_NONE;
1639 			pf_state_unref(st);
1640 		}
1641 	}
1642 
1643 	while ((ur = TAILQ_FIRST(&sn->sn_upd_req_list)) != NULL) {
1644 		TAILQ_REMOVE(&sn->sn_upd_req_list, ur, ur_entry);
1645 		pool_put(&sn->sn_sc->sc_pool, ur);
1646 	}
1647 
1648 	while ((t = TAILQ_FIRST(&sn->sn_tdb_q)) != NULL) {
1649 		TAILQ_REMOVE(&sn->sn_tdb_q, t, tdb_sync_entry);
1650 		CLR(t->tdb_flags, TDBF_PFSYNC);
1651 	}
1652 }
1653 
1654 int
1655 pfsync_is_snapshot_empty(struct pfsync_snapshot *sn)
1656 {
1657 	int	q;
1658 
1659 	for (q = 0; q < PFSYNC_S_COUNT; q++)
1660 		if (!TAILQ_EMPTY(&sn->sn_qs[q]))
1661 			return (0);
1662 
1663 	if (!TAILQ_EMPTY(&sn->sn_upd_req_list))
1664 		return (0);
1665 
1666 	if (!TAILQ_EMPTY(&sn->sn_tdb_q))
1667 		return (0);
1668 
1669 	return (sn->sn_plus == NULL);
1670 }
1671 
1672 void
1673 pfsync_drop(struct pfsync_softc *sc)
1674 {
1675 	struct pfsync_snapshot	sn;
1676 
1677 	pfsync_grab_snapshot(&sn, sc);
1678 	pfsync_drop_snapshot(&sn);
1679 }
1680 
1681 void
1682 pfsync_send_dispatch(void *xmq)
1683 {
1684 	struct mbuf_queue *mq = xmq;
1685 	struct pfsync_softc *sc;
1686 	struct mbuf *m;
1687 	struct mbuf_list ml;
1688 	int error;
1689 
1690 	mq_delist(mq, &ml);
1691 	if (ml_empty(&ml))
1692 		return;
1693 
1694 	NET_LOCK();
1695 	sc = pfsyncif;
1696 	if (sc == NULL) {
1697 		ml_purge(&ml);
1698 		goto done;
1699 	}
1700 
1701 	while ((m = ml_dequeue(&ml)) != NULL) {
1702 		if ((error = ip_output(m, NULL, NULL, IP_RAWOUTPUT,
1703 		    &sc->sc_imo, NULL, 0)) == 0)
1704 			pfsyncstat_inc(pfsyncs_opackets);
1705 		else {
1706 			DPFPRINTF(LOG_DEBUG,
1707 			    "ip_output() @ %s failed (%d)\n", __func__, error);
1708 			pfsyncstat_inc(pfsyncs_oerrors);
1709 		}
1710 	}
1711 done:
1712 	NET_UNLOCK();
1713 }
1714 
1715 void
1716 pfsync_send_pkt(struct mbuf *m)
1717 {
1718 	if (mq_enqueue(&pfsync_mq, m) != 0) {
1719 		pfsyncstat_inc(pfsyncs_oerrors);
1720 		DPFPRINTF(LOG_DEBUG, "mq_enqueue() @ %s failed, queue full\n",
1721 		    __func__);
1722 	} else
1723 		task_add(net_tq(0), &pfsync_task);
1724 }
1725 
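/*
 * Build and transmit a single pfsync packet.  Pending work is
 * snapshotted first, then serialized in a fixed order: IP header,
 * pfsync header, queued update requests, the optional "plus" region,
 * queued tdbs and finally the per-queue state messages; the finished
 * mbuf is tapped to bpf and handed to pfsync_send_pkt().
 */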
1726 void
1727 pfsync_sendout(void)
1728 {
1729 	struct pfsync_snapshot sn;
1730 	struct pfsync_softc *sc = pfsyncif;
1731 #if NBPFILTER > 0
1732 	struct ifnet *ifp = &sc->sc_if;
1733 #endif
1734 	struct mbuf *m;
1735 	struct ip *ip;
1736 	struct pfsync_header *ph;
1737 	struct pfsync_subheader *subh;
1738 	struct pf_state *st;
1739 	struct pfsync_upd_req_item *ur;
1740 	struct tdb *t;
1741 	int offset;
1742 	int q, count = 0;
1743 
1744 	if (sc == NULL || sc->sc_len == PFSYNC_MINPKT)
1745 		return;
1746 
1747 	if (!ISSET(sc->sc_if.if_flags, IFF_RUNNING) ||
1748 #if NBPFILTER > 0
1749 	    (ifp->if_bpf == NULL && sc->sc_sync_ifidx == 0)) {
1750 #else
1751 	    sc->sc_sync_ifidx == 0) {
1752 #endif
1753 		pfsync_drop(sc);
1754 		return;
1755 	}
1756 
1757 	pfsync_grab_snapshot(&sn, sc);
1758 
1759 	/*
1760 	 * The check below is sufficient to prevent us from sending empty
1761 	 * packets, but it does not stop us from sending short packets.
1762 	 */
1763 	if (pfsync_is_snapshot_empty(&sn))
1764 		return;
1765 
1766 	MGETHDR(m, M_DONTWAIT, MT_DATA);
1767 	if (m == NULL) {
1768 		sc->sc_if.if_oerrors++;
1769 		pfsyncstat_inc(pfsyncs_onomem);
1770 		pfsync_drop_snapshot(&sn);
1771 		return;
1772 	}
1773 
1774 	if (max_linkhdr + sn.sn_len > MHLEN) {
1775 		MCLGETL(m, M_DONTWAIT, max_linkhdr + sn.sn_len);
1776 		if (!ISSET(m->m_flags, M_EXT)) {
1777 			m_free(m);
1778 			sc->sc_if.if_oerrors++;
1779 			pfsyncstat_inc(pfsyncs_onomem);
1780 			pfsync_drop_snapshot(&sn);
1781 			return;
1782 		}
1783 	}
1784 	m->m_data += max_linkhdr;
1785 	m->m_len = m->m_pkthdr.len = sn.sn_len;
1786 
1787 	/* build the ip header */
1788 	ip = mtod(m, struct ip *);
1789 	bcopy(&sc->sc_template, ip, sizeof(*ip));
1790 	offset = sizeof(*ip);
1791 
1792 	ip->ip_len = htons(m->m_pkthdr.len);
1793 	ip->ip_id = htons(ip_randomid());
1794 
1795 	/* build the pfsync header */
1796 	ph = (struct pfsync_header *)(m->m_data + offset);
1797 	bzero(ph, sizeof(*ph));
1798 	offset += sizeof(*ph);
1799 
1800 	ph->version = PFSYNC_VERSION;
1801 	ph->len = htons(sn.sn_len - sizeof(*ip));
1802 	bcopy(pf_status.pf_chksum, ph->pfcksum, PF_MD5_DIGEST_LENGTH);
1803 
1804 	if (!TAILQ_EMPTY(&sn.sn_upd_req_list)) {
1805 		subh = (struct pfsync_subheader *)(m->m_data + offset);
1806 		offset += sizeof(*subh);
1807 
1808 		count = 0;
1809 		while ((ur = TAILQ_FIRST(&sn.sn_upd_req_list)) != NULL) {
1810 			TAILQ_REMOVE(&sn.sn_upd_req_list, ur, ur_entry);
1811 
1812 			bcopy(&ur->ur_msg, m->m_data + offset,
1813 			    sizeof(ur->ur_msg));
1814 			offset += sizeof(ur->ur_msg);
1815 
1816 			pool_put(&sc->sc_pool, ur);
1817 
1818 			count++;
1819 		}
1820 
1821 		bzero(subh, sizeof(*subh));
1822 		subh->len = sizeof(ur->ur_msg) >> 2;
1823 		subh->action = PFSYNC_ACT_UPD_REQ;
1824 		subh->count = htons(count);
1825 	}
1826 
1827 	/* has someone built a custom region for us to add? */
1828 	if (sn.sn_plus != NULL) {
1829 		bcopy(sn.sn_plus, m->m_data + offset, sn.sn_pluslen);
1830 		offset += sn.sn_pluslen;
1831 		sn.sn_plus = NULL;	/* XXX memory leak ? */
1832 	}
1833 
1834 	if (!TAILQ_EMPTY(&sn.sn_tdb_q)) {
1835 		subh = (struct pfsync_subheader *)(m->m_data + offset);
1836 		offset += sizeof(*subh);
1837 
1838 		count = 0;
1839 		while ((t = TAILQ_FIRST(&sn.sn_tdb_q)) != NULL) {
1840 			TAILQ_REMOVE(&sn.sn_tdb_q, t, tdb_sync_entry);
1841 			pfsync_out_tdb(t, m->m_data + offset);
1842 			offset += sizeof(struct pfsync_tdb);
1843 			CLR(t->tdb_flags, TDBF_PFSYNC);
1844 			count++;
1845 		}
1846 
1847 		bzero(subh, sizeof(*subh));
1848 		subh->action = PFSYNC_ACT_TDB;
1849 		subh->len = sizeof(struct pfsync_tdb) >> 2;
1850 		subh->count = htons(count);
1851 	}
1852 
1853 	/* walk the queues */
1854 	for (q = 0; q < PFSYNC_S_COUNT; q++) {
1855 		if (TAILQ_EMPTY(&sn.sn_qs[q]))
1856 			continue;
1857 
1858 		subh = (struct pfsync_subheader *)(m->m_data + offset);
1859 		offset += sizeof(*subh);
1860 
1861 		count = 0;
1862 		while ((st = TAILQ_FIRST(&sn.sn_qs[q])) != NULL) {
1863 			TAILQ_REMOVE(&sn.sn_qs[q], st, sync_list);
1864 #ifdef PFSYNC_DEBUG
1865 			KASSERT(st->sync_state == q);
1866 #endif
1867 			st->sync_state = PFSYNC_S_NONE;
1868 			pfsync_qs[q].write(st, m->m_data + offset);
1869 			offset += pfsync_qs[q].len;
1870 
1871 			pf_state_unref(st);
1872 			count++;
1873 		}
1874 
1875 		bzero(subh, sizeof(*subh));
1876 		subh->action = pfsync_qs[q].action;
1877 		subh->len = pfsync_qs[q].len >> 2;
1878 		subh->count = htons(count);
1879 	}
1880 
1881 	/* we're done, let's put it on the wire */
1882 #if NBPFILTER > 0
1883 	if (ifp->if_bpf) {
1884 		m->m_data += sizeof(*ip);
1885 		m->m_len = m->m_pkthdr.len = sn.sn_len - sizeof(*ip);
1886 		bpf_mtap(ifp->if_bpf, m, BPF_DIRECTION_OUT);
1887 		m->m_data -= sizeof(*ip);
1888 		m->m_len = m->m_pkthdr.len = sn.sn_len;
1889 	}
1890 
1891 	if (sc->sc_sync_ifidx == 0) {
1892 		sc->sc_len = PFSYNC_MINPKT;
1893 		m_freem(m);
1894 		return;
1895 	}
1896 #endif
1897 
1898 	sc->sc_if.if_opackets++;
1899 	sc->sc_if.if_obytes += m->m_pkthdr.len;
1900 
1901 	m->m_pkthdr.ph_rtableid = sc->sc_if.if_rdomain;
1902 
1903 	pfsync_send_pkt(m);
1904 }
1905 
1906 void
1907 pfsync_insert_state(struct pf_state *st)
1908 {
1909 	struct pfsync_softc *sc = pfsyncif;
1910 
1911 	NET_ASSERT_LOCKED();
1912 
1913 	if (ISSET(st->rule.ptr->rule_flag, PFRULE_NOSYNC) ||
1914 	    st->key[PF_SK_WIRE]->proto == IPPROTO_PFSYNC) {
1915 		SET(st->state_flags, PFSTATE_NOSYNC);
1916 		return;
1917 	}
1918 
1919 	if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING) ||
1920 	    ISSET(st->state_flags, PFSTATE_NOSYNC))
1921 		return;
1922 
1923 #ifdef PFSYNC_DEBUG
1924 	KASSERT(st->sync_state == PFSYNC_S_NONE);
1925 #endif
1926 
1927 	if (sc->sc_len == PFSYNC_MINPKT)
1928 		timeout_add_sec(&sc->sc_tmo, 1);
1929 
1930 	pfsync_q_ins(st, PFSYNC_S_INS);
1931 
1932 	st->sync_updates = 0;
1933 }
1934 
1935 int
1936 pfsync_defer(struct pf_state *st, struct mbuf *m, struct pfsync_deferral **ppd)
1937 {
1938 	struct pfsync_softc *sc = pfsyncif;
1939 	struct pfsync_deferral *pd;
1940 	unsigned int sched;
1941 
1942 	NET_ASSERT_LOCKED();
1943 
1944 	if (!sc->sc_defer ||
1945 	    ISSET(st->state_flags, PFSTATE_NOSYNC) ||
1946 	    m->m_flags & (M_BCAST|M_MCAST))
1947 		return (0);
1948 
1949 	pd = pool_get(&sc->sc_pool, M_NOWAIT);
1950 	if (pd == NULL)
1951 		return (0);
1952 
1953 	/*
1954 	 * If the deferral queue grows faster than the timeout can drain
1955 	 * it, we ask the packet (i.e. the caller) to help the timer by
1956 	 * dispatching one deferral for us.
1957 	 *
1958 	 * We would like to call pfsync_undefer() here.  Unfortunately we
1959 	 * can't, because pfsync_undefer() ends up calling ip_output(),
1960 	 * which in turn calls pf_test(), which would then attempt to
1961 	 * grab the PF_LOCK() we currently hold.
1962 	 */
1963 	if (sc->sc_deferred >= 128) {
1964 		mtx_enter(&sc->sc_deferrals_mtx);
1965 		*ppd = TAILQ_FIRST(&sc->sc_deferrals);
1966 		if (*ppd != NULL) {
1967 			TAILQ_REMOVE(&sc->sc_deferrals, *ppd, pd_entry);
1968 			sc->sc_deferred--;
1969 		}
1970 		mtx_leave(&sc->sc_deferrals_mtx);
1971 	} else
1972 		*ppd = NULL;
1973 
1974 	m->m_pkthdr.pf.flags |= PF_TAG_GENERATED;
1975 	SET(st->state_flags, PFSTATE_ACK);
1976 
1977 	pd->pd_st = pf_state_ref(st);
1978 	pd->pd_m = m;
1979 
1980 	pd->pd_deadline = getnsecuptime() + PFSYNC_DEFER_NSEC;
1981 
1982 	mtx_enter(&sc->sc_deferrals_mtx);
1983 	sched = TAILQ_EMPTY(&sc->sc_deferrals);
1984 
1985 	TAILQ_INSERT_TAIL(&sc->sc_deferrals, pd, pd_entry);
1986 	sc->sc_deferred++;
1987 	mtx_leave(&sc->sc_deferrals_mtx);
1988 
1989 	if (sched)
1990 		timeout_add_nsec(&sc->sc_deferrals_tmo, PFSYNC_DEFER_NSEC);
1991 
1992 	schednetisr(NETISR_PFSYNC);
1993 
1994 	return (1);
1995 }
1996 
1997 void
1998 pfsync_undefer_notify(struct pfsync_deferral *pd)
1999 {
2000 	struct pf_pdesc pdesc;
2001 	struct pf_state *st = pd->pd_st;
2002 
2003 	/*
2004 	 * pf_remove_state removes the state keys and sets st->timeout
2005 	 * to PFTM_UNLINKED. This is done under the NET_LOCK, which should
2006 	 * be held here, so we can use PFTM_UNLINKED as a test for
2007 	 * whether the state keys are set for the address family
2008 	 * lookup.
2009 	 */
2010 
2011 	if (st->timeout == PFTM_UNLINKED)
2012 		return;
2013 
2014 	if (st->rt == PF_ROUTETO) {
2015 		if (pf_setup_pdesc(&pdesc, st->key[PF_SK_WIRE]->af,
2016 		    st->direction, st->kif, pd->pd_m, NULL) != PF_PASS)
2017 			return;
2018 		switch (st->key[PF_SK_WIRE]->af) {
2019 		case AF_INET:
2020 			pf_route(&pdesc, st);
2021 			break;
2022 #ifdef INET6
2023 		case AF_INET6:
2024 			pf_route6(&pdesc, st);
2025 			break;
2026 #endif /* INET6 */
2027 		default:
2028 			unhandled_af(st->key[PF_SK_WIRE]->af);
2029 		}
2030 		pd->pd_m = pdesc.m;
2031 	} else {
2032 		switch (st->key[PF_SK_WIRE]->af) {
2033 		case AF_INET:
2034 			ip_output(pd->pd_m, NULL, NULL, 0, NULL, NULL, 0);
2035 			break;
2036 #ifdef INET6
2037 		case AF_INET6:
2038 			ip6_output(pd->pd_m, NULL, NULL, 0, NULL, NULL);
2039 			break;
2040 #endif /* INET6 */
2041 		default:
2042 			unhandled_af(st->key[PF_SK_WIRE]->af);
2043 		}
2044 
2045 		pd->pd_m = NULL;
2046 	}
2047 }
2048 
2049 void
2050 pfsync_free_deferral(struct pfsync_deferral *pd)
2051 {
2052 	struct pfsync_softc *sc = pfsyncif;
2053 
2054 	pf_state_unref(pd->pd_st);
2055 	m_freem(pd->pd_m);
2056 	pool_put(&sc->sc_pool, pd);
2057 }
2058 
2059 void
2060 pfsync_undefer(struct pfsync_deferral *pd, int drop)
2061 {
2062 	struct pfsync_softc *sc = pfsyncif;
2063 
2064 	NET_ASSERT_LOCKED();
2065 
2066 	if (sc == NULL)
2067 		return;
2068 
2069 	CLR(pd->pd_st->state_flags, PFSTATE_ACK);
2070 	if (!drop)
2071 		pfsync_undefer_notify(pd);
2072 
2073 	pfsync_free_deferral(pd);
2074 }
2075 
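/*
 * Deferral timeout: move every deferral whose deadline has passed onto
 * a local list under the mutex, re-arm the timeout for the next pending
 * deadline, then send (undefer) the expired entries under the net lock.
 */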
2076 void
2077 pfsync_deferrals_tmo(void *arg)
2078 {
2079 	struct pfsync_softc *sc = arg;
2080 	struct pfsync_deferral *pd;
2081 	uint64_t now, nsec = 0;
2082 	struct pfsync_deferrals pds = TAILQ_HEAD_INITIALIZER(pds);
2083 
2084 	now = getnsecuptime();
2085 
2086 	mtx_enter(&sc->sc_deferrals_mtx);
2087 	for (;;) {
2088 		pd = TAILQ_FIRST(&sc->sc_deferrals);
2089 		if (pd == NULL)
2090 			break;
2091 
2092 		if (now < pd->pd_deadline) {
2093 			nsec = pd->pd_deadline - now;
2094 			break;
2095 		}
2096 
2097 		TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
2098 		sc->sc_deferred--;
2099 		TAILQ_INSERT_TAIL(&pds, pd, pd_entry);
2100 	}
2101 	mtx_leave(&sc->sc_deferrals_mtx);
2102 
2103 	if (nsec > 0) {
2104 		/* we were looking at a pd, but it wasn't old enough */
2105 		timeout_add_nsec(&sc->sc_deferrals_tmo, nsec);
2106 	}
2107 
2108 	if (TAILQ_EMPTY(&pds))
2109 		return;
2110 
2111 	NET_LOCK();
2112 	while ((pd = TAILQ_FIRST(&pds)) != NULL) {
2113 		TAILQ_REMOVE(&pds, pd, pd_entry);
2114 
2115 		pfsync_undefer(pd, 0);
2116 	}
2117 	NET_UNLOCK();
2118 }
2119 
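/*
 * A state we deferred a packet for has made progress: find and unlink
 * its deferral, then either transmit or drop the held packet.
 */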
2120 void
2121 pfsync_deferred(struct pf_state *st, int drop)
2122 {
2123 	struct pfsync_softc *sc = pfsyncif;
2124 	struct pfsync_deferral *pd;
2125 
2126 	NET_ASSERT_LOCKED();
2127 
2128 	mtx_enter(&sc->sc_deferrals_mtx);
2129 	TAILQ_FOREACH(pd, &sc->sc_deferrals, pd_entry) {
2130 		if (pd->pd_st == st) {
2131 			TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
2132 			sc->sc_deferred--;
2133 			break;
2134 		}
2135 	}
2136 	mtx_leave(&sc->sc_deferrals_mtx);
2137 
2138 	if (pd != NULL)
2139 		pfsync_undefer(pd, drop);
2140 }
2141 
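/*
 * Note that a state has changed.  A state not yet queued is put on the
 * compressed update queue; one that is already queued just counts the
 * update, and a TCP state that reaches sc_maxupdates forces the packet
 * out via the pfsync netisr.
 */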
2142 void
2143 pfsync_update_state(struct pf_state *st)
2144 {
2145 	struct pfsync_softc *sc = pfsyncif;
2146 	int sync = 0;
2147 
2148 	NET_ASSERT_LOCKED();
2149 
2150 	if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING))
2151 		return;
2152 
2153 	if (ISSET(st->state_flags, PFSTATE_ACK))
2154 		pfsync_deferred(st, 0);
2155 	if (ISSET(st->state_flags, PFSTATE_NOSYNC)) {
2156 		if (st->sync_state != PFSYNC_S_NONE)
2157 			pfsync_q_del(st);
2158 		return;
2159 	}
2160 
2161 	if (sc->sc_len == PFSYNC_MINPKT)
2162 		timeout_add_sec(&sc->sc_tmo, 1);
2163 
2164 	switch (st->sync_state) {
2165 	case PFSYNC_S_UPD_C:
2166 	case PFSYNC_S_UPD:
2167 	case PFSYNC_S_INS:
2168 		/* we're already handling it */
2169 
2170 		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) {
2171 			st->sync_updates++;
2172 			if (st->sync_updates >= sc->sc_maxupdates)
2173 				sync = 1;
2174 		}
2175 		break;
2176 
2177 	case PFSYNC_S_IACK:
2178 		pfsync_q_del(st);
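		/* FALLTHROUGH */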
2179 	case PFSYNC_S_NONE:
2180 		pfsync_q_ins(st, PFSYNC_S_UPD_C);
2181 		st->sync_updates = 0;
2182 		break;
2183 
2184 	default:
2185 		panic("pfsync_update_state: unexpected sync state %d",
2186 		    st->sync_state);
2187 	}
2188 
2189 	if (sync || (getuptime() - st->pfsync_time) < 2)
2190 		schednetisr(NETISR_PFSYNC);
2191 }
2192 
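/*
 * Abort a pending bulk update: undo any carp demotion applied for the
 * transfer, stop the bulk timeouts and reset the bulk bookkeeping.
 */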
2193 void
2194 pfsync_cancel_full_update(struct pfsync_softc *sc)
2195 {
2196 	if (timeout_pending(&sc->sc_bulkfail_tmo) ||
2197 	    timeout_pending(&sc->sc_bulk_tmo)) {
2198 #if NCARP > 0
2199 		if (!pfsync_sync_ok)
2200 			carp_group_demote_adj(&sc->sc_if, -1,
2201 			    "pfsync bulk cancelled");
2202 		if (sc->sc_initial_bulk) {
2203 			carp_group_demote_adj(&sc->sc_if, -32,
2204 			    "pfsync init");
2205 			sc->sc_initial_bulk = 0;
2206 		}
2207 #endif
2208 		pfsync_sync_ok = 1;
2209 		DPFPRINTF(LOG_INFO, "cancelling bulk update");
2210 	}
2211 	timeout_del(&sc->sc_bulkfail_tmo);
2212 	timeout_del(&sc->sc_bulk_tmo);
2213 	sc->sc_bulk_next = NULL;
2214 	sc->sc_bulk_last = NULL;
2215 	sc->sc_ureq_sent = 0;
2216 	sc->sc_bulk_tries = 0;
2217 }
2218 
2219 void
2220 pfsync_request_full_update(struct pfsync_softc *sc)
2221 {
2222 	if (sc->sc_sync_ifidx != 0 && ISSET(sc->sc_if.if_flags, IFF_RUNNING)) {
2223 		/* Request a full state table update. */
2224 		sc->sc_ureq_sent = getuptime();
2225 #if NCARP > 0
2226 		if (!sc->sc_link_demoted && pfsync_sync_ok)
2227 			carp_group_demote_adj(&sc->sc_if, 1,
2228 			    "pfsync bulk start");
2229 #endif
2230 		pfsync_sync_ok = 0;
2231 		DPFPRINTF(LOG_INFO, "requesting bulk update");
2232 		timeout_add(&sc->sc_bulkfail_tmo, 4 * hz +
2233 		    pf_pool_limits[PF_LIMIT_STATES].limit /
2234 		    ((sc->sc_if.if_mtu - PFSYNC_MINPKT) /
2235 		    sizeof(struct pfsync_state)));
2236 		pfsync_request_update(0, 0);
2237 	}
2238 }
2239 
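/*
 * Queue an update request for the state identified by (creatorid, id).
 * If the request does not fit in the current packet, the packet is
 * flushed and the insertion retried.
 */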
2240 void
2241 pfsync_request_update(u_int32_t creatorid, u_int64_t id)
2242 {
2243 	struct pfsync_softc *sc = pfsyncif;
2244 	struct pfsync_upd_req_item *item;
2245 	size_t nlen, sc_len;
2246 	int retry;
2247 
2248 	/*
2249 	 * this code does nothing to prevent multiple update requests for
2250 	 * the same state from being generated.
2251 	 */
2252 
2253 	item = pool_get(&sc->sc_pool, PR_NOWAIT);
2254 	if (item == NULL) {
2255 		/* XXX stats */
2256 		return;
2257 	}
2258 
2259 	item->ur_msg.id = id;
2260 	item->ur_msg.creatorid = creatorid;
2261 
2262 	for (;;) {
2263 		mtx_enter(&sc->sc_upd_req_mtx);
2264 
2265 		nlen = sizeof(struct pfsync_upd_req);
2266 		if (TAILQ_EMPTY(&sc->sc_upd_req_list))
2267 			nlen += sizeof(struct pfsync_subheader);
2268 
2269 		sc_len = atomic_add_long_nv(&sc->sc_len, nlen);
2270 		retry = (sc_len > sc->sc_if.if_mtu);
2271 		if (retry)
2272 			atomic_sub_long(&sc->sc_len, nlen);
2273 		else
2274 			TAILQ_INSERT_TAIL(&sc->sc_upd_req_list, item, ur_entry);
2275 
2276 		mtx_leave(&sc->sc_upd_req_mtx);
2277 
2278 		if (!retry)
2279 			break;
2280 
2281 		pfsync_sendout();
2282 	}
2283 
2284 	schednetisr(NETISR_PFSYNC);
2285 }
2286 
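/*
 * Queue a full (uncompressed) update for a state, as used when
 * answering update requests and during bulk transfers.
 */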
2287 void
2288 pfsync_update_state_req(struct pf_state *st)
2289 {
2290 	struct pfsync_softc *sc = pfsyncif;
2291 
2292 	if (sc == NULL)
2293 		panic("pfsync_update_state_req: nonexistent instance");
2294 
2295 	if (ISSET(st->state_flags, PFSTATE_NOSYNC)) {
2296 		if (st->sync_state != PFSYNC_S_NONE)
2297 			pfsync_q_del(st);
2298 		return;
2299 	}
2300 
2301 	switch (st->sync_state) {
2302 	case PFSYNC_S_UPD_C:
2303 	case PFSYNC_S_IACK:
2304 		pfsync_q_del(st);
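		/* FALLTHROUGH */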
2305 	case PFSYNC_S_NONE:
2306 		pfsync_q_ins(st, PFSYNC_S_UPD);
2307 		schednetisr(NETISR_PFSYNC);
2308 		return;
2309 
2310 	case PFSYNC_S_INS:
2311 	case PFSYNC_S_UPD:
2312 	case PFSYNC_S_DEL:
2313 		/* we're already handling it */
2314 		return;
2315 
2316 	default:
2317 		panic("pfsync_update_state_req: unexpected sync state %d",
2318 		    st->sync_state);
2319 	}
2320 }
2321 
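/*
 * Announce that a state is being removed.  Whatever was queued for the
 * state so far is cancelled and, unless the peers never heard of it,
 * a delete message is queued instead.
 */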
2322 void
2323 pfsync_delete_state(struct pf_state *st)
2324 {
2325 	struct pfsync_softc *sc = pfsyncif;
2326 
2327 	NET_ASSERT_LOCKED();
2328 
2329 	if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING))
2330 		return;
2331 
2332 	if (ISSET(st->state_flags, PFSTATE_ACK))
2333 		pfsync_deferred(st, 1);
2334 	if (ISSET(st->state_flags, PFSTATE_NOSYNC)) {
2335 		if (st->sync_state != PFSYNC_S_NONE)
2336 			pfsync_q_del(st);
2337 		return;
2338 	}
2339 
2340 	if (sc->sc_len == PFSYNC_MINPKT)
2341 		timeout_add_sec(&sc->sc_tmo, 1);
2342 
2343 	switch (st->sync_state) {
2344 	case PFSYNC_S_INS:
2345 		/* we never got to tell the world so just forget about it */
2346 		pfsync_q_del(st);
2347 		return;
2348 
2349 	case PFSYNC_S_UPD_C:
2350 	case PFSYNC_S_UPD:
2351 	case PFSYNC_S_IACK:
2352 		pfsync_q_del(st);
2353 		/*
2354 		 * FALLTHROUGH to putting it on the del list
2355 		 * Note on reference count bookkeeping:
2356 		 *	pfsync_q_del() drops the reference taken for
2357 		 *	queue ownership, but the st entry survives
2358 		 *	because our caller still holds a reference.
2359 		 */
2360 
2361 	case PFSYNC_S_NONE:
2362 		/*
2363 		 * We either fall through here, or there is no reference to
2364 		 * st owned by pfsync queues at this point.
2365 		 *
2366 		 * Calling pfsync_q_ins() puts st on the delete queue and
2367 		 * grabs a reference for it.
2368 		 */
2369 		pfsync_q_ins(st, PFSYNC_S_DEL);
2370 		return;
2371 
2372 	default:
2373 		panic("pfsync_delete_state: unexpected sync state %d",
2374 		    st->sync_state);
2375 	}
2376 }
2377 
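/*
 * Send a clear-states message for the given creator id and interface
 * name so peers flush the matching states.
 */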
2378 void
2379 pfsync_clear_states(u_int32_t creatorid, const char *ifname)
2380 {
2381 	struct pfsync_softc *sc = pfsyncif;
2382 	struct {
2383 		struct pfsync_subheader subh;
2384 		struct pfsync_clr clr;
2385 	} __packed r;
2386 
2387 	NET_ASSERT_LOCKED();
2388 
2389 	if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING))
2390 		return;
2391 
2392 	bzero(&r, sizeof(r));
2393 
2394 	r.subh.action = PFSYNC_ACT_CLR;
2395 	r.subh.len = sizeof(struct pfsync_clr) >> 2;
2396 	r.subh.count = htons(1);
2397 
2398 	strlcpy(r.clr.ifname, ifname, sizeof(r.clr.ifname));
2399 	r.clr.creatorid = creatorid;
2400 
2401 	pfsync_send_plus(&r, sizeof(r));
2402 }
2403 
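/*
 * Put a state on message queue q.  The queue mutex serialises competing
 * inserts; if adding the entry would push the packet past the MTU, the
 * packet is sent first and the insertion retried.
 */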
2404 void
2405 pfsync_q_ins(struct pf_state *st, int q)
2406 {
2407 	struct pfsync_softc *sc = pfsyncif;
2408 	size_t nlen, sc_len;
2409 
2410 #if defined(PFSYNC_DEBUG)
2411 	if (sc->sc_len < PFSYNC_MINPKT)
2412 		panic("pfsync pkt len is too low %zu", sc->sc_len);
2413 #endif
2414 	do {
2415 		mtx_enter(&sc->sc_mtx[q]);
2416 
2417 		/*
2418 		 * If two threads are competing to insert the same state, then
2419 		 * there must be just a single winner.
2420 		 */
2421 		if (st->sync_state != PFSYNC_S_NONE) {
2422 			mtx_leave(&sc->sc_mtx[q]);
2423 			break;
2424 		}
2425 
2426 		nlen = pfsync_qs[q].len;
2427 
2428 		if (TAILQ_EMPTY(&sc->sc_qs[q]))
2429 			nlen += sizeof(struct pfsync_subheader);
2430 
2431 		sc_len = atomic_add_long_nv(&sc->sc_len, nlen);
2432 		if (sc_len > sc->sc_if.if_mtu) {
2433 			atomic_sub_long(&sc->sc_len, nlen);
2434 			mtx_leave(&sc->sc_mtx[q]);
2435 			pfsync_sendout();
2436 			continue;
2437 		}
2438 
2439 		pf_state_ref(st);
2440 
2441 		TAILQ_INSERT_TAIL(&sc->sc_qs[q], st, sync_list);
2442 		st->sync_state = q;
2443 		mtx_leave(&sc->sc_mtx[q]);
2444 	} while (0);
2445 }
2446 
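/*
 * Take a state off its message queue, fix up the pending packet length
 * and drop the reference the queue was holding.
 */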
2447 void
2448 pfsync_q_del(struct pf_state *st)
2449 {
2450 	struct pfsync_softc *sc = pfsyncif;
2451 	int q = st->sync_state;
2452 
2453 	KASSERT(st->sync_state != PFSYNC_S_NONE);
2454 
2455 	mtx_enter(&sc->sc_mtx[q]);
2456 	atomic_sub_long(&sc->sc_len, pfsync_qs[q].len);
2457 	TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
2458 	if (TAILQ_EMPTY(&sc->sc_qs[q]))
2459 		atomic_sub_long(&sc->sc_len, sizeof(struct pfsync_subheader));
2460 	mtx_leave(&sc->sc_mtx[q]);
2461 
2462 	st->sync_state = PFSYNC_S_NONE;
2463 	pf_state_unref(st);
2464 }
2465 
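/*
 * Queue a TDB replay counter update.  A TDB not yet queued is appended
 * to the tdb queue (flushing the packet first if it would overflow the
 * MTU); one already queued just counts updates until sc_maxupdates is
 * reached.
 */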
2466 void
2467 pfsync_update_tdb(struct tdb *t, int output)
2468 {
2469 	struct pfsync_softc *sc = pfsyncif;
2470 	size_t nlen, sc_len;
2471 
2472 	if (sc == NULL)
2473 		return;
2474 
2475 	if (!ISSET(t->tdb_flags, TDBF_PFSYNC)) {
2476 		do {
2477 			mtx_enter(&sc->sc_tdb_mtx);
2478 			nlen = sizeof(struct pfsync_tdb);
2479 
2480 			if (TAILQ_EMPTY(&sc->sc_tdb_q))
2481 				nlen += sizeof(struct pfsync_subheader);
2482 
2483 			sc_len = atomic_add_long_nv(&sc->sc_len, nlen);
2484 			if (sc_len > sc->sc_if.if_mtu) {
2485 				atomic_sub_long(&sc->sc_len, nlen);
2486 				mtx_leave(&sc->sc_tdb_mtx);
2487 				pfsync_sendout();
2488 				continue;
2489 			}
2490 
2491 			TAILQ_INSERT_TAIL(&sc->sc_tdb_q, t, tdb_sync_entry);
2492 			mtx_leave(&sc->sc_tdb_mtx);
2493 
2494 			SET(t->tdb_flags, TDBF_PFSYNC);
2495 			t->tdb_updates = 0;
2496 		} while (0);
2497 	} else {
2498 		if (++t->tdb_updates >= sc->sc_maxupdates)
2499 			schednetisr(NETISR_PFSYNC);
2500 	}
2501 
2502 	if (output)
2503 		SET(t->tdb_flags, TDBF_PFSYNC_RPL);
2504 	else
2505 		CLR(t->tdb_flags, TDBF_PFSYNC_RPL);
2506 }
2507 
2508 void
2509 pfsync_delete_tdb(struct tdb *t)
2510 {
2511 	struct pfsync_softc *sc = pfsyncif;
2512 	size_t nlen;
2513 
2514 	if (sc == NULL || !ISSET(t->tdb_flags, TDBF_PFSYNC))
2515 		return;
2516 
2517 	mtx_enter(&sc->sc_tdb_mtx);
2518 
2519 	TAILQ_REMOVE(&sc->sc_tdb_q, t, tdb_sync_entry);
2520 	CLR(t->tdb_flags, TDBF_PFSYNC);
2521 
2522 	nlen = sizeof(struct pfsync_tdb);
2523 	if (TAILQ_EMPTY(&sc->sc_tdb_q))
2524 		nlen += sizeof(struct pfsync_subheader);
2525 	atomic_sub_long(&sc->sc_len, nlen);
2526 
2527 	mtx_leave(&sc->sc_tdb_mtx);
2528 }
2529 
2530 void
2531 pfsync_out_tdb(struct tdb *t, void *buf)
2532 {
2533 	struct pfsync_tdb *ut = buf;
2534 
2535 	bzero(ut, sizeof(*ut));
2536 	ut->spi = t->tdb_spi;
2537 	bcopy(&t->tdb_dst, &ut->dst, sizeof(ut->dst));
2538 	/*
2539 	 * When a failover happens, the master's rpl is probably above
2540 	 * what we see here (we may be up to a second late), so
2541 	 * increase it a bit for outbound tdbs to manage most such
2542 	 * situations.
2543 	 *
2544 	 * For now, just add an offset that is likely to be larger
2545 	 * than the number of packets we can see in one second. The RFC
2546 	 * just says the next packet must have a higher seq value.
2547 	 *
2548 	 * XXX What is a good algorithm for this? We could use
2549 	 * a rate-determined increase, but to know it, we would have
2550 	 * to extend struct tdb.
2551 	 * XXX pt->rpl can wrap over MAXINT, but if so the real tdb
2552 	 * will soon be replaced anyway. For now, just don't handle
2553 	 * this edge case.
2554 	 */
2555 #define RPL_INCR 16384
2556 	ut->rpl = htobe64(t->tdb_rpl + (ISSET(t->tdb_flags, TDBF_PFSYNC_RPL) ?
2557 	    RPL_INCR : 0));
2558 	ut->cur_bytes = htobe64(t->tdb_cur_bytes);
2559 	ut->sproto = t->tdb_sproto;
2560 	ut->rdomain = htons(t->tdb_rdomain);
2561 }
2562 
2563 void
2564 pfsync_bulk_start(void)
2565 {
2566 	struct pfsync_softc *sc = pfsyncif;
2567 
2568 	NET_ASSERT_LOCKED();
2569 
2570 	/*
2571 	 * pf gc via pfsync_state_in_use reads sc_bulk_next and
2572 	 * sc_bulk_last while exclusively holding the pf_state_list
2573 	 * rwlock. make sure it can't race with us setting these
2574 	 * pointers. they basically act as hazards, and borrow the
2575 	 * list's state reference count.
2576 	 */
2577 	rw_enter_read(&pf_state_list.pfs_rwl);
2578 
2579 	/* get a consistent view of the list pointers */
2580 	mtx_enter(&pf_state_list.pfs_mtx);
2581 	if (sc->sc_bulk_next == NULL)
2582 		sc->sc_bulk_next = TAILQ_FIRST(&pf_state_list.pfs_list);
2583 
2584 	sc->sc_bulk_last = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
2585 	mtx_leave(&pf_state_list.pfs_mtx);
2586 
2587 	rw_exit_read(&pf_state_list.pfs_rwl);
2588 
2589 	DPFPRINTF(LOG_INFO, "received bulk update request");
2590 
2591 	if (sc->sc_bulk_last == NULL)
2592 		pfsync_bulk_status(PFSYNC_BUS_END);
2593 	else {
2594 		sc->sc_ureq_received = getuptime();
2595 
2596 		pfsync_bulk_status(PFSYNC_BUS_START);
2597 		timeout_add(&sc->sc_bulk_tmo, 0);
2598 	}
2599 }
2600 
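/*
 * Bulk update timeout: walk the state list from sc_bulk_next, queueing
 * a full update for each eligible state, and reschedule once a packet's
 * worth has been queued, until sc_bulk_last is reached.
 */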
2601 void
2602 pfsync_bulk_update(void *arg)
2603 {
2604 	struct pfsync_softc *sc;
2605 	struct pf_state *st;
2606 	int i = 0;
2607 
2608 	NET_LOCK();
2609 	sc = pfsyncif;
2610 	if (sc == NULL)
2611 		goto out;
2612 
2613 	rw_enter_read(&pf_state_list.pfs_rwl);
2614 	st = sc->sc_bulk_next;
2615 	sc->sc_bulk_next = NULL;
2616 
2617 	for (;;) {
2618 		if (st->sync_state == PFSYNC_S_NONE &&
2619 		    st->timeout < PFTM_MAX &&
2620 		    st->pfsync_time <= sc->sc_ureq_received) {
2621 			pfsync_update_state_req(st);
2622 			i++;
2623 		}
2624 
2625 		st = TAILQ_NEXT(st, entry_list);
2626 		if ((st == NULL) || (st == sc->sc_bulk_last)) {
2627 			/* we're done */
2628 			sc->sc_bulk_last = NULL;
2629 			pfsync_bulk_status(PFSYNC_BUS_END);
2630 			break;
2631 		}
2632 
2633 		if (i > 1 && (sc->sc_if.if_mtu - sc->sc_len) <
2634 		    sizeof(struct pfsync_state)) {
2635 			/* we've filled a packet */
2636 			sc->sc_bulk_next = st;
2637 			timeout_add(&sc->sc_bulk_tmo, 1);
2638 			break;
2639 		}
2640 	}
2641 
2642 	rw_exit_read(&pf_state_list.pfs_rwl);
2643  out:
2644 	NET_UNLOCK();
2645 }
2646 
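/* Send a bulk update status (start/end) message to the peers. */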
2647 void
2648 pfsync_bulk_status(u_int8_t status)
2649 {
2650 	struct {
2651 		struct pfsync_subheader subh;
2652 		struct pfsync_bus bus;
2653 	} __packed r;
2654 
2655 	struct pfsync_softc *sc = pfsyncif;
2656 
2657 	bzero(&r, sizeof(r));
2658 
2659 	r.subh.action = PFSYNC_ACT_BUS;
2660 	r.subh.len = sizeof(struct pfsync_bus) >> 2;
2661 	r.subh.count = htons(1);
2662 
2663 	r.bus.creatorid = pf_status.hostid;
2664 	r.bus.endtime = htonl(getuptime() - sc->sc_ureq_received);
2665 	r.bus.status = status;
2666 
2667 	pfsync_send_plus(&r, sizeof(r));
2668 }
2669 
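/*
 * Bulk update failure timeout: retry the request up to
 * PFSYNC_MAX_BULKTRIES times, then give up and behave as if the
 * transfer completed.
 */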
2670 void
2671 pfsync_bulk_fail(void *arg)
2672 {
2673 	struct pfsync_softc *sc;
2674 
2675 	NET_LOCK();
2676 	sc = pfsyncif;
2677 	if (sc == NULL)
2678 		goto out;
2679 	if (sc->sc_bulk_tries++ < PFSYNC_MAX_BULKTRIES) {
2680 		/* Try again */
2681 		timeout_add_sec(&sc->sc_bulkfail_tmo, 5);
2682 		pfsync_request_update(0, 0);
2683 	} else {
2684 		/* Pretend like the transfer was ok */
2685 		sc->sc_ureq_sent = 0;
2686 		sc->sc_bulk_tries = 0;
2687 #if NCARP > 0
2688 		if (!pfsync_sync_ok)
2689 			carp_group_demote_adj(&sc->sc_if, -1,
2690 			    sc->sc_link_demoted ?
2691 			    "pfsync link state up" :
2692 			    "pfsync bulk fail");
2693 		if (sc->sc_initial_bulk) {
2694 			carp_group_demote_adj(&sc->sc_if, -32,
2695 			    "pfsync init");
2696 			sc->sc_initial_bulk = 0;
2697 		}
2698 #endif
2699 		pfsync_sync_ok = 1;
2700 		sc->sc_link_demoted = 0;
2701 		DPFPRINTF(LOG_ERR, "failed to receive bulk update");
2702 	}
2703  out:
2704 	NET_UNLOCK();
2705 }
2706 
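/*
 * Attach an extra chunk (clear or bulk status message) to the pending
 * packet and send it out immediately.
 */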
2707 void
2708 pfsync_send_plus(void *plus, size_t pluslen)
2709 {
2710 	struct pfsync_softc *sc = pfsyncif;
2711 
2712 	if (sc->sc_len + pluslen > sc->sc_if.if_mtu)
2713 		pfsync_sendout();
2714 
2715 	sc->sc_plus = plus;
2716 	sc->sc_len += (sc->sc_pluslen = pluslen);
2717 
2718 	pfsync_sendout();
2719 }
2720 
2721 int
2722 pfsync_up(void)
2723 {
2724 	struct pfsync_softc *sc = pfsyncif;
2725 
2726 	if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING))
2727 		return (0);
2728 
2729 	return (1);
2730 }
2731 
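/*
 * Report whether pfsync still references the state, either on a message
 * queue or as a bulk walk marker, so pf garbage collection leaves it
 * alone.
 */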
2732 int
2733 pfsync_state_in_use(struct pf_state *st)
2734 {
2735 	struct pfsync_softc *sc = pfsyncif;
2736 
2737 	if (sc == NULL)
2738 		return (0);
2739 
2740 	rw_assert_wrlock(&pf_state_list.pfs_rwl);
2741 
2742 	if (st->sync_state != PFSYNC_S_NONE ||
2743 	    st == sc->sc_bulk_next ||
2744 	    st == sc->sc_bulk_last)
2745 		return (1);
2746 
2747 	return (0);
2748 }
2749 
2750 void
2751 pfsync_timeout(void *arg)
2752 {
2753 	NET_LOCK();
2754 	pfsync_sendout();
2755 	NET_UNLOCK();
2756 }
2757 
2758 /* this is a softnet/netisr handler */
2759 void
2760 pfsyncintr(void)
2761 {
2762 	pfsync_sendout();
2763 }
2764 
2765 int
2766 pfsync_sysctl_pfsyncstat(void *oldp, size_t *oldlenp, void *newp)
2767 {
2768 	struct pfsyncstats pfsyncstat;
2769 
2770 	CTASSERT(sizeof(pfsyncstat) == (pfsyncs_ncounters * sizeof(uint64_t)));
2771 	memset(&pfsyncstat, 0, sizeof pfsyncstat);
2772 	counters_read(pfsynccounters, (uint64_t *)&pfsyncstat,
2773 	    pfsyncs_ncounters);
2774 	return (sysctl_rdstruct(oldp, oldlenp, newp,
2775 	    &pfsyncstat, sizeof(pfsyncstat)));
2776 }
2777 
2778 int
2779 pfsync_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
2780     size_t newlen)
2781 {
2782 	/* All sysctl names at this level are terminal. */
2783 	if (namelen != 1)
2784 		return (ENOTDIR);
2785 
2786 	switch (name[0]) {
2787 	case PFSYNCCTL_STATS:
2788 		return (pfsync_sysctl_pfsyncstat(oldp, oldlenp, newp));
2789 	default:
2790 		return (ENOPROTOOPT);
2791 	}
2792 }
2793