xref: /openbsd-src/sys/net/if_pfsync.c (revision c1a45aed656e7d5627c30c92421893a76f370ccb)
1 /*	$OpenBSD: if_pfsync.c,v 1.305 2022/04/21 15:22:49 sashan Exp $	*/
2 
3 /*
4  * Copyright (c) 2002 Michael Shalayeff
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
20  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22  * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
24  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
25  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26  * THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*
30  * Copyright (c) 2009 David Gwynne <dlg@openbsd.org>
31  *
32  * Permission to use, copy, modify, and distribute this software for any
33  * purpose with or without fee is hereby granted, provided that the above
34  * copyright notice and this permission notice appear in all copies.
35  *
36  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
37  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
38  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
39  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
40  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
41  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
42  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
43  */
44 
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/time.h>
48 #include <sys/malloc.h>
49 #include <sys/mbuf.h>
50 #include <sys/socket.h>
51 #include <sys/ioctl.h>
52 #include <sys/timeout.h>
53 #include <sys/kernel.h>
54 #include <sys/sysctl.h>
55 #include <sys/pool.h>
56 #include <sys/syslog.h>
57 
58 #include <net/if.h>
59 #include <net/if_types.h>
60 #include <net/bpf.h>
61 #include <net/netisr.h>
62 
63 #include <netinet/in.h>
64 #include <netinet/if_ether.h>
65 #include <netinet/ip.h>
66 #include <netinet/in_var.h>
67 #include <netinet/ip_var.h>
68 #include <netinet/ip_ipsp.h>
69 #include <netinet/ip_icmp.h>
70 #include <netinet/icmp6.h>
71 #include <netinet/tcp.h>
72 #include <netinet/tcp_seq.h>
73 #include <netinet/tcp_fsm.h>
74 #include <netinet/udp.h>
75 
76 #ifdef INET6
77 #include <netinet6/in6_var.h>
78 #include <netinet/ip6.h>
79 #include <netinet6/ip6_var.h>
80 #include <netinet6/nd6.h>
81 #endif /* INET6 */
82 
83 #include "carp.h"
84 #if NCARP > 0
85 #include <netinet/ip_carp.h>
86 #endif
87 
88 #define PF_DEBUGNAME	"pfsync: "
89 #include <net/pfvar.h>
90 #include <net/pfvar_priv.h>
91 #include <net/if_pfsync.h>
92 
93 #include "bpfilter.h"
94 #include "pfsync.h"
95 
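/* maximum time a deferred packet is held before being released (20ms) */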
96 #define PFSYNC_DEFER_NSEC 20000000ULL
97 
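/* smallest possible pfsync packet: an IP header plus an empty pfsync header */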
98 #define PFSYNC_MINPKT ( \
99 	sizeof(struct ip) + \
100 	sizeof(struct pfsync_header))
101 
102 int	pfsync_upd_tcp(struct pf_state *, struct pfsync_state_peer *,
103 	    struct pfsync_state_peer *);
104 
105 int	pfsync_in_clr(caddr_t, int, int, int);
106 int	pfsync_in_iack(caddr_t, int, int, int);
107 int	pfsync_in_upd_c(caddr_t, int, int, int);
108 int	pfsync_in_ureq(caddr_t, int, int, int);
109 int	pfsync_in_del(caddr_t, int, int, int);
110 int	pfsync_in_del_c(caddr_t, int, int, int);
111 int	pfsync_in_bus(caddr_t, int, int, int);
112 int	pfsync_in_tdb(caddr_t, int, int, int);
113 int	pfsync_in_ins(caddr_t, int, int, int);
114 int	pfsync_in_upd(caddr_t, int, int, int);
115 int	pfsync_in_eof(caddr_t, int, int, int);
116 
117 int	pfsync_in_error(caddr_t, int, int, int);
118 
119 void	pfsync_update_state_locked(struct pf_state *);
120 
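/*
 * Input handlers, indexed by PFSYNC_ACT_* action code.  Each entry
 * gives the handler for that action and the minimum length of a
 * single message of that type.
 */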
121 struct {
122 	int	(*in)(caddr_t, int, int, int);
123 	size_t	len;
124 } pfsync_acts[] = {
125 	/* PFSYNC_ACT_CLR */
126 	{ pfsync_in_clr,	sizeof(struct pfsync_clr) },
127 	/* PFSYNC_ACT_OINS */
128 	{ pfsync_in_error,	0 },
129 	/* PFSYNC_ACT_INS_ACK */
130 	{ pfsync_in_iack,	sizeof(struct pfsync_ins_ack) },
131 	/* PFSYNC_ACT_OUPD */
132 	{ pfsync_in_error,	0 },
133 	/* PFSYNC_ACT_UPD_C */
134 	{ pfsync_in_upd_c,	sizeof(struct pfsync_upd_c) },
135 	/* PFSYNC_ACT_UPD_REQ */
136 	{ pfsync_in_ureq,	sizeof(struct pfsync_upd_req) },
137 	/* PFSYNC_ACT_DEL */
138 	{ pfsync_in_del,	sizeof(struct pfsync_state) },
139 	/* PFSYNC_ACT_DEL_C */
140 	{ pfsync_in_del_c,	sizeof(struct pfsync_del_c) },
141 	/* PFSYNC_ACT_INS_F */
142 	{ pfsync_in_error,	0 },
143 	/* PFSYNC_ACT_DEL_F */
144 	{ pfsync_in_error,	0 },
145 	/* PFSYNC_ACT_BUS */
146 	{ pfsync_in_bus,	sizeof(struct pfsync_bus) },
147 	/* PFSYNC_ACT_OTDB */
148 	{ pfsync_in_error,	0 },
149 	/* PFSYNC_ACT_EOF */
150 	{ pfsync_in_error,	0 },
151 	/* PFSYNC_ACT_INS */
152 	{ pfsync_in_ins,	sizeof(struct pfsync_state) },
153 	/* PFSYNC_ACT_UPD */
154 	{ pfsync_in_upd,	sizeof(struct pfsync_state) },
155 	/* PFSYNC_ACT_TDB */
156 	{ pfsync_in_tdb,	sizeof(struct pfsync_tdb) },
157 };
158 
159 struct pfsync_q {
160 	void		(*write)(struct pf_state *, void *);
161 	size_t		len;
162 	u_int8_t	action;
163 };
164 
165 /* we have one of these for every PFSYNC_S_ */
166 void	pfsync_out_state(struct pf_state *, void *);
167 void	pfsync_out_iack(struct pf_state *, void *);
168 void	pfsync_out_upd_c(struct pf_state *, void *);
169 void	pfsync_out_del(struct pf_state *, void *);
170 
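/*
 * Output descriptors for the sc_qs queues; the order of the entries
 * must match the PFSYNC_S_* queue indices.
 */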
171 struct pfsync_q pfsync_qs[] = {
172 	{ pfsync_out_iack,  sizeof(struct pfsync_ins_ack), PFSYNC_ACT_INS_ACK },
173 	{ pfsync_out_upd_c, sizeof(struct pfsync_upd_c),   PFSYNC_ACT_UPD_C },
174 	{ pfsync_out_del,   sizeof(struct pfsync_del_c),   PFSYNC_ACT_DEL_C },
175 	{ pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_INS },
176 	{ pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_UPD }
177 };
178 
179 void	pfsync_q_ins(struct pf_state *, int);
180 void	pfsync_q_del(struct pf_state *);
181 
182 struct pfsync_upd_req_item {
183 	TAILQ_ENTRY(pfsync_upd_req_item)	ur_entry;
184 	TAILQ_ENTRY(pfsync_upd_req_item)	ur_snap;
185 	struct pfsync_upd_req			ur_msg;
186 };
187 TAILQ_HEAD(pfsync_upd_reqs, pfsync_upd_req_item);
188 
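/*
 * A deferral holds the packet that created a state until the peer
 * acknowledges the state insert or the deadline expires, whichever
 * happens first.
 */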
189 struct pfsync_deferral {
190 	TAILQ_ENTRY(pfsync_deferral)		 pd_entry;
191 	struct pf_state				*pd_st;
192 	struct mbuf				*pd_m;
193 	uint64_t				 pd_deadline;
194 };
195 TAILQ_HEAD(pfsync_deferrals, pfsync_deferral);
196 
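/* sc_pool items must be large enough for either an update request or a deferral */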
197 #define PFSYNC_PLSIZE	MAX(sizeof(struct pfsync_upd_req_item), \
198 			    sizeof(struct pfsync_deferral))
199 
200 void	pfsync_out_tdb(struct tdb *, void *);
201 
202 struct pfsync_softc {
203 	struct ifnet		 sc_if;
204 	unsigned int		 sc_sync_ifidx;
205 
206 	struct pool		 sc_pool;
207 
208 	struct ip_moptions	 sc_imo;
209 
210 	struct in_addr		 sc_sync_peer;
211 	u_int8_t		 sc_maxupdates;
212 
213 	struct ip		 sc_template;
214 
215 	struct pf_state_queue	 sc_qs[PFSYNC_S_COUNT];
216 	struct mutex		 sc_st_mtx;
217 	size_t			 sc_len;
218 
219 	struct pfsync_upd_reqs	 sc_upd_req_list;
220 	struct mutex		 sc_upd_req_mtx;
221 
222 	int			 sc_initial_bulk;
223 	int			 sc_link_demoted;
224 
225 	int			 sc_defer;
226 	struct pfsync_deferrals	 sc_deferrals;
227 	u_int			 sc_deferred;
228 	struct mutex		 sc_deferrals_mtx;
229 	struct timeout		 sc_deferrals_tmo;
230 
231 	void			*sc_plus;
232 	size_t			 sc_pluslen;
233 
234 	u_int32_t		 sc_ureq_sent;
235 	int			 sc_bulk_tries;
236 	struct timeout		 sc_bulkfail_tmo;
237 
238 	u_int32_t		 sc_ureq_received;
239 	struct pf_state		*sc_bulk_next;
240 	struct pf_state		*sc_bulk_last;
241 	struct timeout		 sc_bulk_tmo;
242 
243 	TAILQ_HEAD(, tdb)	 sc_tdb_q;
244 	struct mutex		 sc_tdb_mtx;
245 
246 	struct task		 sc_ltask;
247 	struct task		 sc_dtask;
248 
249 	struct timeout		 sc_tmo;
250 };
251 
252 struct pfsync_snapshot {
253 	struct pfsync_softc	*sn_sc;
254 	struct pf_state_queue	 sn_qs[PFSYNC_S_COUNT];
255 	struct pfsync_upd_reqs	 sn_upd_req_list;
256 	TAILQ_HEAD(, tdb)	 sn_tdb_q;
257 	size_t			 sn_len;
258 	void			*sn_plus;
259 	size_t			 sn_pluslen;
260 };
261 
262 struct pfsync_softc	*pfsyncif = NULL;
263 struct cpumem		*pfsynccounters;
264 
265 void	pfsyncattach(int);
266 int	pfsync_clone_create(struct if_clone *, int);
267 int	pfsync_clone_destroy(struct ifnet *);
268 int	pfsync_alloc_scrub_memory(struct pfsync_state_peer *,
269 	    struct pf_state_peer *);
270 void	pfsync_update_net_tdb(struct pfsync_tdb *);
271 int	pfsyncoutput(struct ifnet *, struct mbuf *, struct sockaddr *,
272 	    struct rtentry *);
273 int	pfsyncioctl(struct ifnet *, u_long, caddr_t);
274 void	pfsyncstart(struct ifqueue *);
275 void	pfsync_syncdev_state(void *);
276 void	pfsync_ifdetach(void *);
277 
278 void	pfsync_deferred(struct pf_state *, int);
279 void	pfsync_undefer(struct pfsync_deferral *, int);
280 void	pfsync_deferrals_tmo(void *);
281 
282 void	pfsync_cancel_full_update(struct pfsync_softc *);
283 void	pfsync_request_full_update(struct pfsync_softc *);
284 void	pfsync_request_update(u_int32_t, u_int64_t);
285 void	pfsync_update_state_req(struct pf_state *);
286 
287 void	pfsync_drop(struct pfsync_softc *);
288 void	pfsync_sendout(void);
289 void	pfsync_send_plus(void *, size_t);
290 void	pfsync_timeout(void *);
291 void	pfsync_tdb_timeout(void *);
292 
293 void	pfsync_bulk_start(void);
294 void	pfsync_bulk_status(u_int8_t);
295 void	pfsync_bulk_update(void *);
296 void	pfsync_bulk_fail(void *);
297 
298 void	pfsync_grab_snapshot(struct pfsync_snapshot *, struct pfsync_softc *);
299 void	pfsync_drop_snapshot(struct pfsync_snapshot *);
300 
301 void	pfsync_send_dispatch(void *);
302 void	pfsync_send_pkt(struct mbuf *);
303 
304 static struct mbuf_queue	pfsync_mq;
305 static struct task	pfsync_task =
306     TASK_INITIALIZER(pfsync_send_dispatch, &pfsync_mq);
307 
308 #define PFSYNC_MAX_BULKTRIES	12
309 int	pfsync_sync_ok;
310 
311 struct if_clone	pfsync_cloner =
312     IF_CLONE_INITIALIZER("pfsync", pfsync_clone_create, pfsync_clone_destroy);
313 
314 void
315 pfsyncattach(int npfsync)
316 {
317 	if_clone_attach(&pfsync_cloner);
318 	pfsynccounters = counters_alloc(pfsyncs_ncounters);
319 	mq_init(&pfsync_mq, 4096, IPL_MPFLOOR);
320 }
321 
322 int
323 pfsync_clone_create(struct if_clone *ifc, int unit)
324 {
325 	struct pfsync_softc *sc;
326 	struct ifnet *ifp;
327 	int q;
328 
329 	if (unit != 0)
330 		return (EINVAL);
331 
332 	pfsync_sync_ok = 1;
333 
334 	sc = malloc(sizeof(*pfsyncif), M_DEVBUF, M_WAITOK|M_ZERO);
335 	for (q = 0; q < PFSYNC_S_COUNT; q++)
336 		TAILQ_INIT(&sc->sc_qs[q]);
337 	mtx_init(&sc->sc_st_mtx, IPL_MPFLOOR);
338 
339 	pool_init(&sc->sc_pool, PFSYNC_PLSIZE, 0, IPL_MPFLOOR, 0, "pfsync",
340 	    NULL);
341 	TAILQ_INIT(&sc->sc_upd_req_list);
342 	mtx_init(&sc->sc_upd_req_mtx, IPL_MPFLOOR);
343 	TAILQ_INIT(&sc->sc_deferrals);
344 	mtx_init(&sc->sc_deferrals_mtx, IPL_MPFLOOR);
345 	timeout_set_proc(&sc->sc_deferrals_tmo, pfsync_deferrals_tmo, sc);
346 	task_set(&sc->sc_ltask, pfsync_syncdev_state, sc);
347 	task_set(&sc->sc_dtask, pfsync_ifdetach, sc);
348 	sc->sc_deferred = 0;
349 
350 	TAILQ_INIT(&sc->sc_tdb_q);
351 	mtx_init(&sc->sc_tdb_mtx, IPL_MPFLOOR);
352 
353 	sc->sc_len = PFSYNC_MINPKT;
354 	sc->sc_maxupdates = 128;
355 
356 	sc->sc_imo.imo_membership = mallocarray(IP_MIN_MEMBERSHIPS,
357 	    sizeof(struct in_multi *), M_IPMOPTS, M_WAITOK|M_ZERO);
358 	sc->sc_imo.imo_max_memberships = IP_MIN_MEMBERSHIPS;
359 
360 	ifp = &sc->sc_if;
361 	snprintf(ifp->if_xname, sizeof ifp->if_xname, "pfsync%d", unit);
362 	ifp->if_softc = sc;
363 	ifp->if_ioctl = pfsyncioctl;
364 	ifp->if_output = pfsyncoutput;
365 	ifp->if_qstart = pfsyncstart;
366 	ifp->if_type = IFT_PFSYNC;
367 	ifp->if_hdrlen = sizeof(struct pfsync_header);
368 	ifp->if_mtu = ETHERMTU;
369 	ifp->if_xflags = IFXF_CLONED | IFXF_MPSAFE;
370 	timeout_set_proc(&sc->sc_tmo, pfsync_timeout, NULL);
371 	timeout_set_proc(&sc->sc_bulk_tmo, pfsync_bulk_update, NULL);
372 	timeout_set_proc(&sc->sc_bulkfail_tmo, pfsync_bulk_fail, NULL);
373 
374 	if_attach(ifp);
375 	if_alloc_sadl(ifp);
376 
377 #if NCARP > 0
378 	if_addgroup(ifp, "carp");
379 #endif
380 
381 #if NBPFILTER > 0
382 	bpfattach(&sc->sc_if.if_bpf, ifp, DLT_PFSYNC, PFSYNC_HDRLEN);
383 #endif
384 
385 	pfsyncif = sc;
386 
387 	return (0);
388 }
389 
390 int
391 pfsync_clone_destroy(struct ifnet *ifp)
392 {
393 	struct pfsync_softc *sc = ifp->if_softc;
394 	struct ifnet *ifp0;
395 	struct pfsync_deferral *pd;
396 	struct pfsync_deferrals	 deferrals;
397 
398 	NET_LOCK();
399 
400 #if NCARP > 0
401 	if (!pfsync_sync_ok)
402 		carp_group_demote_adj(&sc->sc_if, -1, "pfsync destroy");
403 	if (sc->sc_link_demoted)
404 		carp_group_demote_adj(&sc->sc_if, -1, "pfsync destroy");
405 #endif
406 	if ((ifp0 = if_get(sc->sc_sync_ifidx)) != NULL) {
407 		if_linkstatehook_del(ifp0, &sc->sc_ltask);
408 		if_detachhook_del(ifp0, &sc->sc_dtask);
409 	}
410 	if_put(ifp0);
411 
412 	/* XXXSMP breaks atomicity */
413 	NET_UNLOCK();
414 	if_detach(ifp);
415 	NET_LOCK();
416 
417 	pfsync_drop(sc);
418 
419 	if (sc->sc_deferred > 0) {
420 		TAILQ_INIT(&deferrals);
421 		mtx_enter(&sc->sc_deferrals_mtx);
422 		TAILQ_CONCAT(&deferrals, &sc->sc_deferrals, pd_entry);
423 		sc->sc_deferred = 0;
424 		mtx_leave(&sc->sc_deferrals_mtx);
425 
426 		while ((pd = TAILQ_FIRST(&deferrals)) != NULL) {
427 			TAILQ_REMOVE(&deferrals, pd, pd_entry);
428 			pfsync_undefer(pd, 0);
429 		}
430 	}
431 
432 	pfsyncif = NULL;
433 	timeout_del(&sc->sc_bulkfail_tmo);
434 	timeout_del(&sc->sc_bulk_tmo);
435 	timeout_del(&sc->sc_tmo);
436 
437 	NET_UNLOCK();
438 
439 	pool_destroy(&sc->sc_pool);
440 	free(sc->sc_imo.imo_membership, M_IPMOPTS,
441 	    sc->sc_imo.imo_max_memberships * sizeof(struct in_multi *));
442 	free(sc, M_DEVBUF, sizeof(*sc));
443 
444 	return (0);
445 }
446 
447 /*
448  * Start output on the pfsync interface.
449  */
450 void
451 pfsyncstart(struct ifqueue *ifq)
452 {
453 	ifq_purge(ifq);
454 }
455 
456 void
457 pfsync_syncdev_state(void *arg)
458 {
459 	struct pfsync_softc *sc = arg;
460 	struct ifnet *ifp;
461 
462 	if ((sc->sc_if.if_flags & IFF_UP) == 0)
463 		return;
464 	if ((ifp = if_get(sc->sc_sync_ifidx)) == NULL)
465 		return;
466 
467 	if (ifp->if_link_state == LINK_STATE_DOWN) {
468 		sc->sc_if.if_flags &= ~IFF_RUNNING;
469 		if (!sc->sc_link_demoted) {
470 #if NCARP > 0
471 			carp_group_demote_adj(&sc->sc_if, 1,
472 			    "pfsync link state down");
473 #endif
474 			sc->sc_link_demoted = 1;
475 		}
476 
477 		/* drop everything */
478 		timeout_del(&sc->sc_tmo);
479 		pfsync_drop(sc);
480 
481 		pfsync_cancel_full_update(sc);
482 	} else if (sc->sc_link_demoted) {
483 		sc->sc_if.if_flags |= IFF_RUNNING;
484 
485 		pfsync_request_full_update(sc);
486 	}
487 
488 	if_put(ifp);
489 }
490 
491 void
492 pfsync_ifdetach(void *arg)
493 {
494 	struct pfsync_softc *sc = arg;
495 	struct ifnet *ifp;
496 
497 	if ((ifp = if_get(sc->sc_sync_ifidx)) != NULL) {
498 		if_linkstatehook_del(ifp, &sc->sc_ltask);
499 		if_detachhook_del(ifp, &sc->sc_dtask);
500 	}
501 	if_put(ifp);
502 
503 	sc->sc_sync_ifidx = 0;
504 }
505 
506 int
507 pfsync_alloc_scrub_memory(struct pfsync_state_peer *s,
508     struct pf_state_peer *d)
509 {
510 	if (s->scrub.scrub_flag && d->scrub == NULL) {
511 		d->scrub = pool_get(&pf_state_scrub_pl, PR_NOWAIT | PR_ZERO);
512 		if (d->scrub == NULL)
513 			return (ENOMEM);
514 	}
515 
516 	return (0);
517 }
518 
519 void
520 pfsync_state_export(struct pfsync_state *sp, struct pf_state *st)
521 {
522 	pf_state_export(sp, st);
523 }
524 
525 int
526 pfsync_state_import(struct pfsync_state *sp, int flags)
527 {
528 	struct pf_state	*st = NULL;
529 	struct pf_state_key *skw = NULL, *sks = NULL;
530 	struct pf_rule *r = NULL;
531 	struct pfi_kif	*kif;
532 	int pool_flags;
533 	int error = ENOMEM;
534 	int n = 0;
535 
536 	if (sp->creatorid == 0) {
537 		DPFPRINTF(LOG_NOTICE, "pfsync_state_import: "
538 		    "invalid creator id: %08x", ntohl(sp->creatorid));
539 		return (EINVAL);
540 	}
541 
542 	if ((kif = pfi_kif_get(sp->ifname, NULL)) == NULL) {
543 		DPFPRINTF(LOG_NOTICE, "pfsync_state_import: "
544 		    "unknown interface: %s", sp->ifname);
545 		if (flags & PFSYNC_SI_IOCTL)
546 			return (EINVAL);
547 		return (0);	/* skip this state */
548 	}
549 
550 	if (sp->af == 0)
551 		return (0);	/* skip this state */
552 
553 	/*
554 	 * If the ruleset checksums match or the state is coming from the ioctl,
555 	 * it's safe to associate the state with the rule of that number.
556 	 */
557 	if (sp->rule != htonl(-1) && sp->anchor == htonl(-1) &&
558 	    (flags & (PFSYNC_SI_IOCTL | PFSYNC_SI_CKSUM)) && ntohl(sp->rule) <
559 	    pf_main_ruleset.rules.active.rcount) {
560 		TAILQ_FOREACH(r, pf_main_ruleset.rules.active.ptr, entries)
561 			if (ntohl(sp->rule) == n++)
562 				break;
563 	} else
564 		r = &pf_default_rule;
565 
566 	if ((r->max_states && r->states_cur >= r->max_states))
567 		goto cleanup;
568 
569 	if (flags & PFSYNC_SI_IOCTL)
570 		pool_flags = PR_WAITOK | PR_LIMITFAIL | PR_ZERO;
571 	else
572 		pool_flags = PR_NOWAIT | PR_LIMITFAIL | PR_ZERO;
573 
574 	if ((st = pool_get(&pf_state_pl, pool_flags)) == NULL)
575 		goto cleanup;
576 
577 	if ((skw = pf_alloc_state_key(pool_flags)) == NULL)
578 		goto cleanup;
579 
580 	if ((sp->key[PF_SK_WIRE].af &&
581 	    (sp->key[PF_SK_WIRE].af != sp->key[PF_SK_STACK].af)) ||
582 	    PF_ANEQ(&sp->key[PF_SK_WIRE].addr[0],
583 	    &sp->key[PF_SK_STACK].addr[0], sp->af) ||
584 	    PF_ANEQ(&sp->key[PF_SK_WIRE].addr[1],
585 	    &sp->key[PF_SK_STACK].addr[1], sp->af) ||
586 	    sp->key[PF_SK_WIRE].port[0] != sp->key[PF_SK_STACK].port[0] ||
587 	    sp->key[PF_SK_WIRE].port[1] != sp->key[PF_SK_STACK].port[1] ||
588 	    sp->key[PF_SK_WIRE].rdomain != sp->key[PF_SK_STACK].rdomain) {
589 		if ((sks = pf_alloc_state_key(pool_flags)) == NULL)
590 			goto cleanup;
591 	} else
592 		sks = skw;
593 
594 	/* allocate memory for scrub info */
595 	if (pfsync_alloc_scrub_memory(&sp->src, &st->src) ||
596 	    pfsync_alloc_scrub_memory(&sp->dst, &st->dst))
597 		goto cleanup;
598 
599 	/* copy to state key(s) */
600 	skw->addr[0] = sp->key[PF_SK_WIRE].addr[0];
601 	skw->addr[1] = sp->key[PF_SK_WIRE].addr[1];
602 	skw->port[0] = sp->key[PF_SK_WIRE].port[0];
603 	skw->port[1] = sp->key[PF_SK_WIRE].port[1];
604 	skw->rdomain = ntohs(sp->key[PF_SK_WIRE].rdomain);
605 	PF_REF_INIT(skw->refcnt);
606 	skw->proto = sp->proto;
607 	if (!(skw->af = sp->key[PF_SK_WIRE].af))
608 		skw->af = sp->af;
609 	if (sks != skw) {
610 		sks->addr[0] = sp->key[PF_SK_STACK].addr[0];
611 		sks->addr[1] = sp->key[PF_SK_STACK].addr[1];
612 		sks->port[0] = sp->key[PF_SK_STACK].port[0];
613 		sks->port[1] = sp->key[PF_SK_STACK].port[1];
614 		sks->rdomain = ntohs(sp->key[PF_SK_STACK].rdomain);
615 		PF_REF_INIT(sks->refcnt);
616 		if (!(sks->af = sp->key[PF_SK_STACK].af))
617 			sks->af = sp->af;
618 		if (sks->af != skw->af) {
619 			switch (sp->proto) {
620 			case IPPROTO_ICMP:
621 				sks->proto = IPPROTO_ICMPV6;
622 				break;
623 			case IPPROTO_ICMPV6:
624 				sks->proto = IPPROTO_ICMP;
625 				break;
626 			default:
627 				sks->proto = sp->proto;
628 			}
629 		} else
630 			sks->proto = sp->proto;
631 
632 		if (((sks->af != AF_INET) && (sks->af != AF_INET6)) ||
633 		    ((skw->af != AF_INET) && (skw->af != AF_INET6))) {
634 			error = EINVAL;
635 			goto cleanup;
636 		}
637 
638 	} else if ((sks->af != AF_INET) && (sks->af != AF_INET6)) {
639 		error = EINVAL;
640 		goto cleanup;
641 	}
642 	st->rtableid[PF_SK_WIRE] = ntohl(sp->rtableid[PF_SK_WIRE]);
643 	st->rtableid[PF_SK_STACK] = ntohl(sp->rtableid[PF_SK_STACK]);
644 
645 	/* copy to state */
646 	st->rt_addr = sp->rt_addr;
647 	st->rt = sp->rt;
648 	st->creation = getuptime() - ntohl(sp->creation);
649 	st->expire = getuptime();
650 	if (ntohl(sp->expire)) {
651 		u_int32_t timeout;
652 
653 		timeout = r->timeout[sp->timeout];
654 		if (!timeout)
655 			timeout = pf_default_rule.timeout[sp->timeout];
656 
657 		/* sp->expire may have been adaptively scaled by export. */
658 		st->expire -= timeout - ntohl(sp->expire);
659 	}
660 
661 	st->direction = sp->direction;
662 	st->log = sp->log;
663 	st->timeout = sp->timeout;
664 	st->state_flags = ntohs(sp->state_flags);
665 	st->max_mss = ntohs(sp->max_mss);
666 	st->min_ttl = sp->min_ttl;
667 	st->set_tos = sp->set_tos;
668 	st->set_prio[0] = sp->set_prio[0];
669 	st->set_prio[1] = sp->set_prio[1];
670 
671 	st->id = sp->id;
672 	st->creatorid = sp->creatorid;
673 	pf_state_peer_ntoh(&sp->src, &st->src);
674 	pf_state_peer_ntoh(&sp->dst, &st->dst);
675 
676 	st->rule.ptr = r;
677 	st->anchor.ptr = NULL;
678 
679 	st->pfsync_time = getuptime();
680 	st->sync_state = PFSYNC_S_NONE;
681 
682 	refcnt_init(&st->refcnt);
683 
684 	/* XXX when we have anchors, use STATE_INC_COUNTERS */
685 	r->states_cur++;
686 	r->states_tot++;
687 
688 	if (!ISSET(flags, PFSYNC_SI_IOCTL))
689 		SET(st->state_flags, PFSTATE_NOSYNC);
690 
691 	/*
692 	 * We just set the PFSTATE_NOSYNC bit, which prevents
693 	 * pfsync_insert_state() from inserting the state into pfsync.
694 	 */
695 	if (pf_state_insert(kif, &skw, &sks, st) != 0) {
696 		/* XXX when we have anchors, use STATE_DEC_COUNTERS */
697 		r->states_cur--;
698 		error = EEXIST;
699 		goto cleanup_state;
700 	}
701 
702 	if (!ISSET(flags, PFSYNC_SI_IOCTL)) {
703 		CLR(st->state_flags, PFSTATE_NOSYNC);
704 		if (ISSET(st->state_flags, PFSTATE_ACK)) {
705 			pfsync_q_ins(st, PFSYNC_S_IACK);
706 			schednetisr(NETISR_PFSYNC);
707 		}
708 	}
709 	CLR(st->state_flags, PFSTATE_ACK);
710 
711 	return (0);
712 
713  cleanup:
714 	if (skw == sks)
715 		sks = NULL;
716 	if (skw != NULL)
717 		pool_put(&pf_state_key_pl, skw);
718 	if (sks != NULL)
719 		pool_put(&pf_state_key_pl, sks);
720 
721  cleanup_state:	/* pf_state_insert frees the state keys */
722 	if (st) {
723 		if (st->dst.scrub)
724 			pool_put(&pf_state_scrub_pl, st->dst.scrub);
725 		if (st->src.scrub)
726 			pool_put(&pf_state_scrub_pl, st->src.scrub);
727 		pool_put(&pf_state_pl, st);
728 	}
729 	return (error);
730 }
731 
732 int
733 pfsync_input(struct mbuf **mp, int *offp, int proto, int af)
734 {
735 	struct mbuf *n, *m = *mp;
736 	struct pfsync_softc *sc = pfsyncif;
737 	struct ip *ip = mtod(m, struct ip *);
738 	struct pfsync_header *ph;
739 	struct pfsync_subheader subh;
740 	int offset, noff, len, count, mlen, flags = 0;
741 	int e;
742 
743 	NET_ASSERT_LOCKED();
744 
745 	pfsyncstat_inc(pfsyncs_ipackets);
746 
747 	/* verify that we have a sync interface configured */
748 	if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING) ||
749 	    sc->sc_sync_ifidx == 0 || !pf_status.running)
750 		goto done;
751 
752 	/* verify that the packet came in on the right interface */
753 	if (sc->sc_sync_ifidx != m->m_pkthdr.ph_ifidx) {
754 		pfsyncstat_inc(pfsyncs_badif);
755 		goto done;
756 	}
757 
758 	sc->sc_if.if_ipackets++;
759 	sc->sc_if.if_ibytes += m->m_pkthdr.len;
760 
761 	/* verify that the IP TTL is 255. */
762 	if (ip->ip_ttl != PFSYNC_DFLTTL) {
763 		pfsyncstat_inc(pfsyncs_badttl);
764 		goto done;
765 	}
766 
767 	offset = ip->ip_hl << 2;
768 	n = m_pulldown(m, offset, sizeof(*ph), &noff);
769 	if (n == NULL) {
770 		pfsyncstat_inc(pfsyncs_hdrops);
771 		return IPPROTO_DONE;
772 	}
773 	ph = (struct pfsync_header *)(n->m_data + noff);
774 
775 	/* verify the version */
776 	if (ph->version != PFSYNC_VERSION) {
777 		pfsyncstat_inc(pfsyncs_badver);
778 		goto done;
779 	}
780 	len = ntohs(ph->len) + offset;
781 	if (m->m_pkthdr.len < len) {
782 		pfsyncstat_inc(pfsyncs_badlen);
783 		goto done;
784 	}
785 
786 	if (!bcmp(&ph->pfcksum, &pf_status.pf_chksum, PF_MD5_DIGEST_LENGTH))
787 		flags = PFSYNC_SI_CKSUM;
788 
789 	offset += sizeof(*ph);
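
	/*
	 * Walk the chain of subheaders, handing each batch of messages
	 * to its handler.
	 */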
790 	while (offset <= len - sizeof(subh)) {
791 		m_copydata(m, offset, sizeof(subh), &subh);
792 		offset += sizeof(subh);
793 
794 		mlen = subh.len << 2;
795 		count = ntohs(subh.count);
796 
797 		if (subh.action >= PFSYNC_ACT_MAX ||
798 		    subh.action >= nitems(pfsync_acts) ||
799 		    mlen < pfsync_acts[subh.action].len) {
800 			/*
801 			 * Subheaders are always followed by at least one
802 			 * message, so if the peer is new enough to tell us
803 			 * how big its messages are then we know enough to
804 			 * skip them.
805 			 */
806 			if (count > 0 && mlen > 0) {
807 				offset += count * mlen;
808 				continue;
809 			}
810 			pfsyncstat_inc(pfsyncs_badact);
811 			goto done;
812 		}
813 
814 		n = m_pulldown(m, offset, mlen * count, &noff);
815 		if (n == NULL) {
816 			pfsyncstat_inc(pfsyncs_badlen);
817 			return IPPROTO_DONE;
818 		}
819 
820 		e = pfsync_acts[subh.action].in(n->m_data + noff, mlen, count,
821 		    flags);
822 		if (e != 0)
823 			goto done;
824 
825 		offset += mlen * count;
826 	}
827 
828 done:
829 	m_freem(m);
830 	return IPPROTO_DONE;
831 }
832 
833 int
834 pfsync_in_clr(caddr_t buf, int len, int count, int flags)
835 {
836 	struct pfsync_clr *clr;
837 	struct pf_state *st, *nexts;
838 	struct pfi_kif *kif;
839 	u_int32_t creatorid;
840 	int i;
841 
842 	PF_LOCK();
843 	for (i = 0; i < count; i++) {
844 		clr = (struct pfsync_clr *)(buf + len * i);
845 		kif = NULL;
846 		creatorid = clr->creatorid;
847 		if (strlen(clr->ifname) &&
848 		    (kif = pfi_kif_find(clr->ifname)) == NULL)
849 			continue;
850 
851 		PF_STATE_ENTER_WRITE();
852 		for (st = RB_MIN(pf_state_tree_id, &tree_id); st; st = nexts) {
853 			nexts = RB_NEXT(pf_state_tree_id, &tree_id, st);
854 			if (st->creatorid == creatorid &&
855 			    ((kif && st->kif == kif) || !kif)) {
856 				SET(st->state_flags, PFSTATE_NOSYNC);
857 				pf_remove_state(st);
858 			}
859 		}
860 		PF_STATE_EXIT_WRITE();
861 	}
862 	PF_UNLOCK();
863 
864 	return (0);
865 }
866 
867 int
868 pfsync_in_ins(caddr_t buf, int len, int count, int flags)
869 {
870 	struct pfsync_state *sp;
871 	sa_family_t af1, af2;
872 	int i;
873 
874 	PF_LOCK();
875 	for (i = 0; i < count; i++) {
876 		sp = (struct pfsync_state *)(buf + len * i);
877 		af1 = sp->key[0].af;
878 		af2 = sp->key[1].af;
879 
880 		/* check for invalid values */
881 		if (sp->timeout >= PFTM_MAX ||
882 		    sp->src.state > PF_TCPS_PROXY_DST ||
883 		    sp->dst.state > PF_TCPS_PROXY_DST ||
884 		    sp->direction > PF_OUT ||
885 		    (((af1 || af2) &&
886 		     ((af1 != AF_INET && af1 != AF_INET6) ||
887 		      (af2 != AF_INET && af2 != AF_INET6))) ||
888 		    (sp->af != AF_INET && sp->af != AF_INET6))) {
889 			DPFPRINTF(LOG_NOTICE,
890 			    "pfsync_input: PFSYNC5_ACT_INS: invalid value");
891 			pfsyncstat_inc(pfsyncs_badval);
892 			continue;
893 		}
894 
895 		if (pfsync_state_import(sp, flags) == ENOMEM) {
896 			/* drop out, but process the rest of the actions */
897 			break;
898 		}
899 	}
900 	PF_UNLOCK();
901 
902 	return (0);
903 }
904 
905 int
906 pfsync_in_iack(caddr_t buf, int len, int count, int flags)
907 {
908 	struct pfsync_ins_ack *ia;
909 	struct pf_state_cmp id_key;
910 	struct pf_state *st;
911 	int i;
912 
913 	for (i = 0; i < count; i++) {
914 		ia = (struct pfsync_ins_ack *)(buf + len * i);
915 
916 		id_key.id = ia->id;
917 		id_key.creatorid = ia->creatorid;
918 
919 		PF_STATE_ENTER_READ();
920 		st = pf_find_state_byid(&id_key);
921 		pf_state_ref(st);
922 		PF_STATE_EXIT_READ();
923 		if (st == NULL)
924 			continue;
925 
926 		if (ISSET(st->state_flags, PFSTATE_ACK))
927 			pfsync_deferred(st, 0);
928 
929 		pf_state_unref(st);
930 	}
931 
932 	return (0);
933 }
934 
935 int
936 pfsync_upd_tcp(struct pf_state *st, struct pfsync_state_peer *src,
937     struct pfsync_state_peer *dst)
938 {
939 	int sync = 0;
940 
941 	/*
942 	 * The state should never go backwards except
943 	 * for syn-proxy states.  Neither should the
944 	 * sequence window slide backwards.
945 	 */
946 	if ((st->src.state > src->state &&
947 	    (st->src.state < PF_TCPS_PROXY_SRC ||
948 	    src->state >= PF_TCPS_PROXY_SRC)) ||
949 
950 	    (st->src.state == src->state &&
951 	    SEQ_GT(st->src.seqlo, ntohl(src->seqlo))))
952 		sync++;
953 	else
954 		pf_state_peer_ntoh(src, &st->src);
955 
956 	if ((st->dst.state > dst->state) ||
957 
958 	    (st->dst.state >= TCPS_SYN_SENT &&
959 	    SEQ_GT(st->dst.seqlo, ntohl(dst->seqlo))))
960 		sync++;
961 	else
962 		pf_state_peer_ntoh(dst, &st->dst);
963 
964 	return (sync);
965 }
966 
967 int
968 pfsync_in_upd(caddr_t buf, int len, int count, int flags)
969 {
970 	struct pfsync_state *sp;
971 	struct pf_state_cmp id_key;
972 	struct pf_state *st;
973 	int sync, error;
974 	int i;
975 
976 	for (i = 0; i < count; i++) {
977 		sp = (struct pfsync_state *)(buf + len * i);
978 
979 		/* check for invalid values */
980 		if (sp->timeout >= PFTM_MAX ||
981 		    sp->src.state > PF_TCPS_PROXY_DST ||
982 		    sp->dst.state > PF_TCPS_PROXY_DST) {
983 			DPFPRINTF(LOG_NOTICE,
984 			    "pfsync_input: PFSYNC_ACT_UPD: invalid value");
985 			pfsyncstat_inc(pfsyncs_badval);
986 			continue;
987 		}
988 
989 		id_key.id = sp->id;
990 		id_key.creatorid = sp->creatorid;
991 
992 		PF_STATE_ENTER_READ();
993 		st = pf_find_state_byid(&id_key);
994 		pf_state_ref(st);
995 		PF_STATE_EXIT_READ();
996 		if (st == NULL) {
997 			/* insert the update */
998 			PF_LOCK();
999 			error = pfsync_state_import(sp, flags);
1000 			if (error)
1001 				pfsyncstat_inc(pfsyncs_badstate);
1002 			PF_UNLOCK();
1003 			continue;
1004 		}
1005 
1006 		if (ISSET(st->state_flags, PFSTATE_ACK))
1007 			pfsync_deferred(st, 1);
1008 
1009 		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
1010 			sync = pfsync_upd_tcp(st, &sp->src, &sp->dst);
1011 		else {
1012 			sync = 0;
1013 
1014 			/*
1015 			 * Non-TCP protocol state machines always go
1016 			 * forwards
1017 			 */
1018 			if (st->src.state > sp->src.state)
1019 				sync++;
1020 			else
1021 				pf_state_peer_ntoh(&sp->src, &st->src);
1022 
1023 			if (st->dst.state > sp->dst.state)
1024 				sync++;
1025 			else
1026 				pf_state_peer_ntoh(&sp->dst, &st->dst);
1027 		}
1028 
1029 		if (sync < 2) {
1030 			pfsync_alloc_scrub_memory(&sp->dst, &st->dst);
1031 			pf_state_peer_ntoh(&sp->dst, &st->dst);
1032 			st->expire = getuptime();
1033 			st->timeout = sp->timeout;
1034 		}
1035 		st->pfsync_time = getuptime();
1036 
1037 		if (sync) {
1038 			pfsyncstat_inc(pfsyncs_stale);
1039 
1040 			pfsync_update_state(st);
1041 			schednetisr(NETISR_PFSYNC);
1042 		}
1043 
1044 		pf_state_unref(st);
1045 	}
1046 
1047 	return (0);
1048 }
1049 
1050 int
1051 pfsync_in_upd_c(caddr_t buf, int len, int count, int flags)
1052 {
1053 	struct pfsync_upd_c *up;
1054 	struct pf_state_cmp id_key;
1055 	struct pf_state *st;
1056 
1057 	int sync;
1058 
1059 	int i;
1060 
1061 	for (i = 0; i < count; i++) {
1062 		up = (struct pfsync_upd_c *)(buf + len * i);
1063 
1064 		/* check for invalid values */
1065 		if (up->timeout >= PFTM_MAX ||
1066 		    up->src.state > PF_TCPS_PROXY_DST ||
1067 		    up->dst.state > PF_TCPS_PROXY_DST) {
1068 			DPFPRINTF(LOG_NOTICE,
1069 			    "pfsync_input: PFSYNC_ACT_UPD_C: invalid value");
1070 			pfsyncstat_inc(pfsyncs_badval);
1071 			continue;
1072 		}
1073 
1074 		id_key.id = up->id;
1075 		id_key.creatorid = up->creatorid;
1076 
1077 		PF_STATE_ENTER_READ();
1078 		st = pf_find_state_byid(&id_key);
1079 		pf_state_ref(st);
1080 		PF_STATE_EXIT_READ();
1081 		if (st == NULL) {
1082 			/* We don't have this state. Ask for it. */
1083 			pfsync_request_update(id_key.creatorid, id_key.id);
1084 			continue;
1085 		}
1086 
1087 		if (ISSET(st->state_flags, PFSTATE_ACK))
1088 			pfsync_deferred(st, 1);
1089 
1090 		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
1091 			sync = pfsync_upd_tcp(st, &up->src, &up->dst);
1092 		else {
1093 			sync = 0;
1094 			/*
1095 			 * Non-TCP protocol state machines always go
1096 			 * forwards
1097 			 */
1098 			if (st->src.state > up->src.state)
1099 				sync++;
1100 			else
1101 				pf_state_peer_ntoh(&up->src, &st->src);
1102 
1103 			if (st->dst.state > up->dst.state)
1104 				sync++;
1105 			else
1106 				pf_state_peer_ntoh(&up->dst, &st->dst);
1107 		}
1108 		if (sync < 2) {
1109 			pfsync_alloc_scrub_memory(&up->dst, &st->dst);
1110 			pf_state_peer_ntoh(&up->dst, &st->dst);
1111 			st->expire = getuptime();
1112 			st->timeout = up->timeout;
1113 		}
1114 		st->pfsync_time = getuptime();
1115 
1116 		if (sync) {
1117 			pfsyncstat_inc(pfsyncs_stale);
1118 
1119 			pfsync_update_state(st);
1120 			schednetisr(NETISR_PFSYNC);
1121 		}
1122 
1123 		pf_state_unref(st);
1124 	}
1125 
1126 	return (0);
1127 }
1128 
1129 int
1130 pfsync_in_ureq(caddr_t buf, int len, int count, int flags)
1131 {
1132 	struct pfsync_upd_req *ur;
1133 	int i;
1134 
1135 	struct pf_state_cmp id_key;
1136 	struct pf_state *st;
1137 
1138 	for (i = 0; i < count; i++) {
1139 		ur = (struct pfsync_upd_req *)(buf + len * i);
1140 
1141 		id_key.id = ur->id;
1142 		id_key.creatorid = ur->creatorid;
1143 
1144 		if (id_key.id == 0 && id_key.creatorid == 0)
1145 			pfsync_bulk_start();
1146 		else {
1147 			PF_STATE_ENTER_READ();
1148 			st = pf_find_state_byid(&id_key);
1149 			pf_state_ref(st);
1150 			PF_STATE_EXIT_READ();
1151 			if (st == NULL) {
1152 				pfsyncstat_inc(pfsyncs_badstate);
1153 				continue;
1154 			}
1155 			if (ISSET(st->state_flags, PFSTATE_NOSYNC)) {
1156 				pf_state_unref(st);
1157 				continue;
1158 			}
1159 
1160 			pfsync_update_state_req(st);
1161 			pf_state_unref(st);
1162 		}
1163 	}
1164 
1165 	return (0);
1166 }
1167 
1168 int
1169 pfsync_in_del(caddr_t buf, int len, int count, int flags)
1170 {
1171 	struct pfsync_state *sp;
1172 	struct pf_state_cmp id_key;
1173 	struct pf_state *st;
1174 	int i;
1175 
1176 	PF_STATE_ENTER_WRITE();
1177 	for (i = 0; i < count; i++) {
1178 		sp = (struct pfsync_state *)(buf + len * i);
1179 
1180 		id_key.id = sp->id;
1181 		id_key.creatorid = sp->creatorid;
1182 
1183 		st = pf_find_state_byid(&id_key);
1184 		if (st == NULL) {
1185 			pfsyncstat_inc(pfsyncs_badstate);
1186 			continue;
1187 		}
1188 		SET(st->state_flags, PFSTATE_NOSYNC);
1189 		pf_remove_state(st);
1190 	}
1191 	PF_STATE_EXIT_WRITE();
1192 
1193 	return (0);
1194 }
1195 
1196 int
1197 pfsync_in_del_c(caddr_t buf, int len, int count, int flags)
1198 {
1199 	struct pfsync_del_c *sp;
1200 	struct pf_state_cmp id_key;
1201 	struct pf_state *st;
1202 	int i;
1203 
1204 	PF_LOCK();
1205 	PF_STATE_ENTER_WRITE();
1206 	for (i = 0; i < count; i++) {
1207 		sp = (struct pfsync_del_c *)(buf + len * i);
1208 
1209 		id_key.id = sp->id;
1210 		id_key.creatorid = sp->creatorid;
1211 
1212 		st = pf_find_state_byid(&id_key);
1213 		if (st == NULL) {
1214 			pfsyncstat_inc(pfsyncs_badstate);
1215 			continue;
1216 		}
1217 
1218 		SET(st->state_flags, PFSTATE_NOSYNC);
1219 		pf_remove_state(st);
1220 	}
1221 	PF_STATE_EXIT_WRITE();
1222 	PF_UNLOCK();
1223 
1224 	return (0);
1225 }
1226 
1227 int
1228 pfsync_in_bus(caddr_t buf, int len, int count, int flags)
1229 {
1230 	struct pfsync_softc *sc = pfsyncif;
1231 	struct pfsync_bus *bus;
1232 
1233 	/* If we're not waiting for a bulk update, who cares. */
1234 	if (sc->sc_ureq_sent == 0)
1235 		return (0);
1236 
1237 	bus = (struct pfsync_bus *)buf;
1238 
1239 	switch (bus->status) {
1240 	case PFSYNC_BUS_START:
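		/*
		 * Give the bulk update 4 seconds plus roughly one tick per
		 * packet's worth of states before declaring it failed.
		 */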
1241 		timeout_add(&sc->sc_bulkfail_tmo, 4 * hz +
1242 		    pf_pool_limits[PF_LIMIT_STATES].limit /
1243 		    ((sc->sc_if.if_mtu - PFSYNC_MINPKT) /
1244 		    sizeof(struct pfsync_state)));
1245 		DPFPRINTF(LOG_INFO, "received bulk update start");
1246 		break;
1247 
1248 	case PFSYNC_BUS_END:
1249 		if (getuptime() - ntohl(bus->endtime) >=
1250 		    sc->sc_ureq_sent) {
1251 			/* that's it, we're happy */
1252 			sc->sc_ureq_sent = 0;
1253 			sc->sc_bulk_tries = 0;
1254 			timeout_del(&sc->sc_bulkfail_tmo);
1255 #if NCARP > 0
1256 			if (!pfsync_sync_ok)
1257 				carp_group_demote_adj(&sc->sc_if, -1,
1258 				    sc->sc_link_demoted ?
1259 				    "pfsync link state up" :
1260 				    "pfsync bulk done");
1261 			if (sc->sc_initial_bulk) {
1262 				carp_group_demote_adj(&sc->sc_if, -32,
1263 				    "pfsync init");
1264 				sc->sc_initial_bulk = 0;
1265 			}
1266 #endif
1267 			pfsync_sync_ok = 1;
1268 			sc->sc_link_demoted = 0;
1269 			DPFPRINTF(LOG_INFO, "received valid bulk update end");
1270 		} else {
1271 			DPFPRINTF(LOG_WARNING, "received invalid "
1272 			    "bulk update end: bad timestamp");
1273 		}
1274 		break;
1275 	}
1276 
1277 	return (0);
1278 }
1279 
1280 int
1281 pfsync_in_tdb(caddr_t buf, int len, int count, int flags)
1282 {
1283 #if defined(IPSEC)
1284 	struct pfsync_tdb *tp;
1285 	int i;
1286 
1287 	for (i = 0; i < count; i++) {
1288 		tp = (struct pfsync_tdb *)(buf + len * i);
1289 		pfsync_update_net_tdb(tp);
1290 	}
1291 #endif
1292 
1293 	return (0);
1294 }
1295 
1296 #if defined(IPSEC)
1297 /* Update an in-kernel tdb. Silently fail if no tdb is found. */
1298 void
1299 pfsync_update_net_tdb(struct pfsync_tdb *pt)
1300 {
1301 	struct tdb		*tdb;
1302 
1303 	NET_ASSERT_LOCKED();
1304 
1305 	/* check for invalid values */
1306 	if (ntohl(pt->spi) <= SPI_RESERVED_MAX ||
1307 	    (pt->dst.sa.sa_family != AF_INET &&
1308 	     pt->dst.sa.sa_family != AF_INET6))
1309 		goto bad;
1310 
1311 	tdb = gettdb(ntohs(pt->rdomain), pt->spi,
1312 	    (union sockaddr_union *)&pt->dst, pt->sproto);
1313 	if (tdb) {
1314 		pt->rpl = betoh64(pt->rpl);
1315 		pt->cur_bytes = betoh64(pt->cur_bytes);
1316 
1317 		/* Neither replay nor byte counter should ever decrease. */
1318 		if (pt->rpl < tdb->tdb_rpl ||
1319 		    pt->cur_bytes < tdb->tdb_cur_bytes) {
1320 			tdb_unref(tdb);
1321 			goto bad;
1322 		}
1323 
1324 		tdb->tdb_rpl = pt->rpl;
1325 		tdb->tdb_cur_bytes = pt->cur_bytes;
1326 		tdb_unref(tdb);
1327 	}
1328 	return;
1329 
1330  bad:
1331 	DPFPRINTF(LOG_WARNING, "pfsync_insert: PFSYNC_ACT_TDB_UPD: "
1332 	    "invalid value");
1333 	pfsyncstat_inc(pfsyncs_badstate);
1334 	return;
1335 }
1336 #endif
1337 
1338 
1339 int
1340 pfsync_in_eof(caddr_t buf, int len, int count, int flags)
1341 {
1342 	if (len > 0 || count > 0)
1343 		pfsyncstat_inc(pfsyncs_badact);
1344 
1345 	/* we're done. let the caller return */
1346 	return (1);
1347 }
1348 
1349 int
1350 pfsync_in_error(caddr_t buf, int len, int count, int flags)
1351 {
1352 	pfsyncstat_inc(pfsyncs_badact);
1353 	return (-1);
1354 }
1355 
1356 int
1357 pfsyncoutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
1358 	struct rtentry *rt)
1359 {
1360 	m_freem(m);	/* drop packet */
1361 	return (EAFNOSUPPORT);
1362 }
1363 
1364 int
1365 pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
1366 {
1367 	struct proc *p = curproc;
1368 	struct pfsync_softc *sc = ifp->if_softc;
1369 	struct ifreq *ifr = (struct ifreq *)data;
1370 	struct ip_moptions *imo = &sc->sc_imo;
1371 	struct pfsyncreq pfsyncr;
1372 	struct ifnet *ifp0, *sifp;
1373 	struct ip *ip;
1374 	int error;
1375 
1376 	switch (cmd) {
1377 	case SIOCSIFFLAGS:
1378 		if ((ifp->if_flags & IFF_RUNNING) == 0 &&
1379 		    (ifp->if_flags & IFF_UP)) {
1380 			ifp->if_flags |= IFF_RUNNING;
1381 
1382 #if NCARP > 0
1383 			sc->sc_initial_bulk = 1;
1384 			carp_group_demote_adj(&sc->sc_if, 32, "pfsync init");
1385 #endif
1386 
1387 			pfsync_request_full_update(sc);
1388 		}
1389 		if ((ifp->if_flags & IFF_RUNNING) &&
1390 		    (ifp->if_flags & IFF_UP) == 0) {
1391 			ifp->if_flags &= ~IFF_RUNNING;
1392 
1393 			/* drop everything */
1394 			timeout_del(&sc->sc_tmo);
1395 			pfsync_drop(sc);
1396 
1397 			pfsync_cancel_full_update(sc);
1398 		}
1399 		break;
1400 	case SIOCSIFMTU:
1401 		if ((ifp0 = if_get(sc->sc_sync_ifidx)) == NULL)
1402 			return (EINVAL);
1403 		error = 0;
1404 		if (ifr->ifr_mtu <= PFSYNC_MINPKT ||
1405 		    ifr->ifr_mtu > ifp0->if_mtu) {
1406 			error = EINVAL;
1407 		}
1408 		if_put(ifp0);
1409 		if (error)
1410 			return error;
1411 		if (ifr->ifr_mtu < ifp->if_mtu)
1412 			pfsync_sendout();
1413 		ifp->if_mtu = ifr->ifr_mtu;
1414 		break;
1415 	case SIOCGETPFSYNC:
1416 		bzero(&pfsyncr, sizeof(pfsyncr));
1417 		if ((ifp0 = if_get(sc->sc_sync_ifidx)) != NULL) {
1418 			strlcpy(pfsyncr.pfsyncr_syncdev,
1419 			    ifp0->if_xname, IFNAMSIZ);
1420 		}
1421 		if_put(ifp0);
1422 		pfsyncr.pfsyncr_syncpeer = sc->sc_sync_peer;
1423 		pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates;
1424 		pfsyncr.pfsyncr_defer = sc->sc_defer;
1425 		return (copyout(&pfsyncr, ifr->ifr_data, sizeof(pfsyncr)));
1426 
1427 	case SIOCSETPFSYNC:
1428 		if ((error = suser(p)) != 0)
1429 			return (error);
1430 		if ((error = copyin(ifr->ifr_data, &pfsyncr, sizeof(pfsyncr))))
1431 			return (error);
1432 
1433 		if (pfsyncr.pfsyncr_syncpeer.s_addr == 0)
1434 			sc->sc_sync_peer.s_addr = INADDR_PFSYNC_GROUP;
1435 		else
1436 			sc->sc_sync_peer.s_addr =
1437 			    pfsyncr.pfsyncr_syncpeer.s_addr;
1438 
1439 		if (pfsyncr.pfsyncr_maxupdates > 255)
1440 			return (EINVAL);
1441 		sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates;
1442 
1443 		sc->sc_defer = pfsyncr.pfsyncr_defer;
1444 
1445 		if (pfsyncr.pfsyncr_syncdev[0] == 0) {
1446 			if ((ifp0 = if_get(sc->sc_sync_ifidx)) != NULL) {
1447 				if_linkstatehook_del(ifp0, &sc->sc_ltask);
1448 				if_detachhook_del(ifp0, &sc->sc_dtask);
1449 			}
1450 			if_put(ifp0);
1451 			sc->sc_sync_ifidx = 0;
1452 			if (imo->imo_num_memberships > 0) {
1453 				in_delmulti(imo->imo_membership[
1454 				    --imo->imo_num_memberships]);
1455 				imo->imo_ifidx = 0;
1456 			}
1457 			break;
1458 		}
1459 
1460 		if ((sifp = if_unit(pfsyncr.pfsyncr_syncdev)) == NULL)
1461 			return (EINVAL);
1462 
1463 		ifp0 = if_get(sc->sc_sync_ifidx);
1464 
1465 		if (sifp->if_mtu < sc->sc_if.if_mtu || (ifp0 != NULL &&
1466 		    sifp->if_mtu < ifp0->if_mtu) ||
1467 		    sifp->if_mtu < MCLBYTES - sizeof(struct ip))
1468 			pfsync_sendout();
1469 
1470 		if (ifp0) {
1471 			if_linkstatehook_del(ifp0, &sc->sc_ltask);
1472 			if_detachhook_del(ifp0, &sc->sc_dtask);
1473 		}
1474 		if_put(ifp0);
1475 		sc->sc_sync_ifidx = sifp->if_index;
1476 
1477 		if (imo->imo_num_memberships > 0) {
1478 			in_delmulti(imo->imo_membership[--imo->imo_num_memberships]);
1479 			imo->imo_ifidx = 0;
1480 		}
1481 
1482 		if (sc->sc_sync_peer.s_addr == INADDR_PFSYNC_GROUP) {
1483 			struct in_addr addr;
1484 
1485 			if (!(sifp->if_flags & IFF_MULTICAST)) {
1486 				sc->sc_sync_ifidx = 0;
1487 				if_put(sifp);
1488 				return (EADDRNOTAVAIL);
1489 			}
1490 
1491 			addr.s_addr = INADDR_PFSYNC_GROUP;
1492 
1493 			if ((imo->imo_membership[0] =
1494 			    in_addmulti(&addr, sifp)) == NULL) {
1495 				sc->sc_sync_ifidx = 0;
1496 				if_put(sifp);
1497 				return (ENOBUFS);
1498 			}
1499 			imo->imo_num_memberships++;
1500 			imo->imo_ifidx = sc->sc_sync_ifidx;
1501 			imo->imo_ttl = PFSYNC_DFLTTL;
1502 			imo->imo_loop = 0;
1503 		}
1504 
1505 		ip = &sc->sc_template;
1506 		bzero(ip, sizeof(*ip));
1507 		ip->ip_v = IPVERSION;
1508 		ip->ip_hl = sizeof(sc->sc_template) >> 2;
1509 		ip->ip_tos = IPTOS_LOWDELAY;
1510 		/* len and id are set later */
1511 		ip->ip_off = htons(IP_DF);
1512 		ip->ip_ttl = PFSYNC_DFLTTL;
1513 		ip->ip_p = IPPROTO_PFSYNC;
1514 		ip->ip_src.s_addr = INADDR_ANY;
1515 		ip->ip_dst.s_addr = sc->sc_sync_peer.s_addr;
1516 
1517 		if_linkstatehook_add(sifp, &sc->sc_ltask);
1518 		if_detachhook_add(sifp, &sc->sc_dtask);
1519 		if_put(sifp);
1520 
1521 		pfsync_request_full_update(sc);
1522 
1523 		break;
1524 
1525 	default:
1526 		return (ENOTTY);
1527 	}
1528 
1529 	return (0);
1530 }
1531 
1532 void
1533 pfsync_out_state(struct pf_state *st, void *buf)
1534 {
1535 	struct pfsync_state *sp = buf;
1536 
1537 	pfsync_state_export(sp, st);
1538 }
1539 
1540 void
1541 pfsync_out_iack(struct pf_state *st, void *buf)
1542 {
1543 	struct pfsync_ins_ack *iack = buf;
1544 
1545 	iack->id = st->id;
1546 	iack->creatorid = st->creatorid;
1547 }
1548 
1549 void
1550 pfsync_out_upd_c(struct pf_state *st, void *buf)
1551 {
1552 	struct pfsync_upd_c *up = buf;
1553 
1554 	bzero(up, sizeof(*up));
1555 	up->id = st->id;
1556 	pf_state_peer_hton(&st->src, &up->src);
1557 	pf_state_peer_hton(&st->dst, &up->dst);
1558 	up->creatorid = st->creatorid;
1559 	up->timeout = st->timeout;
1560 }
1561 
1562 void
1563 pfsync_out_del(struct pf_state *st, void *buf)
1564 {
1565 	struct pfsync_del_c *dp = buf;
1566 
1567 	dp->id = st->id;
1568 	dp->creatorid = st->creatorid;
1569 
1570 	SET(st->state_flags, PFSTATE_NOSYNC);
1571 }
1572 
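/*
 * Atomically move all pending state updates, update requests and TDB
 * entries onto a private snapshot so they can be serialized into a
 * packet without the softc mutexes held.
 */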
1573 void
1574 pfsync_grab_snapshot(struct pfsync_snapshot *sn, struct pfsync_softc *sc)
1575 {
1576 	int q;
1577 	struct pf_state *st;
1578 	struct pfsync_upd_req_item *ur;
1579 	struct tdb *tdb;
1580 
1581 	sn->sn_sc = sc;
1582 
1583 	mtx_enter(&sc->sc_st_mtx);
1584 	mtx_enter(&sc->sc_upd_req_mtx);
1585 	mtx_enter(&sc->sc_tdb_mtx);
1586 
1587 	for (q = 0; q < PFSYNC_S_COUNT; q++) {
1588 		TAILQ_INIT(&sn->sn_qs[q]);
1589 
1590 		while ((st = TAILQ_FIRST(&sc->sc_qs[q])) != NULL) {
1591 			KASSERT(st->snapped == 0);
1592 			TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
1593 			TAILQ_INSERT_TAIL(&sn->sn_qs[q], st, sync_snap);
1594 			st->snapped = 1;
1595 		}
1596 	}
1597 
1598 	TAILQ_INIT(&sn->sn_upd_req_list);
1599 	while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
1600 		TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);
1601 		TAILQ_INSERT_TAIL(&sn->sn_upd_req_list, ur, ur_snap);
1602 	}
1603 
1604 	TAILQ_INIT(&sn->sn_tdb_q);
1605 	while ((tdb = TAILQ_FIRST(&sc->sc_tdb_q)) != NULL) {
1606 		TAILQ_REMOVE(&sc->sc_tdb_q, tdb, tdb_sync_entry);
1607 		TAILQ_INSERT_TAIL(&sn->sn_tdb_q, tdb, tdb_sync_snap);
1608 
1609 		mtx_enter(&tdb->tdb_mtx);
1610 		KASSERT(!ISSET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED));
1611 		SET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED);
1612 		mtx_leave(&tdb->tdb_mtx);
1613 	}
1614 
1615 	sn->sn_len = sc->sc_len;
1616 	sc->sc_len = PFSYNC_MINPKT;
1617 
1618 	sn->sn_plus = sc->sc_plus;
1619 	sc->sc_plus = NULL;
1620 	sn->sn_pluslen = sc->sc_pluslen;
1621 	sc->sc_pluslen = 0;
1622 
1623 	mtx_leave(&sc->sc_tdb_mtx);
1624 	mtx_leave(&sc->sc_upd_req_mtx);
1625 	mtx_leave(&sc->sc_st_mtx);
1626 }
1627 
1628 void
1629 pfsync_drop_snapshot(struct pfsync_snapshot *sn)
1630 {
1631 	struct pf_state *st;
1632 	struct pfsync_upd_req_item *ur;
1633 	struct tdb *t;
1634 	int q;
1635 
1636 	for (q = 0; q < PFSYNC_S_COUNT; q++) {
1637 		if (TAILQ_EMPTY(&sn->sn_qs[q]))
1638 			continue;
1639 
1640 		while ((st = TAILQ_FIRST(&sn->sn_qs[q])) != NULL) {
1641 			KASSERT(st->sync_state == q);
1642 			KASSERT(st->snapped == 1);
1643 			TAILQ_REMOVE(&sn->sn_qs[q], st, sync_snap);
1644 			st->sync_state = PFSYNC_S_NONE;
1645 			st->snapped = 0;
1646 			pf_state_unref(st);
1647 		}
1648 	}
1649 
1650 	while ((ur = TAILQ_FIRST(&sn->sn_upd_req_list)) != NULL) {
1651 		TAILQ_REMOVE(&sn->sn_upd_req_list, ur, ur_snap);
1652 		pool_put(&sn->sn_sc->sc_pool, ur);
1653 	}
1654 
1655 	while ((t = TAILQ_FIRST(&sn->sn_tdb_q)) != NULL) {
1656 		TAILQ_REMOVE(&sn->sn_tdb_q, t, tdb_sync_snap);
1657 		mtx_enter(&t->tdb_mtx);
1658 		KASSERT(ISSET(t->tdb_flags, TDBF_PFSYNC_SNAPPED));
1659 		CLR(t->tdb_flags, TDBF_PFSYNC_SNAPPED);
1660 		CLR(t->tdb_flags, TDBF_PFSYNC);
1661 		mtx_leave(&t->tdb_mtx);
1662 	}
1663 }
1664 
1665 int
1666 pfsync_is_snapshot_empty(struct pfsync_snapshot *sn)
1667 {
1668 	int	q;
1669 
1670 	for (q = 0; q < PFSYNC_S_COUNT; q++)
1671 		if (!TAILQ_EMPTY(&sn->sn_qs[q]))
1672 			return (0);
1673 
1674 	if (!TAILQ_EMPTY(&sn->sn_upd_req_list))
1675 		return (0);
1676 
1677 	if (!TAILQ_EMPTY(&sn->sn_tdb_q))
1678 		return (0);
1679 
1680 	return (sn->sn_plus == NULL);
1681 }
1682 
1683 void
1684 pfsync_drop(struct pfsync_softc *sc)
1685 {
1686 	struct pfsync_snapshot	sn;
1687 
1688 	pfsync_grab_snapshot(&sn, sc);
1689 	pfsync_drop_snapshot(&sn);
1690 }
1691 
1692 void
1693 pfsync_send_dispatch(void *xmq)
1694 {
1695 	struct mbuf_queue *mq = xmq;
1696 	struct pfsync_softc *sc;
1697 	struct mbuf *m;
1698 	struct mbuf_list ml;
1699 	int error;
1700 
1701 	mq_delist(mq, &ml);
1702 	if (ml_empty(&ml))
1703 		return;
1704 
1705 	NET_LOCK();
1706 	sc = pfsyncif;
1707 	if (sc == NULL) {
1708 		ml_purge(&ml);
1709 		goto done;
1710 	}
1711 
1712 	while ((m = ml_dequeue(&ml)) != NULL) {
1713 		if ((error = ip_output(m, NULL, NULL, IP_RAWOUTPUT,
1714 		    &sc->sc_imo, NULL, 0)) == 0)
1715 			pfsyncstat_inc(pfsyncs_opackets);
1716 		else {
1717 			DPFPRINTF(LOG_DEBUG,
1718 			    "ip_output() @ %s failed (%d)\n", __func__, error);
1719 			pfsyncstat_inc(pfsyncs_oerrors);
1720 		}
1721 	}
1722 done:
1723 	NET_UNLOCK();
1724 }
1725 
1726 void
1727 pfsync_send_pkt(struct mbuf *m)
1728 {
1729 	if (mq_enqueue(&pfsync_mq, m) != 0) {
1730 		pfsyncstat_inc(pfsyncs_oerrors);
1731 		DPFPRINTF(LOG_DEBUG, "mq_enqueue() @ %s failed, queue full\n",
1732 		    __func__);
1733 	} else
1734 		task_add(net_tq(0), &pfsync_task);
1735 }
1736 
1737 void
1738 pfsync_sendout(void)
1739 {
1740 	struct pfsync_snapshot sn;
1741 	struct pfsync_softc *sc = pfsyncif;
1742 #if NBPFILTER > 0
1743 	struct ifnet *ifp = &sc->sc_if;
1744 #endif
1745 	struct mbuf *m;
1746 	struct ip *ip;
1747 	struct pfsync_header *ph;
1748 	struct pfsync_subheader *subh;
1749 	struct pf_state *st;
1750 	struct pfsync_upd_req_item *ur;
1751 	struct tdb *t;
1752 	int offset;
1753 	int q, count = 0;
1754 
1755 	if (sc == NULL || sc->sc_len == PFSYNC_MINPKT)
1756 		return;
1757 
1758 	if (!ISSET(sc->sc_if.if_flags, IFF_RUNNING) ||
1759 #if NBPFILTER > 0
1760 	    (ifp->if_bpf == NULL && sc->sc_sync_ifidx == 0)) {
1761 #else
1762 	    sc->sc_sync_ifidx == 0) {
1763 #endif
1764 		pfsync_drop(sc);
1765 		return;
1766 	}
1767 
1768 	pfsync_grab_snapshot(&sn, sc);
1769 
1770 	/*
1771 	 * The check below is sufficient to prevent us from sending empty
1772 	 * packets, but it does not stop us from sending short packets.
1773 	 */
1774 	if (pfsync_is_snapshot_empty(&sn))
1775 		return;
1776 
1777 	MGETHDR(m, M_DONTWAIT, MT_DATA);
1778 	if (m == NULL) {
1779 		sc->sc_if.if_oerrors++;
1780 		pfsyncstat_inc(pfsyncs_onomem);
1781 		pfsync_drop_snapshot(&sn);
1782 		return;
1783 	}
1784 
1785 	if (max_linkhdr + sn.sn_len > MHLEN) {
1786 		MCLGETL(m, M_DONTWAIT, max_linkhdr + sn.sn_len);
1787 		if (!ISSET(m->m_flags, M_EXT)) {
1788 			m_free(m);
1789 			sc->sc_if.if_oerrors++;
1790 			pfsyncstat_inc(pfsyncs_onomem);
1791 			pfsync_drop_snapshot(&sn);
1792 			return;
1793 		}
1794 	}
1795 	m->m_data += max_linkhdr;
1796 	m->m_len = m->m_pkthdr.len = sn.sn_len;
1797 
1798 	/* build the ip header */
1799 	ip = mtod(m, struct ip *);
1800 	bcopy(&sc->sc_template, ip, sizeof(*ip));
1801 	offset = sizeof(*ip);
1802 
1803 	ip->ip_len = htons(m->m_pkthdr.len);
1804 	ip->ip_id = htons(ip_randomid());
1805 
1806 	/* build the pfsync header */
1807 	ph = (struct pfsync_header *)(m->m_data + offset);
1808 	bzero(ph, sizeof(*ph));
1809 	offset += sizeof(*ph);
1810 
1811 	ph->version = PFSYNC_VERSION;
1812 	ph->len = htons(sn.sn_len - sizeof(*ip));
1813 	bcopy(pf_status.pf_chksum, ph->pfcksum, PF_MD5_DIGEST_LENGTH);
1814 
1815 	if (!TAILQ_EMPTY(&sn.sn_upd_req_list)) {
1816 		subh = (struct pfsync_subheader *)(m->m_data + offset);
1817 		offset += sizeof(*subh);
1818 
1819 		count = 0;
1820 		while ((ur = TAILQ_FIRST(&sn.sn_upd_req_list)) != NULL) {
1821 			TAILQ_REMOVE(&sn.sn_upd_req_list, ur, ur_snap);
1822 
1823 			bcopy(&ur->ur_msg, m->m_data + offset,
1824 			    sizeof(ur->ur_msg));
1825 			offset += sizeof(ur->ur_msg);
1826 
1827 			pool_put(&sc->sc_pool, ur);
1828 
1829 			count++;
1830 		}
1831 
1832 		bzero(subh, sizeof(*subh));
1833 		subh->len = sizeof(ur->ur_msg) >> 2;
1834 		subh->action = PFSYNC_ACT_UPD_REQ;
1835 		subh->count = htons(count);
1836 	}
1837 
1838 	/* has someone built a custom region for us to add? */
1839 	if (sn.sn_plus != NULL) {
1840 		bcopy(sn.sn_plus, m->m_data + offset, sn.sn_pluslen);
1841 		offset += sn.sn_pluslen;
1842 		sn.sn_plus = NULL;	/* XXX memory leak ? */
1843 	}
1844 
1845 	if (!TAILQ_EMPTY(&sn.sn_tdb_q)) {
1846 		subh = (struct pfsync_subheader *)(m->m_data + offset);
1847 		offset += sizeof(*subh);
1848 
1849 		count = 0;
1850 		while ((t = TAILQ_FIRST(&sn.sn_tdb_q)) != NULL) {
1851 			TAILQ_REMOVE(&sn.sn_tdb_q, t, tdb_sync_snap);
1852 			pfsync_out_tdb(t, m->m_data + offset);
1853 			offset += sizeof(struct pfsync_tdb);
1854 			mtx_enter(&t->tdb_mtx);
1855 			KASSERT(ISSET(t->tdb_flags, TDBF_PFSYNC_SNAPPED));
1856 			CLR(t->tdb_flags, TDBF_PFSYNC_SNAPPED);
1857 			CLR(t->tdb_flags, TDBF_PFSYNC);
1858 			mtx_leave(&t->tdb_mtx);
1859 			tdb_unref(t);
1860 			count++;
1861 		}
1862 
1863 		bzero(subh, sizeof(*subh));
1864 		subh->action = PFSYNC_ACT_TDB;
1865 		subh->len = sizeof(struct pfsync_tdb) >> 2;
1866 		subh->count = htons(count);
1867 	}
1868 
1869 	/* walk the queues */
1870 	for (q = 0; q < PFSYNC_S_COUNT; q++) {
1871 		if (TAILQ_EMPTY(&sn.sn_qs[q]))
1872 			continue;
1873 
1874 		subh = (struct pfsync_subheader *)(m->m_data + offset);
1875 		offset += sizeof(*subh);
1876 
1877 		count = 0;
1878 		while ((st = TAILQ_FIRST(&sn.sn_qs[q])) != NULL) {
1879 			TAILQ_REMOVE(&sn.sn_qs[q], st, sync_snap);
1880 			KASSERT(st->sync_state == q);
1881 			KASSERT(st->snapped == 1);
1882 			st->sync_state = PFSYNC_S_NONE;
1883 			st->snapped = 0;
1884 			pfsync_qs[q].write(st, m->m_data + offset);
1885 			offset += pfsync_qs[q].len;
1886 
1887 			pf_state_unref(st);
1888 			count++;
1889 		}
1890 
1891 		bzero(subh, sizeof(*subh));
1892 		subh->action = pfsync_qs[q].action;
1893 		subh->len = pfsync_qs[q].len >> 2;
1894 		subh->count = htons(count);
1895 	}
1896 
1897 	/* we're done, let's put it on the wire */
1898 #if NBPFILTER > 0
1899 	if (ifp->if_bpf) {
1900 		m->m_data += sizeof(*ip);
1901 		m->m_len = m->m_pkthdr.len = sn.sn_len - sizeof(*ip);
1902 		bpf_mtap(ifp->if_bpf, m, BPF_DIRECTION_OUT);
1903 		m->m_data -= sizeof(*ip);
1904 		m->m_len = m->m_pkthdr.len = sn.sn_len;
1905 	}
1906 
1907 	if (sc->sc_sync_ifidx == 0) {
1908 		sc->sc_len = PFSYNC_MINPKT;
1909 		m_freem(m);
1910 		return;
1911 	}
1912 #endif
1913 
1914 	sc->sc_if.if_opackets++;
1915 	sc->sc_if.if_obytes += m->m_pkthdr.len;
1916 
1917 	m->m_pkthdr.ph_rtableid = sc->sc_if.if_rdomain;
1918 
1919 	pfsync_send_pkt(m);
1920 }
1921 
1922 void
1923 pfsync_insert_state(struct pf_state *st)
1924 {
1925 	struct pfsync_softc *sc = pfsyncif;
1926 
1927 	NET_ASSERT_LOCKED();
1928 
1929 	if (ISSET(st->rule.ptr->rule_flag, PFRULE_NOSYNC) ||
1930 	    st->key[PF_SK_WIRE]->proto == IPPROTO_PFSYNC) {
1931 		SET(st->state_flags, PFSTATE_NOSYNC);
1932 		return;
1933 	}
1934 
1935 	if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING) ||
1936 	    ISSET(st->state_flags, PFSTATE_NOSYNC))
1937 		return;
1938 
1939 	KASSERT(st->sync_state == PFSYNC_S_NONE);
1940 
1941 	if (sc->sc_len == PFSYNC_MINPKT)
1942 		timeout_add_sec(&sc->sc_tmo, 1);
1943 
1944 	pfsync_q_ins(st, PFSYNC_S_INS);
1945 
1946 	st->sync_updates = 0;
1947 }
1948 
1949 int
1950 pfsync_defer(struct pf_state *st, struct mbuf *m, struct pfsync_deferral **ppd)
1951 {
1952 	struct pfsync_softc *sc = pfsyncif;
1953 	struct pfsync_deferral *pd;
1954 	unsigned int sched;
1955 
1956 	NET_ASSERT_LOCKED();
1957 
1958 	if (!sc->sc_defer ||
1959 	    ISSET(st->state_flags, PFSTATE_NOSYNC) ||
1960 	    m->m_flags & (M_BCAST|M_MCAST))
1961 		return (0);
1962 
1963 	pd = pool_get(&sc->sc_pool, PR_NOWAIT);
1964 	if (pd == NULL)
1965 		return (0);
1966 
1967 	/*
1968 	 * The deferral queue can grow faster than the timeout can drain it,
1969 	 * so we ask the caller (the packet path) to help the timer and
1970 	 * dispatch one deferral for us.
1971 	 *
1972 	 * We would like to call pfsync_undefer() here. Unfortunately we
1973 	 * can't, because pfsync_undefer() calls ip_output(), which in turn
1974 	 * calls pf_test(), which would then attempt to grab the PF_LOCK()
1975 	 * we currently hold.
1976 	 */
1977 	if (sc->sc_deferred >= 128) {
1978 		mtx_enter(&sc->sc_deferrals_mtx);
1979 		*ppd = TAILQ_FIRST(&sc->sc_deferrals);
1980 		if (*ppd != NULL) {
1981 			TAILQ_REMOVE(&sc->sc_deferrals, *ppd, pd_entry);
1982 			sc->sc_deferred--;
1983 		}
1984 		mtx_leave(&sc->sc_deferrals_mtx);
1985 	} else
1986 		*ppd = NULL;
1987 
1988 	m->m_pkthdr.pf.flags |= PF_TAG_GENERATED;
1989 	SET(st->state_flags, PFSTATE_ACK);
1990 
1991 	pd->pd_st = pf_state_ref(st);
1992 	pd->pd_m = m;
1993 
1994 	pd->pd_deadline = getnsecuptime() + PFSYNC_DEFER_NSEC;
1995 
1996 	mtx_enter(&sc->sc_deferrals_mtx);
1997 	sched = TAILQ_EMPTY(&sc->sc_deferrals);
1998 
1999 	TAILQ_INSERT_TAIL(&sc->sc_deferrals, pd, pd_entry);
2000 	sc->sc_deferred++;
2001 	mtx_leave(&sc->sc_deferrals_mtx);
2002 
2003 	if (sched)
2004 		timeout_add_nsec(&sc->sc_deferrals_tmo, PFSYNC_DEFER_NSEC);
2005 
2006 	schednetisr(NETISR_PFSYNC);
2007 
2008 	return (1);
2009 }
2010 
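/*
 * Transmit a deferred packet, via pf_route()/pf_route6() for
 * route-to states or directly through ip_output()/ip6_output().
 */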
2011 void
2012 pfsync_undefer_notify(struct pfsync_deferral *pd)
2013 {
2014 	struct pf_pdesc pdesc;
2015 	struct pf_state *st = pd->pd_st;
2016 
2017 	/*
2018 	 * pf_remove_state removes the state keys and sets st->timeout
2019 	 * to PFTM_UNLINKED. this is done under NET_LOCK which should
2020 	 * be held here, so we can use PFTM_UNLINKED as a test for
2021 	 * whether the state keys are set for the address family
2022 	 * lookup.
2023 	 */
2024 
2025 	if (st->timeout == PFTM_UNLINKED)
2026 		return;
2027 
2028 	if (st->rt == PF_ROUTETO) {
2029 		if (pf_setup_pdesc(&pdesc, st->key[PF_SK_WIRE]->af,
2030 		    st->direction, st->kif, pd->pd_m, NULL) != PF_PASS)
2031 			return;
2032 		switch (st->key[PF_SK_WIRE]->af) {
2033 		case AF_INET:
2034 			pf_route(&pdesc, st);
2035 			break;
2036 #ifdef INET6
2037 		case AF_INET6:
2038 			pf_route6(&pdesc, st);
2039 			break;
2040 #endif /* INET6 */
2041 		default:
2042 			unhandled_af(st->key[PF_SK_WIRE]->af);
2043 		}
2044 		pd->pd_m = pdesc.m;
2045 	} else {
2046 		switch (st->key[PF_SK_WIRE]->af) {
2047 		case AF_INET:
2048 			ip_output(pd->pd_m, NULL, NULL, 0, NULL, NULL, 0);
2049 			break;
2050 #ifdef INET6
2051 		case AF_INET6:
2052 			ip6_output(pd->pd_m, NULL, NULL, 0, NULL, NULL);
2053 			break;
2054 #endif /* INET6 */
2055 		default:
2056 			unhandled_af(st->key[PF_SK_WIRE]->af);
2057 		}
2058 
2059 		pd->pd_m = NULL;
2060 	}
2061 }
2062 
2063 void
2064 pfsync_free_deferral(struct pfsync_deferral *pd)
2065 {
2066 	struct pfsync_softc *sc = pfsyncif;
2067 
2068 	pf_state_unref(pd->pd_st);
2069 	m_freem(pd->pd_m);
2070 	pool_put(&sc->sc_pool, pd);
2071 }
2072 
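/*
 * Finish a deferral: clear PFSTATE_ACK, send the held packet unless
 * asked to drop it, then free the deferral.
 */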
2073 void
2074 pfsync_undefer(struct pfsync_deferral *pd, int drop)
2075 {
2076 	struct pfsync_softc *sc = pfsyncif;
2077 
2078 	NET_ASSERT_LOCKED();
2079 
2080 	if (sc == NULL)
2081 		return;
2082 
2083 	CLR(pd->pd_st->state_flags, PFSTATE_ACK);
2084 	if (!drop)
2085 		pfsync_undefer_notify(pd);
2086 
2087 	pfsync_free_deferral(pd);
2088 }
2089 
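/*
 * Deferral timeout: collect the deferrals whose deadline has passed,
 * re-arm the timeout for the next deadline, then send the expired
 * packets under NET_LOCK.
 */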
2090 void
2091 pfsync_deferrals_tmo(void *arg)
2092 {
2093 	struct pfsync_softc *sc = arg;
2094 	struct pfsync_deferral *pd;
2095 	uint64_t now, nsec = 0;
2096 	struct pfsync_deferrals pds = TAILQ_HEAD_INITIALIZER(pds);
2097 
2098 	now = getnsecuptime();
2099 
2100 	mtx_enter(&sc->sc_deferrals_mtx);
2101 	for (;;) {
2102 		pd = TAILQ_FIRST(&sc->sc_deferrals);
2103 		if (pd == NULL)
2104 			break;
2105 
2106 		if (now < pd->pd_deadline) {
2107 			nsec = pd->pd_deadline - now;
2108 			break;
2109 		}
2110 
2111 		TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
2112 		sc->sc_deferred--;
2113 		TAILQ_INSERT_TAIL(&pds, pd, pd_entry);
2114 	}
2115 	mtx_leave(&sc->sc_deferrals_mtx);
2116 
2117 	if (nsec > 0) {
2118 		/* we were looking at a pd, but it wasn't old enough */
2119 		timeout_add_nsec(&sc->sc_deferrals_tmo, nsec);
2120 	}
2121 
2122 	if (TAILQ_EMPTY(&pds))
2123 		return;
2124 
2125 	NET_LOCK();
2126 	while ((pd = TAILQ_FIRST(&pds)) != NULL) {
2127 		TAILQ_REMOVE(&pds, pd, pd_entry);
2128 
2129 		pfsync_undefer(pd, 0);
2130 	}
2131 	NET_UNLOCK();
2132 }
2133 
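/*
 * Look up the deferral for a state and complete it, sending or
 * dropping the held packet.
 */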
2134 void
2135 pfsync_deferred(struct pf_state *st, int drop)
2136 {
2137 	struct pfsync_softc *sc = pfsyncif;
2138 	struct pfsync_deferral *pd;
2139 
2140 	NET_ASSERT_LOCKED();
2141 
2142 	mtx_enter(&sc->sc_deferrals_mtx);
2143 	TAILQ_FOREACH(pd, &sc->sc_deferrals, pd_entry) {
2144 		 if (pd->pd_st == st) {
2145 			TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
2146 			sc->sc_deferred--;
2147 			break;
2148 		}
2149 	}
2150 	mtx_leave(&sc->sc_deferrals_mtx);
2151 
2152 	if (pd != NULL)
2153 		pfsync_undefer(pd, drop);
2154 }
2155 
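/*
 * Queue a compressed update for a state that has changed; kick the
 * netisr once enough updates have accumulated on a TCP state or the
 * state was announced within the last two seconds.
 */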
2156 void
2157 pfsync_update_state(struct pf_state *st)
2158 {
2159 	struct pfsync_softc *sc = pfsyncif;
2160 	int sync = 0;
2161 
2162 	NET_ASSERT_LOCKED();
2163 
2164 	if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING))
2165 		return;
2166 
2167 	if (ISSET(st->state_flags, PFSTATE_ACK))
2168 		pfsync_deferred(st, 0);
2169 	if (ISSET(st->state_flags, PFSTATE_NOSYNC)) {
2170 		if (st->sync_state != PFSYNC_S_NONE)
2171 			pfsync_q_del(st);
2172 		return;
2173 	}
2174 
2175 	if (sc->sc_len == PFSYNC_MINPKT)
2176 		timeout_add_sec(&sc->sc_tmo, 1);
2177 
2178 	switch (st->sync_state) {
2179 	case PFSYNC_S_UPD_C:
2180 	case PFSYNC_S_UPD:
2181 	case PFSYNC_S_INS:
2182 		/* we're already handling it */
2183 
2184 		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) {
2185 			st->sync_updates++;
2186 			if (st->sync_updates >= sc->sc_maxupdates)
2187 				sync = 1;
2188 		}
2189 		break;
2190 
2191 	case PFSYNC_S_IACK:
2192 		pfsync_q_del(st);
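		/* FALLTHROUGH */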
2193 	case PFSYNC_S_NONE:
2194 		pfsync_q_ins(st, PFSYNC_S_UPD_C);
2195 		st->sync_updates = 0;
2196 		break;
2197 
2198 	default:
2199 		panic("pfsync_update_state: unexpected sync state %d",
2200 		    st->sync_state);
2201 	}
2202 
2203 	if (sync || (getuptime() - st->pfsync_time) < 2)
2204 		schednetisr(NETISR_PFSYNC);
2205 }
2206 
2207 void
2208 pfsync_cancel_full_update(struct pfsync_softc *sc)
2209 {
2210 	if (timeout_pending(&sc->sc_bulkfail_tmo) ||
2211 	    timeout_pending(&sc->sc_bulk_tmo)) {
2212 #if NCARP > 0
2213 		if (!pfsync_sync_ok)
2214 			carp_group_demote_adj(&sc->sc_if, -1,
2215 			    "pfsync bulk cancelled");
2216 		if (sc->sc_initial_bulk) {
2217 			carp_group_demote_adj(&sc->sc_if, -32,
2218 			    "pfsync init");
2219 			sc->sc_initial_bulk = 0;
2220 		}
2221 #endif
2222 		pfsync_sync_ok = 1;
2223 		DPFPRINTF(LOG_INFO, "cancelling bulk update");
2224 	}
2225 	timeout_del(&sc->sc_bulkfail_tmo);
2226 	timeout_del(&sc->sc_bulk_tmo);
2227 	sc->sc_bulk_next = NULL;
2228 	sc->sc_bulk_last = NULL;
2229 	sc->sc_ureq_sent = 0;
2230 	sc->sc_bulk_tries = 0;
2231 }
2232 
2233 void
2234 pfsync_request_full_update(struct pfsync_softc *sc)
2235 {
2236 	if (sc->sc_sync_ifidx != 0 && ISSET(sc->sc_if.if_flags, IFF_RUNNING)) {
2237 		/* Request a full state table update. */
2238 		sc->sc_ureq_sent = getuptime();
2239 #if NCARP > 0
2240 		if (!sc->sc_link_demoted && pfsync_sync_ok)
2241 			carp_group_demote_adj(&sc->sc_if, 1,
2242 			    "pfsync bulk start");
2243 #endif
2244 		pfsync_sync_ok = 0;
2245 		DPFPRINTF(LOG_INFO, "requesting bulk update");
2246 		timeout_add(&sc->sc_bulkfail_tmo, 4 * hz +
2247 		    pf_pool_limits[PF_LIMIT_STATES].limit /
2248 		    ((sc->sc_if.if_mtu - PFSYNC_MINPKT) /
2249 		    sizeof(struct pfsync_state)));
2250 		pfsync_request_update(0, 0);
2251 	}
2252 }
2253 
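/*
 * Queue an update request for a single state, identified by creator
 * id and state id, flushing the pending packet first if the request
 * would not fit.
 */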
2254 void
2255 pfsync_request_update(u_int32_t creatorid, u_int64_t id)
2256 {
2257 	struct pfsync_softc *sc = pfsyncif;
2258 	struct pfsync_upd_req_item *item;
2259 	size_t nlen, sclen;
2260 	int retry;
2261 
2262 	/*
2263 	 * this code does nothing to prevent multiple update requests
2264 	 * from being generated for the same state.
2265 	 */
2266 
2267 	item = pool_get(&sc->sc_pool, PR_NOWAIT);
2268 	if (item == NULL) {
2269 		/* XXX stats */
2270 		return;
2271 	}
2272 
2273 	item->ur_msg.id = id;
2274 	item->ur_msg.creatorid = creatorid;
2275 
2276 	for (;;) {
2277 		mtx_enter(&sc->sc_upd_req_mtx);
2278 
2279 		nlen = sizeof(struct pfsync_upd_req);
2280 		if (TAILQ_EMPTY(&sc->sc_upd_req_list))
2281 			nlen += sizeof(struct pfsync_subheader);
2282 
2283 		sclen = atomic_add_long_nv(&sc->sc_len, nlen);
2284 		retry = (sclen > sc->sc_if.if_mtu);
2285 		if (retry)
2286 			atomic_sub_long(&sc->sc_len, nlen);
2287 		else
2288 			TAILQ_INSERT_TAIL(&sc->sc_upd_req_list, item, ur_entry);
2289 
2290 		mtx_leave(&sc->sc_upd_req_mtx);
2291 
2292 		if (!retry)
2293 			break;
2294 
2295 		pfsync_sendout();
2296 	}
2297 
2298 	schednetisr(NETISR_PFSYNC);
2299 }
2300 
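/*
 * Queue a full state update, typically in answer to an update request
 * or while servicing a bulk update.
 */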
2301 void
2302 pfsync_update_state_req(struct pf_state *st)
2303 {
2304 	struct pfsync_softc *sc = pfsyncif;
2305 
2306 	if (sc == NULL)
2307 		panic("pfsync_update_state_req: nonexistent instance");
2308 
2309 	if (ISSET(st->state_flags, PFSTATE_NOSYNC)) {
2310 		if (st->sync_state != PFSYNC_S_NONE)
2311 			pfsync_q_del(st);
2312 		return;
2313 	}
2314 
2315 	switch (st->sync_state) {
2316 	case PFSYNC_S_UPD_C:
2317 	case PFSYNC_S_IACK:
2318 		pfsync_q_del(st);
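		/* FALLTHROUGH */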
2319 	case PFSYNC_S_NONE:
2320 		pfsync_q_ins(st, PFSYNC_S_UPD);
2321 		schednetisr(NETISR_PFSYNC);
2322 		return;
2323 
2324 	case PFSYNC_S_INS:
2325 	case PFSYNC_S_UPD:
2326 	case PFSYNC_S_DEL:
2327 		/* we're already handling it */
2328 		return;
2329 
2330 	default:
2331 		panic("pfsync_update_state_req: unexpected sync state %d",
2332 		    st->sync_state);
2333 	}
2334 }
2335 
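/*
 * Announce the removal of a state to the peer by moving it to the
 * delete queue; a state that was never announced is simply dropped
 * from its queue.
 */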
2336 void
2337 pfsync_delete_state(struct pf_state *st)
2338 {
2339 	struct pfsync_softc *sc = pfsyncif;
2340 
2341 	NET_ASSERT_LOCKED();
2342 
2343 	if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING))
2344 		return;
2345 
2346 	if (ISSET(st->state_flags, PFSTATE_ACK))
2347 		pfsync_deferred(st, 1);
2348 	if (ISSET(st->state_flags, PFSTATE_NOSYNC)) {
2349 		if (st->sync_state != PFSYNC_S_NONE)
2350 			pfsync_q_del(st);
2351 		return;
2352 	}
2353 
2354 	if (sc->sc_len == PFSYNC_MINPKT)
2355 		timeout_add_sec(&sc->sc_tmo, 1);
2356 
2357 	switch (st->sync_state) {
2358 	case PFSYNC_S_INS:
2359 		/* we never got to tell the world so just forget about it */
2360 		pfsync_q_del(st);
2361 		return;
2362 
2363 	case PFSYNC_S_UPD_C:
2364 	case PFSYNC_S_UPD:
2365 	case PFSYNC_S_IACK:
2366 		pfsync_q_del(st);
2367 		/*
2368 		 * FALLTHROUGH to putting it on the del list.
2369 		 * Note on reference count bookkeeping:
2370 		 *	pfsync_q_del() drops the reference held for
2371 		 *	queue ownership, but st survives because our
2372 		 *	caller still holds a reference.
2373 		 */
2374 
2375 	case PFSYNC_S_NONE:
2376 		/*
2377 		 * Either we fell through from above, or no pfsync queue
2378 		 * holds a reference to st at this point.
2379 		 *
2380 		 * pfsync_q_ins() puts st on the delete queue and grabs
2381 		 * a reference for the delete queue's ownership.
2382 		 */
2383 		pfsync_q_ins(st, PFSYNC_S_DEL);
2384 		return;
2385 
2386 	default:
2387 		panic("pfsync_delete_state: unexpected sync state %d",
2388 		    st->sync_state);
2389 	}
2390 }
2391 
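/*
 * Send a "clear states" message for the given creator id and
 * interface name.
 */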
2392 void
2393 pfsync_clear_states(u_int32_t creatorid, const char *ifname)
2394 {
2395 	struct pfsync_softc *sc = pfsyncif;
2396 	struct {
2397 		struct pfsync_subheader subh;
2398 		struct pfsync_clr clr;
2399 	} __packed r;
2400 
2401 	NET_ASSERT_LOCKED();
2402 
2403 	if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING))
2404 		return;
2405 
2406 	bzero(&r, sizeof(r));
2407 
2408 	r.subh.action = PFSYNC_ACT_CLR;
2409 	r.subh.len = sizeof(struct pfsync_clr) >> 2;
2410 	r.subh.count = htons(1);
2411 
2412 	strlcpy(r.clr.ifname, ifname, sizeof(r.clr.ifname));
2413 	r.clr.creatorid = creatorid;
2414 
2415 	pfsync_send_plus(&r, sizeof(r));
2416 }
2417 
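/*
 * Put a state on queue q, taking a reference for queue ownership and
 * flushing the pending packet first if the addition would exceed the
 * interface MTU.
 */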
2418 void
2419 pfsync_q_ins(struct pf_state *st, int q)
2420 {
2421 	struct pfsync_softc *sc = pfsyncif;
2422 	size_t nlen, sclen;
2423 
2424 	if (sc->sc_len < PFSYNC_MINPKT)
2425 		panic("pfsync pkt len is too low %zd", sc->sc_len);
2426 	do {
2427 		mtx_enter(&sc->sc_st_mtx);
2428 
2429 		/*
2430 		 * Either two threads are trying to update the same
2431 		 * state, or the state is currently being processed
2432 		 * (it is on the snapshot queue).
2433 		 */
2434 		if (st->sync_state != PFSYNC_S_NONE) {
2435 			mtx_leave(&sc->sc_st_mtx);
2436 			break;
2437 		}
2438 
2439 		nlen = pfsync_qs[q].len;
2440 
2441 		if (TAILQ_EMPTY(&sc->sc_qs[q]))
2442 			nlen += sizeof(struct pfsync_subheader);
2443 
2444 		sclen = atomic_add_long_nv(&sc->sc_len, nlen);
2445 		if (sclen > sc->sc_if.if_mtu) {
2446 			atomic_sub_long(&sc->sc_len, nlen);
2447 			mtx_leave(&sc->sc_st_mtx);
2448 			pfsync_sendout();
2449 			continue;
2450 		}
2451 
2452 		pf_state_ref(st);
2453 
2454 		TAILQ_INSERT_TAIL(&sc->sc_qs[q], st, sync_list);
2455 		st->sync_state = q;
2456 		mtx_leave(&sc->sc_st_mtx);
2457 	} while (0);
2458 }
2459 
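/*
 * Remove a state from its queue and drop the queue's reference,
 * unless the state has already been snapshotted for transmission.
 */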
2460 void
2461 pfsync_q_del(struct pf_state *st)
2462 {
2463 	struct pfsync_softc *sc = pfsyncif;
2464 	int q;
2465 
2466 	KASSERT(st->sync_state != PFSYNC_S_NONE);
2467 
2468 	mtx_enter(&sc->sc_st_mtx);
2469 	q = st->sync_state;
2470 	/*
2471 	 * Re-check under the mutex: if the state has already been
2472 	 * snapshotted, just bail out; we came too late, the state is
2473 	 * being processed/dispatched to the peer right now.
2474 	 */
2475 	if ((q == PFSYNC_S_NONE) || (st->snapped)) {
2476 		mtx_leave(&sc->sc_st_mtx);
2477 		return;
2478 	}
2479 	atomic_sub_long(&sc->sc_len, pfsync_qs[q].len);
2480 	TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
2481 	if (TAILQ_EMPTY(&sc->sc_qs[q]))
2482 		atomic_sub_long(&sc->sc_len, sizeof (struct pfsync_subheader));
2483 	st->sync_state = PFSYNC_S_NONE;
2484 	mtx_leave(&sc->sc_st_mtx);
2485 
2486 	pf_state_unref(st);
2487 }
2488 
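/*
 * Queue a replay counter update for an IPsec tdb, or just count the
 * update if the tdb is already queued.
 */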
2489 void
2490 pfsync_update_tdb(struct tdb *t, int output)
2491 {
2492 	struct pfsync_softc *sc = pfsyncif;
2493 	size_t nlen, sclen;
2494 
2495 	if (sc == NULL)
2496 		return;
2497 
2498 	if (!ISSET(t->tdb_flags, TDBF_PFSYNC)) {
2499 		do {
2500 			mtx_enter(&sc->sc_tdb_mtx);
2501 			nlen = sizeof(struct pfsync_tdb);
2502 
2503 			mtx_enter(&t->tdb_mtx);
2504 			if (ISSET(t->tdb_flags, TDBF_PFSYNC)) {
2505 				/* we lost the race, nothing for us to do */
2506 				mtx_leave(&t->tdb_mtx);
2507 				mtx_leave(&sc->sc_tdb_mtx);
2508 				break;
2509 			}
2510 
2511 			if (TAILQ_EMPTY(&sc->sc_tdb_q))
2512 				nlen += sizeof(struct pfsync_subheader);
2513 
2514 			sclen = atomic_add_long_nv(&sc->sc_len, nlen);
2515 			if (sclen > sc->sc_if.if_mtu) {
2516 				atomic_sub_long(&sc->sc_len, nlen);
2517 				mtx_leave(&t->tdb_mtx);
2518 				mtx_leave(&sc->sc_tdb_mtx);
2519 				pfsync_sendout();
2520 				continue;
2521 			}
2522 
2523 			TAILQ_INSERT_TAIL(&sc->sc_tdb_q, t, tdb_sync_entry);
2524 			tdb_ref(t);
2525 			SET(t->tdb_flags, TDBF_PFSYNC);
2526 			mtx_leave(&t->tdb_mtx);
2527 
2528 			mtx_leave(&sc->sc_tdb_mtx);
2529 			t->tdb_updates = 0;
2530 		} while (0);
2531 	} else {
2532 		if (++t->tdb_updates >= sc->sc_maxupdates)
2533 			schednetisr(NETISR_PFSYNC);
2534 	}
2535 
2536 	mtx_enter(&t->tdb_mtx);
2537 	if (output)
2538 		SET(t->tdb_flags, TDBF_PFSYNC_RPL);
2539 	else
2540 		CLR(t->tdb_flags, TDBF_PFSYNC_RPL);
2541 	mtx_leave(&t->tdb_mtx);
2542 }
2543 
2544 void
2545 pfsync_delete_tdb(struct tdb *t)
2546 {
2547 	struct pfsync_softc *sc = pfsyncif;
2548 	size_t nlen;
2549 
2550 	if (sc == NULL || !ISSET(t->tdb_flags, TDBF_PFSYNC))
2551 		return;
2552 
2553 	mtx_enter(&sc->sc_tdb_mtx);
2554 
2555 	/*
2556 	 * if the tdb entry is currently being processed (it is in a
2557 	 * snapshot), it cannot be deleted here; we just came too late.
2558 	 */
2559 	if (ISSET(t->tdb_flags, TDBF_PFSYNC_SNAPPED)) {
2560 		mtx_leave(&sc->sc_tdb_mtx);
2561 		return;
2562 	}
2563 
2564 	TAILQ_REMOVE(&sc->sc_tdb_q, t, tdb_sync_entry);
2565 
2566 	mtx_enter(&t->tdb_mtx);
2567 	CLR(t->tdb_flags, TDBF_PFSYNC);
2568 	mtx_leave(&t->tdb_mtx);
2569 
2570 	nlen = sizeof(struct pfsync_tdb);
2571 	if (TAILQ_EMPTY(&sc->sc_tdb_q))
2572 		nlen += sizeof(struct pfsync_subheader);
2573 	atomic_sub_long(&sc->sc_len, nlen);
2574 
2575 	mtx_leave(&sc->sc_tdb_mtx);
2576 
2577 	tdb_unref(t);
2578 }
2579 
2580 void
2581 pfsync_out_tdb(struct tdb *t, void *buf)
2582 {
2583 	struct pfsync_tdb *ut = buf;
2584 
2585 	bzero(ut, sizeof(*ut));
2586 	ut->spi = t->tdb_spi;
2587 	bcopy(&t->tdb_dst, &ut->dst, sizeof(ut->dst));
2588 	/*
2589 	 * When a failover happens, the master's rpl is probably above
2590 	 * what we see here (we may be up to a second late), so
2591 	 * increase it a bit for outbound tdbs to manage most such
2592 	 * situations.
2593 	 *
2594 	 * For now, just add an offset that is likely to be larger
2595 	 * than the number of packets we can see in one second. The RFC
2596 	 * just says the next packet must have a higher seq value.
2597 	 *
2598 	 * XXX What is a good algorithm for this? We could use
2599 	 * a rate-determined increase, but to know it, we would have
2600 	 * to extend struct tdb.
2601 	 * XXX ut->rpl can wrap over MAXINT, but if so the real tdb
2602 	 * will soon be replaced anyway. For now, just don't handle
2603 	 * this edge case.
2604 	 */
2605 #define RPL_INCR 16384
2606 	ut->rpl = htobe64(t->tdb_rpl + (ISSET(t->tdb_flags, TDBF_PFSYNC_RPL) ?
2607 	    RPL_INCR : 0));
2608 	ut->cur_bytes = htobe64(t->tdb_cur_bytes);
2609 	ut->sproto = t->tdb_sproto;
2610 	ut->rdomain = htons(t->tdb_rdomain);
2611 }
2612 
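/*
 * Begin answering a peer's bulk update request: record where the
 * state list currently begins and ends, then let pfsync_bulk_update()
 * walk it.
 */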
2613 void
2614 pfsync_bulk_start(void)
2615 {
2616 	struct pfsync_softc *sc = pfsyncif;
2617 
2618 	NET_ASSERT_LOCKED();
2619 
2620 	/*
2621 	 * pf gc via pfsync_state_in_use reads sc_bulk_next and
2622 	 * sc_bulk_last while exclusively holding the pf_state_list
2623 	 * rwlock. make sure it can't race with us setting these
2624 	 * pointers. they basically act as hazards, and borrow the
2625 	 * list's state reference count.
2626 	 */
2627 	rw_enter_read(&pf_state_list.pfs_rwl);
2628 
2629 	/* get a consistent view of the list pointers */
2630 	mtx_enter(&pf_state_list.pfs_mtx);
2631 	if (sc->sc_bulk_next == NULL)
2632 		sc->sc_bulk_next = TAILQ_FIRST(&pf_state_list.pfs_list);
2633 
2634 	sc->sc_bulk_last = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
2635 	mtx_leave(&pf_state_list.pfs_mtx);
2636 
2637 	rw_exit_read(&pf_state_list.pfs_rwl);
2638 
2639 	DPFPRINTF(LOG_INFO, "received bulk update request");
2640 
2641 	if (sc->sc_bulk_last == NULL)
2642 		pfsync_bulk_status(PFSYNC_BUS_END);
2643 	else {
2644 		sc->sc_ureq_received = getuptime();
2645 
2646 		pfsync_bulk_status(PFSYNC_BUS_START);
2647 		timeout_add(&sc->sc_bulk_tmo, 0);
2648 	}
2649 }
2650 
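/*
 * Walk a chunk of the state list, queueing updates for states that
 * have not been announced since the bulk request arrived, and
 * reschedule until sc_bulk_last is reached.
 */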
2651 void
2652 pfsync_bulk_update(void *arg)
2653 {
2654 	struct pfsync_softc *sc;
2655 	struct pf_state *st;
2656 	int i = 0;
2657 
2658 	NET_LOCK();
2659 	sc = pfsyncif;
2660 	if (sc == NULL)
2661 		goto out;
2662 
2663 	rw_enter_read(&pf_state_list.pfs_rwl);
2664 	st = sc->sc_bulk_next;
2665 	sc->sc_bulk_next = NULL;
2666 
2667 	for (;;) {
2668 		if (st->sync_state == PFSYNC_S_NONE &&
2669 		    st->timeout < PFTM_MAX &&
2670 		    st->pfsync_time <= sc->sc_ureq_received) {
2671 			pfsync_update_state_req(st);
2672 			i++;
2673 		}
2674 
2675 		st = TAILQ_NEXT(st, entry_list);
2676 		if ((st == NULL) || (st == sc->sc_bulk_last)) {
2677 			/* we're done */
2678 			sc->sc_bulk_last = NULL;
2679 			pfsync_bulk_status(PFSYNC_BUS_END);
2680 			break;
2681 		}
2682 
2683 		if (i > 1 && (sc->sc_if.if_mtu - sc->sc_len) <
2684 		    sizeof(struct pfsync_state)) {
2685 			/* we've filled a packet */
2686 			sc->sc_bulk_next = st;
2687 			timeout_add(&sc->sc_bulk_tmo, 1);
2688 			break;
2689 		}
2690 	}
2691 
2692 	rw_exit_read(&pf_state_list.pfs_rwl);
2693  out:
2694 	NET_UNLOCK();
2695 }
2696 
2697 void
2698 pfsync_bulk_status(u_int8_t status)
2699 {
2700 	struct {
2701 		struct pfsync_subheader subh;
2702 		struct pfsync_bus bus;
2703 	} __packed r;
2704 
2705 	struct pfsync_softc *sc = pfsyncif;
2706 
2707 	bzero(&r, sizeof(r));
2708 
2709 	r.subh.action = PFSYNC_ACT_BUS;
2710 	r.subh.len = sizeof(struct pfsync_bus) >> 2;
2711 	r.subh.count = htons(1);
2712 
2713 	r.bus.creatorid = pf_status.hostid;
2714 	r.bus.endtime = htonl(getuptime() - sc->sc_ureq_received);
2715 	r.bus.status = status;
2716 
2717 	pfsync_send_plus(&r, sizeof(r));
2718 }
2719 
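/*
 * The bulk update we requested did not complete in time: retry up to
 * PFSYNC_MAX_BULKTRIES, then give up and carry on as if it succeeded.
 */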
2720 void
2721 pfsync_bulk_fail(void *arg)
2722 {
2723 	struct pfsync_softc *sc;
2724 
2725 	NET_LOCK();
2726 	sc = pfsyncif;
2727 	if (sc == NULL)
2728 		goto out;
2729 	if (sc->sc_bulk_tries++ < PFSYNC_MAX_BULKTRIES) {
2730 		/* Try again */
2731 		timeout_add_sec(&sc->sc_bulkfail_tmo, 5);
2732 		pfsync_request_update(0, 0);
2733 	} else {
2734 		/* Pretend the transfer was ok */
2735 		sc->sc_ureq_sent = 0;
2736 		sc->sc_bulk_tries = 0;
2737 #if NCARP > 0
2738 		if (!pfsync_sync_ok)
2739 			carp_group_demote_adj(&sc->sc_if, -1,
2740 			    sc->sc_link_demoted ?
2741 			    "pfsync link state up" :
2742 			    "pfsync bulk fail");
2743 		if (sc->sc_initial_bulk) {
2744 			carp_group_demote_adj(&sc->sc_if, -32,
2745 			    "pfsync init");
2746 			sc->sc_initial_bulk = 0;
2747 		}
2748 #endif
2749 		pfsync_sync_ok = 1;
2750 		sc->sc_link_demoted = 0;
2751 		DPFPRINTF(LOG_ERR, "failed to receive bulk update");
2752 	}
2753  out:
2754 	NET_UNLOCK();
2755 }
2756 
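/*
 * Attach an extra chunk (e.g. a clear or bulk status message) to the
 * pending packet and flush it out immediately.
 */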
2757 void
2758 pfsync_send_plus(void *plus, size_t pluslen)
2759 {
2760 	struct pfsync_softc *sc = pfsyncif;
2761 
2762 	if (sc->sc_len + pluslen > sc->sc_if.if_mtu)
2763 		pfsync_sendout();
2764 
2765 	sc->sc_plus = plus;
2766 	sc->sc_pluslen = pluslen;
2767 	atomic_add_long(&sc->sc_len, pluslen);
2768 
2769 	pfsync_sendout();
2770 }
2771 
2772 int
2773 pfsync_up(void)
2774 {
2775 	struct pfsync_softc *sc = pfsyncif;
2776 
2777 	if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING))
2778 		return (0);
2779 
2780 	return (1);
2781 }
2782 
2783 int
2784 pfsync_state_in_use(struct pf_state *st)
2785 {
2786 	struct pfsync_softc *sc = pfsyncif;
2787 
2788 	if (sc == NULL)
2789 		return (0);
2790 
2791 	rw_assert_wrlock(&pf_state_list.pfs_rwl);
2792 
2793 	if (st->sync_state != PFSYNC_S_NONE ||
2794 	    st == sc->sc_bulk_next ||
2795 	    st == sc->sc_bulk_last)
2796 		return (1);
2797 
2798 	return (0);
2799 }
2800 
2801 void
2802 pfsync_timeout(void *arg)
2803 {
2804 	NET_LOCK();
2805 	pfsync_sendout();
2806 	NET_UNLOCK();
2807 }
2808 
2809 /* this is a softnet/netisr handler */
2810 void
2811 pfsyncintr(void)
2812 {
2813 	pfsync_sendout();
2814 }
2815 
2816 int
2817 pfsync_sysctl_pfsyncstat(void *oldp, size_t *oldlenp, void *newp)
2818 {
2819 	struct pfsyncstats pfsyncstat;
2820 
2821 	CTASSERT(sizeof(pfsyncstat) == (pfsyncs_ncounters * sizeof(uint64_t)));
2822 	memset(&pfsyncstat, 0, sizeof pfsyncstat);
2823 	counters_read(pfsynccounters, (uint64_t *)&pfsyncstat,
2824 	    pfsyncs_ncounters);
2825 	return (sysctl_rdstruct(oldp, oldlenp, newp,
2826 	    &pfsyncstat, sizeof(pfsyncstat)));
2827 }
2828 
2829 int
2830 pfsync_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
2831     size_t newlen)
2832 {
2833 	/* All sysctl names at this level are terminal. */
2834 	if (namelen != 1)
2835 		return (ENOTDIR);
2836 
2837 	switch (name[0]) {
2838 	case PFSYNCCTL_STATS:
2839 		return (pfsync_sysctl_pfsyncstat(oldp, oldlenp, newp));
2840 	default:
2841 		return (ENOPROTOOPT);
2842 	}
2843 }
2844