1 /*	$OpenBSD: if_pfsync.c,v 1.284 2021/02/09 14:06:19 patrick Exp $	*/
2 
3 /*
4  * Copyright (c) 2002 Michael Shalayeff
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
20  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22  * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
24  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
25  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26  * THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*
30  * Copyright (c) 2009 David Gwynne <dlg@openbsd.org>
31  *
32  * Permission to use, copy, modify, and distribute this software for any
33  * purpose with or without fee is hereby granted, provided that the above
34  * copyright notice and this permission notice appear in all copies.
35  *
36  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
37  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
38  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
39  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
40  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
41  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
42  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
43  */
44 
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/time.h>
48 #include <sys/malloc.h>
49 #include <sys/mbuf.h>
50 #include <sys/socket.h>
51 #include <sys/ioctl.h>
52 #include <sys/timeout.h>
53 #include <sys/kernel.h>
54 #include <sys/sysctl.h>
55 #include <sys/pool.h>
56 #include <sys/syslog.h>
57 
58 #include <net/if.h>
59 #include <net/if_types.h>
60 #include <net/bpf.h>
61 #include <net/netisr.h>
62 
63 #include <netinet/in.h>
64 #include <netinet/if_ether.h>
65 #include <netinet/ip.h>
66 #include <netinet/in_var.h>
67 #include <netinet/ip_var.h>
68 #include <netinet/ip_ipsp.h>
69 #include <netinet/ip_icmp.h>
70 #include <netinet/icmp6.h>
71 #include <netinet/tcp.h>
72 #include <netinet/tcp_seq.h>
73 #include <netinet/tcp_fsm.h>
74 #include <netinet/udp.h>
75 
76 #ifdef INET6
77 #include <netinet6/in6_var.h>
78 #include <netinet/ip6.h>
79 #include <netinet6/ip6_var.h>
80 #include <netinet6/nd6.h>
81 #endif /* INET6 */
82 
83 #include "carp.h"
84 #if NCARP > 0
85 #include <netinet/ip_carp.h>
86 #endif
87 
88 #define PF_DEBUGNAME	"pfsync: "
89 #include <net/pfvar.h>
90 #include <net/pfvar_priv.h>
91 #include <net/if_pfsync.h>
92 
93 #include "bpfilter.h"
94 #include "pfsync.h"
95 
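/*
 * A pfsync packet is never smaller than an IPv4 header followed by the
 * pfsync header; sc_len is reset to this minimum whenever the queued
 * data has been snapshotted or dropped.
 */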
96 #define PFSYNC_MINPKT ( \
97 	sizeof(struct ip) + \
98 	sizeof(struct pfsync_header))
99 
100 int	pfsync_upd_tcp(struct pf_state *, struct pfsync_state_peer *,
101 	    struct pfsync_state_peer *);
102 
103 int	pfsync_in_clr(caddr_t, int, int, int);
104 int	pfsync_in_iack(caddr_t, int, int, int);
105 int	pfsync_in_upd_c(caddr_t, int, int, int);
106 int	pfsync_in_ureq(caddr_t, int, int, int);
107 int	pfsync_in_del(caddr_t, int, int, int);
108 int	pfsync_in_del_c(caddr_t, int, int, int);
109 int	pfsync_in_bus(caddr_t, int, int, int);
110 int	pfsync_in_tdb(caddr_t, int, int, int);
111 int	pfsync_in_ins(caddr_t, int, int, int);
112 int	pfsync_in_upd(caddr_t, int, int, int);
113 int	pfsync_in_eof(caddr_t, int, int, int);
114 
115 int	pfsync_in_error(caddr_t, int, int, int);
116 
117 void	pfsync_update_state_locked(struct pf_state *);
118 
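/*
 * Input handlers, indexed by PFSYNC_ACT_* code.  The len field is the
 * minimum per-message size pfsync_input() accepts for that action;
 * actions without a handler point at pfsync_in_error().
 */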
119 struct {
120 	int	(*in)(caddr_t, int, int, int);
121 	size_t	len;
122 } pfsync_acts[] = {
123 	/* PFSYNC_ACT_CLR */
124 	{ pfsync_in_clr,	sizeof(struct pfsync_clr) },
125 	/* PFSYNC_ACT_OINS */
126 	{ pfsync_in_error,	0 },
127 	/* PFSYNC_ACT_INS_ACK */
128 	{ pfsync_in_iack,	sizeof(struct pfsync_ins_ack) },
129 	/* PFSYNC_ACT_OUPD */
130 	{ pfsync_in_error,	0 },
131 	/* PFSYNC_ACT_UPD_C */
132 	{ pfsync_in_upd_c,	sizeof(struct pfsync_upd_c) },
133 	/* PFSYNC_ACT_UPD_REQ */
134 	{ pfsync_in_ureq,	sizeof(struct pfsync_upd_req) },
135 	/* PFSYNC_ACT_DEL */
136 	{ pfsync_in_del,	sizeof(struct pfsync_state) },
137 	/* PFSYNC_ACT_DEL_C */
138 	{ pfsync_in_del_c,	sizeof(struct pfsync_del_c) },
139 	/* PFSYNC_ACT_INS_F */
140 	{ pfsync_in_error,	0 },
141 	/* PFSYNC_ACT_DEL_F */
142 	{ pfsync_in_error,	0 },
143 	/* PFSYNC_ACT_BUS */
144 	{ pfsync_in_bus,	sizeof(struct pfsync_bus) },
145 	/* PFSYNC_ACT_OTDB */
146 	{ pfsync_in_error,	0 },
147 	/* PFSYNC_ACT_EOF */
148 	{ pfsync_in_error,	0 },
149 	/* PFSYNC_ACT_INS */
150 	{ pfsync_in_ins,	sizeof(struct pfsync_state) },
151 	/* PFSYNC_ACT_UPD */
152 	{ pfsync_in_upd,	sizeof(struct pfsync_state) },
153 	/* PFSYNC_ACT_TDB */
154 	{ pfsync_in_tdb,	sizeof(struct pfsync_tdb) },
155 };
156 
157 struct pfsync_q {
158 	void		(*write)(struct pf_state *, void *);
159 	size_t		len;
160 	u_int8_t	action;
161 };
162 
163 /* we have one of these for every PFSYNC_S_ */
164 void	pfsync_out_state(struct pf_state *, void *);
165 void	pfsync_out_iack(struct pf_state *, void *);
166 void	pfsync_out_upd_c(struct pf_state *, void *);
167 void	pfsync_out_del(struct pf_state *, void *);
168 
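/*
 * Entries are ordered by PFSYNC_S_* queue number; pfsync_sendout() walks
 * the queues and uses this table to emit a subheader and serialize each
 * queued state.
 */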
169 struct pfsync_q pfsync_qs[] = {
170 	{ pfsync_out_iack,  sizeof(struct pfsync_ins_ack), PFSYNC_ACT_INS_ACK },
171 	{ pfsync_out_upd_c, sizeof(struct pfsync_upd_c),   PFSYNC_ACT_UPD_C },
172 	{ pfsync_out_del,   sizeof(struct pfsync_del_c),   PFSYNC_ACT_DEL_C },
173 	{ pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_INS },
174 	{ pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_UPD }
175 };
176 
177 void	pfsync_q_ins(struct pf_state *, int);
178 void	pfsync_q_del(struct pf_state *);
179 
180 struct pfsync_upd_req_item {
181 	TAILQ_ENTRY(pfsync_upd_req_item)	ur_entry;
182 	struct pfsync_upd_req			ur_msg;
183 };
184 TAILQ_HEAD(pfsync_upd_reqs, pfsync_upd_req_item);
185 
186 struct pfsync_deferral {
187 	TAILQ_ENTRY(pfsync_deferral)		 pd_entry;
188 	struct pf_state				*pd_st;
189 	struct mbuf				*pd_m;
190 	struct timeout				 pd_tmo;
191 };
192 TAILQ_HEAD(pfsync_deferrals, pfsync_deferral);
193 
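/*
 * The softc pool backs both update request items and deferrals, so its
 * item size must cover the larger of the two structures.
 */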
194 #define PFSYNC_PLSIZE	MAX(sizeof(struct pfsync_upd_req_item), \
195 			    sizeof(struct pfsync_deferral))
196 
197 void	pfsync_out_tdb(struct tdb *, void *);
198 
199 struct pfsync_softc {
200 	struct ifnet		 sc_if;
201 	unsigned int		 sc_sync_ifidx;
202 
203 	struct pool		 sc_pool;
204 
205 	struct ip_moptions	 sc_imo;
206 
207 	struct in_addr		 sc_sync_peer;
208 	u_int8_t		 sc_maxupdates;
209 
210 	struct ip		 sc_template;
211 
212 	struct pf_state_queue	 sc_qs[PFSYNC_S_COUNT];
213 	struct mutex		 sc_mtx[PFSYNC_S_COUNT];
214 	size_t			 sc_len;
215 
216 	struct pfsync_upd_reqs	 sc_upd_req_list;
217 	struct mutex		 sc_upd_req_mtx;
218 
219 	int			 sc_initial_bulk;
220 	int			 sc_link_demoted;
221 
222 	int			 sc_defer;
223 	struct pfsync_deferrals	 sc_deferrals;
224 	u_int			 sc_deferred;
225 	struct mutex		 sc_deferrals_mtx;
226 
227 	void			*sc_plus;
228 	size_t			 sc_pluslen;
229 
230 	u_int32_t		 sc_ureq_sent;
231 	int			 sc_bulk_tries;
232 	struct timeout		 sc_bulkfail_tmo;
233 
234 	u_int32_t		 sc_ureq_received;
235 	struct pf_state		*sc_bulk_next;
236 	struct pf_state		*sc_bulk_last;
237 	struct timeout		 sc_bulk_tmo;
238 
239 	TAILQ_HEAD(, tdb)	 sc_tdb_q;
240 	struct mutex		 sc_tdb_mtx;
241 
242 	struct task		 sc_ltask;
243 	struct task		 sc_dtask;
244 
245 	struct timeout		 sc_tmo;
246 };
247 
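/*
 * A snapshot takes ownership of everything queued on the softc (state
 * queues, update requests, queued TDBs and the custom "plus" region) so
 * it can be serialized or dropped without the softc mutexes held.
 */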
248 struct pfsync_snapshot {
249 	struct pfsync_softc	*sn_sc;
250 	struct pf_state_queue	 sn_qs[PFSYNC_S_COUNT];
251 	struct pfsync_upd_reqs	 sn_upd_req_list;
252 	TAILQ_HEAD(, tdb)	 sn_tdb_q;
253 	size_t			 sn_len;
254 	void			*sn_plus;
255 	size_t			 sn_pluslen;
256 };
257 
258 struct pfsync_softc	*pfsyncif = NULL;
259 struct cpumem		*pfsynccounters;
260 
261 void	pfsyncattach(int);
262 int	pfsync_clone_create(struct if_clone *, int);
263 int	pfsync_clone_destroy(struct ifnet *);
264 int	pfsync_alloc_scrub_memory(struct pfsync_state_peer *,
265 	    struct pf_state_peer *);
266 void	pfsync_update_net_tdb(struct pfsync_tdb *);
267 int	pfsyncoutput(struct ifnet *, struct mbuf *, struct sockaddr *,
268 	    struct rtentry *);
269 int	pfsyncioctl(struct ifnet *, u_long, caddr_t);
270 void	pfsyncstart(struct ifqueue *);
271 void	pfsync_syncdev_state(void *);
272 void	pfsync_ifdetach(void *);
273 
274 void	pfsync_deferred(struct pf_state *, int);
275 void	pfsync_undefer(struct pfsync_deferral *, int);
276 void	pfsync_defer_tmo(void *);
277 
278 void	pfsync_cancel_full_update(struct pfsync_softc *);
279 void	pfsync_request_full_update(struct pfsync_softc *);
280 void	pfsync_request_update(u_int32_t, u_int64_t);
281 void	pfsync_update_state_req(struct pf_state *);
282 
283 void	pfsync_drop(struct pfsync_softc *);
284 void	pfsync_sendout(void);
285 void	pfsync_send_plus(void *, size_t);
286 void	pfsync_timeout(void *);
287 void	pfsync_tdb_timeout(void *);
288 
289 void	pfsync_bulk_start(void);
290 void	pfsync_bulk_status(u_int8_t);
291 void	pfsync_bulk_update(void *);
292 void	pfsync_bulk_fail(void *);
293 
294 void	pfsync_grab_snapshot(struct pfsync_snapshot *, struct pfsync_softc *);
295 void	pfsync_drop_snapshot(struct pfsync_snapshot *);
296 
297 void	pfsync_send_dispatch(void *);
298 void	pfsync_send_pkt(struct mbuf *);
299 
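/*
 * Outgoing packets are staged on pfsync_mq and transmitted from
 * pfsync_task on a network taskq, where the net lock can be taken for
 * ip_output().
 */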
300 static struct mbuf_queue	pfsync_mq;
301 static struct task	pfsync_task =
302     TASK_INITIALIZER(pfsync_send_dispatch, &pfsync_mq);
303 
304 #define PFSYNC_MAX_BULKTRIES	12
305 int	pfsync_sync_ok;
306 
307 struct if_clone	pfsync_cloner =
308     IF_CLONE_INITIALIZER("pfsync", pfsync_clone_create, pfsync_clone_destroy);
309 
310 void
311 pfsyncattach(int npfsync)
312 {
313 	if_clone_attach(&pfsync_cloner);
314 	pfsynccounters = counters_alloc(pfsyncs_ncounters);
315 	mq_init(&pfsync_mq, 4096, IPL_SOFTNET);
316 }
317 
318 int
319 pfsync_clone_create(struct if_clone *ifc, int unit)
320 {
321 	struct pfsync_softc *sc;
322 	struct ifnet *ifp;
323 	int q;
324 	static const char *mtx_names[] = {
325 		"iack_mtx",
326 		"upd_c_mtx",
327 		"del_mtx",
328 		"ins_mtx",
329 		"upd_mtx",
330 		"" };
331 
332 	if (unit != 0)
333 		return (EINVAL);
334 
335 	pfsync_sync_ok = 1;
336 
337 	sc = malloc(sizeof(*pfsyncif), M_DEVBUF, M_WAITOK|M_ZERO);
338 	for (q = 0; q < PFSYNC_S_COUNT; q++) {
339 		TAILQ_INIT(&sc->sc_qs[q]);
340 		mtx_init_flags(&sc->sc_mtx[q], IPL_SOFTNET, mtx_names[q], 0);
341 	}
342 
343 	pool_init(&sc->sc_pool, PFSYNC_PLSIZE, 0, IPL_SOFTNET, 0, "pfsync",
344 	    NULL);
345 	TAILQ_INIT(&sc->sc_upd_req_list);
346 	mtx_init(&sc->sc_upd_req_mtx, IPL_SOFTNET);
347 	TAILQ_INIT(&sc->sc_deferrals);
348 	mtx_init(&sc->sc_deferrals_mtx, IPL_SOFTNET);
349 	task_set(&sc->sc_ltask, pfsync_syncdev_state, sc);
350 	task_set(&sc->sc_dtask, pfsync_ifdetach, sc);
351 	sc->sc_deferred = 0;
352 
353 	TAILQ_INIT(&sc->sc_tdb_q);
354 	mtx_init(&sc->sc_tdb_mtx, IPL_SOFTNET);
355 
356 	sc->sc_len = PFSYNC_MINPKT;
357 	sc->sc_maxupdates = 128;
358 
359 	sc->sc_imo.imo_membership = mallocarray(IP_MIN_MEMBERSHIPS,
360 	    sizeof(struct in_multi *), M_IPMOPTS, M_WAITOK|M_ZERO);
361 	sc->sc_imo.imo_max_memberships = IP_MIN_MEMBERSHIPS;
362 
363 	ifp = &sc->sc_if;
364 	snprintf(ifp->if_xname, sizeof ifp->if_xname, "pfsync%d", unit);
365 	ifp->if_softc = sc;
366 	ifp->if_ioctl = pfsyncioctl;
367 	ifp->if_output = pfsyncoutput;
368 	ifp->if_qstart = pfsyncstart;
369 	ifp->if_type = IFT_PFSYNC;
370 	ifp->if_hdrlen = sizeof(struct pfsync_header);
371 	ifp->if_mtu = ETHERMTU;
372 	ifp->if_xflags = IFXF_CLONED | IFXF_MPSAFE;
373 	timeout_set_proc(&sc->sc_tmo, pfsync_timeout, NULL);
374 	timeout_set_proc(&sc->sc_bulk_tmo, pfsync_bulk_update, NULL);
375 	timeout_set_proc(&sc->sc_bulkfail_tmo, pfsync_bulk_fail, NULL);
376 
377 	if_attach(ifp);
378 	if_alloc_sadl(ifp);
379 
380 #if NCARP > 0
381 	if_addgroup(ifp, "carp");
382 #endif
383 
384 #if NBPFILTER > 0
385 	bpfattach(&sc->sc_if.if_bpf, ifp, DLT_PFSYNC, PFSYNC_HDRLEN);
386 #endif
387 
388 	pfsyncif = sc;
389 
390 	return (0);
391 }
392 
393 int
394 pfsync_clone_destroy(struct ifnet *ifp)
395 {
396 	struct pfsync_softc *sc = ifp->if_softc;
397 	struct ifnet *ifp0;
398 	struct pfsync_deferral *pd;
399 	struct pfsync_deferrals	 deferrals;
400 
401 	NET_LOCK();
402 
403 #if NCARP > 0
404 	if (!pfsync_sync_ok)
405 		carp_group_demote_adj(&sc->sc_if, -1, "pfsync destroy");
406 	if (sc->sc_link_demoted)
407 		carp_group_demote_adj(&sc->sc_if, -1, "pfsync destroy");
408 #endif
409 	if ((ifp0 = if_get(sc->sc_sync_ifidx)) != NULL) {
410 		if_linkstatehook_del(ifp0, &sc->sc_ltask);
411 		if_detachhook_del(ifp0, &sc->sc_dtask);
412 	}
413 	if_put(ifp0);
414 
415 	/* XXXSMP breaks atomicity */
416 	NET_UNLOCK();
417 	if_detach(ifp);
418 	NET_LOCK();
419 
420 	pfsync_drop(sc);
421 
422 	if (sc->sc_deferred > 0) {
423 		TAILQ_INIT(&deferrals);
424 		mtx_enter(&sc->sc_deferrals_mtx);
425 		TAILQ_CONCAT(&deferrals, &sc->sc_deferrals, pd_entry);
426 		sc->sc_deferred = 0;
427 		mtx_leave(&sc->sc_deferrals_mtx);
428 
429 		while (!TAILQ_EMPTY(&deferrals)) {
430 			pd = TAILQ_FIRST(&deferrals);
431 			TAILQ_REMOVE(&deferrals, pd, pd_entry);
432 			pfsync_undefer(pd, 0);
433 		}
434 	}
435 
436 	pfsyncif = NULL;
437 	timeout_del(&sc->sc_bulkfail_tmo);
438 	timeout_del(&sc->sc_bulk_tmo);
439 	timeout_del(&sc->sc_tmo);
440 
441 	NET_UNLOCK();
442 
443 	pool_destroy(&sc->sc_pool);
444 	free(sc->sc_imo.imo_membership, M_IPMOPTS,
445 	    sc->sc_imo.imo_max_memberships * sizeof(struct in_multi *));
446 	free(sc, M_DEVBUF, sizeof(*sc));
447 
448 	return (0);
449 }
450 
451 /*
452  * Start output on the pfsync interface.
453  */
454 void
455 pfsyncstart(struct ifqueue *ifq)
456 {
457 	ifq_purge(ifq);
458 }
459 
460 void
461 pfsync_syncdev_state(void *arg)
462 {
463 	struct pfsync_softc *sc = arg;
464 	struct ifnet *ifp;
465 
466 	if ((sc->sc_if.if_flags & IFF_UP) == 0)
467 		return;
468 	if ((ifp = if_get(sc->sc_sync_ifidx)) == NULL)
469 		return;
470 
471 	if (ifp->if_link_state == LINK_STATE_DOWN) {
472 		sc->sc_if.if_flags &= ~IFF_RUNNING;
473 		if (!sc->sc_link_demoted) {
474 #if NCARP > 0
475 			carp_group_demote_adj(&sc->sc_if, 1,
476 			    "pfsync link state down");
477 #endif
478 			sc->sc_link_demoted = 1;
479 		}
480 
481 		/* drop everything */
482 		timeout_del(&sc->sc_tmo);
483 		pfsync_drop(sc);
484 
485 		pfsync_cancel_full_update(sc);
486 	} else if (sc->sc_link_demoted) {
487 		sc->sc_if.if_flags |= IFF_RUNNING;
488 
489 		pfsync_request_full_update(sc);
490 	}
491 
492 	if_put(ifp);
493 }
494 
495 void
496 pfsync_ifdetach(void *arg)
497 {
498 	struct pfsync_softc *sc = arg;
499 	struct ifnet *ifp;
500 
501 	if ((ifp = if_get(sc->sc_sync_ifidx)) != NULL) {
502 		if_linkstatehook_del(ifp, &sc->sc_ltask);
503 		if_detachhook_del(ifp, &sc->sc_dtask);
504 	}
505 	if_put(ifp);
506 
507 	sc->sc_sync_ifidx = 0;
508 }
509 
510 int
511 pfsync_alloc_scrub_memory(struct pfsync_state_peer *s,
512     struct pf_state_peer *d)
513 {
514 	if (s->scrub.scrub_flag && d->scrub == NULL) {
515 		d->scrub = pool_get(&pf_state_scrub_pl, PR_NOWAIT | PR_ZERO);
516 		if (d->scrub == NULL)
517 			return (ENOMEM);
518 	}
519 
520 	return (0);
521 }
522 
523 void
524 pfsync_state_export(struct pfsync_state *sp, struct pf_state *st)
525 {
526 	pf_state_export(sp, st);
527 }
528 
529 int
530 pfsync_state_import(struct pfsync_state *sp, int flags)
531 {
532 	struct pf_state	*st = NULL;
533 	struct pf_state_key *skw = NULL, *sks = NULL;
534 	struct pf_rule *r = NULL;
535 	struct pfi_kif	*kif;
536 	int pool_flags;
537 	int error = ENOMEM;
538 	int n = 0;
539 
540 	if (sp->creatorid == 0) {
541 		DPFPRINTF(LOG_NOTICE, "pfsync_state_import: "
542 		    "invalid creator id: %08x", ntohl(sp->creatorid));
543 		return (EINVAL);
544 	}
545 
546 	if ((kif = pfi_kif_get(sp->ifname)) == NULL) {
547 		DPFPRINTF(LOG_NOTICE, "pfsync_state_import: "
548 		    "unknown interface: %s", sp->ifname);
549 		if (flags & PFSYNC_SI_IOCTL)
550 			return (EINVAL);
551 		return (0);	/* skip this state */
552 	}
553 
554 	if (sp->af == 0)
555 		return (0);	/* skip this state */
556 
557 	/*
558 	 * If the ruleset checksums match or the state is coming from the ioctl,
559 	 * it's safe to associate the state with the rule of that number.
560 	 */
561 	if (sp->rule != htonl(-1) && sp->anchor == htonl(-1) &&
562 	    (flags & (PFSYNC_SI_IOCTL | PFSYNC_SI_CKSUM)) && ntohl(sp->rule) <
563 	    pf_main_ruleset.rules.active.rcount) {
564 		TAILQ_FOREACH(r, pf_main_ruleset.rules.active.ptr, entries)
565 			if (ntohl(sp->rule) == n++)
566 				break;
567 	} else
568 		r = &pf_default_rule;
569 
570 	if ((r->max_states && r->states_cur >= r->max_states))
571 		goto cleanup;
572 
573 	if (flags & PFSYNC_SI_IOCTL)
574 		pool_flags = PR_WAITOK | PR_LIMITFAIL | PR_ZERO;
575 	else
576 		pool_flags = PR_NOWAIT | PR_LIMITFAIL | PR_ZERO;
577 
578 	if ((st = pool_get(&pf_state_pl, pool_flags)) == NULL)
579 		goto cleanup;
580 
581 	if ((skw = pf_alloc_state_key(pool_flags)) == NULL)
582 		goto cleanup;
583 
584 	if ((sp->key[PF_SK_WIRE].af &&
585 	    (sp->key[PF_SK_WIRE].af != sp->key[PF_SK_STACK].af)) ||
586 	    PF_ANEQ(&sp->key[PF_SK_WIRE].addr[0],
587 	    &sp->key[PF_SK_STACK].addr[0], sp->af) ||
588 	    PF_ANEQ(&sp->key[PF_SK_WIRE].addr[1],
589 	    &sp->key[PF_SK_STACK].addr[1], sp->af) ||
590 	    sp->key[PF_SK_WIRE].port[0] != sp->key[PF_SK_STACK].port[0] ||
591 	    sp->key[PF_SK_WIRE].port[1] != sp->key[PF_SK_STACK].port[1] ||
592 	    sp->key[PF_SK_WIRE].rdomain != sp->key[PF_SK_STACK].rdomain) {
593 		if ((sks = pf_alloc_state_key(pool_flags)) == NULL)
594 			goto cleanup;
595 	} else
596 		sks = skw;
597 
598 	/* allocate memory for scrub info */
599 	if (pfsync_alloc_scrub_memory(&sp->src, &st->src) ||
600 	    pfsync_alloc_scrub_memory(&sp->dst, &st->dst))
601 		goto cleanup;
602 
603 	/* copy to state key(s) */
604 	skw->addr[0] = sp->key[PF_SK_WIRE].addr[0];
605 	skw->addr[1] = sp->key[PF_SK_WIRE].addr[1];
606 	skw->port[0] = sp->key[PF_SK_WIRE].port[0];
607 	skw->port[1] = sp->key[PF_SK_WIRE].port[1];
608 	skw->rdomain = ntohs(sp->key[PF_SK_WIRE].rdomain);
609 	PF_REF_INIT(skw->refcnt);
610 	skw->proto = sp->proto;
611 	if (!(skw->af = sp->key[PF_SK_WIRE].af))
612 		skw->af = sp->af;
613 	if (sks != skw) {
614 		sks->addr[0] = sp->key[PF_SK_STACK].addr[0];
615 		sks->addr[1] = sp->key[PF_SK_STACK].addr[1];
616 		sks->port[0] = sp->key[PF_SK_STACK].port[0];
617 		sks->port[1] = sp->key[PF_SK_STACK].port[1];
618 		sks->rdomain = ntohs(sp->key[PF_SK_STACK].rdomain);
619 		PF_REF_INIT(sks->refcnt);
620 		if (!(sks->af = sp->key[PF_SK_STACK].af))
621 			sks->af = sp->af;
622 		if (sks->af != skw->af) {
623 			switch (sp->proto) {
624 			case IPPROTO_ICMP:
625 				sks->proto = IPPROTO_ICMPV6;
626 				break;
627 			case IPPROTO_ICMPV6:
628 				sks->proto = IPPROTO_ICMP;
629 				break;
630 			default:
631 				sks->proto = sp->proto;
632 			}
633 		} else
634 			sks->proto = sp->proto;
635 
636 		if (((sks->af != AF_INET) && (sks->af != AF_INET6)) ||
637 		    ((skw->af != AF_INET) && (skw->af != AF_INET6))) {
638 			error = EINVAL;
639 			goto cleanup;
640 		}
641 
642 	} else if ((sks->af != AF_INET) && (sks->af != AF_INET6)) {
643 		error = EINVAL;
644 		goto cleanup;
645 	}
646 	st->rtableid[PF_SK_WIRE] = ntohl(sp->rtableid[PF_SK_WIRE]);
647 	st->rtableid[PF_SK_STACK] = ntohl(sp->rtableid[PF_SK_STACK]);
648 
649 	/* copy to state */
650 	st->rt_addr = sp->rt_addr;
651 	st->rt = sp->rt;
652 	st->creation = getuptime() - ntohl(sp->creation);
653 	st->expire = getuptime();
654 	if (ntohl(sp->expire)) {
655 		u_int32_t timeout;
656 
657 		timeout = r->timeout[sp->timeout];
658 		if (!timeout)
659 			timeout = pf_default_rule.timeout[sp->timeout];
660 
661 		/* sp->expire may have been adaptively scaled by export. */
662 		st->expire -= timeout - ntohl(sp->expire);
663 	}
664 
665 	st->direction = sp->direction;
666 	st->log = sp->log;
667 	st->timeout = sp->timeout;
668 	st->state_flags = ntohs(sp->state_flags);
669 	st->max_mss = ntohs(sp->max_mss);
670 	st->min_ttl = sp->min_ttl;
671 	st->set_tos = sp->set_tos;
672 	st->set_prio[0] = sp->set_prio[0];
673 	st->set_prio[1] = sp->set_prio[1];
674 
675 	st->id = sp->id;
676 	st->creatorid = sp->creatorid;
677 	pf_state_peer_ntoh(&sp->src, &st->src);
678 	pf_state_peer_ntoh(&sp->dst, &st->dst);
679 
680 	st->rule.ptr = r;
681 	st->anchor.ptr = NULL;
682 
683 	st->pfsync_time = getuptime();
684 	st->sync_state = PFSYNC_S_NONE;
685 
686 	refcnt_init(&st->refcnt);
687 
688 	/* XXX when we have anchors, use STATE_INC_COUNTERS */
689 	r->states_cur++;
690 	r->states_tot++;
691 
692 	if (!ISSET(flags, PFSYNC_SI_IOCTL))
693 		SET(st->state_flags, PFSTATE_NOSYNC);
694 
695 	/*
696 	 * We just set the PFSTATE_NOSYNC bit, which prevents
697 	 * pfsync_insert_state() from inserting the state into pfsync.
698 	 */
699 	if (pf_state_insert(kif, &skw, &sks, st) != 0) {
700 		/* XXX when we have anchors, use STATE_DEC_COUNTERS */
701 		r->states_cur--;
702 		error = EEXIST;
703 		goto cleanup_state;
704 	}
705 
706 	if (!ISSET(flags, PFSYNC_SI_IOCTL)) {
707 		CLR(st->state_flags, PFSTATE_NOSYNC);
708 		if (ISSET(st->state_flags, PFSTATE_ACK)) {
709 			pfsync_q_ins(st, PFSYNC_S_IACK);
710 			schednetisr(NETISR_PFSYNC);
711 		}
712 	}
713 	CLR(st->state_flags, PFSTATE_ACK);
714 
715 	return (0);
716 
717  cleanup:
718 	if (skw == sks)
719 		sks = NULL;
720 	if (skw != NULL)
721 		pool_put(&pf_state_key_pl, skw);
722 	if (sks != NULL)
723 		pool_put(&pf_state_key_pl, sks);
724 
725  cleanup_state:	/* pf_state_insert frees the state keys */
726 	if (st) {
727 		if (st->dst.scrub)
728 			pool_put(&pf_state_scrub_pl, st->dst.scrub);
729 		if (st->src.scrub)
730 			pool_put(&pf_state_scrub_pl, st->src.scrub);
731 		pool_put(&pf_state_pl, st);
732 	}
733 	return (error);
734 }
735 
736 int
737 pfsync_input(struct mbuf **mp, int *offp, int proto, int af)
738 {
739 	struct mbuf *n, *m = *mp;
740 	struct pfsync_softc *sc = pfsyncif;
741 	struct ip *ip = mtod(m, struct ip *);
742 	struct pfsync_header *ph;
743 	struct pfsync_subheader subh;
744 	int offset, noff, len, count, mlen, flags = 0;
745 	int e;
746 
747 	NET_ASSERT_LOCKED();
748 
749 	pfsyncstat_inc(pfsyncs_ipackets);
750 
751 	/* verify that we have a sync interface configured */
752 	if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING) ||
753 	    sc->sc_sync_ifidx == 0 || !pf_status.running)
754 		goto done;
755 
756 	/* verify that the packet came in on the right interface */
757 	if (sc->sc_sync_ifidx != m->m_pkthdr.ph_ifidx) {
758 		pfsyncstat_inc(pfsyncs_badif);
759 		goto done;
760 	}
761 
762 	sc->sc_if.if_ipackets++;
763 	sc->sc_if.if_ibytes += m->m_pkthdr.len;
764 
765 	/* verify that the IP TTL is 255. */
766 	if (ip->ip_ttl != PFSYNC_DFLTTL) {
767 		pfsyncstat_inc(pfsyncs_badttl);
768 		goto done;
769 	}
770 
771 	offset = ip->ip_hl << 2;
772 	n = m_pulldown(m, offset, sizeof(*ph), &noff);
773 	if (n == NULL) {
774 		pfsyncstat_inc(pfsyncs_hdrops);
775 		return IPPROTO_DONE;
776 	}
777 	ph = (struct pfsync_header *)(n->m_data + noff);
778 
779 	/* verify the version */
780 	if (ph->version != PFSYNC_VERSION) {
781 		pfsyncstat_inc(pfsyncs_badver);
782 		goto done;
783 	}
784 	len = ntohs(ph->len) + offset;
785 	if (m->m_pkthdr.len < len) {
786 		pfsyncstat_inc(pfsyncs_badlen);
787 		goto done;
788 	}
789 
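	/*
	 * A matching ruleset checksum means the peer runs the same loaded
	 * ruleset, so pfsync_state_import() may bind incoming states to the
	 * rule number they carry.
	 */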
790 	if (!bcmp(&ph->pfcksum, &pf_status.pf_chksum, PF_MD5_DIGEST_LENGTH))
791 		flags = PFSYNC_SI_CKSUM;
792 
793 	offset += sizeof(*ph);
794 	while (offset <= len - sizeof(subh)) {
795 		m_copydata(m, offset, sizeof(subh), (caddr_t)&subh);
796 		offset += sizeof(subh);
797 
798 		mlen = subh.len << 2;
799 		count = ntohs(subh.count);
800 
801 		if (subh.action >= PFSYNC_ACT_MAX ||
802 		    subh.action >= nitems(pfsync_acts) ||
803 		    mlen < pfsync_acts[subh.action].len) {
804 			/*
805 			 * subheaders are always followed by at least one
806 			 * message, so if the peer is new enough to tell us
807 			 * how big its messages are then we know enough to
808 			 * skip them.
809 			 */
810 			if (count > 0 && mlen > 0) {
811 				offset += count * mlen;
812 				continue;
813 			}
814 			pfsyncstat_inc(pfsyncs_badact);
815 			goto done;
816 		}
817 
818 		n = m_pulldown(m, offset, mlen * count, &noff);
819 		if (n == NULL) {
820 			pfsyncstat_inc(pfsyncs_badlen);
821 			return IPPROTO_DONE;
822 		}
823 
824 		e = pfsync_acts[subh.action].in(n->m_data + noff, mlen, count,
825 		    flags);
826 		if (e != 0)
827 			goto done;
828 
829 		offset += mlen * count;
830 	}
831 
832 done:
833 	m_freem(m);
834 	return IPPROTO_DONE;
835 }
836 
837 int
838 pfsync_in_clr(caddr_t buf, int len, int count, int flags)
839 {
840 	struct pfsync_clr *clr;
841 	struct pf_state *st, *nexts;
842 	struct pfi_kif *kif;
843 	u_int32_t creatorid;
844 	int i;
845 
846 	PF_LOCK();
847 	for (i = 0; i < count; i++) {
848 		clr = (struct pfsync_clr *)buf + len * i;
849 		kif = NULL;
850 		creatorid = clr->creatorid;
851 		if (strlen(clr->ifname) &&
852 		    (kif = pfi_kif_find(clr->ifname)) == NULL)
853 			continue;
854 
855 		PF_STATE_ENTER_WRITE();
856 		for (st = RB_MIN(pf_state_tree_id, &tree_id); st; st = nexts) {
857 			nexts = RB_NEXT(pf_state_tree_id, &tree_id, st);
858 			if (st->creatorid == creatorid &&
859 			    ((kif && st->kif == kif) || !kif)) {
860 				SET(st->state_flags, PFSTATE_NOSYNC);
861 				pf_remove_state(st);
862 			}
863 		}
864 		PF_STATE_EXIT_WRITE();
865 	}
866 	PF_UNLOCK();
867 
868 	return (0);
869 }
870 
871 int
872 pfsync_in_ins(caddr_t buf, int len, int count, int flags)
873 {
874 	struct pfsync_state *sp;
875 	sa_family_t af1, af2;
876 	int i;
877 
878 	PF_LOCK();
879 	for (i = 0; i < count; i++) {
880 		sp = (struct pfsync_state *)(buf + len * i);
881 		af1 = sp->key[0].af;
882 		af2 = sp->key[1].af;
883 
884 		/* check for invalid values */
885 		if (sp->timeout >= PFTM_MAX ||
886 		    sp->src.state > PF_TCPS_PROXY_DST ||
887 		    sp->dst.state > PF_TCPS_PROXY_DST ||
888 		    sp->direction > PF_OUT ||
889 		    (((af1 || af2) &&
890 		     ((af1 != AF_INET && af1 != AF_INET6) ||
891 		      (af2 != AF_INET && af2 != AF_INET6))) ||
892 		    (sp->af != AF_INET && sp->af != AF_INET6))) {
893 			DPFPRINTF(LOG_NOTICE,
894 			    "pfsync_input: PFSYNC5_ACT_INS: invalid value");
895 			pfsyncstat_inc(pfsyncs_badval);
896 			continue;
897 		}
898 
899 		if (pfsync_state_import(sp, flags) == ENOMEM) {
900 			/* drop out, but process the rest of the actions */
901 			break;
902 		}
903 	}
904 	PF_UNLOCK();
905 
906 	return (0);
907 }
908 
909 int
910 pfsync_in_iack(caddr_t buf, int len, int count, int flags)
911 {
912 	struct pfsync_ins_ack *ia;
913 	struct pf_state_cmp id_key;
914 	struct pf_state *st;
915 	int i;
916 
917 	for (i = 0; i < count; i++) {
918 		ia = (struct pfsync_ins_ack *)(buf + len * i);
919 
920 		id_key.id = ia->id;
921 		id_key.creatorid = ia->creatorid;
922 
923 		PF_STATE_ENTER_READ();
924 		st = pf_find_state_byid(&id_key);
925 		pf_state_ref(st);
926 		PF_STATE_EXIT_READ();
927 		if (st == NULL)
928 			continue;
929 
930 		if (ISSET(st->state_flags, PFSTATE_ACK))
931 			pfsync_deferred(st, 0);
932 
933 		pf_state_unref(st);
934 	}
935 
936 	return (0);
937 }
938 
939 int
940 pfsync_upd_tcp(struct pf_state *st, struct pfsync_state_peer *src,
941     struct pfsync_state_peer *dst)
942 {
943 	int sync = 0;
944 
945 	/*
946 	 * The state should never go backwards except
947 	 * for syn-proxy states.  Neither should the
948 	 * sequence window slide backwards.
949 	 */
950 	if ((st->src.state > src->state &&
951 	    (st->src.state < PF_TCPS_PROXY_SRC ||
952 	    src->state >= PF_TCPS_PROXY_SRC)) ||
953 
954 	    (st->src.state == src->state &&
955 	    SEQ_GT(st->src.seqlo, ntohl(src->seqlo))))
956 		sync++;
957 	else
958 		pf_state_peer_ntoh(src, &st->src);
959 
960 	if ((st->dst.state > dst->state) ||
961 
962 	    (st->dst.state >= TCPS_SYN_SENT &&
963 	    SEQ_GT(st->dst.seqlo, ntohl(dst->seqlo))))
964 		sync++;
965 	else
966 		pf_state_peer_ntoh(dst, &st->dst);
967 
968 	return (sync);
969 }
970 
971 int
972 pfsync_in_upd(caddr_t buf, int len, int count, int flags)
973 {
974 	struct pfsync_state *sp;
975 	struct pf_state_cmp id_key;
976 	struct pf_state *st;
977 	int sync, error;
978 	int i;
979 
980 	for (i = 0; i < count; i++) {
981 		sp = (struct pfsync_state *)(buf + len * i);
982 
983 		/* check for invalid values */
984 		if (sp->timeout >= PFTM_MAX ||
985 		    sp->src.state > PF_TCPS_PROXY_DST ||
986 		    sp->dst.state > PF_TCPS_PROXY_DST) {
987 			DPFPRINTF(LOG_NOTICE,
988 			    "pfsync_input: PFSYNC_ACT_UPD: invalid value");
989 			pfsyncstat_inc(pfsyncs_badval);
990 			continue;
991 		}
992 
993 		id_key.id = sp->id;
994 		id_key.creatorid = sp->creatorid;
995 
996 		PF_STATE_ENTER_READ();
997 		st = pf_find_state_byid(&id_key);
998 		pf_state_ref(st);
999 		PF_STATE_EXIT_READ();
1000 		if (st == NULL) {
1001 			/* insert the update */
1002 			PF_LOCK();
1003 			error = pfsync_state_import(sp, flags);
1004 			if (error)
1005 				pfsyncstat_inc(pfsyncs_badstate);
1006 			PF_UNLOCK();
1007 			continue;
1008 		}
1009 
1010 		if (ISSET(st->state_flags, PFSTATE_ACK))
1011 			pfsync_deferred(st, 1);
1012 
1013 		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
1014 			sync = pfsync_upd_tcp(st, &sp->src, &sp->dst);
1015 		else {
1016 			sync = 0;
1017 
1018 			/*
1019 			 * Non-TCP protocol state machines always go
1020 			 * forward.
1021 			 */
1022 			if (st->src.state > sp->src.state)
1023 				sync++;
1024 			else
1025 				pf_state_peer_ntoh(&sp->src, &st->src);
1026 
1027 			if (st->dst.state > sp->dst.state)
1028 				sync++;
1029 			else
1030 				pf_state_peer_ntoh(&sp->dst, &st->dst);
1031 		}
1032 
1033 		if (sync < 2) {
1034 			pfsync_alloc_scrub_memory(&sp->dst, &st->dst);
1035 			pf_state_peer_ntoh(&sp->dst, &st->dst);
1036 			st->expire = getuptime();
1037 			st->timeout = sp->timeout;
1038 		}
1039 		st->pfsync_time = getuptime();
1040 
1041 		if (sync) {
1042 			pfsyncstat_inc(pfsyncs_stale);
1043 
1044 			pfsync_update_state(st);
1045 			schednetisr(NETISR_PFSYNC);
1046 		}
1047 
1048 		pf_state_unref(st);
1049 	}
1050 
1051 	return (0);
1052 }
1053 
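/*
 * Compressed updates carry only the state id, peer data and timeout.
 * If the state is unknown, ask the peer for a full update.
 */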
1054 int
1055 pfsync_in_upd_c(caddr_t buf, int len, int count, int flags)
1056 {
1057 	struct pfsync_upd_c *up;
1058 	struct pf_state_cmp id_key;
1059 	struct pf_state *st;
1060 
1061 	int sync;
1062 
1063 	int i;
1064 
1065 	for (i = 0; i < count; i++) {
1066 		up = (struct pfsync_upd_c *)(buf + len * i);
1067 
1068 		/* check for invalid values */
1069 		if (up->timeout >= PFTM_MAX ||
1070 		    up->src.state > PF_TCPS_PROXY_DST ||
1071 		    up->dst.state > PF_TCPS_PROXY_DST) {
1072 			DPFPRINTF(LOG_NOTICE,
1073 			    "pfsync_input: PFSYNC_ACT_UPD_C: invalid value");
1074 			pfsyncstat_inc(pfsyncs_badval);
1075 			continue;
1076 		}
1077 
1078 		id_key.id = up->id;
1079 		id_key.creatorid = up->creatorid;
1080 
1081 		PF_STATE_ENTER_READ();
1082 		st = pf_find_state_byid(&id_key);
1083 		pf_state_ref(st);
1084 		PF_STATE_EXIT_READ();
1085 		if (st == NULL) {
1086 			/* We don't have this state. Ask for it. */
1087 			pfsync_request_update(id_key.creatorid, id_key.id);
1088 			continue;
1089 		}
1090 
1091 		if (ISSET(st->state_flags, PFSTATE_ACK))
1092 			pfsync_deferred(st, 1);
1093 
1094 		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
1095 			sync = pfsync_upd_tcp(st, &up->src, &up->dst);
1096 		else {
1097 			sync = 0;
1098 			/*
1099 			 * Non-TCP protocol state machines always go
1100 			 * forward.
1101 			 */
1102 			if (st->src.state > up->src.state)
1103 				sync++;
1104 			else
1105 				pf_state_peer_ntoh(&up->src, &st->src);
1106 
1107 			if (st->dst.state > up->dst.state)
1108 				sync++;
1109 			else
1110 				pf_state_peer_ntoh(&up->dst, &st->dst);
1111 		}
1112 		if (sync < 2) {
1113 			pfsync_alloc_scrub_memory(&up->dst, &st->dst);
1114 			pf_state_peer_ntoh(&up->dst, &st->dst);
1115 			st->expire = getuptime();
1116 			st->timeout = up->timeout;
1117 		}
1118 		st->pfsync_time = getuptime();
1119 
1120 		if (sync) {
1121 			pfsyncstat_inc(pfsyncs_stale);
1122 
1123 			pfsync_update_state(st);
1124 			schednetisr(NETISR_PFSYNC);
1125 		}
1126 
1127 		pf_state_unref(st);
1128 	}
1129 
1130 	return (0);
1131 }
1132 
1133 int
1134 pfsync_in_ureq(caddr_t buf, int len, int count, int flags)
1135 {
1136 	struct pfsync_upd_req *ur;
1137 	int i;
1138 
1139 	struct pf_state_cmp id_key;
1140 	struct pf_state *st;
1141 
1142 	for (i = 0; i < count; i++) {
1143 		ur = (struct pfsync_upd_req *)(buf + len * i);
1144 
1145 		id_key.id = ur->id;
1146 		id_key.creatorid = ur->creatorid;
1147 
1148 		if (id_key.id == 0 && id_key.creatorid == 0)
1149 			pfsync_bulk_start();
1150 		else {
1151 			PF_STATE_ENTER_READ();
1152 			st = pf_find_state_byid(&id_key);
1153 			pf_state_ref(st);
1154 			PF_STATE_EXIT_READ();
1155 			if (st == NULL) {
1156 				pfsyncstat_inc(pfsyncs_badstate);
1157 				continue;
1158 			}
1159 			if (ISSET(st->state_flags, PFSTATE_NOSYNC)) {
1160 				pf_state_unref(st);
1161 				continue;
1162 			}
1163 
1164 			pfsync_update_state_req(st);
1165 			pf_state_unref(st);
1166 		}
1167 	}
1168 
1169 	return (0);
1170 }
1171 
1172 int
1173 pfsync_in_del(caddr_t buf, int len, int count, int flags)
1174 {
1175 	struct pfsync_state *sp;
1176 	struct pf_state_cmp id_key;
1177 	struct pf_state *st;
1178 	int i;
1179 
1180 	PF_STATE_ENTER_WRITE();
1181 	for (i = 0; i < count; i++) {
1182 		sp = (struct pfsync_state *)(buf + len * i);
1183 
1184 		id_key.id = sp->id;
1185 		id_key.creatorid = sp->creatorid;
1186 
1187 		st = pf_find_state_byid(&id_key);
1188 		if (st == NULL) {
1189 			pfsyncstat_inc(pfsyncs_badstate);
1190 			continue;
1191 		}
1192 		SET(st->state_flags, PFSTATE_NOSYNC);
1193 		pf_remove_state(st);
1194 	}
1195 	PF_STATE_EXIT_WRITE();
1196 
1197 	return (0);
1198 }
1199 
1200 int
1201 pfsync_in_del_c(caddr_t buf, int len, int count, int flags)
1202 {
1203 	struct pfsync_del_c *sp;
1204 	struct pf_state_cmp id_key;
1205 	struct pf_state *st;
1206 	int i;
1207 
1208 	PF_LOCK();
1209 	PF_STATE_ENTER_WRITE();
1210 	for (i = 0; i < count; i++) {
1211 		sp = (struct pfsync_del_c *)(buf + len * i);
1212 
1213 		id_key.id = sp->id;
1214 		id_key.creatorid = sp->creatorid;
1215 
1216 		st = pf_find_state_byid(&id_key);
1217 		if (st == NULL) {
1218 			pfsyncstat_inc(pfsyncs_badstate);
1219 			continue;
1220 		}
1221 
1222 		SET(st->state_flags, PFSTATE_NOSYNC);
1223 		pf_remove_state(st);
1224 	}
1225 	PF_STATE_EXIT_WRITE();
1226 	PF_UNLOCK();
1227 
1228 	return (0);
1229 }
1230 
1231 int
1232 pfsync_in_bus(caddr_t buf, int len, int count, int flags)
1233 {
1234 	struct pfsync_softc *sc = pfsyncif;
1235 	struct pfsync_bus *bus;
1236 
1237 	/* If we're not waiting for a bulk update, who cares. */
1238 	if (sc->sc_ureq_sent == 0)
1239 		return (0);
1240 
1241 	bus = (struct pfsync_bus *)buf;
1242 
1243 	switch (bus->status) {
1244 	case PFSYNC_BUS_START:
1245 		timeout_add(&sc->sc_bulkfail_tmo, 4 * hz +
1246 		    pf_pool_limits[PF_LIMIT_STATES].limit /
1247 		    ((sc->sc_if.if_mtu - PFSYNC_MINPKT) /
1248 		    sizeof(struct pfsync_state)));
1249 		DPFPRINTF(LOG_INFO, "received bulk update start");
1250 		break;
1251 
1252 	case PFSYNC_BUS_END:
1253 		if (getuptime() - ntohl(bus->endtime) >=
1254 		    sc->sc_ureq_sent) {
1255 			/* that's it, we're happy */
1256 			sc->sc_ureq_sent = 0;
1257 			sc->sc_bulk_tries = 0;
1258 			timeout_del(&sc->sc_bulkfail_tmo);
1259 #if NCARP > 0
1260 			if (!pfsync_sync_ok)
1261 				carp_group_demote_adj(&sc->sc_if, -1,
1262 				    sc->sc_link_demoted ?
1263 				    "pfsync link state up" :
1264 				    "pfsync bulk done");
1265 			if (sc->sc_initial_bulk) {
1266 				carp_group_demote_adj(&sc->sc_if, -32,
1267 				    "pfsync init");
1268 				sc->sc_initial_bulk = 0;
1269 			}
1270 #endif
1271 			pfsync_sync_ok = 1;
1272 			sc->sc_link_demoted = 0;
1273 			DPFPRINTF(LOG_INFO, "received valid bulk update end");
1274 		} else {
1275 			DPFPRINTF(LOG_WARNING, "received invalid "
1276 			    "bulk update end: bad timestamp");
1277 		}
1278 		break;
1279 	}
1280 
1281 	return (0);
1282 }
1283 
1284 int
1285 pfsync_in_tdb(caddr_t buf, int len, int count, int flags)
1286 {
1287 #if defined(IPSEC)
1288 	struct pfsync_tdb *tp;
1289 	int i;
1290 
1291 	for (i = 0; i < count; i++) {
1292 		tp = (struct pfsync_tdb *)(buf + len * i);
1293 		pfsync_update_net_tdb(tp);
1294 	}
1295 #endif
1296 
1297 	return (0);
1298 }
1299 
1300 #if defined(IPSEC)
1301 /* Update an in-kernel tdb. Silently fail if no tdb is found. */
1302 void
1303 pfsync_update_net_tdb(struct pfsync_tdb *pt)
1304 {
1305 	struct tdb		*tdb;
1306 
1307 	NET_ASSERT_LOCKED();
1308 
1309 	/* check for invalid values */
1310 	if (ntohl(pt->spi) <= SPI_RESERVED_MAX ||
1311 	    (pt->dst.sa.sa_family != AF_INET &&
1312 	     pt->dst.sa.sa_family != AF_INET6))
1313 		goto bad;
1314 
1315 	tdb = gettdb(ntohs(pt->rdomain), pt->spi,
1316 	    (union sockaddr_union *)&pt->dst, pt->sproto);
1317 	if (tdb) {
1318 		pt->rpl = betoh64(pt->rpl);
1319 		pt->cur_bytes = betoh64(pt->cur_bytes);
1320 
1321 		/* Neither replay nor byte counter should ever decrease. */
1322 		if (pt->rpl < tdb->tdb_rpl ||
1323 		    pt->cur_bytes < tdb->tdb_cur_bytes) {
1324 			goto bad;
1325 		}
1326 
1327 		tdb->tdb_rpl = pt->rpl;
1328 		tdb->tdb_cur_bytes = pt->cur_bytes;
1329 	}
1330 	return;
1331 
1332  bad:
1333 	DPFPRINTF(LOG_WARNING, "pfsync_insert: PFSYNC_ACT_TDB_UPD: "
1334 	    "invalid value");
1335 	pfsyncstat_inc(pfsyncs_badstate);
1336 	return;
1337 }
1338 #endif
1339 
1340 
1341 int
1342 pfsync_in_eof(caddr_t buf, int len, int count, int flags)
1343 {
1344 	if (len > 0 || count > 0)
1345 		pfsyncstat_inc(pfsyncs_badact);
1346 
1347 	/* we're done. let the caller return */
1348 	return (1);
1349 }
1350 
1351 int
1352 pfsync_in_error(caddr_t buf, int len, int count, int flags)
1353 {
1354 	pfsyncstat_inc(pfsyncs_badact);
1355 	return (-1);
1356 }
1357 
1358 int
1359 pfsyncoutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
1360 	struct rtentry *rt)
1361 {
1362 	m_freem(m);	/* drop packet */
1363 	return (EAFNOSUPPORT);
1364 }
1365 
1366 int
1367 pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
1368 {
1369 	struct proc *p = curproc;
1370 	struct pfsync_softc *sc = ifp->if_softc;
1371 	struct ifreq *ifr = (struct ifreq *)data;
1372 	struct ip_moptions *imo = &sc->sc_imo;
1373 	struct pfsyncreq pfsyncr;
1374 	struct ifnet *ifp0, *sifp;
1375 	struct ip *ip;
1376 	int error;
1377 
1378 	switch (cmd) {
1379 	case SIOCSIFFLAGS:
1380 		if ((ifp->if_flags & IFF_RUNNING) == 0 &&
1381 		    (ifp->if_flags & IFF_UP)) {
1382 			ifp->if_flags |= IFF_RUNNING;
1383 
1384 #if NCARP > 0
1385 			sc->sc_initial_bulk = 1;
1386 			carp_group_demote_adj(&sc->sc_if, 32, "pfsync init");
1387 #endif
1388 
1389 			pfsync_request_full_update(sc);
1390 		}
1391 		if ((ifp->if_flags & IFF_RUNNING) &&
1392 		    (ifp->if_flags & IFF_UP) == 0) {
1393 			ifp->if_flags &= ~IFF_RUNNING;
1394 
1395 			/* drop everything */
1396 			timeout_del(&sc->sc_tmo);
1397 			pfsync_drop(sc);
1398 
1399 			pfsync_cancel_full_update(sc);
1400 		}
1401 		break;
1402 	case SIOCSIFMTU:
1403 		if ((ifp0 = if_get(sc->sc_sync_ifidx)) == NULL)
1404 			return (EINVAL);
1405 		error = 0;
1406 		if (ifr->ifr_mtu <= PFSYNC_MINPKT ||
1407 		    ifr->ifr_mtu > ifp0->if_mtu) {
1408 			error = EINVAL;
1409 		}
1410 		if_put(ifp0);
1411 		if (error)
1412 			return error;
1413 		if (ifr->ifr_mtu < ifp->if_mtu)
1414 			pfsync_sendout();
1415 		ifp->if_mtu = ifr->ifr_mtu;
1416 		break;
1417 	case SIOCGETPFSYNC:
1418 		bzero(&pfsyncr, sizeof(pfsyncr));
1419 		if ((ifp0 = if_get(sc->sc_sync_ifidx)) != NULL) {
1420 			strlcpy(pfsyncr.pfsyncr_syncdev,
1421 			    ifp0->if_xname, IFNAMSIZ);
1422 		}
1423 		if_put(ifp0);
1424 		pfsyncr.pfsyncr_syncpeer = sc->sc_sync_peer;
1425 		pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates;
1426 		pfsyncr.pfsyncr_defer = sc->sc_defer;
1427 		return (copyout(&pfsyncr, ifr->ifr_data, sizeof(pfsyncr)));
1428 
1429 	case SIOCSETPFSYNC:
1430 		if ((error = suser(p)) != 0)
1431 			return (error);
1432 		if ((error = copyin(ifr->ifr_data, &pfsyncr, sizeof(pfsyncr))))
1433 			return (error);
1434 
1435 		if (pfsyncr.pfsyncr_syncpeer.s_addr == 0)
1436 			sc->sc_sync_peer.s_addr = INADDR_PFSYNC_GROUP;
1437 		else
1438 			sc->sc_sync_peer.s_addr =
1439 			    pfsyncr.pfsyncr_syncpeer.s_addr;
1440 
1441 		if (pfsyncr.pfsyncr_maxupdates > 255)
1442 			return (EINVAL);
1443 		sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates;
1444 
1445 		sc->sc_defer = pfsyncr.pfsyncr_defer;
1446 
1447 		if (pfsyncr.pfsyncr_syncdev[0] == 0) {
1448 			if ((ifp0 = if_get(sc->sc_sync_ifidx)) != NULL) {
1449 				if_linkstatehook_del(ifp0, &sc->sc_ltask);
1450 				if_detachhook_del(ifp0, &sc->sc_dtask);
1451 			}
1452 			if_put(ifp0);
1453 			sc->sc_sync_ifidx = 0;
1454 			if (imo->imo_num_memberships > 0) {
1455 				in_delmulti(imo->imo_membership[
1456 				    --imo->imo_num_memberships]);
1457 				imo->imo_ifidx = 0;
1458 			}
1459 			break;
1460 		}
1461 
1462 		if ((sifp = if_unit(pfsyncr.pfsyncr_syncdev)) == NULL)
1463 			return (EINVAL);
1464 
1465 		ifp0 = if_get(sc->sc_sync_ifidx);
1466 
1467 		if (sifp->if_mtu < sc->sc_if.if_mtu || (ifp0 != NULL &&
1468 		    sifp->if_mtu < ifp0->if_mtu) ||
1469 		    sifp->if_mtu < MCLBYTES - sizeof(struct ip))
1470 			pfsync_sendout();
1471 
1472 		if (ifp0) {
1473 			if_linkstatehook_del(ifp0, &sc->sc_ltask);
1474 			if_detachhook_del(ifp0, &sc->sc_dtask);
1475 		}
1476 		if_put(ifp0);
1477 		sc->sc_sync_ifidx = sifp->if_index;
1478 
1479 		if (imo->imo_num_memberships > 0) {
1480 			in_delmulti(imo->imo_membership[--imo->imo_num_memberships]);
1481 			imo->imo_ifidx = 0;
1482 		}
1483 
1484 		if (sc->sc_sync_peer.s_addr == INADDR_PFSYNC_GROUP) {
1485 			struct in_addr addr;
1486 
1487 			if (!(sifp->if_flags & IFF_MULTICAST)) {
1488 				sc->sc_sync_ifidx = 0;
1489 				if_put(sifp);
1490 				return (EADDRNOTAVAIL);
1491 			}
1492 
1493 			addr.s_addr = INADDR_PFSYNC_GROUP;
1494 
1495 			if ((imo->imo_membership[0] =
1496 			    in_addmulti(&addr, sifp)) == NULL) {
1497 				sc->sc_sync_ifidx = 0;
1498 				if_put(sifp);
1499 				return (ENOBUFS);
1500 			}
1501 			imo->imo_num_memberships++;
1502 			imo->imo_ifidx = sc->sc_sync_ifidx;
1503 			imo->imo_ttl = PFSYNC_DFLTTL;
1504 			imo->imo_loop = 0;
1505 		}
1506 
1507 		ip = &sc->sc_template;
1508 		bzero(ip, sizeof(*ip));
1509 		ip->ip_v = IPVERSION;
1510 		ip->ip_hl = sizeof(sc->sc_template) >> 2;
1511 		ip->ip_tos = IPTOS_LOWDELAY;
1512 		/* len and id are set later */
1513 		ip->ip_off = htons(IP_DF);
1514 		ip->ip_ttl = PFSYNC_DFLTTL;
1515 		ip->ip_p = IPPROTO_PFSYNC;
1516 		ip->ip_src.s_addr = INADDR_ANY;
1517 		ip->ip_dst.s_addr = sc->sc_sync_peer.s_addr;
1518 
1519 		if_linkstatehook_add(sifp, &sc->sc_ltask);
1520 		if_detachhook_add(sifp, &sc->sc_dtask);
1521 		if_put(sifp);
1522 
1523 		pfsync_request_full_update(sc);
1524 
1525 		break;
1526 
1527 	default:
1528 		return (ENOTTY);
1529 	}
1530 
1531 	return (0);
1532 }
1533 
1534 void
1535 pfsync_out_state(struct pf_state *st, void *buf)
1536 {
1537 	struct pfsync_state *sp = buf;
1538 
1539 	pfsync_state_export(sp, st);
1540 }
1541 
1542 void
1543 pfsync_out_iack(struct pf_state *st, void *buf)
1544 {
1545 	struct pfsync_ins_ack *iack = buf;
1546 
1547 	iack->id = st->id;
1548 	iack->creatorid = st->creatorid;
1549 }
1550 
1551 void
1552 pfsync_out_upd_c(struct pf_state *st, void *buf)
1553 {
1554 	struct pfsync_upd_c *up = buf;
1555 
1556 	bzero(up, sizeof(*up));
1557 	up->id = st->id;
1558 	pf_state_peer_hton(&st->src, &up->src);
1559 	pf_state_peer_hton(&st->dst, &up->dst);
1560 	up->creatorid = st->creatorid;
1561 	up->timeout = st->timeout;
1562 }
1563 
1564 void
1565 pfsync_out_del(struct pf_state *st, void *buf)
1566 {
1567 	struct pfsync_del_c *dp = buf;
1568 
1569 	dp->id = st->id;
1570 	dp->creatorid = st->creatorid;
1571 
1572 	SET(st->state_flags, PFSTATE_NOSYNC);
1573 }
1574 
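/*
 * Move everything queued on the softc onto a snapshot.  The softc
 * mutexes are taken in ascending queue order, then the update request
 * and tdb mutexes, and released in reverse once the lists have been
 * handed over.
 */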
1575 void
1576 pfsync_grab_snapshot(struct pfsync_snapshot *sn, struct pfsync_softc *sc)
1577 {
1578 	int q;
1579 
1580 	sn->sn_sc = sc;
1581 
1582 	for (q = 0; q < PFSYNC_S_COUNT; q++)
1583 		mtx_enter(&sc->sc_mtx[q]);
1584 
1585 	mtx_enter(&sc->sc_upd_req_mtx);
1586 	mtx_enter(&sc->sc_tdb_mtx);
1587 
1588 	for (q = 0; q < PFSYNC_S_COUNT; q++) {
1589 		TAILQ_INIT(&sn->sn_qs[q]);
1590 		TAILQ_CONCAT(&sn->sn_qs[q], &sc->sc_qs[q], sync_list);
1591 	}
1592 
1593 	TAILQ_INIT(&sn->sn_upd_req_list);
1594 	TAILQ_CONCAT(&sn->sn_upd_req_list, &sc->sc_upd_req_list, ur_entry);
1595 
1596 	TAILQ_INIT(&sn->sn_tdb_q);
1597 	TAILQ_CONCAT(&sn->sn_tdb_q, &sc->sc_tdb_q, tdb_sync_entry);
1598 
1599 	sn->sn_len = sc->sc_len;
1600 	sc->sc_len = PFSYNC_MINPKT;
1601 
1602 	sn->sn_plus = sc->sc_plus;
1603 	sc->sc_plus = NULL;
1604 	sn->sn_pluslen = sc->sc_pluslen;
1605 	sc->sc_pluslen = 0;
1606 
1607 	mtx_leave(&sc->sc_tdb_mtx);
1608 	mtx_leave(&sc->sc_upd_req_mtx);
1609 
1610 	for (q = (PFSYNC_S_COUNT - 1); q >= 0; q--)
1611 		mtx_leave(&sc->sc_mtx[q]);
1612 }
1613 
1614 void
1615 pfsync_drop_snapshot(struct pfsync_snapshot *sn)
1616 {
1617 	struct pf_state *st;
1618 	struct pfsync_upd_req_item *ur;
1619 	struct tdb *t;
1620 	int q;
1621 
1622 
1623 	for (q = 0; q < PFSYNC_S_COUNT; q++) {
1624 		if (TAILQ_EMPTY(&sn->sn_qs[q]))
1625 			continue;
1626 
1627 		while ((st = TAILQ_FIRST(&sn->sn_qs[q])) != NULL) {
1628 			TAILQ_REMOVE(&sn->sn_qs[q], st, sync_list);
1629 #ifdef PFSYNC_DEBUG
1630 			KASSERT(st->sync_state == q);
1631 #endif
1632 			st->sync_state = PFSYNC_S_NONE;
1633 			pf_state_unref(st);
1634 		}
1635 	}
1636 
1637 	while ((ur = TAILQ_FIRST(&sn->sn_upd_req_list)) != NULL) {
1638 		TAILQ_REMOVE(&sn->sn_upd_req_list, ur, ur_entry);
1639 		pool_put(&sn->sn_sc->sc_pool, ur);
1640 	}
1641 
1642 	while ((t = TAILQ_FIRST(&sn->sn_tdb_q)) != NULL) {
1643 		TAILQ_REMOVE(&sn->sn_tdb_q, t, tdb_sync_entry);
1644 		CLR(t->tdb_flags, TDBF_PFSYNC);
1645 	}
1646 }
1647 
1648 int
1649 pfsync_is_snapshot_empty(struct pfsync_snapshot *sn)
1650 {
1651 	int	q;
1652 
1653 	for (q = 0; q < PFSYNC_S_COUNT; q++)
1654 		if (!TAILQ_EMPTY(&sn->sn_qs[q]))
1655 			return (0);
1656 
1657 	if (!TAILQ_EMPTY(&sn->sn_upd_req_list))
1658 		return (0);
1659 
1660 	if (!TAILQ_EMPTY(&sn->sn_tdb_q))
1661 		return (0);
1662 
1663 	return (sn->sn_plus == NULL);
1664 }
1665 
1666 void
1667 pfsync_drop(struct pfsync_softc *sc)
1668 {
1669 	struct pfsync_snapshot	sn;
1670 
1671 	pfsync_grab_snapshot(&sn, sc);
1672 	pfsync_drop_snapshot(&sn);
1673 }
1674 
1675 void
1676 pfsync_send_dispatch(void *xmq)
1677 {
1678 	struct mbuf_queue *mq = xmq;
1679 	struct pfsync_softc *sc;
1680 	struct mbuf *m;
1681 	struct mbuf_list ml;
1682 	int error;
1683 
1684 	mq_delist(mq, &ml);
1685 	if (ml_empty(&ml))
1686 		return;
1687 
1688 	NET_LOCK();
1689 	sc = pfsyncif;
1690 	if (sc == NULL) {
1691 		ml_purge(&ml);
1692 		goto done;
1693 	}
1694 
1695 	while ((m = ml_dequeue(&ml)) != NULL) {
1696 		if ((error = ip_output(m, NULL, NULL, IP_RAWOUTPUT,
1697 		    &sc->sc_imo, NULL, 0)) == 0)
1698 			pfsyncstat_inc(pfsyncs_opackets);
1699 		else {
1700 			DPFPRINTF(LOG_DEBUG,
1701 			    "ip_output() @ %s failed (%d)\n", __func__, error);
1702 			pfsyncstat_inc(pfsyncs_oerrors);
1703 		}
1704 	}
1705 done:
1706 	NET_UNLOCK();
1707 }
1708 
1709 void
1710 pfsync_send_pkt(struct mbuf *m)
1711 {
1712 	if (mq_enqueue(&pfsync_mq, m) != 0) {
1713 		pfsyncstat_inc(pfsyncs_oerrors);
1714 		DPFPRINTF(LOG_DEBUG, "mq_enqueue() @ %s failed, queue full\n",
1715 		    __func__);
1716 	} else
1717 		task_add(net_tq(0), &pfsync_task);
1718 }
1719 
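/*
 * Serialize a snapshot of the pending work into a single mbuf: the IP
 * template and pfsync header, then one subheader followed by its
 * messages for the update requests, any custom "plus" region, the
 * queued TDBs and each non-empty state queue.
 */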
1720 void
1721 pfsync_sendout(void)
1722 {
1723 	struct pfsync_snapshot sn;
1724 	struct pfsync_softc *sc = pfsyncif;
1725 #if NBPFILTER > 0
1726 	struct ifnet *ifp = &sc->sc_if;
1727 #endif
1728 	struct mbuf *m;
1729 	struct ip *ip;
1730 	struct pfsync_header *ph;
1731 	struct pfsync_subheader *subh;
1732 	struct pf_state *st;
1733 	struct pfsync_upd_req_item *ur;
1734 	struct tdb *t;
1735 	int offset;
1736 	int q, count = 0;
1737 
1738 	if (sc == NULL || sc->sc_len == PFSYNC_MINPKT)
1739 		return;
1740 
1741 	if (!ISSET(sc->sc_if.if_flags, IFF_RUNNING) ||
1742 #if NBPFILTER > 0
1743 	    (ifp->if_bpf == NULL && sc->sc_sync_ifidx == 0)) {
1744 #else
1745 	    sc->sc_sync_ifidx == 0) {
1746 #endif
1747 		pfsync_drop(sc);
1748 		return;
1749 	}
1750 
1751 	pfsync_grab_snapshot(&sn, sc);
1752 
1753 	/*
1754 	 * The check below is sufficient to prevent us from sending empty packets,
1755 	 * but it does not stop us from sending short packets.
1756 	 */
1757 	if (pfsync_is_snapshot_empty(&sn))
1758 		return;
1759 
1760 	MGETHDR(m, M_DONTWAIT, MT_DATA);
1761 	if (m == NULL) {
1762 		sc->sc_if.if_oerrors++;
1763 		pfsyncstat_inc(pfsyncs_onomem);
1764 		pfsync_drop_snapshot(&sn);
1765 		return;
1766 	}
1767 
1768 	if (max_linkhdr + sn.sn_len > MHLEN) {
1769 		MCLGETL(m, M_DONTWAIT, max_linkhdr + sn.sn_len);
1770 		if (!ISSET(m->m_flags, M_EXT)) {
1771 			m_free(m);
1772 			sc->sc_if.if_oerrors++;
1773 			pfsyncstat_inc(pfsyncs_onomem);
1774 			pfsync_drop_snapshot(&sn);
1775 			return;
1776 		}
1777 	}
1778 	m->m_data += max_linkhdr;
1779 	m->m_len = m->m_pkthdr.len = sn.sn_len;
1780 
1781 	/* build the ip header */
1782 	ip = mtod(m, struct ip *);
1783 	bcopy(&sc->sc_template, ip, sizeof(*ip));
1784 	offset = sizeof(*ip);
1785 
1786 	ip->ip_len = htons(m->m_pkthdr.len);
1787 	ip->ip_id = htons(ip_randomid());
1788 
1789 	/* build the pfsync header */
1790 	ph = (struct pfsync_header *)(m->m_data + offset);
1791 	bzero(ph, sizeof(*ph));
1792 	offset += sizeof(*ph);
1793 
1794 	ph->version = PFSYNC_VERSION;
1795 	ph->len = htons(sn.sn_len - sizeof(*ip));
1796 	bcopy(pf_status.pf_chksum, ph->pfcksum, PF_MD5_DIGEST_LENGTH);
1797 
1798 	if (!TAILQ_EMPTY(&sn.sn_upd_req_list)) {
1799 		subh = (struct pfsync_subheader *)(m->m_data + offset);
1800 		offset += sizeof(*subh);
1801 
1802 		count = 0;
1803 		while ((ur = TAILQ_FIRST(&sn.sn_upd_req_list)) != NULL) {
1804 			TAILQ_REMOVE(&sn.sn_upd_req_list, ur, ur_entry);
1805 
1806 			bcopy(&ur->ur_msg, m->m_data + offset,
1807 			    sizeof(ur->ur_msg));
1808 			offset += sizeof(ur->ur_msg);
1809 
1810 			pool_put(&sc->sc_pool, ur);
1811 
1812 			count++;
1813 		}
1814 
1815 		bzero(subh, sizeof(*subh));
1816 		subh->len = sizeof(ur->ur_msg) >> 2;
1817 		subh->action = PFSYNC_ACT_UPD_REQ;
1818 		subh->count = htons(count);
1819 	}
1820 
1821 	/* has someone built a custom region for us to add? */
1822 	if (sn.sn_plus != NULL) {
1823 		bcopy(sn.sn_plus, m->m_data + offset, sn.sn_pluslen);
1824 		offset += sn.sn_pluslen;
1825 		sn.sn_plus = NULL;	/* XXX memory leak ? */
1826 	}
1827 
1828 	if (!TAILQ_EMPTY(&sn.sn_tdb_q)) {
1829 		subh = (struct pfsync_subheader *)(m->m_data + offset);
1830 		offset += sizeof(*subh);
1831 
1832 		count = 0;
1833 		while ((t = TAILQ_FIRST(&sn.sn_tdb_q)) != NULL) {
1834 			TAILQ_REMOVE(&sn.sn_tdb_q, t, tdb_sync_entry);
1835 			pfsync_out_tdb(t, m->m_data + offset);
1836 			offset += sizeof(struct pfsync_tdb);
1837 			CLR(t->tdb_flags, TDBF_PFSYNC);
1838 			count++;
1839 		}
1840 
1841 		bzero(subh, sizeof(*subh));
1842 		subh->action = PFSYNC_ACT_TDB;
1843 		subh->len = sizeof(struct pfsync_tdb) >> 2;
1844 		subh->count = htons(count);
1845 	}
1846 
1847 	/* walk the queues */
1848 	for (q = 0; q < PFSYNC_S_COUNT; q++) {
1849 		if (TAILQ_EMPTY(&sn.sn_qs[q]))
1850 			continue;
1851 
1852 		subh = (struct pfsync_subheader *)(m->m_data + offset);
1853 		offset += sizeof(*subh);
1854 
1855 		count = 0;
1856 		while ((st = TAILQ_FIRST(&sn.sn_qs[q])) != NULL) {
1857 			TAILQ_REMOVE(&sn.sn_qs[q], st, sync_list);
1858 #ifdef PFSYNC_DEBUG
1859 			KASSERT(st->sync_state == q);
1860 #endif
1861 			st->sync_state = PFSYNC_S_NONE;
1862 			pfsync_qs[q].write(st, m->m_data + offset);
1863 			offset += pfsync_qs[q].len;
1864 
1865 			pf_state_unref(st);
1866 			count++;
1867 		}
1868 
1869 		bzero(subh, sizeof(*subh));
1870 		subh->action = pfsync_qs[q].action;
1871 		subh->len = pfsync_qs[q].len >> 2;
1872 		subh->count = htons(count);
1873 	}
1874 
1875 	/* we're done, let's put it on the wire */
1876 #if NBPFILTER > 0
1877 	if (ifp->if_bpf) {
1878 		m->m_data += sizeof(*ip);
1879 		m->m_len = m->m_pkthdr.len = sn.sn_len - sizeof(*ip);
1880 		bpf_mtap(ifp->if_bpf, m, BPF_DIRECTION_OUT);
1881 		m->m_data -= sizeof(*ip);
1882 		m->m_len = m->m_pkthdr.len = sn.sn_len;
1883 	}
1884 
1885 	if (sc->sc_sync_ifidx == 0) {
1886 		sc->sc_len = PFSYNC_MINPKT;
1887 		m_freem(m);
1888 		return;
1889 	}
1890 #endif
1891 
1892 	sc->sc_if.if_opackets++;
1893 	sc->sc_if.if_obytes += m->m_pkthdr.len;
1894 
1895 	m->m_pkthdr.ph_rtableid = sc->sc_if.if_rdomain;
1896 
1897 	pfsync_send_pkt(m);
1898 }
1899 
1900 void
1901 pfsync_insert_state(struct pf_state *st)
1902 {
1903 	struct pfsync_softc *sc = pfsyncif;
1904 
1905 	NET_ASSERT_LOCKED();
1906 
1907 	if (ISSET(st->rule.ptr->rule_flag, PFRULE_NOSYNC) ||
1908 	    st->key[PF_SK_WIRE]->proto == IPPROTO_PFSYNC) {
1909 		SET(st->state_flags, PFSTATE_NOSYNC);
1910 		return;
1911 	}
1912 
1913 	if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING) ||
1914 	    ISSET(st->state_flags, PFSTATE_NOSYNC))
1915 		return;
1916 
1917 #ifdef PFSYNC_DEBUG
1918 	KASSERT(st->sync_state == PFSYNC_S_NONE);
1919 #endif
1920 
1921 	if (sc->sc_len == PFSYNC_MINPKT)
1922 		timeout_add_sec(&sc->sc_tmo, 1);
1923 
1924 	pfsync_q_ins(st, PFSYNC_S_INS);
1925 
1926 	st->sync_updates = 0;
1927 }
1928 
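/*
 * Hold on to the packet that created this state until the peer has had a
 * chance to acknowledge the insert, or until the 20ms timeout fires.  At
 * most 128 deferrals are kept; the oldest is released to make room.
 */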
1929 int
1930 pfsync_defer(struct pf_state *st, struct mbuf *m)
1931 {
1932 	struct pfsync_softc *sc = pfsyncif;
1933 	struct pfsync_deferral *pd;
1934 
1935 	NET_ASSERT_LOCKED();
1936 
1937 	if (!sc->sc_defer ||
1938 	    ISSET(st->state_flags, PFSTATE_NOSYNC) ||
1939 	    m->m_flags & (M_BCAST|M_MCAST))
1940 		return (0);
1941 
1942 	if (sc->sc_deferred >= 128) {
1943 		mtx_enter(&sc->sc_deferrals_mtx);
1944 		pd = TAILQ_FIRST(&sc->sc_deferrals);
1945 		TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
1946 		sc->sc_deferred--;
1947 		mtx_leave(&sc->sc_deferrals_mtx);
1948 		if (timeout_del(&pd->pd_tmo))
1949 			pfsync_undefer(pd, 0);
1950 	}
1951 
1952 	pd = pool_get(&sc->sc_pool, M_NOWAIT);
1953 	if (pd == NULL)
1954 		return (0);
1955 
1956 	m->m_pkthdr.pf.flags |= PF_TAG_GENERATED;
1957 	SET(st->state_flags, PFSTATE_ACK);
1958 
1959 	pd->pd_st = pf_state_ref(st);
1960 	pd->pd_m = m;
1961 
1962 	mtx_enter(&sc->sc_deferrals_mtx);
1963 	sc->sc_deferred++;
1964 	TAILQ_INSERT_TAIL(&sc->sc_deferrals, pd, pd_entry);
1965 	mtx_leave(&sc->sc_deferrals_mtx);
1966 
1967 	timeout_set_proc(&pd->pd_tmo, pfsync_defer_tmo, pd);
1968 	timeout_add_msec(&pd->pd_tmo, 20);
1969 
1970 	schednetisr(NETISR_PFSYNC);
1971 
1972 	return (1);
1973 }
1974 
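/*
 * Re-inject a deferred packet: states created by route-to rules go back
 * through pf_route()/pf_route6(), everything else takes the normal IP
 * output path.
 */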
1975 void
1976 pfsync_undefer_notify(struct pfsync_deferral *pd)
1977 {
1978 	struct pf_pdesc pdesc;
1979 	struct pf_state *st = pd->pd_st;
1980 
1981 	if (st->rule.ptr->rt == PF_ROUTETO) {
1982 		if (pf_setup_pdesc(&pdesc, st->key[PF_SK_WIRE]->af,
1983 		    st->direction, st->kif, pd->pd_m, NULL) != PF_PASS) {
1984 			m_freem(pd->pd_m);
1985 			return;
1986 		}
1987 		switch (st->key[PF_SK_WIRE]->af) {
1988 		case AF_INET:
1989 			pf_route(&pdesc, st);
1990 			break;
1991 #ifdef INET6
1992 		case AF_INET6:
1993 			pf_route6(&pdesc, st);
1994 			break;
1995 #endif /* INET6 */
1996 		default:
1997 			unhandled_af(st->key[PF_SK_WIRE]->af);
1998 		}
1999 		pd->pd_m = pdesc.m;
2000 	} else {
2001 		switch (st->key[PF_SK_WIRE]->af) {
2002 		case AF_INET:
2003 			ip_output(pd->pd_m, NULL, NULL, 0, NULL, NULL,
2004 			    0);
2005 			break;
2006 #ifdef INET6
2007 		case AF_INET6:
2008 			ip6_output(pd->pd_m, NULL, NULL, 0,
2009 			    NULL, NULL);
2010 			break;
2011 #endif /* INET6 */
2012 		default:
2013 			unhandled_af(st->key[PF_SK_WIRE]->af);
2014 		}
2015 
2016 		pd->pd_m = NULL;
2017 	}
2018 }
2019 
2020 void
2021 pfsync_free_deferral(struct pfsync_deferral *pd)
2022 {
2023 	struct pfsync_softc *sc = pfsyncif;
2024 
2025 	pf_state_unref(pd->pd_st);
2026 	if (pd->pd_m != NULL)
2027 		m_freem(pd->pd_m);
2028 	pool_put(&sc->sc_pool, pd);
2029 }
2030 
2031 void
2032 pfsync_undefer(struct pfsync_deferral *pd, int drop)
2033 {
2034 	struct pfsync_softc *sc = pfsyncif;
2035 
2036 	NET_ASSERT_LOCKED();
2037 
2038 	if (sc == NULL)
2039 		return;
2040 
2041 	CLR(pd->pd_st->state_flags, PFSTATE_ACK);
2042 	if (drop) {
2043 		m_freem(pd->pd_m);
2044 		pd->pd_m = NULL;
2045 	} else
2046 		pfsync_undefer_notify(pd);
2047 
2048 	pfsync_free_deferral(pd);
2049 }
2050 
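/*
 * Deferral timeout handler: the peer did not acknowledge the state in
 * time, so unlink the deferral and transmit the held packet anyway.
 */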
2051 void
2052 pfsync_defer_tmo(void *arg)
2053 {
2054 	struct pfsync_softc *sc = pfsyncif;
2055 	struct pfsync_deferral *pd = arg;
2056 
2057 	mtx_enter(&sc->sc_deferrals_mtx);
2058 	TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
2059 	sc->sc_deferred--;
2060 	mtx_leave(&sc->sc_deferrals_mtx);
2061 	NET_LOCK();
2062 	pfsync_undefer(pd, 0);
2063 	NET_UNLOCK();
2064 }
2065 
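/*
 * Find and release a pending deferral for st, either transmitting the
 * held packet (drop == 0) or discarding it (drop != 0).
 */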
2066 void
2067 pfsync_deferred(struct pf_state *st, int drop)
2068 {
2069 	struct pfsync_softc *sc = pfsyncif;
2070 	struct pfsync_deferral *pd;
2071 
2072 	NET_ASSERT_LOCKED();
2073 
2074 	mtx_enter(&sc->sc_deferrals_mtx);
2075 	TAILQ_FOREACH(pd, &sc->sc_deferrals, pd_entry) {
2076 		if (pd->pd_st == st) {
2077 			if (timeout_del(&pd->pd_tmo)) {
2078 				TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
2079 				sc->sc_deferred--;
2080 				mtx_leave(&sc->sc_deferrals_mtx);
2081 				pfsync_undefer(pd, drop);
2082 			} else
2083 				mtx_leave(&sc->sc_deferrals_mtx);
2084 			return;
2085 		}
2086 	}
2087 	mtx_leave(&sc->sc_deferrals_mtx);
2088 }
2089 
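/*
 * Schedule a (possibly compressed) update message for st.  For TCP
 * states, sc_maxupdates consecutive updates force the packet out via
 * the pfsync netisr.
 */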
2090 void
2091 pfsync_update_state(struct pf_state *st)
2092 {
2093 	struct pfsync_softc *sc = pfsyncif;
2094 	int sync = 0;
2095 
2096 	NET_ASSERT_LOCKED();
2097 
2098 	if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING))
2099 		return;
2100 
2101 	if (ISSET(st->state_flags, PFSTATE_ACK))
2102 		pfsync_deferred(st, 0);
2103 	if (ISSET(st->state_flags, PFSTATE_NOSYNC)) {
2104 		if (st->sync_state != PFSYNC_S_NONE)
2105 			pfsync_q_del(st);
2106 		return;
2107 	}
2108 
2109 	if (sc->sc_len == PFSYNC_MINPKT)
2110 		timeout_add_sec(&sc->sc_tmo, 1);
2111 
2112 	switch (st->sync_state) {
2113 	case PFSYNC_S_UPD_C:
2114 	case PFSYNC_S_UPD:
2115 	case PFSYNC_S_INS:
2116 		/* we're already handling it */
2117 
2118 		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) {
2119 			st->sync_updates++;
2120 			if (st->sync_updates >= sc->sc_maxupdates)
2121 				sync = 1;
2122 		}
2123 		break;
2124 
2125 	case PFSYNC_S_IACK:
2126 		pfsync_q_del(st);
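		/* FALLTHROUGH */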
2127 	case PFSYNC_S_NONE:
2128 		pfsync_q_ins(st, PFSYNC_S_UPD_C);
2129 		st->sync_updates = 0;
2130 		break;
2131 
2132 	default:
2133 		panic("pfsync_update_state: unexpected sync state %d",
2134 		    st->sync_state);
2135 	}
2136 
2137 	if (sync || (getuptime() - st->pfsync_time) < 2)
2138 		schednetisr(NETISR_PFSYNC);
2139 }
2140 
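/*
 * Abandon any bulk update that is in progress and undo the carp
 * demotion that was applied while it was running.
 */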
2141 void
2142 pfsync_cancel_full_update(struct pfsync_softc *sc)
2143 {
2144 	if (timeout_pending(&sc->sc_bulkfail_tmo) ||
2145 	    timeout_pending(&sc->sc_bulk_tmo)) {
2146 #if NCARP > 0
2147 		if (!pfsync_sync_ok)
2148 			carp_group_demote_adj(&sc->sc_if, -1,
2149 			    "pfsync bulk cancelled");
2150 		if (sc->sc_initial_bulk) {
2151 			carp_group_demote_adj(&sc->sc_if, -32,
2152 			    "pfsync init");
2153 			sc->sc_initial_bulk = 0;
2154 		}
2155 #endif
2156 		pfsync_sync_ok = 1;
2157 		DPFPRINTF(LOG_INFO, "cancelling bulk update");
2158 	}
2159 	timeout_del(&sc->sc_bulkfail_tmo);
2160 	timeout_del(&sc->sc_bulk_tmo);
2161 	sc->sc_bulk_next = NULL;
2162 	sc->sc_bulk_last = NULL;
2163 	sc->sc_ureq_sent = 0;
2164 	sc->sc_bulk_tries = 0;
2165 }
2166 
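/*
 * Ask the peers for a full state table dump and arm the bulk failure
 * timeout, scaled by how many packets the dump may take.
 */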
2167 void
2168 pfsync_request_full_update(struct pfsync_softc *sc)
2169 {
2170 	if (sc->sc_sync_ifidx != 0 && ISSET(sc->sc_if.if_flags, IFF_RUNNING)) {
2171 		/* Request a full state table update. */
2172 		sc->sc_ureq_sent = getuptime();
2173 #if NCARP > 0
2174 		if (!sc->sc_link_demoted && pfsync_sync_ok)
2175 			carp_group_demote_adj(&sc->sc_if, 1,
2176 			    "pfsync bulk start");
2177 #endif
2178 		pfsync_sync_ok = 0;
2179 		DPFPRINTF(LOG_INFO, "requesting bulk update");
2180 		timeout_add(&sc->sc_bulkfail_tmo, 4 * hz +
2181 		    pf_pool_limits[PF_LIMIT_STATES].limit /
2182 		    ((sc->sc_if.if_mtu - PFSYNC_MINPKT) /
2183 		    sizeof(struct pfsync_state)));
2184 		pfsync_request_update(0, 0);
2185 	}
2186 }
2187 
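/*
 * Queue a PFSYNC_ACT_UPD_REQ asking the peers for the state identified
 * by (creatorid, id); (0, 0) requests every state, i.e. a bulk update.
 */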
2188 void
2189 pfsync_request_update(u_int32_t creatorid, u_int64_t id)
2190 {
2191 	struct pfsync_softc *sc = pfsyncif;
2192 	struct pfsync_upd_req_item *item;
2193 	size_t nlen, sc_len;
2194 
2195 	/*
2196 	 * this code does nothing to prevent multiple update requests for the
2197 	 * same state from being generated.
2198 	 */
2199 
2200 	item = pool_get(&sc->sc_pool, PR_NOWAIT);
2201 	if (item == NULL) {
2202 		/* XXX stats */
2203 		return;
2204 	}
2205 
2206 	item->ur_msg.id = id;
2207 	item->ur_msg.creatorid = creatorid;
2208 
2209 	do {
2210 		mtx_enter(&sc->sc_upd_req_mtx);
2211 
2212 		nlen = sizeof(struct pfsync_upd_req);
2213 		if (TAILQ_EMPTY(&sc->sc_upd_req_list))
2214 			nlen += sizeof(struct pfsync_subheader);
2215 
2216 		sc_len = atomic_add_long_nv(&sc->sc_len, nlen);
2217 		if (sc_len > sc->sc_if.if_mtu) {
2218 			atomic_sub_long(&sc->sc_len, nlen);
2219 			mtx_leave(&sc->sc_upd_req_mtx);
2220 			pfsync_sendout();
2221 			continue;
2222 		}
2223 
2224 		TAILQ_INSERT_TAIL(&sc->sc_upd_req_list, item, ur_entry);
2225 		mtx_leave(&sc->sc_upd_req_mtx);
2226 	} while (0);
2227 
2228 	schednetisr(NETISR_PFSYNC);
2229 }
2230 
2231 void
2232 pfsync_update_state_req(struct pf_state *st)
2233 {
2234 	struct pfsync_softc *sc = pfsyncif;
2235 
2236 	if (sc == NULL)
2237 		panic("pfsync_update_state_req: nonexistent instance");
2238 
2239 	if (ISSET(st->state_flags, PFSTATE_NOSYNC)) {
2240 		if (st->sync_state != PFSYNC_S_NONE)
2241 			pfsync_q_del(st);
2242 		return;
2243 	}
2244 
2245 	switch (st->sync_state) {
2246 	case PFSYNC_S_UPD_C:
2247 	case PFSYNC_S_IACK:
2248 		pfsync_q_del(st);
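		/* FALLTHROUGH */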
2249 	case PFSYNC_S_NONE:
2250 		pfsync_q_ins(st, PFSYNC_S_UPD);
2251 		schednetisr(NETISR_PFSYNC);
2252 		return;
2253 
2254 	case PFSYNC_S_INS:
2255 	case PFSYNC_S_UPD:
2256 	case PFSYNC_S_DEL:
2257 		/* we're already handling it */
2258 		return;
2259 
2260 	default:
2261 		panic("pfsync_update_state_req: unexpected sync state %d",
2262 		    st->sync_state);
2263 	}
2264 }
2265 
2266 void
2267 pfsync_delete_state(struct pf_state *st)
2268 {
2269 	struct pfsync_softc *sc = pfsyncif;
2270 
2271 	NET_ASSERT_LOCKED();
2272 
2273 	if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING))
2274 		return;
2275 
2276 	if (ISSET(st->state_flags, PFSTATE_ACK))
2277 		pfsync_deferred(st, 1);
2278 	if (ISSET(st->state_flags, PFSTATE_NOSYNC)) {
2279 		if (st->sync_state != PFSYNC_S_NONE)
2280 			pfsync_q_del(st);
2281 		return;
2282 	}
2283 
2284 	if (sc->sc_len == PFSYNC_MINPKT)
2285 		timeout_add_sec(&sc->sc_tmo, 1);
2286 
2287 	switch (st->sync_state) {
2288 	case PFSYNC_S_INS:
2289 		/* we never got to tell the world so just forget about it */
2290 		pfsync_q_del(st);
2291 		return;
2292 
2293 	case PFSYNC_S_UPD_C:
2294 	case PFSYNC_S_UPD:
2295 	case PFSYNC_S_IACK:
2296 		pfsync_q_del(st);
2297 		/*
2298 		 * FALLTHROUGH to putting it on the del list.
2299 		 * Note on reference count bookkeeping:
2300 		 *	pfsync_q_del() drops the reference held for queue
2301 		 *	ownership, but the st entry survives because
2302 		 *	our caller still holds a reference.
2303 		 */
2304 
2305 	case PFSYNC_S_NONE:
2306 		/*
2307 		 * Whether we fall through from above or arrive here directly,
2308 		 * no pfsync queue owns a reference to st at this point.
2309 		 *
2310 		 * Calling pfsync_q_ins() puts st on the del queue and grabs
2311 		 * a reference on behalf of that queue.
2312 		 */
2313 		pfsync_q_ins(st, PFSYNC_S_DEL);
2314 		return;
2315 
2316 	default:
2317 		panic("pfsync_delete_state: unexpected sync state %d",
2318 		    st->sync_state);
2319 	}
2320 }
2321 
2322 void
2323 pfsync_clear_states(u_int32_t creatorid, const char *ifname)
2324 {
2325 	struct pfsync_softc *sc = pfsyncif;
2326 	struct {
2327 		struct pfsync_subheader subh;
2328 		struct pfsync_clr clr;
2329 	} __packed r;
2330 
2331 	NET_ASSERT_LOCKED();
2332 
2333 	if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING))
2334 		return;
2335 
2336 	bzero(&r, sizeof(r));
2337 
2338 	r.subh.action = PFSYNC_ACT_CLR;
2339 	r.subh.len = sizeof(struct pfsync_clr) >> 2;
2340 	r.subh.count = htons(1);
2341 
2342 	strlcpy(r.clr.ifname, ifname, sizeof(r.clr.ifname));
2343 	r.clr.creatorid = creatorid;
2344 
2345 	pfsync_send_plus(&r, sizeof(r));
2346 }
2347 
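/*
 * Put st on sync queue q, accounting for its length in the pending
 * packet and flushing the packet when the interface MTU would be
 * exceeded.
 */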
2348 void
2349 pfsync_q_ins(struct pf_state *st, int q)
2350 {
2351 	struct pfsync_softc *sc = pfsyncif;
2352 	size_t nlen, sc_len;
2353 
2354 	KASSERT(st->sync_state == PFSYNC_S_NONE);
2355 
2356 #if defined(PFSYNC_DEBUG)
2357 	if (sc->sc_len < PFSYNC_MINPKT)
2358 		panic("pfsync pkt len is too low %zu", sc->sc_len);
2359 #endif
2360 	do {
2361 		mtx_enter(&sc->sc_mtx[q]);
2362 
2363 		/*
2364 		 * If two threads are competing to insert the same state, then
2365 		 * there must be exactly one winner.
2366 		 */
2367 		if (st->sync_state != PFSYNC_S_NONE) {
2368 			mtx_leave(&sc->sc_mtx[q]);
2369 			break;
2370 		}
2371 
2372 		nlen = pfsync_qs[q].len;
2373 
2374 		if (TAILQ_EMPTY(&sc->sc_qs[q]))
2375 			nlen += sizeof(struct pfsync_subheader);
2376 
2377 		sc_len = atomic_add_long_nv(&sc->sc_len, nlen);
2378 		if (sc_len > sc->sc_if.if_mtu) {
2379 			atomic_sub_long(&sc->sc_len, nlen);
2380 			mtx_leave(&sc->sc_mtx[q]);
2381 			pfsync_sendout();
2382 			continue;
2383 		}
2384 
2385 		pf_state_ref(st);
2386 
2387 		TAILQ_INSERT_TAIL(&sc->sc_qs[q], st, sync_list);
2388 		st->sync_state = q;
2389 		mtx_leave(&sc->sc_mtx[q]);
2390 	} while (0);
2391 }
2392 
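/*
 * Take st off its current sync queue and drop the reference held on
 * behalf of that queue.
 */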
2393 void
2394 pfsync_q_del(struct pf_state *st)
2395 {
2396 	struct pfsync_softc *sc = pfsyncif;
2397 	int q = st->sync_state;
2398 
2399 	KASSERT(st->sync_state != PFSYNC_S_NONE);
2400 
2401 	mtx_enter(&sc->sc_mtx[q]);
2402 	atomic_sub_long(&sc->sc_len, pfsync_qs[q].len);
2403 	TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
2404 	if (TAILQ_EMPTY(&sc->sc_qs[q]))
2405 		atomic_sub_long(&sc->sc_len, sizeof(struct pfsync_subheader));
2406 	mtx_leave(&sc->sc_mtx[q]);
2407 
2408 	st->sync_state = PFSYNC_S_NONE;
2409 	pf_state_unref(st);
2410 }
2411 
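/*
 * Schedule the TDB's replay counter for synchronisation to the peers;
 * "output" selects whether the transmitted counter is padded by
 * RPL_INCR (see pfsync_out_tdb()).
 */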
2412 void
2413 pfsync_update_tdb(struct tdb *t, int output)
2414 {
2415 	struct pfsync_softc *sc = pfsyncif;
2416 	size_t nlen, sc_len;
2417 
2418 	if (sc == NULL)
2419 		return;
2420 
2421 	if (!ISSET(t->tdb_flags, TDBF_PFSYNC)) {
2422 		do {
2423 			mtx_enter(&sc->sc_tdb_mtx);
2424 			nlen = sizeof(struct pfsync_tdb);
2425 
2426 			if (TAILQ_EMPTY(&sc->sc_tdb_q))
2427 				nlen += sizeof(struct pfsync_subheader);
2428 
2429 			sc_len = atomic_add_long_nv(&sc->sc_len, nlen);
2430 			if (sc_len > sc->sc_if.if_mtu) {
2431 				atomic_sub_long(&sc->sc_len, nlen);
2432 				mtx_leave(&sc->sc_tdb_mtx);
2433 				pfsync_sendout();
2434 				continue;
2435 			}
2436 
2437 			TAILQ_INSERT_TAIL(&sc->sc_tdb_q, t, tdb_sync_entry);
2438 			mtx_leave(&sc->sc_tdb_mtx);
2439 
2440 			SET(t->tdb_flags, TDBF_PFSYNC);
2441 			t->tdb_updates = 0;
2442 		} while (0);
2443 	} else {
2444 		if (++t->tdb_updates >= sc->sc_maxupdates)
2445 			schednetisr(NETISR_PFSYNC);
2446 	}
2447 
2448 	if (output)
2449 		SET(t->tdb_flags, TDBF_PFSYNC_RPL);
2450 	else
2451 		CLR(t->tdb_flags, TDBF_PFSYNC_RPL);
2452 }
2453 
2454 void
2455 pfsync_delete_tdb(struct tdb *t)
2456 {
2457 	struct pfsync_softc *sc = pfsyncif;
2458 	size_t nlen;
2459 
2460 	if (sc == NULL || !ISSET(t->tdb_flags, TDBF_PFSYNC))
2461 		return;
2462 
2463 	mtx_enter(&sc->sc_tdb_mtx);
2464 
2465 	TAILQ_REMOVE(&sc->sc_tdb_q, t, tdb_sync_entry);
2466 	CLR(t->tdb_flags, TDBF_PFSYNC);
2467 
2468 	nlen = sizeof(struct pfsync_tdb);
2469 	if (TAILQ_EMPTY(&sc->sc_tdb_q))
2470 		nlen += sizeof(struct pfsync_subheader);
2471 	atomic_sub_long(&sc->sc_len, nlen);
2472 
2473 	mtx_leave(&sc->sc_tdb_mtx);
2474 }
2475 
2476 void
2477 pfsync_out_tdb(struct tdb *t, void *buf)
2478 {
2479 	struct pfsync_tdb *ut = buf;
2480 
2481 	bzero(ut, sizeof(*ut));
2482 	ut->spi = t->tdb_spi;
2483 	bcopy(&t->tdb_dst, &ut->dst, sizeof(ut->dst));
2484 	/*
2485 	 * When a failover happens, the master's rpl is probably above
2486 	 * what we see here (we may be up to a second late), so
2487 	 * increase it a bit for outbound tdbs to manage most such
2488 	 * situations.
2489 	 *
2490 	 * For now, just add an offset that is likely to be larger
2491 	 * than the number of packets we can see in one second. The RFC
2492 	 * just says the next packet must have a higher seq value.
2493 	 *
2494 	 * XXX What is a good algorithm for this? We could use
2495 	 * a rate-determined increase, but to know it, we would have
2496 	 * to extend struct tdb.
2497 	 * XXX pt->rpl can wrap over MAXINT, but if so the real tdb
2498 	 * will soon be replaced anyway. For now, just don't handle
2499 	 * this edge case.
2500 	 */
2501 #define RPL_INCR 16384
2502 	ut->rpl = htobe64(t->tdb_rpl + (ISSET(t->tdb_flags, TDBF_PFSYNC_RPL) ?
2503 	    RPL_INCR : 0));
2504 	ut->cur_bytes = htobe64(t->tdb_cur_bytes);
2505 	ut->sproto = t->tdb_sproto;
2506 	ut->rdomain = htons(t->tdb_rdomain);
2507 }
2508 
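/*
 * A peer asked for a bulk update: walk the global state list and queue
 * an update for every eligible state, bracketed by bulk start and end
 * status messages.
 */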
2509 void
2510 pfsync_bulk_start(void)
2511 {
2512 	struct pfsync_softc *sc = pfsyncif;
2513 
2514 	DPFPRINTF(LOG_INFO, "received bulk update request");
2515 
2516 	if (TAILQ_EMPTY(&state_list))
2517 		pfsync_bulk_status(PFSYNC_BUS_END);
2518 	else {
2519 		sc->sc_ureq_received = getuptime();
2520 
2521 		if (sc->sc_bulk_next == NULL) {
2522 			PF_STATE_ENTER_READ();
2523 			sc->sc_bulk_next = TAILQ_FIRST(&state_list);
2524 			pf_state_ref(sc->sc_bulk_next);
2525 			PF_STATE_EXIT_READ();
2526 		}
2527 		sc->sc_bulk_last = sc->sc_bulk_next;
2528 		pf_state_ref(sc->sc_bulk_last);
2529 
2530 		pfsync_bulk_status(PFSYNC_BUS_START);
2531 		timeout_add(&sc->sc_bulk_tmo, 0);
2532 	}
2533 }
2534 
2535 void
2536 pfsync_bulk_update(void *arg)
2537 {
2538 	struct pfsync_softc *sc;
2539 	struct pf_state *st, *st_next;
2540 	int i = 0;
2541 
2542 	NET_LOCK();
2543 	sc = pfsyncif;
2544 	if (sc == NULL)
2545 		goto out;
2546 	st = sc->sc_bulk_next;
2547 	sc->sc_bulk_next = NULL;
2548 
2549 	for (;;) {
2550 		if (st->sync_state == PFSYNC_S_NONE &&
2551 		    st->timeout < PFTM_MAX &&
2552 		    st->pfsync_time <= sc->sc_ureq_received) {
2553 			pfsync_update_state_req(st);
2554 			i++;
2555 		}
2556 
2557 		/*
2558 		 * XXX It is not obvious what prevents an infinite bulk update;
2559 		 * one could happen if the sc_bulk_last state expires before we
2560 		 * iterate through the whole list.
2561 		 */
2562 		PF_STATE_ENTER_READ();
2563 		st_next = TAILQ_NEXT(st, entry_list);
2564 		pf_state_unref(st);
2565 		st = st_next;
2566 		if (st == NULL)
2567 			st = TAILQ_FIRST(&state_list);
2568 		pf_state_ref(st);
2569 		PF_STATE_EXIT_READ();
2570 
2571 		if ((st == NULL) || (st == sc->sc_bulk_last)) {
2572 			/* we're done */
2573 			pf_state_unref(sc->sc_bulk_last);
2574 			sc->sc_bulk_last = NULL;
2575 			pfsync_bulk_status(PFSYNC_BUS_END);
2576 			break;
2577 		}
2578 
2579 		if (i > 1 && (sc->sc_if.if_mtu - sc->sc_len) <
2580 		    sizeof(struct pfsync_state)) {
2581 			/* we've filled a packet */
2582 			sc->sc_bulk_next = st;
2583 			timeout_add(&sc->sc_bulk_tmo, 1);
2584 			break;
2585 		}
2586 	}
2587  out:
2588 	NET_UNLOCK();
2589 }
2590 
2591 void
2592 pfsync_bulk_status(u_int8_t status)
2593 {
2594 	struct {
2595 		struct pfsync_subheader subh;
2596 		struct pfsync_bus bus;
2597 	} __packed r;
2598 
2599 	struct pfsync_softc *sc = pfsyncif;
2600 
2601 	bzero(&r, sizeof(r));
2602 
2603 	r.subh.action = PFSYNC_ACT_BUS;
2604 	r.subh.len = sizeof(struct pfsync_bus) >> 2;
2605 	r.subh.count = htons(1);
2606 
2607 	r.bus.creatorid = pf_status.hostid;
2608 	r.bus.endtime = htonl(getuptime() - sc->sc_ureq_received);
2609 	r.bus.status = status;
2610 
2611 	pfsync_send_plus(&r, sizeof(r));
2612 }
2613 
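/*
 * Bulk update failure timeout: retry the request up to
 * PFSYNC_MAX_BULKTRIES times, then give up and carry on as if the
 * transfer had succeeded.
 */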
2614 void
2615 pfsync_bulk_fail(void *arg)
2616 {
2617 	struct pfsync_softc *sc;
2618 
2619 	NET_LOCK();
2620 	sc = pfsyncif;
2621 	if (sc == NULL)
2622 		goto out;
2623 	if (sc->sc_bulk_tries++ < PFSYNC_MAX_BULKTRIES) {
2624 		/* Try again */
2625 		timeout_add_sec(&sc->sc_bulkfail_tmo, 5);
2626 		pfsync_request_update(0, 0);
2627 	} else {
2628 		/* Pretend the transfer was ok */
2629 		sc->sc_ureq_sent = 0;
2630 		sc->sc_bulk_tries = 0;
2631 #if NCARP > 0
2632 		if (!pfsync_sync_ok)
2633 			carp_group_demote_adj(&sc->sc_if, -1,
2634 			    sc->sc_link_demoted ?
2635 			    "pfsync link state up" :
2636 			    "pfsync bulk fail");
2637 		if (sc->sc_initial_bulk) {
2638 			carp_group_demote_adj(&sc->sc_if, -32,
2639 			    "pfsync init");
2640 			sc->sc_initial_bulk = 0;
2641 		}
2642 #endif
2643 		pfsync_sync_ok = 1;
2644 		sc->sc_link_demoted = 0;
2645 		DPFPRINTF(LOG_ERR, "failed to receive bulk update");
2646 	}
2647  out:
2648 	NET_UNLOCK();
2649 }
2650 
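/*
 * Append an opaque blob (clear or bulk status messages) to the pending
 * packet and flush it immediately.
 */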
2651 void
2652 pfsync_send_plus(void *plus, size_t pluslen)
2653 {
2654 	struct pfsync_softc *sc = pfsyncif;
2655 
2656 	if (sc->sc_len + pluslen > sc->sc_if.if_mtu)
2657 		pfsync_sendout();
2658 
2659 	sc->sc_plus = plus;
2660 	sc->sc_len += (sc->sc_pluslen = pluslen);
2661 
2662 	pfsync_sendout();
2663 }
2664 
2665 int
2666 pfsync_up(void)
2667 {
2668 	struct pfsync_softc *sc = pfsyncif;
2669 
2670 	if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING))
2671 		return (0);
2672 
2673 	return (1);
2674 }
2675 
2676 int
2677 pfsync_state_in_use(struct pf_state *st)
2678 {
2679 	struct pfsync_softc *sc = pfsyncif;
2680 
2681 	if (sc == NULL)
2682 		return (0);
2683 
2684 	if (st->sync_state != PFSYNC_S_NONE ||
2685 	    st == sc->sc_bulk_next ||
2686 	    st == sc->sc_bulk_last)
2687 		return (1);
2688 
2689 	return (0);
2690 }
2691 
2692 void
2693 pfsync_timeout(void *arg)
2694 {
2695 	NET_LOCK();
2696 	pfsync_sendout();
2697 	NET_UNLOCK();
2698 }
2699 
2700 /* this is a softnet/netisr handler */
2701 void
2702 pfsyncintr(void)
2703 {
2704 	pfsync_sendout();
2705 }
2706 
2707 int
2708 pfsync_sysctl_pfsyncstat(void *oldp, size_t *oldlenp, void *newp)
2709 {
2710 	struct pfsyncstats pfsyncstat;
2711 
2712 	CTASSERT(sizeof(pfsyncstat) == (pfsyncs_ncounters * sizeof(uint64_t)));
2713 	memset(&pfsyncstat, 0, sizeof pfsyncstat);
2714 	counters_read(pfsynccounters, (uint64_t *)&pfsyncstat,
2715 	    pfsyncs_ncounters);
2716 	return (sysctl_rdstruct(oldp, oldlenp, newp,
2717 	    &pfsyncstat, sizeof(pfsyncstat)));
2718 }
2719 
2720 int
2721 pfsync_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
2722     size_t newlen)
2723 {
2724 	/* All sysctl names at this level are terminal. */
2725 	if (namelen != 1)
2726 		return (ENOTDIR);
2727 
2728 	switch (name[0]) {
2729 	case PFSYNCCTL_STATS:
2730 		return (pfsync_sysctl_pfsyncstat(oldp, oldlenp, newp));
2731 	default:
2732 		return (ENOPROTOOPT);
2733 	}
2734 }
2735