xref: /openbsd-src/sys/net/if_pfsync.c (revision ff0e7be1ebbcc809ea8ad2b6dafe215824da9e46)
1 /*	$OpenBSD: if_pfsync.c,v 1.317 2023/06/05 08:45:20 sashan Exp $	*/
2 
3 /*
4  * Copyright (c) 2002 Michael Shalayeff
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
20  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22  * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
24  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
25  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26  * THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*
30  * Copyright (c) 2009 David Gwynne <dlg@openbsd.org>
31  *
32  * Permission to use, copy, modify, and distribute this software for any
33  * purpose with or without fee is hereby granted, provided that the above
34  * copyright notice and this permission notice appear in all copies.
35  *
36  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
37  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
38  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
39  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
40  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
41  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
42  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
43  */
44 
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/time.h>
48 #include <sys/malloc.h>
49 #include <sys/mbuf.h>
50 #include <sys/socket.h>
51 #include <sys/ioctl.h>
52 #include <sys/timeout.h>
53 #include <sys/kernel.h>
54 #include <sys/sysctl.h>
55 #include <sys/pool.h>
56 #include <sys/syslog.h>
57 
58 #include <net/if.h>
59 #include <net/if_types.h>
60 #include <net/bpf.h>
61 #include <net/netisr.h>
62 
63 #include <netinet/in.h>
64 #include <netinet/if_ether.h>
65 #include <netinet/ip.h>
66 #include <netinet/in_var.h>
67 #include <netinet/ip_var.h>
68 #include <netinet/ip_ipsp.h>
69 #include <netinet/ip_icmp.h>
70 #include <netinet/icmp6.h>
71 #include <netinet/tcp.h>
72 #include <netinet/tcp_seq.h>
73 #include <netinet/tcp_fsm.h>
74 #include <netinet/udp.h>
75 
76 #ifdef INET6
77 #include <netinet6/in6_var.h>
78 #include <netinet/ip6.h>
79 #include <netinet6/ip6_var.h>
80 #include <netinet6/nd6.h>
81 #endif /* INET6 */
82 
83 #include "carp.h"
84 #if NCARP > 0
85 #include <netinet/ip_carp.h>
86 #endif
87 
88 #define PF_DEBUGNAME	"pfsync: "
89 #include <net/pfvar.h>
90 #include <net/pfvar_priv.h>
91 #include <net/if_pfsync.h>
92 
93 #include "bpfilter.h"
94 #include "pfsync.h"
95 
96 #define PFSYNC_DEFER_NSEC 20000000ULL
97 
98 #define PFSYNC_MINPKT ( \
99 	sizeof(struct ip) + \
100 	sizeof(struct pfsync_header))
101 
102 int	pfsync_upd_tcp(struct pf_state *, struct pfsync_state_peer *,
103 	    struct pfsync_state_peer *);
104 
105 int	pfsync_in_clr(caddr_t, int, int, int);
106 int	pfsync_in_iack(caddr_t, int, int, int);
107 int	pfsync_in_upd_c(caddr_t, int, int, int);
108 int	pfsync_in_ureq(caddr_t, int, int, int);
109 int	pfsync_in_del(caddr_t, int, int, int);
110 int	pfsync_in_del_c(caddr_t, int, int, int);
111 int	pfsync_in_bus(caddr_t, int, int, int);
112 int	pfsync_in_tdb(caddr_t, int, int, int);
113 int	pfsync_in_ins(caddr_t, int, int, int);
114 int	pfsync_in_upd(caddr_t, int, int, int);
115 int	pfsync_in_eof(caddr_t, int, int, int);
116 
117 int	pfsync_in_error(caddr_t, int, int, int);
118 
119 void	pfsync_update_state_locked(struct pf_state *);
120 
121 const struct {
122 	int	(*in)(caddr_t, int, int, int);
123 	size_t	len;
124 } pfsync_acts[] = {
125 	/* PFSYNC_ACT_CLR */
126 	{ pfsync_in_clr,	sizeof(struct pfsync_clr) },
127 	/* PFSYNC_ACT_OINS */
128 	{ pfsync_in_error,	0 },
129 	/* PFSYNC_ACT_INS_ACK */
130 	{ pfsync_in_iack,	sizeof(struct pfsync_ins_ack) },
131 	/* PFSYNC_ACT_OUPD */
132 	{ pfsync_in_error,	0 },
133 	/* PFSYNC_ACT_UPD_C */
134 	{ pfsync_in_upd_c,	sizeof(struct pfsync_upd_c) },
135 	/* PFSYNC_ACT_UPD_REQ */
136 	{ pfsync_in_ureq,	sizeof(struct pfsync_upd_req) },
137 	/* PFSYNC_ACT_DEL */
138 	{ pfsync_in_del,	sizeof(struct pfsync_state) },
139 	/* PFSYNC_ACT_DEL_C */
140 	{ pfsync_in_del_c,	sizeof(struct pfsync_del_c) },
141 	/* PFSYNC_ACT_INS_F */
142 	{ pfsync_in_error,	0 },
143 	/* PFSYNC_ACT_DEL_F */
144 	{ pfsync_in_error,	0 },
145 	/* PFSYNC_ACT_BUS */
146 	{ pfsync_in_bus,	sizeof(struct pfsync_bus) },
147 	/* PFSYNC_ACT_OTDB */
148 	{ pfsync_in_error,	0 },
149 	/* PFSYNC_ACT_EOF */
150 	{ pfsync_in_error,	0 },
151 	/* PFSYNC_ACT_INS */
152 	{ pfsync_in_ins,	sizeof(struct pfsync_state) },
153 	/* PFSYNC_ACT_UPD */
154 	{ pfsync_in_upd,	sizeof(struct pfsync_state) },
155 	/* PFSYNC_ACT_TDB */
156 	{ pfsync_in_tdb,	sizeof(struct pfsync_tdb) },
157 };
158 
159 struct pfsync_q {
160 	void		(*write)(struct pf_state *, void *);
161 	size_t		len;
162 	u_int8_t	action;
163 };
164 
165 /* we have one of these for every PFSYNC_S_ */
166 void	pfsync_out_state(struct pf_state *, void *);
167 void	pfsync_out_iack(struct pf_state *, void *);
168 void	pfsync_out_upd_c(struct pf_state *, void *);
169 void	pfsync_out_del(struct pf_state *, void *);
170 
171 struct pfsync_q pfsync_qs[] = {
172 	{ pfsync_out_iack,  sizeof(struct pfsync_ins_ack), PFSYNC_ACT_INS_ACK },
173 	{ pfsync_out_upd_c, sizeof(struct pfsync_upd_c),   PFSYNC_ACT_UPD_C },
174 	{ pfsync_out_del,   sizeof(struct pfsync_del_c),   PFSYNC_ACT_DEL_C },
175 	{ pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_INS },
176 	{ pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_UPD }
177 };
178 
179 void	pfsync_q_ins(struct pf_state *, int);
180 void	pfsync_q_del(struct pf_state *);
181 
182 struct pfsync_upd_req_item {
183 	TAILQ_ENTRY(pfsync_upd_req_item)	ur_entry;
184 	TAILQ_ENTRY(pfsync_upd_req_item)	ur_snap;
185 	struct pfsync_upd_req			ur_msg;
186 };
187 TAILQ_HEAD(pfsync_upd_reqs, pfsync_upd_req_item);
188 
189 struct pfsync_deferral {
190 	TAILQ_ENTRY(pfsync_deferral)		 pd_entry;
191 	struct pf_state				*pd_st;
192 	struct mbuf				*pd_m;
193 	uint64_t				 pd_deadline;
194 };
195 TAILQ_HEAD(pfsync_deferrals, pfsync_deferral);
196 
197 #define PFSYNC_PLSIZE	MAX(sizeof(struct pfsync_upd_req_item), \
198 			    sizeof(struct pfsync_deferral))
199 
200 void	pfsync_out_tdb(struct tdb *, void *);
201 
202 struct pfsync_softc {
203 	struct ifnet		 sc_if;
204 	unsigned int		 sc_sync_ifidx;
205 
206 	struct pool		 sc_pool;
207 
208 	struct ip_moptions	 sc_imo;
209 
210 	struct in_addr		 sc_sync_peer;
211 	u_int8_t		 sc_maxupdates;
212 
213 	struct ip		 sc_template;
214 
215 	struct pf_state_queue	 sc_qs[PFSYNC_S_COUNT];
216 	struct mutex		 sc_st_mtx;
217 	size_t			 sc_len;
218 
219 	struct pfsync_upd_reqs	 sc_upd_req_list;
220 	struct mutex		 sc_upd_req_mtx;
221 
222 	int			 sc_initial_bulk;
223 	int			 sc_link_demoted;
224 
225 	int			 sc_defer;
226 	struct pfsync_deferrals	 sc_deferrals;
227 	u_int			 sc_deferred;
228 	struct mutex		 sc_deferrals_mtx;
229 	struct timeout		 sc_deferrals_tmo;
230 
231 	void			*sc_plus;
232 	size_t			 sc_pluslen;
233 
234 	u_int32_t		 sc_ureq_sent;
235 	int			 sc_bulk_tries;
236 	struct timeout		 sc_bulkfail_tmo;
237 
238 	u_int32_t		 sc_ureq_received;
239 	struct pf_state		*sc_bulk_next;
240 	struct pf_state		*sc_bulk_last;
241 	struct timeout		 sc_bulk_tmo;
242 
243 	TAILQ_HEAD(, tdb)	 sc_tdb_q;
244 	struct mutex		 sc_tdb_mtx;
245 
246 	struct task		 sc_ltask;
247 	struct task		 sc_dtask;
248 
249 	struct timeout		 sc_tmo;
250 };
251 
252 struct pfsync_snapshot {
253 	struct pfsync_softc	*sn_sc;
254 	struct pf_state_queue	 sn_qs[PFSYNC_S_COUNT];
255 	struct pfsync_upd_reqs	 sn_upd_req_list;
256 	TAILQ_HEAD(, tdb)	 sn_tdb_q;
257 	size_t			 sn_len;
258 	void			*sn_plus;
259 	size_t			 sn_pluslen;
260 };
261 
262 struct pfsync_softc	*pfsyncif = NULL;
263 struct cpumem		*pfsynccounters;
264 
265 void	pfsyncattach(int);
266 int	pfsync_clone_create(struct if_clone *, int);
267 int	pfsync_clone_destroy(struct ifnet *);
268 void	pfsync_update_net_tdb(struct pfsync_tdb *);
269 int	pfsyncoutput(struct ifnet *, struct mbuf *, struct sockaddr *,
270 	    struct rtentry *);
271 int	pfsyncioctl(struct ifnet *, u_long, caddr_t);
272 void	pfsyncstart(struct ifqueue *);
273 void	pfsync_syncdev_state(void *);
274 void	pfsync_ifdetach(void *);
275 
276 void	pfsync_deferred(struct pf_state *, int);
277 void	pfsync_undefer(struct pfsync_deferral *, int);
278 void	pfsync_deferrals_tmo(void *);
279 
280 void	pfsync_cancel_full_update(struct pfsync_softc *);
281 void	pfsync_request_full_update(struct pfsync_softc *);
282 void	pfsync_request_update(u_int32_t, u_int64_t);
283 void	pfsync_update_state_req(struct pf_state *);
284 
285 void	pfsync_drop(struct pfsync_softc *);
286 void	pfsync_sendout(void);
287 void	pfsync_send_plus(void *, size_t);
288 void	pfsync_timeout(void *);
289 void	pfsync_tdb_timeout(void *);
290 
291 void	pfsync_bulk_start(void);
292 void	pfsync_bulk_status(u_int8_t);
293 void	pfsync_bulk_update(void *);
294 void	pfsync_bulk_fail(void *);
295 
296 void	pfsync_grab_snapshot(struct pfsync_snapshot *, struct pfsync_softc *);
297 void	pfsync_drop_snapshot(struct pfsync_snapshot *);
298 
299 void	pfsync_send_dispatch(void *);
300 void	pfsync_send_pkt(struct mbuf *);
301 
302 static struct mbuf_queue	pfsync_mq;
303 static struct task	pfsync_task =
304     TASK_INITIALIZER(pfsync_send_dispatch, &pfsync_mq);
305 
306 #define PFSYNC_MAX_BULKTRIES	12
307 int	pfsync_sync_ok;
308 
309 struct if_clone	pfsync_cloner =
310     IF_CLONE_INITIALIZER("pfsync", pfsync_clone_create, pfsync_clone_destroy);
311 
312 void
313 pfsyncattach(int npfsync)
314 {
315 	if_clone_attach(&pfsync_cloner);
316 	pfsynccounters = counters_alloc(pfsyncs_ncounters);
317 	mq_init(&pfsync_mq, 4096, IPL_MPFLOOR);
318 }
319 
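/*
 * Create the single pfsync(4) interface instance: initialize the
 * per-queue lists, mutexes, timeouts and the deferral/update-request
 * pool, and attach the interface.  Only unit 0 is supported.
 */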
320 int
321 pfsync_clone_create(struct if_clone *ifc, int unit)
322 {
323 	struct pfsync_softc *sc;
324 	struct ifnet *ifp;
325 	int q;
326 
327 	if (unit != 0)
328 		return (EINVAL);
329 
330 	pfsync_sync_ok = 1;
331 
332 	sc = malloc(sizeof(*pfsyncif), M_DEVBUF, M_WAITOK|M_ZERO);
333 	for (q = 0; q < PFSYNC_S_COUNT; q++)
334 		TAILQ_INIT(&sc->sc_qs[q]);
335 	mtx_init(&sc->sc_st_mtx, IPL_MPFLOOR);
336 
337 	pool_init(&sc->sc_pool, PFSYNC_PLSIZE, 0, IPL_MPFLOOR, 0, "pfsync",
338 	    NULL);
339 	TAILQ_INIT(&sc->sc_upd_req_list);
340 	mtx_init(&sc->sc_upd_req_mtx, IPL_MPFLOOR);
341 	TAILQ_INIT(&sc->sc_deferrals);
342 	mtx_init(&sc->sc_deferrals_mtx, IPL_MPFLOOR);
343 	timeout_set_proc(&sc->sc_deferrals_tmo, pfsync_deferrals_tmo, sc);
344 	task_set(&sc->sc_ltask, pfsync_syncdev_state, sc);
345 	task_set(&sc->sc_dtask, pfsync_ifdetach, sc);
346 	sc->sc_deferred = 0;
347 
348 	TAILQ_INIT(&sc->sc_tdb_q);
349 	mtx_init(&sc->sc_tdb_mtx, IPL_MPFLOOR);
350 
351 	sc->sc_len = PFSYNC_MINPKT;
352 	sc->sc_maxupdates = 128;
353 
354 	sc->sc_imo.imo_membership = mallocarray(IP_MIN_MEMBERSHIPS,
355 	    sizeof(struct in_multi *), M_IPMOPTS, M_WAITOK|M_ZERO);
356 	sc->sc_imo.imo_max_memberships = IP_MIN_MEMBERSHIPS;
357 
358 	ifp = &sc->sc_if;
359 	snprintf(ifp->if_xname, sizeof ifp->if_xname, "pfsync%d", unit);
360 	ifp->if_softc = sc;
361 	ifp->if_ioctl = pfsyncioctl;
362 	ifp->if_output = pfsyncoutput;
363 	ifp->if_qstart = pfsyncstart;
364 	ifp->if_type = IFT_PFSYNC;
365 	ifp->if_hdrlen = sizeof(struct pfsync_header);
366 	ifp->if_mtu = ETHERMTU;
367 	ifp->if_xflags = IFXF_CLONED | IFXF_MPSAFE;
368 	timeout_set_proc(&sc->sc_tmo, pfsync_timeout, NULL);
369 	timeout_set_proc(&sc->sc_bulk_tmo, pfsync_bulk_update, NULL);
370 	timeout_set_proc(&sc->sc_bulkfail_tmo, pfsync_bulk_fail, NULL);
371 
372 	if_attach(ifp);
373 	if_alloc_sadl(ifp);
374 
375 #if NCARP > 0
376 	if_addgroup(ifp, "carp");
377 #endif
378 
379 #if NBPFILTER > 0
380 	bpfattach(&sc->sc_if.if_bpf, ifp, DLT_PFSYNC, PFSYNC_HDRLEN);
381 #endif
382 
383 	pfsyncif = sc;
384 
385 	return (0);
386 }
387 
388 int
389 pfsync_clone_destroy(struct ifnet *ifp)
390 {
391 	struct pfsync_softc *sc = ifp->if_softc;
392 	struct ifnet *ifp0;
393 	struct pfsync_deferral *pd;
394 	struct pfsync_deferrals	 deferrals;
395 
396 	NET_LOCK();
397 
398 #if NCARP > 0
399 	if (!pfsync_sync_ok)
400 		carp_group_demote_adj(&sc->sc_if, -1, "pfsync destroy");
401 	if (sc->sc_link_demoted)
402 		carp_group_demote_adj(&sc->sc_if, -1, "pfsync destroy");
403 #endif
404 	if ((ifp0 = if_get(sc->sc_sync_ifidx)) != NULL) {
405 		if_linkstatehook_del(ifp0, &sc->sc_ltask);
406 		if_detachhook_del(ifp0, &sc->sc_dtask);
407 	}
408 	if_put(ifp0);
409 
410 	/* XXXSMP breaks atomicity */
411 	NET_UNLOCK();
412 	if_detach(ifp);
413 	NET_LOCK();
414 
415 	pfsync_drop(sc);
416 
417 	if (sc->sc_deferred > 0) {
418 		TAILQ_INIT(&deferrals);
419 		mtx_enter(&sc->sc_deferrals_mtx);
420 		TAILQ_CONCAT(&deferrals, &sc->sc_deferrals, pd_entry);
421 		sc->sc_deferred = 0;
422 		mtx_leave(&sc->sc_deferrals_mtx);
423 
424 		while ((pd = TAILQ_FIRST(&deferrals)) != NULL) {
425 			TAILQ_REMOVE(&deferrals, pd, pd_entry);
426 			pfsync_undefer(pd, 0);
427 		}
428 	}
429 
430 	pfsyncif = NULL;
431 	timeout_del(&sc->sc_bulkfail_tmo);
432 	timeout_del(&sc->sc_bulk_tmo);
433 	timeout_del(&sc->sc_tmo);
434 
435 	NET_UNLOCK();
436 
437 	pool_destroy(&sc->sc_pool);
438 	free(sc->sc_imo.imo_membership, M_IPMOPTS,
439 	    sc->sc_imo.imo_max_memberships * sizeof(struct in_multi *));
440 	free(sc, M_DEVBUF, sizeof(*sc));
441 
442 	return (0);
443 }
444 
445 /*
446  * Start output on the pfsync interface.
447  */
448 void
449 pfsyncstart(struct ifqueue *ifq)
450 {
451 	ifq_purge(ifq);
452 }
453 
454 void
455 pfsync_syncdev_state(void *arg)
456 {
457 	struct pfsync_softc *sc = arg;
458 	struct ifnet *ifp;
459 
460 	if ((sc->sc_if.if_flags & IFF_UP) == 0)
461 		return;
462 	if ((ifp = if_get(sc->sc_sync_ifidx)) == NULL)
463 		return;
464 
465 	if (ifp->if_link_state == LINK_STATE_DOWN) {
466 		sc->sc_if.if_flags &= ~IFF_RUNNING;
467 		if (!sc->sc_link_demoted) {
468 #if NCARP > 0
469 			carp_group_demote_adj(&sc->sc_if, 1,
470 			    "pfsync link state down");
471 #endif
472 			sc->sc_link_demoted = 1;
473 		}
474 
475 		/* drop everything */
476 		timeout_del(&sc->sc_tmo);
477 		pfsync_drop(sc);
478 
479 		pfsync_cancel_full_update(sc);
480 	} else if (sc->sc_link_demoted) {
481 		sc->sc_if.if_flags |= IFF_RUNNING;
482 
483 		pfsync_request_full_update(sc);
484 	}
485 
486 	if_put(ifp);
487 }
488 
489 void
490 pfsync_ifdetach(void *arg)
491 {
492 	struct pfsync_softc *sc = arg;
493 	struct ifnet *ifp;
494 
495 	if ((ifp = if_get(sc->sc_sync_ifidx)) != NULL) {
496 		if_linkstatehook_del(ifp, &sc->sc_ltask);
497 		if_detachhook_del(ifp, &sc->sc_dtask);
498 	}
499 	if_put(ifp);
500 
501 	sc->sc_sync_ifidx = 0;
502 }
503 
504 int
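/*
 * pfsync_input() parses a received pfsync packet.  The on-wire layout
 * (after the IP header) is a struct pfsync_header followed by a
 * sequence of subheaders, each announcing an action, a per-message
 * length (in 32-bit words) and a message count:
 *
 *	struct ip | struct pfsync_header |
 *	    { struct pfsync_subheader | count messages } ...
 *
 * Each subheader's messages are handed to pfsync_acts[action].in().
 */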
505 pfsync_input(struct mbuf **mp, int *offp, int proto, int af)
506 {
507 	struct mbuf *n, *m = *mp;
508 	struct pfsync_softc *sc = pfsyncif;
509 	struct ip *ip = mtod(m, struct ip *);
510 	struct pfsync_header *ph;
511 	struct pfsync_subheader subh;
512 	int offset, noff, len, count, mlen, flags = 0;
513 	int e;
514 
515 	NET_ASSERT_LOCKED();
516 
517 	pfsyncstat_inc(pfsyncs_ipackets);
518 
519 	/* verify that we have a sync interface configured */
520 	if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING) ||
521 	    sc->sc_sync_ifidx == 0 || !pf_status.running)
522 		goto done;
523 
524 	/* verify that the packet came in on the right interface */
525 	if (sc->sc_sync_ifidx != m->m_pkthdr.ph_ifidx) {
526 		pfsyncstat_inc(pfsyncs_badif);
527 		goto done;
528 	}
529 
530 	sc->sc_if.if_ipackets++;
531 	sc->sc_if.if_ibytes += m->m_pkthdr.len;
532 
533 	/* verify that the IP TTL is 255. */
534 	if (ip->ip_ttl != PFSYNC_DFLTTL) {
535 		pfsyncstat_inc(pfsyncs_badttl);
536 		goto done;
537 	}
538 
539 	offset = ip->ip_hl << 2;
540 	n = m_pulldown(m, offset, sizeof(*ph), &noff);
541 	if (n == NULL) {
542 		pfsyncstat_inc(pfsyncs_hdrops);
543 		return IPPROTO_DONE;
544 	}
545 	ph = (struct pfsync_header *)(n->m_data + noff);
546 
547 	/* verify the version */
548 	if (ph->version != PFSYNC_VERSION) {
549 		pfsyncstat_inc(pfsyncs_badver);
550 		goto done;
551 	}
552 	len = ntohs(ph->len) + offset;
553 	if (m->m_pkthdr.len < len) {
554 		pfsyncstat_inc(pfsyncs_badlen);
555 		goto done;
556 	}
557 
558 	if (!bcmp(&ph->pfcksum, &pf_status.pf_chksum, PF_MD5_DIGEST_LENGTH))
559 		flags = PFSYNC_SI_CKSUM;
560 
561 	offset += sizeof(*ph);
562 	while (offset <= len - sizeof(subh)) {
563 		m_copydata(m, offset, sizeof(subh), &subh);
564 		offset += sizeof(subh);
565 
566 		mlen = subh.len << 2;
567 		count = ntohs(subh.count);
568 
569 		if (subh.action >= PFSYNC_ACT_MAX ||
570 		    subh.action >= nitems(pfsync_acts) ||
571 		    mlen < pfsync_acts[subh.action].len) {
572 			/*
573 			 * Subheaders are always followed by at least one
574 			 * message, so if the peer is new enough to tell us
575 			 * how big its messages are, then we know enough to
576 			 * skip them.
577 			 */
578 			if (count > 0 && mlen > 0) {
579 				offset += count * mlen;
580 				continue;
581 			}
582 			pfsyncstat_inc(pfsyncs_badact);
583 			goto done;
584 		}
585 
586 		n = m_pulldown(m, offset, mlen * count, &noff);
587 		if (n == NULL) {
588 			pfsyncstat_inc(pfsyncs_badlen);
589 			return IPPROTO_DONE;
590 		}
591 
592 		e = pfsync_acts[subh.action].in(n->m_data + noff, mlen, count,
593 		    flags);
594 		if (e != 0)
595 			goto done;
596 
597 		offset += mlen * count;
598 	}
599 
600 done:
601 	m_freem(m);
602 	return IPPROTO_DONE;
603 }
604 
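/*
 * PFSYNC_ACT_CLR: flush all states matching the given creatorid (and,
 * if an interface name was supplied, only states bound to that
 * interface).  Cleared states are flagged PFSTATE_NOSYNC so their
 * removal is not echoed back to the peer.
 */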
605 int
606 pfsync_in_clr(caddr_t buf, int len, int count, int flags)
607 {
608 	struct pfsync_clr *clr;
609 	struct pf_state *st, *nexts;
610 	struct pfi_kif *kif;
611 	u_int32_t creatorid;
612 	int i;
613 
614 	PF_LOCK();
615 	for (i = 0; i < count; i++) {
616 		clr = (struct pfsync_clr *)(buf + len * i);
617 		kif = NULL;
618 		creatorid = clr->creatorid;
619 		if (strlen(clr->ifname) &&
620 		    (kif = pfi_kif_find(clr->ifname)) == NULL)
621 			continue;
622 
623 		PF_STATE_ENTER_WRITE();
624 		RBT_FOREACH_SAFE(st, pf_state_tree_id, &tree_id, nexts) {
625 			if (st->creatorid == creatorid &&
626 			    ((kif && st->kif == kif) || !kif)) {
627 				SET(st->state_flags, PFSTATE_NOSYNC);
628 				pf_remove_state(st);
629 			}
630 		}
631 		PF_STATE_EXIT_WRITE();
632 	}
633 	PF_UNLOCK();
634 
635 	return (0);
636 }
637 
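/*
 * PFSYNC_ACT_INS: import full state entries sent by the peer.
 * Entries with out-of-range timeouts, TCP states, directions or
 * address families are counted as pfsyncs_badval and skipped;
 * ENOMEM from pf_state_import() aborts the rest of the batch.
 */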
638 int
639 pfsync_in_ins(caddr_t buf, int len, int count, int flags)
640 {
641 	struct pfsync_state *sp;
642 	sa_family_t af1, af2;
643 	int i;
644 
645 	PF_LOCK();
646 	for (i = 0; i < count; i++) {
647 		sp = (struct pfsync_state *)(buf + len * i);
648 		af1 = sp->key[0].af;
649 		af2 = sp->key[1].af;
650 
651 		/* check for invalid values */
652 		if (sp->timeout >= PFTM_MAX ||
653 		    sp->src.state > PF_TCPS_PROXY_DST ||
654 		    sp->dst.state > PF_TCPS_PROXY_DST ||
655 		    sp->direction > PF_OUT ||
656 		    (((af1 || af2) &&
657 		     ((af1 != AF_INET && af1 != AF_INET6) ||
658 		      (af2 != AF_INET && af2 != AF_INET6))) ||
659 		    (sp->af != AF_INET && sp->af != AF_INET6))) {
660 			DPFPRINTF(LOG_NOTICE,
661 			    "pfsync_input: PFSYNC5_ACT_INS: invalid value");
662 			pfsyncstat_inc(pfsyncs_badval);
663 			continue;
664 		}
665 
666 		if (pf_state_import(sp, flags) == ENOMEM) {
667 			/* drop out, but process the rest of the actions */
668 			break;
669 		}
670 	}
671 	PF_UNLOCK();
672 
673 	return (0);
674 }
675 
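/*
 * PFSYNC_ACT_INS_ACK: the peer has acknowledged states we inserted.
 * Any packet deferred while waiting on such a state can now be
 * released via pfsync_deferred().
 */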
676 int
677 pfsync_in_iack(caddr_t buf, int len, int count, int flags)
678 {
679 	struct pfsync_ins_ack *ia;
680 	struct pf_state_cmp id_key;
681 	struct pf_state *st;
682 	int i;
683 
684 	for (i = 0; i < count; i++) {
685 		ia = (struct pfsync_ins_ack *)(buf + len * i);
686 
687 		id_key.id = ia->id;
688 		id_key.creatorid = ia->creatorid;
689 
690 		PF_STATE_ENTER_READ();
691 		st = pf_find_state_byid(&id_key);
692 		pf_state_ref(st);
693 		PF_STATE_EXIT_READ();
694 		if (st == NULL)
695 			continue;
696 
697 		if (ISSET(st->state_flags, PFSTATE_ACK))
698 			pfsync_deferred(st, 0);
699 
700 		pf_state_unref(st);
701 	}
702 
703 	return (0);
704 }
705 
706 int
707 pfsync_upd_tcp(struct pf_state *st, struct pfsync_state_peer *src,
708     struct pfsync_state_peer *dst)
709 {
710 	int sync = 0;
711 
712 	/*
713 	 * The state should never go backwards except
714 	 * for syn-proxy states.  Neither should the
715 	 * sequence window slide backwards.
716 	 */
717 	if ((st->src.state > src->state &&
718 	    (st->src.state < PF_TCPS_PROXY_SRC ||
719 	    src->state >= PF_TCPS_PROXY_SRC)) ||
720 
721 	    (st->src.state == src->state &&
722 	    SEQ_GT(st->src.seqlo, ntohl(src->seqlo))))
723 		sync++;
724 	else
725 		pf_state_peer_ntoh(src, &st->src);
726 
727 	if ((st->dst.state > dst->state) ||
728 
729 	    (st->dst.state >= TCPS_SYN_SENT &&
730 	    SEQ_GT(st->dst.seqlo, ntohl(dst->seqlo))))
731 		sync++;
732 	else
733 		pf_state_peer_ntoh(dst, &st->dst);
734 
735 	return (sync);
736 }
737 
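/*
 * PFSYNC_ACT_UPD: full state update.  Unknown states are imported;
 * known states are merged, taking care (for TCP) never to move the
 * peers' state machines or sequence windows backwards.  If our local
 * copy is newer we count it as stale and queue an update of our own.
 */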
738 int
739 pfsync_in_upd(caddr_t buf, int len, int count, int flags)
740 {
741 	struct pfsync_state *sp;
742 	struct pf_state_cmp id_key;
743 	struct pf_state *st;
744 	int sync, error;
745 	int i;
746 
747 	for (i = 0; i < count; i++) {
748 		sp = (struct pfsync_state *)(buf + len * i);
749 
750 		/* check for invalid values */
751 		if (sp->timeout >= PFTM_MAX ||
752 		    sp->src.state > PF_TCPS_PROXY_DST ||
753 		    sp->dst.state > PF_TCPS_PROXY_DST) {
754 			DPFPRINTF(LOG_NOTICE,
755 			    "pfsync_input: PFSYNC_ACT_UPD: invalid value");
756 			pfsyncstat_inc(pfsyncs_badval);
757 			continue;
758 		}
759 
760 		id_key.id = sp->id;
761 		id_key.creatorid = sp->creatorid;
762 
763 		PF_STATE_ENTER_READ();
764 		st = pf_find_state_byid(&id_key);
765 		pf_state_ref(st);
766 		PF_STATE_EXIT_READ();
767 		if (st == NULL) {
768 			/* insert the update */
769 			PF_LOCK();
770 			error = pf_state_import(sp, flags);
771 			if (error)
772 				pfsyncstat_inc(pfsyncs_badstate);
773 			PF_UNLOCK();
774 			continue;
775 		}
776 
777 		if (ISSET(st->state_flags, PFSTATE_ACK))
778 			pfsync_deferred(st, 1);
779 
780 		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
781 			sync = pfsync_upd_tcp(st, &sp->src, &sp->dst);
782 		else {
783 			sync = 0;
784 
785 			/*
786 			 * Non-TCP protocol state machines always go
787 			 * forwards.
788 			 */
789 			if (st->src.state > sp->src.state)
790 				sync++;
791 			else
792 				pf_state_peer_ntoh(&sp->src, &st->src);
793 
794 			if (st->dst.state > sp->dst.state)
795 				sync++;
796 			else
797 				pf_state_peer_ntoh(&sp->dst, &st->dst);
798 		}
799 
800 		if (sync < 2) {
801 			pf_state_alloc_scrub_memory(&sp->dst, &st->dst);
802 			pf_state_peer_ntoh(&sp->dst, &st->dst);
803 			st->expire = getuptime();
804 			st->timeout = sp->timeout;
805 		}
806 		st->pfsync_time = getuptime();
807 
808 		if (sync) {
809 			pfsyncstat_inc(pfsyncs_stale);
810 
811 			pfsync_update_state(st);
812 			schednetisr(NETISR_PFSYNC);
813 		}
814 
815 		pf_state_unref(st);
816 	}
817 
818 	return (0);
819 }
820 
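/*
 * PFSYNC_ACT_UPD_C: compressed update, identifying the state only by
 * id/creatorid.  If we do not know the state we request a full copy
 * with pfsync_request_update(); otherwise the merge rules match
 * pfsync_in_upd() above.
 */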
821 int
822 pfsync_in_upd_c(caddr_t buf, int len, int count, int flags)
823 {
824 	struct pfsync_upd_c *up;
825 	struct pf_state_cmp id_key;
826 	struct pf_state *st;
827 
828 	int sync;
829 
830 	int i;
831 
832 	for (i = 0; i < count; i++) {
833 		up = (struct pfsync_upd_c *)(buf + len * i);
834 
835 		/* check for invalid values */
836 		if (up->timeout >= PFTM_MAX ||
837 		    up->src.state > PF_TCPS_PROXY_DST ||
838 		    up->dst.state > PF_TCPS_PROXY_DST) {
839 			DPFPRINTF(LOG_NOTICE,
840 			    "pfsync_input: PFSYNC_ACT_UPD_C: invalid value");
841 			pfsyncstat_inc(pfsyncs_badval);
842 			continue;
843 		}
844 
845 		id_key.id = up->id;
846 		id_key.creatorid = up->creatorid;
847 
848 		PF_STATE_ENTER_READ();
849 		st = pf_find_state_byid(&id_key);
850 		pf_state_ref(st);
851 		PF_STATE_EXIT_READ();
852 		if (st == NULL) {
853 			/* We don't have this state. Ask for it. */
854 			pfsync_request_update(id_key.creatorid, id_key.id);
855 			continue;
856 		}
857 
858 		if (ISSET(st->state_flags, PFSTATE_ACK))
859 			pfsync_deferred(st, 1);
860 
861 		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
862 			sync = pfsync_upd_tcp(st, &up->src, &up->dst);
863 		else {
864 			sync = 0;
865 			/*
866 			 * Non-TCP protocol state machines always go
867 			 * forwards.
868 			 */
869 			if (st->src.state > up->src.state)
870 				sync++;
871 			else
872 				pf_state_peer_ntoh(&up->src, &st->src);
873 
874 			if (st->dst.state > up->dst.state)
875 				sync++;
876 			else
877 				pf_state_peer_ntoh(&up->dst, &st->dst);
878 		}
879 		if (sync < 2) {
880 			pf_state_alloc_scrub_memory(&up->dst, &st->dst);
881 			pf_state_peer_ntoh(&up->dst, &st->dst);
882 			st->expire = getuptime();
883 			st->timeout = up->timeout;
884 		}
885 		st->pfsync_time = getuptime();
886 
887 		if (sync) {
888 			pfsyncstat_inc(pfsyncs_stale);
889 
890 			pfsync_update_state(st);
891 			schednetisr(NETISR_PFSYNC);
892 		}
893 
894 		pf_state_unref(st);
895 	}
896 
897 	return (0);
898 }
899 
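/*
 * PFSYNC_ACT_UPD_REQ: the peer is asking us for states.  An all-zero
 * id/creatorid requests a full bulk update; otherwise we queue a full
 * update for the named state, unless it is flagged PFSTATE_NOSYNC.
 */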
900 int
901 pfsync_in_ureq(caddr_t buf, int len, int count, int flags)
902 {
903 	struct pfsync_upd_req *ur;
904 	int i;
905 
906 	struct pf_state_cmp id_key;
907 	struct pf_state *st;
908 
909 	for (i = 0; i < count; i++) {
910 		ur = (struct pfsync_upd_req *)(buf + len * i);
911 
912 		id_key.id = ur->id;
913 		id_key.creatorid = ur->creatorid;
914 
915 		if (id_key.id == 0 && id_key.creatorid == 0)
916 			pfsync_bulk_start();
917 		else {
918 			PF_STATE_ENTER_READ();
919 			st = pf_find_state_byid(&id_key);
920 			pf_state_ref(st);
921 			PF_STATE_EXIT_READ();
922 			if (st == NULL) {
923 				pfsyncstat_inc(pfsyncs_badstate);
924 				continue;
925 			}
926 			if (ISSET(st->state_flags, PFSTATE_NOSYNC)) {
927 				pf_state_unref(st);
928 				continue;
929 			}
930 
931 			pfsync_update_state_req(st);
932 			pf_state_unref(st);
933 		}
934 	}
935 
936 	return (0);
937 }
938 
939 int
940 pfsync_in_del(caddr_t buf, int len, int count, int flags)
941 {
942 	struct pfsync_state *sp;
943 	struct pf_state_cmp id_key;
944 	struct pf_state *st;
945 	int i;
946 
947 	PF_STATE_ENTER_WRITE();
948 	for (i = 0; i < count; i++) {
949 		sp = (struct pfsync_state *)(buf + len * i);
950 
951 		id_key.id = sp->id;
952 		id_key.creatorid = sp->creatorid;
953 
954 		st = pf_find_state_byid(&id_key);
955 		if (st == NULL) {
956 			pfsyncstat_inc(pfsyncs_badstate);
957 			continue;
958 		}
959 		SET(st->state_flags, PFSTATE_NOSYNC);
960 		pf_remove_state(st);
961 	}
962 	PF_STATE_EXIT_WRITE();
963 
964 	return (0);
965 }
966 
967 int
968 pfsync_in_del_c(caddr_t buf, int len, int count, int flags)
969 {
970 	struct pfsync_del_c *sp;
971 	struct pf_state_cmp id_key;
972 	struct pf_state *st;
973 	int i;
974 
975 	PF_LOCK();
976 	PF_STATE_ENTER_WRITE();
977 	for (i = 0; i < count; i++) {
978 		sp = (struct pfsync_del_c *)(buf + len * i);
979 
980 		id_key.id = sp->id;
981 		id_key.creatorid = sp->creatorid;
982 
983 		st = pf_find_state_byid(&id_key);
984 		if (st == NULL) {
985 			pfsyncstat_inc(pfsyncs_badstate);
986 			continue;
987 		}
988 
989 		SET(st->state_flags, PFSTATE_NOSYNC);
990 		pf_remove_state(st);
991 	}
992 	PF_STATE_EXIT_WRITE();
993 	PF_UNLOCK();
994 
995 	return (0);
996 }
997 
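/*
 * PFSYNC_ACT_BUS: bulk update status from the peer.  A START message
 * re-arms the bulk-fail timeout; a timely END message finishes the
 * bulk update, clears any carp demotion and marks the node in sync.
 */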
998 int
999 pfsync_in_bus(caddr_t buf, int len, int count, int flags)
1000 {
1001 	struct pfsync_softc *sc = pfsyncif;
1002 	struct pfsync_bus *bus;
1003 
1004 	/* If we're not waiting for a bulk update, who cares. */
1005 	if (sc->sc_ureq_sent == 0)
1006 		return (0);
1007 
1008 	bus = (struct pfsync_bus *)buf;
1009 
1010 	switch (bus->status) {
1011 	case PFSYNC_BUS_START:
1012 		PF_LOCK();
1013 		timeout_add(&sc->sc_bulkfail_tmo, 4 * hz +
1014 		    pf_pool_limits[PF_LIMIT_STATES].limit /
1015 		    ((sc->sc_if.if_mtu - PFSYNC_MINPKT) /
1016 		    sizeof(struct pfsync_state)));
1017 		PF_UNLOCK();
1018 		DPFPRINTF(LOG_INFO, "received bulk update start");
1019 		break;
1020 
1021 	case PFSYNC_BUS_END:
1022 		if (getuptime() - ntohl(bus->endtime) >=
1023 		    sc->sc_ureq_sent) {
1024 			/* that's it, we're happy */
1025 			sc->sc_ureq_sent = 0;
1026 			sc->sc_bulk_tries = 0;
1027 			timeout_del(&sc->sc_bulkfail_tmo);
1028 #if NCARP > 0
1029 			if (!pfsync_sync_ok)
1030 				carp_group_demote_adj(&sc->sc_if, -1,
1031 				    sc->sc_link_demoted ?
1032 				    "pfsync link state up" :
1033 				    "pfsync bulk done");
1034 			if (sc->sc_initial_bulk) {
1035 				carp_group_demote_adj(&sc->sc_if, -32,
1036 				    "pfsync init");
1037 				sc->sc_initial_bulk = 0;
1038 			}
1039 #endif
1040 			pfsync_sync_ok = 1;
1041 			sc->sc_link_demoted = 0;
1042 			DPFPRINTF(LOG_INFO, "received valid bulk update end");
1043 		} else {
1044 			DPFPRINTF(LOG_WARNING, "received invalid "
1045 			    "bulk update end: bad timestamp");
1046 		}
1047 		break;
1048 	}
1049 
1050 	return (0);
1051 }
1052 
1053 int
1054 pfsync_in_tdb(caddr_t buf, int len, int count, int flags)
1055 {
1056 #if defined(IPSEC)
1057 	struct pfsync_tdb *tp;
1058 	int i;
1059 
1060 	for (i = 0; i < count; i++) {
1061 		tp = (struct pfsync_tdb *)(buf + len * i);
1062 		pfsync_update_net_tdb(tp);
1063 	}
1064 #endif
1065 
1066 	return (0);
1067 }
1068 
1069 #if defined(IPSEC)
1070 /* Update an in-kernel tdb. Silently fail if no tdb is found. */
1071 void
1072 pfsync_update_net_tdb(struct pfsync_tdb *pt)
1073 {
1074 	struct tdb		*tdb;
1075 
1076 	NET_ASSERT_LOCKED();
1077 
1078 	/* check for invalid values */
1079 	if (ntohl(pt->spi) <= SPI_RESERVED_MAX ||
1080 	    (pt->dst.sa.sa_family != AF_INET &&
1081 	     pt->dst.sa.sa_family != AF_INET6))
1082 		goto bad;
1083 
1084 	tdb = gettdb(ntohs(pt->rdomain), pt->spi,
1085 	    (union sockaddr_union *)&pt->dst, pt->sproto);
1086 	if (tdb) {
1087 		pt->rpl = betoh64(pt->rpl);
1088 		pt->cur_bytes = betoh64(pt->cur_bytes);
1089 
1090 		/* Neither replay nor byte counter should ever decrease. */
1091 		if (pt->rpl < tdb->tdb_rpl ||
1092 		    pt->cur_bytes < tdb->tdb_cur_bytes) {
1093 			tdb_unref(tdb);
1094 			goto bad;
1095 		}
1096 
1097 		tdb->tdb_rpl = pt->rpl;
1098 		tdb->tdb_cur_bytes = pt->cur_bytes;
1099 		tdb_unref(tdb);
1100 	}
1101 	return;
1102 
1103  bad:
1104 	DPFPRINTF(LOG_WARNING, "pfsync_insert: PFSYNC_ACT_TDB_UPD: "
1105 	    "invalid value");
1106 	pfsyncstat_inc(pfsyncs_badstate);
1107 	return;
1108 }
1109 #endif
1110 
1111 
1112 int
1113 pfsync_in_eof(caddr_t buf, int len, int count, int flags)
1114 {
1115 	if (len > 0 || count > 0)
1116 		pfsyncstat_inc(pfsyncs_badact);
1117 
1118 	/* we're done. let the caller return */
1119 	return (1);
1120 }
1121 
1122 int
1123 pfsync_in_error(caddr_t buf, int len, int count, int flags)
1124 {
1125 	pfsyncstat_inc(pfsyncs_badact);
1126 	return (-1);
1127 }
1128 
1129 int
1130 pfsyncoutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
1131 	struct rtentry *rt)
1132 {
1133 	m_freem(m);	/* drop packet */
1134 	return (EAFNOSUPPORT);
1135 }
1136 
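/*
 * Interface ioctl handler.  SIOCSETPFSYNC is what ifconfig(8) uses to
 * configure the sync device, peer, maxupd and defer settings, e.g.
 * (illustrative only):
 *
 *	# ifconfig pfsync0 syncdev em1 maxupd 128 defer
 *
 * Configuring a sync device may flush the pending packet and always
 * triggers a fresh bulk update request.
 */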
1137 int
1138 pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
1139 {
1140 	struct proc *p = curproc;
1141 	struct pfsync_softc *sc = ifp->if_softc;
1142 	struct ifreq *ifr = (struct ifreq *)data;
1143 	struct ip_moptions *imo = &sc->sc_imo;
1144 	struct pfsyncreq pfsyncr;
1145 	struct ifnet *ifp0, *sifp;
1146 	struct ip *ip;
1147 	int error;
1148 
1149 	switch (cmd) {
1150 	case SIOCSIFFLAGS:
1151 		if ((ifp->if_flags & IFF_RUNNING) == 0 &&
1152 		    (ifp->if_flags & IFF_UP)) {
1153 			ifp->if_flags |= IFF_RUNNING;
1154 
1155 #if NCARP > 0
1156 			sc->sc_initial_bulk = 1;
1157 			carp_group_demote_adj(&sc->sc_if, 32, "pfsync init");
1158 #endif
1159 
1160 			pfsync_request_full_update(sc);
1161 		}
1162 		if ((ifp->if_flags & IFF_RUNNING) &&
1163 		    (ifp->if_flags & IFF_UP) == 0) {
1164 			ifp->if_flags &= ~IFF_RUNNING;
1165 
1166 			/* drop everything */
1167 			timeout_del(&sc->sc_tmo);
1168 			pfsync_drop(sc);
1169 
1170 			pfsync_cancel_full_update(sc);
1171 		}
1172 		break;
1173 	case SIOCSIFMTU:
1174 		if ((ifp0 = if_get(sc->sc_sync_ifidx)) == NULL)
1175 			return (EINVAL);
1176 		error = 0;
1177 		if (ifr->ifr_mtu <= PFSYNC_MINPKT ||
1178 		    ifr->ifr_mtu > ifp0->if_mtu) {
1179 			error = EINVAL;
1180 		}
1181 		if_put(ifp0);
1182 		if (error)
1183 			return error;
1184 		if (ifr->ifr_mtu < ifp->if_mtu)
1185 			pfsync_sendout();
1186 		ifp->if_mtu = ifr->ifr_mtu;
1187 		break;
1188 	case SIOCGETPFSYNC:
1189 		bzero(&pfsyncr, sizeof(pfsyncr));
1190 		if ((ifp0 = if_get(sc->sc_sync_ifidx)) != NULL) {
1191 			strlcpy(pfsyncr.pfsyncr_syncdev,
1192 			    ifp0->if_xname, IFNAMSIZ);
1193 		}
1194 		if_put(ifp0);
1195 		pfsyncr.pfsyncr_syncpeer = sc->sc_sync_peer;
1196 		pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates;
1197 		pfsyncr.pfsyncr_defer = sc->sc_defer;
1198 		return (copyout(&pfsyncr, ifr->ifr_data, sizeof(pfsyncr)));
1199 
1200 	case SIOCSETPFSYNC:
1201 		if ((error = suser(p)) != 0)
1202 			return (error);
1203 		if ((error = copyin(ifr->ifr_data, &pfsyncr, sizeof(pfsyncr))))
1204 			return (error);
1205 
1206 		if (pfsyncr.pfsyncr_syncpeer.s_addr == 0)
1207 			sc->sc_sync_peer.s_addr = INADDR_PFSYNC_GROUP;
1208 		else
1209 			sc->sc_sync_peer.s_addr =
1210 			    pfsyncr.pfsyncr_syncpeer.s_addr;
1211 
1212 		if (pfsyncr.pfsyncr_maxupdates > 255)
1213 			return (EINVAL);
1214 		sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates;
1215 
1216 		sc->sc_defer = pfsyncr.pfsyncr_defer;
1217 
1218 		if (pfsyncr.pfsyncr_syncdev[0] == 0) {
1219 			if ((ifp0 = if_get(sc->sc_sync_ifidx)) != NULL) {
1220 				if_linkstatehook_del(ifp0, &sc->sc_ltask);
1221 				if_detachhook_del(ifp0, &sc->sc_dtask);
1222 			}
1223 			if_put(ifp0);
1224 			sc->sc_sync_ifidx = 0;
1225 			if (imo->imo_num_memberships > 0) {
1226 				in_delmulti(imo->imo_membership[
1227 				    --imo->imo_num_memberships]);
1228 				imo->imo_ifidx = 0;
1229 			}
1230 			break;
1231 		}
1232 
1233 		if ((sifp = if_unit(pfsyncr.pfsyncr_syncdev)) == NULL)
1234 			return (EINVAL);
1235 
1236 		ifp0 = if_get(sc->sc_sync_ifidx);
1237 
1238 		if (sifp->if_mtu < sc->sc_if.if_mtu || (ifp0 != NULL &&
1239 		    sifp->if_mtu < ifp0->if_mtu) ||
1240 		    sifp->if_mtu < MCLBYTES - sizeof(struct ip))
1241 			pfsync_sendout();
1242 
1243 		if (ifp0) {
1244 			if_linkstatehook_del(ifp0, &sc->sc_ltask);
1245 			if_detachhook_del(ifp0, &sc->sc_dtask);
1246 		}
1247 		if_put(ifp0);
1248 		sc->sc_sync_ifidx = sifp->if_index;
1249 
1250 		if (imo->imo_num_memberships > 0) {
1251 			in_delmulti(imo->imo_membership[--imo->imo_num_memberships]);
1252 			imo->imo_ifidx = 0;
1253 		}
1254 
1255 		if (sc->sc_sync_peer.s_addr == INADDR_PFSYNC_GROUP) {
1256 			struct in_addr addr;
1257 
1258 			if (!(sifp->if_flags & IFF_MULTICAST)) {
1259 				sc->sc_sync_ifidx = 0;
1260 				if_put(sifp);
1261 				return (EADDRNOTAVAIL);
1262 			}
1263 
1264 			addr.s_addr = INADDR_PFSYNC_GROUP;
1265 
1266 			if ((imo->imo_membership[0] =
1267 			    in_addmulti(&addr, sifp)) == NULL) {
1268 				sc->sc_sync_ifidx = 0;
1269 				if_put(sifp);
1270 				return (ENOBUFS);
1271 			}
1272 			imo->imo_num_memberships++;
1273 			imo->imo_ifidx = sc->sc_sync_ifidx;
1274 			imo->imo_ttl = PFSYNC_DFLTTL;
1275 			imo->imo_loop = 0;
1276 		}
1277 
1278 		ip = &sc->sc_template;
1279 		bzero(ip, sizeof(*ip));
1280 		ip->ip_v = IPVERSION;
1281 		ip->ip_hl = sizeof(sc->sc_template) >> 2;
1282 		ip->ip_tos = IPTOS_LOWDELAY;
1283 		/* len and id are set later */
1284 		ip->ip_off = htons(IP_DF);
1285 		ip->ip_ttl = PFSYNC_DFLTTL;
1286 		ip->ip_p = IPPROTO_PFSYNC;
1287 		ip->ip_src.s_addr = INADDR_ANY;
1288 		ip->ip_dst.s_addr = sc->sc_sync_peer.s_addr;
1289 
1290 		if_linkstatehook_add(sifp, &sc->sc_ltask);
1291 		if_detachhook_add(sifp, &sc->sc_dtask);
1292 		if_put(sifp);
1293 
1294 		pfsync_request_full_update(sc);
1295 
1296 		break;
1297 
1298 	default:
1299 		return (ENOTTY);
1300 	}
1301 
1302 	return (0);
1303 }
1304 
1305 void
1306 pfsync_out_state(struct pf_state *st, void *buf)
1307 {
1308 	struct pfsync_state *sp = buf;
1309 
1310 	pf_state_export(sp, st);
1311 }
1312 
1313 void
1314 pfsync_out_iack(struct pf_state *st, void *buf)
1315 {
1316 	struct pfsync_ins_ack *iack = buf;
1317 
1318 	iack->id = st->id;
1319 	iack->creatorid = st->creatorid;
1320 }
1321 
1322 void
1323 pfsync_out_upd_c(struct pf_state *st, void *buf)
1324 {
1325 	struct pfsync_upd_c *up = buf;
1326 
1327 	bzero(up, sizeof(*up));
1328 	up->id = st->id;
1329 	pf_state_peer_hton(&st->src, &up->src);
1330 	pf_state_peer_hton(&st->dst, &up->dst);
1331 	up->creatorid = st->creatorid;
1332 	up->timeout = st->timeout;
1333 }
1334 
1335 void
1336 pfsync_out_del(struct pf_state *st, void *buf)
1337 {
1338 	struct pfsync_del_c *dp = buf;
1339 
1340 	dp->id = st->id;
1341 	dp->creatorid = st->creatorid;
1342 
1343 	SET(st->state_flags, PFSTATE_NOSYNC);
1344 }
1345 
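/*
 * Move everything currently queued for transmission (states, update
 * requests, TDBs and the "plus" region) from the softc into a local
 * snapshot while holding the relevant mutexes.  The packet can then
 * be built from the snapshot without holding any of them.
 */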
1346 void
1347 pfsync_grab_snapshot(struct pfsync_snapshot *sn, struct pfsync_softc *sc)
1348 {
1349 	int q;
1350 	struct pf_state *st;
1351 	struct pfsync_upd_req_item *ur;
1352 #if defined(IPSEC)
1353 	struct tdb *tdb;
1354 #endif
1355 
1356 	sn->sn_sc = sc;
1357 
1358 	mtx_enter(&sc->sc_st_mtx);
1359 	mtx_enter(&sc->sc_upd_req_mtx);
1360 	mtx_enter(&sc->sc_tdb_mtx);
1361 
1362 	for (q = 0; q < PFSYNC_S_COUNT; q++) {
1363 		TAILQ_INIT(&sn->sn_qs[q]);
1364 
1365 		while ((st = TAILQ_FIRST(&sc->sc_qs[q])) != NULL) {
1366 			TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
1367 			mtx_enter(&st->mtx);
1368 			if (st->snapped == 0) {
1369 				TAILQ_INSERT_TAIL(&sn->sn_qs[q], st, sync_snap);
1370 				st->snapped = 1;
1371 				mtx_leave(&st->mtx);
1372 			} else {
1373 				/*
1374 				 * item is on snapshot list already, so we can
1375 				 * skip it now.
1376 				 */
1377 				mtx_leave(&st->mtx);
1378 				pf_state_unref(st);
1379 			}
1380 		}
1381 	}
1382 
1383 	TAILQ_INIT(&sn->sn_upd_req_list);
1384 	while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
1385 		TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);
1386 		TAILQ_INSERT_TAIL(&sn->sn_upd_req_list, ur, ur_snap);
1387 	}
1388 
1389 	TAILQ_INIT(&sn->sn_tdb_q);
1390 #if defined(IPSEC)
1391 	while ((tdb = TAILQ_FIRST(&sc->sc_tdb_q)) != NULL) {
1392 		TAILQ_REMOVE(&sc->sc_tdb_q, tdb, tdb_sync_entry);
1393 		TAILQ_INSERT_TAIL(&sn->sn_tdb_q, tdb, tdb_sync_snap);
1394 
1395 		mtx_enter(&tdb->tdb_mtx);
1396 		KASSERT(!ISSET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED));
1397 		SET(tdb->tdb_flags, TDBF_PFSYNC_SNAPPED);
1398 		mtx_leave(&tdb->tdb_mtx);
1399 	}
1400 #endif
1401 
1402 	sn->sn_len = sc->sc_len;
1403 	sc->sc_len = PFSYNC_MINPKT;
1404 
1405 	sn->sn_plus = sc->sc_plus;
1406 	sc->sc_plus = NULL;
1407 	sn->sn_pluslen = sc->sc_pluslen;
1408 	sc->sc_pluslen = 0;
1409 
1410 	mtx_leave(&sc->sc_tdb_mtx);
1411 	mtx_leave(&sc->sc_upd_req_mtx);
1412 	mtx_leave(&sc->sc_st_mtx);
1413 }
1414 
1415 void
1416 pfsync_drop_snapshot(struct pfsync_snapshot *sn)
1417 {
1418 	struct pf_state *st;
1419 	struct pfsync_upd_req_item *ur;
1420 #if defined(IPSEC)
1421 	struct tdb *t;
1422 #endif
1423 	int q;
1424 
1425 	for (q = 0; q < PFSYNC_S_COUNT; q++) {
1426 		if (TAILQ_EMPTY(&sn->sn_qs[q]))
1427 			continue;
1428 
1429 		while ((st = TAILQ_FIRST(&sn->sn_qs[q])) != NULL) {
1430 			mtx_enter(&st->mtx);
1431 			KASSERT(st->sync_state == q);
1432 			KASSERT(st->snapped == 1);
1433 			TAILQ_REMOVE(&sn->sn_qs[q], st, sync_snap);
1434 			st->sync_state = PFSYNC_S_NONE;
1435 			st->snapped = 0;
1436 			mtx_leave(&st->mtx);
1437 			pf_state_unref(st);
1438 		}
1439 	}
1440 
1441 	while ((ur = TAILQ_FIRST(&sn->sn_upd_req_list)) != NULL) {
1442 		TAILQ_REMOVE(&sn->sn_upd_req_list, ur, ur_snap);
1443 		pool_put(&sn->sn_sc->sc_pool, ur);
1444 	}
1445 
1446 #if defined(IPSEC)
1447 	while ((t = TAILQ_FIRST(&sn->sn_tdb_q)) != NULL) {
1448 		TAILQ_REMOVE(&sn->sn_tdb_q, t, tdb_sync_snap);
1449 		mtx_enter(&t->tdb_mtx);
1450 		KASSERT(ISSET(t->tdb_flags, TDBF_PFSYNC_SNAPPED));
1451 		CLR(t->tdb_flags, TDBF_PFSYNC_SNAPPED);
1452 		CLR(t->tdb_flags, TDBF_PFSYNC);
1453 		mtx_leave(&t->tdb_mtx);
1454 	}
1455 #endif
1456 }
1457 
1458 int
1459 pfsync_is_snapshot_empty(struct pfsync_snapshot *sn)
1460 {
1461 	int	q;
1462 
1463 	for (q = 0; q < PFSYNC_S_COUNT; q++)
1464 		if (!TAILQ_EMPTY(&sn->sn_qs[q]))
1465 			return (0);
1466 
1467 	if (!TAILQ_EMPTY(&sn->sn_upd_req_list))
1468 		return (0);
1469 
1470 	if (!TAILQ_EMPTY(&sn->sn_tdb_q))
1471 		return (0);
1472 
1473 	return (sn->sn_plus == NULL);
1474 }
1475 
1476 void
1477 pfsync_drop(struct pfsync_softc *sc)
1478 {
1479 	struct pfsync_snapshot	sn;
1480 
1481 	pfsync_grab_snapshot(&sn, sc);
1482 	pfsync_drop_snapshot(&sn);
1483 }
1484 
1485 void
1486 pfsync_send_dispatch(void *xmq)
1487 {
1488 	struct mbuf_queue *mq = xmq;
1489 	struct pfsync_softc *sc;
1490 	struct mbuf *m;
1491 	struct mbuf_list ml;
1492 	int error;
1493 
1494 	mq_delist(mq, &ml);
1495 	if (ml_empty(&ml))
1496 		return;
1497 
1498 	NET_LOCK();
1499 	sc = pfsyncif;
1500 	if (sc == NULL) {
1501 		ml_purge(&ml);
1502 		goto done;
1503 	}
1504 
1505 	while ((m = ml_dequeue(&ml)) != NULL) {
1506 		if ((error = ip_output(m, NULL, NULL, IP_RAWOUTPUT,
1507 		    &sc->sc_imo, NULL, 0)) == 0)
1508 			pfsyncstat_inc(pfsyncs_opackets);
1509 		else {
1510 			DPFPRINTF(LOG_DEBUG,
1511 			    "ip_output() @ %s failed (%d)\n", __func__, error);
1512 			pfsyncstat_inc(pfsyncs_oerrors);
1513 		}
1514 	}
1515 done:
1516 	NET_UNLOCK();
1517 }
1518 
1519 void
1520 pfsync_send_pkt(struct mbuf *m)
1521 {
1522 	if (mq_enqueue(&pfsync_mq, m) != 0) {
1523 		pfsyncstat_inc(pfsyncs_oerrors);
1524 		DPFPRINTF(LOG_DEBUG, "mq_enqueue() @ %s failed, queue full\n",
1525 		    __func__);
1526 	} else
1527 		task_add(net_tq(0), &pfsync_task);
1528 }
1529 
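/*
 * Build one pfsync packet from the current snapshot: IP header,
 * pfsync header, then one subheader per non-empty queue (update
 * requests, TDBs, and the per-state queues), and hand it to
 * pfsync_send_pkt() for transmission.
 */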
1530 void
1531 pfsync_sendout(void)
1532 {
1533 	struct pfsync_snapshot sn;
1534 	struct pfsync_softc *sc = pfsyncif;
1535 #if NBPFILTER > 0
1536 	struct ifnet *ifp = &sc->sc_if;
1537 #endif
1538 	struct mbuf *m;
1539 	struct ip *ip;
1540 	struct pfsync_header *ph;
1541 	struct pfsync_subheader *subh;
1542 	struct pf_state *st;
1543 	struct pfsync_upd_req_item *ur;
1544 	int offset;
1545 	int q, count = 0;
1546 
1547 	if (sc == NULL || sc->sc_len == PFSYNC_MINPKT)
1548 		return;
1549 
1550 	if (!ISSET(sc->sc_if.if_flags, IFF_RUNNING) ||
1551 #if NBPFILTER > 0
1552 	    (ifp->if_bpf == NULL && sc->sc_sync_ifidx == 0)) {
1553 #else
1554 	    sc->sc_sync_ifidx == 0) {
1555 #endif
1556 		pfsync_drop(sc);
1557 		return;
1558 	}
1559 
1560 	pfsync_grab_snapshot(&sn, sc);
1561 
1562 	/*
1563 	 * The check below is sufficient to prevent us from sending empty
1564 	 * packets, but it does not stop us from sending short packets.
1565 	 */
1566 	if (pfsync_is_snapshot_empty(&sn))
1567 		return;
1568 
1569 	MGETHDR(m, M_DONTWAIT, MT_DATA);
1570 	if (m == NULL) {
1571 		sc->sc_if.if_oerrors++;
1572 		pfsyncstat_inc(pfsyncs_onomem);
1573 		pfsync_drop_snapshot(&sn);
1574 		return;
1575 	}
1576 
1577 	if (max_linkhdr + sn.sn_len > MHLEN) {
1578 		MCLGETL(m, M_DONTWAIT, max_linkhdr + sn.sn_len);
1579 		if (!ISSET(m->m_flags, M_EXT)) {
1580 			m_free(m);
1581 			sc->sc_if.if_oerrors++;
1582 			pfsyncstat_inc(pfsyncs_onomem);
1583 			pfsync_drop_snapshot(&sn);
1584 			return;
1585 		}
1586 	}
1587 	m->m_data += max_linkhdr;
1588 	m->m_len = m->m_pkthdr.len = sn.sn_len;
1589 
1590 	/* build the ip header */
1591 	ip = mtod(m, struct ip *);
1592 	bcopy(&sc->sc_template, ip, sizeof(*ip));
1593 	offset = sizeof(*ip);
1594 
1595 	ip->ip_len = htons(m->m_pkthdr.len);
1596 	ip->ip_id = htons(ip_randomid());
1597 
1598 	/* build the pfsync header */
1599 	ph = (struct pfsync_header *)(m->m_data + offset);
1600 	bzero(ph, sizeof(*ph));
1601 	offset += sizeof(*ph);
1602 
1603 	ph->version = PFSYNC_VERSION;
1604 	ph->len = htons(sn.sn_len - sizeof(*ip));
1605 	bcopy(pf_status.pf_chksum, ph->pfcksum, PF_MD5_DIGEST_LENGTH);
1606 
1607 	if (!TAILQ_EMPTY(&sn.sn_upd_req_list)) {
1608 		subh = (struct pfsync_subheader *)(m->m_data + offset);
1609 		offset += sizeof(*subh);
1610 
1611 		count = 0;
1612 		while ((ur = TAILQ_FIRST(&sn.sn_upd_req_list)) != NULL) {
1613 			TAILQ_REMOVE(&sn.sn_upd_req_list, ur, ur_snap);
1614 
1615 			bcopy(&ur->ur_msg, m->m_data + offset,
1616 			    sizeof(ur->ur_msg));
1617 			offset += sizeof(ur->ur_msg);
1618 
1619 			pool_put(&sc->sc_pool, ur);
1620 
1621 			count++;
1622 		}
1623 
1624 		bzero(subh, sizeof(*subh));
1625 		subh->len = sizeof(ur->ur_msg) >> 2;
1626 		subh->action = PFSYNC_ACT_UPD_REQ;
1627 		subh->count = htons(count);
1628 	}
1629 
1630 	/* has someone built a custom region for us to add? */
1631 	if (sn.sn_plus != NULL) {
1632 		bcopy(sn.sn_plus, m->m_data + offset, sn.sn_pluslen);
1633 		offset += sn.sn_pluslen;
1634 		sn.sn_plus = NULL;	/* XXX memory leak ? */
1635 	}
1636 
1637 #if defined(IPSEC)
1638 	if (!TAILQ_EMPTY(&sn.sn_tdb_q)) {
1639 		struct tdb *t;
1640 
1641 		subh = (struct pfsync_subheader *)(m->m_data + offset);
1642 		offset += sizeof(*subh);
1643 
1644 		count = 0;
1645 		while ((t = TAILQ_FIRST(&sn.sn_tdb_q)) != NULL) {
1646 			TAILQ_REMOVE(&sn.sn_tdb_q, t, tdb_sync_snap);
1647 			pfsync_out_tdb(t, m->m_data + offset);
1648 			offset += sizeof(struct pfsync_tdb);
1649 			mtx_enter(&t->tdb_mtx);
1650 			KASSERT(ISSET(t->tdb_flags, TDBF_PFSYNC_SNAPPED));
1651 			CLR(t->tdb_flags, TDBF_PFSYNC_SNAPPED);
1652 			CLR(t->tdb_flags, TDBF_PFSYNC);
1653 			mtx_leave(&t->tdb_mtx);
1654 			tdb_unref(t);
1655 			count++;
1656 		}
1657 
1658 		bzero(subh, sizeof(*subh));
1659 		subh->action = PFSYNC_ACT_TDB;
1660 		subh->len = sizeof(struct pfsync_tdb) >> 2;
1661 		subh->count = htons(count);
1662 	}
1663 #endif
1664 
1665 	/* walk the queues */
1666 	for (q = 0; q < PFSYNC_S_COUNT; q++) {
1667 		if (TAILQ_EMPTY(&sn.sn_qs[q]))
1668 			continue;
1669 
1670 		subh = (struct pfsync_subheader *)(m->m_data + offset);
1671 		offset += sizeof(*subh);
1672 
1673 		count = 0;
1674 		while ((st = TAILQ_FIRST(&sn.sn_qs[q])) != NULL) {
1675 			mtx_enter(&st->mtx);
1676 			TAILQ_REMOVE(&sn.sn_qs[q], st, sync_snap);
1677 			KASSERT(st->sync_state == q);
1678 			KASSERT(st->snapped == 1);
1679 			st->sync_state = PFSYNC_S_NONE;
1680 			st->snapped = 0;
1681 			pfsync_qs[q].write(st, m->m_data + offset);
1682 			offset += pfsync_qs[q].len;
1683 			mtx_leave(&st->mtx);
1684 
1685 			pf_state_unref(st);
1686 			count++;
1687 		}
1688 
1689 		bzero(subh, sizeof(*subh));
1690 		subh->action = pfsync_qs[q].action;
1691 		subh->len = pfsync_qs[q].len >> 2;
1692 		subh->count = htons(count);
1693 	}
1694 
1695 	/* we're done, let's put it on the wire */
1696 #if NBPFILTER > 0
1697 	if (ifp->if_bpf) {
1698 		m->m_data += sizeof(*ip);
1699 		m->m_len = m->m_pkthdr.len = sn.sn_len - sizeof(*ip);
1700 		bpf_mtap(ifp->if_bpf, m, BPF_DIRECTION_OUT);
1701 		m->m_data -= sizeof(*ip);
1702 		m->m_len = m->m_pkthdr.len = sn.sn_len;
1703 	}
1704 
1705 	if (sc->sc_sync_ifidx == 0) {
1706 		sc->sc_len = PFSYNC_MINPKT;
1707 		m_freem(m);
1708 		return;
1709 	}
1710 #endif
1711 
1712 	sc->sc_if.if_opackets++;
1713 	sc->sc_if.if_obytes += m->m_pkthdr.len;
1714 
1715 	m->m_pkthdr.ph_rtableid = sc->sc_if.if_rdomain;
1716 
1717 	pfsync_send_pkt(m);
1718 }
1719 
1720 void
1721 pfsync_insert_state(struct pf_state *st)
1722 {
1723 	struct pfsync_softc *sc = pfsyncif;
1724 
1725 	NET_ASSERT_LOCKED();
1726 
1727 	if (ISSET(st->rule.ptr->rule_flag, PFRULE_NOSYNC) ||
1728 	    st->key[PF_SK_WIRE]->proto == IPPROTO_PFSYNC) {
1729 		SET(st->state_flags, PFSTATE_NOSYNC);
1730 		return;
1731 	}
1732 
1733 	if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING) ||
1734 	    ISSET(st->state_flags, PFSTATE_NOSYNC))
1735 		return;
1736 
1737 	if (sc->sc_len == PFSYNC_MINPKT)
1738 		timeout_add_sec(&sc->sc_tmo, 1);
1739 
1740 	pfsync_q_ins(st, PFSYNC_S_INS);
1741 
1742 	st->sync_updates = 0;
1743 }
1744 
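/*
 * Defer transmission of the packet that created this state until the
 * peer has acknowledged the insert, or until PFSYNC_DEFER_NSEC has
 * passed.  Returns 1 if the mbuf was taken over by the deferral; a
 * previously queued deferral may be handed back via *ppd for the
 * caller to dispatch once it has dropped its locks.
 */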
1745 int
1746 pfsync_defer(struct pf_state *st, struct mbuf *m, struct pfsync_deferral **ppd)
1747 {
1748 	struct pfsync_softc *sc = pfsyncif;
1749 	struct pfsync_deferral *pd;
1750 	unsigned int sched;
1751 
1752 	NET_ASSERT_LOCKED();
1753 
1754 	if (!sc->sc_defer ||
1755 	    ISSET(st->state_flags, PFSTATE_NOSYNC) ||
1756 	    m->m_flags & (M_BCAST|M_MCAST))
1757 		return (0);
1758 
1759 	pd = pool_get(&sc->sc_pool, M_NOWAIT);
1760 	if (pd == NULL)
1761 		return (0);
1762 
1763 	/*
1764 	 * If the deferral queue grows faster than the timeout can consume
1765 	 * it, we ask the packet (caller) to help the timer and dispatch
1766 	 * one deferral for us.
1767 	 *
1768 	 * We would like to call pfsync_undefer() here. Unfortunately we
1769 	 * can't, because pfsync_undefer() will call ip_output(), which in
1770 	 * turn will call pf_test(), which would then attempt to grab the
1771 	 * PF_LOCK() we currently hold.
1772 	 */
1773 	if (sc->sc_deferred >= 128) {
1774 		mtx_enter(&sc->sc_deferrals_mtx);
1775 		*ppd = TAILQ_FIRST(&sc->sc_deferrals);
1776 		if (*ppd != NULL) {
1777 			TAILQ_REMOVE(&sc->sc_deferrals, *ppd, pd_entry);
1778 			sc->sc_deferred--;
1779 		}
1780 		mtx_leave(&sc->sc_deferrals_mtx);
1781 	} else
1782 		*ppd = NULL;
1783 
1784 	m->m_pkthdr.pf.flags |= PF_TAG_GENERATED;
1785 	SET(st->state_flags, PFSTATE_ACK);
1786 
1787 	pd->pd_st = pf_state_ref(st);
1788 	pd->pd_m = m;
1789 
1790 	pd->pd_deadline = getnsecuptime() + PFSYNC_DEFER_NSEC;
1791 
1792 	mtx_enter(&sc->sc_deferrals_mtx);
1793 	sched = TAILQ_EMPTY(&sc->sc_deferrals);
1794 
1795 	TAILQ_INSERT_TAIL(&sc->sc_deferrals, pd, pd_entry);
1796 	sc->sc_deferred++;
1797 	mtx_leave(&sc->sc_deferrals_mtx);
1798 
1799 	if (sched)
1800 		timeout_add_nsec(&sc->sc_deferrals_tmo, PFSYNC_DEFER_NSEC);
1801 
1802 	schednetisr(NETISR_PFSYNC);
1803 
1804 	return (1);
1805 }
1806 
1807 void
1808 pfsync_undefer_notify(struct pfsync_deferral *pd)
1809 {
1810 	struct pf_pdesc pdesc;
1811 	struct pf_state *st = pd->pd_st;
1812 
1813 	/*
1814 	 * pf_remove_state removes the state keys and sets st->timeout
1815 	 * to PFTM_UNLINKED.  This is done under the NET_LOCK, which should
1816 	 * be held here, so we can use PFTM_UNLINKED as a test for
1817 	 * whether the state keys are set for the address family
1818 	 * lookup.
1819 	 */
1820 
1821 	if (st->timeout == PFTM_UNLINKED)
1822 		return;
1823 
1824 	if (st->rt == PF_ROUTETO) {
1825 		if (pf_setup_pdesc(&pdesc, st->key[PF_SK_WIRE]->af,
1826 		    st->direction, st->kif, pd->pd_m, NULL) != PF_PASS)
1827 			return;
1828 		switch (st->key[PF_SK_WIRE]->af) {
1829 		case AF_INET:
1830 			pf_route(&pdesc, st);
1831 			break;
1832 #ifdef INET6
1833 		case AF_INET6:
1834 			pf_route6(&pdesc, st);
1835 			break;
1836 #endif /* INET6 */
1837 		default:
1838 			unhandled_af(st->key[PF_SK_WIRE]->af);
1839 		}
1840 		pd->pd_m = pdesc.m;
1841 	} else {
1842 		switch (st->key[PF_SK_WIRE]->af) {
1843 		case AF_INET:
1844 			ip_output(pd->pd_m, NULL, NULL, 0, NULL, NULL, 0);
1845 			break;
1846 #ifdef INET6
1847 		case AF_INET6:
1848 			ip6_output(pd->pd_m, NULL, NULL, 0, NULL, NULL);
1849 			break;
1850 #endif /* INET6 */
1851 		default:
1852 			unhandled_af(st->key[PF_SK_WIRE]->af);
1853 		}
1854 
1855 		pd->pd_m = NULL;
1856 	}
1857 }
1858 
1859 void
1860 pfsync_free_deferral(struct pfsync_deferral *pd)
1861 {
1862 	struct pfsync_softc *sc = pfsyncif;
1863 
1864 	pf_state_unref(pd->pd_st);
1865 	m_freem(pd->pd_m);
1866 	pool_put(&sc->sc_pool, pd);
1867 }
1868 
1869 void
1870 pfsync_undefer(struct pfsync_deferral *pd, int drop)
1871 {
1872 	struct pfsync_softc *sc = pfsyncif;
1873 
1874 	NET_ASSERT_LOCKED();
1875 
1876 	if (sc == NULL)
1877 		return;
1878 
1879 	CLR(pd->pd_st->state_flags, PFSTATE_ACK);
1880 	if (!drop)
1881 		pfsync_undefer_notify(pd);
1882 
1883 	pfsync_free_deferral(pd);
1884 }
1885 
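/*
 * Timeout worker for the deferral queue: collect every deferral whose
 * deadline has passed, re-arm the timeout for the first one that has
 * not, and then release the expired ones under the NET_LOCK.
 */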
1886 void
1887 pfsync_deferrals_tmo(void *arg)
1888 {
1889 	struct pfsync_softc *sc = arg;
1890 	struct pfsync_deferral *pd;
1891 	uint64_t now, nsec = 0;
1892 	struct pfsync_deferrals pds = TAILQ_HEAD_INITIALIZER(pds);
1893 
1894 	now = getnsecuptime();
1895 
1896 	mtx_enter(&sc->sc_deferrals_mtx);
1897 	for (;;) {
1898 		pd = TAILQ_FIRST(&sc->sc_deferrals);
1899 		if (pd == NULL)
1900 			break;
1901 
1902 		if (now < pd->pd_deadline) {
1903 			nsec = pd->pd_deadline - now;
1904 			break;
1905 		}
1906 
1907 		TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
1908 		sc->sc_deferred--;
1909 		TAILQ_INSERT_TAIL(&pds, pd, pd_entry);
1910 	}
1911 	mtx_leave(&sc->sc_deferrals_mtx);
1912 
1913 	if (nsec > 0) {
1914 		/* we were looking at a pd, but it wasn't old enough */
1915 		timeout_add_nsec(&sc->sc_deferrals_tmo, nsec);
1916 	}
1917 
1918 	if (TAILQ_EMPTY(&pds))
1919 		return;
1920 
1921 	NET_LOCK();
1922 	while ((pd = TAILQ_FIRST(&pds)) != NULL) {
1923 		TAILQ_REMOVE(&pds, pd, pd_entry);
1924 
1925 		pfsync_undefer(pd, 0);
1926 	}
1927 	NET_UNLOCK();
1928 }
1929 
1930 void
1931 pfsync_deferred(struct pf_state *st, int drop)
1932 {
1933 	struct pfsync_softc *sc = pfsyncif;
1934 	struct pfsync_deferral *pd;
1935 
1936 	NET_ASSERT_LOCKED();
1937 
1938 	mtx_enter(&sc->sc_deferrals_mtx);
1939 	TAILQ_FOREACH(pd, &sc->sc_deferrals, pd_entry) {
1940 		 if (pd->pd_st == st) {
1941 			TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
1942 			sc->sc_deferred--;
1943 			break;
1944 		}
1945 	}
1946 	mtx_leave(&sc->sc_deferrals_mtx);
1947 
1948 	if (pd != NULL)
1949 		pfsync_undefer(pd, drop);
1950 }
1951 
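/*
 * Called from pf when a state changes.  Depending on what is already
 * queued for the state this is a no-op, queues a compressed update,
 * or bumps the per-state update counter; once a TCP state has seen
 * sc_maxupdates changes (or was synced very recently) the pfsync
 * netisr is scheduled to push the packet out.
 */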
1952 void
1953 pfsync_update_state(struct pf_state *st)
1954 {
1955 	struct pfsync_softc *sc = pfsyncif;
1956 	int sync = 0;
1957 
1958 	NET_ASSERT_LOCKED();
1959 
1960 	if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING))
1961 		return;
1962 
1963 	if (ISSET(st->state_flags, PFSTATE_ACK))
1964 		pfsync_deferred(st, 0);
1965 	if (ISSET(st->state_flags, PFSTATE_NOSYNC)) {
1966 		if (st->sync_state != PFSYNC_S_NONE)
1967 			pfsync_q_del(st);
1968 		return;
1969 	}
1970 
1971 	if (sc->sc_len == PFSYNC_MINPKT)
1972 		timeout_add_sec(&sc->sc_tmo, 1);
1973 
1974 	switch (st->sync_state) {
1975 	case PFSYNC_S_UPD_C:
1976 	case PFSYNC_S_UPD:
1977 	case PFSYNC_S_INS:
1978 		/* we're already handling it */
1979 
1980 		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) {
1981 			st->sync_updates++;
1982 			if (st->sync_updates >= sc->sc_maxupdates)
1983 				sync = 1;
1984 		}
1985 		break;
1986 
1987 	case PFSYNC_S_IACK:
1988 		pfsync_q_del(st);
1989 	case PFSYNC_S_NONE:
1990 		pfsync_q_ins(st, PFSYNC_S_UPD_C);
1991 		st->sync_updates = 0;
1992 		break;
1993 
1994 	case PFSYNC_S_DEL:
1995 	case PFSYNC_S_COUNT:
1996 	case PFSYNC_S_DEFER:
1997 		break;
1998 
1999 	default:
2000 		panic("pfsync_update_state: unexpected sync state %d",
2001 		    st->sync_state);
2002 	}
2003 
2004 	if (sync || (getuptime() - st->pfsync_time) < 2)
2005 		schednetisr(NETISR_PFSYNC);
2006 }
2007 
2008 void
2009 pfsync_cancel_full_update(struct pfsync_softc *sc)
2010 {
2011 	if (timeout_pending(&sc->sc_bulkfail_tmo) ||
2012 	    timeout_pending(&sc->sc_bulk_tmo)) {
2013 #if NCARP > 0
2014 		if (!pfsync_sync_ok)
2015 			carp_group_demote_adj(&sc->sc_if, -1,
2016 			    "pfsync bulk cancelled");
2017 		if (sc->sc_initial_bulk) {
2018 			carp_group_demote_adj(&sc->sc_if, -32,
2019 			    "pfsync init");
2020 			sc->sc_initial_bulk = 0;
2021 		}
2022 #endif
2023 		pfsync_sync_ok = 1;
2024 		DPFPRINTF(LOG_INFO, "cancelling bulk update");
2025 	}
2026 	timeout_del(&sc->sc_bulkfail_tmo);
2027 	timeout_del(&sc->sc_bulk_tmo);
2028 	sc->sc_bulk_next = NULL;
2029 	sc->sc_bulk_last = NULL;
2030 	sc->sc_ureq_sent = 0;
2031 	sc->sc_bulk_tries = 0;
2032 }
2033 
2034 void
2035 pfsync_request_full_update(struct pfsync_softc *sc)
2036 {
2037 	if (sc->sc_sync_ifidx != 0 && ISSET(sc->sc_if.if_flags, IFF_RUNNING)) {
2038 		/* Request a full state table update. */
2039 		sc->sc_ureq_sent = getuptime();
2040 #if NCARP > 0
2041 		if (!sc->sc_link_demoted && pfsync_sync_ok)
2042 			carp_group_demote_adj(&sc->sc_if, 1,
2043 			    "pfsync bulk start");
2044 #endif
2045 		pfsync_sync_ok = 0;
2046 		DPFPRINTF(LOG_INFO, "requesting bulk update");
2047 		PF_LOCK();
2048 		timeout_add(&sc->sc_bulkfail_tmo, 4 * hz +
2049 		    pf_pool_limits[PF_LIMIT_STATES].limit /
2050 		    ((sc->sc_if.if_mtu - PFSYNC_MINPKT) /
2051 		    sizeof(struct pfsync_state)));
2052 		PF_UNLOCK();
2053 		pfsync_request_update(0, 0);
2054 	}
2055 }
2056 
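/*
 * Queue an explicit update request for a single state, or for
 * everything when both ids are zero.  If appending the request would
 * overflow the MTU, the pending packet is flushed and the insert is
 * retried before the pfsync netisr is scheduled.
 */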
2057 void
2058 pfsync_request_update(u_int32_t creatorid, u_int64_t id)
2059 {
2060 	struct pfsync_softc *sc = pfsyncif;
2061 	struct pfsync_upd_req_item *item;
2062 	size_t nlen, sclen;
2063 	int retry;
2064 
2065 	/*
2066 	 * This code does nothing to prevent multiple update requests
2067 	 * for the same state from being generated.
2068 	 */
2069 
2070 	item = pool_get(&sc->sc_pool, PR_NOWAIT);
2071 	if (item == NULL) {
2072 		/* XXX stats */
2073 		return;
2074 	}
2075 
2076 	item->ur_msg.id = id;
2077 	item->ur_msg.creatorid = creatorid;
2078 
2079 	for (;;) {
2080 		mtx_enter(&sc->sc_upd_req_mtx);
2081 
2082 		nlen = sizeof(struct pfsync_upd_req);
2083 		if (TAILQ_EMPTY(&sc->sc_upd_req_list))
2084 			nlen += sizeof(struct pfsync_subheader);
2085 
2086 		sclen = atomic_add_long_nv(&sc->sc_len, nlen);
2087 		retry = (sclen > sc->sc_if.if_mtu);
2088 		if (retry)
2089 			atomic_sub_long(&sc->sc_len, nlen);
2090 		else
2091 			TAILQ_INSERT_TAIL(&sc->sc_upd_req_list, item, ur_entry);
2092 
2093 		mtx_leave(&sc->sc_upd_req_mtx);
2094 
2095 		if (!retry)
2096 			break;
2097 
2098 		pfsync_sendout();
2099 	}
2100 
2101 	schednetisr(NETISR_PFSYNC);
2102 }
2103 
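/*
 * Queue a full (uncompressed) update for a state a peer explicitly
 * asked about, e.g. while walking the table for a bulk update.
 */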
2104 void
2105 pfsync_update_state_req(struct pf_state *st)
2106 {
2107 	struct pfsync_softc *sc = pfsyncif;
2108 
2109 	if (sc == NULL)
2110 		panic("pfsync_update_state_req: nonexistent instance");
2111 
2112 	if (ISSET(st->state_flags, PFSTATE_NOSYNC)) {
2113 		if (st->sync_state != PFSYNC_S_NONE)
2114 			pfsync_q_del(st);
2115 		return;
2116 	}
2117 
2118 	switch (st->sync_state) {
2119 	case PFSYNC_S_UPD_C:
2120 	case PFSYNC_S_IACK:
2121 		pfsync_q_del(st);
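		/* FALLTHROUGH */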
2122 	case PFSYNC_S_NONE:
2123 		pfsync_q_ins(st, PFSYNC_S_UPD);
2124 		schednetisr(NETISR_PFSYNC);
2125 		return;
2126 
2127 	case PFSYNC_S_INS:
2128 	case PFSYNC_S_UPD:
2129 	case PFSYNC_S_DEL:
2130 		/* we're already handling it */
2131 		return;
2132 
2133 	default:
2134 		panic("pfsync_update_state_req: unexpected sync state %d",
2135 		    st->sync_state);
2136 	}
2137 }
2138 
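/*
 * Tell the peers a state is gone.  A state that was never announced
 * (still queued for insert) is simply dropped from the queues;
 * anything else ends up on the delete queue, with the reference
 * handling described in the switch below.
 */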
2139 void
2140 pfsync_delete_state(struct pf_state *st)
2141 {
2142 	struct pfsync_softc *sc = pfsyncif;
2143 
2144 	NET_ASSERT_LOCKED();
2145 
2146 	if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING))
2147 		return;
2148 
2149 	if (ISSET(st->state_flags, PFSTATE_ACK))
2150 		pfsync_deferred(st, 1);
2151 	if (ISSET(st->state_flags, PFSTATE_NOSYNC)) {
2152 		if (st->sync_state != PFSYNC_S_NONE)
2153 			pfsync_q_del(st);
2154 		return;
2155 	}
2156 
2157 	if (sc->sc_len == PFSYNC_MINPKT)
2158 		timeout_add_sec(&sc->sc_tmo, 1);
2159 
2160 	switch (st->sync_state) {
2161 	case PFSYNC_S_INS:
2162 		/* we never got to tell the world so just forget about it */
2163 		pfsync_q_del(st);
2164 		return;
2165 
2166 	case PFSYNC_S_UPD_C:
2167 	case PFSYNC_S_UPD:
2168 	case PFSYNC_S_IACK:
2169 		pfsync_q_del(st);
2170 		/*
2171 		 * FALLTHROUGH to putting it on the del list
2172 		 * Note on reference count bookkeeping:
2173 		 *	pfsync_q_del() drops the reference held for queue
2174 		 *	ownership, but st itself survives because our
2175 		 *	caller still holds a reference.
2176 		 */
2177 
2178 	case PFSYNC_S_NONE:
2179 		/*
2180 		 * Whether we fell through from above or arrived here
2181 		 * directly, the pfsync queues hold no reference to st.
2182 		 *
2183 		 * pfsync_q_ins() puts st on the delete queue and grabs
2184 		 * a reference for that queue.
2185 		 */
2186 		pfsync_q_ins(st, PFSYNC_S_DEL);
2187 		return;
2188 
2189 	default:
2190 		panic("pfsync_delete_state: unexpected sync state %d",
2191 		    st->sync_state);
2192 	}
2193 }
2194 
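/*
 * Announce that all states created by `creatorid' on `ifname' have
 * been cleared: build a PFSYNC_ACT_CLR message and push it out
 * immediately via pfsync_send_plus().
 */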
2195 void
2196 pfsync_clear_states(u_int32_t creatorid, const char *ifname)
2197 {
2198 	struct pfsync_softc *sc = pfsyncif;
2199 	struct {
2200 		struct pfsync_subheader subh;
2201 		struct pfsync_clr clr;
2202 	} __packed r;
2203 
2204 	NET_ASSERT_LOCKED();
2205 
2206 	if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING))
2207 		return;
2208 
2209 	bzero(&r, sizeof(r));
2210 
2211 	r.subh.action = PFSYNC_ACT_CLR;
2212 	r.subh.len = sizeof(struct pfsync_clr) >> 2;
2213 	r.subh.count = htons(1);
2214 
2215 	strlcpy(r.clr.ifname, ifname, sizeof(r.clr.ifname));
2216 	r.clr.creatorid = creatorid;
2217 
2218 	pfsync_send_plus(&r, sizeof(r));
2219 }
2220 
2221 void
2222 pfsync_iack(struct pf_state *st)
2223 {
2224 	pfsync_q_ins(st, PFSYNC_S_IACK);
2225 	schednetisr(NETISR_PFSYNC);
2226 }
2227 
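/*
 * Put st on queue q.  The queue takes its own reference on the state.
 * sc_len is grown by the queue entry (plus a subheader when the queue
 * was empty); if that would exceed the MTU, the accounting is rolled
 * back and the pending packet is flushed instead.
 */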
2228 void
2229 pfsync_q_ins(struct pf_state *st, int q)
2230 {
2231 	struct pfsync_softc *sc = pfsyncif;
2232 	size_t nlen, sclen;
2233 
2234 	if (sc->sc_len < PFSYNC_MINPKT)
2235 		panic("pfsync pkt len is too low %zu", sc->sc_len);
2236 	do {
2237 		mtx_enter(&sc->sc_st_mtx);
2238 		mtx_enter(&st->mtx);
2239 
2240 		/*
2241 		 * Either two threads are trying to update the same
2242 		 * state, or the state is just being processed (it is
2243 		 * on the snapshot queue).
2244 		 */
2245 		if (st->sync_state != PFSYNC_S_NONE) {
2246 			mtx_leave(&st->mtx);
2247 			mtx_leave(&sc->sc_st_mtx);
2248 			break;
2249 		}
2250 
2251 		nlen = pfsync_qs[q].len;
2252 
2253 		if (TAILQ_EMPTY(&sc->sc_qs[q]))
2254 			nlen += sizeof(struct pfsync_subheader);
2255 
2256 		sclen = atomic_add_long_nv(&sc->sc_len, nlen);
2257 		if (sclen > sc->sc_if.if_mtu) {
2258 			atomic_sub_long(&sc->sc_len, nlen);
2259 			mtx_leave(&st->mtx);
2260 			mtx_leave(&sc->sc_st_mtx);
2261 			pfsync_sendout();
2262 			continue;
2263 		}
2264 
2265 		pf_state_ref(st);
2266 
2267 		TAILQ_INSERT_TAIL(&sc->sc_qs[q], st, sync_list);
2268 		st->sync_state = q;
2269 		mtx_leave(&st->mtx);
2270 		mtx_leave(&sc->sc_st_mtx);
2271 	} while (0);
2272 }
2273 
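/*
 * Take st off whatever pfsync queue it is on and drop the reference
 * the queue held.  States that have already been snapped for an
 * outgoing packet are left alone.
 */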
2274 void
2275 pfsync_q_del(struct pf_state *st)
2276 {
2277 	struct pfsync_softc *sc = pfsyncif;
2278 	int q;
2279 
2280 	mtx_enter(&sc->sc_st_mtx);
2281 	mtx_enter(&st->mtx);
2282 	q = st->sync_state;
2283 	/*
2284 	 * Re-check under the mutex: if the state has already been
2285 	 * snapped, just bail out; we came too late, the state is being
2286 	 * processed/dispatched to the peer right now.
2287 	 */
2288 	if ((q == PFSYNC_S_NONE) || (st->snapped)) {
2289 		mtx_leave(&st->mtx);
2290 		mtx_leave(&sc->sc_st_mtx);
2291 		return;
2292 	}
2293 	atomic_sub_long(&sc->sc_len, pfsync_qs[q].len);
2294 	TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
2295 	if (TAILQ_EMPTY(&sc->sc_qs[q]))
2296 		atomic_sub_long(&sc->sc_len, sizeof(struct pfsync_subheader));
2297 	st->sync_state = PFSYNC_S_NONE;
2298 	mtx_leave(&st->mtx);
2299 	mtx_leave(&sc->sc_st_mtx);
2300 
2301 	pf_state_unref(st);
2302 }
2303 
2304 #if defined(IPSEC)
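/*
 * Queue a replay counter update for a TDB.  The first update puts the
 * TDB on sc_tdb_q (taking a reference and setting TDBF_PFSYNC); later
 * calls only count updates and kick the netisr once sc_maxupdates is
 * reached.  TDBF_PFSYNC_RPL records whether the TDB is used for
 * output, so pfsync_out_tdb() knows to pad the replay counter.
 */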
2305 void
2306 pfsync_update_tdb(struct tdb *t, int output)
2307 {
2308 	struct pfsync_softc *sc = pfsyncif;
2309 	size_t nlen, sclen;
2310 
2311 	if (sc == NULL)
2312 		return;
2313 
2314 	if (!ISSET(t->tdb_flags, TDBF_PFSYNC)) {
2315 		do {
2316 			mtx_enter(&sc->sc_tdb_mtx);
2317 			nlen = sizeof(struct pfsync_tdb);
2318 
2319 			mtx_enter(&t->tdb_mtx);
2320 			if (ISSET(t->tdb_flags, TDBF_PFSYNC)) {
2321 				/* we've lost the race, nothing for us to do */
2322 				mtx_leave(&t->tdb_mtx);
2323 				mtx_leave(&sc->sc_tdb_mtx);
2324 				break;
2325 			}
2326 
2327 			if (TAILQ_EMPTY(&sc->sc_tdb_q))
2328 				nlen += sizeof(struct pfsync_subheader);
2329 
2330 			sclen = atomic_add_long_nv(&sc->sc_len, nlen);
2331 			if (sclen > sc->sc_if.if_mtu) {
2332 				atomic_sub_long(&sc->sc_len, nlen);
2333 				mtx_leave(&t->tdb_mtx);
2334 				mtx_leave(&sc->sc_tdb_mtx);
2335 				pfsync_sendout();
2336 				continue;
2337 			}
2338 
2339 			TAILQ_INSERT_TAIL(&sc->sc_tdb_q, t, tdb_sync_entry);
2340 			tdb_ref(t);
2341 			SET(t->tdb_flags, TDBF_PFSYNC);
2342 			mtx_leave(&t->tdb_mtx);
2343 
2344 			mtx_leave(&sc->sc_tdb_mtx);
2345 			t->tdb_updates = 0;
2346 		} while (0);
2347 	} else {
2348 		if (++t->tdb_updates >= sc->sc_maxupdates)
2349 			schednetisr(NETISR_PFSYNC);
2350 	}
2351 
2352 	mtx_enter(&t->tdb_mtx);
2353 	if (output)
2354 		SET(t->tdb_flags, TDBF_PFSYNC_RPL);
2355 	else
2356 		CLR(t->tdb_flags, TDBF_PFSYNC_RPL);
2357 	mtx_leave(&t->tdb_mtx);
2358 }
2359 #endif
2360 
2361 #if defined(IPSEC)
2362 void
2363 pfsync_delete_tdb(struct tdb *t)
2364 {
2365 	struct pfsync_softc *sc = pfsyncif;
2366 	size_t nlen;
2367 
2368 	if (sc == NULL || !ISSET(t->tdb_flags, TDBF_PFSYNC))
2369 		return;
2370 
2371 	mtx_enter(&sc->sc_tdb_mtx);
2372 
2373 	/*
2374 	 * If the tdb entry is just being processed (found in the
2375 	 * snapshot), it cannot be deleted; we just came too late.
2376 	 */
2377 	if (ISSET(t->tdb_flags, TDBF_PFSYNC_SNAPPED)) {
2378 		mtx_leave(&sc->sc_tdb_mtx);
2379 		return;
2380 	}
2381 
2382 	TAILQ_REMOVE(&sc->sc_tdb_q, t, tdb_sync_entry);
2383 
2384 	mtx_enter(&t->tdb_mtx);
2385 	CLR(t->tdb_flags, TDBF_PFSYNC);
2386 	mtx_leave(&t->tdb_mtx);
2387 
2388 	nlen = sizeof(struct pfsync_tdb);
2389 	if (TAILQ_EMPTY(&sc->sc_tdb_q))
2390 		nlen += sizeof(struct pfsync_subheader);
2391 	atomic_sub_long(&sc->sc_len, nlen);
2392 
2393 	mtx_leave(&sc->sc_tdb_mtx);
2394 
2395 	tdb_unref(t);
2396 }
2397 #endif
2398 
2399 void
2400 pfsync_out_tdb(struct tdb *t, void *buf)
2401 {
2402 	struct pfsync_tdb *ut = buf;
2403 
2404 	bzero(ut, sizeof(*ut));
2405 	ut->spi = t->tdb_spi;
2406 	bcopy(&t->tdb_dst, &ut->dst, sizeof(ut->dst));
2407 	/*
2408 	 * When a failover happens, the master's rpl is probably above
2409 	 * what we see here (we may be up to a second late), so
2410 	 * increase it a bit for outbound tdbs to manage most such
2411 	 * situations.
2412 	 *
2413 	 * For now, just add an offset that is likely to be larger
2414 	 * than the number of packets we can see in one second. The RFC
2415 	 * just says the next packet must have a higher seq value.
2416 	 *
2417 	 * XXX What is a good algorithm for this? We could use
2418 	 * a rate-determined increase, but to know it, we would have
2419 	 * to extend struct tdb.
2420 	 * XXX pt->rpl can wrap over MAXINT, but if so the real tdb
2421 	 * XXX ut->rpl can wrap over MAXINT, but if so the real tdb
2422 	 * this edge case.
2423 	 */
2424 #define RPL_INCR 16384
2425 	ut->rpl = htobe64(t->tdb_rpl + (ISSET(t->tdb_flags, TDBF_PFSYNC_RPL) ?
2426 	    RPL_INCR : 0));
2427 	ut->cur_bytes = htobe64(t->tdb_cur_bytes);
2428 	ut->sproto = t->tdb_sproto;
2429 	ut->rdomain = htons(t->tdb_rdomain);
2430 }
2431 
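/*
 * Start servicing a bulk update request from a peer: latch the
 * current head and tail of pf_state_list as the walk boundaries and
 * kick off the walk from sc_bulk_tmo, or report an empty table right
 * away with PFSYNC_BUS_END.
 */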
2432 void
2433 pfsync_bulk_start(void)
2434 {
2435 	struct pfsync_softc *sc = pfsyncif;
2436 
2437 	NET_ASSERT_LOCKED();
2438 
2439 	/*
2440 	 * pf gc via pfsync_state_in_use reads sc_bulk_next and
2441 	 * sc_bulk_last while exclusively holding the pf_state_list
2442 	 * rwlock. Make sure it can't race with us setting these
2443 	 * pointers. They basically act as hazard pointers, borrowing
2444 	 * the list's reference on the states.
2445 	 */
2446 	rw_enter_read(&pf_state_list.pfs_rwl);
2447 
2448 	/* get a consistent view of the list pointers */
2449 	mtx_enter(&pf_state_list.pfs_mtx);
2450 	if (sc->sc_bulk_next == NULL)
2451 		sc->sc_bulk_next = TAILQ_FIRST(&pf_state_list.pfs_list);
2452 
2453 	sc->sc_bulk_last = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
2454 	mtx_leave(&pf_state_list.pfs_mtx);
2455 
2456 	rw_exit_read(&pf_state_list.pfs_rwl);
2457 
2458 	DPFPRINTF(LOG_INFO, "received bulk update request");
2459 
2460 	if (sc->sc_bulk_last == NULL)
2461 		pfsync_bulk_status(PFSYNC_BUS_END);
2462 	else {
2463 		sc->sc_ureq_received = getuptime();
2464 
2465 		pfsync_bulk_status(PFSYNC_BUS_START);
2466 		timeout_add(&sc->sc_bulk_tmo, 0);
2467 	}
2468 }
2469 
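/*
 * Bulk update walker, run from sc_bulk_tmo.  Queues full updates for
 * states last synced before the request arrived, roughly a packet's
 * worth at a time, rescheduling itself until the walk reaches
 * sc_bulk_last and PFSYNC_BUS_END is announced.
 */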
2470 void
2471 pfsync_bulk_update(void *arg)
2472 {
2473 	struct pfsync_softc *sc;
2474 	struct pf_state *st;
2475 	int i = 0;
2476 
2477 	NET_LOCK();
2478 	sc = pfsyncif;
2479 	if (sc == NULL)
2480 		goto out;
2481 
2482 	rw_enter_read(&pf_state_list.pfs_rwl);
2483 	st = sc->sc_bulk_next;
2484 	sc->sc_bulk_next = NULL;
2485 
2486 	if (st == NULL) {
2487 		rw_exit_read(&pf_state_list.pfs_rwl);
2488 		goto out;
2489 	}
2490 
2491 	for (;;) {
2492 		if (st->sync_state == PFSYNC_S_NONE &&
2493 		    st->timeout < PFTM_MAX &&
2494 		    st->pfsync_time <= sc->sc_ureq_received) {
2495 			pfsync_update_state_req(st);
2496 			i++;
2497 		}
2498 
2499 		st = TAILQ_NEXT(st, entry_list);
2500 		if ((st == NULL) || (st == sc->sc_bulk_last)) {
2501 			/* we're done */
2502 			sc->sc_bulk_last = NULL;
2503 			pfsync_bulk_status(PFSYNC_BUS_END);
2504 			break;
2505 		}
2506 
2507 		if (i > 1 && (sc->sc_if.if_mtu - sc->sc_len) <
2508 		    sizeof(struct pfsync_state)) {
2509 			/* we've filled a packet */
2510 			sc->sc_bulk_next = st;
2511 			timeout_add(&sc->sc_bulk_tmo, 1);
2512 			break;
2513 		}
2514 	}
2515 
2516 	rw_exit_read(&pf_state_list.pfs_rwl);
2517  out:
2518 	NET_UNLOCK();
2519 }
2520 
2521 void
2522 pfsync_bulk_status(u_int8_t status)
2523 {
2524 	struct {
2525 		struct pfsync_subheader subh;
2526 		struct pfsync_bus bus;
2527 	} __packed r;
2528 
2529 	struct pfsync_softc *sc = pfsyncif;
2530 
2531 	bzero(&r, sizeof(r));
2532 
2533 	r.subh.action = PFSYNC_ACT_BUS;
2534 	r.subh.len = sizeof(struct pfsync_bus) >> 2;
2535 	r.subh.count = htons(1);
2536 
2537 	r.bus.creatorid = pf_status.hostid;
2538 	r.bus.endtime = htonl(getuptime() - sc->sc_ureq_received);
2539 	r.bus.status = status;
2540 
2541 	pfsync_send_plus(&r, sizeof(r));
2542 }
2543 
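/*
 * Bulk-fail timeout: the requested bulk transfer did not complete in
 * time.  Re-request it up to PFSYNC_MAX_BULKTRIES times; after that,
 * give up, lift any carp demotion and carry on as if the transfer had
 * succeeded.
 */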
2544 void
2545 pfsync_bulk_fail(void *arg)
2546 {
2547 	struct pfsync_softc *sc;
2548 
2549 	NET_LOCK();
2550 	sc = pfsyncif;
2551 	if (sc == NULL)
2552 		goto out;
2553 	if (sc->sc_bulk_tries++ < PFSYNC_MAX_BULKTRIES) {
2554 		/* Try again */
2555 		timeout_add_sec(&sc->sc_bulkfail_tmo, 5);
2556 		pfsync_request_update(0, 0);
2557 	} else {
2558 		/* Pretend like the transfer was ok */
2559 		sc->sc_ureq_sent = 0;
2560 		sc->sc_bulk_tries = 0;
2561 #if NCARP > 0
2562 		if (!pfsync_sync_ok)
2563 			carp_group_demote_adj(&sc->sc_if, -1,
2564 			    sc->sc_link_demoted ?
2565 			    "pfsync link state up" :
2566 			    "pfsync bulk fail");
2567 		if (sc->sc_initial_bulk) {
2568 			carp_group_demote_adj(&sc->sc_if, -32,
2569 			    "pfsync init");
2570 			sc->sc_initial_bulk = 0;
2571 		}
2572 #endif
2573 		pfsync_sync_ok = 1;
2574 		sc->sc_link_demoted = 0;
2575 		DPFPRINTF(LOG_ERR, "failed to receive bulk update");
2576 	}
2577  out:
2578 	NET_UNLOCK();
2579 }
2580 
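/*
 * Append an out-of-band chunk (a clear-states or bulk status message)
 * to the pending packet and transmit it immediately, flushing first
 * if it would not fit within the MTU.
 */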
2581 void
2582 pfsync_send_plus(void *plus, size_t pluslen)
2583 {
2584 	struct pfsync_softc *sc = pfsyncif;
2585 
2586 	if (sc->sc_len + pluslen > sc->sc_if.if_mtu)
2587 		pfsync_sendout();
2588 
2589 	sc->sc_plus = plus;
2590 	sc->sc_pluslen = pluslen;
2591 	atomic_add_long(&sc->sc_len, pluslen);
2592 
2593 	pfsync_sendout();
2594 }
2595 
2596 int
2597 pfsync_is_up(void)
2598 {
2599 	struct pfsync_softc *sc = pfsyncif;
2600 
2601 	if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING))
2602 		return (0);
2603 
2604 	return (1);
2605 }
2606 
2607 int
2608 pfsync_state_in_use(struct pf_state *st)
2609 {
2610 	struct pfsync_softc *sc = pfsyncif;
2611 
2612 	if (sc == NULL)
2613 		return (0);
2614 
2615 	rw_assert_wrlock(&pf_state_list.pfs_rwl);
2616 
2617 	if (st->sync_state != PFSYNC_S_NONE ||
2618 	    st == sc->sc_bulk_next ||
2619 	    st == sc->sc_bulk_last)
2620 		return (1);
2621 
2622 	return (0);
2623 }
2624 
2625 void
2626 pfsync_timeout(void *arg)
2627 {
2628 	NET_LOCK();
2629 	pfsync_sendout();
2630 	NET_UNLOCK();
2631 }
2632 
2633 /* this is a softnet/netisr handler */
2634 void
2635 pfsyncintr(void)
2636 {
2637 	pfsync_sendout();
2638 }
2639 
2640 int
2641 pfsync_sysctl_pfsyncstat(void *oldp, size_t *oldlenp, void *newp)
2642 {
2643 	struct pfsyncstats pfsyncstat;
2644 
2645 	CTASSERT(sizeof(pfsyncstat) == (pfsyncs_ncounters * sizeof(uint64_t)));
2646 	memset(&pfsyncstat, 0, sizeof pfsyncstat);
2647 	counters_read(pfsynccounters, (uint64_t *)&pfsyncstat,
2648 	    pfsyncs_ncounters);
2649 	return (sysctl_rdstruct(oldp, oldlenp, newp,
2650 	    &pfsyncstat, sizeof(pfsyncstat)));
2651 }
2652 
2653 int
2654 pfsync_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
2655     size_t newlen)
2656 {
2657 	/* All sysctl names at this level are terminal. */
2658 	if (namelen != 1)
2659 		return (ENOTDIR);
2660 
2661 	switch (name[0]) {
2662 	case PFSYNCCTL_STATS:
2663 		return (pfsync_sysctl_pfsyncstat(oldp, oldlenp, newp));
2664 	default:
2665 		return (ENOPROTOOPT);
2666 	}
2667 }
2668