/*	$OpenBSD: if_pfsync.c,v 1.129 2009/09/28 03:01:23 dlg Exp $	*/

/*
 * Copyright (c) 2002 Michael Shalayeff
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 2009 David Gwynne <dlg@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/timeout.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/pool.h>

#include <net/if.h>
#include <net/if_types.h>
#include <net/route.h>
#include <net/bpf.h>
#include <net/netisr.h>
#include <netinet/in.h>
#include <netinet/if_ether.h>
#include <netinet/tcp.h>
#include <netinet/tcp_seq.h>

#ifdef	INET
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#endif

#ifdef INET6
#include <netinet6/nd6.h>
#endif /* INET6 */

#include "carp.h"
#if NCARP > 0
#include <netinet/ip_carp.h>
#endif

#include <net/pfvar.h>
#include <net/if_pfsync.h>

#include "bpfilter.h"
#include "pfsync.h"

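/*
 * The smallest useful pfsync packet: an IP header, the pfsync header
 * and a single subheader.  A full packet on the wire looks like
 *
 *	ip | pfsync_header | subheader | messages ... | subheader(EOF)
 *
 * sc_len is always accounted against this baseline.
 */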
#define PFSYNC_MINPKT ( \
	sizeof(struct ip) + \
	sizeof(struct pfsync_header) + \
	sizeof(struct pfsync_subheader))

struct pfsync_pkt {
	struct ip *ip;
	struct in_addr src;
	u_int8_t flags;
};

int	pfsync_upd_tcp(struct pf_state *, struct pfsync_state_peer *,
	    struct pfsync_state_peer *);

int	pfsync_in_clr(struct pfsync_pkt *, struct mbuf *, int, int);
int	pfsync_in_ins(struct pfsync_pkt *, struct mbuf *, int, int);
int	pfsync_in_iack(struct pfsync_pkt *, struct mbuf *, int, int);
int	pfsync_in_upd(struct pfsync_pkt *, struct mbuf *, int, int);
int	pfsync_in_upd_c(struct pfsync_pkt *, struct mbuf *, int, int);
int	pfsync_in_ureq(struct pfsync_pkt *, struct mbuf *, int, int);
int	pfsync_in_del(struct pfsync_pkt *, struct mbuf *, int, int);
int	pfsync_in_del_c(struct pfsync_pkt *, struct mbuf *, int, int);
int	pfsync_in_bus(struct pfsync_pkt *, struct mbuf *, int, int);
int	pfsync_in_tdb(struct pfsync_pkt *, struct mbuf *, int, int);
int	pfsync_in_eof(struct pfsync_pkt *, struct mbuf *, int, int);

int	pfsync_in_error(struct pfsync_pkt *, struct mbuf *, int, int);

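/*
 * Input handlers, indexed by the action field of the subheader.  Each
 * handler parses "count" messages starting at "offset" into the mbuf
 * and returns the number of bytes it consumed, or -1 if it took over
 * (and freed) the mbuf.
 */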
int	(*pfsync_acts[])(struct pfsync_pkt *, struct mbuf *, int, int) = {
	pfsync_in_clr,			/* PFSYNC_ACT_CLR */
	pfsync_in_ins,			/* PFSYNC_ACT_INS */
	pfsync_in_iack,			/* PFSYNC_ACT_INS_ACK */
	pfsync_in_upd,			/* PFSYNC_ACT_UPD */
	pfsync_in_upd_c,		/* PFSYNC_ACT_UPD_C */
	pfsync_in_ureq,			/* PFSYNC_ACT_UPD_REQ */
	pfsync_in_del,			/* PFSYNC_ACT_DEL */
	pfsync_in_del_c,		/* PFSYNC_ACT_DEL_C */
	pfsync_in_error,		/* PFSYNC_ACT_INS_F */
	pfsync_in_error,		/* PFSYNC_ACT_DEL_F */
	pfsync_in_bus,			/* PFSYNC_ACT_BUS */
	pfsync_in_tdb,			/* PFSYNC_ACT_TDB */
	pfsync_in_eof			/* PFSYNC_ACT_EOF */
};

struct pfsync_q {
	void		(*write)(struct pf_state *, void *);
	size_t		len;
	u_int8_t	action;
};

/* we have one of these for every PFSYNC_S_ */
void	pfsync_out_state(struct pf_state *, void *);
void	pfsync_out_iack(struct pf_state *, void *);
void	pfsync_out_upd_c(struct pf_state *, void *);
void	pfsync_out_del(struct pf_state *, void *);

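/*
 * Output queue descriptors, indexed by PFSYNC_S_* sync state.  Each
 * entry names the serialisation routine, the on-wire message size,
 * and the action the messages are sent under.
 */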
struct pfsync_q pfsync_qs[] = {
	{ pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_INS },
	{ pfsync_out_iack,  sizeof(struct pfsync_ins_ack), PFSYNC_ACT_INS_ACK },
	{ pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_UPD },
	{ pfsync_out_upd_c, sizeof(struct pfsync_upd_c),   PFSYNC_ACT_UPD_C },
	{ pfsync_out_del,   sizeof(struct pfsync_del_c),   PFSYNC_ACT_DEL_C }
};

void	pfsync_q_ins(struct pf_state *, int);
void	pfsync_q_del(struct pf_state *);

struct pfsync_upd_req_item {
	TAILQ_ENTRY(pfsync_upd_req_item)	ur_entry;
	struct pfsync_upd_req			ur_msg;
};
TAILQ_HEAD(pfsync_upd_reqs, pfsync_upd_req_item);

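/*
 * A deferral holds on to the initial packet of a new state until the
 * peer acknowledges the state insert (or a timeout fires), so the
 * peer sees the state before any reply traffic does.
 */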
struct pfsync_deferral {
	TAILQ_ENTRY(pfsync_deferral)		 pd_entry;
	struct pf_state				*pd_st;
	struct mbuf				*pd_m;
	struct timeout				 pd_tmo;
};
TAILQ_HEAD(pfsync_deferrals, pfsync_deferral);

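/*
 * sc_pool backs both update request items and deferrals, so the item
 * size must fit the larger of the two structures.
 */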
#define PFSYNC_PLSIZE	MAX(sizeof(struct pfsync_upd_req_item), \
			    sizeof(struct pfsync_deferral))

void	pfsync_out_tdb(struct tdb *, void *);

struct pfsync_softc {
	struct ifnet		 sc_if;
	struct ifnet		*sc_sync_if;

	struct pool		 sc_pool;

	struct ip_moptions	 sc_imo;

	struct in_addr		 sc_sync_peer;
	u_int8_t		 sc_maxupdates;

	struct ip		 sc_template;

	struct pf_state_queue	 sc_qs[PFSYNC_S_COUNT];
	size_t			 sc_len;

	struct pfsync_upd_reqs	 sc_upd_req_list;

	int			 sc_defer;
	struct pfsync_deferrals	 sc_deferrals;
	u_int			 sc_deferred;

	void			*sc_plus;
	size_t			 sc_pluslen;

	u_int32_t		 sc_ureq_sent;
	int			 sc_bulk_tries;
	struct timeout		 sc_bulkfail_tmo;

	u_int32_t		 sc_ureq_received;
	struct pf_state		*sc_bulk_next;
	struct pf_state		*sc_bulk_last;
	struct timeout		 sc_bulk_tmo;

	TAILQ_HEAD(, tdb)	 sc_tdb_q;

	struct timeout		 sc_tmo;
};

struct pfsync_softc	*pfsyncif = NULL;
struct pfsyncstats	 pfsyncstats;

void	pfsyncattach(int);
int	pfsync_clone_create(struct if_clone *, int);
int	pfsync_clone_destroy(struct ifnet *);
int	pfsync_alloc_scrub_memory(struct pfsync_state_peer *,
	    struct pf_state_peer *);
void	pfsync_update_net_tdb(struct pfsync_tdb *);
int	pfsyncoutput(struct ifnet *, struct mbuf *, struct sockaddr *,
	    struct rtentry *);
int	pfsyncioctl(struct ifnet *, u_long, caddr_t);
void	pfsyncstart(struct ifnet *);

struct mbuf *pfsync_if_dequeue(struct ifnet *);

void	pfsync_deferred(struct pf_state *, int);
void	pfsync_undefer(struct pfsync_deferral *, int);
void	pfsync_defer_tmo(void *);

void	pfsync_request_update(u_int32_t, u_int64_t);
void	pfsync_update_state_req(struct pf_state *);

void	pfsync_drop(struct pfsync_softc *);
void	pfsync_sendout(void);
void	pfsync_send_plus(void *, size_t);
void	pfsync_timeout(void *);
void	pfsync_tdb_timeout(void *);

void	pfsync_bulk_start(void);
void	pfsync_bulk_status(u_int8_t);
void	pfsync_bulk_update(void *);
void	pfsync_bulk_fail(void *);

#define PFSYNC_MAX_BULKTRIES	12
int	pfsync_sync_ok;

struct if_clone	pfsync_cloner =
    IF_CLONE_INITIALIZER("pfsync", pfsync_clone_create, pfsync_clone_destroy);

void
pfsyncattach(int npfsync)
{
	if_clone_attach(&pfsync_cloner);
}

int
pfsync_clone_create(struct if_clone *ifc, int unit)
{
	struct pfsync_softc *sc;
	struct ifnet *ifp;
	int q;

	if (unit != 0)
		return (EINVAL);

	pfsync_sync_ok = 1;

	sc = malloc(sizeof(*pfsyncif), M_DEVBUF, M_NOWAIT | M_ZERO);
	if (sc == NULL)
		return (ENOMEM);

	for (q = 0; q < PFSYNC_S_COUNT; q++)
		TAILQ_INIT(&sc->sc_qs[q]);

	pool_init(&sc->sc_pool, PFSYNC_PLSIZE, 0, 0, 0, "pfsync", NULL);
	TAILQ_INIT(&sc->sc_upd_req_list);
	TAILQ_INIT(&sc->sc_deferrals);
	sc->sc_deferred = 0;

	TAILQ_INIT(&sc->sc_tdb_q);

	sc->sc_len = PFSYNC_MINPKT;
	sc->sc_maxupdates = 128;

	sc->sc_imo.imo_membership = (struct in_multi **)malloc(
	    (sizeof(struct in_multi *) * IP_MIN_MEMBERSHIPS), M_IPMOPTS,
	    M_WAITOK | M_ZERO);
	sc->sc_imo.imo_max_memberships = IP_MIN_MEMBERSHIPS;

	ifp = &sc->sc_if;
	snprintf(ifp->if_xname, sizeof ifp->if_xname, "pfsync%d", unit);
	ifp->if_softc = sc;
	ifp->if_ioctl = pfsyncioctl;
	ifp->if_output = pfsyncoutput;
	ifp->if_start = pfsyncstart;
	ifp->if_type = IFT_PFSYNC;
	ifp->if_snd.ifq_maxlen = ifqmaxlen;
	ifp->if_hdrlen = sizeof(struct pfsync_header);
	ifp->if_mtu = 1500; /* XXX */
	ifp->if_hardmtu = MCLBYTES; /* XXX */
	timeout_set(&sc->sc_tmo, pfsync_timeout, sc);
	timeout_set(&sc->sc_bulk_tmo, pfsync_bulk_update, sc);
	timeout_set(&sc->sc_bulkfail_tmo, pfsync_bulk_fail, sc);

	if_attach(ifp);
	if_alloc_sadl(ifp);

#if NCARP > 0
	if_addgroup(ifp, "carp");
#endif

#if NBPFILTER > 0
	bpfattach(&sc->sc_if.if_bpf, ifp, DLT_PFSYNC, PFSYNC_HDRLEN);
#endif

	pfsyncif = sc;

	return (0);
}

int
pfsync_clone_destroy(struct ifnet *ifp)
{
	struct pfsync_softc *sc = ifp->if_softc;

	timeout_del(&sc->sc_bulk_tmo);
	timeout_del(&sc->sc_tmo);
#if NCARP > 0
	if (!pfsync_sync_ok)
		carp_group_demote_adj(&sc->sc_if, -1);
#endif
#if NBPFILTER > 0
	bpfdetach(ifp);
#endif
	if_detach(ifp);

	pfsync_drop(sc);

	while (sc->sc_deferred > 0)
		pfsync_undefer(TAILQ_FIRST(&sc->sc_deferrals), 0);

	pool_destroy(&sc->sc_pool);
	free(sc->sc_imo.imo_membership, M_IPMOPTS);
	free(sc, M_DEVBUF);

	pfsyncif = NULL;

	return (0);
}

struct mbuf *
pfsync_if_dequeue(struct ifnet *ifp)
{
	struct mbuf *m;

	IF_DEQUEUE(&ifp->if_snd, m);

	return (m);
}

/*
 * Start output on the pfsync interface.
 */
void
pfsyncstart(struct ifnet *ifp)
{
	struct mbuf *m;
	int s;

	s = splnet();
	while ((m = pfsync_if_dequeue(ifp)) != NULL) {
		IF_DROP(&ifp->if_snd);
		m_freem(m);
	}
	splx(s);
}

int
pfsync_alloc_scrub_memory(struct pfsync_state_peer *s,
    struct pf_state_peer *d)
{
	if (s->scrub.scrub_flag && d->scrub == NULL) {
		d->scrub = pool_get(&pf_state_scrub_pl, PR_NOWAIT | PR_ZERO);
		if (d->scrub == NULL)
			return (ENOMEM);
	}

	return (0);
}

void
pfsync_state_export(struct pfsync_state *sp, struct pf_state *st)
{
	bzero(sp, sizeof(struct pfsync_state));

	/* copy from state key */
	sp->key[PF_SK_WIRE].addr[0] = st->key[PF_SK_WIRE]->addr[0];
	sp->key[PF_SK_WIRE].addr[1] = st->key[PF_SK_WIRE]->addr[1];
	sp->key[PF_SK_WIRE].port[0] = st->key[PF_SK_WIRE]->port[0];
	sp->key[PF_SK_WIRE].port[1] = st->key[PF_SK_WIRE]->port[1];
	sp->key[PF_SK_STACK].addr[0] = st->key[PF_SK_STACK]->addr[0];
	sp->key[PF_SK_STACK].addr[1] = st->key[PF_SK_STACK]->addr[1];
	sp->key[PF_SK_STACK].port[0] = st->key[PF_SK_STACK]->port[0];
	sp->key[PF_SK_STACK].port[1] = st->key[PF_SK_STACK]->port[1];
	sp->proto = st->key[PF_SK_WIRE]->proto;
	sp->af = st->key[PF_SK_WIRE]->af;

	/* copy from state */
	strlcpy(sp->ifname, st->kif->pfik_name, sizeof(sp->ifname));
	bcopy(&st->rt_addr, &sp->rt_addr, sizeof(sp->rt_addr));
	sp->creation = htonl(time_second - st->creation);
	sp->expire = pf_state_expires(st);
	if (sp->expire <= time_second)
		sp->expire = htonl(0);
	else
		sp->expire = htonl(sp->expire - time_second);

	sp->direction = st->direction;
	sp->log = st->log;
	sp->timeout = st->timeout;
	sp->state_flags = st->state_flags;
	if (st->src_node)
		sp->sync_flags |= PFSYNC_FLAG_SRCNODE;
	if (st->nat_src_node)
		sp->sync_flags |= PFSYNC_FLAG_NATSRCNODE;

	bcopy(&st->id, &sp->id, sizeof(sp->id));
	sp->creatorid = st->creatorid;
	pf_state_peer_hton(&st->src, &sp->src);
	pf_state_peer_hton(&st->dst, &sp->dst);

	if (st->rule.ptr == NULL)
		sp->rule = htonl(-1);
	else
		sp->rule = htonl(st->rule.ptr->nr);
	if (st->anchor.ptr == NULL)
		sp->anchor = htonl(-1);
	else
		sp->anchor = htonl(st->anchor.ptr->nr);
	if (st->nat_rule.ptr == NULL)
		sp->nat_rule = htonl(-1);
	else
		sp->nat_rule = htonl(st->nat_rule.ptr->nr);

	pf_state_counter_hton(st->packets[0], sp->packets[0]);
	pf_state_counter_hton(st->packets[1], sp->packets[1]);
	pf_state_counter_hton(st->bytes[0], sp->bytes[0]);
	pf_state_counter_hton(st->bytes[1], sp->bytes[1]);
}

int
pfsync_state_import(struct pfsync_state *sp, u_int8_t flags)
{
	struct pf_state	*st = NULL;
	struct pf_state_key *skw = NULL, *sks = NULL;
	struct pf_rule *r = NULL;
	struct pfi_kif	*kif;
	int pool_flags;
	int error;

	if (sp->creatorid == 0 && pf_status.debug >= PF_DEBUG_MISC) {
		printf("pfsync_state_import: invalid creator id:"
		    " %08x\n", ntohl(sp->creatorid));
		return (EINVAL);
	}

	if ((kif = pfi_kif_get(sp->ifname)) == NULL) {
		if (pf_status.debug >= PF_DEBUG_MISC)
			printf("pfsync_state_import: "
			    "unknown interface: %s\n", sp->ifname);
		if (flags & PFSYNC_SI_IOCTL)
			return (EINVAL);
		return (0);	/* skip this state */
	}

	/*
	 * If the ruleset checksums match or the state is coming from the ioctl,
	 * it's safe to associate the state with the rule of that number.
	 */
	if (sp->rule != htonl(-1) && sp->anchor == htonl(-1) &&
	    (flags & (PFSYNC_SI_IOCTL | PFSYNC_SI_CKSUM)) && ntohl(sp->rule) <
	    pf_main_ruleset.rules[PF_RULESET_FILTER].active.rcount)
		r = pf_main_ruleset.rules[
		    PF_RULESET_FILTER].active.ptr_array[ntohl(sp->rule)];
	else
		r = &pf_default_rule;

	if (r->max_states && r->states_cur >= r->max_states)
		goto cleanup;

	if (flags & PFSYNC_SI_IOCTL)
		pool_flags = PR_WAITOK | PR_LIMITFAIL | PR_ZERO;
	else
		pool_flags = PR_LIMITFAIL | PR_ZERO;

	if ((st = pool_get(&pf_state_pl, pool_flags)) == NULL)
		goto cleanup;

	if ((skw = pf_alloc_state_key(pool_flags)) == NULL)
		goto cleanup;

	if (PF_ANEQ(&sp->key[PF_SK_WIRE].addr[0],
	    &sp->key[PF_SK_STACK].addr[0], sp->af) ||
	    PF_ANEQ(&sp->key[PF_SK_WIRE].addr[1],
	    &sp->key[PF_SK_STACK].addr[1], sp->af) ||
	    sp->key[PF_SK_WIRE].port[0] != sp->key[PF_SK_STACK].port[0] ||
	    sp->key[PF_SK_WIRE].port[1] != sp->key[PF_SK_STACK].port[1]) {
		if ((sks = pf_alloc_state_key(pool_flags)) == NULL)
			goto cleanup;
	} else
		sks = skw;

	/* allocate memory for scrub info */
	if (pfsync_alloc_scrub_memory(&sp->src, &st->src) ||
	    pfsync_alloc_scrub_memory(&sp->dst, &st->dst))
		goto cleanup;

	/* copy to state key(s) */
	skw->addr[0] = sp->key[PF_SK_WIRE].addr[0];
	skw->addr[1] = sp->key[PF_SK_WIRE].addr[1];
	skw->port[0] = sp->key[PF_SK_WIRE].port[0];
	skw->port[1] = sp->key[PF_SK_WIRE].port[1];
	skw->proto = sp->proto;
	skw->af = sp->af;
	if (sks != skw) {
		sks->addr[0] = sp->key[PF_SK_STACK].addr[0];
		sks->addr[1] = sp->key[PF_SK_STACK].addr[1];
		sks->port[0] = sp->key[PF_SK_STACK].port[0];
		sks->port[1] = sp->key[PF_SK_STACK].port[1];
		sks->proto = sp->proto;
		sks->af = sp->af;
	}

	/* copy to state */
	bcopy(&sp->rt_addr, &st->rt_addr, sizeof(st->rt_addr));
	st->creation = time_second - ntohl(sp->creation);
	st->expire = time_second;
	if (sp->expire) {
		/* XXX No adaptive scaling. */
		st->expire -= r->timeout[sp->timeout] - ntohl(sp->expire);
	}

	st->direction = sp->direction;
	st->log = sp->log;
	st->timeout = sp->timeout;
	st->state_flags = sp->state_flags;

	bcopy(sp->id, &st->id, sizeof(st->id));
	st->creatorid = sp->creatorid;
	pf_state_peer_ntoh(&sp->src, &st->src);
	pf_state_peer_ntoh(&sp->dst, &st->dst);

	st->rule.ptr = r;
	st->nat_rule.ptr = NULL;
	st->anchor.ptr = NULL;
	st->rt_kif = NULL;

	st->pfsync_time = time_uptime;
	st->sync_state = PFSYNC_S_NONE;

	/* XXX when we have nat_rule/anchors, use STATE_INC_COUNTERS */
	r->states_cur++;
	r->states_tot++;

	if (!ISSET(flags, PFSYNC_SI_IOCTL))
		SET(st->state_flags, PFSTATE_NOSYNC);

	if (pf_state_insert(kif, skw, sks, st) != 0) {
		/* XXX when we have nat_rule/anchors, use STATE_DEC_COUNTERS */
		r->states_cur--;
		error = EEXIST;
		goto cleanup_state;
	}

	if (!ISSET(flags, PFSYNC_SI_IOCTL)) {
		CLR(st->state_flags, PFSTATE_NOSYNC);
		if (ISSET(st->state_flags, PFSTATE_ACK)) {
			pfsync_q_ins(st, PFSYNC_S_IACK);
			schednetisr(NETISR_PFSYNC);
		}
	}
	CLR(st->state_flags, PFSTATE_ACK);

	return (0);

 cleanup:
	error = ENOMEM;
	if (skw == sks)
		sks = NULL;
	if (skw != NULL)
		pool_put(&pf_state_key_pl, skw);
	if (sks != NULL)
		pool_put(&pf_state_key_pl, sks);

 cleanup_state:	/* pf_state_insert frees the state keys */
	if (st) {
		if (st->dst.scrub)
			pool_put(&pf_state_scrub_pl, st->dst.scrub);
		if (st->src.scrub)
			pool_put(&pf_state_scrub_pl, st->src.scrub);
		pool_put(&pf_state_pl, st);
	}
	return (error);
}

void
pfsync_input(struct mbuf *m, ...)
{
	struct pfsync_softc *sc = pfsyncif;
	struct pfsync_pkt pkt;
	struct ip *ip = mtod(m, struct ip *);
	struct pfsync_header *ph;
	struct pfsync_subheader subh;

	int offset, len;
	int rv;

	pfsyncstats.pfsyncs_ipackets++;

	/* verify that we have a sync interface configured */
	if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING) ||
	    sc->sc_sync_if == NULL || !pf_status.running)
		goto done;

	/* verify that the packet came in on the right interface */
	if (sc->sc_sync_if != m->m_pkthdr.rcvif) {
		pfsyncstats.pfsyncs_badif++;
		goto done;
	}

	sc->sc_if.if_ipackets++;
	sc->sc_if.if_ibytes += m->m_pkthdr.len;

	/* verify that the IP TTL is 255. */
	if (ip->ip_ttl != PFSYNC_DFLTTL) {
		pfsyncstats.pfsyncs_badttl++;
		goto done;
	}

	offset = ip->ip_hl << 2;
	if (m->m_pkthdr.len < offset + sizeof(*ph)) {
		pfsyncstats.pfsyncs_hdrops++;
		goto done;
	}

	if (offset + sizeof(*ph) > m->m_len) {
		if (m_pullup(m, offset + sizeof(*ph)) == NULL) {
			pfsyncstats.pfsyncs_hdrops++;
			return;
		}
		ip = mtod(m, struct ip *);
	}
	ph = (struct pfsync_header *)((char *)ip + offset);

	/* verify the version */
	if (ph->version != PFSYNC_VERSION) {
		pfsyncstats.pfsyncs_badver++;
		goto done;
	}
	len = ntohs(ph->len) + offset;
	if (m->m_pkthdr.len < len) {
		pfsyncstats.pfsyncs_badlen++;
		goto done;
	}

	/* Cheaper to grab this now than having to mess with mbufs later */
	pkt.ip = ip;
	pkt.src = ip->ip_src;
	pkt.flags = 0;

	if (!bcmp(&ph->pfcksum, &pf_status.pf_chksum, PF_MD5_DIGEST_LENGTH))
		pkt.flags |= PFSYNC_SI_CKSUM;

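	/*
	 * Process the subheaders: each carries an action and a message
	 * count, and the handler for that action consumes that many
	 * messages from the packet.
	 */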
	offset += sizeof(*ph);
	while (offset <= len - sizeof(subh)) {
		m_copydata(m, offset, sizeof(subh), (caddr_t)&subh);
		offset += sizeof(subh);

		if (subh.action >= PFSYNC_ACT_MAX) {
			pfsyncstats.pfsyncs_badact++;
			goto done;
		}

		rv = (*pfsync_acts[subh.action])(&pkt, m, offset,
		    ntohs(subh.count));
		if (rv == -1)
			return;

		offset += rv;
	}

done:
	m_freem(m);
}

int
pfsync_in_clr(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct pfsync_clr *clr;
	struct mbuf *mp;
	int len = sizeof(*clr) * count;
	int i, offp;

	struct pf_state *st, *nexts;
	struct pf_state_key *sk, *nextsk;
	struct pf_state_item *si;
	u_int32_t creatorid;
	int s;

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	clr = (struct pfsync_clr *)(mp->m_data + offp);

	s = splsoftnet();
	for (i = 0; i < count; i++) {
		creatorid = clr[i].creatorid;

		if (clr[i].ifname[0] == '\0') {
			for (st = RB_MIN(pf_state_tree_id, &tree_id);
			    st; st = nexts) {
				nexts = RB_NEXT(pf_state_tree_id, &tree_id, st);
				if (st->creatorid == creatorid) {
					SET(st->state_flags, PFSTATE_NOSYNC);
					pf_unlink_state(st);
				}
			}
		} else {
			if (pfi_kif_get(clr[i].ifname) == NULL)
				continue;

			/* XXX correct? */
			for (sk = RB_MIN(pf_state_tree, &pf_statetbl);
			    sk; sk = nextsk) {
				nextsk = RB_NEXT(pf_state_tree,
				    &pf_statetbl, sk);
				TAILQ_FOREACH(si, &sk->states, entry) {
					if (si->s->creatorid == creatorid) {
						SET(si->s->state_flags,
						    PFSTATE_NOSYNC);
						pf_unlink_state(si->s);
					}
				}
			}
		}
	}
	splx(s);

	return (len);
}

int
pfsync_in_ins(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct mbuf *mp;
	struct pfsync_state *sa, *sp;
	int len = sizeof(*sp) * count;
	int i, offp;

	int s;

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	sa = (struct pfsync_state *)(mp->m_data + offp);

	s = splsoftnet();
	for (i = 0; i < count; i++) {
		sp = &sa[i];

		/* check for invalid values */
		if (sp->timeout >= PFTM_MAX ||
		    sp->src.state > PF_TCPS_PROXY_DST ||
		    sp->dst.state > PF_TCPS_PROXY_DST ||
		    sp->direction > PF_OUT ||
		    (sp->af != AF_INET && sp->af != AF_INET6)) {
			if (pf_status.debug >= PF_DEBUG_MISC) {
				printf("pfsync_input: PFSYNC_ACT_INS: "
				    "invalid value\n");
			}
			pfsyncstats.pfsyncs_badval++;
			continue;
		}

		if (pfsync_state_import(sp, pkt->flags) == ENOMEM) {
			/* drop out, but process the rest of the actions */
			break;
		}
	}
	splx(s);

	return (len);
}

int
pfsync_in_iack(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct pfsync_ins_ack *ia, *iaa;
	struct pf_state_cmp id_key;
	struct pf_state *st;

	struct mbuf *mp;
	int len = count * sizeof(*ia);
	int offp, i;
	int s;

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	iaa = (struct pfsync_ins_ack *)(mp->m_data + offp);

	s = splsoftnet();
	for (i = 0; i < count; i++) {
		ia = &iaa[i];

		bcopy(&ia->id, &id_key.id, sizeof(id_key.id));
		id_key.creatorid = ia->creatorid;

		st = pf_find_state_byid(&id_key);
		if (st == NULL)
			continue;

		if (ISSET(st->state_flags, PFSTATE_ACK))
			pfsync_deferred(st, 0);
	}
	splx(s);

	return (len);
}

int
pfsync_upd_tcp(struct pf_state *st, struct pfsync_state_peer *src,
    struct pfsync_state_peer *dst)
{
	int sync = 0;

	/*
	 * The state should never go backwards except
	 * for syn-proxy states.  Neither should the
	 * sequence window slide backwards.
	 */
	if ((st->src.state > src->state &&
	    (st->src.state < PF_TCPS_PROXY_SRC ||
	    src->state >= PF_TCPS_PROXY_SRC)) ||

	    (st->src.state == src->state &&
	    SEQ_GT(st->src.seqlo, ntohl(src->seqlo))))
		sync++;
	else
		pf_state_peer_ntoh(src, &st->src);

	if ((st->dst.state > dst->state) ||

	    (st->dst.state >= TCPS_SYN_SENT &&
	    SEQ_GT(st->dst.seqlo, ntohl(dst->seqlo))))
		sync++;
	else
		pf_state_peer_ntoh(dst, &st->dst);

	return (sync);
}

int
pfsync_in_upd(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct pfsync_state *sa, *sp;
	struct pf_state_cmp id_key;
	struct pf_state *st;
	int sync;

	struct mbuf *mp;
	int len = count * sizeof(*sp);
	int offp, i;
	int s;

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	sa = (struct pfsync_state *)(mp->m_data + offp);

	s = splsoftnet();
	for (i = 0; i < count; i++) {
		sp = &sa[i];

		/* check for invalid values */
		if (sp->timeout >= PFTM_MAX ||
		    sp->src.state > PF_TCPS_PROXY_DST ||
		    sp->dst.state > PF_TCPS_PROXY_DST) {
			if (pf_status.debug >= PF_DEBUG_MISC) {
				printf("pfsync_input: PFSYNC_ACT_UPD: "
				    "invalid value\n");
			}
			pfsyncstats.pfsyncs_badval++;
			continue;
		}

		bcopy(sp->id, &id_key.id, sizeof(id_key.id));
		id_key.creatorid = sp->creatorid;

		st = pf_find_state_byid(&id_key);
		if (st == NULL) {
			/* insert the update */
			if (pfsync_state_import(sp, 0))
				pfsyncstats.pfsyncs_badstate++;
			continue;
		}

		if (ISSET(st->state_flags, PFSTATE_ACK))
			pfsync_deferred(st, 1);

		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
			sync = pfsync_upd_tcp(st, &sp->src, &sp->dst);
		else {
			sync = 0;

			/*
			 * Non-TCP protocol state machines always go
			 * forwards
			 */
			if (st->src.state > sp->src.state)
				sync++;
			else
				pf_state_peer_ntoh(&sp->src, &st->src);

			if (st->dst.state > sp->dst.state)
				sync++;
			else
				pf_state_peer_ntoh(&sp->dst, &st->dst);
		}

		if (sync < 2) {
			pfsync_alloc_scrub_memory(&sp->dst, &st->dst);
			pf_state_peer_ntoh(&sp->dst, &st->dst);
			st->expire = ntohl(sp->expire) + time_second;
			st->timeout = sp->timeout;
		}
		st->pfsync_time = time_uptime;

		if (sync) {
			pfsyncstats.pfsyncs_stale++;

			pfsync_update_state(st);
			schednetisr(NETISR_PFSYNC);
		}
	}
	splx(s);

	return (len);
}

int
pfsync_in_upd_c(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct pfsync_upd_c *ua, *up;
	struct pf_state_cmp id_key;
	struct pf_state *st;

	int len = count * sizeof(*up);
	int sync;

	struct mbuf *mp;
	int offp, i;
	int s;

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	ua = (struct pfsync_upd_c *)(mp->m_data + offp);

	s = splsoftnet();
	for (i = 0; i < count; i++) {
		up = &ua[i];

		/* check for invalid values */
		if (up->timeout >= PFTM_MAX ||
		    up->src.state > PF_TCPS_PROXY_DST ||
		    up->dst.state > PF_TCPS_PROXY_DST) {
			if (pf_status.debug >= PF_DEBUG_MISC) {
				printf("pfsync_input: "
				    "PFSYNC_ACT_UPD_C: "
				    "invalid value\n");
			}
			pfsyncstats.pfsyncs_badval++;
			continue;
		}

		bcopy(&up->id, &id_key.id, sizeof(id_key.id));
		id_key.creatorid = up->creatorid;

		st = pf_find_state_byid(&id_key);
		if (st == NULL) {
			/* We don't have this state. Ask for it. */
			pfsync_request_update(id_key.creatorid, id_key.id);
			continue;
		}

		if (ISSET(st->state_flags, PFSTATE_ACK))
			pfsync_deferred(st, 1);

		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
			sync = pfsync_upd_tcp(st, &up->src, &up->dst);
		else {
			sync = 0;
			/*
			 * Non-TCP protocol state machines always go
			 * forwards
			 */
			if (st->src.state > up->src.state)
				sync++;
			else
				pf_state_peer_ntoh(&up->src, &st->src);

			if (st->dst.state > up->dst.state)
				sync++;
			else
				pf_state_peer_ntoh(&up->dst, &st->dst);
		}
		if (sync < 2) {
			pfsync_alloc_scrub_memory(&up->dst, &st->dst);
			pf_state_peer_ntoh(&up->dst, &st->dst);
			st->expire = ntohl(up->expire) + time_second;
			st->timeout = up->timeout;
		}
		st->pfsync_time = time_uptime;

		if (sync) {
			pfsyncstats.pfsyncs_stale++;

			pfsync_update_state(st);
			schednetisr(NETISR_PFSYNC);
		}
	}
	splx(s);

	return (len);
}

int
pfsync_in_ureq(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct pfsync_upd_req *ur, *ura;
	struct mbuf *mp;
	int len = count * sizeof(*ur);
	int i, offp;

	struct pf_state_cmp id_key;
	struct pf_state *st;

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	ura = (struct pfsync_upd_req *)(mp->m_data + offp);

	for (i = 0; i < count; i++) {
		ur = &ura[i];

		bcopy(&ur->id, &id_key.id, sizeof(id_key.id));
		id_key.creatorid = ur->creatorid;

		if (id_key.id == 0 && id_key.creatorid == 0)
			pfsync_bulk_start();
		else {
			st = pf_find_state_byid(&id_key);
			if (st == NULL) {
				pfsyncstats.pfsyncs_badstate++;
				continue;
			}
			if (ISSET(st->state_flags, PFSTATE_NOSYNC))
				continue;

			pfsync_update_state_req(st);
		}
	}

	return (len);
}

int
pfsync_in_del(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct mbuf *mp;
	struct pfsync_state *sa, *sp;
	struct pf_state_cmp id_key;
	struct pf_state *st;
	int len = count * sizeof(*sp);
	int offp, i;
	int s;

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	sa = (struct pfsync_state *)(mp->m_data + offp);

	s = splsoftnet();
	for (i = 0; i < count; i++) {
		sp = &sa[i];

		bcopy(sp->id, &id_key.id, sizeof(id_key.id));
		id_key.creatorid = sp->creatorid;

		st = pf_find_state_byid(&id_key);
		if (st == NULL) {
			pfsyncstats.pfsyncs_badstate++;
			continue;
		}
		SET(st->state_flags, PFSTATE_NOSYNC);
		pf_unlink_state(st);
	}
	splx(s);

	return (len);
}

int
pfsync_in_del_c(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct mbuf *mp;
	struct pfsync_del_c *sa, *sp;
	struct pf_state_cmp id_key;
	struct pf_state *st;
	int len = count * sizeof(*sp);
	int offp, i;
	int s;

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	sa = (struct pfsync_del_c *)(mp->m_data + offp);

	s = splsoftnet();
	for (i = 0; i < count; i++) {
		sp = &sa[i];

		bcopy(&sp->id, &id_key.id, sizeof(id_key.id));
		id_key.creatorid = sp->creatorid;

		st = pf_find_state_byid(&id_key);
		if (st == NULL) {
			pfsyncstats.pfsyncs_badstate++;
			continue;
		}

		SET(st->state_flags, PFSTATE_NOSYNC);
		pf_unlink_state(st);
	}
	splx(s);

	return (len);
}

int
pfsync_in_bus(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct pfsync_softc *sc = pfsyncif;
	struct pfsync_bus *bus;
	struct mbuf *mp;
	int len = count * sizeof(*bus);
	int offp;

	/* If we're not waiting for a bulk update, who cares. */
	if (sc->sc_ureq_sent == 0)
		return (len);

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	bus = (struct pfsync_bus *)(mp->m_data + offp);

	switch (bus->status) {
	case PFSYNC_BUS_START:
		timeout_add(&sc->sc_bulkfail_tmo, 4 * hz +
		    pf_pool_limits[PF_LIMIT_STATES].limit /
		    ((sc->sc_if.if_mtu - PFSYNC_MINPKT) /
		    sizeof(struct pfsync_state)));
		if (pf_status.debug >= PF_DEBUG_MISC)
			printf("pfsync: received bulk update start\n");
		break;

	case PFSYNC_BUS_END:
		if (time_uptime - ntohl(bus->endtime) >=
		    sc->sc_ureq_sent) {
			/* that's it, we're happy */
			sc->sc_ureq_sent = 0;
			sc->sc_bulk_tries = 0;
			timeout_del(&sc->sc_bulkfail_tmo);
#if NCARP > 0
			if (!pfsync_sync_ok)
				carp_group_demote_adj(&sc->sc_if, -1);
#endif
			pfsync_sync_ok = 1;
			if (pf_status.debug >= PF_DEBUG_MISC)
				printf("pfsync: received valid "
				    "bulk update end\n");
		} else {
			if (pf_status.debug >= PF_DEBUG_MISC)
				printf("pfsync: received invalid "
				    "bulk update end: bad timestamp\n");
		}
		break;
	}

	return (len);
}

int
pfsync_in_tdb(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	int len = count * sizeof(struct pfsync_tdb);

#if defined(IPSEC)
	struct pfsync_tdb *tp;
	struct mbuf *mp;
	int offp;
	int i;
	int s;

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	tp = (struct pfsync_tdb *)(mp->m_data + offp);

	s = splsoftnet();
	for (i = 0; i < count; i++)
		pfsync_update_net_tdb(&tp[i]);
	splx(s);
#endif

	return (len);
}

#if defined(IPSEC)
/* Update an in-kernel tdb. Silently fail if no tdb is found. */
void
pfsync_update_net_tdb(struct pfsync_tdb *pt)
{
	struct tdb		*tdb;
	int			 s;

	/* check for invalid values */
	if (ntohl(pt->spi) <= SPI_RESERVED_MAX ||
	    (pt->dst.sa.sa_family != AF_INET &&
	     pt->dst.sa.sa_family != AF_INET6))
		goto bad;

	s = spltdb();
	tdb = gettdb(pt->spi, &pt->dst, pt->sproto);
	if (tdb) {
		pt->rpl = ntohl(pt->rpl);
		pt->cur_bytes = betoh64(pt->cur_bytes);

		/* Neither replay nor byte counter should ever decrease. */
		if (pt->rpl < tdb->tdb_rpl ||
		    pt->cur_bytes < tdb->tdb_cur_bytes) {
			splx(s);
			goto bad;
		}

		tdb->tdb_rpl = pt->rpl;
		tdb->tdb_cur_bytes = pt->cur_bytes;
	}
	splx(s);
	return;

 bad:
	if (pf_status.debug >= PF_DEBUG_MISC)
		printf("pfsync_update_net_tdb: PFSYNC_ACT_TDB: "
		    "invalid value\n");
	pfsyncstats.pfsyncs_badstate++;
	return;
}
#endif

int
pfsync_in_eof(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	/* check if we are at the right place in the packet */
	if (offset != m->m_pkthdr.len)
		pfsyncstats.pfsyncs_badlen++;

	/* we're done. free and let the caller return */
	m_freem(m);
	return (-1);
}

int
pfsync_in_error(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	pfsyncstats.pfsyncs_badact++;

	m_freem(m);
	return (-1);
}

int
pfsyncoutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
	struct rtentry *rt)
{
	m_freem(m);
	return (0);
}

/* ARGSUSED */
int
pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
	struct proc *p = curproc;
	struct pfsync_softc *sc = ifp->if_softc;
	struct ifreq *ifr = (struct ifreq *)data;
	struct ip_moptions *imo = &sc->sc_imo;
	struct pfsyncreq pfsyncr;
	struct ifnet    *sifp;
	struct ip *ip;
	int s, error;

	switch (cmd) {
#if 0
	case SIOCSIFADDR:
	case SIOCAIFADDR:
	case SIOCSIFDSTADDR:
#endif
	case SIOCSIFFLAGS:
		s = splnet();
		if (ifp->if_flags & IFF_UP)
			ifp->if_flags |= IFF_RUNNING;
		else {
			ifp->if_flags &= ~IFF_RUNNING;

			/* drop everything */
			timeout_del(&sc->sc_tmo);
			pfsync_drop(sc);

			/* cancel bulk update */
			timeout_del(&sc->sc_bulk_tmo);
			sc->sc_bulk_next = NULL;
			sc->sc_bulk_last = NULL;
		}
		splx(s);
		break;
	case SIOCSIFMTU:
		s = splnet();
		if (ifr->ifr_mtu <= PFSYNC_MINPKT)
			return (EINVAL);
		if (ifr->ifr_mtu > MCLBYTES) /* XXX could be bigger */
			ifr->ifr_mtu = MCLBYTES;
		if (ifr->ifr_mtu < ifp->if_mtu)
			pfsync_sendout();
		ifp->if_mtu = ifr->ifr_mtu;
		splx(s);
		break;
	case SIOCGETPFSYNC:
		bzero(&pfsyncr, sizeof(pfsyncr));
		if (sc->sc_sync_if) {
			strlcpy(pfsyncr.pfsyncr_syncdev,
			    sc->sc_sync_if->if_xname, IFNAMSIZ);
		}
		pfsyncr.pfsyncr_syncpeer = sc->sc_sync_peer;
		pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates;
		pfsyncr.pfsyncr_defer = sc->sc_defer;
		return (copyout(&pfsyncr, ifr->ifr_data, sizeof(pfsyncr)));

	case SIOCSETPFSYNC:
		if ((error = suser(p, p->p_acflag)) != 0)
			return (error);
		if ((error = copyin(ifr->ifr_data, &pfsyncr, sizeof(pfsyncr))))
			return (error);

		s = splnet();

		if (pfsyncr.pfsyncr_syncpeer.s_addr == 0)
			sc->sc_sync_peer.s_addr = INADDR_PFSYNC_GROUP;
		else
			sc->sc_sync_peer.s_addr =
			    pfsyncr.pfsyncr_syncpeer.s_addr;

		if (pfsyncr.pfsyncr_maxupdates > 255) {
			splx(s);
			return (EINVAL);
		}
		sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates;

		sc->sc_defer = pfsyncr.pfsyncr_defer;

		if (pfsyncr.pfsyncr_syncdev[0] == 0) {
			sc->sc_sync_if = NULL;
			if (imo->imo_num_memberships > 0) {
				in_delmulti(imo->imo_membership[
				    --imo->imo_num_memberships]);
				imo->imo_multicast_ifp = NULL;
			}
			splx(s);
			break;
		}

		if ((sifp = ifunit(pfsyncr.pfsyncr_syncdev)) == NULL) {
			splx(s);
			return (EINVAL);
		}

		if (sifp->if_mtu < sc->sc_if.if_mtu ||
		    (sc->sc_sync_if != NULL &&
		    sifp->if_mtu < sc->sc_sync_if->if_mtu) ||
		    sifp->if_mtu < MCLBYTES - sizeof(struct ip))
			pfsync_sendout();
		sc->sc_sync_if = sifp;

		if (imo->imo_num_memberships > 0) {
			in_delmulti(imo->imo_membership[--imo->imo_num_memberships]);
			imo->imo_multicast_ifp = NULL;
		}

		if (sc->sc_sync_if &&
		    sc->sc_sync_peer.s_addr == INADDR_PFSYNC_GROUP) {
			struct in_addr addr;

			if (!(sc->sc_sync_if->if_flags & IFF_MULTICAST)) {
				sc->sc_sync_if = NULL;
				splx(s);
				return (EADDRNOTAVAIL);
			}

			addr.s_addr = INADDR_PFSYNC_GROUP;

			if ((imo->imo_membership[0] =
			    in_addmulti(&addr, sc->sc_sync_if)) == NULL) {
				sc->sc_sync_if = NULL;
				splx(s);
				return (ENOBUFS);
			}
			imo->imo_num_memberships++;
			imo->imo_multicast_ifp = sc->sc_sync_if;
			imo->imo_multicast_ttl = PFSYNC_DFLTTL;
			imo->imo_multicast_loop = 0;
		}

		ip = &sc->sc_template;
		bzero(ip, sizeof(*ip));
		ip->ip_v = IPVERSION;
		ip->ip_hl = sizeof(sc->sc_template) >> 2;
		ip->ip_tos = IPTOS_LOWDELAY;
		/* len and id are set later */
		ip->ip_off = htons(IP_DF);
		ip->ip_ttl = PFSYNC_DFLTTL;
		ip->ip_p = IPPROTO_PFSYNC;
		ip->ip_src.s_addr = INADDR_ANY;
		ip->ip_dst.s_addr = sc->sc_sync_peer.s_addr;

		if (sc->sc_sync_if) {
			/* Request a full state table update. */
			sc->sc_ureq_sent = time_uptime;
#if NCARP > 0
			if (pfsync_sync_ok)
				carp_group_demote_adj(&sc->sc_if, 1);
#endif
			pfsync_sync_ok = 0;
			if (pf_status.debug >= PF_DEBUG_MISC)
				printf("pfsync: requesting bulk update\n");
			timeout_add(&sc->sc_bulkfail_tmo, 4 * hz +
			    pf_pool_limits[PF_LIMIT_STATES].limit /
			    ((sc->sc_if.if_mtu - PFSYNC_MINPKT) /
			    sizeof(struct pfsync_state)));
			pfsync_request_update(0, 0);
		}
		splx(s);

		break;

	default:
		return (ENOTTY);
	}

	return (0);
}

void
pfsync_out_state(struct pf_state *st, void *buf)
{
	struct pfsync_state *sp = buf;

	pfsync_state_export(sp, st);
}

void
pfsync_out_iack(struct pf_state *st, void *buf)
{
	struct pfsync_ins_ack *iack = buf;

	iack->id = st->id;
	iack->creatorid = st->creatorid;
}

void
pfsync_out_upd_c(struct pf_state *st, void *buf)
{
	struct pfsync_upd_c *up = buf;

	up->id = st->id;
	pf_state_peer_hton(&st->src, &up->src);
	pf_state_peer_hton(&st->dst, &up->dst);
	up->creatorid = st->creatorid;

	up->expire = pf_state_expires(st);
	if (up->expire <= time_second)
		up->expire = htonl(0);
	else
		up->expire = htonl(up->expire - time_second);
	up->timeout = st->timeout;

	bzero(up->_pad, sizeof(up->_pad)); /* XXX */
}

void
pfsync_out_del(struct pf_state *st, void *buf)
{
	struct pfsync_del_c *dp = buf;

	dp->id = st->id;
	dp->creatorid = st->creatorid;

	SET(st->state_flags, PFSTATE_NOSYNC);
}

void
pfsync_drop(struct pfsync_softc *sc)
{
	struct pf_state *st;
	struct pfsync_upd_req_item *ur;
	struct tdb *t;
	int q;

	for (q = 0; q < PFSYNC_S_COUNT; q++) {
		if (TAILQ_EMPTY(&sc->sc_qs[q]))
			continue;

		TAILQ_FOREACH(st, &sc->sc_qs[q], sync_list) {
#ifdef PFSYNC_DEBUG
			KASSERT(st->sync_state == q);
#endif
			st->sync_state = PFSYNC_S_NONE;
		}
		TAILQ_INIT(&sc->sc_qs[q]);
	}

	while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
		TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);
		pool_put(&sc->sc_pool, ur);
	}

	sc->sc_plus = NULL;

	if (!TAILQ_EMPTY(&sc->sc_tdb_q)) {
		TAILQ_FOREACH(t, &sc->sc_tdb_q, tdb_sync_entry)
			CLR(t->tdb_flags, TDBF_PFSYNC);

		TAILQ_INIT(&sc->sc_tdb_q);
	}

	sc->sc_len = PFSYNC_MINPKT;
}

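/*
 * Serialise everything that has been queued into a single pfsync
 * packet and hand it to ip_output() (or only to bpf if no sync
 * interface is configured): the state queues first, then pending
 * update requests, any "plus" region, queued tdbs, and a closing
 * EOF subheader.
 */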
void
pfsync_sendout(void)
{
	struct pfsync_softc *sc = pfsyncif;
#if NBPFILTER > 0
	struct ifnet *ifp = &sc->sc_if;
#endif
	struct mbuf *m;
	struct ip *ip;
	struct pfsync_header *ph;
	struct pfsync_subheader *subh;
	struct pf_state *st;
	struct pfsync_upd_req_item *ur;
	struct tdb *t;

	int offset;
	int q, count = 0;

	if (sc == NULL || sc->sc_len == PFSYNC_MINPKT)
		return;

	if (!ISSET(sc->sc_if.if_flags, IFF_RUNNING) ||
#if NBPFILTER > 0
	    (ifp->if_bpf == NULL && sc->sc_sync_if == NULL)) {
#else
	    sc->sc_sync_if == NULL) {
#endif
		pfsync_drop(sc);
		return;
	}

	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m == NULL) {
		sc->sc_if.if_oerrors++;
		pfsyncstats.pfsyncs_onomem++;
		pfsync_drop(sc);
		return;
	}

	if (max_linkhdr + sc->sc_len > MHLEN) {
		MCLGETI(m, M_DONTWAIT, NULL, max_linkhdr + sc->sc_len);
		if (!ISSET(m->m_flags, M_EXT)) {
			m_free(m);
			sc->sc_if.if_oerrors++;
			pfsyncstats.pfsyncs_onomem++;
			pfsync_drop(sc);
			return;
		}
	}
	m->m_data += max_linkhdr;
	m->m_len = m->m_pkthdr.len = sc->sc_len;

	/* build the ip header */
	ip = (struct ip *)m->m_data;
	bcopy(&sc->sc_template, ip, sizeof(*ip));
	offset = sizeof(*ip);

	ip->ip_len = htons(m->m_pkthdr.len);
	ip->ip_id = htons(ip_randomid());

	/* build the pfsync header */
	ph = (struct pfsync_header *)(m->m_data + offset);
	bzero(ph, sizeof(*ph));
	offset += sizeof(*ph);

	ph->version = PFSYNC_VERSION;
	ph->len = htons(sc->sc_len - sizeof(*ip));
	bcopy(pf_status.pf_chksum, ph->pfcksum, PF_MD5_DIGEST_LENGTH);

	/* walk the queues */
	for (q = 0; q < PFSYNC_S_COUNT; q++) {
		if (TAILQ_EMPTY(&sc->sc_qs[q]))
			continue;

		subh = (struct pfsync_subheader *)(m->m_data + offset);
		offset += sizeof(*subh);

		count = 0;
		TAILQ_FOREACH(st, &sc->sc_qs[q], sync_list) {
#ifdef PFSYNC_DEBUG
			KASSERT(st->sync_state == q);
#endif
			pfsync_qs[q].write(st, m->m_data + offset);
			offset += pfsync_qs[q].len;

			st->sync_state = PFSYNC_S_NONE;
			count++;
		}
		TAILQ_INIT(&sc->sc_qs[q]);

		bzero(subh, sizeof(*subh));
		subh->action = pfsync_qs[q].action;
		subh->count = htons(count);
	}

	if (!TAILQ_EMPTY(&sc->sc_upd_req_list)) {
		subh = (struct pfsync_subheader *)(m->m_data + offset);
		offset += sizeof(*subh);

		count = 0;
		while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
			TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);

			bcopy(&ur->ur_msg, m->m_data + offset,
			    sizeof(ur->ur_msg));
			offset += sizeof(ur->ur_msg);

			pool_put(&sc->sc_pool, ur);

			count++;
		}

		bzero(subh, sizeof(*subh));
		subh->action = PFSYNC_ACT_UPD_REQ;
		subh->count = htons(count);
	}

	/* has someone built a custom region for us to add? */
	if (sc->sc_plus != NULL) {
		bcopy(sc->sc_plus, m->m_data + offset, sc->sc_pluslen);
		offset += sc->sc_pluslen;

		sc->sc_plus = NULL;
	}

	if (!TAILQ_EMPTY(&sc->sc_tdb_q)) {
		subh = (struct pfsync_subheader *)(m->m_data + offset);
		offset += sizeof(*subh);

		count = 0;
		TAILQ_FOREACH(t, &sc->sc_tdb_q, tdb_sync_entry) {
			pfsync_out_tdb(t, m->m_data + offset);
			offset += sizeof(struct pfsync_tdb);
			CLR(t->tdb_flags, TDBF_PFSYNC);

			count++;
		}
		TAILQ_INIT(&sc->sc_tdb_q);

		bzero(subh, sizeof(*subh));
		subh->action = PFSYNC_ACT_TDB;
		subh->count = htons(count);
	}

	subh = (struct pfsync_subheader *)(m->m_data + offset);
	offset += sizeof(*subh);

	bzero(subh, sizeof(*subh));
	subh->action = PFSYNC_ACT_EOF;
	subh->count = htons(1);

	/* we're done, let's put it on the wire */
#if NBPFILTER > 0
	if (ifp->if_bpf) {
		m->m_data += sizeof(*ip);
		m->m_len = m->m_pkthdr.len = sc->sc_len - sizeof(*ip);
		bpf_mtap(ifp->if_bpf, m, BPF_DIRECTION_OUT);
		m->m_data -= sizeof(*ip);
		m->m_len = m->m_pkthdr.len = sc->sc_len;
	}

	if (sc->sc_sync_if == NULL) {
		sc->sc_len = PFSYNC_MINPKT;
		m_freem(m);
		return;
	}
#endif

	sc->sc_if.if_opackets++;
	sc->sc_if.if_obytes += m->m_pkthdr.len;

	if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo, NULL) == 0)
		pfsyncstats.pfsyncs_opackets++;
	else
		pfsyncstats.pfsyncs_oerrors++;

	/* start again */
	sc->sc_len = PFSYNC_MINPKT;
}

void
pfsync_insert_state(struct pf_state *st)
{
	struct pfsync_softc *sc = pfsyncif;

	splsoftassert(IPL_SOFTNET);

	if (ISSET(st->rule.ptr->rule_flag, PFRULE_NOSYNC) ||
	    st->key[PF_SK_WIRE]->proto == IPPROTO_PFSYNC) {
		SET(st->state_flags, PFSTATE_NOSYNC);
		return;
	}

	if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING) ||
	    ISSET(st->state_flags, PFSTATE_NOSYNC))
		return;

#ifdef PFSYNC_DEBUG
	KASSERT(st->sync_state == PFSYNC_S_NONE);
#endif

	if (sc->sc_len == PFSYNC_MINPKT)
		timeout_add_sec(&sc->sc_tmo, 1);

	pfsync_q_ins(st, PFSYNC_S_INS);

	st->sync_updates = 0;
}

int defer = 10;

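/*
 * Defer transmission of the state-creating packet for up to "defer"
 * ticks, or until the peer acknowledges the insert.  Returns 1 if the
 * caller must not transmit the mbuf itself.
 */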
int
pfsync_defer(struct pf_state *st, struct mbuf *m)
{
	struct pfsync_softc *sc = pfsyncif;
	struct pfsync_deferral *pd;

	splsoftassert(IPL_SOFTNET);

	if (!sc->sc_defer)
		return (0);

	if (sc->sc_deferred >= 128)
		pfsync_undefer(TAILQ_FIRST(&sc->sc_deferrals), 0);

	pd = pool_get(&sc->sc_pool, PR_NOWAIT);
	if (pd == NULL)
		return (0);
	sc->sc_deferred++;

	m->m_pkthdr.pf.flags |= PF_TAG_GENERATED;
	SET(st->state_flags, PFSTATE_ACK);

	pd->pd_st = st;
	pd->pd_m = m;

	TAILQ_INSERT_TAIL(&sc->sc_deferrals, pd, pd_entry);
	timeout_set(&pd->pd_tmo, pfsync_defer_tmo, pd);
	timeout_add(&pd->pd_tmo, defer);

	schednetisr(NETISR_PFSYNC);

	return (1);
}

void
pfsync_undefer(struct pfsync_deferral *pd, int drop)
{
	struct pfsync_softc *sc = pfsyncif;

	splsoftassert(IPL_SOFTNET);

	TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
	sc->sc_deferred--;

	CLR(pd->pd_st->state_flags, PFSTATE_ACK);
	timeout_del(&pd->pd_tmo); /* bah */
	if (drop)
		m_freem(pd->pd_m);
	else {
		ip_output(pd->pd_m, (void *)NULL, (void *)NULL, 0,
		    (void *)NULL, (void *)NULL);
	}

	pool_put(&sc->sc_pool, pd);
}

void
pfsync_defer_tmo(void *arg)
{
	int s;

	s = splsoftnet();
	pfsync_undefer(arg, 0);
	splx(s);
}

void
pfsync_deferred(struct pf_state *st, int drop)
{
	struct pfsync_softc *sc = pfsyncif;
	struct pfsync_deferral *pd;

	TAILQ_FOREACH(pd, &sc->sc_deferrals, pd_entry) {
		if (pd->pd_st == st) {
			pfsync_undefer(pd, drop);
			return;
		}
	}

	panic("pfsync_deferred: unable to find deferred state");
}

u_int pfsync_upds = 0;

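/*
 * A state has changed; queue it for the peer.  TCP updates to a state
 * that is already queued only bump a counter, and a send is scheduled
 * once sc_maxupdates is reached or the state has seen recent pfsync
 * activity.
 */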
void
pfsync_update_state(struct pf_state *st)
{
	struct pfsync_softc *sc = pfsyncif;
	int sync = 0;

	splsoftassert(IPL_SOFTNET);

	if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING))
		return;

	if (ISSET(st->state_flags, PFSTATE_ACK))
		pfsync_deferred(st, 0);
	if (ISSET(st->state_flags, PFSTATE_NOSYNC)) {
		if (st->sync_state != PFSYNC_S_NONE)
			pfsync_q_del(st);
		return;
	}

	if (sc->sc_len == PFSYNC_MINPKT)
		timeout_add_sec(&sc->sc_tmo, 1);

	switch (st->sync_state) {
	case PFSYNC_S_UPD_C:
	case PFSYNC_S_UPD:
	case PFSYNC_S_INS:
		/* we're already handling it */

		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) {
			st->sync_updates++;
			if (st->sync_updates >= sc->sc_maxupdates)
				sync = 1;
		}
		break;

	case PFSYNC_S_IACK:
		pfsync_q_del(st);
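		/* FALLTHROUGH */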
	case PFSYNC_S_NONE:
		pfsync_q_ins(st, PFSYNC_S_UPD_C);
		st->sync_updates = 0;
		break;

	default:
		panic("pfsync_update_state: unexpected sync state %d",
		    st->sync_state);
	}

	if (sync || (time_uptime - st->pfsync_time) < 2) {
		pfsync_upds++;
		schednetisr(NETISR_PFSYNC);
	}
}

void
pfsync_request_update(u_int32_t creatorid, u_int64_t id)
{
	struct pfsync_softc *sc = pfsyncif;
	struct pfsync_upd_req_item *item;
	size_t nlen = sizeof(struct pfsync_upd_req);

	/*
	 * this code does nothing to prevent multiple update requests for the
	 * same state being generated.
	 */

	item = pool_get(&sc->sc_pool, PR_NOWAIT);
	if (item == NULL) {
		/* XXX stats */
		return;
	}

	item->ur_msg.id = id;
	item->ur_msg.creatorid = creatorid;

	if (TAILQ_EMPTY(&sc->sc_upd_req_list))
		nlen += sizeof(struct pfsync_subheader);

	if (sc->sc_len + nlen > sc->sc_if.if_mtu) {
		pfsync_sendout();

		nlen = sizeof(struct pfsync_subheader) +
		    sizeof(struct pfsync_upd_req);
	}

	TAILQ_INSERT_TAIL(&sc->sc_upd_req_list, item, ur_entry);
	sc->sc_len += nlen;

	schednetisr(NETISR_PFSYNC);
}

void
pfsync_update_state_req(struct pf_state *st)
{
	struct pfsync_softc *sc = pfsyncif;

	if (sc == NULL)
		panic("pfsync_update_state_req: nonexistent instance");

	if (ISSET(st->state_flags, PFSTATE_NOSYNC)) {
		if (st->sync_state != PFSYNC_S_NONE)
			pfsync_q_del(st);
		return;
	}

	switch (st->sync_state) {
	case PFSYNC_S_UPD_C:
	case PFSYNC_S_IACK:
		pfsync_q_del(st);
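		/* FALLTHROUGH */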
1989 	case PFSYNC_S_NONE:
1990 		pfsync_q_ins(st, PFSYNC_S_UPD);
1991 		schednetisr(NETISR_PFSYNC);
1992 		return;
1993 
1994 	case PFSYNC_S_INS:
1995 	case PFSYNC_S_UPD:
1996 	case PFSYNC_S_DEL:
1997 		/* we're already handling it */
1998 		return;
1999 
2000 	default:
2001 		panic("pfsync_update_state_req: unexpected sync state %d",
2002 		    st->sync_state);
2003 	}
2004 }
2005 
2006 void
2007 pfsync_delete_state(struct pf_state *st)
2008 {
2009 	struct pfsync_softc *sc = pfsyncif;
2010 
2011 	splsoftassert(IPL_SOFTNET);
2012 
2013 	if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING))
2014 		return;
2015 
2016 	if (ISSET(st->state_flags, PFSTATE_ACK))
2017 		pfsync_deferred(st, 1);
2018 	if (ISSET(st->state_flags, PFSTATE_NOSYNC)) {
2019 		if (st->sync_state != PFSYNC_S_NONE)
2020 			pfsync_q_del(st);
2021 		return;
2022 	}
2023 
2024 	if (sc->sc_len == PFSYNC_MINPKT)
2025 		timeout_add_sec(&sc->sc_tmo, 1);
2026 
2027 	switch (st->sync_state) {
2028 	case PFSYNC_S_INS:
2029 		/* we never got to tell the world so just forget about it */
2030 		pfsync_q_del(st);
2031 		return;
2032 
2033 	case PFSYNC_S_UPD_C:
2034 	case PFSYNC_S_UPD:
2035 	case PFSYNC_S_IACK:
2036 		pfsync_q_del(st);
2037 		/* FALLTHROUGH to putting it on the del list */
2038 
2039 	case PFSYNC_S_NONE:
2040 		pfsync_q_ins(st, PFSYNC_S_DEL);
2041 		return;
2042 
2043 	default:
2044 		panic("pfsync_delete_state: unexpected sync state %d",
2045 		    st->sync_state);
2046 	}
2047 }
2048 
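/*
 * States matching creatorid/ifname have been flushed locally; tell
 * our peers so they can do the same.
 */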
2049 void
2050 pfsync_clear_states(u_int32_t creatorid, const char *ifname)
2051 {
2052 	struct pfsync_softc *sc = pfsyncif;
2053 	struct {
2054 		struct pfsync_subheader subh;
2055 		struct pfsync_clr clr;
2056 	} __packed r;
2057 
2058 	splsoftassert(IPL_SOFTNET);
2059 
2060 	if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING))
2061 		return;
2062 
2063 	bzero(&r, sizeof(r));
2064 
2065 	r.subh.action = PFSYNC_ACT_CLR;
2066 	r.subh.count = htons(1);
2067 
2068 	strlcpy(r.clr.ifname, ifname, sizeof(r.clr.ifname));
2069 	r.clr.creatorid = creatorid;
2070 
2071 	pfsync_send_plus(&r, sizeof(r));
2072 }
2073 
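/*
 * Put a state on queue q and grow sc_len by the record size, plus a
 * subheader if the queue was empty. If the record will not fit under
 * the MTU, the pending packet is sent first.
 */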
2074 void
2075 pfsync_q_ins(struct pf_state *st, int q)
2076 {
2077 	struct pfsync_softc *sc = pfsyncif;
2078 	size_t nlen = pfsync_qs[q].len;
2079 
2080 	KASSERT(st->sync_state == PFSYNC_S_NONE);
2081 
2082 #if 1 || defined(PFSYNC_DEBUG)
2083 	if (sc->sc_len < PFSYNC_MINPKT)
2084 		panic("pfsync pkt len is too low %zu", sc->sc_len);
2085 #endif
2086 	if (TAILQ_EMPTY(&sc->sc_qs[q]))
2087 		nlen += sizeof(struct pfsync_subheader);
2088 
2089 	if (sc->sc_len + nlen > sc->sc_if.if_mtu) {
2090 		pfsync_sendout();
2091 
2092 		nlen = sizeof(struct pfsync_subheader) + pfsync_qs[q].len;
2093 	}
2094 
2095 	sc->sc_len += nlen;
2096 	TAILQ_INSERT_TAIL(&sc->sc_qs[q], st, sync_list);
2097 	st->sync_state = q;
2098 }
2099 
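/*
 * Take a state off its queue, shrinking sc_len by the record size
 * and, once the queue goes empty, by the subheader as well.
 */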
2100 void
2101 pfsync_q_del(struct pf_state *st)
2102 {
2103 	struct pfsync_softc *sc = pfsyncif;
2104 	int q = st->sync_state;
2105 
2106 	KASSERT(st->sync_state != PFSYNC_S_NONE);
2107 
2108 	sc->sc_len -= pfsync_qs[q].len;
2109 	TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
2110 	st->sync_state = PFSYNC_S_NONE;
2111 
2112 	if (TAILQ_EMPTY(&sc->sc_qs[q]))
2113 		sc->sc_len -= sizeof(struct pfsync_subheader);
2114 }
2115 
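/*
 * Note that an IPsec SA's replay state has changed. The first update
 * queues the tdb for the next packet; later ones just bump a counter
 * and force a transmission once sc_maxupdates is reached.
 */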
2116 void
2117 pfsync_update_tdb(struct tdb *t, int output)
2118 {
2119 	struct pfsync_softc *sc = pfsyncif;
2120 	size_t nlen = sizeof(struct pfsync_tdb);
2121 
2122 	if (sc == NULL)
2123 		return;
2124 
2125 	if (!ISSET(t->tdb_flags, TDBF_PFSYNC)) {
2126 		if (TAILQ_EMPTY(&sc->sc_tdb_q))
2127 			nlen += sizeof(struct pfsync_subheader);
2128 
2129 		if (sc->sc_len + nlen > sc->sc_if.if_mtu) {
2130 			pfsync_sendout();
2131 
2132 			nlen = sizeof(struct pfsync_subheader) +
2133 			    sizeof(struct pfsync_tdb);
2134 		}
2135 
2136 		sc->sc_len += nlen;
2137 		TAILQ_INSERT_TAIL(&sc->sc_tdb_q, t, tdb_sync_entry);
2138 		SET(t->tdb_flags, TDBF_PFSYNC);
2139 		t->tdb_updates = 0;
2140 	} else {
2141 		if (++t->tdb_updates >= sc->sc_maxupdates)
2142 			schednetisr(NETISR_PFSYNC);
2143 	}
2144 
2145 	if (output)
2146 		SET(t->tdb_flags, TDBF_PFSYNC_RPL);
2147 	else
2148 		CLR(t->tdb_flags, TDBF_PFSYNC_RPL);
2149 }
2150 
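/*
 * The SA is going away; drop the tdb from the pending queue and fix
 * up the length accounting.
 */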
2151 void
2152 pfsync_delete_tdb(struct tdb *t)
2153 {
2154 	struct pfsync_softc *sc = pfsyncif;
2155 
2156 	if (sc == NULL || !ISSET(t->tdb_flags, TDBF_PFSYNC))
2157 		return;
2158 
2159 	sc->sc_len -= sizeof(struct pfsync_tdb);
2160 	TAILQ_REMOVE(&sc->sc_tdb_q, t, tdb_sync_entry);
2161 	CLR(t->tdb_flags, TDBF_PFSYNC);
2162 
2163 	if (TAILQ_EMPTY(&sc->sc_tdb_q))
2164 		sc->sc_len -= sizeof(struct pfsync_subheader);
2165 }
2166 
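/*
 * Serialise a tdb's replay state into the wire format at buf.
 */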
2167 void
2168 pfsync_out_tdb(struct tdb *t, void *buf)
2169 {
2170 	struct pfsync_tdb *ut = buf;
2171 
2172 	bzero(ut, sizeof(*ut));
2173 	ut->spi = t->tdb_spi;
2174 	bcopy(&t->tdb_dst, &ut->dst, sizeof(ut->dst));
2175 	/*
2176 	 * When a failover happens, the master's rpl is probably above
2177 	 * what we see here (we may be up to a second late), so
2178 	 * increase it a bit for outbound tdbs to manage most such
2179 	 * situations.
2180 	 *
2181 	 * For now, just add an offset that is likely to be larger
2182 	 * than the number of packets we can see in one second. The RFC
2183 	 * just says the next packet must have a higher seq value.
2184 	 *
2185 	 * XXX What is a good algorithm for this? We could use
2186 	 * a rate-determined increase, but to know it, we would have
2187 	 * to extend struct tdb.
2188 	 * XXX ut->rpl can wrap over MAXINT, but if so the real tdb
2189 	 * will soon be replaced anyway. For now, just don't handle
2190 	 * this edge case.
2191 	 */
2192 #define RPL_INCR 16384
2193 	ut->rpl = htonl(t->tdb_rpl + (ISSET(t->tdb_flags, TDBF_PFSYNC_RPL) ?
2194 	    RPL_INCR : 0));
2195 	ut->cur_bytes = htobe64(t->tdb_cur_bytes);
2196 	ut->sproto = t->tdb_sproto;
2197 }
2198 
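/*
 * A peer asked us for a bulk update: record where the walk of the
 * state list begins (and must end) and fire the first run of
 * pfsync_bulk_update().
 */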
2199 void
2200 pfsync_bulk_start(void)
2201 {
2202 	struct pfsync_softc *sc = pfsyncif;
2203 
2204 	sc->sc_ureq_received = time_uptime;
2205 
2206 	if (sc->sc_bulk_next == NULL)
2207 		sc->sc_bulk_next = TAILQ_FIRST(&state_list);
2208 	sc->sc_bulk_last = sc->sc_bulk_next;
2209 
2210 	if (pf_status.debug >= PF_DEBUG_MISC)
2211 		printf("pfsync: received bulk update request\n");
2212 
2213 	pfsync_bulk_status(PFSYNC_BUS_START);
2214 	timeout_add(&sc->sc_bulk_tmo, 0);
2215 }
2216 
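/*
 * Walk the global state list answering a bulk update request. Live
 * states that are not already queued and were last synced before the
 * request arrived are queued as full updates. Once enough have been
 * queued to flush a packet, the current position is saved and the
 * walk reschedules itself; it finishes when it comes full circle to
 * where it started, then announces the end of the bulk update.
 */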
2217 void
2218 pfsync_bulk_update(void *arg)
2219 {
2220 	struct pfsync_softc *sc = arg;
2221 	struct pf_state *st;
2222 	int i = 0;
2223 	int s;
2224 
2225 	s = splsoftnet();
2226 
2227 	st = sc->sc_bulk_next;
2228 
2229 	while (st != sc->sc_bulk_last) {
2230 		if (st->sync_state == PFSYNC_S_NONE &&
2231 		    st->timeout < PFTM_MAX &&
2232 		    st->pfsync_time <= sc->sc_ureq_received) {
2233 			pfsync_update_state_req(st);
2234 			i++;
2235 		}
2236 
2237 		st = TAILQ_NEXT(st, entry_list);
2238 		if (st == NULL)
2239 			st = TAILQ_FIRST(&state_list);
2240 
2241 		if (i > 0 && TAILQ_EMPTY(&sc->sc_qs[PFSYNC_S_UPD])) {
2242 			sc->sc_bulk_next = st;
2243 			timeout_add(&sc->sc_bulk_tmo, 1);
2244 			goto out;
2245 		}
2246 	}
2247 
2248 	/* we're done */
2249 	sc->sc_bulk_next = NULL;
2250 	sc->sc_bulk_last = NULL;
2251 	pfsync_bulk_status(PFSYNC_BUS_END);
2252 
2253 out:
2254 	splx(s);
2255 }
2256 
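/*
 * Tell the peer how a bulk update is progressing (started or done).
 */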
2257 void
2258 pfsync_bulk_status(u_int8_t status)
2259 {
2260 	struct {
2261 		struct pfsync_subheader subh;
2262 		struct pfsync_bus bus;
2263 	} __packed r;
2264 
2265 	struct pfsync_softc *sc = pfsyncif;
2266 
2267 	bzero(&r, sizeof(r));
2268 
2269 	r.subh.action = PFSYNC_ACT_BUS;
2270 	r.subh.count = htons(1);
2271 
2272 	r.bus.creatorid = pf_status.hostid;
2273 	r.bus.endtime = htonl(time_uptime - sc->sc_ureq_received);
2274 	r.bus.status = status;
2275 
2276 	pfsync_send_plus(&r, sizeof(r));
2277 }
2278 
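/*
 * Our own bulk update request went unanswered. Ask again, up to
 * PFSYNC_MAX_BULKTRIES times; after that give up, lift the carp
 * demotion and carry on as if the transfer had completed.
 */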
2279 void
2280 pfsync_bulk_fail(void *arg)
2281 {
2282 	struct pfsync_softc *sc = arg;
2283 
2284 	if (sc->sc_bulk_tries++ < PFSYNC_MAX_BULKTRIES) {
2285 		/* Try again */
2286 		timeout_add_sec(&sc->sc_bulkfail_tmo, 5);
2287 		pfsync_request_update(0, 0);
2288 	} else {
2289 		/* Pretend the transfer was ok */
2290 		sc->sc_ureq_sent = 0;
2291 		sc->sc_bulk_tries = 0;
2292 #if NCARP > 0
2293 		if (!pfsync_sync_ok)
2294 			carp_group_demote_adj(&sc->sc_if, -1);
2295 #endif
2296 		pfsync_sync_ok = 1;
2297 		if (pf_status.debug >= PF_DEBUG_MISC)
2298 			printf("pfsync: failed to receive bulk update\n");
2299 	}
2300 }
2301 
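/*
 * Tack a preformatted message (currently a clear or a bulk status)
 * onto the end of the pending packet and transmit immediately.
 */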
2302 void
2303 pfsync_send_plus(void *plus, size_t pluslen)
2304 {
2305 	struct pfsync_softc *sc = pfsyncif;
2306 
2307 	if (sc->sc_len + pluslen > sc->sc_if.if_mtu)
2308 		pfsync_sendout();
2309 
2310 	sc->sc_plus = plus;
2311 	sc->sc_len += (sc->sc_pluslen = pluslen);
2312 
2313 	pfsync_sendout();
2314 }
2315 
2316 int
2317 pfsync_up(void)
2318 {
2319 	struct pfsync_softc *sc = pfsyncif;
2320 
2321 	if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING))
2322 		return (0);
2323 
2324 	return (1);
2325 }
2326 
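/*
 * Report whether pfsync still refers to this state, either on one of
 * its queues or via the bulk update walk markers, so it is not freed
 * out from under us.
 */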
2327 int
2328 pfsync_state_in_use(struct pf_state *st)
2329 {
2330 	struct pfsync_softc *sc = pfsyncif;
2331 
2332 	if (sc == NULL)
2333 		return (0);
2334 
2335 	if (st->sync_state != PFSYNC_S_NONE)
2336 		return (1);
2337 
2338 	if (sc->sc_bulk_next == NULL && sc->sc_bulk_last == NULL)
2339 		return (0);
2340 
2341 	return (1);
2342 }
2343 
2344 void
2345 pfsync_timeout(void *arg)
2346 {
2347 	int s;
2348 
2349 	s = splsoftnet();
2350 	pfsync_sendout();
2351 	splx(s);
2352 }
2353 
2354 /* this is a softnet/netisr handler */
2355 void
2356 pfsyncintr(void)
2357 {
2358 	pfsync_sendout();
2359 }
2360 
2361 int
2362 pfsync_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
2363     size_t newlen)
2364 {
2365 	/* All sysctl names at this level are terminal. */
2366 	if (namelen != 1)
2367 		return (ENOTDIR);
2368 
2369 	switch (name[0]) {
2370 	case PFSYNCCTL_STATS:
2371 		if (newp != NULL)
2372 			return (EPERM);
2373 		return (sysctl_struct(oldp, oldlenp, newp, newlen,
2374 		    &pfsyncstats, sizeof(pfsyncstats)));
2375 	default:
2376 		return (ENOPROTOOPT);
2377 	}
2378 }
2379