xref: /openbsd-src/sys/netinet/ip_carp.c (revision fc405d53b73a2d73393cb97f684863d17b583e38)
1 /*	$OpenBSD: ip_carp.c,v 1.357 2023/05/16 14:32:54 jan Exp $	*/
2 
3 /*
4  * Copyright (c) 2002 Michael Shalayeff. All rights reserved.
5  * Copyright (c) 2003 Ryan McBride. All rights reserved.
6  * Copyright (c) 2006-2008 Marco Pfatschbacher. All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
21  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
23  * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
25  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
26  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
27  * THE POSSIBILITY OF SUCH DAMAGE.
28  */
29 
30 /*
31  * TODO:
32  *	- iface reconfigure
33  *	- support for hardware checksum calculations;
34  *
35  */
36 
37 #include "ether.h"
38 
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/mbuf.h>
42 #include <sys/socket.h>
43 #include <sys/socketvar.h>
44 #include <sys/timeout.h>
45 #include <sys/ioctl.h>
46 #include <sys/errno.h>
47 #include <sys/device.h>
48 #include <sys/kernel.h>
49 #include <sys/sysctl.h>
50 #include <sys/syslog.h>
51 #include <sys/refcnt.h>
52 
53 #include <net/if.h>
54 #include <net/if_var.h>
55 #include <net/if_types.h>
56 #include <net/netisr.h>
57 
58 #include <crypto/sha1.h>
59 
60 #include <netinet/in.h>
61 #include <netinet/in_var.h>
62 #include <netinet/ip.h>
63 #include <netinet/ip_var.h>
64 #include <netinet/if_ether.h>
65 #include <netinet/ip_ipsp.h>
66 
67 #include <net/if_dl.h>
68 
69 #ifdef INET6
70 #include <netinet6/in6_var.h>
71 #include <netinet/icmp6.h>
72 #include <netinet/ip6.h>
73 #include <netinet6/ip6_var.h>
74 #include <netinet6/nd6.h>
75 #include <netinet6/in6_ifattach.h>
76 #endif
77 
78 #include "bpfilter.h"
79 #if NBPFILTER > 0
80 #include <net/bpf.h>
81 #endif
82 
83 #include "vlan.h"
84 #if NVLAN > 0
85 #include <net/if_vlan_var.h>
86 #endif
87 
88 #include <netinet/ip_carp.h>
89 
/*
 * Element of a carp_softc's carp_mc_listhead list.  Each element carries
 * a pointer to an ether_multi record and a sockaddr_storage address.
 */
struct carp_mc_entry {
	LIST_ENTRY(carp_mc_entry)	mc_entries;	/* list linkage */
	union {
		struct ether_multi	*mcu_enm;
	} mc_u;
	struct sockaddr_storage		mc_addr;
};
/* Shorthand for the only member of the mc_u union. */
#define	mc_enm	mc_u.mcu_enm
98 
/*
 * Indices into carp_vhost_entry.vhe_sha1[].  The HMAC_NOV6LL context is
 * built without the scope-embedded IPv6 addresses of the interface (see
 * carp_hmac_prepare_ctx()); HMAC_MAX is the number of contexts.
 */
enum { HMAC_ORIG=0, HMAC_NOV6LL=1, HMAC_MAX=2 };

/*
 * State kept for one virtual host id (vhid) of a carp interface.  Entries
 * are linked on the owning softc's carp_vhosts list.
 */
struct carp_vhost_entry {
	SRPL_ENTRY(carp_vhost_entry) vhost_entries;	/* carp_vhosts linkage */
	struct refcnt vhost_refcnt;

	struct carp_softc *parent_sc;	/* owning softc */
	int vhe_leader;		/* set on the first entry added to a softc */
	int vhid;
	int advskew;
	enum { INIT = 0, BACKUP, MASTER }	state;
	struct timeout ad_tmo;	/* advertisement timeout */
	struct timeout md_tmo;	/* master down timeout */
	/* second master down timeout; presumably the IPv6 one - TODO confirm */
	struct timeout md6_tmo;	/* master down timeout */

	u_int64_t vhe_replay_cookie;	/* sent as the advertisement counter */

	/* authentication */
#define CARP_HMAC_PAD	64
	unsigned char vhe_pad[CARP_HMAC_PAD];	/* HMAC key pad block */
	SHA1_CTX vhe_sha1[HMAC_MAX];	/* precomputed inner hash states */

	u_int8_t vhe_enaddr[ETHER_ADDR_LEN];
};
123 
/* Reference/unreference callbacks for carp_vhost_entry list elements. */
void	carp_vh_ref(void *, void *);
void	carp_vh_unref(void *, void *);

/* Refcount hooks passed to the SRPL_*_LOCKED operations on carp_vhosts. */
struct srpl_rc carp_vh_rc =
    SRPL_RC_INITIALIZER(carp_vh_ref, carp_vh_unref, NULL);
129 
/*
 * Per-interface state of a carp(4) interface.  The embedded arpcom holds
 * the struct ifnet (sc_if); the virtual hosts served by the interface are
 * kept on carp_vhosts.
 */
struct carp_softc {
	struct arpcom sc_ac;
#define	sc_if		sc_ac.ac_if
#define	sc_carpdevidx	sc_ac.ac_if.if_carpdevidx
	struct task sc_atask;	/* address hook, runs carp_addr_updated */
	struct task sc_ltask;	/* parent link state hook */
	struct task sc_dtask;	/* parent detach hook, runs carpdetach */
	struct ip_moptions sc_imo;
#ifdef INET6
	struct ip6_moptions sc_im6o;
#endif /* INET6 */

	SRPL_ENTRY(carp_softc) sc_list;	/* linkage on the parent's if_carp */
	struct refcnt sc_refcnt;

	int sc_suppress;
	int sc_bow_out;		/* advertise advbase/advskew 255 when set */
	int sc_demote_cnt;

	int sc_sendad_errors;
	/* thresholds scale with the number of vhost entries */
#define CARP_SENDAD_MAX_ERRORS(sc) (3 * (sc)->sc_vhe_count)
	int sc_sendad_success;
#define CARP_SENDAD_MIN_SUCCESS(sc) (3 * (sc)->sc_vhe_count)

	char sc_curlladdr[ETHER_ADDR_LEN];

	SRPL_HEAD(, carp_vhost_entry) carp_vhosts;
	int sc_vhe_count;	/* number of entries on carp_vhosts */
	u_int8_t sc_vhids[CARP_MAXNODES];
	u_int8_t sc_advskews[CARP_MAXNODES];
	u_int8_t sc_balancing;

	int sc_naddrs;
	int sc_naddrs6;
	int sc_advbase;		/* seconds */

	/* authentication */
	unsigned char sc_key[CARP_KEY_LEN];

	u_int32_t sc_hashkey[2];	/* set by carp_hmac_prepare_ctx() */
	u_int32_t sc_lsmask;		/* load sharing mask */
	int sc_lscount;			/* # load sharing interfaces (max 32) */
	int sc_delayed_arp;		/* delayed ARP request countdown */
	int sc_realmac;			/* using real mac */

	struct in_addr sc_peer;		/* IPv4 destination of advertisements */

	LIST_HEAD(__carp_mchead, carp_mc_entry)	carp_mc_listhead;
	struct carp_vhost_entry *cur_vhe; /* current active vhe */
};
180 
/* Reference/unreference callbacks for carp_softc list elements. */
void	carp_sc_ref(void *, void *);
void	carp_sc_unref(void *, void *);

/* Refcount hooks passed to the SRPL_*_LOCKED operations on if_carp lists. */
struct srpl_rc carp_sc_rc =
    SRPL_RC_INITIALIZER(carp_sc_ref, carp_sc_unref, NULL);

/* Tunables indexed by CARPCTL_* id; read and written through carp_sysctl(). */
int carp_opts[CARPCTL_MAXID] = { 0, 1, 0, LOG_CRIT };	/* XXX for now */
/* Statistics counters, allocated in carpattach(). */
struct cpumem *carpcounters;

/* Non-zero while carp_send_ad_all() is running; blocks nested invocations. */
int	carp_send_all_recur = 0;
191 
/*
 * Log a message when the configured log level carp_opts[CARPCTL_LOG] is
 * at least `l'.  The line is prefixed with the interface name of `sc', or
 * with "carp: " when `sc' is NULL.  `s' is a parenthesized addlog()
 * argument list, e.g. ("bad value %d", v).
 */
#define	CARP_LOG(l, sc, s)						\
	do {								\
		if (carp_opts[CARPCTL_LOG] >= l) {			\
			if (sc)						\
				log(l, "%s: ",				\
				    (sc)->sc_if.if_xname);		\
			else						\
				log(l, "carp: ");			\
			addlog s;					\
			addlog("\n");					\
		}							\
	} while (0)
204 
/*
 * Function prototypes.
 */

/* HMAC computation and verification */
void	carp_hmac_prepare(struct carp_softc *);
void	carp_hmac_prepare_ctx(struct carp_vhost_entry *, u_int8_t);
void	carp_hmac_generate(struct carp_vhost_entry *, u_int32_t *,
	    unsigned char *, u_int8_t);
int	carp_hmac_verify(struct carp_vhost_entry *, u_int32_t *,
	    unsigned char *);
/* packet input */
void	carp_proto_input_c(struct ifnet *, struct mbuf *,
	    struct carp_header *, int, sa_family_t);
int	carp_proto_input_if(struct ifnet *, struct mbuf **, int *, int);
#ifdef INET6
int	carp6_proto_input_if(struct ifnet *, struct mbuf **, int *, int);
#endif
void	carpattach(int);
void	carpdetach(void *);
/* advertisements and timers */
void	carp_prepare_ad(struct mbuf *, struct carp_vhost_entry *,
	    struct carp_header *);
void	carp_send_ad_all(void);
void	carp_vhe_send_ad_all(struct carp_softc *);
void	carp_timer_ad(void *);
void	carp_send_ad(struct carp_vhost_entry *);
void	carp_send_arp(struct carp_softc *);
void	carp_timer_down(void *);
void	carp_master_down(struct carp_vhost_entry *);
int	carp_ioctl(struct ifnet *, u_long, caddr_t);
int	carp_vhids_ioctl(struct carp_softc *, struct carpreq *);
int	carp_check_dup_vhids(struct carp_softc *, struct srpl *,
	    struct carpreq *);
void	carp_ifgroup_ioctl(struct ifnet *, u_long, caddr_t);
void	carp_ifgattr_ioctl(struct ifnet *, u_long, caddr_t);
void	carp_start(struct ifnet *);
int	carp_enqueue(struct ifnet *, struct mbuf *);
void	carp_transmit(struct carp_softc *, struct ifnet *, struct mbuf *);
void	carp_setrun_all(struct carp_softc *, sa_family_t);
void	carp_setrun(struct carp_vhost_entry *, sa_family_t);
void	carp_set_state_all(struct carp_softc *, int);
void	carp_set_state(struct carp_vhost_entry *, int);
void	carp_multicast_cleanup(struct carp_softc *);
int	carp_set_ifp(struct carp_softc *, struct ifnet *);
void	carp_set_enaddr(struct carp_softc *);
void	carp_set_vhe_enaddr(struct carp_vhost_entry *);
void	carp_addr_updated(void *);
int	carp_set_addr(struct carp_softc *, struct sockaddr_in *);
int	carp_join_multicast(struct carp_softc *);
#ifdef INET6
void	carp_send_na(struct carp_softc *);
int	carp_set_addr6(struct carp_softc *, struct sockaddr_in6 *);
int	carp_join_multicast6(struct carp_softc *);
#endif
/* interface cloning */
int	carp_clone_create(struct if_clone *, int);
int	carp_clone_destroy(struct ifnet *);
int	carp_ether_addmulti(struct carp_softc *, struct ifreq *);
int	carp_ether_delmulti(struct carp_softc *, struct ifreq *);
void	carp_ether_purgemulti(struct carp_softc *);
int	carp_group_demote_count(struct carp_softc *);
void	carp_update_lsmask(struct carp_softc *);
/* vhost entry management */
int	carp_new_vhost(struct carp_softc *, int, int);
void	carp_destroy_vhosts(struct carp_softc *);
void	carp_del_all_timeouts(struct carp_softc *);
int	carp_vhe_match(struct carp_softc *, uint64_t);
264 
/* Cloner for "carp" interfaces, registered in carpattach(). */
struct if_clone carp_cloner =
    IF_CLONE_INITIALIZER("carp", carp_clone_create, carp_clone_destroy);

/* in_cksum() over the first `_l' bytes of mbuf `_m', narrowed to 16 bits. */
#define carp_cksum(_m, _l)	((u_int16_t)in_cksum((_m), (_l)))
/* Value stored in m_pkthdr.pf.prio of outgoing advertisements. */
#define CARP_IFQ_PRIO	6
270 
271 void
272 carp_hmac_prepare(struct carp_softc *sc)
273 {
274 	struct carp_vhost_entry *vhe;
275 	u_int8_t i;
276 
277 	KERNEL_ASSERT_LOCKED(); /* touching carp_vhosts */
278 
279 	SRPL_FOREACH_LOCKED(vhe, &sc->carp_vhosts, vhost_entries) {
280 		for (i = 0; i < HMAC_MAX; i++) {
281 			carp_hmac_prepare_ctx(vhe, i);
282 		}
283 	}
284 }
285 
/*
 * Precompute, for one vhost entry and one context index `ctx', the part of
 * the inner HMAC-SHA1 hash that does not depend on the per-packet counter:
 * the ipad key block, protocol version and type, optionally the interface
 * lladdr, the vhid and the addresses configured on the carp interface in
 * ascending order.  The state is left in vhe->vhe_sha1[ctx] and is resumed
 * by carp_hmac_generate().  On return vhe->vhe_pad holds the opad block.
 * For the leader entry the arpbalance hash key sc_hashkey is derived too.
 */
void
carp_hmac_prepare_ctx(struct carp_vhost_entry *vhe, u_int8_t ctx)
{
	struct carp_softc *sc = vhe->parent_sc;

	u_int8_t version = CARP_VERSION, type = CARP_ADVERTISEMENT;
	u_int8_t vhid = vhe->vhid & 0xff;
	SHA1_CTX sha1ctx;
	u_int32_t kmd[5];
	struct ifaddr *ifa;
	int i, found;
	struct in_addr last, cur, in;
#ifdef INET6
	struct in6_addr last6, cur6, in6;
#endif /* INET6 */

	/* compute ipad from key */
	memset(vhe->vhe_pad, 0, sizeof(vhe->vhe_pad));
	bcopy(sc->sc_key, vhe->vhe_pad, sizeof(sc->sc_key));
	for (i = 0; i < sizeof(vhe->vhe_pad); i++)
		vhe->vhe_pad[i] ^= 0x36;

	/* precompute first part of inner hash */
	SHA1Init(&vhe->vhe_sha1[ctx]);
	SHA1Update(&vhe->vhe_sha1[ctx], vhe->vhe_pad, sizeof(vhe->vhe_pad));
	SHA1Update(&vhe->vhe_sha1[ctx], (void *)&version, sizeof(version));
	SHA1Update(&vhe->vhe_sha1[ctx], (void *)&type, sizeof(type));

	/* generate a key for the arpbalance hash, before the vhid is hashed */
	if (vhe->vhe_leader) {
		/* finalize a copy so the running context stays usable */
		bcopy(&vhe->vhe_sha1[ctx], &sha1ctx, sizeof(sha1ctx));
		SHA1Final((unsigned char *)kmd, &sha1ctx);
		sc->sc_hashkey[0] = kmd[0] ^ kmd[1];
		sc->sc_hashkey[1] = kmd[2] ^ kmd[3];
	}

	/* the rest of the precomputation */
	/*
	 * The interface lladdr is hashed only for the leader, when not using
	 * the real mac, and when it differs from the entry's own address.
	 */
	if (!sc->sc_realmac && vhe->vhe_leader &&
	    memcmp(sc->sc_ac.ac_enaddr, vhe->vhe_enaddr, ETHER_ADDR_LEN) != 0)
		SHA1Update(&vhe->vhe_sha1[ctx], sc->sc_ac.ac_enaddr,
		    ETHER_ADDR_LEN);

	SHA1Update(&vhe->vhe_sha1[ctx], (void *)&vhid, sizeof(vhid));

	/* Hash the addresses from smallest to largest, not interface order */
	/*
	 * Each pass picks the smallest address that is strictly greater than
	 * the one hashed in the previous pass (compared in host byte order)
	 * and strictly smaller than the all-ones address; the loop ends when
	 * a pass finds nothing.
	 */
	cur.s_addr = 0;
	do {
		found = 0;
		last = cur;
		cur.s_addr = 0xffffffff;
		TAILQ_FOREACH(ifa, &sc->sc_if.if_addrlist, ifa_list) {
			if (ifa->ifa_addr->sa_family != AF_INET)
				continue;
			in.s_addr = ifatoia(ifa)->ia_addr.sin_addr.s_addr;
			if (ntohl(in.s_addr) > ntohl(last.s_addr) &&
			    ntohl(in.s_addr) < ntohl(cur.s_addr)) {
				cur.s_addr = in.s_addr;
				found++;
			}
		}
		if (found)
			SHA1Update(&vhe->vhe_sha1[ctx],
			    (void *)&cur, sizeof(cur));
	} while (found);
#ifdef INET6
	/* same selection scheme for IPv6, comparing addresses with memcmp() */
	memset(&cur6, 0x00, sizeof(cur6));
	do {
		found = 0;
		last6 = cur6;
		memset(&cur6, 0xff, sizeof(cur6));
		TAILQ_FOREACH(ifa, &sc->sc_if.if_addrlist, ifa_list) {
			if (ifa->ifa_addr->sa_family != AF_INET6)
				continue;
			in6 = ifatoia6(ifa)->ia_addr.sin6_addr;
			if (IN6_IS_SCOPE_EMBED(&in6)) {
				/* HMAC_NOV6LL leaves these addresses out */
				if (ctx == HMAC_NOV6LL)
					continue;
				/* hash the address with word 1 cleared */
				in6.s6_addr16[1] = 0;
			}
			if (memcmp(&in6, &last6, sizeof(in6)) > 0 &&
			    memcmp(&in6, &cur6, sizeof(in6)) < 0) {
				cur6 = in6;
				found++;
			}
		}
		if (found)
			SHA1Update(&vhe->vhe_sha1[ctx],
			    (void *)&cur6, sizeof(cur6));
	} while (found);
#endif /* INET6 */

	/* convert ipad to opad */
	for (i = 0; i < sizeof(vhe->vhe_pad); i++)
		vhe->vhe_pad[i] ^= 0x36 ^ 0x5c;
}
381 
382 void
383 carp_hmac_generate(struct carp_vhost_entry *vhe, u_int32_t counter[2],
384     unsigned char md[20], u_int8_t ctx)
385 {
386 	SHA1_CTX sha1ctx;
387 
388 	/* fetch first half of inner hash */
389 	bcopy(&vhe->vhe_sha1[ctx], &sha1ctx, sizeof(sha1ctx));
390 
391 	SHA1Update(&sha1ctx, (void *)counter, sizeof(vhe->vhe_replay_cookie));
392 	SHA1Final(md, &sha1ctx);
393 
394 	/* outer hash */
395 	SHA1Init(&sha1ctx);
396 	SHA1Update(&sha1ctx, vhe->vhe_pad, sizeof(vhe->vhe_pad));
397 	SHA1Update(&sha1ctx, md, 20);
398 	SHA1Final(md, &sha1ctx);
399 }
400 
401 int
402 carp_hmac_verify(struct carp_vhost_entry *vhe, u_int32_t counter[2],
403     unsigned char md[20])
404 {
405 	unsigned char md2[20];
406 	u_int8_t i;
407 
408 	for (i = 0; i < HMAC_MAX; i++) {
409 		carp_hmac_generate(vhe, counter, md2, i);
410 		if (!timingsafe_bcmp(md, md2, sizeof(md2)))
411 			return (0);
412 	}
413 	return (1);
414 }
415 
416 int
417 carp_proto_input(struct mbuf **mp, int *offp, int proto, int af)
418 {
419 	struct ifnet *ifp;
420 
421 	ifp = if_get((*mp)->m_pkthdr.ph_ifidx);
422 	if (ifp == NULL) {
423 		m_freemp(mp);
424 		return IPPROTO_DONE;
425 	}
426 
427 	proto = carp_proto_input_if(ifp, mp, offp, proto);
428 	if_put(ifp);
429 	return proto;
430 }
431 
432 /*
433  * process input packet.
434  * we have rearranged checks order compared to the rfc,
435  * but it seems more efficient this way or not possible otherwise.
436  */
437 int
438 carp_proto_input_if(struct ifnet *ifp, struct mbuf **mp, int *offp, int proto)
439 {
440 	struct mbuf *m = *mp;
441 	struct ip *ip = mtod(m, struct ip *);
442 	struct carp_softc *sc = NULL;
443 	struct carp_header *ch;
444 	int iplen, len, ismulti;
445 
446 	carpstat_inc(carps_ipackets);
447 
448 	if (!carp_opts[CARPCTL_ALLOW]) {
449 		m_freem(m);
450 		return IPPROTO_DONE;
451 	}
452 
453 	ismulti = IN_MULTICAST(ip->ip_dst.s_addr);
454 
455 	/* check if received on a valid carp interface */
456 	switch (ifp->if_type) {
457 	case IFT_CARP:
458 		break;
459 	case IFT_ETHER:
460 		if (ismulti || !SRPL_EMPTY_LOCKED(&ifp->if_carp))
461 			break;
462 		/* FALLTHROUGH */
463 	default:
464 		carpstat_inc(carps_badif);
465 		CARP_LOG(LOG_INFO, sc,
466 		    ("packet received on non-carp interface: %s",
467 		     ifp->if_xname));
468 		m_freem(m);
469 		return IPPROTO_DONE;
470 	}
471 
472 	/* verify that the IP TTL is 255.  */
473 	if (ip->ip_ttl != CARP_DFLTTL) {
474 		carpstat_inc(carps_badttl);
475 		CARP_LOG(LOG_NOTICE, sc, ("received ttl %d != %d on %s",
476 		    ip->ip_ttl, CARP_DFLTTL, ifp->if_xname));
477 		m_freem(m);
478 		return IPPROTO_DONE;
479 	}
480 
481 	/*
482 	 * verify that the received packet length is
483 	 * equal to the CARP header
484 	 */
485 	iplen = ip->ip_hl << 2;
486 	len = iplen + sizeof(*ch);
487 	if (len > m->m_pkthdr.len) {
488 		carpstat_inc(carps_badlen);
489 		CARP_LOG(LOG_INFO, sc, ("packet too short %d on %s",
490 		    m->m_pkthdr.len, ifp->if_xname));
491 		m_freem(m);
492 		return IPPROTO_DONE;
493 	}
494 
495 	if ((m = *mp = m_pullup(m, len)) == NULL) {
496 		carpstat_inc(carps_hdrops);
497 		return IPPROTO_DONE;
498 	}
499 	ip = mtod(m, struct ip *);
500 	ch = (struct carp_header *)(mtod(m, caddr_t) + iplen);
501 
502 	/* verify the CARP checksum */
503 	m->m_data += iplen;
504 	if (carp_cksum(m, len - iplen)) {
505 		carpstat_inc(carps_badsum);
506 		CARP_LOG(LOG_INFO, sc, ("checksum failed on %s",
507 		    ifp->if_xname));
508 		m_freem(m);
509 		return IPPROTO_DONE;
510 	}
511 	m->m_data -= iplen;
512 
513 	KERNEL_LOCK();
514 	carp_proto_input_c(ifp, m, ch, ismulti, AF_INET);
515 	KERNEL_UNLOCK();
516 	return IPPROTO_DONE;
517 }
518 
519 #ifdef INET6
520 int
521 carp6_proto_input(struct mbuf **mp, int *offp, int proto, int af)
522 {
523 	struct ifnet *ifp;
524 
525 	ifp = if_get((*mp)->m_pkthdr.ph_ifidx);
526 	if (ifp == NULL) {
527 		m_freemp(mp);
528 		return IPPROTO_DONE;
529 	}
530 
531 	proto = carp6_proto_input_if(ifp, mp, offp, proto);
532 	if_put(ifp);
533 	return proto;
534 }
535 
536 int
537 carp6_proto_input_if(struct ifnet *ifp, struct mbuf **mp, int *offp, int proto)
538 {
539 	struct mbuf *m = *mp;
540 	struct carp_softc *sc = NULL;
541 	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
542 	struct carp_header *ch;
543 	u_int len;
544 
545 	carpstat_inc(carps_ipackets6);
546 
547 	if (!carp_opts[CARPCTL_ALLOW]) {
548 		m_freem(m);
549 		return IPPROTO_DONE;
550 	}
551 
552 	/* check if received on a valid carp interface */
553 	if (ifp->if_type != IFT_CARP) {
554 		carpstat_inc(carps_badif);
555 		CARP_LOG(LOG_INFO, sc, ("packet received on non-carp interface: %s",
556 		    ifp->if_xname));
557 		m_freem(m);
558 		return IPPROTO_DONE;
559 	}
560 
561 	/* verify that the IP TTL is 255 */
562 	if (ip6->ip6_hlim != CARP_DFLTTL) {
563 		carpstat_inc(carps_badttl);
564 		CARP_LOG(LOG_NOTICE, sc, ("received ttl %d != %d on %s",
565 		    ip6->ip6_hlim, CARP_DFLTTL, ifp->if_xname));
566 		m_freem(m);
567 		return IPPROTO_DONE;
568 	}
569 
570 	/* verify that we have a complete carp packet */
571 	len = m->m_len;
572 	if ((m = *mp = m_pullup(m, *offp + sizeof(*ch))) == NULL) {
573 		carpstat_inc(carps_badlen);
574 		CARP_LOG(LOG_INFO, sc, ("packet size %u too small", len));
575 		return IPPROTO_DONE;
576 	}
577 	ch = (struct carp_header *)(mtod(m, caddr_t) + *offp);
578 
579 	/* verify the CARP checksum */
580 	m->m_data += *offp;
581 	if (carp_cksum(m, sizeof(*ch))) {
582 		carpstat_inc(carps_badsum);
583 		CARP_LOG(LOG_INFO, sc, ("checksum failed, on %s",
584 		    ifp->if_xname));
585 		m_freem(m);
586 		return IPPROTO_DONE;
587 	}
588 	m->m_data -= *offp;
589 
590 	KERNEL_LOCK();
591 	carp_proto_input_c(ifp, m, ch, 1, AF_INET6);
592 	KERNEL_UNLOCK();
593 	return IPPROTO_DONE;
594 }
595 #endif /* INET6 */
596 
/*
 * Address family independent part of advertisement input.
 *
 * Finds the carp softc and vhost entry whose vhid matches the received
 * header `ch', validates version, HMAC and replay cookie, and then runs
 * the MASTER/BACKUP decision for that vhost entry.  `ifp' is the
 * receiving interface, `ismulti' tells whether the packet was sent to a
 * multicast destination and `af' is AF_INET or AF_INET6.  The mbuf `m' is
 * always freed before returning.  Must be called with the kernel lock
 * held.
 */
void
carp_proto_input_c(struct ifnet *ifp, struct mbuf *m, struct carp_header *ch,
    int ismulti, sa_family_t af)
{
	struct carp_softc *sc;
	struct ifnet *ifp0;
	struct carp_vhost_entry *vhe;
	struct timeval sc_tv, ch_tv;
	struct srpl *cif;

	KERNEL_ASSERT_LOCKED(); /* touching if_carp + carp_vhosts */

	ifp0 = if_get(ifp->if_carpdevidx);

	/*
	 * The softc list to search hangs off the parent when the packet came
	 * in on a carp interface, otherwise off the receiving interface.
	 */
	if (ifp->if_type == IFT_CARP) {
		/*
		 * If the parent of this carp(4) got destroyed while
		 * `m' was being processed, silently drop it.
		 */
		if (ifp0 == NULL)
			goto rele;
		cif = &ifp0->if_carp;
	} else
		cif = &ifp->if_carp;

	SRPL_FOREACH_LOCKED(sc, cif, sc_list) {
		/* for IPv4, the peer must be multicast iff the packet was */
		if (af == AF_INET &&
		    ismulti != IN_MULTICAST(sc->sc_peer.s_addr))
			continue;
		SRPL_FOREACH_LOCKED(vhe, &sc->carp_vhosts, vhost_entries) {
			if (vhe->vhid == ch->carp_vhid)
				goto found;
		}
	}
 found:

	/* no match (the search ended with sc == NULL) or interface not running */
	if (!sc || (sc->sc_if.if_flags & (IFF_UP|IFF_RUNNING)) !=
	    (IFF_UP|IFF_RUNNING)) {
		carpstat_inc(carps_badvhid);
		goto rele;
	}

	getmicrotime(&sc->sc_if.if_lastchange);

	/* verify the CARP version. */
	if (ch->carp_version != CARP_VERSION) {
		carpstat_inc(carps_badver);
		sc->sc_if.if_ierrors++;
		CARP_LOG(LOG_NOTICE, sc, ("invalid version %d != %d",
		    ch->carp_version, CARP_VERSION));
		goto rele;
	}

	/* verify the hash */
	if (carp_hmac_verify(vhe, ch->carp_counter, ch->carp_md)) {
		carpstat_inc(carps_badauth);
		sc->sc_if.if_ierrors++;
		CARP_LOG(LOG_INFO, sc, ("incorrect hash"));
		goto rele;
	}

	/* a counter equal to our own replay cookie means we sent this packet */
	if (!memcmp(&vhe->vhe_replay_cookie, ch->carp_counter,
	    sizeof(ch->carp_counter))) {
		struct ifnet *ifp2;

		ifp2 = if_get(sc->sc_carpdevidx);
		/* Do not log duplicates from non simplex interfaces */
		if (ifp2 && ifp2->if_flags & IFF_SIMPLEX) {
			carpstat_inc(carps_badauth);
			sc->sc_if.if_ierrors++;
			CARP_LOG(LOG_WARNING, sc,
			    ("replay or network loop detected"));
		}
		if_put(ifp2);
		goto rele;
	}

	/*
	 * Express our own and the sender's advertisement interval as
	 * timevals: advbase seconds plus advskew/256 of a second.
	 */
	sc_tv.tv_sec = sc->sc_advbase;
	sc_tv.tv_usec = vhe->advskew * 1000000 / 256;
	ch_tv.tv_sec = ch->carp_advbase;
	ch_tv.tv_usec = ch->carp_advskew * 1000000 / 256;

	switch (vhe->state) {
	case INIT:
		break;
	case MASTER:
		/*
		 * If we receive an advertisement from a master who's going to
		 * be more frequent than us, and whose demote count is not higher
		 * than ours, go into BACKUP state. If his demote count is lower,
		 * also go into BACKUP.
		 */
		if (((timercmp(&sc_tv, &ch_tv, >) ||
		    timercmp(&sc_tv, &ch_tv, ==)) &&
		    (ch->carp_demote <= carp_group_demote_count(sc))) ||
		    ch->carp_demote < carp_group_demote_count(sc)) {
			timeout_del(&vhe->ad_tmo);
			carp_set_state(vhe, BACKUP);
			carp_setrun(vhe, 0);
		}
		break;
	case BACKUP:
		/*
		 * If we're pre-empting masters who advertise slower than us,
		 * and do not have a better demote count, treat them as down.
		 *
		 */
		if (carp_opts[CARPCTL_PREEMPT] &&
		    timercmp(&sc_tv, &ch_tv, <) &&
		    ch->carp_demote >= carp_group_demote_count(sc)) {
			carp_master_down(vhe);
			break;
		}

		/*
		 * Take over masters advertising with a higher demote count,
		 * regardless of CARPCTL_PREEMPT.
		 */
		if (ch->carp_demote > carp_group_demote_count(sc)) {
			carp_master_down(vhe);
			break;
		}

		/*
		 *  If the master is going to advertise at such a low frequency
		 *  that he's guaranteed to time out, we'd might as well just
		 *  treat him as timed out now.
		 */
		sc_tv.tv_sec = sc->sc_advbase * 3;
		if (sc->sc_advbase && timercmp(&sc_tv, &ch_tv, <)) {
			carp_master_down(vhe);
			break;
		}

		/*
		 * Otherwise, we reset the counter and wait for the next
		 * advertisement.
		 */
		carp_setrun(vhe, af);
		break;
	}

rele:
	/* common exit: drop the parent reference (may be NULL) and the packet */
	if_put(ifp0);
	m_freem(m);
	return;
}
744 
745 int
746 carp_sysctl_carpstat(void *oldp, size_t *oldlenp, void *newp)
747 {
748 	struct carpstats carpstat;
749 
750 	CTASSERT(sizeof(carpstat) == (carps_ncounters * sizeof(uint64_t)));
751 	memset(&carpstat, 0, sizeof carpstat);
752 	counters_read(carpcounters, (uint64_t *)&carpstat, carps_ncounters);
753 	return (sysctl_rdstruct(oldp, oldlenp, newp,
754 	    &carpstat, sizeof(carpstat)));
755 }
756 
757 int
758 carp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
759     size_t newlen)
760 {
761 	int error;
762 
763 	/* All sysctl names at this level are terminal. */
764 	if (namelen != 1)
765 		return (ENOTDIR);
766 
767 	switch (name[0]) {
768 	case CARPCTL_STATS:
769 		return (carp_sysctl_carpstat(oldp, oldlenp, newp));
770 	default:
771 		if (name[0] <= 0 || name[0] >= CARPCTL_MAXID)
772 			return (ENOPROTOOPT);
773 		NET_LOCK();
774 		error = sysctl_int(oldp, oldlenp, newp, newlen,
775 		    &carp_opts[name[0]]);
776 		NET_UNLOCK();
777 		return (error);
778 	}
779 }
780 
781 /*
782  * Interface side of the CARP implementation.
783  */
784 
/*
 * Pseudo-device attach routine: creates the "carp" interface group,
 * registers the interface cloner and allocates the statistics counters.
 * The argument `n' is not used.
 */
void
carpattach(int n)
{
	if_creategroup("carp");  /* keep around even if empty */
	if_clone_attach(&carp_cloner);
	carpcounters = counters_alloc(carps_ncounters);
}
792 
/*
 * Cloner callback: create the carp interface numbered `unit'.
 *
 * Allocates a zeroed softc, gives it an initial vhost entry (vhid 0,
 * advskew 0), prepares the hook tasks and multicast options, and attaches
 * the interface as an Ethernet interface whose type is then changed to
 * IFT_CARP.  Returns 0 on success or ENOMEM when the initial vhost entry
 * cannot be allocated.
 */
int
carp_clone_create(struct if_clone *ifc, int unit)
{
	struct carp_softc *sc;
	struct ifnet *ifp;

	sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO);
	refcnt_init(&sc->sc_refcnt);

	SRPL_INIT(&sc->carp_vhosts);
	sc->sc_vhe_count = 0;
	/* every softc starts with one vhost entry, which becomes the leader */
	if (carp_new_vhost(sc, 0, 0)) {
		free(sc, M_DEVBUF, sizeof(*sc));
		return (ENOMEM);
	}

	/* tasks are only initialized here; they get hooked up later */
	task_set(&sc->sc_atask, carp_addr_updated, sc);
	task_set(&sc->sc_ltask, carp_carpdev_state, sc);
	task_set(&sc->sc_dtask, carpdetach, sc);

	sc->sc_suppress = 0;
	sc->sc_advbase = CARP_DFLTINTV;
	sc->sc_naddrs = sc->sc_naddrs6 = 0;
#ifdef INET6
	sc->sc_im6o.im6o_hlim = CARP_DFLTTL;
#endif /* INET6 */
	/* membership array is released again in carp_clone_destroy() */
	sc->sc_imo.imo_membership = mallocarray(IP_MIN_MEMBERSHIPS,
	    sizeof(struct in_multi *), M_IPMOPTS, M_WAITOK|M_ZERO);
	sc->sc_imo.imo_max_memberships = IP_MIN_MEMBERSHIPS;

	LIST_INIT(&sc->carp_mc_listhead);
	ifp = &sc->sc_if;
	ifp->if_softc = sc;
	snprintf(ifp->if_xname, sizeof ifp->if_xname, "%s%d", ifc->ifc_name,
	    unit);
	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_ioctl = carp_ioctl;
	ifp->if_start = carp_start;
	ifp->if_enqueue = carp_enqueue;
	ifp->if_xflags = IFXF_CLONED;
	if_counters_alloc(ifp);
	if_attach(ifp);
	ether_ifattach(ifp);
	/* override what ether_ifattach() set up with the carp specifics */
	ifp->if_type = IFT_CARP;
	ifp->if_sadl->sdl_type = IFT_CARP;
	ifp->if_output = carp_output;
	ifp->if_priority = IF_CARP_DEFAULT_PRIORITY;
	ifp->if_link_state = LINK_STATE_INVALID;

	/* Hook carp_addr_updated to cope with address and route changes. */
	if_addrhook_add(&sc->sc_if, &sc->sc_atask);

	return (0);
}
847 
848 int
849 carp_new_vhost(struct carp_softc *sc, int vhid, int advskew)
850 {
851 	struct carp_vhost_entry *vhe, *vhe0;
852 
853 	vhe = malloc(sizeof(*vhe), M_DEVBUF, M_NOWAIT | M_ZERO);
854 	if (vhe == NULL)
855 		return (ENOMEM);
856 
857 	refcnt_init(&vhe->vhost_refcnt);
858 	carp_sc_ref(NULL, sc); /* give a sc ref to the vhe */
859 	vhe->parent_sc = sc;
860 	vhe->vhid = vhid;
861 	vhe->advskew = advskew;
862 	vhe->state = INIT;
863 	timeout_set_proc(&vhe->ad_tmo, carp_timer_ad, vhe);
864 	timeout_set_proc(&vhe->md_tmo, carp_timer_down, vhe);
865 	timeout_set_proc(&vhe->md6_tmo, carp_timer_down, vhe);
866 
867 	KERNEL_ASSERT_LOCKED(); /* touching carp_vhosts */
868 
869 	/* mark the first vhe as leader */
870 	if (SRPL_EMPTY_LOCKED(&sc->carp_vhosts)) {
871 		vhe->vhe_leader = 1;
872 		SRPL_INSERT_HEAD_LOCKED(&carp_vh_rc, &sc->carp_vhosts,
873 		    vhe, vhost_entries);
874 		sc->sc_vhe_count = 1;
875 		return (0);
876 	}
877 
878 	SRPL_FOREACH_LOCKED(vhe0, &sc->carp_vhosts, vhost_entries) {
879 		if (SRPL_NEXT_LOCKED(vhe0, vhost_entries) == NULL)
880 			break;
881 	}
882 
883 	SRPL_INSERT_AFTER_LOCKED(&carp_vh_rc, vhe0, vhe, vhost_entries);
884 	sc->sc_vhe_count++;
885 
886 	return (0);
887 }
888 
/*
 * Cloner callback: destroy the carp interface `ifp'.
 *
 * Removes the address hook, detaches from the parent under the net lock,
 * detaches the interface itself, destroys the vhost entries, waits for
 * the remaining softc references to go away and finally frees the
 * multicast membership array and the softc.  Always returns 0.
 */
int
carp_clone_destroy(struct ifnet *ifp)
{
	struct carp_softc *sc = ifp->if_softc;

	if_addrhook_del(&sc->sc_if, &sc->sc_atask);

	NET_LOCK();
	carpdetach(sc);
	NET_UNLOCK();

	ether_ifdetach(ifp);
	if_detach(ifp);
	/* each vhost entry holds a softc reference (see carp_new_vhost()) */
	carp_destroy_vhosts(ifp->if_softc);
	refcnt_finalize(&sc->sc_refcnt, "carpdtor");
	free(sc->sc_imo.imo_membership, M_IPMOPTS,
	    sc->sc_imo.imo_max_memberships * sizeof(struct in_multi *));
	free(sc, M_DEVBUF, sizeof(*sc));
	return (0);
}
909 
910 void
911 carp_del_all_timeouts(struct carp_softc *sc)
912 {
913 	struct carp_vhost_entry *vhe;
914 
915 	KERNEL_ASSERT_LOCKED(); /* touching carp_vhosts */
916 	SRPL_FOREACH_LOCKED(vhe, &sc->carp_vhosts, vhost_entries) {
917 		timeout_del(&vhe->ad_tmo);
918 		timeout_del(&vhe->md_tmo);
919 		timeout_del(&vhe->md6_tmo);
920 	}
921 }
922 
/*
 * Detach the carp softc `arg' from its parent interface.  Also used as
 * the handler of the sc_dtask detach hook task.
 *
 * Stops all timeouts, takes back this interface's demotion, puts every
 * vhost entry into INIT state, marks the interface down and cleans up its
 * multicast state.  If a parent interface is still set, the softc is
 * removed from the parent's if_carp list and the link state and detach
 * hooks registered on the parent are removed.
 */
void
carpdetach(void *arg)
{
	struct carp_softc *sc = arg;
	struct ifnet *ifp0;
	struct srpl *cif;

	carp_del_all_timeouts(sc);

	if (sc->sc_demote_cnt)
		carp_group_demote_adj(&sc->sc_if, -sc->sc_demote_cnt, "detach");
	sc->sc_suppress = 0;
	sc->sc_sendad_errors = 0;

	carp_set_state_all(sc, INIT);
	sc->sc_if.if_flags &= ~IFF_UP;
	carp_setrun_all(sc, 0);
	carp_multicast_cleanup(sc);

	/* nothing more to undo when there is no parent interface */
	ifp0 = if_get(sc->sc_carpdevidx);
	if (ifp0 == NULL)
		return;

	KERNEL_ASSERT_LOCKED(); /* touching if_carp */

	cif = &ifp0->if_carp;

	SRPL_REMOVE_LOCKED(&carp_sc_rc, cif, sc, carp_softc, sc_list);
	sc->sc_carpdevidx = 0;

	if_linkstatehook_del(ifp0, &sc->sc_ltask);
	if_detachhook_del(ifp0, &sc->sc_dtask);
	ifpromisc(ifp0, 0);
	if_put(ifp0);
}
958 
959 void
960 carp_destroy_vhosts(struct carp_softc *sc)
961 {
962 	/* XXX bow out? */
963 	struct carp_vhost_entry *vhe;
964 
965 	KERNEL_ASSERT_LOCKED(); /* touching carp_vhosts */
966 
967 	while ((vhe = SRPL_FIRST_LOCKED(&sc->carp_vhosts)) != NULL) {
968 		SRPL_REMOVE_LOCKED(&carp_vh_rc, &sc->carp_vhosts, vhe,
969 		    carp_vhost_entry, vhost_entries);
970 		carp_vh_unref(NULL, vhe); /* drop last ref */
971 	}
972 	sc->sc_vhe_count = 0;
973 }
974 
975 void
976 carp_prepare_ad(struct mbuf *m, struct carp_vhost_entry *vhe,
977     struct carp_header *ch)
978 {
979 	if (!vhe->vhe_replay_cookie) {
980 		arc4random_buf(&vhe->vhe_replay_cookie,
981 		    sizeof(vhe->vhe_replay_cookie));
982 	}
983 
984 	bcopy(&vhe->vhe_replay_cookie, ch->carp_counter,
985 	    sizeof(ch->carp_counter));
986 
987 	/*
988 	 * For the time being, do not include the IPv6 linklayer addresses
989 	 * in the HMAC.
990 	 */
991 	carp_hmac_generate(vhe, ch->carp_counter, ch->carp_md, HMAC_NOV6LL);
992 }
993 
994 void
995 carp_send_ad_all(void)
996 {
997 	struct ifnet *ifp0;
998 	struct srpl *cif;
999 	struct carp_softc *vh;
1000 
1001 	KERNEL_ASSERT_LOCKED(); /* touching if_carp */
1002 
1003 	if (carp_send_all_recur > 0)
1004 		return;
1005 	++carp_send_all_recur;
1006 	TAILQ_FOREACH(ifp0, &ifnetlist, if_list) {
1007 		if (ifp0->if_type != IFT_ETHER)
1008 			continue;
1009 
1010 		cif = &ifp0->if_carp;
1011 		SRPL_FOREACH_LOCKED(vh, cif, sc_list) {
1012 			if ((vh->sc_if.if_flags & (IFF_UP|IFF_RUNNING)) ==
1013 			    (IFF_UP|IFF_RUNNING)) {
1014 				carp_vhe_send_ad_all(vh);
1015 			}
1016 		}
1017 	}
1018 	--carp_send_all_recur;
1019 }
1020 
1021 void
1022 carp_vhe_send_ad_all(struct carp_softc *sc)
1023 {
1024 	struct carp_vhost_entry *vhe;
1025 
1026 	KERNEL_ASSERT_LOCKED(); /* touching carp_vhosts */
1027 
1028 	SRPL_FOREACH_LOCKED(vhe, &sc->carp_vhosts, vhost_entries) {
1029 		if (vhe->state == MASTER)
1030 			carp_send_ad(vhe);
1031 	}
1032 }
1033 
/*
 * Advertisement timeout handler; `v' is the vhost entry the timeout was
 * set up for.  Sends one advertisement under the net lock.
 */
void
carp_timer_ad(void *v)
{
	struct carp_vhost_entry *vhe = v;

	NET_LOCK();
	carp_send_ad(vhe);
	NET_UNLOCK();
}
1041 
/*
 * Build and transmit one CARP advertisement for vhe on each address
 * family configured on the parent carp interface (IPv4 through
 * ip_output(), IPv6 through ip6_output()), update the send-error
 * accounting used for group demotion, and re-arm the advertisement
 * timeout unless the interface is bowing out.
 *
 * The net lock must be held.  If the parent device cannot be looked up
 * the output error counter is bumped and nothing is sent.
 */
void
carp_send_ad(struct carp_vhost_entry *vhe)
{
	struct carp_header ch;
	struct timeval tv;
	struct carp_softc *sc = vhe->parent_sc;
	struct carp_header *ch_ptr;
	struct mbuf *m;
	int error, len, advbase, advskew;
	struct ifnet *ifp;
	struct ifaddr *ifa;
	struct sockaddr sa;

	NET_ASSERT_LOCKED();

	/* takes a reference on the parent device; released at the end */
	if ((ifp = if_get(sc->sc_carpdevidx)) == NULL) {
		sc->sc_if.if_oerrors++;
		return;
	}

	/* bow out if we've gone to backup (the carp interface is going down) */
	if (sc->sc_bow_out) {
		/*
		 * tv is deliberately left unset here: with both values at
		 * 255 the timeout is not re-armed at retry_later.
		 */
		advbase = 255;
		advskew = 255;
	} else {
		advbase = sc->sc_advbase;
		advskew = vhe->advskew;
		/* interval until the next advertisement */
		tv.tv_sec = advbase;
		if (advbase == 0 && advskew == 0)
			tv.tv_usec = 1 * 1000000 / 256;
		else
			tv.tv_usec = advskew * 1000000 / 256;
	}

	/* header template shared by the IPv4 and IPv6 packets */
	ch.carp_version = CARP_VERSION;
	ch.carp_type = CARP_ADVERTISEMENT;
	ch.carp_vhid = vhe->vhid;
	ch.carp_demote = carp_group_demote_count(sc) & 0xff;
	ch.carp_advbase = advbase;
	ch.carp_advskew = advskew;
	ch.carp_authlen = 7;	/* XXX DEFINE */
	ch.carp_cksum = 0;

	sc->cur_vhe = vhe; /* we need the vhe later on the output path */

	if (sc->sc_naddrs) {
		struct ip *ip;

		MGETHDR(m, M_DONTWAIT, MT_HEADER);
		if (m == NULL) {
			sc->sc_if.if_oerrors++;
			carpstat_inc(carps_onomem);
			/* XXX maybe less ? */
			goto retry_later;
		}
		len = sizeof(*ip) + sizeof(ch);
		m->m_pkthdr.pf.prio = CARP_IFQ_PRIO;
		m->m_pkthdr.ph_rtableid = sc->sc_if.if_rdomain;
		m->m_pkthdr.len = len;
		m->m_len = len;
		m_align(m, len);
		ip = mtod(m, struct ip *);
		ip->ip_v = IPVERSION;
		ip->ip_hl = sizeof(*ip) >> 2;
		ip->ip_tos = IPTOS_LOWDELAY;
		ip->ip_len = htons(len);
		ip->ip_id = htons(ip_randomid());
		ip->ip_off = htons(IP_DF);
		ip->ip_ttl = CARP_DFLTTL;
		ip->ip_p = IPPROTO_CARP;
		ip->ip_sum = 0;

		memset(&sa, 0, sizeof(sa));
		sa.sa_family = AF_INET;
		/* Prefer addresses on the parent interface as source for AD. */
		ifa = ifaof_ifpforaddr(&sa, ifp);
		if (ifa == NULL)
			ifa = ifaof_ifpforaddr(&sa, &sc->sc_if);
		KASSERT(ifa != NULL);
		ip->ip_src.s_addr = ifatoia(ifa)->ia_addr.sin_addr.s_addr;
		ip->ip_dst.s_addr = sc->sc_peer.s_addr;
		if (IN_MULTICAST(ip->ip_dst.s_addr))
			m->m_flags |= M_MCAST;

		/* the CARP header directly follows the IP header */
		ch_ptr = (struct carp_header *)(ip + 1);
		bcopy(&ch, ch_ptr, sizeof(ch));
		carp_prepare_ad(m, vhe, ch_ptr);

		/*
		 * Step m_data past the IP header while checksumming so that
		 * carp_cksum() is handed only the CARP part, then restore it.
		 */
		m->m_data += sizeof(*ip);
		ch_ptr->carp_cksum = carp_cksum(m, len - sizeof(*ip));
		m->m_data -= sizeof(*ip);

		getmicrotime(&sc->sc_if.if_lastchange);
		carpstat_inc(carps_opackets);

		error = ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo,
		    NULL, 0);
		if (error &&
		    /* when unicast, the peer's down is not our fault */
		    !(!IN_MULTICAST(sc->sc_peer.s_addr) && error == EHOSTDOWN)){
			if (error == ENOBUFS)
				carpstat_inc(carps_onomem);
			else
				CARP_LOG(LOG_WARNING, sc,
				    ("ip_output failed: %d", error));
			sc->sc_if.if_oerrors++;
			/* saturating count of failed sends */
			if (sc->sc_sendad_errors < INT_MAX)
				sc->sc_sendad_errors++;
			/* demote once, at the moment the threshold is reached */
			if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS(sc))
				carp_group_demote_adj(&sc->sc_if, 1,
				    "> snderrors");
			sc->sc_sendad_success = 0;
		} else {
			if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS(sc)) {
				/*
				 * We were demoted: undo it only after enough
				 * successful sends in a row.
				 */
				if (++sc->sc_sendad_success >=
				    CARP_SENDAD_MIN_SUCCESS(sc)) {
					carp_group_demote_adj(&sc->sc_if, -1,
					    "< snderrors");
					sc->sc_sendad_errors = 0;
				}
			} else
				sc->sc_sendad_errors = 0;
		}
		if (vhe->vhe_leader) {
			/*
			 * Count down the delayed ARP and send it when the
			 * counter reaches zero; -1 marks it as not pending.
			 */
			if (sc->sc_delayed_arp > 0)
				sc->sc_delayed_arp--;
			if (sc->sc_delayed_arp == 0) {
				carp_send_arp(sc);
				sc->sc_delayed_arp = -1;
			}
		}
	}
#ifdef INET6
	if (sc->sc_naddrs6) {
		struct ip6_hdr *ip6;

		MGETHDR(m, M_DONTWAIT, MT_HEADER);
		if (m == NULL) {
			sc->sc_if.if_oerrors++;
			carpstat_inc(carps_onomem);
			/* XXX maybe less ? */
			goto retry_later;
		}
		len = sizeof(*ip6) + sizeof(ch);
		m->m_pkthdr.pf.prio = CARP_IFQ_PRIO;
		m->m_pkthdr.ph_rtableid = sc->sc_if.if_rdomain;
		m->m_pkthdr.len = len;
		m->m_len = len;
		m_align(m, len);
		m->m_flags |= M_MCAST;
		ip6 = mtod(m, struct ip6_hdr *);
		memset(ip6, 0, sizeof(*ip6));
		ip6->ip6_vfc |= IPV6_VERSION;
		ip6->ip6_hlim = CARP_DFLTTL;
		ip6->ip6_nxt = IPPROTO_CARP;

		/* set the source address */
		memset(&sa, 0, sizeof(sa));
		sa.sa_family = AF_INET6;
		/* Prefer addresses on the parent interface as source for AD. */
		ifa = ifaof_ifpforaddr(&sa, ifp);
		if (ifa == NULL)
			ifa = ifaof_ifpforaddr(&sa, &sc->sc_if);
		KASSERT(ifa != NULL);
		bcopy(ifatoia6(ifa)->ia_addr.sin6_addr.s6_addr,
		    &ip6->ip6_src, sizeof(struct in6_addr));
		/* set the multicast destination */

		/* ff02::12 with the parent's interface index in word 1 */
		ip6->ip6_dst.s6_addr16[0] = htons(0xff02);
		ip6->ip6_dst.s6_addr16[1] = htons(ifp->if_index);
		ip6->ip6_dst.s6_addr8[15] = 0x12;

		/* the CARP header directly follows the IPv6 header */
		ch_ptr = (struct carp_header *)(ip6 + 1);
		bcopy(&ch, ch_ptr, sizeof(ch));
		carp_prepare_ad(m, vhe, ch_ptr);

		/* same m_data adjustment as in the IPv4 case above */
		m->m_data += sizeof(*ip6);
		ch_ptr->carp_cksum = carp_cksum(m, len - sizeof(*ip6));
		m->m_data -= sizeof(*ip6);

		getmicrotime(&sc->sc_if.if_lastchange);
		carpstat_inc(carps_opackets6);

		error = ip6_output(m, NULL, NULL, 0, &sc->sc_im6o, NULL);
		/*
		 * Error accounting shares sc_sendad_errors/sc_sendad_success
		 * with the IPv4 branch.
		 */
		if (error) {
			if (error == ENOBUFS)
				carpstat_inc(carps_onomem);
			else
				CARP_LOG(LOG_WARNING, sc,
				    ("ip6_output failed: %d", error));
			sc->sc_if.if_oerrors++;
			if (sc->sc_sendad_errors < INT_MAX)
				sc->sc_sendad_errors++;
			if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS(sc))
				carp_group_demote_adj(&sc->sc_if, 1,
					    "> snd6errors");
			sc->sc_sendad_success = 0;
		} else {
			if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS(sc)) {
				if (++sc->sc_sendad_success >=
				    CARP_SENDAD_MIN_SUCCESS(sc)) {
					carp_group_demote_adj(&sc->sc_if, -1,
					    "< snd6errors");
					sc->sc_sendad_errors = 0;
				}
			} else
				sc->sc_sendad_errors = 0;
		}
	}
#endif /* INET6 */

retry_later:
	sc->cur_vhe = NULL;
	/* 255/255 is the bow-out advertisement: do not schedule another one */
	if (advbase != 255 || advskew != 255)
		timeout_add_tv(&vhe->ad_tmo, &tv);
	if_put(ifp);
}
1259 
1260 /*
1261  * Broadcast a gratuitous ARP request containing
1262  * the virtual router MAC address for each IP address
1263  * associated with the virtual router.
1264  */
1265 void
1266 carp_send_arp(struct carp_softc *sc)
1267 {
1268 	struct ifaddr *ifa;
1269 	in_addr_t in;
1270 
1271 	TAILQ_FOREACH(ifa, &sc->sc_if.if_addrlist, ifa_list) {
1272 
1273 		if (ifa->ifa_addr->sa_family != AF_INET)
1274 			continue;
1275 
1276 		in = ifatoia(ifa)->ia_addr.sin_addr.s_addr;
1277 		arprequest(&sc->sc_if, &in, &in, sc->sc_ac.ac_enaddr);
1278 	}
1279 }
1280 
1281 #ifdef INET6
1282 void
1283 carp_send_na(struct carp_softc *sc)
1284 {
1285 	struct ifaddr *ifa;
1286 	struct in6_addr *in6;
1287 	static struct in6_addr mcast = IN6ADDR_LINKLOCAL_ALLNODES_INIT;
1288 
1289 	TAILQ_FOREACH(ifa, &sc->sc_if.if_addrlist, ifa_list) {
1290 
1291 		if (ifa->ifa_addr->sa_family != AF_INET6)
1292 			continue;
1293 
1294 		in6 = &ifatoia6(ifa)->ia_addr.sin6_addr;
1295 		nd6_na_output(&sc->sc_if, &mcast, in6,
1296 		    ND_NA_FLAG_OVERRIDE |
1297 		    (ip6_forwarding ? ND_NA_FLAG_ROUTER : 0), 1, NULL);
1298 	}
1299 }
1300 #endif /* INET6 */
1301 
1302 void
1303 carp_update_lsmask(struct carp_softc *sc)
1304 {
1305 	struct carp_vhost_entry *vhe;
1306 	int count;
1307 
1308 	if (sc->sc_balancing == CARP_BAL_NONE)
1309 		return;
1310 
1311 	sc->sc_lsmask = 0;
1312 	count = 0;
1313 
1314 	KERNEL_ASSERT_LOCKED(); /* touching carp_vhosts */
1315 	SRPL_FOREACH_LOCKED(vhe, &sc->carp_vhosts, vhost_entries) {
1316 		if (vhe->state == MASTER && count < sizeof(sc->sc_lsmask) * 8)
1317 			sc->sc_lsmask |= 1 << count;
1318 		count++;
1319 	}
1320 	sc->sc_lscount = count;
1321 	CARP_LOG(LOG_DEBUG, sc, ("carp_update_lsmask: %x", sc->sc_lsmask));
1322 }
1323 
1324 int
1325 carp_iamatch(struct ifnet *ifp)
1326 {
1327 	struct carp_softc *sc = ifp->if_softc;
1328 	struct carp_vhost_entry *vhe;
1329 	struct srp_ref sr;
1330 	int match = 0;
1331 
1332 	vhe = SRPL_FIRST(&sr, &sc->carp_vhosts);
1333 	if (vhe->state == MASTER)
1334 		match = 1;
1335 	SRPL_LEAVE(&sr);
1336 
1337 	return (match);
1338 }
1339 
1340 int
1341 carp_ourether(struct ifnet *ifp, uint8_t *ena)
1342 {
1343 	struct srpl *cif = &ifp->if_carp;
1344 	struct carp_softc *sc;
1345 	struct srp_ref sr;
1346 	int match = 0;
1347 	uint64_t dst = ether_addr_to_e64((struct ether_addr *)ena);
1348 
1349 	KASSERT(ifp->if_type == IFT_ETHER);
1350 
1351 	SRPL_FOREACH(sc, &sr, cif, sc_list) {
1352 		if ((sc->sc_if.if_flags & (IFF_UP|IFF_RUNNING)) !=
1353 		    (IFF_UP|IFF_RUNNING))
1354 			continue;
1355 		if (carp_vhe_match(sc, dst)) {
1356 			match = 1;
1357 			break;
1358 		}
1359 	}
1360 	SRPL_LEAVE(&sr);
1361 
1362 	return (match);
1363 }
1364 
1365 int
1366 carp_vhe_match(struct carp_softc *sc, uint64_t dst)
1367 {
1368 	struct carp_vhost_entry *vhe;
1369 	struct srp_ref sr;
1370 	int active = 0;
1371 
1372 	vhe = SRPL_FIRST(&sr, &sc->carp_vhosts);
1373 	active = (vhe->state == MASTER || sc->sc_balancing >= CARP_BAL_IP);
1374 	SRPL_LEAVE(&sr);
1375 
1376 	return (active && (dst ==
1377 	    ether_addr_to_e64((struct ether_addr *)sc->sc_ac.ac_enaddr)));
1378 }
1379 
/*
 * Input hook for a frame m with destination address dst received on
 * ifp0, the parent of zero or more carp interfaces.
 *
 * If an up-and-running carp interface matches dst, the frame is passed
 * to that interface with if_vinput() and NULL is returned.  If nothing
 * matches, m is returned to the caller; for multicast destinations a
 * copy is additionally delivered to every carp interface that is up.
 * NULL is also returned when the frame had to be freed because a
 * balancing tag could not be allocated.
 */
struct mbuf *
carp_input(struct ifnet *ifp0, struct mbuf *m, uint64_t dst)
{
	struct srpl *cif;
	struct carp_softc *sc;
	struct srp_ref sr;

	cif = &ifp0->if_carp;

	/* look for the carp interface this frame is addressed to */
	SRPL_FOREACH(sc, &sr, cif, sc_list) {
		if ((sc->sc_if.if_flags & (IFF_UP|IFF_RUNNING)) !=
		    (IFF_UP|IFF_RUNNING))
			continue;

		if (carp_vhe_match(sc, dst)) {
			/*
			 * These packets look like layer 2 multicast but they
			 * are unicast at layer 3. With help of the tag the
			 * mbuf's M_MCAST flag can be removed by carp_lsdrop()
			 * after we have passed layer 2.
			 */
			if (sc->sc_balancing == CARP_BAL_IP) {
				struct m_tag *mtag;
				mtag = m_tag_get(PACKET_TAG_CARP_BAL_IP, 0,
				    M_NOWAIT);
				if (mtag == NULL) {
					/* drop the frame; still leave the SRP */
					m_freem(m);
					goto out;
				}
				m_tag_prepend(m, mtag);
			}
			/* sc stays set (and referenced) for if_vinput() below */
			break;
		}
	}

	/* the loop ran to the end: no carp interface owns dst */
	if (sc == NULL) {
		SRPL_LEAVE(&sr);

		if (!ETH64_IS_MULTICAST(dst))
			return (m);

		/*
		 * XXX Should really check the list of multicast addresses
		 * for each CARP interface _before_ copying.
		 */
		SRPL_FOREACH(sc, &sr, cif, sc_list) {
			struct mbuf *m0;

			if (!(sc->sc_if.if_flags & IFF_UP))
				continue;

			/* a failed copy only skips this interface */
			m0 = m_dup_pkt(m, ETHER_ALIGN, M_DONTWAIT);
			if (m0 == NULL)
				continue;

			if_vinput(&sc->sc_if, m0);
		}
		SRPL_LEAVE(&sr);

		/* the original is still handed back to the caller */
		return (m);
	}

	if_vinput(&sc->sc_if, m);
out:
	SRPL_LEAVE(&sr);

	return (NULL);
}
1448 
1449 int
1450 carp_lsdrop(struct ifnet *ifp, struct mbuf *m, sa_family_t af, u_int32_t *src,
1451     u_int32_t *dst, int drop)
1452 {
1453 	struct carp_softc *sc;
1454 	u_int32_t fold;
1455 	struct m_tag *mtag;
1456 
1457 	if (ifp->if_type != IFT_CARP)
1458 		return 0;
1459 	sc = ifp->if_softc;
1460 	if (sc->sc_balancing == CARP_BAL_NONE)
1461 		return 0;
1462 
1463 	/*
1464 	 * Remove M_MCAST flag from mbuf of balancing ip traffic, since the fact
1465 	 * that it is layer 2 multicast does not implicate that it is also layer
1466 	 * 3 multicast.
1467 	 */
1468 	if (m->m_flags & M_MCAST &&
1469 	    (mtag = m_tag_find(m, PACKET_TAG_CARP_BAL_IP, NULL))) {
1470 		m_tag_delete(m, mtag);
1471 		m->m_flags &= ~M_MCAST;
1472 	}
1473 
1474 	/*
1475 	 * Return without making a drop decision. This allows to clear the
1476 	 * M_MCAST flag and do nothing else.
1477 	 */
1478 	if (!drop)
1479 		return 0;
1480 
1481 	/*
1482 	 * Never drop carp advertisements.
1483 	 * XXX Bad idea to pass all broadcast / multicast traffic?
1484 	 */
1485 	if (m->m_flags & (M_BCAST|M_MCAST))
1486 		return 0;
1487 
1488 	fold = src[0] ^ dst[0];
1489 #ifdef INET6
1490 	if (af == AF_INET6) {
1491 		int i;
1492 		for (i = 1; i < 4; i++)
1493 			fold ^= src[i] ^ dst[i];
1494 	}
1495 #endif
1496 	if (sc->sc_lscount == 0) /* just to be safe */
1497 		return 1;
1498 
1499 	return ((1 << (ntohl(fold) % sc->sc_lscount)) & sc->sc_lsmask) == 0;
1500 }
1501 
/*
 * Callback-style wrapper around carp_master_down(): v is the vhost
 * entry concerned.  The net lock is taken around the call.
 * Presumably installed as the handler of the master-down timeouts
 * (the timeout setup is not visible here).
 */
void
carp_timer_down(void *v)
{
	struct carp_vhost_entry *vhe = v;

	NET_LOCK();
	carp_master_down(vhe);
	NET_UNLOCK();
}
1509 
1510 void
1511 carp_master_down(struct carp_vhost_entry *vhe)
1512 {
1513 	struct carp_softc *sc = vhe->parent_sc;
1514 
1515 	NET_ASSERT_LOCKED();
1516 
1517 	switch (vhe->state) {
1518 	case INIT:
1519 		printf("%s: master_down event in INIT state\n",
1520 		    sc->sc_if.if_xname);
1521 		break;
1522 	case MASTER:
1523 		break;
1524 	case BACKUP:
1525 		carp_set_state(vhe, MASTER);
1526 		carp_send_ad(vhe);
1527 		if (sc->sc_balancing == CARP_BAL_NONE && vhe->vhe_leader) {
1528 			carp_send_arp(sc);
1529 			/* Schedule a delayed ARP to deal w/ some L3 switches */
1530 			sc->sc_delayed_arp = 2;
1531 #ifdef INET6
1532 			carp_send_na(sc);
1533 #endif /* INET6 */
1534 		}
1535 		carp_setrun(vhe, 0);
1536 		carpstat_inc(carps_preempt);
1537 		break;
1538 	}
1539 }
1540 
1541 void
1542 carp_setrun_all(struct carp_softc *sc, sa_family_t af)
1543 {
1544 	struct carp_vhost_entry *vhe;
1545 
1546 	KERNEL_ASSERT_LOCKED(); /* touching carp_vhost */
1547 	SRPL_FOREACH_LOCKED(vhe, &sc->carp_vhosts, vhost_entries) {
1548 		carp_setrun(vhe, af);
1549 	}
1550 }
1551 
1552 /*
1553  * When in backup state, af indicates whether to reset the master down timer
1554  * for v4 or v6. If it's set to zero, reset the ones which are already pending.
1555  */
1556 void
1557 carp_setrun(struct carp_vhost_entry *vhe, sa_family_t af)
1558 {
1559 	struct ifnet *ifp;
1560 	struct timeval tv;
1561 	struct carp_softc *sc = vhe->parent_sc;
1562 
1563 	if ((ifp = if_get(sc->sc_carpdevidx)) == NULL) {
1564 		sc->sc_if.if_flags &= ~IFF_RUNNING;
1565 		carp_set_state_all(sc, INIT);
1566 		return;
1567 	}
1568 
1569 	if (memcmp(((struct arpcom *)ifp)->ac_enaddr,
1570 	    sc->sc_ac.ac_enaddr, ETHER_ADDR_LEN) == 0)
1571 		sc->sc_realmac = 1;
1572 	else
1573 		sc->sc_realmac = 0;
1574 
1575 	if_put(ifp);
1576 
1577 	if (sc->sc_if.if_flags & IFF_UP && vhe->vhid > 0 &&
1578 	    (sc->sc_naddrs || sc->sc_naddrs6) && !sc->sc_suppress) {
1579 		sc->sc_if.if_flags |= IFF_RUNNING;
1580 	} else {
1581 		sc->sc_if.if_flags &= ~IFF_RUNNING;
1582 		return;
1583 	}
1584 
1585 	switch (vhe->state) {
1586 	case INIT:
1587 		carp_set_state(vhe, BACKUP);
1588 		carp_setrun(vhe, 0);
1589 		break;
1590 	case BACKUP:
1591 		timeout_del(&vhe->ad_tmo);
1592 		tv.tv_sec = 3 * sc->sc_advbase;
1593 		if (sc->sc_advbase == 0 && vhe->advskew == 0)
1594 			tv.tv_usec = 3 * 1000000 / 256;
1595 		else if (sc->sc_advbase == 0)
1596 			tv.tv_usec = 3 * vhe->advskew * 1000000 / 256;
1597 		else
1598 			tv.tv_usec = vhe->advskew * 1000000 / 256;
1599 		if (vhe->vhe_leader)
1600 			sc->sc_delayed_arp = -1;
1601 		switch (af) {
1602 		case AF_INET:
1603 			timeout_add_tv(&vhe->md_tmo, &tv);
1604 			break;
1605 #ifdef INET6
1606 		case AF_INET6:
1607 			timeout_add_tv(&vhe->md6_tmo, &tv);
1608 			break;
1609 #endif /* INET6 */
1610 		default:
1611 			if (sc->sc_naddrs)
1612 				timeout_add_tv(&vhe->md_tmo, &tv);
1613 			if (sc->sc_naddrs6)
1614 				timeout_add_tv(&vhe->md6_tmo, &tv);
1615 			break;
1616 		}
1617 		break;
1618 	case MASTER:
1619 		tv.tv_sec = sc->sc_advbase;
1620 		if (sc->sc_advbase == 0 && vhe->advskew == 0)
1621 			tv.tv_usec = 1 * 1000000 / 256;
1622 		else
1623 			tv.tv_usec = vhe->advskew * 1000000 / 256;
1624 		timeout_add_tv(&vhe->ad_tmo, &tv);
1625 		break;
1626 	}
1627 }
1628 
1629 void
1630 carp_multicast_cleanup(struct carp_softc *sc)
1631 {
1632 	struct ip_moptions *imo = &sc->sc_imo;
1633 #ifdef INET6
1634 	struct ip6_moptions *im6o = &sc->sc_im6o;
1635 #endif
1636 	u_int16_t n = imo->imo_num_memberships;
1637 
1638 	/* Clean up our own multicast memberships */
1639 	while (n-- > 0) {
1640 		if (imo->imo_membership[n] != NULL) {
1641 			in_delmulti(imo->imo_membership[n]);
1642 			imo->imo_membership[n] = NULL;
1643 		}
1644 	}
1645 	imo->imo_num_memberships = 0;
1646 	imo->imo_ifidx = 0;
1647 
1648 #ifdef INET6
1649 	while (!LIST_EMPTY(&im6o->im6o_memberships)) {
1650 		struct in6_multi_mship *imm =
1651 		    LIST_FIRST(&im6o->im6o_memberships);
1652 
1653 		LIST_REMOVE(imm, i6mm_chain);
1654 		in6_leavegroup(imm);
1655 	}
1656 	im6o->im6o_ifidx = 0;
1657 #endif
1658 
1659 	/* And any other multicast memberships */
1660 	carp_ether_purgemulti(sc);
1661 }
1662 
1663 int
1664 carp_set_ifp(struct carp_softc *sc, struct ifnet *ifp0)
1665 {
1666 	struct srpl *cif;
1667 	struct carp_softc *vr, *last = NULL, *after = NULL;
1668 	int myself = 0, error = 0;
1669 
1670 	KASSERT(ifp0->if_index != sc->sc_carpdevidx);
1671 	KERNEL_ASSERT_LOCKED(); /* touching if_carp */
1672 
1673 	if ((ifp0->if_flags & IFF_MULTICAST) == 0)
1674 		return (EADDRNOTAVAIL);
1675 
1676 	if (ifp0->if_type != IFT_ETHER)
1677 		return (EINVAL);
1678 
1679 	cif = &ifp0->if_carp;
1680 	if (carp_check_dup_vhids(sc, cif, NULL))
1681 		return (EINVAL);
1682 
1683 	if ((error = ifpromisc(ifp0, 1)))
1684 		return (error);
1685 
1686 	/* detach from old interface */
1687 	if (sc->sc_carpdevidx != 0)
1688 		carpdetach(sc);
1689 
1690 	/* attach carp interface to physical interface */
1691 	if_detachhook_add(ifp0, &sc->sc_dtask);
1692 	if_linkstatehook_add(ifp0, &sc->sc_ltask);
1693 
1694 	sc->sc_carpdevidx = ifp0->if_index;
1695 	sc->sc_if.if_capabilities = ifp0->if_capabilities &
1696 	    (IFCAP_CSUM_MASK | IFCAP_TSOv4 | IFCAP_TSOv6);
1697 
1698 	SRPL_FOREACH_LOCKED(vr, cif, sc_list) {
1699 		struct carp_vhost_entry *vrhead, *schead;
1700 		last = vr;
1701 
1702 		if (vr == sc)
1703 			myself = 1;
1704 
1705 		vrhead = SRPL_FIRST_LOCKED(&vr->carp_vhosts);
1706 		schead = SRPL_FIRST_LOCKED(&sc->carp_vhosts);
1707 		if (vrhead->vhid < schead->vhid)
1708 			after = vr;
1709 	}
1710 
1711 	if (!myself) {
1712 		/* We're trying to keep things in order */
1713 		if (last == NULL) {
1714 			SRPL_INSERT_HEAD_LOCKED(&carp_sc_rc, cif,
1715 			    sc, sc_list);
1716 		} else if (after == NULL) {
1717 			SRPL_INSERT_AFTER_LOCKED(&carp_sc_rc, last,
1718 			    sc, sc_list);
1719 		} else {
1720 			SRPL_INSERT_AFTER_LOCKED(&carp_sc_rc, after,
1721 			    sc, sc_list);
1722 		}
1723 	}
1724 	if (sc->sc_naddrs || sc->sc_naddrs6)
1725 		sc->sc_if.if_flags |= IFF_UP;
1726 	carp_set_enaddr(sc);
1727 
1728 	carp_carpdev_state(sc);
1729 
1730 	return (0);
1731 }
1732 
1733 void
1734 carp_set_vhe_enaddr(struct carp_vhost_entry *vhe)
1735 {
1736 	struct carp_softc *sc = vhe->parent_sc;
1737 
1738 	if (vhe->vhid != 0 && sc->sc_carpdevidx != 0) {
1739 		if (vhe->vhe_leader && sc->sc_balancing == CARP_BAL_IP)
1740 			vhe->vhe_enaddr[0] = 1;
1741 		else
1742 			vhe->vhe_enaddr[0] = 0;
1743 		vhe->vhe_enaddr[1] = 0;
1744 		vhe->vhe_enaddr[2] = 0x5e;
1745 		vhe->vhe_enaddr[3] = 0;
1746 		vhe->vhe_enaddr[4] = 1;
1747 		vhe->vhe_enaddr[5] = vhe->vhid;
1748 	} else
1749 		memset(vhe->vhe_enaddr, 0, ETHER_ADDR_LEN);
1750 }
1751 
1752 void
1753 carp_set_enaddr(struct carp_softc *sc)
1754 {
1755 	struct carp_vhost_entry *vhe;
1756 
1757 	KERNEL_ASSERT_LOCKED(); /* touching carp_vhosts */
1758 	SRPL_FOREACH_LOCKED(vhe, &sc->carp_vhosts, vhost_entries)
1759 		carp_set_vhe_enaddr(vhe);
1760 
1761 	vhe = SRPL_FIRST_LOCKED(&sc->carp_vhosts);
1762 
1763 	/*
1764 	 * Use the carp lladdr if the running one isn't manually set.
1765 	 * Only compare static parts of the lladdr.
1766 	 */
1767 	if ((memcmp(sc->sc_ac.ac_enaddr + 1, vhe->vhe_enaddr + 1,
1768 	    ETHER_ADDR_LEN - 2) == 0) ||
1769 	    (!sc->sc_ac.ac_enaddr[0] && !sc->sc_ac.ac_enaddr[1] &&
1770 	    !sc->sc_ac.ac_enaddr[2] && !sc->sc_ac.ac_enaddr[3] &&
1771 	    !sc->sc_ac.ac_enaddr[4] && !sc->sc_ac.ac_enaddr[5]))
1772 		bcopy(vhe->vhe_enaddr, sc->sc_ac.ac_enaddr, ETHER_ADDR_LEN);
1773 
1774 	/* Make sure the enaddr has changed before further twiddling. */
1775 	if (memcmp(sc->sc_ac.ac_enaddr, sc->sc_curlladdr, ETHER_ADDR_LEN) != 0) {
1776 		bcopy(sc->sc_ac.ac_enaddr, LLADDR(sc->sc_if.if_sadl),
1777 		    ETHER_ADDR_LEN);
1778 		bcopy(sc->sc_ac.ac_enaddr, sc->sc_curlladdr, ETHER_ADDR_LEN);
1779 #ifdef INET6
1780 		/*
1781 		 * (re)attach a link-local address which matches
1782 		 * our new MAC address.
1783 		 */
1784 		if (sc->sc_naddrs6)
1785 			in6_ifattach_linklocal(&sc->sc_if, NULL);
1786 #endif
1787 		carp_set_state_all(sc, INIT);
1788 		carp_setrun_all(sc, 0);
1789 	}
1790 }
1791 
1792 void
1793 carp_addr_updated(void *v)
1794 {
1795 	struct carp_softc *sc = (struct carp_softc *) v;
1796 	struct ifaddr *ifa;
1797 	int new_naddrs = 0, new_naddrs6 = 0;
1798 
1799 	TAILQ_FOREACH(ifa, &sc->sc_if.if_addrlist, ifa_list) {
1800 		if (ifa->ifa_addr->sa_family == AF_INET)
1801 			new_naddrs++;
1802 #ifdef INET6
1803 		else if (ifa->ifa_addr->sa_family == AF_INET6)
1804 			new_naddrs6++;
1805 #endif /* INET6 */
1806 	}
1807 
1808 	/* We received address changes from if_addrhooks callback */
1809 	if (new_naddrs != sc->sc_naddrs || new_naddrs6 != sc->sc_naddrs6) {
1810 
1811 		sc->sc_naddrs = new_naddrs;
1812 		sc->sc_naddrs6 = new_naddrs6;
1813 
1814 		/* Re-establish multicast membership removed by in_control */
1815 		if (IN_MULTICAST(sc->sc_peer.s_addr)) {
1816 			if (!in_hasmulti(&sc->sc_peer, &sc->sc_if)) {
1817 				struct in_multi **imm =
1818 				    sc->sc_imo.imo_membership;
1819 				u_int16_t maxmem =
1820 				    sc->sc_imo.imo_max_memberships;
1821 
1822 				memset(&sc->sc_imo, 0, sizeof(sc->sc_imo));
1823 				sc->sc_imo.imo_membership = imm;
1824 				sc->sc_imo.imo_max_memberships = maxmem;
1825 
1826 				if (sc->sc_carpdevidx != 0 &&
1827 				    sc->sc_naddrs > 0)
1828 					carp_join_multicast(sc);
1829 			}
1830 		}
1831 
1832 		if (sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0) {
1833 			sc->sc_if.if_flags &= ~IFF_UP;
1834 			carp_set_state_all(sc, INIT);
1835 		} else
1836 			carp_hmac_prepare(sc);
1837 	}
1838 
1839 	carp_setrun_all(sc, 0);
1840 }
1841 
1842 int
1843 carp_set_addr(struct carp_softc *sc, struct sockaddr_in *sin)
1844 {
1845 	struct in_addr *in = &sin->sin_addr;
1846 	int error;
1847 
1848 	KASSERT(sc->sc_carpdevidx != 0);
1849 
1850 	/* XXX is this necessary? */
1851 	if (in->s_addr == INADDR_ANY) {
1852 		carp_setrun_all(sc, 0);
1853 		return (0);
1854 	}
1855 
1856 	if (sc->sc_naddrs == 0 && (error = carp_join_multicast(sc)) != 0)
1857 		return (error);
1858 
1859 	carp_set_state_all(sc, INIT);
1860 
1861 	return (0);
1862 }
1863 
1864 int
1865 carp_join_multicast(struct carp_softc *sc)
1866 {
1867 	struct ip_moptions *imo = &sc->sc_imo;
1868 	struct in_multi *imm;
1869 	struct in_addr addr;
1870 
1871 	if (!IN_MULTICAST(sc->sc_peer.s_addr))
1872 		return (0);
1873 
1874 	addr.s_addr = sc->sc_peer.s_addr;
1875 	if ((imm = in_addmulti(&addr, &sc->sc_if)) == NULL)
1876 		return (ENOBUFS);
1877 
1878 	imo->imo_membership[0] = imm;
1879 	imo->imo_num_memberships = 1;
1880 	imo->imo_ifidx = sc->sc_if.if_index;
1881 	imo->imo_ttl = CARP_DFLTTL;
1882 	imo->imo_loop = 0;
1883 	return (0);
1884 }
1885 
1886 
1887 #ifdef INET6
1888 int
1889 carp_set_addr6(struct carp_softc *sc, struct sockaddr_in6 *sin6)
1890 {
1891 	int error;
1892 
1893 	KASSERT(sc->sc_carpdevidx != 0);
1894 
1895 	if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
1896 		carp_setrun_all(sc, 0);
1897 		return (0);
1898 	}
1899 
1900 	if (sc->sc_naddrs6 == 0 && (error = carp_join_multicast6(sc)) != 0)
1901 		return (error);
1902 
1903 	carp_set_state_all(sc, INIT);
1904 
1905 	return (0);
1906 }
1907 
1908 int
1909 carp_join_multicast6(struct carp_softc *sc)
1910 {
1911 	struct in6_multi_mship *imm, *imm2;
1912 	struct ip6_moptions *im6o = &sc->sc_im6o;
1913 	struct sockaddr_in6 addr6;
1914 	int error;
1915 
1916 	/* Join IPv6 CARP multicast group */
1917 	memset(&addr6, 0, sizeof(addr6));
1918 	addr6.sin6_family = AF_INET6;
1919 	addr6.sin6_len = sizeof(addr6);
1920 	addr6.sin6_addr.s6_addr16[0] = htons(0xff02);
1921 	addr6.sin6_addr.s6_addr16[1] = htons(sc->sc_if.if_index);
1922 	addr6.sin6_addr.s6_addr8[15] = 0x12;
1923 	if ((imm = in6_joingroup(&sc->sc_if,
1924 	    &addr6.sin6_addr, &error)) == NULL) {
1925 		return (error);
1926 	}
1927 	/* join solicited multicast address */
1928 	memset(&addr6.sin6_addr, 0, sizeof(addr6.sin6_addr));
1929 	addr6.sin6_addr.s6_addr16[0] = htons(0xff02);
1930 	addr6.sin6_addr.s6_addr16[1] = htons(sc->sc_if.if_index);
1931 	addr6.sin6_addr.s6_addr32[1] = 0;
1932 	addr6.sin6_addr.s6_addr32[2] = htonl(1);
1933 	addr6.sin6_addr.s6_addr32[3] = 0;
1934 	addr6.sin6_addr.s6_addr8[12] = 0xff;
1935 	if ((imm2 = in6_joingroup(&sc->sc_if,
1936 	    &addr6.sin6_addr, &error)) == NULL) {
1937 		in6_leavegroup(imm);
1938 		return (error);
1939 	}
1940 
1941 	/* apply v6 multicast membership */
1942 	im6o->im6o_ifidx = sc->sc_if.if_index;
1943 	if (imm)
1944 		LIST_INSERT_HEAD(&im6o->im6o_memberships, imm,
1945 		    i6mm_chain);
1946 	if (imm2)
1947 		LIST_INSERT_HEAD(&im6o->im6o_memberships, imm2,
1948 		    i6mm_chain);
1949 
1950 	return (0);
1951 }
1952 
1953 #endif /* INET6 */
1954 
/*
 * ioctl handler of the carp interface ifp.
 *
 * Handled commands: SIOCSIFADDR (address added), SIOCSIFFLAGS (up/down
 * transitions), SIOCSVH / SIOCGVH (set / get the carp configuration via
 * a struct carpreq behind ifr_data), SIOCADDMULTI / SIOCDELMULTI,
 * SIOCAIFGROUP / SIOCDIFGROUP and SIOCSIFGATTR.  Anything else yields
 * ENOTTY.
 *
 * Returns 0 or an errno value.  On the way out through the bottom of
 * the function the link-layer address is re-synchronised if it no
 * longer matches sc_curlladdr; the early "return" statements inside
 * the switch skip that step.
 */
int
carp_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr)
{
	struct proc *p = curproc;	/* XXX */
	struct carp_softc *sc = ifp->if_softc;
	struct carp_vhost_entry *vhe;
	struct carpreq carpr;
	/* addr is interpreted as ifaddr or ifreq depending on cmd */
	struct ifaddr *ifa = (struct ifaddr *)addr;
	struct ifreq *ifr = (struct ifreq *)addr;
	struct ifnet *ifp0 = NULL;
	int i, error = 0;

	switch (cmd) {
	case SIOCSIFADDR:
		/* addresses can only be set once a parent is attached */
		if (sc->sc_carpdevidx == 0)
			return (EINVAL);

		switch (ifa->ifa_addr->sa_family) {
		case AF_INET:
			sc->sc_if.if_flags |= IFF_UP;
			error = carp_set_addr(sc, satosin(ifa->ifa_addr));
			break;
#ifdef INET6
		case AF_INET6:
			sc->sc_if.if_flags |= IFF_UP;
			error = carp_set_addr6(sc, satosin6(ifa->ifa_addr));
			break;
#endif /* INET6 */
		default:
			error = EAFNOSUPPORT;
			break;
		}
		break;

	case SIOCSIFFLAGS:
		KERNEL_ASSERT_LOCKED(); /* touching carp_vhosts */
		/* the first vhost entry decides which transition applies */
		vhe = SRPL_FIRST_LOCKED(&sc->carp_vhosts);
		if (vhe->state != INIT && !(ifr->ifr_flags & IFF_UP)) {
			/* going down: send a last "bow out" advertisement */
			carp_del_all_timeouts(sc);

			/* we need the interface up to bow out */
			sc->sc_if.if_flags |= IFF_UP;
			sc->sc_bow_out = 1;
			carp_vhe_send_ad_all(sc);
			sc->sc_bow_out = 0;

			sc->sc_if.if_flags &= ~IFF_UP;
			carp_set_state_all(sc, INIT);
			carp_setrun_all(sc, 0);
		} else if (vhe->state == INIT && (ifr->ifr_flags & IFF_UP)) {
			/* coming up */
			sc->sc_if.if_flags |= IFF_UP;
			carp_setrun_all(sc, 0);
		}
		break;

	case SIOCSVH:
		KERNEL_ASSERT_LOCKED(); /* touching carp_vhosts */
		vhe = SRPL_FIRST_LOCKED(&sc->carp_vhosts);
		/* privileged operation */
		if ((error = suser(p)) != 0)
			break;
		if ((error = copyin(ifr->ifr_data, &carpr, sizeof carpr)))
			break;
		/*
		 * NOTE(review): this value does not survive - error is
		 * reassigned by carp_vhids_ioctl() below, so the
		 * "error > 0" test at the end of this case looks
		 * unreachable.  Confirm the intent.
		 */
		error = 1;
		/* a named parent device must exist */
		if (carpr.carpr_carpdev[0] != '\0' &&
		    (ifp0 = if_unit(carpr.carpr_carpdev)) == NULL)
			return (EINVAL);
		/* no peer given: fall back to the CARP multicast group */
		if (carpr.carpr_peer.s_addr == 0)
			sc->sc_peer.s_addr = INADDR_CARP_GROUP;
		else
			sc->sc_peer.s_addr = carpr.carpr_peer.s_addr;
		/* move to the requested parent unless already attached to it */
		if (ifp0 != NULL && ifp0->if_index != sc->sc_carpdevidx) {
			if ((error = carp_set_ifp(sc, ifp0))) {
				if_put(ifp0);
				return (error);
			}
		}
		if_put(ifp0);
		/* explicit state change requested for a running interface */
		if (vhe->state != INIT && carpr.carpr_state != vhe->state) {
			switch (carpr.carpr_state) {
			case BACKUP:
				timeout_del(&vhe->ad_tmo);
				carp_set_state_all(sc, BACKUP);
				carp_setrun_all(sc, 0);
				break;
			case MASTER:
				KERNEL_ASSERT_LOCKED();
				/* touching carp_vhosts */
				SRPL_FOREACH_LOCKED(vhe, &sc->carp_vhosts,
				    vhost_entries)
					carp_master_down(vhe);
				break;
			default:
				break;
			}
		}
		/* may rebuild the whole vhost list */
		if ((error = carp_vhids_ioctl(sc, &carpr)))
			return (error);
		/* a negative advbase leaves the current value untouched */
		if (carpr.carpr_advbase >= 0) {
			if (carpr.carpr_advbase > 255) {
				error = EINVAL;
				break;
			}
			sc->sc_advbase = carpr.carpr_advbase;
			error--;
		}
		if (memcmp(sc->sc_advskews, carpr.carpr_advskews,
		    sizeof(sc->sc_advskews))) {
			/* hand the skews to the vhost entries in list order */
			i = 0;
			KERNEL_ASSERT_LOCKED(); /* touching carp_vhosts */
			SRPL_FOREACH_LOCKED(vhe, &sc->carp_vhosts,
			    vhost_entries)
				vhe->advskew = carpr.carpr_advskews[i++];
			bcopy(carpr.carpr_advskews, sc->sc_advskews,
			    sizeof(sc->sc_advskews));
		}
		if (sc->sc_balancing != carpr.carpr_balancing) {
			if (carpr.carpr_balancing > CARP_BAL_MAXID) {
				error = EINVAL;
				break;
			}
			sc->sc_balancing = carpr.carpr_balancing;
			/* the balancing mode influences lladdr and mask */
			carp_set_enaddr(sc);
			carp_update_lsmask(sc);
		}
		bcopy(carpr.carpr_key, sc->sc_key, sizeof(sc->sc_key));
		if (error > 0)
			error = EINVAL;
		else {
			error = 0;
			carp_hmac_prepare(sc);
			carp_setrun_all(sc, 0);
		}
		break;

	case SIOCGVH:
		memset(&carpr, 0, sizeof(carpr));
		if ((ifp0 = if_get(sc->sc_carpdevidx)) != NULL)
			strlcpy(carpr.carpr_carpdev, ifp0->if_xname, IFNAMSIZ);
		if_put(ifp0);
		/* report vhid, advskew and state per vhost entry */
		i = 0;
		KERNEL_ASSERT_LOCKED(); /* touching carp_vhosts */
		SRPL_FOREACH_LOCKED(vhe, &sc->carp_vhosts, vhost_entries) {
			carpr.carpr_vhids[i] = vhe->vhid;
			carpr.carpr_advskews[i] = vhe->advskew;
			carpr.carpr_states[i] = vhe->state;
			i++;
		}
		carpr.carpr_advbase = sc->sc_advbase;
		carpr.carpr_balancing = sc->sc_balancing;
		/* the key is only copied out when suser() succeeds */
		if (suser(p) == 0)
			bcopy(sc->sc_key, carpr.carpr_key,
			    sizeof(carpr.carpr_key));
		carpr.carpr_peer.s_addr = sc->sc_peer.s_addr;
		error = copyout(&carpr, ifr->ifr_data, sizeof(carpr));
		break;

	case SIOCADDMULTI:
		error = carp_ether_addmulti(sc, ifr);
		break;

	case SIOCDELMULTI:
		error = carp_ether_delmulti(sc, ifr);
		break;
	case SIOCAIFGROUP:
	case SIOCDIFGROUP:
		/* only relevant while this interface contributes a demotion */
		if (sc->sc_demote_cnt)
			carp_ifgroup_ioctl(ifp, cmd, addr);
		break;
	case SIOCSIFGATTR:
		carp_ifgattr_ioctl(ifp, cmd, addr);
		break;
	default:
		error = ENOTTY;
	}

	/* pick up a link-layer address that changed behind our back */
	if (memcmp(sc->sc_ac.ac_enaddr, sc->sc_curlladdr, ETHER_ADDR_LEN) != 0)
		carp_set_enaddr(sc);
	return (error);
}
2134 
2135 int
2136 carp_check_dup_vhids(struct carp_softc *sc, struct srpl *cif,
2137     struct carpreq *carpr)
2138 {
2139 	struct carp_softc *vr;
2140 	struct carp_vhost_entry *vhe, *vhe0;
2141 	int i;
2142 
2143 	KERNEL_ASSERT_LOCKED(); /* touching if_carp + carp_vhosts */
2144 
2145 	SRPL_FOREACH_LOCKED(vr, cif, sc_list) {
2146 		if (vr == sc)
2147 			continue;
2148 		SRPL_FOREACH_LOCKED(vhe, &vr->carp_vhosts, vhost_entries) {
2149 			if (carpr) {
2150 				for (i = 0; carpr->carpr_vhids[i]; i++) {
2151 					if (vhe->vhid == carpr->carpr_vhids[i])
2152 						return (EINVAL);
2153 				}
2154 			}
2155 			SRPL_FOREACH_LOCKED(vhe0, &sc->carp_vhosts,
2156 			    vhost_entries) {
2157 				if (vhe->vhid == vhe0->vhid)
2158 					return (EINVAL);
2159 			}
2160 		}
2161 	}
2162 	return (0);
2163 }
2164 
2165 int
2166 carp_vhids_ioctl(struct carp_softc *sc, struct carpreq *carpr)
2167 {
2168 	int i, j;
2169 	u_int8_t taken_vhids[256];
2170 
2171 	if (carpr->carpr_vhids[0] == 0 ||
2172 	    !memcmp(sc->sc_vhids, carpr->carpr_vhids, sizeof(sc->sc_vhids)))
2173 		return (0);
2174 
2175 	memset(taken_vhids, 0, sizeof(taken_vhids));
2176 	for (i = 0; carpr->carpr_vhids[i]; i++) {
2177 		struct ifnet *ifp;
2178 
2179 		if (taken_vhids[carpr->carpr_vhids[i]])
2180 			return (EINVAL);
2181 		taken_vhids[carpr->carpr_vhids[i]] = 1;
2182 
2183 		if ((ifp = if_get(sc->sc_carpdevidx)) != NULL) {
2184 			struct srpl *cif;
2185 			cif = &ifp->if_carp;
2186 			if (carp_check_dup_vhids(sc, cif, carpr)) {
2187 				if_put(ifp);
2188 				return (EINVAL);
2189 			}
2190 		}
2191 		if_put(ifp);
2192 		if (carpr->carpr_advskews[i] >= 255)
2193 			return (EINVAL);
2194 	}
2195 	/* set sane balancing defaults */
2196 	if (i <= 1)
2197 		carpr->carpr_balancing = CARP_BAL_NONE;
2198 	else if (carpr->carpr_balancing == CARP_BAL_NONE &&
2199 	    sc->sc_balancing == CARP_BAL_NONE)
2200 		carpr->carpr_balancing = CARP_BAL_IP;
2201 
2202 	/* destroy all */
2203 	carp_del_all_timeouts(sc);
2204 	carp_destroy_vhosts(sc);
2205 	memset(sc->sc_vhids, 0, sizeof(sc->sc_vhids));
2206 
2207 	/* sort vhosts list by vhid */
2208 	for (j = 1; j <= 255; j++) {
2209 		for (i = 0; carpr->carpr_vhids[i]; i++) {
2210 			if (carpr->carpr_vhids[i] != j)
2211 				continue;
2212 			if (carp_new_vhost(sc, carpr->carpr_vhids[i],
2213 			    carpr->carpr_advskews[i]))
2214 				return (ENOMEM);
2215 			sc->sc_vhids[i] = carpr->carpr_vhids[i];
2216 			sc->sc_advskews[i] = carpr->carpr_advskews[i];
2217 		}
2218 	}
2219 	carp_set_enaddr(sc);
2220 	carp_set_state_all(sc, INIT);
2221 	return (0);
2222 }
2223 
2224 void
2225 carp_ifgroup_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr)
2226 {
2227 	struct ifgroupreq *ifgr = (struct ifgroupreq *)addr;
2228 	struct ifg_list	*ifgl;
2229 	int *dm, adj;
2230 
2231 	if (!strcmp(ifgr->ifgr_group, IFG_ALL))
2232 		return;
2233 	adj = ((struct carp_softc *)ifp->if_softc)->sc_demote_cnt;
2234 	if (cmd == SIOCDIFGROUP)
2235 		adj = adj * -1;
2236 
2237 	TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
2238 		if (!strcmp(ifgl->ifgl_group->ifg_group, ifgr->ifgr_group)) {
2239 			dm = &ifgl->ifgl_group->ifg_carp_demoted;
2240 			if (*dm + adj >= 0)
2241 				*dm += adj;
2242 			else
2243 				*dm = 0;
2244 		}
2245 }
2246 
2247 void
2248 carp_ifgattr_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr)
2249 {
2250 	struct ifgroupreq *ifgr = (struct ifgroupreq *)addr;
2251 	struct carp_softc *sc = ifp->if_softc;
2252 
2253 	if (ifgr->ifgr_attrib.ifg_carp_demoted > 0 && (sc->sc_if.if_flags &
2254 	    (IFF_UP|IFF_RUNNING)) == (IFF_UP|IFF_RUNNING))
2255 		carp_vhe_send_ad_all(sc);
2256 }
2257 
2258 void
2259 carp_start(struct ifnet *ifp)
2260 {
2261 	struct carp_softc *sc = ifp->if_softc;
2262 	struct ifnet *ifp0;
2263 	struct mbuf *m;
2264 
2265 	if ((ifp0 = if_get(sc->sc_carpdevidx)) == NULL) {
2266 		ifq_purge(&ifp->if_snd);
2267 		return;
2268 	}
2269 
2270 	while ((m = ifq_dequeue(&ifp->if_snd)) != NULL)
2271 		carp_transmit(sc, ifp0, m);
2272 	if_put(ifp0);
2273 }
2274 
2275 void
2276 carp_transmit(struct carp_softc *sc, struct ifnet *ifp0, struct mbuf *m)
2277 {
2278 	struct ifnet *ifp = &sc->sc_if;
2279 
2280 #if NBPFILTER > 0
2281 	{
2282 		caddr_t if_bpf = ifp->if_bpf;
2283 		if (if_bpf)
2284 			bpf_mtap_ether(if_bpf, m, BPF_DIRECTION_OUT);
2285 	}
2286 #endif /* NBPFILTER > 0 */
2287 
2288 	if (!ISSET(ifp0->if_flags, IFF_RUNNING)) {
2289 		counters_inc(ifp->if_counters, ifc_oerrors);
2290 		m_freem(m);
2291 		return;
2292 	}
2293 
2294 	/*
2295 	 * Do not leak the multicast address when sending
2296 	 * advertisements in 'ip' and 'ip-stealth' balancing
2297 	 * modes.
2298 	 */
2299 	if (sc->sc_balancing == CARP_BAL_IP ||
2300 	    sc->sc_balancing == CARP_BAL_IPSTEALTH) {
2301 		struct ether_header *eh = mtod(m, struct ether_header *);
2302 		memcpy(eh->ether_shost, sc->sc_ac.ac_enaddr,
2303 		    sizeof(eh->ether_shost));
2304 	}
2305 
2306 	if (if_enqueue(ifp0, m))
2307 		counters_inc(ifp->if_counters, ifc_oerrors);
2308 }
2309 
2310 int
2311 carp_enqueue(struct ifnet *ifp, struct mbuf *m)
2312 {
2313 	struct carp_softc *sc = ifp->if_softc;
2314 	struct ifnet *ifp0;
2315 
2316 	/* no ifq_is_priq, cos hfsc on carp doesn't make sense */
2317 
2318 	/*
2319 	 * If the parent of this carp(4) got destroyed while
2320 	 * `m' was being processed, silently drop it.
2321 	 */
2322 	if ((ifp0 = if_get(sc->sc_carpdevidx)) == NULL) {
2323 		m_freem(m);
2324 		return (0);
2325 	}
2326 
2327 	counters_pkt(ifp->if_counters,
2328 	    ifc_opackets, ifc_obytes, m->m_pkthdr.len);
2329 	carp_transmit(sc, ifp0, m);
2330 	if_put(ifp0);
2331 
2332 	return (0);
2333 }
2334 
2335 int
2336 carp_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa,
2337     struct rtentry *rt)
2338 {
2339 	struct carp_softc *sc = ((struct carp_softc *)ifp->if_softc);
2340 	struct carp_vhost_entry *vhe;
2341 	struct srp_ref sr;
2342 	int ismaster;
2343 
2344 	if (sc->cur_vhe == NULL) {
2345 		vhe = SRPL_FIRST(&sr, &sc->carp_vhosts);
2346 		ismaster = (vhe->state == MASTER);
2347 		SRPL_LEAVE(&sr);
2348 	} else {
2349 		ismaster = (sc->cur_vhe->state == MASTER);
2350 	}
2351 
2352 	if ((sc->sc_balancing == CARP_BAL_NONE && !ismaster)) {
2353 		m_freem(m);
2354 		return (ENETUNREACH);
2355 	}
2356 
2357 	return (ether_output(ifp, m, sa, rt));
2358 }
2359 
2360 void
2361 carp_set_state_all(struct carp_softc *sc, int state)
2362 {
2363 	struct carp_vhost_entry *vhe;
2364 
2365 	KERNEL_ASSERT_LOCKED(); /* touching carp_vhosts */
2366 
2367 	SRPL_FOREACH_LOCKED(vhe, &sc->carp_vhosts, vhost_entries) {
2368 		if (vhe->state == state)
2369 			continue;
2370 
2371 		carp_set_state(vhe, state);
2372 	}
2373 }
2374 
2375 void
2376 carp_set_state(struct carp_vhost_entry *vhe, int state)
2377 {
2378 	struct carp_softc *sc = vhe->parent_sc;
2379 	static const char *carp_states[] = { CARP_STATES };
2380 	int loglevel;
2381 	struct carp_vhost_entry *vhe0;
2382 
2383 	KASSERT(vhe->state != state);
2384 
2385 	if (vhe->state == INIT || state == INIT)
2386 		loglevel = LOG_WARNING;
2387 	else
2388 		loglevel = LOG_CRIT;
2389 
2390 	if (sc->sc_vhe_count > 1)
2391 		CARP_LOG(loglevel, sc,
2392 		    ("state transition (vhid %d): %s -> %s", vhe->vhid,
2393 		    carp_states[vhe->state], carp_states[state]));
2394 	else
2395 		CARP_LOG(loglevel, sc,
2396 		    ("state transition: %s -> %s",
2397 		    carp_states[vhe->state], carp_states[state]));
2398 
2399 	vhe->state = state;
2400 	carp_update_lsmask(sc);
2401 
2402 	KERNEL_ASSERT_LOCKED(); /* touching carp_vhosts */
2403 
2404 	sc->sc_if.if_link_state = LINK_STATE_INVALID;
2405 	SRPL_FOREACH_LOCKED(vhe0, &sc->carp_vhosts, vhost_entries) {
2406 		/*
2407 		 * Link must be up if at least one vhe is in state MASTER to
2408 		 * bring or keep route up.
2409 		 */
2410 		if (vhe0->state == MASTER) {
2411 			sc->sc_if.if_link_state = LINK_STATE_UP;
2412 			break;
2413 		} else if (vhe0->state == BACKUP) {
2414 			sc->sc_if.if_link_state = LINK_STATE_DOWN;
2415 		}
2416 	}
2417 	if_link_state_change(&sc->sc_if);
2418 }
2419 
2420 void
2421 carp_group_demote_adj(struct ifnet *ifp, int adj, char *reason)
2422 {
2423 	struct ifg_list	*ifgl;
2424 	int *dm, need_ad;
2425 	struct carp_softc *nil = NULL;
2426 
2427 	if (ifp->if_type == IFT_CARP) {
2428 		dm = &((struct carp_softc *)ifp->if_softc)->sc_demote_cnt;
2429 		if (*dm + adj >= 0)
2430 			*dm += adj;
2431 		else
2432 			*dm = 0;
2433 	}
2434 
2435 	need_ad = 0;
2436 	TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) {
2437 		if (!strcmp(ifgl->ifgl_group->ifg_group, IFG_ALL))
2438 			continue;
2439 		dm = &ifgl->ifgl_group->ifg_carp_demoted;
2440 
2441 		if (*dm + adj >= 0)
2442 			*dm += adj;
2443 		else
2444 			*dm = 0;
2445 
2446 		if (adj > 0 && *dm == 1)
2447 			need_ad = 1;
2448 		CARP_LOG(LOG_ERR, nil,
2449 		    ("%s demoted group %s by %d to %d (%s)",
2450 		    ifp->if_xname, ifgl->ifgl_group->ifg_group,
2451 		    adj, *dm, reason));
2452 	}
2453 	if (need_ad)
2454 		carp_send_ad_all();
2455 }
2456 
2457 int
2458 carp_group_demote_count(struct carp_softc *sc)
2459 {
2460 	struct ifg_list	*ifgl;
2461 	int count = 0;
2462 
2463 	TAILQ_FOREACH(ifgl, &sc->sc_if.if_groups, ifgl_next)
2464 		count += ifgl->ifgl_group->ifg_carp_demoted;
2465 
2466 	if (count == 0 && sc->sc_demote_cnt)
2467 		count = sc->sc_demote_cnt;
2468 
2469 	return (count > 255 ? 255 : count);
2470 }
2471 
2472 void
2473 carp_carpdev_state(void *v)
2474 {
2475 	struct carp_softc *sc = v;
2476 	struct ifnet *ifp0;
2477 	int suppressed = sc->sc_suppress;
2478 
2479 	if ((ifp0 = if_get(sc->sc_carpdevidx)) == NULL)
2480 		return;
2481 
2482 	if (ifp0->if_link_state == LINK_STATE_DOWN ||
2483 	    !(ifp0->if_flags & IFF_UP)) {
2484 		sc->sc_if.if_flags &= ~IFF_RUNNING;
2485 		carp_del_all_timeouts(sc);
2486 		carp_set_state_all(sc, INIT);
2487 		sc->sc_suppress = 1;
2488 		carp_setrun_all(sc, 0);
2489 		if (!suppressed)
2490 			carp_group_demote_adj(&sc->sc_if, 1, "carpdev");
2491 	} else if (suppressed) {
2492 		carp_set_state_all(sc, INIT);
2493 		sc->sc_suppress = 0;
2494 		carp_setrun_all(sc, 0);
2495 		carp_group_demote_adj(&sc->sc_if, -1, "carpdev");
2496 	}
2497 
2498 	if_put(ifp0);
2499 }
2500 
2501 int
2502 carp_ether_addmulti(struct carp_softc *sc, struct ifreq *ifr)
2503 {
2504 	struct ifnet *ifp0;
2505 	struct carp_mc_entry *mc;
2506 	u_int8_t addrlo[ETHER_ADDR_LEN], addrhi[ETHER_ADDR_LEN];
2507 	int error;
2508 
2509 	ifp0 = if_get(sc->sc_carpdevidx);
2510 	if (ifp0 == NULL)
2511 		return (EINVAL);
2512 
2513 	error = ether_addmulti(ifr, (struct arpcom *)&sc->sc_ac);
2514 	if (error != ENETRESET) {
2515 		if_put(ifp0);
2516 		return (error);
2517 	}
2518 
2519 	/*
2520 	 * This is new multicast address.  We have to tell parent
2521 	 * about it.  Also, remember this multicast address so that
2522 	 * we can delete them on unconfigure.
2523 	 */
2524 	mc = malloc(sizeof(*mc), M_DEVBUF, M_NOWAIT);
2525 	if (mc == NULL) {
2526 		error = ENOMEM;
2527 		goto alloc_failed;
2528 	}
2529 
2530 	/*
2531 	 * As ether_addmulti() returns ENETRESET, following two
2532 	 * statement shouldn't fail.
2533 	 */
2534 	(void)ether_multiaddr(&ifr->ifr_addr, addrlo, addrhi);
2535 	ETHER_LOOKUP_MULTI(addrlo, addrhi, &sc->sc_ac, mc->mc_enm);
2536 	memcpy(&mc->mc_addr, &ifr->ifr_addr, ifr->ifr_addr.sa_len);
2537 	LIST_INSERT_HEAD(&sc->carp_mc_listhead, mc, mc_entries);
2538 
2539 	error = (*ifp0->if_ioctl)(ifp0, SIOCADDMULTI, (caddr_t)ifr);
2540 	if (error != 0)
2541 		goto ioctl_failed;
2542 
2543 	if_put(ifp0);
2544 
2545 	return (error);
2546 
2547  ioctl_failed:
2548 	LIST_REMOVE(mc, mc_entries);
2549 	free(mc, M_DEVBUF, sizeof(*mc));
2550  alloc_failed:
2551 	(void)ether_delmulti(ifr, (struct arpcom *)&sc->sc_ac);
2552 	if_put(ifp0);
2553 
2554 	return (error);
2555 }
2556 
2557 int
2558 carp_ether_delmulti(struct carp_softc *sc, struct ifreq *ifr)
2559 {
2560 	struct ifnet *ifp0;
2561 	struct ether_multi *enm;
2562 	struct carp_mc_entry *mc;
2563 	u_int8_t addrlo[ETHER_ADDR_LEN], addrhi[ETHER_ADDR_LEN];
2564 	int error;
2565 
2566 	ifp0 = if_get(sc->sc_carpdevidx);
2567 	if (ifp0 == NULL)
2568 		return (EINVAL);
2569 
2570 	/*
2571 	 * Find a key to lookup carp_mc_entry.  We have to do this
2572 	 * before calling ether_delmulti for obvious reason.
2573 	 */
2574 	if ((error = ether_multiaddr(&ifr->ifr_addr, addrlo, addrhi)) != 0)
2575 		goto rele;
2576 	ETHER_LOOKUP_MULTI(addrlo, addrhi, &sc->sc_ac, enm);
2577 	if (enm == NULL) {
2578 		error = EINVAL;
2579 		goto rele;
2580 	}
2581 
2582 	LIST_FOREACH(mc, &sc->carp_mc_listhead, mc_entries)
2583 		if (mc->mc_enm == enm)
2584 			break;
2585 
2586 	/* We won't delete entries we didn't add */
2587 	if (mc == NULL) {
2588 		error = EINVAL;
2589 		goto rele;
2590 	}
2591 
2592 	error = ether_delmulti(ifr, (struct arpcom *)&sc->sc_ac);
2593 	if (error != ENETRESET)
2594 		goto rele;
2595 
2596 	/* We no longer use this multicast address.  Tell parent so. */
2597 	error = (*ifp0->if_ioctl)(ifp0, SIOCDELMULTI, (caddr_t)ifr);
2598 	if (error == 0) {
2599 		/* And forget about this address. */
2600 		LIST_REMOVE(mc, mc_entries);
2601 		free(mc, M_DEVBUF, sizeof(*mc));
2602 	} else
2603 		(void)ether_addmulti(ifr, (struct arpcom *)&sc->sc_ac);
2604 rele:
2605 	if_put(ifp0);
2606 	return (error);
2607 }
2608 
2609 /*
2610  * Delete any multicast address we have asked to add from parent
2611  * interface.  Called when the carp is being unconfigured.
2612  */
2613 void
2614 carp_ether_purgemulti(struct carp_softc *sc)
2615 {
2616 	struct ifnet *ifp0;		/* Parent. */
2617 	struct carp_mc_entry *mc;
2618 	union {
2619 		struct ifreq ifreq;
2620 		struct {
2621 			char ifr_name[IFNAMSIZ];
2622 			struct sockaddr_storage ifr_ss;
2623 		} ifreq_storage;
2624 	} u;
2625 	struct ifreq *ifr = &u.ifreq;
2626 
2627 	if ((ifp0 = if_get(sc->sc_carpdevidx)) == NULL)
2628 		return;
2629 
2630 	memcpy(ifr->ifr_name, ifp0->if_xname, IFNAMSIZ);
2631 	while ((mc = LIST_FIRST(&sc->carp_mc_listhead)) != NULL) {
2632 		memcpy(&ifr->ifr_addr, &mc->mc_addr, mc->mc_addr.ss_len);
2633 		(void)(*ifp0->if_ioctl)(ifp0, SIOCDELMULTI, (caddr_t)ifr);
2634 		LIST_REMOVE(mc, mc_entries);
2635 		free(mc, M_DEVBUF, sizeof(*mc));
2636 	}
2637 
2638 	if_put(ifp0);
2639 }
2640 
2641 void
2642 carp_vh_ref(void *null, void *v)
2643 {
2644 	struct carp_vhost_entry *vhe = v;
2645 
2646 	refcnt_take(&vhe->vhost_refcnt);
2647 }
2648 
2649 void
2650 carp_vh_unref(void *null, void *v)
2651 {
2652 	struct carp_vhost_entry *vhe = v;
2653 
2654 	if (refcnt_rele(&vhe->vhost_refcnt)) {
2655 		carp_sc_unref(NULL, vhe->parent_sc);
2656 		free(vhe, M_DEVBUF, sizeof(*vhe));
2657 	}
2658 }
2659 
2660 void
2661 carp_sc_ref(void *null, void *s)
2662 {
2663 	struct carp_softc *sc = s;
2664 
2665 	refcnt_take(&sc->sc_refcnt);
2666 }
2667 
2668 void
2669 carp_sc_unref(void *null, void *s)
2670 {
2671 	struct carp_softc *sc = s;
2672 
2673 	refcnt_rele_wake(&sc->sc_refcnt);
2674 }
2675