xref: /openbsd-src/sys/netinet/ip_carp.c (revision 1a8dbaac879b9f3335ad7fb25429ce63ac1d6bac)
1 /*	$OpenBSD: ip_carp.c,v 1.349 2020/07/28 16:44:34 yasuoka Exp $	*/
2 
3 /*
4  * Copyright (c) 2002 Michael Shalayeff. All rights reserved.
5  * Copyright (c) 2003 Ryan McBride. All rights reserved.
6  * Copyright (c) 2006-2008 Marco Pfatschbacher. All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
21  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
23  * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
25  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
26  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
27  * THE POSSIBILITY OF SUCH DAMAGE.
28  */
29 
30 /*
31  * TODO:
32  *	- iface reconfigure
33  *	- support for hardware checksum calculations;
34  *
35  */
36 
37 #include "ether.h"
38 
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/mbuf.h>
42 #include <sys/socket.h>
43 #include <sys/socketvar.h>
44 #include <sys/timeout.h>
45 #include <sys/ioctl.h>
46 #include <sys/errno.h>
47 #include <sys/device.h>
48 #include <sys/kernel.h>
49 #include <sys/sysctl.h>
50 #include <sys/syslog.h>
51 #include <sys/refcnt.h>
52 
53 #include <net/if.h>
54 #include <net/if_var.h>
55 #include <net/if_types.h>
56 #include <net/netisr.h>
57 
58 #include <crypto/sha1.h>
59 
60 #include <netinet/in.h>
61 #include <netinet/in_var.h>
62 #include <netinet/ip.h>
63 #include <netinet/ip_var.h>
64 #include <netinet/if_ether.h>
65 #include <netinet/ip_ipsp.h>
66 
67 #include <net/if_dl.h>
68 
69 #ifdef INET6
70 #include <netinet6/in6_var.h>
71 #include <netinet/icmp6.h>
72 #include <netinet/ip6.h>
73 #include <netinet6/ip6_var.h>
74 #include <netinet6/nd6.h>
75 #include <netinet6/in6_ifattach.h>
76 #endif
77 
78 #include "bpfilter.h"
79 #if NBPFILTER > 0
80 #include <net/bpf.h>
81 #endif
82 
83 #include "vlan.h"
84 #if NVLAN > 0
85 #include <net/if_vlan_var.h>
86 #endif
87 
88 #include <netinet/ip_carp.h>
89 
/*
 * One multicast address record kept per carp interface.  Entries are
 * linked on the carp_mc_listhead list of struct carp_softc.
 */
90 struct carp_mc_entry {
91 	LIST_ENTRY(carp_mc_entry)	mc_entries;	/* list linkage */
92 	union {
93 		struct ether_multi	*mcu_enm;	/* Ethernet multicast record */
94 	} mc_u;
95 	struct sockaddr_storage		mc_addr;	/* the address itself */
96 };
/* Shorthand for the only member of the mc_u union. */
97 #define	mc_enm	mc_u.mcu_enm
98 
/*
 * Indexes into carp_vhost_entry.vhe_sha1[].  carp_hmac_prepare_ctx() skips
 * scope-embedded IPv6 addresses when building the HMAC_NOV6LL context and
 * includes them (scope id cleared) for HMAC_ORIG.  Advertisements are signed
 * with HMAC_NOV6LL; carp_hmac_verify() accepts a match against either one.
 * HMAC_MAX is the number of contexts.
 */
99 enum { HMAC_ORIG=0, HMAC_NOV6LL=1, HMAC_MAX=2 };
100 
/*
 * State of one virtual host (vhid) configured on a carp interface.
 * Entries are allocated by carp_new_vhost() and linked on the
 * carp_vhosts list of the owning struct carp_softc.
 */
101 struct carp_vhost_entry {
102 	SRPL_ENTRY(carp_vhost_entry) vhost_entries;
103 	struct refcnt vhost_refcnt;
104 
105 	struct carp_softc *parent_sc;	/* owning softc; a reference is held */
106 	int vhe_leader;		/* set on the first entry of a softc */
107 	int vhid;
108 	int advskew;		/* converted to usec as advskew * 1000000 / 256 */
109 	enum { INIT = 0, BACKUP, MASTER }	state;
110 	struct timeout ad_tmo;	/* advertisement timeout */
111 	struct timeout md_tmo;	/* master down timeout */
112 	struct timeout md6_tmo;	/* master down timeout */
113 
	/* random value generated on first use by carp_prepare_ad() */
114 	u_int64_t vhe_replay_cookie;
115 
116 	/* authentication */
117 #define CARP_HMAC_PAD	64
	/* key XOR opad once carp_hmac_prepare_ctx() has finished */
118 	unsigned char vhe_pad[CARP_HMAC_PAD];
	/* precomputed first part of the inner hash, one per HMAC_* variant */
119 	SHA1_CTX vhe_sha1[HMAC_MAX];
120 
121 	u_int8_t vhe_enaddr[ETHER_ADDR_LEN];
122 };
123 
/*
 * Reference/unreference callbacks and the SRP list descriptor used for
 * every carp_vhosts list (see the SRPL_*_LOCKED(&carp_vh_rc, ...) calls).
 */
124 void	carp_vh_ref(void *, void *);
125 void	carp_vh_unref(void *, void *);
126 
127 struct srpl_rc carp_vh_rc =
128     SRPL_RC_INITIALIZER(carp_vh_ref, carp_vh_unref, NULL);
129 
/*
 * Per-interface state of one carp(4) pseudo-interface.  Allocated zeroed
 * in carp_clone_create() and freed in carp_clone_destroy().
 */
130 struct carp_softc {
131 	struct arpcom sc_ac;
132 #define	sc_if		sc_ac.ac_if
133 #define	sc_carpdevidx	sc_ac.ac_if.if_carpdevidx
134 	struct task sc_atask;	/* runs carp_addr_updated() */
135 	struct task sc_ltask;	/* runs carp_carpdev_state() */
136 	struct task sc_dtask;	/* runs carpdetach() */
137 	struct ip_moptions sc_imo;
138 #ifdef INET6
139 	struct ip6_moptions sc_im6o;
140 #endif /* INET6 */
141 
142 	SRPL_ENTRY(carp_softc) sc_list;	/* linkage on the parent's if_carp list */
143 	struct refcnt sc_refcnt;
144 
145 	int sc_suppress;
146 	int sc_bow_out;		/* when set, ads go out with advbase/advskew 255 */
147 	int sc_demote_cnt;
148 
149 	int sc_sendad_errors;
150 #define CARP_SENDAD_MAX_ERRORS(sc) (3 * (sc)->sc_vhe_count)
151 	int sc_sendad_success;
152 #define CARP_SENDAD_MIN_SUCCESS(sc) (3 * (sc)->sc_vhe_count)
153 
154 	char sc_curlladdr[ETHER_ADDR_LEN];
155 
156 	SRPL_HEAD(, carp_vhost_entry) carp_vhosts;
157 	int sc_vhe_count;	/* number of entries on carp_vhosts */
158 	u_int8_t sc_vhids[CARP_MAXNODES];
159 	u_int8_t sc_advskews[CARP_MAXNODES];
160 	u_int8_t sc_balancing;
161 
162 	int sc_naddrs;
163 	int sc_naddrs6;
164 	int sc_advbase;		/* seconds */
165 
166 	/* authentication */
167 	unsigned char sc_key[CARP_KEY_LEN];
168 
169 	u_int32_t sc_hashkey[2];	/* derived in carp_hmac_prepare_ctx() */
170 	u_int32_t sc_lsmask;		/* load sharing mask */
171 	int sc_lscount;			/* # load sharing interfaces (max 32) */
172 	int sc_delayed_arp;		/* delayed ARP request countdown */
173 	int sc_realmac;			/* using real mac */
174 
175 	struct in_addr sc_peer;		/* IPv4 destination of advertisements */
176 
177 	LIST_HEAD(__carp_mchead, carp_mc_entry)	carp_mc_listhead;
178 	struct carp_vhost_entry *cur_vhe; /* current active vhe */
179 };
180 
/*
 * Reference/unreference callbacks and the SRP list descriptor used for
 * lists of struct carp_softc (the parent interface's if_carp list).
 */
181 void	carp_sc_ref(void *, void *);
182 void	carp_sc_unref(void *, void *);
183 
184 struct srpl_rc carp_sc_rc =
185     SRPL_RC_INITIALIZER(carp_sc_ref, carp_sc_unref, NULL);
186 
/*
 * Tunables indexed by CARPCTL_* and exported through carp_sysctl().
 * NOTE(review): which CARPCTL_* index each initializer lands on depends on
 * the constants in ip_carp.h, which are not visible here.
 */
187 int carp_opts[CARPCTL_MAXID] = { 0, 1, 0, LOG_CRIT };	/* XXX for now */
/* per-CPU statistics counters, allocated in carpattach() */
188 struct cpumem *carpcounters;
189 
/* non-zero while carp_send_ad_all() is running; blocks re-entry */
190 int	carp_send_all_recur = 0;
191 
/*
 * Log a message if the configured log level (carp_opts[CARPCTL_LOG]) is at
 * least `l'.
 *
 *	l	syslog priority of the message
 *	sc	softc whose interface name prefixes the message, or a NULL
 *		pointer to use the generic "carp: " prefix
 *	s	parenthesized addlog() argument list, e.g. ("bad %d", x)
 *
 * `l' is parenthesized in the comparison so that an expression argument
 * (e.g. a conditional) is compared as a whole.
 */
#define	CARP_LOG(l, sc, s)						\
	do {								\
		if (carp_opts[CARPCTL_LOG] >= (l)) {			\
			if (sc)						\
				log((l), "%s: ",			\
				    (sc)->sc_if.if_xname);		\
			else						\
				log((l), "carp: ");			\
			addlog s;					\
			addlog("\n");					\
		}							\
	} while (0)
204 
/*
 * Forward declarations.  Several of these functions are defined below in
 * the part of the file shown here; the remaining ones are presumably
 * defined further down.
 */
205 void	carp_hmac_prepare(struct carp_softc *);
206 void	carp_hmac_prepare_ctx(struct carp_vhost_entry *, u_int8_t);
207 void	carp_hmac_generate(struct carp_vhost_entry *, u_int32_t *,
208 	    unsigned char *, u_int8_t);
209 int	carp_hmac_verify(struct carp_vhost_entry *, u_int32_t *,
210 	    unsigned char *);
211 void	carp_proto_input_c(struct ifnet *, struct mbuf *,
212 	    struct carp_header *, int, sa_family_t);
213 int	carp_proto_input_if(struct ifnet *, struct mbuf **, int *, int);
214 #ifdef INET6
215 int	carp6_proto_input_if(struct ifnet *, struct mbuf **, int *, int);
216 #endif
217 void	carpattach(int);
218 void	carpdetach(void *);
219 void	carp_prepare_ad(struct mbuf *, struct carp_vhost_entry *,
220 	    struct carp_header *);
221 void	carp_send_ad_all(void);
222 void	carp_vhe_send_ad_all(struct carp_softc *);
223 void	carp_timer_ad(void *);
224 void	carp_send_ad(struct carp_vhost_entry *);
225 void	carp_send_arp(struct carp_softc *);
226 void	carp_timer_down(void *);
227 void	carp_master_down(struct carp_vhost_entry *);
228 int	carp_ioctl(struct ifnet *, u_long, caddr_t);
229 int	carp_vhids_ioctl(struct carp_softc *, struct carpreq *);
230 int	carp_check_dup_vhids(struct carp_softc *, struct srpl *,
231 	    struct carpreq *);
232 void	carp_ifgroup_ioctl(struct ifnet *, u_long, caddr_t);
233 void	carp_ifgattr_ioctl(struct ifnet *, u_long, caddr_t);
234 void	carp_start(struct ifnet *);
235 int	carp_enqueue(struct ifnet *, struct mbuf *);
236 void	carp_transmit(struct carp_softc *, struct ifnet *, struct mbuf *);
237 void	carp_setrun_all(struct carp_softc *, sa_family_t);
238 void	carp_setrun(struct carp_vhost_entry *, sa_family_t);
239 void	carp_set_state_all(struct carp_softc *, int);
240 void	carp_set_state(struct carp_vhost_entry *, int);
241 void	carp_multicast_cleanup(struct carp_softc *);
242 int	carp_set_ifp(struct carp_softc *, struct ifnet *);
243 void	carp_set_enaddr(struct carp_softc *);
244 void	carp_set_vhe_enaddr(struct carp_vhost_entry *);
245 void	carp_addr_updated(void *);
246 int	carp_set_addr(struct carp_softc *, struct sockaddr_in *);
247 int	carp_join_multicast(struct carp_softc *);
248 #ifdef INET6
249 void	carp_send_na(struct carp_softc *);
250 int	carp_set_addr6(struct carp_softc *, struct sockaddr_in6 *);
251 int	carp_join_multicast6(struct carp_softc *);
252 #endif
253 int	carp_clone_create(struct if_clone *, int);
254 int	carp_clone_destroy(struct ifnet *);
255 int	carp_ether_addmulti(struct carp_softc *, struct ifreq *);
256 int	carp_ether_delmulti(struct carp_softc *, struct ifreq *);
257 void	carp_ether_purgemulti(struct carp_softc *);
258 int	carp_group_demote_count(struct carp_softc *);
259 void	carp_update_lsmask(struct carp_softc *);
260 int	carp_new_vhost(struct carp_softc *, int, int);
261 void	carp_destroy_vhosts(struct carp_softc *);
262 void	carp_del_all_timeouts(struct carp_softc *);
263 int	carp_vhe_match(struct carp_softc *, uint8_t *);
264 
/* Cloner registered by carpattach() to create/destroy "carpN" interfaces. */
265 struct if_clone carp_cloner =
266     IF_CLONE_INITIALIZER("carp", carp_clone_create, carp_clone_destroy);
267 
/* Checksum over the first _l bytes of mbuf chain _m, truncated to 16 bits. */
268 #define carp_cksum(_m, _l)	((u_int16_t)in_cksum((_m), (_l)))
/* Value stored in m_pkthdr.pf.prio of outgoing advertisements. */
269 #define CARP_IFQ_PRIO	6
270 
271 void
272 carp_hmac_prepare(struct carp_softc *sc)
273 {
274 	struct carp_vhost_entry *vhe;
275 	u_int8_t i;
276 
277 	KERNEL_ASSERT_LOCKED(); /* touching carp_vhosts */
278 
279 	SRPL_FOREACH_LOCKED(vhe, &sc->carp_vhosts, vhost_entries) {
280 		for (i = 0; i < HMAC_MAX; i++) {
281 			carp_hmac_prepare_ctx(vhe, i);
282 		}
283 	}
284 }
285 
286 void
287 carp_hmac_prepare_ctx(struct carp_vhost_entry *vhe, u_int8_t ctx)
288 {
289 	struct carp_softc *sc = vhe->parent_sc;
290 
291 	u_int8_t version = CARP_VERSION, type = CARP_ADVERTISEMENT;
292 	u_int8_t vhid = vhe->vhid & 0xff;
293 	SHA1_CTX sha1ctx;
294 	u_int32_t kmd[5];
295 	struct ifaddr *ifa;
296 	int i, found;
297 	struct in_addr last, cur, in;
298 #ifdef INET6
299 	struct in6_addr last6, cur6, in6;
300 #endif /* INET6 */
301 
302 	/* compute ipad from key */
303 	memset(vhe->vhe_pad, 0, sizeof(vhe->vhe_pad));
304 	bcopy(sc->sc_key, vhe->vhe_pad, sizeof(sc->sc_key));
305 	for (i = 0; i < sizeof(vhe->vhe_pad); i++)
306 		vhe->vhe_pad[i] ^= 0x36;
307 
308 	/* precompute first part of inner hash */
309 	SHA1Init(&vhe->vhe_sha1[ctx]);
310 	SHA1Update(&vhe->vhe_sha1[ctx], vhe->vhe_pad, sizeof(vhe->vhe_pad));
311 	SHA1Update(&vhe->vhe_sha1[ctx], (void *)&version, sizeof(version));
312 	SHA1Update(&vhe->vhe_sha1[ctx], (void *)&type, sizeof(type));
313 
314 	/* generate a key for the arpbalance hash, before the vhid is hashed */
315 	if (vhe->vhe_leader) {
316 		bcopy(&vhe->vhe_sha1[ctx], &sha1ctx, sizeof(sha1ctx));
317 		SHA1Final((unsigned char *)kmd, &sha1ctx);
318 		sc->sc_hashkey[0] = kmd[0] ^ kmd[1];
319 		sc->sc_hashkey[1] = kmd[2] ^ kmd[3];
320 	}
321 
322 	/* the rest of the precomputation */
323 	if (!sc->sc_realmac && vhe->vhe_leader &&
324 	    memcmp(sc->sc_ac.ac_enaddr, vhe->vhe_enaddr, ETHER_ADDR_LEN) != 0)
325 		SHA1Update(&vhe->vhe_sha1[ctx], sc->sc_ac.ac_enaddr,
326 		    ETHER_ADDR_LEN);
327 
328 	SHA1Update(&vhe->vhe_sha1[ctx], (void *)&vhid, sizeof(vhid));
329 
330 	/* Hash the addresses from smallest to largest, not interface order */
331 	cur.s_addr = 0;
332 	do {
333 		found = 0;
334 		last = cur;
335 		cur.s_addr = 0xffffffff;
336 		TAILQ_FOREACH(ifa, &sc->sc_if.if_addrlist, ifa_list) {
337 			if (ifa->ifa_addr->sa_family != AF_INET)
338 				continue;
339 			in.s_addr = ifatoia(ifa)->ia_addr.sin_addr.s_addr;
340 			if (ntohl(in.s_addr) > ntohl(last.s_addr) &&
341 			    ntohl(in.s_addr) < ntohl(cur.s_addr)) {
342 				cur.s_addr = in.s_addr;
343 				found++;
344 			}
345 		}
346 		if (found)
347 			SHA1Update(&vhe->vhe_sha1[ctx],
348 			    (void *)&cur, sizeof(cur));
349 	} while (found);
350 #ifdef INET6
351 	memset(&cur6, 0x00, sizeof(cur6));
352 	do {
353 		found = 0;
354 		last6 = cur6;
355 		memset(&cur6, 0xff, sizeof(cur6));
356 		TAILQ_FOREACH(ifa, &sc->sc_if.if_addrlist, ifa_list) {
357 			if (ifa->ifa_addr->sa_family != AF_INET6)
358 				continue;
359 			in6 = ifatoia6(ifa)->ia_addr.sin6_addr;
360 			if (IN6_IS_SCOPE_EMBED(&in6)) {
361 				if (ctx == HMAC_NOV6LL)
362 					continue;
363 				in6.s6_addr16[1] = 0;
364 			}
365 			if (memcmp(&in6, &last6, sizeof(in6)) > 0 &&
366 			    memcmp(&in6, &cur6, sizeof(in6)) < 0) {
367 				cur6 = in6;
368 				found++;
369 			}
370 		}
371 		if (found)
372 			SHA1Update(&vhe->vhe_sha1[ctx],
373 			    (void *)&cur6, sizeof(cur6));
374 	} while (found);
375 #endif /* INET6 */
376 
377 	/* convert ipad to opad */
378 	for (i = 0; i < sizeof(vhe->vhe_pad); i++)
379 		vhe->vhe_pad[i] ^= 0x36 ^ 0x5c;
380 }
381 
382 void
383 carp_hmac_generate(struct carp_vhost_entry *vhe, u_int32_t counter[2],
384     unsigned char md[20], u_int8_t ctx)
385 {
386 	SHA1_CTX sha1ctx;
387 
388 	/* fetch first half of inner hash */
389 	bcopy(&vhe->vhe_sha1[ctx], &sha1ctx, sizeof(sha1ctx));
390 
391 	SHA1Update(&sha1ctx, (void *)counter, sizeof(vhe->vhe_replay_cookie));
392 	SHA1Final(md, &sha1ctx);
393 
394 	/* outer hash */
395 	SHA1Init(&sha1ctx);
396 	SHA1Update(&sha1ctx, vhe->vhe_pad, sizeof(vhe->vhe_pad));
397 	SHA1Update(&sha1ctx, md, 20);
398 	SHA1Final(md, &sha1ctx);
399 }
400 
401 int
402 carp_hmac_verify(struct carp_vhost_entry *vhe, u_int32_t counter[2],
403     unsigned char md[20])
404 {
405 	unsigned char md2[20];
406 	u_int8_t i;
407 
408 	for (i = 0; i < HMAC_MAX; i++) {
409 		carp_hmac_generate(vhe, counter, md2, i);
410 		if (!timingsafe_bcmp(md, md2, sizeof(md2)))
411 			return (0);
412 	}
413 	return (1);
414 }
415 
416 int
417 carp_proto_input(struct mbuf **mp, int *offp, int proto, int af)
418 {
419 	struct ifnet *ifp;
420 
421 	ifp = if_get((*mp)->m_pkthdr.ph_ifidx);
422 	if (ifp == NULL) {
423 		m_freemp(mp);
424 		return IPPROTO_DONE;
425 	}
426 
427 	proto = carp_proto_input_if(ifp, mp, offp, proto);
428 	if_put(ifp);
429 	return proto;
430 }
431 
432 /*
433  * process input packet.
434  * we have rearranged checks order compared to the rfc,
435  * but it seems more efficient this way or not possible otherwise.
436  */
437 int
438 carp_proto_input_if(struct ifnet *ifp, struct mbuf **mp, int *offp, int proto)
439 {
440 	struct mbuf *m = *mp;
441 	struct ip *ip = mtod(m, struct ip *);
442 	struct carp_softc *sc = NULL;
443 	struct carp_header *ch;
444 	int iplen, len, ismulti;
445 
446 	carpstat_inc(carps_ipackets);
447 
448 	if (!carp_opts[CARPCTL_ALLOW]) {
449 		m_freem(m);
450 		return IPPROTO_DONE;
451 	}
452 
453 	ismulti = IN_MULTICAST(ip->ip_dst.s_addr);
454 
455 	/* check if received on a valid carp interface */
456 	switch (ifp->if_type) {
457 	case IFT_CARP:
458 		break;
459 	case IFT_ETHER:
460 		if (ismulti || !SRPL_EMPTY_LOCKED(&ifp->if_carp))
461 			break;
462 		/* FALLTHROUGH */
463 	default:
464 		carpstat_inc(carps_badif);
465 		CARP_LOG(LOG_INFO, sc,
466 		    ("packet received on non-carp interface: %s",
467 		     ifp->if_xname));
468 		m_freem(m);
469 		return IPPROTO_DONE;
470 	}
471 
472 	/* verify that the IP TTL is 255.  */
473 	if (ip->ip_ttl != CARP_DFLTTL) {
474 		carpstat_inc(carps_badttl);
475 		CARP_LOG(LOG_NOTICE, sc, ("received ttl %d != %d on %s",
476 		    ip->ip_ttl, CARP_DFLTTL, ifp->if_xname));
477 		m_freem(m);
478 		return IPPROTO_DONE;
479 	}
480 
481 	/*
482 	 * verify that the received packet length is
483 	 * equal to the CARP header
484 	 */
485 	iplen = ip->ip_hl << 2;
486 	len = iplen + sizeof(*ch);
487 	if (len > m->m_pkthdr.len) {
488 		carpstat_inc(carps_badlen);
489 		CARP_LOG(LOG_INFO, sc, ("packet too short %d on %s",
490 		    m->m_pkthdr.len, ifp->if_xname));
491 		m_freem(m);
492 		return IPPROTO_DONE;
493 	}
494 
495 	if ((m = *mp = m_pullup(m, len)) == NULL) {
496 		carpstat_inc(carps_hdrops);
497 		return IPPROTO_DONE;
498 	}
499 	ip = mtod(m, struct ip *);
500 	ch = (struct carp_header *)(mtod(m, caddr_t) + iplen);
501 
502 	/* verify the CARP checksum */
503 	m->m_data += iplen;
504 	if (carp_cksum(m, len - iplen)) {
505 		carpstat_inc(carps_badsum);
506 		CARP_LOG(LOG_INFO, sc, ("checksum failed on %s",
507 		    ifp->if_xname));
508 		m_freem(m);
509 		return IPPROTO_DONE;
510 	}
511 	m->m_data -= iplen;
512 
513 	KERNEL_LOCK();
514 	carp_proto_input_c(ifp, m, ch, ismulti, AF_INET);
515 	KERNEL_UNLOCK();
516 	return IPPROTO_DONE;
517 }
518 
519 #ifdef INET6
520 int
521 carp6_proto_input(struct mbuf **mp, int *offp, int proto, int af)
522 {
523 	struct ifnet *ifp;
524 
525 	ifp = if_get((*mp)->m_pkthdr.ph_ifidx);
526 	if (ifp == NULL) {
527 		m_freemp(mp);
528 		return IPPROTO_DONE;
529 	}
530 
531 	proto = carp6_proto_input_if(ifp, mp, offp, proto);
532 	if_put(ifp);
533 	return proto;
534 }
535 
536 int
537 carp6_proto_input_if(struct ifnet *ifp, struct mbuf **mp, int *offp, int proto)
538 {
539 	struct mbuf *m = *mp;
540 	struct carp_softc *sc = NULL;
541 	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
542 	struct carp_header *ch;
543 	u_int len;
544 
545 	carpstat_inc(carps_ipackets6);
546 
547 	if (!carp_opts[CARPCTL_ALLOW]) {
548 		m_freem(m);
549 		return IPPROTO_DONE;
550 	}
551 
552 	/* check if received on a valid carp interface */
553 	if (ifp->if_type != IFT_CARP) {
554 		carpstat_inc(carps_badif);
555 		CARP_LOG(LOG_INFO, sc, ("packet received on non-carp interface: %s",
556 		    ifp->if_xname));
557 		m_freem(m);
558 		return IPPROTO_DONE;
559 	}
560 
561 	/* verify that the IP TTL is 255 */
562 	if (ip6->ip6_hlim != CARP_DFLTTL) {
563 		carpstat_inc(carps_badttl);
564 		CARP_LOG(LOG_NOTICE, sc, ("received ttl %d != %d on %s",
565 		    ip6->ip6_hlim, CARP_DFLTTL, ifp->if_xname));
566 		m_freem(m);
567 		return IPPROTO_DONE;
568 	}
569 
570 	/* verify that we have a complete carp packet */
571 	len = m->m_len;
572 	if ((m = *mp = m_pullup(m, *offp + sizeof(*ch))) == NULL) {
573 		carpstat_inc(carps_badlen);
574 		CARP_LOG(LOG_INFO, sc, ("packet size %u too small", len));
575 		return IPPROTO_DONE;
576 	}
577 	ch = (struct carp_header *)(mtod(m, caddr_t) + *offp);
578 
579 	/* verify the CARP checksum */
580 	m->m_data += *offp;
581 	if (carp_cksum(m, sizeof(*ch))) {
582 		carpstat_inc(carps_badsum);
583 		CARP_LOG(LOG_INFO, sc, ("checksum failed, on %s",
584 		    ifp->if_xname));
585 		m_freem(m);
586 		return IPPROTO_DONE;
587 	}
588 	m->m_data -= *offp;
589 
590 	KERNEL_LOCK();
591 	carp_proto_input_c(ifp, m, ch, 1, AF_INET6);
592 	KERNEL_UNLOCK();
593 	return IPPROTO_DONE;
594 }
595 #endif /* INET6 */
596 
597 void
598 carp_proto_input_c(struct ifnet *ifp, struct mbuf *m, struct carp_header *ch,
599     int ismulti, sa_family_t af)
600 {
601 	struct carp_softc *sc;
602 	struct ifnet *ifp0;
603 	struct carp_vhost_entry *vhe;
604 	struct timeval sc_tv, ch_tv;
605 	struct srpl *cif;
606 
607 	KERNEL_ASSERT_LOCKED(); /* touching if_carp + carp_vhosts */
608 
609 	ifp0 = if_get(ifp->if_carpdevidx);
610 
611 	if (ifp->if_type == IFT_CARP) {
612 		/*
613 		 * If the parent of this carp(4) got destroyed while
614 		 * `m' was being processed, silently drop it.
615 		 */
616 		if (ifp0 == NULL)
617 			goto rele;
618 		cif = &ifp0->if_carp;
619 	} else
620 		cif = &ifp->if_carp;
621 
622 	SRPL_FOREACH_LOCKED(sc, cif, sc_list) {
623 		if (af == AF_INET &&
624 		    ismulti != IN_MULTICAST(sc->sc_peer.s_addr))
625 			continue;
626 		SRPL_FOREACH_LOCKED(vhe, &sc->carp_vhosts, vhost_entries) {
627 			if (vhe->vhid == ch->carp_vhid)
628 				goto found;
629 		}
630 	}
631  found:
632 
633 	if (!sc || (sc->sc_if.if_flags & (IFF_UP|IFF_RUNNING)) !=
634 	    (IFF_UP|IFF_RUNNING)) {
635 		carpstat_inc(carps_badvhid);
636 		goto rele;
637 	}
638 
639 	getmicrotime(&sc->sc_if.if_lastchange);
640 
641 	/* verify the CARP version. */
642 	if (ch->carp_version != CARP_VERSION) {
643 		carpstat_inc(carps_badver);
644 		sc->sc_if.if_ierrors++;
645 		CARP_LOG(LOG_NOTICE, sc, ("invalid version %d != %d",
646 		    ch->carp_version, CARP_VERSION));
647 		goto rele;
648 	}
649 
650 	/* verify the hash */
651 	if (carp_hmac_verify(vhe, ch->carp_counter, ch->carp_md)) {
652 		carpstat_inc(carps_badauth);
653 		sc->sc_if.if_ierrors++;
654 		CARP_LOG(LOG_INFO, sc, ("incorrect hash"));
655 		goto rele;
656 	}
657 
658 	if (!memcmp(&vhe->vhe_replay_cookie, ch->carp_counter,
659 	    sizeof(ch->carp_counter))) {
660 		struct ifnet *ifp2;
661 
662 		ifp2 = if_get(sc->sc_carpdevidx);
663 		/* Do not log duplicates from non simplex interfaces */
664 		if (ifp2 && ifp2->if_flags & IFF_SIMPLEX) {
665 			carpstat_inc(carps_badauth);
666 			sc->sc_if.if_ierrors++;
667 			CARP_LOG(LOG_WARNING, sc,
668 			    ("replay or network loop detected"));
669 		}
670 		if_put(ifp2);
671 		goto rele;
672 	}
673 
674 	sc_tv.tv_sec = sc->sc_advbase;
675 	sc_tv.tv_usec = vhe->advskew * 1000000 / 256;
676 	ch_tv.tv_sec = ch->carp_advbase;
677 	ch_tv.tv_usec = ch->carp_advskew * 1000000 / 256;
678 
679 	switch (vhe->state) {
680 	case INIT:
681 		break;
682 	case MASTER:
683 		/*
684 		 * If we receive an advertisement from a master who's going to
685 		 * be more frequent than us, and whose demote count is not higher
686 		 * than ours, go into BACKUP state. If his demote count is lower,
687 		 * also go into BACKUP.
688 		 */
689 		if (((timercmp(&sc_tv, &ch_tv, >) ||
690 		    timercmp(&sc_tv, &ch_tv, ==)) &&
691 		    (ch->carp_demote <= carp_group_demote_count(sc))) ||
692 		    ch->carp_demote < carp_group_demote_count(sc)) {
693 			timeout_del(&vhe->ad_tmo);
694 			carp_set_state(vhe, BACKUP);
695 			carp_setrun(vhe, 0);
696 		}
697 		break;
698 	case BACKUP:
699 		/*
700 		 * If we're pre-empting masters who advertise slower than us,
701 		 * and do not have a better demote count, treat them as down.
702 		 *
703 		 */
704 		if (carp_opts[CARPCTL_PREEMPT] &&
705 		    timercmp(&sc_tv, &ch_tv, <) &&
706 		    ch->carp_demote >= carp_group_demote_count(sc)) {
707 			carp_master_down(vhe);
708 			break;
709 		}
710 
711 		/*
712 		 * Take over masters advertising with a higher demote count,
713 		 * regardless of CARPCTL_PREEMPT.
714 		 */
715 		if (ch->carp_demote > carp_group_demote_count(sc)) {
716 			carp_master_down(vhe);
717 			break;
718 		}
719 
720 		/*
721 		 *  If the master is going to advertise at such a low frequency
722 		 *  that he's guaranteed to time out, we'd might as well just
723 		 *  treat him as timed out now.
724 		 */
725 		sc_tv.tv_sec = sc->sc_advbase * 3;
726 		if (sc->sc_advbase && timercmp(&sc_tv, &ch_tv, <)) {
727 			carp_master_down(vhe);
728 			break;
729 		}
730 
731 		/*
732 		 * Otherwise, we reset the counter and wait for the next
733 		 * advertisement.
734 		 */
735 		carp_setrun(vhe, af);
736 		break;
737 	}
738 
739 rele:
740 	if_put(ifp0);
741 	m_freem(m);
742 	return;
743 }
744 
745 int
746 carp_sysctl_carpstat(void *oldp, size_t *oldlenp, void *newp)
747 {
748 	struct carpstats carpstat;
749 
750 	CTASSERT(sizeof(carpstat) == (carps_ncounters * sizeof(uint64_t)));
751 	memset(&carpstat, 0, sizeof carpstat);
752 	counters_read(carpcounters, (uint64_t *)&carpstat, carps_ncounters);
753 	return (sysctl_rdstruct(oldp, oldlenp, newp,
754 	    &carpstat, sizeof(carpstat)));
755 }
756 
757 int
758 carp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
759     size_t newlen)
760 {
761 	int error;
762 
763 	/* All sysctl names at this level are terminal. */
764 	if (namelen != 1)
765 		return (ENOTDIR);
766 
767 	switch (name[0]) {
768 	case CARPCTL_STATS:
769 		return (carp_sysctl_carpstat(oldp, oldlenp, newp));
770 	default:
771 		if (name[0] <= 0 || name[0] >= CARPCTL_MAXID)
772 			return (ENOPROTOOPT);
773 		NET_LOCK();
774 		error = sysctl_int(oldp, oldlenp, newp, newlen,
775 		    &carp_opts[name[0]]);
776 		NET_UNLOCK();
777 		return (error);
778 	}
779 }
780 
781 /*
782  * Interface side of the CARP implementation.
783  */
784 
785 /* ARGSUSED */
786 void
787 carpattach(int n)
788 {
789 	struct ifg_group	*ifg;
790 
791 	if ((ifg = if_creategroup("carp")) != NULL)
792 		ifg->ifg_refcnt++;	/* keep around even if empty */
793 	if_clone_attach(&carp_cloner);
794 	carpcounters = counters_alloc(carps_ncounters);
795 }
796 
797 int
798 carp_clone_create(struct if_clone *ifc, int unit)
799 {
800 	struct carp_softc *sc;
801 	struct ifnet *ifp;
802 
803 	sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO);
804 	refcnt_init(&sc->sc_refcnt);
805 
806 	SRPL_INIT(&sc->carp_vhosts);
807 	sc->sc_vhe_count = 0;
808 	if (carp_new_vhost(sc, 0, 0)) {
809 		free(sc, M_DEVBUF, sizeof(*sc));
810 		return (ENOMEM);
811 	}
812 
813 	task_set(&sc->sc_atask, carp_addr_updated, sc);
814 	task_set(&sc->sc_ltask, carp_carpdev_state, sc);
815 	task_set(&sc->sc_dtask, carpdetach, sc);
816 
817 	sc->sc_suppress = 0;
818 	sc->sc_advbase = CARP_DFLTINTV;
819 	sc->sc_naddrs = sc->sc_naddrs6 = 0;
820 #ifdef INET6
821 	sc->sc_im6o.im6o_hlim = CARP_DFLTTL;
822 #endif /* INET6 */
823 	sc->sc_imo.imo_membership = mallocarray(IP_MIN_MEMBERSHIPS,
824 	    sizeof(struct in_multi *), M_IPMOPTS, M_WAITOK|M_ZERO);
825 	sc->sc_imo.imo_max_memberships = IP_MIN_MEMBERSHIPS;
826 
827 	LIST_INIT(&sc->carp_mc_listhead);
828 	ifp = &sc->sc_if;
829 	ifp->if_softc = sc;
830 	snprintf(ifp->if_xname, sizeof ifp->if_xname, "%s%d", ifc->ifc_name,
831 	    unit);
832 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
833 	ifp->if_ioctl = carp_ioctl;
834 	ifp->if_start = carp_start;
835 	ifp->if_enqueue = carp_enqueue;
836 	ifp->if_xflags = IFXF_CLONED;
837 	if_counters_alloc(ifp);
838 	if_attach(ifp);
839 	ether_ifattach(ifp);
840 	ifp->if_type = IFT_CARP;
841 	ifp->if_sadl->sdl_type = IFT_CARP;
842 	ifp->if_output = carp_output;
843 	ifp->if_priority = IF_CARP_DEFAULT_PRIORITY;
844 	ifp->if_link_state = LINK_STATE_INVALID;
845 
846 	/* Hook carp_addr_updated to cope with address and route changes. */
847 	if_addrhook_add(&sc->sc_if, &sc->sc_atask);
848 
849 	return (0);
850 }
851 
852 int
853 carp_new_vhost(struct carp_softc *sc, int vhid, int advskew)
854 {
855 	struct carp_vhost_entry *vhe, *vhe0;
856 
857 	vhe = malloc(sizeof(*vhe), M_DEVBUF, M_NOWAIT | M_ZERO);
858 	if (vhe == NULL)
859 		return (ENOMEM);
860 
861 	refcnt_init(&vhe->vhost_refcnt);
862 	carp_sc_ref(NULL, sc); /* give a sc ref to the vhe */
863 	vhe->parent_sc = sc;
864 	vhe->vhid = vhid;
865 	vhe->advskew = advskew;
866 	vhe->state = INIT;
867 	timeout_set_proc(&vhe->ad_tmo, carp_timer_ad, vhe);
868 	timeout_set_proc(&vhe->md_tmo, carp_timer_down, vhe);
869 	timeout_set_proc(&vhe->md6_tmo, carp_timer_down, vhe);
870 
871 	KERNEL_ASSERT_LOCKED(); /* touching carp_vhosts */
872 
873 	/* mark the first vhe as leader */
874 	if (SRPL_EMPTY_LOCKED(&sc->carp_vhosts)) {
875 		vhe->vhe_leader = 1;
876 		SRPL_INSERT_HEAD_LOCKED(&carp_vh_rc, &sc->carp_vhosts,
877 		    vhe, vhost_entries);
878 		sc->sc_vhe_count = 1;
879 		return (0);
880 	}
881 
882 	SRPL_FOREACH_LOCKED(vhe0, &sc->carp_vhosts, vhost_entries) {
883 		if (SRPL_NEXT_LOCKED(vhe0, vhost_entries) == NULL)
884 			break;
885 	}
886 
887 	SRPL_INSERT_AFTER_LOCKED(&carp_vh_rc, vhe0, vhe, vhost_entries);
888 	sc->sc_vhe_count++;
889 
890 	return (0);
891 }
892 
893 int
894 carp_clone_destroy(struct ifnet *ifp)
895 {
896 	struct carp_softc *sc = ifp->if_softc;
897 
898 	if_addrhook_del(&sc->sc_if, &sc->sc_atask);
899 
900 	NET_LOCK();
901 	carpdetach(sc);
902 	NET_UNLOCK();
903 
904 	ether_ifdetach(ifp);
905 	if_detach(ifp);
906 	carp_destroy_vhosts(ifp->if_softc);
907 	refcnt_finalize(&sc->sc_refcnt, "carpdtor");
908 	free(sc->sc_imo.imo_membership, M_IPMOPTS,
909 	    sc->sc_imo.imo_max_memberships * sizeof(struct in_multi *));
910 	free(sc, M_DEVBUF, sizeof(*sc));
911 	return (0);
912 }
913 
914 void
915 carp_del_all_timeouts(struct carp_softc *sc)
916 {
917 	struct carp_vhost_entry *vhe;
918 
919 	KERNEL_ASSERT_LOCKED(); /* touching carp_vhosts */
920 	SRPL_FOREACH_LOCKED(vhe, &sc->carp_vhosts, vhost_entries) {
921 		timeout_del(&vhe->ad_tmo);
922 		timeout_del(&vhe->md_tmo);
923 		timeout_del(&vhe->md6_tmo);
924 	}
925 }
926 
927 void
928 carpdetach(void *arg)
929 {
930 	struct carp_softc *sc = arg;
931 	struct ifnet *ifp0;
932 	struct srpl *cif;
933 
934 	carp_del_all_timeouts(sc);
935 
936 	if (sc->sc_demote_cnt)
937 		carp_group_demote_adj(&sc->sc_if, -sc->sc_demote_cnt, "detach");
938 	sc->sc_suppress = 0;
939 	sc->sc_sendad_errors = 0;
940 
941 	carp_set_state_all(sc, INIT);
942 	sc->sc_if.if_flags &= ~IFF_UP;
943 	carp_setrun_all(sc, 0);
944 	carp_multicast_cleanup(sc);
945 
946 	ifp0 = if_get(sc->sc_carpdevidx);
947 	if (ifp0 == NULL)
948 		return;
949 
950 	KERNEL_ASSERT_LOCKED(); /* touching if_carp */
951 
952 	cif = &ifp0->if_carp;
953 
954 	SRPL_REMOVE_LOCKED(&carp_sc_rc, cif, sc, carp_softc, sc_list);
955 	sc->sc_carpdevidx = 0;
956 
957 	if_linkstatehook_del(ifp0, &sc->sc_ltask);
958 	if_detachhook_del(ifp0, &sc->sc_dtask);
959 	ifpromisc(ifp0, 0);
960 	if_put(ifp0);
961 }
962 
963 void
964 carp_destroy_vhosts(struct carp_softc *sc)
965 {
966 	/* XXX bow out? */
967 	struct carp_vhost_entry *vhe;
968 
969 	KERNEL_ASSERT_LOCKED(); /* touching carp_vhosts */
970 
971 	while ((vhe = SRPL_FIRST_LOCKED(&sc->carp_vhosts)) != NULL) {
972 		SRPL_REMOVE_LOCKED(&carp_vh_rc, &sc->carp_vhosts, vhe,
973 		    carp_vhost_entry, vhost_entries);
974 		carp_vh_unref(NULL, vhe); /* drop last ref */
975 	}
976 	sc->sc_vhe_count = 0;
977 }
978 
979 void
980 carp_prepare_ad(struct mbuf *m, struct carp_vhost_entry *vhe,
981     struct carp_header *ch)
982 {
983 	if (!vhe->vhe_replay_cookie) {
984 		arc4random_buf(&vhe->vhe_replay_cookie,
985 		    sizeof(vhe->vhe_replay_cookie));
986 	}
987 
988 	bcopy(&vhe->vhe_replay_cookie, ch->carp_counter,
989 	    sizeof(ch->carp_counter));
990 
991 	/*
992 	 * For the time being, do not include the IPv6 linklayer addresses
993 	 * in the HMAC.
994 	 */
995 	carp_hmac_generate(vhe, ch->carp_counter, ch->carp_md, HMAC_NOV6LL);
996 }
997 
998 void
999 carp_send_ad_all(void)
1000 {
1001 	struct ifnet *ifp0;
1002 	struct srpl *cif;
1003 	struct carp_softc *vh;
1004 
1005 	KERNEL_ASSERT_LOCKED(); /* touching if_carp */
1006 
1007 	if (carp_send_all_recur > 0)
1008 		return;
1009 	++carp_send_all_recur;
1010 	TAILQ_FOREACH(ifp0, &ifnet, if_list) {
1011 		if (ifp0->if_type != IFT_ETHER)
1012 			continue;
1013 
1014 		cif = &ifp0->if_carp;
1015 		SRPL_FOREACH_LOCKED(vh, cif, sc_list) {
1016 			if ((vh->sc_if.if_flags & (IFF_UP|IFF_RUNNING)) ==
1017 			    (IFF_UP|IFF_RUNNING)) {
1018 				carp_vhe_send_ad_all(vh);
1019 			}
1020 		}
1021 	}
1022 	--carp_send_all_recur;
1023 }
1024 
1025 void
1026 carp_vhe_send_ad_all(struct carp_softc *sc)
1027 {
1028 	struct carp_vhost_entry *vhe;
1029 
1030 	KERNEL_ASSERT_LOCKED(); /* touching carp_vhosts */
1031 
1032 	SRPL_FOREACH_LOCKED(vhe, &sc->carp_vhosts, vhost_entries) {
1033 		if (vhe->state == MASTER)
1034 			carp_send_ad(vhe);
1035 	}
1036 }
1037 
/*
 * Advertisement timeout handler: `v' is the struct carp_vhost_entry the
 * timeout was set up for.  Sends its advertisement under the net lock.
 */
void
carp_timer_ad(void *v)
{
	struct carp_vhost_entry *vhe = v;

	NET_LOCK();
	carp_send_ad(vhe);
	NET_UNLOCK();
}
1045 
/*
 * Build and transmit a CARP advertisement for the given vhost entry.
 *
 * One packet is sent over IPv4 if the carp interface has IPv4 addresses
 * (sc_naddrs) and, when built with INET6, one over IPv6 if it has IPv6
 * addresses (sc_naddrs6).  Output failures are counted in
 * sc_sendad_errors; reaching CARP_SENDAD_MAX_ERRORS demotes the carp
 * group, and enough consecutive successes afterwards undo the demotion.
 *
 * Unless we are bowing out (sc_bow_out), the advertisement timeout
 * vhe->ad_tmo is re-armed before returning.  The net lock must be held.
 */
void
carp_send_ad(struct carp_vhost_entry *vhe)
{
	struct carp_header ch;
	struct timeval tv;
	struct carp_softc *sc = vhe->parent_sc;
	struct carp_header *ch_ptr;
	struct mbuf *m;
	int error, len, advbase, advskew;
	struct ifnet *ifp;
	struct ifaddr *ifa;
	struct sockaddr sa;

	NET_ASSERT_LOCKED();

	/* Without a parent interface there is nowhere to send to. */
	if ((ifp = if_get(sc->sc_carpdevidx)) == NULL) {
		sc->sc_if.if_oerrors++;
		return;
	}

	/* bow out if we've gone to backup (the carp interface is going down) */
	if (sc->sc_bow_out) {
		/*
		 * tv is deliberately left unset here: the 255/255 pair
		 * is what suppresses the timeout re-arm at retry_later.
		 */
		advbase = 255;
		advskew = 255;
	} else {
		advbase = sc->sc_advbase;
		advskew = vhe->advskew;
		/* Next advertisement in advbase seconds + advskew/256 seconds. */
		tv.tv_sec = advbase;
		if (advbase == 0 && advskew == 0)
			tv.tv_usec = 1 * 1000000 / 256;
		else
			tv.tv_usec = advskew * 1000000 / 256;
	}

	/* Header template shared by the IPv4 and IPv6 packets. */
	ch.carp_version = CARP_VERSION;
	ch.carp_type = CARP_ADVERTISEMENT;
	ch.carp_vhid = vhe->vhid;
	ch.carp_demote = carp_group_demote_count(sc) & 0xff;
	ch.carp_advbase = advbase;
	ch.carp_advskew = advskew;
	ch.carp_authlen = 7;	/* XXX DEFINE */
	ch.carp_cksum = 0;

	sc->cur_vhe = vhe; /* we need the vhe later on the output path */

	if (sc->sc_naddrs) {
		struct ip *ip;

		MGETHDR(m, M_DONTWAIT, MT_HEADER);
		if (m == NULL) {
			sc->sc_if.if_oerrors++;
			carpstat_inc(carps_onomem);
			/* XXX maybe less ? */
			goto retry_later;
		}
		len = sizeof(*ip) + sizeof(ch);
		m->m_pkthdr.pf.prio = CARP_IFQ_PRIO;
		m->m_pkthdr.ph_rtableid = sc->sc_if.if_rdomain;
		m->m_pkthdr.len = len;
		m->m_len = len;
		m_align(m, len);
		ip = mtod(m, struct ip *);
		ip->ip_v = IPVERSION;
		ip->ip_hl = sizeof(*ip) >> 2;
		ip->ip_tos = IPTOS_LOWDELAY;
		ip->ip_len = htons(len);
		ip->ip_id = htons(ip_randomid());
		ip->ip_off = htons(IP_DF);
		ip->ip_ttl = CARP_DFLTTL;
		ip->ip_p = IPPROTO_CARP;
		ip->ip_sum = 0;

		memset(&sa, 0, sizeof(sa));
		sa.sa_family = AF_INET;
		/* Prefer addresses on the parent interface as source for AD. */
		ifa = ifaof_ifpforaddr(&sa, ifp);
		if (ifa == NULL)
			ifa = ifaof_ifpforaddr(&sa, &sc->sc_if);
		KASSERT(ifa != NULL);
		ip->ip_src.s_addr = ifatoia(ifa)->ia_addr.sin_addr.s_addr;
		ip->ip_dst.s_addr = sc->sc_peer.s_addr;
		if (IN_MULTICAST(ip->ip_dst.s_addr))
			m->m_flags |= M_MCAST;

		ch_ptr = (struct carp_header *)(ip + 1);
		bcopy(&ch, ch_ptr, sizeof(ch));
		carp_prepare_ad(m, vhe, ch_ptr);

		/*
		 * Checksum covers the CARP header only: hide the IP header
		 * from carp_cksum() by temporarily advancing m_data.
		 */
		m->m_data += sizeof(*ip);
		ch_ptr->carp_cksum = carp_cksum(m, len - sizeof(*ip));
		m->m_data -= sizeof(*ip);

		getmicrotime(&sc->sc_if.if_lastchange);
		carpstat_inc(carps_opackets);

		error = ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo,
		    NULL, 0);
		if (error &&
		    /* when unicast, the peer's down is not our fault */
		    !(!IN_MULTICAST(sc->sc_peer.s_addr) && error == EHOSTDOWN)){
			if (error == ENOBUFS)
				carpstat_inc(carps_onomem);
			else
				CARP_LOG(LOG_WARNING, sc,
				    ("ip_output failed: %d", error));
			sc->sc_if.if_oerrors++;
			if (sc->sc_sendad_errors < INT_MAX)
				sc->sc_sendad_errors++;
			/* Demote exactly once, when the threshold is first hit. */
			if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS(sc))
				carp_group_demote_adj(&sc->sc_if, 1,
				    "> snderrors");
			sc->sc_sendad_success = 0;
		} else {
			if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS(sc)) {
				/* Demoted: need a run of successes to recover. */
				if (++sc->sc_sendad_success >=
				    CARP_SENDAD_MIN_SUCCESS(sc)) {
					carp_group_demote_adj(&sc->sc_if, -1,
					    "< snderrors");
					sc->sc_sendad_errors = 0;
				}
			} else
				sc->sc_sendad_errors = 0;
		}
		/*
		 * Count down the delayed ARP scheduled by the leader vhost;
		 * send it when the counter reaches zero, then park it at -1.
		 */
		if (vhe->vhe_leader) {
			if (sc->sc_delayed_arp > 0)
				sc->sc_delayed_arp--;
			if (sc->sc_delayed_arp == 0) {
				carp_send_arp(sc);
				sc->sc_delayed_arp = -1;
			}
		}
	}
#ifdef INET6
	if (sc->sc_naddrs6) {
		struct ip6_hdr *ip6;

		MGETHDR(m, M_DONTWAIT, MT_HEADER);
		if (m == NULL) {
			sc->sc_if.if_oerrors++;
			carpstat_inc(carps_onomem);
			/* XXX maybe less ? */
			goto retry_later;
		}
		len = sizeof(*ip6) + sizeof(ch);
		m->m_pkthdr.pf.prio = CARP_IFQ_PRIO;
		m->m_pkthdr.ph_rtableid = sc->sc_if.if_rdomain;
		m->m_pkthdr.len = len;
		m->m_len = len;
		m_align(m, len);
		m->m_flags |= M_MCAST;
		ip6 = mtod(m, struct ip6_hdr *);
		memset(ip6, 0, sizeof(*ip6));
		ip6->ip6_vfc |= IPV6_VERSION;
		ip6->ip6_hlim = CARP_DFLTTL;
		ip6->ip6_nxt = IPPROTO_CARP;

		/* set the source address */
		memset(&sa, 0, sizeof(sa));
		sa.sa_family = AF_INET6;
		/* Prefer addresses on the parent interface as source for AD. */
		ifa = ifaof_ifpforaddr(&sa, ifp);
		if (ifa == NULL)
			ifa = ifaof_ifpforaddr(&sa, &sc->sc_if);
		KASSERT(ifa != NULL);
		bcopy(ifatoia6(ifa)->ia_addr.sin6_addr.s6_addr,
		    &ip6->ip6_src, sizeof(struct in6_addr));
		/* set the multicast destination */

		/* ff02::12 with the parent's interface index embedded. */
		ip6->ip6_dst.s6_addr16[0] = htons(0xff02);
		ip6->ip6_dst.s6_addr16[1] = htons(ifp->if_index);
		ip6->ip6_dst.s6_addr8[15] = 0x12;

		ch_ptr = (struct carp_header *)(ip6 + 1);
		bcopy(&ch, ch_ptr, sizeof(ch));
		carp_prepare_ad(m, vhe, ch_ptr);

		/* Same trick as for IPv4: checksum the CARP header only. */
		m->m_data += sizeof(*ip6);
		ch_ptr->carp_cksum = carp_cksum(m, len - sizeof(*ip6));
		m->m_data -= sizeof(*ip6);

		getmicrotime(&sc->sc_if.if_lastchange);
		carpstat_inc(carps_opackets6);

		error = ip6_output(m, NULL, NULL, 0, &sc->sc_im6o, NULL);
		if (error) {
			if (error == ENOBUFS)
				carpstat_inc(carps_onomem);
			else
				CARP_LOG(LOG_WARNING, sc,
				    ("ip6_output failed: %d", error));
			sc->sc_if.if_oerrors++;
			if (sc->sc_sendad_errors < INT_MAX)
				sc->sc_sendad_errors++;
			if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS(sc))
				carp_group_demote_adj(&sc->sc_if, 1,
					    "> snd6errors");
			sc->sc_sendad_success = 0;
		} else {
			if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS(sc)) {
				if (++sc->sc_sendad_success >=
				    CARP_SENDAD_MIN_SUCCESS(sc)) {
					carp_group_demote_adj(&sc->sc_if, -1,
					    "< snd6errors");
					sc->sc_sendad_errors = 0;
				}
			} else
				sc->sc_sendad_errors = 0;
		}
	}
#endif /* INET6 */

retry_later:
	sc->cur_vhe = NULL;
	/* 255/255 means we are bowing out: do not schedule another AD. */
	if (advbase != 255 || advskew != 255)
		timeout_add_tv(&vhe->ad_tmo, &tv);
	if_put(ifp);
}
1263 
1264 /*
1265  * Broadcast a gratuitous ARP request containing
1266  * the virtual router MAC address for each IP address
1267  * associated with the virtual router.
1268  */
1269 void
1270 carp_send_arp(struct carp_softc *sc)
1271 {
1272 	struct ifaddr *ifa;
1273 	in_addr_t in;
1274 
1275 	TAILQ_FOREACH(ifa, &sc->sc_if.if_addrlist, ifa_list) {
1276 
1277 		if (ifa->ifa_addr->sa_family != AF_INET)
1278 			continue;
1279 
1280 		in = ifatoia(ifa)->ia_addr.sin_addr.s_addr;
1281 		arprequest(&sc->sc_if, &in, &in, sc->sc_ac.ac_enaddr);
1282 	}
1283 }
1284 
1285 #ifdef INET6
1286 void
1287 carp_send_na(struct carp_softc *sc)
1288 {
1289 	struct ifaddr *ifa;
1290 	struct in6_addr *in6;
1291 	static struct in6_addr mcast = IN6ADDR_LINKLOCAL_ALLNODES_INIT;
1292 
1293 	TAILQ_FOREACH(ifa, &sc->sc_if.if_addrlist, ifa_list) {
1294 
1295 		if (ifa->ifa_addr->sa_family != AF_INET6)
1296 			continue;
1297 
1298 		in6 = &ifatoia6(ifa)->ia_addr.sin6_addr;
1299 		nd6_na_output(&sc->sc_if, &mcast, in6,
1300 		    ND_NA_FLAG_OVERRIDE |
1301 		    (ip6_forwarding ? ND_NA_FLAG_ROUTER : 0), 1, NULL);
1302 	}
1303 }
1304 #endif /* INET6 */
1305 
1306 void
1307 carp_update_lsmask(struct carp_softc *sc)
1308 {
1309 	struct carp_vhost_entry *vhe;
1310 	int count;
1311 
1312 	if (sc->sc_balancing == CARP_BAL_NONE)
1313 		return;
1314 
1315 	sc->sc_lsmask = 0;
1316 	count = 0;
1317 
1318 	KERNEL_ASSERT_LOCKED(); /* touching carp_vhosts */
1319 	SRPL_FOREACH_LOCKED(vhe, &sc->carp_vhosts, vhost_entries) {
1320 		if (vhe->state == MASTER && count < sizeof(sc->sc_lsmask) * 8)
1321 			sc->sc_lsmask |= 1 << count;
1322 		count++;
1323 	}
1324 	sc->sc_lscount = count;
1325 	CARP_LOG(LOG_DEBUG, sc, ("carp_update_lsmask: %x", sc->sc_lsmask));
1326 }
1327 
1328 int
1329 carp_iamatch(struct ifnet *ifp)
1330 {
1331 	struct carp_softc *sc = ifp->if_softc;
1332 	struct carp_vhost_entry *vhe;
1333 	struct srp_ref sr;
1334 	int match = 0;
1335 
1336 	vhe = SRPL_FIRST(&sr, &sc->carp_vhosts);
1337 	if (vhe->state == MASTER)
1338 		match = 1;
1339 	SRPL_LEAVE(&sr);
1340 
1341 	return (match);
1342 }
1343 
1344 int
1345 carp_ourether(struct ifnet *ifp, uint8_t *ena)
1346 {
1347 	struct srpl *cif = &ifp->if_carp;
1348 	struct carp_softc *sc;
1349 	struct srp_ref sr;
1350 	int match = 0;
1351 
1352 	KASSERT(ifp->if_type == IFT_ETHER);
1353 
1354 	SRPL_FOREACH(sc, &sr, cif, sc_list) {
1355 		if ((sc->sc_if.if_flags & (IFF_UP|IFF_RUNNING)) !=
1356 		    (IFF_UP|IFF_RUNNING))
1357 			continue;
1358 		if (carp_vhe_match(sc, ena)) {
1359 			match = 1;
1360 			break;
1361 		}
1362 	}
1363 	SRPL_LEAVE(&sr);
1364 
1365 	return (match);
1366 }
1367 
1368 int
1369 carp_vhe_match(struct carp_softc *sc, uint8_t *ena)
1370 {
1371 	struct carp_vhost_entry *vhe;
1372 	struct srp_ref sr;
1373 	int match = 0;
1374 
1375 	vhe = SRPL_FIRST(&sr, &sc->carp_vhosts);
1376 	match = (vhe->state == MASTER || sc->sc_balancing >= CARP_BAL_IP) &&
1377 	    !memcmp(ena, sc->sc_ac.ac_enaddr, ETHER_ADDR_LEN);
1378 	SRPL_LEAVE(&sr);
1379 
1380 	return (match);
1381 }
1382 
/*
 * Hand an Ethernet frame received on parent interface ifp0 to carp.
 *
 * If the destination address matches an up-and-running carp interface,
 * the frame is passed to that interface and NULL is returned (NULL is
 * also returned when the frame had to be freed because tagging failed).
 * Otherwise the frame is returned to the caller; if it is an Ethernet
 * multicast, a copy is first delivered to every carp interface that is up.
 */
struct mbuf *
carp_input(struct ifnet *ifp0, struct mbuf *m)
{
	struct ether_header *eh;
	struct srpl *cif;
	struct carp_softc *sc;
	struct srp_ref sr;

	eh = mtod(m, struct ether_header *);
	cif = &ifp0->if_carp;

	/* Look for a carp interface that owns the destination address. */
	SRPL_FOREACH(sc, &sr, cif, sc_list) {
		if ((sc->sc_if.if_flags & (IFF_UP|IFF_RUNNING)) !=
		    (IFF_UP|IFF_RUNNING))
			continue;

		if (carp_vhe_match(sc, eh->ether_dhost)) {
			/*
			 * These packets look like layer 2 multicast but they
			 * are unicast at layer 3. With help of the tag the
			 * mbuf's M_MCAST flag can be removed by carp_lsdrop()
			 * after we have passed layer 2.
			 */
			if (sc->sc_balancing == CARP_BAL_IP) {
				struct m_tag *mtag;
				mtag = m_tag_get(PACKET_TAG_CARP_BAL_IP, 0,
				    M_NOWAIT);
				if (mtag == NULL) {
					/* Cannot tag: drop the frame. */
					m_freem(m);
					goto out;
				}
				m_tag_prepend(m, mtag);
			}
			/* Leave with sc set and the SRP reference still held. */
			break;
		}
	}

	/* sc is NULL only if the loop ran to completion without a match. */
	if (sc == NULL) {
		SRPL_LEAVE(&sr);

		if (!ETHER_IS_MULTICAST(eh->ether_dhost))
			return (m);

		/*
		 * XXX Should really check the list of multicast addresses
		 * for each CARP interface _before_ copying.
		 */
		SRPL_FOREACH(sc, &sr, cif, sc_list) {
			struct mbuf *m0;

			if (!(sc->sc_if.if_flags & IFF_UP))
				continue;

			/* Best effort: skip this interface if the copy fails. */
			m0 = m_dup_pkt(m, ETHER_ALIGN, M_DONTWAIT);
			if (m0 == NULL)
				continue;

			if_vinput(&sc->sc_if, m0);
		}
		SRPL_LEAVE(&sr);

		/* The original still belongs to the caller. */
		return (m);
	}

	if_vinput(&sc->sc_if, m);
out:
	SRPL_LEAVE(&sr);

	return (NULL);
}
1453 
1454 int
1455 carp_lsdrop(struct ifnet *ifp, struct mbuf *m, sa_family_t af, u_int32_t *src,
1456     u_int32_t *dst, int drop)
1457 {
1458 	struct carp_softc *sc;
1459 	u_int32_t fold;
1460 	struct m_tag *mtag;
1461 
1462 	if (ifp->if_type != IFT_CARP)
1463 		return 0;
1464 	sc = ifp->if_softc;
1465 	if (sc->sc_balancing == CARP_BAL_NONE)
1466 		return 0;
1467 
1468 	/*
1469 	 * Remove M_MCAST flag from mbuf of balancing ip traffic, since the fact
1470 	 * that it is layer 2 multicast does not implicate that it is also layer
1471 	 * 3 multicast.
1472 	 */
1473 	if (m->m_flags & M_MCAST &&
1474 	    (mtag = m_tag_find(m, PACKET_TAG_CARP_BAL_IP, NULL))) {
1475 		m_tag_delete(m, mtag);
1476 		m->m_flags &= ~M_MCAST;
1477 	}
1478 
1479 	/*
1480 	 * Return without making a drop decision. This allows to clear the
1481 	 * M_MCAST flag and do nothing else.
1482 	 */
1483 	if (!drop)
1484 		return 0;
1485 
1486 	/*
1487 	 * Never drop carp advertisements.
1488 	 * XXX Bad idea to pass all broadcast / multicast traffic?
1489 	 */
1490 	if (m->m_flags & (M_BCAST|M_MCAST))
1491 		return 0;
1492 
1493 	fold = src[0] ^ dst[0];
1494 #ifdef INET6
1495 	if (af == AF_INET6) {
1496 		int i;
1497 		for (i = 1; i < 4; i++)
1498 			fold ^= src[i] ^ dst[i];
1499 	}
1500 #endif
1501 	if (sc->sc_lscount == 0) /* just to be safe */
1502 		return 1;
1503 
1504 	return ((1 << (ntohl(fold) % sc->sc_lscount)) & sc->sc_lsmask) == 0;
1505 }
1506 
/*
 * Locked wrapper around carp_master_down(): v is the vhost entry whose
 * master is considered down, passed as an opaque pointer.
 */
void
carp_timer_down(void *v)
{
	struct carp_vhost_entry *vhe = v;

	NET_LOCK();
	carp_master_down(vhe);
	NET_UNLOCK();
}
1514 
1515 void
1516 carp_master_down(struct carp_vhost_entry *vhe)
1517 {
1518 	struct carp_softc *sc = vhe->parent_sc;
1519 
1520 	NET_ASSERT_LOCKED();
1521 
1522 	switch (vhe->state) {
1523 	case INIT:
1524 		printf("%s: master_down event in INIT state\n",
1525 		    sc->sc_if.if_xname);
1526 		break;
1527 	case MASTER:
1528 		break;
1529 	case BACKUP:
1530 		carp_set_state(vhe, MASTER);
1531 		carp_send_ad(vhe);
1532 		if (sc->sc_balancing == CARP_BAL_NONE && vhe->vhe_leader) {
1533 			carp_send_arp(sc);
1534 			/* Schedule a delayed ARP to deal w/ some L3 switches */
1535 			sc->sc_delayed_arp = 2;
1536 #ifdef INET6
1537 			carp_send_na(sc);
1538 #endif /* INET6 */
1539 		}
1540 		carp_setrun(vhe, 0);
1541 		carpstat_inc(carps_preempt);
1542 		break;
1543 	}
1544 }
1545 
1546 void
1547 carp_setrun_all(struct carp_softc *sc, sa_family_t af)
1548 {
1549 	struct carp_vhost_entry *vhe;
1550 
1551 	KERNEL_ASSERT_LOCKED(); /* touching carp_vhost */
1552 	SRPL_FOREACH_LOCKED(vhe, &sc->carp_vhosts, vhost_entries) {
1553 		carp_setrun(vhe, af);
1554 	}
1555 }
1556 
1557 /*
1558  * When in backup state, af indicates whether to reset the master down timer
1559  * for v4 or v6. If it's set to zero, reset the ones which are already pending.
1560  */
1561 void
1562 carp_setrun(struct carp_vhost_entry *vhe, sa_family_t af)
1563 {
1564 	struct ifnet *ifp;
1565 	struct timeval tv;
1566 	struct carp_softc *sc = vhe->parent_sc;
1567 
1568 	if ((ifp = if_get(sc->sc_carpdevidx)) == NULL) {
1569 		sc->sc_if.if_flags &= ~IFF_RUNNING;
1570 		carp_set_state_all(sc, INIT);
1571 		return;
1572 	}
1573 
1574 	if (memcmp(((struct arpcom *)ifp)->ac_enaddr,
1575 	    sc->sc_ac.ac_enaddr, ETHER_ADDR_LEN) == 0)
1576 		sc->sc_realmac = 1;
1577 	else
1578 		sc->sc_realmac = 0;
1579 
1580 	if_put(ifp);
1581 
1582 	if (sc->sc_if.if_flags & IFF_UP && vhe->vhid > 0 &&
1583 	    (sc->sc_naddrs || sc->sc_naddrs6) && !sc->sc_suppress) {
1584 		sc->sc_if.if_flags |= IFF_RUNNING;
1585 	} else {
1586 		sc->sc_if.if_flags &= ~IFF_RUNNING;
1587 		return;
1588 	}
1589 
1590 	switch (vhe->state) {
1591 	case INIT:
1592 		carp_set_state(vhe, BACKUP);
1593 		carp_setrun(vhe, 0);
1594 		break;
1595 	case BACKUP:
1596 		timeout_del(&vhe->ad_tmo);
1597 		tv.tv_sec = 3 * sc->sc_advbase;
1598 		if (sc->sc_advbase == 0 && vhe->advskew == 0)
1599 			tv.tv_usec = 3 * 1000000 / 256;
1600 		else if (sc->sc_advbase == 0)
1601 			tv.tv_usec = 3 * vhe->advskew * 1000000 / 256;
1602 		else
1603 			tv.tv_usec = vhe->advskew * 1000000 / 256;
1604 		if (vhe->vhe_leader)
1605 			sc->sc_delayed_arp = -1;
1606 		switch (af) {
1607 		case AF_INET:
1608 			timeout_add_tv(&vhe->md_tmo, &tv);
1609 			break;
1610 #ifdef INET6
1611 		case AF_INET6:
1612 			timeout_add_tv(&vhe->md6_tmo, &tv);
1613 			break;
1614 #endif /* INET6 */
1615 		default:
1616 			if (sc->sc_naddrs)
1617 				timeout_add_tv(&vhe->md_tmo, &tv);
1618 			if (sc->sc_naddrs6)
1619 				timeout_add_tv(&vhe->md6_tmo, &tv);
1620 			break;
1621 		}
1622 		break;
1623 	case MASTER:
1624 		tv.tv_sec = sc->sc_advbase;
1625 		if (sc->sc_advbase == 0 && vhe->advskew == 0)
1626 			tv.tv_usec = 1 * 1000000 / 256;
1627 		else
1628 			tv.tv_usec = vhe->advskew * 1000000 / 256;
1629 		timeout_add_tv(&vhe->ad_tmo, &tv);
1630 		break;
1631 	}
1632 }
1633 
1634 void
1635 carp_multicast_cleanup(struct carp_softc *sc)
1636 {
1637 	struct ip_moptions *imo = &sc->sc_imo;
1638 #ifdef INET6
1639 	struct ip6_moptions *im6o = &sc->sc_im6o;
1640 #endif
1641 	u_int16_t n = imo->imo_num_memberships;
1642 
1643 	/* Clean up our own multicast memberships */
1644 	while (n-- > 0) {
1645 		if (imo->imo_membership[n] != NULL) {
1646 			in_delmulti(imo->imo_membership[n]);
1647 			imo->imo_membership[n] = NULL;
1648 		}
1649 	}
1650 	imo->imo_num_memberships = 0;
1651 	imo->imo_ifidx = 0;
1652 
1653 #ifdef INET6
1654 	while (!LIST_EMPTY(&im6o->im6o_memberships)) {
1655 		struct in6_multi_mship *imm =
1656 		    LIST_FIRST(&im6o->im6o_memberships);
1657 
1658 		LIST_REMOVE(imm, i6mm_chain);
1659 		in6_leavegroup(imm);
1660 	}
1661 	im6o->im6o_ifidx = 0;
1662 #endif
1663 
1664 	/* And any other multicast memberships */
1665 	carp_ether_purgemulti(sc);
1666 }
1667 
/*
 * Attach carp interface sc to the parent interface ifp0.
 *
 * The parent must support multicast and be an Ethernet interface, and
 * none of sc's vhids may already be in use on it.  The parent is put
 * into promiscuous mode, sc is detached from any previous parent, the
 * detach and link-state hooks are registered, and sc is inserted into
 * the parent's carp list, which is kept ordered by the vhid of each
 * entry's first vhost.
 *
 * Returns 0 on success, EADDRNOTAVAIL or EINVAL if the parent is
 * unsuitable or a vhid clashes, or the error from ifpromisc().
 */
int
carp_set_ifp(struct carp_softc *sc, struct ifnet *ifp0)
{
	struct srpl *cif;
	struct carp_softc *vr, *last = NULL, *after = NULL;
	int myself = 0, error = 0;

	KASSERT(ifp0->if_index != sc->sc_carpdevidx);
	KERNEL_ASSERT_LOCKED(); /* touching if_carp */

	if ((ifp0->if_flags & IFF_MULTICAST) == 0)
		return (EADDRNOTAVAIL);

	if (ifp0->if_type != IFT_ETHER)
		return (EINVAL);

	cif = &ifp0->if_carp;
	if (carp_check_dup_vhids(sc, cif, NULL))
		return (EINVAL);

	if ((error = ifpromisc(ifp0, 1)))
		return (error);

	/* detach from old interface */
	if (sc->sc_carpdevidx != 0)
		carpdetach(sc);

	/* attach carp interface to physical interface */
	if_detachhook_add(ifp0, &sc->sc_dtask);
	if_linkstatehook_add(ifp0, &sc->sc_ltask);

	sc->sc_carpdevidx = ifp0->if_index;
	/* Inherit only the checksum offload capabilities of the parent. */
	sc->sc_if.if_capabilities = ifp0->if_capabilities &
	    IFCAP_CSUM_MASK;

	/*
	 * Find the insertion point: "last" ends up as the tail of the
	 * list, "after" as the last entry whose first vhid is smaller
	 * than ours.  Also detect whether we are already on the list.
	 */
	SRPL_FOREACH_LOCKED(vr, cif, sc_list) {
		struct carp_vhost_entry *vrhead, *schead;
		last = vr;

		if (vr == sc)
			myself = 1;

		vrhead = SRPL_FIRST_LOCKED(&vr->carp_vhosts);
		schead = SRPL_FIRST_LOCKED(&sc->carp_vhosts);
		if (vrhead->vhid < schead->vhid)
			after = vr;
	}

	if (!myself) {
		/* We're trying to keep things in order */
		if (last == NULL) {
			/* Empty list. */
			SRPL_INSERT_HEAD_LOCKED(&carp_sc_rc, cif,
			    sc, sc_list);
		} else if (after == NULL) {
			/* No entry with a smaller vhid: append at the tail. */
			SRPL_INSERT_AFTER_LOCKED(&carp_sc_rc, last,
			    sc, sc_list);
		} else {
			SRPL_INSERT_AFTER_LOCKED(&carp_sc_rc, after,
			    sc, sc_list);
		}
	}
	if (sc->sc_naddrs || sc->sc_naddrs6)
		sc->sc_if.if_flags |= IFF_UP;
	carp_set_enaddr(sc);

	carp_carpdev_state(sc);

	return (0);
}
1737 
1738 void
1739 carp_set_vhe_enaddr(struct carp_vhost_entry *vhe)
1740 {
1741 	struct carp_softc *sc = vhe->parent_sc;
1742 
1743 	if (vhe->vhid != 0 && sc->sc_carpdevidx != 0) {
1744 		if (vhe->vhe_leader && sc->sc_balancing == CARP_BAL_IP)
1745 			vhe->vhe_enaddr[0] = 1;
1746 		else
1747 			vhe->vhe_enaddr[0] = 0;
1748 		vhe->vhe_enaddr[1] = 0;
1749 		vhe->vhe_enaddr[2] = 0x5e;
1750 		vhe->vhe_enaddr[3] = 0;
1751 		vhe->vhe_enaddr[4] = 1;
1752 		vhe->vhe_enaddr[5] = vhe->vhid;
1753 	} else
1754 		memset(vhe->vhe_enaddr, 0, ETHER_ADDR_LEN);
1755 }
1756 
1757 void
1758 carp_set_enaddr(struct carp_softc *sc)
1759 {
1760 	struct carp_vhost_entry *vhe;
1761 
1762 	KERNEL_ASSERT_LOCKED(); /* touching carp_vhosts */
1763 	SRPL_FOREACH_LOCKED(vhe, &sc->carp_vhosts, vhost_entries)
1764 		carp_set_vhe_enaddr(vhe);
1765 
1766 	vhe = SRPL_FIRST_LOCKED(&sc->carp_vhosts);
1767 
1768 	/*
1769 	 * Use the carp lladdr if the running one isn't manually set.
1770 	 * Only compare static parts of the lladdr.
1771 	 */
1772 	if ((memcmp(sc->sc_ac.ac_enaddr + 1, vhe->vhe_enaddr + 1,
1773 	    ETHER_ADDR_LEN - 2) == 0) ||
1774 	    (!sc->sc_ac.ac_enaddr[0] && !sc->sc_ac.ac_enaddr[1] &&
1775 	    !sc->sc_ac.ac_enaddr[2] && !sc->sc_ac.ac_enaddr[3] &&
1776 	    !sc->sc_ac.ac_enaddr[4] && !sc->sc_ac.ac_enaddr[5]))
1777 		bcopy(vhe->vhe_enaddr, sc->sc_ac.ac_enaddr, ETHER_ADDR_LEN);
1778 
1779 	/* Make sure the enaddr has changed before further twiddling. */
1780 	if (memcmp(sc->sc_ac.ac_enaddr, sc->sc_curlladdr, ETHER_ADDR_LEN) != 0) {
1781 		bcopy(sc->sc_ac.ac_enaddr, LLADDR(sc->sc_if.if_sadl),
1782 		    ETHER_ADDR_LEN);
1783 		bcopy(sc->sc_ac.ac_enaddr, sc->sc_curlladdr, ETHER_ADDR_LEN);
1784 #ifdef INET6
1785 		/*
1786 		 * (re)attach a link-local address which matches
1787 		 * our new MAC address.
1788 		 */
1789 		if (sc->sc_naddrs6)
1790 			in6_ifattach_linklocal(&sc->sc_if, NULL);
1791 #endif
1792 		carp_set_state_all(sc, INIT);
1793 		carp_setrun_all(sc, 0);
1794 	}
1795 }
1796 
1797 void
1798 carp_addr_updated(void *v)
1799 {
1800 	struct carp_softc *sc = (struct carp_softc *) v;
1801 	struct ifaddr *ifa;
1802 	int new_naddrs = 0, new_naddrs6 = 0;
1803 
1804 	TAILQ_FOREACH(ifa, &sc->sc_if.if_addrlist, ifa_list) {
1805 		if (ifa->ifa_addr->sa_family == AF_INET)
1806 			new_naddrs++;
1807 #ifdef INET6
1808 		else if (ifa->ifa_addr->sa_family == AF_INET6)
1809 			new_naddrs6++;
1810 #endif /* INET6 */
1811 	}
1812 
1813 	/* We received address changes from if_addrhooks callback */
1814 	if (new_naddrs != sc->sc_naddrs || new_naddrs6 != sc->sc_naddrs6) {
1815 
1816 		sc->sc_naddrs = new_naddrs;
1817 		sc->sc_naddrs6 = new_naddrs6;
1818 
1819 		/* Re-establish multicast membership removed by in_control */
1820 		if (IN_MULTICAST(sc->sc_peer.s_addr)) {
1821 			if (!in_hasmulti(&sc->sc_peer, &sc->sc_if)) {
1822 				struct in_multi **imm =
1823 				    sc->sc_imo.imo_membership;
1824 				u_int16_t maxmem =
1825 				    sc->sc_imo.imo_max_memberships;
1826 
1827 				memset(&sc->sc_imo, 0, sizeof(sc->sc_imo));
1828 				sc->sc_imo.imo_membership = imm;
1829 				sc->sc_imo.imo_max_memberships = maxmem;
1830 
1831 				if (sc->sc_carpdevidx != 0 &&
1832 				    sc->sc_naddrs > 0)
1833 					carp_join_multicast(sc);
1834 			}
1835 		}
1836 
1837 		if (sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0) {
1838 			sc->sc_if.if_flags &= ~IFF_UP;
1839 			carp_set_state_all(sc, INIT);
1840 		} else
1841 			carp_hmac_prepare(sc);
1842 	}
1843 
1844 	carp_setrun_all(sc, 0);
1845 }
1846 
1847 int
1848 carp_set_addr(struct carp_softc *sc, struct sockaddr_in *sin)
1849 {
1850 	struct in_addr *in = &sin->sin_addr;
1851 	int error;
1852 
1853 	KASSERT(sc->sc_carpdevidx != 0);
1854 
1855 	/* XXX is this necessary? */
1856 	if (in->s_addr == INADDR_ANY) {
1857 		carp_setrun_all(sc, 0);
1858 		return (0);
1859 	}
1860 
1861 	if (sc->sc_naddrs == 0 && (error = carp_join_multicast(sc)) != 0)
1862 		return (error);
1863 
1864 	carp_set_state_all(sc, INIT);
1865 
1866 	return (0);
1867 }
1868 
1869 int
1870 carp_join_multicast(struct carp_softc *sc)
1871 {
1872 	struct ip_moptions *imo = &sc->sc_imo;
1873 	struct in_multi *imm;
1874 	struct in_addr addr;
1875 
1876 	if (!IN_MULTICAST(sc->sc_peer.s_addr))
1877 		return (0);
1878 
1879 	addr.s_addr = sc->sc_peer.s_addr;
1880 	if ((imm = in_addmulti(&addr, &sc->sc_if)) == NULL)
1881 		return (ENOBUFS);
1882 
1883 	imo->imo_membership[0] = imm;
1884 	imo->imo_num_memberships = 1;
1885 	imo->imo_ifidx = sc->sc_if.if_index;
1886 	imo->imo_ttl = CARP_DFLTTL;
1887 	imo->imo_loop = 0;
1888 	return (0);
1889 }
1890 
1891 
1892 #ifdef INET6
1893 int
1894 carp_set_addr6(struct carp_softc *sc, struct sockaddr_in6 *sin6)
1895 {
1896 	int error;
1897 
1898 	KASSERT(sc->sc_carpdevidx != 0);
1899 
1900 	if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
1901 		carp_setrun_all(sc, 0);
1902 		return (0);
1903 	}
1904 
1905 	if (sc->sc_naddrs6 == 0 && (error = carp_join_multicast6(sc)) != 0)
1906 		return (error);
1907 
1908 	carp_set_state_all(sc, INIT);
1909 
1910 	return (0);
1911 }
1912 
1913 int
1914 carp_join_multicast6(struct carp_softc *sc)
1915 {
1916 	struct in6_multi_mship *imm, *imm2;
1917 	struct ip6_moptions *im6o = &sc->sc_im6o;
1918 	struct sockaddr_in6 addr6;
1919 	int error;
1920 
1921 	/* Join IPv6 CARP multicast group */
1922 	memset(&addr6, 0, sizeof(addr6));
1923 	addr6.sin6_family = AF_INET6;
1924 	addr6.sin6_len = sizeof(addr6);
1925 	addr6.sin6_addr.s6_addr16[0] = htons(0xff02);
1926 	addr6.sin6_addr.s6_addr16[1] = htons(sc->sc_if.if_index);
1927 	addr6.sin6_addr.s6_addr8[15] = 0x12;
1928 	if ((imm = in6_joingroup(&sc->sc_if,
1929 	    &addr6.sin6_addr, &error)) == NULL) {
1930 		return (error);
1931 	}
1932 	/* join solicited multicast address */
1933 	memset(&addr6.sin6_addr, 0, sizeof(addr6.sin6_addr));
1934 	addr6.sin6_addr.s6_addr16[0] = htons(0xff02);
1935 	addr6.sin6_addr.s6_addr16[1] = htons(sc->sc_if.if_index);
1936 	addr6.sin6_addr.s6_addr32[1] = 0;
1937 	addr6.sin6_addr.s6_addr32[2] = htonl(1);
1938 	addr6.sin6_addr.s6_addr32[3] = 0;
1939 	addr6.sin6_addr.s6_addr8[12] = 0xff;
1940 	if ((imm2 = in6_joingroup(&sc->sc_if,
1941 	    &addr6.sin6_addr, &error)) == NULL) {
1942 		in6_leavegroup(imm);
1943 		return (error);
1944 	}
1945 
1946 	/* apply v6 multicast membership */
1947 	im6o->im6o_ifidx = sc->sc_if.if_index;
1948 	if (imm)
1949 		LIST_INSERT_HEAD(&im6o->im6o_memberships, imm,
1950 		    i6mm_chain);
1951 	if (imm2)
1952 		LIST_INSERT_HEAD(&im6o->im6o_memberships, imm2,
1953 		    i6mm_chain);
1954 
1955 	return (0);
1956 }
1957 
1958 #endif /* INET6 */
1959 
/*
 * ioctl handler for carp interfaces.
 *
 * Handles address assignment (SIOCSIFADDR), flag changes (SIOCSIFFLAGS),
 * setting and reading the carp configuration (SIOCSVH / SIOCGVH),
 * Ethernet multicast membership and interface group demotion.  Unknown
 * commands return ENOTTY.  On the normal exit path the Ethernet address
 * is re-applied if it no longer matches the one last set; the early
 * returns inside SIOCSIFADDR and SIOCSVH skip that step.
 */
int
carp_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr)
{
	struct proc *p = curproc;	/* XXX */
	struct carp_softc *sc = ifp->if_softc;
	struct carp_vhost_entry *vhe;
	struct carpreq carpr;
	struct ifaddr *ifa = (struct ifaddr *)addr;
	struct ifreq *ifr = (struct ifreq *)addr;
	struct ifnet *ifp0 = NULL;
	int i, error = 0;

	switch (cmd) {
	case SIOCSIFADDR:
		/* An address is only accepted once a parent is configured. */
		if (sc->sc_carpdevidx == 0)
			return (EINVAL);

		switch (ifa->ifa_addr->sa_family) {
		case AF_INET:
			sc->sc_if.if_flags |= IFF_UP;
			error = carp_set_addr(sc, satosin(ifa->ifa_addr));
			break;
#ifdef INET6
		case AF_INET6:
			sc->sc_if.if_flags |= IFF_UP;
			error = carp_set_addr6(sc, satosin6(ifa->ifa_addr));
			break;
#endif /* INET6 */
		default:
			error = EAFNOSUPPORT;
			break;
		}
		break;

	case SIOCSIFFLAGS:
		KERNEL_ASSERT_LOCKED(); /* touching carp_vhosts */
		vhe = SRPL_FIRST_LOCKED(&sc->carp_vhosts);
		if (vhe->state != INIT && !(ifr->ifr_flags & IFF_UP)) {
			/* Going down: send a final "bow out" advertisement. */
			carp_del_all_timeouts(sc);

			/* we need the interface up to bow out */
			sc->sc_if.if_flags |= IFF_UP;
			sc->sc_bow_out = 1;
			carp_vhe_send_ad_all(sc);
			sc->sc_bow_out = 0;

			sc->sc_if.if_flags &= ~IFF_UP;
			carp_set_state_all(sc, INIT);
			carp_setrun_all(sc, 0);
		} else if (vhe->state == INIT && (ifr->ifr_flags & IFF_UP)) {
			sc->sc_if.if_flags |= IFF_UP;
			carp_setrun_all(sc, 0);
		}
		break;

	case SIOCSVH:
		KERNEL_ASSERT_LOCKED(); /* touching carp_vhosts */
		vhe = SRPL_FIRST_LOCKED(&sc->carp_vhosts);
		/* Changing the configuration requires superuser. */
		if ((error = suser(p)) != 0)
			break;
		if ((error = copyin(ifr->ifr_data, &carpr, sizeof carpr)))
			break;
		error = 1;
		if (carpr.carpr_carpdev[0] != '\0' &&
		    (ifp0 = ifunit(carpr.carpr_carpdev)) == NULL)
			return (EINVAL);
		/* A zero peer selects the default carp multicast group. */
		if (carpr.carpr_peer.s_addr == 0)
			sc->sc_peer.s_addr = INADDR_CARP_GROUP;
		else
			sc->sc_peer.s_addr = carpr.carpr_peer.s_addr;
		if (ifp0 != NULL && ifp0->if_index != sc->sc_carpdevidx) {
			if ((error = carp_set_ifp(sc, ifp0)))
				return (error);
		}
		/* Forced state change, judged by the first vhost entry. */
		if (vhe->state != INIT && carpr.carpr_state != vhe->state) {
			switch (carpr.carpr_state) {
			case BACKUP:
				timeout_del(&vhe->ad_tmo);
				carp_set_state_all(sc, BACKUP);
				carp_setrun_all(sc, 0);
				break;
			case MASTER:
				KERNEL_ASSERT_LOCKED();
				/* touching carp_vhosts */
				SRPL_FOREACH_LOCKED(vhe, &sc->carp_vhosts,
				    vhost_entries)
					carp_master_down(vhe);
				break;
			default:
				break;
			}
		}
		/*
		 * This may destroy and recreate every vhost entry, so vhe
		 * must not be used afterwards without being reloaded.
		 */
		if ((error = carp_vhids_ioctl(sc, &carpr)))
			return (error);
		/* A negative advbase leaves the current value untouched. */
		if (carpr.carpr_advbase >= 0) {
			if (carpr.carpr_advbase > 255) {
				error = EINVAL;
				break;
			}
			sc->sc_advbase = carpr.carpr_advbase;
			error--;
		}
		if (memcmp(sc->sc_advskews, carpr.carpr_advskews,
		    sizeof(sc->sc_advskews))) {
			i = 0;
			KERNEL_ASSERT_LOCKED(); /* touching carp_vhosts */
			SRPL_FOREACH_LOCKED(vhe, &sc->carp_vhosts,
			    vhost_entries)
				vhe->advskew = carpr.carpr_advskews[i++];
			bcopy(carpr.carpr_advskews, sc->sc_advskews,
			    sizeof(sc->sc_advskews));
		}
		if (sc->sc_balancing != carpr.carpr_balancing) {
			if (carpr.carpr_balancing > CARP_BAL_MAXID) {
				error = EINVAL;
				break;
			}
			sc->sc_balancing = carpr.carpr_balancing;
			carp_set_enaddr(sc);
			carp_update_lsmask(sc);
		}
		bcopy(carpr.carpr_key, sc->sc_key, sizeof(sc->sc_key));
		/*
		 * NOTE(review): error was reassigned to 0 by the successful
		 * carp_vhids_ioctl() call above, so it is 0 or -1 here and
		 * the EINVAL branch looks unreachable -- confirm intent.
		 */
		if (error > 0)
			error = EINVAL;
		else {
			error = 0;
			carp_hmac_prepare(sc);
			carp_setrun_all(sc, 0);
		}
		break;

	case SIOCGVH:
		memset(&carpr, 0, sizeof(carpr));
		if ((ifp0 = if_get(sc->sc_carpdevidx)) != NULL)
			strlcpy(carpr.carpr_carpdev, ifp0->if_xname, IFNAMSIZ);
		if_put(ifp0);
		i = 0;
		KERNEL_ASSERT_LOCKED(); /* touching carp_vhosts */
		SRPL_FOREACH_LOCKED(vhe, &sc->carp_vhosts, vhost_entries) {
			carpr.carpr_vhids[i] = vhe->vhid;
			carpr.carpr_advskews[i] = vhe->advskew;
			carpr.carpr_states[i] = vhe->state;
			i++;
		}
		carpr.carpr_advbase = sc->sc_advbase;
		carpr.carpr_balancing = sc->sc_balancing;
		/* Only the superuser gets to see the key; others get zeroes. */
		if (suser(p) == 0)
			bcopy(sc->sc_key, carpr.carpr_key,
			    sizeof(carpr.carpr_key));
		carpr.carpr_peer.s_addr = sc->sc_peer.s_addr;
		error = copyout(&carpr, ifr->ifr_data, sizeof(carpr));
		break;

	case SIOCADDMULTI:
		error = carp_ether_addmulti(sc, ifr);
		break;

	case SIOCDELMULTI:
		error = carp_ether_delmulti(sc, ifr);
		break;
	case SIOCAIFGROUP:
	case SIOCDIFGROUP:
		/* Group demotion only needs adjusting if we are demoted. */
		if (sc->sc_demote_cnt)
			carp_ifgroup_ioctl(ifp, cmd, addr);
		break;
	case SIOCSIFGATTR:
		carp_ifgattr_ioctl(ifp, cmd, addr);
		break;
	default:
		error = ENOTTY;
	}

	/* Pick up an Ethernet address that was changed behind our back. */
	if (memcmp(sc->sc_ac.ac_enaddr, sc->sc_curlladdr, ETHER_ADDR_LEN) != 0)
		carp_set_enaddr(sc);
	return (error);
}
2136 
2137 int
2138 carp_check_dup_vhids(struct carp_softc *sc, struct srpl *cif,
2139     struct carpreq *carpr)
2140 {
2141 	struct carp_softc *vr;
2142 	struct carp_vhost_entry *vhe, *vhe0;
2143 	int i;
2144 
2145 	KERNEL_ASSERT_LOCKED(); /* touching if_carp + carp_vhosts */
2146 
2147 	SRPL_FOREACH_LOCKED(vr, cif, sc_list) {
2148 		if (vr == sc)
2149 			continue;
2150 		SRPL_FOREACH_LOCKED(vhe, &vr->carp_vhosts, vhost_entries) {
2151 			if (carpr) {
2152 				for (i = 0; carpr->carpr_vhids[i]; i++) {
2153 					if (vhe->vhid == carpr->carpr_vhids[i])
2154 						return (EINVAL);
2155 				}
2156 			}
2157 			SRPL_FOREACH_LOCKED(vhe0, &sc->carp_vhosts,
2158 			    vhost_entries) {
2159 				if (vhe->vhid == vhe0->vhid)
2160 					return (EINVAL);
2161 			}
2162 		}
2163 	}
2164 	return (0);
2165 }
2166 
2167 int
2168 carp_vhids_ioctl(struct carp_softc *sc, struct carpreq *carpr)
2169 {
2170 	int i, j;
2171 	u_int8_t taken_vhids[256];
2172 
2173 	if (carpr->carpr_vhids[0] == 0 ||
2174 	    !memcmp(sc->sc_vhids, carpr->carpr_vhids, sizeof(sc->sc_vhids)))
2175 		return (0);
2176 
2177 	memset(taken_vhids, 0, sizeof(taken_vhids));
2178 	for (i = 0; carpr->carpr_vhids[i]; i++) {
2179 		struct ifnet *ifp;
2180 
2181 		if (taken_vhids[carpr->carpr_vhids[i]])
2182 			return (EINVAL);
2183 		taken_vhids[carpr->carpr_vhids[i]] = 1;
2184 
2185 		if ((ifp = if_get(sc->sc_carpdevidx)) != NULL) {
2186 			struct srpl *cif;
2187 			cif = &ifp->if_carp;
2188 			if (carp_check_dup_vhids(sc, cif, carpr)) {
2189 				if_put(ifp);
2190 				return (EINVAL);
2191 			}
2192 		}
2193 		if_put(ifp);
2194 		if (carpr->carpr_advskews[i] >= 255)
2195 			return (EINVAL);
2196 	}
2197 	/* set sane balancing defaults */
2198 	if (i <= 1)
2199 		carpr->carpr_balancing = CARP_BAL_NONE;
2200 	else if (carpr->carpr_balancing == CARP_BAL_NONE &&
2201 	    sc->sc_balancing == CARP_BAL_NONE)
2202 		carpr->carpr_balancing = CARP_BAL_IP;
2203 
2204 	/* destroy all */
2205 	carp_del_all_timeouts(sc);
2206 	carp_destroy_vhosts(sc);
2207 	memset(sc->sc_vhids, 0, sizeof(sc->sc_vhids));
2208 
2209 	/* sort vhosts list by vhid */
2210 	for (j = 1; j <= 255; j++) {
2211 		for (i = 0; carpr->carpr_vhids[i]; i++) {
2212 			if (carpr->carpr_vhids[i] != j)
2213 				continue;
2214 			if (carp_new_vhost(sc, carpr->carpr_vhids[i],
2215 			    carpr->carpr_advskews[i]))
2216 				return (ENOMEM);
2217 			sc->sc_vhids[i] = carpr->carpr_vhids[i];
2218 			sc->sc_advskews[i] = carpr->carpr_advskews[i];
2219 		}
2220 	}
2221 	carp_set_enaddr(sc);
2222 	carp_set_state_all(sc, INIT);
2223 	return (0);
2224 }
2225 
2226 void
2227 carp_ifgroup_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr)
2228 {
2229 	struct ifgroupreq *ifgr = (struct ifgroupreq *)addr;
2230 	struct ifg_list	*ifgl;
2231 	int *dm, adj;
2232 
2233 	if (!strcmp(ifgr->ifgr_group, IFG_ALL))
2234 		return;
2235 	adj = ((struct carp_softc *)ifp->if_softc)->sc_demote_cnt;
2236 	if (cmd == SIOCDIFGROUP)
2237 		adj = adj * -1;
2238 
2239 	TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
2240 		if (!strcmp(ifgl->ifgl_group->ifg_group, ifgr->ifgr_group)) {
2241 			dm = &ifgl->ifgl_group->ifg_carp_demoted;
2242 			if (*dm + adj >= 0)
2243 				*dm += adj;
2244 			else
2245 				*dm = 0;
2246 		}
2247 }
2248 
2249 void
2250 carp_ifgattr_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr)
2251 {
2252 	struct ifgroupreq *ifgr = (struct ifgroupreq *)addr;
2253 	struct carp_softc *sc = ifp->if_softc;
2254 
2255 	if (ifgr->ifgr_attrib.ifg_carp_demoted > 0 && (sc->sc_if.if_flags &
2256 	    (IFF_UP|IFF_RUNNING)) == (IFF_UP|IFF_RUNNING))
2257 		carp_vhe_send_ad_all(sc);
2258 }
2259 
2260 void
2261 carp_start(struct ifnet *ifp)
2262 {
2263 	struct carp_softc *sc = ifp->if_softc;
2264 	struct ifnet *ifp0;
2265 	struct mbuf *m;
2266 
2267 	if ((ifp0 = if_get(sc->sc_carpdevidx)) == NULL) {
2268 		ifq_purge(&ifp->if_snd);
2269 		return;
2270 	}
2271 
2272 	while ((m = ifq_dequeue(&ifp->if_snd)) != NULL)
2273 		carp_transmit(sc, ifp0, m);
2274 	if_put(ifp0);
2275 }
2276 
2277 void
2278 carp_transmit(struct carp_softc *sc, struct ifnet *ifp0, struct mbuf *m)
2279 {
2280 	struct ifnet *ifp = &sc->sc_if;
2281 
2282 #if NBPFILTER > 0
2283 	{
2284 		caddr_t if_bpf = ifp->if_bpf;
2285 		if (if_bpf) {
2286 			if (bpf_mtap_ether(if_bpf, m, BPF_DIRECTION_OUT))
2287 				m_freem(m);
2288 		}
2289 	}
2290 #endif /* NBPFILTER > 0 */
2291 
2292 	if (!ISSET(ifp0->if_flags, IFF_RUNNING)) {
2293 		counters_inc(ifp->if_counters, ifc_oerrors);
2294 		m_freem(m);
2295 		return;
2296 	}
2297 
2298 	/*
2299 	 * Do not leak the multicast address when sending
2300 	 * advertisements in 'ip' and 'ip-stealth' balacing
2301 	 * modes.
2302 	 */
2303 	if (sc->sc_balancing == CARP_BAL_IP ||
2304 	    sc->sc_balancing == CARP_BAL_IPSTEALTH) {
2305 		struct ether_header *eh = mtod(m, struct ether_header *);
2306 		memcpy(eh->ether_shost, sc->sc_ac.ac_enaddr,
2307 		    sizeof(eh->ether_shost));
2308 	}
2309 
2310 	if (if_enqueue(ifp0, m))
2311 		counters_inc(ifp->if_counters, ifc_oerrors);
2312 }
2313 
2314 int
2315 carp_enqueue(struct ifnet *ifp, struct mbuf *m)
2316 {
2317 	struct carp_softc *sc = ifp->if_softc;
2318 	struct ifnet *ifp0;
2319 
2320 	/* no ifq_is_priq, cos hfsc on carp doesn't make sense */
2321 
2322 	/*
2323 	 * If the parent of this carp(4) got destroyed while
2324 	 * `m' was being processed, silently drop it.
2325 	 */
2326 	if ((ifp0 = if_get(sc->sc_carpdevidx)) == NULL) {
2327 		m_freem(m);
2328 		return (0);
2329 	}
2330 
2331 	counters_pkt(ifp->if_counters,
2332 	    ifc_opackets, ifc_obytes, m->m_pkthdr.len);
2333 	carp_transmit(sc, ifp0, m);
2334 	if_put(ifp0);
2335 
2336 	return (0);
2337 }
2338 
2339 int
2340 carp_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa,
2341     struct rtentry *rt)
2342 {
2343 	struct carp_softc *sc = ((struct carp_softc *)ifp->if_softc);
2344 	struct carp_vhost_entry *vhe;
2345 	struct srp_ref sr;
2346 	int ismaster;
2347 
2348 	if (sc->cur_vhe == NULL) {
2349 		vhe = SRPL_FIRST(&sr, &sc->carp_vhosts);
2350 		ismaster = (vhe->state == MASTER);
2351 		SRPL_LEAVE(&sr);
2352 	} else {
2353 		ismaster = (sc->cur_vhe->state == MASTER);
2354 	}
2355 
2356 	if ((sc->sc_balancing == CARP_BAL_NONE && !ismaster)) {
2357 		m_freem(m);
2358 		return (ENETUNREACH);
2359 	}
2360 
2361 	return (ether_output(ifp, m, sa, rt));
2362 }
2363 
2364 void
2365 carp_set_state_all(struct carp_softc *sc, int state)
2366 {
2367 	struct carp_vhost_entry *vhe;
2368 
2369 	KERNEL_ASSERT_LOCKED(); /* touching carp_vhosts */
2370 
2371 	SRPL_FOREACH_LOCKED(vhe, &sc->carp_vhosts, vhost_entries) {
2372 		if (vhe->state == state)
2373 			continue;
2374 
2375 		carp_set_state(vhe, state);
2376 	}
2377 }
2378 
2379 void
2380 carp_set_state(struct carp_vhost_entry *vhe, int state)
2381 {
2382 	struct carp_softc *sc = vhe->parent_sc;
2383 	static const char *carp_states[] = { CARP_STATES };
2384 	int loglevel;
2385 	struct carp_vhost_entry *vhe0;
2386 
2387 	KASSERT(vhe->state != state);
2388 
2389 	if (vhe->state == INIT || state == INIT)
2390 		loglevel = LOG_WARNING;
2391 	else
2392 		loglevel = LOG_CRIT;
2393 
2394 	if (sc->sc_vhe_count > 1)
2395 		CARP_LOG(loglevel, sc,
2396 		    ("state transition (vhid %d): %s -> %s", vhe->vhid,
2397 		    carp_states[vhe->state], carp_states[state]));
2398 	else
2399 		CARP_LOG(loglevel, sc,
2400 		    ("state transition: %s -> %s",
2401 		    carp_states[vhe->state], carp_states[state]));
2402 
2403 	vhe->state = state;
2404 	carp_update_lsmask(sc);
2405 
2406 	KERNEL_ASSERT_LOCKED(); /* touching carp_vhosts */
2407 
2408 	sc->sc_if.if_link_state = LINK_STATE_INVALID;
2409 	SRPL_FOREACH_LOCKED(vhe0, &sc->carp_vhosts, vhost_entries) {
2410 		/*
2411 		 * Link must be up if at least one vhe is in state MASTER to
2412 		 * bring or keep route up.
2413 		 */
2414 		if (vhe0->state == MASTER) {
2415 			sc->sc_if.if_link_state = LINK_STATE_UP;
2416 			break;
2417 		} else if (vhe0->state == BACKUP) {
2418 			sc->sc_if.if_link_state = LINK_STATE_DOWN;
2419 		}
2420 	}
2421 	if_link_state_change(&sc->sc_if);
2422 }
2423 
2424 void
2425 carp_group_demote_adj(struct ifnet *ifp, int adj, char *reason)
2426 {
2427 	struct ifg_list	*ifgl;
2428 	int *dm, need_ad;
2429 	struct carp_softc *nil = NULL;
2430 
2431 	if (ifp->if_type == IFT_CARP) {
2432 		dm = &((struct carp_softc *)ifp->if_softc)->sc_demote_cnt;
2433 		if (*dm + adj >= 0)
2434 			*dm += adj;
2435 		else
2436 			*dm = 0;
2437 	}
2438 
2439 	need_ad = 0;
2440 	TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) {
2441 		if (!strcmp(ifgl->ifgl_group->ifg_group, IFG_ALL))
2442 			continue;
2443 		dm = &ifgl->ifgl_group->ifg_carp_demoted;
2444 
2445 		if (*dm + adj >= 0)
2446 			*dm += adj;
2447 		else
2448 			*dm = 0;
2449 
2450 		if (adj > 0 && *dm == 1)
2451 			need_ad = 1;
2452 		CARP_LOG(LOG_ERR, nil,
2453 		    ("%s demoted group %s by %d to %d (%s)",
2454 		    ifp->if_xname, ifgl->ifgl_group->ifg_group,
2455 		    adj, *dm, reason));
2456 	}
2457 	if (need_ad)
2458 		carp_send_ad_all();
2459 }
2460 
2461 int
2462 carp_group_demote_count(struct carp_softc *sc)
2463 {
2464 	struct ifg_list	*ifgl;
2465 	int count = 0;
2466 
2467 	TAILQ_FOREACH(ifgl, &sc->sc_if.if_groups, ifgl_next)
2468 		count += ifgl->ifgl_group->ifg_carp_demoted;
2469 
2470 	if (count == 0 && sc->sc_demote_cnt)
2471 		count = sc->sc_demote_cnt;
2472 
2473 	return (count > 255 ? 255 : count);
2474 }
2475 
2476 void
2477 carp_carpdev_state(void *v)
2478 {
2479 	struct carp_softc *sc = v;
2480 	struct ifnet *ifp0;
2481 	int suppressed = sc->sc_suppress;
2482 
2483 	if ((ifp0 = if_get(sc->sc_carpdevidx)) == NULL)
2484 		return;
2485 
2486 	if (ifp0->if_link_state == LINK_STATE_DOWN ||
2487 	    !(ifp0->if_flags & IFF_UP)) {
2488 		sc->sc_if.if_flags &= ~IFF_RUNNING;
2489 		carp_del_all_timeouts(sc);
2490 		carp_set_state_all(sc, INIT);
2491 		sc->sc_suppress = 1;
2492 		carp_setrun_all(sc, 0);
2493 		if (!suppressed)
2494 			carp_group_demote_adj(&sc->sc_if, 1, "carpdev");
2495 	} else if (suppressed) {
2496 		carp_set_state_all(sc, INIT);
2497 		sc->sc_suppress = 0;
2498 		carp_setrun_all(sc, 0);
2499 		carp_group_demote_adj(&sc->sc_if, -1, "carpdev");
2500 	}
2501 
2502 	if_put(ifp0);
2503 }
2504 
2505 int
2506 carp_ether_addmulti(struct carp_softc *sc, struct ifreq *ifr)
2507 {
2508 	struct ifnet *ifp0;
2509 	struct carp_mc_entry *mc;
2510 	u_int8_t addrlo[ETHER_ADDR_LEN], addrhi[ETHER_ADDR_LEN];
2511 	int error;
2512 
2513 	ifp0 = if_get(sc->sc_carpdevidx);
2514 	if (ifp0 == NULL)
2515 		return (EINVAL);
2516 
2517 	error = ether_addmulti(ifr, (struct arpcom *)&sc->sc_ac);
2518 	if (error != ENETRESET) {
2519 		if_put(ifp0);
2520 		return (error);
2521 	}
2522 
2523 	/*
2524 	 * This is new multicast address.  We have to tell parent
2525 	 * about it.  Also, remember this multicast address so that
2526 	 * we can delete them on unconfigure.
2527 	 */
2528 	mc = malloc(sizeof(*mc), M_DEVBUF, M_NOWAIT);
2529 	if (mc == NULL) {
2530 		error = ENOMEM;
2531 		goto alloc_failed;
2532 	}
2533 
2534 	/*
2535 	 * As ether_addmulti() returns ENETRESET, following two
2536 	 * statement shouldn't fail.
2537 	 */
2538 	(void)ether_multiaddr(&ifr->ifr_addr, addrlo, addrhi);
2539 	ETHER_LOOKUP_MULTI(addrlo, addrhi, &sc->sc_ac, mc->mc_enm);
2540 	memcpy(&mc->mc_addr, &ifr->ifr_addr, ifr->ifr_addr.sa_len);
2541 	LIST_INSERT_HEAD(&sc->carp_mc_listhead, mc, mc_entries);
2542 
2543 	error = (*ifp0->if_ioctl)(ifp0, SIOCADDMULTI, (caddr_t)ifr);
2544 	if (error != 0)
2545 		goto ioctl_failed;
2546 
2547 	if_put(ifp0);
2548 
2549 	return (error);
2550 
2551  ioctl_failed:
2552 	LIST_REMOVE(mc, mc_entries);
2553 	free(mc, M_DEVBUF, sizeof(*mc));
2554  alloc_failed:
2555 	(void)ether_delmulti(ifr, (struct arpcom *)&sc->sc_ac);
2556 	if_put(ifp0);
2557 
2558 	return (error);
2559 }
2560 
2561 int
2562 carp_ether_delmulti(struct carp_softc *sc, struct ifreq *ifr)
2563 {
2564 	struct ifnet *ifp0;
2565 	struct ether_multi *enm;
2566 	struct carp_mc_entry *mc;
2567 	u_int8_t addrlo[ETHER_ADDR_LEN], addrhi[ETHER_ADDR_LEN];
2568 	int error;
2569 
2570 	ifp0 = if_get(sc->sc_carpdevidx);
2571 	if (ifp0 == NULL)
2572 		return (EINVAL);
2573 
2574 	/*
2575 	 * Find a key to lookup carp_mc_entry.  We have to do this
2576 	 * before calling ether_delmulti for obvious reason.
2577 	 */
2578 	if ((error = ether_multiaddr(&ifr->ifr_addr, addrlo, addrhi)) != 0)
2579 		goto rele;
2580 	ETHER_LOOKUP_MULTI(addrlo, addrhi, &sc->sc_ac, enm);
2581 	if (enm == NULL) {
2582 		error = EINVAL;
2583 		goto rele;
2584 	}
2585 
2586 	LIST_FOREACH(mc, &sc->carp_mc_listhead, mc_entries)
2587 		if (mc->mc_enm == enm)
2588 			break;
2589 
2590 	/* We won't delete entries we didn't add */
2591 	if (mc == NULL) {
2592 		error = EINVAL;
2593 		goto rele;
2594 	}
2595 
2596 	error = ether_delmulti(ifr, (struct arpcom *)&sc->sc_ac);
2597 	if (error != ENETRESET)
2598 		goto rele;
2599 
2600 	/* We no longer use this multicast address.  Tell parent so. */
2601 	error = (*ifp0->if_ioctl)(ifp0, SIOCDELMULTI, (caddr_t)ifr);
2602 	if (error == 0) {
2603 		/* And forget about this address. */
2604 		LIST_REMOVE(mc, mc_entries);
2605 		free(mc, M_DEVBUF, sizeof(*mc));
2606 	} else
2607 		(void)ether_addmulti(ifr, (struct arpcom *)&sc->sc_ac);
2608 rele:
2609 	if_put(ifp0);
2610 	return (error);
2611 }
2612 
2613 /*
2614  * Delete any multicast address we have asked to add from parent
2615  * interface.  Called when the carp is being unconfigured.
2616  */
2617 void
2618 carp_ether_purgemulti(struct carp_softc *sc)
2619 {
2620 	struct ifnet *ifp0;		/* Parent. */
2621 	struct carp_mc_entry *mc;
2622 	union {
2623 		struct ifreq ifreq;
2624 		struct {
2625 			char ifr_name[IFNAMSIZ];
2626 			struct sockaddr_storage ifr_ss;
2627 		} ifreq_storage;
2628 	} u;
2629 	struct ifreq *ifr = &u.ifreq;
2630 
2631 	if ((ifp0 = if_get(sc->sc_carpdevidx)) == NULL)
2632 		return;
2633 
2634 	memcpy(ifr->ifr_name, ifp0->if_xname, IFNAMSIZ);
2635 	while ((mc = LIST_FIRST(&sc->carp_mc_listhead)) != NULL) {
2636 		memcpy(&ifr->ifr_addr, &mc->mc_addr, mc->mc_addr.ss_len);
2637 		(void)(*ifp0->if_ioctl)(ifp0, SIOCDELMULTI, (caddr_t)ifr);
2638 		LIST_REMOVE(mc, mc_entries);
2639 		free(mc, M_DEVBUF, sizeof(*mc));
2640 	}
2641 
2642 	if_put(ifp0);
2643 }
2644 
2645 void
2646 carp_vh_ref(void *null, void *v)
2647 {
2648 	struct carp_vhost_entry *vhe = v;
2649 
2650 	refcnt_take(&vhe->vhost_refcnt);
2651 }
2652 
2653 void
2654 carp_vh_unref(void *null, void *v)
2655 {
2656 	struct carp_vhost_entry *vhe = v;
2657 
2658 	if (refcnt_rele(&vhe->vhost_refcnt)) {
2659 		carp_sc_unref(NULL, vhe->parent_sc);
2660 		free(vhe, M_DEVBUF, sizeof(*vhe));
2661 	}
2662 }
2663 
2664 void
2665 carp_sc_ref(void *null, void *s)
2666 {
2667 	struct carp_softc *sc = s;
2668 
2669 	refcnt_take(&sc->sc_refcnt);
2670 }
2671 
2672 void
2673 carp_sc_unref(void *null, void *s)
2674 {
2675 	struct carp_softc *sc = s;
2676 
2677 	refcnt_rele_wake(&sc->sc_refcnt);
2678 }
2679