xref: /openbsd-src/sys/net/if_vxlan.c (revision 8550894424f8a4aa4aafb6cd57229dd6ed7cd9dd)
1 /*	$OpenBSD: if_vxlan.c,v 1.91 2022/06/06 14:45:41 claudio Exp $ */
2 
3 /*
4  * Copyright (c) 2021 David Gwynne <dlg@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include "bpfilter.h"
20 #include "pf.h"
21 
22 #include <sys/param.h>
23 #include <sys/systm.h>
24 #include <sys/kernel.h>
25 #include <sys/mbuf.h>
26 #include <sys/socket.h>
27 #include <sys/ioctl.h>
28 #include <sys/timeout.h>
29 #include <sys/pool.h>
30 #include <sys/tree.h>
31 #include <sys/refcnt.h>
32 #include <sys/smr.h>
33 
34 #include <sys/socket.h>
35 #include <sys/socketvar.h>
36 
37 #include <net/if.h>
38 #include <net/if_var.h>
39 #include <net/if_dl.h>
40 #include <net/if_media.h>
41 #include <net/if_types.h>
42 #include <net/route.h>
43 #include <net/rtable.h>
44 
45 #include <netinet/in.h>
46 #include <netinet/in_var.h>
47 #include <netinet/if_ether.h>
48 #include <netinet/ip.h>
49 #include <netinet/udp.h>
50 #include <netinet/in_pcb.h>
51 #include <netinet/ip_var.h>
52 
53 #ifdef INET6
54 #include <netinet/ip6.h>
55 #include <netinet6/ip6_var.h>
56 #include <netinet6/in6_var.h>
57 #endif
58 
59 /* for bridge stuff */
60 #include <net/if_bridge.h>
61 #include <net/if_etherbridge.h>
62 
63 #if NBPFILTER > 0
64 #include <net/bpf.h>
65 #endif
66 
67 /*
68  * The protocol.
69  */
70 
71 #define VXLANMTU		1492
72 #define VXLAN_PORT		4789
73 
/*
 * On-the-wire VXLAN header: two 32-bit words.  Both words are handled in
 * network byte order; vxlan_input() masks them with htonl(VXLAN_F_I) and
 * htonl(VXLAN_VNI_MASK) before using them as a lookup key.
 */
74 struct vxlan_header {
75 	uint32_t		vxlan_flags;
/* flag bit kept in the peer key; presumably the RFC 7348 "I" bit - confirm */
76 #define VXLAN_F_I			(1U << 27)
77 	uint32_t		vxlan_id;
/* the 24-bit network identifier sits above the low 8 bits of vxlan_id */
78 #define VXLAN_VNI_SHIFT			8
79 #define VXLAN_VNI_MASK			(0xffffffU << VXLAN_VNI_SHIFT)
80 };
81 
82 #define VXLAN_VNI_MAX			0x00ffffffU
83 #define VXLAN_VNI_MIN			0x00000000U
84 
85 /*
86  * The driver.
87  */
88 
/*
 * Storage for a tunnel address of either family.  Which member is valid
 * is recorded next to it (sc_af, vt_af).  The whole union is compared
 * with memcmp() in vxlan_tep_get() and copied into the peer lookup key in
 * vxlan_input(), which zeroes it before filling in an IPv4 address.
 */
89 union vxlan_addr {
90 	struct in_addr		in4;
91 	struct in6_addr		in6;
92 };
93 
94 struct vxlan_softc;
95 
/*
 * A (header, remote address) key registered on a tunnel endpoint so that
 * vxlan_input() can find the interface a received packet belongs to.
 */
96 struct vxlan_peer {
/* linkage in the owning endpoint's vt_peers tree */
97 	RBT_ENTRY(vxlan_peer)	 p_entry;
98 
/* masked flags/VNI words to match */
99 	struct vxlan_header	 p_header;
/* remote address to match; all zeros is looked up as a fallback on input */
100 	union vxlan_addr	 p_addr;
101 
/* interface this peer delivers to; vxlan_tep_up() stores a vxlan_take() ref */
102 	struct vxlan_softc	*p_sc;
103 };
104 
105 RBT_HEAD(vxlan_peers, vxlan_peer);
106 
/*
 * A tunnel endpoint: one UDP socket bound to a local address/port in a
 * routing domain, with the set of peers that receive through it.
 * vxlan_tep_get() matches endpoints on af, rdomain, address and port.
 */
107 struct vxlan_tep {
/* linkage in the global vxlan_teps list */
108 	TAILQ_ENTRY(vxlan_tep)	 vt_entry;
109 
110 	sa_family_t		 vt_af;
111 	unsigned int		 vt_rdomain;
112 	union vxlan_addr	 vt_addr;
113 #define vt_addr4 vt_addr.in4
114 #define vt_addr6 vt_addr.in6
/* copied verbatim into sin_port when binding, so stored as the wire value */
115 	in_port_t		 vt_port;
116 
/* socket created and bound in vxlan_tep_add_addr() */
117 	struct socket		*vt_so;
118 
/* vt_mtx is held around every lookup/insert/remove on vt_peers */
119 	struct mutex		 vt_mtx;
120 	struct vxlan_peers	 vt_peers;
121 };
122 
123 TAILQ_HEAD(vxlan_teps, vxlan_tep);
124 
/*
 * How the interface picks the outer destination address.  vxlan_encap()
 * drops everything while the mode is UNSET, sends straight to sc_dst in
 * P2P mode, and otherwise consults the etherbridge table: LEARNING falls
 * back to sc_dst for unknown destinations, ENDPOINT drops them.
 */
125 enum vxlan_tunnel_mode {
126 	VXLAN_TMODE_UNSET,
127 	VXLAN_TMODE_P2P,	 /* unicast destination, no learning */
128 	VXLAN_TMODE_LEARNING,	 /* multicast destination, learning */
129 	VXLAN_TMODE_ENDPOINT,	 /* unset destination, no learning */
130 };
131 
/*
 * Per-interface state for a vxlan(4) instance.
 */
132 struct vxlan_softc {
/* Ethernet interface state; sc_ac.ac_if is the ifnet */
133 	struct arpcom		 sc_ac;
/* MAC address to tunnel endpoint table used outside P2P mode */
134 	struct etherbridge	 sc_eb;
135 
/* tunnel (outer) configuration */
136 	unsigned int		 sc_rdomain;
/* AF_UNSPEC until a tunnel is configured; vxlan_up() refuses to run then */
137 	sa_family_t		 sc_af;
138 	union vxlan_addr	 sc_src;
139 	union vxlan_addr	 sc_dst;
/* copied verbatim into the UDP source and destination port on output */
140 	in_port_t		 sc_port;
/* template header copied into every outgoing packet */
141 	struct vxlan_header	 sc_header;
/* index of the parent interface used for multicast; 0 when unset */
142 	unsigned int		 sc_if_index0;
143 
/* detach hook registered on the parent interface in learning mode */
144 	struct task		 sc_dtask;
/* multicast membership handle from in_addmulti()/in6_addmulti() */
145 	void			*sc_inmulti;
146 
147 	enum vxlan_tunnel_mode	 sc_mode;
/* peers registered by vxlan_tep_up() and released by vxlan_tep_down() */
148 	struct vxlan_peer	*sc_ucast_peer;
149 	struct vxlan_peer	*sc_mcast_peer;
/* references taken by vxlan_take(); finalized in vxlan_clone_destroy() */
150 	struct refcnt		 sc_refs;
151 
/* either htons(IP_DF) or 0; written straight into ip_off */
152 	uint16_t		 sc_df;
153 	int			 sc_ttl;
154 	int			 sc_txhprio;
155 	int			 sc_rxhprio;
156 
/* runs vxlan_send() on the send queue's softnet taskq */
157 	struct task		 sc_send_task;
158 };
159 
160 void		vxlanattach(int);
161 
162 static int	vxlan_clone_create(struct if_clone *, int);
163 static int	vxlan_clone_destroy(struct ifnet *);
164 
165 static int	vxlan_output(struct ifnet *, struct mbuf *,
166 		    struct sockaddr *, struct rtentry *);
167 static int	vxlan_enqueue(struct ifnet *, struct mbuf *);
168 static void	vxlan_start(struct ifqueue *);
169 static void	vxlan_send(void *);
170 
171 static int	vxlan_ioctl(struct ifnet *, u_long, caddr_t);
172 static int	vxlan_up(struct vxlan_softc *);
173 static int	vxlan_down(struct vxlan_softc *);
174 static int	vxlan_addmulti(struct vxlan_softc *, struct ifnet *);
175 static void	vxlan_delmulti(struct vxlan_softc *);
176 
177 static struct mbuf *
178 		vxlan_input(void *, struct mbuf *,
179 		    struct ip *, struct ip6_hdr *, void *, int);
180 
181 static int	vxlan_set_rdomain(struct vxlan_softc *, const struct ifreq *);
182 static int	vxlan_get_rdomain(struct vxlan_softc *, struct ifreq *);
183 static int	vxlan_set_tunnel(struct vxlan_softc *,
184 		    const struct if_laddrreq *);
185 static int	vxlan_get_tunnel(struct vxlan_softc *, struct if_laddrreq *);
186 static int	vxlan_del_tunnel(struct vxlan_softc *);
187 static int	vxlan_set_vnetid(struct vxlan_softc *, const struct ifreq *);
188 static int	vxlan_get_vnetid(struct vxlan_softc *, struct ifreq *);
189 static int	vxlan_del_vnetid(struct vxlan_softc *);
190 static int	vxlan_set_parent(struct vxlan_softc *,
191 		    const struct if_parent *);
192 static int	vxlan_get_parent(struct vxlan_softc *, struct if_parent *);
193 static int	vxlan_del_parent(struct vxlan_softc *);
194 
195 static int	vxlan_add_addr(struct vxlan_softc *, const struct ifbareq *);
196 static int	vxlan_del_addr(struct vxlan_softc *, const struct ifbareq *);
197 
198 static void	vxlan_detach_hook(void *);
199 
/* cloner registered by vxlanattach(); creates and destroys "vxlanN" */
200 static struct if_clone vxlan_cloner =
201     IF_CLONE_INITIALIZER("vxlan", vxlan_clone_create, vxlan_clone_destroy);
202 
/* callbacks handed to etherbridge_init() via vxlan_etherbridge_ops */
203 static int	 vxlan_eb_port_eq(void *, void *, void *);
204 static void	*vxlan_eb_port_take(void *, void *);
205 static void	 vxlan_eb_port_rele(void *, void *);
206 static size_t	 vxlan_eb_port_ifname(void *, char *, size_t, void *);
207 static void	 vxlan_eb_port_sa(void *, struct sockaddr_storage *, void *);
208 
209 static const struct etherbridge_ops vxlan_etherbridge_ops = {
210 	vxlan_eb_port_eq,
211 	vxlan_eb_port_take,
212 	vxlan_eb_port_rele,
213 	vxlan_eb_port_ifname,
214 	vxlan_eb_port_sa,
215 };
216 
/*
 * vxlan_lock is asserted write-held wherever the vxlan_teps list is
 * modified.  vxlan_endpoint_pool holds union vxlan_addr sized items and
 * is initialised on first use in vxlan_clone_create().
 */
217 static struct rwlock vxlan_lock = RWLOCK_INITIALIZER("vteps");
218 static struct vxlan_teps vxlan_teps = TAILQ_HEAD_INITIALIZER(vxlan_teps);
219 static struct pool vxlan_endpoint_pool;
220 
/* ordering function for the per-endpoint peer tree */
221 static inline int	vxlan_peer_cmp(const struct vxlan_peer *,
222 			    const struct vxlan_peer *);
223 
224 RBT_PROTOTYPE(vxlan_peers, vxlan_peer, p_entry, vxlan_peer_cmp);
225 
/*
 * Pseudo-device attach routine: registers the vxlan interface cloner.
 * The count argument is not used.
 */
226 void
227 vxlanattach(int count)
228 {
229 	if_clone_attach(&vxlan_cloner);
230 }
231 
232 static int
233 vxlan_clone_create(struct if_clone *ifc, int unit)
234 {
235 	struct vxlan_softc *sc;
236 	struct ifnet *ifp;
237 	int error;
238 
239 	if (vxlan_endpoint_pool.pr_size == 0) {
240 		pool_init(&vxlan_endpoint_pool, sizeof(union vxlan_addr),
241 		    0, IPL_SOFTNET, 0, "vxlanep", NULL);
242 	}
243 
244 	sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO|M_CANFAIL);
245 	if (sc == NULL)
246 		return (ENOMEM);
247 
248 	ifp = &sc->sc_ac.ac_if;
249 
250 	snprintf(ifp->if_xname, sizeof(ifp->if_xname), "%s%d",
251 	    ifc->ifc_name, unit);
252 
253 	error = etherbridge_init(&sc->sc_eb, ifp->if_xname,
254 	    &vxlan_etherbridge_ops, sc);
255 	if (error == -1) {
256 		free(sc, M_DEVBUF, sizeof(*sc));
257 		return (error);
258 	}
259 
260 	sc->sc_af = AF_UNSPEC;
261 	sc->sc_txhprio = 0;
262 	sc->sc_rxhprio = IF_HDRPRIO_OUTER;
263 	sc->sc_df = 0;
264 	sc->sc_ttl = IP_DEFAULT_MULTICAST_TTL;
265 
266 	task_set(&sc->sc_dtask, vxlan_detach_hook, sc);
267 	refcnt_init(&sc->sc_refs);
268 	task_set(&sc->sc_send_task, vxlan_send, sc);
269 
270 	ifp->if_softc = sc;
271 	ifp->if_hardmtu = ETHER_MAX_HARDMTU_LEN;
272 	ifp->if_ioctl = vxlan_ioctl;
273 	ifp->if_output = vxlan_output;
274 	ifp->if_enqueue = vxlan_enqueue;
275 	ifp->if_qstart = vxlan_start;
276 	ifp->if_flags = IFF_BROADCAST | IFF_MULTICAST | IFF_SIMPLEX;
277 	ifp->if_xflags = IFXF_CLONED | IFXF_MPSAFE;
278 	ether_fakeaddr(ifp);
279 
280 	if_counters_alloc(ifp);
281 	if_attach(ifp);
282 	ether_ifattach(ifp);
283 
284 	return (0);
285 }
286 
287 static int
288 vxlan_clone_destroy(struct ifnet *ifp)
289 {
290 	struct vxlan_softc *sc = ifp->if_softc;
291 
292 	NET_LOCK();
293 	if (ISSET(ifp->if_flags, IFF_RUNNING))
294 		vxlan_down(sc);
295 	NET_UNLOCK();
296 
297 	ether_ifdetach(ifp);
298 	if_detach(ifp);
299 
300 	etherbridge_destroy(&sc->sc_eb);
301 
302 	refcnt_finalize(&sc->sc_refs, "vxlanfini");
303 
304 	free(sc, M_DEVBUF, sizeof(*sc));
305 
306 	return (0);
307 }
308 
/*
 * Take a reference on the softc and return it, so the call can be used
 * directly in an assignment.  Pair with vxlan_rele().
 */
309 static struct vxlan_softc *
310 vxlan_take(struct vxlan_softc *sc)
311 {
312 	refcnt_take(&sc->sc_refs);
313 	return (sc);
314 }
315 
/*
 * Drop a reference obtained with vxlan_take(), waking the waiter in
 * refcnt_finalize() (see vxlan_clone_destroy()) when appropriate.
 */
316 static void
317 vxlan_rele(struct vxlan_softc *sc)
318 {
319 	refcnt_rele_wake(&sc->sc_refs);
320 }
321 
322 static struct mbuf *
323 vxlan_encap(struct vxlan_softc *sc, struct mbuf *m,
324     struct mbuf *(ip_encap)(struct vxlan_softc *sc, struct mbuf *,
325     const union vxlan_addr *, uint8_t))
326 {
327 	struct ifnet *ifp = &sc->sc_ac.ac_if;
328 	struct m_tag *mtag;
329 	struct mbuf *m0;
330 	union vxlan_addr gateway;
331 	const union vxlan_addr *endpoint;
332 	struct vxlan_header *vh;
333 	struct udphdr *uh;
334 	int prio;
335 	uint8_t tos;
336 
337 	if (sc->sc_mode == VXLAN_TMODE_UNSET)
338 		goto drop;
339 
340 	if (sc->sc_mode == VXLAN_TMODE_P2P)
341 		endpoint = &sc->sc_dst;
342 	else { /* VXLAN_TMODE_LEARNING || VXLAN_TMODE_ENDPOINT */
343 		struct ether_header *eh = mtod(m, struct ether_header *);
344 
345 		smr_read_enter();
346 		endpoint = etherbridge_resolve_ea(&sc->sc_eb,
347 		    (struct ether_addr *)eh->ether_dhost);
348 		if (endpoint != NULL) {
349 			gateway = *endpoint;
350 			endpoint = &gateway;
351 		}
352 		smr_read_leave();
353 
354 		if (endpoint == NULL) {
355 			if (sc->sc_mode == VXLAN_TMODE_ENDPOINT)
356 				goto drop;
357 
358 			/* "flood" to unknown destinations */
359 			endpoint = &sc->sc_dst;
360 		}
361 	}
362 
363 	/* force prepend mbuf because of payload alignment */
364 	m0 = m_get(M_DONTWAIT, m->m_type);
365 	if (m0 == NULL)
366 		goto drop;
367 
368 	m_align(m0, 0);
369 	m0->m_len = 0;
370 
371 	M_MOVE_PKTHDR(m0, m);
372 	m0->m_next = m;
373 
374 	m = m_prepend(m0, sizeof(*vh), M_DONTWAIT);
375 	if (m == NULL)
376 		return (NULL);
377 
378 	vh = mtod(m, struct vxlan_header *);
379 	*vh = sc->sc_header;
380 
381 	m = m_prepend(m, sizeof(*uh), M_DONTWAIT);
382 	if (m == NULL)
383 		return (NULL);
384 
385 	uh = mtod(m, struct udphdr *);
386 	uh->uh_sport = sc->sc_port; /* XXX */
387 	uh->uh_dport = sc->sc_port;
388 	htobem16(&uh->uh_ulen, m->m_pkthdr.len);
389 	uh->uh_sum = htons(0);
390 
391 	SET(m->m_pkthdr.csum_flags, M_UDP_CSUM_OUT);
392 
393 	mtag = m_tag_get(PACKET_TAG_GRE, sizeof(ifp->if_index), M_NOWAIT);
394 	if (mtag == NULL)
395 		goto drop;
396 
397 	*(int *)(mtag + 1) = ifp->if_index;
398 	m_tag_prepend(m, mtag);
399 
400 	prio = sc->sc_txhprio;
401 	if (prio == IF_HDRPRIO_PACKET)
402 		prio = m->m_pkthdr.pf.prio;
403 	tos = IFQ_PRIO2TOS(prio);
404 
405 	CLR(m->m_flags, M_BCAST|M_MCAST);
406 	m->m_pkthdr.ph_rtableid = sc->sc_rdomain;
407 
408 #if NPF > 0
409 	pf_pkt_addr_changed(m);
410 #endif
411 
412 	return ((*ip_encap)(sc, m, endpoint, tos));
413 drop:
414 	m_freem(m);
415 	return (NULL);
416 }
417 
418 static struct mbuf *
419 vxlan_encap_ipv4(struct vxlan_softc *sc, struct mbuf *m,
420     const union vxlan_addr *endpoint, uint8_t tos)
421 {
422 	struct ip *ip;
423 
424 	m = m_prepend(m, sizeof(*ip), M_DONTWAIT);
425 	if (m == NULL)
426 		return (NULL);
427 
428 	ip = mtod(m, struct ip *);
429 	ip->ip_v = IPVERSION;
430 	ip->ip_hl = sizeof(*ip) >> 2;
431 	ip->ip_off = sc->sc_df;
432 	ip->ip_tos = tos;
433 	ip->ip_len = htons(m->m_pkthdr.len);
434 	ip->ip_ttl = sc->sc_ttl;
435 	ip->ip_p = IPPROTO_UDP;
436 	ip->ip_src = sc->sc_src.in4;
437 	ip->ip_dst = endpoint->in4;
438 
439 	return (m);
440 }
441 
#ifdef INET6
/*
 * Prepend an IPv6 header addressed from the tunnel source to the given
 * endpoint.  The payload length is the packet length before the header
 * is added.  Returns the new head of the chain, or NULL if the prepend
 * failed (the chain has been freed in that case).
 */
static struct mbuf *
vxlan_encap_ipv6(struct vxlan_softc *sc, struct mbuf *m,
    const union vxlan_addr *endpoint, uint8_t tos)
{
	struct ip6_hdr *hdr6;
	uint32_t flow = 0;
	int plen = m->m_pkthdr.len;

	m = m_prepend(m, sizeof(*hdr6), M_DONTWAIT);
	if (m == NULL)
		return (NULL);

	/* flow label from the packet's flow id, traffic class from tos */
	if (ISSET(m->m_pkthdr.csum_flags, M_FLOWID))
		flow = htonl(m->m_pkthdr.ph_flowid);
	flow |= htonl((uint32_t)tos << 20);

	hdr6 = mtod(m, struct ip6_hdr *);
	hdr6->ip6_flow = flow;
	hdr6->ip6_vfc |= IPV6_VERSION;
	hdr6->ip6_plen = htons(plen);
	hdr6->ip6_nxt = IPPROTO_UDP;
	hdr6->ip6_hlim = sc->sc_ttl;
	hdr6->ip6_src = sc->sc_src.in6;
	hdr6->ip6_dst = endpoint->in6;

	if (sc->sc_df) {
		SET(m->m_pkthdr.csum_flags, M_IPV6_DF_OUT);
	}

	return (m);
}
#endif /* INET6 */
471 
472 static int
473 vxlan_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
474     struct rtentry *rt)
475 {
476 	struct m_tag *mtag;
477 
478 	mtag = NULL;
479 	while ((mtag = m_tag_find(m, PACKET_TAG_GRE, mtag)) != NULL) {
480 		if (*(int *)(mtag + 1) == ifp->if_index) {
481 			m_freem(m);
482 			return (EIO);
483 		}
484 	}
485 
486 	return (ether_output(ifp, m, dst, rt));
487 }
488 
489 static int
490 vxlan_enqueue(struct ifnet *ifp, struct mbuf *m)
491 {
492 	struct vxlan_softc *sc = ifp->if_softc;
493 	struct ifqueue *ifq = &ifp->if_snd;
494 
495 	if (ifq_enqueue(ifq, m) != 0)
496 		return (ENOBUFS);
497 
498 	task_add(ifq->ifq_softnet, &sc->sc_send_task);
499 
500 	return (0);
501 }
502 
503 static void
504 vxlan_start(struct ifqueue *ifq)
505 {
506 	struct ifnet *ifp = ifq->ifq_if;
507 	struct vxlan_softc *sc = ifp->if_softc;
508 
509 	task_add(ifq->ifq_softnet, &sc->sc_send_task);
510 }
511 
512 static uint64_t
513 vxlan_send_ipv4(struct vxlan_softc *sc, struct mbuf_list *ml)
514 {
515 	struct ip_moptions imo;
516 	struct mbuf *m;
517 	uint64_t oerrors = 0;
518 
519 	imo.imo_ifidx = sc->sc_if_index0;
520 	imo.imo_ttl = sc->sc_ttl;
521 	imo.imo_loop = 0;
522 
523 	NET_LOCK();
524 	while ((m = ml_dequeue(ml)) != NULL) {
525 		if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &imo, NULL, 0) != 0)
526 			oerrors++;
527 	}
528 	NET_UNLOCK();
529 
530 	return (oerrors);
531 }
532 
#ifdef INET6
/*
 * Transmit a list of already encapsulated IPv6 packets with ip6_output()
 * under the net lock.  Multicast options select the parent interface and
 * hop limit and disable loopback.  Returns the number of packets
 * ip6_output() reported an error for.
 */
static uint64_t
vxlan_send_ipv6(struct vxlan_softc *sc, struct mbuf_list *ml)
{
	struct ip6_moptions im6o;
	struct mbuf *pkt;
	uint64_t failed = 0;
	int error;

	im6o.im6o_ifidx = sc->sc_if_index0;
	im6o.im6o_hlim = sc->sc_ttl;
	im6o.im6o_loop = 0;

	NET_LOCK();
	for (;;) {
		pkt = ml_dequeue(ml);
		if (pkt == NULL)
			break;

		error = ip6_output(pkt, NULL, NULL, 0, &im6o, NULL);
		if (error != 0)
			failed++;
	}
	NET_UNLOCK();

	return (failed);
}
#endif /* INET6 */
555 
556 static void
557 vxlan_send(void *arg)
558 {
559 	struct vxlan_softc *sc = arg;
560 	struct ifnet *ifp = &sc->sc_ac.ac_if;
561 	struct mbuf *(*ip_encap)(struct vxlan_softc *, struct mbuf *,
562 	    const union vxlan_addr *, uint8_t);
563 	uint64_t (*ip_send)(struct vxlan_softc *, struct mbuf_list *);
564 	struct mbuf_list ml = MBUF_LIST_INITIALIZER();
565 	struct mbuf *m;
566 	uint64_t oerrors;
567 
568 	if (!ISSET(ifp->if_flags, IFF_RUNNING))
569 		return;
570 
571 	switch (sc->sc_af) {
572 	case AF_INET:
573 		ip_encap = vxlan_encap_ipv4;
574 		ip_send = vxlan_send_ipv4;
575 		break;
576 #ifdef INET6
577 	case AF_INET6:
578 		ip_encap = vxlan_encap_ipv6;
579 		ip_send = vxlan_send_ipv6;
580 		break;
581 #endif
582 	default:
583 		unhandled_af(sc->sc_af);
584 		/* NOTREACHED */
585 	}
586 
587 	while ((m = ifq_dequeue(&ifp->if_snd)) != NULL) {
588 #if NBPFILTER > 0
589 		caddr_t if_bpf = READ_ONCE(ifp->if_bpf);
590 		if (if_bpf != NULL)
591 			bpf_mtap_ether(if_bpf, m, BPF_DIRECTION_OUT);
592 #endif
593 		m = vxlan_encap(sc, m, ip_encap);
594 		if (m == NULL)
595 			continue;
596 
597 		ml_enqueue(&ml, m);
598 	}
599 
600 	oerrors = (*ip_send)(sc, &ml);
601 
602 	counters_add(ifp->if_counters, ifc_oerrors, oerrors);
603 }
604 
605 static struct mbuf *
606 vxlan_input(void *arg, struct mbuf *m, struct ip *ip, struct ip6_hdr *ip6,
607     void *uhp, int hlen)
608 {
609 	struct vxlan_tep *vt = arg;
610 	union vxlan_addr addr;
611 	struct vxlan_peer key, *p;
612 	struct udphdr *uh;
613 	struct vxlan_header *vh;
614 	struct ether_header *eh;
615 	int vhlen = hlen + sizeof(*vh);
616 	struct mbuf *n;
617 	int off;
618 	in_port_t port;
619 	struct vxlan_softc *sc = NULL;
620 	struct ifnet *ifp;
621 	int rxhprio;
622 	uint8_t tos;
623 
624 	if (m->m_pkthdr.len < vhlen)
625 		goto drop;
626 
627 	uh = uhp;
628 	port = uh->uh_sport;
629 
630 	if (ip != NULL) {
631 		memset(&addr, 0, sizeof(addr));
632 		addr.in4 = ip->ip_src;
633 		tos = ip->ip_tos;
634 	}
635 #ifdef INET6
636 	else {
637 		addr.in6 = ip6->ip6_src;
638 		tos = bemtoh32(&ip6->ip6_flow) >> 20;
639 	}
640 #endif
641 
642 	if (m->m_len < vhlen) {
643 		m = m_pullup(m, vhlen);
644 		if (m == NULL)
645 			return (NULL);
646 	}
647 
648 	/* can't use ip/ip6/uh after this */
649 
650 	vh = (struct vxlan_header *)(mtod(m, caddr_t) + hlen);
651 
652 	memset(&key, 0, sizeof(key));
653 	key.p_addr = addr;
654 	key.p_header.vxlan_flags = vh->vxlan_flags & htonl(VXLAN_F_I);
655 	key.p_header.vxlan_id = vh->vxlan_id & htonl(VXLAN_VNI_MASK);
656 
657 	mtx_enter(&vt->vt_mtx);
658 	p = RBT_FIND(vxlan_peers, &vt->vt_peers, &key);
659 	if (p == NULL) {
660 		memset(&key.p_addr, 0, sizeof(key.p_addr));
661 		p = RBT_FIND(vxlan_peers, &vt->vt_peers, &key);
662 	}
663 	if (p != NULL)
664 		sc = vxlan_take(p->p_sc);
665 	mtx_leave(&vt->vt_mtx);
666 
667 	if (sc == NULL)
668 		goto drop;
669 
670 	ifp = &sc->sc_ac.ac_if;
671 	if (ISSET(ifp->if_flags, IFF_LINK0) && port != sc->sc_port)
672 		goto rele_drop;
673 
674 	m_adj(m, vhlen);
675 
676 	if (m->m_pkthdr.len < sizeof(*eh))
677 		goto rele_drop;
678 
679 	if (m->m_len < sizeof(*eh)) {
680 		m = m_pullup(m, sizeof(*eh));
681 		if (m == NULL)
682 			goto rele;
683 	}
684 
685 	n = m_getptr(m, sizeof(*eh), &off);
686 	if (n == NULL)
687 		goto rele_drop;
688 
689 	if (!ALIGNED_POINTER(mtod(n, caddr_t) + off, uint32_t)) {
690 		n = m_dup_pkt(m, ETHER_ALIGN, M_NOWAIT);
691 		m_freem(m);
692 		if (n == NULL)
693 			goto rele;
694 		m = n;
695 	}
696 
697 	if (sc->sc_mode == VXLAN_TMODE_LEARNING) {
698 		eh = mtod(m, struct ether_header *);
699 		etherbridge_map_ea(&sc->sc_eb, &addr,
700 		    (struct ether_addr *)eh->ether_shost);
701 	}
702 
703 	rxhprio = sc->sc_rxhprio;
704 	switch (rxhprio) {
705 	case IF_HDRPRIO_PACKET:
706 		/* nop */
707 		break;
708 	case IF_HDRPRIO_OUTER:
709 		m->m_pkthdr.pf.prio = IFQ_TOS2PRIO(tos);
710 		break;
711 	default:
712 		m->m_pkthdr.pf.prio = rxhprio;
713 		break;                                                  \
714         }                                                               \
715 
716 	if_vinput(ifp, m);
717 rele:
718 	vxlan_rele(sc);
719 	return (NULL);
720 
721 rele_drop:
722 	vxlan_rele(sc);
723 drop:
724 	m_freem(m);
725 	return (NULL);
726 }
727 
728 static int
729 vxlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
730 {
731 	struct vxlan_softc *sc = ifp->if_softc;
732 	struct ifreq *ifr = (struct ifreq *)data;
733 	struct ifbrparam *bparam = (struct ifbrparam *)data;
734 	int error = 0;
735 
736 	switch (cmd) {
737 	case SIOCSIFADDR:
738 		break;
739 	case SIOCSIFFLAGS:
740 		if (ISSET(ifp->if_flags, IFF_UP)) {
741 			if (!ISSET(ifp->if_flags, IFF_RUNNING))
742 				error = vxlan_up(sc);
743 			else
744 				error = 0;
745 		} else {
746 			if (ISSET(ifp->if_flags, IFF_RUNNING))
747 				error = vxlan_down(sc);
748 		}
749 		break;
750 
751 	case SIOCSLIFPHYRTABLE:
752 		error = vxlan_set_rdomain(sc, ifr);
753 		break;
754 	case SIOCGLIFPHYRTABLE:
755 		error = vxlan_get_rdomain(sc, ifr);
756 		break;
757 
758 	case SIOCSLIFPHYADDR:
759 		error = vxlan_set_tunnel(sc, (const struct if_laddrreq *)data);
760 		break;
761 	case SIOCGLIFPHYADDR:
762 		error = vxlan_get_tunnel(sc, (struct if_laddrreq *)data);
763 		break;
764 	case SIOCDIFPHYADDR:
765 		error = vxlan_del_tunnel(sc);
766 		break;
767 
768 	case SIOCSVNETID:
769 		error = vxlan_set_vnetid(sc, ifr);
770 		break;
771 	case SIOCGVNETID:
772 		error = vxlan_get_vnetid(sc, ifr);
773 		break;
774 	case SIOCDVNETID:
775 		error = vxlan_del_vnetid(sc);
776 		break;
777 
778 	case SIOCSIFPARENT:
779 		error = vxlan_set_parent(sc, (struct if_parent *)data);
780 		break;
781 	case SIOCGIFPARENT:
782 		error = vxlan_get_parent(sc, (struct if_parent *)data);
783 		break;
784 	case SIOCDIFPARENT:
785 		error = vxlan_del_parent(sc);
786 		break;
787 
788 	case SIOCSTXHPRIO:
789 		error = if_txhprio_l2_check(ifr->ifr_hdrprio);
790 		if (error != 0)
791 			break;
792 
793 		sc->sc_txhprio = ifr->ifr_hdrprio;
794 		break;
795 	case SIOCGTXHPRIO:
796 		ifr->ifr_hdrprio = sc->sc_txhprio;
797 		break;
798 
799 	case SIOCSRXHPRIO:
800 		error = if_rxhprio_l2_check(ifr->ifr_hdrprio);
801 		if (error != 0)
802 			break;
803 
804 		sc->sc_rxhprio = ifr->ifr_hdrprio;
805 		break;
806 	case SIOCGRXHPRIO:
807 		ifr->ifr_hdrprio = sc->sc_rxhprio;
808 		break;
809 
810 	case SIOCSLIFPHYDF:
811 		/* commit */
812 		sc->sc_df = ifr->ifr_df ? htons(IP_DF) : htons(0);
813 		break;
814 	case SIOCGLIFPHYDF:
815 		ifr->ifr_df = sc->sc_df ? 1 : 0;
816 		break;
817 
818 	case SIOCSLIFPHYTTL:
819 		if (ifr->ifr_ttl < 1 || ifr->ifr_ttl > 0xff) {
820 			error = EINVAL;
821 			break;
822 		}
823 
824 		/* commit */
825 		sc->sc_ttl = (uint8_t)ifr->ifr_ttl;
826 		break;
827 	case SIOCGLIFPHYTTL:
828 		ifr->ifr_ttl = (int)sc->sc_ttl;
829 		break;
830 
831 	case SIOCBRDGSCACHE:
832 		error = etherbridge_set_max(&sc->sc_eb, bparam);
833 		break;
834 	case SIOCBRDGGCACHE:
835 		error = etherbridge_get_max(&sc->sc_eb, bparam);
836 		break;
837 	case SIOCBRDGSTO:
838 		error = etherbridge_set_tmo(&sc->sc_eb, bparam);
839 		break;
840 	case SIOCBRDGGTO:
841 		error = etherbridge_get_tmo(&sc->sc_eb, bparam);
842 		break;
843 
844 	case SIOCBRDGRTS:
845 		error = etherbridge_rtfind(&sc->sc_eb,
846 		    (struct ifbaconf *)data);
847 		break;
848 	case SIOCBRDGFLUSH:
849 		etherbridge_flush(&sc->sc_eb,
850 		    ((struct ifbreq *)data)->ifbr_ifsflags);
851 		break;
852 	case SIOCBRDGSADDR:
853 		error = vxlan_add_addr(sc, (struct ifbareq *)data);
854 		break;
855 	case SIOCBRDGDADDR:
856 		error = vxlan_del_addr(sc, (struct ifbareq *)data);
857 		break;
858 
859 	case SIOCADDMULTI:
860 	case SIOCDELMULTI:
861 		/* no hardware to program */
862 		break;
863 
864 	default:
865 		error = ether_ioctl(ifp, &sc->sc_ac, cmd, data);
866 		break;
867 	}
868 
869 	if (error == ENETRESET) {
870 		/* no hardware to program */
871 		error = 0;
872 	}
873 
874 	return (error);
875 }
876 
877 static struct vxlan_tep *
878 vxlan_tep_get(struct vxlan_softc *sc, const union vxlan_addr *addr)
879 {
880 	struct vxlan_tep *vt;
881 
882 	TAILQ_FOREACH(vt, &vxlan_teps, vt_entry) {
883 		if (sc->sc_af == vt->vt_af &&
884 		    sc->sc_rdomain == vt->vt_rdomain &&
885 		    memcmp(addr, &vt->vt_addr, sizeof(*addr)) == 0 &&
886 		    sc->sc_port == vt->vt_port)
887 			return (vt);
888 	}
889 
890 	return (NULL);
891 }
892 
893 static int
894 vxlan_tep_add_addr(struct vxlan_softc *sc, const union vxlan_addr *addr,
895     struct vxlan_peer *p)
896 {
897 	struct mbuf m;
898 	struct vxlan_tep *vt;
899 	struct socket *so;
900 	struct sockaddr_in *sin;
901 #ifdef INET6
902 	struct sockaddr_in6 *sin6;
903 #endif
904 	int error;
905 
906 	vt = vxlan_tep_get(sc, addr);
907 	if (vt != NULL) {
908 		struct vxlan_peer *op;
909 
910 		mtx_enter(&vt->vt_mtx);
911 		op = RBT_INSERT(vxlan_peers, &vt->vt_peers, p);
912 		mtx_leave(&vt->vt_mtx);
913 
914 		if (op != NULL)
915 			return (EADDRINUSE);
916 
917 		return (0);
918 	}
919 
920 	vt = malloc(sizeof(*vt), M_DEVBUF, M_NOWAIT|M_ZERO);
921 	if (vt == NULL)
922 		return (ENOMEM);
923 
924 	vt->vt_af = sc->sc_af;
925 	vt->vt_rdomain = sc->sc_rdomain;
926 	vt->vt_addr = *addr;
927 	vt->vt_port = sc->sc_port;
928 
929 	mtx_init(&vt->vt_mtx, IPL_SOFTNET);
930 	RBT_INIT(vxlan_peers, &vt->vt_peers);
931 	RBT_INSERT(vxlan_peers, &vt->vt_peers, p);
932 
933 	error = socreate(vt->vt_af, &so, SOCK_DGRAM, IPPROTO_UDP);
934 	if (error != 0)
935 		goto free;
936 
937 	solock(so);
938 
939 	sotoinpcb(so)->inp_upcall = vxlan_input;
940 	sotoinpcb(so)->inp_upcall_arg = vt;
941 
942 	m_inithdr(&m);
943 	m.m_len = sizeof(vt->vt_rdomain);
944 	*mtod(&m, unsigned int *) = vt->vt_rdomain;
945 	error = sosetopt(so, SOL_SOCKET, SO_RTABLE, &m);
946 	if (error != 0)
947 		goto close;
948 
949 	m_inithdr(&m);
950 	switch (vt->vt_af) {
951 	case AF_INET:
952 		sin = mtod(&m, struct sockaddr_in *);
953 		memset(sin, 0, sizeof(*sin));
954 		sin->sin_len = sizeof(*sin);
955 		sin->sin_family = AF_INET;
956 		sin->sin_addr = addr->in4;
957 		sin->sin_port = vt->vt_port;
958 
959 		m.m_len = sizeof(*sin);
960 		break;
961 
962 #ifdef INET6
963 	case AF_INET6:
964 		sin6 = mtod(&m, struct sockaddr_in6 *);
965 		sin6->sin6_len = sizeof(*sin6);
966 		sin6->sin6_family = AF_INET6;
967 		in6_recoverscope(sin6, &addr->in6);
968 		sin6->sin6_port = sc->sc_port;
969 
970 		m.m_len = sizeof(*sin6);
971 		break;
972 #endif
973 	default:
974 		unhandled_af(vt->vt_af);
975 	}
976 
977 	error = sobind(so, &m, curproc);
978 	if (error != 0)
979 		goto close;
980 
981 	sounlock(so);
982 
983 	rw_assert_wrlock(&vxlan_lock);
984 	TAILQ_INSERT_TAIL(&vxlan_teps, vt, vt_entry);
985 
986 	vt->vt_so = so;
987 
988 	return (0);
989 
990 close:
991 	sounlock(so);
992 	soclose(so, MSG_DONTWAIT);
993 free:
994 	free(vt, M_DEVBUF, sizeof(*vt));
995 	return (error);
996 }
997 
998 static void
999 vxlan_tep_del_addr(struct vxlan_softc *sc, const union vxlan_addr *addr,
1000     struct vxlan_peer *p)
1001 {
1002 	struct vxlan_tep *vt;
1003 	int empty;
1004 
1005 	vt = vxlan_tep_get(sc, addr);
1006 	if (vt == NULL)
1007 		panic("unable to find vxlan_tep for peer %p (sc %p)", p, sc);
1008 
1009 	mtx_enter(&vt->vt_mtx);
1010 	RBT_REMOVE(vxlan_peers, &vt->vt_peers, p);
1011 	empty = RBT_EMPTY(vxlan_peers, &vt->vt_peers);
1012 	mtx_leave(&vt->vt_mtx);
1013 
1014 	if (!empty)
1015 		return;
1016 
1017 	rw_assert_wrlock(&vxlan_lock);
1018 	TAILQ_REMOVE(&vxlan_teps, vt, vt_entry);
1019 
1020 	soclose(vt->vt_so, MSG_DONTWAIT);
1021 	free(vt, M_DEVBUF, sizeof(*vt));
1022 }
1023 
1024 static int
1025 vxlan_tep_up(struct vxlan_softc *sc)
1026 {
1027 	struct vxlan_peer *up, *mp;
1028 	int error;
1029 
1030 	up = malloc(sizeof(*up), M_DEVBUF, M_NOWAIT|M_ZERO);
1031 	if (up == NULL)
1032 		return (ENOMEM);
1033 
1034 	if (sc->sc_mode == VXLAN_TMODE_P2P)
1035 		up->p_addr = sc->sc_dst;
1036 	up->p_header = sc->sc_header;
1037 	up->p_sc = vxlan_take(sc);
1038 
1039 	error = vxlan_tep_add_addr(sc, &sc->sc_src, up);
1040 	if (error != 0)
1041 		goto freeup;
1042 
1043 	sc->sc_ucast_peer = up;
1044 
1045 	if (sc->sc_mode != VXLAN_TMODE_LEARNING)
1046 		return (0);
1047 
1048 	mp = malloc(sizeof(*mp), M_DEVBUF, M_NOWAIT|M_ZERO);
1049 	if (mp == NULL) {
1050 		error = ENOMEM;
1051 		goto delup;
1052 	}
1053 
1054 	/* addr is multicast, leave it as 0s */
1055 	mp->p_header = sc->sc_header;
1056 	mp->p_sc = vxlan_take(sc);
1057 
1058 	/* destination address is a multicast group we want to join */
1059 	error = vxlan_tep_add_addr(sc, &sc->sc_dst, up);
1060 	if (error != 0)
1061 		goto freemp;
1062 
1063 	sc->sc_mcast_peer = mp;
1064 
1065 	return (0);
1066 
1067 freemp:
1068 	vxlan_rele(mp->p_sc);
1069 	free(mp, M_DEVBUF, sizeof(*mp));
1070 delup:
1071 	vxlan_tep_del_addr(sc, &sc->sc_src, up);
1072 freeup:
1073 	vxlan_rele(up->p_sc);
1074 	free(up, M_DEVBUF, sizeof(*up));
1075 	return (error);
1076 }
1077 
1078 static void
1079 vxlan_tep_down(struct vxlan_softc *sc)
1080 {
1081 	struct vxlan_peer *up = sc->sc_ucast_peer;
1082 
1083 	if (sc->sc_mode == VXLAN_TMODE_LEARNING) {
1084 		struct vxlan_peer *mp = sc->sc_mcast_peer;
1085 		vxlan_tep_del_addr(sc, &sc->sc_dst, mp);
1086 		vxlan_rele(mp->p_sc);
1087 		free(mp, M_DEVBUF, sizeof(*mp));
1088 	}
1089 
1090 	vxlan_tep_del_addr(sc, &sc->sc_src, up);
1091 	vxlan_rele(up->p_sc);
1092 	free(up, M_DEVBUF, sizeof(*up));
1093 }
1094 
/*
 * Bring the interface up.
 *
 * Called with the net lock held and the interface not running.  The net
 * lock is dropped while vxlan_lock is taken for writing (interruptibly),
 * re-taken briefly to check that nobody else brought the interface up in
 * the meantime, and dropped again for the actual setup.  The net lock is
 * held again on every return path.
 *
 * Setup: start the etherbridge table (not in P2P mode), join the
 * multicast group on the parent and hook its detach event (learning mode
 * only), then register the tunnel endpoints.
 *
 * Returns 0 on success; EDESTADDRREQ if no tunnel is configured; ENXIO if
 * the parent interface is gone (learning mode); EPROTONOSUPPORT if the
 * parent cannot do multicast, or if a parent is set outside learning
 * mode; otherwise the error from rw_enter() or a setup step.
 */
1095 static int
1096 vxlan_up(struct vxlan_softc *sc)
1097 {
1098 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1099 	struct ifnet *ifp0 = NULL;
1100 	int error;
1101 
1102 	KASSERT(!ISSET(ifp->if_flags, IFF_RUNNING));
1103 	NET_ASSERT_LOCKED();
1104 
1105 	if (sc->sc_af == AF_UNSPEC)
1106 		return (EDESTADDRREQ);
1107 	KASSERT(sc->sc_mode != VXLAN_TMODE_UNSET);
1108 
/* give up the net lock before waiting for vxlan_lock */
1109 	NET_UNLOCK();
1110 
1111 	error = rw_enter(&vxlan_lock, RW_WRITE|RW_INTR);
1112 	if (error != 0)
1113 		goto netlock;
1114 
/* the flags may have changed while the net lock was released */
1115 	NET_LOCK();
1116 	if (ISSET(ifp->if_flags, IFF_RUNNING)) {
1117 		/* something else beat us */
1118 		rw_exit(&vxlan_lock);
1119 		return (0);
1120 	}
1121 	NET_UNLOCK();
1122 
1123 	if (sc->sc_mode != VXLAN_TMODE_P2P) {
1124 		error = etherbridge_up(&sc->sc_eb);
1125 		if (error != 0)
1126 			goto unlock;
1127 	}
1128 
1129 	if (sc->sc_mode == VXLAN_TMODE_LEARNING) {
1130 		ifp0 = if_get(sc->sc_if_index0);
1131 		if (ifp0 == NULL) {
1132 			error = ENXIO;
1133 			goto down;
1134 		}
1135 
1136 		/* check again if multicast will work on top of the parent */
1137 		if (!ISSET(ifp0->if_flags, IFF_MULTICAST)) {
1138 			error = EPROTONOSUPPORT;
1139 			goto put;
1140 		}
1141 
1142 		error = vxlan_addmulti(sc, ifp0);
1143 		if (error != 0)
1144 			goto put;
1145 
1146 		/* Register callback if parent wants to unregister */
1147 		if_detachhook_add(ifp0, &sc->sc_dtask);
1148 	} else {
/* a parent interface is only accepted in learning mode */
1149 		if (sc->sc_if_index0 != 0) {
1150 			error = EPROTONOSUPPORT;
1151 			goto down;
1152 		}
1153 	}
1154 
1155 	error = vxlan_tep_up(sc);
1156 	if (error != 0)
1157 		goto del;
1158 
/* ifp0 is NULL here outside learning mode */
1159 	if_put(ifp0);
1160 
/* mark the interface running under the net lock, then release vxlan_lock */
1161 	NET_LOCK();
1162 	SET(ifp->if_flags, IFF_RUNNING);
1163 	rw_exit(&vxlan_lock);
1164 
1165 	return (0);
1166 
/* error unwind: each label undoes one setup step, in reverse order */
1167 del:
1168 	if (sc->sc_mode == VXLAN_TMODE_LEARNING) {
1169 		if (ifp0 != NULL)
1170 			if_detachhook_del(ifp0, &sc->sc_dtask);
1171 		vxlan_delmulti(sc);
1172 	}
1173 put:
1174 	if_put(ifp0);
1175 down:
1176 	if (sc->sc_mode != VXLAN_TMODE_P2P)
1177 		etherbridge_down(&sc->sc_eb);
1178 unlock:
1179 	rw_exit(&vxlan_lock);
1180 netlock:
1181 	NET_LOCK();
1182 
1183 	return (error);
1184 }
1185 
/*
 * Take the interface down.
 *
 * Follows the same locking dance as vxlan_up(): the net lock is dropped
 * while vxlan_lock is taken for writing (interruptibly), re-taken to
 * check that the interface is still running, and dropped again for the
 * teardown.  The net lock is held again on every return path.
 *
 * Teardown: unregister the tunnel endpoints, leave the multicast group
 * and remove the parent detach hook (learning mode only), stop the
 * etherbridge table (not in P2P mode), and wait for the send task before
 * clearing IFF_RUNNING.
 *
 * Returns 0 on success or the error from rw_enter().
 */
1186 static int
1187 vxlan_down(struct vxlan_softc *sc)
1188 {
1189 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1190 	struct ifnet *ifp0;
1191 	int error;
1192 
1193 	KASSERT(ISSET(ifp->if_flags, IFF_RUNNING));
1194 	NET_UNLOCK();
1195 
1196 	error = rw_enter(&vxlan_lock, RW_WRITE|RW_INTR);
1197 	if (error != 0) {
1198 		NET_LOCK();
1199 		return (error);
1200 	}
1201 
/* the flags may have changed while the net lock was released */
1202 	NET_LOCK();
1203 	if (!ISSET(ifp->if_flags, IFF_RUNNING)) {
1204 		/* something else beat us */
1205 		rw_exit(&vxlan_lock);
1206 		return (0);
1207 	}
1208 	NET_UNLOCK();
1209 
1210 	vxlan_tep_down(sc);
1211 
1212 	if (sc->sc_mode == VXLAN_TMODE_LEARNING) {
1213 		vxlan_delmulti(sc);
/* the parent may already be gone, in which case there is no hook to remove */
1214 		ifp0 = if_get(sc->sc_if_index0);
1215 		if (ifp0 != NULL) {
1216 			if_detachhook_del(ifp0, &sc->sc_dtask);
1217 		}
1218 		if_put(ifp0);
1219 	}
1220 
1221 	if (sc->sc_mode != VXLAN_TMODE_P2P)
1222 		etherbridge_down(&sc->sc_eb);
1223 
/* remove the send task and wait for it before clearing the running flag */
1224 	taskq_del_barrier(ifp->if_snd.ifq_softnet, &sc->sc_send_task);
1225 	NET_LOCK();
1226 	CLR(ifp->if_flags, IFF_RUNNING);
1227 	rw_exit(&vxlan_lock);
1228 
1229 	return (0);
1230 }
1231 
1232 static int
1233 vxlan_addmulti(struct vxlan_softc *sc, struct ifnet *ifp0)
1234 {
1235 	int error = 0;
1236 
1237 	NET_LOCK();
1238 
1239 	switch (sc->sc_af) {
1240 	case AF_INET:
1241 		sc->sc_inmulti = in_addmulti(&sc->sc_dst.in4, ifp0);
1242 		if (sc->sc_inmulti == NULL)
1243 			error = EADDRNOTAVAIL;
1244 		break;
1245 #ifdef INET6
1246 	case AF_INET6:
1247 		sc->sc_inmulti = in6_addmulti(&sc->sc_dst.in6, ifp0, &error);
1248 		break;
1249 #endif
1250 	default:
1251 		unhandled_af(sc->sc_af);
1252 	}
1253 
1254 	NET_UNLOCK();
1255 
1256 	return (error);
1257 }
1258 
1259 static void
1260 vxlan_delmulti(struct vxlan_softc *sc)
1261 {
1262 	NET_LOCK();
1263 
1264 	switch (sc->sc_af) {
1265 	case AF_INET:
1266 		in_delmulti(sc->sc_inmulti);
1267 		break;
1268 #ifdef INET6
1269 	case AF_INET6:
1270 		in6_delmulti(sc->sc_inmulti);
1271 		break;
1272 #endif
1273 	default:
1274 		unhandled_af(sc->sc_af);
1275 	}
1276 
1277 	sc->sc_inmulti = NULL; /* keep it tidy */
1278 
1279 	NET_UNLOCK();
1280 }
1281 
1282 static int
1283 vxlan_set_rdomain(struct vxlan_softc *sc, const struct ifreq *ifr)
1284 {
1285 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1286 
1287 	if (ifr->ifr_rdomainid < 0 ||
1288 	    ifr->ifr_rdomainid > RT_TABLEID_MAX)
1289 		return (EINVAL);
1290 	if (!rtable_exists(ifr->ifr_rdomainid))
1291 		return (EADDRNOTAVAIL);
1292 
1293 	if (sc->sc_rdomain == ifr->ifr_rdomainid)
1294 		return (0);
1295 
1296 	if (ISSET(ifp->if_flags, IFF_RUNNING))
1297 		return (EBUSY);
1298 
1299 	/* commit */
1300 	sc->sc_rdomain = ifr->ifr_rdomainid;
1301 	etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);
1302 
1303 	return (0);
1304 }
1305 
1306 static int
1307 vxlan_get_rdomain(struct vxlan_softc *sc, struct ifreq *ifr)
1308 {
1309 	ifr->ifr_rdomainid = sc->sc_rdomain;
1310 
1311 	return (0);
1312 }
1313 
1314 static int
1315 vxlan_set_tunnel(struct vxlan_softc *sc, const struct if_laddrreq *req)
1316 {
1317 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1318 	struct sockaddr *src = (struct sockaddr *)&req->addr;
1319 	struct sockaddr *dst = (struct sockaddr *)&req->dstaddr;
1320 	struct sockaddr_in *src4, *dst4;
1321 #ifdef INET6
1322 	struct sockaddr_in6 *src6, *dst6;
1323 	int error;
1324 #endif
1325 	union vxlan_addr saddr, daddr;
1326 	unsigned int mode = VXLAN_TMODE_ENDPOINT;
1327 	in_port_t port = htons(VXLAN_PORT);
1328 
1329 	memset(&saddr, 0, sizeof(saddr));
1330 	memset(&daddr, 0, sizeof(daddr));
1331 
1332 	/* validate */
1333 	switch (src->sa_family) {
1334 	case AF_INET:
1335 		src4 = (struct sockaddr_in *)src;
1336 		if (in_nullhost(src4->sin_addr) ||
1337 		    IN_MULTICAST(src4->sin_addr.s_addr))
1338 			return (EINVAL);
1339 
1340 		if (src4->sin_port != htons(0))
1341 			port = src4->sin_port;
1342 
1343 		if (dst->sa_family != AF_UNSPEC) {
1344 			if (dst->sa_family != AF_INET)
1345 				return (EINVAL);
1346 
1347 			dst4 = (struct sockaddr_in *)dst;
1348 			if (in_nullhost(dst4->sin_addr))
1349 				return (EINVAL);
1350 
1351 			/* all good */
1352 			mode = IN_MULTICAST(dst4->sin_addr.s_addr) ?
1353 			    VXLAN_TMODE_LEARNING : VXLAN_TMODE_P2P;
1354 			daddr.in4 = dst4->sin_addr;
1355 		}
1356 
1357 		saddr.in4 = src4->sin_addr;
1358 		break;
1359 
1360 #ifdef INET6
1361 	case AF_INET6:
1362 		src6 = (struct sockaddr_in6 *)src;
1363 		if (IN6_IS_ADDR_UNSPECIFIED(&src6->sin6_addr) ||
1364 		    IN6_IS_ADDR_MULTICAST(&src6->sin6_addr))
1365 			return (EINVAL);
1366 
1367 		if (src6->sin6_port != htons(0))
1368 			port = src6->sin6_port;
1369 
1370 		if (dst->sa_family != AF_UNSPEC) {
1371 			if (dst->sa_family != AF_INET6)
1372 				return (EINVAL);
1373 
1374 			dst6 = (struct sockaddr_in6 *)dst;
1375 			if (IN6_IS_ADDR_UNSPECIFIED(&dst6->sin6_addr))
1376 				return (EINVAL);
1377 
1378 			if (src6->sin6_scope_id != dst6->sin6_scope_id)
1379 				return (EINVAL);
1380 
1381 			/* all good */
1382 			mode = IN6_IS_ADDR_MULTICAST(&dst6->sin6_addr) ?
1383 			    VXLAN_TMODE_LEARNING : VXLAN_TMODE_P2P;
1384 			error = in6_embedscope(&daddr.in6, dst6, NULL);
1385 			if (error != 0)
1386 				return (error);
1387 		}
1388 
1389 		error = in6_embedscope(&saddr.in6, src6, NULL);
1390 		if (error != 0)
1391 			return (error);
1392 
1393 		break;
1394 #endif
1395 	default:
1396 		return (EAFNOSUPPORT);
1397 	}
1398 
1399 	if (memcmp(&sc->sc_src, &saddr, sizeof(sc->sc_src)) == 0 &&
1400 	    memcmp(&sc->sc_dst, &daddr, sizeof(sc->sc_dst)) == 0 &&
1401 	    sc->sc_port == port)
1402 		return (0);
1403 
1404 	if (ISSET(ifp->if_flags, IFF_RUNNING))
1405 		return (EBUSY);
1406 
1407 	/* commit */
1408 	sc->sc_af = src->sa_family;
1409 	sc->sc_src = saddr;
1410 	sc->sc_dst = daddr;
1411 	sc->sc_port = port;
1412 	sc->sc_mode = mode;
1413 	etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);
1414 
1415 	return (0);
1416 }
1417 
1418 static int
1419 vxlan_get_tunnel(struct vxlan_softc *sc, struct if_laddrreq *req)
1420 {
1421 	struct sockaddr *dstaddr = (struct sockaddr *)&req->dstaddr;
1422 	struct sockaddr_in *sin;
1423 #ifdef INET6
1424 	struct sockaddr_in6 *sin6;
1425 #endif
1426 
1427 	if (sc->sc_af == AF_UNSPEC)
1428 		return (EADDRNOTAVAIL);
1429 	KASSERT(sc->sc_mode != VXLAN_TMODE_UNSET);
1430 
1431 	memset(&req->addr, 0, sizeof(req->addr));
1432 	memset(&req->dstaddr, 0, sizeof(req->dstaddr));
1433 
1434 	/* default to endpoint */
1435 	dstaddr->sa_len = 2;
1436 	dstaddr->sa_family = AF_UNSPEC;
1437 
1438 	switch (sc->sc_af) {
1439 	case AF_INET:
1440 		sin = (struct sockaddr_in *)&req->addr;
1441 		sin->sin_len = sizeof(*sin);
1442 		sin->sin_family = AF_INET;
1443 		sin->sin_addr = sc->sc_src.in4;
1444 		sin->sin_port = sc->sc_port;
1445 
1446 		if (sc->sc_mode == VXLAN_TMODE_ENDPOINT)
1447 			break;
1448 
1449 		sin = (struct sockaddr_in *)&req->dstaddr;
1450 		sin->sin_len = sizeof(*sin);
1451 		sin->sin_family = AF_INET;
1452 		sin->sin_addr = sc->sc_dst.in4;
1453 		break;
1454 
1455 #ifdef INET6
1456 	case AF_INET6:
1457 		sin6 = (struct sockaddr_in6 *)&req->addr;
1458 		sin6->sin6_len = sizeof(*sin6);
1459 		sin6->sin6_family = AF_INET6;
1460 		in6_recoverscope(sin6, &sc->sc_src.in6);
1461 		sin6->sin6_port = sc->sc_port;
1462 
1463 		if (sc->sc_mode == VXLAN_TMODE_ENDPOINT)
1464 			break;
1465 
1466 		sin6 = (struct sockaddr_in6 *)&req->dstaddr;
1467 		sin6->sin6_len = sizeof(*sin6);
1468 		sin6->sin6_family = AF_INET6;
1469 		in6_recoverscope(sin6, &sc->sc_dst.in6);
1470 		break;
1471 #endif
1472 	default:
1473 		unhandled_af(sc->sc_af);
1474 	}
1475 
1476 	return (0);
1477 }
1478 
1479 static int
1480 vxlan_del_tunnel(struct vxlan_softc *sc)
1481 {
1482 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1483 
1484 	if (sc->sc_af == AF_UNSPEC)
1485 		return (0);
1486 
1487 	if (ISSET(ifp->if_flags, IFF_RUNNING))
1488 		return (EBUSY);
1489 
1490 	/* commit */
1491 	sc->sc_af = AF_UNSPEC;
1492 	memset(&sc->sc_src, 0, sizeof(sc->sc_src));
1493 	memset(&sc->sc_dst, 0, sizeof(sc->sc_dst));
1494 	sc->sc_port = htons(0);
1495 	sc->sc_mode = VXLAN_TMODE_UNSET;
1496 	etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);
1497 
1498 	return (0);
1499 }
1500 
1501 static int
1502 vxlan_set_vnetid(struct vxlan_softc *sc, const struct ifreq *ifr)
1503 {
1504 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1505 	uint32_t vni;
1506 
1507 	if (ifr->ifr_vnetid < VXLAN_VNI_MIN ||
1508 	    ifr->ifr_vnetid > VXLAN_VNI_MAX)
1509 		return (EINVAL);
1510 
1511 	vni = htonl(ifr->ifr_vnetid << VXLAN_VNI_SHIFT);
1512 	if (ISSET(sc->sc_header.vxlan_flags, htonl(VXLAN_F_I)) &&
1513 	    sc->sc_header.vxlan_id == vni)
1514 		return (0);
1515 
1516 	if (ISSET(ifp->if_flags, IFF_RUNNING))
1517 		return (EBUSY);
1518 
1519 	/* commit */
1520 	SET(sc->sc_header.vxlan_flags, htonl(VXLAN_F_I));
1521 	sc->sc_header.vxlan_id = vni;
1522 	etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);
1523 
1524 	return (0);
1525 }
1526 
1527 static int
1528 vxlan_get_vnetid(struct vxlan_softc *sc, struct ifreq *ifr)
1529 {
1530 	uint32_t vni;
1531 
1532 	if (!ISSET(sc->sc_header.vxlan_flags, htonl(VXLAN_F_I)))
1533 		return (EADDRNOTAVAIL);
1534 
1535 	vni = ntohl(sc->sc_header.vxlan_id);
1536 	vni &= VXLAN_VNI_MASK;
1537 	vni >>= VXLAN_VNI_SHIFT;
1538 
1539 	ifr->ifr_vnetid = vni;
1540 
1541 	return (0);
1542 }
1543 
1544 static int
1545 vxlan_del_vnetid(struct vxlan_softc *sc)
1546 {
1547 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1548 
1549 	if (!ISSET(sc->sc_header.vxlan_flags, htonl(VXLAN_F_I)))
1550 		return (0);
1551 
1552 	if (ISSET(ifp->if_flags, IFF_RUNNING))
1553 		return (EBUSY);
1554 
1555 	/* commit */
1556 	CLR(sc->sc_header.vxlan_flags, htonl(VXLAN_F_I));
1557 	sc->sc_header.vxlan_id = htonl(0 << VXLAN_VNI_SHIFT);
1558 	etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);
1559 
1560 	return (0);
1561 }
1562 
1563 static int
1564 vxlan_set_parent(struct vxlan_softc *sc, const struct if_parent *p)
1565 {
1566 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1567 	struct ifnet *ifp0;
1568 	int error = 0;
1569 
1570 	ifp0 = if_unit(p->ifp_parent);
1571 	if (ifp0 == NULL)
1572 		return (ENXIO);
1573 
1574 	if (!ISSET(ifp0->if_flags, IFF_MULTICAST)) {
1575 		error = ENXIO;
1576 		goto put;
1577 	}
1578 
1579 	if (sc->sc_if_index0 == ifp0->if_index)
1580 		goto put;
1581 
1582 	if (ISSET(ifp->if_flags, IFF_RUNNING)) {
1583 		error = EBUSY;
1584 		goto put;
1585 	}
1586 
1587 	/* commit */
1588 	sc->sc_if_index0 = ifp0->if_index;
1589 	etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);
1590 
1591 put:
1592 	if_put(ifp0);
1593 	return (error);
1594 }
1595 
1596 static int
1597 vxlan_get_parent(struct vxlan_softc *sc, struct if_parent *p)
1598 {
1599 	struct ifnet *ifp0;
1600 	int error = 0;
1601 
1602 	ifp0 = if_get(sc->sc_if_index0);
1603 	if (ifp0 == NULL)
1604 		error = EADDRNOTAVAIL;
1605 	else
1606 		strlcpy(p->ifp_parent, ifp0->if_xname, sizeof(p->ifp_parent));
1607 	if_put(ifp0);
1608 
1609 	return (error);
1610 }
1611 
1612 static int
1613 vxlan_del_parent(struct vxlan_softc *sc)
1614 {
1615 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1616 
1617 	if (sc->sc_if_index0 == 0)
1618 		return (0);
1619 
1620 	if (ISSET(ifp->if_flags, IFF_RUNNING))
1621 		return (EBUSY);
1622 
1623 	/* commit */
1624 	sc->sc_if_index0 = 0;
1625 	etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);
1626 
1627 	return (0);
1628 }
1629 
1630 static int
1631 vxlan_add_addr(struct vxlan_softc *sc, const struct ifbareq *ifba)
1632 {
1633 	struct sockaddr_in *sin;
1634 #ifdef INET6
1635 	struct sockaddr_in6 *sin6;
1636 	struct sockaddr_in6 src6 = {
1637 		.sin6_len = sizeof(src6),
1638 		.sin6_family = AF_UNSPEC,
1639 	};
1640 	int error;
1641 #endif
1642 	union vxlan_addr endpoint;
1643 	unsigned int type;
1644 
1645 	switch (sc->sc_mode) {
1646 	case VXLAN_TMODE_UNSET:
1647 		return (ENOPROTOOPT);
1648 	case VXLAN_TMODE_P2P:
1649 		return (EPROTONOSUPPORT);
1650 	default:
1651 		break;
1652 	}
1653 
1654 	/* ignore ifba_ifsname */
1655 
1656 	if (ISSET(ifba->ifba_flags, ~IFBAF_TYPEMASK))
1657 		return (EINVAL);
1658 	switch (ifba->ifba_flags & IFBAF_TYPEMASK) {
1659 	case IFBAF_DYNAMIC:
1660 		type = EBE_DYNAMIC;
1661 		break;
1662 	case IFBAF_STATIC:
1663 		type = EBE_STATIC;
1664 		break;
1665 	default:
1666 		return (EINVAL);
1667 	}
1668 
1669 	memset(&endpoint, 0, sizeof(endpoint));
1670 
1671 	if (ifba->ifba_dstsa.ss_family != sc->sc_af)
1672 		return (EAFNOSUPPORT);
1673 	switch (ifba->ifba_dstsa.ss_family) {
1674 	case AF_INET:
1675 		sin = (struct sockaddr_in *)&ifba->ifba_dstsa;
1676 		if (in_nullhost(sin->sin_addr) ||
1677 		    IN_MULTICAST(sin->sin_addr.s_addr))
1678 			return (EADDRNOTAVAIL);
1679 
1680 		if (sin->sin_port != htons(0))
1681 			return (EADDRNOTAVAIL);
1682 
1683 		endpoint.in4 = sin->sin_addr;
1684 		break;
1685 
1686 #ifdef INET6
1687 	case AF_INET6:
1688 		sin6 = (struct sockaddr_in6 *)&ifba->ifba_dstsa;
1689 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
1690 		    IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
1691 			return (EADDRNOTAVAIL);
1692 
1693 		in6_recoverscope(&src6, &sc->sc_src.in6);
1694 		if (src6.sin6_scope_id != sin6->sin6_scope_id)
1695 			return (EADDRNOTAVAIL);
1696 
1697 		if (sin6->sin6_port != htons(0))
1698 			return (EADDRNOTAVAIL);
1699 
1700 		error = in6_embedscope(&endpoint.in6, sin6, NULL);
1701 		if (error != 0)
1702 			return (error);
1703 
1704 		break;
1705 #endif
1706 	default: /* AF_UNSPEC */
1707 		return (EADDRNOTAVAIL);
1708 	}
1709 
1710 	return (etherbridge_add_addr(&sc->sc_eb, &endpoint,
1711 	    &ifba->ifba_dst, type));
1712 }
1713 
1714 static int
1715 vxlan_del_addr(struct vxlan_softc *sc, const struct ifbareq *ifba)
1716 {
1717 	return (etherbridge_del_addr(&sc->sc_eb, &ifba->ifba_dst));
1718 }
1719 
1720 void
1721 vxlan_detach_hook(void *arg)
1722 {
1723 	struct vxlan_softc *sc = arg;
1724 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1725 
1726 	if (ISSET(ifp->if_flags, IFF_RUNNING)) {
1727 		vxlan_down(sc);
1728 		CLR(ifp->if_flags, IFF_UP);
1729 	}
1730 
1731 	sc->sc_if_index0 = 0;
1732 }
1733 
1734 static int
1735 vxlan_eb_port_eq(void *arg, void *a, void *b)
1736 {
1737 	const union vxlan_addr *va = a, *vb = b;
1738 	size_t i;
1739 
1740 	for (i = 0; i < nitems(va->in6.s6_addr32); i++) {
1741 		if (va->in6.s6_addr32[i] != vb->in6.s6_addr32[i])
1742 			return (0);
1743 	}
1744 
1745 	return (1);
1746 }
1747 
1748 static void *
1749 vxlan_eb_port_take(void *arg, void *port)
1750 {
1751 	union vxlan_addr *endpoint;
1752 
1753 	endpoint = pool_get(&vxlan_endpoint_pool, PR_NOWAIT);
1754 	if (endpoint == NULL)
1755 		return (NULL);
1756 
1757 	*endpoint = *(union vxlan_addr *)port;
1758 
1759 	return (endpoint);
1760 }
1761 
1762 static void
1763 vxlan_eb_port_rele(void *arg, void *port)
1764 {
1765 	union vxlan_addr *endpoint = port;
1766 
1767 	pool_put(&vxlan_endpoint_pool, endpoint);
1768 }
1769 
1770 static size_t
1771 vxlan_eb_port_ifname(void *arg, char *dst, size_t len, void *port)
1772 {
1773 	struct vxlan_softc *sc = arg;
1774 
1775 	return (strlcpy(dst, sc->sc_ac.ac_if.if_xname, len));
1776 }
1777 
1778 static void
1779 vxlan_eb_port_sa(void *arg, struct sockaddr_storage *ss, void *port)
1780 {
1781 	struct vxlan_softc *sc = arg;
1782 	union vxlan_addr *endpoint = port;
1783 
1784 	switch (sc->sc_af) {
1785 	case AF_INET: {
1786 		struct sockaddr_in *sin = (struct sockaddr_in *)ss;
1787 
1788 		sin->sin_len = sizeof(*sin);
1789 		sin->sin_family = AF_INET;
1790 		sin->sin_addr = endpoint->in4;
1791 		break;
1792 	}
1793 #ifdef INET6
1794 	case AF_INET6: {
1795 		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)ss;
1796 
1797 		sin6->sin6_len = sizeof(*sin6);
1798 		sin6->sin6_family = AF_INET6;
1799 		in6_recoverscope(sin6, &endpoint->in6);
1800 		break;
1801 	}
1802 #endif /* INET6 */
1803 	default:
1804 		unhandled_af(sc->sc_af);
1805 	}
1806 }
1807 
1808 static inline int
1809 vxlan_peer_cmp(const struct vxlan_peer *ap, const struct vxlan_peer *bp)
1810 {
1811 	size_t i;
1812 
1813 	if (ap->p_header.vxlan_id > bp->p_header.vxlan_id)
1814 		return (1);
1815 	if (ap->p_header.vxlan_id < bp->p_header.vxlan_id)
1816 		return (-1);
1817 	if (ap->p_header.vxlan_flags > bp->p_header.vxlan_flags)
1818 		return (1);
1819 	if (ap->p_header.vxlan_flags < bp->p_header.vxlan_flags)
1820 		return (-1);
1821 
1822 	for (i = 0; i < nitems(ap->p_addr.in6.s6_addr32); i++) {
1823 		if (ap->p_addr.in6.s6_addr32[i] >
1824 		    bp->p_addr.in6.s6_addr32[i])
1825 			return (1);
1826 		if (ap->p_addr.in6.s6_addr32[i] <
1827 		    bp->p_addr.in6.s6_addr32[i])
1828 			return (-1);
1829 	}
1830 
1831 	return (0);
1832 }
1833 
1834 RBT_GENERATE(vxlan_peers, vxlan_peer, p_entry, vxlan_peer_cmp);
1835