xref: /openbsd-src/sys/net/if_vxlan.c (revision c1a45aed656e7d5627c30c92421893a76f370ccb)
1 /*	$OpenBSD: if_vxlan.c,v 1.90 2022/02/26 04:46:34 dlg Exp $ */
2 
3 /*
4  * Copyright (c) 2021 David Gwynne <dlg@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include "bpfilter.h"
20 #include "pf.h"
21 
22 #include <sys/param.h>
23 #include <sys/systm.h>
24 #include <sys/kernel.h>
25 #include <sys/mbuf.h>
26 #include <sys/socket.h>
27 #include <sys/ioctl.h>
28 #include <sys/timeout.h>
29 #include <sys/pool.h>
30 #include <sys/tree.h>
31 #include <sys/refcnt.h>
32 #include <sys/smr.h>
33 
34 #include <sys/socket.h>
35 #include <sys/socketvar.h>
36 
37 #include <net/if.h>
38 #include <net/if_var.h>
39 #include <net/if_dl.h>
40 #include <net/if_media.h>
41 #include <net/if_types.h>
42 #include <net/route.h>
43 #include <net/rtable.h>
44 
45 #include <netinet/in.h>
46 #include <netinet/in_var.h>
47 #include <netinet/if_ether.h>
48 #include <netinet/ip.h>
49 #include <netinet/udp.h>
50 #include <netinet/in_pcb.h>
51 #include <netinet/ip_var.h>
52 
53 #ifdef INET6
54 #include <netinet/ip6.h>
55 #include <netinet6/ip6_var.h>
56 #include <netinet6/in6_var.h>
57 #endif
58 
59 /* for bridge stuff */
60 #include <net/if_bridge.h>
61 #include <net/if_etherbridge.h>
62 
63 #if NBPFILTER > 0
64 #include <net/bpf.h>
65 #endif
66 
67 /*
68  * The protocol.
69  */
70 
/*
 * VXLANMTU is not referenced in the visible part of this file
 * (presumably a default interface MTU -- confirm against the rest of
 * the driver).  VXLAN_PORT is the default UDP port in host byte order.
 */
71 #define VXLANMTU		1492
72 #define VXLAN_PORT		4789
73 
/*
 * The VXLAN header as it appears in a packet: two 32-bit words.
 * The VXLAN_F_I and VXLAN_VNI_* constants are host byte order values;
 * vxlan_input() applies htonl() to them before masking header fields,
 * so the fields themselves are kept in network byte order.
 */
74 struct vxlan_header {
75 	uint32_t		vxlan_flags;
76 #define VXLAN_F_I			(1U << 27)
77 	uint32_t		vxlan_id;
	/* the 24-bit network identifier sits in the upper 3 bytes of vxlan_id */
78 #define VXLAN_VNI_SHIFT			8
79 #define VXLAN_VNI_MASK			(0xffffffU << VXLAN_VNI_SHIFT)
80 };
81 
/* bounds of the 24-bit network identifier space */
82 #define VXLAN_VNI_MAX			0x00ffffffU
83 #define VXLAN_VNI_MIN			0x00000000U
84 
85 /*
86  * The driver.
87  */
88 
/*
 * An IPv4 or IPv6 tunnel address.  The union carries no tag of its own;
 * users pair it with an address family (sc_af, vt_af).  Code in this file
 * compares whole unions with memcmp() and zeroes the union before storing
 * an IPv4 address, so the bytes past in4 must be kept at 0 for IPv4.
 */
89 union vxlan_addr {
90 	struct in_addr		in4;
91 	struct in6_addr		in6;
92 };
93 
94 struct vxlan_softc;
95 
/*
 * One entry in a tunnel endpoint's peer tree.  vxlan_input() builds a key
 * from the received VXLAN header and the outer source address and looks it
 * up to find the interface (p_sc) a packet belongs to.  A peer whose p_addr
 * is all zeroes acts as a wildcard: the lookup is retried with a zeroed
 * address when the exact one is not found.
 */
96 struct vxlan_peer {
97 	RBT_ENTRY(vxlan_peer)	 p_entry;
98 
99 	struct vxlan_header	 p_header;
100 	union vxlan_addr	 p_addr;
101 
	/* the peer holds a reference on the softc (see vxlan_tep_up()) */
102 	struct vxlan_softc	*p_sc;
103 };
104 
105 RBT_HEAD(vxlan_peers, vxlan_peer);
106 
/*
 * A tunnel endpoint: one bound UDP socket, identified by address family,
 * routing domain, local address and port.  Interfaces with the same
 * tuple share the endpoint and register themselves in vt_peers.
 * Entries live on the global vxlan_teps list.
 */
107 struct vxlan_tep {
108 	TAILQ_ENTRY(vxlan_tep)	 vt_entry;
109 
110 	sa_family_t		 vt_af;
111 	unsigned int		 vt_rdomain;
112 	union vxlan_addr	 vt_addr;
113 #define vt_addr4 vt_addr.in4
114 #define vt_addr6 vt_addr.in6
	/* used directly as sin_port, so kept in network byte order */
115 	in_port_t		 vt_port;
116 
	/* socket whose input upcall is vxlan_input() */
117 	struct socket		*vt_so;
118 
	/* vt_mtx protects vt_peers */
119 	struct mutex		 vt_mtx;
120 	struct vxlan_peers	 vt_peers;
121 };
122 
123 TAILQ_HEAD(vxlan_teps, vxlan_tep);
124 
/*
 * How an interface chooses the outer destination for a frame.
 * UNSET is the state of a freshly created (zeroed) softc; vxlan_encap()
 * drops everything while the mode is unset.
 */
125 enum vxlan_tunnel_mode {
126 	VXLAN_TMODE_UNSET,
127 	VXLAN_TMODE_P2P,	 /* unicast destination, no learning */
128 	VXLAN_TMODE_LEARNING,	 /* multicast destination, learning */
129 	VXLAN_TMODE_ENDPOINT,	 /* unset destination, no learning */
130 };
131 
/*
 * Per-interface state.
 */
132 struct vxlan_softc {
133 	struct arpcom		 sc_ac;		/* Ethernet state, holds the ifnet */
134 	struct etherbridge	 sc_eb;		/* Ethernet address -> endpoint map */
135 
136 	unsigned int		 sc_rdomain;	/* rtable for the outer packets */
137 	sa_family_t		 sc_af;		/* AF_UNSPEC until configured */
138 	union vxlan_addr	 sc_src;	/* outer source address */
139 	union vxlan_addr	 sc_dst;	/* outer destination / mcast group */
140 	in_port_t		 sc_port;	/* UDP port, network byte order */
141 	struct vxlan_header	 sc_header;	/* copied onto every sent packet */
142 	unsigned int		 sc_if_index0;	/* parent interface index */
143 
144 	struct task		 sc_dtask;	/* runs vxlan_detach_hook() */
145 	void			*sc_inmulti;	/* in_addmulti/in6_addmulti handle */
146 
147 	enum vxlan_tunnel_mode	 sc_mode;
148 	struct vxlan_peer	*sc_ucast_peer;	/* peer on the sc_src endpoint */
149 	struct vxlan_peer	*sc_mcast_peer;	/* peer on the sc_dst endpoint */
150 	struct refcnt		 sc_refs;	/* see vxlan_take()/vxlan_rele() */
151 
152 	uint16_t		 sc_df;		/* htons(IP_DF) or 0 */
153 	int			 sc_ttl;
154 	int			 sc_txhprio;
155 	int			 sc_rxhprio;
156 
157 	struct task		 sc_send_task;	/* runs vxlan_send() */
158 };
159 
/* pseudo-device attach entry point */
160 void		vxlanattach(int);
161 
/* interface cloner callbacks */
162 static int	vxlan_clone_create(struct if_clone *, int);
163 static int	vxlan_clone_destroy(struct ifnet *);
164 
/* transmit path */
165 static int	vxlan_output(struct ifnet *, struct mbuf *,
166 		    struct sockaddr *, struct rtentry *);
167 static int	vxlan_enqueue(struct ifnet *, struct mbuf *);
168 static void	vxlan_start(struct ifqueue *);
169 static void	vxlan_send(void *);
170 
/* configuration and interface state changes */
171 static int	vxlan_ioctl(struct ifnet *, u_long, caddr_t);
172 static int	vxlan_up(struct vxlan_softc *);
173 static int	vxlan_down(struct vxlan_softc *);
174 static int	vxlan_addmulti(struct vxlan_softc *, struct ifnet *);
175 static void	vxlan_delmulti(struct vxlan_softc *);
176 
/* receive path: installed as the upcall on each endpoint's UDP socket */
177 static struct mbuf *
178 		vxlan_input(void *, struct mbuf *,
179 		    struct ip *, struct ip6_hdr *, void *, int);
180 
/* ioctl helpers */
181 static int	vxlan_set_rdomain(struct vxlan_softc *, const struct ifreq *);
182 static int	vxlan_get_rdomain(struct vxlan_softc *, struct ifreq *);
183 static int	vxlan_set_tunnel(struct vxlan_softc *,
184 		    const struct if_laddrreq *);
185 static int	vxlan_get_tunnel(struct vxlan_softc *, struct if_laddrreq *);
186 static int	vxlan_del_tunnel(struct vxlan_softc *);
187 static int	vxlan_set_vnetid(struct vxlan_softc *, const struct ifreq *);
188 static int	vxlan_get_vnetid(struct vxlan_softc *, struct ifreq *);
189 static int	vxlan_del_vnetid(struct vxlan_softc *);
190 static int	vxlan_set_parent(struct vxlan_softc *,
191 		    const struct if_parent *);
192 static int	vxlan_get_parent(struct vxlan_softc *, struct if_parent *);
193 static int	vxlan_del_parent(struct vxlan_softc *);
194 
195 static int	vxlan_add_addr(struct vxlan_softc *, const struct ifbareq *);
196 static int	vxlan_del_addr(struct vxlan_softc *, const struct ifbareq *);
197 
/* task callback registered as a detach hook on the parent interface */
198 static void	vxlan_detach_hook(void *);
199 
200 static struct if_clone vxlan_cloner =
201     IF_CLONE_INITIALIZER("vxlan", vxlan_clone_create, vxlan_clone_destroy);
202 
/* callbacks handed to the etherbridge code via vxlan_etherbridge_ops */
203 static int	 vxlan_eb_port_eq(void *, void *, void *);
204 static void	*vxlan_eb_port_take(void *, void *);
205 static void	 vxlan_eb_port_rele(void *, void *);
206 static size_t	 vxlan_eb_port_ifname(void *, char *, size_t, void *);
207 static void	 vxlan_eb_port_sa(void *, struct sockaddr_storage *, void *);
208 
/* positional initialiser: the order must match struct etherbridge_ops */
209 static const struct etherbridge_ops vxlan_etherbridge_ops = {
210 	vxlan_eb_port_eq,
211 	vxlan_eb_port_take,
212 	vxlan_eb_port_rele,
213 	vxlan_eb_port_ifname,
214 	vxlan_eb_port_sa,
215 };
216 
/*
 * vxlan_lock must be write-held when the vxlan_teps list is modified
 * (asserted in vxlan_tep_add_addr() and vxlan_tep_del_addr()).
 * vxlan_endpoint_pool hands out union vxlan_addr sized items and is
 * initialised lazily by vxlan_clone_create().
 */
217 static struct rwlock vxlan_lock = RWLOCK_INITIALIZER("vteps");
218 static struct vxlan_teps vxlan_teps = TAILQ_HEAD_INITIALIZER(vxlan_teps);
219 static struct pool vxlan_endpoint_pool;
220 
221 static inline int	vxlan_peer_cmp(const struct vxlan_peer *,
222 			    const struct vxlan_peer *);
223 
224 RBT_PROTOTYPE(vxlan_peers, vxlan_peer, p_entry, vxlan_peer_cmp);
225 
226 void
227 vxlanattach(int count)
228 {
229 	if_clone_attach(&vxlan_cloner);
230 }
231 
232 static int
233 vxlan_clone_create(struct if_clone *ifc, int unit)
234 {
235 	struct vxlan_softc *sc;
236 	struct ifnet *ifp;
237 	int error;
238 
239 	if (vxlan_endpoint_pool.pr_size == 0) {
240 		pool_init(&vxlan_endpoint_pool, sizeof(union vxlan_addr),
241 		    0, IPL_SOFTNET, 0, "vxlanep", NULL);
242 	}
243 
244 	sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO|M_CANFAIL);
245 	if (sc == NULL)
246 		return (ENOMEM);
247 
248 	ifp = &sc->sc_ac.ac_if;
249 
250 	snprintf(ifp->if_xname, sizeof(ifp->if_xname), "%s%d",
251 	    ifc->ifc_name, unit);
252 
253 	error = etherbridge_init(&sc->sc_eb, ifp->if_xname,
254 	    &vxlan_etherbridge_ops, sc);
255 	if (error == -1) {
256 		free(sc, M_DEVBUF, sizeof(*sc));
257 		return (error);
258 	}
259 
260 	sc->sc_af = AF_UNSPEC;
261 	sc->sc_txhprio = 0;
262 	sc->sc_rxhprio = IF_HDRPRIO_OUTER;
263 	sc->sc_df = 0;
264 	sc->sc_ttl = IP_DEFAULT_MULTICAST_TTL;
265 
266 	task_set(&sc->sc_dtask, vxlan_detach_hook, sc);
267 	refcnt_init(&sc->sc_refs);
268 	task_set(&sc->sc_send_task, vxlan_send, sc);
269 
270 	ifp->if_softc = sc;
271 	ifp->if_hardmtu = ETHER_MAX_HARDMTU_LEN;
272 	ifp->if_ioctl = vxlan_ioctl;
273 	ifp->if_output = vxlan_output;
274 	ifp->if_enqueue = vxlan_enqueue;
275 	ifp->if_qstart = vxlan_start;
276 	ifp->if_flags = IFF_BROADCAST | IFF_MULTICAST | IFF_SIMPLEX;
277 	ifp->if_xflags = IFXF_CLONED | IFXF_MPSAFE;
278 	ether_fakeaddr(ifp);
279 
280 	if_counters_alloc(ifp);
281 	if_attach(ifp);
282 	ether_ifattach(ifp);
283 
284 	return (0);
285 }
286 
287 static int
288 vxlan_clone_destroy(struct ifnet *ifp)
289 {
290 	struct vxlan_softc *sc = ifp->if_softc;
291 
292 	NET_LOCK();
293 	if (ISSET(ifp->if_flags, IFF_RUNNING))
294 		vxlan_down(sc);
295 	NET_UNLOCK();
296 
297 	ether_ifdetach(ifp);
298 	if_detach(ifp);
299 
300 	etherbridge_destroy(&sc->sc_eb);
301 
302 	refcnt_finalize(&sc->sc_refs, "vxlanfini");
303 
304 	free(sc, M_DEVBUF, sizeof(*sc));
305 
306 	return (0);
307 }
308 
309 static struct vxlan_softc *
310 vxlan_take(struct vxlan_softc *sc)
311 {
312 	refcnt_take(&sc->sc_refs);
313 	return (sc);
314 }
315 
316 static void
317 vxlan_rele(struct vxlan_softc *sc)
318 {
319 	refcnt_rele_wake(&sc->sc_refs);
320 }
321 
322 static struct mbuf *
323 vxlan_encap(struct vxlan_softc *sc, struct mbuf *m,
324     struct mbuf *(ip_encap)(struct vxlan_softc *sc, struct mbuf *,
325     const union vxlan_addr *, uint8_t))
326 {
327 	struct ifnet *ifp = &sc->sc_ac.ac_if;
328 	struct m_tag *mtag;
329 	struct mbuf *m0;
330 	union vxlan_addr gateway;
331 	const union vxlan_addr *endpoint;
332 	struct vxlan_header *vh;
333 	struct udphdr *uh;
334 	int prio;
335 	uint8_t tos;
336 
337 	if (sc->sc_mode == VXLAN_TMODE_UNSET)
338 		goto drop;
339 
340 	if (sc->sc_mode == VXLAN_TMODE_P2P)
341 		endpoint = &sc->sc_dst;
342 	else { /* VXLAN_TMODE_LEARNING || VXLAN_TMODE_ENDPOINT */
343 		struct ether_header *eh = mtod(m, struct ether_header *);
344 
345 		smr_read_enter();
346 		endpoint = etherbridge_resolve_ea(&sc->sc_eb,
347 		    (struct ether_addr *)eh->ether_dhost);
348 		if (endpoint != NULL) {
349 			gateway = *endpoint;
350 			endpoint = &gateway;
351 		}
352 		smr_read_leave();
353 
354 		if (endpoint == NULL) {
355 			if (sc->sc_mode == VXLAN_TMODE_ENDPOINT)
356 				goto drop;
357 
358 			/* "flood" to unknown destinations */
359 			endpoint = &sc->sc_dst;
360 		}
361 	}
362 
363 	/* force prepend mbuf because of payload alignment */
364 	m0 = m_get(M_DONTWAIT, m->m_type);
365 	if (m0 == NULL)
366 		goto drop;
367 
368 	m_align(m0, 0);
369 	m0->m_len = 0;
370 
371 	M_MOVE_PKTHDR(m0, m);
372 	m0->m_next = m;
373 
374 	m = m_prepend(m0, sizeof(*vh), M_DONTWAIT);
375 	if (m == NULL)
376 		return (NULL);
377 
378 	vh = mtod(m, struct vxlan_header *);
379 	*vh = sc->sc_header;
380 
381 	m = m_prepend(m, sizeof(*uh), M_DONTWAIT);
382 	if (m == NULL)
383 		return (NULL);
384 
385 	uh = mtod(m, struct udphdr *);
386 	uh->uh_sport = sc->sc_port; /* XXX */
387 	uh->uh_dport = sc->sc_port;
388 	htobem16(&uh->uh_ulen, m->m_pkthdr.len);
389 	uh->uh_sum = htons(0);
390 
391 	SET(m->m_pkthdr.csum_flags, M_UDP_CSUM_OUT);
392 
393 	mtag = m_tag_get(PACKET_TAG_GRE, sizeof(ifp->if_index), M_NOWAIT);
394 	if (mtag == NULL)
395 		goto drop;
396 
397 	*(int *)(mtag + 1) = ifp->if_index;
398 	m_tag_prepend(m, mtag);
399 
400 	prio = sc->sc_txhprio;
401 	if (prio == IF_HDRPRIO_PACKET)
402 		prio = m->m_pkthdr.pf.prio;
403 	tos = IFQ_PRIO2TOS(prio);
404 
405 	CLR(m->m_flags, M_BCAST|M_MCAST);
406 	m->m_pkthdr.ph_rtableid = sc->sc_rdomain;
407 
408 #if NPF > 0
409 	pf_pkt_addr_changed(m);
410 #endif
411 
412 	return ((*ip_encap)(sc, m, endpoint, tos));
413 drop:
414 	m_freem(m);
415 	return (NULL);
416 }
417 
418 static struct mbuf *
419 vxlan_encap_ipv4(struct vxlan_softc *sc, struct mbuf *m,
420     const union vxlan_addr *endpoint, uint8_t tos)
421 {
422 	struct ip *ip;
423 
424 	m = m_prepend(m, sizeof(*ip), M_DONTWAIT);
425 	if (m == NULL)
426 		return (NULL);
427 
428 	ip = mtod(m, struct ip *);
429 	ip->ip_v = IPVERSION;
430 	ip->ip_hl = sizeof(*ip) >> 2;
431 	ip->ip_off = sc->sc_df;
432 	ip->ip_tos = tos;
433 	ip->ip_len = htons(m->m_pkthdr.len);
434 	ip->ip_ttl = sc->sc_ttl;
435 	ip->ip_p = IPPROTO_UDP;
436 	ip->ip_src = sc->sc_src.in4;
437 	ip->ip_dst = endpoint->in4;
438 
439 	return (m);
440 }
441 
442 #ifdef INET6
443 static struct mbuf *
444 vxlan_encap_ipv6(struct vxlan_softc *sc, struct mbuf *m,
445     const union vxlan_addr *endpoint, uint8_t tos)
446 {
447 	struct ip6_hdr *ip6;
448 	int len = m->m_pkthdr.len;
449 
450 	m = m_prepend(m, sizeof(*ip6), M_DONTWAIT);
451 	if (m == NULL)
452 		return (NULL);
453 
454 	ip6 = mtod(m, struct ip6_hdr *);
455 	ip6->ip6_flow = ISSET(m->m_pkthdr.csum_flags, M_FLOWID) ?
456 	    htonl(m->m_pkthdr.ph_flowid) : 0;
457 	ip6->ip6_vfc |= IPV6_VERSION;
458 	ip6->ip6_flow |= htonl((uint32_t)tos << 20);
459 	ip6->ip6_plen = htons(len);
460 	ip6->ip6_nxt = IPPROTO_UDP;
461 	ip6->ip6_hlim = sc->sc_ttl;
462 	ip6->ip6_src = sc->sc_src.in6;
463 	ip6->ip6_dst = endpoint->in6;
464 
465 	if (sc->sc_df)
466 		SET(m->m_pkthdr.csum_flags, M_IPV6_DF_OUT);
467 
468 	return (m);
469 }
470 #endif /* INET6 */
471 
472 static int
473 vxlan_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
474     struct rtentry *rt)
475 {
476 	struct m_tag *mtag;
477 
478 	mtag = NULL;
479 	while ((mtag = m_tag_find(m, PACKET_TAG_GRE, mtag)) != NULL) {
480 		if (*(int *)(mtag + 1) == ifp->if_index) {
481 			m_freem(m);
482 			return (EIO);
483 		}
484 	}
485 
486 	return (ether_output(ifp, m, dst, rt));
487 }
488 
489 static int
490 vxlan_enqueue(struct ifnet *ifp, struct mbuf *m)
491 {
492 	struct vxlan_softc *sc = ifp->if_softc;
493 	struct ifqueue *ifq = &ifp->if_snd;
494 
495 	if (ifq_enqueue(ifq, m) != 0)
496 		return (ENOBUFS);
497 
498 	task_add(ifq->ifq_softnet, &sc->sc_send_task);
499 
500 	return (0);
501 }
502 
503 static void
504 vxlan_start(struct ifqueue *ifq)
505 {
506 	struct ifnet *ifp = ifq->ifq_if;
507 	struct vxlan_softc *sc = ifp->if_softc;
508 
509 	task_add(ifq->ifq_softnet, &sc->sc_send_task);
510 }
511 
512 static uint64_t
513 vxlan_send_ipv4(struct vxlan_softc *sc, struct mbuf_list *ml)
514 {
515 	struct ip_moptions imo;
516 	struct mbuf *m;
517 	uint64_t oerrors = 0;
518 
519 	imo.imo_ifidx = sc->sc_if_index0;
520 	imo.imo_ttl = sc->sc_ttl;
521 	imo.imo_loop = 0;
522 
523 	NET_LOCK();
524 	while ((m = ml_dequeue(ml)) != NULL) {
525 		if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &imo, NULL, 0) != 0)
526 			oerrors++;
527 	}
528 	NET_UNLOCK();
529 
530 	return (oerrors);
531 }
532 
533 #ifdef INET6
534 static uint64_t
535 vxlan_send_ipv6(struct vxlan_softc *sc, struct mbuf_list *ml)
536 {
537 	struct ip6_moptions im6o;
538 	struct mbuf *m;
539 	uint64_t oerrors = 0;
540 
541 	im6o.im6o_ifidx = sc->sc_if_index0;
542 	im6o.im6o_hlim = sc->sc_ttl;
543 	im6o.im6o_loop = 0;
544 
545 	NET_LOCK();
546 	while ((m = ml_dequeue(ml)) != NULL) {
547 		if (ip6_output(m, NULL, NULL, 0, &im6o, NULL) != 0)
548 			oerrors++;
549 	}
550 	NET_UNLOCK();
551 
552 	return (oerrors);
553 }
554 #endif /* INET6 */
555 
556 static void
557 vxlan_send(void *arg)
558 {
559 	struct vxlan_softc *sc = arg;
560 	struct ifnet *ifp = &sc->sc_ac.ac_if;
561 	struct mbuf *(*ip_encap)(struct vxlan_softc *, struct mbuf *,
562 	    const union vxlan_addr *, uint8_t);
563 	uint64_t (*ip_send)(struct vxlan_softc *, struct mbuf_list *);
564 	struct mbuf_list ml = MBUF_LIST_INITIALIZER();
565 	struct mbuf *m;
566 	uint64_t oerrors;
567 
568 	if (!ISSET(ifp->if_flags, IFF_RUNNING))
569 		return;
570 
571 	switch (sc->sc_af) {
572 	case AF_INET:
573 		ip_encap = vxlan_encap_ipv4;
574 		ip_send = vxlan_send_ipv4;
575 		break;
576 #ifdef INET6
577 	case AF_INET6:
578 		ip_encap = vxlan_encap_ipv6;
579 		ip_send = vxlan_send_ipv6;
580 		break;
581 #endif
582 	default:
583 		unhandled_af(sc->sc_af);
584 		/* NOTREACHED */
585 	}
586 
587 	while ((m = ifq_dequeue(&ifp->if_snd)) != NULL) {
588 #if NBPFILTER > 0
589 		caddr_t if_bpf = READ_ONCE(ifp->if_bpf);
590 		if (if_bpf != NULL)
591 			bpf_mtap_ether(if_bpf, m, BPF_DIRECTION_OUT);
592 #endif
593 		m = vxlan_encap(sc, m, ip_encap);
594 		if (m == NULL)
595 			continue;
596 
597 		ml_enqueue(&ml, m);
598 	}
599 
600 	oerrors = (*ip_send)(sc, &ml);
601 
602 	counters_add(ifp->if_counters, ifc_oerrors, oerrors);
603 }
604 
605 static struct mbuf *
606 vxlan_input(void *arg, struct mbuf *m, struct ip *ip, struct ip6_hdr *ip6,
607     void *uhp, int hlen)
608 {
609 	struct vxlan_tep *vt = arg;
610 	union vxlan_addr addr;
611 	struct vxlan_peer key, *p;
612 	struct udphdr *uh;
613 	struct vxlan_header *vh;
614 	struct ether_header *eh;
615 	int vhlen = hlen + sizeof(*vh);
616 	struct mbuf *n;
617 	int off;
618 	in_port_t port;
619 	struct vxlan_softc *sc = NULL;
620 	struct ifnet *ifp;
621 	int rxhprio;
622 	uint8_t tos;
623 
624 	if (m->m_pkthdr.len < vhlen)
625 		goto drop;
626 
627 	uh = uhp;
628 	port = uh->uh_sport;
629 
630 	if (ip != NULL) {
631 		memset(&addr, 0, sizeof(addr));
632 		addr.in4 = ip->ip_src;
633 		tos = ip->ip_tos;
634 	}
635 #ifdef INET6
636 	else {
637 		addr.in6 = ip6->ip6_src;
638 		tos = bemtoh32(&ip6->ip6_flow) >> 20;
639 	}
640 #endif
641 
642 	if (m->m_len < vhlen) {
643 		m = m_pullup(m, vhlen);
644 		if (m == NULL)
645 			return (NULL);
646 	}
647 
648 	/* can't use ip/ip6/uh after this */
649 
650 	vh = (struct vxlan_header *)(mtod(m, caddr_t) + hlen);
651 
652 	memset(&key, 0, sizeof(key));
653 	key.p_addr = addr;
654 	key.p_header.vxlan_flags = vh->vxlan_flags & htonl(VXLAN_F_I);
655 	key.p_header.vxlan_id = vh->vxlan_id & htonl(VXLAN_VNI_MASK);
656 
657 	mtx_enter(&vt->vt_mtx);
658 	p = RBT_FIND(vxlan_peers, &vt->vt_peers, &key);
659 	if (p == NULL) {
660 		memset(&key.p_addr, 0, sizeof(key.p_addr));
661 		p = RBT_FIND(vxlan_peers, &vt->vt_peers, &key);
662 	}
663 	if (p != NULL)
664 		sc = vxlan_take(p->p_sc);
665 	mtx_leave(&vt->vt_mtx);
666 
667 	if (sc == NULL)
668 		goto drop;
669 
670 	ifp = &sc->sc_ac.ac_if;
671 	if (ISSET(ifp->if_flags, IFF_LINK0) && port != sc->sc_port)
672 		goto rele_drop;
673 
674 	m_adj(m, vhlen);
675 
676 	if (m->m_pkthdr.len < sizeof(*eh))
677 		goto rele_drop;
678 
679 	if (m->m_len < sizeof(*eh)) {
680 		m = m_pullup(m, sizeof(*eh));
681 		if (m == NULL)
682 			goto rele;
683 	}
684 
685 	n = m_getptr(m, sizeof(*eh), &off);
686 	if (n == NULL)
687 		goto rele_drop;
688 
689 	if (!ALIGNED_POINTER(mtod(n, caddr_t) + off, uint32_t)) {
690 		n = m_dup_pkt(m, ETHER_ALIGN, M_NOWAIT);
691 		m_freem(m);
692 		if (n == NULL)
693 			goto rele;
694 		m = n;
695 	}
696 
697 	if (sc->sc_mode == VXLAN_TMODE_LEARNING) {
698 		eh = mtod(m, struct ether_header *);
699 		etherbridge_map_ea(&sc->sc_eb, &addr,
700 		    (struct ether_addr *)eh->ether_shost);
701 	}
702 
703 	rxhprio = sc->sc_rxhprio;
704 	switch (rxhprio) {
705 	case IF_HDRPRIO_PACKET:
706 		/* nop */
707 		break;
708 	case IF_HDRPRIO_OUTER:
709 		m->m_pkthdr.pf.prio = IFQ_TOS2PRIO(tos);
710 		break;
711 	default:
712 		m->m_pkthdr.pf.prio = rxhprio;
713 		break;                                                  \
714         }                                                               \
715 
716 	if_vinput(ifp, m);
717 rele:
718 	vxlan_rele(sc);
719 	return (NULL);
720 
721 rele_drop:
722 	vxlan_rele(sc);
723 drop:
724 	m_freem(m);
725 	return (NULL);
726 }
727 
728 static int
729 vxlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
730 {
731 	struct vxlan_softc *sc = ifp->if_softc;
732 	struct ifreq *ifr = (struct ifreq *)data;
733 	struct ifbrparam *bparam = (struct ifbrparam *)data;
734 	int error = 0;
735 
736 	switch (cmd) {
737 	case SIOCSIFADDR:
738 		break;
739 	case SIOCSIFFLAGS:
740 		if (ISSET(ifp->if_flags, IFF_UP)) {
741 			if (!ISSET(ifp->if_flags, IFF_RUNNING))
742 				error = vxlan_up(sc);
743 			else
744 				error = 0;
745 		} else {
746 			if (ISSET(ifp->if_flags, IFF_RUNNING))
747 				error = vxlan_down(sc);
748 		}
749 		break;
750 
751 	case SIOCSLIFPHYRTABLE:
752 		error = vxlan_set_rdomain(sc, ifr);
753 		break;
754 	case SIOCGLIFPHYRTABLE:
755 		error = vxlan_get_rdomain(sc, ifr);
756 		break;
757 
758 	case SIOCSLIFPHYADDR:
759 		error = vxlan_set_tunnel(sc, (const struct if_laddrreq *)data);
760 		break;
761 	case SIOCGLIFPHYADDR:
762 		error = vxlan_get_tunnel(sc, (struct if_laddrreq *)data);
763 		break;
764 	case SIOCDIFPHYADDR:
765 		error = vxlan_del_tunnel(sc);
766 		break;
767 
768 	case SIOCSVNETID:
769 		error = vxlan_set_vnetid(sc, ifr);
770 		break;
771 	case SIOCGVNETID:
772 		error = vxlan_get_vnetid(sc, ifr);
773 		break;
774 	case SIOCDVNETID:
775 		error = vxlan_del_vnetid(sc);
776 		break;
777 
778 	case SIOCSIFPARENT:
779 		error = vxlan_set_parent(sc, (struct if_parent *)data);
780 		break;
781 	case SIOCGIFPARENT:
782 		error = vxlan_get_parent(sc, (struct if_parent *)data);
783 		break;
784 	case SIOCDIFPARENT:
785 		error = vxlan_del_parent(sc);
786 		break;
787 
788 	case SIOCSTXHPRIO:
789 		error = if_txhprio_l2_check(ifr->ifr_hdrprio);
790 		if (error != 0)
791 			break;
792 
793 		sc->sc_txhprio = ifr->ifr_hdrprio;
794 		break;
795 	case SIOCGTXHPRIO:
796 		ifr->ifr_hdrprio = sc->sc_txhprio;
797 		break;
798 
799 	case SIOCSRXHPRIO:
800 		error = if_rxhprio_l2_check(ifr->ifr_hdrprio);
801 		if (error != 0)
802 			break;
803 
804 		sc->sc_rxhprio = ifr->ifr_hdrprio;
805 		break;
806 	case SIOCGRXHPRIO:
807 		ifr->ifr_hdrprio = sc->sc_rxhprio;
808 		break;
809 
810 	case SIOCSLIFPHYDF:
811 		/* commit */
812 		sc->sc_df = ifr->ifr_df ? htons(IP_DF) : htons(0);
813 		break;
814 	case SIOCGLIFPHYDF:
815 		ifr->ifr_df = sc->sc_df ? 1 : 0;
816 		break;
817 
818 	case SIOCSLIFPHYTTL:
819 		if (ifr->ifr_ttl < 1 || ifr->ifr_ttl > 0xff) {
820 			error = EINVAL;
821 			break;
822 		}
823 
824 		/* commit */
825 		sc->sc_ttl = (uint8_t)ifr->ifr_ttl;
826 		break;
827 	case SIOCGLIFPHYTTL:
828 		ifr->ifr_ttl = (int)sc->sc_ttl;
829 		break;
830 
831 	case SIOCBRDGSCACHE:
832 		error = etherbridge_set_max(&sc->sc_eb, bparam);
833 		break;
834 	case SIOCBRDGGCACHE:
835 		error = etherbridge_get_max(&sc->sc_eb, bparam);
836 		break;
837 	case SIOCBRDGSTO:
838 		error = etherbridge_set_tmo(&sc->sc_eb, bparam);
839 		break;
840 	case SIOCBRDGGTO:
841 		error = etherbridge_get_tmo(&sc->sc_eb, bparam);
842 		break;
843 
844 	case SIOCBRDGRTS:
845 		error = etherbridge_rtfind(&sc->sc_eb,
846 		    (struct ifbaconf *)data);
847 		break;
848 	case SIOCBRDGFLUSH:
849 		etherbridge_flush(&sc->sc_eb,
850 		    ((struct ifbreq *)data)->ifbr_ifsflags);
851 		break;
852 	case SIOCBRDGSADDR:
853 		error = vxlan_add_addr(sc, (struct ifbareq *)data);
854 		break;
855 	case SIOCBRDGDADDR:
856 		error = vxlan_del_addr(sc, (struct ifbareq *)data);
857 		break;
858 
859 	case SIOCADDMULTI:
860 	case SIOCDELMULTI:
861 		/* no hardware to program */
862 		break;
863 
864 	default:
865 		error = ether_ioctl(ifp, &sc->sc_ac, cmd, data);
866 		break;
867 	}
868 
869 	if (error == ENETRESET) {
870 		/* no hardware to program */
871 		error = 0;
872 	}
873 
874 	return (error);
875 }
876 
877 static struct vxlan_tep *
878 vxlan_tep_get(struct vxlan_softc *sc, const union vxlan_addr *addr)
879 {
880 	struct vxlan_tep *vt;
881 
882 	TAILQ_FOREACH(vt, &vxlan_teps, vt_entry) {
883 		if (sc->sc_af == vt->vt_af &&
884 		    sc->sc_rdomain == vt->vt_rdomain &&
885 		    memcmp(addr, &vt->vt_addr, sizeof(*addr)) == 0 &&
886 		    sc->sc_port == vt->vt_port)
887 			return (vt);
888 	}
889 
890 	return (NULL);
891 }
892 
893 static int
894 vxlan_tep_add_addr(struct vxlan_softc *sc, const union vxlan_addr *addr,
895     struct vxlan_peer *p)
896 {
897 	struct mbuf m;
898 	struct vxlan_tep *vt;
899 	struct socket *so;
900 	struct sockaddr_in *sin;
901 #ifdef INET6
902 	struct sockaddr_in6 *sin6;
903 #endif
904 	int error;
905 	int s;
906 
907 	vt = vxlan_tep_get(sc, addr);
908 	if (vt != NULL) {
909 		struct vxlan_peer *op;
910 
911 		mtx_enter(&vt->vt_mtx);
912 		op = RBT_INSERT(vxlan_peers, &vt->vt_peers, p);
913 		mtx_leave(&vt->vt_mtx);
914 
915 		if (op != NULL)
916 			return (EADDRINUSE);
917 
918 		return (0);
919 	}
920 
921 	vt = malloc(sizeof(*vt), M_DEVBUF, M_NOWAIT|M_ZERO);
922 	if (vt == NULL)
923 		return (ENOMEM);
924 
925 	vt->vt_af = sc->sc_af;
926 	vt->vt_rdomain = sc->sc_rdomain;
927 	vt->vt_addr = *addr;
928 	vt->vt_port = sc->sc_port;
929 
930 	mtx_init(&vt->vt_mtx, IPL_SOFTNET);
931 	RBT_INIT(vxlan_peers, &vt->vt_peers);
932 	RBT_INSERT(vxlan_peers, &vt->vt_peers, p);
933 
934 	error = socreate(vt->vt_af, &so, SOCK_DGRAM, IPPROTO_UDP);
935 	if (error != 0)
936 		goto free;
937 
938 	s = solock(so);
939 
940 	sotoinpcb(so)->inp_upcall = vxlan_input;
941 	sotoinpcb(so)->inp_upcall_arg = vt;
942 
943 	m_inithdr(&m);
944 	m.m_len = sizeof(vt->vt_rdomain);
945 	*mtod(&m, unsigned int *) = vt->vt_rdomain;
946 	error = sosetopt(so, SOL_SOCKET, SO_RTABLE, &m);
947 	if (error != 0)
948 		goto close;
949 
950 	m_inithdr(&m);
951 	switch (vt->vt_af) {
952 	case AF_INET:
953 		sin = mtod(&m, struct sockaddr_in *);
954 		memset(sin, 0, sizeof(*sin));
955 		sin->sin_len = sizeof(*sin);
956 		sin->sin_family = AF_INET;
957 		sin->sin_addr = addr->in4;
958 		sin->sin_port = vt->vt_port;
959 
960 		m.m_len = sizeof(*sin);
961 		break;
962 
963 #ifdef INET6
964 	case AF_INET6:
965 		sin6 = mtod(&m, struct sockaddr_in6 *);
966 		sin6->sin6_len = sizeof(*sin6);
967 		sin6->sin6_family = AF_INET6;
968 		in6_recoverscope(sin6, &addr->in6);
969 		sin6->sin6_port = sc->sc_port;
970 
971 		m.m_len = sizeof(*sin6);
972 		break;
973 #endif
974 	default:
975 		unhandled_af(vt->vt_af);
976 	}
977 
978 	error = sobind(so, &m, curproc);
979 	if (error != 0)
980 		goto close;
981 
982 	sounlock(so, s);
983 
984 	rw_assert_wrlock(&vxlan_lock);
985 	TAILQ_INSERT_TAIL(&vxlan_teps, vt, vt_entry);
986 
987 	vt->vt_so = so;
988 
989 	return (0);
990 
991 close:
992 	sounlock(so, s);
993 	soclose(so, MSG_DONTWAIT);
994 free:
995 	free(vt, M_DEVBUF, sizeof(*vt));
996 	return (error);
997 }
998 
999 static void
1000 vxlan_tep_del_addr(struct vxlan_softc *sc, const union vxlan_addr *addr,
1001     struct vxlan_peer *p)
1002 {
1003 	struct vxlan_tep *vt;
1004 	int empty;
1005 
1006 	vt = vxlan_tep_get(sc, addr);
1007 	if (vt == NULL)
1008 		panic("unable to find vxlan_tep for peer %p (sc %p)", p, sc);
1009 
1010 	mtx_enter(&vt->vt_mtx);
1011 	RBT_REMOVE(vxlan_peers, &vt->vt_peers, p);
1012 	empty = RBT_EMPTY(vxlan_peers, &vt->vt_peers);
1013 	mtx_leave(&vt->vt_mtx);
1014 
1015 	if (!empty)
1016 		return;
1017 
1018 	rw_assert_wrlock(&vxlan_lock);
1019 	TAILQ_REMOVE(&vxlan_teps, vt, vt_entry);
1020 
1021 	soclose(vt->vt_so, MSG_DONTWAIT);
1022 	free(vt, M_DEVBUF, sizeof(*vt));
1023 }
1024 
1025 static int
1026 vxlan_tep_up(struct vxlan_softc *sc)
1027 {
1028 	struct vxlan_peer *up, *mp;
1029 	int error;
1030 
1031 	up = malloc(sizeof(*up), M_DEVBUF, M_NOWAIT|M_ZERO);
1032 	if (up == NULL)
1033 		return (ENOMEM);
1034 
1035 	if (sc->sc_mode == VXLAN_TMODE_P2P)
1036 		up->p_addr = sc->sc_dst;
1037 	up->p_header = sc->sc_header;
1038 	up->p_sc = vxlan_take(sc);
1039 
1040 	error = vxlan_tep_add_addr(sc, &sc->sc_src, up);
1041 	if (error != 0)
1042 		goto freeup;
1043 
1044 	sc->sc_ucast_peer = up;
1045 
1046 	if (sc->sc_mode != VXLAN_TMODE_LEARNING)
1047 		return (0);
1048 
1049 	mp = malloc(sizeof(*mp), M_DEVBUF, M_NOWAIT|M_ZERO);
1050 	if (mp == NULL) {
1051 		error = ENOMEM;
1052 		goto delup;
1053 	}
1054 
1055 	/* addr is multicast, leave it as 0s */
1056 	mp->p_header = sc->sc_header;
1057 	mp->p_sc = vxlan_take(sc);
1058 
1059 	/* destination address is a multicast group we want to join */
1060 	error = vxlan_tep_add_addr(sc, &sc->sc_dst, up);
1061 	if (error != 0)
1062 		goto freemp;
1063 
1064 	sc->sc_mcast_peer = mp;
1065 
1066 	return (0);
1067 
1068 freemp:
1069 	vxlan_rele(mp->p_sc);
1070 	free(mp, M_DEVBUF, sizeof(*mp));
1071 delup:
1072 	vxlan_tep_del_addr(sc, &sc->sc_src, up);
1073 freeup:
1074 	vxlan_rele(up->p_sc);
1075 	free(up, M_DEVBUF, sizeof(*up));
1076 	return (error);
1077 }
1078 
1079 static void
1080 vxlan_tep_down(struct vxlan_softc *sc)
1081 {
1082 	struct vxlan_peer *up = sc->sc_ucast_peer;
1083 
1084 	if (sc->sc_mode == VXLAN_TMODE_LEARNING) {
1085 		struct vxlan_peer *mp = sc->sc_mcast_peer;
1086 		vxlan_tep_del_addr(sc, &sc->sc_dst, mp);
1087 		vxlan_rele(mp->p_sc);
1088 		free(mp, M_DEVBUF, sizeof(*mp));
1089 	}
1090 
1091 	vxlan_tep_del_addr(sc, &sc->sc_src, up);
1092 	vxlan_rele(up->p_sc);
1093 	free(up, M_DEVBUF, sizeof(*up));
1094 }
1095 
1096 static int
1097 vxlan_up(struct vxlan_softc *sc)
1098 {
1099 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1100 	struct ifnet *ifp0 = NULL;
1101 	int error;
1102 
1103 	KASSERT(!ISSET(ifp->if_flags, IFF_RUNNING));
1104 	NET_ASSERT_LOCKED();
1105 
1106 	if (sc->sc_af == AF_UNSPEC)
1107 		return (EDESTADDRREQ);
1108 	KASSERT(sc->sc_mode != VXLAN_TMODE_UNSET);
1109 
1110 	NET_UNLOCK();
1111 
1112 	error = rw_enter(&vxlan_lock, RW_WRITE|RW_INTR);
1113 	if (error != 0)
1114 		goto netlock;
1115 
1116 	NET_LOCK();
1117 	if (ISSET(ifp->if_flags, IFF_RUNNING)) {
1118 		/* something else beat us */
1119 		rw_exit(&vxlan_lock);
1120 		return (0);
1121 	}
1122 	NET_UNLOCK();
1123 
1124 	if (sc->sc_mode != VXLAN_TMODE_P2P) {
1125 		error = etherbridge_up(&sc->sc_eb);
1126 		if (error != 0)
1127 			goto unlock;
1128 	}
1129 
1130 	if (sc->sc_mode == VXLAN_TMODE_LEARNING) {
1131 		ifp0 = if_get(sc->sc_if_index0);
1132 		if (ifp0 == NULL) {
1133 			error = ENXIO;
1134 			goto down;
1135 		}
1136 
1137 		/* check again if multicast will work on top of the parent */
1138 		if (!ISSET(ifp0->if_flags, IFF_MULTICAST)) {
1139 			error = EPROTONOSUPPORT;
1140 			goto put;
1141 		}
1142 
1143 		error = vxlan_addmulti(sc, ifp0);
1144 		if (error != 0)
1145 			goto put;
1146 
1147 		/* Register callback if parent wants to unregister */
1148 		if_detachhook_add(ifp0, &sc->sc_dtask);
1149 	} else {
1150 		if (sc->sc_if_index0 != 0) {
1151 			error = EPROTONOSUPPORT;
1152 			goto down;
1153 		}
1154 	}
1155 
1156 	error = vxlan_tep_up(sc);
1157 	if (error != 0)
1158 		goto del;
1159 
1160 	if_put(ifp0);
1161 
1162 	NET_LOCK();
1163 	SET(ifp->if_flags, IFF_RUNNING);
1164 	rw_exit(&vxlan_lock);
1165 
1166 	return (0);
1167 
1168 del:
1169 	if (sc->sc_mode == VXLAN_TMODE_LEARNING) {
1170 		if (ifp0 != NULL)
1171 			if_detachhook_del(ifp0, &sc->sc_dtask);
1172 		vxlan_delmulti(sc);
1173 	}
1174 put:
1175 	if_put(ifp0);
1176 down:
1177 	if (sc->sc_mode != VXLAN_TMODE_P2P)
1178 		etherbridge_down(&sc->sc_eb);
1179 unlock:
1180 	rw_exit(&vxlan_lock);
1181 netlock:
1182 	NET_LOCK();
1183 
1184 	return (error);
1185 }
1186 
1187 static int
1188 vxlan_down(struct vxlan_softc *sc)
1189 {
1190 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1191 	struct ifnet *ifp0;
1192 	int error;
1193 
1194 	KASSERT(ISSET(ifp->if_flags, IFF_RUNNING));
1195 	NET_UNLOCK();
1196 
1197 	error = rw_enter(&vxlan_lock, RW_WRITE|RW_INTR);
1198 	if (error != 0) {
1199 		NET_LOCK();
1200 		return (error);
1201 	}
1202 
1203 	NET_LOCK();
1204 	if (!ISSET(ifp->if_flags, IFF_RUNNING)) {
1205 		/* something else beat us */
1206 		rw_exit(&vxlan_lock);
1207 		return (0);
1208 	}
1209 	NET_UNLOCK();
1210 
1211 	vxlan_tep_down(sc);
1212 
1213 	if (sc->sc_mode == VXLAN_TMODE_LEARNING) {
1214 		vxlan_delmulti(sc);
1215 		ifp0 = if_get(sc->sc_if_index0);
1216 		if (ifp0 != NULL) {
1217 			if_detachhook_del(ifp0, &sc->sc_dtask);
1218 		}
1219 		if_put(ifp0);
1220 	}
1221 
1222 	if (sc->sc_mode != VXLAN_TMODE_P2P)
1223 		etherbridge_down(&sc->sc_eb);
1224 
1225 	taskq_del_barrier(ifp->if_snd.ifq_softnet, &sc->sc_send_task);
1226 	NET_LOCK();
1227 	CLR(ifp->if_flags, IFF_RUNNING);
1228 	rw_exit(&vxlan_lock);
1229 
1230 	return (0);
1231 }
1232 
1233 static int
1234 vxlan_addmulti(struct vxlan_softc *sc, struct ifnet *ifp0)
1235 {
1236 	int error = 0;
1237 
1238 	NET_LOCK();
1239 
1240 	switch (sc->sc_af) {
1241 	case AF_INET:
1242 		sc->sc_inmulti = in_addmulti(&sc->sc_dst.in4, ifp0);
1243 		if (sc->sc_inmulti == NULL)
1244 			error = EADDRNOTAVAIL;
1245 		break;
1246 #ifdef INET6
1247 	case AF_INET6:
1248 		sc->sc_inmulti = in6_addmulti(&sc->sc_dst.in6, ifp0, &error);
1249 		break;
1250 #endif
1251 	default:
1252 		unhandled_af(sc->sc_af);
1253 	}
1254 
1255 	NET_UNLOCK();
1256 
1257 	return (error);
1258 }
1259 
1260 static void
1261 vxlan_delmulti(struct vxlan_softc *sc)
1262 {
1263 	NET_LOCK();
1264 
1265 	switch (sc->sc_af) {
1266 	case AF_INET:
1267 		in_delmulti(sc->sc_inmulti);
1268 		break;
1269 #ifdef INET6
1270 	case AF_INET6:
1271 		in6_delmulti(sc->sc_inmulti);
1272 		break;
1273 #endif
1274 	default:
1275 		unhandled_af(sc->sc_af);
1276 	}
1277 
1278 	sc->sc_inmulti = NULL; /* keep it tidy */
1279 
1280 	NET_UNLOCK();
1281 }
1282 
1283 static int
1284 vxlan_set_rdomain(struct vxlan_softc *sc, const struct ifreq *ifr)
1285 {
1286 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1287 
1288 	if (ifr->ifr_rdomainid < 0 ||
1289 	    ifr->ifr_rdomainid > RT_TABLEID_MAX)
1290 		return (EINVAL);
1291 	if (!rtable_exists(ifr->ifr_rdomainid))
1292 		return (EADDRNOTAVAIL);
1293 
1294 	if (sc->sc_rdomain == ifr->ifr_rdomainid)
1295 		return (0);
1296 
1297 	if (ISSET(ifp->if_flags, IFF_RUNNING))
1298 		return (EBUSY);
1299 
1300 	/* commit */
1301 	sc->sc_rdomain = ifr->ifr_rdomainid;
1302 	etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);
1303 
1304 	return (0);
1305 }
1306 
1307 static int
1308 vxlan_get_rdomain(struct vxlan_softc *sc, struct ifreq *ifr)
1309 {
1310 	ifr->ifr_rdomainid = sc->sc_rdomain;
1311 
1312 	return (0);
1313 }
1314 
1315 static int
1316 vxlan_set_tunnel(struct vxlan_softc *sc, const struct if_laddrreq *req)
1317 {
1318 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1319 	struct sockaddr *src = (struct sockaddr *)&req->addr;
1320 	struct sockaddr *dst = (struct sockaddr *)&req->dstaddr;
1321 	struct sockaddr_in *src4, *dst4;
1322 #ifdef INET6
1323 	struct sockaddr_in6 *src6, *dst6;
1324 	int error;
1325 #endif
1326 	union vxlan_addr saddr, daddr;
1327 	unsigned int mode = VXLAN_TMODE_ENDPOINT;
1328 	in_port_t port = htons(VXLAN_PORT);
1329 
1330 	memset(&saddr, 0, sizeof(saddr));
1331 	memset(&daddr, 0, sizeof(daddr));
1332 
1333 	/* validate */
1334 	switch (src->sa_family) {
1335 	case AF_INET:
1336 		src4 = (struct sockaddr_in *)src;
1337 		if (in_nullhost(src4->sin_addr) ||
1338 		    IN_MULTICAST(src4->sin_addr.s_addr))
1339 			return (EINVAL);
1340 
1341 		if (src4->sin_port != htons(0))
1342 			port = src4->sin_port;
1343 
1344 		if (dst->sa_family != AF_UNSPEC) {
1345 			if (dst->sa_family != AF_INET)
1346 				return (EINVAL);
1347 
1348 			dst4 = (struct sockaddr_in *)dst;
1349 			if (in_nullhost(dst4->sin_addr))
1350 				return (EINVAL);
1351 
1352 			/* all good */
1353 			mode = IN_MULTICAST(dst4->sin_addr.s_addr) ?
1354 			    VXLAN_TMODE_LEARNING : VXLAN_TMODE_P2P;
1355 			daddr.in4 = dst4->sin_addr;
1356 		}
1357 
1358 		saddr.in4 = src4->sin_addr;
1359 		break;
1360 
1361 #ifdef INET6
1362 	case AF_INET6:
1363 		src6 = (struct sockaddr_in6 *)src;
1364 		if (IN6_IS_ADDR_UNSPECIFIED(&src6->sin6_addr) ||
1365 		    IN6_IS_ADDR_MULTICAST(&src6->sin6_addr))
1366 			return (EINVAL);
1367 
1368 		if (src6->sin6_port != htons(0))
1369 			port = src6->sin6_port;
1370 
1371 		if (dst->sa_family != AF_UNSPEC) {
1372 			if (dst->sa_family != AF_INET6)
1373 				return (EINVAL);
1374 
1375 			dst6 = (struct sockaddr_in6 *)dst;
1376 			if (IN6_IS_ADDR_UNSPECIFIED(&dst6->sin6_addr))
1377 				return (EINVAL);
1378 
1379 			if (src6->sin6_scope_id != dst6->sin6_scope_id)
1380 				return (EINVAL);
1381 
1382 			/* all good */
1383 			mode = IN6_IS_ADDR_MULTICAST(&dst6->sin6_addr) ?
1384 			    VXLAN_TMODE_LEARNING : VXLAN_TMODE_P2P;
1385 			error = in6_embedscope(&daddr.in6, dst6, NULL);
1386 			if (error != 0)
1387 				return (error);
1388 		}
1389 
1390 		error = in6_embedscope(&saddr.in6, src6, NULL);
1391 		if (error != 0)
1392 			return (error);
1393 
1394 		break;
1395 #endif
1396 	default:
1397 		return (EAFNOSUPPORT);
1398 	}
1399 
1400 	if (memcmp(&sc->sc_src, &saddr, sizeof(sc->sc_src)) == 0 &&
1401 	    memcmp(&sc->sc_dst, &daddr, sizeof(sc->sc_dst)) == 0 &&
1402 	    sc->sc_port == port)
1403 		return (0);
1404 
1405 	if (ISSET(ifp->if_flags, IFF_RUNNING))
1406 		return (EBUSY);
1407 
1408 	/* commit */
1409 	sc->sc_af = src->sa_family;
1410 	sc->sc_src = saddr;
1411 	sc->sc_dst = daddr;
1412 	sc->sc_port = port;
1413 	sc->sc_mode = mode;
1414 	etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);
1415 
1416 	return (0);
1417 }
1418 
1419 static int
1420 vxlan_get_tunnel(struct vxlan_softc *sc, struct if_laddrreq *req)
1421 {
1422 	struct sockaddr *dstaddr = (struct sockaddr *)&req->dstaddr;
1423 	struct sockaddr_in *sin;
1424 #ifdef INET6
1425 	struct sockaddr_in6 *sin6;
1426 #endif
1427 
1428 	if (sc->sc_af == AF_UNSPEC)
1429 		return (EADDRNOTAVAIL);
1430 	KASSERT(sc->sc_mode != VXLAN_TMODE_UNSET);
1431 
1432 	memset(&req->addr, 0, sizeof(req->addr));
1433 	memset(&req->dstaddr, 0, sizeof(req->dstaddr));
1434 
1435 	/* default to endpoint */
1436 	dstaddr->sa_len = 2;
1437 	dstaddr->sa_family = AF_UNSPEC;
1438 
1439 	switch (sc->sc_af) {
1440 	case AF_INET:
1441 		sin = (struct sockaddr_in *)&req->addr;
1442 		sin->sin_len = sizeof(*sin);
1443 		sin->sin_family = AF_INET;
1444 		sin->sin_addr = sc->sc_src.in4;
1445 		sin->sin_port = sc->sc_port;
1446 
1447 		if (sc->sc_mode == VXLAN_TMODE_ENDPOINT)
1448 			break;
1449 
1450 		sin = (struct sockaddr_in *)&req->dstaddr;
1451 		sin->sin_len = sizeof(*sin);
1452 		sin->sin_family = AF_INET;
1453 		sin->sin_addr = sc->sc_dst.in4;
1454 		break;
1455 
1456 #ifdef INET6
1457 	case AF_INET6:
1458 		sin6 = (struct sockaddr_in6 *)&req->addr;
1459 		sin6->sin6_len = sizeof(*sin6);
1460 		sin6->sin6_family = AF_INET6;
1461 		in6_recoverscope(sin6, &sc->sc_src.in6);
1462 		sin6->sin6_port = sc->sc_port;
1463 
1464 		if (sc->sc_mode == VXLAN_TMODE_ENDPOINT)
1465 			break;
1466 
1467 		sin6 = (struct sockaddr_in6 *)&req->dstaddr;
1468 		sin6->sin6_len = sizeof(*sin6);
1469 		sin6->sin6_family = AF_INET6;
1470 		in6_recoverscope(sin6, &sc->sc_dst.in6);
1471 		break;
1472 #endif
1473 	default:
1474 		unhandled_af(sc->sc_af);
1475 	}
1476 
1477 	return (0);
1478 }
1479 
1480 static int
1481 vxlan_del_tunnel(struct vxlan_softc *sc)
1482 {
1483 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1484 
1485 	if (sc->sc_af == AF_UNSPEC)
1486 		return (0);
1487 
1488 	if (ISSET(ifp->if_flags, IFF_RUNNING))
1489 		return (EBUSY);
1490 
1491 	/* commit */
1492 	sc->sc_af = AF_UNSPEC;
1493 	memset(&sc->sc_src, 0, sizeof(sc->sc_src));
1494 	memset(&sc->sc_dst, 0, sizeof(sc->sc_dst));
1495 	sc->sc_port = htons(0);
1496 	sc->sc_mode = VXLAN_TMODE_UNSET;
1497 	etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);
1498 
1499 	return (0);
1500 }
1501 
1502 static int
1503 vxlan_set_vnetid(struct vxlan_softc *sc, const struct ifreq *ifr)
1504 {
1505 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1506 	uint32_t vni;
1507 
1508 	if (ifr->ifr_vnetid < VXLAN_VNI_MIN ||
1509 	    ifr->ifr_vnetid > VXLAN_VNI_MAX)
1510 		return (EINVAL);
1511 
1512 	vni = htonl(ifr->ifr_vnetid << VXLAN_VNI_SHIFT);
1513 	if (ISSET(sc->sc_header.vxlan_flags, htonl(VXLAN_F_I)) &&
1514 	    sc->sc_header.vxlan_id == vni)
1515 		return (0);
1516 
1517 	if (ISSET(ifp->if_flags, IFF_RUNNING))
1518 		return (EBUSY);
1519 
1520 	/* commit */
1521 	SET(sc->sc_header.vxlan_flags, htonl(VXLAN_F_I));
1522 	sc->sc_header.vxlan_id = vni;
1523 	etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);
1524 
1525 	return (0);
1526 }
1527 
1528 static int
1529 vxlan_get_vnetid(struct vxlan_softc *sc, struct ifreq *ifr)
1530 {
1531 	uint32_t vni;
1532 
1533 	if (!ISSET(sc->sc_header.vxlan_flags, htonl(VXLAN_F_I)))
1534 		return (EADDRNOTAVAIL);
1535 
1536 	vni = ntohl(sc->sc_header.vxlan_id);
1537 	vni &= VXLAN_VNI_MASK;
1538 	vni >>= VXLAN_VNI_SHIFT;
1539 
1540 	ifr->ifr_vnetid = vni;
1541 
1542 	return (0);
1543 }
1544 
1545 static int
1546 vxlan_del_vnetid(struct vxlan_softc *sc)
1547 {
1548 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1549 
1550 	if (!ISSET(sc->sc_header.vxlan_flags, htonl(VXLAN_F_I)))
1551 		return (0);
1552 
1553 	if (ISSET(ifp->if_flags, IFF_RUNNING))
1554 		return (EBUSY);
1555 
1556 	/* commit */
1557 	CLR(sc->sc_header.vxlan_flags, htonl(VXLAN_F_I));
1558 	sc->sc_header.vxlan_id = htonl(0 << VXLAN_VNI_SHIFT);
1559 	etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);
1560 
1561 	return (0);
1562 }
1563 
1564 static int
1565 vxlan_set_parent(struct vxlan_softc *sc, const struct if_parent *p)
1566 {
1567 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1568 	struct ifnet *ifp0;
1569 	int error = 0;
1570 
1571 	ifp0 = if_unit(p->ifp_parent);
1572 	if (ifp0 == NULL)
1573 		return (ENXIO);
1574 
1575 	if (!ISSET(ifp0->if_flags, IFF_MULTICAST)) {
1576 		error = ENXIO;
1577 		goto put;
1578 	}
1579 
1580 	if (sc->sc_if_index0 == ifp0->if_index)
1581 		goto put;
1582 
1583 	if (ISSET(ifp->if_flags, IFF_RUNNING)) {
1584 		error = EBUSY;
1585 		goto put;
1586 	}
1587 
1588 	/* commit */
1589 	sc->sc_if_index0 = ifp0->if_index;
1590 	etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);
1591 
1592 put:
1593 	if_put(ifp0);
1594 	return (error);
1595 }
1596 
1597 static int
1598 vxlan_get_parent(struct vxlan_softc *sc, struct if_parent *p)
1599 {
1600 	struct ifnet *ifp0;
1601 	int error = 0;
1602 
1603 	ifp0 = if_get(sc->sc_if_index0);
1604 	if (ifp0 == NULL)
1605 		error = EADDRNOTAVAIL;
1606 	else
1607 		strlcpy(p->ifp_parent, ifp0->if_xname, sizeof(p->ifp_parent));
1608 	if_put(ifp0);
1609 
1610 	return (error);
1611 }
1612 
1613 static int
1614 vxlan_del_parent(struct vxlan_softc *sc)
1615 {
1616 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1617 
1618 	if (sc->sc_if_index0 == 0)
1619 		return (0);
1620 
1621 	if (ISSET(ifp->if_flags, IFF_RUNNING))
1622 		return (EBUSY);
1623 
1624 	/* commit */
1625 	sc->sc_if_index0 = 0;
1626 	etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);
1627 
1628 	return (0);
1629 }
1630 
1631 static int
1632 vxlan_add_addr(struct vxlan_softc *sc, const struct ifbareq *ifba)
1633 {
1634 	struct sockaddr_in *sin;
1635 #ifdef INET6
1636 	struct sockaddr_in6 *sin6;
1637 	struct sockaddr_in6 src6 = {
1638 		.sin6_len = sizeof(src6),
1639 		.sin6_family = AF_UNSPEC,
1640 	};
1641 	int error;
1642 #endif
1643 	union vxlan_addr endpoint;
1644 	unsigned int type;
1645 
1646 	switch (sc->sc_mode) {
1647 	case VXLAN_TMODE_UNSET:
1648 		return (ENOPROTOOPT);
1649 	case VXLAN_TMODE_P2P:
1650 		return (EPROTONOSUPPORT);
1651 	default:
1652 		break;
1653 	}
1654 
1655 	/* ignore ifba_ifsname */
1656 
1657 	if (ISSET(ifba->ifba_flags, ~IFBAF_TYPEMASK))
1658 		return (EINVAL);
1659 	switch (ifba->ifba_flags & IFBAF_TYPEMASK) {
1660 	case IFBAF_DYNAMIC:
1661 		type = EBE_DYNAMIC;
1662 		break;
1663 	case IFBAF_STATIC:
1664 		type = EBE_STATIC;
1665 		break;
1666 	default:
1667 		return (EINVAL);
1668 	}
1669 
1670 	memset(&endpoint, 0, sizeof(endpoint));
1671 
1672 	if (ifba->ifba_dstsa.ss_family != sc->sc_af)
1673 		return (EAFNOSUPPORT);
1674 	switch (ifba->ifba_dstsa.ss_family) {
1675 	case AF_INET:
1676 		sin = (struct sockaddr_in *)&ifba->ifba_dstsa;
1677 		if (in_nullhost(sin->sin_addr) ||
1678 		    IN_MULTICAST(sin->sin_addr.s_addr))
1679 			return (EADDRNOTAVAIL);
1680 
1681 		if (sin->sin_port != htons(0))
1682 			return (EADDRNOTAVAIL);
1683 
1684 		endpoint.in4 = sin->sin_addr;
1685 		break;
1686 
1687 #ifdef INET6
1688 	case AF_INET6:
1689 		sin6 = (struct sockaddr_in6 *)&ifba->ifba_dstsa;
1690 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
1691 		    IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
1692 			return (EADDRNOTAVAIL);
1693 
1694 		in6_recoverscope(&src6, &sc->sc_src.in6);
1695 		if (src6.sin6_scope_id != sin6->sin6_scope_id)
1696 			return (EADDRNOTAVAIL);
1697 
1698 		if (sin6->sin6_port != htons(0))
1699 			return (EADDRNOTAVAIL);
1700 
1701 		error = in6_embedscope(&endpoint.in6, sin6, NULL);
1702 		if (error != 0)
1703 			return (error);
1704 
1705 		break;
1706 #endif
1707 	default: /* AF_UNSPEC */
1708 		return (EADDRNOTAVAIL);
1709 	}
1710 
1711 	return (etherbridge_add_addr(&sc->sc_eb, &endpoint,
1712 	    &ifba->ifba_dst, type));
1713 }
1714 
1715 static int
1716 vxlan_del_addr(struct vxlan_softc *sc, const struct ifbareq *ifba)
1717 {
1718 	return (etherbridge_del_addr(&sc->sc_eb, &ifba->ifba_dst));
1719 }
1720 
1721 void
1722 vxlan_detach_hook(void *arg)
1723 {
1724 	struct vxlan_softc *sc = arg;
1725 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1726 
1727 	if (ISSET(ifp->if_flags, IFF_RUNNING)) {
1728 		vxlan_down(sc);
1729 		CLR(ifp->if_flags, IFF_UP);
1730 	}
1731 
1732 	sc->sc_if_index0 = 0;
1733 }
1734 
1735 static int
1736 vxlan_eb_port_eq(void *arg, void *a, void *b)
1737 {
1738 	const union vxlan_addr *va = a, *vb = b;
1739 	size_t i;
1740 
1741 	for (i = 0; i < nitems(va->in6.s6_addr32); i++) {
1742 		if (va->in6.s6_addr32[i] != vb->in6.s6_addr32[i])
1743 			return (0);
1744 	}
1745 
1746 	return (1);
1747 }
1748 
1749 static void *
1750 vxlan_eb_port_take(void *arg, void *port)
1751 {
1752 	union vxlan_addr *endpoint;
1753 
1754 	endpoint = pool_get(&vxlan_endpoint_pool, PR_NOWAIT);
1755 	if (endpoint == NULL)
1756 		return (NULL);
1757 
1758 	*endpoint = *(union vxlan_addr *)port;
1759 
1760 	return (endpoint);
1761 }
1762 
1763 static void
1764 vxlan_eb_port_rele(void *arg, void *port)
1765 {
1766 	union vxlan_addr *endpoint = port;
1767 
1768 	pool_put(&vxlan_endpoint_pool, endpoint);
1769 }
1770 
1771 static size_t
1772 vxlan_eb_port_ifname(void *arg, char *dst, size_t len, void *port)
1773 {
1774 	struct vxlan_softc *sc = arg;
1775 
1776 	return (strlcpy(dst, sc->sc_ac.ac_if.if_xname, len));
1777 }
1778 
1779 static void
1780 vxlan_eb_port_sa(void *arg, struct sockaddr_storage *ss, void *port)
1781 {
1782 	struct vxlan_softc *sc = arg;
1783 	union vxlan_addr *endpoint = port;
1784 
1785 	switch (sc->sc_af) {
1786 	case AF_INET: {
1787 		struct sockaddr_in *sin = (struct sockaddr_in *)ss;
1788 
1789 		sin->sin_len = sizeof(*sin);
1790 		sin->sin_family = AF_INET;
1791 		sin->sin_addr = endpoint->in4;
1792 		break;
1793 	}
1794 #ifdef INET6
1795 	case AF_INET6: {
1796 		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)ss;
1797 
1798 		sin6->sin6_len = sizeof(*sin6);
1799 		sin6->sin6_family = AF_INET6;
1800 		in6_recoverscope(sin6, &endpoint->in6);
1801 		break;
1802 	}
1803 #endif /* INET6 */
1804 	default:
1805 		unhandled_af(sc->sc_af);
1806 	}
1807 }
1808 
1809 static inline int
1810 vxlan_peer_cmp(const struct vxlan_peer *ap, const struct vxlan_peer *bp)
1811 {
1812 	size_t i;
1813 
1814 	if (ap->p_header.vxlan_id > bp->p_header.vxlan_id)
1815 		return (1);
1816 	if (ap->p_header.vxlan_id < bp->p_header.vxlan_id)
1817 		return (-1);
1818 	if (ap->p_header.vxlan_flags > bp->p_header.vxlan_flags)
1819 		return (1);
1820 	if (ap->p_header.vxlan_flags < bp->p_header.vxlan_flags)
1821 		return (-1);
1822 
1823 	for (i = 0; i < nitems(ap->p_addr.in6.s6_addr32); i++) {
1824 		if (ap->p_addr.in6.s6_addr32[i] >
1825 		    bp->p_addr.in6.s6_addr32[i])
1826 			return (1);
1827 		if (ap->p_addr.in6.s6_addr32[i] <
1828 		    bp->p_addr.in6.s6_addr32[i])
1829 			return (-1);
1830 	}
1831 
1832 	return (0);
1833 }
1834 
1835 RBT_GENERATE(vxlan_peers, vxlan_peer, p_entry, vxlan_peer_cmp);
1836