xref: /openbsd-src/sys/net/if_vxlan.c (revision fc405d53b73a2d73393cb97f684863d17b583e38)
1 /*	$OpenBSD: if_vxlan.c,v 1.92 2023/04/13 02:19:05 jsg Exp $ */
2 
3 /*
4  * Copyright (c) 2021 David Gwynne <dlg@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include "bpfilter.h"
20 #include "pf.h"
21 
22 #include <sys/param.h>
23 #include <sys/systm.h>
24 #include <sys/kernel.h>
25 #include <sys/mbuf.h>
26 #include <sys/socket.h>
27 #include <sys/ioctl.h>
28 #include <sys/timeout.h>
29 #include <sys/pool.h>
30 #include <sys/tree.h>
31 #include <sys/refcnt.h>
32 #include <sys/smr.h>
33 
34 #include <sys/socketvar.h>
35 
36 #include <net/if.h>
37 #include <net/if_var.h>
38 #include <net/if_dl.h>
39 #include <net/if_media.h>
40 #include <net/if_types.h>
41 #include <net/route.h>
42 #include <net/rtable.h>
43 
44 #include <netinet/in.h>
45 #include <netinet/in_var.h>
46 #include <netinet/if_ether.h>
47 #include <netinet/ip.h>
48 #include <netinet/udp.h>
49 #include <netinet/in_pcb.h>
50 #include <netinet/ip_var.h>
51 
52 #ifdef INET6
53 #include <netinet/ip6.h>
54 #include <netinet6/ip6_var.h>
55 #include <netinet6/in6_var.h>
56 #endif
57 
58 /* for bridge stuff */
59 #include <net/if_bridge.h>
60 #include <net/if_etherbridge.h>
61 
62 #if NBPFILTER > 0
63 #include <net/bpf.h>
64 #endif
65 
66 /*
67  * The protocol.
68  */
69 
70 #define VXLANMTU		1492
71 #define VXLAN_PORT		4789
72 
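/*
 * On the wire a VXLAN packet is the 8 byte header below carried over
 * UDP, followed by the inner Ethernet frame.  The I flag marks the
 * VNI as valid, and the 24 bit VNI lives in the upper bits of
 * vxlan_id; both words are kept in network byte order.
 */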
73 struct vxlan_header {
74 	uint32_t		vxlan_flags;
75 #define VXLAN_F_I			(1U << 27)
76 	uint32_t		vxlan_id;
77 #define VXLAN_VNI_SHIFT			8
78 #define VXLAN_VNI_MASK			(0xffffffU << VXLAN_VNI_SHIFT)
79 };
80 
81 #define VXLAN_VNI_MAX			0x00ffffffU
82 #define VXLAN_VNI_MIN			0x00000000U
83 
84 /*
85  * The driver.
86  */
87 
88 union vxlan_addr {
89 	struct in_addr		in4;
90 	struct in6_addr		in6;
91 };
92 
93 struct vxlan_softc;
94 
95 struct vxlan_peer {
96 	RBT_ENTRY(vxlan_peer)	 p_entry;
97 
98 	struct vxlan_header	 p_header;
99 	union vxlan_addr	 p_addr;
100 
101 	struct vxlan_softc	*p_sc;
102 };
103 
104 RBT_HEAD(vxlan_peers, vxlan_peer);
105 
106 struct vxlan_tep {
107 	TAILQ_ENTRY(vxlan_tep)	 vt_entry;
108 
109 	sa_family_t		 vt_af;
110 	unsigned int		 vt_rdomain;
111 	union vxlan_addr	 vt_addr;
112 #define vt_addr4 vt_addr.in4
113 #define vt_addr6 vt_addr.in6
114 	in_port_t		 vt_port;
115 
116 	struct socket		*vt_so;
117 
118 	struct mutex		 vt_mtx;
119 	struct vxlan_peers	 vt_peers;
120 };
121 
122 TAILQ_HEAD(vxlan_teps, vxlan_tep);
123 
124 enum vxlan_tunnel_mode {
125 	VXLAN_TMODE_UNSET,
126 	VXLAN_TMODE_P2P,	 /* unicast destination, no learning */
127 	VXLAN_TMODE_LEARNING,	 /* multicast destination, learning */
128 	VXLAN_TMODE_ENDPOINT,	 /* unset destination, no learning */
129 };
130 
131 struct vxlan_softc {
132 	struct arpcom		 sc_ac;
133 	struct etherbridge	 sc_eb;
134 
135 	unsigned int		 sc_rdomain;
136 	sa_family_t		 sc_af;
137 	union vxlan_addr	 sc_src;
138 	union vxlan_addr	 sc_dst;
139 	in_port_t		 sc_port;
140 	struct vxlan_header	 sc_header;
141 	unsigned int		 sc_if_index0;
142 
143 	struct task		 sc_dtask;
144 	void			*sc_inmulti;
145 
146 	enum vxlan_tunnel_mode	 sc_mode;
147 	struct vxlan_peer	*sc_ucast_peer;
148 	struct vxlan_peer	*sc_mcast_peer;
149 	struct refcnt		 sc_refs;
150 
151 	uint16_t		 sc_df;
152 	int			 sc_ttl;
153 	int			 sc_txhprio;
154 	int			 sc_rxhprio;
155 
156 	struct task		 sc_send_task;
157 };
158 
159 void		vxlanattach(int);
160 
161 static int	vxlan_clone_create(struct if_clone *, int);
162 static int	vxlan_clone_destroy(struct ifnet *);
163 
164 static int	vxlan_output(struct ifnet *, struct mbuf *,
165 		    struct sockaddr *, struct rtentry *);
166 static int	vxlan_enqueue(struct ifnet *, struct mbuf *);
167 static void	vxlan_start(struct ifqueue *);
168 static void	vxlan_send(void *);
169 
170 static int	vxlan_ioctl(struct ifnet *, u_long, caddr_t);
171 static int	vxlan_up(struct vxlan_softc *);
172 static int	vxlan_down(struct vxlan_softc *);
173 static int	vxlan_addmulti(struct vxlan_softc *, struct ifnet *);
174 static void	vxlan_delmulti(struct vxlan_softc *);
175 
176 static struct mbuf *
177 		vxlan_input(void *, struct mbuf *,
178 		    struct ip *, struct ip6_hdr *, void *, int);
179 
180 static int	vxlan_set_rdomain(struct vxlan_softc *, const struct ifreq *);
181 static int	vxlan_get_rdomain(struct vxlan_softc *, struct ifreq *);
182 static int	vxlan_set_tunnel(struct vxlan_softc *,
183 		    const struct if_laddrreq *);
184 static int	vxlan_get_tunnel(struct vxlan_softc *, struct if_laddrreq *);
185 static int	vxlan_del_tunnel(struct vxlan_softc *);
186 static int	vxlan_set_vnetid(struct vxlan_softc *, const struct ifreq *);
187 static int	vxlan_get_vnetid(struct vxlan_softc *, struct ifreq *);
188 static int	vxlan_del_vnetid(struct vxlan_softc *);
189 static int	vxlan_set_parent(struct vxlan_softc *,
190 		    const struct if_parent *);
191 static int	vxlan_get_parent(struct vxlan_softc *, struct if_parent *);
192 static int	vxlan_del_parent(struct vxlan_softc *);
193 
194 static int	vxlan_add_addr(struct vxlan_softc *, const struct ifbareq *);
195 static int	vxlan_del_addr(struct vxlan_softc *, const struct ifbareq *);
196 
197 static void	vxlan_detach_hook(void *);
198 
199 static struct if_clone vxlan_cloner =
200     IF_CLONE_INITIALIZER("vxlan", vxlan_clone_create, vxlan_clone_destroy);
201 
202 static int	 vxlan_eb_port_eq(void *, void *, void *);
203 static void	*vxlan_eb_port_take(void *, void *);
204 static void	 vxlan_eb_port_rele(void *, void *);
205 static size_t	 vxlan_eb_port_ifname(void *, char *, size_t, void *);
206 static void	 vxlan_eb_port_sa(void *, struct sockaddr_storage *, void *);
207 
208 static const struct etherbridge_ops vxlan_etherbridge_ops = {
209 	vxlan_eb_port_eq,
210 	vxlan_eb_port_take,
211 	vxlan_eb_port_rele,
212 	vxlan_eb_port_ifname,
213 	vxlan_eb_port_sa,
214 };
215 
216 static struct rwlock vxlan_lock = RWLOCK_INITIALIZER("vteps");
217 static struct vxlan_teps vxlan_teps = TAILQ_HEAD_INITIALIZER(vxlan_teps);
218 static struct pool vxlan_endpoint_pool;
219 
220 static inline int	vxlan_peer_cmp(const struct vxlan_peer *,
221 			    const struct vxlan_peer *);
222 
223 RBT_PROTOTYPE(vxlan_peers, vxlan_peer, p_entry, vxlan_peer_cmp);
224 
225 void
226 vxlanattach(int count)
227 {
228 	if_clone_attach(&vxlan_cloner);
229 }
230 
231 static int
232 vxlan_clone_create(struct if_clone *ifc, int unit)
233 {
234 	struct vxlan_softc *sc;
235 	struct ifnet *ifp;
236 	int error;
237 
238 	if (vxlan_endpoint_pool.pr_size == 0) {
239 		pool_init(&vxlan_endpoint_pool, sizeof(union vxlan_addr),
240 		    0, IPL_SOFTNET, 0, "vxlanep", NULL);
241 	}
242 
243 	sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO|M_CANFAIL);
244 	if (sc == NULL)
245 		return (ENOMEM);
246 
247 	ifp = &sc->sc_ac.ac_if;
248 
249 	snprintf(ifp->if_xname, sizeof(ifp->if_xname), "%s%d",
250 	    ifc->ifc_name, unit);
251 
252 	error = etherbridge_init(&sc->sc_eb, ifp->if_xname,
253 	    &vxlan_etherbridge_ops, sc);
254 	if (error != 0) {
255 		free(sc, M_DEVBUF, sizeof(*sc));
256 		return (error);
257 	}
258 
259 	sc->sc_af = AF_UNSPEC;
260 	sc->sc_txhprio = 0;
261 	sc->sc_rxhprio = IF_HDRPRIO_OUTER;
262 	sc->sc_df = 0;
263 	sc->sc_ttl = IP_DEFAULT_MULTICAST_TTL;
264 
265 	task_set(&sc->sc_dtask, vxlan_detach_hook, sc);
266 	refcnt_init(&sc->sc_refs);
267 	task_set(&sc->sc_send_task, vxlan_send, sc);
268 
269 	ifp->if_softc = sc;
270 	ifp->if_hardmtu = ETHER_MAX_HARDMTU_LEN;
271 	ifp->if_ioctl = vxlan_ioctl;
272 	ifp->if_output = vxlan_output;
273 	ifp->if_enqueue = vxlan_enqueue;
274 	ifp->if_qstart = vxlan_start;
275 	ifp->if_flags = IFF_BROADCAST | IFF_MULTICAST | IFF_SIMPLEX;
276 	ifp->if_xflags = IFXF_CLONED | IFXF_MPSAFE;
277 	ether_fakeaddr(ifp);
278 
279 	if_counters_alloc(ifp);
280 	if_attach(ifp);
281 	ether_ifattach(ifp);
282 
283 	return (0);
284 }
285 
286 static int
287 vxlan_clone_destroy(struct ifnet *ifp)
288 {
289 	struct vxlan_softc *sc = ifp->if_softc;
290 
291 	NET_LOCK();
292 	if (ISSET(ifp->if_flags, IFF_RUNNING))
293 		vxlan_down(sc);
294 	NET_UNLOCK();
295 
296 	ether_ifdetach(ifp);
297 	if_detach(ifp);
298 
299 	etherbridge_destroy(&sc->sc_eb);
300 
301 	refcnt_finalize(&sc->sc_refs, "vxlanfini");
302 
303 	free(sc, M_DEVBUF, sizeof(*sc));
304 
305 	return (0);
306 }
307 
308 static struct vxlan_softc *
309 vxlan_take(struct vxlan_softc *sc)
310 {
311 	refcnt_take(&sc->sc_refs);
312 	return (sc);
313 }
314 
315 static void
316 vxlan_rele(struct vxlan_softc *sc)
317 {
318 	refcnt_rele_wake(&sc->sc_refs);
319 }
320 
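/*
 * vxlan_encap: pick the remote endpoint for a frame (the configured
 * destination in P2P mode, otherwise the entry learned for the inner
 * destination MAC, with sc_dst as the flood fallback), prepend the
 * VXLAN and UDP headers, and tag the packet with our ifindex so
 * vxlan_output() can catch encapsulation loops.  The AF specific
 * ip_encap callback adds the outer IP header.
 */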
321 static struct mbuf *
322 vxlan_encap(struct vxlan_softc *sc, struct mbuf *m,
323     struct mbuf *(*ip_encap)(struct vxlan_softc *sc, struct mbuf *,
324     const union vxlan_addr *, uint8_t))
325 {
326 	struct ifnet *ifp = &sc->sc_ac.ac_if;
327 	struct m_tag *mtag;
328 	struct mbuf *m0;
329 	union vxlan_addr gateway;
330 	const union vxlan_addr *endpoint;
331 	struct vxlan_header *vh;
332 	struct udphdr *uh;
333 	int prio;
334 	uint8_t tos;
335 
336 	if (sc->sc_mode == VXLAN_TMODE_UNSET)
337 		goto drop;
338 
339 	if (sc->sc_mode == VXLAN_TMODE_P2P)
340 		endpoint = &sc->sc_dst;
341 	else { /* VXLAN_TMODE_LEARNING || VXLAN_TMODE_ENDPOINT */
342 		struct ether_header *eh = mtod(m, struct ether_header *);
343 
344 		smr_read_enter();
345 		endpoint = etherbridge_resolve_ea(&sc->sc_eb,
346 		    (struct ether_addr *)eh->ether_dhost);
347 		if (endpoint != NULL) {
348 			gateway = *endpoint;
349 			endpoint = &gateway;
350 		}
351 		smr_read_leave();
352 
353 		if (endpoint == NULL) {
354 			if (sc->sc_mode == VXLAN_TMODE_ENDPOINT)
355 				goto drop;
356 
357 			/* "flood" to unknown destinations */
358 			endpoint = &sc->sc_dst;
359 		}
360 	}
361 
362 	/* force prepend mbuf because of payload alignment */
363 	m0 = m_get(M_DONTWAIT, m->m_type);
364 	if (m0 == NULL)
365 		goto drop;
366 
367 	m_align(m0, 0);
368 	m0->m_len = 0;
369 
370 	M_MOVE_PKTHDR(m0, m);
371 	m0->m_next = m;
372 
373 	m = m_prepend(m0, sizeof(*vh), M_DONTWAIT);
374 	if (m == NULL)
375 		return (NULL);
376 
377 	vh = mtod(m, struct vxlan_header *);
378 	*vh = sc->sc_header;
379 
380 	m = m_prepend(m, sizeof(*uh), M_DONTWAIT);
381 	if (m == NULL)
382 		return (NULL);
383 
384 	uh = mtod(m, struct udphdr *);
385 	uh->uh_sport = sc->sc_port; /* XXX */
386 	uh->uh_dport = sc->sc_port;
387 	htobem16(&uh->uh_ulen, m->m_pkthdr.len);
388 	uh->uh_sum = htons(0);
389 
390 	SET(m->m_pkthdr.csum_flags, M_UDP_CSUM_OUT);
391 
392 	mtag = m_tag_get(PACKET_TAG_GRE, sizeof(ifp->if_index), M_NOWAIT);
393 	if (mtag == NULL)
394 		goto drop;
395 
396 	*(int *)(mtag + 1) = ifp->if_index;
397 	m_tag_prepend(m, mtag);
398 
399 	prio = sc->sc_txhprio;
400 	if (prio == IF_HDRPRIO_PACKET)
401 		prio = m->m_pkthdr.pf.prio;
402 	tos = IFQ_PRIO2TOS(prio);
403 
404 	CLR(m->m_flags, M_BCAST|M_MCAST);
405 	m->m_pkthdr.ph_rtableid = sc->sc_rdomain;
406 
407 #if NPF > 0
408 	pf_pkt_addr_changed(m);
409 #endif
410 
411 	return ((*ip_encap)(sc, m, endpoint, tos));
412 drop:
413 	m_freem(m);
414 	return (NULL);
415 }
416 
417 static struct mbuf *
418 vxlan_encap_ipv4(struct vxlan_softc *sc, struct mbuf *m,
419     const union vxlan_addr *endpoint, uint8_t tos)
420 {
421 	struct ip *ip;
422 
423 	m = m_prepend(m, sizeof(*ip), M_DONTWAIT);
424 	if (m == NULL)
425 		return (NULL);
426 
427 	ip = mtod(m, struct ip *);
428 	ip->ip_v = IPVERSION;
429 	ip->ip_hl = sizeof(*ip) >> 2;
430 	ip->ip_off = sc->sc_df;
431 	ip->ip_tos = tos;
432 	ip->ip_len = htons(m->m_pkthdr.len);
433 	ip->ip_ttl = sc->sc_ttl;
434 	ip->ip_p = IPPROTO_UDP;
435 	ip->ip_src = sc->sc_src.in4;
436 	ip->ip_dst = endpoint->in4;
437 
438 	return (m);
439 }
440 
441 #ifdef INET6
442 static struct mbuf *
443 vxlan_encap_ipv6(struct vxlan_softc *sc, struct mbuf *m,
444     const union vxlan_addr *endpoint, uint8_t tos)
445 {
446 	struct ip6_hdr *ip6;
447 	int len = m->m_pkthdr.len;
448 
449 	m = m_prepend(m, sizeof(*ip6), M_DONTWAIT);
450 	if (m == NULL)
451 		return (NULL);
452 
453 	ip6 = mtod(m, struct ip6_hdr *);
454 	ip6->ip6_flow = ISSET(m->m_pkthdr.csum_flags, M_FLOWID) ?
455 	    htonl(m->m_pkthdr.ph_flowid) : 0;
456 	ip6->ip6_vfc |= IPV6_VERSION;
457 	ip6->ip6_flow |= htonl((uint32_t)tos << 20);
458 	ip6->ip6_plen = htons(len);
459 	ip6->ip6_nxt = IPPROTO_UDP;
460 	ip6->ip6_hlim = sc->sc_ttl;
461 	ip6->ip6_src = sc->sc_src.in6;
462 	ip6->ip6_dst = endpoint->in6;
463 
464 	if (sc->sc_df)
465 		SET(m->m_pkthdr.csum_flags, M_IPV6_DF_OUT);
466 
467 	return (m);
468 }
469 #endif /* INET6 */
470 
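/*
 * vxlan_output: refuse frames that already carry our PACKET_TAG_GRE
 * tag, i.e. packets this interface encapsulated that have been routed
 * back into it, before handing them to ether_output().
 */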
471 static int
472 vxlan_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
473     struct rtentry *rt)
474 {
475 	struct m_tag *mtag;
476 
477 	mtag = NULL;
478 	while ((mtag = m_tag_find(m, PACKET_TAG_GRE, mtag)) != NULL) {
479 		if (*(int *)(mtag + 1) == ifp->if_index) {
480 			m_freem(m);
481 			return (EIO);
482 		}
483 	}
484 
485 	return (ether_output(ifp, m, dst, rt));
486 }
487 
488 static int
489 vxlan_enqueue(struct ifnet *ifp, struct mbuf *m)
490 {
491 	struct vxlan_softc *sc = ifp->if_softc;
492 	struct ifqueue *ifq = &ifp->if_snd;
493 
494 	if (ifq_enqueue(ifq, m) != 0)
495 		return (ENOBUFS);
496 
497 	task_add(ifq->ifq_softnet, &sc->sc_send_task);
498 
499 	return (0);
500 }
501 
502 static void
503 vxlan_start(struct ifqueue *ifq)
504 {
505 	struct ifnet *ifp = ifq->ifq_if;
506 	struct vxlan_softc *sc = ifp->if_softc;
507 
508 	task_add(ifq->ifq_softnet, &sc->sc_send_task);
509 }
510 
511 static uint64_t
512 vxlan_send_ipv4(struct vxlan_softc *sc, struct mbuf_list *ml)
513 {
514 	struct ip_moptions imo;
515 	struct mbuf *m;
516 	uint64_t oerrors = 0;
517 
518 	imo.imo_ifidx = sc->sc_if_index0;
519 	imo.imo_ttl = sc->sc_ttl;
520 	imo.imo_loop = 0;
521 
522 	NET_LOCK();
523 	while ((m = ml_dequeue(ml)) != NULL) {
524 		if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &imo, NULL, 0) != 0)
525 			oerrors++;
526 	}
527 	NET_UNLOCK();
528 
529 	return (oerrors);
530 }
531 
532 #ifdef INET6
533 static uint64_t
534 vxlan_send_ipv6(struct vxlan_softc *sc, struct mbuf_list *ml)
535 {
536 	struct ip6_moptions im6o;
537 	struct mbuf *m;
538 	uint64_t oerrors = 0;
539 
540 	im6o.im6o_ifidx = sc->sc_if_index0;
541 	im6o.im6o_hlim = sc->sc_ttl;
542 	im6o.im6o_loop = 0;
543 
544 	NET_LOCK();
545 	while ((m = ml_dequeue(ml)) != NULL) {
546 		if (ip6_output(m, NULL, NULL, 0, &im6o, NULL) != 0)
547 			oerrors++;
548 	}
549 	NET_UNLOCK();
550 
551 	return (oerrors);
552 }
553 #endif /* INET6 */
554 
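/*
 * vxlan_send: runs from the softnet taskq.  It drains the interface
 * send queue, encapsulates each frame with the handler selected by
 * sc_af, and pushes the resulting list through ip_output() or
 * ip6_output() under the net lock.
 */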
555 static void
556 vxlan_send(void *arg)
557 {
558 	struct vxlan_softc *sc = arg;
559 	struct ifnet *ifp = &sc->sc_ac.ac_if;
560 	struct mbuf *(*ip_encap)(struct vxlan_softc *, struct mbuf *,
561 	    const union vxlan_addr *, uint8_t);
562 	uint64_t (*ip_send)(struct vxlan_softc *, struct mbuf_list *);
563 	struct mbuf_list ml = MBUF_LIST_INITIALIZER();
564 	struct mbuf *m;
565 	uint64_t oerrors;
566 
567 	if (!ISSET(ifp->if_flags, IFF_RUNNING))
568 		return;
569 
570 	switch (sc->sc_af) {
571 	case AF_INET:
572 		ip_encap = vxlan_encap_ipv4;
573 		ip_send = vxlan_send_ipv4;
574 		break;
575 #ifdef INET6
576 	case AF_INET6:
577 		ip_encap = vxlan_encap_ipv6;
578 		ip_send = vxlan_send_ipv6;
579 		break;
580 #endif
581 	default:
582 		unhandled_af(sc->sc_af);
583 		/* NOTREACHED */
584 	}
585 
586 	while ((m = ifq_dequeue(&ifp->if_snd)) != NULL) {
587 #if NBPFILTER > 0
588 		caddr_t if_bpf = READ_ONCE(ifp->if_bpf);
589 		if (if_bpf != NULL)
590 			bpf_mtap_ether(if_bpf, m, BPF_DIRECTION_OUT);
591 #endif
592 		m = vxlan_encap(sc, m, ip_encap);
593 		if (m == NULL)
594 			continue;
595 
596 		ml_enqueue(&ml, m);
597 	}
598 
599 	oerrors = (*ip_send)(sc, &ml);
600 
601 	counters_add(ifp->if_counters, ifc_oerrors, oerrors);
602 }
603 
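/*
 * vxlan_input: upcall from the tunnel endpoint UDP socket.  The peer
 * is looked up by outer source address and VNI, with a second lookup
 * against the zeroed wildcard address used outside P2P mode.  The
 * outer headers are stripped, the inner source MAC may be learned in
 * learning mode, and the frame is injected into the interface.
 */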
604 static struct mbuf *
605 vxlan_input(void *arg, struct mbuf *m, struct ip *ip, struct ip6_hdr *ip6,
606     void *uhp, int hlen)
607 {
608 	struct vxlan_tep *vt = arg;
609 	union vxlan_addr addr;
610 	struct vxlan_peer key, *p;
611 	struct udphdr *uh;
612 	struct vxlan_header *vh;
613 	struct ether_header *eh;
614 	int vhlen = hlen + sizeof(*vh);
615 	struct mbuf *n;
616 	int off;
617 	in_port_t port;
618 	struct vxlan_softc *sc = NULL;
619 	struct ifnet *ifp;
620 	int rxhprio;
621 	uint8_t tos;
622 
623 	if (m->m_pkthdr.len < vhlen)
624 		goto drop;
625 
626 	uh = uhp;
627 	port = uh->uh_sport;
628 
629 	if (ip != NULL) {
630 		memset(&addr, 0, sizeof(addr));
631 		addr.in4 = ip->ip_src;
632 		tos = ip->ip_tos;
633 	}
634 #ifdef INET6
635 	else {
636 		addr.in6 = ip6->ip6_src;
637 		tos = bemtoh32(&ip6->ip6_flow) >> 20;
638 	}
639 #endif
640 
641 	if (m->m_len < vhlen) {
642 		m = m_pullup(m, vhlen);
643 		if (m == NULL)
644 			return (NULL);
645 	}
646 
647 	/* can't use ip/ip6/uh after this */
648 
649 	vh = (struct vxlan_header *)(mtod(m, caddr_t) + hlen);
650 
651 	memset(&key, 0, sizeof(key));
652 	key.p_addr = addr;
653 	key.p_header.vxlan_flags = vh->vxlan_flags & htonl(VXLAN_F_I);
654 	key.p_header.vxlan_id = vh->vxlan_id & htonl(VXLAN_VNI_MASK);
655 
656 	mtx_enter(&vt->vt_mtx);
657 	p = RBT_FIND(vxlan_peers, &vt->vt_peers, &key);
658 	if (p == NULL) {
659 		memset(&key.p_addr, 0, sizeof(key.p_addr));
660 		p = RBT_FIND(vxlan_peers, &vt->vt_peers, &key);
661 	}
662 	if (p != NULL)
663 		sc = vxlan_take(p->p_sc);
664 	mtx_leave(&vt->vt_mtx);
665 
666 	if (sc == NULL)
667 		goto drop;
668 
669 	ifp = &sc->sc_ac.ac_if;
670 	if (ISSET(ifp->if_flags, IFF_LINK0) && port != sc->sc_port)
671 		goto rele_drop;
672 
673 	m_adj(m, vhlen);
674 
675 	if (m->m_pkthdr.len < sizeof(*eh))
676 		goto rele_drop;
677 
678 	if (m->m_len < sizeof(*eh)) {
679 		m = m_pullup(m, sizeof(*eh));
680 		if (m == NULL)
681 			goto rele;
682 	}
683 
684 	n = m_getptr(m, sizeof(*eh), &off);
685 	if (n == NULL)
686 		goto rele_drop;
687 
688 	if (!ALIGNED_POINTER(mtod(n, caddr_t) + off, uint32_t)) {
689 		n = m_dup_pkt(m, ETHER_ALIGN, M_NOWAIT);
690 		m_freem(m);
691 		if (n == NULL)
692 			goto rele;
693 		m = n;
694 	}
695 
696 	if (sc->sc_mode == VXLAN_TMODE_LEARNING) {
697 		eh = mtod(m, struct ether_header *);
698 		etherbridge_map_ea(&sc->sc_eb, &addr,
699 		    (struct ether_addr *)eh->ether_shost);
700 	}
701 
702 	rxhprio = sc->sc_rxhprio;
703 	switch (rxhprio) {
704 	case IF_HDRPRIO_PACKET:
705 		/* nop */
706 		break;
707 	case IF_HDRPRIO_OUTER:
708 		m->m_pkthdr.pf.prio = IFQ_TOS2PRIO(tos);
709 		break;
710 	default:
711 		m->m_pkthdr.pf.prio = rxhprio;
712 		break;
713 	}
714 
715 	if_vinput(ifp, m);
716 rele:
717 	vxlan_rele(sc);
718 	return (NULL);
719 
720 rele_drop:
721 	vxlan_rele(sc);
722 drop:
723 	m_freem(m);
724 	return (NULL);
725 }
726 
727 static int
728 vxlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
729 {
730 	struct vxlan_softc *sc = ifp->if_softc;
731 	struct ifreq *ifr = (struct ifreq *)data;
732 	struct ifbrparam *bparam = (struct ifbrparam *)data;
733 	int error = 0;
734 
735 	switch (cmd) {
736 	case SIOCSIFADDR:
737 		break;
738 	case SIOCSIFFLAGS:
739 		if (ISSET(ifp->if_flags, IFF_UP)) {
740 			if (!ISSET(ifp->if_flags, IFF_RUNNING))
741 				error = vxlan_up(sc);
742 			else
743 				error = 0;
744 		} else {
745 			if (ISSET(ifp->if_flags, IFF_RUNNING))
746 				error = vxlan_down(sc);
747 		}
748 		break;
749 
750 	case SIOCSLIFPHYRTABLE:
751 		error = vxlan_set_rdomain(sc, ifr);
752 		break;
753 	case SIOCGLIFPHYRTABLE:
754 		error = vxlan_get_rdomain(sc, ifr);
755 		break;
756 
757 	case SIOCSLIFPHYADDR:
758 		error = vxlan_set_tunnel(sc, (const struct if_laddrreq *)data);
759 		break;
760 	case SIOCGLIFPHYADDR:
761 		error = vxlan_get_tunnel(sc, (struct if_laddrreq *)data);
762 		break;
763 	case SIOCDIFPHYADDR:
764 		error = vxlan_del_tunnel(sc);
765 		break;
766 
767 	case SIOCSVNETID:
768 		error = vxlan_set_vnetid(sc, ifr);
769 		break;
770 	case SIOCGVNETID:
771 		error = vxlan_get_vnetid(sc, ifr);
772 		break;
773 	case SIOCDVNETID:
774 		error = vxlan_del_vnetid(sc);
775 		break;
776 
777 	case SIOCSIFPARENT:
778 		error = vxlan_set_parent(sc, (struct if_parent *)data);
779 		break;
780 	case SIOCGIFPARENT:
781 		error = vxlan_get_parent(sc, (struct if_parent *)data);
782 		break;
783 	case SIOCDIFPARENT:
784 		error = vxlan_del_parent(sc);
785 		break;
786 
787 	case SIOCSTXHPRIO:
788 		error = if_txhprio_l2_check(ifr->ifr_hdrprio);
789 		if (error != 0)
790 			break;
791 
792 		sc->sc_txhprio = ifr->ifr_hdrprio;
793 		break;
794 	case SIOCGTXHPRIO:
795 		ifr->ifr_hdrprio = sc->sc_txhprio;
796 		break;
797 
798 	case SIOCSRXHPRIO:
799 		error = if_rxhprio_l2_check(ifr->ifr_hdrprio);
800 		if (error != 0)
801 			break;
802 
803 		sc->sc_rxhprio = ifr->ifr_hdrprio;
804 		break;
805 	case SIOCGRXHPRIO:
806 		ifr->ifr_hdrprio = sc->sc_rxhprio;
807 		break;
808 
809 	case SIOCSLIFPHYDF:
810 		/* commit */
811 		sc->sc_df = ifr->ifr_df ? htons(IP_DF) : htons(0);
812 		break;
813 	case SIOCGLIFPHYDF:
814 		ifr->ifr_df = sc->sc_df ? 1 : 0;
815 		break;
816 
817 	case SIOCSLIFPHYTTL:
818 		if (ifr->ifr_ttl < 1 || ifr->ifr_ttl > 0xff) {
819 			error = EINVAL;
820 			break;
821 		}
822 
823 		/* commit */
824 		sc->sc_ttl = (uint8_t)ifr->ifr_ttl;
825 		break;
826 	case SIOCGLIFPHYTTL:
827 		ifr->ifr_ttl = (int)sc->sc_ttl;
828 		break;
829 
830 	case SIOCBRDGSCACHE:
831 		error = etherbridge_set_max(&sc->sc_eb, bparam);
832 		break;
833 	case SIOCBRDGGCACHE:
834 		error = etherbridge_get_max(&sc->sc_eb, bparam);
835 		break;
836 	case SIOCBRDGSTO:
837 		error = etherbridge_set_tmo(&sc->sc_eb, bparam);
838 		break;
839 	case SIOCBRDGGTO:
840 		error = etherbridge_get_tmo(&sc->sc_eb, bparam);
841 		break;
842 
843 	case SIOCBRDGRTS:
844 		error = etherbridge_rtfind(&sc->sc_eb,
845 		    (struct ifbaconf *)data);
846 		break;
847 	case SIOCBRDGFLUSH:
848 		etherbridge_flush(&sc->sc_eb,
849 		    ((struct ifbreq *)data)->ifbr_ifsflags);
850 		break;
851 	case SIOCBRDGSADDR:
852 		error = vxlan_add_addr(sc, (struct ifbareq *)data);
853 		break;
854 	case SIOCBRDGDADDR:
855 		error = vxlan_del_addr(sc, (struct ifbareq *)data);
856 		break;
857 
858 	case SIOCADDMULTI:
859 	case SIOCDELMULTI:
860 		/* no hardware to program */
861 		break;
862 
863 	default:
864 		error = ether_ioctl(ifp, &sc->sc_ac, cmd, data);
865 		break;
866 	}
867 
868 	if (error == ENETRESET) {
869 		/* no hardware to program */
870 		error = 0;
871 	}
872 
873 	return (error);
874 }
875 
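/*
 * Tunnel endpoints (vxlan_tep) are shared: one bound UDP socket per
 * rdomain/address family/local address/port, with the interested
 * peers kept in an RB tree under vt_mtx.  The global vxlan_teps list
 * is serialised by vxlan_lock.
 */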
876 static struct vxlan_tep *
877 vxlan_tep_get(struct vxlan_softc *sc, const union vxlan_addr *addr)
878 {
879 	struct vxlan_tep *vt;
880 
881 	TAILQ_FOREACH(vt, &vxlan_teps, vt_entry) {
882 		if (sc->sc_af == vt->vt_af &&
883 		    sc->sc_rdomain == vt->vt_rdomain &&
884 		    memcmp(addr, &vt->vt_addr, sizeof(*addr)) == 0 &&
885 		    sc->sc_port == vt->vt_port)
886 			return (vt);
887 	}
888 
889 	return (NULL);
890 }
891 
892 static int
893 vxlan_tep_add_addr(struct vxlan_softc *sc, const union vxlan_addr *addr,
894     struct vxlan_peer *p)
895 {
896 	struct mbuf m;
897 	struct vxlan_tep *vt;
898 	struct socket *so;
899 	struct sockaddr_in *sin;
900 #ifdef INET6
901 	struct sockaddr_in6 *sin6;
902 #endif
903 	int error;
904 
905 	vt = vxlan_tep_get(sc, addr);
906 	if (vt != NULL) {
907 		struct vxlan_peer *op;
908 
909 		mtx_enter(&vt->vt_mtx);
910 		op = RBT_INSERT(vxlan_peers, &vt->vt_peers, p);
911 		mtx_leave(&vt->vt_mtx);
912 
913 		if (op != NULL)
914 			return (EADDRINUSE);
915 
916 		return (0);
917 	}
918 
919 	vt = malloc(sizeof(*vt), M_DEVBUF, M_NOWAIT|M_ZERO);
920 	if (vt == NULL)
921 		return (ENOMEM);
922 
923 	vt->vt_af = sc->sc_af;
924 	vt->vt_rdomain = sc->sc_rdomain;
925 	vt->vt_addr = *addr;
926 	vt->vt_port = sc->sc_port;
927 
928 	mtx_init(&vt->vt_mtx, IPL_SOFTNET);
929 	RBT_INIT(vxlan_peers, &vt->vt_peers);
930 	RBT_INSERT(vxlan_peers, &vt->vt_peers, p);
931 
932 	error = socreate(vt->vt_af, &so, SOCK_DGRAM, IPPROTO_UDP);
933 	if (error != 0)
934 		goto free;
935 
936 	solock(so);
937 
938 	sotoinpcb(so)->inp_upcall = vxlan_input;
939 	sotoinpcb(so)->inp_upcall_arg = vt;
940 
941 	m_inithdr(&m);
942 	m.m_len = sizeof(vt->vt_rdomain);
943 	*mtod(&m, unsigned int *) = vt->vt_rdomain;
944 	error = sosetopt(so, SOL_SOCKET, SO_RTABLE, &m);
945 	if (error != 0)
946 		goto close;
947 
948 	m_inithdr(&m);
949 	switch (vt->vt_af) {
950 	case AF_INET:
951 		sin = mtod(&m, struct sockaddr_in *);
952 		memset(sin, 0, sizeof(*sin));
953 		sin->sin_len = sizeof(*sin);
954 		sin->sin_family = AF_INET;
955 		sin->sin_addr = addr->in4;
956 		sin->sin_port = vt->vt_port;
957 
958 		m.m_len = sizeof(*sin);
959 		break;
960 
961 #ifdef INET6
962 	case AF_INET6:
963 		sin6 = mtod(&m, struct sockaddr_in6 *);
964 		sin6->sin6_len = sizeof(*sin6);
965 		sin6->sin6_family = AF_INET6;
966 		in6_recoverscope(sin6, &addr->in6);
967 		sin6->sin6_port = sc->sc_port;
968 
969 		m.m_len = sizeof(*sin6);
970 		break;
971 #endif
972 	default:
973 		unhandled_af(vt->vt_af);
974 	}
975 
976 	error = sobind(so, &m, curproc);
977 	if (error != 0)
978 		goto close;
979 
980 	sounlock(so);
981 
982 	rw_assert_wrlock(&vxlan_lock);
983 	TAILQ_INSERT_TAIL(&vxlan_teps, vt, vt_entry);
984 
985 	vt->vt_so = so;
986 
987 	return (0);
988 
989 close:
990 	sounlock(so);
991 	soclose(so, MSG_DONTWAIT);
992 free:
993 	free(vt, M_DEVBUF, sizeof(*vt));
994 	return (error);
995 }
996 
997 static void
998 vxlan_tep_del_addr(struct vxlan_softc *sc, const union vxlan_addr *addr,
999     struct vxlan_peer *p)
1000 {
1001 	struct vxlan_tep *vt;
1002 	int empty;
1003 
1004 	vt = vxlan_tep_get(sc, addr);
1005 	if (vt == NULL)
1006 		panic("unable to find vxlan_tep for peer %p (sc %p)", p, sc);
1007 
1008 	mtx_enter(&vt->vt_mtx);
1009 	RBT_REMOVE(vxlan_peers, &vt->vt_peers, p);
1010 	empty = RBT_EMPTY(vxlan_peers, &vt->vt_peers);
1011 	mtx_leave(&vt->vt_mtx);
1012 
1013 	if (!empty)
1014 		return;
1015 
1016 	rw_assert_wrlock(&vxlan_lock);
1017 	TAILQ_REMOVE(&vxlan_teps, vt, vt_entry);
1018 
1019 	soclose(vt->vt_so, MSG_DONTWAIT);
1020 	free(vt, M_DEVBUF, sizeof(*vt));
1021 }
1022 
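/*
 * vxlan_tep_up: register the unicast peer on the local address, and
 * in learning mode also a wildcard peer on the multicast group so
 * flooded traffic from other tunnel endpoints can be matched back to
 * this softc.
 */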
1023 static int
1024 vxlan_tep_up(struct vxlan_softc *sc)
1025 {
1026 	struct vxlan_peer *up, *mp;
1027 	int error;
1028 
1029 	up = malloc(sizeof(*up), M_DEVBUF, M_NOWAIT|M_ZERO);
1030 	if (up == NULL)
1031 		return (ENOMEM);
1032 
1033 	if (sc->sc_mode == VXLAN_TMODE_P2P)
1034 		up->p_addr = sc->sc_dst;
1035 	up->p_header = sc->sc_header;
1036 	up->p_sc = vxlan_take(sc);
1037 
1038 	error = vxlan_tep_add_addr(sc, &sc->sc_src, up);
1039 	if (error != 0)
1040 		goto freeup;
1041 
1042 	sc->sc_ucast_peer = up;
1043 
1044 	if (sc->sc_mode != VXLAN_TMODE_LEARNING)
1045 		return (0);
1046 
1047 	mp = malloc(sizeof(*mp), M_DEVBUF, M_NOWAIT|M_ZERO);
1048 	if (mp == NULL) {
1049 		error = ENOMEM;
1050 		goto delup;
1051 	}
1052 
1053 	/* addr is multicast, leave it as 0s */
1054 	mp->p_header = sc->sc_header;
1055 	mp->p_sc = vxlan_take(sc);
1056 
1057 	/* destination address is a multicast group we want to join */
1058 	error = vxlan_tep_add_addr(sc, &sc->sc_dst, mp);
1059 	if (error != 0)
1060 		goto freemp;
1061 
1062 	sc->sc_mcast_peer = mp;
1063 
1064 	return (0);
1065 
1066 freemp:
1067 	vxlan_rele(mp->p_sc);
1068 	free(mp, M_DEVBUF, sizeof(*mp));
1069 delup:
1070 	vxlan_tep_del_addr(sc, &sc->sc_src, up);
1071 freeup:
1072 	vxlan_rele(up->p_sc);
1073 	free(up, M_DEVBUF, sizeof(*up));
1074 	return (error);
1075 }
1076 
1077 static void
1078 vxlan_tep_down(struct vxlan_softc *sc)
1079 {
1080 	struct vxlan_peer *up = sc->sc_ucast_peer;
1081 
1082 	if (sc->sc_mode == VXLAN_TMODE_LEARNING) {
1083 		struct vxlan_peer *mp = sc->sc_mcast_peer;
1084 		vxlan_tep_del_addr(sc, &sc->sc_dst, mp);
1085 		vxlan_rele(mp->p_sc);
1086 		free(mp, M_DEVBUF, sizeof(*mp));
1087 	}
1088 
1089 	vxlan_tep_del_addr(sc, &sc->sc_src, up);
1090 	vxlan_rele(up->p_sc);
1091 	free(up, M_DEVBUF, sizeof(*up));
1092 }
1093 
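/*
 * vxlan_up: endpoint setup is serialised by vxlan_lock.  The net lock
 * is dropped while taking it, so IFF_RUNNING is re-checked afterwards
 * in case another thread brought the interface up first.
 */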
1094 static int
1095 vxlan_up(struct vxlan_softc *sc)
1096 {
1097 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1098 	struct ifnet *ifp0 = NULL;
1099 	int error;
1100 
1101 	KASSERT(!ISSET(ifp->if_flags, IFF_RUNNING));
1102 	NET_ASSERT_LOCKED();
1103 
1104 	if (sc->sc_af == AF_UNSPEC)
1105 		return (EDESTADDRREQ);
1106 	KASSERT(sc->sc_mode != VXLAN_TMODE_UNSET);
1107 
1108 	NET_UNLOCK();
1109 
1110 	error = rw_enter(&vxlan_lock, RW_WRITE|RW_INTR);
1111 	if (error != 0)
1112 		goto netlock;
1113 
1114 	NET_LOCK();
1115 	if (ISSET(ifp->if_flags, IFF_RUNNING)) {
1116 		/* something else beat us */
1117 		rw_exit(&vxlan_lock);
1118 		return (0);
1119 	}
1120 	NET_UNLOCK();
1121 
1122 	if (sc->sc_mode != VXLAN_TMODE_P2P) {
1123 		error = etherbridge_up(&sc->sc_eb);
1124 		if (error != 0)
1125 			goto unlock;
1126 	}
1127 
1128 	if (sc->sc_mode == VXLAN_TMODE_LEARNING) {
1129 		ifp0 = if_get(sc->sc_if_index0);
1130 		if (ifp0 == NULL) {
1131 			error = ENXIO;
1132 			goto down;
1133 		}
1134 
1135 		/* check again if multicast will work on top of the parent */
1136 		if (!ISSET(ifp0->if_flags, IFF_MULTICAST)) {
1137 			error = EPROTONOSUPPORT;
1138 			goto put;
1139 		}
1140 
1141 		error = vxlan_addmulti(sc, ifp0);
1142 		if (error != 0)
1143 			goto put;
1144 
1145 		/* Register callback if parent wants to unregister */
1146 		if_detachhook_add(ifp0, &sc->sc_dtask);
1147 	} else {
1148 		if (sc->sc_if_index0 != 0) {
1149 			error = EPROTONOSUPPORT;
1150 			goto down;
1151 		}
1152 	}
1153 
1154 	error = vxlan_tep_up(sc);
1155 	if (error != 0)
1156 		goto del;
1157 
1158 	if_put(ifp0);
1159 
1160 	NET_LOCK();
1161 	SET(ifp->if_flags, IFF_RUNNING);
1162 	rw_exit(&vxlan_lock);
1163 
1164 	return (0);
1165 
1166 del:
1167 	if (sc->sc_mode == VXLAN_TMODE_LEARNING) {
1168 		if (ifp0 != NULL)
1169 			if_detachhook_del(ifp0, &sc->sc_dtask);
1170 		vxlan_delmulti(sc);
1171 	}
1172 put:
1173 	if_put(ifp0);
1174 down:
1175 	if (sc->sc_mode != VXLAN_TMODE_P2P)
1176 		etherbridge_down(&sc->sc_eb);
1177 unlock:
1178 	rw_exit(&vxlan_lock);
1179 netlock:
1180 	NET_LOCK();
1181 
1182 	return (error);
1183 }
1184 
1185 static int
1186 vxlan_down(struct vxlan_softc *sc)
1187 {
1188 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1189 	struct ifnet *ifp0;
1190 	int error;
1191 
1192 	KASSERT(ISSET(ifp->if_flags, IFF_RUNNING));
1193 	NET_UNLOCK();
1194 
1195 	error = rw_enter(&vxlan_lock, RW_WRITE|RW_INTR);
1196 	if (error != 0) {
1197 		NET_LOCK();
1198 		return (error);
1199 	}
1200 
1201 	NET_LOCK();
1202 	if (!ISSET(ifp->if_flags, IFF_RUNNING)) {
1203 		/* something else beat us */
1204 		rw_exit(&vxlan_lock);
1205 		return (0);
1206 	}
1207 	NET_UNLOCK();
1208 
1209 	vxlan_tep_down(sc);
1210 
1211 	if (sc->sc_mode == VXLAN_TMODE_LEARNING) {
1212 		vxlan_delmulti(sc);
1213 		ifp0 = if_get(sc->sc_if_index0);
1214 		if (ifp0 != NULL) {
1215 			if_detachhook_del(ifp0, &sc->sc_dtask);
1216 		}
1217 		if_put(ifp0);
1218 	}
1219 
1220 	if (sc->sc_mode != VXLAN_TMODE_P2P)
1221 		etherbridge_down(&sc->sc_eb);
1222 
1223 	taskq_del_barrier(ifp->if_snd.ifq_softnet, &sc->sc_send_task);
1224 	NET_LOCK();
1225 	CLR(ifp->if_flags, IFF_RUNNING);
1226 	rw_exit(&vxlan_lock);
1227 
1228 	return (0);
1229 }
1230 
1231 static int
1232 vxlan_addmulti(struct vxlan_softc *sc, struct ifnet *ifp0)
1233 {
1234 	int error = 0;
1235 
1236 	NET_LOCK();
1237 
1238 	switch (sc->sc_af) {
1239 	case AF_INET:
1240 		sc->sc_inmulti = in_addmulti(&sc->sc_dst.in4, ifp0);
1241 		if (sc->sc_inmulti == NULL)
1242 			error = EADDRNOTAVAIL;
1243 		break;
1244 #ifdef INET6
1245 	case AF_INET6:
1246 		sc->sc_inmulti = in6_addmulti(&sc->sc_dst.in6, ifp0, &error);
1247 		break;
1248 #endif
1249 	default:
1250 		unhandled_af(sc->sc_af);
1251 	}
1252 
1253 	NET_UNLOCK();
1254 
1255 	return (error);
1256 }
1257 
1258 static void
1259 vxlan_delmulti(struct vxlan_softc *sc)
1260 {
1261 	NET_LOCK();
1262 
1263 	switch (sc->sc_af) {
1264 	case AF_INET:
1265 		in_delmulti(sc->sc_inmulti);
1266 		break;
1267 #ifdef INET6
1268 	case AF_INET6:
1269 		in6_delmulti(sc->sc_inmulti);
1270 		break;
1271 #endif
1272 	default:
1273 		unhandled_af(sc->sc_af);
1274 	}
1275 
1276 	sc->sc_inmulti = NULL; /* keep it tidy */
1277 
1278 	NET_UNLOCK();
1279 }
1280 
1281 static int
1282 vxlan_set_rdomain(struct vxlan_softc *sc, const struct ifreq *ifr)
1283 {
1284 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1285 
1286 	if (ifr->ifr_rdomainid < 0 ||
1287 	    ifr->ifr_rdomainid > RT_TABLEID_MAX)
1288 		return (EINVAL);
1289 	if (!rtable_exists(ifr->ifr_rdomainid))
1290 		return (EADDRNOTAVAIL);
1291 
1292 	if (sc->sc_rdomain == ifr->ifr_rdomainid)
1293 		return (0);
1294 
1295 	if (ISSET(ifp->if_flags, IFF_RUNNING))
1296 		return (EBUSY);
1297 
1298 	/* commit */
1299 	sc->sc_rdomain = ifr->ifr_rdomainid;
1300 	etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);
1301 
1302 	return (0);
1303 }
1304 
1305 static int
1306 vxlan_get_rdomain(struct vxlan_softc *sc, struct ifreq *ifr)
1307 {
1308 	ifr->ifr_rdomainid = sc->sc_rdomain;
1309 
1310 	return (0);
1311 }
1312 
1313 static int
1314 vxlan_set_tunnel(struct vxlan_softc *sc, const struct if_laddrreq *req)
1315 {
1316 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1317 	struct sockaddr *src = (struct sockaddr *)&req->addr;
1318 	struct sockaddr *dst = (struct sockaddr *)&req->dstaddr;
1319 	struct sockaddr_in *src4, *dst4;
1320 #ifdef INET6
1321 	struct sockaddr_in6 *src6, *dst6;
1322 	int error;
1323 #endif
1324 	union vxlan_addr saddr, daddr;
1325 	unsigned int mode = VXLAN_TMODE_ENDPOINT;
1326 	in_port_t port = htons(VXLAN_PORT);
1327 
1328 	memset(&saddr, 0, sizeof(saddr));
1329 	memset(&daddr, 0, sizeof(daddr));
1330 
1331 	/* validate */
1332 	switch (src->sa_family) {
1333 	case AF_INET:
1334 		src4 = (struct sockaddr_in *)src;
1335 		if (in_nullhost(src4->sin_addr) ||
1336 		    IN_MULTICAST(src4->sin_addr.s_addr))
1337 			return (EINVAL);
1338 
1339 		if (src4->sin_port != htons(0))
1340 			port = src4->sin_port;
1341 
1342 		if (dst->sa_family != AF_UNSPEC) {
1343 			if (dst->sa_family != AF_INET)
1344 				return (EINVAL);
1345 
1346 			dst4 = (struct sockaddr_in *)dst;
1347 			if (in_nullhost(dst4->sin_addr))
1348 				return (EINVAL);
1349 
1350 			/* all good */
1351 			mode = IN_MULTICAST(dst4->sin_addr.s_addr) ?
1352 			    VXLAN_TMODE_LEARNING : VXLAN_TMODE_P2P;
1353 			daddr.in4 = dst4->sin_addr;
1354 		}
1355 
1356 		saddr.in4 = src4->sin_addr;
1357 		break;
1358 
1359 #ifdef INET6
1360 	case AF_INET6:
1361 		src6 = (struct sockaddr_in6 *)src;
1362 		if (IN6_IS_ADDR_UNSPECIFIED(&src6->sin6_addr) ||
1363 		    IN6_IS_ADDR_MULTICAST(&src6->sin6_addr))
1364 			return (EINVAL);
1365 
1366 		if (src6->sin6_port != htons(0))
1367 			port = src6->sin6_port;
1368 
1369 		if (dst->sa_family != AF_UNSPEC) {
1370 			if (dst->sa_family != AF_INET6)
1371 				return (EINVAL);
1372 
1373 			dst6 = (struct sockaddr_in6 *)dst;
1374 			if (IN6_IS_ADDR_UNSPECIFIED(&dst6->sin6_addr))
1375 				return (EINVAL);
1376 
1377 			if (src6->sin6_scope_id != dst6->sin6_scope_id)
1378 				return (EINVAL);
1379 
1380 			/* all good */
1381 			mode = IN6_IS_ADDR_MULTICAST(&dst6->sin6_addr) ?
1382 			    VXLAN_TMODE_LEARNING : VXLAN_TMODE_P2P;
1383 			error = in6_embedscope(&daddr.in6, dst6, NULL);
1384 			if (error != 0)
1385 				return (error);
1386 		}
1387 
1388 		error = in6_embedscope(&saddr.in6, src6, NULL);
1389 		if (error != 0)
1390 			return (error);
1391 
1392 		break;
1393 #endif
1394 	default:
1395 		return (EAFNOSUPPORT);
1396 	}
1397 
1398 	if (memcmp(&sc->sc_src, &saddr, sizeof(sc->sc_src)) == 0 &&
1399 	    memcmp(&sc->sc_dst, &daddr, sizeof(sc->sc_dst)) == 0 &&
1400 	    sc->sc_port == port)
1401 		return (0);
1402 
1403 	if (ISSET(ifp->if_flags, IFF_RUNNING))
1404 		return (EBUSY);
1405 
1406 	/* commit */
1407 	sc->sc_af = src->sa_family;
1408 	sc->sc_src = saddr;
1409 	sc->sc_dst = daddr;
1410 	sc->sc_port = port;
1411 	sc->sc_mode = mode;
1412 	etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);
1413 
1414 	return (0);
1415 }
1416 
1417 static int
1418 vxlan_get_tunnel(struct vxlan_softc *sc, struct if_laddrreq *req)
1419 {
1420 	struct sockaddr *dstaddr = (struct sockaddr *)&req->dstaddr;
1421 	struct sockaddr_in *sin;
1422 #ifdef INET6
1423 	struct sockaddr_in6 *sin6;
1424 #endif
1425 
1426 	if (sc->sc_af == AF_UNSPEC)
1427 		return (EADDRNOTAVAIL);
1428 	KASSERT(sc->sc_mode != VXLAN_TMODE_UNSET);
1429 
1430 	memset(&req->addr, 0, sizeof(req->addr));
1431 	memset(&req->dstaddr, 0, sizeof(req->dstaddr));
1432 
1433 	/* default to endpoint */
1434 	dstaddr->sa_len = 2;
1435 	dstaddr->sa_family = AF_UNSPEC;
1436 
1437 	switch (sc->sc_af) {
1438 	case AF_INET:
1439 		sin = (struct sockaddr_in *)&req->addr;
1440 		sin->sin_len = sizeof(*sin);
1441 		sin->sin_family = AF_INET;
1442 		sin->sin_addr = sc->sc_src.in4;
1443 		sin->sin_port = sc->sc_port;
1444 
1445 		if (sc->sc_mode == VXLAN_TMODE_ENDPOINT)
1446 			break;
1447 
1448 		sin = (struct sockaddr_in *)&req->dstaddr;
1449 		sin->sin_len = sizeof(*sin);
1450 		sin->sin_family = AF_INET;
1451 		sin->sin_addr = sc->sc_dst.in4;
1452 		break;
1453 
1454 #ifdef INET6
1455 	case AF_INET6:
1456 		sin6 = (struct sockaddr_in6 *)&req->addr;
1457 		sin6->sin6_len = sizeof(*sin6);
1458 		sin6->sin6_family = AF_INET6;
1459 		in6_recoverscope(sin6, &sc->sc_src.in6);
1460 		sin6->sin6_port = sc->sc_port;
1461 
1462 		if (sc->sc_mode == VXLAN_TMODE_ENDPOINT)
1463 			break;
1464 
1465 		sin6 = (struct sockaddr_in6 *)&req->dstaddr;
1466 		sin6->sin6_len = sizeof(*sin6);
1467 		sin6->sin6_family = AF_INET6;
1468 		in6_recoverscope(sin6, &sc->sc_dst.in6);
1469 		break;
1470 #endif
1471 	default:
1472 		unhandled_af(sc->sc_af);
1473 	}
1474 
1475 	return (0);
1476 }
1477 
1478 static int
1479 vxlan_del_tunnel(struct vxlan_softc *sc)
1480 {
1481 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1482 
1483 	if (sc->sc_af == AF_UNSPEC)
1484 		return (0);
1485 
1486 	if (ISSET(ifp->if_flags, IFF_RUNNING))
1487 		return (EBUSY);
1488 
1489 	/* commit */
1490 	sc->sc_af = AF_UNSPEC;
1491 	memset(&sc->sc_src, 0, sizeof(sc->sc_src));
1492 	memset(&sc->sc_dst, 0, sizeof(sc->sc_dst));
1493 	sc->sc_port = htons(0);
1494 	sc->sc_mode = VXLAN_TMODE_UNSET;
1495 	etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);
1496 
1497 	return (0);
1498 }
1499 
1500 static int
1501 vxlan_set_vnetid(struct vxlan_softc *sc, const struct ifreq *ifr)
1502 {
1503 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1504 	uint32_t vni;
1505 
1506 	if (ifr->ifr_vnetid < VXLAN_VNI_MIN ||
1507 	    ifr->ifr_vnetid > VXLAN_VNI_MAX)
1508 		return (EINVAL);
1509 
1510 	vni = htonl(ifr->ifr_vnetid << VXLAN_VNI_SHIFT);
1511 	if (ISSET(sc->sc_header.vxlan_flags, htonl(VXLAN_F_I)) &&
1512 	    sc->sc_header.vxlan_id == vni)
1513 		return (0);
1514 
1515 	if (ISSET(ifp->if_flags, IFF_RUNNING))
1516 		return (EBUSY);
1517 
1518 	/* commit */
1519 	SET(sc->sc_header.vxlan_flags, htonl(VXLAN_F_I));
1520 	sc->sc_header.vxlan_id = vni;
1521 	etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);
1522 
1523 	return (0);
1524 }
1525 
1526 static int
1527 vxlan_get_vnetid(struct vxlan_softc *sc, struct ifreq *ifr)
1528 {
1529 	uint32_t vni;
1530 
1531 	if (!ISSET(sc->sc_header.vxlan_flags, htonl(VXLAN_F_I)))
1532 		return (EADDRNOTAVAIL);
1533 
1534 	vni = ntohl(sc->sc_header.vxlan_id);
1535 	vni &= VXLAN_VNI_MASK;
1536 	vni >>= VXLAN_VNI_SHIFT;
1537 
1538 	ifr->ifr_vnetid = vni;
1539 
1540 	return (0);
1541 }
1542 
1543 static int
1544 vxlan_del_vnetid(struct vxlan_softc *sc)
1545 {
1546 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1547 
1548 	if (!ISSET(sc->sc_header.vxlan_flags, htonl(VXLAN_F_I)))
1549 		return (0);
1550 
1551 	if (ISSET(ifp->if_flags, IFF_RUNNING))
1552 		return (EBUSY);
1553 
1554 	/* commit */
1555 	CLR(sc->sc_header.vxlan_flags, htonl(VXLAN_F_I));
1556 	sc->sc_header.vxlan_id = htonl(0 << VXLAN_VNI_SHIFT);
1557 	etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);
1558 
1559 	return (0);
1560 }
1561 
1562 static int
1563 vxlan_set_parent(struct vxlan_softc *sc, const struct if_parent *p)
1564 {
1565 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1566 	struct ifnet *ifp0;
1567 	int error = 0;
1568 
1569 	ifp0 = if_unit(p->ifp_parent);
1570 	if (ifp0 == NULL)
1571 		return (ENXIO);
1572 
1573 	if (!ISSET(ifp0->if_flags, IFF_MULTICAST)) {
1574 		error = ENXIO;
1575 		goto put;
1576 	}
1577 
1578 	if (sc->sc_if_index0 == ifp0->if_index)
1579 		goto put;
1580 
1581 	if (ISSET(ifp->if_flags, IFF_RUNNING)) {
1582 		error = EBUSY;
1583 		goto put;
1584 	}
1585 
1586 	/* commit */
1587 	sc->sc_if_index0 = ifp0->if_index;
1588 	etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);
1589 
1590 put:
1591 	if_put(ifp0);
1592 	return (error);
1593 }
1594 
1595 static int
1596 vxlan_get_parent(struct vxlan_softc *sc, struct if_parent *p)
1597 {
1598 	struct ifnet *ifp0;
1599 	int error = 0;
1600 
1601 	ifp0 = if_get(sc->sc_if_index0);
1602 	if (ifp0 == NULL)
1603 		error = EADDRNOTAVAIL;
1604 	else
1605 		strlcpy(p->ifp_parent, ifp0->if_xname, sizeof(p->ifp_parent));
1606 	if_put(ifp0);
1607 
1608 	return (error);
1609 }
1610 
1611 static int
1612 vxlan_del_parent(struct vxlan_softc *sc)
1613 {
1614 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1615 
1616 	if (sc->sc_if_index0 == 0)
1617 		return (0);
1618 
1619 	if (ISSET(ifp->if_flags, IFF_RUNNING))
1620 		return (EBUSY);
1621 
1622 	/* commit */
1623 	sc->sc_if_index0 = 0;
1624 	etherbridge_flush(&sc->sc_eb, IFBF_FLUSHALL);
1625 
1626 	return (0);
1627 }
1628 
1629 static int
1630 vxlan_add_addr(struct vxlan_softc *sc, const struct ifbareq *ifba)
1631 {
1632 	struct sockaddr_in *sin;
1633 #ifdef INET6
1634 	struct sockaddr_in6 *sin6;
1635 	struct sockaddr_in6 src6 = {
1636 		.sin6_len = sizeof(src6),
1637 		.sin6_family = AF_UNSPEC,
1638 	};
1639 	int error;
1640 #endif
1641 	union vxlan_addr endpoint;
1642 	unsigned int type;
1643 
1644 	switch (sc->sc_mode) {
1645 	case VXLAN_TMODE_UNSET:
1646 		return (ENOPROTOOPT);
1647 	case VXLAN_TMODE_P2P:
1648 		return (EPROTONOSUPPORT);
1649 	default:
1650 		break;
1651 	}
1652 
1653 	/* ignore ifba_ifsname */
1654 
1655 	if (ISSET(ifba->ifba_flags, ~IFBAF_TYPEMASK))
1656 		return (EINVAL);
1657 	switch (ifba->ifba_flags & IFBAF_TYPEMASK) {
1658 	case IFBAF_DYNAMIC:
1659 		type = EBE_DYNAMIC;
1660 		break;
1661 	case IFBAF_STATIC:
1662 		type = EBE_STATIC;
1663 		break;
1664 	default:
1665 		return (EINVAL);
1666 	}
1667 
1668 	memset(&endpoint, 0, sizeof(endpoint));
1669 
1670 	if (ifba->ifba_dstsa.ss_family != sc->sc_af)
1671 		return (EAFNOSUPPORT);
1672 	switch (ifba->ifba_dstsa.ss_family) {
1673 	case AF_INET:
1674 		sin = (struct sockaddr_in *)&ifba->ifba_dstsa;
1675 		if (in_nullhost(sin->sin_addr) ||
1676 		    IN_MULTICAST(sin->sin_addr.s_addr))
1677 			return (EADDRNOTAVAIL);
1678 
1679 		if (sin->sin_port != htons(0))
1680 			return (EADDRNOTAVAIL);
1681 
1682 		endpoint.in4 = sin->sin_addr;
1683 		break;
1684 
1685 #ifdef INET6
1686 	case AF_INET6:
1687 		sin6 = (struct sockaddr_in6 *)&ifba->ifba_dstsa;
1688 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
1689 		    IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
1690 			return (EADDRNOTAVAIL);
1691 
1692 		in6_recoverscope(&src6, &sc->sc_src.in6);
1693 		if (src6.sin6_scope_id != sin6->sin6_scope_id)
1694 			return (EADDRNOTAVAIL);
1695 
1696 		if (sin6->sin6_port != htons(0))
1697 			return (EADDRNOTAVAIL);
1698 
1699 		error = in6_embedscope(&endpoint.in6, sin6, NULL);
1700 		if (error != 0)
1701 			return (error);
1702 
1703 		break;
1704 #endif
1705 	default: /* AF_UNSPEC */
1706 		return (EADDRNOTAVAIL);
1707 	}
1708 
1709 	return (etherbridge_add_addr(&sc->sc_eb, &endpoint,
1710 	    &ifba->ifba_dst, type));
1711 }
1712 
1713 static int
1714 vxlan_del_addr(struct vxlan_softc *sc, const struct ifbareq *ifba)
1715 {
1716 	return (etherbridge_del_addr(&sc->sc_eb, &ifba->ifba_dst));
1717 }
1718 
1719 static void
1720 vxlan_detach_hook(void *arg)
1721 {
1722 	struct vxlan_softc *sc = arg;
1723 	struct ifnet *ifp = &sc->sc_ac.ac_if;
1724 
1725 	if (ISSET(ifp->if_flags, IFF_RUNNING)) {
1726 		vxlan_down(sc);
1727 		CLR(ifp->if_flags, IFF_UP);
1728 	}
1729 
1730 	sc->sc_if_index0 = 0;
1731 }
1732 
1733 static int
1734 vxlan_eb_port_eq(void *arg, void *a, void *b)
1735 {
1736 	const union vxlan_addr *va = a, *vb = b;
1737 	size_t i;
1738 
1739 	for (i = 0; i < nitems(va->in6.s6_addr32); i++) {
1740 		if (va->in6.s6_addr32[i] != vb->in6.s6_addr32[i])
1741 			return (0);
1742 	}
1743 
1744 	return (1);
1745 }
1746 
1747 static void *
1748 vxlan_eb_port_take(void *arg, void *port)
1749 {
1750 	union vxlan_addr *endpoint;
1751 
1752 	endpoint = pool_get(&vxlan_endpoint_pool, PR_NOWAIT);
1753 	if (endpoint == NULL)
1754 		return (NULL);
1755 
1756 	*endpoint = *(union vxlan_addr *)port;
1757 
1758 	return (endpoint);
1759 }
1760 
1761 static void
1762 vxlan_eb_port_rele(void *arg, void *port)
1763 {
1764 	union vxlan_addr *endpoint = port;
1765 
1766 	pool_put(&vxlan_endpoint_pool, endpoint);
1767 }
1768 
1769 static size_t
1770 vxlan_eb_port_ifname(void *arg, char *dst, size_t len, void *port)
1771 {
1772 	struct vxlan_softc *sc = arg;
1773 
1774 	return (strlcpy(dst, sc->sc_ac.ac_if.if_xname, len));
1775 }
1776 
1777 static void
1778 vxlan_eb_port_sa(void *arg, struct sockaddr_storage *ss, void *port)
1779 {
1780 	struct vxlan_softc *sc = arg;
1781 	union vxlan_addr *endpoint = port;
1782 
1783 	switch (sc->sc_af) {
1784 	case AF_INET: {
1785 		struct sockaddr_in *sin = (struct sockaddr_in *)ss;
1786 
1787 		sin->sin_len = sizeof(*sin);
1788 		sin->sin_family = AF_INET;
1789 		sin->sin_addr = endpoint->in4;
1790 		break;
1791 	}
1792 #ifdef INET6
1793 	case AF_INET6: {
1794 		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)ss;
1795 
1796 		sin6->sin6_len = sizeof(*sin6);
1797 		sin6->sin6_family = AF_INET6;
1798 		in6_recoverscope(sin6, &endpoint->in6);
1799 		break;
1800 	}
1801 #endif /* INET6 */
1802 	default:
1803 		unhandled_af(sc->sc_af);
1804 	}
1805 }
1806 
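/*
 * RB tree ordering for peers: compare the VNI, then the header flags,
 * then the source address (always as an IPv6-sized value), so
 * vxlan_input() can match either an exact source or the zeroed
 * wildcard entry.
 */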
1807 static inline int
1808 vxlan_peer_cmp(const struct vxlan_peer *ap, const struct vxlan_peer *bp)
1809 {
1810 	size_t i;
1811 
1812 	if (ap->p_header.vxlan_id > bp->p_header.vxlan_id)
1813 		return (1);
1814 	if (ap->p_header.vxlan_id < bp->p_header.vxlan_id)
1815 		return (-1);
1816 	if (ap->p_header.vxlan_flags > bp->p_header.vxlan_flags)
1817 		return (1);
1818 	if (ap->p_header.vxlan_flags < bp->p_header.vxlan_flags)
1819 		return (-1);
1820 
1821 	for (i = 0; i < nitems(ap->p_addr.in6.s6_addr32); i++) {
1822 		if (ap->p_addr.in6.s6_addr32[i] >
1823 		    bp->p_addr.in6.s6_addr32[i])
1824 			return (1);
1825 		if (ap->p_addr.in6.s6_addr32[i] <
1826 		    bp->p_addr.in6.s6_addr32[i])
1827 			return (-1);
1828 	}
1829 
1830 	return (0);
1831 }
1832 
1833 RBT_GENERATE(vxlan_peers, vxlan_peer, p_entry, vxlan_peer_cmp);
1834