xref: /openbsd-src/sys/net/if_vxlan.c (revision 46035553bfdd96e63c94e32da0210227ec2e3cf1)
1 /*	$OpenBSD: if_vxlan.c,v 1.81 2020/08/21 22:59:27 kn Exp $	*/
2 
3 /*
4  * Copyright (c) 2013 Reyk Floeter <reyk@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include "bpfilter.h"
20 #include "vxlan.h"
21 #include "vlan.h"
22 #include "pf.h"
23 #include "bridge.h"
24 
25 #include <sys/param.h>
26 #include <sys/systm.h>
27 #include <sys/mbuf.h>
28 #include <sys/socket.h>
29 #include <sys/sockio.h>
30 #include <sys/ioctl.h>
31 
32 #include <net/if.h>
33 #include <net/if_var.h>
34 #include <net/if_media.h>
35 #include <net/route.h>
36 
37 #if NBPFILTER > 0
38 #include <net/bpf.h>
39 #endif
40 
41 #include <netinet/in.h>
42 #include <netinet/in_var.h>
43 #include <netinet/if_ether.h>
44 #include <netinet/ip.h>
45 #include <netinet/ip_var.h>
46 #include <netinet/udp.h>
47 #include <netinet/udp_var.h>
48 #include <netinet/in_pcb.h>
49 
50 #if NPF > 0
51 #include <net/pfvar.h>
52 #endif
53 
54 #if NBRIDGE > 0
55 #include <net/if_bridge.h>
56 #endif
57 
58 #include <net/if_vxlan.h>
59 
/*
 * Per-interface state of a vxlan(4) tunnel interface.
 */
struct vxlan_softc {
	/* Ethernet common state; sc_ac.ac_if is the interface itself */
	struct arpcom		 sc_ac;
	/* media list, set up in vxlan_clone_create() */
	struct ifmedia		 sc_media;

	/* multicast options, handed to ip_output() for IPv4 tunnels */
	struct ip_moptions	 sc_imo;
	/*
	 * Hook tasks registered on the multicast parent interface:
	 * address change, link state change and detach.
	 */
	struct task		 sc_atask;
	struct task		 sc_ltask;
	struct task		 sc_dtask;

	/* tunnel endpoints; ss_family is AF_UNSPEC while unconfigured */
	struct sockaddr_storage	 sc_src;
	struct sockaddr_storage	 sc_dst;
	/* UDP port in network byte order, used as source and destination */
	in_port_t		 sc_dstport;
	/* routing domain used for the encapsulated traffic */
	u_int			 sc_rdomain;
	/* VNI, or VXLAN_VNI_UNSET / VXLAN_VNI_ANY */
	int64_t			 sc_vnetid;
	/* htons(IP_DF) or 0; copied into ip_off for IPv4 */
	uint16_t		 sc_df;
	/* outer TTL / hop limit; 0 selects the protocol default */
	u_int8_t		 sc_ttl;
	/* outer header priority, or IF_HDRPRIO_PACKET */
	int			 sc_txhprio;

	/* runs vxlan_send_dispatch() on the interface's net taskq */
	struct task		 sc_sendtask;

	/* linkage on a vxlan_tagh bucket or on the vxlan_any list */
	LIST_ENTRY(vxlan_softc)	 sc_entry;
};
82 
/* Interface lifecycle, configuration and transmit entry points. */
void	 vxlanattach(int);
int	 vxlanioctl(struct ifnet *, u_long, caddr_t);
void	 vxlanstart(struct ifnet *);
int	 vxlan_clone_create(struct if_clone *, int);
int	 vxlan_clone_destroy(struct ifnet *);
void	 vxlan_multicast_cleanup(struct ifnet *);
int	 vxlan_multicast_join(struct ifnet *, struct sockaddr *,
	    struct sockaddr *);
int	 vxlan_media_change(struct ifnet *);
void	 vxlan_media_status(struct ifnet *, struct ifmediareq *);
int	 vxlan_config(struct ifnet *, struct sockaddr *, struct sockaddr *);
int	 vxlan_output(struct ifnet *, struct mbuf *);
/* Task callbacks; the argument is the struct vxlan_softc. */
void	 vxlan_addr_change(void *);
void	 vxlan_if_change(void *);
void	 vxlan_link_change(void *);
void	 vxlan_send_dispatch(void *);

/* Socket address helpers. */
int	 vxlan_sockaddr_cmp(struct sockaddr *, struct sockaddr *);
uint16_t vxlan_sockaddr_port(struct sockaddr *);

/* Cloner that creates and destroys "vxlanN" interfaces. */
struct if_clone	vxlan_cloner =
    IF_CLONE_INITIALIZER("vxlan", vxlan_clone_create, vxlan_clone_destroy);

/* Number of existing vxlan interfaces (create increments, destroy decrements). */
int	 vxlan_enable = 0;
/* Hash mask filled in by hashinit() for the vxlan_tagh table. */
u_long	 vxlan_tagmask;
108 
/* Number of buckets requested from hashinit() for the VNI hash table. */
#define VXLAN_TAGHASHSIZE		 32
/*
 * Map a VNI to its bucket index in vxlan_tagh.  The argument is
 * parenthesized so that compound expressions are hashed as a whole.
 */
#define VXLAN_TAGHASH(tag)		 ((unsigned int)(tag) & vxlan_tagmask)
/*
 * vxlan_tagh: hash table of interfaces keyed by VNI.
 * vxlan_any:  list of interfaces that accept any VNI.
 */
LIST_HEAD(vxlan_taghash, vxlan_softc)	*vxlan_tagh, vxlan_any;
112 
113 void
114 vxlanattach(int count)
115 {
116 	/* Regular vxlan interfaces with a VNI */
117 	if ((vxlan_tagh = hashinit(VXLAN_TAGHASHSIZE, M_DEVBUF, M_NOWAIT,
118 	    &vxlan_tagmask)) == NULL)
119 		panic("vxlanattach: hashinit");
120 
121 	/* multipoint-to-multipoint interfaces that accept any VNI */
122 	LIST_INIT(&vxlan_any);
123 
124 	if_clone_attach(&vxlan_cloner);
125 }
126 
127 int
128 vxlan_clone_create(struct if_clone *ifc, int unit)
129 {
130 	struct ifnet		*ifp;
131 	struct vxlan_softc	*sc;
132 
133 	sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO);
134 	sc->sc_imo.imo_membership = mallocarray(IP_MIN_MEMBERSHIPS,
135 	    sizeof(struct in_multi *), M_IPMOPTS, M_WAITOK|M_ZERO);
136 	sc->sc_imo.imo_max_memberships = IP_MIN_MEMBERSHIPS;
137 	sc->sc_dstport = htons(VXLAN_PORT);
138 	sc->sc_vnetid = VXLAN_VNI_UNSET;
139 	sc->sc_txhprio = IFQ_TOS2PRIO(IPTOS_PREC_ROUTINE); /* 0 */
140 	sc->sc_df = htons(0);
141 	task_set(&sc->sc_atask, vxlan_addr_change, sc);
142 	task_set(&sc->sc_ltask, vxlan_link_change, sc);
143 	task_set(&sc->sc_dtask, vxlan_if_change, sc);
144 	task_set(&sc->sc_sendtask, vxlan_send_dispatch, sc);
145 
146 	ifp = &sc->sc_ac.ac_if;
147 	snprintf(ifp->if_xname, sizeof ifp->if_xname, "vxlan%d", unit);
148 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
149 	ether_fakeaddr(ifp);
150 
151 	ifp->if_softc = sc;
152 	ifp->if_ioctl = vxlanioctl;
153 	ifp->if_start = vxlanstart;
154 
155 	ifp->if_hardmtu = ETHER_MAX_HARDMTU_LEN;
156 	ifp->if_capabilities = IFCAP_VLAN_MTU;
157 	ifp->if_xflags = IFXF_CLONED;
158 
159 	ifmedia_init(&sc->sc_media, 0, vxlan_media_change,
160 	    vxlan_media_status);
161 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_AUTO, 0, NULL);
162 	ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_AUTO);
163 
164 	if_counters_alloc(ifp);
165 	if_attach(ifp);
166 	ether_ifattach(ifp);
167 
168 #if 0
169 	/*
170 	 * Instead of using a decreased MTU of 1450 bytes, prefer
171 	 * to use the default Ethernet-size MTU of 1500 bytes and to
172 	 * increase the MTU of the outer transport interfaces to
173 	 * at least 1550 bytes. The following is disabled by default.
174 	 */
175 	ifp->if_mtu = ETHERMTU - sizeof(struct ether_header);
176 	ifp->if_mtu -= sizeof(struct vxlanudphdr) + sizeof(struct ipovly);
177 #endif
178 
179 	LIST_INSERT_HEAD(&vxlan_tagh[VXLAN_TAGHASH(0)], sc, sc_entry);
180 	vxlan_enable++;
181 
182 	return (0);
183 }
184 
185 int
186 vxlan_clone_destroy(struct ifnet *ifp)
187 {
188 	struct vxlan_softc	*sc = ifp->if_softc;
189 
190 	NET_LOCK();
191 	vxlan_multicast_cleanup(ifp);
192 	NET_UNLOCK();
193 
194 	vxlan_enable--;
195 	LIST_REMOVE(sc, sc_entry);
196 
197 	ifmedia_delete_instance(&sc->sc_media, IFM_INST_ANY);
198 	ether_ifdetach(ifp);
199 	if_detach(ifp);
200 
201 	if (!task_del(net_tq(ifp->if_index), &sc->sc_sendtask))
202 		taskq_barrier(net_tq(ifp->if_index));
203 
204 	free(sc->sc_imo.imo_membership, M_IPMOPTS,
205 	    sc->sc_imo.imo_max_memberships * sizeof(struct in_multi *));
206 	free(sc, M_DEVBUF, sizeof(*sc));
207 
208 	return (0);
209 }
210 
211 void
212 vxlan_multicast_cleanup(struct ifnet *ifp)
213 {
214 	struct vxlan_softc	*sc = (struct vxlan_softc *)ifp->if_softc;
215 	struct ip_moptions	*imo = &sc->sc_imo;
216 	struct ifnet		*mifp;
217 
218 	mifp = if_get(imo->imo_ifidx);
219 	if (mifp != NULL) {
220 		if_addrhook_del(mifp, &sc->sc_atask);
221 		if_linkstatehook_del(mifp, &sc->sc_ltask);
222 		if_detachhook_del(mifp, &sc->sc_dtask);
223 
224 		if_put(mifp);
225 	}
226 
227 	if (imo->imo_num_memberships > 0) {
228 		in_delmulti(imo->imo_membership[--imo->imo_num_memberships]);
229 		imo->imo_ifidx = 0;
230 	}
231 }
232 
233 int
234 vxlan_multicast_join(struct ifnet *ifp, struct sockaddr *src,
235     struct sockaddr *dst)
236 {
237 	struct vxlan_softc	*sc = ifp->if_softc;
238 	struct ip_moptions	*imo = &sc->sc_imo;
239 	struct sockaddr_in	*src4, *dst4;
240 #ifdef INET6
241 	struct sockaddr_in6	*dst6;
242 #endif /* INET6 */
243 	struct ifaddr		*ifa;
244 	struct ifnet		*mifp;
245 
246 	switch (dst->sa_family) {
247 	case AF_INET:
248 		dst4 = satosin(dst);
249 		if (!IN_MULTICAST(dst4->sin_addr.s_addr))
250 			return (0);
251 		break;
252 #ifdef INET6
253 	case AF_INET6:
254 		dst6 = satosin6(dst);
255 		if (!IN6_IS_ADDR_MULTICAST(&dst6->sin6_addr))
256 			return (0);
257 
258 		/* Multicast mode is currently not supported for IPv6 */
259 		return (EAFNOSUPPORT);
260 #endif /* INET6 */
261 	default:
262 		return (EAFNOSUPPORT);
263 	}
264 
265 	src4 = satosin(src);
266 	dst4 = satosin(dst);
267 
268 	if (src4->sin_addr.s_addr == INADDR_ANY ||
269 	    IN_MULTICAST(src4->sin_addr.s_addr))
270 		return (EINVAL);
271 	if ((ifa = ifa_ifwithaddr(src, sc->sc_rdomain)) == NULL ||
272 	    (mifp = ifa->ifa_ifp) == NULL ||
273 	    (mifp->if_flags & IFF_MULTICAST) == 0)
274 		return (EADDRNOTAVAIL);
275 
276 	if ((imo->imo_membership[0] =
277 	    in_addmulti(&dst4->sin_addr, mifp)) == NULL)
278 		return (ENOBUFS);
279 
280 	imo->imo_num_memberships++;
281 	imo->imo_ifidx = mifp->if_index;
282 	if (sc->sc_ttl > 0)
283 		imo->imo_ttl = sc->sc_ttl;
284 	else
285 		imo->imo_ttl = IP_DEFAULT_MULTICAST_TTL;
286 	imo->imo_loop = 0;
287 
288 	/*
289 	 * Use interface hooks to track any changes on the interface
290 	 * that is used to send out the tunnel traffic as multicast.
291 	 */
292 	if_addrhook_add(mifp, &sc->sc_atask);
293 	if_linkstatehook_add(mifp, &sc->sc_ltask);
294 	if_detachhook_add(mifp, &sc->sc_dtask);
295 
296 	return (0);
297 }
298 
299 void
300 vxlanstart(struct ifnet *ifp)
301 {
302 	struct vxlan_softc	*sc = (struct vxlan_softc *)ifp->if_softc;
303 
304 	task_add(net_tq(ifp->if_index), &sc->sc_sendtask);
305 }
306 
307 void
308 vxlan_send_dispatch(void *xsc)
309 {
310 	struct vxlan_softc	*sc = xsc;
311 	struct ifnet		*ifp = &sc->sc_ac.ac_if;
312 	struct mbuf		*m;
313 	struct mbuf_list	 ml;
314 
315 	ml_init(&ml);
316 	for (;;) {
317 		m = ifq_dequeue(&ifp->if_snd);
318 		if (m == NULL)
319 			break;
320 
321 #if NBPFILTER > 0
322 		if (ifp->if_bpf)
323 			bpf_mtap(ifp->if_bpf, m, BPF_DIRECTION_OUT);
324 #endif
325 
326 		ml_enqueue(&ml, m);
327 	}
328 
329 	if (ml_empty(&ml))
330 		return;
331 
332 	NET_LOCK();
333 	while ((m = ml_dequeue(&ml)) != NULL) {
334 		vxlan_output(ifp, m);
335 	}
336 	NET_UNLOCK();
337 }
338 
339 
340 int
341 vxlan_config(struct ifnet *ifp, struct sockaddr *src, struct sockaddr *dst)
342 {
343 	struct vxlan_softc	*sc = (struct vxlan_softc *)ifp->if_softc;
344 	int			 reset = 0, error, af;
345 	socklen_t		 slen;
346 	in_port_t		 port;
347 	struct vxlan_taghash	*tagh;
348 
349 	if (src != NULL && dst != NULL) {
350 		if ((af = src->sa_family) != dst->sa_family)
351 			return (EAFNOSUPPORT);
352 	} else {
353 		/* Reset current configuration */
354 		af = sc->sc_src.ss_family;
355 		src = sstosa(&sc->sc_src);
356 		dst = sstosa(&sc->sc_dst);
357 		reset = 1;
358 	}
359 
360 	switch (af) {
361 	case AF_INET:
362 		slen = sizeof(struct sockaddr_in);
363 		break;
364 #ifdef INET6
365 	case AF_INET6:
366 		slen = sizeof(struct sockaddr_in6);
367 		break;
368 #endif /* INET6 */
369 	default:
370 		return (EAFNOSUPPORT);
371 	}
372 
373 	if (src->sa_len != slen || dst->sa_len != slen)
374 		return (EINVAL);
375 
376 	vxlan_multicast_cleanup(ifp);
377 
378 	/* returns without error if multicast is not configured */
379 	if ((error = vxlan_multicast_join(ifp, src, dst)) != 0)
380 		return (error);
381 
382 	if ((port = vxlan_sockaddr_port(dst)) != 0)
383 		sc->sc_dstport = port;
384 
385 	if (!reset) {
386 		bzero(&sc->sc_src, sizeof(sc->sc_src));
387 		bzero(&sc->sc_dst, sizeof(sc->sc_dst));
388 		memcpy(&sc->sc_src, src, src->sa_len);
389 		memcpy(&sc->sc_dst, dst, dst->sa_len);
390 	}
391 
392 	if (sc->sc_vnetid == VXLAN_VNI_ANY) {
393 		/*
394 		 * If the interface accepts any VNI, put it into a separate
395 		 * list that is not part of the main hash.
396 		 */
397 		tagh = &vxlan_any;
398 	} else
399 		tagh = &vxlan_tagh[VXLAN_TAGHASH(sc->sc_vnetid)];
400 
401 	LIST_REMOVE(sc, sc_entry);
402 	LIST_INSERT_HEAD(tagh, sc, sc_entry);
403 
404 	return (0);
405 }
406 
407 int
408 vxlanioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
409 {
410 	struct vxlan_softc	*sc = (struct vxlan_softc *)ifp->if_softc;
411 	struct ifreq		*ifr = (struct ifreq *)data;
412 	struct if_laddrreq	*lifr = (struct if_laddrreq *)data;
413 	int			 error = 0;
414 
415 	switch (cmd) {
416 	case SIOCSIFADDR:
417 		ifp->if_flags |= IFF_UP;
418 		/* FALLTHROUGH */
419 
420 	case SIOCSIFFLAGS:
421 		if (ifp->if_flags & IFF_UP) {
422 			ifp->if_flags |= IFF_RUNNING;
423 		} else {
424 			ifp->if_flags &= ~IFF_RUNNING;
425 		}
426 		break;
427 
428 	case SIOCADDMULTI:
429 	case SIOCDELMULTI:
430 		break;
431 
432 	case SIOCGIFMEDIA:
433 	case SIOCSIFMEDIA:
434 		error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd);
435 		break;
436 
437 	case SIOCSLIFPHYADDR:
438 		error = vxlan_config(ifp,
439 		    sstosa(&lifr->addr),
440 		    sstosa(&lifr->dstaddr));
441 		break;
442 
443 	case SIOCDIFPHYADDR:
444 		vxlan_multicast_cleanup(ifp);
445 		bzero(&sc->sc_src, sizeof(sc->sc_src));
446 		bzero(&sc->sc_dst, sizeof(sc->sc_dst));
447 		sc->sc_dstport = htons(VXLAN_PORT);
448 		break;
449 
450 	case SIOCGLIFPHYADDR:
451 		if (sc->sc_dst.ss_family == AF_UNSPEC) {
452 			error = EADDRNOTAVAIL;
453 			break;
454 		}
455 		bzero(&lifr->addr, sizeof(lifr->addr));
456 		bzero(&lifr->dstaddr, sizeof(lifr->dstaddr));
457 		memcpy(&lifr->addr, &sc->sc_src, sc->sc_src.ss_len);
458 		memcpy(&lifr->dstaddr, &sc->sc_dst, sc->sc_dst.ss_len);
459 		break;
460 
461 	case SIOCSLIFPHYRTABLE:
462 		if (ifr->ifr_rdomainid < 0 ||
463 		    ifr->ifr_rdomainid > RT_TABLEID_MAX ||
464 		    !rtable_exists(ifr->ifr_rdomainid)) {
465 			error = EINVAL;
466 			break;
467 		}
468 		sc->sc_rdomain = ifr->ifr_rdomainid;
469 		(void)vxlan_config(ifp, NULL, NULL);
470 		break;
471 
472 	case SIOCGLIFPHYRTABLE:
473 		ifr->ifr_rdomainid = sc->sc_rdomain;
474 		break;
475 
476 	case SIOCSLIFPHYTTL:
477 		if (ifr->ifr_ttl < 0 || ifr->ifr_ttl > 0xff) {
478 			error = EINVAL;
479 			break;
480 		}
481 		if (sc->sc_ttl == (u_int8_t)ifr->ifr_ttl)
482 			break;
483 		sc->sc_ttl = (u_int8_t)(ifr->ifr_ttl);
484 		(void)vxlan_config(ifp, NULL, NULL);
485 		break;
486 
487 	case SIOCGLIFPHYTTL:
488 		ifr->ifr_ttl = (int)sc->sc_ttl;
489 		break;
490 
491 	case SIOCSLIFPHYDF:
492 		/* commit */
493 		sc->sc_df = ifr->ifr_df ? htons(IP_DF) : htons(0);
494 		break;
495 	case SIOCGLIFPHYDF:
496 		ifr->ifr_df = sc->sc_df ? 1 : 0;
497 		break;
498 
499 	case SIOCSTXHPRIO:
500 		if (ifr->ifr_hdrprio == IF_HDRPRIO_PACKET)
501 			; /* fall through */
502 		else if (ifr->ifr_hdrprio < IF_HDRPRIO_MIN ||
503 		    ifr->ifr_hdrprio > IF_HDRPRIO_MAX) {
504 			error = EINVAL;
505 			break;
506 		}
507 
508 		sc->sc_txhprio = ifr->ifr_hdrprio;
509 		break;
510 	case SIOCGTXHPRIO:
511 		ifr->ifr_hdrprio = sc->sc_txhprio;
512 		break;
513 
514 	case SIOCSVNETID:
515 		if (sc->sc_vnetid == ifr->ifr_vnetid)
516 			break;
517 
518 		if ((ifr->ifr_vnetid != VXLAN_VNI_ANY) &&
519 		    (ifr->ifr_vnetid > VXLAN_VNI_MAX ||
520 		     ifr->ifr_vnetid < VXLAN_VNI_MIN)) {
521 			error = EINVAL;
522 			break;
523 		}
524 
525 		sc->sc_vnetid = (int)ifr->ifr_vnetid;
526 		(void)vxlan_config(ifp, NULL, NULL);
527 		break;
528 
529 	case SIOCGVNETID:
530 		if ((sc->sc_vnetid != VXLAN_VNI_ANY) &&
531 		    (sc->sc_vnetid > VXLAN_VNI_MAX ||
532 		     sc->sc_vnetid < VXLAN_VNI_MIN)) {
533 			error = EADDRNOTAVAIL;
534 			break;
535 		}
536 
537 		ifr->ifr_vnetid = sc->sc_vnetid;
538 		break;
539 
540 	case SIOCDVNETID:
541 		sc->sc_vnetid = VXLAN_VNI_UNSET;
542 		(void)vxlan_config(ifp, NULL, NULL);
543 		break;
544 
545 	default:
546 		error = ether_ioctl(ifp, &sc->sc_ac, cmd, data);
547 		break;
548 	}
549 
550 	return (error);
551 }
552 
/*
 * Media change callback registered with ifmedia_init().  There is
 * nothing to change, so every request succeeds.
 */
int
vxlan_media_change(struct ifnet *ifp)
{
	return 0;
}
558 
559 void
560 vxlan_media_status(struct ifnet *ifp, struct ifmediareq *imr)
561 {
562 	imr->ifm_status = IFM_AVALID | IFM_ACTIVE;
563 }
564 
565 int
566 vxlan_sockaddr_cmp(struct sockaddr *srcsa, struct sockaddr *dstsa)
567 {
568 	struct sockaddr_in	*src4, *dst4;
569 #ifdef INET6
570 	struct sockaddr_in6	*src6, *dst6;
571 #endif /* INET6 */
572 
573 	if (srcsa->sa_family != dstsa->sa_family)
574 		return (1);
575 
576 	switch (dstsa->sa_family) {
577 	case AF_INET:
578 		src4 = satosin(srcsa);
579 		dst4 = satosin(dstsa);
580 		if (src4->sin_addr.s_addr == dst4->sin_addr.s_addr)
581 			return (0);
582 		break;
583 #ifdef INET6
584 	case AF_INET6:
585 		src6 = satosin6(srcsa);
586 		dst6 = satosin6(dstsa);
587 		if (IN6_ARE_ADDR_EQUAL(&src6->sin6_addr, &dst6->sin6_addr) &&
588 		    src6->sin6_scope_id == dst6->sin6_scope_id)
589 			return (0);
590 		break;
591 #endif /* INET6 */
592 	}
593 
594 	return (1);
595 }
596 
597 uint16_t
598 vxlan_sockaddr_port(struct sockaddr *sa)
599 {
600 	struct sockaddr_in	*sin4;
601 #ifdef INET6
602 	struct sockaddr_in6	*sin6;
603 #endif /* INET6 */
604 
605 	switch (sa->sa_family) {
606 	case AF_INET:
607 		sin4 = satosin(sa);
608 		return (sin4->sin_port);
609 #ifdef INET6
610 	case AF_INET6:
611 		sin6 = satosin6(sa);
612 		return (sin6->sin6_port);
613 #endif /* INET6 */
614 	default:
615 		break;
616 	}
617 
618 	return (0);
619 }
620 
621 int
622 vxlan_lookup(struct mbuf *m, struct udphdr *uh, int iphlen,
623     struct sockaddr *srcsa, struct sockaddr *dstsa)
624 {
625 	struct vxlan_softc	*sc = NULL, *sc_cand = NULL;
626 	struct vxlan_header	 v;
627 	int			 vni;
628 	struct ifnet		*ifp;
629 	int			 skip;
630 #if NBRIDGE > 0
631 	struct bridge_tunneltag	*brtag;
632 #endif
633 	struct mbuf		*n;
634 	int			 off;
635 
636 	/* XXX Should verify the UDP port first before copying the packet */
637 	skip = iphlen + sizeof(*uh);
638 	if (m->m_pkthdr.len - skip < sizeof(v))
639 		return (0);
640 	m_copydata(m, skip, sizeof(v), (caddr_t)&v);
641 	skip += sizeof(v);
642 
643 	if (v.vxlan_flags & htonl(VXLAN_RESERVED1) ||
644 	    v.vxlan_id & htonl(VXLAN_RESERVED2))
645 		return (0);
646 
647 	vni = ntohl(v.vxlan_id) >> VXLAN_VNI_S;
648 	if ((v.vxlan_flags & htonl(VXLAN_FLAGS_VNI)) == 0) {
649 		if (vni != 0)
650 			return (0);
651 
652 		vni = VXLAN_VNI_UNSET;
653 	}
654 
655 	NET_ASSERT_LOCKED();
656 	/* First search for a vxlan(4) interface with the packet's VNI */
657 	LIST_FOREACH(sc, &vxlan_tagh[VXLAN_TAGHASH(vni)], sc_entry) {
658 		if ((uh->uh_dport == sc->sc_dstport) &&
659 		    vni == sc->sc_vnetid &&
660 		    sc->sc_rdomain == rtable_l2(m->m_pkthdr.ph_rtableid)) {
661 			sc_cand = sc;
662 			if (vxlan_sockaddr_cmp(srcsa, sstosa(&sc->sc_dst)) == 0)
663 				goto found;
664 		}
665 	}
666 
667 	/*
668 	 * Now loop through all the vxlan(4) interfaces that are configured
669 	 * to accept any VNI and operating in multipoint-to-multipoint mode
670 	 * that is used in combination with bridge(4) or switch(4).
671 	 * If a vxlan(4) interface has been found for the packet's VNI, this
672 	 * code is not reached as the other interface is more specific.
673 	 */
674 	LIST_FOREACH(sc, &vxlan_any, sc_entry) {
675 		if ((uh->uh_dport == sc->sc_dstport) &&
676 		    (sc->sc_rdomain == rtable_l2(m->m_pkthdr.ph_rtableid))) {
677 			sc_cand = sc;
678 			goto found;
679 		}
680 	}
681 
682 	if (sc_cand) {
683 		sc = sc_cand;
684 		goto found;
685 	}
686 
687 	/* not found */
688 	return (0);
689 
690  found:
691 	if (m->m_pkthdr.len < skip + sizeof(struct ether_header)) {
692 		m_freem(m);
693 		return (EINVAL);
694 	}
695 
696 	m_adj(m, skip);
697 	ifp = &sc->sc_ac.ac_if;
698 
699 #if NBRIDGE > 0
700 	/* Store the tunnel src/dst IP and vni for the bridge or switch */
701 	if ((ifp->if_bridgeidx != 0 || ifp->if_switchport != NULL) &&
702 	    srcsa->sa_family != AF_UNSPEC &&
703 	    ((brtag = bridge_tunneltag(m)) != NULL)) {
704 		memcpy(&brtag->brtag_peer.sa, srcsa, srcsa->sa_len);
705 		memcpy(&brtag->brtag_local.sa, dstsa, dstsa->sa_len);
706 		brtag->brtag_id = vni;
707 	}
708 #endif
709 
710 	m->m_flags &= ~(M_BCAST|M_MCAST);
711 
712 #if NPF > 0
713 	pf_pkt_addr_changed(m);
714 #endif
715 	if ((m->m_len < sizeof(struct ether_header)) &&
716 	    (m = m_pullup(m, sizeof(struct ether_header))) == NULL)
717 		return (ENOBUFS);
718 
719 	n = m_getptr(m, sizeof(struct ether_header), &off);
720 	if (n == NULL) {
721 		m_freem(m);
722 		return (EINVAL);
723 	}
724 	if (!ALIGNED_POINTER(mtod(n, caddr_t) + off, uint32_t)) {
725 		n = m_dup_pkt(m, ETHER_ALIGN, M_NOWAIT);
726 		/* Dispose of the original mbuf chain */
727 		m_freem(m);
728 		if (n == NULL)
729 			return (ENOBUFS);
730 		m = n;
731 	}
732 
733 	if_vinput(ifp, m);
734 
735 	/* success */
736 	return (1);
737 }
738 
739 struct mbuf *
740 vxlan_encap4(struct ifnet *ifp, struct mbuf *m,
741     struct sockaddr *src, struct sockaddr *dst)
742 {
743 	struct vxlan_softc	*sc = (struct vxlan_softc *)ifp->if_softc;
744 	struct ip		*ip;
745 
746 	/*
747 	 * Remove multicast and broadcast flags or encapsulated packet
748 	 * ends up as multicast or broadcast packet.
749 	 */
750 	m->m_flags &= ~(M_BCAST|M_MCAST);
751 
752 	M_PREPEND(m, sizeof(*ip), M_DONTWAIT);
753 	if (m == NULL)
754 		return (NULL);
755 
756 	ip = mtod(m, struct ip *);
757 	ip->ip_v = IPVERSION;
758 	ip->ip_hl = sizeof(struct ip) >> 2;
759 	ip->ip_id = htons(ip_randomid());
760 	ip->ip_off = sc->sc_df;
761 	ip->ip_p = IPPROTO_UDP;
762 	ip->ip_tos = IFQ_PRIO2TOS(sc->sc_txhprio == IF_HDRPRIO_PACKET ?
763 	    m->m_pkthdr.pf.prio : sc->sc_txhprio);
764 	ip->ip_len = htons(m->m_pkthdr.len);
765 
766 	ip->ip_src = satosin(src)->sin_addr;
767 	ip->ip_dst = satosin(dst)->sin_addr;
768 
769 	if (sc->sc_ttl > 0)
770 		ip->ip_ttl = sc->sc_ttl;
771 	else
772 		ip->ip_ttl = IPDEFTTL;
773 
774 	return (m);
775 }
776 
777 #ifdef INET6
778 struct mbuf *
779 vxlan_encap6(struct ifnet *ifp, struct mbuf *m,
780     struct sockaddr *src, struct sockaddr *dst)
781 {
782 	struct vxlan_softc	*sc = (struct vxlan_softc *)ifp->if_softc;
783 	struct ip6_hdr		*ip6;
784 	struct in6_addr		*in6a;
785 	uint32_t		 flow;
786 
787 	/*
788 	 * Remove multicast and broadcast flags or encapsulated packet
789 	 * ends up as multicast or broadcast packet.
790 	 */
791 	m->m_flags &= ~(M_BCAST|M_MCAST);
792 
793 	M_PREPEND(m, sizeof(struct ip6_hdr), M_DONTWAIT);
794 	if (m == NULL)
795 		return (NULL);
796 
797 	flow = (uint32_t)IFQ_PRIO2TOS(sc->sc_txhprio == IF_HDRPRIO_PACKET ?
798 	    m->m_pkthdr.pf.prio : sc->sc_txhprio) << 20;
799 
800 	ip6 = mtod(m, struct ip6_hdr *);
801 	ip6->ip6_flow = htonl(flow);
802 	ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
803 	ip6->ip6_vfc |= IPV6_VERSION;
804 	ip6->ip6_nxt = IPPROTO_UDP;
805 	ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(struct ip6_hdr));
806 	if (in6_embedscope(&ip6->ip6_src, satosin6(src), NULL) != 0)
807 		goto drop;
808 	if (in6_embedscope(&ip6->ip6_dst, satosin6(dst), NULL) != 0)
809 		goto drop;
810 
811 	if (sc->sc_ttl > 0)
812 		ip6->ip6_hlim = sc->sc_ttl;
813 	else
814 		ip6->ip6_hlim = ip6_defhlim;
815 
816 	if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(src)->sin6_addr)) {
817 		if (in6_selectsrc(&in6a, satosin6(dst), NULL,
818 		    sc->sc_rdomain) != 0)
819 			goto drop;
820 
821 		ip6->ip6_src = *in6a;
822 	}
823 
824 	if (sc->sc_df)
825 		SET(m->m_pkthdr.csum_flags, M_IPV6_DF_OUT);
826 
827 	/*
828 	 * The UDP checksum of VXLAN packets should be set to zero,
829 	 * but the IPv6 UDP checksum is not optional.  There is an RFC 6539
830 	 * to relax the IPv6 UDP checksum requirement for tunnels, but it
831 	 * is currently not supported by most implementations.
832 	 */
833 	m->m_pkthdr.csum_flags |= M_UDP_CSUM_OUT;
834 
835 	return (m);
836 
837 drop:
838 	m_freem(m);
839 	return (NULL);
840 }
841 #endif /* INET6 */
842 
843 int
844 vxlan_output(struct ifnet *ifp, struct mbuf *m)
845 {
846 	struct vxlan_softc	*sc = (struct vxlan_softc *)ifp->if_softc;
847 	struct vxlanudphdr	*vu;
848 	struct sockaddr		*src, *dst;
849 #if NBRIDGE > 0
850 	struct bridge_tunneltag	*brtag;
851 #endif
852 	int			 error, af;
853 	uint32_t		 tag;
854 	struct mbuf		*m0;
855 
856 	/* VXLAN header, needs new mbuf because of alignment issues */
857 	MGET(m0, M_DONTWAIT, m->m_type);
858 	if (m0 == NULL) {
859 		ifp->if_oerrors++;
860 		return (ENOBUFS);
861 	}
862 	M_MOVE_PKTHDR(m0, m);
863 	m0->m_next = m;
864 	m = m0;
865 	m_align(m, sizeof(*vu));
866 	m->m_len = sizeof(*vu);
867 	m->m_pkthdr.len += sizeof(*vu);
868 
869 	src = sstosa(&sc->sc_src);
870 	dst = sstosa(&sc->sc_dst);
871 	af = src->sa_family;
872 
873 	vu = mtod(m, struct vxlanudphdr *);
874 	vu->vu_u.uh_sport = sc->sc_dstport;
875 	vu->vu_u.uh_dport = sc->sc_dstport;
876 	vu->vu_u.uh_ulen = htons(m->m_pkthdr.len);
877 	vu->vu_u.uh_sum = 0;
878 	tag = sc->sc_vnetid;
879 
880 #if NBRIDGE > 0
881 	if ((brtag = bridge_tunnel(m)) != NULL) {
882 		dst = &brtag->brtag_peer.sa;
883 
884 		/* If accepting any VNI, source ip address is from brtag */
885 		if (sc->sc_vnetid == VXLAN_VNI_ANY) {
886 			src = &brtag->brtag_local.sa;
887 			tag = (uint32_t)brtag->brtag_id;
888 			af = src->sa_family;
889 		}
890 
891 		if (dst->sa_family != af) {
892 			ifp->if_oerrors++;
893 			m_freem(m);
894 			return (EINVAL);
895 		}
896 	} else
897 #endif
898 	if (sc->sc_vnetid == VXLAN_VNI_ANY) {
899 		/*
900 		 * If accepting any VNI, build the vxlan header only by
901 		 * bridge_tunneltag or drop packet if the tag does not exist.
902 		 */
903 		ifp->if_oerrors++;
904 		m_freem(m);
905 		return (ENETUNREACH);
906 	}
907 
908 	if (sc->sc_vnetid != VXLAN_VNI_UNSET) {
909 		vu->vu_v.vxlan_flags = htonl(VXLAN_FLAGS_VNI);
910 		vu->vu_v.vxlan_id = htonl(tag << VXLAN_VNI_S);
911 	} else {
912 		vu->vu_v.vxlan_flags = htonl(0);
913 		vu->vu_v.vxlan_id = htonl(0);
914 	}
915 
916 	switch (af) {
917 	case AF_INET:
918 		m = vxlan_encap4(ifp, m, src, dst);
919 		break;
920 #ifdef INET6
921 	case AF_INET6:
922 		m = vxlan_encap6(ifp, m, src, dst);
923 		break;
924 #endif /* INET6 */
925 	default:
926 		m_freem(m);
927 		m = NULL;
928 	}
929 
930 	if (m == NULL) {
931 		ifp->if_oerrors++;
932 		return (ENOBUFS);
933 	}
934 
935 #if NBRIDGE > 0
936 	if (brtag != NULL)
937 		bridge_tunneluntag(m);
938 #endif
939 
940 	m->m_pkthdr.ph_rtableid = sc->sc_rdomain;
941 
942 #if NPF > 0
943 	pf_pkt_addr_changed(m);
944 #endif
945 
946 	switch (af) {
947 	case AF_INET:
948 		error = ip_output(m, NULL, NULL, IP_RAWOUTPUT,
949 		    &sc->sc_imo, NULL, 0);
950 		break;
951 #ifdef INET6
952 	case AF_INET6:
953 		error = ip6_output(m, 0, NULL, IPV6_MINMTU, 0, NULL);
954 		break;
955 #endif /* INET6 */
956 	default:
957 		m_freem(m);
958 		error = EAFNOSUPPORT;
959 	}
960 
961 	if (error)
962 		ifp->if_oerrors++;
963 
964 	return (error);
965 }
966 
967 void
968 vxlan_addr_change(void *arg)
969 {
970 	struct vxlan_softc	*sc = arg;
971 	struct ifnet		*ifp = &sc->sc_ac.ac_if;
972 	int			 error;
973 
974 	/*
975 	 * Reset the configuration after resume or any possible address
976 	 * configuration changes.
977 	 */
978 	if ((error = vxlan_config(ifp, NULL, NULL))) {
979 		/*
980 		 * The source address of the tunnel can temporarily disappear,
981 		 * after a link state change when running the DHCP client,
982 		 * so keep it configured.
983 		 */
984 	}
985 }
986 
987 void
988 vxlan_if_change(void *arg)
989 {
990 	struct vxlan_softc	*sc = arg;
991 	struct ifnet		*ifp = &sc->sc_ac.ac_if;
992 
993 	/*
994 	 * Reset the configuration after the parent interface disappeared.
995 	 */
996 	vxlan_multicast_cleanup(ifp);
997 	memset(&sc->sc_src, 0, sizeof(sc->sc_src));
998 	memset(&sc->sc_dst, 0, sizeof(sc->sc_dst));
999 	sc->sc_dstport = htons(VXLAN_PORT);
1000 }
1001 
1002 void
1003 vxlan_link_change(void *arg)
1004 {
1005 	struct vxlan_softc	*sc = arg;
1006 	struct ifnet		*ifp = &sc->sc_ac.ac_if;
1007 
1008 	/*
1009 	 * The machine might have lost its multicast associations after
1010 	 * link state changes.  This fixes a problem with VMware after
1011 	 * suspend/resume of the host or guest.
1012 	 */
1013 	(void)vxlan_config(ifp, NULL, NULL);
1014 }
1015