xref: /netbsd-src/sys/net/if_ipsec.c (revision deb6f0161a9109e7de9b519dc8dfb9478668dcdd)
1 /*	$NetBSD: if_ipsec.c,v 1.19 2018/12/07 05:09:39 knakahara Exp $  */
2 
3 /*
4  * Copyright (c) 2017 Internet Initiative Japan Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26  * POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __KERNEL_RCSID(0, "$NetBSD: if_ipsec.c,v 1.19 2018/12/07 05:09:39 knakahara Exp $");
31 
32 #ifdef _KERNEL_OPT
33 #include "opt_inet.h"
34 #endif
35 
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/kernel.h>
39 #include <sys/mbuf.h>
40 #include <sys/socket.h>
41 #include <sys/sockio.h>
42 #include <sys/errno.h>
43 #include <sys/ioctl.h>
44 #include <sys/time.h>
45 #include <sys/syslog.h>
46 #include <sys/cpu.h>
47 #include <sys/kmem.h>
48 #include <sys/mutex.h>
49 #include <sys/pserialize.h>
50 #include <sys/psref.h>
51 
52 #include <net/if.h>
53 #include <net/if_types.h>
54 #include <net/route.h>
55 #include <net/bpf.h>
56 #include <net/pfkeyv2.h>
57 
58 #include <netinet/in.h>
59 #include <netinet/in_systm.h>
60 #include <netinet/ip.h>
61 #ifdef	INET
62 #include <netinet/in_var.h>
63 #endif	/* INET */
64 
65 #ifdef INET6
66 #include <netinet6/in6_var.h>
67 #include <netinet/ip6.h>
68 #include <netinet6/ip6_var.h>
69 #endif /* INET6 */
70 
71 #include <netinet/ip_encap.h>
72 
73 #include <net/if_ipsec.h>
74 
75 #include <net/raw_cb.h>
76 #include <net/pfkeyv2.h>
77 
78 #include <netipsec/key.h>
79 #include <netipsec/keydb.h> /* for union sockaddr_union */
80 #include <netipsec/ipsec.h>
81 #include <netipsec/ipsecif.h>
82 
83 static void if_ipsec_ro_init_pc(void *, void *, struct cpu_info *);
84 static void if_ipsec_ro_fini_pc(void *, void *, struct cpu_info *);
85 
86 static int if_ipsec_clone_create(struct if_clone *, int);
87 static int if_ipsec_clone_destroy(struct ifnet *);
88 
89 static inline int if_ipsec_out_direct(struct ipsec_variant *, struct mbuf *, int);
90 static inline void if_ipsec_in_enqueue(struct mbuf *, int, struct ifnet *);
91 
92 static int if_ipsec_encap_attach(struct ipsec_variant *);
93 static int if_ipsec_encap_detach(struct ipsec_variant *);
94 static int if_ipsec_set_tunnel(struct ifnet *,
95     struct sockaddr *, struct sockaddr *);
96 static void if_ipsec_delete_tunnel(struct ifnet *);
97 static int if_ipsec_ensure_flags(struct ifnet *, short);
98 static void if_ipsec_attach0(struct ipsec_softc *);
99 
100 static int if_ipsec_update_variant(struct ipsec_softc *,
101     struct ipsec_variant *, struct ipsec_variant *);
102 
103 /* sadb_msg */
104 static inline void if_ipsec_add_mbuf(struct mbuf *, void *, size_t);
105 static inline void if_ipsec_add_pad(struct mbuf *, size_t);
106 static inline size_t if_ipsec_set_sadb_addr(struct sadb_address *,
107     struct sockaddr *, int, uint16_t);
108 static inline size_t if_ipsec_set_sadb_src(struct sadb_address *,
109     struct sockaddr *, int);
110 static inline size_t if_ipsec_set_sadb_dst(struct sadb_address *,
111     struct sockaddr *, int);
112 static inline size_t if_ipsec_set_sadb_x_policy(struct sadb_x_policy *,
113     struct sadb_x_ipsecrequest *, uint16_t, uint8_t, uint32_t, uint8_t,
114     struct sockaddr *, struct sockaddr *);
115 static inline void if_ipsec_set_sadb_msg(struct sadb_msg *, uint16_t, uint8_t);
116 static inline void if_ipsec_set_sadb_msg_add(struct sadb_msg *, uint16_t);
117 static inline void if_ipsec_set_sadb_msg_del(struct sadb_msg *, uint16_t);
118 /* SPD */
119 static int if_ipsec_share_sp(struct ipsec_variant *);
120 static int if_ipsec_unshare_sp(struct ipsec_variant *);
121 static inline struct secpolicy *if_ipsec_add_sp0(struct sockaddr *,
122     in_port_t, struct sockaddr *, in_port_t, int, int, int, u_int);
123 static inline int if_ipsec_del_sp0(struct secpolicy *);
124 static int if_ipsec_add_sp(struct ipsec_variant *,
125     struct sockaddr *, in_port_t, struct sockaddr *, in_port_t);
126 static void if_ipsec_del_sp(struct ipsec_variant *);
127 static int if_ipsec_replace_sp(struct ipsec_softc *, struct ipsec_variant *,
128     struct ipsec_variant *);
129 
/*
 * Helper defined elsewhere in this translation unit.  The two wrapper
 * macros below call it with a caller-supplied target sockaddr plus one
 * endpoint of an ipsec_variant; its int result is used by callers as an
 * errno-style error code.
 */
static int if_ipsec_set_addr_port(struct sockaddr *, struct sockaddr *,
    in_port_t);
/* Pass var's outer source address (iv_psrc) and port (iv_sport) with target. */
#define IF_IPSEC_GATHER_PSRC_ADDR_PORT(var, target)			\
	if_ipsec_set_addr_port(target, (var)->iv_psrc, (var)->iv_sport)
/* Pass var's outer destination address (iv_pdst) and port (iv_dport) with target. */
#define IF_IPSEC_GATHER_PDST_ADDR_PORT(var, target)			\
	if_ipsec_set_addr_port(target, (var)->iv_pdst, (var)->iv_dport)
136 
137 /*
138  * ipsec global variable definitions
139  */
140 
/*
 * All ipsec(4) softcs, linked through their ipsec_list member.  Entries
 * are inserted by if_ipsec_clone_create(), removed by
 * if_ipsec_clone_destroy() and walked by if_ipsec_set_tunnel(), each time
 * with ipsec_softcs.lock held.
 * This list is used in ioctl context only.
 */
LIST_HEAD(ipsec_sclist, ipsec_softc);
static struct {
	struct ipsec_sclist list;	/* head of the softc list */
	kmutex_t lock;			/* serializes access to list */
} ipsec_softcs __cacheline_aligned;

/* psref class used for every ipsec_variant; created in ipsecifattach(). */
struct psref_class *iv_psref_class __read_mostly;

/* Cloner for "ipsec" interfaces; attached in ipsecifattach(). */
struct if_clone ipsec_cloner =
    IF_CLONE_INITIALIZER("ipsec", if_ipsec_clone_create, if_ipsec_clone_destroy);
/* Limit handed to if_tunnel_check_nesting() by if_ipsec_check_nesting(). */
static int max_ipsec_nesting = MAX_IPSEC_NEST;
153 
154 /* ARGSUSED */
155 void
156 ipsecifattach(int count)
157 {
158 
159 	mutex_init(&ipsec_softcs.lock, MUTEX_DEFAULT, IPL_NONE);
160 	LIST_INIT(&ipsec_softcs.list);
161 
162 	iv_psref_class = psref_class_create("ipsecvar", IPL_SOFTNET);
163 
164 	if_clone_attach(&ipsec_cloner);
165 }
166 
167 static int
168 if_ipsec_clone_create(struct if_clone *ifc, int unit)
169 {
170 	struct ipsec_softc *sc;
171 	struct ipsec_variant *var;
172 
173 	sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
174 
175 	if_initname(&sc->ipsec_if, ifc->ifc_name, unit);
176 
177 	if_ipsec_attach0(sc);
178 
179 	var = kmem_zalloc(sizeof(*var), KM_SLEEP);
180 	var->iv_softc = sc;
181 	psref_target_init(&var->iv_psref, iv_psref_class);
182 
183 	sc->ipsec_var = var;
184 	mutex_init(&sc->ipsec_lock, MUTEX_DEFAULT, IPL_NONE);
185 	sc->ipsec_psz = pserialize_create();
186 	sc->ipsec_ro_percpu = percpu_alloc(sizeof(struct ipsec_ro));
187 	percpu_foreach(sc->ipsec_ro_percpu, if_ipsec_ro_init_pc, NULL);
188 
189 	mutex_enter(&ipsec_softcs.lock);
190 	LIST_INSERT_HEAD(&ipsec_softcs.list, sc, ipsec_list);
191 	mutex_exit(&ipsec_softcs.lock);
192 	return 0;
193 }
194 
195 static void
196 if_ipsec_attach0(struct ipsec_softc *sc)
197 {
198 
199 	sc->ipsec_if.if_addrlen = 0;
200 	sc->ipsec_if.if_mtu    = IPSEC_MTU;
201 	sc->ipsec_if.if_flags  = IFF_POINTOPOINT | IFF_MULTICAST;
202 	/* set ipsec(4) specific default flags. */
203 	sc->ipsec_if.if_flags  |= IFF_FWD_IPV6;
204 	sc->ipsec_if.if_extflags = IFEF_NO_LINK_STATE_CHANGE | IFEF_MPSAFE;
205 	sc->ipsec_if.if_ioctl  = if_ipsec_ioctl;
206 	sc->ipsec_if.if_output = if_ipsec_output;
207 	sc->ipsec_if.if_type   = IFT_IPSEC;
208 	sc->ipsec_if.if_dlt    = DLT_NULL;
209 	sc->ipsec_if.if_softc  = sc;
210 	IFQ_SET_READY(&sc->ipsec_if.if_snd);
211 	if_initialize(&sc->ipsec_if);
212 	if_alloc_sadl(&sc->ipsec_if);
213 	bpf_attach(&sc->ipsec_if, DLT_NULL, sizeof(u_int));
214 	if_register(&sc->ipsec_if);
215 }
216 
217 static void
218 if_ipsec_ro_init_pc(void *p, void *arg __unused, struct cpu_info *ci __unused)
219 {
220 	struct ipsec_ro *iro = p;
221 
222 	iro->ir_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
223 }
224 
225 static void
226 if_ipsec_ro_fini_pc(void *p, void *arg __unused, struct cpu_info *ci __unused)
227 {
228 	struct ipsec_ro *iro = p;
229 
230 	rtcache_free(&iro->ir_ro);
231 
232 	mutex_obj_free(iro->ir_lock);
233 }
234 
235 static int
236 if_ipsec_clone_destroy(struct ifnet *ifp)
237 {
238 	struct ipsec_softc *sc = ifp->if_softc;
239 	struct ipsec_variant *var;
240 	int bound;
241 
242 	mutex_enter(&ipsec_softcs.lock);
243 	LIST_REMOVE(sc, ipsec_list);
244 	mutex_exit(&ipsec_softcs.lock);
245 
246 	bound = curlwp_bind();
247 	if_ipsec_delete_tunnel(&sc->ipsec_if);
248 	curlwp_bindx(bound);
249 
250 	bpf_detach(ifp);
251 	if_detach(ifp);
252 
253 	percpu_foreach(sc->ipsec_ro_percpu, if_ipsec_ro_fini_pc, NULL);
254 	percpu_free(sc->ipsec_ro_percpu, sizeof(struct ipsec_ro));
255 
256 	pserialize_destroy(sc->ipsec_psz);
257 	mutex_destroy(&sc->ipsec_lock);
258 
259 	var = sc->ipsec_var;
260 	kmem_free(var, sizeof(*var));
261 	kmem_free(sc, sizeof(*sc));
262 
263 	return 0;
264 }
265 
266 static inline bool
267 if_ipsec_nat_t(struct ipsec_softc *sc)
268 {
269 
270 	return (sc->ipsec_if.if_flags & IFF_NAT_T) != 0;
271 }
272 
273 static inline bool
274 if_ipsec_fwd_ipv6(struct ipsec_softc *sc)
275 {
276 
277 	return (sc->ipsec_if.if_flags & IFF_FWD_IPV6) != 0;
278 }
279 
280 int
281 if_ipsec_encap_func(struct mbuf *m, int off, int proto, void *arg)
282 {
283 	uint8_t v;
284 	struct ipsec_softc *sc;
285 	struct ipsec_variant *var = NULL;
286 	struct psref psref;
287 	int ret = 0;
288 
289 	sc = arg;
290 	KASSERT(sc != NULL);
291 
292 	if ((sc->ipsec_if.if_flags & IFF_UP) == 0)
293 		goto out;
294 
295 	var = if_ipsec_getref_variant(sc, &psref);
296 	if (if_ipsec_variant_is_unconfigured(var))
297 		goto out;
298 
299 	switch (proto) {
300 	case IPPROTO_IPV4:
301 	case IPPROTO_IPV6:
302 		break;
303 	default:
304 		goto out;
305 	}
306 
307 	m_copydata(m, 0, sizeof(v), &v);
308 	v = (v >> 4) & 0xff;  /* Get the IP version number. */
309 
310 	switch (v) {
311 #ifdef INET
312 	case IPVERSION: {
313 		struct ip ip;
314 
315 		if (m->m_pkthdr.len < sizeof(ip))
316 			goto out;
317 
318 		m_copydata(m, 0, sizeof(ip), &ip);
319 		if (var->iv_psrc->sa_family != AF_INET ||
320 		    var->iv_pdst->sa_family != AF_INET)
321 			goto out;
322 		ret = ipsecif4_encap_func(m, &ip, var);
323 		break;
324 	}
325 #endif
326 #ifdef INET6
327 	case (IPV6_VERSION >> 4): {
328 		struct ip6_hdr ip6;
329 
330 		if (m->m_pkthdr.len < sizeof(ip6))
331 			goto out;
332 
333 		m_copydata(m, 0, sizeof(ip6), &ip6);
334 		if (var->iv_psrc->sa_family != AF_INET6 ||
335 		    var->iv_pdst->sa_family != AF_INET6)
336 			goto out;
337 		ret = ipsecif6_encap_func(m, &ip6, var);
338 		break;
339 	}
340 #endif
341 	default:
342 		goto out;
343 	}
344 
345 out:
346 	if (var != NULL)
347 		if_ipsec_putref_variant(var, &psref);
348 	return ret;
349 }
350 
351 /*
352  * ipsec(4) I/F may cause infinite recursion calls when misconfigured.
353  * We'll prevent this by introducing upper limit.
354  */
355 static int
356 if_ipsec_check_nesting(struct ifnet *ifp, struct mbuf *m)
357 {
358 
359 	return if_tunnel_check_nesting(ifp, m, max_ipsec_nesting);
360 }
361 
362 int
363 if_ipsec_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
364     const struct rtentry *rt)
365 {
366 	struct ipsec_softc *sc = ifp->if_softc;
367 	struct ipsec_variant *var;
368 	struct psref psref;
369 	int error;
370 	int bound;
371 
372 	IFQ_CLASSIFY(&ifp->if_snd, m, dst->sa_family);
373 
374 	error = if_ipsec_check_nesting(ifp, m);
375 	if (error) {
376 		m_freem(m);
377 		goto noref_end;
378 	}
379 
380 	if ((ifp->if_flags & IFF_UP) == 0) {
381 		m_freem(m);
382 		error = ENETDOWN;
383 		goto noref_end;
384 	}
385 
386 
387 	bound = curlwp_bind();
388 	var = if_ipsec_getref_variant(sc, &psref);
389 	if (if_ipsec_variant_is_unconfigured(var)) {
390 		m_freem(m);
391 		error = ENETDOWN;
392 		goto end;
393 	}
394 
395 	m->m_flags &= ~(M_BCAST|M_MCAST);
396 
397 	/* use DLT_NULL encapsulation here to pass inner af type */
398 	M_PREPEND(m, sizeof(int), M_DONTWAIT);
399 	if (!m) {
400 		error = ENOBUFS;
401 		goto end;
402 	}
403 	*mtod(m, int *) = dst->sa_family;
404 
405 #if INET6
406 	/* drop IPv6 packet if IFF_FWD_IPV6 is not set */
407 	if (dst->sa_family == AF_INET6 &&
408 	    !if_ipsec_fwd_ipv6(sc)) {
409 		/*
410 		 * IPv6 packet is not allowed to forward,that is not error.
411 		 */
412 		error = 0;
413 		IF_DROP(&ifp->if_snd);
414 		m_freem(m);
415 		goto end;
416 	}
417 #endif
418 
419 	error = if_ipsec_out_direct(var, m, dst->sa_family);
420 
421 end:
422 	if_ipsec_putref_variant(var, &psref);
423 	curlwp_bindx(bound);
424 noref_end:
425 	if (error)
426 		ifp->if_oerrors++;
427 
428 	return error;
429 }
430 
431 static inline int
432 if_ipsec_out_direct(struct ipsec_variant *var, struct mbuf *m, int family)
433 {
434 	struct ifnet *ifp = &var->iv_softc->ipsec_if;
435 	int error;
436 	int len;
437 
438 	KASSERT(if_ipsec_heldref_variant(var));
439 	KASSERT(var->iv_output != NULL);
440 
441 	len = m->m_pkthdr.len;
442 
443 	/* input DLT_NULL frame to BPF */
444 	bpf_mtap(ifp, m, BPF_D_OUT);
445 
446 	/* grab and chop off inner af type */
447 	/* XXX need pullup? */
448 	m_adj(m, sizeof(int));
449 
450 	error = var->iv_output(var, family, m);
451 	if (error)
452 		return error;
453 
454 	ifp->if_opackets++;
455 	ifp->if_obytes += len;
456 
457 	return 0;
458 }
459 
460 void
461 if_ipsec_input(struct mbuf *m, int af, struct ifnet *ifp)
462 {
463 
464 	KASSERT(ifp != NULL);
465 
466 	m_set_rcvif(m, ifp);
467 
468 	bpf_mtap_af(ifp, af, m, BPF_D_IN);
469 
470 	if_ipsec_in_enqueue(m, af, ifp);
471 
472 	return;
473 }
474 
475 static inline void
476 if_ipsec_in_enqueue(struct mbuf *m, int af, struct ifnet *ifp)
477 {
478 	pktqueue_t *pktq;
479 	int pktlen;
480 
481 	/*
482 	 * Put the packet to the network layer input queue according to the
483 	 * specified address family.
484 	 */
485 	switch (af) {
486 #ifdef INET
487 	case AF_INET:
488 		pktq = ip_pktq;
489 		break;
490 #endif
491 #ifdef INET6
492 	case AF_INET6:
493 		pktq = ip6_pktq;
494 		break;
495 #endif
496 	default:
497 		ifp->if_ierrors++;
498 		m_freem(m);
499 		return;
500 	}
501 
502 #if 1
503 	const u_int h = curcpu()->ci_index;
504 #else
505 	const uint32_t h = pktq_rps_hash(m);
506 #endif
507 	pktlen = m->m_pkthdr.len;
508 	if (__predict_true(pktq_enqueue(pktq, m, h))) {
509 		ifp->if_ibytes += pktlen;
510 		ifp->if_ipackets++;
511 	} else {
512 		ifp->if_iqdrops++;
513 		m_freem(m);
514 	}
515 
516 	return;
517 }
518 
519 static inline int
520 if_ipsec_check_salen(struct sockaddr *addr)
521 {
522 
523 	switch (addr->sa_family) {
524 #ifdef INET
525 	case AF_INET:
526 		if (addr->sa_len != sizeof(struct sockaddr_in))
527 			return EINVAL;
528 		break;
529 #endif /* INET */
530 #ifdef INET6
531 	case AF_INET6:
532 		if (addr->sa_len != sizeof(struct sockaddr_in6))
533 			return EINVAL;
534 		break;
535 #endif /* INET6 */
536 	default:
537 		return EAFNOSUPPORT;
538 	}
539 
540 	return 0;
541 }
542 
/* XXX how should we handle IPv6 scope on SIOC[GS]IFPHYADDR? */
/*
 * ioctl handler for ipsec(4) interfaces.
 *
 * Handles address initialization, multicast membership checks, MTU
 * changes, and setting, deleting and reading the outer (physical) tunnel
 * endpoint addresses.  Everything else is passed to ifioctl_common(),
 * after which the SPs are brought in line with any IFF_NAT_T /
 * IFF_FWD_IPV6 change via if_ipsec_ensure_flags().
 *
 * Returns 0 on success or an errno value.  Paths that have bound the LWP
 * (and possibly hold a psref on the current variant) leave through the
 * "bad" label on error so that both are released.
 */
int
if_ipsec_ioctl(struct ifnet *ifp, u_long cmd, void *data)
{
	struct ipsec_softc *sc  = ifp->if_softc;
	struct ipsec_variant *var = NULL;
	struct ifreq     *ifr = (struct ifreq*)data;
	struct ifaddr    *ifa = (struct ifaddr*)data;
	int error = 0, size;
	struct sockaddr *dst, *src;
	u_long mtu;
	/* flags before this ioctl; compared afterwards in the default case */
	short oflags = ifp->if_flags;
	int bound;
	struct psref psref;

	switch (cmd) {
	case SIOCINITIFADDR:
		/* Assigning an address brings the interface up. */
		ifp->if_flags |= IFF_UP;
		ifa->ifa_rtrequest = p2p_rtrequest;
		break;

	case SIOCSIFDSTADDR:
		break;

	case SIOCADDMULTI:
	case SIOCDELMULTI:
		/* Only check the address family; nothing else is done here. */
		switch (ifr->ifr_addr.sa_family) {
#ifdef INET
		case AF_INET:	/* IP supports Multicast */
			break;
#endif /* INET */
#ifdef INET6
		case AF_INET6:	/* IP6 supports Multicast */
			break;
#endif /* INET6 */
		default:  /* Other protocols do not support Multicast */
			error = EAFNOSUPPORT;
			break;
		}
		break;

	case SIOCSIFMTU:
		mtu = ifr->ifr_mtu;
		if (mtu < IPSEC_MTU_MIN || mtu > IPSEC_MTU_MAX)
			return EINVAL;
		/* ENETRESET from the common code is not reported as an error. */
		else if ((error = ifioctl_common(ifp, cmd, data)) == ENETRESET)
			error = 0;
		break;

#ifdef INET
	case SIOCSIFPHYADDR:
#endif
#ifdef INET6
	case SIOCSIFPHYADDR_IN6:
#endif /* INET6 */
	case SIOCSLIFPHYADDR:
		/* Locate src/dst inside the request structure of this cmd. */
		switch (cmd) {
#ifdef INET
		case SIOCSIFPHYADDR:
			src = (struct sockaddr *)
				&(((struct in_aliasreq *)data)->ifra_addr);
			dst = (struct sockaddr *)
				&(((struct in_aliasreq *)data)->ifra_dstaddr);
			break;
#endif /* INET */
#ifdef INET6
		case SIOCSIFPHYADDR_IN6:
			src = (struct sockaddr *)
				&(((struct in6_aliasreq *)data)->ifra_addr);
			dst = (struct sockaddr *)
				&(((struct in6_aliasreq *)data)->ifra_dstaddr);
			break;
#endif /* INET6 */
		case SIOCSLIFPHYADDR:
			src = (struct sockaddr *)
				&(((struct if_laddrreq *)data)->addr);
			dst = (struct sockaddr *)
				&(((struct if_laddrreq *)data)->dstaddr);
			break;
		default:
			return EINVAL;
		}

		/* sa_family must be equal */
		if (src->sa_family != dst->sa_family)
			return EINVAL;

		error = if_ipsec_check_salen(src);
		if (error)
			return error;
		error = if_ipsec_check_salen(dst);
		if (error)
			return error;

		/* check sa_family looks sane for the cmd */
		switch (cmd) {
#ifdef INET
		case SIOCSIFPHYADDR:
			if (src->sa_family == AF_INET)
				break;
			return EAFNOSUPPORT;
#endif /* INET */
#ifdef INET6
		case SIOCSIFPHYADDR_IN6:
			if (src->sa_family == AF_INET6)
				break;
			return EAFNOSUPPORT;
#endif /* INET6 */
		case SIOCSLIFPHYADDR:
			/* checks done in the above */
			break;
		}
		/*
		 * if_ipsec_set_tunnel() calls if_ipsec_getref_variant() on
		 * the other softcs to check for address pair duplication,
		 * so bind the LWP around it.
		 */
		bound = curlwp_bind();
		error = if_ipsec_set_tunnel(&sc->ipsec_if, src, dst);
		if (error)
			goto bad;
		curlwp_bindx(bound);
		break;

	case SIOCDIFPHYADDR:
		bound = curlwp_bind();
		if_ipsec_delete_tunnel(&sc->ipsec_if);
		curlwp_bindx(bound);
		break;

	case SIOCGIFPSRCADDR:
#ifdef INET6
	case SIOCGIFPSRCADDR_IN6:
#endif /* INET6 */
		/* Report the outer source address of the tunnel. */
		bound = curlwp_bind();
		var = if_ipsec_getref_variant(sc, &psref);
		if (var->iv_psrc == NULL) {
			error = EADDRNOTAVAIL;
			goto bad;
		}
		src = var->iv_psrc;
		/* dst/size: where in the request the answer goes, and its room */
		switch (cmd) {
#ifdef INET
		case SIOCGIFPSRCADDR:
			dst = &ifr->ifr_addr;
			size = sizeof(ifr->ifr_addr);
			break;
#endif /* INET */
#ifdef INET6
		case SIOCGIFPSRCADDR_IN6:
			dst = (struct sockaddr *)
				&(((struct in6_ifreq *)data)->ifr_addr);
			size = sizeof(((struct in6_ifreq *)data)->ifr_addr);
			break;
#endif /* INET6 */
		default:
			error = EADDRNOTAVAIL;
			goto bad;
		}
		/* The stored address must fit into the request field. */
		if (src->sa_len > size) {
			error = EINVAL;
			goto bad;
		}
		error = IF_IPSEC_GATHER_PSRC_ADDR_PORT(var, dst);
		if (error)
			goto bad;
		if_ipsec_putref_variant(var, &psref);
		curlwp_bindx(bound);
		break;

	case SIOCGIFPDSTADDR:
#ifdef INET6
	case SIOCGIFPDSTADDR_IN6:
#endif /* INET6 */
		/* Report the outer destination address of the tunnel. */
		bound = curlwp_bind();
		var = if_ipsec_getref_variant(sc, &psref);
		if (var->iv_pdst == NULL) {
			error = EADDRNOTAVAIL;
			goto bad;
		}
		src = var->iv_pdst;
		switch (cmd) {
#ifdef INET
		case SIOCGIFPDSTADDR:
			dst = &ifr->ifr_addr;
			size = sizeof(ifr->ifr_addr);
			break;
#endif /* INET */
#ifdef INET6
		case SIOCGIFPDSTADDR_IN6:
			dst = (struct sockaddr *)
				&(((struct in6_ifreq *)data)->ifr_addr);
			size = sizeof(((struct in6_ifreq *)data)->ifr_addr);
			break;
#endif /* INET6 */
		default:
			error = EADDRNOTAVAIL;
			goto bad;
		}
		/* The stored address must fit into the request field. */
		if (src->sa_len > size) {
			error = EINVAL;
			goto bad;
		}
		error = IF_IPSEC_GATHER_PDST_ADDR_PORT(var, dst);
		if (error)
			goto bad;
		if_ipsec_putref_variant(var, &psref);
		curlwp_bindx(bound);
		break;

	case SIOCGLIFPHYADDR:
		/* Report both outer addresses in one if_laddrreq. */
		bound = curlwp_bind();
		var = if_ipsec_getref_variant(sc, &psref);
		if (if_ipsec_variant_is_unconfigured(var)) {
			error = EADDRNOTAVAIL;
			goto bad;
		}

		/* copy src */
		src = var->iv_psrc;
		dst = (struct sockaddr *)
			&(((struct if_laddrreq *)data)->addr);
		size = sizeof(((struct if_laddrreq *)data)->addr);
		if (src->sa_len > size) {
			error = EINVAL;
			goto bad;
		}
		error = IF_IPSEC_GATHER_PSRC_ADDR_PORT(var, dst);
		if (error)
			goto bad;

		/* copy dst */
		src = var->iv_pdst;
		dst = (struct sockaddr *)
			&(((struct if_laddrreq *)data)->dstaddr);
		size = sizeof(((struct if_laddrreq *)data)->dstaddr);
		if (src->sa_len > size) {
			error = EINVAL;
			goto bad;
		}
		error = IF_IPSEC_GATHER_PDST_ADDR_PORT(var, dst);
		if (error)
			goto bad;
		if_ipsec_putref_variant(var, &psref);
		curlwp_bindx(bound);
		break;

	default:
		error = ifioctl_common(ifp, cmd, data);
		if (!error) {
			/* React to a change of IFF_NAT_T / IFF_FWD_IPV6. */
			bound = curlwp_bind();
			error = if_ipsec_ensure_flags(&sc->ipsec_if, oflags);
			if (error)
				goto bad;
			curlwp_bindx(bound);
		}
		break;
	}
	return error;

bad:
	/*
	 * Error exit for paths that called curlwp_bind(); var is non-NULL
	 * only when a psref on the variant is still held.
	 */
	if (var != NULL)
		if_ipsec_putref_variant(var, &psref);
	curlwp_bindx(bound);

	return error;
}
809 
/*
 * Per address family operation table: one function pointer per
 * compiled-in outer protocol, each taking the variant to operate on and
 * returning an int error code.  Used by if_ipsec_encap_common().
 */
struct encap_funcs {
#ifdef INET
	int (*ef_inet)(struct ipsec_variant *);		/* outer AF_INET */
#endif
#ifdef INET6
	int (*ef_inet6)(struct ipsec_variant *);	/* outer AF_INET6 */
#endif
};
818 
819 static struct encap_funcs ipsec_encap_attach = {
820 #ifdef INET
821 	.ef_inet = ipsecif4_attach,
822 #endif
823 #ifdef INET6
824 	.ef_inet6 = &ipsecif6_attach,
825 #endif
826 };
827 
828 static struct encap_funcs ipsec_encap_detach = {
829 #ifdef INET
830 	.ef_inet = ipsecif4_detach,
831 #endif
832 #ifdef INET6
833 	.ef_inet6 = &ipsecif6_detach,
834 #endif
835 };
836 
837 static int
838 if_ipsec_encap_common(struct ipsec_variant *var, struct encap_funcs *funcs)
839 {
840 	int error;
841 
842 	KASSERT(var != NULL);
843 	KASSERT(if_ipsec_variant_is_configured(var));
844 
845 	switch (var->iv_psrc->sa_family) {
846 #ifdef INET
847 	case AF_INET:
848 		error = (funcs->ef_inet)(var);
849 		break;
850 #endif /* INET */
851 #ifdef INET6
852 	case AF_INET6:
853 		error = (funcs->ef_inet6)(var);
854 		break;
855 #endif /* INET6 */
856 	default:
857 		error = EINVAL;
858 		break;
859 	}
860 
861 	return error;
862 }
863 
864 static int
865 if_ipsec_encap_attach(struct ipsec_variant *var)
866 {
867 
868 	return if_ipsec_encap_common(var, &ipsec_encap_attach);
869 }
870 
871 static int
872 if_ipsec_encap_detach(struct ipsec_variant *var)
873 {
874 
875 	return if_ipsec_encap_common(var, &ipsec_encap_detach);
876 }
877 
/*
 * Validate and set ipsec(4) I/F configurations.
 *     (1) validate
 *         (1-1) Check whether the argument src and dst address pair would
 *               change the configuration from the current address pair.
 *         (1-2) Check whether any other ipsec(4) I/F already uses the same
 *               src and dst address pair, except for NAT-T shared tunnels.
 *     (2) set
 *         (2-1) Create variant for new configuration.
 *         (2-2) Create temporary "null" variant used to avoid accessing a
 *               dangling variant while SPs are deleted and added.
 *         (2-3) Swap the variant, including its SPs.
 *         (2-4) Clean up the previous configuration.
 *
 * Returns 0 on success, including the case where nothing had to change,
 * and an errno value otherwise: the error from encap_lock_enter(), EINVAL
 * for an unsupported address family, EADDRNOTAVAIL for a duplicated
 * address pair, or the error from if_ipsec_encap_attach() /
 * if_ipsec_update_variant().  On every error path the previous
 * configuration is left in place and all temporaries are freed.
 */
static int
if_ipsec_set_tunnel(struct ifnet *ifp,
    struct sockaddr *src, struct sockaddr *dst)
{
	struct ipsec_softc *sc = ifp->if_softc;
	struct ipsec_softc *sc2;
	struct ipsec_variant *ovar, *nvar, *nullvar;
	struct sockaddr *osrc, *odst;
	struct sockaddr *nsrc, *ndst;
	in_port_t nsport = 0, ndport = 0;
	int error;

	error = encap_lock_enter();
	if (error)
		return error;

	/* Allocate everything before taking the softc lock. */
	nsrc = sockaddr_dup(src, M_WAITOK);
	ndst = sockaddr_dup(dst, M_WAITOK);
	nvar = kmem_zalloc(sizeof(*nvar), KM_SLEEP);
	nullvar = kmem_zalloc(sizeof(*nullvar), KM_SLEEP);

	mutex_enter(&sc->ipsec_lock);

	ovar = sc->ipsec_var;

	/*
	 * Split the ports off the addresses: the private copies nsrc/ndst
	 * get port 0, the requested ports are kept in nsport/ndport.
	 */
	switch(nsrc->sa_family) {
#ifdef INET
	case AF_INET:
		nsport = satosin(src)->sin_port;
		/*
		 * Avoid confusing the SP when NAT-T is disabled,
		 * e.g.
		 *     expected: 10.0.1.2[any] 10.0.1.1[any] 4(ipv4)
		 *     confused: 10.0.1.2[600] 10.0.1.1[600] 4(ipv4)
		 */
		satosin(nsrc)->sin_port = 0;
		ndport = satosin(dst)->sin_port;
		satosin(ndst)->sin_port = 0;
		break;
#endif /* INET */
#ifdef INET6
	case AF_INET6:
		nsport = satosin6(src)->sin6_port;
		satosin6(nsrc)->sin6_port = 0;
		ndport = satosin6(dst)->sin6_port;
		satosin6(ndst)->sin6_port = 0;
		break;
#endif /* INET6 */
	default:
		log(LOG_DEBUG,
		    "%s: Invalid address family: %d.\n",
		    __func__, src->sa_family);
		error = EINVAL;
		goto out;
	}

	/*
	 * (1-1) Check whether the argument src and dst address pair would
	 *       change the configuration from the current address pair.
	 *
	 * NOTE(review): the stored iv_psrc/iv_pdst had their ports zeroed
	 * above when they were set, while src/dst here still carry the
	 * caller's ports -- confirm sockaddr_cmp() treats those as equal
	 * when non-zero ports are requested.
	 */
	if ((ovar->iv_pdst && sockaddr_cmp(ovar->iv_pdst, dst) == 0) &&
	    (ovar->iv_psrc && sockaddr_cmp(ovar->iv_psrc, src) == 0) &&
	    (ovar->iv_sport == nsport && ovar->iv_dport == ndport)) {
		/* address and port pair not changed. */
		error = 0;
		goto out;
	}

	/*
	 * (1-2) Check whether any other ipsec(4) I/F already uses the same
	 *       src and dst address pair, except for NAT-T shared tunnels.
	 */
	mutex_enter(&ipsec_softcs.lock);
	LIST_FOREACH(sc2, &ipsec_softcs.list, ipsec_list) {
		struct ipsec_variant *var2;
		struct psref psref;

		if (sc2 == sc)
			continue;
		var2 = if_ipsec_getref_variant(sc2, &psref);
		if (if_ipsec_variant_is_unconfigured(var2)) {
			if_ipsec_putref_variant(var2, &psref);
			continue;
		}
		if (if_ipsec_nat_t(sc) || if_ipsec_nat_t(sc2)) {
			if_ipsec_putref_variant(var2, &psref);
			continue; /* NAT-T shared tunnel */
		}
		if (sockaddr_cmp(var2->iv_pdst, dst) == 0 &&
		    sockaddr_cmp(var2->iv_psrc, src) == 0) {
			/* Drop the reference and the list lock before bailing. */
			if_ipsec_putref_variant(var2, &psref);
			mutex_exit(&ipsec_softcs.lock);
			error = EADDRNOTAVAIL;
			goto out;
		}

		if_ipsec_putref_variant(var2, &psref);
		/* XXX both end must be valid? (I mean, not 0.0.0.0) */
	}
	mutex_exit(&ipsec_softcs.lock);


	/* Remember the old addresses; they are freed in (2-4). */
	osrc = ovar->iv_psrc;
	odst = ovar->iv_pdst;

	/*
	 * (2-1) Create ipsec_variant for new configuration.
	 */
	if_ipsec_copy_variant(nvar, ovar);
	nvar->iv_psrc = nsrc;
	nvar->iv_pdst = ndst;
	nvar->iv_sport = nsport;
	nvar->iv_dport = ndport;
	/* The new variant gets its own encap cookies from the attach below. */
	nvar->iv_encap_cookie4 = NULL;
	nvar->iv_encap_cookie6 = NULL;
	psref_target_init(&nvar->iv_psref, iv_psref_class);
	error = if_ipsec_encap_attach(nvar);
	if (error)
		goto out;

	/*
	 * (2-2) Create temporary "null" variant.
	 */
	if_ipsec_copy_variant(nullvar, ovar);
	if_ipsec_clear_config(nullvar);
	psref_target_init(&nullvar->iv_psref, iv_psref_class);
	membar_producer();
	/*
	 * (2-3) Swap the variant, including its SPs.
	 */
	error = if_ipsec_update_variant(sc, nvar, nullvar);
	if (error) {
		/* Undo the attach from (2-1) before freeing nvar. */
		if_ipsec_encap_detach(nvar);
		goto out;
	}

	mutex_exit(&sc->ipsec_lock);

	/*
	 * (2-4) Clean up the previous configuration.
	 *
	 * nvar, nsrc and ndst are not freed on this path; presumably
	 * if_ipsec_update_variant() installed nvar as sc->ipsec_var.
	 */
	if (if_ipsec_variant_is_configured(ovar))
		if_ipsec_encap_detach(ovar);
	encap_lock_exit();

	if (osrc != NULL)
		sockaddr_free(osrc);
	if (odst != NULL)
		sockaddr_free(odst);
	kmem_free(ovar, sizeof(*ovar));
	kmem_free(nullvar, sizeof(*nullvar));

	return 0;

out:
	/* Nothing was installed: release the locks and every temporary. */
	mutex_exit(&sc->ipsec_lock);
	encap_lock_exit();

	sockaddr_free(nsrc);
	sockaddr_free(ndst);
	kmem_free(nvar, sizeof(*nvar));
	kmem_free(nullvar, sizeof(*nullvar));

	return error;
}
1059 
1060 /*
1061  * Validate and delete ipsec(4) I/F configurations.
1062  *     (1) validate
1063  *         (1-1) Check current src and dst address pair are null,
1064  *               which means the ipsec(4) I/F is already done deletetunnel.
1065  *     (2) delete
1066  *         (2-1) Create variant for deleted status.
1067  *         (2-2) Create temporary "null" variant used to avoid to access
1068  *               dangling variant while SPs are deleted and added.
1069  *               NOTE:
1070  *               The contents of temporary "null" variant equal to the variant
1071  *               of (2-1), however two psref_target_destroy() synchronization
1072  *               points are necessary to avoid to access dangling variant
1073  *               while SPs are deleted and added. To implement that simply,
1074  *               we use the same manner as if_ipsec_set_tunnel(), that is,
1075  *               create extra "null" variant and use it temporarily.
1076  *         (2-3) Swap variant include its SPs.
1077  *         (2-4) Cleanup last configurations.
1078  */
1079 static void
1080 if_ipsec_delete_tunnel(struct ifnet *ifp)
1081 {
1082 	struct ipsec_softc *sc = ifp->if_softc;
1083 	struct ipsec_variant *ovar, *nvar, *nullvar;
1084 	struct sockaddr *osrc, *odst;
1085 	int error;
1086 
1087 	error = encap_lock_enter();
1088 	if (error)
1089 		return;
1090 
1091 	nvar = kmem_zalloc(sizeof(*nvar), KM_SLEEP);
1092 	nullvar = kmem_zalloc(sizeof(*nullvar), KM_SLEEP);
1093 
1094 	mutex_enter(&sc->ipsec_lock);
1095 
1096 	ovar = sc->ipsec_var;
1097 	osrc = ovar->iv_psrc;
1098 	odst = ovar->iv_pdst;
1099 	/*
1100 	 * (1-1) Check current src and dst address pair are null,
1101 	 *       which means the ipsec(4) I/F is already done deletetunnel.
1102 	 */
1103 	if (osrc == NULL || odst == NULL) {
1104 		/* address pair not changed. */
1105 		mutex_exit(&sc->ipsec_lock);
1106 		encap_lock_exit();
1107 		kmem_free(nvar, sizeof(*nvar));
1108 		return;
1109 	}
1110 
1111 	/*
1112 	 * (2-1) Create variant for deleted status.
1113 	 */
1114 	if_ipsec_copy_variant(nvar, ovar);
1115 	if_ipsec_clear_config(nvar);
1116 	psref_target_init(&nvar->iv_psref, iv_psref_class);
1117 
1118 	/*
1119 	 * (2-2) Create temporary "null" variant used to avoid to access
1120 	 *       dangling variant while SPs are deleted and added.
1121 	 */
1122 	if_ipsec_copy_variant(nullvar, ovar);
1123 	if_ipsec_clear_config(nullvar);
1124 	psref_target_init(&nullvar->iv_psref, iv_psref_class);
1125 	membar_producer();
1126 	/*
1127 	 * (2-3) Swap variant include its SPs.
1128 	 */
1129 	/* if_ipsec_update_variant() does not fail when delete SP only. */
1130 	(void)if_ipsec_update_variant(sc, nvar, nullvar);
1131 
1132 	mutex_exit(&sc->ipsec_lock);
1133 
1134 	/*
1135 	 * (2-4) Cleanup last configurations.
1136 	 */
1137 	if (if_ipsec_variant_is_configured(ovar))
1138 		if_ipsec_encap_detach(ovar);
1139 	encap_lock_exit();
1140 
1141 	sockaddr_free(osrc);
1142 	sockaddr_free(odst);
1143 	kmem_free(ovar, sizeof(*ovar));
1144 	kmem_free(nullvar, sizeof(*nullvar));
1145 }
1146 
1147 /*
1148  * Check IFF_NAT_T and IFF_FWD_IPV6 flags, therefore update SPs if needed.
1149  *     (1) check
1150  *         (1-1) Check flags are changed.
1151  *         (1-2) Check current src and dst address pair. If they are null,
1152  *               that means the ipsec(4) I/F is deletetunnel'ed, so it is
1153  *               not needed to update.
1154  *     (2) update
1155  *         (2-1) Create variant for new SPs.
1156  *         (2-2) Create temporary "null" variant used to avoid to access
1157  *               dangling variant while SPs are deleted and added.
1158  *               NOTE:
1159  *               There is the same problem as if_ipsec_delete_tunnel().
1160  *         (2-3) Swap variant include its SPs.
1161  *         (2-4) Cleanup unused configurations.
1162  *               NOTE: use the same encap_cookies.
1163  */
1164 static int
1165 if_ipsec_ensure_flags(struct ifnet *ifp, short oflags)
1166 {
1167 	struct ipsec_softc *sc = ifp->if_softc;
1168 	struct ipsec_variant *ovar, *nvar, *nullvar;
1169 	int error;
1170 
1171 	/*
1172 	 * (1) Check flags are changed.
1173 	 */
1174 	if ((oflags & (IFF_NAT_T|IFF_FWD_IPV6)) ==
1175 	    (ifp->if_flags & (IFF_NAT_T|IFF_FWD_IPV6)))
1176 		return 0; /* flags not changed. */
1177 
1178 	error = encap_lock_enter();
1179 	if (error)
1180 		return error;
1181 
1182 	nvar = kmem_zalloc(sizeof(*nvar), KM_SLEEP);
1183 	nullvar = kmem_zalloc(sizeof(*nullvar), KM_SLEEP);
1184 
1185 	mutex_enter(&sc->ipsec_lock);
1186 
1187 	ovar = sc->ipsec_var;
1188 	/*
1189 	 * (1-2) Check current src and dst address pair.
1190 	 */
1191 	if (if_ipsec_variant_is_unconfigured(ovar)) {
1192 		/* nothing to do */
1193 		mutex_exit(&sc->ipsec_lock);
1194 		encap_lock_exit();
1195 		return 0;
1196 	}
1197 
1198 	/*
1199 	 * (2-1) Create variant for new SPs.
1200 	 */
1201 	if_ipsec_copy_variant(nvar, ovar);
1202 	psref_target_init(&nvar->iv_psref, iv_psref_class);
1203 	/*
1204 	 * (2-2) Create temporary "null" variant used to avoid to access
1205 	 *       dangling variant while SPs are deleted and added.
1206 	 */
1207 	if_ipsec_copy_variant(nullvar, ovar);
1208 	if_ipsec_clear_config(nullvar);
1209 	psref_target_init(&nullvar->iv_psref, iv_psref_class);
1210 	membar_producer();
1211 	/*
1212 	 * (2-3) Swap variant include its SPs.
1213 	 */
1214 	error = if_ipsec_update_variant(sc, nvar, nullvar);
1215 
1216 	mutex_exit(&sc->ipsec_lock);
1217 	encap_lock_exit();
1218 
1219 	/*
1220 	 * (2-4) Cleanup unused configurations.
1221 	 */
1222 	if (!error)
1223 		kmem_free(ovar, sizeof(*ovar));
1224 	else
1225 		kmem_free(nvar, sizeof(*ovar));
1226 	kmem_free(nullvar, sizeof(*nullvar));
1227 
1228 	return error;
1229 }
1230 
1231 /*
1232  * SPD management
1233  */
1234 
1235 /*
1236  * Share SP set with other NAT-T ipsec(4) I/F(s).
1237  *     Return 1, when "var" shares SP set.
1238  *     Return 0, when "var" cannot share SP set.
1239  *
1240  * NOTE:
1241  * if_ipsec_share_sp() and if_ipsec_unshare_sp() would require global lock
1242  * to exclude other ipsec(4) I/Fs set_tunnel/delete_tunnel. E.g. when ipsec0
1243  * and ipsec1 can share SP set, running ipsec0's set_tunnel and ipsec1's
1244  * set_tunnel causes race.
1245  * Currently, (fortunately) encap_lock works as this global lock.
1246  */
1247 static int
1248 if_ipsec_share_sp(struct ipsec_variant *var)
1249 {
1250 	struct ipsec_softc *sc = var->iv_softc;
1251 	struct ipsec_softc *sc2;
1252 	struct ipsec_variant *var2;
1253 	struct psref psref;
1254 
1255 	KASSERT(encap_lock_held());
1256 	KASSERT(var->iv_psrc != NULL && var->iv_pdst != NULL);
1257 
1258 	mutex_enter(&ipsec_softcs.lock);
1259 	LIST_FOREACH(sc2, &ipsec_softcs.list, ipsec_list) {
1260 		if (sc2 == sc)
1261 			continue;
1262 		var2 = if_ipsec_getref_variant(sc2, &psref);
1263 		if (if_ipsec_variant_is_unconfigured(var2)) {
1264 			if_ipsec_putref_variant(var2, &psref);
1265 			continue;
1266 		}
1267 		if (sockaddr_cmp(var2->iv_pdst, var->iv_pdst) != 0 ||
1268 		    sockaddr_cmp(var2->iv_psrc, var->iv_psrc) != 0) {
1269 			if_ipsec_putref_variant(var2, &psref);
1270 			continue;
1271 		}
1272 
1273 		break;
1274 	}
1275 	mutex_exit(&ipsec_softcs.lock);
1276 	if (sc2 == NULL)
1277 		return 0; /* not shared */
1278 
1279 	IV_SP_IN(var) = IV_SP_IN(var2);
1280 	IV_SP_IN6(var) = IV_SP_IN6(var2);
1281 	IV_SP_OUT(var) = IV_SP_OUT(var2);
1282 	IV_SP_OUT6(var) = IV_SP_OUT6(var2);
1283 
1284 	if_ipsec_putref_variant(var2, &psref);
1285 	return 1; /* shared */
1286 }
1287 
1288 /*
1289  * Unshare SP set with other NAT-T ipsec(4) I/F(s).
1290  *     Return 1, when "var" shared SP set, and then unshare them.
1291  *     Return 0, when "var" did not share SP set.
1292  *
1293  * NOTE:
1294  * See if_ipsec_share_sp()'s note.
1295  */
1296 static int
1297 if_ipsec_unshare_sp(struct ipsec_variant *var)
1298 {
1299 	struct ipsec_softc *sc = var->iv_softc;
1300 	struct ipsec_softc *sc2;
1301 	struct ipsec_variant *var2;
1302 	struct psref psref;
1303 
1304 	KASSERT(encap_lock_held());
1305 
1306 	if (!var->iv_pdst || !var->iv_psrc)
1307 		return 0;
1308 
1309 	mutex_enter(&ipsec_softcs.lock);
1310 	LIST_FOREACH(sc2, &ipsec_softcs.list, ipsec_list) {
1311 		if (sc2 == sc)
1312 			continue;
1313 		var2 = if_ipsec_getref_variant(sc2, &psref);
1314 		if (!var2->iv_pdst || !var2->iv_psrc) {
1315 			if_ipsec_putref_variant(var2, &psref);
1316 			continue;
1317 		}
1318 		if (sockaddr_cmp(var2->iv_pdst, var->iv_pdst) != 0 ||
1319 		    sockaddr_cmp(var2->iv_psrc, var->iv_psrc) != 0) {
1320 			if_ipsec_putref_variant(var2, &psref);
1321 			continue;
1322 		}
1323 
1324 		break;
1325 	}
1326 	mutex_exit(&ipsec_softcs.lock);
1327 	if (sc2 == NULL)
1328 		return 0; /* not shared */
1329 
1330 	IV_SP_IN(var) = NULL;
1331 	IV_SP_IN6(var) = NULL;
1332 	IV_SP_OUT(var) = NULL;
1333 	IV_SP_OUT6(var) = NULL;
1334 	if_ipsec_putref_variant(var2, &psref);
1335 	return 1; /* shared */
1336 }
1337 
1338 static inline void
1339 if_ipsec_add_mbuf_optalign(struct mbuf *m0, void *data, size_t len, bool align)
1340 {
1341 	struct mbuf *m;
1342 
1343 	MGET(m, M_WAIT, MT_DATA);
1344 	if (align) {
1345 		m->m_len = PFKEY_ALIGN8(len);
1346 		memset(mtod(m, void *), 0, m->m_len);
1347 	} else
1348 		m->m_len = len;
1349 	m_copyback(m, 0, len, data);
1350 	m_cat(m0, m);
1351 }
1352 
/*
 * Append "len" bytes of "data" to "m0"; shorthand for
 * if_ipsec_add_mbuf_optalign() with alignment requested.
 */
static inline void
if_ipsec_add_mbuf(struct mbuf *m0, void *data, size_t len)
{
	const bool pad_to_8 = true;

	if_ipsec_add_mbuf_optalign(m0, data, len, pad_to_8);
}
1359 
1360 static inline void
1361 if_ipsec_add_mbuf_addr_port(struct mbuf *m0, struct sockaddr *addr, in_port_t port, bool align)
1362 {
1363 
1364 	if (port == 0) {
1365 		if_ipsec_add_mbuf_optalign(m0, addr, addr->sa_len, align);
1366 	} else {
1367 		union sockaddr_union addrport_u;
1368 		struct sockaddr *addrport = &addrport_u.sa;
1369 
1370 		if_ipsec_set_addr_port(addrport, addr, port);
1371 		if_ipsec_add_mbuf_optalign(m0, addrport, addrport->sa_len, align);
1372 	}
1373 }
1374 
1375 static inline void
1376 if_ipsec_add_pad(struct mbuf *m0, size_t len)
1377 {
1378 	struct mbuf *m;
1379 
1380 	if (len == 0)
1381 		return;
1382 
1383 	MGET(m, M_WAIT, MT_DATA);
1384 	m->m_len = len;
1385 	memset(mtod(m, void *), 0, m->m_len);
1386 	m_cat(m0, m);
1387 }
1388 
1389 static inline size_t
1390 if_ipsec_set_sadb_addr(struct sadb_address *saaddr, struct sockaddr *addr,
1391     int proto, uint16_t exttype)
1392 {
1393 	size_t size;
1394 
1395 	KASSERT(saaddr != NULL);
1396 	KASSERT(addr != NULL);
1397 
1398 	size = sizeof(*saaddr) + PFKEY_ALIGN8(addr->sa_len);
1399 	saaddr->sadb_address_len = PFKEY_UNIT64(size);
1400 	saaddr->sadb_address_exttype = exttype;
1401 	saaddr->sadb_address_proto = proto;
1402 	switch (addr->sa_family) {
1403 #ifdef INET
1404 	case AF_INET:
1405 		saaddr->sadb_address_prefixlen = sizeof(struct in_addr) << 3;
1406 		break;
1407 #endif /* INET */
1408 #ifdef INET6
1409 	case AF_INET6:
1410 		saaddr->sadb_address_prefixlen = sizeof(struct in6_addr) << 3;
1411 		break;
1412 #endif /* INET6 */
1413 	default:
1414 		log(LOG_DEBUG,
1415 		    "%s: Invalid address family: %d.\n",
1416 		    __func__, addr->sa_family);
1417 		break;
1418 	}
1419 	saaddr->sadb_address_reserved = 0;
1420 
1421 	return size;
1422 }
1423 
1424 static inline size_t
1425 if_ipsec_set_sadb_src(struct sadb_address *sasrc, struct sockaddr *src,
1426     int proto)
1427 {
1428 
1429 	return if_ipsec_set_sadb_addr(sasrc, src, proto,
1430 	    SADB_EXT_ADDRESS_SRC);
1431 }
1432 
1433 static inline size_t
1434 if_ipsec_set_sadb_dst(struct sadb_address *sadst, struct sockaddr *dst,
1435     int proto)
1436 {
1437 
1438 	return if_ipsec_set_sadb_addr(sadst, dst, proto,
1439 	    SADB_EXT_ADDRESS_DST);
1440 }
1441 
1442 static inline size_t
1443 if_ipsec_set_sadb_x_policy(struct sadb_x_policy *xpl,
1444     struct sadb_x_ipsecrequest *xisr, uint16_t policy, uint8_t dir, uint32_t id,
1445     uint8_t level, struct sockaddr *src, struct sockaddr *dst)
1446 {
1447 	size_t size;
1448 
1449 	KASSERT(policy != IPSEC_POLICY_IPSEC || xisr != NULL);
1450 
1451 	size = sizeof(*xpl);
1452 	if (policy == IPSEC_POLICY_IPSEC) {
1453 		size += PFKEY_ALIGN8(sizeof(*xisr));
1454 		if (src != NULL && dst != NULL)
1455 			size += PFKEY_ALIGN8(src->sa_len + dst->sa_len);
1456 	}
1457 	xpl->sadb_x_policy_len = PFKEY_UNIT64(size);
1458 	xpl->sadb_x_policy_exttype = SADB_X_EXT_POLICY;
1459 	xpl->sadb_x_policy_type = policy;
1460 	xpl->sadb_x_policy_dir = dir;
1461 	xpl->sadb_x_policy_reserved = 0;
1462 	xpl->sadb_x_policy_id = id;
1463 	xpl->sadb_x_policy_reserved2 = 0;
1464 
1465 	if (policy == IPSEC_POLICY_IPSEC) {
1466 		xisr->sadb_x_ipsecrequest_len = PFKEY_ALIGN8(sizeof(*xisr));
1467 		if (src != NULL && dst != NULL)
1468 			xisr->sadb_x_ipsecrequest_len +=
1469 				PFKEY_ALIGN8(src->sa_len + dst->sa_len);
1470 		xisr->sadb_x_ipsecrequest_proto = IPPROTO_ESP;
1471 		xisr->sadb_x_ipsecrequest_mode = IPSEC_MODE_TRANSPORT;
1472 		xisr->sadb_x_ipsecrequest_level = level;
1473 		xisr->sadb_x_ipsecrequest_reqid = key_newreqid();
1474 	}
1475 
1476 	return size;
1477 }
1478 
1479 static inline void
1480 if_ipsec_set_sadb_msg(struct sadb_msg *msg, uint16_t extlen, uint8_t msgtype)
1481 {
1482 
1483 	KASSERT(msg != NULL);
1484 
1485 	msg->sadb_msg_version = PF_KEY_V2;
1486 	msg->sadb_msg_type = msgtype;
1487 	msg->sadb_msg_errno = 0;
1488 	msg->sadb_msg_satype = SADB_SATYPE_UNSPEC;
1489 	msg->sadb_msg_len = PFKEY_UNIT64(sizeof(*msg)) + extlen;
1490 	msg->sadb_msg_reserved = 0;
1491 	msg->sadb_msg_seq = 0; /* XXXX */
1492 	msg->sadb_msg_pid = 0; /* XXXX */
1493 }
1494 
1495 static inline void
1496 if_ipsec_set_sadb_msg_add(struct sadb_msg *msg, uint16_t extlen)
1497 {
1498 
1499 	if_ipsec_set_sadb_msg(msg, extlen, SADB_X_SPDADD);
1500 }
1501 
1502 static inline void
1503 if_ipsec_set_sadb_msg_del(struct sadb_msg *msg, uint16_t extlen)
1504 {
1505 
1506 	if_ipsec_set_sadb_msg(msg, extlen, SADB_X_SPDDELETE2);
1507 }
1508 
1509 static int
1510 if_ipsec_set_addr_port(struct sockaddr *addrport, struct sockaddr *addr,
1511     in_port_t port)
1512 {
1513 	int error = 0;
1514 
1515 	sockaddr_copy(addrport, addr->sa_len, addr);
1516 
1517 	switch (addr->sa_family) {
1518 #ifdef INET
1519 	case AF_INET: {
1520 		struct sockaddr_in *sin = satosin(addrport);
1521 		sin->sin_port = port;
1522 		break;
1523 	}
1524 #endif /* INET */
1525 #ifdef INET6
1526 	case AF_INET6: {
1527 		struct sockaddr_in6 *sin6 = satosin6(addrport);
1528 		sin6->sin6_port = port;
1529 		break;
1530 	}
1531 #endif /* INET6 */
1532 	default:
1533 		log(LOG_DEBUG,
1534 		    "%s: Invalid address family: %d.\n",
1535 		    __func__, addr->sa_family);
1536 		error = EINVAL;
1537 	}
1538 
1539 	return error;
1540 }
1541 
1542 static struct secpolicy *
1543 if_ipsec_add_sp0(struct sockaddr *src, in_port_t sport,
1544     struct sockaddr *dst, in_port_t dport,
1545     int dir, int proto, int level, u_int policy)
1546 {
1547 	struct sadb_msg msg;
1548 	struct sadb_address xsrc, xdst;
1549 	struct sadb_x_policy xpl;
1550 	struct sadb_x_ipsecrequest xisr;
1551 	size_t size;
1552 	size_t padlen;
1553 	uint16_t ext_msg_len = 0;
1554 	struct mbuf *m;
1555 
1556 	memset(&msg, 0, sizeof(msg));
1557 	memset(&xsrc, 0, sizeof(xsrc));
1558 	memset(&xdst, 0, sizeof(xdst));
1559 	memset(&xpl, 0, sizeof(xpl));
1560 	memset(&xisr, 0, sizeof(xisr));
1561 
1562 	MGETHDR(m, M_WAIT, MT_DATA);
1563 
1564 	size = if_ipsec_set_sadb_src(&xsrc, src, proto);
1565 	ext_msg_len += PFKEY_UNIT64(size);
1566 	size = if_ipsec_set_sadb_dst(&xdst, dst, proto);
1567 	ext_msg_len += PFKEY_UNIT64(size);
1568 	size = if_ipsec_set_sadb_x_policy(&xpl, &xisr, policy, dir, 0, level, src, dst);
1569 	ext_msg_len += PFKEY_UNIT64(size);
1570 	if_ipsec_set_sadb_msg_add(&msg, ext_msg_len);
1571 
1572 	/* build PF_KEY message */
1573 
1574 	m->m_len = sizeof(msg);
1575 	m_copyback(m, 0, sizeof(msg), &msg);
1576 
1577 	if_ipsec_add_mbuf(m, &xsrc, sizeof(xsrc));
1578 	/*
1579 	 * secpolicy.spidx.{src, dst} must not be set port number,
1580 	 * even if it is used for NAT-T.
1581 	 */
1582 	if_ipsec_add_mbuf_addr_port(m, src, 0, true);
1583 	padlen = PFKEY_UNUNIT64(xsrc.sadb_address_len)
1584 		- (sizeof(xsrc) + PFKEY_ALIGN8(src->sa_len));
1585 	if_ipsec_add_pad(m, padlen);
1586 
1587 	if_ipsec_add_mbuf(m, &xdst, sizeof(xdst));
1588 	/* ditto */
1589 	if_ipsec_add_mbuf_addr_port(m, dst, 0, true);
1590 	padlen = PFKEY_UNUNIT64(xdst.sadb_address_len)
1591 		- (sizeof(xdst) + PFKEY_ALIGN8(dst->sa_len));
1592 	if_ipsec_add_pad(m, padlen);
1593 
1594 	if_ipsec_add_mbuf(m, &xpl, sizeof(xpl));
1595 	padlen = PFKEY_UNUNIT64(xpl.sadb_x_policy_len) - sizeof(xpl);
1596 	if (policy == IPSEC_POLICY_IPSEC) {
1597 		if_ipsec_add_mbuf(m, &xisr, sizeof(xisr));
1598 		/*
1599 		 * secpolicy.req->saidx.{src, dst} must be set port number,
1600 		 * when it is used for NAT-T.
1601 		 */
1602 		if_ipsec_add_mbuf_addr_port(m, src, sport, false);
1603 		if_ipsec_add_mbuf_addr_port(m, dst, dport, false);
1604 		padlen -= PFKEY_ALIGN8(sizeof(xisr));
1605 		padlen -= PFKEY_ALIGN8(src->sa_len + dst->sa_len);
1606 	}
1607 	if_ipsec_add_pad(m, padlen);
1608 
1609 	/* key_kpi_spdadd() has already done KEY_SP_REF(). */
1610 	return key_kpi_spdadd(m);
1611 }
1612 
1613 static int
1614 if_ipsec_add_sp(struct ipsec_variant *var,
1615     struct sockaddr *src, in_port_t sport,
1616     struct sockaddr *dst, in_port_t dport)
1617 {
1618 	struct ipsec_softc *sc = var->iv_softc;
1619 	int level;
1620 	u_int v6policy;
1621 
1622 	/*
1623 	 * must delete sp before add it.
1624 	 */
1625 	KASSERT(IV_SP_IN(var) == NULL);
1626 	KASSERT(IV_SP_OUT(var) == NULL);
1627 	KASSERT(IV_SP_IN6(var) == NULL);
1628 	KASSERT(IV_SP_OUT6(var) == NULL);
1629 
1630 	/*
1631 	 * can be shared?
1632 	 */
1633 	if (if_ipsec_share_sp(var))
1634 		return 0;
1635 
1636 	if (if_ipsec_nat_t(sc))
1637 		level = IPSEC_LEVEL_REQUIRE;
1638 	else
1639 		level = IPSEC_LEVEL_UNIQUE;
1640 
1641 	if (if_ipsec_fwd_ipv6(sc))
1642 		v6policy = IPSEC_POLICY_IPSEC;
1643 	else
1644 		v6policy = IPSEC_POLICY_DISCARD;
1645 
1646 	IV_SP_IN(var) = if_ipsec_add_sp0(dst, dport, src, sport,
1647 	    IPSEC_DIR_INBOUND, IPPROTO_IPIP, level, IPSEC_POLICY_IPSEC);
1648 	if (IV_SP_IN(var) == NULL)
1649 		goto fail;
1650 	IV_SP_OUT(var) = if_ipsec_add_sp0(src, sport, dst, dport,
1651 	    IPSEC_DIR_OUTBOUND, IPPROTO_IPIP, level, IPSEC_POLICY_IPSEC);
1652 	if (IV_SP_OUT(var) == NULL)
1653 		goto fail;
1654 	IV_SP_IN6(var) = if_ipsec_add_sp0(dst, dport, src, sport,
1655 	    IPSEC_DIR_INBOUND, IPPROTO_IPV6, level, v6policy);
1656 	if (IV_SP_IN6(var) == NULL)
1657 		goto fail;
1658 	IV_SP_OUT6(var) = if_ipsec_add_sp0(src, sport, dst, dport,
1659 	    IPSEC_DIR_OUTBOUND, IPPROTO_IPV6, level, v6policy);
1660 	if (IV_SP_OUT6(var) == NULL)
1661 		goto fail;
1662 
1663 	return 0;
1664 
1665 fail:
1666 	if (IV_SP_IN6(var) != NULL) {
1667 		if_ipsec_del_sp0(IV_SP_IN6(var));
1668 		IV_SP_IN6(var) = NULL;
1669 	}
1670 	if (IV_SP_OUT(var) != NULL) {
1671 		if_ipsec_del_sp0(IV_SP_OUT(var));
1672 		IV_SP_OUT(var) = NULL;
1673 	}
1674 	if (IV_SP_IN(var) != NULL) {
1675 		if_ipsec_del_sp0(IV_SP_IN(var));
1676 		IV_SP_IN(var) = NULL;
1677 	}
1678 
1679 	return EEXIST;
1680 }
1681 
1682 static int
1683 if_ipsec_del_sp0(struct secpolicy *sp)
1684 {
1685 	struct sadb_msg msg;
1686 	struct sadb_x_policy xpl;
1687 	size_t size;
1688 	uint16_t ext_msg_len = 0;
1689 	int error;
1690 	struct mbuf *m;
1691 
1692 	if (sp == NULL)
1693 		return 0;
1694 
1695 	memset(&msg, 0, sizeof(msg));
1696 	memset(&xpl, 0, sizeof(xpl));
1697 
1698 	MGETHDR(m, M_WAIT, MT_DATA);
1699 
1700 	size = if_ipsec_set_sadb_x_policy(&xpl, NULL, 0, 0, sp->id, 0, NULL, NULL);
1701 	ext_msg_len += PFKEY_UNIT64(size);
1702 
1703 	if_ipsec_set_sadb_msg_del(&msg, ext_msg_len);
1704 
1705 	m->m_len = sizeof(msg);
1706 	m_copyback(m, 0, sizeof(msg), &msg);
1707 
1708 	if_ipsec_add_mbuf(m, &xpl, sizeof(xpl));
1709 
1710 	/*  unreference correspond to key_kpi_spdadd(). */
1711 	KEY_SP_UNREF(&sp);
1712 	error = key_kpi_spddelete2(m);
1713 	if (error != 0) {
1714 		log(LOG_ERR, "%s: cannot delete SP(ID=%u) (error=%d).\n",
1715 		    __func__, sp->id, error);
1716 	}
1717 	return error;
1718 }
1719 
1720 static void
1721 if_ipsec_del_sp(struct ipsec_variant *var)
1722 {
1723 
1724 	/* are the SPs shared? */
1725 	if (if_ipsec_unshare_sp(var))
1726 		return;
1727 
1728 	(void)if_ipsec_del_sp0(IV_SP_OUT(var));
1729 	(void)if_ipsec_del_sp0(IV_SP_IN(var));
1730 	(void)if_ipsec_del_sp0(IV_SP_OUT6(var));
1731 	(void)if_ipsec_del_sp0(IV_SP_IN6(var));
1732 	IV_SP_IN(var) = NULL;
1733 	IV_SP_IN6(var) = NULL;
1734 	IV_SP_OUT(var) = NULL;
1735 	IV_SP_OUT6(var) = NULL;
1736 }
1737 
1738 static int
1739 if_ipsec_replace_sp(struct ipsec_softc *sc, struct ipsec_variant *ovar,
1740     struct ipsec_variant *nvar)
1741 {
1742 	in_port_t src_port = 0;
1743 	in_port_t dst_port = 0;
1744 	struct sockaddr *src;
1745 	struct sockaddr *dst;
1746 	int error = 0;
1747 
1748 	KASSERT(mutex_owned(&sc->ipsec_lock));
1749 
1750 	if_ipsec_del_sp(ovar);
1751 
1752 	src = nvar->iv_psrc;
1753 	dst = nvar->iv_pdst;
1754 	if (if_ipsec_nat_t(sc)) {
1755 		/* NAT-T enabled */
1756 		src_port = nvar->iv_sport;
1757 		dst_port = nvar->iv_dport;
1758 	}
1759 	if (src && dst)
1760 		error = if_ipsec_add_sp(nvar, src, src_port, dst, dst_port);
1761 
1762 	return error;
1763 }
1764 
1765 /*
1766  * ipsec_variant and its SPs update API.
1767  *
1768  * Assumption:
1769  * reader side dereferences sc->ipsec_var in reader critical section only,
1770  * that is, all of reader sides do not reader the sc->ipsec_var after
1771  * pserialize_perform().
1772  */
1773 static int
1774 if_ipsec_update_variant(struct ipsec_softc *sc, struct ipsec_variant *nvar,
1775     struct ipsec_variant *nullvar)
1776 {
1777 	struct ifnet *ifp = &sc->ipsec_if;
1778 	struct ipsec_variant *ovar = sc->ipsec_var;
1779 	int error;
1780 
1781 	KASSERT(mutex_owned(&sc->ipsec_lock));
1782 
1783 	/*
1784 	 * To keep consistency between ipsec(4) I/F settings and SPs,
1785 	 * we stop packet processing while replacing SPs, that is, we set
1786 	 * "null" config variant to sc->ipsec_var.
1787 	 */
1788 	sc->ipsec_var = nullvar;
1789 	pserialize_perform(sc->ipsec_psz);
1790 	psref_target_destroy(&ovar->iv_psref, iv_psref_class);
1791 
1792 	error = if_ipsec_replace_sp(sc, ovar, nvar);
1793 	if (!error)
1794 		sc->ipsec_var = nvar;
1795 	else {
1796 		sc->ipsec_var = ovar; /* rollback */
1797 		psref_target_init(&ovar->iv_psref, iv_psref_class);
1798 	}
1799 
1800 	pserialize_perform(sc->ipsec_psz);
1801 	psref_target_destroy(&nullvar->iv_psref, iv_psref_class);
1802 
1803 	if (if_ipsec_variant_is_configured(sc->ipsec_var))
1804 		ifp->if_flags |= IFF_RUNNING;
1805 	else
1806 		ifp->if_flags &= ~IFF_RUNNING;
1807 
1808 	return error;
1809 }
1810