xref: /netbsd-src/sys/net/if_ipsec.c (revision 165a21a8716666c5138d91b8ea5edcaef219e38f)
1 /*	$NetBSD: if_ipsec.c,v 1.24 2019/09/19 06:07:24 knakahara Exp $  */
2 
3 /*
4  * Copyright (c) 2017 Internet Initiative Japan Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26  * POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __KERNEL_RCSID(0, "$NetBSD: if_ipsec.c,v 1.24 2019/09/19 06:07:24 knakahara Exp $");
31 
32 #ifdef _KERNEL_OPT
33 #include "opt_inet.h"
34 #endif
35 
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/kernel.h>
39 #include <sys/mbuf.h>
40 #include <sys/socket.h>
41 #include <sys/sockio.h>
42 #include <sys/errno.h>
43 #include <sys/ioctl.h>
44 #include <sys/time.h>
45 #include <sys/syslog.h>
46 #include <sys/cpu.h>
47 #include <sys/kmem.h>
48 #include <sys/mutex.h>
49 #include <sys/pserialize.h>
50 #include <sys/psref.h>
51 
52 #include <net/if.h>
53 #include <net/if_types.h>
54 #include <net/route.h>
55 #include <net/bpf.h>
56 #include <net/pfkeyv2.h>
57 
58 #include <netinet/in.h>
59 #include <netinet/in_systm.h>
60 #include <netinet/ip.h>
61 #ifdef	INET
62 #include <netinet/in_var.h>
63 #endif	/* INET */
64 
65 #ifdef INET6
66 #include <netinet6/in6_var.h>
67 #include <netinet/ip6.h>
68 #include <netinet6/ip6_var.h>
69 #endif /* INET6 */
70 
71 #include <netinet/ip_encap.h>
72 
73 #include <net/if_ipsec.h>
74 
75 #include <net/raw_cb.h>
76 #include <net/pfkeyv2.h>
77 
78 #include <netipsec/key.h>
79 #include <netipsec/keydb.h> /* for union sockaddr_union */
80 #include <netipsec/ipsec.h>
81 #include <netipsec/ipsecif.h>
82 
/*
 * Forward declarations of the file-local (static) functions.
 */

/* Interface cloner callbacks; both are referenced by ipsec_cloner. */
83 static int if_ipsec_clone_create(struct if_clone *, int);
84 static int if_ipsec_clone_destroy(struct ifnet *);
85 
/* Packet path helpers used by if_ipsec_output() and if_ipsec_input(). */
86 static inline int if_ipsec_out_direct(struct ipsec_variant *, struct mbuf *, int);
87 static inline void if_ipsec_in_enqueue(struct mbuf *, int, struct ifnet *);
88 
/* Tunnel (outer address pair) configuration and interface setup. */
89 static int if_ipsec_encap_attach(struct ipsec_variant *);
90 static int if_ipsec_encap_detach(struct ipsec_variant *);
91 static int if_ipsec_set_tunnel(struct ifnet *,
92     struct sockaddr *, struct sockaddr *);
93 static void if_ipsec_delete_tunnel(struct ifnet *);
94 static int if_ipsec_ensure_flags(struct ifnet *, u_short);
95 static void if_ipsec_attach0(struct ipsec_softc *);
96 
/* Swaps the variant of a softc; called as (softc, new variant, "null" variant). */
97 static int if_ipsec_update_variant(struct ipsec_softc *,
98     struct ipsec_variant *, struct ipsec_variant *);
99 
100 /* helpers working on PF_KEY sadb_msg data */
101 static inline void if_ipsec_add_mbuf(struct mbuf *, void *, size_t);
102 static inline void if_ipsec_add_pad(struct mbuf *, size_t);
103 static inline size_t if_ipsec_set_sadb_addr(struct sadb_address *,
104     struct sockaddr *, int, uint16_t);
105 static inline size_t if_ipsec_set_sadb_src(struct sadb_address *,
106     struct sockaddr *, int);
107 static inline size_t if_ipsec_set_sadb_dst(struct sadb_address *,
108     struct sockaddr *, int);
109 static inline size_t if_ipsec_set_sadb_x_policy(struct sadb_x_policy *,
110     struct sadb_x_ipsecrequest *, uint16_t, uint8_t, uint32_t, uint8_t,
111     struct sockaddr *, struct sockaddr *);
112 static inline void if_ipsec_set_sadb_msg(struct sadb_msg *, uint16_t, uint8_t);
113 static inline void if_ipsec_set_sadb_msg_add(struct sadb_msg *, uint16_t);
114 static inline void if_ipsec_set_sadb_msg_del(struct sadb_msg *, uint16_t);
115 /* SPD (security policy) handling */
116 static int if_ipsec_share_sp(struct ipsec_variant *);
117 static int if_ipsec_unshare_sp(struct ipsec_variant *);
118 static inline struct secpolicy *if_ipsec_add_sp0(struct sockaddr *,
119     in_port_t, struct sockaddr *, in_port_t, int, int, int, u_int);
120 static inline int if_ipsec_del_sp0(struct secpolicy *);
121 static int if_ipsec_add_sp(struct ipsec_variant *,
122     struct sockaddr *, in_port_t, struct sockaddr *, in_port_t);
123 static void if_ipsec_del_sp(struct ipsec_variant *);
124 static int if_ipsec_replace_sp(struct ipsec_softc *, struct ipsec_variant *,
125     struct ipsec_variant *);
126 
/*
 * if_ipsec_set_addr_port() is called below as (target, address, port) and
 * its result is treated as an error code by the ioctl handler.
 *
 * The two macros are shorthands that pass the outer source
 * (iv_psrc/iv_sport) or the outer destination (iv_pdst/iv_dport) of the
 * variant "var" together with the caller's "target" sockaddr buffer.
 */
127 static int if_ipsec_set_addr_port(struct sockaddr *, struct sockaddr *,
128     in_port_t);
129 #define IF_IPSEC_GATHER_PSRC_ADDR_PORT(var, target)			\
130 	if_ipsec_set_addr_port(target, (var)->iv_psrc, (var)->iv_sport)
131 #define IF_IPSEC_GATHER_PDST_ADDR_PORT(var, target)			\
132 	if_ipsec_set_addr_port(target, (var)->iv_pdst, (var)->iv_dport)
133 
134 /*
135  * ipsec(4) global variable definitions
136  */
137 
138 /*
 * List of all ipsec_softc instances, protected by "lock".
 * This list is used in ioctl context only.
 */
139 static struct {
140 	LIST_HEAD(ipsec_sclist, ipsec_softc) list;
141 	kmutex_t lock;
142 } ipsec_softcs __cacheline_aligned;
143 
/* psref class of every ipsec_variant; created in ipsecifattach(). */
144 struct psref_class *iv_psref_class __read_mostly;
145 
/* Cloner that creates and destroys "ipsecN" interfaces. */
146 struct if_clone ipsec_cloner =
147     IF_CLONE_INITIALIZER("ipsec", if_ipsec_clone_create, if_ipsec_clone_destroy);
/* Nesting limit handed to if_tunnel_check_nesting() on output. */
148 static int max_ipsec_nesting = MAX_IPSEC_NEST;
149 
150 /* ARGSUSED */
151 void
152 ipsecifattach(int count)
153 {
154 
155 	mutex_init(&ipsec_softcs.lock, MUTEX_DEFAULT, IPL_NONE);
156 	LIST_INIT(&ipsec_softcs.list);
157 
158 	iv_psref_class = psref_class_create("ipsecvar", IPL_SOFTNET);
159 
160 	if_clone_attach(&ipsec_cloner);
161 }
162 
163 static int
164 if_ipsec_clone_create(struct if_clone *ifc, int unit)
165 {
166 	struct ipsec_softc *sc;
167 	struct ipsec_variant *var;
168 
169 	sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
170 
171 	if_initname(&sc->ipsec_if, ifc->ifc_name, unit);
172 
173 	if_ipsec_attach0(sc);
174 
175 	var = kmem_zalloc(sizeof(*var), KM_SLEEP);
176 	var->iv_softc = sc;
177 	psref_target_init(&var->iv_psref, iv_psref_class);
178 
179 	sc->ipsec_var = var;
180 	mutex_init(&sc->ipsec_lock, MUTEX_DEFAULT, IPL_NONE);
181 	sc->ipsec_psz = pserialize_create();
182 	sc->ipsec_ro_percpu = if_tunnel_alloc_ro_percpu();
183 
184 	mutex_enter(&ipsec_softcs.lock);
185 	LIST_INSERT_HEAD(&ipsec_softcs.list, sc, ipsec_list);
186 	mutex_exit(&ipsec_softcs.lock);
187 	return 0;
188 }
189 
190 static void
191 if_ipsec_attach0(struct ipsec_softc *sc)
192 {
193 
194 	sc->ipsec_if.if_addrlen = 0;
195 	sc->ipsec_if.if_mtu    = IPSEC_MTU;
196 	sc->ipsec_if.if_flags  = IFF_POINTOPOINT | IFF_MULTICAST;
197 	/* set ipsec(4) specific default flags. */
198 	sc->ipsec_if.if_flags  |= IFF_FWD_IPV6;
199 	sc->ipsec_if.if_extflags = IFEF_NO_LINK_STATE_CHANGE | IFEF_MPSAFE;
200 	sc->ipsec_if.if_ioctl  = if_ipsec_ioctl;
201 	sc->ipsec_if.if_output = if_ipsec_output;
202 	sc->ipsec_if.if_type   = IFT_IPSEC;
203 	sc->ipsec_if.if_dlt    = DLT_NULL;
204 	sc->ipsec_if.if_softc  = sc;
205 	IFQ_SET_READY(&sc->ipsec_if.if_snd);
206 	if_initialize(&sc->ipsec_if);
207 	if_alloc_sadl(&sc->ipsec_if);
208 	bpf_attach(&sc->ipsec_if, DLT_NULL, sizeof(u_int));
209 	if_register(&sc->ipsec_if);
210 }
211 
212 static int
213 if_ipsec_clone_destroy(struct ifnet *ifp)
214 {
215 	struct ipsec_softc *sc = ifp->if_softc;
216 	struct ipsec_variant *var;
217 	int bound;
218 
219 	mutex_enter(&ipsec_softcs.lock);
220 	LIST_REMOVE(sc, ipsec_list);
221 	mutex_exit(&ipsec_softcs.lock);
222 
223 	bound = curlwp_bind();
224 	if_ipsec_delete_tunnel(&sc->ipsec_if);
225 	curlwp_bindx(bound);
226 
227 	bpf_detach(ifp);
228 	if_detach(ifp);
229 
230 	if_tunnel_free_ro_percpu(sc->ipsec_ro_percpu);
231 
232 	pserialize_destroy(sc->ipsec_psz);
233 	mutex_destroy(&sc->ipsec_lock);
234 
235 	var = sc->ipsec_var;
236 	kmem_free(var, sizeof(*var));
237 	kmem_free(sc, sizeof(*sc));
238 
239 	return 0;
240 }
241 
242 static inline bool
243 if_ipsec_nat_t(struct ipsec_softc *sc)
244 {
245 
246 	return (sc->ipsec_if.if_flags & IFF_NAT_T) != 0;
247 }
248 
249 static inline bool
250 if_ipsec_fwd_ipv6(struct ipsec_softc *sc)
251 {
252 
253 	return (sc->ipsec_if.if_flags & IFF_FWD_IPV6) != 0;
254 }
255 
256 int
257 if_ipsec_encap_func(struct mbuf *m, int off, int proto, void *arg)
258 {
259 	uint8_t v;
260 	struct ipsec_softc *sc;
261 	struct ipsec_variant *var = NULL;
262 	struct psref psref;
263 	int ret = 0;
264 
265 	sc = arg;
266 	KASSERT(sc != NULL);
267 
268 	if ((sc->ipsec_if.if_flags & IFF_UP) == 0)
269 		goto out;
270 
271 	var = if_ipsec_getref_variant(sc, &psref);
272 	if (if_ipsec_variant_is_unconfigured(var))
273 		goto out;
274 
275 	switch (proto) {
276 	case IPPROTO_IPV4:
277 	case IPPROTO_IPV6:
278 		break;
279 	default:
280 		goto out;
281 	}
282 
283 	m_copydata(m, 0, sizeof(v), &v);
284 	v = (v >> 4) & 0xff;  /* Get the IP version number. */
285 
286 	switch (v) {
287 #ifdef INET
288 	case IPVERSION: {
289 		struct ip ip;
290 
291 		if (m->m_pkthdr.len < sizeof(ip))
292 			goto out;
293 
294 		m_copydata(m, 0, sizeof(ip), &ip);
295 		if (var->iv_psrc->sa_family != AF_INET ||
296 		    var->iv_pdst->sa_family != AF_INET)
297 			goto out;
298 		ret = ipsecif4_encap_func(m, &ip, var);
299 		break;
300 	}
301 #endif
302 #ifdef INET6
303 	case (IPV6_VERSION >> 4): {
304 		struct ip6_hdr ip6;
305 
306 		if (m->m_pkthdr.len < sizeof(ip6))
307 			goto out;
308 
309 		m_copydata(m, 0, sizeof(ip6), &ip6);
310 		if (var->iv_psrc->sa_family != AF_INET6 ||
311 		    var->iv_pdst->sa_family != AF_INET6)
312 			goto out;
313 		ret = ipsecif6_encap_func(m, &ip6, var);
314 		break;
315 	}
316 #endif
317 	default:
318 		goto out;
319 	}
320 
321 out:
322 	if (var != NULL)
323 		if_ipsec_putref_variant(var, &psref);
324 	return ret;
325 }
326 
327 /*
328  * ipsec(4) I/F may cause infinite recursion calls when misconfigured.
329  * We'll prevent this by introducing upper limit.
330  */
331 static int
332 if_ipsec_check_nesting(struct ifnet *ifp, struct mbuf *m)
333 {
334 
335 	return if_tunnel_check_nesting(ifp, m, max_ipsec_nesting);
336 }
337 
338 int
339 if_ipsec_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
340     const struct rtentry *rt)
341 {
342 	struct ipsec_softc *sc = ifp->if_softc;
343 	struct ipsec_variant *var;
344 	struct psref psref;
345 	int error;
346 	int bound;
347 
348 	IFQ_CLASSIFY(&ifp->if_snd, m, dst->sa_family);
349 
350 	error = if_ipsec_check_nesting(ifp, m);
351 	if (error) {
352 		m_freem(m);
353 		goto noref_end;
354 	}
355 
356 	if ((ifp->if_flags & IFF_UP) == 0) {
357 		m_freem(m);
358 		error = ENETDOWN;
359 		goto noref_end;
360 	}
361 
362 
363 	bound = curlwp_bind();
364 	var = if_ipsec_getref_variant(sc, &psref);
365 	if (if_ipsec_variant_is_unconfigured(var)) {
366 		m_freem(m);
367 		error = ENETDOWN;
368 		goto end;
369 	}
370 
371 	m->m_flags &= ~(M_BCAST|M_MCAST);
372 
373 	/* use DLT_NULL encapsulation here to pass inner af type */
374 	M_PREPEND(m, sizeof(int), M_DONTWAIT);
375 	if (!m) {
376 		error = ENOBUFS;
377 		goto end;
378 	}
379 	*mtod(m, int *) = dst->sa_family;
380 
381 #if INET6
382 	/* drop IPv6 packet if IFF_FWD_IPV6 is not set */
383 	if (dst->sa_family == AF_INET6 &&
384 	    !if_ipsec_fwd_ipv6(sc)) {
385 		/*
386 		 * IPv6 packet is not allowed to forward,that is not error.
387 		 */
388 		error = 0;
389 		IF_DROP(&ifp->if_snd);
390 		m_freem(m);
391 		goto end;
392 	}
393 #endif
394 
395 	error = if_ipsec_out_direct(var, m, dst->sa_family);
396 
397 end:
398 	if_ipsec_putref_variant(var, &psref);
399 	curlwp_bindx(bound);
400 noref_end:
401 	if (error)
402 		ifp->if_oerrors++;
403 
404 	return error;
405 }
406 
407 static inline int
408 if_ipsec_out_direct(struct ipsec_variant *var, struct mbuf *m, int family)
409 {
410 	struct ifnet *ifp = &var->iv_softc->ipsec_if;
411 	int error;
412 	int len;
413 
414 	KASSERT(if_ipsec_heldref_variant(var));
415 	KASSERT(var->iv_output != NULL);
416 
417 	len = m->m_pkthdr.len;
418 
419 	/* input DLT_NULL frame to BPF */
420 	bpf_mtap(ifp, m, BPF_D_OUT);
421 
422 	/* grab and chop off inner af type */
423 	/* XXX need pullup? */
424 	m_adj(m, sizeof(int));
425 
426 	error = var->iv_output(var, family, m);
427 	if (error)
428 		return error;
429 
430 	ifp->if_opackets++;
431 	ifp->if_obytes += len;
432 
433 	return 0;
434 }
435 
436 void
437 if_ipsec_input(struct mbuf *m, int af, struct ifnet *ifp)
438 {
439 
440 	KASSERT(ifp != NULL);
441 
442 	m_set_rcvif(m, ifp);
443 
444 	bpf_mtap_af(ifp, af, m, BPF_D_IN);
445 
446 	if_ipsec_in_enqueue(m, af, ifp);
447 
448 	return;
449 }
450 
451 static inline void
452 if_ipsec_in_enqueue(struct mbuf *m, int af, struct ifnet *ifp)
453 {
454 	pktqueue_t *pktq;
455 	int pktlen;
456 
457 	/*
458 	 * Put the packet to the network layer input queue according to the
459 	 * specified address family.
460 	 */
461 	switch (af) {
462 #ifdef INET
463 	case AF_INET:
464 		pktq = ip_pktq;
465 		break;
466 #endif
467 #ifdef INET6
468 	case AF_INET6:
469 		pktq = ip6_pktq;
470 		break;
471 #endif
472 	default:
473 		ifp->if_ierrors++;
474 		m_freem(m);
475 		return;
476 	}
477 
478 #if 1
479 	const u_int h = curcpu()->ci_index;
480 #else
481 	const uint32_t h = pktq_rps_hash(m);
482 #endif
483 	pktlen = m->m_pkthdr.len;
484 	if (__predict_true(pktq_enqueue(pktq, m, h))) {
485 		ifp->if_ibytes += pktlen;
486 		ifp->if_ipackets++;
487 	} else {
488 		ifp->if_iqdrops++;
489 		m_freem(m);
490 	}
491 
492 	return;
493 }
494 
495 static inline int
496 if_ipsec_check_salen(struct sockaddr *addr)
497 {
498 
499 	switch (addr->sa_family) {
500 #ifdef INET
501 	case AF_INET:
502 		if (addr->sa_len != sizeof(struct sockaddr_in))
503 			return EINVAL;
504 		break;
505 #endif /* INET */
506 #ifdef INET6
507 	case AF_INET6:
508 		if (addr->sa_len != sizeof(struct sockaddr_in6))
509 			return EINVAL;
510 		break;
511 #endif /* INET6 */
512 	default:
513 		return EAFNOSUPPORT;
514 	}
515 
516 	return 0;
517 }
518 
/*
 * if_ioctl handler of ipsec(4) interfaces.
 *
 * ifp is the interface, cmd the ioctl request and data its argument, whose
 * concrete type depends on cmd (struct ifreq, struct ifaddr, in_aliasreq,
 * in6_aliasreq, in6_ifreq or if_laddrreq, as cast below).
 *
 * Handled here: SIOCINITIFADDR, SIOCSIFDSTADDR, SIOCADDMULTI/SIOCDELMULTI,
 * SIOCSIFMTU, and setting (SIOCSIFPHYADDR*, SIOCSLIFPHYADDR), deleting
 * (SIOCDIFPHYADDR) and reading (SIOCGIFPSRCADDR*, SIOCGIFPDSTADDR*,
 * SIOCGLIFPHYADDR) the outer tunnel addresses.  Every other request goes
 * to ifioctl_common(); if that succeeds, if_ipsec_ensure_flags() is called
 * with the interface flags sampled on entry.
 *
 * Returns 0 on success or an errno value.
 */
519 /* XXX how should we handle IPv6 scope on SIOC[GS]IFPHYADDR? */
520 int
521 if_ipsec_ioctl(struct ifnet *ifp, u_long cmd, void *data)
522 {
523 	struct ipsec_softc *sc  = ifp->if_softc;
524 	struct ipsec_variant *var = NULL;
525 	struct ifreq     *ifr = (struct ifreq*)data;
526 	struct ifaddr    *ifa = (struct ifaddr*)data;
527 	int error = 0, size;
528 	struct sockaddr *dst, *src;
529 	u_long mtu;
	/* flags before this request; compared later by if_ipsec_ensure_flags() */
530 	u_short oflags = ifp->if_flags;
531 	int bound;
532 	struct psref psref;
533 
534 	switch (cmd) {
535 	case SIOCINITIFADDR:
536 		ifp->if_flags |= IFF_UP;
537 		ifa->ifa_rtrequest = p2p_rtrequest;
538 		break;
539 
540 	case SIOCSIFDSTADDR:
541 		break;
542 
543 	case SIOCADDMULTI:
544 	case SIOCDELMULTI:
545 		switch (ifr->ifr_addr.sa_family) {
546 #ifdef INET
547 		case AF_INET:	/* IP supports Multicast */
548 			break;
549 #endif /* INET */
550 #ifdef INET6
551 		case AF_INET6:	/* IP6 supports Multicast */
552 			break;
553 #endif /* INET6 */
554 		default:  /* Other protocols don't support Multicast */
555 			error = EAFNOSUPPORT;
556 			break;
557 		}
558 		break;
559 
560 	case SIOCSIFMTU:
		/* accept only IPSEC_MTU_MIN..IPSEC_MTU_MAX; ENETRESET is not an error here */
561 		mtu = ifr->ifr_mtu;
562 		if (mtu < IPSEC_MTU_MIN || mtu > IPSEC_MTU_MAX)
563 			return EINVAL;
564 		else if ((error = ifioctl_common(ifp, cmd, data)) == ENETRESET)
565 			error = 0;
566 		break;
567 
568 #ifdef INET
569 	case SIOCSIFPHYADDR:
570 #endif
571 #ifdef INET6
572 	case SIOCSIFPHYADDR_IN6:
573 #endif /* INET6 */
574 	case SIOCSLIFPHYADDR:
		/* locate the src/dst pair inside the request structure of this cmd */
575 		switch (cmd) {
576 #ifdef INET
577 		case SIOCSIFPHYADDR:
578 			src = (struct sockaddr *)
579 				&(((struct in_aliasreq *)data)->ifra_addr);
580 			dst = (struct sockaddr *)
581 				&(((struct in_aliasreq *)data)->ifra_dstaddr);
582 			break;
583 #endif /* INET */
584 #ifdef INET6
585 		case SIOCSIFPHYADDR_IN6:
586 			src = (struct sockaddr *)
587 				&(((struct in6_aliasreq *)data)->ifra_addr);
588 			dst = (struct sockaddr *)
589 				&(((struct in6_aliasreq *)data)->ifra_dstaddr);
590 			break;
591 #endif /* INET6 */
592 		case SIOCSLIFPHYADDR:
593 			src = (struct sockaddr *)
594 				&(((struct if_laddrreq *)data)->addr);
595 			dst = (struct sockaddr *)
596 				&(((struct if_laddrreq *)data)->dstaddr);
597 			break;
598 		default:
599 			return EINVAL;
600 		}
601 
602 		/* sa_family must be equal */
603 		if (src->sa_family != dst->sa_family)
604 			return EINVAL;
605 
606 		error = if_ipsec_check_salen(src);
607 		if (error)
608 			return error;
609 		error = if_ipsec_check_salen(dst);
610 		if (error)
611 			return error;
612 
613 		/* check sa_family looks sane for the cmd */
614 		switch (cmd) {
615 #ifdef INET
616 		case SIOCSIFPHYADDR:
617 			if (src->sa_family == AF_INET)
618 				break;
619 			return EAFNOSUPPORT;
620 #endif /* INET */
621 #ifdef INET6
622 		case SIOCSIFPHYADDR_IN6:
623 			if (src->sa_family == AF_INET6)
624 				break;
625 			return EAFNOSUPPORT;
626 #endif /* INET6 */
627 		case SIOCSLIFPHYADDR:
628 			/* already checked above */
629 			break;
630 		}
631 		/*
632 		 * if_ipsec_set_tunnel() calls if_ipsec_getref_variant() for
633 		 * other softcs to check for address pair duplication.
634 		 */
635 		bound = curlwp_bind();
636 		error = if_ipsec_set_tunnel(&sc->ipsec_if, src, dst);
637 		if (error)
638 			goto bad;
639 		curlwp_bindx(bound);
640 		break;
641 
642 	case SIOCDIFPHYADDR:
643 		bound = curlwp_bind();
644 		if_ipsec_delete_tunnel(&sc->ipsec_if);
645 		curlwp_bindx(bound);
646 		break;
647 
648 	case SIOCGIFPSRCADDR:
649 #ifdef INET6
650 	case SIOCGIFPSRCADDR_IN6:
651 #endif /* INET6 */
		/* report the outer source address; "dst" is the caller's buffer here */
652 		bound = curlwp_bind();
653 		var = if_ipsec_getref_variant(sc, &psref);
654 		if (var->iv_psrc == NULL) {
655 			error = EADDRNOTAVAIL;
656 			goto bad;
657 		}
658 		src = var->iv_psrc;
659 		switch (cmd) {
660 #ifdef INET
661 		case SIOCGIFPSRCADDR:
662 			dst = &ifr->ifr_addr;
663 			size = sizeof(ifr->ifr_addr);
664 			break;
665 #endif /* INET */
666 #ifdef INET6
667 		case SIOCGIFPSRCADDR_IN6:
668 			dst = (struct sockaddr *)
669 				&(((struct in6_ifreq *)data)->ifr_addr);
670 			size = sizeof(((struct in6_ifreq *)data)->ifr_addr);
671 			break;
672 #endif /* INET6 */
673 		default:
674 			error = EADDRNOTAVAIL;
675 			goto bad;
676 		}
		/* the stored address must fit into the caller's buffer */
677 		if (src->sa_len > size) {
678 			error = EINVAL;
679 			goto bad;
680 		}
681 		error = IF_IPSEC_GATHER_PSRC_ADDR_PORT(var, dst);
682 		if (error)
683 			goto bad;
684 		if_ipsec_putref_variant(var, &psref);
685 		curlwp_bindx(bound);
686 		break;
687 
688 	case SIOCGIFPDSTADDR:
689 #ifdef INET6
690 	case SIOCGIFPDSTADDR_IN6:
691 #endif /* INET6 */
		/* report the outer destination address; "dst" is the caller's buffer here */
692 		bound = curlwp_bind();
693 		var = if_ipsec_getref_variant(sc, &psref);
694 		if (var->iv_pdst == NULL) {
695 			error = EADDRNOTAVAIL;
696 			goto bad;
697 		}
698 		src = var->iv_pdst;
699 		switch (cmd) {
700 #ifdef INET
701 		case SIOCGIFPDSTADDR:
702 			dst = &ifr->ifr_addr;
703 			size = sizeof(ifr->ifr_addr);
704 			break;
705 #endif /* INET */
706 #ifdef INET6
707 		case SIOCGIFPDSTADDR_IN6:
708 			dst = (struct sockaddr *)
709 				&(((struct in6_ifreq *)data)->ifr_addr);
710 			size = sizeof(((struct in6_ifreq *)data)->ifr_addr);
711 			break;
712 #endif /* INET6 */
713 		default:
714 			error = EADDRNOTAVAIL;
715 			goto bad;
716 		}
717 		if (src->sa_len > size) {
718 			error = EINVAL;
719 			goto bad;
720 		}
721 		error = IF_IPSEC_GATHER_PDST_ADDR_PORT(var, dst);
722 		if (error)
723 			goto bad;
724 		if_ipsec_putref_variant(var, &psref);
725 		curlwp_bindx(bound);
726 		break;
727 
728 	case SIOCGLIFPHYADDR:
		/* report both outer addresses through a struct if_laddrreq */
729 		bound = curlwp_bind();
730 		var = if_ipsec_getref_variant(sc, &psref);
731 		if (if_ipsec_variant_is_unconfigured(var)) {
732 			error = EADDRNOTAVAIL;
733 			goto bad;
734 		}
735 
736 		/* copy src */
737 		src = var->iv_psrc;
738 		dst = (struct sockaddr *)
739 			&(((struct if_laddrreq *)data)->addr);
740 		size = sizeof(((struct if_laddrreq *)data)->addr);
741 		if (src->sa_len > size) {
742 			error = EINVAL;
743 			goto bad;
744 		}
745 		error = IF_IPSEC_GATHER_PSRC_ADDR_PORT(var, dst);
746 		if (error)
747 			goto bad;
748 
749 		/* copy dst */
750 		src = var->iv_pdst;
751 		dst = (struct sockaddr *)
752 			&(((struct if_laddrreq *)data)->dstaddr);
753 		size = sizeof(((struct if_laddrreq *)data)->dstaddr);
754 		if (src->sa_len > size) {
755 			error = EINVAL;
756 			goto bad;
757 		}
758 		error = IF_IPSEC_GATHER_PDST_ADDR_PORT(var, dst);
759 		if (error)
760 			goto bad;
761 		if_ipsec_putref_variant(var, &psref);
762 		curlwp_bindx(bound);
763 		break;
764 
765 	default:
		/* generic handling first, then react to changed interface flags */
766 		error = ifioctl_common(ifp, cmd, data);
767 		if (!error) {
768 			bound = curlwp_bind();
769 			error = if_ipsec_ensure_flags(&sc->ipsec_if, oflags);
770 			if (error)
771 				goto bad;
772 			curlwp_bindx(bound);
773 		}
774 		break;
775 	}
776 	return error;
777 
	/*
	 * Common error exit, only reached after curlwp_bind(): release the
	 * variant reference if one is held and undo the binding.
	 */
778 bad:
779 	if (var != NULL)
780 		if_ipsec_putref_variant(var, &psref);
781 	curlwp_bindx(bound);
782 
783 	return error;
784 }
785 
/*
 * Pair of per-address-family handlers operating on a variant.  A member
 * exists only if the corresponding protocol is compiled in.
 */
struct encap_funcs {
#ifdef INET
	int (*ef_inet)(struct ipsec_variant *);		/* AF_INET handler */
#endif
#ifdef INET6
	int (*ef_inet6)(struct ipsec_variant *);	/* AF_INET6 handler */
#endif
};

/* Handlers used by if_ipsec_encap_attach(). */
static struct encap_funcs ipsec_encap_attach = {
#ifdef INET
	.ef_inet = ipsecif4_attach,
#endif
#ifdef INET6
	.ef_inet6 = ipsecif6_attach,
#endif
};

/* Handlers used by if_ipsec_encap_detach(). */
static struct encap_funcs ipsec_encap_detach = {
#ifdef INET
	.ef_inet = ipsecif4_detach,
#endif
#ifdef INET6
	.ef_inet6 = ipsecif6_detach,
#endif
};
812 
813 static int
814 if_ipsec_encap_common(struct ipsec_variant *var, struct encap_funcs *funcs)
815 {
816 	int error;
817 
818 	KASSERT(var != NULL);
819 	KASSERT(if_ipsec_variant_is_configured(var));
820 
821 	switch (var->iv_psrc->sa_family) {
822 #ifdef INET
823 	case AF_INET:
824 		error = (funcs->ef_inet)(var);
825 		break;
826 #endif /* INET */
827 #ifdef INET6
828 	case AF_INET6:
829 		error = (funcs->ef_inet6)(var);
830 		break;
831 #endif /* INET6 */
832 	default:
833 		error = EINVAL;
834 		break;
835 	}
836 
837 	return error;
838 }
839 
840 static int
841 if_ipsec_encap_attach(struct ipsec_variant *var)
842 {
843 
844 	return if_ipsec_encap_common(var, &ipsec_encap_attach);
845 }
846 
847 static int
848 if_ipsec_encap_detach(struct ipsec_variant *var)
849 {
850 
851 	return if_ipsec_encap_common(var, &ipsec_encap_detach);
852 }
853 
/*
 * Configure the outer address pair (src, dst) of the ipsec(4) I/F ifp.
 *
 * src and dst are only read; private copies with the port set to 0 are
 * stored in the new variant, and the ports are kept separately in
 * iv_sport/iv_dport.
 *
 * Returns 0 on success and also when the requested configuration equals
 * the current one.  Otherwise returns the error of encap_lock_enter(),
 * EINVAL for an address family other than AF_INET/AF_INET6, EADDRNOTAVAIL
 * if another ipsec(4) I/F already uses the same pair, or the error of
 * if_ipsec_encap_attach() or if_ipsec_update_variant().  On every failure
 * the copies and variants allocated here are freed again.
 */
854 /*
855  * Validate and set ipsec(4) I/F configurations.
856  *     (1) validate
857  *         (1-1) Check whether the argument src and dst address pair changes
858  *               the configuration from the current src and dst address pair.
859  *         (1-2) Check whether any ipsec(4) I/F uses the same src and dst
860  *               address pair as the argument pair, except for NAT-T shared
861  *               tunnels.
862  *     (2) set
863  *         (2-1) Create a variant for the new configuration.
864  *         (2-2) Create a temporary "null" variant used to avoid accessing a
865  *               dangling variant while SPs are deleted and added.
866  *         (2-3) Swap the variant, including its SPs.
867  *         (2-4) Clean up the last configuration.
868  */
869 static int
870 if_ipsec_set_tunnel(struct ifnet *ifp,
871     struct sockaddr *src, struct sockaddr *dst)
872 {
873 	struct ipsec_softc *sc = ifp->if_softc;
874 	struct ipsec_softc *sc2;
875 	struct ipsec_variant *ovar, *nvar, *nullvar;
876 	struct sockaddr *osrc, *odst;
877 	struct sockaddr *nsrc, *ndst;
878 	in_port_t nsport = 0, ndport = 0;
879 	int error;
880 
881 	error = encap_lock_enter();
882 	if (error)
883 		return error;
884 
	/* allocate everything that may be needed before taking sc->ipsec_lock */
885 	nsrc = sockaddr_dup(src, M_WAITOK);
886 	ndst = sockaddr_dup(dst, M_WAITOK);
887 	nvar = kmem_zalloc(sizeof(*nvar), KM_SLEEP);
888 	nullvar = kmem_zalloc(sizeof(*nullvar), KM_SLEEP);
889 
890 	mutex_enter(&sc->ipsec_lock);
891 
892 	ovar = sc->ipsec_var;
893 
	/* remember the requested ports and clear them in the private copies */
894 	switch(nsrc->sa_family) {
895 #ifdef INET
896 	case AF_INET:
897 		nsport = satosin(src)->sin_port;
898 		/*
899 		 * avoid confusing the SP when NAT-T is disabled,
900 		 * e.g.
901 		 *     expected: 10.0.1.2[any] 10.0.1.1[any] 4(ipv4)
902 		 *     confuse : 10.0.1.2[600] 10.0.1.1[600] 4(ipv4)
903 		 */
904 		satosin(nsrc)->sin_port = 0;
905 		ndport = satosin(dst)->sin_port;
906 		satosin(ndst)->sin_port = 0;
907 		break;
908 #endif /* INET */
909 #ifdef INET6
910 	case AF_INET6:
911 		nsport = satosin6(src)->sin6_port;
912 		satosin6(nsrc)->sin6_port = 0;
913 		ndport = satosin6(dst)->sin6_port;
914 		satosin6(ndst)->sin6_port = 0;
915 		break;
916 #endif /* INET6 */
917 	default:
918 		log(LOG_DEBUG,
919 		    "%s: Invalid address family: %d.\n",
920 		    __func__, src->sa_family);
921 		error = EINVAL;
922 		goto out;
923 	}
924 
925 	/*
926 	 * (1-1) Check whether the argument src and dst address pair changes
927 	 *       the configuration from the current src and dst address pair.
928 	 */
929 	if ((ovar->iv_pdst && sockaddr_cmp(ovar->iv_pdst, dst) == 0) &&
930 	    (ovar->iv_psrc && sockaddr_cmp(ovar->iv_psrc, src) == 0) &&
931 	    (ovar->iv_sport == nsport && ovar->iv_dport == ndport)) {
932 		/* address and port pair not changed. */
933 		error = 0;
934 		goto out;
935 	}
936 
937 	/*
938 	 * (1-2) Check whether any ipsec(4) I/F uses the same src and dst
939 	 *       address pair as the argument pair, except for NAT-T shared
940 	 *       tunnels.
941 	 */
942 	mutex_enter(&ipsec_softcs.lock);
943 	LIST_FOREACH(sc2, &ipsec_softcs.list, ipsec_list) {
944 		struct ipsec_variant *var2;
945 		struct psref psref;
946 
947 		if (sc2 == sc)
948 			continue;
949 		var2 = if_ipsec_getref_variant(sc2, &psref);
950 		if (if_ipsec_variant_is_unconfigured(var2)) {
951 			if_ipsec_putref_variant(var2, &psref);
952 			continue;
953 		}
954 		if (if_ipsec_nat_t(sc) || if_ipsec_nat_t(sc2)) {
955 			if_ipsec_putref_variant(var2, &psref);
956 			continue; /* NAT-T shared tunnel */
957 		}
958 		if (sockaddr_cmp(var2->iv_pdst, dst) == 0 &&
959 		    sockaddr_cmp(var2->iv_psrc, src) == 0) {
960 			if_ipsec_putref_variant(var2, &psref);
961 			mutex_exit(&ipsec_softcs.lock);
962 			error = EADDRNOTAVAIL;
963 			goto out;
964 		}
965 
966 		if_ipsec_putref_variant(var2, &psref);
967 		/* XXX both end must be valid? (I mean, not 0.0.0.0) */
968 	}
969 	mutex_exit(&ipsec_softcs.lock);
970 
971 
	/* keep the old addresses so that they can be freed in (2-4) */
972 	osrc = ovar->iv_psrc;
973 	odst = ovar->iv_pdst;
974 
975 	/*
976 	 * (2-1) Create ipsec_variant for new configuration.
977 	 */
978 	if_ipsec_copy_variant(nvar, ovar);
979 	nvar->iv_psrc = nsrc;
980 	nvar->iv_pdst = ndst;
981 	nvar->iv_sport = nsport;
982 	nvar->iv_dport = ndport;
983 	nvar->iv_encap_cookie4 = NULL;
984 	nvar->iv_encap_cookie6 = NULL;
985 	psref_target_init(&nvar->iv_psref, iv_psref_class);
986 	error = if_ipsec_encap_attach(nvar);
987 	if (error)
988 		goto out;
989 
990 	/*
991 	 * (2-2) Create temporary "null" variant.
992 	 */
993 	if_ipsec_copy_variant(nullvar, ovar);
994 	if_ipsec_clear_config(nullvar);
995 	psref_target_init(&nullvar->iv_psref, iv_psref_class);
996 	membar_producer();
997 	/*
998 	 * (2-3) Swap the variant, including its SPs.
999 	 */
1000 	error = if_ipsec_update_variant(sc, nvar, nullvar);
1001 	if (error) {
		/* undo the attach done in (2-1) before freeing nvar */
1002 		if_ipsec_encap_detach(nvar);
1003 		goto out;
1004 	}
1005 
1006 	mutex_exit(&sc->ipsec_lock);
1007 
1008 	/*
1009 	 * (2-4) Clean up the last configuration.
1010 	 */
1011 	if (if_ipsec_variant_is_configured(ovar))
1012 		if_ipsec_encap_detach(ovar);
1013 	encap_lock_exit();
1014 
1015 	if (osrc != NULL)
1016 		sockaddr_free(osrc);
1017 	if (odst != NULL)
1018 		sockaddr_free(odst);
1019 	kmem_free(ovar, sizeof(*ovar));
1020 	kmem_free(nullvar, sizeof(*nullvar));
1021 
1022 	return 0;
1023 
	/* failure or "nothing to do": release the locks and everything allocated above */
1024 out:
1025 	mutex_exit(&sc->ipsec_lock);
1026 	encap_lock_exit();
1027 
1028 	sockaddr_free(nsrc);
1029 	sockaddr_free(ndst);
1030 	kmem_free(nvar, sizeof(*nvar));
1031 	kmem_free(nullvar, sizeof(*nullvar));
1032 
1033 	return error;
1034 }
1035 
/*
 * Remove the outer address pair of the ipsec(4) I/F ifp.
 *
 * Nothing is changed if encap_lock_enter() fails or if the interface has
 * no source or no destination address.  Otherwise the current variant is
 * replaced by one with cleared configuration, and the old variant and its
 * two addresses are freed.  The function has no return value.
 */
1036 /*
1037  * Validate and delete ipsec(4) I/F configurations.
1038  *     (1) validate
1039  *         (1-1) Check whether the current src and dst address pair is null,
1040  *               which means deletetunnel was already done for the I/F.
1041  *     (2) delete
1042  *         (2-1) Create a variant for the deleted status.
1043  *         (2-2) Create a temporary "null" variant used to avoid accessing a
1044  *               dangling variant while SPs are deleted and added.
1045  *               NOTE:
1046  *               The contents of the temporary "null" variant equal those of
1047  *               the variant of (2-1); however, two psref_target_destroy()
1048  *               synchronization points are necessary to avoid accessing a
1049  *               dangling variant while SPs are deleted and added. To keep
1050  *               that simple, we do it the same way as if_ipsec_set_tunnel(),
1051  *               that is, create an extra "null" variant and use it temporarily.
1052  *         (2-3) Swap the variant, including its SPs.
1053  *         (2-4) Clean up the last configuration.
1054  */
1055 static void
1056 if_ipsec_delete_tunnel(struct ifnet *ifp)
1057 {
1058 	struct ipsec_softc *sc = ifp->if_softc;
1059 	struct ipsec_variant *ovar, *nvar, *nullvar;
1060 	struct sockaddr *osrc, *odst;
1061 	int error;
1062 
1063 	error = encap_lock_enter();
1064 	if (error)
1065 		return;
1066 
	/* allocate both variants before taking sc->ipsec_lock */
1067 	nvar = kmem_zalloc(sizeof(*nvar), KM_SLEEP);
1068 	nullvar = kmem_zalloc(sizeof(*nullvar), KM_SLEEP);
1069 
1070 	mutex_enter(&sc->ipsec_lock);
1071 
1072 	ovar = sc->ipsec_var;
1073 	osrc = ovar->iv_psrc;
1074 	odst = ovar->iv_pdst;
1075 	/*
1076 	 * (1-1) Check whether the current src and dst address pair is null,
1077 	 *       which means deletetunnel was already done for the I/F.
1078 	 */
1079 	if (osrc == NULL || odst == NULL) {
1080 		/* address pair not changed. */
1081 		mutex_exit(&sc->ipsec_lock);
1082 		encap_lock_exit();
1083 		kmem_free(nvar, sizeof(*nvar));
1084 		kmem_free(nullvar, sizeof(*nullvar));
1085 		return;
1086 	}
1087 
1088 	/*
1089 	 * (2-1) Create variant for deleted status.
1090 	 */
1091 	if_ipsec_copy_variant(nvar, ovar);
1092 	if_ipsec_clear_config(nvar);
1093 	psref_target_init(&nvar->iv_psref, iv_psref_class);
1094 
1095 	/*
1096 	 * (2-2) Create a temporary "null" variant used to avoid accessing a
1097 	 *       dangling variant while SPs are deleted and added.
1098 	 */
1099 	if_ipsec_copy_variant(nullvar, ovar);
1100 	if_ipsec_clear_config(nullvar);
1101 	psref_target_init(&nullvar->iv_psref, iv_psref_class);
1102 	membar_producer();
1103 	/*
1104 	 * (2-3) Swap the variant, including its SPs.
1105 	 */
1106 	/* if_ipsec_update_variant() does not fail when it only deletes SPs. */
1107 	(void)if_ipsec_update_variant(sc, nvar, nullvar);
1108 
1109 	mutex_exit(&sc->ipsec_lock);
1110 
1111 	/*
1112 	 * (2-4) Clean up the last configuration.
1113 	 */
1114 	if (if_ipsec_variant_is_configured(ovar))
1115 		if_ipsec_encap_detach(ovar);
1116 	encap_lock_exit();
1117 
	/* osrc and odst are known to be non-NULL here, see (1-1) */
1118 	sockaddr_free(osrc);
1119 	sockaddr_free(odst);
1120 	kmem_free(ovar, sizeof(*ovar));
1121 	kmem_free(nullvar, sizeof(*nullvar));
1122 }
1123 
1124 /*
1125  * Check the IFF_NAT_T and IFF_FWD_IPV6 flags, and update the SPs if needed.
1126  *     (1) check
1127  *         (1-1) Check whether the flags have changed.
1128  *         (1-2) Check the current src and dst address pair. If they are null,
1129  *               the ipsec(4) I/F has been deletetunnel'ed, so no update is
1130  *               needed.
1131  *     (2) update
1132  *         (2-1) Create a variant for the new SPs.
1133  *         (2-2) Create a temporary "null" variant used to avoid accessing a
1134  *               dangling variant while SPs are deleted and added.
1135  *               NOTE:
1136  *               This is the same problem as in if_ipsec_delete_tunnel().
1137  *         (2-3) Swap the variant, including its SPs.
1138  *         (2-4) Clean up unused configurations.
1139  *               NOTE: the same encap_cookies are reused.
1140  */
1141 static int
1142 if_ipsec_ensure_flags(struct ifnet *ifp, u_short oflags)
1143 {
1144 	struct ipsec_softc *sc = ifp->if_softc;
1145 	struct ipsec_variant *ovar, *nvar, *nullvar;
1146 	int error;
1147 
1148 	/*
1149 	 * (1) Check flags are changed.
1150 	 */
1151 	if ((oflags & (IFF_NAT_T|IFF_FWD_IPV6)) ==
1152 	    (ifp->if_flags & (IFF_NAT_T|IFF_FWD_IPV6)))
1153 		return 0; /* flags not changed. */
1154 
1155 	error = encap_lock_enter();
1156 	if (error)
1157 		return error;
1158 
1159 	nvar = kmem_zalloc(sizeof(*nvar), KM_SLEEP);
1160 	nullvar = kmem_zalloc(sizeof(*nullvar), KM_SLEEP);
1161 
1162 	mutex_enter(&sc->ipsec_lock);
1163 
1164 	ovar = sc->ipsec_var;
1165 	/*
1166 	 * (1-2) Check current src and dst address pair.
1167 	 */
1168 	if (if_ipsec_variant_is_unconfigured(ovar)) {
1169 		/* nothing to do */
1170 		mutex_exit(&sc->ipsec_lock);
1171 		encap_lock_exit();
1172 		kmem_free(nvar, sizeof(*nvar));
1173 		kmem_free(nullvar, sizeof(*nullvar));
1174 		return 0;
1175 	}
1176 
1177 	/*
1178 	 * (2-1) Create variant for new SPs.
1179 	 */
1180 	if_ipsec_copy_variant(nvar, ovar);
1181 	psref_target_init(&nvar->iv_psref, iv_psref_class);
1182 	/*
1183 	 * (2-2) Create temporary "null" variant used to avoid to access
1184 	 *       dangling variant while SPs are deleted and added.
1185 	 */
1186 	if_ipsec_copy_variant(nullvar, ovar);
1187 	if_ipsec_clear_config(nullvar);
1188 	psref_target_init(&nullvar->iv_psref, iv_psref_class);
1189 	membar_producer();
1190 	/*
1191 	 * (2-3) Swap variant include its SPs.
1192 	 */
1193 	error = if_ipsec_update_variant(sc, nvar, nullvar);
1194 
1195 	mutex_exit(&sc->ipsec_lock);
1196 	encap_lock_exit();
1197 
1198 	/*
1199 	 * (2-4) Cleanup unused configurations.
1200 	 */
1201 	if (!error)
1202 		kmem_free(ovar, sizeof(*ovar));
1203 	else
1204 		kmem_free(nvar, sizeof(*ovar));
1205 	kmem_free(nullvar, sizeof(*nullvar));
1206 
1207 	return error;
1208 }
1209 
1210 /*
1211  * SPD management
1212  */
1213 
1214 /*
1215  * Share SP set with other NAT-T ipsec(4) I/F(s).
1216  *     Return 1, when "var" shares SP set.
1217  *     Return 0, when "var" cannot share SP set.
1218  *
1219  * NOTE:
1220  * if_ipsec_share_sp() and if_ipsec_unshare_sp() would require global lock
1221  * to exclude other ipsec(4) I/Fs set_tunnel/delete_tunnel. E.g. when ipsec0
1222  * and ipsec1 can share SP set, running ipsec0's set_tunnel and ipsec1's
1223  * set_tunnel causes race.
1224  * Currently, (fortunately) encap_lock works as this global lock.
1225  */
1226 static int
1227 if_ipsec_share_sp(struct ipsec_variant *var)
1228 {
1229 	struct ipsec_softc *sc = var->iv_softc;
1230 	struct ipsec_softc *sc2;
1231 	struct ipsec_variant *var2;
1232 	struct psref psref;
1233 
1234 	KASSERT(encap_lock_held());
1235 	KASSERT(var->iv_psrc != NULL && var->iv_pdst != NULL);
1236 
1237 	mutex_enter(&ipsec_softcs.lock);
1238 	LIST_FOREACH(sc2, &ipsec_softcs.list, ipsec_list) {
1239 		if (sc2 == sc)
1240 			continue;
1241 		var2 = if_ipsec_getref_variant(sc2, &psref);
1242 		if (if_ipsec_variant_is_unconfigured(var2)) {
1243 			if_ipsec_putref_variant(var2, &psref);
1244 			continue;
1245 		}
1246 		if (sockaddr_cmp(var2->iv_pdst, var->iv_pdst) != 0 ||
1247 		    sockaddr_cmp(var2->iv_psrc, var->iv_psrc) != 0) {
1248 			if_ipsec_putref_variant(var2, &psref);
1249 			continue;
1250 		}
1251 
1252 		break;
1253 	}
1254 	mutex_exit(&ipsec_softcs.lock);
1255 	if (sc2 == NULL)
1256 		return 0; /* not shared */
1257 
1258 	IV_SP_IN(var) = IV_SP_IN(var2);
1259 	IV_SP_IN6(var) = IV_SP_IN6(var2);
1260 	IV_SP_OUT(var) = IV_SP_OUT(var2);
1261 	IV_SP_OUT6(var) = IV_SP_OUT6(var2);
1262 
1263 	if_ipsec_putref_variant(var2, &psref);
1264 	return 1; /* shared */
1265 }
1266 
1267 /*
1268  * Unshare SP set with other NAT-T ipsec(4) I/F(s).
1269  *     Return 1, when "var" shared SP set, and then unshare them.
1270  *     Return 0, when "var" did not share SP set.
1271  *
1272  * NOTE:
1273  * See if_ipsec_share_sp()'s note.
1274  */
1275 static int
1276 if_ipsec_unshare_sp(struct ipsec_variant *var)
1277 {
1278 	struct ipsec_softc *sc = var->iv_softc;
1279 	struct ipsec_softc *sc2;
1280 	struct ipsec_variant *var2;
1281 	struct psref psref;
1282 
1283 	KASSERT(encap_lock_held());
1284 
1285 	if (!var->iv_pdst || !var->iv_psrc)
1286 		return 0;
1287 
1288 	mutex_enter(&ipsec_softcs.lock);
1289 	LIST_FOREACH(sc2, &ipsec_softcs.list, ipsec_list) {
1290 		if (sc2 == sc)
1291 			continue;
1292 		var2 = if_ipsec_getref_variant(sc2, &psref);
1293 		if (!var2->iv_pdst || !var2->iv_psrc) {
1294 			if_ipsec_putref_variant(var2, &psref);
1295 			continue;
1296 		}
1297 		if (sockaddr_cmp(var2->iv_pdst, var->iv_pdst) != 0 ||
1298 		    sockaddr_cmp(var2->iv_psrc, var->iv_psrc) != 0) {
1299 			if_ipsec_putref_variant(var2, &psref);
1300 			continue;
1301 		}
1302 
1303 		break;
1304 	}
1305 	mutex_exit(&ipsec_softcs.lock);
1306 	if (sc2 == NULL)
1307 		return 0; /* not shared */
1308 
1309 	IV_SP_IN(var) = NULL;
1310 	IV_SP_IN6(var) = NULL;
1311 	IV_SP_OUT(var) = NULL;
1312 	IV_SP_OUT6(var) = NULL;
1313 	if_ipsec_putref_variant(var2, &psref);
1314 	return 1; /* shared */
1315 }
1316 
1317 static inline void
1318 if_ipsec_add_mbuf_optalign(struct mbuf *m0, void *data, size_t len, bool align)
1319 {
1320 	struct mbuf *m;
1321 
1322 	MGET(m, M_WAIT, MT_DATA);
1323 	if (align) {
1324 		m->m_len = PFKEY_ALIGN8(len);
1325 		memset(mtod(m, void *), 0, m->m_len);
1326 	} else
1327 		m->m_len = len;
1328 	m_copyback(m, 0, len, data);
1329 	m_cat(m0, m);
1330 }
1331 
/*
 * Append "len" bytes of "data" to the mbuf chain "m0", with the 8-byte
 * alignment option of if_ipsec_add_mbuf_optalign() turned on.
 */
static inline void
if_ipsec_add_mbuf(struct mbuf *m0, void *data, size_t len)
{
	const bool align = true;

	if_ipsec_add_mbuf_optalign(m0, data, len, align);
}
1338 
1339 static inline void
1340 if_ipsec_add_mbuf_addr_port(struct mbuf *m0, struct sockaddr *addr, in_port_t port, bool align)
1341 {
1342 
1343 	if (port == 0) {
1344 		if_ipsec_add_mbuf_optalign(m0, addr, addr->sa_len, align);
1345 	} else {
1346 		union sockaddr_union addrport_u;
1347 		struct sockaddr *addrport = &addrport_u.sa;
1348 
1349 		if_ipsec_set_addr_port(addrport, addr, port);
1350 		if_ipsec_add_mbuf_optalign(m0, addrport, addrport->sa_len, align);
1351 	}
1352 }
1353 
1354 static inline void
1355 if_ipsec_add_pad(struct mbuf *m0, size_t len)
1356 {
1357 	struct mbuf *m;
1358 
1359 	if (len == 0)
1360 		return;
1361 
1362 	MGET(m, M_WAIT, MT_DATA);
1363 	m->m_len = len;
1364 	memset(mtod(m, void *), 0, m->m_len);
1365 	m_cat(m0, m);
1366 }
1367 
1368 static inline size_t
1369 if_ipsec_set_sadb_addr(struct sadb_address *saaddr, struct sockaddr *addr,
1370     int proto, uint16_t exttype)
1371 {
1372 	size_t size;
1373 
1374 	KASSERT(saaddr != NULL);
1375 	KASSERT(addr != NULL);
1376 
1377 	size = sizeof(*saaddr) + PFKEY_ALIGN8(addr->sa_len);
1378 	saaddr->sadb_address_len = PFKEY_UNIT64(size);
1379 	saaddr->sadb_address_exttype = exttype;
1380 	saaddr->sadb_address_proto = proto;
1381 	switch (addr->sa_family) {
1382 #ifdef INET
1383 	case AF_INET:
1384 		saaddr->sadb_address_prefixlen = sizeof(struct in_addr) << 3;
1385 		break;
1386 #endif /* INET */
1387 #ifdef INET6
1388 	case AF_INET6:
1389 		saaddr->sadb_address_prefixlen = sizeof(struct in6_addr) << 3;
1390 		break;
1391 #endif /* INET6 */
1392 	default:
1393 		log(LOG_DEBUG,
1394 		    "%s: Invalid address family: %d.\n",
1395 		    __func__, addr->sa_family);
1396 		break;
1397 	}
1398 	saaddr->sadb_address_reserved = 0;
1399 
1400 	return size;
1401 }
1402 
1403 static inline size_t
1404 if_ipsec_set_sadb_src(struct sadb_address *sasrc, struct sockaddr *src,
1405     int proto)
1406 {
1407 
1408 	return if_ipsec_set_sadb_addr(sasrc, src, proto,
1409 	    SADB_EXT_ADDRESS_SRC);
1410 }
1411 
1412 static inline size_t
1413 if_ipsec_set_sadb_dst(struct sadb_address *sadst, struct sockaddr *dst,
1414     int proto)
1415 {
1416 
1417 	return if_ipsec_set_sadb_addr(sadst, dst, proto,
1418 	    SADB_EXT_ADDRESS_DST);
1419 }
1420 
1421 static inline size_t
1422 if_ipsec_set_sadb_x_policy(struct sadb_x_policy *xpl,
1423     struct sadb_x_ipsecrequest *xisr, uint16_t policy, uint8_t dir, uint32_t id,
1424     uint8_t level, struct sockaddr *src, struct sockaddr *dst)
1425 {
1426 	size_t size;
1427 
1428 	KASSERT(policy != IPSEC_POLICY_IPSEC || xisr != NULL);
1429 
1430 	size = sizeof(*xpl);
1431 	if (policy == IPSEC_POLICY_IPSEC) {
1432 		size += PFKEY_ALIGN8(sizeof(*xisr));
1433 		if (src != NULL && dst != NULL)
1434 			size += PFKEY_ALIGN8(src->sa_len + dst->sa_len);
1435 	}
1436 	xpl->sadb_x_policy_len = PFKEY_UNIT64(size);
1437 	xpl->sadb_x_policy_exttype = SADB_X_EXT_POLICY;
1438 	xpl->sadb_x_policy_type = policy;
1439 	xpl->sadb_x_policy_dir = dir;
1440 	xpl->sadb_x_policy_reserved = 0;
1441 	xpl->sadb_x_policy_id = id;
1442 	xpl->sadb_x_policy_reserved2 = 0;
1443 
1444 	if (policy == IPSEC_POLICY_IPSEC) {
1445 		xisr->sadb_x_ipsecrequest_len = PFKEY_ALIGN8(sizeof(*xisr));
1446 		if (src != NULL && dst != NULL)
1447 			xisr->sadb_x_ipsecrequest_len +=
1448 				PFKEY_ALIGN8(src->sa_len + dst->sa_len);
1449 		xisr->sadb_x_ipsecrequest_proto = IPPROTO_ESP;
1450 		xisr->sadb_x_ipsecrequest_mode = IPSEC_MODE_TRANSPORT;
1451 		xisr->sadb_x_ipsecrequest_level = level;
1452 		xisr->sadb_x_ipsecrequest_reqid = key_newreqid();
1453 	}
1454 
1455 	return size;
1456 }
1457 
1458 static inline void
1459 if_ipsec_set_sadb_msg(struct sadb_msg *msg, uint16_t extlen, uint8_t msgtype)
1460 {
1461 
1462 	KASSERT(msg != NULL);
1463 
1464 	msg->sadb_msg_version = PF_KEY_V2;
1465 	msg->sadb_msg_type = msgtype;
1466 	msg->sadb_msg_errno = 0;
1467 	msg->sadb_msg_satype = SADB_SATYPE_UNSPEC;
1468 	msg->sadb_msg_len = PFKEY_UNIT64(sizeof(*msg)) + extlen;
1469 	msg->sadb_msg_reserved = 0;
1470 	msg->sadb_msg_seq = 0; /* XXXX */
1471 	msg->sadb_msg_pid = 0; /* XXXX */
1472 }
1473 
1474 static inline void
1475 if_ipsec_set_sadb_msg_add(struct sadb_msg *msg, uint16_t extlen)
1476 {
1477 
1478 	if_ipsec_set_sadb_msg(msg, extlen, SADB_X_SPDADD);
1479 }
1480 
1481 static inline void
1482 if_ipsec_set_sadb_msg_del(struct sadb_msg *msg, uint16_t extlen)
1483 {
1484 
1485 	if_ipsec_set_sadb_msg(msg, extlen, SADB_X_SPDDELETE2);
1486 }
1487 
1488 static int
1489 if_ipsec_set_addr_port(struct sockaddr *addrport, struct sockaddr *addr,
1490     in_port_t port)
1491 {
1492 	int error = 0;
1493 
1494 	sockaddr_copy(addrport, addr->sa_len, addr);
1495 
1496 	switch (addr->sa_family) {
1497 #ifdef INET
1498 	case AF_INET: {
1499 		struct sockaddr_in *sin = satosin(addrport);
1500 		sin->sin_port = port;
1501 		break;
1502 	}
1503 #endif /* INET */
1504 #ifdef INET6
1505 	case AF_INET6: {
1506 		struct sockaddr_in6 *sin6 = satosin6(addrport);
1507 		sin6->sin6_port = port;
1508 		break;
1509 	}
1510 #endif /* INET6 */
1511 	default:
1512 		log(LOG_DEBUG,
1513 		    "%s: Invalid address family: %d.\n",
1514 		    __func__, addr->sa_family);
1515 		error = EINVAL;
1516 	}
1517 
1518 	return error;
1519 }
1520 
1521 static struct secpolicy *
1522 if_ipsec_add_sp0(struct sockaddr *src, in_port_t sport,
1523     struct sockaddr *dst, in_port_t dport,
1524     int dir, int proto, int level, u_int policy)
1525 {
1526 	struct sadb_msg msg;
1527 	struct sadb_address xsrc, xdst;
1528 	struct sadb_x_policy xpl;
1529 	struct sadb_x_ipsecrequest xisr;
1530 	size_t size;
1531 	size_t padlen;
1532 	uint16_t ext_msg_len = 0;
1533 	struct mbuf *m;
1534 
1535 	memset(&msg, 0, sizeof(msg));
1536 	memset(&xsrc, 0, sizeof(xsrc));
1537 	memset(&xdst, 0, sizeof(xdst));
1538 	memset(&xpl, 0, sizeof(xpl));
1539 	memset(&xisr, 0, sizeof(xisr));
1540 
1541 	MGETHDR(m, M_WAIT, MT_DATA);
1542 
1543 	size = if_ipsec_set_sadb_src(&xsrc, src, proto);
1544 	ext_msg_len += PFKEY_UNIT64(size);
1545 	size = if_ipsec_set_sadb_dst(&xdst, dst, proto);
1546 	ext_msg_len += PFKEY_UNIT64(size);
1547 	size = if_ipsec_set_sadb_x_policy(&xpl, &xisr, policy, dir, 0, level, src, dst);
1548 	ext_msg_len += PFKEY_UNIT64(size);
1549 	if_ipsec_set_sadb_msg_add(&msg, ext_msg_len);
1550 
1551 	/* build PF_KEY message */
1552 
1553 	m->m_len = sizeof(msg);
1554 	m_copyback(m, 0, sizeof(msg), &msg);
1555 
1556 	if_ipsec_add_mbuf(m, &xsrc, sizeof(xsrc));
1557 	/*
1558 	 * secpolicy.spidx.{src, dst} must not be set port number,
1559 	 * even if it is used for NAT-T.
1560 	 */
1561 	if_ipsec_add_mbuf_addr_port(m, src, 0, true);
1562 	padlen = PFKEY_UNUNIT64(xsrc.sadb_address_len)
1563 		- (sizeof(xsrc) + PFKEY_ALIGN8(src->sa_len));
1564 	if_ipsec_add_pad(m, padlen);
1565 
1566 	if_ipsec_add_mbuf(m, &xdst, sizeof(xdst));
1567 	/* ditto */
1568 	if_ipsec_add_mbuf_addr_port(m, dst, 0, true);
1569 	padlen = PFKEY_UNUNIT64(xdst.sadb_address_len)
1570 		- (sizeof(xdst) + PFKEY_ALIGN8(dst->sa_len));
1571 	if_ipsec_add_pad(m, padlen);
1572 
1573 	if_ipsec_add_mbuf(m, &xpl, sizeof(xpl));
1574 	padlen = PFKEY_UNUNIT64(xpl.sadb_x_policy_len) - sizeof(xpl);
1575 	if (policy == IPSEC_POLICY_IPSEC) {
1576 		if_ipsec_add_mbuf(m, &xisr, sizeof(xisr));
1577 		padlen -= PFKEY_ALIGN8(sizeof(xisr));
1578 	}
1579 	if_ipsec_add_pad(m, padlen);
1580 
1581 	/* key_kpi_spdadd() has already done KEY_SP_REF(). */
1582 	return key_kpi_spdadd(m);
1583 }
1584 
1585 static int
1586 if_ipsec_add_sp(struct ipsec_variant *var,
1587     struct sockaddr *src, in_port_t sport,
1588     struct sockaddr *dst, in_port_t dport)
1589 {
1590 	struct ipsec_softc *sc = var->iv_softc;
1591 	int level;
1592 	u_int v6policy;
1593 
1594 	/*
1595 	 * must delete sp before add it.
1596 	 */
1597 	KASSERT(IV_SP_IN(var) == NULL);
1598 	KASSERT(IV_SP_OUT(var) == NULL);
1599 	KASSERT(IV_SP_IN6(var) == NULL);
1600 	KASSERT(IV_SP_OUT6(var) == NULL);
1601 
1602 	/*
1603 	 * can be shared?
1604 	 */
1605 	if (if_ipsec_share_sp(var))
1606 		return 0;
1607 
1608 	if (if_ipsec_nat_t(sc))
1609 		level = IPSEC_LEVEL_REQUIRE;
1610 	else
1611 		level = IPSEC_LEVEL_UNIQUE;
1612 
1613 	if (if_ipsec_fwd_ipv6(sc))
1614 		v6policy = IPSEC_POLICY_IPSEC;
1615 	else
1616 		v6policy = IPSEC_POLICY_DISCARD;
1617 
1618 	IV_SP_IN(var) = if_ipsec_add_sp0(dst, dport, src, sport,
1619 	    IPSEC_DIR_INBOUND, IPPROTO_IPIP, level, IPSEC_POLICY_IPSEC);
1620 	if (IV_SP_IN(var) == NULL)
1621 		goto fail;
1622 	IV_SP_OUT(var) = if_ipsec_add_sp0(src, sport, dst, dport,
1623 	    IPSEC_DIR_OUTBOUND, IPPROTO_IPIP, level, IPSEC_POLICY_IPSEC);
1624 	if (IV_SP_OUT(var) == NULL)
1625 		goto fail;
1626 	IV_SP_IN6(var) = if_ipsec_add_sp0(dst, dport, src, sport,
1627 	    IPSEC_DIR_INBOUND, IPPROTO_IPV6, level, v6policy);
1628 	if (IV_SP_IN6(var) == NULL)
1629 		goto fail;
1630 	IV_SP_OUT6(var) = if_ipsec_add_sp0(src, sport, dst, dport,
1631 	    IPSEC_DIR_OUTBOUND, IPPROTO_IPV6, level, v6policy);
1632 	if (IV_SP_OUT6(var) == NULL)
1633 		goto fail;
1634 
1635 	return 0;
1636 
1637 fail:
1638 	if (IV_SP_IN6(var) != NULL) {
1639 		if_ipsec_del_sp0(IV_SP_IN6(var));
1640 		IV_SP_IN6(var) = NULL;
1641 	}
1642 	if (IV_SP_OUT(var) != NULL) {
1643 		if_ipsec_del_sp0(IV_SP_OUT(var));
1644 		IV_SP_OUT(var) = NULL;
1645 	}
1646 	if (IV_SP_IN(var) != NULL) {
1647 		if_ipsec_del_sp0(IV_SP_IN(var));
1648 		IV_SP_IN(var) = NULL;
1649 	}
1650 
1651 	return EEXIST;
1652 }
1653 
1654 static int
1655 if_ipsec_del_sp0(struct secpolicy *sp)
1656 {
1657 	struct sadb_msg msg;
1658 	struct sadb_x_policy xpl;
1659 	size_t size;
1660 	uint16_t ext_msg_len = 0;
1661 	int error;
1662 	struct mbuf *m;
1663 
1664 	if (sp == NULL)
1665 		return 0;
1666 
1667 	memset(&msg, 0, sizeof(msg));
1668 	memset(&xpl, 0, sizeof(xpl));
1669 
1670 	MGETHDR(m, M_WAIT, MT_DATA);
1671 
1672 	size = if_ipsec_set_sadb_x_policy(&xpl, NULL, 0, 0, sp->id, 0, NULL, NULL);
1673 	ext_msg_len += PFKEY_UNIT64(size);
1674 
1675 	if_ipsec_set_sadb_msg_del(&msg, ext_msg_len);
1676 
1677 	m->m_len = sizeof(msg);
1678 	m_copyback(m, 0, sizeof(msg), &msg);
1679 
1680 	if_ipsec_add_mbuf(m, &xpl, sizeof(xpl));
1681 
1682 	/*  unreference correspond to key_kpi_spdadd(). */
1683 	KEY_SP_UNREF(&sp);
1684 	error = key_kpi_spddelete2(m);
1685 	if (error != 0) {
1686 		log(LOG_ERR, "%s: cannot delete SP(ID=%u) (error=%d).\n",
1687 		    __func__, sp->id, error);
1688 	}
1689 	return error;
1690 }
1691 
1692 static void
1693 if_ipsec_del_sp(struct ipsec_variant *var)
1694 {
1695 
1696 	/* are the SPs shared? */
1697 	if (if_ipsec_unshare_sp(var))
1698 		return;
1699 
1700 	(void)if_ipsec_del_sp0(IV_SP_OUT(var));
1701 	(void)if_ipsec_del_sp0(IV_SP_IN(var));
1702 	(void)if_ipsec_del_sp0(IV_SP_OUT6(var));
1703 	(void)if_ipsec_del_sp0(IV_SP_IN6(var));
1704 	IV_SP_IN(var) = NULL;
1705 	IV_SP_IN6(var) = NULL;
1706 	IV_SP_OUT(var) = NULL;
1707 	IV_SP_OUT6(var) = NULL;
1708 }
1709 
1710 static int
1711 if_ipsec_replace_sp(struct ipsec_softc *sc, struct ipsec_variant *ovar,
1712     struct ipsec_variant *nvar)
1713 {
1714 	in_port_t src_port = 0;
1715 	in_port_t dst_port = 0;
1716 	struct sockaddr *src;
1717 	struct sockaddr *dst;
1718 	int error = 0;
1719 
1720 	KASSERT(mutex_owned(&sc->ipsec_lock));
1721 
1722 	if_ipsec_del_sp(ovar);
1723 
1724 	src = nvar->iv_psrc;
1725 	dst = nvar->iv_pdst;
1726 	if (if_ipsec_nat_t(sc)) {
1727 		/* NAT-T enabled */
1728 		src_port = nvar->iv_sport;
1729 		dst_port = nvar->iv_dport;
1730 	}
1731 	if (src && dst)
1732 		error = if_ipsec_add_sp(nvar, src, src_port, dst, dst_port);
1733 
1734 	return error;
1735 }
1736 
1737 /*
1738  * ipsec_variant and its SPs update API.
1739  *
1740  * Assumption:
1741  * reader side dereferences sc->ipsec_var in reader critical section only,
1742  * that is, all of reader sides do not reader the sc->ipsec_var after
1743  * pserialize_perform().
1744  */
1745 static int
1746 if_ipsec_update_variant(struct ipsec_softc *sc, struct ipsec_variant *nvar,
1747     struct ipsec_variant *nullvar)
1748 {
1749 	struct ifnet *ifp = &sc->ipsec_if;
1750 	struct ipsec_variant *ovar = sc->ipsec_var;
1751 	int error;
1752 
1753 	KASSERT(mutex_owned(&sc->ipsec_lock));
1754 
1755 	/*
1756 	 * To keep consistency between ipsec(4) I/F settings and SPs,
1757 	 * we stop packet processing while replacing SPs, that is, we set
1758 	 * "null" config variant to sc->ipsec_var.
1759 	 */
1760 	sc->ipsec_var = nullvar;
1761 	pserialize_perform(sc->ipsec_psz);
1762 	psref_target_destroy(&ovar->iv_psref, iv_psref_class);
1763 
1764 	error = if_ipsec_replace_sp(sc, ovar, nvar);
1765 	if (!error)
1766 		sc->ipsec_var = nvar;
1767 	else {
1768 		sc->ipsec_var = ovar; /* rollback */
1769 		psref_target_init(&ovar->iv_psref, iv_psref_class);
1770 	}
1771 
1772 	pserialize_perform(sc->ipsec_psz);
1773 	psref_target_destroy(&nullvar->iv_psref, iv_psref_class);
1774 
1775 	if (if_ipsec_variant_is_configured(sc->ipsec_var))
1776 		ifp->if_flags |= IFF_RUNNING;
1777 	else
1778 		ifp->if_flags &= ~IFF_RUNNING;
1779 
1780 	return error;
1781 }
1782