xref: /netbsd-src/sys/net/if_ipsec.c (revision d3d2abdc28a790079ac0d9b337b15bd97b65d751)
1 /*	$NetBSD: if_ipsec.c,v 1.22 2019/06/25 12:30:50 msaitoh Exp $  */
2 
3 /*
4  * Copyright (c) 2017 Internet Initiative Japan Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26  * POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __KERNEL_RCSID(0, "$NetBSD: if_ipsec.c,v 1.22 2019/06/25 12:30:50 msaitoh Exp $");
31 
32 #ifdef _KERNEL_OPT
33 #include "opt_inet.h"
34 #endif
35 
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/kernel.h>
39 #include <sys/mbuf.h>
40 #include <sys/socket.h>
41 #include <sys/sockio.h>
42 #include <sys/errno.h>
43 #include <sys/ioctl.h>
44 #include <sys/time.h>
45 #include <sys/syslog.h>
46 #include <sys/cpu.h>
47 #include <sys/kmem.h>
48 #include <sys/mutex.h>
49 #include <sys/pserialize.h>
50 #include <sys/psref.h>
51 
52 #include <net/if.h>
53 #include <net/if_types.h>
54 #include <net/route.h>
55 #include <net/bpf.h>
56 #include <net/pfkeyv2.h>
57 
58 #include <netinet/in.h>
59 #include <netinet/in_systm.h>
60 #include <netinet/ip.h>
61 #ifdef	INET
62 #include <netinet/in_var.h>
63 #endif	/* INET */
64 
65 #ifdef INET6
66 #include <netinet6/in6_var.h>
67 #include <netinet/ip6.h>
68 #include <netinet6/ip6_var.h>
69 #endif /* INET6 */
70 
71 #include <netinet/ip_encap.h>
72 
73 #include <net/if_ipsec.h>
74 
75 #include <net/raw_cb.h>
76 #include <net/pfkeyv2.h>
77 
78 #include <netipsec/key.h>
79 #include <netipsec/keydb.h> /* for union sockaddr_union */
80 #include <netipsec/ipsec.h>
81 #include <netipsec/ipsecif.h>
82 
83 static void if_ipsec_ro_init_pc(void *, void *, struct cpu_info *);
84 static void if_ipsec_ro_fini_pc(void *, void *, struct cpu_info *);
85 
86 static int if_ipsec_clone_create(struct if_clone *, int);
87 static int if_ipsec_clone_destroy(struct ifnet *);
88 
89 static inline int if_ipsec_out_direct(struct ipsec_variant *, struct mbuf *, int);
90 static inline void if_ipsec_in_enqueue(struct mbuf *, int, struct ifnet *);
91 
92 static int if_ipsec_encap_attach(struct ipsec_variant *);
93 static int if_ipsec_encap_detach(struct ipsec_variant *);
94 static int if_ipsec_set_tunnel(struct ifnet *,
95     struct sockaddr *, struct sockaddr *);
96 static void if_ipsec_delete_tunnel(struct ifnet *);
97 static int if_ipsec_ensure_flags(struct ifnet *, short);
98 static void if_ipsec_attach0(struct ipsec_softc *);
99 
100 static int if_ipsec_update_variant(struct ipsec_softc *,
101     struct ipsec_variant *, struct ipsec_variant *);
102 
103 /* sadb_msg */
104 static inline void if_ipsec_add_mbuf(struct mbuf *, void *, size_t);
105 static inline void if_ipsec_add_pad(struct mbuf *, size_t);
106 static inline size_t if_ipsec_set_sadb_addr(struct sadb_address *,
107     struct sockaddr *, int, uint16_t);
108 static inline size_t if_ipsec_set_sadb_src(struct sadb_address *,
109     struct sockaddr *, int);
110 static inline size_t if_ipsec_set_sadb_dst(struct sadb_address *,
111     struct sockaddr *, int);
112 static inline size_t if_ipsec_set_sadb_x_policy(struct sadb_x_policy *,
113     struct sadb_x_ipsecrequest *, uint16_t, uint8_t, uint32_t, uint8_t,
114     struct sockaddr *, struct sockaddr *);
115 static inline void if_ipsec_set_sadb_msg(struct sadb_msg *, uint16_t, uint8_t);
116 static inline void if_ipsec_set_sadb_msg_add(struct sadb_msg *, uint16_t);
117 static inline void if_ipsec_set_sadb_msg_del(struct sadb_msg *, uint16_t);
118 /* SPD */
119 static int if_ipsec_share_sp(struct ipsec_variant *);
120 static int if_ipsec_unshare_sp(struct ipsec_variant *);
121 static inline struct secpolicy *if_ipsec_add_sp0(struct sockaddr *,
122     in_port_t, struct sockaddr *, in_port_t, int, int, int, u_int);
123 static inline int if_ipsec_del_sp0(struct secpolicy *);
124 static int if_ipsec_add_sp(struct ipsec_variant *,
125     struct sockaddr *, in_port_t, struct sockaddr *, in_port_t);
126 static void if_ipsec_del_sp(struct ipsec_variant *);
127 static int if_ipsec_replace_sp(struct ipsec_softc *, struct ipsec_variant *,
128     struct ipsec_variant *);
129 
130 static int if_ipsec_set_addr_port(struct sockaddr *, struct sockaddr *,
131     in_port_t);
132 #define IF_IPSEC_GATHER_PSRC_ADDR_PORT(var, target)			\
133 	if_ipsec_set_addr_port(target, (var)->iv_psrc, (var)->iv_sport)
134 #define IF_IPSEC_GATHER_PDST_ADDR_PORT(var, target)			\
135 	if_ipsec_set_addr_port(target, (var)->iv_pdst, (var)->iv_dport)
136 
137 /*
138  * ipsec global variable definitions
139  */
140 
141 /* This list is used in ioctl context only. */
142 static struct {
143 	LIST_HEAD(ipsec_sclist, ipsec_softc) list;
144 	kmutex_t lock;
145 } ipsec_softcs __cacheline_aligned;
146 
147 struct psref_class *iv_psref_class __read_mostly;
148 
149 struct if_clone ipsec_cloner =
150     IF_CLONE_INITIALIZER("ipsec", if_ipsec_clone_create, if_ipsec_clone_destroy);
151 static int max_ipsec_nesting = MAX_IPSEC_NEST;
152 
153 /* ARGSUSED */
154 void
155 ipsecifattach(int count)
156 {
157 
158 	mutex_init(&ipsec_softcs.lock, MUTEX_DEFAULT, IPL_NONE);
159 	LIST_INIT(&ipsec_softcs.list);
160 
161 	iv_psref_class = psref_class_create("ipsecvar", IPL_SOFTNET);
162 
163 	if_clone_attach(&ipsec_cloner);
164 }
165 
166 static int
167 if_ipsec_clone_create(struct if_clone *ifc, int unit)
168 {
169 	struct ipsec_softc *sc;
170 	struct ipsec_variant *var;
171 
172 	sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
173 
174 	if_initname(&sc->ipsec_if, ifc->ifc_name, unit);
175 
176 	if_ipsec_attach0(sc);
177 
178 	var = kmem_zalloc(sizeof(*var), KM_SLEEP);
179 	var->iv_softc = sc;
180 	psref_target_init(&var->iv_psref, iv_psref_class);
181 
182 	sc->ipsec_var = var;
183 	mutex_init(&sc->ipsec_lock, MUTEX_DEFAULT, IPL_NONE);
184 	sc->ipsec_psz = pserialize_create();
185 	sc->ipsec_ro_percpu = percpu_alloc(sizeof(struct ipsec_ro));
186 	percpu_foreach(sc->ipsec_ro_percpu, if_ipsec_ro_init_pc, NULL);
187 
188 	mutex_enter(&ipsec_softcs.lock);
189 	LIST_INSERT_HEAD(&ipsec_softcs.list, sc, ipsec_list);
190 	mutex_exit(&ipsec_softcs.lock);
191 	return 0;
192 }
193 
194 static void
195 if_ipsec_attach0(struct ipsec_softc *sc)
196 {
197 
198 	sc->ipsec_if.if_addrlen = 0;
199 	sc->ipsec_if.if_mtu    = IPSEC_MTU;
200 	sc->ipsec_if.if_flags  = IFF_POINTOPOINT | IFF_MULTICAST;
201 	/* set ipsec(4) specific default flags. */
202 	sc->ipsec_if.if_flags  |= IFF_FWD_IPV6;
203 	sc->ipsec_if.if_extflags = IFEF_NO_LINK_STATE_CHANGE | IFEF_MPSAFE;
204 	sc->ipsec_if.if_ioctl  = if_ipsec_ioctl;
205 	sc->ipsec_if.if_output = if_ipsec_output;
206 	sc->ipsec_if.if_type   = IFT_IPSEC;
207 	sc->ipsec_if.if_dlt    = DLT_NULL;
208 	sc->ipsec_if.if_softc  = sc;
209 	IFQ_SET_READY(&sc->ipsec_if.if_snd);
210 	if_initialize(&sc->ipsec_if);
211 	if_alloc_sadl(&sc->ipsec_if);
212 	bpf_attach(&sc->ipsec_if, DLT_NULL, sizeof(u_int));
213 	if_register(&sc->ipsec_if);
214 }
215 
216 static void
217 if_ipsec_ro_init_pc(void *p, void *arg __unused, struct cpu_info *ci __unused)
218 {
219 	struct ipsec_ro *iro = p;
220 
221 	iro->ir_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
222 }
223 
224 static void
225 if_ipsec_ro_fini_pc(void *p, void *arg __unused, struct cpu_info *ci __unused)
226 {
227 	struct ipsec_ro *iro = p;
228 
229 	rtcache_free(&iro->ir_ro);
230 
231 	mutex_obj_free(iro->ir_lock);
232 }
233 
234 static int
235 if_ipsec_clone_destroy(struct ifnet *ifp)
236 {
237 	struct ipsec_softc *sc = ifp->if_softc;
238 	struct ipsec_variant *var;
239 	int bound;
240 
241 	mutex_enter(&ipsec_softcs.lock);
242 	LIST_REMOVE(sc, ipsec_list);
243 	mutex_exit(&ipsec_softcs.lock);
244 
245 	bound = curlwp_bind();
246 	if_ipsec_delete_tunnel(&sc->ipsec_if);
247 	curlwp_bindx(bound);
248 
249 	bpf_detach(ifp);
250 	if_detach(ifp);
251 
252 	percpu_foreach(sc->ipsec_ro_percpu, if_ipsec_ro_fini_pc, NULL);
253 	percpu_free(sc->ipsec_ro_percpu, sizeof(struct ipsec_ro));
254 
255 	pserialize_destroy(sc->ipsec_psz);
256 	mutex_destroy(&sc->ipsec_lock);
257 
258 	var = sc->ipsec_var;
259 	kmem_free(var, sizeof(*var));
260 	kmem_free(sc, sizeof(*sc));
261 
262 	return 0;
263 }
264 
265 static inline bool
266 if_ipsec_nat_t(struct ipsec_softc *sc)
267 {
268 
269 	return (sc->ipsec_if.if_flags & IFF_NAT_T) != 0;
270 }
271 
272 static inline bool
273 if_ipsec_fwd_ipv6(struct ipsec_softc *sc)
274 {
275 
276 	return (sc->ipsec_if.if_flags & IFF_FWD_IPV6) != 0;
277 }
278 
279 int
280 if_ipsec_encap_func(struct mbuf *m, int off, int proto, void *arg)
281 {
282 	uint8_t v;
283 	struct ipsec_softc *sc;
284 	struct ipsec_variant *var = NULL;
285 	struct psref psref;
286 	int ret = 0;
287 
288 	sc = arg;
289 	KASSERT(sc != NULL);
290 
291 	if ((sc->ipsec_if.if_flags & IFF_UP) == 0)
292 		goto out;
293 
294 	var = if_ipsec_getref_variant(sc, &psref);
295 	if (if_ipsec_variant_is_unconfigured(var))
296 		goto out;
297 
298 	switch (proto) {
299 	case IPPROTO_IPV4:
300 	case IPPROTO_IPV6:
301 		break;
302 	default:
303 		goto out;
304 	}
305 
306 	m_copydata(m, 0, sizeof(v), &v);
307 	v = (v >> 4) & 0xff;  /* Get the IP version number. */
308 
309 	switch (v) {
310 #ifdef INET
311 	case IPVERSION: {
312 		struct ip ip;
313 
314 		if (m->m_pkthdr.len < sizeof(ip))
315 			goto out;
316 
317 		m_copydata(m, 0, sizeof(ip), &ip);
318 		if (var->iv_psrc->sa_family != AF_INET ||
319 		    var->iv_pdst->sa_family != AF_INET)
320 			goto out;
321 		ret = ipsecif4_encap_func(m, &ip, var);
322 		break;
323 	}
324 #endif
325 #ifdef INET6
326 	case (IPV6_VERSION >> 4): {
327 		struct ip6_hdr ip6;
328 
329 		if (m->m_pkthdr.len < sizeof(ip6))
330 			goto out;
331 
332 		m_copydata(m, 0, sizeof(ip6), &ip6);
333 		if (var->iv_psrc->sa_family != AF_INET6 ||
334 		    var->iv_pdst->sa_family != AF_INET6)
335 			goto out;
336 		ret = ipsecif6_encap_func(m, &ip6, var);
337 		break;
338 	}
339 #endif
340 	default:
341 		goto out;
342 	}
343 
344 out:
345 	if (var != NULL)
346 		if_ipsec_putref_variant(var, &psref);
347 	return ret;
348 }
349 
350 /*
351  * ipsec(4) I/F may cause infinite recursion calls when misconfigured.
352  * We'll prevent this by introducing upper limit.
353  */
354 static int
355 if_ipsec_check_nesting(struct ifnet *ifp, struct mbuf *m)
356 {
357 
358 	return if_tunnel_check_nesting(ifp, m, max_ipsec_nesting);
359 }
360 
361 int
362 if_ipsec_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
363     const struct rtentry *rt)
364 {
365 	struct ipsec_softc *sc = ifp->if_softc;
366 	struct ipsec_variant *var;
367 	struct psref psref;
368 	int error;
369 	int bound;
370 
371 	IFQ_CLASSIFY(&ifp->if_snd, m, dst->sa_family);
372 
373 	error = if_ipsec_check_nesting(ifp, m);
374 	if (error) {
375 		m_freem(m);
376 		goto noref_end;
377 	}
378 
379 	if ((ifp->if_flags & IFF_UP) == 0) {
380 		m_freem(m);
381 		error = ENETDOWN;
382 		goto noref_end;
383 	}
384 
385 
386 	bound = curlwp_bind();
387 	var = if_ipsec_getref_variant(sc, &psref);
388 	if (if_ipsec_variant_is_unconfigured(var)) {
389 		m_freem(m);
390 		error = ENETDOWN;
391 		goto end;
392 	}
393 
394 	m->m_flags &= ~(M_BCAST|M_MCAST);
395 
396 	/* use DLT_NULL encapsulation here to pass inner af type */
397 	M_PREPEND(m, sizeof(int), M_DONTWAIT);
398 	if (!m) {
399 		error = ENOBUFS;
400 		goto end;
401 	}
402 	*mtod(m, int *) = dst->sa_family;
403 
404 #if INET6
405 	/* drop IPv6 packet if IFF_FWD_IPV6 is not set */
406 	if (dst->sa_family == AF_INET6 &&
407 	    !if_ipsec_fwd_ipv6(sc)) {
408 		/*
409 		 * IPv6 packet is not allowed to forward,that is not error.
410 		 */
411 		error = 0;
412 		IF_DROP(&ifp->if_snd);
413 		m_freem(m);
414 		goto end;
415 	}
416 #endif
417 
418 	error = if_ipsec_out_direct(var, m, dst->sa_family);
419 
420 end:
421 	if_ipsec_putref_variant(var, &psref);
422 	curlwp_bindx(bound);
423 noref_end:
424 	if (error)
425 		ifp->if_oerrors++;
426 
427 	return error;
428 }
429 
430 static inline int
431 if_ipsec_out_direct(struct ipsec_variant *var, struct mbuf *m, int family)
432 {
433 	struct ifnet *ifp = &var->iv_softc->ipsec_if;
434 	int error;
435 	int len;
436 
437 	KASSERT(if_ipsec_heldref_variant(var));
438 	KASSERT(var->iv_output != NULL);
439 
440 	len = m->m_pkthdr.len;
441 
442 	/* input DLT_NULL frame to BPF */
443 	bpf_mtap(ifp, m, BPF_D_OUT);
444 
445 	/* grab and chop off inner af type */
446 	/* XXX need pullup? */
447 	m_adj(m, sizeof(int));
448 
449 	error = var->iv_output(var, family, m);
450 	if (error)
451 		return error;
452 
453 	ifp->if_opackets++;
454 	ifp->if_obytes += len;
455 
456 	return 0;
457 }
458 
459 void
460 if_ipsec_input(struct mbuf *m, int af, struct ifnet *ifp)
461 {
462 
463 	KASSERT(ifp != NULL);
464 
465 	m_set_rcvif(m, ifp);
466 
467 	bpf_mtap_af(ifp, af, m, BPF_D_IN);
468 
469 	if_ipsec_in_enqueue(m, af, ifp);
470 
471 	return;
472 }
473 
474 static inline void
475 if_ipsec_in_enqueue(struct mbuf *m, int af, struct ifnet *ifp)
476 {
477 	pktqueue_t *pktq;
478 	int pktlen;
479 
480 	/*
481 	 * Put the packet to the network layer input queue according to the
482 	 * specified address family.
483 	 */
484 	switch (af) {
485 #ifdef INET
486 	case AF_INET:
487 		pktq = ip_pktq;
488 		break;
489 #endif
490 #ifdef INET6
491 	case AF_INET6:
492 		pktq = ip6_pktq;
493 		break;
494 #endif
495 	default:
496 		ifp->if_ierrors++;
497 		m_freem(m);
498 		return;
499 	}
500 
501 #if 1
502 	const u_int h = curcpu()->ci_index;
503 #else
504 	const uint32_t h = pktq_rps_hash(m);
505 #endif
506 	pktlen = m->m_pkthdr.len;
507 	if (__predict_true(pktq_enqueue(pktq, m, h))) {
508 		ifp->if_ibytes += pktlen;
509 		ifp->if_ipackets++;
510 	} else {
511 		ifp->if_iqdrops++;
512 		m_freem(m);
513 	}
514 
515 	return;
516 }
517 
518 static inline int
519 if_ipsec_check_salen(struct sockaddr *addr)
520 {
521 
522 	switch (addr->sa_family) {
523 #ifdef INET
524 	case AF_INET:
525 		if (addr->sa_len != sizeof(struct sockaddr_in))
526 			return EINVAL;
527 		break;
528 #endif /* INET */
529 #ifdef INET6
530 	case AF_INET6:
531 		if (addr->sa_len != sizeof(struct sockaddr_in6))
532 			return EINVAL;
533 		break;
534 #endif /* INET6 */
535 	default:
536 		return EAFNOSUPPORT;
537 	}
538 
539 	return 0;
540 }
541 
542 /* XXX how should we handle IPv6 scope on SIOC[GS]IFPHYADDR? */
543 int
544 if_ipsec_ioctl(struct ifnet *ifp, u_long cmd, void *data)
545 {
546 	struct ipsec_softc *sc  = ifp->if_softc;
547 	struct ipsec_variant *var = NULL;
548 	struct ifreq     *ifr = (struct ifreq*)data;
549 	struct ifaddr    *ifa = (struct ifaddr*)data;
550 	int error = 0, size;
551 	struct sockaddr *dst, *src;
552 	u_long mtu;
553 	short oflags = ifp->if_flags;
554 	int bound;
555 	struct psref psref;
556 
557 	switch (cmd) {
558 	case SIOCINITIFADDR:
559 		ifp->if_flags |= IFF_UP;
560 		ifa->ifa_rtrequest = p2p_rtrequest;
561 		break;
562 
563 	case SIOCSIFDSTADDR:
564 		break;
565 
566 	case SIOCADDMULTI:
567 	case SIOCDELMULTI:
568 		switch (ifr->ifr_addr.sa_family) {
569 #ifdef INET
570 		case AF_INET:	/* IP supports Multicast */
571 			break;
572 #endif /* INET */
573 #ifdef INET6
574 		case AF_INET6:	/* IP6 supports Multicast */
575 			break;
576 #endif /* INET6 */
577 		default:  /* Other protocols doesn't support Multicast */
578 			error = EAFNOSUPPORT;
579 			break;
580 		}
581 		break;
582 
583 	case SIOCSIFMTU:
584 		mtu = ifr->ifr_mtu;
585 		if (mtu < IPSEC_MTU_MIN || mtu > IPSEC_MTU_MAX)
586 			return EINVAL;
587 		else if ((error = ifioctl_common(ifp, cmd, data)) == ENETRESET)
588 			error = 0;
589 		break;
590 
591 #ifdef INET
592 	case SIOCSIFPHYADDR:
593 #endif
594 #ifdef INET6
595 	case SIOCSIFPHYADDR_IN6:
596 #endif /* INET6 */
597 	case SIOCSLIFPHYADDR:
598 		switch (cmd) {
599 #ifdef INET
600 		case SIOCSIFPHYADDR:
601 			src = (struct sockaddr *)
602 				&(((struct in_aliasreq *)data)->ifra_addr);
603 			dst = (struct sockaddr *)
604 				&(((struct in_aliasreq *)data)->ifra_dstaddr);
605 			break;
606 #endif /* INET */
607 #ifdef INET6
608 		case SIOCSIFPHYADDR_IN6:
609 			src = (struct sockaddr *)
610 				&(((struct in6_aliasreq *)data)->ifra_addr);
611 			dst = (struct sockaddr *)
612 				&(((struct in6_aliasreq *)data)->ifra_dstaddr);
613 			break;
614 #endif /* INET6 */
615 		case SIOCSLIFPHYADDR:
616 			src = (struct sockaddr *)
617 				&(((struct if_laddrreq *)data)->addr);
618 			dst = (struct sockaddr *)
619 				&(((struct if_laddrreq *)data)->dstaddr);
620 			break;
621 		default:
622 			return EINVAL;
623 		}
624 
625 		/* sa_family must be equal */
626 		if (src->sa_family != dst->sa_family)
627 			return EINVAL;
628 
629 		error = if_ipsec_check_salen(src);
630 		if (error)
631 			return error;
632 		error = if_ipsec_check_salen(dst);
633 		if (error)
634 			return error;
635 
636 		/* check sa_family looks sane for the cmd */
637 		switch (cmd) {
638 #ifdef INET
639 		case SIOCSIFPHYADDR:
640 			if (src->sa_family == AF_INET)
641 				break;
642 			return EAFNOSUPPORT;
643 #endif /* INET */
644 #ifdef INET6
645 		case SIOCSIFPHYADDR_IN6:
646 			if (src->sa_family == AF_INET6)
647 				break;
648 			return EAFNOSUPPORT;
649 #endif /* INET6 */
650 		case SIOCSLIFPHYADDR:
651 			/* checks done in the above */
652 			break;
653 		}
654 		/*
655 		 * calls if_ipsec_getref_variant() for other softcs to check
656 		 * address pair duplicattion
657 		 */
658 		bound = curlwp_bind();
659 		error = if_ipsec_set_tunnel(&sc->ipsec_if, src, dst);
660 		if (error)
661 			goto bad;
662 		curlwp_bindx(bound);
663 		break;
664 
665 	case SIOCDIFPHYADDR:
666 		bound = curlwp_bind();
667 		if_ipsec_delete_tunnel(&sc->ipsec_if);
668 		curlwp_bindx(bound);
669 		break;
670 
671 	case SIOCGIFPSRCADDR:
672 #ifdef INET6
673 	case SIOCGIFPSRCADDR_IN6:
674 #endif /* INET6 */
675 		bound = curlwp_bind();
676 		var = if_ipsec_getref_variant(sc, &psref);
677 		if (var->iv_psrc == NULL) {
678 			error = EADDRNOTAVAIL;
679 			goto bad;
680 		}
681 		src = var->iv_psrc;
682 		switch (cmd) {
683 #ifdef INET
684 		case SIOCGIFPSRCADDR:
685 			dst = &ifr->ifr_addr;
686 			size = sizeof(ifr->ifr_addr);
687 			break;
688 #endif /* INET */
689 #ifdef INET6
690 		case SIOCGIFPSRCADDR_IN6:
691 			dst = (struct sockaddr *)
692 				&(((struct in6_ifreq *)data)->ifr_addr);
693 			size = sizeof(((struct in6_ifreq *)data)->ifr_addr);
694 			break;
695 #endif /* INET6 */
696 		default:
697 			error = EADDRNOTAVAIL;
698 			goto bad;
699 		}
700 		if (src->sa_len > size) {
701 			error = EINVAL;
702 			goto bad;
703 		}
704 		error = IF_IPSEC_GATHER_PSRC_ADDR_PORT(var, dst);
705 		if (error)
706 			goto bad;
707 		if_ipsec_putref_variant(var, &psref);
708 		curlwp_bindx(bound);
709 		break;
710 
711 	case SIOCGIFPDSTADDR:
712 #ifdef INET6
713 	case SIOCGIFPDSTADDR_IN6:
714 #endif /* INET6 */
715 		bound = curlwp_bind();
716 		var = if_ipsec_getref_variant(sc, &psref);
717 		if (var->iv_pdst == NULL) {
718 			error = EADDRNOTAVAIL;
719 			goto bad;
720 		}
721 		src = var->iv_pdst;
722 		switch (cmd) {
723 #ifdef INET
724 		case SIOCGIFPDSTADDR:
725 			dst = &ifr->ifr_addr;
726 			size = sizeof(ifr->ifr_addr);
727 			break;
728 #endif /* INET */
729 #ifdef INET6
730 		case SIOCGIFPDSTADDR_IN6:
731 			dst = (struct sockaddr *)
732 				&(((struct in6_ifreq *)data)->ifr_addr);
733 			size = sizeof(((struct in6_ifreq *)data)->ifr_addr);
734 			break;
735 #endif /* INET6 */
736 		default:
737 			error = EADDRNOTAVAIL;
738 			goto bad;
739 		}
740 		if (src->sa_len > size) {
741 			error = EINVAL;
742 			goto bad;
743 		}
744 		error = IF_IPSEC_GATHER_PDST_ADDR_PORT(var, dst);
745 		if (error)
746 			goto bad;
747 		if_ipsec_putref_variant(var, &psref);
748 		curlwp_bindx(bound);
749 		break;
750 
751 	case SIOCGLIFPHYADDR:
752 		bound = curlwp_bind();
753 		var = if_ipsec_getref_variant(sc, &psref);
754 		if (if_ipsec_variant_is_unconfigured(var)) {
755 			error = EADDRNOTAVAIL;
756 			goto bad;
757 		}
758 
759 		/* copy src */
760 		src = var->iv_psrc;
761 		dst = (struct sockaddr *)
762 			&(((struct if_laddrreq *)data)->addr);
763 		size = sizeof(((struct if_laddrreq *)data)->addr);
764 		if (src->sa_len > size) {
765 			error = EINVAL;
766 			goto bad;
767 		}
768 		error = IF_IPSEC_GATHER_PSRC_ADDR_PORT(var, dst);
769 		if (error)
770 			goto bad;
771 
772 		/* copy dst */
773 		src = var->iv_pdst;
774 		dst = (struct sockaddr *)
775 			&(((struct if_laddrreq *)data)->dstaddr);
776 		size = sizeof(((struct if_laddrreq *)data)->dstaddr);
777 		if (src->sa_len > size) {
778 			error = EINVAL;
779 			goto bad;
780 		}
781 		error = IF_IPSEC_GATHER_PDST_ADDR_PORT(var, dst);
782 		if (error)
783 			goto bad;
784 		if_ipsec_putref_variant(var, &psref);
785 		curlwp_bindx(bound);
786 		break;
787 
788 	default:
789 		error = ifioctl_common(ifp, cmd, data);
790 		if (!error) {
791 			bound = curlwp_bind();
792 			error = if_ipsec_ensure_flags(&sc->ipsec_if, oflags);
793 			if (error)
794 				goto bad;
795 			curlwp_bindx(bound);
796 		}
797 		break;
798 	}
799 	return error;
800 
801 bad:
802 	if (var != NULL)
803 		if_ipsec_putref_variant(var, &psref);
804 	curlwp_bindx(bound);
805 
806 	return error;
807 }
808 
/*
 * Per-address-family hook table.  Each member takes the variant to act on
 * and returns an int status; a member exists only when the corresponding
 * INET / INET6 option is compiled in.
 */
struct encap_funcs {
#ifdef INET
	int (*ef_inet)(struct ipsec_variant *);
#endif
#ifdef INET6
	int (*ef_inet6)(struct ipsec_variant *);
#endif
};

/* Hooks used by if_ipsec_encap_attach(). */
static struct encap_funcs ipsec_encap_attach = {
#ifdef INET
	.ef_inet = ipsecif4_attach,
#endif
#ifdef INET6
	.ef_inet6 = ipsecif6_attach,
#endif
};

/* Hooks used by if_ipsec_encap_detach(). */
static struct encap_funcs ipsec_encap_detach = {
#ifdef INET
	.ef_inet = ipsecif4_detach,
#endif
#ifdef INET6
	.ef_inet6 = ipsecif6_detach,
#endif
};
835 
836 static int
837 if_ipsec_encap_common(struct ipsec_variant *var, struct encap_funcs *funcs)
838 {
839 	int error;
840 
841 	KASSERT(var != NULL);
842 	KASSERT(if_ipsec_variant_is_configured(var));
843 
844 	switch (var->iv_psrc->sa_family) {
845 #ifdef INET
846 	case AF_INET:
847 		error = (funcs->ef_inet)(var);
848 		break;
849 #endif /* INET */
850 #ifdef INET6
851 	case AF_INET6:
852 		error = (funcs->ef_inet6)(var);
853 		break;
854 #endif /* INET6 */
855 	default:
856 		error = EINVAL;
857 		break;
858 	}
859 
860 	return error;
861 }
862 
863 static int
864 if_ipsec_encap_attach(struct ipsec_variant *var)
865 {
866 
867 	return if_ipsec_encap_common(var, &ipsec_encap_attach);
868 }
869 
870 static int
871 if_ipsec_encap_detach(struct ipsec_variant *var)
872 {
873 
874 	return if_ipsec_encap_common(var, &ipsec_encap_detach);
875 }
876 
877 /*
878  * Validate and set ipsec(4) I/F configurations.
879  *     (1) validate
880  *         (1-1) Check the argument src and dst address pair will change
881  *               configuration from current src and dst address pair.
882  *         (1-2) Check any ipsec(4) I/F uses duplicated src and dst address pair
883  *               with argument src and dst address pair, except for NAT-T shared
884  *               tunnels.
885  *     (2) set
886  *         (2-1) Create variant for new configuration.
887  *         (2-2) Create temporary "null" variant used to avoid to access
888  *               dangling variant while SPs are deleted and added.
889  *         (2-3) Swap variant include its SPs.
890  *         (2-4) Cleanup last configurations.
891  */
892 static int
893 if_ipsec_set_tunnel(struct ifnet *ifp,
894     struct sockaddr *src, struct sockaddr *dst)
895 {
896 	struct ipsec_softc *sc = ifp->if_softc;
897 	struct ipsec_softc *sc2;
898 	struct ipsec_variant *ovar, *nvar, *nullvar;
899 	struct sockaddr *osrc, *odst;
900 	struct sockaddr *nsrc, *ndst;
901 	in_port_t nsport = 0, ndport = 0;
902 	int error;
903 
904 	error = encap_lock_enter();
905 	if (error)
906 		return error;
907 
908 	nsrc = sockaddr_dup(src, M_WAITOK);
909 	ndst = sockaddr_dup(dst, M_WAITOK);
910 	nvar = kmem_zalloc(sizeof(*nvar), KM_SLEEP);
911 	nullvar = kmem_zalloc(sizeof(*nullvar), KM_SLEEP);
912 
913 	mutex_enter(&sc->ipsec_lock);
914 
915 	ovar = sc->ipsec_var;
916 
917 	switch(nsrc->sa_family) {
918 #ifdef INET
919 	case AF_INET:
920 		nsport = satosin(src)->sin_port;
921 		/*
922 		 * avoid confuse SP when NAT-T disabled,
923 		 * e.g.
924 		 *     expected: 10.0.1.2[any] 10.0.1.1[any] 4(ipv4)
925 		 *     confuse : 10.0.1.2[600] 10.0.1.1[600] 4(ipv4)
926 		 */
927 		satosin(nsrc)->sin_port = 0;
928 		ndport = satosin(dst)->sin_port;
929 		satosin(ndst)->sin_port = 0;
930 		break;
931 #endif /* INET */
932 #ifdef INET6
933 	case AF_INET6:
934 		nsport = satosin6(src)->sin6_port;
935 		satosin6(nsrc)->sin6_port = 0;
936 		ndport = satosin6(dst)->sin6_port;
937 		satosin6(ndst)->sin6_port = 0;
938 		break;
939 #endif /* INET6 */
940 	default:
941 		log(LOG_DEBUG,
942 		    "%s: Invalid address family: %d.\n",
943 		    __func__, src->sa_family);
944 		error = EINVAL;
945 		goto out;
946 	}
947 
948 	/*
949 	 * (1-1) Check the argument src and dst address pair will change
950 	 *       configuration from current src and dst address pair.
951 	 */
952 	if ((ovar->iv_pdst && sockaddr_cmp(ovar->iv_pdst, dst) == 0) &&
953 	    (ovar->iv_psrc && sockaddr_cmp(ovar->iv_psrc, src) == 0) &&
954 	    (ovar->iv_sport == nsport && ovar->iv_dport == ndport)) {
955 		/* address and port pair not changed. */
956 		error = 0;
957 		goto out;
958 	}
959 
960 	/*
961 	 * (1-2) Check any ipsec(4) I/F uses duplicated src and dst address pair
962 	 *       with argument src and dst address pair, except for NAT-T shared
963 	 *       tunnels.
964 	 */
965 	mutex_enter(&ipsec_softcs.lock);
966 	LIST_FOREACH(sc2, &ipsec_softcs.list, ipsec_list) {
967 		struct ipsec_variant *var2;
968 		struct psref psref;
969 
970 		if (sc2 == sc)
971 			continue;
972 		var2 = if_ipsec_getref_variant(sc2, &psref);
973 		if (if_ipsec_variant_is_unconfigured(var2)) {
974 			if_ipsec_putref_variant(var2, &psref);
975 			continue;
976 		}
977 		if (if_ipsec_nat_t(sc) || if_ipsec_nat_t(sc2)) {
978 			if_ipsec_putref_variant(var2, &psref);
979 			continue; /* NAT-T shared tunnel */
980 		}
981 		if (sockaddr_cmp(var2->iv_pdst, dst) == 0 &&
982 		    sockaddr_cmp(var2->iv_psrc, src) == 0) {
983 			if_ipsec_putref_variant(var2, &psref);
984 			mutex_exit(&ipsec_softcs.lock);
985 			error = EADDRNOTAVAIL;
986 			goto out;
987 		}
988 
989 		if_ipsec_putref_variant(var2, &psref);
990 		/* XXX both end must be valid? (I mean, not 0.0.0.0) */
991 	}
992 	mutex_exit(&ipsec_softcs.lock);
993 
994 
995 	osrc = ovar->iv_psrc;
996 	odst = ovar->iv_pdst;
997 
998 	/*
999 	 * (2-1) Create ipsec_variant for new configuration.
1000 	 */
1001 	if_ipsec_copy_variant(nvar, ovar);
1002 	nvar->iv_psrc = nsrc;
1003 	nvar->iv_pdst = ndst;
1004 	nvar->iv_sport = nsport;
1005 	nvar->iv_dport = ndport;
1006 	nvar->iv_encap_cookie4 = NULL;
1007 	nvar->iv_encap_cookie6 = NULL;
1008 	psref_target_init(&nvar->iv_psref, iv_psref_class);
1009 	error = if_ipsec_encap_attach(nvar);
1010 	if (error)
1011 		goto out;
1012 
1013 	/*
1014 	 * (2-2) Create temporary "null" variant.
1015 	 */
1016 	if_ipsec_copy_variant(nullvar, ovar);
1017 	if_ipsec_clear_config(nullvar);
1018 	psref_target_init(&nullvar->iv_psref, iv_psref_class);
1019 	membar_producer();
1020 	/*
1021 	 * (2-3) Swap variant include its SPs.
1022 	 */
1023 	error = if_ipsec_update_variant(sc, nvar, nullvar);
1024 	if (error) {
1025 		if_ipsec_encap_detach(nvar);
1026 		goto out;
1027 	}
1028 
1029 	mutex_exit(&sc->ipsec_lock);
1030 
1031 	/*
1032 	 * (2-4) Cleanup last configurations.
1033 	 */
1034 	if (if_ipsec_variant_is_configured(ovar))
1035 		if_ipsec_encap_detach(ovar);
1036 	encap_lock_exit();
1037 
1038 	if (osrc != NULL)
1039 		sockaddr_free(osrc);
1040 	if (odst != NULL)
1041 		sockaddr_free(odst);
1042 	kmem_free(ovar, sizeof(*ovar));
1043 	kmem_free(nullvar, sizeof(*nullvar));
1044 
1045 	return 0;
1046 
1047 out:
1048 	mutex_exit(&sc->ipsec_lock);
1049 	encap_lock_exit();
1050 
1051 	sockaddr_free(nsrc);
1052 	sockaddr_free(ndst);
1053 	kmem_free(nvar, sizeof(*nvar));
1054 	kmem_free(nullvar, sizeof(*nullvar));
1055 
1056 	return error;
1057 }
1058 
1059 /*
1060  * Validate and delete ipsec(4) I/F configurations.
1061  *     (1) validate
1062  *         (1-1) Check current src and dst address pair are null,
1063  *               which means the ipsec(4) I/F is already done deletetunnel.
1064  *     (2) delete
1065  *         (2-1) Create variant for deleted status.
1066  *         (2-2) Create temporary "null" variant used to avoid to access
1067  *               dangling variant while SPs are deleted and added.
1068  *               NOTE:
1069  *               The contents of temporary "null" variant equal to the variant
1070  *               of (2-1), however two psref_target_destroy() synchronization
1071  *               points are necessary to avoid to access dangling variant
1072  *               while SPs are deleted and added. To implement that simply,
1073  *               we use the same manner as if_ipsec_set_tunnel(), that is,
1074  *               create extra "null" variant and use it temporarily.
1075  *         (2-3) Swap variant include its SPs.
1076  *         (2-4) Cleanup last configurations.
1077  */
1078 static void
1079 if_ipsec_delete_tunnel(struct ifnet *ifp)
1080 {
1081 	struct ipsec_softc *sc = ifp->if_softc;
1082 	struct ipsec_variant *ovar, *nvar, *nullvar;
1083 	struct sockaddr *osrc, *odst;
1084 	int error;
1085 
1086 	error = encap_lock_enter();
1087 	if (error)
1088 		return;
1089 
1090 	nvar = kmem_zalloc(sizeof(*nvar), KM_SLEEP);
1091 	nullvar = kmem_zalloc(sizeof(*nullvar), KM_SLEEP);
1092 
1093 	mutex_enter(&sc->ipsec_lock);
1094 
1095 	ovar = sc->ipsec_var;
1096 	osrc = ovar->iv_psrc;
1097 	odst = ovar->iv_pdst;
1098 	/*
1099 	 * (1-1) Check current src and dst address pair are null,
1100 	 *       which means the ipsec(4) I/F is already done deletetunnel.
1101 	 */
1102 	if (osrc == NULL || odst == NULL) {
1103 		/* address pair not changed. */
1104 		mutex_exit(&sc->ipsec_lock);
1105 		encap_lock_exit();
1106 		kmem_free(nvar, sizeof(*nvar));
1107 		kmem_free(nullvar, sizeof(*nullvar));
1108 		return;
1109 	}
1110 
1111 	/*
1112 	 * (2-1) Create variant for deleted status.
1113 	 */
1114 	if_ipsec_copy_variant(nvar, ovar);
1115 	if_ipsec_clear_config(nvar);
1116 	psref_target_init(&nvar->iv_psref, iv_psref_class);
1117 
1118 	/*
1119 	 * (2-2) Create temporary "null" variant used to avoid to access
1120 	 *       dangling variant while SPs are deleted and added.
1121 	 */
1122 	if_ipsec_copy_variant(nullvar, ovar);
1123 	if_ipsec_clear_config(nullvar);
1124 	psref_target_init(&nullvar->iv_psref, iv_psref_class);
1125 	membar_producer();
1126 	/*
1127 	 * (2-3) Swap variant include its SPs.
1128 	 */
1129 	/* if_ipsec_update_variant() does not fail when delete SP only. */
1130 	(void)if_ipsec_update_variant(sc, nvar, nullvar);
1131 
1132 	mutex_exit(&sc->ipsec_lock);
1133 
1134 	/*
1135 	 * (2-4) Cleanup last configurations.
1136 	 */
1137 	if (if_ipsec_variant_is_configured(ovar))
1138 		if_ipsec_encap_detach(ovar);
1139 	encap_lock_exit();
1140 
1141 	sockaddr_free(osrc);
1142 	sockaddr_free(odst);
1143 	kmem_free(ovar, sizeof(*ovar));
1144 	kmem_free(nullvar, sizeof(*nullvar));
1145 }
1146 
/*
 * Check whether the IFF_NAT_T and IFF_FWD_IPV6 flags have changed, and
 * update the SPs if needed.
 *     (1) check
 *         (1-1) Check whether the flags have changed.
 *         (1-2) Check the current src and dst address pair. If they are
 *               null, the ipsec(4) I/F has been deletetunnel'ed, so no
 *               update is needed.
 *     (2) update
 *         (2-1) Create a variant for the new SPs.
 *         (2-2) Create a temporary "null" variant, used to avoid accessing
 *               a dangling variant while SPs are deleted and added.
 *               NOTE:
 *               This is the same problem as in if_ipsec_delete_tunnel().
 *         (2-3) Swap the variant, including its SPs.
 *         (2-4) Clean up unused configurations.
 *               NOTE: the same encap_cookies are used.
 */
1164 static int
1165 if_ipsec_ensure_flags(struct ifnet *ifp, short oflags)
1166 {
1167 	struct ipsec_softc *sc = ifp->if_softc;
1168 	struct ipsec_variant *ovar, *nvar, *nullvar;
1169 	int error;
1170 
1171 	/*
1172 	 * (1) Check flags are changed.
1173 	 */
1174 	if ((oflags & (IFF_NAT_T|IFF_FWD_IPV6)) ==
1175 	    (ifp->if_flags & (IFF_NAT_T|IFF_FWD_IPV6)))
1176 		return 0; /* flags not changed. */
1177 
1178 	error = encap_lock_enter();
1179 	if (error)
1180 		return error;
1181 
1182 	nvar = kmem_zalloc(sizeof(*nvar), KM_SLEEP);
1183 	nullvar = kmem_zalloc(sizeof(*nullvar), KM_SLEEP);
1184 
1185 	mutex_enter(&sc->ipsec_lock);
1186 
1187 	ovar = sc->ipsec_var;
1188 	/*
1189 	 * (1-2) Check current src and dst address pair.
1190 	 */
1191 	if (if_ipsec_variant_is_unconfigured(ovar)) {
1192 		/* nothing to do */
1193 		mutex_exit(&sc->ipsec_lock);
1194 		encap_lock_exit();
1195 		kmem_free(nvar, sizeof(*nvar));
1196 		kmem_free(nullvar, sizeof(*nullvar));
1197 		return 0;
1198 	}
1199 
1200 	/*
1201 	 * (2-1) Create variant for new SPs.
1202 	 */
1203 	if_ipsec_copy_variant(nvar, ovar);
1204 	psref_target_init(&nvar->iv_psref, iv_psref_class);
1205 	/*
1206 	 * (2-2) Create temporary "null" variant used to avoid to access
1207 	 *       dangling variant while SPs are deleted and added.
1208 	 */
1209 	if_ipsec_copy_variant(nullvar, ovar);
1210 	if_ipsec_clear_config(nullvar);
1211 	psref_target_init(&nullvar->iv_psref, iv_psref_class);
1212 	membar_producer();
1213 	/*
1214 	 * (2-3) Swap variant include its SPs.
1215 	 */
1216 	error = if_ipsec_update_variant(sc, nvar, nullvar);
1217 
1218 	mutex_exit(&sc->ipsec_lock);
1219 	encap_lock_exit();
1220 
1221 	/*
1222 	 * (2-4) Cleanup unused configurations.
1223 	 */
1224 	if (!error)
1225 		kmem_free(ovar, sizeof(*ovar));
1226 	else
1227 		kmem_free(nvar, sizeof(*ovar));
1228 	kmem_free(nullvar, sizeof(*nullvar));
1229 
1230 	return error;
1231 }
1232 
1233 /*
1234  * SPD management
1235  */
1236 
1237 /*
1238  * Share SP set with other NAT-T ipsec(4) I/F(s).
1239  *     Return 1, when "var" shares SP set.
1240  *     Return 0, when "var" cannot share SP set.
1241  *
1242  * NOTE:
1243  * if_ipsec_share_sp() and if_ipsec_unshare_sp() would require global lock
1244  * to exclude other ipsec(4) I/Fs set_tunnel/delete_tunnel. E.g. when ipsec0
1245  * and ipsec1 can share SP set, running ipsec0's set_tunnel and ipsec1's
1246  * set_tunnel causes race.
1247  * Currently, (fortunately) encap_lock works as this global lock.
1248  */
1249 static int
1250 if_ipsec_share_sp(struct ipsec_variant *var)
1251 {
1252 	struct ipsec_softc *sc = var->iv_softc;
1253 	struct ipsec_softc *sc2;
1254 	struct ipsec_variant *var2;
1255 	struct psref psref;
1256 
1257 	KASSERT(encap_lock_held());
1258 	KASSERT(var->iv_psrc != NULL && var->iv_pdst != NULL);
1259 
1260 	mutex_enter(&ipsec_softcs.lock);
1261 	LIST_FOREACH(sc2, &ipsec_softcs.list, ipsec_list) {
1262 		if (sc2 == sc)
1263 			continue;
1264 		var2 = if_ipsec_getref_variant(sc2, &psref);
1265 		if (if_ipsec_variant_is_unconfigured(var2)) {
1266 			if_ipsec_putref_variant(var2, &psref);
1267 			continue;
1268 		}
1269 		if (sockaddr_cmp(var2->iv_pdst, var->iv_pdst) != 0 ||
1270 		    sockaddr_cmp(var2->iv_psrc, var->iv_psrc) != 0) {
1271 			if_ipsec_putref_variant(var2, &psref);
1272 			continue;
1273 		}
1274 
1275 		break;
1276 	}
1277 	mutex_exit(&ipsec_softcs.lock);
1278 	if (sc2 == NULL)
1279 		return 0; /* not shared */
1280 
1281 	IV_SP_IN(var) = IV_SP_IN(var2);
1282 	IV_SP_IN6(var) = IV_SP_IN6(var2);
1283 	IV_SP_OUT(var) = IV_SP_OUT(var2);
1284 	IV_SP_OUT6(var) = IV_SP_OUT6(var2);
1285 
1286 	if_ipsec_putref_variant(var2, &psref);
1287 	return 1; /* shared */
1288 }
1289 
1290 /*
1291  * Unshare SP set with other NAT-T ipsec(4) I/F(s).
1292  *     Return 1, when "var" shared SP set, and then unshare them.
1293  *     Return 0, when "var" did not share SP set.
1294  *
1295  * NOTE:
1296  * See if_ipsec_share_sp()'s note.
1297  */
1298 static int
1299 if_ipsec_unshare_sp(struct ipsec_variant *var)
1300 {
1301 	struct ipsec_softc *sc = var->iv_softc;
1302 	struct ipsec_softc *sc2;
1303 	struct ipsec_variant *var2;
1304 	struct psref psref;
1305 
1306 	KASSERT(encap_lock_held());
1307 
1308 	if (!var->iv_pdst || !var->iv_psrc)
1309 		return 0;
1310 
1311 	mutex_enter(&ipsec_softcs.lock);
1312 	LIST_FOREACH(sc2, &ipsec_softcs.list, ipsec_list) {
1313 		if (sc2 == sc)
1314 			continue;
1315 		var2 = if_ipsec_getref_variant(sc2, &psref);
1316 		if (!var2->iv_pdst || !var2->iv_psrc) {
1317 			if_ipsec_putref_variant(var2, &psref);
1318 			continue;
1319 		}
1320 		if (sockaddr_cmp(var2->iv_pdst, var->iv_pdst) != 0 ||
1321 		    sockaddr_cmp(var2->iv_psrc, var->iv_psrc) != 0) {
1322 			if_ipsec_putref_variant(var2, &psref);
1323 			continue;
1324 		}
1325 
1326 		break;
1327 	}
1328 	mutex_exit(&ipsec_softcs.lock);
1329 	if (sc2 == NULL)
1330 		return 0; /* not shared */
1331 
1332 	IV_SP_IN(var) = NULL;
1333 	IV_SP_IN6(var) = NULL;
1334 	IV_SP_OUT(var) = NULL;
1335 	IV_SP_OUT6(var) = NULL;
1336 	if_ipsec_putref_variant(var2, &psref);
1337 	return 1; /* shared */
1338 }
1339 
1340 static inline void
1341 if_ipsec_add_mbuf_optalign(struct mbuf *m0, void *data, size_t len, bool align)
1342 {
1343 	struct mbuf *m;
1344 
1345 	MGET(m, M_WAIT, MT_DATA);
1346 	if (align) {
1347 		m->m_len = PFKEY_ALIGN8(len);
1348 		memset(mtod(m, void *), 0, m->m_len);
1349 	} else
1350 		m->m_len = len;
1351 	m_copyback(m, 0, len, data);
1352 	m_cat(m0, m);
1353 }
1354 
1355 static inline void
1356 if_ipsec_add_mbuf(struct mbuf *m0, void *data, size_t len)
1357 {
1358 
1359 	if_ipsec_add_mbuf_optalign(m0, data, len, true);
1360 }
1361 
1362 static inline void
1363 if_ipsec_add_mbuf_addr_port(struct mbuf *m0, struct sockaddr *addr, in_port_t port, bool align)
1364 {
1365 
1366 	if (port == 0) {
1367 		if_ipsec_add_mbuf_optalign(m0, addr, addr->sa_len, align);
1368 	} else {
1369 		union sockaddr_union addrport_u;
1370 		struct sockaddr *addrport = &addrport_u.sa;
1371 
1372 		if_ipsec_set_addr_port(addrport, addr, port);
1373 		if_ipsec_add_mbuf_optalign(m0, addrport, addrport->sa_len, align);
1374 	}
1375 }
1376 
1377 static inline void
1378 if_ipsec_add_pad(struct mbuf *m0, size_t len)
1379 {
1380 	struct mbuf *m;
1381 
1382 	if (len == 0)
1383 		return;
1384 
1385 	MGET(m, M_WAIT, MT_DATA);
1386 	m->m_len = len;
1387 	memset(mtod(m, void *), 0, m->m_len);
1388 	m_cat(m0, m);
1389 }
1390 
1391 static inline size_t
1392 if_ipsec_set_sadb_addr(struct sadb_address *saaddr, struct sockaddr *addr,
1393     int proto, uint16_t exttype)
1394 {
1395 	size_t size;
1396 
1397 	KASSERT(saaddr != NULL);
1398 	KASSERT(addr != NULL);
1399 
1400 	size = sizeof(*saaddr) + PFKEY_ALIGN8(addr->sa_len);
1401 	saaddr->sadb_address_len = PFKEY_UNIT64(size);
1402 	saaddr->sadb_address_exttype = exttype;
1403 	saaddr->sadb_address_proto = proto;
1404 	switch (addr->sa_family) {
1405 #ifdef INET
1406 	case AF_INET:
1407 		saaddr->sadb_address_prefixlen = sizeof(struct in_addr) << 3;
1408 		break;
1409 #endif /* INET */
1410 #ifdef INET6
1411 	case AF_INET6:
1412 		saaddr->sadb_address_prefixlen = sizeof(struct in6_addr) << 3;
1413 		break;
1414 #endif /* INET6 */
1415 	default:
1416 		log(LOG_DEBUG,
1417 		    "%s: Invalid address family: %d.\n",
1418 		    __func__, addr->sa_family);
1419 		break;
1420 	}
1421 	saaddr->sadb_address_reserved = 0;
1422 
1423 	return size;
1424 }
1425 
1426 static inline size_t
1427 if_ipsec_set_sadb_src(struct sadb_address *sasrc, struct sockaddr *src,
1428     int proto)
1429 {
1430 
1431 	return if_ipsec_set_sadb_addr(sasrc, src, proto,
1432 	    SADB_EXT_ADDRESS_SRC);
1433 }
1434 
1435 static inline size_t
1436 if_ipsec_set_sadb_dst(struct sadb_address *sadst, struct sockaddr *dst,
1437     int proto)
1438 {
1439 
1440 	return if_ipsec_set_sadb_addr(sadst, dst, proto,
1441 	    SADB_EXT_ADDRESS_DST);
1442 }
1443 
1444 static inline size_t
1445 if_ipsec_set_sadb_x_policy(struct sadb_x_policy *xpl,
1446     struct sadb_x_ipsecrequest *xisr, uint16_t policy, uint8_t dir, uint32_t id,
1447     uint8_t level, struct sockaddr *src, struct sockaddr *dst)
1448 {
1449 	size_t size;
1450 
1451 	KASSERT(policy != IPSEC_POLICY_IPSEC || xisr != NULL);
1452 
1453 	size = sizeof(*xpl);
1454 	if (policy == IPSEC_POLICY_IPSEC) {
1455 		size += PFKEY_ALIGN8(sizeof(*xisr));
1456 		if (src != NULL && dst != NULL)
1457 			size += PFKEY_ALIGN8(src->sa_len + dst->sa_len);
1458 	}
1459 	xpl->sadb_x_policy_len = PFKEY_UNIT64(size);
1460 	xpl->sadb_x_policy_exttype = SADB_X_EXT_POLICY;
1461 	xpl->sadb_x_policy_type = policy;
1462 	xpl->sadb_x_policy_dir = dir;
1463 	xpl->sadb_x_policy_reserved = 0;
1464 	xpl->sadb_x_policy_id = id;
1465 	xpl->sadb_x_policy_reserved2 = 0;
1466 
1467 	if (policy == IPSEC_POLICY_IPSEC) {
1468 		xisr->sadb_x_ipsecrequest_len = PFKEY_ALIGN8(sizeof(*xisr));
1469 		if (src != NULL && dst != NULL)
1470 			xisr->sadb_x_ipsecrequest_len +=
1471 				PFKEY_ALIGN8(src->sa_len + dst->sa_len);
1472 		xisr->sadb_x_ipsecrequest_proto = IPPROTO_ESP;
1473 		xisr->sadb_x_ipsecrequest_mode = IPSEC_MODE_TRANSPORT;
1474 		xisr->sadb_x_ipsecrequest_level = level;
1475 		xisr->sadb_x_ipsecrequest_reqid = key_newreqid();
1476 	}
1477 
1478 	return size;
1479 }
1480 
1481 static inline void
1482 if_ipsec_set_sadb_msg(struct sadb_msg *msg, uint16_t extlen, uint8_t msgtype)
1483 {
1484 
1485 	KASSERT(msg != NULL);
1486 
1487 	msg->sadb_msg_version = PF_KEY_V2;
1488 	msg->sadb_msg_type = msgtype;
1489 	msg->sadb_msg_errno = 0;
1490 	msg->sadb_msg_satype = SADB_SATYPE_UNSPEC;
1491 	msg->sadb_msg_len = PFKEY_UNIT64(sizeof(*msg)) + extlen;
1492 	msg->sadb_msg_reserved = 0;
1493 	msg->sadb_msg_seq = 0; /* XXXX */
1494 	msg->sadb_msg_pid = 0; /* XXXX */
1495 }
1496 
1497 static inline void
1498 if_ipsec_set_sadb_msg_add(struct sadb_msg *msg, uint16_t extlen)
1499 {
1500 
1501 	if_ipsec_set_sadb_msg(msg, extlen, SADB_X_SPDADD);
1502 }
1503 
1504 static inline void
1505 if_ipsec_set_sadb_msg_del(struct sadb_msg *msg, uint16_t extlen)
1506 {
1507 
1508 	if_ipsec_set_sadb_msg(msg, extlen, SADB_X_SPDDELETE2);
1509 }
1510 
1511 static int
1512 if_ipsec_set_addr_port(struct sockaddr *addrport, struct sockaddr *addr,
1513     in_port_t port)
1514 {
1515 	int error = 0;
1516 
1517 	sockaddr_copy(addrport, addr->sa_len, addr);
1518 
1519 	switch (addr->sa_family) {
1520 #ifdef INET
1521 	case AF_INET: {
1522 		struct sockaddr_in *sin = satosin(addrport);
1523 		sin->sin_port = port;
1524 		break;
1525 	}
1526 #endif /* INET */
1527 #ifdef INET6
1528 	case AF_INET6: {
1529 		struct sockaddr_in6 *sin6 = satosin6(addrport);
1530 		sin6->sin6_port = port;
1531 		break;
1532 	}
1533 #endif /* INET6 */
1534 	default:
1535 		log(LOG_DEBUG,
1536 		    "%s: Invalid address family: %d.\n",
1537 		    __func__, addr->sa_family);
1538 		error = EINVAL;
1539 	}
1540 
1541 	return error;
1542 }
1543 
1544 static struct secpolicy *
1545 if_ipsec_add_sp0(struct sockaddr *src, in_port_t sport,
1546     struct sockaddr *dst, in_port_t dport,
1547     int dir, int proto, int level, u_int policy)
1548 {
1549 	struct sadb_msg msg;
1550 	struct sadb_address xsrc, xdst;
1551 	struct sadb_x_policy xpl;
1552 	struct sadb_x_ipsecrequest xisr;
1553 	size_t size;
1554 	size_t padlen;
1555 	uint16_t ext_msg_len = 0;
1556 	struct mbuf *m;
1557 
1558 	memset(&msg, 0, sizeof(msg));
1559 	memset(&xsrc, 0, sizeof(xsrc));
1560 	memset(&xdst, 0, sizeof(xdst));
1561 	memset(&xpl, 0, sizeof(xpl));
1562 	memset(&xisr, 0, sizeof(xisr));
1563 
1564 	MGETHDR(m, M_WAIT, MT_DATA);
1565 
1566 	size = if_ipsec_set_sadb_src(&xsrc, src, proto);
1567 	ext_msg_len += PFKEY_UNIT64(size);
1568 	size = if_ipsec_set_sadb_dst(&xdst, dst, proto);
1569 	ext_msg_len += PFKEY_UNIT64(size);
1570 	size = if_ipsec_set_sadb_x_policy(&xpl, &xisr, policy, dir, 0, level, src, dst);
1571 	ext_msg_len += PFKEY_UNIT64(size);
1572 	if_ipsec_set_sadb_msg_add(&msg, ext_msg_len);
1573 
1574 	/* build PF_KEY message */
1575 
1576 	m->m_len = sizeof(msg);
1577 	m_copyback(m, 0, sizeof(msg), &msg);
1578 
1579 	if_ipsec_add_mbuf(m, &xsrc, sizeof(xsrc));
1580 	/*
1581 	 * secpolicy.spidx.{src, dst} must not be set port number,
1582 	 * even if it is used for NAT-T.
1583 	 */
1584 	if_ipsec_add_mbuf_addr_port(m, src, 0, true);
1585 	padlen = PFKEY_UNUNIT64(xsrc.sadb_address_len)
1586 		- (sizeof(xsrc) + PFKEY_ALIGN8(src->sa_len));
1587 	if_ipsec_add_pad(m, padlen);
1588 
1589 	if_ipsec_add_mbuf(m, &xdst, sizeof(xdst));
1590 	/* ditto */
1591 	if_ipsec_add_mbuf_addr_port(m, dst, 0, true);
1592 	padlen = PFKEY_UNUNIT64(xdst.sadb_address_len)
1593 		- (sizeof(xdst) + PFKEY_ALIGN8(dst->sa_len));
1594 	if_ipsec_add_pad(m, padlen);
1595 
1596 	if_ipsec_add_mbuf(m, &xpl, sizeof(xpl));
1597 	padlen = PFKEY_UNUNIT64(xpl.sadb_x_policy_len) - sizeof(xpl);
1598 	if (policy == IPSEC_POLICY_IPSEC) {
1599 		if_ipsec_add_mbuf(m, &xisr, sizeof(xisr));
1600 		padlen -= PFKEY_ALIGN8(sizeof(xisr));
1601 	}
1602 	if_ipsec_add_pad(m, padlen);
1603 
1604 	/* key_kpi_spdadd() has already done KEY_SP_REF(). */
1605 	return key_kpi_spdadd(m);
1606 }
1607 
1608 static int
1609 if_ipsec_add_sp(struct ipsec_variant *var,
1610     struct sockaddr *src, in_port_t sport,
1611     struct sockaddr *dst, in_port_t dport)
1612 {
1613 	struct ipsec_softc *sc = var->iv_softc;
1614 	int level;
1615 	u_int v6policy;
1616 
1617 	/*
1618 	 * must delete sp before add it.
1619 	 */
1620 	KASSERT(IV_SP_IN(var) == NULL);
1621 	KASSERT(IV_SP_OUT(var) == NULL);
1622 	KASSERT(IV_SP_IN6(var) == NULL);
1623 	KASSERT(IV_SP_OUT6(var) == NULL);
1624 
1625 	/*
1626 	 * can be shared?
1627 	 */
1628 	if (if_ipsec_share_sp(var))
1629 		return 0;
1630 
1631 	if (if_ipsec_nat_t(sc))
1632 		level = IPSEC_LEVEL_REQUIRE;
1633 	else
1634 		level = IPSEC_LEVEL_UNIQUE;
1635 
1636 	if (if_ipsec_fwd_ipv6(sc))
1637 		v6policy = IPSEC_POLICY_IPSEC;
1638 	else
1639 		v6policy = IPSEC_POLICY_DISCARD;
1640 
1641 	IV_SP_IN(var) = if_ipsec_add_sp0(dst, dport, src, sport,
1642 	    IPSEC_DIR_INBOUND, IPPROTO_IPIP, level, IPSEC_POLICY_IPSEC);
1643 	if (IV_SP_IN(var) == NULL)
1644 		goto fail;
1645 	IV_SP_OUT(var) = if_ipsec_add_sp0(src, sport, dst, dport,
1646 	    IPSEC_DIR_OUTBOUND, IPPROTO_IPIP, level, IPSEC_POLICY_IPSEC);
1647 	if (IV_SP_OUT(var) == NULL)
1648 		goto fail;
1649 	IV_SP_IN6(var) = if_ipsec_add_sp0(dst, dport, src, sport,
1650 	    IPSEC_DIR_INBOUND, IPPROTO_IPV6, level, v6policy);
1651 	if (IV_SP_IN6(var) == NULL)
1652 		goto fail;
1653 	IV_SP_OUT6(var) = if_ipsec_add_sp0(src, sport, dst, dport,
1654 	    IPSEC_DIR_OUTBOUND, IPPROTO_IPV6, level, v6policy);
1655 	if (IV_SP_OUT6(var) == NULL)
1656 		goto fail;
1657 
1658 	return 0;
1659 
1660 fail:
1661 	if (IV_SP_IN6(var) != NULL) {
1662 		if_ipsec_del_sp0(IV_SP_IN6(var));
1663 		IV_SP_IN6(var) = NULL;
1664 	}
1665 	if (IV_SP_OUT(var) != NULL) {
1666 		if_ipsec_del_sp0(IV_SP_OUT(var));
1667 		IV_SP_OUT(var) = NULL;
1668 	}
1669 	if (IV_SP_IN(var) != NULL) {
1670 		if_ipsec_del_sp0(IV_SP_IN(var));
1671 		IV_SP_IN(var) = NULL;
1672 	}
1673 
1674 	return EEXIST;
1675 }
1676 
1677 static int
1678 if_ipsec_del_sp0(struct secpolicy *sp)
1679 {
1680 	struct sadb_msg msg;
1681 	struct sadb_x_policy xpl;
1682 	size_t size;
1683 	uint16_t ext_msg_len = 0;
1684 	int error;
1685 	struct mbuf *m;
1686 
1687 	if (sp == NULL)
1688 		return 0;
1689 
1690 	memset(&msg, 0, sizeof(msg));
1691 	memset(&xpl, 0, sizeof(xpl));
1692 
1693 	MGETHDR(m, M_WAIT, MT_DATA);
1694 
1695 	size = if_ipsec_set_sadb_x_policy(&xpl, NULL, 0, 0, sp->id, 0, NULL, NULL);
1696 	ext_msg_len += PFKEY_UNIT64(size);
1697 
1698 	if_ipsec_set_sadb_msg_del(&msg, ext_msg_len);
1699 
1700 	m->m_len = sizeof(msg);
1701 	m_copyback(m, 0, sizeof(msg), &msg);
1702 
1703 	if_ipsec_add_mbuf(m, &xpl, sizeof(xpl));
1704 
1705 	/*  unreference correspond to key_kpi_spdadd(). */
1706 	KEY_SP_UNREF(&sp);
1707 	error = key_kpi_spddelete2(m);
1708 	if (error != 0) {
1709 		log(LOG_ERR, "%s: cannot delete SP(ID=%u) (error=%d).\n",
1710 		    __func__, sp->id, error);
1711 	}
1712 	return error;
1713 }
1714 
1715 static void
1716 if_ipsec_del_sp(struct ipsec_variant *var)
1717 {
1718 
1719 	/* are the SPs shared? */
1720 	if (if_ipsec_unshare_sp(var))
1721 		return;
1722 
1723 	(void)if_ipsec_del_sp0(IV_SP_OUT(var));
1724 	(void)if_ipsec_del_sp0(IV_SP_IN(var));
1725 	(void)if_ipsec_del_sp0(IV_SP_OUT6(var));
1726 	(void)if_ipsec_del_sp0(IV_SP_IN6(var));
1727 	IV_SP_IN(var) = NULL;
1728 	IV_SP_IN6(var) = NULL;
1729 	IV_SP_OUT(var) = NULL;
1730 	IV_SP_OUT6(var) = NULL;
1731 }
1732 
1733 static int
1734 if_ipsec_replace_sp(struct ipsec_softc *sc, struct ipsec_variant *ovar,
1735     struct ipsec_variant *nvar)
1736 {
1737 	in_port_t src_port = 0;
1738 	in_port_t dst_port = 0;
1739 	struct sockaddr *src;
1740 	struct sockaddr *dst;
1741 	int error = 0;
1742 
1743 	KASSERT(mutex_owned(&sc->ipsec_lock));
1744 
1745 	if_ipsec_del_sp(ovar);
1746 
1747 	src = nvar->iv_psrc;
1748 	dst = nvar->iv_pdst;
1749 	if (if_ipsec_nat_t(sc)) {
1750 		/* NAT-T enabled */
1751 		src_port = nvar->iv_sport;
1752 		dst_port = nvar->iv_dport;
1753 	}
1754 	if (src && dst)
1755 		error = if_ipsec_add_sp(nvar, src, src_port, dst, dst_port);
1756 
1757 	return error;
1758 }
1759 
/*
 * ipsec_variant and its SPs update API.
 *
 * Assumption:
 * the reader side dereferences sc->ipsec_var only inside a reader critical
 * section, that is, no reader reads the sc->ipsec_var after
 * pserialize_perform().
 */
1768 static int
1769 if_ipsec_update_variant(struct ipsec_softc *sc, struct ipsec_variant *nvar,
1770     struct ipsec_variant *nullvar)
1771 {
1772 	struct ifnet *ifp = &sc->ipsec_if;
1773 	struct ipsec_variant *ovar = sc->ipsec_var;
1774 	int error;
1775 
1776 	KASSERT(mutex_owned(&sc->ipsec_lock));
1777 
1778 	/*
1779 	 * To keep consistency between ipsec(4) I/F settings and SPs,
1780 	 * we stop packet processing while replacing SPs, that is, we set
1781 	 * "null" config variant to sc->ipsec_var.
1782 	 */
1783 	sc->ipsec_var = nullvar;
1784 	pserialize_perform(sc->ipsec_psz);
1785 	psref_target_destroy(&ovar->iv_psref, iv_psref_class);
1786 
1787 	error = if_ipsec_replace_sp(sc, ovar, nvar);
1788 	if (!error)
1789 		sc->ipsec_var = nvar;
1790 	else {
1791 		sc->ipsec_var = ovar; /* rollback */
1792 		psref_target_init(&ovar->iv_psref, iv_psref_class);
1793 	}
1794 
1795 	pserialize_perform(sc->ipsec_psz);
1796 	psref_target_destroy(&nullvar->iv_psref, iv_psref_class);
1797 
1798 	if (if_ipsec_variant_is_configured(sc->ipsec_var))
1799 		ifp->if_flags |= IFF_RUNNING;
1800 	else
1801 		ifp->if_flags &= ~IFF_RUNNING;
1802 
1803 	return error;
1804 }
1805