xref: /netbsd-src/sys/netinet/ip_encap.c (revision d909946ca08dceb44d7d0f22ec9488679695d976)
1 /*	$NetBSD: ip_encap.c,v 1.61 2016/07/04 04:40:13 knakahara Exp $	*/
2 /*	$KAME: ip_encap.c,v 1.73 2001/10/02 08:30:58 itojun Exp $	*/
3 
4 /*
5  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the project nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 /*
33  * My grandfather said that there's a devil inside tunnelling technology...
34  *
35  * We have surprisingly many protocols that want packets with IP protocol
36  * #4 or #41.  Here's a list of protocols that want protocol #41:
37  *	RFC1933 configured tunnel
38  *	RFC1933 automatic tunnel
39  *	RFC2401 IPsec tunnel
40  *	RFC2473 IPv6 generic packet tunnelling
41  *	RFC2529 6over4 tunnel
42  *	RFC3056 6to4 tunnel
43  *	isatap tunnel
44  *	mobile-ip6 (uses RFC2473)
45  * Here's a list of protocols that want protocol #4:
46  *	RFC1853 IPv4-in-IPv4 tunnelling
47  *	RFC2003 IPv4 encapsulation within IPv4
48  *	RFC2344 reverse tunnelling for mobile-ip4
49  *	RFC2401 IPsec tunnel
50  * Well, what can I say.  They each impose a different en/decapsulation
51  * mechanism, so they need separate protocol handlers.  The only one
52  * we can easily identify by protocol # is IPsec, which always has an
53  * AH/ESP/IPComp header right after the outer IP header.
54  *
55  * So, clearly good old protosw does not work for protocols #4 and #41.
56  * The code here lets you demultiplex these protocols by src/dst address pair.
57  */
58 /* XXX is M_NETADDR correct? */
59 
60 /*
61  * With USE_RADIX the code uses a radix table for tunnel lookups, for
62  * tunnels registered via encap_attach() with an addr/mask pair.  This is
63  * faster on machines with thousands of tunnel registrations (= interfaces).
64  *
65  * The code assumes that the radix table code can handle non-contiguous
66  * netmasks, since the key/mask passed in is a (src + dst) sockaddr pair.
67  */
68 #define USE_RADIX
69 
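/*
 * Illustrative sketch (hypothetical, uncompiled): the key and mask handed
 * to the radix routines are a (src + dst) sockaddr pair packed into a
 * struct ip_pack4/ip_pack6, built much like encap_attach() and
 * encap[46]_lookup() below build theirs.  Because srcmask and dstmask sit
 * back to back with zeroed sockaddr framing bytes in between, the set bits
 * of the combined mask need not form one contiguous prefix.
 */
#if 0
static void
encap_mask_layout_example(void)
{
	struct ip_pack4 maskpack;

	memset(&maskpack, 0, sizeof(maskpack));
	maskpack.p.sp_len = sizeof(maskpack);
	maskpack.mine.sin_addr.s_addr = htonl(0xffffffff);	/* srcmask /32 */
	maskpack.yours.sin_addr.s_addr = htonl(0xffffff00);	/* dstmask /24 */
	/*
	 * The all-ones srcmask and the dstmask are separated by the zeroed
	 * header bytes of "yours", so the combined netmask is non-contiguous.
	 */
}
#endif
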
70 #include <sys/cdefs.h>
71 __KERNEL_RCSID(0, "$NetBSD: ip_encap.c,v 1.61 2016/07/04 04:40:13 knakahara Exp $");
72 
73 #ifdef _KERNEL_OPT
74 #include "opt_mrouting.h"
75 #include "opt_inet.h"
76 #include "opt_net_mpsafe.h"
77 #endif
78 
79 #include <sys/param.h>
80 #include <sys/systm.h>
81 #include <sys/socket.h>
82 #include <sys/sockio.h>
83 #include <sys/mbuf.h>
84 #include <sys/errno.h>
85 #include <sys/queue.h>
86 #include <sys/kmem.h>
87 #include <sys/mutex.h>
88 #include <sys/condvar.h>
89 #include <sys/psref.h>
90 #include <sys/pslist.h>
91 
92 #include <net/if.h>
93 
94 #include <netinet/in.h>
95 #include <netinet/in_systm.h>
96 #include <netinet/ip.h>
97 #include <netinet/ip_var.h>
98 #include <netinet/ip_encap.h>
99 #ifdef MROUTING
100 #include <netinet/ip_mroute.h>
101 #endif /* MROUTING */
102 
103 #ifdef INET6
104 #include <netinet/ip6.h>
105 #include <netinet6/ip6_var.h>
106 #include <netinet6/ip6protosw.h> /* for struct ip6ctlparam */
107 #include <netinet6/in6_var.h>
108 #include <netinet6/in6_pcb.h>
109 #include <netinet/icmp6.h>
110 #endif
111 
112 #include <net/net_osdep.h>
113 
114 #ifdef NET_MPSAFE
115 #define ENCAP_MPSAFE	1
116 #endif
117 
118 enum direction { INBOUND, OUTBOUND };
119 
120 #ifdef INET
121 static struct encaptab *encap4_lookup(struct mbuf *, int, int, enum direction,
122     struct psref *);
123 #endif
124 #ifdef INET6
125 static struct encaptab *encap6_lookup(struct mbuf *, int, int, enum direction,
126     struct psref *);
127 #endif
128 static int encap_add(struct encaptab *);
129 static int encap_remove(struct encaptab *);
130 static int encap_afcheck(int, const struct sockaddr *, const struct sockaddr *);
131 #ifdef USE_RADIX
132 static struct radix_node_head *encap_rnh(int);
133 static int mask_matchlen(const struct sockaddr *);
134 #else
135 static int mask_match(const struct encaptab *, const struct sockaddr *,
136 		const struct sockaddr *);
137 #endif
138 static void encap_fillarg(struct mbuf *, const struct encaptab *);
139 
140 /*
141  * In encap[46]_lookup(), ep->func can sleep (e.g. rtalloc1) while walking
142  * encap_table, so pserialize_read_enter() alone cannot protect the walk.
143  */
144 static struct {
145 	struct pslist_head	list;
146 	pserialize_t		psz;
147 	struct psref_class	*elem_class; /* for the elements of encap_table */
148 } encaptab  __cacheline_aligned = {
149 	.list = PSLIST_INITIALIZER,
150 };
151 #define encap_table encaptab.list
152 
153 static struct {
154 	kmutex_t	lock;
155 	kcondvar_t	cv;
156 	struct lwp	*busy;
157 } encap_whole __cacheline_aligned;
158 
159 #ifdef USE_RADIX
160 struct radix_node_head *encap_head[2];	/* 0 for AF_INET, 1 for AF_INET6 */
161 static bool encap_head_updating = false;
162 #endif
163 
164 /*
165  * Must be called before the encap-using interfaces are initialized.
166  */
167 void
168 encapinit(void)
169 {
170 
171 	encaptab.psz = pserialize_create();
172 	encaptab.elem_class = psref_class_create("encapelem", IPL_SOFTNET);
173 	if (encaptab.elem_class == NULL)
174 		panic("encaptab.elem_class cannot be allocated.\n");
175 
176 	mutex_init(&encap_whole.lock, MUTEX_DEFAULT, IPL_NONE);
177 	cv_init(&encap_whole.cv, "ip_encap cv");
178 	encap_whole.busy = NULL;
179 }
180 
181 void
182 encap_init(void)
183 {
184 	static int initialized = 0;
185 
186 	if (initialized)
187 		return;
188 	initialized++;
189 #if 0
190 	/*
191 	 * We cannot use PSLIST_INIT() here, since drivers may want to call
192 	 * encap_attach() at driver attach time.  encap_init() is called
193 	 * on AF_INET{,6} initialization, which happens after driver
194 	 * initialization - using PSLIST_INIT() here would wipe out entries
195 	 * already registered by drivers.
196 	 */
197 	PSLIST_INIT(&encap_table);
198 #endif
199 
200 #ifdef USE_RADIX
201 	/*
202 	 * Initialize the radix lookup tables once the radix subsystem is up.
203 	 */
204 	rn_delayedinit((void *)&encap_head[0],
205 	    sizeof(struct sockaddr_pack) << 3);
206 #ifdef INET6
207 	rn_delayedinit((void *)&encap_head[1],
208 	    sizeof(struct sockaddr_pack) << 3);
209 #endif
210 #endif
211 }
212 
213 #ifdef INET
214 static struct encaptab *
215 encap4_lookup(struct mbuf *m, int off, int proto, enum direction dir,
216     struct psref *match_psref)
217 {
218 	struct ip *ip;
219 	struct ip_pack4 pack;
220 	struct encaptab *ep, *match;
221 	int prio, matchprio;
222 	int s;
223 #ifdef USE_RADIX
224 	struct radix_node_head *rnh = encap_rnh(AF_INET);
225 	struct radix_node *rn;
226 #endif
227 
228 	KASSERT(m->m_len >= sizeof(*ip));
229 
230 	ip = mtod(m, struct ip *);
231 
232 	memset(&pack, 0, sizeof(pack));
233 	pack.p.sp_len = sizeof(pack);
234 	pack.mine.sin_family = pack.yours.sin_family = AF_INET;
235 	pack.mine.sin_len = pack.yours.sin_len = sizeof(struct sockaddr_in);
236 	if (dir == INBOUND) {
237 		pack.mine.sin_addr = ip->ip_dst;
238 		pack.yours.sin_addr = ip->ip_src;
239 	} else {
240 		pack.mine.sin_addr = ip->ip_src;
241 		pack.yours.sin_addr = ip->ip_dst;
242 	}
243 
244 	match = NULL;
245 	matchprio = 0;
246 
247 	s = pserialize_read_enter();
248 #ifdef USE_RADIX
249 	if (encap_head_updating) {
250 		/*
251 		 * Update in progress. Do nothing.
252 		 */
253 		pserialize_read_exit(s);
254 		return NULL;
255 	}
256 
257 	rn = rnh->rnh_matchaddr((void *)&pack, rnh);
258 	if (rn && (rn->rn_flags & RNF_ROOT) == 0) {
259 		struct encaptab *encapp = (struct encaptab *)rn;
260 
261 		psref_acquire(match_psref, &encapp->psref,
262 		    encaptab.elem_class);
263 		match = encapp;
264 		matchprio = mask_matchlen(match->srcmask) +
265 		    mask_matchlen(match->dstmask);
266 	}
267 #endif
268 	PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) {
269 		struct psref elem_psref;
270 
271 		membar_datadep_consumer();
272 
273 		if (ep->af != AF_INET)
274 			continue;
275 		if (ep->proto >= 0 && ep->proto != proto)
276 			continue;
277 
278 		psref_acquire(&elem_psref, &ep->psref,
279 		    encaptab.elem_class);
280 		if (ep->func) {
281 			pserialize_read_exit(s);
282 			/* ep->func is sleepable. e.g. rtalloc1 */
283 			prio = (*ep->func)(m, off, proto, ep->arg);
284 			s = pserialize_read_enter();
285 		} else {
286 #ifdef USE_RADIX
287 			psref_release(&elem_psref, &ep->psref,
288 			    encaptab.elem_class);
289 			continue;
290 #else
291 			prio = mask_match(ep, (struct sockaddr *)&pack.mine,
292 			    (struct sockaddr *)&pack.yours);
293 #endif
294 		}
295 
296 		/*
297 		 * We prioritize matches by the bit length of the match.
298 		 * mask_match() and user-supplied matching functions should
299 		 * return the bit length of the match (e.g. 64 if both src and
300 		 * dst matched fully for IPv4); 0 or less means "no match".
301 		 * See the sketch after encap_attach_func() for an example.
302 		 *
303 		 * The question is, since we have two "mask" portions, we
304 		 * cannot really define a total order between entries.
305 		 * For example, which of these should be preferred?
306 		 * mask_match() returns 48 (32 + 16) for both of them.
307 		 *	src=3ffe::/16, dst=3ffe:501::/32
308 		 *	src=3ffe:501::/32, dst=3ffe::/16
309 		 *
310 		 * We need to loop through all the possible candidates
311 		 * to get the best match - the search takes O(n) for
312 		 * n attachments (i.e. interfaces).
313 		 *
314 		 * For radix-based lookup, I guess source takes precedence.
315 		 * See rn_{refines,lexobetter} for the correct answer.
316 		 */
317 		if (prio <= 0) {
318 			psref_release(&elem_psref, &ep->psref,
319 			    encaptab.elem_class);
320 			continue;
321 		}
322 		if (prio > matchprio) {
323 			/* release last matched ep */
324 			if (match != NULL)
325 				psref_release(match_psref, &match->psref,
326 				    encaptab.elem_class);
327 
328 			psref_copy(match_psref, &elem_psref,
329 			    encaptab.elem_class);
330 			matchprio = prio;
331 			match = ep;
332 		}
333 		KASSERTMSG((match == NULL) || psref_held(&match->psref,
334 			encaptab.elem_class),
335 		    "current match = %p, but its psref is not held", match);
336 
337 		psref_release(&elem_psref, &ep->psref,
338 		    encaptab.elem_class);
339 	}
340 	pserialize_read_exit(s);
341 
342 	return match;
343 }
344 
345 void
346 encap4_input(struct mbuf *m, ...)
347 {
348 	int off, proto;
349 	va_list ap;
350 	const struct encapsw *esw;
351 	struct encaptab *match;
352 	struct psref match_psref;
353 
354 	va_start(ap, m);
355 	off = va_arg(ap, int);
356 	proto = va_arg(ap, int);
357 	va_end(ap);
358 
359 	match = encap4_lookup(m, off, proto, INBOUND, &match_psref);
360 	if (match) {
361 		/* found a match, "match" has the best one */
362 		esw = match->esw;
363 		if (esw && esw->encapsw4.pr_input) {
364 			encap_fillarg(m, match);
365 			(*esw->encapsw4.pr_input)(m, off, proto);
366 			psref_release(&match_psref, &match->psref,
367 			    encaptab.elem_class);
368 		} else {
369 			psref_release(&match_psref, &match->psref,
370 			    encaptab.elem_class);
371 			m_freem(m);
372 		}
373 		return;
374 	}
375 
376 	/* last resort: inject to raw socket */
377 	rip_input(m, off, proto);
378 }
379 #endif
380 
381 #ifdef INET6
382 static struct encaptab *
383 encap6_lookup(struct mbuf *m, int off, int proto, enum direction dir,
384     struct psref *match_psref)
385 {
386 	struct ip6_hdr *ip6;
387 	struct ip_pack6 pack;
388 	int prio, matchprio;
389 	int s;
390 	struct encaptab *ep, *match;
391 #ifdef USE_RADIX
392 	struct radix_node_head *rnh = encap_rnh(AF_INET6);
393 	struct radix_node *rn;
394 #endif
395 
396 	KASSERT(m->m_len >= sizeof(*ip6));
397 
398 	ip6 = mtod(m, struct ip6_hdr *);
399 
400 	memset(&pack, 0, sizeof(pack));
401 	pack.p.sp_len = sizeof(pack);
402 	pack.mine.sin6_family = pack.yours.sin6_family = AF_INET6;
403 	pack.mine.sin6_len = pack.yours.sin6_len = sizeof(struct sockaddr_in6);
404 	if (dir == INBOUND) {
405 		pack.mine.sin6_addr = ip6->ip6_dst;
406 		pack.yours.sin6_addr = ip6->ip6_src;
407 	} else {
408 		pack.mine.sin6_addr = ip6->ip6_src;
409 		pack.yours.sin6_addr = ip6->ip6_dst;
410 	}
411 
412 	match = NULL;
413 	matchprio = 0;
414 
415 	s = pserialize_read_enter();
416 #ifdef USE_RADIX
417 	if (encap_head_updating) {
418 		/*
419 		 * Update in progress. Do nothing.
420 		 */
421 		pserialize_read_exit(s);
422 		return NULL;
423 	}
424 
425 	rn = rnh->rnh_matchaddr((void *)&pack, rnh);
426 	if (rn && (rn->rn_flags & RNF_ROOT) == 0) {
427 		struct encaptab *encapp = (struct encaptab *)rn;
428 
429 		psref_acquire(match_psref, &encapp->psref,
430 		    encaptab.elem_class);
431 		match = encapp;
432 		matchprio = mask_matchlen(match->srcmask) +
433 		    mask_matchlen(match->dstmask);
434 	}
435 #endif
436 	PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) {
437 		struct psref elem_psref;
438 
439 		membar_datadep_consumer();
440 
441 		if (ep->af != AF_INET6)
442 			continue;
443 		if (ep->proto >= 0 && ep->proto != proto)
444 			continue;
445 
446 		psref_acquire(&elem_psref, &ep->psref,
447 		    encaptab.elem_class);
448 
449 		if (ep->func) {
450 			pserialize_read_exit(s);
451 			/* ep->func is sleepable. e.g. rtalloc1 */
452 			prio = (*ep->func)(m, off, proto, ep->arg);
453 			s = pserialize_read_enter();
454 		} else {
455 #ifdef USE_RADIX
456 			psref_release(&elem_psref, &ep->psref,
457 			    encaptab.elem_class);
458 			continue;
459 #else
460 			prio = mask_match(ep, (struct sockaddr *)&pack.mine,
461 			    (struct sockaddr *)&pack.yours);
462 #endif
463 		}
464 
465 		/* see encap4_lookup() for issues here */
466 		if (prio <= 0) {
467 			psref_release(&elem_psref, &ep->psref,
468 			    encaptab.elem_class);
469 			continue;
470 		}
471 		if (prio > matchprio) {
472 			/* release last matched ep */
473 			if (match != NULL)
474 				psref_release(match_psref, &match->psref,
475 				    encaptab.elem_class);
476 
477 			psref_copy(match_psref, &elem_psref,
478 			    encaptab.elem_class);
479 			matchprio = prio;
480 			match = ep;
481 		}
482 		KASSERTMSG((match == NULL) || psref_held(&match->psref,
483 			encaptab.elem_class),
484 		    "current match = %p, but its psref is not held", match);
485 
486 		psref_release(&elem_psref, &ep->psref,
487 		    encaptab.elem_class);
488 	}
489 	pserialize_read_exit(s);
490 
491 	return match;
492 }
493 
494 int
495 encap6_input(struct mbuf **mp, int *offp, int proto)
496 {
497 	struct mbuf *m = *mp;
498 	const struct encapsw *esw;
499 	struct encaptab *match;
500 	struct psref match_psref;
501 
502 	match = encap6_lookup(m, *offp, proto, INBOUND, &match_psref);
503 
504 	if (match) {
505 		/* found a match */
506 		esw = match->esw;
507 		if (esw && esw->encapsw6.pr_input) {
508 			int ret;
509 			encap_fillarg(m, match);
510 			ret = (*esw->encapsw6.pr_input)(mp, offp, proto);
511 			psref_release(&match_psref, &match->psref,
512 			    encaptab.elem_class);
513 			return ret;
514 		} else {
515 			psref_release(&match_psref, &match->psref,
516 			    encaptab.elem_class);
517 			m_freem(m);
518 			return IPPROTO_DONE;
519 		}
520 	}
521 
522 	/* last resort: inject to raw socket */
523 	return rip6_input(mp, offp, proto);
524 }
525 #endif
526 
527 /*
528  * XXX
529  * The encaptab list and the rnh radix tree must be manipulated atomically.
530  */
531 static int
532 encap_add(struct encaptab *ep)
533 {
534 #ifdef USE_RADIX
535 	struct radix_node_head *rnh = encap_rnh(ep->af);
536 #endif
537 
538 	KASSERT(encap_lock_held());
539 
540 #ifdef USE_RADIX
541 	if (!ep->func && rnh) {
542 		/* Disable access to the radix tree for readers. */
543 		encap_head_updating = true;
544 		/* Wait for all readers to drain. */
545 		pserialize_perform(encaptab.psz);
546 
547 		if (!rnh->rnh_addaddr((void *)ep->addrpack,
548 		    (void *)ep->maskpack, rnh, ep->nodes)) {
549 			encap_head_updating = false;
550 			return EEXIST;
551 		}
552 
553 		/*
554 		 * An ep that lives in the radix tree is skipped while
555 		 * encap[46]_lookup walks the encaptab list.  In other words,
556 		 * encap_add() does not need to care whether the ep has
557 		 * been added to the encaptab list yet.
558 		 * So, we can re-enable access to the radix tree for now.
559 		 */
560 		encap_head_updating = false;
561 	}
562 #endif
563 	PSLIST_WRITER_INSERT_HEAD(&encap_table, ep, chain);
564 
565 	return 0;
566 }
567 
568 /*
569  * XXX
570  * The encaptab list and the rnh radix tree must be manipulated atomically.
571  */
572 static int
573 encap_remove(struct encaptab *ep)
574 {
575 #ifdef USE_RADIX
576 	struct radix_node_head *rnh = encap_rnh(ep->af);
577 #endif
578 	int error = 0;
579 
580 	KASSERT(encap_lock_held());
581 
582 #ifdef USE_RADIX
583 	if (!ep->func && rnh) {
584 		/* Disable access to the radix tree for reader. */
585 		/* Disable access to the radix tree for readers. */
586 		/* Wait for all readers to drain. */
587 		pserialize_perform(encaptab.psz);
588 
589 		if (!rnh->rnh_deladdr((void *)ep->addrpack,
590 		    (void *)ep->maskpack, rnh))
591 			error = ESRCH;
592 
593 		/*
594 		 * An ep deleted from the radix tree can no longer be found by
595 		 * readers, and radix-managed eps are skipped while
596 		 * encap[46]_lookup walks the encaptab list anyway, so it does
597 		 * not matter that this ep is still on the encaptab list.
598 		 * So, we can re-enable access to the radix tree for now.
599 		 */
600 		encap_head_updating = false;
601 	}
602 #endif
603 	PSLIST_WRITER_REMOVE(ep, chain);
604 
605 	return error;
606 }
607 
608 static int
609 encap_afcheck(int af, const struct sockaddr *sp, const struct sockaddr *dp)
610 {
611 	if (sp && dp) {
612 		if (sp->sa_len != dp->sa_len)
613 			return EINVAL;
614 		if (af != sp->sa_family || af != dp->sa_family)
615 			return EINVAL;
616 	} else if (!sp && !dp)
617 		;
618 	else
619 		return EINVAL;
620 
621 	switch (af) {
622 	case AF_INET:
623 		if (sp && sp->sa_len != sizeof(struct sockaddr_in))
624 			return EINVAL;
625 		if (dp && dp->sa_len != sizeof(struct sockaddr_in))
626 			return EINVAL;
627 		break;
628 #ifdef INET6
629 	case AF_INET6:
630 		if (sp && sp->sa_len != sizeof(struct sockaddr_in6))
631 			return EINVAL;
632 		if (dp && dp->sa_len != sizeof(struct sockaddr_in6))
633 			return EINVAL;
634 		break;
635 #endif
636 	default:
637 		return EAFNOSUPPORT;
638 	}
639 
640 	return 0;
641 }
642 
643 /*
644  * sp (src ptr) is always my side, and dp (dst ptr) is always remote side.
645  * The lengths of the masks (sm and dm) are assumed to match those of sp/dp.
646  * The return value is needed later as the cookie argument to encap_detach().
647  */
648 const struct encaptab *
649 encap_attach(int af, int proto,
650     const struct sockaddr *sp, const struct sockaddr *sm,
651     const struct sockaddr *dp, const struct sockaddr *dm,
652     const struct encapsw *esw, void *arg)
653 {
654 	struct encaptab *ep;
655 	int error;
656 	int pss;
657 	size_t l;
658 	struct ip_pack4 *pack4;
659 #ifdef INET6
660 	struct ip_pack6 *pack6;
661 #endif
662 #ifndef ENCAP_MPSAFE
663 	int s;
664 
665 	s = splsoftnet();
666 #endif
667 	/* sanity check on args */
668 	error = encap_afcheck(af, sp, dp);
669 	if (error)
670 		goto fail;
671 
672 	/* check if anyone has already attached with exactly the same config */
673 	pss = pserialize_read_enter();
674 	PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) {
675 		membar_datadep_consumer();
676 
677 		if (ep->af != af)
678 			continue;
679 		if (ep->proto != proto)
680 			continue;
681 		if (ep->func)
682 			continue;
683 
684 		KASSERT(ep->src != NULL);
685 		KASSERT(ep->dst != NULL);
686 		KASSERT(ep->srcmask != NULL);
687 		KASSERT(ep->dstmask != NULL);
688 
689 		if (ep->src->sa_len != sp->sa_len ||
690 		    memcmp(ep->src, sp, sp->sa_len) != 0 ||
691 		    memcmp(ep->srcmask, sm, sp->sa_len) != 0)
692 			continue;
693 		if (ep->dst->sa_len != dp->sa_len ||
694 		    memcmp(ep->dst, dp, dp->sa_len) != 0 ||
695 		    memcmp(ep->dstmask, dm, dp->sa_len) != 0)
696 			continue;
697 
698 		error = EEXIST;
699 		pserialize_read_exit(pss);
700 		goto fail;
701 	}
702 	pserialize_read_exit(pss);
703 
704 	switch (af) {
705 	case AF_INET:
706 		l = sizeof(*pack4);
707 		break;
708 #ifdef INET6
709 	case AF_INET6:
710 		l = sizeof(*pack6);
711 		break;
712 #endif
713 	default:
714 		goto fail;
715 	}
716 
717 	/* M_NETADDR ok? */
718 	ep = kmem_zalloc(sizeof(*ep), KM_NOSLEEP);
719 	if (ep == NULL) {
720 		error = ENOBUFS;
721 		goto fail;
722 	}
723 	ep->addrpack = kmem_zalloc(l, KM_NOSLEEP);
724 	if (ep->addrpack == NULL) {
725 		error = ENOBUFS;
726 		goto gc;
727 	}
728 	ep->maskpack = kmem_zalloc(l, KM_NOSLEEP);
729 	if (ep->maskpack == NULL) {
730 		error = ENOBUFS;
731 		goto gc;
732 	}
733 
734 	ep->af = af;
735 	ep->proto = proto;
736 	ep->addrpack->sa_len = l & 0xff;
737 	ep->maskpack->sa_len = l & 0xff;
738 	switch (af) {
739 	case AF_INET:
740 		pack4 = (struct ip_pack4 *)ep->addrpack;
741 		ep->src = (struct sockaddr *)&pack4->mine;
742 		ep->dst = (struct sockaddr *)&pack4->yours;
743 		pack4 = (struct ip_pack4 *)ep->maskpack;
744 		ep->srcmask = (struct sockaddr *)&pack4->mine;
745 		ep->dstmask = (struct sockaddr *)&pack4->yours;
746 		break;
747 #ifdef INET6
748 	case AF_INET6:
749 		pack6 = (struct ip_pack6 *)ep->addrpack;
750 		ep->src = (struct sockaddr *)&pack6->mine;
751 		ep->dst = (struct sockaddr *)&pack6->yours;
752 		pack6 = (struct ip_pack6 *)ep->maskpack;
753 		ep->srcmask = (struct sockaddr *)&pack6->mine;
754 		ep->dstmask = (struct sockaddr *)&pack6->yours;
755 		break;
756 #endif
757 	}
758 
759 	memcpy(ep->src, sp, sp->sa_len);
760 	memcpy(ep->srcmask, sm, sp->sa_len);
761 	memcpy(ep->dst, dp, dp->sa_len);
762 	memcpy(ep->dstmask, dm, dp->sa_len);
763 	ep->esw = esw;
764 	ep->arg = arg;
765 	psref_target_init(&ep->psref, encaptab.elem_class);
766 
767 	error = encap_add(ep);
768 	if (error)
769 		goto gc;
770 
771 	error = 0;
772 #ifndef ENCAP_MPSAFE
773 	splx(s);
774 #endif
775 	return ep;
776 
777 gc:
778 	if (ep->addrpack)
779 		kmem_free(ep->addrpack, l);
780 	if (ep->maskpack)
781 		kmem_free(ep->maskpack, l);
782 	if (ep)
783 		kmem_free(ep, sizeof(*ep));
784 fail:
785 #ifndef ENCAP_MPSAFE
786 	splx(s);
787 #endif
788 	return NULL;
789 }
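
/*
 * Example (a hypothetical, uncompiled sketch): how a tunnel driver might
 * register a fixed IPv4 address pair with encap_attach().  The names
 * mytun_*, struct mytun_softc and the encapsw contents are stand-ins and
 * not part of this file; the real encapsw layout is in ip_encap.h.
 */
#if 0
extern const struct encapsw mytun_encapsw;	/* hypothetical */

static int
mytun_attach_example(struct mytun_softc *sc)
{
	struct sockaddr_in src, dst, srcmask, dstmask;
	const struct encaptab *et;
	int error;

	memset(&src, 0, sizeof(src));
	src.sin_family = AF_INET;
	src.sin_len = sizeof(src);
	src.sin_addr.s_addr = htonl(0xc0000201);	/* 192.0.2.1, my side */
	dst = src;
	dst.sin_addr.s_addr = htonl(0xc0000202);	/* 192.0.2.2, remote side */

	/* host masks: match exactly this address pair */
	memset(&srcmask, 0, sizeof(srcmask));
	srcmask.sin_len = sizeof(srcmask);
	srcmask.sin_addr.s_addr = htonl(0xffffffff);
	dstmask = srcmask;

	/* encap_add() asserts encap_lock_held(), so take the lock */
	error = encap_lock_enter();
	if (error)
		return error;
	et = encap_attach(AF_INET, IPPROTO_IPV4,
	    (struct sockaddr *)&src, (struct sockaddr *)&srcmask,
	    (struct sockaddr *)&dst, (struct sockaddr *)&dstmask,
	    &mytun_encapsw, sc);
	encap_lock_exit();
	if (et == NULL)
		return EEXIST;	/* encap_attach() hides the exact error */

	sc->sc_encap_cookie = et;	/* keep it for encap_detach() later */
	return 0;
}
#endif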
790 
791 const struct encaptab *
792 encap_attach_func(int af, int proto,
793     int (*func)(struct mbuf *, int, int, void *),
794     const struct encapsw *esw, void *arg)
795 {
796 	struct encaptab *ep;
797 	int error;
798 #ifndef ENCAP_MPSAFE
799 	int s;
800 
801 	s = splsoftnet();
802 #endif
803 	/* sanity check on args */
804 	if (!func) {
805 		error = EINVAL;
806 		goto fail;
807 	}
808 
809 	error = encap_afcheck(af, NULL, NULL);
810 	if (error)
811 		goto fail;
812 
813 	ep = kmem_alloc(sizeof(*ep), KM_NOSLEEP);	/*XXX*/
814 	if (ep == NULL) {
815 		error = ENOBUFS;
816 		goto fail;
817 	}
818 	memset(ep, 0, sizeof(*ep));
819 
820 	ep->af = af;
821 	ep->proto = proto;
822 	ep->func = func;
823 	ep->esw = esw;
824 	ep->arg = arg;
825 	psref_target_init(&ep->psref, encaptab.elem_class);
826 
827 	error = encap_add(ep);
828 	if (error)
829 		goto fail;
830 
831 	error = 0;
832 #ifndef ENCAP_MPSAFE
833 	splx(s);
834 #endif
835 	return ep;
836 
837 fail:
838 #ifndef ENCAP_MPSAFE
839 	splx(s);
840 #endif
841 	return NULL;
842 }
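
/*
 * Example (a hypothetical, uncompiled sketch): a match function suitable
 * for encap_attach_func().  Following the convention described in
 * encap4_lookup() above, it returns the matched bit length (32 + 32 for an
 * exact IPv4 src/dst match) and 0 for "no match".  struct mytun_softc and
 * its sc_local/sc_remote members are stand-ins for a driver's softc.
 */
#if 0
static int
mytun_encap_match(struct mbuf *m, int off, int proto, void *arg)
{
	struct mytun_softc *sc = arg;
	struct ip ip;

	if (proto != IPPROTO_IPV4)
		return 0;
	if (m->m_pkthdr.len < sizeof(ip))
		return 0;
	/* the outer header may not be contiguous in the first mbuf */
	m_copydata(m, 0, sizeof(ip), &ip);

	/*
	 * Inbound: our address is the outer destination.  "off" points at
	 * the inner packet and is not needed here.
	 */
	if (ip.ip_dst.s_addr == sc->sc_local.s_addr &&
	    ip.ip_src.s_addr == sc->sc_remote.s_addr)
		return 32 + 32;
	return 0;
}
#endif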
843 
844 /* XXX encap4_ctlinput() is necessary if we set DF=1 on the outer IPv4 header */
845 
846 #ifdef INET6
847 void *
848 encap6_ctlinput(int cmd, const struct sockaddr *sa, void *d0)
849 {
850 	void *d = d0;
851 	struct ip6_hdr *ip6;
852 	struct mbuf *m;
853 	int off;
854 	struct ip6ctlparam *ip6cp = NULL;
855 	int nxt;
856 	int s;
857 	struct encaptab *ep;
858 	const struct encapsw *esw;
859 
860 	if (sa->sa_family != AF_INET6 ||
861 	    sa->sa_len != sizeof(struct sockaddr_in6))
862 		return NULL;
863 
864 	if ((unsigned)cmd >= PRC_NCMDS)
865 		return NULL;
866 	if (cmd == PRC_HOSTDEAD)
867 		d = NULL;
868 	else if (cmd == PRC_MSGSIZE)
869 		; /* special code is present, see below */
870 	else if (inet6ctlerrmap[cmd] == 0)
871 		return NULL;
872 
873 	/* if the parameter is from icmp6, decode it. */
874 	if (d != NULL) {
875 		ip6cp = (struct ip6ctlparam *)d;
876 		m = ip6cp->ip6c_m;
877 		ip6 = ip6cp->ip6c_ip6;
878 		off = ip6cp->ip6c_off;
879 		nxt = ip6cp->ip6c_nxt;
880 
881 		if (ip6 && cmd == PRC_MSGSIZE) {
882 			int valid = 0;
883 			struct encaptab *match;
884 			struct psref elem_psref;
885 
886 			/*
887 			 * Check to see if we have a valid encap configuration.
888 			 */
889 			match = encap6_lookup(m, off, nxt, OUTBOUND,
890 			    &elem_psref);
891 			if (match) {
892 				valid++;
893 				psref_release(&elem_psref, &match->psref,
894 				    encaptab.elem_class);
895 			}
896 			/*
897 			 * Depending on the value of "valid" and the routing
898 			 * table size (mtudisc_{hi,lo}wat), we will:
899 			 * - recalculate the new MTU and create the
900 			 *   corresponding routing entry, or
901 			 * - ignore the MTU change notification.
902 			 */
903 			icmp6_mtudisc_update((struct ip6ctlparam *)d, valid);
904 		}
905 	} else {
906 		m = NULL;
907 		ip6 = NULL;
908 		nxt = -1;
909 	}
910 
911 	/* inform all listeners */
912 
913 	s = pserialize_read_enter();
914 	PSLIST_READER_FOREACH(ep, &encap_table, struct encaptab, chain) {
915 		struct psref elem_psref;
916 
917 		membar_datadep_consumer();
918 
919 		if (ep->af != AF_INET6)
920 			continue;
921 		if (ep->proto >= 0 && ep->proto != nxt)
922 			continue;
923 
924 		/* should optimize by looking at address pairs */
925 
926 		/* XXX need to pass ep->arg or ep itself to listeners */
927 		psref_acquire(&elem_psref, &ep->psref,
928 		    encaptab.elem_class);
929 		esw = ep->esw;
930 		if (esw && esw->encapsw6.pr_ctlinput) {
931 			pserialize_read_exit(s);
932 			/* pr_ctlinput is sleepable. e.g. rtcache_free */
933 			(*esw->encapsw6.pr_ctlinput)(cmd, sa, d, ep->arg);
934 			s = pserialize_read_enter();
935 		}
936 		psref_release(&elem_psref, &ep->psref,
937 		    encaptab.elem_class);
938 	}
939 	pserialize_read_exit(s);
940 
941 	rip6_ctlinput(cmd, sa, d0);
942 	return NULL;
943 }
944 #endif
945 
946 int
947 encap_detach(const struct encaptab *cookie)
948 {
949 	const struct encaptab *ep = cookie;
950 	struct encaptab *p;
951 	int error;
952 
953 	KASSERT(encap_lock_held());
954 
955 	PSLIST_WRITER_FOREACH(p, &encap_table, struct encaptab, chain) {
956 		membar_datadep_consumer();
957 
958 		if (p == ep) {
959 			error = encap_remove(p);
960 			if (error)
961 				return error;
962 			else
963 				break;
964 		}
965 	}
966 	if (p == NULL)
967 		return ENOENT;
968 
969 #ifndef USE_RADIX
970 	/* Without USE_RADIX, encap_remove() does not do the
971 	 * pserialize_perform(encaptab.psz) for us, so do it here.
972 	 */
973 	pserialize_perform(encaptab.psz);
974 #endif
975 	psref_target_destroy(&p->psref,
976 	    encaptab.elem_class);
977 	if (!ep->func) {
978 		kmem_free(p->addrpack, ep->addrpack->sa_len);
979 		kmem_free(p->maskpack, ep->maskpack->sa_len);
980 	}
981 	kmem_free(p, sizeof(*p));
982 
983 	return 0;
984 }
985 
986 #ifdef USE_RADIX
987 static struct radix_node_head *
988 encap_rnh(int af)
989 {
990 
991 	switch (af) {
992 	case AF_INET:
993 		return encap_head[0];
994 #ifdef INET6
995 	case AF_INET6:
996 		return encap_head[1];
997 #endif
998 	default:
999 		return NULL;
1000 	}
1001 }
1002 
1003 static int
1004 mask_matchlen(const struct sockaddr *sa)
1005 {
1006 	const char *p, *ep;
1007 	int l;
1008 
1009 	p = (const char *)sa;
1010 	ep = p + sa->sa_len;
1011 	p += 2;	/* sa_len + sa_family */
1012 
1013 	l = 0;
1014 	while (p < ep) {
1015 		l += (*p ? 8 : 0);	/* estimate */
1016 		p++;
1017 	}
1018 	return l;
1019 }
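
/*
 * Note the byte-granular estimate: an IPv4 mask of 255.255.255.240 (/28)
 * counts as 8 + 8 + 8 + 8 = 32 bits rather than 28, since every non-zero
 * mask byte contributes a full 8.  mask_match() (the !USE_RADIX variant
 * below) uses the same rough metric.
 */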
1020 #endif
1021 
1022 #ifndef USE_RADIX
1023 static int
1024 mask_match(const struct encaptab *ep,
1025 	   const struct sockaddr *sp,
1026 	   const struct sockaddr *dp)
1027 {
1028 	struct sockaddr_storage s;
1029 	struct sockaddr_storage d;
1030 	int i;
1031 	const u_int8_t *p, *q;
1032 	u_int8_t *r;
1033 	int matchlen;
1034 
1035 	KASSERTMSG(ep->func == NULL, "wrong encaptab passed to mask_match");
1036 
1037 	if (sp->sa_len > sizeof(s) || dp->sa_len > sizeof(d))
1038 		return 0;
1039 	if (sp->sa_family != ep->af || dp->sa_family != ep->af)
1040 		return 0;
1041 	if (sp->sa_len != ep->src->sa_len || dp->sa_len != ep->dst->sa_len)
1042 		return 0;
1043 
1044 	matchlen = 0;
1045 
1046 	p = (const u_int8_t *)sp;
1047 	q = (const u_int8_t *)ep->srcmask;
1048 	r = (u_int8_t *)&s;
1049 	for (i = 0 ; i < sp->sa_len; i++) {
1050 		r[i] = p[i] & q[i];
1051 		/* XXX estimate */
1052 		matchlen += (q[i] ? 8 : 0);
1053 	}
1054 
1055 	p = (const u_int8_t *)dp;
1056 	q = (const u_int8_t *)ep->dstmask;
1057 	r = (u_int8_t *)&d;
1058 	for (i = 0 ; i < dp->sa_len; i++) {
1059 		r[i] = p[i] & q[i];
1060 		/* XXX rough estimate */
1061 		matchlen += (q[i] ? 8 : 0);
1062 	}
1063 
1064 	/* need to overwrite len/family portion as we don't compare them */
1065 	s.ss_len = sp->sa_len;
1066 	s.ss_family = sp->sa_family;
1067 	d.ss_len = dp->sa_len;
1068 	d.ss_family = dp->sa_family;
1069 
1070 	if (memcmp(&s, ep->src, ep->src->sa_len) == 0 &&
1071 	    memcmp(&d, ep->dst, ep->dst->sa_len) == 0) {
1072 		return matchlen;
1073 	} else
1074 		return 0;
1075 }
1076 #endif
1077 
1078 static void
1079 encap_fillarg(struct mbuf *m, const struct encaptab *ep)
1080 {
1081 	struct m_tag *mtag;
1082 
1083 	mtag = m_tag_get(PACKET_TAG_ENCAP, sizeof(void *), M_NOWAIT);
1084 	if (mtag) {
1085 		*(void **)(mtag + 1) = ep->arg;
1086 		m_tag_prepend(m, mtag);
1087 	}
1088 }
1089 
1090 void *
1091 encap_getarg(struct mbuf *m)
1092 {
1093 	void *p;
1094 	struct m_tag *mtag;
1095 
1096 	p = NULL;
1097 	mtag = m_tag_find(m, PACKET_TAG_ENCAP, NULL);
1098 	if (mtag != NULL) {
1099 		p = *(void **)(mtag + 1);
1100 		m_tag_delete(m, mtag);
1101 	}
1102 	return p;
1103 }
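
/*
 * Example (a hypothetical, uncompiled sketch): the arg given to
 * encap_attach{,_func}() comes back to the protocol input routine through
 * the mbuf tag prepended by encap_fillarg().  An IPv4 pr_input handler
 * registered via an encapsw would typically begin like this
 * ("mytun_input" and "struct mytun_softc" are stand-ins):
 */
#if 0
static void
mytun_input(struct mbuf *m, int off, int proto)
{
	struct mytun_softc *sc;

	/* encap_getarg() also removes the PACKET_TAG_ENCAP tag */
	sc = encap_getarg(m);
	if (sc == NULL) {
		m_freem(m);
		return;
	}
	/* strip the outer header at "off" and enqueue on sc's interface */
}
#endif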
1104 
1105 int
1106 encap_lock_enter(void)
1107 {
1108 	int error;
1109 
1110 	mutex_enter(&encap_whole.lock);
1111 	while (encap_whole.busy != NULL) {
1112 		error = cv_wait_sig(&encap_whole.cv, &encap_whole.lock);
1113 		if (error) {
1114 			mutex_exit(&encap_whole.lock);
1115 			return error;
1116 		}
1117 	}
1118 	KASSERT(encap_whole.busy == NULL);
1119 	encap_whole.busy = curlwp;
1120 	mutex_exit(&encap_whole.lock);
1121 
1122 	return 0;
1123 }
1124 
1125 void
1126 encap_lock_exit(void)
1127 {
1128 
1129 	mutex_enter(&encap_whole.lock);
1130 	KASSERT(encap_whole.busy == curlwp);
1131 	encap_whole.busy = NULL;
1132 	cv_broadcast(&encap_whole.cv);
1133 	mutex_exit(&encap_whole.lock);
1134 }
1135 
1136 bool
1137 encap_lock_held(void)
1138 {
1139 
1140 	return (encap_whole.busy == curlwp);
1141 }
1142