xref: /openbsd-src/sys/netinet/in_pcb.c (revision d13be5d47e4149db2549a9828e244d59dbc43f15)
1 /*	$OpenBSD: in_pcb.c,v 1.124 2011/07/06 01:57:37 dlg Exp $	*/
2 /*	$NetBSD: in_pcb.c,v 1.25 1996/02/13 23:41:53 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1991, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
33  *
34  * NRL grants permission for redistribution and use in source and binary
35  * forms, with or without modification, of the software and documentation
36  * created at NRL provided that the following conditions are met:
37  *
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  * 3. All advertising materials mentioning features or use of this software
44  *    must display the following acknowledgements:
45  * 	This product includes software developed by the University of
46  * 	California, Berkeley and its contributors.
47  * 	This product includes software developed at the Information
48  * 	Technology Division, US Naval Research Laboratory.
49  * 4. Neither the name of the NRL nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
54  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
55  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
56  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
57  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
58  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
59  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
60  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
61  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
62  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
63  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
64  *
65  * The views and conclusions contained in the software and documentation
66  * are those of the authors and should not be interpreted as representing
67  * official policies, either expressed or implied, of the US Naval
68  * Research Laboratory (NRL).
69  */
70 
71 #include "pf.h"
72 
73 #include <sys/param.h>
74 #include <sys/systm.h>
75 #include <sys/mbuf.h>
76 #include <sys/protosw.h>
77 #include <sys/socket.h>
78 #include <sys/socketvar.h>
79 #include <sys/proc.h>
80 #include <sys/domain.h>
81 #include <sys/pool.h>
82 
83 #include <net/if.h>
84 #include <net/route.h>
85 #include <net/pfvar.h>
86 
87 #include <netinet/in.h>
88 #include <netinet/in_systm.h>
89 #include <netinet/ip.h>
90 #include <netinet/in_pcb.h>
91 #include <netinet/in_var.h>
92 #include <netinet/ip_var.h>
93 #include <dev/rndvar.h>
94 
95 #include <sys/mount.h>
96 #include <nfs/nfsproto.h>
97 
98 #ifdef INET6
99 #include <netinet6/ip6_var.h>
100 #endif /* INET6 */
101 #ifdef IPSEC
102 #include <netinet/ip_esp.h>
103 #endif /* IPSEC */
104 
105 struct	in_addr zeroin_addr;
106 
107 extern int ipsec_auth_default_level;
108 extern int ipsec_esp_trans_default_level;
109 extern int ipsec_esp_network_default_level;
110 extern int ipsec_ipcomp_default_level;
111 
112 /*
113  * These configure the range of local port addresses assigned to
114  * "unspecified" outgoing connections/packets/whatever.
115  */
116 int ipport_firstauto = IPPORT_RESERVED;
117 int ipport_lastauto = IPPORT_USERRESERVED;
118 int ipport_hifirstauto = IPPORT_HIFIRSTAUTO;
119 int ipport_hilastauto = IPPORT_HILASTAUTO;
120 
121 struct baddynamicports baddynamicports;
122 struct pool inpcb_pool;
123 int inpcb_pool_initialized = 0;
124 
/*
 * Bucket selectors for the PCB hash tables.  Each macro yields a pointer
 * to a list head inside the table.
 *
 * INPCBHASH / IN6PCBHASH pick a bucket of inpt_hashtbl from the foreign
 * address, both ports and (IPv4 only) the routing domain; the laddr
 * argument is accepted but does not take part in the hash.
 * INPCBLHASH picks a bucket of inpt_lhashtbl from the local port and the
 * routing domain.
 *
 * Ports and addresses are expected in network byte order (they are run
 * through ntohs()/ntohl() here).  Every argument, including "table", is
 * parenthesised so that arbitrary expressions may be passed, and the
 * whole expansion is parenthesised so it composes safely.
 */
#define	INPCBHASH(table, faddr, fport, laddr, lport, rdom) \
	(&(table)->inpt_hashtbl[(ntohl((faddr)->s_addr) + \
	ntohs((fport)) + ntohs((lport)) + (rdom)) & ((table)->inpt_hash)])

#define	IN6PCBHASH(table, faddr, fport, laddr, lport) \
	(&(table)->inpt_hashtbl[(ntohl((faddr)->s6_addr32[0] ^ \
	(faddr)->s6_addr32[3]) + ntohs((fport)) + ntohs((lport))) & \
	((table)->inpt_hash)])

#define	INPCBLHASH(table, lport, rdom) \
	(&(table)->inpt_lhashtbl[(ntohs((lport)) + (rdom)) & \
	((table)->inpt_lhash)])
136 
137 void
138 in_pcbinit(struct inpcbtable *table, int hashsize)
139 {
140 
141 	CIRCLEQ_INIT(&table->inpt_queue);
142 	table->inpt_hashtbl = hashinit(hashsize, M_PCB, M_NOWAIT,
143 	    &table->inpt_hash);
144 	if (table->inpt_hashtbl == NULL)
145 		panic("in_pcbinit: hashinit failed");
146 	table->inpt_lhashtbl = hashinit(hashsize, M_PCB, M_NOWAIT,
147 	    &table->inpt_lhash);
148 	if (table->inpt_lhashtbl == NULL)
149 		panic("in_pcbinit: hashinit failed for lport");
150 	table->inpt_lastport = 0;
151 }
152 
153 /*
154  * Check if the specified port is invalid for dynamic allocation.
155  */
156 int
157 in_baddynamic(u_int16_t port, u_int16_t proto)
158 {
159 	switch (proto) {
160 	case IPPROTO_TCP:
161 		return (DP_ISSET(baddynamicports.tcp, port));
162 	case IPPROTO_UDP:
163 #ifdef IPSEC
164 		/* Cannot preset this as it is a sysctl */
165 		if (port == udpencap_port)
166 			return (1);
167 #endif
168 		return (DP_ISSET(baddynamicports.udp, port));
169 	default:
170 		return (0);
171 	}
172 }
173 
174 int
175 in_pcballoc(struct socket *so, void *v)
176 {
177 	struct inpcbtable *table = v;
178 	struct inpcb *inp;
179 	int s;
180 
181 	if (inpcb_pool_initialized == 0) {
182 		pool_init(&inpcb_pool, sizeof(struct inpcb), 0, 0, 0,
183 		    "inpcbpl", NULL);
184 		inpcb_pool_initialized = 1;
185 	}
186 	inp = pool_get(&inpcb_pool, PR_NOWAIT|PR_ZERO);
187 	if (inp == NULL)
188 		return (ENOBUFS);
189 	inp->inp_table = table;
190 	inp->inp_socket = so;
191 	inp->inp_seclevel[SL_AUTH] = ipsec_auth_default_level;
192 	inp->inp_seclevel[SL_ESP_TRANS] = ipsec_esp_trans_default_level;
193 	inp->inp_seclevel[SL_ESP_NETWORK] = ipsec_esp_network_default_level;
194 	inp->inp_seclevel[SL_IPCOMP] = ipsec_ipcomp_default_level;
195 	inp->inp_rtableid = curproc->p_p->ps_rtableid;
196 	s = splnet();
197 	CIRCLEQ_INSERT_HEAD(&table->inpt_queue, inp, inp_queue);
198 	LIST_INSERT_HEAD(INPCBLHASH(table, inp->inp_lport,
199 	    inp->inp_rtableid), inp, inp_lhash);
200 	LIST_INSERT_HEAD(INPCBHASH(table, &inp->inp_faddr, inp->inp_fport,
201 	    &inp->inp_laddr, inp->inp_lport, rtable_l2(inp->inp_rtableid)),
202 	    inp, inp_hash);
203 	splx(s);
204 	so->so_pcb = inp;
205 	inp->inp_hops = -1;
206 
207 #ifdef INET6
208 	/*
209 	 * Small change in this function to set the INP_IPV6 flag so routines
210 	 * outside pcb-specific routines don't need to use sotopf(), and all
211 	 * of its pointer chasing, later.
212 	 */
213 	if (sotopf(so) == PF_INET6)
214 		inp->inp_flags = INP_IPV6;
215 	inp->in6p_cksum = -1;
216 #endif /* INET6 */
217 	return (0);
218 }
219 
220 int
221 in_pcbbind(void *v, struct mbuf *nam, struct proc *p)
222 {
223 	struct inpcb *inp = v;
224 	struct socket *so = inp->inp_socket;
225 	struct inpcbtable *table = inp->inp_table;
226 	u_int16_t *lastport = &inp->inp_table->inpt_lastport;
227 	struct sockaddr_in *sin;
228 	u_int16_t lport = 0;
229 	int wild = 0, reuseport = (so->so_options & SO_REUSEPORT);
230 	int error;
231 
232 #ifdef INET6
233 	if (sotopf(so) == PF_INET6)
234 		return in6_pcbbind(inp, nam, p);
235 #endif /* INET6 */
236 
237 	if (TAILQ_EMPTY(&in_ifaddr))
238 		return (EADDRNOTAVAIL);
239 	if (inp->inp_lport || inp->inp_laddr.s_addr != INADDR_ANY)
240 		return (EINVAL);
241 	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0 &&
242 	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0 ||
243 	     (so->so_options & SO_ACCEPTCONN) == 0))
244 		wild = INPLOOKUP_WILDCARD;
245 	if (nam) {
246 		sin = mtod(nam, struct sockaddr_in *);
247 		if (nam->m_len != sizeof (*sin))
248 			return (EINVAL);
249 #ifdef notdef
250 		/*
251 		 * We should check the family, but old programs
252 		 * incorrectly fail to initialize it.
253 		 */
254 		if (sin->sin_family != AF_INET)
255 			return (EAFNOSUPPORT);
256 #endif
257 		lport = sin->sin_port;
258 		if (IN_MULTICAST(sin->sin_addr.s_addr)) {
259 			/*
260 			 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
261 			 * allow complete duplication of binding if
262 			 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
263 			 * and a multicast address is bound on both
264 			 * new and duplicated sockets.
265 			 */
266 			if (so->so_options & SO_REUSEADDR)
267 				reuseport = SO_REUSEADDR|SO_REUSEPORT;
268 		} else if (sin->sin_addr.s_addr != INADDR_ANY) {
269 			sin->sin_port = 0;		/* yech... */
270 			if (!(so->so_options & SO_BINDANY) &&
271 			    in_iawithaddr(sin->sin_addr,
272 			    inp->inp_rtableid) == NULL)
273 				/* SOCK_RAW does not use in_pcbbind() */
274 				if (!(so->so_type == SOCK_DGRAM &&
275 				    in_broadcast(sin->sin_addr, NULL,
276 				    inp->inp_rtableid)))
277 					return (EADDRNOTAVAIL);
278 		}
279 		if (lport) {
280 			struct inpcb *t;
281 
282 			/* GROSS */
283 			if (ntohs(lport) < IPPORT_RESERVED &&
284 			    (error = suser(p, 0)))
285 				return (EACCES);
286 			if (so->so_euid) {
287 				t = in_pcblookup(table, &zeroin_addr, 0,
288 				    &sin->sin_addr, lport, INPLOOKUP_WILDCARD,
289 				    inp->inp_rtableid);
290 				if (t && (so->so_euid != t->inp_socket->so_euid))
291 					return (EADDRINUSE);
292 			}
293 			t = in_pcblookup(table, &zeroin_addr, 0,
294 			    &sin->sin_addr, lport, wild, inp->inp_rtableid);
295 			if (t && (reuseport & t->inp_socket->so_options) == 0)
296 				return (EADDRINUSE);
297 		}
298 		inp->inp_laddr = sin->sin_addr;
299 	}
300 	if (lport == 0) {
301 		u_int16_t first, last;
302 		int count;
303 
304 		if (inp->inp_flags & INP_HIGHPORT) {
305 			first = ipport_hifirstauto;	/* sysctl */
306 			last = ipport_hilastauto;
307 		} else if (inp->inp_flags & INP_LOWPORT) {
308 			if ((error = suser(p, 0)))
309 				return (EACCES);
310 			first = IPPORT_RESERVED-1; /* 1023 */
311 			last = 600;		   /* not IPPORT_RESERVED/2 */
312 		} else {
313 			first = ipport_firstauto;	/* sysctl */
314 			last  = ipport_lastauto;
315 		}
316 
317 		/*
318 		 * Simple check to ensure all ports are not used up causing
319 		 * a deadlock here.
320 		 *
321 		 * We split the two cases (up and down) so that the direction
322 		 * is not being tested on each round of the loop.
323 		 */
324 
325 		if (first > last) {
326 			/*
327 			 * counting down
328 			 */
329 			count = first - last;
330 			if (count)
331 				*lastport = first - arc4random_uniform(count);
332 
333 			do {
334 				if (count-- < 0)	/* completely used? */
335 					return (EADDRNOTAVAIL);
336 				--*lastport;
337 				if (*lastport > first || *lastport < last)
338 					*lastport = first;
339 				lport = htons(*lastport);
340 			} while (in_baddynamic(*lastport, so->so_proto->pr_protocol) ||
341 			    in_pcblookup(table, &zeroin_addr, 0,
342 			    &inp->inp_laddr, lport, wild, inp->inp_rtableid));
343 		} else {
344 			/*
345 			 * counting up
346 			 */
347 			count = last - first;
348 			if (count)
349 				*lastport = first + arc4random_uniform(count);
350 
351 			do {
352 				if (count-- < 0)	/* completely used? */
353 					return (EADDRNOTAVAIL);
354 				++*lastport;
355 				if (*lastport < first || *lastport > last)
356 					*lastport = first;
357 				lport = htons(*lastport);
358 			} while (in_baddynamic(*lastport, so->so_proto->pr_protocol) ||
359 			    in_pcblookup(table, &zeroin_addr, 0,
360 			    &inp->inp_laddr, lport, wild, inp->inp_rtableid));
361 		}
362 	}
363 	inp->inp_lport = lport;
364 	in_pcbrehash(inp);
365 	return (0);
366 }
367 
368 /*
369  * Connect from a socket to a specified address.
370  * Both address and port must be specified in argument sin.
371  * If don't have a local address for this socket yet,
372  * then pick one.
373  */
374 int
375 in_pcbconnect(void *v, struct mbuf *nam)
376 {
377 	struct inpcb *inp = v;
378 	struct sockaddr_in *ifaddr = NULL;
379 	struct sockaddr_in *sin = mtod(nam, struct sockaddr_in *);
380 
381 #ifdef INET6
382 	if (sotopf(inp->inp_socket) == PF_INET6)
383 		return (in6_pcbconnect(inp, nam));
384 	if ((inp->inp_flags & INP_IPV6) != 0)
385 		panic("IPv6 pcb passed into in_pcbconnect");
386 #endif /* INET6 */
387 
388 	if (nam->m_len != sizeof (*sin))
389 		return (EINVAL);
390 	if (sin->sin_family != AF_INET)
391 		return (EAFNOSUPPORT);
392 	if (sin->sin_port == 0)
393 		return (EADDRNOTAVAIL);
394 	if (!TAILQ_EMPTY(&in_ifaddr)) {
395 		/*
396 		 * If the destination address is INADDR_ANY,
397 		 * use the primary local address.
398 		 * If the supplied address is INADDR_BROADCAST,
399 		 * and the primary interface supports broadcast,
400 		 * choose the broadcast address for that interface.
401 		 */
402 		if (sin->sin_addr.s_addr == INADDR_ANY)
403 			sin->sin_addr = TAILQ_FIRST(&in_ifaddr)->ia_addr.sin_addr;
404 		else if (sin->sin_addr.s_addr == INADDR_BROADCAST &&
405 		  (TAILQ_FIRST(&in_ifaddr)->ia_ifp->if_flags & IFF_BROADCAST) &&
406 		  TAILQ_FIRST(&in_ifaddr)->ia_broadaddr.sin_addr.s_addr)
407 			sin->sin_addr =
408 			    TAILQ_FIRST(&in_ifaddr)->ia_broadaddr.sin_addr;
409 	}
410 	if (inp->inp_laddr.s_addr == INADDR_ANY) {
411 		int error;
412 		ifaddr = in_selectsrc(sin, &inp->inp_route,
413 			inp->inp_socket->so_options, inp->inp_moptions, &error,
414 			inp->inp_rtableid);
415 		if (ifaddr == NULL) {
416 			if (error == 0)
417 				error = EADDRNOTAVAIL;
418 			return error;
419 		}
420 	}
421 	if (in_pcbhashlookup(inp->inp_table, sin->sin_addr, sin->sin_port,
422 	    inp->inp_laddr.s_addr ? inp->inp_laddr : ifaddr->sin_addr,
423 	    inp->inp_lport, inp->inp_rtableid) != 0)
424 		return (EADDRINUSE);
425 	if (inp->inp_laddr.s_addr == INADDR_ANY) {
426 		if (inp->inp_lport == 0 &&
427 		    in_pcbbind(inp, NULL, curproc) == EADDRNOTAVAIL)
428 			return (EADDRNOTAVAIL);
429 		inp->inp_laddr = ifaddr->sin_addr;
430 	}
431 	inp->inp_faddr = sin->sin_addr;
432 	inp->inp_fport = sin->sin_port;
433 	in_pcbrehash(inp);
434 #ifdef IPSEC
435 	{
436 		int error; /* This is just ignored */
437 
438 		/* Cause an IPsec SA to be established. */
439 		ipsp_spd_inp(NULL, AF_INET, 0, &error, IPSP_DIRECTION_OUT,
440 		    NULL, inp, NULL);
441 	}
442 #endif
443 	return (0);
444 }
445 
446 void
447 in_pcbdisconnect(void *v)
448 {
449 	struct inpcb *inp = v;
450 
451 	switch (sotopf(inp->inp_socket)) {
452 #ifdef INET6
453 	case PF_INET6:
454 		inp->inp_faddr6 = in6addr_any;
455 		break;
456 #endif
457 	case PF_INET:
458 		inp->inp_faddr.s_addr = INADDR_ANY;
459 		break;
460 	}
461 
462 	inp->inp_fport = 0;
463 	in_pcbrehash(inp);
464 	if (inp->inp_socket->so_state & SS_NOFDREF)
465 		in_pcbdetach(inp);
466 }
467 
468 void
469 in_pcbdetach(void *v)
470 {
471 	struct inpcb *inp = v;
472 	struct socket *so = inp->inp_socket;
473 	int s;
474 
475 	so->so_pcb = 0;
476 	sofree(so);
477 	if (inp->inp_options)
478 		m_freem(inp->inp_options);
479 	if (inp->inp_route.ro_rt)
480 		rtfree(inp->inp_route.ro_rt);
481 #ifdef INET6
482 	if (inp->inp_flags & INP_IPV6) {
483 		ip6_freepcbopts(inp->inp_outputopts6);
484 		ip6_freemoptions(inp->inp_moptions6);
485 	} else
486 #endif
487 		ip_freemoptions(inp->inp_moptions);
488 #ifdef IPSEC
489 	/* IPsec cleanup here */
490 	s = spltdb();
491 	if (inp->inp_tdb_in)
492 		TAILQ_REMOVE(&inp->inp_tdb_in->tdb_inp_in,
493 			     inp, inp_tdb_in_next);
494 	if (inp->inp_tdb_out)
495 	        TAILQ_REMOVE(&inp->inp_tdb_out->tdb_inp_out, inp,
496 			     inp_tdb_out_next);
497 	if (inp->inp_ipsec_remotecred)
498 		ipsp_reffree(inp->inp_ipsec_remotecred);
499 	if (inp->inp_ipsec_remoteauth)
500 		ipsp_reffree(inp->inp_ipsec_remoteauth);
501 	if (inp->inp_ipo)
502 		ipsec_delete_policy(inp->inp_ipo);
503 	splx(s);
504 #endif
505 #if NPF > 0
506 	if (inp->inp_pf_sk)
507 		((struct pf_state_key *)inp->inp_pf_sk)->inp = NULL;
508 #endif
509 	s = splnet();
510 	LIST_REMOVE(inp, inp_lhash);
511 	LIST_REMOVE(inp, inp_hash);
512 	CIRCLEQ_REMOVE(&inp->inp_table->inpt_queue, inp, inp_queue);
513 	splx(s);
514 	pool_put(&inpcb_pool, inp);
515 }
516 
517 void
518 in_setsockaddr(struct inpcb *inp, struct mbuf *nam)
519 {
520 	struct sockaddr_in *sin;
521 
522 	nam->m_len = sizeof (*sin);
523 	sin = mtod(nam, struct sockaddr_in *);
524 	bzero((caddr_t)sin, sizeof (*sin));
525 	sin->sin_family = AF_INET;
526 	sin->sin_len = sizeof(*sin);
527 	sin->sin_port = inp->inp_lport;
528 	sin->sin_addr = inp->inp_laddr;
529 }
530 
531 void
532 in_setpeeraddr(struct inpcb *inp, struct mbuf *nam)
533 {
534 	struct sockaddr_in *sin;
535 
536 #ifdef INET6
537 	if (sotopf(inp->inp_socket) == PF_INET6) {
538 		in6_setpeeraddr(inp, nam);
539 		return;
540 	}
541 #endif /* INET6 */
542 
543 	nam->m_len = sizeof (*sin);
544 	sin = mtod(nam, struct sockaddr_in *);
545 	bzero((caddr_t)sin, sizeof (*sin));
546 	sin->sin_family = AF_INET;
547 	sin->sin_len = sizeof(*sin);
548 	sin->sin_port = inp->inp_fport;
549 	sin->sin_addr = inp->inp_faddr;
550 }
551 
552 /*
553  * Pass some notification to all connections of a protocol
554  * associated with address dst.  The "usual action" will be
555  * taken, depending on the ctlinput cmd.  The caller must filter any
556  * cmds that are uninteresting (e.g., no error in the map).
557  * Call the protocol specific routine (if any) to report
558  * any errors for each matching socket.
559  *
560  * Must be called at splsoftnet.
561  */
562 void
563 in_pcbnotifyall(struct inpcbtable *table, struct sockaddr *dst, u_int rdomain,
564     int errno, void (*notify)(struct inpcb *, int))
565 {
566 	struct inpcb *inp, *oinp;
567 	struct in_addr faddr;
568 
569 	splsoftassert(IPL_SOFTNET);
570 
571 #ifdef INET6
572 	/*
573 	 * See in6_pcbnotify() for IPv6 codepath.  By the time this
574 	 * gets called, the addresses passed are either definitely IPv4 or
575 	 * IPv6; *_pcbnotify() never gets called with v4-mapped v6 addresses.
576 	 */
577 #endif /* INET6 */
578 
579 	if (dst->sa_family != AF_INET)
580 		return;
581 	faddr = satosin(dst)->sin_addr;
582 	if (faddr.s_addr == INADDR_ANY)
583 		return;
584 
585 	rdomain = rtable_l2(rdomain);
586 	for (inp = CIRCLEQ_FIRST(&table->inpt_queue);
587 	    inp != CIRCLEQ_END(&table->inpt_queue);) {
588 #ifdef INET6
589 		if (inp->inp_flags & INP_IPV6) {
590 			inp = CIRCLEQ_NEXT(inp, inp_queue);
591 			continue;
592 		}
593 #endif
594 		if (inp->inp_faddr.s_addr != faddr.s_addr ||
595 		    rtable_l2(inp->inp_rtableid) != rdomain ||
596 		    inp->inp_socket == 0) {
597 			inp = CIRCLEQ_NEXT(inp, inp_queue);
598 			continue;
599 		}
600 		oinp = inp;
601 		inp = CIRCLEQ_NEXT(inp, inp_queue);
602 		if (notify)
603 			(*notify)(oinp, errno);
604 	}
605 }
606 
607 /*
608  * Check for alternatives when higher level complains
609  * about service problems.  For now, invalidate cached
610  * routing information.  If the route was created dynamically
611  * (by a redirect), time to try a default gateway again.
612  */
613 void
614 in_losing(struct inpcb *inp)
615 {
616 	struct rtentry *rt;
617 	struct rt_addrinfo info;
618 
619 	if ((rt = inp->inp_route.ro_rt)) {
620 		inp->inp_route.ro_rt = 0;
621 		bzero((caddr_t)&info, sizeof(info));
622 		info.rti_flags = rt->rt_flags;
623 		info.rti_info[RTAX_DST] = &inp->inp_route.ro_dst;
624 		info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
625 		info.rti_info[RTAX_NETMASK] = rt_mask(rt);
626 		rt_missmsg(RTM_LOSING, &info, rt->rt_flags, rt->rt_ifp, 0,
627 		    inp->inp_rtableid);
628 		if (rt->rt_flags & RTF_DYNAMIC)
629 			(void)rtrequest1(RTM_DELETE, &info, rt->rt_priority,
630 				(struct rtentry **)0, inp->inp_rtableid);
631 		/*
632 		 * A new route can be allocated
633 		 * the next time output is attempted.
634 		 * rtfree() needs to be called in anycase because the inp
635 		 * is still holding a reference to rt.
636 		 */
637 		rtfree(rt);
638 	}
639 }
640 
641 /*
642  * After a routing change, flush old routing
643  * and allocate a (hopefully) better one.
644  */
645 void
646 in_rtchange(struct inpcb *inp, int errno)
647 {
648 	if (inp->inp_route.ro_rt) {
649 		rtfree(inp->inp_route.ro_rt);
650 		inp->inp_route.ro_rt = 0;
651 		/*
652 		 * A new route can be allocated the next time
653 		 * output is attempted.
654 		 */
655 	}
656 }
657 
658 struct inpcb *
659 in_pcblookup(struct inpcbtable *table, void *faddrp, u_int fport_arg,
660     void *laddrp, u_int lport_arg, int flags, u_int rdomain)
661 {
662 	struct inpcb *inp, *match = 0;
663 	int matchwild = 3, wildcard;
664 	u_int16_t fport = fport_arg, lport = lport_arg;
665 	struct in_addr faddr = *(struct in_addr *)faddrp;
666 	struct in_addr laddr = *(struct in_addr *)laddrp;
667 
668 	rdomain = rtable_l2(rdomain);	/* convert passed rtableid to rdomain */
669 	for (inp = LIST_FIRST(INPCBLHASH(table, lport, rdomain)); inp;
670 	    inp = LIST_NEXT(inp, inp_lhash)) {
671 		if (rtable_l2(inp->inp_rtableid) != rdomain)
672 			continue;
673 		if (inp->inp_lport != lport)
674 			continue;
675 		wildcard = 0;
676 #ifdef INET6
677 		if (flags & INPLOOKUP_IPV6) {
678 			struct in6_addr *laddr6 = (struct in6_addr *)laddrp;
679 			struct in6_addr *faddr6 = (struct in6_addr *)faddrp;
680 
681 			if (!(inp->inp_flags & INP_IPV6))
682 				continue;
683 
684 			if (!IN6_IS_ADDR_UNSPECIFIED(&inp->inp_laddr6)) {
685 				if (IN6_IS_ADDR_UNSPECIFIED(laddr6))
686 					wildcard++;
687 				else if (!IN6_ARE_ADDR_EQUAL(&inp->inp_laddr6, laddr6))
688 					continue;
689 			} else {
690 				if (!IN6_IS_ADDR_UNSPECIFIED(laddr6))
691 					wildcard++;
692 			}
693 
694 			if (!IN6_IS_ADDR_UNSPECIFIED(&inp->inp_faddr6)) {
695 				if (IN6_IS_ADDR_UNSPECIFIED(faddr6))
696 					wildcard++;
697 				else if (!IN6_ARE_ADDR_EQUAL(&inp->inp_faddr6,
698 				    faddr6) || inp->inp_fport != fport)
699 					continue;
700 			} else {
701 				if (!IN6_IS_ADDR_UNSPECIFIED(faddr6))
702 					wildcard++;
703 			}
704 		} else
705 #endif /* INET6 */
706 		{
707 #ifdef INET6
708 		        if (inp->inp_flags & INP_IPV6)
709 			        continue;
710 #endif /* INET6 */
711 
712 			if (inp->inp_faddr.s_addr != INADDR_ANY) {
713 				if (faddr.s_addr == INADDR_ANY)
714 					wildcard++;
715 				else if (inp->inp_faddr.s_addr != faddr.s_addr ||
716 				    inp->inp_fport != fport)
717 					continue;
718 			} else {
719 				if (faddr.s_addr != INADDR_ANY)
720 					wildcard++;
721 			}
722 			if (inp->inp_laddr.s_addr != INADDR_ANY) {
723 				if (laddr.s_addr == INADDR_ANY)
724 					wildcard++;
725 				else if (inp->inp_laddr.s_addr != laddr.s_addr)
726 					continue;
727 			} else {
728 				if (laddr.s_addr != INADDR_ANY)
729 					wildcard++;
730 			}
731 		}
732 		if ((!wildcard || (flags & INPLOOKUP_WILDCARD)) &&
733 		    wildcard < matchwild) {
734 			match = inp;
735 			if ((matchwild = wildcard) == 0)
736 				break;
737 		}
738 	}
739 	return (match);
740 }
741 
742 struct rtentry *
743 in_pcbrtentry(struct inpcb *inp)
744 {
745 	struct route *ro;
746 
747 	ro = &inp->inp_route;
748 
749 	/*
750 	 * No route yet, so try to acquire one.
751 	 */
752 	if (ro->ro_rt == NULL) {
753 #ifdef INET6
754 		bzero(ro, sizeof(struct route_in6));
755 #else
756 		bzero(ro, sizeof(struct route));
757 #endif
758 
759 		switch(sotopf(inp->inp_socket)) {
760 #ifdef INET6
761 		case PF_INET6:
762 			if (IN6_IS_ADDR_UNSPECIFIED(&inp->inp_faddr6))
763 				break;
764 			ro->ro_dst.sa_family = AF_INET6;
765 			ro->ro_dst.sa_len = sizeof(struct sockaddr_in6);
766 			((struct sockaddr_in6 *) &ro->ro_dst)->sin6_addr =
767 			    inp->inp_faddr6;
768 			rtalloc_mpath(ro, &inp->inp_laddr6.s6_addr32[0]);
769 			break;
770 #endif /* INET6 */
771 		case PF_INET:
772 			if (inp->inp_faddr.s_addr == INADDR_ANY)
773 				break;
774 			ro->ro_dst.sa_family = AF_INET;
775 			ro->ro_dst.sa_len = sizeof(ro->ro_dst);
776 			ro->ro_tableid = inp->inp_rtableid;
777 			satosin(&ro->ro_dst)->sin_addr = inp->inp_faddr;
778 			rtalloc_mpath(ro, &inp->inp_laddr.s_addr);
779 			break;
780 		}
781 	}
782 	return (ro->ro_rt);
783 }
784 
785 struct sockaddr_in *
786 in_selectsrc(struct sockaddr_in *sin, struct route *ro, int soopts,
787     struct ip_moptions *mopts, int *errorp, u_int rtableid)
788 {
789 	struct sockaddr_in *sin2;
790 	struct in_ifaddr *ia;
791 
792 	ia = (struct in_ifaddr *)0;
793 	/*
794 	 * If the destination address is multicast and an outgoing
795 	 * interface has been set as a multicast option, use the
796 	 * address of that interface as our source address.
797 	 */
798 	if (IN_MULTICAST(sin->sin_addr.s_addr) && mopts != NULL) {
799 		struct ifnet *ifp;
800 
801 		if (mopts->imo_multicast_ifp != NULL) {
802 			ifp = mopts->imo_multicast_ifp;
803 			TAILQ_FOREACH(ia, &in_ifaddr, ia_list)
804 				if (ia->ia_ifp == ifp &&
805 				    rtable_l2(rtableid) == ifp->if_rdomain)
806 					break;
807 			if (ia == 0) {
808 				*errorp = EADDRNOTAVAIL;
809 				return NULL;
810 			}
811 			return satosin(&ia->ia_addr);
812 		}
813 	}
814 	/*
815 	 * If route is known or can be allocated now,
816 	 * our src addr is taken from the i/f, else punt.
817 	 */
818 	if (ro->ro_rt &&
819 	    (satosin(&ro->ro_dst)->sin_addr.s_addr !=
820 		sin->sin_addr.s_addr ||
821 	    soopts & SO_DONTROUTE)) {
822 		RTFREE(ro->ro_rt);
823 		ro->ro_rt = (struct rtentry *)0;
824 	}
825 	if ((soopts & SO_DONTROUTE) == 0 && /*XXX*/
826 	    (ro->ro_rt == (struct rtentry *)0 ||
827 	    ro->ro_rt->rt_ifp == (struct ifnet *)0)) {
828 		/* No route yet, so try to acquire one */
829 		ro->ro_dst.sa_family = AF_INET;
830 		ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
831 		satosin(&ro->ro_dst)->sin_addr = sin->sin_addr;
832 		ro->ro_tableid = rtableid;
833 		rtalloc_mpath(ro, NULL);
834 
835 		/*
836 		 * It is important to bzero out the rest of the
837 		 * struct sockaddr_in when mixing v6 & v4!
838 		 */
839 		sin2 = (struct sockaddr_in *)&ro->ro_dst;
840 		bzero(sin2->sin_zero, sizeof(sin2->sin_zero));
841 	}
842 	/*
843 	 * If we found a route, use the address
844 	 * corresponding to the outgoing interface
845 	 * unless it is the loopback (in case a route
846 	 * to our address on another net goes to loopback).
847 	 */
848 	if (ro->ro_rt && !(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK))
849 		ia = ifatoia(ro->ro_rt->rt_ifa);
850 	if (ia == 0) {
851 		u_int16_t fport = sin->sin_port;
852 
853 		sin->sin_port = 0;
854 		ia = ifatoia(ifa_ifwithdstaddr(sintosa(sin), rtableid));
855 		if (ia == 0)
856 			ia = ifatoia(ifa_ifwithnet(sintosa(sin), rtableid));
857 		sin->sin_port = fport;
858 		if (ia == 0)
859 			ia = TAILQ_FIRST(&in_ifaddr);
860 		if (ia == 0) {
861 			*errorp = EADDRNOTAVAIL;
862 			return NULL;
863 		}
864 	}
865 	return satosin(&ia->ia_addr);
866 }
867 
868 void
869 in_pcbrehash(struct inpcb *inp)
870 {
871 	struct inpcbtable *table = inp->inp_table;
872 	int s;
873 
874 	s = splnet();
875 	LIST_REMOVE(inp, inp_lhash);
876 	LIST_INSERT_HEAD(INPCBLHASH(table, inp->inp_lport, inp->inp_rtableid),
877 	    inp, inp_lhash);
878 	LIST_REMOVE(inp, inp_hash);
879 #ifdef INET6
880 	if (inp->inp_flags & INP_IPV6) {
881 		LIST_INSERT_HEAD(IN6PCBHASH(table, &inp->inp_faddr6,
882 		    inp->inp_fport, &inp->inp_laddr6, inp->inp_lport),
883 		    inp, inp_hash);
884 	} else {
885 #endif /* INET6 */
886 		LIST_INSERT_HEAD(INPCBHASH(table, &inp->inp_faddr,
887 		    inp->inp_fport, &inp->inp_laddr, inp->inp_lport,
888 		    rtable_l2(inp->inp_rtableid)), inp, inp_hash);
889 #ifdef INET6
890 	}
891 #endif /* INET6 */
892 	splx(s);
893 }
894 
895 #ifdef DIAGNOSTIC
896 int	in_pcbnotifymiss = 0;
897 #endif
898 
899 /*
900  * The in(6)_pcbhashlookup functions are used to locate connected sockets
901  * quickly:
902  * 		faddr.fport <-> laddr.lport
903  * No wildcard matching is done so that listening sockets are not found.
904  * If the functions return NULL in(6)_pcblookup_listen can be used to
905  * find a listening/bound socket that may accept the connection.
906  * After those two lookups no other are necessary.
907  */
908 struct inpcb *
909 in_pcbhashlookup(struct inpcbtable *table, struct in_addr faddr,
910     u_int fport_arg, struct in_addr laddr, u_int lport_arg, u_int rdomain)
911 {
912 	struct inpcbhead *head;
913 	struct inpcb *inp;
914 	u_int16_t fport = fport_arg, lport = lport_arg;
915 
916 	rdomain = rtable_l2(rdomain);	/* convert passed rtableid to rdomain */
917 	head = INPCBHASH(table, &faddr, fport, &laddr, lport, rdomain);
918 	LIST_FOREACH(inp, head, inp_hash) {
919 #ifdef INET6
920 		if (inp->inp_flags & INP_IPV6)
921 			continue;	/*XXX*/
922 #endif
923 		if (inp->inp_faddr.s_addr == faddr.s_addr &&
924 		    inp->inp_fport == fport &&
925 		    inp->inp_lport == lport &&
926 		    inp->inp_laddr.s_addr == laddr.s_addr &&
927 		    rtable_l2(inp->inp_rtableid) == rdomain) {
928 			/*
929 			 * Move this PCB to the head of hash chain so that
930 			 * repeated accesses are quicker.  This is analogous to
931 			 * the historic single-entry PCB cache.
932 			 */
933 			if (inp != LIST_FIRST(head)) {
934 				LIST_REMOVE(inp, inp_hash);
935 				LIST_INSERT_HEAD(head, inp, inp_hash);
936 			}
937 			break;
938 		}
939 	}
940 #ifdef DIAGNOSTIC
941 	if (inp == NULL && in_pcbnotifymiss) {
942 		printf("in_pcbhashlookup: faddr=%08x fport=%d laddr=%08x lport=%d rdom=%d\n",
943 		    ntohl(faddr.s_addr), ntohs(fport),
944 		    ntohl(laddr.s_addr), ntohs(lport), rdomain);
945 	}
946 #endif
947 	return (inp);
948 }
949 
950 #ifdef INET6
951 struct inpcb *
952 in6_pcbhashlookup(struct inpcbtable *table, struct in6_addr *faddr,
953     u_int fport_arg, struct in6_addr *laddr, u_int lport_arg)
954 {
955 	struct inpcbhead *head;
956 	struct inpcb *inp;
957 	u_int16_t fport = fport_arg, lport = lport_arg;
958 
959 	head = IN6PCBHASH(table, faddr, fport, laddr, lport);
960 	LIST_FOREACH(inp, head, inp_hash) {
961 		if (!(inp->inp_flags & INP_IPV6))
962 			continue;
963 		if (IN6_ARE_ADDR_EQUAL(&inp->inp_faddr6, faddr) &&
964 		    inp->inp_fport == fport && inp->inp_lport == lport &&
965 		    IN6_ARE_ADDR_EQUAL(&inp->inp_laddr6, laddr)) {
966 			/*
967 			 * Move this PCB to the head of hash chain so that
968 			 * repeated accesses are quicker.  This is analogous to
969 			 * the historic single-entry PCB cache.
970 			 */
971 			if (inp != LIST_FIRST(head)) {
972 				LIST_REMOVE(inp, inp_hash);
973 				LIST_INSERT_HEAD(head, inp, inp_hash);
974 			}
975 			break;
976 		}
977 	}
978 #ifdef DIAGNOSTIC
979 	if (inp == NULL && in_pcbnotifymiss) {
980 		printf("in6_pcbhashlookup: faddr=");
981 		printf(" fport=%d laddr=", ntohs(fport));
982 		printf(" lport=%d\n", ntohs(lport));
983 	}
984 #endif
985 	return (inp);
986 }
987 #endif /* INET6 */
988 
/*
 * The in(6)_pcblookup_listen functions are used to locate listening
 * sockets quickly.  These are sockets with unspecified foreign address
 * and port:
 *		*.*     <-> laddr.lport
 *		*.*     <->     *.lport
 */
996 struct inpcb *
997 in_pcblookup_listen(struct inpcbtable *table, struct in_addr laddr,
998     u_int lport_arg, int reverse, struct mbuf *m, u_int rdomain)
999 {
1000 	struct inpcbhead *head;
1001 	struct in_addr *key1, *key2;
1002 	struct inpcb *inp;
1003 	u_int16_t lport = lport_arg;
1004 
1005 	rdomain = rtable_l2(rdomain);	/* convert passed rtableid to rdomain */
1006 #if NPF > 0
1007 	if (m && m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) {
1008 		struct pf_divert *divert;
1009 
1010 		if ((divert = pf_find_divert(m)) == NULL)
1011 			return (NULL);
1012 		key1 = key2 = &divert->addr.v4;
1013 		lport = divert->port;
1014 	} else
1015 #endif
1016 	if (reverse) {
1017 		key1 = &zeroin_addr;
1018 		key2 = &laddr;
1019 	} else {
1020 		key1 = &laddr;
1021 		key2 = &zeroin_addr;
1022 	}
1023 
1024 	head = INPCBHASH(table, &zeroin_addr, 0, key1, lport, rdomain);
1025 	LIST_FOREACH(inp, head, inp_hash) {
1026 #ifdef INET6
1027 		if (inp->inp_flags & INP_IPV6)
1028 			continue;	/*XXX*/
1029 #endif
1030 		if (inp->inp_lport == lport && inp->inp_fport == 0 &&
1031 		    inp->inp_laddr.s_addr == key1->s_addr &&
1032 		    inp->inp_faddr.s_addr == INADDR_ANY &&
1033 		    rtable_l2(inp->inp_rtableid) == rdomain)
1034 			break;
1035 	}
1036 	if (inp == NULL && key1->s_addr != key2->s_addr) {
1037 		head = INPCBHASH(table, &zeroin_addr, 0, key2, lport, rdomain);
1038 		LIST_FOREACH(inp, head, inp_hash) {
1039 #ifdef INET6
1040 			if (inp->inp_flags & INP_IPV6)
1041 				continue;	/*XXX*/
1042 #endif
1043 			if (inp->inp_lport == lport && inp->inp_fport == 0 &&
1044 			    inp->inp_laddr.s_addr == key2->s_addr &&
1045 			    inp->inp_faddr.s_addr == INADDR_ANY &&
1046 			    rtable_l2(inp->inp_rtableid) == rdomain)
1047 				break;
1048 		}
1049 	}
1050 #ifdef DIAGNOSTIC
1051 	if (inp == NULL && in_pcbnotifymiss) {
1052 		printf("in_pcblookup_listen: laddr=%08x lport=%d\n",
1053 		    ntohl(laddr.s_addr), ntohs(lport));
1054 	}
1055 #endif
1056 	/*
1057 	 * Move this PCB to the head of hash chain so that
1058 	 * repeated accesses are quicker.  This is analogous to
1059 	 * the historic single-entry PCB cache.
1060 	 */
1061 	if (inp != NULL && inp != LIST_FIRST(head)) {
1062 		LIST_REMOVE(inp, inp_hash);
1063 		LIST_INSERT_HEAD(head, inp, inp_hash);
1064 	}
1065 	return (inp);
1066 }
1067 
1068 #ifdef INET6
1069 struct inpcb *
1070 in6_pcblookup_listen(struct inpcbtable *table, struct in6_addr *laddr,
1071     u_int lport_arg, int reverse, struct mbuf *m)
1072 {
1073 	struct inpcbhead *head;
1074 	struct in6_addr *key1, *key2;
1075 	struct inpcb *inp;
1076 	u_int16_t lport = lport_arg;
1077 
1078 #if NPF > 0
1079 	if (m && m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) {
1080 		struct pf_divert *divert;
1081 
1082 		if ((divert = pf_find_divert(m)) == NULL)
1083 			return (NULL);
1084 		key1 = key2 = &divert->addr.v6;
1085 		lport = divert->port;
1086 	} else
1087 #endif
1088 	if (reverse) {
1089 		key1 = &zeroin6_addr;
1090 		key2 = laddr;
1091 	} else {
1092 		key1 = laddr;
1093 		key2 = &zeroin6_addr;
1094 	}
1095 
1096 	head = IN6PCBHASH(table, &zeroin6_addr, 0, key1, lport);
1097 	LIST_FOREACH(inp, head, inp_hash) {
1098 		if (!(inp->inp_flags & INP_IPV6))
1099 			continue;
1100 		if (inp->inp_lport == lport && inp->inp_fport == 0 &&
1101 		    IN6_ARE_ADDR_EQUAL(&inp->inp_laddr6, key1) &&
1102 		    IN6_IS_ADDR_UNSPECIFIED(&inp->inp_faddr6))
1103 			break;
1104 	}
1105 	if (inp == NULL && ! IN6_ARE_ADDR_EQUAL(key1, key2)) {
1106 		head = IN6PCBHASH(table, &zeroin6_addr, 0, key2, lport);
1107 		LIST_FOREACH(inp, head, inp_hash) {
1108 			if (!(inp->inp_flags & INP_IPV6))
1109 				continue;
1110 			if (inp->inp_lport == lport && inp->inp_fport == 0 &&
1111 		    	    IN6_ARE_ADDR_EQUAL(&inp->inp_laddr6, key2) &&
1112 			    IN6_IS_ADDR_UNSPECIFIED(&inp->inp_faddr6))
1113 				break;
1114 		}
1115 	}
1116 #ifdef DIAGNOSTIC
1117 	if (inp == NULL && in_pcbnotifymiss) {
1118 		printf("in6_pcblookup_listen: laddr= lport=%d\n",
1119 		    ntohs(lport));
1120 	}
1121 #endif
1122 	/*
1123 	 * Move this PCB to the head of hash chain so that
1124 	 * repeated accesses are quicker.  This is analogous to
1125 	 * the historic single-entry PCB cache.
1126 	 */
1127 	if (inp != NULL && inp != LIST_FIRST(head)) {
1128 		LIST_REMOVE(inp, inp_hash);
1129 		LIST_INSERT_HEAD(head, inp, inp_hash);
1130 	}
1131 	return (inp);
1132 }
1133 #endif /* INET6 */
1134