1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 #include <sys/types.h>
27 #include <sys/stream.h>
28 #include <sys/strsun.h>
29 #include <sys/strsubr.h>
30 #include <sys/stropts.h>
31 #include <sys/strlog.h>
32 #define _SUN_TPI_VERSION 2
33 #include <sys/tihdr.h>
34 #include <sys/suntpi.h>
35 #include <sys/xti_inet.h>
36 #include <sys/policy.h>
37 #include <sys/squeue_impl.h>
38 #include <sys/squeue.h>
39 #include <sys/tsol/tnet.h>
40
41 #include <rpc/pmap_prot.h>
42
43 #include <inet/common.h>
44 #include <inet/ip.h>
45 #include <inet/tcp.h>
46 #include <inet/tcp_impl.h>
47 #include <inet/proto_set.h>
48 #include <inet/ipsec_impl.h>
49
50 /* Setable in /etc/system */
51 /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */
52 static uint32_t tcp_random_anon_port = 1;
53
54 static int tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t,
55 cred_t *cr);
56 static in_port_t tcp_get_next_priv_port(const tcp_t *);
57
/*
 * Hash list insertion routine for tcp_t structures. Each hash bucket
 * contains a list of tcp_t entries, and each entry is bound to a unique
 * port. If there are multiple tcp_t's that are bound to the same port, then
 * one of them will be linked into the hash bucket list, and the rest will
 * hang off of that one entry. For each port, entries bound to a specific IP
 * address will be inserted before those bound to INADDR_ANY.
 */
void
tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock)
{
	tcp_t **tcpp;
	tcp_t *tcpnext;
	tcp_t *tcphash;
	conn_t *connp = tcp->tcp_connp;
	conn_t *connext;

	/*
	 * If this tcp_t is already hashed somewhere, take it out first.
	 * The caller cannot hold the bucket lock in that case because
	 * tcp_bind_hash_remove() acquires the (possibly different)
	 * bucket's lock itself.
	 */
	if (tcp->tcp_ptpbhn != NULL) {
		ASSERT(!caller_holds_lock);
		tcp_bind_hash_remove(tcp);
	}
	tcpp = &tbf->tf_tcp;
	if (!caller_holds_lock) {
		mutex_enter(&tbf->tf_lock);
	} else {
		ASSERT(MUTEX_HELD(&tbf->tf_lock));
	}
	tcphash = tcpp[0];
	tcpnext = NULL;
	if (tcphash != NULL) {
		/* Look for an entry using the same port */
		while ((tcphash = tcpp[0]) != NULL &&
		    connp->conn_lport != tcphash->tcp_connp->conn_lport)
			tcpp = &(tcphash->tcp_bind_hash);

		/* The port was not found, just add to the end */
		if (tcphash == NULL)
			goto insert;

		/*
		 * OK, there already exists an entry bound to the
		 * same port.
		 *
		 * If the new tcp bound to the INADDR_ANY address
		 * and the first one in the list is not bound to
		 * INADDR_ANY we skip all entries until we find the
		 * first one bound to INADDR_ANY.
		 * This makes sure that applications binding to a
		 * specific address get preference over those binding to
		 * INADDR_ANY.
		 */
		tcpnext = tcphash;
		connext = tcpnext->tcp_connp;
		tcphash = NULL;
		if (V6_OR_V4_INADDR_ANY(connp->conn_bound_addr_v6) &&
		    !V6_OR_V4_INADDR_ANY(connext->conn_bound_addr_v6)) {
			/*
			 * Walk the per-port chain (linked through
			 * tcp_bind_hash_port) until the first INADDR_ANY
			 * entry; the new entry is inserted just before it.
			 */
			while ((tcpnext = tcpp[0]) != NULL) {
				connext = tcpnext->tcp_connp;
				if (!V6_OR_V4_INADDR_ANY(
				    connext->conn_bound_addr_v6))
					tcpp = &(tcpnext->tcp_bind_hash_port);
				else
					break;
			}
			if (tcpnext != NULL) {
				tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
				/*
				 * The new entry takes over the successor's
				 * cross-port link (tcp_bind_hash) so that
				 * only the head of each per-port chain is
				 * linked into the bucket list.
				 */
				tcphash = tcpnext->tcp_bind_hash;
				if (tcphash != NULL) {
					tcphash->tcp_ptpbhn =
					    &(tcp->tcp_bind_hash);
					tcpnext->tcp_bind_hash = NULL;
				}
			}
		} else {
			/*
			 * Insert at the head of this port's chain; the new
			 * entry inherits the cross-port link as above.
			 */
			tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
			tcphash = tcpnext->tcp_bind_hash;
			if (tcphash != NULL) {
				tcphash->tcp_ptpbhn =
				    &(tcp->tcp_bind_hash);
				tcpnext->tcp_bind_hash = NULL;
			}
		}
	}
insert:
	tcp->tcp_bind_hash_port = tcpnext;
	tcp->tcp_bind_hash = tcphash;
	tcp->tcp_ptpbhn = tcpp;
	tcpp[0] = tcp;
	if (!caller_holds_lock)
		mutex_exit(&tbf->tf_lock);
}
149
/*
 * Hash list removal routine for tcp_t structures.
 * Unlinks tcp from its bind-hash bucket (no-op if it is not hashed).
 */
void
tcp_bind_hash_remove(tcp_t *tcp)
{
	tcp_t *tcpnext;
	kmutex_t *lockp;
	tcp_stack_t *tcps = tcp->tcp_tcps;
	conn_t *connp = tcp->tcp_connp;

	/* Not on any bind-hash list; nothing to do. */
	if (tcp->tcp_ptpbhn == NULL)
		return;

	/*
	 * Extract the lock pointer in case there are concurrent
	 * hash_remove's for this instance.
	 */
	ASSERT(connp->conn_lport != 0);
	lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH(
	    connp->conn_lport)].tf_lock;

	ASSERT(lockp != NULL);
	mutex_enter(lockp);
	/* Re-check under the lock: a racing remove may have won. */
	if (tcp->tcp_ptpbhn) {
		tcpnext = tcp->tcp_bind_hash_port;
		if (tcpnext != NULL) {
			/*
			 * There are more entries bound to the same port;
			 * promote the next one to head of the per-port
			 * chain and hand it our cross-port link.
			 */
			tcp->tcp_bind_hash_port = NULL;
			tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
			tcpnext->tcp_bind_hash = tcp->tcp_bind_hash;
			if (tcpnext->tcp_bind_hash != NULL) {
				tcpnext->tcp_bind_hash->tcp_ptpbhn =
				    &(tcpnext->tcp_bind_hash);
				tcp->tcp_bind_hash = NULL;
			}
		} else if ((tcpnext = tcp->tcp_bind_hash) != NULL) {
			/* Last entry for this port; splice in the next port. */
			tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
			tcp->tcp_bind_hash = NULL;
		}
		*tcp->tcp_ptpbhn = tcpnext;
		tcp->tcp_ptpbhn = NULL;
	}
	mutex_exit(lockp);
}
194
/*
 * Pick the next usable anonymous port at or after "port" (optionally
 * starting from a random point when "random" is set and the
 * tcp_random_anon_port tunable is non-zero).
 *
 * Don't let port fall into the privileged range.
 * Since the extra privileged ports can be arbitrary we also
 * ensure that we exclude those from consideration.
 * tcp_g_epriv_ports is not sorted thus we loop over it until
 * there are no changes.
 *
 * Note: No locks are held when inspecting tcp_g_*epriv_ports
 * but instead the code relies on:
 * - the fact that the address of the array and its size never changes
 * - the atomic assignment of the elements of the array
 *
 * Returns 0 if there are no more ports available.
 *
 * TS note: skip multilevel ports.
 */
in_port_t
tcp_update_next_port(in_port_t port, const tcp_t *tcp, boolean_t random)
{
	int i;
	boolean_t restart = B_FALSE;
	tcp_stack_t *tcps = tcp->tcp_tcps;

	if (random && tcp_random_anon_port != 0) {
		(void) random_get_pseudo_bytes((uint8_t *)&port,
		    sizeof (in_port_t));
		/*
		 * Unless changed by a sys admin, the smallest anon port
		 * is 32768 and the largest anon port is 65535. It is
		 * very likely (50%) for the random port to be smaller
		 * than the smallest anon port. When that happens,
		 * add port % (anon port range) to the smallest anon
		 * port to get the random port. It should fall into the
		 * valid anon port range.
		 *
		 * NOTE(review): the modulo maps into [smallest,
		 * largest - 1], so the largest anon port itself is never
		 * chosen via this path — presumably an acceptable bias;
		 * confirm if exact uniformity matters.
		 */
		if (port < tcps->tcps_smallest_anon_port) {
			port = tcps->tcps_smallest_anon_port +
			    port % (tcps->tcps_largest_anon_port -
			    tcps->tcps_smallest_anon_port);
		}
	}

retry:
	/* Clamp into the anonymous range, wrapping around at most once. */
	if (port < tcps->tcps_smallest_anon_port)
		port = (in_port_t)tcps->tcps_smallest_anon_port;

	if (port > tcps->tcps_largest_anon_port) {
		if (restart)
			return (0);
		restart = B_TRUE;
		port = (in_port_t)tcps->tcps_smallest_anon_port;
	}

	if (port < tcps->tcps_smallest_nonpriv_port)
		port = (in_port_t)tcps->tcps_smallest_nonpriv_port;

	for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
		if (port == tcps->tcps_g_epriv_ports[i]) {
			port++;
			/*
			 * Re-check that the incremented port is still in
			 * the valid range (and not another epriv port).
			 */
			goto retry;
		}
	}
	/* On labeled systems, skip past multilevel ports as well. */
	if (is_system_labeled() &&
	    (i = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), port,
	    IPPROTO_TCP, B_TRUE)) != 0) {
		port = i;
		goto retry;
	}
	return (port);
}
269
/*
 * Return the next anonymous port in the privileged port range for
 * bind checking. It starts at IPPORT_RESERVED - 1 and goes
 * downwards. This is the same behavior as documented in the userland
 * library call rresvport(3N).
 *
 * Returns 0 when the privileged-anonymous range has been exhausted
 * (after wrapping once).
 *
 * TS note: skip multilevel ports.
 */
static in_port_t
tcp_get_next_priv_port(const tcp_t *tcp)
{
	/*
	 * Shared across all callers in this module; no lock is taken, so
	 * concurrent callers may see the same value — the bind-hash
	 * insertion in tcp_bindi() resolves such collisions.
	 */
	static in_port_t next_priv_port = IPPORT_RESERVED - 1;
	in_port_t nextport;
	boolean_t restart = B_FALSE;
	tcp_stack_t *tcps = tcp->tcp_tcps;
retry:
	if (next_priv_port < tcps->tcps_min_anonpriv_port ||
	    next_priv_port >= IPPORT_RESERVED) {
		next_priv_port = IPPORT_RESERVED - 1;
		if (restart)
			return (0);
		restart = B_TRUE;
	}
	/* On labeled systems, skip over multilevel ports. */
	if (is_system_labeled() &&
	    (nextport = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred),
	    next_priv_port, IPPROTO_TCP, B_FALSE)) != 0) {
		next_priv_port = nextport;
		goto retry;
	}
	return (next_priv_port--);
}
301
302 static int
tcp_bind_select_lport(tcp_t * tcp,in_port_t * requested_port_ptr,boolean_t bind_to_req_port_only,cred_t * cr)303 tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr,
304 boolean_t bind_to_req_port_only, cred_t *cr)
305 {
306 in_port_t mlp_port;
307 mlp_type_t addrtype, mlptype;
308 boolean_t user_specified;
309 in_port_t allocated_port;
310 in_port_t requested_port = *requested_port_ptr;
311 conn_t *connp = tcp->tcp_connp;
312 zone_t *zone;
313 tcp_stack_t *tcps = tcp->tcp_tcps;
314 in6_addr_t v6addr = connp->conn_laddr_v6;
315
316 /*
317 * XXX It's up to the caller to specify bind_to_req_port_only or not.
318 */
319 ASSERT(cr != NULL);
320
321 /*
322 * Get a valid port (within the anonymous range and should not
323 * be a privileged one) to use if the user has not given a port.
324 * If multiple threads are here, they may all start with
325 * with the same initial port. But, it should be fine as long as
326 * tcp_bindi will ensure that no two threads will be assigned
327 * the same port.
328 *
329 * NOTE: XXX If a privileged process asks for an anonymous port, we
330 * still check for ports only in the range > tcp_smallest_non_priv_port,
331 * unless TCP_ANONPRIVBIND option is set.
332 */
333 mlptype = mlptSingle;
334 mlp_port = requested_port;
335 if (requested_port == 0) {
336 requested_port = connp->conn_anon_priv_bind ?
337 tcp_get_next_priv_port(tcp) :
338 tcp_update_next_port(tcps->tcps_next_port_to_try,
339 tcp, B_TRUE);
340 if (requested_port == 0) {
341 return (-TNOADDR);
342 }
343 user_specified = B_FALSE;
344
345 /*
346 * If the user went through one of the RPC interfaces to create
347 * this socket and RPC is MLP in this zone, then give him an
348 * anonymous MLP.
349 */
350 if (connp->conn_anon_mlp && is_system_labeled()) {
351 zone = crgetzone(cr);
352 addrtype = tsol_mlp_addr_type(
353 connp->conn_allzones ? ALL_ZONES : zone->zone_id,
354 IPV6_VERSION, &v6addr,
355 tcps->tcps_netstack->netstack_ip);
356 if (addrtype == mlptSingle) {
357 return (-TNOADDR);
358 }
359 mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
360 PMAPPORT, addrtype);
361 mlp_port = PMAPPORT;
362 }
363 } else {
364 int i;
365 boolean_t priv = B_FALSE;
366
367 /*
368 * If the requested_port is in the well-known privileged range,
369 * verify that the stream was opened by a privileged user.
370 * Note: No locks are held when inspecting tcp_g_*epriv_ports
371 * but instead the code relies on:
372 * - the fact that the address of the array and its size never
373 * changes
374 * - the atomic assignment of the elements of the array
375 */
376 if (requested_port < tcps->tcps_smallest_nonpriv_port) {
377 priv = B_TRUE;
378 } else {
379 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
380 if (requested_port ==
381 tcps->tcps_g_epriv_ports[i]) {
382 priv = B_TRUE;
383 break;
384 }
385 }
386 }
387 if (priv) {
388 if (secpolicy_net_privaddr(cr, requested_port,
389 IPPROTO_TCP) != 0) {
390 if (connp->conn_debug) {
391 (void) strlog(TCP_MOD_ID, 0, 1,
392 SL_ERROR|SL_TRACE,
393 "tcp_bind: no priv for port %d",
394 requested_port);
395 }
396 return (-TACCES);
397 }
398 }
399 user_specified = B_TRUE;
400
401 connp = tcp->tcp_connp;
402 if (is_system_labeled()) {
403 zone = crgetzone(cr);
404 addrtype = tsol_mlp_addr_type(
405 connp->conn_allzones ? ALL_ZONES : zone->zone_id,
406 IPV6_VERSION, &v6addr,
407 tcps->tcps_netstack->netstack_ip);
408 if (addrtype == mlptSingle) {
409 return (-TNOADDR);
410 }
411 mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
412 requested_port, addrtype);
413 }
414 }
415
416 if (mlptype != mlptSingle) {
417 if (secpolicy_net_bindmlp(cr) != 0) {
418 if (connp->conn_debug) {
419 (void) strlog(TCP_MOD_ID, 0, 1,
420 SL_ERROR|SL_TRACE,
421 "tcp_bind: no priv for multilevel port %d",
422 requested_port);
423 }
424 return (-TACCES);
425 }
426
427 /*
428 * If we're specifically binding a shared IP address and the
429 * port is MLP on shared addresses, then check to see if this
430 * zone actually owns the MLP. Reject if not.
431 */
432 if (mlptype == mlptShared && addrtype == mlptShared) {
433 /*
434 * No need to handle exclusive-stack zones since
435 * ALL_ZONES only applies to the shared stack.
436 */
437 zoneid_t mlpzone;
438
439 mlpzone = tsol_mlp_findzone(IPPROTO_TCP,
440 htons(mlp_port));
441 if (connp->conn_zoneid != mlpzone) {
442 if (connp->conn_debug) {
443 (void) strlog(TCP_MOD_ID, 0, 1,
444 SL_ERROR|SL_TRACE,
445 "tcp_bind: attempt to bind port "
446 "%d on shared addr in zone %d "
447 "(should be %d)",
448 mlp_port, connp->conn_zoneid,
449 mlpzone);
450 }
451 return (-TACCES);
452 }
453 }
454
455 if (!user_specified) {
456 int err;
457 err = tsol_mlp_anon(zone, mlptype, connp->conn_proto,
458 requested_port, B_TRUE);
459 if (err != 0) {
460 if (connp->conn_debug) {
461 (void) strlog(TCP_MOD_ID, 0, 1,
462 SL_ERROR|SL_TRACE,
463 "tcp_bind: cannot establish anon "
464 "MLP for port %d",
465 requested_port);
466 }
467 return (err);
468 }
469 connp->conn_anon_port = B_TRUE;
470 }
471 connp->conn_mlp_type = mlptype;
472 }
473
474 allocated_port = tcp_bindi(tcp, requested_port, &v6addr,
475 connp->conn_reuseaddr, B_FALSE, bind_to_req_port_only,
476 user_specified);
477
478 if (allocated_port == 0) {
479 connp->conn_mlp_type = mlptSingle;
480 if (connp->conn_anon_port) {
481 connp->conn_anon_port = B_FALSE;
482 (void) tsol_mlp_anon(zone, mlptype, connp->conn_proto,
483 requested_port, B_FALSE);
484 }
485 if (bind_to_req_port_only) {
486 if (connp->conn_debug) {
487 (void) strlog(TCP_MOD_ID, 0, 1,
488 SL_ERROR|SL_TRACE,
489 "tcp_bind: requested addr busy");
490 }
491 return (-TADDRBUSY);
492 } else {
493 /* If we are out of ports, fail the bind. */
494 if (connp->conn_debug) {
495 (void) strlog(TCP_MOD_ID, 0, 1,
496 SL_ERROR|SL_TRACE,
497 "tcp_bind: out of ports?");
498 }
499 return (-TNOADDR);
500 }
501 }
502
503 /* Pass the allocated port back */
504 *requested_port_ptr = allocated_port;
505 return (0);
506 }
507
/*
 * Check the address and check/pick a local port number.
 *
 * Validates the sockaddr passed to bind(3SOCKET)/T_BIND_REQ, records
 * the bound local address on the conn_t, and delegates port selection
 * to tcp_bind_select_lport().  On port-selection failure the address
 * state set here is rolled back.  Returns 0, a positive errno, or a
 * negative TPI error code.
 */
int
tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
    boolean_t bind_to_req_port_only)
{
	tcp_t	*tcp = connp->conn_tcp;
	sin_t	*sin;
	sin6_t	*sin6;
	in_port_t	requested_port;
	ipaddr_t	v4addr;
	in6_addr_t	v6addr;
	ip_laddr_t	laddr_type = IPVL_UNICAST_UP;	/* INADDR_ANY */
	zoneid_t	zoneid = IPCL_ZONEID(connp);
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint_t		scopeid = 0;
	int		error = 0;
	ip_xmit_attr_t	*ixa = connp->conn_ixa;

	ASSERT((uintptr_t)len <= (uintptr_t)INT_MAX);

	/* Already bound: succeed; past BOUND (e.g. connected): refuse. */
	if (tcp->tcp_state == TCPS_BOUND) {
		return (0);
	} else if (tcp->tcp_state > TCPS_BOUND) {
		if (connp->conn_debug) {
			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
			    "tcp_bind: bad state, %d", tcp->tcp_state);
		}
		return (-TOUTSTATE);
	}

	ASSERT(sa != NULL && len != 0);

	/* The sockaddr must be 32-bit aligned before we dereference it. */
	if (!OK_32PTR((char *)sa)) {
		if (connp->conn_debug) {
			(void) strlog(TCP_MOD_ID, 0, 1,
			    SL_ERROR|SL_TRACE,
			    "tcp_bind: bad address parameter, "
			    "address %p, len %d",
			    (void *)sa, len);
		}
		return (-TPROTO);
	}

	error = proto_verify_ip_addr(connp->conn_family, sa, len);
	if (error != 0) {
		return (error);
	}

	/* Dispatch on the sockaddr size: IPv4 vs. IPv6. */
	switch (len) {
	case sizeof (sin_t):	/* Complete IPv4 address */
		sin = (sin_t *)sa;
		requested_port = ntohs(sin->sin_port);
		v4addr = sin->sin_addr.s_addr;
		IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr);
		if (v4addr != INADDR_ANY) {
			laddr_type = ip_laddr_verify_v4(v4addr, zoneid, ipst,
			    B_FALSE);
		}
		break;

	case sizeof (sin6_t):	/* Complete IPv6 address */
		sin6 = (sin6_t *)sa;
		v6addr = sin6->sin6_addr;
		requested_port = ntohs(sin6->sin6_port);
		if (IN6_IS_ADDR_V4MAPPED(&v6addr)) {
			/* V4-mapped binds are rejected on v6-only sockets. */
			if (connp->conn_ipv6_v6only)
				return (EADDRNOTAVAIL);

			IN6_V4MAPPED_TO_IPADDR(&v6addr, v4addr);
			if (v4addr != INADDR_ANY) {
				laddr_type = ip_laddr_verify_v4(v4addr,
				    zoneid, ipst, B_FALSE);
			}
		} else {
			if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr)) {
				/* Link-local binds carry a scope id. */
				if (IN6_IS_ADDR_LINKSCOPE(&v6addr))
					scopeid = sin6->sin6_scope_id;
				laddr_type = ip_laddr_verify_v6(&v6addr,
				    zoneid, ipst, B_FALSE, scopeid);
			}
		}
		break;

	default:
		if (connp->conn_debug) {
			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
			    "tcp_bind: bad address length, %d", len);
		}
		return (EAFNOSUPPORT);
		/* return (-TBADADDR); */
	}

	/* Is the local address a valid unicast address? */
	if (laddr_type == IPVL_BAD)
		return (EADDRNOTAVAIL);

	connp->conn_bound_addr_v6 = v6addr;
	if (scopeid != 0) {
		ixa->ixa_flags |= IXAF_SCOPEID_SET;
		ixa->ixa_scopeid = scopeid;
		connp->conn_incoming_ifindex = scopeid;
	} else {
		ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
		connp->conn_incoming_ifindex = connp->conn_bound_if;
	}

	connp->conn_laddr_v6 = v6addr;
	connp->conn_saddr_v6 = v6addr;

	/* "bind to this port only" is meaningless when no port was given. */
	bind_to_req_port_only = requested_port != 0 && bind_to_req_port_only;

	error = tcp_bind_select_lport(tcp, &requested_port,
	    bind_to_req_port_only, cr);
	if (error != 0) {
		/* Roll back the address state recorded above. */
		connp->conn_laddr_v6 = ipv6_all_zeros;
		connp->conn_saddr_v6 = ipv6_all_zeros;
		connp->conn_bound_addr_v6 = ipv6_all_zeros;
	}
	return (error);
}
630
/*
 * If the "bind_to_req_port_only" parameter is set, if the requested port
 * number is available, return it.  If not, return 0.
 *
 * If "bind_to_req_port_only" parameter is not set and
 * If the requested port number is available, return it. If not, return
 * the first anonymous port we happen across. If no anonymous ports are
 * available, return 0. addr is the requested local address, if any.
 *
 * In either case, when succeeding update the tcp_t to record the port number
 * and insert it in the bind hash table.
 *
 * Note that TCP over IPv4 and IPv6 sockets can use the same port number
 * without setting SO_REUSEADDR. This is needed so that they
 * can be viewed as two independent transport protocols.
 */
in_port_t
tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
    int reuseaddr, boolean_t quick_connect,
    boolean_t bind_to_req_port_only, boolean_t user_specified)
{
	/* number of times we have run around the loop */
	int count = 0;
	/* maximum number of times to run around the loop */
	int loopmax;
	conn_t *connp = tcp->tcp_connp;
	tcp_stack_t *tcps = tcp->tcp_tcps;

	/*
	 * Lookup for free addresses is done in a loop and "loopmax"
	 * influences how long we spin in the loop
	 */
	if (bind_to_req_port_only) {
		/*
		 * If the requested port is busy, don't bother to look
		 * for a new one. Setting loop maximum count to 1 has
		 * that effect.
		 */
		loopmax = 1;
	} else {
		/*
		 * If the requested port is busy, look for a free one
		 * in the anonymous port range.
		 * Set loopmax appropriately so that one does not look
		 * forever in the case all of the anonymous ports are in use.
		 */
		if (connp->conn_anon_priv_bind) {
			/*
			 * loopmax =
			 * 	(IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1
			 */
			loopmax = IPPORT_RESERVED -
			    tcps->tcps_min_anonpriv_port;
		} else {
			loopmax = (tcps->tcps_largest_anon_port -
			    tcps->tcps_smallest_anon_port + 1);
		}
	}
	do {
		uint16_t	lport;
		tf_t		*tbf;
		tcp_t		*ltcp;
		conn_t		*lconnp;

		lport = htons(port);

		/*
		 * Ensure that the tcp_t is not currently in the bind hash.
		 * Hold the lock on the hash bucket to ensure that
		 * the duplicate check plus the insertion is an atomic
		 * operation.
		 *
		 * This function does an inline lookup on the bind hash list
		 * Make sure that we access only members of tcp_t
		 * and that we don't look at tcp_tcp, since we are not
		 * doing a CONN_INC_REF.
		 */
		tcp_bind_hash_remove(tcp);
		tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)];
		mutex_enter(&tbf->tf_lock);
		/* Find the head of this port's chain in the bucket, if any. */
		for (ltcp = tbf->tf_tcp; ltcp != NULL;
		    ltcp = ltcp->tcp_bind_hash) {
			if (lport == ltcp->tcp_connp->conn_lport)
				break;
		}

		/*
		 * Walk every endpoint bound to this port; "break" means the
		 * port conflicts and cannot be used, "continue" means this
		 * particular binding does not conflict.
		 */
		for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) {
			boolean_t not_socket;
			boolean_t exclbind;

			lconnp = ltcp->tcp_connp;

			/*
			 * On a labeled system, we must treat bindings to ports
			 * on shared IP addresses by sockets with MAC exemption
			 * privilege as being in all zones, as there's
			 * otherwise no way to identify the right receiver.
			 */
			if (!IPCL_BIND_ZONE_MATCH(lconnp, connp))
				continue;

			/*
			 * If TCP_EXCLBIND is set for either the bound or
			 * binding endpoint, the semantics of bind
			 * is changed according to the following.
			 *
			 * spec = specified address (v4 or v6)
			 * unspec = unspecified address (v4 or v6)
			 * A = specified addresses are different for endpoints
			 *
			 * bound	bind to		allowed
			 * -------------------------------------
			 * unspec	unspec		no
			 * unspec	spec		no
			 * spec		unspec		no
			 * spec		spec		yes if A
			 *
			 * For labeled systems, SO_MAC_EXEMPT behaves the same
			 * as TCP_EXCLBIND, except that zoneid is ignored.
			 *
			 * Note:
			 *
			 * 1. Because of TLI semantics, an endpoint can go
			 * back from, say TCP_ESTABLISHED to TCPS_LISTEN or
			 * TCPS_BOUND, depending on whether it is originally
			 * a listener or not. That is why we need to check
			 * for states greater than or equal to TCPS_BOUND
			 * here.
			 *
			 * 2. Ideally, we should only check for state equals
			 * to TCPS_LISTEN. And the following check should be
			 * added.
			 *
			 * if (ltcp->tcp_state == TCPS_LISTEN ||
			 *	!reuseaddr || !lconnp->conn_reuseaddr) {
			 *		...
			 * }
			 *
			 * The semantics will be changed to this. If the
			 * endpoint on the list is in state not equal to
			 * TCPS_LISTEN and both endpoints have SO_REUSEADDR
			 * set, let the bind succeed.
			 *
			 * Because of (1), we cannot do that for TLI
			 * endpoints. But we can do that for socket endpoints.
			 * If in future, we can change this going back
			 * semantics, we can use the above check for TLI also.
			 */
			not_socket = !(TCP_IS_SOCKET(ltcp) &&
			    TCP_IS_SOCKET(tcp));
			exclbind = lconnp->conn_exclbind ||
			    connp->conn_exclbind;

			if ((lconnp->conn_mac_mode != CONN_MAC_DEFAULT) ||
			    (connp->conn_mac_mode != CONN_MAC_DEFAULT) ||
			    (exclbind && (not_socket ||
			    ltcp->tcp_state <= TCPS_ESTABLISHED))) {
				/*
				 * Exclusive-bind rules: conflict unless both
				 * endpoints are bound to distinct specific
				 * addresses.
				 */
				if (V6_OR_V4_INADDR_ANY(
				    lconnp->conn_bound_addr_v6) ||
				    V6_OR_V4_INADDR_ANY(*laddr) ||
				    IN6_ARE_ADDR_EQUAL(laddr,
				    &lconnp->conn_bound_addr_v6)) {
					break;
				}
				continue;
			}

			/*
			 * Check ipversion to allow IPv4 and IPv6 sockets to
			 * have disjoint port number spaces, if *_EXCLBIND
			 * is not set and only if the application binds to a
			 * specific port. We use the same autoassigned port
			 * number space for IPv4 and IPv6 sockets.
			 */
			if (connp->conn_ipversion != lconnp->conn_ipversion &&
			    bind_to_req_port_only)
				continue;

			/*
			 * Ideally, we should make sure that the source
			 * address, remote address, and remote port in the
			 * four tuple for this tcp-connection is unique.
			 * However, trying to find out the local source
			 * address would require too much code duplication
			 * with IP, since IP needs to have that code
			 * to support userland TCP implementations.
			 */
			if (quick_connect &&
			    (ltcp->tcp_state > TCPS_LISTEN) &&
			    ((connp->conn_fport != lconnp->conn_fport) ||
			    !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
			    &lconnp->conn_faddr_v6)))
				continue;

			if (!reuseaddr) {
				/*
				 * No socket option SO_REUSEADDR.
				 * If existing port is bound to
				 * a non-wildcard IP address
				 * and the requesting stream is
				 * bound to a distinct
				 * different IP addresses
				 * (non-wildcard, also), keep
				 * going.
				 */
				if (!V6_OR_V4_INADDR_ANY(*laddr) &&
				    !V6_OR_V4_INADDR_ANY(
				    lconnp->conn_bound_addr_v6) &&
				    !IN6_ARE_ADDR_EQUAL(laddr,
				    &lconnp->conn_bound_addr_v6))
					continue;
				if (ltcp->tcp_state >= TCPS_BOUND) {
					/*
					 * This port is being used and
					 * its state is >= TCPS_BOUND,
					 * so we can't bind to it.
					 */
					break;
				}
			} else {
				/*
				 * socket option SO_REUSEADDR is set on the
				 * binding tcp_t.
				 *
				 * If two streams are bound to
				 * same IP address or both addr
				 * and bound source are wildcards
				 * (INADDR_ANY), we want to stop
				 * searching.
				 * We have found a match of IP source
				 * address and source port, which is
				 * refused regardless of the
				 * SO_REUSEADDR setting, so we break.
				 */
				if (IN6_ARE_ADDR_EQUAL(laddr,
				    &lconnp->conn_bound_addr_v6) &&
				    (ltcp->tcp_state == TCPS_LISTEN ||
				    ltcp->tcp_state == TCPS_BOUND))
					break;
			}
		}
		if (ltcp != NULL) {
			/* The port number is busy */
			mutex_exit(&tbf->tf_lock);
		} else {
			/*
			 * This port is ours. Insert in fanout and mark as
			 * bound to prevent others from getting the port
			 * number.
			 */
			tcp->tcp_state = TCPS_BOUND;
			DTRACE_TCP6(state__change, void, NULL,
			    ip_xmit_attr_t *, connp->conn_ixa,
			    void, NULL, tcp_t *, tcp, void, NULL,
			    int32_t, TCPS_IDLE);

			connp->conn_lport = htons(port);

			ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH(
			    connp->conn_lport)] == tbf);
			/* Caller-holds-lock insert: we own tf_lock here. */
			tcp_bind_hash_insert(tbf, tcp, 1);

			mutex_exit(&tbf->tf_lock);

			/*
			 * We don't want tcp_next_port_to_try to "inherit"
			 * a port number supplied by the user in a bind.
			 */
			if (user_specified)
				return (port);

			/*
			 * This is the only place where tcp_next_port_to_try
			 * is updated. After the update, it may or may not
			 * be in the valid range.
			 */
			if (!connp->conn_anon_priv_bind)
				tcps->tcps_next_port_to_try = port + 1;
			return (port);
		}

		/* Port busy: advance to the next candidate port. */
		if (connp->conn_anon_priv_bind) {
			port = tcp_get_next_priv_port(tcp);
		} else {
			if (count == 0 && user_specified) {
				/*
				 * We may have to return an anonymous port. So
				 * get one to start with.
				 */
				port =
				    tcp_update_next_port(
				    tcps->tcps_next_port_to_try,
				    tcp, B_TRUE);
				user_specified = B_FALSE;
			} else {
				port = tcp_update_next_port(port + 1, tcp,
				    B_FALSE);
			}
		}
		if (port == 0)
			break;

		/*
		 * Don't let this loop run forever in the case where
		 * all of the anonymous ports are in use.
		 */
	} while (++count < loopmax);
	return (0);
}
940