1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 /* This file contains all TCP kernel socket related functions. */
27
28 #include <sys/types.h>
29 #include <sys/strlog.h>
30 #include <sys/policy.h>
31 #include <sys/sockio.h>
32 #include <sys/strsubr.h>
33 #include <sys/strsun.h>
34 #include <sys/squeue_impl.h>
35 #include <sys/squeue.h>
36 #define _SUN_TPI_VERSION 2
37 #include <sys/tihdr.h>
38 #include <sys/timod.h>
39 #include <sys/tpicommon.h>
40 #include <sys/socketvar.h>
41
42 #include <inet/common.h>
43 #include <inet/proto_set.h>
44 #include <inet/ip.h>
45 #include <inet/tcp.h>
46 #include <inet/tcp_impl.h>
47
48 static void tcp_activate(sock_lower_handle_t, sock_upper_handle_t,
49 sock_upcalls_t *, int, cred_t *);
50 static int tcp_accept(sock_lower_handle_t, sock_lower_handle_t,
51 sock_upper_handle_t, cred_t *);
52 static int tcp_bind(sock_lower_handle_t, struct sockaddr *,
53 socklen_t, cred_t *);
54 static int tcp_listen(sock_lower_handle_t, int, cred_t *);
55 static int tcp_connect(sock_lower_handle_t, const struct sockaddr *,
56 socklen_t, sock_connid_t *, cred_t *);
57 static int tcp_getsockopt(sock_lower_handle_t, int, int, void *,
58 socklen_t *, cred_t *);
59 static int tcp_setsockopt(sock_lower_handle_t, int, int, const void *,
60 socklen_t, cred_t *);
61 static int tcp_sendmsg(sock_lower_handle_t, mblk_t *, struct nmsghdr *,
62 cred_t *cr);
63 static int tcp_shutdown(sock_lower_handle_t, int, cred_t *);
64 static void tcp_clr_flowctrl(sock_lower_handle_t);
65 static int tcp_ioctl(sock_lower_handle_t, int, intptr_t, int, int32_t *,
66 cred_t *);
67 static int tcp_close(sock_lower_handle_t, int, cred_t *);
68
69 sock_downcalls_t sock_tcp_downcalls = {
70 tcp_activate,
71 tcp_accept,
72 tcp_bind,
73 tcp_listen,
74 tcp_connect,
75 tcp_getpeername,
76 tcp_getsockname,
77 tcp_getsockopt,
78 tcp_setsockopt,
79 tcp_sendmsg,
80 NULL,
81 NULL,
82 NULL,
83 tcp_shutdown,
84 tcp_clr_flowctrl,
85 tcp_ioctl,
86 tcp_close,
87 };
88
89 /* ARGSUSED */
90 static void
tcp_activate(sock_lower_handle_t proto_handle,sock_upper_handle_t sock_handle,sock_upcalls_t * sock_upcalls,int flags,cred_t * cr)91 tcp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle,
92 sock_upcalls_t *sock_upcalls, int flags, cred_t *cr)
93 {
94 conn_t *connp = (conn_t *)proto_handle;
95 struct sock_proto_props sopp;
96 extern struct module_info tcp_rinfo;
97
98 ASSERT(connp->conn_upper_handle == NULL);
99
100 /* All Solaris components should pass a cred for this operation. */
101 ASSERT(cr != NULL);
102
103 sopp.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
104 SOCKOPT_MAXPSZ | SOCKOPT_MAXBLK | SOCKOPT_RCVTIMER |
105 SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ;
106
107 sopp.sopp_rxhiwat = SOCKET_RECVHIWATER;
108 sopp.sopp_rxlowat = SOCKET_RECVLOWATER;
109 sopp.sopp_maxpsz = INFPSZ;
110 sopp.sopp_maxblk = INFPSZ;
111 sopp.sopp_rcvtimer = SOCKET_TIMER_INTERVAL;
112 sopp.sopp_rcvthresh = SOCKET_RECVHIWATER >> 3;
113 sopp.sopp_maxaddrlen = sizeof (sin6_t);
114 sopp.sopp_minpsz = (tcp_rinfo.mi_minpsz == 1) ? 0 :
115 tcp_rinfo.mi_minpsz;
116
117 connp->conn_upcalls = sock_upcalls;
118 connp->conn_upper_handle = sock_handle;
119
120 ASSERT(connp->conn_rcvbuf != 0 &&
121 connp->conn_rcvbuf == connp->conn_tcp->tcp_rwnd);
122 (*sock_upcalls->su_set_proto_props)(sock_handle, &sopp);
123 }
124
125 /*ARGSUSED*/
126 static int
tcp_accept(sock_lower_handle_t lproto_handle,sock_lower_handle_t eproto_handle,sock_upper_handle_t sock_handle,cred_t * cr)127 tcp_accept(sock_lower_handle_t lproto_handle,
128 sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
129 cred_t *cr)
130 {
131 conn_t *lconnp, *econnp;
132 tcp_t *listener, *eager;
133
134 /*
135 * KSSL can move a socket from one listener to another, in which
136 * case `lproto_handle' points to the new listener. To ensure that
137 * the original listener is used the information is obtained from
138 * the eager.
139 */
140 econnp = (conn_t *)eproto_handle;
141 eager = econnp->conn_tcp;
142 ASSERT(IPCL_IS_NONSTR(econnp));
143 ASSERT(eager->tcp_listener != NULL);
144 listener = eager->tcp_listener;
145 lconnp = (conn_t *)listener->tcp_connp;
146 ASSERT(listener->tcp_state == TCPS_LISTEN);
147 ASSERT(lconnp->conn_upper_handle != NULL);
148
149 /*
150 * It is possible for the accept thread to race with the thread that
151 * made the su_newconn upcall in tcp_newconn_notify. Both
152 * tcp_newconn_notify and tcp_accept require that conn_upper_handle
153 * and conn_upcalls be set before returning, so they both write to
154 * them. However, we're guaranteed that the value written is the same
155 * for both threads.
156 */
157 ASSERT(econnp->conn_upper_handle == NULL ||
158 econnp->conn_upper_handle == sock_handle);
159 ASSERT(econnp->conn_upcalls == NULL ||
160 econnp->conn_upcalls == lconnp->conn_upcalls);
161 econnp->conn_upper_handle = sock_handle;
162 econnp->conn_upcalls = lconnp->conn_upcalls;
163
164 ASSERT(econnp->conn_netstack ==
165 listener->tcp_connp->conn_netstack);
166 ASSERT(eager->tcp_tcps == listener->tcp_tcps);
167
168 /*
169 * We should have a minimum of 2 references on the conn at this
170 * point. One for TCP and one for the newconn notification
171 * (which is now taken over by IP). In the normal case we would
172 * also have another reference (making a total of 3) for the conn
173 * being in the classifier hash list. However the eager could have
174 * received an RST subsequently and tcp_closei_local could have
175 * removed the eager from the classifier hash list, hence we can't
176 * assert that reference.
177 */
178 ASSERT(econnp->conn_ref >= 2);
179
180 mutex_enter(&listener->tcp_eager_lock);
181 /*
182 * Non-STREAMS listeners never defer the notification of new
183 * connections.
184 */
185 ASSERT(!listener->tcp_eager_prev_q0->tcp_conn_def_q0);
186 tcp_eager_unlink(eager);
187 mutex_exit(&listener->tcp_eager_lock);
188 CONN_DEC_REF(listener->tcp_connp);
189
190 return ((eager->tcp_state < TCPS_ESTABLISHED) ? ECONNABORTED : 0);
191 }
192
193 static int
tcp_bind(sock_lower_handle_t proto_handle,struct sockaddr * sa,socklen_t len,cred_t * cr)194 tcp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
195 socklen_t len, cred_t *cr)
196 {
197 int error;
198 conn_t *connp = (conn_t *)proto_handle;
199
200 /* All Solaris components should pass a cred for this operation. */
201 ASSERT(cr != NULL);
202 ASSERT(connp->conn_upper_handle != NULL);
203
204 error = squeue_synch_enter(connp, NULL);
205 if (error != 0) {
206 /* failed to enter */
207 return (ENOSR);
208 }
209
210 /* binding to a NULL address really means unbind */
211 if (sa == NULL) {
212 if (connp->conn_tcp->tcp_state < TCPS_LISTEN)
213 error = tcp_do_unbind(connp);
214 else
215 error = EINVAL;
216 } else {
217 error = tcp_do_bind(connp, sa, len, cr, B_TRUE);
218 }
219
220 squeue_synch_exit(connp);
221
222 if (error < 0) {
223 if (error == -TOUTSTATE)
224 error = EINVAL;
225 else
226 error = proto_tlitosyserr(-error);
227 }
228
229 return (error);
230 }
231
232 /* ARGSUSED */
233 static int
tcp_listen(sock_lower_handle_t proto_handle,int backlog,cred_t * cr)234 tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
235 {
236 conn_t *connp = (conn_t *)proto_handle;
237 tcp_t *tcp = connp->conn_tcp;
238 int error;
239
240 ASSERT(connp->conn_upper_handle != NULL);
241
242 /* All Solaris components should pass a cred for this operation. */
243 ASSERT(cr != NULL);
244
245 error = squeue_synch_enter(connp, NULL);
246 if (error != 0) {
247 /* failed to enter */
248 return (ENOBUFS);
249 }
250
251 error = tcp_do_listen(connp, NULL, 0, backlog, cr, B_FALSE);
252 if (error == 0) {
253 /*
254 * sockfs needs to know what's the maximum number of socket
255 * that can be queued on the listener.
256 */
257 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
258 SOCK_OPCTL_ENAB_ACCEPT,
259 (uintptr_t)(tcp->tcp_conn_req_max +
260 tcp->tcp_tcps->tcps_conn_req_max_q0));
261 } else if (error < 0) {
262 if (error == -TOUTSTATE)
263 error = EINVAL;
264 else
265 error = proto_tlitosyserr(-error);
266 }
267 squeue_synch_exit(connp);
268 return (error);
269 }
270
271 static int
tcp_connect(sock_lower_handle_t proto_handle,const struct sockaddr * sa,socklen_t len,sock_connid_t * id,cred_t * cr)272 tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
273 socklen_t len, sock_connid_t *id, cred_t *cr)
274 {
275 conn_t *connp = (conn_t *)proto_handle;
276 int error;
277
278 ASSERT(connp->conn_upper_handle != NULL);
279
280 /* All Solaris components should pass a cred for this operation. */
281 ASSERT(cr != NULL);
282
283 error = proto_verify_ip_addr(connp->conn_family, sa, len);
284 if (error != 0) {
285 return (error);
286 }
287
288 error = squeue_synch_enter(connp, NULL);
289 if (error != 0) {
290 /* failed to enter */
291 return (ENOSR);
292 }
293
294 /*
295 * TCP supports quick connect, so no need to do an implicit bind
296 */
297 error = tcp_do_connect(connp, sa, len, cr, curproc->p_pid);
298 if (error == 0) {
299 *id = connp->conn_tcp->tcp_connid;
300 } else if (error < 0) {
301 if (error == -TOUTSTATE) {
302 switch (connp->conn_tcp->tcp_state) {
303 case TCPS_SYN_SENT:
304 error = EALREADY;
305 break;
306 case TCPS_ESTABLISHED:
307 error = EISCONN;
308 break;
309 case TCPS_LISTEN:
310 error = EOPNOTSUPP;
311 break;
312 default:
313 error = EINVAL;
314 break;
315 }
316 } else {
317 error = proto_tlitosyserr(-error);
318 }
319 }
320
321 if (connp->conn_tcp->tcp_loopback) {
322 struct sock_proto_props sopp;
323
324 sopp.sopp_flags = SOCKOPT_LOOPBACK;
325 sopp.sopp_loopback = B_TRUE;
326
327 (*connp->conn_upcalls->su_set_proto_props)(
328 connp->conn_upper_handle, &sopp);
329 }
330 done:
331 squeue_synch_exit(connp);
332
333 return ((error == 0) ? EINPROGRESS : error);
334 }
335
336 /* ARGSUSED3 */
337 int
tcp_getpeername(sock_lower_handle_t proto_handle,struct sockaddr * addr,socklen_t * addrlenp,cred_t * cr)338 tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr,
339 socklen_t *addrlenp, cred_t *cr)
340 {
341 conn_t *connp = (conn_t *)proto_handle;
342 tcp_t *tcp = connp->conn_tcp;
343
344 /* All Solaris components should pass a cred for this operation. */
345 ASSERT(cr != NULL);
346
347 ASSERT(tcp != NULL);
348 if (tcp->tcp_state < TCPS_SYN_RCVD)
349 return (ENOTCONN);
350
351 return (conn_getpeername(connp, addr, addrlenp));
352 }
353
354 /* ARGSUSED3 */
355 int
tcp_getsockname(sock_lower_handle_t proto_handle,struct sockaddr * addr,socklen_t * addrlenp,cred_t * cr)356 tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr,
357 socklen_t *addrlenp, cred_t *cr)
358 {
359 conn_t *connp = (conn_t *)proto_handle;
360
361 /* All Solaris components should pass a cred for this operation. */
362 ASSERT(cr != NULL);
363
364 return (conn_getsockname(connp, addr, addrlenp));
365 }
366
367 /* returns UNIX error, the optlen is a value-result arg */
368 static int
tcp_getsockopt(sock_lower_handle_t proto_handle,int level,int option_name,void * optvalp,socklen_t * optlen,cred_t * cr)369 tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
370 void *optvalp, socklen_t *optlen, cred_t *cr)
371 {
372 conn_t *connp = (conn_t *)proto_handle;
373 int error;
374 t_uscalar_t max_optbuf_len;
375 void *optvalp_buf;
376 int len;
377
378 ASSERT(connp->conn_upper_handle != NULL);
379
380 error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
381 tcp_opt_obj.odb_opt_des_arr,
382 tcp_opt_obj.odb_opt_arr_cnt,
383 B_FALSE, B_TRUE, cr);
384 if (error != 0) {
385 if (error < 0) {
386 error = proto_tlitosyserr(-error);
387 }
388 return (error);
389 }
390
391 optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
392
393 error = squeue_synch_enter(connp, NULL);
394 if (error == ENOMEM) {
395 kmem_free(optvalp_buf, max_optbuf_len);
396 return (ENOMEM);
397 }
398
399 len = tcp_opt_get(connp, level, option_name, optvalp_buf);
400 squeue_synch_exit(connp);
401
402 if (len == -1) {
403 kmem_free(optvalp_buf, max_optbuf_len);
404 return (EINVAL);
405 }
406
407 /*
408 * update optlen and copy option value
409 */
410 t_uscalar_t size = MIN(len, *optlen);
411
412 bcopy(optvalp_buf, optvalp, size);
413 bcopy(&size, optlen, sizeof (size));
414
415 kmem_free(optvalp_buf, max_optbuf_len);
416 return (0);
417 }
418
419 static int
tcp_setsockopt(sock_lower_handle_t proto_handle,int level,int option_name,const void * optvalp,socklen_t optlen,cred_t * cr)420 tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
421 const void *optvalp, socklen_t optlen, cred_t *cr)
422 {
423 conn_t *connp = (conn_t *)proto_handle;
424 int error;
425
426 ASSERT(connp->conn_upper_handle != NULL);
427 /*
428 * Entering the squeue synchronously can result in a context switch,
429 * which can cause a rather sever performance degradation. So we try to
430 * handle whatever options we can without entering the squeue.
431 */
432 if (level == IPPROTO_TCP) {
433 switch (option_name) {
434 case TCP_NODELAY:
435 if (optlen != sizeof (int32_t))
436 return (EINVAL);
437 mutex_enter(&connp->conn_tcp->tcp_non_sq_lock);
438 connp->conn_tcp->tcp_naglim = *(int *)optvalp ? 1 :
439 connp->conn_tcp->tcp_mss;
440 mutex_exit(&connp->conn_tcp->tcp_non_sq_lock);
441 return (0);
442 default:
443 break;
444 }
445 }
446
447 error = squeue_synch_enter(connp, NULL);
448 if (error == ENOMEM) {
449 return (ENOMEM);
450 }
451
452 error = proto_opt_check(level, option_name, optlen, NULL,
453 tcp_opt_obj.odb_opt_des_arr,
454 tcp_opt_obj.odb_opt_arr_cnt,
455 B_TRUE, B_FALSE, cr);
456
457 if (error != 0) {
458 if (error < 0) {
459 error = proto_tlitosyserr(-error);
460 }
461 squeue_synch_exit(connp);
462 return (error);
463 }
464
465 error = tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name,
466 optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp,
467 NULL, cr);
468 squeue_synch_exit(connp);
469
470 ASSERT(error >= 0);
471
472 return (error);
473 }
474
475 /* ARGSUSED */
476 static int
tcp_sendmsg(sock_lower_handle_t proto_handle,mblk_t * mp,struct nmsghdr * msg,cred_t * cr)477 tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
478 cred_t *cr)
479 {
480 tcp_t *tcp;
481 uint32_t msize;
482 conn_t *connp = (conn_t *)proto_handle;
483 int32_t tcpstate;
484
485 /* All Solaris components should pass a cred for this operation. */
486 ASSERT(cr != NULL);
487
488 ASSERT(connp->conn_ref >= 2);
489 ASSERT(connp->conn_upper_handle != NULL);
490
491 if (msg->msg_controllen != 0) {
492 freemsg(mp);
493 return (EOPNOTSUPP);
494 }
495
496 switch (DB_TYPE(mp)) {
497 case M_DATA:
498 tcp = connp->conn_tcp;
499 ASSERT(tcp != NULL);
500
501 tcpstate = tcp->tcp_state;
502 if (tcpstate < TCPS_ESTABLISHED) {
503 freemsg(mp);
504 /*
505 * We return ENOTCONN if the endpoint is trying to
506 * connect or has never been connected, and EPIPE if it
507 * has been disconnected. The connection id helps us
508 * distinguish between the last two cases.
509 */
510 return ((tcpstate == TCPS_SYN_SENT) ? ENOTCONN :
511 ((tcp->tcp_connid > 0) ? EPIPE : ENOTCONN));
512 } else if (tcpstate > TCPS_CLOSE_WAIT) {
513 freemsg(mp);
514 return (EPIPE);
515 }
516
517 msize = msgdsize(mp);
518
519 mutex_enter(&tcp->tcp_non_sq_lock);
520 tcp->tcp_squeue_bytes += msize;
521 /*
522 * Squeue Flow Control
523 */
524 if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) {
525 tcp_setqfull(tcp);
526 }
527 mutex_exit(&tcp->tcp_non_sq_lock);
528
529 /*
530 * The application may pass in an address in the msghdr, but
531 * we ignore the address on connection-oriented sockets.
532 * Just like BSD this code does not generate an error for
533 * TCP (a CONNREQUIRED socket) when sending to an address
534 * passed in with sendto/sendmsg. Instead the data is
535 * delivered on the connection as if no address had been
536 * supplied.
537 */
538 CONN_INC_REF(connp);
539
540 if (msg->msg_flags & MSG_OOB) {
541 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output_urgent,
542 connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
543 } else {
544 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output,
545 connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
546 }
547
548 return (0);
549
550 default:
551 ASSERT(0);
552 }
553
554 freemsg(mp);
555 return (0);
556 }
557
558 /* ARGSUSED */
559 static int
tcp_shutdown(sock_lower_handle_t proto_handle,int how,cred_t * cr)560 tcp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
561 {
562 conn_t *connp = (conn_t *)proto_handle;
563 tcp_t *tcp = connp->conn_tcp;
564
565 ASSERT(connp->conn_upper_handle != NULL);
566
567 /* All Solaris components should pass a cred for this operation. */
568 ASSERT(cr != NULL);
569
570 /*
571 * X/Open requires that we check the connected state.
572 */
573 if (tcp->tcp_state < TCPS_SYN_SENT)
574 return (ENOTCONN);
575
576 /* shutdown the send side */
577 if (how != SHUT_RD) {
578 mblk_t *bp;
579
580 bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL);
581 CONN_INC_REF(connp);
582 SQUEUE_ENTER_ONE(connp->conn_sqp, bp, tcp_shutdown_output,
583 connp, NULL, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT);
584
585 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
586 SOCK_OPCTL_SHUT_SEND, 0);
587 }
588
589 /* shutdown the recv side */
590 if (how != SHUT_WR)
591 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
592 SOCK_OPCTL_SHUT_RECV, 0);
593
594 return (0);
595 }
596
597 static void
tcp_clr_flowctrl(sock_lower_handle_t proto_handle)598 tcp_clr_flowctrl(sock_lower_handle_t proto_handle)
599 {
600 conn_t *connp = (conn_t *)proto_handle;
601 tcp_t *tcp = connp->conn_tcp;
602 mblk_t *mp;
603 int error;
604
605 ASSERT(connp->conn_upper_handle != NULL);
606
607 /*
608 * If tcp->tcp_rsrv_mp == NULL, it means that tcp_clr_flowctrl()
609 * is currently running.
610 */
611 mutex_enter(&tcp->tcp_rsrv_mp_lock);
612 if ((mp = tcp->tcp_rsrv_mp) == NULL) {
613 mutex_exit(&tcp->tcp_rsrv_mp_lock);
614 return;
615 }
616 tcp->tcp_rsrv_mp = NULL;
617 mutex_exit(&tcp->tcp_rsrv_mp_lock);
618
619 error = squeue_synch_enter(connp, mp);
620 ASSERT(error == 0);
621
622 mutex_enter(&tcp->tcp_rsrv_mp_lock);
623 tcp->tcp_rsrv_mp = mp;
624 mutex_exit(&tcp->tcp_rsrv_mp_lock);
625
626 if (tcp->tcp_fused) {
627 tcp_fuse_backenable(tcp);
628 } else {
629 tcp->tcp_rwnd = connp->conn_rcvbuf;
630 /*
631 * Send back a window update immediately if TCP is above
632 * ESTABLISHED state and the increase of the rcv window
633 * that the other side knows is at least 1 MSS after flow
634 * control is lifted.
635 */
636 if (tcp->tcp_state >= TCPS_ESTABLISHED &&
637 tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
638 tcp_xmit_ctl(NULL, tcp,
639 (tcp->tcp_swnd == 0) ? tcp->tcp_suna :
640 tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
641 }
642 }
643
644 squeue_synch_exit(connp);
645 }
646
647 /* ARGSUSED */
648 static int
tcp_ioctl(sock_lower_handle_t proto_handle,int cmd,intptr_t arg,int mode,int32_t * rvalp,cred_t * cr)649 tcp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
650 int mode, int32_t *rvalp, cred_t *cr)
651 {
652 conn_t *connp = (conn_t *)proto_handle;
653 int error;
654
655 ASSERT(connp->conn_upper_handle != NULL);
656
657 /* All Solaris components should pass a cred for this operation. */
658 ASSERT(cr != NULL);
659
660 /*
661 * If we don't have a helper stream then create one.
662 * ip_create_helper_stream takes care of locking the conn_t,
663 * so this check for NULL is just a performance optimization.
664 */
665 if (connp->conn_helper_info == NULL) {
666 tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps;
667
668 /*
669 * Create a helper stream for non-STREAMS socket.
670 */
671 error = ip_create_helper_stream(connp, tcps->tcps_ldi_ident);
672 if (error != 0) {
673 ip0dbg(("tcp_ioctl: create of IP helper stream "
674 "failed %d\n", error));
675 return (error);
676 }
677 }
678
679 switch (cmd) {
680 case ND_SET:
681 case ND_GET:
682 case _SIOCSOCKFALLBACK:
683 case TCP_IOC_ABORT_CONN:
684 case TI_GETPEERNAME:
685 case TI_GETMYNAME:
686 ip1dbg(("tcp_ioctl: cmd 0x%x on non streams socket",
687 cmd));
688 error = EINVAL;
689 break;
690 default:
691 /*
692 * If the conn is not closing, pass on to IP using
693 * helper stream. Bump the ioctlref to prevent tcp_close
694 * from closing the rq/wq out from underneath the ioctl
695 * if it ends up queued or aborted/interrupted.
696 */
697 mutex_enter(&connp->conn_lock);
698 if (connp->conn_state_flags & (CONN_CLOSING)) {
699 mutex_exit(&connp->conn_lock);
700 error = EINVAL;
701 break;
702 }
703 CONN_INC_IOCTLREF_LOCKED(connp);
704 error = ldi_ioctl(connp->conn_helper_info->iphs_handle,
705 cmd, arg, mode, cr, rvalp);
706 CONN_DEC_IOCTLREF(connp);
707 break;
708 }
709 return (error);
710 }
711
712 /* ARGSUSED */
713 static int
tcp_close(sock_lower_handle_t proto_handle,int flags,cred_t * cr)714 tcp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
715 {
716 conn_t *connp = (conn_t *)proto_handle;
717
718 ASSERT(connp->conn_upper_handle != NULL);
719
720 /* All Solaris components should pass a cred for this operation. */
721 ASSERT(cr != NULL);
722
723 tcp_close_common(connp, flags);
724
725 ip_free_helper_stream(connp);
726
727 /*
728 * Drop IP's reference on the conn. This is the last reference
729 * on the connp if the state was less than established. If the
730 * connection has gone into timewait state, then we will have
731 * one ref for the TCP and one more ref (total of two) for the
732 * classifier connected hash list (a timewait connections stays
733 * in connected hash till closed).
734 *
735 * We can't assert the references because there might be other
736 * transient reference places because of some walkers or queued
737 * packets in squeue for the timewait state.
738 */
739 CONN_DEC_REF(connp);
740
741 /*
742 * EINPROGRESS tells sockfs to wait for a 'closed' upcall before
743 * freeing the socket.
744 */
745 return (EINPROGRESS);
746 }
747
748 /* ARGSUSED */
749 sock_lower_handle_t
tcp_create(int family,int type,int proto,sock_downcalls_t ** sock_downcalls,uint_t * smodep,int * errorp,int flags,cred_t * credp)750 tcp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
751 uint_t *smodep, int *errorp, int flags, cred_t *credp)
752 {
753 conn_t *connp;
754 boolean_t isv6 = family == AF_INET6;
755 if (type != SOCK_STREAM || (family != AF_INET && family != AF_INET6) ||
756 (proto != 0 && proto != IPPROTO_TCP)) {
757 *errorp = EPROTONOSUPPORT;
758 return (NULL);
759 }
760
761 connp = tcp_create_common(credp, isv6, B_TRUE, errorp);
762 if (connp == NULL) {
763 return (NULL);
764 }
765
766 /*
767 * Put the ref for TCP. Ref for IP was already put
768 * by ipcl_conn_create. Also Make the conn_t globally
769 * visible to walkers
770 */
771 mutex_enter(&connp->conn_lock);
772 CONN_INC_REF_LOCKED(connp);
773 ASSERT(connp->conn_ref == 2);
774 connp->conn_state_flags &= ~CONN_INCIPIENT;
775
776 connp->conn_flags |= IPCL_NONSTR;
777 mutex_exit(&connp->conn_lock);
778
779 ASSERT(errorp != NULL);
780 *errorp = 0;
781 *sock_downcalls = &sock_tcp_downcalls;
782 *smodep = SM_CONNREQUIRED | SM_EXDATA | SM_ACCEPTSUPP |
783 SM_SENDFILESUPP;
784
785 return ((sock_lower_handle_t)connp);
786 }
787
/*
 * tcp_fallback
 *
 * A direct socket is falling back to using STREAMS. The queue
 * that is being passed down was created using tcp_open() with
 * the SO_FALLBACK flag set. As a result, the queue is not
 * associated with a conn, and the q_ptrs instead contain the
 * dev and minor arena that should be used.
 *
 * The 'issocket' flag indicates whether the FireEngine
 * optimizations should be used. The common case would be that
 * optimizations are enabled, and they might be subsequently
 * disabled using the _SIOCSOCKFALLBACK ioctl.
 */
802
803 /*
804 * An active connection is falling back to TPI. Gather all the information
805 * required by the STREAM head and TPI sonode and send it up.
806 */
807 static void
tcp_fallback_noneager(tcp_t * tcp,mblk_t * stropt_mp,queue_t * q,boolean_t issocket,so_proto_quiesced_cb_t quiesced_cb,sock_quiesce_arg_t * arg)808 tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q,
809 boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb,
810 sock_quiesce_arg_t *arg)
811 {
812 conn_t *connp = tcp->tcp_connp;
813 struct stroptions *stropt;
814 struct T_capability_ack tca;
815 struct sockaddr_in6 laddr, faddr;
816 socklen_t laddrlen, faddrlen;
817 short opts;
818 int error;
819 mblk_t *mp, *mpnext;
820
821 connp->conn_dev = (dev_t)RD(q)->q_ptr;
822 connp->conn_minor_arena = WR(q)->q_ptr;
823
824 RD(q)->q_ptr = WR(q)->q_ptr = connp;
825
826 connp->conn_rq = RD(q);
827 connp->conn_wq = WR(q);
828
829 WR(q)->q_qinfo = &tcp_sock_winit;
830
831 if (!issocket)
832 tcp_use_pure_tpi(tcp);
833
834 /*
835 * free the helper stream
836 */
837 ip_free_helper_stream(connp);
838
839 /*
840 * Notify the STREAM head about options
841 */
842 DB_TYPE(stropt_mp) = M_SETOPTS;
843 stropt = (struct stroptions *)stropt_mp->b_rptr;
844 stropt_mp->b_wptr += sizeof (struct stroptions);
845 stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
846
847 stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 0 :
848 tcp->tcp_tcps->tcps_wroff_xtra);
849 if (tcp->tcp_snd_sack_ok)
850 stropt->so_wroff += TCPOPT_MAX_SACK_LEN;
851 stropt->so_hiwat = connp->conn_rcvbuf;
852 stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE);
853
854 putnext(RD(q), stropt_mp);
855
856 /*
857 * Collect the information needed to sync with the sonode
858 */
859 tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID);
860
861 laddrlen = faddrlen = sizeof (sin6_t);
862 (void) tcp_getsockname((sock_lower_handle_t)connp,
863 (struct sockaddr *)&laddr, &laddrlen, CRED());
864 error = tcp_getpeername((sock_lower_handle_t)connp,
865 (struct sockaddr *)&faddr, &faddrlen, CRED());
866 if (error != 0)
867 faddrlen = 0;
868
869 opts = 0;
870 if (connp->conn_oobinline)
871 opts |= SO_OOBINLINE;
872 if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
873 opts |= SO_DONTROUTE;
874
875 /*
876 * Notify the socket that the protocol is now quiescent,
877 * and it's therefore safe move data from the socket
878 * to the stream head.
879 */
880 mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca,
881 (struct sockaddr *)&laddr, laddrlen,
882 (struct sockaddr *)&faddr, faddrlen, opts);
883
884 while (mp != NULL) {
885 mpnext = mp->b_next;
886 tcp->tcp_rcv_list = mp->b_next;
887 mp->b_next = NULL;
888 putnext(q, mp);
889 mp = mpnext;
890 }
891 ASSERT(tcp->tcp_rcv_last_head == NULL);
892 ASSERT(tcp->tcp_rcv_last_tail == NULL);
893 ASSERT(tcp->tcp_rcv_cnt == 0);
894
895 /*
896 * All eagers in q0 are marked as being non-STREAM, so they will
897 * make su_newconn upcalls when the handshake completes, which
898 * will fail (resulting in the conn being closed). So we just blow
899 * off everything in q0 instead of waiting for the inevitable.
900 */
901 if (tcp->tcp_conn_req_cnt_q0 != 0)
902 tcp_eager_cleanup(tcp, B_TRUE);
903 }
904
905 /*
906 * An eager is falling back to TPI. All we have to do is send
907 * up a T_CONN_IND.
908 */
909 static void
tcp_fallback_eager(tcp_t * eager,boolean_t issocket,so_proto_quiesced_cb_t quiesced_cb,sock_quiesce_arg_t * arg)910 tcp_fallback_eager(tcp_t *eager, boolean_t issocket,
911 so_proto_quiesced_cb_t quiesced_cb, sock_quiesce_arg_t *arg)
912 {
913 conn_t *connp = eager->tcp_connp;
914 tcp_t *listener = eager->tcp_listener;
915 mblk_t *mp;
916
917 ASSERT(listener != NULL);
918
919 /*
920 * Notify the socket that the protocol is now quiescent,
921 * and it's therefore safe move data from the socket
922 * to tcp's rcv queue.
923 */
924 mp = (*quiesced_cb)(connp->conn_upper_handle, arg, NULL, NULL, 0,
925 NULL, 0, 0);
926
927 if (mp != NULL) {
928 ASSERT(eager->tcp_rcv_cnt == 0);
929
930 eager->tcp_rcv_list = mp;
931 eager->tcp_rcv_cnt = msgdsize(mp);
932 while (mp->b_next != NULL) {
933 mp = mp->b_next;
934 eager->tcp_rcv_cnt += msgdsize(mp);
935 }
936 eager->tcp_rcv_last_head = mp;
937 while (mp->b_cont)
938 mp = mp->b_cont;
939 eager->tcp_rcv_last_tail = mp;
940 if (eager->tcp_rcv_cnt > eager->tcp_rwnd)
941 eager->tcp_rwnd = 0;
942 else
943 eager->tcp_rwnd -= eager->tcp_rcv_cnt;
944 }
945
946 if (!issocket)
947 eager->tcp_issocket = B_FALSE;
948 /*
949 * The stream for this eager does not yet exist, so mark it as
950 * being detached.
951 */
952 eager->tcp_detached = B_TRUE;
953 eager->tcp_hard_binding = B_TRUE;
954 connp->conn_rq = listener->tcp_connp->conn_rq;
955 connp->conn_wq = listener->tcp_connp->conn_wq;
956
957 /* Send up the connection indication */
958 mp = eager->tcp_conn.tcp_eager_conn_ind;
959 ASSERT(mp != NULL);
960 eager->tcp_conn.tcp_eager_conn_ind = NULL;
961
962 /*
963 * TLI/XTI applications will get confused by
964 * sending eager as an option since it violates
965 * the option semantics. So remove the eager as
966 * option since TLI/XTI app doesn't need it anyway.
967 */
968 if (!issocket) {
969 struct T_conn_ind *conn_ind;
970
971 conn_ind = (struct T_conn_ind *)mp->b_rptr;
972 conn_ind->OPT_length = 0;
973 conn_ind->OPT_offset = 0;
974 }
975
976 /*
977 * Sockfs guarantees that the listener will not be closed
978 * during fallback. So we can safely use the listener's queue.
979 */
980 putnext(listener->tcp_connp->conn_rq, mp);
981 }
982
983
984 int
tcp_fallback(sock_lower_handle_t proto_handle,queue_t * q,boolean_t direct_sockfs,so_proto_quiesced_cb_t quiesced_cb,sock_quiesce_arg_t * arg)985 tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
986 boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb,
987 sock_quiesce_arg_t *arg)
988 {
989 tcp_t *tcp;
990 conn_t *connp = (conn_t *)proto_handle;
991 int error;
992 mblk_t *stropt_mp;
993 mblk_t *ordrel_mp;
994
995 tcp = connp->conn_tcp;
996
997 stropt_mp = allocb_wait(sizeof (struct stroptions), BPRI_HI, STR_NOSIG,
998 NULL);
999
1000 /* Pre-allocate the T_ordrel_ind mblk. */
1001 ASSERT(tcp->tcp_ordrel_mp == NULL);
1002 ordrel_mp = allocb_wait(sizeof (struct T_ordrel_ind), BPRI_HI,
1003 STR_NOSIG, NULL);
1004 ordrel_mp->b_datap->db_type = M_PROTO;
1005 ((struct T_ordrel_ind *)ordrel_mp->b_rptr)->PRIM_type = T_ORDREL_IND;
1006 ordrel_mp->b_wptr += sizeof (struct T_ordrel_ind);
1007
1008 /*
1009 * Enter the squeue so that no new packets can come in
1010 */
1011 error = squeue_synch_enter(connp, NULL);
1012 if (error != 0) {
1013 /* failed to enter, free all the pre-allocated messages. */
1014 freeb(stropt_mp);
1015 freeb(ordrel_mp);
1016 return (ENOMEM);
1017 }
1018
1019 /*
1020 * Both endpoints must be of the same type (either STREAMS or
1021 * non-STREAMS) for fusion to be enabled. So if we are fused,
1022 * we have to unfuse.
1023 */
1024 if (tcp->tcp_fused)
1025 tcp_unfuse(tcp);
1026
1027 if (tcp->tcp_listener != NULL) {
1028 /* The eager will deal with opts when accept() is called */
1029 freeb(stropt_mp);
1030 tcp_fallback_eager(tcp, direct_sockfs, quiesced_cb, arg);
1031 } else {
1032 tcp_fallback_noneager(tcp, stropt_mp, q, direct_sockfs,
1033 quiesced_cb, arg);
1034 }
1035
1036 /*
1037 * No longer a direct socket
1038 *
1039 * Note that we intentionally leave the upper_handle and upcalls
1040 * intact, since eagers may still be using them.
1041 */
1042 connp->conn_flags &= ~IPCL_NONSTR;
1043 tcp->tcp_ordrel_mp = ordrel_mp;
1044
1045 /*
1046 * There should be atleast two ref's (IP + TCP)
1047 */
1048 ASSERT(connp->conn_ref >= 2);
1049 squeue_synch_exit(connp);
1050
1051 return (0);
1052 }
1053
1054 /*
1055 * Notifies a non-STREAMS based listener about a new connection. This
1056 * function is executed on the *eager*'s squeue once the 3 way handshake
1057 * has completed. Note that the behavior differs from STREAMS, where the
1058 * T_CONN_IND is sent up by tcp_send_conn_ind() while on the *listener*'s
1059 * squeue.
1060 *
1061 * Returns B_TRUE if the notification succeeded and an upper handle was
1062 * obtained. `tcp' should be closed on failure.
1063 */
1064 boolean_t
tcp_newconn_notify(tcp_t * tcp,ip_recv_attr_t * ira)1065 tcp_newconn_notify(tcp_t *tcp, ip_recv_attr_t *ira)
1066 {
1067 tcp_t *listener = tcp->tcp_listener;
1068 conn_t *lconnp = listener->tcp_connp;
1069 conn_t *econnp = tcp->tcp_connp;
1070 tcp_t *tail;
1071 ipaddr_t *addr_cache;
1072 sock_upper_handle_t upper;
1073 struct sock_proto_props sopp;
1074
1075 mutex_enter(&listener->tcp_eager_lock);
1076 /*
1077 * Take the eager out, if it is in the list of droppable eagers
1078 * as we are here because the 3W handshake is over.
1079 */
1080 MAKE_UNDROPPABLE(tcp);
1081 /*
1082 * The eager already has an extra ref put in tcp_input_data
1083 * so that it stays till accept comes back even though it
1084 * might get into TCPS_CLOSED as a result of a TH_RST etc.
1085 */
1086 ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
1087 listener->tcp_conn_req_cnt_q0--;
1088 listener->tcp_conn_req_cnt_q++;
1089
1090 /* Move from SYN_RCVD to ESTABLISHED list */
1091 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = tcp->tcp_eager_prev_q0;
1092 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp->tcp_eager_next_q0;
1093 tcp->tcp_eager_prev_q0 = NULL;
1094 tcp->tcp_eager_next_q0 = NULL;
1095
1096 /*
1097 * Insert at end of the queue because connections are accepted
1098 * in chronological order. Leaving the older connections at front
1099 * of the queue helps reducing search time.
1100 */
1101 tail = listener->tcp_eager_last_q;
1102 if (tail != NULL)
1103 tail->tcp_eager_next_q = tcp;
1104 else
1105 listener->tcp_eager_next_q = tcp;
1106 listener->tcp_eager_last_q = tcp;
1107 tcp->tcp_eager_next_q = NULL;
1108
1109 /* we have timed out before */
1110 if (tcp->tcp_syn_rcvd_timeout != 0) {
1111 tcp->tcp_syn_rcvd_timeout = 0;
1112 listener->tcp_syn_rcvd_timeout--;
1113 if (listener->tcp_syn_defense &&
1114 listener->tcp_syn_rcvd_timeout <=
1115 (listener->tcp_tcps->tcps_conn_req_max_q0 >> 5) &&
1116 10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() -
1117 listener->tcp_last_rcv_lbolt)) {
1118 /*
1119 * Turn off the defense mode if we
1120 * believe the SYN attack is over.
1121 */
1122 listener->tcp_syn_defense = B_FALSE;
1123 if (listener->tcp_ip_addr_cache) {
1124 kmem_free((void *)listener->tcp_ip_addr_cache,
1125 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
1126 listener->tcp_ip_addr_cache = NULL;
1127 }
1128 }
1129 }
1130 addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache);
1131 if (addr_cache != NULL) {
1132 /*
1133 * We have finished a 3-way handshake with this
1134 * remote host. This proves the IP addr is good.
1135 * Cache it!
1136 */
1137 addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] =
1138 tcp->tcp_connp->conn_faddr_v4;
1139 }
1140 mutex_exit(&listener->tcp_eager_lock);
1141
1142 /*
1143 * Notify the ULP about the newconn. It is guaranteed that no
1144 * tcp_accept() call will be made for the eager if the
1145 * notification fails.
1146 */
1147 if ((upper = (*lconnp->conn_upcalls->su_newconn)
1148 (lconnp->conn_upper_handle, (sock_lower_handle_t)econnp,
1149 &sock_tcp_downcalls, ira->ira_cred, ira->ira_cpid,
1150 &econnp->conn_upcalls)) == NULL) {
1151 return (B_FALSE);
1152 }
1153 econnp->conn_upper_handle = upper;
1154
1155 tcp->tcp_detached = B_FALSE;
1156 tcp->tcp_hard_binding = B_FALSE;
1157 tcp->tcp_tconnind_started = B_TRUE;
1158
1159 if (econnp->conn_keepalive) {
1160 tcp->tcp_ka_last_intrvl = 0;
1161 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
1162 tcp->tcp_ka_interval);
1163 }
1164
1165 /* Update the necessary parameters */
1166 tcp_get_proto_props(tcp, &sopp);
1167
1168 (*econnp->conn_upcalls->su_set_proto_props)
1169 (econnp->conn_upper_handle, &sopp);
1170
1171 return (B_TRUE);
1172 }
1173