1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 #include <sys/types.h>
27 #include <sys/strlog.h>
28 #include <sys/strsun.h>
29 #include <sys/squeue_impl.h>
30 #include <sys/squeue.h>
31 #include <sys/callo.h>
32 #include <sys/strsubr.h>
33
34 #include <inet/common.h>
35 #include <inet/ip.h>
36 #include <inet/ip_ire.h>
37 #include <inet/ip_rts.h>
38 #include <inet/tcp.h>
39 #include <inet/tcp_impl.h>
40
41 /*
42 * Implementation of TCP Timers.
43 * =============================
44 *
45 * INTERFACE:
46 *
 * There are two basic functions and one convenience macro dealing with tcp
 * timers:
48 *
49 * timeout_id_t tcp_timeout(connp, func, time)
50 * clock_t tcp_timeout_cancel(connp, timeout_id)
51 * TCP_TIMER_RESTART(tcp, intvl)
52 *
 * tcp_timeout() starts a timer for the 'tcp' instance arranging to call 'func'
 * after 'time' milliseconds have passed. The function called by timeout() must
 * adhere to the same restrictions as a driver soft interrupt handler - it must
 * not sleep or call other functions that might sleep. The value returned is
 * the opaque non-zero timeout identifier that can be passed to
 * tcp_timeout_cancel() to cancel the request. The call to tcp_timeout() may
 * fail in which case it returns zero. This is different from the timeout(9F)
 * function which never fails.
61 *
62 * The call-back function 'func' always receives 'connp' as its single
63 * argument. It is always executed in the squeue corresponding to the tcp
64 * structure. The tcp structure is guaranteed to be present at the time the
65 * call-back is called.
66 *
67 * NOTE: The call-back function 'func' is never called if tcp is in
68 * the TCPS_CLOSED state.
69 *
 * tcp_timeout_cancel() attempts to cancel a pending tcp_timeout()
 * request. Locks acquired by the call-back routine should not be held across
 * the call to tcp_timeout_cancel() or a deadlock may result.
73 *
74 * tcp_timeout_cancel() returns -1 if it can not cancel the timeout request.
75 * Otherwise, it returns an integer value greater than or equal to 0. In
76 * particular, if the call-back function is already placed on the squeue, it can
77 * not be canceled.
78 *
79 * NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called
80 * within squeue context corresponding to the tcp instance. Since the
81 * call-back is also called via the same squeue, there are no race
82 * conditions described in untimeout(9F) manual page since all calls are
83 * strictly serialized.
84 *
85 * TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout
86 * stored in tcp_timer_tid and starts a new one using
87 * MSEC_TO_TICK(intvl). It always uses tcp_timer() function as a call-back
88 * and stores the return value of tcp_timeout() in the tcp->tcp_timer_tid
89 * field.
90 *
91 * NOTE: since the timeout cancellation is not guaranteed, the cancelled
92 * call-back may still be called, so it is possible tcp_timer() will be
93 * called several times. This should not be a problem since tcp_timer()
94 * should always check the tcp instance state.
95 *
96 *
97 * IMPLEMENTATION:
98 *
 * TCP timers are implemented using a three-stage process. The call to
100 * tcp_timeout() uses timeout(9F) function to call tcp_timer_callback() function
101 * when the timer expires. The tcp_timer_callback() arranges the call of the
102 * tcp_timer_handler() function via squeue corresponding to the tcp
103 * instance. The tcp_timer_handler() calls actual requested timeout call-back
104 * and passes tcp instance as an argument to it. Information is passed between
105 * stages using the tcp_timer_t structure which contains the connp pointer, the
106 * tcp call-back to call and the timeout id returned by the timeout(9F).
107 *
108 * The tcp_timer_t structure is not used directly, it is embedded in an mblk_t -
109 * like structure that is used to enter an squeue. The mp->b_rptr of this pseudo
110 * mblk points to the beginning of tcp_timer_t structure. The tcp_timeout()
111 * returns the pointer to this mblk.
112 *
113 * The pseudo mblk is allocated from a special tcp_timer_cache kmem cache. It
114 * looks like a normal mblk without actual dblk attached to it.
115 *
116 * To optimize performance each tcp instance holds a small cache of timer
117 * mblocks. In the current implementation it caches up to two timer mblocks per
118 * tcp instance. The cache is preserved over tcp frees and is only freed when
119 * the whole tcp structure is destroyed by its kmem destructor. Since all tcp
120 * timer processing happens on a corresponding squeue, the cache manipulation
121 * does not require any locks. Experiments show that majority of timer mblocks
122 * allocations are satisfied from the tcp cache and do not involve kmem calls.
123 *
124 * The tcp_timeout() places a refhold on the connp instance which guarantees
125 * that it will be present at the time the call-back function fires. The
126 * tcp_timer_handler() drops the reference after calling the call-back, so the
127 * call-back function does not need to manipulate the references explicitly.
128 */
129
130 kmem_cache_t *tcp_timercache;
131
132 static void tcp_ip_notify(tcp_t *);
133 static void tcp_timer_callback(void *);
134 static void tcp_timer_free(tcp_t *, mblk_t *);
135 static void tcp_timer_handler(void *, mblk_t *, void *, ip_recv_attr_t *);
136
137 /*
138 * tim is in millisec.
139 */
140 timeout_id_t
tcp_timeout(conn_t * connp,void (* f)(void *),hrtime_t tim)141 tcp_timeout(conn_t *connp, void (*f)(void *), hrtime_t tim)
142 {
143 mblk_t *mp;
144 tcp_timer_t *tcpt;
145 tcp_t *tcp = connp->conn_tcp;
146
147 ASSERT(connp->conn_sqp != NULL);
148
149 TCP_DBGSTAT(tcp->tcp_tcps, tcp_timeout_calls);
150
151 if (tcp->tcp_timercache == NULL) {
152 mp = tcp_timermp_alloc(KM_NOSLEEP | KM_PANIC);
153 } else {
154 TCP_DBGSTAT(tcp->tcp_tcps, tcp_timeout_cached_alloc);
155 mp = tcp->tcp_timercache;
156 tcp->tcp_timercache = mp->b_next;
157 mp->b_next = NULL;
158 ASSERT(mp->b_wptr == NULL);
159 }
160
161 CONN_INC_REF(connp);
162 tcpt = (tcp_timer_t *)mp->b_rptr;
163 tcpt->connp = connp;
164 tcpt->tcpt_proc = f;
165 /*
166 * TCP timers are normal timeouts. Plus, they do not require more than
167 * a 10 millisecond resolution. By choosing a coarser resolution and by
168 * rounding up the expiration to the next resolution boundary, we can
169 * batch timers in the callout subsystem to make TCP timers more
170 * efficient. The roundup also protects short timers from expiring too
171 * early before they have a chance to be cancelled.
172 */
173 tcpt->tcpt_tid = timeout_generic(CALLOUT_NORMAL, tcp_timer_callback, mp,
174 tim * MICROSEC, CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
175
176 return ((timeout_id_t)mp);
177 }
178
179 static void
tcp_timer_callback(void * arg)180 tcp_timer_callback(void *arg)
181 {
182 mblk_t *mp = (mblk_t *)arg;
183 tcp_timer_t *tcpt;
184 conn_t *connp;
185
186 tcpt = (tcp_timer_t *)mp->b_rptr;
187 connp = tcpt->connp;
188 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_timer_handler, connp,
189 NULL, SQ_FILL, SQTAG_TCP_TIMER);
190 }
191
192 /* ARGSUSED */
193 static void
tcp_timer_handler(void * arg,mblk_t * mp,void * arg2,ip_recv_attr_t * dummy)194 tcp_timer_handler(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
195 {
196 tcp_timer_t *tcpt;
197 conn_t *connp = (conn_t *)arg;
198 tcp_t *tcp = connp->conn_tcp;
199
200 tcpt = (tcp_timer_t *)mp->b_rptr;
201 ASSERT(connp == tcpt->connp);
202 ASSERT((squeue_t *)arg2 == connp->conn_sqp);
203
204 /*
205 * If the TCP has reached the closed state, don't proceed any
206 * further. This TCP logically does not exist on the system.
207 * tcpt_proc could for example access queues, that have already
208 * been qprocoff'ed off.
209 */
210 if (tcp->tcp_state != TCPS_CLOSED) {
211 (*tcpt->tcpt_proc)(connp);
212 } else {
213 tcp->tcp_timer_tid = 0;
214 }
215 tcp_timer_free(connp->conn_tcp, mp);
216 }
217
218 /*
219 * There is potential race with untimeout and the handler firing at the same
220 * time. The mblock may be freed by the handler while we are trying to use
221 * it. But since both should execute on the same squeue, this race should not
222 * occur.
223 */
224 clock_t
tcp_timeout_cancel(conn_t * connp,timeout_id_t id)225 tcp_timeout_cancel(conn_t *connp, timeout_id_t id)
226 {
227 mblk_t *mp = (mblk_t *)id;
228 tcp_timer_t *tcpt;
229 clock_t delta;
230
231 TCP_DBGSTAT(connp->conn_tcp->tcp_tcps, tcp_timeout_cancel_reqs);
232
233 if (mp == NULL)
234 return (-1);
235
236 tcpt = (tcp_timer_t *)mp->b_rptr;
237 ASSERT(tcpt->connp == connp);
238
239 delta = untimeout_default(tcpt->tcpt_tid, 0);
240
241 if (delta >= 0) {
242 TCP_DBGSTAT(connp->conn_tcp->tcp_tcps, tcp_timeout_canceled);
243 tcp_timer_free(connp->conn_tcp, mp);
244 CONN_DEC_REF(connp);
245 }
246
247 return (TICK_TO_MSEC(delta));
248 }
249
250 /*
251 * Allocate space for the timer event. The allocation looks like mblk, but it is
252 * not a proper mblk. To avoid confusion we set b_wptr to NULL.
253 *
254 * Dealing with failures: If we can't allocate from the timer cache we try
255 * allocating from dblock caches using allocb_tryhard(). In this case b_wptr
256 * points to b_rptr.
257 * If we can't allocate anything using allocb_tryhard(), we perform a last
258 * attempt and use kmem_alloc_tryhard(). In this case we set b_wptr to -1 and
259 * save the actual allocation size in b_datap.
260 */
261 mblk_t *
tcp_timermp_alloc(int kmflags)262 tcp_timermp_alloc(int kmflags)
263 {
264 mblk_t *mp = (mblk_t *)kmem_cache_alloc(tcp_timercache,
265 kmflags & ~KM_PANIC);
266
267 if (mp != NULL) {
268 mp->b_next = mp->b_prev = NULL;
269 mp->b_rptr = (uchar_t *)(&mp[1]);
270 mp->b_wptr = NULL;
271 mp->b_datap = NULL;
272 mp->b_queue = NULL;
273 mp->b_cont = NULL;
274 } else if (kmflags & KM_PANIC) {
275 /*
276 * Failed to allocate memory for the timer. Try allocating from
277 * dblock caches.
278 */
279 /* ipclassifier calls this from a constructor - hence no tcps */
280 TCP_G_STAT(tcp_timermp_allocfail);
281 mp = allocb_tryhard(sizeof (tcp_timer_t));
282 if (mp == NULL) {
283 size_t size = 0;
284 /*
285 * Memory is really low. Try tryhard allocation.
286 *
287 * ipclassifier calls this from a constructor -
288 * hence no tcps
289 */
290 TCP_G_STAT(tcp_timermp_allocdblfail);
291 mp = kmem_alloc_tryhard(sizeof (mblk_t) +
292 sizeof (tcp_timer_t), &size, kmflags);
293 mp->b_rptr = (uchar_t *)(&mp[1]);
294 mp->b_next = mp->b_prev = NULL;
295 mp->b_wptr = (uchar_t *)-1;
296 mp->b_datap = (dblk_t *)size;
297 mp->b_queue = NULL;
298 mp->b_cont = NULL;
299 }
300 ASSERT(mp->b_wptr != NULL);
301 }
302 /* ipclassifier calls this from a constructor - hence no tcps */
303 TCP_G_DBGSTAT(tcp_timermp_alloced);
304
305 return (mp);
306 }
307
308 /*
309 * Free per-tcp timer cache.
310 * It can only contain entries from tcp_timercache.
311 */
312 void
tcp_timermp_free(tcp_t * tcp)313 tcp_timermp_free(tcp_t *tcp)
314 {
315 mblk_t *mp;
316
317 while ((mp = tcp->tcp_timercache) != NULL) {
318 ASSERT(mp->b_wptr == NULL);
319 tcp->tcp_timercache = tcp->tcp_timercache->b_next;
320 kmem_cache_free(tcp_timercache, mp);
321 }
322 }
323
324 /*
325 * Free timer event. Put it on the per-tcp timer cache if there is not too many
326 * events there already (currently at most two events are cached).
327 * If the event is not allocated from the timer cache, free it right away.
328 */
329 static void
tcp_timer_free(tcp_t * tcp,mblk_t * mp)330 tcp_timer_free(tcp_t *tcp, mblk_t *mp)
331 {
332 mblk_t *mp1 = tcp->tcp_timercache;
333
334 if (mp->b_wptr != NULL) {
335 /*
336 * This allocation is not from a timer cache, free it right
337 * away.
338 */
339 if (mp->b_wptr != (uchar_t *)-1)
340 freeb(mp);
341 else
342 kmem_free(mp, (size_t)mp->b_datap);
343 } else if (mp1 == NULL || mp1->b_next == NULL) {
344 /* Cache this timer block for future allocations */
345 mp->b_rptr = (uchar_t *)(&mp[1]);
346 mp->b_next = mp1;
347 tcp->tcp_timercache = mp;
348 } else {
349 kmem_cache_free(tcp_timercache, mp);
350 TCP_DBGSTAT(tcp->tcp_tcps, tcp_timermp_freed);
351 }
352 }
353
354 /*
355 * Stop all TCP timers.
356 */
357 void
tcp_timers_stop(tcp_t * tcp)358 tcp_timers_stop(tcp_t *tcp)
359 {
360 if (tcp->tcp_timer_tid != 0) {
361 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
362 tcp->tcp_timer_tid = 0;
363 }
364 if (tcp->tcp_ka_tid != 0) {
365 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ka_tid);
366 tcp->tcp_ka_tid = 0;
367 }
368 if (tcp->tcp_ack_tid != 0) {
369 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid);
370 tcp->tcp_ack_tid = 0;
371 }
372 if (tcp->tcp_push_tid != 0) {
373 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid);
374 tcp->tcp_push_tid = 0;
375 }
376 if (tcp->tcp_reass_tid != 0) {
377 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_reass_tid);
378 tcp->tcp_reass_tid = 0;
379 }
380 }
381
382 /*
383 * Timer callback routine for keepalive probe. We do a fake resend of
384 * last ACKed byte. Then set a timer using RTO. When the timer expires,
385 * check to see if we have heard anything from the other end for the last
386 * RTO period. If we have, set the timer to expire for another
387 * tcp_keepalive_intrvl and check again. If we have not, set a timer using
388 * RTO << 1 and check again when it expires. Keep exponentially increasing
389 * the timeout if we have not heard from the other side. If for more than
390 * (tcp_ka_interval + tcp_ka_abort_thres) we have not heard anything,
391 * kill the connection unless the keepalive abort threshold is 0. In
392 * that case, we will probe "forever."
393 */
394 void
tcp_keepalive_timer(void * arg)395 tcp_keepalive_timer(void *arg)
396 {
397 mblk_t *mp;
398 conn_t *connp = (conn_t *)arg;
399 tcp_t *tcp = connp->conn_tcp;
400 int32_t firetime;
401 int32_t idletime;
402 int32_t ka_intrvl;
403 tcp_stack_t *tcps = tcp->tcp_tcps;
404
405 tcp->tcp_ka_tid = 0;
406
407 if (tcp->tcp_fused)
408 return;
409
410 TCPS_BUMP_MIB(tcps, tcpTimKeepalive);
411 ka_intrvl = tcp->tcp_ka_interval;
412
413 /*
414 * Keepalive probe should only be sent if the application has not
415 * done a close on the connection.
416 */
417 if (tcp->tcp_state > TCPS_CLOSE_WAIT) {
418 return;
419 }
420 /* Timer fired too early, restart it. */
421 if (tcp->tcp_state < TCPS_ESTABLISHED) {
422 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
423 ka_intrvl);
424 return;
425 }
426
427 idletime = TICK_TO_MSEC(ddi_get_lbolt() - tcp->tcp_last_recv_time);
428 /*
429 * If we have not heard from the other side for a long
430 * time, kill the connection unless the keepalive abort
431 * threshold is 0. In that case, we will probe "forever."
432 */
433 if (tcp->tcp_ka_abort_thres != 0 &&
434 idletime > (ka_intrvl + tcp->tcp_ka_abort_thres)) {
435 TCPS_BUMP_MIB(tcps, tcpTimKeepaliveDrop);
436 (void) tcp_clean_death(tcp, tcp->tcp_client_errno ?
437 tcp->tcp_client_errno : ETIMEDOUT);
438 return;
439 }
440
441 if (tcp->tcp_snxt == tcp->tcp_suna &&
442 idletime >= ka_intrvl) {
443 /* Fake resend of last ACKed byte. */
444 mblk_t *mp1 = allocb(1, BPRI_LO);
445
446 if (mp1 != NULL) {
447 *mp1->b_wptr++ = '\0';
448 mp = tcp_xmit_mp(tcp, mp1, 1, NULL, NULL,
449 tcp->tcp_suna - 1, B_FALSE, NULL, B_TRUE);
450 freeb(mp1);
451 /*
452 * if allocation failed, fall through to start the
453 * timer back.
454 */
455 if (mp != NULL) {
456 tcp_send_data(tcp, mp);
457 TCPS_BUMP_MIB(tcps, tcpTimKeepaliveProbe);
458 if (tcp->tcp_ka_last_intrvl != 0) {
459 int max;
460 /*
461 * We should probe again at least
462 * in ka_intrvl, but not more than
463 * tcp_rto_max.
464 */
465 max = tcp->tcp_rto_max;
466 firetime = MIN(ka_intrvl - 1,
467 tcp->tcp_ka_last_intrvl << 1);
468 if (firetime > max)
469 firetime = max;
470 } else {
471 firetime = tcp->tcp_rto;
472 }
473 tcp->tcp_ka_tid = TCP_TIMER(tcp,
474 tcp_keepalive_timer, firetime);
475 tcp->tcp_ka_last_intrvl = firetime;
476 return;
477 }
478 }
479 } else {
480 tcp->tcp_ka_last_intrvl = 0;
481 }
482
483 /* firetime can be negative if (mp1 == NULL || mp == NULL) */
484 if ((firetime = ka_intrvl - idletime) < 0) {
485 firetime = ka_intrvl;
486 }
487 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer, firetime);
488 }
489
490 void
tcp_reass_timer(void * arg)491 tcp_reass_timer(void *arg)
492 {
493 conn_t *connp = (conn_t *)arg;
494 tcp_t *tcp = connp->conn_tcp;
495
496 tcp->tcp_reass_tid = 0;
497 if (tcp->tcp_reass_head == NULL)
498 return;
499 ASSERT(tcp->tcp_reass_tail != NULL);
500 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
501 tcp_sack_remove(tcp->tcp_sack_list,
502 TCP_REASS_END(tcp->tcp_reass_tail), &tcp->tcp_num_sack_blk);
503 }
504 tcp_close_mpp(&tcp->tcp_reass_head);
505 tcp->tcp_reass_tail = NULL;
506 TCP_STAT(tcp->tcp_tcps, tcp_reass_timeout);
507 }
508
509 /* This function handles the push timeout. */
510 void
tcp_push_timer(void * arg)511 tcp_push_timer(void *arg)
512 {
513 conn_t *connp = (conn_t *)arg;
514 tcp_t *tcp = connp->conn_tcp;
515
516 TCP_DBGSTAT(tcp->tcp_tcps, tcp_push_timer_cnt);
517
518 ASSERT(tcp->tcp_listener == NULL);
519
520 ASSERT(!IPCL_IS_NONSTR(connp));
521
522 tcp->tcp_push_tid = 0;
523
524 if (tcp->tcp_rcv_list != NULL &&
525 tcp_rcv_drain(tcp) == TH_ACK_NEEDED)
526 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
527 }
528
529 /*
530 * This function handles delayed ACK timeout.
531 */
532 void
tcp_ack_timer(void * arg)533 tcp_ack_timer(void *arg)
534 {
535 conn_t *connp = (conn_t *)arg;
536 tcp_t *tcp = connp->conn_tcp;
537 mblk_t *mp;
538 tcp_stack_t *tcps = tcp->tcp_tcps;
539
540 TCP_DBGSTAT(tcps, tcp_ack_timer_cnt);
541
542 tcp->tcp_ack_tid = 0;
543
544 if (tcp->tcp_fused)
545 return;
546
547 /*
548 * Do not send ACK if there is no outstanding unack'ed data.
549 */
550 if (tcp->tcp_rnxt == tcp->tcp_rack) {
551 return;
552 }
553
554 if ((tcp->tcp_rnxt - tcp->tcp_rack) > tcp->tcp_mss) {
555 /*
556 * Make sure we don't allow deferred ACKs to result in
557 * timer-based ACKing. If we have held off an ACK
558 * when there was more than an mss here, and the timer
559 * goes off, we have to worry about the possibility
560 * that the sender isn't doing slow-start, or is out
561 * of step with us for some other reason. We fall
562 * permanently back in the direction of
563 * ACK-every-other-packet as suggested in RFC 1122.
564 */
565 if (tcp->tcp_rack_abs_max > 2)
566 tcp->tcp_rack_abs_max--;
567 tcp->tcp_rack_cur_max = 2;
568 }
569 mp = tcp_ack_mp(tcp);
570
571 if (mp != NULL) {
572 BUMP_LOCAL(tcp->tcp_obsegs);
573 TCPS_BUMP_MIB(tcps, tcpOutAck);
574 TCPS_BUMP_MIB(tcps, tcpOutAckDelayed);
575 tcp_send_data(tcp, mp);
576 }
577 }
578
579 /*
580 * Notify IP that we are having trouble with this connection. IP should
581 * make note so it can potentially use a different IRE.
582 */
583 static void
tcp_ip_notify(tcp_t * tcp)584 tcp_ip_notify(tcp_t *tcp)
585 {
586 conn_t *connp = tcp->tcp_connp;
587 ire_t *ire;
588
589 /*
590 * Note: in the case of source routing we want to blow away the
591 * route to the first source route hop.
592 */
593 ire = connp->conn_ixa->ixa_ire;
594 if (ire != NULL && !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
595 if (ire->ire_ipversion == IPV4_VERSION) {
596 /*
597 * As per RFC 1122, we send an RTM_LOSING to inform
598 * routing protocols.
599 */
600 ip_rts_change(RTM_LOSING, ire->ire_addr,
601 ire->ire_gateway_addr, ire->ire_mask,
602 connp->conn_laddr_v4, 0, 0, 0,
603 (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA),
604 ire->ire_ipst);
605 }
606 (void) ire_no_good(ire);
607 }
608 }
609
610 /*
611 * tcp_timer is the timer service routine. It handles the retransmission,
612 * FIN_WAIT_2 flush, and zero window probe timeout events. It figures out
613 * from the state of the tcp instance what kind of action needs to be done
614 * at the time it is called.
615 */
616 void
tcp_timer(void * arg)617 tcp_timer(void *arg)
618 {
619 mblk_t *mp;
620 clock_t first_threshold;
621 clock_t second_threshold;
622 clock_t ms;
623 uint32_t mss;
624 conn_t *connp = (conn_t *)arg;
625 tcp_t *tcp = connp->conn_tcp;
626 tcp_stack_t *tcps = tcp->tcp_tcps;
627 boolean_t dont_timeout = B_FALSE;
628
629 tcp->tcp_timer_tid = 0;
630
631 if (tcp->tcp_fused)
632 return;
633
634 first_threshold = tcp->tcp_first_timer_threshold;
635 second_threshold = tcp->tcp_second_timer_threshold;
636 switch (tcp->tcp_state) {
637 case TCPS_IDLE:
638 case TCPS_BOUND:
639 case TCPS_LISTEN:
640 return;
641 case TCPS_SYN_RCVD: {
642 tcp_t *listener = tcp->tcp_listener;
643
644 if (tcp->tcp_syn_rcvd_timeout == 0 && (listener != NULL)) {
645 /* it's our first timeout */
646 tcp->tcp_syn_rcvd_timeout = 1;
647 mutex_enter(&listener->tcp_eager_lock);
648 listener->tcp_syn_rcvd_timeout++;
649 if (!tcp->tcp_dontdrop && !tcp->tcp_closemp_used) {
650 /*
651 * Make this eager available for drop if we
652 * need to drop one to accomodate a new
653 * incoming SYN request.
654 */
655 MAKE_DROPPABLE(listener, tcp);
656 }
657 if (!listener->tcp_syn_defense &&
658 (listener->tcp_syn_rcvd_timeout >
659 (tcps->tcps_conn_req_max_q0 >> 2)) &&
660 (tcps->tcps_conn_req_max_q0 > 200)) {
661 /* We may be under attack. Put on a defense. */
662 listener->tcp_syn_defense = B_TRUE;
663 cmn_err(CE_WARN, "High TCP connect timeout "
664 "rate! System (port %d) may be under a "
665 "SYN flood attack!",
666 ntohs(listener->tcp_connp->conn_lport));
667
668 listener->tcp_ip_addr_cache = kmem_zalloc(
669 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t),
670 KM_NOSLEEP);
671 }
672 mutex_exit(&listener->tcp_eager_lock);
673 } else if (listener != NULL) {
674 mutex_enter(&listener->tcp_eager_lock);
675 tcp->tcp_syn_rcvd_timeout++;
676 if (tcp->tcp_syn_rcvd_timeout > 1 &&
677 !tcp->tcp_closemp_used) {
678 /*
679 * This is our second timeout. Put the tcp in
680 * the list of droppable eagers to allow it to
681 * be dropped, if needed. We don't check
682 * whether tcp_dontdrop is set or not to
683 * protect ourselve from a SYN attack where a
684 * remote host can spoof itself as one of the
685 * good IP source and continue to hold
686 * resources too long.
687 */
688 MAKE_DROPPABLE(listener, tcp);
689 }
690 mutex_exit(&listener->tcp_eager_lock);
691 }
692 }
693 /* FALLTHRU */
694 case TCPS_SYN_SENT:
695 first_threshold = tcp->tcp_first_ctimer_threshold;
696 second_threshold = tcp->tcp_second_ctimer_threshold;
697
698 /*
699 * If an app has set the second_threshold to 0, it means that
700 * we need to retransmit forever, unless this is a passive
701 * open. We need to set second_threshold back to a normal
702 * value such that later comparison with it still makes
703 * sense. But we set dont_timeout to B_TRUE so that we will
704 * never time out.
705 */
706 if (second_threshold == 0) {
707 second_threshold = tcps->tcps_ip_abort_linterval;
708 if (tcp->tcp_active_open)
709 dont_timeout = B_TRUE;
710 }
711 break;
712 case TCPS_ESTABLISHED:
713 case TCPS_CLOSE_WAIT:
714 /*
715 * If the end point has not been closed, TCP can retransmit
716 * forever. But if the end point is closed, the normal
717 * timeout applies.
718 */
719 if (second_threshold == 0) {
720 second_threshold = tcps->tcps_ip_abort_linterval;
721 dont_timeout = B_TRUE;
722 }
723 /* FALLTHRU */
724 case TCPS_FIN_WAIT_1:
725 case TCPS_CLOSING:
726 case TCPS_LAST_ACK:
727 /* If we have data to rexmit */
728 if (tcp->tcp_suna != tcp->tcp_snxt) {
729 clock_t time_to_wait;
730
731 TCPS_BUMP_MIB(tcps, tcpTimRetrans);
732 if (!tcp->tcp_xmit_head)
733 break;
734 time_to_wait = ddi_get_lbolt() -
735 (clock_t)tcp->tcp_xmit_head->b_prev;
736 time_to_wait = tcp->tcp_rto -
737 TICK_TO_MSEC(time_to_wait);
738 /*
739 * If the timer fires too early, 1 clock tick earlier,
740 * restart the timer.
741 */
742 if (time_to_wait > msec_per_tick) {
743 TCP_STAT(tcps, tcp_timer_fire_early);
744 TCP_TIMER_RESTART(tcp, time_to_wait);
745 return;
746 }
747 /*
748 * When we probe zero windows, we force the swnd open.
749 * If our peer acks with a closed window swnd will be
750 * set to zero by tcp_rput(). As long as we are
751 * receiving acks tcp_rput will
752 * reset 'tcp_ms_we_have_waited' so as not to trip the
753 * first and second interval actions. NOTE: the timer
754 * interval is allowed to continue its exponential
755 * backoff.
756 */
757 if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) {
758 if (connp->conn_debug) {
759 (void) strlog(TCP_MOD_ID, 0, 1,
760 SL_TRACE, "tcp_timer: zero win");
761 }
762 } else {
763 /*
764 * After retransmission, we need to do
765 * slow start. Set the ssthresh to one
766 * half of current effective window and
767 * cwnd to one MSS. Also reset
768 * tcp_cwnd_cnt.
769 *
770 * Note that if tcp_ssthresh is reduced because
771 * of ECN, do not reduce it again unless it is
772 * already one window of data away (tcp_cwr
773 * should then be cleared) or this is a
774 * timeout for a retransmitted segment.
775 */
776 uint32_t npkt;
777
778 if (!tcp->tcp_cwr || tcp->tcp_rexmit) {
779 npkt = ((tcp->tcp_timer_backoff ?
780 tcp->tcp_cwnd_ssthresh :
781 tcp->tcp_snxt -
782 tcp->tcp_suna) >> 1) / tcp->tcp_mss;
783 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) *
784 tcp->tcp_mss;
785 }
786 tcp->tcp_cwnd = tcp->tcp_mss;
787 tcp->tcp_cwnd_cnt = 0;
788 if (tcp->tcp_ecn_ok) {
789 tcp->tcp_cwr = B_TRUE;
790 tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
791 tcp->tcp_ecn_cwr_sent = B_FALSE;
792 }
793 }
794 break;
795 }
796 /*
797 * We have something to send yet we cannot send. The
798 * reason can be:
799 *
800 * 1. Zero send window: we need to do zero window probe.
801 * 2. Zero cwnd: because of ECN, we need to "clock out
802 * segments.
803 * 3. SWS avoidance: receiver may have shrunk window,
804 * reset our knowledge.
805 *
806 * Note that condition 2 can happen with either 1 or
807 * 3. But 1 and 3 are exclusive.
808 */
809 if (tcp->tcp_unsent != 0) {
810 /*
811 * Should not hold the zero-copy messages for too long.
812 */
813 if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
814 tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
815 tcp->tcp_xmit_head, B_TRUE);
816
817 if (tcp->tcp_cwnd == 0) {
818 /*
819 * Set tcp_cwnd to 1 MSS so that a
820 * new segment can be sent out. We
821 * are "clocking out" new data when
822 * the network is really congested.
823 */
824 ASSERT(tcp->tcp_ecn_ok);
825 tcp->tcp_cwnd = tcp->tcp_mss;
826 }
827 if (tcp->tcp_swnd == 0) {
828 /* Extend window for zero window probe */
829 tcp->tcp_swnd++;
830 tcp->tcp_zero_win_probe = B_TRUE;
831 TCPS_BUMP_MIB(tcps, tcpOutWinProbe);
832 } else {
833 /*
834 * Handle timeout from sender SWS avoidance.
835 * Reset our knowledge of the max send window
836 * since the receiver might have reduced its
837 * receive buffer. Avoid setting tcp_max_swnd
838 * to one since that will essentially disable
839 * the SWS checks.
840 *
841 * Note that since we don't have a SWS
842 * state variable, if the timeout is set
843 * for ECN but not for SWS, this
844 * code will also be executed. This is
845 * fine as tcp_max_swnd is updated
846 * constantly and it will not affect
847 * anything.
848 */
849 tcp->tcp_max_swnd = MAX(tcp->tcp_swnd, 2);
850 }
851 tcp_wput_data(tcp, NULL, B_FALSE);
852 return;
853 }
854 /* Is there a FIN that needs to be to re retransmitted? */
855 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
856 !tcp->tcp_fin_acked)
857 break;
858 /* Nothing to do, return without restarting timer. */
859 TCP_STAT(tcps, tcp_timer_fire_miss);
860 return;
861 case TCPS_FIN_WAIT_2:
862 /*
863 * User closed the TCP endpoint and peer ACK'ed our FIN.
864 * We waited some time for for peer's FIN, but it hasn't
865 * arrived. We flush the connection now to avoid
866 * case where the peer has rebooted.
867 */
868 if (TCP_IS_DETACHED(tcp)) {
869 (void) tcp_clean_death(tcp, 0);
870 } else {
871 TCP_TIMER_RESTART(tcp,
872 tcp->tcp_fin_wait_2_flush_interval);
873 }
874 return;
875 case TCPS_TIME_WAIT:
876 (void) tcp_clean_death(tcp, 0);
877 return;
878 default:
879 if (connp->conn_debug) {
880 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
881 "tcp_timer: strange state (%d) %s",
882 tcp->tcp_state, tcp_display(tcp, NULL,
883 DISP_PORT_ONLY));
884 }
885 return;
886 }
887
888 /*
889 * If the system is under memory pressure or the max number of
890 * connections have been established for the listener, be more
891 * aggressive in aborting connections.
892 */
893 if (tcps->tcps_reclaim || (tcp->tcp_listen_cnt != NULL &&
894 tcp->tcp_listen_cnt->tlc_cnt > tcp->tcp_listen_cnt->tlc_max)) {
895 second_threshold = tcp_early_abort * SECONDS;
896
897 /* We will ignore the never timeout promise in this case... */
898 dont_timeout = B_FALSE;
899 }
900
901 ASSERT(second_threshold != 0);
902
903 if ((ms = tcp->tcp_ms_we_have_waited) > second_threshold) {
904 /*
905 * Should not hold the zero-copy messages for too long.
906 */
907 if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
908 tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
909 tcp->tcp_xmit_head, B_TRUE);
910
911 if (dont_timeout) {
912 /*
913 * Reset tcp_ms_we_have_waited to avoid overflow since
914 * we are going to retransmit forever.
915 */
916 tcp->tcp_ms_we_have_waited = second_threshold;
917 goto timer_rexmit;
918 }
919
920 /*
921 * For zero window probe, we need to send indefinitely,
922 * unless we have not heard from the other side for some
923 * time...
924 */
925 if ((tcp->tcp_zero_win_probe == 0) ||
926 (TICK_TO_MSEC(ddi_get_lbolt() - tcp->tcp_last_recv_time) >
927 second_threshold)) {
928 TCPS_BUMP_MIB(tcps, tcpTimRetransDrop);
929 /*
930 * If TCP is in SYN_RCVD state, send back a
931 * RST|ACK as BSD does. Note that tcp_zero_win_probe
932 * should be zero in TCPS_SYN_RCVD state.
933 */
934 if (tcp->tcp_state == TCPS_SYN_RCVD) {
935 tcp_xmit_ctl("tcp_timer: RST sent on timeout "
936 "in SYN_RCVD",
937 tcp, tcp->tcp_snxt,
938 tcp->tcp_rnxt, TH_RST | TH_ACK);
939 }
940 (void) tcp_clean_death(tcp,
941 tcp->tcp_client_errno ?
942 tcp->tcp_client_errno : ETIMEDOUT);
943 return;
944 } else {
945 /*
946 * If the system is under memory pressure, we also
947 * abort connection in zero window probing.
948 */
949 if (tcps->tcps_reclaim) {
950 (void) tcp_clean_death(tcp,
951 tcp->tcp_client_errno ?
952 tcp->tcp_client_errno : ETIMEDOUT);
953 TCP_STAT(tcps, tcp_zwin_mem_drop);
954 return;
955 }
956 /*
957 * Set tcp_ms_we_have_waited to second_threshold
958 * so that in next timeout, we will do the above
959 * check (ddi_get_lbolt() - tcp_last_recv_time).
960 * This is also to avoid overflow.
961 *
962 * We don't need to decrement tcp_timer_backoff
963 * to avoid overflow because it will be decremented
964 * later if new timeout value is greater than
965 * tcp_rto_max. In the case when tcp_rto_max is
966 * greater than second_threshold, it means that we
967 * will wait longer than second_threshold to send
968 * the next
969 * window probe.
970 */
971 tcp->tcp_ms_we_have_waited = second_threshold;
972 }
973 } else if (ms > first_threshold) {
974 /*
975 * Should not hold the zero-copy messages for too long.
976 */
977 if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
978 tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
979 tcp->tcp_xmit_head, B_TRUE);
980
981 /*
982 * We have been retransmitting for too long... The RTT
983 * we calculated is probably incorrect. Reinitialize it.
984 * Need to compensate for 0 tcp_rtt_sa. Reset
985 * tcp_rtt_update so that we won't accidentally cache a
986 * bad value. But only do this if this is not a zero
987 * window probe.
988 */
989 if (tcp->tcp_rtt_sa != 0 && tcp->tcp_zero_win_probe == 0) {
990 tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) +
991 (tcp->tcp_rtt_sa >> 5);
992 tcp->tcp_rtt_sa = 0;
993 tcp_ip_notify(tcp);
994 tcp->tcp_rtt_update = 0;
995 }
996 }
997
998 timer_rexmit:
999 tcp->tcp_timer_backoff++;
1000 if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
1001 tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) <
1002 tcp->tcp_rto_min) {
1003 /*
1004 * This means the original RTO is tcp_rexmit_interval_min.
1005 * So we will use tcp_rexmit_interval_min as the RTO value
1006 * and do the backoff.
1007 */
1008 ms = tcp->tcp_rto_min << tcp->tcp_timer_backoff;
1009 } else {
1010 ms <<= tcp->tcp_timer_backoff;
1011 }
1012 if (ms > tcp->tcp_rto_max) {
1013 ms = tcp->tcp_rto_max;
1014 /*
1015 * ms is at max, decrement tcp_timer_backoff to avoid
1016 * overflow.
1017 */
1018 tcp->tcp_timer_backoff--;
1019 }
1020 tcp->tcp_ms_we_have_waited += ms;
1021 if (tcp->tcp_zero_win_probe == 0) {
1022 tcp->tcp_rto = ms;
1023 }
1024 TCP_TIMER_RESTART(tcp, ms);
1025 /*
1026 * This is after a timeout and tcp_rto is backed off. Set
1027 * tcp_set_timer to 1 so that next time RTO is updated, we will
1028 * restart the timer with a correct value.
1029 */
1030 tcp->tcp_set_timer = 1;
1031 mss = tcp->tcp_snxt - tcp->tcp_suna;
1032 if (mss > tcp->tcp_mss)
1033 mss = tcp->tcp_mss;
1034 if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0)
1035 mss = tcp->tcp_swnd;
1036
1037 if ((mp = tcp->tcp_xmit_head) != NULL)
1038 mp->b_prev = (mblk_t *)ddi_get_lbolt();
1039 mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss,
1040 B_TRUE);
1041
1042 /*
1043 * When slow start after retransmission begins, start with
1044 * this seq no. tcp_rexmit_max marks the end of special slow
1045 * start phase. tcp_snd_burst controls how many segments
1046 * can be sent because of an ack.
1047 */
1048 tcp->tcp_rexmit_nxt = tcp->tcp_suna;
1049 tcp->tcp_snd_burst = TCP_CWND_SS;
1050 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
1051 (tcp->tcp_unsent == 0)) {
1052 tcp->tcp_rexmit_max = tcp->tcp_fss;
1053 } else {
1054 tcp->tcp_rexmit_max = tcp->tcp_snxt;
1055 }
1056 tcp->tcp_rexmit = B_TRUE;
1057 tcp->tcp_dupack_cnt = 0;
1058
1059 /*
1060 * Remove all rexmit SACK blk to start from fresh.
1061 */
1062 if (tcp->tcp_snd_sack_ok)
1063 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp);
1064 if (mp == NULL) {
1065 return;
1066 }
1067
1068 tcp->tcp_csuna = tcp->tcp_snxt;
1069 TCPS_BUMP_MIB(tcps, tcpRetransSegs);
1070 TCPS_UPDATE_MIB(tcps, tcpRetransBytes, mss);
1071 tcp_send_data(tcp, mp);
1072
1073 }
1074
1075 /*
1076 * Handle lingering timeouts. This function is called when the SO_LINGER timeout
1077 * expires.
1078 */
1079 void
tcp_close_linger_timeout(void * arg)1080 tcp_close_linger_timeout(void *arg)
1081 {
1082 conn_t *connp = (conn_t *)arg;
1083 tcp_t *tcp = connp->conn_tcp;
1084
1085 tcp->tcp_client_errno = ETIMEDOUT;
1086 tcp_stop_lingering(tcp);
1087 }
1088