xref: /csrg-svn/sys/netinet/tcp_input.c (revision 5065)
1 /* tcp_input.c 1.30 81/11/24 */
2 
3 #include "../h/param.h"
4 #include "../h/systm.h"
5 #include "../h/mbuf.h"
6 #include "../h/socket.h"
7 #include "../h/socketvar.h"
8 #include "../net/inet.h"
9 #include "../net/inet_pcb.h"
10 #include "../net/inet_systm.h"
11 #include "../net/imp.h"
12 #include "../net/ip.h"
13 #include "../net/ip_var.h"
14 #include "../net/tcp.h"
15 #include "../net/tcp_fsm.h"
16 #include "../net/tcp_var.h"
17 #include "/usr/include/errno.h"
18 
19 int	tcpcksum = 1;		/* nonzero: tcp_input verifies the checksum of each incoming segment */
20 
21 struct	sockaddr_in tcp_sockaddr = { AF_INET };	/* socket address with first member preset to AF_INET; not referenced in the code shown here */
22 
23 /*
24  * TCP input routine, follows pages 65-76 of the
25  * protocol specification dated September, 1981 very closely.
26  */
27 tcp_input(m0)
28 	struct mbuf *m0;
29 {
30 	register struct tcpiphdr *ti;
31 	struct inpcb *inp;
32 	register struct mbuf *m;
33 	int len, tlen, off;
34 	register struct tcpcb *tp;
35 	register int tiflags;
36 	struct socket *so;
37 	seq_t segend;
38 	int acceptable;
39 
40 COUNT(TCP_INPUT);
41 	/*
42 	 * Get ip and tcp header together in first mbuf.
43 	 */
44 	m = m0;
45 	if (m->m_len < sizeof (struct tcpiphdr) &&
46 	    m_pullup(m, sizeof (struct tcpiphdr)) == 0) {
47 		tcpstat.tcps_hdrops++;
48 		goto bad;
49 	}
50 	ti = mtod(m, struct tcpiphdr *);
51 	if (ti->ti_len > sizeof (struct ip))
52 		ip_stripoptions((struct ip *)ti, (char *)0);
53 
54 	/*
55 	 * Checksum extended tcp header and data.
56 	 */
57 	tlen = ((struct ip *)ti)->ip_len;
58 	len = sizeof (struct ip) + tlen;
59 	if (tcpcksum) {
60 		ti->ti_next = ti->ti_prev = 0;
61 		ti->ti_x1 = 0;
62 		ti->ti_len = htons((u_short)tlen);
63 		if ((ti->ti_sum = inet_cksum(m, len)) != 0xffff) {
64 			tcpstat.tcps_badsum++;
65 			printf("tcp cksum %x\n", ti->ti_sum);
66 			goto bad;
67 		}
68 	}
69 
70 	/*
71 	 * Check that tcp offset makes sense,
72 	 * process tcp options and adjust length.
73 	 */
74 	off = ti->ti_off << 2;
75 	if (off < sizeof (struct tcphdr) || off > ti->ti_len) {
76 		tcpstat.tcps_badoff++;
77 		goto bad;
78 	}
79 	ti->ti_len = tlen - off;
80 	/* PROCESS OPTIONS */
81 	tiflags = ti->ti_flags;
82 
83 	/*
84 	 * Locate pcb for segment.
85 	 */
86 	inp = in_pcblookup
87 		(&tcb, ti->ti_src, ti->ti_sport, ti->ti_dst, ti->ti_dport);
88 
89 	/*
90 	 * If the state is CLOSED (i.e., TCB does not exist) then
91 	 * all data in the incoming segment is discarded.  (p. 65).
92 	 */
93 	if (inp == 0)
94 		goto sendreset;
95 	tp = intotcpcb(inp);
96 	if (tp == 0)
97 		goto sendreset;
98 
99 	/*
100 	 * Convert tcp protocol specific fields to host format.
101 	 */
102 	ti->ti_seq = ntohl(ti->ti_seq);
103 	ti->ti_ackno = ntohl((n_long)ti->ti_ackno);
104 	ti->ti_win = ntohs(ti->ti_win);
105 	ti->ti_urp = ntohs(ti->ti_urp);
106 
107 	/*
108 	 * Discard ip header, and do tcp input processing.
109 	 */
110 	off += sizeof (struct ip);
111 	m->m_off += off;
112 	m->m_len -= off;
113 
114 	switch (tp->t_state) {
115 
116 	/*
117 	 * If the state is LISTEN then ignore segment if it contains an RST.
118 	 * If the segment contains an ACK then it is bad and send a RST.
119 	 * If it does not contain a SYN then it is not interesting; drop it.
120 	 * Otherwise initialize tp->rcv_next, and tp->irs, select an initial
121 	 * tp->iss, and send a segment:
122 	 *     <SEQ=ISS><ACK=RCV>NXT><CTL=SYN,ACK>
123 	 * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
124 	 * Fill in remote peer address fields if not previously specified.
125 	 * Enter SYN_RECEIVED state, and process any other fields of this
126 	 * segment in this state.  (p. 65)
127 	 */
128 	case TCPS_LISTEN:
129 		if (tiflags & TH_RST)
130 			goto drop;
131 		if (tiflags & TH_ACK)
132 			goto sendrst;
133 		if ((tiflags & TH_SYN) == 0)
134 			goto drop;
135 		tp->rcv_nxt = ti->ti_seq + 1;
136 		tp->irs = ti->ti_seq;
137 		tp->iss = tcp_selectiss();
138 		tcp_reflect(ti, tp->iss, tp->rcv_next, TH_SYN|TH_ACK);
139 		tp->t_state = TCPS_SYN_RECEIVED;
140 		tiflags &= ~TH_SYN; tiflags |= TH_RST;
141 		if (inp->inp_faddr.s_addr == 0) {
142 			inp->inp_faddr = ti->ti_src;
143 			inp->inp_fport = ti->ti_sport;
144 		}
145 		break;
146 
147 	/*
148 	 * If the state is SYN_SENT:
149 	 *	if seg contains an ACK, but not for our SYN, drop the input.
150 	 *	if seg contains a RST, then drop the connection.
151 	 *	if seg does not contain SYN, then drop it.
152 	 * Otherwise this is an acceptable SYN segment
153 	 *	initialize tp->rcv_nxt and tp->irs
154 	 *	if seg contains ack then advance tp->snd_una
155 	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
156 	 *	arrange for segment to be acked (eventually)
157 	 *	continue processing rest of data/controls, beginning with URG
158 	 */
159 	case TCPS_SYN_SENT:
160 		if ((tiflags & TH_ACK) &&
161 		    (SEQ_LEQ(ti->ti_ack, tp->iss) ||
162 		     SEQ_GT(ti->ti_ack, tp->snd.nxt))) {
163 			tcp_reflect(ti, ti->ti_ack, 0, TH_RST);
164 			goto drop;
165 		}
166 		if (tiflags & TH_RST) {
167 			if (tiflags & TH_ACK)
168 				tcp_drop(tp, ENETRESET);
169 			goto drop;
170 		}
171 		if ((tiflags & TH_SYN) == 0)
172 			goto drop;
173 		tp->rcv_nxt = ti->ti_seq + 1;
174 		tp->irs = ti->ti_seq;
175 		tp->snd_una = ti->ti_seq;
176 		if (SEQ_GT(tp->snd_una, tp->iss)) {
177 			tp->t_state = TCPS_ESTABLISHED;
178 			tp->t_flags |= TF_OWEACK;
179 			goto step6;
180 		}
181 		tp->t_state = TCPS_SYN_RECEIVED;
182 		tcp_reflect(ti, tp->iss, tp->rcv_nxt, TH_SYN|TH_ACK);
183 		break;
184 	}
185 
186 	/*
187 	 * States other than LISTEN or SYN_SENT.
188 	 * First check that at least some bytes of segment are within
189 	 * receive window.
190 	 */
191 	if (tp->rcv_wnd == 0) {
192 		/*
193 		 * If window is closed can only take segments at
194 		 * window edge, and have to drop data and EOL from
195 		 * incoming segments.
196 		 */
197 		if (tp->rcv_nxt != ti->ti_seq)
198 			goto dropafterack;
199 		if (tp->ti_len > 0) {
200 			tp->ti_len = 0;
201 			tp->ti_flags &= ~(TH_EOL|TH_FIN);
202 		}
203 	} else {
204 		/*
205 		 * If segment begins before rcv_next, drop leading
206 		 * data (and SYN); if nothing left, just ack.
207 		 */
208 		if (SEQ_GT(tp->rcv_nxt, ti->ti_seq)) {
209 			tcpseq_t todrop = tp->rcv_nxt - ti->ti_seq;
210 			if (todrop > ti->ti_len)
211 				goto dropafterack;
212 			m_adj(m, todrop);
213 			ti->ti_seq += todrop;
214 			ti->ti_len -= todrop;
215 			ti->ti_flags &= ~TH_SYN;
216 		}
217 		/*
218 		 * If segment ends after window, drop trailing data
219 		 * (and EOL and FIN); if there would be nothing left, just ack.
220 		 */
221 		if (SEQ_GT(ti->ti_seq+ti->ti_len, tp->rcv_nxt+tp->rcv_wnd)) {
222 			tcpseq_t todrop =
223 			     ti->ti_seq+ti->ti_len - (tp->rcv_nxt+tp->rcv_wnd);
224 			if (todrop > ti->ti_len)
225 				goto dropafterack;
226 			m_adj(m, -todrop);
227 			ti->ti_len -= todrop;
228 			ti->ti_flags &= ~(TH_EOL|TH_FIN);
229 		}
230 	}
231 
232 	/*
233 	 * If the RST bit is set examine the state:
234 	 *    SYN_RECEIVED STATE:
235 	 *	If passive open, return to LISTEN state.
236 	 *	If active open, inform user that connection was refused.
237 	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
238 	 *	Inform user that connection was reset, and close tcb.
239 	 *    CLOSING, LAST_ACK, TIME_WAIT STATES
240 	 *	Close the tcb.
241 	 */
242 	if (tiflags&TH_RST) switch (tp->t_state) {
243 
244 	case TCPS_SYN_RECEIVED:
245 		if (inp->inp_socket->so_options & SO_ACCEPTCONN) {
246 			tp->t_state = LISTEN;
247 			inp->inp_fhost->s_addr = 0;
248 			goto drop;
249 		}
250 		tcp_drop(tp, EREFUSED);
251 		goto drop;
252 
253 	case TCPS_ESTABLISHED:
254 	case TCPS_FIN_WAIT_1:
255 	case TCPS_FIN_WAIT_2:
256 	case TCPS_CLOSE_WAIT:
257 		tcp_drop(tp, ECONNRESET);
258 		goto drop;
259 
260 	case TCPS_CLOSING:
261 	case TCPS_LAST_ACK:
262 	case TCPS_TIME_WAIT:
263 		tcp_close(tp);
264 		goto drop;
265 	}
266 
267 	/*
268 	 * If a SYN is in the window, then this is an
269 	 * error and we send an RST and drop the connection.
270 	 */
271 	if (tiflags & TH_SYN) {
272 		tcp_drop(tp, ECONNRESET);
273 		goto sendreset;
274 	}
275 
276 	/*
277 	 * If the ACK bit is off we drop the segment and return.
278 	 */
279 	if ((tiflags & TI_ACK) == 0)
280 		goto drop;
281 
282 	/*
283 	 * Ack processing.
284 	 */
285 	switch (tp->t_state) {
286 
287 	/*
288 	 * In SYN_RECEIVED state if the ack ACKs our SYN then enter
289 	 * ESTABLISHED state and continue processing, othewise
290 	 * send an RST.
291 	 */
292 	case TCPS_SYN_RECEIVED:
293 		if (SEQ_LEQ(tp->snd_una, ti->ti_ack) &&
294 		    SEQ_LEQ(ti->ti_ack, tp->snd_nxt))
295 			tp->t_state = TCPS_ESTABLISHED;
296 		else
297 			goto sendreset;
298 		/* fall into next case, below... */
299 
300 	/*
301 	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
302 	 * ACKs.  If the ack is in the range
303 	 *	tp->snd_una < ti->ti_ack <= tp->snd_nxt
304 	 * then advance tp->snd_una to ti->ti_ack and drop
305 	 * data from the retransmission queue.  If this ACK reflects
306 	 * more up to date window information we update our window information.
307 	 */
308 	case TCPS_ESTABLISHED:
309 	case TCPS_FIN_WAIT_1:
310 	case TCPS_FIN_WAIT_2:
311 	case TCPS_CLOSE_WAIT:
312 	case TCPS_CLOSING:
313 		if (SEQ_LT(ti->ti_ack, tp->snd_una))
314 			break;
315 		if (SEQ_GT(ti->ti_ack, tp->snd_nxt))
316 			goto dropafterack;
317 		sbdrop(&so->so_snd, ti->ti_ack - tp->snd_una);
318 		tp->snd_una = ti->ti_ack;
319 		if (SEQ_LT(tp->snd_wl1, ti->ti_seq) ||
320 		    tp->snd_wl1==ti-ti_seq && SEQ_LEQ(tp->snd_wl2,ti->ti_seq)) {
321 			tp->snd_wnd = ti->ti_win;
322 			tp->snd_wl1 = ti->ti_seq;
323 			tp->snd_wl2 = ti->ti_ack;
324 		}
325 
326 		switch (tp->t_state) {
327 
328 		/*
329 		 * In FIN_WAIT_1 STATE in addition to the processing
330 		 * for the ESTABLISHED state if our FIN is now acknowledged
331 		 * then enter FIN_WAIT_2 and continue processing in that state.
332 		 */
333 		case TCPS_FIN_WAIT_1:
334 			if (tcp_finisacked(tp) == 0)
335 				break;
336 			tp->t_state = TCPS_FIN_WAIT_2;
337 			/* fall into ... */
338 
339 	 	/*
340 		 * In FIN_WAIT_2 STATE in addition to the processing for
341 		 * the ESTABLISHED state allow the user to close when
342 		 * the data has drained.
343 		 */
344 		case TCPS_FIN_WAIT_2:
345 			tcp_usrcanclose(tp);
346 			break;
347 
348 	 	/*
349 		 * In CLOSING STATE in addition to the processing for
350 		 * the ESTABLISHED state if the ACK acknowledges our FIN
351 		 * then enter the TIME-WAIT state, otherwise ignore
352 		 * the segment.
353 		 */
354 		case TCPS_CLOSING:
355 			if (tcp_finisacked(tp))
356 				tp->t_state = TCPS_TIME_WAIT;
357 			break;
358 
359 		/*
360 		 * In LAST_ACK state if our FIN is now acknowledged
361 		 * then enter the TIME_WAIT state, otherwise ignore the
362 		 * segment.
363 		 */
364 		case TCPS_LAST_ACK:
365 			if (tcp_finisacked(tp))
366 				tcp_close(tp);
367 			goto drop;
368 
369 		/*
370 		 * In TIME_WAIT state the only thing that should arrive
371 		 * is a retransmission of the remote FIN.  Acknowledge
372 		 * it and restart the finack timer.
373 		 */
374 		case TCPS_TIME_WAIT:
375 			tp->t_finack = 2 * TCP_MSL;
376 			goto dropafterack;
377 		}
378 
379 step6:
380 	/*
381 	 * If an URG bit is set in the segment and is greater than the
382 	 * current known urgent pointer, then signal the user that the
383 	 * remote side has urgent data.  This should not happen
384 	 * in CLOSE_WAIT, CLOSING, LAST-ACK or TIME_WAIT STATES since
385 	 * a FIN has been received from the remote side.  In these states
386 	 * we ignore the URG.
387 	 */
388 	if ((tiflags & TH_URG) == 0 && (TCPS_RCVDFIN(tp->t_state) == 0) {
389 		if (SEQ_GT(ti->ti_urp, tp->rcv_up) {
390 			tp->rcv_up = ti->ti_urp;
391 			soisurgendata(so);		/* XXX */
392 		}
393 	}
394 
395 	/*
396 	 * Process the segment text, merging it into the TCP sequencing queue,
397 	 * and arranging for acknowledgment of receipt if necessary.
398 	 * This process logically involves adjusting tp->rcv_wnd as data
399 	 * is presented to the user (this happens in tcp_usrreq.c,
400 	 * case PRU_RCVD).  If a FIN has already been received on this
401 	 * connection then we just ignore the text.
402 	 */
403 	if (ti->ti_len) {
404 		if (TCPS_RCVDFIN(tp->t_state))
405 			goto drop;
406 		tiflags = tcp_reass(tp, ti);
407 	else
408 		m_freem(m);
409 
410 	/*
411 	 * If FIN is received then if we haven't received SYN and
412 	 * therefore can't validate drop the segment.  Otherwise ACK
413 	 * the FIN and let the user know that the connection is closing.
414 	 */
415 	if ((tiflags & TH_FIN) && TCPS_HAVERCVDSYN(tp->t_state)) {
416 		tcp_usrclosing(tp);
417 		tp->t_flags |= TF_ACKNOW;
418 		tp->rcv_nxt++;
419 		switch (tp->t_state) {
420 
421 	 	/*
422 		 * In SYN_RECEIVED and ESTABLISHED STATES
423 		 * enter the CLOSE_WAIT state.
424 		 */
425 		case TCPS_SYN_RECEIVED:
426 		case TCPS_ESTABLISHED:
427 			tp->t_state = TCPS_CLOSE_WAIT;
428 			break;
429 
430 	 	/*
431 		 * In FIN_WAIT_1 STATE if our FIN has been acked then
432 		 * enter TIME_WAIT state, starting the associated timer
433 		 * and turning off all other standard timers.
434 	 	 * If FIN has not been acked enter the CLOSING state.
435 		 */
436 		case TCPS_FIN_WAIT_1:
437 			if (tcp_finisacked(tp)) {
438 				tp->t_state = TCPS_TIME_WAIT;
439 				tcp_canceltimers(tp, 0);
440 				tp->t_timer[TCPT_FINACK] = TCPSC_2MSL;
441 			} else
442 				tp->t_state = TCPS_CLOSING;
443 			break;
444 		}
445 
446 	 	/*
447 		 * In FIN_WAIT_2 state enter the TIME_WAIT state,
448 		 * starting the time-wait timer, turning off the other
449 		 * standard timers.
450 		 */
451 		case TCPS_FIN_WAIT_2:
452 			tp->t_state = TCPS_FIN_WAIT_2;
453 			tcp_canceltimers(tp, 0);
454 			tp->t_timer[TCPT_FINACK] = TCPSC_2MSL;
455 			break;
456 
457 		/*
458 		 * In TIME_WAIT state restart the 2 MSL time_wait timer.
459 		 */
460 		case TCPS_TIME_WAIT:
461 			tp->t_timer[TCPT_FINACK] = TCPSC_2MSL;
462 			break;
463 	}
464 	return;
465 dropafterack:
466 	if ((tiflags & TH_RST) == 0)
467 		tcp_reflect(ti, tp->rcv_nxt, tp->snd_nxt, TH_ACK);
468 drop:
469 	m_freem(m);
470 	return;
471 }
472 
473 /*
474  * Insert segment ti into reassembly queue of tcp with
475  * control block tp.  Return TH_FIN if reassembly now includes
476  * a segment with FIN.
477  */
478 tcp_reass(tp, ti, endp)
479 	register struct tcpcb *tp;
480 	register struct tcpiphdr *ti;
481 	int *endp;
482 {
483 	register struct tcpiphdr *q;
484 	int flags = 0;		/* no FIN */
485 	int overage;
486 
487 	/*
488 	 * If no data in this segment may want
489 	 * to move data up to socket structure (if
490 	 * connection is now established).
491 	 */
492 	if (ti->ti_len == 0) {
493 		m_freem(dtom(ti));
494 		goto present;
495 	}
496 
497 	/*
498 	 * Find a segment which begins after this one does.
499 	 */
500 	for (q = tp->seg_next; q != (struct tcpiphdr *)tp;
501 	    q = (struct tcpiphdr *)q->ti_next)
502 		if (SEQ_GT(q->ti_seq, ti->ti_seq))
503 			break;
504 
505 	/*
506 	 * If there is a preceding segment, it may provide some of
507 	 * our data already.  If so, drop the data from the incoming
508 	 * segment.  If it provides all of our data, drop us.
509 	 */
510 	if ((struct tcpiphdr *)q->ti_prev != (struct tcpiphdr *)tp) {
511 		register int i;
512 		q = (struct tcpiphdr *)(q->ti_prev);
513 		/* conversion to int (in i) handles seq wraparound */
514 		i = q->ti_seq + q->ti_len - ti->ti_seq;
515 		if (i > 0) {
516 			if (i >= ti->ti_len)
517 				goto drop;
518 			m_adj(dtom(tp), i);
519 			ti->ti_len -= i;
520 			ti->ti_seq += i;
521 		}
522 		q = (struct tcpiphdr *)(q->ti_next);
523 	}
524 
525 	/*
526 	 * While we overlap succeeding segments trim them or,
527 	 * if they are completely covered, dequeue them.
528 	 */
529 	while (q != (struct tcpiphdr *)tp &&
530 	    SEQ_GT(ti->ti_seq + ti->ti_len, q->ti_seq)) {
531 		register int i = (ti->ti_seq + ti->ti_len) - q->ti_seq;
532 		if (i < q->ti_len) {
533 			q->ti_len -= i;
534 			m_adj(dtom(q), i);
535 			break;
536 		}
537 		q = (struct tcpiphdr *)q->ti_next;
538 		m_freem(dtom(q->ti_prev));
539 		remque(q->ti_prev);
540 	}
541 
542 	/*
543 	 * Stick new segment in its place.
544 	 */
545 	insque(ti, q->ti_prev);
546 	tp->seqcnt += ti->ti_len;
547 
548 	/*
549 	 * Calculate available space and discard segments for
550 	 * which there is too much.
551 	 */
552 	overage =
553 	    (so->so_rcv.sb_cc + tp->seqcnt) - so->so_rcv.sb_hiwat;
554 	if (overage > 0) {
555 		q = tp->seg_prev;
556 		for (;;) {
557 			register int i = MIN(q->ti_len, overage);
558 			overage -= i;
559 			tp->seqcnt -= i;
560 			q->ti_len -= i;
561 			m_adj(dtom(q), -i);
562 			if (q->ti_len)
563 				break;
564 			if (q == ti)
565 				panic("tcp_text dropall");
566 			q = (struct tcpiphdr *)q->ti_prev;
567 			remque(q->ti_next);
568 		}
569 	}
570 
571 	/*
572 	 * Advance rcv_next through newly completed sequence space.
573 	 */
574 	while (ti->ti_seq == tp->rcv_nxt) {
575 		tp->rcv_nxt += ti->ti_len;
576 		flags = ti->ti_flags & TH_FIN;
577 		ti = (struct tcpiphdr *)ti->ti_next;
578 		if (ti == (struct tcpiphdr *)tp)
579 			break;
580 	}
581 
582 present:
583 	/*
584 	 * Present data to user.
585 	 */
586 	if (tp->t_state < ESTAB)
587 		return (flags);
588 	ti = tp->seg_next;
589 	while (ti != (struct tcpiphdr *)tp && ti->ti_seq < tp->rcv_nxt) {
590 		remque(ti);
591 		sbappend(&so->so_rcv, dtom(ti));
592 		tp->seqcnt -= ti->ti_len;
593 		if (tp->seqcnt < 0)
594 			panic("tcp_reass");
595 		ti = (struct tcpiphdr *)ti->ti_next;
596 	}
597 	sorwakeup(so);
598 	return (flags);
599 drop:
600 	m_freem(dtom(ti));
601 	return (flags);
602 }
603