1 /* $NetBSD: npf_state_tcp.c,v 1.11 2012/10/06 23:50:17 rmind Exp $ */ 2 3 /*- 4 * Copyright (c) 2010-2012 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This material is based upon work partially supported by The 8 * NetBSD Foundation under a contract with Mindaugas Rasiukevicius. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * NPF TCP state engine for connection tracking. 34 */ 35 36 #include <sys/cdefs.h> 37 __KERNEL_RCSID(0, "$NetBSD: npf_state_tcp.c,v 1.11 2012/10/06 23:50:17 rmind Exp $"); 38 39 #include <sys/param.h> 40 #include <sys/types.h> 41 42 #ifndef _KERNEL 43 #include <stdio.h> 44 #include <stdbool.h> 45 #include <inttypes.h> 46 #endif 47 #include <netinet/in.h> 48 #include <netinet/tcp.h> 49 #include <netinet/tcp_seq.h> 50 51 #include "npf_impl.h" 52 53 /* 54 * NPF TCP states. Note: these states are different from the TCP FSM 55 * states of RFC 793. The packet filter is a man-in-the-middle. 56 */ 57 #define NPF_TCPS_OK (-1) 58 #define NPF_TCPS_CLOSED 0 59 #define NPF_TCPS_SYN_SENT 1 60 #define NPF_TCPS_SIMSYN_SENT 2 61 #define NPF_TCPS_SYN_RECEIVED 3 62 #define NPF_TCPS_ESTABLISHED 4 63 #define NPF_TCPS_FIN_SENT 5 64 #define NPF_TCPS_FIN_RECEIVED 6 65 #define NPF_TCPS_CLOSE_WAIT 7 66 #define NPF_TCPS_FIN_WAIT 8 67 #define NPF_TCPS_CLOSING 9 68 #define NPF_TCPS_LAST_ACK 10 69 #define NPF_TCPS_TIME_WAIT 11 70 71 #define NPF_TCP_NSTATES 12 72 73 /* 74 * TCP connection timeout table (in seconds). 75 */ 76 static u_int npf_tcp_timeouts[] __read_mostly = { 77 /* Closed, timeout nearly immediately. */ 78 [NPF_TCPS_CLOSED] = 10, 79 /* Unsynchronised states. */ 80 [NPF_TCPS_SYN_SENT] = 30, 81 [NPF_TCPS_SIMSYN_SENT] = 30, 82 [NPF_TCPS_SYN_RECEIVED] = 60, 83 /* Established: 24 hours. */ 84 [NPF_TCPS_ESTABLISHED] = 60 * 60 * 24, 85 /* FIN seen: 4 minutes (2 * MSL). */ 86 [NPF_TCPS_FIN_SENT] = 60 * 2 * 2, 87 [NPF_TCPS_FIN_RECEIVED] = 60 * 2 * 2, 88 /* Half-closed cases: 6 hours. */ 89 [NPF_TCPS_CLOSE_WAIT] = 60 * 60 * 6, 90 [NPF_TCPS_FIN_WAIT] = 60 * 60 * 6, 91 /* Full close cases: 30 sec and 2 * MSL. */ 92 [NPF_TCPS_CLOSING] = 30, 93 [NPF_TCPS_LAST_ACK] = 30, 94 [NPF_TCPS_TIME_WAIT] = 60 * 2 * 2, 95 }; 96 97 static bool npf_strict_order_rst __read_mostly = false; 98 99 #define NPF_TCP_MAXACKWIN 66000 100 101 /* 102 * List of TCP flag cases and conversion of flags to a case (index). 103 */ 104 105 #define TCPFC_INVALID 0 106 #define TCPFC_SYN 1 107 #define TCPFC_SYNACK 2 108 #define TCPFC_ACK 3 109 #define TCPFC_FIN 4 110 #define TCPFC_COUNT 5 111 112 static inline u_int 113 npf_tcpfl2case(const int tcpfl) 114 { 115 u_int i, c; 116 117 CTASSERT(TH_FIN == 0x01); 118 CTASSERT(TH_SYN == 0x02); 119 CTASSERT(TH_ACK == 0x10); 120 121 /* 122 * Flags are shifted to use three least significant bits, thus each 123 * flag combination has a unique number ranging from 0 to 7, e.g. 124 * TH_SYN | TH_ACK has number 6, since (0x02 | (0x10 >> 2)) == 6. 125 * However, the requirement is to have number 0 for invalid cases, 126 * such as TH_SYN | TH_FIN, and to have the same number for TH_FIN 127 * and TH_FIN|TH_ACK cases. Thus, we generate a mask assigning 3 128 * bits for each number, which contains the actual case numbers: 129 * 130 * TCPFC_SYNACK << (6 << 2) == 0x2000000 (6 - SYN,ACK) 131 * TCPFC_FIN << (5 << 2) == 0x0400000 (5 - FIN,ACK) 132 * ... 133 * 134 * Hence, OR'ed mask value is 0x2430140. 135 */ 136 i = (tcpfl & (TH_SYN | TH_FIN)) | ((tcpfl & TH_ACK) >> 2); 137 c = (0x2430140 >> (i << 2)) & 7; 138 139 KASSERT(c < TCPFC_COUNT); 140 return c; 141 } 142 143 /* 144 * NPF transition table of a tracked TCP connection. 145 * 146 * There is a single state, which is changed in the following way: 147 * 148 * new_state = npf_tcp_fsm[old_state][direction][npf_tcpfl2case(tcp_flags)]; 149 * 150 * Note that this state is different from the state in each end (host). 151 */ 152 153 static const int npf_tcp_fsm[NPF_TCP_NSTATES][2][TCPFC_COUNT] = { 154 [NPF_TCPS_CLOSED] = { 155 [NPF_FLOW_FORW] = { 156 /* Handshake (1): initial SYN. */ 157 [TCPFC_SYN] = NPF_TCPS_SYN_SENT, 158 }, 159 }, 160 [NPF_TCPS_SYN_SENT] = { 161 [NPF_FLOW_FORW] = { 162 /* SYN may be retransmitted. */ 163 [TCPFC_SYN] = NPF_TCPS_OK, 164 }, 165 [NPF_FLOW_BACK] = { 166 /* Handshake (2): SYN-ACK is expected. */ 167 [TCPFC_SYNACK] = NPF_TCPS_SYN_RECEIVED, 168 /* Simultaneous initiation - SYN. */ 169 [TCPFC_SYN] = NPF_TCPS_SIMSYN_SENT, 170 }, 171 }, 172 [NPF_TCPS_SIMSYN_SENT] = { 173 [NPF_FLOW_FORW] = { 174 /* Original SYN re-transmission. */ 175 [TCPFC_SYN] = NPF_TCPS_OK, 176 /* SYN-ACK response to simultaneous SYN. */ 177 [TCPFC_SYNACK] = NPF_TCPS_SYN_RECEIVED, 178 }, 179 [NPF_FLOW_BACK] = { 180 /* Simultaneous SYN re-transmission.*/ 181 [TCPFC_SYN] = NPF_TCPS_OK, 182 /* SYN-ACK response to original SYN. */ 183 [TCPFC_SYNACK] = NPF_TCPS_SYN_RECEIVED, 184 /* FIN may occur early. */ 185 [TCPFC_FIN] = NPF_TCPS_FIN_RECEIVED, 186 }, 187 }, 188 [NPF_TCPS_SYN_RECEIVED] = { 189 [NPF_FLOW_FORW] = { 190 /* Handshake (3): ACK is expected. */ 191 [TCPFC_ACK] = NPF_TCPS_ESTABLISHED, 192 /* FIN may be sent early. */ 193 [TCPFC_FIN] = NPF_TCPS_FIN_SENT, 194 }, 195 [NPF_FLOW_BACK] = { 196 /* SYN-ACK may be retransmitted. */ 197 [TCPFC_SYNACK] = NPF_TCPS_OK, 198 /* XXX: ACK of late SYN in simultaneous case? */ 199 [TCPFC_ACK] = NPF_TCPS_OK, 200 /* FIN may occur early. */ 201 [TCPFC_FIN] = NPF_TCPS_FIN_RECEIVED, 202 }, 203 }, 204 [NPF_TCPS_ESTABLISHED] = { 205 /* 206 * Regular ACKs (data exchange) or FIN. 207 * FIN packets may have ACK set. 208 */ 209 [NPF_FLOW_FORW] = { 210 [TCPFC_ACK] = NPF_TCPS_OK, 211 /* FIN by the sender. */ 212 [TCPFC_FIN] = NPF_TCPS_FIN_SENT, 213 }, 214 [NPF_FLOW_BACK] = { 215 [TCPFC_ACK] = NPF_TCPS_OK, 216 /* FIN by the receiver. */ 217 [TCPFC_FIN] = NPF_TCPS_FIN_RECEIVED, 218 }, 219 }, 220 [NPF_TCPS_FIN_SENT] = { 221 [NPF_FLOW_FORW] = { 222 /* FIN may be re-transmitted. Late ACK as well. */ 223 [TCPFC_ACK] = NPF_TCPS_OK, 224 [TCPFC_FIN] = NPF_TCPS_OK, 225 }, 226 [NPF_FLOW_BACK] = { 227 /* If ACK, connection is half-closed now. */ 228 [TCPFC_ACK] = NPF_TCPS_FIN_WAIT, 229 /* FIN or FIN-ACK race - immediate closing. */ 230 [TCPFC_FIN] = NPF_TCPS_CLOSING, 231 }, 232 }, 233 [NPF_TCPS_FIN_RECEIVED] = { 234 /* 235 * FIN was received. Equivalent scenario to sent FIN. 236 */ 237 [NPF_FLOW_FORW] = { 238 [TCPFC_ACK] = NPF_TCPS_CLOSE_WAIT, 239 [TCPFC_FIN] = NPF_TCPS_CLOSING, 240 }, 241 [NPF_FLOW_BACK] = { 242 [TCPFC_ACK] = NPF_TCPS_OK, 243 [TCPFC_FIN] = NPF_TCPS_OK, 244 }, 245 }, 246 [NPF_TCPS_CLOSE_WAIT] = { 247 /* Sender has sent the FIN and closed its end. */ 248 [NPF_FLOW_FORW] = { 249 [TCPFC_ACK] = NPF_TCPS_OK, 250 [TCPFC_FIN] = NPF_TCPS_LAST_ACK, 251 }, 252 [NPF_FLOW_BACK] = { 253 [TCPFC_ACK] = NPF_TCPS_OK, 254 [TCPFC_FIN] = NPF_TCPS_LAST_ACK, 255 }, 256 }, 257 [NPF_TCPS_FIN_WAIT] = { 258 /* Receiver has closed its end. */ 259 [NPF_FLOW_FORW] = { 260 [TCPFC_ACK] = NPF_TCPS_OK, 261 [TCPFC_FIN] = NPF_TCPS_LAST_ACK, 262 }, 263 [NPF_FLOW_BACK] = { 264 [TCPFC_ACK] = NPF_TCPS_OK, 265 [TCPFC_FIN] = NPF_TCPS_LAST_ACK, 266 }, 267 }, 268 [NPF_TCPS_CLOSING] = { 269 /* Race of FINs - expecting ACK. */ 270 [NPF_FLOW_FORW] = { 271 [TCPFC_ACK] = NPF_TCPS_LAST_ACK, 272 }, 273 [NPF_FLOW_BACK] = { 274 [TCPFC_ACK] = NPF_TCPS_LAST_ACK, 275 }, 276 }, 277 [NPF_TCPS_LAST_ACK] = { 278 /* FINs exchanged - expecting last ACK. */ 279 [NPF_FLOW_FORW] = { 280 [TCPFC_ACK] = NPF_TCPS_TIME_WAIT, 281 }, 282 [NPF_FLOW_BACK] = { 283 [TCPFC_ACK] = NPF_TCPS_TIME_WAIT, 284 }, 285 }, 286 [NPF_TCPS_TIME_WAIT] = { 287 /* May re-open the connection as per RFC 1122. */ 288 [NPF_FLOW_FORW] = { 289 [TCPFC_SYN] = NPF_TCPS_SYN_SENT, 290 }, 291 }, 292 }; 293 294 /* 295 * npf_tcp_inwindow: determine whether the packet is in the TCP window 296 * and thus part of the connection we are tracking. 297 */ 298 static bool 299 npf_tcp_inwindow(const npf_cache_t *npc, nbuf_t *nbuf, npf_state_t *nst, 300 const int di) 301 { 302 const struct tcphdr * const th = &npc->npc_l4.tcp; 303 const int tcpfl = th->th_flags; 304 npf_tcpstate_t *fstate, *tstate; 305 int tcpdlen, ackskew; 306 tcp_seq seq, ack, end; 307 uint32_t win; 308 309 KASSERT(npf_iscached(npc, NPC_TCP)); 310 KASSERT(di == NPF_FLOW_FORW || di == NPF_FLOW_BACK); 311 312 /* 313 * Perform SEQ/ACK numbers check against boundaries. Reference: 314 * 315 * Rooij G., "Real stateful TCP packet filtering in IP Filter", 316 * 10th USENIX Security Symposium invited talk, Aug. 2001. 317 * 318 * There are four boundaries defined as following: 319 * I) SEQ + LEN <= MAX { SND.ACK + MAX(SND.WIN, 1) } 320 * II) SEQ >= MAX { SND.SEQ + SND.LEN - MAX(RCV.WIN, 1) } 321 * III) ACK <= MAX { RCV.SEQ + RCV.LEN } 322 * IV) ACK >= MAX { RCV.SEQ + RCV.LEN } - MAXACKWIN 323 * 324 * Let these members of npf_tcpstate_t be the maximum seen values of: 325 * nst_end - SEQ + LEN 326 * nst_maxend - ACK + MAX(WIN, 1) 327 * nst_maxwin - MAX(WIN, 1) 328 */ 329 330 tcpdlen = npf_tcpsaw(__UNCONST(npc), &seq, &ack, &win); 331 end = seq + tcpdlen; 332 if (tcpfl & TH_SYN) { 333 end++; 334 } 335 if (tcpfl & TH_FIN) { 336 end++; 337 } 338 339 fstate = &nst->nst_tcpst[di]; 340 tstate = &nst->nst_tcpst[!di]; 341 win = win ? (win << fstate->nst_wscale) : 1; 342 343 /* 344 * Initialise if the first packet. 345 * Note: only case when nst_maxwin is zero. 346 */ 347 if (__predict_false(fstate->nst_maxwin == 0)) { 348 /* 349 * Normally, it should be the first SYN or a re-transmission 350 * of SYN. The state of the other side will get set with a 351 * SYN-ACK reply (see below). 352 */ 353 fstate->nst_end = end; 354 fstate->nst_maxend = end; 355 fstate->nst_maxwin = win; 356 tstate->nst_end = 0; 357 tstate->nst_maxend = 0; 358 tstate->nst_maxwin = 1; 359 360 /* 361 * Handle TCP Window Scaling (RFC 1323). Both sides may 362 * send this option in their SYN packets. 363 */ 364 fstate->nst_wscale = 0; 365 (void)npf_fetch_tcpopts(npc, nbuf, NULL, &fstate->nst_wscale); 366 367 tstate->nst_wscale = 0; 368 369 /* Done. */ 370 return true; 371 } 372 if (fstate->nst_end == 0) { 373 /* 374 * Should be a SYN-ACK reply to SYN. If SYN is not set, 375 * then we are in the middle of connection and lost tracking. 376 */ 377 fstate->nst_end = end; 378 fstate->nst_maxend = end + 1; 379 fstate->nst_maxwin = win; 380 fstate->nst_wscale = 0; 381 382 /* Handle TCP Window Scaling (must be ignored if no SYN). */ 383 if (tcpfl & TH_SYN) { 384 (void)npf_fetch_tcpopts(npc, nbuf, NULL, 385 &fstate->nst_wscale); 386 } 387 } 388 389 if ((tcpfl & TH_ACK) == 0) { 390 /* Pretend that an ACK was sent. */ 391 ack = tstate->nst_end; 392 } else if ((tcpfl & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST) && ack == 0) { 393 /* Workaround for some TCP stacks. */ 394 ack = tstate->nst_end; 395 } 396 397 if (__predict_false(tcpfl & TH_RST)) { 398 /* RST to the initial SYN may have zero SEQ - fix it up. */ 399 if (seq == 0 && nst->nst_state == NPF_TCPS_SYN_SENT) { 400 end = fstate->nst_end; 401 seq = end; 402 } 403 404 /* Strict in-order sequence for RST packets. */ 405 if (npf_strict_order_rst && (fstate->nst_end - seq) > 1) { 406 return false; 407 } 408 } 409 410 /* 411 * Determine whether the data is within previously noted window, 412 * that is, upper boundary for valid data (I). 413 */ 414 if (!SEQ_LEQ(end, fstate->nst_maxend)) { 415 npf_stats_inc(NPF_STAT_INVALID_STATE_TCP1); 416 return false; 417 } 418 419 /* Lower boundary (II), which is no more than one window back. */ 420 if (!SEQ_GEQ(seq, fstate->nst_end - tstate->nst_maxwin)) { 421 npf_stats_inc(NPF_STAT_INVALID_STATE_TCP2); 422 return false; 423 } 424 425 /* 426 * Boundaries for valid acknowledgments (III, IV) - one predicted 427 * window up or down, since packets may be fragmented. 428 */ 429 ackskew = tstate->nst_end - ack; 430 if (ackskew < -NPF_TCP_MAXACKWIN || 431 ackskew > (NPF_TCP_MAXACKWIN << fstate->nst_wscale)) { 432 npf_stats_inc(NPF_STAT_INVALID_STATE_TCP3); 433 return false; 434 } 435 436 /* 437 * Packet has been passed. 438 * 439 * Negative ackskew might be due to fragmented packets. Since the 440 * total length of the packet is unknown - bump the boundary. 441 */ 442 443 if (ackskew < 0) { 444 tstate->nst_end = ack; 445 } 446 /* Keep track of the maximum window seen. */ 447 if (fstate->nst_maxwin < win) { 448 fstate->nst_maxwin = win; 449 } 450 if (SEQ_GT(end, fstate->nst_end)) { 451 fstate->nst_end = end; 452 } 453 /* Note the window for upper boundary. */ 454 if (SEQ_GEQ(ack + win, tstate->nst_maxend)) { 455 tstate->nst_maxend = ack + win; 456 } 457 return true; 458 } 459 460 /* 461 * npf_state_tcp: inspect TCP segment, determine whether it belongs to 462 * the connection and track its state. 463 */ 464 bool 465 npf_state_tcp(const npf_cache_t *npc, nbuf_t *nbuf, npf_state_t *nst, int di) 466 { 467 const struct tcphdr * const th = &npc->npc_l4.tcp; 468 const int tcpfl = th->th_flags, state = nst->nst_state; 469 int nstate; 470 471 KASSERT(nst->nst_state == 0 || mutex_owned(&nst->nst_lock)); 472 473 /* Look for a transition to a new state. */ 474 if (__predict_true((tcpfl & TH_RST) == 0)) { 475 const int flagcase = npf_tcpfl2case(tcpfl); 476 nstate = npf_tcp_fsm[state][di][flagcase]; 477 } else if (state == NPF_TCPS_TIME_WAIT) { 478 /* Prevent TIME-WAIT assassination (RFC 1337). */ 479 nstate = NPF_TCPS_OK; 480 } else { 481 nstate = NPF_TCPS_CLOSED; 482 } 483 484 /* Determine whether TCP packet really belongs to this connection. */ 485 if (!npf_tcp_inwindow(npc, nbuf, nst, di)) { 486 return false; 487 } 488 if (__predict_true(nstate == NPF_TCPS_OK)) { 489 return true; 490 } 491 492 nst->nst_state = nstate; 493 return true; 494 } 495 496 int 497 npf_state_tcp_timeout(const npf_state_t *nst) 498 { 499 const u_int state = nst->nst_state; 500 501 KASSERT(state < NPF_TCP_NSTATES); 502 return npf_tcp_timeouts[state]; 503 } 504