1 /* $NetBSD: npf_state_tcp.c,v 1.12 2012/12/24 19:05:45 rmind Exp $ */ 2 3 /*- 4 * Copyright (c) 2010-2012 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This material is based upon work partially supported by The 8 * NetBSD Foundation under a contract with Mindaugas Rasiukevicius. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * NPF TCP state engine for connection tracking. 34 */ 35 36 #include <sys/cdefs.h> 37 __KERNEL_RCSID(0, "$NetBSD: npf_state_tcp.c,v 1.12 2012/12/24 19:05:45 rmind Exp $"); 38 39 #include <sys/param.h> 40 #include <sys/types.h> 41 42 #ifndef _KERNEL 43 #include <stdio.h> 44 #include <stdbool.h> 45 #include <inttypes.h> 46 #endif 47 #include <netinet/in.h> 48 #include <netinet/tcp.h> 49 #include <netinet/tcp_seq.h> 50 51 #include "npf_impl.h" 52 53 /* 54 * NPF TCP states. Note: these states are different from the TCP FSM 55 * states of RFC 793. The packet filter is a man-in-the-middle. 56 */ 57 #define NPF_TCPS_OK (-1) 58 #define NPF_TCPS_CLOSED 0 59 #define NPF_TCPS_SYN_SENT 1 60 #define NPF_TCPS_SIMSYN_SENT 2 61 #define NPF_TCPS_SYN_RECEIVED 3 62 #define NPF_TCPS_ESTABLISHED 4 63 #define NPF_TCPS_FIN_SENT 5 64 #define NPF_TCPS_FIN_RECEIVED 6 65 #define NPF_TCPS_CLOSE_WAIT 7 66 #define NPF_TCPS_FIN_WAIT 8 67 #define NPF_TCPS_CLOSING 9 68 #define NPF_TCPS_LAST_ACK 10 69 #define NPF_TCPS_TIME_WAIT 11 70 71 #define NPF_TCP_NSTATES 12 72 73 /* 74 * TCP connection timeout table (in seconds). 75 */ 76 static u_int npf_tcp_timeouts[] __read_mostly = { 77 /* Closed, timeout nearly immediately. */ 78 [NPF_TCPS_CLOSED] = 10, 79 /* Unsynchronised states. */ 80 [NPF_TCPS_SYN_SENT] = 30, 81 [NPF_TCPS_SIMSYN_SENT] = 30, 82 [NPF_TCPS_SYN_RECEIVED] = 60, 83 /* Established: 24 hours. */ 84 [NPF_TCPS_ESTABLISHED] = 60 * 60 * 24, 85 /* FIN seen: 4 minutes (2 * MSL). */ 86 [NPF_TCPS_FIN_SENT] = 60 * 2 * 2, 87 [NPF_TCPS_FIN_RECEIVED] = 60 * 2 * 2, 88 /* Half-closed cases: 6 hours. */ 89 [NPF_TCPS_CLOSE_WAIT] = 60 * 60 * 6, 90 [NPF_TCPS_FIN_WAIT] = 60 * 60 * 6, 91 /* Full close cases: 30 sec and 2 * MSL. */ 92 [NPF_TCPS_CLOSING] = 30, 93 [NPF_TCPS_LAST_ACK] = 30, 94 [NPF_TCPS_TIME_WAIT] = 60 * 2 * 2, 95 }; 96 97 static bool npf_strict_order_rst __read_mostly = false; 98 99 #define NPF_TCP_MAXACKWIN 66000 100 101 /* 102 * List of TCP flag cases and conversion of flags to a case (index). 103 */ 104 105 #define TCPFC_INVALID 0 106 #define TCPFC_SYN 1 107 #define TCPFC_SYNACK 2 108 #define TCPFC_ACK 3 109 #define TCPFC_FIN 4 110 #define TCPFC_COUNT 5 111 112 static inline u_int 113 npf_tcpfl2case(const int tcpfl) 114 { 115 u_int i, c; 116 117 CTASSERT(TH_FIN == 0x01); 118 CTASSERT(TH_SYN == 0x02); 119 CTASSERT(TH_ACK == 0x10); 120 121 /* 122 * Flags are shifted to use three least significant bits, thus each 123 * flag combination has a unique number ranging from 0 to 7, e.g. 124 * TH_SYN | TH_ACK has number 6, since (0x02 | (0x10 >> 2)) == 6. 125 * However, the requirement is to have number 0 for invalid cases, 126 * such as TH_SYN | TH_FIN, and to have the same number for TH_FIN 127 * and TH_FIN|TH_ACK cases. Thus, we generate a mask assigning 3 128 * bits for each number, which contains the actual case numbers: 129 * 130 * TCPFC_SYNACK << (6 << 2) == 0x2000000 (6 - SYN,ACK) 131 * TCPFC_FIN << (5 << 2) == 0x0400000 (5 - FIN,ACK) 132 * ... 133 * 134 * Hence, OR'ed mask value is 0x2430140. 135 */ 136 i = (tcpfl & (TH_SYN | TH_FIN)) | ((tcpfl & TH_ACK) >> 2); 137 c = (0x2430140 >> (i << 2)) & 7; 138 139 KASSERT(c < TCPFC_COUNT); 140 return c; 141 } 142 143 /* 144 * NPF transition table of a tracked TCP connection. 145 * 146 * There is a single state, which is changed in the following way: 147 * 148 * new_state = npf_tcp_fsm[old_state][direction][npf_tcpfl2case(tcp_flags)]; 149 * 150 * Note that this state is different from the state in each end (host). 151 */ 152 153 static const int npf_tcp_fsm[NPF_TCP_NSTATES][2][TCPFC_COUNT] = { 154 [NPF_TCPS_CLOSED] = { 155 [NPF_FLOW_FORW] = { 156 /* Handshake (1): initial SYN. */ 157 [TCPFC_SYN] = NPF_TCPS_SYN_SENT, 158 }, 159 }, 160 [NPF_TCPS_SYN_SENT] = { 161 [NPF_FLOW_FORW] = { 162 /* SYN may be retransmitted. */ 163 [TCPFC_SYN] = NPF_TCPS_OK, 164 }, 165 [NPF_FLOW_BACK] = { 166 /* Handshake (2): SYN-ACK is expected. */ 167 [TCPFC_SYNACK] = NPF_TCPS_SYN_RECEIVED, 168 /* Simultaneous initiation - SYN. */ 169 [TCPFC_SYN] = NPF_TCPS_SIMSYN_SENT, 170 }, 171 }, 172 [NPF_TCPS_SIMSYN_SENT] = { 173 [NPF_FLOW_FORW] = { 174 /* Original SYN re-transmission. */ 175 [TCPFC_SYN] = NPF_TCPS_OK, 176 /* SYN-ACK response to simultaneous SYN. */ 177 [TCPFC_SYNACK] = NPF_TCPS_SYN_RECEIVED, 178 }, 179 [NPF_FLOW_BACK] = { 180 /* Simultaneous SYN re-transmission.*/ 181 [TCPFC_SYN] = NPF_TCPS_OK, 182 /* SYN-ACK response to original SYN. */ 183 [TCPFC_SYNACK] = NPF_TCPS_SYN_RECEIVED, 184 /* FIN may occur early. */ 185 [TCPFC_FIN] = NPF_TCPS_FIN_RECEIVED, 186 }, 187 }, 188 [NPF_TCPS_SYN_RECEIVED] = { 189 [NPF_FLOW_FORW] = { 190 /* Handshake (3): ACK is expected. */ 191 [TCPFC_ACK] = NPF_TCPS_ESTABLISHED, 192 /* FIN may be sent early. */ 193 [TCPFC_FIN] = NPF_TCPS_FIN_SENT, 194 }, 195 [NPF_FLOW_BACK] = { 196 /* SYN-ACK may be retransmitted. */ 197 [TCPFC_SYNACK] = NPF_TCPS_OK, 198 /* XXX: ACK of late SYN in simultaneous case? */ 199 [TCPFC_ACK] = NPF_TCPS_OK, 200 /* FIN may occur early. */ 201 [TCPFC_FIN] = NPF_TCPS_FIN_RECEIVED, 202 }, 203 }, 204 [NPF_TCPS_ESTABLISHED] = { 205 /* 206 * Regular ACKs (data exchange) or FIN. 207 * FIN packets may have ACK set. 208 */ 209 [NPF_FLOW_FORW] = { 210 [TCPFC_ACK] = NPF_TCPS_OK, 211 /* FIN by the sender. */ 212 [TCPFC_FIN] = NPF_TCPS_FIN_SENT, 213 }, 214 [NPF_FLOW_BACK] = { 215 [TCPFC_ACK] = NPF_TCPS_OK, 216 /* FIN by the receiver. */ 217 [TCPFC_FIN] = NPF_TCPS_FIN_RECEIVED, 218 }, 219 }, 220 [NPF_TCPS_FIN_SENT] = { 221 [NPF_FLOW_FORW] = { 222 /* FIN may be re-transmitted. Late ACK as well. */ 223 [TCPFC_ACK] = NPF_TCPS_OK, 224 [TCPFC_FIN] = NPF_TCPS_OK, 225 }, 226 [NPF_FLOW_BACK] = { 227 /* If ACK, connection is half-closed now. */ 228 [TCPFC_ACK] = NPF_TCPS_FIN_WAIT, 229 /* FIN or FIN-ACK race - immediate closing. */ 230 [TCPFC_FIN] = NPF_TCPS_CLOSING, 231 }, 232 }, 233 [NPF_TCPS_FIN_RECEIVED] = { 234 /* 235 * FIN was received. Equivalent scenario to sent FIN. 236 */ 237 [NPF_FLOW_FORW] = { 238 [TCPFC_ACK] = NPF_TCPS_CLOSE_WAIT, 239 [TCPFC_FIN] = NPF_TCPS_CLOSING, 240 }, 241 [NPF_FLOW_BACK] = { 242 [TCPFC_ACK] = NPF_TCPS_OK, 243 [TCPFC_FIN] = NPF_TCPS_OK, 244 }, 245 }, 246 [NPF_TCPS_CLOSE_WAIT] = { 247 /* Sender has sent the FIN and closed its end. */ 248 [NPF_FLOW_FORW] = { 249 [TCPFC_ACK] = NPF_TCPS_OK, 250 [TCPFC_FIN] = NPF_TCPS_LAST_ACK, 251 }, 252 [NPF_FLOW_BACK] = { 253 [TCPFC_ACK] = NPF_TCPS_OK, 254 [TCPFC_FIN] = NPF_TCPS_LAST_ACK, 255 }, 256 }, 257 [NPF_TCPS_FIN_WAIT] = { 258 /* Receiver has closed its end. */ 259 [NPF_FLOW_FORW] = { 260 [TCPFC_ACK] = NPF_TCPS_OK, 261 [TCPFC_FIN] = NPF_TCPS_LAST_ACK, 262 }, 263 [NPF_FLOW_BACK] = { 264 [TCPFC_ACK] = NPF_TCPS_OK, 265 [TCPFC_FIN] = NPF_TCPS_LAST_ACK, 266 }, 267 }, 268 [NPF_TCPS_CLOSING] = { 269 /* Race of FINs - expecting ACK. */ 270 [NPF_FLOW_FORW] = { 271 [TCPFC_ACK] = NPF_TCPS_LAST_ACK, 272 }, 273 [NPF_FLOW_BACK] = { 274 [TCPFC_ACK] = NPF_TCPS_LAST_ACK, 275 }, 276 }, 277 [NPF_TCPS_LAST_ACK] = { 278 /* FINs exchanged - expecting last ACK. */ 279 [NPF_FLOW_FORW] = { 280 [TCPFC_ACK] = NPF_TCPS_TIME_WAIT, 281 }, 282 [NPF_FLOW_BACK] = { 283 [TCPFC_ACK] = NPF_TCPS_TIME_WAIT, 284 }, 285 }, 286 [NPF_TCPS_TIME_WAIT] = { 287 /* May re-open the connection as per RFC 1122. */ 288 [NPF_FLOW_FORW] = { 289 [TCPFC_SYN] = NPF_TCPS_SYN_SENT, 290 }, 291 }, 292 }; 293 294 /* 295 * npf_tcp_inwindow: determine whether the packet is in the TCP window 296 * and thus part of the connection we are tracking. 297 */ 298 static bool 299 npf_tcp_inwindow(npf_cache_t *npc, nbuf_t *nbuf, npf_state_t *nst, const int di) 300 { 301 const struct tcphdr * const th = npc->npc_l4.tcp; 302 const int tcpfl = th->th_flags; 303 npf_tcpstate_t *fstate, *tstate; 304 int tcpdlen, ackskew; 305 tcp_seq seq, ack, end; 306 uint32_t win; 307 308 KASSERT(npf_iscached(npc, NPC_TCP)); 309 KASSERT(di == NPF_FLOW_FORW || di == NPF_FLOW_BACK); 310 311 /* 312 * Perform SEQ/ACK numbers check against boundaries. Reference: 313 * 314 * Rooij G., "Real stateful TCP packet filtering in IP Filter", 315 * 10th USENIX Security Symposium invited talk, Aug. 2001. 316 * 317 * There are four boundaries defined as following: 318 * I) SEQ + LEN <= MAX { SND.ACK + MAX(SND.WIN, 1) } 319 * II) SEQ >= MAX { SND.SEQ + SND.LEN - MAX(RCV.WIN, 1) } 320 * III) ACK <= MAX { RCV.SEQ + RCV.LEN } 321 * IV) ACK >= MAX { RCV.SEQ + RCV.LEN } - MAXACKWIN 322 * 323 * Let these members of npf_tcpstate_t be the maximum seen values of: 324 * nst_end - SEQ + LEN 325 * nst_maxend - ACK + MAX(WIN, 1) 326 * nst_maxwin - MAX(WIN, 1) 327 */ 328 329 tcpdlen = npf_tcpsaw(__UNCONST(npc), &seq, &ack, &win); 330 end = seq + tcpdlen; 331 if (tcpfl & TH_SYN) { 332 end++; 333 } 334 if (tcpfl & TH_FIN) { 335 end++; 336 } 337 338 fstate = &nst->nst_tcpst[di]; 339 tstate = &nst->nst_tcpst[!di]; 340 win = win ? (win << fstate->nst_wscale) : 1; 341 342 /* 343 * Initialise if the first packet. 344 * Note: only case when nst_maxwin is zero. 345 */ 346 if (__predict_false(fstate->nst_maxwin == 0)) { 347 /* 348 * Normally, it should be the first SYN or a re-transmission 349 * of SYN. The state of the other side will get set with a 350 * SYN-ACK reply (see below). 351 */ 352 fstate->nst_end = end; 353 fstate->nst_maxend = end; 354 fstate->nst_maxwin = win; 355 tstate->nst_end = 0; 356 tstate->nst_maxend = 0; 357 tstate->nst_maxwin = 1; 358 359 /* 360 * Handle TCP Window Scaling (RFC 1323). Both sides may 361 * send this option in their SYN packets. 362 */ 363 fstate->nst_wscale = 0; 364 (void)npf_fetch_tcpopts(npc, nbuf, NULL, &fstate->nst_wscale); 365 366 tstate->nst_wscale = 0; 367 368 /* Done. */ 369 return true; 370 } 371 if (fstate->nst_end == 0) { 372 /* 373 * Should be a SYN-ACK reply to SYN. If SYN is not set, 374 * then we are in the middle of connection and lost tracking. 375 */ 376 fstate->nst_end = end; 377 fstate->nst_maxend = end + 1; 378 fstate->nst_maxwin = win; 379 fstate->nst_wscale = 0; 380 381 /* Handle TCP Window Scaling (must be ignored if no SYN). */ 382 if (tcpfl & TH_SYN) { 383 (void)npf_fetch_tcpopts(npc, nbuf, NULL, 384 &fstate->nst_wscale); 385 } 386 } 387 388 if ((tcpfl & TH_ACK) == 0) { 389 /* Pretend that an ACK was sent. */ 390 ack = tstate->nst_end; 391 } else if ((tcpfl & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST) && ack == 0) { 392 /* Workaround for some TCP stacks. */ 393 ack = tstate->nst_end; 394 } 395 396 if (__predict_false(tcpfl & TH_RST)) { 397 /* RST to the initial SYN may have zero SEQ - fix it up. */ 398 if (seq == 0 && nst->nst_state == NPF_TCPS_SYN_SENT) { 399 end = fstate->nst_end; 400 seq = end; 401 } 402 403 /* Strict in-order sequence for RST packets. */ 404 if (npf_strict_order_rst && (fstate->nst_end - seq) > 1) { 405 return false; 406 } 407 } 408 409 /* 410 * Determine whether the data is within previously noted window, 411 * that is, upper boundary for valid data (I). 412 */ 413 if (!SEQ_LEQ(end, fstate->nst_maxend)) { 414 npf_stats_inc(NPF_STAT_INVALID_STATE_TCP1); 415 return false; 416 } 417 418 /* Lower boundary (II), which is no more than one window back. */ 419 if (!SEQ_GEQ(seq, fstate->nst_end - tstate->nst_maxwin)) { 420 npf_stats_inc(NPF_STAT_INVALID_STATE_TCP2); 421 return false; 422 } 423 424 /* 425 * Boundaries for valid acknowledgments (III, IV) - one predicted 426 * window up or down, since packets may be fragmented. 427 */ 428 ackskew = tstate->nst_end - ack; 429 if (ackskew < -NPF_TCP_MAXACKWIN || 430 ackskew > (NPF_TCP_MAXACKWIN << fstate->nst_wscale)) { 431 npf_stats_inc(NPF_STAT_INVALID_STATE_TCP3); 432 return false; 433 } 434 435 /* 436 * Packet has been passed. 437 * 438 * Negative ackskew might be due to fragmented packets. Since the 439 * total length of the packet is unknown - bump the boundary. 440 */ 441 442 if (ackskew < 0) { 443 tstate->nst_end = ack; 444 } 445 /* Keep track of the maximum window seen. */ 446 if (fstate->nst_maxwin < win) { 447 fstate->nst_maxwin = win; 448 } 449 if (SEQ_GT(end, fstate->nst_end)) { 450 fstate->nst_end = end; 451 } 452 /* Note the window for upper boundary. */ 453 if (SEQ_GEQ(ack + win, tstate->nst_maxend)) { 454 tstate->nst_maxend = ack + win; 455 } 456 return true; 457 } 458 459 /* 460 * npf_state_tcp: inspect TCP segment, determine whether it belongs to 461 * the connection and track its state. 462 */ 463 bool 464 npf_state_tcp(npf_cache_t *npc, nbuf_t *nbuf, npf_state_t *nst, int di) 465 { 466 const struct tcphdr * const th = npc->npc_l4.tcp; 467 const int tcpfl = th->th_flags, state = nst->nst_state; 468 int nstate; 469 470 KASSERT(nst->nst_state == 0 || mutex_owned(&nst->nst_lock)); 471 472 /* Look for a transition to a new state. */ 473 if (__predict_true((tcpfl & TH_RST) == 0)) { 474 const int flagcase = npf_tcpfl2case(tcpfl); 475 nstate = npf_tcp_fsm[state][di][flagcase]; 476 } else if (state == NPF_TCPS_TIME_WAIT) { 477 /* Prevent TIME-WAIT assassination (RFC 1337). */ 478 nstate = NPF_TCPS_OK; 479 } else { 480 nstate = NPF_TCPS_CLOSED; 481 } 482 483 /* Determine whether TCP packet really belongs to this connection. */ 484 if (!npf_tcp_inwindow(npc, nbuf, nst, di)) { 485 return false; 486 } 487 if (__predict_true(nstate == NPF_TCPS_OK)) { 488 return true; 489 } 490 491 nst->nst_state = nstate; 492 return true; 493 } 494 495 int 496 npf_state_tcp_timeout(const npf_state_t *nst) 497 { 498 const u_int state = nst->nst_state; 499 500 KASSERT(state < NPF_TCP_NSTATES); 501 return npf_tcp_timeouts[state]; 502 } 503