1 /* $NetBSD: npf_state_tcp.c,v 1.3 2011/12/08 23:36:57 rmind Exp $ */ 2 3 /*- 4 * Copyright (c) 2010-2011 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This material is based upon work partially supported by The 8 * NetBSD Foundation under a contract with Mindaugas Rasiukevicius. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * NPF TCP state engine for connection tracking. 34 */ 35 36 #include <sys/cdefs.h> 37 __KERNEL_RCSID(0, "$NetBSD: npf_state_tcp.c,v 1.3 2011/12/08 23:36:57 rmind Exp $"); 38 39 #include <sys/param.h> 40 #include <sys/types.h> 41 42 #ifndef _KERNEL 43 #include <stdio.h> 44 #include <stdbool.h> 45 #include <inttypes.h> 46 #endif 47 #include <netinet/in.h> 48 #include <netinet/tcp.h> 49 #include <netinet/tcp_seq.h> 50 51 #include "npf_impl.h" 52 53 #if defined(_NPF_TESTING) 54 void npf_state_sample(npf_state_t *); 55 #define NPF_TCP_STATE_SAMPLE(nst) npf_state_sample(nst) 56 #else 57 #define NPF_TCP_STATE_SAMPLE(nst) 58 #endif 59 60 /* 61 * NPF TCP states. Note: these states are different from the TCP FSM 62 * states of RFC 793. Mind that packet filter is a man-in-the-middle. 63 */ 64 #define NPF_TCPS_OK (-1) 65 #define NPF_TCPS_CLOSED 0 66 #define NPF_TCPS_SYN_SENT 1 67 #define NPF_TCPS_SIMSYN_SENT 2 68 #define NPF_TCPS_SYN_RECEIVED 3 69 #define NPF_TCPS_ESTABLISHED 4 70 #define NPF_TCPS_FIN_SEEN 5 71 #define NPF_TCPS_CLOSE_WAIT 6 72 #define NPF_TCPS_FIN_WAIT 7 73 #define NPF_TCPS_CLOSING 8 74 #define NPF_TCPS_LAST_ACK 9 75 #define NPF_TCPS_TIME_WAIT 10 76 77 #define NPF_TCP_NSTATES 11 78 79 /* 80 * TCP connection timeout table (in seconds). 81 */ 82 static u_int npf_tcp_timeouts[] __read_mostly = { 83 /* Closed, timeout nearly immediately. */ 84 [NPF_TCPS_CLOSED] = 10, 85 /* Unsynchronised states. */ 86 [NPF_TCPS_SYN_SENT] = 30, 87 [NPF_TCPS_SIMSYN_SENT] = 30, 88 [NPF_TCPS_SYN_RECEIVED] = 60, 89 /* Established, timeout: 24 hours. */ 90 [NPF_TCPS_ESTABLISHED] = 60 * 60 * 24, 91 /* Closure cases, timeout: 4 minutes (2 * MSL). */ 92 [NPF_TCPS_FIN_SEEN] = 60 * 2 * 2, 93 [NPF_TCPS_CLOSE_WAIT] = 60 * 2 * 2, 94 [NPF_TCPS_FIN_WAIT] = 60 * 2 * 2, 95 [NPF_TCPS_CLOSING] = 30, 96 [NPF_TCPS_LAST_ACK] = 30, 97 [NPF_TCPS_TIME_WAIT] = 60 * 2 * 2, 98 }; 99 100 #define NPF_TCP_MAXACKWIN 66000 101 102 /* 103 * List of TCP flag cases and conversion of flags to a case (index). 104 */ 105 106 #define TCPFC_INVALID 0 107 #define TCPFC_SYN 1 108 #define TCPFC_SYNACK 2 109 #define TCPFC_ACK 3 110 #define TCPFC_FIN 4 111 #define TCPFC_COUNT 5 112 113 static inline u_int 114 npf_tcpfl2case(const int tcpfl) 115 { 116 u_int i, c; 117 118 CTASSERT(TH_FIN == 0x01); 119 CTASSERT(TH_SYN == 0x02); 120 CTASSERT(TH_ACK == 0x10); 121 122 /* 123 * Flags are shifted to use three least significant bits, thus each 124 * flag combination has a unique number ranging from 0 to 7, e.g. 125 * TH_SYN | TH_ACK has number 6, since (0x02 | (0x10 >> 2)) == 6. 126 * However, the requirement is to have number 0 for invalid cases, 127 * such as TH_SYN | TH_FIN, and to have the same number for TH_FIN 128 * and TH_FIN|TH_ACK cases. Thus, we generate a mask assigning 3 129 * bits for each number, which contains the actual case numbers: 130 * 131 * TCPFC_SYNACK << (6 << 2) == 0x2000000 (6 - SYN,ACK) 132 * TCPFC_FIN << (5 << 2) == 0x0400000 (5 - FIN,ACK) 133 * ... 134 * 135 * Hence, OR'ed mask value is 0x2430140. 136 */ 137 i = (tcpfl & (TH_SYN | TH_FIN)) | ((tcpfl & TH_ACK) >> 2); 138 c = (0x2430140 >> (i << 2)) & 7; 139 140 KASSERT(c < TCPFC_COUNT); 141 return c; 142 } 143 144 /* 145 * NPF transition table of a tracked TCP connection. 146 * 147 * There is a single state, which is changed in the following way: 148 * 149 * new_state = npf_tcp_fsm[old_state][direction][npf_tcpfl2case(tcp_flags)]; 150 * 151 * Note that this state is different from the state in each end (host). 152 */ 153 154 static const int npf_tcp_fsm[NPF_TCP_NSTATES][2][TCPFC_COUNT] = { 155 [NPF_TCPS_CLOSED] = { 156 [NPF_FLOW_FORW] = { 157 /* Handshake (1): initial SYN. */ 158 [TCPFC_SYN] = NPF_TCPS_SYN_SENT, 159 }, 160 }, 161 [NPF_TCPS_SYN_SENT] = { 162 [NPF_FLOW_FORW] = { 163 /* SYN may be retransmitted. */ 164 [TCPFC_SYN] = NPF_TCPS_OK, 165 }, 166 [NPF_FLOW_BACK] = { 167 /* Handshake (2): SYN-ACK is expected. */ 168 [TCPFC_SYNACK] = NPF_TCPS_SYN_RECEIVED, 169 /* Simultaneous initiation - SYN. */ 170 [TCPFC_SYN] = NPF_TCPS_SIMSYN_SENT, 171 }, 172 }, 173 [NPF_TCPS_SIMSYN_SENT] = { 174 [NPF_FLOW_FORW] = { 175 /* Original SYN re-transmission. */ 176 [TCPFC_SYN] = NPF_TCPS_OK, 177 /* SYN-ACK response to simultaneous SYN. */ 178 [TCPFC_SYNACK] = NPF_TCPS_SYN_RECEIVED, 179 }, 180 [NPF_FLOW_BACK] = { 181 /* Simultaneous SYN re-transmission.*/ 182 [TCPFC_SYN] = NPF_TCPS_OK, 183 /* SYN-ACK response to original SYN. */ 184 [TCPFC_SYNACK] = NPF_TCPS_SYN_RECEIVED, 185 /* FIN may be sent early. */ 186 [TCPFC_FIN] = NPF_TCPS_FIN_SEEN, 187 }, 188 }, 189 [NPF_TCPS_SYN_RECEIVED] = { 190 [NPF_FLOW_FORW] = { 191 /* Handshake (3): ACK is expected. */ 192 [TCPFC_ACK] = NPF_TCPS_ESTABLISHED, 193 /* FIN may be sent early. */ 194 [TCPFC_FIN] = NPF_TCPS_FIN_SEEN, 195 }, 196 [NPF_FLOW_BACK] = { 197 /* SYN-ACK may be retransmitted. */ 198 [TCPFC_SYNACK] = NPF_TCPS_OK, 199 /* XXX: ACK of late SYN in simultaneous case? */ 200 [TCPFC_ACK] = NPF_TCPS_OK, 201 /* FIN may be sent early. */ 202 [TCPFC_FIN] = NPF_TCPS_FIN_SEEN, 203 }, 204 }, 205 [NPF_TCPS_ESTABLISHED] = { 206 /* 207 * Regular ACKs (data exchange) or FIN. 208 * FIN packets may have ACK set. 209 */ 210 [NPF_FLOW_FORW] = { 211 [TCPFC_ACK] = NPF_TCPS_OK, 212 /* FIN by the sender. */ 213 [TCPFC_FIN] = NPF_TCPS_FIN_SEEN, 214 }, 215 [NPF_FLOW_BACK] = { 216 [TCPFC_ACK] = NPF_TCPS_OK, 217 /* FIN by the receiver. */ 218 [TCPFC_FIN] = NPF_TCPS_FIN_SEEN, 219 }, 220 }, 221 [NPF_TCPS_FIN_SEEN] = { 222 /* 223 * FIN was seen. If ACK only, connection is half-closed now, 224 * need to determine which end is closed (sender or receiver). 225 * However, both FIN and FIN-ACK may race here - in which 226 * case we are closing immediately. 227 */ 228 [NPF_FLOW_FORW] = { 229 [TCPFC_ACK] = NPF_TCPS_CLOSE_WAIT, 230 [TCPFC_FIN] = NPF_TCPS_CLOSING, 231 }, 232 [NPF_FLOW_BACK] = { 233 [TCPFC_ACK] = NPF_TCPS_FIN_WAIT, 234 [TCPFC_FIN] = NPF_TCPS_CLOSING, 235 }, 236 }, 237 [NPF_TCPS_CLOSE_WAIT] = { 238 /* Sender has sent the FIN and closed its end. */ 239 [NPF_FLOW_FORW] = { 240 [TCPFC_ACK] = NPF_TCPS_OK, 241 [TCPFC_FIN] = NPF_TCPS_LAST_ACK, 242 }, 243 [NPF_FLOW_BACK] = { 244 [TCPFC_ACK] = NPF_TCPS_OK, 245 [TCPFC_FIN] = NPF_TCPS_LAST_ACK, 246 }, 247 }, 248 [NPF_TCPS_FIN_WAIT] = { 249 /* Receiver has closed its end. */ 250 [NPF_FLOW_FORW] = { 251 [TCPFC_ACK] = NPF_TCPS_OK, 252 [TCPFC_FIN] = NPF_TCPS_LAST_ACK, 253 }, 254 [NPF_FLOW_BACK] = { 255 [TCPFC_ACK] = NPF_TCPS_OK, 256 [TCPFC_FIN] = NPF_TCPS_LAST_ACK, 257 }, 258 }, 259 [NPF_TCPS_CLOSING] = { 260 /* Race of FINs - expecting ACK. */ 261 [NPF_FLOW_FORW] = { 262 [TCPFC_ACK] = NPF_TCPS_LAST_ACK, 263 }, 264 [NPF_FLOW_BACK] = { 265 [TCPFC_ACK] = NPF_TCPS_LAST_ACK, 266 }, 267 }, 268 [NPF_TCPS_LAST_ACK] = { 269 /* FINs exchanged - expecting last ACK. */ 270 [NPF_FLOW_FORW] = { 271 [TCPFC_ACK] = NPF_TCPS_TIME_WAIT, 272 }, 273 [NPF_FLOW_BACK] = { 274 [TCPFC_ACK] = NPF_TCPS_TIME_WAIT, 275 }, 276 }, 277 [NPF_TCPS_TIME_WAIT] = { 278 /* May re-open the connection as per RFC 1122. */ 279 [NPF_FLOW_FORW] = { 280 [TCPFC_SYN] = NPF_TCPS_SYN_SENT, 281 }, 282 }, 283 }; 284 285 /* 286 * npf_tcp_inwindow: determine whether the packet is in the TCP window 287 * and thus part of the connection we are tracking. 288 */ 289 static bool 290 npf_tcp_inwindow(const npf_cache_t *npc, nbuf_t *nbuf, npf_state_t *nst, 291 const int di) 292 { 293 const struct tcphdr * const th = &npc->npc_l4.tcp; 294 const int tcpfl = th->th_flags; 295 npf_tcpstate_t *fstate, *tstate; 296 int tcpdlen, wscale, ackskew; 297 tcp_seq seq, ack, end; 298 uint32_t win; 299 300 KASSERT(npf_iscached(npc, NPC_TCP)); 301 KASSERT(di == NPF_FLOW_FORW || di == NPF_FLOW_BACK); 302 303 /* 304 * Perform SEQ/ACK numbers check against boundaries. Reference: 305 * 306 * Rooij G., "Real stateful TCP packet filtering in IP Filter", 307 * 10th USENIX Security Symposium invited talk, Aug. 2001. 308 * 309 * There are four boundaries defined as following: 310 * I) SEQ + LEN <= MAX { SND.ACK + MAX(SND.WIN, 1) } 311 * II) SEQ >= MAX { SND.SEQ + SND.LEN - MAX(RCV.WIN, 1) } 312 * III) ACK <= MAX { RCV.SEQ + RCV.LEN } 313 * IV) ACK >= MAX { RCV.SEQ + RCV.LEN } - MAXACKWIN 314 * 315 * Let these members of npf_tcpstate_t be the maximum seen values of: 316 * nst_end - SEQ + LEN 317 * nst_maxend - ACK + MAX(WIN, 1) 318 * nst_maxwin - MAX(WIN, 1) 319 */ 320 321 tcpdlen = npf_tcpsaw(__UNCONST(npc), &seq, &ack, &win); 322 end = seq + tcpdlen; 323 if (tcpfl & TH_SYN) { 324 end++; 325 } 326 if (tcpfl & TH_FIN) { 327 end++; 328 } 329 330 fstate = &nst->nst_tcpst[di]; 331 tstate = &nst->nst_tcpst[!di]; 332 win = win ? (win << fstate->nst_wscale) : 1; 333 334 /* 335 * Initialise if the first packet. 336 * Note: only case when nst_maxwin is zero. 337 */ 338 if (__predict_false(fstate->nst_maxwin == 0)) { 339 /* 340 * Should be first SYN or re-transmission of SYN. State of 341 * other side will get set with a SYN-ACK reply (see below). 342 */ 343 fstate->nst_end = end; 344 fstate->nst_maxend = end; 345 fstate->nst_maxwin = win; 346 tstate->nst_end = 0; 347 tstate->nst_maxend = 0; 348 tstate->nst_maxwin = 1; 349 350 /* 351 * Handle TCP Window Scaling (RFC 1323). Both sides may 352 * send this option in their SYN packets. 353 */ 354 if (npf_fetch_tcpopts(npc, nbuf, NULL, &wscale)) { 355 fstate->nst_wscale = wscale; 356 } else { 357 fstate->nst_wscale = 0; 358 } 359 tstate->nst_wscale = 0; 360 361 /* Done. */ 362 return true; 363 } 364 if (fstate->nst_end == 0) { 365 /* 366 * Should be a SYN-ACK reply to SYN. If SYN is not set, 367 * then we are in the middle of connection and lost tracking. 368 */ 369 fstate->nst_end = end; 370 fstate->nst_maxend = end + 1; 371 fstate->nst_maxwin = win; 372 373 /* Handle TCP Window Scaling (must be ignored if no SYN). */ 374 if (tcpfl & TH_SYN) { 375 fstate->nst_wscale = 376 npf_fetch_tcpopts(npc, nbuf, NULL, &wscale) ? 377 wscale : 0; 378 } 379 } 380 if ((tcpfl & TH_ACK) == 0) { 381 /* Pretend that an ACK was sent. */ 382 ack = tstate->nst_end; 383 } else if ((tcpfl & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST) && ack == 0) { 384 /* Workaround for some TCP stacks. */ 385 ack = tstate->nst_end; 386 } 387 if (seq == end) { 388 /* If packet contains no data - assume it is valid. */ 389 end = fstate->nst_end; 390 seq = end; 391 } 392 393 NPF_TCP_STATE_SAMPLE(nst); 394 #if 0 395 /* Strict in-order sequence for RST packets. */ 396 if (((tcpfl & TH_RST) != 0) && (fstate->nst_end - seq) > 1) { 397 return false; 398 } 399 #endif 400 /* 401 * Determine whether the data is within previously noted window, 402 * that is, upper boundary for valid data (I). 403 */ 404 if (!SEQ_LEQ(end, fstate->nst_maxend)) { 405 npf_stats_inc(NPF_STAT_INVALID_STATE_TCP1); 406 return false; 407 } 408 409 /* Lower boundary (II), which is no more than one window back. */ 410 if (!SEQ_GEQ(seq, fstate->nst_end - tstate->nst_maxwin)) { 411 npf_stats_inc(NPF_STAT_INVALID_STATE_TCP2); 412 return false; 413 } 414 415 /* 416 * Boundaries for valid acknowledgments (III, IV) - on predicted 417 * window up or down, since packets may be fragmented. 418 */ 419 ackskew = tstate->nst_end - ack; 420 if (ackskew < -NPF_TCP_MAXACKWIN || 421 ackskew > (NPF_TCP_MAXACKWIN << fstate->nst_wscale)) { 422 npf_stats_inc(NPF_STAT_INVALID_STATE_TCP3); 423 return false; 424 } 425 426 /* 427 * Packet has been passed. 428 * 429 * Negative ackskew might be due to fragmented packets. Since the 430 * total length of the packet is unknown - bump the boundary. 431 */ 432 if (ackskew < 0) { 433 tstate->nst_end = end; 434 } 435 /* Keep track of the maximum window seen. */ 436 if (fstate->nst_maxwin < win) { 437 fstate->nst_maxwin = win; 438 } 439 if (SEQ_GT(end, fstate->nst_end)) { 440 fstate->nst_end = end; 441 } 442 /* Note the window for upper boundary. */ 443 if (SEQ_GEQ(ack + win, tstate->nst_maxend)) { 444 tstate->nst_maxend = ack + win; 445 } 446 return true; 447 } 448 449 bool 450 npf_state_tcp(const npf_cache_t *npc, nbuf_t *nbuf, npf_state_t *nst, int di) 451 { 452 const struct tcphdr * const th = &npc->npc_l4.tcp; 453 const int tcpfl = th->th_flags, state = nst->nst_state; 454 int nstate; 455 456 /* Look for a transition to a new state. */ 457 if (__predict_true((tcpfl & TH_RST) == 0)) { 458 const int flagcase = npf_tcpfl2case(tcpfl); 459 nstate = npf_tcp_fsm[state][di][flagcase]; 460 } else if (state == NPF_TCPS_TIME_WAIT) { 461 /* Prevent TIME-WAIT assassination (RFC 1337). */ 462 nstate = NPF_TCPS_OK; 463 } else { 464 nstate = NPF_TCPS_CLOSED; 465 } 466 /* Determine whether TCP packet really belongs to this connection. */ 467 if (!npf_tcp_inwindow(npc, nbuf, nst, di)) { 468 return false; 469 } 470 if (__predict_true(nstate == NPF_TCPS_OK)) { 471 return true; 472 } 473 nst->nst_state = nstate; 474 return true; 475 } 476 477 int 478 npf_state_tcp_timeout(const npf_state_t *nst) 479 { 480 const u_int state = nst->nst_state; 481 482 KASSERT(state < NPF_TCP_NSTATES); 483 return npf_tcp_timeouts[state]; 484 } 485