1 /*- 2 * Copyright (c) 2010-2012 The NetBSD Foundation, Inc. 3 * All rights reserved. 4 * 5 * This material is based upon work partially supported by The 6 * NetBSD Foundation under a contract with Mindaugas Rasiukevicius. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 18 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 19 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 20 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 21 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 27 * POSSIBILITY OF SUCH DAMAGE. 28 */ 29 30 /* 31 * NPF TCP state engine for connection tracking. 32 */ 33 34 #ifdef _KERNEL 35 #include <sys/cdefs.h> 36 __KERNEL_RCSID(0, "$NetBSD: npf_state_tcp.c,v 1.19 2018/09/29 14:41:36 rmind Exp $"); 37 38 #include <sys/param.h> 39 #include <sys/types.h> 40 41 #include <netinet/in.h> 42 #include <netinet/tcp.h> 43 #endif 44 45 #include "npf_impl.h" 46 47 /* 48 * NPF TCP states. Note: these states are different from the TCP FSM 49 * states of RFC 793. The packet filter is a man-in-the-middle. 50 */ 51 #define NPF_TCPS_OK 255 52 #define NPF_TCPS_CLOSED 0 53 #define NPF_TCPS_SYN_SENT 1 54 #define NPF_TCPS_SIMSYN_SENT 2 55 #define NPF_TCPS_SYN_RECEIVED 3 56 #define NPF_TCPS_ESTABLISHED 4 57 #define NPF_TCPS_FIN_SENT 5 58 #define NPF_TCPS_FIN_RECEIVED 6 59 #define NPF_TCPS_CLOSE_WAIT 7 60 #define NPF_TCPS_FIN_WAIT 8 61 #define NPF_TCPS_CLOSING 9 62 #define NPF_TCPS_LAST_ACK 10 63 #define NPF_TCPS_TIME_WAIT 11 64 65 #define NPF_TCP_NSTATES 12 66 67 /* 68 * TCP connection timeout table (in seconds). 69 */ 70 static u_int npf_tcp_timeouts[] __read_mostly = { 71 /* Closed, timeout nearly immediately. */ 72 [NPF_TCPS_CLOSED] = 10, 73 /* Unsynchronised states. */ 74 [NPF_TCPS_SYN_SENT] = 30, 75 [NPF_TCPS_SIMSYN_SENT] = 30, 76 [NPF_TCPS_SYN_RECEIVED] = 60, 77 /* Established: 24 hours. */ 78 [NPF_TCPS_ESTABLISHED] = 60 * 60 * 24, 79 /* FIN seen: 4 minutes (2 * MSL). */ 80 [NPF_TCPS_FIN_SENT] = 60 * 2 * 2, 81 [NPF_TCPS_FIN_RECEIVED] = 60 * 2 * 2, 82 /* Half-closed cases: 6 hours. */ 83 [NPF_TCPS_CLOSE_WAIT] = 60 * 60 * 6, 84 [NPF_TCPS_FIN_WAIT] = 60 * 60 * 6, 85 /* Full close cases: 30 sec and 2 * MSL. */ 86 [NPF_TCPS_CLOSING] = 30, 87 [NPF_TCPS_LAST_ACK] = 30, 88 [NPF_TCPS_TIME_WAIT] = 60 * 2 * 2, 89 }; 90 91 static bool npf_strict_order_rst __read_mostly = true; 92 93 #define NPF_TCP_MAXACKWIN 66000 94 95 #define SEQ_LT(a,b) ((int)((a)-(b)) < 0) 96 #define SEQ_LEQ(a,b) ((int)((a)-(b)) <= 0) 97 #define SEQ_GT(a,b) ((int)((a)-(b)) > 0) 98 #define SEQ_GEQ(a,b) ((int)((a)-(b)) >= 0) 99 100 /* 101 * List of TCP flag cases and conversion of flags to a case (index). 102 */ 103 104 #define TCPFC_INVALID 0 105 #define TCPFC_SYN 1 106 #define TCPFC_SYNACK 2 107 #define TCPFC_ACK 3 108 #define TCPFC_FIN 4 109 #define TCPFC_COUNT 5 110 111 static inline u_int 112 npf_tcpfl2case(const u_int tcpfl) 113 { 114 u_int i, c; 115 116 CTASSERT(TH_FIN == 0x01); 117 CTASSERT(TH_SYN == 0x02); 118 CTASSERT(TH_ACK == 0x10); 119 120 /* 121 * Flags are shifted to use three least significant bits, thus each 122 * flag combination has a unique number ranging from 0 to 7, e.g. 123 * TH_SYN | TH_ACK has number 6, since (0x02 | (0x10 >> 2)) == 6. 124 * However, the requirement is to have number 0 for invalid cases, 125 * such as TH_SYN | TH_FIN, and to have the same number for TH_FIN 126 * and TH_FIN|TH_ACK cases. Thus, we generate a mask assigning 3 127 * bits for each number, which contains the actual case numbers: 128 * 129 * TCPFC_SYNACK << (6 << 2) == 0x2000000 (6 - SYN,ACK) 130 * TCPFC_FIN << (5 << 2) == 0x0400000 (5 - FIN,ACK) 131 * ... 132 * 133 * Hence, OR'ed mask value is 0x2430140. 134 */ 135 i = (tcpfl & (TH_SYN | TH_FIN)) | ((tcpfl & TH_ACK) >> 2); 136 c = (0x2430140 >> (i << 2)) & 7; 137 138 KASSERT(c < TCPFC_COUNT); 139 return c; 140 } 141 142 /* 143 * NPF transition table of a tracked TCP connection. 144 * 145 * There is a single state, which is changed in the following way: 146 * 147 * new_state = npf_tcp_fsm[old_state][direction][npf_tcpfl2case(tcp_flags)]; 148 * 149 * Note that this state is different from the state in each end (host). 150 */ 151 152 static const uint8_t npf_tcp_fsm[NPF_TCP_NSTATES][2][TCPFC_COUNT] = { 153 [NPF_TCPS_CLOSED] = { 154 [NPF_FLOW_FORW] = { 155 /* Handshake (1): initial SYN. */ 156 [TCPFC_SYN] = NPF_TCPS_SYN_SENT, 157 }, 158 }, 159 [NPF_TCPS_SYN_SENT] = { 160 [NPF_FLOW_FORW] = { 161 /* SYN may be retransmitted. */ 162 [TCPFC_SYN] = NPF_TCPS_OK, 163 }, 164 [NPF_FLOW_BACK] = { 165 /* Handshake (2): SYN-ACK is expected. */ 166 [TCPFC_SYNACK] = NPF_TCPS_SYN_RECEIVED, 167 /* Simultaneous initiation - SYN. */ 168 [TCPFC_SYN] = NPF_TCPS_SIMSYN_SENT, 169 }, 170 }, 171 [NPF_TCPS_SIMSYN_SENT] = { 172 [NPF_FLOW_FORW] = { 173 /* Original SYN re-transmission. */ 174 [TCPFC_SYN] = NPF_TCPS_OK, 175 /* SYN-ACK response to simultaneous SYN. */ 176 [TCPFC_SYNACK] = NPF_TCPS_SYN_RECEIVED, 177 }, 178 [NPF_FLOW_BACK] = { 179 /* Simultaneous SYN re-transmission.*/ 180 [TCPFC_SYN] = NPF_TCPS_OK, 181 /* SYN-ACK response to original SYN. */ 182 [TCPFC_SYNACK] = NPF_TCPS_SYN_RECEIVED, 183 /* FIN may occur early. */ 184 [TCPFC_FIN] = NPF_TCPS_FIN_RECEIVED, 185 }, 186 }, 187 [NPF_TCPS_SYN_RECEIVED] = { 188 [NPF_FLOW_FORW] = { 189 /* Handshake (3): ACK is expected. */ 190 [TCPFC_ACK] = NPF_TCPS_ESTABLISHED, 191 /* FIN may be sent early. */ 192 [TCPFC_FIN] = NPF_TCPS_FIN_SENT, 193 /* Late SYN re-transmission. */ 194 [TCPFC_SYN] = NPF_TCPS_OK, 195 }, 196 [NPF_FLOW_BACK] = { 197 /* SYN-ACK may be retransmitted. */ 198 [TCPFC_SYNACK] = NPF_TCPS_OK, 199 /* XXX: ACK of late SYN in simultaneous case? */ 200 [TCPFC_ACK] = NPF_TCPS_OK, 201 /* FIN may occur early. */ 202 [TCPFC_FIN] = NPF_TCPS_FIN_RECEIVED, 203 }, 204 }, 205 [NPF_TCPS_ESTABLISHED] = { 206 /* 207 * Regular ACKs (data exchange) or FIN. 208 * FIN packets may have ACK set. 209 */ 210 [NPF_FLOW_FORW] = { 211 [TCPFC_ACK] = NPF_TCPS_OK, 212 /* FIN by the sender. */ 213 [TCPFC_FIN] = NPF_TCPS_FIN_SENT, 214 }, 215 [NPF_FLOW_BACK] = { 216 [TCPFC_ACK] = NPF_TCPS_OK, 217 /* FIN by the receiver. */ 218 [TCPFC_FIN] = NPF_TCPS_FIN_RECEIVED, 219 }, 220 }, 221 [NPF_TCPS_FIN_SENT] = { 222 [NPF_FLOW_FORW] = { 223 /* FIN may be re-transmitted. Late ACK as well. */ 224 [TCPFC_ACK] = NPF_TCPS_OK, 225 [TCPFC_FIN] = NPF_TCPS_OK, 226 }, 227 [NPF_FLOW_BACK] = { 228 /* If ACK, connection is half-closed now. */ 229 [TCPFC_ACK] = NPF_TCPS_FIN_WAIT, 230 /* FIN or FIN-ACK race - immediate closing. */ 231 [TCPFC_FIN] = NPF_TCPS_CLOSING, 232 }, 233 }, 234 [NPF_TCPS_FIN_RECEIVED] = { 235 /* 236 * FIN was received. Equivalent scenario to sent FIN. 237 */ 238 [NPF_FLOW_FORW] = { 239 [TCPFC_ACK] = NPF_TCPS_CLOSE_WAIT, 240 [TCPFC_FIN] = NPF_TCPS_CLOSING, 241 }, 242 [NPF_FLOW_BACK] = { 243 [TCPFC_ACK] = NPF_TCPS_OK, 244 [TCPFC_FIN] = NPF_TCPS_OK, 245 }, 246 }, 247 [NPF_TCPS_CLOSE_WAIT] = { 248 /* Sender has sent the FIN and closed its end. */ 249 [NPF_FLOW_FORW] = { 250 [TCPFC_ACK] = NPF_TCPS_OK, 251 [TCPFC_FIN] = NPF_TCPS_LAST_ACK, 252 }, 253 [NPF_FLOW_BACK] = { 254 [TCPFC_ACK] = NPF_TCPS_OK, 255 [TCPFC_FIN] = NPF_TCPS_LAST_ACK, 256 }, 257 }, 258 [NPF_TCPS_FIN_WAIT] = { 259 /* Receiver has closed its end. */ 260 [NPF_FLOW_FORW] = { 261 [TCPFC_ACK] = NPF_TCPS_OK, 262 [TCPFC_FIN] = NPF_TCPS_LAST_ACK, 263 }, 264 [NPF_FLOW_BACK] = { 265 [TCPFC_ACK] = NPF_TCPS_OK, 266 [TCPFC_FIN] = NPF_TCPS_LAST_ACK, 267 }, 268 }, 269 [NPF_TCPS_CLOSING] = { 270 /* Race of FINs - expecting ACK. */ 271 [NPF_FLOW_FORW] = { 272 [TCPFC_ACK] = NPF_TCPS_LAST_ACK, 273 }, 274 [NPF_FLOW_BACK] = { 275 [TCPFC_ACK] = NPF_TCPS_LAST_ACK, 276 }, 277 }, 278 [NPF_TCPS_LAST_ACK] = { 279 /* FINs exchanged - expecting last ACK. */ 280 [NPF_FLOW_FORW] = { 281 [TCPFC_ACK] = NPF_TCPS_TIME_WAIT, 282 }, 283 [NPF_FLOW_BACK] = { 284 [TCPFC_ACK] = NPF_TCPS_TIME_WAIT, 285 }, 286 }, 287 [NPF_TCPS_TIME_WAIT] = { 288 /* May re-open the connection as per RFC 1122. */ 289 [NPF_FLOW_FORW] = { 290 [TCPFC_SYN] = NPF_TCPS_SYN_SENT, 291 }, 292 }, 293 }; 294 295 /* 296 * npf_tcp_inwindow: determine whether the packet is in the TCP window 297 * and thus part of the connection we are tracking. 298 */ 299 static bool 300 npf_tcp_inwindow(npf_cache_t *npc, npf_state_t *nst, const int di) 301 { 302 const struct tcphdr * const th = npc->npc_l4.tcp; 303 const int tcpfl = th->th_flags; 304 npf_tcpstate_t *fstate, *tstate; 305 int tcpdlen, ackskew; 306 tcp_seq seq, ack, end; 307 uint32_t win; 308 309 KASSERT(npf_iscached(npc, NPC_TCP)); 310 KASSERT(di == NPF_FLOW_FORW || di == NPF_FLOW_BACK); 311 312 /* 313 * Perform SEQ/ACK numbers check against boundaries. Reference: 314 * 315 * Rooij G., "Real stateful TCP packet filtering in IP Filter", 316 * 10th USENIX Security Symposium invited talk, Aug. 2001. 317 * 318 * There are four boundaries defined as following: 319 * I) SEQ + LEN <= MAX { SND.ACK + MAX(SND.WIN, 1) } 320 * II) SEQ >= MAX { SND.SEQ + SND.LEN - MAX(RCV.WIN, 1) } 321 * III) ACK <= MAX { RCV.SEQ + RCV.LEN } 322 * IV) ACK >= MAX { RCV.SEQ + RCV.LEN } - MAXACKWIN 323 * 324 * Let these members of npf_tcpstate_t be the maximum seen values of: 325 * nst_end - SEQ + LEN 326 * nst_maxend - ACK + MAX(WIN, 1) 327 * nst_maxwin - MAX(WIN, 1) 328 */ 329 330 tcpdlen = npf_tcpsaw(__UNCONST(npc), &seq, &ack, &win); 331 end = seq + tcpdlen; 332 if (tcpfl & TH_SYN) { 333 end++; 334 } 335 if (tcpfl & TH_FIN) { 336 end++; 337 } 338 339 fstate = &nst->nst_tcpst[di]; 340 tstate = &nst->nst_tcpst[!di]; 341 win = win ? (win << fstate->nst_wscale) : 1; 342 343 /* 344 * Initialise if the first packet. 345 * Note: only case when nst_maxwin is zero. 346 */ 347 if (__predict_false(fstate->nst_maxwin == 0)) { 348 /* 349 * Normally, it should be the first SYN or a re-transmission 350 * of SYN. The state of the other side will get set with a 351 * SYN-ACK reply (see below). 352 */ 353 fstate->nst_end = end; 354 fstate->nst_maxend = end; 355 fstate->nst_maxwin = win; 356 tstate->nst_end = 0; 357 tstate->nst_maxend = 0; 358 tstate->nst_maxwin = 1; 359 360 /* 361 * Handle TCP Window Scaling (RFC 1323). Both sides may 362 * send this option in their SYN packets. 363 */ 364 fstate->nst_wscale = 0; 365 (void)npf_fetch_tcpopts(npc, NULL, &fstate->nst_wscale); 366 367 tstate->nst_wscale = 0; 368 369 /* Done. */ 370 return true; 371 } 372 373 if (fstate->nst_end == 0) { 374 /* 375 * Should be a SYN-ACK reply to SYN. If SYN is not set, 376 * then we are in the middle of connection and lost tracking. 377 */ 378 fstate->nst_end = end; 379 fstate->nst_maxend = end + 1; 380 fstate->nst_maxwin = win; 381 fstate->nst_wscale = 0; 382 383 /* Handle TCP Window Scaling (must be ignored if no SYN). */ 384 if (tcpfl & TH_SYN) { 385 (void)npf_fetch_tcpopts(npc, NULL, &fstate->nst_wscale); 386 } 387 } 388 389 if ((tcpfl & TH_ACK) == 0) { 390 /* Pretend that an ACK was sent. */ 391 ack = tstate->nst_end; 392 } else if ((tcpfl & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST) && ack == 0) { 393 /* Workaround for some TCP stacks. */ 394 ack = tstate->nst_end; 395 } 396 397 if (__predict_false(tcpfl & TH_RST)) { 398 /* RST to the initial SYN may have zero SEQ - fix it up. */ 399 if (seq == 0 && nst->nst_state == NPF_TCPS_SYN_SENT) { 400 end = fstate->nst_end; 401 seq = end; 402 } 403 404 /* Strict in-order sequence for RST packets (RFC 5961). */ 405 if (npf_strict_order_rst && (fstate->nst_end - seq) > 1) { 406 return false; 407 } 408 } 409 410 /* 411 * Determine whether the data is within previously noted window, 412 * that is, upper boundary for valid data (I). 413 */ 414 if (!SEQ_LEQ(end, fstate->nst_maxend)) { 415 npf_stats_inc(npc->npc_ctx, NPF_STAT_INVALID_STATE_TCP1); 416 return false; 417 } 418 419 /* Lower boundary (II), which is no more than one window back. */ 420 if (!SEQ_GEQ(seq, fstate->nst_end - tstate->nst_maxwin)) { 421 npf_stats_inc(npc->npc_ctx, NPF_STAT_INVALID_STATE_TCP2); 422 return false; 423 } 424 425 /* 426 * Boundaries for valid acknowledgments (III, IV) - one predicted 427 * window up or down, since packets may be fragmented. 428 */ 429 ackskew = tstate->nst_end - ack; 430 if (ackskew < -NPF_TCP_MAXACKWIN || 431 ackskew > (NPF_TCP_MAXACKWIN << fstate->nst_wscale)) { 432 npf_stats_inc(npc->npc_ctx, NPF_STAT_INVALID_STATE_TCP3); 433 return false; 434 } 435 436 /* 437 * Packet has been passed. 438 * 439 * Negative ackskew might be due to fragmented packets. Since the 440 * total length of the packet is unknown - bump the boundary. 441 */ 442 443 if (ackskew < 0) { 444 tstate->nst_end = ack; 445 } 446 /* Keep track of the maximum window seen. */ 447 if (fstate->nst_maxwin < win) { 448 fstate->nst_maxwin = win; 449 } 450 if (SEQ_GT(end, fstate->nst_end)) { 451 fstate->nst_end = end; 452 } 453 /* Note the window for upper boundary. */ 454 if (SEQ_GEQ(ack + win, tstate->nst_maxend)) { 455 tstate->nst_maxend = ack + win; 456 } 457 return true; 458 } 459 460 /* 461 * npf_state_tcp: inspect TCP segment, determine whether it belongs to 462 * the connection and track its state. 463 */ 464 bool 465 npf_state_tcp(npf_cache_t *npc, npf_state_t *nst, int di) 466 { 467 const struct tcphdr * const th = npc->npc_l4.tcp; 468 const u_int tcpfl = th->th_flags, state = nst->nst_state; 469 u_int nstate; 470 471 KASSERT(nst->nst_state < NPF_TCP_NSTATES); 472 473 /* Look for a transition to a new state. */ 474 if (__predict_true((tcpfl & TH_RST) == 0)) { 475 const u_int flagcase = npf_tcpfl2case(tcpfl); 476 nstate = npf_tcp_fsm[state][di][flagcase]; 477 } else if (state == NPF_TCPS_TIME_WAIT) { 478 /* Prevent TIME-WAIT assassination (RFC 1337). */ 479 nstate = NPF_TCPS_OK; 480 } else { 481 nstate = NPF_TCPS_CLOSED; 482 } 483 484 /* Determine whether TCP packet really belongs to this connection. */ 485 if (!npf_tcp_inwindow(npc, nst, di)) { 486 return false; 487 } 488 if (__predict_true(nstate == NPF_TCPS_OK)) { 489 return true; 490 } 491 492 nst->nst_state = nstate; 493 return true; 494 } 495 496 int 497 npf_state_tcp_timeout(const npf_state_t *nst) 498 { 499 const u_int state = nst->nst_state; 500 501 KASSERT(state < NPF_TCP_NSTATES); 502 return npf_tcp_timeouts[state]; 503 } 504