1 /* $NetBSD: npf_state.c,v 1.2 2010/12/18 01:07:25 rmind Exp $ */ 2 3 /*- 4 * Copyright (c) 2010 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This material is based upon work partially supported by The 8 * NetBSD Foundation under a contract with Mindaugas Rasiukevicius. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * NPF state engine to track connections. 34 */ 35 36 #include <sys/cdefs.h> 37 __KERNEL_RCSID(0, "$NetBSD: npf_state.c,v 1.2 2010/12/18 01:07:25 rmind Exp $"); 38 39 #include <sys/param.h> 40 #include <sys/systm.h> 41 42 #include <sys/mutex.h> 43 #include <netinet/in.h> 44 #include <netinet/tcp.h> 45 #include <netinet/tcp_seq.h> 46 47 #include "npf_impl.h" 48 49 #define MAXACKWINDOW 66000 50 51 /* Session expiration table. XXX revisit later */ 52 static const u_int expire_table[ ] = { 53 [IPPROTO_TCP] = 86400, /* 24 hours */ 54 [IPPROTO_UDP] = 120, /* 2 min */ 55 [IPPROTO_ICMP] = 30 /* 1 min */ 56 }; 57 58 static bool 59 npf_tcp_inwindow(const npf_cache_t *npc, nbuf_t *nbuf, npf_state_t *nst, 60 const bool forw) 61 { 62 const struct tcphdr *th = &npc->npc_l4.tcp; 63 const int tcpfl = th->th_flags; 64 npf_tcpstate_t *fstate, *tstate; 65 int tcpdlen, wscale, ackskew; 66 tcp_seq seq, ack, end; 67 uint32_t win; 68 69 KASSERT(npf_iscached(npc, NPC_TCP)); 70 tcpdlen = npf_tcpsaw(__UNCONST(npc), &seq, &ack, &win); 71 end = seq + tcpdlen; 72 if (tcpfl & TH_SYN) { 73 end++; 74 } 75 if (tcpfl & TH_FIN) { 76 end++; 77 } 78 79 /* 80 * Perform SEQ/ACK numbers check against boundaries. Reference: 81 * 82 * Rooij G., "Real stateful TCP packet filtering in IP Filter", 83 * 10th USENIX Security Symposium invited talk, Aug. 2001. 84 */ 85 86 fstate = &nst->nst_tcpst[forw ? 0 : 1]; 87 tstate = &nst->nst_tcpst[forw ? 1 : 0]; 88 win = win ? (win << fstate->nst_wscale) : 1; 89 90 if (tcpfl == TH_SYN) { 91 /* 92 * First SYN or re-transmission of SYN. Initialize all 93 * values. State of other side will get set with a SYN-ACK 94 * reply (see below). 95 */ 96 fstate->nst_seqend = end; 97 fstate->nst_ackend = end; 98 fstate->nst_maxwin = win; 99 tstate->nst_ackend = 0; 100 tstate->nst_ackend = 0; 101 tstate->nst_maxwin = 0; 102 /* 103 * Handle TCP Window Scaling (RFC 1323). Both sides may 104 * send this option in their SYN packets. 105 */ 106 if (npf_fetch_tcpopts(npc, nbuf, NULL, &wscale)) { 107 fstate->nst_wscale = wscale; 108 } else { 109 fstate->nst_wscale = 0; 110 } 111 tstate->nst_wscale = 0; 112 /* Done. */ 113 return true; 114 } 115 if (fstate->nst_seqend == 0) { 116 /* 117 * Should be a SYN-ACK reply to SYN. If SYN is not set, 118 * then we are in the middle connection and lost tracking. 119 */ 120 fstate->nst_seqend = end; 121 fstate->nst_ackend = end + 1; 122 fstate->nst_maxwin = 1; 123 124 /* Handle TCP Window Scaling (must be ignored if no SYN). */ 125 if (tcpfl & TH_SYN) { 126 fstate->nst_wscale = 127 npf_fetch_tcpopts(npc, nbuf, NULL, &wscale) ? 128 wscale : 0; 129 } 130 } 131 if ((tcpfl & TH_ACK) == 0) { 132 /* Pretend that an ACK was sent. */ 133 ack = tstate->nst_seqend; 134 } else if ((tcpfl & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST) && ack == 0) { 135 /* Workaround for some TCP stacks. */ 136 ack = tstate->nst_seqend; 137 } 138 if (seq == end) { 139 /* If packet contains no data - assume it is valid. */ 140 end = fstate->nst_seqend; 141 seq = end; 142 } 143 144 /* 145 * Determine whether the data is within previously noted window, 146 * that is, upper boundary for valid data (I). 147 */ 148 if (!SEQ_GEQ(fstate->nst_ackend, end)) { 149 npf_stats_inc(NPF_STAT_INVALID_STATE_TCP1); 150 return false; 151 } 152 /* Lower boundary (II), which is no more than one window back. */ 153 if (!SEQ_GEQ(seq, fstate->nst_seqend - tstate->nst_maxwin)) { 154 npf_stats_inc(NPF_STAT_INVALID_STATE_TCP2); 155 return false; 156 } 157 /* 158 * Boundaries for valid acknowledgments (III, IV) - on predicted 159 * window up or down, since packets may be fragmented. 160 */ 161 ackskew = tstate->nst_seqend - ack; 162 if (ackskew < -MAXACKWINDOW || ackskew > MAXACKWINDOW) { 163 npf_stats_inc(NPF_STAT_INVALID_STATE_TCP3); 164 return false; 165 } 166 167 /* 168 * Packet is passed now. 169 * 170 * Negative ackskew might be due to fragmented packets. Since the 171 * total length of the packet is unknown - bump the boundary. 172 */ 173 if (ackskew < 0) { 174 tstate->nst_seqend = end; 175 } 176 /* Keep track of the maximum window seen. */ 177 if (fstate->nst_maxwin < win) { 178 fstate->nst_maxwin = win; 179 } 180 if (SEQ_GT(end, fstate->nst_seqend)) { 181 fstate->nst_seqend = end; 182 } 183 /* Note the window for upper boundary. */ 184 if (SEQ_GEQ(ack + win, tstate->nst_ackend)) { 185 tstate->nst_ackend = ack + win; 186 } 187 return true; 188 } 189 190 static inline bool 191 npf_state_tcp(const npf_cache_t *npc, nbuf_t *nbuf, npf_state_t *nst, 192 const bool forw) 193 { 194 const struct tcphdr *th = &npc->npc_l4.tcp; 195 const int tcpfl = th->th_flags; 196 int nstate = 0; 197 198 /* 199 * Handle 3-way handshake (SYN -> SYN,ACK -> ACK). 200 */ 201 switch (nst->nst_state) { 202 case ST_ESTABLISHED: 203 /* Common case - connection established. */ 204 if (__predict_false(tcpfl & (TH_FIN | TH_RST))) { 205 /* Handle connection closure (FIN or RST). */ 206 nstate = ST_CLOSING; 207 } 208 break; 209 case ST_OPENING: 210 /* SYN has been sent, expecting SYN-ACK. */ 211 if (tcpfl == (TH_SYN | TH_ACK) && !forw) { 212 /* Received backwards SYN-ACK. */ 213 nstate = ST_ACKNOWLEDGE; 214 } else if (tcpfl == TH_SYN && forw) { 215 /* Re-transmission of SYN. */ 216 } else { 217 return false; 218 } 219 break; 220 case ST_ACKNOWLEDGE: 221 /* SYN-ACK was seen, expecting ACK. */ 222 if (tcpfl == TH_ACK && forw) { 223 nstate = ST_ESTABLISHED; 224 } else { 225 return false; 226 } 227 break; 228 case ST_CLOSING: 229 /* XXX TODO */ 230 break; 231 default: 232 npf_state_dump(nst); 233 KASSERT(false); 234 } 235 #if 0 236 if (!npf_tcp_inwindow(npc, nbuf, nst, forw)) { 237 return false; 238 } 239 #endif 240 if (__predict_false(nstate)) { 241 nst->nst_state = nstate; 242 } 243 return true; 244 } 245 246 bool 247 npf_state_init(const npf_cache_t *npc, nbuf_t *nbuf, npf_state_t *nst) 248 { 249 const int proto = npf_cache_ipproto(npc); 250 251 KASSERT(npf_iscached(npc, NPC_IP46 | NPC_LAYER4)); 252 253 mutex_init(&nst->nst_lock, MUTEX_DEFAULT, IPL_SOFTNET); 254 nst->nst_state = ST_OPENING; 255 256 if (proto == IPPROTO_TCP) { 257 const struct tcphdr *th = &npc->npc_l4.tcp; 258 /* TCP case: must be SYN. */ 259 KASSERT(npf_iscached(npc, NPC_TCP)); 260 if (th->th_flags != TH_SYN) { 261 npf_stats_inc(NPF_STAT_INVALID_STATE); 262 return false; 263 } 264 /* Initial values for TCP window and sequence tracking. */ 265 if (!npf_tcp_inwindow(npc, nbuf, nst, true)) { 266 npf_stats_inc(NPF_STAT_INVALID_STATE); 267 return false; 268 } 269 } 270 return true; 271 } 272 273 void 274 npf_state_destroy(npf_state_t *nst) 275 { 276 277 KASSERT(nst->nst_state != 0); 278 mutex_destroy(&nst->nst_lock); 279 } 280 281 bool 282 npf_state_inspect(const npf_cache_t *npc, nbuf_t *nbuf, 283 npf_state_t *nst, const bool forw) 284 { 285 const int proto = npf_cache_ipproto(npc); 286 bool ret; 287 288 mutex_enter(&nst->nst_lock); 289 switch (proto) { 290 case IPPROTO_TCP: 291 /* Handle TCP. */ 292 ret = npf_state_tcp(npc, nbuf, nst, forw); 293 break; 294 default: 295 /* Handle UDP or ICMP response for opening session. */ 296 if (nst->nst_state == ST_OPENING && !forw) { 297 nst->nst_state = ST_ESTABLISHED; 298 } 299 ret = true; 300 } 301 mutex_exit(&nst->nst_lock); 302 if (__predict_false(!ret)) { 303 npf_stats_inc(NPF_STAT_INVALID_STATE); 304 } 305 return ret; 306 } 307 308 int 309 npf_state_etime(const npf_state_t *nst, const int proto) 310 { 311 312 if (nst->nst_state == ST_ESTABLISHED) { 313 return expire_table[proto]; 314 } 315 return 10; /* XXX TODO */ 316 } 317 318 #if defined(DDB) || defined(_NPF_TESTING) 319 320 void 321 npf_state_dump(npf_state_t *nst) 322 { 323 npf_tcpstate_t *fst = &nst->nst_tcpst[0], *tst = &nst->nst_tcpst[1]; 324 325 printf("\tstate (%p) %d:\n\t\t" 326 "F { seqend %u ackend %u mwin %u wscale %u }\n\t\t" 327 "T { seqend %u, ackend %u mwin %u wscale %u }\n", 328 nst, nst->nst_state, 329 fst->nst_seqend, fst->nst_ackend, fst->nst_maxwin, fst->nst_wscale, 330 tst->nst_seqend, tst->nst_ackend, tst->nst_maxwin, tst->nst_wscale 331 ); 332 } 333 334 #endif 335