1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 #include <sys/types.h>
28 #include <sys/conf.h>
29 #include <sys/time.h>
30 #include <sys/taskq.h>
31 #include <sys/cmn_err.h>
32 #include <sys/sdt.h>
33 #include <sys/atomic.h>
34 #include <netinet/in.h>
35 #include <inet/ip.h>
36 #include <inet/ip6.h>
37 #include <inet/tcp.h>
38 #include <inet/udp_impl.h>
39 #include <inet/ilb.h>
40
41 #include "ilb_stack.h"
42 #include "ilb_impl.h"
43 #include "ilb_conn.h"
44 #include "ilb_nat.h"
45
46 /*
47 * Timer struct for ilb_conn_t and ilb_sticky_t garbage collection
48 *
49 * start: starting index into the hash table to do gc
50 * end: ending index into the hash table to do gc
51 * ilbs: pointer to the ilb_stack_t of the IP stack
52 * tid_lock: mutex to protect the timer id.
53 * tid: timer id of the timer
54 */
typedef struct ilb_timer_s {
	uint32_t start;		/* First conn/sticky hash bucket to gc */
	uint32_t end;		/* One past the last bucket to gc */
	ilb_stack_t *ilbs;	/* IP stack instance this timer serves */
	kmutex_t tid_lock;	/* Protects tid below */
	timeout_id_t tid;	/* Timeout id; 0 => do not re-arm (teardown) */
} ilb_timer_t;
62
/*
 * Hash macro for finding the index to the conn hash table.
 *
 * saddr/daddr point at the low four bytes of the (IPv6 or v4-mapped)
 * addresses (callers pass &s6_addr32[3]); the bytes are xor'ed pairwise
 * and mixed with the ports.  hash_size must be a power of 2 since the
 * result is masked with (hash_size - 1).
 */
#define	ILB_CONN_HASH(saddr, sport, daddr, dport, hash_size)	\
	(((*((saddr) + 3) ^ *((daddr) + 3)) * 50653 +		\
	(*((saddr) + 2) ^ *((daddr) + 2)) * 1369 +		\
	(*((saddr) + 1) ^ *((daddr) + 1)) * 37 +		\
	(*(saddr) ^ *(daddr)) + (sport) * 37 + (dport)) &	\
	((hash_size) - 1))
70
/* Kmem cache for the conn hash entry */
static struct kmem_cache *ilb_conn_cache = NULL;

/*
 * There are 60 timers running to do conn cache garbage collection. Each
 * gc thread is responsible for 1/60 of the conn hash table.
 */
static int ilb_conn_timer_size = 60;

/* Each of the above gc timers wake up every 15s to do the gc. */
static int ilb_conn_cache_timeout = 15;

/*
 * Hash macro for the sticky table: mixes the client address bytes with
 * the bytes of the 32-bit "rule" value.  As with ILB_CONN_HASH,
 * hash_size must be a power of 2.
 */
#define	ILB_STICKY_HASH(saddr, rule, hash_size)			\
	(((*((saddr) + 3) ^ ((rule) >> 24)) * 29791 +		\
	(*((saddr) + 2) ^ ((rule) >> 16)) * 961 +		\
	(*((saddr) + 1) ^ ((rule) >> 8)) * 31 +			\
	(*(saddr) ^ (rule))) & ((hash_size) - 1))

/* Kmem cache for the sticky hash entry */
static struct kmem_cache *ilb_sticky_cache = NULL;

/*
 * There are 60 timers running to do sticky cache garbage collection. Each
 * gc thread is responsible for 1/60 of the sticky hash table.
 */
static int ilb_sticky_timer_size = 60;

/* Each of the above gc timers wake up every 15s to do the gc. */
static int ilb_sticky_timeout = 15;
99
/*
 * Release a reference on a sticky entry and stamp its last-access time
 * so the sticky gc timer knows when the entry became idle.
 *
 * Wrapped in do { } while (0) so the macro acts as a single statement
 * (safe in unbraced if/else bodies), and every use of the argument is
 * parenthesized for macro hygiene (the mutex_exit() line previously
 * used a bare `s').
 */
#define	ILB_STICKY_REFRELE(s)				\
do {							\
	mutex_enter(&(s)->hash->sticky_lock);		\
	(s)->refcnt--;					\
	(s)->atime = ddi_get_lbolt64();			\
	mutex_exit(&(s)->hash->sticky_lock);		\
} while (0)
107
108
static void
ilb_conn_cache_init(void)
{
	/*
	 * Create the kmem cache for ilb_conn_t entries.  No constructor/
	 * destructor/reclaim callbacks are needed; ilb_kmem_flags supplies
	 * the stack-wide cache creation flags.
	 */
	ilb_conn_cache = kmem_cache_create("ilb_conn_cache",
	    sizeof (ilb_conn_t), 0, NULL, NULL, NULL, NULL, NULL,
	    ilb_kmem_flags);
}
116
117 void
ilb_conn_cache_fini(void)118 ilb_conn_cache_fini(void)
119 {
120 if (ilb_conn_cache != NULL) {
121 kmem_cache_destroy(ilb_conn_cache);
122 ilb_conn_cache = NULL;
123 }
124 }
125
/*
 * Unlink connp from one of its two hash chains (c2s if c2s is B_TRUE,
 * s2c otherwise).  The corresponding bucket lock must be held by the
 * caller.  Only unlinks; does not free the entry.
 */
static void
ilb_conn_remove_common(ilb_conn_t *connp, boolean_t c2s)
{
	ilb_conn_hash_t *hash;
	ilb_conn_t **next, **prev;
	/*
	 * next_prev/prev_next point at the neighbors' back/forward links.
	 * Each is initialized only when the corresponding neighbor exists,
	 * and is dereferenced below only under that same condition.
	 */
	ilb_conn_t **next_prev, **prev_next;

	if (c2s) {
		hash = connp->conn_c2s_hash;
		ASSERT(MUTEX_HELD(&hash->ilb_conn_hash_lock));
		next = &connp->conn_c2s_next;
		prev = &connp->conn_c2s_prev;
		if (*next != NULL)
			next_prev = &(*next)->conn_c2s_prev;
		if (*prev != NULL)
			prev_next = &(*prev)->conn_c2s_next;
	} else {
		hash = connp->conn_s2c_hash;
		ASSERT(MUTEX_HELD(&hash->ilb_conn_hash_lock));
		next = &connp->conn_s2c_next;
		prev = &connp->conn_s2c_prev;
		if (*next != NULL)
			next_prev = &(*next)->conn_s2c_prev;
		if (*prev != NULL)
			prev_next = &(*prev)->conn_s2c_next;
	}

	if (hash->ilb_connp == connp) {
		/* connp is at the head of the bucket chain. */
		hash->ilb_connp = *next;
		if (*next != NULL)
			*next_prev = NULL;
	} else {
		/* Mid/tail of chain: splice neighbors together. */
		if (*prev != NULL)
			*prev_next = *next;
		if (*next != NULL)
			*next_prev = *prev;
	}
	ASSERT(hash->ilb_conn_cnt > 0);
	hash->ilb_conn_cnt--;

	*next = NULL;
	*prev = NULL;
}
169
/*
 * Remove connp from both hash tables and free it, releasing every
 * resource the entry holds: the NAT source port (for full-NAT rules),
 * the sticky reference (if any) and the server reference.  Both the
 * c2s and s2c bucket locks must be held by the caller.
 */
static void
ilb_conn_remove(ilb_conn_t *connp)
{
	ASSERT(MUTEX_HELD(&connp->conn_c2s_hash->ilb_conn_hash_lock));
	ilb_conn_remove_common(connp, B_TRUE);
	ASSERT(MUTEX_HELD(&connp->conn_s2c_hash->ilb_conn_hash_lock));
	ilb_conn_remove_common(connp, B_FALSE);

	if (connp->conn_rule_cache.topo == ILB_TOPO_IMPL_NAT) {
		in_port_t port;

		/* Return the NAT source port to its vmem arena. */
		port = ntohs(connp->conn_rule_cache.info.nat_sport);
		vmem_free(connp->conn_rule_cache.info.src_ent->nse_port_arena,
		    (void *)(uintptr_t)port, 1);
	}

	if (connp->conn_sticky != NULL)
		ILB_STICKY_REFRELE(connp->conn_sticky);
	ILB_SERVER_REFRELE(connp->conn_server);
	kmem_cache_free(ilb_conn_cache, connp);
}
191
192 /*
193 * Routine to do periodic garbage collection of conn hash entries. When
194 * a conn hash timer fires, it dispatches a taskq to call this function
195 * to do the gc. Note that each taskq is responisble for a portion of
196 * the table. The portion is stored in timer->start, timer->end.
197 */
198 static void
ilb_conn_cleanup(void * arg)199 ilb_conn_cleanup(void *arg)
200 {
201 ilb_timer_t *timer = (ilb_timer_t *)arg;
202 uint32_t i;
203 ilb_stack_t *ilbs;
204 ilb_conn_hash_t *c2s_hash, *s2c_hash;
205 ilb_conn_t *connp, *nxt_connp;
206 int64_t now;
207 int64_t expiry;
208 boolean_t die_now;
209
210 ilbs = timer->ilbs;
211 c2s_hash = ilbs->ilbs_c2s_conn_hash;
212 ASSERT(c2s_hash != NULL);
213
214 now = ddi_get_lbolt64();
215 for (i = timer->start; i < timer->end; i++) {
216 mutex_enter(&c2s_hash[i].ilb_conn_hash_lock);
217 if ((connp = c2s_hash[i].ilb_connp) == NULL) {
218 ASSERT(c2s_hash[i].ilb_conn_cnt == 0);
219 mutex_exit(&c2s_hash[i].ilb_conn_hash_lock);
220 continue;
221 }
222 do {
223 ASSERT(c2s_hash[i].ilb_conn_cnt > 0);
224 ASSERT(connp->conn_c2s_hash == &c2s_hash[i]);
225 nxt_connp = connp->conn_c2s_next;
226 expiry = now - SEC_TO_TICK(connp->conn_expiry);
227 if (connp->conn_server->iser_die_time != 0 &&
228 connp->conn_server->iser_die_time < now)
229 die_now = B_TRUE;
230 else
231 die_now = B_FALSE;
232 s2c_hash = connp->conn_s2c_hash;
233 mutex_enter(&s2c_hash->ilb_conn_hash_lock);
234
235 if (connp->conn_gc || die_now ||
236 (connp->conn_c2s_atime < expiry &&
237 connp->conn_s2c_atime < expiry)) {
238 /* Need to update the nat list cur_connp */
239 if (connp == ilbs->ilbs_conn_list_connp) {
240 ilbs->ilbs_conn_list_connp =
241 connp->conn_c2s_next;
242 }
243 ilb_conn_remove(connp);
244 goto nxt_connp;
245 }
246
247 if (connp->conn_l4 != IPPROTO_TCP)
248 goto nxt_connp;
249
250 /* Update and check TCP related conn info */
251 if (connp->conn_c2s_tcp_fin_sent &&
252 SEQ_GT(connp->conn_s2c_tcp_ack,
253 connp->conn_c2s_tcp_fss)) {
254 connp->conn_c2s_tcp_fin_acked = B_TRUE;
255 }
256 if (connp->conn_s2c_tcp_fin_sent &&
257 SEQ_GT(connp->conn_c2s_tcp_ack,
258 connp->conn_s2c_tcp_fss)) {
259 connp->conn_s2c_tcp_fin_acked = B_TRUE;
260 }
261 if (connp->conn_c2s_tcp_fin_acked &&
262 connp->conn_s2c_tcp_fin_acked) {
263 ilb_conn_remove(connp);
264 }
265 nxt_connp:
266 mutex_exit(&s2c_hash->ilb_conn_hash_lock);
267 connp = nxt_connp;
268 } while (connp != NULL);
269 mutex_exit(&c2s_hash[i].ilb_conn_hash_lock);
270 }
271 }
272
273 /* Conn hash timer routine. It dispatches a taskq and restart the timer */
274 static void
ilb_conn_timer(void * arg)275 ilb_conn_timer(void *arg)
276 {
277 ilb_timer_t *timer = (ilb_timer_t *)arg;
278
279 (void) taskq_dispatch(timer->ilbs->ilbs_conn_taskq, ilb_conn_cleanup,
280 arg, TQ_SLEEP);
281 mutex_enter(&timer->tid_lock);
282 if (timer->tid == 0) {
283 mutex_exit(&timer->tid_lock);
284 } else {
285 timer->tid = timeout(ilb_conn_timer, arg,
286 SEC_TO_TICK(ilb_conn_cache_timeout));
287 mutex_exit(&timer->tid_lock);
288 }
289 }
290
/*
 * Set up the conn hash machinery for an IP stack instance: the two hash
 * tables (c2s and s2c) and their bucket locks, the conn kmem cache, the
 * gc taskq, and the array of gc timers that each cover a slice of the
 * table.  Must be called from a context that can sleep.
 */
void
ilb_conn_hash_init(ilb_stack_t *ilbs)
{
	extern pri_t minclsyspri;
	int i, part;
	ilb_timer_t *tm;
	char tq_name[TASKQ_NAMELEN];

	/*
	 * If ilbs->ilbs_conn_hash_size is not a power of 2, bump it up to
	 * the next power of 2.
	 */
	if (ilbs->ilbs_conn_hash_size & (ilbs->ilbs_conn_hash_size - 1)) {
		for (i = 0; i < 31; i++) {
			if (ilbs->ilbs_conn_hash_size < (1 << i))
				break;
		}
		ilbs->ilbs_conn_hash_size = 1 << i;
	}

	/*
	 * Can sleep since this should be called when a rule is being added,
	 * hence we are not in interrupt context.
	 */
	ilbs->ilbs_c2s_conn_hash = kmem_zalloc(sizeof (ilb_conn_hash_t) *
	    ilbs->ilbs_conn_hash_size, KM_SLEEP);
	ilbs->ilbs_s2c_conn_hash = kmem_zalloc(sizeof (ilb_conn_hash_t) *
	    ilbs->ilbs_conn_hash_size, KM_SLEEP);

	for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) {
		mutex_init(&ilbs->ilbs_c2s_conn_hash[i].ilb_conn_hash_lock,
		    NULL, MUTEX_DEFAULT, NULL);
	}
	for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) {
		mutex_init(&ilbs->ilbs_s2c_conn_hash[i].ilb_conn_hash_lock,
		    NULL, MUTEX_DEFAULT, NULL);
	}

	/* The kmem cache is shared by all stack instances. */
	if (ilb_conn_cache == NULL)
		ilb_conn_cache_init();

	/* Name the taskq after this netstack so instances are telling apart. */
	(void) snprintf(tq_name, sizeof (tq_name), "ilb_conn_taskq_%p",
	    (void *)ilbs->ilbs_netstack);
	ASSERT(ilbs->ilbs_conn_taskq == NULL);
	ilbs->ilbs_conn_taskq = taskq_create(tq_name,
	    ilb_conn_timer_size * 2, minclsyspri, ilb_conn_timer_size,
	    ilb_conn_timer_size * 2, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);

	ASSERT(ilbs->ilbs_conn_timer_list == NULL);
	ilbs->ilbs_conn_timer_list = kmem_zalloc(sizeof (ilb_timer_t) *
	    ilb_conn_timer_size, KM_SLEEP);

	/*
	 * The hash table is divided in equal partition for those timers
	 * to do garbage collection.
	 */
	part = ilbs->ilbs_conn_hash_size / ilb_conn_timer_size + 1;
	for (i = 0; i < ilb_conn_timer_size; i++) {
		tm = ilbs->ilbs_conn_timer_list + i;
		tm->start = i * part;
		tm->end = i * part + part;
		if (tm->end > ilbs->ilbs_conn_hash_size)
			tm->end = ilbs->ilbs_conn_hash_size;
		tm->ilbs = ilbs;
		mutex_init(&tm->tid_lock, NULL, MUTEX_DEFAULT, NULL);
		/* Spread out the starting execution time of all the timers. */
		tm->tid = timeout(ilb_conn_timer, tm,
		    SEC_TO_TICK(ilb_conn_cache_timeout + i));
	}
}
361
362 void
ilb_conn_hash_fini(ilb_stack_t * ilbs)363 ilb_conn_hash_fini(ilb_stack_t *ilbs)
364 {
365 uint32_t i;
366 ilb_conn_t *connp;
367
368 if (ilbs->ilbs_c2s_conn_hash == NULL) {
369 ASSERT(ilbs->ilbs_s2c_conn_hash == NULL);
370 return;
371 }
372
373 /* Stop all the timers first. */
374 for (i = 0; i < ilb_conn_timer_size; i++) {
375 timeout_id_t tid;
376
377 /* Setting tid to 0 tells the timer handler not to restart. */
378 mutex_enter(&ilbs->ilbs_conn_timer_list[i].tid_lock);
379 tid = ilbs->ilbs_conn_timer_list[i].tid;
380 ilbs->ilbs_conn_timer_list[i].tid = 0;
381 mutex_exit(&ilbs->ilbs_conn_timer_list[i].tid_lock);
382 (void) untimeout(tid);
383 }
384 kmem_free(ilbs->ilbs_conn_timer_list, sizeof (ilb_timer_t) *
385 ilb_conn_timer_size);
386 taskq_destroy(ilbs->ilbs_conn_taskq);
387 ilbs->ilbs_conn_taskq = NULL;
388
389 /* Then remove all the conns. */
390 for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) {
391 while ((connp = ilbs->ilbs_s2c_conn_hash->ilb_connp) != NULL) {
392 ilbs->ilbs_s2c_conn_hash->ilb_connp =
393 connp->conn_s2c_next;
394 ILB_SERVER_REFRELE(connp->conn_server);
395 if (connp->conn_rule_cache.topo == ILB_TOPO_IMPL_NAT) {
396 ilb_nat_src_entry_t *ent;
397 in_port_t port;
398
399 /*
400 * src_ent will be freed in ilb_nat_src_fini().
401 */
402 port = ntohs(
403 connp->conn_rule_cache.info.nat_sport);
404 ent = connp->conn_rule_cache.info.src_ent;
405 vmem_free(ent->nse_port_arena,
406 (void *)(uintptr_t)port, 1);
407 }
408 kmem_cache_free(ilb_conn_cache, connp);
409 }
410 }
411 kmem_free(ilbs->ilbs_c2s_conn_hash, sizeof (ilb_conn_hash_t) *
412 ilbs->ilbs_conn_hash_size);
413 kmem_free(ilbs->ilbs_s2c_conn_hash, sizeof (ilb_conn_hash_t) *
414 ilbs->ilbs_conn_hash_size);
415 }
416
417 /*
418 * Internet checksum adjustment calculation routines. We pre-calculate
419 * checksum adjustment so that we don't need to compute the checksum on
420 * the whole packet when we change address/port in the packet.
421 */
422
/*
 * Compute the checksum adjustment for a half-NAT IPv4 rewrite: the old
 * address (two 16-bit halves) plus old port are summed and one's-
 * complement folded, then the complement is combined with the new
 * address halves and new port.  The result may still carry above 16
 * bits; the consumer folds it when applying the adjustment.
 */
static void
hnat_cksum_v4(uint16_t *oaddr, uint16_t *naddr, in_port_t old_port,
    in_port_t new_port, uint32_t *adj_sum)
{
	uint32_t partial;

	partial = oaddr[0] + oaddr[1] + old_port;
	/* Fold any carries back into the low 16 bits. */
	while (partial > 0xffff)
		partial = (partial & 0xffff) + (partial >> 16);
	*adj_sum = (uint16_t)~partial + naddr[0] + naddr[1] + new_port;
}
434
/*
 * IPv6 variant of hnat_cksum_v4(): sums all eight 16-bit words of the
 * old address plus the old port, folds, and combines the complement
 * with the new address words and new port.
 */
static void
hnat_cksum_v6(uint16_t *oaddr, uint16_t *naddr, in_port_t old_port,
    in_port_t new_port, uint32_t *adj_sum)
{
	uint32_t acc = 0;
	int i;

	for (i = 0; i < 8; i++)
		acc += oaddr[i];
	acc += old_port;
	/* One's-complement fold of the old-value sum. */
	while (acc > 0xffff)
		acc = (acc & 0xffff) + (acc >> 16);
	acc = (uint16_t)~acc;
	for (i = 0; i < 8; i++)
		acc += naddr[i];
	*adj_sum = acc + new_port;
}
450
/*
 * Checksum adjustment for a full-NAT IPv4 rewrite, where both addresses
 * and both ports change: fold the sum of all old values, then add every
 * new value to the complement.
 */
static void
fnat_cksum_v4(uint16_t *oaddr1, uint16_t *oaddr2, uint16_t *naddr1,
    uint16_t *naddr2, in_port_t old_port1, in_port_t old_port2,
    in_port_t new_port1, in_port_t new_port2, uint32_t *adj_sum)
{
	uint32_t acc;

	acc = oaddr1[0] + oaddr1[1] + oaddr2[0] + oaddr2[1] +
	    old_port1 + old_port2;
	/* One's-complement fold of the old-value sum. */
	while (acc > 0xffff)
		acc = (acc & 0xffff) + (acc >> 16);
	*adj_sum = (uint16_t)~acc + naddr1[0] + naddr1[1] + naddr2[0] +
	    naddr2[1] + new_port1 + new_port2;
}
465
/*
 * IPv6 variant of fnat_cksum_v4(): sums all eight 16-bit words of both
 * old addresses plus both old ports, folds, and adds all new address
 * words and new ports to the complement.
 */
static void
fnat_cksum_v6(uint16_t *oaddr1, uint16_t *oaddr2, uint16_t *naddr1,
    uint16_t *naddr2, in_port_t old_port1, in_port_t old_port2,
    in_port_t new_port1, in_port_t new_port2, uint32_t *adj_sum)
{
	uint32_t acc = 0;
	int i;

	for (i = 0; i < 8; i++)
		acc += oaddr1[i] + oaddr2[i];
	acc += old_port1 + old_port2;
	/* One's-complement fold of the old-value sum. */
	while (acc > 0xffff)
		acc = (acc & 0xffff) + (acc >> 16);
	acc = (uint16_t)~acc;
	for (i = 0; i < 8; i++)
		acc += naddr1[i] + naddr2[i];
	*adj_sum = acc + new_port1 + new_port2;
}
488
489 /*
490 * Add a conn hash entry to the tables. Note that a conn hash entry
491 * (ilb_conn_t) contains info on both directions. And there are two hash
492 * tables, one for client to server and the other for server to client.
493 * So the same entry is added to both tables and can be ccessed by two
494 * thread simultaneously. But each thread will only access data on one
495 * direction, so there is no conflict.
496 */
497 int
ilb_conn_add(ilb_stack_t * ilbs,ilb_rule_t * rule,ilb_server_t * server,in6_addr_t * src,in_port_t sport,in6_addr_t * dst,in_port_t dport,ilb_nat_info_t * info,uint32_t * ip_sum,uint32_t * tp_sum,ilb_sticky_t * s)498 ilb_conn_add(ilb_stack_t *ilbs, ilb_rule_t *rule, ilb_server_t *server,
499 in6_addr_t *src, in_port_t sport, in6_addr_t *dst, in_port_t dport,
500 ilb_nat_info_t *info, uint32_t *ip_sum, uint32_t *tp_sum, ilb_sticky_t *s)
501 {
502 ilb_conn_t *connp;
503 ilb_conn_hash_t *hash;
504 int i;
505
506 connp = kmem_cache_alloc(ilb_conn_cache, KM_NOSLEEP);
507 if (connp == NULL) {
508 if (s != NULL) {
509 if (rule->ir_topo == ILB_TOPO_IMPL_NAT) {
510 ilb_nat_src_entry_t **entry;
511
512 entry = s->server->iser_nat_src->src_list;
513 vmem_free(entry[s->nat_src_idx]->nse_port_arena,
514 (void *)(uintptr_t)ntohs(info->nat_sport),
515 1);
516 }
517 ILB_STICKY_REFRELE(s);
518 }
519 return (ENOMEM);
520 }
521
522 connp->conn_l4 = rule->ir_proto;
523
524 connp->conn_server = server;
525 ILB_SERVER_REFHOLD(server);
526 connp->conn_sticky = s;
527
528 connp->conn_rule_cache.topo = rule->ir_topo;
529 connp->conn_rule_cache.info = *info;
530
531 connp->conn_gc = B_FALSE;
532
533 connp->conn_expiry = rule->ir_nat_expiry;
534 connp->conn_cr_time = ddi_get_lbolt64();
535
536 /* Client to server info. */
537 connp->conn_c2s_saddr = *src;
538 connp->conn_c2s_sport = sport;
539 connp->conn_c2s_daddr = *dst;
540 connp->conn_c2s_dport = dport;
541
542 connp->conn_c2s_atime = ddi_get_lbolt64();
543 /* The packet ths triggers this creation should be counted */
544 connp->conn_c2s_pkt_cnt = 1;
545 connp->conn_c2s_tcp_fin_sent = B_FALSE;
546 connp->conn_c2s_tcp_fin_acked = B_FALSE;
547
548 /* Server to client info, before NAT */
549 switch (rule->ir_topo) {
550 case ILB_TOPO_IMPL_HALF_NAT:
551 connp->conn_s2c_saddr = info->nat_dst;
552 connp->conn_s2c_sport = info->nat_dport;
553 connp->conn_s2c_daddr = *src;
554 connp->conn_s2c_dport = sport;
555
556 /* Pre-calculate checksum changes for both directions */
557 if (rule->ir_ipver == IPPROTO_IP) {
558 hnat_cksum_v4((uint16_t *)&dst->s6_addr32[3],
559 (uint16_t *)&info->nat_dst.s6_addr32[3], 0, 0,
560 &connp->conn_c2s_ip_sum);
561 hnat_cksum_v4((uint16_t *)&dst->s6_addr32[3],
562 (uint16_t *)&info->nat_dst.s6_addr32[3], dport,
563 info->nat_dport, &connp->conn_c2s_tp_sum);
564 *ip_sum = connp->conn_c2s_ip_sum;
565 *tp_sum = connp->conn_c2s_tp_sum;
566
567 hnat_cksum_v4(
568 (uint16_t *)&info->nat_dst.s6_addr32[3],
569 (uint16_t *)&dst->s6_addr32[3], 0, 0,
570 &connp->conn_s2c_ip_sum);
571 hnat_cksum_v4(
572 (uint16_t *)&info->nat_dst.s6_addr32[3],
573 (uint16_t *)&dst->s6_addr32[3],
574 info->nat_dport, dport,
575 &connp->conn_s2c_tp_sum);
576 } else {
577 connp->conn_c2s_ip_sum = 0;
578 hnat_cksum_v6((uint16_t *)dst,
579 (uint16_t *)&info->nat_dst, dport,
580 info->nat_dport, &connp->conn_c2s_tp_sum);
581 *ip_sum = 0;
582 *tp_sum = connp->conn_c2s_tp_sum;
583
584 connp->conn_s2c_ip_sum = 0;
585 hnat_cksum_v6((uint16_t *)&info->nat_dst,
586 (uint16_t *)dst, info->nat_dport, dport,
587 &connp->conn_s2c_tp_sum);
588 }
589 break;
590 case ILB_TOPO_IMPL_NAT:
591 connp->conn_s2c_saddr = info->nat_dst;
592 connp->conn_s2c_sport = info->nat_dport;
593 connp->conn_s2c_daddr = info->nat_src;
594 connp->conn_s2c_dport = info->nat_sport;
595
596 if (rule->ir_ipver == IPPROTO_IP) {
597 fnat_cksum_v4((uint16_t *)&src->s6_addr32[3],
598 (uint16_t *)&dst->s6_addr32[3],
599 (uint16_t *)&info->nat_src.s6_addr32[3],
600 (uint16_t *)&info->nat_dst.s6_addr32[3],
601 0, 0, 0, 0, &connp->conn_c2s_ip_sum);
602 fnat_cksum_v4((uint16_t *)&src->s6_addr32[3],
603 (uint16_t *)&dst->s6_addr32[3],
604 (uint16_t *)&info->nat_src.s6_addr32[3],
605 (uint16_t *)&info->nat_dst.s6_addr32[3],
606 sport, dport, info->nat_sport,
607 info->nat_dport, &connp->conn_c2s_tp_sum);
608 *ip_sum = connp->conn_c2s_ip_sum;
609 *tp_sum = connp->conn_c2s_tp_sum;
610
611 fnat_cksum_v4(
612 (uint16_t *)&info->nat_src.s6_addr32[3],
613 (uint16_t *)&info->nat_dst.s6_addr32[3],
614 (uint16_t *)&src->s6_addr32[3],
615 (uint16_t *)&dst->s6_addr32[3],
616 0, 0, 0, 0, &connp->conn_s2c_ip_sum);
617 fnat_cksum_v4(
618 (uint16_t *)&info->nat_src.s6_addr32[3],
619 (uint16_t *)&info->nat_dst.s6_addr32[3],
620 (uint16_t *)&src->s6_addr32[3],
621 (uint16_t *)&dst->s6_addr32[3],
622 info->nat_sport, info->nat_dport,
623 sport, dport, &connp->conn_s2c_tp_sum);
624 } else {
625 fnat_cksum_v6((uint16_t *)src, (uint16_t *)dst,
626 (uint16_t *)&info->nat_src,
627 (uint16_t *)&info->nat_dst,
628 sport, dport, info->nat_sport,
629 info->nat_dport, &connp->conn_c2s_tp_sum);
630 connp->conn_c2s_ip_sum = 0;
631 *ip_sum = 0;
632 *tp_sum = connp->conn_c2s_tp_sum;
633
634 fnat_cksum_v6((uint16_t *)&info->nat_src,
635 (uint16_t *)&info->nat_dst, (uint16_t *)src,
636 (uint16_t *)dst, info->nat_sport,
637 info->nat_dport, sport, dport,
638 &connp->conn_s2c_tp_sum);
639 connp->conn_s2c_ip_sum = 0;
640 }
641 break;
642 }
643
644 connp->conn_s2c_atime = ddi_get_lbolt64();
645 connp->conn_s2c_pkt_cnt = 1;
646 connp->conn_s2c_tcp_fin_sent = B_FALSE;
647 connp->conn_s2c_tcp_fin_acked = B_FALSE;
648
649 /* Add it to the s2c hash table. */
650 hash = ilbs->ilbs_s2c_conn_hash;
651 i = ILB_CONN_HASH((uint8_t *)&connp->conn_s2c_saddr.s6_addr32[3],
652 ntohs(connp->conn_s2c_sport),
653 (uint8_t *)&connp->conn_s2c_daddr.s6_addr32[3],
654 ntohs(connp->conn_s2c_dport), ilbs->ilbs_conn_hash_size);
655 connp->conn_s2c_hash = &hash[i];
656 DTRACE_PROBE2(ilb__conn__hash__add__s2c, ilb_conn_t *, connp, int, i);
657
658 mutex_enter(&hash[i].ilb_conn_hash_lock);
659 hash[i].ilb_conn_cnt++;
660 connp->conn_s2c_next = hash[i].ilb_connp;
661 if (hash[i].ilb_connp != NULL)
662 hash[i].ilb_connp->conn_s2c_prev = connp;
663 connp->conn_s2c_prev = NULL;
664 hash[i].ilb_connp = connp;
665 mutex_exit(&hash[i].ilb_conn_hash_lock);
666
667 /* Add it to the c2s hash table. */
668 hash = ilbs->ilbs_c2s_conn_hash;
669 i = ILB_CONN_HASH((uint8_t *)&src->s6_addr32[3], ntohs(sport),
670 (uint8_t *)&dst->s6_addr32[3], ntohs(dport),
671 ilbs->ilbs_conn_hash_size);
672 connp->conn_c2s_hash = &hash[i];
673 DTRACE_PROBE2(ilb__conn__hash__add__c2s, ilb_conn_t *, connp, int, i);
674
675 mutex_enter(&hash[i].ilb_conn_hash_lock);
676 hash[i].ilb_conn_cnt++;
677 connp->conn_c2s_next = hash[i].ilb_connp;
678 if (hash[i].ilb_connp != NULL)
679 hash[i].ilb_connp->conn_c2s_prev = connp;
680 connp->conn_c2s_prev = NULL;
681 hash[i].ilb_connp = connp;
682 mutex_exit(&hash[i].ilb_conn_hash_lock);
683
684 return (0);
685 }
686
687 /*
688 * If a connection is using TCP, we keep track of simple TCP state transition
689 * so that we know when to clean up an entry.
690 */
691 static boolean_t
update_conn_tcp(ilb_conn_t * connp,void * iph,tcpha_t * tcpha,int32_t pkt_len,boolean_t c2s)692 update_conn_tcp(ilb_conn_t *connp, void *iph, tcpha_t *tcpha, int32_t pkt_len,
693 boolean_t c2s)
694 {
695 uint32_t ack, seq;
696 int32_t seg_len;
697
698 if (tcpha->tha_flags & TH_RST)
699 return (B_FALSE);
700
701 seg_len = pkt_len - ((uint8_t *)tcpha - (uint8_t *)iph) -
702 TCP_HDR_LENGTH((tcph_t *)tcpha);
703
704 if (tcpha->tha_flags & TH_ACK)
705 ack = ntohl(tcpha->tha_ack);
706 seq = ntohl(tcpha->tha_seq);
707 if (c2s) {
708 ASSERT(MUTEX_HELD(&connp->conn_c2s_hash->ilb_conn_hash_lock));
709 if (tcpha->tha_flags & TH_FIN) {
710 connp->conn_c2s_tcp_fss = seq + seg_len;
711 connp->conn_c2s_tcp_fin_sent = B_TRUE;
712 }
713 connp->conn_c2s_tcp_ack = ack;
714
715 /* Port reuse by the client, restart the conn. */
716 if (connp->conn_c2s_tcp_fin_sent &&
717 SEQ_GT(seq, connp->conn_c2s_tcp_fss + 1)) {
718 connp->conn_c2s_tcp_fin_sent = B_FALSE;
719 connp->conn_c2s_tcp_fin_acked = B_FALSE;
720 }
721 } else {
722 ASSERT(MUTEX_HELD(&connp->conn_s2c_hash->ilb_conn_hash_lock));
723 if (tcpha->tha_flags & TH_FIN) {
724 connp->conn_s2c_tcp_fss = seq + seg_len;
725 connp->conn_s2c_tcp_fin_sent = B_TRUE;
726 }
727 connp->conn_s2c_tcp_ack = ack;
728
729 /* Port reuse by the client, restart the conn. */
730 if (connp->conn_s2c_tcp_fin_sent &&
731 SEQ_GT(seq, connp->conn_s2c_tcp_fss + 1)) {
732 connp->conn_s2c_tcp_fin_sent = B_FALSE;
733 connp->conn_s2c_tcp_fin_acked = B_FALSE;
734 }
735 }
736
737 return (B_TRUE);
738 }
739
740 /*
741 * Helper routint to find conn hash entry given some packet information and
742 * the traffic direction (c2s, client to server?)
743 */
744 static boolean_t
ilb_find_conn(ilb_stack_t * ilbs,void * iph,void * tph,int l4,in6_addr_t * src,in_port_t sport,in6_addr_t * dst,in_port_t dport,ilb_rule_info_t * rule_cache,uint32_t * ip_sum,uint32_t * tp_sum,int32_t pkt_len,boolean_t c2s)745 ilb_find_conn(ilb_stack_t *ilbs, void *iph, void *tph, int l4, in6_addr_t *src,
746 in_port_t sport, in6_addr_t *dst, in_port_t dport,
747 ilb_rule_info_t *rule_cache, uint32_t *ip_sum, uint32_t *tp_sum,
748 int32_t pkt_len, boolean_t c2s)
749 {
750 ilb_conn_hash_t *hash;
751 uint_t i;
752 ilb_conn_t *connp;
753 boolean_t tcp_alive;
754 boolean_t ret = B_FALSE;
755
756 i = ILB_CONN_HASH((uint8_t *)&src->s6_addr32[3], ntohs(sport),
757 (uint8_t *)&dst->s6_addr32[3], ntohs(dport),
758 ilbs->ilbs_conn_hash_size);
759 if (c2s) {
760 hash = ilbs->ilbs_c2s_conn_hash;
761 mutex_enter(&hash[i].ilb_conn_hash_lock);
762 for (connp = hash[i].ilb_connp; connp != NULL;
763 connp = connp->conn_c2s_next) {
764 if (connp->conn_l4 == l4 &&
765 connp->conn_c2s_dport == dport &&
766 connp->conn_c2s_sport == sport &&
767 IN6_ARE_ADDR_EQUAL(src, &connp->conn_c2s_saddr) &&
768 IN6_ARE_ADDR_EQUAL(dst, &connp->conn_c2s_daddr)) {
769 connp->conn_c2s_atime = ddi_get_lbolt64();
770 connp->conn_c2s_pkt_cnt++;
771 *rule_cache = connp->conn_rule_cache;
772 *ip_sum = connp->conn_c2s_ip_sum;
773 *tp_sum = connp->conn_c2s_tp_sum;
774 ret = B_TRUE;
775 break;
776 }
777 }
778 } else {
779 hash = ilbs->ilbs_s2c_conn_hash;
780 mutex_enter(&hash[i].ilb_conn_hash_lock);
781 for (connp = hash[i].ilb_connp; connp != NULL;
782 connp = connp->conn_s2c_next) {
783 if (connp->conn_l4 == l4 &&
784 connp->conn_s2c_dport == dport &&
785 connp->conn_s2c_sport == sport &&
786 IN6_ARE_ADDR_EQUAL(src, &connp->conn_s2c_saddr) &&
787 IN6_ARE_ADDR_EQUAL(dst, &connp->conn_s2c_daddr)) {
788 connp->conn_s2c_atime = ddi_get_lbolt64();
789 connp->conn_s2c_pkt_cnt++;
790 *rule_cache = connp->conn_rule_cache;
791 *ip_sum = connp->conn_s2c_ip_sum;
792 *tp_sum = connp->conn_s2c_tp_sum;
793 ret = B_TRUE;
794 break;
795 }
796 }
797 }
798 if (ret) {
799 ILB_S_KSTAT(connp->conn_server, pkt_processed);
800 ILB_S_KSTAT_UPDATE(connp->conn_server, bytes_processed,
801 pkt_len);
802
803 switch (l4) {
804 case (IPPROTO_TCP):
805 tcp_alive = update_conn_tcp(connp, iph, tph, pkt_len,
806 c2s);
807 if (!tcp_alive) {
808 connp->conn_gc = B_TRUE;
809 }
810 break;
811 default:
812 break;
813 }
814 }
815 mutex_exit(&hash[i].ilb_conn_hash_lock);
816
817 return (ret);
818 }
819
820 /*
821 * To check if a give packet matches an existing conn hash entry. If it
822 * does, return the information about this entry so that the caller can
823 * do the proper NAT.
824 */
825 boolean_t
ilb_check_conn(ilb_stack_t * ilbs,int l3,void * iph,int l4,void * tph,in6_addr_t * src,in6_addr_t * dst,in_port_t sport,in_port_t dport,uint32_t pkt_len,in6_addr_t * lb_dst)826 ilb_check_conn(ilb_stack_t *ilbs, int l3, void *iph, int l4, void *tph,
827 in6_addr_t *src, in6_addr_t *dst, in_port_t sport, in_port_t dport,
828 uint32_t pkt_len, in6_addr_t *lb_dst)
829 {
830 ilb_rule_info_t rule_cache;
831 uint32_t adj_ip_sum, adj_tp_sum;
832 boolean_t ret;
833
834 /* Check the incoming hash table. */
835 if (ilb_find_conn(ilbs, iph, tph, l4, src, sport, dst, dport,
836 &rule_cache, &adj_ip_sum, &adj_tp_sum, pkt_len, B_TRUE)) {
837 switch (rule_cache.topo) {
838 case ILB_TOPO_IMPL_NAT:
839 *lb_dst = rule_cache.info.nat_dst;
840 ilb_full_nat(l3, iph, l4, tph, &rule_cache.info,
841 adj_ip_sum, adj_tp_sum, B_TRUE);
842 ret = B_TRUE;
843 break;
844 case ILB_TOPO_IMPL_HALF_NAT:
845 *lb_dst = rule_cache.info.nat_dst;
846 ilb_half_nat(l3, iph, l4, tph, &rule_cache.info,
847 adj_ip_sum, adj_tp_sum, B_TRUE);
848 ret = B_TRUE;
849 break;
850 default:
851 ret = B_FALSE;
852 break;
853 }
854 return (ret);
855 }
856 if (ilb_find_conn(ilbs, iph, tph, l4, src, sport, dst, dport,
857 &rule_cache, &adj_ip_sum, &adj_tp_sum, pkt_len, B_FALSE)) {
858 switch (rule_cache.topo) {
859 case ILB_TOPO_IMPL_NAT:
860 *lb_dst = rule_cache.info.src;
861 ilb_full_nat(l3, iph, l4, tph, &rule_cache.info,
862 adj_ip_sum, adj_tp_sum, B_FALSE);
863 ret = B_TRUE;
864 break;
865 case ILB_TOPO_IMPL_HALF_NAT:
866 *lb_dst = *dst;
867 ilb_half_nat(l3, iph, l4, tph, &rule_cache.info,
868 adj_ip_sum, adj_tp_sum, B_FALSE);
869 ret = B_TRUE;
870 break;
871 default:
872 ret = B_FALSE;
873 break;
874 }
875 return (ret);
876 }
877
878 return (B_FALSE);
879 }
880
881 /*
882 * To check if an ICMP packet belongs to a connection in one of the conn
883 * hash entries.
884 */
885 boolean_t
ilb_check_icmp_conn(ilb_stack_t * ilbs,mblk_t * mp,int l3,void * out_iph,void * icmph,in6_addr_t * lb_dst)886 ilb_check_icmp_conn(ilb_stack_t *ilbs, mblk_t *mp, int l3, void *out_iph,
887 void *icmph, in6_addr_t *lb_dst)
888 {
889 ilb_conn_hash_t *hash;
890 ipha_t *in_iph4;
891 ip6_t *in_iph6;
892 icmph_t *icmph4;
893 icmp6_t *icmph6;
894 in6_addr_t *in_src_p, *in_dst_p;
895 in_port_t *sport, *dport;
896 int l4;
897 uint_t i;
898 ilb_conn_t *connp;
899 ilb_rule_info_t rule_cache;
900 uint32_t adj_ip_sum;
901 boolean_t full_nat;
902
903 if (l3 == IPPROTO_IP) {
904 in6_addr_t in_src, in_dst;
905
906 icmph4 = (icmph_t *)icmph;
907 in_iph4 = (ipha_t *)&icmph4[1];
908
909 if ((uint8_t *)in_iph4 + IPH_HDR_LENGTH(in_iph4) +
910 ICMP_MIN_TP_HDR_LEN > mp->b_wptr) {
911 return (B_FALSE);
912 }
913
914 IN6_IPADDR_TO_V4MAPPED(in_iph4->ipha_src, &in_src);
915 in_src_p = &in_src;
916 IN6_IPADDR_TO_V4MAPPED(in_iph4->ipha_dst, &in_dst);
917 in_dst_p = &in_dst;
918
919 l4 = in_iph4->ipha_protocol;
920 if (l4 != IPPROTO_TCP && l4 != IPPROTO_UDP)
921 return (B_FALSE);
922
923 sport = (in_port_t *)((char *)in_iph4 +
924 IPH_HDR_LENGTH(in_iph4));
925 dport = sport + 1;
926
927 DTRACE_PROBE4(ilb__chk__icmp__conn__v4, uint32_t,
928 in_iph4->ipha_src, uint32_t, in_iph4->ipha_dst, uint16_t,
929 ntohs(*sport), uint16_t, ntohs(*dport));
930 } else {
931 ASSERT(l3 == IPPROTO_IPV6);
932
933 icmph6 = (icmp6_t *)icmph;
934 in_iph6 = (ip6_t *)&icmph6[1];
935 in_src_p = &in_iph6->ip6_src;
936 in_dst_p = &in_iph6->ip6_dst;
937
938 if ((uint8_t *)in_iph6 + sizeof (ip6_t) +
939 ICMP_MIN_TP_HDR_LEN > mp->b_wptr) {
940 return (B_FALSE);
941 }
942
943 l4 = in_iph6->ip6_nxt;
944 /* We don't go deep inside an IPv6 packet yet. */
945 if (l4 != IPPROTO_TCP && l4 != IPPROTO_UDP)
946 return (B_FALSE);
947
948 sport = (in_port_t *)&in_iph6[1];
949 dport = sport + 1;
950
951 DTRACE_PROBE4(ilb__chk__icmp__conn__v6, in6_addr_t *,
952 &in_iph6->ip6_src, in6_addr_t *, &in_iph6->ip6_dst,
953 uint16_t, ntohs(*sport), uint16_t, ntohs(*dport));
954 }
955
956 i = ILB_CONN_HASH((uint8_t *)&in_dst_p->s6_addr32[3], ntohs(*dport),
957 (uint8_t *)&in_src_p->s6_addr32[3], ntohs(*sport),
958 ilbs->ilbs_conn_hash_size);
959 hash = ilbs->ilbs_c2s_conn_hash;
960
961 mutex_enter(&hash[i].ilb_conn_hash_lock);
962 for (connp = hash[i].ilb_connp; connp != NULL;
963 connp = connp->conn_c2s_next) {
964 if (connp->conn_l4 == l4 &&
965 connp->conn_c2s_dport == *sport &&
966 connp->conn_c2s_sport == *dport &&
967 IN6_ARE_ADDR_EQUAL(in_dst_p, &connp->conn_c2s_saddr) &&
968 IN6_ARE_ADDR_EQUAL(in_src_p, &connp->conn_c2s_daddr)) {
969 connp->conn_c2s_atime = ddi_get_lbolt64();
970 connp->conn_c2s_pkt_cnt++;
971 rule_cache = connp->conn_rule_cache;
972 adj_ip_sum = connp->conn_c2s_ip_sum;
973 break;
974 }
975 }
976 mutex_exit(&hash[i].ilb_conn_hash_lock);
977
978 if (connp == NULL) {
979 DTRACE_PROBE(ilb__chk__icmp__conn__failed);
980 return (B_FALSE);
981 }
982
983 switch (rule_cache.topo) {
984 case ILB_TOPO_IMPL_NAT:
985 full_nat = B_TRUE;
986 break;
987 case ILB_TOPO_IMPL_HALF_NAT:
988 full_nat = B_FALSE;
989 break;
990 default:
991 return (B_FALSE);
992 }
993
994 *lb_dst = rule_cache.info.nat_dst;
995 if (l3 == IPPROTO_IP) {
996 ilb_nat_icmpv4(mp, out_iph, icmph4, in_iph4, sport, dport,
997 &rule_cache.info, adj_ip_sum, full_nat);
998 } else {
999 ilb_nat_icmpv6(mp, out_iph, icmph6, in_iph6, sport, dport,
1000 &rule_cache.info, full_nat);
1001 }
1002 return (B_TRUE);
1003 }
1004
1005 /*
1006 * This routine sends up the conn hash table to user land. Note that the
1007 * request is an ioctl, hence we cannot really differentiate requests
1008 * from different clients. There is no context shared between different
1009 * ioctls. Here we make the assumption that the user land ilbd will
1010 * only allow one client to show the conn hash table at any time.
1011 * Otherwise, the results will be "very" inconsistent.
1012 *
 * In each ioctl, a flag (ILB_LIST_BEGIN) indicates whether the client wants
 * to read from the beginning of the table.  After a certain number of
 * entries are reported, the kernel remembers the position of the last
 * returned entry.  When the next ioctl comes in with the ILB_LIST_CONT
 * flag, it will return entries starting from where it was left off.  When
1018 * the end of table is reached, a flag (ILB_LIST_END) is set to tell
1019 * the client that there is no more entry.
1020 *
1021 * It is assumed that the caller has checked the size of nat so that it
1022 * can hold num entries.
1023 */
/* ARGSUSED */
int
ilb_list_nat(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_nat_entry_t *nat,
    uint32_t *num, uint32_t *flags)
{
	ilb_conn_hash_t *hash;
	ilb_conn_t *cur_connp;
	uint32_t i, j;
	int ret = 0;

	/*
	 * Serialize listers: only one thread may walk the conn hash at a
	 * time.  The busy flag (not the lock) is held across the walk, so
	 * packets can still be processed while we copy entries out.
	 */
	mutex_enter(&ilbs->ilbs_conn_list_lock);
	while (ilbs->ilbs_conn_list_busy) {
		/* cv_wait_sig() returns 0 when interrupted by a signal. */
		if (cv_wait_sig(&ilbs->ilbs_conn_list_cv,
		    &ilbs->ilbs_conn_list_lock) == 0) {
			mutex_exit(&ilbs->ilbs_conn_list_lock);
			return (EINTR);
		}
	}
	/* No conn hash table yet (no NAT rule ever added): empty result. */
	if ((hash = ilbs->ilbs_c2s_conn_hash) == NULL) {
		ASSERT(ilbs->ilbs_s2c_conn_hash == NULL);
		mutex_exit(&ilbs->ilbs_conn_list_lock);
		*num = 0;
		*flags |= ILB_LIST_END;
		return (0);
	}
	ilbs->ilbs_conn_list_busy = B_TRUE;
	mutex_exit(&ilbs->ilbs_conn_list_lock);

	if (*flags & ILB_LIST_BEGIN) {
		/* Start a fresh walk from bucket 0. */
		i = 0;
		mutex_enter(&hash[0].ilb_conn_hash_lock);
		cur_connp = hash[0].ilb_connp;
	} else if (*flags & ILB_LIST_CONT) {
		/* Resume from where the previous ioctl left off. */
		if (ilbs->ilbs_conn_list_cur == ilbs->ilbs_conn_hash_size) {
			/* Previous walk already reached the end. */
			*num = 0;
			*flags |= ILB_LIST_END;
			goto done;
		}
		i = ilbs->ilbs_conn_list_cur;
		mutex_enter(&hash[i].ilb_conn_hash_lock);
		cur_connp = ilbs->ilbs_conn_list_connp;
	} else {
		/* Caller must pass either BEGIN or CONT. */
		ret = EINVAL;
		goto done;
	}

	/*
	 * Copy up to *num entries, handing the bucket lock off as we cross
	 * from one bucket to the next.  Exactly one bucket lock is held at
	 * any point inside this loop.
	 */
	j = 0;
	while (j < *num) {
		if (cur_connp == NULL) {
			/* Current bucket exhausted; move to the next one. */
			mutex_exit(&hash[i].ilb_conn_hash_lock);
			if (++i == ilbs->ilbs_conn_hash_size) {
				*flags |= ILB_LIST_END;
				break;
			}
			mutex_enter(&hash[i].ilb_conn_hash_lock);
			cur_connp = hash[i].ilb_connp;
			continue;
		}
		nat[j].proto = cur_connp->conn_l4;

		nat[j].in_global = cur_connp->conn_c2s_daddr;
		nat[j].in_global_port = cur_connp->conn_c2s_dport;
		nat[j].out_global = cur_connp->conn_c2s_saddr;
		nat[j].out_global_port = cur_connp->conn_c2s_sport;

		nat[j].in_local = cur_connp->conn_s2c_saddr;
		nat[j].in_local_port = cur_connp->conn_s2c_sport;
		nat[j].out_local = cur_connp->conn_s2c_daddr;
		nat[j].out_local_port = cur_connp->conn_s2c_dport;

		nat[j].create_time = TICK_TO_MSEC(cur_connp->conn_cr_time);
		nat[j].last_access_time =
		    TICK_TO_MSEC(cur_connp->conn_c2s_atime);

		/*
		 * The conn_s2c_pkt_cnt may not be accurate since we are not
		 * holding the s2c hash lock.
		 */
		nat[j].pkt_cnt = cur_connp->conn_c2s_pkt_cnt +
		    cur_connp->conn_s2c_pkt_cnt;
		j++;

		cur_connp = cur_connp->conn_c2s_next;
	}
	/* Remember the resume point for the next ILB_LIST_CONT ioctl. */
	ilbs->ilbs_conn_list_connp = cur_connp;
	/*
	 * If the loop filled the caller's buffer (j == *num) we fell out
	 * still holding bucket i's lock; release it here.  The break path
	 * above has already dropped it.
	 */
	if (j == *num)
		mutex_exit(&hash[i].ilb_conn_hash_lock);

	ilbs->ilbs_conn_list_cur = i;

	*num = j;
done:
	/* Clear the busy flag and wake any waiting lister. */
	mutex_enter(&ilbs->ilbs_conn_list_lock);
	ilbs->ilbs_conn_list_busy = B_FALSE;
	cv_signal(&ilbs->ilbs_conn_list_cv);
	mutex_exit(&ilbs->ilbs_conn_list_lock);

	return (ret);
}
1123
1124
1125 /*
1126 * Stickiness (persistence) handling routines.
1127 */
1128
1129
static void
ilb_sticky_cache_init(void)
{
	/*
	 * Create the kmem cache used for all ilb_sticky_t allocations
	 * (see ilb_sticky_add()).  No constructor/destructor/reclaim
	 * callbacks are needed; every field is filled in at allocation
	 * time by ilb_sticky_add().
	 */
	ilb_sticky_cache = kmem_cache_create("ilb_sticky_cache",
	    sizeof (ilb_sticky_t), 0, NULL, NULL, NULL, NULL, NULL,
	    ilb_kmem_flags);
}
1137
1138 void
ilb_sticky_cache_fini(void)1139 ilb_sticky_cache_fini(void)
1140 {
1141 if (ilb_sticky_cache != NULL) {
1142 kmem_cache_destroy(ilb_sticky_cache);
1143 ilb_sticky_cache = NULL;
1144 }
1145 }
1146
void
ilb_sticky_refrele(ilb_sticky_t *s)
{
	/*
	 * Function wrapper around ILB_STICKY_REFRELE() so callers outside
	 * this file can drop a reference on a sticky entry without needing
	 * the macro's supporting definitions.
	 */
	ILB_STICKY_REFRELE(s);
}
1152
1153 static ilb_sticky_t *
ilb_sticky_lookup(ilb_sticky_hash_t * hash,ilb_rule_t * rule,in6_addr_t * src)1154 ilb_sticky_lookup(ilb_sticky_hash_t *hash, ilb_rule_t *rule, in6_addr_t *src)
1155 {
1156 ilb_sticky_t *s;
1157
1158 ASSERT(mutex_owned(&hash->sticky_lock));
1159
1160 for (s = list_head(&hash->sticky_head); s != NULL;
1161 s = list_next(&hash->sticky_head, s)) {
1162 if (s->rule_instance == rule->ir_ks_instance) {
1163 if (IN6_ARE_ADDR_EQUAL(src, &s->src))
1164 return (s);
1165 }
1166 }
1167 return (NULL);
1168 }
1169
1170 static ilb_sticky_t *
ilb_sticky_add(ilb_sticky_hash_t * hash,ilb_rule_t * rule,ilb_server_t * server,in6_addr_t * src)1171 ilb_sticky_add(ilb_sticky_hash_t *hash, ilb_rule_t *rule, ilb_server_t *server,
1172 in6_addr_t *src)
1173 {
1174 ilb_sticky_t *s;
1175
1176 ASSERT(mutex_owned(&hash->sticky_lock));
1177
1178 if ((s = kmem_cache_alloc(ilb_sticky_cache, KM_NOSLEEP)) == NULL)
1179 return (NULL);
1180
1181 /*
1182 * The rule instance is for handling the scenario when the same
1183 * client talks to different rules at the same time. Stickiness
1184 * is per rule so we can use the rule instance to differentiate
1185 * the client's request.
1186 */
1187 s->rule_instance = rule->ir_ks_instance;
1188 /*
1189 * Copy the rule name for listing all sticky cache entry. ir_name
1190 * is guaranteed to be NULL terminated.
1191 */
1192 (void) strcpy(s->rule_name, rule->ir_name);
1193 s->server = server;
1194
1195 /*
1196 * Grab a ref cnt on the server so that it won't go away while
1197 * it is still in the sticky table.
1198 */
1199 ILB_SERVER_REFHOLD(server);
1200 s->src = *src;
1201 s->expiry = rule->ir_sticky_expiry;
1202 s->refcnt = 1;
1203 s->hash = hash;
1204
1205 /*
1206 * There is no need to set atime here since the refcnt is not
1207 * zero. A sticky entry is removed only when the refcnt is
1208 * zero. But just set it here for debugging purpose. The
1209 * atime is set when a refrele is done on a sticky entry.
1210 */
1211 s->atime = ddi_get_lbolt64();
1212
1213 list_insert_head(&hash->sticky_head, s);
1214 hash->sticky_cnt++;
1215 return (s);
1216 }
1217
1218 /*
1219 * This routine checks if there is an existing sticky entry which matches
1220 * a given packet. If there is one, return it. If there is not, create
1221 * a sticky entry using the packet's info.
1222 */
ilb_server_t *
ilb_sticky_find_add(ilb_stack_t *ilbs, ilb_rule_t *rule, in6_addr_t *src,
    ilb_server_t *server, ilb_sticky_t **res, uint16_t *src_ent_idx)
{
	int i;
	ilb_sticky_hash_t *hash;
	ilb_sticky_t *s;

	ASSERT(server != NULL);

	*res = NULL;

	/*
	 * Hash on the low 32 bits of the (possibly v4-mapped) client
	 * address plus the rule pointer, so different rules for the same
	 * client spread across buckets.
	 */
	i = ILB_STICKY_HASH((uint8_t *)&src->s6_addr32[3],
	    (uint32_t)(uintptr_t)rule, ilbs->ilbs_sticky_hash_size);
	hash = &ilbs->ilbs_sticky_hash[i];

	/* First check if there is already an entry. */
	mutex_enter(&hash->sticky_lock);
	s = ilb_sticky_lookup(hash, rule, src);

	/* No sticky entry, add one. */
	if (s == NULL) {
add_new_entry:
		s = ilb_sticky_add(hash, rule, server, src);
		if (s == NULL) {
			/* Allocation failure; caller will drop the packet. */
			mutex_exit(&hash->sticky_lock);
			return (NULL);
		}
		/*
		 * Find a source for this server.  All subsequent requests
		 * from the same client matching this sticky entry will use
		 * this source address in doing NAT.  The current algorithm
		 * is simple, rotate the source address.  Note that the
		 * source address array does not change after it's created,
		 * so it is OK to just increment the cur index.
		 */
		if (server->iser_nat_src != NULL) {
			/* It is a hint, does not need to be atomic. */
			*src_ent_idx = (server->iser_nat_src->cur++ %
			    server->iser_nat_src->num_src);
			s->nat_src_idx = *src_ent_idx;
		}
		mutex_exit(&hash->sticky_lock);
		/* New entry already carries the caller's reference. */
		*res = s;
		return (server);
	}

	/*
	 * We don't hold any lock accessing iser_enabled.  Refer to the
	 * comment in ilb_server_add() about iser_lock.
	 */
	if (!s->server->iser_enabled) {
		/*
		 * s->server == server can only happen if there is a race in
		 * toggling the iser_enabled flag (we don't hold a lock doing
		 * that) so that the load balance algorithm still returns a
		 * disabled server.  In this case, just drop the packet...
		 */
		if (s->server == server) {
			mutex_exit(&hash->sticky_lock);
			return (NULL);
		}

		/*
		 * The old server is disabled and there is a new server, use
		 * the new one to create a sticky entry.  Since we will
		 * add the entry at the beginning, subsequent lookup will
		 * find this new entry instead of the old one.
		 */
		goto add_new_entry;
	}

	/* Existing usable entry: take a ref for the caller and reuse it. */
	s->refcnt++;
	*res = s;
	mutex_exit(&hash->sticky_lock);
	if (server->iser_nat_src != NULL)
		*src_ent_idx = s->nat_src_idx;
	return (s->server);
}
1302
1303 static void
ilb_sticky_cleanup(void * arg)1304 ilb_sticky_cleanup(void *arg)
1305 {
1306 ilb_timer_t *timer = (ilb_timer_t *)arg;
1307 uint32_t i;
1308 ilb_stack_t *ilbs;
1309 ilb_sticky_hash_t *hash;
1310 ilb_sticky_t *s, *nxt_s;
1311 int64_t now, expiry;
1312
1313 ilbs = timer->ilbs;
1314 hash = ilbs->ilbs_sticky_hash;
1315 ASSERT(hash != NULL);
1316
1317 now = ddi_get_lbolt64();
1318 for (i = timer->start; i < timer->end; i++) {
1319 mutex_enter(&hash[i].sticky_lock);
1320 for (s = list_head(&hash[i].sticky_head); s != NULL;
1321 s = nxt_s) {
1322 nxt_s = list_next(&hash[i].sticky_head, s);
1323 if (s->refcnt != 0)
1324 continue;
1325 expiry = now - SEC_TO_TICK(s->expiry);
1326 if (s->atime < expiry) {
1327 ILB_SERVER_REFRELE(s->server);
1328 list_remove(&hash[i].sticky_head, s);
1329 kmem_cache_free(ilb_sticky_cache, s);
1330 hash[i].sticky_cnt--;
1331 }
1332 }
1333 mutex_exit(&hash[i].sticky_lock);
1334 }
1335 }
1336
1337 static void
ilb_sticky_timer(void * arg)1338 ilb_sticky_timer(void *arg)
1339 {
1340 ilb_timer_t *timer = (ilb_timer_t *)arg;
1341
1342 (void) taskq_dispatch(timer->ilbs->ilbs_sticky_taskq,
1343 ilb_sticky_cleanup, arg, TQ_SLEEP);
1344 mutex_enter(&timer->tid_lock);
1345 if (timer->tid == 0) {
1346 mutex_exit(&timer->tid_lock);
1347 } else {
1348 timer->tid = timeout(ilb_sticky_timer, arg,
1349 SEC_TO_TICK(ilb_sticky_timeout));
1350 mutex_exit(&timer->tid_lock);
1351 }
1352 }
1353
void
ilb_sticky_hash_init(ilb_stack_t *ilbs)
{
	extern pri_t minclsyspri;
	int i, part;
	char tq_name[TASKQ_NAMELEN];
	ilb_timer_t *tm;

	/*
	 * The hash macro assumes a power of 2 table size; round a
	 * non power of 2 size up to the next power of 2.
	 */
	if (ilbs->ilbs_sticky_hash_size & (ilbs->ilbs_sticky_hash_size - 1)) {
		for (i = 0; i < 31; i++) {
			if (ilbs->ilbs_sticky_hash_size < (1 << i))
				break;
		}
		ilbs->ilbs_sticky_hash_size = 1 << i;
	}

	/* Allocate the buckets and set up each one's lock and list. */
	ilbs->ilbs_sticky_hash = kmem_zalloc(sizeof (ilb_sticky_hash_t) *
	    ilbs->ilbs_sticky_hash_size, KM_SLEEP);
	for (i = 0; i < ilbs->ilbs_sticky_hash_size; i++) {
		mutex_init(&ilbs->ilbs_sticky_hash[i].sticky_lock, NULL,
		    MUTEX_DEFAULT, NULL);
		list_create(&ilbs->ilbs_sticky_hash[i].sticky_head,
		    sizeof (ilb_sticky_t),
		    offsetof(ilb_sticky_t, list));
	}

	/* The kmem cache is global, shared by all IP stack instances. */
	if (ilb_sticky_cache == NULL)
		ilb_sticky_cache_init();

	/* Per-stack taskq on which the GC timers dispatch their work. */
	(void) snprintf(tq_name, sizeof (tq_name), "ilb_sticky_taskq_%p",
	    (void *)ilbs->ilbs_netstack);
	ASSERT(ilbs->ilbs_sticky_taskq == NULL);
	ilbs->ilbs_sticky_taskq = taskq_create(tq_name,
	    ilb_sticky_timer_size * 2, minclsyspri, ilb_sticky_timer_size,
	    ilb_sticky_timer_size * 2, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);

	/*
	 * Partition the buckets among ilb_sticky_timer_size timers, each
	 * covering a contiguous range [start, end) of the table; the last
	 * timer's end is clamped to the table size.
	 */
	ASSERT(ilbs->ilbs_sticky_timer_list == NULL);
	ilbs->ilbs_sticky_timer_list = kmem_zalloc(sizeof (ilb_timer_t) *
	    ilb_sticky_timer_size, KM_SLEEP);
	part = ilbs->ilbs_sticky_hash_size / ilb_sticky_timer_size + 1;
	for (i = 0; i < ilb_sticky_timer_size; i++) {
		tm = ilbs->ilbs_sticky_timer_list + i;
		tm->start = i * part;
		tm->end = i * part + part;
		if (tm->end > ilbs->ilbs_sticky_hash_size)
			tm->end = ilbs->ilbs_sticky_hash_size;
		tm->ilbs = ilbs;
		mutex_init(&tm->tid_lock, NULL, MUTEX_DEFAULT, NULL);
		/* Spread out the starting execution time of all the timers. */
		tm->tid = timeout(ilb_sticky_timer, tm,
		    SEC_TO_TICK(ilb_sticky_timeout + i));
	}
}
1407
1408 void
ilb_sticky_hash_fini(ilb_stack_t * ilbs)1409 ilb_sticky_hash_fini(ilb_stack_t *ilbs)
1410 {
1411 int i;
1412 ilb_sticky_t *s;
1413
1414 if (ilbs->ilbs_sticky_hash == NULL)
1415 return;
1416
1417 /* Stop all the timers first. */
1418 for (i = 0; i < ilb_sticky_timer_size; i++) {
1419 timeout_id_t tid;
1420
1421 /* Setting tid to 0 tells the timer handler not to restart. */
1422 mutex_enter(&ilbs->ilbs_sticky_timer_list[i].tid_lock);
1423 tid = ilbs->ilbs_sticky_timer_list[i].tid;
1424 ilbs->ilbs_sticky_timer_list[i].tid = 0;
1425 mutex_exit(&ilbs->ilbs_sticky_timer_list[i].tid_lock);
1426 (void) untimeout(tid);
1427 }
1428 kmem_free(ilbs->ilbs_sticky_timer_list, sizeof (ilb_timer_t) *
1429 ilb_sticky_timer_size);
1430 taskq_destroy(ilbs->ilbs_sticky_taskq);
1431 ilbs->ilbs_sticky_taskq = NULL;
1432
1433 for (i = 0; i < ilbs->ilbs_sticky_hash_size; i++) {
1434 while ((s = list_head(&ilbs->ilbs_sticky_hash[i].sticky_head))
1435 != NULL) {
1436 list_remove(&ilbs->ilbs_sticky_hash[i].sticky_head, s);
1437 ILB_SERVER_REFRELE(s->server);
1438 kmem_free(s, sizeof (ilb_sticky_t));
1439 }
1440 }
1441 kmem_free(ilbs->ilbs_sticky_hash, ilbs->ilbs_sticky_hash_size *
1442 sizeof (ilb_sticky_hash_t));
1443 }
1444
1445 /*
1446 * This routine sends up the sticky hash table to user land. Refer to
1447 * the comments before ilb_list_nat(). Both routines assume similar
1448 * conditions.
1449 *
1450 * It is assumed that the caller has checked the size of st so that it
1451 * can hold num entries.
1452 */
/* ARGSUSED */
int
ilb_list_sticky(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_sticky_entry_t *st,
    uint32_t *num, uint32_t *flags)
{
	ilb_sticky_hash_t *hash;
	ilb_sticky_t *curp;
	uint32_t i, j;
	int ret = 0;

	/* Serialize listers; only one walk of the sticky table at a time. */
	mutex_enter(&ilbs->ilbs_sticky_list_lock);
	while (ilbs->ilbs_sticky_list_busy) {
		/* cv_wait_sig() returns 0 when interrupted by a signal. */
		if (cv_wait_sig(&ilbs->ilbs_sticky_list_cv,
		    &ilbs->ilbs_sticky_list_lock) == 0) {
			mutex_exit(&ilbs->ilbs_sticky_list_lock);
			return (EINTR);
		}
	}
	/* Sticky hash never initialized for this stack: empty result. */
	if ((hash = ilbs->ilbs_sticky_hash) == NULL) {
		mutex_exit(&ilbs->ilbs_sticky_list_lock);
		*num = 0;
		*flags |= ILB_LIST_END;
		return (0);
	}
	ilbs->ilbs_sticky_list_busy = B_TRUE;
	mutex_exit(&ilbs->ilbs_sticky_list_lock);

	if (*flags & ILB_LIST_BEGIN) {
		/* Fresh walk from bucket 0. */
		i = 0;
		mutex_enter(&hash[0].sticky_lock);
		curp = list_head(&hash[0].sticky_head);
	} else if (*flags & ILB_LIST_CONT) {
		/* Resume from where the previous ioctl left off. */
		if (ilbs->ilbs_sticky_list_cur == ilbs->ilbs_sticky_hash_size) {
			*num = 0;
			*flags |= ILB_LIST_END;
			goto done;
		}
		i = ilbs->ilbs_sticky_list_cur;
		mutex_enter(&hash[i].sticky_lock);
		curp = ilbs->ilbs_sticky_list_curp;
	} else {
		/* Caller must pass either BEGIN or CONT. */
		ret = EINVAL;
		goto done;
	}

	/*
	 * Copy up to *num entries, handing the bucket lock off as we cross
	 * bucket boundaries; exactly one bucket lock is held inside the loop.
	 */
	j = 0;
	while (j < *num) {
		if (curp == NULL) {
			mutex_exit(&hash[i].sticky_lock);
			if (++i == ilbs->ilbs_sticky_hash_size) {
				*flags |= ILB_LIST_END;
				break;
			}
			mutex_enter(&hash[i].sticky_lock);
			curp = list_head(&hash[i].sticky_head);
			continue;
		}
		(void) strcpy(st[j].rule_name, curp->rule_name);
		st[j].req_addr = curp->src;
		st[j].srv_addr = curp->server->iser_addr_v6;
		/*
		 * NOTE(review): curp->expiry is stored in seconds
		 * (ilb_sticky_cleanup() applies SEC_TO_TICK() to it), so
		 * TICK_TO_MSEC() here looks like a unit mismatch — confirm
		 * what consumers of expiry_time expect before changing.
		 */
		st[j].expiry_time = TICK_TO_MSEC(curp->expiry);
		j++;
		curp = list_next(&hash[i].sticky_head, curp);
	}
	/* Remember the resume point for the next ILB_LIST_CONT ioctl. */
	ilbs->ilbs_sticky_list_curp = curp;
	/* Buffer-full exit still holds bucket i's lock; drop it here. */
	if (j == *num)
		mutex_exit(&hash[i].sticky_lock);

	ilbs->ilbs_sticky_list_cur = i;

	*num = j;
done:
	/* Clear the busy flag and wake any waiting lister. */
	mutex_enter(&ilbs->ilbs_sticky_list_lock);
	ilbs->ilbs_sticky_list_busy = B_FALSE;
	cv_signal(&ilbs->ilbs_sticky_list_cv);
	mutex_exit(&ilbs->ilbs_sticky_list_lock);

	return (ret);
}
1532