/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/kmem.h>
#include <sys/ksynch.h>
#include <sys/systm.h>
#include <sys/socket.h>
#include <sys/disp.h>
#include <sys/taskq.h>
#include <sys/cmn_err.h>
#include <sys/strsun.h>
#include <sys/sdt.h>
#include <sys/atomic.h>
#include <netinet/in.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/tcp.h>
#include <inet/udp_impl.h>
#include <inet/kstatcom.h>

#include <inet/ilb_ip.h>
#include "ilb_alg.h"
#include "ilb_nat.h"
#include "ilb_conn.h"

/* ILB kmem cache flag */
int ilb_kmem_flags = 0;

/*
 * The default size for the different hash tables. Global for all stacks.
 * But each stack has its own table, just that their sizes are the same.
 */
static size_t ilb_rule_hash_size = 2048;

static size_t ilb_conn_hash_size = 262144;

static size_t ilb_sticky_hash_size = 262144;

/* This should be a prime number. */
static size_t ilb_nat_src_hash_size = 97;

/* Default NAT cache entry expiry time. */
static uint32_t ilb_conn_tcp_expiry = 120;
static uint32_t ilb_conn_udp_expiry = 60;

/* Default sticky entry expiry time. */
static uint32_t ilb_sticky_expiry = 60;

/* addr is assumed to be a uint8_t * to an ipaddr_t. */
#define	ILB_RULE_HASH(addr, hash_size) \
	((*((addr) + 3) * 29791 + *((addr) + 2) * 961 + *((addr) + 1) * 31 + \
	*(addr)) & ((hash_size) - 1))
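
/*
 * For example, for a v4 address a.b.c.d stored in network byte order,
 * ILB_RULE_HASH() computes (d * 31^3 + c * 31^2 + b * 31 + a) masked
 * by (hash_size - 1), i.e. a base-31 polynomial over the four bytes
 * (29791 == 31^3, 961 == 31^2), so hash_size must be a power of 2.
 */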

/*
 * Note on ILB delayed processing
 *
 * To avoid inline removal of some of the data structures, such as rules,
 * servers and ilb_conn_hash entries, ILB delays such processing to a taskq.
 * There are three types of ILB taskq:
 *
 * 1. rule handling: created at stack initialization time, ilb_stack_init()
 * 2. conn hash handling: created at conn hash initialization time,
 *    ilb_conn_hash_init()
 * 3. sticky hash handling: created at sticky hash initialization time,
 *    ilb_sticky_hash_init()
 *
 * The rule taskq is for processing rule and server removal. When a user
 * land rule/server removal request comes in, a task is dispatched after
 * removing the rule/server from all related hashes. This task will wait
 * until all references to the rule/server are gone before removing it,
 * so the user land thread requesting the removal does not need to wait
 * for the removal to complete.
 *
 * The conn hash/sticky hash taskq is for processing ilb_conn_hash and
 * ilb_sticky_hash table entry removal. There are ilb_conn_timer_size timers
 * and ilb_sticky_timer_size timers running for ilb_conn_hash and
 * ilb_sticky_hash cleanup respectively. Each timer is responsible for one
 * portion (all of the same size) of the hash table. When a timer fires, it
 * dispatches a conn hash task to clean up its portion of the table. This
 * avoids inline processing of the removal.
 *
 * There is one other piece of delayed processing: the cleanup of the NAT
 * source address table. We just use the timer to handle it directly
 * instead of using a taskq, since that table is small.
 */
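
/*
 * Illustrative sketch (hypothetical names, not part of this file) of the
 * timer/taskq hand-off described above. A cleanup timer handler runs in
 * interrupt context, so it only re-dispatches the real work to a taskq
 * and must not sleep:
 *
 *	static void
 *	example_conn_timer(void *arg)
 *	{
 *		(void) taskq_dispatch(example_taskq, example_cleanup_fn,
 *		    arg, TQ_NOSLEEP);
 *	}
 */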

/* ILB rule taskq constants. */
#define	ILB_RULE_TASKQ_NUM_THR	20

/* Argument passed to ILB rule taskq routines. */
typedef struct {
	ilb_stack_t	*ilbs;
	ilb_rule_t	*rule;
} ilb_rule_tq_t;

/* kstat handling routines. */
static kstat_t *ilb_kstat_g_init(netstackid_t, ilb_stack_t *);
static void ilb_kstat_g_fini(netstackid_t, ilb_stack_t *);
static kstat_t *ilb_rule_kstat_init(netstackid_t, ilb_rule_t *);
static kstat_t *ilb_server_kstat_init(netstackid_t, ilb_rule_t *,
    ilb_server_t *);

/* Rule hash handling routines. */
static void ilb_rule_hash_init(ilb_stack_t *);
static void ilb_rule_hash_fini(ilb_stack_t *);
static void ilb_rule_hash_add(ilb_stack_t *, ilb_rule_t *, const in6_addr_t *);
static void ilb_rule_hash_del(ilb_rule_t *);
static ilb_rule_t *ilb_rule_hash(ilb_stack_t *, int, int, in6_addr_t *,
    in_port_t, zoneid_t, uint32_t, boolean_t *);

static void ilb_rule_g_add(ilb_stack_t *, ilb_rule_t *);
static void ilb_rule_g_del(ilb_stack_t *, ilb_rule_t *);
static void ilb_rule_del_common(ilb_stack_t *, ilb_rule_t *);
static ilb_rule_t *ilb_find_rule_locked(ilb_stack_t *, zoneid_t, const char *,
    int *);
static boolean_t ilb_match_rule(ilb_stack_t *, zoneid_t, const char *, int,
    int, in_port_t, in_port_t, const in6_addr_t *);

/* Back end server handling routines. */
static void ilb_server_free(ilb_server_t *);

/* Network stack handling routines. */
static void *ilb_stack_init(netstackid_t, netstack_t *);
static void ilb_stack_shutdown(netstackid_t, void *);
static void ilb_stack_fini(netstackid_t, void *);

/* Sticky connection handling routines. */
static void ilb_rule_sticky_init(ilb_rule_t *);
static void ilb_rule_sticky_fini(ilb_rule_t *);

/* Handy macro to check for unspecified address. */
#define	IS_ADDR_UNSPEC(addr) \
	(IN6_IS_ADDR_V4MAPPED(addr) ? IN6_IS_ADDR_V4MAPPED_ANY(addr) : \
	IN6_IS_ADDR_UNSPECIFIED(addr))
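
/*
 * For example, IS_ADDR_UNSPEC() is true both for the native IPv6
 * unspecified address (::) and for the v4-mapped form of INADDR_ANY
 * (::ffff:0.0.0.0), so it works for rules of either address family.
 */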

/*
 * Global kstat instance counter. When a rule is created, its kstat instance
 * number is assigned by ilb_kstat_instance and ilb_kstat_instance is
 * incremented.
 */
static uint_t ilb_kstat_instance = 0;

/*
 * The ILB global kstat has name ILB_G_KS_NAME and class name ILB_G_KS_CNAME.
 * A rule's kstat has ILB_RULE_KS_CNAME class name.
 */
#define	ILB_G_KS_NAME		"global"
#define	ILB_G_KS_CNAME		"kstat"
#define	ILB_RULE_KS_CNAME	"rulestat"

static kstat_t *
ilb_kstat_g_init(netstackid_t stackid, ilb_stack_t *ilbs)
{
	kstat_t *ksp;
	ilb_g_kstat_t template = {
		{ "num_rules",		KSTAT_DATA_UINT64, 0 },
		{ "ip_frag_in",		KSTAT_DATA_UINT64, 0 },
		{ "ip_frag_dropped",	KSTAT_DATA_UINT64, 0 }
	};

	ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, 0, ILB_G_KS_NAME,
	    ILB_G_KS_CNAME, KSTAT_TYPE_NAMED, NUM_OF_FIELDS(ilb_g_kstat_t),
	    KSTAT_FLAG_VIRTUAL, stackid);
	if (ksp == NULL)
		return (NULL);
	bcopy(&template, ilbs->ilbs_kstat, sizeof (template));
	ksp->ks_data = ilbs->ilbs_kstat;
	ksp->ks_private = (void *)(uintptr_t)stackid;

	kstat_install(ksp);
	return (ksp);
}

static void
ilb_kstat_g_fini(netstackid_t stackid, ilb_stack_t *ilbs)
{
	if (ilbs->ilbs_ksp != NULL) {
		ASSERT(stackid == (netstackid_t)(uintptr_t)
		    ilbs->ilbs_ksp->ks_private);
		kstat_delete_netstack(ilbs->ilbs_ksp, stackid);
		ilbs->ilbs_ksp = NULL;
	}
}

static kstat_t *
ilb_rule_kstat_init(netstackid_t stackid, ilb_rule_t *rule)
{
	kstat_t *ksp;
	ilb_rule_kstat_t template = {
		{ "num_servers",		KSTAT_DATA_UINT64, 0 },
		{ "bytes_not_processed",	KSTAT_DATA_UINT64, 0 },
		{ "pkt_not_processed",		KSTAT_DATA_UINT64, 0 },
		{ "bytes_dropped",		KSTAT_DATA_UINT64, 0 },
		{ "pkt_dropped",		KSTAT_DATA_UINT64, 0 },
		{ "nomem_bytes_dropped",	KSTAT_DATA_UINT64, 0 },
		{ "nomem_pkt_dropped",		KSTAT_DATA_UINT64, 0 },
		{ "noport_bytes_dropped",	KSTAT_DATA_UINT64, 0 },
		{ "noport_pkt_dropped",		KSTAT_DATA_UINT64, 0 },
		{ "icmp_echo_processed",	KSTAT_DATA_UINT64, 0 },
		{ "icmp_dropped",		KSTAT_DATA_UINT64, 0 },
		{ "icmp_too_big_processed",	KSTAT_DATA_UINT64, 0 },
		{ "icmp_too_big_dropped",	KSTAT_DATA_UINT64, 0 }
	};

	ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, rule->ir_ks_instance,
	    rule->ir_name, ILB_RULE_KS_CNAME, KSTAT_TYPE_NAMED,
	    NUM_OF_FIELDS(ilb_rule_kstat_t), KSTAT_FLAG_VIRTUAL, stackid);
	if (ksp == NULL)
		return (NULL);

	bcopy(&template, &rule->ir_kstat, sizeof (template));
	ksp->ks_data = &rule->ir_kstat;
	ksp->ks_private = (void *)(uintptr_t)stackid;

	kstat_install(ksp);
	return (ksp);
}

static kstat_t *
ilb_server_kstat_init(netstackid_t stackid, ilb_rule_t *rule,
    ilb_server_t *server)
{
	kstat_t *ksp;
	ilb_server_kstat_t template = {
		{ "bytes_processed",	KSTAT_DATA_UINT64, 0 },
		{ "pkt_processed",	KSTAT_DATA_UINT64, 0 },
		{ "ip_address",		KSTAT_DATA_STRING, 0 }
	};
	char cname_buf[KSTAT_STRLEN];

	/* 7 is strlen("-sstat") plus the terminating NUL. */
	ASSERT(strlen(rule->ir_name) + 7 < KSTAT_STRLEN);
	(void) sprintf(cname_buf, "%s-sstat", rule->ir_name);
	ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, rule->ir_ks_instance,
	    server->iser_name, cname_buf, KSTAT_TYPE_NAMED,
	    NUM_OF_FIELDS(ilb_server_kstat_t), KSTAT_FLAG_VIRTUAL, stackid);
	if (ksp == NULL)
		return (NULL);

	bcopy(&template, &server->iser_kstat, sizeof (template));
	ksp->ks_data = &server->iser_kstat;
	ksp->ks_private = (void *)(uintptr_t)stackid;

	kstat_named_setstr(&server->iser_kstat.ip_address,
	    server->iser_ip_addr);
	/* We never change the IP address */
	ksp->ks_data_size += strlen(server->iser_ip_addr) + 1;

	kstat_install(ksp);
	return (ksp);
}

/* Initialize the rule hash table. */
static void
ilb_rule_hash_init(ilb_stack_t *ilbs)
{
	int i;

	/*
	 * If ilbs->ilbs_rule_hash_size is not a power of 2, bump it up to
	 * the next power of 2.
	 */
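	/*
	 * For example, a tuning of 1000 would be bumped up to 1024
	 * (1 << 10), while the default of 2048 is already a power of 2
	 * and is left unchanged.
	 */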
	if (ilbs->ilbs_rule_hash_size & (ilbs->ilbs_rule_hash_size - 1)) {
		for (i = 0; i < 31; i++) {
			if (ilbs->ilbs_rule_hash_size < (1 << i))
				break;
		}
		ilbs->ilbs_rule_hash_size = 1 << i;
	}
	ilbs->ilbs_g_hash = kmem_zalloc(sizeof (ilb_hash_t) *
	    ilbs->ilbs_rule_hash_size, KM_SLEEP);
	for (i = 0; i < ilbs->ilbs_rule_hash_size; i++) {
		mutex_init(&ilbs->ilbs_g_hash[i].ilb_hash_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}
}

/* Clean up the rule hash table. */
static void
ilb_rule_hash_fini(ilb_stack_t *ilbs)
{
	if (ilbs->ilbs_g_hash == NULL)
		return;
	kmem_free(ilbs->ilbs_g_hash, sizeof (ilb_hash_t) *
	    ilbs->ilbs_rule_hash_size);
}

/* Add a rule to the rule hash table. */
static void
ilb_rule_hash_add(ilb_stack_t *ilbs, ilb_rule_t *rule, const in6_addr_t *addr)
{
	int i;

	i = ILB_RULE_HASH((uint8_t *)&addr->s6_addr32[3],
	    ilbs->ilbs_rule_hash_size);
	DTRACE_PROBE2(ilb__rule__hash__add, ilb_rule_t *, rule, int, i);
	mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
	rule->ir_hash_next = ilbs->ilbs_g_hash[i].ilb_hash_rule;
	if (ilbs->ilbs_g_hash[i].ilb_hash_rule != NULL)
		ilbs->ilbs_g_hash[i].ilb_hash_rule->ir_hash_prev = rule;
	rule->ir_hash_prev = NULL;
	ilbs->ilbs_g_hash[i].ilb_hash_rule = rule;

	rule->ir_hash = &ilbs->ilbs_g_hash[i];
	mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
}

/*
 * Remove a rule from the rule hash table. Note that the rule is not freed
 * in this routine.
 */
static void
ilb_rule_hash_del(ilb_rule_t *rule)
{
	mutex_enter(&rule->ir_hash->ilb_hash_lock);
	if (rule->ir_hash->ilb_hash_rule == rule) {
		rule->ir_hash->ilb_hash_rule = rule->ir_hash_next;
		if (rule->ir_hash_next != NULL)
			rule->ir_hash_next->ir_hash_prev = NULL;
	} else {
		if (rule->ir_hash_prev != NULL)
			rule->ir_hash_prev->ir_hash_next =
			    rule->ir_hash_next;
		if (rule->ir_hash_next != NULL) {
			rule->ir_hash_next->ir_hash_prev =
			    rule->ir_hash_prev;
		}
	}
	mutex_exit(&rule->ir_hash->ilb_hash_lock);

	rule->ir_hash_next = NULL;
	rule->ir_hash_prev = NULL;
	rule->ir_hash = NULL;
}

/*
 * Given the info of a packet, look for a match in the rule hash table.
 */
static ilb_rule_t *
ilb_rule_hash(ilb_stack_t *ilbs, int l3, int l4, in6_addr_t *addr,
    in_port_t port, zoneid_t zoneid, uint32_t len, boolean_t *busy)
{
	int i;
	ilb_rule_t *rule;
	ipaddr_t v4_addr;

	*busy = B_FALSE;
	IN6_V4MAPPED_TO_IPADDR(addr, v4_addr);
	i = ILB_RULE_HASH((uint8_t *)&v4_addr, ilbs->ilbs_rule_hash_size);
	port = ntohs(port);

	mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
	for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
	    rule = rule->ir_hash_next) {
		if (!rule->ir_port_range) {
			if (rule->ir_min_port != port)
				continue;
		} else {
			if (port < rule->ir_min_port ||
			    port > rule->ir_max_port) {
				continue;
			}
		}
		if (rule->ir_ipver != l3 || rule->ir_proto != l4 ||
		    rule->ir_zoneid != zoneid) {
			continue;
		}

		if (l3 == IPPROTO_IP) {
			if (rule->ir_target_v4 != INADDR_ANY &&
			    rule->ir_target_v4 != v4_addr) {
				continue;
			}
		} else {
			if (!IN6_IS_ADDR_UNSPECIFIED(&rule->ir_target_v6) &&
			    !IN6_ARE_ADDR_EQUAL(addr, &rule->ir_target_v6)) {
				continue;
			}
		}

		/*
		 * Just update the stats if the rule is disabled.
		 */
		mutex_enter(&rule->ir_lock);
		if (!(rule->ir_flags & ILB_RULE_ENABLED)) {
			ILB_R_KSTAT(rule, pkt_not_processed);
			ILB_R_KSTAT_UPDATE(rule, bytes_not_processed, len);
			mutex_exit(&rule->ir_lock);
			rule = NULL;
			break;
		} else if (rule->ir_flags & ILB_RULE_BUSY) {
			/*
			 * If we are busy...
			 *
			 * XXX we should have a queue to postpone the
			 * packet processing. But this requires a
			 * mechanism in IP to re-start the packet
			 * processing. So for now, just drop the packet.
			 */
			ILB_R_KSTAT(rule, pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, bytes_dropped, len);
			mutex_exit(&rule->ir_lock);
			*busy = B_TRUE;
			rule = NULL;
			break;
		} else {
			rule->ir_refcnt++;
			ASSERT(rule->ir_refcnt != 1);
			mutex_exit(&rule->ir_lock);
			break;
		}
	}
	mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
	return (rule);
}

/*
 * Add a rule to the global rule list. This list is for finding all rules
 * in an IP stack. The caller is assumed to hold the ilbs_g_lock.
 */
static void
ilb_rule_g_add(ilb_stack_t *ilbs, ilb_rule_t *rule)
{
	ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
	rule->ir_next = ilbs->ilbs_rule_head;
	ilbs->ilbs_rule_head = rule;
	ILB_KSTAT_UPDATE(ilbs, num_rules, 1);
}

/* The caller is assumed to hold the ilbs_g_lock. */
static void
ilb_rule_g_del(ilb_stack_t *ilbs, ilb_rule_t *rule)
{
	ilb_rule_t *tmp_rule;
	ilb_rule_t *prev_rule;

	ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
	prev_rule = NULL;
	for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
	    prev_rule = tmp_rule, tmp_rule = tmp_rule->ir_next) {
		if (tmp_rule == rule)
			break;
	}
	if (tmp_rule == NULL) {
		mutex_exit(&ilbs->ilbs_g_lock);
		return;
	}
	if (prev_rule == NULL)
		ilbs->ilbs_rule_head = tmp_rule->ir_next;
	else
		prev_rule->ir_next = tmp_rule->ir_next;
	ILB_KSTAT_UPDATE(ilbs, num_rules, -1);
}

/*
 * Helper routine to calculate how many source addresses are in a given
 * range.
 */
static int64_t
num_nat_src_v6(const in6_addr_t *a1, const in6_addr_t *a2)
{
	int64_t ret;
	uint32_t addr1, addr2;

	/*
	 * Here we assume that the number of NAT source addresses in a
	 * range is never so large that the two most significant
	 * s6_addr32 words could differ; they must be equal.
	 */
	addr1 = ntohl(a1->s6_addr32[3]);
	addr2 = ntohl(a2->s6_addr32[3]);
	if (a1->s6_addr32[0] != a2->s6_addr32[0] ||
	    a1->s6_addr32[1] != a2->s6_addr32[1] ||
	    a1->s6_addr32[2] > a2->s6_addr32[2] ||
	    (a1->s6_addr32[2] == a2->s6_addr32[2] && addr1 > addr2)) {
		return (-1);
	}
	if (a1->s6_addr32[2] == a2->s6_addr32[2]) {
		return (addr2 - addr1 + 1);
	} else {
		ret = (ntohl(a2->s6_addr32[2]) - ntohl(a1->s6_addr32[2]));
		ret <<= 32;
		ret = ret + addr2 - addr1;
		return (ret + 1);
	}
}
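
/*
 * For example, if the last two s6_addr32 words of the range are
 * 0:fffffffe (start) and 1:1 (end), the range covers 0:fffffffe,
 * 0:ffffffff, 1:0 and 1:1, so num_nat_src_v6() returns
 * (1 << 32) + 1 - 0xfffffffe + 1 = 4.
 */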

/*
 * Add an ILB rule.
 */
int
ilb_rule_add(ilb_stack_t *ilbs, zoneid_t zoneid, const ilb_rule_cmd_t *cmd)
{
	ilb_rule_t *rule;
	netstackid_t stackid;
	int ret;
	in_port_t min_port, max_port;
	int64_t num_src;

	/* Sanity checks. */
	if (cmd->ip_ver != IPPROTO_IP && cmd->ip_ver != IPPROTO_IPV6)
		return (EINVAL);

	/* Need to support SCTP... */
	if (cmd->proto != IPPROTO_TCP && cmd->proto != IPPROTO_UDP)
		return (EINVAL);

	/* For full NAT, the NAT source must be supplied. */
	if (cmd->topo == ILB_TOPO_IMPL_NAT) {
		if (IS_ADDR_UNSPEC(&cmd->nat_src_start) ||
		    IS_ADDR_UNSPEC(&cmd->nat_src_end)) {
			return (EINVAL);
		}
	}

	/* Check invalid mask */
	if ((cmd->flags & ILB_RULE_STICKY) &&
	    IS_ADDR_UNSPEC(&cmd->sticky_mask)) {
		return (EINVAL);
	}

	/* Port is passed in network byte order. */
	min_port = ntohs(cmd->min_port);
	max_port = ntohs(cmd->max_port);
	if (min_port > max_port)
		return (EINVAL);

	/* min_port == 0 means "all ports". Make it so */
	if (min_port == 0) {
		min_port = 1;
		max_port = 65535;
	}

	/* Funny address checking. */
	if (cmd->ip_ver == IPPROTO_IP) {
		in_addr_t v4_addr1, v4_addr2;

		v4_addr1 = cmd->vip.s6_addr32[3];
		if ((*(uchar_t *)&v4_addr1) == IN_LOOPBACKNET ||
		    CLASSD(v4_addr1) || v4_addr1 == INADDR_BROADCAST ||
		    v4_addr1 == INADDR_ANY ||
		    !IN6_IS_ADDR_V4MAPPED(&cmd->vip)) {
			return (EINVAL);
		}

		if (cmd->topo == ILB_TOPO_IMPL_NAT) {
			v4_addr1 = ntohl(cmd->nat_src_start.s6_addr32[3]);
			v4_addr2 = ntohl(cmd->nat_src_end.s6_addr32[3]);
			if ((*(uchar_t *)&v4_addr1) == IN_LOOPBACKNET ||
			    (*(uchar_t *)&v4_addr2) == IN_LOOPBACKNET ||
			    v4_addr1 == INADDR_BROADCAST ||
			    v4_addr2 == INADDR_BROADCAST ||
			    v4_addr1 == INADDR_ANY || v4_addr2 == INADDR_ANY ||
			    CLASSD(v4_addr1) || CLASSD(v4_addr2) ||
			    !IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_start) ||
			    !IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_end)) {
				return (EINVAL);
			}

			num_src = v4_addr2 - v4_addr1 + 1;
			if (v4_addr1 > v4_addr2 || num_src > ILB_MAX_NAT_SRC)
				return (EINVAL);
		}
	} else {
		if (IN6_IS_ADDR_LOOPBACK(&cmd->vip) ||
		    IN6_IS_ADDR_MULTICAST(&cmd->vip) ||
		    IN6_IS_ADDR_UNSPECIFIED(&cmd->vip) ||
		    IN6_IS_ADDR_V4MAPPED(&cmd->vip)) {
			return (EINVAL);
		}

		if (cmd->topo == ILB_TOPO_IMPL_NAT) {
			if (IN6_IS_ADDR_LOOPBACK(&cmd->nat_src_start) ||
			    IN6_IS_ADDR_LOOPBACK(&cmd->nat_src_end) ||
			    IN6_IS_ADDR_MULTICAST(&cmd->nat_src_start) ||
			    IN6_IS_ADDR_MULTICAST(&cmd->nat_src_end) ||
			    IN6_IS_ADDR_UNSPECIFIED(&cmd->nat_src_start) ||
			    IN6_IS_ADDR_UNSPECIFIED(&cmd->nat_src_end) ||
			    IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_start) ||
			    IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_end)) {
				return (EINVAL);
			}

			if ((num_src = num_nat_src_v6(&cmd->nat_src_start,
			    &cmd->nat_src_end)) < 0 ||
			    num_src > ILB_MAX_NAT_SRC) {
				return (EINVAL);
			}
		}
	}

	mutex_enter(&ilbs->ilbs_g_lock);
	if (ilbs->ilbs_g_hash == NULL)
		ilb_rule_hash_init(ilbs);
	if (ilbs->ilbs_c2s_conn_hash == NULL) {
		ASSERT(ilbs->ilbs_s2c_conn_hash == NULL);
		ilb_conn_hash_init(ilbs);
		ilb_nat_src_init(ilbs);
	}

	/* Make sure that the new rule does not duplicate an existing one. */
	if (ilb_match_rule(ilbs, zoneid, cmd->name, cmd->ip_ver, cmd->proto,
	    min_port, max_port, &cmd->vip)) {
		mutex_exit(&ilbs->ilbs_g_lock);
		return (EEXIST);
	}

	rule = kmem_zalloc(sizeof (ilb_rule_t), KM_NOSLEEP);
	if (rule == NULL) {
		mutex_exit(&ilbs->ilbs_g_lock);
		return (ENOMEM);
	}

	/* ir_name is all 0 to begin with */
	(void) memcpy(rule->ir_name, cmd->name, ILB_RULE_NAMESZ - 1);

	rule->ir_ks_instance = atomic_add_int_nv(&ilb_kstat_instance, 1);
	stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;
	if ((rule->ir_ksp = ilb_rule_kstat_init(stackid, rule)) == NULL) {
		ret = ENOMEM;
		goto error;
	}

	if (cmd->topo == ILB_TOPO_IMPL_NAT) {
		rule->ir_nat_src_start = cmd->nat_src_start;
		rule->ir_nat_src_end = cmd->nat_src_end;
	}

	rule->ir_ipver = cmd->ip_ver;
	rule->ir_proto = cmd->proto;
	rule->ir_topo = cmd->topo;

	rule->ir_min_port = min_port;
	rule->ir_max_port = max_port;
	if (rule->ir_min_port != rule->ir_max_port)
		rule->ir_port_range = B_TRUE;
	else
		rule->ir_port_range = B_FALSE;

	rule->ir_zoneid = zoneid;

	rule->ir_target_v6 = cmd->vip;
	rule->ir_servers = NULL;

	/*
	 * The default connection drain timeout is indefinite (value 0),
	 * meaning we will wait for all connections to finish. So we
	 * can assign cmd->conn_drain_timeout to it directly.
	 */
	rule->ir_conn_drain_timeout = cmd->conn_drain_timeout;
	if (cmd->nat_expiry != 0) {
		rule->ir_nat_expiry = cmd->nat_expiry;
	} else {
		switch (rule->ir_proto) {
		case IPPROTO_TCP:
			rule->ir_nat_expiry = ilb_conn_tcp_expiry;
			break;
		case IPPROTO_UDP:
			rule->ir_nat_expiry = ilb_conn_udp_expiry;
			break;
		default:
			cmn_err(CE_PANIC, "data corruption: wrong ir_proto: %p",
			    (void *)rule);
			break;
		}
	}
	if (cmd->sticky_expiry != 0)
		rule->ir_sticky_expiry = cmd->sticky_expiry;
	else
		rule->ir_sticky_expiry = ilb_sticky_expiry;

	if (cmd->flags & ILB_RULE_STICKY) {
		rule->ir_flags |= ILB_RULE_STICKY;
		rule->ir_sticky_mask = cmd->sticky_mask;
		if (ilbs->ilbs_sticky_hash == NULL)
			ilb_sticky_hash_init(ilbs);
	}
	if (cmd->flags & ILB_RULE_ENABLED)
		rule->ir_flags |= ILB_RULE_ENABLED;

	mutex_init(&rule->ir_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&rule->ir_cv, NULL, CV_DEFAULT, NULL);

	rule->ir_refcnt = 1;

	switch (cmd->algo) {
	case ILB_ALG_IMPL_ROUNDROBIN:
		if ((rule->ir_alg = ilb_alg_rr_init(rule, NULL)) == NULL) {
			ret = ENOMEM;
			goto error;
		}
		rule->ir_alg_type = ILB_ALG_IMPL_ROUNDROBIN;
		break;
	case ILB_ALG_IMPL_HASH_IP:
	case ILB_ALG_IMPL_HASH_IP_SPORT:
	case ILB_ALG_IMPL_HASH_IP_VIP:
		if ((rule->ir_alg = ilb_alg_hash_init(rule,
		    &cmd->algo)) == NULL) {
			ret = ENOMEM;
			goto error;
		}
		rule->ir_alg_type = cmd->algo;
		break;
	default:
		ret = EINVAL;
		goto error;
	}

	/* Add it to the global list and hash array at the end. */
	ilb_rule_g_add(ilbs, rule);
	ilb_rule_hash_add(ilbs, rule, &cmd->vip);

	mutex_exit(&ilbs->ilbs_g_lock);

	return (0);

error:
	mutex_exit(&ilbs->ilbs_g_lock);
	if (rule->ir_ksp != NULL) {
		/* stackid must be initialized if ir_ksp != NULL */
		kstat_delete_netstack(rule->ir_ksp, stackid);
	}
	kmem_free(rule, sizeof (ilb_rule_t));
	return (ret);
}

/*
 * The final part in deleting a rule. Either called directly or from
 * the dispatched taskq routine.
 */
static void
ilb_rule_del_common(ilb_stack_t *ilbs, ilb_rule_t *tmp_rule)
{
	netstackid_t stackid;
	ilb_server_t *server;

	stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;

	/*
	 * Let the algorithm know that the rule is going away. The
	 * algorithm fini routine will free all its resources with this
	 * rule.
	 */
	tmp_rule->ir_alg->ilb_alg_fini(&tmp_rule->ir_alg);

	while ((server = tmp_rule->ir_servers) != NULL) {
		mutex_enter(&server->iser_lock);
		ilb_destroy_nat_src(&server->iser_nat_src);
		if (tmp_rule->ir_conn_drain_timeout != 0) {
			/*
			 * The garbage collection thread checks this value
			 * without grabbing a lock. So we need to use
			 * atomic_swap_64() to make sure that the value seen
			 * by gc thread is intact.
			 */
			(void) atomic_swap_64(
			    (uint64_t *)&server->iser_die_time,
			    ddi_get_lbolt64() +
			    SEC_TO_TICK(tmp_rule->ir_conn_drain_timeout));
		}
		while (server->iser_refcnt > 1)
			cv_wait(&server->iser_cv, &server->iser_lock);
		tmp_rule->ir_servers = server->iser_next;
		kstat_delete_netstack(server->iser_ksp, stackid);
		kmem_free(server, sizeof (ilb_server_t));
	}

	ASSERT(tmp_rule->ir_ksp != NULL);
	kstat_delete_netstack(tmp_rule->ir_ksp, stackid);

	kmem_free(tmp_rule, sizeof (ilb_rule_t));
}

/* The routine executed by the delayed rule taskq. */
static void
ilb_rule_del_tq(void *arg)
{
	ilb_stack_t *ilbs = ((ilb_rule_tq_t *)arg)->ilbs;
	ilb_rule_t *rule = ((ilb_rule_tq_t *)arg)->rule;

	mutex_enter(&rule->ir_lock);
	while (rule->ir_refcnt > 1)
		cv_wait(&rule->ir_cv, &rule->ir_lock);
	ilb_rule_del_common(ilbs, rule);
	kmem_free(arg, sizeof (ilb_rule_tq_t));
}

/* Routine to delete a rule. */
int
ilb_rule_del(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name)
{
	ilb_rule_t *tmp_rule;
	ilb_rule_tq_t *arg;
	int err;

	mutex_enter(&ilbs->ilbs_g_lock);
	if ((tmp_rule = ilb_find_rule_locked(ilbs, zoneid, name,
	    &err)) == NULL) {
		mutex_exit(&ilbs->ilbs_g_lock);
		return (err);
	}

	/*
	 * First remove the rule from the hash array and the global list so
	 * that no one can find this rule any more.
	 */
	ilb_rule_hash_del(tmp_rule);
	ilb_rule_g_del(ilbs, tmp_rule);
	mutex_exit(&ilbs->ilbs_g_lock);
	ILB_RULE_REFRELE(tmp_rule);

	/*
	 * Now no one can find this rule, we can remove it once all
	 * references to it are dropped and all references to the list
	 * of servers are dropped. So dispatch a task to finish the deletion.
	 * We do this instead of letting the last one referencing the
	 * rule do it. The reason is that the last one may be the
	 * interrupt thread. We want to minimize the work it needs to
	 * do. Rule deletion is not a critical task so it can be delayed.
	 */
	arg = kmem_alloc(sizeof (ilb_rule_tq_t), KM_SLEEP);
	arg->ilbs = ilbs;
	arg->rule = tmp_rule;
	(void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_rule_del_tq, arg,
	    TQ_SLEEP);

	return (0);
}

/*
 * Given an IP address, check to see if there is a rule using this
 * as the VIP. It can be used to check if we need to drop a fragment.
 */
boolean_t
ilb_rule_match_vip_v6(ilb_stack_t *ilbs, in6_addr_t *vip, ilb_rule_t **ret_rule)
{
	int i;
	ilb_rule_t *rule;
	boolean_t ret = B_FALSE;

	i = ILB_RULE_HASH((uint8_t *)&vip->s6_addr32[3],
	    ilbs->ilbs_rule_hash_size);
	mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
	for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
	    rule = rule->ir_hash_next) {
		if (IN6_ARE_ADDR_EQUAL(vip, &rule->ir_target_v6)) {
			mutex_enter(&rule->ir_lock);
			if (rule->ir_flags & ILB_RULE_BUSY) {
				mutex_exit(&rule->ir_lock);
				break;
			}
			if (ret_rule != NULL) {
				rule->ir_refcnt++;
				mutex_exit(&rule->ir_lock);
				*ret_rule = rule;
			} else {
				mutex_exit(&rule->ir_lock);
			}
			ret = B_TRUE;
			break;
		}
	}
	mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
	return (ret);
}

boolean_t
ilb_rule_match_vip_v4(ilb_stack_t *ilbs, ipaddr_t addr, ilb_rule_t **ret_rule)
{
	int i;
	ilb_rule_t *rule;
	boolean_t ret = B_FALSE;

	i = ILB_RULE_HASH((uint8_t *)&addr, ilbs->ilbs_rule_hash_size);
	mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
	for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
	    rule = rule->ir_hash_next) {
		if (rule->ir_target_v6.s6_addr32[3] == addr) {
			mutex_enter(&rule->ir_lock);
			if (rule->ir_flags & ILB_RULE_BUSY) {
				mutex_exit(&rule->ir_lock);
				break;
			}
			if (ret_rule != NULL) {
				rule->ir_refcnt++;
				mutex_exit(&rule->ir_lock);
				*ret_rule = rule;
			} else {
				mutex_exit(&rule->ir_lock);
			}
			ret = B_TRUE;
			break;
		}
	}
	mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
	return (ret);
}

static ilb_rule_t *
ilb_find_rule_locked(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    int *err)
{
	ilb_rule_t *tmp_rule;

	ASSERT(mutex_owned(&ilbs->ilbs_g_lock));

	for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
	    tmp_rule = tmp_rule->ir_next) {
		if (tmp_rule->ir_zoneid != zoneid)
			continue;
		if (strcasecmp(tmp_rule->ir_name, name) == 0) {
			mutex_enter(&tmp_rule->ir_lock);
			if (tmp_rule->ir_flags & ILB_RULE_BUSY) {
				mutex_exit(&tmp_rule->ir_lock);
				*err = EINPROGRESS;
				return (NULL);
			}
			tmp_rule->ir_refcnt++;
			mutex_exit(&tmp_rule->ir_lock);
			*err = 0;
			return (tmp_rule);
		}
	}
	*err = ENOENT;
	return (NULL);
}

/* To find a rule with a given name and zone in the global rule list. */
ilb_rule_t *
ilb_find_rule(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    int *err)
{
	ilb_rule_t *tmp_rule;

	mutex_enter(&ilbs->ilbs_g_lock);
	tmp_rule = ilb_find_rule_locked(ilbs, zoneid, name, err);
	mutex_exit(&ilbs->ilbs_g_lock);
	return (tmp_rule);
}

/* Try to match the given packet info and zone ID with a rule. */
static boolean_t
ilb_match_rule(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name, int l3,
    int l4, in_port_t min_port, in_port_t max_port, const in6_addr_t *addr)
{
	ilb_rule_t *tmp_rule;

	ASSERT(mutex_owned(&ilbs->ilbs_g_lock));

	for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
	    tmp_rule = tmp_rule->ir_next) {
		if (tmp_rule->ir_zoneid != zoneid)
			continue;

		/*
		 * We don't allow the same name in different rules even if all
		 * the other rule components are different.
		 */
		if (strcasecmp(tmp_rule->ir_name, name) == 0)
			return (B_TRUE);

		if (tmp_rule->ir_ipver != l3 || tmp_rule->ir_proto != l4)
			continue;

		/*
		 * ir_min_port and ir_max_port are the same if ir_port_range
		 * is false. In this case, if the ir_min|max_port (same) is
		 * outside of the given port range, it is OK. In other cases,
		 * check if min and max port are outside a rule's range.
		 */
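		/*
		 * For example, an existing rule for port 80 only
		 * (ir_min_port == ir_max_port == 80) overlaps with a new
		 * rule covering ports 1-65535, since 80 falls inside
		 * [min_port, max_port]; the check below then goes on to
		 * compare the VIPs.
		 */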
		if (tmp_rule->ir_max_port < min_port ||
		    tmp_rule->ir_min_port > max_port) {
			continue;
		}

		/*
		 * If l3 is IPv4, the addr passed in is assumed to be
		 * mapped address.
		 */
		if (V6_OR_V4_INADDR_ANY(*addr) ||
		    V6_OR_V4_INADDR_ANY(tmp_rule->ir_target_v6) ||
		    IN6_ARE_ADDR_EQUAL(addr, &tmp_rule->ir_target_v6)) {
			return (B_TRUE);
		}
	}
	return (B_FALSE);
}

int
ilb_rule_enable(ilb_stack_t *ilbs, zoneid_t zoneid,
    const char *rule_name, ilb_rule_t *in_rule)
{
	ilb_rule_t *rule;
	int err;

	ASSERT((in_rule == NULL && rule_name != NULL) ||
	    (in_rule != NULL && rule_name == NULL));
	if ((rule = in_rule) == NULL) {
		if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
		    &err)) == NULL) {
			return (err);
		}
	}
	mutex_enter(&rule->ir_lock);
	rule->ir_flags |= ILB_RULE_ENABLED;
	mutex_exit(&rule->ir_lock);

	/* Only refrele if the rule was looked up by name. */
	if (in_rule == NULL)
		ILB_RULE_REFRELE(rule);
	return (0);
}

int
ilb_rule_disable(ilb_stack_t *ilbs, zoneid_t zoneid,
    const char *rule_name, ilb_rule_t *in_rule)
{
	ilb_rule_t *rule;
	int err;

	ASSERT((in_rule == NULL && rule_name != NULL) ||
	    (in_rule != NULL && rule_name == NULL));
	if ((rule = in_rule) == NULL) {
		if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
		    &err)) == NULL) {
			return (err);
		}
	}
	mutex_enter(&rule->ir_lock);
	rule->ir_flags &= ~ILB_RULE_ENABLED;
	mutex_exit(&rule->ir_lock);

	/* Only refrele if the rule was looked up by name. */
	if (in_rule == NULL)
		ILB_RULE_REFRELE(rule);
	return (0);
}

/*
 * XXX We should probably have a walker function to walk all rules. For
 * now, just add a simple loop for enable/disable/del.
 */
void
ilb_rule_enable_all(ilb_stack_t *ilbs, zoneid_t zoneid)
{
	ilb_rule_t *rule;

	mutex_enter(&ilbs->ilbs_g_lock);
	for (rule = ilbs->ilbs_rule_head; rule != NULL; rule = rule->ir_next) {
		if (rule->ir_zoneid != zoneid)
			continue;
		/*
		 * No need to hold the rule as we are holding the global
		 * lock so it won't go away. Ignore the return value here
		 * as the rule is provided so the call cannot fail.
		 */
		(void) ilb_rule_enable(ilbs, zoneid, NULL, rule);
	}
	mutex_exit(&ilbs->ilbs_g_lock);
}

void
ilb_rule_disable_all(ilb_stack_t *ilbs, zoneid_t zoneid)
{
	ilb_rule_t *rule;

	mutex_enter(&ilbs->ilbs_g_lock);
	for (rule = ilbs->ilbs_rule_head; rule != NULL;
	    rule = rule->ir_next) {
		if (rule->ir_zoneid != zoneid)
			continue;
		(void) ilb_rule_disable(ilbs, zoneid, NULL, rule);
	}
	mutex_exit(&ilbs->ilbs_g_lock);
}

void
ilb_rule_del_all(ilb_stack_t *ilbs, zoneid_t zoneid)
{
	ilb_rule_t *rule;
	ilb_rule_tq_t *arg;

	mutex_enter(&ilbs->ilbs_g_lock);
again:
	for (rule = ilbs->ilbs_rule_head; rule != NULL; rule = rule->ir_next) {
		if (rule->ir_zoneid != zoneid)
			continue;
		ilb_rule_hash_del(rule);
		ilb_rule_g_del(ilbs, rule);
		mutex_exit(&ilbs->ilbs_g_lock);

		arg = kmem_alloc(sizeof (ilb_rule_tq_t), KM_SLEEP);
		arg->ilbs = ilbs;
		arg->rule = rule;
		(void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_rule_del_tq,
		    arg, TQ_SLEEP);

		/*
		 * The global lock was dropped, so the rule list may have
		 * changed; restart the scan from the head.
		 */
		mutex_enter(&ilbs->ilbs_g_lock);
		goto again;
	}
	mutex_exit(&ilbs->ilbs_g_lock);
}

/*
 * This is just an optimization, so don't grab the global lock. The
 * worst case is that we miss a couple of packets.
 */
boolean_t
ilb_has_rules(ilb_stack_t *ilbs)
{
	return (ilbs->ilbs_rule_head != NULL);
}

static int
ilb_server_toggle(ilb_stack_t *ilbs, zoneid_t zoneid, const char *rule_name,
    ilb_rule_t *rule, in6_addr_t *addr, boolean_t enable)
{
	ilb_server_t *tmp_server;
	int ret;

	ASSERT((rule == NULL && rule_name != NULL) ||
	    (rule != NULL && rule_name == NULL));

	if (rule == NULL) {
		if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
		    &ret)) == NULL) {
			return (ret);
		}
	}

	/* Once we get a hold on the rule, no server can be added/deleted. */
	for (tmp_server = rule->ir_servers; tmp_server != NULL;
	    tmp_server = tmp_server->iser_next) {
		if (IN6_ARE_ADDR_EQUAL(&tmp_server->iser_addr_v6, addr))
			break;
	}
	if (tmp_server == NULL) {
		ret = ENOENT;
		goto done;
	}

	if (enable) {
		ret = rule->ir_alg->ilb_alg_server_enable(tmp_server,
		    rule->ir_alg->ilb_alg_data);
		if (ret == 0) {
			tmp_server->iser_enabled = B_TRUE;
			tmp_server->iser_die_time = 0;
		}
	} else {
		ret = rule->ir_alg->ilb_alg_server_disable(tmp_server,
		    rule->ir_alg->ilb_alg_data);
		if (ret == 0) {
			tmp_server->iser_enabled = B_FALSE;
			if (rule->ir_conn_drain_timeout != 0) {
				(void) atomic_swap_64(
				    (uint64_t *)&tmp_server->iser_die_time,
				    ddi_get_lbolt64() + SEC_TO_TICK(
				    rule->ir_conn_drain_timeout));
			}
		}
	}

done:
	if (rule_name != NULL)
		ILB_RULE_REFRELE(rule);
	return (ret);
}

int
ilb_server_enable(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    ilb_rule_t *rule, in6_addr_t *addr)
{
	return (ilb_server_toggle(ilbs, zoneid, name, rule, addr, B_TRUE));
}

int
ilb_server_disable(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    ilb_rule_t *rule, in6_addr_t *addr)
{
	return (ilb_server_toggle(ilbs, zoneid, name, rule, addr, B_FALSE));
}

/*
 * Add a back end server to a rule. If the address is IPv4, it is assumed
 * to be passed in as a mapped address.
 */
int
ilb_server_add(ilb_stack_t *ilbs, ilb_rule_t *rule, ilb_server_info_t *info)
{
	ilb_server_t *server;
	netstackid_t stackid;
	int ret = 0;
	in_port_t min_port, max_port;
	in_port_t range;

	/* Port is passed in network byte order. */
	min_port = ntohs(info->min_port);
	max_port = ntohs(info->max_port);
	if (min_port > max_port)
		return (EINVAL);

	/* min_port == 0 means "all ports". Make it so */
	if (min_port == 0) {
		min_port = 1;
		max_port = 65535;
	}
	range = max_port - min_port;

	mutex_enter(&rule->ir_lock);
	/* If someone is already doing server add/del, sleep and wait. */
	while (rule->ir_flags & ILB_RULE_BUSY) {
		if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
			mutex_exit(&rule->ir_lock);
			return (EINTR);
		}
	}

	/*
	 * Set the rule to be busy to make sure that no new packet can
	 * use this rule.
	 */
	rule->ir_flags |= ILB_RULE_BUSY;

	/* Now wait for all other guys to finish their work. */
	while (rule->ir_refcnt > 2) {
		if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
			mutex_exit(&rule->ir_lock);
			ret = EINTR;
			goto end;
		}
	}
	mutex_exit(&rule->ir_lock);

	/* Sanity checks... */
	if ((IN6_IS_ADDR_V4MAPPED(&info->addr) &&
	    rule->ir_ipver != IPPROTO_IP) ||
	    (!IN6_IS_ADDR_V4MAPPED(&info->addr) &&
	    rule->ir_ipver != IPPROTO_IPV6)) {
		ret = EINVAL;
		goto end;
	}

	/*
	 * Check for valid port range.
	 *
	 * For DSR, there can be no port shifting. Hence the server
	 * specification must be the same as the rule's.
	 *
	 * For half-NAT/NAT, the range must either be 0 (port collapsing)
	 * or it must match the rule's port range.
	 */
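	/*
	 * For example, a rule for ports 80-89 can point at a NAT server
	 * with ports 8080-8089 (the same range size, shifted), or at a
	 * server with a single port such as 8080, in which case all ten
	 * rule ports collapse onto that one server port.
	 */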
	if (rule->ir_topo == ILB_TOPO_IMPL_DSR) {
		if (rule->ir_max_port != max_port ||
		    rule->ir_min_port != min_port) {
			ret = EINVAL;
			goto end;
		}
	} else {
		if ((range != rule->ir_max_port - rule->ir_min_port) &&
		    range != 0) {
			ret = EINVAL;
			goto end;
		}
	}

	/* Check for duplicate. */
	for (server = rule->ir_servers; server != NULL;
	    server = server->iser_next) {
		if (IN6_ARE_ADDR_EQUAL(&server->iser_addr_v6, &info->addr) ||
		    strcasecmp(server->iser_name, info->name) == 0) {
			break;
		}
	}
	if (server != NULL) {
		ret = EEXIST;
		goto end;
	}

	if ((server = kmem_zalloc(sizeof (ilb_server_t), KM_NOSLEEP)) == NULL) {
		ret = ENOMEM;
		goto end;
	}

	(void) memcpy(server->iser_name, info->name, ILB_SERVER_NAMESZ - 1);
	(void) inet_ntop(AF_INET6, &info->addr, server->iser_ip_addr,
	    sizeof (server->iser_ip_addr));
	stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;
	server->iser_ksp = ilb_server_kstat_init(stackid, rule, server);
	if (server->iser_ksp == NULL) {
		kmem_free(server, sizeof (ilb_server_t));
		ret = EINVAL;
		goto end;
	}

	server->iser_stackid = stackid;
	server->iser_addr_v6 = info->addr;
	server->iser_min_port = min_port;
	server->iser_max_port = max_port;
	if (min_port != max_port)
		server->iser_port_range = B_TRUE;
	else
		server->iser_port_range = B_FALSE;

	/*
	 * If the rule uses NAT, find/create the NAT source entry to use
	 * for this server.
	 */
	if (rule->ir_topo == ILB_TOPO_IMPL_NAT) {
		in_port_t port;

		/*
		 * If the server uses a port range, our port allocation
		 * scheme needs to treat it as a wildcard. Refer to the
		 * comments in ilb_nat.c about the scheme.
		 */
		if (server->iser_port_range)
			port = 0;
		else
			port = server->iser_min_port;

		if ((ret = ilb_create_nat_src(ilbs, &server->iser_nat_src,
		    &server->iser_addr_v6, port, &rule->ir_nat_src_start,
		    num_nat_src_v6(&rule->ir_nat_src_start,
		    &rule->ir_nat_src_end))) != 0) {
			kstat_delete_netstack(server->iser_ksp, stackid);
			kmem_free(server, sizeof (ilb_server_t));
			goto end;
		}
	}

	/*
	 * The iser_lock is only used to protect iser_refcnt. All the other
	 * fields in ilb_server_t should not change, except for iser_enabled.
	 * The worst thing that can happen if iser_enabled is messed up is
	 * that one or two packets may not be load balanced to a server
	 * correctly.
	 */
	server->iser_refcnt = 1;
	server->iser_enabled = info->flags & ILB_SERVER_ENABLED ? B_TRUE :
	    B_FALSE;
	mutex_init(&server->iser_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&server->iser_cv, NULL, CV_DEFAULT, NULL);

	/* Let the load balancing algorithm know about the addition. */
	ASSERT(rule->ir_alg != NULL);
	if ((ret = rule->ir_alg->ilb_alg_server_add(server,
	    rule->ir_alg->ilb_alg_data)) != 0) {
		kstat_delete_netstack(server->iser_ksp, stackid);
		kmem_free(server, sizeof (ilb_server_t));
		goto end;
	}

	/*
	 * No need to hold ir_lock since no other thread should manipulate
	 * the following fields until ILB_RULE_BUSY is cleared.
	 */
	if (rule->ir_servers == NULL) {
		server->iser_next = NULL;
	} else {
		server->iser_next = rule->ir_servers;
	}
	rule->ir_servers = server;
	ILB_R_KSTAT(rule, num_servers);

end:
	mutex_enter(&rule->ir_lock);
	rule->ir_flags &= ~ILB_RULE_BUSY;
	cv_signal(&rule->ir_cv);
	mutex_exit(&rule->ir_lock);
	return (ret);
}

/* The routine executed by the delayed rule processing taskq. */
static void
ilb_server_del_tq(void *arg)
{
	ilb_server_t *server = (ilb_server_t *)arg;

	mutex_enter(&server->iser_lock);
	while (server->iser_refcnt > 1)
		cv_wait(&server->iser_cv, &server->iser_lock);
	kstat_delete_netstack(server->iser_ksp, server->iser_stackid);
	kmem_free(server, sizeof (ilb_server_t));
}

/*
 * Delete a back end server from a rule. If the address is IPv4, it is assumed
 * to be passed in as a mapped address.
 */
int
ilb_server_del(ilb_stack_t *ilbs, zoneid_t zoneid, const char *rule_name,
    ilb_rule_t *rule, in6_addr_t *addr)
{
	ilb_server_t *server;
	ilb_server_t *prev_server;
	int ret = 0;

	ASSERT((rule == NULL && rule_name != NULL) ||
	    (rule != NULL && rule_name == NULL));
	if (rule == NULL) {
		if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
		    &ret)) == NULL) {
			return (ret);
		}
	}

	mutex_enter(&rule->ir_lock);
	/* If someone is already doing server add/del, sleep and wait. */
	while (rule->ir_flags & ILB_RULE_BUSY) {
		if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
			if (rule_name != NULL) {
				if (--rule->ir_refcnt <= 2)
					cv_signal(&rule->ir_cv);
			}
			mutex_exit(&rule->ir_lock);
			return (EINTR);
		}
	}
	/*
	 * Set the rule to be busy to make sure that no new packet can
	 * use this rule.
	 */
	rule->ir_flags |= ILB_RULE_BUSY;

	/* Now wait for all other guys to finish their work. */
	while (rule->ir_refcnt > 2) {
		if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
			mutex_exit(&rule->ir_lock);
			ret = EINTR;
			goto end;
		}
	}
	mutex_exit(&rule->ir_lock);

	prev_server = NULL;
	for (server = rule->ir_servers; server != NULL;
	    prev_server = server, server = server->iser_next) {
		if (IN6_ARE_ADDR_EQUAL(&server->iser_addr_v6, addr))
			break;
	}
	if (server == NULL) {
		ret = ENOENT;
		goto end;
	}

	/*
	 * Let the load balancing algorithm know about the removal.
	 * The algorithm may disallow the removal...
	 */
	if ((ret = rule->ir_alg->ilb_alg_server_del(server,
	    rule->ir_alg->ilb_alg_data)) != 0) {
		goto end;
	}

	if (prev_server == NULL)
		rule->ir_servers = server->iser_next;
	else
		prev_server->iser_next = server->iser_next;

	ILB_R_KSTAT_UPDATE(rule, num_servers, -1);

	/*
	 * Mark the server as disabled so that if there is any sticky cache
	 * using this server around, it won't be used.
	 */
	server->iser_enabled = B_FALSE;

	mutex_enter(&server->iser_lock);

	/*
	 * De-allocate the NAT source array. The individual ilb_nat_src_entry_t
	 * may not go away if there is still a conn using it. The NAT source
	 * timer will do the garbage collection.
	 */
	ilb_destroy_nat_src(&server->iser_nat_src);

	/* If there is a hard limit on when a server should die, set it. */
	if (rule->ir_conn_drain_timeout != 0) {
		(void) atomic_swap_64((uint64_t *)&server->iser_die_time,
		    ddi_get_lbolt64() +
		    SEC_TO_TICK(rule->ir_conn_drain_timeout));
	}

	if (server->iser_refcnt > 1) {
		(void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_server_del_tq,
		    server, TQ_SLEEP);
		mutex_exit(&server->iser_lock);
	} else {
		kstat_delete_netstack(server->iser_ksp, server->iser_stackid);
		kmem_free(server, sizeof (ilb_server_t));
	}

end:
	mutex_enter(&rule->ir_lock);
	rule->ir_flags &= ~ILB_RULE_BUSY;
	if (rule_name != NULL)
		rule->ir_refcnt--;
	cv_signal(&rule->ir_cv);
	mutex_exit(&rule->ir_lock);
	return (ret);
}

/*
 * First check if the destination of the ICMP message matches a VIP of
 * a rule. If it does not, just return ILB_PASSED.
 *
 * If the destination matches a VIP:
 *
 * For ICMP_ECHO_REQUEST, generate a response on behalf of the back end
 * server.
 *
 * For ICMP_DEST_UNREACHABLE fragmentation needed, check inside the payload
 * and see which back end server we should send this message to. And we
 * need to do NAT on both the payload message and the outside IP packet.
 *
 * For other ICMP messages, drop them.
 */
/* ARGSUSED */
static int
ilb_icmp_v4(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ipha_t *ipha,
    icmph_t *icmph, ipaddr_t *lb_dst)
{
	ipaddr_t vip;
	ilb_rule_t *rule;
	in6_addr_t addr6;

	if (!ilb_rule_match_vip_v4(ilbs, ipha->ipha_dst, &rule))
		return (ILB_PASSED);

	if ((uint8_t *)icmph + sizeof (icmph_t) > mp->b_wptr) {
		ILB_R_KSTAT(rule, icmp_dropped);
		ILB_RULE_REFRELE(rule);
		return (ILB_DROPPED);
	}

	switch (icmph->icmph_type) {
	case ICMP_ECHO_REQUEST:
		ILB_R_KSTAT(rule, icmp_echo_processed);
		ILB_RULE_REFRELE(rule);

		icmph->icmph_type = ICMP_ECHO_REPLY;
		icmph->icmph_checksum = 0;
		icmph->icmph_checksum = IP_CSUM(mp, IPH_HDR_LENGTH(ipha), 0);
		ipha->ipha_ttl =
		    ilbs->ilbs_netstack->netstack_ip->ips_ip_def_ttl;
		*lb_dst = ipha->ipha_src;
		vip = ipha->ipha_dst;
		ipha->ipha_dst = ipha->ipha_src;
		ipha->ipha_src = vip;
		return (ILB_BALANCED);
	case ICMP_DEST_UNREACHABLE: {
		int ret;

		if (icmph->icmph_code != ICMP_FRAGMENTATION_NEEDED) {
			ILB_R_KSTAT(rule, icmp_dropped);
			ILB_RULE_REFRELE(rule);
			return (ILB_DROPPED);
		}
		if (ilb_check_icmp_conn(ilbs, mp, IPPROTO_IP, ipha, icmph,
		    &addr6)) {
			ILB_R_KSTAT(rule, icmp_2big_processed);
			ret = ILB_BALANCED;
		} else {
			ILB_R_KSTAT(rule, icmp_2big_dropped);
			ret = ILB_DROPPED;
		}
		ILB_RULE_REFRELE(rule);
		IN6_V4MAPPED_TO_IPADDR(&addr6, *lb_dst);
		return (ret);
	}
	default:
		ILB_R_KSTAT(rule, icmp_dropped);
		ILB_RULE_REFRELE(rule);
		return (ILB_DROPPED);
	}
}

/* ARGSUSED */
static int
ilb_icmp_v6(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ip6_t *ip6h,
    icmp6_t *icmp6, in6_addr_t *lb_dst)
{
	ilb_rule_t *rule;

	if (!ilb_rule_match_vip_v6(ilbs, &ip6h->ip6_dst, &rule))
		return (ILB_PASSED);

	if ((uint8_t *)icmp6 + sizeof (icmp6_t) > mp->b_wptr) {
		ILB_R_KSTAT(rule, icmp_dropped);
		ILB_RULE_REFRELE(rule);
		return (ILB_DROPPED);
	}

	switch (icmp6->icmp6_type) {
	case ICMP6_ECHO_REQUEST: {
		int hdr_len;

		ILB_R_KSTAT(rule, icmp_echo_processed);
		ILB_RULE_REFRELE(rule);

		icmp6->icmp6_type = ICMP6_ECHO_REPLY;
		icmp6->icmp6_cksum = ip6h->ip6_plen;
		hdr_len = (char *)icmp6 - (char *)ip6h;
		icmp6->icmp6_cksum = IP_CSUM(mp, hdr_len,
		    ilb_pseudo_sum_v6(ip6h, IPPROTO_ICMPV6));
		ip6h->ip6_vcf &= ~IPV6_FLOWINFO_FLOWLABEL;
		ip6h->ip6_hops =
		    ilbs->ilbs_netstack->netstack_ip->ips_ipv6_def_hops;
		*lb_dst = ip6h->ip6_src;
		ip6h->ip6_src = ip6h->ip6_dst;
		ip6h->ip6_dst = *lb_dst;
		return (ILB_BALANCED);
	}
	case ICMP6_PACKET_TOO_BIG: {
		int ret;

		if (ilb_check_icmp_conn(ilbs, mp, IPPROTO_IPV6, ip6h, icmp6,
		    lb_dst)) {
			ILB_R_KSTAT(rule, icmp_2big_processed);
			ret = ILB_BALANCED;
		} else {
			ILB_R_KSTAT(rule, icmp_2big_dropped);
			ret = ILB_DROPPED;
		}
		ILB_RULE_REFRELE(rule);
		return (ret);
	}
	default:
		ILB_R_KSTAT(rule, icmp_dropped);
		ILB_RULE_REFRELE(rule);
		return (ILB_DROPPED);
	}
}

/*
 * Common routine to check an incoming packet and decide what to do with it.
 * Called by ilb_check_v4|v6().
 */
static int
ilb_check(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, in6_addr_t *src,
    in6_addr_t *dst, int l3, int l4, void *iph, uint8_t *tph, uint32_t pkt_len,
    in6_addr_t *lb_dst)
{
	in_port_t sport, dport;
	tcpha_t *tcph;
	udpha_t *udph;
	ilb_rule_t *rule;
	ilb_server_t *server;
	boolean_t balanced;
	struct ilb_sticky_s *s = NULL;
	int ret;
	uint32_t ip_sum, tp_sum;
	ilb_nat_info_t info;
	uint16_t nat_src_idx;
	boolean_t busy;

	/*
	 * We don't really need to switch here since both protocols'
1683 * ports are at the same offset. Just prepare for future protocol
1684 * specific processing.
1685 */
1686 switch (l4) {
1687 case IPPROTO_TCP:
1688 if (tph + TCP_MIN_HEADER_LENGTH > mp->b_wptr)
1689 return (ILB_DROPPED);
1690 tcph = (tcpha_t *)tph;
1691 sport = tcph->tha_lport;
1692 dport = tcph->tha_fport;
1693 break;
1694 case IPPROTO_UDP:
1695 if (tph + sizeof (udpha_t) > mp->b_wptr)
1696 return (ILB_DROPPED);
1697 udph = (udpha_t *)tph;
1698 sport = udph->uha_src_port;
1699 dport = udph->uha_dst_port;
1700 break;
1701 default:
1702 return (ILB_PASSED);
1703 }
1704
1705 /* Fast path, there is an existing conn. */
1706 if (ilb_check_conn(ilbs, l3, iph, l4, tph, src, dst, sport, dport,
1707 pkt_len, lb_dst)) {
1708 return (ILB_BALANCED);
1709 }
1710
1711 /*
1712 * If there is no existing connection for the incoming packet, check
1713 * to see if the packet matches a rule. If not, just let IP decide
1714 * what to do with it.
1715 *
1716 * Note: a reply from back end server should not match a rule. A
1717 * reply should match one existing conn.
1718 */
1719 rule = ilb_rule_hash(ilbs, l3, l4, dst, dport, ill->ill_zoneid,
1720 pkt_len, &busy);
1721 if (rule == NULL) {
1722 /* If the rule is busy, just drop the packet. */
1723 if (busy)
1724 return (ILB_DROPPED);
1725 else
1726 return (ILB_PASSED);
1727 }
1728
1729 /*
1730 * The packet matches a rule, use the rule load balance algorithm
1731 * to find a server.
1732 */
1733 balanced = rule->ir_alg->ilb_alg_lb(src, sport, dst, dport,
1734 rule->ir_alg->ilb_alg_data, &server);
1735 /*
1736 * This can only happen if there is no server in a rule or all
1737 * the servers are currently disabled.
1738 */
1739 if (!balanced)
1740 goto no_server;
1741
1742 /*
1743 * If the rule is sticky enabled, we need to check the sticky table.
1744 * If there is a sticky entry for the client, use the previous server
1745 * instead of the one found above (note that both can be the same).
1746 * If there is no entry for that client, add an entry to the sticky
1747 * table. Both the find and add are done in ilb_sticky_find_add()
1748 * to avoid checking for duplicate when adding an entry.
1749 */
1750 if (rule->ir_flags & ILB_RULE_STICKY) {
1751 in6_addr_t addr;
1752
1753 V6_MASK_COPY(*src, rule->ir_sticky_mask, addr);
1754 if ((server = ilb_sticky_find_add(ilbs, rule, &addr, server,
1755 &s, &nat_src_idx)) == NULL) {
1756 ILB_R_KSTAT(rule, nomem_pkt_dropped);
1757 ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
1758 goto no_server;
1759 }
1760 }
1761
1762 /*
1763 * We are holding a reference on the rule, so the server
1764 * cannot go away.
1765 */
1766 *lb_dst = server->iser_addr_v6;
1767 ILB_S_KSTAT(server, pkt_processed);
1768 ILB_S_KSTAT_UPDATE(server, bytes_processed, pkt_len);
1769
	switch (rule->ir_topo) {
	case ILB_TOPO_IMPL_NAT: {
		ilb_nat_src_entry_t *src_ent;
		uint16_t *src_idx;

		/*
		 * We create a cache even if it is not a SYN segment.
		 * The server should return a RST.  When we see the
		 * RST, we will destroy this cache.  But by having
		 * a cache, we know how to NAT the returned RST.
		 */
		info.vip = *dst;
		info.dport = dport;
		info.src = *src;
		info.sport = sport;

		/* If stickiness is enabled, use the same source address */
		if (s != NULL)
			src_idx = &nat_src_idx;
		else
			src_idx = NULL;

		if ((src_ent = ilb_alloc_nat_addr(server->iser_nat_src,
		    &info.nat_src, &info.nat_sport, src_idx)) == NULL) {
			if (s != NULL)
				ilb_sticky_refrele(s);
			ILB_R_KSTAT(rule, pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
			ILB_R_KSTAT(rule, noport_pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, noport_bytes_dropped, pkt_len);
			ret = ILB_DROPPED;
			break;
		}
		info.src_ent = src_ent;
		info.nat_dst = server->iser_addr_v6;
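		/*
		 * Port shifting: when both the rule and the server specify
		 * a port range, map the client's destination port to the
		 * port at the same offset in the server's range (e.g. with
		 * a rule range of 80-89 and a server range of 8080-8089,
		 * port 81 maps to 8081); otherwise all traffic goes to the
		 * server's single (min) port.
		 */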
		if (rule->ir_port_range && server->iser_port_range) {
			info.nat_dport = htons(ntohs(dport) -
			    rule->ir_min_port + server->iser_min_port);
		} else {
			info.nat_dport = htons(server->iser_min_port);
		}

		/*
		 * If ilb_conn_add() fails, it will release the reference on
		 * sticky info and de-allocate the NAT source port allocated
		 * above.
		 */
		if (ilb_conn_add(ilbs, rule, server, src, sport, dst,
		    dport, &info, &ip_sum, &tp_sum, s) != 0) {
			ILB_R_KSTAT(rule, pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
			ILB_R_KSTAT(rule, nomem_pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
			ret = ILB_DROPPED;
			break;
		}
		ilb_full_nat(l3, iph, l4, tph, &info, ip_sum, tp_sum, B_TRUE);
		ret = ILB_BALANCED;
		break;
	}
	case ILB_TOPO_IMPL_HALF_NAT:
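		/*
		 * Half NAT rewrites only the destination (VIP to server
		 * address); the client's source address is left untouched,
		 * so the back-end server sees the real client.
		 */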
		info.vip = *dst;
		info.nat_dst = server->iser_addr_v6;
		info.dport = dport;
		if (rule->ir_port_range && server->iser_port_range) {
			info.nat_dport = htons(ntohs(dport) -
			    rule->ir_min_port + server->iser_min_port);
		} else {
			info.nat_dport = htons(server->iser_min_port);
		}

		if (ilb_conn_add(ilbs, rule, server, src, sport, dst,
		    dport, &info, &ip_sum, &tp_sum, s) != 0) {
			ILB_R_KSTAT(rule, pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
			ILB_R_KSTAT(rule, nomem_pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
			ret = ILB_DROPPED;
			break;
		}
		ilb_half_nat(l3, iph, l4, tph, &info, ip_sum, tp_sum, B_TRUE);

		ret = ILB_BALANCED;
		break;
	case ILB_TOPO_IMPL_DSR:
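		/*
		 * DSR (direct server return): the packet is forwarded to
		 * the server unmodified and the server replies to the
		 * client directly, so no conn entry is created.
		 */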
		/*
		 * By decrementing the sticky refcnt, the period of
		 * stickiness (lifetime of ilb_sticky_t) will be
		 * from now to (now + default expiry time).
		 */
		if (s != NULL)
			ilb_sticky_refrele(s);
		ret = ILB_BALANCED;
		break;
	default:
		cmn_err(CE_PANIC, "data corruption unknown topology: %p",
		    (void *)rule);
		break;
	}
	ILB_RULE_REFRELE(rule);
	return (ret);

no_server:
	/*
	 * We get here when no server is available, or when a sticky entry
	 * cannot be added for the client.
	 */
	ILB_R_KSTAT(rule, pkt_dropped);
	ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
	ILB_RULE_REFRELE(rule);
	return (ILB_DROPPED);
}

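/*
 * Check an IPv4 packet against the load balancing state.  ICMP is handled
 * separately; for TCP/UDP the v4 addresses are converted to IPv4-mapped
 * IPv6 addresses so that the common ilb_check() only deals with in6_addr_t.
 */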
int
ilb_check_v4(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ipha_t *ipha, int l4,
    uint8_t *tph, ipaddr_t *lb_dst)
{
	in6_addr_t v6_src, v6_dst, v6_lb_dst;
	int ret;

	ASSERT(DB_REF(mp) == 1);

	if (l4 == IPPROTO_ICMP) {
		return (ilb_icmp_v4(ilbs, ill, mp, ipha, (icmph_t *)tph,
		    lb_dst));
	}

	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6_src);
	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6_dst);
	ret = ilb_check(ilbs, ill, mp, &v6_src, &v6_dst, IPPROTO_IP, l4, ipha,
	    tph, ntohs(ipha->ipha_length), &v6_lb_dst);
	if (ret == ILB_BALANCED)
		IN6_V4MAPPED_TO_IPADDR(&v6_lb_dst, *lb_dst);
	return (ret);
}

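/*
 * Check an IPv6 packet against the load balancing state.  ICMPv6 is
 * handled separately; the packet length is the payload length plus the
 * fixed IPv6 header length.
 */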
int
ilb_check_v6(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ip6_t *ip6h, int l4,
    uint8_t *tph, in6_addr_t *lb_dst)
{
	uint32_t pkt_len;

	ASSERT(DB_REF(mp) == 1);

	if (l4 == IPPROTO_ICMPV6) {
		return (ilb_icmp_v6(ilbs, ill, mp, ip6h, (icmp6_t *)tph,
		    lb_dst));
	}

	pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
	return (ilb_check(ilbs, ill, mp, &ip6h->ip6_src, &ip6h->ip6_dst,
	    IPPROTO_IPV6, l4, ip6h, tph, pkt_len, lb_dst));
}

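/*
 * Return in *num_rules the number of rules belonging to the given zone.
 * Done under ilbs_g_lock so the rule list cannot change during the walk.
 */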
void
ilb_get_num_rules(ilb_stack_t *ilbs, zoneid_t zoneid, uint32_t *num_rules)
{
	ilb_rule_t *tmp_rule;

	mutex_enter(&ilbs->ilbs_g_lock);
	*num_rules = 0;
	for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
	    tmp_rule = tmp_rule->ir_next) {
		if (tmp_rule->ir_zoneid == zoneid)
			*num_rules += 1;
	}
	mutex_exit(&ilbs->ilbs_g_lock);
}

int
ilb_get_num_servers(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    uint32_t *num_servers)
{
	ilb_rule_t *rule;
	int err;

	if ((rule = ilb_find_rule(ilbs, zoneid, name, &err)) == NULL)
		return (err);
	*num_servers = rule->ir_kstat.num_servers.value.ui64;
	ILB_RULE_REFRELE(rule);
	return (0);
}

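/*
 * Copy up to *num_servers entries of the named rule's server list into
 * the caller-supplied array; on return, *num_servers is set to the
 * number of entries actually filled in.
 */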
int
ilb_get_servers(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    ilb_server_info_t *servers, uint32_t *num_servers)
{
	ilb_rule_t *rule;
	ilb_server_t *server;
	size_t cnt;
	int err;

	if ((rule = ilb_find_rule(ilbs, zoneid, name, &err)) == NULL)
		return (err);
	for (server = rule->ir_servers, cnt = *num_servers;
	    server != NULL && cnt > 0;
	    server = server->iser_next, cnt--, servers++) {
		(void) memcpy(servers->name, server->iser_name,
		    ILB_SERVER_NAMESZ);
		servers->addr = server->iser_addr_v6;
		servers->min_port = htons(server->iser_min_port);
		servers->max_port = htons(server->iser_max_port);
		servers->flags = server->iser_enabled ? ILB_SERVER_ENABLED : 0;
		servers->err = 0;
	}
	ILB_RULE_REFRELE(rule);
	*num_servers -= cnt;

	return (0);
}

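/*
 * Copy up to *num_names rule names (each ILB_RULE_NAMESZ bytes) for the
 * given zone into buf; *num_names is updated to the number copied.
 */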
void
ilb_get_rulenames(ilb_stack_t *ilbs, zoneid_t zoneid, uint32_t *num_names,
    char *buf)
{
	ilb_rule_t *tmp_rule;
	int cnt;

	if (*num_names == 0)
		return;

	mutex_enter(&ilbs->ilbs_g_lock);
	for (cnt = 0, tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
	    tmp_rule = tmp_rule->ir_next) {
		if (tmp_rule->ir_zoneid != zoneid)
			continue;

		(void) memcpy(buf, tmp_rule->ir_name, ILB_RULE_NAMESZ);
		buf += ILB_RULE_NAMESZ;
		if (++cnt == *num_names)
			break;
	}
	mutex_exit(&ilbs->ilbs_g_lock);
	*num_names = cnt;
}

int
ilb_rule_list(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_rule_cmd_t *cmd)
{
	ilb_rule_t *rule;
	int err;

	if ((rule = ilb_find_rule(ilbs, zoneid, cmd->name, &err)) == NULL) {
		return (err);
	}

	/*
	 * Except for the enabled flag, none of the following fields change
	 * during the lifetime of a rule, so we don't hold the mutex while
	 * reading them.  The worst case is reporting a stale enabled flag.
	 */
	cmd->ip_ver = rule->ir_ipver;
	cmd->proto = rule->ir_proto;
	cmd->min_port = htons(rule->ir_min_port);
	cmd->max_port = htons(rule->ir_max_port);

	cmd->vip = rule->ir_target_v6;
	cmd->algo = rule->ir_alg_type;
	cmd->topo = rule->ir_topo;

	cmd->nat_src_start = rule->ir_nat_src_start;
	cmd->nat_src_end = rule->ir_nat_src_end;

	cmd->conn_drain_timeout = rule->ir_conn_drain_timeout;
	cmd->nat_expiry = rule->ir_nat_expiry;
	cmd->sticky_expiry = rule->ir_sticky_expiry;

	cmd->flags = 0;
	if (rule->ir_flags & ILB_RULE_ENABLED)
		cmd->flags |= ILB_RULE_ENABLED;
	if (rule->ir_flags & ILB_RULE_STICKY) {
		cmd->flags |= ILB_RULE_STICKY;
		cmd->sticky_mask = rule->ir_sticky_mask;
	}

	ILB_RULE_REFRELE(rule);
	return (0);
}

static void *
ilb_stack_init(netstackid_t stackid, netstack_t *ns)
{
	ilb_stack_t *ilbs;
	char tq_name[TASKQ_NAMELEN];

	ilbs = kmem_alloc(sizeof (ilb_stack_t), KM_SLEEP);
	ilbs->ilbs_netstack = ns;

	ilbs->ilbs_rule_head = NULL;
	ilbs->ilbs_g_hash = NULL;
	mutex_init(&ilbs->ilbs_g_lock, NULL, MUTEX_DEFAULT, NULL);

	ilbs->ilbs_kstat = kmem_alloc(sizeof (ilb_g_kstat_t), KM_SLEEP);
	if ((ilbs->ilbs_ksp = ilb_kstat_g_init(stackid, ilbs)) == NULL) {
		/* Also free the kstat buffer allocated above. */
		kmem_free(ilbs->ilbs_kstat, sizeof (ilb_g_kstat_t));
		kmem_free(ilbs, sizeof (ilb_stack_t));
		return (NULL);
	}

	/*
	 * ilbs_conn/sticky_hash related info is initialized in
	 * ilb_conn/sticky_hash_init().
	 */
	ilbs->ilbs_conn_taskq = NULL;
	ilbs->ilbs_rule_hash_size = ilb_rule_hash_size;
	ilbs->ilbs_conn_hash_size = ilb_conn_hash_size;
	ilbs->ilbs_c2s_conn_hash = NULL;
	ilbs->ilbs_s2c_conn_hash = NULL;
	ilbs->ilbs_conn_timer_list = NULL;

	ilbs->ilbs_sticky_hash = NULL;
	ilbs->ilbs_sticky_hash_size = ilb_sticky_hash_size;
	ilbs->ilbs_sticky_timer_list = NULL;
	ilbs->ilbs_sticky_taskq = NULL;

	/* The allocation is done later when there is a rule using NAT mode. */
	ilbs->ilbs_nat_src = NULL;
	ilbs->ilbs_nat_src_hash_size = ilb_nat_src_hash_size;
	mutex_init(&ilbs->ilbs_nat_src_lock, NULL, MUTEX_DEFAULT, NULL);
	ilbs->ilbs_nat_src_tid = 0;

	/* For listing the conn hash table */
	mutex_init(&ilbs->ilbs_conn_list_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&ilbs->ilbs_conn_list_cv, NULL, CV_DEFAULT, NULL);
	ilbs->ilbs_conn_list_busy = B_FALSE;
	ilbs->ilbs_conn_list_cur = 0;
	ilbs->ilbs_conn_list_connp = NULL;

	/* For listing the sticky hash table */
	mutex_init(&ilbs->ilbs_sticky_list_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&ilbs->ilbs_sticky_list_cv, NULL, CV_DEFAULT, NULL);
	ilbs->ilbs_sticky_list_busy = B_FALSE;
	ilbs->ilbs_sticky_list_cur = 0;
	ilbs->ilbs_sticky_list_curp = NULL;

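	/*
	 * Embed the netstack pointer in the taskq name so that each
	 * stack's rule taskq gets a unique name.
	 */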
	(void) snprintf(tq_name, sizeof (tq_name), "ilb_rule_taskq_%p",
	    (void *)ns);
	ilbs->ilbs_rule_taskq = taskq_create(tq_name, ILB_RULE_TASKQ_NUM_THR,
	    minclsyspri, 1, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);

	return (ilbs);
}

/* ARGSUSED */
static void
ilb_stack_shutdown(netstackid_t stackid, void *arg)
{
	ilb_stack_t *ilbs = (ilb_stack_t *)arg;
	ilb_rule_t *tmp_rule;

	ilb_sticky_hash_fini(ilbs);
	ilb_conn_hash_fini(ilbs);
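	/*
	 * Tear down all remaining rules.  ilbs_g_lock is dropped around
	 * ilb_rule_del_common() since that call can block waiting for
	 * the references on the rule to drain.
	 */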
	mutex_enter(&ilbs->ilbs_g_lock);
	while ((tmp_rule = ilbs->ilbs_rule_head) != NULL) {
		ilb_rule_hash_del(tmp_rule);
		ilb_rule_g_del(ilbs, tmp_rule);
		mutex_exit(&ilbs->ilbs_g_lock);
		ilb_rule_del_common(ilbs, tmp_rule);
		mutex_enter(&ilbs->ilbs_g_lock);
	}
	mutex_exit(&ilbs->ilbs_g_lock);
	if (ilbs->ilbs_nat_src != NULL)
		ilb_nat_src_fini(ilbs);
}

static void
ilb_stack_fini(netstackid_t stackid, void *arg)
{
	ilb_stack_t *ilbs = (ilb_stack_t *)arg;

	ilb_rule_hash_fini(ilbs);
	taskq_destroy(ilbs->ilbs_rule_taskq);
	ilb_kstat_g_fini(stackid, ilbs);
	kmem_free(ilbs->ilbs_kstat, sizeof (ilb_g_kstat_t));
	kmem_free(ilbs, sizeof (ilb_stack_t));
}

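/*
 * Module initialization: register the per-netstack init, shutdown and
 * fini callbacks for ILB with the netstack framework.
 */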
void
ilb_ddi_g_init(void)
{
	netstack_register(NS_ILB, ilb_stack_init, ilb_stack_shutdown,
	    ilb_stack_fini);
}

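/*
 * Module teardown: unregister from the netstack framework, then destroy
 * the global conn and sticky kmem caches.
 */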
void
ilb_ddi_g_destroy(void)
{
	netstack_unregister(NS_ILB);
	ilb_conn_cache_fini();
	ilb_sticky_cache_fini();
}