/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/zone.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/atomic.h>
#define	_SUN_TPI_VERSION 2
#include <sys/tihdr.h>

#include <inet/common.h>
#include <inet/mi.h>
#include <inet/mib2.h>
#include <inet/snmpcom.h>

#include <netinet/ip6.h>
#include <netinet/icmp6.h>

#include <inet/ip.h>
#include <inet/ip_impl.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/ip_multi.h>
#include <inet/ip_if.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ipclassifier.h>
#include <inet/ip_listutils.h>
/*
 * Routines for handling destination cache entries.
 * There is always one DCEF_DEFAULT for each ip_stack_t created at init time.
 * That entry holds both the IP ident value and the dce generation number.
 *
 * Any time a DCE is changed significantly (different path MTU, but NOT
 * different ULP info!), the dce_generation number is increased.
 * Also, when a new DCE is created, the dce_generation number in the default
 * DCE is bumped. That allows the dce_t information to be cached efficiently
 * as long as the entity caching the dce_t also caches the dce_generation,
 * and compares the cached generation to detect any changes.
 * Furthermore, when a DCE is deleted, if there are any outstanding references
 * to the DCE it will be marked as condemned. The condemned mark is
 * a designated generation number which is never otherwise used, hence
 * the single comparison with the generation number captures that as well.
 *
 * An example of code which caches is as follows:
 *
 *	if (mystruct->my_dce_generation != mystruct->my_dce->dce_generation) {
 *		The DCE has changed
 *		mystruct->my_dce = dce_lookup_pkt(mp, ixa,
 *		    &mystruct->my_dce_generation);
 *		Not needed in practice, since we have the default DCE:
 *		if (DCE_IS_CONDEMNED(mystruct->my_dce))
 *			return failure;
 *	}
 *
 * Note that for IPv6 link-local addresses we record the ifindex since the
 * link-locals are not globally unique.
 */
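
/*
 * For callers that do not cache, the lifecycle is simpler. A minimal
 * sketch (illustrative, not taken from a real caller), assuming an IPv4
 * caller that only needs the entry briefly:
 *
 *	dce = dce_lookup_v4(dst, ipst, NULL);
 *	use dce->dce_pmtu, dce->dce_ident, ...
 *	dce_refrele(dce);
 *
 * dce_lookup_v4() always returns a held entry (falling back to the
 * default DCE on a miss), so the dce_refrele() is unconditional.
 */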

/*
 * Hash bucket structure for DCEs; dcb_lock protects the bucket and
 * dcb_cnt is the number of entries on the dcb_dce chain.
 */
typedef struct dcb_s {
	krwlock_t	dcb_lock;
	uint32_t	dcb_cnt;
	dce_t		*dcb_dce;
} dcb_t;

static void	dce_delete_locked(dcb_t *, dce_t *);
static void	dce_make_condemned(dce_t *);

static kmem_cache_t *dce_cache;

/* XOR-fold the four 16-bit words of a uint64_t into a single hash value */
#define	RANDOM_HASH(p) ((p) ^ ((p)>>16) ^ ((p)>>32) ^ ((p)>>48))

/*
 * Reclaim a fraction of dce's in the dcb.
 * For now we have a higher probability to delete DCEs without DCE_PMTU:
 * an entry is deleted when its hash modulo the fraction is zero, so on
 * average 1/fraction of the entries without DCEF_PMTU are reclaimed but
 * only 1/(4 * fraction) of those with a path MTU.
 */
static void
dcb_reclaim(dcb_t *dcb, ip_stack_t *ipst, uint_t fraction)
{
	uint_t	fraction_pmtu = fraction * 4;
	uint_t	hash;
	dce_t	*dce, *nextdce;

	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
		nextdce = dce->dce_next;
		/* Clear DCEF_PMTU if the pmtu is too old */
		mutex_enter(&dce->dce_lock);
		if ((dce->dce_flags & DCEF_PMTU) &&
		    TICK_TO_SEC(ddi_get_lbolt64()) - dce->dce_last_change_time >
		    ipst->ips_ip_pathmtu_interval) {
			dce->dce_flags &= ~DCEF_PMTU;
			mutex_exit(&dce->dce_lock);
			dce_increment_generation(dce);
		} else {
			mutex_exit(&dce->dce_lock);
		}
		hash = RANDOM_HASH((uint64_t)(uintptr_t)dce);
		if (dce->dce_flags & DCEF_PMTU) {
			if (hash % fraction_pmtu != 0)
				continue;
		} else {
			if (hash % fraction != 0)
				continue;
		}

		IP_STAT(ipst, ip_dce_reclaim_deleted);
		dce_delete_locked(dcb, dce);
		dce_refrele(dce);
	}
	rw_exit(&dcb->dcb_lock);
}

/*
 * kmem_cache callback to free up memory.
 */
static void
ip_dce_reclaim_stack(ip_stack_t *ipst)
{
	int	i;

	IP_STAT(ipst, ip_dce_reclaim_calls);
	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb_reclaim(&ipst->ips_dce_hash_v4[i], ipst,
		    ipst->ips_ip_dce_reclaim_fraction);

		dcb_reclaim(&ipst->ips_dce_hash_v6[i], ipst,
		    ipst->ips_ip_dce_reclaim_fraction);
	}

	/*
	 * Walk all CONNs that can have a reference on an ire, nce or dce.
	 * Get them to update any stale references to drop any refholds they
	 * have.
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
}

/*
 * Called by the memory allocator subsystem directly, when the system
 * is running low on memory.
 */
/* ARGSUSED */
void
ip_dce_reclaim(void *args)
{
	netstack_handle_t nh;
	netstack_t *ns;
	ip_stack_t *ipst;

	netstack_next_init(&nh);
	while ((ns = netstack_next(&nh)) != NULL) {
		/*
		 * netstack_next() can return a netstack_t with a NULL
		 * netstack_ip at boot time.
		 */
		if ((ipst = ns->netstack_ip) == NULL) {
			netstack_rele(ns);
			continue;
		}
		ip_dce_reclaim_stack(ipst);
		netstack_rele(ns);
	}
	netstack_next_fini(&nh);
}

void
dce_g_init(void)
{
	dce_cache = kmem_cache_create("dce_cache",
	    sizeof (dce_t), 0, NULL, NULL, ip_dce_reclaim, NULL, NULL, 0);
}

void
dce_g_destroy(void)
{
	kmem_cache_destroy(dce_cache);
}

/*
 * Allocate a default DCE and a hash table for per-IP address DCEs
 */
void
dce_stack_init(ip_stack_t *ipst)
{
	int	i;

	ipst->ips_dce_default = kmem_cache_alloc(dce_cache, KM_SLEEP);
	bzero(ipst->ips_dce_default, sizeof (dce_t));
	ipst->ips_dce_default->dce_flags = DCEF_DEFAULT;
	ipst->ips_dce_default->dce_generation = DCE_GENERATION_INITIAL;
	ipst->ips_dce_default->dce_last_change_time =
	    TICK_TO_SEC(ddi_get_lbolt64());
	ipst->ips_dce_default->dce_refcnt = 1;	/* Should never go away */
	ipst->ips_dce_default->dce_ipst = ipst;

	/* This must be a power of two since we are using IRE_ADDR_HASH macro */
	ipst->ips_dce_hashsize = 256;
	ipst->ips_dce_hash_v4 = kmem_zalloc(ipst->ips_dce_hashsize *
	    sizeof (dcb_t), KM_SLEEP);
	ipst->ips_dce_hash_v6 = kmem_zalloc(ipst->ips_dce_hashsize *
	    sizeof (dcb_t), KM_SLEEP);
	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		rw_init(&ipst->ips_dce_hash_v4[i].dcb_lock, NULL, RW_DEFAULT,
		    NULL);
		rw_init(&ipst->ips_dce_hash_v6[i].dcb_lock, NULL, RW_DEFAULT,
		    NULL);
	}
}

void
dce_stack_destroy(ip_stack_t *ipst)
{
	int	i;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		rw_destroy(&ipst->ips_dce_hash_v4[i].dcb_lock);
		rw_destroy(&ipst->ips_dce_hash_v6[i].dcb_lock);
	}
	kmem_free(ipst->ips_dce_hash_v4,
	    ipst->ips_dce_hashsize * sizeof (dcb_t));
	ipst->ips_dce_hash_v4 = NULL;
	kmem_free(ipst->ips_dce_hash_v6,
	    ipst->ips_dce_hashsize * sizeof (dcb_t));
	ipst->ips_dce_hash_v6 = NULL;
	ipst->ips_dce_hashsize = 0;

	ASSERT(ipst->ips_dce_default->dce_refcnt == 1);
	kmem_cache_free(dce_cache, ipst->ips_dce_default);
	ipst->ips_dce_default = NULL;
}

/* When any DCE is good enough */
dce_t *
dce_get_default(ip_stack_t *ipst)
{
	dce_t *dce;

	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}

/*
 * Generic for IPv4 and IPv6.
 *
 * Used by callers that need to cache, e.g., the datapath.
 * Returns the generation number in the last argument.
 */
dce_t *
dce_lookup_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
{
	if (ixa->ixa_flags & IXAF_IS_IPV4) {
		/*
		 * If we have a source route we need to look for the final
		 * destination in the source route option.
		 */
		ipaddr_t final_dst;
		ipha_t *ipha = (ipha_t *)mp->b_rptr;

		final_dst = ip_get_dst(ipha);
		return (dce_lookup_v4(final_dst, ixa->ixa_ipst, generationp));
	} else {
		uint_t ifindex;
		/*
		 * If we have a routing header we need to look for the final
		 * destination in the routing extension header.
		 */
		in6_addr_t final_dst;
		ip6_t *ip6h = (ip6_t *)mp->b_rptr;

		final_dst = ip_get_dst_v6(ip6h, mp, NULL);
		ifindex = 0;
		if (IN6_IS_ADDR_LINKSCOPE(&final_dst) && ixa->ixa_nce != NULL) {
			ifindex = ixa->ixa_nce->nce_common->ncec_ill->
			    ill_phyint->phyint_ifindex;
		}
		return (dce_lookup_v6(&final_dst, ifindex, ixa->ixa_ipst,
		    generationp));
	}
}

/*
 * Used by callers that need to cache, e.g., the datapath.
 * Returns the generation number in the last argument.
 */
dce_t *
dce_lookup_v4(ipaddr_t dst, ip_stack_t *ipst, uint_t *generationp)
{
	uint_t hash;
	dcb_t *dcb;
	dce_t *dce;

	/* Set *generationp before dropping the lock(s) that allow additions */
	if (generationp != NULL)
		*generationp = ipst->ips_dce_default->dce_generation;

	hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v4[hash];
	rw_enter(&dcb->dcb_lock, RW_READER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (dce->dce_v4addr == dst) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				if (generationp != NULL)
					*generationp = dce->dce_generation;
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}
	rw_exit(&dcb->dcb_lock);
	/* Not found */
	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}

/*
 * Used by callers that need to cache, e.g., the datapath.
 * Returns the generation number in the last argument.
 * ifindex should only be set for link-locals.
 */
dce_t *
dce_lookup_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst,
    uint_t *generationp)
{
	uint_t hash;
	dcb_t *dcb;
	dce_t *dce;

	/* Set *generationp before dropping the lock(s) that allow additions */
	if (generationp != NULL)
		*generationp = ipst->ips_dce_default->dce_generation;

	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v6[hash];
	rw_enter(&dcb->dcb_lock, RW_READER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
		    dce->dce_ifindex == ifindex) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				if (generationp != NULL)
					*generationp = dce->dce_generation;
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}
	rw_exit(&dcb->dcb_lock);
	/* Not found */
	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}

/*
 * Atomically looks for a non-default DCE, and if not found tries to create one.
 * If there is no memory it returns NULL.
 * When an entry is created we increase the generation number on
 * the default DCE so that conn_ip_output will detect there is a new DCE.
 */
dce_t *
dce_lookup_and_add_v4(ipaddr_t dst, ip_stack_t *ipst)
{
	uint_t hash;
	dcb_t *dcb;
	dce_t *dce;

	hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v4[hash];
	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (dce->dce_v4addr == dst) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}
	dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
	if (dce == NULL) {
		rw_exit(&dcb->dcb_lock);
		return (NULL);
	}
	bzero(dce, sizeof (dce_t));
	dce->dce_ipst = ipst;	/* No netstack_hold */
	dce->dce_v4addr = dst;
	dce->dce_generation = DCE_GENERATION_INITIAL;
	dce->dce_ipversion = IPV4_VERSION;
	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
	dce_refhold(dce);	/* For the hash list */

	/* Link into list */
	if (dcb->dcb_dce != NULL)
		dcb->dcb_dce->dce_ptpn = &dce->dce_next;
	dce->dce_next = dcb->dcb_dce;
	dce->dce_ptpn = &dcb->dcb_dce;
	dcb->dcb_dce = dce;
	dce->dce_bucket = dcb;
	/* Keep dcb_cnt in sync; dce_delete_locked() always decrements it */
	atomic_add_32(&dcb->dcb_cnt, 1);
	dce_refhold(dce);	/* For the caller */
	rw_exit(&dcb->dcb_lock);

	/* Initialize dce_ident to be different from that of the last packet */
	dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;

	dce_increment_generation(ipst->ips_dce_default);
	return (dce);
}
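
/*
 * A typical use of dce_lookup_and_add_v4() pairs it with dce_refrele();
 * dce_update_uinfo_v4() below follows this pattern. A minimal sketch
 * (illustrative, not from a real caller):
 *
 *	dce = dce_lookup_and_add_v4(dst, ipst);
 *	if (dce == NULL)
 *		return (ENOMEM);	allocation failed under pressure
 *	... read or update the entry, taking dce_lock as needed ...
 *	dce_refrele(dce);
 */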

/*
 * Atomically looks for a non-default DCE, and if not found tries to create one.
 * If there is no memory it returns NULL.
 * When an entry is created we increase the generation number on
 * the default DCE so that conn_ip_output will detect there is a new DCE.
 * ifindex should only be used with link-local addresses.
 */
dce_t *
dce_lookup_and_add_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst)
{
	uint_t hash;
	dcb_t *dcb;
	dce_t *dce;

	/* We should not create entries for link-locals w/o an ifindex */
	ASSERT(!(IN6_IS_ADDR_LINKSCOPE(dst)) || ifindex != 0);

	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v6[hash];
	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
		    dce->dce_ifindex == ifindex) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}

	dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
	if (dce == NULL) {
		rw_exit(&dcb->dcb_lock);
		return (NULL);
	}
	bzero(dce, sizeof (dce_t));
	dce->dce_ipst = ipst;	/* No netstack_hold */
	dce->dce_v6addr = *dst;
	dce->dce_ifindex = ifindex;
	dce->dce_generation = DCE_GENERATION_INITIAL;
	dce->dce_ipversion = IPV6_VERSION;
	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
	dce_refhold(dce);	/* For the hash list */

	/* Link into list */
	if (dcb->dcb_dce != NULL)
		dcb->dcb_dce->dce_ptpn = &dce->dce_next;
	dce->dce_next = dcb->dcb_dce;
	dce->dce_ptpn = &dcb->dcb_dce;
	dcb->dcb_dce = dce;
	dce->dce_bucket = dcb;
	atomic_add_32(&dcb->dcb_cnt, 1);
	dce_refhold(dce);	/* For the caller */
	rw_exit(&dcb->dcb_lock);

	/* Initialize dce_ident to be different from that of the last packet */
	dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;
	dce_increment_generation(ipst->ips_dce_default);
	return (dce);
}

/*
 * Set/update uinfo. Creates a per-destination dce if none exists.
 *
 * Note that we do not bump the generation number here.
 * New connections will find the new uinfo.
 *
 * The only use of this (tcp, sctp using iulp_t) is to set rtt+rtt_sd.
 */
static void
dce_setuinfo(dce_t *dce, iulp_t *uinfo)
{
	/*
	 * Update the round trip time estimate and/or the max frag size
	 * and/or the slow start threshold.
	 *
	 * We serialize multiple advises using dce_lock.
	 */
	mutex_enter(&dce->dce_lock);
	/* Guard against setting to zero */
	if (uinfo->iulp_rtt != 0) {
		/*
		 * If there are no old cached values, initialize them
		 * conservatively: set them to (1.5 * new value).
		 */
		if (dce->dce_uinfo.iulp_rtt != 0) {
			dce->dce_uinfo.iulp_rtt = (dce->dce_uinfo.iulp_rtt +
			    uinfo->iulp_rtt) >> 1;
		} else {
			dce->dce_uinfo.iulp_rtt = uinfo->iulp_rtt +
			    (uinfo->iulp_rtt >> 1);
		}
		if (dce->dce_uinfo.iulp_rtt_sd != 0) {
			dce->dce_uinfo.iulp_rtt_sd =
			    (dce->dce_uinfo.iulp_rtt_sd +
			    uinfo->iulp_rtt_sd) >> 1;
		} else {
			dce->dce_uinfo.iulp_rtt_sd = uinfo->iulp_rtt_sd +
			    (uinfo->iulp_rtt_sd >> 1);
		}
	}
	if (uinfo->iulp_mtu != 0) {
		if (dce->dce_flags & DCEF_PMTU) {
			dce->dce_pmtu = MIN(uinfo->iulp_mtu, dce->dce_pmtu);
		} else {
			dce->dce_pmtu = MIN(uinfo->iulp_mtu, IP_MAXPACKET);
			dce->dce_flags |= DCEF_PMTU;
		}
		dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
	}
	if (uinfo->iulp_ssthresh != 0) {
		if (dce->dce_uinfo.iulp_ssthresh != 0)
			dce->dce_uinfo.iulp_ssthresh =
			    (uinfo->iulp_ssthresh +
			    dce->dce_uinfo.iulp_ssthresh) >> 1;
		else
			dce->dce_uinfo.iulp_ssthresh = uinfo->iulp_ssthresh;
	}
	/* We have uinfo for sure */
	dce->dce_flags |= DCEF_UINFO;
	mutex_exit(&dce->dce_lock);
}

int
dce_update_uinfo_v4(ipaddr_t dst, iulp_t *uinfo, ip_stack_t *ipst)
{
	dce_t *dce;

	dce = dce_lookup_and_add_v4(dst, ipst);
	if (dce == NULL)
		return (ENOMEM);

	dce_setuinfo(dce, uinfo);
	dce_refrele(dce);
	return (0);
}

int
dce_update_uinfo_v6(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
    ip_stack_t *ipst)
{
	dce_t *dce;

	dce = dce_lookup_and_add_v6(dst, ifindex, ipst);
	if (dce == NULL)
		return (ENOMEM);

	dce_setuinfo(dce, uinfo);
	dce_refrele(dce);
	return (0);
}

/* Common routine for IPv4 and IPv6 */
int
dce_update_uinfo(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
    ip_stack_t *ipst)
{
	ipaddr_t dst4;

	if (IN6_IS_ADDR_V4MAPPED_ANY(dst)) {
		IN6_V4MAPPED_TO_IPADDR(dst, dst4);
		return (dce_update_uinfo_v4(dst4, uinfo, ipst));
	} else {
		return (dce_update_uinfo_v6(dst, ifindex, uinfo, ipst));
	}
}
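
/*
 * Sketch of how a ULP might record metrics through the routines above.
 * The caller and field values are illustrative assumptions, but the
 * iulp_t fields are the ones consumed by dce_setuinfo():
 *
 *	iulp_t uinfo;
 *
 *	bzero(&uinfo, sizeof (uinfo));
 *	uinfo.iulp_rtt = rtt;		measured round trip time
 *	uinfo.iulp_rtt_sd = rtt_sd;	and its deviation
 *	uinfo.iulp_ssthresh = ssthresh;
 *	(void) dce_update_uinfo(&dst, ifindex, &uinfo, ipst);
 *
 * Zero fields are ignored, so a caller only fills in what it measured.
 */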

static void
dce_make_condemned(dce_t *dce)
{
	ip_stack_t	*ipst = dce->dce_ipst;

	mutex_enter(&dce->dce_lock);
	ASSERT(!DCE_IS_CONDEMNED(dce));
	dce->dce_generation = DCE_GENERATION_CONDEMNED;
	mutex_exit(&dce->dce_lock);
	/* Count how many condemned dces for kmem_cache callback */
	atomic_add_32(&ipst->ips_num_dce_condemned, 1);
}

/*
 * Increment the generation avoiding the special condemned value
 */
void
dce_increment_generation(dce_t *dce)
{
	uint_t generation;

	mutex_enter(&dce->dce_lock);
	if (!DCE_IS_CONDEMNED(dce)) {
		generation = dce->dce_generation + 1;
		if (generation == DCE_GENERATION_CONDEMNED)
			generation = DCE_GENERATION_INITIAL;
		ASSERT(generation != DCE_GENERATION_VERIFY);
		dce->dce_generation = generation;
	}
	mutex_exit(&dce->dce_lock);
}

/*
 * Increment the generation number on all dces in the hash table, as well
 * as on the default DCE. Used when ill_mtu or ill_mc_mtu changes.
 */
void
dce_increment_all_generations(boolean_t isv6, ip_stack_t *ipst)
{
	int		i;
	dcb_t		*dcb;
	dce_t		*dce;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		if (isv6)
			dcb = &ipst->ips_dce_hash_v6[i];
		else
			dcb = &ipst->ips_dce_hash_v4[i];
		rw_enter(&dcb->dcb_lock, RW_WRITER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			if (DCE_IS_CONDEMNED(dce))
				continue;
			dce_increment_generation(dce);
		}
		rw_exit(&dcb->dcb_lock);
	}
	dce_increment_generation(ipst->ips_dce_default);
}

/*
 * Caller needs to do a dce_refrele since we can't do the
 * dce_refrele under dcb_lock.
 */
static void
dce_delete_locked(dcb_t *dcb, dce_t *dce)
{
	dce->dce_bucket = NULL;
	*dce->dce_ptpn = dce->dce_next;
	if (dce->dce_next != NULL)
		dce->dce_next->dce_ptpn = dce->dce_ptpn;
	dce->dce_ptpn = NULL;
	dce->dce_next = NULL;
	atomic_add_32(&dcb->dcb_cnt, -1);
	dce_make_condemned(dce);
}

static void
dce_inactive(dce_t *dce)
{
	ip_stack_t	*ipst = dce->dce_ipst;

	ASSERT(!(dce->dce_flags & DCEF_DEFAULT));
	ASSERT(dce->dce_ptpn == NULL);
	ASSERT(dce->dce_bucket == NULL);

	/* Count how many condemned dces for kmem_cache callback */
	if (DCE_IS_CONDEMNED(dce))
		atomic_add_32(&ipst->ips_num_dce_condemned, -1);

	kmem_cache_free(dce_cache, dce);
}

void
dce_refrele(dce_t *dce)
{
	ASSERT(dce->dce_refcnt != 0);
	if (atomic_add_32_nv(&dce->dce_refcnt, -1) == 0)
		dce_inactive(dce);
}

void
dce_refhold(dce_t *dce)
{
	atomic_add_32(&dce->dce_refcnt, 1);
	ASSERT(dce->dce_refcnt != 0);
}

/* No tracing support yet hence the same as the above functions */
void
dce_refrele_notr(dce_t *dce)
{
	ASSERT(dce->dce_refcnt != 0);
	if (atomic_add_32_nv(&dce->dce_refcnt, -1) == 0)
		dce_inactive(dce);
}

void
dce_refhold_notr(dce_t *dce)
{
	atomic_add_32(&dce->dce_refcnt, 1);
	ASSERT(dce->dce_refcnt != 0);
}

/* Report both the IPv4 and IPv6 DCEs. */
mblk_t *
ip_snmp_get_mib2_ip_dce(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;
	dest_cache_entry_t	dest_cache;
	mblk_t			*mp_tail = NULL;
	dce_t			*dce;
	dcb_t			*dcb;
	int			i;
	uint64_t		current_time;

	current_time = TICK_TO_SEC(ddi_get_lbolt64());

	/*
	 * make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);

	/* First we do IPv4 entries */
	optp = (struct opthdr *)&mpctl->b_rptr[
	    sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP;
	optp->name = EXPER_IP_DCE;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v4[i];
		rw_enter(&dcb->dcb_lock, RW_READER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			dest_cache.DestIpv4Address = dce->dce_v4addr;
			dest_cache.DestFlags = dce->dce_flags;
			if (dce->dce_flags & DCEF_PMTU)
				dest_cache.DestPmtu = dce->dce_pmtu;
			else
				dest_cache.DestPmtu = 0;
			dest_cache.DestIdent = dce->dce_ident;
			dest_cache.DestIfindex = 0;
			dest_cache.DestAge = current_time -
			    dce->dce_last_change_time;
			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
				    "failed to allocate %u bytes\n",
				    (uint_t)sizeof (dest_cache)));
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);

	if (mp2ctl == NULL) {
		/* Copymsg failed above */
		return (NULL);
	}

	/* Now for IPv6 */
	mpctl = mp2ctl;
	mp_tail = NULL;
	mp2ctl = copymsg(mpctl);
	optp = (struct opthdr *)&mpctl->b_rptr[
	    sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP6;
	optp->name = EXPER_IP_DCE;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v6[i];
		rw_enter(&dcb->dcb_lock, RW_READER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			dest_cache.DestIpv6Address = dce->dce_v6addr;
			dest_cache.DestFlags = dce->dce_flags;
			if (dce->dce_flags & DCEF_PMTU)
				dest_cache.DestPmtu = dce->dce_pmtu;
			else
				dest_cache.DestPmtu = 0;
			dest_cache.DestIdent = dce->dce_ident;
			if (IN6_IS_ADDR_LINKSCOPE(&dce->dce_v6addr))
				dest_cache.DestIfindex = dce->dce_ifindex;
			else
				dest_cache.DestIfindex = 0;
			dest_cache.DestAge = current_time -
			    dce->dce_last_change_time;
			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
				    "failed to allocate %u bytes\n",
				    (uint_t)sizeof (dest_cache)));
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);

	return (mp2ctl);
}

/*
 * Remove IPv6 DCEs which refer to an ifindex that is going away.
 * This is not required for correctness, but it avoids netstat -d
 * showing stale entries that will never be used.
 */
void
dce_cleanup(uint_t ifindex, ip_stack_t *ipst)
{
	uint_t	i;
	dcb_t	*dcb;
	dce_t	*dce, *nextdce;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v6[i];
		rw_enter(&dcb->dcb_lock, RW_WRITER);

		for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
			nextdce = dce->dce_next;
			if (dce->dce_ifindex == ifindex) {
				dce_delete_locked(dcb, dce);
				dce_refrele(dce);
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
}