/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
242535Ssangeeta */ 252535Ssangeeta 262535Ssangeeta /* 272535Ssangeeta * This file contains consumer routines of the IPv4 forwarding engine 282535Ssangeeta */ 292535Ssangeeta 302535Ssangeeta #include <sys/types.h> 312535Ssangeeta #include <sys/stream.h> 322535Ssangeeta #include <sys/stropts.h> 332535Ssangeeta #include <sys/strlog.h> 342535Ssangeeta #include <sys/dlpi.h> 352535Ssangeeta #include <sys/ddi.h> 362535Ssangeeta #include <sys/cmn_err.h> 372535Ssangeeta #include <sys/policy.h> 382535Ssangeeta 392535Ssangeeta #include <sys/systm.h> 402535Ssangeeta #include <sys/strsun.h> 412535Ssangeeta #include <sys/kmem.h> 422535Ssangeeta #include <sys/param.h> 432535Ssangeeta #include <sys/socket.h> 444482Sdr146992 #include <sys/strsubr.h> 452535Ssangeeta #include <net/if.h> 462535Ssangeeta #include <net/route.h> 472535Ssangeeta #include <netinet/in.h> 482535Ssangeeta #include <net/if_dl.h> 492535Ssangeeta #include <netinet/ip6.h> 502535Ssangeeta #include <netinet/icmp6.h> 512535Ssangeeta 5211042SErik.Nordmark@Sun.COM #include <inet/ipsec_impl.h> 532535Ssangeeta #include <inet/common.h> 542535Ssangeeta #include <inet/mi.h> 552535Ssangeeta #include <inet/mib2.h> 562535Ssangeeta #include <inet/ip.h> 574482Sdr146992 #include <inet/ip_impl.h> 582535Ssangeeta #include <inet/ip6.h> 592535Ssangeeta #include <inet/ip_ndp.h> 602535Ssangeeta #include <inet/arp.h> 612535Ssangeeta #include <inet/ip_if.h> 622535Ssangeeta #include <inet/ip_ire.h> 632535Ssangeeta #include <inet/ip_ftable.h> 642535Ssangeeta #include <inet/ip_rts.h> 652535Ssangeeta #include <inet/nd.h> 662535Ssangeeta 672535Ssangeeta #include <net/pfkeyv2.h> 682535Ssangeeta #include <inet/sadb.h> 692535Ssangeeta #include <inet/tcp.h> 702535Ssangeeta #include <inet/ipclassifier.h> 712535Ssangeeta #include <sys/zone.h> 722535Ssangeeta #include <net/radix.h> 732535Ssangeeta #include <sys/tsol/label.h> 742535Ssangeeta #include <sys/tsol/tnet.h> 752535Ssangeeta 762535Ssangeeta #define IS_DEFAULT_ROUTE(ire) \ 772535Ssangeeta 
(((ire)->ire_type & IRE_DEFAULT) || \ 782535Ssangeeta (((ire)->ire_type & IRE_INTERFACE) && ((ire)->ire_addr == 0))) 792535Ssangeeta 803448Sdh155122 static ire_t *route_to_dst(const struct sockaddr *, zoneid_t, ip_stack_t *); 8111042SErik.Nordmark@Sun.COM static void ire_del_host_redir(ire_t *, char *); 8211042SErik.Nordmark@Sun.COM static boolean_t ire_find_best_route(struct radix_node *, void *); 832535Ssangeeta 842535Ssangeeta /* 852535Ssangeeta * Lookup a route in forwarding table. A specific lookup is indicated by 862535Ssangeeta * passing the required parameters and indicating the match required in the 872535Ssangeeta * flag field. 882535Ssangeeta * 892535Ssangeeta * Supports IP_BOUND_IF by following the ipif/ill when recursing. 902535Ssangeeta */ 912535Ssangeeta ire_t * 9211042SErik.Nordmark@Sun.COM ire_ftable_lookup_v4(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, 9311042SErik.Nordmark@Sun.COM int type, const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, 9411042SErik.Nordmark@Sun.COM int flags, uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp) 952535Ssangeeta { 9611042SErik.Nordmark@Sun.COM ire_t *ire; 972535Ssangeeta struct rt_sockaddr rdst, rmask; 982535Ssangeeta struct rt_entry *rt; 992535Ssangeeta ire_ftable_args_t margs; 1002535Ssangeeta 10111042SErik.Nordmark@Sun.COM ASSERT(ill == NULL || !ill->ill_isv6); 1022535Ssangeeta 1032535Ssangeeta /* 10411042SErik.Nordmark@Sun.COM * ire_match_args() will dereference ill if MATCH_IRE_ILL 10511042SErik.Nordmark@Sun.COM * is set. 
1062535Ssangeeta */ 10711042SErik.Nordmark@Sun.COM if ((flags & MATCH_IRE_ILL) && (ill == NULL)) 1082535Ssangeeta return (NULL); 1092535Ssangeeta 11011131SErik.Nordmark@Sun.COM bzero(&rdst, sizeof (rdst)); 1112535Ssangeeta rdst.rt_sin_len = sizeof (rdst); 1122535Ssangeeta rdst.rt_sin_family = AF_INET; 1132535Ssangeeta rdst.rt_sin_addr.s_addr = addr; 1142535Ssangeeta 11511131SErik.Nordmark@Sun.COM bzero(&rmask, sizeof (rmask)); 1162535Ssangeeta rmask.rt_sin_len = sizeof (rmask); 1172535Ssangeeta rmask.rt_sin_family = AF_INET; 1182535Ssangeeta rmask.rt_sin_addr.s_addr = mask; 1192535Ssangeeta 12011131SErik.Nordmark@Sun.COM bzero(&margs, sizeof (margs)); 1212535Ssangeeta margs.ift_addr = addr; 1222535Ssangeeta margs.ift_mask = mask; 1232535Ssangeeta margs.ift_gateway = gateway; 1242535Ssangeeta margs.ift_type = type; 12511042SErik.Nordmark@Sun.COM margs.ift_ill = ill; 1262535Ssangeeta margs.ift_zoneid = zoneid; 1272535Ssangeeta margs.ift_tsl = tsl; 1282535Ssangeeta margs.ift_flags = flags; 1292535Ssangeeta 1302535Ssangeeta /* 1312535Ssangeeta * The flags argument passed to ire_ftable_lookup may cause the 1322535Ssangeeta * search to return, not the longest matching prefix, but the 1332535Ssangeeta * "best matching prefix", i.e., the longest prefix that also 1342535Ssangeeta * satisfies constraints imposed via the permutation of flags 1352535Ssangeeta * passed in. To achieve this, we invoke ire_match_args() on 1362535Ssangeeta * each matching leaf in the radix tree. ire_match_args is 1372535Ssangeeta * invoked by the callback function ire_find_best_route() 1382535Ssangeeta * We hold the global tree lock in read mode when calling 13911042SErik.Nordmark@Sun.COM * rn_match_args. Before dropping the global tree lock, ensure 1402535Ssangeeta * that the radix node can't be deleted by incrementing ire_refcnt. 
1412535Ssangeeta */ 1423448Sdh155122 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 1433448Sdh155122 rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst, 1443448Sdh155122 ipst->ips_ip_ftable, ire_find_best_route, &margs); 1452535Ssangeeta ire = margs.ift_best_ire; 1462535Ssangeeta if (rt == NULL) { 14711042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 1482535Ssangeeta return (NULL); 1492535Ssangeeta } 15011042SErik.Nordmark@Sun.COM ASSERT(ire != NULL); 1512535Ssangeeta 1522535Ssangeeta DTRACE_PROBE2(ire__found, ire_ftable_args_t *, &margs, ire_t *, ire); 1532535Ssangeeta 1542535Ssangeeta /* 1552535Ssangeeta * round-robin only if we have more than one route in the bucket. 15611042SErik.Nordmark@Sun.COM * ips_ip_ecmp_behavior controls when we do ECMP 15711042SErik.Nordmark@Sun.COM * 2: always 15811042SErik.Nordmark@Sun.COM * 1: for IRE_DEFAULT and /0 IRE_INTERFACE 15911042SErik.Nordmark@Sun.COM * 0: never 1602535Ssangeeta */ 16111042SErik.Nordmark@Sun.COM if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) { 16211042SErik.Nordmark@Sun.COM if (ipst->ips_ip_ecmp_behavior == 2 || 16311042SErik.Nordmark@Sun.COM (ipst->ips_ip_ecmp_behavior == 1 && 16411042SErik.Nordmark@Sun.COM IS_DEFAULT_ROUTE(ire))) { 16511042SErik.Nordmark@Sun.COM ire_t *next_ire; 1662535Ssangeeta 16711042SErik.Nordmark@Sun.COM margs.ift_best_ire = NULL; 16811042SErik.Nordmark@Sun.COM next_ire = ire_round_robin(ire->ire_bucket, &margs, 16911042SErik.Nordmark@Sun.COM xmit_hint, ire, ipst); 17011042SErik.Nordmark@Sun.COM if (next_ire == NULL) { 17111042SErik.Nordmark@Sun.COM /* keep ire if next_ire is null */ 17211042SErik.Nordmark@Sun.COM goto done; 17311042SErik.Nordmark@Sun.COM } 17411042SErik.Nordmark@Sun.COM ire_refrele(ire); 1752535Ssangeeta ire = next_ire; 1762535Ssangeeta } 1772535Ssangeeta } 1782535Ssangeeta 17911042SErik.Nordmark@Sun.COM done: 18011042SErik.Nordmark@Sun.COM /* Return generation before dropping lock */ 
18111042SErik.Nordmark@Sun.COM if (generationp != NULL) 18211042SErik.Nordmark@Sun.COM *generationp = ire->ire_generation; 1832535Ssangeeta 18411042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 1858485SPeter.Memishian@Sun.COM 18611042SErik.Nordmark@Sun.COM /* 18711042SErik.Nordmark@Sun.COM * For shared-IP zones we need additional checks to what was 18811042SErik.Nordmark@Sun.COM * done in ire_match_args to make sure IRE_LOCALs are handled. 18911042SErik.Nordmark@Sun.COM * 19011042SErik.Nordmark@Sun.COM * When ip_restrict_interzone_loopback is set, then 19111042SErik.Nordmark@Sun.COM * we ensure that IRE_LOCAL are only used for loopback 19211042SErik.Nordmark@Sun.COM * between zones when the logical "Ethernet" would 19311042SErik.Nordmark@Sun.COM * have looped them back. That is, if in the absense of 19411042SErik.Nordmark@Sun.COM * the IRE_LOCAL we would have sent to packet out the 19511042SErik.Nordmark@Sun.COM * same ill. 19611042SErik.Nordmark@Sun.COM */ 19711042SErik.Nordmark@Sun.COM if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES && 19811042SErik.Nordmark@Sun.COM ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES && 19911042SErik.Nordmark@Sun.COM ipst->ips_ip_restrict_interzone_loopback) { 20011042SErik.Nordmark@Sun.COM ire = ire_alt_local(ire, zoneid, tsl, ill, generationp); 20111042SErik.Nordmark@Sun.COM ASSERT(ire != NULL); 2022535Ssangeeta } 2032535Ssangeeta return (ire); 2042535Ssangeeta } 2052535Ssangeeta 2068275SEric Cheng /* 2078275SEric Cheng * This function is called by 20811042SErik.Nordmark@Sun.COM * ip_input/ire_route_recursive when doing a route lookup on only the 20911042SErik.Nordmark@Sun.COM * destination address. 
21011042SErik.Nordmark@Sun.COM * 2118275SEric Cheng * The optimizations of this function over ire_ftable_lookup are: 2128275SEric Cheng * o removing unnecessary flag matching 2138275SEric Cheng * o doing longest prefix match instead of overloading it further 2148275SEric Cheng * with the unnecessary "best_prefix_match" 21511042SErik.Nordmark@Sun.COM * 21611042SErik.Nordmark@Sun.COM * If no route is found we return IRE_NOROUTE. 2178275SEric Cheng */ 21811042SErik.Nordmark@Sun.COM ire_t * 21911042SErik.Nordmark@Sun.COM ire_ftable_lookup_simple_v4(ipaddr_t addr, uint32_t xmit_hint, ip_stack_t *ipst, 22011042SErik.Nordmark@Sun.COM uint_t *generationp) 2218275SEric Cheng { 22211042SErik.Nordmark@Sun.COM ire_t *ire; 2238275SEric Cheng struct rt_sockaddr rdst; 2248275SEric Cheng struct rt_entry *rt; 22511042SErik.Nordmark@Sun.COM irb_t *irb; 2268275SEric Cheng 2278275SEric Cheng rdst.rt_sin_len = sizeof (rdst); 2288275SEric Cheng rdst.rt_sin_family = AF_INET; 2298275SEric Cheng rdst.rt_sin_addr.s_addr = addr; 2308275SEric Cheng 2318275SEric Cheng /* 2328275SEric Cheng * This is basically inlining a simpler version of ire_match_args 2338275SEric Cheng */ 2348275SEric Cheng RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 2358275SEric Cheng 2368275SEric Cheng rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst, 2378275SEric Cheng ipst->ips_ip_ftable, NULL, NULL); 2388275SEric Cheng 23911042SErik.Nordmark@Sun.COM if (rt == NULL) 24011042SErik.Nordmark@Sun.COM goto bad; 24111042SErik.Nordmark@Sun.COM 24211042SErik.Nordmark@Sun.COM irb = &rt->rt_irb; 24311042SErik.Nordmark@Sun.COM if (irb->irb_ire_cnt == 0) 24411042SErik.Nordmark@Sun.COM goto bad; 24511042SErik.Nordmark@Sun.COM 24611042SErik.Nordmark@Sun.COM rw_enter(&irb->irb_lock, RW_READER); 24711042SErik.Nordmark@Sun.COM ire = irb->irb_ire; 24811042SErik.Nordmark@Sun.COM if (ire == NULL) { 24911042SErik.Nordmark@Sun.COM rw_exit(&irb->irb_lock); 25011042SErik.Nordmark@Sun.COM goto bad; 2518275SEric Cheng } 
25211042SErik.Nordmark@Sun.COM while (IRE_IS_CONDEMNED(ire)) { 25311042SErik.Nordmark@Sun.COM ire = ire->ire_next; 25411042SErik.Nordmark@Sun.COM if (ire == NULL) { 25511042SErik.Nordmark@Sun.COM rw_exit(&irb->irb_lock); 25611042SErik.Nordmark@Sun.COM goto bad; 25711042SErik.Nordmark@Sun.COM } 2588275SEric Cheng } 2598275SEric Cheng 2608275SEric Cheng /* we have a ire that matches */ 26111042SErik.Nordmark@Sun.COM ire_refhold(ire); 26211042SErik.Nordmark@Sun.COM rw_exit(&irb->irb_lock); 2638275SEric Cheng 2648275SEric Cheng /* 26511042SErik.Nordmark@Sun.COM * round-robin only if we have more than one route in the bucket. 26611042SErik.Nordmark@Sun.COM * ips_ip_ecmp_behavior controls when we do ECMP 26711042SErik.Nordmark@Sun.COM * 2: always 26811042SErik.Nordmark@Sun.COM * 1: for IRE_DEFAULT and /0 IRE_INTERFACE 26911042SErik.Nordmark@Sun.COM * 0: never 2708275SEric Cheng * 27111042SErik.Nordmark@Sun.COM * Note: if we found an IRE_IF_CLONE we won't look at the bucket with 27211042SErik.Nordmark@Sun.COM * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match 27311042SErik.Nordmark@Sun.COM * and the IRE_INTERFACESs are likely to be shorter matches. 
2748275SEric Cheng */ 27511042SErik.Nordmark@Sun.COM if (ire->ire_bucket->irb_ire_cnt > 1) { 27611042SErik.Nordmark@Sun.COM if (ipst->ips_ip_ecmp_behavior == 2 || 27711042SErik.Nordmark@Sun.COM (ipst->ips_ip_ecmp_behavior == 1 && 27811042SErik.Nordmark@Sun.COM IS_DEFAULT_ROUTE(ire))) { 27911042SErik.Nordmark@Sun.COM ire_t *next_ire; 28011042SErik.Nordmark@Sun.COM ire_ftable_args_t margs; 2818275SEric Cheng 28211131SErik.Nordmark@Sun.COM bzero(&margs, sizeof (margs)); 28311042SErik.Nordmark@Sun.COM margs.ift_addr = addr; 28411042SErik.Nordmark@Sun.COM margs.ift_zoneid = ALL_ZONES; 28511042SErik.Nordmark@Sun.COM 28611042SErik.Nordmark@Sun.COM next_ire = ire_round_robin(ire->ire_bucket, &margs, 28711042SErik.Nordmark@Sun.COM xmit_hint, ire, ipst); 28811042SErik.Nordmark@Sun.COM if (next_ire == NULL) { 28911042SErik.Nordmark@Sun.COM /* keep ire if next_ire is null */ 29011042SErik.Nordmark@Sun.COM if (generationp != NULL) 29111042SErik.Nordmark@Sun.COM *generationp = ire->ire_generation; 29211042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 29311042SErik.Nordmark@Sun.COM return (ire); 29411042SErik.Nordmark@Sun.COM } 29511042SErik.Nordmark@Sun.COM ire_refrele(ire); 29611042SErik.Nordmark@Sun.COM ire = next_ire; 2978275SEric Cheng } 2988275SEric Cheng } 29911042SErik.Nordmark@Sun.COM /* Return generation before dropping lock */ 30011042SErik.Nordmark@Sun.COM if (generationp != NULL) 30111042SErik.Nordmark@Sun.COM *generationp = ire->ire_generation; 30211042SErik.Nordmark@Sun.COM 30311042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 3048275SEric Cheng 30511042SErik.Nordmark@Sun.COM /* 30611042SErik.Nordmark@Sun.COM * Since we only did ALL_ZONES matches there is no special handling 30711042SErik.Nordmark@Sun.COM * of IRE_LOCALs needed here. ire_ftable_lookup_v4 has to handle that. 
30811042SErik.Nordmark@Sun.COM */ 3098275SEric Cheng return (ire); 31011042SErik.Nordmark@Sun.COM 31111042SErik.Nordmark@Sun.COM bad: 31211042SErik.Nordmark@Sun.COM if (generationp != NULL) 31311042SErik.Nordmark@Sun.COM *generationp = IRE_GENERATION_VERIFY; 31411042SErik.Nordmark@Sun.COM 31511042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 31611042SErik.Nordmark@Sun.COM return (ire_reject(ipst, B_FALSE)); 3178275SEric Cheng } 3182535Ssangeeta 3192535Ssangeeta /* 32011042SErik.Nordmark@Sun.COM * Find the ill matching a multicast group. 3212535Ssangeeta * Allows different routes for multicast addresses 3222535Ssangeeta * in the unicast routing table (akin to 224.0.0.0 but could be more specific) 3232535Ssangeeta * which point at different interfaces. This is used when IP_MULTICAST_IF 3242535Ssangeeta * isn't specified (when sending) and when IP_ADD_MEMBERSHIP doesn't 3252535Ssangeeta * specify the interface to join on. 3262535Ssangeeta * 32711042SErik.Nordmark@Sun.COM * Supports link-local addresses by using ire_route_recursive which follows 32811042SErik.Nordmark@Sun.COM * the ill when recursing. 32911042SErik.Nordmark@Sun.COM * 33011042SErik.Nordmark@Sun.COM * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group 33111042SErik.Nordmark@Sun.COM * and the MULTIRT property can be different for different groups, we 33211042SErik.Nordmark@Sun.COM * extract RTF_MULTIRT from the special unicast route added for a group 33311042SErik.Nordmark@Sun.COM * with CGTP and pass that back in the multirtp argument. 33411042SErik.Nordmark@Sun.COM * This is used in ip_set_destination etc to set ixa_postfragfn for multicast. 33511042SErik.Nordmark@Sun.COM * We have a setsrcp argument for the same reason. 
3362535Ssangeeta */ 33711042SErik.Nordmark@Sun.COM ill_t * 33811042SErik.Nordmark@Sun.COM ire_lookup_multi_ill_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst, 33911042SErik.Nordmark@Sun.COM boolean_t *multirtp, ipaddr_t *setsrcp) 3402535Ssangeeta { 3412535Ssangeeta ire_t *ire; 34211042SErik.Nordmark@Sun.COM ill_t *ill; 3432535Ssangeeta 34411042SErik.Nordmark@Sun.COM ire = ire_route_recursive_v4(group, 0, NULL, zoneid, NULL, 345*11457SErik.Nordmark@Sun.COM MATCH_IRE_DSTONLY, IRR_NONE, 0, ipst, setsrcp, NULL, NULL); 34611042SErik.Nordmark@Sun.COM ASSERT(ire != NULL); 34711042SErik.Nordmark@Sun.COM if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 3482535Ssangeeta ire_refrele(ire); 3492535Ssangeeta return (NULL); 3502535Ssangeeta } 35111042SErik.Nordmark@Sun.COM 35211042SErik.Nordmark@Sun.COM if (multirtp != NULL) 35311042SErik.Nordmark@Sun.COM *multirtp = (ire->ire_flags & RTF_MULTIRT) != 0; 35411042SErik.Nordmark@Sun.COM 35511042SErik.Nordmark@Sun.COM ill = ire_nexthop_ill(ire); 35611042SErik.Nordmark@Sun.COM ire_refrele(ire); 35711042SErik.Nordmark@Sun.COM return (ill); 3582535Ssangeeta } 3592535Ssangeeta 3602535Ssangeeta /* 3612535Ssangeeta * Delete the passed in ire if the gateway addr matches 3622535Ssangeeta */ 3632535Ssangeeta void 3642535Ssangeeta ire_del_host_redir(ire_t *ire, char *gateway) 3652535Ssangeeta { 3663004Sdd193516 if ((ire->ire_flags & RTF_DYNAMIC) && 3672535Ssangeeta (ire->ire_gateway_addr == *(ipaddr_t *)gateway)) 3682535Ssangeeta ire_delete(ire); 3692535Ssangeeta } 3702535Ssangeeta 3712535Ssangeeta /* 37211042SErik.Nordmark@Sun.COM * Search for all IRE_HOST RTF_DYNAMIC (aka redirect) routes that are 3732535Ssangeeta * pointing at the specified gateway and 3742535Ssangeeta * delete them. This routine is called only 3752535Ssangeeta * when a default gateway is going away. 
3762535Ssangeeta */ 3772535Ssangeeta void 3783448Sdh155122 ire_delete_host_redirects(ipaddr_t gateway, ip_stack_t *ipst) 3792535Ssangeeta { 3802535Ssangeeta struct rtfuncarg rtfarg; 3812535Ssangeeta 38211131SErik.Nordmark@Sun.COM bzero(&rtfarg, sizeof (rtfarg)); 3832535Ssangeeta rtfarg.rt_func = ire_del_host_redir; 3842535Ssangeeta rtfarg.rt_arg = (void *)&gateway; 38511131SErik.Nordmark@Sun.COM rtfarg.rt_zoneid = ALL_ZONES; 38611131SErik.Nordmark@Sun.COM rtfarg.rt_ipst = ipst; 3873448Sdh155122 (void) ipst->ips_ip_ftable->rnh_walktree_mt(ipst->ips_ip_ftable, 3883448Sdh155122 rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn); 3892535Ssangeeta } 3902535Ssangeeta 3912535Ssangeeta /* 3923448Sdh155122 * Obtain the rt_entry and rt_irb for the route to be added to 3933448Sdh155122 * the ips_ip_ftable. 3942535Ssangeeta * First attempt to add a node to the radix tree via rn_addroute. If the 3952535Ssangeeta * route already exists, return the bucket for the existing route. 3962535Ssangeeta * 3972535Ssangeeta * Locking notes: Need to hold the global radix tree lock in write mode to 3982535Ssangeeta * add a radix node. To prevent the node from being deleted, ire_get_bucket() 3992535Ssangeeta * returns with a ref'ed irb_t. The ire itself is added in ire_add_v4() 4002535Ssangeeta * while holding the irb_lock, but not the radix tree lock. 
4012535Ssangeeta */ 4022535Ssangeeta irb_t * 4032535Ssangeeta ire_get_bucket(ire_t *ire) 4042535Ssangeeta { 4052535Ssangeeta struct radix_node *rn; 4062535Ssangeeta struct rt_entry *rt; 4072535Ssangeeta struct rt_sockaddr rmask, rdst; 4082535Ssangeeta irb_t *irb = NULL; 4093448Sdh155122 ip_stack_t *ipst = ire->ire_ipst; 4102535Ssangeeta 4113448Sdh155122 ASSERT(ipst->ips_ip_ftable != NULL); 4122535Ssangeeta 4132535Ssangeeta /* first try to see if route exists (based on rtalloc1) */ 41411131SErik.Nordmark@Sun.COM bzero(&rdst, sizeof (rdst)); 4152535Ssangeeta rdst.rt_sin_len = sizeof (rdst); 4162535Ssangeeta rdst.rt_sin_family = AF_INET; 4172535Ssangeeta rdst.rt_sin_addr.s_addr = ire->ire_addr; 4182535Ssangeeta 41911131SErik.Nordmark@Sun.COM bzero(&rmask, sizeof (rmask)); 4202535Ssangeeta rmask.rt_sin_len = sizeof (rmask); 4212535Ssangeeta rmask.rt_sin_family = AF_INET; 4222535Ssangeeta rmask.rt_sin_addr.s_addr = ire->ire_mask; 4232535Ssangeeta 4242535Ssangeeta /* 4252535Ssangeeta * add the route. 
based on BSD's rtrequest1(RTM_ADD) 4262535Ssangeeta */ 4272535Ssangeeta R_Malloc(rt, rt_entry_cache, sizeof (*rt)); 4285090Ssangeeta /* kmem_alloc failed */ 4295090Ssangeeta if (rt == NULL) 4305090Ssangeeta return (NULL); 4315090Ssangeeta 43211131SErik.Nordmark@Sun.COM bzero(rt, sizeof (*rt)); 4332535Ssangeeta rt->rt_nodes->rn_key = (char *)&rt->rt_dst; 4342535Ssangeeta rt->rt_dst = rdst; 4352535Ssangeeta irb = &rt->rt_irb; 43611042SErik.Nordmark@Sun.COM irb->irb_marks |= IRB_MARK_DYNAMIC; /* dynamically allocated/freed */ 4373448Sdh155122 irb->irb_ipst = ipst; 4382535Ssangeeta rw_init(&irb->irb_lock, NULL, RW_DEFAULT, NULL); 4393448Sdh155122 RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable); 4403448Sdh155122 rn = ipst->ips_ip_ftable->rnh_addaddr(&rt->rt_dst, &rmask, 4413448Sdh155122 ipst->ips_ip_ftable, (struct radix_node *)rt); 4422535Ssangeeta if (rn == NULL) { 4433448Sdh155122 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 4442535Ssangeeta Free(rt, rt_entry_cache); 4452535Ssangeeta rt = NULL; 4462535Ssangeeta irb = NULL; 4473448Sdh155122 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 4483448Sdh155122 rn = ipst->ips_ip_ftable->rnh_lookup(&rdst, &rmask, 4493448Sdh155122 ipst->ips_ip_ftable); 4503448Sdh155122 if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { 4512535Ssangeeta /* found a non-root match */ 4522535Ssangeeta rt = (struct rt_entry *)rn; 4532535Ssangeeta } 4542535Ssangeeta } 4552535Ssangeeta if (rt != NULL) { 4562535Ssangeeta irb = &rt->rt_irb; 45711042SErik.Nordmark@Sun.COM irb_refhold(irb); 4582535Ssangeeta } 4593448Sdh155122 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 4602535Ssangeeta return (irb); 4612535Ssangeeta } 4622535Ssangeeta 4632535Ssangeeta /* 4642535Ssangeeta * This function is used when the caller wants to know the outbound 4652535Ssangeeta * interface for a packet given only the address. 
4662535Ssangeeta * If this is a offlink IP address and there are multiple 4672535Ssangeeta * routes to this destination, this routine will utilise the 4682535Ssangeeta * first route it finds to IP address 4692535Ssangeeta * Return values: 4702535Ssangeeta * 0 - FAILURE 4712535Ssangeeta * nonzero - ifindex 4722535Ssangeeta */ 4732535Ssangeeta uint_t 4742535Ssangeeta ifindex_lookup(const struct sockaddr *ipaddr, zoneid_t zoneid) 4752535Ssangeeta { 4762535Ssangeeta uint_t ifindex = 0; 4772535Ssangeeta ire_t *ire; 4782535Ssangeeta ill_t *ill; 4793448Sdh155122 netstack_t *ns; 4803448Sdh155122 ip_stack_t *ipst; 4812535Ssangeeta 4823448Sdh155122 if (zoneid == ALL_ZONES) 4833448Sdh155122 ns = netstack_find_by_zoneid(GLOBAL_ZONEID); 4843448Sdh155122 else 4853448Sdh155122 ns = netstack_find_by_zoneid(zoneid); 4863448Sdh155122 ASSERT(ns != NULL); 4873448Sdh155122 4883448Sdh155122 /* 4893448Sdh155122 * For exclusive stacks we set the zoneid to zero 4903448Sdh155122 * since IP uses the global zoneid in the exclusive stacks. 4913448Sdh155122 */ 4923448Sdh155122 if (ns->netstack_stackid != GLOBAL_NETSTACKID) 4933448Sdh155122 zoneid = GLOBAL_ZONEID; 4943448Sdh155122 ipst = ns->netstack_ip; 4952535Ssangeeta 4962535Ssangeeta ASSERT(ipaddr->sa_family == AF_INET || ipaddr->sa_family == AF_INET6); 4972535Ssangeeta 49811042SErik.Nordmark@Sun.COM if ((ire = route_to_dst(ipaddr, zoneid, ipst)) != NULL) { 49911042SErik.Nordmark@Sun.COM ill = ire_nexthop_ill(ire); 50011042SErik.Nordmark@Sun.COM if (ill != NULL) { 5012535Ssangeeta ifindex = ill->ill_phyint->phyint_ifindex; 50211042SErik.Nordmark@Sun.COM ill_refrele(ill); 50311042SErik.Nordmark@Sun.COM } 5042535Ssangeeta ire_refrele(ire); 5052535Ssangeeta } 5063448Sdh155122 netstack_rele(ns); 5072535Ssangeeta return (ifindex); 5082535Ssangeeta } 5092535Ssangeeta 5102535Ssangeeta /* 5112535Ssangeeta * Routine to find the route to a destination. 
If a ifindex is supplied 51211042SErik.Nordmark@Sun.COM * it tries to match the route to the corresponding ipif for the ifindex 5132535Ssangeeta */ 5142535Ssangeeta static ire_t * 5153448Sdh155122 route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst) 5162535Ssangeeta { 5172535Ssangeeta ire_t *ire = NULL; 5182535Ssangeeta int match_flags; 5192535Ssangeeta 52011042SErik.Nordmark@Sun.COM match_flags = MATCH_IRE_DSTONLY; 5212535Ssangeeta 5222535Ssangeeta /* XXX pass NULL tsl for now */ 5232535Ssangeeta 5242535Ssangeeta if (dst_addr->sa_family == AF_INET) { 52511042SErik.Nordmark@Sun.COM ire = ire_route_recursive_v4( 52611042SErik.Nordmark@Sun.COM ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr, 0, NULL, 527*11457SErik.Nordmark@Sun.COM zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL, 528*11457SErik.Nordmark@Sun.COM NULL, NULL); 5292535Ssangeeta } else { 53011042SErik.Nordmark@Sun.COM ire = ire_route_recursive_v6( 53111042SErik.Nordmark@Sun.COM &((struct sockaddr_in6 *)dst_addr)->sin6_addr, 0, NULL, 532*11457SErik.Nordmark@Sun.COM zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL, 533*11457SErik.Nordmark@Sun.COM NULL, NULL); 53411042SErik.Nordmark@Sun.COM } 53511042SErik.Nordmark@Sun.COM ASSERT(ire != NULL); 53611042SErik.Nordmark@Sun.COM if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 53711042SErik.Nordmark@Sun.COM ire_refrele(ire); 53811042SErik.Nordmark@Sun.COM return (NULL); 5392535Ssangeeta } 5402535Ssangeeta return (ire); 5412535Ssangeeta } 5422535Ssangeeta 5432535Ssangeeta /* 5442535Ssangeeta * This routine is called by IP Filter to send a packet out on the wire 54511042SErik.Nordmark@Sun.COM * to a specified dstination (which may be onlink or offlink). The ifindex may 54611042SErik.Nordmark@Sun.COM * or may not be 0. A non-null ifindex indicates IP Filter has stipulated 5472535Ssangeeta * an outgoing interface and requires the nexthop to be on that interface. 
5484482Sdr146992 * IP WILL NOT DO the following to the data packet before sending it out: 5492535Ssangeeta * a. manipulate ttl 5504482Sdr146992 * b. ipsec work 5514482Sdr146992 * c. fragmentation 5524482Sdr146992 * 5534482Sdr146992 * If the packet has been prepared for hardware checksum then it will be 5544482Sdr146992 * passed off to ip_send_align_cksum() to check that the flags set on the 5554482Sdr146992 * packet are in alignment with the capabilities of the new outgoing NIC. 5562535Ssangeeta * 5572535Ssangeeta * Return values: 5582535Ssangeeta * 0: IP was able to send of the data pkt 5592535Ssangeeta * ECOMM: Could not send packet 5602535Ssangeeta * ENONET No route to dst. It is up to the caller 5612535Ssangeeta * to send icmp unreachable error message, 5622535Ssangeeta * EINPROGRESS The macaddr of the onlink dst or that 5632535Ssangeeta * of the offlink dst's nexthop needs to get 5642535Ssangeeta * resolved before packet can be sent to dst. 5652535Ssangeeta * Thus transmission is not guaranteed. 56611042SErik.Nordmark@Sun.COM * Note: No longer have visibility to the ARP queue 56711042SErik.Nordmark@Sun.COM * hence no EINPROGRESS. 
5682535Ssangeeta */ 5692535Ssangeeta int 5702535Ssangeeta ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex, 5712535Ssangeeta zoneid_t zoneid) 5722535Ssangeeta { 57311042SErik.Nordmark@Sun.COM ipaddr_t nexthop; 5743448Sdh155122 netstack_t *ns; 5753448Sdh155122 ip_stack_t *ipst; 57611042SErik.Nordmark@Sun.COM ip_xmit_attr_t ixas; 57711042SErik.Nordmark@Sun.COM int error; 5782535Ssangeeta 5792535Ssangeeta ASSERT(mp != NULL); 5802535Ssangeeta 5813448Sdh155122 if (zoneid == ALL_ZONES) 5823448Sdh155122 ns = netstack_find_by_zoneid(GLOBAL_ZONEID); 5833448Sdh155122 else 5843448Sdh155122 ns = netstack_find_by_zoneid(zoneid); 5853448Sdh155122 ASSERT(ns != NULL); 5863448Sdh155122 5873448Sdh155122 /* 5883448Sdh155122 * For exclusive stacks we set the zoneid to zero 5893448Sdh155122 * since IP uses the global zoneid in the exclusive stacks. 5903448Sdh155122 */ 5913448Sdh155122 if (ns->netstack_stackid != GLOBAL_NETSTACKID) 5923448Sdh155122 zoneid = GLOBAL_ZONEID; 5933448Sdh155122 ipst = ns->netstack_ip; 5943448Sdh155122 5952535Ssangeeta ASSERT(dst_addr->sa_family == AF_INET || 5962535Ssangeeta dst_addr->sa_family == AF_INET6); 5972535Ssangeeta 59811042SErik.Nordmark@Sun.COM bzero(&ixas, sizeof (ixas)); 5992535Ssangeeta /* 60011042SErik.Nordmark@Sun.COM * No IPsec, no fragmentation, and don't let any hooks see 60111042SErik.Nordmark@Sun.COM * the packet. 
6022535Ssangeeta */ 60311042SErik.Nordmark@Sun.COM ixas.ixa_flags = IXAF_NO_IPSEC | IXAF_DONTFRAG | IXAF_NO_PFHOOK; 60411042SErik.Nordmark@Sun.COM ixas.ixa_cred = kcred; 60511042SErik.Nordmark@Sun.COM ixas.ixa_cpid = NOPID; 60611042SErik.Nordmark@Sun.COM ixas.ixa_tsl = NULL; 60711042SErik.Nordmark@Sun.COM ixas.ixa_ipst = ipst; 60811042SErik.Nordmark@Sun.COM ixas.ixa_ifindex = ifindex; 6092535Ssangeeta 61011042SErik.Nordmark@Sun.COM if (dst_addr->sa_family == AF_INET) { 61111042SErik.Nordmark@Sun.COM ipha_t *ipha = (ipha_t *)mp->b_rptr; 6124482Sdr146992 61311042SErik.Nordmark@Sun.COM ixas.ixa_flags |= IXAF_IS_IPV4; 61411042SErik.Nordmark@Sun.COM nexthop = ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr; 61511042SErik.Nordmark@Sun.COM if (nexthop != ipha->ipha_dst) { 61611042SErik.Nordmark@Sun.COM ixas.ixa_flags |= IXAF_NEXTHOP_SET; 61711042SErik.Nordmark@Sun.COM ixas.ixa_nexthop_v4 = nexthop; 6182535Ssangeeta } 61911042SErik.Nordmark@Sun.COM ixas.ixa_multicast_ttl = ipha->ipha_ttl; 62011042SErik.Nordmark@Sun.COM } else { 62111042SErik.Nordmark@Sun.COM ip6_t *ip6h = (ip6_t *)mp->b_rptr; 62211042SErik.Nordmark@Sun.COM in6_addr_t *nexthop6; 62311042SErik.Nordmark@Sun.COM 62411042SErik.Nordmark@Sun.COM nexthop6 = &((struct sockaddr_in6 *)dst_addr)->sin6_addr; 62511042SErik.Nordmark@Sun.COM if (!IN6_ARE_ADDR_EQUAL(nexthop6, &ip6h->ip6_dst)) { 62611042SErik.Nordmark@Sun.COM ixas.ixa_flags |= IXAF_NEXTHOP_SET; 62711042SErik.Nordmark@Sun.COM ixas.ixa_nexthop_v6 = *nexthop6; 62811042SErik.Nordmark@Sun.COM } 62911042SErik.Nordmark@Sun.COM ixas.ixa_multicast_ttl = ip6h->ip6_hops; 63011042SErik.Nordmark@Sun.COM } 63111042SErik.Nordmark@Sun.COM error = ip_output_simple(mp, &ixas); 63211042SErik.Nordmark@Sun.COM ixa_cleanup(&ixas); 63311042SErik.Nordmark@Sun.COM 63411042SErik.Nordmark@Sun.COM netstack_rele(ns); 63511042SErik.Nordmark@Sun.COM switch (error) { 63611042SErik.Nordmark@Sun.COM case 0: 6372535Ssangeeta break; 63811042SErik.Nordmark@Sun.COM 
63911042SErik.Nordmark@Sun.COM case EHOSTUNREACH: 64011042SErik.Nordmark@Sun.COM case ENETUNREACH: 64111042SErik.Nordmark@Sun.COM error = ENONET; 64211042SErik.Nordmark@Sun.COM break; 64311042SErik.Nordmark@Sun.COM 64411042SErik.Nordmark@Sun.COM default: 64511042SErik.Nordmark@Sun.COM error = ECOMM; 6462535Ssangeeta break; 6472535Ssangeeta } 64811042SErik.Nordmark@Sun.COM return (error); 6494482Sdr146992 } 6504482Sdr146992 6512535Ssangeeta /* 6522535Ssangeeta * callback function provided by ire_ftable_lookup when calling 6532535Ssangeeta * rn_match_args(). Invoke ire_match_args on each matching leaf node in 6542535Ssangeeta * the radix tree. 6552535Ssangeeta */ 6562535Ssangeeta boolean_t 6572535Ssangeeta ire_find_best_route(struct radix_node *rn, void *arg) 6582535Ssangeeta { 6592535Ssangeeta struct rt_entry *rt = (struct rt_entry *)rn; 6602535Ssangeeta irb_t *irb_ptr; 6612535Ssangeeta ire_t *ire; 6622535Ssangeeta ire_ftable_args_t *margs = arg; 6632535Ssangeeta ipaddr_t match_mask; 6642535Ssangeeta 6652535Ssangeeta ASSERT(rt != NULL); 6662535Ssangeeta 6672535Ssangeeta irb_ptr = &rt->rt_irb; 6682535Ssangeeta 6692535Ssangeeta if (irb_ptr->irb_ire_cnt == 0) 6702535Ssangeeta return (B_FALSE); 6712535Ssangeeta 6722535Ssangeeta rw_enter(&irb_ptr->irb_lock, RW_READER); 6732535Ssangeeta for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { 67411042SErik.Nordmark@Sun.COM if (IRE_IS_CONDEMNED(ire)) 6752535Ssangeeta continue; 67611042SErik.Nordmark@Sun.COM if (margs->ift_flags & (MATCH_IRE_MASK|MATCH_IRE_SHORTERMASK)) 6772535Ssangeeta match_mask = margs->ift_mask; 6782535Ssangeeta else 6792535Ssangeeta match_mask = ire->ire_mask; 6802535Ssangeeta 6812535Ssangeeta if (ire_match_args(ire, margs->ift_addr, match_mask, 68211042SErik.Nordmark@Sun.COM margs->ift_gateway, margs->ift_type, margs->ift_ill, 68311042SErik.Nordmark@Sun.COM margs->ift_zoneid, margs->ift_tsl, 68411042SErik.Nordmark@Sun.COM margs->ift_flags)) { 68511042SErik.Nordmark@Sun.COM ire_refhold(ire); 
6862535Ssangeeta rw_exit(&irb_ptr->irb_lock); 6872535Ssangeeta margs->ift_best_ire = ire; 6882535Ssangeeta return (B_TRUE); 6892535Ssangeeta } 6902535Ssangeeta } 6912535Ssangeeta rw_exit(&irb_ptr->irb_lock); 6922535Ssangeeta return (B_FALSE); 6932535Ssangeeta } 6942535Ssangeeta 6952535Ssangeeta /* 6962535Ssangeeta * ftable irb_t structures are dynamically allocated, and we need to 6972535Ssangeeta * check if the irb_t (and associated ftable tree attachment) needs to 6982535Ssangeeta * be cleaned up when the irb_refcnt goes to 0. The conditions that need 6992535Ssangeeta * be verified are: 7002535Ssangeeta * - no other walkers of the irebucket, i.e., quiescent irb_refcnt, 7012535Ssangeeta * - no other threads holding references to ire's in the bucket, 7022535Ssangeeta * i.e., irb_nire == 0 7032535Ssangeeta * - no active ire's in the bucket, i.e., irb_ire_cnt == 0 7042535Ssangeeta * - need to hold the global tree lock and irb_lock in write mode. 7052535Ssangeeta */ 7062535Ssangeeta void 7072535Ssangeeta irb_refrele_ftable(irb_t *irb) 7082535Ssangeeta { 7092535Ssangeeta for (;;) { 7102535Ssangeeta rw_enter(&irb->irb_lock, RW_WRITER); 7112535Ssangeeta ASSERT(irb->irb_refcnt != 0); 7122535Ssangeeta if (irb->irb_refcnt != 1) { 7132535Ssangeeta /* 7142535Ssangeeta * Someone has a reference to this radix node 7152535Ssangeeta * or there is some bucket walker. 7162535Ssangeeta */ 7172535Ssangeeta irb->irb_refcnt--; 7182535Ssangeeta rw_exit(&irb->irb_lock); 7192535Ssangeeta return; 7202535Ssangeeta } else { 7212535Ssangeeta /* 7222535Ssangeeta * There is no other walker, nor is there any 7232535Ssangeeta * other thread that holds a direct ref to this 7242535Ssangeeta * radix node. Do the clean up if needed. 
Call 7252535Ssangeeta * to ire_unlink will clear the IRB_MARK_CONDEMNED flag 7262535Ssangeeta */ 7272535Ssangeeta if (irb->irb_marks & IRB_MARK_CONDEMNED) { 7282535Ssangeeta ire_t *ire_list; 7292535Ssangeeta 7302535Ssangeeta ire_list = ire_unlink(irb); 7312535Ssangeeta rw_exit(&irb->irb_lock); 7322535Ssangeeta 7332535Ssangeeta if (ire_list != NULL) 7342535Ssangeeta ire_cleanup(ire_list); 7352535Ssangeeta /* 7362535Ssangeeta * more CONDEMNED entries could have 7372535Ssangeeta * been added while we dropped the lock, 7382535Ssangeeta * so we have to re-check. 7392535Ssangeeta */ 7402535Ssangeeta continue; 7412535Ssangeeta } 7422535Ssangeeta 7432535Ssangeeta /* 7442535Ssangeeta * Now check if there are still any ires 7452535Ssangeeta * associated with this radix node. 7462535Ssangeeta */ 7472535Ssangeeta if (irb->irb_nire != 0) { 7482535Ssangeeta /* 7492535Ssangeeta * someone is still holding on 7502535Ssangeeta * to ires in this bucket 7512535Ssangeeta */ 7522535Ssangeeta irb->irb_refcnt--; 7532535Ssangeeta rw_exit(&irb->irb_lock); 7542535Ssangeeta return; 7552535Ssangeeta } else { 7562535Ssangeeta /* 7572535Ssangeeta * Everything is clear. Zero walkers, 7582535Ssangeeta * Zero threads with a ref to this 7592535Ssangeeta * radix node, Zero ires associated with 7602535Ssangeeta * this radix node. Due to lock order, 7612535Ssangeeta * check the above conditions again 7622535Ssangeeta * after grabbing all locks in the right order 7632535Ssangeeta */ 7642535Ssangeeta rw_exit(&irb->irb_lock); 7652535Ssangeeta if (irb_inactive(irb)) 7662535Ssangeeta return; 7672535Ssangeeta /* 7682535Ssangeeta * irb_inactive could not free the irb. 7692535Ssangeeta * See if there are any walkers, if not 7702535Ssangeeta * try to clean up again. 
7712535Ssangeeta */ 7722535Ssangeeta } 7732535Ssangeeta } 7742535Ssangeeta } 7752535Ssangeeta } 7762535Ssangeeta 7772535Ssangeeta /* 77811042SErik.Nordmark@Sun.COM * IRE iterator used by ire_ftable_lookup to process multiple equal 77911042SErik.Nordmark@Sun.COM * routes. Given a starting point in the hash list (hash), walk the IREs 78011042SErik.Nordmark@Sun.COM * in the bucket skipping deleted entries. We treat the bucket as a circular 78111042SErik.Nordmark@Sun.COM * list for the purposes of walking it. 78211042SErik.Nordmark@Sun.COM * Returns the IRE (held) that corresponds to the hash value. If that IRE is 78311042SErik.Nordmark@Sun.COM * not applicable (ire_match_args failed) then it returns a subsequent one. 78411042SErik.Nordmark@Sun.COM * If we fail to find an IRE we return NULL. 78511042SErik.Nordmark@Sun.COM * 78611042SErik.Nordmark@Sun.COM * Assumes that the caller holds a reference on the IRE bucket and a read lock 78711042SErik.Nordmark@Sun.COM * on the radix_node_head (for IPv4) or the ip6_ire_head (for IPv6). 7882535Ssangeeta * 78911042SErik.Nordmark@Sun.COM * Applies to IPv4 and IPv6. 79011042SErik.Nordmark@Sun.COM * 79111042SErik.Nordmark@Sun.COM * For CGTP, where an IRE_BROADCAST and IRE_HOST can exist for the same 79211042SErik.Nordmark@Sun.COM * address and bucket, we compare against ire_type for the orig_ire. We also 79311042SErik.Nordmark@Sun.COM * have IRE_BROADCASTs with and without RTF_MULTIRT, with the former being 79411131SErik.Nordmark@Sun.COM * first in the bucket. Thus we compare that RTF_MULTIRT match the orig_ire. 79511042SErik.Nordmark@Sun.COM * 79611042SErik.Nordmark@Sun.COM * Due to shared-IP zones we check that an IRE_OFFLINK has a gateway that is 79711042SErik.Nordmark@Sun.COM * reachable from the zone i.e., that the ire_gateway_addr is in a subnet 79811042SErik.Nordmark@Sun.COM * in which the zone has an IP address. We check this for the global zone 79911042SErik.Nordmark@Sun.COM * even if no shared-IP zones are configured. 
8002535Ssangeeta */ 8012535Ssangeeta ire_t * 80211042SErik.Nordmark@Sun.COM ire_round_robin(irb_t *irb_ptr, ire_ftable_args_t *margs, uint_t hash, 80311042SErik.Nordmark@Sun.COM ire_t *orig_ire, ip_stack_t *ipst) 8042535Ssangeeta { 80511042SErik.Nordmark@Sun.COM ire_t *ire, *maybe_ire = NULL; 80611042SErik.Nordmark@Sun.COM uint_t maybe_badcnt; 80711042SErik.Nordmark@Sun.COM uint_t maxwalk; 80811042SErik.Nordmark@Sun.COM 80911042SErik.Nordmark@Sun.COM /* Fold in more bits from the hint/hash */ 81011042SErik.Nordmark@Sun.COM hash = hash ^ (hash >> 8) ^ (hash >> 16); 8112535Ssangeeta 8122535Ssangeeta rw_enter(&irb_ptr->irb_lock, RW_WRITER); 81311042SErik.Nordmark@Sun.COM maxwalk = irb_ptr->irb_ire_cnt; /* Excludes condemned */ 81411042SErik.Nordmark@Sun.COM hash %= maxwalk; 81511042SErik.Nordmark@Sun.COM irb_refhold_locked(irb_ptr); 8162535Ssangeeta rw_exit(&irb_ptr->irb_lock); 8172535Ssangeeta 8182535Ssangeeta /* 8192535Ssangeeta * Round-robin the routers list looking for a route that 8202535Ssangeeta * matches the passed in parameters. 82111042SErik.Nordmark@Sun.COM * First we skip "hash" number of non-condemned IREs. 82211042SErik.Nordmark@Sun.COM * Then we match the IRE. 82311042SErik.Nordmark@Sun.COM * If we find an ire which has a non-zero ire_badcnt then we remember 82411042SErik.Nordmark@Sun.COM * it and keep on looking for a lower ire_badcnt. 82511042SErik.Nordmark@Sun.COM * If we come to the end of the list we continue (treat the 82611042SErik.Nordmark@Sun.COM * bucket list as a circular list) but we match less than "max" 82711042SErik.Nordmark@Sun.COM * entries. 
8282535Ssangeeta */ 82911042SErik.Nordmark@Sun.COM ire = irb_ptr->irb_ire; 83011042SErik.Nordmark@Sun.COM while (maxwalk > 0) { 83111042SErik.Nordmark@Sun.COM if (IRE_IS_CONDEMNED(ire)) 83211042SErik.Nordmark@Sun.COM goto next_ire_skip; 8332535Ssangeeta 83411042SErik.Nordmark@Sun.COM /* Skip the first "hash" entries to do ECMP */ 83511042SErik.Nordmark@Sun.COM if (hash != 0) { 83611042SErik.Nordmark@Sun.COM hash--; 83711042SErik.Nordmark@Sun.COM goto next_ire_skip; 83811042SErik.Nordmark@Sun.COM } 83911042SErik.Nordmark@Sun.COM 84011042SErik.Nordmark@Sun.COM /* See CGTP comment above */ 84111042SErik.Nordmark@Sun.COM if (ire->ire_type != orig_ire->ire_type || 84211131SErik.Nordmark@Sun.COM ((ire->ire_flags ^ orig_ire->ire_flags) & RTF_MULTIRT) != 0) 8432535Ssangeeta goto next_ire; 8442535Ssangeeta 84511042SErik.Nordmark@Sun.COM /* 84611042SErik.Nordmark@Sun.COM * Note: Since IPv6 has hash buckets instead of radix 84711042SErik.Nordmark@Sun.COM * buckers we need to explicitly compare the addresses. 84811042SErik.Nordmark@Sun.COM * That makes this less efficient since we will be called 84911042SErik.Nordmark@Sun.COM * even if there is no alternatives just because the 85011042SErik.Nordmark@Sun.COM * bucket has multiple IREs for different addresses. 85111042SErik.Nordmark@Sun.COM */ 85211042SErik.Nordmark@Sun.COM if (ire->ire_ipversion == IPV6_VERSION) { 85311042SErik.Nordmark@Sun.COM if (!IN6_ARE_ADDR_EQUAL(&orig_ire->ire_addr_v6, 85411042SErik.Nordmark@Sun.COM &ire->ire_addr_v6)) 85511042SErik.Nordmark@Sun.COM goto next_ire; 85611042SErik.Nordmark@Sun.COM } 85711042SErik.Nordmark@Sun.COM 85811042SErik.Nordmark@Sun.COM /* 85911042SErik.Nordmark@Sun.COM * For some reason find_best_route uses ire_mask. We do 86011042SErik.Nordmark@Sun.COM * the same. 86111042SErik.Nordmark@Sun.COM */ 86211042SErik.Nordmark@Sun.COM if (ire->ire_ipversion == IPV4_VERSION ? 
86311042SErik.Nordmark@Sun.COM !ire_match_args(ire, margs->ift_addr, 86411042SErik.Nordmark@Sun.COM ire->ire_mask, margs->ift_gateway, 86511042SErik.Nordmark@Sun.COM margs->ift_type, margs->ift_ill, margs->ift_zoneid, 86611042SErik.Nordmark@Sun.COM margs->ift_tsl, margs->ift_flags) : 86711042SErik.Nordmark@Sun.COM !ire_match_args_v6(ire, &margs->ift_addr_v6, 86811042SErik.Nordmark@Sun.COM &ire->ire_mask_v6, &margs->ift_gateway_v6, 86911042SErik.Nordmark@Sun.COM margs->ift_type, margs->ift_ill, margs->ift_zoneid, 87011042SErik.Nordmark@Sun.COM margs->ift_tsl, margs->ift_flags)) 8712535Ssangeeta goto next_ire; 8722535Ssangeeta 87311042SErik.Nordmark@Sun.COM if (margs->ift_zoneid != ALL_ZONES && 87411042SErik.Nordmark@Sun.COM (ire->ire_type & IRE_OFFLINK)) { 8752535Ssangeeta /* 87611042SErik.Nordmark@Sun.COM * When we're in a zone, we're only 87711042SErik.Nordmark@Sun.COM * interested in routers that are 87811042SErik.Nordmark@Sun.COM * reachable through ipifs within our zone. 8792535Ssangeeta */ 88011042SErik.Nordmark@Sun.COM if (ire->ire_ipversion == IPV4_VERSION) { 88111042SErik.Nordmark@Sun.COM if (!ire_gateway_ok_zone_v4( 88211042SErik.Nordmark@Sun.COM ire->ire_gateway_addr, margs->ift_zoneid, 88311042SErik.Nordmark@Sun.COM ire->ire_ill, margs->ift_tsl, ipst, 88411042SErik.Nordmark@Sun.COM B_TRUE)) 88511042SErik.Nordmark@Sun.COM goto next_ire; 88611042SErik.Nordmark@Sun.COM } else { 88711042SErik.Nordmark@Sun.COM if (!ire_gateway_ok_zone_v6( 88811042SErik.Nordmark@Sun.COM &ire->ire_gateway_addr_v6, 88911042SErik.Nordmark@Sun.COM margs->ift_zoneid, ire->ire_ill, 89011042SErik.Nordmark@Sun.COM margs->ift_tsl, ipst, B_TRUE)) 89111042SErik.Nordmark@Sun.COM goto next_ire; 89211042SErik.Nordmark@Sun.COM } 8932535Ssangeeta } 89411042SErik.Nordmark@Sun.COM mutex_enter(&ire->ire_lock); 89511042SErik.Nordmark@Sun.COM /* Look for stale ire_badcnt and clear */ 89611042SErik.Nordmark@Sun.COM if (ire->ire_badcnt != 0 && 89711066Srafael.vanoni@sun.com 
(TICK_TO_SEC(ddi_get_lbolt64()) - ire->ire_last_badcnt > 89811042SErik.Nordmark@Sun.COM ipst->ips_ip_ire_badcnt_lifetime)) 89911042SErik.Nordmark@Sun.COM ire->ire_badcnt = 0; 90011042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 9012535Ssangeeta 90211042SErik.Nordmark@Sun.COM if (ire->ire_badcnt == 0) { 90311042SErik.Nordmark@Sun.COM /* We found one with a zero badcnt; done */ 90411042SErik.Nordmark@Sun.COM ire_refhold(ire); 90511042SErik.Nordmark@Sun.COM /* 90611042SErik.Nordmark@Sun.COM * Care needed since irb_refrele grabs WLOCK to free 90711042SErik.Nordmark@Sun.COM * the irb_t. 90811042SErik.Nordmark@Sun.COM */ 90911042SErik.Nordmark@Sun.COM if (ire->ire_ipversion == IPV4_VERSION) { 91011042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 91111042SErik.Nordmark@Sun.COM irb_refrele(irb_ptr); 91211042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 91311042SErik.Nordmark@Sun.COM } else { 91411042SErik.Nordmark@Sun.COM rw_exit(&ipst->ips_ip6_ire_head_lock); 91511042SErik.Nordmark@Sun.COM irb_refrele(irb_ptr); 91611042SErik.Nordmark@Sun.COM rw_enter(&ipst->ips_ip6_ire_head_lock, 91711042SErik.Nordmark@Sun.COM RW_READER); 91811042SErik.Nordmark@Sun.COM } 9192535Ssangeeta return (ire); 9202535Ssangeeta } 9212535Ssangeeta /* 92211042SErik.Nordmark@Sun.COM * keep looking to see if there is a better (lower 92311042SErik.Nordmark@Sun.COM * badcnt) matching IRE, but save this one as a last resort. 92411042SErik.Nordmark@Sun.COM * If we find a lower badcnt pick that one as the last* resort. 
9252535Ssangeeta */ 92611042SErik.Nordmark@Sun.COM if (maybe_ire == NULL) { 92711042SErik.Nordmark@Sun.COM maybe_ire = ire; 92811042SErik.Nordmark@Sun.COM maybe_badcnt = ire->ire_badcnt; 92911042SErik.Nordmark@Sun.COM } else if (ire->ire_badcnt < maybe_badcnt) { 93011042SErik.Nordmark@Sun.COM maybe_ire = ire; 93111042SErik.Nordmark@Sun.COM maybe_badcnt = ire->ire_badcnt; 93211042SErik.Nordmark@Sun.COM } 9338485SPeter.Memishian@Sun.COM 9342535Ssangeeta next_ire: 93511042SErik.Nordmark@Sun.COM maxwalk--; 93611042SErik.Nordmark@Sun.COM next_ire_skip: 93711042SErik.Nordmark@Sun.COM ire = ire->ire_next; 93811042SErik.Nordmark@Sun.COM if (ire == NULL) 93911042SErik.Nordmark@Sun.COM ire = irb_ptr->irb_ire; 9402535Ssangeeta } 9412535Ssangeeta if (maybe_ire != NULL) 94211042SErik.Nordmark@Sun.COM ire_refhold(maybe_ire); 94311042SErik.Nordmark@Sun.COM 94411042SErik.Nordmark@Sun.COM /* Care needed since irb_refrele grabs WLOCK to free the irb_t. */ 94511042SErik.Nordmark@Sun.COM if (ire->ire_ipversion == IPV4_VERSION) { 94611042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 94711042SErik.Nordmark@Sun.COM irb_refrele(irb_ptr); 94811042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 94911042SErik.Nordmark@Sun.COM } else { 95011042SErik.Nordmark@Sun.COM rw_exit(&ipst->ips_ip6_ire_head_lock); 95111042SErik.Nordmark@Sun.COM irb_refrele(irb_ptr); 95211042SErik.Nordmark@Sun.COM rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER); 95311042SErik.Nordmark@Sun.COM } 9542535Ssangeeta return (maybe_ire); 9552535Ssangeeta } 9562783Ssowmini 9572783Ssowmini void 9582783Ssowmini irb_refhold_rn(struct radix_node *rn) 9592783Ssowmini { 9602783Ssowmini if ((rn->rn_flags & RNF_ROOT) == 0) 96111042SErik.Nordmark@Sun.COM irb_refhold(&((rt_t *)(rn))->rt_irb); 9622783Ssowmini } 9632783Ssowmini 9642783Ssowmini void 9652783Ssowmini irb_refrele_rn(struct radix_node *rn) 9662783Ssowmini { 9672783Ssowmini if ((rn->rn_flags & RNF_ROOT) == 0) 9682783Ssowmini 
irb_refrele_ftable(&((rt_t *)(rn))->rt_irb); 9692783Ssowmini } 97011042SErik.Nordmark@Sun.COM 97111042SErik.Nordmark@Sun.COM /* 97211042SErik.Nordmark@Sun.COM * Select a route for IPv4 and IPv6. Except for multicast, loopback and reject 97311042SErik.Nordmark@Sun.COM * routes this routine sets up a ire_nce_cache as well. The caller needs to 97411042SErik.Nordmark@Sun.COM * lookup an nce for the multicast case. 97511042SErik.Nordmark@Sun.COM */ 97611042SErik.Nordmark@Sun.COM ire_t * 97711042SErik.Nordmark@Sun.COM ip_select_route(const in6_addr_t *v6dst, ip_xmit_attr_t *ixa, 97811042SErik.Nordmark@Sun.COM uint_t *generationp, in6_addr_t *setsrcp, int *errorp, boolean_t *multirtp) 97911042SErik.Nordmark@Sun.COM { 98011042SErik.Nordmark@Sun.COM uint_t match_args; 98111042SErik.Nordmark@Sun.COM uint_t ire_type; 98211042SErik.Nordmark@Sun.COM ill_t *ill; 98311042SErik.Nordmark@Sun.COM ire_t *ire; 98411042SErik.Nordmark@Sun.COM ip_stack_t *ipst = ixa->ixa_ipst; 98511042SErik.Nordmark@Sun.COM ipaddr_t v4dst; 98611042SErik.Nordmark@Sun.COM in6_addr_t v6nexthop; 98711042SErik.Nordmark@Sun.COM iaflags_t ixaflags = ixa->ixa_flags; 98811042SErik.Nordmark@Sun.COM nce_t *nce; 98911042SErik.Nordmark@Sun.COM 99011042SErik.Nordmark@Sun.COM match_args = MATCH_IRE_SECATTR; 99111042SErik.Nordmark@Sun.COM IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst); 99211042SErik.Nordmark@Sun.COM if (setsrcp != NULL) 99311042SErik.Nordmark@Sun.COM ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp)); 99411042SErik.Nordmark@Sun.COM if (errorp != NULL) 99511042SErik.Nordmark@Sun.COM ASSERT(*errorp == 0); 99611042SErik.Nordmark@Sun.COM 99711042SErik.Nordmark@Sun.COM /* 99811042SErik.Nordmark@Sun.COM * The content of the ixa will be different if IP_NEXTHOP, 99911042SErik.Nordmark@Sun.COM * SO_DONTROUTE, IP_BOUND_IF, IP_PKTINFO etc are set 100011042SErik.Nordmark@Sun.COM */ 100111042SErik.Nordmark@Sun.COM 100211042SErik.Nordmark@Sun.COM if ((ixaflags & IXAF_IS_IPV4) ? 
CLASSD(v4dst) : 100311042SErik.Nordmark@Sun.COM IN6_IS_ADDR_MULTICAST(v6dst)) { 100411042SErik.Nordmark@Sun.COM /* Pick up the IRE_MULTICAST for the ill */ 100511042SErik.Nordmark@Sun.COM if (ixa->ixa_multicast_ifindex != 0) { 100611042SErik.Nordmark@Sun.COM ill = ill_lookup_on_ifindex(ixa->ixa_multicast_ifindex, 100711042SErik.Nordmark@Sun.COM !(ixaflags & IXAF_IS_IPV4), ipst); 100811042SErik.Nordmark@Sun.COM } else if (ixaflags & IXAF_SCOPEID_SET) { 100911042SErik.Nordmark@Sun.COM /* sin6_scope_id takes precedence over ixa_ifindex */ 101011042SErik.Nordmark@Sun.COM ASSERT(ixa->ixa_scopeid != 0); 101111042SErik.Nordmark@Sun.COM ill = ill_lookup_on_ifindex(ixa->ixa_scopeid, 101211042SErik.Nordmark@Sun.COM !(ixaflags & IXAF_IS_IPV4), ipst); 101311042SErik.Nordmark@Sun.COM } else if (ixa->ixa_ifindex != 0) { 101411042SErik.Nordmark@Sun.COM /* 101511042SErik.Nordmark@Sun.COM * In the ipmp case, the ixa_ifindex is set to 101611042SErik.Nordmark@Sun.COM * point at an under_ill and we would return the 101711042SErik.Nordmark@Sun.COM * ire_multicast() corresponding to that under_ill. 
101811042SErik.Nordmark@Sun.COM */ 101911042SErik.Nordmark@Sun.COM ill = ill_lookup_on_ifindex(ixa->ixa_ifindex, 102011042SErik.Nordmark@Sun.COM !(ixaflags & IXAF_IS_IPV4), ipst); 102111042SErik.Nordmark@Sun.COM } else if (ixaflags & IXAF_IS_IPV4) { 102211042SErik.Nordmark@Sun.COM ipaddr_t v4setsrc = INADDR_ANY; 102311042SErik.Nordmark@Sun.COM 102411042SErik.Nordmark@Sun.COM ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid, ipst, 102511042SErik.Nordmark@Sun.COM multirtp, &v4setsrc); 102611042SErik.Nordmark@Sun.COM if (setsrcp != NULL) 102711042SErik.Nordmark@Sun.COM IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp); 102811042SErik.Nordmark@Sun.COM } else { 102911042SErik.Nordmark@Sun.COM ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid, ipst, 103011042SErik.Nordmark@Sun.COM multirtp, setsrcp); 103111042SErik.Nordmark@Sun.COM } 103211042SErik.Nordmark@Sun.COM if (ill != NULL && IS_VNI(ill)) { 103311042SErik.Nordmark@Sun.COM ill_refrele(ill); 103411042SErik.Nordmark@Sun.COM ill = NULL; 103511042SErik.Nordmark@Sun.COM } 103611042SErik.Nordmark@Sun.COM if (ill == NULL) { 103711042SErik.Nordmark@Sun.COM if (errorp != NULL) 103811042SErik.Nordmark@Sun.COM *errorp = ENXIO; 103911042SErik.Nordmark@Sun.COM /* Get a hold on the IRE_NOROUTE */ 104011042SErik.Nordmark@Sun.COM ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4)); 104111042SErik.Nordmark@Sun.COM return (ire); 104211042SErik.Nordmark@Sun.COM } 104311042SErik.Nordmark@Sun.COM if (!(ill->ill_flags & ILLF_MULTICAST)) { 104411042SErik.Nordmark@Sun.COM ill_refrele(ill); 104511042SErik.Nordmark@Sun.COM if (errorp != NULL) 104611042SErik.Nordmark@Sun.COM *errorp = EHOSTUNREACH; 104711042SErik.Nordmark@Sun.COM /* Get a hold on the IRE_NOROUTE */ 104811042SErik.Nordmark@Sun.COM ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4)); 104911042SErik.Nordmark@Sun.COM return (ire); 105011042SErik.Nordmark@Sun.COM } 105111042SErik.Nordmark@Sun.COM /* Get a refcnt on the single IRE_MULTICAST per ill */ 105211042SErik.Nordmark@Sun.COM ire = 
ire_multicast(ill); 105311042SErik.Nordmark@Sun.COM ill_refrele(ill); 105411042SErik.Nordmark@Sun.COM if (generationp != NULL) 105511042SErik.Nordmark@Sun.COM *generationp = ire->ire_generation; 105611042SErik.Nordmark@Sun.COM if (errorp != NULL && 105711042SErik.Nordmark@Sun.COM (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) { 105811042SErik.Nordmark@Sun.COM *errorp = EHOSTUNREACH; 105911042SErik.Nordmark@Sun.COM } 106011042SErik.Nordmark@Sun.COM return (ire); 106111042SErik.Nordmark@Sun.COM } 106211042SErik.Nordmark@Sun.COM 106311042SErik.Nordmark@Sun.COM if (ixa->ixa_ifindex != 0 || (ixaflags & IXAF_SCOPEID_SET)) { 106411042SErik.Nordmark@Sun.COM if (ixaflags & IXAF_SCOPEID_SET) { 106511042SErik.Nordmark@Sun.COM /* sin6_scope_id takes precedence over ixa_ifindex */ 106611042SErik.Nordmark@Sun.COM ASSERT(ixa->ixa_scopeid != 0); 106711042SErik.Nordmark@Sun.COM ill = ill_lookup_on_ifindex(ixa->ixa_scopeid, 106811042SErik.Nordmark@Sun.COM !(ixaflags & IXAF_IS_IPV4), ipst); 106911042SErik.Nordmark@Sun.COM } else { 107011042SErik.Nordmark@Sun.COM ASSERT(ixa->ixa_ifindex != 0); 107111042SErik.Nordmark@Sun.COM ill = ill_lookup_on_ifindex(ixa->ixa_ifindex, 107211042SErik.Nordmark@Sun.COM !(ixaflags & IXAF_IS_IPV4), ipst); 107311042SErik.Nordmark@Sun.COM } 107411042SErik.Nordmark@Sun.COM if (ill != NULL && IS_VNI(ill)) { 107511042SErik.Nordmark@Sun.COM ill_refrele(ill); 107611042SErik.Nordmark@Sun.COM ill = NULL; 107711042SErik.Nordmark@Sun.COM } 107811042SErik.Nordmark@Sun.COM if (ill == NULL) { 107911042SErik.Nordmark@Sun.COM if (errorp != NULL) 108011042SErik.Nordmark@Sun.COM *errorp = ENXIO; 108111042SErik.Nordmark@Sun.COM /* Get a hold on the IRE_NOROUTE */ 108211042SErik.Nordmark@Sun.COM ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4)); 108311042SErik.Nordmark@Sun.COM return (ire); 108411042SErik.Nordmark@Sun.COM } 108511042SErik.Nordmark@Sun.COM /* 108611042SErik.Nordmark@Sun.COM * icmp_send_reply_v6 uses scopeid, and mpathd sets IP*_BOUND_IF 
108711042SErik.Nordmark@Sun.COM * so for both of them we need to be able look for an under 108811042SErik.Nordmark@Sun.COM * interface. 108911042SErik.Nordmark@Sun.COM */ 109011042SErik.Nordmark@Sun.COM if (IS_UNDER_IPMP(ill)) 109111042SErik.Nordmark@Sun.COM match_args |= MATCH_IRE_TESTHIDDEN; 109211042SErik.Nordmark@Sun.COM } else { 109311042SErik.Nordmark@Sun.COM ill = NULL; 109411042SErik.Nordmark@Sun.COM } 109511042SErik.Nordmark@Sun.COM 109611042SErik.Nordmark@Sun.COM if (ixaflags & IXAF_NEXTHOP_SET) { 109711042SErik.Nordmark@Sun.COM /* IP_NEXTHOP was set */ 109811042SErik.Nordmark@Sun.COM v6nexthop = ixa->ixa_nexthop_v6; 109911042SErik.Nordmark@Sun.COM } else { 110011042SErik.Nordmark@Sun.COM v6nexthop = *v6dst; 110111042SErik.Nordmark@Sun.COM } 110211042SErik.Nordmark@Sun.COM 110311042SErik.Nordmark@Sun.COM ire_type = 0; 110411042SErik.Nordmark@Sun.COM /* If ill is null then ire_route_recursive will set MATCH_IRE_ILL */ 110511042SErik.Nordmark@Sun.COM 110611042SErik.Nordmark@Sun.COM /* 110711042SErik.Nordmark@Sun.COM * If SO_DONTROUTE is set or if IP_NEXTHOP is set, then 110811042SErik.Nordmark@Sun.COM * we only look for an onlink IRE. 
110911042SErik.Nordmark@Sun.COM */ 111011042SErik.Nordmark@Sun.COM if (ixaflags & (IXAF_DONTROUTE|IXAF_NEXTHOP_SET)) { 111111042SErik.Nordmark@Sun.COM match_args |= MATCH_IRE_TYPE; 111211042SErik.Nordmark@Sun.COM ire_type = IRE_ONLINK; 111311042SErik.Nordmark@Sun.COM } 111411042SErik.Nordmark@Sun.COM 111511042SErik.Nordmark@Sun.COM if (ixaflags & IXAF_IS_IPV4) { 111611042SErik.Nordmark@Sun.COM ipaddr_t v4nexthop; 111711042SErik.Nordmark@Sun.COM ipaddr_t v4setsrc = INADDR_ANY; 111811042SErik.Nordmark@Sun.COM 111911042SErik.Nordmark@Sun.COM IN6_V4MAPPED_TO_IPADDR(&v6nexthop, v4nexthop); 112011042SErik.Nordmark@Sun.COM ire = ire_route_recursive_v4(v4nexthop, ire_type, ill, 1121*11457SErik.Nordmark@Sun.COM ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE, 112211042SErik.Nordmark@Sun.COM ixa->ixa_xmit_hint, ipst, &v4setsrc, NULL, generationp); 112311042SErik.Nordmark@Sun.COM if (setsrcp != NULL) 112411042SErik.Nordmark@Sun.COM IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp); 112511042SErik.Nordmark@Sun.COM } else { 112611042SErik.Nordmark@Sun.COM ire = ire_route_recursive_v6(&v6nexthop, ire_type, ill, 1127*11457SErik.Nordmark@Sun.COM ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE, 112811042SErik.Nordmark@Sun.COM ixa->ixa_xmit_hint, ipst, setsrcp, NULL, generationp); 112911042SErik.Nordmark@Sun.COM } 113011042SErik.Nordmark@Sun.COM 113111042SErik.Nordmark@Sun.COM #ifdef DEBUG 113211042SErik.Nordmark@Sun.COM if (match_args & MATCH_IRE_TESTHIDDEN) { 113311042SErik.Nordmark@Sun.COM ip3dbg(("looking for hidden; dst %x ire %p\n", 113411042SErik.Nordmark@Sun.COM v4dst, (void *)ire)); 113511042SErik.Nordmark@Sun.COM } 113611042SErik.Nordmark@Sun.COM #endif 113711042SErik.Nordmark@Sun.COM 113811042SErik.Nordmark@Sun.COM if (ill != NULL) 113911042SErik.Nordmark@Sun.COM ill_refrele(ill); 114011042SErik.Nordmark@Sun.COM 114111042SErik.Nordmark@Sun.COM if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || 114211042SErik.Nordmark@Sun.COM (ire->ire_type & IRE_MULTICAST)) { 
114311042SErik.Nordmark@Sun.COM /* No ire_nce_cache */ 114411042SErik.Nordmark@Sun.COM return (ire); 114511042SErik.Nordmark@Sun.COM } 114611042SErik.Nordmark@Sun.COM 114711042SErik.Nordmark@Sun.COM /* Setup ire_nce_cache if it doesn't exist or is condemned. */ 114811042SErik.Nordmark@Sun.COM mutex_enter(&ire->ire_lock); 114911042SErik.Nordmark@Sun.COM nce = ire->ire_nce_cache; 115011042SErik.Nordmark@Sun.COM if (nce == NULL || nce->nce_is_condemned) { 115111042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 115211042SErik.Nordmark@Sun.COM (void) ire_revalidate_nce(ire); 115311042SErik.Nordmark@Sun.COM } else { 115411042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 115511042SErik.Nordmark@Sun.COM } 115611042SErik.Nordmark@Sun.COM return (ire); 115711042SErik.Nordmark@Sun.COM } 115811042SErik.Nordmark@Sun.COM 115911042SErik.Nordmark@Sun.COM /* 116011042SErik.Nordmark@Sun.COM * Find a route given some xmit attributes and a packet. 116111042SErik.Nordmark@Sun.COM * Generic for IPv4 and IPv6 116211042SErik.Nordmark@Sun.COM * 116311042SErik.Nordmark@Sun.COM * This never returns NULL. But when it returns the IRE_NOROUTE 116411042SErik.Nordmark@Sun.COM * it might set errorp. 
116511042SErik.Nordmark@Sun.COM */ 116611042SErik.Nordmark@Sun.COM ire_t * 116711042SErik.Nordmark@Sun.COM ip_select_route_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp, 116811042SErik.Nordmark@Sun.COM int *errorp, boolean_t *multirtp) 116911042SErik.Nordmark@Sun.COM { 117011042SErik.Nordmark@Sun.COM if (ixa->ixa_flags & IXAF_IS_IPV4) { 117111042SErik.Nordmark@Sun.COM ipha_t *ipha = (ipha_t *)mp->b_rptr; 117211042SErik.Nordmark@Sun.COM in6_addr_t v6dst; 117311042SErik.Nordmark@Sun.COM 117411042SErik.Nordmark@Sun.COM IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); 117511042SErik.Nordmark@Sun.COM 117611042SErik.Nordmark@Sun.COM return (ip_select_route(&v6dst, ixa, generationp, 117711042SErik.Nordmark@Sun.COM NULL, errorp, multirtp)); 117811042SErik.Nordmark@Sun.COM } else { 117911042SErik.Nordmark@Sun.COM ip6_t *ip6h = (ip6_t *)mp->b_rptr; 118011042SErik.Nordmark@Sun.COM 118111042SErik.Nordmark@Sun.COM return (ip_select_route(&ip6h->ip6_dst, ixa, generationp, 118211042SErik.Nordmark@Sun.COM NULL, errorp, multirtp)); 118311042SErik.Nordmark@Sun.COM } 118411042SErik.Nordmark@Sun.COM } 118511042SErik.Nordmark@Sun.COM 118611042SErik.Nordmark@Sun.COM ire_t * 118711042SErik.Nordmark@Sun.COM ip_select_route_v4(ipaddr_t dst, ip_xmit_attr_t *ixa, uint_t *generationp, 118811042SErik.Nordmark@Sun.COM ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp) 118911042SErik.Nordmark@Sun.COM { 119011042SErik.Nordmark@Sun.COM in6_addr_t v6dst; 119111042SErik.Nordmark@Sun.COM ire_t *ire; 119211042SErik.Nordmark@Sun.COM in6_addr_t setsrc; 119311042SErik.Nordmark@Sun.COM 119411042SErik.Nordmark@Sun.COM ASSERT(ixa->ixa_flags & IXAF_IS_IPV4); 119511042SErik.Nordmark@Sun.COM 119611042SErik.Nordmark@Sun.COM IN6_IPADDR_TO_V4MAPPED(dst, &v6dst); 119711042SErik.Nordmark@Sun.COM 119811042SErik.Nordmark@Sun.COM setsrc = ipv6_all_zeros; 119911042SErik.Nordmark@Sun.COM ire = ip_select_route(&v6dst, ixa, generationp, &setsrc, errorp, 120011042SErik.Nordmark@Sun.COM multirtp); 
120111042SErik.Nordmark@Sun.COM if (v4setsrcp != NULL) 120211042SErik.Nordmark@Sun.COM IN6_V4MAPPED_TO_IPADDR(&setsrc, *v4setsrcp); 120311042SErik.Nordmark@Sun.COM return (ire); 120411042SErik.Nordmark@Sun.COM } 120511042SErik.Nordmark@Sun.COM 120611042SErik.Nordmark@Sun.COM /* 120711042SErik.Nordmark@Sun.COM * Recursively look for a route to the destination. Can also match on 120811042SErik.Nordmark@Sun.COM * the zoneid, ill, and label. Used for the data paths. See also 120911042SErik.Nordmark@Sun.COM * ire_route_recursive. 121011042SErik.Nordmark@Sun.COM * 121111042SErik.Nordmark@Sun.COM * If ill is set this means we will match it by adding MATCH_IRE_ILL. 121211042SErik.Nordmark@Sun.COM * 1213*11457SErik.Nordmark@Sun.COM * If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never 1214*11457SErik.Nordmark@Sun.COM * create an IRE_IF_CLONE. This is used on the receive side when we are not 1215*11457SErik.Nordmark@Sun.COM * forwarding. 1216*11457SErik.Nordmark@Sun.COM * If IRR_INCOMPLETE is set then we return the IRE even if we can't correctly 1217*11457SErik.Nordmark@Sun.COM * resolve the gateway. 1218*11457SErik.Nordmark@Sun.COM * 121911042SErik.Nordmark@Sun.COM * Note that this function never returns NULL. It returns an IRE_NOROUTE 122011042SErik.Nordmark@Sun.COM * instead. 122111042SErik.Nordmark@Sun.COM * 122211042SErik.Nordmark@Sun.COM * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it 122311042SErik.Nordmark@Sun.COM * is an error. 122411042SErik.Nordmark@Sun.COM * Allow at most one RTF_INDIRECT. 
122511042SErik.Nordmark@Sun.COM */ 122611042SErik.Nordmark@Sun.COM ire_t * 122711042SErik.Nordmark@Sun.COM ire_route_recursive_impl_v4(ire_t *ire, 122811042SErik.Nordmark@Sun.COM ipaddr_t nexthop, uint_t ire_type, const ill_t *ill_arg, 122911042SErik.Nordmark@Sun.COM zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, 1230*11457SErik.Nordmark@Sun.COM uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp, 123111042SErik.Nordmark@Sun.COM tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) 123211042SErik.Nordmark@Sun.COM { 123311042SErik.Nordmark@Sun.COM int i, j; 123411042SErik.Nordmark@Sun.COM ire_t *ires[MAX_IRE_RECURSION]; 123511042SErik.Nordmark@Sun.COM uint_t generation; 123611042SErik.Nordmark@Sun.COM uint_t generations[MAX_IRE_RECURSION]; 123711042SErik.Nordmark@Sun.COM boolean_t need_refrele = B_FALSE; 123811042SErik.Nordmark@Sun.COM boolean_t invalidate = B_FALSE; 123911042SErik.Nordmark@Sun.COM int prefs[MAX_IRE_RECURSION]; 124011042SErik.Nordmark@Sun.COM ill_t *ill = NULL; 124111042SErik.Nordmark@Sun.COM 124211042SErik.Nordmark@Sun.COM if (setsrcp != NULL) 124311042SErik.Nordmark@Sun.COM ASSERT(*setsrcp == INADDR_ANY); 124411042SErik.Nordmark@Sun.COM if (gwattrp != NULL) 124511042SErik.Nordmark@Sun.COM ASSERT(*gwattrp == NULL); 124611042SErik.Nordmark@Sun.COM 124711042SErik.Nordmark@Sun.COM if (ill_arg != NULL) 124811042SErik.Nordmark@Sun.COM match_args |= MATCH_IRE_ILL; 124911042SErik.Nordmark@Sun.COM 125011042SErik.Nordmark@Sun.COM /* 125111042SErik.Nordmark@Sun.COM * We iterate up to three times to resolve a route, even though 125211042SErik.Nordmark@Sun.COM * we have four slots in the array. The extra slot is for an 125311042SErik.Nordmark@Sun.COM * IRE_IF_CLONE we might need to create. 
125411042SErik.Nordmark@Sun.COM */ 125511042SErik.Nordmark@Sun.COM i = 0; 125611042SErik.Nordmark@Sun.COM while (i < MAX_IRE_RECURSION - 1) { 125711042SErik.Nordmark@Sun.COM /* ire_ftable_lookup handles round-robin/ECMP */ 125811042SErik.Nordmark@Sun.COM if (ire == NULL) { 125911042SErik.Nordmark@Sun.COM ire = ire_ftable_lookup_v4(nexthop, 0, 0, ire_type, 126011042SErik.Nordmark@Sun.COM (ill_arg != NULL ? ill_arg : ill), zoneid, tsl, 126111042SErik.Nordmark@Sun.COM match_args, xmit_hint, ipst, &generation); 126211042SErik.Nordmark@Sun.COM } else { 126311042SErik.Nordmark@Sun.COM /* Caller passed it; extra hold since we will rele */ 126411042SErik.Nordmark@Sun.COM ire_refhold(ire); 126511042SErik.Nordmark@Sun.COM if (generationp != NULL) 126611042SErik.Nordmark@Sun.COM generation = *generationp; 126711042SErik.Nordmark@Sun.COM else 126811042SErik.Nordmark@Sun.COM generation = IRE_GENERATION_VERIFY; 126911042SErik.Nordmark@Sun.COM } 127011042SErik.Nordmark@Sun.COM if (ire == NULL) 127111042SErik.Nordmark@Sun.COM ire = ire_reject(ipst, B_FALSE); 127211042SErik.Nordmark@Sun.COM 127311042SErik.Nordmark@Sun.COM /* Need to return the ire with RTF_REJECT|BLACKHOLE */ 127411042SErik.Nordmark@Sun.COM if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) 127511042SErik.Nordmark@Sun.COM goto error; 127611042SErik.Nordmark@Sun.COM 127711042SErik.Nordmark@Sun.COM ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */ 127811042SErik.Nordmark@Sun.COM 127911042SErik.Nordmark@Sun.COM if (i != 0) { 128011131SErik.Nordmark@Sun.COM prefs[i] = ire_pref(ire); 128111042SErik.Nordmark@Sun.COM /* 128211042SErik.Nordmark@Sun.COM * Don't allow anything unusual past the first 128311042SErik.Nordmark@Sun.COM * iteration. 
128411042SErik.Nordmark@Sun.COM */ 128511042SErik.Nordmark@Sun.COM if ((ire->ire_type & 128611042SErik.Nordmark@Sun.COM (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) || 128711042SErik.Nordmark@Sun.COM prefs[i] <= prefs[i-1]) { 128811042SErik.Nordmark@Sun.COM ire_refrele(ire); 1289*11457SErik.Nordmark@Sun.COM if (irr_flags & IRR_INCOMPLETE) { 1290*11457SErik.Nordmark@Sun.COM ire = ires[0]; 1291*11457SErik.Nordmark@Sun.COM ire_refhold(ire); 1292*11457SErik.Nordmark@Sun.COM } else { 1293*11457SErik.Nordmark@Sun.COM ire = ire_reject(ipst, B_FALSE); 1294*11457SErik.Nordmark@Sun.COM } 129511042SErik.Nordmark@Sun.COM goto error; 129611042SErik.Nordmark@Sun.COM } 129711042SErik.Nordmark@Sun.COM } 129811042SErik.Nordmark@Sun.COM /* We have a usable IRE */ 129911042SErik.Nordmark@Sun.COM ires[i] = ire; 130011042SErik.Nordmark@Sun.COM generations[i] = generation; 130111042SErik.Nordmark@Sun.COM i++; 130211042SErik.Nordmark@Sun.COM 130311042SErik.Nordmark@Sun.COM /* The first RTF_SETSRC address is passed back if setsrcp */ 130411042SErik.Nordmark@Sun.COM if ((ire->ire_flags & RTF_SETSRC) && 130511042SErik.Nordmark@Sun.COM setsrcp != NULL && *setsrcp == INADDR_ANY) { 130611042SErik.Nordmark@Sun.COM ASSERT(ire->ire_setsrc_addr != INADDR_ANY); 130711042SErik.Nordmark@Sun.COM *setsrcp = ire->ire_setsrc_addr; 130811042SErik.Nordmark@Sun.COM } 130911042SErik.Nordmark@Sun.COM 131011042SErik.Nordmark@Sun.COM /* The first ire_gw_secattr is passed back if gwattrp */ 131111042SErik.Nordmark@Sun.COM if (ire->ire_gw_secattr != NULL && 131211042SErik.Nordmark@Sun.COM gwattrp != NULL && *gwattrp == NULL) 131311042SErik.Nordmark@Sun.COM *gwattrp = ire->ire_gw_secattr; 131411042SErik.Nordmark@Sun.COM 131511042SErik.Nordmark@Sun.COM /* 131611042SErik.Nordmark@Sun.COM * Check if we have a short-cut pointer to an IRE for this 131711042SErik.Nordmark@Sun.COM * destination, and that the cached dependency isn't stale. 
131811042SErik.Nordmark@Sun.COM * In that case we've rejoined an existing tree towards a 131911042SErik.Nordmark@Sun.COM * parent, thus we don't need to continue the loop to 132011042SErik.Nordmark@Sun.COM * discover the rest of the tree. 132111042SErik.Nordmark@Sun.COM */ 132211042SErik.Nordmark@Sun.COM mutex_enter(&ire->ire_lock); 132311042SErik.Nordmark@Sun.COM if (ire->ire_dep_parent != NULL && 132411042SErik.Nordmark@Sun.COM ire->ire_dep_parent->ire_generation == 132511042SErik.Nordmark@Sun.COM ire->ire_dep_parent_generation) { 132611042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 132711042SErik.Nordmark@Sun.COM ire = NULL; 132811042SErik.Nordmark@Sun.COM goto done; 132911042SErik.Nordmark@Sun.COM } 133011042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 133111042SErik.Nordmark@Sun.COM 133211042SErik.Nordmark@Sun.COM /* 133311042SErik.Nordmark@Sun.COM * If this type should have an ire_nce_cache (even if it 133411042SErik.Nordmark@Sun.COM * doesn't yet have one) then we are done. Includes 133511042SErik.Nordmark@Sun.COM * IRE_INTERFACE with a full 32 bit mask. 
133611042SErik.Nordmark@Sun.COM */ 133711042SErik.Nordmark@Sun.COM if (ire->ire_nce_capable) { 133811042SErik.Nordmark@Sun.COM ire = NULL; 133911042SErik.Nordmark@Sun.COM goto done; 134011042SErik.Nordmark@Sun.COM } 134111042SErik.Nordmark@Sun.COM ASSERT(!(ire->ire_type & IRE_IF_CLONE)); 134211042SErik.Nordmark@Sun.COM /* 134311042SErik.Nordmark@Sun.COM * For an IRE_INTERFACE we create an IRE_IF_CLONE for this 134411042SErik.Nordmark@Sun.COM * particular destination 134511042SErik.Nordmark@Sun.COM */ 134611042SErik.Nordmark@Sun.COM if (ire->ire_type & IRE_INTERFACE) { 134711042SErik.Nordmark@Sun.COM in6_addr_t v6nexthop; 134811042SErik.Nordmark@Sun.COM ire_t *clone; 134911042SErik.Nordmark@Sun.COM 135011042SErik.Nordmark@Sun.COM ASSERT(ire->ire_masklen != IPV4_ABITS); 135111042SErik.Nordmark@Sun.COM 135211042SErik.Nordmark@Sun.COM /* 135311042SErik.Nordmark@Sun.COM * In the case of ip_input and ILLF_FORWARDING not 1354*11457SErik.Nordmark@Sun.COM * being set, and in the case of RTM_GET, there is 1355*11457SErik.Nordmark@Sun.COM * no point in allocating an IRE_IF_CLONE. We return 1356*11457SErik.Nordmark@Sun.COM * the IRE_INTERFACE. Note that !IRR_ALLOCATE can 1357*11457SErik.Nordmark@Sun.COM * result in a ire_dep_parent which is IRE_IF_* 1358*11457SErik.Nordmark@Sun.COM * without an IRE_IF_CLONE. 135911042SErik.Nordmark@Sun.COM * We recover from that when we need to send packets 136011042SErik.Nordmark@Sun.COM * by ensuring that the generations become 136111042SErik.Nordmark@Sun.COM * IRE_GENERATION_VERIFY in this case. 
136211042SErik.Nordmark@Sun.COM */ 1363*11457SErik.Nordmark@Sun.COM if (!(irr_flags & IRR_ALLOCATE)) { 136411042SErik.Nordmark@Sun.COM invalidate = B_TRUE; 136511042SErik.Nordmark@Sun.COM ire = NULL; 136611042SErik.Nordmark@Sun.COM goto done; 136711042SErik.Nordmark@Sun.COM } 136811042SErik.Nordmark@Sun.COM 136911042SErik.Nordmark@Sun.COM IN6_IPADDR_TO_V4MAPPED(nexthop, &v6nexthop); 137011042SErik.Nordmark@Sun.COM 137111042SErik.Nordmark@Sun.COM clone = ire_create_if_clone(ire, &v6nexthop, 137211042SErik.Nordmark@Sun.COM &generation); 137311042SErik.Nordmark@Sun.COM if (clone == NULL) { 137411042SErik.Nordmark@Sun.COM /* 137511042SErik.Nordmark@Sun.COM * Temporary failure - no memory. 137611042SErik.Nordmark@Sun.COM * Don't want caller to cache IRE_NOROUTE. 137711042SErik.Nordmark@Sun.COM */ 137811042SErik.Nordmark@Sun.COM invalidate = B_TRUE; 137911042SErik.Nordmark@Sun.COM ire = ire_blackhole(ipst, B_FALSE); 138011042SErik.Nordmark@Sun.COM goto error; 138111042SErik.Nordmark@Sun.COM } 138211042SErik.Nordmark@Sun.COM /* 138311042SErik.Nordmark@Sun.COM * Make clone next to last entry and the 138411042SErik.Nordmark@Sun.COM * IRE_INTERFACE the last in the dependency 138511042SErik.Nordmark@Sun.COM * chain since the clone depends on the 138611042SErik.Nordmark@Sun.COM * IRE_INTERFACE. 
138711042SErik.Nordmark@Sun.COM */ 138811042SErik.Nordmark@Sun.COM ASSERT(i >= 1); 138911042SErik.Nordmark@Sun.COM ASSERT(i < MAX_IRE_RECURSION); 139011042SErik.Nordmark@Sun.COM 139111042SErik.Nordmark@Sun.COM ires[i] = ires[i-1]; 139211042SErik.Nordmark@Sun.COM generations[i] = generations[i-1]; 139311042SErik.Nordmark@Sun.COM ires[i-1] = clone; 139411042SErik.Nordmark@Sun.COM generations[i-1] = generation; 139511042SErik.Nordmark@Sun.COM i++; 139611042SErik.Nordmark@Sun.COM 139711042SErik.Nordmark@Sun.COM ire = NULL; 139811042SErik.Nordmark@Sun.COM goto done; 139911042SErik.Nordmark@Sun.COM } 140011042SErik.Nordmark@Sun.COM 140111042SErik.Nordmark@Sun.COM /* 140211042SErik.Nordmark@Sun.COM * We only match on the type and optionally ILL when 140311042SErik.Nordmark@Sun.COM * recursing. The type match is used by some callers 140411042SErik.Nordmark@Sun.COM * to exclude certain types (such as IRE_IF_CLONE or 140511042SErik.Nordmark@Sun.COM * IRE_LOCAL|IRE_LOOPBACK). 140611042SErik.Nordmark@Sun.COM */ 140711042SErik.Nordmark@Sun.COM match_args &= MATCH_IRE_TYPE; 140811042SErik.Nordmark@Sun.COM nexthop = ire->ire_gateway_addr; 140911042SErik.Nordmark@Sun.COM if (ill == NULL && ire->ire_ill != NULL) { 141011042SErik.Nordmark@Sun.COM ill = ire->ire_ill; 141111042SErik.Nordmark@Sun.COM need_refrele = B_TRUE; 141211042SErik.Nordmark@Sun.COM ill_refhold(ill); 141311042SErik.Nordmark@Sun.COM match_args |= MATCH_IRE_ILL; 141411042SErik.Nordmark@Sun.COM } 141511131SErik.Nordmark@Sun.COM /* 141611131SErik.Nordmark@Sun.COM * We set the prefs[i] value above if i > 0. We've already 141711131SErik.Nordmark@Sun.COM * done i++ so i is one in the case of the first time around. 
141811131SErik.Nordmark@Sun.COM */ 141911131SErik.Nordmark@Sun.COM if (i == 1) 142011131SErik.Nordmark@Sun.COM prefs[0] = ire_pref(ire); 142111042SErik.Nordmark@Sun.COM ire = NULL; 142211042SErik.Nordmark@Sun.COM } 142311042SErik.Nordmark@Sun.COM ASSERT(ire == NULL); 142411042SErik.Nordmark@Sun.COM ire = ire_reject(ipst, B_FALSE); 142511042SErik.Nordmark@Sun.COM 142611042SErik.Nordmark@Sun.COM error: 142711042SErik.Nordmark@Sun.COM ASSERT(ire != NULL); 142811042SErik.Nordmark@Sun.COM if (need_refrele) 142911042SErik.Nordmark@Sun.COM ill_refrele(ill); 143011042SErik.Nordmark@Sun.COM 143111042SErik.Nordmark@Sun.COM /* 143211042SErik.Nordmark@Sun.COM * In the case of MULTIRT we want to try a different IRE the next 143311042SErik.Nordmark@Sun.COM * time. We let the next packet retry in that case. 143411042SErik.Nordmark@Sun.COM */ 143511042SErik.Nordmark@Sun.COM if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT)) 143611042SErik.Nordmark@Sun.COM (void) ire_no_good(ires[0]); 143711042SErik.Nordmark@Sun.COM 143811042SErik.Nordmark@Sun.COM cleanup: 143911042SErik.Nordmark@Sun.COM /* cleanup ires[i] */ 144011042SErik.Nordmark@Sun.COM ire_dep_unbuild(ires, i); 144111042SErik.Nordmark@Sun.COM for (j = 0; j < i; j++) 144211042SErik.Nordmark@Sun.COM ire_refrele(ires[j]); 144311042SErik.Nordmark@Sun.COM 1444*11457SErik.Nordmark@Sun.COM ASSERT((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || 1445*11457SErik.Nordmark@Sun.COM (irr_flags & IRR_INCOMPLETE)); 144611042SErik.Nordmark@Sun.COM /* 144711042SErik.Nordmark@Sun.COM * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the 144811042SErik.Nordmark@Sun.COM * ip_select_route since the reject or lack of memory might be gone. 
144911042SErik.Nordmark@Sun.COM */ 145011042SErik.Nordmark@Sun.COM if (generationp != NULL) 145111042SErik.Nordmark@Sun.COM *generationp = IRE_GENERATION_VERIFY; 145211042SErik.Nordmark@Sun.COM return (ire); 145311042SErik.Nordmark@Sun.COM 145411042SErik.Nordmark@Sun.COM done: 145511042SErik.Nordmark@Sun.COM ASSERT(ire == NULL); 145611042SErik.Nordmark@Sun.COM if (need_refrele) { 145711042SErik.Nordmark@Sun.COM ill_refrele(ill); 145811042SErik.Nordmark@Sun.COM ill = NULL; 145911042SErik.Nordmark@Sun.COM } 146011042SErik.Nordmark@Sun.COM 146111042SErik.Nordmark@Sun.COM /* Build dependencies */ 146211131SErik.Nordmark@Sun.COM if (i > 1 && !ire_dep_build(ires, generations, i)) { 146311042SErik.Nordmark@Sun.COM /* Something in chain was condemned; tear it apart */ 146411042SErik.Nordmark@Sun.COM ire = ire_reject(ipst, B_FALSE); 146511042SErik.Nordmark@Sun.COM goto cleanup; 146611042SErik.Nordmark@Sun.COM } 146711042SErik.Nordmark@Sun.COM 146811042SErik.Nordmark@Sun.COM /* 146911042SErik.Nordmark@Sun.COM * Release all refholds except the one for ires[0] that we 147011042SErik.Nordmark@Sun.COM * will return to the caller. 147111042SErik.Nordmark@Sun.COM */ 147211042SErik.Nordmark@Sun.COM for (j = 1; j < i; j++) 147311042SErik.Nordmark@Sun.COM ire_refrele(ires[j]); 147411042SErik.Nordmark@Sun.COM 147511042SErik.Nordmark@Sun.COM if (invalidate) { 147611042SErik.Nordmark@Sun.COM /* 147711042SErik.Nordmark@Sun.COM * Since we needed to allocate but couldn't we need to make 147811042SErik.Nordmark@Sun.COM * sure that the dependency chain is rebuilt the next time. 
147911042SErik.Nordmark@Sun.COM */ 148011042SErik.Nordmark@Sun.COM ire_dep_invalidate_generations(ires[0]); 148111042SErik.Nordmark@Sun.COM generation = IRE_GENERATION_VERIFY; 148211042SErik.Nordmark@Sun.COM } else { 148311042SErik.Nordmark@Sun.COM /* 148411042SErik.Nordmark@Sun.COM * IREs can have been added or deleted while we did the 148511042SErik.Nordmark@Sun.COM * recursive lookup and we can't catch those until we've built 148611042SErik.Nordmark@Sun.COM * the dependencies. We verify the stored 148711042SErik.Nordmark@Sun.COM * ire_dep_parent_generation to catch any such changes and 148811042SErik.Nordmark@Sun.COM * return IRE_GENERATION_VERIFY (which will cause 148911042SErik.Nordmark@Sun.COM * ip_select_route to be called again so we can redo the 149011042SErik.Nordmark@Sun.COM * recursive lookup next time we send a packet. 149111042SErik.Nordmark@Sun.COM */ 149211131SErik.Nordmark@Sun.COM if (ires[0]->ire_dep_parent == NULL) 149311131SErik.Nordmark@Sun.COM generation = ires[0]->ire_generation; 149411131SErik.Nordmark@Sun.COM else 149511131SErik.Nordmark@Sun.COM generation = ire_dep_validate_generations(ires[0]); 149611042SErik.Nordmark@Sun.COM if (generations[0] != ires[0]->ire_generation) { 149711042SErik.Nordmark@Sun.COM /* Something changed at the top */ 149811042SErik.Nordmark@Sun.COM generation = IRE_GENERATION_VERIFY; 149911042SErik.Nordmark@Sun.COM } 150011042SErik.Nordmark@Sun.COM } 150111042SErik.Nordmark@Sun.COM if (generationp != NULL) 150211042SErik.Nordmark@Sun.COM *generationp = generation; 150311042SErik.Nordmark@Sun.COM 150411042SErik.Nordmark@Sun.COM return (ires[0]); 150511042SErik.Nordmark@Sun.COM } 150611042SErik.Nordmark@Sun.COM 150711042SErik.Nordmark@Sun.COM ire_t * 150811042SErik.Nordmark@Sun.COM ire_route_recursive_v4(ipaddr_t nexthop, uint_t ire_type, const ill_t *ill, 150911042SErik.Nordmark@Sun.COM zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, 1510*11457SErik.Nordmark@Sun.COM uint_t irr_flags, uint32_t xmit_hint, 
ip_stack_t *ipst, ipaddr_t *setsrcp, 151111042SErik.Nordmark@Sun.COM tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) 151211042SErik.Nordmark@Sun.COM { 151311042SErik.Nordmark@Sun.COM return (ire_route_recursive_impl_v4(NULL, nexthop, ire_type, ill, 1514*11457SErik.Nordmark@Sun.COM zoneid, tsl, match_args, irr_flags, xmit_hint, ipst, setsrcp, 151511042SErik.Nordmark@Sun.COM gwattrp, generationp)); 151611042SErik.Nordmark@Sun.COM } 151711042SErik.Nordmark@Sun.COM 151811042SErik.Nordmark@Sun.COM /* 151911042SErik.Nordmark@Sun.COM * Recursively look for a route to the destination. 152011042SErik.Nordmark@Sun.COM * We only handle a destination match here, yet we have the same arguments 152111042SErik.Nordmark@Sun.COM * as the full match to allow function pointers to select between the two. 152211042SErik.Nordmark@Sun.COM * 152311042SErik.Nordmark@Sun.COM * Note that this function never returns NULL. It returns an IRE_NOROUTE 152411042SErik.Nordmark@Sun.COM * instead. 152511042SErik.Nordmark@Sun.COM * 152611042SErik.Nordmark@Sun.COM * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it 152711042SErik.Nordmark@Sun.COM * is an error. 152811042SErik.Nordmark@Sun.COM * Allow at most one RTF_INDIRECT. 
152911042SErik.Nordmark@Sun.COM */ 153011042SErik.Nordmark@Sun.COM ire_t * 1531*11457SErik.Nordmark@Sun.COM ire_route_recursive_dstonly_v4(ipaddr_t nexthop, uint_t irr_flags, 153211042SErik.Nordmark@Sun.COM uint32_t xmit_hint, ip_stack_t *ipst) 153311042SErik.Nordmark@Sun.COM { 153411042SErik.Nordmark@Sun.COM ire_t *ire; 153511042SErik.Nordmark@Sun.COM ire_t *ire1; 153611042SErik.Nordmark@Sun.COM uint_t generation; 153711042SErik.Nordmark@Sun.COM 153811042SErik.Nordmark@Sun.COM /* ire_ftable_lookup handles round-robin/ECMP */ 153911042SErik.Nordmark@Sun.COM ire = ire_ftable_lookup_simple_v4(nexthop, xmit_hint, ipst, 154011042SErik.Nordmark@Sun.COM &generation); 154111042SErik.Nordmark@Sun.COM ASSERT(ire != NULL); 154211042SErik.Nordmark@Sun.COM 154311042SErik.Nordmark@Sun.COM /* 154411042SErik.Nordmark@Sun.COM * If this type should have an ire_nce_cache (even if it 154511042SErik.Nordmark@Sun.COM * doesn't yet have one) then we are done. Includes 154611042SErik.Nordmark@Sun.COM * IRE_INTERFACE with a full 32 bit mask. 154711042SErik.Nordmark@Sun.COM */ 154811042SErik.Nordmark@Sun.COM if (ire->ire_nce_capable) 154911042SErik.Nordmark@Sun.COM return (ire); 155011042SErik.Nordmark@Sun.COM 155111042SErik.Nordmark@Sun.COM /* 155211042SErik.Nordmark@Sun.COM * If the IRE has a current cached parent we know that the whole 155311042SErik.Nordmark@Sun.COM * parent chain is current, hence we don't need to discover and 155411042SErik.Nordmark@Sun.COM * build any dependencies by doing a recursive lookup. 
155511042SErik.Nordmark@Sun.COM */ 155611042SErik.Nordmark@Sun.COM mutex_enter(&ire->ire_lock); 155711042SErik.Nordmark@Sun.COM if (ire->ire_dep_parent != NULL && 155811042SErik.Nordmark@Sun.COM ire->ire_dep_parent->ire_generation == 155911042SErik.Nordmark@Sun.COM ire->ire_dep_parent_generation) { 156011042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 156111042SErik.Nordmark@Sun.COM return (ire); 156211042SErik.Nordmark@Sun.COM } 156311042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 156411042SErik.Nordmark@Sun.COM 156511042SErik.Nordmark@Sun.COM /* 156611042SErik.Nordmark@Sun.COM * Fallback to loop in the normal code starting with the ire 156711042SErik.Nordmark@Sun.COM * we found. Normally this would return the same ire. 156811042SErik.Nordmark@Sun.COM */ 156911042SErik.Nordmark@Sun.COM ire1 = ire_route_recursive_impl_v4(ire, nexthop, 0, NULL, ALL_ZONES, 1570*11457SErik.Nordmark@Sun.COM NULL, MATCH_IRE_DSTONLY, irr_flags, xmit_hint, ipst, NULL, NULL, 157111042SErik.Nordmark@Sun.COM &generation); 157211042SErik.Nordmark@Sun.COM ire_refrele(ire); 157311042SErik.Nordmark@Sun.COM return (ire1); 157411042SErik.Nordmark@Sun.COM } 1575