12535Ssangeeta /* 22535Ssangeeta * CDDL HEADER START 32535Ssangeeta * 42535Ssangeeta * The contents of this file are subject to the terms of the 52535Ssangeeta * Common Development and Distribution License (the "License"). 62535Ssangeeta * You may not use this file except in compliance with the License. 72535Ssangeeta * 82535Ssangeeta * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 92535Ssangeeta * or http://www.opensolaris.org/os/licensing. 102535Ssangeeta * See the License for the specific language governing permissions 112535Ssangeeta * and limitations under the License. 122535Ssangeeta * 132535Ssangeeta * When distributing Covered Code, include this CDDL HEADER in each 142535Ssangeeta * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 152535Ssangeeta * If applicable, add the following below this CDDL HEADER, with the 162535Ssangeeta * fields enclosed by brackets "[]" replaced with your own identifying 172535Ssangeeta * information: Portions Copyright [yyyy] [name of copyright owner] 182535Ssangeeta * 192535Ssangeeta * CDDL HEADER END 202535Ssangeeta */ 212535Ssangeeta /* 2211457SErik.Nordmark@Sun.COM * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 232535Ssangeeta * Use is subject to license terms. 242535Ssangeeta */ 252535Ssangeeta 262535Ssangeeta /* 272535Ssangeeta * This file contains consumer routines of the IPv4 forwarding engine 282535Ssangeeta */ 292535Ssangeeta 302535Ssangeeta #include <sys/types.h> 312535Ssangeeta #include <sys/stream.h> 322535Ssangeeta #include <sys/stropts.h> 332535Ssangeeta #include <sys/strlog.h> 342535Ssangeeta #include <sys/dlpi.h> 352535Ssangeeta #include <sys/ddi.h> 362535Ssangeeta #include <sys/cmn_err.h> 372535Ssangeeta #include <sys/policy.h> 382535Ssangeeta 392535Ssangeeta #include <sys/systm.h> 402535Ssangeeta #include <sys/strsun.h> 412535Ssangeeta #include <sys/kmem.h> 422535Ssangeeta #include <sys/param.h> 432535Ssangeeta #include <sys/socket.h> 444482Sdr146992 #include <sys/strsubr.h> 452535Ssangeeta #include <net/if.h> 462535Ssangeeta #include <net/route.h> 472535Ssangeeta #include <netinet/in.h> 482535Ssangeeta #include <net/if_dl.h> 492535Ssangeeta #include <netinet/ip6.h> 502535Ssangeeta #include <netinet/icmp6.h> 512535Ssangeeta 5211042SErik.Nordmark@Sun.COM #include <inet/ipsec_impl.h> 532535Ssangeeta #include <inet/common.h> 542535Ssangeeta #include <inet/mi.h> 552535Ssangeeta #include <inet/mib2.h> 562535Ssangeeta #include <inet/ip.h> 574482Sdr146992 #include <inet/ip_impl.h> 582535Ssangeeta #include <inet/ip6.h> 592535Ssangeeta #include <inet/ip_ndp.h> 602535Ssangeeta #include <inet/arp.h> 612535Ssangeeta #include <inet/ip_if.h> 622535Ssangeeta #include <inet/ip_ire.h> 632535Ssangeeta #include <inet/ip_ftable.h> 642535Ssangeeta #include <inet/ip_rts.h> 652535Ssangeeta #include <inet/nd.h> 662535Ssangeeta 672535Ssangeeta #include <net/pfkeyv2.h> 682535Ssangeeta #include <inet/sadb.h> 692535Ssangeeta #include <inet/tcp.h> 702535Ssangeeta #include <inet/ipclassifier.h> 712535Ssangeeta #include <sys/zone.h> 722535Ssangeeta #include <net/radix.h> 732535Ssangeeta #include <sys/tsol/label.h> 742535Ssangeeta #include <sys/tsol/tnet.h> 752535Ssangeeta 762535Ssangeeta #define IS_DEFAULT_ROUTE(ire) \ 772535Ssangeeta (((ire)->ire_type & IRE_DEFAULT) || \ 782535Ssangeeta (((ire)->ire_type & IRE_INTERFACE) && ((ire)->ire_addr == 0))) 792535Ssangeeta 80*11681SSowmini.Varadhan@Sun.COM #define IP_SRC_MULTIHOMING(isv6, ipst) \ 81*11681SSowmini.Varadhan@Sun.COM (isv6 ? ipst->ips_ipv6_strict_src_multihoming : \ 82*11681SSowmini.Varadhan@Sun.COM ipst->ips_ip_strict_src_multihoming) 83*11681SSowmini.Varadhan@Sun.COM 843448Sdh155122 static ire_t *route_to_dst(const struct sockaddr *, zoneid_t, ip_stack_t *); 8511042SErik.Nordmark@Sun.COM static void ire_del_host_redir(ire_t *, char *); 8611042SErik.Nordmark@Sun.COM static boolean_t ire_find_best_route(struct radix_node *, void *); 872535Ssangeeta 882535Ssangeeta /* 892535Ssangeeta * Lookup a route in forwarding table. A specific lookup is indicated by 902535Ssangeeta * passing the required parameters and indicating the match required in the 912535Ssangeeta * flag field. 922535Ssangeeta * 932535Ssangeeta * Supports IP_BOUND_IF by following the ipif/ill when recursing. 942535Ssangeeta */ 952535Ssangeeta ire_t * 9611042SErik.Nordmark@Sun.COM ire_ftable_lookup_v4(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, 9711042SErik.Nordmark@Sun.COM int type, const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, 9811042SErik.Nordmark@Sun.COM int flags, uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp) 992535Ssangeeta { 10011042SErik.Nordmark@Sun.COM ire_t *ire; 1012535Ssangeeta struct rt_sockaddr rdst, rmask; 1022535Ssangeeta struct rt_entry *rt; 1032535Ssangeeta ire_ftable_args_t margs; 1042535Ssangeeta 10511042SErik.Nordmark@Sun.COM ASSERT(ill == NULL || !ill->ill_isv6); 1062535Ssangeeta 1072535Ssangeeta /* 10811042SErik.Nordmark@Sun.COM * ire_match_args() will dereference ill if MATCH_IRE_ILL 10911042SErik.Nordmark@Sun.COM * is set. 1102535Ssangeeta */ 111*11681SSowmini.Varadhan@Sun.COM if ((flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL)) && (ill == NULL)) 1122535Ssangeeta return (NULL); 1132535Ssangeeta 11411131SErik.Nordmark@Sun.COM bzero(&rdst, sizeof (rdst)); 1152535Ssangeeta rdst.rt_sin_len = sizeof (rdst); 1162535Ssangeeta rdst.rt_sin_family = AF_INET; 1172535Ssangeeta rdst.rt_sin_addr.s_addr = addr; 1182535Ssangeeta 11911131SErik.Nordmark@Sun.COM bzero(&rmask, sizeof (rmask)); 1202535Ssangeeta rmask.rt_sin_len = sizeof (rmask); 1212535Ssangeeta rmask.rt_sin_family = AF_INET; 1222535Ssangeeta rmask.rt_sin_addr.s_addr = mask; 1232535Ssangeeta 12411131SErik.Nordmark@Sun.COM bzero(&margs, sizeof (margs)); 1252535Ssangeeta margs.ift_addr = addr; 1262535Ssangeeta margs.ift_mask = mask; 1272535Ssangeeta margs.ift_gateway = gateway; 1282535Ssangeeta margs.ift_type = type; 12911042SErik.Nordmark@Sun.COM margs.ift_ill = ill; 1302535Ssangeeta margs.ift_zoneid = zoneid; 1312535Ssangeeta margs.ift_tsl = tsl; 1322535Ssangeeta margs.ift_flags = flags; 1332535Ssangeeta 1342535Ssangeeta /* 1352535Ssangeeta * The flags argument passed to ire_ftable_lookup may cause the 1362535Ssangeeta * search to return, not the longest matching prefix, but the 1372535Ssangeeta * "best matching prefix", i.e., the longest prefix that also 1382535Ssangeeta * satisfies constraints imposed via the permutation of flags 1392535Ssangeeta * passed in. To achieve this, we invoke ire_match_args() on 1402535Ssangeeta * each matching leaf in the radix tree. ire_match_args is 1412535Ssangeeta * invoked by the callback function ire_find_best_route() 1422535Ssangeeta * We hold the global tree lock in read mode when calling 14311042SErik.Nordmark@Sun.COM * rn_match_args. Before dropping the global tree lock, ensure 1442535Ssangeeta * that the radix node can't be deleted by incrementing ire_refcnt. 1452535Ssangeeta */ 1463448Sdh155122 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 1473448Sdh155122 rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst, 1483448Sdh155122 ipst->ips_ip_ftable, ire_find_best_route, &margs); 1492535Ssangeeta ire = margs.ift_best_ire; 1502535Ssangeeta if (rt == NULL) { 15111042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 1522535Ssangeeta return (NULL); 1532535Ssangeeta } 15411042SErik.Nordmark@Sun.COM ASSERT(ire != NULL); 1552535Ssangeeta 1562535Ssangeeta DTRACE_PROBE2(ire__found, ire_ftable_args_t *, &margs, ire_t *, ire); 1572535Ssangeeta 1582535Ssangeeta /* 1592535Ssangeeta * round-robin only if we have more than one route in the bucket. 16011042SErik.Nordmark@Sun.COM * ips_ip_ecmp_behavior controls when we do ECMP 16111042SErik.Nordmark@Sun.COM * 2: always 16211042SErik.Nordmark@Sun.COM * 1: for IRE_DEFAULT and /0 IRE_INTERFACE 16311042SErik.Nordmark@Sun.COM * 0: never 1642535Ssangeeta */ 16511042SErik.Nordmark@Sun.COM if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) { 16611042SErik.Nordmark@Sun.COM if (ipst->ips_ip_ecmp_behavior == 2 || 16711042SErik.Nordmark@Sun.COM (ipst->ips_ip_ecmp_behavior == 1 && 16811042SErik.Nordmark@Sun.COM IS_DEFAULT_ROUTE(ire))) { 16911042SErik.Nordmark@Sun.COM ire_t *next_ire; 1702535Ssangeeta 17111042SErik.Nordmark@Sun.COM margs.ift_best_ire = NULL; 17211042SErik.Nordmark@Sun.COM next_ire = ire_round_robin(ire->ire_bucket, &margs, 17311042SErik.Nordmark@Sun.COM xmit_hint, ire, ipst); 17411042SErik.Nordmark@Sun.COM if (next_ire == NULL) { 17511042SErik.Nordmark@Sun.COM /* keep ire if next_ire is null */ 17611042SErik.Nordmark@Sun.COM goto done; 17711042SErik.Nordmark@Sun.COM } 17811042SErik.Nordmark@Sun.COM ire_refrele(ire); 1792535Ssangeeta ire = next_ire; 1802535Ssangeeta } 1812535Ssangeeta } 1822535Ssangeeta 18311042SErik.Nordmark@Sun.COM done: 18411042SErik.Nordmark@Sun.COM /* Return generation before dropping lock */ 18511042SErik.Nordmark@Sun.COM if (generationp != NULL) 18611042SErik.Nordmark@Sun.COM *generationp = ire->ire_generation; 1872535Ssangeeta 18811042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 1898485SPeter.Memishian@Sun.COM 19011042SErik.Nordmark@Sun.COM /* 19111042SErik.Nordmark@Sun.COM * For shared-IP zones we need additional checks to what was 19211042SErik.Nordmark@Sun.COM * done in ire_match_args to make sure IRE_LOCALs are handled. 19311042SErik.Nordmark@Sun.COM * 19411042SErik.Nordmark@Sun.COM * When ip_restrict_interzone_loopback is set, then 19511042SErik.Nordmark@Sun.COM * we ensure that IRE_LOCAL are only used for loopback 19611042SErik.Nordmark@Sun.COM * between zones when the logical "Ethernet" would 19711042SErik.Nordmark@Sun.COM * have looped them back. That is, if in the absense of 19811042SErik.Nordmark@Sun.COM * the IRE_LOCAL we would have sent to packet out the 19911042SErik.Nordmark@Sun.COM * same ill. 20011042SErik.Nordmark@Sun.COM */ 20111042SErik.Nordmark@Sun.COM if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES && 20211042SErik.Nordmark@Sun.COM ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES && 20311042SErik.Nordmark@Sun.COM ipst->ips_ip_restrict_interzone_loopback) { 20411042SErik.Nordmark@Sun.COM ire = ire_alt_local(ire, zoneid, tsl, ill, generationp); 20511042SErik.Nordmark@Sun.COM ASSERT(ire != NULL); 2062535Ssangeeta } 2072535Ssangeeta return (ire); 2082535Ssangeeta } 2092535Ssangeeta 2108275SEric Cheng /* 2118275SEric Cheng * This function is called by 21211042SErik.Nordmark@Sun.COM * ip_input/ire_route_recursive when doing a route lookup on only the 21311042SErik.Nordmark@Sun.COM * destination address. 21411042SErik.Nordmark@Sun.COM * 2158275SEric Cheng * The optimizations of this function over ire_ftable_lookup are: 2168275SEric Cheng * o removing unnecessary flag matching 2178275SEric Cheng * o doing longest prefix match instead of overloading it further 2188275SEric Cheng * with the unnecessary "best_prefix_match" 21911042SErik.Nordmark@Sun.COM * 22011042SErik.Nordmark@Sun.COM * If no route is found we return IRE_NOROUTE. 2218275SEric Cheng */ 22211042SErik.Nordmark@Sun.COM ire_t * 22311042SErik.Nordmark@Sun.COM ire_ftable_lookup_simple_v4(ipaddr_t addr, uint32_t xmit_hint, ip_stack_t *ipst, 22411042SErik.Nordmark@Sun.COM uint_t *generationp) 2258275SEric Cheng { 22611042SErik.Nordmark@Sun.COM ire_t *ire; 2278275SEric Cheng struct rt_sockaddr rdst; 2288275SEric Cheng struct rt_entry *rt; 22911042SErik.Nordmark@Sun.COM irb_t *irb; 2308275SEric Cheng 2318275SEric Cheng rdst.rt_sin_len = sizeof (rdst); 2328275SEric Cheng rdst.rt_sin_family = AF_INET; 2338275SEric Cheng rdst.rt_sin_addr.s_addr = addr; 2348275SEric Cheng 2358275SEric Cheng /* 2368275SEric Cheng * This is basically inlining a simpler version of ire_match_args 2378275SEric Cheng */ 2388275SEric Cheng RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 2398275SEric Cheng 2408275SEric Cheng rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst, 2418275SEric Cheng ipst->ips_ip_ftable, NULL, NULL); 2428275SEric Cheng 24311042SErik.Nordmark@Sun.COM if (rt == NULL) 24411042SErik.Nordmark@Sun.COM goto bad; 24511042SErik.Nordmark@Sun.COM 24611042SErik.Nordmark@Sun.COM irb = &rt->rt_irb; 24711042SErik.Nordmark@Sun.COM if (irb->irb_ire_cnt == 0) 24811042SErik.Nordmark@Sun.COM goto bad; 24911042SErik.Nordmark@Sun.COM 25011042SErik.Nordmark@Sun.COM rw_enter(&irb->irb_lock, RW_READER); 25111042SErik.Nordmark@Sun.COM ire = irb->irb_ire; 25211042SErik.Nordmark@Sun.COM if (ire == NULL) { 25311042SErik.Nordmark@Sun.COM rw_exit(&irb->irb_lock); 25411042SErik.Nordmark@Sun.COM goto bad; 2558275SEric Cheng } 25611042SErik.Nordmark@Sun.COM while (IRE_IS_CONDEMNED(ire)) { 25711042SErik.Nordmark@Sun.COM ire = ire->ire_next; 25811042SErik.Nordmark@Sun.COM if (ire == NULL) { 25911042SErik.Nordmark@Sun.COM rw_exit(&irb->irb_lock); 26011042SErik.Nordmark@Sun.COM goto bad; 26111042SErik.Nordmark@Sun.COM } 2628275SEric Cheng } 2638275SEric Cheng 2648275SEric Cheng /* we have a ire that matches */ 26511042SErik.Nordmark@Sun.COM ire_refhold(ire); 26611042SErik.Nordmark@Sun.COM rw_exit(&irb->irb_lock); 2678275SEric Cheng 2688275SEric Cheng /* 26911042SErik.Nordmark@Sun.COM * round-robin only if we have more than one route in the bucket. 27011042SErik.Nordmark@Sun.COM * ips_ip_ecmp_behavior controls when we do ECMP 27111042SErik.Nordmark@Sun.COM * 2: always 27211042SErik.Nordmark@Sun.COM * 1: for IRE_DEFAULT and /0 IRE_INTERFACE 27311042SErik.Nordmark@Sun.COM * 0: never 2748275SEric Cheng * 27511042SErik.Nordmark@Sun.COM * Note: if we found an IRE_IF_CLONE we won't look at the bucket with 27611042SErik.Nordmark@Sun.COM * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match 27711042SErik.Nordmark@Sun.COM * and the IRE_INTERFACESs are likely to be shorter matches. 2788275SEric Cheng */ 27911042SErik.Nordmark@Sun.COM if (ire->ire_bucket->irb_ire_cnt > 1) { 28011042SErik.Nordmark@Sun.COM if (ipst->ips_ip_ecmp_behavior == 2 || 28111042SErik.Nordmark@Sun.COM (ipst->ips_ip_ecmp_behavior == 1 && 28211042SErik.Nordmark@Sun.COM IS_DEFAULT_ROUTE(ire))) { 28311042SErik.Nordmark@Sun.COM ire_t *next_ire; 28411042SErik.Nordmark@Sun.COM ire_ftable_args_t margs; 2858275SEric Cheng 28611131SErik.Nordmark@Sun.COM bzero(&margs, sizeof (margs)); 28711042SErik.Nordmark@Sun.COM margs.ift_addr = addr; 28811042SErik.Nordmark@Sun.COM margs.ift_zoneid = ALL_ZONES; 28911042SErik.Nordmark@Sun.COM 29011042SErik.Nordmark@Sun.COM next_ire = ire_round_robin(ire->ire_bucket, &margs, 29111042SErik.Nordmark@Sun.COM xmit_hint, ire, ipst); 29211042SErik.Nordmark@Sun.COM if (next_ire == NULL) { 29311042SErik.Nordmark@Sun.COM /* keep ire if next_ire is null */ 29411042SErik.Nordmark@Sun.COM if (generationp != NULL) 29511042SErik.Nordmark@Sun.COM *generationp = ire->ire_generation; 29611042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 29711042SErik.Nordmark@Sun.COM return (ire); 29811042SErik.Nordmark@Sun.COM } 29911042SErik.Nordmark@Sun.COM ire_refrele(ire); 30011042SErik.Nordmark@Sun.COM ire = next_ire; 3018275SEric Cheng } 3028275SEric Cheng } 30311042SErik.Nordmark@Sun.COM /* Return generation before dropping lock */ 30411042SErik.Nordmark@Sun.COM if (generationp != NULL) 30511042SErik.Nordmark@Sun.COM *generationp = ire->ire_generation; 30611042SErik.Nordmark@Sun.COM 30711042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 3088275SEric Cheng 30911042SErik.Nordmark@Sun.COM /* 31011042SErik.Nordmark@Sun.COM * Since we only did ALL_ZONES matches there is no special handling 31111042SErik.Nordmark@Sun.COM * of IRE_LOCALs needed here. ire_ftable_lookup_v4 has to handle that. 31211042SErik.Nordmark@Sun.COM */ 3138275SEric Cheng return (ire); 31411042SErik.Nordmark@Sun.COM 31511042SErik.Nordmark@Sun.COM bad: 31611042SErik.Nordmark@Sun.COM if (generationp != NULL) 31711042SErik.Nordmark@Sun.COM *generationp = IRE_GENERATION_VERIFY; 31811042SErik.Nordmark@Sun.COM 31911042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 32011042SErik.Nordmark@Sun.COM return (ire_reject(ipst, B_FALSE)); 3218275SEric Cheng } 3222535Ssangeeta 3232535Ssangeeta /* 32411042SErik.Nordmark@Sun.COM * Find the ill matching a multicast group. 3252535Ssangeeta * Allows different routes for multicast addresses 3262535Ssangeeta * in the unicast routing table (akin to 224.0.0.0 but could be more specific) 3272535Ssangeeta * which point at different interfaces. This is used when IP_MULTICAST_IF 3282535Ssangeeta * isn't specified (when sending) and when IP_ADD_MEMBERSHIP doesn't 3292535Ssangeeta * specify the interface to join on. 3302535Ssangeeta * 33111042SErik.Nordmark@Sun.COM * Supports link-local addresses by using ire_route_recursive which follows 33211042SErik.Nordmark@Sun.COM * the ill when recursing. 33311042SErik.Nordmark@Sun.COM * 33411042SErik.Nordmark@Sun.COM * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group 33511042SErik.Nordmark@Sun.COM * and the MULTIRT property can be different for different groups, we 33611042SErik.Nordmark@Sun.COM * extract RTF_MULTIRT from the special unicast route added for a group 33711042SErik.Nordmark@Sun.COM * with CGTP and pass that back in the multirtp argument. 33811042SErik.Nordmark@Sun.COM * This is used in ip_set_destination etc to set ixa_postfragfn for multicast. 33911042SErik.Nordmark@Sun.COM * We have a setsrcp argument for the same reason. 3402535Ssangeeta */ 34111042SErik.Nordmark@Sun.COM ill_t * 34211042SErik.Nordmark@Sun.COM ire_lookup_multi_ill_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst, 34311042SErik.Nordmark@Sun.COM boolean_t *multirtp, ipaddr_t *setsrcp) 3442535Ssangeeta { 3452535Ssangeeta ire_t *ire; 34611042SErik.Nordmark@Sun.COM ill_t *ill; 3472535Ssangeeta 34811042SErik.Nordmark@Sun.COM ire = ire_route_recursive_v4(group, 0, NULL, zoneid, NULL, 34911457SErik.Nordmark@Sun.COM MATCH_IRE_DSTONLY, IRR_NONE, 0, ipst, setsrcp, NULL, NULL); 35011042SErik.Nordmark@Sun.COM ASSERT(ire != NULL); 35111042SErik.Nordmark@Sun.COM if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 3522535Ssangeeta ire_refrele(ire); 3532535Ssangeeta return (NULL); 3542535Ssangeeta } 35511042SErik.Nordmark@Sun.COM 35611042SErik.Nordmark@Sun.COM if (multirtp != NULL) 35711042SErik.Nordmark@Sun.COM *multirtp = (ire->ire_flags & RTF_MULTIRT) != 0; 35811042SErik.Nordmark@Sun.COM 35911042SErik.Nordmark@Sun.COM ill = ire_nexthop_ill(ire); 36011042SErik.Nordmark@Sun.COM ire_refrele(ire); 36111042SErik.Nordmark@Sun.COM return (ill); 3622535Ssangeeta } 3632535Ssangeeta 3642535Ssangeeta /* 3652535Ssangeeta * Delete the passed in ire if the gateway addr matches 3662535Ssangeeta */ 3672535Ssangeeta void 3682535Ssangeeta ire_del_host_redir(ire_t *ire, char *gateway) 3692535Ssangeeta { 3703004Sdd193516 if ((ire->ire_flags & RTF_DYNAMIC) && 3712535Ssangeeta (ire->ire_gateway_addr == *(ipaddr_t *)gateway)) 3722535Ssangeeta ire_delete(ire); 3732535Ssangeeta } 3742535Ssangeeta 3752535Ssangeeta /* 37611042SErik.Nordmark@Sun.COM * Search for all IRE_HOST RTF_DYNAMIC (aka redirect) routes that are 3772535Ssangeeta * pointing at the specified gateway and 3782535Ssangeeta * delete them. This routine is called only 3792535Ssangeeta * when a default gateway is going away. 3802535Ssangeeta */ 3812535Ssangeeta void 3823448Sdh155122 ire_delete_host_redirects(ipaddr_t gateway, ip_stack_t *ipst) 3832535Ssangeeta { 3842535Ssangeeta struct rtfuncarg rtfarg; 3852535Ssangeeta 38611131SErik.Nordmark@Sun.COM bzero(&rtfarg, sizeof (rtfarg)); 3872535Ssangeeta rtfarg.rt_func = ire_del_host_redir; 3882535Ssangeeta rtfarg.rt_arg = (void *)&gateway; 38911131SErik.Nordmark@Sun.COM rtfarg.rt_zoneid = ALL_ZONES; 39011131SErik.Nordmark@Sun.COM rtfarg.rt_ipst = ipst; 3913448Sdh155122 (void) ipst->ips_ip_ftable->rnh_walktree_mt(ipst->ips_ip_ftable, 3923448Sdh155122 rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn); 3932535Ssangeeta } 3942535Ssangeeta 3952535Ssangeeta /* 3963448Sdh155122 * Obtain the rt_entry and rt_irb for the route to be added to 3973448Sdh155122 * the ips_ip_ftable. 3982535Ssangeeta * First attempt to add a node to the radix tree via rn_addroute. If the 3992535Ssangeeta * route already exists, return the bucket for the existing route. 4002535Ssangeeta * 4012535Ssangeeta * Locking notes: Need to hold the global radix tree lock in write mode to 4022535Ssangeeta * add a radix node. To prevent the node from being deleted, ire_get_bucket() 4032535Ssangeeta * returns with a ref'ed irb_t. The ire itself is added in ire_add_v4() 4042535Ssangeeta * while holding the irb_lock, but not the radix tree lock. 4052535Ssangeeta */ 4062535Ssangeeta irb_t * 4072535Ssangeeta ire_get_bucket(ire_t *ire) 4082535Ssangeeta { 4092535Ssangeeta struct radix_node *rn; 4102535Ssangeeta struct rt_entry *rt; 4112535Ssangeeta struct rt_sockaddr rmask, rdst; 4122535Ssangeeta irb_t *irb = NULL; 4133448Sdh155122 ip_stack_t *ipst = ire->ire_ipst; 4142535Ssangeeta 4153448Sdh155122 ASSERT(ipst->ips_ip_ftable != NULL); 4162535Ssangeeta 4172535Ssangeeta /* first try to see if route exists (based on rtalloc1) */ 41811131SErik.Nordmark@Sun.COM bzero(&rdst, sizeof (rdst)); 4192535Ssangeeta rdst.rt_sin_len = sizeof (rdst); 4202535Ssangeeta rdst.rt_sin_family = AF_INET; 4212535Ssangeeta rdst.rt_sin_addr.s_addr = ire->ire_addr; 4222535Ssangeeta 42311131SErik.Nordmark@Sun.COM bzero(&rmask, sizeof (rmask)); 4242535Ssangeeta rmask.rt_sin_len = sizeof (rmask); 4252535Ssangeeta rmask.rt_sin_family = AF_INET; 4262535Ssangeeta rmask.rt_sin_addr.s_addr = ire->ire_mask; 4272535Ssangeeta 4282535Ssangeeta /* 4292535Ssangeeta * add the route. based on BSD's rtrequest1(RTM_ADD) 4302535Ssangeeta */ 4312535Ssangeeta R_Malloc(rt, rt_entry_cache, sizeof (*rt)); 4325090Ssangeeta /* kmem_alloc failed */ 4335090Ssangeeta if (rt == NULL) 4345090Ssangeeta return (NULL); 4355090Ssangeeta 43611131SErik.Nordmark@Sun.COM bzero(rt, sizeof (*rt)); 4372535Ssangeeta rt->rt_nodes->rn_key = (char *)&rt->rt_dst; 4382535Ssangeeta rt->rt_dst = rdst; 4392535Ssangeeta irb = &rt->rt_irb; 44011042SErik.Nordmark@Sun.COM irb->irb_marks |= IRB_MARK_DYNAMIC; /* dynamically allocated/freed */ 4413448Sdh155122 irb->irb_ipst = ipst; 4422535Ssangeeta rw_init(&irb->irb_lock, NULL, RW_DEFAULT, NULL); 4433448Sdh155122 RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable); 4443448Sdh155122 rn = ipst->ips_ip_ftable->rnh_addaddr(&rt->rt_dst, &rmask, 4453448Sdh155122 ipst->ips_ip_ftable, (struct radix_node *)rt); 4462535Ssangeeta if (rn == NULL) { 4473448Sdh155122 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 4482535Ssangeeta Free(rt, rt_entry_cache); 4492535Ssangeeta rt = NULL; 4502535Ssangeeta irb = NULL; 4513448Sdh155122 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 4523448Sdh155122 rn = ipst->ips_ip_ftable->rnh_lookup(&rdst, &rmask, 4533448Sdh155122 ipst->ips_ip_ftable); 4543448Sdh155122 if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { 4552535Ssangeeta /* found a non-root match */ 4562535Ssangeeta rt = (struct rt_entry *)rn; 4572535Ssangeeta } 4582535Ssangeeta } 4592535Ssangeeta if (rt != NULL) { 4602535Ssangeeta irb = &rt->rt_irb; 46111042SErik.Nordmark@Sun.COM irb_refhold(irb); 4622535Ssangeeta } 4633448Sdh155122 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 4642535Ssangeeta return (irb); 4652535Ssangeeta } 4662535Ssangeeta 4672535Ssangeeta /* 4682535Ssangeeta * This function is used when the caller wants to know the outbound 4692535Ssangeeta * interface for a packet given only the address. 4702535Ssangeeta * If this is a offlink IP address and there are multiple 4712535Ssangeeta * routes to this destination, this routine will utilise the 4722535Ssangeeta * first route it finds to IP address 4732535Ssangeeta * Return values: 4742535Ssangeeta * 0 - FAILURE 4752535Ssangeeta * nonzero - ifindex 4762535Ssangeeta */ 4772535Ssangeeta uint_t 4782535Ssangeeta ifindex_lookup(const struct sockaddr *ipaddr, zoneid_t zoneid) 4792535Ssangeeta { 4802535Ssangeeta uint_t ifindex = 0; 4812535Ssangeeta ire_t *ire; 4822535Ssangeeta ill_t *ill; 4833448Sdh155122 netstack_t *ns; 4843448Sdh155122 ip_stack_t *ipst; 4852535Ssangeeta 4863448Sdh155122 if (zoneid == ALL_ZONES) 4873448Sdh155122 ns = netstack_find_by_zoneid(GLOBAL_ZONEID); 4883448Sdh155122 else 4893448Sdh155122 ns = netstack_find_by_zoneid(zoneid); 4903448Sdh155122 ASSERT(ns != NULL); 4913448Sdh155122 4923448Sdh155122 /* 4933448Sdh155122 * For exclusive stacks we set the zoneid to zero 4943448Sdh155122 * since IP uses the global zoneid in the exclusive stacks. 4953448Sdh155122 */ 4963448Sdh155122 if (ns->netstack_stackid != GLOBAL_NETSTACKID) 4973448Sdh155122 zoneid = GLOBAL_ZONEID; 4983448Sdh155122 ipst = ns->netstack_ip; 4992535Ssangeeta 5002535Ssangeeta ASSERT(ipaddr->sa_family == AF_INET || ipaddr->sa_family == AF_INET6); 5012535Ssangeeta 50211042SErik.Nordmark@Sun.COM if ((ire = route_to_dst(ipaddr, zoneid, ipst)) != NULL) { 50311042SErik.Nordmark@Sun.COM ill = ire_nexthop_ill(ire); 50411042SErik.Nordmark@Sun.COM if (ill != NULL) { 5052535Ssangeeta ifindex = ill->ill_phyint->phyint_ifindex; 50611042SErik.Nordmark@Sun.COM ill_refrele(ill); 50711042SErik.Nordmark@Sun.COM } 5082535Ssangeeta ire_refrele(ire); 5092535Ssangeeta } 5103448Sdh155122 netstack_rele(ns); 5112535Ssangeeta return (ifindex); 5122535Ssangeeta } 5132535Ssangeeta 5142535Ssangeeta /* 5152535Ssangeeta * Routine to find the route to a destination. If a ifindex is supplied 51611042SErik.Nordmark@Sun.COM * it tries to match the route to the corresponding ipif for the ifindex 5172535Ssangeeta */ 5182535Ssangeeta static ire_t * 5193448Sdh155122 route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst) 5202535Ssangeeta { 5212535Ssangeeta ire_t *ire = NULL; 5222535Ssangeeta int match_flags; 5232535Ssangeeta 52411042SErik.Nordmark@Sun.COM match_flags = MATCH_IRE_DSTONLY; 5252535Ssangeeta 5262535Ssangeeta /* XXX pass NULL tsl for now */ 5272535Ssangeeta 5282535Ssangeeta if (dst_addr->sa_family == AF_INET) { 52911042SErik.Nordmark@Sun.COM ire = ire_route_recursive_v4( 53011042SErik.Nordmark@Sun.COM ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr, 0, NULL, 53111457SErik.Nordmark@Sun.COM zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL, 53211457SErik.Nordmark@Sun.COM NULL, NULL); 5332535Ssangeeta } else { 53411042SErik.Nordmark@Sun.COM ire = ire_route_recursive_v6( 53511042SErik.Nordmark@Sun.COM &((struct sockaddr_in6 *)dst_addr)->sin6_addr, 0, NULL, 53611457SErik.Nordmark@Sun.COM zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL, 53711457SErik.Nordmark@Sun.COM NULL, NULL); 53811042SErik.Nordmark@Sun.COM } 53911042SErik.Nordmark@Sun.COM ASSERT(ire != NULL); 54011042SErik.Nordmark@Sun.COM if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 54111042SErik.Nordmark@Sun.COM ire_refrele(ire); 54211042SErik.Nordmark@Sun.COM return (NULL); 5432535Ssangeeta } 5442535Ssangeeta return (ire); 5452535Ssangeeta } 5462535Ssangeeta 5472535Ssangeeta /* 5482535Ssangeeta * This routine is called by IP Filter to send a packet out on the wire 54911042SErik.Nordmark@Sun.COM * to a specified dstination (which may be onlink or offlink). The ifindex may 55011042SErik.Nordmark@Sun.COM * or may not be 0. A non-null ifindex indicates IP Filter has stipulated 5512535Ssangeeta * an outgoing interface and requires the nexthop to be on that interface. 5524482Sdr146992 * IP WILL NOT DO the following to the data packet before sending it out: 5532535Ssangeeta * a. manipulate ttl 5544482Sdr146992 * b. ipsec work 5554482Sdr146992 * c. fragmentation 5564482Sdr146992 * 5574482Sdr146992 * If the packet has been prepared for hardware checksum then it will be 5584482Sdr146992 * passed off to ip_send_align_cksum() to check that the flags set on the 5594482Sdr146992 * packet are in alignment with the capabilities of the new outgoing NIC. 5602535Ssangeeta * 5612535Ssangeeta * Return values: 5622535Ssangeeta * 0: IP was able to send of the data pkt 5632535Ssangeeta * ECOMM: Could not send packet 5642535Ssangeeta * ENONET No route to dst. It is up to the caller 5652535Ssangeeta * to send icmp unreachable error message, 5662535Ssangeeta * EINPROGRESS The macaddr of the onlink dst or that 5672535Ssangeeta * of the offlink dst's nexthop needs to get 5682535Ssangeeta * resolved before packet can be sent to dst. 5692535Ssangeeta * Thus transmission is not guaranteed. 57011042SErik.Nordmark@Sun.COM * Note: No longer have visibility to the ARP queue 57111042SErik.Nordmark@Sun.COM * hence no EINPROGRESS. 5722535Ssangeeta */ 5732535Ssangeeta int 5742535Ssangeeta ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex, 5752535Ssangeeta zoneid_t zoneid) 5762535Ssangeeta { 57711042SErik.Nordmark@Sun.COM ipaddr_t nexthop; 5783448Sdh155122 netstack_t *ns; 5793448Sdh155122 ip_stack_t *ipst; 58011042SErik.Nordmark@Sun.COM ip_xmit_attr_t ixas; 58111042SErik.Nordmark@Sun.COM int error; 5822535Ssangeeta 5832535Ssangeeta ASSERT(mp != NULL); 5842535Ssangeeta 5853448Sdh155122 if (zoneid == ALL_ZONES) 5863448Sdh155122 ns = netstack_find_by_zoneid(GLOBAL_ZONEID); 5873448Sdh155122 else 5883448Sdh155122 ns = netstack_find_by_zoneid(zoneid); 5893448Sdh155122 ASSERT(ns != NULL); 5903448Sdh155122 5913448Sdh155122 /* 5923448Sdh155122 * For exclusive stacks we set the zoneid to zero 5933448Sdh155122 * since IP uses the global zoneid in the exclusive stacks. 5943448Sdh155122 */ 5953448Sdh155122 if (ns->netstack_stackid != GLOBAL_NETSTACKID) 5963448Sdh155122 zoneid = GLOBAL_ZONEID; 5973448Sdh155122 ipst = ns->netstack_ip; 5983448Sdh155122 5992535Ssangeeta ASSERT(dst_addr->sa_family == AF_INET || 6002535Ssangeeta dst_addr->sa_family == AF_INET6); 6012535Ssangeeta 60211042SErik.Nordmark@Sun.COM bzero(&ixas, sizeof (ixas)); 6032535Ssangeeta /* 60411042SErik.Nordmark@Sun.COM * No IPsec, no fragmentation, and don't let any hooks see 60511042SErik.Nordmark@Sun.COM * the packet. 6062535Ssangeeta */ 60711042SErik.Nordmark@Sun.COM ixas.ixa_flags = IXAF_NO_IPSEC | IXAF_DONTFRAG | IXAF_NO_PFHOOK; 60811042SErik.Nordmark@Sun.COM ixas.ixa_cred = kcred; 60911042SErik.Nordmark@Sun.COM ixas.ixa_cpid = NOPID; 61011042SErik.Nordmark@Sun.COM ixas.ixa_tsl = NULL; 61111042SErik.Nordmark@Sun.COM ixas.ixa_ipst = ipst; 61211042SErik.Nordmark@Sun.COM ixas.ixa_ifindex = ifindex; 6132535Ssangeeta 61411042SErik.Nordmark@Sun.COM if (dst_addr->sa_family == AF_INET) { 61511042SErik.Nordmark@Sun.COM ipha_t *ipha = (ipha_t *)mp->b_rptr; 6164482Sdr146992 61711042SErik.Nordmark@Sun.COM ixas.ixa_flags |= IXAF_IS_IPV4; 61811042SErik.Nordmark@Sun.COM nexthop = ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr; 61911042SErik.Nordmark@Sun.COM if (nexthop != ipha->ipha_dst) { 62011042SErik.Nordmark@Sun.COM ixas.ixa_flags |= IXAF_NEXTHOP_SET; 62111042SErik.Nordmark@Sun.COM ixas.ixa_nexthop_v4 = nexthop; 6222535Ssangeeta } 62311042SErik.Nordmark@Sun.COM ixas.ixa_multicast_ttl = ipha->ipha_ttl; 62411042SErik.Nordmark@Sun.COM } else { 62511042SErik.Nordmark@Sun.COM ip6_t *ip6h = (ip6_t *)mp->b_rptr; 62611042SErik.Nordmark@Sun.COM in6_addr_t *nexthop6; 62711042SErik.Nordmark@Sun.COM 62811042SErik.Nordmark@Sun.COM nexthop6 = &((struct sockaddr_in6 *)dst_addr)->sin6_addr; 62911042SErik.Nordmark@Sun.COM if (!IN6_ARE_ADDR_EQUAL(nexthop6, &ip6h->ip6_dst)) { 63011042SErik.Nordmark@Sun.COM ixas.ixa_flags |= IXAF_NEXTHOP_SET; 63111042SErik.Nordmark@Sun.COM ixas.ixa_nexthop_v6 = *nexthop6; 63211042SErik.Nordmark@Sun.COM } 63311042SErik.Nordmark@Sun.COM ixas.ixa_multicast_ttl = ip6h->ip6_hops; 63411042SErik.Nordmark@Sun.COM } 63511042SErik.Nordmark@Sun.COM error = ip_output_simple(mp, &ixas); 63611042SErik.Nordmark@Sun.COM ixa_cleanup(&ixas); 63711042SErik.Nordmark@Sun.COM 63811042SErik.Nordmark@Sun.COM netstack_rele(ns); 63911042SErik.Nordmark@Sun.COM switch (error) { 64011042SErik.Nordmark@Sun.COM case 0: 6412535Ssangeeta break; 64211042SErik.Nordmark@Sun.COM 64311042SErik.Nordmark@Sun.COM case EHOSTUNREACH: 64411042SErik.Nordmark@Sun.COM case ENETUNREACH: 64511042SErik.Nordmark@Sun.COM error = ENONET; 64611042SErik.Nordmark@Sun.COM break; 64711042SErik.Nordmark@Sun.COM 64811042SErik.Nordmark@Sun.COM default: 64911042SErik.Nordmark@Sun.COM error = ECOMM; 6502535Ssangeeta break; 6512535Ssangeeta } 65211042SErik.Nordmark@Sun.COM return (error); 6534482Sdr146992 } 6544482Sdr146992 6552535Ssangeeta /* 6562535Ssangeeta * callback function provided by ire_ftable_lookup when calling 6572535Ssangeeta * rn_match_args(). Invoke ire_match_args on each matching leaf node in 6582535Ssangeeta * the radix tree. 6592535Ssangeeta */ 6602535Ssangeeta boolean_t 6612535Ssangeeta ire_find_best_route(struct radix_node *rn, void *arg) 6622535Ssangeeta { 6632535Ssangeeta struct rt_entry *rt = (struct rt_entry *)rn; 6642535Ssangeeta irb_t *irb_ptr; 6652535Ssangeeta ire_t *ire; 6662535Ssangeeta ire_ftable_args_t *margs = arg; 6672535Ssangeeta ipaddr_t match_mask; 6682535Ssangeeta 6692535Ssangeeta ASSERT(rt != NULL); 6702535Ssangeeta 6712535Ssangeeta irb_ptr = &rt->rt_irb; 6722535Ssangeeta 6732535Ssangeeta if (irb_ptr->irb_ire_cnt == 0) 6742535Ssangeeta return (B_FALSE); 6752535Ssangeeta 6762535Ssangeeta rw_enter(&irb_ptr->irb_lock, RW_READER); 6772535Ssangeeta for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { 67811042SErik.Nordmark@Sun.COM if (IRE_IS_CONDEMNED(ire)) 6792535Ssangeeta continue; 680*11681SSowmini.Varadhan@Sun.COM ASSERT((margs->ift_flags & MATCH_IRE_SHORTERMASK) == 0); 681*11681SSowmini.Varadhan@Sun.COM if (margs->ift_flags & MATCH_IRE_MASK) 6822535Ssangeeta match_mask = margs->ift_mask; 6832535Ssangeeta else 6842535Ssangeeta match_mask = ire->ire_mask; 6852535Ssangeeta 6862535Ssangeeta if (ire_match_args(ire, margs->ift_addr, match_mask, 68711042SErik.Nordmark@Sun.COM margs->ift_gateway, margs->ift_type, margs->ift_ill, 68811042SErik.Nordmark@Sun.COM margs->ift_zoneid, margs->ift_tsl, 68911042SErik.Nordmark@Sun.COM margs->ift_flags)) { 69011042SErik.Nordmark@Sun.COM ire_refhold(ire); 6912535Ssangeeta rw_exit(&irb_ptr->irb_lock); 6922535Ssangeeta margs->ift_best_ire = ire; 6932535Ssangeeta return (B_TRUE); 6942535Ssangeeta } 6952535Ssangeeta } 6962535Ssangeeta rw_exit(&irb_ptr->irb_lock); 6972535Ssangeeta return (B_FALSE); 6982535Ssangeeta } 6992535Ssangeeta 7002535Ssangeeta /* 7012535Ssangeeta * ftable irb_t structures are dynamically allocated, and we need to 7022535Ssangeeta * check if the irb_t (and associated ftable tree attachment) needs to 7032535Ssangeeta * be cleaned up when the irb_refcnt goes to 0. The conditions that need 7042535Ssangeeta * be verified are: 7052535Ssangeeta * - no other walkers of the irebucket, i.e., quiescent irb_refcnt, 7062535Ssangeeta * - no other threads holding references to ire's in the bucket, 7072535Ssangeeta * i.e., irb_nire == 0 7082535Ssangeeta * - no active ire's in the bucket, i.e., irb_ire_cnt == 0 7092535Ssangeeta * - need to hold the global tree lock and irb_lock in write mode. 7102535Ssangeeta */ 7112535Ssangeeta void 7122535Ssangeeta irb_refrele_ftable(irb_t *irb) 7132535Ssangeeta { 7142535Ssangeeta for (;;) { 7152535Ssangeeta rw_enter(&irb->irb_lock, RW_WRITER); 7162535Ssangeeta ASSERT(irb->irb_refcnt != 0); 7172535Ssangeeta if (irb->irb_refcnt != 1) { 7182535Ssangeeta /* 7192535Ssangeeta * Someone has a reference to this radix node 7202535Ssangeeta * or there is some bucket walker. 7212535Ssangeeta */ 7222535Ssangeeta irb->irb_refcnt--; 7232535Ssangeeta rw_exit(&irb->irb_lock); 7242535Ssangeeta return; 7252535Ssangeeta } else { 7262535Ssangeeta /* 7272535Ssangeeta * There is no other walker, nor is there any 7282535Ssangeeta * other thread that holds a direct ref to this 7292535Ssangeeta * radix node. Do the clean up if needed. Call 7302535Ssangeeta * to ire_unlink will clear the IRB_MARK_CONDEMNED flag 7312535Ssangeeta */ 7322535Ssangeeta if (irb->irb_marks & IRB_MARK_CONDEMNED) { 7332535Ssangeeta ire_t *ire_list; 7342535Ssangeeta 7352535Ssangeeta ire_list = ire_unlink(irb); 7362535Ssangeeta rw_exit(&irb->irb_lock); 7372535Ssangeeta 7382535Ssangeeta if (ire_list != NULL) 7392535Ssangeeta ire_cleanup(ire_list); 7402535Ssangeeta /* 7412535Ssangeeta * more CONDEMNED entries could have 7422535Ssangeeta * been added while we dropped the lock, 7432535Ssangeeta * so we have to re-check. 7442535Ssangeeta */ 7452535Ssangeeta continue; 7462535Ssangeeta } 7472535Ssangeeta 7482535Ssangeeta /* 7492535Ssangeeta * Now check if there are still any ires 7502535Ssangeeta * associated with this radix node. 7512535Ssangeeta */ 7522535Ssangeeta if (irb->irb_nire != 0) { 7532535Ssangeeta /* 7542535Ssangeeta * someone is still holding on 7552535Ssangeeta * to ires in this bucket 7562535Ssangeeta */ 7572535Ssangeeta irb->irb_refcnt--; 7582535Ssangeeta rw_exit(&irb->irb_lock); 7592535Ssangeeta return; 7602535Ssangeeta } else { 7612535Ssangeeta /* 7622535Ssangeeta * Everything is clear. Zero walkers, 7632535Ssangeeta * Zero threads with a ref to this 7642535Ssangeeta * radix node, Zero ires associated with 7652535Ssangeeta * this radix node. Due to lock order, 7662535Ssangeeta * check the above conditions again 7672535Ssangeeta * after grabbing all locks in the right order 7682535Ssangeeta */ 7692535Ssangeeta rw_exit(&irb->irb_lock); 7702535Ssangeeta if (irb_inactive(irb)) 7712535Ssangeeta return; 7722535Ssangeeta /* 7732535Ssangeeta * irb_inactive could not free the irb. 7742535Ssangeeta * See if there are any walkers, if not 7752535Ssangeeta * try to clean up again. 7762535Ssangeeta */ 7772535Ssangeeta } 7782535Ssangeeta } 7792535Ssangeeta } 7802535Ssangeeta } 7812535Ssangeeta 7822535Ssangeeta /* 78311042SErik.Nordmark@Sun.COM * IRE iterator used by ire_ftable_lookup to process multiple equal 78411042SErik.Nordmark@Sun.COM * routes. Given a starting point in the hash list (hash), walk the IREs 78511042SErik.Nordmark@Sun.COM * in the bucket skipping deleted entries. We treat the bucket as a circular 78611042SErik.Nordmark@Sun.COM * list for the purposes of walking it. 78711042SErik.Nordmark@Sun.COM * Returns the IRE (held) that corresponds to the hash value. If that IRE is 78811042SErik.Nordmark@Sun.COM * not applicable (ire_match_args failed) then it returns a subsequent one. 78911042SErik.Nordmark@Sun.COM * If we fail to find an IRE we return NULL. 79011042SErik.Nordmark@Sun.COM * 79111042SErik.Nordmark@Sun.COM * Assumes that the caller holds a reference on the IRE bucket and a read lock 79211042SErik.Nordmark@Sun.COM * on the radix_node_head (for IPv4) or the ip6_ire_head (for IPv6). 7932535Ssangeeta * 79411042SErik.Nordmark@Sun.COM * Applies to IPv4 and IPv6. 79511042SErik.Nordmark@Sun.COM * 79611042SErik.Nordmark@Sun.COM * For CGTP, where an IRE_BROADCAST and IRE_HOST can exist for the same 79711042SErik.Nordmark@Sun.COM * address and bucket, we compare against ire_type for the orig_ire. We also 79811042SErik.Nordmark@Sun.COM * have IRE_BROADCASTs with and without RTF_MULTIRT, with the former being 79911131SErik.Nordmark@Sun.COM * first in the bucket. Thus we compare that RTF_MULTIRT match the orig_ire. 80011042SErik.Nordmark@Sun.COM * 80111042SErik.Nordmark@Sun.COM * Due to shared-IP zones we check that an IRE_OFFLINK has a gateway that is 80211042SErik.Nordmark@Sun.COM * reachable from the zone i.e., that the ire_gateway_addr is in a subnet 80311042SErik.Nordmark@Sun.COM * in which the zone has an IP address. We check this for the global zone 80411042SErik.Nordmark@Sun.COM * even if no shared-IP zones are configured. 8052535Ssangeeta */ 8062535Ssangeeta ire_t * 80711042SErik.Nordmark@Sun.COM ire_round_robin(irb_t *irb_ptr, ire_ftable_args_t *margs, uint_t hash, 80811042SErik.Nordmark@Sun.COM ire_t *orig_ire, ip_stack_t *ipst) 8092535Ssangeeta { 81011042SErik.Nordmark@Sun.COM ire_t *ire, *maybe_ire = NULL; 81111042SErik.Nordmark@Sun.COM uint_t maybe_badcnt; 81211042SErik.Nordmark@Sun.COM uint_t maxwalk; 81311042SErik.Nordmark@Sun.COM 81411042SErik.Nordmark@Sun.COM /* Fold in more bits from the hint/hash */ 81511042SErik.Nordmark@Sun.COM hash = hash ^ (hash >> 8) ^ (hash >> 16); 8162535Ssangeeta 8172535Ssangeeta rw_enter(&irb_ptr->irb_lock, RW_WRITER); 81811042SErik.Nordmark@Sun.COM maxwalk = irb_ptr->irb_ire_cnt; /* Excludes condemned */ 81911042SErik.Nordmark@Sun.COM hash %= maxwalk; 82011042SErik.Nordmark@Sun.COM irb_refhold_locked(irb_ptr); 8212535Ssangeeta rw_exit(&irb_ptr->irb_lock); 8222535Ssangeeta 8232535Ssangeeta /* 8242535Ssangeeta * Round-robin the routers list looking for a route that 8252535Ssangeeta * matches the passed in parameters. 82611042SErik.Nordmark@Sun.COM * First we skip "hash" number of non-condemned IREs. 82711042SErik.Nordmark@Sun.COM * Then we match the IRE. 82811042SErik.Nordmark@Sun.COM * If we find an ire which has a non-zero ire_badcnt then we remember 82911042SErik.Nordmark@Sun.COM * it and keep on looking for a lower ire_badcnt. 83011042SErik.Nordmark@Sun.COM * If we come to the end of the list we continue (treat the 83111042SErik.Nordmark@Sun.COM * bucket list as a circular list) but we match less than "max" 83211042SErik.Nordmark@Sun.COM * entries. 8332535Ssangeeta */ 83411042SErik.Nordmark@Sun.COM ire = irb_ptr->irb_ire; 83511042SErik.Nordmark@Sun.COM while (maxwalk > 0) { 83611042SErik.Nordmark@Sun.COM if (IRE_IS_CONDEMNED(ire)) 83711042SErik.Nordmark@Sun.COM goto next_ire_skip; 8382535Ssangeeta 83911042SErik.Nordmark@Sun.COM /* Skip the first "hash" entries to do ECMP */ 84011042SErik.Nordmark@Sun.COM if (hash != 0) { 84111042SErik.Nordmark@Sun.COM hash--; 84211042SErik.Nordmark@Sun.COM goto next_ire_skip; 84311042SErik.Nordmark@Sun.COM } 84411042SErik.Nordmark@Sun.COM 84511042SErik.Nordmark@Sun.COM /* See CGTP comment above */ 84611042SErik.Nordmark@Sun.COM if (ire->ire_type != orig_ire->ire_type || 84711131SErik.Nordmark@Sun.COM ((ire->ire_flags ^ orig_ire->ire_flags) & RTF_MULTIRT) != 0) 8482535Ssangeeta goto next_ire; 8492535Ssangeeta 85011042SErik.Nordmark@Sun.COM /* 85111042SErik.Nordmark@Sun.COM * Note: Since IPv6 has hash buckets instead of radix 85211042SErik.Nordmark@Sun.COM * buckers we need to explicitly compare the addresses. 85311042SErik.Nordmark@Sun.COM * That makes this less efficient since we will be called 85411042SErik.Nordmark@Sun.COM * even if there is no alternatives just because the 85511042SErik.Nordmark@Sun.COM * bucket has multiple IREs for different addresses. 85611042SErik.Nordmark@Sun.COM */ 85711042SErik.Nordmark@Sun.COM if (ire->ire_ipversion == IPV6_VERSION) { 85811042SErik.Nordmark@Sun.COM if (!IN6_ARE_ADDR_EQUAL(&orig_ire->ire_addr_v6, 85911042SErik.Nordmark@Sun.COM &ire->ire_addr_v6)) 86011042SErik.Nordmark@Sun.COM goto next_ire; 86111042SErik.Nordmark@Sun.COM } 86211042SErik.Nordmark@Sun.COM 86311042SErik.Nordmark@Sun.COM /* 86411042SErik.Nordmark@Sun.COM * For some reason find_best_route uses ire_mask. We do 86511042SErik.Nordmark@Sun.COM * the same. 86611042SErik.Nordmark@Sun.COM */ 86711042SErik.Nordmark@Sun.COM if (ire->ire_ipversion == IPV4_VERSION ? 86811042SErik.Nordmark@Sun.COM !ire_match_args(ire, margs->ift_addr, 86911042SErik.Nordmark@Sun.COM ire->ire_mask, margs->ift_gateway, 87011042SErik.Nordmark@Sun.COM margs->ift_type, margs->ift_ill, margs->ift_zoneid, 87111042SErik.Nordmark@Sun.COM margs->ift_tsl, margs->ift_flags) : 87211042SErik.Nordmark@Sun.COM !ire_match_args_v6(ire, &margs->ift_addr_v6, 87311042SErik.Nordmark@Sun.COM &ire->ire_mask_v6, &margs->ift_gateway_v6, 87411042SErik.Nordmark@Sun.COM margs->ift_type, margs->ift_ill, margs->ift_zoneid, 87511042SErik.Nordmark@Sun.COM margs->ift_tsl, margs->ift_flags)) 8762535Ssangeeta goto next_ire; 8772535Ssangeeta 87811042SErik.Nordmark@Sun.COM if (margs->ift_zoneid != ALL_ZONES && 87911042SErik.Nordmark@Sun.COM (ire->ire_type & IRE_OFFLINK)) { 8802535Ssangeeta /* 88111042SErik.Nordmark@Sun.COM * When we're in a zone, we're only 88211042SErik.Nordmark@Sun.COM * interested in routers that are 88311042SErik.Nordmark@Sun.COM * reachable through ipifs within our zone. 8842535Ssangeeta */ 88511042SErik.Nordmark@Sun.COM if (ire->ire_ipversion == IPV4_VERSION) { 88611042SErik.Nordmark@Sun.COM if (!ire_gateway_ok_zone_v4( 88711042SErik.Nordmark@Sun.COM ire->ire_gateway_addr, margs->ift_zoneid, 88811042SErik.Nordmark@Sun.COM ire->ire_ill, margs->ift_tsl, ipst, 88911042SErik.Nordmark@Sun.COM B_TRUE)) 89011042SErik.Nordmark@Sun.COM goto next_ire; 89111042SErik.Nordmark@Sun.COM } else { 89211042SErik.Nordmark@Sun.COM if (!ire_gateway_ok_zone_v6( 89311042SErik.Nordmark@Sun.COM &ire->ire_gateway_addr_v6, 89411042SErik.Nordmark@Sun.COM margs->ift_zoneid, ire->ire_ill, 89511042SErik.Nordmark@Sun.COM margs->ift_tsl, ipst, B_TRUE)) 89611042SErik.Nordmark@Sun.COM goto next_ire; 89711042SErik.Nordmark@Sun.COM } 8982535Ssangeeta } 89911042SErik.Nordmark@Sun.COM mutex_enter(&ire->ire_lock); 90011042SErik.Nordmark@Sun.COM /* Look for stale ire_badcnt and clear */ 90111042SErik.Nordmark@Sun.COM if (ire->ire_badcnt != 0 && 90211066Srafael.vanoni@sun.com (TICK_TO_SEC(ddi_get_lbolt64()) - ire->ire_last_badcnt > 90311042SErik.Nordmark@Sun.COM ipst->ips_ip_ire_badcnt_lifetime)) 90411042SErik.Nordmark@Sun.COM ire->ire_badcnt = 0; 90511042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 9062535Ssangeeta 90711042SErik.Nordmark@Sun.COM if (ire->ire_badcnt == 0) { 90811042SErik.Nordmark@Sun.COM /* We found one with a zero badcnt; done */ 90911042SErik.Nordmark@Sun.COM ire_refhold(ire); 91011042SErik.Nordmark@Sun.COM /* 91111042SErik.Nordmark@Sun.COM * Care needed since irb_refrele grabs WLOCK to free 91211042SErik.Nordmark@Sun.COM * the irb_t. 91311042SErik.Nordmark@Sun.COM */ 91411042SErik.Nordmark@Sun.COM if (ire->ire_ipversion == IPV4_VERSION) { 91511042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 91611042SErik.Nordmark@Sun.COM irb_refrele(irb_ptr); 91711042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 91811042SErik.Nordmark@Sun.COM } else { 91911042SErik.Nordmark@Sun.COM rw_exit(&ipst->ips_ip6_ire_head_lock); 92011042SErik.Nordmark@Sun.COM irb_refrele(irb_ptr); 92111042SErik.Nordmark@Sun.COM rw_enter(&ipst->ips_ip6_ire_head_lock, 92211042SErik.Nordmark@Sun.COM RW_READER); 92311042SErik.Nordmark@Sun.COM } 9242535Ssangeeta return (ire); 9252535Ssangeeta } 9262535Ssangeeta /* 92711042SErik.Nordmark@Sun.COM * keep looking to see if there is a better (lower 92811042SErik.Nordmark@Sun.COM * badcnt) matching IRE, but save this one as a last resort. 92911042SErik.Nordmark@Sun.COM * If we find a lower badcnt pick that one as the last* resort. 9302535Ssangeeta */ 93111042SErik.Nordmark@Sun.COM if (maybe_ire == NULL) { 93211042SErik.Nordmark@Sun.COM maybe_ire = ire; 93311042SErik.Nordmark@Sun.COM maybe_badcnt = ire->ire_badcnt; 93411042SErik.Nordmark@Sun.COM } else if (ire->ire_badcnt < maybe_badcnt) { 93511042SErik.Nordmark@Sun.COM maybe_ire = ire; 93611042SErik.Nordmark@Sun.COM maybe_badcnt = ire->ire_badcnt; 93711042SErik.Nordmark@Sun.COM } 9388485SPeter.Memishian@Sun.COM 9392535Ssangeeta next_ire: 94011042SErik.Nordmark@Sun.COM maxwalk--; 94111042SErik.Nordmark@Sun.COM next_ire_skip: 94211042SErik.Nordmark@Sun.COM ire = ire->ire_next; 94311042SErik.Nordmark@Sun.COM if (ire == NULL) 94411042SErik.Nordmark@Sun.COM ire = irb_ptr->irb_ire; 9452535Ssangeeta } 9462535Ssangeeta if (maybe_ire != NULL) 94711042SErik.Nordmark@Sun.COM ire_refhold(maybe_ire); 94811042SErik.Nordmark@Sun.COM 94911042SErik.Nordmark@Sun.COM /* Care needed since irb_refrele grabs WLOCK to free the irb_t. */ 95011042SErik.Nordmark@Sun.COM if (ire->ire_ipversion == IPV4_VERSION) { 95111042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 95211042SErik.Nordmark@Sun.COM irb_refrele(irb_ptr); 95311042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 95411042SErik.Nordmark@Sun.COM } else { 95511042SErik.Nordmark@Sun.COM rw_exit(&ipst->ips_ip6_ire_head_lock); 95611042SErik.Nordmark@Sun.COM irb_refrele(irb_ptr); 95711042SErik.Nordmark@Sun.COM rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER); 95811042SErik.Nordmark@Sun.COM } 9592535Ssangeeta return (maybe_ire); 9602535Ssangeeta } 9612783Ssowmini 9622783Ssowmini void 9632783Ssowmini irb_refhold_rn(struct radix_node *rn) 9642783Ssowmini { 9652783Ssowmini if ((rn->rn_flags & RNF_ROOT) == 0) 96611042SErik.Nordmark@Sun.COM irb_refhold(&((rt_t *)(rn))->rt_irb); 9672783Ssowmini } 9682783Ssowmini 9692783Ssowmini void 9702783Ssowmini irb_refrele_rn(struct radix_node *rn) 9712783Ssowmini { 9722783Ssowmini if ((rn->rn_flags & RNF_ROOT) == 0) 9732783Ssowmini irb_refrele_ftable(&((rt_t *)(rn))->rt_irb); 9742783Ssowmini } 97511042SErik.Nordmark@Sun.COM 976*11681SSowmini.Varadhan@Sun.COM 977*11681SSowmini.Varadhan@Sun.COM /* 978*11681SSowmini.Varadhan@Sun.COM * ip_select_src_ill() is used by ip_select_route() to find the src_ill 979*11681SSowmini.Varadhan@Sun.COM * to be used for source-aware routing table lookup. This function will 980*11681SSowmini.Varadhan@Sun.COM * ignore IPIF_UNNUMBERED interface addresses, and will only return a 981*11681SSowmini.Varadhan@Sun.COM * numbered interface (ipif_lookup_addr_nondup() will ignore UNNUMBERED 982*11681SSowmini.Varadhan@Sun.COM * interfaces). 983*11681SSowmini.Varadhan@Sun.COM */ 984*11681SSowmini.Varadhan@Sun.COM static ill_t * 985*11681SSowmini.Varadhan@Sun.COM ip_select_src_ill(const in6_addr_t *v6src, zoneid_t zoneid, ip_stack_t *ipst) 986*11681SSowmini.Varadhan@Sun.COM { 987*11681SSowmini.Varadhan@Sun.COM ipif_t *ipif; 988*11681SSowmini.Varadhan@Sun.COM ill_t *ill; 989*11681SSowmini.Varadhan@Sun.COM boolean_t isv6 = !IN6_IS_ADDR_V4MAPPED(v6src); 990*11681SSowmini.Varadhan@Sun.COM ipaddr_t v4src; 991*11681SSowmini.Varadhan@Sun.COM 992*11681SSowmini.Varadhan@Sun.COM if (isv6) { 993*11681SSowmini.Varadhan@Sun.COM ipif = ipif_lookup_addr_nondup_v6(v6src, NULL, zoneid, ipst); 994*11681SSowmini.Varadhan@Sun.COM } else { 995*11681SSowmini.Varadhan@Sun.COM IN6_V4MAPPED_TO_IPADDR(v6src, v4src); 996*11681SSowmini.Varadhan@Sun.COM ipif = ipif_lookup_addr_nondup(v4src, NULL, zoneid, ipst); 997*11681SSowmini.Varadhan@Sun.COM } 998*11681SSowmini.Varadhan@Sun.COM if (ipif == NULL) 999*11681SSowmini.Varadhan@Sun.COM return (NULL); 1000*11681SSowmini.Varadhan@Sun.COM ill = ipif->ipif_ill; 1001*11681SSowmini.Varadhan@Sun.COM ill_refhold(ill); 1002*11681SSowmini.Varadhan@Sun.COM ipif_refrele(ipif); 1003*11681SSowmini.Varadhan@Sun.COM return (ill); 1004*11681SSowmini.Varadhan@Sun.COM } 1005*11681SSowmini.Varadhan@Sun.COM 1006*11681SSowmini.Varadhan@Sun.COM /* 1007*11681SSowmini.Varadhan@Sun.COM * verify that v6src is configured on ill 1008*11681SSowmini.Varadhan@Sun.COM */ 1009*11681SSowmini.Varadhan@Sun.COM static boolean_t 1010*11681SSowmini.Varadhan@Sun.COM ip_verify_src_on_ill(const in6_addr_t v6src, ill_t *ill, zoneid_t zoneid) 1011*11681SSowmini.Varadhan@Sun.COM { 1012*11681SSowmini.Varadhan@Sun.COM ipif_t *ipif; 1013*11681SSowmini.Varadhan@Sun.COM ip_stack_t *ipst; 1014*11681SSowmini.Varadhan@Sun.COM ipaddr_t v4src; 1015*11681SSowmini.Varadhan@Sun.COM 1016*11681SSowmini.Varadhan@Sun.COM if (ill == NULL) 1017*11681SSowmini.Varadhan@Sun.COM return (B_FALSE); 1018*11681SSowmini.Varadhan@Sun.COM ipst = ill->ill_ipst; 1019*11681SSowmini.Varadhan@Sun.COM 1020*11681SSowmini.Varadhan@Sun.COM if (ill->ill_isv6) { 1021*11681SSowmini.Varadhan@Sun.COM ipif = ipif_lookup_addr_nondup_v6(&v6src, ill, zoneid, ipst); 1022*11681SSowmini.Varadhan@Sun.COM } else { 1023*11681SSowmini.Varadhan@Sun.COM IN6_V4MAPPED_TO_IPADDR(&v6src, v4src); 1024*11681SSowmini.Varadhan@Sun.COM ipif = ipif_lookup_addr_nondup(v4src, ill, zoneid, ipst); 1025*11681SSowmini.Varadhan@Sun.COM } 1026*11681SSowmini.Varadhan@Sun.COM 1027*11681SSowmini.Varadhan@Sun.COM if (ipif != NULL) { 1028*11681SSowmini.Varadhan@Sun.COM ipif_refrele(ipif); 1029*11681SSowmini.Varadhan@Sun.COM return (B_TRUE); 1030*11681SSowmini.Varadhan@Sun.COM } else { 1031*11681SSowmini.Varadhan@Sun.COM return (B_FALSE); 1032*11681SSowmini.Varadhan@Sun.COM } 1033*11681SSowmini.Varadhan@Sun.COM } 1034*11681SSowmini.Varadhan@Sun.COM 103511042SErik.Nordmark@Sun.COM /* 103611042SErik.Nordmark@Sun.COM * Select a route for IPv4 and IPv6. Except for multicast, loopback and reject 103711042SErik.Nordmark@Sun.COM * routes this routine sets up a ire_nce_cache as well. The caller needs to 103811042SErik.Nordmark@Sun.COM * lookup an nce for the multicast case. 1039*11681SSowmini.Varadhan@Sun.COM * 1040*11681SSowmini.Varadhan@Sun.COM * When src_multihoming is set to 2 (strict src multihoming) we use the source 1041*11681SSowmini.Varadhan@Sun.COM * address to select the interface and route. If IP_BOUND_IF etc are 1042*11681SSowmini.Varadhan@Sun.COM * specified, we require that they specify an interface on which the 1043*11681SSowmini.Varadhan@Sun.COM * source address is assigned. 1044*11681SSowmini.Varadhan@Sun.COM * 1045*11681SSowmini.Varadhan@Sun.COM * When src_multihoming is set to 1 (preferred src aware route 1046*11681SSowmini.Varadhan@Sun.COM * selection) the unicast lookup prefers a matching source 1047*11681SSowmini.Varadhan@Sun.COM * (i.e., that the route points out an ill on which the source is assigned), but 1048*11681SSowmini.Varadhan@Sun.COM * if no such route is found we fallback to not considering the source in the 1049*11681SSowmini.Varadhan@Sun.COM * route lookup. 1050*11681SSowmini.Varadhan@Sun.COM * 1051*11681SSowmini.Varadhan@Sun.COM * We skip the src_multihoming check when the source isn't (yet) set, and 1052*11681SSowmini.Varadhan@Sun.COM * when IXAF_VERIFY_SOURCE is not set. The latter allows RAW sockets to send 1053*11681SSowmini.Varadhan@Sun.COM * with bogus source addresses as allowed by IP_HDRINCL and IPV6_PKTINFO 1054*11681SSowmini.Varadhan@Sun.COM * when secpolicy_net_rawaccess(). 105511042SErik.Nordmark@Sun.COM */ 105611042SErik.Nordmark@Sun.COM ire_t * 1057*11681SSowmini.Varadhan@Sun.COM ip_select_route(const in6_addr_t *v6dst, const in6_addr_t v6src, 1058*11681SSowmini.Varadhan@Sun.COM ip_xmit_attr_t *ixa, uint_t *generationp, in6_addr_t *setsrcp, 1059*11681SSowmini.Varadhan@Sun.COM int *errorp, boolean_t *multirtp) 106011042SErik.Nordmark@Sun.COM { 106111042SErik.Nordmark@Sun.COM uint_t match_args; 106211042SErik.Nordmark@Sun.COM uint_t ire_type; 1063*11681SSowmini.Varadhan@Sun.COM ill_t *ill = NULL; 106411042SErik.Nordmark@Sun.COM ire_t *ire; 106511042SErik.Nordmark@Sun.COM ip_stack_t *ipst = ixa->ixa_ipst; 106611042SErik.Nordmark@Sun.COM ipaddr_t v4dst; 106711042SErik.Nordmark@Sun.COM in6_addr_t v6nexthop; 106811042SErik.Nordmark@Sun.COM iaflags_t ixaflags = ixa->ixa_flags; 106911042SErik.Nordmark@Sun.COM nce_t *nce; 1070*11681SSowmini.Varadhan@Sun.COM boolean_t preferred_src_aware = B_FALSE; 1071*11681SSowmini.Varadhan@Sun.COM boolean_t verify_src; 1072*11681SSowmini.Varadhan@Sun.COM boolean_t isv6 = !(ixa->ixa_flags & IXAF_IS_IPV4); 1073*11681SSowmini.Varadhan@Sun.COM int src_multihoming = IP_SRC_MULTIHOMING(isv6, ipst); 1074*11681SSowmini.Varadhan@Sun.COM 1075*11681SSowmini.Varadhan@Sun.COM /* 1076*11681SSowmini.Varadhan@Sun.COM * We only verify that the src has been configured on a selected 1077*11681SSowmini.Varadhan@Sun.COM * interface if the src is not :: or INADDR_ANY, and if the 1078*11681SSowmini.Varadhan@Sun.COM * IXAF_VERIFY_SOURCE flag is set. 1079*11681SSowmini.Varadhan@Sun.COM */ 1080*11681SSowmini.Varadhan@Sun.COM verify_src = (!V6_OR_V4_INADDR_ANY(v6src) && 1081*11681SSowmini.Varadhan@Sun.COM (ixa->ixa_flags & IXAF_VERIFY_SOURCE)); 108211042SErik.Nordmark@Sun.COM 108311042SErik.Nordmark@Sun.COM match_args = MATCH_IRE_SECATTR; 108411042SErik.Nordmark@Sun.COM IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst); 108511042SErik.Nordmark@Sun.COM if (setsrcp != NULL) 108611042SErik.Nordmark@Sun.COM ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp)); 108711042SErik.Nordmark@Sun.COM if (errorp != NULL) 108811042SErik.Nordmark@Sun.COM ASSERT(*errorp == 0); 108911042SErik.Nordmark@Sun.COM 109011042SErik.Nordmark@Sun.COM /* 109111042SErik.Nordmark@Sun.COM * The content of the ixa will be different if IP_NEXTHOP, 109211042SErik.Nordmark@Sun.COM * SO_DONTROUTE, IP_BOUND_IF, IP_PKTINFO etc are set 109311042SErik.Nordmark@Sun.COM */ 109411042SErik.Nordmark@Sun.COM 1095*11681SSowmini.Varadhan@Sun.COM if (isv6 ? IN6_IS_ADDR_MULTICAST(v6dst) : CLASSD(v4dst)) { 109611042SErik.Nordmark@Sun.COM /* Pick up the IRE_MULTICAST for the ill */ 109711042SErik.Nordmark@Sun.COM if (ixa->ixa_multicast_ifindex != 0) { 109811042SErik.Nordmark@Sun.COM ill = ill_lookup_on_ifindex(ixa->ixa_multicast_ifindex, 1099*11681SSowmini.Varadhan@Sun.COM isv6, ipst); 110011042SErik.Nordmark@Sun.COM } else if (ixaflags & IXAF_SCOPEID_SET) { 110111042SErik.Nordmark@Sun.COM /* sin6_scope_id takes precedence over ixa_ifindex */ 110211042SErik.Nordmark@Sun.COM ASSERT(ixa->ixa_scopeid != 0); 110311042SErik.Nordmark@Sun.COM ill = ill_lookup_on_ifindex(ixa->ixa_scopeid, 1104*11681SSowmini.Varadhan@Sun.COM isv6, ipst); 110511042SErik.Nordmark@Sun.COM } else if (ixa->ixa_ifindex != 0) { 110611042SErik.Nordmark@Sun.COM /* 110711042SErik.Nordmark@Sun.COM * In the ipmp case, the ixa_ifindex is set to 110811042SErik.Nordmark@Sun.COM * point at an under_ill and we would return the 110911042SErik.Nordmark@Sun.COM * ire_multicast() corresponding to that under_ill. 111011042SErik.Nordmark@Sun.COM */ 111111042SErik.Nordmark@Sun.COM ill = ill_lookup_on_ifindex(ixa->ixa_ifindex, 1112*11681SSowmini.Varadhan@Sun.COM isv6, ipst); 1113*11681SSowmini.Varadhan@Sun.COM } else if (src_multihoming != 0 && verify_src) { 1114*11681SSowmini.Varadhan@Sun.COM /* Look up the ill based on the source address */ 1115*11681SSowmini.Varadhan@Sun.COM ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst); 1116*11681SSowmini.Varadhan@Sun.COM /* 1117*11681SSowmini.Varadhan@Sun.COM * Since we looked up the ill from the source there 1118*11681SSowmini.Varadhan@Sun.COM * is no need to verify that the source is on the ill 1119*11681SSowmini.Varadhan@Sun.COM * below. 1120*11681SSowmini.Varadhan@Sun.COM */ 1121*11681SSowmini.Varadhan@Sun.COM verify_src = B_FALSE; 1122*11681SSowmini.Varadhan@Sun.COM if (ill != NULL && IS_VNI(ill)) { 1123*11681SSowmini.Varadhan@Sun.COM ill_t *usesrc = ill; 1124*11681SSowmini.Varadhan@Sun.COM 1125*11681SSowmini.Varadhan@Sun.COM ill = ill_lookup_usesrc(usesrc); 1126*11681SSowmini.Varadhan@Sun.COM ill_refrele(usesrc); 1127*11681SSowmini.Varadhan@Sun.COM } 1128*11681SSowmini.Varadhan@Sun.COM } else if (!isv6) { 112911042SErik.Nordmark@Sun.COM ipaddr_t v4setsrc = INADDR_ANY; 113011042SErik.Nordmark@Sun.COM 1131*11681SSowmini.Varadhan@Sun.COM ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid, 1132*11681SSowmini.Varadhan@Sun.COM ipst, multirtp, &v4setsrc); 113311042SErik.Nordmark@Sun.COM if (setsrcp != NULL) 113411042SErik.Nordmark@Sun.COM IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp); 113511042SErik.Nordmark@Sun.COM } else { 1136*11681SSowmini.Varadhan@Sun.COM ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid, 1137*11681SSowmini.Varadhan@Sun.COM ipst, multirtp, setsrcp); 113811042SErik.Nordmark@Sun.COM } 113911042SErik.Nordmark@Sun.COM if (ill != NULL && IS_VNI(ill)) { 114011042SErik.Nordmark@Sun.COM ill_refrele(ill); 114111042SErik.Nordmark@Sun.COM ill = NULL; 114211042SErik.Nordmark@Sun.COM } 114311042SErik.Nordmark@Sun.COM if (ill == NULL) { 114411042SErik.Nordmark@Sun.COM if (errorp != NULL) 114511042SErik.Nordmark@Sun.COM *errorp = ENXIO; 114611042SErik.Nordmark@Sun.COM /* Get a hold on the IRE_NOROUTE */ 1147*11681SSowmini.Varadhan@Sun.COM ire = ire_reject(ipst, isv6); 114811042SErik.Nordmark@Sun.COM return (ire); 114911042SErik.Nordmark@Sun.COM } 115011042SErik.Nordmark@Sun.COM if (!(ill->ill_flags & ILLF_MULTICAST)) { 115111042SErik.Nordmark@Sun.COM ill_refrele(ill); 115211042SErik.Nordmark@Sun.COM if (errorp != NULL) 115311042SErik.Nordmark@Sun.COM *errorp = EHOSTUNREACH; 115411042SErik.Nordmark@Sun.COM /* Get a hold on the IRE_NOROUTE */ 1155*11681SSowmini.Varadhan@Sun.COM ire = ire_reject(ipst, isv6); 1156*11681SSowmini.Varadhan@Sun.COM return (ire); 1157*11681SSowmini.Varadhan@Sun.COM } 1158*11681SSowmini.Varadhan@Sun.COM /* 1159*11681SSowmini.Varadhan@Sun.COM * If we are doing the strictest src_multihoming, then 1160*11681SSowmini.Varadhan@Sun.COM * we check that IP_MULTICAST_IF, IP_BOUND_IF, etc specify 1161*11681SSowmini.Varadhan@Sun.COM * an interface that is consistent with the source address. 1162*11681SSowmini.Varadhan@Sun.COM */ 1163*11681SSowmini.Varadhan@Sun.COM if (verify_src && src_multihoming == 2 && 1164*11681SSowmini.Varadhan@Sun.COM !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) { 1165*11681SSowmini.Varadhan@Sun.COM if (errorp != NULL) 1166*11681SSowmini.Varadhan@Sun.COM *errorp = EADDRNOTAVAIL; 1167*11681SSowmini.Varadhan@Sun.COM ill_refrele(ill); 1168*11681SSowmini.Varadhan@Sun.COM /* Get a hold on the IRE_NOROUTE */ 1169*11681SSowmini.Varadhan@Sun.COM ire = ire_reject(ipst, isv6); 117011042SErik.Nordmark@Sun.COM return (ire); 117111042SErik.Nordmark@Sun.COM } 117211042SErik.Nordmark@Sun.COM /* Get a refcnt on the single IRE_MULTICAST per ill */ 117311042SErik.Nordmark@Sun.COM ire = ire_multicast(ill); 117411042SErik.Nordmark@Sun.COM ill_refrele(ill); 117511042SErik.Nordmark@Sun.COM if (generationp != NULL) 117611042SErik.Nordmark@Sun.COM *generationp = ire->ire_generation; 117711042SErik.Nordmark@Sun.COM if (errorp != NULL && 117811042SErik.Nordmark@Sun.COM (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) { 117911042SErik.Nordmark@Sun.COM *errorp = EHOSTUNREACH; 118011042SErik.Nordmark@Sun.COM } 118111042SErik.Nordmark@Sun.COM return (ire); 118211042SErik.Nordmark@Sun.COM } 118311042SErik.Nordmark@Sun.COM 1184*11681SSowmini.Varadhan@Sun.COM /* Now for unicast */ 118511042SErik.Nordmark@Sun.COM if (ixa->ixa_ifindex != 0 || (ixaflags & IXAF_SCOPEID_SET)) { 118611042SErik.Nordmark@Sun.COM if (ixaflags & IXAF_SCOPEID_SET) { 118711042SErik.Nordmark@Sun.COM /* sin6_scope_id takes precedence over ixa_ifindex */ 118811042SErik.Nordmark@Sun.COM ASSERT(ixa->ixa_scopeid != 0); 118911042SErik.Nordmark@Sun.COM ill = ill_lookup_on_ifindex(ixa->ixa_scopeid, 1190*11681SSowmini.Varadhan@Sun.COM isv6, ipst); 119111042SErik.Nordmark@Sun.COM } else { 119211042SErik.Nordmark@Sun.COM ASSERT(ixa->ixa_ifindex != 0); 119311042SErik.Nordmark@Sun.COM ill = ill_lookup_on_ifindex(ixa->ixa_ifindex, 1194*11681SSowmini.Varadhan@Sun.COM isv6, ipst); 119511042SErik.Nordmark@Sun.COM } 119611042SErik.Nordmark@Sun.COM if (ill != NULL && IS_VNI(ill)) { 119711042SErik.Nordmark@Sun.COM ill_refrele(ill); 119811042SErik.Nordmark@Sun.COM ill = NULL; 119911042SErik.Nordmark@Sun.COM } 120011042SErik.Nordmark@Sun.COM if (ill == NULL) { 120111042SErik.Nordmark@Sun.COM if (errorp != NULL) 120211042SErik.Nordmark@Sun.COM *errorp = ENXIO; 120311042SErik.Nordmark@Sun.COM /* Get a hold on the IRE_NOROUTE */ 1204*11681SSowmini.Varadhan@Sun.COM ire = ire_reject(ipst, isv6); 120511042SErik.Nordmark@Sun.COM return (ire); 120611042SErik.Nordmark@Sun.COM } 1207*11681SSowmini.Varadhan@Sun.COM 1208*11681SSowmini.Varadhan@Sun.COM match_args |= MATCH_IRE_ILL; 1209*11681SSowmini.Varadhan@Sun.COM 121011042SErik.Nordmark@Sun.COM /* 121111042SErik.Nordmark@Sun.COM * icmp_send_reply_v6 uses scopeid, and mpathd sets IP*_BOUND_IF 121211042SErik.Nordmark@Sun.COM * so for both of them we need to be able look for an under 121311042SErik.Nordmark@Sun.COM * interface. 121411042SErik.Nordmark@Sun.COM */ 121511042SErik.Nordmark@Sun.COM if (IS_UNDER_IPMP(ill)) 121611042SErik.Nordmark@Sun.COM match_args |= MATCH_IRE_TESTHIDDEN; 1217*11681SSowmini.Varadhan@Sun.COM 1218*11681SSowmini.Varadhan@Sun.COM /* 1219*11681SSowmini.Varadhan@Sun.COM * If we are doing the strictest src_multihoming, then 1220*11681SSowmini.Varadhan@Sun.COM * we check that IP_BOUND_IF, IP_PKTINFO, etc specify 1221*11681SSowmini.Varadhan@Sun.COM * an interface that is consistent with the source address. 1222*11681SSowmini.Varadhan@Sun.COM */ 1223*11681SSowmini.Varadhan@Sun.COM if (src_multihoming == 2 && 1224*11681SSowmini.Varadhan@Sun.COM !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) { 1225*11681SSowmini.Varadhan@Sun.COM if (errorp != NULL) 1226*11681SSowmini.Varadhan@Sun.COM *errorp = EADDRNOTAVAIL; 1227*11681SSowmini.Varadhan@Sun.COM ill_refrele(ill); 1228*11681SSowmini.Varadhan@Sun.COM /* Get a hold on the IRE_NOROUTE */ 1229*11681SSowmini.Varadhan@Sun.COM ire = ire_reject(ipst, isv6); 1230*11681SSowmini.Varadhan@Sun.COM return (ire); 1231*11681SSowmini.Varadhan@Sun.COM } 1232*11681SSowmini.Varadhan@Sun.COM } else if (src_multihoming != 0 && verify_src) { 1233*11681SSowmini.Varadhan@Sun.COM /* Look up the ill based on the source address */ 1234*11681SSowmini.Varadhan@Sun.COM ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst); 1235*11681SSowmini.Varadhan@Sun.COM if (ill == NULL) { 1236*11681SSowmini.Varadhan@Sun.COM char addrbuf[INET6_ADDRSTRLEN]; 1237*11681SSowmini.Varadhan@Sun.COM 1238*11681SSowmini.Varadhan@Sun.COM ip3dbg(("%s not a valid src for unicast", 1239*11681SSowmini.Varadhan@Sun.COM inet_ntop(AF_INET6, &v6src, addrbuf, 1240*11681SSowmini.Varadhan@Sun.COM sizeof (addrbuf)))); 1241*11681SSowmini.Varadhan@Sun.COM if (errorp != NULL) 1242*11681SSowmini.Varadhan@Sun.COM *errorp = EADDRNOTAVAIL; 1243*11681SSowmini.Varadhan@Sun.COM /* Get a hold on the IRE_NOROUTE */ 1244*11681SSowmini.Varadhan@Sun.COM ire = ire_reject(ipst, isv6); 1245*11681SSowmini.Varadhan@Sun.COM return (ire); 1246*11681SSowmini.Varadhan@Sun.COM } 1247*11681SSowmini.Varadhan@Sun.COM match_args |= MATCH_IRE_SRC_ILL; 1248*11681SSowmini.Varadhan@Sun.COM preferred_src_aware = (src_multihoming == 1); 124911042SErik.Nordmark@Sun.COM } 125011042SErik.Nordmark@Sun.COM 125111042SErik.Nordmark@Sun.COM if (ixaflags & IXAF_NEXTHOP_SET) { 125211042SErik.Nordmark@Sun.COM /* IP_NEXTHOP was set */ 125311042SErik.Nordmark@Sun.COM v6nexthop = ixa->ixa_nexthop_v6; 125411042SErik.Nordmark@Sun.COM } else { 125511042SErik.Nordmark@Sun.COM v6nexthop = *v6dst; 125611042SErik.Nordmark@Sun.COM } 125711042SErik.Nordmark@Sun.COM 125811042SErik.Nordmark@Sun.COM ire_type = 0; 125911042SErik.Nordmark@Sun.COM 126011042SErik.Nordmark@Sun.COM /* 126111042SErik.Nordmark@Sun.COM * If SO_DONTROUTE is set or if IP_NEXTHOP is set, then 126211042SErik.Nordmark@Sun.COM * we only look for an onlink IRE. 126311042SErik.Nordmark@Sun.COM */ 126411042SErik.Nordmark@Sun.COM if (ixaflags & (IXAF_DONTROUTE|IXAF_NEXTHOP_SET)) { 126511042SErik.Nordmark@Sun.COM match_args |= MATCH_IRE_TYPE; 126611042SErik.Nordmark@Sun.COM ire_type = IRE_ONLINK; 126711042SErik.Nordmark@Sun.COM } 126811042SErik.Nordmark@Sun.COM 1269*11681SSowmini.Varadhan@Sun.COM retry: 1270*11681SSowmini.Varadhan@Sun.COM if (!isv6) { 127111042SErik.Nordmark@Sun.COM ipaddr_t v4nexthop; 127211042SErik.Nordmark@Sun.COM ipaddr_t v4setsrc = INADDR_ANY; 127311042SErik.Nordmark@Sun.COM 127411042SErik.Nordmark@Sun.COM IN6_V4MAPPED_TO_IPADDR(&v6nexthop, v4nexthop); 127511042SErik.Nordmark@Sun.COM ire = ire_route_recursive_v4(v4nexthop, ire_type, ill, 127611457SErik.Nordmark@Sun.COM ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE, 127711042SErik.Nordmark@Sun.COM ixa->ixa_xmit_hint, ipst, &v4setsrc, NULL, generationp); 127811042SErik.Nordmark@Sun.COM if (setsrcp != NULL) 127911042SErik.Nordmark@Sun.COM IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp); 128011042SErik.Nordmark@Sun.COM } else { 128111042SErik.Nordmark@Sun.COM ire = ire_route_recursive_v6(&v6nexthop, ire_type, ill, 128211457SErik.Nordmark@Sun.COM ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE, 128311042SErik.Nordmark@Sun.COM ixa->ixa_xmit_hint, ipst, setsrcp, NULL, generationp); 128411042SErik.Nordmark@Sun.COM } 128511042SErik.Nordmark@Sun.COM 128611042SErik.Nordmark@Sun.COM #ifdef DEBUG 128711042SErik.Nordmark@Sun.COM if (match_args & MATCH_IRE_TESTHIDDEN) { 128811042SErik.Nordmark@Sun.COM ip3dbg(("looking for hidden; dst %x ire %p\n", 128911042SErik.Nordmark@Sun.COM v4dst, (void *)ire)); 129011042SErik.Nordmark@Sun.COM } 129111042SErik.Nordmark@Sun.COM #endif 1292*11681SSowmini.Varadhan@Sun.COM if (ill != NULL) { 129311042SErik.Nordmark@Sun.COM ill_refrele(ill); 1294*11681SSowmini.Varadhan@Sun.COM ill = NULL; 1295*11681SSowmini.Varadhan@Sun.COM } 129611042SErik.Nordmark@Sun.COM if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || 129711042SErik.Nordmark@Sun.COM (ire->ire_type & IRE_MULTICAST)) { 1298*11681SSowmini.Varadhan@Sun.COM if (preferred_src_aware) { 1299*11681SSowmini.Varadhan@Sun.COM /* 1300*11681SSowmini.Varadhan@Sun.COM * "Preferred Source Aware" send mode. If we cannot 1301*11681SSowmini.Varadhan@Sun.COM * find an ire whose ire_ill had the desired source 1302*11681SSowmini.Varadhan@Sun.COM * address retry after relaxing the ill matching 1303*11681SSowmini.Varadhan@Sun.COM * constraint. 1304*11681SSowmini.Varadhan@Sun.COM */ 1305*11681SSowmini.Varadhan@Sun.COM ire_refrele(ire); 1306*11681SSowmini.Varadhan@Sun.COM preferred_src_aware = B_FALSE; 1307*11681SSowmini.Varadhan@Sun.COM match_args &= ~MATCH_IRE_SRC_ILL; 1308*11681SSowmini.Varadhan@Sun.COM goto retry; 1309*11681SSowmini.Varadhan@Sun.COM } 131011042SErik.Nordmark@Sun.COM /* No ire_nce_cache */ 131111042SErik.Nordmark@Sun.COM return (ire); 131211042SErik.Nordmark@Sun.COM } 131311042SErik.Nordmark@Sun.COM 131411042SErik.Nordmark@Sun.COM /* Setup ire_nce_cache if it doesn't exist or is condemned. */ 131511042SErik.Nordmark@Sun.COM mutex_enter(&ire->ire_lock); 131611042SErik.Nordmark@Sun.COM nce = ire->ire_nce_cache; 131711042SErik.Nordmark@Sun.COM if (nce == NULL || nce->nce_is_condemned) { 131811042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 131911042SErik.Nordmark@Sun.COM (void) ire_revalidate_nce(ire); 132011042SErik.Nordmark@Sun.COM } else { 132111042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 132211042SErik.Nordmark@Sun.COM } 132311042SErik.Nordmark@Sun.COM return (ire); 132411042SErik.Nordmark@Sun.COM } 132511042SErik.Nordmark@Sun.COM 132611042SErik.Nordmark@Sun.COM /* 132711042SErik.Nordmark@Sun.COM * Find a route given some xmit attributes and a packet. 132811042SErik.Nordmark@Sun.COM * Generic for IPv4 and IPv6 132911042SErik.Nordmark@Sun.COM * 133011042SErik.Nordmark@Sun.COM * This never returns NULL. But when it returns the IRE_NOROUTE 133111042SErik.Nordmark@Sun.COM * it might set errorp. 133211042SErik.Nordmark@Sun.COM */ 133311042SErik.Nordmark@Sun.COM ire_t * 133411042SErik.Nordmark@Sun.COM ip_select_route_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp, 133511042SErik.Nordmark@Sun.COM int *errorp, boolean_t *multirtp) 133611042SErik.Nordmark@Sun.COM { 133711042SErik.Nordmark@Sun.COM if (ixa->ixa_flags & IXAF_IS_IPV4) { 133811042SErik.Nordmark@Sun.COM ipha_t *ipha = (ipha_t *)mp->b_rptr; 1339*11681SSowmini.Varadhan@Sun.COM in6_addr_t v6dst, v6src; 134011042SErik.Nordmark@Sun.COM 134111042SErik.Nordmark@Sun.COM IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); 1342*11681SSowmini.Varadhan@Sun.COM IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src); 134311042SErik.Nordmark@Sun.COM 1344*11681SSowmini.Varadhan@Sun.COM return (ip_select_route(&v6dst, v6src, ixa, generationp, 134511042SErik.Nordmark@Sun.COM NULL, errorp, multirtp)); 134611042SErik.Nordmark@Sun.COM } else { 134711042SErik.Nordmark@Sun.COM ip6_t *ip6h = (ip6_t *)mp->b_rptr; 134811042SErik.Nordmark@Sun.COM 1349*11681SSowmini.Varadhan@Sun.COM return (ip_select_route(&ip6h->ip6_dst, ip6h->ip6_src, 1350*11681SSowmini.Varadhan@Sun.COM ixa, generationp, NULL, errorp, multirtp)); 135111042SErik.Nordmark@Sun.COM } 135211042SErik.Nordmark@Sun.COM } 135311042SErik.Nordmark@Sun.COM 135411042SErik.Nordmark@Sun.COM ire_t * 1355*11681SSowmini.Varadhan@Sun.COM ip_select_route_v4(ipaddr_t dst, ipaddr_t src, ip_xmit_attr_t *ixa, 1356*11681SSowmini.Varadhan@Sun.COM uint_t *generationp, ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp) 135711042SErik.Nordmark@Sun.COM { 1358*11681SSowmini.Varadhan@Sun.COM in6_addr_t v6dst, v6src; 135911042SErik.Nordmark@Sun.COM ire_t *ire; 136011042SErik.Nordmark@Sun.COM in6_addr_t setsrc; 136111042SErik.Nordmark@Sun.COM 136211042SErik.Nordmark@Sun.COM ASSERT(ixa->ixa_flags & IXAF_IS_IPV4); 136311042SErik.Nordmark@Sun.COM 136411042SErik.Nordmark@Sun.COM IN6_IPADDR_TO_V4MAPPED(dst, &v6dst); 1365*11681SSowmini.Varadhan@Sun.COM IN6_IPADDR_TO_V4MAPPED(src, &v6src); 136611042SErik.Nordmark@Sun.COM 136711042SErik.Nordmark@Sun.COM setsrc = ipv6_all_zeros; 1368*11681SSowmini.Varadhan@Sun.COM ire = ip_select_route(&v6dst, v6src, ixa, generationp, &setsrc, errorp, 136911042SErik.Nordmark@Sun.COM multirtp); 137011042SErik.Nordmark@Sun.COM if (v4setsrcp != NULL) 137111042SErik.Nordmark@Sun.COM IN6_V4MAPPED_TO_IPADDR(&setsrc, *v4setsrcp); 137211042SErik.Nordmark@Sun.COM return (ire); 137311042SErik.Nordmark@Sun.COM } 137411042SErik.Nordmark@Sun.COM 137511042SErik.Nordmark@Sun.COM /* 137611042SErik.Nordmark@Sun.COM * Recursively look for a route to the destination. Can also match on 137711042SErik.Nordmark@Sun.COM * the zoneid, ill, and label. Used for the data paths. See also 137811042SErik.Nordmark@Sun.COM * ire_route_recursive. 137911042SErik.Nordmark@Sun.COM * 138011457SErik.Nordmark@Sun.COM * If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never 138111457SErik.Nordmark@Sun.COM * create an IRE_IF_CLONE. This is used on the receive side when we are not 138211457SErik.Nordmark@Sun.COM * forwarding. 138311457SErik.Nordmark@Sun.COM * If IRR_INCOMPLETE is set then we return the IRE even if we can't correctly 138411457SErik.Nordmark@Sun.COM * resolve the gateway. 138511457SErik.Nordmark@Sun.COM * 138611042SErik.Nordmark@Sun.COM * Note that this function never returns NULL. It returns an IRE_NOROUTE 138711042SErik.Nordmark@Sun.COM * instead. 138811042SErik.Nordmark@Sun.COM * 138911042SErik.Nordmark@Sun.COM * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it 139011042SErik.Nordmark@Sun.COM * is an error. 139111042SErik.Nordmark@Sun.COM * Allow at most one RTF_INDIRECT. 139211042SErik.Nordmark@Sun.COM */ 139311042SErik.Nordmark@Sun.COM ire_t * 139411042SErik.Nordmark@Sun.COM ire_route_recursive_impl_v4(ire_t *ire, 139511042SErik.Nordmark@Sun.COM ipaddr_t nexthop, uint_t ire_type, const ill_t *ill_arg, 139611042SErik.Nordmark@Sun.COM zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, 139711457SErik.Nordmark@Sun.COM uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp, 139811042SErik.Nordmark@Sun.COM tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) 139911042SErik.Nordmark@Sun.COM { 140011042SErik.Nordmark@Sun.COM int i, j; 140111042SErik.Nordmark@Sun.COM ire_t *ires[MAX_IRE_RECURSION]; 140211042SErik.Nordmark@Sun.COM uint_t generation; 140311042SErik.Nordmark@Sun.COM uint_t generations[MAX_IRE_RECURSION]; 140411042SErik.Nordmark@Sun.COM boolean_t need_refrele = B_FALSE; 140511042SErik.Nordmark@Sun.COM boolean_t invalidate = B_FALSE; 140611042SErik.Nordmark@Sun.COM int prefs[MAX_IRE_RECURSION]; 140711042SErik.Nordmark@Sun.COM ill_t *ill = NULL; 140811042SErik.Nordmark@Sun.COM 140911042SErik.Nordmark@Sun.COM if (setsrcp != NULL) 141011042SErik.Nordmark@Sun.COM ASSERT(*setsrcp == INADDR_ANY); 141111042SErik.Nordmark@Sun.COM if (gwattrp != NULL) 141211042SErik.Nordmark@Sun.COM ASSERT(*gwattrp == NULL); 141311042SErik.Nordmark@Sun.COM 141411042SErik.Nordmark@Sun.COM /* 141511042SErik.Nordmark@Sun.COM * We iterate up to three times to resolve a route, even though 141611042SErik.Nordmark@Sun.COM * we have four slots in the array. The extra slot is for an 141711042SErik.Nordmark@Sun.COM * IRE_IF_CLONE we might need to create. 141811042SErik.Nordmark@Sun.COM */ 141911042SErik.Nordmark@Sun.COM i = 0; 142011042SErik.Nordmark@Sun.COM while (i < MAX_IRE_RECURSION - 1) { 142111042SErik.Nordmark@Sun.COM /* ire_ftable_lookup handles round-robin/ECMP */ 142211042SErik.Nordmark@Sun.COM if (ire == NULL) { 142311042SErik.Nordmark@Sun.COM ire = ire_ftable_lookup_v4(nexthop, 0, 0, ire_type, 1424*11681SSowmini.Varadhan@Sun.COM (ill != NULL? ill : ill_arg), zoneid, tsl, 142511042SErik.Nordmark@Sun.COM match_args, xmit_hint, ipst, &generation); 142611042SErik.Nordmark@Sun.COM } else { 142711042SErik.Nordmark@Sun.COM /* Caller passed it; extra hold since we will rele */ 142811042SErik.Nordmark@Sun.COM ire_refhold(ire); 142911042SErik.Nordmark@Sun.COM if (generationp != NULL) 143011042SErik.Nordmark@Sun.COM generation = *generationp; 143111042SErik.Nordmark@Sun.COM else 143211042SErik.Nordmark@Sun.COM generation = IRE_GENERATION_VERIFY; 143311042SErik.Nordmark@Sun.COM } 143411042SErik.Nordmark@Sun.COM if (ire == NULL) 143511042SErik.Nordmark@Sun.COM ire = ire_reject(ipst, B_FALSE); 143611042SErik.Nordmark@Sun.COM 143711042SErik.Nordmark@Sun.COM /* Need to return the ire with RTF_REJECT|BLACKHOLE */ 143811042SErik.Nordmark@Sun.COM if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) 143911042SErik.Nordmark@Sun.COM goto error; 144011042SErik.Nordmark@Sun.COM 144111042SErik.Nordmark@Sun.COM ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */ 144211042SErik.Nordmark@Sun.COM 144311042SErik.Nordmark@Sun.COM if (i != 0) { 144411131SErik.Nordmark@Sun.COM prefs[i] = ire_pref(ire); 144511042SErik.Nordmark@Sun.COM /* 144611042SErik.Nordmark@Sun.COM * Don't allow anything unusual past the first 144711042SErik.Nordmark@Sun.COM * iteration. 144811042SErik.Nordmark@Sun.COM */ 144911042SErik.Nordmark@Sun.COM if ((ire->ire_type & 145011042SErik.Nordmark@Sun.COM (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) || 145111042SErik.Nordmark@Sun.COM prefs[i] <= prefs[i-1]) { 145211042SErik.Nordmark@Sun.COM ire_refrele(ire); 145311457SErik.Nordmark@Sun.COM if (irr_flags & IRR_INCOMPLETE) { 145411457SErik.Nordmark@Sun.COM ire = ires[0]; 145511457SErik.Nordmark@Sun.COM ire_refhold(ire); 145611457SErik.Nordmark@Sun.COM } else { 145711457SErik.Nordmark@Sun.COM ire = ire_reject(ipst, B_FALSE); 145811457SErik.Nordmark@Sun.COM } 145911042SErik.Nordmark@Sun.COM goto error; 146011042SErik.Nordmark@Sun.COM } 146111042SErik.Nordmark@Sun.COM } 146211042SErik.Nordmark@Sun.COM /* We have a usable IRE */ 146311042SErik.Nordmark@Sun.COM ires[i] = ire; 146411042SErik.Nordmark@Sun.COM generations[i] = generation; 146511042SErik.Nordmark@Sun.COM i++; 146611042SErik.Nordmark@Sun.COM 146711042SErik.Nordmark@Sun.COM /* The first RTF_SETSRC address is passed back if setsrcp */ 146811042SErik.Nordmark@Sun.COM if ((ire->ire_flags & RTF_SETSRC) && 146911042SErik.Nordmark@Sun.COM setsrcp != NULL && *setsrcp == INADDR_ANY) { 147011042SErik.Nordmark@Sun.COM ASSERT(ire->ire_setsrc_addr != INADDR_ANY); 147111042SErik.Nordmark@Sun.COM *setsrcp = ire->ire_setsrc_addr; 147211042SErik.Nordmark@Sun.COM } 147311042SErik.Nordmark@Sun.COM 147411042SErik.Nordmark@Sun.COM /* The first ire_gw_secattr is passed back if gwattrp */ 147511042SErik.Nordmark@Sun.COM if (ire->ire_gw_secattr != NULL && 147611042SErik.Nordmark@Sun.COM gwattrp != NULL && *gwattrp == NULL) 147711042SErik.Nordmark@Sun.COM *gwattrp = ire->ire_gw_secattr; 147811042SErik.Nordmark@Sun.COM 147911042SErik.Nordmark@Sun.COM /* 148011042SErik.Nordmark@Sun.COM * Check if we have a short-cut pointer to an IRE for this 148111042SErik.Nordmark@Sun.COM * destination, and that the cached dependency isn't stale. 148211042SErik.Nordmark@Sun.COM * In that case we've rejoined an existing tree towards a 148311042SErik.Nordmark@Sun.COM * parent, thus we don't need to continue the loop to 148411042SErik.Nordmark@Sun.COM * discover the rest of the tree. 148511042SErik.Nordmark@Sun.COM */ 148611042SErik.Nordmark@Sun.COM mutex_enter(&ire->ire_lock); 148711042SErik.Nordmark@Sun.COM if (ire->ire_dep_parent != NULL && 148811042SErik.Nordmark@Sun.COM ire->ire_dep_parent->ire_generation == 148911042SErik.Nordmark@Sun.COM ire->ire_dep_parent_generation) { 149011042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 149111042SErik.Nordmark@Sun.COM ire = NULL; 149211042SErik.Nordmark@Sun.COM goto done; 149311042SErik.Nordmark@Sun.COM } 149411042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 149511042SErik.Nordmark@Sun.COM 149611042SErik.Nordmark@Sun.COM /* 149711042SErik.Nordmark@Sun.COM * If this type should have an ire_nce_cache (even if it 149811042SErik.Nordmark@Sun.COM * doesn't yet have one) then we are done. Includes 149911042SErik.Nordmark@Sun.COM * IRE_INTERFACE with a full 32 bit mask. 150011042SErik.Nordmark@Sun.COM */ 150111042SErik.Nordmark@Sun.COM if (ire->ire_nce_capable) { 150211042SErik.Nordmark@Sun.COM ire = NULL; 150311042SErik.Nordmark@Sun.COM goto done; 150411042SErik.Nordmark@Sun.COM } 150511042SErik.Nordmark@Sun.COM ASSERT(!(ire->ire_type & IRE_IF_CLONE)); 150611042SErik.Nordmark@Sun.COM /* 150711042SErik.Nordmark@Sun.COM * For an IRE_INTERFACE we create an IRE_IF_CLONE for this 150811042SErik.Nordmark@Sun.COM * particular destination 150911042SErik.Nordmark@Sun.COM */ 151011042SErik.Nordmark@Sun.COM if (ire->ire_type & IRE_INTERFACE) { 151111042SErik.Nordmark@Sun.COM in6_addr_t v6nexthop; 151211042SErik.Nordmark@Sun.COM ire_t *clone; 151311042SErik.Nordmark@Sun.COM 151411042SErik.Nordmark@Sun.COM ASSERT(ire->ire_masklen != IPV4_ABITS); 151511042SErik.Nordmark@Sun.COM 151611042SErik.Nordmark@Sun.COM /* 151711042SErik.Nordmark@Sun.COM * In the case of ip_input and ILLF_FORWARDING not 151811457SErik.Nordmark@Sun.COM * being set, and in the case of RTM_GET, there is 151911457SErik.Nordmark@Sun.COM * no point in allocating an IRE_IF_CLONE. We return 152011457SErik.Nordmark@Sun.COM * the IRE_INTERFACE. Note that !IRR_ALLOCATE can 152111457SErik.Nordmark@Sun.COM * result in a ire_dep_parent which is IRE_IF_* 152211457SErik.Nordmark@Sun.COM * without an IRE_IF_CLONE. 152311042SErik.Nordmark@Sun.COM * We recover from that when we need to send packets 152411042SErik.Nordmark@Sun.COM * by ensuring that the generations become 152511042SErik.Nordmark@Sun.COM * IRE_GENERATION_VERIFY in this case. 152611042SErik.Nordmark@Sun.COM */ 152711457SErik.Nordmark@Sun.COM if (!(irr_flags & IRR_ALLOCATE)) { 152811042SErik.Nordmark@Sun.COM invalidate = B_TRUE; 152911042SErik.Nordmark@Sun.COM ire = NULL; 153011042SErik.Nordmark@Sun.COM goto done; 153111042SErik.Nordmark@Sun.COM } 153211042SErik.Nordmark@Sun.COM 153311042SErik.Nordmark@Sun.COM IN6_IPADDR_TO_V4MAPPED(nexthop, &v6nexthop); 153411042SErik.Nordmark@Sun.COM 153511042SErik.Nordmark@Sun.COM clone = ire_create_if_clone(ire, &v6nexthop, 153611042SErik.Nordmark@Sun.COM &generation); 153711042SErik.Nordmark@Sun.COM if (clone == NULL) { 153811042SErik.Nordmark@Sun.COM /* 153911042SErik.Nordmark@Sun.COM * Temporary failure - no memory. 154011042SErik.Nordmark@Sun.COM * Don't want caller to cache IRE_NOROUTE. 154111042SErik.Nordmark@Sun.COM */ 154211042SErik.Nordmark@Sun.COM invalidate = B_TRUE; 154311042SErik.Nordmark@Sun.COM ire = ire_blackhole(ipst, B_FALSE); 154411042SErik.Nordmark@Sun.COM goto error; 154511042SErik.Nordmark@Sun.COM } 154611042SErik.Nordmark@Sun.COM /* 154711042SErik.Nordmark@Sun.COM * Make clone next to last entry and the 154811042SErik.Nordmark@Sun.COM * IRE_INTERFACE the last in the dependency 154911042SErik.Nordmark@Sun.COM * chain since the clone depends on the 155011042SErik.Nordmark@Sun.COM * IRE_INTERFACE. 155111042SErik.Nordmark@Sun.COM */ 155211042SErik.Nordmark@Sun.COM ASSERT(i >= 1); 155311042SErik.Nordmark@Sun.COM ASSERT(i < MAX_IRE_RECURSION); 155411042SErik.Nordmark@Sun.COM 155511042SErik.Nordmark@Sun.COM ires[i] = ires[i-1]; 155611042SErik.Nordmark@Sun.COM generations[i] = generations[i-1]; 155711042SErik.Nordmark@Sun.COM ires[i-1] = clone; 155811042SErik.Nordmark@Sun.COM generations[i-1] = generation; 155911042SErik.Nordmark@Sun.COM i++; 156011042SErik.Nordmark@Sun.COM 156111042SErik.Nordmark@Sun.COM ire = NULL; 156211042SErik.Nordmark@Sun.COM goto done; 156311042SErik.Nordmark@Sun.COM } 156411042SErik.Nordmark@Sun.COM 156511042SErik.Nordmark@Sun.COM /* 156611042SErik.Nordmark@Sun.COM * We only match on the type and optionally ILL when 156711042SErik.Nordmark@Sun.COM * recursing. The type match is used by some callers 156811042SErik.Nordmark@Sun.COM * to exclude certain types (such as IRE_IF_CLONE or 156911042SErik.Nordmark@Sun.COM * IRE_LOCAL|IRE_LOOPBACK). 1570*11681SSowmini.Varadhan@Sun.COM * 1571*11681SSowmini.Varadhan@Sun.COM * In the MATCH_IRE_SRC_ILL case, ill_arg may be the 'srcof' 1572*11681SSowmini.Varadhan@Sun.COM * ire->ire_ill, and we want to find the IRE_INTERFACE for 1573*11681SSowmini.Varadhan@Sun.COM * ire_ill, so we set ill to the ire_ill; 157411042SErik.Nordmark@Sun.COM */ 157511042SErik.Nordmark@Sun.COM match_args &= MATCH_IRE_TYPE; 157611042SErik.Nordmark@Sun.COM nexthop = ire->ire_gateway_addr; 157711042SErik.Nordmark@Sun.COM if (ill == NULL && ire->ire_ill != NULL) { 157811042SErik.Nordmark@Sun.COM ill = ire->ire_ill; 157911042SErik.Nordmark@Sun.COM need_refrele = B_TRUE; 158011042SErik.Nordmark@Sun.COM ill_refhold(ill); 158111042SErik.Nordmark@Sun.COM match_args |= MATCH_IRE_ILL; 158211042SErik.Nordmark@Sun.COM } 158311131SErik.Nordmark@Sun.COM /* 158411131SErik.Nordmark@Sun.COM * We set the prefs[i] value above if i > 0. We've already 158511131SErik.Nordmark@Sun.COM * done i++ so i is one in the case of the first time around. 158611131SErik.Nordmark@Sun.COM */ 158711131SErik.Nordmark@Sun.COM if (i == 1) 158811131SErik.Nordmark@Sun.COM prefs[0] = ire_pref(ire); 158911042SErik.Nordmark@Sun.COM ire = NULL; 159011042SErik.Nordmark@Sun.COM } 159111042SErik.Nordmark@Sun.COM ASSERT(ire == NULL); 159211042SErik.Nordmark@Sun.COM ire = ire_reject(ipst, B_FALSE); 159311042SErik.Nordmark@Sun.COM 159411042SErik.Nordmark@Sun.COM error: 159511042SErik.Nordmark@Sun.COM ASSERT(ire != NULL); 159611042SErik.Nordmark@Sun.COM if (need_refrele) 159711042SErik.Nordmark@Sun.COM ill_refrele(ill); 159811042SErik.Nordmark@Sun.COM 159911042SErik.Nordmark@Sun.COM /* 160011042SErik.Nordmark@Sun.COM * In the case of MULTIRT we want to try a different IRE the next 160111042SErik.Nordmark@Sun.COM * time. We let the next packet retry in that case. 160211042SErik.Nordmark@Sun.COM */ 160311042SErik.Nordmark@Sun.COM if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT)) 160411042SErik.Nordmark@Sun.COM (void) ire_no_good(ires[0]); 160511042SErik.Nordmark@Sun.COM 160611042SErik.Nordmark@Sun.COM cleanup: 160711042SErik.Nordmark@Sun.COM /* cleanup ires[i] */ 160811042SErik.Nordmark@Sun.COM ire_dep_unbuild(ires, i); 160911042SErik.Nordmark@Sun.COM for (j = 0; j < i; j++) 161011042SErik.Nordmark@Sun.COM ire_refrele(ires[j]); 161111042SErik.Nordmark@Sun.COM 161211457SErik.Nordmark@Sun.COM ASSERT((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || 161311457SErik.Nordmark@Sun.COM (irr_flags & IRR_INCOMPLETE)); 161411042SErik.Nordmark@Sun.COM /* 161511042SErik.Nordmark@Sun.COM * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the 161611042SErik.Nordmark@Sun.COM * ip_select_route since the reject or lack of memory might be gone. 161711042SErik.Nordmark@Sun.COM */ 161811042SErik.Nordmark@Sun.COM if (generationp != NULL) 161911042SErik.Nordmark@Sun.COM *generationp = IRE_GENERATION_VERIFY; 162011042SErik.Nordmark@Sun.COM return (ire); 162111042SErik.Nordmark@Sun.COM 162211042SErik.Nordmark@Sun.COM done: 162311042SErik.Nordmark@Sun.COM ASSERT(ire == NULL); 162411042SErik.Nordmark@Sun.COM if (need_refrele) { 162511042SErik.Nordmark@Sun.COM ill_refrele(ill); 162611042SErik.Nordmark@Sun.COM ill = NULL; 162711042SErik.Nordmark@Sun.COM } 162811042SErik.Nordmark@Sun.COM 162911042SErik.Nordmark@Sun.COM /* Build dependencies */ 163011131SErik.Nordmark@Sun.COM if (i > 1 && !ire_dep_build(ires, generations, i)) { 163111042SErik.Nordmark@Sun.COM /* Something in chain was condemned; tear it apart */ 163211042SErik.Nordmark@Sun.COM ire = ire_reject(ipst, B_FALSE); 163311042SErik.Nordmark@Sun.COM goto cleanup; 163411042SErik.Nordmark@Sun.COM } 163511042SErik.Nordmark@Sun.COM 163611042SErik.Nordmark@Sun.COM /* 163711042SErik.Nordmark@Sun.COM * Release all refholds except the one for ires[0] that we 163811042SErik.Nordmark@Sun.COM * will return to the caller. 163911042SErik.Nordmark@Sun.COM */ 164011042SErik.Nordmark@Sun.COM for (j = 1; j < i; j++) 164111042SErik.Nordmark@Sun.COM ire_refrele(ires[j]); 164211042SErik.Nordmark@Sun.COM 164311042SErik.Nordmark@Sun.COM if (invalidate) { 164411042SErik.Nordmark@Sun.COM /* 164511042SErik.Nordmark@Sun.COM * Since we needed to allocate but couldn't we need to make 164611042SErik.Nordmark@Sun.COM * sure that the dependency chain is rebuilt the next time. 164711042SErik.Nordmark@Sun.COM */ 164811042SErik.Nordmark@Sun.COM ire_dep_invalidate_generations(ires[0]); 164911042SErik.Nordmark@Sun.COM generation = IRE_GENERATION_VERIFY; 165011042SErik.Nordmark@Sun.COM } else { 165111042SErik.Nordmark@Sun.COM /* 165211042SErik.Nordmark@Sun.COM * IREs can have been added or deleted while we did the 165311042SErik.Nordmark@Sun.COM * recursive lookup and we can't catch those until we've built 165411042SErik.Nordmark@Sun.COM * the dependencies. We verify the stored 165511042SErik.Nordmark@Sun.COM * ire_dep_parent_generation to catch any such changes and 165611042SErik.Nordmark@Sun.COM * return IRE_GENERATION_VERIFY (which will cause 165711042SErik.Nordmark@Sun.COM * ip_select_route to be called again so we can redo the 165811042SErik.Nordmark@Sun.COM * recursive lookup next time we send a packet. 165911042SErik.Nordmark@Sun.COM */ 166011131SErik.Nordmark@Sun.COM if (ires[0]->ire_dep_parent == NULL) 166111131SErik.Nordmark@Sun.COM generation = ires[0]->ire_generation; 166211131SErik.Nordmark@Sun.COM else 166311131SErik.Nordmark@Sun.COM generation = ire_dep_validate_generations(ires[0]); 166411042SErik.Nordmark@Sun.COM if (generations[0] != ires[0]->ire_generation) { 166511042SErik.Nordmark@Sun.COM /* Something changed at the top */ 166611042SErik.Nordmark@Sun.COM generation = IRE_GENERATION_VERIFY; 166711042SErik.Nordmark@Sun.COM } 166811042SErik.Nordmark@Sun.COM } 166911042SErik.Nordmark@Sun.COM if (generationp != NULL) 167011042SErik.Nordmark@Sun.COM *generationp = generation; 167111042SErik.Nordmark@Sun.COM 167211042SErik.Nordmark@Sun.COM return (ires[0]); 167311042SErik.Nordmark@Sun.COM } 167411042SErik.Nordmark@Sun.COM 167511042SErik.Nordmark@Sun.COM ire_t * 167611042SErik.Nordmark@Sun.COM ire_route_recursive_v4(ipaddr_t nexthop, uint_t ire_type, const ill_t *ill, 167711042SErik.Nordmark@Sun.COM zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, 167811457SErik.Nordmark@Sun.COM uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp, 167911042SErik.Nordmark@Sun.COM tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) 168011042SErik.Nordmark@Sun.COM { 168111042SErik.Nordmark@Sun.COM return (ire_route_recursive_impl_v4(NULL, nexthop, ire_type, ill, 168211457SErik.Nordmark@Sun.COM zoneid, tsl, match_args, irr_flags, xmit_hint, ipst, setsrcp, 168311042SErik.Nordmark@Sun.COM gwattrp, generationp)); 168411042SErik.Nordmark@Sun.COM } 168511042SErik.Nordmark@Sun.COM 168611042SErik.Nordmark@Sun.COM /* 168711042SErik.Nordmark@Sun.COM * Recursively look for a route to the destination. 168811042SErik.Nordmark@Sun.COM * We only handle a destination match here, yet we have the same arguments 168911042SErik.Nordmark@Sun.COM * as the full match to allow function pointers to select between the two. 169011042SErik.Nordmark@Sun.COM * 169111042SErik.Nordmark@Sun.COM * Note that this function never returns NULL. It returns an IRE_NOROUTE 169211042SErik.Nordmark@Sun.COM * instead. 169311042SErik.Nordmark@Sun.COM * 169411042SErik.Nordmark@Sun.COM * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it 169511042SErik.Nordmark@Sun.COM * is an error. 169611042SErik.Nordmark@Sun.COM * Allow at most one RTF_INDIRECT. 169711042SErik.Nordmark@Sun.COM */ 169811042SErik.Nordmark@Sun.COM ire_t * 169911457SErik.Nordmark@Sun.COM ire_route_recursive_dstonly_v4(ipaddr_t nexthop, uint_t irr_flags, 170011042SErik.Nordmark@Sun.COM uint32_t xmit_hint, ip_stack_t *ipst) 170111042SErik.Nordmark@Sun.COM { 170211042SErik.Nordmark@Sun.COM ire_t *ire; 170311042SErik.Nordmark@Sun.COM ire_t *ire1; 170411042SErik.Nordmark@Sun.COM uint_t generation; 170511042SErik.Nordmark@Sun.COM 170611042SErik.Nordmark@Sun.COM /* ire_ftable_lookup handles round-robin/ECMP */ 170711042SErik.Nordmark@Sun.COM ire = ire_ftable_lookup_simple_v4(nexthop, xmit_hint, ipst, 170811042SErik.Nordmark@Sun.COM &generation); 170911042SErik.Nordmark@Sun.COM ASSERT(ire != NULL); 171011042SErik.Nordmark@Sun.COM 171111042SErik.Nordmark@Sun.COM /* 171211042SErik.Nordmark@Sun.COM * If this type should have an ire_nce_cache (even if it 171311042SErik.Nordmark@Sun.COM * doesn't yet have one) then we are done. Includes 171411042SErik.Nordmark@Sun.COM * IRE_INTERFACE with a full 32 bit mask. 171511042SErik.Nordmark@Sun.COM */ 171611042SErik.Nordmark@Sun.COM if (ire->ire_nce_capable) 171711042SErik.Nordmark@Sun.COM return (ire); 171811042SErik.Nordmark@Sun.COM 171911042SErik.Nordmark@Sun.COM /* 172011042SErik.Nordmark@Sun.COM * If the IRE has a current cached parent we know that the whole 172111042SErik.Nordmark@Sun.COM * parent chain is current, hence we don't need to discover and 172211042SErik.Nordmark@Sun.COM * build any dependencies by doing a recursive lookup. 172311042SErik.Nordmark@Sun.COM */ 172411042SErik.Nordmark@Sun.COM mutex_enter(&ire->ire_lock); 172511042SErik.Nordmark@Sun.COM if (ire->ire_dep_parent != NULL && 172611042SErik.Nordmark@Sun.COM ire->ire_dep_parent->ire_generation == 172711042SErik.Nordmark@Sun.COM ire->ire_dep_parent_generation) { 172811042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 172911042SErik.Nordmark@Sun.COM return (ire); 173011042SErik.Nordmark@Sun.COM } 173111042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 173211042SErik.Nordmark@Sun.COM 173311042SErik.Nordmark@Sun.COM /* 173411042SErik.Nordmark@Sun.COM * Fallback to loop in the normal code starting with the ire 173511042SErik.Nordmark@Sun.COM * we found. Normally this would return the same ire. 173611042SErik.Nordmark@Sun.COM */ 173711042SErik.Nordmark@Sun.COM ire1 = ire_route_recursive_impl_v4(ire, nexthop, 0, NULL, ALL_ZONES, 173811457SErik.Nordmark@Sun.COM NULL, MATCH_IRE_DSTONLY, irr_flags, xmit_hint, ipst, NULL, NULL, 173911042SErik.Nordmark@Sun.COM &generation); 174011042SErik.Nordmark@Sun.COM ire_refrele(ire); 174111042SErik.Nordmark@Sun.COM return (ire1); 174211042SErik.Nordmark@Sun.COM } 1743