12535Ssangeeta /*
22535Ssangeeta * CDDL HEADER START
32535Ssangeeta *
42535Ssangeeta * The contents of this file are subject to the terms of the
52535Ssangeeta * Common Development and Distribution License (the "License").
62535Ssangeeta * You may not use this file except in compliance with the License.
72535Ssangeeta *
82535Ssangeeta * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
92535Ssangeeta * or http://www.opensolaris.org/os/licensing.
102535Ssangeeta * See the License for the specific language governing permissions
112535Ssangeeta * and limitations under the License.
122535Ssangeeta *
132535Ssangeeta * When distributing Covered Code, include this CDDL HEADER in each
142535Ssangeeta * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
152535Ssangeeta * If applicable, add the following below this CDDL HEADER, with the
162535Ssangeeta * fields enclosed by brackets "[]" replaced with your own identifying
172535Ssangeeta * information: Portions Copyright [yyyy] [name of copyright owner]
182535Ssangeeta *
192535Ssangeeta * CDDL HEADER END
202535Ssangeeta */
212535Ssangeeta /*
22*12985SSowmini.Varadhan@oracle.COM * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
232535Ssangeeta */
242535Ssangeeta
252535Ssangeeta /*
262535Ssangeeta * This file contains consumer routines of the IPv4 forwarding engine
272535Ssangeeta */
282535Ssangeeta
292535Ssangeeta #include <sys/types.h>
302535Ssangeeta #include <sys/stream.h>
312535Ssangeeta #include <sys/stropts.h>
322535Ssangeeta #include <sys/strlog.h>
332535Ssangeeta #include <sys/dlpi.h>
342535Ssangeeta #include <sys/ddi.h>
352535Ssangeeta #include <sys/cmn_err.h>
362535Ssangeeta #include <sys/policy.h>
372535Ssangeeta
382535Ssangeeta #include <sys/systm.h>
392535Ssangeeta #include <sys/strsun.h>
402535Ssangeeta #include <sys/kmem.h>
412535Ssangeeta #include <sys/param.h>
422535Ssangeeta #include <sys/socket.h>
434482Sdr146992 #include <sys/strsubr.h>
442535Ssangeeta #include <net/if.h>
452535Ssangeeta #include <net/route.h>
462535Ssangeeta #include <netinet/in.h>
472535Ssangeeta #include <net/if_dl.h>
482535Ssangeeta #include <netinet/ip6.h>
492535Ssangeeta #include <netinet/icmp6.h>
502535Ssangeeta
5111042SErik.Nordmark@Sun.COM #include <inet/ipsec_impl.h>
522535Ssangeeta #include <inet/common.h>
532535Ssangeeta #include <inet/mi.h>
542535Ssangeeta #include <inet/mib2.h>
552535Ssangeeta #include <inet/ip.h>
564482Sdr146992 #include <inet/ip_impl.h>
572535Ssangeeta #include <inet/ip6.h>
582535Ssangeeta #include <inet/ip_ndp.h>
592535Ssangeeta #include <inet/arp.h>
602535Ssangeeta #include <inet/ip_if.h>
612535Ssangeeta #include <inet/ip_ire.h>
622535Ssangeeta #include <inet/ip_ftable.h>
632535Ssangeeta #include <inet/ip_rts.h>
642535Ssangeeta #include <inet/nd.h>
652535Ssangeeta
662535Ssangeeta #include <net/pfkeyv2.h>
672535Ssangeeta #include <inet/sadb.h>
682535Ssangeeta #include <inet/tcp.h>
692535Ssangeeta #include <inet/ipclassifier.h>
702535Ssangeeta #include <sys/zone.h>
712535Ssangeeta #include <net/radix.h>
722535Ssangeeta #include <sys/tsol/label.h>
732535Ssangeeta #include <sys/tsol/tnet.h>
742535Ssangeeta
/*
 * Evaluates to non-zero when the ire counts as a default route: either its
 * ire_type has IRE_DEFAULT set, or it has an IRE_INTERFACE bit set and its
 * destination address (ire_addr) is 0.
 * The argument is evaluated more than once; do not pass an expression with
 * side effects.
 */
#define	IS_DEFAULT_ROUTE(ire)						\
	((((ire)->ire_type & IRE_DEFAULT) != 0) ||			\
	((((ire)->ire_type & IRE_INTERFACE) != 0) &&			\
	((ire)->ire_addr == 0)))
782535Ssangeeta
/*
 * Select the strict source multihoming setting of the given IP stack:
 * ips_ipv6_strict_src_multihoming when isv6 is non-zero, otherwise
 * ips_ip_strict_src_multihoming.
 *
 * Both arguments are parenthesized so that callers may pass arbitrary
 * expressions (e.g. "&stack" or "base + i") without precedence surprises.
 */
#define	IP_SRC_MULTIHOMING(isv6, ipst)					\
	((isv6) ? (ipst)->ips_ipv6_strict_src_multihoming :		\
	(ipst)->ips_ip_strict_src_multihoming)
8211681SSowmini.Varadhan@Sun.COM
833448Sdh155122 static ire_t *route_to_dst(const struct sockaddr *, zoneid_t, ip_stack_t *);
8411042SErik.Nordmark@Sun.COM static void ire_del_host_redir(ire_t *, char *);
8511042SErik.Nordmark@Sun.COM static boolean_t ire_find_best_route(struct radix_node *, void *);
862535Ssangeeta
872535Ssangeeta /*
882535Ssangeeta * Lookup a route in forwarding table. A specific lookup is indicated by
892535Ssangeeta * passing the required parameters and indicating the match required in the
902535Ssangeeta * flag field.
912535Ssangeeta *
922535Ssangeeta * Supports IP_BOUND_IF by following the ipif/ill when recursing.
932535Ssangeeta */
942535Ssangeeta ire_t *
ire_ftable_lookup_v4(ipaddr_t addr,ipaddr_t mask,ipaddr_t gateway,int type,const ill_t * ill,zoneid_t zoneid,const ts_label_t * tsl,int flags,uint32_t xmit_hint,ip_stack_t * ipst,uint_t * generationp)9511042SErik.Nordmark@Sun.COM ire_ftable_lookup_v4(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
9611042SErik.Nordmark@Sun.COM int type, const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl,
9711042SErik.Nordmark@Sun.COM int flags, uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp)
982535Ssangeeta {
9911042SErik.Nordmark@Sun.COM ire_t *ire;
1002535Ssangeeta struct rt_sockaddr rdst, rmask;
1012535Ssangeeta struct rt_entry *rt;
1022535Ssangeeta ire_ftable_args_t margs;
1032535Ssangeeta
10411042SErik.Nordmark@Sun.COM ASSERT(ill == NULL || !ill->ill_isv6);
1052535Ssangeeta
1062535Ssangeeta /*
10711042SErik.Nordmark@Sun.COM * ire_match_args() will dereference ill if MATCH_IRE_ILL
10811042SErik.Nordmark@Sun.COM * is set.
1092535Ssangeeta */
11011681SSowmini.Varadhan@Sun.COM if ((flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL)) && (ill == NULL))
1112535Ssangeeta return (NULL);
1122535Ssangeeta
11311131SErik.Nordmark@Sun.COM bzero(&rdst, sizeof (rdst));
1142535Ssangeeta rdst.rt_sin_len = sizeof (rdst);
1152535Ssangeeta rdst.rt_sin_family = AF_INET;
1162535Ssangeeta rdst.rt_sin_addr.s_addr = addr;
1172535Ssangeeta
11811131SErik.Nordmark@Sun.COM bzero(&rmask, sizeof (rmask));
1192535Ssangeeta rmask.rt_sin_len = sizeof (rmask);
1202535Ssangeeta rmask.rt_sin_family = AF_INET;
1212535Ssangeeta rmask.rt_sin_addr.s_addr = mask;
1222535Ssangeeta
12311131SErik.Nordmark@Sun.COM bzero(&margs, sizeof (margs));
1242535Ssangeeta margs.ift_addr = addr;
1252535Ssangeeta margs.ift_mask = mask;
1262535Ssangeeta margs.ift_gateway = gateway;
1272535Ssangeeta margs.ift_type = type;
12811042SErik.Nordmark@Sun.COM margs.ift_ill = ill;
1292535Ssangeeta margs.ift_zoneid = zoneid;
1302535Ssangeeta margs.ift_tsl = tsl;
1312535Ssangeeta margs.ift_flags = flags;
1322535Ssangeeta
1332535Ssangeeta /*
1342535Ssangeeta * The flags argument passed to ire_ftable_lookup may cause the
1352535Ssangeeta * search to return, not the longest matching prefix, but the
1362535Ssangeeta * "best matching prefix", i.e., the longest prefix that also
1372535Ssangeeta * satisfies constraints imposed via the permutation of flags
1382535Ssangeeta * passed in. To achieve this, we invoke ire_match_args() on
1392535Ssangeeta * each matching leaf in the radix tree. ire_match_args is
1402535Ssangeeta * invoked by the callback function ire_find_best_route()
1412535Ssangeeta * We hold the global tree lock in read mode when calling
14211042SErik.Nordmark@Sun.COM * rn_match_args. Before dropping the global tree lock, ensure
1432535Ssangeeta * that the radix node can't be deleted by incrementing ire_refcnt.
1442535Ssangeeta */
1453448Sdh155122 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
1463448Sdh155122 rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
1473448Sdh155122 ipst->ips_ip_ftable, ire_find_best_route, &margs);
1482535Ssangeeta ire = margs.ift_best_ire;
1492535Ssangeeta if (rt == NULL) {
15011042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
1512535Ssangeeta return (NULL);
1522535Ssangeeta }
15311042SErik.Nordmark@Sun.COM ASSERT(ire != NULL);
1542535Ssangeeta
1552535Ssangeeta DTRACE_PROBE2(ire__found, ire_ftable_args_t *, &margs, ire_t *, ire);
1562535Ssangeeta
1572535Ssangeeta /*
1582535Ssangeeta * round-robin only if we have more than one route in the bucket.
15911042SErik.Nordmark@Sun.COM * ips_ip_ecmp_behavior controls when we do ECMP
16011042SErik.Nordmark@Sun.COM * 2: always
16111042SErik.Nordmark@Sun.COM * 1: for IRE_DEFAULT and /0 IRE_INTERFACE
16211042SErik.Nordmark@Sun.COM * 0: never
1632535Ssangeeta */
16411042SErik.Nordmark@Sun.COM if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) {
16511042SErik.Nordmark@Sun.COM if (ipst->ips_ip_ecmp_behavior == 2 ||
16611042SErik.Nordmark@Sun.COM (ipst->ips_ip_ecmp_behavior == 1 &&
16711042SErik.Nordmark@Sun.COM IS_DEFAULT_ROUTE(ire))) {
16811042SErik.Nordmark@Sun.COM ire_t *next_ire;
1692535Ssangeeta
17011042SErik.Nordmark@Sun.COM margs.ift_best_ire = NULL;
17111042SErik.Nordmark@Sun.COM next_ire = ire_round_robin(ire->ire_bucket, &margs,
17211042SErik.Nordmark@Sun.COM xmit_hint, ire, ipst);
17311042SErik.Nordmark@Sun.COM if (next_ire == NULL) {
17411042SErik.Nordmark@Sun.COM /* keep ire if next_ire is null */
17511042SErik.Nordmark@Sun.COM goto done;
17611042SErik.Nordmark@Sun.COM }
17711042SErik.Nordmark@Sun.COM ire_refrele(ire);
1782535Ssangeeta ire = next_ire;
1792535Ssangeeta }
1802535Ssangeeta }
1812535Ssangeeta
18211042SErik.Nordmark@Sun.COM done:
18311042SErik.Nordmark@Sun.COM /* Return generation before dropping lock */
18411042SErik.Nordmark@Sun.COM if (generationp != NULL)
18511042SErik.Nordmark@Sun.COM *generationp = ire->ire_generation;
1862535Ssangeeta
18711042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
1888485SPeter.Memishian@Sun.COM
18911042SErik.Nordmark@Sun.COM /*
19011042SErik.Nordmark@Sun.COM * For shared-IP zones we need additional checks to what was
19111042SErik.Nordmark@Sun.COM * done in ire_match_args to make sure IRE_LOCALs are handled.
19211042SErik.Nordmark@Sun.COM *
19311042SErik.Nordmark@Sun.COM * When ip_restrict_interzone_loopback is set, then
19411042SErik.Nordmark@Sun.COM * we ensure that IRE_LOCAL are only used for loopback
19511042SErik.Nordmark@Sun.COM * between zones when the logical "Ethernet" would
19611042SErik.Nordmark@Sun.COM * have looped them back. That is, if in the absense of
19711042SErik.Nordmark@Sun.COM * the IRE_LOCAL we would have sent to packet out the
19811042SErik.Nordmark@Sun.COM * same ill.
19911042SErik.Nordmark@Sun.COM */
20011042SErik.Nordmark@Sun.COM if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES &&
20111042SErik.Nordmark@Sun.COM ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES &&
20211042SErik.Nordmark@Sun.COM ipst->ips_ip_restrict_interzone_loopback) {
20311042SErik.Nordmark@Sun.COM ire = ire_alt_local(ire, zoneid, tsl, ill, generationp);
20411042SErik.Nordmark@Sun.COM ASSERT(ire != NULL);
2052535Ssangeeta }
2062535Ssangeeta return (ire);
2072535Ssangeeta }
2082535Ssangeeta
2098275SEric Cheng /*
2108275SEric Cheng * This function is called by
21111042SErik.Nordmark@Sun.COM * ip_input/ire_route_recursive when doing a route lookup on only the
21211042SErik.Nordmark@Sun.COM * destination address.
21311042SErik.Nordmark@Sun.COM *
2148275SEric Cheng * The optimizations of this function over ire_ftable_lookup are:
2158275SEric Cheng * o removing unnecessary flag matching
2168275SEric Cheng * o doing longest prefix match instead of overloading it further
2178275SEric Cheng * with the unnecessary "best_prefix_match"
21811042SErik.Nordmark@Sun.COM *
21911042SErik.Nordmark@Sun.COM * If no route is found we return IRE_NOROUTE.
2208275SEric Cheng */
22111042SErik.Nordmark@Sun.COM ire_t *
ire_ftable_lookup_simple_v4(ipaddr_t addr,uint32_t xmit_hint,ip_stack_t * ipst,uint_t * generationp)22211042SErik.Nordmark@Sun.COM ire_ftable_lookup_simple_v4(ipaddr_t addr, uint32_t xmit_hint, ip_stack_t *ipst,
22311042SErik.Nordmark@Sun.COM uint_t *generationp)
2248275SEric Cheng {
22511042SErik.Nordmark@Sun.COM ire_t *ire;
2268275SEric Cheng struct rt_sockaddr rdst;
2278275SEric Cheng struct rt_entry *rt;
22811042SErik.Nordmark@Sun.COM irb_t *irb;
2298275SEric Cheng
2308275SEric Cheng rdst.rt_sin_len = sizeof (rdst);
2318275SEric Cheng rdst.rt_sin_family = AF_INET;
2328275SEric Cheng rdst.rt_sin_addr.s_addr = addr;
2338275SEric Cheng
2348275SEric Cheng /*
2358275SEric Cheng * This is basically inlining a simpler version of ire_match_args
2368275SEric Cheng */
2378275SEric Cheng RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
2388275SEric Cheng
2398275SEric Cheng rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
2408275SEric Cheng ipst->ips_ip_ftable, NULL, NULL);
2418275SEric Cheng
24211042SErik.Nordmark@Sun.COM if (rt == NULL)
24311042SErik.Nordmark@Sun.COM goto bad;
24411042SErik.Nordmark@Sun.COM
24511042SErik.Nordmark@Sun.COM irb = &rt->rt_irb;
24611042SErik.Nordmark@Sun.COM if (irb->irb_ire_cnt == 0)
24711042SErik.Nordmark@Sun.COM goto bad;
24811042SErik.Nordmark@Sun.COM
24911042SErik.Nordmark@Sun.COM rw_enter(&irb->irb_lock, RW_READER);
25011042SErik.Nordmark@Sun.COM ire = irb->irb_ire;
25111042SErik.Nordmark@Sun.COM if (ire == NULL) {
25211042SErik.Nordmark@Sun.COM rw_exit(&irb->irb_lock);
25311042SErik.Nordmark@Sun.COM goto bad;
2548275SEric Cheng }
25511042SErik.Nordmark@Sun.COM while (IRE_IS_CONDEMNED(ire)) {
25611042SErik.Nordmark@Sun.COM ire = ire->ire_next;
25711042SErik.Nordmark@Sun.COM if (ire == NULL) {
25811042SErik.Nordmark@Sun.COM rw_exit(&irb->irb_lock);
25911042SErik.Nordmark@Sun.COM goto bad;
26011042SErik.Nordmark@Sun.COM }
2618275SEric Cheng }
2628275SEric Cheng
2638275SEric Cheng /* we have a ire that matches */
26411042SErik.Nordmark@Sun.COM ire_refhold(ire);
26511042SErik.Nordmark@Sun.COM rw_exit(&irb->irb_lock);
2668275SEric Cheng
2678275SEric Cheng /*
26811042SErik.Nordmark@Sun.COM * round-robin only if we have more than one route in the bucket.
26911042SErik.Nordmark@Sun.COM * ips_ip_ecmp_behavior controls when we do ECMP
27011042SErik.Nordmark@Sun.COM * 2: always
27111042SErik.Nordmark@Sun.COM * 1: for IRE_DEFAULT and /0 IRE_INTERFACE
27211042SErik.Nordmark@Sun.COM * 0: never
2738275SEric Cheng *
27411042SErik.Nordmark@Sun.COM * Note: if we found an IRE_IF_CLONE we won't look at the bucket with
27511042SErik.Nordmark@Sun.COM * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match
27611042SErik.Nordmark@Sun.COM * and the IRE_INTERFACESs are likely to be shorter matches.
2778275SEric Cheng */
27811042SErik.Nordmark@Sun.COM if (ire->ire_bucket->irb_ire_cnt > 1) {
27911042SErik.Nordmark@Sun.COM if (ipst->ips_ip_ecmp_behavior == 2 ||
28011042SErik.Nordmark@Sun.COM (ipst->ips_ip_ecmp_behavior == 1 &&
28111042SErik.Nordmark@Sun.COM IS_DEFAULT_ROUTE(ire))) {
28211042SErik.Nordmark@Sun.COM ire_t *next_ire;
28311042SErik.Nordmark@Sun.COM ire_ftable_args_t margs;
2848275SEric Cheng
28511131SErik.Nordmark@Sun.COM bzero(&margs, sizeof (margs));
28611042SErik.Nordmark@Sun.COM margs.ift_addr = addr;
28711042SErik.Nordmark@Sun.COM margs.ift_zoneid = ALL_ZONES;
28811042SErik.Nordmark@Sun.COM
28911042SErik.Nordmark@Sun.COM next_ire = ire_round_robin(ire->ire_bucket, &margs,
29011042SErik.Nordmark@Sun.COM xmit_hint, ire, ipst);
29111042SErik.Nordmark@Sun.COM if (next_ire == NULL) {
29211042SErik.Nordmark@Sun.COM /* keep ire if next_ire is null */
29311042SErik.Nordmark@Sun.COM if (generationp != NULL)
29411042SErik.Nordmark@Sun.COM *generationp = ire->ire_generation;
29511042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
29611042SErik.Nordmark@Sun.COM return (ire);
29711042SErik.Nordmark@Sun.COM }
29811042SErik.Nordmark@Sun.COM ire_refrele(ire);
29911042SErik.Nordmark@Sun.COM ire = next_ire;
3008275SEric Cheng }
3018275SEric Cheng }
30211042SErik.Nordmark@Sun.COM /* Return generation before dropping lock */
30311042SErik.Nordmark@Sun.COM if (generationp != NULL)
30411042SErik.Nordmark@Sun.COM *generationp = ire->ire_generation;
30511042SErik.Nordmark@Sun.COM
30611042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
3078275SEric Cheng
30811042SErik.Nordmark@Sun.COM /*
30911042SErik.Nordmark@Sun.COM * Since we only did ALL_ZONES matches there is no special handling
31011042SErik.Nordmark@Sun.COM * of IRE_LOCALs needed here. ire_ftable_lookup_v4 has to handle that.
31111042SErik.Nordmark@Sun.COM */
3128275SEric Cheng return (ire);
31311042SErik.Nordmark@Sun.COM
31411042SErik.Nordmark@Sun.COM bad:
31511042SErik.Nordmark@Sun.COM if (generationp != NULL)
31611042SErik.Nordmark@Sun.COM *generationp = IRE_GENERATION_VERIFY;
31711042SErik.Nordmark@Sun.COM
31811042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
31911042SErik.Nordmark@Sun.COM return (ire_reject(ipst, B_FALSE));
3208275SEric Cheng }
3212535Ssangeeta
3222535Ssangeeta /*
32311042SErik.Nordmark@Sun.COM * Find the ill matching a multicast group.
3242535Ssangeeta * Allows different routes for multicast addresses
3252535Ssangeeta * in the unicast routing table (akin to 224.0.0.0 but could be more specific)
3262535Ssangeeta * which point at different interfaces. This is used when IP_MULTICAST_IF
3272535Ssangeeta * isn't specified (when sending) and when IP_ADD_MEMBERSHIP doesn't
3282535Ssangeeta * specify the interface to join on.
3292535Ssangeeta *
33011042SErik.Nordmark@Sun.COM * Supports link-local addresses by using ire_route_recursive which follows
33111042SErik.Nordmark@Sun.COM * the ill when recursing.
33211042SErik.Nordmark@Sun.COM *
33311042SErik.Nordmark@Sun.COM * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group
33411042SErik.Nordmark@Sun.COM * and the MULTIRT property can be different for different groups, we
33511042SErik.Nordmark@Sun.COM * extract RTF_MULTIRT from the special unicast route added for a group
33611042SErik.Nordmark@Sun.COM * with CGTP and pass that back in the multirtp argument.
33711042SErik.Nordmark@Sun.COM * This is used in ip_set_destination etc to set ixa_postfragfn for multicast.
33811042SErik.Nordmark@Sun.COM * We have a setsrcp argument for the same reason.
3392535Ssangeeta */
34011042SErik.Nordmark@Sun.COM ill_t *
ire_lookup_multi_ill_v4(ipaddr_t group,zoneid_t zoneid,ip_stack_t * ipst,boolean_t * multirtp,ipaddr_t * setsrcp)34111042SErik.Nordmark@Sun.COM ire_lookup_multi_ill_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst,
34211042SErik.Nordmark@Sun.COM boolean_t *multirtp, ipaddr_t *setsrcp)
3432535Ssangeeta {
3442535Ssangeeta ire_t *ire;
34511042SErik.Nordmark@Sun.COM ill_t *ill;
3462535Ssangeeta
34711042SErik.Nordmark@Sun.COM ire = ire_route_recursive_v4(group, 0, NULL, zoneid, NULL,
34811457SErik.Nordmark@Sun.COM MATCH_IRE_DSTONLY, IRR_NONE, 0, ipst, setsrcp, NULL, NULL);
34911042SErik.Nordmark@Sun.COM ASSERT(ire != NULL);
35011042SErik.Nordmark@Sun.COM if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
3512535Ssangeeta ire_refrele(ire);
3522535Ssangeeta return (NULL);
3532535Ssangeeta }
35411042SErik.Nordmark@Sun.COM
35511042SErik.Nordmark@Sun.COM if (multirtp != NULL)
35611042SErik.Nordmark@Sun.COM *multirtp = (ire->ire_flags & RTF_MULTIRT) != 0;
35711042SErik.Nordmark@Sun.COM
35811042SErik.Nordmark@Sun.COM ill = ire_nexthop_ill(ire);
35911042SErik.Nordmark@Sun.COM ire_refrele(ire);
36011042SErik.Nordmark@Sun.COM return (ill);
3612535Ssangeeta }
3622535Ssangeeta
3632535Ssangeeta /*
3642535Ssangeeta * Delete the passed in ire if the gateway addr matches
3652535Ssangeeta */
3662535Ssangeeta void
ire_del_host_redir(ire_t * ire,char * gateway)3672535Ssangeeta ire_del_host_redir(ire_t *ire, char *gateway)
3682535Ssangeeta {
3693004Sdd193516 if ((ire->ire_flags & RTF_DYNAMIC) &&
3702535Ssangeeta (ire->ire_gateway_addr == *(ipaddr_t *)gateway))
3712535Ssangeeta ire_delete(ire);
3722535Ssangeeta }
3732535Ssangeeta
3742535Ssangeeta /*
37511042SErik.Nordmark@Sun.COM * Search for all IRE_HOST RTF_DYNAMIC (aka redirect) routes that are
3762535Ssangeeta * pointing at the specified gateway and
3772535Ssangeeta * delete them. This routine is called only
3782535Ssangeeta * when a default gateway is going away.
3792535Ssangeeta */
3802535Ssangeeta void
ire_delete_host_redirects(ipaddr_t gateway,ip_stack_t * ipst)3813448Sdh155122 ire_delete_host_redirects(ipaddr_t gateway, ip_stack_t *ipst)
3822535Ssangeeta {
3832535Ssangeeta struct rtfuncarg rtfarg;
3842535Ssangeeta
38511131SErik.Nordmark@Sun.COM bzero(&rtfarg, sizeof (rtfarg));
3862535Ssangeeta rtfarg.rt_func = ire_del_host_redir;
3872535Ssangeeta rtfarg.rt_arg = (void *)&gateway;
38811131SErik.Nordmark@Sun.COM rtfarg.rt_zoneid = ALL_ZONES;
38911131SErik.Nordmark@Sun.COM rtfarg.rt_ipst = ipst;
3903448Sdh155122 (void) ipst->ips_ip_ftable->rnh_walktree_mt(ipst->ips_ip_ftable,
3913448Sdh155122 rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn);
3922535Ssangeeta }
3932535Ssangeeta
3942535Ssangeeta /*
3953448Sdh155122 * Obtain the rt_entry and rt_irb for the route to be added to
3963448Sdh155122 * the ips_ip_ftable.
3972535Ssangeeta * First attempt to add a node to the radix tree via rn_addroute. If the
3982535Ssangeeta * route already exists, return the bucket for the existing route.
3992535Ssangeeta *
4002535Ssangeeta * Locking notes: Need to hold the global radix tree lock in write mode to
4012535Ssangeeta * add a radix node. To prevent the node from being deleted, ire_get_bucket()
4022535Ssangeeta * returns with a ref'ed irb_t. The ire itself is added in ire_add_v4()
4032535Ssangeeta * while holding the irb_lock, but not the radix tree lock.
4042535Ssangeeta */
4052535Ssangeeta irb_t *
ire_get_bucket(ire_t * ire)4062535Ssangeeta ire_get_bucket(ire_t *ire)
4072535Ssangeeta {
4082535Ssangeeta struct radix_node *rn;
4092535Ssangeeta struct rt_entry *rt;
4102535Ssangeeta struct rt_sockaddr rmask, rdst;
4112535Ssangeeta irb_t *irb = NULL;
4123448Sdh155122 ip_stack_t *ipst = ire->ire_ipst;
4132535Ssangeeta
4143448Sdh155122 ASSERT(ipst->ips_ip_ftable != NULL);
4152535Ssangeeta
4162535Ssangeeta /* first try to see if route exists (based on rtalloc1) */
41711131SErik.Nordmark@Sun.COM bzero(&rdst, sizeof (rdst));
4182535Ssangeeta rdst.rt_sin_len = sizeof (rdst);
4192535Ssangeeta rdst.rt_sin_family = AF_INET;
4202535Ssangeeta rdst.rt_sin_addr.s_addr = ire->ire_addr;
4212535Ssangeeta
42211131SErik.Nordmark@Sun.COM bzero(&rmask, sizeof (rmask));
4232535Ssangeeta rmask.rt_sin_len = sizeof (rmask);
4242535Ssangeeta rmask.rt_sin_family = AF_INET;
4252535Ssangeeta rmask.rt_sin_addr.s_addr = ire->ire_mask;
4262535Ssangeeta
4272535Ssangeeta /*
4282535Ssangeeta * add the route. based on BSD's rtrequest1(RTM_ADD)
4292535Ssangeeta */
4302535Ssangeeta R_Malloc(rt, rt_entry_cache, sizeof (*rt));
4315090Ssangeeta /* kmem_alloc failed */
4325090Ssangeeta if (rt == NULL)
4335090Ssangeeta return (NULL);
4345090Ssangeeta
43511131SErik.Nordmark@Sun.COM bzero(rt, sizeof (*rt));
4362535Ssangeeta rt->rt_nodes->rn_key = (char *)&rt->rt_dst;
4372535Ssangeeta rt->rt_dst = rdst;
4382535Ssangeeta irb = &rt->rt_irb;
43911042SErik.Nordmark@Sun.COM irb->irb_marks |= IRB_MARK_DYNAMIC; /* dynamically allocated/freed */
4403448Sdh155122 irb->irb_ipst = ipst;
4412535Ssangeeta rw_init(&irb->irb_lock, NULL, RW_DEFAULT, NULL);
4423448Sdh155122 RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable);
4433448Sdh155122 rn = ipst->ips_ip_ftable->rnh_addaddr(&rt->rt_dst, &rmask,
4443448Sdh155122 ipst->ips_ip_ftable, (struct radix_node *)rt);
4452535Ssangeeta if (rn == NULL) {
4463448Sdh155122 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
4472535Ssangeeta Free(rt, rt_entry_cache);
4482535Ssangeeta rt = NULL;
4492535Ssangeeta irb = NULL;
4503448Sdh155122 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
4513448Sdh155122 rn = ipst->ips_ip_ftable->rnh_lookup(&rdst, &rmask,
4523448Sdh155122 ipst->ips_ip_ftable);
4533448Sdh155122 if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
4542535Ssangeeta /* found a non-root match */
4552535Ssangeeta rt = (struct rt_entry *)rn;
4562535Ssangeeta }
4572535Ssangeeta }
4582535Ssangeeta if (rt != NULL) {
4592535Ssangeeta irb = &rt->rt_irb;
46011042SErik.Nordmark@Sun.COM irb_refhold(irb);
4612535Ssangeeta }
4623448Sdh155122 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
4632535Ssangeeta return (irb);
4642535Ssangeeta }
4652535Ssangeeta
4662535Ssangeeta /*
4672535Ssangeeta * This function is used when the caller wants to know the outbound
4682535Ssangeeta * interface for a packet given only the address.
4692535Ssangeeta * If this is a offlink IP address and there are multiple
4702535Ssangeeta * routes to this destination, this routine will utilise the
4712535Ssangeeta * first route it finds to IP address
4722535Ssangeeta * Return values:
4732535Ssangeeta * 0 - FAILURE
4742535Ssangeeta * nonzero - ifindex
4752535Ssangeeta */
4762535Ssangeeta uint_t
ifindex_lookup(const struct sockaddr * ipaddr,zoneid_t zoneid)4772535Ssangeeta ifindex_lookup(const struct sockaddr *ipaddr, zoneid_t zoneid)
4782535Ssangeeta {
4792535Ssangeeta uint_t ifindex = 0;
4802535Ssangeeta ire_t *ire;
4812535Ssangeeta ill_t *ill;
4823448Sdh155122 netstack_t *ns;
4833448Sdh155122 ip_stack_t *ipst;
4842535Ssangeeta
4853448Sdh155122 if (zoneid == ALL_ZONES)
4863448Sdh155122 ns = netstack_find_by_zoneid(GLOBAL_ZONEID);
4873448Sdh155122 else
4883448Sdh155122 ns = netstack_find_by_zoneid(zoneid);
4893448Sdh155122 ASSERT(ns != NULL);
4903448Sdh155122
4913448Sdh155122 /*
4923448Sdh155122 * For exclusive stacks we set the zoneid to zero
4933448Sdh155122 * since IP uses the global zoneid in the exclusive stacks.
4943448Sdh155122 */
4953448Sdh155122 if (ns->netstack_stackid != GLOBAL_NETSTACKID)
4963448Sdh155122 zoneid = GLOBAL_ZONEID;
4973448Sdh155122 ipst = ns->netstack_ip;
4982535Ssangeeta
4992535Ssangeeta ASSERT(ipaddr->sa_family == AF_INET || ipaddr->sa_family == AF_INET6);
5002535Ssangeeta
50111042SErik.Nordmark@Sun.COM if ((ire = route_to_dst(ipaddr, zoneid, ipst)) != NULL) {
50211042SErik.Nordmark@Sun.COM ill = ire_nexthop_ill(ire);
50311042SErik.Nordmark@Sun.COM if (ill != NULL) {
5042535Ssangeeta ifindex = ill->ill_phyint->phyint_ifindex;
50511042SErik.Nordmark@Sun.COM ill_refrele(ill);
50611042SErik.Nordmark@Sun.COM }
5072535Ssangeeta ire_refrele(ire);
5082535Ssangeeta }
5093448Sdh155122 netstack_rele(ns);
5102535Ssangeeta return (ifindex);
5112535Ssangeeta }
5122535Ssangeeta
5132535Ssangeeta /*
5142535Ssangeeta * Routine to find the route to a destination. If a ifindex is supplied
51511042SErik.Nordmark@Sun.COM * it tries to match the route to the corresponding ipif for the ifindex
5162535Ssangeeta */
5172535Ssangeeta static ire_t *
route_to_dst(const struct sockaddr * dst_addr,zoneid_t zoneid,ip_stack_t * ipst)5183448Sdh155122 route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst)
5192535Ssangeeta {
5202535Ssangeeta ire_t *ire = NULL;
5212535Ssangeeta int match_flags;
5222535Ssangeeta
52311042SErik.Nordmark@Sun.COM match_flags = MATCH_IRE_DSTONLY;
5242535Ssangeeta
5252535Ssangeeta /* XXX pass NULL tsl for now */
5262535Ssangeeta
5272535Ssangeeta if (dst_addr->sa_family == AF_INET) {
52811042SErik.Nordmark@Sun.COM ire = ire_route_recursive_v4(
52911042SErik.Nordmark@Sun.COM ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr, 0, NULL,
53011457SErik.Nordmark@Sun.COM zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL,
53111457SErik.Nordmark@Sun.COM NULL, NULL);
5322535Ssangeeta } else {
53311042SErik.Nordmark@Sun.COM ire = ire_route_recursive_v6(
53411042SErik.Nordmark@Sun.COM &((struct sockaddr_in6 *)dst_addr)->sin6_addr, 0, NULL,
53511457SErik.Nordmark@Sun.COM zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL,
53611457SErik.Nordmark@Sun.COM NULL, NULL);
53711042SErik.Nordmark@Sun.COM }
53811042SErik.Nordmark@Sun.COM ASSERT(ire != NULL);
53911042SErik.Nordmark@Sun.COM if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
54011042SErik.Nordmark@Sun.COM ire_refrele(ire);
54111042SErik.Nordmark@Sun.COM return (NULL);
5422535Ssangeeta }
5432535Ssangeeta return (ire);
5442535Ssangeeta }
5452535Ssangeeta
5462535Ssangeeta /*
5472535Ssangeeta * This routine is called by IP Filter to send a packet out on the wire
54811042SErik.Nordmark@Sun.COM * to a specified dstination (which may be onlink or offlink). The ifindex may
54911042SErik.Nordmark@Sun.COM * or may not be 0. A non-null ifindex indicates IP Filter has stipulated
5502535Ssangeeta * an outgoing interface and requires the nexthop to be on that interface.
5514482Sdr146992 * IP WILL NOT DO the following to the data packet before sending it out:
5522535Ssangeeta * a. manipulate ttl
5534482Sdr146992 * b. ipsec work
5544482Sdr146992 * c. fragmentation
5554482Sdr146992 *
5564482Sdr146992 * If the packet has been prepared for hardware checksum then it will be
5574482Sdr146992 * passed off to ip_send_align_cksum() to check that the flags set on the
5584482Sdr146992 * packet are in alignment with the capabilities of the new outgoing NIC.
5592535Ssangeeta *
5602535Ssangeeta * Return values:
5612535Ssangeeta * 0: IP was able to send of the data pkt
5622535Ssangeeta * ECOMM: Could not send packet
5632535Ssangeeta * ENONET No route to dst. It is up to the caller
5642535Ssangeeta * to send icmp unreachable error message,
5652535Ssangeeta * EINPROGRESS The macaddr of the onlink dst or that
5662535Ssangeeta * of the offlink dst's nexthop needs to get
5672535Ssangeeta * resolved before packet can be sent to dst.
5682535Ssangeeta * Thus transmission is not guaranteed.
56911042SErik.Nordmark@Sun.COM * Note: No longer have visibility to the ARP queue
57011042SErik.Nordmark@Sun.COM * hence no EINPROGRESS.
5712535Ssangeeta */
5722535Ssangeeta int
ipfil_sendpkt(const struct sockaddr * dst_addr,mblk_t * mp,uint_t ifindex,zoneid_t zoneid)5732535Ssangeeta ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex,
5742535Ssangeeta zoneid_t zoneid)
5752535Ssangeeta {
57611042SErik.Nordmark@Sun.COM ipaddr_t nexthop;
5773448Sdh155122 netstack_t *ns;
5783448Sdh155122 ip_stack_t *ipst;
57911042SErik.Nordmark@Sun.COM ip_xmit_attr_t ixas;
58011042SErik.Nordmark@Sun.COM int error;
5812535Ssangeeta
5822535Ssangeeta ASSERT(mp != NULL);
5832535Ssangeeta
5843448Sdh155122 if (zoneid == ALL_ZONES)
5853448Sdh155122 ns = netstack_find_by_zoneid(GLOBAL_ZONEID);
5863448Sdh155122 else
5873448Sdh155122 ns = netstack_find_by_zoneid(zoneid);
5883448Sdh155122 ASSERT(ns != NULL);
5893448Sdh155122
5903448Sdh155122 /*
5913448Sdh155122 * For exclusive stacks we set the zoneid to zero
5923448Sdh155122 * since IP uses the global zoneid in the exclusive stacks.
5933448Sdh155122 */
5943448Sdh155122 if (ns->netstack_stackid != GLOBAL_NETSTACKID)
5953448Sdh155122 zoneid = GLOBAL_ZONEID;
5963448Sdh155122 ipst = ns->netstack_ip;
5973448Sdh155122
5982535Ssangeeta ASSERT(dst_addr->sa_family == AF_INET ||
5992535Ssangeeta dst_addr->sa_family == AF_INET6);
6002535Ssangeeta
60111042SErik.Nordmark@Sun.COM bzero(&ixas, sizeof (ixas));
6022535Ssangeeta /*
60311042SErik.Nordmark@Sun.COM * No IPsec, no fragmentation, and don't let any hooks see
60411042SErik.Nordmark@Sun.COM * the packet.
6052535Ssangeeta */
60611042SErik.Nordmark@Sun.COM ixas.ixa_flags = IXAF_NO_IPSEC | IXAF_DONTFRAG | IXAF_NO_PFHOOK;
60711042SErik.Nordmark@Sun.COM ixas.ixa_cred = kcred;
60811042SErik.Nordmark@Sun.COM ixas.ixa_cpid = NOPID;
60911042SErik.Nordmark@Sun.COM ixas.ixa_tsl = NULL;
61011042SErik.Nordmark@Sun.COM ixas.ixa_ipst = ipst;
61111042SErik.Nordmark@Sun.COM ixas.ixa_ifindex = ifindex;
6122535Ssangeeta
61311042SErik.Nordmark@Sun.COM if (dst_addr->sa_family == AF_INET) {
61411042SErik.Nordmark@Sun.COM ipha_t *ipha = (ipha_t *)mp->b_rptr;
6154482Sdr146992
61611042SErik.Nordmark@Sun.COM ixas.ixa_flags |= IXAF_IS_IPV4;
61711042SErik.Nordmark@Sun.COM nexthop = ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr;
61811042SErik.Nordmark@Sun.COM if (nexthop != ipha->ipha_dst) {
61911042SErik.Nordmark@Sun.COM ixas.ixa_flags |= IXAF_NEXTHOP_SET;
62011042SErik.Nordmark@Sun.COM ixas.ixa_nexthop_v4 = nexthop;
6212535Ssangeeta }
62211042SErik.Nordmark@Sun.COM ixas.ixa_multicast_ttl = ipha->ipha_ttl;
62311042SErik.Nordmark@Sun.COM } else {
62411042SErik.Nordmark@Sun.COM ip6_t *ip6h = (ip6_t *)mp->b_rptr;
62511042SErik.Nordmark@Sun.COM in6_addr_t *nexthop6;
62611042SErik.Nordmark@Sun.COM
62711042SErik.Nordmark@Sun.COM nexthop6 = &((struct sockaddr_in6 *)dst_addr)->sin6_addr;
62811042SErik.Nordmark@Sun.COM if (!IN6_ARE_ADDR_EQUAL(nexthop6, &ip6h->ip6_dst)) {
62911042SErik.Nordmark@Sun.COM ixas.ixa_flags |= IXAF_NEXTHOP_SET;
63011042SErik.Nordmark@Sun.COM ixas.ixa_nexthop_v6 = *nexthop6;
63111042SErik.Nordmark@Sun.COM }
63211042SErik.Nordmark@Sun.COM ixas.ixa_multicast_ttl = ip6h->ip6_hops;
63311042SErik.Nordmark@Sun.COM }
63411042SErik.Nordmark@Sun.COM error = ip_output_simple(mp, &ixas);
63511042SErik.Nordmark@Sun.COM ixa_cleanup(&ixas);
63611042SErik.Nordmark@Sun.COM
63711042SErik.Nordmark@Sun.COM netstack_rele(ns);
63811042SErik.Nordmark@Sun.COM switch (error) {
63911042SErik.Nordmark@Sun.COM case 0:
6402535Ssangeeta break;
64111042SErik.Nordmark@Sun.COM
64211042SErik.Nordmark@Sun.COM case EHOSTUNREACH:
64311042SErik.Nordmark@Sun.COM case ENETUNREACH:
64411042SErik.Nordmark@Sun.COM error = ENONET;
64511042SErik.Nordmark@Sun.COM break;
64611042SErik.Nordmark@Sun.COM
64711042SErik.Nordmark@Sun.COM default:
64811042SErik.Nordmark@Sun.COM error = ECOMM;
6492535Ssangeeta break;
6502535Ssangeeta }
65111042SErik.Nordmark@Sun.COM return (error);
6524482Sdr146992 }
6534482Sdr146992
6542535Ssangeeta /*
6552535Ssangeeta * callback function provided by ire_ftable_lookup when calling
6562535Ssangeeta * rn_match_args(). Invoke ire_match_args on each matching leaf node in
6572535Ssangeeta * the radix tree.
6582535Ssangeeta */
6592535Ssangeeta boolean_t
ire_find_best_route(struct radix_node * rn,void * arg)6602535Ssangeeta ire_find_best_route(struct radix_node *rn, void *arg)
6612535Ssangeeta {
6622535Ssangeeta struct rt_entry *rt = (struct rt_entry *)rn;
6632535Ssangeeta irb_t *irb_ptr;
6642535Ssangeeta ire_t *ire;
6652535Ssangeeta ire_ftable_args_t *margs = arg;
6662535Ssangeeta ipaddr_t match_mask;
6672535Ssangeeta
6682535Ssangeeta ASSERT(rt != NULL);
6692535Ssangeeta
6702535Ssangeeta irb_ptr = &rt->rt_irb;
6712535Ssangeeta
6722535Ssangeeta if (irb_ptr->irb_ire_cnt == 0)
6732535Ssangeeta return (B_FALSE);
6742535Ssangeeta
6752535Ssangeeta rw_enter(&irb_ptr->irb_lock, RW_READER);
6762535Ssangeeta for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
67711042SErik.Nordmark@Sun.COM if (IRE_IS_CONDEMNED(ire))
6782535Ssangeeta continue;
67911681SSowmini.Varadhan@Sun.COM ASSERT((margs->ift_flags & MATCH_IRE_SHORTERMASK) == 0);
68011681SSowmini.Varadhan@Sun.COM if (margs->ift_flags & MATCH_IRE_MASK)
6812535Ssangeeta match_mask = margs->ift_mask;
6822535Ssangeeta else
6832535Ssangeeta match_mask = ire->ire_mask;
6842535Ssangeeta
6852535Ssangeeta if (ire_match_args(ire, margs->ift_addr, match_mask,
68611042SErik.Nordmark@Sun.COM margs->ift_gateway, margs->ift_type, margs->ift_ill,
68711042SErik.Nordmark@Sun.COM margs->ift_zoneid, margs->ift_tsl,
68811042SErik.Nordmark@Sun.COM margs->ift_flags)) {
68911042SErik.Nordmark@Sun.COM ire_refhold(ire);
6902535Ssangeeta rw_exit(&irb_ptr->irb_lock);
6912535Ssangeeta margs->ift_best_ire = ire;
6922535Ssangeeta return (B_TRUE);
6932535Ssangeeta }
6942535Ssangeeta }
6952535Ssangeeta rw_exit(&irb_ptr->irb_lock);
6962535Ssangeeta return (B_FALSE);
6972535Ssangeeta }
6982535Ssangeeta
6992535Ssangeeta /*
7002535Ssangeeta * ftable irb_t structures are dynamically allocated, and we need to
7012535Ssangeeta * check if the irb_t (and associated ftable tree attachment) needs to
7022535Ssangeeta * be cleaned up when the irb_refcnt goes to 0. The conditions that need
7032535Ssangeeta * be verified are:
7042535Ssangeeta * - no other walkers of the irebucket, i.e., quiescent irb_refcnt,
7052535Ssangeeta * - no other threads holding references to ire's in the bucket,
7062535Ssangeeta * i.e., irb_nire == 0
7072535Ssangeeta * - no active ire's in the bucket, i.e., irb_ire_cnt == 0
7082535Ssangeeta * - need to hold the global tree lock and irb_lock in write mode.
7092535Ssangeeta */
7102535Ssangeeta void
irb_refrele_ftable(irb_t * irb)7112535Ssangeeta irb_refrele_ftable(irb_t *irb)
7122535Ssangeeta {
7132535Ssangeeta for (;;) {
7142535Ssangeeta rw_enter(&irb->irb_lock, RW_WRITER);
7152535Ssangeeta ASSERT(irb->irb_refcnt != 0);
7162535Ssangeeta if (irb->irb_refcnt != 1) {
7172535Ssangeeta /*
7182535Ssangeeta * Someone has a reference to this radix node
7192535Ssangeeta * or there is some bucket walker.
7202535Ssangeeta */
7212535Ssangeeta irb->irb_refcnt--;
7222535Ssangeeta rw_exit(&irb->irb_lock);
7232535Ssangeeta return;
7242535Ssangeeta } else {
7252535Ssangeeta /*
7262535Ssangeeta * There is no other walker, nor is there any
7272535Ssangeeta * other thread that holds a direct ref to this
7282535Ssangeeta * radix node. Do the clean up if needed. Call
7292535Ssangeeta * to ire_unlink will clear the IRB_MARK_CONDEMNED flag
7302535Ssangeeta */
7312535Ssangeeta if (irb->irb_marks & IRB_MARK_CONDEMNED) {
7322535Ssangeeta ire_t *ire_list;
7332535Ssangeeta
7342535Ssangeeta ire_list = ire_unlink(irb);
7352535Ssangeeta rw_exit(&irb->irb_lock);
7362535Ssangeeta
7372535Ssangeeta if (ire_list != NULL)
7382535Ssangeeta ire_cleanup(ire_list);
7392535Ssangeeta /*
7402535Ssangeeta * more CONDEMNED entries could have
7412535Ssangeeta * been added while we dropped the lock,
7422535Ssangeeta * so we have to re-check.
7432535Ssangeeta */
7442535Ssangeeta continue;
7452535Ssangeeta }
7462535Ssangeeta
7472535Ssangeeta /*
7482535Ssangeeta * Now check if there are still any ires
7492535Ssangeeta * associated with this radix node.
7502535Ssangeeta */
7512535Ssangeeta if (irb->irb_nire != 0) {
7522535Ssangeeta /*
7532535Ssangeeta * someone is still holding on
7542535Ssangeeta * to ires in this bucket
7552535Ssangeeta */
7562535Ssangeeta irb->irb_refcnt--;
7572535Ssangeeta rw_exit(&irb->irb_lock);
7582535Ssangeeta return;
7592535Ssangeeta } else {
7602535Ssangeeta /*
7612535Ssangeeta * Everything is clear. Zero walkers,
7622535Ssangeeta * Zero threads with a ref to this
7632535Ssangeeta * radix node, Zero ires associated with
7642535Ssangeeta * this radix node. Due to lock order,
7652535Ssangeeta * check the above conditions again
7662535Ssangeeta * after grabbing all locks in the right order
7672535Ssangeeta */
7682535Ssangeeta rw_exit(&irb->irb_lock);
7692535Ssangeeta if (irb_inactive(irb))
7702535Ssangeeta return;
7712535Ssangeeta /*
7722535Ssangeeta * irb_inactive could not free the irb.
7732535Ssangeeta * See if there are any walkers, if not
7742535Ssangeeta * try to clean up again.
7752535Ssangeeta */
7762535Ssangeeta }
7772535Ssangeeta }
7782535Ssangeeta }
7792535Ssangeeta }
7802535Ssangeeta
7812535Ssangeeta /*
78211042SErik.Nordmark@Sun.COM * IRE iterator used by ire_ftable_lookup to process multiple equal
78311042SErik.Nordmark@Sun.COM * routes. Given a starting point in the hash list (hash), walk the IREs
78411042SErik.Nordmark@Sun.COM * in the bucket skipping deleted entries. We treat the bucket as a circular
78511042SErik.Nordmark@Sun.COM * list for the purposes of walking it.
78611042SErik.Nordmark@Sun.COM * Returns the IRE (held) that corresponds to the hash value. If that IRE is
78711042SErik.Nordmark@Sun.COM * not applicable (ire_match_args failed) then it returns a subsequent one.
78811042SErik.Nordmark@Sun.COM * If we fail to find an IRE we return NULL.
78911042SErik.Nordmark@Sun.COM *
79011042SErik.Nordmark@Sun.COM * Assumes that the caller holds a reference on the IRE bucket and a read lock
79111042SErik.Nordmark@Sun.COM * on the radix_node_head (for IPv4) or the ip6_ire_head (for IPv6).
7922535Ssangeeta *
79311042SErik.Nordmark@Sun.COM * Applies to IPv4 and IPv6.
79411042SErik.Nordmark@Sun.COM *
79511042SErik.Nordmark@Sun.COM * For CGTP, where an IRE_BROADCAST and IRE_HOST can exist for the same
79611042SErik.Nordmark@Sun.COM * address and bucket, we compare against ire_type for the orig_ire. We also
79711042SErik.Nordmark@Sun.COM * have IRE_BROADCASTs with and without RTF_MULTIRT, with the former being
79811131SErik.Nordmark@Sun.COM * first in the bucket. Thus we compare that RTF_MULTIRT match the orig_ire.
79911042SErik.Nordmark@Sun.COM *
80011042SErik.Nordmark@Sun.COM * Due to shared-IP zones we check that an IRE_OFFLINK has a gateway that is
80111042SErik.Nordmark@Sun.COM * reachable from the zone i.e., that the ire_gateway_addr is in a subnet
80211042SErik.Nordmark@Sun.COM * in which the zone has an IP address. We check this for the global zone
80311042SErik.Nordmark@Sun.COM * even if no shared-IP zones are configured.
8042535Ssangeeta */
8052535Ssangeeta ire_t *
ire_round_robin(irb_t * irb_ptr,ire_ftable_args_t * margs,uint_t hash,ire_t * orig_ire,ip_stack_t * ipst)80611042SErik.Nordmark@Sun.COM ire_round_robin(irb_t *irb_ptr, ire_ftable_args_t *margs, uint_t hash,
80711042SErik.Nordmark@Sun.COM ire_t *orig_ire, ip_stack_t *ipst)
8082535Ssangeeta {
80911042SErik.Nordmark@Sun.COM ire_t *ire, *maybe_ire = NULL;
81011042SErik.Nordmark@Sun.COM uint_t maybe_badcnt;
81111042SErik.Nordmark@Sun.COM uint_t maxwalk;
81211042SErik.Nordmark@Sun.COM
81311042SErik.Nordmark@Sun.COM /* Fold in more bits from the hint/hash */
81411042SErik.Nordmark@Sun.COM hash = hash ^ (hash >> 8) ^ (hash >> 16);
8152535Ssangeeta
8162535Ssangeeta rw_enter(&irb_ptr->irb_lock, RW_WRITER);
81711042SErik.Nordmark@Sun.COM maxwalk = irb_ptr->irb_ire_cnt; /* Excludes condemned */
81811042SErik.Nordmark@Sun.COM hash %= maxwalk;
81911042SErik.Nordmark@Sun.COM irb_refhold_locked(irb_ptr);
8202535Ssangeeta rw_exit(&irb_ptr->irb_lock);
8212535Ssangeeta
8222535Ssangeeta /*
8232535Ssangeeta * Round-robin the routers list looking for a route that
8242535Ssangeeta * matches the passed in parameters.
82511042SErik.Nordmark@Sun.COM * First we skip "hash" number of non-condemned IREs.
82611042SErik.Nordmark@Sun.COM * Then we match the IRE.
82711042SErik.Nordmark@Sun.COM * If we find an ire which has a non-zero ire_badcnt then we remember
82811042SErik.Nordmark@Sun.COM * it and keep on looking for a lower ire_badcnt.
82911042SErik.Nordmark@Sun.COM * If we come to the end of the list we continue (treat the
83011042SErik.Nordmark@Sun.COM * bucket list as a circular list) but we match less than "max"
83111042SErik.Nordmark@Sun.COM * entries.
8322535Ssangeeta */
83311042SErik.Nordmark@Sun.COM ire = irb_ptr->irb_ire;
83411042SErik.Nordmark@Sun.COM while (maxwalk > 0) {
83511042SErik.Nordmark@Sun.COM if (IRE_IS_CONDEMNED(ire))
83611042SErik.Nordmark@Sun.COM goto next_ire_skip;
8372535Ssangeeta
83811042SErik.Nordmark@Sun.COM /* Skip the first "hash" entries to do ECMP */
83911042SErik.Nordmark@Sun.COM if (hash != 0) {
84011042SErik.Nordmark@Sun.COM hash--;
84111042SErik.Nordmark@Sun.COM goto next_ire_skip;
84211042SErik.Nordmark@Sun.COM }
84311042SErik.Nordmark@Sun.COM
84411042SErik.Nordmark@Sun.COM /* See CGTP comment above */
84511042SErik.Nordmark@Sun.COM if (ire->ire_type != orig_ire->ire_type ||
84611131SErik.Nordmark@Sun.COM ((ire->ire_flags ^ orig_ire->ire_flags) & RTF_MULTIRT) != 0)
8472535Ssangeeta goto next_ire;
8482535Ssangeeta
84911042SErik.Nordmark@Sun.COM /*
85011042SErik.Nordmark@Sun.COM * Note: Since IPv6 has hash buckets instead of radix
85111042SErik.Nordmark@Sun.COM * buckers we need to explicitly compare the addresses.
85211042SErik.Nordmark@Sun.COM * That makes this less efficient since we will be called
85311042SErik.Nordmark@Sun.COM * even if there is no alternatives just because the
85411042SErik.Nordmark@Sun.COM * bucket has multiple IREs for different addresses.
85511042SErik.Nordmark@Sun.COM */
85611042SErik.Nordmark@Sun.COM if (ire->ire_ipversion == IPV6_VERSION) {
85711042SErik.Nordmark@Sun.COM if (!IN6_ARE_ADDR_EQUAL(&orig_ire->ire_addr_v6,
85811042SErik.Nordmark@Sun.COM &ire->ire_addr_v6))
85911042SErik.Nordmark@Sun.COM goto next_ire;
86011042SErik.Nordmark@Sun.COM }
86111042SErik.Nordmark@Sun.COM
86211042SErik.Nordmark@Sun.COM /*
86311042SErik.Nordmark@Sun.COM * For some reason find_best_route uses ire_mask. We do
86411042SErik.Nordmark@Sun.COM * the same.
86511042SErik.Nordmark@Sun.COM */
86611042SErik.Nordmark@Sun.COM if (ire->ire_ipversion == IPV4_VERSION ?
86711042SErik.Nordmark@Sun.COM !ire_match_args(ire, margs->ift_addr,
86811042SErik.Nordmark@Sun.COM ire->ire_mask, margs->ift_gateway,
86911042SErik.Nordmark@Sun.COM margs->ift_type, margs->ift_ill, margs->ift_zoneid,
87011042SErik.Nordmark@Sun.COM margs->ift_tsl, margs->ift_flags) :
87111042SErik.Nordmark@Sun.COM !ire_match_args_v6(ire, &margs->ift_addr_v6,
87211042SErik.Nordmark@Sun.COM &ire->ire_mask_v6, &margs->ift_gateway_v6,
87311042SErik.Nordmark@Sun.COM margs->ift_type, margs->ift_ill, margs->ift_zoneid,
87411042SErik.Nordmark@Sun.COM margs->ift_tsl, margs->ift_flags))
8752535Ssangeeta goto next_ire;
8762535Ssangeeta
87711042SErik.Nordmark@Sun.COM if (margs->ift_zoneid != ALL_ZONES &&
87811042SErik.Nordmark@Sun.COM (ire->ire_type & IRE_OFFLINK)) {
8792535Ssangeeta /*
88011042SErik.Nordmark@Sun.COM * When we're in a zone, we're only
88111042SErik.Nordmark@Sun.COM * interested in routers that are
88211042SErik.Nordmark@Sun.COM * reachable through ipifs within our zone.
8832535Ssangeeta */
88411042SErik.Nordmark@Sun.COM if (ire->ire_ipversion == IPV4_VERSION) {
88511042SErik.Nordmark@Sun.COM if (!ire_gateway_ok_zone_v4(
88611042SErik.Nordmark@Sun.COM ire->ire_gateway_addr, margs->ift_zoneid,
88711042SErik.Nordmark@Sun.COM ire->ire_ill, margs->ift_tsl, ipst,
88811042SErik.Nordmark@Sun.COM B_TRUE))
88911042SErik.Nordmark@Sun.COM goto next_ire;
89011042SErik.Nordmark@Sun.COM } else {
89111042SErik.Nordmark@Sun.COM if (!ire_gateway_ok_zone_v6(
89211042SErik.Nordmark@Sun.COM &ire->ire_gateway_addr_v6,
89311042SErik.Nordmark@Sun.COM margs->ift_zoneid, ire->ire_ill,
89411042SErik.Nordmark@Sun.COM margs->ift_tsl, ipst, B_TRUE))
89511042SErik.Nordmark@Sun.COM goto next_ire;
89611042SErik.Nordmark@Sun.COM }
8972535Ssangeeta }
89811042SErik.Nordmark@Sun.COM mutex_enter(&ire->ire_lock);
89911042SErik.Nordmark@Sun.COM /* Look for stale ire_badcnt and clear */
90011042SErik.Nordmark@Sun.COM if (ire->ire_badcnt != 0 &&
90111066Srafael.vanoni@sun.com (TICK_TO_SEC(ddi_get_lbolt64()) - ire->ire_last_badcnt >
90211042SErik.Nordmark@Sun.COM ipst->ips_ip_ire_badcnt_lifetime))
90311042SErik.Nordmark@Sun.COM ire->ire_badcnt = 0;
90411042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock);
9052535Ssangeeta
90611042SErik.Nordmark@Sun.COM if (ire->ire_badcnt == 0) {
90711042SErik.Nordmark@Sun.COM /* We found one with a zero badcnt; done */
90811042SErik.Nordmark@Sun.COM ire_refhold(ire);
90911042SErik.Nordmark@Sun.COM /*
91011042SErik.Nordmark@Sun.COM * Care needed since irb_refrele grabs WLOCK to free
91111042SErik.Nordmark@Sun.COM * the irb_t.
91211042SErik.Nordmark@Sun.COM */
91311042SErik.Nordmark@Sun.COM if (ire->ire_ipversion == IPV4_VERSION) {
91411042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
91511042SErik.Nordmark@Sun.COM irb_refrele(irb_ptr);
91611042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
91711042SErik.Nordmark@Sun.COM } else {
91811042SErik.Nordmark@Sun.COM rw_exit(&ipst->ips_ip6_ire_head_lock);
91911042SErik.Nordmark@Sun.COM irb_refrele(irb_ptr);
92011042SErik.Nordmark@Sun.COM rw_enter(&ipst->ips_ip6_ire_head_lock,
92111042SErik.Nordmark@Sun.COM RW_READER);
92211042SErik.Nordmark@Sun.COM }
9232535Ssangeeta return (ire);
9242535Ssangeeta }
9252535Ssangeeta /*
92611042SErik.Nordmark@Sun.COM * keep looking to see if there is a better (lower
92711042SErik.Nordmark@Sun.COM * badcnt) matching IRE, but save this one as a last resort.
92811042SErik.Nordmark@Sun.COM * If we find a lower badcnt pick that one as the last* resort.
9292535Ssangeeta */
93011042SErik.Nordmark@Sun.COM if (maybe_ire == NULL) {
93111042SErik.Nordmark@Sun.COM maybe_ire = ire;
93211042SErik.Nordmark@Sun.COM maybe_badcnt = ire->ire_badcnt;
93311042SErik.Nordmark@Sun.COM } else if (ire->ire_badcnt < maybe_badcnt) {
93411042SErik.Nordmark@Sun.COM maybe_ire = ire;
93511042SErik.Nordmark@Sun.COM maybe_badcnt = ire->ire_badcnt;
93611042SErik.Nordmark@Sun.COM }
9378485SPeter.Memishian@Sun.COM
9382535Ssangeeta next_ire:
93911042SErik.Nordmark@Sun.COM maxwalk--;
94011042SErik.Nordmark@Sun.COM next_ire_skip:
94111042SErik.Nordmark@Sun.COM ire = ire->ire_next;
94211042SErik.Nordmark@Sun.COM if (ire == NULL)
94311042SErik.Nordmark@Sun.COM ire = irb_ptr->irb_ire;
9442535Ssangeeta }
9452535Ssangeeta if (maybe_ire != NULL)
94611042SErik.Nordmark@Sun.COM ire_refhold(maybe_ire);
94711042SErik.Nordmark@Sun.COM
94811042SErik.Nordmark@Sun.COM /* Care needed since irb_refrele grabs WLOCK to free the irb_t. */
94911042SErik.Nordmark@Sun.COM if (ire->ire_ipversion == IPV4_VERSION) {
95011042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
95111042SErik.Nordmark@Sun.COM irb_refrele(irb_ptr);
95211042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
95311042SErik.Nordmark@Sun.COM } else {
95411042SErik.Nordmark@Sun.COM rw_exit(&ipst->ips_ip6_ire_head_lock);
95511042SErik.Nordmark@Sun.COM irb_refrele(irb_ptr);
95611042SErik.Nordmark@Sun.COM rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
95711042SErik.Nordmark@Sun.COM }
9582535Ssangeeta return (maybe_ire);
9592535Ssangeeta }
9602783Ssowmini
9612783Ssowmini void
irb_refhold_rn(struct radix_node * rn)9622783Ssowmini irb_refhold_rn(struct radix_node *rn)
9632783Ssowmini {
9642783Ssowmini if ((rn->rn_flags & RNF_ROOT) == 0)
96511042SErik.Nordmark@Sun.COM irb_refhold(&((rt_t *)(rn))->rt_irb);
9662783Ssowmini }
9672783Ssowmini
9682783Ssowmini void
irb_refrele_rn(struct radix_node * rn)9692783Ssowmini irb_refrele_rn(struct radix_node *rn)
9702783Ssowmini {
9712783Ssowmini if ((rn->rn_flags & RNF_ROOT) == 0)
9722783Ssowmini irb_refrele_ftable(&((rt_t *)(rn))->rt_irb);
9732783Ssowmini }
97411042SErik.Nordmark@Sun.COM
97511681SSowmini.Varadhan@Sun.COM
97611681SSowmini.Varadhan@Sun.COM /*
97711681SSowmini.Varadhan@Sun.COM * ip_select_src_ill() is used by ip_select_route() to find the src_ill
97811681SSowmini.Varadhan@Sun.COM * to be used for source-aware routing table lookup. This function will
97911681SSowmini.Varadhan@Sun.COM * ignore IPIF_UNNUMBERED interface addresses, and will only return a
98011681SSowmini.Varadhan@Sun.COM * numbered interface (ipif_lookup_addr_nondup() will ignore UNNUMBERED
98111681SSowmini.Varadhan@Sun.COM * interfaces).
98211681SSowmini.Varadhan@Sun.COM */
98311681SSowmini.Varadhan@Sun.COM static ill_t *
ip_select_src_ill(const in6_addr_t * v6src,zoneid_t zoneid,ip_stack_t * ipst)98411681SSowmini.Varadhan@Sun.COM ip_select_src_ill(const in6_addr_t *v6src, zoneid_t zoneid, ip_stack_t *ipst)
98511681SSowmini.Varadhan@Sun.COM {
98611681SSowmini.Varadhan@Sun.COM ipif_t *ipif;
98711681SSowmini.Varadhan@Sun.COM ill_t *ill;
98811681SSowmini.Varadhan@Sun.COM boolean_t isv6 = !IN6_IS_ADDR_V4MAPPED(v6src);
98911681SSowmini.Varadhan@Sun.COM ipaddr_t v4src;
99011681SSowmini.Varadhan@Sun.COM
99111681SSowmini.Varadhan@Sun.COM if (isv6) {
99211681SSowmini.Varadhan@Sun.COM ipif = ipif_lookup_addr_nondup_v6(v6src, NULL, zoneid, ipst);
99311681SSowmini.Varadhan@Sun.COM } else {
99411681SSowmini.Varadhan@Sun.COM IN6_V4MAPPED_TO_IPADDR(v6src, v4src);
99511681SSowmini.Varadhan@Sun.COM ipif = ipif_lookup_addr_nondup(v4src, NULL, zoneid, ipst);
99611681SSowmini.Varadhan@Sun.COM }
99711681SSowmini.Varadhan@Sun.COM if (ipif == NULL)
99811681SSowmini.Varadhan@Sun.COM return (NULL);
99911681SSowmini.Varadhan@Sun.COM ill = ipif->ipif_ill;
100011681SSowmini.Varadhan@Sun.COM ill_refhold(ill);
100111681SSowmini.Varadhan@Sun.COM ipif_refrele(ipif);
100211681SSowmini.Varadhan@Sun.COM return (ill);
100311681SSowmini.Varadhan@Sun.COM }
100411681SSowmini.Varadhan@Sun.COM
100511681SSowmini.Varadhan@Sun.COM /*
100611681SSowmini.Varadhan@Sun.COM * verify that v6src is configured on ill
100711681SSowmini.Varadhan@Sun.COM */
100811681SSowmini.Varadhan@Sun.COM static boolean_t
ip_verify_src_on_ill(const in6_addr_t v6src,ill_t * ill,zoneid_t zoneid)100911681SSowmini.Varadhan@Sun.COM ip_verify_src_on_ill(const in6_addr_t v6src, ill_t *ill, zoneid_t zoneid)
101011681SSowmini.Varadhan@Sun.COM {
101111681SSowmini.Varadhan@Sun.COM ipif_t *ipif;
101211681SSowmini.Varadhan@Sun.COM ip_stack_t *ipst;
101311681SSowmini.Varadhan@Sun.COM ipaddr_t v4src;
101411681SSowmini.Varadhan@Sun.COM
101511681SSowmini.Varadhan@Sun.COM if (ill == NULL)
101611681SSowmini.Varadhan@Sun.COM return (B_FALSE);
101711681SSowmini.Varadhan@Sun.COM ipst = ill->ill_ipst;
101811681SSowmini.Varadhan@Sun.COM
101911681SSowmini.Varadhan@Sun.COM if (ill->ill_isv6) {
102011681SSowmini.Varadhan@Sun.COM ipif = ipif_lookup_addr_nondup_v6(&v6src, ill, zoneid, ipst);
102111681SSowmini.Varadhan@Sun.COM } else {
102211681SSowmini.Varadhan@Sun.COM IN6_V4MAPPED_TO_IPADDR(&v6src, v4src);
102311681SSowmini.Varadhan@Sun.COM ipif = ipif_lookup_addr_nondup(v4src, ill, zoneid, ipst);
102411681SSowmini.Varadhan@Sun.COM }
102511681SSowmini.Varadhan@Sun.COM
102611681SSowmini.Varadhan@Sun.COM if (ipif != NULL) {
102711681SSowmini.Varadhan@Sun.COM ipif_refrele(ipif);
102811681SSowmini.Varadhan@Sun.COM return (B_TRUE);
102911681SSowmini.Varadhan@Sun.COM } else {
103011681SSowmini.Varadhan@Sun.COM return (B_FALSE);
103111681SSowmini.Varadhan@Sun.COM }
103211681SSowmini.Varadhan@Sun.COM }
103311681SSowmini.Varadhan@Sun.COM
103411042SErik.Nordmark@Sun.COM /*
103511042SErik.Nordmark@Sun.COM * Select a route for IPv4 and IPv6. Except for multicast, loopback and reject
103611042SErik.Nordmark@Sun.COM * routes this routine sets up a ire_nce_cache as well. The caller needs to
103711042SErik.Nordmark@Sun.COM * lookup an nce for the multicast case.
103811681SSowmini.Varadhan@Sun.COM *
103911681SSowmini.Varadhan@Sun.COM * When src_multihoming is set to 2 (strict src multihoming) we use the source
104011681SSowmini.Varadhan@Sun.COM * address to select the interface and route. If IP_BOUND_IF etc are
104111681SSowmini.Varadhan@Sun.COM * specified, we require that they specify an interface on which the
104211681SSowmini.Varadhan@Sun.COM * source address is assigned.
104311681SSowmini.Varadhan@Sun.COM *
104411681SSowmini.Varadhan@Sun.COM * When src_multihoming is set to 1 (preferred src aware route
104511681SSowmini.Varadhan@Sun.COM * selection) the unicast lookup prefers a matching source
104611681SSowmini.Varadhan@Sun.COM * (i.e., that the route points out an ill on which the source is assigned), but
104711681SSowmini.Varadhan@Sun.COM * if no such route is found we fallback to not considering the source in the
104811681SSowmini.Varadhan@Sun.COM * route lookup.
104911681SSowmini.Varadhan@Sun.COM *
105011681SSowmini.Varadhan@Sun.COM * We skip the src_multihoming check when the source isn't (yet) set, and
105111681SSowmini.Varadhan@Sun.COM * when IXAF_VERIFY_SOURCE is not set. The latter allows RAW sockets to send
105211681SSowmini.Varadhan@Sun.COM * with bogus source addresses as allowed by IP_HDRINCL and IPV6_PKTINFO
105311681SSowmini.Varadhan@Sun.COM * when secpolicy_net_rawaccess().
105411042SErik.Nordmark@Sun.COM */
105511042SErik.Nordmark@Sun.COM ire_t *
ip_select_route(const in6_addr_t * v6dst,const in6_addr_t v6src,ip_xmit_attr_t * ixa,uint_t * generationp,in6_addr_t * setsrcp,int * errorp,boolean_t * multirtp)105611681SSowmini.Varadhan@Sun.COM ip_select_route(const in6_addr_t *v6dst, const in6_addr_t v6src,
105711681SSowmini.Varadhan@Sun.COM ip_xmit_attr_t *ixa, uint_t *generationp, in6_addr_t *setsrcp,
105811681SSowmini.Varadhan@Sun.COM int *errorp, boolean_t *multirtp)
105911042SErik.Nordmark@Sun.COM {
106011042SErik.Nordmark@Sun.COM uint_t match_args;
106111042SErik.Nordmark@Sun.COM uint_t ire_type;
106211681SSowmini.Varadhan@Sun.COM ill_t *ill = NULL;
106311042SErik.Nordmark@Sun.COM ire_t *ire;
106411042SErik.Nordmark@Sun.COM ip_stack_t *ipst = ixa->ixa_ipst;
106511042SErik.Nordmark@Sun.COM ipaddr_t v4dst;
106611042SErik.Nordmark@Sun.COM in6_addr_t v6nexthop;
106711042SErik.Nordmark@Sun.COM iaflags_t ixaflags = ixa->ixa_flags;
106811042SErik.Nordmark@Sun.COM nce_t *nce;
106911681SSowmini.Varadhan@Sun.COM boolean_t preferred_src_aware = B_FALSE;
107011681SSowmini.Varadhan@Sun.COM boolean_t verify_src;
107111681SSowmini.Varadhan@Sun.COM boolean_t isv6 = !(ixa->ixa_flags & IXAF_IS_IPV4);
107211681SSowmini.Varadhan@Sun.COM int src_multihoming = IP_SRC_MULTIHOMING(isv6, ipst);
107311681SSowmini.Varadhan@Sun.COM
107411681SSowmini.Varadhan@Sun.COM /*
107511681SSowmini.Varadhan@Sun.COM * We only verify that the src has been configured on a selected
107611681SSowmini.Varadhan@Sun.COM * interface if the src is not :: or INADDR_ANY, and if the
107711681SSowmini.Varadhan@Sun.COM * IXAF_VERIFY_SOURCE flag is set.
107811681SSowmini.Varadhan@Sun.COM */
107911681SSowmini.Varadhan@Sun.COM verify_src = (!V6_OR_V4_INADDR_ANY(v6src) &&
108011681SSowmini.Varadhan@Sun.COM (ixa->ixa_flags & IXAF_VERIFY_SOURCE));
108111042SErik.Nordmark@Sun.COM
108211042SErik.Nordmark@Sun.COM match_args = MATCH_IRE_SECATTR;
108311042SErik.Nordmark@Sun.COM IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst);
108411042SErik.Nordmark@Sun.COM if (setsrcp != NULL)
108511042SErik.Nordmark@Sun.COM ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp));
108611042SErik.Nordmark@Sun.COM if (errorp != NULL)
108711042SErik.Nordmark@Sun.COM ASSERT(*errorp == 0);
108811042SErik.Nordmark@Sun.COM
108911042SErik.Nordmark@Sun.COM /*
109011042SErik.Nordmark@Sun.COM * The content of the ixa will be different if IP_NEXTHOP,
109111042SErik.Nordmark@Sun.COM * SO_DONTROUTE, IP_BOUND_IF, IP_PKTINFO etc are set
109211042SErik.Nordmark@Sun.COM */
109311042SErik.Nordmark@Sun.COM
109411681SSowmini.Varadhan@Sun.COM if (isv6 ? IN6_IS_ADDR_MULTICAST(v6dst) : CLASSD(v4dst)) {
109511042SErik.Nordmark@Sun.COM /* Pick up the IRE_MULTICAST for the ill */
109611042SErik.Nordmark@Sun.COM if (ixa->ixa_multicast_ifindex != 0) {
109711042SErik.Nordmark@Sun.COM ill = ill_lookup_on_ifindex(ixa->ixa_multicast_ifindex,
109811681SSowmini.Varadhan@Sun.COM isv6, ipst);
109911042SErik.Nordmark@Sun.COM } else if (ixaflags & IXAF_SCOPEID_SET) {
110011042SErik.Nordmark@Sun.COM /* sin6_scope_id takes precedence over ixa_ifindex */
110111042SErik.Nordmark@Sun.COM ASSERT(ixa->ixa_scopeid != 0);
110211042SErik.Nordmark@Sun.COM ill = ill_lookup_on_ifindex(ixa->ixa_scopeid,
110311681SSowmini.Varadhan@Sun.COM isv6, ipst);
110411042SErik.Nordmark@Sun.COM } else if (ixa->ixa_ifindex != 0) {
110511042SErik.Nordmark@Sun.COM /*
110611042SErik.Nordmark@Sun.COM * In the ipmp case, the ixa_ifindex is set to
110711042SErik.Nordmark@Sun.COM * point at an under_ill and we would return the
110811042SErik.Nordmark@Sun.COM * ire_multicast() corresponding to that under_ill.
110911042SErik.Nordmark@Sun.COM */
111011042SErik.Nordmark@Sun.COM ill = ill_lookup_on_ifindex(ixa->ixa_ifindex,
111111681SSowmini.Varadhan@Sun.COM isv6, ipst);
111211681SSowmini.Varadhan@Sun.COM } else if (src_multihoming != 0 && verify_src) {
111311681SSowmini.Varadhan@Sun.COM /* Look up the ill based on the source address */
111411681SSowmini.Varadhan@Sun.COM ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst);
111511681SSowmini.Varadhan@Sun.COM /*
111611681SSowmini.Varadhan@Sun.COM * Since we looked up the ill from the source there
111711681SSowmini.Varadhan@Sun.COM * is no need to verify that the source is on the ill
111811681SSowmini.Varadhan@Sun.COM * below.
111911681SSowmini.Varadhan@Sun.COM */
112011681SSowmini.Varadhan@Sun.COM verify_src = B_FALSE;
112111681SSowmini.Varadhan@Sun.COM if (ill != NULL && IS_VNI(ill)) {
112211681SSowmini.Varadhan@Sun.COM ill_t *usesrc = ill;
112311681SSowmini.Varadhan@Sun.COM
112411681SSowmini.Varadhan@Sun.COM ill = ill_lookup_usesrc(usesrc);
112511681SSowmini.Varadhan@Sun.COM ill_refrele(usesrc);
112611681SSowmini.Varadhan@Sun.COM }
112711681SSowmini.Varadhan@Sun.COM } else if (!isv6) {
112811042SErik.Nordmark@Sun.COM ipaddr_t v4setsrc = INADDR_ANY;
112911042SErik.Nordmark@Sun.COM
113011681SSowmini.Varadhan@Sun.COM ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid,
113111681SSowmini.Varadhan@Sun.COM ipst, multirtp, &v4setsrc);
113211042SErik.Nordmark@Sun.COM if (setsrcp != NULL)
113311042SErik.Nordmark@Sun.COM IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp);
113411042SErik.Nordmark@Sun.COM } else {
113511681SSowmini.Varadhan@Sun.COM ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid,
113611681SSowmini.Varadhan@Sun.COM ipst, multirtp, setsrcp);
113711042SErik.Nordmark@Sun.COM }
113811042SErik.Nordmark@Sun.COM if (ill != NULL && IS_VNI(ill)) {
113911042SErik.Nordmark@Sun.COM ill_refrele(ill);
114011042SErik.Nordmark@Sun.COM ill = NULL;
114111042SErik.Nordmark@Sun.COM }
114211042SErik.Nordmark@Sun.COM if (ill == NULL) {
114311042SErik.Nordmark@Sun.COM if (errorp != NULL)
114411042SErik.Nordmark@Sun.COM *errorp = ENXIO;
114511042SErik.Nordmark@Sun.COM /* Get a hold on the IRE_NOROUTE */
114611681SSowmini.Varadhan@Sun.COM ire = ire_reject(ipst, isv6);
114711042SErik.Nordmark@Sun.COM return (ire);
114811042SErik.Nordmark@Sun.COM }
114911042SErik.Nordmark@Sun.COM if (!(ill->ill_flags & ILLF_MULTICAST)) {
115011042SErik.Nordmark@Sun.COM ill_refrele(ill);
115111042SErik.Nordmark@Sun.COM if (errorp != NULL)
115211042SErik.Nordmark@Sun.COM *errorp = EHOSTUNREACH;
115311042SErik.Nordmark@Sun.COM /* Get a hold on the IRE_NOROUTE */
115411681SSowmini.Varadhan@Sun.COM ire = ire_reject(ipst, isv6);
115511681SSowmini.Varadhan@Sun.COM return (ire);
115611681SSowmini.Varadhan@Sun.COM }
115711681SSowmini.Varadhan@Sun.COM /*
115811681SSowmini.Varadhan@Sun.COM * If we are doing the strictest src_multihoming, then
115911681SSowmini.Varadhan@Sun.COM * we check that IP_MULTICAST_IF, IP_BOUND_IF, etc specify
116011681SSowmini.Varadhan@Sun.COM * an interface that is consistent with the source address.
116111681SSowmini.Varadhan@Sun.COM */
116211681SSowmini.Varadhan@Sun.COM if (verify_src && src_multihoming == 2 &&
116311681SSowmini.Varadhan@Sun.COM !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) {
116411681SSowmini.Varadhan@Sun.COM if (errorp != NULL)
116511681SSowmini.Varadhan@Sun.COM *errorp = EADDRNOTAVAIL;
116611681SSowmini.Varadhan@Sun.COM ill_refrele(ill);
116711681SSowmini.Varadhan@Sun.COM /* Get a hold on the IRE_NOROUTE */
116811681SSowmini.Varadhan@Sun.COM ire = ire_reject(ipst, isv6);
116911042SErik.Nordmark@Sun.COM return (ire);
117011042SErik.Nordmark@Sun.COM }
117111042SErik.Nordmark@Sun.COM /* Get a refcnt on the single IRE_MULTICAST per ill */
117211042SErik.Nordmark@Sun.COM ire = ire_multicast(ill);
117311042SErik.Nordmark@Sun.COM ill_refrele(ill);
117411042SErik.Nordmark@Sun.COM if (generationp != NULL)
117511042SErik.Nordmark@Sun.COM *generationp = ire->ire_generation;
117611042SErik.Nordmark@Sun.COM if (errorp != NULL &&
117711042SErik.Nordmark@Sun.COM (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
117811042SErik.Nordmark@Sun.COM *errorp = EHOSTUNREACH;
117911042SErik.Nordmark@Sun.COM }
118011042SErik.Nordmark@Sun.COM return (ire);
118111042SErik.Nordmark@Sun.COM }
118211042SErik.Nordmark@Sun.COM
118311681SSowmini.Varadhan@Sun.COM /* Now for unicast */
118411042SErik.Nordmark@Sun.COM if (ixa->ixa_ifindex != 0 || (ixaflags & IXAF_SCOPEID_SET)) {
118511042SErik.Nordmark@Sun.COM if (ixaflags & IXAF_SCOPEID_SET) {
118611042SErik.Nordmark@Sun.COM /* sin6_scope_id takes precedence over ixa_ifindex */
118711042SErik.Nordmark@Sun.COM ASSERT(ixa->ixa_scopeid != 0);
118811042SErik.Nordmark@Sun.COM ill = ill_lookup_on_ifindex(ixa->ixa_scopeid,
118911681SSowmini.Varadhan@Sun.COM isv6, ipst);
119011042SErik.Nordmark@Sun.COM } else {
119111042SErik.Nordmark@Sun.COM ASSERT(ixa->ixa_ifindex != 0);
119211042SErik.Nordmark@Sun.COM ill = ill_lookup_on_ifindex(ixa->ixa_ifindex,
119311681SSowmini.Varadhan@Sun.COM isv6, ipst);
119411042SErik.Nordmark@Sun.COM }
119511042SErik.Nordmark@Sun.COM if (ill != NULL && IS_VNI(ill)) {
119611042SErik.Nordmark@Sun.COM ill_refrele(ill);
119711042SErik.Nordmark@Sun.COM ill = NULL;
119811042SErik.Nordmark@Sun.COM }
119911042SErik.Nordmark@Sun.COM if (ill == NULL) {
120011042SErik.Nordmark@Sun.COM if (errorp != NULL)
120111042SErik.Nordmark@Sun.COM *errorp = ENXIO;
120211042SErik.Nordmark@Sun.COM /* Get a hold on the IRE_NOROUTE */
120311681SSowmini.Varadhan@Sun.COM ire = ire_reject(ipst, isv6);
120411042SErik.Nordmark@Sun.COM return (ire);
120511042SErik.Nordmark@Sun.COM }
120611681SSowmini.Varadhan@Sun.COM
120711681SSowmini.Varadhan@Sun.COM match_args |= MATCH_IRE_ILL;
120811681SSowmini.Varadhan@Sun.COM
120911042SErik.Nordmark@Sun.COM /*
121011042SErik.Nordmark@Sun.COM * icmp_send_reply_v6 uses scopeid, and mpathd sets IP*_BOUND_IF
121111042SErik.Nordmark@Sun.COM * so for both of them we need to be able look for an under
121211042SErik.Nordmark@Sun.COM * interface.
121311042SErik.Nordmark@Sun.COM */
121411042SErik.Nordmark@Sun.COM if (IS_UNDER_IPMP(ill))
121511042SErik.Nordmark@Sun.COM match_args |= MATCH_IRE_TESTHIDDEN;
121611681SSowmini.Varadhan@Sun.COM
121711681SSowmini.Varadhan@Sun.COM /*
121811681SSowmini.Varadhan@Sun.COM * If we are doing the strictest src_multihoming, then
121911681SSowmini.Varadhan@Sun.COM * we check that IP_BOUND_IF, IP_PKTINFO, etc specify
122011681SSowmini.Varadhan@Sun.COM * an interface that is consistent with the source address.
122111681SSowmini.Varadhan@Sun.COM */
122211681SSowmini.Varadhan@Sun.COM if (src_multihoming == 2 &&
122311681SSowmini.Varadhan@Sun.COM !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) {
122411681SSowmini.Varadhan@Sun.COM if (errorp != NULL)
122511681SSowmini.Varadhan@Sun.COM *errorp = EADDRNOTAVAIL;
122611681SSowmini.Varadhan@Sun.COM ill_refrele(ill);
122711681SSowmini.Varadhan@Sun.COM /* Get a hold on the IRE_NOROUTE */
122811681SSowmini.Varadhan@Sun.COM ire = ire_reject(ipst, isv6);
122911681SSowmini.Varadhan@Sun.COM return (ire);
123011681SSowmini.Varadhan@Sun.COM }
123111681SSowmini.Varadhan@Sun.COM } else if (src_multihoming != 0 && verify_src) {
123211681SSowmini.Varadhan@Sun.COM /* Look up the ill based on the source address */
123311681SSowmini.Varadhan@Sun.COM ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst);
123411681SSowmini.Varadhan@Sun.COM if (ill == NULL) {
123511681SSowmini.Varadhan@Sun.COM char addrbuf[INET6_ADDRSTRLEN];
123611681SSowmini.Varadhan@Sun.COM
123711681SSowmini.Varadhan@Sun.COM ip3dbg(("%s not a valid src for unicast",
123811681SSowmini.Varadhan@Sun.COM inet_ntop(AF_INET6, &v6src, addrbuf,
123911681SSowmini.Varadhan@Sun.COM sizeof (addrbuf))));
124011681SSowmini.Varadhan@Sun.COM if (errorp != NULL)
124111681SSowmini.Varadhan@Sun.COM *errorp = EADDRNOTAVAIL;
124211681SSowmini.Varadhan@Sun.COM /* Get a hold on the IRE_NOROUTE */
124311681SSowmini.Varadhan@Sun.COM ire = ire_reject(ipst, isv6);
124411681SSowmini.Varadhan@Sun.COM return (ire);
124511681SSowmini.Varadhan@Sun.COM }
124611681SSowmini.Varadhan@Sun.COM match_args |= MATCH_IRE_SRC_ILL;
124711681SSowmini.Varadhan@Sun.COM preferred_src_aware = (src_multihoming == 1);
124811042SErik.Nordmark@Sun.COM }
124911042SErik.Nordmark@Sun.COM
125011042SErik.Nordmark@Sun.COM if (ixaflags & IXAF_NEXTHOP_SET) {
125111042SErik.Nordmark@Sun.COM /* IP_NEXTHOP was set */
125211042SErik.Nordmark@Sun.COM v6nexthop = ixa->ixa_nexthop_v6;
125311042SErik.Nordmark@Sun.COM } else {
125411042SErik.Nordmark@Sun.COM v6nexthop = *v6dst;
125511042SErik.Nordmark@Sun.COM }
125611042SErik.Nordmark@Sun.COM
125711042SErik.Nordmark@Sun.COM ire_type = 0;
125811042SErik.Nordmark@Sun.COM
125911042SErik.Nordmark@Sun.COM /*
126011042SErik.Nordmark@Sun.COM * If SO_DONTROUTE is set or if IP_NEXTHOP is set, then
126111042SErik.Nordmark@Sun.COM * we only look for an onlink IRE.
126211042SErik.Nordmark@Sun.COM */
126311042SErik.Nordmark@Sun.COM if (ixaflags & (IXAF_DONTROUTE|IXAF_NEXTHOP_SET)) {
126411042SErik.Nordmark@Sun.COM match_args |= MATCH_IRE_TYPE;
126511042SErik.Nordmark@Sun.COM ire_type = IRE_ONLINK;
126611042SErik.Nordmark@Sun.COM }
126711042SErik.Nordmark@Sun.COM
126811681SSowmini.Varadhan@Sun.COM retry:
126911681SSowmini.Varadhan@Sun.COM if (!isv6) {
127011042SErik.Nordmark@Sun.COM ipaddr_t v4nexthop;
127111042SErik.Nordmark@Sun.COM ipaddr_t v4setsrc = INADDR_ANY;
127211042SErik.Nordmark@Sun.COM
127311042SErik.Nordmark@Sun.COM IN6_V4MAPPED_TO_IPADDR(&v6nexthop, v4nexthop);
127411042SErik.Nordmark@Sun.COM ire = ire_route_recursive_v4(v4nexthop, ire_type, ill,
127511457SErik.Nordmark@Sun.COM ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE,
127611042SErik.Nordmark@Sun.COM ixa->ixa_xmit_hint, ipst, &v4setsrc, NULL, generationp);
127711042SErik.Nordmark@Sun.COM if (setsrcp != NULL)
127811042SErik.Nordmark@Sun.COM IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp);
127911042SErik.Nordmark@Sun.COM } else {
128011042SErik.Nordmark@Sun.COM ire = ire_route_recursive_v6(&v6nexthop, ire_type, ill,
128111457SErik.Nordmark@Sun.COM ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE,
128211042SErik.Nordmark@Sun.COM ixa->ixa_xmit_hint, ipst, setsrcp, NULL, generationp);
128311042SErik.Nordmark@Sun.COM }
128411042SErik.Nordmark@Sun.COM
128511042SErik.Nordmark@Sun.COM #ifdef DEBUG
128611042SErik.Nordmark@Sun.COM if (match_args & MATCH_IRE_TESTHIDDEN) {
128711042SErik.Nordmark@Sun.COM ip3dbg(("looking for hidden; dst %x ire %p\n",
128811042SErik.Nordmark@Sun.COM v4dst, (void *)ire));
128911042SErik.Nordmark@Sun.COM }
129011042SErik.Nordmark@Sun.COM #endif
129111681SSowmini.Varadhan@Sun.COM if (ill != NULL) {
129211042SErik.Nordmark@Sun.COM ill_refrele(ill);
129311681SSowmini.Varadhan@Sun.COM ill = NULL;
129411681SSowmini.Varadhan@Sun.COM }
129511042SErik.Nordmark@Sun.COM if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
129611042SErik.Nordmark@Sun.COM (ire->ire_type & IRE_MULTICAST)) {
129711681SSowmini.Varadhan@Sun.COM if (preferred_src_aware) {
129811681SSowmini.Varadhan@Sun.COM /*
129911681SSowmini.Varadhan@Sun.COM * "Preferred Source Aware" send mode. If we cannot
130011681SSowmini.Varadhan@Sun.COM * find an ire whose ire_ill had the desired source
130111681SSowmini.Varadhan@Sun.COM * address retry after relaxing the ill matching
130211681SSowmini.Varadhan@Sun.COM * constraint.
130311681SSowmini.Varadhan@Sun.COM */
130411681SSowmini.Varadhan@Sun.COM ire_refrele(ire);
130511681SSowmini.Varadhan@Sun.COM preferred_src_aware = B_FALSE;
130611681SSowmini.Varadhan@Sun.COM match_args &= ~MATCH_IRE_SRC_ILL;
130711681SSowmini.Varadhan@Sun.COM goto retry;
130811681SSowmini.Varadhan@Sun.COM }
130911042SErik.Nordmark@Sun.COM /* No ire_nce_cache */
131011042SErik.Nordmark@Sun.COM return (ire);
131111042SErik.Nordmark@Sun.COM }
131211042SErik.Nordmark@Sun.COM
131311042SErik.Nordmark@Sun.COM /* Setup ire_nce_cache if it doesn't exist or is condemned. */
131411042SErik.Nordmark@Sun.COM mutex_enter(&ire->ire_lock);
131511042SErik.Nordmark@Sun.COM nce = ire->ire_nce_cache;
131611042SErik.Nordmark@Sun.COM if (nce == NULL || nce->nce_is_condemned) {
131711042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock);
131811042SErik.Nordmark@Sun.COM (void) ire_revalidate_nce(ire);
131911042SErik.Nordmark@Sun.COM } else {
132011042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock);
132111042SErik.Nordmark@Sun.COM }
132211042SErik.Nordmark@Sun.COM return (ire);
132311042SErik.Nordmark@Sun.COM }
132411042SErik.Nordmark@Sun.COM
132511042SErik.Nordmark@Sun.COM /*
132611042SErik.Nordmark@Sun.COM * Find a route given some xmit attributes and a packet.
132711042SErik.Nordmark@Sun.COM * Generic for IPv4 and IPv6
132811042SErik.Nordmark@Sun.COM *
132911042SErik.Nordmark@Sun.COM * This never returns NULL. But when it returns the IRE_NOROUTE
133011042SErik.Nordmark@Sun.COM * it might set errorp.
133111042SErik.Nordmark@Sun.COM */
133211042SErik.Nordmark@Sun.COM ire_t *
ip_select_route_pkt(mblk_t * mp,ip_xmit_attr_t * ixa,uint_t * generationp,int * errorp,boolean_t * multirtp)133311042SErik.Nordmark@Sun.COM ip_select_route_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp,
133411042SErik.Nordmark@Sun.COM int *errorp, boolean_t *multirtp)
133511042SErik.Nordmark@Sun.COM {
133611042SErik.Nordmark@Sun.COM if (ixa->ixa_flags & IXAF_IS_IPV4) {
133711042SErik.Nordmark@Sun.COM ipha_t *ipha = (ipha_t *)mp->b_rptr;
133811681SSowmini.Varadhan@Sun.COM in6_addr_t v6dst, v6src;
133911042SErik.Nordmark@Sun.COM
134011042SErik.Nordmark@Sun.COM IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst);
134111681SSowmini.Varadhan@Sun.COM IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src);
134211042SErik.Nordmark@Sun.COM
134311681SSowmini.Varadhan@Sun.COM return (ip_select_route(&v6dst, v6src, ixa, generationp,
134411042SErik.Nordmark@Sun.COM NULL, errorp, multirtp));
134511042SErik.Nordmark@Sun.COM } else {
134611042SErik.Nordmark@Sun.COM ip6_t *ip6h = (ip6_t *)mp->b_rptr;
134711042SErik.Nordmark@Sun.COM
134811681SSowmini.Varadhan@Sun.COM return (ip_select_route(&ip6h->ip6_dst, ip6h->ip6_src,
134911681SSowmini.Varadhan@Sun.COM ixa, generationp, NULL, errorp, multirtp));
135011042SErik.Nordmark@Sun.COM }
135111042SErik.Nordmark@Sun.COM }
135211042SErik.Nordmark@Sun.COM
135311042SErik.Nordmark@Sun.COM ire_t *
ip_select_route_v4(ipaddr_t dst,ipaddr_t src,ip_xmit_attr_t * ixa,uint_t * generationp,ipaddr_t * v4setsrcp,int * errorp,boolean_t * multirtp)135411681SSowmini.Varadhan@Sun.COM ip_select_route_v4(ipaddr_t dst, ipaddr_t src, ip_xmit_attr_t *ixa,
135511681SSowmini.Varadhan@Sun.COM uint_t *generationp, ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp)
135611042SErik.Nordmark@Sun.COM {
135711681SSowmini.Varadhan@Sun.COM in6_addr_t v6dst, v6src;
135811042SErik.Nordmark@Sun.COM ire_t *ire;
135911042SErik.Nordmark@Sun.COM in6_addr_t setsrc;
136011042SErik.Nordmark@Sun.COM
136111042SErik.Nordmark@Sun.COM ASSERT(ixa->ixa_flags & IXAF_IS_IPV4);
136211042SErik.Nordmark@Sun.COM
136311042SErik.Nordmark@Sun.COM IN6_IPADDR_TO_V4MAPPED(dst, &v6dst);
136411681SSowmini.Varadhan@Sun.COM IN6_IPADDR_TO_V4MAPPED(src, &v6src);
136511042SErik.Nordmark@Sun.COM
136611042SErik.Nordmark@Sun.COM setsrc = ipv6_all_zeros;
136711681SSowmini.Varadhan@Sun.COM ire = ip_select_route(&v6dst, v6src, ixa, generationp, &setsrc, errorp,
136811042SErik.Nordmark@Sun.COM multirtp);
136911042SErik.Nordmark@Sun.COM if (v4setsrcp != NULL)
137011042SErik.Nordmark@Sun.COM IN6_V4MAPPED_TO_IPADDR(&setsrc, *v4setsrcp);
137111042SErik.Nordmark@Sun.COM return (ire);
137211042SErik.Nordmark@Sun.COM }
137311042SErik.Nordmark@Sun.COM
137411042SErik.Nordmark@Sun.COM /*
137511042SErik.Nordmark@Sun.COM * Recursively look for a route to the destination. Can also match on
137611042SErik.Nordmark@Sun.COM * the zoneid, ill, and label. Used for the data paths. See also
137711042SErik.Nordmark@Sun.COM * ire_route_recursive.
137811042SErik.Nordmark@Sun.COM *
137911457SErik.Nordmark@Sun.COM * If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never
138011457SErik.Nordmark@Sun.COM * create an IRE_IF_CLONE. This is used on the receive side when we are not
138111457SErik.Nordmark@Sun.COM * forwarding.
138211457SErik.Nordmark@Sun.COM * If IRR_INCOMPLETE is set then we return the IRE even if we can't correctly
138311457SErik.Nordmark@Sun.COM * resolve the gateway.
138411457SErik.Nordmark@Sun.COM *
138511042SErik.Nordmark@Sun.COM * Note that this function never returns NULL. It returns an IRE_NOROUTE
138611042SErik.Nordmark@Sun.COM * instead.
138711042SErik.Nordmark@Sun.COM *
138811042SErik.Nordmark@Sun.COM * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
138911042SErik.Nordmark@Sun.COM * is an error.
139011042SErik.Nordmark@Sun.COM * Allow at most one RTF_INDIRECT.
139111042SErik.Nordmark@Sun.COM */
139211042SErik.Nordmark@Sun.COM ire_t *
ire_route_recursive_impl_v4(ire_t * ire,ipaddr_t nexthop,uint_t ire_type,const ill_t * ill_arg,zoneid_t zoneid,const ts_label_t * tsl,uint_t match_args,uint_t irr_flags,uint32_t xmit_hint,ip_stack_t * ipst,ipaddr_t * setsrcp,tsol_ire_gw_secattr_t ** gwattrp,uint_t * generationp)139311042SErik.Nordmark@Sun.COM ire_route_recursive_impl_v4(ire_t *ire,
139411042SErik.Nordmark@Sun.COM ipaddr_t nexthop, uint_t ire_type, const ill_t *ill_arg,
139511042SErik.Nordmark@Sun.COM zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
139611457SErik.Nordmark@Sun.COM uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
139711042SErik.Nordmark@Sun.COM tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
139811042SErik.Nordmark@Sun.COM {
139911042SErik.Nordmark@Sun.COM int i, j;
140011042SErik.Nordmark@Sun.COM ire_t *ires[MAX_IRE_RECURSION];
140111042SErik.Nordmark@Sun.COM uint_t generation;
140211042SErik.Nordmark@Sun.COM uint_t generations[MAX_IRE_RECURSION];
140311042SErik.Nordmark@Sun.COM boolean_t need_refrele = B_FALSE;
140411042SErik.Nordmark@Sun.COM boolean_t invalidate = B_FALSE;
140511042SErik.Nordmark@Sun.COM ill_t *ill = NULL;
140612038SSowmini.Varadhan@Sun.COM uint_t maskoff = (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST);
140711042SErik.Nordmark@Sun.COM
140811042SErik.Nordmark@Sun.COM if (setsrcp != NULL)
140911042SErik.Nordmark@Sun.COM ASSERT(*setsrcp == INADDR_ANY);
141011042SErik.Nordmark@Sun.COM if (gwattrp != NULL)
141111042SErik.Nordmark@Sun.COM ASSERT(*gwattrp == NULL);
141211042SErik.Nordmark@Sun.COM
141311042SErik.Nordmark@Sun.COM /*
141411042SErik.Nordmark@Sun.COM * We iterate up to three times to resolve a route, even though
141511042SErik.Nordmark@Sun.COM * we have four slots in the array. The extra slot is for an
141611042SErik.Nordmark@Sun.COM * IRE_IF_CLONE we might need to create.
141711042SErik.Nordmark@Sun.COM */
141811042SErik.Nordmark@Sun.COM i = 0;
141911042SErik.Nordmark@Sun.COM while (i < MAX_IRE_RECURSION - 1) {
142011042SErik.Nordmark@Sun.COM /* ire_ftable_lookup handles round-robin/ECMP */
142111042SErik.Nordmark@Sun.COM if (ire == NULL) {
142211042SErik.Nordmark@Sun.COM ire = ire_ftable_lookup_v4(nexthop, 0, 0, ire_type,
142311681SSowmini.Varadhan@Sun.COM (ill != NULL? ill : ill_arg), zoneid, tsl,
142411042SErik.Nordmark@Sun.COM match_args, xmit_hint, ipst, &generation);
142511042SErik.Nordmark@Sun.COM } else {
142611042SErik.Nordmark@Sun.COM /* Caller passed it; extra hold since we will rele */
142711042SErik.Nordmark@Sun.COM ire_refhold(ire);
142811042SErik.Nordmark@Sun.COM if (generationp != NULL)
142911042SErik.Nordmark@Sun.COM generation = *generationp;
143011042SErik.Nordmark@Sun.COM else
143111042SErik.Nordmark@Sun.COM generation = IRE_GENERATION_VERIFY;
143211042SErik.Nordmark@Sun.COM }
143312038SSowmini.Varadhan@Sun.COM if (ire == NULL) {
143412038SSowmini.Varadhan@Sun.COM if (i > 0 && (irr_flags & IRR_INCOMPLETE)) {
143512038SSowmini.Varadhan@Sun.COM ire = ires[0];
143612038SSowmini.Varadhan@Sun.COM ire_refhold(ire);
143712038SSowmini.Varadhan@Sun.COM } else {
143812038SSowmini.Varadhan@Sun.COM ire = ire_reject(ipst, B_FALSE);
143912038SSowmini.Varadhan@Sun.COM }
144012038SSowmini.Varadhan@Sun.COM goto error;
144112038SSowmini.Varadhan@Sun.COM }
144211042SErik.Nordmark@Sun.COM
144311042SErik.Nordmark@Sun.COM /* Need to return the ire with RTF_REJECT|BLACKHOLE */
144411042SErik.Nordmark@Sun.COM if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
144511042SErik.Nordmark@Sun.COM goto error;
144611042SErik.Nordmark@Sun.COM
144711042SErik.Nordmark@Sun.COM ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */
1448*12985SSowmini.Varadhan@oracle.COM /*
1449*12985SSowmini.Varadhan@oracle.COM * Verify that the IRE_IF_CLONE has a consistent generation
1450*12985SSowmini.Varadhan@oracle.COM * number.
1451*12985SSowmini.Varadhan@oracle.COM */
1452*12985SSowmini.Varadhan@oracle.COM if ((ire->ire_type & IRE_IF_CLONE) && !ire_clone_verify(ire)) {
1453*12985SSowmini.Varadhan@oracle.COM ire_refrele(ire);
1454*12985SSowmini.Varadhan@oracle.COM ire = NULL;
1455*12985SSowmini.Varadhan@oracle.COM continue;
1456*12985SSowmini.Varadhan@oracle.COM }
145711042SErik.Nordmark@Sun.COM
145812038SSowmini.Varadhan@Sun.COM /*
145912038SSowmini.Varadhan@Sun.COM * Don't allow anything unusual past the first iteration.
146012038SSowmini.Varadhan@Sun.COM * After the first lookup, we should no longer look for
146112038SSowmini.Varadhan@Sun.COM * (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST) or RTF_INDIRECT
146212038SSowmini.Varadhan@Sun.COM * routes.
146312038SSowmini.Varadhan@Sun.COM *
146412038SSowmini.Varadhan@Sun.COM * In addition, after we have found a direct IRE_OFFLINK,
146512038SSowmini.Varadhan@Sun.COM * we should only look for interface or clone routes.
146612038SSowmini.Varadhan@Sun.COM */
146712038SSowmini.Varadhan@Sun.COM match_args |= MATCH_IRE_DIRECT; /* no more RTF_INDIRECTs */
146812038SSowmini.Varadhan@Sun.COM
146912038SSowmini.Varadhan@Sun.COM if ((ire->ire_type & IRE_OFFLINK) &&
147012038SSowmini.Varadhan@Sun.COM !(ire->ire_flags & RTF_INDIRECT)) {
147112038SSowmini.Varadhan@Sun.COM ire_type = IRE_IF_ALL;
147212038SSowmini.Varadhan@Sun.COM } else {
147311042SErik.Nordmark@Sun.COM /*
147412038SSowmini.Varadhan@Sun.COM * no more local, loopback, broadcast routes
147511042SErik.Nordmark@Sun.COM */
147612038SSowmini.Varadhan@Sun.COM if (!(match_args & MATCH_IRE_TYPE))
147712038SSowmini.Varadhan@Sun.COM ire_type = (IRE_OFFLINK|IRE_ONLINK);
147812038SSowmini.Varadhan@Sun.COM ire_type &= ~maskoff;
147911042SErik.Nordmark@Sun.COM }
148012038SSowmini.Varadhan@Sun.COM match_args |= MATCH_IRE_TYPE;
148112038SSowmini.Varadhan@Sun.COM
148211042SErik.Nordmark@Sun.COM /* We have a usable IRE */
148311042SErik.Nordmark@Sun.COM ires[i] = ire;
148411042SErik.Nordmark@Sun.COM generations[i] = generation;
148511042SErik.Nordmark@Sun.COM i++;
148611042SErik.Nordmark@Sun.COM
148711042SErik.Nordmark@Sun.COM /* The first RTF_SETSRC address is passed back if setsrcp */
148811042SErik.Nordmark@Sun.COM if ((ire->ire_flags & RTF_SETSRC) &&
148911042SErik.Nordmark@Sun.COM setsrcp != NULL && *setsrcp == INADDR_ANY) {
149011042SErik.Nordmark@Sun.COM ASSERT(ire->ire_setsrc_addr != INADDR_ANY);
149111042SErik.Nordmark@Sun.COM *setsrcp = ire->ire_setsrc_addr;
149211042SErik.Nordmark@Sun.COM }
149311042SErik.Nordmark@Sun.COM
149411042SErik.Nordmark@Sun.COM /* The first ire_gw_secattr is passed back if gwattrp */
149511042SErik.Nordmark@Sun.COM if (ire->ire_gw_secattr != NULL &&
149611042SErik.Nordmark@Sun.COM gwattrp != NULL && *gwattrp == NULL)
149711042SErik.Nordmark@Sun.COM *gwattrp = ire->ire_gw_secattr;
149811042SErik.Nordmark@Sun.COM
149911042SErik.Nordmark@Sun.COM /*
150011042SErik.Nordmark@Sun.COM * Check if we have a short-cut pointer to an IRE for this
150111042SErik.Nordmark@Sun.COM * destination, and that the cached dependency isn't stale.
150211042SErik.Nordmark@Sun.COM * In that case we've rejoined an existing tree towards a
150311042SErik.Nordmark@Sun.COM * parent, thus we don't need to continue the loop to
150411042SErik.Nordmark@Sun.COM * discover the rest of the tree.
150511042SErik.Nordmark@Sun.COM */
150611042SErik.Nordmark@Sun.COM mutex_enter(&ire->ire_lock);
150711042SErik.Nordmark@Sun.COM if (ire->ire_dep_parent != NULL &&
150811042SErik.Nordmark@Sun.COM ire->ire_dep_parent->ire_generation ==
150911042SErik.Nordmark@Sun.COM ire->ire_dep_parent_generation) {
151011042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock);
151111042SErik.Nordmark@Sun.COM ire = NULL;
151211042SErik.Nordmark@Sun.COM goto done;
151311042SErik.Nordmark@Sun.COM }
151411042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock);
151511042SErik.Nordmark@Sun.COM
151611042SErik.Nordmark@Sun.COM /*
151711042SErik.Nordmark@Sun.COM * If this type should have an ire_nce_cache (even if it
151811042SErik.Nordmark@Sun.COM * doesn't yet have one) then we are done. Includes
151911042SErik.Nordmark@Sun.COM * IRE_INTERFACE with a full 32 bit mask.
152011042SErik.Nordmark@Sun.COM */
152111042SErik.Nordmark@Sun.COM if (ire->ire_nce_capable) {
152211042SErik.Nordmark@Sun.COM ire = NULL;
152311042SErik.Nordmark@Sun.COM goto done;
152411042SErik.Nordmark@Sun.COM }
152511042SErik.Nordmark@Sun.COM ASSERT(!(ire->ire_type & IRE_IF_CLONE));
152611042SErik.Nordmark@Sun.COM /*
152711042SErik.Nordmark@Sun.COM * For an IRE_INTERFACE we create an IRE_IF_CLONE for this
152811042SErik.Nordmark@Sun.COM * particular destination
152911042SErik.Nordmark@Sun.COM */
153011042SErik.Nordmark@Sun.COM if (ire->ire_type & IRE_INTERFACE) {
153111042SErik.Nordmark@Sun.COM in6_addr_t v6nexthop;
153211042SErik.Nordmark@Sun.COM ire_t *clone;
153311042SErik.Nordmark@Sun.COM
153411042SErik.Nordmark@Sun.COM ASSERT(ire->ire_masklen != IPV4_ABITS);
153511042SErik.Nordmark@Sun.COM
153611042SErik.Nordmark@Sun.COM /*
153711042SErik.Nordmark@Sun.COM * In the case of ip_input and ILLF_FORWARDING not
153811457SErik.Nordmark@Sun.COM * being set, and in the case of RTM_GET, there is
153911457SErik.Nordmark@Sun.COM * no point in allocating an IRE_IF_CLONE. We return
154011457SErik.Nordmark@Sun.COM * the IRE_INTERFACE. Note that !IRR_ALLOCATE can
154111457SErik.Nordmark@Sun.COM * result in a ire_dep_parent which is IRE_IF_*
154211457SErik.Nordmark@Sun.COM * without an IRE_IF_CLONE.
154311042SErik.Nordmark@Sun.COM * We recover from that when we need to send packets
154411042SErik.Nordmark@Sun.COM * by ensuring that the generations become
154511042SErik.Nordmark@Sun.COM * IRE_GENERATION_VERIFY in this case.
154611042SErik.Nordmark@Sun.COM */
154711457SErik.Nordmark@Sun.COM if (!(irr_flags & IRR_ALLOCATE)) {
154811042SErik.Nordmark@Sun.COM invalidate = B_TRUE;
154911042SErik.Nordmark@Sun.COM ire = NULL;
155011042SErik.Nordmark@Sun.COM goto done;
155111042SErik.Nordmark@Sun.COM }
155211042SErik.Nordmark@Sun.COM
155311042SErik.Nordmark@Sun.COM IN6_IPADDR_TO_V4MAPPED(nexthop, &v6nexthop);
155411042SErik.Nordmark@Sun.COM
155511042SErik.Nordmark@Sun.COM clone = ire_create_if_clone(ire, &v6nexthop,
155611042SErik.Nordmark@Sun.COM &generation);
155711042SErik.Nordmark@Sun.COM if (clone == NULL) {
155811042SErik.Nordmark@Sun.COM /*
155911042SErik.Nordmark@Sun.COM * Temporary failure - no memory.
156011042SErik.Nordmark@Sun.COM * Don't want caller to cache IRE_NOROUTE.
156111042SErik.Nordmark@Sun.COM */
156211042SErik.Nordmark@Sun.COM invalidate = B_TRUE;
156311042SErik.Nordmark@Sun.COM ire = ire_blackhole(ipst, B_FALSE);
156411042SErik.Nordmark@Sun.COM goto error;
156511042SErik.Nordmark@Sun.COM }
156611042SErik.Nordmark@Sun.COM /*
156711042SErik.Nordmark@Sun.COM * Make clone next to last entry and the
156811042SErik.Nordmark@Sun.COM * IRE_INTERFACE the last in the dependency
156911042SErik.Nordmark@Sun.COM * chain since the clone depends on the
157011042SErik.Nordmark@Sun.COM * IRE_INTERFACE.
157111042SErik.Nordmark@Sun.COM */
157211042SErik.Nordmark@Sun.COM ASSERT(i >= 1);
157311042SErik.Nordmark@Sun.COM ASSERT(i < MAX_IRE_RECURSION);
157411042SErik.Nordmark@Sun.COM
157511042SErik.Nordmark@Sun.COM ires[i] = ires[i-1];
157611042SErik.Nordmark@Sun.COM generations[i] = generations[i-1];
157711042SErik.Nordmark@Sun.COM ires[i-1] = clone;
157811042SErik.Nordmark@Sun.COM generations[i-1] = generation;
157911042SErik.Nordmark@Sun.COM i++;
158011042SErik.Nordmark@Sun.COM
158111042SErik.Nordmark@Sun.COM ire = NULL;
158211042SErik.Nordmark@Sun.COM goto done;
158311042SErik.Nordmark@Sun.COM }
158411042SErik.Nordmark@Sun.COM
158511042SErik.Nordmark@Sun.COM /*
158611042SErik.Nordmark@Sun.COM * We only match on the type and optionally ILL when
158711042SErik.Nordmark@Sun.COM * recursing. The type match is used by some callers
158811042SErik.Nordmark@Sun.COM * to exclude certain types (such as IRE_IF_CLONE or
158911042SErik.Nordmark@Sun.COM * IRE_LOCAL|IRE_LOOPBACK).
159011681SSowmini.Varadhan@Sun.COM *
159111681SSowmini.Varadhan@Sun.COM * In the MATCH_IRE_SRC_ILL case, ill_arg may be the 'srcof'
159211681SSowmini.Varadhan@Sun.COM * ire->ire_ill, and we want to find the IRE_INTERFACE for
159311681SSowmini.Varadhan@Sun.COM * ire_ill, so we set ill to the ire_ill;
159411042SErik.Nordmark@Sun.COM */
159512038SSowmini.Varadhan@Sun.COM match_args &= (MATCH_IRE_TYPE | MATCH_IRE_DIRECT);
159611042SErik.Nordmark@Sun.COM nexthop = ire->ire_gateway_addr;
159711042SErik.Nordmark@Sun.COM if (ill == NULL && ire->ire_ill != NULL) {
159811042SErik.Nordmark@Sun.COM ill = ire->ire_ill;
159911042SErik.Nordmark@Sun.COM need_refrele = B_TRUE;
160011042SErik.Nordmark@Sun.COM ill_refhold(ill);
160111042SErik.Nordmark@Sun.COM match_args |= MATCH_IRE_ILL;
160211042SErik.Nordmark@Sun.COM }
160311042SErik.Nordmark@Sun.COM ire = NULL;
160411042SErik.Nordmark@Sun.COM }
160511042SErik.Nordmark@Sun.COM ASSERT(ire == NULL);
160611042SErik.Nordmark@Sun.COM ire = ire_reject(ipst, B_FALSE);
160711042SErik.Nordmark@Sun.COM
160811042SErik.Nordmark@Sun.COM error:
160911042SErik.Nordmark@Sun.COM ASSERT(ire != NULL);
161011042SErik.Nordmark@Sun.COM if (need_refrele)
161111042SErik.Nordmark@Sun.COM ill_refrele(ill);
161211042SErik.Nordmark@Sun.COM
161311042SErik.Nordmark@Sun.COM /*
161411042SErik.Nordmark@Sun.COM * In the case of MULTIRT we want to try a different IRE the next
161511042SErik.Nordmark@Sun.COM * time. We let the next packet retry in that case.
161611042SErik.Nordmark@Sun.COM */
161711042SErik.Nordmark@Sun.COM if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT))
161811042SErik.Nordmark@Sun.COM (void) ire_no_good(ires[0]);
161911042SErik.Nordmark@Sun.COM
162011042SErik.Nordmark@Sun.COM cleanup:
162111042SErik.Nordmark@Sun.COM /* cleanup ires[i] */
162211042SErik.Nordmark@Sun.COM ire_dep_unbuild(ires, i);
162311042SErik.Nordmark@Sun.COM for (j = 0; j < i; j++)
162411042SErik.Nordmark@Sun.COM ire_refrele(ires[j]);
162511042SErik.Nordmark@Sun.COM
162611457SErik.Nordmark@Sun.COM ASSERT((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
162711457SErik.Nordmark@Sun.COM (irr_flags & IRR_INCOMPLETE));
162811042SErik.Nordmark@Sun.COM /*
162911042SErik.Nordmark@Sun.COM * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the
163011042SErik.Nordmark@Sun.COM * ip_select_route since the reject or lack of memory might be gone.
163111042SErik.Nordmark@Sun.COM */
163211042SErik.Nordmark@Sun.COM if (generationp != NULL)
163311042SErik.Nordmark@Sun.COM *generationp = IRE_GENERATION_VERIFY;
163411042SErik.Nordmark@Sun.COM return (ire);
163511042SErik.Nordmark@Sun.COM
163611042SErik.Nordmark@Sun.COM done:
163711042SErik.Nordmark@Sun.COM ASSERT(ire == NULL);
163811042SErik.Nordmark@Sun.COM if (need_refrele) {
163911042SErik.Nordmark@Sun.COM ill_refrele(ill);
164011042SErik.Nordmark@Sun.COM ill = NULL;
164111042SErik.Nordmark@Sun.COM }
164211042SErik.Nordmark@Sun.COM
164311042SErik.Nordmark@Sun.COM /* Build dependencies */
164411131SErik.Nordmark@Sun.COM if (i > 1 && !ire_dep_build(ires, generations, i)) {
164511042SErik.Nordmark@Sun.COM /* Something in chain was condemned; tear it apart */
164611042SErik.Nordmark@Sun.COM ire = ire_reject(ipst, B_FALSE);
164711042SErik.Nordmark@Sun.COM goto cleanup;
164811042SErik.Nordmark@Sun.COM }
164911042SErik.Nordmark@Sun.COM
165011042SErik.Nordmark@Sun.COM /*
165111042SErik.Nordmark@Sun.COM * Release all refholds except the one for ires[0] that we
165211042SErik.Nordmark@Sun.COM * will return to the caller.
165311042SErik.Nordmark@Sun.COM */
165411042SErik.Nordmark@Sun.COM for (j = 1; j < i; j++)
165511042SErik.Nordmark@Sun.COM ire_refrele(ires[j]);
165611042SErik.Nordmark@Sun.COM
165711042SErik.Nordmark@Sun.COM if (invalidate) {
165811042SErik.Nordmark@Sun.COM /*
165911042SErik.Nordmark@Sun.COM * Since we needed to allocate but couldn't we need to make
166011042SErik.Nordmark@Sun.COM * sure that the dependency chain is rebuilt the next time.
166111042SErik.Nordmark@Sun.COM */
166211042SErik.Nordmark@Sun.COM ire_dep_invalidate_generations(ires[0]);
166311042SErik.Nordmark@Sun.COM generation = IRE_GENERATION_VERIFY;
166411042SErik.Nordmark@Sun.COM } else {
166511042SErik.Nordmark@Sun.COM /*
166611042SErik.Nordmark@Sun.COM * IREs can have been added or deleted while we did the
166711042SErik.Nordmark@Sun.COM * recursive lookup and we can't catch those until we've built
166811042SErik.Nordmark@Sun.COM * the dependencies. We verify the stored
166911042SErik.Nordmark@Sun.COM * ire_dep_parent_generation to catch any such changes and
167011042SErik.Nordmark@Sun.COM * return IRE_GENERATION_VERIFY (which will cause
167111042SErik.Nordmark@Sun.COM * ip_select_route to be called again so we can redo the
167211042SErik.Nordmark@Sun.COM * recursive lookup next time we send a packet.
167311042SErik.Nordmark@Sun.COM */
167411131SErik.Nordmark@Sun.COM if (ires[0]->ire_dep_parent == NULL)
167511131SErik.Nordmark@Sun.COM generation = ires[0]->ire_generation;
167611131SErik.Nordmark@Sun.COM else
167711131SErik.Nordmark@Sun.COM generation = ire_dep_validate_generations(ires[0]);
167811042SErik.Nordmark@Sun.COM if (generations[0] != ires[0]->ire_generation) {
167911042SErik.Nordmark@Sun.COM /* Something changed at the top */
168011042SErik.Nordmark@Sun.COM generation = IRE_GENERATION_VERIFY;
168111042SErik.Nordmark@Sun.COM }
168211042SErik.Nordmark@Sun.COM }
168311042SErik.Nordmark@Sun.COM if (generationp != NULL)
168411042SErik.Nordmark@Sun.COM *generationp = generation;
168511042SErik.Nordmark@Sun.COM
168611042SErik.Nordmark@Sun.COM return (ires[0]);
168711042SErik.Nordmark@Sun.COM }
168811042SErik.Nordmark@Sun.COM
168911042SErik.Nordmark@Sun.COM ire_t *
ire_route_recursive_v4(ipaddr_t nexthop,uint_t ire_type,const ill_t * ill,zoneid_t zoneid,const ts_label_t * tsl,uint_t match_args,uint_t irr_flags,uint32_t xmit_hint,ip_stack_t * ipst,ipaddr_t * setsrcp,tsol_ire_gw_secattr_t ** gwattrp,uint_t * generationp)169011042SErik.Nordmark@Sun.COM ire_route_recursive_v4(ipaddr_t nexthop, uint_t ire_type, const ill_t *ill,
169111042SErik.Nordmark@Sun.COM zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
169211457SErik.Nordmark@Sun.COM uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
169311042SErik.Nordmark@Sun.COM tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
169411042SErik.Nordmark@Sun.COM {
169511042SErik.Nordmark@Sun.COM return (ire_route_recursive_impl_v4(NULL, nexthop, ire_type, ill,
169611457SErik.Nordmark@Sun.COM zoneid, tsl, match_args, irr_flags, xmit_hint, ipst, setsrcp,
169711042SErik.Nordmark@Sun.COM gwattrp, generationp));
169811042SErik.Nordmark@Sun.COM }
169911042SErik.Nordmark@Sun.COM
170011042SErik.Nordmark@Sun.COM /*
170111042SErik.Nordmark@Sun.COM * Recursively look for a route to the destination.
170211042SErik.Nordmark@Sun.COM * We only handle a destination match here, yet we have the same arguments
170311042SErik.Nordmark@Sun.COM * as the full match to allow function pointers to select between the two.
170411042SErik.Nordmark@Sun.COM *
170511042SErik.Nordmark@Sun.COM * Note that this function never returns NULL. It returns an IRE_NOROUTE
170611042SErik.Nordmark@Sun.COM * instead.
170711042SErik.Nordmark@Sun.COM *
170811042SErik.Nordmark@Sun.COM * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
170911042SErik.Nordmark@Sun.COM * is an error.
171011042SErik.Nordmark@Sun.COM * Allow at most one RTF_INDIRECT.
171111042SErik.Nordmark@Sun.COM */
171211042SErik.Nordmark@Sun.COM ire_t *
ire_route_recursive_dstonly_v4(ipaddr_t nexthop,uint_t irr_flags,uint32_t xmit_hint,ip_stack_t * ipst)171311457SErik.Nordmark@Sun.COM ire_route_recursive_dstonly_v4(ipaddr_t nexthop, uint_t irr_flags,
171411042SErik.Nordmark@Sun.COM uint32_t xmit_hint, ip_stack_t *ipst)
171511042SErik.Nordmark@Sun.COM {
171611042SErik.Nordmark@Sun.COM ire_t *ire;
171711042SErik.Nordmark@Sun.COM ire_t *ire1;
171811042SErik.Nordmark@Sun.COM uint_t generation;
171911042SErik.Nordmark@Sun.COM
172011042SErik.Nordmark@Sun.COM /* ire_ftable_lookup handles round-robin/ECMP */
172111042SErik.Nordmark@Sun.COM ire = ire_ftable_lookup_simple_v4(nexthop, xmit_hint, ipst,
172211042SErik.Nordmark@Sun.COM &generation);
172311042SErik.Nordmark@Sun.COM ASSERT(ire != NULL);
172411042SErik.Nordmark@Sun.COM /*
172511042SErik.Nordmark@Sun.COM * If the IRE has a current cached parent we know that the whole
172611042SErik.Nordmark@Sun.COM * parent chain is current, hence we don't need to discover and
172711042SErik.Nordmark@Sun.COM * build any dependencies by doing a recursive lookup.
172811042SErik.Nordmark@Sun.COM */
172911042SErik.Nordmark@Sun.COM mutex_enter(&ire->ire_lock);
1730*12985SSowmini.Varadhan@oracle.COM if (ire->ire_dep_parent != NULL) {
1731*12985SSowmini.Varadhan@oracle.COM if (ire->ire_dep_parent->ire_generation ==
1732*12985SSowmini.Varadhan@oracle.COM ire->ire_dep_parent_generation) {
1733*12985SSowmini.Varadhan@oracle.COM mutex_exit(&ire->ire_lock);
1734*12985SSowmini.Varadhan@oracle.COM return (ire);
1735*12985SSowmini.Varadhan@oracle.COM }
1736*12985SSowmini.Varadhan@oracle.COM mutex_exit(&ire->ire_lock);
1737*12985SSowmini.Varadhan@oracle.COM } else {
173811042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock);
1739*12985SSowmini.Varadhan@oracle.COM /*
1740*12985SSowmini.Varadhan@oracle.COM * If this type should have an ire_nce_cache (even if it
1741*12985SSowmini.Varadhan@oracle.COM * doesn't yet have one) then we are done. Includes
1742*12985SSowmini.Varadhan@oracle.COM * IRE_INTERFACE with a full 32 bit mask.
1743*12985SSowmini.Varadhan@oracle.COM */
1744*12985SSowmini.Varadhan@oracle.COM if (ire->ire_nce_capable)
1745*12985SSowmini.Varadhan@oracle.COM return (ire);
174611042SErik.Nordmark@Sun.COM }
174711042SErik.Nordmark@Sun.COM
174811042SErik.Nordmark@Sun.COM /*
174911042SErik.Nordmark@Sun.COM * Fallback to loop in the normal code starting with the ire
175011042SErik.Nordmark@Sun.COM * we found. Normally this would return the same ire.
175111042SErik.Nordmark@Sun.COM */
175211042SErik.Nordmark@Sun.COM ire1 = ire_route_recursive_impl_v4(ire, nexthop, 0, NULL, ALL_ZONES,
175311457SErik.Nordmark@Sun.COM NULL, MATCH_IRE_DSTONLY, irr_flags, xmit_hint, ipst, NULL, NULL,
175411042SErik.Nordmark@Sun.COM &generation);
175511042SErik.Nordmark@Sun.COM ire_refrele(ire);
175611042SErik.Nordmark@Sun.COM return (ire1);
175711042SErik.Nordmark@Sun.COM }
1758*12985SSowmini.Varadhan@oracle.COM
1759*12985SSowmini.Varadhan@oracle.COM /*
1760*12985SSowmini.Varadhan@oracle.COM * Verify that the generation numbers in the chain leading to an IRE_IF_CLONE
1761*12985SSowmini.Varadhan@oracle.COM * are consistent. Return FALSE (and delete the IRE_IF_CLONE) if they
1762*12985SSowmini.Varadhan@oracle.COM * are not consistent, and TRUE otherwise.
1763*12985SSowmini.Varadhan@oracle.COM */
1764*12985SSowmini.Varadhan@oracle.COM boolean_t
ire_clone_verify(ire_t * ire)1765*12985SSowmini.Varadhan@oracle.COM ire_clone_verify(ire_t *ire)
1766*12985SSowmini.Varadhan@oracle.COM {
1767*12985SSowmini.Varadhan@oracle.COM ASSERT((ire->ire_type & IRE_IF_CLONE) != 0);
1768*12985SSowmini.Varadhan@oracle.COM mutex_enter(&ire->ire_lock);
1769*12985SSowmini.Varadhan@oracle.COM if (ire->ire_dep_parent != NULL &&
1770*12985SSowmini.Varadhan@oracle.COM ire->ire_dep_parent->ire_generation !=
1771*12985SSowmini.Varadhan@oracle.COM ire->ire_dep_parent_generation) {
1772*12985SSowmini.Varadhan@oracle.COM mutex_exit(&ire->ire_lock);
1773*12985SSowmini.Varadhan@oracle.COM ire_delete(ire);
1774*12985SSowmini.Varadhan@oracle.COM return (B_FALSE);
1775*12985SSowmini.Varadhan@oracle.COM }
1776*12985SSowmini.Varadhan@oracle.COM mutex_exit(&ire->ire_lock);
1777*12985SSowmini.Varadhan@oracle.COM return (B_TRUE);
1778*12985SSowmini.Varadhan@oracle.COM }
1779