xref: /onnv-gate/usr/src/uts/common/inet/ip/ip_ftable.c (revision 11681:fe992d6ccc26)
12535Ssangeeta /*
22535Ssangeeta  * CDDL HEADER START
32535Ssangeeta  *
42535Ssangeeta  * The contents of this file are subject to the terms of the
52535Ssangeeta  * Common Development and Distribution License (the "License").
62535Ssangeeta  * You may not use this file except in compliance with the License.
72535Ssangeeta  *
82535Ssangeeta  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
92535Ssangeeta  * or http://www.opensolaris.org/os/licensing.
102535Ssangeeta  * See the License for the specific language governing permissions
112535Ssangeeta  * and limitations under the License.
122535Ssangeeta  *
132535Ssangeeta  * When distributing Covered Code, include this CDDL HEADER in each
142535Ssangeeta  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
152535Ssangeeta  * If applicable, add the following below this CDDL HEADER, with the
162535Ssangeeta  * fields enclosed by brackets "[]" replaced with your own identifying
172535Ssangeeta  * information: Portions Copyright [yyyy] [name of copyright owner]
182535Ssangeeta  *
192535Ssangeeta  * CDDL HEADER END
202535Ssangeeta  */
212535Ssangeeta /*
2211457SErik.Nordmark@Sun.COM  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
232535Ssangeeta  * Use is subject to license terms.
242535Ssangeeta  */
252535Ssangeeta 
262535Ssangeeta /*
272535Ssangeeta  * This file contains consumer routines of the IPv4 forwarding engine
282535Ssangeeta  */
292535Ssangeeta 
302535Ssangeeta #include <sys/types.h>
312535Ssangeeta #include <sys/stream.h>
322535Ssangeeta #include <sys/stropts.h>
332535Ssangeeta #include <sys/strlog.h>
342535Ssangeeta #include <sys/dlpi.h>
352535Ssangeeta #include <sys/ddi.h>
362535Ssangeeta #include <sys/cmn_err.h>
372535Ssangeeta #include <sys/policy.h>
382535Ssangeeta 
392535Ssangeeta #include <sys/systm.h>
402535Ssangeeta #include <sys/strsun.h>
412535Ssangeeta #include <sys/kmem.h>
422535Ssangeeta #include <sys/param.h>
432535Ssangeeta #include <sys/socket.h>
444482Sdr146992 #include <sys/strsubr.h>
452535Ssangeeta #include <net/if.h>
462535Ssangeeta #include <net/route.h>
472535Ssangeeta #include <netinet/in.h>
482535Ssangeeta #include <net/if_dl.h>
492535Ssangeeta #include <netinet/ip6.h>
502535Ssangeeta #include <netinet/icmp6.h>
512535Ssangeeta 
5211042SErik.Nordmark@Sun.COM #include <inet/ipsec_impl.h>
532535Ssangeeta #include <inet/common.h>
542535Ssangeeta #include <inet/mi.h>
552535Ssangeeta #include <inet/mib2.h>
562535Ssangeeta #include <inet/ip.h>
574482Sdr146992 #include <inet/ip_impl.h>
582535Ssangeeta #include <inet/ip6.h>
592535Ssangeeta #include <inet/ip_ndp.h>
602535Ssangeeta #include <inet/arp.h>
612535Ssangeeta #include <inet/ip_if.h>
622535Ssangeeta #include <inet/ip_ire.h>
632535Ssangeeta #include <inet/ip_ftable.h>
642535Ssangeeta #include <inet/ip_rts.h>
652535Ssangeeta #include <inet/nd.h>
662535Ssangeeta 
672535Ssangeeta #include <net/pfkeyv2.h>
682535Ssangeeta #include <inet/sadb.h>
692535Ssangeeta #include <inet/tcp.h>
702535Ssangeeta #include <inet/ipclassifier.h>
712535Ssangeeta #include <sys/zone.h>
722535Ssangeeta #include <net/radix.h>
732535Ssangeeta #include <sys/tsol/label.h>
742535Ssangeeta #include <sys/tsol/tnet.h>
752535Ssangeeta 
/*
 * True when the ire is treated as a default route: either its type has
 * IRE_DEFAULT set, or it is an IRE_INTERFACE whose ire_addr is 0.
 */
#define	IS_DEFAULT_ROUTE(ire)	\
	(((ire)->ire_type & IRE_DEFAULT) || \
	    (((ire)->ire_type & IRE_INTERFACE) && ((ire)->ire_addr == 0)))

/*
 * Pick the strict source multihoming setting out of the IP stack instance:
 * the IPv6 field when isv6 is non-zero, the IPv4 field otherwise.
 * Both arguments are parenthesized so that callers may pass arbitrary
 * expressions (e.g. a conditional expression or &stack) safely.
 */
#define	IP_SRC_MULTIHOMING(isv6, ipst)				\
	((isv6) ? (ipst)->ips_ipv6_strict_src_multihoming :	\
	(ipst)->ips_ip_strict_src_multihoming)
83*11681SSowmini.Varadhan@Sun.COM 
843448Sdh155122 static ire_t	*route_to_dst(const struct sockaddr *, zoneid_t, ip_stack_t *);
8511042SErik.Nordmark@Sun.COM static void	ire_del_host_redir(ire_t *, char *);
8611042SErik.Nordmark@Sun.COM static boolean_t ire_find_best_route(struct radix_node *, void *);
872535Ssangeeta 
882535Ssangeeta /*
892535Ssangeeta  * Lookup a route in forwarding table. A specific lookup is indicated by
902535Ssangeeta  * passing the required parameters and indicating the match required in the
912535Ssangeeta  * flag field.
922535Ssangeeta  *
932535Ssangeeta  * Supports IP_BOUND_IF by following the ipif/ill when recursing.
942535Ssangeeta  */
952535Ssangeeta ire_t *
9611042SErik.Nordmark@Sun.COM ire_ftable_lookup_v4(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
9711042SErik.Nordmark@Sun.COM     int type, const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl,
9811042SErik.Nordmark@Sun.COM     int flags, uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp)
992535Ssangeeta {
10011042SErik.Nordmark@Sun.COM 	ire_t *ire;
1012535Ssangeeta 	struct rt_sockaddr rdst, rmask;
1022535Ssangeeta 	struct rt_entry *rt;
1032535Ssangeeta 	ire_ftable_args_t margs;
1042535Ssangeeta 
10511042SErik.Nordmark@Sun.COM 	ASSERT(ill == NULL || !ill->ill_isv6);
1062535Ssangeeta 
1072535Ssangeeta 	/*
10811042SErik.Nordmark@Sun.COM 	 * ire_match_args() will dereference ill if MATCH_IRE_ILL
10911042SErik.Nordmark@Sun.COM 	 * is set.
1102535Ssangeeta 	 */
111*11681SSowmini.Varadhan@Sun.COM 	if ((flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL)) && (ill == NULL))
1122535Ssangeeta 		return (NULL);
1132535Ssangeeta 
11411131SErik.Nordmark@Sun.COM 	bzero(&rdst, sizeof (rdst));
1152535Ssangeeta 	rdst.rt_sin_len = sizeof (rdst);
1162535Ssangeeta 	rdst.rt_sin_family = AF_INET;
1172535Ssangeeta 	rdst.rt_sin_addr.s_addr = addr;
1182535Ssangeeta 
11911131SErik.Nordmark@Sun.COM 	bzero(&rmask, sizeof (rmask));
1202535Ssangeeta 	rmask.rt_sin_len = sizeof (rmask);
1212535Ssangeeta 	rmask.rt_sin_family = AF_INET;
1222535Ssangeeta 	rmask.rt_sin_addr.s_addr = mask;
1232535Ssangeeta 
12411131SErik.Nordmark@Sun.COM 	bzero(&margs, sizeof (margs));
1252535Ssangeeta 	margs.ift_addr = addr;
1262535Ssangeeta 	margs.ift_mask = mask;
1272535Ssangeeta 	margs.ift_gateway = gateway;
1282535Ssangeeta 	margs.ift_type = type;
12911042SErik.Nordmark@Sun.COM 	margs.ift_ill = ill;
1302535Ssangeeta 	margs.ift_zoneid = zoneid;
1312535Ssangeeta 	margs.ift_tsl = tsl;
1322535Ssangeeta 	margs.ift_flags = flags;
1332535Ssangeeta 
1342535Ssangeeta 	/*
1352535Ssangeeta 	 * The flags argument passed to ire_ftable_lookup may cause the
1362535Ssangeeta 	 * search to return, not the longest matching prefix, but the
1372535Ssangeeta 	 * "best matching prefix", i.e., the longest prefix that also
1382535Ssangeeta 	 * satisfies constraints imposed via the permutation of flags
1392535Ssangeeta 	 * passed in. To achieve this, we invoke ire_match_args() on
1402535Ssangeeta 	 * each matching leaf in the  radix tree. ire_match_args is
1412535Ssangeeta 	 * invoked by the callback function ire_find_best_route()
1422535Ssangeeta 	 * We hold the global tree lock in read mode when calling
14311042SErik.Nordmark@Sun.COM 	 * rn_match_args. Before dropping the global tree lock, ensure
1442535Ssangeeta 	 * that the radix node can't be deleted by incrementing ire_refcnt.
1452535Ssangeeta 	 */
1463448Sdh155122 	RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
1473448Sdh155122 	rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
1483448Sdh155122 	    ipst->ips_ip_ftable, ire_find_best_route, &margs);
1492535Ssangeeta 	ire = margs.ift_best_ire;
1502535Ssangeeta 	if (rt == NULL) {
15111042SErik.Nordmark@Sun.COM 		RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
1522535Ssangeeta 		return (NULL);
1532535Ssangeeta 	}
15411042SErik.Nordmark@Sun.COM 	ASSERT(ire != NULL);
1552535Ssangeeta 
1562535Ssangeeta 	DTRACE_PROBE2(ire__found, ire_ftable_args_t *, &margs, ire_t *, ire);
1572535Ssangeeta 
1582535Ssangeeta 	/*
1592535Ssangeeta 	 * round-robin only if we have more than one route in the bucket.
16011042SErik.Nordmark@Sun.COM 	 * ips_ip_ecmp_behavior controls when we do ECMP
16111042SErik.Nordmark@Sun.COM 	 *	2:	always
16211042SErik.Nordmark@Sun.COM 	 *	1:	for IRE_DEFAULT and /0 IRE_INTERFACE
16311042SErik.Nordmark@Sun.COM 	 *	0:	never
1642535Ssangeeta 	 */
16511042SErik.Nordmark@Sun.COM 	if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) {
16611042SErik.Nordmark@Sun.COM 		if (ipst->ips_ip_ecmp_behavior == 2 ||
16711042SErik.Nordmark@Sun.COM 		    (ipst->ips_ip_ecmp_behavior == 1 &&
16811042SErik.Nordmark@Sun.COM 		    IS_DEFAULT_ROUTE(ire))) {
16911042SErik.Nordmark@Sun.COM 			ire_t	*next_ire;
1702535Ssangeeta 
17111042SErik.Nordmark@Sun.COM 			margs.ift_best_ire = NULL;
17211042SErik.Nordmark@Sun.COM 			next_ire = ire_round_robin(ire->ire_bucket, &margs,
17311042SErik.Nordmark@Sun.COM 			    xmit_hint, ire, ipst);
17411042SErik.Nordmark@Sun.COM 			if (next_ire == NULL) {
17511042SErik.Nordmark@Sun.COM 				/* keep ire if next_ire is null */
17611042SErik.Nordmark@Sun.COM 				goto done;
17711042SErik.Nordmark@Sun.COM 			}
17811042SErik.Nordmark@Sun.COM 			ire_refrele(ire);
1792535Ssangeeta 			ire = next_ire;
1802535Ssangeeta 		}
1812535Ssangeeta 	}
1822535Ssangeeta 
18311042SErik.Nordmark@Sun.COM done:
18411042SErik.Nordmark@Sun.COM 	/* Return generation before dropping lock */
18511042SErik.Nordmark@Sun.COM 	if (generationp != NULL)
18611042SErik.Nordmark@Sun.COM 		*generationp = ire->ire_generation;
1872535Ssangeeta 
18811042SErik.Nordmark@Sun.COM 	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
1898485SPeter.Memishian@Sun.COM 
19011042SErik.Nordmark@Sun.COM 	/*
19111042SErik.Nordmark@Sun.COM 	 * For shared-IP zones we need additional checks to what was
19211042SErik.Nordmark@Sun.COM 	 * done in ire_match_args to make sure IRE_LOCALs are handled.
19311042SErik.Nordmark@Sun.COM 	 *
19411042SErik.Nordmark@Sun.COM 	 * When ip_restrict_interzone_loopback is set, then
19511042SErik.Nordmark@Sun.COM 	 * we ensure that IRE_LOCAL are only used for loopback
19611042SErik.Nordmark@Sun.COM 	 * between zones when the logical "Ethernet" would
19711042SErik.Nordmark@Sun.COM 	 * have looped them back. That is, if in the absense of
19811042SErik.Nordmark@Sun.COM 	 * the IRE_LOCAL we would have sent to packet out the
19911042SErik.Nordmark@Sun.COM 	 * same ill.
20011042SErik.Nordmark@Sun.COM 	 */
20111042SErik.Nordmark@Sun.COM 	if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES &&
20211042SErik.Nordmark@Sun.COM 	    ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES &&
20311042SErik.Nordmark@Sun.COM 	    ipst->ips_ip_restrict_interzone_loopback) {
20411042SErik.Nordmark@Sun.COM 		ire = ire_alt_local(ire, zoneid, tsl, ill, generationp);
20511042SErik.Nordmark@Sun.COM 		ASSERT(ire != NULL);
2062535Ssangeeta 	}
2072535Ssangeeta 	return (ire);
2082535Ssangeeta }
2092535Ssangeeta 
2108275SEric Cheng /*
2118275SEric Cheng  * This function is called by
21211042SErik.Nordmark@Sun.COM  * ip_input/ire_route_recursive when doing a route lookup on only the
21311042SErik.Nordmark@Sun.COM  * destination address.
21411042SErik.Nordmark@Sun.COM  *
2158275SEric Cheng  * The optimizations of this function over ire_ftable_lookup are:
2168275SEric Cheng  *	o removing unnecessary flag matching
2178275SEric Cheng  *	o doing longest prefix match instead of overloading it further
2188275SEric Cheng  *	  with the unnecessary "best_prefix_match"
21911042SErik.Nordmark@Sun.COM  *
22011042SErik.Nordmark@Sun.COM  * If no route is found we return IRE_NOROUTE.
2218275SEric Cheng  */
22211042SErik.Nordmark@Sun.COM ire_t *
22311042SErik.Nordmark@Sun.COM ire_ftable_lookup_simple_v4(ipaddr_t addr, uint32_t xmit_hint, ip_stack_t *ipst,
22411042SErik.Nordmark@Sun.COM     uint_t *generationp)
2258275SEric Cheng {
22611042SErik.Nordmark@Sun.COM 	ire_t *ire;
2278275SEric Cheng 	struct rt_sockaddr rdst;
2288275SEric Cheng 	struct rt_entry *rt;
22911042SErik.Nordmark@Sun.COM 	irb_t *irb;
2308275SEric Cheng 
2318275SEric Cheng 	rdst.rt_sin_len = sizeof (rdst);
2328275SEric Cheng 	rdst.rt_sin_family = AF_INET;
2338275SEric Cheng 	rdst.rt_sin_addr.s_addr = addr;
2348275SEric Cheng 
2358275SEric Cheng 	/*
2368275SEric Cheng 	 * This is basically inlining  a simpler version of ire_match_args
2378275SEric Cheng 	 */
2388275SEric Cheng 	RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
2398275SEric Cheng 
2408275SEric Cheng 	rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
2418275SEric Cheng 	    ipst->ips_ip_ftable, NULL, NULL);
2428275SEric Cheng 
24311042SErik.Nordmark@Sun.COM 	if (rt == NULL)
24411042SErik.Nordmark@Sun.COM 		goto bad;
24511042SErik.Nordmark@Sun.COM 
24611042SErik.Nordmark@Sun.COM 	irb = &rt->rt_irb;
24711042SErik.Nordmark@Sun.COM 	if (irb->irb_ire_cnt == 0)
24811042SErik.Nordmark@Sun.COM 		goto bad;
24911042SErik.Nordmark@Sun.COM 
25011042SErik.Nordmark@Sun.COM 	rw_enter(&irb->irb_lock, RW_READER);
25111042SErik.Nordmark@Sun.COM 	ire = irb->irb_ire;
25211042SErik.Nordmark@Sun.COM 	if (ire == NULL) {
25311042SErik.Nordmark@Sun.COM 		rw_exit(&irb->irb_lock);
25411042SErik.Nordmark@Sun.COM 		goto bad;
2558275SEric Cheng 	}
25611042SErik.Nordmark@Sun.COM 	while (IRE_IS_CONDEMNED(ire)) {
25711042SErik.Nordmark@Sun.COM 		ire = ire->ire_next;
25811042SErik.Nordmark@Sun.COM 		if (ire == NULL) {
25911042SErik.Nordmark@Sun.COM 			rw_exit(&irb->irb_lock);
26011042SErik.Nordmark@Sun.COM 			goto bad;
26111042SErik.Nordmark@Sun.COM 		}
2628275SEric Cheng 	}
2638275SEric Cheng 
2648275SEric Cheng 	/* we have a ire that matches */
26511042SErik.Nordmark@Sun.COM 	ire_refhold(ire);
26611042SErik.Nordmark@Sun.COM 	rw_exit(&irb->irb_lock);
2678275SEric Cheng 
2688275SEric Cheng 	/*
26911042SErik.Nordmark@Sun.COM 	 * round-robin only if we have more than one route in the bucket.
27011042SErik.Nordmark@Sun.COM 	 * ips_ip_ecmp_behavior controls when we do ECMP
27111042SErik.Nordmark@Sun.COM 	 *	2:	always
27211042SErik.Nordmark@Sun.COM 	 *	1:	for IRE_DEFAULT and /0 IRE_INTERFACE
27311042SErik.Nordmark@Sun.COM 	 *	0:	never
2748275SEric Cheng 	 *
27511042SErik.Nordmark@Sun.COM 	 * Note: if we found an IRE_IF_CLONE we won't look at the bucket with
27611042SErik.Nordmark@Sun.COM 	 * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match
27711042SErik.Nordmark@Sun.COM 	 * and the IRE_INTERFACESs are likely to be shorter matches.
2788275SEric Cheng 	 */
27911042SErik.Nordmark@Sun.COM 	if (ire->ire_bucket->irb_ire_cnt > 1) {
28011042SErik.Nordmark@Sun.COM 		if (ipst->ips_ip_ecmp_behavior == 2 ||
28111042SErik.Nordmark@Sun.COM 		    (ipst->ips_ip_ecmp_behavior == 1 &&
28211042SErik.Nordmark@Sun.COM 		    IS_DEFAULT_ROUTE(ire))) {
28311042SErik.Nordmark@Sun.COM 			ire_t	*next_ire;
28411042SErik.Nordmark@Sun.COM 			ire_ftable_args_t margs;
2858275SEric Cheng 
28611131SErik.Nordmark@Sun.COM 			bzero(&margs, sizeof (margs));
28711042SErik.Nordmark@Sun.COM 			margs.ift_addr = addr;
28811042SErik.Nordmark@Sun.COM 			margs.ift_zoneid = ALL_ZONES;
28911042SErik.Nordmark@Sun.COM 
29011042SErik.Nordmark@Sun.COM 			next_ire = ire_round_robin(ire->ire_bucket, &margs,
29111042SErik.Nordmark@Sun.COM 			    xmit_hint, ire, ipst);
29211042SErik.Nordmark@Sun.COM 			if (next_ire == NULL) {
29311042SErik.Nordmark@Sun.COM 				/* keep ire if next_ire is null */
29411042SErik.Nordmark@Sun.COM 				if (generationp != NULL)
29511042SErik.Nordmark@Sun.COM 					*generationp = ire->ire_generation;
29611042SErik.Nordmark@Sun.COM 				RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
29711042SErik.Nordmark@Sun.COM 				return (ire);
29811042SErik.Nordmark@Sun.COM 			}
29911042SErik.Nordmark@Sun.COM 			ire_refrele(ire);
30011042SErik.Nordmark@Sun.COM 			ire = next_ire;
3018275SEric Cheng 		}
3028275SEric Cheng 	}
30311042SErik.Nordmark@Sun.COM 	/* Return generation before dropping lock */
30411042SErik.Nordmark@Sun.COM 	if (generationp != NULL)
30511042SErik.Nordmark@Sun.COM 		*generationp = ire->ire_generation;
30611042SErik.Nordmark@Sun.COM 
30711042SErik.Nordmark@Sun.COM 	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
3088275SEric Cheng 
30911042SErik.Nordmark@Sun.COM 	/*
31011042SErik.Nordmark@Sun.COM 	 * Since we only did ALL_ZONES matches there is no special handling
31111042SErik.Nordmark@Sun.COM 	 * of IRE_LOCALs needed here. ire_ftable_lookup_v4 has to handle that.
31211042SErik.Nordmark@Sun.COM 	 */
3138275SEric Cheng 	return (ire);
31411042SErik.Nordmark@Sun.COM 
31511042SErik.Nordmark@Sun.COM bad:
31611042SErik.Nordmark@Sun.COM 	if (generationp != NULL)
31711042SErik.Nordmark@Sun.COM 		*generationp = IRE_GENERATION_VERIFY;
31811042SErik.Nordmark@Sun.COM 
31911042SErik.Nordmark@Sun.COM 	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
32011042SErik.Nordmark@Sun.COM 	return (ire_reject(ipst, B_FALSE));
3218275SEric Cheng }
3222535Ssangeeta 
3232535Ssangeeta /*
32411042SErik.Nordmark@Sun.COM  * Find the ill matching a multicast group.
3252535Ssangeeta  * Allows different routes for multicast addresses
3262535Ssangeeta  * in the unicast routing table (akin to 224.0.0.0 but could be more specific)
3272535Ssangeeta  * which point at different interfaces. This is used when IP_MULTICAST_IF
3282535Ssangeeta  * isn't specified (when sending) and when IP_ADD_MEMBERSHIP doesn't
3292535Ssangeeta  * specify the interface to join on.
3302535Ssangeeta  *
33111042SErik.Nordmark@Sun.COM  * Supports link-local addresses by using ire_route_recursive which follows
33211042SErik.Nordmark@Sun.COM  * the ill when recursing.
33311042SErik.Nordmark@Sun.COM  *
33411042SErik.Nordmark@Sun.COM  * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group
33511042SErik.Nordmark@Sun.COM  * and the MULTIRT property can be different for different groups, we
33611042SErik.Nordmark@Sun.COM  * extract RTF_MULTIRT from the special unicast route added for a group
33711042SErik.Nordmark@Sun.COM  * with CGTP and pass that back in the multirtp argument.
33811042SErik.Nordmark@Sun.COM  * This is used in ip_set_destination etc to set ixa_postfragfn for multicast.
33911042SErik.Nordmark@Sun.COM  * We have a setsrcp argument for the same reason.
3402535Ssangeeta  */
34111042SErik.Nordmark@Sun.COM ill_t *
34211042SErik.Nordmark@Sun.COM ire_lookup_multi_ill_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst,
34311042SErik.Nordmark@Sun.COM     boolean_t *multirtp, ipaddr_t *setsrcp)
3442535Ssangeeta {
3452535Ssangeeta 	ire_t	*ire;
34611042SErik.Nordmark@Sun.COM 	ill_t	*ill;
3472535Ssangeeta 
34811042SErik.Nordmark@Sun.COM 	ire = ire_route_recursive_v4(group, 0, NULL, zoneid, NULL,
34911457SErik.Nordmark@Sun.COM 	    MATCH_IRE_DSTONLY, IRR_NONE, 0, ipst, setsrcp, NULL, NULL);
35011042SErik.Nordmark@Sun.COM 	ASSERT(ire != NULL);
35111042SErik.Nordmark@Sun.COM 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
3522535Ssangeeta 		ire_refrele(ire);
3532535Ssangeeta 		return (NULL);
3542535Ssangeeta 	}
35511042SErik.Nordmark@Sun.COM 
35611042SErik.Nordmark@Sun.COM 	if (multirtp != NULL)
35711042SErik.Nordmark@Sun.COM 		*multirtp = (ire->ire_flags & RTF_MULTIRT) != 0;
35811042SErik.Nordmark@Sun.COM 
35911042SErik.Nordmark@Sun.COM 	ill = ire_nexthop_ill(ire);
36011042SErik.Nordmark@Sun.COM 	ire_refrele(ire);
36111042SErik.Nordmark@Sun.COM 	return (ill);
3622535Ssangeeta }
3632535Ssangeeta 
3642535Ssangeeta /*
3652535Ssangeeta  * Delete the passed in ire if the gateway addr matches
3662535Ssangeeta  */
3672535Ssangeeta void
3682535Ssangeeta ire_del_host_redir(ire_t *ire, char *gateway)
3692535Ssangeeta {
3703004Sdd193516 	if ((ire->ire_flags & RTF_DYNAMIC) &&
3712535Ssangeeta 	    (ire->ire_gateway_addr == *(ipaddr_t *)gateway))
3722535Ssangeeta 		ire_delete(ire);
3732535Ssangeeta }
3742535Ssangeeta 
3752535Ssangeeta /*
37611042SErik.Nordmark@Sun.COM  * Search for all IRE_HOST RTF_DYNAMIC (aka redirect) routes that are
3772535Ssangeeta  * pointing at the specified gateway and
3782535Ssangeeta  * delete them. This routine is called only
3792535Ssangeeta  * when a default gateway is going away.
3802535Ssangeeta  */
3812535Ssangeeta void
3823448Sdh155122 ire_delete_host_redirects(ipaddr_t gateway, ip_stack_t *ipst)
3832535Ssangeeta {
3842535Ssangeeta 	struct rtfuncarg rtfarg;
3852535Ssangeeta 
38611131SErik.Nordmark@Sun.COM 	bzero(&rtfarg, sizeof (rtfarg));
3872535Ssangeeta 	rtfarg.rt_func = ire_del_host_redir;
3882535Ssangeeta 	rtfarg.rt_arg = (void *)&gateway;
38911131SErik.Nordmark@Sun.COM 	rtfarg.rt_zoneid = ALL_ZONES;
39011131SErik.Nordmark@Sun.COM 	rtfarg.rt_ipst = ipst;
3913448Sdh155122 	(void) ipst->ips_ip_ftable->rnh_walktree_mt(ipst->ips_ip_ftable,
3923448Sdh155122 	    rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn);
3932535Ssangeeta }
3942535Ssangeeta 
3952535Ssangeeta /*
3963448Sdh155122  * Obtain the rt_entry and rt_irb for the route to be added to
3973448Sdh155122  * the ips_ip_ftable.
3982535Ssangeeta  * First attempt to add a node to the radix tree via rn_addroute. If the
3992535Ssangeeta  * route already exists, return the bucket for the existing route.
4002535Ssangeeta  *
4012535Ssangeeta  * Locking notes: Need to hold the global radix tree lock in write mode to
4022535Ssangeeta  * add a radix node. To prevent the node from being deleted, ire_get_bucket()
4032535Ssangeeta  * returns with a ref'ed irb_t. The ire itself is added in ire_add_v4()
4042535Ssangeeta  * while holding the irb_lock, but not the radix tree lock.
4052535Ssangeeta  */
4062535Ssangeeta irb_t *
4072535Ssangeeta ire_get_bucket(ire_t *ire)
4082535Ssangeeta {
4092535Ssangeeta 	struct radix_node *rn;
4102535Ssangeeta 	struct rt_entry *rt;
4112535Ssangeeta 	struct rt_sockaddr rmask, rdst;
4122535Ssangeeta 	irb_t *irb = NULL;
4133448Sdh155122 	ip_stack_t *ipst = ire->ire_ipst;
4142535Ssangeeta 
4153448Sdh155122 	ASSERT(ipst->ips_ip_ftable != NULL);
4162535Ssangeeta 
4172535Ssangeeta 	/* first try to see if route exists (based on rtalloc1) */
41811131SErik.Nordmark@Sun.COM 	bzero(&rdst, sizeof (rdst));
4192535Ssangeeta 	rdst.rt_sin_len = sizeof (rdst);
4202535Ssangeeta 	rdst.rt_sin_family = AF_INET;
4212535Ssangeeta 	rdst.rt_sin_addr.s_addr = ire->ire_addr;
4222535Ssangeeta 
42311131SErik.Nordmark@Sun.COM 	bzero(&rmask, sizeof (rmask));
4242535Ssangeeta 	rmask.rt_sin_len = sizeof (rmask);
4252535Ssangeeta 	rmask.rt_sin_family = AF_INET;
4262535Ssangeeta 	rmask.rt_sin_addr.s_addr = ire->ire_mask;
4272535Ssangeeta 
4282535Ssangeeta 	/*
4292535Ssangeeta 	 * add the route. based on BSD's rtrequest1(RTM_ADD)
4302535Ssangeeta 	 */
4312535Ssangeeta 	R_Malloc(rt, rt_entry_cache,  sizeof (*rt));
4325090Ssangeeta 	/* kmem_alloc failed */
4335090Ssangeeta 	if (rt == NULL)
4345090Ssangeeta 		return (NULL);
4355090Ssangeeta 
43611131SErik.Nordmark@Sun.COM 	bzero(rt, sizeof (*rt));
4372535Ssangeeta 	rt->rt_nodes->rn_key = (char *)&rt->rt_dst;
4382535Ssangeeta 	rt->rt_dst = rdst;
4392535Ssangeeta 	irb = &rt->rt_irb;
44011042SErik.Nordmark@Sun.COM 	irb->irb_marks |= IRB_MARK_DYNAMIC; /* dynamically allocated/freed */
4413448Sdh155122 	irb->irb_ipst = ipst;
4422535Ssangeeta 	rw_init(&irb->irb_lock, NULL, RW_DEFAULT, NULL);
4433448Sdh155122 	RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable);
4443448Sdh155122 	rn = ipst->ips_ip_ftable->rnh_addaddr(&rt->rt_dst, &rmask,
4453448Sdh155122 	    ipst->ips_ip_ftable, (struct radix_node *)rt);
4462535Ssangeeta 	if (rn == NULL) {
4473448Sdh155122 		RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
4482535Ssangeeta 		Free(rt, rt_entry_cache);
4492535Ssangeeta 		rt = NULL;
4502535Ssangeeta 		irb = NULL;
4513448Sdh155122 		RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
4523448Sdh155122 		rn = ipst->ips_ip_ftable->rnh_lookup(&rdst, &rmask,
4533448Sdh155122 		    ipst->ips_ip_ftable);
4543448Sdh155122 		if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
4552535Ssangeeta 			/* found a non-root match */
4562535Ssangeeta 			rt = (struct rt_entry *)rn;
4572535Ssangeeta 		}
4582535Ssangeeta 	}
4592535Ssangeeta 	if (rt != NULL) {
4602535Ssangeeta 		irb = &rt->rt_irb;
46111042SErik.Nordmark@Sun.COM 		irb_refhold(irb);
4622535Ssangeeta 	}
4633448Sdh155122 	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
4642535Ssangeeta 	return (irb);
4652535Ssangeeta }
4662535Ssangeeta 
4672535Ssangeeta /*
4682535Ssangeeta  * This function is used when the caller wants to know the outbound
4692535Ssangeeta  * interface for a packet given only the address.
4702535Ssangeeta  * If this is a offlink IP address and there are multiple
4712535Ssangeeta  * routes to this destination, this routine will utilise the
4722535Ssangeeta  * first route it finds to IP address
4732535Ssangeeta  * Return values:
4742535Ssangeeta  * 	0	- FAILURE
4752535Ssangeeta  *	nonzero	- ifindex
4762535Ssangeeta  */
4772535Ssangeeta uint_t
4782535Ssangeeta ifindex_lookup(const struct sockaddr *ipaddr, zoneid_t zoneid)
4792535Ssangeeta {
4802535Ssangeeta 	uint_t ifindex = 0;
4812535Ssangeeta 	ire_t *ire;
4822535Ssangeeta 	ill_t *ill;
4833448Sdh155122 	netstack_t *ns;
4843448Sdh155122 	ip_stack_t *ipst;
4852535Ssangeeta 
4863448Sdh155122 	if (zoneid == ALL_ZONES)
4873448Sdh155122 		ns = netstack_find_by_zoneid(GLOBAL_ZONEID);
4883448Sdh155122 	else
4893448Sdh155122 		ns = netstack_find_by_zoneid(zoneid);
4903448Sdh155122 	ASSERT(ns != NULL);
4913448Sdh155122 
4923448Sdh155122 	/*
4933448Sdh155122 	 * For exclusive stacks we set the zoneid to zero
4943448Sdh155122 	 * since IP uses the global zoneid in the exclusive stacks.
4953448Sdh155122 	 */
4963448Sdh155122 	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
4973448Sdh155122 		zoneid = GLOBAL_ZONEID;
4983448Sdh155122 	ipst = ns->netstack_ip;
4992535Ssangeeta 
5002535Ssangeeta 	ASSERT(ipaddr->sa_family == AF_INET || ipaddr->sa_family == AF_INET6);
5012535Ssangeeta 
50211042SErik.Nordmark@Sun.COM 	if ((ire = route_to_dst(ipaddr, zoneid, ipst)) != NULL) {
50311042SErik.Nordmark@Sun.COM 		ill = ire_nexthop_ill(ire);
50411042SErik.Nordmark@Sun.COM 		if (ill != NULL) {
5052535Ssangeeta 			ifindex = ill->ill_phyint->phyint_ifindex;
50611042SErik.Nordmark@Sun.COM 			ill_refrele(ill);
50711042SErik.Nordmark@Sun.COM 		}
5082535Ssangeeta 		ire_refrele(ire);
5092535Ssangeeta 	}
5103448Sdh155122 	netstack_rele(ns);
5112535Ssangeeta 	return (ifindex);
5122535Ssangeeta }
5132535Ssangeeta 
5142535Ssangeeta /*
5152535Ssangeeta  * Routine to find the route to a destination. If a ifindex is supplied
51611042SErik.Nordmark@Sun.COM  * it tries to match the route to the corresponding ipif for the ifindex
5172535Ssangeeta  */
5182535Ssangeeta static	ire_t *
5193448Sdh155122 route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst)
5202535Ssangeeta {
5212535Ssangeeta 	ire_t *ire = NULL;
5222535Ssangeeta 	int match_flags;
5232535Ssangeeta 
52411042SErik.Nordmark@Sun.COM 	match_flags = MATCH_IRE_DSTONLY;
5252535Ssangeeta 
5262535Ssangeeta 	/* XXX pass NULL tsl for now */
5272535Ssangeeta 
5282535Ssangeeta 	if (dst_addr->sa_family == AF_INET) {
52911042SErik.Nordmark@Sun.COM 		ire = ire_route_recursive_v4(
53011042SErik.Nordmark@Sun.COM 		    ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr, 0, NULL,
53111457SErik.Nordmark@Sun.COM 		    zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL,
53211457SErik.Nordmark@Sun.COM 		    NULL, NULL);
5332535Ssangeeta 	} else {
53411042SErik.Nordmark@Sun.COM 		ire = ire_route_recursive_v6(
53511042SErik.Nordmark@Sun.COM 		    &((struct sockaddr_in6 *)dst_addr)->sin6_addr, 0, NULL,
53611457SErik.Nordmark@Sun.COM 		    zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL,
53711457SErik.Nordmark@Sun.COM 		    NULL, NULL);
53811042SErik.Nordmark@Sun.COM 	}
53911042SErik.Nordmark@Sun.COM 	ASSERT(ire != NULL);
54011042SErik.Nordmark@Sun.COM 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
54111042SErik.Nordmark@Sun.COM 		ire_refrele(ire);
54211042SErik.Nordmark@Sun.COM 		return (NULL);
5432535Ssangeeta 	}
5442535Ssangeeta 	return (ire);
5452535Ssangeeta }
5462535Ssangeeta 
5472535Ssangeeta /*
5482535Ssangeeta  * This routine is called by IP Filter to send a packet out on the wire
54911042SErik.Nordmark@Sun.COM  * to a specified dstination (which may be onlink or offlink). The ifindex may
55011042SErik.Nordmark@Sun.COM  * or may not be 0. A non-null ifindex indicates IP Filter has stipulated
5512535Ssangeeta  * an outgoing interface and requires the nexthop to be on that interface.
5524482Sdr146992  * IP WILL NOT DO the following to the data packet before sending it out:
5532535Ssangeeta  *	a. manipulate ttl
5544482Sdr146992  *	b. ipsec work
5554482Sdr146992  *	c. fragmentation
5564482Sdr146992  *
5574482Sdr146992  * If the packet has been prepared for hardware checksum then it will be
5584482Sdr146992  * passed off to ip_send_align_cksum() to check that the flags set on the
5594482Sdr146992  * packet are in alignment with the capabilities of the new outgoing NIC.
5602535Ssangeeta  *
5612535Ssangeeta  * Return values:
5622535Ssangeeta  *	0:		IP was able to send of the data pkt
5632535Ssangeeta  *	ECOMM:		Could not send packet
5642535Ssangeeta  *	ENONET		No route to dst. It is up to the caller
5652535Ssangeeta  *			to send icmp unreachable error message,
5662535Ssangeeta  *	EINPROGRESS	The macaddr of the onlink dst or that
5672535Ssangeeta  *			of the offlink dst's nexthop needs to get
5682535Ssangeeta  *			resolved before packet can be sent to dst.
5692535Ssangeeta  *			Thus transmission is not guaranteed.
57011042SErik.Nordmark@Sun.COM  *			Note: No longer have visibility to the ARP queue
57111042SErik.Nordmark@Sun.COM  *			hence no EINPROGRESS.
5722535Ssangeeta  */
5732535Ssangeeta int
5742535Ssangeeta ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex,
5752535Ssangeeta     zoneid_t zoneid)
5762535Ssangeeta {
57711042SErik.Nordmark@Sun.COM 	ipaddr_t nexthop;
5783448Sdh155122 	netstack_t *ns;
5793448Sdh155122 	ip_stack_t *ipst;
58011042SErik.Nordmark@Sun.COM 	ip_xmit_attr_t ixas;
58111042SErik.Nordmark@Sun.COM 	int error;
5822535Ssangeeta 
5832535Ssangeeta 	ASSERT(mp != NULL);
5842535Ssangeeta 
5853448Sdh155122 	if (zoneid == ALL_ZONES)
5863448Sdh155122 		ns = netstack_find_by_zoneid(GLOBAL_ZONEID);
5873448Sdh155122 	else
5883448Sdh155122 		ns = netstack_find_by_zoneid(zoneid);
5893448Sdh155122 	ASSERT(ns != NULL);
5903448Sdh155122 
5913448Sdh155122 	/*
5923448Sdh155122 	 * For exclusive stacks we set the zoneid to zero
5933448Sdh155122 	 * since IP uses the global zoneid in the exclusive stacks.
5943448Sdh155122 	 */
5953448Sdh155122 	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
5963448Sdh155122 		zoneid = GLOBAL_ZONEID;
5973448Sdh155122 	ipst = ns->netstack_ip;
5983448Sdh155122 
5992535Ssangeeta 	ASSERT(dst_addr->sa_family == AF_INET ||
6002535Ssangeeta 	    dst_addr->sa_family == AF_INET6);
6012535Ssangeeta 
60211042SErik.Nordmark@Sun.COM 	bzero(&ixas, sizeof (ixas));
6032535Ssangeeta 	/*
60411042SErik.Nordmark@Sun.COM 	 * No IPsec, no fragmentation, and don't let any hooks see
60511042SErik.Nordmark@Sun.COM 	 * the packet.
6062535Ssangeeta 	 */
60711042SErik.Nordmark@Sun.COM 	ixas.ixa_flags = IXAF_NO_IPSEC | IXAF_DONTFRAG | IXAF_NO_PFHOOK;
60811042SErik.Nordmark@Sun.COM 	ixas.ixa_cred = kcred;
60911042SErik.Nordmark@Sun.COM 	ixas.ixa_cpid = NOPID;
61011042SErik.Nordmark@Sun.COM 	ixas.ixa_tsl = NULL;
61111042SErik.Nordmark@Sun.COM 	ixas.ixa_ipst = ipst;
61211042SErik.Nordmark@Sun.COM 	ixas.ixa_ifindex = ifindex;
6132535Ssangeeta 
61411042SErik.Nordmark@Sun.COM 	if (dst_addr->sa_family == AF_INET) {
61511042SErik.Nordmark@Sun.COM 		ipha_t *ipha = (ipha_t *)mp->b_rptr;
6164482Sdr146992 
61711042SErik.Nordmark@Sun.COM 		ixas.ixa_flags |= IXAF_IS_IPV4;
61811042SErik.Nordmark@Sun.COM 		nexthop = ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr;
61911042SErik.Nordmark@Sun.COM 		if (nexthop != ipha->ipha_dst) {
62011042SErik.Nordmark@Sun.COM 			ixas.ixa_flags |= IXAF_NEXTHOP_SET;
62111042SErik.Nordmark@Sun.COM 			ixas.ixa_nexthop_v4 = nexthop;
6222535Ssangeeta 		}
62311042SErik.Nordmark@Sun.COM 		ixas.ixa_multicast_ttl = ipha->ipha_ttl;
62411042SErik.Nordmark@Sun.COM 	} else {
62511042SErik.Nordmark@Sun.COM 		ip6_t *ip6h = (ip6_t *)mp->b_rptr;
62611042SErik.Nordmark@Sun.COM 		in6_addr_t *nexthop6;
62711042SErik.Nordmark@Sun.COM 
62811042SErik.Nordmark@Sun.COM 		nexthop6 = &((struct sockaddr_in6 *)dst_addr)->sin6_addr;
62911042SErik.Nordmark@Sun.COM 		if (!IN6_ARE_ADDR_EQUAL(nexthop6, &ip6h->ip6_dst)) {
63011042SErik.Nordmark@Sun.COM 			ixas.ixa_flags |= IXAF_NEXTHOP_SET;
63111042SErik.Nordmark@Sun.COM 			ixas.ixa_nexthop_v6 = *nexthop6;
63211042SErik.Nordmark@Sun.COM 		}
63311042SErik.Nordmark@Sun.COM 		ixas.ixa_multicast_ttl = ip6h->ip6_hops;
63411042SErik.Nordmark@Sun.COM 	}
63511042SErik.Nordmark@Sun.COM 	error = ip_output_simple(mp, &ixas);
63611042SErik.Nordmark@Sun.COM 	ixa_cleanup(&ixas);
63711042SErik.Nordmark@Sun.COM 
63811042SErik.Nordmark@Sun.COM 	netstack_rele(ns);
63911042SErik.Nordmark@Sun.COM 	switch (error) {
64011042SErik.Nordmark@Sun.COM 	case 0:
6412535Ssangeeta 		break;
64211042SErik.Nordmark@Sun.COM 
64311042SErik.Nordmark@Sun.COM 	case EHOSTUNREACH:
64411042SErik.Nordmark@Sun.COM 	case ENETUNREACH:
64511042SErik.Nordmark@Sun.COM 		error = ENONET;
64611042SErik.Nordmark@Sun.COM 		break;
64711042SErik.Nordmark@Sun.COM 
64811042SErik.Nordmark@Sun.COM 	default:
64911042SErik.Nordmark@Sun.COM 		error = ECOMM;
6502535Ssangeeta 		break;
6512535Ssangeeta 	}
65211042SErik.Nordmark@Sun.COM 	return (error);
6534482Sdr146992 }
6544482Sdr146992 
6552535Ssangeeta /*
6562535Ssangeeta  * callback function provided by ire_ftable_lookup when calling
6572535Ssangeeta  * rn_match_args(). Invoke ire_match_args on each matching leaf node in
6582535Ssangeeta  * the radix tree.
6592535Ssangeeta  */
6602535Ssangeeta boolean_t
6612535Ssangeeta ire_find_best_route(struct radix_node *rn, void *arg)
6622535Ssangeeta {
6632535Ssangeeta 	struct rt_entry *rt = (struct rt_entry *)rn;
6642535Ssangeeta 	irb_t *irb_ptr;
6652535Ssangeeta 	ire_t *ire;
6662535Ssangeeta 	ire_ftable_args_t *margs = arg;
6672535Ssangeeta 	ipaddr_t match_mask;
6682535Ssangeeta 
6692535Ssangeeta 	ASSERT(rt != NULL);
6702535Ssangeeta 
6712535Ssangeeta 	irb_ptr = &rt->rt_irb;
6722535Ssangeeta 
6732535Ssangeeta 	if (irb_ptr->irb_ire_cnt == 0)
6742535Ssangeeta 		return (B_FALSE);
6752535Ssangeeta 
6762535Ssangeeta 	rw_enter(&irb_ptr->irb_lock, RW_READER);
6772535Ssangeeta 	for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
67811042SErik.Nordmark@Sun.COM 		if (IRE_IS_CONDEMNED(ire))
6792535Ssangeeta 			continue;
680*11681SSowmini.Varadhan@Sun.COM 		ASSERT((margs->ift_flags & MATCH_IRE_SHORTERMASK) == 0);
681*11681SSowmini.Varadhan@Sun.COM 		if (margs->ift_flags & MATCH_IRE_MASK)
6822535Ssangeeta 			match_mask = margs->ift_mask;
6832535Ssangeeta 		else
6842535Ssangeeta 			match_mask = ire->ire_mask;
6852535Ssangeeta 
6862535Ssangeeta 		if (ire_match_args(ire, margs->ift_addr, match_mask,
68711042SErik.Nordmark@Sun.COM 		    margs->ift_gateway, margs->ift_type, margs->ift_ill,
68811042SErik.Nordmark@Sun.COM 		    margs->ift_zoneid, margs->ift_tsl,
68911042SErik.Nordmark@Sun.COM 		    margs->ift_flags)) {
69011042SErik.Nordmark@Sun.COM 			ire_refhold(ire);
6912535Ssangeeta 			rw_exit(&irb_ptr->irb_lock);
6922535Ssangeeta 			margs->ift_best_ire = ire;
6932535Ssangeeta 			return (B_TRUE);
6942535Ssangeeta 		}
6952535Ssangeeta 	}
6962535Ssangeeta 	rw_exit(&irb_ptr->irb_lock);
6972535Ssangeeta 	return (B_FALSE);
6982535Ssangeeta }
6992535Ssangeeta 
7002535Ssangeeta /*
7012535Ssangeeta  * ftable irb_t structures are dynamically allocated, and we need to
7022535Ssangeeta  * check if the irb_t (and associated ftable tree attachment) needs to
7032535Ssangeeta  * be cleaned up when the irb_refcnt goes to 0. The conditions that need
7042535Ssangeeta  * be verified are:
7052535Ssangeeta  * - no other walkers of the irebucket, i.e., quiescent irb_refcnt,
7062535Ssangeeta  * - no other threads holding references to ire's in the bucket,
7072535Ssangeeta  *   i.e., irb_nire == 0
7082535Ssangeeta  * - no active ire's in the bucket, i.e., irb_ire_cnt == 0
7092535Ssangeeta  * - need to hold the global tree lock and irb_lock in write mode.
7102535Ssangeeta  */
7112535Ssangeeta void
7122535Ssangeeta irb_refrele_ftable(irb_t *irb)
7132535Ssangeeta {
7142535Ssangeeta 	for (;;) {
7152535Ssangeeta 		rw_enter(&irb->irb_lock, RW_WRITER);
7162535Ssangeeta 		ASSERT(irb->irb_refcnt != 0);
7172535Ssangeeta 		if (irb->irb_refcnt != 1) {
7182535Ssangeeta 			/*
7192535Ssangeeta 			 * Someone has a reference to this radix node
7202535Ssangeeta 			 * or there is some bucket walker.
7212535Ssangeeta 			 */
7222535Ssangeeta 			irb->irb_refcnt--;
7232535Ssangeeta 			rw_exit(&irb->irb_lock);
7242535Ssangeeta 			return;
7252535Ssangeeta 		} else {
7262535Ssangeeta 			/*
7272535Ssangeeta 			 * There is no other walker, nor is there any
7282535Ssangeeta 			 * other thread that holds a direct ref to this
7292535Ssangeeta 			 * radix node. Do the clean up if needed. Call
7302535Ssangeeta 			 * to ire_unlink will clear the IRB_MARK_CONDEMNED flag
7312535Ssangeeta 			 */
7322535Ssangeeta 			if (irb->irb_marks & IRB_MARK_CONDEMNED)  {
7332535Ssangeeta 				ire_t *ire_list;
7342535Ssangeeta 
7352535Ssangeeta 				ire_list = ire_unlink(irb);
7362535Ssangeeta 				rw_exit(&irb->irb_lock);
7372535Ssangeeta 
7382535Ssangeeta 				if (ire_list != NULL)
7392535Ssangeeta 					ire_cleanup(ire_list);
7402535Ssangeeta 				/*
7412535Ssangeeta 				 * more CONDEMNED entries could have
7422535Ssangeeta 				 * been added while we dropped the lock,
7432535Ssangeeta 				 * so we have to re-check.
7442535Ssangeeta 				 */
7452535Ssangeeta 				continue;
7462535Ssangeeta 			}
7472535Ssangeeta 
7482535Ssangeeta 			/*
7492535Ssangeeta 			 * Now check if there are still any ires
7502535Ssangeeta 			 * associated with this radix node.
7512535Ssangeeta 			 */
7522535Ssangeeta 			if (irb->irb_nire != 0) {
7532535Ssangeeta 				/*
7542535Ssangeeta 				 * someone is still holding on
7552535Ssangeeta 				 * to ires in this bucket
7562535Ssangeeta 				 */
7572535Ssangeeta 				irb->irb_refcnt--;
7582535Ssangeeta 				rw_exit(&irb->irb_lock);
7592535Ssangeeta 				return;
7602535Ssangeeta 			} else {
7612535Ssangeeta 				/*
7622535Ssangeeta 				 * Everything is clear. Zero walkers,
7632535Ssangeeta 				 * Zero threads with a ref to this
7642535Ssangeeta 				 * radix node, Zero ires associated with
7652535Ssangeeta 				 * this radix node. Due to lock order,
7662535Ssangeeta 				 * check the above conditions again
7672535Ssangeeta 				 * after grabbing all locks in the right order
7682535Ssangeeta 				 */
7692535Ssangeeta 				rw_exit(&irb->irb_lock);
7702535Ssangeeta 				if (irb_inactive(irb))
7712535Ssangeeta 					return;
7722535Ssangeeta 				/*
7732535Ssangeeta 				 * irb_inactive could not free the irb.
7742535Ssangeeta 				 * See if there are any walkers, if not
7752535Ssangeeta 				 * try to clean up again.
7762535Ssangeeta 				 */
7772535Ssangeeta 			}
7782535Ssangeeta 		}
7792535Ssangeeta 	}
7802535Ssangeeta }
7812535Ssangeeta 
7822535Ssangeeta /*
78311042SErik.Nordmark@Sun.COM  * IRE iterator used by ire_ftable_lookup to process multiple equal
78411042SErik.Nordmark@Sun.COM  * routes. Given a starting point in the hash list (hash), walk the IREs
78511042SErik.Nordmark@Sun.COM  * in the bucket skipping deleted entries. We treat the bucket as a circular
78611042SErik.Nordmark@Sun.COM  * list for the purposes of walking it.
78711042SErik.Nordmark@Sun.COM  * Returns the IRE (held) that corresponds to the hash value. If that IRE is
78811042SErik.Nordmark@Sun.COM  * not applicable (ire_match_args failed) then it returns a subsequent one.
78911042SErik.Nordmark@Sun.COM  * If we fail to find an IRE we return NULL.
79011042SErik.Nordmark@Sun.COM  *
79111042SErik.Nordmark@Sun.COM  * Assumes that the caller holds a reference on the IRE bucket and a read lock
79211042SErik.Nordmark@Sun.COM  * on the radix_node_head (for IPv4) or the ip6_ire_head (for IPv6).
7932535Ssangeeta  *
79411042SErik.Nordmark@Sun.COM  * Applies to IPv4 and IPv6.
79511042SErik.Nordmark@Sun.COM  *
79611042SErik.Nordmark@Sun.COM  * For CGTP, where an IRE_BROADCAST and IRE_HOST can exist for the same
79711042SErik.Nordmark@Sun.COM  * address and bucket, we compare against ire_type for the orig_ire. We also
79811042SErik.Nordmark@Sun.COM  * have IRE_BROADCASTs with and without RTF_MULTIRT, with the former being
79911131SErik.Nordmark@Sun.COM  * first in the bucket. Thus we compare that RTF_MULTIRT match the orig_ire.
80011042SErik.Nordmark@Sun.COM  *
80111042SErik.Nordmark@Sun.COM  * Due to shared-IP zones we check that an IRE_OFFLINK has a gateway that is
80211042SErik.Nordmark@Sun.COM  * reachable from the zone i.e., that the ire_gateway_addr is in a subnet
80311042SErik.Nordmark@Sun.COM  * in which the zone has an IP address. We check this for the global zone
80411042SErik.Nordmark@Sun.COM  * even if no shared-IP zones are configured.
8052535Ssangeeta  */
8062535Ssangeeta ire_t *
80711042SErik.Nordmark@Sun.COM ire_round_robin(irb_t *irb_ptr, ire_ftable_args_t *margs, uint_t hash,
80811042SErik.Nordmark@Sun.COM     ire_t *orig_ire, ip_stack_t *ipst)
8092535Ssangeeta {
81011042SErik.Nordmark@Sun.COM 	ire_t		*ire, *maybe_ire = NULL;
81111042SErik.Nordmark@Sun.COM 	uint_t		maybe_badcnt;
81211042SErik.Nordmark@Sun.COM 	uint_t		maxwalk;
81311042SErik.Nordmark@Sun.COM 
81411042SErik.Nordmark@Sun.COM 	/* Fold in more bits from the hint/hash */
81511042SErik.Nordmark@Sun.COM 	hash = hash ^ (hash >> 8) ^ (hash >> 16);
8162535Ssangeeta 
8172535Ssangeeta 	rw_enter(&irb_ptr->irb_lock, RW_WRITER);
81811042SErik.Nordmark@Sun.COM 	maxwalk = irb_ptr->irb_ire_cnt;	/* Excludes condemned */
81911042SErik.Nordmark@Sun.COM 	hash %= maxwalk;
82011042SErik.Nordmark@Sun.COM 	irb_refhold_locked(irb_ptr);
8212535Ssangeeta 	rw_exit(&irb_ptr->irb_lock);
8222535Ssangeeta 
8232535Ssangeeta 	/*
8242535Ssangeeta 	 * Round-robin the routers list looking for a route that
8252535Ssangeeta 	 * matches the passed in parameters.
82611042SErik.Nordmark@Sun.COM 	 * First we skip "hash" number of non-condemned IREs.
82711042SErik.Nordmark@Sun.COM 	 * Then we match the IRE.
82811042SErik.Nordmark@Sun.COM 	 * If we find an ire which has a non-zero ire_badcnt then we remember
82911042SErik.Nordmark@Sun.COM 	 * it and keep on looking for a lower ire_badcnt.
83011042SErik.Nordmark@Sun.COM 	 * If we come to the end of the list we continue (treat the
83111042SErik.Nordmark@Sun.COM 	 * bucket list as a circular list) but we match less than "max"
83211042SErik.Nordmark@Sun.COM 	 * entries.
8332535Ssangeeta 	 */
83411042SErik.Nordmark@Sun.COM 	ire = irb_ptr->irb_ire;
83511042SErik.Nordmark@Sun.COM 	while (maxwalk > 0) {
83611042SErik.Nordmark@Sun.COM 		if (IRE_IS_CONDEMNED(ire))
83711042SErik.Nordmark@Sun.COM 			goto next_ire_skip;
8382535Ssangeeta 
83911042SErik.Nordmark@Sun.COM 		/* Skip the first "hash" entries to do ECMP */
84011042SErik.Nordmark@Sun.COM 		if (hash != 0) {
84111042SErik.Nordmark@Sun.COM 			hash--;
84211042SErik.Nordmark@Sun.COM 			goto next_ire_skip;
84311042SErik.Nordmark@Sun.COM 		}
84411042SErik.Nordmark@Sun.COM 
84511042SErik.Nordmark@Sun.COM 		/* See CGTP comment above */
84611042SErik.Nordmark@Sun.COM 		if (ire->ire_type != orig_ire->ire_type ||
84711131SErik.Nordmark@Sun.COM 		    ((ire->ire_flags ^ orig_ire->ire_flags) & RTF_MULTIRT) != 0)
8482535Ssangeeta 			goto next_ire;
8492535Ssangeeta 
85011042SErik.Nordmark@Sun.COM 		/*
85111042SErik.Nordmark@Sun.COM 		 * Note: Since IPv6 has hash buckets instead of radix
85211042SErik.Nordmark@Sun.COM 		 * buckers we need to explicitly compare the addresses.
85311042SErik.Nordmark@Sun.COM 		 * That makes this less efficient since we will be called
85411042SErik.Nordmark@Sun.COM 		 * even if there is no alternatives just because the
85511042SErik.Nordmark@Sun.COM 		 * bucket has multiple IREs for different addresses.
85611042SErik.Nordmark@Sun.COM 		 */
85711042SErik.Nordmark@Sun.COM 		if (ire->ire_ipversion == IPV6_VERSION) {
85811042SErik.Nordmark@Sun.COM 			if (!IN6_ARE_ADDR_EQUAL(&orig_ire->ire_addr_v6,
85911042SErik.Nordmark@Sun.COM 			    &ire->ire_addr_v6))
86011042SErik.Nordmark@Sun.COM 				goto next_ire;
86111042SErik.Nordmark@Sun.COM 		}
86211042SErik.Nordmark@Sun.COM 
86311042SErik.Nordmark@Sun.COM 		/*
86411042SErik.Nordmark@Sun.COM 		 * For some reason find_best_route uses ire_mask. We do
86511042SErik.Nordmark@Sun.COM 		 * the same.
86611042SErik.Nordmark@Sun.COM 		 */
86711042SErik.Nordmark@Sun.COM 		if (ire->ire_ipversion == IPV4_VERSION ?
86811042SErik.Nordmark@Sun.COM 		    !ire_match_args(ire, margs->ift_addr,
86911042SErik.Nordmark@Sun.COM 		    ire->ire_mask, margs->ift_gateway,
87011042SErik.Nordmark@Sun.COM 		    margs->ift_type, margs->ift_ill, margs->ift_zoneid,
87111042SErik.Nordmark@Sun.COM 		    margs->ift_tsl, margs->ift_flags) :
87211042SErik.Nordmark@Sun.COM 		    !ire_match_args_v6(ire, &margs->ift_addr_v6,
87311042SErik.Nordmark@Sun.COM 		    &ire->ire_mask_v6, &margs->ift_gateway_v6,
87411042SErik.Nordmark@Sun.COM 		    margs->ift_type, margs->ift_ill, margs->ift_zoneid,
87511042SErik.Nordmark@Sun.COM 		    margs->ift_tsl, margs->ift_flags))
8762535Ssangeeta 			goto next_ire;
8772535Ssangeeta 
87811042SErik.Nordmark@Sun.COM 		if (margs->ift_zoneid != ALL_ZONES &&
87911042SErik.Nordmark@Sun.COM 		    (ire->ire_type & IRE_OFFLINK)) {
8802535Ssangeeta 			/*
88111042SErik.Nordmark@Sun.COM 			 * When we're in a zone, we're only
88211042SErik.Nordmark@Sun.COM 			 * interested in routers that are
88311042SErik.Nordmark@Sun.COM 			 * reachable through ipifs within our zone.
8842535Ssangeeta 			 */
88511042SErik.Nordmark@Sun.COM 			if (ire->ire_ipversion == IPV4_VERSION) {
88611042SErik.Nordmark@Sun.COM 				if (!ire_gateway_ok_zone_v4(
88711042SErik.Nordmark@Sun.COM 				    ire->ire_gateway_addr, margs->ift_zoneid,
88811042SErik.Nordmark@Sun.COM 				    ire->ire_ill, margs->ift_tsl, ipst,
88911042SErik.Nordmark@Sun.COM 				    B_TRUE))
89011042SErik.Nordmark@Sun.COM 					goto next_ire;
89111042SErik.Nordmark@Sun.COM 			} else {
89211042SErik.Nordmark@Sun.COM 				if (!ire_gateway_ok_zone_v6(
89311042SErik.Nordmark@Sun.COM 				    &ire->ire_gateway_addr_v6,
89411042SErik.Nordmark@Sun.COM 				    margs->ift_zoneid, ire->ire_ill,
89511042SErik.Nordmark@Sun.COM 				    margs->ift_tsl, ipst, B_TRUE))
89611042SErik.Nordmark@Sun.COM 					goto next_ire;
89711042SErik.Nordmark@Sun.COM 			}
8982535Ssangeeta 		}
89911042SErik.Nordmark@Sun.COM 		mutex_enter(&ire->ire_lock);
90011042SErik.Nordmark@Sun.COM 		/* Look for stale ire_badcnt and clear */
90111042SErik.Nordmark@Sun.COM 		if (ire->ire_badcnt != 0 &&
90211066Srafael.vanoni@sun.com 		    (TICK_TO_SEC(ddi_get_lbolt64()) - ire->ire_last_badcnt >
90311042SErik.Nordmark@Sun.COM 		    ipst->ips_ip_ire_badcnt_lifetime))
90411042SErik.Nordmark@Sun.COM 			ire->ire_badcnt = 0;
90511042SErik.Nordmark@Sun.COM 		mutex_exit(&ire->ire_lock);
9062535Ssangeeta 
90711042SErik.Nordmark@Sun.COM 		if (ire->ire_badcnt == 0) {
90811042SErik.Nordmark@Sun.COM 			/* We found one with a zero badcnt; done */
90911042SErik.Nordmark@Sun.COM 			ire_refhold(ire);
91011042SErik.Nordmark@Sun.COM 			/*
91111042SErik.Nordmark@Sun.COM 			 * Care needed since irb_refrele grabs WLOCK to free
91211042SErik.Nordmark@Sun.COM 			 * the irb_t.
91311042SErik.Nordmark@Sun.COM 			 */
91411042SErik.Nordmark@Sun.COM 			if (ire->ire_ipversion == IPV4_VERSION) {
91511042SErik.Nordmark@Sun.COM 				RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
91611042SErik.Nordmark@Sun.COM 				irb_refrele(irb_ptr);
91711042SErik.Nordmark@Sun.COM 				RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
91811042SErik.Nordmark@Sun.COM 			} else {
91911042SErik.Nordmark@Sun.COM 				rw_exit(&ipst->ips_ip6_ire_head_lock);
92011042SErik.Nordmark@Sun.COM 				irb_refrele(irb_ptr);
92111042SErik.Nordmark@Sun.COM 				rw_enter(&ipst->ips_ip6_ire_head_lock,
92211042SErik.Nordmark@Sun.COM 				    RW_READER);
92311042SErik.Nordmark@Sun.COM 			}
9242535Ssangeeta 			return (ire);
9252535Ssangeeta 		}
9262535Ssangeeta 		/*
92711042SErik.Nordmark@Sun.COM 		 * keep looking to see if there is a better (lower
92811042SErik.Nordmark@Sun.COM 		 * badcnt) matching IRE, but save this one as a last resort.
92911042SErik.Nordmark@Sun.COM 		 * If we find a lower badcnt pick that one as the last* resort.
9302535Ssangeeta 		 */
93111042SErik.Nordmark@Sun.COM 		if (maybe_ire == NULL) {
93211042SErik.Nordmark@Sun.COM 			maybe_ire = ire;
93311042SErik.Nordmark@Sun.COM 			maybe_badcnt = ire->ire_badcnt;
93411042SErik.Nordmark@Sun.COM 		} else if (ire->ire_badcnt < maybe_badcnt) {
93511042SErik.Nordmark@Sun.COM 			maybe_ire = ire;
93611042SErik.Nordmark@Sun.COM 			maybe_badcnt = ire->ire_badcnt;
93711042SErik.Nordmark@Sun.COM 		}
9388485SPeter.Memishian@Sun.COM 
9392535Ssangeeta next_ire:
94011042SErik.Nordmark@Sun.COM 		maxwalk--;
94111042SErik.Nordmark@Sun.COM next_ire_skip:
94211042SErik.Nordmark@Sun.COM 		ire = ire->ire_next;
94311042SErik.Nordmark@Sun.COM 		if (ire == NULL)
94411042SErik.Nordmark@Sun.COM 			ire = irb_ptr->irb_ire;
9452535Ssangeeta 	}
9462535Ssangeeta 	if (maybe_ire != NULL)
94711042SErik.Nordmark@Sun.COM 		ire_refhold(maybe_ire);
94811042SErik.Nordmark@Sun.COM 
94911042SErik.Nordmark@Sun.COM 	/* Care needed since irb_refrele grabs WLOCK to free the irb_t. */
95011042SErik.Nordmark@Sun.COM 	if (ire->ire_ipversion == IPV4_VERSION) {
95111042SErik.Nordmark@Sun.COM 		RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
95211042SErik.Nordmark@Sun.COM 		irb_refrele(irb_ptr);
95311042SErik.Nordmark@Sun.COM 		RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
95411042SErik.Nordmark@Sun.COM 	} else {
95511042SErik.Nordmark@Sun.COM 		rw_exit(&ipst->ips_ip6_ire_head_lock);
95611042SErik.Nordmark@Sun.COM 		irb_refrele(irb_ptr);
95711042SErik.Nordmark@Sun.COM 		rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
95811042SErik.Nordmark@Sun.COM 	}
9592535Ssangeeta 	return (maybe_ire);
9602535Ssangeeta }
9612783Ssowmini 
9622783Ssowmini void
9632783Ssowmini irb_refhold_rn(struct radix_node *rn)
9642783Ssowmini {
9652783Ssowmini 	if ((rn->rn_flags & RNF_ROOT) == 0)
96611042SErik.Nordmark@Sun.COM 		irb_refhold(&((rt_t *)(rn))->rt_irb);
9672783Ssowmini }
9682783Ssowmini 
9692783Ssowmini void
9702783Ssowmini irb_refrele_rn(struct radix_node *rn)
9712783Ssowmini {
9722783Ssowmini 	if ((rn->rn_flags & RNF_ROOT) == 0)
9732783Ssowmini 		irb_refrele_ftable(&((rt_t *)(rn))->rt_irb);
9742783Ssowmini }
97511042SErik.Nordmark@Sun.COM 
976*11681SSowmini.Varadhan@Sun.COM 
977*11681SSowmini.Varadhan@Sun.COM /*
978*11681SSowmini.Varadhan@Sun.COM  * ip_select_src_ill() is used by ip_select_route() to find the src_ill
979*11681SSowmini.Varadhan@Sun.COM  * to be used for source-aware routing table lookup. This function will
980*11681SSowmini.Varadhan@Sun.COM  * ignore IPIF_UNNUMBERED interface addresses, and will only return a
981*11681SSowmini.Varadhan@Sun.COM  * numbered interface (ipif_lookup_addr_nondup() will ignore UNNUMBERED
982*11681SSowmini.Varadhan@Sun.COM  * interfaces).
983*11681SSowmini.Varadhan@Sun.COM  */
984*11681SSowmini.Varadhan@Sun.COM static ill_t *
985*11681SSowmini.Varadhan@Sun.COM ip_select_src_ill(const in6_addr_t *v6src, zoneid_t zoneid, ip_stack_t *ipst)
986*11681SSowmini.Varadhan@Sun.COM {
987*11681SSowmini.Varadhan@Sun.COM 	ipif_t *ipif;
988*11681SSowmini.Varadhan@Sun.COM 	ill_t *ill;
989*11681SSowmini.Varadhan@Sun.COM 	boolean_t isv6 = !IN6_IS_ADDR_V4MAPPED(v6src);
990*11681SSowmini.Varadhan@Sun.COM 	ipaddr_t v4src;
991*11681SSowmini.Varadhan@Sun.COM 
992*11681SSowmini.Varadhan@Sun.COM 	if (isv6) {
993*11681SSowmini.Varadhan@Sun.COM 		ipif = ipif_lookup_addr_nondup_v6(v6src, NULL, zoneid, ipst);
994*11681SSowmini.Varadhan@Sun.COM 	} else {
995*11681SSowmini.Varadhan@Sun.COM 		IN6_V4MAPPED_TO_IPADDR(v6src, v4src);
996*11681SSowmini.Varadhan@Sun.COM 		ipif = ipif_lookup_addr_nondup(v4src, NULL, zoneid, ipst);
997*11681SSowmini.Varadhan@Sun.COM 	}
998*11681SSowmini.Varadhan@Sun.COM 	if (ipif == NULL)
999*11681SSowmini.Varadhan@Sun.COM 		return (NULL);
1000*11681SSowmini.Varadhan@Sun.COM 	ill = ipif->ipif_ill;
1001*11681SSowmini.Varadhan@Sun.COM 	ill_refhold(ill);
1002*11681SSowmini.Varadhan@Sun.COM 	ipif_refrele(ipif);
1003*11681SSowmini.Varadhan@Sun.COM 	return (ill);
1004*11681SSowmini.Varadhan@Sun.COM }
1005*11681SSowmini.Varadhan@Sun.COM 
1006*11681SSowmini.Varadhan@Sun.COM /*
1007*11681SSowmini.Varadhan@Sun.COM  * verify that v6src is configured on ill
1008*11681SSowmini.Varadhan@Sun.COM  */
1009*11681SSowmini.Varadhan@Sun.COM static boolean_t
1010*11681SSowmini.Varadhan@Sun.COM ip_verify_src_on_ill(const in6_addr_t v6src, ill_t *ill, zoneid_t zoneid)
1011*11681SSowmini.Varadhan@Sun.COM {
1012*11681SSowmini.Varadhan@Sun.COM 	ipif_t *ipif;
1013*11681SSowmini.Varadhan@Sun.COM 	ip_stack_t *ipst;
1014*11681SSowmini.Varadhan@Sun.COM 	ipaddr_t v4src;
1015*11681SSowmini.Varadhan@Sun.COM 
1016*11681SSowmini.Varadhan@Sun.COM 	if (ill == NULL)
1017*11681SSowmini.Varadhan@Sun.COM 		return (B_FALSE);
1018*11681SSowmini.Varadhan@Sun.COM 	ipst = ill->ill_ipst;
1019*11681SSowmini.Varadhan@Sun.COM 
1020*11681SSowmini.Varadhan@Sun.COM 	if (ill->ill_isv6) {
1021*11681SSowmini.Varadhan@Sun.COM 		ipif = ipif_lookup_addr_nondup_v6(&v6src, ill, zoneid, ipst);
1022*11681SSowmini.Varadhan@Sun.COM 	} else {
1023*11681SSowmini.Varadhan@Sun.COM 		IN6_V4MAPPED_TO_IPADDR(&v6src, v4src);
1024*11681SSowmini.Varadhan@Sun.COM 		ipif = ipif_lookup_addr_nondup(v4src, ill, zoneid, ipst);
1025*11681SSowmini.Varadhan@Sun.COM 	}
1026*11681SSowmini.Varadhan@Sun.COM 
1027*11681SSowmini.Varadhan@Sun.COM 	if (ipif != NULL) {
1028*11681SSowmini.Varadhan@Sun.COM 		ipif_refrele(ipif);
1029*11681SSowmini.Varadhan@Sun.COM 		return (B_TRUE);
1030*11681SSowmini.Varadhan@Sun.COM 	} else {
1031*11681SSowmini.Varadhan@Sun.COM 		return (B_FALSE);
1032*11681SSowmini.Varadhan@Sun.COM 	}
1033*11681SSowmini.Varadhan@Sun.COM }
1034*11681SSowmini.Varadhan@Sun.COM 
103511042SErik.Nordmark@Sun.COM /*
103611042SErik.Nordmark@Sun.COM  * Select a route for IPv4 and IPv6. Except for multicast, loopback and reject
103711042SErik.Nordmark@Sun.COM  * routes this routine sets up a ire_nce_cache as well. The caller needs to
103811042SErik.Nordmark@Sun.COM  * lookup an nce for the multicast case.
1039*11681SSowmini.Varadhan@Sun.COM  *
1040*11681SSowmini.Varadhan@Sun.COM  * When src_multihoming is set to 2 (strict src multihoming) we use the source
1041*11681SSowmini.Varadhan@Sun.COM  * address to select the interface and route. If IP_BOUND_IF etc are
1042*11681SSowmini.Varadhan@Sun.COM  * specified, we require that they specify an interface on which the
1043*11681SSowmini.Varadhan@Sun.COM  * source address is assigned.
1044*11681SSowmini.Varadhan@Sun.COM  *
1045*11681SSowmini.Varadhan@Sun.COM  * When src_multihoming is set to 1 (preferred src aware route
1046*11681SSowmini.Varadhan@Sun.COM  * selection)  the unicast lookup prefers a matching source
1047*11681SSowmini.Varadhan@Sun.COM  * (i.e., that the route points out an ill on which the source is assigned), but
1048*11681SSowmini.Varadhan@Sun.COM  * if no such route is found we fallback to not considering the source in the
1049*11681SSowmini.Varadhan@Sun.COM  * route lookup.
1050*11681SSowmini.Varadhan@Sun.COM  *
1051*11681SSowmini.Varadhan@Sun.COM  * We skip the src_multihoming check when the source isn't (yet) set, and
1052*11681SSowmini.Varadhan@Sun.COM  * when IXAF_VERIFY_SOURCE is not set. The latter allows RAW sockets to send
1053*11681SSowmini.Varadhan@Sun.COM  * with bogus source addresses as allowed by IP_HDRINCL and IPV6_PKTINFO
1054*11681SSowmini.Varadhan@Sun.COM  * when secpolicy_net_rawaccess().
105511042SErik.Nordmark@Sun.COM  */
105611042SErik.Nordmark@Sun.COM ire_t *
1057*11681SSowmini.Varadhan@Sun.COM ip_select_route(const in6_addr_t *v6dst, const in6_addr_t v6src,
1058*11681SSowmini.Varadhan@Sun.COM     ip_xmit_attr_t *ixa, uint_t *generationp, in6_addr_t *setsrcp,
1059*11681SSowmini.Varadhan@Sun.COM     int *errorp, boolean_t *multirtp)
106011042SErik.Nordmark@Sun.COM {
106111042SErik.Nordmark@Sun.COM 	uint_t		match_args;
106211042SErik.Nordmark@Sun.COM 	uint_t		ire_type;
1063*11681SSowmini.Varadhan@Sun.COM 	ill_t		*ill = NULL;
106411042SErik.Nordmark@Sun.COM 	ire_t		*ire;
106511042SErik.Nordmark@Sun.COM 	ip_stack_t	*ipst = ixa->ixa_ipst;
106611042SErik.Nordmark@Sun.COM 	ipaddr_t	v4dst;
106711042SErik.Nordmark@Sun.COM 	in6_addr_t	v6nexthop;
106811042SErik.Nordmark@Sun.COM 	iaflags_t	ixaflags = ixa->ixa_flags;
106911042SErik.Nordmark@Sun.COM 	nce_t		*nce;
1070*11681SSowmini.Varadhan@Sun.COM 	boolean_t	preferred_src_aware = B_FALSE;
1071*11681SSowmini.Varadhan@Sun.COM 	boolean_t	verify_src;
1072*11681SSowmini.Varadhan@Sun.COM 	boolean_t	isv6 = !(ixa->ixa_flags & IXAF_IS_IPV4);
1073*11681SSowmini.Varadhan@Sun.COM 	int		src_multihoming = IP_SRC_MULTIHOMING(isv6, ipst);
1074*11681SSowmini.Varadhan@Sun.COM 
1075*11681SSowmini.Varadhan@Sun.COM 	/*
1076*11681SSowmini.Varadhan@Sun.COM 	 * We only verify that the src has been configured on a selected
1077*11681SSowmini.Varadhan@Sun.COM 	 * interface if the src is not :: or INADDR_ANY, and if the
1078*11681SSowmini.Varadhan@Sun.COM 	 * IXAF_VERIFY_SOURCE flag is set.
1079*11681SSowmini.Varadhan@Sun.COM 	 */
1080*11681SSowmini.Varadhan@Sun.COM 	verify_src = (!V6_OR_V4_INADDR_ANY(v6src) &&
1081*11681SSowmini.Varadhan@Sun.COM 	    (ixa->ixa_flags & IXAF_VERIFY_SOURCE));
108211042SErik.Nordmark@Sun.COM 
108311042SErik.Nordmark@Sun.COM 	match_args = MATCH_IRE_SECATTR;
108411042SErik.Nordmark@Sun.COM 	IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst);
108511042SErik.Nordmark@Sun.COM 	if (setsrcp != NULL)
108611042SErik.Nordmark@Sun.COM 		ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp));
108711042SErik.Nordmark@Sun.COM 	if (errorp != NULL)
108811042SErik.Nordmark@Sun.COM 		ASSERT(*errorp == 0);
108911042SErik.Nordmark@Sun.COM 
109011042SErik.Nordmark@Sun.COM 	/*
109111042SErik.Nordmark@Sun.COM 	 * The content of the ixa will be different if IP_NEXTHOP,
109211042SErik.Nordmark@Sun.COM 	 * SO_DONTROUTE, IP_BOUND_IF, IP_PKTINFO etc are set
109311042SErik.Nordmark@Sun.COM 	 */
109411042SErik.Nordmark@Sun.COM 
1095*11681SSowmini.Varadhan@Sun.COM 	if (isv6 ? IN6_IS_ADDR_MULTICAST(v6dst) : CLASSD(v4dst)) {
109611042SErik.Nordmark@Sun.COM 		/* Pick up the IRE_MULTICAST for the ill */
109711042SErik.Nordmark@Sun.COM 		if (ixa->ixa_multicast_ifindex != 0) {
109811042SErik.Nordmark@Sun.COM 			ill = ill_lookup_on_ifindex(ixa->ixa_multicast_ifindex,
1099*11681SSowmini.Varadhan@Sun.COM 			    isv6, ipst);
110011042SErik.Nordmark@Sun.COM 		} else if (ixaflags & IXAF_SCOPEID_SET) {
110111042SErik.Nordmark@Sun.COM 			/* sin6_scope_id takes precedence over ixa_ifindex */
110211042SErik.Nordmark@Sun.COM 			ASSERT(ixa->ixa_scopeid != 0);
110311042SErik.Nordmark@Sun.COM 			ill = ill_lookup_on_ifindex(ixa->ixa_scopeid,
1104*11681SSowmini.Varadhan@Sun.COM 			    isv6, ipst);
110511042SErik.Nordmark@Sun.COM 		} else if (ixa->ixa_ifindex != 0) {
110611042SErik.Nordmark@Sun.COM 			/*
110711042SErik.Nordmark@Sun.COM 			 * In the ipmp case, the ixa_ifindex is set to
110811042SErik.Nordmark@Sun.COM 			 * point at an under_ill and we would return the
110911042SErik.Nordmark@Sun.COM 			 * ire_multicast() corresponding to that under_ill.
111011042SErik.Nordmark@Sun.COM 			 */
111111042SErik.Nordmark@Sun.COM 			ill = ill_lookup_on_ifindex(ixa->ixa_ifindex,
1112*11681SSowmini.Varadhan@Sun.COM 			    isv6, ipst);
1113*11681SSowmini.Varadhan@Sun.COM 		} else if (src_multihoming != 0 && verify_src) {
1114*11681SSowmini.Varadhan@Sun.COM 			/* Look up the ill based on the source address */
1115*11681SSowmini.Varadhan@Sun.COM 			ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst);
1116*11681SSowmini.Varadhan@Sun.COM 			/*
1117*11681SSowmini.Varadhan@Sun.COM 			 * Since we looked up the ill from the source there
1118*11681SSowmini.Varadhan@Sun.COM 			 * is no need to verify that the source is on the ill
1119*11681SSowmini.Varadhan@Sun.COM 			 * below.
1120*11681SSowmini.Varadhan@Sun.COM 			 */
1121*11681SSowmini.Varadhan@Sun.COM 			verify_src = B_FALSE;
1122*11681SSowmini.Varadhan@Sun.COM 			if (ill != NULL && IS_VNI(ill)) {
1123*11681SSowmini.Varadhan@Sun.COM 				ill_t *usesrc = ill;
1124*11681SSowmini.Varadhan@Sun.COM 
1125*11681SSowmini.Varadhan@Sun.COM 				ill = ill_lookup_usesrc(usesrc);
1126*11681SSowmini.Varadhan@Sun.COM 				ill_refrele(usesrc);
1127*11681SSowmini.Varadhan@Sun.COM 			}
1128*11681SSowmini.Varadhan@Sun.COM 		} else if (!isv6) {
112911042SErik.Nordmark@Sun.COM 			ipaddr_t	v4setsrc = INADDR_ANY;
113011042SErik.Nordmark@Sun.COM 
1131*11681SSowmini.Varadhan@Sun.COM 			ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid,
1132*11681SSowmini.Varadhan@Sun.COM 			    ipst, multirtp, &v4setsrc);
113311042SErik.Nordmark@Sun.COM 			if (setsrcp != NULL)
113411042SErik.Nordmark@Sun.COM 				IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp);
113511042SErik.Nordmark@Sun.COM 		} else {
1136*11681SSowmini.Varadhan@Sun.COM 			ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid,
1137*11681SSowmini.Varadhan@Sun.COM 			    ipst, multirtp, setsrcp);
113811042SErik.Nordmark@Sun.COM 		}
113911042SErik.Nordmark@Sun.COM 		if (ill != NULL && IS_VNI(ill)) {
114011042SErik.Nordmark@Sun.COM 			ill_refrele(ill);
114111042SErik.Nordmark@Sun.COM 			ill = NULL;
114211042SErik.Nordmark@Sun.COM 		}
114311042SErik.Nordmark@Sun.COM 		if (ill == NULL) {
114411042SErik.Nordmark@Sun.COM 			if (errorp != NULL)
114511042SErik.Nordmark@Sun.COM 				*errorp = ENXIO;
114611042SErik.Nordmark@Sun.COM 			/* Get a hold on the IRE_NOROUTE */
1147*11681SSowmini.Varadhan@Sun.COM 			ire = ire_reject(ipst, isv6);
114811042SErik.Nordmark@Sun.COM 			return (ire);
114911042SErik.Nordmark@Sun.COM 		}
115011042SErik.Nordmark@Sun.COM 		if (!(ill->ill_flags & ILLF_MULTICAST)) {
115111042SErik.Nordmark@Sun.COM 			ill_refrele(ill);
115211042SErik.Nordmark@Sun.COM 			if (errorp != NULL)
115311042SErik.Nordmark@Sun.COM 				*errorp = EHOSTUNREACH;
115411042SErik.Nordmark@Sun.COM 			/* Get a hold on the IRE_NOROUTE */
1155*11681SSowmini.Varadhan@Sun.COM 			ire = ire_reject(ipst, isv6);
1156*11681SSowmini.Varadhan@Sun.COM 			return (ire);
1157*11681SSowmini.Varadhan@Sun.COM 		}
1158*11681SSowmini.Varadhan@Sun.COM 		/*
1159*11681SSowmini.Varadhan@Sun.COM 		 * If we are doing the strictest src_multihoming, then
1160*11681SSowmini.Varadhan@Sun.COM 		 * we check that IP_MULTICAST_IF, IP_BOUND_IF, etc specify
1161*11681SSowmini.Varadhan@Sun.COM 		 * an interface that is consistent with the source address.
1162*11681SSowmini.Varadhan@Sun.COM 		 */
1163*11681SSowmini.Varadhan@Sun.COM 		if (verify_src && src_multihoming == 2 &&
1164*11681SSowmini.Varadhan@Sun.COM 		    !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) {
1165*11681SSowmini.Varadhan@Sun.COM 			if (errorp != NULL)
1166*11681SSowmini.Varadhan@Sun.COM 				*errorp = EADDRNOTAVAIL;
1167*11681SSowmini.Varadhan@Sun.COM 			ill_refrele(ill);
1168*11681SSowmini.Varadhan@Sun.COM 			/* Get a hold on the IRE_NOROUTE */
1169*11681SSowmini.Varadhan@Sun.COM 			ire = ire_reject(ipst, isv6);
117011042SErik.Nordmark@Sun.COM 			return (ire);
117111042SErik.Nordmark@Sun.COM 		}
117211042SErik.Nordmark@Sun.COM 		/* Get a refcnt on the single IRE_MULTICAST per ill */
117311042SErik.Nordmark@Sun.COM 		ire = ire_multicast(ill);
117411042SErik.Nordmark@Sun.COM 		ill_refrele(ill);
117511042SErik.Nordmark@Sun.COM 		if (generationp != NULL)
117611042SErik.Nordmark@Sun.COM 			*generationp = ire->ire_generation;
117711042SErik.Nordmark@Sun.COM 		if (errorp != NULL &&
117811042SErik.Nordmark@Sun.COM 		    (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
117911042SErik.Nordmark@Sun.COM 			*errorp = EHOSTUNREACH;
118011042SErik.Nordmark@Sun.COM 		}
118111042SErik.Nordmark@Sun.COM 		return (ire);
118211042SErik.Nordmark@Sun.COM 	}
118311042SErik.Nordmark@Sun.COM 
1184*11681SSowmini.Varadhan@Sun.COM 	/* Now for unicast */
118511042SErik.Nordmark@Sun.COM 	if (ixa->ixa_ifindex != 0 || (ixaflags & IXAF_SCOPEID_SET)) {
118611042SErik.Nordmark@Sun.COM 		if (ixaflags & IXAF_SCOPEID_SET) {
118711042SErik.Nordmark@Sun.COM 			/* sin6_scope_id takes precedence over ixa_ifindex */
118811042SErik.Nordmark@Sun.COM 			ASSERT(ixa->ixa_scopeid != 0);
118911042SErik.Nordmark@Sun.COM 			ill = ill_lookup_on_ifindex(ixa->ixa_scopeid,
1190*11681SSowmini.Varadhan@Sun.COM 			    isv6, ipst);
119111042SErik.Nordmark@Sun.COM 		} else {
119211042SErik.Nordmark@Sun.COM 			ASSERT(ixa->ixa_ifindex != 0);
119311042SErik.Nordmark@Sun.COM 			ill = ill_lookup_on_ifindex(ixa->ixa_ifindex,
1194*11681SSowmini.Varadhan@Sun.COM 			    isv6, ipst);
119511042SErik.Nordmark@Sun.COM 		}
119611042SErik.Nordmark@Sun.COM 		if (ill != NULL && IS_VNI(ill)) {
119711042SErik.Nordmark@Sun.COM 			ill_refrele(ill);
119811042SErik.Nordmark@Sun.COM 			ill = NULL;
119911042SErik.Nordmark@Sun.COM 		}
120011042SErik.Nordmark@Sun.COM 		if (ill == NULL) {
120111042SErik.Nordmark@Sun.COM 			if (errorp != NULL)
120211042SErik.Nordmark@Sun.COM 				*errorp = ENXIO;
120311042SErik.Nordmark@Sun.COM 			/* Get a hold on the IRE_NOROUTE */
1204*11681SSowmini.Varadhan@Sun.COM 			ire = ire_reject(ipst, isv6);
120511042SErik.Nordmark@Sun.COM 			return (ire);
120611042SErik.Nordmark@Sun.COM 		}
1207*11681SSowmini.Varadhan@Sun.COM 
1208*11681SSowmini.Varadhan@Sun.COM 		match_args |= MATCH_IRE_ILL;
1209*11681SSowmini.Varadhan@Sun.COM 
121011042SErik.Nordmark@Sun.COM 		/*
121111042SErik.Nordmark@Sun.COM 		 * icmp_send_reply_v6 uses scopeid, and mpathd sets IP*_BOUND_IF
121211042SErik.Nordmark@Sun.COM 		 * so for both of them we need to be able look for an under
121311042SErik.Nordmark@Sun.COM 		 * interface.
121411042SErik.Nordmark@Sun.COM 		 */
121511042SErik.Nordmark@Sun.COM 		if (IS_UNDER_IPMP(ill))
121611042SErik.Nordmark@Sun.COM 			match_args |= MATCH_IRE_TESTHIDDEN;
1217*11681SSowmini.Varadhan@Sun.COM 
1218*11681SSowmini.Varadhan@Sun.COM 		/*
1219*11681SSowmini.Varadhan@Sun.COM 		 * If we are doing the strictest src_multihoming, then
1220*11681SSowmini.Varadhan@Sun.COM 		 * we check that IP_BOUND_IF, IP_PKTINFO, etc specify
1221*11681SSowmini.Varadhan@Sun.COM 		 * an interface that is consistent with the source address.
1222*11681SSowmini.Varadhan@Sun.COM 		 */
1223*11681SSowmini.Varadhan@Sun.COM 		if (src_multihoming == 2 &&
1224*11681SSowmini.Varadhan@Sun.COM 		    !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) {
1225*11681SSowmini.Varadhan@Sun.COM 			if (errorp != NULL)
1226*11681SSowmini.Varadhan@Sun.COM 				*errorp = EADDRNOTAVAIL;
1227*11681SSowmini.Varadhan@Sun.COM 			ill_refrele(ill);
1228*11681SSowmini.Varadhan@Sun.COM 			/* Get a hold on the IRE_NOROUTE */
1229*11681SSowmini.Varadhan@Sun.COM 			ire = ire_reject(ipst, isv6);
1230*11681SSowmini.Varadhan@Sun.COM 			return (ire);
1231*11681SSowmini.Varadhan@Sun.COM 		}
1232*11681SSowmini.Varadhan@Sun.COM 	} else if (src_multihoming != 0 && verify_src) {
1233*11681SSowmini.Varadhan@Sun.COM 		/* Look up the ill based on the source address */
1234*11681SSowmini.Varadhan@Sun.COM 		ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst);
1235*11681SSowmini.Varadhan@Sun.COM 		if (ill == NULL) {
1236*11681SSowmini.Varadhan@Sun.COM 			char addrbuf[INET6_ADDRSTRLEN];
1237*11681SSowmini.Varadhan@Sun.COM 
1238*11681SSowmini.Varadhan@Sun.COM 			ip3dbg(("%s not a valid src for unicast",
1239*11681SSowmini.Varadhan@Sun.COM 			    inet_ntop(AF_INET6, &v6src, addrbuf,
1240*11681SSowmini.Varadhan@Sun.COM 			    sizeof (addrbuf))));
1241*11681SSowmini.Varadhan@Sun.COM 			if (errorp != NULL)
1242*11681SSowmini.Varadhan@Sun.COM 				*errorp = EADDRNOTAVAIL;
1243*11681SSowmini.Varadhan@Sun.COM 			/* Get a hold on the IRE_NOROUTE */
1244*11681SSowmini.Varadhan@Sun.COM 			ire = ire_reject(ipst, isv6);
1245*11681SSowmini.Varadhan@Sun.COM 			return (ire);
1246*11681SSowmini.Varadhan@Sun.COM 		}
1247*11681SSowmini.Varadhan@Sun.COM 		match_args |= MATCH_IRE_SRC_ILL;
1248*11681SSowmini.Varadhan@Sun.COM 		preferred_src_aware = (src_multihoming == 1);
124911042SErik.Nordmark@Sun.COM 	}
125011042SErik.Nordmark@Sun.COM 
125111042SErik.Nordmark@Sun.COM 	if (ixaflags & IXAF_NEXTHOP_SET) {
125211042SErik.Nordmark@Sun.COM 		/* IP_NEXTHOP was set */
125311042SErik.Nordmark@Sun.COM 		v6nexthop = ixa->ixa_nexthop_v6;
125411042SErik.Nordmark@Sun.COM 	} else {
125511042SErik.Nordmark@Sun.COM 		v6nexthop = *v6dst;
125611042SErik.Nordmark@Sun.COM 	}
125711042SErik.Nordmark@Sun.COM 
125811042SErik.Nordmark@Sun.COM 	ire_type = 0;
125911042SErik.Nordmark@Sun.COM 
126011042SErik.Nordmark@Sun.COM 	/*
126111042SErik.Nordmark@Sun.COM 	 * If SO_DONTROUTE is set or if IP_NEXTHOP is set, then
126211042SErik.Nordmark@Sun.COM 	 * we only look for an onlink IRE.
126311042SErik.Nordmark@Sun.COM 	 */
126411042SErik.Nordmark@Sun.COM 	if (ixaflags & (IXAF_DONTROUTE|IXAF_NEXTHOP_SET)) {
126511042SErik.Nordmark@Sun.COM 		match_args |= MATCH_IRE_TYPE;
126611042SErik.Nordmark@Sun.COM 		ire_type = IRE_ONLINK;
126711042SErik.Nordmark@Sun.COM 	}
126811042SErik.Nordmark@Sun.COM 
1269*11681SSowmini.Varadhan@Sun.COM retry:
1270*11681SSowmini.Varadhan@Sun.COM 	if (!isv6) {
127111042SErik.Nordmark@Sun.COM 		ipaddr_t	v4nexthop;
127211042SErik.Nordmark@Sun.COM 		ipaddr_t	v4setsrc = INADDR_ANY;
127311042SErik.Nordmark@Sun.COM 
127411042SErik.Nordmark@Sun.COM 		IN6_V4MAPPED_TO_IPADDR(&v6nexthop, v4nexthop);
127511042SErik.Nordmark@Sun.COM 		ire = ire_route_recursive_v4(v4nexthop, ire_type, ill,
127611457SErik.Nordmark@Sun.COM 		    ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE,
127711042SErik.Nordmark@Sun.COM 		    ixa->ixa_xmit_hint, ipst, &v4setsrc, NULL, generationp);
127811042SErik.Nordmark@Sun.COM 		if (setsrcp != NULL)
127911042SErik.Nordmark@Sun.COM 			IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp);
128011042SErik.Nordmark@Sun.COM 	} else {
128111042SErik.Nordmark@Sun.COM 		ire = ire_route_recursive_v6(&v6nexthop, ire_type, ill,
128211457SErik.Nordmark@Sun.COM 		    ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE,
128311042SErik.Nordmark@Sun.COM 		    ixa->ixa_xmit_hint, ipst, setsrcp, NULL, generationp);
128411042SErik.Nordmark@Sun.COM 	}
128511042SErik.Nordmark@Sun.COM 
128611042SErik.Nordmark@Sun.COM #ifdef DEBUG
128711042SErik.Nordmark@Sun.COM 	if (match_args & MATCH_IRE_TESTHIDDEN) {
128811042SErik.Nordmark@Sun.COM 		ip3dbg(("looking for hidden; dst %x ire %p\n",
128911042SErik.Nordmark@Sun.COM 		    v4dst, (void *)ire));
129011042SErik.Nordmark@Sun.COM 	}
129111042SErik.Nordmark@Sun.COM #endif
1292*11681SSowmini.Varadhan@Sun.COM 	if (ill != NULL) {
129311042SErik.Nordmark@Sun.COM 		ill_refrele(ill);
1294*11681SSowmini.Varadhan@Sun.COM 		ill = NULL;
1295*11681SSowmini.Varadhan@Sun.COM 	}
129611042SErik.Nordmark@Sun.COM 	if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
129711042SErik.Nordmark@Sun.COM 	    (ire->ire_type & IRE_MULTICAST)) {
1298*11681SSowmini.Varadhan@Sun.COM 		if (preferred_src_aware) {
1299*11681SSowmini.Varadhan@Sun.COM 			/*
1300*11681SSowmini.Varadhan@Sun.COM 			 * "Preferred Source Aware" send mode. If we cannot
1301*11681SSowmini.Varadhan@Sun.COM 			 * find an ire whose ire_ill had the desired source
1302*11681SSowmini.Varadhan@Sun.COM 			 * address retry after relaxing the ill matching
1303*11681SSowmini.Varadhan@Sun.COM 			 * constraint.
1304*11681SSowmini.Varadhan@Sun.COM 			 */
1305*11681SSowmini.Varadhan@Sun.COM 			ire_refrele(ire);
1306*11681SSowmini.Varadhan@Sun.COM 			preferred_src_aware = B_FALSE;
1307*11681SSowmini.Varadhan@Sun.COM 			match_args &= ~MATCH_IRE_SRC_ILL;
1308*11681SSowmini.Varadhan@Sun.COM 			goto retry;
1309*11681SSowmini.Varadhan@Sun.COM 		}
131011042SErik.Nordmark@Sun.COM 		/* No ire_nce_cache */
131111042SErik.Nordmark@Sun.COM 		return (ire);
131211042SErik.Nordmark@Sun.COM 	}
131311042SErik.Nordmark@Sun.COM 
131411042SErik.Nordmark@Sun.COM 	/* Setup ire_nce_cache if it doesn't exist or is condemned. */
131511042SErik.Nordmark@Sun.COM 	mutex_enter(&ire->ire_lock);
131611042SErik.Nordmark@Sun.COM 	nce = ire->ire_nce_cache;
131711042SErik.Nordmark@Sun.COM 	if (nce == NULL || nce->nce_is_condemned) {
131811042SErik.Nordmark@Sun.COM 		mutex_exit(&ire->ire_lock);
131911042SErik.Nordmark@Sun.COM 		(void) ire_revalidate_nce(ire);
132011042SErik.Nordmark@Sun.COM 	} else {
132111042SErik.Nordmark@Sun.COM 		mutex_exit(&ire->ire_lock);
132211042SErik.Nordmark@Sun.COM 	}
132311042SErik.Nordmark@Sun.COM 	return (ire);
132411042SErik.Nordmark@Sun.COM }
132511042SErik.Nordmark@Sun.COM 
132611042SErik.Nordmark@Sun.COM /*
132711042SErik.Nordmark@Sun.COM  * Find a route given some xmit attributes and a packet.
132811042SErik.Nordmark@Sun.COM  * Generic for IPv4 and IPv6
132911042SErik.Nordmark@Sun.COM  *
133011042SErik.Nordmark@Sun.COM  * This never returns NULL. But when it returns the IRE_NOROUTE
133111042SErik.Nordmark@Sun.COM  * it might set errorp.
133211042SErik.Nordmark@Sun.COM  */
133311042SErik.Nordmark@Sun.COM ire_t *
133411042SErik.Nordmark@Sun.COM ip_select_route_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp,
133511042SErik.Nordmark@Sun.COM     int *errorp, boolean_t *multirtp)
133611042SErik.Nordmark@Sun.COM {
133711042SErik.Nordmark@Sun.COM 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
133811042SErik.Nordmark@Sun.COM 		ipha_t		*ipha = (ipha_t *)mp->b_rptr;
1339*11681SSowmini.Varadhan@Sun.COM 		in6_addr_t	v6dst, v6src;
134011042SErik.Nordmark@Sun.COM 
134111042SErik.Nordmark@Sun.COM 		IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst);
1342*11681SSowmini.Varadhan@Sun.COM 		IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src);
134311042SErik.Nordmark@Sun.COM 
1344*11681SSowmini.Varadhan@Sun.COM 		return (ip_select_route(&v6dst, v6src, ixa, generationp,
134511042SErik.Nordmark@Sun.COM 		    NULL, errorp, multirtp));
134611042SErik.Nordmark@Sun.COM 	} else {
134711042SErik.Nordmark@Sun.COM 		ip6_t	*ip6h = (ip6_t *)mp->b_rptr;
134811042SErik.Nordmark@Sun.COM 
1349*11681SSowmini.Varadhan@Sun.COM 		return (ip_select_route(&ip6h->ip6_dst, ip6h->ip6_src,
1350*11681SSowmini.Varadhan@Sun.COM 		    ixa, generationp, NULL, errorp, multirtp));
135111042SErik.Nordmark@Sun.COM 	}
135211042SErik.Nordmark@Sun.COM }
135311042SErik.Nordmark@Sun.COM 
135411042SErik.Nordmark@Sun.COM ire_t *
1355*11681SSowmini.Varadhan@Sun.COM ip_select_route_v4(ipaddr_t dst, ipaddr_t src, ip_xmit_attr_t *ixa,
1356*11681SSowmini.Varadhan@Sun.COM     uint_t *generationp, ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp)
135711042SErik.Nordmark@Sun.COM {
1358*11681SSowmini.Varadhan@Sun.COM 	in6_addr_t	v6dst, v6src;
135911042SErik.Nordmark@Sun.COM 	ire_t		*ire;
136011042SErik.Nordmark@Sun.COM 	in6_addr_t	setsrc;
136111042SErik.Nordmark@Sun.COM 
136211042SErik.Nordmark@Sun.COM 	ASSERT(ixa->ixa_flags & IXAF_IS_IPV4);
136311042SErik.Nordmark@Sun.COM 
136411042SErik.Nordmark@Sun.COM 	IN6_IPADDR_TO_V4MAPPED(dst, &v6dst);
1365*11681SSowmini.Varadhan@Sun.COM 	IN6_IPADDR_TO_V4MAPPED(src, &v6src);
136611042SErik.Nordmark@Sun.COM 
136711042SErik.Nordmark@Sun.COM 	setsrc = ipv6_all_zeros;
1368*11681SSowmini.Varadhan@Sun.COM 	ire = ip_select_route(&v6dst, v6src, ixa, generationp, &setsrc, errorp,
136911042SErik.Nordmark@Sun.COM 	    multirtp);
137011042SErik.Nordmark@Sun.COM 	if (v4setsrcp != NULL)
137111042SErik.Nordmark@Sun.COM 		IN6_V4MAPPED_TO_IPADDR(&setsrc, *v4setsrcp);
137211042SErik.Nordmark@Sun.COM 	return (ire);
137311042SErik.Nordmark@Sun.COM }
137411042SErik.Nordmark@Sun.COM 
137511042SErik.Nordmark@Sun.COM /*
137611042SErik.Nordmark@Sun.COM  * Recursively look for a route to the destination. Can also match on
137711042SErik.Nordmark@Sun.COM  * the zoneid, ill, and label. Used for the data paths. See also
137811042SErik.Nordmark@Sun.COM  * ire_route_recursive.
137911042SErik.Nordmark@Sun.COM  *
138011457SErik.Nordmark@Sun.COM  * If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never
138111457SErik.Nordmark@Sun.COM  * create an IRE_IF_CLONE. This is used on the receive side when we are not
138211457SErik.Nordmark@Sun.COM  * forwarding.
138311457SErik.Nordmark@Sun.COM  * If IRR_INCOMPLETE is set then we return the IRE even if we can't correctly
138411457SErik.Nordmark@Sun.COM  * resolve the gateway.
138511457SErik.Nordmark@Sun.COM  *
138611042SErik.Nordmark@Sun.COM  * Note that this function never returns NULL. It returns an IRE_NOROUTE
138711042SErik.Nordmark@Sun.COM  * instead.
138811042SErik.Nordmark@Sun.COM  *
138911042SErik.Nordmark@Sun.COM  * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
139011042SErik.Nordmark@Sun.COM  * is an error.
139111042SErik.Nordmark@Sun.COM  * Allow at most one RTF_INDIRECT.
 *
 * Arguments, as they are used by the code below:
 *   ire		if non-NULL it is used for the first iteration in place
 *			of an ire_ftable_lookup_v4() call; an extra hold is
 *			taken on it here, and its generation is taken from
 *			*generationp (or IRE_GENERATION_VERIFY when generationp
 *			is NULL).
 *   nexthop		address looked up on the first pass; when recursing it
 *			is replaced by the ire_gateway_addr of the IRE that
 *			was just found.
 *   ire_type,
 *   zoneid, tsl,
 *   xmit_hint		passed unchanged to every ire_ftable_lookup_v4() call.
 *   ill_arg		ill handed to the lookup until an ill has been picked
 *			up from a found IRE's ire_ill.
 *   match_args		used as given for the first lookup; when recursing only
 *			MATCH_IRE_TYPE is kept, and MATCH_IRE_ILL is added at
 *			the point an ill is taken from ire_ill.
 *   irr_flags		IRR_ALLOCATE and IRR_INCOMPLETE, see above.
 *   setsrcp		optional; must point at INADDR_ANY on entry (asserted).
 *			Receives the first RTF_SETSRC address encountered.
 *   gwattrp		optional; must point at NULL on entry (asserted).
 *			Receives the first non-NULL ire_gw_secattr encountered.
 *   generationp	optional; set to IRE_GENERATION_VERIFY on the error
 *			exit, otherwise to the generation computed at the end
 *			of the function.
 *
 * Return value: on success ires[0], i.e. the first IRE of the chain, with
 * the hold taken here left in place for the caller. On failure an IRE
 * obtained from ire_reject() or ire_blackhole(), or - with IRR_INCOMPLETE -
 * the first IRE found with a fresh hold.
139211042SErik.Nordmark@Sun.COM  */
139311042SErik.Nordmark@Sun.COM ire_t *
139411042SErik.Nordmark@Sun.COM ire_route_recursive_impl_v4(ire_t *ire,
139511042SErik.Nordmark@Sun.COM     ipaddr_t nexthop, uint_t ire_type, const ill_t *ill_arg,
139611042SErik.Nordmark@Sun.COM     zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
139711457SErik.Nordmark@Sun.COM     uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
139811042SErik.Nordmark@Sun.COM     tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
139911042SErik.Nordmark@Sun.COM {
140011042SErik.Nordmark@Sun.COM 	int		i, j;
140111042SErik.Nordmark@Sun.COM 	ire_t		*ires[MAX_IRE_RECURSION];
140211042SErik.Nordmark@Sun.COM 	uint_t		generation;
140311042SErik.Nordmark@Sun.COM 	uint_t		generations[MAX_IRE_RECURSION];
140411042SErik.Nordmark@Sun.COM 	boolean_t	need_refrele = B_FALSE;
140511042SErik.Nordmark@Sun.COM 	boolean_t	invalidate = B_FALSE;
140611042SErik.Nordmark@Sun.COM 	int		prefs[MAX_IRE_RECURSION];
140711042SErik.Nordmark@Sun.COM 	ill_t		*ill = NULL;
140811042SErik.Nordmark@Sun.COM 
140911042SErik.Nordmark@Sun.COM 	if (setsrcp != NULL)
141011042SErik.Nordmark@Sun.COM 		ASSERT(*setsrcp == INADDR_ANY);
141111042SErik.Nordmark@Sun.COM 	if (gwattrp != NULL)
141211042SErik.Nordmark@Sun.COM 		ASSERT(*gwattrp == NULL);
141311042SErik.Nordmark@Sun.COM 
141411042SErik.Nordmark@Sun.COM 	/*
141511042SErik.Nordmark@Sun.COM 	 * We iterate up to three times to resolve a route, even though
141611042SErik.Nordmark@Sun.COM 	 * we have four slots in the array. The extra slot is for an
141711042SErik.Nordmark@Sun.COM 	 * IRE_IF_CLONE we might need to create.
141811042SErik.Nordmark@Sun.COM 	 */
141911042SErik.Nordmark@Sun.COM 	i = 0;
142011042SErik.Nordmark@Sun.COM 	while (i < MAX_IRE_RECURSION - 1) {
142111042SErik.Nordmark@Sun.COM 		/* ire_ftable_lookup handles round-robin/ECMP */
142211042SErik.Nordmark@Sun.COM 		if (ire == NULL) {
142311042SErik.Nordmark@Sun.COM 			ire = ire_ftable_lookup_v4(nexthop, 0, 0, ire_type,
1424*11681SSowmini.Varadhan@Sun.COM 			    (ill != NULL? ill : ill_arg), zoneid, tsl,
142511042SErik.Nordmark@Sun.COM 			    match_args, xmit_hint, ipst, &generation);
142611042SErik.Nordmark@Sun.COM 		} else {
142711042SErik.Nordmark@Sun.COM 			/* Caller passed it; extra hold since we will rele */
142811042SErik.Nordmark@Sun.COM 			ire_refhold(ire);
142911042SErik.Nordmark@Sun.COM 			if (generationp != NULL)
143011042SErik.Nordmark@Sun.COM 				generation = *generationp;
143111042SErik.Nordmark@Sun.COM 			else
143211042SErik.Nordmark@Sun.COM 				generation = IRE_GENERATION_VERIFY;
143311042SErik.Nordmark@Sun.COM 		}
143411042SErik.Nordmark@Sun.COM 		if (ire == NULL)
143511042SErik.Nordmark@Sun.COM 			ire = ire_reject(ipst, B_FALSE);
143611042SErik.Nordmark@Sun.COM 
143711042SErik.Nordmark@Sun.COM 		/* Need to return the ire with RTF_REJECT|BLACKHOLE */
143811042SErik.Nordmark@Sun.COM 		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
143911042SErik.Nordmark@Sun.COM 			goto error;
144011042SErik.Nordmark@Sun.COM 
144111042SErik.Nordmark@Sun.COM 		ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */
144211042SErik.Nordmark@Sun.COM 
144311042SErik.Nordmark@Sun.COM 		if (i != 0) {
144411131SErik.Nordmark@Sun.COM 			prefs[i] = ire_pref(ire);
144511042SErik.Nordmark@Sun.COM 			/*
144611042SErik.Nordmark@Sun.COM 			 * Don't allow anything unusual past the first
144711042SErik.Nordmark@Sun.COM 			 * iteration.
			 * Besides the type check, the ire_pref() value of
			 * each IRE found while recursing has to be strictly
			 * greater than that of the IRE found before it;
			 * otherwise the lookup is failed.
144811042SErik.Nordmark@Sun.COM 			 */
144911042SErik.Nordmark@Sun.COM 			if ((ire->ire_type &
145011042SErik.Nordmark@Sun.COM 			    (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) ||
145111042SErik.Nordmark@Sun.COM 			    prefs[i] <= prefs[i-1]) {
145211042SErik.Nordmark@Sun.COM 				ire_refrele(ire);
145311457SErik.Nordmark@Sun.COM 				if (irr_flags & IRR_INCOMPLETE) {
					/*
					 * Return the first IRE. It needs a
					 * hold of its own since the holds
					 * recorded in ires[] are dropped at
					 * the cleanup label.
					 */
145411457SErik.Nordmark@Sun.COM 					ire = ires[0];
145511457SErik.Nordmark@Sun.COM 					ire_refhold(ire);
145611457SErik.Nordmark@Sun.COM 				} else {
145711457SErik.Nordmark@Sun.COM 					ire = ire_reject(ipst, B_FALSE);
145811457SErik.Nordmark@Sun.COM 				}
145911042SErik.Nordmark@Sun.COM 				goto error;
146011042SErik.Nordmark@Sun.COM 			}
146111042SErik.Nordmark@Sun.COM 		}
146211042SErik.Nordmark@Sun.COM 		/* We have a usable IRE */
146311042SErik.Nordmark@Sun.COM 		ires[i] = ire;
146411042SErik.Nordmark@Sun.COM 		generations[i] = generation;
146511042SErik.Nordmark@Sun.COM 		i++;
146611042SErik.Nordmark@Sun.COM 
146711042SErik.Nordmark@Sun.COM 		/* The first RTF_SETSRC address is passed back if setsrcp */
146811042SErik.Nordmark@Sun.COM 		if ((ire->ire_flags & RTF_SETSRC) &&
146911042SErik.Nordmark@Sun.COM 		    setsrcp != NULL && *setsrcp == INADDR_ANY) {
147011042SErik.Nordmark@Sun.COM 			ASSERT(ire->ire_setsrc_addr != INADDR_ANY);
147111042SErik.Nordmark@Sun.COM 			*setsrcp = ire->ire_setsrc_addr;
147211042SErik.Nordmark@Sun.COM 		}
147311042SErik.Nordmark@Sun.COM 
147411042SErik.Nordmark@Sun.COM 		/* The first ire_gw_secattr is passed back if gwattrp */
147511042SErik.Nordmark@Sun.COM 		if (ire->ire_gw_secattr != NULL &&
147611042SErik.Nordmark@Sun.COM 		    gwattrp != NULL && *gwattrp == NULL)
147711042SErik.Nordmark@Sun.COM 			*gwattrp = ire->ire_gw_secattr;
147811042SErik.Nordmark@Sun.COM 
147911042SErik.Nordmark@Sun.COM 		/*
148011042SErik.Nordmark@Sun.COM 		 * Check if we have a short-cut pointer to an IRE for this
148111042SErik.Nordmark@Sun.COM 		 * destination, and that the cached dependency isn't stale.
148211042SErik.Nordmark@Sun.COM 		 * In that case we've rejoined an existing tree towards a
148311042SErik.Nordmark@Sun.COM 		 * parent, thus we don't need to continue the loop to
148411042SErik.Nordmark@Sun.COM 		 * discover the rest of the tree.
148511042SErik.Nordmark@Sun.COM 		 */
148611042SErik.Nordmark@Sun.COM 		mutex_enter(&ire->ire_lock);
148711042SErik.Nordmark@Sun.COM 		if (ire->ire_dep_parent != NULL &&
148811042SErik.Nordmark@Sun.COM 		    ire->ire_dep_parent->ire_generation ==
148911042SErik.Nordmark@Sun.COM 		    ire->ire_dep_parent_generation) {
149011042SErik.Nordmark@Sun.COM 			mutex_exit(&ire->ire_lock);
			/* The IRE stays recorded in ires[]; done expects NULL */
149111042SErik.Nordmark@Sun.COM 			ire = NULL;
149211042SErik.Nordmark@Sun.COM 			goto done;
149311042SErik.Nordmark@Sun.COM 		}
149411042SErik.Nordmark@Sun.COM 		mutex_exit(&ire->ire_lock);
149511042SErik.Nordmark@Sun.COM 
149611042SErik.Nordmark@Sun.COM 		/*
149711042SErik.Nordmark@Sun.COM 		 * If this type should have an ire_nce_cache (even if it
149811042SErik.Nordmark@Sun.COM 		 * doesn't yet have one) then we are done. Includes
149911042SErik.Nordmark@Sun.COM 		 * IRE_INTERFACE with a full 32 bit mask.
150011042SErik.Nordmark@Sun.COM 		 */
150111042SErik.Nordmark@Sun.COM 		if (ire->ire_nce_capable) {
150211042SErik.Nordmark@Sun.COM 			ire = NULL;
150311042SErik.Nordmark@Sun.COM 			goto done;
150411042SErik.Nordmark@Sun.COM 		}
150511042SErik.Nordmark@Sun.COM 		ASSERT(!(ire->ire_type & IRE_IF_CLONE));
150611042SErik.Nordmark@Sun.COM 		/*
150711042SErik.Nordmark@Sun.COM 		 * For an IRE_INTERFACE we create an IRE_IF_CLONE for this
150811042SErik.Nordmark@Sun.COM 		 * particular destination
150911042SErik.Nordmark@Sun.COM 		 */
151011042SErik.Nordmark@Sun.COM 		if (ire->ire_type & IRE_INTERFACE) {
151111042SErik.Nordmark@Sun.COM 			in6_addr_t	v6nexthop;
151211042SErik.Nordmark@Sun.COM 			ire_t		*clone;
151311042SErik.Nordmark@Sun.COM 
151411042SErik.Nordmark@Sun.COM 			ASSERT(ire->ire_masklen != IPV4_ABITS);
151511042SErik.Nordmark@Sun.COM 
151611042SErik.Nordmark@Sun.COM 			/*
151711042SErik.Nordmark@Sun.COM 			 * In the case of ip_input and ILLF_FORWARDING not
151811457SErik.Nordmark@Sun.COM 			 * being set, and in the case of RTM_GET, there is
151911457SErik.Nordmark@Sun.COM 			 * no point in allocating an IRE_IF_CLONE. We return
152011457SErik.Nordmark@Sun.COM 			 * the IRE_INTERFACE. Note that !IRR_ALLOCATE can
152111457SErik.Nordmark@Sun.COM 			 * result in a ire_dep_parent which is IRE_IF_*
152211457SErik.Nordmark@Sun.COM 			 * without an IRE_IF_CLONE.
152311042SErik.Nordmark@Sun.COM 			 * We recover from that when we need to send packets
152411042SErik.Nordmark@Sun.COM 			 * by ensuring that the generations become
152511042SErik.Nordmark@Sun.COM 			 * IRE_GENERATION_VERIFY in this case.
152611042SErik.Nordmark@Sun.COM 			 */
152711457SErik.Nordmark@Sun.COM 			if (!(irr_flags & IRR_ALLOCATE)) {
152811042SErik.Nordmark@Sun.COM 				invalidate = B_TRUE;
152911042SErik.Nordmark@Sun.COM 				ire = NULL;
153011042SErik.Nordmark@Sun.COM 				goto done;
153111042SErik.Nordmark@Sun.COM 			}
153211042SErik.Nordmark@Sun.COM 
153311042SErik.Nordmark@Sun.COM 			IN6_IPADDR_TO_V4MAPPED(nexthop, &v6nexthop);
153411042SErik.Nordmark@Sun.COM 
153511042SErik.Nordmark@Sun.COM 			clone = ire_create_if_clone(ire, &v6nexthop,
153611042SErik.Nordmark@Sun.COM 			    &generation);
153711042SErik.Nordmark@Sun.COM 			if (clone == NULL) {
153811042SErik.Nordmark@Sun.COM 				/*
153911042SErik.Nordmark@Sun.COM 				 * Temporary failure - no memory.
154011042SErik.Nordmark@Sun.COM 				 * Don't want caller to cache IRE_NOROUTE.
				 * (The error exit always reports
				 * IRE_GENERATION_VERIFY through generationp;
				 * invalidate itself is only read on the
				 * done path.)
154111042SErik.Nordmark@Sun.COM 				 */
154211042SErik.Nordmark@Sun.COM 				invalidate = B_TRUE;
154311042SErik.Nordmark@Sun.COM 				ire = ire_blackhole(ipst, B_FALSE);
154411042SErik.Nordmark@Sun.COM 				goto error;
154511042SErik.Nordmark@Sun.COM 			}
154611042SErik.Nordmark@Sun.COM 			/*
154711042SErik.Nordmark@Sun.COM 			 * Make clone next to last entry and the
154811042SErik.Nordmark@Sun.COM 			 * IRE_INTERFACE the last in the dependency
154911042SErik.Nordmark@Sun.COM 			 * chain since the clone depends on the
155011042SErik.Nordmark@Sun.COM 			 * IRE_INTERFACE.
155111042SErik.Nordmark@Sun.COM 			 */
155211042SErik.Nordmark@Sun.COM 			ASSERT(i >= 1);
155311042SErik.Nordmark@Sun.COM 			ASSERT(i < MAX_IRE_RECURSION);
155411042SErik.Nordmark@Sun.COM 
155511042SErik.Nordmark@Sun.COM 			ires[i] = ires[i-1];
155611042SErik.Nordmark@Sun.COM 			generations[i] = generations[i-1];
155711042SErik.Nordmark@Sun.COM 			ires[i-1] = clone;
155811042SErik.Nordmark@Sun.COM 			generations[i-1] = generation;
155911042SErik.Nordmark@Sun.COM 			i++;
156011042SErik.Nordmark@Sun.COM 
156111042SErik.Nordmark@Sun.COM 			ire = NULL;
156211042SErik.Nordmark@Sun.COM 			goto done;
156311042SErik.Nordmark@Sun.COM 		}
156411042SErik.Nordmark@Sun.COM 
156511042SErik.Nordmark@Sun.COM 		/*
156611042SErik.Nordmark@Sun.COM 		 * We only match on the type and optionally ILL when
156711042SErik.Nordmark@Sun.COM 		 * recursing. The type match is used by some callers
156811042SErik.Nordmark@Sun.COM 		 * to exclude certain types (such as IRE_IF_CLONE or
156911042SErik.Nordmark@Sun.COM 		 * IRE_LOCAL|IRE_LOOPBACK).
1570*11681SSowmini.Varadhan@Sun.COM 		 *
1571*11681SSowmini.Varadhan@Sun.COM 		 * In the MATCH_IRE_SRC_ILL case, ill_arg may be the 'srcof'
1572*11681SSowmini.Varadhan@Sun.COM 		 * ire->ire_ill, and we want to find the IRE_INTERFACE for
1573*11681SSowmini.Varadhan@Sun.COM 		 * ire_ill, so we set ill to the ire_ill;
157411042SErik.Nordmark@Sun.COM 		 */
157511042SErik.Nordmark@Sun.COM 		match_args &= MATCH_IRE_TYPE;
157611042SErik.Nordmark@Sun.COM 		nexthop = ire->ire_gateway_addr;
157711042SErik.Nordmark@Sun.COM 		if (ill == NULL && ire->ire_ill != NULL) {
			/* Held here; released at the error or done label */
157811042SErik.Nordmark@Sun.COM 			ill = ire->ire_ill;
157911042SErik.Nordmark@Sun.COM 			need_refrele = B_TRUE;
158011042SErik.Nordmark@Sun.COM 			ill_refhold(ill);
158111042SErik.Nordmark@Sun.COM 			match_args |= MATCH_IRE_ILL;
158211042SErik.Nordmark@Sun.COM 		}
158311131SErik.Nordmark@Sun.COM 		/*
158411131SErik.Nordmark@Sun.COM 		 * We set the prefs[i] value above if i > 0. We've already
158511131SErik.Nordmark@Sun.COM 		 * done i++ so i is one in the case of the first time around.
158611131SErik.Nordmark@Sun.COM 		 */
158711131SErik.Nordmark@Sun.COM 		if (i == 1)
158811131SErik.Nordmark@Sun.COM 			prefs[0] = ire_pref(ire);
158911042SErik.Nordmark@Sun.COM 		ire = NULL;
159011042SErik.Nordmark@Sun.COM 	}
	/* The iteration limit was reached without resolving the route */
159111042SErik.Nordmark@Sun.COM 	ASSERT(ire == NULL);
159211042SErik.Nordmark@Sun.COM 	ire = ire_reject(ipst, B_FALSE);
159311042SErik.Nordmark@Sun.COM 
	/*
	 * Failure exit. ire is the IRE that will be returned; the i entries
	 * recorded in ires[] are passed to ire_dep_unbuild() and released.
	 */
159411042SErik.Nordmark@Sun.COM error:
159511042SErik.Nordmark@Sun.COM 	ASSERT(ire != NULL);
159611042SErik.Nordmark@Sun.COM 	if (need_refrele)
159711042SErik.Nordmark@Sun.COM 		ill_refrele(ill);
159811042SErik.Nordmark@Sun.COM 
159911042SErik.Nordmark@Sun.COM 	/*
160011042SErik.Nordmark@Sun.COM 	 * In the case of MULTIRT we want to try a different IRE the next
160111042SErik.Nordmark@Sun.COM 	 * time. We let the next packet retry in that case.
160211042SErik.Nordmark@Sun.COM 	 */
160311042SErik.Nordmark@Sun.COM 	if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT))
160411042SErik.Nordmark@Sun.COM 		(void) ire_no_good(ires[0]);
160511042SErik.Nordmark@Sun.COM 
	/* Also entered from the done path when ire_dep_build() fails */
160611042SErik.Nordmark@Sun.COM cleanup:
160711042SErik.Nordmark@Sun.COM 	/* cleanup ires[i] */
160811042SErik.Nordmark@Sun.COM 	ire_dep_unbuild(ires, i);
160911042SErik.Nordmark@Sun.COM 	for (j = 0; j < i; j++)
161011042SErik.Nordmark@Sun.COM 		ire_refrele(ires[j]);
161111042SErik.Nordmark@Sun.COM 
161211457SErik.Nordmark@Sun.COM 	ASSERT((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
161311457SErik.Nordmark@Sun.COM 	    (irr_flags & IRR_INCOMPLETE));
161411042SErik.Nordmark@Sun.COM 	/*
161511042SErik.Nordmark@Sun.COM 	 * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the
161611042SErik.Nordmark@Sun.COM 	 * ip_select_route since the reject or lack of memory might be gone.
161711042SErik.Nordmark@Sun.COM 	 */
161811042SErik.Nordmark@Sun.COM 	if (generationp != NULL)
161911042SErik.Nordmark@Sun.COM 		*generationp = IRE_GENERATION_VERIFY;
162011042SErik.Nordmark@Sun.COM 	return (ire);
162111042SErik.Nordmark@Sun.COM 
	/*
	 * Success exit. ires[0..i-1] holds the chain that was found; every
	 * path that jumps here has set ire to NULL first.
	 */
162211042SErik.Nordmark@Sun.COM done:
162311042SErik.Nordmark@Sun.COM 	ASSERT(ire == NULL);
162411042SErik.Nordmark@Sun.COM 	if (need_refrele) {
162511042SErik.Nordmark@Sun.COM 		ill_refrele(ill);
162611042SErik.Nordmark@Sun.COM 		ill = NULL;
162711042SErik.Nordmark@Sun.COM 	}
162811042SErik.Nordmark@Sun.COM 
162911042SErik.Nordmark@Sun.COM 	/* Build dependencies */
163011131SErik.Nordmark@Sun.COM 	if (i > 1 && !ire_dep_build(ires, generations, i)) {
163111042SErik.Nordmark@Sun.COM 		/* Something in chain was condemned; tear it apart */
163211042SErik.Nordmark@Sun.COM 		ire = ire_reject(ipst, B_FALSE);
163311042SErik.Nordmark@Sun.COM 		goto cleanup;
163411042SErik.Nordmark@Sun.COM 	}
163511042SErik.Nordmark@Sun.COM 
163611042SErik.Nordmark@Sun.COM 	/*
163711042SErik.Nordmark@Sun.COM 	 * Release all refholds except the one for ires[0] that we
163811042SErik.Nordmark@Sun.COM 	 * will return to the caller.
163911042SErik.Nordmark@Sun.COM 	 */
164011042SErik.Nordmark@Sun.COM 	for (j = 1; j < i; j++)
164111042SErik.Nordmark@Sun.COM 		ire_refrele(ires[j]);
164211042SErik.Nordmark@Sun.COM 
164311042SErik.Nordmark@Sun.COM 	if (invalidate) {
164411042SErik.Nordmark@Sun.COM 		/*
164511042SErik.Nordmark@Sun.COM 		 * Since we needed to allocate but couldn't we need to make
164611042SErik.Nordmark@Sun.COM 		 * sure that the dependency chain is rebuilt the next time.
		 * On this path invalidate is only set when IRR_ALLOCATE
		 * was not given and an IRE_INTERFACE was found.
164711042SErik.Nordmark@Sun.COM 		 */
164811042SErik.Nordmark@Sun.COM 		ire_dep_invalidate_generations(ires[0]);
164911042SErik.Nordmark@Sun.COM 		generation = IRE_GENERATION_VERIFY;
165011042SErik.Nordmark@Sun.COM 	} else {
165111042SErik.Nordmark@Sun.COM 		/*
165211042SErik.Nordmark@Sun.COM 		 * IREs can have been added or deleted while we did the
165311042SErik.Nordmark@Sun.COM 		 * recursive lookup and we can't catch those until we've built
165411042SErik.Nordmark@Sun.COM 		 * the dependencies. We verify the stored
165511042SErik.Nordmark@Sun.COM 		 * ire_dep_parent_generation to catch any such changes and
165611042SErik.Nordmark@Sun.COM 		 * return IRE_GENERATION_VERIFY (which will cause
165711042SErik.Nordmark@Sun.COM 		 * ip_select_route to be called again so we can redo the
165811042SErik.Nordmark@Sun.COM 		 * recursive lookup next time we send a packet).
165911042SErik.Nordmark@Sun.COM 		 */
166011131SErik.Nordmark@Sun.COM 		if (ires[0]->ire_dep_parent == NULL)
166111131SErik.Nordmark@Sun.COM 			generation = ires[0]->ire_generation;
166211131SErik.Nordmark@Sun.COM 		else
166311131SErik.Nordmark@Sun.COM 			generation = ire_dep_validate_generations(ires[0]);
166411042SErik.Nordmark@Sun.COM 		if (generations[0] != ires[0]->ire_generation) {
166511042SErik.Nordmark@Sun.COM 			/* Something changed at the top */
166611042SErik.Nordmark@Sun.COM 			generation = IRE_GENERATION_VERIFY;
166711042SErik.Nordmark@Sun.COM 		}
166811042SErik.Nordmark@Sun.COM 	}
166911042SErik.Nordmark@Sun.COM 	if (generationp != NULL)
167011042SErik.Nordmark@Sun.COM 		*generationp = generation;
167111042SErik.Nordmark@Sun.COM 
167211042SErik.Nordmark@Sun.COM 	return (ires[0]);
167311042SErik.Nordmark@Sun.COM }
167411042SErik.Nordmark@Sun.COM 
167511042SErik.Nordmark@Sun.COM ire_t *
167611042SErik.Nordmark@Sun.COM ire_route_recursive_v4(ipaddr_t nexthop, uint_t ire_type, const ill_t *ill,
167711042SErik.Nordmark@Sun.COM     zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
167811457SErik.Nordmark@Sun.COM     uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
167911042SErik.Nordmark@Sun.COM     tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
168011042SErik.Nordmark@Sun.COM {
168111042SErik.Nordmark@Sun.COM 	return (ire_route_recursive_impl_v4(NULL, nexthop, ire_type, ill,
168211457SErik.Nordmark@Sun.COM 	    zoneid, tsl, match_args, irr_flags, xmit_hint, ipst, setsrcp,
168311042SErik.Nordmark@Sun.COM 	    gwattrp, generationp));
168411042SErik.Nordmark@Sun.COM }
168511042SErik.Nordmark@Sun.COM 
168611042SErik.Nordmark@Sun.COM /*
168711042SErik.Nordmark@Sun.COM  * Recursively look for a route to the destination.
168811042SErik.Nordmark@Sun.COM  * We only handle a destination match here, yet we have the same arguments
168911042SErik.Nordmark@Sun.COM  * as the full match to allow function pointers to select between the two.
169011042SErik.Nordmark@Sun.COM  *
169111042SErik.Nordmark@Sun.COM  * Note that this function never returns NULL. It returns an IRE_NOROUTE
169211042SErik.Nordmark@Sun.COM  * instead.
169311042SErik.Nordmark@Sun.COM  *
169411042SErik.Nordmark@Sun.COM  * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
169511042SErik.Nordmark@Sun.COM  * is an error.
169611042SErik.Nordmark@Sun.COM  * Allow at most one RTF_INDIRECT.
169711042SErik.Nordmark@Sun.COM  */
169811042SErik.Nordmark@Sun.COM ire_t *
169911457SErik.Nordmark@Sun.COM ire_route_recursive_dstonly_v4(ipaddr_t nexthop, uint_t irr_flags,
170011042SErik.Nordmark@Sun.COM     uint32_t xmit_hint, ip_stack_t *ipst)
170111042SErik.Nordmark@Sun.COM {
170211042SErik.Nordmark@Sun.COM 	ire_t	*ire;
170311042SErik.Nordmark@Sun.COM 	ire_t	*ire1;
170411042SErik.Nordmark@Sun.COM 	uint_t	generation;
170511042SErik.Nordmark@Sun.COM 
170611042SErik.Nordmark@Sun.COM 	/* ire_ftable_lookup handles round-robin/ECMP */
170711042SErik.Nordmark@Sun.COM 	ire = ire_ftable_lookup_simple_v4(nexthop, xmit_hint, ipst,
170811042SErik.Nordmark@Sun.COM 	    &generation);
170911042SErik.Nordmark@Sun.COM 	ASSERT(ire != NULL);
171011042SErik.Nordmark@Sun.COM 
171111042SErik.Nordmark@Sun.COM 	/*
171211042SErik.Nordmark@Sun.COM 	 * If this type should have an ire_nce_cache (even if it
171311042SErik.Nordmark@Sun.COM 	 * doesn't yet have one) then we are done. Includes
171411042SErik.Nordmark@Sun.COM 	 * IRE_INTERFACE with a full 32 bit mask.
171511042SErik.Nordmark@Sun.COM 	 */
171611042SErik.Nordmark@Sun.COM 	if (ire->ire_nce_capable)
171711042SErik.Nordmark@Sun.COM 		return (ire);
171811042SErik.Nordmark@Sun.COM 
171911042SErik.Nordmark@Sun.COM 	/*
172011042SErik.Nordmark@Sun.COM 	 * If the IRE has a current cached parent we know that the whole
172111042SErik.Nordmark@Sun.COM 	 * parent chain is current, hence we don't need to discover and
172211042SErik.Nordmark@Sun.COM 	 * build any dependencies by doing a recursive lookup.
172311042SErik.Nordmark@Sun.COM 	 */
172411042SErik.Nordmark@Sun.COM 	mutex_enter(&ire->ire_lock);
172511042SErik.Nordmark@Sun.COM 	if (ire->ire_dep_parent != NULL &&
172611042SErik.Nordmark@Sun.COM 	    ire->ire_dep_parent->ire_generation ==
172711042SErik.Nordmark@Sun.COM 	    ire->ire_dep_parent_generation) {
172811042SErik.Nordmark@Sun.COM 		mutex_exit(&ire->ire_lock);
172911042SErik.Nordmark@Sun.COM 		return (ire);
173011042SErik.Nordmark@Sun.COM 	}
173111042SErik.Nordmark@Sun.COM 	mutex_exit(&ire->ire_lock);
173211042SErik.Nordmark@Sun.COM 
173311042SErik.Nordmark@Sun.COM 	/*
173411042SErik.Nordmark@Sun.COM 	 * Fallback to loop in the normal code starting with the ire
173511042SErik.Nordmark@Sun.COM 	 * we found. Normally this would return the same ire.
173611042SErik.Nordmark@Sun.COM 	 */
173711042SErik.Nordmark@Sun.COM 	ire1 = ire_route_recursive_impl_v4(ire, nexthop, 0, NULL, ALL_ZONES,
173811457SErik.Nordmark@Sun.COM 	    NULL, MATCH_IRE_DSTONLY, irr_flags, xmit_hint, ipst, NULL, NULL,
173911042SErik.Nordmark@Sun.COM 	    &generation);
174011042SErik.Nordmark@Sun.COM 	ire_refrele(ire);
174111042SErik.Nordmark@Sun.COM 	return (ire1);
174211042SErik.Nordmark@Sun.COM }
1743