xref: /onnv-gate/usr/src/uts/common/inet/ip/ip_ire.c (revision 5335:269538231bcc)
10Sstevel@tonic-gate /*
20Sstevel@tonic-gate  * CDDL HEADER START
30Sstevel@tonic-gate  *
40Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
51676Sjpk  * Common Development and Distribution License (the "License").
61676Sjpk  * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate  *
80Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate  * See the License for the specific language governing permissions
110Sstevel@tonic-gate  * and limitations under the License.
120Sstevel@tonic-gate  *
130Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate  *
190Sstevel@tonic-gate  * CDDL HEADER END
200Sstevel@tonic-gate  */
210Sstevel@tonic-gate /*
223397Ssangeeta  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
230Sstevel@tonic-gate  * Use is subject to license terms.
240Sstevel@tonic-gate  */
250Sstevel@tonic-gate /* Copyright (c) 1990 Mentat Inc. */
260Sstevel@tonic-gate 
270Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
280Sstevel@tonic-gate 
290Sstevel@tonic-gate 
300Sstevel@tonic-gate /*
310Sstevel@tonic-gate  * This file contains routines that manipulate Internet Routing Entries (IREs).
320Sstevel@tonic-gate  */
330Sstevel@tonic-gate 
340Sstevel@tonic-gate #include <sys/types.h>
350Sstevel@tonic-gate #include <sys/stream.h>
360Sstevel@tonic-gate #include <sys/stropts.h>
370Sstevel@tonic-gate #include <sys/ddi.h>
380Sstevel@tonic-gate #include <sys/cmn_err.h>
390Sstevel@tonic-gate #include <sys/policy.h>
400Sstevel@tonic-gate 
410Sstevel@tonic-gate #include <sys/systm.h>
420Sstevel@tonic-gate #include <sys/kmem.h>
430Sstevel@tonic-gate #include <sys/param.h>
440Sstevel@tonic-gate #include <sys/socket.h>
450Sstevel@tonic-gate #include <net/if.h>
460Sstevel@tonic-gate #include <net/route.h>
470Sstevel@tonic-gate #include <netinet/in.h>
480Sstevel@tonic-gate #include <net/if_dl.h>
490Sstevel@tonic-gate #include <netinet/ip6.h>
500Sstevel@tonic-gate #include <netinet/icmp6.h>
510Sstevel@tonic-gate 
520Sstevel@tonic-gate #include <inet/common.h>
530Sstevel@tonic-gate #include <inet/mi.h>
540Sstevel@tonic-gate #include <inet/ip.h>
550Sstevel@tonic-gate #include <inet/ip6.h>
560Sstevel@tonic-gate #include <inet/ip_ndp.h>
572535Ssangeeta #include <inet/arp.h>
580Sstevel@tonic-gate #include <inet/ip_if.h>
590Sstevel@tonic-gate #include <inet/ip_ire.h>
602535Ssangeeta #include <inet/ip_ftable.h>
610Sstevel@tonic-gate #include <inet/ip_rts.h>
620Sstevel@tonic-gate #include <inet/nd.h>
630Sstevel@tonic-gate 
640Sstevel@tonic-gate #include <net/pfkeyv2.h>
650Sstevel@tonic-gate #include <inet/ipsec_info.h>
660Sstevel@tonic-gate #include <inet/sadb.h>
670Sstevel@tonic-gate #include <sys/kmem.h>
680Sstevel@tonic-gate #include <inet/tcp.h>
690Sstevel@tonic-gate #include <inet/ipclassifier.h>
700Sstevel@tonic-gate #include <sys/zone.h>
713448Sdh155122 #include <sys/cpuvar.h>
723448Sdh155122 
731676Sjpk #include <sys/tsol/label.h>
741676Sjpk #include <sys/tsol/tnet.h>
751676Sjpk 
/*
 * kmem cache handle.
 * NOTE(review): presumably backs the rt_entry structures that form the
 * leaves of the IPv4 forwarding radix tree -- confirm at the
 * kmem_cache_create() call site.
 */
struct kmem_cache	*rt_entry_cache;
772535Ssangeeta 
780Sstevel@tonic-gate /*
790Sstevel@tonic-gate  * Synchronization notes:
800Sstevel@tonic-gate  *
810Sstevel@tonic-gate  * The fields of the ire_t struct are protected in the following way :
820Sstevel@tonic-gate  *
830Sstevel@tonic-gate  * ire_next/ire_ptpn
840Sstevel@tonic-gate  *
850Sstevel@tonic-gate  *	- bucket lock of the respective tables (cache or forwarding tables).
860Sstevel@tonic-gate  *
870Sstevel@tonic-gate  * ire_mp, ire_rfq, ire_stq, ire_u *except* ire_gateway_addr[v6], ire_mask,
880Sstevel@tonic-gate  * ire_type, ire_create_time, ire_masklen, ire_ipversion, ire_flags, ire_ipif,
890Sstevel@tonic-gate  * ire_ihandle, ire_phandle, ire_nce, ire_bucket, ire_in_ill, ire_in_src_addr
900Sstevel@tonic-gate  *
910Sstevel@tonic-gate  *	- Set in ire_create_v4/v6 and never changes after that. Thus,
920Sstevel@tonic-gate  *	  we don't need a lock whenever these fields are accessed.
930Sstevel@tonic-gate  *
940Sstevel@tonic-gate  *	- ire_bucket and ire_masklen (also set in ire_create) is set in
950Sstevel@tonic-gate  *        ire_add_v4/ire_add_v6 before inserting in the bucket and never
960Sstevel@tonic-gate  *        changes after that. Thus we don't need a lock whenever these
970Sstevel@tonic-gate  *	  fields are accessed.
980Sstevel@tonic-gate  *
990Sstevel@tonic-gate  * ire_gateway_addr_v4[v6]
1000Sstevel@tonic-gate  *
1010Sstevel@tonic-gate  *	- ire_gateway_addr_v4[v6] is set during ire_create and later modified
1020Sstevel@tonic-gate  *	  by rts_setgwr[v6]. As ire_gateway_addr is a uint32_t, updates to
1030Sstevel@tonic-gate  *	  it are assumed to be atomic and hence the other parts of the code
1040Sstevel@tonic-gate  *	  do not use any locks. ire_gateway_addr_v6 updates are not atomic
1050Sstevel@tonic-gate  *	  and hence any access to it uses ire_lock to get/set the right value.
1060Sstevel@tonic-gate  *
1070Sstevel@tonic-gate  * ire_ident, ire_refcnt
1080Sstevel@tonic-gate  *
1090Sstevel@tonic-gate  *	- Updated atomically using atomic_add_32
1100Sstevel@tonic-gate  *
1110Sstevel@tonic-gate  * ire_ssthresh, ire_rtt_sd, ire_rtt, ire_ib_pkt_count, ire_ob_pkt_count
1120Sstevel@tonic-gate  *
1130Sstevel@tonic-gate  *	- Assumes that 32 bit writes are atomic. No locks. ire_lock is
1140Sstevel@tonic-gate  *	  used to serialize updates to ire_ssthresh, ire_rtt_sd, ire_rtt.
1150Sstevel@tonic-gate  *
1160Sstevel@tonic-gate  * ire_max_frag, ire_frag_flag
1170Sstevel@tonic-gate  *
1180Sstevel@tonic-gate  *	- ire_lock is used to set/read both of them together.
1190Sstevel@tonic-gate  *
1200Sstevel@tonic-gate  * ire_tire_mark
1210Sstevel@tonic-gate  *
1220Sstevel@tonic-gate  *	- Set in ire_create and updated in ire_expire, which is called
1230Sstevel@tonic-gate  *	  by only one function namely ip_trash_timer_expire. Thus only
1240Sstevel@tonic-gate  *	  one function updates and examines the value.
1250Sstevel@tonic-gate  *
1260Sstevel@tonic-gate  * ire_marks
1270Sstevel@tonic-gate  *	- bucket lock protects this.
1280Sstevel@tonic-gate  *
1290Sstevel@tonic-gate  * ire_ipsec_overhead/ire_ll_hdr_length
1300Sstevel@tonic-gate  *
1310Sstevel@tonic-gate  *	- Place holder for returning the information to the upper layers
1320Sstevel@tonic-gate  *	  when IRE_DB_REQ comes down.
1330Sstevel@tonic-gate  *
1340Sstevel@tonic-gate  *
1350Sstevel@tonic-gate  * ipv6_ire_default_count is protected by the bucket lock of
1360Sstevel@tonic-gate  * ip_forwarding_table_v6[0][0].
1370Sstevel@tonic-gate  *
1382535Ssangeeta  * ipv6_ire_default_index is not protected as it  is just a hint
1392535Ssangeeta  * at which default gateway to use. There is nothing
1400Sstevel@tonic-gate  * wrong in using the same gateway for two different connections.
1410Sstevel@tonic-gate  *
1420Sstevel@tonic-gate  * As we always hold the bucket locks in all the places while accessing
1430Sstevel@tonic-gate  * the above values, it is natural to use them for protecting them.
1440Sstevel@tonic-gate  *
1450Sstevel@tonic-gate  * We have a separate cache table and forwarding table for IPv4 and IPv6.
1460Sstevel@tonic-gate  * Cache table (ip_cache_table/ip_cache_table_v6) is a pointer to an
147*5335Ssowmini  * array of irb_t structures. The IPv6 forwarding table
148*5335Ssowmini  * (ip_forwarding_table_v6) is an array of pointers to arrays of irb_t
149*5335Ssowmini  *  structure. ip_forwarding_table_v6 is allocated dynamically in
1503448Sdh155122  * ire_add_v6. ire_ft_init_lock is used to serialize multiple threads
1510Sstevel@tonic-gate  * initializing the same bucket. Once a bucket is initialized, it is never
1523448Sdh155122  * de-allocated. This assumption enables us to access
1533448Sdh155122  * ip_forwarding_table_v6[i] without any locks.
1540Sstevel@tonic-gate  *
155*5335Ssowmini  * The forwarding table for IPv4 is a radix tree whose leaves
156*5335Ssowmini  * are rt_entry structures containing the irb_t for the rt_dst. The irb_t
157*5335Ssowmini  * for IPv4 is dynamically allocated and freed.
158*5335Ssowmini  *
1590Sstevel@tonic-gate  * Each irb_t - ire bucket structure has a lock to protect
1600Sstevel@tonic-gate  * a bucket and the ires residing in the bucket have a back pointer to
1610Sstevel@tonic-gate  * the bucket structure. It also has a reference count for the number
1620Sstevel@tonic-gate  * of threads walking the bucket - irb_refcnt which is bumped up
1630Sstevel@tonic-gate  * using the macro IRB_REFHOLD macro. The flags irb_flags can be
1640Sstevel@tonic-gate  * set to IRE_MARK_CONDEMNED indicating that there are some ires
1650Sstevel@tonic-gate  * in this bucket that are marked with IRE_MARK_CONDEMNED and the
1660Sstevel@tonic-gate  * last thread to leave the bucket should delete the ires. Usually
1670Sstevel@tonic-gate  * this is done by the IRB_REFRELE macro which is used to decrement
168*5335Ssowmini  * the reference count on a bucket. See comments above irb_t structure
169*5335Ssowmini  * definition in ip.h for further details.
1700Sstevel@tonic-gate  *
1710Sstevel@tonic-gate  * IRE_REFHOLD/IRE_REFRELE macros operate on the ire which increments/
1720Sstevel@tonic-gate  * decrements the reference count, ire_refcnt, atomically on the ire.
1730Sstevel@tonic-gate  * ire_refcnt is modified only using this macro. Operations on the IRE
1740Sstevel@tonic-gate  * could be described as follows :
1750Sstevel@tonic-gate  *
1760Sstevel@tonic-gate  * CREATE an ire with reference count initialized to 1.
1770Sstevel@tonic-gate  *
1780Sstevel@tonic-gate  * ADDITION of an ire holds the bucket lock, checks for duplicates
1790Sstevel@tonic-gate  * and then adds the ire. ire_add_v4/ire_add_v6 returns the ire after
1800Sstevel@tonic-gate  * bumping up once more i.e the reference count is 2. This is to avoid
1810Sstevel@tonic-gate  * an extra lookup in the functions calling ire_add which wants to
1820Sstevel@tonic-gate  * work with the ire after adding.
1830Sstevel@tonic-gate  *
1840Sstevel@tonic-gate  * LOOKUP of an ire bumps up the reference count using IRE_REFHOLD
1850Sstevel@tonic-gate  * macro. It is valid to bump up the reference count of the IRE,
1860Sstevel@tonic-gate  * after the lookup has returned an ire. Following are the lookup
1870Sstevel@tonic-gate  * functions that return a HELD ire :
1880Sstevel@tonic-gate  *
1890Sstevel@tonic-gate  * ire_lookup_local[_v6], ire_ctable_lookup[_v6], ire_ftable_lookup[_v6],
1900Sstevel@tonic-gate  * ire_cache_lookup[_v6], ire_lookup_multi[_v6], ire_route_lookup[_v6],
1914823Sseb  * ipif_to_ire[_v6].
1920Sstevel@tonic-gate  *
1930Sstevel@tonic-gate  * DELETION of an ire holds the bucket lock, removes it from the list
1940Sstevel@tonic-gate  * and then decrements the reference count for having removed from the list
1950Sstevel@tonic-gate  * by using the IRE_REFRELE macro. If some other thread has looked up
1960Sstevel@tonic-gate  * the ire, the reference count would have been bumped up and hence
1970Sstevel@tonic-gate  * this ire will not be freed once deleted. It will be freed once the
1980Sstevel@tonic-gate  * reference count drops to zero.
1990Sstevel@tonic-gate  *
2000Sstevel@tonic-gate  * Add and Delete acquires the bucket lock as RW_WRITER, while all the
2010Sstevel@tonic-gate  * lookups acquire the bucket lock as RW_READER.
2020Sstevel@tonic-gate  *
2030Sstevel@tonic-gate  * NOTE : The only functions that do the IRE_REFRELE when an ire is
2040Sstevel@tonic-gate  *	  passed as an argument are :
2050Sstevel@tonic-gate  *
2060Sstevel@tonic-gate  *	  1) ip_wput_ire : This is because it IRE_REFHOLD/RELEs the
2070Sstevel@tonic-gate  *			   broadcast ires it looks up internally within
2080Sstevel@tonic-gate  *			   the function. Currently, for simplicity it does
2090Sstevel@tonic-gate  *			   not differentiate the one that is passed in and
2100Sstevel@tonic-gate  *			   the ones it looks up internally. It always
2110Sstevel@tonic-gate  *			   IRE_REFRELEs.
2120Sstevel@tonic-gate  *	  2) ire_send
2130Sstevel@tonic-gate  *	     ire_send_v6 : As ire_send calls ip_wput_ire and other functions
2140Sstevel@tonic-gate  *			   that take ire as an argument, it has to selectively
2150Sstevel@tonic-gate  *			   IRE_REFRELE the ire. To maintain symmetry,
2160Sstevel@tonic-gate  *			   ire_send_v6 does the same.
2170Sstevel@tonic-gate  *
2180Sstevel@tonic-gate  * Otherwise, the general rule is to do the IRE_REFRELE in the function
2190Sstevel@tonic-gate  * that is passing the ire as an argument.
2200Sstevel@tonic-gate  *
2210Sstevel@tonic-gate  * In trying to locate ires the following points are to be noted.
2220Sstevel@tonic-gate  *
2230Sstevel@tonic-gate  * IRE_MARK_CONDEMNED signifies that the ire has been logically deleted and is
2240Sstevel@tonic-gate  * to be ignored when walking the ires using ire_next.
2250Sstevel@tonic-gate  *
2260Sstevel@tonic-gate  * IRE_MARK_HIDDEN signifies that the ire is a special ire typically for the
2270Sstevel@tonic-gate  * benefit of in.mpathd which needs to probe interfaces for failures. Normal
2280Sstevel@tonic-gate  * applications should not be seeing this ire and hence this ire is ignored
2290Sstevel@tonic-gate  * in most cases in the search using ire_next.
2300Sstevel@tonic-gate  *
2310Sstevel@tonic-gate  * Zones note:
2320Sstevel@tonic-gate  *	Walking IREs within a given zone also walks certain ires in other
2330Sstevel@tonic-gate  *	zones.  This is done intentionally.  IRE walks with a specified
2340Sstevel@tonic-gate  *	zoneid are used only when doing informational reports, and
2350Sstevel@tonic-gate  *	zone users want to see things that they can access. See block
2360Sstevel@tonic-gate  *	comment in ire_walk_ill_match().
2370Sstevel@tonic-gate  */
2380Sstevel@tonic-gate 
2390Sstevel@tonic-gate /*
2400Sstevel@tonic-gate  * The minimum size of IRE cache table.  It will be recalcuated in
2410Sstevel@tonic-gate  * ip_ire_init().
2423448Sdh155122  * Setable in /etc/system
2430Sstevel@tonic-gate  */
2440Sstevel@tonic-gate uint32_t ip_cache_table_size = IP_CACHE_TABLE_SIZE;
2450Sstevel@tonic-gate uint32_t ip6_cache_table_size = IP6_CACHE_TABLE_SIZE;
2460Sstevel@tonic-gate 
2470Sstevel@tonic-gate /*
2480Sstevel@tonic-gate  * The size of the forwarding table.  We will make sure that it is a
2490Sstevel@tonic-gate  * power of 2 in ip_ire_init().
2503448Sdh155122  * Setable in /etc/system
2510Sstevel@tonic-gate  */
2520Sstevel@tonic-gate uint32_t ip6_ftable_hash_size = IP6_FTABLE_HASH_SIZE;
2530Sstevel@tonic-gate 
2540Sstevel@tonic-gate struct	kmem_cache	*ire_cache;
2550Sstevel@tonic-gate static ire_t	ire_null;
2560Sstevel@tonic-gate 
2570Sstevel@tonic-gate /*
2580Sstevel@tonic-gate  * The threshold number of IRE in a bucket when the IREs are
2590Sstevel@tonic-gate  * cleaned up.  This threshold is calculated later in ip_open()
2600Sstevel@tonic-gate  * based on the speed of CPU and available memory.  This default
2610Sstevel@tonic-gate  * value is the maximum.
2620Sstevel@tonic-gate  *
2630Sstevel@tonic-gate  * We have two kinds of cached IRE, temporary and
2640Sstevel@tonic-gate  * non-temporary.  Temporary IREs are marked with
2650Sstevel@tonic-gate  * IRE_MARK_TEMPORARY.  They are IREs created for non
2660Sstevel@tonic-gate  * TCP traffic and for forwarding purposes.  All others
2670Sstevel@tonic-gate  * are non-temporary IREs.  We don't mark IRE created for
2680Sstevel@tonic-gate  * TCP as temporary because TCP is stateful and there are
2690Sstevel@tonic-gate  * info stored in the IRE which can be shared by other TCP
2700Sstevel@tonic-gate  * connections to the same destination.  For connected
2710Sstevel@tonic-gate  * endpoint, we also don't want to mark the IRE used as
2720Sstevel@tonic-gate  * temporary because the same IRE will be used frequently,
2730Sstevel@tonic-gate  * otherwise, the app should not do a connect().  We change
2740Sstevel@tonic-gate  * the marking at ip_bind_connected_*() if necessary.
2750Sstevel@tonic-gate  *
2760Sstevel@tonic-gate  * We want to keep the cache IRE hash bucket length reasonably
2770Sstevel@tonic-gate  * short, otherwise IRE lookup functions will take "forever."
2780Sstevel@tonic-gate  * We use the "crude" function that the IRE bucket
2790Sstevel@tonic-gate  * length should be based on the CPU speed, which is 1 entry
2800Sstevel@tonic-gate  * per x MHz, depending on the shift factor ip_ire_cpu_ratio
2810Sstevel@tonic-gate  * (n).  This means that with a 750MHz CPU, the max bucket
2820Sstevel@tonic-gate  * length can be (750 >> n) entries.
2830Sstevel@tonic-gate  *
2840Sstevel@tonic-gate  * Note that this threshold is separate for temp and non-temp
2850Sstevel@tonic-gate  * IREs.  This means that the actual bucket length can be
2860Sstevel@tonic-gate  * twice as that.  And while we try to keep temporary IRE
2870Sstevel@tonic-gate  * length at most at the threshold value, we do not attempt to
2880Sstevel@tonic-gate  * make the length for non-temporary IREs fixed, for the
2890Sstevel@tonic-gate  * reason stated above.  Instead, we start trying to find
2900Sstevel@tonic-gate  * "unused" non-temporary IREs when the bucket length reaches
2910Sstevel@tonic-gate  * this threshold and clean them up.
2920Sstevel@tonic-gate  *
2930Sstevel@tonic-gate  * We also want to limit the amount of memory used by
2940Sstevel@tonic-gate  * IREs.  So if we are allowed to use ~3% of memory (M)
2950Sstevel@tonic-gate  * for those IREs, each bucket should not have more than
2960Sstevel@tonic-gate  *
2970Sstevel@tonic-gate  * 	M / num of cache bucket / sizeof (ire_t)
2980Sstevel@tonic-gate  *
2990Sstevel@tonic-gate  * Again the above memory uses are separate for temp and
3000Sstevel@tonic-gate  * non-temp cached IREs.
3010Sstevel@tonic-gate  *
3020Sstevel@tonic-gate  * We may also want the limit to be a function of the number
3030Sstevel@tonic-gate  * of interfaces and number of CPUs.  Doing the initialization
3040Sstevel@tonic-gate  * in ip_open() means that every time an interface is plumbed,
3050Sstevel@tonic-gate  * the max is re-calculated.  Right now, we don't do anything
3060Sstevel@tonic-gate  * different.  In future, when we have more experience, we
3070Sstevel@tonic-gate  * may want to change this behavior.
3080Sstevel@tonic-gate  */
/* Default (maximum) bucket-length thresholds; settable in /etc/system. */
uint32_t	ip_ire_max_bucket_cnt = 10;
uint32_t	ip6_ire_max_bucket_cnt = 10;
3110Sstevel@tonic-gate 
/*
 * Floor for the temporary IRE bucket count.  A bucket that is too short
 * gets its temporary IREs removed too often, which may hurt the
 * performance of some apps.
 */
uint32_t	ip_ire_min_bucket_cnt = 3;	/* /etc/system - not used */
uint32_t	ip6_ire_min_bucket_cnt = 3;
3200Sstevel@tonic-gate 
/*
 * Ratio of the memory consumed by temporary IREs to the available
 * memory, expressed as a shift factor: 6 stands for a ratio of 1 to 64.
 * 6 is a reasonable number; the value can be changed in /etc/system.
 */
uint32_t	ip_ire_mem_ratio = 6;	/* /etc/system */

/* Shift factor applied to the CPU speed to get the max IRE bucket length. */
uint32_t	ip_ire_cpu_ratio = 7;	/* /etc/system */
3290Sstevel@tonic-gate 
3302535Ssangeeta typedef struct nce_clookup_s {
3312535Ssangeeta 	ipaddr_t ncecl_addr;
3322535Ssangeeta 	boolean_t ncecl_found;
3332535Ssangeeta } nce_clookup_t;
3342535Ssangeeta 
/*
 * Upper bound on the number of buckets in the IRE cache table.  The
 * table size is fixed for now and the table is allocated in
 * ip_ire_init() when IP is first loaded, taking the amount of memory in
 * the system into account.  A dynamic hash table may replace this in
 * the future.
 */
#define	IP_MAX_CACHE_TABLE_SIZE	4096

/* Per IP version copies of the bound; settable in /etc/system. */
static uint32_t	ip_max_cache_table_size = IP_MAX_CACHE_TABLE_SIZE;
static uint32_t	ip6_max_cache_table_size = IP_MAX_CACHE_TABLE_SIZE;
3460Sstevel@tonic-gate 
/* To build the ILL list to unlock */
#define	NUM_ILLS	2
3480Sstevel@tonic-gate 
3490Sstevel@tonic-gate /* Zero iulp_t for initialization. */
3500Sstevel@tonic-gate const iulp_t	ire_uinfo_null = { 0 };
3510Sstevel@tonic-gate 
3520Sstevel@tonic-gate static int	ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp,
3532535Ssangeeta     ipsq_func_t func, boolean_t);
3540Sstevel@tonic-gate static void	ire_delete_v4(ire_t *ire);
3550Sstevel@tonic-gate static void	ire_report_ctable(ire_t *ire, char *mp);
3561676Sjpk static void	ire_walk_ipvers(pfv_t func, void *arg, uchar_t vers,
3573448Sdh155122     zoneid_t zoneid, ip_stack_t *);
3580Sstevel@tonic-gate static void	ire_walk_ill_ipvers(uint_t match_flags, uint_t ire_type,
3591676Sjpk     pfv_t func, void *arg, uchar_t vers, ill_t *ill);
3600Sstevel@tonic-gate static void	ire_cache_cleanup(irb_t *irb, uint32_t threshold, int cnt);
3612535Ssangeeta static	void	ip_nce_clookup_and_delete(nce_t *nce, void *arg);
3625023Scarlsonj #ifdef DEBUG
3635023Scarlsonj static void	ire_trace_cleanup(const ire_t *);
3640Sstevel@tonic-gate #endif
3650Sstevel@tonic-gate 
3660Sstevel@tonic-gate /*
3670Sstevel@tonic-gate  * To avoid bloating the code, we call this function instead of
3680Sstevel@tonic-gate  * using the macro IRE_REFRELE. Use macro only in performance
3690Sstevel@tonic-gate  * critical paths.
3700Sstevel@tonic-gate  *
3710Sstevel@tonic-gate  * Must not be called while holding any locks. Otherwise if this is
3720Sstevel@tonic-gate  * the last reference to be released there is a chance of recursive mutex
3730Sstevel@tonic-gate  * panic due to ire_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying
3740Sstevel@tonic-gate  * to restart an ioctl. The one exception is when the caller is sure that
3750Sstevel@tonic-gate  * this is not the last reference to be released. Eg. if the caller is
3760Sstevel@tonic-gate  * sure that the ire has not been deleted and won't be deleted.
3770Sstevel@tonic-gate  */
3780Sstevel@tonic-gate void
3790Sstevel@tonic-gate ire_refrele(ire_t *ire)
3800Sstevel@tonic-gate {
3810Sstevel@tonic-gate 	IRE_REFRELE(ire);
3820Sstevel@tonic-gate }
3830Sstevel@tonic-gate 
3840Sstevel@tonic-gate void
3850Sstevel@tonic-gate ire_refrele_notr(ire_t *ire)
3860Sstevel@tonic-gate {
3870Sstevel@tonic-gate 	IRE_REFRELE_NOTR(ire);
3880Sstevel@tonic-gate }
3890Sstevel@tonic-gate 
3900Sstevel@tonic-gate /*
3910Sstevel@tonic-gate  * kmem_cache_alloc constructor for IRE in kma space.
3920Sstevel@tonic-gate  * Note that when ire_mp is set the IRE is stored in that mblk and
3930Sstevel@tonic-gate  * not in this cache.
3940Sstevel@tonic-gate  */
3950Sstevel@tonic-gate /* ARGSUSED */
3960Sstevel@tonic-gate static int
3970Sstevel@tonic-gate ip_ire_constructor(void *buf, void *cdrarg, int kmflags)
3980Sstevel@tonic-gate {
3990Sstevel@tonic-gate 	ire_t	*ire = buf;
4000Sstevel@tonic-gate 
4012535Ssangeeta 	ire->ire_nce = NULL;
4020Sstevel@tonic-gate 
4030Sstevel@tonic-gate 	return (0);
4040Sstevel@tonic-gate }
4050Sstevel@tonic-gate 
4060Sstevel@tonic-gate /* ARGSUSED1 */
4070Sstevel@tonic-gate static void
4080Sstevel@tonic-gate ip_ire_destructor(void *buf, void *cdrarg)
4090Sstevel@tonic-gate {
4100Sstevel@tonic-gate 	ire_t	*ire = buf;
4110Sstevel@tonic-gate 
4122535Ssangeeta 	ASSERT(ire->ire_nce == NULL);
4130Sstevel@tonic-gate }
4140Sstevel@tonic-gate 
4150Sstevel@tonic-gate /*
4160Sstevel@tonic-gate  * This function is associated with the IP_IOC_IRE_ADVISE_NO_REPLY
4170Sstevel@tonic-gate  * IOCTL.  It is used by TCP (or other ULPs) to supply revised information
4180Sstevel@tonic-gate  * for an existing CACHED IRE.
4190Sstevel@tonic-gate  */
4200Sstevel@tonic-gate /* ARGSUSED */
4210Sstevel@tonic-gate int
4220Sstevel@tonic-gate ip_ire_advise(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
4230Sstevel@tonic-gate {
4240Sstevel@tonic-gate 	uchar_t	*addr_ucp;
4250Sstevel@tonic-gate 	ipic_t	*ipic;
4260Sstevel@tonic-gate 	ire_t	*ire;
4270Sstevel@tonic-gate 	ipaddr_t	addr;
4280Sstevel@tonic-gate 	in6_addr_t	v6addr;
4290Sstevel@tonic-gate 	irb_t	*irb;
4300Sstevel@tonic-gate 	zoneid_t	zoneid;
4313448Sdh155122 	ip_stack_t	*ipst = CONNQ_TO_IPST(q);
4320Sstevel@tonic-gate 
4330Sstevel@tonic-gate 	ASSERT(q->q_next == NULL);
4340Sstevel@tonic-gate 	zoneid = Q_TO_CONN(q)->conn_zoneid;
4350Sstevel@tonic-gate 
4360Sstevel@tonic-gate 	/*
4370Sstevel@tonic-gate 	 * Check privilege using the ioctl credential; if it is NULL
4380Sstevel@tonic-gate 	 * then this is a kernel message and therefor privileged.
4390Sstevel@tonic-gate 	 */
4403448Sdh155122 	if (ioc_cr != NULL && secpolicy_ip_config(ioc_cr, B_FALSE) != 0)
4410Sstevel@tonic-gate 		return (EPERM);
4420Sstevel@tonic-gate 
4430Sstevel@tonic-gate 	ipic = (ipic_t *)mp->b_rptr;
4440Sstevel@tonic-gate 	if (!(addr_ucp = mi_offset_param(mp, ipic->ipic_addr_offset,
4450Sstevel@tonic-gate 	    ipic->ipic_addr_length))) {
4460Sstevel@tonic-gate 		return (EINVAL);
4470Sstevel@tonic-gate 	}
4480Sstevel@tonic-gate 	if (!OK_32PTR(addr_ucp))
4490Sstevel@tonic-gate 		return (EINVAL);
4500Sstevel@tonic-gate 	switch (ipic->ipic_addr_length) {
4510Sstevel@tonic-gate 	case IP_ADDR_LEN: {
4520Sstevel@tonic-gate 		/* Extract the destination address. */
4530Sstevel@tonic-gate 		addr = *(ipaddr_t *)addr_ucp;
4540Sstevel@tonic-gate 		/* Find the corresponding IRE. */
4553448Sdh155122 		ire = ire_cache_lookup(addr, zoneid, NULL, ipst);
4560Sstevel@tonic-gate 		break;
4570Sstevel@tonic-gate 	}
4580Sstevel@tonic-gate 	case IPV6_ADDR_LEN: {
4590Sstevel@tonic-gate 		/* Extract the destination address. */
4600Sstevel@tonic-gate 		v6addr = *(in6_addr_t *)addr_ucp;
4610Sstevel@tonic-gate 		/* Find the corresponding IRE. */
4623448Sdh155122 		ire = ire_cache_lookup_v6(&v6addr, zoneid, NULL, ipst);
4630Sstevel@tonic-gate 		break;
4640Sstevel@tonic-gate 	}
4650Sstevel@tonic-gate 	default:
4660Sstevel@tonic-gate 		return (EINVAL);
4670Sstevel@tonic-gate 	}
4680Sstevel@tonic-gate 
4690Sstevel@tonic-gate 	if (ire == NULL)
4700Sstevel@tonic-gate 		return (ENOENT);
4710Sstevel@tonic-gate 	/*
4720Sstevel@tonic-gate 	 * Update the round trip time estimate and/or the max frag size
4730Sstevel@tonic-gate 	 * and/or the slow start threshold.
4740Sstevel@tonic-gate 	 *
4750Sstevel@tonic-gate 	 * We serialize multiple advises using ire_lock.
4760Sstevel@tonic-gate 	 */
4770Sstevel@tonic-gate 	mutex_enter(&ire->ire_lock);
4780Sstevel@tonic-gate 	if (ipic->ipic_rtt) {
4790Sstevel@tonic-gate 		/*
4800Sstevel@tonic-gate 		 * If there is no old cached values, initialize them
4810Sstevel@tonic-gate 		 * conservatively.  Set them to be (1.5 * new value).
4820Sstevel@tonic-gate 		 */
4830Sstevel@tonic-gate 		if (ire->ire_uinfo.iulp_rtt != 0) {
4840Sstevel@tonic-gate 			ire->ire_uinfo.iulp_rtt = (ire->ire_uinfo.iulp_rtt +
4850Sstevel@tonic-gate 			    ipic->ipic_rtt) >> 1;
4860Sstevel@tonic-gate 		} else {
4870Sstevel@tonic-gate 			ire->ire_uinfo.iulp_rtt = ipic->ipic_rtt +
4880Sstevel@tonic-gate 			    (ipic->ipic_rtt >> 1);
4890Sstevel@tonic-gate 		}
4900Sstevel@tonic-gate 		if (ire->ire_uinfo.iulp_rtt_sd != 0) {
4910Sstevel@tonic-gate 			ire->ire_uinfo.iulp_rtt_sd =
4920Sstevel@tonic-gate 			    (ire->ire_uinfo.iulp_rtt_sd +
4930Sstevel@tonic-gate 			    ipic->ipic_rtt_sd) >> 1;
4940Sstevel@tonic-gate 		} else {
4950Sstevel@tonic-gate 			ire->ire_uinfo.iulp_rtt_sd = ipic->ipic_rtt_sd +
4960Sstevel@tonic-gate 			    (ipic->ipic_rtt_sd >> 1);
4970Sstevel@tonic-gate 		}
4980Sstevel@tonic-gate 	}
4990Sstevel@tonic-gate 	if (ipic->ipic_max_frag)
5000Sstevel@tonic-gate 		ire->ire_max_frag = MIN(ipic->ipic_max_frag, IP_MAXPACKET);
5010Sstevel@tonic-gate 	if (ipic->ipic_ssthresh != 0) {
5020Sstevel@tonic-gate 		if (ire->ire_uinfo.iulp_ssthresh != 0)
5030Sstevel@tonic-gate 			ire->ire_uinfo.iulp_ssthresh =
5040Sstevel@tonic-gate 			    (ipic->ipic_ssthresh +
5050Sstevel@tonic-gate 			    ire->ire_uinfo.iulp_ssthresh) >> 1;
5060Sstevel@tonic-gate 		else
5070Sstevel@tonic-gate 			ire->ire_uinfo.iulp_ssthresh = ipic->ipic_ssthresh;
5080Sstevel@tonic-gate 	}
5090Sstevel@tonic-gate 	/*
5100Sstevel@tonic-gate 	 * Don't need the ire_lock below this. ire_type does not change
5110Sstevel@tonic-gate 	 * after initialization. ire_marks is protected by irb_lock.
5120Sstevel@tonic-gate 	 */
5130Sstevel@tonic-gate 	mutex_exit(&ire->ire_lock);
5140Sstevel@tonic-gate 
5150Sstevel@tonic-gate 	if (ipic->ipic_ire_marks != 0 && ire->ire_type == IRE_CACHE) {
5160Sstevel@tonic-gate 		/*
5170Sstevel@tonic-gate 		 * Only increment the temporary IRE count if the original
5180Sstevel@tonic-gate 		 * IRE is not already marked temporary.
5190Sstevel@tonic-gate 		 */
5200Sstevel@tonic-gate 		irb = ire->ire_bucket;
5210Sstevel@tonic-gate 		rw_enter(&irb->irb_lock, RW_WRITER);
5220Sstevel@tonic-gate 		if ((ipic->ipic_ire_marks & IRE_MARK_TEMPORARY) &&
5230Sstevel@tonic-gate 		    !(ire->ire_marks & IRE_MARK_TEMPORARY)) {
5240Sstevel@tonic-gate 			irb->irb_tmp_ire_cnt++;
5250Sstevel@tonic-gate 		}
5260Sstevel@tonic-gate 		ire->ire_marks |= ipic->ipic_ire_marks;
5270Sstevel@tonic-gate 		rw_exit(&irb->irb_lock);
5280Sstevel@tonic-gate 	}
5290Sstevel@tonic-gate 
5300Sstevel@tonic-gate 	ire_refrele(ire);
5310Sstevel@tonic-gate 	return (0);
5320Sstevel@tonic-gate }
5330Sstevel@tonic-gate 
5340Sstevel@tonic-gate /*
5350Sstevel@tonic-gate  * This function is associated with the IP_IOC_IRE_DELETE[_NO_REPLY]
5360Sstevel@tonic-gate  * IOCTL[s].  The NO_REPLY form is used by TCP to delete a route IRE
5370Sstevel@tonic-gate  * for a host that is not responding.  This will force an attempt to
5382612Scarlsonj  * establish a new route, if available, and flush out the ARP entry so
5392612Scarlsonj  * it will re-resolve.  Management processes may want to use the
5402612Scarlsonj  * version that generates a reply.
5410Sstevel@tonic-gate  *
5420Sstevel@tonic-gate  * This function does not support IPv6 since Neighbor Unreachability Detection
5430Sstevel@tonic-gate  * means that negative advice like this is useless.
5440Sstevel@tonic-gate  */
5450Sstevel@tonic-gate /* ARGSUSED */
5460Sstevel@tonic-gate int
5470Sstevel@tonic-gate ip_ire_delete(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
5480Sstevel@tonic-gate {
5492535Ssangeeta 	uchar_t		*addr_ucp;
5500Sstevel@tonic-gate 	ipaddr_t	addr;
5512535Ssangeeta 	ire_t		*ire;
5522535Ssangeeta 	ipid_t		*ipid;
5532535Ssangeeta 	boolean_t	routing_sock_info = B_FALSE;	/* Sent info? */
5540Sstevel@tonic-gate 	zoneid_t	zoneid;
5552535Ssangeeta 	ire_t		*gire = NULL;
5562612Scarlsonj 	ill_t		*ill;
5572612Scarlsonj 	mblk_t		*arp_mp;
5583448Sdh155122 	ip_stack_t	*ipst;
5590Sstevel@tonic-gate 
5600Sstevel@tonic-gate 	ASSERT(q->q_next == NULL);
5610Sstevel@tonic-gate 	zoneid = Q_TO_CONN(q)->conn_zoneid;
5623448Sdh155122 	ipst = CONNQ_TO_IPST(q);
5630Sstevel@tonic-gate 
5640Sstevel@tonic-gate 	/*
5650Sstevel@tonic-gate 	 * Check privilege using the ioctl credential; if it is NULL
5660Sstevel@tonic-gate 	 * then this is a kernel message and therefor privileged.
5670Sstevel@tonic-gate 	 */
5683448Sdh155122 	if (ioc_cr != NULL && secpolicy_ip_config(ioc_cr, B_FALSE) != 0)
5690Sstevel@tonic-gate 		return (EPERM);
5700Sstevel@tonic-gate 
5710Sstevel@tonic-gate 	ipid = (ipid_t *)mp->b_rptr;
5720Sstevel@tonic-gate 
5730Sstevel@tonic-gate 	/* Only actions on IRE_CACHEs are acceptable at present. */
5740Sstevel@tonic-gate 	if (ipid->ipid_ire_type != IRE_CACHE)
5750Sstevel@tonic-gate 		return (EINVAL);
5760Sstevel@tonic-gate 
5770Sstevel@tonic-gate 	addr_ucp = mi_offset_param(mp, ipid->ipid_addr_offset,
5784714Ssowmini 	    ipid->ipid_addr_length);
5790Sstevel@tonic-gate 	if (addr_ucp == NULL || !OK_32PTR(addr_ucp))
5800Sstevel@tonic-gate 		return (EINVAL);
5810Sstevel@tonic-gate 	switch (ipid->ipid_addr_length) {
5820Sstevel@tonic-gate 	case IP_ADDR_LEN:
5830Sstevel@tonic-gate 		/* addr_ucp points at IP addr */
5840Sstevel@tonic-gate 		break;
5850Sstevel@tonic-gate 	case sizeof (sin_t): {
5860Sstevel@tonic-gate 		sin_t	*sin;
5870Sstevel@tonic-gate 		/*
5880Sstevel@tonic-gate 		 * got complete (sockaddr) address - increment addr_ucp to point
5890Sstevel@tonic-gate 		 * at the ip_addr field.
5900Sstevel@tonic-gate 		 */
5910Sstevel@tonic-gate 		sin = (sin_t *)addr_ucp;
5920Sstevel@tonic-gate 		addr_ucp = (uchar_t *)&sin->sin_addr.s_addr;
5930Sstevel@tonic-gate 		break;
5940Sstevel@tonic-gate 	}
5950Sstevel@tonic-gate 	default:
5960Sstevel@tonic-gate 		return (EINVAL);
5970Sstevel@tonic-gate 	}
5980Sstevel@tonic-gate 	/* Extract the destination address. */
5990Sstevel@tonic-gate 	bcopy(addr_ucp, &addr, IP_ADDR_LEN);
6000Sstevel@tonic-gate 
6010Sstevel@tonic-gate 	/* Try to find the CACHED IRE. */
6023448Sdh155122 	ire = ire_cache_lookup(addr, zoneid, NULL, ipst);
6030Sstevel@tonic-gate 
6040Sstevel@tonic-gate 	/* Nail it. */
6050Sstevel@tonic-gate 	if (ire) {
6060Sstevel@tonic-gate 		/* Allow delete only on CACHE entries */
6070Sstevel@tonic-gate 		if (ire->ire_type != IRE_CACHE) {
6080Sstevel@tonic-gate 			ire_refrele(ire);
6090Sstevel@tonic-gate 			return (EINVAL);
6100Sstevel@tonic-gate 		}
6110Sstevel@tonic-gate 
6120Sstevel@tonic-gate 		/*
6130Sstevel@tonic-gate 		 * Verify that the IRE has been around for a while.
6140Sstevel@tonic-gate 		 * This is to protect against transport protocols
6150Sstevel@tonic-gate 		 * that are too eager in sending delete messages.
6160Sstevel@tonic-gate 		 */
6170Sstevel@tonic-gate 		if (gethrestime_sec() <
6183448Sdh155122 		    ire->ire_create_time + ipst->ips_ip_ignore_delete_time) {
6190Sstevel@tonic-gate 			ire_refrele(ire);
6200Sstevel@tonic-gate 			return (EINVAL);
6210Sstevel@tonic-gate 		}
6220Sstevel@tonic-gate 		/*
6230Sstevel@tonic-gate 		 * Now we have a potentially dead cache entry. We need
6240Sstevel@tonic-gate 		 * to remove it.
6252535Ssangeeta 		 * If this cache entry is generated from a
6262535Ssangeeta 		 * default route (i.e., ire_cmask == 0),
6270Sstevel@tonic-gate 		 * search the default list and mark it dead and some
6280Sstevel@tonic-gate 		 * background process will try to activate it.
6290Sstevel@tonic-gate 		 */
6300Sstevel@tonic-gate 		if ((ire->ire_gateway_addr != 0) && (ire->ire_cmask == 0)) {
6310Sstevel@tonic-gate 			/*
6320Sstevel@tonic-gate 			 * Make sure that we pick a different
6330Sstevel@tonic-gate 			 * IRE_DEFAULT next time.
6340Sstevel@tonic-gate 			 */
6350Sstevel@tonic-gate 			ire_t *gw_ire;
6362535Ssangeeta 			irb_t *irb = NULL;
6372535Ssangeeta 			uint_t match_flags;
6382535Ssangeeta 
6392535Ssangeeta 			match_flags = (MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE);
6402535Ssangeeta 
6412535Ssangeeta 			gire = ire_ftable_lookup(ire->ire_addr,
6422535Ssangeeta 			    ire->ire_cmask, 0, 0,
6433448Sdh155122 			    ire->ire_ipif, NULL, zoneid, 0, NULL, match_flags,
6443448Sdh155122 			    ipst);
6452535Ssangeeta 
6462535Ssangeeta 			ip3dbg(("ire_ftable_lookup() returned gire %p\n",
6472535Ssangeeta 			    (void *)gire));
6482535Ssangeeta 
6492535Ssangeeta 			if (gire != NULL) {
6502535Ssangeeta 				irb = gire->ire_bucket;
6510Sstevel@tonic-gate 
6520Sstevel@tonic-gate 				/*
6530Sstevel@tonic-gate 				 * We grab it as writer just to serialize
6540Sstevel@tonic-gate 				 * multiple threads trying to bump up
6552535Ssangeeta 				 * irb_rr_origin
6560Sstevel@tonic-gate 				 */
6570Sstevel@tonic-gate 				rw_enter(&irb->irb_lock, RW_WRITER);
6582535Ssangeeta 				if ((gw_ire = irb->irb_rr_origin) == NULL) {
6590Sstevel@tonic-gate 					rw_exit(&irb->irb_lock);
6600Sstevel@tonic-gate 					goto done;
6610Sstevel@tonic-gate 				}
6622535Ssangeeta 
6632894Ssowmini 				DTRACE_PROBE1(ip__ire__del__origin,
6642894Ssowmini 				    (ire_t *), gw_ire);
6650Sstevel@tonic-gate 
6660Sstevel@tonic-gate 				/* Skip past the potentially bad gateway */
6670Sstevel@tonic-gate 				if (ire->ire_gateway_addr ==
6682894Ssowmini 				    gw_ire->ire_gateway_addr) {
6692894Ssowmini 					ire_t *next = gw_ire->ire_next;
6702894Ssowmini 
6712894Ssowmini 					DTRACE_PROBE2(ip__ire__del,
6722894Ssowmini 					    (ire_t *), gw_ire, (irb_t *), irb);
6732894Ssowmini 					IRE_FIND_NEXT_ORIGIN(next);
6742894Ssowmini 					irb->irb_rr_origin = next;
6752894Ssowmini 				}
6760Sstevel@tonic-gate 				rw_exit(&irb->irb_lock);
6772535Ssangeeta 			}
6780Sstevel@tonic-gate 		}
6790Sstevel@tonic-gate done:
6802535Ssangeeta 		if (gire != NULL)
6812535Ssangeeta 			IRE_REFRELE(gire);
6820Sstevel@tonic-gate 		/* report the bad route to routing sockets */
6830Sstevel@tonic-gate 		ip_rts_change(RTM_LOSING, ire->ire_addr, ire->ire_gateway_addr,
6840Sstevel@tonic-gate 		    ire->ire_mask, ire->ire_src_addr, 0, 0, 0,
6853448Sdh155122 		    (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA), ipst);
6860Sstevel@tonic-gate 		routing_sock_info = B_TRUE;
6872612Scarlsonj 
6882612Scarlsonj 		/*
6892612Scarlsonj 		 * TCP is really telling us to start over completely, and it
6902612Scarlsonj 		 * expects that we'll resend the ARP query.  Tell ARP to
6912612Scarlsonj 		 * discard the entry, if this is a local destination.
6922612Scarlsonj 		 */
6932612Scarlsonj 		ill = ire->ire_stq->q_ptr;
6942612Scarlsonj 		if (ire->ire_gateway_addr == 0 &&
6952612Scarlsonj 		    (arp_mp = ill_ared_alloc(ill, addr)) != NULL) {
6962612Scarlsonj 			putnext(ill->ill_rq, arp_mp);
6972612Scarlsonj 		}
6982612Scarlsonj 
6990Sstevel@tonic-gate 		ire_delete(ire);
7000Sstevel@tonic-gate 		ire_refrele(ire);
7010Sstevel@tonic-gate 	}
7023004Sdd193516 	/*
7033004Sdd193516 	 * Also look for an IRE_HOST type redirect ire and
7043004Sdd193516 	 * remove it if present.
7053004Sdd193516 	 */
7063004Sdd193516 	ire = ire_route_lookup(addr, 0, 0, IRE_HOST, NULL, NULL,
7073448Sdh155122 	    ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
7080Sstevel@tonic-gate 
7090Sstevel@tonic-gate 	/* Nail it. */
7103004Sdd193516 	if (ire != NULL) {
7114714Ssowmini 		if (ire->ire_flags & RTF_DYNAMIC) {
7124714Ssowmini 			if (!routing_sock_info) {
7134714Ssowmini 				ip_rts_change(RTM_LOSING, ire->ire_addr,
7144714Ssowmini 				    ire->ire_gateway_addr, ire->ire_mask,
7154714Ssowmini 				    ire->ire_src_addr, 0, 0, 0,
7164714Ssowmini 				    (RTA_DST | RTA_GATEWAY |
7174714Ssowmini 				    RTA_NETMASK | RTA_IFA),
7184714Ssowmini 				    ipst);
7194714Ssowmini 			}
7204714Ssowmini 			ire_delete(ire);
7210Sstevel@tonic-gate 		}
7224714Ssowmini 		ire_refrele(ire);
7230Sstevel@tonic-gate 	}
7240Sstevel@tonic-gate 	return (0);
7250Sstevel@tonic-gate }
7260Sstevel@tonic-gate 
7270Sstevel@tonic-gate /*
7280Sstevel@tonic-gate  * Named Dispatch routine to produce a formatted report on all IREs.
7290Sstevel@tonic-gate  * This report is accessed by using the ndd utility to "get" ND variable
7300Sstevel@tonic-gate  * "ipv4_ire_status".
7310Sstevel@tonic-gate  */
7320Sstevel@tonic-gate /* ARGSUSED */
7330Sstevel@tonic-gate int
7340Sstevel@tonic-gate ip_ire_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr)
7350Sstevel@tonic-gate {
7360Sstevel@tonic-gate 	zoneid_t zoneid;
7373448Sdh155122 	ip_stack_t	*ipst;
7383448Sdh155122 
7393448Sdh155122 	if (CONN_Q(q))
7403448Sdh155122 		ipst = CONNQ_TO_IPST(q);
7413448Sdh155122 	else
7423448Sdh155122 		ipst = ILLQ_TO_IPST(q);
7430Sstevel@tonic-gate 
7440Sstevel@tonic-gate 	(void) mi_mpprintf(mp,
7450Sstevel@tonic-gate 	    "IRE      " MI_COL_HDRPAD_STR
7460Sstevel@tonic-gate 	/*   01234567[89ABCDEF] */
7470Sstevel@tonic-gate 	    "rfq      " MI_COL_HDRPAD_STR
7480Sstevel@tonic-gate 	/*   01234567[89ABCDEF] */
7490Sstevel@tonic-gate 	    "stq      " MI_COL_HDRPAD_STR
7500Sstevel@tonic-gate 	/*   01234567[89ABCDEF] */
7510Sstevel@tonic-gate 	    " zone "
7520Sstevel@tonic-gate 	/*   12345 */
7530Sstevel@tonic-gate 	    "addr            mask            "
7540Sstevel@tonic-gate 	/*   123.123.123.123 123.123.123.123 */
7550Sstevel@tonic-gate 	    "src             gateway         mxfrg rtt   rtt_sd ssthresh ref "
7560Sstevel@tonic-gate 	/*   123.123.123.123 123.123.123.123 12345 12345 123456 12345678 123 */
7570Sstevel@tonic-gate 	    "rtomax tstamp_ok wscale_ok ecn_ok pmtud_ok sack sendpipe "
7580Sstevel@tonic-gate 	/*   123456 123456789 123456789 123456 12345678 1234 12345678 */
7590Sstevel@tonic-gate 	    "recvpipe in/out/forward type");
7600Sstevel@tonic-gate 	/*   12345678 in/out/forward xxxxxxxxxx */
7610Sstevel@tonic-gate 
7620Sstevel@tonic-gate 	/*
7630Sstevel@tonic-gate 	 * Because of the ndd constraint, at most we can have 64K buffer
7640Sstevel@tonic-gate 	 * to put in all IRE info.  So to be more efficient, just
7650Sstevel@tonic-gate 	 * allocate a 64K buffer here, assuming we need that large buffer.
7660Sstevel@tonic-gate 	 * This should be OK as only root can do ndd /dev/ip.
7670Sstevel@tonic-gate 	 */
7680Sstevel@tonic-gate 	if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) {
7690Sstevel@tonic-gate 		/* The following may work even if we cannot get a large buf. */
7700Sstevel@tonic-gate 		(void) mi_mpprintf(mp, "<< Out of buffer >>\n");
7710Sstevel@tonic-gate 		return (0);
7720Sstevel@tonic-gate 	}
7730Sstevel@tonic-gate 
7740Sstevel@tonic-gate 	zoneid = Q_TO_CONN(q)->conn_zoneid;
7750Sstevel@tonic-gate 	if (zoneid == GLOBAL_ZONEID)
7760Sstevel@tonic-gate 		zoneid = ALL_ZONES;
7770Sstevel@tonic-gate 
7783448Sdh155122 	ire_walk_v4(ire_report_ftable, mp->b_cont, zoneid, ipst);
7793448Sdh155122 	ire_walk_v4(ire_report_ctable, mp->b_cont, zoneid, ipst);
7800Sstevel@tonic-gate 
7810Sstevel@tonic-gate 	return (0);
7820Sstevel@tonic-gate }
7830Sstevel@tonic-gate 
7840Sstevel@tonic-gate 
7850Sstevel@tonic-gate /* ire_walk routine invoked for ip_ire_report for each cached IRE. */
7860Sstevel@tonic-gate static void
7870Sstevel@tonic-gate ire_report_ctable(ire_t *ire, char *mp)
7880Sstevel@tonic-gate {
7890Sstevel@tonic-gate 	char	buf1[16];
7900Sstevel@tonic-gate 	char	buf2[16];
7910Sstevel@tonic-gate 	char	buf3[16];
7920Sstevel@tonic-gate 	char	buf4[16];
7930Sstevel@tonic-gate 	uint_t	fo_pkt_count;
7940Sstevel@tonic-gate 	uint_t	ib_pkt_count;
7950Sstevel@tonic-gate 	int	ref;
7960Sstevel@tonic-gate 	uint_t	print_len, buf_len;
7970Sstevel@tonic-gate 
7980Sstevel@tonic-gate 	if ((ire->ire_type & IRE_CACHETABLE) == 0)
7994714Ssowmini 		return;
8000Sstevel@tonic-gate 	buf_len = ((mblk_t *)mp)->b_datap->db_lim - ((mblk_t *)mp)->b_wptr;
8010Sstevel@tonic-gate 	if (buf_len <= 0)
8020Sstevel@tonic-gate 		return;
8030Sstevel@tonic-gate 
8040Sstevel@tonic-gate 	/* Number of active references of this ire */
8050Sstevel@tonic-gate 	ref = ire->ire_refcnt;
8060Sstevel@tonic-gate 	/* "inbound" to a non local address is a forward */
8070Sstevel@tonic-gate 	ib_pkt_count = ire->ire_ib_pkt_count;
8080Sstevel@tonic-gate 	fo_pkt_count = 0;
8090Sstevel@tonic-gate 	if (!(ire->ire_type & (IRE_LOCAL|IRE_BROADCAST))) {
8100Sstevel@tonic-gate 		fo_pkt_count = ib_pkt_count;
8110Sstevel@tonic-gate 		ib_pkt_count = 0;
8120Sstevel@tonic-gate 	}
8130Sstevel@tonic-gate 	print_len =  snprintf((char *)((mblk_t *)mp)->b_wptr, buf_len,
8140Sstevel@tonic-gate 	    MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR "%5d "
8150Sstevel@tonic-gate 	    "%s %s %s %s %05d %05ld %06ld %08d %03d %06d %09d %09d %06d %08d "
8160Sstevel@tonic-gate 	    "%04d %08d %08d %d/%d/%d %s\n",
8170Sstevel@tonic-gate 	    (void *)ire, (void *)ire->ire_rfq, (void *)ire->ire_stq,
8180Sstevel@tonic-gate 	    (int)ire->ire_zoneid,
8190Sstevel@tonic-gate 	    ip_dot_addr(ire->ire_addr, buf1), ip_dot_addr(ire->ire_mask, buf2),
8200Sstevel@tonic-gate 	    ip_dot_addr(ire->ire_src_addr, buf3),
8210Sstevel@tonic-gate 	    ip_dot_addr(ire->ire_gateway_addr, buf4),
8220Sstevel@tonic-gate 	    ire->ire_max_frag, ire->ire_uinfo.iulp_rtt,
8230Sstevel@tonic-gate 	    ire->ire_uinfo.iulp_rtt_sd, ire->ire_uinfo.iulp_ssthresh, ref,
8240Sstevel@tonic-gate 	    ire->ire_uinfo.iulp_rtomax,
8250Sstevel@tonic-gate 	    (ire->ire_uinfo.iulp_tstamp_ok ? 1: 0),
8260Sstevel@tonic-gate 	    (ire->ire_uinfo.iulp_wscale_ok ? 1: 0),
8270Sstevel@tonic-gate 	    (ire->ire_uinfo.iulp_ecn_ok ? 1: 0),
8280Sstevel@tonic-gate 	    (ire->ire_uinfo.iulp_pmtud_ok ? 1: 0),
8290Sstevel@tonic-gate 	    ire->ire_uinfo.iulp_sack,
8300Sstevel@tonic-gate 	    ire->ire_uinfo.iulp_spipe, ire->ire_uinfo.iulp_rpipe,
8310Sstevel@tonic-gate 	    ib_pkt_count, ire->ire_ob_pkt_count, fo_pkt_count,
8320Sstevel@tonic-gate 	    ip_nv_lookup(ire_nv_tbl, (int)ire->ire_type));
8330Sstevel@tonic-gate 	if (print_len < buf_len) {
8340Sstevel@tonic-gate 		((mblk_t *)mp)->b_wptr += print_len;
8350Sstevel@tonic-gate 	} else {
8360Sstevel@tonic-gate 		((mblk_t *)mp)->b_wptr += buf_len;
8370Sstevel@tonic-gate 	}
8380Sstevel@tonic-gate }
8390Sstevel@tonic-gate 
8400Sstevel@tonic-gate /*
8410Sstevel@tonic-gate  * ip_ire_req is called by ip_wput when an IRE_DB_REQ_TYPE message is handed
8420Sstevel@tonic-gate  * down from the Upper Level Protocol to request a copy of the IRE (to check
8430Sstevel@tonic-gate  * its type or to extract information like round-trip time estimates or the
8440Sstevel@tonic-gate  * MTU.)
8450Sstevel@tonic-gate  * The address is assumed to be in the ire_addr field. If no IRE is found
8460Sstevel@tonic-gate  * an IRE is returned with ire_type being zero.
8470Sstevel@tonic-gate  * Note that the upper lavel protocol has to check for broadcast
8480Sstevel@tonic-gate  * (IRE_BROADCAST) and multicast (CLASSD(addr)).
8490Sstevel@tonic-gate  * If there is a b_cont the resulting IRE_DB_TYPE mblk is placed at the
8500Sstevel@tonic-gate  * end of the returned message.
8510Sstevel@tonic-gate  *
8520Sstevel@tonic-gate  * TCP sends down a message of this type with a connection request packet
8530Sstevel@tonic-gate  * chained on. UDP and ICMP send it down to verify that a route exists for
8540Sstevel@tonic-gate  * the destination address when they get connected.
8550Sstevel@tonic-gate  */
8560Sstevel@tonic-gate void
8570Sstevel@tonic-gate ip_ire_req(queue_t *q, mblk_t *mp)
8580Sstevel@tonic-gate {
8590Sstevel@tonic-gate 	ire_t	*inire;
8600Sstevel@tonic-gate 	ire_t	*ire;
8610Sstevel@tonic-gate 	mblk_t	*mp1;
8620Sstevel@tonic-gate 	ire_t	*sire = NULL;
8630Sstevel@tonic-gate 	zoneid_t zoneid = Q_TO_CONN(q)->conn_zoneid;
8643448Sdh155122 	ip_stack_t	*ipst = CONNQ_TO_IPST(q);
8653448Sdh155122 
8663448Sdh155122 	ASSERT(q->q_next == NULL);
8670Sstevel@tonic-gate 
8680Sstevel@tonic-gate 	if ((mp->b_wptr - mp->b_rptr) < sizeof (ire_t) ||
8690Sstevel@tonic-gate 	    !OK_32PTR(mp->b_rptr)) {
8700Sstevel@tonic-gate 		freemsg(mp);
8710Sstevel@tonic-gate 		return;
8720Sstevel@tonic-gate 	}
8730Sstevel@tonic-gate 	inire = (ire_t *)mp->b_rptr;
8740Sstevel@tonic-gate 	/*
8750Sstevel@tonic-gate 	 * Got it, now take our best shot at an IRE.
8760Sstevel@tonic-gate 	 */
8770Sstevel@tonic-gate 	if (inire->ire_ipversion == IPV6_VERSION) {
8780Sstevel@tonic-gate 		ire = ire_route_lookup_v6(&inire->ire_addr_v6, 0, 0, 0,
8791676Sjpk 		    NULL, &sire, zoneid, NULL,
8803448Sdh155122 		    (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT), ipst);
8810Sstevel@tonic-gate 	} else {
8820Sstevel@tonic-gate 		ASSERT(inire->ire_ipversion == IPV4_VERSION);
8830Sstevel@tonic-gate 		ire = ire_route_lookup(inire->ire_addr, 0, 0, 0,
8841676Sjpk 		    NULL, &sire, zoneid, NULL,
8853448Sdh155122 		    (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT), ipst);
8860Sstevel@tonic-gate 	}
8870Sstevel@tonic-gate 
8880Sstevel@tonic-gate 	/*
8890Sstevel@tonic-gate 	 * We prevent returning IRES with source address INADDR_ANY
8900Sstevel@tonic-gate 	 * as these were temporarily created for sending packets
8910Sstevel@tonic-gate 	 * from endpoints that have conn_unspec_src set.
8920Sstevel@tonic-gate 	 */
8930Sstevel@tonic-gate 	if (ire == NULL ||
8940Sstevel@tonic-gate 	    (ire->ire_ipversion == IPV4_VERSION &&
8950Sstevel@tonic-gate 	    ire->ire_src_addr == INADDR_ANY) ||
8960Sstevel@tonic-gate 	    (ire->ire_ipversion == IPV6_VERSION &&
8970Sstevel@tonic-gate 	    IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6))) {
8980Sstevel@tonic-gate 		inire->ire_type = 0;
8990Sstevel@tonic-gate 	} else {
9000Sstevel@tonic-gate 		bcopy(ire, inire, sizeof (ire_t));
9010Sstevel@tonic-gate 		/* Copy the route metrics from the parent. */
9020Sstevel@tonic-gate 		if (sire != NULL) {
9030Sstevel@tonic-gate 			bcopy(&(sire->ire_uinfo), &(inire->ire_uinfo),
9040Sstevel@tonic-gate 			    sizeof (iulp_t));
9050Sstevel@tonic-gate 		}
9060Sstevel@tonic-gate 
9070Sstevel@tonic-gate 		/*
9080Sstevel@tonic-gate 		 * As we don't lookup global policy here, we may not
9090Sstevel@tonic-gate 		 * pass the right size if per-socket policy is not
9100Sstevel@tonic-gate 		 * present. For these cases, path mtu discovery will
9110Sstevel@tonic-gate 		 * do the right thing.
9120Sstevel@tonic-gate 		 */
9130Sstevel@tonic-gate 		inire->ire_ipsec_overhead = conn_ipsec_length(Q_TO_CONN(q));
9140Sstevel@tonic-gate 
9150Sstevel@tonic-gate 		/* Pass the latest setting of the ip_path_mtu_discovery */
9163448Sdh155122 		inire->ire_frag_flag |=
9173448Sdh155122 		    (ipst->ips_ip_path_mtu_discovery) ? IPH_DF : 0;
9180Sstevel@tonic-gate 	}
9190Sstevel@tonic-gate 	if (ire != NULL)
9200Sstevel@tonic-gate 		ire_refrele(ire);
9210Sstevel@tonic-gate 	if (sire != NULL)
9220Sstevel@tonic-gate 		ire_refrele(sire);
9230Sstevel@tonic-gate 	mp->b_wptr = &mp->b_rptr[sizeof (ire_t)];
9240Sstevel@tonic-gate 	mp->b_datap->db_type = IRE_DB_TYPE;
9250Sstevel@tonic-gate 
9260Sstevel@tonic-gate 	/* Put the IRE_DB_TYPE mblk last in the chain */
9270Sstevel@tonic-gate 	mp1 = mp->b_cont;
9280Sstevel@tonic-gate 	if (mp1 != NULL) {
9290Sstevel@tonic-gate 		mp->b_cont = NULL;
9300Sstevel@tonic-gate 		linkb(mp1, mp);
9310Sstevel@tonic-gate 		mp = mp1;
9320Sstevel@tonic-gate 	}
9330Sstevel@tonic-gate 	qreply(q, mp);
9340Sstevel@tonic-gate }
9350Sstevel@tonic-gate 
9360Sstevel@tonic-gate /*
9370Sstevel@tonic-gate  * Send a packet using the specified IRE.
9380Sstevel@tonic-gate  * If ire_src_addr_v6 is all zero then discard the IRE after
9390Sstevel@tonic-gate  * the packet has been sent.
9400Sstevel@tonic-gate  */
9410Sstevel@tonic-gate static void
9420Sstevel@tonic-gate ire_send(queue_t *q, mblk_t *pkt, ire_t *ire)
9430Sstevel@tonic-gate {
9440Sstevel@tonic-gate 	mblk_t *ipsec_mp;
9450Sstevel@tonic-gate 	boolean_t is_secure;
9460Sstevel@tonic-gate 	uint_t ifindex;
9470Sstevel@tonic-gate 	ill_t	*ill;
9482733Snordmark 	zoneid_t zoneid = ire->ire_zoneid;
9493448Sdh155122 	ip_stack_t	*ipst = ire->ire_ipst;
9500Sstevel@tonic-gate 
9510Sstevel@tonic-gate 	ASSERT(ire->ire_ipversion == IPV4_VERSION);
9522733Snordmark 	ASSERT(!(ire->ire_type & IRE_LOCAL)); /* Has different ire_zoneid */
9530Sstevel@tonic-gate 	ipsec_mp = pkt;
9540Sstevel@tonic-gate 	is_secure = (pkt->b_datap->db_type == M_CTL);
9552733Snordmark 	if (is_secure) {
9562733Snordmark 		ipsec_out_t *io;
9572733Snordmark 
9580Sstevel@tonic-gate 		pkt = pkt->b_cont;
9592733Snordmark 		io = (ipsec_out_t *)ipsec_mp->b_rptr;
9602733Snordmark 		if (io->ipsec_out_type == IPSEC_OUT)
9612733Snordmark 			zoneid = io->ipsec_out_zoneid;
9622733Snordmark 	}
9630Sstevel@tonic-gate 
9640Sstevel@tonic-gate 	/* If the packet originated externally then */
9650Sstevel@tonic-gate 	if (pkt->b_prev) {
9660Sstevel@tonic-gate 		ire_refrele(ire);
9670Sstevel@tonic-gate 		/*
9680Sstevel@tonic-gate 		 * Extract the ifindex from b_prev (set in ip_rput_noire).
9690Sstevel@tonic-gate 		 * Look up interface to see if it still exists (it could have
9700Sstevel@tonic-gate 		 * been unplumbed by the time the reply came back from ARP)
9710Sstevel@tonic-gate 		 */
9720Sstevel@tonic-gate 		ifindex = (uint_t)(uintptr_t)pkt->b_prev;
9730Sstevel@tonic-gate 		ill = ill_lookup_on_ifindex(ifindex, B_FALSE,
9743448Sdh155122 		    NULL, NULL, NULL, NULL, ipst);
9750Sstevel@tonic-gate 		if (ill == NULL) {
9760Sstevel@tonic-gate 			pkt->b_prev = NULL;
9770Sstevel@tonic-gate 			pkt->b_next = NULL;
9780Sstevel@tonic-gate 			freemsg(ipsec_mp);
9790Sstevel@tonic-gate 			return;
9800Sstevel@tonic-gate 		}
9810Sstevel@tonic-gate 		q = ill->ill_rq;
9820Sstevel@tonic-gate 		pkt->b_prev = NULL;
9830Sstevel@tonic-gate 		/*
9840Sstevel@tonic-gate 		 * This packet has not gone through IPSEC processing
9850Sstevel@tonic-gate 		 * and hence we should not have any IPSEC message
9860Sstevel@tonic-gate 		 * prepended.
9870Sstevel@tonic-gate 		 */
9880Sstevel@tonic-gate 		ASSERT(ipsec_mp == pkt);
9892535Ssangeeta 		put(q, pkt);
9900Sstevel@tonic-gate 		ill_refrele(ill);
9910Sstevel@tonic-gate 	} else if (pkt->b_next) {
9920Sstevel@tonic-gate 		/* Packets from multicast router */
9930Sstevel@tonic-gate 		pkt->b_next = NULL;
9940Sstevel@tonic-gate 		/*
9950Sstevel@tonic-gate 		 * We never get the IPSEC_OUT while forwarding the
9960Sstevel@tonic-gate 		 * packet for multicast router.
9970Sstevel@tonic-gate 		 */
9980Sstevel@tonic-gate 		ASSERT(ipsec_mp == pkt);
9990Sstevel@tonic-gate 		ip_rput_forward(ire, (ipha_t *)pkt->b_rptr, ipsec_mp, NULL);
10000Sstevel@tonic-gate 		ire_refrele(ire);
10010Sstevel@tonic-gate 	} else {
10020Sstevel@tonic-gate 		/* Locally originated packets */
10030Sstevel@tonic-gate 		boolean_t is_inaddr_any;
10040Sstevel@tonic-gate 		ipha_t *ipha = (ipha_t *)pkt->b_rptr;
10050Sstevel@tonic-gate 
10060Sstevel@tonic-gate 		/*
10070Sstevel@tonic-gate 		 * We need to do an ire_delete below for which
10080Sstevel@tonic-gate 		 * we need to make sure that the IRE will be
10090Sstevel@tonic-gate 		 * around even after calling ip_wput_ire -
10100Sstevel@tonic-gate 		 * which does ire_refrele. Otherwise somebody
10110Sstevel@tonic-gate 		 * could potentially delete this ire and hence
10120Sstevel@tonic-gate 		 * free this ire and we will be calling ire_delete
10130Sstevel@tonic-gate 		 * on a freed ire below.
10140Sstevel@tonic-gate 		 */
10150Sstevel@tonic-gate 		is_inaddr_any = (ire->ire_src_addr == INADDR_ANY);
10160Sstevel@tonic-gate 		if (is_inaddr_any) {
10170Sstevel@tonic-gate 			IRE_REFHOLD(ire);
10180Sstevel@tonic-gate 		}
10190Sstevel@tonic-gate 		/*
10200Sstevel@tonic-gate 		 * If we were resolving a router we can not use the
10210Sstevel@tonic-gate 		 * routers IRE for sending the packet (since it would
10220Sstevel@tonic-gate 		 * violate the uniqness of the IP idents) thus we
10230Sstevel@tonic-gate 		 * make another pass through ip_wput to create the IRE_CACHE
10240Sstevel@tonic-gate 		 * for the destination.
10250Sstevel@tonic-gate 		 * When IRE_MARK_NOADD is set, ire_add() is not called.
10260Sstevel@tonic-gate 		 * Thus ip_wput() will never find a ire and result in an
10270Sstevel@tonic-gate 		 * infinite loop. Thus we check whether IRE_MARK_NOADD is
10280Sstevel@tonic-gate 		 * is set. This also implies that IRE_MARK_NOADD can only be
10290Sstevel@tonic-gate 		 * used to send packets to directly connected hosts.
10300Sstevel@tonic-gate 		 */
10310Sstevel@tonic-gate 		if (ipha->ipha_dst != ire->ire_addr &&
10320Sstevel@tonic-gate 		    !(ire->ire_marks & IRE_MARK_NOADD)) {
10330Sstevel@tonic-gate 			ire_refrele(ire);	/* Held in ire_add */
10342733Snordmark 			if (CONN_Q(q)) {
10352733Snordmark 				(void) ip_output(Q_TO_CONN(q), ipsec_mp, q,
10362733Snordmark 				    IRE_SEND);
10372733Snordmark 			} else {
10382733Snordmark 				(void) ip_output((void *)(uintptr_t)zoneid,
10392733Snordmark 				    ipsec_mp, q, IRE_SEND);
10402733Snordmark 			}
10410Sstevel@tonic-gate 		} else {
10420Sstevel@tonic-gate 			if (is_secure) {
10430Sstevel@tonic-gate 				ipsec_out_t *oi;
10440Sstevel@tonic-gate 				ipha_t *ipha;
10450Sstevel@tonic-gate 
10460Sstevel@tonic-gate 				oi = (ipsec_out_t *)ipsec_mp->b_rptr;
10470Sstevel@tonic-gate 				ipha = (ipha_t *)ipsec_mp->b_cont->b_rptr;
10480Sstevel@tonic-gate 				if (oi->ipsec_out_proc_begin) {
10490Sstevel@tonic-gate 					/*
10500Sstevel@tonic-gate 					 * This is the case where
10510Sstevel@tonic-gate 					 * ip_wput_ipsec_out could not find
10520Sstevel@tonic-gate 					 * the IRE and recreated a new one.
10530Sstevel@tonic-gate 					 * As ip_wput_ipsec_out does ire
10540Sstevel@tonic-gate 					 * lookups, ire_refrele for the extra
10550Sstevel@tonic-gate 					 * bump in ire_add.
10560Sstevel@tonic-gate 					 */
10570Sstevel@tonic-gate 					ire_refrele(ire);
10580Sstevel@tonic-gate 					ip_wput_ipsec_out(q, ipsec_mp, ipha,
10590Sstevel@tonic-gate 					    NULL, NULL);
10600Sstevel@tonic-gate 				} else {
10610Sstevel@tonic-gate 					/*
10620Sstevel@tonic-gate 					 * IRE_REFRELE will be done in
10630Sstevel@tonic-gate 					 * ip_wput_ire.
10640Sstevel@tonic-gate 					 */
10650Sstevel@tonic-gate 					ip_wput_ire(q, ipsec_mp, ire, NULL,
10662733Snordmark 					    IRE_SEND, zoneid);
10670Sstevel@tonic-gate 				}
10680Sstevel@tonic-gate 			} else {
10690Sstevel@tonic-gate 				/*
10700Sstevel@tonic-gate 				 * IRE_REFRELE will be done in ip_wput_ire.
10710Sstevel@tonic-gate 				 */
10720Sstevel@tonic-gate 				ip_wput_ire(q, ipsec_mp, ire, NULL,
10732733Snordmark 				    IRE_SEND, zoneid);
10740Sstevel@tonic-gate 			}
10750Sstevel@tonic-gate 		}
10760Sstevel@tonic-gate 		/*
10770Sstevel@tonic-gate 		 * Special code to support sending a single packet with
10780Sstevel@tonic-gate 		 * conn_unspec_src using an IRE which has no source address.
10790Sstevel@tonic-gate 		 * The IRE is deleted here after sending the packet to avoid
10800Sstevel@tonic-gate 		 * having other code trip on it. But before we delete the
10810Sstevel@tonic-gate 		 * ire, somebody could have looked up this ire.
10820Sstevel@tonic-gate 		 * We prevent returning/using this IRE by the upper layers
10830Sstevel@tonic-gate 		 * by making checks to NULL source address in other places
10840Sstevel@tonic-gate 		 * like e.g ip_ire_append, ip_ire_req and ip_bind_connected.
10850Sstevel@tonic-gate 		 * Though, this does not completely prevent other threads
10860Sstevel@tonic-gate 		 * from using this ire, this should not cause any problems.
10870Sstevel@tonic-gate 		 *
10880Sstevel@tonic-gate 		 * NOTE : We use is_inaddr_any instead of using ire_src_addr
10890Sstevel@tonic-gate 		 * because for the normal case i.e !is_inaddr_any, ire_refrele
10900Sstevel@tonic-gate 		 * above could have potentially freed the ire.
10910Sstevel@tonic-gate 		 */
10920Sstevel@tonic-gate 		if (is_inaddr_any) {
10930Sstevel@tonic-gate 			/*
10940Sstevel@tonic-gate 			 * If this IRE has been deleted by another thread, then
10950Sstevel@tonic-gate 			 * ire_bucket won't be NULL, but ire_ptpn will be NULL.
10960Sstevel@tonic-gate 			 * Thus, ire_delete will do nothing.  This check
10970Sstevel@tonic-gate 			 * guards against calling ire_delete when the IRE was
10980Sstevel@tonic-gate 			 * never inserted in the table, which is handled by
10990Sstevel@tonic-gate 			 * ire_delete as dropping another reference.
11000Sstevel@tonic-gate 			 */
11010Sstevel@tonic-gate 			if (ire->ire_bucket != NULL) {
11020Sstevel@tonic-gate 				ip1dbg(("ire_send: delete IRE\n"));
11030Sstevel@tonic-gate 				ire_delete(ire);
11040Sstevel@tonic-gate 			}
11050Sstevel@tonic-gate 			ire_refrele(ire);	/* Held above */
11060Sstevel@tonic-gate 		}
11070Sstevel@tonic-gate 	}
11080Sstevel@tonic-gate }
11090Sstevel@tonic-gate 
11100Sstevel@tonic-gate /*
11110Sstevel@tonic-gate  * Send a packet using the specified IRE.
11120Sstevel@tonic-gate  * If ire_src_addr_v6 is all zero then discard the IRE after
11130Sstevel@tonic-gate  * the packet has been sent.
11140Sstevel@tonic-gate  */
11150Sstevel@tonic-gate static void
11160Sstevel@tonic-gate ire_send_v6(queue_t *q, mblk_t *pkt, ire_t *ire)
11170Sstevel@tonic-gate {
11180Sstevel@tonic-gate 	mblk_t *ipsec_mp;
11190Sstevel@tonic-gate 	boolean_t secure;
11200Sstevel@tonic-gate 	uint_t ifindex;
11212733Snordmark 	zoneid_t zoneid = ire->ire_zoneid;
11223448Sdh155122 	ip_stack_t	*ipst = ire->ire_ipst;
11230Sstevel@tonic-gate 
11240Sstevel@tonic-gate 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
11252733Snordmark 	ASSERT(!(ire->ire_type & IRE_LOCAL)); /* Has different ire_zoneid */
11260Sstevel@tonic-gate 	if (pkt->b_datap->db_type == M_CTL) {
11272733Snordmark 		ipsec_out_t *io;
11282733Snordmark 
11290Sstevel@tonic-gate 		ipsec_mp = pkt;
11300Sstevel@tonic-gate 		pkt = pkt->b_cont;
11310Sstevel@tonic-gate 		secure = B_TRUE;
11322733Snordmark 		io = (ipsec_out_t *)ipsec_mp->b_rptr;
11332733Snordmark 		if (io->ipsec_out_type == IPSEC_OUT)
11342733Snordmark 			zoneid = io->ipsec_out_zoneid;
11350Sstevel@tonic-gate 	} else {
11360Sstevel@tonic-gate 		ipsec_mp = pkt;
11370Sstevel@tonic-gate 		secure = B_FALSE;
11380Sstevel@tonic-gate 	}
11390Sstevel@tonic-gate 
11400Sstevel@tonic-gate 	/* If the packet originated externally then */
11410Sstevel@tonic-gate 	if (pkt->b_prev) {
11420Sstevel@tonic-gate 		ill_t	*ill;
11430Sstevel@tonic-gate 		/*
11440Sstevel@tonic-gate 		 * Extract the ifindex from b_prev (set in ip_rput_data_v6).
11450Sstevel@tonic-gate 		 * Look up interface to see if it still exists (it could have
11460Sstevel@tonic-gate 		 * been unplumbed by the time the reply came back from the
11472535Ssangeeta 		 * resolver).
11480Sstevel@tonic-gate 		 */
11490Sstevel@tonic-gate 		ifindex = (uint_t)(uintptr_t)pkt->b_prev;
11500Sstevel@tonic-gate 		ill = ill_lookup_on_ifindex(ifindex, B_TRUE,
11513448Sdh155122 		    NULL, NULL, NULL, NULL, ipst);
11520Sstevel@tonic-gate 		if (ill == NULL) {
11530Sstevel@tonic-gate 			pkt->b_prev = NULL;
11540Sstevel@tonic-gate 			pkt->b_next = NULL;
11550Sstevel@tonic-gate 			freemsg(ipsec_mp);
11560Sstevel@tonic-gate 			ire_refrele(ire);	/* Held in ire_add */
11570Sstevel@tonic-gate 			return;
11580Sstevel@tonic-gate 		}
11590Sstevel@tonic-gate 		q = ill->ill_rq;
11600Sstevel@tonic-gate 		pkt->b_prev = NULL;
11610Sstevel@tonic-gate 		/*
11620Sstevel@tonic-gate 		 * This packet has not gone through IPSEC processing
11630Sstevel@tonic-gate 		 * and hence we should not have any IPSEC message
11640Sstevel@tonic-gate 		 * prepended.
11650Sstevel@tonic-gate 		 */
11660Sstevel@tonic-gate 		ASSERT(ipsec_mp == pkt);
11670Sstevel@tonic-gate 		put(q, pkt);
11680Sstevel@tonic-gate 		ill_refrele(ill);
11690Sstevel@tonic-gate 	} else if (pkt->b_next) {
11700Sstevel@tonic-gate 		/* Packets from multicast router */
11710Sstevel@tonic-gate 		pkt->b_next = NULL;
11720Sstevel@tonic-gate 		/*
11730Sstevel@tonic-gate 		 * We never get the IPSEC_OUT while forwarding the
11740Sstevel@tonic-gate 		 * packet for multicast router.
11750Sstevel@tonic-gate 		 */
11760Sstevel@tonic-gate 		ASSERT(ipsec_mp == pkt);
11770Sstevel@tonic-gate 		/*
11780Sstevel@tonic-gate 		 * XXX TODO IPv6.
11790Sstevel@tonic-gate 		 */
11800Sstevel@tonic-gate 		freemsg(pkt);
11810Sstevel@tonic-gate #ifdef XXX
11820Sstevel@tonic-gate 		ip_rput_forward(ire, (ipha_t *)pkt->b_rptr, pkt, NULL);
11830Sstevel@tonic-gate #endif
11840Sstevel@tonic-gate 	} else {
11850Sstevel@tonic-gate 		if (secure) {
11860Sstevel@tonic-gate 			ipsec_out_t *oi;
11870Sstevel@tonic-gate 			ip6_t *ip6h;
11880Sstevel@tonic-gate 
11890Sstevel@tonic-gate 			oi = (ipsec_out_t *)ipsec_mp->b_rptr;
11900Sstevel@tonic-gate 			ip6h = (ip6_t *)ipsec_mp->b_cont->b_rptr;
11910Sstevel@tonic-gate 			if (oi->ipsec_out_proc_begin) {
11920Sstevel@tonic-gate 				/*
11930Sstevel@tonic-gate 				 * This is the case where
11940Sstevel@tonic-gate 				 * ip_wput_ipsec_out could not find
11950Sstevel@tonic-gate 				 * the IRE and recreated a new one.
11960Sstevel@tonic-gate 				 */
11970Sstevel@tonic-gate 				ip_wput_ipsec_out_v6(q, ipsec_mp, ip6h,
11980Sstevel@tonic-gate 				    NULL, NULL);
11990Sstevel@tonic-gate 			} else {
12002733Snordmark 				if (CONN_Q(q)) {
12012733Snordmark 					(void) ip_output_v6(Q_TO_CONN(q),
12022733Snordmark 					    ipsec_mp, q, IRE_SEND);
12032733Snordmark 				} else {
12042733Snordmark 					(void) ip_output_v6(
12052733Snordmark 					    (void *)(uintptr_t)zoneid,
12062733Snordmark 					    ipsec_mp, q, IRE_SEND);
12072733Snordmark 				}
12080Sstevel@tonic-gate 			}
12090Sstevel@tonic-gate 		} else {
12100Sstevel@tonic-gate 			/*
12110Sstevel@tonic-gate 			 * Send packets through ip_output_v6 so that any
12120Sstevel@tonic-gate 			 * ip6_info header can be processed again.
12130Sstevel@tonic-gate 			 */
12142733Snordmark 			if (CONN_Q(q)) {
12152733Snordmark 				(void) ip_output_v6(Q_TO_CONN(q), ipsec_mp, q,
12162733Snordmark 				    IRE_SEND);
12172733Snordmark 			} else {
12182733Snordmark 				(void) ip_output_v6((void *)(uintptr_t)zoneid,
12192733Snordmark 				    ipsec_mp, q, IRE_SEND);
12202733Snordmark 			}
12210Sstevel@tonic-gate 		}
12220Sstevel@tonic-gate 		/*
12230Sstevel@tonic-gate 		 * Special code to support sending a single packet with
12240Sstevel@tonic-gate 		 * conn_unspec_src using an IRE which has no source address.
12250Sstevel@tonic-gate 		 * The IRE is deleted here after sending the packet to avoid
12260Sstevel@tonic-gate 		 * having other code trip on it. But before we delete the
12270Sstevel@tonic-gate 		 * ire, somebody could have looked up this ire.
12280Sstevel@tonic-gate 		 * We prevent returning/using this IRE by the upper layers
12290Sstevel@tonic-gate 		 * by making checks to NULL source address in other places
12300Sstevel@tonic-gate 		 * like e.g ip_ire_append_v6, ip_ire_req and
12310Sstevel@tonic-gate 		 * ip_bind_connected_v6. Though, this does not completely
12320Sstevel@tonic-gate 		 * prevent other threads from using this ire, this should
12330Sstevel@tonic-gate 		 * not cause any problems.
12340Sstevel@tonic-gate 		 */
12350Sstevel@tonic-gate 		if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6)) {
12360Sstevel@tonic-gate 			ip1dbg(("ire_send_v6: delete IRE\n"));
12370Sstevel@tonic-gate 			ire_delete(ire);
12380Sstevel@tonic-gate 		}
12390Sstevel@tonic-gate 	}
12400Sstevel@tonic-gate 	ire_refrele(ire);	/* Held in ire_add */
12410Sstevel@tonic-gate }
12420Sstevel@tonic-gate 
12430Sstevel@tonic-gate /*
12440Sstevel@tonic-gate  * Make sure that IRE bucket does not get too long.
12450Sstevel@tonic-gate  * This can cause lock up because ire_cache_lookup()
12460Sstevel@tonic-gate  * may take "forever" to finish.
12470Sstevel@tonic-gate  *
12480Sstevel@tonic-gate  * We just remove cnt IREs each time.  This means that
12490Sstevel@tonic-gate  * the bucket length will stay approximately constant,
12500Sstevel@tonic-gate  * depending on cnt.  This should be enough to defend
12510Sstevel@tonic-gate  * against DoS attack based on creating temporary IREs
12520Sstevel@tonic-gate  * (for forwarding and non-TCP traffic).
12530Sstevel@tonic-gate  *
12540Sstevel@tonic-gate  * Note that new IRE is normally added at the tail of the
12550Sstevel@tonic-gate  * bucket.  This means that we are removing the "oldest"
12560Sstevel@tonic-gate  * temporary IRE added.  Only if there are IREs with
12570Sstevel@tonic-gate  * the same ire_addr, do we not add it at the tail.  Refer
12580Sstevel@tonic-gate  * to ire_add_v*().  It should be OK for our purpose.
12590Sstevel@tonic-gate  *
12600Sstevel@tonic-gate  * For non-temporary cached IREs, we make sure that they
12610Sstevel@tonic-gate  * have not been used for some time (defined below), they
12620Sstevel@tonic-gate  * are non-local destinations, and there is no one using
12630Sstevel@tonic-gate  * them at the moment (refcnt == 1).
12640Sstevel@tonic-gate  *
12650Sstevel@tonic-gate  * The above means that the IRE bucket length may become
12660Sstevel@tonic-gate  * very long, consisting of mostly non-temporary IREs.
12670Sstevel@tonic-gate  * This can happen when the hash function does a bad job
12680Sstevel@tonic-gate  * so that most TCP connections cluster to a specific bucket.
12690Sstevel@tonic-gate  * This "hopefully" should never happen.  It can also
12700Sstevel@tonic-gate  * happen if most TCP connections have very long lives.
12710Sstevel@tonic-gate  * Even with the minimal hash table size of 256, there
12720Sstevel@tonic-gate  * has to be a lot of such connections to make the bucket
12730Sstevel@tonic-gate  * length unreasonably long.  This should probably not
12740Sstevel@tonic-gate  * happen either.  The third can when this can happen is
12750Sstevel@tonic-gate  * when the machine is under attack, such as SYN flooding.
12760Sstevel@tonic-gate  * TCP should already have the proper mechanism to protect
12770Sstevel@tonic-gate  * that.  So we should be safe.
12780Sstevel@tonic-gate  *
12790Sstevel@tonic-gate  * This function is called by ire_add_then_send() after
12800Sstevel@tonic-gate  * a new IRE is added and the packet is sent.
12810Sstevel@tonic-gate  *
12820Sstevel@tonic-gate  * The idle cutoff interval is set to 60s.  It can be
12830Sstevel@tonic-gate  * changed using /etc/system.
12840Sstevel@tonic-gate  */
12850Sstevel@tonic-gate uint32_t ire_idle_cutoff_interval = 60000;
12860Sstevel@tonic-gate 
12870Sstevel@tonic-gate static void
12880Sstevel@tonic-gate ire_cache_cleanup(irb_t *irb, uint32_t threshold, int cnt)
12890Sstevel@tonic-gate {
12900Sstevel@tonic-gate 	ire_t *ire;
12910Sstevel@tonic-gate 	int tmp_cnt = cnt;
12920Sstevel@tonic-gate 	clock_t cut_off = drv_usectohz(ire_idle_cutoff_interval * 1000);
12930Sstevel@tonic-gate 
12940Sstevel@tonic-gate 	/*
12950Sstevel@tonic-gate 	 * irb is NULL if the IRE is not added to the hash.  This
12964823Sseb 	 * happens when IRE_MARK_NOADD is set in ire_add_then_send().
12970Sstevel@tonic-gate 	 */
12980Sstevel@tonic-gate 	if (irb == NULL)
12990Sstevel@tonic-gate 		return;
13000Sstevel@tonic-gate 
13010Sstevel@tonic-gate 	IRB_REFHOLD(irb);
13020Sstevel@tonic-gate 	if (irb->irb_tmp_ire_cnt > threshold) {
13030Sstevel@tonic-gate 		for (ire = irb->irb_ire; ire != NULL && tmp_cnt > 0;
13040Sstevel@tonic-gate 		    ire = ire->ire_next) {
13050Sstevel@tonic-gate 			if (ire->ire_marks & IRE_MARK_CONDEMNED)
13060Sstevel@tonic-gate 				continue;
13070Sstevel@tonic-gate 			if (ire->ire_marks & IRE_MARK_TEMPORARY) {
13080Sstevel@tonic-gate 				ASSERT(ire->ire_type == IRE_CACHE);
13090Sstevel@tonic-gate 				ire_delete(ire);
13100Sstevel@tonic-gate 				tmp_cnt--;
13110Sstevel@tonic-gate 			}
13120Sstevel@tonic-gate 		}
13130Sstevel@tonic-gate 	}
13140Sstevel@tonic-gate 	if (irb->irb_ire_cnt - irb->irb_tmp_ire_cnt > threshold) {
13150Sstevel@tonic-gate 		for (ire = irb->irb_ire; ire != NULL && cnt > 0;
13160Sstevel@tonic-gate 		    ire = ire->ire_next) {
13173448Sdh155122 			if (ire->ire_marks & IRE_MARK_CONDEMNED)
13180Sstevel@tonic-gate 				continue;
13193448Sdh155122 			if (ire->ire_ipversion == IPV4_VERSION) {
13203448Sdh155122 				if (ire->ire_gateway_addr == 0)
13213448Sdh155122 					continue;
13223448Sdh155122 			} else {
13233448Sdh155122 				if (IN6_IS_ADDR_UNSPECIFIED(
13243448Sdh155122 				    &ire->ire_gateway_addr_v6))
13253448Sdh155122 					continue;
13260Sstevel@tonic-gate 			}
13270Sstevel@tonic-gate 			if ((ire->ire_type == IRE_CACHE) &&
13280Sstevel@tonic-gate 			    (lbolt - ire->ire_last_used_time > cut_off) &&
13290Sstevel@tonic-gate 			    (ire->ire_refcnt == 1)) {
13300Sstevel@tonic-gate 				ire_delete(ire);
13310Sstevel@tonic-gate 				cnt--;
13320Sstevel@tonic-gate 			}
13330Sstevel@tonic-gate 		}
13340Sstevel@tonic-gate 	}
13350Sstevel@tonic-gate 	IRB_REFRELE(irb);
13360Sstevel@tonic-gate }
13370Sstevel@tonic-gate 
13380Sstevel@tonic-gate /*
13390Sstevel@tonic-gate  * ire_add_then_send is called when a new IRE has been created in order to
13400Sstevel@tonic-gate  * route an outgoing packet.  Typically, it is called from ip_wput when
13410Sstevel@tonic-gate  * a response comes back down from a resolver.  We add the IRE, and then
13420Sstevel@tonic-gate  * possibly run the packet through ip_wput or ip_rput, as appropriate.
13430Sstevel@tonic-gate  * However, we do not add the newly created IRE in the cache when
13440Sstevel@tonic-gate  * IRE_MARK_NOADD is set in the IRE. IRE_MARK_NOADD is set at
13454823Sseb  * ip_newroute_ipif(). The ires with IRE_MARK_NOADD are ire_refrele'd by
13464823Sseb  * ip_wput_ire() and get deleted.
13470Sstevel@tonic-gate  * Multirouting support: the packet is silently discarded when the new IRE
13480Sstevel@tonic-gate  * holds the RTF_MULTIRT flag, but is not the first IRE to be added with the
13490Sstevel@tonic-gate  * RTF_MULTIRT flag for the same destination address.
13500Sstevel@tonic-gate  * In this case, we just want to register this additional ire without
13510Sstevel@tonic-gate  * sending the packet, as it has already been replicated through
13520Sstevel@tonic-gate  * existing multirt routes in ip_wput().
13530Sstevel@tonic-gate  */
13540Sstevel@tonic-gate void
13550Sstevel@tonic-gate ire_add_then_send(queue_t *q, ire_t *ire, mblk_t *mp)
13560Sstevel@tonic-gate {
13570Sstevel@tonic-gate 	irb_t *irb;
13580Sstevel@tonic-gate 	boolean_t drop = B_FALSE;
13590Sstevel@tonic-gate 	/* LINTED : set but not used in function */
13600Sstevel@tonic-gate 	boolean_t mctl_present;
13610Sstevel@tonic-gate 	mblk_t *first_mp = NULL;
13620Sstevel@tonic-gate 	mblk_t *save_mp = NULL;
13630Sstevel@tonic-gate 	ire_t *dst_ire;
13640Sstevel@tonic-gate 	ipha_t *ipha;
13650Sstevel@tonic-gate 	ip6_t *ip6h;
13663448Sdh155122 	ip_stack_t	*ipst = ire->ire_ipst;
13670Sstevel@tonic-gate 
13680Sstevel@tonic-gate 	if (mp != NULL) {
13690Sstevel@tonic-gate 		/*
13700Sstevel@tonic-gate 		 * We first have to retrieve the destination address carried
13710Sstevel@tonic-gate 		 * by the packet.
13720Sstevel@tonic-gate 		 * We can't rely on ire as it can be related to a gateway.
13730Sstevel@tonic-gate 		 * The destination address will help in determining if
13740Sstevel@tonic-gate 		 * other RTF_MULTIRT ires are already registered.
13750Sstevel@tonic-gate 		 *
13760Sstevel@tonic-gate 		 * We first need to know where we are going : v4 or V6.
13770Sstevel@tonic-gate 		 * the ire version is enough, as there is no risk that
13780Sstevel@tonic-gate 		 * we resolve an IPv6 address with an IPv4 ire
13790Sstevel@tonic-gate 		 * or vice versa.
13800Sstevel@tonic-gate 		 */
13810Sstevel@tonic-gate 		if (ire->ire_ipversion == IPV4_VERSION) {
13820Sstevel@tonic-gate 			EXTRACT_PKT_MP(mp, first_mp, mctl_present);
13830Sstevel@tonic-gate 			ipha = (ipha_t *)mp->b_rptr;
13840Sstevel@tonic-gate 			save_mp = mp;
13850Sstevel@tonic-gate 			mp = first_mp;
13860Sstevel@tonic-gate 
13870Sstevel@tonic-gate 			dst_ire = ire_cache_lookup(ipha->ipha_dst,
13883448Sdh155122 			    ire->ire_zoneid, MBLK_GETLABEL(mp), ipst);
13890Sstevel@tonic-gate 		} else {
13902535Ssangeeta 			ASSERT(ire->ire_ipversion == IPV6_VERSION);
13910Sstevel@tonic-gate 			/*
13920Sstevel@tonic-gate 			 * Get a pointer to the beginning of the IPv6 header.
13930Sstevel@tonic-gate 			 * Ignore leading IPsec control mblks.
13940Sstevel@tonic-gate 			 */
13950Sstevel@tonic-gate 			first_mp = mp;
13960Sstevel@tonic-gate 			if (mp->b_datap->db_type == M_CTL) {
13970Sstevel@tonic-gate 				mp = mp->b_cont;
13980Sstevel@tonic-gate 			}
13990Sstevel@tonic-gate 			ip6h = (ip6_t *)mp->b_rptr;
14000Sstevel@tonic-gate 			save_mp = mp;
14010Sstevel@tonic-gate 			mp = first_mp;
14020Sstevel@tonic-gate 			dst_ire = ire_cache_lookup_v6(&ip6h->ip6_dst,
14033448Sdh155122 			    ire->ire_zoneid, MBLK_GETLABEL(mp), ipst);
14040Sstevel@tonic-gate 		}
14050Sstevel@tonic-gate 		if (dst_ire != NULL) {
14060Sstevel@tonic-gate 			if (dst_ire->ire_flags & RTF_MULTIRT) {
14070Sstevel@tonic-gate 				/*
14080Sstevel@tonic-gate 				 * At least one resolved multirt route
14090Sstevel@tonic-gate 				 * already exists for the destination,
14100Sstevel@tonic-gate 				 * don't sent this packet: either drop it
14110Sstevel@tonic-gate 				 * or complete the pending resolution,
14120Sstevel@tonic-gate 				 * depending on the ire.
14130Sstevel@tonic-gate 				 */
14140Sstevel@tonic-gate 				drop = B_TRUE;
14150Sstevel@tonic-gate 			}
14160Sstevel@tonic-gate 			ip1dbg(("ire_add_then_send: dst_ire %p "
14170Sstevel@tonic-gate 			    "[dst %08x, gw %08x], drop %d\n",
14180Sstevel@tonic-gate 			    (void *)dst_ire,
14190Sstevel@tonic-gate 			    (dst_ire->ire_ipversion == IPV4_VERSION) ? \
14204714Ssowmini 			    ntohl(dst_ire->ire_addr) : \
14214714Ssowmini 			    ntohl(V4_PART_OF_V6(dst_ire->ire_addr_v6)),
14220Sstevel@tonic-gate 			    (dst_ire->ire_ipversion == IPV4_VERSION) ? \
14234714Ssowmini 			    ntohl(dst_ire->ire_gateway_addr) : \
14244714Ssowmini 			    ntohl(V4_PART_OF_V6(
14254714Ssowmini 			    dst_ire->ire_gateway_addr_v6)),
14260Sstevel@tonic-gate 			    drop));
14270Sstevel@tonic-gate 			ire_refrele(dst_ire);
14280Sstevel@tonic-gate 		}
14290Sstevel@tonic-gate 	}
14300Sstevel@tonic-gate 
14310Sstevel@tonic-gate 	if (!(ire->ire_marks & IRE_MARK_NOADD)) {
14324823Sseb 		/* Regular packets with cache bound ires are here. */
14334823Sseb 		(void) ire_add(&ire, NULL, NULL, NULL, B_FALSE);
14340Sstevel@tonic-gate 
14350Sstevel@tonic-gate 		if (ire == NULL) {
14360Sstevel@tonic-gate 			mp->b_prev = NULL;
14370Sstevel@tonic-gate 			mp->b_next = NULL;
14380Sstevel@tonic-gate 			MULTIRT_DEBUG_UNTAG(mp);
14390Sstevel@tonic-gate 			freemsg(mp);
14400Sstevel@tonic-gate 			return;
14410Sstevel@tonic-gate 		}
14420Sstevel@tonic-gate 		if (mp == NULL) {
14430Sstevel@tonic-gate 			ire_refrele(ire);	/* Held in ire_add_v4/v6 */
14440Sstevel@tonic-gate 			return;
14450Sstevel@tonic-gate 		}
14460Sstevel@tonic-gate 	}
14470Sstevel@tonic-gate 	if (drop) {
14480Sstevel@tonic-gate 		/*
14490Sstevel@tonic-gate 		 * If we're adding an RTF_MULTIRT ire, the resolution
14500Sstevel@tonic-gate 		 * is over: we just drop the packet.
14510Sstevel@tonic-gate 		 */
14520Sstevel@tonic-gate 		if (ire->ire_flags & RTF_MULTIRT) {
14530Sstevel@tonic-gate 			if (save_mp) {
14540Sstevel@tonic-gate 				save_mp->b_prev = NULL;
14550Sstevel@tonic-gate 				save_mp->b_next = NULL;
14560Sstevel@tonic-gate 			}
14570Sstevel@tonic-gate 			MULTIRT_DEBUG_UNTAG(mp);
14580Sstevel@tonic-gate 			freemsg(mp);
14590Sstevel@tonic-gate 		} else {
14600Sstevel@tonic-gate 			/*
14610Sstevel@tonic-gate 			 * Otherwise, we're adding the ire to a gateway
14620Sstevel@tonic-gate 			 * for a multirt route.
14630Sstevel@tonic-gate 			 * Invoke ip_newroute() to complete the resolution
14640Sstevel@tonic-gate 			 * of the route. We will then come back here and
14650Sstevel@tonic-gate 			 * finally drop this packet in the above code.
14660Sstevel@tonic-gate 			 */
14670Sstevel@tonic-gate 			if (ire->ire_ipversion == IPV4_VERSION) {
14680Sstevel@tonic-gate 				/*
14690Sstevel@tonic-gate 				 * TODO: in order for CGTP to work in non-global
14700Sstevel@tonic-gate 				 * zones, ip_newroute() must create the IRE
14710Sstevel@tonic-gate 				 * cache in the zone indicated by
14720Sstevel@tonic-gate 				 * ire->ire_zoneid.
14730Sstevel@tonic-gate 				 */
14744823Sseb 				ip_newroute(q, mp, ipha->ipha_dst,
14752733Snordmark 				    (CONN_Q(q) ? Q_TO_CONN(q) : NULL),
14763448Sdh155122 				    ire->ire_zoneid, ipst);
14770Sstevel@tonic-gate 			} else {
14782535Ssangeeta 				ASSERT(ire->ire_ipversion == IPV6_VERSION);
14790Sstevel@tonic-gate 				ip_newroute_v6(q, mp, &ip6h->ip6_dst, NULL,
14803448Sdh155122 				    NULL, ire->ire_zoneid, ipst);
14810Sstevel@tonic-gate 			}
14820Sstevel@tonic-gate 		}
14830Sstevel@tonic-gate 
14840Sstevel@tonic-gate 		ire_refrele(ire); /* As done by ire_send(). */
14850Sstevel@tonic-gate 		return;
14860Sstevel@tonic-gate 	}
14870Sstevel@tonic-gate 	/*
14880Sstevel@tonic-gate 	 * Need to remember ire_bucket here as ire_send*() may delete
14890Sstevel@tonic-gate 	 * the ire so we cannot reference it after that.
14900Sstevel@tonic-gate 	 */
14910Sstevel@tonic-gate 	irb = ire->ire_bucket;
14920Sstevel@tonic-gate 	if (ire->ire_ipversion == IPV6_VERSION) {
14930Sstevel@tonic-gate 		ire_send_v6(q, mp, ire);
14940Sstevel@tonic-gate 		/*
14950Sstevel@tonic-gate 		 * Clean up more than 1 IRE so that the clean up does not
14960Sstevel@tonic-gate 		 * need to be done every time when a new IRE is added and
14970Sstevel@tonic-gate 		 * the threshold is reached.
14980Sstevel@tonic-gate 		 */
14990Sstevel@tonic-gate 		ire_cache_cleanup(irb, ip6_ire_max_bucket_cnt, 2);
15000Sstevel@tonic-gate 	} else {
15010Sstevel@tonic-gate 		ire_send(q, mp, ire);
15020Sstevel@tonic-gate 		ire_cache_cleanup(irb, ip_ire_max_bucket_cnt, 2);
15030Sstevel@tonic-gate 	}
15040Sstevel@tonic-gate }
15050Sstevel@tonic-gate 
15060Sstevel@tonic-gate /*
15070Sstevel@tonic-gate  * Initialize the ire that is specific to IPv4 part and call
15080Sstevel@tonic-gate  * ire_init_common to finish it.
15090Sstevel@tonic-gate  */
15100Sstevel@tonic-gate ire_t *
15110Sstevel@tonic-gate ire_init(ire_t *ire, uchar_t *addr, uchar_t *mask, uchar_t *src_addr,
15124823Sseb     uchar_t *gateway, uint_t *max_fragp, nce_t *src_nce, queue_t *rfq,
15134823Sseb     queue_t *stq, ushort_t type, ipif_t *ipif, ipaddr_t cmask, uint32_t phandle,
15144823Sseb     uint32_t ihandle, uint32_t flags, const iulp_t *ulp_info, tsol_gc_t *gc,
15154823Sseb     tsol_gcgrp_t *gcgrp, ip_stack_t *ipst)
15160Sstevel@tonic-gate {
15171676Sjpk 	/*
15181676Sjpk 	 * Reject IRE security attribute creation/initialization
15191676Sjpk 	 * if system is not running in Trusted mode.
15201676Sjpk 	 */
15211676Sjpk 	if ((gc != NULL || gcgrp != NULL) && !is_system_labeled())
15221676Sjpk 		return (NULL);
15231676Sjpk 
15240Sstevel@tonic-gate 
15253448Sdh155122 	BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_alloced);
15260Sstevel@tonic-gate 
15270Sstevel@tonic-gate 	if (addr != NULL)
15280Sstevel@tonic-gate 		bcopy(addr, &ire->ire_addr, IP_ADDR_LEN);
15290Sstevel@tonic-gate 	if (src_addr != NULL)
15300Sstevel@tonic-gate 		bcopy(src_addr, &ire->ire_src_addr, IP_ADDR_LEN);
15310Sstevel@tonic-gate 	if (mask != NULL) {
15320Sstevel@tonic-gate 		bcopy(mask, &ire->ire_mask, IP_ADDR_LEN);
15330Sstevel@tonic-gate 		ire->ire_masklen = ip_mask_to_plen(ire->ire_mask);
15340Sstevel@tonic-gate 	}
15350Sstevel@tonic-gate 	if (gateway != NULL) {
15360Sstevel@tonic-gate 		bcopy(gateway, &ire->ire_gateway_addr, IP_ADDR_LEN);
15370Sstevel@tonic-gate 	}
15380Sstevel@tonic-gate 
15390Sstevel@tonic-gate 	if (type == IRE_CACHE)
15400Sstevel@tonic-gate 		ire->ire_cmask = cmask;
15410Sstevel@tonic-gate 
15421676Sjpk 	/* ire_init_common will free the mblks upon encountering any failure */
15434823Sseb 	if (!ire_init_common(ire, max_fragp, src_nce, rfq, stq, type, ipif,
15444823Sseb 	    phandle, ihandle, flags, IPV4_VERSION, ulp_info, gc, gcgrp, ipst))
15451676Sjpk 		return (NULL);
15460Sstevel@tonic-gate 
15470Sstevel@tonic-gate 	return (ire);
15480Sstevel@tonic-gate }
15490Sstevel@tonic-gate 
15500Sstevel@tonic-gate /*
15510Sstevel@tonic-gate  * Similar to ire_create except that it is called only when
15520Sstevel@tonic-gate  * we want to allocate ire as an mblk e.g. we have an external
15530Sstevel@tonic-gate  * resolver ARP.
15540Sstevel@tonic-gate  */
15550Sstevel@tonic-gate ire_t *
15560Sstevel@tonic-gate ire_create_mp(uchar_t *addr, uchar_t *mask, uchar_t *src_addr, uchar_t *gateway,
15574823Sseb     uint_t max_frag, nce_t *src_nce, queue_t *rfq, queue_t *stq, ushort_t type,
15584823Sseb     ipif_t *ipif, ipaddr_t cmask, uint32_t phandle, uint32_t ihandle,
15594823Sseb     uint32_t flags, const iulp_t *ulp_info, tsol_gc_t *gc, tsol_gcgrp_t *gcgrp,
15603448Sdh155122     ip_stack_t *ipst)
15610Sstevel@tonic-gate {
15622535Ssangeeta 	ire_t	*ire, *buf;
15630Sstevel@tonic-gate 	ire_t	*ret_ire;
15640Sstevel@tonic-gate 	mblk_t	*mp;
15652535Ssangeeta 	size_t	bufsize;
15662535Ssangeeta 	frtn_t	*frtnp;
15672535Ssangeeta 	ill_t	*ill;
15682535Ssangeeta 
15692535Ssangeeta 	bufsize = sizeof (ire_t) + sizeof (frtn_t);
15702535Ssangeeta 	buf = kmem_alloc(bufsize, KM_NOSLEEP);
15712535Ssangeeta 	if (buf == NULL) {
15722535Ssangeeta 		ip1dbg(("ire_create_mp: alloc failed\n"));
15732535Ssangeeta 		return (NULL);
15742535Ssangeeta 	}
15752535Ssangeeta 	frtnp = (frtn_t *)(buf + 1);
15762535Ssangeeta 	frtnp->free_arg = (caddr_t)buf;
15772535Ssangeeta 	frtnp->free_func = ire_freemblk;
15782535Ssangeeta 
15792535Ssangeeta 	/*
15802535Ssangeeta 	 * Allocate the new IRE. The ire created will hold a ref on
15812535Ssangeeta 	 * an nce_t after ire_nce_init, and this ref must either be
15822535Ssangeeta 	 * (a)  transferred to the ire_cache entry created when ire_add_v4
15832535Ssangeeta 	 *	is called after successful arp resolution, or,
15842535Ssangeeta 	 * (b)  released, when arp resolution fails
15852535Ssangeeta 	 * Case (b) is handled in ire_freemblk() which will be called
15862535Ssangeeta 	 * when mp is freed as a result of failed arp.
15872535Ssangeeta 	 */
15882535Ssangeeta 	mp = esballoc((unsigned char *)buf, bufsize, BPRI_MED, frtnp);
15890Sstevel@tonic-gate 	if (mp == NULL) {
15900Sstevel@tonic-gate 		ip1dbg(("ire_create_mp: alloc failed\n"));
15912535Ssangeeta 		kmem_free(buf, bufsize);
15920Sstevel@tonic-gate 		return (NULL);
15930Sstevel@tonic-gate 	}
15940Sstevel@tonic-gate 	ire = (ire_t *)mp->b_rptr;
15950Sstevel@tonic-gate 	mp->b_wptr = (uchar_t *)&ire[1];
15960Sstevel@tonic-gate 
15970Sstevel@tonic-gate 	/* Start clean. */
15980Sstevel@tonic-gate 	*ire = ire_null;
15990Sstevel@tonic-gate 	ire->ire_mp = mp;
16000Sstevel@tonic-gate 	mp->b_datap->db_type = IRE_DB_TYPE;
16012535Ssangeeta 	ire->ire_marks |= IRE_MARK_UNCACHED;
16020Sstevel@tonic-gate 
16034823Sseb 	ret_ire = ire_init(ire, addr, mask, src_addr, gateway, NULL, src_nce,
16044823Sseb 	    rfq, stq, type, ipif, cmask, phandle, ihandle, flags, ulp_info, gc,
16054823Sseb 	    gcgrp, ipst);
16060Sstevel@tonic-gate 
16072741Ssowmini 	ill = (ill_t *)(stq->q_ptr);
16080Sstevel@tonic-gate 	if (ret_ire == NULL) {
16093448Sdh155122 		/* ire_freemblk needs these set */
16102741Ssowmini 		ire->ire_stq_ifindex = ill->ill_phyint->phyint_ifindex;
16113448Sdh155122 		ire->ire_ipst = ipst;
16120Sstevel@tonic-gate 		freeb(ire->ire_mp);
16130Sstevel@tonic-gate 		return (NULL);
16140Sstevel@tonic-gate 	}
16152535Ssangeeta 	ret_ire->ire_stq_ifindex = ill->ill_phyint->phyint_ifindex;
16160Sstevel@tonic-gate 	ASSERT(ret_ire == ire);
16170Sstevel@tonic-gate 	/*
16180Sstevel@tonic-gate 	 * ire_max_frag is normally zero here and is atomically set
16190Sstevel@tonic-gate 	 * under the irebucket lock in ire_add_v[46] except for the
16200Sstevel@tonic-gate 	 * case of IRE_MARK_NOADD. In that event the the ire_max_frag
16210Sstevel@tonic-gate 	 * is non-zero here.
16220Sstevel@tonic-gate 	 */
16230Sstevel@tonic-gate 	ire->ire_max_frag = max_frag;
16240Sstevel@tonic-gate 	return (ire);
16250Sstevel@tonic-gate }
16260Sstevel@tonic-gate 
16270Sstevel@tonic-gate /*
16280Sstevel@tonic-gate  * ire_create is called to allocate and initialize a new IRE.
16290Sstevel@tonic-gate  *
16300Sstevel@tonic-gate  * NOTE : This is called as writer sometimes though not required
16310Sstevel@tonic-gate  * by this function.
16320Sstevel@tonic-gate  */
16330Sstevel@tonic-gate ire_t *
16340Sstevel@tonic-gate ire_create(uchar_t *addr, uchar_t *mask, uchar_t *src_addr, uchar_t *gateway,
16354823Sseb     uint_t *max_fragp, nce_t *src_nce, queue_t *rfq, queue_t *stq,
16364823Sseb     ushort_t type, ipif_t *ipif, ipaddr_t cmask, uint32_t phandle,
16374823Sseb     uint32_t ihandle, uint32_t flags, const iulp_t *ulp_info, tsol_gc_t *gc,
16384823Sseb     tsol_gcgrp_t *gcgrp, ip_stack_t *ipst)
16390Sstevel@tonic-gate {
16400Sstevel@tonic-gate 	ire_t	*ire;
16410Sstevel@tonic-gate 	ire_t	*ret_ire;
16420Sstevel@tonic-gate 
16430Sstevel@tonic-gate 	ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
16440Sstevel@tonic-gate 	if (ire == NULL) {
16450Sstevel@tonic-gate 		ip1dbg(("ire_create: alloc failed\n"));
16460Sstevel@tonic-gate 		return (NULL);
16470Sstevel@tonic-gate 	}
16480Sstevel@tonic-gate 	*ire = ire_null;
16490Sstevel@tonic-gate 
16504823Sseb 	ret_ire = ire_init(ire, addr, mask, src_addr, gateway, max_fragp,
16514823Sseb 	    src_nce, rfq, stq, type, ipif, cmask, phandle, ihandle, flags,
16524823Sseb 	    ulp_info, gc, gcgrp, ipst);
16530Sstevel@tonic-gate 
16540Sstevel@tonic-gate 	if (ret_ire == NULL) {
16550Sstevel@tonic-gate 		kmem_cache_free(ire_cache, ire);
16560Sstevel@tonic-gate 		return (NULL);
16570Sstevel@tonic-gate 	}
16580Sstevel@tonic-gate 	ASSERT(ret_ire == ire);
16590Sstevel@tonic-gate 	return (ire);
16600Sstevel@tonic-gate }
16610Sstevel@tonic-gate 
16620Sstevel@tonic-gate 
16630Sstevel@tonic-gate /*
16640Sstevel@tonic-gate  * Common to IPv4 and IPv6
16650Sstevel@tonic-gate  */
16661676Sjpk boolean_t
16674714Ssowmini ire_init_common(ire_t *ire, uint_t *max_fragp, nce_t *src_nce, queue_t *rfq,
16684823Sseb     queue_t *stq, ushort_t type, ipif_t *ipif, uint32_t phandle,
16694714Ssowmini     uint32_t ihandle, uint32_t flags, uchar_t ipversion, const iulp_t *ulp_info,
16704714Ssowmini     tsol_gc_t *gc, tsol_gcgrp_t *gcgrp, ip_stack_t *ipst)
16710Sstevel@tonic-gate {
16720Sstevel@tonic-gate 	ire->ire_max_fragp = max_fragp;
16733448Sdh155122 	ire->ire_frag_flag |= (ipst->ips_ip_path_mtu_discovery) ? IPH_DF : 0;
16740Sstevel@tonic-gate 
16751676Sjpk #ifdef DEBUG
16761676Sjpk 	if (ipif != NULL) {
16770Sstevel@tonic-gate 		if (ipif->ipif_isv6)
16780Sstevel@tonic-gate 			ASSERT(ipversion == IPV6_VERSION);
16790Sstevel@tonic-gate 		else
16800Sstevel@tonic-gate 			ASSERT(ipversion == IPV4_VERSION);
16810Sstevel@tonic-gate 	}
16821676Sjpk #endif /* DEBUG */
16831676Sjpk 
16841676Sjpk 	/*
16851676Sjpk 	 * Create/initialize IRE security attribute only in Trusted mode;
16861676Sjpk 	 * if the passed in gc/gcgrp is non-NULL, we expect that the caller
16871676Sjpk 	 * has held a reference to it and will release it when this routine
16881676Sjpk 	 * returns a failure, otherwise we own the reference.  We do this
16891676Sjpk 	 * prior to initializing the rest IRE fields.
16902416Sjarrett 	 *
16912416Sjarrett 	 * Don't allocate ire_gw_secattr for the resolver case to prevent
16922416Sjarrett 	 * memory leak (in case of external resolution failure). We'll
16932416Sjarrett 	 * allocate it after a successful external resolution, in ire_add().
16942416Sjarrett 	 * Note that ire->ire_mp != NULL here means this ire is headed
16952416Sjarrett 	 * to an external resolver.
16961676Sjpk 	 */
16971676Sjpk 	if (is_system_labeled()) {
16981676Sjpk 		if ((type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST |
16991676Sjpk 		    IRE_INTERFACE)) != 0) {
17001676Sjpk 			/* release references on behalf of caller */
17011676Sjpk 			if (gc != NULL)
17021676Sjpk 				GC_REFRELE(gc);
17031676Sjpk 			if (gcgrp != NULL)
17041676Sjpk 				GCGRP_REFRELE(gcgrp);
17052416Sjarrett 		} else if ((ire->ire_mp == NULL) &&
17062416Sjarrett 		    tsol_ire_init_gwattr(ire, ipversion, gc, gcgrp) != 0) {
17071676Sjpk 			return (B_FALSE);
17081676Sjpk 		}
17091676Sjpk 	}
17100Sstevel@tonic-gate 
17110Sstevel@tonic-gate 	ire->ire_stq = stq;
17120Sstevel@tonic-gate 	ire->ire_rfq = rfq;
17130Sstevel@tonic-gate 	ire->ire_type = type;
17140Sstevel@tonic-gate 	ire->ire_flags = RTF_UP | flags;
17150Sstevel@tonic-gate 	ire->ire_ident = TICK_TO_MSEC(lbolt);
17160Sstevel@tonic-gate 	bcopy(ulp_info, &ire->ire_uinfo, sizeof (iulp_t));
17170Sstevel@tonic-gate 
17180Sstevel@tonic-gate 	ire->ire_tire_mark = ire->ire_ob_pkt_count + ire->ire_ib_pkt_count;
17190Sstevel@tonic-gate 	ire->ire_last_used_time = lbolt;
17200Sstevel@tonic-gate 	ire->ire_create_time = (uint32_t)gethrestime_sec();
17210Sstevel@tonic-gate 
17220Sstevel@tonic-gate 	/*
17230Sstevel@tonic-gate 	 * If this IRE is an IRE_CACHE, inherit the handles from the
17240Sstevel@tonic-gate 	 * parent IREs. For others in the forwarding table, assign appropriate
17250Sstevel@tonic-gate 	 * new ones.
17260Sstevel@tonic-gate 	 *
17270Sstevel@tonic-gate 	 * The mutex protecting ire_handle is because ire_create is not always
17280Sstevel@tonic-gate 	 * called as a writer.
17290Sstevel@tonic-gate 	 */
17300Sstevel@tonic-gate 	if (ire->ire_type & IRE_OFFSUBNET) {
17313448Sdh155122 		mutex_enter(&ipst->ips_ire_handle_lock);
17323448Sdh155122 		ire->ire_phandle = (uint32_t)ipst->ips_ire_handle++;
17333448Sdh155122 		mutex_exit(&ipst->ips_ire_handle_lock);
17340Sstevel@tonic-gate 	} else if (ire->ire_type & IRE_INTERFACE) {
17353448Sdh155122 		mutex_enter(&ipst->ips_ire_handle_lock);
17363448Sdh155122 		ire->ire_ihandle = (uint32_t)ipst->ips_ire_handle++;
17373448Sdh155122 		mutex_exit(&ipst->ips_ire_handle_lock);
17380Sstevel@tonic-gate 	} else if (ire->ire_type == IRE_CACHE) {
17390Sstevel@tonic-gate 		ire->ire_phandle = phandle;
17400Sstevel@tonic-gate 		ire->ire_ihandle = ihandle;
17410Sstevel@tonic-gate 	}
17420Sstevel@tonic-gate 	ire->ire_ipif = ipif;
17430Sstevel@tonic-gate 	if (ipif != NULL) {
17440Sstevel@tonic-gate 		ire->ire_ipif_seqid = ipif->ipif_seqid;
17450Sstevel@tonic-gate 		ire->ire_zoneid = ipif->ipif_zoneid;
17460Sstevel@tonic-gate 	} else {
17470Sstevel@tonic-gate 		ire->ire_zoneid = GLOBAL_ZONEID;
17480Sstevel@tonic-gate 	}
17490Sstevel@tonic-gate 	ire->ire_ipversion = ipversion;
17502535Ssangeeta 	mutex_init(&ire->ire_lock, NULL, MUTEX_DEFAULT, NULL);
17512535Ssangeeta 	if (ipversion == IPV4_VERSION) {
17524714Ssowmini 		/*
17534714Ssowmini 		 * IPv6 initializes the ire_nce in ire_add_v6, which expects
17544714Ssowmini 		 * to find the ire_nce to be null when it is called.
17554714Ssowmini 		 */
17564714Ssowmini 		if (ire_nce_init(ire, src_nce) != 0) {
17572535Ssangeeta 			/* some failure occurred. propagate error back */
17582535Ssangeeta 			return (B_FALSE);
17592535Ssangeeta 		}
17602535Ssangeeta 	}
17610Sstevel@tonic-gate 	ire->ire_refcnt = 1;
17623448Sdh155122 	ire->ire_ipst = ipst;	/* No netstack_hold */
17635023Scarlsonj 	ire->ire_trace_disable = B_FALSE;
17641676Sjpk 
17651676Sjpk 	return (B_TRUE);
17660Sstevel@tonic-gate }
17670Sstevel@tonic-gate 
17680Sstevel@tonic-gate /*
17690Sstevel@tonic-gate  * This routine is called repeatedly by ipif_up to create broadcast IREs.
17700Sstevel@tonic-gate  * It is passed a pointer to a slot in an IRE pointer array into which to
17710Sstevel@tonic-gate  * place the pointer to the new IRE, if indeed we create one.  If the
17720Sstevel@tonic-gate  * IRE corresponding to the address passed in would be a duplicate of an
17730Sstevel@tonic-gate  * existing one, we don't create the new one.  irep is incremented before
17740Sstevel@tonic-gate  * return only if we do create a new IRE.  (Always called as writer.)
17750Sstevel@tonic-gate  *
17760Sstevel@tonic-gate  * Note that with the "match_flags" parameter, we can match on either
17770Sstevel@tonic-gate  * a particular logical interface (MATCH_IRE_IPIF) or for all logical
17780Sstevel@tonic-gate  * interfaces for a given physical interface (MATCH_IRE_ILL).  Currently,
17790Sstevel@tonic-gate  * we only create broadcast ire's on a per physical interface basis. If
17800Sstevel@tonic-gate  * someone is going to be mucking with logical interfaces, it is important
17810Sstevel@tonic-gate  * to call "ipif_check_bcast_ires()" to make sure that any change to a
17820Sstevel@tonic-gate  * logical interface will not cause critical broadcast IRE's to be deleted.
17830Sstevel@tonic-gate  */
17840Sstevel@tonic-gate ire_t **
17850Sstevel@tonic-gate ire_check_and_create_bcast(ipif_t *ipif, ipaddr_t  addr, ire_t **irep,
17860Sstevel@tonic-gate     int match_flags)
17870Sstevel@tonic-gate {
17880Sstevel@tonic-gate 	ire_t *ire;
17890Sstevel@tonic-gate 	uint64_t check_flags = IPIF_DEPRECATED | IPIF_NOLOCAL | IPIF_ANYCAST;
17903448Sdh155122 	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;
17910Sstevel@tonic-gate 
17920Sstevel@tonic-gate 	/*
17930Sstevel@tonic-gate 	 * No broadcast IREs for the LOOPBACK interface
17940Sstevel@tonic-gate 	 * or others such as point to point and IPIF_NOXMIT.
17950Sstevel@tonic-gate 	 */
17960Sstevel@tonic-gate 	if (!(ipif->ipif_flags & IPIF_BROADCAST) ||
17970Sstevel@tonic-gate 	    (ipif->ipif_flags & IPIF_NOXMIT))
17980Sstevel@tonic-gate 		return (irep);
17990Sstevel@tonic-gate 
18000Sstevel@tonic-gate 	/* If this would be a duplicate, don't bother. */
18010Sstevel@tonic-gate 	if ((ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ipif,
18023448Sdh155122 	    ipif->ipif_zoneid, NULL, match_flags, ipst)) != NULL) {
18030Sstevel@tonic-gate 		/*
18040Sstevel@tonic-gate 		 * We look for non-deprecated (and non-anycast, non-nolocal)
18050Sstevel@tonic-gate 		 * ipifs as the best choice. ipifs with check_flags matching
18060Sstevel@tonic-gate 		 * (deprecated, etc) are used only if non-deprecated ipifs
18070Sstevel@tonic-gate 		 * are not available. if the existing ire's ipif is deprecated
18080Sstevel@tonic-gate 		 * and the new ipif is non-deprecated, switch to the new ipif
18090Sstevel@tonic-gate 		 */
18100Sstevel@tonic-gate 		if ((!(ire->ire_ipif->ipif_flags & check_flags)) ||
18110Sstevel@tonic-gate 		    (ipif->ipif_flags & check_flags)) {
18120Sstevel@tonic-gate 			ire_refrele(ire);
18130Sstevel@tonic-gate 			return (irep);
18140Sstevel@tonic-gate 		}
18150Sstevel@tonic-gate 		/*
18160Sstevel@tonic-gate 		 * Bcast ires exist in pairs. Both have to be deleted,
18170Sstevel@tonic-gate 		 * Since we are exclusive we can make the above assertion.
18180Sstevel@tonic-gate 		 * The 1st has to be refrele'd since it was ctable_lookup'd.
18190Sstevel@tonic-gate 		 */
18200Sstevel@tonic-gate 		ASSERT(IAM_WRITER_IPIF(ipif));
18210Sstevel@tonic-gate 		ASSERT(ire->ire_next->ire_addr == ire->ire_addr);
18220Sstevel@tonic-gate 		ire_delete(ire->ire_next);
18230Sstevel@tonic-gate 		ire_delete(ire);
18240Sstevel@tonic-gate 		ire_refrele(ire);
18250Sstevel@tonic-gate 	}
18260Sstevel@tonic-gate 
18270Sstevel@tonic-gate 	irep = ire_create_bcast(ipif, addr, irep);
18280Sstevel@tonic-gate 
18290Sstevel@tonic-gate 	return (irep);
18300Sstevel@tonic-gate }
18310Sstevel@tonic-gate 
18320Sstevel@tonic-gate uint_t ip_loopback_mtu = IP_LOOPBACK_MTU;
18330Sstevel@tonic-gate 
18340Sstevel@tonic-gate /*
18350Sstevel@tonic-gate  * This routine is called from ipif_check_bcast_ires and ire_check_bcast.
18360Sstevel@tonic-gate  * It leaves all the verifying and deleting to those routines. So it always
18370Sstevel@tonic-gate  * creates 2 bcast ires and chains them into the ire array passed in.
18380Sstevel@tonic-gate  */
18390Sstevel@tonic-gate ire_t **
18400Sstevel@tonic-gate ire_create_bcast(ipif_t *ipif, ipaddr_t  addr, ire_t **irep)
18410Sstevel@tonic-gate {
18423448Sdh155122 	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;
18433448Sdh155122 
18440Sstevel@tonic-gate 	*irep++ = ire_create(
18450Sstevel@tonic-gate 	    (uchar_t *)&addr,			/* dest addr */
18460Sstevel@tonic-gate 	    (uchar_t *)&ip_g_all_ones,		/* mask */
18470Sstevel@tonic-gate 	    (uchar_t *)&ipif->ipif_src_addr,	/* source addr */
18480Sstevel@tonic-gate 	    NULL,				/* no gateway */
18490Sstevel@tonic-gate 	    &ipif->ipif_mtu,			/* max frag */
18504714Ssowmini 	    NULL,				/* no src nce */
18510Sstevel@tonic-gate 	    ipif->ipif_rq,			/* recv-from queue */
18520Sstevel@tonic-gate 	    ipif->ipif_wq,			/* send-to queue */
18530Sstevel@tonic-gate 	    IRE_BROADCAST,
18540Sstevel@tonic-gate 	    ipif,
18550Sstevel@tonic-gate 	    0,
18560Sstevel@tonic-gate 	    0,
18570Sstevel@tonic-gate 	    0,
18580Sstevel@tonic-gate 	    0,
18591676Sjpk 	    &ire_uinfo_null,
18601676Sjpk 	    NULL,
18613448Sdh155122 	    NULL,
18623448Sdh155122 	    ipst);
18630Sstevel@tonic-gate 
18640Sstevel@tonic-gate 	*irep++ = ire_create(
18654714Ssowmini 	    (uchar_t *)&addr,			/* dest address */
18664714Ssowmini 	    (uchar_t *)&ip_g_all_ones,		/* mask */
18674714Ssowmini 	    (uchar_t *)&ipif->ipif_src_addr,	/* source address */
18684714Ssowmini 	    NULL,				/* no gateway */
18694714Ssowmini 	    &ip_loopback_mtu,			/* max frag size */
18704714Ssowmini 	    NULL,				/* no src_nce */
18714714Ssowmini 	    ipif->ipif_rq,			/* recv-from queue */
18724714Ssowmini 	    NULL,				/* no send-to queue */
18734714Ssowmini 	    IRE_BROADCAST,			/* Needed for fanout in wput */
18744714Ssowmini 	    ipif,
18754714Ssowmini 	    0,
18764714Ssowmini 	    0,
18774714Ssowmini 	    0,
18784714Ssowmini 	    0,
18794714Ssowmini 	    &ire_uinfo_null,
18804714Ssowmini 	    NULL,
18814714Ssowmini 	    NULL,
18824714Ssowmini 	    ipst);
18830Sstevel@tonic-gate 
18840Sstevel@tonic-gate 	return (irep);
18850Sstevel@tonic-gate }
18860Sstevel@tonic-gate 
18870Sstevel@tonic-gate /*
18880Sstevel@tonic-gate  * ire_walk routine to delete or update any IRE_CACHE that might contain
18890Sstevel@tonic-gate  * stale information.
18900Sstevel@tonic-gate  * The flags state which entries to delete or update.
18910Sstevel@tonic-gate  * Garbage collection is done separately using kmem alloc callbacks to
18920Sstevel@tonic-gate  * ip_trash_ire_reclaim.
18930Sstevel@tonic-gate  * Used for both IPv4 and IPv6. However, IPv6 only uses FLUSH_MTU_TIME
18940Sstevel@tonic-gate  * since other stale information is cleaned up using NUD.
18950Sstevel@tonic-gate  */
18960Sstevel@tonic-gate void
18970Sstevel@tonic-gate ire_expire(ire_t *ire, char *arg)
18980Sstevel@tonic-gate {
18993448Sdh155122 	ire_expire_arg_t	*ieap = (ire_expire_arg_t *)(uintptr_t)arg;
19003448Sdh155122 	ill_t			*stq_ill;
19013448Sdh155122 	int			flush_flags = ieap->iea_flush_flag;
19023448Sdh155122 	ip_stack_t		*ipst = ieap->iea_ipst;
19030Sstevel@tonic-gate 
19040Sstevel@tonic-gate 	if ((flush_flags & FLUSH_REDIRECT_TIME) &&
19053004Sdd193516 	    (ire->ire_flags & RTF_DYNAMIC)) {
19060Sstevel@tonic-gate 		/* Make sure we delete the corresponding IRE_CACHE */
19070Sstevel@tonic-gate 		ip1dbg(("ire_expire: all redirects\n"));
19083448Sdh155122 		ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
19090Sstevel@tonic-gate 		ire_delete(ire);
19103448Sdh155122 		atomic_dec_32(&ipst->ips_ip_redirect_cnt);
19110Sstevel@tonic-gate 		return;
19120Sstevel@tonic-gate 	}
19130Sstevel@tonic-gate 	if (ire->ire_type != IRE_CACHE)
19140Sstevel@tonic-gate 		return;
19150Sstevel@tonic-gate 
19160Sstevel@tonic-gate 	if (flush_flags & FLUSH_ARP_TIME) {
19170Sstevel@tonic-gate 		/*
19180Sstevel@tonic-gate 		 * Remove all IRE_CACHE.
19190Sstevel@tonic-gate 		 * Verify that create time is more than
19200Sstevel@tonic-gate 		 * ip_ire_arp_interval milliseconds ago.
19210Sstevel@tonic-gate 		 */
19223448Sdh155122 		if (NCE_EXPIRED(ire->ire_nce, ipst)) {
19230Sstevel@tonic-gate 			ire_delete(ire);
19240Sstevel@tonic-gate 			return;
19250Sstevel@tonic-gate 		}
19260Sstevel@tonic-gate 	}
19270Sstevel@tonic-gate 
19283448Sdh155122 	if (ipst->ips_ip_path_mtu_discovery && (flush_flags & FLUSH_MTU_TIME) &&
19290Sstevel@tonic-gate 	    (ire->ire_ipif != NULL)) {
19300Sstevel@tonic-gate 		/* Increase pmtu if it is less than the interface mtu */
19310Sstevel@tonic-gate 		mutex_enter(&ire->ire_lock);
19320Sstevel@tonic-gate 		/*
19330Sstevel@tonic-gate 		 * If the ipif is a vni (whose mtu is 0, since it's virtual)
19340Sstevel@tonic-gate 		 * get the mtu from the sending interfaces' ipif
19350Sstevel@tonic-gate 		 */
19360Sstevel@tonic-gate 		if (IS_VNI(ire->ire_ipif->ipif_ill)) {
19370Sstevel@tonic-gate 			stq_ill = ire->ire_stq->q_ptr;
19380Sstevel@tonic-gate 			ire->ire_max_frag = MIN(stq_ill->ill_ipif->ipif_mtu,
19390Sstevel@tonic-gate 			    IP_MAXPACKET);
19400Sstevel@tonic-gate 		} else {
19410Sstevel@tonic-gate 			ire->ire_max_frag = MIN(ire->ire_ipif->ipif_mtu,
19420Sstevel@tonic-gate 			    IP_MAXPACKET);
19430Sstevel@tonic-gate 		}
19440Sstevel@tonic-gate 		ire->ire_frag_flag |= IPH_DF;
19450Sstevel@tonic-gate 		mutex_exit(&ire->ire_lock);
19460Sstevel@tonic-gate 	}
19470Sstevel@tonic-gate }
19480Sstevel@tonic-gate 
19490Sstevel@tonic-gate /*
19500Sstevel@tonic-gate  * Return any local address.  We use this to target ourselves
19510Sstevel@tonic-gate  * when the src address was specified as 'default'.
19520Sstevel@tonic-gate  * Preference for IRE_LOCAL entries.
19530Sstevel@tonic-gate  */
19540Sstevel@tonic-gate ire_t *
19553448Sdh155122 ire_lookup_local(zoneid_t zoneid, ip_stack_t *ipst)
19560Sstevel@tonic-gate {
19570Sstevel@tonic-gate 	ire_t	*ire;
19580Sstevel@tonic-gate 	irb_t	*irb;
19590Sstevel@tonic-gate 	ire_t	*maybe = NULL;
19600Sstevel@tonic-gate 	int i;
19610Sstevel@tonic-gate 
19623448Sdh155122 	for (i = 0; i < ipst->ips_ip_cache_table_size;  i++) {
19633448Sdh155122 		irb = &ipst->ips_ip_cache_table[i];
19640Sstevel@tonic-gate 		if (irb->irb_ire == NULL)
19650Sstevel@tonic-gate 			continue;
19660Sstevel@tonic-gate 		rw_enter(&irb->irb_lock, RW_READER);
19670Sstevel@tonic-gate 		for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
19680Sstevel@tonic-gate 			if ((ire->ire_marks & IRE_MARK_CONDEMNED) ||
19691676Sjpk 			    (ire->ire_zoneid != zoneid &&
19701676Sjpk 			    ire->ire_zoneid != ALL_ZONES))
19710Sstevel@tonic-gate 				continue;
19720Sstevel@tonic-gate 			switch (ire->ire_type) {
19730Sstevel@tonic-gate 			case IRE_LOOPBACK:
19740Sstevel@tonic-gate 				if (maybe == NULL) {
19750Sstevel@tonic-gate 					IRE_REFHOLD(ire);
19760Sstevel@tonic-gate 					maybe = ire;
19770Sstevel@tonic-gate 				}
19780Sstevel@tonic-gate 				break;
19790Sstevel@tonic-gate 			case IRE_LOCAL:
19800Sstevel@tonic-gate 				if (maybe != NULL) {
19810Sstevel@tonic-gate 					ire_refrele(maybe);
19820Sstevel@tonic-gate 				}
19830Sstevel@tonic-gate 				IRE_REFHOLD(ire);
19840Sstevel@tonic-gate 				rw_exit(&irb->irb_lock);
19850Sstevel@tonic-gate 				return (ire);
19860Sstevel@tonic-gate 			}
19870Sstevel@tonic-gate 		}
19880Sstevel@tonic-gate 		rw_exit(&irb->irb_lock);
19890Sstevel@tonic-gate 	}
19900Sstevel@tonic-gate 	return (maybe);
19910Sstevel@tonic-gate }
19920Sstevel@tonic-gate 
19930Sstevel@tonic-gate /*
19940Sstevel@tonic-gate  * If the specified IRE is associated with a particular ILL, return
19950Sstevel@tonic-gate  * that ILL pointer (May be called as writer.).
19960Sstevel@tonic-gate  *
19970Sstevel@tonic-gate  * NOTE : This is not a generic function that can be used always.
19980Sstevel@tonic-gate  * This function always returns the ill of the outgoing packets
19990Sstevel@tonic-gate  * if this ire is used.
20000Sstevel@tonic-gate  */
20010Sstevel@tonic-gate ill_t *
20021676Sjpk ire_to_ill(const ire_t *ire)
20030Sstevel@tonic-gate {
20040Sstevel@tonic-gate 	ill_t *ill = NULL;
20050Sstevel@tonic-gate 
20060Sstevel@tonic-gate 	/*
20070Sstevel@tonic-gate 	 * 1) For an IRE_CACHE, ire_ipif is the one where it obtained
20080Sstevel@tonic-gate 	 *    the source address from. ire_stq is the one where the
20090Sstevel@tonic-gate 	 *    packets will be sent out on. We return that here.
20100Sstevel@tonic-gate 	 *
20110Sstevel@tonic-gate 	 * 2) IRE_BROADCAST normally has a loopback and a non-loopback
20120Sstevel@tonic-gate 	 *    copy and they always exist next to each other with loopback
20130Sstevel@tonic-gate 	 *    copy being the first one. If we are called on the non-loopback
20140Sstevel@tonic-gate 	 *    copy, return the one pointed by ire_stq. If it was called on
20150Sstevel@tonic-gate 	 *    a loopback copy, we still return the one pointed by the next
20160Sstevel@tonic-gate 	 *    ire's ire_stq pointer i.e the one pointed by the non-loopback
20170Sstevel@tonic-gate 	 *    copy. We don't want use ire_ipif as it might represent the
20180Sstevel@tonic-gate 	 *    source address (if we borrow source addresses for
20190Sstevel@tonic-gate 	 *    IRE_BROADCASTS in the future).
20200Sstevel@tonic-gate 	 *    However if an interface is currently coming up, the above
20210Sstevel@tonic-gate 	 *    condition may not hold during that period since the ires
20220Sstevel@tonic-gate 	 *    are added one at a time. Thus one of the pair could have been
20230Sstevel@tonic-gate 	 *    added and the other not yet added.
20242906Snordmark 	 * 3) For many other IREs (e.g., IRE_LOCAL), ire_rfq indicates the ill.
20252906Snordmark 	 * 4) For all others return the ones pointed by ire_ipif->ipif_ill.
20262906Snordmark 	 *    That handles IRE_LOOPBACK.
20270Sstevel@tonic-gate 	 */
20280Sstevel@tonic-gate 
20290Sstevel@tonic-gate 	if (ire->ire_type == IRE_CACHE) {
20300Sstevel@tonic-gate 		ill = (ill_t *)ire->ire_stq->q_ptr;
20310Sstevel@tonic-gate 	} else if (ire->ire_type == IRE_BROADCAST) {
20320Sstevel@tonic-gate 		if (ire->ire_stq != NULL) {
20330Sstevel@tonic-gate 			ill = (ill_t *)ire->ire_stq->q_ptr;
20340Sstevel@tonic-gate 		} else {
20350Sstevel@tonic-gate 			ire_t  *ire_next;
20360Sstevel@tonic-gate 
20370Sstevel@tonic-gate 			ire_next = ire->ire_next;
20380Sstevel@tonic-gate 			if (ire_next != NULL &&
20390Sstevel@tonic-gate 			    ire_next->ire_type == IRE_BROADCAST &&
20400Sstevel@tonic-gate 			    ire_next->ire_addr == ire->ire_addr &&
20410Sstevel@tonic-gate 			    ire_next->ire_ipif == ire->ire_ipif) {
20420Sstevel@tonic-gate 				ill = (ill_t *)ire_next->ire_stq->q_ptr;
20430Sstevel@tonic-gate 			}
20440Sstevel@tonic-gate 		}
20452906Snordmark 	} else if (ire->ire_rfq != NULL) {
20462906Snordmark 		ill = ire->ire_rfq->q_ptr;
20470Sstevel@tonic-gate 	} else if (ire->ire_ipif != NULL) {
20480Sstevel@tonic-gate 		ill = ire->ire_ipif->ipif_ill;
20490Sstevel@tonic-gate 	}
20500Sstevel@tonic-gate 	return (ill);
20510Sstevel@tonic-gate }
20520Sstevel@tonic-gate 
20530Sstevel@tonic-gate /* Arrange to call the specified function for every IRE in the world. */
20540Sstevel@tonic-gate void
20553448Sdh155122 ire_walk(pfv_t func, void *arg, ip_stack_t *ipst)
20560Sstevel@tonic-gate {
20573448Sdh155122 	ire_walk_ipvers(func, arg, 0, ALL_ZONES, ipst);
20580Sstevel@tonic-gate }
20590Sstevel@tonic-gate 
20600Sstevel@tonic-gate void
20613448Sdh155122 ire_walk_v4(pfv_t func, void *arg, zoneid_t zoneid, ip_stack_t *ipst)
20620Sstevel@tonic-gate {
20633448Sdh155122 	ire_walk_ipvers(func, arg, IPV4_VERSION, zoneid, ipst);
20640Sstevel@tonic-gate }
20650Sstevel@tonic-gate 
20660Sstevel@tonic-gate void
20673448Sdh155122 ire_walk_v6(pfv_t func, void *arg, zoneid_t zoneid, ip_stack_t *ipst)
20680Sstevel@tonic-gate {
20693448Sdh155122 	ire_walk_ipvers(func, arg, IPV6_VERSION, zoneid, ipst);
20700Sstevel@tonic-gate }
20710Sstevel@tonic-gate 
20720Sstevel@tonic-gate /*
20730Sstevel@tonic-gate  * Walk a particular version. version == 0 means both v4 and v6.
20740Sstevel@tonic-gate  */
20750Sstevel@tonic-gate static void
20763448Sdh155122 ire_walk_ipvers(pfv_t func, void *arg, uchar_t vers, zoneid_t zoneid,
20773448Sdh155122     ip_stack_t *ipst)
20780Sstevel@tonic-gate {
20790Sstevel@tonic-gate 	if (vers != IPV6_VERSION) {
20802535Ssangeeta 		/*
20812535Ssangeeta 		 * ip_forwarding_table variable doesn't matter for IPv4 since
20823448Sdh155122 		 * ire_walk_ill_tables uses ips_ip_ftable for IPv4.
20832535Ssangeeta 		 */
20840Sstevel@tonic-gate 		ire_walk_ill_tables(0, 0, func, arg, IP_MASK_TABLE_SIZE,
20852535Ssangeeta 		    0, NULL,
20863448Sdh155122 		    ipst->ips_ip_cache_table_size, ipst->ips_ip_cache_table,
20873448Sdh155122 		    NULL, zoneid, ipst);
20880Sstevel@tonic-gate 	}
20890Sstevel@tonic-gate 	if (vers != IPV4_VERSION) {
20900Sstevel@tonic-gate 		ire_walk_ill_tables(0, 0, func, arg, IP6_MASK_TABLE_SIZE,
20913448Sdh155122 		    ipst->ips_ip6_ftable_hash_size,
20923448Sdh155122 		    ipst->ips_ip_forwarding_table_v6,
20933448Sdh155122 		    ipst->ips_ip6_cache_table_size,
20943448Sdh155122 		    ipst->ips_ip_cache_table_v6, NULL, zoneid, ipst);
20950Sstevel@tonic-gate 	}
20960Sstevel@tonic-gate }
20970Sstevel@tonic-gate 
20980Sstevel@tonic-gate /*
20990Sstevel@tonic-gate  * Arrange to call the specified
21000Sstevel@tonic-gate  * function for every IRE that matches the ill.
21010Sstevel@tonic-gate  */
21020Sstevel@tonic-gate void
21031676Sjpk ire_walk_ill(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg,
21040Sstevel@tonic-gate     ill_t *ill)
21050Sstevel@tonic-gate {
21060Sstevel@tonic-gate 	ire_walk_ill_ipvers(match_flags, ire_type, func, arg, 0, ill);
21070Sstevel@tonic-gate }
21080Sstevel@tonic-gate 
21090Sstevel@tonic-gate void
21101676Sjpk ire_walk_ill_v4(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg,
21110Sstevel@tonic-gate     ill_t *ill)
21120Sstevel@tonic-gate {
21130Sstevel@tonic-gate 	ire_walk_ill_ipvers(match_flags, ire_type, func, arg, IPV4_VERSION,
21140Sstevel@tonic-gate 	    ill);
21150Sstevel@tonic-gate }
21160Sstevel@tonic-gate 
21170Sstevel@tonic-gate void
21181676Sjpk ire_walk_ill_v6(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg,
21190Sstevel@tonic-gate     ill_t *ill)
21200Sstevel@tonic-gate {
21210Sstevel@tonic-gate 	ire_walk_ill_ipvers(match_flags, ire_type, func, arg, IPV6_VERSION,
21220Sstevel@tonic-gate 	    ill);
21230Sstevel@tonic-gate }
21240Sstevel@tonic-gate 
21250Sstevel@tonic-gate /*
21260Sstevel@tonic-gate  * Walk a particular ill and version. version == 0 means both v4 and v6.
21270Sstevel@tonic-gate  */
21280Sstevel@tonic-gate static void
21290Sstevel@tonic-gate ire_walk_ill_ipvers(uint_t match_flags, uint_t ire_type, pfv_t func,
21301676Sjpk     void *arg, uchar_t vers, ill_t *ill)
21310Sstevel@tonic-gate {
21323448Sdh155122 	ip_stack_t	*ipst = ill->ill_ipst;
21333448Sdh155122 
21340Sstevel@tonic-gate 	if (vers != IPV6_VERSION) {
21350Sstevel@tonic-gate 		ire_walk_ill_tables(match_flags, ire_type, func, arg,
21362535Ssangeeta 		    IP_MASK_TABLE_SIZE, 0,
21373448Sdh155122 		    NULL, ipst->ips_ip_cache_table_size,
21383448Sdh155122 		    ipst->ips_ip_cache_table, ill, ALL_ZONES, ipst);
21390Sstevel@tonic-gate 	}
21400Sstevel@tonic-gate 	if (vers != IPV4_VERSION) {
21410Sstevel@tonic-gate 		ire_walk_ill_tables(match_flags, ire_type, func, arg,
21423448Sdh155122 		    IP6_MASK_TABLE_SIZE, ipst->ips_ip6_ftable_hash_size,
21433448Sdh155122 		    ipst->ips_ip_forwarding_table_v6,
21443448Sdh155122 		    ipst->ips_ip6_cache_table_size,
21453448Sdh155122 		    ipst->ips_ip_cache_table_v6, ill, ALL_ZONES, ipst);
21460Sstevel@tonic-gate 	}
21470Sstevel@tonic-gate }
21480Sstevel@tonic-gate 
21492535Ssangeeta boolean_t
21500Sstevel@tonic-gate ire_walk_ill_match(uint_t match_flags, uint_t ire_type, ire_t *ire,
21513448Sdh155122     ill_t *ill, zoneid_t zoneid, ip_stack_t *ipst)
21520Sstevel@tonic-gate {
21530Sstevel@tonic-gate 	ill_t *ire_stq_ill = NULL;
21540Sstevel@tonic-gate 	ill_t *ire_ipif_ill = NULL;
21550Sstevel@tonic-gate 	ill_group_t *ire_ill_group = NULL;
21560Sstevel@tonic-gate 
21570Sstevel@tonic-gate 	ASSERT(match_flags != 0 || zoneid != ALL_ZONES);
21580Sstevel@tonic-gate 	/*
21594972Smeem 	 * MATCH_IRE_ILL/MATCH_IRE_ILL_GROUP : We match both on ill
21600Sstevel@tonic-gate 	 *    pointed by ire_stq and ire_ipif. Only in the case of
21610Sstevel@tonic-gate 	 *    IRE_CACHEs can ire_stq and ire_ipif be pointing to
21620Sstevel@tonic-gate 	 *    different ills. But we want to keep this function generic
21630Sstevel@tonic-gate 	 *    enough for future use. So, we always try to match on both.
21640Sstevel@tonic-gate 	 *    The only caller of this function ire_walk_ill_tables, will
21650Sstevel@tonic-gate 	 *    call "func" after we return from this function. We expect
21660Sstevel@tonic-gate 	 *    "func" to do the right filtering of ires in this case.
21670Sstevel@tonic-gate 	 *
21680Sstevel@tonic-gate 	 * NOTE : In the case of MATCH_IRE_ILL_GROUP, groups
21690Sstevel@tonic-gate 	 * pointed by ire_stq and ire_ipif should always be the same.
21700Sstevel@tonic-gate 	 * So, we just match on only one of them.
21710Sstevel@tonic-gate 	 */
21720Sstevel@tonic-gate 	if (match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP)) {
21730Sstevel@tonic-gate 		if (ire->ire_stq != NULL)
21740Sstevel@tonic-gate 			ire_stq_ill = (ill_t *)ire->ire_stq->q_ptr;
21750Sstevel@tonic-gate 		if (ire->ire_ipif != NULL)
21760Sstevel@tonic-gate 			ire_ipif_ill = ire->ire_ipif->ipif_ill;
21770Sstevel@tonic-gate 		if (ire_stq_ill != NULL)
21780Sstevel@tonic-gate 			ire_ill_group = ire_stq_ill->ill_group;
21790Sstevel@tonic-gate 		if ((ire_ill_group == NULL) && (ire_ipif_ill != NULL))
21800Sstevel@tonic-gate 			ire_ill_group = ire_ipif_ill->ill_group;
21810Sstevel@tonic-gate 	}
21820Sstevel@tonic-gate 
21830Sstevel@tonic-gate 	if (zoneid != ALL_ZONES) {
21840Sstevel@tonic-gate 		/*
21850Sstevel@tonic-gate 		 * We're walking the IREs for a specific zone. The only relevant
21860Sstevel@tonic-gate 		 * IREs are:
21870Sstevel@tonic-gate 		 * - all IREs with a matching ire_zoneid
21880Sstevel@tonic-gate 		 * - all IRE_OFFSUBNETs as they're shared across all zones
21890Sstevel@tonic-gate 		 * - IRE_INTERFACE IREs for interfaces with a usable source addr
21900Sstevel@tonic-gate 		 *   with a matching zone
21910Sstevel@tonic-gate 		 * - IRE_DEFAULTs with a gateway reachable from the zone
21920Sstevel@tonic-gate 		 * We should really match on IRE_OFFSUBNETs and IRE_DEFAULTs
21930Sstevel@tonic-gate 		 * using the same rule; but the above rules are consistent with
21940Sstevel@tonic-gate 		 * the behavior of ire_ftable_lookup[_v6]() so that all the
21950Sstevel@tonic-gate 		 * routes that can be matched during lookup are also matched
21960Sstevel@tonic-gate 		 * here.
21970Sstevel@tonic-gate 		 */
21981676Sjpk 		if (zoneid != ire->ire_zoneid && ire->ire_zoneid != ALL_ZONES) {
21990Sstevel@tonic-gate 			/*
22000Sstevel@tonic-gate 			 * Note, IRE_INTERFACE can have the stq as NULL. For
22010Sstevel@tonic-gate 			 * example, if the default multicast route is tied to
22020Sstevel@tonic-gate 			 * the loopback address.
22030Sstevel@tonic-gate 			 */
22040Sstevel@tonic-gate 			if ((ire->ire_type & IRE_INTERFACE) &&
22050Sstevel@tonic-gate 			    (ire->ire_stq != NULL)) {
22060Sstevel@tonic-gate 				ire_stq_ill = (ill_t *)ire->ire_stq->q_ptr;
22070Sstevel@tonic-gate 				if (ire->ire_ipversion == IPV4_VERSION) {
22080Sstevel@tonic-gate 					if (!ipif_usesrc_avail(ire_stq_ill,
22090Sstevel@tonic-gate 					    zoneid))
22100Sstevel@tonic-gate 						/* No usable src addr in zone */
22110Sstevel@tonic-gate 						return (B_FALSE);
22120Sstevel@tonic-gate 				} else if (ire_stq_ill->ill_usesrc_ifindex
22130Sstevel@tonic-gate 				    != 0) {
22140Sstevel@tonic-gate 					/*
22150Sstevel@tonic-gate 					 * For IPv6 use ipif_select_source_v6()
22160Sstevel@tonic-gate 					 * so the right scope selection is done
22170Sstevel@tonic-gate 					 */
22180Sstevel@tonic-gate 					ipif_t *src_ipif;
22190Sstevel@tonic-gate 					src_ipif =
22200Sstevel@tonic-gate 					    ipif_select_source_v6(ire_stq_ill,
22212202Srk129064 					    &ire->ire_addr_v6, RESTRICT_TO_NONE,
22220Sstevel@tonic-gate 					    IPV6_PREFER_SRC_DEFAULT,
22230Sstevel@tonic-gate 					    zoneid);
22240Sstevel@tonic-gate 					if (src_ipif != NULL) {
22250Sstevel@tonic-gate 						ipif_refrele(src_ipif);
22260Sstevel@tonic-gate 					} else {
22270Sstevel@tonic-gate 						return (B_FALSE);
22280Sstevel@tonic-gate 					}
22290Sstevel@tonic-gate 				} else {
22300Sstevel@tonic-gate 					return (B_FALSE);
22310Sstevel@tonic-gate 				}
22320Sstevel@tonic-gate 
22330Sstevel@tonic-gate 			} else if (!(ire->ire_type & IRE_OFFSUBNET)) {
22340Sstevel@tonic-gate 				return (B_FALSE);
22350Sstevel@tonic-gate 			}
22360Sstevel@tonic-gate 		}
22370Sstevel@tonic-gate 
22380Sstevel@tonic-gate 		/*
22390Sstevel@tonic-gate 		 * Match all default routes from the global zone, irrespective
22402733Snordmark 		 * of reachability. For a non-global zone only match those
22412733Snordmark 		 * where ire_gateway_addr has a IRE_INTERFACE for the zoneid.
22420Sstevel@tonic-gate 		 */
22430Sstevel@tonic-gate 		if (ire->ire_type == IRE_DEFAULT && zoneid != GLOBAL_ZONEID) {
22440Sstevel@tonic-gate 			int ire_match_flags = 0;
22450Sstevel@tonic-gate 			in6_addr_t gw_addr_v6;
22460Sstevel@tonic-gate 			ire_t *rire;
22470Sstevel@tonic-gate 
22482733Snordmark 			ire_match_flags |= MATCH_IRE_TYPE;
22490Sstevel@tonic-gate 			if (ire->ire_ipif != NULL) {
22500Sstevel@tonic-gate 				ire_match_flags |= MATCH_IRE_ILL_GROUP;
22510Sstevel@tonic-gate 			}
22520Sstevel@tonic-gate 			if (ire->ire_ipversion == IPV4_VERSION) {
22530Sstevel@tonic-gate 				rire = ire_route_lookup(ire->ire_gateway_addr,
22542733Snordmark 				    0, 0, IRE_INTERFACE, ire->ire_ipif, NULL,
22553448Sdh155122 				    zoneid, NULL, ire_match_flags, ipst);
22560Sstevel@tonic-gate 			} else {
22570Sstevel@tonic-gate 				ASSERT(ire->ire_ipversion == IPV6_VERSION);
22580Sstevel@tonic-gate 				mutex_enter(&ire->ire_lock);
22590Sstevel@tonic-gate 				gw_addr_v6 = ire->ire_gateway_addr_v6;
22600Sstevel@tonic-gate 				mutex_exit(&ire->ire_lock);
22610Sstevel@tonic-gate 				rire = ire_route_lookup_v6(&gw_addr_v6,
22622733Snordmark 				    NULL, NULL, IRE_INTERFACE, ire->ire_ipif,
22633448Sdh155122 				    NULL, zoneid, NULL, ire_match_flags, ipst);
22640Sstevel@tonic-gate 			}
22650Sstevel@tonic-gate 			if (rire == NULL) {
22660Sstevel@tonic-gate 				return (B_FALSE);
22670Sstevel@tonic-gate 			}
22680Sstevel@tonic-gate 			ire_refrele(rire);
22690Sstevel@tonic-gate 		}
22700Sstevel@tonic-gate 	}
22710Sstevel@tonic-gate 
22720Sstevel@tonic-gate 	if (((!(match_flags & MATCH_IRE_TYPE)) ||
22734714Ssowmini 	    (ire->ire_type & ire_type)) &&
22740Sstevel@tonic-gate 	    ((!(match_flags & MATCH_IRE_ILL)) ||
22754714Ssowmini 	    (ire_stq_ill == ill || ire_ipif_ill == ill)) &&
22760Sstevel@tonic-gate 	    ((!(match_flags & MATCH_IRE_ILL_GROUP)) ||
22774714Ssowmini 	    (ire_stq_ill == ill) || (ire_ipif_ill == ill) ||
22784714Ssowmini 	    (ire_ill_group != NULL &&
22794714Ssowmini 	    ire_ill_group == ill->ill_group))) {
22800Sstevel@tonic-gate 		return (B_TRUE);
22810Sstevel@tonic-gate 	}
22820Sstevel@tonic-gate 	return (B_FALSE);
22830Sstevel@tonic-gate }
22840Sstevel@tonic-gate 
22852535Ssangeeta int
22862535Ssangeeta rtfunc(struct radix_node *rn, void *arg)
22872535Ssangeeta {
22882535Ssangeeta 	struct rtfuncarg *rtf = arg;
22892535Ssangeeta 	struct rt_entry *rt;
22902535Ssangeeta 	irb_t *irb;
22912535Ssangeeta 	ire_t *ire;
22922535Ssangeeta 	boolean_t ret;
22932535Ssangeeta 
22942535Ssangeeta 	rt = (struct rt_entry *)rn;
22952535Ssangeeta 	ASSERT(rt != NULL);
22962535Ssangeeta 	irb = &rt->rt_irb;
22972535Ssangeeta 	for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
22982535Ssangeeta 		if ((rtf->rt_match_flags != 0) ||
22992535Ssangeeta 		    (rtf->rt_zoneid != ALL_ZONES)) {
23002535Ssangeeta 			ret = ire_walk_ill_match(rtf->rt_match_flags,
23012535Ssangeeta 			    rtf->rt_ire_type, ire,
23023448Sdh155122 			    rtf->rt_ill, rtf->rt_zoneid, rtf->rt_ipst);
23032535Ssangeeta 		} else
23042535Ssangeeta 			ret = B_TRUE;
23052535Ssangeeta 		if (ret)
23062535Ssangeeta 			(*rtf->rt_func)(ire, rtf->rt_arg);
23072535Ssangeeta 	}
23082535Ssangeeta 	return (0);
23092535Ssangeeta }
23102535Ssangeeta 
23110Sstevel@tonic-gate /*
23120Sstevel@tonic-gate  * Walk the ftable and the ctable entries that match the ill.
23130Sstevel@tonic-gate  */
23142535Ssangeeta void
23150Sstevel@tonic-gate ire_walk_ill_tables(uint_t match_flags, uint_t ire_type, pfv_t func,
23161676Sjpk     void *arg, size_t ftbl_sz, size_t htbl_sz, irb_t **ipftbl,
23173448Sdh155122     size_t ctbl_sz, irb_t *ipctbl, ill_t *ill, zoneid_t zoneid,
23183448Sdh155122     ip_stack_t *ipst)
23190Sstevel@tonic-gate {
23200Sstevel@tonic-gate 	irb_t	*irb_ptr;
23210Sstevel@tonic-gate 	irb_t	*irb;
23220Sstevel@tonic-gate 	ire_t	*ire;
23230Sstevel@tonic-gate 	int i, j;
23240Sstevel@tonic-gate 	boolean_t ret;
23252535Ssangeeta 	struct rtfuncarg rtfarg;
23260Sstevel@tonic-gate 
23274972Smeem 	ASSERT((!(match_flags & (MATCH_IRE_ILL |
23280Sstevel@tonic-gate 	    MATCH_IRE_ILL_GROUP))) || (ill != NULL));
23290Sstevel@tonic-gate 	ASSERT(!(match_flags & MATCH_IRE_TYPE) || (ire_type != 0));
23300Sstevel@tonic-gate 	/*
23310Sstevel@tonic-gate 	 * Optimize by not looking at the forwarding table if there
23320Sstevel@tonic-gate 	 * is a MATCH_IRE_TYPE specified with no IRE_FORWARDTABLE
23330Sstevel@tonic-gate 	 * specified in ire_type.
23340Sstevel@tonic-gate 	 */
23350Sstevel@tonic-gate 	if (!(match_flags & MATCH_IRE_TYPE) ||
23360Sstevel@tonic-gate 	    ((ire_type & IRE_FORWARDTABLE) != 0)) {
23372535Ssangeeta 		/* knobs such that routine is called only for v6 case */
23383448Sdh155122 		if (ipftbl == ipst->ips_ip_forwarding_table_v6) {
23392535Ssangeeta 			for (i = (ftbl_sz - 1);  i >= 0; i--) {
23402535Ssangeeta 				if ((irb_ptr = ipftbl[i]) == NULL)
23410Sstevel@tonic-gate 					continue;
23422535Ssangeeta 				for (j = 0; j < htbl_sz; j++) {
23432535Ssangeeta 					irb = &irb_ptr[j];
23442535Ssangeeta 					if (irb->irb_ire == NULL)
23452535Ssangeeta 						continue;
23462535Ssangeeta 
23472535Ssangeeta 					IRB_REFHOLD(irb);
23482535Ssangeeta 					for (ire = irb->irb_ire; ire != NULL;
23494714Ssowmini 					    ire = ire->ire_next) {
23502535Ssangeeta 						if (match_flags == 0 &&
23512535Ssangeeta 						    zoneid == ALL_ZONES) {
23522535Ssangeeta 							ret = B_TRUE;
23532535Ssangeeta 						} else {
23542535Ssangeeta 							ret =
23552535Ssangeeta 							    ire_walk_ill_match(
23562535Ssangeeta 							    match_flags,
23572535Ssangeeta 							    ire_type, ire, ill,
23583448Sdh155122 							    zoneid, ipst);
23592535Ssangeeta 						}
23602535Ssangeeta 						if (ret)
23612535Ssangeeta 							(*func)(ire, arg);
23620Sstevel@tonic-gate 					}
23632535Ssangeeta 					IRB_REFRELE(irb);
23640Sstevel@tonic-gate 				}
23650Sstevel@tonic-gate 			}
23662535Ssangeeta 		} else {
23672535Ssangeeta 			(void) memset(&rtfarg, 0, sizeof (rtfarg));
23682535Ssangeeta 			rtfarg.rt_func = func;
23692535Ssangeeta 			rtfarg.rt_arg = arg;
23702535Ssangeeta 			if (match_flags != 0) {
23712535Ssangeeta 				rtfarg.rt_match_flags = match_flags;
23722535Ssangeeta 			}
23732535Ssangeeta 			rtfarg.rt_ire_type = ire_type;
23742535Ssangeeta 			rtfarg.rt_ill = ill;
23752535Ssangeeta 			rtfarg.rt_zoneid = zoneid;
23763448Sdh155122 			rtfarg.rt_ipst = ipst;	/* No netstack_hold */
23773448Sdh155122 			(void) ipst->ips_ip_ftable->rnh_walktree_mt(
23783448Sdh155122 			    ipst->ips_ip_ftable,
23793448Sdh155122 			    rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn);
23800Sstevel@tonic-gate 		}
23810Sstevel@tonic-gate 	}
23820Sstevel@tonic-gate 
23830Sstevel@tonic-gate 	/*
23840Sstevel@tonic-gate 	 * Optimize by not looking at the cache table if there
23850Sstevel@tonic-gate 	 * is a MATCH_IRE_TYPE specified with no IRE_CACHETABLE
23860Sstevel@tonic-gate 	 * specified in ire_type.
23870Sstevel@tonic-gate 	 */
23880Sstevel@tonic-gate 	if (!(match_flags & MATCH_IRE_TYPE) ||
23890Sstevel@tonic-gate 	    ((ire_type & IRE_CACHETABLE) != 0)) {
23900Sstevel@tonic-gate 		for (i = 0; i < ctbl_sz;  i++) {
23910Sstevel@tonic-gate 			irb = &ipctbl[i];
23920Sstevel@tonic-gate 			if (irb->irb_ire == NULL)
23930Sstevel@tonic-gate 				continue;
23940Sstevel@tonic-gate 			IRB_REFHOLD(irb);
23950Sstevel@tonic-gate 			for (ire = irb->irb_ire; ire != NULL;
23960Sstevel@tonic-gate 			    ire = ire->ire_next) {
23970Sstevel@tonic-gate 				if (match_flags == 0 && zoneid == ALL_ZONES) {
23980Sstevel@tonic-gate 					ret = B_TRUE;
23990Sstevel@tonic-gate 				} else {
24000Sstevel@tonic-gate 					ret = ire_walk_ill_match(
24010Sstevel@tonic-gate 					    match_flags, ire_type,
24023448Sdh155122 					    ire, ill, zoneid, ipst);
24030Sstevel@tonic-gate 				}
24040Sstevel@tonic-gate 				if (ret)
24050Sstevel@tonic-gate 					(*func)(ire, arg);
24060Sstevel@tonic-gate 			}
24070Sstevel@tonic-gate 			IRB_REFRELE(irb);
24080Sstevel@tonic-gate 		}
24090Sstevel@tonic-gate 	}
24100Sstevel@tonic-gate }
24110Sstevel@tonic-gate 
24120Sstevel@tonic-gate /*
24130Sstevel@tonic-gate  * This function takes a mask and returns
24140Sstevel@tonic-gate  * number of bits set in the mask. If no
24150Sstevel@tonic-gate  * bit is set it returns 0.
24160Sstevel@tonic-gate  * Assumes a contiguous mask.
24170Sstevel@tonic-gate  */
24180Sstevel@tonic-gate int
24190Sstevel@tonic-gate ip_mask_to_plen(ipaddr_t mask)
24200Sstevel@tonic-gate {
24210Sstevel@tonic-gate 	return (mask == 0 ? 0 : IP_ABITS - (ffs(ntohl(mask)) -1));
24220Sstevel@tonic-gate }
24230Sstevel@tonic-gate 
24240Sstevel@tonic-gate /*
24250Sstevel@tonic-gate  * Convert length for a mask to the mask.
24260Sstevel@tonic-gate  */
24270Sstevel@tonic-gate ipaddr_t
24280Sstevel@tonic-gate ip_plen_to_mask(uint_t masklen)
24290Sstevel@tonic-gate {
24300Sstevel@tonic-gate 	return (htonl(IP_HOST_MASK << (IP_ABITS - masklen)));
24310Sstevel@tonic-gate }
24320Sstevel@tonic-gate 
24330Sstevel@tonic-gate void
24340Sstevel@tonic-gate ire_atomic_end(irb_t *irb_ptr, ire_t *ire)
24350Sstevel@tonic-gate {
24360Sstevel@tonic-gate 	ill_t	*ill_list[NUM_ILLS];
24373448Sdh155122 	ip_stack_t	*ipst = ire->ire_ipst;
24380Sstevel@tonic-gate 
24390Sstevel@tonic-gate 	ill_list[0] = ire->ire_stq != NULL ? ire->ire_stq->q_ptr : NULL;
24400Sstevel@tonic-gate 	ill_list[1] = ire->ire_ipif != NULL ? ire->ire_ipif->ipif_ill : NULL;
24410Sstevel@tonic-gate 	ill_unlock_ills(ill_list, NUM_ILLS);
24420Sstevel@tonic-gate 	rw_exit(&irb_ptr->irb_lock);
24433448Sdh155122 	rw_exit(&ipst->ips_ill_g_usesrc_lock);
24440Sstevel@tonic-gate }
24450Sstevel@tonic-gate 
24460Sstevel@tonic-gate /*
24470Sstevel@tonic-gate  * ire_add_v[46] atomically make sure that the ipif or ill associated
24480Sstevel@tonic-gate  * with the new ire being added is stable and not IPIF_CHANGING or ILL_CHANGING
24490Sstevel@tonic-gate  * before adding the ire to the table. This ensures that we don't create
24500Sstevel@tonic-gate  * new IRE_CACHEs with stale values for parameters that are passed to
24510Sstevel@tonic-gate  * ire_create such as ire_max_frag. Note that ire_create() is passed a pointer
24520Sstevel@tonic-gate  * to the ipif_mtu, and not the value. The actual value is derived from the
24530Sstevel@tonic-gate  * parent ire or ipif under the bucket lock.
24540Sstevel@tonic-gate  */
24550Sstevel@tonic-gate int
24560Sstevel@tonic-gate ire_atomic_start(irb_t *irb_ptr, ire_t *ire, queue_t *q, mblk_t *mp,
24570Sstevel@tonic-gate     ipsq_func_t func)
24580Sstevel@tonic-gate {
24590Sstevel@tonic-gate 	ill_t	*stq_ill;
24600Sstevel@tonic-gate 	ill_t	*ipif_ill;
24610Sstevel@tonic-gate 	ill_t	*ill_list[NUM_ILLS];
24620Sstevel@tonic-gate 	int	cnt = NUM_ILLS;
24630Sstevel@tonic-gate 	int	error = 0;
24640Sstevel@tonic-gate 	ill_t	*ill = NULL;
24653448Sdh155122 	ip_stack_t	*ipst = ire->ire_ipst;
24660Sstevel@tonic-gate 
24670Sstevel@tonic-gate 	ill_list[0] = stq_ill = ire->ire_stq !=
24684714Ssowmini 	    NULL ? ire->ire_stq->q_ptr : NULL;
24690Sstevel@tonic-gate 	ill_list[1] = ipif_ill = ire->ire_ipif !=
24704714Ssowmini 	    NULL ? ire->ire_ipif->ipif_ill : NULL;
24710Sstevel@tonic-gate 
24720Sstevel@tonic-gate 	ASSERT((q != NULL && mp != NULL && func != NULL) ||
24730Sstevel@tonic-gate 	    (q == NULL && mp == NULL && func == NULL));
24743448Sdh155122 	rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER);
24750Sstevel@tonic-gate 	GRAB_CONN_LOCK(q);
24760Sstevel@tonic-gate 	rw_enter(&irb_ptr->irb_lock, RW_WRITER);
24770Sstevel@tonic-gate 	ill_lock_ills(ill_list, cnt);
24780Sstevel@tonic-gate 
24790Sstevel@tonic-gate 	/*
24800Sstevel@tonic-gate 	 * While the IRE is in the process of being added, a user may have
24810Sstevel@tonic-gate 	 * invoked the ifconfig usesrc option on the stq_ill to make it a
24820Sstevel@tonic-gate 	 * usesrc client ILL. Check for this possibility here, if it is true
24830Sstevel@tonic-gate 	 * then we fail adding the IRE_CACHE. Another check is to make sure
24840Sstevel@tonic-gate 	 * that an ipif_ill of an IRE_CACHE being added is not part of a usesrc
24850Sstevel@tonic-gate 	 * group. The ill_g_usesrc_lock is released in ire_atomic_end
24860Sstevel@tonic-gate 	 */
24870Sstevel@tonic-gate 	if ((ire->ire_type & IRE_CACHE) &&
24880Sstevel@tonic-gate 	    (ire->ire_marks & IRE_MARK_USESRC_CHECK)) {
24890Sstevel@tonic-gate 		if (stq_ill->ill_usesrc_ifindex != 0) {
24900Sstevel@tonic-gate 			ASSERT(stq_ill->ill_usesrc_grp_next != NULL);
24910Sstevel@tonic-gate 			if ((ipif_ill->ill_phyint->phyint_ifindex !=
24920Sstevel@tonic-gate 			    stq_ill->ill_usesrc_ifindex) ||
24930Sstevel@tonic-gate 			    (ipif_ill->ill_usesrc_grp_next == NULL) ||
24940Sstevel@tonic-gate 			    (ipif_ill->ill_usesrc_ifindex != 0)) {
24950Sstevel@tonic-gate 				error = EINVAL;
24960Sstevel@tonic-gate 				goto done;
24970Sstevel@tonic-gate 			}
24980Sstevel@tonic-gate 		} else if (ipif_ill->ill_usesrc_grp_next != NULL) {
24990Sstevel@tonic-gate 			error = EINVAL;
25000Sstevel@tonic-gate 			goto done;
25010Sstevel@tonic-gate 		}
25020Sstevel@tonic-gate 	}
25030Sstevel@tonic-gate 
25040Sstevel@tonic-gate 	/*
25050Sstevel@tonic-gate 	 * IPMP flag settings happen without taking the exclusive route
25060Sstevel@tonic-gate 	 * in ip_sioctl_flags. So we need to make an atomic check here
25070Sstevel@tonic-gate 	 * for FAILED/OFFLINE/INACTIVE flags or if it has hit the
25080Sstevel@tonic-gate 	 * FAILBACK=no case.
25090Sstevel@tonic-gate 	 */
25100Sstevel@tonic-gate 	if ((stq_ill != NULL) && !IAM_WRITER_ILL(stq_ill)) {
25110Sstevel@tonic-gate 		if (stq_ill->ill_state_flags & ILL_CHANGING) {
25120Sstevel@tonic-gate 			ill = stq_ill;
25130Sstevel@tonic-gate 			error = EAGAIN;
25140Sstevel@tonic-gate 		} else if ((stq_ill->ill_phyint->phyint_flags & PHYI_OFFLINE) ||
25150Sstevel@tonic-gate 		    (ill_is_probeonly(stq_ill) &&
25160Sstevel@tonic-gate 		    !(ire->ire_marks & IRE_MARK_HIDDEN))) {
25170Sstevel@tonic-gate 			error = EINVAL;
25180Sstevel@tonic-gate 		}
25190Sstevel@tonic-gate 		goto done;
25200Sstevel@tonic-gate 	}
25210Sstevel@tonic-gate 
25220Sstevel@tonic-gate 	/*
25230Sstevel@tonic-gate 	 * We don't check for OFFLINE/FAILED in this case because
25240Sstevel@tonic-gate 	 * the source address selection logic (ipif_select_source)
25250Sstevel@tonic-gate 	 * may still select a source address from such an ill. The
25260Sstevel@tonic-gate 	 * assumption is that these addresses will be moved by in.mpathd
25270Sstevel@tonic-gate 	 * soon. (i.e. this is a race). However link local addresses
25280Sstevel@tonic-gate 	 * will not move and hence ipif_select_source_v6 tries to avoid
25290Sstevel@tonic-gate 	 * FAILED ills. Please see ipif_select_source_v6 for more info
25300Sstevel@tonic-gate 	 */
25310Sstevel@tonic-gate 	if ((ipif_ill != NULL) && !IAM_WRITER_ILL(ipif_ill) &&
25320Sstevel@tonic-gate 	    (ipif_ill->ill_state_flags & ILL_CHANGING)) {
25330Sstevel@tonic-gate 		ill = ipif_ill;
25340Sstevel@tonic-gate 		error = EAGAIN;
25350Sstevel@tonic-gate 		goto done;
25360Sstevel@tonic-gate 	}
25370Sstevel@tonic-gate 
25380Sstevel@tonic-gate 	if ((ire->ire_ipif != NULL) && !IAM_WRITER_IPIF(ire->ire_ipif) &&
25390Sstevel@tonic-gate 	    (ire->ire_ipif->ipif_state_flags & IPIF_CHANGING)) {
25400Sstevel@tonic-gate 		ill = ire->ire_ipif->ipif_ill;
25410Sstevel@tonic-gate 		ASSERT(ill != NULL);
25420Sstevel@tonic-gate 		error = EAGAIN;
25430Sstevel@tonic-gate 		goto done;
25440Sstevel@tonic-gate 	}
25450Sstevel@tonic-gate 
25460Sstevel@tonic-gate done:
25470Sstevel@tonic-gate 	if (error == EAGAIN && ILL_CAN_WAIT(ill, q)) {
25480Sstevel@tonic-gate 		ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
25490Sstevel@tonic-gate 		mutex_enter(&ipsq->ipsq_lock);
25500Sstevel@tonic-gate 		ire_atomic_end(irb_ptr, ire);
25510Sstevel@tonic-gate 		ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
25520Sstevel@tonic-gate 		mutex_exit(&ipsq->ipsq_lock);
25530Sstevel@tonic-gate 		error = EINPROGRESS;
25540Sstevel@tonic-gate 	} else if (error != 0) {
25550Sstevel@tonic-gate 		ire_atomic_end(irb_ptr, ire);
25560Sstevel@tonic-gate 	}
25570Sstevel@tonic-gate 
25580Sstevel@tonic-gate 	RELEASE_CONN_LOCK(q);
25590Sstevel@tonic-gate 	return (error);
25600Sstevel@tonic-gate }
25610Sstevel@tonic-gate 
25620Sstevel@tonic-gate /*
25630Sstevel@tonic-gate  * Add a fully initialized IRE to an appropriate table based on
25640Sstevel@tonic-gate  * ire_type.
25652535Ssangeeta  *
25662535Ssangeeta  * allow_unresolved == B_FALSE indicates a legacy code-path call
25672535Ssangeeta  * that has prohibited the addition of incomplete ire's. If this
25682535Ssangeeta  * parameter is set, and we find an nce that is in a state other
25692535Ssangeeta  * than ND_REACHABLE, we fail the add. Note that nce_state could be
25704084Ssowmini  * something other than ND_REACHABLE if the nce had just expired and
25714084Ssowmini  * the ire_create preceding the ire_add added a new ND_INITIAL nce.
25720Sstevel@tonic-gate  */
25730Sstevel@tonic-gate int
25742535Ssangeeta ire_add(ire_t **irep, queue_t *q, mblk_t *mp, ipsq_func_t func,
25752535Ssangeeta     boolean_t allow_unresolved)
25760Sstevel@tonic-gate {
25770Sstevel@tonic-gate 	ire_t	*ire1;
25780Sstevel@tonic-gate 	ill_t	*stq_ill = NULL;
25790Sstevel@tonic-gate 	ill_t	*ill;
25800Sstevel@tonic-gate 	ipif_t	*ipif = NULL;
25810Sstevel@tonic-gate 	ill_walk_context_t ctx;
25820Sstevel@tonic-gate 	ire_t	*ire = *irep;
25830Sstevel@tonic-gate 	int	error;
25842416Sjarrett 	boolean_t ire_is_mblk = B_FALSE;
25852416Sjarrett 	tsol_gcgrp_t *gcgrp = NULL;
25862416Sjarrett 	tsol_gcgrp_addr_t ga;
25873448Sdh155122 	ip_stack_t	*ipst = ire->ire_ipst;
25880Sstevel@tonic-gate 
25890Sstevel@tonic-gate 	/* get ready for the day when original ire is not created as mblk */
25900Sstevel@tonic-gate 	if (ire->ire_mp != NULL) {
25912416Sjarrett 		ire_is_mblk = B_TRUE;
25920Sstevel@tonic-gate 		/* Copy the ire to a kmem_alloc'ed area */
25930Sstevel@tonic-gate 		ire1 = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
25940Sstevel@tonic-gate 		if (ire1 == NULL) {
25950Sstevel@tonic-gate 			ip1dbg(("ire_add: alloc failed\n"));
25960Sstevel@tonic-gate 			ire_delete(ire);
25970Sstevel@tonic-gate 			*irep = NULL;
25980Sstevel@tonic-gate 			return (ENOMEM);
25990Sstevel@tonic-gate 		}
26002535Ssangeeta 		ire->ire_marks &= ~IRE_MARK_UNCACHED;
26010Sstevel@tonic-gate 		*ire1 = *ire;
26020Sstevel@tonic-gate 		ire1->ire_mp = NULL;
26032535Ssangeeta 		ire1->ire_stq_ifindex = 0;
26040Sstevel@tonic-gate 		freeb(ire->ire_mp);
26050Sstevel@tonic-gate 		ire = ire1;
26060Sstevel@tonic-gate 	}
26070Sstevel@tonic-gate 	if (ire->ire_stq != NULL)
26080Sstevel@tonic-gate 		stq_ill = (ill_t *)ire->ire_stq->q_ptr;
26090Sstevel@tonic-gate 
26100Sstevel@tonic-gate 	if (ire->ire_type == IRE_CACHE) {
26110Sstevel@tonic-gate 		/*
26120Sstevel@tonic-gate 		 * If this interface is FAILED, or INACTIVE or has hit
26130Sstevel@tonic-gate 		 * the FAILBACK=no case, we create IRE_CACHES marked
26140Sstevel@tonic-gate 		 * HIDDEN for some special cases e.g. bind to
26150Sstevel@tonic-gate 		 * IPIF_NOFAILOVER address etc. So, if this interface
26160Sstevel@tonic-gate 		 * is FAILED/INACTIVE/hit FAILBACK=no case, and we are
26170Sstevel@tonic-gate 		 * not creating hidden ires, we should not allow that.
26180Sstevel@tonic-gate 		 * This happens because the state of the interface
26190Sstevel@tonic-gate 		 * changed while we were waiting in ARP. If this is the
26200Sstevel@tonic-gate 		 * daemon sending probes, the next probe will create
26210Sstevel@tonic-gate 		 * HIDDEN ires and we will create an ire then. This
26220Sstevel@tonic-gate 		 * cannot happen with NDP currently because IRE is
26230Sstevel@tonic-gate 		 * never queued in NDP. But it can happen in the
26240Sstevel@tonic-gate 		 * future when we have external resolvers with IPv6.
26250Sstevel@tonic-gate 		 * If the interface gets marked with OFFLINE while we
26260Sstevel@tonic-gate 		 * are waiting in ARP, don't add the ire.
26270Sstevel@tonic-gate 		 */
26280Sstevel@tonic-gate 		if ((stq_ill->ill_phyint->phyint_flags & PHYI_OFFLINE) ||
26290Sstevel@tonic-gate 		    (ill_is_probeonly(stq_ill) &&
26300Sstevel@tonic-gate 		    !(ire->ire_marks & IRE_MARK_HIDDEN))) {
26310Sstevel@tonic-gate 			/*
26320Sstevel@tonic-gate 			 * We don't know whether it is a valid ipif or not.
26330Sstevel@tonic-gate 			 * unless we do the check below. So, set it to NULL.
26340Sstevel@tonic-gate 			 */
26350Sstevel@tonic-gate 			ire->ire_ipif = NULL;
26360Sstevel@tonic-gate 			ire_delete(ire);
26370Sstevel@tonic-gate 			*irep = NULL;
26380Sstevel@tonic-gate 			return (EINVAL);
26390Sstevel@tonic-gate 		}
26400Sstevel@tonic-gate 	}
26410Sstevel@tonic-gate 
26420Sstevel@tonic-gate 	if (stq_ill != NULL && ire->ire_type == IRE_CACHE &&
26430Sstevel@tonic-gate 	    stq_ill->ill_net_type == IRE_IF_RESOLVER) {
26443448Sdh155122 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
26453448Sdh155122 		ill = ILL_START_WALK_ALL(&ctx, ipst);
26460Sstevel@tonic-gate 		for (; ill != NULL; ill = ill_next(&ctx, ill)) {
26470Sstevel@tonic-gate 			mutex_enter(&ill->ill_lock);
26480Sstevel@tonic-gate 			if (ill->ill_state_flags & ILL_CONDEMNED) {
26490Sstevel@tonic-gate 				mutex_exit(&ill->ill_lock);
26500Sstevel@tonic-gate 				continue;
26510Sstevel@tonic-gate 			}
26520Sstevel@tonic-gate 			/*
26530Sstevel@tonic-gate 			 * We need to make sure that the ipif is a valid one
26540Sstevel@tonic-gate 			 * before adding the IRE_CACHE. This happens only
26550Sstevel@tonic-gate 			 * with IRE_CACHE when there is an external resolver.
26560Sstevel@tonic-gate 			 *
26570Sstevel@tonic-gate 			 * We can unplumb a logical interface while the
26580Sstevel@tonic-gate 			 * packet is waiting in ARP with the IRE. Then,
26590Sstevel@tonic-gate 			 * later on when we feed the IRE back, the ipif
26600Sstevel@tonic-gate 			 * has to be re-checked. This can't happen with
26610Sstevel@tonic-gate 			 * NDP currently, as we never queue the IRE with
26620Sstevel@tonic-gate 			 * the packet. We always try to recreate the IRE
26630Sstevel@tonic-gate 			 * when the resolution is completed. But, we do
26640Sstevel@tonic-gate 			 * it for IPv6 also here so that in future if
26650Sstevel@tonic-gate 			 * we have external resolvers, it will work without
26660Sstevel@tonic-gate 			 * any change.
26670Sstevel@tonic-gate 			 */
26680Sstevel@tonic-gate 			ipif = ipif_lookup_seqid(ill, ire->ire_ipif_seqid);
26690Sstevel@tonic-gate 			if (ipif != NULL) {
26700Sstevel@tonic-gate 				ipif_refhold_locked(ipif);
26710Sstevel@tonic-gate 				mutex_exit(&ill->ill_lock);
26720Sstevel@tonic-gate 				break;
26730Sstevel@tonic-gate 			}
26740Sstevel@tonic-gate 			mutex_exit(&ill->ill_lock);
26750Sstevel@tonic-gate 		}
26763448Sdh155122 		rw_exit(&ipst->ips_ill_g_lock);
26770Sstevel@tonic-gate 		if (ipif == NULL ||
26780Sstevel@tonic-gate 		    (ipif->ipif_isv6 &&
26790Sstevel@tonic-gate 		    !IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6,
26800Sstevel@tonic-gate 		    &ipif->ipif_v6src_addr)) ||
26810Sstevel@tonic-gate 		    (!ipif->ipif_isv6 &&
26820Sstevel@tonic-gate 		    ire->ire_src_addr != ipif->ipif_src_addr) ||
26831676Sjpk 		    ire->ire_zoneid != ipif->ipif_zoneid) {
26840Sstevel@tonic-gate 
26850Sstevel@tonic-gate 			if (ipif != NULL)
26860Sstevel@tonic-gate 				ipif_refrele(ipif);
26870Sstevel@tonic-gate 			ire->ire_ipif = NULL;
26880Sstevel@tonic-gate 			ire_delete(ire);
26890Sstevel@tonic-gate 			*irep = NULL;
26900Sstevel@tonic-gate 			return (EINVAL);
26910Sstevel@tonic-gate 		}
26920Sstevel@tonic-gate 
26930Sstevel@tonic-gate 
26940Sstevel@tonic-gate 		ASSERT(ill != NULL);
26950Sstevel@tonic-gate 		/*
26960Sstevel@tonic-gate 		 * If this group was dismantled while this packets was
26970Sstevel@tonic-gate 		 * queued in ARP, don't add it here.
26980Sstevel@tonic-gate 		 */
26990Sstevel@tonic-gate 		if (ire->ire_ipif->ipif_ill->ill_group != ill->ill_group) {
27000Sstevel@tonic-gate 			/* We don't want ire_inactive bump stats for this */
27010Sstevel@tonic-gate 			ipif_refrele(ipif);
27020Sstevel@tonic-gate 			ire->ire_ipif = NULL;
27030Sstevel@tonic-gate 			ire_delete(ire);
27040Sstevel@tonic-gate 			*irep = NULL;
27050Sstevel@tonic-gate 			return (EINVAL);
27060Sstevel@tonic-gate 		}
27072416Sjarrett 
27082416Sjarrett 		/*
27092416Sjarrett 		 * Since we didn't attach label security attributes to the
27102416Sjarrett 		 * ire for the resolver case, we need to add it now. (only
27112416Sjarrett 		 * for v4 resolver and v6 xresolv case).
27122416Sjarrett 		 */
27132416Sjarrett 		if (is_system_labeled() && ire_is_mblk) {
27142416Sjarrett 			if (ire->ire_ipversion == IPV4_VERSION) {
27152416Sjarrett 				ga.ga_af = AF_INET;
27162416Sjarrett 				IN6_IPADDR_TO_V4MAPPED(ire->ire_gateway_addr !=
27172416Sjarrett 				    INADDR_ANY ? ire->ire_gateway_addr :
27182416Sjarrett 				    ire->ire_addr, &ga.ga_addr);
27192416Sjarrett 			} else {
27202416Sjarrett 				ga.ga_af = AF_INET6;
27212416Sjarrett 				ga.ga_addr = IN6_IS_ADDR_UNSPECIFIED(
27222416Sjarrett 				    &ire->ire_gateway_addr_v6) ?
27232416Sjarrett 				    ire->ire_addr_v6 :
27242416Sjarrett 				    ire->ire_gateway_addr_v6;
27252416Sjarrett 			}
27262416Sjarrett 			gcgrp = gcgrp_lookup(&ga, B_FALSE);
27272416Sjarrett 			error = tsol_ire_init_gwattr(ire, ire->ire_ipversion,
27282416Sjarrett 			    NULL, gcgrp);
27292416Sjarrett 			if (error != 0) {
27302416Sjarrett 				if (gcgrp != NULL) {
27312416Sjarrett 					GCGRP_REFRELE(gcgrp);
27322416Sjarrett 					gcgrp = NULL;
27332416Sjarrett 				}
27342416Sjarrett 				ipif_refrele(ipif);
27352416Sjarrett 				ire->ire_ipif = NULL;
27362416Sjarrett 				ire_delete(ire);
27372416Sjarrett 				*irep = NULL;
27382416Sjarrett 				return (error);
27392416Sjarrett 			}
27402416Sjarrett 		}
27410Sstevel@tonic-gate 	}
27420Sstevel@tonic-gate 
27430Sstevel@tonic-gate 	/*
27440Sstevel@tonic-gate 	 * In case ire was changed
27450Sstevel@tonic-gate 	 */
27460Sstevel@tonic-gate 	*irep = ire;
27474823Sseb 	if (ire->ire_ipversion == IPV6_VERSION)
27480Sstevel@tonic-gate 		error = ire_add_v6(irep, q, mp, func);
27494823Sseb 	else
27504823Sseb 		error = ire_add_v4(irep, q, mp, func, allow_unresolved);
27510Sstevel@tonic-gate 	if (ipif != NULL)
27520Sstevel@tonic-gate 		ipif_refrele(ipif);
27530Sstevel@tonic-gate 	return (error);
27540Sstevel@tonic-gate }
27550Sstevel@tonic-gate 
27560Sstevel@tonic-gate /*
27572416Sjarrett  * Add an initialized IRE to an appropriate table based on ire_type.
27580Sstevel@tonic-gate  *
27593004Sdd193516  * The forward table contains IRE_PREFIX/IRE_HOST and
27600Sstevel@tonic-gate  * IRE_IF_RESOLVER/IRE_IF_NORESOLVER and IRE_DEFAULT.
27610Sstevel@tonic-gate  *
27620Sstevel@tonic-gate  * The cache table contains IRE_BROADCAST/IRE_LOCAL/IRE_LOOPBACK
27630Sstevel@tonic-gate  * and IRE_CACHE.
27640Sstevel@tonic-gate  *
27650Sstevel@tonic-gate  * NOTE : This function is called as writer though not required
27660Sstevel@tonic-gate  * by this function.
27670Sstevel@tonic-gate  */
27680Sstevel@tonic-gate static int
27692535Ssangeeta ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func,
27702535Ssangeeta     boolean_t allow_unresolved)
27710Sstevel@tonic-gate {
27720Sstevel@tonic-gate 	ire_t	*ire1;
27730Sstevel@tonic-gate 	irb_t	*irb_ptr;
27740Sstevel@tonic-gate 	ire_t	**irep;
27750Sstevel@tonic-gate 	int	flags;
27760Sstevel@tonic-gate 	ire_t	*pire = NULL;
27770Sstevel@tonic-gate 	ill_t	*stq_ill;
27780Sstevel@tonic-gate 	ire_t	*ire = *ire_p;
27790Sstevel@tonic-gate 	int	error;
27802535Ssangeeta 	boolean_t need_refrele = B_FALSE;
27812535Ssangeeta 	nce_t	*nce;
27823448Sdh155122 	ip_stack_t	*ipst = ire->ire_ipst;
27830Sstevel@tonic-gate 
27840Sstevel@tonic-gate 	if (ire->ire_ipif != NULL)
27850Sstevel@tonic-gate 		ASSERT(!MUTEX_HELD(&ire->ire_ipif->ipif_ill->ill_lock));
27860Sstevel@tonic-gate 	if (ire->ire_stq != NULL)
27870Sstevel@tonic-gate 		ASSERT(!MUTEX_HELD(
27880Sstevel@tonic-gate 		    &((ill_t *)(ire->ire_stq->q_ptr))->ill_lock));
27890Sstevel@tonic-gate 	ASSERT(ire->ire_ipversion == IPV4_VERSION);
27900Sstevel@tonic-gate 	ASSERT(ire->ire_mp == NULL); /* Calls should go through ire_add */
27910Sstevel@tonic-gate 
27920Sstevel@tonic-gate 	/* Find the appropriate list head. */
27930Sstevel@tonic-gate 	switch (ire->ire_type) {
27940Sstevel@tonic-gate 	case IRE_HOST:
27950Sstevel@tonic-gate 		ire->ire_mask = IP_HOST_MASK;
27960Sstevel@tonic-gate 		ire->ire_masklen = IP_ABITS;
27970Sstevel@tonic-gate 		if ((ire->ire_flags & RTF_SETSRC) == 0)
27980Sstevel@tonic-gate 			ire->ire_src_addr = 0;
27990Sstevel@tonic-gate 		break;
28000Sstevel@tonic-gate 	case IRE_CACHE:
28010Sstevel@tonic-gate 	case IRE_BROADCAST:
28020Sstevel@tonic-gate 	case IRE_LOCAL:
28030Sstevel@tonic-gate 	case IRE_LOOPBACK:
28040Sstevel@tonic-gate 		ire->ire_mask = IP_HOST_MASK;
28050Sstevel@tonic-gate 		ire->ire_masklen = IP_ABITS;
28060Sstevel@tonic-gate 		break;
28070Sstevel@tonic-gate 	case IRE_PREFIX:
28080Sstevel@tonic-gate 		if ((ire->ire_flags & RTF_SETSRC) == 0)
28090Sstevel@tonic-gate 			ire->ire_src_addr = 0;
28100Sstevel@tonic-gate 		break;
28110Sstevel@tonic-gate 	case IRE_DEFAULT:
28120Sstevel@tonic-gate 		if ((ire->ire_flags & RTF_SETSRC) == 0)
28130Sstevel@tonic-gate 			ire->ire_src_addr = 0;
28140Sstevel@tonic-gate 		break;
28150Sstevel@tonic-gate 	case IRE_IF_RESOLVER:
28160Sstevel@tonic-gate 	case IRE_IF_NORESOLVER:
28170Sstevel@tonic-gate 		break;
28180Sstevel@tonic-gate 	default:
28192535Ssangeeta 		ip0dbg(("ire_add_v4: ire %p has unrecognized IRE type (%d)\n",
28202535Ssangeeta 		    (void *)ire, ire->ire_type));
28210Sstevel@tonic-gate 		ire_delete(ire);
28220Sstevel@tonic-gate 		*ire_p = NULL;
28230Sstevel@tonic-gate 		return (EINVAL);
28240Sstevel@tonic-gate 	}
28250Sstevel@tonic-gate 
28260Sstevel@tonic-gate 	/* Make sure the address is properly masked. */
28270Sstevel@tonic-gate 	ire->ire_addr &= ire->ire_mask;
28280Sstevel@tonic-gate 
28290Sstevel@tonic-gate 	/*
28300Sstevel@tonic-gate 	 * ip_newroute/ip_newroute_multi are unable to prevent the deletion
28310Sstevel@tonic-gate 	 * of the interface route while adding an IRE_CACHE for an on-link
28320Sstevel@tonic-gate 	 * destination in the IRE_IF_RESOLVER case, since the ire has to
28330Sstevel@tonic-gate 	 * go to ARP and return. We can't do a REFHOLD on the
28340Sstevel@tonic-gate 	 * associated interface ire for fear of ARP freeing the message.
28350Sstevel@tonic-gate 	 * Here we look up the interface ire in the forwarding table and
28360Sstevel@tonic-gate 	 * make sure that the interface route has not been deleted.
28370Sstevel@tonic-gate 	 */
28380Sstevel@tonic-gate 	if (ire->ire_type == IRE_CACHE && ire->ire_gateway_addr == 0 &&
28390Sstevel@tonic-gate 	    ((ill_t *)ire->ire_stq->q_ptr)->ill_net_type == IRE_IF_RESOLVER) {
28402535Ssangeeta 
28410Sstevel@tonic-gate 		ASSERT(ire->ire_max_fragp == NULL);
28420Sstevel@tonic-gate 		if (CLASSD(ire->ire_addr) && !(ire->ire_flags & RTF_SETSRC)) {
28430Sstevel@tonic-gate 			/*
28440Sstevel@tonic-gate 			 * The ihandle that we used in ip_newroute_multi
28450Sstevel@tonic-gate 			 * comes from the interface route corresponding
28460Sstevel@tonic-gate 			 * to ire_ipif. Lookup here to see if it exists
28470Sstevel@tonic-gate 			 * still.
28480Sstevel@tonic-gate 			 * If the ire has a source address assigned using
28490Sstevel@tonic-gate 			 * RTF_SETSRC, ire_ipif is the logical interface holding
28500Sstevel@tonic-gate 			 * this source address, so we can't use it to check for
28510Sstevel@tonic-gate 			 * the existence of the interface route. Instead we rely
28520Sstevel@tonic-gate 			 * on the brute force ihandle search in
28530Sstevel@tonic-gate 			 * ire_ihandle_lookup_onlink() below.
28540Sstevel@tonic-gate 			 */
28550Sstevel@tonic-gate 			pire = ipif_to_ire(ire->ire_ipif);
28560Sstevel@tonic-gate 			if (pire == NULL) {
28570Sstevel@tonic-gate 				ire_delete(ire);
28580Sstevel@tonic-gate 				*ire_p = NULL;
28590Sstevel@tonic-gate 				return (EINVAL);
28600Sstevel@tonic-gate 			} else if (pire->ire_ihandle != ire->ire_ihandle) {
28610Sstevel@tonic-gate 				ire_refrele(pire);
28620Sstevel@tonic-gate 				ire_delete(ire);
28630Sstevel@tonic-gate 				*ire_p = NULL;
28640Sstevel@tonic-gate 				return (EINVAL);
28650Sstevel@tonic-gate 			}
28660Sstevel@tonic-gate 		} else {
28670Sstevel@tonic-gate 			pire = ire_ihandle_lookup_onlink(ire);
28680Sstevel@tonic-gate 			if (pire == NULL) {
28690Sstevel@tonic-gate 				ire_delete(ire);
28700Sstevel@tonic-gate 				*ire_p = NULL;
28710Sstevel@tonic-gate 				return (EINVAL);
28720Sstevel@tonic-gate 			}
28730Sstevel@tonic-gate 		}
28740Sstevel@tonic-gate 		/* Prevent pire from getting deleted */
28750Sstevel@tonic-gate 		IRB_REFHOLD(pire->ire_bucket);
28760Sstevel@tonic-gate 		/* Has it been removed already ? */
28770Sstevel@tonic-gate 		if (pire->ire_marks & IRE_MARK_CONDEMNED) {
28780Sstevel@tonic-gate 			IRB_REFRELE(pire->ire_bucket);
28790Sstevel@tonic-gate 			ire_refrele(pire);
28800Sstevel@tonic-gate 			ire_delete(ire);
28810Sstevel@tonic-gate 			*ire_p = NULL;
28820Sstevel@tonic-gate 			return (EINVAL);
28830Sstevel@tonic-gate 		}
28840Sstevel@tonic-gate 	} else {
28850Sstevel@tonic-gate 		ASSERT(ire->ire_max_fragp != NULL);
28860Sstevel@tonic-gate 	}
28870Sstevel@tonic-gate 	flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW);
28880Sstevel@tonic-gate 
28890Sstevel@tonic-gate 	if (ire->ire_ipif != NULL) {
28900Sstevel@tonic-gate 		/*
28910Sstevel@tonic-gate 		 * We use MATCH_IRE_IPIF while adding IRE_CACHES only
28920Sstevel@tonic-gate 		 * for historic reasons and to maintain symmetry with
28930Sstevel@tonic-gate 		 * IPv6 code path. Historically this was used by
28940Sstevel@tonic-gate 		 * multicast code to create multiple IRE_CACHES on
28950Sstevel@tonic-gate 		 * a single ill with different ipifs. This was used
28960Sstevel@tonic-gate 		 * so that multicast packets leaving the node had the
28970Sstevel@tonic-gate 		 * right source address. This is no longer needed as
28980Sstevel@tonic-gate 		 * ip_wput initializes the address correctly.
28990Sstevel@tonic-gate 		 */
29000Sstevel@tonic-gate 		flags |= MATCH_IRE_IPIF;
29010Sstevel@tonic-gate 		/*
29020Sstevel@tonic-gate 		 * If we are creating hidden ires, make sure we search on
29030Sstevel@tonic-gate 		 * this ill (MATCH_IRE_ILL) and a hidden ire,
29040Sstevel@tonic-gate 		 * while we are searching for duplicates below. Otherwise we
29050Sstevel@tonic-gate 		 * could potentially find an IRE on some other interface
29060Sstevel@tonic-gate 		 * and it may not be a IRE marked with IRE_MARK_HIDDEN. We
29070Sstevel@tonic-gate 		 * shouldn't do this as this will lead to an infinite loop
29080Sstevel@tonic-gate 		 * (if we get to ip_wput again) eventually we need an hidden
29090Sstevel@tonic-gate 		 * ire for this packet to go out. MATCH_IRE_ILL is explicitly
29100Sstevel@tonic-gate 		 * done below.
29110Sstevel@tonic-gate 		 */
29120Sstevel@tonic-gate 		if (ire->ire_type == IRE_CACHE &&
29130Sstevel@tonic-gate 		    (ire->ire_marks & IRE_MARK_HIDDEN))
29140Sstevel@tonic-gate 			flags |= (MATCH_IRE_MARK_HIDDEN);
29150Sstevel@tonic-gate 	}
29162535Ssangeeta 	if ((ire->ire_type & IRE_CACHETABLE) == 0) {
29172535Ssangeeta 		irb_ptr = ire_get_bucket(ire);
29182535Ssangeeta 		need_refrele = B_TRUE;
29192535Ssangeeta 		if (irb_ptr == NULL) {
29202535Ssangeeta 			/*
29212535Ssangeeta 			 * This assumes that the ire has not added
29222535Ssangeeta 			 * a reference to the ipif.
29232535Ssangeeta 			 */
29242535Ssangeeta 			ire->ire_ipif = NULL;
29252535Ssangeeta 			ire_delete(ire);
29262535Ssangeeta 			if (pire != NULL) {
29272535Ssangeeta 				IRB_REFRELE(pire->ire_bucket);
29282535Ssangeeta 				ire_refrele(pire);
29292535Ssangeeta 			}
29302535Ssangeeta 			*ire_p = NULL;
29312535Ssangeeta 			return (EINVAL);
29322535Ssangeeta 		}
29332535Ssangeeta 	} else {
29343448Sdh155122 		irb_ptr = &(ipst->ips_ip_cache_table[IRE_ADDR_HASH(
29353448Sdh155122 		    ire->ire_addr, ipst->ips_ip_cache_table_size)]);
29362535Ssangeeta 	}
29370Sstevel@tonic-gate 
29380Sstevel@tonic-gate 	/*
29390Sstevel@tonic-gate 	 * Start the atomic add of the ire. Grab the ill locks,
29400Sstevel@tonic-gate 	 * ill_g_usesrc_lock and the bucket lock. Check for condemned
29410Sstevel@tonic-gate 	 *
29420Sstevel@tonic-gate 	 * If ipif or ill is changing ire_atomic_start() may queue the
29430Sstevel@tonic-gate 	 * request and return EINPROGRESS.
29443448Sdh155122 	 * To avoid lock order problems, get the ndp4->ndp_g_lock.
29450Sstevel@tonic-gate 	 */
29463448Sdh155122 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
29470Sstevel@tonic-gate 	error = ire_atomic_start(irb_ptr, ire, q, mp, func);
29480Sstevel@tonic-gate 	if (error != 0) {
29493448Sdh155122 		mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
29500Sstevel@tonic-gate 		/*
29510Sstevel@tonic-gate 		 * We don't know whether it is a valid ipif or not.
29520Sstevel@tonic-gate 		 * So, set it to NULL. This assumes that the ire has not added
29530Sstevel@tonic-gate 		 * a reference to the ipif.
29540Sstevel@tonic-gate 		 */
29550Sstevel@tonic-gate 		ire->ire_ipif = NULL;
29560Sstevel@tonic-gate 		ire_delete(ire);
29570Sstevel@tonic-gate 		if (pire != NULL) {
29580Sstevel@tonic-gate 			IRB_REFRELE(pire->ire_bucket);
29590Sstevel@tonic-gate 			ire_refrele(pire);
29600Sstevel@tonic-gate 		}
29610Sstevel@tonic-gate 		*ire_p = NULL;
29622535Ssangeeta 		if (need_refrele)
29632535Ssangeeta 			IRB_REFRELE(irb_ptr);
29640Sstevel@tonic-gate 		return (error);
29650Sstevel@tonic-gate 	}
29660Sstevel@tonic-gate 	/*
29670Sstevel@tonic-gate 	 * To avoid creating ires having stale values for the ire_max_frag
29680Sstevel@tonic-gate 	 * we get the latest value atomically here. For more details
29690Sstevel@tonic-gate 	 * see the block comment in ip_sioctl_mtu and in DL_NOTE_SDU_CHANGE
29700Sstevel@tonic-gate 	 * in ip_rput_dlpi_writer
29710Sstevel@tonic-gate 	 */
29720Sstevel@tonic-gate 	if (ire->ire_max_fragp == NULL) {
29730Sstevel@tonic-gate 		if (CLASSD(ire->ire_addr))
29740Sstevel@tonic-gate 			ire->ire_max_frag = ire->ire_ipif->ipif_mtu;
29750Sstevel@tonic-gate 		else
29760Sstevel@tonic-gate 			ire->ire_max_frag = pire->ire_max_frag;
29770Sstevel@tonic-gate 	} else {
29780Sstevel@tonic-gate 		uint_t	max_frag;
29790Sstevel@tonic-gate 
29800Sstevel@tonic-gate 		max_frag = *ire->ire_max_fragp;
29810Sstevel@tonic-gate 		ire->ire_max_fragp = NULL;
29820Sstevel@tonic-gate 		ire->ire_max_frag = max_frag;
29830Sstevel@tonic-gate 	}
29840Sstevel@tonic-gate 	/*
29850Sstevel@tonic-gate 	 * Atomically check for duplicate and insert in the table.
29860Sstevel@tonic-gate 	 */
29870Sstevel@tonic-gate 	for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
29880Sstevel@tonic-gate 		if (ire1->ire_marks & IRE_MARK_CONDEMNED)
29890Sstevel@tonic-gate 			continue;
29900Sstevel@tonic-gate 		if (ire->ire_ipif != NULL) {
29910Sstevel@tonic-gate 			/*
29920Sstevel@tonic-gate 			 * We do MATCH_IRE_ILL implicitly here for IREs
29930Sstevel@tonic-gate 			 * with a non-null ire_ipif, including IRE_CACHEs.
29940Sstevel@tonic-gate 			 * As ire_ipif and ire_stq could point to two
29950Sstevel@tonic-gate 			 * different ills, we can't pass just ire_ipif to
29960Sstevel@tonic-gate 			 * ire_match_args and get a match on both ills.
29970Sstevel@tonic-gate 			 * This is just needed for duplicate checks here and
29980Sstevel@tonic-gate 			 * so we don't add an extra argument to
29990Sstevel@tonic-gate 			 * ire_match_args for this. Do it locally.
30000Sstevel@tonic-gate 			 *
30010Sstevel@tonic-gate 			 * NOTE : Currently there is no part of the code
30020Sstevel@tonic-gate 			 * that asks for both MATCH_IRE_IPIF and MATCH_IRE_ILL
30030Sstevel@tonic-gate 			 * match for IRE_CACHEs. Thus we don't want to
30040Sstevel@tonic-gate 			 * extend the arguments to ire_match_args.
30050Sstevel@tonic-gate 			 */
30060Sstevel@tonic-gate 			if (ire1->ire_stq != ire->ire_stq)
30070Sstevel@tonic-gate 				continue;
30080Sstevel@tonic-gate 			/*
30090Sstevel@tonic-gate 			 * Multiroute IRE_CACHEs for a given destination can
30100Sstevel@tonic-gate 			 * have the same ire_ipif, typically if their source
30110Sstevel@tonic-gate 			 * address is forced using RTF_SETSRC, and the same
30120Sstevel@tonic-gate 			 * send-to queue. We differentiate them using the parent
30130Sstevel@tonic-gate 			 * handle.
30140Sstevel@tonic-gate 			 */
30150Sstevel@tonic-gate 			if (ire->ire_type == IRE_CACHE &&
30160Sstevel@tonic-gate 			    (ire1->ire_flags & RTF_MULTIRT) &&
30170Sstevel@tonic-gate 			    (ire->ire_flags & RTF_MULTIRT) &&
30180Sstevel@tonic-gate 			    (ire1->ire_phandle != ire->ire_phandle))
30190Sstevel@tonic-gate 				continue;
30200Sstevel@tonic-gate 		}
30210Sstevel@tonic-gate 		if (ire1->ire_zoneid != ire->ire_zoneid)
30220Sstevel@tonic-gate 			continue;
30230Sstevel@tonic-gate 		if (ire_match_args(ire1, ire->ire_addr, ire->ire_mask,
30240Sstevel@tonic-gate 		    ire->ire_gateway_addr, ire->ire_type, ire->ire_ipif,
30251676Sjpk 		    ire->ire_zoneid, 0, NULL, flags)) {
30260Sstevel@tonic-gate 			/*
30270Sstevel@tonic-gate 			 * Return the old ire after doing a REFHOLD.
30280Sstevel@tonic-gate 			 * As most of the callers continue to use the IRE
30290Sstevel@tonic-gate 			 * after adding, we return a held ire. This will
30300Sstevel@tonic-gate 			 * avoid a lookup in the caller again. If the callers
30310Sstevel@tonic-gate 			 * don't want to use it, they need to do a REFRELE.
30320Sstevel@tonic-gate 			 */
30330Sstevel@tonic-gate 			ip1dbg(("found dup ire existing %p new %p",
30340Sstevel@tonic-gate 			    (void *)ire1, (void *)ire));
30350Sstevel@tonic-gate 			IRE_REFHOLD(ire1);
30360Sstevel@tonic-gate 			ire_atomic_end(irb_ptr, ire);
30373448Sdh155122 			mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
30380Sstevel@tonic-gate 			ire_delete(ire);
30390Sstevel@tonic-gate 			if (pire != NULL) {
30400Sstevel@tonic-gate 				/*
30410Sstevel@tonic-gate 				 * Assert that it is not removed from the
30420Sstevel@tonic-gate 				 * list yet.
30430Sstevel@tonic-gate 				 */
30440Sstevel@tonic-gate 				ASSERT(pire->ire_ptpn != NULL);
30450Sstevel@tonic-gate 				IRB_REFRELE(pire->ire_bucket);
30460Sstevel@tonic-gate 				ire_refrele(pire);
30470Sstevel@tonic-gate 			}
30480Sstevel@tonic-gate 			*ire_p = ire1;
30492535Ssangeeta 			if (need_refrele)
30502535Ssangeeta 				IRB_REFRELE(irb_ptr);
30510Sstevel@tonic-gate 			return (0);
30520Sstevel@tonic-gate 		}
30530Sstevel@tonic-gate 	}
30542535Ssangeeta 	if (ire->ire_type & IRE_CACHE) {
30552535Ssangeeta 		ASSERT(ire->ire_stq != NULL);
30562535Ssangeeta 		nce = ndp_lookup_v4(ire_to_ill(ire),
30572535Ssangeeta 		    ((ire->ire_gateway_addr != INADDR_ANY) ?
30582535Ssangeeta 		    &ire->ire_gateway_addr : &ire->ire_addr),
30592535Ssangeeta 		    B_TRUE);
30602535Ssangeeta 		if (nce != NULL)
30612535Ssangeeta 			mutex_enter(&nce->nce_lock);
30622535Ssangeeta 		/*
30632535Ssangeeta 		 * if the nce is NCE_F_CONDEMNED, or if it is not ND_REACHABLE
30642535Ssangeeta 		 * and the caller has prohibited the addition of incomplete
30652535Ssangeeta 		 * ire's, we fail the add. Note that nce_state could be
30664084Ssowmini 		 * something other than ND_REACHABLE if the nce had
30674084Ssowmini 		 * just expired and the ire_create preceding the
30684084Ssowmini 		 * ire_add added a new ND_INITIAL nce.
30692535Ssangeeta 		 */
30702535Ssangeeta 		if ((nce == NULL) ||
30712535Ssangeeta 		    (nce->nce_flags & NCE_F_CONDEMNED) ||
30722535Ssangeeta 		    (!allow_unresolved &&
30733397Ssangeeta 		    (nce->nce_state != ND_REACHABLE))) {
30744084Ssowmini 			if (nce != NULL) {
30754084Ssowmini 				DTRACE_PROBE1(ire__bad__nce, nce_t *, nce);
30762535Ssangeeta 				mutex_exit(&nce->nce_lock);
30774084Ssowmini 			}
30782535Ssangeeta 			ire_atomic_end(irb_ptr, ire);
30793448Sdh155122 			mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
30802535Ssangeeta 			if (nce != NULL)
30812535Ssangeeta 				NCE_REFRELE(nce);
30822535Ssangeeta 			DTRACE_PROBE1(ire__no__nce, ire_t *, ire);
30832535Ssangeeta 			ire_delete(ire);
30842535Ssangeeta 			if (pire != NULL) {
30852535Ssangeeta 				IRB_REFRELE(pire->ire_bucket);
30862535Ssangeeta 				ire_refrele(pire);
30872535Ssangeeta 			}
30882535Ssangeeta 			*ire_p = NULL;
30892535Ssangeeta 			if (need_refrele)
30902535Ssangeeta 				IRB_REFRELE(irb_ptr);
30912535Ssangeeta 			return (EINVAL);
30922535Ssangeeta 		} else {
30932535Ssangeeta 			ire->ire_nce = nce;
30942535Ssangeeta 			mutex_exit(&nce->nce_lock);
30952535Ssangeeta 			/*
30962535Ssangeeta 			 * We are associating this nce to the ire, so
30972535Ssangeeta 			 * change the nce ref taken in ndp_lookup_v4() from
30982535Ssangeeta 			 * NCE_REFHOLD to NCE_REFHOLD_NOTR
30992535Ssangeeta 			 */
31002535Ssangeeta 			NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce);
31012535Ssangeeta 		}
31022535Ssangeeta 	}
31030Sstevel@tonic-gate 	/*
31040Sstevel@tonic-gate 	 * Make it easy for ip_wput_ire() to hit multiple broadcast ires by
31050Sstevel@tonic-gate 	 * grouping identical addresses together on the hash chain. We also
31060Sstevel@tonic-gate 	 * don't want to send multiple copies out if there are two ills part
31070Sstevel@tonic-gate 	 * of the same group. Thus we group the ires with same addr and same
31080Sstevel@tonic-gate 	 * ill group together so that ip_wput_ire can easily skip all the
31090Sstevel@tonic-gate 	 * ires with same addr and same group after sending the first copy.
31100Sstevel@tonic-gate 	 * We do this only for IRE_BROADCASTs as ip_wput_ire is currently
31110Sstevel@tonic-gate 	 * interested in such groupings only for broadcasts.
31120Sstevel@tonic-gate 	 *
31130Sstevel@tonic-gate 	 * NOTE : If the interfaces are brought up first and then grouped,
31140Sstevel@tonic-gate 	 * illgrp_insert will handle it. We come here when the interfaces
31150Sstevel@tonic-gate 	 * are already in group and we are bringing them UP.
31160Sstevel@tonic-gate 	 *
31170Sstevel@tonic-gate 	 * Find the first entry that matches ire_addr. *irep will be null
31180Sstevel@tonic-gate 	 * if no match.
31194182Ssowmini 	 *
31204182Ssowmini 	 * Note: the loopback and non-loopback broadcast entries for an
31214182Ssowmini 	 * interface MUST be added before any MULTIRT entries.
31220Sstevel@tonic-gate 	 */
31230Sstevel@tonic-gate 	irep = (ire_t **)irb_ptr;
31240Sstevel@tonic-gate 	while ((ire1 = *irep) != NULL && ire->ire_addr != ire1->ire_addr)
31250Sstevel@tonic-gate 		irep = &ire1->ire_next;
31260Sstevel@tonic-gate 	if (ire->ire_type == IRE_BROADCAST && *irep != NULL) {
31270Sstevel@tonic-gate 		/*
31280Sstevel@tonic-gate 		 * We found some ire (i.e *irep) with a matching addr. We
31290Sstevel@tonic-gate 		 * want to group ires with same addr and same ill group
31300Sstevel@tonic-gate 		 * together.
31310Sstevel@tonic-gate 		 *
31320Sstevel@tonic-gate 		 * First get to the entry that matches our address and
31330Sstevel@tonic-gate 		 * ill group i.e stop as soon as we find the first ire
31340Sstevel@tonic-gate 		 * matching the ill group and address. If there is only
31350Sstevel@tonic-gate 		 * an address match, we should walk and look for some
31360Sstevel@tonic-gate 		 * group match. These are some of the possible scenarios :
31370Sstevel@tonic-gate 		 *
31380Sstevel@tonic-gate 		 * 1) There are no groups at all i.e all ire's ill_group
31390Sstevel@tonic-gate 		 *    are NULL. In that case we will essentially group
31400Sstevel@tonic-gate 		 *    all the ires with the same addr together. Same as
31410Sstevel@tonic-gate 		 *    the "else" block of this "if".
31420Sstevel@tonic-gate 		 *
31430Sstevel@tonic-gate 		 * 2) There are some groups and this ire's ill_group is
31440Sstevel@tonic-gate 		 *    NULL. In this case, we will first find the group
31450Sstevel@tonic-gate 		 *    that matches the address and a NULL group. Then
31460Sstevel@tonic-gate 		 *    we will insert the ire at the end of that group.
31470Sstevel@tonic-gate 		 *
31480Sstevel@tonic-gate 		 * 3) There are some groups and this ires's ill_group is
31490Sstevel@tonic-gate 		 *    non-NULL. In this case we will first find the group
31500Sstevel@tonic-gate 		 *    that matches the address and the ill_group. Then
31510Sstevel@tonic-gate 		 *    we will insert the ire at the end of that group.
31520Sstevel@tonic-gate 		 */
31534182Ssowmini 		for (;;) {
31540Sstevel@tonic-gate 			ire1 = *irep;
31550Sstevel@tonic-gate 			if ((ire1->ire_next == NULL) ||
31560Sstevel@tonic-gate 			    (ire1->ire_next->ire_addr != ire->ire_addr) ||
31570Sstevel@tonic-gate 			    (ire1->ire_type != IRE_BROADCAST) ||
31584182Ssowmini 			    (ire1->ire_flags & RTF_MULTIRT) ||
31590Sstevel@tonic-gate 			    (ire1->ire_ipif->ipif_ill->ill_group ==
31600Sstevel@tonic-gate 			    ire->ire_ipif->ipif_ill->ill_group))
31610Sstevel@tonic-gate 				break;
31620Sstevel@tonic-gate 			irep = &ire1->ire_next;
31630Sstevel@tonic-gate 		}
31640Sstevel@tonic-gate 		ASSERT(*irep != NULL);
31654182Ssowmini 		/*
31664182Ssowmini 		 * The ire will be added before *irep, so
31674182Ssowmini 		 * if irep is a MULTIRT ire, just break to
31684182Ssowmini 		 * ire insertion code.
31694182Ssowmini 		 */
31704182Ssowmini 		if (((*irep)->ire_flags & RTF_MULTIRT) != 0)
31714182Ssowmini 			goto insert_ire;
31724182Ssowmini 
31730Sstevel@tonic-gate 		irep = &((*irep)->ire_next);
31740Sstevel@tonic-gate 
31750Sstevel@tonic-gate 		/*
31760Sstevel@tonic-gate 		 * Either we have hit the end of the list or the address
31770Sstevel@tonic-gate 		 * did not match or the group *matched*. If we found
31780Sstevel@tonic-gate 		 * a match on the group, skip to the end of the group.
31790Sstevel@tonic-gate 		 */
31800Sstevel@tonic-gate 		while (*irep != NULL) {
31810Sstevel@tonic-gate 			ire1 = *irep;
31820Sstevel@tonic-gate 			if ((ire1->ire_addr != ire->ire_addr) ||
31830Sstevel@tonic-gate 			    (ire1->ire_type != IRE_BROADCAST) ||
31840Sstevel@tonic-gate 			    (ire1->ire_ipif->ipif_ill->ill_group !=
31850Sstevel@tonic-gate 			    ire->ire_ipif->ipif_ill->ill_group))
31860Sstevel@tonic-gate 				break;
31870Sstevel@tonic-gate 			if (ire1->ire_ipif->ipif_ill->ill_group == NULL &&
31880Sstevel@tonic-gate 			    ire1->ire_ipif == ire->ire_ipif) {
31890Sstevel@tonic-gate 				irep = &ire1->ire_next;
31900Sstevel@tonic-gate 				break;
31910Sstevel@tonic-gate 			}
31920Sstevel@tonic-gate 			irep = &ire1->ire_next;
31930Sstevel@tonic-gate 		}
31940Sstevel@tonic-gate 	} else if (*irep != NULL) {
31950Sstevel@tonic-gate 		/*
31960Sstevel@tonic-gate 		 * Find the last ire which matches ire_addr.
31970Sstevel@tonic-gate 		 * Needed to do tail insertion among entries with the same
31980Sstevel@tonic-gate 		 * ire_addr.
31990Sstevel@tonic-gate 		 */
32000Sstevel@tonic-gate 		while (ire->ire_addr == ire1->ire_addr) {
32010Sstevel@tonic-gate 			irep = &ire1->ire_next;
32020Sstevel@tonic-gate 			ire1 = *irep;
32030Sstevel@tonic-gate 			if (ire1 == NULL)
32040Sstevel@tonic-gate 				break;
32050Sstevel@tonic-gate 		}
32060Sstevel@tonic-gate 	}
32070Sstevel@tonic-gate 
32084182Ssowmini insert_ire:
32090Sstevel@tonic-gate 	/* Insert at *irep */
32100Sstevel@tonic-gate 	ire1 = *irep;
32110Sstevel@tonic-gate 	if (ire1 != NULL)
32120Sstevel@tonic-gate 		ire1->ire_ptpn = &ire->ire_next;
32130Sstevel@tonic-gate 	ire->ire_next = ire1;
32140Sstevel@tonic-gate 	/* Link the new one in. */
32150Sstevel@tonic-gate 	ire->ire_ptpn = irep;
32160Sstevel@tonic-gate 
32170Sstevel@tonic-gate 	/*
32180Sstevel@tonic-gate 	 * ire_walk routines de-reference ire_next without holding
32190Sstevel@tonic-gate 	 * a lock. Before we point to the new ire, we want to make
32200Sstevel@tonic-gate 	 * sure the store that sets the ire_next of the new ire
32210Sstevel@tonic-gate 	 * reaches global visibility, so that ire_walk routines
32220Sstevel@tonic-gate 	 * don't see a truncated list of ires i.e if the ire_next
32230Sstevel@tonic-gate 	 * of the new ire gets set after we do "*irep = ire" due
32240Sstevel@tonic-gate 	 * to re-ordering, the ire_walk thread will see a NULL
32250Sstevel@tonic-gate 	 * once it accesses the ire_next of the new ire.
32260Sstevel@tonic-gate 	 * membar_producer() makes sure that the following store
32270Sstevel@tonic-gate 	 * happens *after* all of the above stores.
32280Sstevel@tonic-gate 	 */
32290Sstevel@tonic-gate 	membar_producer();
32300Sstevel@tonic-gate 	*irep = ire;
32310Sstevel@tonic-gate 	ire->ire_bucket = irb_ptr;
32320Sstevel@tonic-gate 	/*
32330Sstevel@tonic-gate 	 * We return a bumped up IRE above. Keep it symmetrical
32340Sstevel@tonic-gate 	 * so that the callers will always have to release. This
32350Sstevel@tonic-gate 	 * helps the callers of this function because they continue
32360Sstevel@tonic-gate 	 * to use the IRE after adding and hence they don't have to
32370Sstevel@tonic-gate 	 * lookup again after we return the IRE.
32380Sstevel@tonic-gate 	 *
32390Sstevel@tonic-gate 	 * NOTE : We don't have to use atomics as this is appearing
32400Sstevel@tonic-gate 	 * in the list for the first time and no one else can bump
32410Sstevel@tonic-gate 	 * up the reference count on this yet.
32420Sstevel@tonic-gate 	 */
32430Sstevel@tonic-gate 	IRE_REFHOLD_LOCKED(ire);
32443448Sdh155122 	BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_inserted);
32452535Ssangeeta 
32460Sstevel@tonic-gate 	irb_ptr->irb_ire_cnt++;
32472535Ssangeeta 	if (irb_ptr->irb_marks & IRB_MARK_FTABLE)
32482535Ssangeeta 		irb_ptr->irb_nire++;
32492535Ssangeeta 
32500Sstevel@tonic-gate 	if (ire->ire_marks & IRE_MARK_TEMPORARY)
32510Sstevel@tonic-gate 		irb_ptr->irb_tmp_ire_cnt++;
32520Sstevel@tonic-gate 
32530Sstevel@tonic-gate 	if (ire->ire_ipif != NULL) {
32540Sstevel@tonic-gate 		ire->ire_ipif->ipif_ire_cnt++;
32550Sstevel@tonic-gate 		if (ire->ire_stq != NULL) {
32560Sstevel@tonic-gate 			stq_ill = (ill_t *)ire->ire_stq->q_ptr;
32570Sstevel@tonic-gate 			stq_ill->ill_ire_cnt++;
32580Sstevel@tonic-gate 		}
32590Sstevel@tonic-gate 	} else {
32600Sstevel@tonic-gate 		ASSERT(ire->ire_stq == NULL);
32610Sstevel@tonic-gate 	}
32620Sstevel@tonic-gate 
32630Sstevel@tonic-gate 	ire_atomic_end(irb_ptr, ire);
32643448Sdh155122 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
32650Sstevel@tonic-gate 
32660Sstevel@tonic-gate 	if (pire != NULL) {
32670Sstevel@tonic-gate 		/* Assert that it is not removed from the list yet */
32680Sstevel@tonic-gate 		ASSERT(pire->ire_ptpn != NULL);
32690Sstevel@tonic-gate 		IRB_REFRELE(pire->ire_bucket);
32700Sstevel@tonic-gate 		ire_refrele(pire);
32710Sstevel@tonic-gate 	}
32720Sstevel@tonic-gate 
32730Sstevel@tonic-gate 	if (ire->ire_type != IRE_CACHE) {
32740Sstevel@tonic-gate 		/*
32752535Ssangeeta 		 * For ire's with host mask see if there is an entry
32760Sstevel@tonic-gate 		 * in the cache. If there is one flush the whole cache as
32770Sstevel@tonic-gate 		 * there might be multiple entries due to RTF_MULTIRT (CGTP).
32780Sstevel@tonic-gate 		 * If no entry is found than there is no need to flush the
32790Sstevel@tonic-gate 		 * cache.
32800Sstevel@tonic-gate 		 */
32810Sstevel@tonic-gate 		if (ire->ire_mask == IP_HOST_MASK) {
32820Sstevel@tonic-gate 			ire_t *lire;
32830Sstevel@tonic-gate 			lire = ire_ctable_lookup(ire->ire_addr, NULL, IRE_CACHE,
32843448Sdh155122 			    NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
32850Sstevel@tonic-gate 			if (lire != NULL) {
32860Sstevel@tonic-gate 				ire_refrele(lire);
32870Sstevel@tonic-gate 				ire_flush_cache_v4(ire, IRE_FLUSH_ADD);
32880Sstevel@tonic-gate 			}
32890Sstevel@tonic-gate 		} else {
32900Sstevel@tonic-gate 			ire_flush_cache_v4(ire, IRE_FLUSH_ADD);
32910Sstevel@tonic-gate 		}
32920Sstevel@tonic-gate 	}
32930Sstevel@tonic-gate 	/*
32940Sstevel@tonic-gate 	 * We had to delay the fast path probe until the ire is inserted
32950Sstevel@tonic-gate 	 * in the list. Otherwise the fast path ack won't find the ire in
32960Sstevel@tonic-gate 	 * the table.
32970Sstevel@tonic-gate 	 */
32983425Ssowmini 	if (ire->ire_type == IRE_CACHE ||
32993425Ssowmini 	    (ire->ire_type == IRE_BROADCAST && ire->ire_stq != NULL)) {
33003425Ssowmini 		ASSERT(ire->ire_nce != NULL);
33014714Ssowmini 		if (ire->ire_nce->nce_state == ND_REACHABLE)
33024714Ssowmini 			nce_fastpath(ire->ire_nce);
33033425Ssowmini 	}
33040Sstevel@tonic-gate 	if (ire->ire_ipif != NULL)
33050Sstevel@tonic-gate 		ASSERT(!MUTEX_HELD(&ire->ire_ipif->ipif_ill->ill_lock));
33060Sstevel@tonic-gate 	*ire_p = ire;
33072535Ssangeeta 	if (need_refrele) {
33082535Ssangeeta 		IRB_REFRELE(irb_ptr);
33092535Ssangeeta 	}
33100Sstevel@tonic-gate 	return (0);
33110Sstevel@tonic-gate }
33120Sstevel@tonic-gate 
33130Sstevel@tonic-gate /*
33140Sstevel@tonic-gate  * IRB_REFRELE is the only caller of the function. ire_unlink calls to
33150Sstevel@tonic-gate  * do the final cleanup for this ire.
33160Sstevel@tonic-gate  */
33170Sstevel@tonic-gate void
33180Sstevel@tonic-gate ire_cleanup(ire_t *ire)
33190Sstevel@tonic-gate {
33200Sstevel@tonic-gate 	ire_t *ire_next;
33213448Sdh155122 	ip_stack_t *ipst = ire->ire_ipst;
33220Sstevel@tonic-gate 
33230Sstevel@tonic-gate 	ASSERT(ire != NULL);
33240Sstevel@tonic-gate 
33250Sstevel@tonic-gate 	while (ire != NULL) {
33260Sstevel@tonic-gate 		ire_next = ire->ire_next;
33270Sstevel@tonic-gate 		if (ire->ire_ipversion == IPV4_VERSION) {
33280Sstevel@tonic-gate 			ire_delete_v4(ire);
33293448Sdh155122 			BUMP_IRE_STATS(ipst->ips_ire_stats_v4,
33303448Sdh155122 			    ire_stats_deleted);
33310Sstevel@tonic-gate 		} else {
33320Sstevel@tonic-gate 			ASSERT(ire->ire_ipversion == IPV6_VERSION);
33330Sstevel@tonic-gate 			ire_delete_v6(ire);
33343448Sdh155122 			BUMP_IRE_STATS(ipst->ips_ire_stats_v6,
33353448Sdh155122 			    ire_stats_deleted);
33360Sstevel@tonic-gate 		}
33370Sstevel@tonic-gate 		/*
33380Sstevel@tonic-gate 		 * Now it's really out of the list. Before doing the
33390Sstevel@tonic-gate 		 * REFRELE, set ire_next to NULL as ire_inactive asserts
33400Sstevel@tonic-gate 		 * so.
33410Sstevel@tonic-gate 		 */
33420Sstevel@tonic-gate 		ire->ire_next = NULL;
33430Sstevel@tonic-gate 		IRE_REFRELE_NOTR(ire);
33440Sstevel@tonic-gate 		ire = ire_next;
33450Sstevel@tonic-gate 	}
33460Sstevel@tonic-gate }
33470Sstevel@tonic-gate 
33480Sstevel@tonic-gate /*
33490Sstevel@tonic-gate  * IRB_REFRELE is the only caller of the function. It calls to unlink
33500Sstevel@tonic-gate  * all the CONDEMNED ires from this bucket.
33510Sstevel@tonic-gate  */
33520Sstevel@tonic-gate ire_t *
33530Sstevel@tonic-gate ire_unlink(irb_t *irb)
33540Sstevel@tonic-gate {
33550Sstevel@tonic-gate 	ire_t *ire;
33560Sstevel@tonic-gate 	ire_t *ire1;
33570Sstevel@tonic-gate 	ire_t **ptpn;
33580Sstevel@tonic-gate 	ire_t *ire_list = NULL;
33590Sstevel@tonic-gate 
33600Sstevel@tonic-gate 	ASSERT(RW_WRITE_HELD(&irb->irb_lock));
33612535Ssangeeta 	ASSERT(((irb->irb_marks & IRB_MARK_FTABLE) && irb->irb_refcnt == 1) ||
33622535Ssangeeta 	    (irb->irb_refcnt == 0));
33632535Ssangeeta 	ASSERT(irb->irb_marks & IRB_MARK_CONDEMNED);
33640Sstevel@tonic-gate 	ASSERT(irb->irb_ire != NULL);
33650Sstevel@tonic-gate 
33660Sstevel@tonic-gate 	for (ire = irb->irb_ire; ire != NULL; ire = ire1) {
33673448Sdh155122 		ip_stack_t	*ipst = ire->ire_ipst;
33683448Sdh155122 
33690Sstevel@tonic-gate 		ire1 = ire->ire_next;
33700Sstevel@tonic-gate 		if (ire->ire_marks & IRE_MARK_CONDEMNED) {
33710Sstevel@tonic-gate 			ptpn = ire->ire_ptpn;
33720Sstevel@tonic-gate 			ire1 = ire->ire_next;
33730Sstevel@tonic-gate 			if (ire1)
33740Sstevel@tonic-gate 				ire1->ire_ptpn = ptpn;
33750Sstevel@tonic-gate 			*ptpn = ire1;
33760Sstevel@tonic-gate 			ire->ire_ptpn = NULL;
33770Sstevel@tonic-gate 			ire->ire_next = NULL;
33780Sstevel@tonic-gate 			if (ire->ire_type == IRE_DEFAULT) {
33790Sstevel@tonic-gate 				/*
33800Sstevel@tonic-gate 				 * IRE is out of the list. We need to adjust
33810Sstevel@tonic-gate 				 * the accounting before the caller drops
33820Sstevel@tonic-gate 				 * the lock.
33830Sstevel@tonic-gate 				 */
33840Sstevel@tonic-gate 				if (ire->ire_ipversion == IPV6_VERSION) {
33853448Sdh155122 					ASSERT(ipst->
33863448Sdh155122 					    ips_ipv6_ire_default_count !=
33873448Sdh155122 					    0);
33883448Sdh155122 					ipst->ips_ipv6_ire_default_count--;
33890Sstevel@tonic-gate 				}
33900Sstevel@tonic-gate 			}
33910Sstevel@tonic-gate 			/*
33920Sstevel@tonic-gate 			 * We need to call ire_delete_v4 or ire_delete_v6
33930Sstevel@tonic-gate 			 * to clean up the cache or the redirects pointing at
33940Sstevel@tonic-gate 			 * the default gateway. We need to drop the lock
33950Sstevel@tonic-gate 			 * as ire_flush_cache/ire_delete_host_redircts require
33960Sstevel@tonic-gate 			 * so. But we can't drop the lock, as ire_unlink needs
33970Sstevel@tonic-gate 			 * to atomically remove the ires from the list.
33980Sstevel@tonic-gate 			 * So, create a temporary list of CONDEMNED ires
33990Sstevel@tonic-gate 			 * for doing ire_delete_v4/ire_delete_v6 operations
34000Sstevel@tonic-gate 			 * later on.
34010Sstevel@tonic-gate 			 */
34020Sstevel@tonic-gate 			ire->ire_next = ire_list;
34030Sstevel@tonic-gate 			ire_list = ire;
34040Sstevel@tonic-gate 		}
34050Sstevel@tonic-gate 	}
34062535Ssangeeta 	irb->irb_marks &= ~IRB_MARK_CONDEMNED;
34070Sstevel@tonic-gate 	return (ire_list);
34080Sstevel@tonic-gate }
34090Sstevel@tonic-gate 
34100Sstevel@tonic-gate /*
34110Sstevel@tonic-gate  * Delete all the cache entries with this 'addr'.  When IP gets a gratuitous
34122535Ssangeeta  * ARP message on any of its interface queue, it scans the nce table and
34132535Ssangeeta  * deletes and calls ndp_delete() for the appropriate nce. This action
34142535Ssangeeta  * also deletes all the neighbor/ire cache entries for that address.
34152535Ssangeeta  * This function is called from ip_arp_news in ip.c and also for
34162535Ssangeeta  * ARP ioctl processing in ip_if.c. ip_ire_clookup_and_delete returns
34172535Ssangeeta  * true if it finds a nce entry which is used by ip_arp_news to determine if
34182535Ssangeeta  * it needs to do an ire_walk_v4. The return value is also  used for the
34192535Ssangeeta  * same purpose by ARP IOCTL processing * in ip_if.c when deleting
34202535Ssangeeta  * ARP entries. For SIOC*IFARP ioctls in addition to the address,
34212535Ssangeeta  * ip_if->ipif_ill also needs to be matched.
34220Sstevel@tonic-gate  */
34230Sstevel@tonic-gate boolean_t
34243448Sdh155122 ip_ire_clookup_and_delete(ipaddr_t addr, ipif_t *ipif, ip_stack_t *ipst)
34250Sstevel@tonic-gate {
34262535Ssangeeta 	ill_t	*ill;
34272535Ssangeeta 	nce_t	*nce;
34282535Ssangeeta 
34292535Ssangeeta 	ill = (ipif ? ipif->ipif_ill : NULL);
34302535Ssangeeta 
34312535Ssangeeta 	if (ill != NULL) {
34322535Ssangeeta 		/*
34332535Ssangeeta 		 * clean up the nce (and any relevant ire's) that matches
34342535Ssangeeta 		 * on addr and ill.
34352535Ssangeeta 		 */
34362535Ssangeeta 		nce = ndp_lookup_v4(ill, &addr, B_FALSE);
34372535Ssangeeta 		if (nce != NULL) {
34382535Ssangeeta 			ndp_delete(nce);
34392535Ssangeeta 			return (B_TRUE);
34402535Ssangeeta 		}
34412535Ssangeeta 	} else {
34422535Ssangeeta 		/*
34432535Ssangeeta 		 * ill is wildcard. clean up all nce's and
34442535Ssangeeta 		 * ire's that match on addr
34452535Ssangeeta 		 */
34462535Ssangeeta 		nce_clookup_t cl;
34472535Ssangeeta 
34482535Ssangeeta 		cl.ncecl_addr = addr;
34492535Ssangeeta 		cl.ncecl_found = B_FALSE;
34502535Ssangeeta 
34513448Sdh155122 		ndp_walk_common(ipst->ips_ndp4, NULL,
34522535Ssangeeta 		    (pfi_t)ip_nce_clookup_and_delete, (uchar_t *)&cl, B_TRUE);
34532535Ssangeeta 
34542535Ssangeeta 		/*
34552535Ssangeeta 		 *  ncecl_found would be set by ip_nce_clookup_and_delete if
34562535Ssangeeta 		 *  we found a matching nce.
34572535Ssangeeta 		 */
34582535Ssangeeta 		return (cl.ncecl_found);
34592535Ssangeeta 	}
34602535Ssangeeta 	return (B_FALSE);
34612535Ssangeeta 
34622535Ssangeeta }
34632535Ssangeeta 
34642535Ssangeeta /* Delete the supplied nce if its nce_addr matches the supplied address */
34652535Ssangeeta static void
34662535Ssangeeta ip_nce_clookup_and_delete(nce_t *nce, void *arg)
34672535Ssangeeta {
34682535Ssangeeta 	nce_clookup_t *cl = (nce_clookup_t *)arg;
34692535Ssangeeta 	ipaddr_t nce_addr;
34702535Ssangeeta 
34712535Ssangeeta 	IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr);
34722535Ssangeeta 	if (nce_addr == cl->ncecl_addr) {
34732535Ssangeeta 		cl->ncecl_found = B_TRUE;
34742535Ssangeeta 		/* clean up the nce (and any relevant ire's) */
34752535Ssangeeta 		ndp_delete(nce);
34762535Ssangeeta 	}
34772535Ssangeeta }
34782535Ssangeeta 
34792535Ssangeeta /*
34802535Ssangeeta  * Clean up the radix node for this ire. Must be called by IRB_REFRELE
34812535Ssangeeta  * when there are no ire's left in the bucket. Returns TRUE if the bucket
34822535Ssangeeta  * is deleted and freed.
34832535Ssangeeta  */
34842535Ssangeeta boolean_t
34852535Ssangeeta irb_inactive(irb_t *irb)
34862535Ssangeeta {
34872535Ssangeeta 	struct rt_entry *rt;
34882535Ssangeeta 	struct radix_node *rn;
34893448Sdh155122 	ip_stack_t *ipst = irb->irb_ipst;
34903448Sdh155122 
34913448Sdh155122 	ASSERT(irb->irb_ipst != NULL);
34922535Ssangeeta 
34932535Ssangeeta 	rt = IRB2RT(irb);
34942535Ssangeeta 	rn = (struct radix_node *)rt;
34952535Ssangeeta 
34962535Ssangeeta 	/* first remove it from the radix tree. */
34973448Sdh155122 	RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable);
34982535Ssangeeta 	rw_enter(&irb->irb_lock, RW_WRITER);
34992535Ssangeeta 	if (irb->irb_refcnt == 1 && irb->irb_nire == 0) {
35003448Sdh155122 		rn = ipst->ips_ip_ftable->rnh_deladdr(rn->rn_key, rn->rn_mask,
35013448Sdh155122 		    ipst->ips_ip_ftable);
35022535Ssangeeta 		DTRACE_PROBE1(irb__free, rt_t *,  rt);
35032535Ssangeeta 		ASSERT((void *)rn == (void *)rt);
35042535Ssangeeta 		Free(rt, rt_entry_cache);
35052535Ssangeeta 		/* irb_lock is freed */
35063448Sdh155122 		RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
35072535Ssangeeta 		return (B_TRUE);
35082535Ssangeeta 	}
35092535Ssangeeta 	rw_exit(&irb->irb_lock);
35103448Sdh155122 	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
35112535Ssangeeta 	return (B_FALSE);
35120Sstevel@tonic-gate }
35130Sstevel@tonic-gate 
35140Sstevel@tonic-gate /*
35150Sstevel@tonic-gate  * Delete the specified IRE.
35160Sstevel@tonic-gate  */
35170Sstevel@tonic-gate void
35180Sstevel@tonic-gate ire_delete(ire_t *ire)
35190Sstevel@tonic-gate {
35200Sstevel@tonic-gate 	ire_t	*ire1;
35210Sstevel@tonic-gate 	ire_t	**ptpn;
35220Sstevel@tonic-gate 	irb_t *irb;
35233448Sdh155122 	ip_stack_t	*ipst = ire->ire_ipst;
35240Sstevel@tonic-gate 
35250Sstevel@tonic-gate 	if ((irb = ire->ire_bucket) == NULL) {
35262535Ssangeeta 		/*
35272535Ssangeeta 		 * It was never inserted in the list. Should call REFRELE
35282535Ssangeeta 		 * to free this IRE.
35292535Ssangeeta 		 */
35300Sstevel@tonic-gate 		IRE_REFRELE_NOTR(ire);
35310Sstevel@tonic-gate 		return;
35320Sstevel@tonic-gate 	}
35330Sstevel@tonic-gate 
35340Sstevel@tonic-gate 	rw_enter(&irb->irb_lock, RW_WRITER);
35350Sstevel@tonic-gate 
35362535Ssangeeta 	if (irb->irb_rr_origin == ire) {
35372535Ssangeeta 		irb->irb_rr_origin = NULL;
35382535Ssangeeta 	}
35392535Ssangeeta 
35400Sstevel@tonic-gate 	/*
35410Sstevel@tonic-gate 	 * In case of V4 we might still be waiting for fastpath ack.
35420Sstevel@tonic-gate 	 */
35433425Ssowmini 	if (ire->ire_ipversion == IPV4_VERSION &&
35443425Ssowmini 	    (ire->ire_type == IRE_CACHE ||
35453425Ssowmini 	    (ire->ire_type == IRE_BROADCAST && ire->ire_stq != NULL))) {
35463425Ssowmini 		ASSERT(ire->ire_nce != NULL);
35473425Ssowmini 		nce_fastpath_list_delete(ire->ire_nce);
35480Sstevel@tonic-gate 	}
35490Sstevel@tonic-gate 
35500Sstevel@tonic-gate 	if (ire->ire_ptpn == NULL) {
35510Sstevel@tonic-gate 		/*
35520Sstevel@tonic-gate 		 * Some other thread has removed us from the list.
35530Sstevel@tonic-gate 		 * It should have done the REFRELE for us.
35540Sstevel@tonic-gate 		 */
35550Sstevel@tonic-gate 		rw_exit(&irb->irb_lock);
35560Sstevel@tonic-gate 		return;
35570Sstevel@tonic-gate 	}
35580Sstevel@tonic-gate 
35590Sstevel@tonic-gate 	if (irb->irb_refcnt != 0) {
35600Sstevel@tonic-gate 		/*
35610Sstevel@tonic-gate 		 * The last thread to leave this bucket will
35620Sstevel@tonic-gate 		 * delete this ire.
35630Sstevel@tonic-gate 		 */
35640Sstevel@tonic-gate 		if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) {
35650Sstevel@tonic-gate 			irb->irb_ire_cnt--;
35660Sstevel@tonic-gate 			if (ire->ire_marks & IRE_MARK_TEMPORARY)
35670Sstevel@tonic-gate 				irb->irb_tmp_ire_cnt--;
35680Sstevel@tonic-gate 			ire->ire_marks |= IRE_MARK_CONDEMNED;
35690Sstevel@tonic-gate 		}
35702535Ssangeeta 		irb->irb_marks |= IRB_MARK_CONDEMNED;
35710Sstevel@tonic-gate 		rw_exit(&irb->irb_lock);
35720Sstevel@tonic-gate 		return;
35730Sstevel@tonic-gate 	}
35740Sstevel@tonic-gate 
35750Sstevel@tonic-gate 	/*
35760Sstevel@tonic-gate 	 * Normally to delete an ire, we walk the bucket. While we
35770Sstevel@tonic-gate 	 * walk the bucket, we normally bump up irb_refcnt and hence
35780Sstevel@tonic-gate 	 * we return from above where we mark CONDEMNED and the ire
35790Sstevel@tonic-gate 	 * gets deleted from ire_unlink. This case is where somebody
35800Sstevel@tonic-gate 	 * knows the ire e.g by doing a lookup, and wants to delete the
35810Sstevel@tonic-gate 	 * IRE. irb_refcnt would be 0 in this case if nobody is walking
35820Sstevel@tonic-gate 	 * the bucket.
35830Sstevel@tonic-gate 	 */
35840Sstevel@tonic-gate 	ptpn = ire->ire_ptpn;
35850Sstevel@tonic-gate 	ire1 = ire->ire_next;
35860Sstevel@tonic-gate 	if (ire1 != NULL)
35870Sstevel@tonic-gate 		ire1->ire_ptpn = ptpn;
35880Sstevel@tonic-gate 	ASSERT(ptpn != NULL);
35890Sstevel@tonic-gate 	*ptpn = ire1;
35900Sstevel@tonic-gate 	ire->ire_ptpn = NULL;
35910Sstevel@tonic-gate 	ire->ire_next = NULL;
35920Sstevel@tonic-gate 	if (ire->ire_ipversion == IPV6_VERSION) {
35933448Sdh155122 		BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_deleted);
35940Sstevel@tonic-gate 	} else {
35953448Sdh155122 		BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_deleted);
35960Sstevel@tonic-gate 	}
35970Sstevel@tonic-gate 	/*
35980Sstevel@tonic-gate 	 * ip_wput/ip_wput_v6 checks this flag to see whether
35990Sstevel@tonic-gate 	 * it should still use the cached ire or not.
36000Sstevel@tonic-gate 	 */
36010Sstevel@tonic-gate 	ire->ire_marks |= IRE_MARK_CONDEMNED;
36020Sstevel@tonic-gate 	if (ire->ire_type == IRE_DEFAULT) {
36030Sstevel@tonic-gate 		/*
36040Sstevel@tonic-gate 		 * IRE is out of the list. We need to adjust the
36050Sstevel@tonic-gate 		 * accounting before we drop the lock.
36060Sstevel@tonic-gate 		 */
36070Sstevel@tonic-gate 		if (ire->ire_ipversion == IPV6_VERSION) {
36083448Sdh155122 			ASSERT(ipst->ips_ipv6_ire_default_count != 0);
36093448Sdh155122 			ipst->ips_ipv6_ire_default_count--;
36100Sstevel@tonic-gate 		}
36110Sstevel@tonic-gate 	}
36120Sstevel@tonic-gate 	irb->irb_ire_cnt--;
36132535Ssangeeta 
36140Sstevel@tonic-gate 	if (ire->ire_marks & IRE_MARK_TEMPORARY)
36150Sstevel@tonic-gate 		irb->irb_tmp_ire_cnt--;
36160Sstevel@tonic-gate 	rw_exit(&irb->irb_lock);
36170Sstevel@tonic-gate 
36180Sstevel@tonic-gate 	if (ire->ire_ipversion == IPV6_VERSION) {
36190Sstevel@tonic-gate 		ire_delete_v6(ire);
36200Sstevel@tonic-gate 	} else {
36210Sstevel@tonic-gate 		ire_delete_v4(ire);
36220Sstevel@tonic-gate 	}
36230Sstevel@tonic-gate 	/*
36240Sstevel@tonic-gate 	 * We removed it from the list. Decrement the
36250Sstevel@tonic-gate 	 * reference count.
36260Sstevel@tonic-gate 	 */
36270Sstevel@tonic-gate 	IRE_REFRELE_NOTR(ire);
36280Sstevel@tonic-gate }
36290Sstevel@tonic-gate 
36300Sstevel@tonic-gate /*
36310Sstevel@tonic-gate  * Delete the specified IRE.
36320Sstevel@tonic-gate  * All calls should use ire_delete().
36330Sstevel@tonic-gate  * Sometimes called as writer though not required by this function.
36340Sstevel@tonic-gate  *
36350Sstevel@tonic-gate  * NOTE : This function is called only if the ire was added
36360Sstevel@tonic-gate  * in the list.
36370Sstevel@tonic-gate  */
36380Sstevel@tonic-gate static void
36390Sstevel@tonic-gate ire_delete_v4(ire_t *ire)
36400Sstevel@tonic-gate {
36413448Sdh155122 	ip_stack_t	*ipst = ire->ire_ipst;
36423448Sdh155122 
36430Sstevel@tonic-gate 	ASSERT(ire->ire_refcnt >= 1);
36440Sstevel@tonic-gate 	ASSERT(ire->ire_ipversion == IPV4_VERSION);
36450Sstevel@tonic-gate 
36460Sstevel@tonic-gate 	if (ire->ire_type != IRE_CACHE)
36470Sstevel@tonic-gate 		ire_flush_cache_v4(ire, IRE_FLUSH_DELETE);
36480Sstevel@tonic-gate 	if (ire->ire_type == IRE_DEFAULT) {
36490Sstevel@tonic-gate 		/*
36500Sstevel@tonic-gate 		 * when a default gateway is going away
36510Sstevel@tonic-gate 		 * delete all the host redirects pointing at that
36520Sstevel@tonic-gate 		 * gateway.
36530Sstevel@tonic-gate 		 */
36543448Sdh155122 		ire_delete_host_redirects(ire->ire_gateway_addr, ipst);
36550Sstevel@tonic-gate 	}
36560Sstevel@tonic-gate }
36570Sstevel@tonic-gate 
36580Sstevel@tonic-gate /*
36590Sstevel@tonic-gate  * IRE_REFRELE/ire_refrele are the only caller of the function. It calls
36600Sstevel@tonic-gate  * to free the ire when the reference count goes to zero.
36610Sstevel@tonic-gate  */
36620Sstevel@tonic-gate void
36630Sstevel@tonic-gate ire_inactive(ire_t *ire)
36640Sstevel@tonic-gate {
36650Sstevel@tonic-gate 	nce_t	*nce;
36660Sstevel@tonic-gate 	ill_t	*ill = NULL;
36670Sstevel@tonic-gate 	ill_t	*stq_ill = NULL;
36680Sstevel@tonic-gate 	ipif_t	*ipif;
36690Sstevel@tonic-gate 	boolean_t	need_wakeup = B_FALSE;
36702535Ssangeeta 	irb_t 	*irb;
36713448Sdh155122 	ip_stack_t	*ipst = ire->ire_ipst;
36720Sstevel@tonic-gate 
36730Sstevel@tonic-gate 	ASSERT(ire->ire_refcnt == 0);
36740Sstevel@tonic-gate 	ASSERT(ire->ire_ptpn == NULL);
36750Sstevel@tonic-gate 	ASSERT(ire->ire_next == NULL);
36760Sstevel@tonic-gate 
36772535Ssangeeta 	if (ire->ire_gw_secattr != NULL) {
36782535Ssangeeta 		ire_gw_secattr_free(ire->ire_gw_secattr);
36792535Ssangeeta 		ire->ire_gw_secattr = NULL;
36802535Ssangeeta 	}
36812535Ssangeeta 
36822535Ssangeeta 	if (ire->ire_mp != NULL) {
36832535Ssangeeta 		ASSERT(ire->ire_bucket == NULL);
36842535Ssangeeta 		mutex_destroy(&ire->ire_lock);
36853448Sdh155122 		BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_freed);
36862535Ssangeeta 		if (ire->ire_nce != NULL)
36872535Ssangeeta 			NCE_REFRELE_NOTR(ire->ire_nce);
36882535Ssangeeta 		freeb(ire->ire_mp);
36892535Ssangeeta 		return;
36902535Ssangeeta 	}
36912535Ssangeeta 
36920Sstevel@tonic-gate 	if ((nce = ire->ire_nce) != NULL) {
36930Sstevel@tonic-gate 		NCE_REFRELE_NOTR(nce);
36940Sstevel@tonic-gate 		ire->ire_nce = NULL;
36950Sstevel@tonic-gate 	}
36962535Ssangeeta 
36970Sstevel@tonic-gate 	if (ire->ire_ipif == NULL)
36980Sstevel@tonic-gate 		goto end;
36990Sstevel@tonic-gate 
37000Sstevel@tonic-gate 	ipif = ire->ire_ipif;
37010Sstevel@tonic-gate 	ill = ipif->ipif_ill;
37020Sstevel@tonic-gate 
37030Sstevel@tonic-gate 	if (ire->ire_bucket == NULL) {
37040Sstevel@tonic-gate 		/* The ire was never inserted in the table. */
37050Sstevel@tonic-gate 		goto end;
37060Sstevel@tonic-gate 	}
37070Sstevel@tonic-gate 
37080Sstevel@tonic-gate 	/*
37090Sstevel@tonic-gate 	 * ipif_ire_cnt on this ipif goes down by 1. If the ire_stq is
37104823Sseb 	 * non-null ill_ire_count also goes down by 1.
37110Sstevel@tonic-gate 	 *
37120Sstevel@tonic-gate 	 * The ipif that is associated with an ire is ire->ire_ipif and
37130Sstevel@tonic-gate 	 * hence when the ire->ire_ipif->ipif_ire_cnt drops to zero we call
37140Sstevel@tonic-gate 	 * ipif_ill_refrele_tail. Usually stq_ill is null or the same as
37150Sstevel@tonic-gate 	 * ire->ire_ipif->ipif_ill. So nothing more needs to be done. Only
37160Sstevel@tonic-gate 	 * in the case of IRE_CACHES when IPMP is used, stq_ill can be
37170Sstevel@tonic-gate 	 * different. If this is different from ire->ire_ipif->ipif_ill and
37180Sstevel@tonic-gate 	 * if the ill_ire_cnt on the stq_ill also has dropped to zero, we call
37194823Sseb 	 * ipif_ill_refrele_tail on the stq_ill.
37200Sstevel@tonic-gate 	 */
37210Sstevel@tonic-gate 
37220Sstevel@tonic-gate 	if (ire->ire_stq != NULL)
37230Sstevel@tonic-gate 		stq_ill = (ill_t *)ire->ire_stq->q_ptr;
37244823Sseb 
37254823Sseb 	if (stq_ill == NULL || stq_ill == ill) {
37260Sstevel@tonic-gate 		/* Optimize the most common case */
37270Sstevel@tonic-gate 		mutex_enter(&ill->ill_lock);
37280Sstevel@tonic-gate 		ASSERT(ipif->ipif_ire_cnt != 0);
37290Sstevel@tonic-gate 		ipif->ipif_ire_cnt--;
37300Sstevel@tonic-gate 		if (ipif->ipif_ire_cnt == 0)
37310Sstevel@tonic-gate 			need_wakeup = B_TRUE;
37320Sstevel@tonic-gate 		if (stq_ill != NULL) {
37330Sstevel@tonic-gate 			ASSERT(stq_ill->ill_ire_cnt != 0);
37340Sstevel@tonic-gate 			stq_ill->ill_ire_cnt--;
37350Sstevel@tonic-gate 			if (stq_ill->ill_ire_cnt == 0)
37360Sstevel@tonic-gate 				need_wakeup = B_TRUE;
37370Sstevel@tonic-gate 		}
37380Sstevel@tonic-gate 		if (need_wakeup) {
37390Sstevel@tonic-gate 			/* Drops the ill lock */
37400Sstevel@tonic-gate 			ipif_ill_refrele_tail(ill);
37410Sstevel@tonic-gate 		} else {
37420Sstevel@tonic-gate 			mutex_exit(&ill->ill_lock);
37430Sstevel@tonic-gate 		}
37440Sstevel@tonic-gate 	} else {
37450Sstevel@tonic-gate 		/*
37460Sstevel@tonic-gate 		 * We can't grab all the ill locks at the same time.
37470Sstevel@tonic-gate 		 * It can lead to recursive lock enter in the call to
37480Sstevel@tonic-gate 		 * ipif_ill_refrele_tail and later. Instead do it 1 at
37490Sstevel@tonic-gate 		 * a time.
37500Sstevel@tonic-gate 		 */
37510Sstevel@tonic-gate 		mutex_enter(&ill->ill_lock);
37520Sstevel@tonic-gate 		ASSERT(ipif->ipif_ire_cnt != 0);
37530Sstevel@tonic-gate 		ipif->ipif_ire_cnt--;
37540Sstevel@tonic-gate 		if (ipif->ipif_ire_cnt == 0) {
37550Sstevel@tonic-gate 			/* Drops the lock */
37560Sstevel@tonic-gate 			ipif_ill_refrele_tail(ill);
37570Sstevel@tonic-gate 		} else {
37580Sstevel@tonic-gate 			mutex_exit(&ill->ill_lock);
37590Sstevel@tonic-gate 		}
37600Sstevel@tonic-gate 		if (stq_ill != NULL) {
37610Sstevel@tonic-gate 			mutex_enter(&stq_ill->ill_lock);
37620Sstevel@tonic-gate 			ASSERT(stq_ill->ill_ire_cnt != 0);
37630Sstevel@tonic-gate 			stq_ill->ill_ire_cnt--;
37640Sstevel@tonic-gate 			if (stq_ill->ill_ire_cnt == 0)  {
37650Sstevel@tonic-gate 				/* Drops the ill lock */
37660Sstevel@tonic-gate 				ipif_ill_refrele_tail(stq_ill);
37670Sstevel@tonic-gate 			} else {
37680Sstevel@tonic-gate 				mutex_exit(&stq_ill->ill_lock);
37690Sstevel@tonic-gate 			}
37700Sstevel@tonic-gate 		}
37710Sstevel@tonic-gate 	}
37720Sstevel@tonic-gate end:
37730Sstevel@tonic-gate 	/* This should be true for both V4 and V6 */
37740Sstevel@tonic-gate 
37752535Ssangeeta 	if ((ire->ire_type & IRE_FORWARDTABLE) &&
37762535Ssangeeta 	    (ire->ire_ipversion == IPV4_VERSION) &&
37772535Ssangeeta 	    ((irb = ire->ire_bucket) != NULL)) {
37782535Ssangeeta 		rw_enter(&irb->irb_lock, RW_WRITER);
37792535Ssangeeta 		irb->irb_nire--;
37802535Ssangeeta 		/*
37812535Ssangeeta 		 * Instead of examining the conditions for freeing
37822535Ssangeeta 		 * the radix node here, we do it by calling
37832535Ssangeeta 		 * IRB_REFRELE which is a single point in the code
37842535Ssangeeta 		 * that embeds that logic. Bump up the refcnt to
37852535Ssangeeta 		 * be able to call IRB_REFRELE
37862535Ssangeeta 		 */
37872535Ssangeeta 		IRB_REFHOLD_LOCKED(irb);
37882535Ssangeeta 		rw_exit(&irb->irb_lock);
37892535Ssangeeta 		IRB_REFRELE(irb);
37902535Ssangeeta 	}
37910Sstevel@tonic-gate 	ire->ire_ipif = NULL;
37920Sstevel@tonic-gate 
37935023Scarlsonj #ifdef DEBUG
37945023Scarlsonj 	ire_trace_cleanup(ire);
37950Sstevel@tonic-gate #endif
37960Sstevel@tonic-gate 	mutex_destroy(&ire->ire_lock);
37970Sstevel@tonic-gate 	if (ire->ire_ipversion == IPV6_VERSION) {
37983448Sdh155122 		BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_freed);
37990Sstevel@tonic-gate 	} else {
38003448Sdh155122 		BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_freed);
38010Sstevel@tonic-gate 	}
38022535Ssangeeta 	ASSERT(ire->ire_mp == NULL);
38032535Ssangeeta 	/* Has been allocated out of the cache */
38042535Ssangeeta 	kmem_cache_free(ire_cache, ire);
38050Sstevel@tonic-gate }
38060Sstevel@tonic-gate 
38070Sstevel@tonic-gate /*
38083004Sdd193516  * ire_walk routine to delete all IRE_CACHE/IRE_HOST types redirect
38093004Sdd193516  * entries that have a given gateway address.
38100Sstevel@tonic-gate  */
38110Sstevel@tonic-gate void
38120Sstevel@tonic-gate ire_delete_cache_gw(ire_t *ire, char *cp)
38130Sstevel@tonic-gate {
38140Sstevel@tonic-gate 	ipaddr_t	gw_addr;
38150Sstevel@tonic-gate 
38163004Sdd193516 	if (!(ire->ire_type & IRE_CACHE) &&
38173004Sdd193516 	    !(ire->ire_flags & RTF_DYNAMIC))
38180Sstevel@tonic-gate 		return;
38190Sstevel@tonic-gate 
38200Sstevel@tonic-gate 	bcopy(cp, &gw_addr, sizeof (gw_addr));
38210Sstevel@tonic-gate 	if (ire->ire_gateway_addr == gw_addr) {
38220Sstevel@tonic-gate 		ip1dbg(("ire_delete_cache_gw: deleted 0x%x type %d to 0x%x\n",
38234714Ssowmini 		    (int)ntohl(ire->ire_addr), ire->ire_type,
38244714Ssowmini 		    (int)ntohl(ire->ire_gateway_addr)));
38250Sstevel@tonic-gate 		ire_delete(ire);
38260Sstevel@tonic-gate 	}
38270Sstevel@tonic-gate }
38280Sstevel@tonic-gate 
38290Sstevel@tonic-gate /*
38300Sstevel@tonic-gate  * Remove all IRE_CACHE entries that match the ire specified.
38310Sstevel@tonic-gate  *
38320Sstevel@tonic-gate  * The flag argument indicates if the flush request is due to addition
38330Sstevel@tonic-gate  * of new route (IRE_FLUSH_ADD) or deletion of old route (IRE_FLUSH_DELETE).
38340Sstevel@tonic-gate  *
38350Sstevel@tonic-gate  * This routine takes only the IREs from the forwarding table and flushes
38360Sstevel@tonic-gate  * the corresponding entries from the cache table.
38370Sstevel@tonic-gate  *
38380Sstevel@tonic-gate  * When flushing due to the deletion of an old route, it
38390Sstevel@tonic-gate  * just checks the cache handles (ire_phandle and ire_ihandle) and
38400Sstevel@tonic-gate  * deletes the ones that match.
38410Sstevel@tonic-gate  *
38420Sstevel@tonic-gate  * When flushing due to the creation of a new route, it checks
38430Sstevel@tonic-gate  * if a cache entry's address matches the one in the IRE and
38440Sstevel@tonic-gate  * that the cache entry's parent has a less specific mask than the
38450Sstevel@tonic-gate  * one in IRE. The destination of such a cache entry could be the
38460Sstevel@tonic-gate  * gateway for other cache entries, so we need to flush those as
38470Sstevel@tonic-gate  * well by looking for gateway addresses matching the IRE's address.
38480Sstevel@tonic-gate  */
38490Sstevel@tonic-gate void
38500Sstevel@tonic-gate ire_flush_cache_v4(ire_t *ire, int flag)
38510Sstevel@tonic-gate {
38520Sstevel@tonic-gate 	int i;
38530Sstevel@tonic-gate 	ire_t *cire;
38540Sstevel@tonic-gate 	irb_t *irb;
38553448Sdh155122 	ip_stack_t	*ipst = ire->ire_ipst;
38560Sstevel@tonic-gate 
38570Sstevel@tonic-gate 	if (ire->ire_type & IRE_CACHE)
38584714Ssowmini 		return;
38590Sstevel@tonic-gate 
38600Sstevel@tonic-gate 	/*
38610Sstevel@tonic-gate 	 * If a default is just created, there is no point
38620Sstevel@tonic-gate 	 * in going through the cache, as there will not be any
38630Sstevel@tonic-gate 	 * cached ires.
38640Sstevel@tonic-gate 	 */
38650Sstevel@tonic-gate 	if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD)
38660Sstevel@tonic-gate 		return;
38670Sstevel@tonic-gate 	if (flag == IRE_FLUSH_ADD) {
38680Sstevel@tonic-gate 		/*
38690Sstevel@tonic-gate 		 * This selective flush is due to the addition of
38700Sstevel@tonic-gate 		 * new IRE.
38710Sstevel@tonic-gate 		 */
38723448Sdh155122 		for (i = 0; i < ipst->ips_ip_cache_table_size; i++) {
38733448Sdh155122 			irb = &ipst->ips_ip_cache_table[i];
38740Sstevel@tonic-gate 			if ((cire = irb->irb_ire) == NULL)
38750Sstevel@tonic-gate 				continue;
38760Sstevel@tonic-gate 			IRB_REFHOLD(irb);
38770Sstevel@tonic-gate 			for (cire = irb->irb_ire; cire != NULL;
38780Sstevel@tonic-gate 			    cire = cire->ire_next) {
38790Sstevel@tonic-gate 				if (cire->ire_type != IRE_CACHE)
38800Sstevel@tonic-gate 					continue;
38810Sstevel@tonic-gate 				/*
38820Sstevel@tonic-gate 				 * If 'cire' belongs to the same subnet
38830Sstevel@tonic-gate 				 * as the new ire being added, and 'cire'
38840Sstevel@tonic-gate 				 * is derived from a prefix that is less
38850Sstevel@tonic-gate 				 * specific than the new ire being added,
38860Sstevel@tonic-gate 				 * we need to flush 'cire'; for instance,
38870Sstevel@tonic-gate 				 * when a new interface comes up.
38880Sstevel@tonic-gate 				 */
38890Sstevel@tonic-gate 				if (((cire->ire_addr & ire->ire_mask) ==
38900Sstevel@tonic-gate 				    (ire->ire_addr & ire->ire_mask)) &&
38910Sstevel@tonic-gate 				    (ip_mask_to_plen(cire->ire_cmask) <=
38920Sstevel@tonic-gate 				    ire->ire_masklen)) {
38930Sstevel@tonic-gate 					ire_delete(cire);
38940Sstevel@tonic-gate 					continue;
38950Sstevel@tonic-gate 				}
38960Sstevel@tonic-gate 				/*
38970Sstevel@tonic-gate 				 * This is the case when the ire_gateway_addr
38980Sstevel@tonic-gate 				 * of 'cire' belongs to the same subnet as
38990Sstevel@tonic-gate 				 * the new ire being added.
39000Sstevel@tonic-gate 				 * Flushing such ires is sometimes required to
39010Sstevel@tonic-gate 				 * avoid misrouting: say we have a machine with
39020Sstevel@tonic-gate 				 * two interfaces (I1 and I2), a default router
39030Sstevel@tonic-gate 				 * R on the I1 subnet, and a host route to an
39040Sstevel@tonic-gate 				 * off-link destination D with a gateway G on
39050Sstevel@tonic-gate 				 * the I2 subnet.
39060Sstevel@tonic-gate 				 * Under normal operation, we will have an
39070Sstevel@tonic-gate 				 * on-link cache entry for G and an off-link
39080Sstevel@tonic-gate 				 * cache entry for D with G as ire_gateway_addr,
39090Sstevel@tonic-gate 				 * traffic to D will reach its destination
39100Sstevel@tonic-gate 				 * through gateway G.
39110Sstevel@tonic-gate 				 * If the administrator does 'ifconfig I2 down',
39120Sstevel@tonic-gate 				 * the cache entries for D and G will be
39130Sstevel@tonic-gate 				 * flushed. However, G will now be resolved as
39140Sstevel@tonic-gate 				 * an off-link destination using R (the default
39150Sstevel@tonic-gate 				 * router) as gateway. Then D will also be
39160Sstevel@tonic-gate 				 * resolved as an off-link destination using G
39170Sstevel@tonic-gate 				 * as gateway - this behavior is due to
39180Sstevel@tonic-gate 				 * compatibility reasons, see comment in
39190Sstevel@tonic-gate 				 * ire_ihandle_lookup_offlink(). Traffic to D
39200Sstevel@tonic-gate 				 * will go to the router R and probably won't
39210Sstevel@tonic-gate 				 * reach the destination.
39220Sstevel@tonic-gate 				 * The administrator then does 'ifconfig I2 up'.
39230Sstevel@tonic-gate 				 * Since G is on the I2 subnet, this routine
39240Sstevel@tonic-gate 				 * will flush its cache entry. It must also
39250Sstevel@tonic-gate 				 * flush the cache entry for D, otherwise
39260Sstevel@tonic-gate 				 * traffic will stay misrouted until the IRE
39270Sstevel@tonic-gate 				 * times out.
39280Sstevel@tonic-gate 				 */
39290Sstevel@tonic-gate 				if ((cire->ire_gateway_addr & ire->ire_mask) ==
39300Sstevel@tonic-gate 				    (ire->ire_addr & ire->ire_mask)) {
39310Sstevel@tonic-gate 					ire_delete(cire);
39320Sstevel@tonic-gate 					continue;
39330Sstevel@tonic-gate 				}
39340Sstevel@tonic-gate 			}
39350Sstevel@tonic-gate 			IRB_REFRELE(irb);
39360Sstevel@tonic-gate 		}
39370Sstevel@tonic-gate 	} else {
39380Sstevel@tonic-gate 		/*
39390Sstevel@tonic-gate 		 * delete the cache entries based on
39400Sstevel@tonic-gate 		 * handle in the IRE as this IRE is
39410Sstevel@tonic-gate 		 * being deleted/changed.
39420Sstevel@tonic-gate 		 */
39433448Sdh155122 		for (i = 0; i < ipst->ips_ip_cache_table_size; i++) {
39443448Sdh155122 			irb = &ipst->ips_ip_cache_table[i];
39450Sstevel@tonic-gate 			if ((cire = irb->irb_ire) == NULL)
39460Sstevel@tonic-gate 				continue;
39470Sstevel@tonic-gate 			IRB_REFHOLD(irb);
39480Sstevel@tonic-gate 			for (cire = irb->irb_ire; cire != NULL;
39490Sstevel@tonic-gate 			    cire = cire->ire_next) {
39500Sstevel@tonic-gate 				if (cire->ire_type != IRE_CACHE)
39510Sstevel@tonic-gate 					continue;
39520Sstevel@tonic-gate 				if ((cire->ire_phandle == 0 ||
39530Sstevel@tonic-gate 				    cire->ire_phandle != ire->ire_phandle) &&
39540Sstevel@tonic-gate 				    (cire->ire_ihandle == 0 ||
39550Sstevel@tonic-gate 				    cire->ire_ihandle != ire->ire_ihandle))
39560Sstevel@tonic-gate 					continue;
39570Sstevel@tonic-gate 				ire_delete(cire);
39580Sstevel@tonic-gate 			}
39590Sstevel@tonic-gate 			IRB_REFRELE(irb);
39600Sstevel@tonic-gate 		}
39610Sstevel@tonic-gate 	}
39620Sstevel@tonic-gate }
39630Sstevel@tonic-gate 
39640Sstevel@tonic-gate /*
39650Sstevel@tonic-gate  * Matches the arguments passed with the values in the ire.
39660Sstevel@tonic-gate  *
39670Sstevel@tonic-gate  * Note: for match types that match using "ipif" passed in, ipif
39680Sstevel@tonic-gate  * must be checked for non-NULL before calling this routine.
39690Sstevel@tonic-gate  */
39702535Ssangeeta boolean_t
39710Sstevel@tonic-gate ire_match_args(ire_t *ire, ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
39721676Sjpk     int type, const ipif_t *ipif, zoneid_t zoneid, uint32_t ihandle,
39731676Sjpk     const ts_label_t *tsl, int match_flags)
39740Sstevel@tonic-gate {
39750Sstevel@tonic-gate 	ill_t *ire_ill = NULL, *dst_ill;
39760Sstevel@tonic-gate 	ill_t *ipif_ill = NULL;
39770Sstevel@tonic-gate 	ill_group_t *ire_ill_group = NULL;
39780Sstevel@tonic-gate 	ill_group_t *ipif_ill_group = NULL;
39790Sstevel@tonic-gate 
39800Sstevel@tonic-gate 	ASSERT(ire->ire_ipversion == IPV4_VERSION);
39810Sstevel@tonic-gate 	ASSERT((ire->ire_addr & ~ire->ire_mask) == 0);
39820Sstevel@tonic-gate 	ASSERT((!(match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP))) ||
39830Sstevel@tonic-gate 	    (ipif != NULL && !ipif->ipif_isv6));
39840Sstevel@tonic-gate 
39850Sstevel@tonic-gate 	/*
39860Sstevel@tonic-gate 	 * HIDDEN cache entries have to be looked up specifically with
39870Sstevel@tonic-gate 	 * MATCH_IRE_MARK_HIDDEN. MATCH_IRE_MARK_HIDDEN is usually set
39880Sstevel@tonic-gate 	 * when the interface is FAILED or INACTIVE. In that case,
39890Sstevel@tonic-gate 	 * any IRE_CACHES that exists should be marked with
39900Sstevel@tonic-gate 	 * IRE_MARK_HIDDEN. So, we don't really need to match below
39910Sstevel@tonic-gate 	 * for IRE_MARK_HIDDEN. But we do so for consistency.
39920Sstevel@tonic-gate 	 */
39930Sstevel@tonic-gate 	if (!(match_flags & MATCH_IRE_MARK_HIDDEN) &&
39940Sstevel@tonic-gate 	    (ire->ire_marks & IRE_MARK_HIDDEN))
39950Sstevel@tonic-gate 		return (B_FALSE);
39960Sstevel@tonic-gate 
39971095Spriyanka 	/*
39981095Spriyanka 	 * MATCH_IRE_MARK_PRIVATE_ADDR is set when IP_NEXTHOP option
39991095Spriyanka 	 * is used. In that case the routing table is bypassed and the
40001095Spriyanka 	 * packets are sent directly to the specified nexthop. The
40011095Spriyanka 	 * IRE_CACHE entry representing this route should be marked
40021095Spriyanka 	 * with IRE_MARK_PRIVATE_ADDR.
40031095Spriyanka 	 */
40041095Spriyanka 
40051095Spriyanka 	if (!(match_flags & MATCH_IRE_MARK_PRIVATE_ADDR) &&
40061095Spriyanka 	    (ire->ire_marks & IRE_MARK_PRIVATE_ADDR))
40071095Spriyanka 		return (B_FALSE);
40081095Spriyanka 
40091676Sjpk 	if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid &&
40101676Sjpk 	    ire->ire_zoneid != ALL_ZONES) {
40110Sstevel@tonic-gate 		/*
40120Sstevel@tonic-gate 		 * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid is
40130Sstevel@tonic-gate 		 * valid and does not match that of ire_zoneid, a failure to
40140Sstevel@tonic-gate 		 * match is reported at this point. Otherwise, since some IREs
40150Sstevel@tonic-gate 		 * that are available in the global zone can be used in local
40160Sstevel@tonic-gate 		 * zones, additional checks need to be performed:
40170Sstevel@tonic-gate 		 *
40180Sstevel@tonic-gate 		 *	IRE_BROADCAST, IRE_CACHE and IRE_LOOPBACK
40190Sstevel@tonic-gate 		 *	entries should never be matched in this situation.
40200Sstevel@tonic-gate 		 *
40210Sstevel@tonic-gate 		 *	IRE entries that have an interface associated with them
40220Sstevel@tonic-gate 		 *	should in general not match unless they are an IRE_LOCAL
40230Sstevel@tonic-gate 		 *	or in the case when MATCH_IRE_DEFAULT has been set in
40240Sstevel@tonic-gate 		 *	the caller.  In the case of the former, checking of the
40250Sstevel@tonic-gate 		 *	other fields supplied should take place.
40260Sstevel@tonic-gate 		 *
40270Sstevel@tonic-gate 		 *	In the case where MATCH_IRE_DEFAULT has been set,
40280Sstevel@tonic-gate 		 *	all of the ipif's associated with the IRE's ill are
40290Sstevel@tonic-gate 		 *	checked to see if there is a matching zoneid.  If any
40300Sstevel@tonic-gate 		 *	one ipif has a matching zoneid, this IRE is a
40310Sstevel@tonic-gate 		 *	potential candidate so checking of the other fields
40320Sstevel@tonic-gate 		 *	takes place.
40330Sstevel@tonic-gate 		 *
40340Sstevel@tonic-gate 		 *	In the case where the IRE_INTERFACE has a usable source
40350Sstevel@tonic-gate 		 *	address (indicated by ill_usesrc_ifindex) in the
40360Sstevel@tonic-gate 		 *	correct zone then it's permitted to return this IRE
40370Sstevel@tonic-gate 		 */
40380Sstevel@tonic-gate 		if (match_flags & MATCH_IRE_ZONEONLY)
40390Sstevel@tonic-gate 			return (B_FALSE);
40400Sstevel@tonic-gate 		if (ire->ire_type & (IRE_BROADCAST | IRE_CACHE | IRE_LOOPBACK))
40410Sstevel@tonic-gate 			return (B_FALSE);
40420Sstevel@tonic-gate 		/*
40430Sstevel@tonic-gate 		 * Note, IRE_INTERFACE can have the stq as NULL. For
40440Sstevel@tonic-gate 		 * example, if the default multicast route is tied to
40450Sstevel@tonic-gate 		 * the loopback address.
40460Sstevel@tonic-gate 		 */
40470Sstevel@tonic-gate 		if ((ire->ire_type & IRE_INTERFACE) &&
40480Sstevel@tonic-gate 		    (ire->ire_stq != NULL)) {
40490Sstevel@tonic-gate 			dst_ill = (ill_t *)ire->ire_stq->q_ptr;
40500Sstevel@tonic-gate 			/*
40510Sstevel@tonic-gate 			 * If there is a usable source address in the
40520Sstevel@tonic-gate 			 * zone, then it's ok to return an
40530Sstevel@tonic-gate 			 * IRE_INTERFACE
40540Sstevel@tonic-gate 			 */
40550Sstevel@tonic-gate 			if (ipif_usesrc_avail(dst_ill, zoneid)) {
40560Sstevel@tonic-gate 				ip3dbg(("ire_match_args: dst_ill %p match %d\n",
40570Sstevel@tonic-gate 				    (void *)dst_ill,
40580Sstevel@tonic-gate 				    (ire->ire_addr == (addr & mask))));
40590Sstevel@tonic-gate 			} else {
40600Sstevel@tonic-gate 				ip3dbg(("ire_match_args: src_ipif NULL"
40610Sstevel@tonic-gate 				    " dst_ill %p\n", (void *)dst_ill));
40620Sstevel@tonic-gate 				return (B_FALSE);
40630Sstevel@tonic-gate 			}
40640Sstevel@tonic-gate 		}
40650Sstevel@tonic-gate 		if (ire->ire_ipif != NULL && ire->ire_type != IRE_LOCAL &&
40660Sstevel@tonic-gate 		    !(ire->ire_type & IRE_INTERFACE)) {
40670Sstevel@tonic-gate 			ipif_t	*tipif;
40680Sstevel@tonic-gate 
40690Sstevel@tonic-gate 			if ((match_flags & MATCH_IRE_DEFAULT) == 0) {
40700Sstevel@tonic-gate 				return (B_FALSE);
40710Sstevel@tonic-gate 			}
40720Sstevel@tonic-gate 			mutex_enter(&ire->ire_ipif->ipif_ill->ill_lock);
40730Sstevel@tonic-gate 			for (tipif = ire->ire_ipif->ipif_ill->ill_ipif;
40740Sstevel@tonic-gate 			    tipif != NULL; tipif = tipif->ipif_next) {
40750Sstevel@tonic-gate 				if (IPIF_CAN_LOOKUP(tipif) &&
40760Sstevel@tonic-gate 				    (tipif->ipif_flags & IPIF_UP) &&
40771676Sjpk 				    (tipif->ipif_zoneid == zoneid ||
40781676Sjpk 				    tipif->ipif_zoneid == ALL_ZONES))
40790Sstevel@tonic-gate 					break;
40800Sstevel@tonic-gate 			}
40810Sstevel@tonic-gate 			mutex_exit(&ire->ire_ipif->ipif_ill->ill_lock);
40820Sstevel@tonic-gate 			if (tipif == NULL) {
40830Sstevel@tonic-gate 				return (B_FALSE);
40840Sstevel@tonic-gate 			}
40850Sstevel@tonic-gate 		}
40860Sstevel@tonic-gate 	}
40870Sstevel@tonic-gate 
40880Sstevel@tonic-gate 	/*
40890Sstevel@tonic-gate 	 * For IRE_CACHES, MATCH_IRE_ILL/ILL_GROUP really means that
40900Sstevel@tonic-gate 	 * somebody wants to send out on a particular interface which
40910Sstevel@tonic-gate 	 * is given by ire_stq and hence use ire_stq to derive the ill
40920Sstevel@tonic-gate 	 * value. ire_ipif for IRE_CACHES is just the means of getting
40930Sstevel@tonic-gate 	 * a source address i.e ire_src_addr = ire->ire_ipif->ipif_src_addr.
40940Sstevel@tonic-gate 	 * ire_to_ill does the right thing for this.
40950Sstevel@tonic-gate 	 */
40960Sstevel@tonic-gate 	if (match_flags & (MATCH_IRE_ILL|MATCH_IRE_ILL_GROUP)) {
40970Sstevel@tonic-gate 		ire_ill = ire_to_ill(ire);
40980Sstevel@tonic-gate 		if (ire_ill != NULL)
40990Sstevel@tonic-gate 			ire_ill_group = ire_ill->ill_group;
41000Sstevel@tonic-gate 		ipif_ill = ipif->ipif_ill;
41010Sstevel@tonic-gate 		ipif_ill_group = ipif_ill->ill_group;
41020Sstevel@tonic-gate 	}
41030Sstevel@tonic-gate 
41040Sstevel@tonic-gate 	if ((ire->ire_addr == (addr & mask)) &&
41050Sstevel@tonic-gate 	    ((!(match_flags & MATCH_IRE_GW)) ||
41064714Ssowmini 	    (ire->ire_gateway_addr == gateway)) &&
41070Sstevel@tonic-gate 	    ((!(match_flags & MATCH_IRE_TYPE)) ||
41084714Ssowmini 	    (ire->ire_type & type)) &&
41090Sstevel@tonic-gate 	    ((!(match_flags & MATCH_IRE_SRC)) ||
41104714Ssowmini 	    (ire->ire_src_addr == ipif->ipif_src_addr)) &&
41110Sstevel@tonic-gate 	    ((!(match_flags & MATCH_IRE_IPIF)) ||
41124714Ssowmini 	    (ire->ire_ipif == ipif)) &&
41130Sstevel@tonic-gate 	    ((!(match_flags & MATCH_IRE_MARK_HIDDEN)) ||
41144714Ssowmini 	    (ire->ire_type != IRE_CACHE ||
41154714Ssowmini 	    ire->ire_marks & IRE_MARK_HIDDEN)) &&
41161095Spriyanka 	    ((!(match_flags & MATCH_IRE_MARK_PRIVATE_ADDR)) ||
41174714Ssowmini 	    (ire->ire_type != IRE_CACHE ||
41184714Ssowmini 	    ire->ire_marks & IRE_MARK_PRIVATE_ADDR)) &&
41190Sstevel@tonic-gate 	    ((!(match_flags & MATCH_IRE_ILL)) ||
41204714Ssowmini 	    (ire_ill == ipif_ill)) &&
41210Sstevel@tonic-gate 	    ((!(match_flags & MATCH_IRE_IHANDLE)) ||
41224714Ssowmini 	    (ire->ire_ihandle == ihandle)) &&
41232535Ssangeeta 	    ((!(match_flags & MATCH_IRE_MASK)) ||
41244714Ssowmini 	    (ire->ire_mask == mask)) &&
41250Sstevel@tonic-gate 	    ((!(match_flags & MATCH_IRE_ILL_GROUP)) ||
41264714Ssowmini 	    (ire_ill == ipif_ill) ||
41274714Ssowmini 	    (ire_ill_group != NULL &&
41284714Ssowmini 	    ire_ill_group == ipif_ill_group)) &&
41291676Sjpk 	    ((!(match_flags & MATCH_IRE_SECATTR)) ||
41304714Ssowmini 	    (!is_system_labeled()) ||
41314714Ssowmini 	    (tsol_ire_match_gwattr(ire, tsl) == 0))) {
41320Sstevel@tonic-gate 		/* We found the matched IRE */
41330Sstevel@tonic-gate 		return (B_TRUE);
41340Sstevel@tonic-gate 	}
41350Sstevel@tonic-gate 	return (B_FALSE);
41360Sstevel@tonic-gate }
41370Sstevel@tonic-gate 
41380Sstevel@tonic-gate 
41390Sstevel@tonic-gate /*
41400Sstevel@tonic-gate  * Lookup for a route in all the tables
41410Sstevel@tonic-gate  */
41420Sstevel@tonic-gate ire_t *
41430Sstevel@tonic-gate ire_route_lookup(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
41441676Sjpk     int type, const ipif_t *ipif, ire_t **pire, zoneid_t zoneid,
41453448Sdh155122     const ts_label_t *tsl, int flags, ip_stack_t *ipst)
41460Sstevel@tonic-gate {
41470Sstevel@tonic-gate 	ire_t *ire = NULL;
41480Sstevel@tonic-gate 
41490Sstevel@tonic-gate 	/*
41500Sstevel@tonic-gate 	 * ire_match_args() will dereference ipif MATCH_IRE_SRC or
41510Sstevel@tonic-gate 	 * MATCH_IRE_ILL is set.
41520Sstevel@tonic-gate 	 */
41530Sstevel@tonic-gate 	if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) &&
41540Sstevel@tonic-gate 	    (ipif == NULL))
41550Sstevel@tonic-gate 		return (NULL);
41560Sstevel@tonic-gate 
41570Sstevel@tonic-gate 	/*
41580Sstevel@tonic-gate 	 * might be asking for a cache lookup,
41590Sstevel@tonic-gate 	 * This is not best way to lookup cache,
41600Sstevel@tonic-gate 	 * user should call ire_cache_lookup directly.
41610Sstevel@tonic-gate 	 *
41620Sstevel@tonic-gate 	 * If MATCH_IRE_TYPE was set, first lookup in the cache table and then
41630Sstevel@tonic-gate 	 * in the forwarding table, if the applicable type flags were set.
41640Sstevel@tonic-gate 	 */
41650Sstevel@tonic-gate 	if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_CACHETABLE) != 0) {
41660Sstevel@tonic-gate 		ire = ire_ctable_lookup(addr, gateway, type, ipif, zoneid,
41673448Sdh155122 		    tsl, flags, ipst);
41680Sstevel@tonic-gate 		if (ire != NULL)
41690Sstevel@tonic-gate 			return (ire);
41700Sstevel@tonic-gate 	}
41710Sstevel@tonic-gate 	if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_FORWARDTABLE) != 0) {
41720Sstevel@tonic-gate 		ire = ire_ftable_lookup(addr, mask, gateway, type, ipif, pire,
41733448Sdh155122 		    zoneid, 0, tsl, flags, ipst);
41740Sstevel@tonic-gate 	}
41750Sstevel@tonic-gate 	return (ire);
41760Sstevel@tonic-gate }
41770Sstevel@tonic-gate 
41780Sstevel@tonic-gate 
41790Sstevel@tonic-gate /*
41801676Sjpk  * Delete the IRE cache for the gateway and all IRE caches whose
41811676Sjpk  * ire_gateway_addr points to this gateway, and allow them to
41821676Sjpk  * be created on demand by ip_newroute.
41831676Sjpk  */
41841676Sjpk void
41853448Sdh155122 ire_clookup_delete_cache_gw(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst)
41861676Sjpk {
41871676Sjpk 	irb_t *irb;
41881676Sjpk 	ire_t *ire;
41891676Sjpk 
41903448Sdh155122 	irb = &ipst->ips_ip_cache_table[IRE_ADDR_HASH(addr,
41913448Sdh155122 	    ipst->ips_ip_cache_table_size)];
41921676Sjpk 	IRB_REFHOLD(irb);
41931676Sjpk 	for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
41941676Sjpk 		if (ire->ire_marks & IRE_MARK_CONDEMNED)
41951676Sjpk 			continue;
41961676Sjpk 
41971676Sjpk 		ASSERT(ire->ire_mask == IP_HOST_MASK);
41981676Sjpk 		if (ire_match_args(ire, addr, ire->ire_mask, 0, IRE_CACHE,
41991676Sjpk 		    NULL, zoneid, 0, NULL, MATCH_IRE_TYPE)) {
42001676Sjpk 			ire_delete(ire);
42011676Sjpk 		}
42021676Sjpk 	}
42031676Sjpk 	IRB_REFRELE(irb);
42041676Sjpk 
42053448Sdh155122 	ire_walk_v4(ire_delete_cache_gw, &addr, zoneid, ipst);
42061676Sjpk }
42071676Sjpk 
42081676Sjpk /*
42090Sstevel@tonic-gate  * Looks up cache table for a route.
42100Sstevel@tonic-gate  * specific lookup can be indicated by
42110Sstevel@tonic-gate  * passing the MATCH_* flags and the
42120Sstevel@tonic-gate  * necessary parameters.
42130Sstevel@tonic-gate  */
42140Sstevel@tonic-gate ire_t *
42151676Sjpk ire_ctable_lookup(ipaddr_t addr, ipaddr_t gateway, int type, const ipif_t *ipif,
42163448Sdh155122     zoneid_t zoneid, const ts_label_t *tsl, int flags, ip_stack_t *ipst)
42170Sstevel@tonic-gate {
42180Sstevel@tonic-gate 	irb_t *irb_ptr;
42190Sstevel@tonic-gate 	ire_t *ire;
42200Sstevel@tonic-gate 
42210Sstevel@tonic-gate 	/*
42220Sstevel@tonic-gate 	 * ire_match_args() will dereference ipif MATCH_IRE_SRC or
42230Sstevel@tonic-gate 	 * MATCH_IRE_ILL is set.
42240Sstevel@tonic-gate 	 */
42250Sstevel@tonic-gate 	if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL | MATCH_IRE_ILL_GROUP)) &&
42260Sstevel@tonic-gate 	    (ipif == NULL))
42270Sstevel@tonic-gate 		return (NULL);
42280Sstevel@tonic-gate 
42293448Sdh155122 	irb_ptr = &ipst->ips_ip_cache_table[IRE_ADDR_HASH(addr,
42304714Ssowmini 	    ipst->ips_ip_cache_table_size)];
42310Sstevel@tonic-gate 	rw_enter(&irb_ptr->irb_lock, RW_READER);
42320Sstevel@tonic-gate 	for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
42330Sstevel@tonic-gate 		if (ire->ire_marks & IRE_MARK_CONDEMNED)
42340Sstevel@tonic-gate 			continue;
42350Sstevel@tonic-gate 		ASSERT(ire->ire_mask == IP_HOST_MASK);
42360Sstevel@tonic-gate 		if (ire_match_args(ire, addr, ire->ire_mask, gateway, type,
42371676Sjpk 		    ipif, zoneid, 0, tsl, flags)) {
42380Sstevel@tonic-gate 			IRE_REFHOLD(ire);
42390Sstevel@tonic-gate 			rw_exit(&irb_ptr->irb_lock);
42400Sstevel@tonic-gate 			return (ire);
42410Sstevel@tonic-gate 		}
42420Sstevel@tonic-gate 	}
42430Sstevel@tonic-gate 	rw_exit(&irb_ptr->irb_lock);
42440Sstevel@tonic-gate 	return (NULL);
42450Sstevel@tonic-gate }
42460Sstevel@tonic-gate 
42470Sstevel@tonic-gate /*
42482733Snordmark  * Check whether the IRE_LOCAL and the IRE potentially used to transmit
42492733Snordmark  * (could be an IRE_CACHE, IRE_BROADCAST, or IRE_INTERFACE) are part of
42502733Snordmark  * the same ill group.
42512733Snordmark  */
42522733Snordmark boolean_t
42532733Snordmark ire_local_same_ill_group(ire_t *ire_local, ire_t *xmit_ire)
42542733Snordmark {
42552733Snordmark 	ill_t		*recv_ill, *xmit_ill;
42562733Snordmark 	ill_group_t	*recv_group, *xmit_group;
42572733Snordmark 
42582906Snordmark 	ASSERT(ire_local->ire_type & (IRE_LOCAL|IRE_LOOPBACK));
42592962Snordmark 	ASSERT(xmit_ire->ire_type & (IRE_CACHETABLE|IRE_INTERFACE));
42602906Snordmark 
42612906Snordmark 	recv_ill = ire_to_ill(ire_local);
42622906Snordmark 	xmit_ill = ire_to_ill(xmit_ire);
42632906Snordmark 
42642906Snordmark 	ASSERT(recv_ill != NULL);
42652906Snordmark 	ASSERT(xmit_ill != NULL);
42662733Snordmark 
42672733Snordmark 	if (recv_ill == xmit_ill)
42682733Snordmark 		return (B_TRUE);
42692733Snordmark 
42702733Snordmark 	recv_group = recv_ill->ill_group;
42712733Snordmark 	xmit_group = xmit_ill->ill_group;
42722733Snordmark 
42732733Snordmark 	if (recv_group != NULL && recv_group == xmit_group)
42742733Snordmark 		return (B_TRUE);
42752733Snordmark 
42762733Snordmark 	return (B_FALSE);
42772733Snordmark }
42782733Snordmark 
42792733Snordmark /*
42802733Snordmark  * Check if the IRE_LOCAL uses the same ill (group) as another route would use.
42812962Snordmark  * If there is no alternate route, or the alternate is a REJECT or BLACKHOLE,
42822962Snordmark  * then we don't allow this IRE_LOCAL to be used.
42832733Snordmark  */
42842733Snordmark boolean_t
42852733Snordmark ire_local_ok_across_zones(ire_t *ire_local, zoneid_t zoneid, void *addr,
42863448Sdh155122     const ts_label_t *tsl, ip_stack_t *ipst)
42872733Snordmark {
42882733Snordmark 	ire_t		*alt_ire;
42892733Snordmark 	boolean_t	rval;
42902733Snordmark 
42912733Snordmark 	if (ire_local->ire_ipversion == IPV4_VERSION) {
42922733Snordmark 		alt_ire = ire_ftable_lookup(*((ipaddr_t *)addr), 0, 0, 0, NULL,
42932733Snordmark 		    NULL, zoneid, 0, tsl,
42942733Snordmark 		    MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
42953448Sdh155122 		    MATCH_IRE_RJ_BHOLE, ipst);
42962733Snordmark 	} else {
42972733Snordmark 		alt_ire = ire_ftable_lookup_v6((in6_addr_t *)addr, NULL, NULL,
42982733Snordmark 		    0, NULL, NULL, zoneid, 0, tsl,
42992733Snordmark 		    MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
43003448Sdh155122 		    MATCH_IRE_RJ_BHOLE, ipst);
43012733Snordmark 	}
43022733Snordmark 
43032733Snordmark 	if (alt_ire == NULL)
43042733Snordmark 		return (B_FALSE);
43052733Snordmark 
43062962Snordmark 	if (alt_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
43072962Snordmark 		ire_refrele(alt_ire);
43082962Snordmark 		return (B_FALSE);
43092962Snordmark 	}
43102733Snordmark 	rval = ire_local_same_ill_group(ire_local, alt_ire);
43112733Snordmark 
43122733Snordmark 	ire_refrele(alt_ire);
43132733Snordmark 	return (rval);
43142733Snordmark }
43152733Snordmark 
43162733Snordmark /*
43170Sstevel@tonic-gate  * Lookup cache. Don't return IRE_MARK_HIDDEN entries. Callers
43180Sstevel@tonic-gate  * should use ire_ctable_lookup with MATCH_IRE_MARK_HIDDEN to get
43190Sstevel@tonic-gate  * to the hidden ones.
43202733Snordmark  *
43212733Snordmark  * In general the zoneid has to match (where ALL_ZONES match all of them).
43222733Snordmark  * But for IRE_LOCAL we also need to handle the case where L2 should
43232733Snordmark  * conceptually loop back the packet. This is necessary since neither
43242733Snordmark  * Ethernet drivers nor Ethernet hardware loops back packets sent to their
43252733Snordmark  * own MAC address. This loopback is needed when the normal
43262733Snordmark  * routes (ignoring IREs with different zoneids) would send out the packet on
43272733Snordmark  * the same ill (or ill group) as the ill with which this IRE_LOCAL is
43282733Snordmark  * associated.
43292733Snordmark  *
43302733Snordmark  * Earlier versions of this code always matched an IRE_LOCAL independently of
43312733Snordmark  * the zoneid. We preserve that earlier behavior when
43322733Snordmark  * ip_restrict_interzone_loopback is turned off.
43330Sstevel@tonic-gate  */
43340Sstevel@tonic-gate ire_t *
43353448Sdh155122 ire_cache_lookup(ipaddr_t addr, zoneid_t zoneid, const ts_label_t *tsl,
43363448Sdh155122     ip_stack_t *ipst)
43370Sstevel@tonic-gate {
43380Sstevel@tonic-gate 	irb_t *irb_ptr;
43390Sstevel@tonic-gate 	ire_t *ire;
43400Sstevel@tonic-gate 
43413448Sdh155122 	irb_ptr = &ipst->ips_ip_cache_table[IRE_ADDR_HASH(addr,
43424714Ssowmini 	    ipst->ips_ip_cache_table_size)];
43430Sstevel@tonic-gate 	rw_enter(&irb_ptr->irb_lock, RW_READER);
43440Sstevel@tonic-gate 	for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
43451095Spriyanka 		if (ire->ire_marks & (IRE_MARK_CONDEMNED |
43461095Spriyanka 		    IRE_MARK_HIDDEN | IRE_MARK_PRIVATE_ADDR)) {
43470Sstevel@tonic-gate 			continue;
43481095Spriyanka 		}
43490Sstevel@tonic-gate 		if (ire->ire_addr == addr) {
43501676Sjpk 			/*
43511676Sjpk 			 * Finally, check if the security policy has any
43521676Sjpk 			 * restriction on using this route for the specified
43531676Sjpk 			 * message.
43541676Sjpk 			 */
43551676Sjpk 			if (tsl != NULL &&
43561676Sjpk 			    ire->ire_gw_secattr != NULL &&
43571676Sjpk 			    tsol_ire_match_gwattr(ire, tsl) != 0) {
43581676Sjpk 				continue;
43591676Sjpk 			}
43601676Sjpk 
43610Sstevel@tonic-gate 			if (zoneid == ALL_ZONES || ire->ire_zoneid == zoneid ||
43622733Snordmark 			    ire->ire_zoneid == ALL_ZONES) {
43632733Snordmark 				IRE_REFHOLD(ire);
43642733Snordmark 				rw_exit(&irb_ptr->irb_lock);
43652733Snordmark 				return (ire);
43662733Snordmark 			}
43672733Snordmark 
43682733Snordmark 			if (ire->ire_type == IRE_LOCAL) {
43693448Sdh155122 				if (ipst->ips_ip_restrict_interzone_loopback &&
43702733Snordmark 				    !ire_local_ok_across_zones(ire, zoneid,
43713448Sdh155122 				    &addr, tsl, ipst))
43722733Snordmark 					continue;
43732733Snordmark 
43740Sstevel@tonic-gate 				IRE_REFHOLD(ire);
43750Sstevel@tonic-gate 				rw_exit(&irb_ptr->irb_lock);
43760Sstevel@tonic-gate 				return (ire);
43770Sstevel@tonic-gate 			}
43780Sstevel@tonic-gate 		}
43790Sstevel@tonic-gate 	}
43800Sstevel@tonic-gate 	rw_exit(&irb_ptr->irb_lock);
43810Sstevel@tonic-gate 	return (NULL);
43820Sstevel@tonic-gate }
43830Sstevel@tonic-gate 
43840Sstevel@tonic-gate /*
43850Sstevel@tonic-gate  * Locate the interface ire that is tied to the cache ire 'cire' via
43860Sstevel@tonic-gate  * cire->ire_ihandle.
43870Sstevel@tonic-gate  *
43880Sstevel@tonic-gate  * We are trying to create the cache ire for an offlink destn based
43890Sstevel@tonic-gate  * on the cache ire of the gateway in 'cire'. 'pire' is the prefix ire
43900Sstevel@tonic-gate  * as found by ip_newroute(). We are called from ip_newroute() in
43910Sstevel@tonic-gate  * the IRE_CACHE case.
43920Sstevel@tonic-gate  */
43930Sstevel@tonic-gate ire_t *
43940Sstevel@tonic-gate ire_ihandle_lookup_offlink(ire_t *cire, ire_t *pire)
43950Sstevel@tonic-gate {
43960Sstevel@tonic-gate 	ire_t	*ire;
43970Sstevel@tonic-gate 	int	match_flags;
43980Sstevel@tonic-gate 	ipaddr_t gw_addr;
43990Sstevel@tonic-gate 	ipif_t	*gw_ipif;
44003448Sdh155122 	ip_stack_t	*ipst = cire->ire_ipst;
44010Sstevel@tonic-gate 
44020Sstevel@tonic-gate 	ASSERT(cire != NULL && pire != NULL);
44030Sstevel@tonic-gate 
44040Sstevel@tonic-gate 	/*
44050Sstevel@tonic-gate 	 * We don't need to specify the zoneid to ire_ftable_lookup() below
44060Sstevel@tonic-gate 	 * because the ihandle refers to an ipif which can be in only one zone.
44070Sstevel@tonic-gate 	 */
44080Sstevel@tonic-gate 	match_flags =  MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK;
44090Sstevel@tonic-gate 	/*
44100Sstevel@tonic-gate 	 * ip_newroute calls ire_ftable_lookup with MATCH_IRE_ILL only
44110Sstevel@tonic-gate 	 * for on-link hosts. We should never be here for onlink.
44120Sstevel@tonic-gate 	 * Thus, use MATCH_IRE_ILL_GROUP.
44130Sstevel@tonic-gate 	 */
44140Sstevel@tonic-gate 	if (pire->ire_ipif != NULL)
44150Sstevel@tonic-gate 		match_flags |= MATCH_IRE_ILL_GROUP;
44160Sstevel@tonic-gate 	/*
44170Sstevel@tonic-gate 	 * We know that the mask of the interface ire equals cire->ire_cmask.
44180Sstevel@tonic-gate 	 * (When ip_newroute() created 'cire' for the gateway it set its
44190Sstevel@tonic-gate 	 * cmask from the interface ire's mask)
44200Sstevel@tonic-gate 	 */
44210Sstevel@tonic-gate 	ire = ire_ftable_lookup(cire->ire_addr, cire->ire_cmask, 0,
44220Sstevel@tonic-gate 	    IRE_INTERFACE, pire->ire_ipif, NULL, ALL_ZONES, cire->ire_ihandle,
44233448Sdh155122 	    NULL, match_flags, ipst);
44240Sstevel@tonic-gate 	if (ire != NULL)
44250Sstevel@tonic-gate 		return (ire);
44260Sstevel@tonic-gate 	/*
44270Sstevel@tonic-gate 	 * If we didn't find an interface ire above, we can't declare failure.
44280Sstevel@tonic-gate 	 * For backwards compatibility, we need to support prefix routes
44290Sstevel@tonic-gate 	 * pointing to next hop gateways that are not on-link.
44300Sstevel@tonic-gate 	 *
44310Sstevel@tonic-gate 	 * Assume we are trying to ping some offlink destn, and we have the
44320Sstevel@tonic-gate 	 * routing table below.
44330Sstevel@tonic-gate 	 *
44340Sstevel@tonic-gate 	 * Eg.	default	- gw1		<--- pire	(line 1)
44350Sstevel@tonic-gate 	 *	gw1	- gw2				(line 2)
44360Sstevel@tonic-gate 	 *	gw2	- hme0				(line 3)
44370Sstevel@tonic-gate 	 *
44380Sstevel@tonic-gate 	 * If we already have a cache ire for gw1 in 'cire', the
44390Sstevel@tonic-gate 	 * ire_ftable_lookup above would have failed, since there is no
44400Sstevel@tonic-gate 	 * interface ire to reach gw1. We will fallthru below.
44410Sstevel@tonic-gate 	 *
44420Sstevel@tonic-gate 	 * Here we duplicate the steps that ire_ftable_lookup() did in
44430Sstevel@tonic-gate 	 * getting 'cire' from 'pire', in the MATCH_IRE_RECURSIVE case.
44440Sstevel@tonic-gate 	 * The differences are the following
44450Sstevel@tonic-gate 	 * i.   We want the interface ire only, so we call ire_ftable_lookup()
44460Sstevel@tonic-gate 	 *	instead of ire_route_lookup()
44470Sstevel@tonic-gate 	 * ii.  We look for only prefix routes in the 1st call below.
44480Sstevel@tonic-gate 	 * ii.  We want to match on the ihandle in the 2nd call below.
44490Sstevel@tonic-gate 	 */
44500Sstevel@tonic-gate 	match_flags =  MATCH_IRE_TYPE;
44510Sstevel@tonic-gate 	if (pire->ire_ipif != NULL)
44520Sstevel@tonic-gate 		match_flags |= MATCH_IRE_ILL_GROUP;
44530Sstevel@tonic-gate 	ire = ire_ftable_lookup(pire->ire_gateway_addr, 0, 0, IRE_OFFSUBNET,
44543448Sdh155122 	    pire->ire_ipif, NULL, ALL_ZONES, 0, NULL, match_flags, ipst);
44550Sstevel@tonic-gate 	if (ire == NULL)
44560Sstevel@tonic-gate 		return (NULL);
44570Sstevel@tonic-gate 	/*
44580Sstevel@tonic-gate 	 * At this point 'ire' corresponds to the entry shown in line 2.
44590Sstevel@tonic-gate 	 * gw_addr is 'gw2' in the example above.
44600Sstevel@tonic-gate 	 */
44610Sstevel@tonic-gate 	gw_addr = ire->ire_gateway_addr;
44620Sstevel@tonic-gate 	gw_ipif = ire->ire_ipif;
44630Sstevel@tonic-gate 	ire_refrele(ire);
44640Sstevel@tonic-gate 
44650Sstevel@tonic-gate 	match_flags |= MATCH_IRE_IHANDLE;
44660Sstevel@tonic-gate 	ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE,
44673448Sdh155122 	    gw_ipif, NULL, ALL_ZONES, cire->ire_ihandle, NULL, match_flags,
44683448Sdh155122 	    ipst);
44690Sstevel@tonic-gate 	return (ire);
44700Sstevel@tonic-gate }
44710Sstevel@tonic-gate 
44720Sstevel@tonic-gate /*
44730Sstevel@tonic-gate  * Return the IRE_LOOPBACK, IRE_IF_RESOLVER or IRE_IF_NORESOLVER
44740Sstevel@tonic-gate  * ire associated with the specified ipif.
44750Sstevel@tonic-gate  *
44760Sstevel@tonic-gate  * This might occasionally be called when IPIF_UP is not set since
44770Sstevel@tonic-gate  * the IP_MULTICAST_IF as well as creating interface routes
44780Sstevel@tonic-gate  * allows specifying a down ipif (ipif_lookup* match ipifs that are down).
44790Sstevel@tonic-gate  *
44800Sstevel@tonic-gate  * Note that if IPIF_NOLOCAL, IPIF_NOXMIT, or IPIF_DEPRECATED is set on
44810Sstevel@tonic-gate  * the ipif, this routine might return NULL.
44820Sstevel@tonic-gate  */
44830Sstevel@tonic-gate ire_t *
44841676Sjpk ipif_to_ire(const ipif_t *ipif)
44850Sstevel@tonic-gate {
44860Sstevel@tonic-gate 	ire_t	*ire;
44873448Sdh155122 	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;
44880Sstevel@tonic-gate 
44890Sstevel@tonic-gate 	ASSERT(!ipif->ipif_isv6);
44900Sstevel@tonic-gate 	if (ipif->ipif_ire_type == IRE_LOOPBACK) {
44910Sstevel@tonic-gate 		ire = ire_ctable_lookup(ipif->ipif_lcl_addr, 0, IRE_LOOPBACK,
44923448Sdh155122 		    ipif, ALL_ZONES, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF),
44933448Sdh155122 		    ipst);
44940Sstevel@tonic-gate 	} else if (ipif->ipif_flags & IPIF_POINTOPOINT) {
44950Sstevel@tonic-gate 		/* In this case we need to lookup destination address. */
44960Sstevel@tonic-gate 		ire = ire_ftable_lookup(ipif->ipif_pp_dst_addr, IP_HOST_MASK, 0,
44971676Sjpk 		    IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, NULL,
44983448Sdh155122 		    (MATCH_IRE_TYPE | MATCH_IRE_IPIF | MATCH_IRE_MASK), ipst);
44990Sstevel@tonic-gate 	} else {
45000Sstevel@tonic-gate 		ire = ire_ftable_lookup(ipif->ipif_subnet,
45010Sstevel@tonic-gate 		    ipif->ipif_net_mask, 0, IRE_INTERFACE, ipif, NULL,
45021676Sjpk 		    ALL_ZONES, 0, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF |
45033448Sdh155122 		    MATCH_IRE_MASK), ipst);
45040Sstevel@tonic-gate 	}
45050Sstevel@tonic-gate 	return (ire);
45060Sstevel@tonic-gate }
45070Sstevel@tonic-gate 
45080Sstevel@tonic-gate /*
45090Sstevel@tonic-gate  * ire_walk function.
45100Sstevel@tonic-gate  * Count the number of IRE_CACHE entries in different categories.
45110Sstevel@tonic-gate  */
45120Sstevel@tonic-gate void
45130Sstevel@tonic-gate ire_cache_count(ire_t *ire, char *arg)
45140Sstevel@tonic-gate {
45150Sstevel@tonic-gate 	ire_cache_count_t *icc = (ire_cache_count_t *)arg;
45160Sstevel@tonic-gate 
45170Sstevel@tonic-gate 	if (ire->ire_type != IRE_CACHE)
45180Sstevel@tonic-gate 		return;
45190Sstevel@tonic-gate 
45200Sstevel@tonic-gate 	icc->icc_total++;
45210Sstevel@tonic-gate 
45220Sstevel@tonic-gate 	if (ire->ire_ipversion == IPV6_VERSION) {
45230Sstevel@tonic-gate 		mutex_enter(&ire->ire_lock);
45240Sstevel@tonic-gate 		if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6)) {
45250Sstevel@tonic-gate 			mutex_exit(&ire->ire_lock);
45260Sstevel@tonic-gate 			icc->icc_onlink++;
45270Sstevel@tonic-gate 			return;
45280Sstevel@tonic-gate 		}
45290Sstevel@tonic-gate 		mutex_exit(&ire->ire_lock);
45300Sstevel@tonic-gate 	} else {
45310Sstevel@tonic-gate 		if (ire->ire_gateway_addr == 0) {
45320Sstevel@tonic-gate 			icc->icc_onlink++;
45330Sstevel@tonic-gate 			return;
45340Sstevel@tonic-gate 		}
45350Sstevel@tonic-gate 	}
45360Sstevel@tonic-gate 
45370Sstevel@tonic-gate 	ASSERT(ire->ire_ipif != NULL);
45380Sstevel@tonic-gate 	if (ire->ire_max_frag < ire->ire_ipif->ipif_mtu)
45390Sstevel@tonic-gate 		icc->icc_pmtu++;
45400Sstevel@tonic-gate 	else if (ire->ire_tire_mark != ire->ire_ob_pkt_count +
45410Sstevel@tonic-gate 	    ire->ire_ib_pkt_count)
45420Sstevel@tonic-gate 		icc->icc_offlink++;
45430Sstevel@tonic-gate 	else
45440Sstevel@tonic-gate 		icc->icc_unused++;
45450Sstevel@tonic-gate }
45460Sstevel@tonic-gate 
45470Sstevel@tonic-gate /*
45480Sstevel@tonic-gate  * ire_walk function called by ip_trash_ire_reclaim().
45490Sstevel@tonic-gate  * Free a fraction of the IRE_CACHE cache entries. The fractions are
45500Sstevel@tonic-gate  * different for different categories of IRE_CACHE entries.
45510Sstevel@tonic-gate  * A fraction of zero means to not free any in that category.
45520Sstevel@tonic-gate  * Use the hash bucket id plus lbolt as a random number. Thus if the fraction
45530Sstevel@tonic-gate  * is N then every Nth hash bucket chain will be freed.
45540Sstevel@tonic-gate  */
45550Sstevel@tonic-gate void
45560Sstevel@tonic-gate ire_cache_reclaim(ire_t *ire, char *arg)
45570Sstevel@tonic-gate {
45580Sstevel@tonic-gate 	ire_cache_reclaim_t *icr = (ire_cache_reclaim_t *)arg;
45590Sstevel@tonic-gate 	uint_t rand;
45603448Sdh155122 	ip_stack_t	*ipst = icr->icr_ipst;
45610Sstevel@tonic-gate 
45620Sstevel@tonic-gate 	if (ire->ire_type != IRE_CACHE)
45630Sstevel@tonic-gate 		return;
45640Sstevel@tonic-gate 
45650Sstevel@tonic-gate 	if (ire->ire_ipversion == IPV6_VERSION) {
45660Sstevel@tonic-gate 		rand = (uint_t)lbolt +
45673448Sdh155122 		    IRE_ADDR_HASH_V6(ire->ire_addr_v6,
45684714Ssowmini 		    ipst->ips_ip6_cache_table_size);
45690Sstevel@tonic-gate 		mutex_enter(&ire->ire_lock);
45700Sstevel@tonic-gate 		if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6)) {
45710Sstevel@tonic-gate 			mutex_exit(&ire->ire_lock);
45720Sstevel@tonic-gate 			if (icr->icr_onlink != 0 &&
45730Sstevel@tonic-gate 			    (rand/icr->icr_onlink)*icr->icr_onlink == rand) {
45740Sstevel@tonic-gate 				ire_delete(ire);
45750Sstevel@tonic-gate 				return;
45760Sstevel@tonic-gate 			}
45770Sstevel@tonic-gate 			goto done;
45780Sstevel@tonic-gate 		}
45790Sstevel@tonic-gate 		mutex_exit(&ire->ire_lock);
45800Sstevel@tonic-gate 	} else {
45810Sstevel@tonic-gate 		rand = (uint_t)lbolt +
45823448Sdh155122 		    IRE_ADDR_HASH(ire->ire_addr, ipst->ips_ip_cache_table_size);
45830Sstevel@tonic-gate 		if (ire->ire_gateway_addr == 0) {
45840Sstevel@tonic-gate 			if (icr->icr_onlink != 0 &&
45850Sstevel@tonic-gate 			    (rand/icr->icr_onlink)*icr->icr_onlink == rand) {
45860Sstevel@tonic-gate 				ire_delete(ire);
45870Sstevel@tonic-gate 				return;
45880Sstevel@tonic-gate 			}
45890Sstevel@tonic-gate 			goto done;
45900Sstevel@tonic-gate 		}
45910Sstevel@tonic-gate 	}
45920Sstevel@tonic-gate 	/* Not onlink IRE */
45930Sstevel@tonic-gate 	ASSERT(ire->ire_ipif != NULL);
45940Sstevel@tonic-gate 	if (ire->ire_max_frag < ire->ire_ipif->ipif_mtu) {
45950Sstevel@tonic-gate 		/* Use ptmu fraction */
45960Sstevel@tonic-gate 		if (icr->icr_pmtu != 0 &&
45970Sstevel@tonic-gate 		    (rand/icr->icr_pmtu)*icr->icr_pmtu == rand) {
45980Sstevel@tonic-gate 			ire_delete(ire);
45990Sstevel@tonic-gate 			return;
46000Sstevel@tonic-gate 		}
46010Sstevel@tonic-gate 	} else if (ire->ire_tire_mark != ire->ire_ob_pkt_count +
46020Sstevel@tonic-gate 	    ire->ire_ib_pkt_count) {
46030Sstevel@tonic-gate 		/* Use offlink fraction */
46040Sstevel@tonic-gate 		if (icr->icr_offlink != 0 &&
46050Sstevel@tonic-gate 		    (rand/icr->icr_offlink)*icr->icr_offlink == rand) {
46060Sstevel@tonic-gate 			ire_delete(ire);
46070Sstevel@tonic-gate 			return;
46080Sstevel@tonic-gate 		}
46090Sstevel@tonic-gate 	} else {
46100Sstevel@tonic-gate 		/* Use unused fraction */
46110Sstevel@tonic-gate 		if (icr->icr_unused != 0 &&
46120Sstevel@tonic-gate 		    (rand/icr->icr_unused)*icr->icr_unused == rand) {
46130Sstevel@tonic-gate 			ire_delete(ire);
46140Sstevel@tonic-gate 			return;
46150Sstevel@tonic-gate 		}
46160Sstevel@tonic-gate 	}
46170Sstevel@tonic-gate done:
46180Sstevel@tonic-gate 	/*
46190Sstevel@tonic-gate 	 * Update tire_mark so that those that haven't been used since this
46200Sstevel@tonic-gate 	 * reclaim will be considered unused next time we reclaim.
46210Sstevel@tonic-gate 	 */
46220Sstevel@tonic-gate 	ire->ire_tire_mark = ire->ire_ob_pkt_count + ire->ire_ib_pkt_count;
46230Sstevel@tonic-gate }
46240Sstevel@tonic-gate 
/*
 * Round *value up, in place, to a power of two.
 *
 * The result is the smallest 2^i, for i in 1..31, that is >= *value.
 * Consequently inputs of 0 and 1 both yield 2, and any input larger than
 * 2^31 is clamped down to 2^31, the largest power of two that fits in a
 * uint32_t.
 */
static void
power2_roundup(uint32_t *value)
{
	int i;

	/*
	 * The shifts are done on an unsigned operand: when the loop runs
	 * to completion i is 31, and shifting a signed 1 into the sign bit
	 * is undefined behavior.
	 */
	for (i = 1; i < 31; i++) {
		if (*value <= ((uint32_t)1 << i))
			break;
	}
	*value = ((uint32_t)1 << i);
}
46360Sstevel@tonic-gate 
46373448Sdh155122 /* Global init for all zones */
46380Sstevel@tonic-gate void
46393448Sdh155122 ip_ire_g_init()
46400Sstevel@tonic-gate {
46410Sstevel@tonic-gate 	/*
46420Sstevel@tonic-gate 	 * Create ire caches, ire_reclaim()
46430Sstevel@tonic-gate 	 * will give IRE_CACHE back to system when needed.
46440Sstevel@tonic-gate 	 * This needs to be done here before anything else, since
46450Sstevel@tonic-gate 	 * ire_add() expects the cache to be created.
46460Sstevel@tonic-gate 	 */
46470Sstevel@tonic-gate 	ire_cache = kmem_cache_create("ire_cache",
46484714Ssowmini 	    sizeof (ire_t), 0, ip_ire_constructor,
46494714Ssowmini 	    ip_ire_destructor, ip_trash_ire_reclaim, NULL, NULL, 0);
46500Sstevel@tonic-gate 
46513448Sdh155122 	rt_entry_cache = kmem_cache_create("rt_entry",
46523448Sdh155122 	    sizeof (struct rt_entry), 0, NULL, NULL, NULL, NULL, NULL, 0);
46533448Sdh155122 
46543448Sdh155122 	/*
46553448Sdh155122 	 * Have radix code setup kmem caches etc.
46563448Sdh155122 	 */
46573448Sdh155122 	rn_init();
46583448Sdh155122 }
46593448Sdh155122 
46603448Sdh155122 void
46613448Sdh155122 ip_ire_init(ip_stack_t *ipst)
46623448Sdh155122 {
46633448Sdh155122 	int i;
46643448Sdh155122 	uint32_t mem_cnt;
46653448Sdh155122 	uint32_t cpu_cnt;
46663448Sdh155122 	uint32_t min_cnt;
46673448Sdh155122 	pgcnt_t mem_avail;
46683448Sdh155122 
46693448Sdh155122 	/*
46703448Sdh155122 	 * ip_ire_max_bucket_cnt is sized below based on the memory
46713448Sdh155122 	 * size and the cpu speed of the machine. This is upper
46723448Sdh155122 	 * bounded by the compile time value of ip_ire_max_bucket_cnt
46733448Sdh155122 	 * and is lower bounded by the compile time value of
46743448Sdh155122 	 * ip_ire_min_bucket_cnt.  Similar logic applies to
46753448Sdh155122 	 * ip6_ire_max_bucket_cnt.
46763448Sdh155122 	 *
46773448Sdh155122 	 * We calculate this for each IP Instances in order to use
46783448Sdh155122 	 * the kmem_avail and ip_ire_{min,max}_bucket_cnt that are
46793448Sdh155122 	 * in effect when the zone is booted.
46803448Sdh155122 	 */
46813448Sdh155122 	mem_avail = kmem_avail();
46823448Sdh155122 	mem_cnt = (mem_avail >> ip_ire_mem_ratio) /
46833448Sdh155122 	    ip_cache_table_size / sizeof (ire_t);
46843448Sdh155122 	cpu_cnt = CPU->cpu_type_info.pi_clock >> ip_ire_cpu_ratio;
46853448Sdh155122 
46863448Sdh155122 	min_cnt = MIN(cpu_cnt, mem_cnt);
46873448Sdh155122 	if (min_cnt < ip_ire_min_bucket_cnt)
46883448Sdh155122 		min_cnt = ip_ire_min_bucket_cnt;
46893448Sdh155122 	if (ip_ire_max_bucket_cnt > min_cnt) {
46903448Sdh155122 		ip_ire_max_bucket_cnt = min_cnt;
46913448Sdh155122 	}
46923448Sdh155122 
46933448Sdh155122 	mem_cnt = (mem_avail >> ip_ire_mem_ratio) /
46943448Sdh155122 	    ip6_cache_table_size / sizeof (ire_t);
46953448Sdh155122 	min_cnt = MIN(cpu_cnt, mem_cnt);
46963448Sdh155122 	if (min_cnt < ip6_ire_min_bucket_cnt)
46973448Sdh155122 		min_cnt = ip6_ire_min_bucket_cnt;
46983448Sdh155122 	if (ip6_ire_max_bucket_cnt > min_cnt) {
46993448Sdh155122 		ip6_ire_max_bucket_cnt = min_cnt;
47003448Sdh155122 	}
47013448Sdh155122 
47023448Sdh155122 	mutex_init(&ipst->ips_ire_ft_init_lock, NULL, MUTEX_DEFAULT, 0);
47033448Sdh155122 	mutex_init(&ipst->ips_ire_handle_lock, NULL, MUTEX_DEFAULT, NULL);
47043448Sdh155122 
47053448Sdh155122 	(void) rn_inithead((void **)&ipst->ips_ip_ftable, 32);
47063448Sdh155122 
47073448Sdh155122 
47083448Sdh155122 	/* Calculate the IPv4 cache table size. */
47093448Sdh155122 	ipst->ips_ip_cache_table_size = MAX(ip_cache_table_size,
47103448Sdh155122 	    ((mem_avail >> ip_ire_mem_ratio) / sizeof (ire_t) /
47113448Sdh155122 	    ip_ire_max_bucket_cnt));
47123448Sdh155122 	if (ipst->ips_ip_cache_table_size > ip_max_cache_table_size)
47133448Sdh155122 		ipst->ips_ip_cache_table_size = ip_max_cache_table_size;
47143448Sdh155122 	/*
47153448Sdh155122 	 * Make sure that the table size is always a power of 2.  The
47163448Sdh155122 	 * hash macro IRE_ADDR_HASH() depends on that.
47173448Sdh155122 	 */
47183448Sdh155122 	power2_roundup(&ipst->ips_ip_cache_table_size);
47193448Sdh155122 
47203448Sdh155122 	ipst->ips_ip_cache_table = kmem_zalloc(ipst->ips_ip_cache_table_size *
47213448Sdh155122 	    sizeof (irb_t), KM_SLEEP);
47223448Sdh155122 
47233448Sdh155122 	for (i = 0; i < ipst->ips_ip_cache_table_size; i++) {
47243448Sdh155122 		rw_init(&ipst->ips_ip_cache_table[i].irb_lock, NULL,
47253448Sdh155122 		    RW_DEFAULT, NULL);
47263448Sdh155122 	}
47273448Sdh155122 
47283448Sdh155122 	/* Calculate the IPv6 cache table size. */
47293448Sdh155122 	ipst->ips_ip6_cache_table_size = MAX(ip6_cache_table_size,
47303448Sdh155122 	    ((mem_avail >> ip_ire_mem_ratio) / sizeof (ire_t) /
47313448Sdh155122 	    ip6_ire_max_bucket_cnt));
47323448Sdh155122 	if (ipst->ips_ip6_cache_table_size > ip6_max_cache_table_size)
47333448Sdh155122 		ipst->ips_ip6_cache_table_size = ip6_max_cache_table_size;
47343448Sdh155122 	/*
47353448Sdh155122 	 * Make sure that the table size is always a power of 2.  The
47363448Sdh155122 	 * hash macro IRE_ADDR_HASH_V6() depends on that.
47373448Sdh155122 	 */
47383448Sdh155122 	power2_roundup(&ipst->ips_ip6_cache_table_size);
47393448Sdh155122 
47403448Sdh155122 	ipst->ips_ip_cache_table_v6 = kmem_zalloc(
47413448Sdh155122 	    ipst->ips_ip6_cache_table_size * sizeof (irb_t), KM_SLEEP);
47423448Sdh155122 
47433448Sdh155122 	for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) {
47443448Sdh155122 		rw_init(&ipst->ips_ip_cache_table_v6[i].irb_lock, NULL,
47453448Sdh155122 		    RW_DEFAULT, NULL);
47463448Sdh155122 	}
47473448Sdh155122 
47480Sstevel@tonic-gate 	/*
47490Sstevel@tonic-gate 	 * Make sure that the forwarding table size is a power of 2.
47500Sstevel@tonic-gate 	 * The IRE*_ADDR_HASH() macroes depend on that.
47510Sstevel@tonic-gate 	 */
47523448Sdh155122 	ipst->ips_ip6_ftable_hash_size = ip6_ftable_hash_size;
47533448Sdh155122 	power2_roundup(&ipst->ips_ip6_ftable_hash_size);
47543448Sdh155122 
47553448Sdh155122 	ipst->ips_ire_handle = 1;
47563448Sdh155122 }
47573448Sdh155122 
47583448Sdh155122 void
47593448Sdh155122 ip_ire_g_fini(void)
47603448Sdh155122 {
47613448Sdh155122 	kmem_cache_destroy(ire_cache);
47623448Sdh155122 	kmem_cache_destroy(rt_entry_cache);
47633448Sdh155122 
47643448Sdh155122 	rn_fini();
47650Sstevel@tonic-gate }
47660Sstevel@tonic-gate 
47670Sstevel@tonic-gate void
47683448Sdh155122 ip_ire_fini(ip_stack_t *ipst)
47690Sstevel@tonic-gate {
47700Sstevel@tonic-gate 	int i;
47710Sstevel@tonic-gate 
47723448Sdh155122 	/*
47733448Sdh155122 	 * Delete all IREs - assumes that the ill/ipifs have
47743448Sdh155122 	 * been removed so what remains are just the ftable and IRE_CACHE.
47753448Sdh155122 	 */
47763448Sdh155122 	ire_walk(ire_delete, NULL, ipst);
47773448Sdh155122 
47783448Sdh155122 	rn_freehead(ipst->ips_ip_ftable);
47793448Sdh155122 	ipst->ips_ip_ftable = NULL;
47803448Sdh155122 
47813448Sdh155122 	mutex_destroy(&ipst->ips_ire_ft_init_lock);
47823448Sdh155122 	mutex_destroy(&ipst->ips_ire_handle_lock);
47833448Sdh155122 
47843448Sdh155122 	for (i = 0; i < ipst->ips_ip_cache_table_size; i++) {
47853448Sdh155122 		ASSERT(ipst->ips_ip_cache_table[i].irb_ire == NULL);
47863448Sdh155122 		rw_destroy(&ipst->ips_ip_cache_table[i].irb_lock);
47873448Sdh155122 	}
47883448Sdh155122 	kmem_free(ipst->ips_ip_cache_table,
47893448Sdh155122 	    ipst->ips_ip_cache_table_size * sizeof (irb_t));
47903448Sdh155122 	ipst->ips_ip_cache_table = NULL;
47913448Sdh155122 
47923448Sdh155122 	for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) {
47933448Sdh155122 		ASSERT(ipst->ips_ip_cache_table_v6[i].irb_ire == NULL);
47943448Sdh155122 		rw_destroy(&ipst->ips_ip_cache_table_v6[i].irb_lock);
47953448Sdh155122 	}
47963448Sdh155122 	kmem_free(ipst->ips_ip_cache_table_v6,
47973448Sdh155122 	    ipst->ips_ip6_cache_table_size * sizeof (irb_t));
47983448Sdh155122 	ipst->ips_ip_cache_table_v6 = NULL;
47993448Sdh155122 
48003448Sdh155122 	for (i = 0; i < IP6_MASK_TABLE_SIZE; i++) {
48013448Sdh155122 		irb_t *ptr;
48023448Sdh155122 		int j;
48033448Sdh155122 
48043448Sdh155122 		if ((ptr = ipst->ips_ip_forwarding_table_v6[i]) == NULL)
48053448Sdh155122 			continue;
48063448Sdh155122 
48073448Sdh155122 		for (j = 0; j < ipst->ips_ip6_ftable_hash_size; j++) {
48083448Sdh155122 			ASSERT(ptr[j].irb_ire == NULL);
48093448Sdh155122 			rw_destroy(&ptr[j].irb_lock);
48103448Sdh155122 		}
48113448Sdh155122 		mi_free(ptr);
48123448Sdh155122 		ipst->ips_ip_forwarding_table_v6[i] = NULL;
48133448Sdh155122 	}
48140Sstevel@tonic-gate }
48150Sstevel@tonic-gate 
48160Sstevel@tonic-gate /*
48170Sstevel@tonic-gate  * Check if another multirt route resolution is needed.
48180Sstevel@tonic-gate  * B_TRUE is returned is there remain a resolvable route,
48190Sstevel@tonic-gate  * or if no route for that dst is resolved yet.
48200Sstevel@tonic-gate  * B_FALSE is returned if all routes for that dst are resolved
48210Sstevel@tonic-gate  * or if the remaining unresolved routes are actually not
48220Sstevel@tonic-gate  * resolvable.
48230Sstevel@tonic-gate  * This only works in the global zone.
48240Sstevel@tonic-gate  */
48250Sstevel@tonic-gate boolean_t
48263448Sdh155122 ire_multirt_need_resolve(ipaddr_t dst, const ts_label_t *tsl, ip_stack_t *ipst)
48270Sstevel@tonic-gate {
48280Sstevel@tonic-gate 	ire_t	*first_fire;
48290Sstevel@tonic-gate 	ire_t	*first_cire;
48300Sstevel@tonic-gate 	ire_t	*fire;
48310Sstevel@tonic-gate 	ire_t	*cire;
48320Sstevel@tonic-gate 	irb_t	*firb;
48330Sstevel@tonic-gate 	irb_t	*cirb;
48340Sstevel@tonic-gate 	int	unres_cnt = 0;
48350Sstevel@tonic-gate 	boolean_t resolvable = B_FALSE;
48360Sstevel@tonic-gate 
48370Sstevel@tonic-gate 	/* Retrieve the first IRE_HOST that matches the destination */
48380Sstevel@tonic-gate 	first_fire = ire_ftable_lookup(dst, IP_HOST_MASK, 0, IRE_HOST, NULL,
48391676Sjpk 	    NULL, ALL_ZONES, 0, tsl,
48403448Sdh155122 	    MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_SECATTR, ipst);
48410Sstevel@tonic-gate 
48420Sstevel@tonic-gate 	/* No route at all */
48430Sstevel@tonic-gate 	if (first_fire == NULL) {
48440Sstevel@tonic-gate 		return (B_TRUE);
48450Sstevel@tonic-gate 	}
48460Sstevel@tonic-gate 
48470Sstevel@tonic-gate 	firb = first_fire->ire_bucket;
48480Sstevel@tonic-gate 	ASSERT(firb != NULL);
48490Sstevel@tonic-gate 
48500Sstevel@tonic-gate 	/* Retrieve the first IRE_CACHE ire for that destination. */
48513448Sdh155122 	first_cire = ire_cache_lookup(dst, GLOBAL_ZONEID, tsl, ipst);
48520Sstevel@tonic-gate 
48530Sstevel@tonic-gate 	/* No resolved route. */
48540Sstevel@tonic-gate 	if (first_cire == NULL) {
48550Sstevel@tonic-gate 		ire_refrele(first_fire);
48560Sstevel@tonic-gate 		return (B_TRUE);
48570Sstevel@tonic-gate 	}
48580Sstevel@tonic-gate 
48590Sstevel@tonic-gate 	/*
48600Sstevel@tonic-gate 	 * At least one route is resolved. Here we look through the forward
48610Sstevel@tonic-gate 	 * and cache tables, to compare the number of declared routes
48620Sstevel@tonic-gate 	 * with the number of resolved routes. The search for a resolvable
48630Sstevel@tonic-gate 	 * route is performed only if at least one route remains
48640Sstevel@tonic-gate 	 * unresolved.
48650Sstevel@tonic-gate 	 */
48660Sstevel@tonic-gate 	cirb = first_cire->ire_bucket;
48670Sstevel@tonic-gate 	ASSERT(cirb != NULL);
48680Sstevel@tonic-gate 
48690Sstevel@tonic-gate 	/* Count the number of routes to that dest that are declared. */
48700Sstevel@tonic-gate 	IRB_REFHOLD(firb);
48710Sstevel@tonic-gate 	for (fire = first_fire; fire != NULL; fire = fire->ire_next) {
48720Sstevel@tonic-gate 		if (!(fire->ire_flags & RTF_MULTIRT))
48730Sstevel@tonic-gate 			continue;
48740Sstevel@tonic-gate 		if (fire->ire_addr != dst)
48750Sstevel@tonic-gate 			continue;
48760Sstevel@tonic-gate 		unres_cnt++;
48770Sstevel@tonic-gate 	}
48780Sstevel@tonic-gate 	IRB_REFRELE(firb);
48790Sstevel@tonic-gate 
48800Sstevel@tonic-gate 	/* Then subtract the number of routes to that dst that are resolved */
48810Sstevel@tonic-gate 	IRB_REFHOLD(cirb);
48820Sstevel@tonic-gate 	for (cire = first_cire; cire != NULL; cire = cire->ire_next) {
48830Sstevel@tonic-gate 		if (!(cire->ire_flags & RTF_MULTIRT))
48840Sstevel@tonic-gate 			continue;
48850Sstevel@tonic-gate 		if (cire->ire_addr != dst)
48860Sstevel@tonic-gate 			continue;
48870Sstevel@tonic-gate 		if (cire->ire_marks & (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))
48880Sstevel@tonic-gate 			continue;
48890Sstevel@tonic-gate 		unres_cnt--;
48900Sstevel@tonic-gate 	}
48910Sstevel@tonic-gate 	IRB_REFRELE(cirb);
48920Sstevel@tonic-gate 
48930Sstevel@tonic-gate 	/* At least one route is unresolved; search for a resolvable route. */
48940Sstevel@tonic-gate 	if (unres_cnt > 0)
48950Sstevel@tonic-gate 		resolvable = ire_multirt_lookup(&first_cire, &first_fire,
48963448Sdh155122 		    MULTIRT_USESTAMP | MULTIRT_CACHEGW, tsl, ipst);
48970Sstevel@tonic-gate 
48980Sstevel@tonic-gate 	if (first_fire != NULL)
48990Sstevel@tonic-gate 		ire_refrele(first_fire);
49000Sstevel@tonic-gate 
49010Sstevel@tonic-gate 	if (first_cire != NULL)
49020Sstevel@tonic-gate 		ire_refrele(first_cire);
49030Sstevel@tonic-gate 
49040Sstevel@tonic-gate 	return (resolvable);
49050Sstevel@tonic-gate }
49060Sstevel@tonic-gate 
49070Sstevel@tonic-gate 
49080Sstevel@tonic-gate /*
49090Sstevel@tonic-gate  * Explore a forward_table bucket, starting from fire_arg.
49100Sstevel@tonic-gate  * fire_arg MUST be an IRE_HOST entry.
49110Sstevel@tonic-gate  *
49120Sstevel@tonic-gate  * Return B_TRUE and update *ire_arg and *fire_arg
49130Sstevel@tonic-gate  * if at least one resolvable route is found. *ire_arg
49140Sstevel@tonic-gate  * is the IRE entry for *fire_arg's gateway.
49150Sstevel@tonic-gate  *
49160Sstevel@tonic-gate  * Return B_FALSE otherwise (all routes are resolved or
49170Sstevel@tonic-gate  * the remaining unresolved routes are all unresolvable).
49180Sstevel@tonic-gate  *
49190Sstevel@tonic-gate  * The IRE selection relies on a priority mechanism
49200Sstevel@tonic-gate  * driven by the flags passed in by the caller.
49210Sstevel@tonic-gate  * The caller, such as ip_newroute_ipif(), can get the most
49220Sstevel@tonic-gate  * relevant ire at each stage of a multiple route resolution.
49230Sstevel@tonic-gate  *
49240Sstevel@tonic-gate  * The rules are:
49250Sstevel@tonic-gate  *
49260Sstevel@tonic-gate  * - if MULTIRT_CACHEGW is specified in flags, IRE_CACHETABLE
49270Sstevel@tonic-gate  *   ires are preferred for the gateway. This gives the highest
49280Sstevel@tonic-gate  *   priority to routes that can be resolved without using
49290Sstevel@tonic-gate  *   a resolver.
49300Sstevel@tonic-gate  *
49310Sstevel@tonic-gate  * - if MULTIRT_CACHEGW is not specified, or if MULTIRT_CACHEGW
49320Sstevel@tonic-gate  *   is specified but no IRE_CACHETABLE ire entry for the gateway
49330Sstevel@tonic-gate  *   is found, the following rules apply.
49340Sstevel@tonic-gate  *
49350Sstevel@tonic-gate  * - if MULTIRT_USESTAMP is specified in flags, IRE_INTERFACE
49360Sstevel@tonic-gate  *   ires for the gateway, that have not been tried since
49370Sstevel@tonic-gate  *   a configurable amount of time, are preferred.
49380Sstevel@tonic-gate  *   This applies when a resolver must be invoked for
49390Sstevel@tonic-gate  *   a missing route, but we don't want to use the resolver
49400Sstevel@tonic-gate  *   upon each packet emission. If no such resolver is found,
49410Sstevel@tonic-gate  *   B_FALSE is returned.
49420Sstevel@tonic-gate  *   The MULTIRT_USESTAMP flag can be combined with
49430Sstevel@tonic-gate  *   MULTIRT_CACHEGW.
49440Sstevel@tonic-gate  *
49450Sstevel@tonic-gate  * - if MULTIRT_USESTAMP is not specified in flags, the first
49460Sstevel@tonic-gate  *   unresolved but resolvable route is selected.
49470Sstevel@tonic-gate  *
49480Sstevel@tonic-gate  * - Otherwise, there is no resolvalble route, and
49490Sstevel@tonic-gate  *   B_FALSE is returned.
49500Sstevel@tonic-gate  *
49510Sstevel@tonic-gate  * At last, MULTIRT_SETSTAMP can be specified in flags to
49520Sstevel@tonic-gate  * request the timestamp of unresolvable routes to
49530Sstevel@tonic-gate  * be refreshed. This prevents the useless exploration
49540Sstevel@tonic-gate  * of those routes for a while, when MULTIRT_USESTAMP is used.
49550Sstevel@tonic-gate  *
49560Sstevel@tonic-gate  * This only works in the global zone.
49570Sstevel@tonic-gate  */
49580Sstevel@tonic-gate boolean_t
49591676Sjpk ire_multirt_lookup(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags,
49603448Sdh155122     const ts_label_t *tsl, ip_stack_t *ipst)
49610Sstevel@tonic-gate {
49620Sstevel@tonic-gate 	clock_t	delta;
49630Sstevel@tonic-gate 	ire_t	*best_fire = NULL;
49640Sstevel@tonic-gate 	ire_t	*best_cire = NULL;
49650Sstevel@tonic-gate 	ire_t	*first_fire;
49660Sstevel@tonic-gate 	ire_t	*first_cire;
49670Sstevel@tonic-gate 	ire_t	*fire;
49680Sstevel@tonic-gate 	ire_t	*cire;
49690Sstevel@tonic-gate 	irb_t	*firb = NULL;
49700Sstevel@tonic-gate 	irb_t	*cirb = NULL;
49710Sstevel@tonic-gate 	ire_t	*gw_ire;
49720Sstevel@tonic-gate 	boolean_t	already_resolved;
49730Sstevel@tonic-gate 	boolean_t	res;
49740Sstevel@tonic-gate 	ipaddr_t	dst;
49750Sstevel@tonic-gate 	ipaddr_t	gw;
49760Sstevel@tonic-gate 
49770Sstevel@tonic-gate 	ip2dbg(("ire_multirt_lookup: *ire_arg %p, *fire_arg %p, flags %04x\n",
49780Sstevel@tonic-gate 	    (void *)*ire_arg, (void *)*fire_arg, flags));
49790Sstevel@tonic-gate 
49800Sstevel@tonic-gate 	ASSERT(ire_arg != NULL);
49810Sstevel@tonic-gate 	ASSERT(fire_arg != NULL);
49820Sstevel@tonic-gate 
49830Sstevel@tonic-gate 	/* Not an IRE_HOST ire; give up. */
49840Sstevel@tonic-gate 	if ((*fire_arg == NULL) || ((*fire_arg)->ire_type != IRE_HOST)) {
49850Sstevel@tonic-gate 		return (B_FALSE);
49860Sstevel@tonic-gate 	}
49870Sstevel@tonic-gate 
49880Sstevel@tonic-gate 	/* This is the first IRE_HOST ire for that destination. */
49890Sstevel@tonic-gate 	first_fire = *fire_arg;
49900Sstevel@tonic-gate 	firb = first_fire->ire_bucket;
49910Sstevel@tonic-gate 	ASSERT(firb != NULL);
49920Sstevel@tonic-gate 
49930Sstevel@tonic-gate 	dst = first_fire->ire_addr;
49940Sstevel@tonic-gate 
49950Sstevel@tonic-gate 	ip2dbg(("ire_multirt_lookup: dst %08x\n", ntohl(dst)));
49960Sstevel@tonic-gate 
49970Sstevel@tonic-gate 	/*
49980Sstevel@tonic-gate 	 * Retrieve the first IRE_CACHE ire for that destination;
49990Sstevel@tonic-gate 	 * if we don't find one, no route for that dest is
50000Sstevel@tonic-gate 	 * resolved yet.
50010Sstevel@tonic-gate 	 */
50023448Sdh155122 	first_cire = ire_cache_lookup(dst, GLOBAL_ZONEID, tsl, ipst);
50030Sstevel@tonic-gate 	if (first_cire != NULL) {
50040Sstevel@tonic-gate 		cirb = first_cire->ire_bucket;
50050Sstevel@tonic-gate 	}
50060Sstevel@tonic-gate 
50070Sstevel@tonic-gate 	ip2dbg(("ire_multirt_lookup: first_cire %p\n", (void *)first_cire));
50080Sstevel@tonic-gate 
50090Sstevel@tonic-gate 	/*
50100Sstevel@tonic-gate 	 * Search for a resolvable route, giving the top priority
50110Sstevel@tonic-gate 	 * to routes that can be resolved without any call to the resolver.
50120Sstevel@tonic-gate 	 */
50130Sstevel@tonic-gate 	IRB_REFHOLD(firb);
50140Sstevel@tonic-gate 
50150Sstevel@tonic-gate 	if (!CLASSD(dst)) {
50160Sstevel@tonic-gate 		/*
50170Sstevel@tonic-gate 		 * For all multiroute IRE_HOST ires for that destination,
50180Sstevel@tonic-gate 		 * check if the route via the IRE_HOST's gateway is
50190Sstevel@tonic-gate 		 * resolved yet.
50200Sstevel@tonic-gate 		 */
50210Sstevel@tonic-gate 		for (fire = first_fire; fire != NULL; fire = fire->ire_next) {
50220Sstevel@tonic-gate 
50230Sstevel@tonic-gate 			if (!(fire->ire_flags & RTF_MULTIRT))
50240Sstevel@tonic-gate 				continue;
50250Sstevel@tonic-gate 			if (fire->ire_addr != dst)
50260Sstevel@tonic-gate 				continue;
50270Sstevel@tonic-gate 
50281676Sjpk 			if (fire->ire_gw_secattr != NULL &&
50291676Sjpk 			    tsol_ire_match_gwattr(fire, tsl) != 0) {
50301676Sjpk 				continue;
50311676Sjpk 			}
50321676Sjpk 
50330Sstevel@tonic-gate 			gw = fire->ire_gateway_addr;
50340Sstevel@tonic-gate 
50350Sstevel@tonic-gate 			ip2dbg(("ire_multirt_lookup: fire %p, "
50360Sstevel@tonic-gate 			    "ire_addr %08x, ire_gateway_addr %08x\n",
50370Sstevel@tonic-gate 			    (void *)fire, ntohl(fire->ire_addr), ntohl(gw)));
50380Sstevel@tonic-gate 
50390Sstevel@tonic-gate 			already_resolved = B_FALSE;
50400Sstevel@tonic-gate 
50410Sstevel@tonic-gate 			if (first_cire != NULL) {
50420Sstevel@tonic-gate 				ASSERT(cirb != NULL);
50430Sstevel@tonic-gate 
50440Sstevel@tonic-gate 				IRB_REFHOLD(cirb);
50450Sstevel@tonic-gate 				/*
50460Sstevel@tonic-gate 				 * For all IRE_CACHE ires for that
50470Sstevel@tonic-gate 				 * destination.
50480Sstevel@tonic-gate 				 */
50490Sstevel@tonic-gate 				for (cire = first_cire;
50500Sstevel@tonic-gate 				    cire != NULL;
50510Sstevel@tonic-gate 				    cire = cire->ire_next) {
50520Sstevel@tonic-gate 
50530Sstevel@tonic-gate 					if (!(cire->ire_flags & RTF_MULTIRT))
50540Sstevel@tonic-gate 						continue;
50550Sstevel@tonic-gate 					if (cire->ire_addr != dst)
50560Sstevel@tonic-gate 						continue;
50570Sstevel@tonic-gate 					if (cire->ire_marks &
50580Sstevel@tonic-gate 					    (IRE_MARK_CONDEMNED |
50594714Ssowmini 					    IRE_MARK_HIDDEN))
50600Sstevel@tonic-gate 						continue;
50611676Sjpk 
50621676Sjpk 					if (cire->ire_gw_secattr != NULL &&
50631676Sjpk 					    tsol_ire_match_gwattr(cire,
50641676Sjpk 					    tsl) != 0) {
50651676Sjpk 						continue;
50661676Sjpk 					}
50671676Sjpk 
50680Sstevel@tonic-gate 					/*
50690Sstevel@tonic-gate 					 * Check if the IRE_CACHE's gateway
50700Sstevel@tonic-gate 					 * matches the IRE_HOST's gateway.
50710Sstevel@tonic-gate 					 */
50720Sstevel@tonic-gate 					if (cire->ire_gateway_addr == gw) {
50730Sstevel@tonic-gate 						already_resolved = B_TRUE;
50740Sstevel@tonic-gate 						break;
50750Sstevel@tonic-gate 					}
50760Sstevel@tonic-gate 				}
50770Sstevel@tonic-gate 				IRB_REFRELE(cirb);
50780Sstevel@tonic-gate 			}
50790Sstevel@tonic-gate 
50800Sstevel@tonic-gate 			/*
50810Sstevel@tonic-gate 			 * This route is already resolved;
50820Sstevel@tonic-gate 			 * proceed with next one.
50830Sstevel@tonic-gate 			 */
50840Sstevel@tonic-gate 			if (already_resolved) {
50850Sstevel@tonic-gate 				ip2dbg(("ire_multirt_lookup: found cire %p, "
50860Sstevel@tonic-gate 				    "already resolved\n", (void *)cire));
50870Sstevel@tonic-gate 				continue;
50880Sstevel@tonic-gate 			}
50890Sstevel@tonic-gate 
50900Sstevel@tonic-gate 			/*
50910Sstevel@tonic-gate 			 * The route is unresolved; is it actually
50920Sstevel@tonic-gate 			 * resolvable, i.e. is there a cache or a resolver
50930Sstevel@tonic-gate 			 * for the gateway?
50940Sstevel@tonic-gate 			 */
50950Sstevel@tonic-gate 			gw_ire = ire_route_lookup(gw, 0, 0, 0, NULL, NULL,
50961676Sjpk 			    ALL_ZONES, tsl,
50973448Sdh155122 			    MATCH_IRE_RECURSIVE | MATCH_IRE_SECATTR, ipst);
50980Sstevel@tonic-gate 
50990Sstevel@tonic-gate 			ip2dbg(("ire_multirt_lookup: looked up gw_ire %p\n",
51000Sstevel@tonic-gate 			    (void *)gw_ire));
51010Sstevel@tonic-gate 
51020Sstevel@tonic-gate 			/*
51030Sstevel@tonic-gate 			 * If gw_ire is typed IRE_CACHETABLE,
51040Sstevel@tonic-gate 			 * this route can be resolved without any call to the
51050Sstevel@tonic-gate 			 * resolver. If the MULTIRT_CACHEGW flag is set,
51060Sstevel@tonic-gate 			 * give the top priority to this ire and exit the
51070Sstevel@tonic-gate 			 * loop.
51080Sstevel@tonic-gate 			 * This is typically the case when an ARP reply
51090Sstevel@tonic-gate 			 * is processed through ip_wput_nondata().
51100Sstevel@tonic-gate 			 */
51110Sstevel@tonic-gate 			if ((flags & MULTIRT_CACHEGW) &&
51120Sstevel@tonic-gate 			    (gw_ire != NULL) &&
51130Sstevel@tonic-gate 			    (gw_ire->ire_type & IRE_CACHETABLE)) {
51142535Ssangeeta 				ASSERT(gw_ire->ire_nce == NULL ||
51152535Ssangeeta 				    gw_ire->ire_nce->nce_state == ND_REACHABLE);
51160Sstevel@tonic-gate 				/*
51170Sstevel@tonic-gate 				 * Release the resolver associated to the
51180Sstevel@tonic-gate 				 * previous candidate best ire, if any.
51190Sstevel@tonic-gate 				 */
51200Sstevel@tonic-gate 				if (best_cire != NULL) {
51210Sstevel@tonic-gate 					ire_refrele(best_cire);
51220Sstevel@tonic-gate 					ASSERT(best_fire != NULL);
51230Sstevel@tonic-gate 				}
51240Sstevel@tonic-gate 
51250Sstevel@tonic-gate 				best_fire = fire;
51260Sstevel@tonic-gate 				best_cire = gw_ire;
51270Sstevel@tonic-gate 
51280Sstevel@tonic-gate 				ip2dbg(("ire_multirt_lookup: found top prio "
51290Sstevel@tonic-gate 				    "best_fire %p, best_cire %p\n",
51300Sstevel@tonic-gate 				    (void *)best_fire, (void *)best_cire));
51310Sstevel@tonic-gate 				break;
51320Sstevel@tonic-gate 			}
51330Sstevel@tonic-gate 
51340Sstevel@tonic-gate 			/*
51350Sstevel@tonic-gate 			 * Compute the time elapsed since our preceding
51360Sstevel@tonic-gate 			 * attempt to  resolve that route.
51370Sstevel@tonic-gate 			 * If the MULTIRT_USESTAMP flag is set, we take that
51380Sstevel@tonic-gate 			 * route into account only if this time interval
51390Sstevel@tonic-gate 			 * exceeds ip_multirt_resolution_interval;
51400Sstevel@tonic-gate 			 * this prevents us from attempting to resolve a
51410Sstevel@tonic-gate 			 * broken route upon each sending of a packet.
51420Sstevel@tonic-gate 			 */
51430Sstevel@tonic-gate 			delta = lbolt - fire->ire_last_used_time;
51440Sstevel@tonic-gate 			delta = TICK_TO_MSEC(delta);
51450Sstevel@tonic-gate 
51463448Sdh155122 			res = (boolean_t)((delta >
51474714Ssowmini 			    ipst->ips_ip_multirt_resolution_interval) ||
51484714Ssowmini 			    (!(flags & MULTIRT_USESTAMP)));
51490Sstevel@tonic-gate 
51500Sstevel@tonic-gate 			ip2dbg(("ire_multirt_lookup: fire %p, delta %lu, "
51510Sstevel@tonic-gate 			    "res %d\n",
51520Sstevel@tonic-gate 			    (void *)fire, delta, res));
51530Sstevel@tonic-gate 
51540Sstevel@tonic-gate 			if (res) {
51550Sstevel@tonic-gate 				/*
51560Sstevel@tonic-gate 				 * We are here if MULTIRT_USESTAMP flag is set
51570Sstevel@tonic-gate 				 * and the resolver for fire's gateway
51580Sstevel@tonic-gate 				 * has not been tried since
51590Sstevel@tonic-gate 				 * ip_multirt_resolution_interval, or if
51600Sstevel@tonic-gate 				 * MULTIRT_USESTAMP is not set but gw_ire did
51610Sstevel@tonic-gate 				 * not fill the conditions for MULTIRT_CACHEGW,
51620Sstevel@tonic-gate 				 * or if neither MULTIRT_USESTAMP nor
51630Sstevel@tonic-gate 				 * MULTIRT_CACHEGW are set.
51640Sstevel@tonic-gate 				 */
51650Sstevel@tonic-gate 				if (gw_ire != NULL) {
51660Sstevel@tonic-gate 					if (best_fire == NULL) {
51670Sstevel@tonic-gate 						ASSERT(best_cire == NULL);
51680Sstevel@tonic-gate 
51690Sstevel@tonic-gate 						best_fire = fire;
51700Sstevel@tonic-gate 						best_cire = gw_ire;
51710Sstevel@tonic-gate 
51720Sstevel@tonic-gate 						ip2dbg(("ire_multirt_lookup:"
51730Sstevel@tonic-gate 						    "found candidate "
51740Sstevel@tonic-gate 						    "best_fire %p, "
51750Sstevel@tonic-gate 						    "best_cire %p\n",
51760Sstevel@tonic-gate 						    (void *)best_fire,
51770Sstevel@tonic-gate 						    (void *)best_cire));
51780Sstevel@tonic-gate 
51790Sstevel@tonic-gate 						/*
51800Sstevel@tonic-gate 						 * If MULTIRT_CACHEGW is not
51810Sstevel@tonic-gate 						 * set, we ignore the top
51820Sstevel@tonic-gate 						 * priority ires that can
51830Sstevel@tonic-gate 						 * be resolved without any
51840Sstevel@tonic-gate 						 * call to the resolver;
51850Sstevel@tonic-gate 						 * In that case, there is
51860Sstevel@tonic-gate 						 * actually no need
51870Sstevel@tonic-gate 						 * to continue the loop.
51880Sstevel@tonic-gate 						 */
51890Sstevel@tonic-gate 						if (!(flags &
51900Sstevel@tonic-gate 						    MULTIRT_CACHEGW)) {
51910Sstevel@tonic-gate 							break;
51920Sstevel@tonic-gate 						}
51930Sstevel@tonic-gate 						continue;
51940Sstevel@tonic-gate 					}
51950Sstevel@tonic-gate 				} else {
51960Sstevel@tonic-gate 					/*
51970Sstevel@tonic-gate 					 * No resolver for the gateway: the
51980Sstevel@tonic-gate 					 * route is not resolvable.
51990Sstevel@tonic-gate 					 * If the MULTIRT_SETSTAMP flag is
52000Sstevel@tonic-gate 					 * set, we stamp the IRE_HOST ire,
52010Sstevel@tonic-gate 					 * so we will not select it again
52020Sstevel@tonic-gate 					 * during this resolution interval.
52030Sstevel@tonic-gate 					 */
52040Sstevel@tonic-gate 					if (flags & MULTIRT_SETSTAMP)
52050Sstevel@tonic-gate 						fire->ire_last_used_time =
52060Sstevel@tonic-gate 						    lbolt;
52070Sstevel@tonic-gate 				}
52080Sstevel@tonic-gate 			}
52090Sstevel@tonic-gate 
52100Sstevel@tonic-gate 			if (gw_ire != NULL)
52110Sstevel@tonic-gate 				ire_refrele(gw_ire);
52120Sstevel@tonic-gate 		}
52130Sstevel@tonic-gate 	} else { /* CLASSD(dst) */
52140Sstevel@tonic-gate 
52150Sstevel@tonic-gate 		for (fire = first_fire;
52160Sstevel@tonic-gate 		    fire != NULL;
52170Sstevel@tonic-gate 		    fire = fire->ire_next) {
52180Sstevel@tonic-gate 
52190Sstevel@tonic-gate 			if (!(fire->ire_flags & RTF_MULTIRT))
52200Sstevel@tonic-gate 				continue;
52210Sstevel@tonic-gate 			if (fire->ire_addr != dst)
52220Sstevel@tonic-gate 				continue;
52230Sstevel@tonic-gate 
52241676Sjpk 			if (fire->ire_gw_secattr != NULL &&
52251676Sjpk 			    tsol_ire_match_gwattr(fire, tsl) != 0) {
52261676Sjpk 				continue;
52271676Sjpk 			}
52281676Sjpk 
52290Sstevel@tonic-gate 			already_resolved = B_FALSE;
52300Sstevel@tonic-gate 
52310Sstevel@tonic-gate 			gw = fire->ire_gateway_addr;
52320Sstevel@tonic-gate 
52330Sstevel@tonic-gate 			gw_ire = ire_ftable_lookup(gw, 0, 0, IRE_INTERFACE,
52341676Sjpk 			    NULL, NULL, ALL_ZONES, 0, tsl,
52351676Sjpk 			    MATCH_IRE_RECURSIVE | MATCH_IRE_TYPE |
52363448Sdh155122 			    MATCH_IRE_SECATTR, ipst);
52370Sstevel@tonic-gate 
52380Sstevel@tonic-gate 			/* No resolver for the gateway; we skip this ire. */
52390Sstevel@tonic-gate 			if (gw_ire == NULL) {
52400Sstevel@tonic-gate 				continue;
52410Sstevel@tonic-gate 			}
52422535Ssangeeta 			ASSERT(gw_ire->ire_nce == NULL ||
52432535Ssangeeta 			    gw_ire->ire_nce->nce_state == ND_REACHABLE);
52440Sstevel@tonic-gate 
52450Sstevel@tonic-gate 			if (first_cire != NULL) {
52460Sstevel@tonic-gate 
52470Sstevel@tonic-gate 				IRB_REFHOLD(cirb);
52480Sstevel@tonic-gate 				/*
52490Sstevel@tonic-gate 				 * For all IRE_CACHE ires for that
52500Sstevel@tonic-gate 				 * destination.
52510Sstevel@tonic-gate 				 */
52520Sstevel@tonic-gate 				for (cire = first_cire;
52530Sstevel@tonic-gate 				    cire != NULL;
52540Sstevel@tonic-gate 				    cire = cire->ire_next) {
52550Sstevel@tonic-gate 
52560Sstevel@tonic-gate 					if (!(cire->ire_flags & RTF_MULTIRT))
52570Sstevel@tonic-gate 						continue;
52580Sstevel@tonic-gate 					if (cire->ire_addr != dst)
52590Sstevel@tonic-gate 						continue;
52600Sstevel@tonic-gate 					if (cire->ire_marks &
52610Sstevel@tonic-gate 					    (IRE_MARK_CONDEMNED |
52624714Ssowmini 					    IRE_MARK_HIDDEN))
52630Sstevel@tonic-gate 						continue;
52640Sstevel@tonic-gate 
52651676Sjpk 					if (cire->ire_gw_secattr != NULL &&
52661676Sjpk 					    tsol_ire_match_gwattr(cire,
52671676Sjpk 					    tsl) != 0) {
52681676Sjpk 						continue;
52691676Sjpk 					}
52701676Sjpk 
52710Sstevel@tonic-gate 					/*
52720Sstevel@tonic-gate 					 * Cache entries are linked to the
52730Sstevel@tonic-gate 					 * parent routes using the parent handle
52740Sstevel@tonic-gate 					 * (ire_phandle). If no cache entry has
52750Sstevel@tonic-gate 					 * the same handle as fire, fire is
52760Sstevel@tonic-gate 					 * still unresolved.
52770Sstevel@tonic-gate 					 */
52780Sstevel@tonic-gate 					ASSERT(cire->ire_phandle != 0);
52790Sstevel@tonic-gate 					if (cire->ire_phandle ==
52800Sstevel@tonic-gate 					    fire->ire_phandle) {
52810Sstevel@tonic-gate 						already_resolved = B_TRUE;
52820Sstevel@tonic-gate 						break;
52830Sstevel@tonic-gate 					}
52840Sstevel@tonic-gate 				}
52850Sstevel@tonic-gate 				IRB_REFRELE(cirb);
52860Sstevel@tonic-gate 			}
52870Sstevel@tonic-gate 
52880Sstevel@tonic-gate 			/*
52890Sstevel@tonic-gate 			 * This route is already resolved; proceed with
52900Sstevel@tonic-gate 			 * next one.
52910Sstevel@tonic-gate 			 */
52920Sstevel@tonic-gate 			if (already_resolved) {
52930Sstevel@tonic-gate 				ire_refrele(gw_ire);
52940Sstevel@tonic-gate 				continue;
52950Sstevel@tonic-gate 			}
52960Sstevel@tonic-gate 
52970Sstevel@tonic-gate 			/*
52980Sstevel@tonic-gate 			 * Compute the time elapsed since our preceding
52990Sstevel@tonic-gate 			 * attempt to resolve that route.
53000Sstevel@tonic-gate 			 * If the MULTIRT_USESTAMP flag is set, we take
53010Sstevel@tonic-gate 			 * that route into account only if this time
53020Sstevel@tonic-gate 			 * interval exceeds ip_multirt_resolution_interval;
53030Sstevel@tonic-gate 			 * this prevents us from attempting to resolve a
53040Sstevel@tonic-gate 			 * broken route upon each sending of a packet.
53050Sstevel@tonic-gate 			 */
53060Sstevel@tonic-gate 			delta = lbolt - fire->ire_last_used_time;
53070Sstevel@tonic-gate 			delta = TICK_TO_MSEC(delta);
53080Sstevel@tonic-gate 
53093448Sdh155122 			res = (boolean_t)((delta >
53104714Ssowmini 			    ipst->ips_ip_multirt_resolution_interval) ||
53114714Ssowmini 			    (!(flags & MULTIRT_USESTAMP)));
53120Sstevel@tonic-gate 
53130Sstevel@tonic-gate 			ip3dbg(("ire_multirt_lookup: fire %p, delta %lx, "
53140Sstevel@tonic-gate 			    "flags %04x, res %d\n",
53150Sstevel@tonic-gate 			    (void *)fire, delta, flags, res));
53160Sstevel@tonic-gate 
53170Sstevel@tonic-gate 			if (res) {
53180Sstevel@tonic-gate 				if (best_cire != NULL) {
53190Sstevel@tonic-gate 					/*
53200Sstevel@tonic-gate 					 * Release the resolver associated
53210Sstevel@tonic-gate 					 * to the preceding candidate best
53220Sstevel@tonic-gate 					 * ire, if any.
53230Sstevel@tonic-gate 					 */
53240Sstevel@tonic-gate 					ire_refrele(best_cire);
53250Sstevel@tonic-gate 					ASSERT(best_fire != NULL);
53260Sstevel@tonic-gate 				}
53270Sstevel@tonic-gate 				best_fire = fire;
53280Sstevel@tonic-gate 				best_cire = gw_ire;
53290Sstevel@tonic-gate 				continue;
53300Sstevel@tonic-gate 			}
53310Sstevel@tonic-gate 
53320Sstevel@tonic-gate 			ire_refrele(gw_ire);
53330Sstevel@tonic-gate 		}
53340Sstevel@tonic-gate 	}
53350Sstevel@tonic-gate 
53360Sstevel@tonic-gate 	if (best_fire != NULL) {
53370Sstevel@tonic-gate 		IRE_REFHOLD(best_fire);
53380Sstevel@tonic-gate 	}
53390Sstevel@tonic-gate 	IRB_REFRELE(firb);
53400Sstevel@tonic-gate 
53410Sstevel@tonic-gate 	/* Release the first IRE_CACHE we initially looked up, if any. */
53420Sstevel@tonic-gate 	if (first_cire != NULL)
53430Sstevel@tonic-gate 		ire_refrele(first_cire);
53440Sstevel@tonic-gate 
53450Sstevel@tonic-gate 	/* Found a resolvable route. */
53460Sstevel@tonic-gate 	if (best_fire != NULL) {
53470Sstevel@tonic-gate 		ASSERT(best_cire != NULL);
53480Sstevel@tonic-gate 
53490Sstevel@tonic-gate 		if (*fire_arg != NULL)
53500Sstevel@tonic-gate 			ire_refrele(*fire_arg);
53510Sstevel@tonic-gate 		if (*ire_arg != NULL)
53520Sstevel@tonic-gate 			ire_refrele(*ire_arg);
53530Sstevel@tonic-gate 
53540Sstevel@tonic-gate 		/*
53550Sstevel@tonic-gate 		 * Update the passed-in arguments with the
53560Sstevel@tonic-gate 		 * resolvable multirt route we found.
53570Sstevel@tonic-gate 		 */
53580Sstevel@tonic-gate 		*fire_arg = best_fire;
53590Sstevel@tonic-gate 		*ire_arg = best_cire;
53600Sstevel@tonic-gate 
53610Sstevel@tonic-gate 		ip2dbg(("ire_multirt_lookup: returning B_TRUE, "
53620Sstevel@tonic-gate 		    "*fire_arg %p, *ire_arg %p\n",
53630Sstevel@tonic-gate 		    (void *)best_fire, (void *)best_cire));
53640Sstevel@tonic-gate 
53650Sstevel@tonic-gate 		return (B_TRUE);
53660Sstevel@tonic-gate 	}
53670Sstevel@tonic-gate 
53680Sstevel@tonic-gate 	ASSERT(best_cire == NULL);
53690Sstevel@tonic-gate 
53700Sstevel@tonic-gate 	ip2dbg(("ire_multirt_lookup: returning B_FALSE, *fire_arg %p, "
53710Sstevel@tonic-gate 	    "*ire_arg %p\n",
53720Sstevel@tonic-gate 	    (void *)*fire_arg, (void *)*ire_arg));
53730Sstevel@tonic-gate 
53740Sstevel@tonic-gate 	/* No resolvable route. */
53750Sstevel@tonic-gate 	return (B_FALSE);
53760Sstevel@tonic-gate }
53770Sstevel@tonic-gate 
53780Sstevel@tonic-gate /*
53790Sstevel@tonic-gate  * IRE iterator for inbound and loopback broadcast processing.
53800Sstevel@tonic-gate  * Given an IRE_BROADCAST ire, walk the ires with the same destination
53810Sstevel@tonic-gate  * address, but skip over the passed-in ire. Returns the next ire without
53820Sstevel@tonic-gate  * a hold - assumes that the caller holds a reference on the IRE bucket.
53830Sstevel@tonic-gate  */
53840Sstevel@tonic-gate ire_t *
53850Sstevel@tonic-gate ire_get_next_bcast_ire(ire_t *curr, ire_t *ire)
53860Sstevel@tonic-gate {
53870Sstevel@tonic-gate 	ill_t *ill;
53880Sstevel@tonic-gate 
53890Sstevel@tonic-gate 	if (curr == NULL) {
53900Sstevel@tonic-gate 		for (curr = ire->ire_bucket->irb_ire; curr != NULL;
53910Sstevel@tonic-gate 		    curr = curr->ire_next) {
53920Sstevel@tonic-gate 			if (curr->ire_addr == ire->ire_addr)
53930Sstevel@tonic-gate 				break;
53940Sstevel@tonic-gate 		}
53950Sstevel@tonic-gate 	} else {
53960Sstevel@tonic-gate 		curr = curr->ire_next;
53970Sstevel@tonic-gate 	}
53980Sstevel@tonic-gate 	ill = ire_to_ill(ire);
53990Sstevel@tonic-gate 	for (; curr != NULL; curr = curr->ire_next) {
54000Sstevel@tonic-gate 		if (curr->ire_addr != ire->ire_addr) {
54010Sstevel@tonic-gate 			/*
54020Sstevel@tonic-gate 			 * All the IREs to a given destination are contiguous;
54030Sstevel@tonic-gate 			 * break out once the address doesn't match.
54040Sstevel@tonic-gate 			 */
54050Sstevel@tonic-gate 			break;
54060Sstevel@tonic-gate 		}
54070Sstevel@tonic-gate 		if (curr == ire) {
54080Sstevel@tonic-gate 			/* skip over the passed-in ire */
54090Sstevel@tonic-gate 			continue;
54100Sstevel@tonic-gate 		}
54110Sstevel@tonic-gate 		if ((curr->ire_stq != NULL && ire->ire_stq == NULL) ||
54120Sstevel@tonic-gate 		    (curr->ire_stq == NULL && ire->ire_stq != NULL)) {
54130Sstevel@tonic-gate 			/*
54140Sstevel@tonic-gate 			 * If the passed-in ire is loopback, skip over
54150Sstevel@tonic-gate 			 * non-loopback ires and vice versa.
54160Sstevel@tonic-gate 			 */
54170Sstevel@tonic-gate 			continue;
54180Sstevel@tonic-gate 		}
54190Sstevel@tonic-gate 		if (ire_to_ill(curr) != ill) {
54200Sstevel@tonic-gate 			/* skip over IREs going through a different interface */
54210Sstevel@tonic-gate 			continue;
54220Sstevel@tonic-gate 		}
54230Sstevel@tonic-gate 		if (curr->ire_marks & IRE_MARK_CONDEMNED) {
54240Sstevel@tonic-gate 			/* skip over deleted IREs */
54250Sstevel@tonic-gate 			continue;
54260Sstevel@tonic-gate 		}
54270Sstevel@tonic-gate 		return (curr);
54280Sstevel@tonic-gate 	}
54290Sstevel@tonic-gate 	return (NULL);
54300Sstevel@tonic-gate }
54310Sstevel@tonic-gate 
54325023Scarlsonj #ifdef DEBUG
54330Sstevel@tonic-gate void
54340Sstevel@tonic-gate ire_trace_ref(ire_t *ire)
54350Sstevel@tonic-gate {
54360Sstevel@tonic-gate 	mutex_enter(&ire->ire_lock);
54375023Scarlsonj 	if (ire->ire_trace_disable) {
54380Sstevel@tonic-gate 		mutex_exit(&ire->ire_lock);
54390Sstevel@tonic-gate 		return;
54400Sstevel@tonic-gate 	}
54415023Scarlsonj 
54425023Scarlsonj 	if (th_trace_ref(ire, ire->ire_ipst)) {
54435023Scarlsonj 		mutex_exit(&ire->ire_lock);
54445023Scarlsonj 	} else {
54455023Scarlsonj 		ire->ire_trace_disable = B_TRUE;
54465023Scarlsonj 		mutex_exit(&ire->ire_lock);
54475023Scarlsonj 		ire_trace_cleanup(ire);
54480Sstevel@tonic-gate 	}
54490Sstevel@tonic-gate }
54500Sstevel@tonic-gate 
54510Sstevel@tonic-gate void
54520Sstevel@tonic-gate ire_untrace_ref(ire_t *ire)
54530Sstevel@tonic-gate {
54540Sstevel@tonic-gate 	mutex_enter(&ire->ire_lock);
54555023Scarlsonj 	if (!ire->ire_trace_disable)
54565023Scarlsonj 		th_trace_unref(ire);
54570Sstevel@tonic-gate 	mutex_exit(&ire->ire_lock);
54580Sstevel@tonic-gate }
54590Sstevel@tonic-gate 
54600Sstevel@tonic-gate static void
54615023Scarlsonj ire_trace_cleanup(const ire_t *ire)
54620Sstevel@tonic-gate {
54635023Scarlsonj 	th_trace_cleanup(ire, ire->ire_trace_disable);
54640Sstevel@tonic-gate }
54655023Scarlsonj #endif /* DEBUG */
54662535Ssangeeta 
54672535Ssangeeta /*
54682535Ssangeeta  * Generate a message chain with an arp request to resolve the in_ire.
54692535Ssangeeta  * It is assumed that in_ire itself is currently in the ire cache table,
54702535Ssangeeta  * so we create a fake_ire filled with enough information about ire_addr etc.
54712535Ssangeeta  * to retrieve in_ire when the DL_UNITDATA response from the resolver
54722535Ssangeeta  * comes back. The fake_ire itself is created by calling esballoc with
54732535Ssangeeta  * the fr_rtnp (free routine) set to ire_freemblk. This routine will be
54742535Ssangeeta  * invoked when the mblk containing fake_ire is freed.
54752535Ssangeeta  */
54762535Ssangeeta void
54772535Ssangeeta ire_arpresolve(ire_t *in_ire, ill_t *dst_ill)
54782535Ssangeeta {
54792535Ssangeeta 	areq_t		*areq;
54802535Ssangeeta 	ipaddr_t	*addrp;
54814714Ssowmini 	mblk_t 		*ire_mp, *areq_mp;
54822535Ssangeeta 	ire_t 		*ire, *buf;
54832535Ssangeeta 	size_t		bufsize;
54842535Ssangeeta 	frtn_t		*frtnp;
54852535Ssangeeta 	ill_t		*ill;
54863448Sdh155122 	ip_stack_t	*ipst = dst_ill->ill_ipst;
54872535Ssangeeta 
54882535Ssangeeta 	/*
54892535Ssangeeta 	 * Construct message chain for the resolver
54902535Ssangeeta 	 * of the form:
54912535Ssangeeta 	 *	ARP_REQ_MBLK-->IRE_MBLK
54922535Ssangeeta 	 *
54932535Ssangeeta 	 * NOTE : If the response does not
54942535Ssangeeta 	 * come back, ARP frees the packet. For this reason,
54952535Ssangeeta 	 * we can't REFHOLD the bucket of save_ire to prevent
54962535Ssangeeta 	 * deletions. We may not be able to REFRELE the bucket
54972535Ssangeeta 	 * if the response never comes back. Thus, before
54982535Ssangeeta 	 * adding the ire, ire_add_v4 will make sure that the
54992535Ssangeeta 	 * interface route does not get deleted. This is the
55002535Ssangeeta 	 * only case unlike ip_newroute_v6, ip_newroute_ipif_v6
55012535Ssangeeta 	 * where we can always prevent deletions because of
55022535Ssangeeta 	 * the synchronous nature of adding IRES i.e
55032535Ssangeeta 	 * ire_add_then_send is called after creating the IRE.
55042535Ssangeeta 	 */
55052535Ssangeeta 
55062535Ssangeeta 	/*
55072535Ssangeeta 	 * We use esballoc to allocate the second part(the ire_t size mblk)
55082535Ssangeeta 	 * of the message chain depicted above. THis mblk will be freed
55092535Ssangeeta 	 * by arp when there is a  timeout, and otherwise passed to IP
55102535Ssangeeta 	 * and IP will * free it after processing the ARP response.
55112535Ssangeeta 	 */
55122535Ssangeeta 
55132535Ssangeeta 	bufsize = sizeof (ire_t) + sizeof (frtn_t);
55142535Ssangeeta 	buf = kmem_alloc(bufsize, KM_NOSLEEP);
55152535Ssangeeta 	if (buf == NULL) {
55162535Ssangeeta 		ip1dbg(("ire_arpresolver:alloc buffer failed\n "));
55172535Ssangeeta 		return;
55182535Ssangeeta 	}
55192535Ssangeeta 	frtnp = (frtn_t *)(buf + 1);
55202535Ssangeeta 	frtnp->free_arg = (caddr_t)buf;
55212535Ssangeeta 	frtnp->free_func = ire_freemblk;
55222535Ssangeeta 
55232535Ssangeeta 	ire_mp = esballoc((unsigned char *)buf, bufsize, BPRI_MED, frtnp);
55242535Ssangeeta 
55252535Ssangeeta 	if (ire_mp == NULL) {
55262535Ssangeeta 		ip1dbg(("ire_arpresolve: esballoc failed\n"));
55272535Ssangeeta 		kmem_free(buf, bufsize);
55282535Ssangeeta 		return;
55292535Ssangeeta 	}
55302535Ssangeeta 	ASSERT(in_ire->ire_nce != NULL);
55314714Ssowmini 	areq_mp = copyb(dst_ill->ill_resolver_mp);
55324714Ssowmini 	if (areq_mp == NULL) {
55332535Ssangeeta 		kmem_free(buf, bufsize);
55342535Ssangeeta 		return;
55352535Ssangeeta 	}
55362535Ssangeeta 
55372535Ssangeeta 	ire_mp->b_datap->db_type = IRE_ARPRESOLVE_TYPE;
55382535Ssangeeta 	ire = (ire_t *)buf;
55392535Ssangeeta 	/*
55402535Ssangeeta 	 * keep enough info in the fake ire so that we can pull up
55412535Ssangeeta 	 * the incomplete ire (in_ire) after result comes back from
55422535Ssangeeta 	 * arp and make it complete.
55432535Ssangeeta 	 */
55442535Ssangeeta 	*ire = ire_null;
55452535Ssangeeta 	ire->ire_u = in_ire->ire_u;
55462535Ssangeeta 	ire->ire_ipif_seqid = in_ire->ire_ipif_seqid;
55472535Ssangeeta 	ire->ire_ipif = in_ire->ire_ipif;
55482535Ssangeeta 	ire->ire_stq = in_ire->ire_stq;
55492535Ssangeeta 	ill = ire_to_ill(ire);
55502535Ssangeeta 	ire->ire_stq_ifindex = ill->ill_phyint->phyint_ifindex;
55512535Ssangeeta 	ire->ire_zoneid = in_ire->ire_zoneid;
55523448Sdh155122 	ire->ire_ipst = ipst;
55533448Sdh155122 
55542535Ssangeeta 	/*
55552535Ssangeeta 	 * ire_freemblk will be called when ire_mp is freed, both for
55562535Ssangeeta 	 * successful and failed arp resolution. IRE_MARK_UNCACHED will be set
55572535Ssangeeta 	 * when the arp resolution failed.
55582535Ssangeeta 	 */
55592535Ssangeeta 	ire->ire_marks |= IRE_MARK_UNCACHED;
55602535Ssangeeta 	ire->ire_mp = ire_mp;
55612535Ssangeeta 	ire_mp->b_wptr = (uchar_t *)&ire[1];
55622535Ssangeeta 	ire_mp->b_cont = NULL;
55634714Ssowmini 	linkb(areq_mp, ire_mp);
55642535Ssangeeta 
55652535Ssangeeta 	/*
55662535Ssangeeta 	 * Fill in the source and dest addrs for the resolver.
55672535Ssangeeta 	 * NOTE: this depends on memory layouts imposed by
55682535Ssangeeta 	 * ill_init().
55692535Ssangeeta 	 */
55704714Ssowmini 	areq = (areq_t *)areq_mp->b_rptr;
55712535Ssangeeta 	addrp = (ipaddr_t *)((char *)areq + areq->areq_sender_addr_offset);
55722535Ssangeeta 	*addrp = ire->ire_src_addr;
55732535Ssangeeta 
55742535Ssangeeta 	addrp = (ipaddr_t *)((char *)areq + areq->areq_target_addr_offset);
55752535Ssangeeta 	if (ire->ire_gateway_addr != INADDR_ANY) {
55762535Ssangeeta 		*addrp = ire->ire_gateway_addr;
55772535Ssangeeta 	} else {
55782535Ssangeeta 		*addrp = ire->ire_addr;
55792535Ssangeeta 	}
55802535Ssangeeta 
55812535Ssangeeta 	/* Up to the resolver. */
55822535Ssangeeta 	if (canputnext(dst_ill->ill_rq)) {
55834714Ssowmini 		putnext(dst_ill->ill_rq, areq_mp);
55842535Ssangeeta 	} else {
55854714Ssowmini 		freemsg(areq_mp);
55862535Ssangeeta 	}
55872535Ssangeeta }
55882535Ssangeeta 
55892535Ssangeeta /*
55902535Ssangeeta  * Esballoc free function for AR_ENTRY_QUERY request to clean up any
55912535Ssangeeta  * unresolved ire_t and/or nce_t structures when ARP resolution fails.
55922535Ssangeeta  *
55932535Ssangeeta  * This function can be called by ARP via free routine for ire_mp or
55942535Ssangeeta  * by IPv4(both host and forwarding path) via ire_delete
55952535Ssangeeta  * in case ARP resolution fails.
55962535Ssangeeta  * NOTE: Since IP is MT, ARP can call into IP but not vice versa
55972535Ssangeeta  * (for IP to talk to ARP, it still has to send AR* messages).
55982535Ssangeeta  *
55992535Ssangeeta  * Note that the ARP/IP merge should replace the functioanlity by providing
56002535Ssangeeta  * direct function calls to clean up unresolved entries in ire/nce lists.
56012535Ssangeeta  */
56022535Ssangeeta void
56032535Ssangeeta ire_freemblk(ire_t *ire_mp)
56042535Ssangeeta {
56052535Ssangeeta 	nce_t		*nce = NULL;
56062535Ssangeeta 	ill_t		*ill;
56073448Sdh155122 	ip_stack_t	*ipst;
56082535Ssangeeta 
56092535Ssangeeta 	ASSERT(ire_mp != NULL);
56102535Ssangeeta 
56112535Ssangeeta 	if ((ire_mp->ire_addr == NULL) && (ire_mp->ire_gateway_addr == NULL)) {
56122535Ssangeeta 		ip1dbg(("ire_freemblk(0x%p) ire_addr is NULL\n",
56132535Ssangeeta 		    (void *)ire_mp));
56142535Ssangeeta 		goto cleanup;
56152535Ssangeeta 	}
56162535Ssangeeta 	if ((ire_mp->ire_marks & IRE_MARK_UNCACHED) == 0) {
56172535Ssangeeta 		goto cleanup; /* everything succeeded. just free and return */
56182535Ssangeeta 	}
56192535Ssangeeta 
56202535Ssangeeta 	/*
56212535Ssangeeta 	 * the arp information corresponding to this ire_mp was not
56222535Ssangeeta 	 * transferred to  a ire_cache entry. Need
56232535Ssangeeta 	 * to clean up incomplete ire's and nce, if necessary.
56242535Ssangeeta 	 */
56252535Ssangeeta 	ASSERT(ire_mp->ire_stq != NULL);
56262535Ssangeeta 	ASSERT(ire_mp->ire_stq_ifindex != 0);
56273448Sdh155122 	ASSERT(ire_mp->ire_ipst != NULL);
56283448Sdh155122 
56293448Sdh155122 	ipst = ire_mp->ire_ipst;
56303448Sdh155122 
56312535Ssangeeta 	/*
56322535Ssangeeta 	 * Get any nce's corresponding to this ire_mp. We first have to
56332535Ssangeeta 	 * make sure that the ill is still around.
56342535Ssangeeta 	 */
56353448Sdh155122 	ill = ill_lookup_on_ifindex(ire_mp->ire_stq_ifindex,
56363448Sdh155122 	    B_FALSE, NULL, NULL, NULL, NULL, ipst);
56372535Ssangeeta 	if (ill == NULL || (ire_mp->ire_stq != ill->ill_wq) ||
56382535Ssangeeta 	    (ill->ill_state_flags & ILL_CONDEMNED)) {
56392535Ssangeeta 		/*
56402535Ssangeeta 		 * ill went away. no nce to clean up.
56412535Ssangeeta 		 * Note that the ill_state_flags could be set to
56422535Ssangeeta 		 * ILL_CONDEMNED after this point, but if we know
56432535Ssangeeta 		 * that it is CONDEMNED now, we just bail out quickly.
56442535Ssangeeta 		 */
56452535Ssangeeta 		if (ill != NULL)
56462535Ssangeeta 			ill_refrele(ill);
56472535Ssangeeta 		goto cleanup;
56482535Ssangeeta 	}
56492535Ssangeeta 	nce = ndp_lookup_v4(ill,
56502535Ssangeeta 	    ((ire_mp->ire_gateway_addr != INADDR_ANY) ?
56512535Ssangeeta 	    &ire_mp->ire_gateway_addr : &ire_mp->ire_addr),
56522535Ssangeeta 	    B_FALSE);
56532535Ssangeeta 	ill_refrele(ill);
56542535Ssangeeta 
56552535Ssangeeta 	if ((nce != NULL) && (nce->nce_state != ND_REACHABLE)) {
56562535Ssangeeta 		/*
56572535Ssangeeta 		 * some incomplete nce was found.
56582535Ssangeeta 		 */
56592535Ssangeeta 		DTRACE_PROBE2(ire__freemblk__arp__resolv__fail,
56602535Ssangeeta 		    nce_t *, nce, ire_t *, ire_mp);
56612535Ssangeeta 		/*
56622535Ssangeeta 		 * Send the icmp_unreachable messages for the queued mblks in
56632535Ssangeeta 		 * ire->ire_nce->nce_qd_mp, since ARP resolution failed
56642535Ssangeeta 		 * for this ire
56652535Ssangeeta 		 */
56662535Ssangeeta 		arp_resolv_failed(nce);
56672535Ssangeeta 		/*
56682535Ssangeeta 		 * Delete the nce and clean up all ire's pointing at this nce
56692535Ssangeeta 		 * in the cachetable
56702535Ssangeeta 		 */
56712535Ssangeeta 		ndp_delete(nce);
56722535Ssangeeta 	}
56732535Ssangeeta 	if (nce != NULL)
56742535Ssangeeta 		NCE_REFRELE(nce); /* release the ref taken by ndp_lookup_v4 */
56752535Ssangeeta 
56762535Ssangeeta cleanup:
56772535Ssangeeta 	/*
56782535Ssangeeta 	 * Get rid of the ire buffer
56792535Ssangeeta 	 * We call kmem_free here(instead of ire_delete()), since
56802535Ssangeeta 	 * this is the freeb's callback.
56812535Ssangeeta 	 */
56822535Ssangeeta 	kmem_free(ire_mp, sizeof (ire_t) + sizeof (frtn_t));
56832535Ssangeeta }
56842535Ssangeeta 
56853772Ssangeeta /*
56864714Ssowmini  * find, or create if needed, a neighbor cache entry nce_t for IRE_CACHE and
56874714Ssowmini  * non-loopback IRE_BROADCAST ire's.
56884714Ssowmini  *
56894714Ssowmini  * If a neighbor-cache entry has to be created (i.e., one does not already
56904714Ssowmini  * exist in the nce list) the nce_res_mp and nce_state of the neighbor cache
56914714Ssowmini  * entry are initialized in ndp_add_v4(). These values are picked from
56924714Ssowmini  * the src_nce, if one is passed in. Otherwise (if src_nce == NULL) the
56934714Ssowmini  * ire->ire_type and the outgoing interface (ire_to_ill(ire)) values
56944714Ssowmini  * determine the {nce_state, nce_res_mp} of the nce_t created. All
56954714Ssowmini  * IRE_BROADCAST entries have nce_state = ND_REACHABLE, and the nce_res_mp
56964714Ssowmini  * is set to the ill_bcast_mp of the outgoing inerface. For unicast ire
56974714Ssowmini  * entries,
56984714Ssowmini  *   - if the outgoing interface is of type IRE_IF_RESOLVER, a newly created
56994714Ssowmini  *     nce_t will have a null nce_res_mp, and will be in the ND_INITIAL state.
57004714Ssowmini  *   - if the outgoing interface is a IRE_IF_NORESOLVER interface, no link
57014714Ssowmini  *     layer resolution is necessary, so that the nce_t will be in the
57024714Ssowmini  *     ND_REACHABLE state and the nce_res_mp will have a copy of the
57034714Ssowmini  *     ill_resolver_mp of the outgoing interface.
57044714Ssowmini  *
57054714Ssowmini  * The link layer information needed for broadcast addresses, and for
57064714Ssowmini  * packets sent on IRE_IF_NORESOLVER interfaces is a constant mapping that
57074714Ssowmini  * never needs re-verification for the lifetime of the nce_t. These are
57084714Ssowmini  * therefore marked NCE_F_PERMANENT, and never allowed to expire via
57094714Ssowmini  * NCE_EXPIRED.
57104714Ssowmini  *
57114714Ssowmini  * IRE_CACHE ire's contain the information for  the nexthop (ire_gateway_addr)
57124714Ssowmini  * in the case of indirect routes, and for the dst itself (ire_addr) in the
57132535Ssangeeta  * case of direct routes, with the nce_res_mp containing a template
57142535Ssangeeta  * DL_UNITDATA request.
57152535Ssangeeta  *
57162535Ssangeeta  * The actual association of the ire_nce to the nce created here is
57172535Ssangeeta  * typically done in ire_add_v4 for IRE_CACHE entries. Exceptions
57182535Ssangeeta  * to this rule are SO_DONTROUTE ire's (IRE_MARK_NO_ADD), for which
57194823Sseb  * the ire_nce assignment is done in ire_add_then_send.
57202535Ssangeeta  */
57212535Ssangeeta int
57224714Ssowmini ire_nce_init(ire_t *ire, nce_t *src_nce)
57232535Ssangeeta {
57244714Ssowmini 	in_addr_t	addr4;
57252535Ssangeeta 	int		err;
57264714Ssowmini 	nce_t		*nce = NULL;
57272535Ssangeeta 	ill_t		*ire_ill;
57284714Ssowmini 	uint16_t	nce_flags = 0;
57293448Sdh155122 	ip_stack_t	*ipst;
57302535Ssangeeta 
57314714Ssowmini 	if (ire->ire_stq == NULL)
57322535Ssangeeta 		return (0); /* no need to create nce for local/loopback */
57334714Ssowmini 
57342535Ssangeeta 	switch (ire->ire_type) {
57352535Ssangeeta 	case IRE_CACHE:
57362535Ssangeeta 		if (ire->ire_gateway_addr != INADDR_ANY)
57372535Ssangeeta 			addr4 = ire->ire_gateway_addr; /* 'G' route */
57382535Ssangeeta 		else
57392535Ssangeeta 			addr4 = ire->ire_addr; /* direct route */
57402535Ssangeeta 		break;
57412535Ssangeeta 	case IRE_BROADCAST:
57422535Ssangeeta 		addr4 = ire->ire_addr;
57434714Ssowmini 		nce_flags |= (NCE_F_PERMANENT|NCE_F_BCAST);
57442535Ssangeeta 		break;
57452535Ssangeeta 	default:
57462535Ssangeeta 		return (0);
57472535Ssangeeta 	}
57482535Ssangeeta 
57492535Ssangeeta 	/*
57502535Ssangeeta 	 * ire_ipif is picked based on RTF_SETSRC, usesrc etc.
57512535Ssangeeta 	 * rules in ire_forward_src_ipif. We want the dlureq_mp
57522535Ssangeeta 	 * for the outgoing interface, which we get from the ire_stq.
57532535Ssangeeta 	 */
57542535Ssangeeta 	ire_ill = ire_to_ill(ire);
57553448Sdh155122 	ipst = ire_ill->ill_ipst;
57562535Ssangeeta 
57572535Ssangeeta 	/*
57584714Ssowmini 	 * IRE_IF_NORESOLVER entries never need re-verification and
57594714Ssowmini 	 * do not expire, so we mark them as NCE_F_PERMANENT.
57602535Ssangeeta 	 */
57614714Ssowmini 	if (ire_ill->ill_net_type == IRE_IF_NORESOLVER)
57624714Ssowmini 		nce_flags |= NCE_F_PERMANENT;
57632535Ssangeeta 
57644084Ssowmini retry_nce:
57654714Ssowmini 	err = ndp_lookup_then_add_v4(ire_ill, &addr4, nce_flags,
57664714Ssowmini 	    &nce, src_nce);
57674714Ssowmini 
57684714Ssowmini 	if (err == EEXIST && NCE_EXPIRED(nce, ipst)) {
57694084Ssowmini 		/*
57704084Ssowmini 		 * We looked up an expired nce.
57714084Ssowmini 		 * Go back and try to create one again.
57724084Ssowmini 		 */
57734714Ssowmini 		ndp_delete(nce);
57744714Ssowmini 		NCE_REFRELE(nce);
57754714Ssowmini 		nce = NULL;
57764084Ssowmini 		goto retry_nce;
57774084Ssowmini 	}
57784084Ssowmini 
57794714Ssowmini 	ip1dbg(("ire 0x%p addr 0x%lx type 0x%x; found nce 0x%p err %d\n",
57804714Ssowmini 	    (void *)ire, (ulong_t)addr4, ire->ire_type, (void *)nce, err));
57812535Ssangeeta 
57822535Ssangeeta 	switch (err) {
57832535Ssangeeta 	case 0:
57842535Ssangeeta 	case EEXIST:
57852535Ssangeeta 		/*
57864714Ssowmini 		 * return a pointer to a newly created or existing nce_t;
57872535Ssangeeta 		 * note that the ire-nce mapping is many-one, i.e.,
57884714Ssowmini 		 * multiple ire's could point to the same nce_t.
57892535Ssangeeta 		 */
57902535Ssangeeta 		break;
57912535Ssangeeta 	default:
57922535Ssangeeta 		DTRACE_PROBE2(nce__init__fail, ill_t *, ire_ill, int, err);
57932535Ssangeeta 		return (EINVAL);
57942535Ssangeeta 	}
57952535Ssangeeta 	if (ire->ire_type == IRE_BROADCAST) {
57962535Ssangeeta 		/*
57972535Ssangeeta 		 * Two bcast ires are created for each interface;
57982535Ssangeeta 		 * 1. loopback copy (which does not  have an
57992535Ssangeeta 		 *    ire_stq, and therefore has no ire_nce), and,
58002535Ssangeeta 		 * 2. the non-loopback copy, which has the nce_res_mp
58012535Ssangeeta 		 *    initialized to a copy of the ill_bcast_mp, and
58022535Ssangeeta 		 *    is marked as ND_REACHABLE at this point.
58032535Ssangeeta 		 *    This nce does not undergo any further state changes,
58042535Ssangeeta 		 *    and exists as long as the interface is plumbed.
58052535Ssangeeta 		 * Note: we do the ire_nce assignment here for IRE_BROADCAST
58062535Ssangeeta 		 * because some functions like ill_mark_bcast() inline the
58074714Ssowmini 		 * ire_add functionality.
58082535Ssangeeta 		 */
58094714Ssowmini 		ire->ire_nce = nce;
58102535Ssangeeta 		/*
58112535Ssangeeta 		 * We are associating this nce to the ire,
58122535Ssangeeta 		 * so change the nce ref taken in
58132535Ssangeeta 		 * ndp_lookup_then_add_v4() from
58142535Ssangeeta 		 * NCE_REFHOLD to NCE_REFHOLD_NOTR
58152535Ssangeeta 		 */
58162535Ssangeeta 		NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce);
58172535Ssangeeta 	} else {
58184084Ssowmini 		/*
58194084Ssowmini 		 * We are not using this nce_t just yet so release
58204084Ssowmini 		 * the ref taken in ndp_lookup_then_add_v4()
58214084Ssowmini 		 */
58224714Ssowmini 		NCE_REFRELE(nce);
58232535Ssangeeta 	}
58242535Ssangeeta 	return (0);
58252535Ssangeeta }
5826