xref: /onnv-gate/usr/src/uts/common/inet/ip/ip_ire.c (revision 8778:b4169d2ab299)
10Sstevel@tonic-gate /*
20Sstevel@tonic-gate  * CDDL HEADER START
30Sstevel@tonic-gate  *
40Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
51676Sjpk  * Common Development and Distribution License (the "License").
61676Sjpk  * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate  *
80Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate  * See the License for the specific language governing permissions
110Sstevel@tonic-gate  * and limitations under the License.
120Sstevel@tonic-gate  *
130Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate  *
190Sstevel@tonic-gate  * CDDL HEADER END
200Sstevel@tonic-gate  */
210Sstevel@tonic-gate /*
228485SPeter.Memishian@Sun.COM  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
230Sstevel@tonic-gate  * Use is subject to license terms.
240Sstevel@tonic-gate  */
250Sstevel@tonic-gate /* Copyright (c) 1990 Mentat Inc. */
260Sstevel@tonic-gate 
270Sstevel@tonic-gate /*
280Sstevel@tonic-gate  * This file contains routines that manipulate Internet Routing Entries (IREs).
290Sstevel@tonic-gate  */
300Sstevel@tonic-gate 
310Sstevel@tonic-gate #include <sys/types.h>
320Sstevel@tonic-gate #include <sys/stream.h>
330Sstevel@tonic-gate #include <sys/stropts.h>
348485SPeter.Memishian@Sun.COM #include <sys/strsun.h>
35*8778SErik.Nordmark@Sun.COM #include <sys/strsubr.h>
360Sstevel@tonic-gate #include <sys/ddi.h>
370Sstevel@tonic-gate #include <sys/cmn_err.h>
380Sstevel@tonic-gate #include <sys/policy.h>
390Sstevel@tonic-gate 
400Sstevel@tonic-gate #include <sys/systm.h>
410Sstevel@tonic-gate #include <sys/kmem.h>
420Sstevel@tonic-gate #include <sys/param.h>
430Sstevel@tonic-gate #include <sys/socket.h>
440Sstevel@tonic-gate #include <net/if.h>
450Sstevel@tonic-gate #include <net/route.h>
460Sstevel@tonic-gate #include <netinet/in.h>
470Sstevel@tonic-gate #include <net/if_dl.h>
480Sstevel@tonic-gate #include <netinet/ip6.h>
490Sstevel@tonic-gate #include <netinet/icmp6.h>
500Sstevel@tonic-gate 
510Sstevel@tonic-gate #include <inet/common.h>
520Sstevel@tonic-gate #include <inet/mi.h>
530Sstevel@tonic-gate #include <inet/ip.h>
540Sstevel@tonic-gate #include <inet/ip6.h>
550Sstevel@tonic-gate #include <inet/ip_ndp.h>
562535Ssangeeta #include <inet/arp.h>
570Sstevel@tonic-gate #include <inet/ip_if.h>
580Sstevel@tonic-gate #include <inet/ip_ire.h>
592535Ssangeeta #include <inet/ip_ftable.h>
600Sstevel@tonic-gate #include <inet/ip_rts.h>
610Sstevel@tonic-gate #include <inet/nd.h>
620Sstevel@tonic-gate 
630Sstevel@tonic-gate #include <net/pfkeyv2.h>
640Sstevel@tonic-gate #include <inet/ipsec_info.h>
650Sstevel@tonic-gate #include <inet/sadb.h>
660Sstevel@tonic-gate #include <inet/tcp.h>
670Sstevel@tonic-gate #include <inet/ipclassifier.h>
680Sstevel@tonic-gate #include <sys/zone.h>
693448Sdh155122 #include <sys/cpuvar.h>
703448Sdh155122 
711676Sjpk #include <sys/tsol/label.h>
721676Sjpk #include <sys/tsol/tnet.h>
731676Sjpk 
/*
 * kmem cache handle.
 * NOTE(review): judging by the name, this presumably backs the rt_entry
 * structures that form the leaves of the IPv4 radix-tree forwarding
 * table -- confirm at the place where the cache is created.
 */
struct kmem_cache *rt_entry_cache;
752535Ssangeeta 
760Sstevel@tonic-gate /*
770Sstevel@tonic-gate  * Synchronization notes:
780Sstevel@tonic-gate  *
790Sstevel@tonic-gate  * The fields of the ire_t struct are protected in the following way :
800Sstevel@tonic-gate  *
810Sstevel@tonic-gate  * ire_next/ire_ptpn
820Sstevel@tonic-gate  *
830Sstevel@tonic-gate  *	- bucket lock of the respective tables (cache or forwarding tables).
840Sstevel@tonic-gate  *
850Sstevel@tonic-gate  * ire_mp, ire_rfq, ire_stq, ire_u *except* ire_gateway_addr[v6], ire_mask,
860Sstevel@tonic-gate  * ire_type, ire_create_time, ire_masklen, ire_ipversion, ire_flags, ire_ipif,
870Sstevel@tonic-gate  * ire_ihandle, ire_phandle, ire_nce, ire_bucket, ire_in_ill, ire_in_src_addr
880Sstevel@tonic-gate  *
890Sstevel@tonic-gate  *	- Set in ire_create_v4/v6 and never changes after that. Thus,
900Sstevel@tonic-gate  *	  we don't need a lock whenever these fields are accessed.
910Sstevel@tonic-gate  *
920Sstevel@tonic-gate  *	- ire_bucket and ire_masklen (also set in ire_create) is set in
930Sstevel@tonic-gate  *        ire_add_v4/ire_add_v6 before inserting in the bucket and never
940Sstevel@tonic-gate  *        changes after that. Thus we don't need a lock whenever these
950Sstevel@tonic-gate  *	  fields are accessed.
960Sstevel@tonic-gate  *
970Sstevel@tonic-gate  * ire_gateway_addr_v4[v6]
980Sstevel@tonic-gate  *
 *	- ire_gateway_addr_v4[v6] is set during ire_create and later modified
 *	  by rts_setgwr[v6]. As ire_gateway_addr is a uint32_t, updates to
 *	  it are assumed to be atomic and hence the other parts of the code
 *	  do not use any locks. ire_gateway_addr_v6 updates are not atomic
 *	  and hence any access to it uses ire_lock to get/set the right value.
1040Sstevel@tonic-gate  *
1050Sstevel@tonic-gate  * ire_ident, ire_refcnt
1060Sstevel@tonic-gate  *
1070Sstevel@tonic-gate  *	- Updated atomically using atomic_add_32
1080Sstevel@tonic-gate  *
1090Sstevel@tonic-gate  * ire_ssthresh, ire_rtt_sd, ire_rtt, ire_ib_pkt_count, ire_ob_pkt_count
1100Sstevel@tonic-gate  *
1110Sstevel@tonic-gate  *	- Assumes that 32 bit writes are atomic. No locks. ire_lock is
1120Sstevel@tonic-gate  *	  used to serialize updates to ire_ssthresh, ire_rtt_sd, ire_rtt.
1130Sstevel@tonic-gate  *
1140Sstevel@tonic-gate  * ire_max_frag, ire_frag_flag
1150Sstevel@tonic-gate  *
1160Sstevel@tonic-gate  *	- ire_lock is used to set/read both of them together.
1170Sstevel@tonic-gate  *
1180Sstevel@tonic-gate  * ire_tire_mark
1190Sstevel@tonic-gate  *
1200Sstevel@tonic-gate  *	- Set in ire_create and updated in ire_expire, which is called
1210Sstevel@tonic-gate  *	  by only one function namely ip_trash_timer_expire. Thus only
1220Sstevel@tonic-gate  *	  one function updates and examines the value.
1230Sstevel@tonic-gate  *
1240Sstevel@tonic-gate  * ire_marks
1250Sstevel@tonic-gate  *	- bucket lock protects this.
1260Sstevel@tonic-gate  *
1270Sstevel@tonic-gate  * ire_ipsec_overhead/ire_ll_hdr_length
1280Sstevel@tonic-gate  *
1290Sstevel@tonic-gate  *	- Place holder for returning the information to the upper layers
1300Sstevel@tonic-gate  *	  when IRE_DB_REQ comes down.
1310Sstevel@tonic-gate  *
1320Sstevel@tonic-gate  *
1330Sstevel@tonic-gate  * ipv6_ire_default_count is protected by the bucket lock of
1340Sstevel@tonic-gate  * ip_forwarding_table_v6[0][0].
1350Sstevel@tonic-gate  *
 * ipv6_ire_default_index is not protected as it is just a hint
 * at which default gateway to use. There is nothing
 * wrong in using the same gateway for two different connections.
1390Sstevel@tonic-gate  *
1400Sstevel@tonic-gate  * As we always hold the bucket locks in all the places while accessing
1410Sstevel@tonic-gate  * the above values, it is natural to use them for protecting them.
1420Sstevel@tonic-gate  *
 * We have a separate cache table and forwarding table for IPv4 and IPv6.
 * Cache table (ip_cache_table/ip_cache_table_v6) is a pointer to an
 * array of irb_t structures. The IPv6 forwarding table
 * (ip_forwarding_table_v6) is an array of pointers to arrays of irb_t
 * structures. ip_forwarding_table_v6 is allocated dynamically in
 * ire_add_v6. ire_ft_init_lock is used to serialize multiple threads
 * initializing the same bucket. Once a bucket is initialized, it is never
 * deallocated. This assumption enables us to access
 * ip_forwarding_table_v6[i] without any locks.
1520Sstevel@tonic-gate  *
1535335Ssowmini  * The forwarding table for IPv4 is a radix tree whose leaves
1545335Ssowmini  * are rt_entry structures containing the irb_t for the rt_dst. The irb_t
1555335Ssowmini  * for IPv4 is dynamically allocated and freed.
1565335Ssowmini  *
 * Each irb_t - ire bucket structure has a lock to protect
 * a bucket and the ires residing in the bucket have a back pointer to
 * the bucket structure. It also has a reference count for the number
 * of threads walking the bucket - irb_refcnt which is bumped up
 * using the IRB_REFHOLD macro. The flags irb_flags can be
 * set to IRE_MARK_CONDEMNED indicating that there are some ires
 * in this bucket that are marked with IRE_MARK_CONDEMNED and the
 * last thread to leave the bucket should delete the ires. Usually
 * this is done by the IRB_REFRELE macro which is used to decrement
 * the reference count on a bucket. See comments above irb_t structure
 * definition in ip.h for further details.
1680Sstevel@tonic-gate  *
1690Sstevel@tonic-gate  * IRE_REFHOLD/IRE_REFRELE macros operate on the ire which increments/
1700Sstevel@tonic-gate  * decrements the reference count, ire_refcnt, atomically on the ire.
1710Sstevel@tonic-gate  * ire_refcnt is modified only using this macro. Operations on the IRE
1720Sstevel@tonic-gate  * could be described as follows :
1730Sstevel@tonic-gate  *
1740Sstevel@tonic-gate  * CREATE an ire with reference count initialized to 1.
1750Sstevel@tonic-gate  *
1760Sstevel@tonic-gate  * ADDITION of an ire holds the bucket lock, checks for duplicates
1770Sstevel@tonic-gate  * and then adds the ire. ire_add_v4/ire_add_v6 returns the ire after
1780Sstevel@tonic-gate  * bumping up once more i.e the reference count is 2. This is to avoid
1790Sstevel@tonic-gate  * an extra lookup in the functions calling ire_add which wants to
1800Sstevel@tonic-gate  * work with the ire after adding.
1810Sstevel@tonic-gate  *
 * LOOKUP of an ire bumps up the reference count using IRE_REFHOLD
 * macro. It is valid to bump up the reference count of the IRE,
 * after the lookup has returned an ire. Following are the lookup
 * functions that return a HELD ire :
1860Sstevel@tonic-gate  *
1870Sstevel@tonic-gate  * ire_lookup_local[_v6], ire_ctable_lookup[_v6], ire_ftable_lookup[_v6],
1880Sstevel@tonic-gate  * ire_cache_lookup[_v6], ire_lookup_multi[_v6], ire_route_lookup[_v6],
1894823Sseb  * ipif_to_ire[_v6].
1900Sstevel@tonic-gate  *
1910Sstevel@tonic-gate  * DELETION of an ire holds the bucket lock, removes it from the list
1920Sstevel@tonic-gate  * and then decrements the reference count for having removed from the list
1930Sstevel@tonic-gate  * by using the IRE_REFRELE macro. If some other thread has looked up
1940Sstevel@tonic-gate  * the ire, the reference count would have been bumped up and hence
1950Sstevel@tonic-gate  * this ire will not be freed once deleted. It will be freed once the
1960Sstevel@tonic-gate  * reference count drops to zero.
1970Sstevel@tonic-gate  *
1980Sstevel@tonic-gate  * Add and Delete acquires the bucket lock as RW_WRITER, while all the
1990Sstevel@tonic-gate  * lookups acquire the bucket lock as RW_READER.
2000Sstevel@tonic-gate  *
 * NOTE : The only functions that do the IRE_REFRELE when an ire is
 *	  passed as an argument are :
2030Sstevel@tonic-gate  *
2040Sstevel@tonic-gate  *	  1) ip_wput_ire : This is because it IRE_REFHOLD/RELEs the
2050Sstevel@tonic-gate  *			   broadcast ires it looks up internally within
2060Sstevel@tonic-gate  *			   the function. Currently, for simplicity it does
2070Sstevel@tonic-gate  *			   not differentiate the one that is passed in and
2080Sstevel@tonic-gate  *			   the ones it looks up internally. It always
2090Sstevel@tonic-gate  *			   IRE_REFRELEs.
2100Sstevel@tonic-gate  *	  2) ire_send
2110Sstevel@tonic-gate  *	     ire_send_v6 : As ire_send calls ip_wput_ire and other functions
2120Sstevel@tonic-gate  *			   that take ire as an argument, it has to selectively
2130Sstevel@tonic-gate  *			   IRE_REFRELE the ire. To maintain symmetry,
2140Sstevel@tonic-gate  *			   ire_send_v6 does the same.
2150Sstevel@tonic-gate  *
2160Sstevel@tonic-gate  * Otherwise, the general rule is to do the IRE_REFRELE in the function
2170Sstevel@tonic-gate  * that is passing the ire as an argument.
2180Sstevel@tonic-gate  *
2190Sstevel@tonic-gate  * In trying to locate ires the following points are to be noted.
2200Sstevel@tonic-gate  *
2210Sstevel@tonic-gate  * IRE_MARK_CONDEMNED signifies that the ire has been logically deleted and is
2220Sstevel@tonic-gate  * to be ignored when walking the ires using ire_next.
2230Sstevel@tonic-gate  *
2240Sstevel@tonic-gate  * Zones note:
2250Sstevel@tonic-gate  *	Walking IREs within a given zone also walks certain ires in other
2260Sstevel@tonic-gate  *	zones.  This is done intentionally.  IRE walks with a specified
2270Sstevel@tonic-gate  *	zoneid are used only when doing informational reports, and
2280Sstevel@tonic-gate  *	zone users want to see things that they can access. See block
2290Sstevel@tonic-gate  *	comment in ire_walk_ill_match().
2300Sstevel@tonic-gate  */
2310Sstevel@tonic-gate 
2320Sstevel@tonic-gate /*
2330Sstevel@tonic-gate  * The minimum size of IRE cache table.  It will be recalcuated in
2340Sstevel@tonic-gate  * ip_ire_init().
2353448Sdh155122  * Setable in /etc/system
2360Sstevel@tonic-gate  */
2370Sstevel@tonic-gate uint32_t ip_cache_table_size = IP_CACHE_TABLE_SIZE;
2380Sstevel@tonic-gate uint32_t ip6_cache_table_size = IP6_CACHE_TABLE_SIZE;
2390Sstevel@tonic-gate 
2400Sstevel@tonic-gate /*
2410Sstevel@tonic-gate  * The size of the forwarding table.  We will make sure that it is a
2420Sstevel@tonic-gate  * power of 2 in ip_ire_init().
2433448Sdh155122  * Setable in /etc/system
2440Sstevel@tonic-gate  */
2450Sstevel@tonic-gate uint32_t ip6_ftable_hash_size = IP6_FTABLE_HASH_SIZE;
2460Sstevel@tonic-gate 
2470Sstevel@tonic-gate struct	kmem_cache	*ire_cache;
2480Sstevel@tonic-gate static ire_t	ire_null;
2490Sstevel@tonic-gate 
2500Sstevel@tonic-gate /*
2510Sstevel@tonic-gate  * The threshold number of IRE in a bucket when the IREs are
2520Sstevel@tonic-gate  * cleaned up.  This threshold is calculated later in ip_open()
2530Sstevel@tonic-gate  * based on the speed of CPU and available memory.  This default
2540Sstevel@tonic-gate  * value is the maximum.
2550Sstevel@tonic-gate  *
2560Sstevel@tonic-gate  * We have two kinds of cached IRE, temporary and
2570Sstevel@tonic-gate  * non-temporary.  Temporary IREs are marked with
2580Sstevel@tonic-gate  * IRE_MARK_TEMPORARY.  They are IREs created for non
2590Sstevel@tonic-gate  * TCP traffic and for forwarding purposes.  All others
2600Sstevel@tonic-gate  * are non-temporary IREs.  We don't mark IRE created for
2610Sstevel@tonic-gate  * TCP as temporary because TCP is stateful and there are
2620Sstevel@tonic-gate  * info stored in the IRE which can be shared by other TCP
2630Sstevel@tonic-gate  * connections to the same destination.  For connected
2640Sstevel@tonic-gate  * endpoint, we also don't want to mark the IRE used as
2650Sstevel@tonic-gate  * temporary because the same IRE will be used frequently,
2660Sstevel@tonic-gate  * otherwise, the app should not do a connect().  We change
2670Sstevel@tonic-gate  * the marking at ip_bind_connected_*() if necessary.
2680Sstevel@tonic-gate  *
2690Sstevel@tonic-gate  * We want to keep the cache IRE hash bucket length reasonably
2700Sstevel@tonic-gate  * short, otherwise IRE lookup functions will take "forever."
2710Sstevel@tonic-gate  * We use the "crude" function that the IRE bucket
2720Sstevel@tonic-gate  * length should be based on the CPU speed, which is 1 entry
2730Sstevel@tonic-gate  * per x MHz, depending on the shift factor ip_ire_cpu_ratio
2740Sstevel@tonic-gate  * (n).  This means that with a 750MHz CPU, the max bucket
2750Sstevel@tonic-gate  * length can be (750 >> n) entries.
2760Sstevel@tonic-gate  *
2770Sstevel@tonic-gate  * Note that this threshold is separate for temp and non-temp
2780Sstevel@tonic-gate  * IREs.  This means that the actual bucket length can be
2790Sstevel@tonic-gate  * twice as that.  And while we try to keep temporary IRE
2800Sstevel@tonic-gate  * length at most at the threshold value, we do not attempt to
2810Sstevel@tonic-gate  * make the length for non-temporary IREs fixed, for the
2820Sstevel@tonic-gate  * reason stated above.  Instead, we start trying to find
2830Sstevel@tonic-gate  * "unused" non-temporary IREs when the bucket length reaches
2840Sstevel@tonic-gate  * this threshold and clean them up.
2850Sstevel@tonic-gate  *
2860Sstevel@tonic-gate  * We also want to limit the amount of memory used by
2870Sstevel@tonic-gate  * IREs.  So if we are allowed to use ~3% of memory (M)
2880Sstevel@tonic-gate  * for those IREs, each bucket should not have more than
2890Sstevel@tonic-gate  *
2900Sstevel@tonic-gate  * 	M / num of cache bucket / sizeof (ire_t)
2910Sstevel@tonic-gate  *
2920Sstevel@tonic-gate  * Again the above memory uses are separate for temp and
2930Sstevel@tonic-gate  * non-temp cached IREs.
2940Sstevel@tonic-gate  *
2950Sstevel@tonic-gate  * We may also want the limit to be a function of the number
2960Sstevel@tonic-gate  * of interfaces and number of CPUs.  Doing the initialization
2970Sstevel@tonic-gate  * in ip_open() means that every time an interface is plumbed,
2980Sstevel@tonic-gate  * the max is re-calculated.  Right now, we don't do anything
2990Sstevel@tonic-gate  * different.  In future, when we have more experience, we
3000Sstevel@tonic-gate  * may want to change this behavior.
3010Sstevel@tonic-gate  */
/* Default (maximum) per-bucket IRE threshold, IPv4 and IPv6. */
uint32_t ip_ire_max_bucket_cnt = 10;	/* Settable in /etc/system */
uint32_t ip6_ire_max_bucket_cnt = 10;
/* NOTE(review): meaning not documented here; see ire_cache_cleanup(). */
uint32_t ip_ire_cleanup_cnt = 2;

/*
 * The minimum of the temporary IRE bucket count.  We do not want
 * the length of each bucket to be too short.  This may hurt
 * performance of some apps as the temporary IREs are removed too
 * often.
 */
uint32_t ip_ire_min_bucket_cnt = 3;	/* /etc/system - not used */
uint32_t ip6_ire_min_bucket_cnt = 3;

/*
 * The ratio of memory consumed by IRE used for temporary to available
 * memory.  This is a shift factor, so 6 means the ratio 1 to 64.  This
 * value can be changed in /etc/system.  6 is a reasonable number.
 */
uint32_t ip_ire_mem_ratio = 6;	/* /etc/system */
/* The shift factor for CPU speed to calculate the max IRE bucket length. */
uint32_t ip_ire_cpu_ratio = 7;	/* /etc/system */
3230Sstevel@tonic-gate 
3242535Ssangeeta typedef struct nce_clookup_s {
3252535Ssangeeta 	ipaddr_t ncecl_addr;
3262535Ssangeeta 	boolean_t ncecl_found;
3272535Ssangeeta } nce_clookup_t;
3282535Ssangeeta 
/*
 * The maximum number of buckets in IRE cache table.  In future, we may
 * want to make it a dynamic hash table.  For the moment, we fix the
 * size and allocate the table in ip_ire_init() when IP is first loaded.
 * We take into account the amount of memory a system has.
 */
#define	IP_MAX_CACHE_TABLE_SIZE	4096

/* Settable in /etc/system */
static uint32_t	ip_max_cache_table_size = IP_MAX_CACHE_TABLE_SIZE;
static uint32_t	ip6_max_cache_table_size = IP_MAX_CACHE_TABLE_SIZE;
3400Sstevel@tonic-gate 
3410Sstevel@tonic-gate /* Zero iulp_t for initialization. */
3420Sstevel@tonic-gate const iulp_t	ire_uinfo_null = { 0 };
3430Sstevel@tonic-gate 
3440Sstevel@tonic-gate static int	ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp,
3452535Ssangeeta     ipsq_func_t func, boolean_t);
3460Sstevel@tonic-gate static void	ire_delete_v4(ire_t *ire);
3471676Sjpk static void	ire_walk_ipvers(pfv_t func, void *arg, uchar_t vers,
3483448Sdh155122     zoneid_t zoneid, ip_stack_t *);
3490Sstevel@tonic-gate static void	ire_walk_ill_ipvers(uint_t match_flags, uint_t ire_type,
3501676Sjpk     pfv_t func, void *arg, uchar_t vers, ill_t *ill);
3515388Sja97890 static void	ire_cache_cleanup(irb_t *irb, uint32_t threshold,
3525388Sja97890     ire_t *ref_ire);
3532535Ssangeeta static	void	ip_nce_clookup_and_delete(nce_t *nce, void *arg);
3547880SJonathan.Anderson@Sun.COM static	ire_t	*ip4_ctable_lookup_impl(ire_ctable_args_t *margs);
3555023Scarlsonj #ifdef DEBUG
3565023Scarlsonj static void	ire_trace_cleanup(const ire_t *);
3570Sstevel@tonic-gate #endif
3580Sstevel@tonic-gate 
3590Sstevel@tonic-gate /*
3600Sstevel@tonic-gate  * To avoid bloating the code, we call this function instead of
3610Sstevel@tonic-gate  * using the macro IRE_REFRELE. Use macro only in performance
3620Sstevel@tonic-gate  * critical paths.
3630Sstevel@tonic-gate  *
3640Sstevel@tonic-gate  * Must not be called while holding any locks. Otherwise if this is
3650Sstevel@tonic-gate  * the last reference to be released there is a chance of recursive mutex
3660Sstevel@tonic-gate  * panic due to ire_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying
3670Sstevel@tonic-gate  * to restart an ioctl. The one exception is when the caller is sure that
3680Sstevel@tonic-gate  * this is not the last reference to be released. Eg. if the caller is
3690Sstevel@tonic-gate  * sure that the ire has not been deleted and won't be deleted.
3700Sstevel@tonic-gate  */
3710Sstevel@tonic-gate void
3720Sstevel@tonic-gate ire_refrele(ire_t *ire)
3730Sstevel@tonic-gate {
3740Sstevel@tonic-gate 	IRE_REFRELE(ire);
3750Sstevel@tonic-gate }
3760Sstevel@tonic-gate 
3770Sstevel@tonic-gate void
3780Sstevel@tonic-gate ire_refrele_notr(ire_t *ire)
3790Sstevel@tonic-gate {
3800Sstevel@tonic-gate 	IRE_REFRELE_NOTR(ire);
3810Sstevel@tonic-gate }
3820Sstevel@tonic-gate 
3830Sstevel@tonic-gate /*
3840Sstevel@tonic-gate  * kmem_cache_alloc constructor for IRE in kma space.
3850Sstevel@tonic-gate  * Note that when ire_mp is set the IRE is stored in that mblk and
3860Sstevel@tonic-gate  * not in this cache.
3870Sstevel@tonic-gate  */
3880Sstevel@tonic-gate /* ARGSUSED */
3890Sstevel@tonic-gate static int
3900Sstevel@tonic-gate ip_ire_constructor(void *buf, void *cdrarg, int kmflags)
3910Sstevel@tonic-gate {
3920Sstevel@tonic-gate 	ire_t	*ire = buf;
3930Sstevel@tonic-gate 
3942535Ssangeeta 	ire->ire_nce = NULL;
3950Sstevel@tonic-gate 
3960Sstevel@tonic-gate 	return (0);
3970Sstevel@tonic-gate }
3980Sstevel@tonic-gate 
3990Sstevel@tonic-gate /* ARGSUSED1 */
4000Sstevel@tonic-gate static void
4010Sstevel@tonic-gate ip_ire_destructor(void *buf, void *cdrarg)
4020Sstevel@tonic-gate {
4030Sstevel@tonic-gate 	ire_t	*ire = buf;
4040Sstevel@tonic-gate 
4052535Ssangeeta 	ASSERT(ire->ire_nce == NULL);
4060Sstevel@tonic-gate }
4070Sstevel@tonic-gate 
4080Sstevel@tonic-gate /*
4090Sstevel@tonic-gate  * This function is associated with the IP_IOC_IRE_ADVISE_NO_REPLY
4100Sstevel@tonic-gate  * IOCTL.  It is used by TCP (or other ULPs) to supply revised information
4110Sstevel@tonic-gate  * for an existing CACHED IRE.
4120Sstevel@tonic-gate  */
4130Sstevel@tonic-gate /* ARGSUSED */
4140Sstevel@tonic-gate int
4150Sstevel@tonic-gate ip_ire_advise(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
4160Sstevel@tonic-gate {
4170Sstevel@tonic-gate 	uchar_t	*addr_ucp;
4180Sstevel@tonic-gate 	ipic_t	*ipic;
4190Sstevel@tonic-gate 	ire_t	*ire;
4200Sstevel@tonic-gate 	ipaddr_t	addr;
4210Sstevel@tonic-gate 	in6_addr_t	v6addr;
4220Sstevel@tonic-gate 	irb_t	*irb;
4230Sstevel@tonic-gate 	zoneid_t	zoneid;
4243448Sdh155122 	ip_stack_t	*ipst = CONNQ_TO_IPST(q);
4250Sstevel@tonic-gate 
4260Sstevel@tonic-gate 	ASSERT(q->q_next == NULL);
4270Sstevel@tonic-gate 	zoneid = Q_TO_CONN(q)->conn_zoneid;
4280Sstevel@tonic-gate 
4290Sstevel@tonic-gate 	/*
4300Sstevel@tonic-gate 	 * Check privilege using the ioctl credential; if it is NULL
4310Sstevel@tonic-gate 	 * then this is a kernel message and therefor privileged.
4320Sstevel@tonic-gate 	 */
4333448Sdh155122 	if (ioc_cr != NULL && secpolicy_ip_config(ioc_cr, B_FALSE) != 0)
4340Sstevel@tonic-gate 		return (EPERM);
4350Sstevel@tonic-gate 
4360Sstevel@tonic-gate 	ipic = (ipic_t *)mp->b_rptr;
4370Sstevel@tonic-gate 	if (!(addr_ucp = mi_offset_param(mp, ipic->ipic_addr_offset,
4380Sstevel@tonic-gate 	    ipic->ipic_addr_length))) {
4390Sstevel@tonic-gate 		return (EINVAL);
4400Sstevel@tonic-gate 	}
4410Sstevel@tonic-gate 	if (!OK_32PTR(addr_ucp))
4420Sstevel@tonic-gate 		return (EINVAL);
4430Sstevel@tonic-gate 	switch (ipic->ipic_addr_length) {
4440Sstevel@tonic-gate 	case IP_ADDR_LEN: {
4450Sstevel@tonic-gate 		/* Extract the destination address. */
4460Sstevel@tonic-gate 		addr = *(ipaddr_t *)addr_ucp;
4470Sstevel@tonic-gate 		/* Find the corresponding IRE. */
4483448Sdh155122 		ire = ire_cache_lookup(addr, zoneid, NULL, ipst);
4490Sstevel@tonic-gate 		break;
4500Sstevel@tonic-gate 	}
4510Sstevel@tonic-gate 	case IPV6_ADDR_LEN: {
4520Sstevel@tonic-gate 		/* Extract the destination address. */
4530Sstevel@tonic-gate 		v6addr = *(in6_addr_t *)addr_ucp;
4540Sstevel@tonic-gate 		/* Find the corresponding IRE. */
4553448Sdh155122 		ire = ire_cache_lookup_v6(&v6addr, zoneid, NULL, ipst);
4560Sstevel@tonic-gate 		break;
4570Sstevel@tonic-gate 	}
4580Sstevel@tonic-gate 	default:
4590Sstevel@tonic-gate 		return (EINVAL);
4600Sstevel@tonic-gate 	}
4610Sstevel@tonic-gate 
4620Sstevel@tonic-gate 	if (ire == NULL)
4630Sstevel@tonic-gate 		return (ENOENT);
4640Sstevel@tonic-gate 	/*
4650Sstevel@tonic-gate 	 * Update the round trip time estimate and/or the max frag size
4660Sstevel@tonic-gate 	 * and/or the slow start threshold.
4670Sstevel@tonic-gate 	 *
4680Sstevel@tonic-gate 	 * We serialize multiple advises using ire_lock.
4690Sstevel@tonic-gate 	 */
4700Sstevel@tonic-gate 	mutex_enter(&ire->ire_lock);
4710Sstevel@tonic-gate 	if (ipic->ipic_rtt) {
4720Sstevel@tonic-gate 		/*
4730Sstevel@tonic-gate 		 * If there is no old cached values, initialize them
4740Sstevel@tonic-gate 		 * conservatively.  Set them to be (1.5 * new value).
4750Sstevel@tonic-gate 		 */
4760Sstevel@tonic-gate 		if (ire->ire_uinfo.iulp_rtt != 0) {
4770Sstevel@tonic-gate 			ire->ire_uinfo.iulp_rtt = (ire->ire_uinfo.iulp_rtt +
4780Sstevel@tonic-gate 			    ipic->ipic_rtt) >> 1;
4790Sstevel@tonic-gate 		} else {
4800Sstevel@tonic-gate 			ire->ire_uinfo.iulp_rtt = ipic->ipic_rtt +
4810Sstevel@tonic-gate 			    (ipic->ipic_rtt >> 1);
4820Sstevel@tonic-gate 		}
4830Sstevel@tonic-gate 		if (ire->ire_uinfo.iulp_rtt_sd != 0) {
4840Sstevel@tonic-gate 			ire->ire_uinfo.iulp_rtt_sd =
4850Sstevel@tonic-gate 			    (ire->ire_uinfo.iulp_rtt_sd +
4860Sstevel@tonic-gate 			    ipic->ipic_rtt_sd) >> 1;
4870Sstevel@tonic-gate 		} else {
4880Sstevel@tonic-gate 			ire->ire_uinfo.iulp_rtt_sd = ipic->ipic_rtt_sd +
4890Sstevel@tonic-gate 			    (ipic->ipic_rtt_sd >> 1);
4900Sstevel@tonic-gate 		}
4910Sstevel@tonic-gate 	}
4920Sstevel@tonic-gate 	if (ipic->ipic_max_frag)
4930Sstevel@tonic-gate 		ire->ire_max_frag = MIN(ipic->ipic_max_frag, IP_MAXPACKET);
4940Sstevel@tonic-gate 	if (ipic->ipic_ssthresh != 0) {
4950Sstevel@tonic-gate 		if (ire->ire_uinfo.iulp_ssthresh != 0)
4960Sstevel@tonic-gate 			ire->ire_uinfo.iulp_ssthresh =
4970Sstevel@tonic-gate 			    (ipic->ipic_ssthresh +
4980Sstevel@tonic-gate 			    ire->ire_uinfo.iulp_ssthresh) >> 1;
4990Sstevel@tonic-gate 		else
5000Sstevel@tonic-gate 			ire->ire_uinfo.iulp_ssthresh = ipic->ipic_ssthresh;
5010Sstevel@tonic-gate 	}
5020Sstevel@tonic-gate 	/*
5030Sstevel@tonic-gate 	 * Don't need the ire_lock below this. ire_type does not change
5040Sstevel@tonic-gate 	 * after initialization. ire_marks is protected by irb_lock.
5050Sstevel@tonic-gate 	 */
5060Sstevel@tonic-gate 	mutex_exit(&ire->ire_lock);
5070Sstevel@tonic-gate 
5080Sstevel@tonic-gate 	if (ipic->ipic_ire_marks != 0 && ire->ire_type == IRE_CACHE) {
5090Sstevel@tonic-gate 		/*
5100Sstevel@tonic-gate 		 * Only increment the temporary IRE count if the original
5110Sstevel@tonic-gate 		 * IRE is not already marked temporary.
5120Sstevel@tonic-gate 		 */
5130Sstevel@tonic-gate 		irb = ire->ire_bucket;
5140Sstevel@tonic-gate 		rw_enter(&irb->irb_lock, RW_WRITER);
5150Sstevel@tonic-gate 		if ((ipic->ipic_ire_marks & IRE_MARK_TEMPORARY) &&
5160Sstevel@tonic-gate 		    !(ire->ire_marks & IRE_MARK_TEMPORARY)) {
5170Sstevel@tonic-gate 			irb->irb_tmp_ire_cnt++;
5180Sstevel@tonic-gate 		}
5190Sstevel@tonic-gate 		ire->ire_marks |= ipic->ipic_ire_marks;
5200Sstevel@tonic-gate 		rw_exit(&irb->irb_lock);
5210Sstevel@tonic-gate 	}
5220Sstevel@tonic-gate 
5230Sstevel@tonic-gate 	ire_refrele(ire);
5240Sstevel@tonic-gate 	return (0);
5250Sstevel@tonic-gate }
5260Sstevel@tonic-gate 
5270Sstevel@tonic-gate /*
5280Sstevel@tonic-gate  * This function is associated with the IP_IOC_IRE_DELETE[_NO_REPLY]
5290Sstevel@tonic-gate  * IOCTL[s].  The NO_REPLY form is used by TCP to delete a route IRE
5300Sstevel@tonic-gate  * for a host that is not responding.  This will force an attempt to
5312612Scarlsonj  * establish a new route, if available, and flush out the ARP entry so
5322612Scarlsonj  * it will re-resolve.  Management processes may want to use the
5332612Scarlsonj  * version that generates a reply.
5340Sstevel@tonic-gate  *
5350Sstevel@tonic-gate  * This function does not support IPv6 since Neighbor Unreachability Detection
5360Sstevel@tonic-gate  * means that negative advise like this is useless.
5370Sstevel@tonic-gate  */
5380Sstevel@tonic-gate /* ARGSUSED */
5390Sstevel@tonic-gate int
5400Sstevel@tonic-gate ip_ire_delete(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
5410Sstevel@tonic-gate {
5422535Ssangeeta 	uchar_t		*addr_ucp;
5430Sstevel@tonic-gate 	ipaddr_t	addr;
5442535Ssangeeta 	ire_t		*ire;
5452535Ssangeeta 	ipid_t		*ipid;
5462535Ssangeeta 	boolean_t	routing_sock_info = B_FALSE;	/* Sent info? */
5470Sstevel@tonic-gate 	zoneid_t	zoneid;
5482535Ssangeeta 	ire_t		*gire = NULL;
5492612Scarlsonj 	ill_t		*ill;
5502612Scarlsonj 	mblk_t		*arp_mp;
5513448Sdh155122 	ip_stack_t	*ipst;
5520Sstevel@tonic-gate 
5530Sstevel@tonic-gate 	ASSERT(q->q_next == NULL);
5540Sstevel@tonic-gate 	zoneid = Q_TO_CONN(q)->conn_zoneid;
5553448Sdh155122 	ipst = CONNQ_TO_IPST(q);
5560Sstevel@tonic-gate 
5570Sstevel@tonic-gate 	/*
5580Sstevel@tonic-gate 	 * Check privilege using the ioctl credential; if it is NULL
5590Sstevel@tonic-gate 	 * then this is a kernel message and therefor privileged.
5600Sstevel@tonic-gate 	 */
5613448Sdh155122 	if (ioc_cr != NULL && secpolicy_ip_config(ioc_cr, B_FALSE) != 0)
5620Sstevel@tonic-gate 		return (EPERM);
5630Sstevel@tonic-gate 
5640Sstevel@tonic-gate 	ipid = (ipid_t *)mp->b_rptr;
5650Sstevel@tonic-gate 
5660Sstevel@tonic-gate 	/* Only actions on IRE_CACHEs are acceptable at present. */
5670Sstevel@tonic-gate 	if (ipid->ipid_ire_type != IRE_CACHE)
5680Sstevel@tonic-gate 		return (EINVAL);
5690Sstevel@tonic-gate 
5700Sstevel@tonic-gate 	addr_ucp = mi_offset_param(mp, ipid->ipid_addr_offset,
5714714Ssowmini 	    ipid->ipid_addr_length);
5720Sstevel@tonic-gate 	if (addr_ucp == NULL || !OK_32PTR(addr_ucp))
5730Sstevel@tonic-gate 		return (EINVAL);
5740Sstevel@tonic-gate 	switch (ipid->ipid_addr_length) {
5750Sstevel@tonic-gate 	case IP_ADDR_LEN:
5760Sstevel@tonic-gate 		/* addr_ucp points at IP addr */
5770Sstevel@tonic-gate 		break;
5780Sstevel@tonic-gate 	case sizeof (sin_t): {
5790Sstevel@tonic-gate 		sin_t	*sin;
5800Sstevel@tonic-gate 		/*
5810Sstevel@tonic-gate 		 * got complete (sockaddr) address - increment addr_ucp to point
5820Sstevel@tonic-gate 		 * at the ip_addr field.
5830Sstevel@tonic-gate 		 */
5840Sstevel@tonic-gate 		sin = (sin_t *)addr_ucp;
5850Sstevel@tonic-gate 		addr_ucp = (uchar_t *)&sin->sin_addr.s_addr;
5860Sstevel@tonic-gate 		break;
5870Sstevel@tonic-gate 	}
5880Sstevel@tonic-gate 	default:
5890Sstevel@tonic-gate 		return (EINVAL);
5900Sstevel@tonic-gate 	}
5910Sstevel@tonic-gate 	/* Extract the destination address. */
5920Sstevel@tonic-gate 	bcopy(addr_ucp, &addr, IP_ADDR_LEN);
5930Sstevel@tonic-gate 
5940Sstevel@tonic-gate 	/* Try to find the CACHED IRE. */
5953448Sdh155122 	ire = ire_cache_lookup(addr, zoneid, NULL, ipst);
5960Sstevel@tonic-gate 
5970Sstevel@tonic-gate 	/* Nail it. */
5980Sstevel@tonic-gate 	if (ire) {
5990Sstevel@tonic-gate 		/* Allow delete only on CACHE entries */
6000Sstevel@tonic-gate 		if (ire->ire_type != IRE_CACHE) {
6010Sstevel@tonic-gate 			ire_refrele(ire);
6020Sstevel@tonic-gate 			return (EINVAL);
6030Sstevel@tonic-gate 		}
6040Sstevel@tonic-gate 
6050Sstevel@tonic-gate 		/*
6060Sstevel@tonic-gate 		 * Verify that the IRE has been around for a while.
6070Sstevel@tonic-gate 		 * This is to protect against transport protocols
6080Sstevel@tonic-gate 		 * that are too eager in sending delete messages.
6090Sstevel@tonic-gate 		 */
6100Sstevel@tonic-gate 		if (gethrestime_sec() <
6113448Sdh155122 		    ire->ire_create_time + ipst->ips_ip_ignore_delete_time) {
6120Sstevel@tonic-gate 			ire_refrele(ire);
6130Sstevel@tonic-gate 			return (EINVAL);
6140Sstevel@tonic-gate 		}
6150Sstevel@tonic-gate 		/*
6160Sstevel@tonic-gate 		 * Now we have a potentially dead cache entry. We need
6170Sstevel@tonic-gate 		 * to remove it.
6182535Ssangeeta 		 * If this cache entry is generated from a
6192535Ssangeeta 		 * default route (i.e., ire_cmask == 0),
6200Sstevel@tonic-gate 		 * search the default list and mark it dead and some
6210Sstevel@tonic-gate 		 * background process will try to activate it.
6220Sstevel@tonic-gate 		 */
6230Sstevel@tonic-gate 		if ((ire->ire_gateway_addr != 0) && (ire->ire_cmask == 0)) {
6240Sstevel@tonic-gate 			/*
6250Sstevel@tonic-gate 			 * Make sure that we pick a different
6260Sstevel@tonic-gate 			 * IRE_DEFAULT next time.
6270Sstevel@tonic-gate 			 */
6280Sstevel@tonic-gate 			ire_t *gw_ire;
6292535Ssangeeta 			irb_t *irb = NULL;
6302535Ssangeeta 			uint_t match_flags;
6312535Ssangeeta 
6322535Ssangeeta 			match_flags = (MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE);
6332535Ssangeeta 
6342535Ssangeeta 			gire = ire_ftable_lookup(ire->ire_addr,
6352535Ssangeeta 			    ire->ire_cmask, 0, 0,
6363448Sdh155122 			    ire->ire_ipif, NULL, zoneid, 0, NULL, match_flags,
6373448Sdh155122 			    ipst);
6382535Ssangeeta 
6392535Ssangeeta 			ip3dbg(("ire_ftable_lookup() returned gire %p\n",
6402535Ssangeeta 			    (void *)gire));
6412535Ssangeeta 
6422535Ssangeeta 			if (gire != NULL) {
6432535Ssangeeta 				irb = gire->ire_bucket;
6440Sstevel@tonic-gate 
6450Sstevel@tonic-gate 				/*
6460Sstevel@tonic-gate 				 * We grab it as writer just to serialize
6470Sstevel@tonic-gate 				 * multiple threads trying to bump up
6482535Ssangeeta 				 * irb_rr_origin
6490Sstevel@tonic-gate 				 */
6500Sstevel@tonic-gate 				rw_enter(&irb->irb_lock, RW_WRITER);
6512535Ssangeeta 				if ((gw_ire = irb->irb_rr_origin) == NULL) {
6520Sstevel@tonic-gate 					rw_exit(&irb->irb_lock);
6530Sstevel@tonic-gate 					goto done;
6540Sstevel@tonic-gate 				}
6552535Ssangeeta 
6562894Ssowmini 				DTRACE_PROBE1(ip__ire__del__origin,
6572894Ssowmini 				    (ire_t *), gw_ire);
6580Sstevel@tonic-gate 
6590Sstevel@tonic-gate 				/* Skip past the potentially bad gateway */
6600Sstevel@tonic-gate 				if (ire->ire_gateway_addr ==
6612894Ssowmini 				    gw_ire->ire_gateway_addr) {
6622894Ssowmini 					ire_t *next = gw_ire->ire_next;
6632894Ssowmini 
6642894Ssowmini 					DTRACE_PROBE2(ip__ire__del,
6652894Ssowmini 					    (ire_t *), gw_ire, (irb_t *), irb);
6662894Ssowmini 					IRE_FIND_NEXT_ORIGIN(next);
6672894Ssowmini 					irb->irb_rr_origin = next;
6682894Ssowmini 				}
6690Sstevel@tonic-gate 				rw_exit(&irb->irb_lock);
6702535Ssangeeta 			}
6710Sstevel@tonic-gate 		}
6720Sstevel@tonic-gate done:
6732535Ssangeeta 		if (gire != NULL)
6742535Ssangeeta 			IRE_REFRELE(gire);
6750Sstevel@tonic-gate 		/* report the bad route to routing sockets */
6760Sstevel@tonic-gate 		ip_rts_change(RTM_LOSING, ire->ire_addr, ire->ire_gateway_addr,
6770Sstevel@tonic-gate 		    ire->ire_mask, ire->ire_src_addr, 0, 0, 0,
6783448Sdh155122 		    (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA), ipst);
6790Sstevel@tonic-gate 		routing_sock_info = B_TRUE;
6802612Scarlsonj 
6812612Scarlsonj 		/*
6822612Scarlsonj 		 * TCP is really telling us to start over completely, and it
6832612Scarlsonj 		 * expects that we'll resend the ARP query.  Tell ARP to
6842612Scarlsonj 		 * discard the entry, if this is a local destination.
6857398SZhijun.Fu@Sun.COM 		 *
6867398SZhijun.Fu@Sun.COM 		 * But, if the ARP entry is permanent then it shouldn't be
6877398SZhijun.Fu@Sun.COM 		 * deleted, so we set ARED_F_PRESERVE_PERM.
6882612Scarlsonj 		 */
6892612Scarlsonj 		ill = ire->ire_stq->q_ptr;
6902612Scarlsonj 		if (ire->ire_gateway_addr == 0 &&
6912612Scarlsonj 		    (arp_mp = ill_ared_alloc(ill, addr)) != NULL) {
6927398SZhijun.Fu@Sun.COM 			ared_t *ared = (ared_t *)arp_mp->b_rptr;
6937398SZhijun.Fu@Sun.COM 
6947398SZhijun.Fu@Sun.COM 			ASSERT(ared->ared_cmd == AR_ENTRY_DELETE);
6957398SZhijun.Fu@Sun.COM 			ared->ared_flags |= ARED_F_PRESERVE_PERM;
6962612Scarlsonj 			putnext(ill->ill_rq, arp_mp);
6972612Scarlsonj 		}
6982612Scarlsonj 
6990Sstevel@tonic-gate 		ire_delete(ire);
7000Sstevel@tonic-gate 		ire_refrele(ire);
7010Sstevel@tonic-gate 	}
7023004Sdd193516 	/*
7033004Sdd193516 	 * Also look for an IRE_HOST type redirect ire and
7043004Sdd193516 	 * remove it if present.
7053004Sdd193516 	 */
7063004Sdd193516 	ire = ire_route_lookup(addr, 0, 0, IRE_HOST, NULL, NULL,
7073448Sdh155122 	    ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
7080Sstevel@tonic-gate 
7090Sstevel@tonic-gate 	/* Nail it. */
7103004Sdd193516 	if (ire != NULL) {
7114714Ssowmini 		if (ire->ire_flags & RTF_DYNAMIC) {
7124714Ssowmini 			if (!routing_sock_info) {
7134714Ssowmini 				ip_rts_change(RTM_LOSING, ire->ire_addr,
7144714Ssowmini 				    ire->ire_gateway_addr, ire->ire_mask,
7154714Ssowmini 				    ire->ire_src_addr, 0, 0, 0,
7164714Ssowmini 				    (RTA_DST | RTA_GATEWAY |
7174714Ssowmini 				    RTA_NETMASK | RTA_IFA),
7184714Ssowmini 				    ipst);
7194714Ssowmini 			}
7204714Ssowmini 			ire_delete(ire);
7210Sstevel@tonic-gate 		}
7224714Ssowmini 		ire_refrele(ire);
7230Sstevel@tonic-gate 	}
7240Sstevel@tonic-gate 	return (0);
7250Sstevel@tonic-gate }
7260Sstevel@tonic-gate 
7270Sstevel@tonic-gate /*
7280Sstevel@tonic-gate  * ip_ire_req is called by ip_wput when an IRE_DB_REQ_TYPE message is handed
7290Sstevel@tonic-gate  * down from the Upper Level Protocol to request a copy of the IRE (to check
7300Sstevel@tonic-gate  * its type or to extract information like round-trip time estimates or the
7310Sstevel@tonic-gate  * MTU.)
7320Sstevel@tonic-gate  * The address is assumed to be in the ire_addr field. If no IRE is found
7330Sstevel@tonic-gate  * an IRE is returned with ire_type being zero.
7340Sstevel@tonic-gate  * Note that the upper lavel protocol has to check for broadcast
7350Sstevel@tonic-gate  * (IRE_BROADCAST) and multicast (CLASSD(addr)).
7360Sstevel@tonic-gate  * If there is a b_cont the resulting IRE_DB_TYPE mblk is placed at the
7370Sstevel@tonic-gate  * end of the returned message.
7380Sstevel@tonic-gate  *
7390Sstevel@tonic-gate  * TCP sends down a message of this type with a connection request packet
7400Sstevel@tonic-gate  * chained on. UDP and ICMP send it down to verify that a route exists for
7410Sstevel@tonic-gate  * the destination address when they get connected.
7420Sstevel@tonic-gate  */
7430Sstevel@tonic-gate void
7440Sstevel@tonic-gate ip_ire_req(queue_t *q, mblk_t *mp)
7450Sstevel@tonic-gate {
7460Sstevel@tonic-gate 	ire_t	*inire;
7470Sstevel@tonic-gate 	ire_t	*ire;
7480Sstevel@tonic-gate 	mblk_t	*mp1;
7490Sstevel@tonic-gate 	ire_t	*sire = NULL;
7500Sstevel@tonic-gate 	zoneid_t zoneid = Q_TO_CONN(q)->conn_zoneid;
7513448Sdh155122 	ip_stack_t	*ipst = CONNQ_TO_IPST(q);
7523448Sdh155122 
7533448Sdh155122 	ASSERT(q->q_next == NULL);
7540Sstevel@tonic-gate 
7550Sstevel@tonic-gate 	if ((mp->b_wptr - mp->b_rptr) < sizeof (ire_t) ||
7560Sstevel@tonic-gate 	    !OK_32PTR(mp->b_rptr)) {
7570Sstevel@tonic-gate 		freemsg(mp);
7580Sstevel@tonic-gate 		return;
7590Sstevel@tonic-gate 	}
7600Sstevel@tonic-gate 	inire = (ire_t *)mp->b_rptr;
7610Sstevel@tonic-gate 	/*
7620Sstevel@tonic-gate 	 * Got it, now take our best shot at an IRE.
7630Sstevel@tonic-gate 	 */
7640Sstevel@tonic-gate 	if (inire->ire_ipversion == IPV6_VERSION) {
7650Sstevel@tonic-gate 		ire = ire_route_lookup_v6(&inire->ire_addr_v6, 0, 0, 0,
7661676Sjpk 		    NULL, &sire, zoneid, NULL,
7673448Sdh155122 		    (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT), ipst);
7680Sstevel@tonic-gate 	} else {
7690Sstevel@tonic-gate 		ASSERT(inire->ire_ipversion == IPV4_VERSION);
7700Sstevel@tonic-gate 		ire = ire_route_lookup(inire->ire_addr, 0, 0, 0,
7711676Sjpk 		    NULL, &sire, zoneid, NULL,
7723448Sdh155122 		    (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT), ipst);
7730Sstevel@tonic-gate 	}
7740Sstevel@tonic-gate 
7750Sstevel@tonic-gate 	/*
7760Sstevel@tonic-gate 	 * We prevent returning IRES with source address INADDR_ANY
7770Sstevel@tonic-gate 	 * as these were temporarily created for sending packets
7780Sstevel@tonic-gate 	 * from endpoints that have conn_unspec_src set.
7790Sstevel@tonic-gate 	 */
7800Sstevel@tonic-gate 	if (ire == NULL ||
7810Sstevel@tonic-gate 	    (ire->ire_ipversion == IPV4_VERSION &&
7820Sstevel@tonic-gate 	    ire->ire_src_addr == INADDR_ANY) ||
7830Sstevel@tonic-gate 	    (ire->ire_ipversion == IPV6_VERSION &&
7840Sstevel@tonic-gate 	    IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6))) {
7850Sstevel@tonic-gate 		inire->ire_type = 0;
7860Sstevel@tonic-gate 	} else {
7870Sstevel@tonic-gate 		bcopy(ire, inire, sizeof (ire_t));
7880Sstevel@tonic-gate 		/* Copy the route metrics from the parent. */
7890Sstevel@tonic-gate 		if (sire != NULL) {
7900Sstevel@tonic-gate 			bcopy(&(sire->ire_uinfo), &(inire->ire_uinfo),
7910Sstevel@tonic-gate 			    sizeof (iulp_t));
7920Sstevel@tonic-gate 		}
7930Sstevel@tonic-gate 
7940Sstevel@tonic-gate 		/*
7950Sstevel@tonic-gate 		 * As we don't lookup global policy here, we may not
7960Sstevel@tonic-gate 		 * pass the right size if per-socket policy is not
7970Sstevel@tonic-gate 		 * present. For these cases, path mtu discovery will
7980Sstevel@tonic-gate 		 * do the right thing.
7990Sstevel@tonic-gate 		 */
8000Sstevel@tonic-gate 		inire->ire_ipsec_overhead = conn_ipsec_length(Q_TO_CONN(q));
8010Sstevel@tonic-gate 
8020Sstevel@tonic-gate 		/* Pass the latest setting of the ip_path_mtu_discovery */
8033448Sdh155122 		inire->ire_frag_flag |=
8043448Sdh155122 		    (ipst->ips_ip_path_mtu_discovery) ? IPH_DF : 0;
8050Sstevel@tonic-gate 	}
8060Sstevel@tonic-gate 	if (ire != NULL)
8070Sstevel@tonic-gate 		ire_refrele(ire);
8080Sstevel@tonic-gate 	if (sire != NULL)
8090Sstevel@tonic-gate 		ire_refrele(sire);
8100Sstevel@tonic-gate 	mp->b_wptr = &mp->b_rptr[sizeof (ire_t)];
8110Sstevel@tonic-gate 	mp->b_datap->db_type = IRE_DB_TYPE;
8120Sstevel@tonic-gate 
8130Sstevel@tonic-gate 	/* Put the IRE_DB_TYPE mblk last in the chain */
8140Sstevel@tonic-gate 	mp1 = mp->b_cont;
8150Sstevel@tonic-gate 	if (mp1 != NULL) {
8160Sstevel@tonic-gate 		mp->b_cont = NULL;
8170Sstevel@tonic-gate 		linkb(mp1, mp);
8180Sstevel@tonic-gate 		mp = mp1;
8190Sstevel@tonic-gate 	}
8200Sstevel@tonic-gate 	qreply(q, mp);
8210Sstevel@tonic-gate }
8220Sstevel@tonic-gate 
8230Sstevel@tonic-gate /*
8240Sstevel@tonic-gate  * Send a packet using the specified IRE.
8250Sstevel@tonic-gate  * If ire_src_addr_v6 is all zero then discard the IRE after
8260Sstevel@tonic-gate  * the packet has been sent.
8270Sstevel@tonic-gate  */
8280Sstevel@tonic-gate static void
8290Sstevel@tonic-gate ire_send(queue_t *q, mblk_t *pkt, ire_t *ire)
8300Sstevel@tonic-gate {
8310Sstevel@tonic-gate 	mblk_t *ipsec_mp;
8320Sstevel@tonic-gate 	boolean_t is_secure;
8330Sstevel@tonic-gate 	uint_t ifindex;
8340Sstevel@tonic-gate 	ill_t	*ill;
8352733Snordmark 	zoneid_t zoneid = ire->ire_zoneid;
8363448Sdh155122 	ip_stack_t	*ipst = ire->ire_ipst;
8370Sstevel@tonic-gate 
8380Sstevel@tonic-gate 	ASSERT(ire->ire_ipversion == IPV4_VERSION);
8392733Snordmark 	ASSERT(!(ire->ire_type & IRE_LOCAL)); /* Has different ire_zoneid */
8400Sstevel@tonic-gate 	ipsec_mp = pkt;
8410Sstevel@tonic-gate 	is_secure = (pkt->b_datap->db_type == M_CTL);
8422733Snordmark 	if (is_secure) {
8432733Snordmark 		ipsec_out_t *io;
8442733Snordmark 
8450Sstevel@tonic-gate 		pkt = pkt->b_cont;
8462733Snordmark 		io = (ipsec_out_t *)ipsec_mp->b_rptr;
8472733Snordmark 		if (io->ipsec_out_type == IPSEC_OUT)
8482733Snordmark 			zoneid = io->ipsec_out_zoneid;
8492733Snordmark 	}
8500Sstevel@tonic-gate 
8510Sstevel@tonic-gate 	/* If the packet originated externally then */
8520Sstevel@tonic-gate 	if (pkt->b_prev) {
8530Sstevel@tonic-gate 		ire_refrele(ire);
8540Sstevel@tonic-gate 		/*
8550Sstevel@tonic-gate 		 * Extract the ifindex from b_prev (set in ip_rput_noire).
8560Sstevel@tonic-gate 		 * Look up interface to see if it still exists (it could have
8570Sstevel@tonic-gate 		 * been unplumbed by the time the reply came back from ARP)
8580Sstevel@tonic-gate 		 */
8590Sstevel@tonic-gate 		ifindex = (uint_t)(uintptr_t)pkt->b_prev;
8600Sstevel@tonic-gate 		ill = ill_lookup_on_ifindex(ifindex, B_FALSE,
8613448Sdh155122 		    NULL, NULL, NULL, NULL, ipst);
8620Sstevel@tonic-gate 		if (ill == NULL) {
8630Sstevel@tonic-gate 			pkt->b_prev = NULL;
8640Sstevel@tonic-gate 			pkt->b_next = NULL;
8650Sstevel@tonic-gate 			freemsg(ipsec_mp);
8660Sstevel@tonic-gate 			return;
8670Sstevel@tonic-gate 		}
8680Sstevel@tonic-gate 		q = ill->ill_rq;
8690Sstevel@tonic-gate 		pkt->b_prev = NULL;
8700Sstevel@tonic-gate 		/*
8710Sstevel@tonic-gate 		 * This packet has not gone through IPSEC processing
8720Sstevel@tonic-gate 		 * and hence we should not have any IPSEC message
8730Sstevel@tonic-gate 		 * prepended.
8740Sstevel@tonic-gate 		 */
8750Sstevel@tonic-gate 		ASSERT(ipsec_mp == pkt);
8762535Ssangeeta 		put(q, pkt);
8770Sstevel@tonic-gate 		ill_refrele(ill);
8780Sstevel@tonic-gate 	} else if (pkt->b_next) {
8790Sstevel@tonic-gate 		/* Packets from multicast router */
8800Sstevel@tonic-gate 		pkt->b_next = NULL;
8810Sstevel@tonic-gate 		/*
8820Sstevel@tonic-gate 		 * We never get the IPSEC_OUT while forwarding the
8830Sstevel@tonic-gate 		 * packet for multicast router.
8840Sstevel@tonic-gate 		 */
8850Sstevel@tonic-gate 		ASSERT(ipsec_mp == pkt);
8860Sstevel@tonic-gate 		ip_rput_forward(ire, (ipha_t *)pkt->b_rptr, ipsec_mp, NULL);
8870Sstevel@tonic-gate 		ire_refrele(ire);
8880Sstevel@tonic-gate 	} else {
8890Sstevel@tonic-gate 		/* Locally originated packets */
8907216Smeem 		boolean_t delete_ire = B_FALSE;
8910Sstevel@tonic-gate 		ipha_t *ipha = (ipha_t *)pkt->b_rptr;
8920Sstevel@tonic-gate 
8930Sstevel@tonic-gate 		/*
8947216Smeem 		 * If this IRE shouldn't be kept in the table (because its
8957216Smeem 		 * source address is unspecified), hold a reference to it so
8967216Smeem 		 * we can delete it even after e.g. ip_wput_ire() has dropped
8977216Smeem 		 * its reference.
8980Sstevel@tonic-gate 		 */
8997216Smeem 		if (!(ire->ire_marks & IRE_MARK_NOADD) &&
9007216Smeem 		    ire->ire_src_addr == INADDR_ANY) {
9017216Smeem 			delete_ire = B_TRUE;
9020Sstevel@tonic-gate 			IRE_REFHOLD(ire);
9030Sstevel@tonic-gate 		}
9047216Smeem 
9050Sstevel@tonic-gate 		/*
9060Sstevel@tonic-gate 		 * If we were resolving a router we can not use the
9070Sstevel@tonic-gate 		 * routers IRE for sending the packet (since it would
9080Sstevel@tonic-gate 		 * violate the uniqness of the IP idents) thus we
9090Sstevel@tonic-gate 		 * make another pass through ip_wput to create the IRE_CACHE
9100Sstevel@tonic-gate 		 * for the destination.
9110Sstevel@tonic-gate 		 * When IRE_MARK_NOADD is set, ire_add() is not called.
9120Sstevel@tonic-gate 		 * Thus ip_wput() will never find a ire and result in an
9130Sstevel@tonic-gate 		 * infinite loop. Thus we check whether IRE_MARK_NOADD is
9140Sstevel@tonic-gate 		 * is set. This also implies that IRE_MARK_NOADD can only be
9150Sstevel@tonic-gate 		 * used to send packets to directly connected hosts.
9160Sstevel@tonic-gate 		 */
9170Sstevel@tonic-gate 		if (ipha->ipha_dst != ire->ire_addr &&
9180Sstevel@tonic-gate 		    !(ire->ire_marks & IRE_MARK_NOADD)) {
9190Sstevel@tonic-gate 			ire_refrele(ire);	/* Held in ire_add */
9202733Snordmark 			if (CONN_Q(q)) {
9212733Snordmark 				(void) ip_output(Q_TO_CONN(q), ipsec_mp, q,
9222733Snordmark 				    IRE_SEND);
9232733Snordmark 			} else {
9242733Snordmark 				(void) ip_output((void *)(uintptr_t)zoneid,
9252733Snordmark 				    ipsec_mp, q, IRE_SEND);
9262733Snordmark 			}
9270Sstevel@tonic-gate 		} else {
9280Sstevel@tonic-gate 			if (is_secure) {
9290Sstevel@tonic-gate 				ipsec_out_t *oi;
9300Sstevel@tonic-gate 				ipha_t *ipha;
9310Sstevel@tonic-gate 
9320Sstevel@tonic-gate 				oi = (ipsec_out_t *)ipsec_mp->b_rptr;
9330Sstevel@tonic-gate 				ipha = (ipha_t *)ipsec_mp->b_cont->b_rptr;
9340Sstevel@tonic-gate 				if (oi->ipsec_out_proc_begin) {
9350Sstevel@tonic-gate 					/*
9360Sstevel@tonic-gate 					 * This is the case where
9370Sstevel@tonic-gate 					 * ip_wput_ipsec_out could not find
9380Sstevel@tonic-gate 					 * the IRE and recreated a new one.
9390Sstevel@tonic-gate 					 * As ip_wput_ipsec_out does ire
9400Sstevel@tonic-gate 					 * lookups, ire_refrele for the extra
9410Sstevel@tonic-gate 					 * bump in ire_add.
9420Sstevel@tonic-gate 					 */
9430Sstevel@tonic-gate 					ire_refrele(ire);
9440Sstevel@tonic-gate 					ip_wput_ipsec_out(q, ipsec_mp, ipha,
9450Sstevel@tonic-gate 					    NULL, NULL);
9460Sstevel@tonic-gate 				} else {
9470Sstevel@tonic-gate 					/*
9480Sstevel@tonic-gate 					 * IRE_REFRELE will be done in
9490Sstevel@tonic-gate 					 * ip_wput_ire.
9500Sstevel@tonic-gate 					 */
9510Sstevel@tonic-gate 					ip_wput_ire(q, ipsec_mp, ire, NULL,
9522733Snordmark 					    IRE_SEND, zoneid);
9530Sstevel@tonic-gate 				}
9540Sstevel@tonic-gate 			} else {
9550Sstevel@tonic-gate 				/*
9560Sstevel@tonic-gate 				 * IRE_REFRELE will be done in ip_wput_ire.
9570Sstevel@tonic-gate 				 */
9580Sstevel@tonic-gate 				ip_wput_ire(q, ipsec_mp, ire, NULL,
9592733Snordmark 				    IRE_SEND, zoneid);
9600Sstevel@tonic-gate 			}
9610Sstevel@tonic-gate 		}
9620Sstevel@tonic-gate 		/*
9630Sstevel@tonic-gate 		 * Special code to support sending a single packet with
9640Sstevel@tonic-gate 		 * conn_unspec_src using an IRE which has no source address.
9650Sstevel@tonic-gate 		 * The IRE is deleted here after sending the packet to avoid
9660Sstevel@tonic-gate 		 * having other code trip on it. But before we delete the
9670Sstevel@tonic-gate 		 * ire, somebody could have looked up this ire.
9680Sstevel@tonic-gate 		 * We prevent returning/using this IRE by the upper layers
9690Sstevel@tonic-gate 		 * by making checks to NULL source address in other places
9700Sstevel@tonic-gate 		 * like e.g ip_ire_append, ip_ire_req and ip_bind_connected.
9717216Smeem 		 * Though this does not completely prevent other threads
9720Sstevel@tonic-gate 		 * from using this ire, this should not cause any problems.
9730Sstevel@tonic-gate 		 */
9747216Smeem 		if (delete_ire) {
9757216Smeem 			ip1dbg(("ire_send: delete IRE\n"));
9767216Smeem 			ire_delete(ire);
9770Sstevel@tonic-gate 			ire_refrele(ire);	/* Held above */
9780Sstevel@tonic-gate 		}
9790Sstevel@tonic-gate 	}
9800Sstevel@tonic-gate }
9810Sstevel@tonic-gate 
9820Sstevel@tonic-gate /*
9830Sstevel@tonic-gate  * Send a packet using the specified IRE.
9840Sstevel@tonic-gate  * If ire_src_addr_v6 is all zero then discard the IRE after
9850Sstevel@tonic-gate  * the packet has been sent.
9860Sstevel@tonic-gate  */
9870Sstevel@tonic-gate static void
9880Sstevel@tonic-gate ire_send_v6(queue_t *q, mblk_t *pkt, ire_t *ire)
9890Sstevel@tonic-gate {
9900Sstevel@tonic-gate 	mblk_t *ipsec_mp;
9910Sstevel@tonic-gate 	boolean_t secure;
9920Sstevel@tonic-gate 	uint_t ifindex;
9932733Snordmark 	zoneid_t zoneid = ire->ire_zoneid;
9943448Sdh155122 	ip_stack_t	*ipst = ire->ire_ipst;
9950Sstevel@tonic-gate 
9960Sstevel@tonic-gate 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
9972733Snordmark 	ASSERT(!(ire->ire_type & IRE_LOCAL)); /* Has different ire_zoneid */
9980Sstevel@tonic-gate 	if (pkt->b_datap->db_type == M_CTL) {
9992733Snordmark 		ipsec_out_t *io;
10002733Snordmark 
10010Sstevel@tonic-gate 		ipsec_mp = pkt;
10020Sstevel@tonic-gate 		pkt = pkt->b_cont;
10030Sstevel@tonic-gate 		secure = B_TRUE;
10042733Snordmark 		io = (ipsec_out_t *)ipsec_mp->b_rptr;
10052733Snordmark 		if (io->ipsec_out_type == IPSEC_OUT)
10062733Snordmark 			zoneid = io->ipsec_out_zoneid;
10070Sstevel@tonic-gate 	} else {
10080Sstevel@tonic-gate 		ipsec_mp = pkt;
10090Sstevel@tonic-gate 		secure = B_FALSE;
10100Sstevel@tonic-gate 	}
10110Sstevel@tonic-gate 
10120Sstevel@tonic-gate 	/* If the packet originated externally then */
10130Sstevel@tonic-gate 	if (pkt->b_prev) {
10140Sstevel@tonic-gate 		ill_t	*ill;
10150Sstevel@tonic-gate 		/*
10160Sstevel@tonic-gate 		 * Extract the ifindex from b_prev (set in ip_rput_data_v6).
10170Sstevel@tonic-gate 		 * Look up interface to see if it still exists (it could have
10180Sstevel@tonic-gate 		 * been unplumbed by the time the reply came back from the
10192535Ssangeeta 		 * resolver).
10200Sstevel@tonic-gate 		 */
10210Sstevel@tonic-gate 		ifindex = (uint_t)(uintptr_t)pkt->b_prev;
10220Sstevel@tonic-gate 		ill = ill_lookup_on_ifindex(ifindex, B_TRUE,
10233448Sdh155122 		    NULL, NULL, NULL, NULL, ipst);
10240Sstevel@tonic-gate 		if (ill == NULL) {
10250Sstevel@tonic-gate 			pkt->b_prev = NULL;
10260Sstevel@tonic-gate 			pkt->b_next = NULL;
10270Sstevel@tonic-gate 			freemsg(ipsec_mp);
10280Sstevel@tonic-gate 			ire_refrele(ire);	/* Held in ire_add */
10290Sstevel@tonic-gate 			return;
10300Sstevel@tonic-gate 		}
10310Sstevel@tonic-gate 		q = ill->ill_rq;
10320Sstevel@tonic-gate 		pkt->b_prev = NULL;
10330Sstevel@tonic-gate 		/*
10340Sstevel@tonic-gate 		 * This packet has not gone through IPSEC processing
10350Sstevel@tonic-gate 		 * and hence we should not have any IPSEC message
10360Sstevel@tonic-gate 		 * prepended.
10370Sstevel@tonic-gate 		 */
10380Sstevel@tonic-gate 		ASSERT(ipsec_mp == pkt);
10390Sstevel@tonic-gate 		put(q, pkt);
10400Sstevel@tonic-gate 		ill_refrele(ill);
10410Sstevel@tonic-gate 	} else if (pkt->b_next) {
10420Sstevel@tonic-gate 		/* Packets from multicast router */
10430Sstevel@tonic-gate 		pkt->b_next = NULL;
10440Sstevel@tonic-gate 		/*
10450Sstevel@tonic-gate 		 * We never get the IPSEC_OUT while forwarding the
10460Sstevel@tonic-gate 		 * packet for multicast router.
10470Sstevel@tonic-gate 		 */
10480Sstevel@tonic-gate 		ASSERT(ipsec_mp == pkt);
10490Sstevel@tonic-gate 		/*
10500Sstevel@tonic-gate 		 * XXX TODO IPv6.
10510Sstevel@tonic-gate 		 */
10520Sstevel@tonic-gate 		freemsg(pkt);
10530Sstevel@tonic-gate #ifdef XXX
10540Sstevel@tonic-gate 		ip_rput_forward(ire, (ipha_t *)pkt->b_rptr, pkt, NULL);
10550Sstevel@tonic-gate #endif
10560Sstevel@tonic-gate 	} else {
10570Sstevel@tonic-gate 		if (secure) {
10580Sstevel@tonic-gate 			ipsec_out_t *oi;
10590Sstevel@tonic-gate 			ip6_t *ip6h;
10600Sstevel@tonic-gate 
10610Sstevel@tonic-gate 			oi = (ipsec_out_t *)ipsec_mp->b_rptr;
10620Sstevel@tonic-gate 			ip6h = (ip6_t *)ipsec_mp->b_cont->b_rptr;
10630Sstevel@tonic-gate 			if (oi->ipsec_out_proc_begin) {
10640Sstevel@tonic-gate 				/*
10650Sstevel@tonic-gate 				 * This is the case where
10660Sstevel@tonic-gate 				 * ip_wput_ipsec_out could not find
10670Sstevel@tonic-gate 				 * the IRE and recreated a new one.
10680Sstevel@tonic-gate 				 */
10690Sstevel@tonic-gate 				ip_wput_ipsec_out_v6(q, ipsec_mp, ip6h,
10700Sstevel@tonic-gate 				    NULL, NULL);
10710Sstevel@tonic-gate 			} else {
10722733Snordmark 				if (CONN_Q(q)) {
10732733Snordmark 					(void) ip_output_v6(Q_TO_CONN(q),
10742733Snordmark 					    ipsec_mp, q, IRE_SEND);
10752733Snordmark 				} else {
10762733Snordmark 					(void) ip_output_v6(
10772733Snordmark 					    (void *)(uintptr_t)zoneid,
10782733Snordmark 					    ipsec_mp, q, IRE_SEND);
10792733Snordmark 				}
10800Sstevel@tonic-gate 			}
10810Sstevel@tonic-gate 		} else {
10820Sstevel@tonic-gate 			/*
10830Sstevel@tonic-gate 			 * Send packets through ip_output_v6 so that any
10840Sstevel@tonic-gate 			 * ip6_info header can be processed again.
10850Sstevel@tonic-gate 			 */
10862733Snordmark 			if (CONN_Q(q)) {
10872733Snordmark 				(void) ip_output_v6(Q_TO_CONN(q), ipsec_mp, q,
10882733Snordmark 				    IRE_SEND);
10892733Snordmark 			} else {
10902733Snordmark 				(void) ip_output_v6((void *)(uintptr_t)zoneid,
10912733Snordmark 				    ipsec_mp, q, IRE_SEND);
10922733Snordmark 			}
10930Sstevel@tonic-gate 		}
10940Sstevel@tonic-gate 		/*
10950Sstevel@tonic-gate 		 * Special code to support sending a single packet with
10960Sstevel@tonic-gate 		 * conn_unspec_src using an IRE which has no source address.
10970Sstevel@tonic-gate 		 * The IRE is deleted here after sending the packet to avoid
10980Sstevel@tonic-gate 		 * having other code trip on it. But before we delete the
10990Sstevel@tonic-gate 		 * ire, somebody could have looked up this ire.
11000Sstevel@tonic-gate 		 * We prevent returning/using this IRE by the upper layers
11010Sstevel@tonic-gate 		 * by making checks to NULL source address in other places
11020Sstevel@tonic-gate 		 * like e.g ip_ire_append_v6, ip_ire_req and
11030Sstevel@tonic-gate 		 * ip_bind_connected_v6. Though, this does not completely
11040Sstevel@tonic-gate 		 * prevent other threads from using this ire, this should
11050Sstevel@tonic-gate 		 * not cause any problems.
11060Sstevel@tonic-gate 		 */
11070Sstevel@tonic-gate 		if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6)) {
11080Sstevel@tonic-gate 			ip1dbg(("ire_send_v6: delete IRE\n"));
11090Sstevel@tonic-gate 			ire_delete(ire);
11100Sstevel@tonic-gate 		}
11110Sstevel@tonic-gate 	}
11120Sstevel@tonic-gate 	ire_refrele(ire);	/* Held in ire_add */
11130Sstevel@tonic-gate }
11140Sstevel@tonic-gate 
11150Sstevel@tonic-gate /*
11160Sstevel@tonic-gate  * Make sure that IRE bucket does not get too long.
11170Sstevel@tonic-gate  * This can cause lock up because ire_cache_lookup()
11180Sstevel@tonic-gate  * may take "forever" to finish.
11190Sstevel@tonic-gate  *
11205388Sja97890  * We only remove a maximum of cnt IREs each time.  This
11215388Sja97890  * should keep the bucket length approximately constant,
11220Sstevel@tonic-gate  * depending on cnt.  This should be enough to defend
11230Sstevel@tonic-gate  * against DoS attack based on creating temporary IREs
11240Sstevel@tonic-gate  * (for forwarding and non-TCP traffic).
11250Sstevel@tonic-gate  *
11265388Sja97890  * We also pass in the address of the newly created IRE
11275388Sja97890  * as we do not want to remove this straight after adding
11285388Sja97890  * it. New IREs are normally added at the tail of the
11290Sstevel@tonic-gate  * bucket.  This means that we are removing the "oldest"
11305388Sja97890  * temporary IREs added.  Only if there are IREs with
11310Sstevel@tonic-gate  * the same ire_addr, do we not add it at the tail.  Refer
11320Sstevel@tonic-gate  * to ire_add_v*().  It should be OK for our purpose.
11330Sstevel@tonic-gate  *
11340Sstevel@tonic-gate  * For non-temporary cached IREs, we make sure that they
11350Sstevel@tonic-gate  * have not been used for some time (defined below), they
11360Sstevel@tonic-gate  * are non-local destinations, and there is no one using
11370Sstevel@tonic-gate  * them at the moment (refcnt == 1).
11380Sstevel@tonic-gate  *
11390Sstevel@tonic-gate  * The above means that the IRE bucket length may become
11400Sstevel@tonic-gate  * very long, consisting of mostly non-temporary IREs.
11410Sstevel@tonic-gate  * This can happen when the hash function does a bad job
11420Sstevel@tonic-gate  * so that most TCP connections cluster to a specific bucket.
11430Sstevel@tonic-gate  * This "hopefully" should never happen.  It can also
11440Sstevel@tonic-gate  * happen if most TCP connections have very long lives.
11450Sstevel@tonic-gate  * Even with the minimal hash table size of 256, there
11460Sstevel@tonic-gate  * has to be a lot of such connections to make the bucket
11470Sstevel@tonic-gate  * length unreasonably long.  This should probably not
11480Sstevel@tonic-gate  * happen either.  The third case in which this can happen is
11490Sstevel@tonic-gate  * when the machine is under attack, such as SYN flooding.
11500Sstevel@tonic-gate  * TCP should already have the proper mechanism to protect
11510Sstevel@tonic-gate  * that.  So we should be safe.
11520Sstevel@tonic-gate  *
11530Sstevel@tonic-gate  * This function is called by ire_add_then_send() after
11540Sstevel@tonic-gate  * a new IRE is added and the packet is sent.
11550Sstevel@tonic-gate  *
11560Sstevel@tonic-gate  * The idle cutoff interval is set to 60s.  It can be
11570Sstevel@tonic-gate  * changed using /etc/system.
11580Sstevel@tonic-gate  */
11590Sstevel@tonic-gate uint32_t ire_idle_cutoff_interval = 60000;
11600Sstevel@tonic-gate 
11610Sstevel@tonic-gate static void
11625388Sja97890 ire_cache_cleanup(irb_t *irb, uint32_t threshold, ire_t *ref_ire)
11630Sstevel@tonic-gate {
11640Sstevel@tonic-gate 	ire_t *ire;
11650Sstevel@tonic-gate 	clock_t cut_off = drv_usectohz(ire_idle_cutoff_interval * 1000);
11665388Sja97890 	int cnt = ip_ire_cleanup_cnt;
11670Sstevel@tonic-gate 
11680Sstevel@tonic-gate 	/*
11695388Sja97890 	 * Try to remove cnt temporary IREs first.
11700Sstevel@tonic-gate 	 */
11715388Sja97890 	for (ire = irb->irb_ire; cnt > 0 && ire != NULL; ire = ire->ire_next) {
11725388Sja97890 		if (ire == ref_ire)
11735388Sja97890 			continue;
11745388Sja97890 		if (ire->ire_marks & IRE_MARK_CONDEMNED)
11755388Sja97890 			continue;
11765388Sja97890 		if (ire->ire_marks & IRE_MARK_TEMPORARY) {
11775388Sja97890 			ASSERT(ire->ire_type == IRE_CACHE);
11785388Sja97890 			ire_delete(ire);
11795388Sja97890 			cnt--;
11805388Sja97890 		}
11815388Sja97890 	}
11825388Sja97890 	if (cnt == 0)
11830Sstevel@tonic-gate 		return;
11840Sstevel@tonic-gate 
11855388Sja97890 	/*
11865388Sja97890 	 * If we didn't satisfy our removal target from temporary IREs
11875388Sja97890 	 * we see how many non-temporary IREs are currently in the bucket.
11885388Sja97890 	 * If this quantity is above the threshold then we see if there are any
11895388Sja97890 	 * candidates for removal. We are still limited to removing a maximum
11905388Sja97890 	 * of cnt IREs.
11915388Sja97890 	 */
11925388Sja97890 	if ((irb->irb_ire_cnt - irb->irb_tmp_ire_cnt) > threshold) {
11935388Sja97890 		for (ire = irb->irb_ire; cnt > 0 && ire != NULL;
11940Sstevel@tonic-gate 		    ire = ire->ire_next) {
11955388Sja97890 			if (ire == ref_ire)
11965388Sja97890 				continue;
11975388Sja97890 			if (ire->ire_type != IRE_CACHE)
11985388Sja97890 				continue;
11990Sstevel@tonic-gate 			if (ire->ire_marks & IRE_MARK_CONDEMNED)
12000Sstevel@tonic-gate 				continue;
12015388Sja97890 			if ((ire->ire_refcnt == 1) &&
12025388Sja97890 			    (lbolt - ire->ire_last_used_time > cut_off)) {
12030Sstevel@tonic-gate 				ire_delete(ire);
12040Sstevel@tonic-gate 				cnt--;
12050Sstevel@tonic-gate 			}
12060Sstevel@tonic-gate 		}
12070Sstevel@tonic-gate 	}
12080Sstevel@tonic-gate }
12090Sstevel@tonic-gate 
12100Sstevel@tonic-gate /*
12110Sstevel@tonic-gate  * ire_add_then_send is called when a new IRE has been created in order to
12120Sstevel@tonic-gate  * route an outgoing packet.  Typically, it is called from ip_wput when
12130Sstevel@tonic-gate  * a response comes back down from a resolver.  We add the IRE, and then
12140Sstevel@tonic-gate  * possibly run the packet through ip_wput or ip_rput, as appropriate.
12150Sstevel@tonic-gate  * However, we do not add the newly created IRE in the cache when
12160Sstevel@tonic-gate  * IRE_MARK_NOADD is set in the IRE. IRE_MARK_NOADD is set at
12174823Sseb  * ip_newroute_ipif(). The ires with IRE_MARK_NOADD are ire_refrele'd by
12184823Sseb  * ip_wput_ire() and get deleted.
12190Sstevel@tonic-gate  * Multirouting support: the packet is silently discarded when the new IRE
12200Sstevel@tonic-gate  * holds the RTF_MULTIRT flag, but is not the first IRE to be added with the
12210Sstevel@tonic-gate  * RTF_MULTIRT flag for the same destination address.
12220Sstevel@tonic-gate  * In this case, we just want to register this additional ire without
12230Sstevel@tonic-gate  * sending the packet, as it has already been replicated through
12240Sstevel@tonic-gate  * existing multirt routes in ip_wput().
12250Sstevel@tonic-gate  */
12260Sstevel@tonic-gate void
12270Sstevel@tonic-gate ire_add_then_send(queue_t *q, ire_t *ire, mblk_t *mp)
12280Sstevel@tonic-gate {
12290Sstevel@tonic-gate 	irb_t *irb;
12300Sstevel@tonic-gate 	boolean_t drop = B_FALSE;
12310Sstevel@tonic-gate 	boolean_t mctl_present;
12320Sstevel@tonic-gate 	mblk_t *first_mp = NULL;
12338485SPeter.Memishian@Sun.COM 	mblk_t *data_mp = NULL;
12340Sstevel@tonic-gate 	ire_t *dst_ire;
12350Sstevel@tonic-gate 	ipha_t *ipha;
12360Sstevel@tonic-gate 	ip6_t *ip6h;
12373448Sdh155122 	ip_stack_t	*ipst = ire->ire_ipst;
12385388Sja97890 	int		ire_limit;
12390Sstevel@tonic-gate 
12400Sstevel@tonic-gate 	if (mp != NULL) {
12410Sstevel@tonic-gate 		/*
12420Sstevel@tonic-gate 		 * We first have to retrieve the destination address carried
12430Sstevel@tonic-gate 		 * by the packet.
12440Sstevel@tonic-gate 		 * We can't rely on ire as it can be related to a gateway.
12450Sstevel@tonic-gate 		 * The destination address will help in determining if
12460Sstevel@tonic-gate 		 * other RTF_MULTIRT ires are already registered.
12470Sstevel@tonic-gate 		 *
12480Sstevel@tonic-gate 		 * We first need to know where we are going : v4 or V6.
12490Sstevel@tonic-gate 		 * the ire version is enough, as there is no risk that
12500Sstevel@tonic-gate 		 * we resolve an IPv6 address with an IPv4 ire
12510Sstevel@tonic-gate 		 * or vice versa.
12520Sstevel@tonic-gate 		 */
12538485SPeter.Memishian@Sun.COM 		EXTRACT_PKT_MP(mp, first_mp, mctl_present);
12548485SPeter.Memishian@Sun.COM 		data_mp = mp;
12558485SPeter.Memishian@Sun.COM 		mp = first_mp;
12560Sstevel@tonic-gate 		if (ire->ire_ipversion == IPV4_VERSION) {
12578485SPeter.Memishian@Sun.COM 			ipha = (ipha_t *)data_mp->b_rptr;
12580Sstevel@tonic-gate 			dst_ire = ire_cache_lookup(ipha->ipha_dst,
1259*8778SErik.Nordmark@Sun.COM 			    ire->ire_zoneid, msg_getlabel(mp), ipst);
12600Sstevel@tonic-gate 		} else {
12612535Ssangeeta 			ASSERT(ire->ire_ipversion == IPV6_VERSION);
12628485SPeter.Memishian@Sun.COM 			ip6h = (ip6_t *)data_mp->b_rptr;
12630Sstevel@tonic-gate 			dst_ire = ire_cache_lookup_v6(&ip6h->ip6_dst,
1264*8778SErik.Nordmark@Sun.COM 			    ire->ire_zoneid, msg_getlabel(mp), ipst);
12650Sstevel@tonic-gate 		}
12660Sstevel@tonic-gate 		if (dst_ire != NULL) {
12670Sstevel@tonic-gate 			if (dst_ire->ire_flags & RTF_MULTIRT) {
12680Sstevel@tonic-gate 				/*
12690Sstevel@tonic-gate 				 * At least one resolved multirt route
12700Sstevel@tonic-gate 				 * already exists for the destination,
12710Sstevel@tonic-gate 				 * don't sent this packet: either drop it
12720Sstevel@tonic-gate 				 * or complete the pending resolution,
12730Sstevel@tonic-gate 				 * depending on the ire.
12740Sstevel@tonic-gate 				 */
12750Sstevel@tonic-gate 				drop = B_TRUE;
12760Sstevel@tonic-gate 			}
12770Sstevel@tonic-gate 			ip1dbg(("ire_add_then_send: dst_ire %p "
12780Sstevel@tonic-gate 			    "[dst %08x, gw %08x], drop %d\n",
12790Sstevel@tonic-gate 			    (void *)dst_ire,
12800Sstevel@tonic-gate 			    (dst_ire->ire_ipversion == IPV4_VERSION) ? \
12814714Ssowmini 			    ntohl(dst_ire->ire_addr) : \
12824714Ssowmini 			    ntohl(V4_PART_OF_V6(dst_ire->ire_addr_v6)),
12830Sstevel@tonic-gate 			    (dst_ire->ire_ipversion == IPV4_VERSION) ? \
12844714Ssowmini 			    ntohl(dst_ire->ire_gateway_addr) : \
12854714Ssowmini 			    ntohl(V4_PART_OF_V6(
12864714Ssowmini 			    dst_ire->ire_gateway_addr_v6)),
12870Sstevel@tonic-gate 			    drop));
12880Sstevel@tonic-gate 			ire_refrele(dst_ire);
12890Sstevel@tonic-gate 		}
12900Sstevel@tonic-gate 	}
12910Sstevel@tonic-gate 
12920Sstevel@tonic-gate 	if (!(ire->ire_marks & IRE_MARK_NOADD)) {
12934823Sseb 		/* Regular packets with cache bound ires are here. */
12944823Sseb 		(void) ire_add(&ire, NULL, NULL, NULL, B_FALSE);
12950Sstevel@tonic-gate 
12960Sstevel@tonic-gate 		if (ire == NULL) {
12970Sstevel@tonic-gate 			mp->b_prev = NULL;
12980Sstevel@tonic-gate 			mp->b_next = NULL;
12990Sstevel@tonic-gate 			MULTIRT_DEBUG_UNTAG(mp);
13000Sstevel@tonic-gate 			freemsg(mp);
13010Sstevel@tonic-gate 			return;
13020Sstevel@tonic-gate 		}
13030Sstevel@tonic-gate 		if (mp == NULL) {
13040Sstevel@tonic-gate 			ire_refrele(ire);	/* Held in ire_add_v4/v6 */
13050Sstevel@tonic-gate 			return;
13060Sstevel@tonic-gate 		}
13070Sstevel@tonic-gate 	}
13080Sstevel@tonic-gate 	if (drop) {
13090Sstevel@tonic-gate 		/*
13100Sstevel@tonic-gate 		 * If we're adding an RTF_MULTIRT ire, the resolution
13110Sstevel@tonic-gate 		 * is over: we just drop the packet.
13120Sstevel@tonic-gate 		 */
13130Sstevel@tonic-gate 		if (ire->ire_flags & RTF_MULTIRT) {
13148485SPeter.Memishian@Sun.COM 			data_mp->b_prev = NULL;
13158485SPeter.Memishian@Sun.COM 			data_mp->b_next = NULL;
13160Sstevel@tonic-gate 			MULTIRT_DEBUG_UNTAG(mp);
13170Sstevel@tonic-gate 			freemsg(mp);
13180Sstevel@tonic-gate 		} else {
13190Sstevel@tonic-gate 			/*
13200Sstevel@tonic-gate 			 * Otherwise, we're adding the ire to a gateway
13210Sstevel@tonic-gate 			 * for a multirt route.
13220Sstevel@tonic-gate 			 * Invoke ip_newroute() to complete the resolution
13230Sstevel@tonic-gate 			 * of the route. We will then come back here and
13240Sstevel@tonic-gate 			 * finally drop this packet in the above code.
13250Sstevel@tonic-gate 			 */
13260Sstevel@tonic-gate 			if (ire->ire_ipversion == IPV4_VERSION) {
13270Sstevel@tonic-gate 				/*
13280Sstevel@tonic-gate 				 * TODO: in order for CGTP to work in non-global
13290Sstevel@tonic-gate 				 * zones, ip_newroute() must create the IRE
13300Sstevel@tonic-gate 				 * cache in the zone indicated by
13310Sstevel@tonic-gate 				 * ire->ire_zoneid.
13320Sstevel@tonic-gate 				 */
13334823Sseb 				ip_newroute(q, mp, ipha->ipha_dst,
13342733Snordmark 				    (CONN_Q(q) ? Q_TO_CONN(q) : NULL),
13353448Sdh155122 				    ire->ire_zoneid, ipst);
13360Sstevel@tonic-gate 			} else {
13378485SPeter.Memishian@Sun.COM 				int minlen = sizeof (ip6i_t) + IPV6_HDR_LEN;
13388485SPeter.Memishian@Sun.COM 
13392535Ssangeeta 				ASSERT(ire->ire_ipversion == IPV6_VERSION);
13408485SPeter.Memishian@Sun.COM 
13418485SPeter.Memishian@Sun.COM 				/*
13428485SPeter.Memishian@Sun.COM 				 * If necessary, skip over the ip6i_t to find
13438485SPeter.Memishian@Sun.COM 				 * the header with the actual source address.
13448485SPeter.Memishian@Sun.COM 				 */
13458485SPeter.Memishian@Sun.COM 				if (ip6h->ip6_nxt == IPPROTO_RAW) {
13468485SPeter.Memishian@Sun.COM 					if (MBLKL(data_mp) < minlen &&
13478485SPeter.Memishian@Sun.COM 					    pullupmsg(data_mp, -1) == 0) {
13488485SPeter.Memishian@Sun.COM 						ip1dbg(("ire_add_then_send: "
13498485SPeter.Memishian@Sun.COM 						    "cannot pullupmsg ip6i\n"));
13508485SPeter.Memishian@Sun.COM 						if (mctl_present)
13518485SPeter.Memishian@Sun.COM 							freeb(first_mp);
13528485SPeter.Memishian@Sun.COM 						ire_refrele(ire);
13538485SPeter.Memishian@Sun.COM 						return;
13548485SPeter.Memishian@Sun.COM 					}
13558485SPeter.Memishian@Sun.COM 					ASSERT(MBLKL(data_mp) >= IPV6_HDR_LEN);
13568485SPeter.Memishian@Sun.COM 					ip6h = (ip6_t *)(data_mp->b_rptr +
13578485SPeter.Memishian@Sun.COM 					    sizeof (ip6i_t));
13588485SPeter.Memishian@Sun.COM 				}
13598485SPeter.Memishian@Sun.COM 				ip_newroute_v6(q, mp, &ip6h->ip6_dst,
13608485SPeter.Memishian@Sun.COM 				    &ip6h->ip6_src, NULL, ire->ire_zoneid,
13618485SPeter.Memishian@Sun.COM 				    ipst);
13620Sstevel@tonic-gate 			}
13630Sstevel@tonic-gate 		}
13640Sstevel@tonic-gate 
13650Sstevel@tonic-gate 		ire_refrele(ire); /* As done by ire_send(). */
13660Sstevel@tonic-gate 		return;
13670Sstevel@tonic-gate 	}
13680Sstevel@tonic-gate 	/*
13690Sstevel@tonic-gate 	 * Need to remember ire_bucket here as ire_send*() may delete
13700Sstevel@tonic-gate 	 * the ire so we cannot reference it after that.
13710Sstevel@tonic-gate 	 */
13720Sstevel@tonic-gate 	irb = ire->ire_bucket;
13735388Sja97890 	if (ire->ire_ipversion == IPV4_VERSION) {
13745388Sja97890 		ire_send(q, mp, ire);
13755388Sja97890 		ire_limit = ip_ire_max_bucket_cnt;
13765388Sja97890 	} else {
13770Sstevel@tonic-gate 		ire_send_v6(q, mp, ire);
13785388Sja97890 		ire_limit = ip6_ire_max_bucket_cnt;
13795388Sja97890 	}
13805388Sja97890 
13815388Sja97890 	/*
13825388Sja97890 	 * irb is NULL if the IRE was not added to the hash. This happens
13835388Sja97890 	 * when IRE_MARK_NOADD is set and when IREs are returned from
13845388Sja97890 	 * ire_update_srcif_v4().
13855388Sja97890 	 */
13865388Sja97890 	if (irb != NULL) {
13875388Sja97890 		IRB_REFHOLD(irb);
13885388Sja97890 		if (irb->irb_ire_cnt > ire_limit)
13895388Sja97890 			ire_cache_cleanup(irb, ire_limit, ire);
13905388Sja97890 		IRB_REFRELE(irb);
13910Sstevel@tonic-gate 	}
13920Sstevel@tonic-gate }
13930Sstevel@tonic-gate 
13940Sstevel@tonic-gate /*
13950Sstevel@tonic-gate  * Initialize the ire that is specific to IPv4 part and call
13960Sstevel@tonic-gate  * ire_init_common to finish it.
13970Sstevel@tonic-gate  */
13980Sstevel@tonic-gate ire_t *
13990Sstevel@tonic-gate ire_init(ire_t *ire, uchar_t *addr, uchar_t *mask, uchar_t *src_addr,
14004823Sseb     uchar_t *gateway, uint_t *max_fragp, nce_t *src_nce, queue_t *rfq,
14014823Sseb     queue_t *stq, ushort_t type, ipif_t *ipif, ipaddr_t cmask, uint32_t phandle,
14024823Sseb     uint32_t ihandle, uint32_t flags, const iulp_t *ulp_info, tsol_gc_t *gc,
14034823Sseb     tsol_gcgrp_t *gcgrp, ip_stack_t *ipst)
14040Sstevel@tonic-gate {
14055907Sja97890 	ASSERT(type != IRE_CACHE || stq != NULL);
14061676Sjpk 	/*
14071676Sjpk 	 * Reject IRE security attribute creation/initialization
14081676Sjpk 	 * if system is not running in Trusted mode.
14091676Sjpk 	 */
14101676Sjpk 	if ((gc != NULL || gcgrp != NULL) && !is_system_labeled())
14111676Sjpk 		return (NULL);
14121676Sjpk 
14133448Sdh155122 	BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_alloced);
14140Sstevel@tonic-gate 
14150Sstevel@tonic-gate 	if (addr != NULL)
14160Sstevel@tonic-gate 		bcopy(addr, &ire->ire_addr, IP_ADDR_LEN);
14170Sstevel@tonic-gate 	if (src_addr != NULL)
14180Sstevel@tonic-gate 		bcopy(src_addr, &ire->ire_src_addr, IP_ADDR_LEN);
14190Sstevel@tonic-gate 	if (mask != NULL) {
14200Sstevel@tonic-gate 		bcopy(mask, &ire->ire_mask, IP_ADDR_LEN);
14210Sstevel@tonic-gate 		ire->ire_masklen = ip_mask_to_plen(ire->ire_mask);
14220Sstevel@tonic-gate 	}
14230Sstevel@tonic-gate 	if (gateway != NULL) {
14240Sstevel@tonic-gate 		bcopy(gateway, &ire->ire_gateway_addr, IP_ADDR_LEN);
14250Sstevel@tonic-gate 	}
14260Sstevel@tonic-gate 
14270Sstevel@tonic-gate 	if (type == IRE_CACHE)
14280Sstevel@tonic-gate 		ire->ire_cmask = cmask;
14290Sstevel@tonic-gate 
14301676Sjpk 	/* ire_init_common will free the mblks upon encountering any failure */
14314823Sseb 	if (!ire_init_common(ire, max_fragp, src_nce, rfq, stq, type, ipif,
14324823Sseb 	    phandle, ihandle, flags, IPV4_VERSION, ulp_info, gc, gcgrp, ipst))
14331676Sjpk 		return (NULL);
14340Sstevel@tonic-gate 
14350Sstevel@tonic-gate 	return (ire);
14360Sstevel@tonic-gate }
14370Sstevel@tonic-gate 
14380Sstevel@tonic-gate /*
14390Sstevel@tonic-gate  * Similar to ire_create except that it is called only when
14400Sstevel@tonic-gate  * we want to allocate ire as an mblk e.g. we have an external
14410Sstevel@tonic-gate  * resolver ARP.
14420Sstevel@tonic-gate  */
14430Sstevel@tonic-gate ire_t *
14440Sstevel@tonic-gate ire_create_mp(uchar_t *addr, uchar_t *mask, uchar_t *src_addr, uchar_t *gateway,
14454823Sseb     uint_t max_frag, nce_t *src_nce, queue_t *rfq, queue_t *stq, ushort_t type,
14464823Sseb     ipif_t *ipif, ipaddr_t cmask, uint32_t phandle, uint32_t ihandle,
14474823Sseb     uint32_t flags, const iulp_t *ulp_info, tsol_gc_t *gc, tsol_gcgrp_t *gcgrp,
14483448Sdh155122     ip_stack_t *ipst)
14490Sstevel@tonic-gate {
14502535Ssangeeta 	ire_t	*ire, *buf;
14510Sstevel@tonic-gate 	ire_t	*ret_ire;
14520Sstevel@tonic-gate 	mblk_t	*mp;
14532535Ssangeeta 	size_t	bufsize;
14542535Ssangeeta 	frtn_t	*frtnp;
14552535Ssangeeta 	ill_t	*ill;
14562535Ssangeeta 
14572535Ssangeeta 	bufsize = sizeof (ire_t) + sizeof (frtn_t);
14582535Ssangeeta 	buf = kmem_alloc(bufsize, KM_NOSLEEP);
14592535Ssangeeta 	if (buf == NULL) {
14602535Ssangeeta 		ip1dbg(("ire_create_mp: alloc failed\n"));
14612535Ssangeeta 		return (NULL);
14622535Ssangeeta 	}
14632535Ssangeeta 	frtnp = (frtn_t *)(buf + 1);
14642535Ssangeeta 	frtnp->free_arg = (caddr_t)buf;
14652535Ssangeeta 	frtnp->free_func = ire_freemblk;
14662535Ssangeeta 
14672535Ssangeeta 	/*
14682535Ssangeeta 	 * Allocate the new IRE. The ire created will hold a ref on
14692535Ssangeeta 	 * an nce_t after ire_nce_init, and this ref must either be
14702535Ssangeeta 	 * (a)  transferred to the ire_cache entry created when ire_add_v4
14712535Ssangeeta 	 *	is called after successful arp resolution, or,
14722535Ssangeeta 	 * (b)  released, when arp resolution fails
14732535Ssangeeta 	 * Case (b) is handled in ire_freemblk() which will be called
14742535Ssangeeta 	 * when mp is freed as a result of failed arp.
14752535Ssangeeta 	 */
14762535Ssangeeta 	mp = esballoc((unsigned char *)buf, bufsize, BPRI_MED, frtnp);
14770Sstevel@tonic-gate 	if (mp == NULL) {
14780Sstevel@tonic-gate 		ip1dbg(("ire_create_mp: alloc failed\n"));
14792535Ssangeeta 		kmem_free(buf, bufsize);
14800Sstevel@tonic-gate 		return (NULL);
14810Sstevel@tonic-gate 	}
14820Sstevel@tonic-gate 	ire = (ire_t *)mp->b_rptr;
14830Sstevel@tonic-gate 	mp->b_wptr = (uchar_t *)&ire[1];
14840Sstevel@tonic-gate 
14850Sstevel@tonic-gate 	/* Start clean. */
14860Sstevel@tonic-gate 	*ire = ire_null;
14870Sstevel@tonic-gate 	ire->ire_mp = mp;
14880Sstevel@tonic-gate 	mp->b_datap->db_type = IRE_DB_TYPE;
14892535Ssangeeta 	ire->ire_marks |= IRE_MARK_UNCACHED;
14900Sstevel@tonic-gate 
14914823Sseb 	ret_ire = ire_init(ire, addr, mask, src_addr, gateway, NULL, src_nce,
14924823Sseb 	    rfq, stq, type, ipif, cmask, phandle, ihandle, flags, ulp_info, gc,
14934823Sseb 	    gcgrp, ipst);
14940Sstevel@tonic-gate 
14952741Ssowmini 	ill = (ill_t *)(stq->q_ptr);
14960Sstevel@tonic-gate 	if (ret_ire == NULL) {
14973448Sdh155122 		/* ire_freemblk needs these set */
14982741Ssowmini 		ire->ire_stq_ifindex = ill->ill_phyint->phyint_ifindex;
14997558SSowmini.Varadhan@Sun.COM 		ire->ire_stackid = ipst->ips_netstack->netstack_stackid;
15003448Sdh155122 		ire->ire_ipst = ipst;
15010Sstevel@tonic-gate 		freeb(ire->ire_mp);
15020Sstevel@tonic-gate 		return (NULL);
15030Sstevel@tonic-gate 	}
15042535Ssangeeta 	ret_ire->ire_stq_ifindex = ill->ill_phyint->phyint_ifindex;
15057558SSowmini.Varadhan@Sun.COM 	ret_ire->ire_stackid = ipst->ips_netstack->netstack_stackid;
15060Sstevel@tonic-gate 	ASSERT(ret_ire == ire);
15077558SSowmini.Varadhan@Sun.COM 	ASSERT(ret_ire->ire_ipst == ipst);
15080Sstevel@tonic-gate 	/*
15090Sstevel@tonic-gate 	 * ire_max_frag is normally zero here and is atomically set
15100Sstevel@tonic-gate 	 * under the irebucket lock in ire_add_v[46] except for the
15110Sstevel@tonic-gate 	 * case of IRE_MARK_NOADD. In that event the the ire_max_frag
15120Sstevel@tonic-gate 	 * is non-zero here.
15130Sstevel@tonic-gate 	 */
15140Sstevel@tonic-gate 	ire->ire_max_frag = max_frag;
15150Sstevel@tonic-gate 	return (ire);
15160Sstevel@tonic-gate }
15170Sstevel@tonic-gate 
15180Sstevel@tonic-gate /*
15190Sstevel@tonic-gate  * ire_create is called to allocate and initialize a new IRE.
15200Sstevel@tonic-gate  *
15210Sstevel@tonic-gate  * NOTE : This is called as writer sometimes though not required
15220Sstevel@tonic-gate  * by this function.
15230Sstevel@tonic-gate  */
15240Sstevel@tonic-gate ire_t *
15250Sstevel@tonic-gate ire_create(uchar_t *addr, uchar_t *mask, uchar_t *src_addr, uchar_t *gateway,
15264823Sseb     uint_t *max_fragp, nce_t *src_nce, queue_t *rfq, queue_t *stq,
15274823Sseb     ushort_t type, ipif_t *ipif, ipaddr_t cmask, uint32_t phandle,
15284823Sseb     uint32_t ihandle, uint32_t flags, const iulp_t *ulp_info, tsol_gc_t *gc,
15294823Sseb     tsol_gcgrp_t *gcgrp, ip_stack_t *ipst)
15300Sstevel@tonic-gate {
15310Sstevel@tonic-gate 	ire_t	*ire;
15320Sstevel@tonic-gate 	ire_t	*ret_ire;
15330Sstevel@tonic-gate 
15340Sstevel@tonic-gate 	ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
15350Sstevel@tonic-gate 	if (ire == NULL) {
15360Sstevel@tonic-gate 		ip1dbg(("ire_create: alloc failed\n"));
15370Sstevel@tonic-gate 		return (NULL);
15380Sstevel@tonic-gate 	}
15390Sstevel@tonic-gate 	*ire = ire_null;
15400Sstevel@tonic-gate 
15414823Sseb 	ret_ire = ire_init(ire, addr, mask, src_addr, gateway, max_fragp,
15424823Sseb 	    src_nce, rfq, stq, type, ipif, cmask, phandle, ihandle, flags,
15434823Sseb 	    ulp_info, gc, gcgrp, ipst);
15440Sstevel@tonic-gate 
15450Sstevel@tonic-gate 	if (ret_ire == NULL) {
15460Sstevel@tonic-gate 		kmem_cache_free(ire_cache, ire);
15470Sstevel@tonic-gate 		return (NULL);
15480Sstevel@tonic-gate 	}
15490Sstevel@tonic-gate 	ASSERT(ret_ire == ire);
15500Sstevel@tonic-gate 	return (ire);
15510Sstevel@tonic-gate }
15520Sstevel@tonic-gate 
15530Sstevel@tonic-gate /*
15540Sstevel@tonic-gate  * Common to IPv4 and IPv6
15550Sstevel@tonic-gate  */
15561676Sjpk boolean_t
15574714Ssowmini ire_init_common(ire_t *ire, uint_t *max_fragp, nce_t *src_nce, queue_t *rfq,
15584823Sseb     queue_t *stq, ushort_t type, ipif_t *ipif, uint32_t phandle,
15594714Ssowmini     uint32_t ihandle, uint32_t flags, uchar_t ipversion, const iulp_t *ulp_info,
15604714Ssowmini     tsol_gc_t *gc, tsol_gcgrp_t *gcgrp, ip_stack_t *ipst)
15610Sstevel@tonic-gate {
15620Sstevel@tonic-gate 	ire->ire_max_fragp = max_fragp;
15633448Sdh155122 	ire->ire_frag_flag |= (ipst->ips_ip_path_mtu_discovery) ? IPH_DF : 0;
15640Sstevel@tonic-gate 
15651676Sjpk #ifdef DEBUG
15661676Sjpk 	if (ipif != NULL) {
15670Sstevel@tonic-gate 		if (ipif->ipif_isv6)
15680Sstevel@tonic-gate 			ASSERT(ipversion == IPV6_VERSION);
15690Sstevel@tonic-gate 		else
15700Sstevel@tonic-gate 			ASSERT(ipversion == IPV4_VERSION);
15710Sstevel@tonic-gate 	}
15721676Sjpk #endif /* DEBUG */
15731676Sjpk 
15741676Sjpk 	/*
15751676Sjpk 	 * Create/initialize IRE security attribute only in Trusted mode;
15761676Sjpk 	 * if the passed in gc/gcgrp is non-NULL, we expect that the caller
15771676Sjpk 	 * has held a reference to it and will release it when this routine
15781676Sjpk 	 * returns a failure, otherwise we own the reference.  We do this
15791676Sjpk 	 * prior to initializing the rest IRE fields.
15802416Sjarrett 	 *
15812416Sjarrett 	 * Don't allocate ire_gw_secattr for the resolver case to prevent
15822416Sjarrett 	 * memory leak (in case of external resolution failure). We'll
15832416Sjarrett 	 * allocate it after a successful external resolution, in ire_add().
15842416Sjarrett 	 * Note that ire->ire_mp != NULL here means this ire is headed
15852416Sjarrett 	 * to an external resolver.
15861676Sjpk 	 */
15871676Sjpk 	if (is_system_labeled()) {
15881676Sjpk 		if ((type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST |
15891676Sjpk 		    IRE_INTERFACE)) != 0) {
15901676Sjpk 			/* release references on behalf of caller */
15911676Sjpk 			if (gc != NULL)
15921676Sjpk 				GC_REFRELE(gc);
15931676Sjpk 			if (gcgrp != NULL)
15941676Sjpk 				GCGRP_REFRELE(gcgrp);
15952416Sjarrett 		} else if ((ire->ire_mp == NULL) &&
15962416Sjarrett 		    tsol_ire_init_gwattr(ire, ipversion, gc, gcgrp) != 0) {
15971676Sjpk 			return (B_FALSE);
15981676Sjpk 		}
15991676Sjpk 	}
16000Sstevel@tonic-gate 
16010Sstevel@tonic-gate 	ire->ire_stq = stq;
16020Sstevel@tonic-gate 	ire->ire_rfq = rfq;
16030Sstevel@tonic-gate 	ire->ire_type = type;
16040Sstevel@tonic-gate 	ire->ire_flags = RTF_UP | flags;
16050Sstevel@tonic-gate 	ire->ire_ident = TICK_TO_MSEC(lbolt);
16060Sstevel@tonic-gate 	bcopy(ulp_info, &ire->ire_uinfo, sizeof (iulp_t));
16070Sstevel@tonic-gate 
16080Sstevel@tonic-gate 	ire->ire_tire_mark = ire->ire_ob_pkt_count + ire->ire_ib_pkt_count;
16090Sstevel@tonic-gate 	ire->ire_last_used_time = lbolt;
16100Sstevel@tonic-gate 	ire->ire_create_time = (uint32_t)gethrestime_sec();
16110Sstevel@tonic-gate 
16120Sstevel@tonic-gate 	/*
16130Sstevel@tonic-gate 	 * If this IRE is an IRE_CACHE, inherit the handles from the
16140Sstevel@tonic-gate 	 * parent IREs. For others in the forwarding table, assign appropriate
16150Sstevel@tonic-gate 	 * new ones.
16160Sstevel@tonic-gate 	 *
16170Sstevel@tonic-gate 	 * The mutex protecting ire_handle is because ire_create is not always
16180Sstevel@tonic-gate 	 * called as a writer.
16190Sstevel@tonic-gate 	 */
16200Sstevel@tonic-gate 	if (ire->ire_type & IRE_OFFSUBNET) {
16213448Sdh155122 		mutex_enter(&ipst->ips_ire_handle_lock);
16223448Sdh155122 		ire->ire_phandle = (uint32_t)ipst->ips_ire_handle++;
16233448Sdh155122 		mutex_exit(&ipst->ips_ire_handle_lock);
16240Sstevel@tonic-gate 	} else if (ire->ire_type & IRE_INTERFACE) {
16253448Sdh155122 		mutex_enter(&ipst->ips_ire_handle_lock);
16263448Sdh155122 		ire->ire_ihandle = (uint32_t)ipst->ips_ire_handle++;
16273448Sdh155122 		mutex_exit(&ipst->ips_ire_handle_lock);
16280Sstevel@tonic-gate 	} else if (ire->ire_type == IRE_CACHE) {
16290Sstevel@tonic-gate 		ire->ire_phandle = phandle;
16300Sstevel@tonic-gate 		ire->ire_ihandle = ihandle;
16310Sstevel@tonic-gate 	}
16320Sstevel@tonic-gate 	ire->ire_ipif = ipif;
16330Sstevel@tonic-gate 	if (ipif != NULL) {
16340Sstevel@tonic-gate 		ire->ire_ipif_seqid = ipif->ipif_seqid;
16357880SJonathan.Anderson@Sun.COM 		ire->ire_ipif_ifindex =
16367880SJonathan.Anderson@Sun.COM 		    ipif->ipif_ill->ill_phyint->phyint_ifindex;
16370Sstevel@tonic-gate 		ire->ire_zoneid = ipif->ipif_zoneid;
16380Sstevel@tonic-gate 	} else {
16390Sstevel@tonic-gate 		ire->ire_zoneid = GLOBAL_ZONEID;
16400Sstevel@tonic-gate 	}
16410Sstevel@tonic-gate 	ire->ire_ipversion = ipversion;
16422535Ssangeeta 	mutex_init(&ire->ire_lock, NULL, MUTEX_DEFAULT, NULL);
16432535Ssangeeta 	if (ipversion == IPV4_VERSION) {
16444714Ssowmini 		/*
16454714Ssowmini 		 * IPv6 initializes the ire_nce in ire_add_v6, which expects
16464714Ssowmini 		 * to find the ire_nce to be null when it is called.
16474714Ssowmini 		 */
16484714Ssowmini 		if (ire_nce_init(ire, src_nce) != 0) {
16492535Ssangeeta 			/* some failure occurred. propagate error back */
16502535Ssangeeta 			return (B_FALSE);
16512535Ssangeeta 		}
16522535Ssangeeta 	}
16530Sstevel@tonic-gate 	ire->ire_refcnt = 1;
16543448Sdh155122 	ire->ire_ipst = ipst;	/* No netstack_hold */
16555023Scarlsonj 	ire->ire_trace_disable = B_FALSE;
16561676Sjpk 
16571676Sjpk 	return (B_TRUE);
16580Sstevel@tonic-gate }
16590Sstevel@tonic-gate 
16600Sstevel@tonic-gate /*
16610Sstevel@tonic-gate  * This routine is called repeatedly by ipif_up to create broadcast IREs.
16620Sstevel@tonic-gate  * It is passed a pointer to a slot in an IRE pointer array into which to
16630Sstevel@tonic-gate  * place the pointer to the new IRE, if indeed we create one.  If the
16640Sstevel@tonic-gate  * IRE corresponding to the address passed in would be a duplicate of an
16650Sstevel@tonic-gate  * existing one, we don't create the new one.  irep is incremented before
16660Sstevel@tonic-gate  * return only if we do create a new IRE.  (Always called as writer.)
16670Sstevel@tonic-gate  *
16680Sstevel@tonic-gate  * Note that with the "match_flags" parameter, we can match on either
16690Sstevel@tonic-gate  * a particular logical interface (MATCH_IRE_IPIF) or for all logical
16700Sstevel@tonic-gate  * interfaces for a given physical interface (MATCH_IRE_ILL).  Currently,
16710Sstevel@tonic-gate  * we only create broadcast ire's on a per physical interface basis. If
16720Sstevel@tonic-gate  * someone is going to be mucking with logical interfaces, it is important
16730Sstevel@tonic-gate  * to call "ipif_check_bcast_ires()" to make sure that any change to a
16740Sstevel@tonic-gate  * logical interface will not cause critical broadcast IRE's to be deleted.
16750Sstevel@tonic-gate  */
16760Sstevel@tonic-gate ire_t **
16770Sstevel@tonic-gate ire_check_and_create_bcast(ipif_t *ipif, ipaddr_t  addr, ire_t **irep,
16780Sstevel@tonic-gate     int match_flags)
16790Sstevel@tonic-gate {
16800Sstevel@tonic-gate 	ire_t *ire;
16810Sstevel@tonic-gate 	uint64_t check_flags = IPIF_DEPRECATED | IPIF_NOLOCAL | IPIF_ANYCAST;
16828485SPeter.Memishian@Sun.COM 	boolean_t prefer;
16838485SPeter.Memishian@Sun.COM 	ill_t *ill = ipif->ipif_ill;
16848485SPeter.Memishian@Sun.COM 	ip_stack_t *ipst = ill->ill_ipst;
16850Sstevel@tonic-gate 
16860Sstevel@tonic-gate 	/*
16870Sstevel@tonic-gate 	 * No broadcast IREs for the LOOPBACK interface
16880Sstevel@tonic-gate 	 * or others such as point to point and IPIF_NOXMIT.
16890Sstevel@tonic-gate 	 */
16900Sstevel@tonic-gate 	if (!(ipif->ipif_flags & IPIF_BROADCAST) ||
16910Sstevel@tonic-gate 	    (ipif->ipif_flags & IPIF_NOXMIT))
16920Sstevel@tonic-gate 		return (irep);
16930Sstevel@tonic-gate 
16948485SPeter.Memishian@Sun.COM 	/*
16958485SPeter.Memishian@Sun.COM 	 * If this new IRE would be a duplicate, only prefer it if one of
16968485SPeter.Memishian@Sun.COM 	 * the following is true:
16978485SPeter.Memishian@Sun.COM 	 *
16988485SPeter.Memishian@Sun.COM 	 * 1. The existing one has IPIF_DEPRECATED|IPIF_LOCAL|IPIF_ANYCAST
16998485SPeter.Memishian@Sun.COM 	 *    set and the new one has all of those clear.
17008485SPeter.Memishian@Sun.COM 	 *
17018485SPeter.Memishian@Sun.COM 	 * 2. The existing one corresponds to an underlying ILL in an IPMP
17028485SPeter.Memishian@Sun.COM 	 *    group and the new one corresponds to an IPMP group interface.
17038485SPeter.Memishian@Sun.COM 	 */
17040Sstevel@tonic-gate 	if ((ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ipif,
17053448Sdh155122 	    ipif->ipif_zoneid, NULL, match_flags, ipst)) != NULL) {
17068485SPeter.Memishian@Sun.COM 		prefer = ((ire->ire_ipif->ipif_flags & check_flags) &&
17078485SPeter.Memishian@Sun.COM 		    !(ipif->ipif_flags & check_flags)) ||
17088485SPeter.Memishian@Sun.COM 		    (IS_UNDER_IPMP(ire->ire_ipif->ipif_ill) && IS_IPMP(ill));
17098485SPeter.Memishian@Sun.COM 		if (!prefer) {
17100Sstevel@tonic-gate 			ire_refrele(ire);
17110Sstevel@tonic-gate 			return (irep);
17120Sstevel@tonic-gate 		}
17138485SPeter.Memishian@Sun.COM 
17140Sstevel@tonic-gate 		/*
17150Sstevel@tonic-gate 		 * Bcast ires exist in pairs. Both have to be deleted,
17160Sstevel@tonic-gate 		 * Since we are exclusive we can make the above assertion.
17170Sstevel@tonic-gate 		 * The 1st has to be refrele'd since it was ctable_lookup'd.
17180Sstevel@tonic-gate 		 */
17190Sstevel@tonic-gate 		ASSERT(IAM_WRITER_IPIF(ipif));
17200Sstevel@tonic-gate 		ASSERT(ire->ire_next->ire_addr == ire->ire_addr);
17210Sstevel@tonic-gate 		ire_delete(ire->ire_next);
17220Sstevel@tonic-gate 		ire_delete(ire);
17230Sstevel@tonic-gate 		ire_refrele(ire);
17240Sstevel@tonic-gate 	}
17258485SPeter.Memishian@Sun.COM 	return (ire_create_bcast(ipif, addr, irep));
17260Sstevel@tonic-gate }
17270Sstevel@tonic-gate 
17280Sstevel@tonic-gate uint_t ip_loopback_mtu = IP_LOOPBACK_MTU;
17290Sstevel@tonic-gate 
17300Sstevel@tonic-gate /*
17310Sstevel@tonic-gate  * This routine is called from ipif_check_bcast_ires and ire_check_bcast.
17320Sstevel@tonic-gate  * It leaves all the verifying and deleting to those routines. So it always
17330Sstevel@tonic-gate  * creates 2 bcast ires and chains them into the ire array passed in.
17340Sstevel@tonic-gate  */
17350Sstevel@tonic-gate ire_t **
17360Sstevel@tonic-gate ire_create_bcast(ipif_t *ipif, ipaddr_t  addr, ire_t **irep)
17370Sstevel@tonic-gate {
17383448Sdh155122 	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;
17398485SPeter.Memishian@Sun.COM 	ill_t		*ill = ipif->ipif_ill;
17408485SPeter.Memishian@Sun.COM 
17418485SPeter.Memishian@Sun.COM 	ASSERT(IAM_WRITER_IPIF(ipif));
17428485SPeter.Memishian@Sun.COM 
17438485SPeter.Memishian@Sun.COM 	if (IS_IPMP(ill)) {
17448485SPeter.Memishian@Sun.COM 		/*
17458485SPeter.Memishian@Sun.COM 		 * Broadcast IREs for the IPMP meta-interface use the
17468485SPeter.Memishian@Sun.COM 		 * nominated broadcast interface to send and receive packets.
17478485SPeter.Memishian@Sun.COM 		 * If there's no nominated interface, send the packets down to
17488485SPeter.Memishian@Sun.COM 		 * the IPMP stub driver, which will discard them.  If the
17498485SPeter.Memishian@Sun.COM 		 * nominated broadcast interface changes, ill_refresh_bcast()
17508485SPeter.Memishian@Sun.COM 		 * will refresh the broadcast IREs.
17518485SPeter.Memishian@Sun.COM 		 */
17528485SPeter.Memishian@Sun.COM 		if ((ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL)
17538485SPeter.Memishian@Sun.COM 			ill = ipif->ipif_ill;
17548485SPeter.Memishian@Sun.COM 	}
17553448Sdh155122 
17560Sstevel@tonic-gate 	*irep++ = ire_create(
17570Sstevel@tonic-gate 	    (uchar_t *)&addr,			/* dest addr */
17580Sstevel@tonic-gate 	    (uchar_t *)&ip_g_all_ones,		/* mask */
17590Sstevel@tonic-gate 	    (uchar_t *)&ipif->ipif_src_addr,	/* source addr */
17600Sstevel@tonic-gate 	    NULL,				/* no gateway */
17610Sstevel@tonic-gate 	    &ipif->ipif_mtu,			/* max frag */
17624714Ssowmini 	    NULL,				/* no src nce */
17638485SPeter.Memishian@Sun.COM 	    ill->ill_rq,			/* recv-from queue */
17648485SPeter.Memishian@Sun.COM 	    ill->ill_wq,			/* send-to queue */
17650Sstevel@tonic-gate 	    IRE_BROADCAST,
17660Sstevel@tonic-gate 	    ipif,
17670Sstevel@tonic-gate 	    0,
17680Sstevel@tonic-gate 	    0,
17690Sstevel@tonic-gate 	    0,
17700Sstevel@tonic-gate 	    0,
17711676Sjpk 	    &ire_uinfo_null,
17721676Sjpk 	    NULL,
17733448Sdh155122 	    NULL,
17743448Sdh155122 	    ipst);
17750Sstevel@tonic-gate 
17760Sstevel@tonic-gate 	*irep++ = ire_create(
17774714Ssowmini 	    (uchar_t *)&addr,			/* dest address */
17784714Ssowmini 	    (uchar_t *)&ip_g_all_ones,		/* mask */
17794714Ssowmini 	    (uchar_t *)&ipif->ipif_src_addr,	/* source address */
17804714Ssowmini 	    NULL,				/* no gateway */
17814714Ssowmini 	    &ip_loopback_mtu,			/* max frag size */
17824714Ssowmini 	    NULL,				/* no src_nce */
17838485SPeter.Memishian@Sun.COM 	    ill->ill_rq,			/* recv-from queue */
17844714Ssowmini 	    NULL,				/* no send-to queue */
17854714Ssowmini 	    IRE_BROADCAST,			/* Needed for fanout in wput */
17864714Ssowmini 	    ipif,
17874714Ssowmini 	    0,
17884714Ssowmini 	    0,
17894714Ssowmini 	    0,
17904714Ssowmini 	    0,
17914714Ssowmini 	    &ire_uinfo_null,
17924714Ssowmini 	    NULL,
17934714Ssowmini 	    NULL,
17944714Ssowmini 	    ipst);
17950Sstevel@tonic-gate 
17960Sstevel@tonic-gate 	return (irep);
17970Sstevel@tonic-gate }
17980Sstevel@tonic-gate 
17990Sstevel@tonic-gate /*
18000Sstevel@tonic-gate  * ire_walk routine to delete or update any IRE_CACHE that might contain
18010Sstevel@tonic-gate  * stale information.
18020Sstevel@tonic-gate  * The flags state which entries to delete or update.
18030Sstevel@tonic-gate  * Garbage collection is done separately using kmem alloc callbacks to
18040Sstevel@tonic-gate  * ip_trash_ire_reclaim.
18050Sstevel@tonic-gate  * Used for both IPv4 and IPv6. However, IPv6 only uses FLUSH_MTU_TIME
18060Sstevel@tonic-gate  * since other stale information is cleaned up using NUD.
18070Sstevel@tonic-gate  */
18080Sstevel@tonic-gate void
18090Sstevel@tonic-gate ire_expire(ire_t *ire, char *arg)
18100Sstevel@tonic-gate {
18113448Sdh155122 	ire_expire_arg_t	*ieap = (ire_expire_arg_t *)(uintptr_t)arg;
18123448Sdh155122 	ill_t			*stq_ill;
18133448Sdh155122 	int			flush_flags = ieap->iea_flush_flag;
18143448Sdh155122 	ip_stack_t		*ipst = ieap->iea_ipst;
18150Sstevel@tonic-gate 
18160Sstevel@tonic-gate 	if ((flush_flags & FLUSH_REDIRECT_TIME) &&
18173004Sdd193516 	    (ire->ire_flags & RTF_DYNAMIC)) {
18180Sstevel@tonic-gate 		/* Make sure we delete the corresponding IRE_CACHE */
18190Sstevel@tonic-gate 		ip1dbg(("ire_expire: all redirects\n"));
18203448Sdh155122 		ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
18210Sstevel@tonic-gate 		ire_delete(ire);
18223448Sdh155122 		atomic_dec_32(&ipst->ips_ip_redirect_cnt);
18230Sstevel@tonic-gate 		return;
18240Sstevel@tonic-gate 	}
18250Sstevel@tonic-gate 	if (ire->ire_type != IRE_CACHE)
18260Sstevel@tonic-gate 		return;
18270Sstevel@tonic-gate 
18280Sstevel@tonic-gate 	if (flush_flags & FLUSH_ARP_TIME) {
18290Sstevel@tonic-gate 		/*
18306307Sjprakash 		 * Remove all IRE_CACHE except IPv4 multicast ires. These
18316307Sjprakash 		 * ires will be deleted by ip_trash_ire_reclaim_stack()
18326307Sjprakash 		 * when system runs low in memory.
18336307Sjprakash 		 * Verify that create time is more than ip_ire_arp_interval
18346307Sjprakash 		 * milliseconds ago.
18350Sstevel@tonic-gate 		 */
18366307Sjprakash 
18376307Sjprakash 		if (!(ire->ire_ipversion == IPV4_VERSION &&
18386307Sjprakash 		    CLASSD(ire->ire_addr)) && NCE_EXPIRED(ire->ire_nce, ipst)) {
18390Sstevel@tonic-gate 			ire_delete(ire);
18400Sstevel@tonic-gate 			return;
18410Sstevel@tonic-gate 		}
18420Sstevel@tonic-gate 	}
18430Sstevel@tonic-gate 
18443448Sdh155122 	if (ipst->ips_ip_path_mtu_discovery && (flush_flags & FLUSH_MTU_TIME) &&
18450Sstevel@tonic-gate 	    (ire->ire_ipif != NULL)) {
18460Sstevel@tonic-gate 		/* Increase pmtu if it is less than the interface mtu */
18470Sstevel@tonic-gate 		mutex_enter(&ire->ire_lock);
18480Sstevel@tonic-gate 		/*
18490Sstevel@tonic-gate 		 * If the ipif is a vni (whose mtu is 0, since it's virtual)
18500Sstevel@tonic-gate 		 * get the mtu from the sending interfaces' ipif
18510Sstevel@tonic-gate 		 */
18520Sstevel@tonic-gate 		if (IS_VNI(ire->ire_ipif->ipif_ill)) {
18530Sstevel@tonic-gate 			stq_ill = ire->ire_stq->q_ptr;
18540Sstevel@tonic-gate 			ire->ire_max_frag = MIN(stq_ill->ill_ipif->ipif_mtu,
18550Sstevel@tonic-gate 			    IP_MAXPACKET);
18560Sstevel@tonic-gate 		} else {
18570Sstevel@tonic-gate 			ire->ire_max_frag = MIN(ire->ire_ipif->ipif_mtu,
18580Sstevel@tonic-gate 			    IP_MAXPACKET);
18590Sstevel@tonic-gate 		}
18600Sstevel@tonic-gate 		ire->ire_frag_flag |= IPH_DF;
18610Sstevel@tonic-gate 		mutex_exit(&ire->ire_lock);
18620Sstevel@tonic-gate 	}
18630Sstevel@tonic-gate }
18640Sstevel@tonic-gate 
18650Sstevel@tonic-gate /*
18660Sstevel@tonic-gate  * Return any local address.  We use this to target ourselves
18670Sstevel@tonic-gate  * when the src address was specified as 'default'.
18680Sstevel@tonic-gate  * Preference for IRE_LOCAL entries.
18690Sstevel@tonic-gate  */
18700Sstevel@tonic-gate ire_t *
18713448Sdh155122 ire_lookup_local(zoneid_t zoneid, ip_stack_t *ipst)
18720Sstevel@tonic-gate {
18730Sstevel@tonic-gate 	ire_t	*ire;
18740Sstevel@tonic-gate 	irb_t	*irb;
18750Sstevel@tonic-gate 	ire_t	*maybe = NULL;
18760Sstevel@tonic-gate 	int i;
18770Sstevel@tonic-gate 
18783448Sdh155122 	for (i = 0; i < ipst->ips_ip_cache_table_size;  i++) {
18793448Sdh155122 		irb = &ipst->ips_ip_cache_table[i];
18800Sstevel@tonic-gate 		if (irb->irb_ire == NULL)
18810Sstevel@tonic-gate 			continue;
18820Sstevel@tonic-gate 		rw_enter(&irb->irb_lock, RW_READER);
18830Sstevel@tonic-gate 		for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
18840Sstevel@tonic-gate 			if ((ire->ire_marks & IRE_MARK_CONDEMNED) ||
18851676Sjpk 			    (ire->ire_zoneid != zoneid &&
18861676Sjpk 			    ire->ire_zoneid != ALL_ZONES))
18870Sstevel@tonic-gate 				continue;
18880Sstevel@tonic-gate 			switch (ire->ire_type) {
18890Sstevel@tonic-gate 			case IRE_LOOPBACK:
18900Sstevel@tonic-gate 				if (maybe == NULL) {
18910Sstevel@tonic-gate 					IRE_REFHOLD(ire);
18920Sstevel@tonic-gate 					maybe = ire;
18930Sstevel@tonic-gate 				}
18940Sstevel@tonic-gate 				break;
18950Sstevel@tonic-gate 			case IRE_LOCAL:
18960Sstevel@tonic-gate 				if (maybe != NULL) {
18970Sstevel@tonic-gate 					ire_refrele(maybe);
18980Sstevel@tonic-gate 				}
18990Sstevel@tonic-gate 				IRE_REFHOLD(ire);
19000Sstevel@tonic-gate 				rw_exit(&irb->irb_lock);
19010Sstevel@tonic-gate 				return (ire);
19020Sstevel@tonic-gate 			}
19030Sstevel@tonic-gate 		}
19040Sstevel@tonic-gate 		rw_exit(&irb->irb_lock);
19050Sstevel@tonic-gate 	}
19060Sstevel@tonic-gate 	return (maybe);
19070Sstevel@tonic-gate }
19080Sstevel@tonic-gate 
19090Sstevel@tonic-gate /*
19100Sstevel@tonic-gate  * If the specified IRE is associated with a particular ILL, return
19110Sstevel@tonic-gate  * that ILL pointer (May be called as writer.).
19120Sstevel@tonic-gate  *
19130Sstevel@tonic-gate  * NOTE : This is not a generic function that can be used always.
19140Sstevel@tonic-gate  * This function always returns the ill of the outgoing packets
19150Sstevel@tonic-gate  * if this ire is used.
19160Sstevel@tonic-gate  */
19170Sstevel@tonic-gate ill_t *
19181676Sjpk ire_to_ill(const ire_t *ire)
19190Sstevel@tonic-gate {
19200Sstevel@tonic-gate 	ill_t *ill = NULL;
19210Sstevel@tonic-gate 
19220Sstevel@tonic-gate 	/*
19230Sstevel@tonic-gate 	 * 1) For an IRE_CACHE, ire_ipif is the one where it obtained
19240Sstevel@tonic-gate 	 *    the source address from. ire_stq is the one where the
19250Sstevel@tonic-gate 	 *    packets will be sent out on. We return that here.
19260Sstevel@tonic-gate 	 *
19270Sstevel@tonic-gate 	 * 2) IRE_BROADCAST normally has a loopback and a non-loopback
19280Sstevel@tonic-gate 	 *    copy and they always exist next to each other with loopback
19290Sstevel@tonic-gate 	 *    copy being the first one. If we are called on the non-loopback
19300Sstevel@tonic-gate 	 *    copy, return the one pointed by ire_stq. If it was called on
19310Sstevel@tonic-gate 	 *    a loopback copy, we still return the one pointed by the next
19320Sstevel@tonic-gate 	 *    ire's ire_stq pointer i.e the one pointed by the non-loopback
19330Sstevel@tonic-gate 	 *    copy. We don't want use ire_ipif as it might represent the
19340Sstevel@tonic-gate 	 *    source address (if we borrow source addresses for
19350Sstevel@tonic-gate 	 *    IRE_BROADCASTS in the future).
19360Sstevel@tonic-gate 	 *    However if an interface is currently coming up, the above
19370Sstevel@tonic-gate 	 *    condition may not hold during that period since the ires
19380Sstevel@tonic-gate 	 *    are added one at a time. Thus one of the pair could have been
19390Sstevel@tonic-gate 	 *    added and the other not yet added.
19402906Snordmark 	 * 3) For many other IREs (e.g., IRE_LOCAL), ire_rfq indicates the ill.
19412906Snordmark 	 * 4) For all others return the ones pointed by ire_ipif->ipif_ill.
19422906Snordmark 	 *    That handles IRE_LOOPBACK.
19430Sstevel@tonic-gate 	 */
19440Sstevel@tonic-gate 
19450Sstevel@tonic-gate 	if (ire->ire_type == IRE_CACHE) {
19460Sstevel@tonic-gate 		ill = (ill_t *)ire->ire_stq->q_ptr;
19470Sstevel@tonic-gate 	} else if (ire->ire_type == IRE_BROADCAST) {
19480Sstevel@tonic-gate 		if (ire->ire_stq != NULL) {
19490Sstevel@tonic-gate 			ill = (ill_t *)ire->ire_stq->q_ptr;
19500Sstevel@tonic-gate 		} else {
19510Sstevel@tonic-gate 			ire_t  *ire_next;
19520Sstevel@tonic-gate 
19530Sstevel@tonic-gate 			ire_next = ire->ire_next;
19540Sstevel@tonic-gate 			if (ire_next != NULL &&
19550Sstevel@tonic-gate 			    ire_next->ire_type == IRE_BROADCAST &&
19560Sstevel@tonic-gate 			    ire_next->ire_addr == ire->ire_addr &&
19570Sstevel@tonic-gate 			    ire_next->ire_ipif == ire->ire_ipif) {
19580Sstevel@tonic-gate 				ill = (ill_t *)ire_next->ire_stq->q_ptr;
19590Sstevel@tonic-gate 			}
19600Sstevel@tonic-gate 		}
19612906Snordmark 	} else if (ire->ire_rfq != NULL) {
19622906Snordmark 		ill = ire->ire_rfq->q_ptr;
19630Sstevel@tonic-gate 	} else if (ire->ire_ipif != NULL) {
19640Sstevel@tonic-gate 		ill = ire->ire_ipif->ipif_ill;
19650Sstevel@tonic-gate 	}
19660Sstevel@tonic-gate 	return (ill);
19670Sstevel@tonic-gate }
19680Sstevel@tonic-gate 
19690Sstevel@tonic-gate /* Arrange to call the specified function for every IRE in the world. */
19700Sstevel@tonic-gate void
19713448Sdh155122 ire_walk(pfv_t func, void *arg, ip_stack_t *ipst)
19720Sstevel@tonic-gate {
19733448Sdh155122 	ire_walk_ipvers(func, arg, 0, ALL_ZONES, ipst);
19740Sstevel@tonic-gate }
19750Sstevel@tonic-gate 
19760Sstevel@tonic-gate void
19773448Sdh155122 ire_walk_v4(pfv_t func, void *arg, zoneid_t zoneid, ip_stack_t *ipst)
19780Sstevel@tonic-gate {
19793448Sdh155122 	ire_walk_ipvers(func, arg, IPV4_VERSION, zoneid, ipst);
19800Sstevel@tonic-gate }
19810Sstevel@tonic-gate 
19820Sstevel@tonic-gate void
19833448Sdh155122 ire_walk_v6(pfv_t func, void *arg, zoneid_t zoneid, ip_stack_t *ipst)
19840Sstevel@tonic-gate {
19853448Sdh155122 	ire_walk_ipvers(func, arg, IPV6_VERSION, zoneid, ipst);
19860Sstevel@tonic-gate }
19870Sstevel@tonic-gate 
19880Sstevel@tonic-gate /*
19890Sstevel@tonic-gate  * Walk a particular version. version == 0 means both v4 and v6.
19900Sstevel@tonic-gate  */
19910Sstevel@tonic-gate static void
19923448Sdh155122 ire_walk_ipvers(pfv_t func, void *arg, uchar_t vers, zoneid_t zoneid,
19933448Sdh155122     ip_stack_t *ipst)
19940Sstevel@tonic-gate {
19950Sstevel@tonic-gate 	if (vers != IPV6_VERSION) {
19962535Ssangeeta 		/*
19972535Ssangeeta 		 * ip_forwarding_table variable doesn't matter for IPv4 since
19983448Sdh155122 		 * ire_walk_ill_tables uses ips_ip_ftable for IPv4.
19992535Ssangeeta 		 */
20000Sstevel@tonic-gate 		ire_walk_ill_tables(0, 0, func, arg, IP_MASK_TABLE_SIZE,
20012535Ssangeeta 		    0, NULL,
20023448Sdh155122 		    ipst->ips_ip_cache_table_size, ipst->ips_ip_cache_table,
20033448Sdh155122 		    NULL, zoneid, ipst);
20040Sstevel@tonic-gate 	}
20050Sstevel@tonic-gate 	if (vers != IPV4_VERSION) {
20060Sstevel@tonic-gate 		ire_walk_ill_tables(0, 0, func, arg, IP6_MASK_TABLE_SIZE,
20073448Sdh155122 		    ipst->ips_ip6_ftable_hash_size,
20083448Sdh155122 		    ipst->ips_ip_forwarding_table_v6,
20093448Sdh155122 		    ipst->ips_ip6_cache_table_size,
20103448Sdh155122 		    ipst->ips_ip_cache_table_v6, NULL, zoneid, ipst);
20110Sstevel@tonic-gate 	}
20120Sstevel@tonic-gate }
20130Sstevel@tonic-gate 
20140Sstevel@tonic-gate /*
20157216Smeem  * Arrange to call the specified function for every IRE that matches the ill.
20160Sstevel@tonic-gate  */
20170Sstevel@tonic-gate void
20181676Sjpk ire_walk_ill(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg,
20190Sstevel@tonic-gate     ill_t *ill)
20200Sstevel@tonic-gate {
20217216Smeem 	uchar_t vers = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION);
20227216Smeem 
20237216Smeem 	ire_walk_ill_ipvers(match_flags, ire_type, func, arg, vers, ill);
20240Sstevel@tonic-gate }
20250Sstevel@tonic-gate 
20260Sstevel@tonic-gate void
20271676Sjpk ire_walk_ill_v4(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg,
20280Sstevel@tonic-gate     ill_t *ill)
20290Sstevel@tonic-gate {
20300Sstevel@tonic-gate 	ire_walk_ill_ipvers(match_flags, ire_type, func, arg, IPV4_VERSION,
20310Sstevel@tonic-gate 	    ill);
20320Sstevel@tonic-gate }
20330Sstevel@tonic-gate 
20340Sstevel@tonic-gate void
20351676Sjpk ire_walk_ill_v6(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg,
20360Sstevel@tonic-gate     ill_t *ill)
20370Sstevel@tonic-gate {
20380Sstevel@tonic-gate 	ire_walk_ill_ipvers(match_flags, ire_type, func, arg, IPV6_VERSION,
20390Sstevel@tonic-gate 	    ill);
20400Sstevel@tonic-gate }
20410Sstevel@tonic-gate 
20420Sstevel@tonic-gate /*
20437216Smeem  * Walk a particular ill and version.
20440Sstevel@tonic-gate  */
20450Sstevel@tonic-gate static void
20460Sstevel@tonic-gate ire_walk_ill_ipvers(uint_t match_flags, uint_t ire_type, pfv_t func,
20471676Sjpk     void *arg, uchar_t vers, ill_t *ill)
20480Sstevel@tonic-gate {
20493448Sdh155122 	ip_stack_t	*ipst = ill->ill_ipst;
20503448Sdh155122 
20517216Smeem 	if (vers == IPV4_VERSION) {
20520Sstevel@tonic-gate 		ire_walk_ill_tables(match_flags, ire_type, func, arg,
20532535Ssangeeta 		    IP_MASK_TABLE_SIZE, 0,
20543448Sdh155122 		    NULL, ipst->ips_ip_cache_table_size,
20553448Sdh155122 		    ipst->ips_ip_cache_table, ill, ALL_ZONES, ipst);
20567216Smeem 	} else if (vers == IPV6_VERSION) {
20570Sstevel@tonic-gate 		ire_walk_ill_tables(match_flags, ire_type, func, arg,
20583448Sdh155122 		    IP6_MASK_TABLE_SIZE, ipst->ips_ip6_ftable_hash_size,
20593448Sdh155122 		    ipst->ips_ip_forwarding_table_v6,
20603448Sdh155122 		    ipst->ips_ip6_cache_table_size,
20613448Sdh155122 		    ipst->ips_ip_cache_table_v6, ill, ALL_ZONES, ipst);
20620Sstevel@tonic-gate 	}
20630Sstevel@tonic-gate }
20640Sstevel@tonic-gate 
20652535Ssangeeta boolean_t
20660Sstevel@tonic-gate ire_walk_ill_match(uint_t match_flags, uint_t ire_type, ire_t *ire,
20673448Sdh155122     ill_t *ill, zoneid_t zoneid, ip_stack_t *ipst)
20680Sstevel@tonic-gate {
20690Sstevel@tonic-gate 	ill_t *ire_stq_ill = NULL;
20700Sstevel@tonic-gate 	ill_t *ire_ipif_ill = NULL;
20710Sstevel@tonic-gate 
20720Sstevel@tonic-gate 	ASSERT(match_flags != 0 || zoneid != ALL_ZONES);
20730Sstevel@tonic-gate 	/*
20748485SPeter.Memishian@Sun.COM 	 * MATCH_IRE_ILL: We match both on ill pointed by ire_stq and
20758485SPeter.Memishian@Sun.COM 	 *    ire_ipif.  Only in the case of IRE_CACHEs can ire_stq and
20768485SPeter.Memishian@Sun.COM 	 *    ire_ipif be pointing to different ills. But we want to keep
20778485SPeter.Memishian@Sun.COM 	 *    this function generic enough for future use. So, we always
20788485SPeter.Memishian@Sun.COM 	 *    try to match on both.  The only caller of this function
20798485SPeter.Memishian@Sun.COM 	 *    ire_walk_ill_tables, will call "func" after we return from
20808485SPeter.Memishian@Sun.COM 	 *    this function. We expect "func" to do the right filtering
20818485SPeter.Memishian@Sun.COM 	 *    of ires in this case.
20820Sstevel@tonic-gate 	 */
20838485SPeter.Memishian@Sun.COM 	if (match_flags & MATCH_IRE_ILL) {
20840Sstevel@tonic-gate 		if (ire->ire_stq != NULL)
20858485SPeter.Memishian@Sun.COM 			ire_stq_ill = ire->ire_stq->q_ptr;
20860Sstevel@tonic-gate 		if (ire->ire_ipif != NULL)
20870Sstevel@tonic-gate 			ire_ipif_ill = ire->ire_ipif->ipif_ill;
20880Sstevel@tonic-gate 	}
20890Sstevel@tonic-gate 
20900Sstevel@tonic-gate 	if (zoneid != ALL_ZONES) {
20910Sstevel@tonic-gate 		/*
20920Sstevel@tonic-gate 		 * We're walking the IREs for a specific zone. The only relevant
20930Sstevel@tonic-gate 		 * IREs are:
20940Sstevel@tonic-gate 		 * - all IREs with a matching ire_zoneid
20950Sstevel@tonic-gate 		 * - all IRE_OFFSUBNETs as they're shared across all zones
20960Sstevel@tonic-gate 		 * - IRE_INTERFACE IREs for interfaces with a usable source addr
20970Sstevel@tonic-gate 		 *   with a matching zone
20980Sstevel@tonic-gate 		 * - IRE_DEFAULTs with a gateway reachable from the zone
20990Sstevel@tonic-gate 		 * We should really match on IRE_OFFSUBNETs and IRE_DEFAULTs
21000Sstevel@tonic-gate 		 * using the same rule; but the above rules are consistent with
21010Sstevel@tonic-gate 		 * the behavior of ire_ftable_lookup[_v6]() so that all the
21020Sstevel@tonic-gate 		 * routes that can be matched during lookup are also matched
21030Sstevel@tonic-gate 		 * here.
21040Sstevel@tonic-gate 		 */
21051676Sjpk 		if (zoneid != ire->ire_zoneid && ire->ire_zoneid != ALL_ZONES) {
21060Sstevel@tonic-gate 			/*
21070Sstevel@tonic-gate 			 * Note, IRE_INTERFACE can have the stq as NULL. For
21080Sstevel@tonic-gate 			 * example, if the default multicast route is tied to
21090Sstevel@tonic-gate 			 * the loopback address.
21100Sstevel@tonic-gate 			 */
21110Sstevel@tonic-gate 			if ((ire->ire_type & IRE_INTERFACE) &&
21120Sstevel@tonic-gate 			    (ire->ire_stq != NULL)) {
21130Sstevel@tonic-gate 				ire_stq_ill = (ill_t *)ire->ire_stq->q_ptr;
21140Sstevel@tonic-gate 				if (ire->ire_ipversion == IPV4_VERSION) {
21150Sstevel@tonic-gate 					if (!ipif_usesrc_avail(ire_stq_ill,
21160Sstevel@tonic-gate 					    zoneid))
21170Sstevel@tonic-gate 						/* No usable src addr in zone */
21180Sstevel@tonic-gate 						return (B_FALSE);
21190Sstevel@tonic-gate 				} else if (ire_stq_ill->ill_usesrc_ifindex
21200Sstevel@tonic-gate 				    != 0) {
21210Sstevel@tonic-gate 					/*
21220Sstevel@tonic-gate 					 * For IPv6 use ipif_select_source_v6()
21230Sstevel@tonic-gate 					 * so the right scope selection is done
21240Sstevel@tonic-gate 					 */
21250Sstevel@tonic-gate 					ipif_t *src_ipif;
21260Sstevel@tonic-gate 					src_ipif =
21270Sstevel@tonic-gate 					    ipif_select_source_v6(ire_stq_ill,
21288485SPeter.Memishian@Sun.COM 					    &ire->ire_addr_v6, B_FALSE,
21290Sstevel@tonic-gate 					    IPV6_PREFER_SRC_DEFAULT,
21300Sstevel@tonic-gate 					    zoneid);
21310Sstevel@tonic-gate 					if (src_ipif != NULL) {
21320Sstevel@tonic-gate 						ipif_refrele(src_ipif);
21330Sstevel@tonic-gate 					} else {
21340Sstevel@tonic-gate 						return (B_FALSE);
21350Sstevel@tonic-gate 					}
21360Sstevel@tonic-gate 				} else {
21370Sstevel@tonic-gate 					return (B_FALSE);
21380Sstevel@tonic-gate 				}
21390Sstevel@tonic-gate 
21400Sstevel@tonic-gate 			} else if (!(ire->ire_type & IRE_OFFSUBNET)) {
21410Sstevel@tonic-gate 				return (B_FALSE);
21420Sstevel@tonic-gate 			}
21430Sstevel@tonic-gate 		}
21440Sstevel@tonic-gate 
21450Sstevel@tonic-gate 		/*
21460Sstevel@tonic-gate 		 * Match all default routes from the global zone, irrespective
21472733Snordmark 		 * of reachability. For a non-global zone only match those
21482733Snordmark 		 * where ire_gateway_addr has a IRE_INTERFACE for the zoneid.
21490Sstevel@tonic-gate 		 */
21500Sstevel@tonic-gate 		if (ire->ire_type == IRE_DEFAULT && zoneid != GLOBAL_ZONEID) {
21510Sstevel@tonic-gate 			int ire_match_flags = 0;
21520Sstevel@tonic-gate 			in6_addr_t gw_addr_v6;
21530Sstevel@tonic-gate 			ire_t *rire;
21540Sstevel@tonic-gate 
21552733Snordmark 			ire_match_flags |= MATCH_IRE_TYPE;
21568485SPeter.Memishian@Sun.COM 			if (ire->ire_ipif != NULL)
21578485SPeter.Memishian@Sun.COM 				ire_match_flags |= MATCH_IRE_ILL;
21588485SPeter.Memishian@Sun.COM 
21590Sstevel@tonic-gate 			if (ire->ire_ipversion == IPV4_VERSION) {
21600Sstevel@tonic-gate 				rire = ire_route_lookup(ire->ire_gateway_addr,
21612733Snordmark 				    0, 0, IRE_INTERFACE, ire->ire_ipif, NULL,
21623448Sdh155122 				    zoneid, NULL, ire_match_flags, ipst);
21630Sstevel@tonic-gate 			} else {
21640Sstevel@tonic-gate 				ASSERT(ire->ire_ipversion == IPV6_VERSION);
21650Sstevel@tonic-gate 				mutex_enter(&ire->ire_lock);
21660Sstevel@tonic-gate 				gw_addr_v6 = ire->ire_gateway_addr_v6;
21670Sstevel@tonic-gate 				mutex_exit(&ire->ire_lock);
21680Sstevel@tonic-gate 				rire = ire_route_lookup_v6(&gw_addr_v6,
21692733Snordmark 				    NULL, NULL, IRE_INTERFACE, ire->ire_ipif,
21703448Sdh155122 				    NULL, zoneid, NULL, ire_match_flags, ipst);
21710Sstevel@tonic-gate 			}
21720Sstevel@tonic-gate 			if (rire == NULL) {
21730Sstevel@tonic-gate 				return (B_FALSE);
21740Sstevel@tonic-gate 			}
21750Sstevel@tonic-gate 			ire_refrele(rire);
21760Sstevel@tonic-gate 		}
21770Sstevel@tonic-gate 	}
21780Sstevel@tonic-gate 
21790Sstevel@tonic-gate 	if (((!(match_flags & MATCH_IRE_TYPE)) ||
21804714Ssowmini 	    (ire->ire_type & ire_type)) &&
21810Sstevel@tonic-gate 	    ((!(match_flags & MATCH_IRE_ILL)) ||
21828485SPeter.Memishian@Sun.COM 	    (ire_stq_ill == ill || ire_ipif_ill == ill ||
21838485SPeter.Memishian@Sun.COM 	    ire_ipif_ill != NULL && IS_IN_SAME_ILLGRP(ire_ipif_ill, ill)))) {
21840Sstevel@tonic-gate 		return (B_TRUE);
21850Sstevel@tonic-gate 	}
21860Sstevel@tonic-gate 	return (B_FALSE);
21870Sstevel@tonic-gate }
21880Sstevel@tonic-gate 
21892535Ssangeeta int
21902535Ssangeeta rtfunc(struct radix_node *rn, void *arg)
21912535Ssangeeta {
21922535Ssangeeta 	struct rtfuncarg *rtf = arg;
21932535Ssangeeta 	struct rt_entry *rt;
21942535Ssangeeta 	irb_t *irb;
21952535Ssangeeta 	ire_t *ire;
21962535Ssangeeta 	boolean_t ret;
21972535Ssangeeta 
21982535Ssangeeta 	rt = (struct rt_entry *)rn;
21992535Ssangeeta 	ASSERT(rt != NULL);
22002535Ssangeeta 	irb = &rt->rt_irb;
22012535Ssangeeta 	for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
22022535Ssangeeta 		if ((rtf->rt_match_flags != 0) ||
22032535Ssangeeta 		    (rtf->rt_zoneid != ALL_ZONES)) {
22042535Ssangeeta 			ret = ire_walk_ill_match(rtf->rt_match_flags,
22052535Ssangeeta 			    rtf->rt_ire_type, ire,
22063448Sdh155122 			    rtf->rt_ill, rtf->rt_zoneid, rtf->rt_ipst);
22072535Ssangeeta 		} else
22082535Ssangeeta 			ret = B_TRUE;
22092535Ssangeeta 		if (ret)
22102535Ssangeeta 			(*rtf->rt_func)(ire, rtf->rt_arg);
22112535Ssangeeta 	}
22122535Ssangeeta 	return (0);
22132535Ssangeeta }
22142535Ssangeeta 
22150Sstevel@tonic-gate /*
22160Sstevel@tonic-gate  * Walk the ftable and the ctable entries that match the ill.
22170Sstevel@tonic-gate  */
22182535Ssangeeta void
22190Sstevel@tonic-gate ire_walk_ill_tables(uint_t match_flags, uint_t ire_type, pfv_t func,
22201676Sjpk     void *arg, size_t ftbl_sz, size_t htbl_sz, irb_t **ipftbl,
22213448Sdh155122     size_t ctbl_sz, irb_t *ipctbl, ill_t *ill, zoneid_t zoneid,
22223448Sdh155122     ip_stack_t *ipst)
22230Sstevel@tonic-gate {
22240Sstevel@tonic-gate 	irb_t	*irb_ptr;
22250Sstevel@tonic-gate 	irb_t	*irb;
22260Sstevel@tonic-gate 	ire_t	*ire;
22270Sstevel@tonic-gate 	int i, j;
22280Sstevel@tonic-gate 	boolean_t ret;
22292535Ssangeeta 	struct rtfuncarg rtfarg;
22300Sstevel@tonic-gate 
22318485SPeter.Memishian@Sun.COM 	ASSERT((!(match_flags & MATCH_IRE_ILL)) || (ill != NULL));
22320Sstevel@tonic-gate 	ASSERT(!(match_flags & MATCH_IRE_TYPE) || (ire_type != 0));
22330Sstevel@tonic-gate 	/*
22340Sstevel@tonic-gate 	 * Optimize by not looking at the forwarding table if there
22350Sstevel@tonic-gate 	 * is a MATCH_IRE_TYPE specified with no IRE_FORWARDTABLE
22360Sstevel@tonic-gate 	 * specified in ire_type.
22370Sstevel@tonic-gate 	 */
22380Sstevel@tonic-gate 	if (!(match_flags & MATCH_IRE_TYPE) ||
22390Sstevel@tonic-gate 	    ((ire_type & IRE_FORWARDTABLE) != 0)) {
22402535Ssangeeta 		/* knobs such that routine is called only for v6 case */
22413448Sdh155122 		if (ipftbl == ipst->ips_ip_forwarding_table_v6) {
22422535Ssangeeta 			for (i = (ftbl_sz - 1);  i >= 0; i--) {
22432535Ssangeeta 				if ((irb_ptr = ipftbl[i]) == NULL)
22440Sstevel@tonic-gate 					continue;
22452535Ssangeeta 				for (j = 0; j < htbl_sz; j++) {
22462535Ssangeeta 					irb = &irb_ptr[j];
22472535Ssangeeta 					if (irb->irb_ire == NULL)
22482535Ssangeeta 						continue;
22492535Ssangeeta 
22502535Ssangeeta 					IRB_REFHOLD(irb);
22512535Ssangeeta 					for (ire = irb->irb_ire; ire != NULL;
22524714Ssowmini 					    ire = ire->ire_next) {
22532535Ssangeeta 						if (match_flags == 0 &&
22542535Ssangeeta 						    zoneid == ALL_ZONES) {
22552535Ssangeeta 							ret = B_TRUE;
22562535Ssangeeta 						} else {
22572535Ssangeeta 							ret =
22582535Ssangeeta 							    ire_walk_ill_match(
22592535Ssangeeta 							    match_flags,
22602535Ssangeeta 							    ire_type, ire, ill,
22613448Sdh155122 							    zoneid, ipst);
22622535Ssangeeta 						}
22632535Ssangeeta 						if (ret)
22642535Ssangeeta 							(*func)(ire, arg);
22650Sstevel@tonic-gate 					}
22662535Ssangeeta 					IRB_REFRELE(irb);
22670Sstevel@tonic-gate 				}
22680Sstevel@tonic-gate 			}
22692535Ssangeeta 		} else {
22702535Ssangeeta 			(void) memset(&rtfarg, 0, sizeof (rtfarg));
22712535Ssangeeta 			rtfarg.rt_func = func;
22722535Ssangeeta 			rtfarg.rt_arg = arg;
22732535Ssangeeta 			if (match_flags != 0) {
22742535Ssangeeta 				rtfarg.rt_match_flags = match_flags;
22752535Ssangeeta 			}
22762535Ssangeeta 			rtfarg.rt_ire_type = ire_type;
22772535Ssangeeta 			rtfarg.rt_ill = ill;
22782535Ssangeeta 			rtfarg.rt_zoneid = zoneid;
22793448Sdh155122 			rtfarg.rt_ipst = ipst;	/* No netstack_hold */
22803448Sdh155122 			(void) ipst->ips_ip_ftable->rnh_walktree_mt(
22813448Sdh155122 			    ipst->ips_ip_ftable,
22823448Sdh155122 			    rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn);
22830Sstevel@tonic-gate 		}
22840Sstevel@tonic-gate 	}
22850Sstevel@tonic-gate 
22860Sstevel@tonic-gate 	/*
22870Sstevel@tonic-gate 	 * Optimize by not looking at the cache table if there
22880Sstevel@tonic-gate 	 * is a MATCH_IRE_TYPE specified with no IRE_CACHETABLE
22890Sstevel@tonic-gate 	 * specified in ire_type.
22900Sstevel@tonic-gate 	 */
22910Sstevel@tonic-gate 	if (!(match_flags & MATCH_IRE_TYPE) ||
22920Sstevel@tonic-gate 	    ((ire_type & IRE_CACHETABLE) != 0)) {
22930Sstevel@tonic-gate 		for (i = 0; i < ctbl_sz;  i++) {
22940Sstevel@tonic-gate 			irb = &ipctbl[i];
22950Sstevel@tonic-gate 			if (irb->irb_ire == NULL)
22960Sstevel@tonic-gate 				continue;
22970Sstevel@tonic-gate 			IRB_REFHOLD(irb);
22980Sstevel@tonic-gate 			for (ire = irb->irb_ire; ire != NULL;
22990Sstevel@tonic-gate 			    ire = ire->ire_next) {
23000Sstevel@tonic-gate 				if (match_flags == 0 && zoneid == ALL_ZONES) {
23010Sstevel@tonic-gate 					ret = B_TRUE;
23020Sstevel@tonic-gate 				} else {
23030Sstevel@tonic-gate 					ret = ire_walk_ill_match(
23040Sstevel@tonic-gate 					    match_flags, ire_type,
23053448Sdh155122 					    ire, ill, zoneid, ipst);
23060Sstevel@tonic-gate 				}
23070Sstevel@tonic-gate 				if (ret)
23080Sstevel@tonic-gate 					(*func)(ire, arg);
23090Sstevel@tonic-gate 			}
23100Sstevel@tonic-gate 			IRB_REFRELE(irb);
23110Sstevel@tonic-gate 		}
23120Sstevel@tonic-gate 	}
23130Sstevel@tonic-gate }
23140Sstevel@tonic-gate 
23150Sstevel@tonic-gate /*
23160Sstevel@tonic-gate  * This function takes a mask and returns
23170Sstevel@tonic-gate  * number of bits set in the mask. If no
23180Sstevel@tonic-gate  * bit is set it returns 0.
23190Sstevel@tonic-gate  * Assumes a contiguous mask.
23200Sstevel@tonic-gate  */
23210Sstevel@tonic-gate int
23220Sstevel@tonic-gate ip_mask_to_plen(ipaddr_t mask)
23230Sstevel@tonic-gate {
23240Sstevel@tonic-gate 	return (mask == 0 ? 0 : IP_ABITS - (ffs(ntohl(mask)) -1));
23250Sstevel@tonic-gate }
23260Sstevel@tonic-gate 
23270Sstevel@tonic-gate /*
23280Sstevel@tonic-gate  * Convert length for a mask to the mask.
23290Sstevel@tonic-gate  */
23300Sstevel@tonic-gate ipaddr_t
23310Sstevel@tonic-gate ip_plen_to_mask(uint_t masklen)
23320Sstevel@tonic-gate {
23330Sstevel@tonic-gate 	return (htonl(IP_HOST_MASK << (IP_ABITS - masklen)));
23340Sstevel@tonic-gate }
23350Sstevel@tonic-gate 
23360Sstevel@tonic-gate void
23370Sstevel@tonic-gate ire_atomic_end(irb_t *irb_ptr, ire_t *ire)
23380Sstevel@tonic-gate {
23398564SPeter.Memishian@Sun.COM 	ill_t *stq_ill, *ipif_ill;
23408564SPeter.Memishian@Sun.COM 	ip_stack_t *ipst = ire->ire_ipst;
23418564SPeter.Memishian@Sun.COM 
23428564SPeter.Memishian@Sun.COM 	stq_ill = ire->ire_stq != NULL ? ire->ire_stq->q_ptr : NULL;
23438564SPeter.Memishian@Sun.COM 	ipif_ill = ire->ire_ipif != NULL ? ire->ire_ipif->ipif_ill : NULL;
23448564SPeter.Memishian@Sun.COM 	RELEASE_ILL_LOCKS(ipif_ill, stq_ill);
23450Sstevel@tonic-gate 	rw_exit(&irb_ptr->irb_lock);
23463448Sdh155122 	rw_exit(&ipst->ips_ill_g_usesrc_lock);
23470Sstevel@tonic-gate }
23480Sstevel@tonic-gate 
23490Sstevel@tonic-gate /*
23500Sstevel@tonic-gate  * ire_add_v[46] atomically make sure that the ipif or ill associated
23510Sstevel@tonic-gate  * with the new ire being added is stable and not IPIF_CHANGING or ILL_CHANGING
23520Sstevel@tonic-gate  * before adding the ire to the table. This ensures that we don't create
23530Sstevel@tonic-gate  * new IRE_CACHEs with stale values for parameters that are passed to
23540Sstevel@tonic-gate  * ire_create such as ire_max_frag. Note that ire_create() is passed a pointer
23550Sstevel@tonic-gate  * to the ipif_mtu, and not the value. The actual value is derived from the
23560Sstevel@tonic-gate  * parent ire or ipif under the bucket lock.
23570Sstevel@tonic-gate  */
23580Sstevel@tonic-gate int
23590Sstevel@tonic-gate ire_atomic_start(irb_t *irb_ptr, ire_t *ire, queue_t *q, mblk_t *mp,
23600Sstevel@tonic-gate     ipsq_func_t func)
23610Sstevel@tonic-gate {
23620Sstevel@tonic-gate 	ill_t	*stq_ill;
23630Sstevel@tonic-gate 	ill_t	*ipif_ill;
23640Sstevel@tonic-gate 	int	error = 0;
23650Sstevel@tonic-gate 	ill_t	*ill = NULL;
23663448Sdh155122 	ip_stack_t	*ipst = ire->ire_ipst;
23670Sstevel@tonic-gate 
23688564SPeter.Memishian@Sun.COM 	stq_ill = ire->ire_stq != NULL ? ire->ire_stq->q_ptr : NULL;
23698564SPeter.Memishian@Sun.COM 	ipif_ill = ire->ire_ipif != NULL ? ire->ire_ipif->ipif_ill : NULL;
23700Sstevel@tonic-gate 
23710Sstevel@tonic-gate 	ASSERT((q != NULL && mp != NULL && func != NULL) ||
23720Sstevel@tonic-gate 	    (q == NULL && mp == NULL && func == NULL));
23733448Sdh155122 	rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER);
23740Sstevel@tonic-gate 	GRAB_CONN_LOCK(q);
23750Sstevel@tonic-gate 	rw_enter(&irb_ptr->irb_lock, RW_WRITER);
23768564SPeter.Memishian@Sun.COM 	GRAB_ILL_LOCKS(ipif_ill, stq_ill);
23770Sstevel@tonic-gate 
23780Sstevel@tonic-gate 	/*
23790Sstevel@tonic-gate 	 * While the IRE is in the process of being added, a user may have
23800Sstevel@tonic-gate 	 * invoked the ifconfig usesrc option on the stq_ill to make it a
23810Sstevel@tonic-gate 	 * usesrc client ILL. Check for this possibility here, if it is true
23820Sstevel@tonic-gate 	 * then we fail adding the IRE_CACHE. Another check is to make sure
23830Sstevel@tonic-gate 	 * that an ipif_ill of an IRE_CACHE being added is not part of a usesrc
23840Sstevel@tonic-gate 	 * group. The ill_g_usesrc_lock is released in ire_atomic_end
23850Sstevel@tonic-gate 	 */
23860Sstevel@tonic-gate 	if ((ire->ire_type & IRE_CACHE) &&
23870Sstevel@tonic-gate 	    (ire->ire_marks & IRE_MARK_USESRC_CHECK)) {
23880Sstevel@tonic-gate 		if (stq_ill->ill_usesrc_ifindex != 0) {
23890Sstevel@tonic-gate 			ASSERT(stq_ill->ill_usesrc_grp_next != NULL);
23900Sstevel@tonic-gate 			if ((ipif_ill->ill_phyint->phyint_ifindex !=
23910Sstevel@tonic-gate 			    stq_ill->ill_usesrc_ifindex) ||
23920Sstevel@tonic-gate 			    (ipif_ill->ill_usesrc_grp_next == NULL) ||
23930Sstevel@tonic-gate 			    (ipif_ill->ill_usesrc_ifindex != 0)) {
23940Sstevel@tonic-gate 				error = EINVAL;
23950Sstevel@tonic-gate 				goto done;
23960Sstevel@tonic-gate 			}
23970Sstevel@tonic-gate 		} else if (ipif_ill->ill_usesrc_grp_next != NULL) {
23980Sstevel@tonic-gate 			error = EINVAL;
23990Sstevel@tonic-gate 			goto done;
24000Sstevel@tonic-gate 		}
24010Sstevel@tonic-gate 	}
24020Sstevel@tonic-gate 
24030Sstevel@tonic-gate 	/*
24048485SPeter.Memishian@Sun.COM 	 * Don't allow IRE's to be created on changing ill's.  Also, since
24058485SPeter.Memishian@Sun.COM 	 * IPMP flags can be set on an ill without quiescing it, if we're not
24068485SPeter.Memishian@Sun.COM 	 * a writer on stq_ill, check that the flags still allow IRE creation.
24070Sstevel@tonic-gate 	 */
24080Sstevel@tonic-gate 	if ((stq_ill != NULL) && !IAM_WRITER_ILL(stq_ill)) {
24090Sstevel@tonic-gate 		if (stq_ill->ill_state_flags & ILL_CHANGING) {
24100Sstevel@tonic-gate 			ill = stq_ill;
24110Sstevel@tonic-gate 			error = EAGAIN;
24128485SPeter.Memishian@Sun.COM 		} else if (IS_UNDER_IPMP(stq_ill)) {
24138485SPeter.Memishian@Sun.COM 			mutex_enter(&stq_ill->ill_phyint->phyint_lock);
24148485SPeter.Memishian@Sun.COM 			if (!ipmp_ill_is_active(stq_ill) &&
24158485SPeter.Memishian@Sun.COM 			    !(ire->ire_marks & IRE_MARK_TESTHIDDEN)) {
24168485SPeter.Memishian@Sun.COM 				error = EINVAL;
24178485SPeter.Memishian@Sun.COM 			}
24188485SPeter.Memishian@Sun.COM 			mutex_exit(&stq_ill->ill_phyint->phyint_lock);
24190Sstevel@tonic-gate 		}
24208485SPeter.Memishian@Sun.COM 		if (error != 0)
24218485SPeter.Memishian@Sun.COM 			goto done;
24220Sstevel@tonic-gate 	}
24230Sstevel@tonic-gate 
24240Sstevel@tonic-gate 	if ((ipif_ill != NULL) && !IAM_WRITER_ILL(ipif_ill) &&
24250Sstevel@tonic-gate 	    (ipif_ill->ill_state_flags & ILL_CHANGING)) {
24260Sstevel@tonic-gate 		ill = ipif_ill;
24270Sstevel@tonic-gate 		error = EAGAIN;
24280Sstevel@tonic-gate 		goto done;
24290Sstevel@tonic-gate 	}
24300Sstevel@tonic-gate 
24310Sstevel@tonic-gate 	if ((ire->ire_ipif != NULL) && !IAM_WRITER_IPIF(ire->ire_ipif) &&
24320Sstevel@tonic-gate 	    (ire->ire_ipif->ipif_state_flags & IPIF_CHANGING)) {
24330Sstevel@tonic-gate 		ill = ire->ire_ipif->ipif_ill;
24340Sstevel@tonic-gate 		ASSERT(ill != NULL);
24350Sstevel@tonic-gate 		error = EAGAIN;
24360Sstevel@tonic-gate 		goto done;
24370Sstevel@tonic-gate 	}
24380Sstevel@tonic-gate 
24390Sstevel@tonic-gate done:
24400Sstevel@tonic-gate 	if (error == EAGAIN && ILL_CAN_WAIT(ill, q)) {
24410Sstevel@tonic-gate 		ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
24420Sstevel@tonic-gate 		mutex_enter(&ipsq->ipsq_lock);
24438485SPeter.Memishian@Sun.COM 		mutex_enter(&ipsq->ipsq_xop->ipx_lock);
24440Sstevel@tonic-gate 		ire_atomic_end(irb_ptr, ire);
24450Sstevel@tonic-gate 		ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
24468485SPeter.Memishian@Sun.COM 		mutex_exit(&ipsq->ipsq_xop->ipx_lock);
24470Sstevel@tonic-gate 		mutex_exit(&ipsq->ipsq_lock);
24480Sstevel@tonic-gate 		error = EINPROGRESS;
24490Sstevel@tonic-gate 	} else if (error != 0) {
24500Sstevel@tonic-gate 		ire_atomic_end(irb_ptr, ire);
24510Sstevel@tonic-gate 	}
24520Sstevel@tonic-gate 
24530Sstevel@tonic-gate 	RELEASE_CONN_LOCK(q);
24540Sstevel@tonic-gate 	return (error);
24550Sstevel@tonic-gate }
24560Sstevel@tonic-gate 
24570Sstevel@tonic-gate /*
24580Sstevel@tonic-gate  * Add a fully initialized IRE to an appropriate table based on
24590Sstevel@tonic-gate  * ire_type.
24602535Ssangeeta  *
24612535Ssangeeta  * allow_unresolved == B_FALSE indicates a legacy code-path call
24622535Ssangeeta  * that has prohibited the addition of incomplete ire's. If this
24632535Ssangeeta  * parameter is B_FALSE, and we find an nce that is in a state other
24642535Ssangeeta  * than ND_REACHABLE, we fail the add. Note that nce_state could be
24654084Ssowmini  * something other than ND_REACHABLE if the nce had just expired and
24664084Ssowmini  * the ire_create preceding the ire_add added a new ND_INITIAL nce.
 * allow_unresolved is forwarded only to ire_add_v4(); ire_add_v6() is
 * called without it.
 *
 * Work done here before handing off to ire_add_v4()/ire_add_v6():
 *  - If the ire is still embedded in an mblk (ire_mp != NULL), it is
 *    copied into an ire_cache allocation and the mblk is freed.
 *  - For an IRE_CACHE whose ire_stq ill has ill_net_type
 *    IRE_IF_RESOLVER, the ipif is looked up again by ire_ipif_seqid and
 *    its source address and zoneid are compared with those of the ire.
 *    In that same case, on a labeled system, an ire that arrived as an
 *    mblk gets its gateway attributes set via tsol_ire_init_gwattr().
 *
 * Return value: on every failure path in this function the ire is passed
 * to ire_delete() and *irep is set to NULL; the error is ENOMEM when the
 * copy cannot be allocated, EINVAL when the ipif check fails, or whatever
 * tsol_ire_init_gwattr() returned. Otherwise *irep is updated to the
 * (possibly copied) ire and the return value of ire_add_v4()/ire_add_v6()
 * is passed back unchanged.
24670Sstevel@tonic-gate  */
24680Sstevel@tonic-gate int
24692535Ssangeeta ire_add(ire_t **irep, queue_t *q, mblk_t *mp, ipsq_func_t func,
24702535Ssangeeta     boolean_t allow_unresolved)
24710Sstevel@tonic-gate {
24720Sstevel@tonic-gate 	ire_t	*ire1;
24730Sstevel@tonic-gate 	ill_t	*stq_ill = NULL;
24740Sstevel@tonic-gate 	ill_t	*ill;
24750Sstevel@tonic-gate 	ipif_t	*ipif = NULL;
24760Sstevel@tonic-gate 	ill_walk_context_t ctx;
24770Sstevel@tonic-gate 	ire_t	*ire = *irep;
24780Sstevel@tonic-gate 	int	error;
24792416Sjarrett 	boolean_t ire_is_mblk = B_FALSE;
24802416Sjarrett 	tsol_gcgrp_t *gcgrp = NULL;
24812416Sjarrett 	tsol_gcgrp_addr_t ga;
24823448Sdh155122 	ip_stack_t	*ipst = ire->ire_ipst;
24830Sstevel@tonic-gate 
24840Sstevel@tonic-gate 	/* get ready for the day when original ire is not created as mblk */
24850Sstevel@tonic-gate 	if (ire->ire_mp != NULL) {
24862416Sjarrett 		ire_is_mblk = B_TRUE;
24870Sstevel@tonic-gate 		/* Copy the ire to a kmem_alloc'ed area */
24880Sstevel@tonic-gate 		ire1 = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
24890Sstevel@tonic-gate 		if (ire1 == NULL) {
24900Sstevel@tonic-gate 			ip1dbg(("ire_add: alloc failed\n"));
24910Sstevel@tonic-gate 			ire_delete(ire);
24920Sstevel@tonic-gate 			*irep = NULL;
24930Sstevel@tonic-gate 			return (ENOMEM);
24940Sstevel@tonic-gate 		}
		/*
		 * The mark is cleared before the structure copy so that the
		 * copy carries the cleared mark as well.
		 */
24952535Ssangeeta 		ire->ire_marks &= ~IRE_MARK_UNCACHED;
24960Sstevel@tonic-gate 		*ire1 = *ire;
24970Sstevel@tonic-gate 		ire1->ire_mp = NULL;
24982535Ssangeeta 		ire1->ire_stq_ifindex = 0;
		/*
		 * NOTE(review): presumably ire_mp is the mblk that holds the
		 * original ire, so the old ire must not be used after this
		 * point; ire is repointed at the copy right below -- confirm.
		 */
24990Sstevel@tonic-gate 		freeb(ire->ire_mp);
25000Sstevel@tonic-gate 		ire = ire1;
25010Sstevel@tonic-gate 	}
25020Sstevel@tonic-gate 	if (ire->ire_stq != NULL)
25038485SPeter.Memishian@Sun.COM 		stq_ill = ire->ire_stq->q_ptr;
25040Sstevel@tonic-gate 
	/*
	 * Only an IRE_CACHE whose send-to ill uses a resolver needs its
	 * ipif re-validated (see the block comment inside the loop).
	 */
25050Sstevel@tonic-gate 	if (stq_ill != NULL && ire->ire_type == IRE_CACHE &&
25060Sstevel@tonic-gate 	    stq_ill->ill_net_type == IRE_IF_RESOLVER) {
25073448Sdh155122 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
25083448Sdh155122 		ill = ILL_START_WALK_ALL(&ctx, ipst);
25090Sstevel@tonic-gate 		for (; ill != NULL; ill = ill_next(&ctx, ill)) {
25100Sstevel@tonic-gate 			mutex_enter(&ill->ill_lock);
25110Sstevel@tonic-gate 			if (ill->ill_state_flags & ILL_CONDEMNED) {
25120Sstevel@tonic-gate 				mutex_exit(&ill->ill_lock);
25130Sstevel@tonic-gate 				continue;
25140Sstevel@tonic-gate 			}
25150Sstevel@tonic-gate 			/*
25160Sstevel@tonic-gate 			 * We need to make sure that the ipif is a valid one
25170Sstevel@tonic-gate 			 * before adding the IRE_CACHE. This happens only
25180Sstevel@tonic-gate 			 * with IRE_CACHE when there is an external resolver.
25190Sstevel@tonic-gate 			 *
25200Sstevel@tonic-gate 			 * We can unplumb a logical interface while the
25210Sstevel@tonic-gate 			 * packet is waiting in ARP with the IRE. Then,
25220Sstevel@tonic-gate 			 * later on when we feed the IRE back, the ipif
25230Sstevel@tonic-gate 			 * has to be re-checked. This can't happen with
25240Sstevel@tonic-gate 			 * NDP currently, as we never queue the IRE with
25250Sstevel@tonic-gate 			 * the packet. We always try to recreate the IRE
25260Sstevel@tonic-gate 			 * when the resolution is completed. But, we do
25270Sstevel@tonic-gate 			 * it for IPv6 also here so that in future if
25280Sstevel@tonic-gate 			 * we have external resolvers, it will work without
25290Sstevel@tonic-gate 			 * any change.
25300Sstevel@tonic-gate 			 */
25310Sstevel@tonic-gate 			ipif = ipif_lookup_seqid(ill, ire->ire_ipif_seqid);
25320Sstevel@tonic-gate 			if (ipif != NULL) {
				/*
				 * The hold is taken while ill_lock is still
				 * held; every return path below that sees a
				 * non-NULL ipif drops it with ipif_refrele().
				 */
25330Sstevel@tonic-gate 				ipif_refhold_locked(ipif);
25340Sstevel@tonic-gate 				mutex_exit(&ill->ill_lock);
25350Sstevel@tonic-gate 				break;
25360Sstevel@tonic-gate 			}
25370Sstevel@tonic-gate 			mutex_exit(&ill->ill_lock);
25380Sstevel@tonic-gate 		}
25393448Sdh155122 		rw_exit(&ipst->ips_ill_g_lock);
		/*
		 * Fail the add if no ipif was found, if the source address
		 * of the ire differs from that of the ipif (for IPv6 only
		 * when the ire source address is not unspecified), or if the
		 * zoneids differ.
		 */
25400Sstevel@tonic-gate 		if (ipif == NULL ||
25410Sstevel@tonic-gate 		    (ipif->ipif_isv6 &&
25428485SPeter.Memishian@Sun.COM 		    !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6) &&
25430Sstevel@tonic-gate 		    !IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6,
25440Sstevel@tonic-gate 		    &ipif->ipif_v6src_addr)) ||
25450Sstevel@tonic-gate 		    (!ipif->ipif_isv6 &&
25460Sstevel@tonic-gate 		    ire->ire_src_addr != ipif->ipif_src_addr) ||
25471676Sjpk 		    ire->ire_zoneid != ipif->ipif_zoneid) {
25480Sstevel@tonic-gate 			if (ipif != NULL)
25490Sstevel@tonic-gate 				ipif_refrele(ipif);
			/*
			 * NOTE(review): presumably ire_ipif is cleared so that
			 * ire_delete() does not act on an ipif the ire holds
			 * no reference on (ire_add_v4 carries a similar
			 * remark) -- confirm.
			 */
25500Sstevel@tonic-gate 			ire->ire_ipif = NULL;
25510Sstevel@tonic-gate 			ire_delete(ire);
25520Sstevel@tonic-gate 			*irep = NULL;
25530Sstevel@tonic-gate 			return (EINVAL);
25540Sstevel@tonic-gate 		}
25550Sstevel@tonic-gate 
		/*
		 * ipif is non-NULL here, so the walk above ended through the
		 * break and ill still names the ill on which it was found.
		 */
25560Sstevel@tonic-gate 		ASSERT(ill != NULL);
25572416Sjarrett 
25582416Sjarrett 		/*
25592416Sjarrett 		 * Since we didn't attach label security attributes to the
25602416Sjarrett 		 * ire for the resolver case, we need to add it now. (only
25612416Sjarrett 		 * for v4 resolver and v6 xresolv case).
25622416Sjarrett 		 */
25632416Sjarrett 		if (is_system_labeled() && ire_is_mblk) {
			/*
			 * The lookup key is the gateway address when one is
			 * set, otherwise the destination address of the ire.
			 */
25642416Sjarrett 			if (ire->ire_ipversion == IPV4_VERSION) {
25652416Sjarrett 				ga.ga_af = AF_INET;
25662416Sjarrett 				IN6_IPADDR_TO_V4MAPPED(ire->ire_gateway_addr !=
25672416Sjarrett 				    INADDR_ANY ? ire->ire_gateway_addr :
25682416Sjarrett 				    ire->ire_addr, &ga.ga_addr);
25692416Sjarrett 			} else {
25702416Sjarrett 				ga.ga_af = AF_INET6;
25712416Sjarrett 				ga.ga_addr = IN6_IS_ADDR_UNSPECIFIED(
25722416Sjarrett 				    &ire->ire_gateway_addr_v6) ?
25732416Sjarrett 				    ire->ire_addr_v6 :
25742416Sjarrett 				    ire->ire_gateway_addr_v6;
25752416Sjarrett 			}
25762416Sjarrett 			gcgrp = gcgrp_lookup(&ga, B_FALSE);
25772416Sjarrett 			error = tsol_ire_init_gwattr(ire, ire->ire_ipversion,
25782416Sjarrett 			    NULL, gcgrp);
25792416Sjarrett 			if (error != 0) {
				/*
				 * NOTE(review): the gcgrp is released only on
				 * failure; presumably on success the reference
				 * stays with the ire -- confirm against
				 * tsol_ire_init_gwattr().
				 */
25802416Sjarrett 				if (gcgrp != NULL) {
25812416Sjarrett 					GCGRP_REFRELE(gcgrp);
25822416Sjarrett 					gcgrp = NULL;
25832416Sjarrett 				}
25842416Sjarrett 				ipif_refrele(ipif);
25852416Sjarrett 				ire->ire_ipif = NULL;
25862416Sjarrett 				ire_delete(ire);
25872416Sjarrett 				*irep = NULL;
25882416Sjarrett 				return (error);
25892416Sjarrett 			}
25902416Sjarrett 		}
25910Sstevel@tonic-gate 	}
25920Sstevel@tonic-gate 
25930Sstevel@tonic-gate 	/*
25940Sstevel@tonic-gate 	 * In case ire was changed
25950Sstevel@tonic-gate 	 */
25960Sstevel@tonic-gate 	*irep = ire;
25974823Sseb 	if (ire->ire_ipversion == IPV6_VERSION)
25980Sstevel@tonic-gate 		error = ire_add_v6(irep, q, mp, func);
25994823Sseb 	else
26004823Sseb 		error = ire_add_v4(irep, q, mp, func, allow_unresolved);
	/* Drop the hold taken on the ipif during the ill walk, if any. */
26010Sstevel@tonic-gate 	if (ipif != NULL)
26020Sstevel@tonic-gate 		ipif_refrele(ipif);
26030Sstevel@tonic-gate 	return (error);
26040Sstevel@tonic-gate }
26050Sstevel@tonic-gate 
26060Sstevel@tonic-gate /*
26072416Sjarrett  * Add an initialized IRE to an appropriate table based on ire_type.
26080Sstevel@tonic-gate  *
26093004Sdd193516  * The forward table contains IRE_PREFIX/IRE_HOST and
26100Sstevel@tonic-gate  * IRE_IF_RESOLVER/IRE_IF_NORESOLVER and IRE_DEFAULT.
26110Sstevel@tonic-gate  *
26120Sstevel@tonic-gate  * The cache table contains IRE_BROADCAST/IRE_LOCAL/IRE_LOOPBACK
26130Sstevel@tonic-gate  * and IRE_CACHE.
26140Sstevel@tonic-gate  *
26150Sstevel@tonic-gate  * NOTE : This function is called as writer though not required
26160Sstevel@tonic-gate  * by this function.
26170Sstevel@tonic-gate  */
26180Sstevel@tonic-gate static int
26192535Ssangeeta ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func,
26202535Ssangeeta     boolean_t allow_unresolved)
26210Sstevel@tonic-gate {
26220Sstevel@tonic-gate 	ire_t	*ire1;
26230Sstevel@tonic-gate 	irb_t	*irb_ptr;
26240Sstevel@tonic-gate 	ire_t	**irep;
26250Sstevel@tonic-gate 	int	flags;
26260Sstevel@tonic-gate 	ire_t	*pire = NULL;
26270Sstevel@tonic-gate 	ill_t	*stq_ill;
26280Sstevel@tonic-gate 	ire_t	*ire = *ire_p;
26290Sstevel@tonic-gate 	int	error;
26302535Ssangeeta 	boolean_t need_refrele = B_FALSE;
26312535Ssangeeta 	nce_t	*nce;
26323448Sdh155122 	ip_stack_t	*ipst = ire->ire_ipst;
26338485SPeter.Memishian@Sun.COM 	uint_t	marks = 0;
26348485SPeter.Memishian@Sun.COM 
26358485SPeter.Memishian@Sun.COM 	/*
26368485SPeter.Memishian@Sun.COM 	 * IREs with source addresses hosted on interfaces that are under IPMP
26378485SPeter.Memishian@Sun.COM 	 * should be hidden so that applications don't accidentally end up
26388485SPeter.Memishian@Sun.COM 	 * sending packets with test addresses as their source addresses, or
26398485SPeter.Memishian@Sun.COM 	 * sending out interfaces that are e.g. IFF_INACTIVE.  Hide them here.
26408485SPeter.Memishian@Sun.COM 	 */
26418485SPeter.Memishian@Sun.COM 	if (ire->ire_ipif != NULL && IS_UNDER_IPMP(ire->ire_ipif->ipif_ill))
26428485SPeter.Memishian@Sun.COM 		marks |= IRE_MARK_TESTHIDDEN;
26430Sstevel@tonic-gate 
26440Sstevel@tonic-gate 	if (ire->ire_ipif != NULL)
26450Sstevel@tonic-gate 		ASSERT(!MUTEX_HELD(&ire->ire_ipif->ipif_ill->ill_lock));
26460Sstevel@tonic-gate 	if (ire->ire_stq != NULL)
26470Sstevel@tonic-gate 		ASSERT(!MUTEX_HELD(
26480Sstevel@tonic-gate 		    &((ill_t *)(ire->ire_stq->q_ptr))->ill_lock));
26490Sstevel@tonic-gate 	ASSERT(ire->ire_ipversion == IPV4_VERSION);
26500Sstevel@tonic-gate 	ASSERT(ire->ire_mp == NULL); /* Calls should go through ire_add */
26510Sstevel@tonic-gate 
26520Sstevel@tonic-gate 	/* Find the appropriate list head. */
26530Sstevel@tonic-gate 	switch (ire->ire_type) {
26540Sstevel@tonic-gate 	case IRE_HOST:
26550Sstevel@tonic-gate 		ire->ire_mask = IP_HOST_MASK;
26560Sstevel@tonic-gate 		ire->ire_masklen = IP_ABITS;
26578485SPeter.Memishian@Sun.COM 		ire->ire_marks |= marks;
26580Sstevel@tonic-gate 		if ((ire->ire_flags & RTF_SETSRC) == 0)
26590Sstevel@tonic-gate 			ire->ire_src_addr = 0;
26600Sstevel@tonic-gate 		break;
26610Sstevel@tonic-gate 	case IRE_CACHE:
26628485SPeter.Memishian@Sun.COM 		ire->ire_mask = IP_HOST_MASK;
26638485SPeter.Memishian@Sun.COM 		ire->ire_masklen = IP_ABITS;
26648485SPeter.Memishian@Sun.COM 		ire->ire_marks |= marks;
26658485SPeter.Memishian@Sun.COM 		break;
26660Sstevel@tonic-gate 	case IRE_BROADCAST:
26670Sstevel@tonic-gate 	case IRE_LOCAL:
26680Sstevel@tonic-gate 	case IRE_LOOPBACK:
26690Sstevel@tonic-gate 		ire->ire_mask = IP_HOST_MASK;
26700Sstevel@tonic-gate 		ire->ire_masklen = IP_ABITS;
26710Sstevel@tonic-gate 		break;
26720Sstevel@tonic-gate 	case IRE_PREFIX:
26730Sstevel@tonic-gate 	case IRE_DEFAULT:
26748485SPeter.Memishian@Sun.COM 		ire->ire_marks |= marks;
26750Sstevel@tonic-gate 		if ((ire->ire_flags & RTF_SETSRC) == 0)
26760Sstevel@tonic-gate 			ire->ire_src_addr = 0;
26770Sstevel@tonic-gate 		break;
26780Sstevel@tonic-gate 	case IRE_IF_RESOLVER:
26790Sstevel@tonic-gate 	case IRE_IF_NORESOLVER:
26808485SPeter.Memishian@Sun.COM 		ire->ire_marks |= marks;
26810Sstevel@tonic-gate 		break;
26820Sstevel@tonic-gate 	default:
26832535Ssangeeta 		ip0dbg(("ire_add_v4: ire %p has unrecognized IRE type (%d)\n",
26842535Ssangeeta 		    (void *)ire, ire->ire_type));
26850Sstevel@tonic-gate 		ire_delete(ire);
26860Sstevel@tonic-gate 		*ire_p = NULL;
26870Sstevel@tonic-gate 		return (EINVAL);
26880Sstevel@tonic-gate 	}
26890Sstevel@tonic-gate 
26900Sstevel@tonic-gate 	/* Make sure the address is properly masked. */
26910Sstevel@tonic-gate 	ire->ire_addr &= ire->ire_mask;
26920Sstevel@tonic-gate 
26930Sstevel@tonic-gate 	/*
26940Sstevel@tonic-gate 	 * ip_newroute/ip_newroute_multi are unable to prevent the deletion
26950Sstevel@tonic-gate 	 * of the interface route while adding an IRE_CACHE for an on-link
26960Sstevel@tonic-gate 	 * destination in the IRE_IF_RESOLVER case, since the ire has to
26970Sstevel@tonic-gate 	 * go to ARP and return. We can't do a REFHOLD on the
26980Sstevel@tonic-gate 	 * associated interface ire for fear of ARP freeing the message.
26990Sstevel@tonic-gate 	 * Here we look up the interface ire in the forwarding table and
27000Sstevel@tonic-gate 	 * make sure that the interface route has not been deleted.
27010Sstevel@tonic-gate 	 */
27020Sstevel@tonic-gate 	if (ire->ire_type == IRE_CACHE && ire->ire_gateway_addr == 0 &&
27030Sstevel@tonic-gate 	    ((ill_t *)ire->ire_stq->q_ptr)->ill_net_type == IRE_IF_RESOLVER) {
27042535Ssangeeta 
27050Sstevel@tonic-gate 		ASSERT(ire->ire_max_fragp == NULL);
27060Sstevel@tonic-gate 		if (CLASSD(ire->ire_addr) && !(ire->ire_flags & RTF_SETSRC)) {
27070Sstevel@tonic-gate 			/*
27080Sstevel@tonic-gate 			 * The ihandle that we used in ip_newroute_multi
27090Sstevel@tonic-gate 			 * comes from the interface route corresponding
27100Sstevel@tonic-gate 			 * to ire_ipif. Lookup here to see if it exists
27110Sstevel@tonic-gate 			 * still.
27120Sstevel@tonic-gate 			 * If the ire has a source address assigned using
27130Sstevel@tonic-gate 			 * RTF_SETSRC, ire_ipif is the logical interface holding
27140Sstevel@tonic-gate 			 * this source address, so we can't use it to check for
27150Sstevel@tonic-gate 			 * the existence of the interface route. Instead we rely
27160Sstevel@tonic-gate 			 * on the brute force ihandle search in
27170Sstevel@tonic-gate 			 * ire_ihandle_lookup_onlink() below.
27180Sstevel@tonic-gate 			 */
27190Sstevel@tonic-gate 			pire = ipif_to_ire(ire->ire_ipif);
27200Sstevel@tonic-gate 			if (pire == NULL) {
27210Sstevel@tonic-gate 				ire_delete(ire);
27220Sstevel@tonic-gate 				*ire_p = NULL;
27230Sstevel@tonic-gate 				return (EINVAL);
27240Sstevel@tonic-gate 			} else if (pire->ire_ihandle != ire->ire_ihandle) {
27250Sstevel@tonic-gate 				ire_refrele(pire);
27260Sstevel@tonic-gate 				ire_delete(ire);
27270Sstevel@tonic-gate 				*ire_p = NULL;
27280Sstevel@tonic-gate 				return (EINVAL);
27290Sstevel@tonic-gate 			}
27300Sstevel@tonic-gate 		} else {
27310Sstevel@tonic-gate 			pire = ire_ihandle_lookup_onlink(ire);
27320Sstevel@tonic-gate 			if (pire == NULL) {
27330Sstevel@tonic-gate 				ire_delete(ire);
27340Sstevel@tonic-gate 				*ire_p = NULL;
27350Sstevel@tonic-gate 				return (EINVAL);
27360Sstevel@tonic-gate 			}
27370Sstevel@tonic-gate 		}
27380Sstevel@tonic-gate 		/* Prevent pire from getting deleted */
27390Sstevel@tonic-gate 		IRB_REFHOLD(pire->ire_bucket);
27400Sstevel@tonic-gate 		/* Has it been removed already ? */
27410Sstevel@tonic-gate 		if (pire->ire_marks & IRE_MARK_CONDEMNED) {
27420Sstevel@tonic-gate 			IRB_REFRELE(pire->ire_bucket);
27430Sstevel@tonic-gate 			ire_refrele(pire);
27440Sstevel@tonic-gate 			ire_delete(ire);
27450Sstevel@tonic-gate 			*ire_p = NULL;
27460Sstevel@tonic-gate 			return (EINVAL);
27470Sstevel@tonic-gate 		}
27480Sstevel@tonic-gate 	} else {
27490Sstevel@tonic-gate 		ASSERT(ire->ire_max_fragp != NULL);
27500Sstevel@tonic-gate 	}
27510Sstevel@tonic-gate 	flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW);
27520Sstevel@tonic-gate 
27530Sstevel@tonic-gate 	if (ire->ire_ipif != NULL) {
27540Sstevel@tonic-gate 		/*
27550Sstevel@tonic-gate 		 * We use MATCH_IRE_IPIF while adding IRE_CACHES only
27560Sstevel@tonic-gate 		 * for historic reasons and to maintain symmetry with
27570Sstevel@tonic-gate 		 * IPv6 code path. Historically this was used by
27580Sstevel@tonic-gate 		 * multicast code to create multiple IRE_CACHES on
27590Sstevel@tonic-gate 		 * a single ill with different ipifs. This was used
27600Sstevel@tonic-gate 		 * so that multicast packets leaving the node had the
27610Sstevel@tonic-gate 		 * right source address. This is no longer needed as
27620Sstevel@tonic-gate 		 * ip_wput initializes the address correctly.
27630Sstevel@tonic-gate 		 */
27640Sstevel@tonic-gate 		flags |= MATCH_IRE_IPIF;
27650Sstevel@tonic-gate 		/*
27668485SPeter.Memishian@Sun.COM 		 * If we are creating a hidden IRE, make sure we search for
27678485SPeter.Memishian@Sun.COM 		 * hidden IREs when searching for duplicates below.
27688485SPeter.Memishian@Sun.COM 		 * Otherwise, we might find an IRE on some other interface
27698485SPeter.Memishian@Sun.COM 		 * that's not marked hidden.
27700Sstevel@tonic-gate 		 */
27718485SPeter.Memishian@Sun.COM 		if (ire->ire_marks & IRE_MARK_TESTHIDDEN)
27728485SPeter.Memishian@Sun.COM 			flags |= MATCH_IRE_MARK_TESTHIDDEN;
27730Sstevel@tonic-gate 	}
27742535Ssangeeta 	if ((ire->ire_type & IRE_CACHETABLE) == 0) {
27752535Ssangeeta 		irb_ptr = ire_get_bucket(ire);
27762535Ssangeeta 		need_refrele = B_TRUE;
27772535Ssangeeta 		if (irb_ptr == NULL) {
27782535Ssangeeta 			/*
27792535Ssangeeta 			 * This assumes that the ire has not added
27802535Ssangeeta 			 * a reference to the ipif.
27812535Ssangeeta 			 */
27822535Ssangeeta 			ire->ire_ipif = NULL;
27832535Ssangeeta 			ire_delete(ire);
27842535Ssangeeta 			if (pire != NULL) {
27852535Ssangeeta 				IRB_REFRELE(pire->ire_bucket);
27862535Ssangeeta 				ire_refrele(pire);
27872535Ssangeeta 			}
27882535Ssangeeta 			*ire_p = NULL;
27892535Ssangeeta 			return (EINVAL);
27902535Ssangeeta 		}
27912535Ssangeeta 	} else {
27923448Sdh155122 		irb_ptr = &(ipst->ips_ip_cache_table[IRE_ADDR_HASH(
27933448Sdh155122 		    ire->ire_addr, ipst->ips_ip_cache_table_size)]);
27942535Ssangeeta 	}
27950Sstevel@tonic-gate 
27960Sstevel@tonic-gate 	/*
27970Sstevel@tonic-gate 	 * Start the atomic add of the ire. Grab the ill locks,
27980Sstevel@tonic-gate 	 * ill_g_usesrc_lock and the bucket lock. Check for condemned
27990Sstevel@tonic-gate 	 *
28000Sstevel@tonic-gate 	 * If ipif or ill is changing ire_atomic_start() may queue the
28010Sstevel@tonic-gate 	 * request and return EINPROGRESS.
28023448Sdh155122 	 * To avoid lock order problems, get the ndp4->ndp_g_lock.
28030Sstevel@tonic-gate 	 */
28043448Sdh155122 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
28050Sstevel@tonic-gate 	error = ire_atomic_start(irb_ptr, ire, q, mp, func);
28060Sstevel@tonic-gate 	if (error != 0) {
28073448Sdh155122 		mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
28080Sstevel@tonic-gate 		/*
28090Sstevel@tonic-gate 		 * We don't know whether it is a valid ipif or not.
28100Sstevel@tonic-gate 		 * So, set it to NULL. This assumes that the ire has not added
28110Sstevel@tonic-gate 		 * a reference to the ipif.
28120Sstevel@tonic-gate 		 */
28130Sstevel@tonic-gate 		ire->ire_ipif = NULL;
28140Sstevel@tonic-gate 		ire_delete(ire);
28150Sstevel@tonic-gate 		if (pire != NULL) {
28160Sstevel@tonic-gate 			IRB_REFRELE(pire->ire_bucket);
28170Sstevel@tonic-gate 			ire_refrele(pire);
28180Sstevel@tonic-gate 		}
28190Sstevel@tonic-gate 		*ire_p = NULL;
28202535Ssangeeta 		if (need_refrele)
28212535Ssangeeta 			IRB_REFRELE(irb_ptr);
28220Sstevel@tonic-gate 		return (error);
28230Sstevel@tonic-gate 	}
28240Sstevel@tonic-gate 	/*
28250Sstevel@tonic-gate 	 * To avoid creating ires having stale values for the ire_max_frag
28260Sstevel@tonic-gate 	 * we get the latest value atomically here. For more details
28270Sstevel@tonic-gate 	 * see the block comment in ip_sioctl_mtu and in DL_NOTE_SDU_CHANGE
28280Sstevel@tonic-gate 	 * in ip_rput_dlpi_writer
28290Sstevel@tonic-gate 	 */
28300Sstevel@tonic-gate 	if (ire->ire_max_fragp == NULL) {
28310Sstevel@tonic-gate 		if (CLASSD(ire->ire_addr))
28320Sstevel@tonic-gate 			ire->ire_max_frag = ire->ire_ipif->ipif_mtu;
28330Sstevel@tonic-gate 		else
28340Sstevel@tonic-gate 			ire->ire_max_frag = pire->ire_max_frag;
28350Sstevel@tonic-gate 	} else {
28360Sstevel@tonic-gate 		uint_t	max_frag;
28370Sstevel@tonic-gate 
28380Sstevel@tonic-gate 		max_frag = *ire->ire_max_fragp;
28390Sstevel@tonic-gate 		ire->ire_max_fragp = NULL;
28400Sstevel@tonic-gate 		ire->ire_max_frag = max_frag;
28410Sstevel@tonic-gate 	}
28420Sstevel@tonic-gate 	/*
28430Sstevel@tonic-gate 	 * Atomically check for duplicate and insert in the table.
28440Sstevel@tonic-gate 	 */
28450Sstevel@tonic-gate 	for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
28460Sstevel@tonic-gate 		if (ire1->ire_marks & IRE_MARK_CONDEMNED)
28470Sstevel@tonic-gate 			continue;
28480Sstevel@tonic-gate 		if (ire->ire_ipif != NULL) {
28490Sstevel@tonic-gate 			/*
28500Sstevel@tonic-gate 			 * We do MATCH_IRE_ILL implicitly here for IREs
28510Sstevel@tonic-gate 			 * with a non-null ire_ipif, including IRE_CACHEs.
28520Sstevel@tonic-gate 			 * As ire_ipif and ire_stq could point to two
28530Sstevel@tonic-gate 			 * different ills, we can't pass just ire_ipif to
28540Sstevel@tonic-gate 			 * ire_match_args and get a match on both ills.
28550Sstevel@tonic-gate 			 * This is just needed for duplicate checks here and
28560Sstevel@tonic-gate 			 * so we don't add an extra argument to
28570Sstevel@tonic-gate 			 * ire_match_args for this. Do it locally.
28580Sstevel@tonic-gate 			 *
28590Sstevel@tonic-gate 			 * NOTE : Currently there is no part of the code
28600Sstevel@tonic-gate 			 * that asks for both MATCH_IRE_IPIF and MATCH_IRE_ILL
28610Sstevel@tonic-gate 			 * match for IRE_CACHEs. Thus we don't want to
28620Sstevel@tonic-gate 			 * extend the arguments to ire_match_args.
28630Sstevel@tonic-gate 			 */
28640Sstevel@tonic-gate 			if (ire1->ire_stq != ire->ire_stq)
28650Sstevel@tonic-gate 				continue;
28660Sstevel@tonic-gate 			/*
28670Sstevel@tonic-gate 			 * Multiroute IRE_CACHEs for a given destination can
28680Sstevel@tonic-gate 			 * have the same ire_ipif, typically if their source
28690Sstevel@tonic-gate 			 * address is forced using RTF_SETSRC, and the same
28700Sstevel@tonic-gate 			 * send-to queue. We differentiate them using the parent
28710Sstevel@tonic-gate 			 * handle.
28720Sstevel@tonic-gate 			 */
28730Sstevel@tonic-gate 			if (ire->ire_type == IRE_CACHE &&
28740Sstevel@tonic-gate 			    (ire1->ire_flags & RTF_MULTIRT) &&
28750Sstevel@tonic-gate 			    (ire->ire_flags & RTF_MULTIRT) &&
28760Sstevel@tonic-gate 			    (ire1->ire_phandle != ire->ire_phandle))
28770Sstevel@tonic-gate 				continue;
28780Sstevel@tonic-gate 		}
28790Sstevel@tonic-gate 		if (ire1->ire_zoneid != ire->ire_zoneid)
28800Sstevel@tonic-gate 			continue;
28810Sstevel@tonic-gate 		if (ire_match_args(ire1, ire->ire_addr, ire->ire_mask,
28820Sstevel@tonic-gate 		    ire->ire_gateway_addr, ire->ire_type, ire->ire_ipif,
28837880SJonathan.Anderson@Sun.COM 		    ire->ire_zoneid, 0, NULL, flags, NULL)) {
28840Sstevel@tonic-gate 			/*
28850Sstevel@tonic-gate 			 * Return the old ire after doing a REFHOLD.
28860Sstevel@tonic-gate 			 * As most of the callers continue to use the IRE
28870Sstevel@tonic-gate 			 * after adding, we return a held ire. This will
28880Sstevel@tonic-gate 			 * avoid a lookup in the caller again. If the callers
28890Sstevel@tonic-gate 			 * don't want to use it, they need to do a REFRELE.
28900Sstevel@tonic-gate 			 */
28918485SPeter.Memishian@Sun.COM 			ip1dbg(("found dup ire existing %p new %p\n",
28920Sstevel@tonic-gate 			    (void *)ire1, (void *)ire));
28930Sstevel@tonic-gate 			IRE_REFHOLD(ire1);
28940Sstevel@tonic-gate 			ire_atomic_end(irb_ptr, ire);
28953448Sdh155122 			mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
28960Sstevel@tonic-gate 			ire_delete(ire);
28970Sstevel@tonic-gate 			if (pire != NULL) {
28980Sstevel@tonic-gate 				/*
28990Sstevel@tonic-gate 				 * Assert that it is not removed from the
29000Sstevel@tonic-gate 				 * list yet.
29010Sstevel@tonic-gate 				 */
29020Sstevel@tonic-gate 				ASSERT(pire->ire_ptpn != NULL);
29030Sstevel@tonic-gate 				IRB_REFRELE(pire->ire_bucket);
29040Sstevel@tonic-gate 				ire_refrele(pire);
29050Sstevel@tonic-gate 			}
29060Sstevel@tonic-gate 			*ire_p = ire1;
29072535Ssangeeta 			if (need_refrele)
29082535Ssangeeta 				IRB_REFRELE(irb_ptr);
29090Sstevel@tonic-gate 			return (0);
29100Sstevel@tonic-gate 		}
29110Sstevel@tonic-gate 	}
29128485SPeter.Memishian@Sun.COM 
29132535Ssangeeta 	if (ire->ire_type & IRE_CACHE) {
29142535Ssangeeta 		ASSERT(ire->ire_stq != NULL);
29152535Ssangeeta 		nce = ndp_lookup_v4(ire_to_ill(ire),
29162535Ssangeeta 		    ((ire->ire_gateway_addr != INADDR_ANY) ?
29172535Ssangeeta 		    &ire->ire_gateway_addr : &ire->ire_addr),
29182535Ssangeeta 		    B_TRUE);
29192535Ssangeeta 		if (nce != NULL)
29202535Ssangeeta 			mutex_enter(&nce->nce_lock);
29212535Ssangeeta 		/*
29222535Ssangeeta 		 * if the nce is NCE_F_CONDEMNED, or if it is not ND_REACHABLE
29232535Ssangeeta 		 * and the caller has prohibited the addition of incomplete
29242535Ssangeeta 		 * ire's, we fail the add. Note that nce_state could be
29254084Ssowmini 		 * something other than ND_REACHABLE if the nce had
29264084Ssowmini 		 * just expired and the ire_create preceding the
29274084Ssowmini 		 * ire_add added a new ND_INITIAL nce.
29282535Ssangeeta 		 */
29292535Ssangeeta 		if ((nce == NULL) ||
29302535Ssangeeta 		    (nce->nce_flags & NCE_F_CONDEMNED) ||
29312535Ssangeeta 		    (!allow_unresolved &&
29323397Ssangeeta 		    (nce->nce_state != ND_REACHABLE))) {
29334084Ssowmini 			if (nce != NULL) {
29344084Ssowmini 				DTRACE_PROBE1(ire__bad__nce, nce_t *, nce);
29352535Ssangeeta 				mutex_exit(&nce->nce_lock);
29364084Ssowmini 			}
29372535Ssangeeta 			ire_atomic_end(irb_ptr, ire);
29383448Sdh155122 			mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
29392535Ssangeeta 			if (nce != NULL)
29402535Ssangeeta 				NCE_REFRELE(nce);
29412535Ssangeeta 			DTRACE_PROBE1(ire__no__nce, ire_t *, ire);
29422535Ssangeeta 			ire_delete(ire);
29432535Ssangeeta 			if (pire != NULL) {
29442535Ssangeeta 				IRB_REFRELE(pire->ire_bucket);
29452535Ssangeeta 				ire_refrele(pire);
29462535Ssangeeta 			}
29472535Ssangeeta 			*ire_p = NULL;
29482535Ssangeeta 			if (need_refrele)
29492535Ssangeeta 				IRB_REFRELE(irb_ptr);
29502535Ssangeeta 			return (EINVAL);
29512535Ssangeeta 		} else {
29522535Ssangeeta 			ire->ire_nce = nce;
29532535Ssangeeta 			mutex_exit(&nce->nce_lock);
29542535Ssangeeta 			/*
29552535Ssangeeta 			 * We are associating this nce to the ire, so
29562535Ssangeeta 			 * change the nce ref taken in ndp_lookup_v4() from
29572535Ssangeeta 			 * NCE_REFHOLD to NCE_REFHOLD_NOTR
29582535Ssangeeta 			 */
29592535Ssangeeta 			NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce);
29602535Ssangeeta 		}
29612535Ssangeeta 	}
29620Sstevel@tonic-gate 	/*
29630Sstevel@tonic-gate 	 * Make it easy for ip_wput_ire() to hit multiple broadcast ires by
29648485SPeter.Memishian@Sun.COM 	 * grouping identical addresses together on the hash chain.  We do
29658485SPeter.Memishian@Sun.COM 	 * this only for IRE_BROADCASTs as ip_wput_ire is currently interested
29668485SPeter.Memishian@Sun.COM 	 * in such groupings only for broadcasts.
29670Sstevel@tonic-gate 	 *
29680Sstevel@tonic-gate 	 * Find the first entry that matches ire_addr. *irep will be null
29690Sstevel@tonic-gate 	 * if no match.
29704182Ssowmini 	 *
29714182Ssowmini 	 * Note: the loopback and non-loopback broadcast entries for an
29724182Ssowmini 	 * interface MUST be added before any MULTIRT entries.
29730Sstevel@tonic-gate 	 */
29740Sstevel@tonic-gate 	irep = (ire_t **)irb_ptr;
29750Sstevel@tonic-gate 	while ((ire1 = *irep) != NULL && ire->ire_addr != ire1->ire_addr)
29760Sstevel@tonic-gate 		irep = &ire1->ire_next;
29770Sstevel@tonic-gate 	if (ire->ire_type == IRE_BROADCAST && *irep != NULL) {
29780Sstevel@tonic-gate 		/*
29790Sstevel@tonic-gate 		 * We found some ire (i.e *irep) with a matching addr. We
29808485SPeter.Memishian@Sun.COM 		 * want to group ires with same addr.
29810Sstevel@tonic-gate 		 */
29824182Ssowmini 		for (;;) {
29830Sstevel@tonic-gate 			ire1 = *irep;
29840Sstevel@tonic-gate 			if ((ire1->ire_next == NULL) ||
29850Sstevel@tonic-gate 			    (ire1->ire_next->ire_addr != ire->ire_addr) ||
29860Sstevel@tonic-gate 			    (ire1->ire_type != IRE_BROADCAST) ||
29874182Ssowmini 			    (ire1->ire_flags & RTF_MULTIRT) ||
29888485SPeter.Memishian@Sun.COM 			    (ire1->ire_ipif->ipif_ill->ill_grp ==
29898485SPeter.Memishian@Sun.COM 			    ire->ire_ipif->ipif_ill->ill_grp))
29900Sstevel@tonic-gate 				break;
29910Sstevel@tonic-gate 			irep = &ire1->ire_next;
29920Sstevel@tonic-gate 		}
29930Sstevel@tonic-gate 		ASSERT(*irep != NULL);
29944182Ssowmini 		/*
29954182Ssowmini 		 * The ire will be added before *irep, so
29964182Ssowmini 		 * if irep is a MULTIRT ire, just break to
29974182Ssowmini 		 * ire insertion code.
29984182Ssowmini 		 */
29994182Ssowmini 		if (((*irep)->ire_flags & RTF_MULTIRT) != 0)
30004182Ssowmini 			goto insert_ire;
30014182Ssowmini 
30020Sstevel@tonic-gate 		irep = &((*irep)->ire_next);
30030Sstevel@tonic-gate 
30040Sstevel@tonic-gate 		/*
30050Sstevel@tonic-gate 		 * Either we have hit the end of the list or the address
30068485SPeter.Memishian@Sun.COM 		 * did not match.
30070Sstevel@tonic-gate 		 */
30080Sstevel@tonic-gate 		while (*irep != NULL) {
30090Sstevel@tonic-gate 			ire1 = *irep;
30100Sstevel@tonic-gate 			if ((ire1->ire_addr != ire->ire_addr) ||
30118485SPeter.Memishian@Sun.COM 			    (ire1->ire_type != IRE_BROADCAST))
30120Sstevel@tonic-gate 				break;
30138485SPeter.Memishian@Sun.COM 			if (ire1->ire_ipif == ire->ire_ipif) {
30140Sstevel@tonic-gate 				irep = &ire1->ire_next;
30150Sstevel@tonic-gate 				break;
30160Sstevel@tonic-gate 			}
30170Sstevel@tonic-gate 			irep = &ire1->ire_next;
30180Sstevel@tonic-gate 		}
30190Sstevel@tonic-gate 	} else if (*irep != NULL) {
30200Sstevel@tonic-gate 		/*
30210Sstevel@tonic-gate 		 * Find the last ire which matches ire_addr.
30220Sstevel@tonic-gate 		 * Needed to do tail insertion among entries with the same
30230Sstevel@tonic-gate 		 * ire_addr.
30240Sstevel@tonic-gate 		 */
30250Sstevel@tonic-gate 		while (ire->ire_addr == ire1->ire_addr) {
30260Sstevel@tonic-gate 			irep = &ire1->ire_next;
30270Sstevel@tonic-gate 			ire1 = *irep;
30280Sstevel@tonic-gate 			if (ire1 == NULL)
30290Sstevel@tonic-gate 				break;
30300Sstevel@tonic-gate 		}
30310Sstevel@tonic-gate 	}
30320Sstevel@tonic-gate 
30334182Ssowmini insert_ire:
30340Sstevel@tonic-gate 	/* Insert at *irep */
30350Sstevel@tonic-gate 	ire1 = *irep;
30360Sstevel@tonic-gate 	if (ire1 != NULL)
30370Sstevel@tonic-gate 		ire1->ire_ptpn = &ire->ire_next;
30380Sstevel@tonic-gate 	ire->ire_next = ire1;
30390Sstevel@tonic-gate 	/* Link the new one in. */
30400Sstevel@tonic-gate 	ire->ire_ptpn = irep;
30410Sstevel@tonic-gate 
30420Sstevel@tonic-gate 	/*
30430Sstevel@tonic-gate 	 * ire_walk routines de-reference ire_next without holding
30440Sstevel@tonic-gate 	 * a lock. Before we point to the new ire, we want to make
30450Sstevel@tonic-gate 	 * sure the store that sets the ire_next of the new ire
30460Sstevel@tonic-gate 	 * reaches global visibility, so that ire_walk routines
30470Sstevel@tonic-gate 	 * don't see a truncated list of ires i.e if the ire_next
30480Sstevel@tonic-gate 	 * of the new ire gets set after we do "*irep = ire" due
30490Sstevel@tonic-gate 	 * to re-ordering, the ire_walk thread will see a NULL
30500Sstevel@tonic-gate 	 * once it accesses the ire_next of the new ire.
30510Sstevel@tonic-gate 	 * membar_producer() makes sure that the following store
30520Sstevel@tonic-gate 	 * happens *after* all of the above stores.
30530Sstevel@tonic-gate 	 */
30540Sstevel@tonic-gate 	membar_producer();
30550Sstevel@tonic-gate 	*irep = ire;
30560Sstevel@tonic-gate 	ire->ire_bucket = irb_ptr;
30570Sstevel@tonic-gate 	/*
30580Sstevel@tonic-gate 	 * We return a bumped up IRE above. Keep it symmetrical
30590Sstevel@tonic-gate 	 * so that the callers will always have to release. This
30600Sstevel@tonic-gate 	 * helps the callers of this function because they continue
30610Sstevel@tonic-gate 	 * to use the IRE after adding and hence they don't have to
30620Sstevel@tonic-gate 	 * lookup again after we return the IRE.
30630Sstevel@tonic-gate 	 *
30640Sstevel@tonic-gate 	 * NOTE : We don't have to use atomics as this is appearing
30650Sstevel@tonic-gate 	 * in the list for the first time and no one else can bump
30660Sstevel@tonic-gate 	 * up the reference count on this yet.
30670Sstevel@tonic-gate 	 */
30680Sstevel@tonic-gate 	IRE_REFHOLD_LOCKED(ire);
30693448Sdh155122 	BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_inserted);
30702535Ssangeeta 
30710Sstevel@tonic-gate 	irb_ptr->irb_ire_cnt++;
30722535Ssangeeta 	if (irb_ptr->irb_marks & IRB_MARK_FTABLE)
30732535Ssangeeta 		irb_ptr->irb_nire++;
30742535Ssangeeta 
30750Sstevel@tonic-gate 	if (ire->ire_marks & IRE_MARK_TEMPORARY)
30760Sstevel@tonic-gate 		irb_ptr->irb_tmp_ire_cnt++;
30770Sstevel@tonic-gate 
30780Sstevel@tonic-gate 	if (ire->ire_ipif != NULL) {
30796255Ssowmini 		DTRACE_PROBE3(ipif__incr__cnt, (ipif_t *), ire->ire_ipif,
30806255Ssowmini 		    (char *), "ire", (void *), ire);
30816379Ssowmini 		ire->ire_ipif->ipif_ire_cnt++;
30820Sstevel@tonic-gate 		if (ire->ire_stq != NULL) {
30830Sstevel@tonic-gate 			stq_ill = (ill_t *)ire->ire_stq->q_ptr;
30846255Ssowmini 			DTRACE_PROBE3(ill__incr__cnt, (ill_t *), stq_ill,
30856255Ssowmini 			    (char *), "ire", (void *), ire);
30866379Ssowmini 			stq_ill->ill_ire_cnt++;
30870Sstevel@tonic-gate 		}
30880Sstevel@tonic-gate 	} else {
30890Sstevel@tonic-gate 		ASSERT(ire->ire_stq == NULL);
30900Sstevel@tonic-gate 	}
30910Sstevel@tonic-gate 
30920Sstevel@tonic-gate 	ire_atomic_end(irb_ptr, ire);
30933448Sdh155122 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
30940Sstevel@tonic-gate 
30950Sstevel@tonic-gate 	if (pire != NULL) {
30960Sstevel@tonic-gate 		/* Assert that it is not removed from the list yet */
30970Sstevel@tonic-gate 		ASSERT(pire->ire_ptpn != NULL);
30980Sstevel@tonic-gate 		IRB_REFRELE(pire->ire_bucket);
30990Sstevel@tonic-gate 		ire_refrele(pire);
31000Sstevel@tonic-gate 	}
31010Sstevel@tonic-gate 
31020Sstevel@tonic-gate 	if (ire->ire_type != IRE_CACHE) {
31030Sstevel@tonic-gate 		/*
31042535Ssangeeta 		 * For ire's with host mask see if there is an entry
31050Sstevel@tonic-gate 		 * in the cache. If there is one flush the whole cache as
31060Sstevel@tonic-gate 		 * there might be multiple entries due to RTF_MULTIRT (CGTP).
31070Sstevel@tonic-gate 		 * If no entry is found than there is no need to flush the
31080Sstevel@tonic-gate 		 * cache.
31090Sstevel@tonic-gate 		 */
31100Sstevel@tonic-gate 		if (ire->ire_mask == IP_HOST_MASK) {
31110Sstevel@tonic-gate 			ire_t *lire;
31120Sstevel@tonic-gate 			lire = ire_ctable_lookup(ire->ire_addr, NULL, IRE_CACHE,
31133448Sdh155122 			    NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
31140Sstevel@tonic-gate 			if (lire != NULL) {
31150Sstevel@tonic-gate 				ire_refrele(lire);
31160Sstevel@tonic-gate 				ire_flush_cache_v4(ire, IRE_FLUSH_ADD);
31170Sstevel@tonic-gate 			}
31180Sstevel@tonic-gate 		} else {
31190Sstevel@tonic-gate 			ire_flush_cache_v4(ire, IRE_FLUSH_ADD);
31200Sstevel@tonic-gate 		}
31210Sstevel@tonic-gate 	}
31220Sstevel@tonic-gate 	/*
31230Sstevel@tonic-gate 	 * We had to delay the fast path probe until the ire is inserted
31240Sstevel@tonic-gate 	 * in the list. Otherwise the fast path ack won't find the ire in
31250Sstevel@tonic-gate 	 * the table.
31260Sstevel@tonic-gate 	 */
31273425Ssowmini 	if (ire->ire_type == IRE_CACHE ||
31283425Ssowmini 	    (ire->ire_type == IRE_BROADCAST && ire->ire_stq != NULL)) {
31293425Ssowmini 		ASSERT(ire->ire_nce != NULL);
31304714Ssowmini 		if (ire->ire_nce->nce_state == ND_REACHABLE)
31314714Ssowmini 			nce_fastpath(ire->ire_nce);
31323425Ssowmini 	}
31330Sstevel@tonic-gate 	if (ire->ire_ipif != NULL)
31340Sstevel@tonic-gate 		ASSERT(!MUTEX_HELD(&ire->ire_ipif->ipif_ill->ill_lock));
31350Sstevel@tonic-gate 	*ire_p = ire;
31362535Ssangeeta 	if (need_refrele) {
31372535Ssangeeta 		IRB_REFRELE(irb_ptr);
31382535Ssangeeta 	}
31390Sstevel@tonic-gate 	return (0);
31400Sstevel@tonic-gate }
31410Sstevel@tonic-gate 
31420Sstevel@tonic-gate /*
31430Sstevel@tonic-gate  * IRB_REFRELE is the only caller of the function. ire_unlink calls to
31440Sstevel@tonic-gate  * do the final cleanup for this ire.
31450Sstevel@tonic-gate  */
31460Sstevel@tonic-gate void
31470Sstevel@tonic-gate ire_cleanup(ire_t *ire)
31480Sstevel@tonic-gate {
31490Sstevel@tonic-gate 	ire_t *ire_next;
31503448Sdh155122 	ip_stack_t *ipst = ire->ire_ipst;
31510Sstevel@tonic-gate 
31520Sstevel@tonic-gate 	ASSERT(ire != NULL);
31530Sstevel@tonic-gate 
31540Sstevel@tonic-gate 	while (ire != NULL) {
31550Sstevel@tonic-gate 		ire_next = ire->ire_next;
31560Sstevel@tonic-gate 		if (ire->ire_ipversion == IPV4_VERSION) {
31570Sstevel@tonic-gate 			ire_delete_v4(ire);
31583448Sdh155122 			BUMP_IRE_STATS(ipst->ips_ire_stats_v4,
31593448Sdh155122 			    ire_stats_deleted);
31600Sstevel@tonic-gate 		} else {
31610Sstevel@tonic-gate 			ASSERT(ire->ire_ipversion == IPV6_VERSION);
31620Sstevel@tonic-gate 			ire_delete_v6(ire);
31633448Sdh155122 			BUMP_IRE_STATS(ipst->ips_ire_stats_v6,
31643448Sdh155122 			    ire_stats_deleted);
31650Sstevel@tonic-gate 		}
31660Sstevel@tonic-gate 		/*
31670Sstevel@tonic-gate 		 * Now it's really out of the list. Before doing the
31680Sstevel@tonic-gate 		 * REFRELE, set ire_next to NULL as ire_inactive asserts
31690Sstevel@tonic-gate 		 * so.
31700Sstevel@tonic-gate 		 */
31710Sstevel@tonic-gate 		ire->ire_next = NULL;
31720Sstevel@tonic-gate 		IRE_REFRELE_NOTR(ire);
31730Sstevel@tonic-gate 		ire = ire_next;
31740Sstevel@tonic-gate 	}
31750Sstevel@tonic-gate }
31760Sstevel@tonic-gate 
31770Sstevel@tonic-gate /*
31780Sstevel@tonic-gate  * IRB_REFRELE is the only caller of the function. It calls to unlink
31790Sstevel@tonic-gate  * all the CONDEMNED ires from this bucket.
31800Sstevel@tonic-gate  */
31810Sstevel@tonic-gate ire_t *
31820Sstevel@tonic-gate ire_unlink(irb_t *irb)
31830Sstevel@tonic-gate {
31840Sstevel@tonic-gate 	ire_t *ire;
31850Sstevel@tonic-gate 	ire_t *ire1;
31860Sstevel@tonic-gate 	ire_t **ptpn;
31870Sstevel@tonic-gate 	ire_t *ire_list = NULL;
31880Sstevel@tonic-gate 
31890Sstevel@tonic-gate 	ASSERT(RW_WRITE_HELD(&irb->irb_lock));
31902535Ssangeeta 	ASSERT(((irb->irb_marks & IRB_MARK_FTABLE) && irb->irb_refcnt == 1) ||
31912535Ssangeeta 	    (irb->irb_refcnt == 0));
31922535Ssangeeta 	ASSERT(irb->irb_marks & IRB_MARK_CONDEMNED);
31930Sstevel@tonic-gate 	ASSERT(irb->irb_ire != NULL);
31940Sstevel@tonic-gate 
31950Sstevel@tonic-gate 	for (ire = irb->irb_ire; ire != NULL; ire = ire1) {
31963448Sdh155122 		ip_stack_t	*ipst = ire->ire_ipst;
31973448Sdh155122 
31980Sstevel@tonic-gate 		ire1 = ire->ire_next;
31990Sstevel@tonic-gate 		if (ire->ire_marks & IRE_MARK_CONDEMNED) {
32000Sstevel@tonic-gate 			ptpn = ire->ire_ptpn;
32010Sstevel@tonic-gate 			ire1 = ire->ire_next;
32020Sstevel@tonic-gate 			if (ire1)
32030Sstevel@tonic-gate 				ire1->ire_ptpn = ptpn;
32040Sstevel@tonic-gate 			*ptpn = ire1;
32050Sstevel@tonic-gate 			ire->ire_ptpn = NULL;
32060Sstevel@tonic-gate 			ire->ire_next = NULL;
32070Sstevel@tonic-gate 			if (ire->ire_type == IRE_DEFAULT) {
32080Sstevel@tonic-gate 				/*
32090Sstevel@tonic-gate 				 * IRE is out of the list. We need to adjust
32100Sstevel@tonic-gate 				 * the accounting before the caller drops
32110Sstevel@tonic-gate 				 * the lock.
32120Sstevel@tonic-gate 				 */
32130Sstevel@tonic-gate 				if (ire->ire_ipversion == IPV6_VERSION) {
32143448Sdh155122 					ASSERT(ipst->
32153448Sdh155122 					    ips_ipv6_ire_default_count !=
32163448Sdh155122 					    0);
32173448Sdh155122 					ipst->ips_ipv6_ire_default_count--;
32180Sstevel@tonic-gate 				}
32190Sstevel@tonic-gate 			}
32200Sstevel@tonic-gate 			/*
32210Sstevel@tonic-gate 			 * We need to call ire_delete_v4 or ire_delete_v6
32220Sstevel@tonic-gate 			 * to clean up the cache or the redirects pointing at
32230Sstevel@tonic-gate 			 * the default gateway. We need to drop the lock
32240Sstevel@tonic-gate 			 * as ire_flush_cache/ire_delete_host_redircts require
32250Sstevel@tonic-gate 			 * so. But we can't drop the lock, as ire_unlink needs
32260Sstevel@tonic-gate 			 * to atomically remove the ires from the list.
32270Sstevel@tonic-gate 			 * So, create a temporary list of CONDEMNED ires
32280Sstevel@tonic-gate 			 * for doing ire_delete_v4/ire_delete_v6 operations
32290Sstevel@tonic-gate 			 * later on.
32300Sstevel@tonic-gate 			 */
32310Sstevel@tonic-gate 			ire->ire_next = ire_list;
32320Sstevel@tonic-gate 			ire_list = ire;
32330Sstevel@tonic-gate 		}
32340Sstevel@tonic-gate 	}
32352535Ssangeeta 	irb->irb_marks &= ~IRB_MARK_CONDEMNED;
32360Sstevel@tonic-gate 	return (ire_list);
32370Sstevel@tonic-gate }
32380Sstevel@tonic-gate 
32390Sstevel@tonic-gate /*
32400Sstevel@tonic-gate  * Delete all the cache entries with this 'addr'.  When IP gets a gratuitous
32412535Ssangeeta  * ARP message on any of its interface queue, it scans the nce table and
32422535Ssangeeta  * deletes and calls ndp_delete() for the appropriate nce. This action
32432535Ssangeeta  * also deletes all the neighbor/ire cache entries for that address.
32442535Ssangeeta  * This function is called from ip_arp_news in ip.c and also for
32452535Ssangeeta  * ARP ioctl processing in ip_if.c. ip_ire_clookup_and_delete returns
32462535Ssangeeta  * true if it finds a nce entry which is used by ip_arp_news to determine if
32472535Ssangeeta  * it needs to do an ire_walk_v4. The return value is also  used for the
32482535Ssangeeta  * same purpose by ARP IOCTL processing * in ip_if.c when deleting
32492535Ssangeeta  * ARP entries. For SIOC*IFARP ioctls in addition to the address,
32502535Ssangeeta  * ip_if->ipif_ill also needs to be matched.
32510Sstevel@tonic-gate  */
32520Sstevel@tonic-gate boolean_t
32533448Sdh155122 ip_ire_clookup_and_delete(ipaddr_t addr, ipif_t *ipif, ip_stack_t *ipst)
32540Sstevel@tonic-gate {
32552535Ssangeeta 	ill_t	*ill;
32562535Ssangeeta 	nce_t	*nce;
32572535Ssangeeta 
32582535Ssangeeta 	ill = (ipif ? ipif->ipif_ill : NULL);
32592535Ssangeeta 
32602535Ssangeeta 	if (ill != NULL) {
32612535Ssangeeta 		/*
32622535Ssangeeta 		 * clean up the nce (and any relevant ire's) that matches
32632535Ssangeeta 		 * on addr and ill.
32642535Ssangeeta 		 */
32652535Ssangeeta 		nce = ndp_lookup_v4(ill, &addr, B_FALSE);
32662535Ssangeeta 		if (nce != NULL) {
32672535Ssangeeta 			ndp_delete(nce);
32682535Ssangeeta 			return (B_TRUE);
32692535Ssangeeta 		}
32702535Ssangeeta 	} else {
32712535Ssangeeta 		/*
32722535Ssangeeta 		 * ill is wildcard. clean up all nce's and
32732535Ssangeeta 		 * ire's that match on addr
32742535Ssangeeta 		 */
32752535Ssangeeta 		nce_clookup_t cl;
32762535Ssangeeta 
32772535Ssangeeta 		cl.ncecl_addr = addr;
32782535Ssangeeta 		cl.ncecl_found = B_FALSE;
32792535Ssangeeta 
32803448Sdh155122 		ndp_walk_common(ipst->ips_ndp4, NULL,
32812535Ssangeeta 		    (pfi_t)ip_nce_clookup_and_delete, (uchar_t *)&cl, B_TRUE);
32822535Ssangeeta 
32832535Ssangeeta 		/*
32842535Ssangeeta 		 *  ncecl_found would be set by ip_nce_clookup_and_delete if
32852535Ssangeeta 		 *  we found a matching nce.
32862535Ssangeeta 		 */
32872535Ssangeeta 		return (cl.ncecl_found);
32882535Ssangeeta 	}
32892535Ssangeeta 	return (B_FALSE);
32902535Ssangeeta 
32912535Ssangeeta }
32922535Ssangeeta 
32932535Ssangeeta /* Delete the supplied nce if its nce_addr matches the supplied address */
32942535Ssangeeta static void
32952535Ssangeeta ip_nce_clookup_and_delete(nce_t *nce, void *arg)
32962535Ssangeeta {
32972535Ssangeeta 	nce_clookup_t *cl = (nce_clookup_t *)arg;
32982535Ssangeeta 	ipaddr_t nce_addr;
32992535Ssangeeta 
33002535Ssangeeta 	IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr);
33012535Ssangeeta 	if (nce_addr == cl->ncecl_addr) {
33022535Ssangeeta 		cl->ncecl_found = B_TRUE;
33032535Ssangeeta 		/* clean up the nce (and any relevant ire's) */
33042535Ssangeeta 		ndp_delete(nce);
33052535Ssangeeta 	}
33062535Ssangeeta }
33072535Ssangeeta 
33082535Ssangeeta /*
33092535Ssangeeta  * Clean up the radix node for this ire. Must be called by IRB_REFRELE
33102535Ssangeeta  * when there are no ire's left in the bucket. Returns TRUE if the bucket
33112535Ssangeeta  * is deleted and freed.
33122535Ssangeeta  */
33132535Ssangeeta boolean_t
33142535Ssangeeta irb_inactive(irb_t *irb)
33152535Ssangeeta {
33162535Ssangeeta 	struct rt_entry *rt;
33172535Ssangeeta 	struct radix_node *rn;
33183448Sdh155122 	ip_stack_t *ipst = irb->irb_ipst;
33193448Sdh155122 
33203448Sdh155122 	ASSERT(irb->irb_ipst != NULL);
33212535Ssangeeta 
33222535Ssangeeta 	rt = IRB2RT(irb);
33232535Ssangeeta 	rn = (struct radix_node *)rt;
33242535Ssangeeta 
33252535Ssangeeta 	/* first remove it from the radix tree. */
33263448Sdh155122 	RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable);
33272535Ssangeeta 	rw_enter(&irb->irb_lock, RW_WRITER);
33282535Ssangeeta 	if (irb->irb_refcnt == 1 && irb->irb_nire == 0) {
33293448Sdh155122 		rn = ipst->ips_ip_ftable->rnh_deladdr(rn->rn_key, rn->rn_mask,
33303448Sdh155122 		    ipst->ips_ip_ftable);
33312535Ssangeeta 		DTRACE_PROBE1(irb__free, rt_t *,  rt);
33322535Ssangeeta 		ASSERT((void *)rn == (void *)rt);
33332535Ssangeeta 		Free(rt, rt_entry_cache);
33342535Ssangeeta 		/* irb_lock is freed */
33353448Sdh155122 		RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
33362535Ssangeeta 		return (B_TRUE);
33372535Ssangeeta 	}
33382535Ssangeeta 	rw_exit(&irb->irb_lock);
33393448Sdh155122 	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
33402535Ssangeeta 	return (B_FALSE);
33410Sstevel@tonic-gate }
33420Sstevel@tonic-gate 
33430Sstevel@tonic-gate /*
33440Sstevel@tonic-gate  * Delete the specified IRE.
33450Sstevel@tonic-gate  */
33460Sstevel@tonic-gate void
33470Sstevel@tonic-gate ire_delete(ire_t *ire)
33480Sstevel@tonic-gate {
33490Sstevel@tonic-gate 	ire_t	*ire1;
33500Sstevel@tonic-gate 	ire_t	**ptpn;
33510Sstevel@tonic-gate 	irb_t *irb;
33523448Sdh155122 	ip_stack_t	*ipst = ire->ire_ipst;
33530Sstevel@tonic-gate 
33540Sstevel@tonic-gate 	if ((irb = ire->ire_bucket) == NULL) {
33552535Ssangeeta 		/*
33562535Ssangeeta 		 * It was never inserted in the list. Should call REFRELE
33572535Ssangeeta 		 * to free this IRE.
33582535Ssangeeta 		 */
33590Sstevel@tonic-gate 		IRE_REFRELE_NOTR(ire);
33600Sstevel@tonic-gate 		return;
33610Sstevel@tonic-gate 	}
33620Sstevel@tonic-gate 
33630Sstevel@tonic-gate 	rw_enter(&irb->irb_lock, RW_WRITER);
33640Sstevel@tonic-gate 
33652535Ssangeeta 	if (irb->irb_rr_origin == ire) {
33662535Ssangeeta 		irb->irb_rr_origin = NULL;
33672535Ssangeeta 	}
33682535Ssangeeta 
33690Sstevel@tonic-gate 	/*
33700Sstevel@tonic-gate 	 * In case of V4 we might still be waiting for fastpath ack.
33710Sstevel@tonic-gate 	 */
33723425Ssowmini 	if (ire->ire_ipversion == IPV4_VERSION &&
33733425Ssowmini 	    (ire->ire_type == IRE_CACHE ||
33743425Ssowmini 	    (ire->ire_type == IRE_BROADCAST && ire->ire_stq != NULL))) {
33753425Ssowmini 		ASSERT(ire->ire_nce != NULL);
33763425Ssowmini 		nce_fastpath_list_delete(ire->ire_nce);
33770Sstevel@tonic-gate 	}
33780Sstevel@tonic-gate 
33790Sstevel@tonic-gate 	if (ire->ire_ptpn == NULL) {
33800Sstevel@tonic-gate 		/*
33810Sstevel@tonic-gate 		 * Some other thread has removed us from the list.
33820Sstevel@tonic-gate 		 * It should have done the REFRELE for us.
33830Sstevel@tonic-gate 		 */
33840Sstevel@tonic-gate 		rw_exit(&irb->irb_lock);
33850Sstevel@tonic-gate 		return;
33860Sstevel@tonic-gate 	}
33870Sstevel@tonic-gate 
33885388Sja97890 	if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) {
33895388Sja97890 		irb->irb_ire_cnt--;
33905388Sja97890 		ire->ire_marks |= IRE_MARK_CONDEMNED;
33915388Sja97890 		if (ire->ire_marks & IRE_MARK_TEMPORARY) {
33925388Sja97890 			irb->irb_tmp_ire_cnt--;
33935388Sja97890 			ire->ire_marks &= ~IRE_MARK_TEMPORARY;
33945388Sja97890 		}
33955388Sja97890 	}
33965388Sja97890 
33970Sstevel@tonic-gate 	if (irb->irb_refcnt != 0) {
33980Sstevel@tonic-gate 		/*
33990Sstevel@tonic-gate 		 * The last thread to leave this bucket will
34000Sstevel@tonic-gate 		 * delete this ire.
34010Sstevel@tonic-gate 		 */
34022535Ssangeeta 		irb->irb_marks |= IRB_MARK_CONDEMNED;
34030Sstevel@tonic-gate 		rw_exit(&irb->irb_lock);
34040Sstevel@tonic-gate 		return;
34050Sstevel@tonic-gate 	}
34060Sstevel@tonic-gate 
34070Sstevel@tonic-gate 	/*
34080Sstevel@tonic-gate 	 * Normally to delete an ire, we walk the bucket. While we
34090Sstevel@tonic-gate 	 * walk the bucket, we normally bump up irb_refcnt and hence
34100Sstevel@tonic-gate 	 * we return from above where we mark CONDEMNED and the ire
34110Sstevel@tonic-gate 	 * gets deleted from ire_unlink. This case is where somebody
34120Sstevel@tonic-gate 	 * knows the ire e.g by doing a lookup, and wants to delete the
34130Sstevel@tonic-gate 	 * IRE. irb_refcnt would be 0 in this case if nobody is walking
34140Sstevel@tonic-gate 	 * the bucket.
34150Sstevel@tonic-gate 	 */
34160Sstevel@tonic-gate 	ptpn = ire->ire_ptpn;
34170Sstevel@tonic-gate 	ire1 = ire->ire_next;
34180Sstevel@tonic-gate 	if (ire1 != NULL)
34190Sstevel@tonic-gate 		ire1->ire_ptpn = ptpn;
34200Sstevel@tonic-gate 	ASSERT(ptpn != NULL);
34210Sstevel@tonic-gate 	*ptpn = ire1;
34220Sstevel@tonic-gate 	ire->ire_ptpn = NULL;
34230Sstevel@tonic-gate 	ire->ire_next = NULL;
34240Sstevel@tonic-gate 	if (ire->ire_ipversion == IPV6_VERSION) {
34253448Sdh155122 		BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_deleted);
34260Sstevel@tonic-gate 	} else {
34273448Sdh155122 		BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_deleted);
34280Sstevel@tonic-gate 	}
34290Sstevel@tonic-gate 	/*
34300Sstevel@tonic-gate 	 * ip_wput/ip_wput_v6 checks this flag to see whether
34310Sstevel@tonic-gate 	 * it should still use the cached ire or not.
34320Sstevel@tonic-gate 	 */
34330Sstevel@tonic-gate 	if (ire->ire_type == IRE_DEFAULT) {
34340Sstevel@tonic-gate 		/*
34350Sstevel@tonic-gate 		 * IRE is out of the list. We need to adjust the
34360Sstevel@tonic-gate 		 * accounting before we drop the lock.
34370Sstevel@tonic-gate 		 */
34380Sstevel@tonic-gate 		if (ire->ire_ipversion == IPV6_VERSION) {
34393448Sdh155122 			ASSERT(ipst->ips_ipv6_ire_default_count != 0);
34403448Sdh155122 			ipst->ips_ipv6_ire_default_count--;
34410Sstevel@tonic-gate 		}
34420Sstevel@tonic-gate 	}
34430Sstevel@tonic-gate 	rw_exit(&irb->irb_lock);
34440Sstevel@tonic-gate 
34450Sstevel@tonic-gate 	if (ire->ire_ipversion == IPV6_VERSION) {
34460Sstevel@tonic-gate 		ire_delete_v6(ire);
34470Sstevel@tonic-gate 	} else {
34480Sstevel@tonic-gate 		ire_delete_v4(ire);
34490Sstevel@tonic-gate 	}
34500Sstevel@tonic-gate 	/*
34510Sstevel@tonic-gate 	 * We removed it from the list. Decrement the
34520Sstevel@tonic-gate 	 * reference count.
34530Sstevel@tonic-gate 	 */
34540Sstevel@tonic-gate 	IRE_REFRELE_NOTR(ire);
34550Sstevel@tonic-gate }
34560Sstevel@tonic-gate 
34570Sstevel@tonic-gate /*
34580Sstevel@tonic-gate  * Delete the specified IRE.
34590Sstevel@tonic-gate  * All calls should use ire_delete().
34600Sstevel@tonic-gate  * Sometimes called as writer though not required by this function.
34610Sstevel@tonic-gate  *
34620Sstevel@tonic-gate  * NOTE : This function is called only if the ire was added
34630Sstevel@tonic-gate  * in the list.
34640Sstevel@tonic-gate  */
34650Sstevel@tonic-gate static void
34660Sstevel@tonic-gate ire_delete_v4(ire_t *ire)
34670Sstevel@tonic-gate {
34683448Sdh155122 	ip_stack_t	*ipst = ire->ire_ipst;
34693448Sdh155122 
34700Sstevel@tonic-gate 	ASSERT(ire->ire_refcnt >= 1);
34710Sstevel@tonic-gate 	ASSERT(ire->ire_ipversion == IPV4_VERSION);
34720Sstevel@tonic-gate 
34730Sstevel@tonic-gate 	if (ire->ire_type != IRE_CACHE)
34740Sstevel@tonic-gate 		ire_flush_cache_v4(ire, IRE_FLUSH_DELETE);
34750Sstevel@tonic-gate 	if (ire->ire_type == IRE_DEFAULT) {
34760Sstevel@tonic-gate 		/*
34770Sstevel@tonic-gate 		 * when a default gateway is going away
34780Sstevel@tonic-gate 		 * delete all the host redirects pointing at that
34790Sstevel@tonic-gate 		 * gateway.
34800Sstevel@tonic-gate 		 */
34813448Sdh155122 		ire_delete_host_redirects(ire->ire_gateway_addr, ipst);
34820Sstevel@tonic-gate 	}
34830Sstevel@tonic-gate }
34840Sstevel@tonic-gate 
34850Sstevel@tonic-gate /*
34860Sstevel@tonic-gate  * IRE_REFRELE/ire_refrele are the only caller of the function. It calls
34870Sstevel@tonic-gate  * to free the ire when the reference count goes to zero.
34880Sstevel@tonic-gate  */
34890Sstevel@tonic-gate void
34900Sstevel@tonic-gate ire_inactive(ire_t *ire)
34910Sstevel@tonic-gate {
34920Sstevel@tonic-gate 	nce_t	*nce;
34930Sstevel@tonic-gate 	ill_t	*ill = NULL;
34940Sstevel@tonic-gate 	ill_t	*stq_ill = NULL;
34950Sstevel@tonic-gate 	ipif_t	*ipif;
34960Sstevel@tonic-gate 	boolean_t	need_wakeup = B_FALSE;
34972535Ssangeeta 	irb_t 	*irb;
34983448Sdh155122 	ip_stack_t	*ipst = ire->ire_ipst;
34990Sstevel@tonic-gate 
35000Sstevel@tonic-gate 	ASSERT(ire->ire_refcnt == 0);
35010Sstevel@tonic-gate 	ASSERT(ire->ire_ptpn == NULL);
35020Sstevel@tonic-gate 	ASSERT(ire->ire_next == NULL);
35030Sstevel@tonic-gate 
35042535Ssangeeta 	if (ire->ire_gw_secattr != NULL) {
35052535Ssangeeta 		ire_gw_secattr_free(ire->ire_gw_secattr);
35062535Ssangeeta 		ire->ire_gw_secattr = NULL;
35072535Ssangeeta 	}
35082535Ssangeeta 
35092535Ssangeeta 	if (ire->ire_mp != NULL) {
35102535Ssangeeta 		ASSERT(ire->ire_bucket == NULL);
35112535Ssangeeta 		mutex_destroy(&ire->ire_lock);
35123448Sdh155122 		BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_freed);
35132535Ssangeeta 		if (ire->ire_nce != NULL)
35142535Ssangeeta 			NCE_REFRELE_NOTR(ire->ire_nce);
35152535Ssangeeta 		freeb(ire->ire_mp);
35162535Ssangeeta 		return;
35172535Ssangeeta 	}
35182535Ssangeeta 
35190Sstevel@tonic-gate 	if ((nce = ire->ire_nce) != NULL) {
35200Sstevel@tonic-gate 		NCE_REFRELE_NOTR(nce);
35210Sstevel@tonic-gate 		ire->ire_nce = NULL;
35220Sstevel@tonic-gate 	}
35232535Ssangeeta 
35240Sstevel@tonic-gate 	if (ire->ire_ipif == NULL)
35250Sstevel@tonic-gate 		goto end;
35260Sstevel@tonic-gate 
35270Sstevel@tonic-gate 	ipif = ire->ire_ipif;
35280Sstevel@tonic-gate 	ill = ipif->ipif_ill;
35290Sstevel@tonic-gate 
35300Sstevel@tonic-gate 	if (ire->ire_bucket == NULL) {
35310Sstevel@tonic-gate 		/* The ire was never inserted in the table. */
35320Sstevel@tonic-gate 		goto end;
35330Sstevel@tonic-gate 	}
35340Sstevel@tonic-gate 
35350Sstevel@tonic-gate 	/*
35366379Ssowmini 	 * ipif_ire_cnt on this ipif goes down by 1. If the ire_stq is
35374823Sseb 	 * non-null ill_ire_count also goes down by 1.
35380Sstevel@tonic-gate 	 *
35390Sstevel@tonic-gate 	 * The ipif that is associated with an ire is ire->ire_ipif and
35406379Ssowmini 	 * hence when the ire->ire_ipif->ipif_ire_cnt drops to zero we call
35410Sstevel@tonic-gate 	 * ipif_ill_refrele_tail. Usually stq_ill is null or the same as
35428485SPeter.Memishian@Sun.COM 	 * ire->ire_ipif->ipif_ill. So nothing more needs to be done.
35438485SPeter.Memishian@Sun.COM 	 * However, for VNI or IPMP IRE entries, stq_ill can be different.
35448485SPeter.Memishian@Sun.COM 	 * If this is different from ire->ire_ipif->ipif_ill and if the
35458485SPeter.Memishian@Sun.COM 	 * ill_ire_cnt on the stq_ill also has dropped to zero, we call
35464823Sseb 	 * ipif_ill_refrele_tail on the stq_ill.
35470Sstevel@tonic-gate 	 */
35480Sstevel@tonic-gate 	if (ire->ire_stq != NULL)
35498485SPeter.Memishian@Sun.COM 		stq_ill = ire->ire_stq->q_ptr;
35504823Sseb 
35514823Sseb 	if (stq_ill == NULL || stq_ill == ill) {
35520Sstevel@tonic-gate 		/* Optimize the most common case */
35530Sstevel@tonic-gate 		mutex_enter(&ill->ill_lock);
35546379Ssowmini 		ASSERT(ipif->ipif_ire_cnt != 0);
35556255Ssowmini 		DTRACE_PROBE3(ipif__decr__cnt, (ipif_t *), ipif,
35566255Ssowmini 		    (char *), "ire", (void *), ire);
35576379Ssowmini 		ipif->ipif_ire_cnt--;
35586255Ssowmini 		if (IPIF_DOWN_OK(ipif))
35590Sstevel@tonic-gate 			need_wakeup = B_TRUE;
35600Sstevel@tonic-gate 		if (stq_ill != NULL) {
35616379Ssowmini 			ASSERT(stq_ill->ill_ire_cnt != 0);
35626255Ssowmini 			DTRACE_PROBE3(ill__decr__cnt, (ill_t *), stq_ill,
35636255Ssowmini 			    (char *), "ire", (void *), ire);
35646379Ssowmini 			stq_ill->ill_ire_cnt--;
35656255Ssowmini 			if (ILL_DOWN_OK(stq_ill))
35660Sstevel@tonic-gate 				need_wakeup = B_TRUE;
35670Sstevel@tonic-gate 		}
35680Sstevel@tonic-gate 		if (need_wakeup) {
35690Sstevel@tonic-gate 			/* Drops the ill lock */
35700Sstevel@tonic-gate 			ipif_ill_refrele_tail(ill);
35710Sstevel@tonic-gate 		} else {
35720Sstevel@tonic-gate 			mutex_exit(&ill->ill_lock);
35730Sstevel@tonic-gate 		}
35740Sstevel@tonic-gate 	} else {
35750Sstevel@tonic-gate 		/*
35760Sstevel@tonic-gate 		 * We can't grab all the ill locks at the same time.
35770Sstevel@tonic-gate 		 * It can lead to recursive lock enter in the call to
35780Sstevel@tonic-gate 		 * ipif_ill_refrele_tail and later. Instead do it 1 at
35790Sstevel@tonic-gate 		 * a time.
35800Sstevel@tonic-gate 		 */
35810Sstevel@tonic-gate 		mutex_enter(&ill->ill_lock);
35826379Ssowmini 		ASSERT(ipif->ipif_ire_cnt != 0);
35836255Ssowmini 		DTRACE_PROBE3(ipif__decr__cnt, (ipif_t *), ipif,
35846255Ssowmini 		    (char *), "ire", (void *), ire);
35856379Ssowmini 		ipif->ipif_ire_cnt--;
35866255Ssowmini 		if (IPIF_DOWN_OK(ipif)) {
35870Sstevel@tonic-gate 			/* Drops the lock */
35880Sstevel@tonic-gate 			ipif_ill_refrele_tail(ill);
35890Sstevel@tonic-gate 		} else {
35900Sstevel@tonic-gate 			mutex_exit(&ill->ill_lock);
35910Sstevel@tonic-gate 		}
35920Sstevel@tonic-gate 		if (stq_ill != NULL) {
35930Sstevel@tonic-gate 			mutex_enter(&stq_ill->ill_lock);
35946379Ssowmini 			ASSERT(stq_ill->ill_ire_cnt != 0);
35956255Ssowmini 			DTRACE_PROBE3(ill__decr__cnt, (ill_t *), stq_ill,
35966255Ssowmini 			    (char *), "ire", (void *), ire);
35976379Ssowmini 			stq_ill->ill_ire_cnt--;
35986255Ssowmini 			if (ILL_DOWN_OK(stq_ill)) {
35990Sstevel@tonic-gate 				/* Drops the ill lock */
36000Sstevel@tonic-gate 				ipif_ill_refrele_tail(stq_ill);
36010Sstevel@tonic-gate 			} else {
36020Sstevel@tonic-gate 				mutex_exit(&stq_ill->ill_lock);
36030Sstevel@tonic-gate 			}
36040Sstevel@tonic-gate 		}
36050Sstevel@tonic-gate 	}
36060Sstevel@tonic-gate end:
36070Sstevel@tonic-gate 	/* This should be true for both V4 and V6 */
36080Sstevel@tonic-gate 
36092535Ssangeeta 	if ((ire->ire_type & IRE_FORWARDTABLE) &&
36102535Ssangeeta 	    (ire->ire_ipversion == IPV4_VERSION) &&
36112535Ssangeeta 	    ((irb = ire->ire_bucket) != NULL)) {
36122535Ssangeeta 		rw_enter(&irb->irb_lock, RW_WRITER);
36132535Ssangeeta 		irb->irb_nire--;
36142535Ssangeeta 		/*
36152535Ssangeeta 		 * Instead of examining the conditions for freeing
36162535Ssangeeta 		 * the radix node here, we do it by calling
36172535Ssangeeta 		 * IRB_REFRELE which is a single point in the code
36182535Ssangeeta 		 * that embeds that logic. Bump up the refcnt to
36192535Ssangeeta 		 * be able to call IRB_REFRELE
36202535Ssangeeta 		 */
36212535Ssangeeta 		IRB_REFHOLD_LOCKED(irb);
36222535Ssangeeta 		rw_exit(&irb->irb_lock);
36232535Ssangeeta 		IRB_REFRELE(irb);
36242535Ssangeeta 	}
36250Sstevel@tonic-gate 	ire->ire_ipif = NULL;
36260Sstevel@tonic-gate 
36275023Scarlsonj #ifdef DEBUG
36285023Scarlsonj 	ire_trace_cleanup(ire);
36290Sstevel@tonic-gate #endif
36300Sstevel@tonic-gate 	mutex_destroy(&ire->ire_lock);
36310Sstevel@tonic-gate 	if (ire->ire_ipversion == IPV6_VERSION) {
36323448Sdh155122 		BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_freed);
36330Sstevel@tonic-gate 	} else {
36343448Sdh155122 		BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_freed);
36350Sstevel@tonic-gate 	}
36362535Ssangeeta 	ASSERT(ire->ire_mp == NULL);
36372535Ssangeeta 	/* Has been allocated out of the cache */
36382535Ssangeeta 	kmem_cache_free(ire_cache, ire);
36390Sstevel@tonic-gate }
36400Sstevel@tonic-gate 
36410Sstevel@tonic-gate /*
36423004Sdd193516  * ire_walk routine to delete all IRE_CACHE/IRE_HOST types redirect
36433004Sdd193516  * entries that have a given gateway address.
36440Sstevel@tonic-gate  */
36450Sstevel@tonic-gate void
36460Sstevel@tonic-gate ire_delete_cache_gw(ire_t *ire, char *cp)
36470Sstevel@tonic-gate {
36480Sstevel@tonic-gate 	ipaddr_t	gw_addr;
36490Sstevel@tonic-gate 
36503004Sdd193516 	if (!(ire->ire_type & IRE_CACHE) &&
36513004Sdd193516 	    !(ire->ire_flags & RTF_DYNAMIC))
36520Sstevel@tonic-gate 		return;
36530Sstevel@tonic-gate 
36540Sstevel@tonic-gate 	bcopy(cp, &gw_addr, sizeof (gw_addr));
36550Sstevel@tonic-gate 	if (ire->ire_gateway_addr == gw_addr) {
36560Sstevel@tonic-gate 		ip1dbg(("ire_delete_cache_gw: deleted 0x%x type %d to 0x%x\n",
36574714Ssowmini 		    (int)ntohl(ire->ire_addr), ire->ire_type,
36584714Ssowmini 		    (int)ntohl(ire->ire_gateway_addr)));
36590Sstevel@tonic-gate 		ire_delete(ire);
36600Sstevel@tonic-gate 	}
36610Sstevel@tonic-gate }
36620Sstevel@tonic-gate 
36630Sstevel@tonic-gate /*
36640Sstevel@tonic-gate  * Remove all IRE_CACHE entries that match the ire specified.
36650Sstevel@tonic-gate  *
36660Sstevel@tonic-gate  * The flag argument indicates if the flush request is due to addition
36670Sstevel@tonic-gate  * of new route (IRE_FLUSH_ADD) or deletion of old route (IRE_FLUSH_DELETE).
36680Sstevel@tonic-gate  *
36690Sstevel@tonic-gate  * This routine takes only the IREs from the forwarding table and flushes
36700Sstevel@tonic-gate  * the corresponding entries from the cache table.
36710Sstevel@tonic-gate  *
36720Sstevel@tonic-gate  * When flushing due to the deletion of an old route, it
36730Sstevel@tonic-gate  * just checks the cache handles (ire_phandle and ire_ihandle) and
36740Sstevel@tonic-gate  * deletes the ones that match.
36750Sstevel@tonic-gate  *
36760Sstevel@tonic-gate  * When flushing due to the creation of a new route, it checks
36770Sstevel@tonic-gate  * if a cache entry's address matches the one in the IRE and
36780Sstevel@tonic-gate  * that the cache entry's parent has a less specific mask than the
36790Sstevel@tonic-gate  * one in IRE. The destination of such a cache entry could be the
36800Sstevel@tonic-gate  * gateway for other cache entries, so we need to flush those as
36810Sstevel@tonic-gate  * well by looking for gateway addresses matching the IRE's address.
36820Sstevel@tonic-gate  */
36830Sstevel@tonic-gate void
36840Sstevel@tonic-gate ire_flush_cache_v4(ire_t *ire, int flag)
36850Sstevel@tonic-gate {
36860Sstevel@tonic-gate 	int i;
36870Sstevel@tonic-gate 	ire_t *cire;
36880Sstevel@tonic-gate 	irb_t *irb;
36893448Sdh155122 	ip_stack_t	*ipst = ire->ire_ipst;
36900Sstevel@tonic-gate 
36910Sstevel@tonic-gate 	if (ire->ire_type & IRE_CACHE)
36924714Ssowmini 		return;
36930Sstevel@tonic-gate 
36940Sstevel@tonic-gate 	/*
36950Sstevel@tonic-gate 	 * If a default is just created, there is no point
36960Sstevel@tonic-gate 	 * in going through the cache, as there will not be any
36970Sstevel@tonic-gate 	 * cached ires.
36980Sstevel@tonic-gate 	 */
36990Sstevel@tonic-gate 	if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD)
37000Sstevel@tonic-gate 		return;
37010Sstevel@tonic-gate 	if (flag == IRE_FLUSH_ADD) {
37020Sstevel@tonic-gate 		/*
37030Sstevel@tonic-gate 		 * This selective flush is due to the addition of
37040Sstevel@tonic-gate 		 * new IRE.
37050Sstevel@tonic-gate 		 */
37063448Sdh155122 		for (i = 0; i < ipst->ips_ip_cache_table_size; i++) {
37073448Sdh155122 			irb = &ipst->ips_ip_cache_table[i];
37080Sstevel@tonic-gate 			if ((cire = irb->irb_ire) == NULL)
37090Sstevel@tonic-gate 				continue;
37100Sstevel@tonic-gate 			IRB_REFHOLD(irb);
37110Sstevel@tonic-gate 			for (cire = irb->irb_ire; cire != NULL;
37120Sstevel@tonic-gate 			    cire = cire->ire_next) {
37130Sstevel@tonic-gate 				if (cire->ire_type != IRE_CACHE)
37140Sstevel@tonic-gate 					continue;
37150Sstevel@tonic-gate 				/*
37160Sstevel@tonic-gate 				 * If 'cire' belongs to the same subnet
37170Sstevel@tonic-gate 				 * as the new ire being added, and 'cire'
37180Sstevel@tonic-gate 				 * is derived from a prefix that is less
37190Sstevel@tonic-gate 				 * specific than the new ire being added,
37200Sstevel@tonic-gate 				 * we need to flush 'cire'; for instance,
37210Sstevel@tonic-gate 				 * when a new interface comes up.
37220Sstevel@tonic-gate 				 */
37230Sstevel@tonic-gate 				if (((cire->ire_addr & ire->ire_mask) ==
37240Sstevel@tonic-gate 				    (ire->ire_addr & ire->ire_mask)) &&
37250Sstevel@tonic-gate 				    (ip_mask_to_plen(cire->ire_cmask) <=
37260Sstevel@tonic-gate 				    ire->ire_masklen)) {
37270Sstevel@tonic-gate 					ire_delete(cire);
37280Sstevel@tonic-gate 					continue;
37290Sstevel@tonic-gate 				}
37300Sstevel@tonic-gate 				/*
37310Sstevel@tonic-gate 				 * This is the case when the ire_gateway_addr
37320Sstevel@tonic-gate 				 * of 'cire' belongs to the same subnet as
37330Sstevel@tonic-gate 				 * the new ire being added.
37340Sstevel@tonic-gate 				 * Flushing such ires is sometimes required to
37350Sstevel@tonic-gate 				 * avoid misrouting: say we have a machine with
37360Sstevel@tonic-gate 				 * two interfaces (I1 and I2), a default router
37370Sstevel@tonic-gate 				 * R on the I1 subnet, and a host route to an
37380Sstevel@tonic-gate 				 * off-link destination D with a gateway G on
37390Sstevel@tonic-gate 				 * the I2 subnet.
37400Sstevel@tonic-gate 				 * Under normal operation, we will have an
37410Sstevel@tonic-gate 				 * on-link cache entry for G and an off-link
37420Sstevel@tonic-gate 				 * cache entry for D with G as ire_gateway_addr,
37430Sstevel@tonic-gate 				 * traffic to D will reach its destination
37440Sstevel@tonic-gate 				 * through gateway G.
37450Sstevel@tonic-gate 				 * If the administrator does 'ifconfig I2 down',
37460Sstevel@tonic-gate 				 * the cache entries for D and G will be
37470Sstevel@tonic-gate 				 * flushed. However, G will now be resolved as
37480Sstevel@tonic-gate 				 * an off-link destination using R (the default
37490Sstevel@tonic-gate 				 * router) as gateway. Then D will also be
37500Sstevel@tonic-gate 				 * resolved as an off-link destination using G
37510Sstevel@tonic-gate 				 * as gateway - this behavior is due to
37520Sstevel@tonic-gate 				 * compatibility reasons, see comment in
37530Sstevel@tonic-gate 				 * ire_ihandle_lookup_offlink(). Traffic to D
37540Sstevel@tonic-gate 				 * will go to the router R and probably won't
37550Sstevel@tonic-gate 				 * reach the destination.
37560Sstevel@tonic-gate 				 * The administrator then does 'ifconfig I2 up'.
37570Sstevel@tonic-gate 				 * Since G is on the I2 subnet, this routine
37580Sstevel@tonic-gate 				 * will flush its cache entry. It must also
37590Sstevel@tonic-gate 				 * flush the cache entry for D, otherwise
37600Sstevel@tonic-gate 				 * traffic will stay misrouted until the IRE
37610Sstevel@tonic-gate 				 * times out.
37620Sstevel@tonic-gate 				 */
37630Sstevel@tonic-gate 				if ((cire->ire_gateway_addr & ire->ire_mask) ==
37640Sstevel@tonic-gate 				    (ire->ire_addr & ire->ire_mask)) {
37650Sstevel@tonic-gate 					ire_delete(cire);
37660Sstevel@tonic-gate 					continue;
37670Sstevel@tonic-gate 				}
37680Sstevel@tonic-gate 			}
37690Sstevel@tonic-gate 			IRB_REFRELE(irb);
37700Sstevel@tonic-gate 		}
37710Sstevel@tonic-gate 	} else {
37720Sstevel@tonic-gate 		/*
37730Sstevel@tonic-gate 		 * delete the cache entries based on
37740Sstevel@tonic-gate 		 * handle in the IRE as this IRE is
37750Sstevel@tonic-gate 		 * being deleted/changed.
37760Sstevel@tonic-gate 		 */
37773448Sdh155122 		for (i = 0; i < ipst->ips_ip_cache_table_size; i++) {
37783448Sdh155122 			irb = &ipst->ips_ip_cache_table[i];
37790Sstevel@tonic-gate 			if ((cire = irb->irb_ire) == NULL)
37800Sstevel@tonic-gate 				continue;
37810Sstevel@tonic-gate 			IRB_REFHOLD(irb);
37820Sstevel@tonic-gate 			for (cire = irb->irb_ire; cire != NULL;
37830Sstevel@tonic-gate 			    cire = cire->ire_next) {
37840Sstevel@tonic-gate 				if (cire->ire_type != IRE_CACHE)
37850Sstevel@tonic-gate 					continue;
37860Sstevel@tonic-gate 				if ((cire->ire_phandle == 0 ||
37870Sstevel@tonic-gate 				    cire->ire_phandle != ire->ire_phandle) &&
37880Sstevel@tonic-gate 				    (cire->ire_ihandle == 0 ||
37890Sstevel@tonic-gate 				    cire->ire_ihandle != ire->ire_ihandle))
37900Sstevel@tonic-gate 					continue;
37910Sstevel@tonic-gate 				ire_delete(cire);
37920Sstevel@tonic-gate 			}
37930Sstevel@tonic-gate 			IRB_REFRELE(irb);
37940Sstevel@tonic-gate 		}
37950Sstevel@tonic-gate 	}
37960Sstevel@tonic-gate }
37970Sstevel@tonic-gate 
37980Sstevel@tonic-gate /*
37990Sstevel@tonic-gate  * Matches the arguments passed with the values in the ire.
38000Sstevel@tonic-gate  *
38010Sstevel@tonic-gate  * Note: for match types that match using "ipif" passed in, ipif
38020Sstevel@tonic-gate  * must be checked for non-NULL before calling this routine.
38030Sstevel@tonic-gate  */
38042535Ssangeeta boolean_t
38050Sstevel@tonic-gate ire_match_args(ire_t *ire, ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
38061676Sjpk     int type, const ipif_t *ipif, zoneid_t zoneid, uint32_t ihandle,
38077880SJonathan.Anderson@Sun.COM     const ts_label_t *tsl, int match_flags, queue_t *wq)
38080Sstevel@tonic-gate {
38090Sstevel@tonic-gate 	ill_t *ire_ill = NULL, *dst_ill;
38100Sstevel@tonic-gate 	ill_t *ipif_ill = NULL;
38110Sstevel@tonic-gate 
38120Sstevel@tonic-gate 	ASSERT(ire->ire_ipversion == IPV4_VERSION);
38130Sstevel@tonic-gate 	ASSERT((ire->ire_addr & ~ire->ire_mask) == 0);
38148485SPeter.Memishian@Sun.COM 	ASSERT((!(match_flags & MATCH_IRE_ILL)) ||
38150Sstevel@tonic-gate 	    (ipif != NULL && !ipif->ipif_isv6));
38167880SJonathan.Anderson@Sun.COM 	ASSERT(!(match_flags & MATCH_IRE_WQ) || wq != NULL);
38170Sstevel@tonic-gate 
38180Sstevel@tonic-gate 	/*
38198485SPeter.Memishian@Sun.COM 	 * If MATCH_IRE_MARK_TESTHIDDEN is set, then only return the IRE if it
38208485SPeter.Memishian@Sun.COM 	 * is in fact hidden, to ensure the caller gets the right one.  One
38218485SPeter.Memishian@Sun.COM 	 * exception: if the caller passed MATCH_IRE_IHANDLE, then they
38228485SPeter.Memishian@Sun.COM 	 * already know the identity of the given IRE_INTERFACE entry and
38238485SPeter.Memishian@Sun.COM 	 * there's no point trying to hide it from them.
38240Sstevel@tonic-gate 	 */
38258485SPeter.Memishian@Sun.COM 	if (ire->ire_marks & IRE_MARK_TESTHIDDEN) {
38268485SPeter.Memishian@Sun.COM 		if (match_flags & MATCH_IRE_IHANDLE)
38278485SPeter.Memishian@Sun.COM 			match_flags |= MATCH_IRE_MARK_TESTHIDDEN;
38288485SPeter.Memishian@Sun.COM 
38298485SPeter.Memishian@Sun.COM 		if (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN))
38308485SPeter.Memishian@Sun.COM 			return (B_FALSE);
38318485SPeter.Memishian@Sun.COM 	}
38320Sstevel@tonic-gate 
38331095Spriyanka 	/*
38341095Spriyanka 	 * MATCH_IRE_MARK_PRIVATE_ADDR is set when IP_NEXTHOP option
38351095Spriyanka 	 * is used. In that case the routing table is bypassed and the
38361095Spriyanka 	 * packets are sent directly to the specified nexthop. The
38371095Spriyanka 	 * IRE_CACHE entry representing this route should be marked
38381095Spriyanka 	 * with IRE_MARK_PRIVATE_ADDR.
38391095Spriyanka 	 */
38401095Spriyanka 
38411095Spriyanka 	if (!(match_flags & MATCH_IRE_MARK_PRIVATE_ADDR) &&
38421095Spriyanka 	    (ire->ire_marks & IRE_MARK_PRIVATE_ADDR))
38431095Spriyanka 		return (B_FALSE);
38441095Spriyanka 
38451676Sjpk 	if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid &&
38461676Sjpk 	    ire->ire_zoneid != ALL_ZONES) {
38470Sstevel@tonic-gate 		/*
38480Sstevel@tonic-gate 		 * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid is
38490Sstevel@tonic-gate 		 * valid and does not match that of ire_zoneid, a failure to
38500Sstevel@tonic-gate 		 * match is reported at this point. Otherwise, since some IREs
38510Sstevel@tonic-gate 		 * that are available in the global zone can be used in local
38520Sstevel@tonic-gate 		 * zones, additional checks need to be performed:
38530Sstevel@tonic-gate 		 *
38540Sstevel@tonic-gate 		 *	IRE_BROADCAST, IRE_CACHE and IRE_LOOPBACK
38550Sstevel@tonic-gate 		 *	entries should never be matched in this situation.
38560Sstevel@tonic-gate 		 *
38570Sstevel@tonic-gate 		 *	IRE entries that have an interface associated with them
38580Sstevel@tonic-gate 		 *	should in general not match unless they are an IRE_LOCAL
38590Sstevel@tonic-gate 		 *	or in the case when MATCH_IRE_DEFAULT has been set in
38600Sstevel@tonic-gate 		 *	the caller.  In the case of the former, checking of the
38610Sstevel@tonic-gate 		 *	other fields supplied should take place.
38620Sstevel@tonic-gate 		 *
38630Sstevel@tonic-gate 		 *	In the case where MATCH_IRE_DEFAULT has been set,
38640Sstevel@tonic-gate 		 *	all of the ipif's associated with the IRE's ill are
38650Sstevel@tonic-gate 		 *	checked to see if there is a matching zoneid.  If any
38660Sstevel@tonic-gate 		 *	one ipif has a matching zoneid, this IRE is a
38670Sstevel@tonic-gate 		 *	potential candidate so checking of the other fields
38680Sstevel@tonic-gate 		 *	takes place.
38690Sstevel@tonic-gate 		 *
38700Sstevel@tonic-gate 		 *	In the case where the IRE_INTERFACE has a usable source
38710Sstevel@tonic-gate 		 *	address (indicated by ill_usesrc_ifindex) in the
38720Sstevel@tonic-gate 		 *	correct zone then it's permitted to return this IRE
38730Sstevel@tonic-gate 		 */
38740Sstevel@tonic-gate 		if (match_flags & MATCH_IRE_ZONEONLY)
38750Sstevel@tonic-gate 			return (B_FALSE);
38760Sstevel@tonic-gate 		if (ire->ire_type & (IRE_BROADCAST | IRE_CACHE | IRE_LOOPBACK))
38770Sstevel@tonic-gate 			return (B_FALSE);
38780Sstevel@tonic-gate 		/*
38790Sstevel@tonic-gate 		 * Note, IRE_INTERFACE can have the stq as NULL. For
38800Sstevel@tonic-gate 		 * example, if the default multicast route is tied to
38810Sstevel@tonic-gate 		 * the loopback address.
38820Sstevel@tonic-gate 		 */
38830Sstevel@tonic-gate 		if ((ire->ire_type & IRE_INTERFACE) &&
38840Sstevel@tonic-gate 		    (ire->ire_stq != NULL)) {
38850Sstevel@tonic-gate 			dst_ill = (ill_t *)ire->ire_stq->q_ptr;
38860Sstevel@tonic-gate 			/*
38870Sstevel@tonic-gate 			 * If there is a usable source address in the
38880Sstevel@tonic-gate 			 * zone, then it's ok to return an
38890Sstevel@tonic-gate 			 * IRE_INTERFACE
38900Sstevel@tonic-gate 			 */
38910Sstevel@tonic-gate 			if (ipif_usesrc_avail(dst_ill, zoneid)) {
38920Sstevel@tonic-gate 				ip3dbg(("ire_match_args: dst_ill %p match %d\n",
38930Sstevel@tonic-gate 				    (void *)dst_ill,
38940Sstevel@tonic-gate 				    (ire->ire_addr == (addr & mask))));
38950Sstevel@tonic-gate 			} else {
38960Sstevel@tonic-gate 				ip3dbg(("ire_match_args: src_ipif NULL"
38970Sstevel@tonic-gate 				    " dst_ill %p\n", (void *)dst_ill));
38980Sstevel@tonic-gate 				return (B_FALSE);
38990Sstevel@tonic-gate 			}
39000Sstevel@tonic-gate 		}
39010Sstevel@tonic-gate 		if (ire->ire_ipif != NULL && ire->ire_type != IRE_LOCAL &&
39020Sstevel@tonic-gate 		    !(ire->ire_type & IRE_INTERFACE)) {
39030Sstevel@tonic-gate 			ipif_t	*tipif;
39040Sstevel@tonic-gate 
39050Sstevel@tonic-gate 			if ((match_flags & MATCH_IRE_DEFAULT) == 0) {
39060Sstevel@tonic-gate 				return (B_FALSE);
39070Sstevel@tonic-gate 			}
39080Sstevel@tonic-gate 			mutex_enter(&ire->ire_ipif->ipif_ill->ill_lock);
39090Sstevel@tonic-gate 			for (tipif = ire->ire_ipif->ipif_ill->ill_ipif;
39100Sstevel@tonic-gate 			    tipif != NULL; tipif = tipif->ipif_next) {
39110Sstevel@tonic-gate 				if (IPIF_CAN_LOOKUP(tipif) &&
39120Sstevel@tonic-gate 				    (tipif->ipif_flags & IPIF_UP) &&
39131676Sjpk 				    (tipif->ipif_zoneid == zoneid ||
39141676Sjpk 				    tipif->ipif_zoneid == ALL_ZONES))
39150Sstevel@tonic-gate 					break;
39160Sstevel@tonic-gate 			}
39170Sstevel@tonic-gate 			mutex_exit(&ire->ire_ipif->ipif_ill->ill_lock);
39180Sstevel@tonic-gate 			if (tipif == NULL) {
39190Sstevel@tonic-gate 				return (B_FALSE);
39200Sstevel@tonic-gate 			}
39210Sstevel@tonic-gate 		}
39220Sstevel@tonic-gate 	}
39230Sstevel@tonic-gate 
39240Sstevel@tonic-gate 	/*
39258485SPeter.Memishian@Sun.COM 	 * For IRE_CACHE entries, MATCH_IRE_ILL means that somebody wants to
39268485SPeter.Memishian@Sun.COM 	 * send out ire_stq (ire_ipif for IRE_CACHE entries is just the means
39278485SPeter.Memishian@Sun.COM 	 * of getting a source address -- i.e., ire_src_addr ==
39288485SPeter.Memishian@Sun.COM 	 * ire->ire_ipif->ipif_src_addr).  ire_to_ill() handles this.
39298485SPeter.Memishian@Sun.COM 	 *
39308485SPeter.Memishian@Sun.COM 	 * NOTE: For IPMP, MATCH_IRE_ILL usually matches any ill in the group.
39318485SPeter.Memishian@Sun.COM 	 * However, if MATCH_IRE_MARK_TESTHIDDEN is set (i.e., the IRE is for
39328485SPeter.Memishian@Sun.COM 	 * IPMP test traffic), then the ill must match exactly.
39330Sstevel@tonic-gate 	 */
39348485SPeter.Memishian@Sun.COM 	if (match_flags & MATCH_IRE_ILL) {
39350Sstevel@tonic-gate 		ire_ill = ire_to_ill(ire);
39360Sstevel@tonic-gate 		ipif_ill = ipif->ipif_ill;
39370Sstevel@tonic-gate 	}
39380Sstevel@tonic-gate 
39390Sstevel@tonic-gate 	if ((ire->ire_addr == (addr & mask)) &&
39400Sstevel@tonic-gate 	    ((!(match_flags & MATCH_IRE_GW)) ||
39414714Ssowmini 	    (ire->ire_gateway_addr == gateway)) &&
39420Sstevel@tonic-gate 	    ((!(match_flags & MATCH_IRE_TYPE)) ||
39434714Ssowmini 	    (ire->ire_type & type)) &&
39440Sstevel@tonic-gate 	    ((!(match_flags & MATCH_IRE_SRC)) ||
39454714Ssowmini 	    (ire->ire_src_addr == ipif->ipif_src_addr)) &&
39460Sstevel@tonic-gate 	    ((!(match_flags & MATCH_IRE_IPIF)) ||
39474714Ssowmini 	    (ire->ire_ipif == ipif)) &&
39488485SPeter.Memishian@Sun.COM 	    ((!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) ||
39498485SPeter.Memishian@Sun.COM 	    (ire->ire_marks & IRE_MARK_TESTHIDDEN)) &&
39501095Spriyanka 	    ((!(match_flags & MATCH_IRE_MARK_PRIVATE_ADDR)) ||
39514714Ssowmini 	    (ire->ire_type != IRE_CACHE ||
39524714Ssowmini 	    ire->ire_marks & IRE_MARK_PRIVATE_ADDR)) &&
39537880SJonathan.Anderson@Sun.COM 	    ((!(match_flags & MATCH_IRE_WQ)) ||
39547880SJonathan.Anderson@Sun.COM 	    (ire->ire_stq == wq)) &&
39558485SPeter.Memishian@Sun.COM 	    ((!(match_flags & MATCH_IRE_ILL)) ||
39568485SPeter.Memishian@Sun.COM 	    (ire_ill == ipif_ill ||
39578485SPeter.Memishian@Sun.COM 	    (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN) &&
39588485SPeter.Memishian@Sun.COM 	    ire_ill != NULL && IS_IN_SAME_ILLGRP(ipif_ill, ire_ill)))) &&
39590Sstevel@tonic-gate 	    ((!(match_flags & MATCH_IRE_IHANDLE)) ||
39604714Ssowmini 	    (ire->ire_ihandle == ihandle)) &&
39612535Ssangeeta 	    ((!(match_flags & MATCH_IRE_MASK)) ||
39624714Ssowmini 	    (ire->ire_mask == mask)) &&
39631676Sjpk 	    ((!(match_flags & MATCH_IRE_SECATTR)) ||
39644714Ssowmini 	    (!is_system_labeled()) ||
39654714Ssowmini 	    (tsol_ire_match_gwattr(ire, tsl) == 0))) {
39660Sstevel@tonic-gate 		/* We found the matched IRE */
39670Sstevel@tonic-gate 		return (B_TRUE);
39680Sstevel@tonic-gate 	}
39690Sstevel@tonic-gate 	return (B_FALSE);
39700Sstevel@tonic-gate }
39710Sstevel@tonic-gate 
39720Sstevel@tonic-gate /*
39730Sstevel@tonic-gate  * Lookup for a route in all the tables
39740Sstevel@tonic-gate  */
39750Sstevel@tonic-gate ire_t *
39760Sstevel@tonic-gate ire_route_lookup(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
39771676Sjpk     int type, const ipif_t *ipif, ire_t **pire, zoneid_t zoneid,
39783448Sdh155122     const ts_label_t *tsl, int flags, ip_stack_t *ipst)
39790Sstevel@tonic-gate {
39800Sstevel@tonic-gate 	ire_t *ire = NULL;
39810Sstevel@tonic-gate 
39820Sstevel@tonic-gate 	/*
39830Sstevel@tonic-gate 	 * ire_match_args() will dereference ipif MATCH_IRE_SRC or
39840Sstevel@tonic-gate 	 * MATCH_IRE_ILL is set.
39850Sstevel@tonic-gate 	 */
39868485SPeter.Memishian@Sun.COM 	if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL))
39870Sstevel@tonic-gate 		return (NULL);
39880Sstevel@tonic-gate 
39890Sstevel@tonic-gate 	/*
39900Sstevel@tonic-gate 	 * might be asking for a cache lookup,
39910Sstevel@tonic-gate 	 * This is not best way to lookup cache,
39920Sstevel@tonic-gate 	 * user should call ire_cache_lookup directly.
39930Sstevel@tonic-gate 	 *
39940Sstevel@tonic-gate 	 * If MATCH_IRE_TYPE was set, first lookup in the cache table and then
39950Sstevel@tonic-gate 	 * in the forwarding table, if the applicable type flags were set.
39960Sstevel@tonic-gate 	 */
39970Sstevel@tonic-gate 	if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_CACHETABLE) != 0) {
39980Sstevel@tonic-gate 		ire = ire_ctable_lookup(addr, gateway, type, ipif, zoneid,
39993448Sdh155122 		    tsl, flags, ipst);
40000Sstevel@tonic-gate 		if (ire != NULL)
40010Sstevel@tonic-gate 			return (ire);
40020Sstevel@tonic-gate 	}
40030Sstevel@tonic-gate 	if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_FORWARDTABLE) != 0) {
40040Sstevel@tonic-gate 		ire = ire_ftable_lookup(addr, mask, gateway, type, ipif, pire,
40053448Sdh155122 		    zoneid, 0, tsl, flags, ipst);
40060Sstevel@tonic-gate 	}
40070Sstevel@tonic-gate 	return (ire);
40080Sstevel@tonic-gate }
40090Sstevel@tonic-gate 
40100Sstevel@tonic-gate /*
40111676Sjpk  * Delete the IRE cache for the gateway and all IRE caches whose
40121676Sjpk  * ire_gateway_addr points to this gateway, and allow them to
40131676Sjpk  * be created on demand by ip_newroute.
40141676Sjpk  */
40151676Sjpk void
40163448Sdh155122 ire_clookup_delete_cache_gw(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst)
40171676Sjpk {
40181676Sjpk 	irb_t *irb;
40191676Sjpk 	ire_t *ire;
40201676Sjpk 
40213448Sdh155122 	irb = &ipst->ips_ip_cache_table[IRE_ADDR_HASH(addr,
40223448Sdh155122 	    ipst->ips_ip_cache_table_size)];
40231676Sjpk 	IRB_REFHOLD(irb);
40241676Sjpk 	for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
40251676Sjpk 		if (ire->ire_marks & IRE_MARK_CONDEMNED)
40261676Sjpk 			continue;
40271676Sjpk 
40281676Sjpk 		ASSERT(ire->ire_mask == IP_HOST_MASK);
40291676Sjpk 		if (ire_match_args(ire, addr, ire->ire_mask, 0, IRE_CACHE,
40307880SJonathan.Anderson@Sun.COM 		    NULL, zoneid, 0, NULL, MATCH_IRE_TYPE, NULL)) {
40311676Sjpk 			ire_delete(ire);
40321676Sjpk 		}
40331676Sjpk 	}
40341676Sjpk 	IRB_REFRELE(irb);
40351676Sjpk 
40363448Sdh155122 	ire_walk_v4(ire_delete_cache_gw, &addr, zoneid, ipst);
40371676Sjpk }
40381676Sjpk 
40391676Sjpk /*
40400Sstevel@tonic-gate  * Looks up cache table for a route.
40410Sstevel@tonic-gate  * specific lookup can be indicated by
40420Sstevel@tonic-gate  * passing the MATCH_* flags and the
40430Sstevel@tonic-gate  * necessary parameters.
40440Sstevel@tonic-gate  */
40450Sstevel@tonic-gate ire_t *
40461676Sjpk ire_ctable_lookup(ipaddr_t addr, ipaddr_t gateway, int type, const ipif_t *ipif,
40473448Sdh155122     zoneid_t zoneid, const ts_label_t *tsl, int flags, ip_stack_t *ipst)
40480Sstevel@tonic-gate {
40497880SJonathan.Anderson@Sun.COM 	ire_ctable_args_t	margs;
40507880SJonathan.Anderson@Sun.COM 
40517880SJonathan.Anderson@Sun.COM 	margs.ict_addr = &addr;
40527880SJonathan.Anderson@Sun.COM 	margs.ict_gateway = &gateway;
40537880SJonathan.Anderson@Sun.COM 	margs.ict_type = type;
40547880SJonathan.Anderson@Sun.COM 	margs.ict_ipif = ipif;
40557880SJonathan.Anderson@Sun.COM 	margs.ict_zoneid = zoneid;
40567880SJonathan.Anderson@Sun.COM 	margs.ict_tsl = tsl;
40577880SJonathan.Anderson@Sun.COM 	margs.ict_flags = flags;
40587880SJonathan.Anderson@Sun.COM 	margs.ict_ipst = ipst;
40597880SJonathan.Anderson@Sun.COM 	margs.ict_wq = NULL;
40607880SJonathan.Anderson@Sun.COM 
40617880SJonathan.Anderson@Sun.COM 	return (ip4_ctable_lookup_impl(&margs));
40620Sstevel@tonic-gate }
40630Sstevel@tonic-gate 
40640Sstevel@tonic-gate /*
40652733Snordmark  * Check whether the IRE_LOCAL and the IRE potentially used to transmit
40668485SPeter.Memishian@Sun.COM  * (could be an IRE_CACHE, IRE_BROADCAST, or IRE_INTERFACE) are identical
40678485SPeter.Memishian@Sun.COM  * or part of the same illgrp.  (In the IPMP case, usually the two IREs
40688485SPeter.Memishian@Sun.COM  * will both belong to the IPMP ill, but exceptions are possible -- e.g.
40698485SPeter.Memishian@Sun.COM  * if IPMP test addresses are on their own subnet.)
40702733Snordmark  */
40712733Snordmark boolean_t
40728485SPeter.Memishian@Sun.COM ire_local_same_lan(ire_t *ire_local, ire_t *xmit_ire)
40732733Snordmark {
40748485SPeter.Memishian@Sun.COM 	ill_t *recv_ill, *xmit_ill;
40752733Snordmark 
40762906Snordmark 	ASSERT(ire_local->ire_type & (IRE_LOCAL|IRE_LOOPBACK));
40772962Snordmark 	ASSERT(xmit_ire->ire_type & (IRE_CACHETABLE|IRE_INTERFACE));
40782906Snordmark 
40792906Snordmark 	recv_ill = ire_to_ill(ire_local);
40802906Snordmark 	xmit_ill = ire_to_ill(xmit_ire);
40812906Snordmark 
40822906Snordmark 	ASSERT(recv_ill != NULL);
40832906Snordmark 	ASSERT(xmit_ill != NULL);
40842733Snordmark 
40858485SPeter.Memishian@Sun.COM 	return (IS_ON_SAME_LAN(recv_ill, xmit_ill));
40862733Snordmark }
40872733Snordmark 
40882733Snordmark /*
40898485SPeter.Memishian@Sun.COM  * Check if the IRE_LOCAL uses the same ill as another route would use.
40902962Snordmark  * If there is no alternate route, or the alternate is a REJECT or BLACKHOLE,
40912962Snordmark  * then we don't allow this IRE_LOCAL to be used.
40922733Snordmark  */
40932733Snordmark boolean_t
40942733Snordmark ire_local_ok_across_zones(ire_t *ire_local, zoneid_t zoneid, void *addr,
40953448Sdh155122     const ts_label_t *tsl, ip_stack_t *ipst)
40962733Snordmark {
40972733Snordmark 	ire_t		*alt_ire;
40982733Snordmark 	boolean_t	rval;
40998485SPeter.Memishian@Sun.COM 	int		flags;
41008485SPeter.Memishian@Sun.COM 
41018485SPeter.Memishian@Sun.COM 	flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE;
41022733Snordmark 
41032733Snordmark 	if (ire_local->ire_ipversion == IPV4_VERSION) {
41042733Snordmark 		alt_ire = ire_ftable_lookup(*((ipaddr_t *)addr), 0, 0, 0, NULL,
41058485SPeter.Memishian@Sun.COM 		    NULL, zoneid, 0, tsl, flags, ipst);
41062733Snordmark 	} else {
41078485SPeter.Memishian@Sun.COM 		alt_ire = ire_ftable_lookup_v6(addr, NULL, NULL, 0, NULL,
41088485SPeter.Memishian@Sun.COM 		    NULL, zoneid, 0, tsl, flags, ipst);
41092733Snordmark 	}
41102733Snordmark 
41112733Snordmark 	if (alt_ire == NULL)
41122733Snordmark 		return (B_FALSE);
41132733Snordmark 
41142962Snordmark 	if (alt_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
41152962Snordmark 		ire_refrele(alt_ire);
41162962Snordmark 		return (B_FALSE);
41172962Snordmark 	}
41188485SPeter.Memishian@Sun.COM 	rval = ire_local_same_lan(ire_local, alt_ire);
41192733Snordmark 
41202733Snordmark 	ire_refrele(alt_ire);
41212733Snordmark 	return (rval);
41222733Snordmark }
41232733Snordmark 
41242733Snordmark /*
41258485SPeter.Memishian@Sun.COM  * Lookup cache
41262733Snordmark  *
41272733Snordmark  * In general the zoneid has to match (where ALL_ZONES match all of them).
41282733Snordmark  * But for IRE_LOCAL we also need to handle the case where L2 should
41292733Snordmark  * conceptually loop back the packet. This is necessary since neither
41302733Snordmark  * Ethernet drivers nor Ethernet hardware loops back packets sent to their
41312733Snordmark  * own MAC address. This loopback is needed when the normal
41322733Snordmark  * routes (ignoring IREs with different zoneids) would send out the packet on
41338485SPeter.Memishian@Sun.COM  * the same ill as the ill with which this IRE_LOCAL is associated.
41342733Snordmark  *
41352733Snordmark  * Earlier versions of this code always matched an IRE_LOCAL independently of
41362733Snordmark  * the zoneid. We preserve that earlier behavior when
41372733Snordmark  * ip_restrict_interzone_loopback is turned off.
41380Sstevel@tonic-gate  */
41390Sstevel@tonic-gate ire_t *
41403448Sdh155122 ire_cache_lookup(ipaddr_t addr, zoneid_t zoneid, const ts_label_t *tsl,
41413448Sdh155122     ip_stack_t *ipst)
41420Sstevel@tonic-gate {
41430Sstevel@tonic-gate 	irb_t *irb_ptr;
41440Sstevel@tonic-gate 	ire_t *ire;
41450Sstevel@tonic-gate 
41463448Sdh155122 	irb_ptr = &ipst->ips_ip_cache_table[IRE_ADDR_HASH(addr,
41474714Ssowmini 	    ipst->ips_ip_cache_table_size)];
41480Sstevel@tonic-gate 	rw_enter(&irb_ptr->irb_lock, RW_READER);
41490Sstevel@tonic-gate 	for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
41501095Spriyanka 		if (ire->ire_marks & (IRE_MARK_CONDEMNED |
41518485SPeter.Memishian@Sun.COM 		    IRE_MARK_TESTHIDDEN | IRE_MARK_PRIVATE_ADDR)) {
41520Sstevel@tonic-gate 			continue;
41531095Spriyanka 		}
41540Sstevel@tonic-gate 		if (ire->ire_addr == addr) {
41551676Sjpk 			/*
41561676Sjpk 			 * Finally, check if the security policy has any
41571676Sjpk 			 * restriction on using this route for the specified
41581676Sjpk 			 * message.
41591676Sjpk 			 */
41601676Sjpk 			if (tsl != NULL &&
41611676Sjpk 			    ire->ire_gw_secattr != NULL &&
41621676Sjpk 			    tsol_ire_match_gwattr(ire, tsl) != 0) {
41631676Sjpk 				continue;
41641676Sjpk 			}
41651676Sjpk 
41660Sstevel@tonic-gate 			if (zoneid == ALL_ZONES || ire->ire_zoneid == zoneid ||
41672733Snordmark 			    ire->ire_zoneid == ALL_ZONES) {
41682733Snordmark 				IRE_REFHOLD(ire);
41692733Snordmark 				rw_exit(&irb_ptr->irb_lock);
41702733Snordmark 				return (ire);
41712733Snordmark 			}
41722733Snordmark 
41732733Snordmark 			if (ire->ire_type == IRE_LOCAL) {
41743448Sdh155122 				if (ipst->ips_ip_restrict_interzone_loopback &&
41752733Snordmark 				    !ire_local_ok_across_zones(ire, zoneid,
41763448Sdh155122 				    &addr, tsl, ipst))
41772733Snordmark 					continue;
41782733Snordmark 
41790Sstevel@tonic-gate 				IRE_REFHOLD(ire);
41800Sstevel@tonic-gate 				rw_exit(&irb_ptr->irb_lock);
41810Sstevel@tonic-gate 				return (ire);
41820Sstevel@tonic-gate 			}
41830Sstevel@tonic-gate 		}
41840Sstevel@tonic-gate 	}
41850Sstevel@tonic-gate 	rw_exit(&irb_ptr->irb_lock);
41860Sstevel@tonic-gate 	return (NULL);
41870Sstevel@tonic-gate }
41880Sstevel@tonic-gate 
41898275SEric Cheng ire_t *
41908275SEric Cheng ire_cache_lookup_simple(ipaddr_t dst, ip_stack_t *ipst)
41918275SEric Cheng {
41928275SEric Cheng 	irb_t *irb_ptr;
41938275SEric Cheng 	ire_t *ire;
41948275SEric Cheng 
41958275SEric Cheng 	/*
41968485SPeter.Memishian@Sun.COM 	 * Look for an ire in the cachetable whose
41978275SEric Cheng 	 * ire_addr matches the destination.
41988275SEric Cheng 	 * Since we are being called by forwarding fastpath
41998275SEric Cheng 	 * no need to check for Trusted Solaris label.
42008275SEric Cheng 	 */
42018275SEric Cheng 	irb_ptr = &ipst->ips_ip_cache_table[IRE_ADDR_HASH(
42028275SEric Cheng 	    dst, ipst->ips_ip_cache_table_size)];
42038275SEric Cheng 	rw_enter(&irb_ptr->irb_lock, RW_READER);
42048275SEric Cheng 	for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
42058485SPeter.Memishian@Sun.COM 		if (ire->ire_marks & (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN |
42068485SPeter.Memishian@Sun.COM 		    IRE_MARK_PRIVATE_ADDR)) {
42078275SEric Cheng 			continue;
42088275SEric Cheng 		}
42098275SEric Cheng 		if (ire->ire_addr == dst) {
42108275SEric Cheng 			IRE_REFHOLD(ire);
42118275SEric Cheng 			rw_exit(&irb_ptr->irb_lock);
42128275SEric Cheng 			return (ire);
42138275SEric Cheng 		}
42148275SEric Cheng 	}
42158275SEric Cheng 	rw_exit(&irb_ptr->irb_lock);
42168275SEric Cheng 	return (NULL);
42178275SEric Cheng }
42188275SEric Cheng 
42190Sstevel@tonic-gate /*
42200Sstevel@tonic-gate  * Locate the interface ire that is tied to the cache ire 'cire' via
42210Sstevel@tonic-gate  * cire->ire_ihandle.
42220Sstevel@tonic-gate  *
42230Sstevel@tonic-gate  * We are trying to create the cache ire for an offlink destn based
42240Sstevel@tonic-gate  * on the cache ire of the gateway in 'cire'. 'pire' is the prefix ire
42250Sstevel@tonic-gate  * as found by ip_newroute(). We are called from ip_newroute() in
42260Sstevel@tonic-gate  * the IRE_CACHE case.
42270Sstevel@tonic-gate  */
42280Sstevel@tonic-gate ire_t *
42290Sstevel@tonic-gate ire_ihandle_lookup_offlink(ire_t *cire, ire_t *pire)
42300Sstevel@tonic-gate {
42310Sstevel@tonic-gate 	ire_t	*ire;
42320Sstevel@tonic-gate 	int	match_flags;
42330Sstevel@tonic-gate 	ipaddr_t gw_addr;
42340Sstevel@tonic-gate 	ipif_t	*gw_ipif;
42353448Sdh155122 	ip_stack_t	*ipst = cire->ire_ipst;
42360Sstevel@tonic-gate 
42370Sstevel@tonic-gate 	ASSERT(cire != NULL && pire != NULL);
42380Sstevel@tonic-gate 
42390Sstevel@tonic-gate 	/*
42400Sstevel@tonic-gate 	 * We don't need to specify the zoneid to ire_ftable_lookup() below
42410Sstevel@tonic-gate 	 * because the ihandle refers to an ipif which can be in only one zone.
42420Sstevel@tonic-gate 	 */
42430Sstevel@tonic-gate 	match_flags =  MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK;
42440Sstevel@tonic-gate 	if (pire->ire_ipif != NULL)
42458485SPeter.Memishian@Sun.COM 		match_flags |= MATCH_IRE_ILL;
42460Sstevel@tonic-gate 	/*
42470Sstevel@tonic-gate 	 * We know that the mask of the interface ire equals cire->ire_cmask.
42480Sstevel@tonic-gate 	 * (When ip_newroute() created 'cire' for the gateway it set its
42490Sstevel@tonic-gate 	 * cmask from the interface ire's mask)
42500Sstevel@tonic-gate 	 */
42510Sstevel@tonic-gate 	ire = ire_ftable_lookup(cire->ire_addr, cire->ire_cmask, 0,
42520Sstevel@tonic-gate 	    IRE_INTERFACE, pire->ire_ipif, NULL, ALL_ZONES, cire->ire_ihandle,
42533448Sdh155122 	    NULL, match_flags, ipst);
42540Sstevel@tonic-gate 	if (ire != NULL)
42550Sstevel@tonic-gate 		return (ire);
42560Sstevel@tonic-gate 	/*
42570Sstevel@tonic-gate 	 * If we didn't find an interface ire above, we can't declare failure.
42580Sstevel@tonic-gate 	 * For backwards compatibility, we need to support prefix routes
42590Sstevel@tonic-gate 	 * pointing to next hop gateways that are not on-link.
42600Sstevel@tonic-gate 	 *
42610Sstevel@tonic-gate 	 * Assume we are trying to ping some offlink destn, and we have the
42620Sstevel@tonic-gate 	 * routing table below.
42630Sstevel@tonic-gate 	 *
42640Sstevel@tonic-gate 	 * Eg.	default	- gw1		<--- pire	(line 1)
42650Sstevel@tonic-gate 	 *	gw1	- gw2				(line 2)
42660Sstevel@tonic-gate 	 *	gw2	- hme0				(line 3)
42670Sstevel@tonic-gate 	 *
42680Sstevel@tonic-gate 	 * If we already have a cache ire for gw1 in 'cire', the
42690Sstevel@tonic-gate 	 * ire_ftable_lookup above would have failed, since there is no
42700Sstevel@tonic-gate 	 * interface ire to reach gw1. We will fallthru below.
42710Sstevel@tonic-gate 	 *
42720Sstevel@tonic-gate 	 * Here we duplicate the steps that ire_ftable_lookup() did in
42730Sstevel@tonic-gate 	 * getting 'cire' from 'pire', in the MATCH_IRE_RECURSIVE case.
42740Sstevel@tonic-gate 	 * The differences are the following
42750Sstevel@tonic-gate 	 * i.   We want the interface ire only, so we call ire_ftable_lookup()
42760Sstevel@tonic-gate 	 *	instead of ire_route_lookup()
42770Sstevel@tonic-gate 	 * ii.  We look for only prefix routes in the 1st call below.
42780Sstevel@tonic-gate 	 * ii.  We want to match on the ihandle in the 2nd call below.
42790Sstevel@tonic-gate 	 */
42800Sstevel@tonic-gate 	match_flags =  MATCH_IRE_TYPE;
42810Sstevel@tonic-gate 	if (pire->ire_ipif != NULL)
42828485SPeter.Memishian@Sun.COM 		match_flags |= MATCH_IRE_ILL;
42830Sstevel@tonic-gate 	ire = ire_ftable_lookup(pire->ire_gateway_addr, 0, 0, IRE_OFFSUBNET,
42843448Sdh155122 	    pire->ire_ipif, NULL, ALL_ZONES, 0, NULL, match_flags, ipst);
42850Sstevel@tonic-gate 	if (ire == NULL)
42860Sstevel@tonic-gate 		return (NULL);
42870Sstevel@tonic-gate 	/*
42880Sstevel@tonic-gate 	 * At this point 'ire' corresponds to the entry shown in line 2.
42890Sstevel@tonic-gate 	 * gw_addr is 'gw2' in the example above.
42900Sstevel@tonic-gate 	 */
42910Sstevel@tonic-gate 	gw_addr = ire->ire_gateway_addr;
42920Sstevel@tonic-gate 	gw_ipif = ire->ire_ipif;
42930Sstevel@tonic-gate 	ire_refrele(ire);
42940Sstevel@tonic-gate 
42950Sstevel@tonic-gate 	match_flags |= MATCH_IRE_IHANDLE;
42960Sstevel@tonic-gate 	ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE,
42973448Sdh155122 	    gw_ipif, NULL, ALL_ZONES, cire->ire_ihandle, NULL, match_flags,
42983448Sdh155122 	    ipst);
42990Sstevel@tonic-gate 	return (ire);
43000Sstevel@tonic-gate }
43010Sstevel@tonic-gate 
43020Sstevel@tonic-gate /*
43030Sstevel@tonic-gate  * Return the IRE_LOOPBACK, IRE_IF_RESOLVER or IRE_IF_NORESOLVER
43040Sstevel@tonic-gate  * ire associated with the specified ipif.
43050Sstevel@tonic-gate  *
43060Sstevel@tonic-gate  * This might occasionally be called when IPIF_UP is not set since
43070Sstevel@tonic-gate  * the IP_MULTICAST_IF as well as creating interface routes
43080Sstevel@tonic-gate  * allows specifying a down ipif (ipif_lookup* match ipifs that are down).
43090Sstevel@tonic-gate  *
43100Sstevel@tonic-gate  * Note that if IPIF_NOLOCAL, IPIF_NOXMIT, or IPIF_DEPRECATED is set on
43110Sstevel@tonic-gate  * the ipif, this routine might return NULL.
43120Sstevel@tonic-gate  */
43130Sstevel@tonic-gate ire_t *
43141676Sjpk ipif_to_ire(const ipif_t *ipif)
43150Sstevel@tonic-gate {
43160Sstevel@tonic-gate 	ire_t	*ire;
43178485SPeter.Memishian@Sun.COM 	ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
43188485SPeter.Memishian@Sun.COM 	uint_t	match_flags = MATCH_IRE_TYPE | MATCH_IRE_IPIF | MATCH_IRE_MASK;
43198485SPeter.Memishian@Sun.COM 
43208485SPeter.Memishian@Sun.COM 	/*
43218485SPeter.Memishian@Sun.COM 	 * IRE_INTERFACE entries for ills under IPMP are IRE_MARK_TESTHIDDEN
43228485SPeter.Memishian@Sun.COM 	 * so that they aren't accidentally returned.  However, if the
43238485SPeter.Memishian@Sun.COM 	 * caller's ipif is on an ill under IPMP, there's no need to hide 'em.
43248485SPeter.Memishian@Sun.COM 	 */
43258485SPeter.Memishian@Sun.COM 	if (IS_UNDER_IPMP(ipif->ipif_ill))
43268485SPeter.Memishian@Sun.COM 		match_flags |= MATCH_IRE_MARK_TESTHIDDEN;
43270Sstevel@tonic-gate 
43280Sstevel@tonic-gate 	ASSERT(!ipif->ipif_isv6);
43290Sstevel@tonic-gate 	if (ipif->ipif_ire_type == IRE_LOOPBACK) {
43300Sstevel@tonic-gate 		ire = ire_ctable_lookup(ipif->ipif_lcl_addr, 0, IRE_LOOPBACK,
43313448Sdh155122 		    ipif, ALL_ZONES, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF),
43323448Sdh155122 		    ipst);
43330Sstevel@tonic-gate 	} else if (ipif->ipif_flags & IPIF_POINTOPOINT) {
43340Sstevel@tonic-gate 		/* In this case we need to lookup destination address. */
43350Sstevel@tonic-gate 		ire = ire_ftable_lookup(ipif->ipif_pp_dst_addr, IP_HOST_MASK, 0,
43368485SPeter.Memishian@Sun.COM 		    IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, NULL, match_flags,
43378485SPeter.Memishian@Sun.COM 		    ipst);
43380Sstevel@tonic-gate 	} else {
43390Sstevel@tonic-gate 		ire = ire_ftable_lookup(ipif->ipif_subnet,
43400Sstevel@tonic-gate 		    ipif->ipif_net_mask, 0, IRE_INTERFACE, ipif, NULL,
43418485SPeter.Memishian@Sun.COM 		    ALL_ZONES, 0, NULL, match_flags, ipst);
43420Sstevel@tonic-gate 	}
43430Sstevel@tonic-gate 	return (ire);
43440Sstevel@tonic-gate }
43450Sstevel@tonic-gate 
43460Sstevel@tonic-gate /*
43470Sstevel@tonic-gate  * ire_walk function.
43480Sstevel@tonic-gate  * Count the number of IRE_CACHE entries in different categories.
43490Sstevel@tonic-gate  */
43500Sstevel@tonic-gate void
43510Sstevel@tonic-gate ire_cache_count(ire_t *ire, char *arg)
43520Sstevel@tonic-gate {
43530Sstevel@tonic-gate 	ire_cache_count_t *icc = (ire_cache_count_t *)arg;
43540Sstevel@tonic-gate 
43550Sstevel@tonic-gate 	if (ire->ire_type != IRE_CACHE)
43560Sstevel@tonic-gate 		return;
43570Sstevel@tonic-gate 
43580Sstevel@tonic-gate 	icc->icc_total++;
43590Sstevel@tonic-gate 
43600Sstevel@tonic-gate 	if (ire->ire_ipversion == IPV6_VERSION) {
43610Sstevel@tonic-gate 		mutex_enter(&ire->ire_lock);
43620Sstevel@tonic-gate 		if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6)) {
43630Sstevel@tonic-gate 			mutex_exit(&ire->ire_lock);
43640Sstevel@tonic-gate 			icc->icc_onlink++;
43650Sstevel@tonic-gate 			return;
43660Sstevel@tonic-gate 		}
43670Sstevel@tonic-gate 		mutex_exit(&ire->ire_lock);
43680Sstevel@tonic-gate 	} else {
43690Sstevel@tonic-gate 		if (ire->ire_gateway_addr == 0) {
43700Sstevel@tonic-gate 			icc->icc_onlink++;
43710Sstevel@tonic-gate 			return;
43720Sstevel@tonic-gate 		}
43730Sstevel@tonic-gate 	}
43740Sstevel@tonic-gate 
43750Sstevel@tonic-gate 	ASSERT(ire->ire_ipif != NULL);
43760Sstevel@tonic-gate 	if (ire->ire_max_frag < ire->ire_ipif->ipif_mtu)
43770Sstevel@tonic-gate 		icc->icc_pmtu++;
43780Sstevel@tonic-gate 	else if (ire->ire_tire_mark != ire->ire_ob_pkt_count +
43790Sstevel@tonic-gate 	    ire->ire_ib_pkt_count)
43800Sstevel@tonic-gate 		icc->icc_offlink++;
43810Sstevel@tonic-gate 	else
43820Sstevel@tonic-gate 		icc->icc_unused++;
43830Sstevel@tonic-gate }
43840Sstevel@tonic-gate 
43850Sstevel@tonic-gate /*
43860Sstevel@tonic-gate  * ire_walk function called by ip_trash_ire_reclaim().
43870Sstevel@tonic-gate  * Free a fraction of the IRE_CACHE cache entries. The fractions are
43880Sstevel@tonic-gate  * different for different categories of IRE_CACHE entries.
43890Sstevel@tonic-gate  * A fraction of zero means to not free any in that category.
43900Sstevel@tonic-gate  * Use the hash bucket id plus lbolt as a random number. Thus if the fraction
43910Sstevel@tonic-gate  * is N then every Nth hash bucket chain will be freed.
43920Sstevel@tonic-gate  */
43930Sstevel@tonic-gate void
43940Sstevel@tonic-gate ire_cache_reclaim(ire_t *ire, char *arg)
43950Sstevel@tonic-gate {
43960Sstevel@tonic-gate 	ire_cache_reclaim_t *icr = (ire_cache_reclaim_t *)arg;
43970Sstevel@tonic-gate 	uint_t rand;
43983448Sdh155122 	ip_stack_t	*ipst = icr->icr_ipst;
43990Sstevel@tonic-gate 
44000Sstevel@tonic-gate 	if (ire->ire_type != IRE_CACHE)
44010Sstevel@tonic-gate 		return;
44020Sstevel@tonic-gate 
44030Sstevel@tonic-gate 	if (ire->ire_ipversion == IPV6_VERSION) {
44040Sstevel@tonic-gate 		rand = (uint_t)lbolt +
44053448Sdh155122 		    IRE_ADDR_HASH_V6(ire->ire_addr_v6,
44064714Ssowmini 		    ipst->ips_ip6_cache_table_size);
44070Sstevel@tonic-gate 		mutex_enter(&ire->ire_lock);
44080Sstevel@tonic-gate 		if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6)) {
44090Sstevel@tonic-gate 			mutex_exit(&ire->ire_lock);
44100Sstevel@tonic-gate 			if (icr->icr_onlink != 0 &&
44110Sstevel@tonic-gate 			    (rand/icr->icr_onlink)*icr->icr_onlink == rand) {
44120Sstevel@tonic-gate 				ire_delete(ire);
44130Sstevel@tonic-gate 				return;
44140Sstevel@tonic-gate 			}
44150Sstevel@tonic-gate 			goto done;
44160Sstevel@tonic-gate 		}
44170Sstevel@tonic-gate 		mutex_exit(&ire->ire_lock);
44180Sstevel@tonic-gate 	} else {
44190Sstevel@tonic-gate 		rand = (uint_t)lbolt +
44203448Sdh155122 		    IRE_ADDR_HASH(ire->ire_addr, ipst->ips_ip_cache_table_size);
44210Sstevel@tonic-gate 		if (ire->ire_gateway_addr == 0) {
44220Sstevel@tonic-gate 			if (icr->icr_onlink != 0 &&
44230Sstevel@tonic-gate 			    (rand/icr->icr_onlink)*icr->icr_onlink == rand) {
44240Sstevel@tonic-gate 				ire_delete(ire);
44250Sstevel@tonic-gate 				return;
44260Sstevel@tonic-gate 			}
44270Sstevel@tonic-gate 			goto done;
44280Sstevel@tonic-gate 		}
44290Sstevel@tonic-gate 	}
44300Sstevel@tonic-gate 	/* Not onlink IRE */
44310Sstevel@tonic-gate 	ASSERT(ire->ire_ipif != NULL);
44320Sstevel@tonic-gate 	if (ire->ire_max_frag < ire->ire_ipif->ipif_mtu) {
44330Sstevel@tonic-gate 		/* Use ptmu fraction */
44340Sstevel@tonic-gate 		if (icr->icr_pmtu != 0 &&
44350Sstevel@tonic-gate 		    (rand/icr->icr_pmtu)*icr->icr_pmtu == rand) {
44360Sstevel@tonic-gate 			ire_delete(ire);
44370Sstevel@tonic-gate 			return;
44380Sstevel@tonic-gate 		}
44390Sstevel@tonic-gate 	} else if (ire->ire_tire_mark != ire->ire_ob_pkt_count +
44400Sstevel@tonic-gate 	    ire->ire_ib_pkt_count) {
44410Sstevel@tonic-gate 		/* Use offlink fraction */
44420Sstevel@tonic-gate 		if (icr->icr_offlink != 0 &&
44430Sstevel@tonic-gate 		    (rand/icr->icr_offlink)*icr->icr_offlink == rand) {
44440Sstevel@tonic-gate 			ire_delete(ire);
44450Sstevel@tonic-gate 			return;
44460Sstevel@tonic-gate 		}
44470Sstevel@tonic-gate 	} else {
44480Sstevel@tonic-gate 		/* Use unused fraction */
44490Sstevel@tonic-gate 		if (icr->icr_unused != 0 &&
44500Sstevel@tonic-gate 		    (rand/icr->icr_unused)*icr->icr_unused == rand) {
44510Sstevel@tonic-gate 			ire_delete(ire);
44520Sstevel@tonic-gate 			return;
44530Sstevel@tonic-gate 		}
44540Sstevel@tonic-gate 	}
44550Sstevel@tonic-gate done:
44560Sstevel@tonic-gate 	/*
44570Sstevel@tonic-gate 	 * Update tire_mark so that those that haven't been used since this
44580Sstevel@tonic-gate 	 * reclaim will be considered unused next time we reclaim.
44590Sstevel@tonic-gate 	 */
44600Sstevel@tonic-gate 	ire->ire_tire_mark = ire->ire_ob_pkt_count + ire->ire_ib_pkt_count;
44610Sstevel@tonic-gate }
44620Sstevel@tonic-gate 
/*
 * Round *value up, in place, to a power of two.
 *
 * The result is the smallest 2^i with 1 <= i <= 31 that is >= *value.
 * Consequences worth knowing:
 *   - the minimum result is 2 (inputs 0 and 1 both become 2);
 *   - every input above 2^30 becomes 2^31, including inputs that are
 *     already larger than 2^31.
 *
 * The shifts are done on an unsigned 32-bit one: shifting a signed int
 * left by 31 is undefined behaviour.
 */
static void
power2_roundup(uint32_t *value)
{
	int i;

	for (i = 1; i < 31; i++) {
		if (*value <= ((uint32_t)1 << i))
			break;
	}
	/* i == 31 here when no power of two up to 2^30 was large enough. */
	*value = (uint32_t)1 << i;
}
44740Sstevel@tonic-gate 
44753448Sdh155122 /* Global init for all zones */
44760Sstevel@tonic-gate void
44773448Sdh155122 ip_ire_g_init()
44780Sstevel@tonic-gate {
44790Sstevel@tonic-gate 	/*
44800Sstevel@tonic-gate 	 * Create ire caches, ire_reclaim()
44810Sstevel@tonic-gate 	 * will give IRE_CACHE back to system when needed.
44820Sstevel@tonic-gate 	 * This needs to be done here before anything else, since
44830Sstevel@tonic-gate 	 * ire_add() expects the cache to be created.
44840Sstevel@tonic-gate 	 */
44850Sstevel@tonic-gate 	ire_cache = kmem_cache_create("ire_cache",
44864714Ssowmini 	    sizeof (ire_t), 0, ip_ire_constructor,
44874714Ssowmini 	    ip_ire_destructor, ip_trash_ire_reclaim, NULL, NULL, 0);
44880Sstevel@tonic-gate 
44893448Sdh155122 	rt_entry_cache = kmem_cache_create("rt_entry",
44903448Sdh155122 	    sizeof (struct rt_entry), 0, NULL, NULL, NULL, NULL, NULL, 0);
44913448Sdh155122 
44923448Sdh155122 	/*
44933448Sdh155122 	 * Have radix code setup kmem caches etc.
44943448Sdh155122 	 */
44953448Sdh155122 	rn_init();
44963448Sdh155122 }
44973448Sdh155122 
44983448Sdh155122 void
44993448Sdh155122 ip_ire_init(ip_stack_t *ipst)
45003448Sdh155122 {
45013448Sdh155122 	int i;
45023448Sdh155122 	uint32_t mem_cnt;
45033448Sdh155122 	uint32_t cpu_cnt;
45043448Sdh155122 	uint32_t min_cnt;
45053448Sdh155122 	pgcnt_t mem_avail;
45063448Sdh155122 
45073448Sdh155122 	/*
45083448Sdh155122 	 * ip_ire_max_bucket_cnt is sized below based on the memory
45093448Sdh155122 	 * size and the cpu speed of the machine. This is upper
45103448Sdh155122 	 * bounded by the compile time value of ip_ire_max_bucket_cnt
45113448Sdh155122 	 * and is lower bounded by the compile time value of
45123448Sdh155122 	 * ip_ire_min_bucket_cnt.  Similar logic applies to
45133448Sdh155122 	 * ip6_ire_max_bucket_cnt.
45143448Sdh155122 	 *
45153448Sdh155122 	 * We calculate this for each IP Instances in order to use
45163448Sdh155122 	 * the kmem_avail and ip_ire_{min,max}_bucket_cnt that are
45173448Sdh155122 	 * in effect when the zone is booted.
45183448Sdh155122 	 */
45193448Sdh155122 	mem_avail = kmem_avail();
45203448Sdh155122 	mem_cnt = (mem_avail >> ip_ire_mem_ratio) /
45213448Sdh155122 	    ip_cache_table_size / sizeof (ire_t);
45223448Sdh155122 	cpu_cnt = CPU->cpu_type_info.pi_clock >> ip_ire_cpu_ratio;
45233448Sdh155122 
45243448Sdh155122 	min_cnt = MIN(cpu_cnt, mem_cnt);
45253448Sdh155122 	if (min_cnt < ip_ire_min_bucket_cnt)
45263448Sdh155122 		min_cnt = ip_ire_min_bucket_cnt;
45273448Sdh155122 	if (ip_ire_max_bucket_cnt > min_cnt) {
45283448Sdh155122 		ip_ire_max_bucket_cnt = min_cnt;
45293448Sdh155122 	}
45303448Sdh155122 
45313448Sdh155122 	mem_cnt = (mem_avail >> ip_ire_mem_ratio) /
45323448Sdh155122 	    ip6_cache_table_size / sizeof (ire_t);
45333448Sdh155122 	min_cnt = MIN(cpu_cnt, mem_cnt);
45343448Sdh155122 	if (min_cnt < ip6_ire_min_bucket_cnt)
45353448Sdh155122 		min_cnt = ip6_ire_min_bucket_cnt;
45363448Sdh155122 	if (ip6_ire_max_bucket_cnt > min_cnt) {
45373448Sdh155122 		ip6_ire_max_bucket_cnt = min_cnt;
45383448Sdh155122 	}
45393448Sdh155122 
45403448Sdh155122 	mutex_init(&ipst->ips_ire_ft_init_lock, NULL, MUTEX_DEFAULT, 0);
45413448Sdh155122 	mutex_init(&ipst->ips_ire_handle_lock, NULL, MUTEX_DEFAULT, NULL);
45423448Sdh155122 
45433448Sdh155122 	(void) rn_inithead((void **)&ipst->ips_ip_ftable, 32);
45443448Sdh155122 
45453448Sdh155122 	/* Calculate the IPv4 cache table size. */
45463448Sdh155122 	ipst->ips_ip_cache_table_size = MAX(ip_cache_table_size,
45473448Sdh155122 	    ((mem_avail >> ip_ire_mem_ratio) / sizeof (ire_t) /
45483448Sdh155122 	    ip_ire_max_bucket_cnt));
45493448Sdh155122 	if (ipst->ips_ip_cache_table_size > ip_max_cache_table_size)
45503448Sdh155122 		ipst->ips_ip_cache_table_size = ip_max_cache_table_size;
45513448Sdh155122 	/*
45523448Sdh155122 	 * Make sure that the table size is always a power of 2.  The
45533448Sdh155122 	 * hash macro IRE_ADDR_HASH() depends on that.
45543448Sdh155122 	 */
45553448Sdh155122 	power2_roundup(&ipst->ips_ip_cache_table_size);
45563448Sdh155122 
45573448Sdh155122 	ipst->ips_ip_cache_table = kmem_zalloc(ipst->ips_ip_cache_table_size *
45583448Sdh155122 	    sizeof (irb_t), KM_SLEEP);
45593448Sdh155122 
45603448Sdh155122 	for (i = 0; i < ipst->ips_ip_cache_table_size; i++) {
45613448Sdh155122 		rw_init(&ipst->ips_ip_cache_table[i].irb_lock, NULL,
45623448Sdh155122 		    RW_DEFAULT, NULL);
45633448Sdh155122 	}
45643448Sdh155122 
45653448Sdh155122 	/* Calculate the IPv6 cache table size. */
45663448Sdh155122 	ipst->ips_ip6_cache_table_size = MAX(ip6_cache_table_size,
45673448Sdh155122 	    ((mem_avail >> ip_ire_mem_ratio) / sizeof (ire_t) /
45683448Sdh155122 	    ip6_ire_max_bucket_cnt));
45693448Sdh155122 	if (ipst->ips_ip6_cache_table_size > ip6_max_cache_table_size)
45703448Sdh155122 		ipst->ips_ip6_cache_table_size = ip6_max_cache_table_size;
45713448Sdh155122 	/*
45723448Sdh155122 	 * Make sure that the table size is always a power of 2.  The
45733448Sdh155122 	 * hash macro IRE_ADDR_HASH_V6() depends on that.
45743448Sdh155122 	 */
45753448Sdh155122 	power2_roundup(&ipst->ips_ip6_cache_table_size);
45763448Sdh155122 
45773448Sdh155122 	ipst->ips_ip_cache_table_v6 = kmem_zalloc(
45783448Sdh155122 	    ipst->ips_ip6_cache_table_size * sizeof (irb_t), KM_SLEEP);
45793448Sdh155122 
45803448Sdh155122 	for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) {
45813448Sdh155122 		rw_init(&ipst->ips_ip_cache_table_v6[i].irb_lock, NULL,
45823448Sdh155122 		    RW_DEFAULT, NULL);
45833448Sdh155122 	}
45843448Sdh155122 
45850Sstevel@tonic-gate 	/*
45860Sstevel@tonic-gate 	 * Make sure that the forwarding table size is a power of 2.
45870Sstevel@tonic-gate 	 * The IRE*_ADDR_HASH() macroes depend on that.
45880Sstevel@tonic-gate 	 */
45893448Sdh155122 	ipst->ips_ip6_ftable_hash_size = ip6_ftable_hash_size;
45903448Sdh155122 	power2_roundup(&ipst->ips_ip6_ftable_hash_size);
45913448Sdh155122 
45923448Sdh155122 	ipst->ips_ire_handle = 1;
45933448Sdh155122 }
45943448Sdh155122 
45953448Sdh155122 void
45963448Sdh155122 ip_ire_g_fini(void)
45973448Sdh155122 {
45983448Sdh155122 	kmem_cache_destroy(ire_cache);
45993448Sdh155122 	kmem_cache_destroy(rt_entry_cache);
46003448Sdh155122 
46013448Sdh155122 	rn_fini();
46020Sstevel@tonic-gate }
46030Sstevel@tonic-gate 
46040Sstevel@tonic-gate void
46053448Sdh155122 ip_ire_fini(ip_stack_t *ipst)
46060Sstevel@tonic-gate {
46070Sstevel@tonic-gate 	int i;
46080Sstevel@tonic-gate 
46093448Sdh155122 	/*
46103448Sdh155122 	 * Delete all IREs - assumes that the ill/ipifs have
46113448Sdh155122 	 * been removed so what remains are just the ftable and IRE_CACHE.
46123448Sdh155122 	 */
46133448Sdh155122 	ire_walk(ire_delete, NULL, ipst);
46143448Sdh155122 
46153448Sdh155122 	rn_freehead(ipst->ips_ip_ftable);
46163448Sdh155122 	ipst->ips_ip_ftable = NULL;
46173448Sdh155122 
46183448Sdh155122 	mutex_destroy(&ipst->ips_ire_ft_init_lock);
46193448Sdh155122 	mutex_destroy(&ipst->ips_ire_handle_lock);
46203448Sdh155122 
46213448Sdh155122 	for (i = 0; i < ipst->ips_ip_cache_table_size; i++) {
46223448Sdh155122 		ASSERT(ipst->ips_ip_cache_table[i].irb_ire == NULL);
46233448Sdh155122 		rw_destroy(&ipst->ips_ip_cache_table[i].irb_lock);
46243448Sdh155122 	}
46253448Sdh155122 	kmem_free(ipst->ips_ip_cache_table,
46263448Sdh155122 	    ipst->ips_ip_cache_table_size * sizeof (irb_t));
46273448Sdh155122 	ipst->ips_ip_cache_table = NULL;
46283448Sdh155122 
46293448Sdh155122 	for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) {
46303448Sdh155122 		ASSERT(ipst->ips_ip_cache_table_v6[i].irb_ire == NULL);
46313448Sdh155122 		rw_destroy(&ipst->ips_ip_cache_table_v6[i].irb_lock);
46323448Sdh155122 	}
46333448Sdh155122 	kmem_free(ipst->ips_ip_cache_table_v6,
46343448Sdh155122 	    ipst->ips_ip6_cache_table_size * sizeof (irb_t));
46353448Sdh155122 	ipst->ips_ip_cache_table_v6 = NULL;
46363448Sdh155122 
46373448Sdh155122 	for (i = 0; i < IP6_MASK_TABLE_SIZE; i++) {
46383448Sdh155122 		irb_t *ptr;
46393448Sdh155122 		int j;
46403448Sdh155122 
46413448Sdh155122 		if ((ptr = ipst->ips_ip_forwarding_table_v6[i]) == NULL)
46423448Sdh155122 			continue;
46433448Sdh155122 
46443448Sdh155122 		for (j = 0; j < ipst->ips_ip6_ftable_hash_size; j++) {
46453448Sdh155122 			ASSERT(ptr[j].irb_ire == NULL);
46463448Sdh155122 			rw_destroy(&ptr[j].irb_lock);
46473448Sdh155122 		}
46483448Sdh155122 		mi_free(ptr);
46493448Sdh155122 		ipst->ips_ip_forwarding_table_v6[i] = NULL;
46503448Sdh155122 	}
46510Sstevel@tonic-gate }
46520Sstevel@tonic-gate 
46530Sstevel@tonic-gate /*
46540Sstevel@tonic-gate  * Check if another multirt route resolution is needed.
46550Sstevel@tonic-gate  * B_TRUE is returned is there remain a resolvable route,
46560Sstevel@tonic-gate  * or if no route for that dst is resolved yet.
46570Sstevel@tonic-gate  * B_FALSE is returned if all routes for that dst are resolved
46580Sstevel@tonic-gate  * or if the remaining unresolved routes are actually not
46590Sstevel@tonic-gate  * resolvable.
46600Sstevel@tonic-gate  * This only works in the global zone.
46610Sstevel@tonic-gate  */
46620Sstevel@tonic-gate boolean_t
46633448Sdh155122 ire_multirt_need_resolve(ipaddr_t dst, const ts_label_t *tsl, ip_stack_t *ipst)
46640Sstevel@tonic-gate {
46650Sstevel@tonic-gate 	ire_t	*first_fire;
46660Sstevel@tonic-gate 	ire_t	*first_cire;
46670Sstevel@tonic-gate 	ire_t	*fire;
46680Sstevel@tonic-gate 	ire_t	*cire;
46690Sstevel@tonic-gate 	irb_t	*firb;
46700Sstevel@tonic-gate 	irb_t	*cirb;
46710Sstevel@tonic-gate 	int	unres_cnt = 0;
46720Sstevel@tonic-gate 	boolean_t resolvable = B_FALSE;
46730Sstevel@tonic-gate 
46740Sstevel@tonic-gate 	/* Retrieve the first IRE_HOST that matches the destination */
46750Sstevel@tonic-gate 	first_fire = ire_ftable_lookup(dst, IP_HOST_MASK, 0, IRE_HOST, NULL,
46761676Sjpk 	    NULL, ALL_ZONES, 0, tsl,
46773448Sdh155122 	    MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_SECATTR, ipst);
46780Sstevel@tonic-gate 
46790Sstevel@tonic-gate 	/* No route at all */
46800Sstevel@tonic-gate 	if (first_fire == NULL) {
46810Sstevel@tonic-gate 		return (B_TRUE);
46820Sstevel@tonic-gate 	}
46830Sstevel@tonic-gate 
46840Sstevel@tonic-gate 	firb = first_fire->ire_bucket;
46850Sstevel@tonic-gate 	ASSERT(firb != NULL);
46860Sstevel@tonic-gate 
46870Sstevel@tonic-gate 	/* Retrieve the first IRE_CACHE ire for that destination. */
46883448Sdh155122 	first_cire = ire_cache_lookup(dst, GLOBAL_ZONEID, tsl, ipst);
46890Sstevel@tonic-gate 
46900Sstevel@tonic-gate 	/* No resolved route. */
46910Sstevel@tonic-gate 	if (first_cire == NULL) {
46920Sstevel@tonic-gate 		ire_refrele(first_fire);
46930Sstevel@tonic-gate 		return (B_TRUE);
46940Sstevel@tonic-gate 	}
46950Sstevel@tonic-gate 
46960Sstevel@tonic-gate 	/*
46970Sstevel@tonic-gate 	 * At least one route is resolved. Here we look through the forward
46980Sstevel@tonic-gate 	 * and cache tables, to compare the number of declared routes
46990Sstevel@tonic-gate 	 * with the number of resolved routes. The search for a resolvable
47000Sstevel@tonic-gate 	 * route is performed only if at least one route remains
47010Sstevel@tonic-gate 	 * unresolved.
47020Sstevel@tonic-gate 	 */
47030Sstevel@tonic-gate 	cirb = first_cire->ire_bucket;
47040Sstevel@tonic-gate 	ASSERT(cirb != NULL);
47050Sstevel@tonic-gate 
47060Sstevel@tonic-gate 	/* Count the number of routes to that dest that are declared. */
47070Sstevel@tonic-gate 	IRB_REFHOLD(firb);
47080Sstevel@tonic-gate 	for (fire = first_fire; fire != NULL; fire = fire->ire_next) {
47090Sstevel@tonic-gate 		if (!(fire->ire_flags & RTF_MULTIRT))
47100Sstevel@tonic-gate 			continue;
47110Sstevel@tonic-gate 		if (fire->ire_addr != dst)
47120Sstevel@tonic-gate 			continue;
47130Sstevel@tonic-gate 		unres_cnt++;
47140Sstevel@tonic-gate 	}
47150Sstevel@tonic-gate 	IRB_REFRELE(firb);
47160Sstevel@tonic-gate 
47170Sstevel@tonic-gate 	/* Then subtract the number of routes to that dst that are resolved */
47180Sstevel@tonic-gate 	IRB_REFHOLD(cirb);
47190Sstevel@tonic-gate 	for (cire = first_cire; cire != NULL; cire = cire->ire_next) {
47200Sstevel@tonic-gate 		if (!(cire->ire_flags & RTF_MULTIRT))
47210Sstevel@tonic-gate 			continue;
47220Sstevel@tonic-gate 		if (cire->ire_addr != dst)
47230Sstevel@tonic-gate 			continue;
47248485SPeter.Memishian@Sun.COM 		if (cire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_TESTHIDDEN))
47250Sstevel@tonic-gate 			continue;
47260Sstevel@tonic-gate 		unres_cnt--;
47270Sstevel@tonic-gate 	}
47280Sstevel@tonic-gate 	IRB_REFRELE(cirb);
47290Sstevel@tonic-gate 
47300Sstevel@tonic-gate 	/* At least one route is unresolved; search for a resolvable route. */
47310Sstevel@tonic-gate 	if (unres_cnt > 0)
47320Sstevel@tonic-gate 		resolvable = ire_multirt_lookup(&first_cire, &first_fire,
47333448Sdh155122 		    MULTIRT_USESTAMP | MULTIRT_CACHEGW, tsl, ipst);
47340Sstevel@tonic-gate 
47350Sstevel@tonic-gate 	if (first_fire != NULL)
47360Sstevel@tonic-gate 		ire_refrele(first_fire);
47370Sstevel@tonic-gate 
47380Sstevel@tonic-gate 	if (first_cire != NULL)
47390Sstevel@tonic-gate 		ire_refrele(first_cire);
47400Sstevel@tonic-gate 
47410Sstevel@tonic-gate 	return (resolvable);
47420Sstevel@tonic-gate }
47430Sstevel@tonic-gate 
47440Sstevel@tonic-gate /*
47450Sstevel@tonic-gate  * Explore a forward_table bucket, starting from fire_arg.
47460Sstevel@tonic-gate  * fire_arg MUST be an IRE_HOST entry.
47470Sstevel@tonic-gate  *
47480Sstevel@tonic-gate  * Return B_TRUE and update *ire_arg and *fire_arg
47490Sstevel@tonic-gate  * if at least one resolvable route is found. *ire_arg
47500Sstevel@tonic-gate  * is the IRE entry for *fire_arg's gateway.
47510Sstevel@tonic-gate  *
47520Sstevel@tonic-gate  * Return B_FALSE otherwise (all routes are resolved or
47530Sstevel@tonic-gate  * the remaining unresolved routes are all unresolvable).
47540Sstevel@tonic-gate  *
47550Sstevel@tonic-gate  * The IRE selection relies on a priority mechanism
47560Sstevel@tonic-gate  * driven by the flags passed in by the caller.
47570Sstevel@tonic-gate  * The caller, such as ip_newroute_ipif(), can get the most
47580Sstevel@tonic-gate  * relevant ire at each stage of a multiple route resolution.
47590Sstevel@tonic-gate  *
47600Sstevel@tonic-gate  * The rules are:
47610Sstevel@tonic-gate  *
47620Sstevel@tonic-gate  * - if MULTIRT_CACHEGW is specified in flags, IRE_CACHETABLE
47630Sstevel@tonic-gate  *   ires are preferred for the gateway. This gives the highest
47640Sstevel@tonic-gate  *   priority to routes that can be resolved without using
47650Sstevel@tonic-gate  *   a resolver.
47660Sstevel@tonic-gate  *
47670Sstevel@tonic-gate  * - if MULTIRT_CACHEGW is not specified, or if MULTIRT_CACHEGW
47680Sstevel@tonic-gate  *   is specified but no IRE_CACHETABLE ire entry for the gateway
47690Sstevel@tonic-gate  *   is found, the following rules apply.
47700Sstevel@tonic-gate  *
47710Sstevel@tonic-gate  * - if MULTIRT_USESTAMP is specified in flags, IRE_INTERFACE
47720Sstevel@tonic-gate  *   ires for the gateway, that have not been tried since
47730Sstevel@tonic-gate  *   a configurable amount of time, are preferred.
47740Sstevel@tonic-gate  *   This applies when a resolver must be invoked for
47750Sstevel@tonic-gate  *   a missing route, but we don't want to use the resolver
47760Sstevel@tonic-gate  *   upon each packet emission. If no such resolver is found,
47770Sstevel@tonic-gate  *   B_FALSE is returned.
47780Sstevel@tonic-gate  *   The MULTIRT_USESTAMP flag can be combined with
47790Sstevel@tonic-gate  *   MULTIRT_CACHEGW.
47800Sstevel@tonic-gate  *
47810Sstevel@tonic-gate  * - if MULTIRT_USESTAMP is not specified in flags, the first
47820Sstevel@tonic-gate  *   unresolved but resolvable route is selected.
47830Sstevel@tonic-gate  *
47840Sstevel@tonic-gate  * - Otherwise, there is no resolvable route, and
47850Sstevel@tonic-gate  *   B_FALSE is returned.
47860Sstevel@tonic-gate  *
47870Sstevel@tonic-gate  * At last, MULTIRT_SETSTAMP can be specified in flags to
47880Sstevel@tonic-gate  * request the timestamp of unresolvable routes to
47890Sstevel@tonic-gate  * be refreshed. This prevents the useless exploration
47900Sstevel@tonic-gate  * of those routes for a while, when MULTIRT_USESTAMP is used.
47910Sstevel@tonic-gate  *
47920Sstevel@tonic-gate  * This only works in the global zone.
47930Sstevel@tonic-gate  */
47940Sstevel@tonic-gate boolean_t
47951676Sjpk ire_multirt_lookup(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags,
47963448Sdh155122     const ts_label_t *tsl, ip_stack_t *ipst)
47970Sstevel@tonic-gate {
47980Sstevel@tonic-gate 	clock_t	delta;
47990Sstevel@tonic-gate 	ire_t	*best_fire = NULL;
48000Sstevel@tonic-gate 	ire_t	*best_cire = NULL;
48010Sstevel@tonic-gate 	ire_t	*first_fire;
48020Sstevel@tonic-gate 	ire_t	*first_cire;
48030Sstevel@tonic-gate 	ire_t	*fire;
48040Sstevel@tonic-gate 	ire_t	*cire;
48050Sstevel@tonic-gate 	irb_t	*firb = NULL;
48060Sstevel@tonic-gate 	irb_t	*cirb = NULL;
48070Sstevel@tonic-gate 	ire_t	*gw_ire;
48080Sstevel@tonic-gate 	boolean_t	already_resolved;
48090Sstevel@tonic-gate 	boolean_t	res;
48100Sstevel@tonic-gate 	ipaddr_t	dst;
48110Sstevel@tonic-gate 	ipaddr_t	gw;
48120Sstevel@tonic-gate 
48130Sstevel@tonic-gate 	ip2dbg(("ire_multirt_lookup: *ire_arg %p, *fire_arg %p, flags %04x\n",
48140Sstevel@tonic-gate 	    (void *)*ire_arg, (void *)*fire_arg, flags));
48150Sstevel@tonic-gate 
48160Sstevel@tonic-gate 	ASSERT(ire_arg != NULL);
48170Sstevel@tonic-gate 	ASSERT(fire_arg != NULL);
48180Sstevel@tonic-gate 
48190Sstevel@tonic-gate 	/* Not an IRE_HOST ire; give up. */
48200Sstevel@tonic-gate 	if ((*fire_arg == NULL) || ((*fire_arg)->ire_type != IRE_HOST)) {
48210Sstevel@tonic-gate 		return (B_FALSE);
48220Sstevel@tonic-gate 	}
48230Sstevel@tonic-gate 
48240Sstevel@tonic-gate 	/* This is the first IRE_HOST ire for that destination. */
48250Sstevel@tonic-gate 	first_fire = *fire_arg;
48260Sstevel@tonic-gate 	firb = first_fire->ire_bucket;
48270Sstevel@tonic-gate 	ASSERT(firb != NULL);
48280Sstevel@tonic-gate 
48290Sstevel@tonic-gate 	dst = first_fire->ire_addr;
48300Sstevel@tonic-gate 
48310Sstevel@tonic-gate 	ip2dbg(("ire_multirt_lookup: dst %08x\n", ntohl(dst)));
48320Sstevel@tonic-gate 
48330Sstevel@tonic-gate 	/*
48340Sstevel@tonic-gate 	 * Retrieve the first IRE_CACHE ire for that destination;
48350Sstevel@tonic-gate 	 * if we don't find one, no route for that dest is
48360Sstevel@tonic-gate 	 * resolved yet.
48370Sstevel@tonic-gate 	 */
48383448Sdh155122 	first_cire = ire_cache_lookup(dst, GLOBAL_ZONEID, tsl, ipst);
48390Sstevel@tonic-gate 	if (first_cire != NULL) {
48400Sstevel@tonic-gate 		cirb = first_cire->ire_bucket;
48410Sstevel@tonic-gate 	}
48420Sstevel@tonic-gate 
48430Sstevel@tonic-gate 	ip2dbg(("ire_multirt_lookup: first_cire %p\n", (void *)first_cire));
48440Sstevel@tonic-gate 
48450Sstevel@tonic-gate 	/*
48460Sstevel@tonic-gate 	 * Search for a resolvable route, giving the top priority
48470Sstevel@tonic-gate 	 * to routes that can be resolved without any call to the resolver.
48480Sstevel@tonic-gate 	 */
48490Sstevel@tonic-gate 	IRB_REFHOLD(firb);
48500Sstevel@tonic-gate 
48510Sstevel@tonic-gate 	if (!CLASSD(dst)) {
48520Sstevel@tonic-gate 		/*
48530Sstevel@tonic-gate 		 * For all multiroute IRE_HOST ires for that destination,
48540Sstevel@tonic-gate 		 * check if the route via the IRE_HOST's gateway is
48550Sstevel@tonic-gate 		 * resolved yet.
48560Sstevel@tonic-gate 		 */
48570Sstevel@tonic-gate 		for (fire = first_fire; fire != NULL; fire = fire->ire_next) {
48580Sstevel@tonic-gate 
48590Sstevel@tonic-gate 			if (!(fire->ire_flags & RTF_MULTIRT))
48600Sstevel@tonic-gate 				continue;
48610Sstevel@tonic-gate 			if (fire->ire_addr != dst)
48620Sstevel@tonic-gate 				continue;
48630Sstevel@tonic-gate 
48641676Sjpk 			if (fire->ire_gw_secattr != NULL &&
48651676Sjpk 			    tsol_ire_match_gwattr(fire, tsl) != 0) {
48661676Sjpk 				continue;
48671676Sjpk 			}
48681676Sjpk 
48690Sstevel@tonic-gate 			gw = fire->ire_gateway_addr;
48700Sstevel@tonic-gate 
48710Sstevel@tonic-gate 			ip2dbg(("ire_multirt_lookup: fire %p, "
48720Sstevel@tonic-gate 			    "ire_addr %08x, ire_gateway_addr %08x\n",
48730Sstevel@tonic-gate 			    (void *)fire, ntohl(fire->ire_addr), ntohl(gw)));
48740Sstevel@tonic-gate 
48750Sstevel@tonic-gate 			already_resolved = B_FALSE;
48760Sstevel@tonic-gate 
48770Sstevel@tonic-gate 			if (first_cire != NULL) {
48780Sstevel@tonic-gate 				ASSERT(cirb != NULL);
48790Sstevel@tonic-gate 
48800Sstevel@tonic-gate 				IRB_REFHOLD(cirb);
48810Sstevel@tonic-gate 				/*
48820Sstevel@tonic-gate 				 * For all IRE_CACHE ires for that
48830Sstevel@tonic-gate 				 * destination.
48840Sstevel@tonic-gate 				 */
48850Sstevel@tonic-gate 				for (cire = first_cire;
48860Sstevel@tonic-gate 				    cire != NULL;
48870Sstevel@tonic-gate 				    cire = cire->ire_next) {
48880Sstevel@tonic-gate 
48890Sstevel@tonic-gate 					if (!(cire->ire_flags & RTF_MULTIRT))
48900Sstevel@tonic-gate 						continue;
48910Sstevel@tonic-gate 					if (cire->ire_addr != dst)
48920Sstevel@tonic-gate 						continue;
48930Sstevel@tonic-gate 					if (cire->ire_marks &
48940Sstevel@tonic-gate 					    (IRE_MARK_CONDEMNED |
48958485SPeter.Memishian@Sun.COM 					    IRE_MARK_TESTHIDDEN))
48960Sstevel@tonic-gate 						continue;
48971676Sjpk 
48981676Sjpk 					if (cire->ire_gw_secattr != NULL &&
48991676Sjpk 					    tsol_ire_match_gwattr(cire,
49001676Sjpk 					    tsl) != 0) {
49011676Sjpk 						continue;
49021676Sjpk 					}
49031676Sjpk 
49040Sstevel@tonic-gate 					/*
49050Sstevel@tonic-gate 					 * Check if the IRE_CACHE's gateway
49060Sstevel@tonic-gate 					 * matches the IRE_HOST's gateway.
49070Sstevel@tonic-gate 					 */
49080Sstevel@tonic-gate 					if (cire->ire_gateway_addr == gw) {
49090Sstevel@tonic-gate 						already_resolved = B_TRUE;
49100Sstevel@tonic-gate 						break;
49110Sstevel@tonic-gate 					}
49120Sstevel@tonic-gate 				}
49130Sstevel@tonic-gate 				IRB_REFRELE(cirb);
49140Sstevel@tonic-gate 			}
49150Sstevel@tonic-gate 
49160Sstevel@tonic-gate 			/*
49170Sstevel@tonic-gate 			 * This route is already resolved;
49180Sstevel@tonic-gate 			 * proceed with next one.
49190Sstevel@tonic-gate 			 */
49200Sstevel@tonic-gate 			if (already_resolved) {
49210Sstevel@tonic-gate 				ip2dbg(("ire_multirt_lookup: found cire %p, "
49220Sstevel@tonic-gate 				    "already resolved\n", (void *)cire));
49230Sstevel@tonic-gate 				continue;
49240Sstevel@tonic-gate 			}
49250Sstevel@tonic-gate 
49260Sstevel@tonic-gate 			/*
49270Sstevel@tonic-gate 			 * The route is unresolved; is it actually
49280Sstevel@tonic-gate 			 * resolvable, i.e. is there a cache or a resolver
49290Sstevel@tonic-gate 			 * for the gateway?
49300Sstevel@tonic-gate 			 */
49310Sstevel@tonic-gate 			gw_ire = ire_route_lookup(gw, 0, 0, 0, NULL, NULL,
49321676Sjpk 			    ALL_ZONES, tsl,
49333448Sdh155122 			    MATCH_IRE_RECURSIVE | MATCH_IRE_SECATTR, ipst);
49340Sstevel@tonic-gate 
49350Sstevel@tonic-gate 			ip2dbg(("ire_multirt_lookup: looked up gw_ire %p\n",
49360Sstevel@tonic-gate 			    (void *)gw_ire));
49370Sstevel@tonic-gate 
49380Sstevel@tonic-gate 			/*
49390Sstevel@tonic-gate 			 * If gw_ire is typed IRE_CACHETABLE,
49400Sstevel@tonic-gate 			 * this route can be resolved without any call to the
49410Sstevel@tonic-gate 			 * resolver. If the MULTIRT_CACHEGW flag is set,
49420Sstevel@tonic-gate 			 * give the top priority to this ire and exit the
49430Sstevel@tonic-gate 			 * loop.
49440Sstevel@tonic-gate 			 * This is typically the case when an ARP reply
49450Sstevel@tonic-gate 			 * is processed through ip_wput_nondata().
49460Sstevel@tonic-gate 			 */
49470Sstevel@tonic-gate 			if ((flags & MULTIRT_CACHEGW) &&
49480Sstevel@tonic-gate 			    (gw_ire != NULL) &&
49490Sstevel@tonic-gate 			    (gw_ire->ire_type & IRE_CACHETABLE)) {
49502535Ssangeeta 				ASSERT(gw_ire->ire_nce == NULL ||
49512535Ssangeeta 				    gw_ire->ire_nce->nce_state == ND_REACHABLE);
49520Sstevel@tonic-gate 				/*
49530Sstevel@tonic-gate 				 * Release the resolver associated to the
49540Sstevel@tonic-gate 				 * previous candidate best ire, if any.
49550Sstevel@tonic-gate 				 */
49560Sstevel@tonic-gate 				if (best_cire != NULL) {
49570Sstevel@tonic-gate 					ire_refrele(best_cire);
49580Sstevel@tonic-gate 					ASSERT(best_fire != NULL);
49590Sstevel@tonic-gate 				}
49600Sstevel@tonic-gate 
49610Sstevel@tonic-gate 				best_fire = fire;
49620Sstevel@tonic-gate 				best_cire = gw_ire;
49630Sstevel@tonic-gate 
49640Sstevel@tonic-gate 				ip2dbg(("ire_multirt_lookup: found top prio "
49650Sstevel@tonic-gate 				    "best_fire %p, best_cire %p\n",
49660Sstevel@tonic-gate 				    (void *)best_fire, (void *)best_cire));
49670Sstevel@tonic-gate 				break;
49680Sstevel@tonic-gate 			}
49690Sstevel@tonic-gate 
49700Sstevel@tonic-gate 			/*
49710Sstevel@tonic-gate 			 * Compute the time elapsed since our preceding
49720Sstevel@tonic-gate 			 * attempt to  resolve that route.
49730Sstevel@tonic-gate 			 * If the MULTIRT_USESTAMP flag is set, we take that
49740Sstevel@tonic-gate 			 * route into account only if this time interval
49750Sstevel@tonic-gate 			 * exceeds ip_multirt_resolution_interval;
49760Sstevel@tonic-gate 			 * this prevents us from attempting to resolve a
49770Sstevel@tonic-gate 			 * broken route upon each sending of a packet.
49780Sstevel@tonic-gate 			 */
49790Sstevel@tonic-gate 			delta = lbolt - fire->ire_last_used_time;
49800Sstevel@tonic-gate 			delta = TICK_TO_MSEC(delta);
49810Sstevel@tonic-gate 
49823448Sdh155122 			res = (boolean_t)((delta >
49834714Ssowmini 			    ipst->ips_ip_multirt_resolution_interval) ||
49844714Ssowmini 			    (!(flags & MULTIRT_USESTAMP)));
49850Sstevel@tonic-gate 
49860Sstevel@tonic-gate 			ip2dbg(("ire_multirt_lookup: fire %p, delta %lu, "
49870Sstevel@tonic-gate 			    "res %d\n",
49880Sstevel@tonic-gate 			    (void *)fire, delta, res));
49890Sstevel@tonic-gate 
49900Sstevel@tonic-gate 			if (res) {
49910Sstevel@tonic-gate 				/*
49920Sstevel@tonic-gate 				 * We are here if MULTIRT_USESTAMP flag is set
49930Sstevel@tonic-gate 				 * and the resolver for fire's gateway
49940Sstevel@tonic-gate 				 * has not been tried since
49950Sstevel@tonic-gate 				 * ip_multirt_resolution_interval, or if
49960Sstevel@tonic-gate 				 * MULTIRT_USESTAMP is not set but gw_ire did
49970Sstevel@tonic-gate 				 * not fill the conditions for MULTIRT_CACHEGW,
49980Sstevel@tonic-gate 				 * or if neither MULTIRT_USESTAMP nor
49990Sstevel@tonic-gate 				 * MULTIRT_CACHEGW are set.
50000Sstevel@tonic-gate 				 */
50010Sstevel@tonic-gate 				if (gw_ire != NULL) {
50020Sstevel@tonic-gate 					if (best_fire == NULL) {
50030Sstevel@tonic-gate 						ASSERT(best_cire == NULL);
50040Sstevel@tonic-gate 
50050Sstevel@tonic-gate 						best_fire = fire;
50060Sstevel@tonic-gate 						best_cire = gw_ire;
50070Sstevel@tonic-gate 
50080Sstevel@tonic-gate 						ip2dbg(("ire_multirt_lookup:"
50090Sstevel@tonic-gate 						    "found candidate "
50100Sstevel@tonic-gate 						    "best_fire %p, "
50110Sstevel@tonic-gate 						    "best_cire %p\n",
50120Sstevel@tonic-gate 						    (void *)best_fire,
50130Sstevel@tonic-gate 						    (void *)best_cire));
50140Sstevel@tonic-gate 
50150Sstevel@tonic-gate 						/*
50160Sstevel@tonic-gate 						 * If MULTIRT_CACHEGW is not
50170Sstevel@tonic-gate 						 * set, we ignore the top
50180Sstevel@tonic-gate 						 * priority ires that can
50190Sstevel@tonic-gate 						 * be resolved without any
50200Sstevel@tonic-gate 						 * call to the resolver;
50210Sstevel@tonic-gate 						 * In that case, there is
50220Sstevel@tonic-gate 						 * actually no need
50230Sstevel@tonic-gate 						 * to continue the loop.
50240Sstevel@tonic-gate 						 */
50250Sstevel@tonic-gate 						if (!(flags &
50260Sstevel@tonic-gate 						    MULTIRT_CACHEGW)) {
50270Sstevel@tonic-gate 							break;
50280Sstevel@tonic-gate 						}
50290Sstevel@tonic-gate 						continue;
50300Sstevel@tonic-gate 					}
50310Sstevel@tonic-gate 				} else {
50320Sstevel@tonic-gate 					/*
50330Sstevel@tonic-gate 					 * No resolver for the gateway: the
50340Sstevel@tonic-gate 					 * route is not resolvable.
50350Sstevel@tonic-gate 					 * If the MULTIRT_SETSTAMP flag is
50360Sstevel@tonic-gate 					 * set, we stamp the IRE_HOST ire,
50370Sstevel@tonic-gate 					 * so we will not select it again
50380Sstevel@tonic-gate 					 * during this resolution interval.
50390Sstevel@tonic-gate 					 */
50400Sstevel@tonic-gate 					if (flags & MULTIRT_SETSTAMP)
50410Sstevel@tonic-gate 						fire->ire_last_used_time =
50420Sstevel@tonic-gate 						    lbolt;
50430Sstevel@tonic-gate 				}
50440Sstevel@tonic-gate 			}
50450Sstevel@tonic-gate 
50460Sstevel@tonic-gate 			if (gw_ire != NULL)
50470Sstevel@tonic-gate 				ire_refrele(gw_ire);
50480Sstevel@tonic-gate 		}
50490Sstevel@tonic-gate 	} else { /* CLASSD(dst) */
50500Sstevel@tonic-gate 
50510Sstevel@tonic-gate 		for (fire = first_fire;
50520Sstevel@tonic-gate 		    fire != NULL;
50530Sstevel@tonic-gate 		    fire = fire->ire_next) {
50540Sstevel@tonic-gate 
50550Sstevel@tonic-gate 			if (!(fire->ire_flags & RTF_MULTIRT))
50560Sstevel@tonic-gate 				continue;
50570Sstevel@tonic-gate 			if (fire->ire_addr != dst)
50580Sstevel@tonic-gate 				continue;
50590Sstevel@tonic-gate 
50601676Sjpk 			if (fire->ire_gw_secattr != NULL &&
50611676Sjpk 			    tsol_ire_match_gwattr(fire, tsl) != 0) {
50621676Sjpk 				continue;
50631676Sjpk 			}
50641676Sjpk 
50650Sstevel@tonic-gate 			already_resolved = B_FALSE;
50660Sstevel@tonic-gate 
50670Sstevel@tonic-gate 			gw = fire->ire_gateway_addr;
50680Sstevel@tonic-gate 
50690Sstevel@tonic-gate 			gw_ire = ire_ftable_lookup(gw, 0, 0, IRE_INTERFACE,
50701676Sjpk 			    NULL, NULL, ALL_ZONES, 0, tsl,
50711676Sjpk 			    MATCH_IRE_RECURSIVE | MATCH_IRE_TYPE |
50723448Sdh155122 			    MATCH_IRE_SECATTR, ipst);
50730Sstevel@tonic-gate 
50740Sstevel@tonic-gate 			/* No resolver for the gateway; we skip this ire. */
50750Sstevel@tonic-gate 			if (gw_ire == NULL) {
50760Sstevel@tonic-gate 				continue;
50770Sstevel@tonic-gate 			}
50782535Ssangeeta 			ASSERT(gw_ire->ire_nce == NULL ||
50792535Ssangeeta 			    gw_ire->ire_nce->nce_state == ND_REACHABLE);
50800Sstevel@tonic-gate 
50810Sstevel@tonic-gate 			if (first_cire != NULL) {
50820Sstevel@tonic-gate 
50830Sstevel@tonic-gate 				IRB_REFHOLD(cirb);
50840Sstevel@tonic-gate 				/*
50850Sstevel@tonic-gate 				 * For all IRE_CACHE ires for that
50860Sstevel@tonic-gate 				 * destination.
50870Sstevel@tonic-gate 				 */
50880Sstevel@tonic-gate 				for (cire = first_cire;
50890Sstevel@tonic-gate 				    cire != NULL;
50900Sstevel@tonic-gate 				    cire = cire->ire_next) {
50910Sstevel@tonic-gate 
50920Sstevel@tonic-gate 					if (!(cire->ire_flags & RTF_MULTIRT))
50930Sstevel@tonic-gate 						continue;
50940Sstevel@tonic-gate 					if (cire->ire_addr != dst)
50950Sstevel@tonic-gate 						continue;
50960Sstevel@tonic-gate 					if (cire->ire_marks &
50970Sstevel@tonic-gate 					    (IRE_MARK_CONDEMNED |
50988485SPeter.Memishian@Sun.COM 					    IRE_MARK_TESTHIDDEN))
50990Sstevel@tonic-gate 						continue;
51000Sstevel@tonic-gate 
51011676Sjpk 					if (cire->ire_gw_secattr != NULL &&
51021676Sjpk 					    tsol_ire_match_gwattr(cire,
51031676Sjpk 					    tsl) != 0) {
51041676Sjpk 						continue;
51051676Sjpk 					}
51061676Sjpk 
51070Sstevel@tonic-gate 					/*
51080Sstevel@tonic-gate 					 * Cache entries are linked to the
51090Sstevel@tonic-gate 					 * parent routes using the parent handle
51100Sstevel@tonic-gate 					 * (ire_phandle). If no cache entry has
51110Sstevel@tonic-gate 					 * the same handle as fire, fire is
51120Sstevel@tonic-gate 					 * still unresolved.
51130Sstevel@tonic-gate 					 */
51140Sstevel@tonic-gate 					ASSERT(cire->ire_phandle != 0);
51150Sstevel@tonic-gate 					if (cire->ire_phandle ==
51160Sstevel@tonic-gate 					    fire->ire_phandle) {
51170Sstevel@tonic-gate 						already_resolved = B_TRUE;
51180Sstevel@tonic-gate 						break;
51190Sstevel@tonic-gate 					}
51200Sstevel@tonic-gate 				}
51210Sstevel@tonic-gate 				IRB_REFRELE(cirb);
51220Sstevel@tonic-gate 			}
51230Sstevel@tonic-gate 
51240Sstevel@tonic-gate 			/*
51250Sstevel@tonic-gate 			 * This route is already resolved; proceed with
51260Sstevel@tonic-gate 			 * next one.
51270Sstevel@tonic-gate 			 */
51280Sstevel@tonic-gate 			if (already_resolved) {
51290Sstevel@tonic-gate 				ire_refrele(gw_ire);
51300Sstevel@tonic-gate 				continue;
51310Sstevel@tonic-gate 			}
51320Sstevel@tonic-gate 
51330Sstevel@tonic-gate 			/*
51340Sstevel@tonic-gate 			 * Compute the time elapsed since our preceding
51350Sstevel@tonic-gate 			 * attempt to resolve that route.
51360Sstevel@tonic-gate 			 * If the MULTIRT_USESTAMP flag is set, we take
51370Sstevel@tonic-gate 			 * that route into account only if this time
51380Sstevel@tonic-gate 			 * interval exceeds ip_multirt_resolution_interval;
51390Sstevel@tonic-gate 			 * this prevents us from attempting to resolve a
51400Sstevel@tonic-gate 			 * broken route upon each sending of a packet.
51410Sstevel@tonic-gate 			 */
51420Sstevel@tonic-gate 			delta = lbolt - fire->ire_last_used_time;
51430Sstevel@tonic-gate 			delta = TICK_TO_MSEC(delta);
51440Sstevel@tonic-gate 
51453448Sdh155122 			res = (boolean_t)((delta >
51464714Ssowmini 			    ipst->ips_ip_multirt_resolution_interval) ||
51474714Ssowmini 			    (!(flags & MULTIRT_USESTAMP)));
51480Sstevel@tonic-gate 
51490Sstevel@tonic-gate 			ip3dbg(("ire_multirt_lookup: fire %p, delta %lx, "
51500Sstevel@tonic-gate 			    "flags %04x, res %d\n",
51510Sstevel@tonic-gate 			    (void *)fire, delta, flags, res));
51520Sstevel@tonic-gate 
51530Sstevel@tonic-gate 			if (res) {
51540Sstevel@tonic-gate 				if (best_cire != NULL) {
51550Sstevel@tonic-gate 					/*
51560Sstevel@tonic-gate 					 * Release the resolver associated
51570Sstevel@tonic-gate 					 * to the preceding candidate best
51580Sstevel@tonic-gate 					 * ire, if any.
51590Sstevel@tonic-gate 					 */
51600Sstevel@tonic-gate 					ire_refrele(best_cire);
51610Sstevel@tonic-gate 					ASSERT(best_fire != NULL);
51620Sstevel@tonic-gate 				}
51630Sstevel@tonic-gate 				best_fire = fire;
51640Sstevel@tonic-gate 				best_cire = gw_ire;
51650Sstevel@tonic-gate 				continue;
51660Sstevel@tonic-gate 			}
51670Sstevel@tonic-gate 
51680Sstevel@tonic-gate 			ire_refrele(gw_ire);
51690Sstevel@tonic-gate 		}
51700Sstevel@tonic-gate 	}
51710Sstevel@tonic-gate 
51720Sstevel@tonic-gate 	if (best_fire != NULL) {
51730Sstevel@tonic-gate 		IRE_REFHOLD(best_fire);
51740Sstevel@tonic-gate 	}
51750Sstevel@tonic-gate 	IRB_REFRELE(firb);
51760Sstevel@tonic-gate 
51770Sstevel@tonic-gate 	/* Release the first IRE_CACHE we initially looked up, if any. */
51780Sstevel@tonic-gate 	if (first_cire != NULL)
51790Sstevel@tonic-gate 		ire_refrele(first_cire);
51800Sstevel@tonic-gate 
51810Sstevel@tonic-gate 	/* Found a resolvable route. */
51820Sstevel@tonic-gate 	if (best_fire != NULL) {
51830Sstevel@tonic-gate 		ASSERT(best_cire != NULL);
51840Sstevel@tonic-gate 
51850Sstevel@tonic-gate 		if (*fire_arg != NULL)
51860Sstevel@tonic-gate 			ire_refrele(*fire_arg);
51870Sstevel@tonic-gate 		if (*ire_arg != NULL)
51880Sstevel@tonic-gate 			ire_refrele(*ire_arg);
51890Sstevel@tonic-gate 
51900Sstevel@tonic-gate 		/*
51910Sstevel@tonic-gate 		 * Update the passed-in arguments with the
51920Sstevel@tonic-gate 		 * resolvable multirt route we found.
51930Sstevel@tonic-gate 		 */
51940Sstevel@tonic-gate 		*fire_arg = best_fire;
51950Sstevel@tonic-gate 		*ire_arg = best_cire;
51960Sstevel@tonic-gate 
51970Sstevel@tonic-gate 		ip2dbg(("ire_multirt_lookup: returning B_TRUE, "
51980Sstevel@tonic-gate 		    "*fire_arg %p, *ire_arg %p\n",
51990Sstevel@tonic-gate 		    (void *)best_fire, (void *)best_cire));
52000Sstevel@tonic-gate 
52010Sstevel@tonic-gate 		return (B_TRUE);
52020Sstevel@tonic-gate 	}
52030Sstevel@tonic-gate 
52040Sstevel@tonic-gate 	ASSERT(best_cire == NULL);
52050Sstevel@tonic-gate 
52060Sstevel@tonic-gate 	ip2dbg(("ire_multirt_lookup: returning B_FALSE, *fire_arg %p, "
52070Sstevel@tonic-gate 	    "*ire_arg %p\n",
52080Sstevel@tonic-gate 	    (void *)*fire_arg, (void *)*ire_arg));
52090Sstevel@tonic-gate 
52100Sstevel@tonic-gate 	/* No resolvable route. */
52110Sstevel@tonic-gate 	return (B_FALSE);
52120Sstevel@tonic-gate }
52130Sstevel@tonic-gate 
52140Sstevel@tonic-gate /*
52150Sstevel@tonic-gate  * IRE iterator for inbound and loopback broadcast processing.
52160Sstevel@tonic-gate  * Given an IRE_BROADCAST ire, walk the ires with the same destination
52170Sstevel@tonic-gate  * address, but skip over the passed-in ire. Returns the next ire without
52180Sstevel@tonic-gate  * a hold - assumes that the caller holds a reference on the IRE bucket.
52190Sstevel@tonic-gate  */
52200Sstevel@tonic-gate ire_t *
52210Sstevel@tonic-gate ire_get_next_bcast_ire(ire_t *curr, ire_t *ire)
52220Sstevel@tonic-gate {
52230Sstevel@tonic-gate 	ill_t *ill;
52240Sstevel@tonic-gate 
52250Sstevel@tonic-gate 	if (curr == NULL) {
52260Sstevel@tonic-gate 		for (curr = ire->ire_bucket->irb_ire; curr != NULL;
52270Sstevel@tonic-gate 		    curr = curr->ire_next) {
52280Sstevel@tonic-gate 			if (curr->ire_addr == ire->ire_addr)
52290Sstevel@tonic-gate 				break;
52300Sstevel@tonic-gate 		}
52310Sstevel@tonic-gate 	} else {
52320Sstevel@tonic-gate 		curr = curr->ire_next;
52330Sstevel@tonic-gate 	}
52340Sstevel@tonic-gate 	ill = ire_to_ill(ire);
52350Sstevel@tonic-gate 	for (; curr != NULL; curr = curr->ire_next) {
52360Sstevel@tonic-gate 		if (curr->ire_addr != ire->ire_addr) {
52370Sstevel@tonic-gate 			/*
52380Sstevel@tonic-gate 			 * All the IREs to a given destination are contiguous;
52390Sstevel@tonic-gate 			 * break out once the address doesn't match.
52400Sstevel@tonic-gate 			 */
52410Sstevel@tonic-gate 			break;
52420Sstevel@tonic-gate 		}
52430Sstevel@tonic-gate 		if (curr == ire) {
52440Sstevel@tonic-gate 			/* skip over the passed-in ire */
52450Sstevel@tonic-gate 			continue;
52460Sstevel@tonic-gate 		}
52470Sstevel@tonic-gate 		if ((curr->ire_stq != NULL && ire->ire_stq == NULL) ||
52480Sstevel@tonic-gate 		    (curr->ire_stq == NULL && ire->ire_stq != NULL)) {
52490Sstevel@tonic-gate 			/*
52500Sstevel@tonic-gate 			 * If the passed-in ire is loopback, skip over
52510Sstevel@tonic-gate 			 * non-loopback ires and vice versa.
52520Sstevel@tonic-gate 			 */
52530Sstevel@tonic-gate 			continue;
52540Sstevel@tonic-gate 		}
52550Sstevel@tonic-gate 		if (ire_to_ill(curr) != ill) {
52560Sstevel@tonic-gate 			/* skip over IREs going through a different interface */
52570Sstevel@tonic-gate 			continue;
52580Sstevel@tonic-gate 		}
52590Sstevel@tonic-gate 		if (curr->ire_marks & IRE_MARK_CONDEMNED) {
52600Sstevel@tonic-gate 			/* skip over deleted IREs */
52610Sstevel@tonic-gate 			continue;
52620Sstevel@tonic-gate 		}
52630Sstevel@tonic-gate 		return (curr);
52640Sstevel@tonic-gate 	}
52650Sstevel@tonic-gate 	return (NULL);
52660Sstevel@tonic-gate }
52670Sstevel@tonic-gate 
52685023Scarlsonj #ifdef DEBUG
52690Sstevel@tonic-gate void
52700Sstevel@tonic-gate ire_trace_ref(ire_t *ire)
52710Sstevel@tonic-gate {
52720Sstevel@tonic-gate 	mutex_enter(&ire->ire_lock);
52735023Scarlsonj 	if (ire->ire_trace_disable) {
52740Sstevel@tonic-gate 		mutex_exit(&ire->ire_lock);
52750Sstevel@tonic-gate 		return;
52760Sstevel@tonic-gate 	}
52775023Scarlsonj 
52785023Scarlsonj 	if (th_trace_ref(ire, ire->ire_ipst)) {
52795023Scarlsonj 		mutex_exit(&ire->ire_lock);
52805023Scarlsonj 	} else {
52815023Scarlsonj 		ire->ire_trace_disable = B_TRUE;
52825023Scarlsonj 		mutex_exit(&ire->ire_lock);
52835023Scarlsonj 		ire_trace_cleanup(ire);
52840Sstevel@tonic-gate 	}
52850Sstevel@tonic-gate }
52860Sstevel@tonic-gate 
52870Sstevel@tonic-gate void
52880Sstevel@tonic-gate ire_untrace_ref(ire_t *ire)
52890Sstevel@tonic-gate {
52900Sstevel@tonic-gate 	mutex_enter(&ire->ire_lock);
52915023Scarlsonj 	if (!ire->ire_trace_disable)
52925023Scarlsonj 		th_trace_unref(ire);
52930Sstevel@tonic-gate 	mutex_exit(&ire->ire_lock);
52940Sstevel@tonic-gate }
52950Sstevel@tonic-gate 
52960Sstevel@tonic-gate static void
52975023Scarlsonj ire_trace_cleanup(const ire_t *ire)
52980Sstevel@tonic-gate {
52995023Scarlsonj 	th_trace_cleanup(ire, ire->ire_trace_disable);
53000Sstevel@tonic-gate }
53015023Scarlsonj #endif /* DEBUG */
53022535Ssangeeta 
53032535Ssangeeta /*
53042535Ssangeeta  * Generate a message chain with an arp request to resolve the in_ire.
53052535Ssangeeta  * It is assumed that in_ire itself is currently in the ire cache table,
53062535Ssangeeta  * so we create a fake_ire filled with enough information about ire_addr etc.
53072535Ssangeeta  * to retrieve in_ire when the DL_UNITDATA response from the resolver
53082535Ssangeeta  * comes back. The fake_ire itself is created by calling esballoc with
53092535Ssangeeta  * the fr_rtnp (free routine) set to ire_freemblk. This routine will be
53102535Ssangeeta  * invoked when the mblk containing fake_ire is freed.
53112535Ssangeeta  */
53122535Ssangeeta void
53138485SPeter.Memishian@Sun.COM ire_arpresolve(ire_t *in_ire)
53142535Ssangeeta {
53152535Ssangeeta 	areq_t		*areq;
53162535Ssangeeta 	ipaddr_t	*addrp;
53174714Ssowmini 	mblk_t 		*ire_mp, *areq_mp;
53182535Ssangeeta 	ire_t 		*ire, *buf;
53192535Ssangeeta 	size_t		bufsize;
53202535Ssangeeta 	frtn_t		*frtnp;
53218485SPeter.Memishian@Sun.COM 	ill_t		*dst_ill;
53228485SPeter.Memishian@Sun.COM 	ip_stack_t	*ipst;
53238485SPeter.Memishian@Sun.COM 
53248485SPeter.Memishian@Sun.COM 	ASSERT(in_ire->ire_nce != NULL);
53258485SPeter.Memishian@Sun.COM 
53268485SPeter.Memishian@Sun.COM 	dst_ill = ire_to_ill(in_ire);
53278485SPeter.Memishian@Sun.COM 	ipst = dst_ill->ill_ipst;
53282535Ssangeeta 
53292535Ssangeeta 	/*
53302535Ssangeeta 	 * Construct message chain for the resolver
53312535Ssangeeta 	 * of the form:
53322535Ssangeeta 	 *	ARP_REQ_MBLK-->IRE_MBLK
53332535Ssangeeta 	 *
53342535Ssangeeta 	 * NOTE : If the response does not
53352535Ssangeeta 	 * come back, ARP frees the packet. For this reason,
53362535Ssangeeta 	 * we can't REFHOLD the bucket of save_ire to prevent
53372535Ssangeeta 	 * deletions. We may not be able to REFRELE the bucket
53382535Ssangeeta 	 * if the response never comes back. Thus, before
53392535Ssangeeta 	 * adding the ire, ire_add_v4 will make sure that the
53402535Ssangeeta 	 * interface route does not get deleted. This is the
53412535Ssangeeta 	 * only case unlike ip_newroute_v6, ip_newroute_ipif_v6
53422535Ssangeeta 	 * where we can always prevent deletions because of
53432535Ssangeeta 	 * the synchronous nature of adding IRES i.e
53442535Ssangeeta 	 * ire_add_then_send is called after creating the IRE.
53452535Ssangeeta 	 */
53462535Ssangeeta 
53472535Ssangeeta 	/*
53488485SPeter.Memishian@Sun.COM 	 * We use esballoc to allocate the second part (IRE_MBLK)
53498485SPeter.Memishian@Sun.COM 	 * of the message chain depicted above.  This mblk will be freed
53508485SPeter.Memishian@Sun.COM 	 * by arp when there is a timeout, and otherwise passed to IP
53518485SPeter.Memishian@Sun.COM 	 * and IP will free it after processing the ARP response.
53522535Ssangeeta 	 */
53532535Ssangeeta 
53542535Ssangeeta 	bufsize = sizeof (ire_t) + sizeof (frtn_t);
53552535Ssangeeta 	buf = kmem_alloc(bufsize, KM_NOSLEEP);
53562535Ssangeeta 	if (buf == NULL) {
53578485SPeter.Memishian@Sun.COM 		ip1dbg(("ire_arpresolve: alloc buffer failed\n"));
53582535Ssangeeta 		return;
53592535Ssangeeta 	}
53602535Ssangeeta 	frtnp = (frtn_t *)(buf + 1);
53612535Ssangeeta 	frtnp->free_arg = (caddr_t)buf;
53622535Ssangeeta 	frtnp->free_func = ire_freemblk;
53632535Ssangeeta 
53642535Ssangeeta 	ire_mp = esballoc((unsigned char *)buf, bufsize, BPRI_MED, frtnp);
53652535Ssangeeta 	if (ire_mp == NULL) {
53662535Ssangeeta 		ip1dbg(("ire_arpresolve: esballoc failed\n"));
53672535Ssangeeta 		kmem_free(buf, bufsize);
53682535Ssangeeta 		return;
53692535Ssangeeta 	}
53708485SPeter.Memishian@Sun.COM 
53714714Ssowmini 	areq_mp = copyb(dst_ill->ill_resolver_mp);
53724714Ssowmini 	if (areq_mp == NULL) {
53738485SPeter.Memishian@Sun.COM 		freemsg(ire_mp);
53742535Ssangeeta 		return;
53752535Ssangeeta 	}
53762535Ssangeeta 
53772535Ssangeeta 	ire_mp->b_datap->db_type = IRE_ARPRESOLVE_TYPE;
53782535Ssangeeta 	ire = (ire_t *)buf;
53792535Ssangeeta 	/*
53802535Ssangeeta 	 * keep enough info in the fake ire so that we can pull up
53812535Ssangeeta 	 * the incomplete ire (in_ire) after result comes back from
53822535Ssangeeta 	 * arp and make it complete.
53832535Ssangeeta 	 */
53842535Ssangeeta 	*ire = ire_null;
53852535Ssangeeta 	ire->ire_u = in_ire->ire_u;
53862535Ssangeeta 	ire->ire_ipif_seqid = in_ire->ire_ipif_seqid;
53877880SJonathan.Anderson@Sun.COM 	ire->ire_ipif_ifindex = in_ire->ire_ipif_ifindex;
53882535Ssangeeta 	ire->ire_ipif = in_ire->ire_ipif;
53898485SPeter.Memishian@Sun.COM 	ire->ire_stq = dst_ill->ill_wq;
53908485SPeter.Memishian@Sun.COM 	ire->ire_stq_ifindex = dst_ill->ill_phyint->phyint_ifindex;
53912535Ssangeeta 	ire->ire_zoneid = in_ire->ire_zoneid;
53927558SSowmini.Varadhan@Sun.COM 	ire->ire_stackid = ipst->ips_netstack->netstack_stackid;
53933448Sdh155122 	ire->ire_ipst = ipst;
53943448Sdh155122 
53952535Ssangeeta 	/*
53962535Ssangeeta 	 * ire_freemblk will be called when ire_mp is freed, both for
53972535Ssangeeta 	 * successful and failed arp resolution. IRE_MARK_UNCACHED will be set
53982535Ssangeeta 	 * when the arp resolution failed.
53992535Ssangeeta 	 */
54002535Ssangeeta 	ire->ire_marks |= IRE_MARK_UNCACHED;
54012535Ssangeeta 	ire->ire_mp = ire_mp;
54022535Ssangeeta 	ire_mp->b_wptr = (uchar_t *)&ire[1];
54032535Ssangeeta 	ire_mp->b_cont = NULL;
54044714Ssowmini 	linkb(areq_mp, ire_mp);
54052535Ssangeeta 
54062535Ssangeeta 	/*
54072535Ssangeeta 	 * Fill in the source and dest addrs for the resolver.
54082535Ssangeeta 	 * NOTE: this depends on memory layouts imposed by
54092535Ssangeeta 	 * ill_init().
54102535Ssangeeta 	 */
54114714Ssowmini 	areq = (areq_t *)areq_mp->b_rptr;
54122535Ssangeeta 	addrp = (ipaddr_t *)((char *)areq + areq->areq_sender_addr_offset);
54132535Ssangeeta 	*addrp = ire->ire_src_addr;
54142535Ssangeeta 
54152535Ssangeeta 	addrp = (ipaddr_t *)((char *)areq + areq->areq_target_addr_offset);
54162535Ssangeeta 	if (ire->ire_gateway_addr != INADDR_ANY) {
54172535Ssangeeta 		*addrp = ire->ire_gateway_addr;
54182535Ssangeeta 	} else {
54192535Ssangeeta 		*addrp = ire->ire_addr;
54202535Ssangeeta 	}
54212535Ssangeeta 
54222535Ssangeeta 	/* Up to the resolver. */
54232535Ssangeeta 	if (canputnext(dst_ill->ill_rq)) {
54244714Ssowmini 		putnext(dst_ill->ill_rq, areq_mp);
54252535Ssangeeta 	} else {
54264714Ssowmini 		freemsg(areq_mp);
54272535Ssangeeta 	}
54282535Ssangeeta }
54292535Ssangeeta 
54302535Ssangeeta /*
54312535Ssangeeta  * Esballoc free function for AR_ENTRY_QUERY request to clean up any
54322535Ssangeeta  * unresolved ire_t and/or nce_t structures when ARP resolution fails.
54332535Ssangeeta  *
54342535Ssangeeta  * This function can be called by ARP via free routine for ire_mp or
54352535Ssangeeta  * by IPv4(both host and forwarding path) via ire_delete
54362535Ssangeeta  * in case ARP resolution fails.
54372535Ssangeeta  * NOTE: Since IP is MT, ARP can call into IP but not vice versa
54382535Ssangeeta  * (for IP to talk to ARP, it still has to send AR* messages).
54392535Ssangeeta  *
54402535Ssangeeta  * Note that the ARP/IP merge should replace the functioanlity by providing
54412535Ssangeeta  * direct function calls to clean up unresolved entries in ire/nce lists.
54422535Ssangeeta  */
54432535Ssangeeta void
54442535Ssangeeta ire_freemblk(ire_t *ire_mp)
54452535Ssangeeta {
54462535Ssangeeta 	nce_t		*nce = NULL;
54472535Ssangeeta 	ill_t		*ill;
54483448Sdh155122 	ip_stack_t	*ipst;
54497558SSowmini.Varadhan@Sun.COM 	netstack_t	*ns = NULL;
54502535Ssangeeta 
54512535Ssangeeta 	ASSERT(ire_mp != NULL);
54522535Ssangeeta 
54532535Ssangeeta 	if ((ire_mp->ire_addr == NULL) && (ire_mp->ire_gateway_addr == NULL)) {
54542535Ssangeeta 		ip1dbg(("ire_freemblk(0x%p) ire_addr is NULL\n",
54552535Ssangeeta 		    (void *)ire_mp));
54562535Ssangeeta 		goto cleanup;
54572535Ssangeeta 	}
54582535Ssangeeta 	if ((ire_mp->ire_marks & IRE_MARK_UNCACHED) == 0) {
54592535Ssangeeta 		goto cleanup; /* everything succeeded. just free and return */
54602535Ssangeeta 	}
54612535Ssangeeta 
54622535Ssangeeta 	/*
54632535Ssangeeta 	 * the arp information corresponding to this ire_mp was not
54647558SSowmini.Varadhan@Sun.COM 	 * transferred to an ire_cache entry. Need
54652535Ssangeeta 	 * to clean up incomplete ire's and nce, if necessary.
54662535Ssangeeta 	 */
54672535Ssangeeta 	ASSERT(ire_mp->ire_stq != NULL);
54682535Ssangeeta 	ASSERT(ire_mp->ire_stq_ifindex != 0);
54693448Sdh155122 	ASSERT(ire_mp->ire_ipst != NULL);
54703448Sdh155122 
54717558SSowmini.Varadhan@Sun.COM 	ns = netstack_find_by_stackid(ire_mp->ire_stackid);
54727558SSowmini.Varadhan@Sun.COM 	ipst = (ns ? ns->netstack_ip : NULL);
54737558SSowmini.Varadhan@Sun.COM 	if (ipst == NULL || ipst != ire_mp->ire_ipst) /* Disapeared on us */
54747558SSowmini.Varadhan@Sun.COM 		goto  cleanup;
54753448Sdh155122 
54762535Ssangeeta 	/*
54772535Ssangeeta 	 * Get any nce's corresponding to this ire_mp. We first have to
54782535Ssangeeta 	 * make sure that the ill is still around.
54792535Ssangeeta 	 */
54803448Sdh155122 	ill = ill_lookup_on_ifindex(ire_mp->ire_stq_ifindex,
54813448Sdh155122 	    B_FALSE, NULL, NULL, NULL, NULL, ipst);
54822535Ssangeeta 	if (ill == NULL || (ire_mp->ire_stq != ill->ill_wq) ||
54832535Ssangeeta 	    (ill->ill_state_flags & ILL_CONDEMNED)) {
54842535Ssangeeta 		/*
54852535Ssangeeta 		 * ill went away. no nce to clean up.
54862535Ssangeeta 		 * Note that the ill_state_flags could be set to
54872535Ssangeeta 		 * ILL_CONDEMNED after this point, but if we know
54882535Ssangeeta 		 * that it is CONDEMNED now, we just bail out quickly.
54892535Ssangeeta 		 */
54902535Ssangeeta 		if (ill != NULL)
54912535Ssangeeta 			ill_refrele(ill);
54922535Ssangeeta 		goto cleanup;
54932535Ssangeeta 	}
54942535Ssangeeta 	nce = ndp_lookup_v4(ill,
54952535Ssangeeta 	    ((ire_mp->ire_gateway_addr != INADDR_ANY) ?
54962535Ssangeeta 	    &ire_mp->ire_gateway_addr : &ire_mp->ire_addr),
54972535Ssangeeta 	    B_FALSE);
54982535Ssangeeta 	ill_refrele(ill);
54992535Ssangeeta 
55002535Ssangeeta 	if ((nce != NULL) && (nce->nce_state != ND_REACHABLE)) {
55012535Ssangeeta 		/*
55022535Ssangeeta 		 * some incomplete nce was found.
55032535Ssangeeta 		 */
55042535Ssangeeta 		DTRACE_PROBE2(ire__freemblk__arp__resolv__fail,
55052535Ssangeeta 		    nce_t *, nce, ire_t *, ire_mp);
55062535Ssangeeta 		/*
55072535Ssangeeta 		 * Send the icmp_unreachable messages for the queued mblks in
55082535Ssangeeta 		 * ire->ire_nce->nce_qd_mp, since ARP resolution failed
55092535Ssangeeta 		 * for this ire
55102535Ssangeeta 		 */
55112535Ssangeeta 		arp_resolv_failed(nce);
55122535Ssangeeta 		/*
55132535Ssangeeta 		 * Delete the nce and clean up all ire's pointing at this nce
55142535Ssangeeta 		 * in the cachetable
55152535Ssangeeta 		 */
55162535Ssangeeta 		ndp_delete(nce);
55172535Ssangeeta 	}
55182535Ssangeeta 	if (nce != NULL)
55192535Ssangeeta 		NCE_REFRELE(nce); /* release the ref taken by ndp_lookup_v4 */
55202535Ssangeeta 
55212535Ssangeeta cleanup:
55227558SSowmini.Varadhan@Sun.COM 	if (ns != NULL)
55237558SSowmini.Varadhan@Sun.COM 		netstack_rele(ns);
55242535Ssangeeta 	/*
55252535Ssangeeta 	 * Get rid of the ire buffer
55262535Ssangeeta 	 * We call kmem_free here(instead of ire_delete()), since
55272535Ssangeeta 	 * this is the freeb's callback.
55282535Ssangeeta 	 */
55292535Ssangeeta 	kmem_free(ire_mp, sizeof (ire_t) + sizeof (frtn_t));
55302535Ssangeeta }
55312535Ssangeeta 
55323772Ssangeeta /*
55334714Ssowmini  * find, or create if needed, a neighbor cache entry nce_t for IRE_CACHE and
55344714Ssowmini  * non-loopback IRE_BROADCAST ire's.
55354714Ssowmini  *
55364714Ssowmini  * If a neighbor-cache entry has to be created (i.e., one does not already
55374714Ssowmini  * exist in the nce list) the nce_res_mp and nce_state of the neighbor cache
55384714Ssowmini  * entry are initialized in ndp_add_v4(). These values are picked from
55394714Ssowmini  * the src_nce, if one is passed in. Otherwise (if src_nce == NULL) the
55404714Ssowmini  * ire->ire_type and the outgoing interface (ire_to_ill(ire)) values
55414714Ssowmini  * determine the {nce_state, nce_res_mp} of the nce_t created. All
55424714Ssowmini  * IRE_BROADCAST entries have nce_state = ND_REACHABLE, and the nce_res_mp
 * is set to the ill_bcast_mp of the outgoing interface. For unicast ire
55444714Ssowmini  * entries,
55454714Ssowmini  *   - if the outgoing interface is of type IRE_IF_RESOLVER, a newly created
55464714Ssowmini  *     nce_t will have a null nce_res_mp, and will be in the ND_INITIAL state.
55474714Ssowmini  *   - if the outgoing interface is a IRE_IF_NORESOLVER interface, no link
55484714Ssowmini  *     layer resolution is necessary, so that the nce_t will be in the
55494714Ssowmini  *     ND_REACHABLE state and the nce_res_mp will have a copy of the
55504714Ssowmini  *     ill_resolver_mp of the outgoing interface.
55514714Ssowmini  *
55524714Ssowmini  * The link layer information needed for broadcast addresses, and for
55534714Ssowmini  * packets sent on IRE_IF_NORESOLVER interfaces is a constant mapping that
55544714Ssowmini  * never needs re-verification for the lifetime of the nce_t. These are
55554714Ssowmini  * therefore marked NCE_F_PERMANENT, and never allowed to expire via
55564714Ssowmini  * NCE_EXPIRED.
55574714Ssowmini  *
55584714Ssowmini  * IRE_CACHE ire's contain the information for  the nexthop (ire_gateway_addr)
55594714Ssowmini  * in the case of indirect routes, and for the dst itself (ire_addr) in the
55602535Ssangeeta  * case of direct routes, with the nce_res_mp containing a template
55612535Ssangeeta  * DL_UNITDATA request.
55622535Ssangeeta  *
55632535Ssangeeta  * The actual association of the ire_nce to the nce created here is
55642535Ssangeeta  * typically done in ire_add_v4 for IRE_CACHE entries. Exceptions
55652535Ssangeeta  * to this rule are SO_DONTROUTE ire's (IRE_MARK_NO_ADD), for which
55664823Sseb  * the ire_nce assignment is done in ire_add_then_send.
55672535Ssangeeta  */
55682535Ssangeeta int
55694714Ssowmini ire_nce_init(ire_t *ire, nce_t *src_nce)
55702535Ssangeeta {
55714714Ssowmini 	in_addr_t	addr4;
55722535Ssangeeta 	int		err;
55734714Ssowmini 	nce_t		*nce = NULL;
55742535Ssangeeta 	ill_t		*ire_ill;
55754714Ssowmini 	uint16_t	nce_flags = 0;
55763448Sdh155122 	ip_stack_t	*ipst;
55772535Ssangeeta 
55784714Ssowmini 	if (ire->ire_stq == NULL)
55792535Ssangeeta 		return (0); /* no need to create nce for local/loopback */
55804714Ssowmini 
55812535Ssangeeta 	switch (ire->ire_type) {
55822535Ssangeeta 	case IRE_CACHE:
55832535Ssangeeta 		if (ire->ire_gateway_addr != INADDR_ANY)
55842535Ssangeeta 			addr4 = ire->ire_gateway_addr; /* 'G' route */
55852535Ssangeeta 		else
55862535Ssangeeta 			addr4 = ire->ire_addr; /* direct route */
55872535Ssangeeta 		break;
55882535Ssangeeta 	case IRE_BROADCAST:
55892535Ssangeeta 		addr4 = ire->ire_addr;
55904714Ssowmini 		nce_flags |= (NCE_F_PERMANENT|NCE_F_BCAST);
55912535Ssangeeta 		break;
55922535Ssangeeta 	default:
55932535Ssangeeta 		return (0);
55942535Ssangeeta 	}
55952535Ssangeeta 
55962535Ssangeeta 	/*
55972535Ssangeeta 	 * ire_ipif is picked based on RTF_SETSRC, usesrc etc.
55982535Ssangeeta 	 * rules in ire_forward_src_ipif. We want the dlureq_mp
55992535Ssangeeta 	 * for the outgoing interface, which we get from the ire_stq.
56002535Ssangeeta 	 */
56012535Ssangeeta 	ire_ill = ire_to_ill(ire);
56023448Sdh155122 	ipst = ire_ill->ill_ipst;
56032535Ssangeeta 
56042535Ssangeeta 	/*
56054714Ssowmini 	 * IRE_IF_NORESOLVER entries never need re-verification and
56064714Ssowmini 	 * do not expire, so we mark them as NCE_F_PERMANENT.
56072535Ssangeeta 	 */
56084714Ssowmini 	if (ire_ill->ill_net_type == IRE_IF_NORESOLVER)
56094714Ssowmini 		nce_flags |= NCE_F_PERMANENT;
56102535Ssangeeta 
56114084Ssowmini retry_nce:
56124714Ssowmini 	err = ndp_lookup_then_add_v4(ire_ill, &addr4, nce_flags,
56134714Ssowmini 	    &nce, src_nce);
56144714Ssowmini 
56154714Ssowmini 	if (err == EEXIST && NCE_EXPIRED(nce, ipst)) {
56164084Ssowmini 		/*
56174084Ssowmini 		 * We looked up an expired nce.
56184084Ssowmini 		 * Go back and try to create one again.
56194084Ssowmini 		 */
56204714Ssowmini 		ndp_delete(nce);
56214714Ssowmini 		NCE_REFRELE(nce);
56224714Ssowmini 		nce = NULL;
56234084Ssowmini 		goto retry_nce;
56244084Ssowmini 	}
56254084Ssowmini 
56264714Ssowmini 	ip1dbg(("ire 0x%p addr 0x%lx type 0x%x; found nce 0x%p err %d\n",
56274714Ssowmini 	    (void *)ire, (ulong_t)addr4, ire->ire_type, (void *)nce, err));
56282535Ssangeeta 
56292535Ssangeeta 	switch (err) {
56302535Ssangeeta 	case 0:
56312535Ssangeeta 	case EEXIST:
56322535Ssangeeta 		/*
56334714Ssowmini 		 * return a pointer to a newly created or existing nce_t;
56342535Ssangeeta 		 * note that the ire-nce mapping is many-one, i.e.,
56354714Ssowmini 		 * multiple ire's could point to the same nce_t.
56362535Ssangeeta 		 */
56372535Ssangeeta 		break;
56382535Ssangeeta 	default:
56392535Ssangeeta 		DTRACE_PROBE2(nce__init__fail, ill_t *, ire_ill, int, err);
56402535Ssangeeta 		return (EINVAL);
56412535Ssangeeta 	}
56422535Ssangeeta 	if (ire->ire_type == IRE_BROADCAST) {
56432535Ssangeeta 		/*
56442535Ssangeeta 		 * Two bcast ires are created for each interface;
56452535Ssangeeta 		 * 1. loopback copy (which does not  have an
56462535Ssangeeta 		 *    ire_stq, and therefore has no ire_nce), and,
56472535Ssangeeta 		 * 2. the non-loopback copy, which has the nce_res_mp
56482535Ssangeeta 		 *    initialized to a copy of the ill_bcast_mp, and
56492535Ssangeeta 		 *    is marked as ND_REACHABLE at this point.
56502535Ssangeeta 		 *    This nce does not undergo any further state changes,
56512535Ssangeeta 		 *    and exists as long as the interface is plumbed.
56528485SPeter.Memishian@Sun.COM 		 * Note: the assignment of ire_nce here is a historical
56538485SPeter.Memishian@Sun.COM 		 * artifact of old code that used to inline ire_add().
56542535Ssangeeta 		 */
56554714Ssowmini 		ire->ire_nce = nce;
56562535Ssangeeta 		/*
56572535Ssangeeta 		 * We are associating this nce to the ire,
56582535Ssangeeta 		 * so change the nce ref taken in
56592535Ssangeeta 		 * ndp_lookup_then_add_v4() from
56602535Ssangeeta 		 * NCE_REFHOLD to NCE_REFHOLD_NOTR
56612535Ssangeeta 		 */
56622535Ssangeeta 		NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce);
56632535Ssangeeta 	} else {
56644084Ssowmini 		/*
56654084Ssowmini 		 * We are not using this nce_t just yet so release
56664084Ssowmini 		 * the ref taken in ndp_lookup_then_add_v4()
56674084Ssowmini 		 */
56684714Ssowmini 		NCE_REFRELE(nce);
56692535Ssangeeta 	}
56702535Ssangeeta 	return (0);
56712535Ssangeeta }
56727880SJonathan.Anderson@Sun.COM 
56737880SJonathan.Anderson@Sun.COM /*
56747880SJonathan.Anderson@Sun.COM  * This is the implementation of the IPv4 IRE cache lookup procedure.
56757880SJonathan.Anderson@Sun.COM  * Separating the interface from the implementation allows additional
56767880SJonathan.Anderson@Sun.COM  * flexibility when specifying search criteria.
56777880SJonathan.Anderson@Sun.COM  */
56787880SJonathan.Anderson@Sun.COM static ire_t *
56797880SJonathan.Anderson@Sun.COM ip4_ctable_lookup_impl(ire_ctable_args_t *margs)
56807880SJonathan.Anderson@Sun.COM {
56817880SJonathan.Anderson@Sun.COM 	irb_t			*irb_ptr;
56827880SJonathan.Anderson@Sun.COM 	ire_t			*ire;
56837880SJonathan.Anderson@Sun.COM 	ip_stack_t		*ipst = margs->ict_ipst;
56847880SJonathan.Anderson@Sun.COM 
56858485SPeter.Memishian@Sun.COM 	if ((margs->ict_flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) &&
56867880SJonathan.Anderson@Sun.COM 	    (margs->ict_ipif == NULL)) {
56877880SJonathan.Anderson@Sun.COM 		return (NULL);
56887880SJonathan.Anderson@Sun.COM 	}
56897880SJonathan.Anderson@Sun.COM 
56907880SJonathan.Anderson@Sun.COM 	irb_ptr = &ipst->ips_ip_cache_table[IRE_ADDR_HASH(
56917880SJonathan.Anderson@Sun.COM 	    *((ipaddr_t *)margs->ict_addr), ipst->ips_ip_cache_table_size)];
56927880SJonathan.Anderson@Sun.COM 	rw_enter(&irb_ptr->irb_lock, RW_READER);
56937880SJonathan.Anderson@Sun.COM 	for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
56947880SJonathan.Anderson@Sun.COM 		if (ire->ire_marks & IRE_MARK_CONDEMNED)
56957880SJonathan.Anderson@Sun.COM 			continue;
56967880SJonathan.Anderson@Sun.COM 		ASSERT(ire->ire_mask == IP_HOST_MASK);
56977880SJonathan.Anderson@Sun.COM 		if (ire_match_args(ire, *((ipaddr_t *)margs->ict_addr),
56987880SJonathan.Anderson@Sun.COM 		    ire->ire_mask, *((ipaddr_t *)margs->ict_gateway),
56997880SJonathan.Anderson@Sun.COM 		    margs->ict_type, margs->ict_ipif, margs->ict_zoneid, 0,
57007880SJonathan.Anderson@Sun.COM 		    margs->ict_tsl, margs->ict_flags, margs->ict_wq)) {
57017880SJonathan.Anderson@Sun.COM 			IRE_REFHOLD(ire);
57027880SJonathan.Anderson@Sun.COM 			rw_exit(&irb_ptr->irb_lock);
57037880SJonathan.Anderson@Sun.COM 			return (ire);
57047880SJonathan.Anderson@Sun.COM 		}
57057880SJonathan.Anderson@Sun.COM 	}
57067880SJonathan.Anderson@Sun.COM 
57077880SJonathan.Anderson@Sun.COM 	rw_exit(&irb_ptr->irb_lock);
57087880SJonathan.Anderson@Sun.COM 	return (NULL);
57097880SJonathan.Anderson@Sun.COM }
57107880SJonathan.Anderson@Sun.COM 
57117880SJonathan.Anderson@Sun.COM /*
57127880SJonathan.Anderson@Sun.COM  * This function locates IRE_CACHE entries which were added by the
57137880SJonathan.Anderson@Sun.COM  * ire_forward() path. We can fully specify the IRE we are looking for by
57148485SPeter.Memishian@Sun.COM  * providing the ipif (MATCH_IRE_IPIF) *and* the stq (MATCH_IRE_WQ).
57157880SJonathan.Anderson@Sun.COM  */
57167880SJonathan.Anderson@Sun.COM ire_t *
57177880SJonathan.Anderson@Sun.COM ire_arpresolve_lookup(ipaddr_t addr, ipaddr_t gw, ipif_t *ipif,
57187880SJonathan.Anderson@Sun.COM     zoneid_t zoneid, ip_stack_t *ipst, queue_t *wq)
57197880SJonathan.Anderson@Sun.COM {
57207880SJonathan.Anderson@Sun.COM 	ire_ctable_args_t	margs;
57217880SJonathan.Anderson@Sun.COM 
57227880SJonathan.Anderson@Sun.COM 	margs.ict_addr = &addr;
57237880SJonathan.Anderson@Sun.COM 	margs.ict_gateway = &gw;
57247880SJonathan.Anderson@Sun.COM 	margs.ict_type = IRE_CACHE;
57257880SJonathan.Anderson@Sun.COM 	margs.ict_ipif = ipif;
57267880SJonathan.Anderson@Sun.COM 	margs.ict_zoneid = zoneid;
57277880SJonathan.Anderson@Sun.COM 	margs.ict_tsl = NULL;
57287880SJonathan.Anderson@Sun.COM 	margs.ict_flags = MATCH_IRE_GW | MATCH_IRE_IPIF | MATCH_IRE_ZONEONLY |
57297880SJonathan.Anderson@Sun.COM 	    MATCH_IRE_TYPE | MATCH_IRE_WQ;
57307880SJonathan.Anderson@Sun.COM 	margs.ict_ipst = ipst;
57317880SJonathan.Anderson@Sun.COM 	margs.ict_wq = wq;
57327880SJonathan.Anderson@Sun.COM 
57337880SJonathan.Anderson@Sun.COM 	return (ip4_ctable_lookup_impl(&margs));
57347880SJonathan.Anderson@Sun.COM }
5735