10Sstevel@tonic-gate /* 20Sstevel@tonic-gate * CDDL HEADER START 30Sstevel@tonic-gate * 40Sstevel@tonic-gate * The contents of this file are subject to the terms of the 51676Sjpk * Common Development and Distribution License (the "License"). 61676Sjpk * You may not use this file except in compliance with the License. 70Sstevel@tonic-gate * 80Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 90Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing. 100Sstevel@tonic-gate * See the License for the specific language governing permissions 110Sstevel@tonic-gate * and limitations under the License. 120Sstevel@tonic-gate * 130Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each 140Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 150Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the 160Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying 170Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner] 180Sstevel@tonic-gate * 190Sstevel@tonic-gate * CDDL HEADER END 200Sstevel@tonic-gate */ 210Sstevel@tonic-gate /* 228485SPeter.Memishian@Sun.COM * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 230Sstevel@tonic-gate * Use is subject to license terms. 240Sstevel@tonic-gate */ 250Sstevel@tonic-gate /* Copyright (c) 1990 Mentat Inc. */ 260Sstevel@tonic-gate 270Sstevel@tonic-gate /* 280Sstevel@tonic-gate * This file contains routines that manipulate Internet Routing Entries (IREs). 
290Sstevel@tonic-gate */ 300Sstevel@tonic-gate 310Sstevel@tonic-gate #include <sys/types.h> 320Sstevel@tonic-gate #include <sys/stream.h> 330Sstevel@tonic-gate #include <sys/stropts.h> 348485SPeter.Memishian@Sun.COM #include <sys/strsun.h> 35*8778SErik.Nordmark@Sun.COM #include <sys/strsubr.h> 360Sstevel@tonic-gate #include <sys/ddi.h> 370Sstevel@tonic-gate #include <sys/cmn_err.h> 380Sstevel@tonic-gate #include <sys/policy.h> 390Sstevel@tonic-gate 400Sstevel@tonic-gate #include <sys/systm.h> 410Sstevel@tonic-gate #include <sys/kmem.h> 420Sstevel@tonic-gate #include <sys/param.h> 430Sstevel@tonic-gate #include <sys/socket.h> 440Sstevel@tonic-gate #include <net/if.h> 450Sstevel@tonic-gate #include <net/route.h> 460Sstevel@tonic-gate #include <netinet/in.h> 470Sstevel@tonic-gate #include <net/if_dl.h> 480Sstevel@tonic-gate #include <netinet/ip6.h> 490Sstevel@tonic-gate #include <netinet/icmp6.h> 500Sstevel@tonic-gate 510Sstevel@tonic-gate #include <inet/common.h> 520Sstevel@tonic-gate #include <inet/mi.h> 530Sstevel@tonic-gate #include <inet/ip.h> 540Sstevel@tonic-gate #include <inet/ip6.h> 550Sstevel@tonic-gate #include <inet/ip_ndp.h> 562535Ssangeeta #include <inet/arp.h> 570Sstevel@tonic-gate #include <inet/ip_if.h> 580Sstevel@tonic-gate #include <inet/ip_ire.h> 592535Ssangeeta #include <inet/ip_ftable.h> 600Sstevel@tonic-gate #include <inet/ip_rts.h> 610Sstevel@tonic-gate #include <inet/nd.h> 620Sstevel@tonic-gate 630Sstevel@tonic-gate #include <net/pfkeyv2.h> 640Sstevel@tonic-gate #include <inet/ipsec_info.h> 650Sstevel@tonic-gate #include <inet/sadb.h> 660Sstevel@tonic-gate #include <inet/tcp.h> 670Sstevel@tonic-gate #include <inet/ipclassifier.h> 680Sstevel@tonic-gate #include <sys/zone.h> 693448Sdh155122 #include <sys/cpuvar.h> 703448Sdh155122 711676Sjpk #include <sys/tsol/label.h> 721676Sjpk #include <sys/tsol/tnet.h> 731676Sjpk 742535Ssangeeta struct kmem_cache *rt_entry_cache; 752535Ssangeeta 760Sstevel@tonic-gate /* 770Sstevel@tonic-gate * Synchronization 
notes: 780Sstevel@tonic-gate * 790Sstevel@tonic-gate * The fields of the ire_t struct are protected in the following way : 800Sstevel@tonic-gate * 810Sstevel@tonic-gate * ire_next/ire_ptpn 820Sstevel@tonic-gate * 830Sstevel@tonic-gate * - bucket lock of the respective tables (cache or forwarding tables). 840Sstevel@tonic-gate * 850Sstevel@tonic-gate * ire_mp, ire_rfq, ire_stq, ire_u *except* ire_gateway_addr[v6], ire_mask, 860Sstevel@tonic-gate * ire_type, ire_create_time, ire_masklen, ire_ipversion, ire_flags, ire_ipif, 870Sstevel@tonic-gate * ire_ihandle, ire_phandle, ire_nce, ire_bucket, ire_in_ill, ire_in_src_addr 880Sstevel@tonic-gate * 890Sstevel@tonic-gate * - Set in ire_create_v4/v6 and never changes after that. Thus, 900Sstevel@tonic-gate * we don't need a lock whenever these fields are accessed. 910Sstevel@tonic-gate * 920Sstevel@tonic-gate * - ire_bucket and ire_masklen (also set in ire_create) is set in 930Sstevel@tonic-gate * ire_add_v4/ire_add_v6 before inserting in the bucket and never 940Sstevel@tonic-gate * changes after that. Thus we don't need a lock whenever these 950Sstevel@tonic-gate * fields are accessed. 960Sstevel@tonic-gate * 970Sstevel@tonic-gate * ire_gateway_addr_v4[v6] 980Sstevel@tonic-gate * 990Sstevel@tonic-gate * - ire_gateway_addr_v4[v6] is set during ire_create and later modified 1000Sstevel@tonic-gate * by rts_setgwr[v6]. As ire_gateway_addr is a uint32_t, updates to 1010Sstevel@tonic-gate * it assumed to be atomic and hence the other parts of the code 1020Sstevel@tonic-gate * does not use any locks. ire_gateway_addr_v6 updates are not atomic 1030Sstevel@tonic-gate * and hence any access to it uses ire_lock to get/set the right value. 
1040Sstevel@tonic-gate * 1050Sstevel@tonic-gate * ire_ident, ire_refcnt 1060Sstevel@tonic-gate * 1070Sstevel@tonic-gate * - Updated atomically using atomic_add_32 1080Sstevel@tonic-gate * 1090Sstevel@tonic-gate * ire_ssthresh, ire_rtt_sd, ire_rtt, ire_ib_pkt_count, ire_ob_pkt_count 1100Sstevel@tonic-gate * 1110Sstevel@tonic-gate * - Assumes that 32 bit writes are atomic. No locks. ire_lock is 1120Sstevel@tonic-gate * used to serialize updates to ire_ssthresh, ire_rtt_sd, ire_rtt. 1130Sstevel@tonic-gate * 1140Sstevel@tonic-gate * ire_max_frag, ire_frag_flag 1150Sstevel@tonic-gate * 1160Sstevel@tonic-gate * - ire_lock is used to set/read both of them together. 1170Sstevel@tonic-gate * 1180Sstevel@tonic-gate * ire_tire_mark 1190Sstevel@tonic-gate * 1200Sstevel@tonic-gate * - Set in ire_create and updated in ire_expire, which is called 1210Sstevel@tonic-gate * by only one function namely ip_trash_timer_expire. Thus only 1220Sstevel@tonic-gate * one function updates and examines the value. 1230Sstevel@tonic-gate * 1240Sstevel@tonic-gate * ire_marks 1250Sstevel@tonic-gate * - bucket lock protects this. 1260Sstevel@tonic-gate * 1270Sstevel@tonic-gate * ire_ipsec_overhead/ire_ll_hdr_length 1280Sstevel@tonic-gate * 1290Sstevel@tonic-gate * - Place holder for returning the information to the upper layers 1300Sstevel@tonic-gate * when IRE_DB_REQ comes down. 1310Sstevel@tonic-gate * 1320Sstevel@tonic-gate * 1330Sstevel@tonic-gate * ipv6_ire_default_count is protected by the bucket lock of 1340Sstevel@tonic-gate * ip_forwarding_table_v6[0][0]. 1350Sstevel@tonic-gate * 1362535Ssangeeta * ipv6_ire_default_index is not protected as it is just a hint 1372535Ssangeeta * at which default gateway to use. There is nothing 1380Sstevel@tonic-gate * wrong in using the same gateway for two different connections. 
1390Sstevel@tonic-gate * 1400Sstevel@tonic-gate * As we always hold the bucket locks in all the places while accessing 1410Sstevel@tonic-gate * the above values, it is natural to use them for protecting them. 1420Sstevel@tonic-gate * 1430Sstevel@tonic-gate * We have a separate cache table and forwarding table for IPv4 and IPv6. 1440Sstevel@tonic-gate * Cache table (ip_cache_table/ip_cache_table_v6) is a pointer to an 1455335Ssowmini * array of irb_t structures. The IPv6 forwarding table 1465335Ssowmini * (ip_forwarding_table_v6) is an array of pointers to arrays of irb_t 1475335Ssowmini * structures. ip_forwarding_table_v6 is allocated dynamically in 1483448Sdh155122 * ire_add_v6. ire_ft_init_lock is used to serialize multiple threads 1490Sstevel@tonic-gate * initializing the same bucket. Once a bucket is initialized, it is never 1503448Sdh155122 * de-allocated. This assumption enables us to access 1513448Sdh155122 * ip_forwarding_table_v6[i] without any locks. 1520Sstevel@tonic-gate * 1535335Ssowmini * The forwarding table for IPv4 is a radix tree whose leaves 1545335Ssowmini * are rt_entry structures containing the irb_t for the rt_dst. The irb_t 1555335Ssowmini * for IPv4 is dynamically allocated and freed. 1565335Ssowmini * 1570Sstevel@tonic-gate * Each irb_t - ire bucket structure has a lock to protect 1580Sstevel@tonic-gate * a bucket and the ires residing in the bucket have a back pointer to 1590Sstevel@tonic-gate * the bucket structure. It also has a reference count for the number 1600Sstevel@tonic-gate * of threads walking the bucket - irb_refcnt which is bumped up 1610Sstevel@tonic-gate * using the IRB_REFHOLD macro. The flags irb_flags can be 1620Sstevel@tonic-gate * set to IRE_MARK_CONDEMNED indicating that there are some ires 1630Sstevel@tonic-gate * in this bucket that are marked with IRE_MARK_CONDEMNED and the 1640Sstevel@tonic-gate * last thread to leave the bucket should delete the ires. 
Usually 1650Sstevel@tonic-gate * this is done by the IRB_REFRELE macro which is used to decrement 1665335Ssowmini * the reference count on a bucket. See comments above irb_t structure 1675335Ssowmini * definition in ip.h for further details. 1680Sstevel@tonic-gate * 1690Sstevel@tonic-gate * IRE_REFHOLD/IRE_REFRELE macros operate on the ire which increments/ 1700Sstevel@tonic-gate * decrements the reference count, ire_refcnt, atomically on the ire. 1710Sstevel@tonic-gate * ire_refcnt is modified only using these macros. Operations on the IRE 1720Sstevel@tonic-gate * could be described as follows : 1730Sstevel@tonic-gate * 1740Sstevel@tonic-gate * CREATE an ire with reference count initialized to 1. 1750Sstevel@tonic-gate * 1760Sstevel@tonic-gate * ADDITION of an ire holds the bucket lock, checks for duplicates 1770Sstevel@tonic-gate * and then adds the ire. ire_add_v4/ire_add_v6 returns the ire after 1780Sstevel@tonic-gate * bumping up once more, i.e. the reference count is 2. This is to avoid 1790Sstevel@tonic-gate * an extra lookup in the functions calling ire_add which want to 1800Sstevel@tonic-gate * work with the ire after adding. 1810Sstevel@tonic-gate * 1820Sstevel@tonic-gate * LOOKUP of an ire bumps up the reference count using IRE_REFHOLD 1830Sstevel@tonic-gate * macro. It is valid to bump up the reference count of the IRE, 1840Sstevel@tonic-gate * after the lookup has returned an ire. Following are the lookup 1850Sstevel@tonic-gate * functions that return a HELD ire : 1860Sstevel@tonic-gate * 1870Sstevel@tonic-gate * ire_lookup_local[_v6], ire_ctable_lookup[_v6], ire_ftable_lookup[_v6], 1880Sstevel@tonic-gate * ire_cache_lookup[_v6], ire_lookup_multi[_v6], ire_route_lookup[_v6], 1894823Sseb * ipif_to_ire[_v6]. 
1900Sstevel@tonic-gate * 1910Sstevel@tonic-gate * DELETION of an ire holds the bucket lock, removes it from the list 1920Sstevel@tonic-gate * and then decrements the reference count for having removed from the list 1930Sstevel@tonic-gate * by using the IRE_REFRELE macro. If some other thread has looked up 1940Sstevel@tonic-gate * the ire, the reference count would have been bumped up and hence 1950Sstevel@tonic-gate * this ire will not be freed once deleted. It will be freed once the 1960Sstevel@tonic-gate * reference count drops to zero. 1970Sstevel@tonic-gate * 1980Sstevel@tonic-gate * Add and Delete acquires the bucket lock as RW_WRITER, while all the 1990Sstevel@tonic-gate * lookups acquire the bucket lock as RW_READER. 2000Sstevel@tonic-gate * 2010Sstevel@tonic-gate * NOTE : The only functions that does the IRE_REFRELE when an ire is 2020Sstevel@tonic-gate * passed as an argument are : 2030Sstevel@tonic-gate * 2040Sstevel@tonic-gate * 1) ip_wput_ire : This is because it IRE_REFHOLD/RELEs the 2050Sstevel@tonic-gate * broadcast ires it looks up internally within 2060Sstevel@tonic-gate * the function. Currently, for simplicity it does 2070Sstevel@tonic-gate * not differentiate the one that is passed in and 2080Sstevel@tonic-gate * the ones it looks up internally. It always 2090Sstevel@tonic-gate * IRE_REFRELEs. 2100Sstevel@tonic-gate * 2) ire_send 2110Sstevel@tonic-gate * ire_send_v6 : As ire_send calls ip_wput_ire and other functions 2120Sstevel@tonic-gate * that take ire as an argument, it has to selectively 2130Sstevel@tonic-gate * IRE_REFRELE the ire. To maintain symmetry, 2140Sstevel@tonic-gate * ire_send_v6 does the same. 2150Sstevel@tonic-gate * 2160Sstevel@tonic-gate * Otherwise, the general rule is to do the IRE_REFRELE in the function 2170Sstevel@tonic-gate * that is passing the ire as an argument. 2180Sstevel@tonic-gate * 2190Sstevel@tonic-gate * In trying to locate ires the following points are to be noted. 
2200Sstevel@tonic-gate * 2210Sstevel@tonic-gate * IRE_MARK_CONDEMNED signifies that the ire has been logically deleted and is 2220Sstevel@tonic-gate * to be ignored when walking the ires using ire_next. 2230Sstevel@tonic-gate * 2240Sstevel@tonic-gate * Zones note: 2250Sstevel@tonic-gate * Walking IREs within a given zone also walks certain ires in other 2260Sstevel@tonic-gate * zones. This is done intentionally. IRE walks with a specified 2270Sstevel@tonic-gate * zoneid are used only when doing informational reports, and 2280Sstevel@tonic-gate * zone users want to see things that they can access. See block 2290Sstevel@tonic-gate * comment in ire_walk_ill_match(). 2300Sstevel@tonic-gate */ 2310Sstevel@tonic-gate 2320Sstevel@tonic-gate /* 2330Sstevel@tonic-gate * The minimum size of IRE cache table. It will be recalcuated in 2340Sstevel@tonic-gate * ip_ire_init(). 2353448Sdh155122 * Setable in /etc/system 2360Sstevel@tonic-gate */ 2370Sstevel@tonic-gate uint32_t ip_cache_table_size = IP_CACHE_TABLE_SIZE; 2380Sstevel@tonic-gate uint32_t ip6_cache_table_size = IP6_CACHE_TABLE_SIZE; 2390Sstevel@tonic-gate 2400Sstevel@tonic-gate /* 2410Sstevel@tonic-gate * The size of the forwarding table. We will make sure that it is a 2420Sstevel@tonic-gate * power of 2 in ip_ire_init(). 2433448Sdh155122 * Setable in /etc/system 2440Sstevel@tonic-gate */ 2450Sstevel@tonic-gate uint32_t ip6_ftable_hash_size = IP6_FTABLE_HASH_SIZE; 2460Sstevel@tonic-gate 2470Sstevel@tonic-gate struct kmem_cache *ire_cache; 2480Sstevel@tonic-gate static ire_t ire_null; 2490Sstevel@tonic-gate 2500Sstevel@tonic-gate /* 2510Sstevel@tonic-gate * The threshold number of IRE in a bucket when the IREs are 2520Sstevel@tonic-gate * cleaned up. This threshold is calculated later in ip_open() 2530Sstevel@tonic-gate * based on the speed of CPU and available memory. This default 2540Sstevel@tonic-gate * value is the maximum. 
2550Sstevel@tonic-gate * 2560Sstevel@tonic-gate * We have two kinds of cached IRE, temporary and 2570Sstevel@tonic-gate * non-temporary. Temporary IREs are marked with 2580Sstevel@tonic-gate * IRE_MARK_TEMPORARY. They are IREs created for non 2590Sstevel@tonic-gate * TCP traffic and for forwarding purposes. All others 2600Sstevel@tonic-gate * are non-temporary IREs. We don't mark IRE created for 2610Sstevel@tonic-gate * TCP as temporary because TCP is stateful and there are 2620Sstevel@tonic-gate * info stored in the IRE which can be shared by other TCP 2630Sstevel@tonic-gate * connections to the same destination. For connected 2640Sstevel@tonic-gate * endpoint, we also don't want to mark the IRE used as 2650Sstevel@tonic-gate * temporary because the same IRE will be used frequently, 2660Sstevel@tonic-gate * otherwise, the app should not do a connect(). We change 2670Sstevel@tonic-gate * the marking at ip_bind_connected_*() if necessary. 2680Sstevel@tonic-gate * 2690Sstevel@tonic-gate * We want to keep the cache IRE hash bucket length reasonably 2700Sstevel@tonic-gate * short, otherwise IRE lookup functions will take "forever." 2710Sstevel@tonic-gate * We use the "crude" function that the IRE bucket 2720Sstevel@tonic-gate * length should be based on the CPU speed, which is 1 entry 2730Sstevel@tonic-gate * per x MHz, depending on the shift factor ip_ire_cpu_ratio 2740Sstevel@tonic-gate * (n). This means that with a 750MHz CPU, the max bucket 2750Sstevel@tonic-gate * length can be (750 >> n) entries. 2760Sstevel@tonic-gate * 2770Sstevel@tonic-gate * Note that this threshold is separate for temp and non-temp 2780Sstevel@tonic-gate * IREs. This means that the actual bucket length can be 2790Sstevel@tonic-gate * twice as that. 
And while we try to keep temporary IRE 2800Sstevel@tonic-gate * length at most at the threshold value, we do not attempt to 2810Sstevel@tonic-gate * make the length for non-temporary IREs fixed, for the 2820Sstevel@tonic-gate * reason stated above. Instead, we start trying to find 2830Sstevel@tonic-gate * "unused" non-temporary IREs when the bucket length reaches 2840Sstevel@tonic-gate * this threshold and clean them up. 2850Sstevel@tonic-gate * 2860Sstevel@tonic-gate * We also want to limit the amount of memory used by 2870Sstevel@tonic-gate * IREs. So if we are allowed to use ~3% of memory (M) 2880Sstevel@tonic-gate * for those IREs, each bucket should not have more than 2890Sstevel@tonic-gate * 2900Sstevel@tonic-gate * M / num of cache bucket / sizeof (ire_t) 2910Sstevel@tonic-gate * 2920Sstevel@tonic-gate * Again the above memory uses are separate for temp and 2930Sstevel@tonic-gate * non-temp cached IREs. 2940Sstevel@tonic-gate * 2950Sstevel@tonic-gate * We may also want the limit to be a function of the number 2960Sstevel@tonic-gate * of interfaces and number of CPUs. Doing the initialization 2970Sstevel@tonic-gate * in ip_open() means that every time an interface is plumbed, 2980Sstevel@tonic-gate * the max is re-calculated. Right now, we don't do anything 2990Sstevel@tonic-gate * different. In future, when we have more experience, we 3000Sstevel@tonic-gate * may want to change this behavior. 3010Sstevel@tonic-gate */ 3023448Sdh155122 uint32_t ip_ire_max_bucket_cnt = 10; /* Setable in /etc/system */ 3030Sstevel@tonic-gate uint32_t ip6_ire_max_bucket_cnt = 10; 3045388Sja97890 uint32_t ip_ire_cleanup_cnt = 2; 3050Sstevel@tonic-gate 3060Sstevel@tonic-gate /* 3070Sstevel@tonic-gate * The minimum of the temporary IRE bucket count. We do not want 3080Sstevel@tonic-gate * the length of each bucket to be too short. This may hurt 3090Sstevel@tonic-gate * performance of some apps as the temporary IREs are removed too 3100Sstevel@tonic-gate * often. 
3110Sstevel@tonic-gate */ 3123448Sdh155122 uint32_t ip_ire_min_bucket_cnt = 3; /* /etc/system - not used */ 3130Sstevel@tonic-gate uint32_t ip6_ire_min_bucket_cnt = 3; 3140Sstevel@tonic-gate 3150Sstevel@tonic-gate /* 3160Sstevel@tonic-gate * The ratio of memory consumed by IRE used for temporary to available 3170Sstevel@tonic-gate * memory. This is a shift factor, so 6 means the ratio 1 to 64. This 3180Sstevel@tonic-gate * value can be changed in /etc/system. 6 is a reasonable number. 3190Sstevel@tonic-gate */ 3203448Sdh155122 uint32_t ip_ire_mem_ratio = 6; /* /etc/system */ 3210Sstevel@tonic-gate /* The shift factor for CPU speed to calculate the max IRE bucket length. */ 3223448Sdh155122 uint32_t ip_ire_cpu_ratio = 7; /* /etc/system */ 3230Sstevel@tonic-gate 3242535Ssangeeta typedef struct nce_clookup_s { 3252535Ssangeeta ipaddr_t ncecl_addr; 3262535Ssangeeta boolean_t ncecl_found; 3272535Ssangeeta } nce_clookup_t; 3282535Ssangeeta 3290Sstevel@tonic-gate /* 3300Sstevel@tonic-gate * The maximum number of buckets in IRE cache table. In future, we may 3310Sstevel@tonic-gate * want to make it a dynamic hash table. For the moment, we fix the 3320Sstevel@tonic-gate * size and allocate the table in ip_ire_init() when IP is first loaded. 3330Sstevel@tonic-gate * We take into account the amount of memory a system has. 3340Sstevel@tonic-gate */ 3350Sstevel@tonic-gate #define IP_MAX_CACHE_TABLE_SIZE 4096 3360Sstevel@tonic-gate 3373448Sdh155122 /* Setable in /etc/system */ 3380Sstevel@tonic-gate static uint32_t ip_max_cache_table_size = IP_MAX_CACHE_TABLE_SIZE; 3390Sstevel@tonic-gate static uint32_t ip6_max_cache_table_size = IP_MAX_CACHE_TABLE_SIZE; 3400Sstevel@tonic-gate 3410Sstevel@tonic-gate /* Zero iulp_t for initialization. 
*/ 3420Sstevel@tonic-gate const iulp_t ire_uinfo_null = { 0 }; 3430Sstevel@tonic-gate 3440Sstevel@tonic-gate static int ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, 3452535Ssangeeta ipsq_func_t func, boolean_t); 3460Sstevel@tonic-gate static void ire_delete_v4(ire_t *ire); 3471676Sjpk static void ire_walk_ipvers(pfv_t func, void *arg, uchar_t vers, 3483448Sdh155122 zoneid_t zoneid, ip_stack_t *); 3490Sstevel@tonic-gate static void ire_walk_ill_ipvers(uint_t match_flags, uint_t ire_type, 3501676Sjpk pfv_t func, void *arg, uchar_t vers, ill_t *ill); 3515388Sja97890 static void ire_cache_cleanup(irb_t *irb, uint32_t threshold, 3525388Sja97890 ire_t *ref_ire); 3532535Ssangeeta static void ip_nce_clookup_and_delete(nce_t *nce, void *arg); 3547880SJonathan.Anderson@Sun.COM static ire_t *ip4_ctable_lookup_impl(ire_ctable_args_t *margs); 3555023Scarlsonj #ifdef DEBUG 3565023Scarlsonj static void ire_trace_cleanup(const ire_t *); 3570Sstevel@tonic-gate #endif 3580Sstevel@tonic-gate 3590Sstevel@tonic-gate /* 3600Sstevel@tonic-gate * To avoid bloating the code, we call this function instead of 3610Sstevel@tonic-gate * using the macro IRE_REFRELE. Use macro only in performance 3620Sstevel@tonic-gate * critical paths. 3630Sstevel@tonic-gate * 3640Sstevel@tonic-gate * Must not be called while holding any locks. Otherwise if this is 3650Sstevel@tonic-gate * the last reference to be released there is a chance of recursive mutex 3660Sstevel@tonic-gate * panic due to ire_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 3670Sstevel@tonic-gate * to restart an ioctl. The one exception is when the caller is sure that 3680Sstevel@tonic-gate * this is not the last reference to be released. Eg. if the caller is 3690Sstevel@tonic-gate * sure that the ire has not been deleted and won't be deleted. 
3700Sstevel@tonic-gate */ 3710Sstevel@tonic-gate void 3720Sstevel@tonic-gate ire_refrele(ire_t *ire) 3730Sstevel@tonic-gate { 3740Sstevel@tonic-gate IRE_REFRELE(ire); 3750Sstevel@tonic-gate } 3760Sstevel@tonic-gate 3770Sstevel@tonic-gate void 3780Sstevel@tonic-gate ire_refrele_notr(ire_t *ire) 3790Sstevel@tonic-gate { 3800Sstevel@tonic-gate IRE_REFRELE_NOTR(ire); 3810Sstevel@tonic-gate } 3820Sstevel@tonic-gate 3830Sstevel@tonic-gate /* 3840Sstevel@tonic-gate * kmem_cache_alloc constructor for IRE in kma space. 3850Sstevel@tonic-gate * Note that when ire_mp is set the IRE is stored in that mblk and 3860Sstevel@tonic-gate * not in this cache. 3870Sstevel@tonic-gate */ 3880Sstevel@tonic-gate /* ARGSUSED */ 3890Sstevel@tonic-gate static int 3900Sstevel@tonic-gate ip_ire_constructor(void *buf, void *cdrarg, int kmflags) 3910Sstevel@tonic-gate { 3920Sstevel@tonic-gate ire_t *ire = buf; 3930Sstevel@tonic-gate 3942535Ssangeeta ire->ire_nce = NULL; 3950Sstevel@tonic-gate 3960Sstevel@tonic-gate return (0); 3970Sstevel@tonic-gate } 3980Sstevel@tonic-gate 3990Sstevel@tonic-gate /* ARGSUSED1 */ 4000Sstevel@tonic-gate static void 4010Sstevel@tonic-gate ip_ire_destructor(void *buf, void *cdrarg) 4020Sstevel@tonic-gate { 4030Sstevel@tonic-gate ire_t *ire = buf; 4040Sstevel@tonic-gate 4052535Ssangeeta ASSERT(ire->ire_nce == NULL); 4060Sstevel@tonic-gate } 4070Sstevel@tonic-gate 4080Sstevel@tonic-gate /* 4090Sstevel@tonic-gate * This function is associated with the IP_IOC_IRE_ADVISE_NO_REPLY 4100Sstevel@tonic-gate * IOCTL. It is used by TCP (or other ULPs) to supply revised information 4110Sstevel@tonic-gate * for an existing CACHED IRE. 
4120Sstevel@tonic-gate */ 4130Sstevel@tonic-gate /* ARGSUSED */ 4140Sstevel@tonic-gate int 4150Sstevel@tonic-gate ip_ire_advise(queue_t *q, mblk_t *mp, cred_t *ioc_cr) 4160Sstevel@tonic-gate { 4170Sstevel@tonic-gate uchar_t *addr_ucp; 4180Sstevel@tonic-gate ipic_t *ipic; 4190Sstevel@tonic-gate ire_t *ire; 4200Sstevel@tonic-gate ipaddr_t addr; 4210Sstevel@tonic-gate in6_addr_t v6addr; 4220Sstevel@tonic-gate irb_t *irb; 4230Sstevel@tonic-gate zoneid_t zoneid; 4243448Sdh155122 ip_stack_t *ipst = CONNQ_TO_IPST(q); 4250Sstevel@tonic-gate 4260Sstevel@tonic-gate ASSERT(q->q_next == NULL); 4270Sstevel@tonic-gate zoneid = Q_TO_CONN(q)->conn_zoneid; 4280Sstevel@tonic-gate 4290Sstevel@tonic-gate /* 4300Sstevel@tonic-gate * Check privilege using the ioctl credential; if it is NULL 4310Sstevel@tonic-gate * then this is a kernel message and therefor privileged. 4320Sstevel@tonic-gate */ 4333448Sdh155122 if (ioc_cr != NULL && secpolicy_ip_config(ioc_cr, B_FALSE) != 0) 4340Sstevel@tonic-gate return (EPERM); 4350Sstevel@tonic-gate 4360Sstevel@tonic-gate ipic = (ipic_t *)mp->b_rptr; 4370Sstevel@tonic-gate if (!(addr_ucp = mi_offset_param(mp, ipic->ipic_addr_offset, 4380Sstevel@tonic-gate ipic->ipic_addr_length))) { 4390Sstevel@tonic-gate return (EINVAL); 4400Sstevel@tonic-gate } 4410Sstevel@tonic-gate if (!OK_32PTR(addr_ucp)) 4420Sstevel@tonic-gate return (EINVAL); 4430Sstevel@tonic-gate switch (ipic->ipic_addr_length) { 4440Sstevel@tonic-gate case IP_ADDR_LEN: { 4450Sstevel@tonic-gate /* Extract the destination address. */ 4460Sstevel@tonic-gate addr = *(ipaddr_t *)addr_ucp; 4470Sstevel@tonic-gate /* Find the corresponding IRE. */ 4483448Sdh155122 ire = ire_cache_lookup(addr, zoneid, NULL, ipst); 4490Sstevel@tonic-gate break; 4500Sstevel@tonic-gate } 4510Sstevel@tonic-gate case IPV6_ADDR_LEN: { 4520Sstevel@tonic-gate /* Extract the destination address. */ 4530Sstevel@tonic-gate v6addr = *(in6_addr_t *)addr_ucp; 4540Sstevel@tonic-gate /* Find the corresponding IRE. 
*/ 4553448Sdh155122 ire = ire_cache_lookup_v6(&v6addr, zoneid, NULL, ipst); 4560Sstevel@tonic-gate break; 4570Sstevel@tonic-gate } 4580Sstevel@tonic-gate default: 4590Sstevel@tonic-gate return (EINVAL); 4600Sstevel@tonic-gate } 4610Sstevel@tonic-gate 4620Sstevel@tonic-gate if (ire == NULL) 4630Sstevel@tonic-gate return (ENOENT); 4640Sstevel@tonic-gate /* 4650Sstevel@tonic-gate * Update the round trip time estimate and/or the max frag size 4660Sstevel@tonic-gate * and/or the slow start threshold. 4670Sstevel@tonic-gate * 4680Sstevel@tonic-gate * We serialize multiple advises using ire_lock. 4690Sstevel@tonic-gate */ 4700Sstevel@tonic-gate mutex_enter(&ire->ire_lock); 4710Sstevel@tonic-gate if (ipic->ipic_rtt) { 4720Sstevel@tonic-gate /* 4730Sstevel@tonic-gate * If there is no old cached values, initialize them 4740Sstevel@tonic-gate * conservatively. Set them to be (1.5 * new value). 4750Sstevel@tonic-gate */ 4760Sstevel@tonic-gate if (ire->ire_uinfo.iulp_rtt != 0) { 4770Sstevel@tonic-gate ire->ire_uinfo.iulp_rtt = (ire->ire_uinfo.iulp_rtt + 4780Sstevel@tonic-gate ipic->ipic_rtt) >> 1; 4790Sstevel@tonic-gate } else { 4800Sstevel@tonic-gate ire->ire_uinfo.iulp_rtt = ipic->ipic_rtt + 4810Sstevel@tonic-gate (ipic->ipic_rtt >> 1); 4820Sstevel@tonic-gate } 4830Sstevel@tonic-gate if (ire->ire_uinfo.iulp_rtt_sd != 0) { 4840Sstevel@tonic-gate ire->ire_uinfo.iulp_rtt_sd = 4850Sstevel@tonic-gate (ire->ire_uinfo.iulp_rtt_sd + 4860Sstevel@tonic-gate ipic->ipic_rtt_sd) >> 1; 4870Sstevel@tonic-gate } else { 4880Sstevel@tonic-gate ire->ire_uinfo.iulp_rtt_sd = ipic->ipic_rtt_sd + 4890Sstevel@tonic-gate (ipic->ipic_rtt_sd >> 1); 4900Sstevel@tonic-gate } 4910Sstevel@tonic-gate } 4920Sstevel@tonic-gate if (ipic->ipic_max_frag) 4930Sstevel@tonic-gate ire->ire_max_frag = MIN(ipic->ipic_max_frag, IP_MAXPACKET); 4940Sstevel@tonic-gate if (ipic->ipic_ssthresh != 0) { 4950Sstevel@tonic-gate if (ire->ire_uinfo.iulp_ssthresh != 0) 4960Sstevel@tonic-gate ire->ire_uinfo.iulp_ssthresh = 
4970Sstevel@tonic-gate (ipic->ipic_ssthresh + 4980Sstevel@tonic-gate ire->ire_uinfo.iulp_ssthresh) >> 1; 4990Sstevel@tonic-gate else 5000Sstevel@tonic-gate ire->ire_uinfo.iulp_ssthresh = ipic->ipic_ssthresh; 5010Sstevel@tonic-gate } 5020Sstevel@tonic-gate /* 5030Sstevel@tonic-gate * Don't need the ire_lock below this. ire_type does not change 5040Sstevel@tonic-gate * after initialization. ire_marks is protected by irb_lock. 5050Sstevel@tonic-gate */ 5060Sstevel@tonic-gate mutex_exit(&ire->ire_lock); 5070Sstevel@tonic-gate 5080Sstevel@tonic-gate if (ipic->ipic_ire_marks != 0 && ire->ire_type == IRE_CACHE) { 5090Sstevel@tonic-gate /* 5100Sstevel@tonic-gate * Only increment the temporary IRE count if the original 5110Sstevel@tonic-gate * IRE is not already marked temporary. 5120Sstevel@tonic-gate */ 5130Sstevel@tonic-gate irb = ire->ire_bucket; 5140Sstevel@tonic-gate rw_enter(&irb->irb_lock, RW_WRITER); 5150Sstevel@tonic-gate if ((ipic->ipic_ire_marks & IRE_MARK_TEMPORARY) && 5160Sstevel@tonic-gate !(ire->ire_marks & IRE_MARK_TEMPORARY)) { 5170Sstevel@tonic-gate irb->irb_tmp_ire_cnt++; 5180Sstevel@tonic-gate } 5190Sstevel@tonic-gate ire->ire_marks |= ipic->ipic_ire_marks; 5200Sstevel@tonic-gate rw_exit(&irb->irb_lock); 5210Sstevel@tonic-gate } 5220Sstevel@tonic-gate 5230Sstevel@tonic-gate ire_refrele(ire); 5240Sstevel@tonic-gate return (0); 5250Sstevel@tonic-gate } 5260Sstevel@tonic-gate 5270Sstevel@tonic-gate /* 5280Sstevel@tonic-gate * This function is associated with the IP_IOC_IRE_DELETE[_NO_REPLY] 5290Sstevel@tonic-gate * IOCTL[s]. The NO_REPLY form is used by TCP to delete a route IRE 5300Sstevel@tonic-gate * for a host that is not responding. This will force an attempt to 5312612Scarlsonj * establish a new route, if available, and flush out the ARP entry so 5322612Scarlsonj * it will re-resolve. Management processes may want to use the 5332612Scarlsonj * version that generates a reply. 
5340Sstevel@tonic-gate * 5350Sstevel@tonic-gate * This function does not support IPv6 since Neighbor Unreachability Detection 5360Sstevel@tonic-gate * means that negative advise like this is useless. 5370Sstevel@tonic-gate */ 5380Sstevel@tonic-gate /* ARGSUSED */ 5390Sstevel@tonic-gate int 5400Sstevel@tonic-gate ip_ire_delete(queue_t *q, mblk_t *mp, cred_t *ioc_cr) 5410Sstevel@tonic-gate { 5422535Ssangeeta uchar_t *addr_ucp; 5430Sstevel@tonic-gate ipaddr_t addr; 5442535Ssangeeta ire_t *ire; 5452535Ssangeeta ipid_t *ipid; 5462535Ssangeeta boolean_t routing_sock_info = B_FALSE; /* Sent info? */ 5470Sstevel@tonic-gate zoneid_t zoneid; 5482535Ssangeeta ire_t *gire = NULL; 5492612Scarlsonj ill_t *ill; 5502612Scarlsonj mblk_t *arp_mp; 5513448Sdh155122 ip_stack_t *ipst; 5520Sstevel@tonic-gate 5530Sstevel@tonic-gate ASSERT(q->q_next == NULL); 5540Sstevel@tonic-gate zoneid = Q_TO_CONN(q)->conn_zoneid; 5553448Sdh155122 ipst = CONNQ_TO_IPST(q); 5560Sstevel@tonic-gate 5570Sstevel@tonic-gate /* 5580Sstevel@tonic-gate * Check privilege using the ioctl credential; if it is NULL 5590Sstevel@tonic-gate * then this is a kernel message and therefor privileged. 5600Sstevel@tonic-gate */ 5613448Sdh155122 if (ioc_cr != NULL && secpolicy_ip_config(ioc_cr, B_FALSE) != 0) 5620Sstevel@tonic-gate return (EPERM); 5630Sstevel@tonic-gate 5640Sstevel@tonic-gate ipid = (ipid_t *)mp->b_rptr; 5650Sstevel@tonic-gate 5660Sstevel@tonic-gate /* Only actions on IRE_CACHEs are acceptable at present. 
*/ 5670Sstevel@tonic-gate if (ipid->ipid_ire_type != IRE_CACHE) 5680Sstevel@tonic-gate return (EINVAL); 5690Sstevel@tonic-gate 5700Sstevel@tonic-gate addr_ucp = mi_offset_param(mp, ipid->ipid_addr_offset, 5714714Ssowmini ipid->ipid_addr_length); 5720Sstevel@tonic-gate if (addr_ucp == NULL || !OK_32PTR(addr_ucp)) 5730Sstevel@tonic-gate return (EINVAL); 5740Sstevel@tonic-gate switch (ipid->ipid_addr_length) { 5750Sstevel@tonic-gate case IP_ADDR_LEN: 5760Sstevel@tonic-gate /* addr_ucp points at IP addr */ 5770Sstevel@tonic-gate break; 5780Sstevel@tonic-gate case sizeof (sin_t): { 5790Sstevel@tonic-gate sin_t *sin; 5800Sstevel@tonic-gate /* 5810Sstevel@tonic-gate * got complete (sockaddr) address - increment addr_ucp to point 5820Sstevel@tonic-gate * at the ip_addr field. 5830Sstevel@tonic-gate */ 5840Sstevel@tonic-gate sin = (sin_t *)addr_ucp; 5850Sstevel@tonic-gate addr_ucp = (uchar_t *)&sin->sin_addr.s_addr; 5860Sstevel@tonic-gate break; 5870Sstevel@tonic-gate } 5880Sstevel@tonic-gate default: 5890Sstevel@tonic-gate return (EINVAL); 5900Sstevel@tonic-gate } 5910Sstevel@tonic-gate /* Extract the destination address. */ 5920Sstevel@tonic-gate bcopy(addr_ucp, &addr, IP_ADDR_LEN); 5930Sstevel@tonic-gate 5940Sstevel@tonic-gate /* Try to find the CACHED IRE. */ 5953448Sdh155122 ire = ire_cache_lookup(addr, zoneid, NULL, ipst); 5960Sstevel@tonic-gate 5970Sstevel@tonic-gate /* Nail it. */ 5980Sstevel@tonic-gate if (ire) { 5990Sstevel@tonic-gate /* Allow delete only on CACHE entries */ 6000Sstevel@tonic-gate if (ire->ire_type != IRE_CACHE) { 6010Sstevel@tonic-gate ire_refrele(ire); 6020Sstevel@tonic-gate return (EINVAL); 6030Sstevel@tonic-gate } 6040Sstevel@tonic-gate 6050Sstevel@tonic-gate /* 6060Sstevel@tonic-gate * Verify that the IRE has been around for a while. 6070Sstevel@tonic-gate * This is to protect against transport protocols 6080Sstevel@tonic-gate * that are too eager in sending delete messages. 
6090Sstevel@tonic-gate */ 6100Sstevel@tonic-gate if (gethrestime_sec() < 6113448Sdh155122 ire->ire_create_time + ipst->ips_ip_ignore_delete_time) { 6120Sstevel@tonic-gate ire_refrele(ire); 6130Sstevel@tonic-gate return (EINVAL); 6140Sstevel@tonic-gate } 6150Sstevel@tonic-gate /* 6160Sstevel@tonic-gate * Now we have a potentially dead cache entry. We need 6170Sstevel@tonic-gate * to remove it. 6182535Ssangeeta * If this cache entry is generated from a 6192535Ssangeeta * default route (i.e., ire_cmask == 0), 6200Sstevel@tonic-gate * search the default list and mark it dead and some 6210Sstevel@tonic-gate * background process will try to activate it. 6220Sstevel@tonic-gate */ 6230Sstevel@tonic-gate if ((ire->ire_gateway_addr != 0) && (ire->ire_cmask == 0)) { 6240Sstevel@tonic-gate /* 6250Sstevel@tonic-gate * Make sure that we pick a different 6260Sstevel@tonic-gate * IRE_DEFAULT next time. 6270Sstevel@tonic-gate */ 6280Sstevel@tonic-gate ire_t *gw_ire; 6292535Ssangeeta irb_t *irb = NULL; 6302535Ssangeeta uint_t match_flags; 6312535Ssangeeta 6322535Ssangeeta match_flags = (MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE); 6332535Ssangeeta 6342535Ssangeeta gire = ire_ftable_lookup(ire->ire_addr, 6352535Ssangeeta ire->ire_cmask, 0, 0, 6363448Sdh155122 ire->ire_ipif, NULL, zoneid, 0, NULL, match_flags, 6373448Sdh155122 ipst); 6382535Ssangeeta 6392535Ssangeeta ip3dbg(("ire_ftable_lookup() returned gire %p\n", 6402535Ssangeeta (void *)gire)); 6412535Ssangeeta 6422535Ssangeeta if (gire != NULL) { 6432535Ssangeeta irb = gire->ire_bucket; 6440Sstevel@tonic-gate 6450Sstevel@tonic-gate /* 6460Sstevel@tonic-gate * We grab it as writer just to serialize 6470Sstevel@tonic-gate * multiple threads trying to bump up 6482535Ssangeeta * irb_rr_origin 6490Sstevel@tonic-gate */ 6500Sstevel@tonic-gate rw_enter(&irb->irb_lock, RW_WRITER); 6512535Ssangeeta if ((gw_ire = irb->irb_rr_origin) == NULL) { 6520Sstevel@tonic-gate rw_exit(&irb->irb_lock); 6530Sstevel@tonic-gate goto done; 
6540Sstevel@tonic-gate } 6552535Ssangeeta 6562894Ssowmini DTRACE_PROBE1(ip__ire__del__origin, 6572894Ssowmini (ire_t *), gw_ire); 6580Sstevel@tonic-gate 6590Sstevel@tonic-gate /* Skip past the potentially bad gateway */ 6600Sstevel@tonic-gate if (ire->ire_gateway_addr == 6612894Ssowmini gw_ire->ire_gateway_addr) { 6622894Ssowmini ire_t *next = gw_ire->ire_next; 6632894Ssowmini 6642894Ssowmini DTRACE_PROBE2(ip__ire__del, 6652894Ssowmini (ire_t *), gw_ire, (irb_t *), irb); 6662894Ssowmini IRE_FIND_NEXT_ORIGIN(next); 6672894Ssowmini irb->irb_rr_origin = next; 6682894Ssowmini } 6690Sstevel@tonic-gate rw_exit(&irb->irb_lock); 6702535Ssangeeta } 6710Sstevel@tonic-gate } 6720Sstevel@tonic-gate done: 6732535Ssangeeta if (gire != NULL) 6742535Ssangeeta IRE_REFRELE(gire); 6750Sstevel@tonic-gate /* report the bad route to routing sockets */ 6760Sstevel@tonic-gate ip_rts_change(RTM_LOSING, ire->ire_addr, ire->ire_gateway_addr, 6770Sstevel@tonic-gate ire->ire_mask, ire->ire_src_addr, 0, 0, 0, 6783448Sdh155122 (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA), ipst); 6790Sstevel@tonic-gate routing_sock_info = B_TRUE; 6802612Scarlsonj 6812612Scarlsonj /* 6822612Scarlsonj * TCP is really telling us to start over completely, and it 6832612Scarlsonj * expects that we'll resend the ARP query. Tell ARP to 6842612Scarlsonj * discard the entry, if this is a local destination. 6857398SZhijun.Fu@Sun.COM * 6867398SZhijun.Fu@Sun.COM * But, if the ARP entry is permanent then it shouldn't be 6877398SZhijun.Fu@Sun.COM * deleted, so we set ARED_F_PRESERVE_PERM. 
6882612Scarlsonj */ 6892612Scarlsonj ill = ire->ire_stq->q_ptr; 6902612Scarlsonj if (ire->ire_gateway_addr == 0 && 6912612Scarlsonj (arp_mp = ill_ared_alloc(ill, addr)) != NULL) { 6927398SZhijun.Fu@Sun.COM ared_t *ared = (ared_t *)arp_mp->b_rptr; 6937398SZhijun.Fu@Sun.COM 6947398SZhijun.Fu@Sun.COM ASSERT(ared->ared_cmd == AR_ENTRY_DELETE); 6957398SZhijun.Fu@Sun.COM ared->ared_flags |= ARED_F_PRESERVE_PERM; 6962612Scarlsonj putnext(ill->ill_rq, arp_mp); 6972612Scarlsonj } 6982612Scarlsonj 6990Sstevel@tonic-gate ire_delete(ire); 7000Sstevel@tonic-gate ire_refrele(ire); 7010Sstevel@tonic-gate } 7023004Sdd193516 /* 7033004Sdd193516 * Also look for an IRE_HOST type redirect ire and 7043004Sdd193516 * remove it if present. 7053004Sdd193516 */ 7063004Sdd193516 ire = ire_route_lookup(addr, 0, 0, IRE_HOST, NULL, NULL, 7073448Sdh155122 ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 7080Sstevel@tonic-gate 7090Sstevel@tonic-gate /* Nail it. */ 7103004Sdd193516 if (ire != NULL) { 7114714Ssowmini if (ire->ire_flags & RTF_DYNAMIC) { 7124714Ssowmini if (!routing_sock_info) { 7134714Ssowmini ip_rts_change(RTM_LOSING, ire->ire_addr, 7144714Ssowmini ire->ire_gateway_addr, ire->ire_mask, 7154714Ssowmini ire->ire_src_addr, 0, 0, 0, 7164714Ssowmini (RTA_DST | RTA_GATEWAY | 7174714Ssowmini RTA_NETMASK | RTA_IFA), 7184714Ssowmini ipst); 7194714Ssowmini } 7204714Ssowmini ire_delete(ire); 7210Sstevel@tonic-gate } 7224714Ssowmini ire_refrele(ire); 7230Sstevel@tonic-gate } 7240Sstevel@tonic-gate return (0); 7250Sstevel@tonic-gate } 7260Sstevel@tonic-gate 7270Sstevel@tonic-gate /* 7280Sstevel@tonic-gate * ip_ire_req is called by ip_wput when an IRE_DB_REQ_TYPE message is handed 7290Sstevel@tonic-gate * down from the Upper Level Protocol to request a copy of the IRE (to check 7300Sstevel@tonic-gate * its type or to extract information like round-trip time estimates or the 7310Sstevel@tonic-gate * MTU.) 7320Sstevel@tonic-gate * The address is assumed to be in the ire_addr field. 
If no IRE is found 7330Sstevel@tonic-gate * an IRE is returned with ire_type being zero. 7340Sstevel@tonic-gate * Note that the upper lavel protocol has to check for broadcast 7350Sstevel@tonic-gate * (IRE_BROADCAST) and multicast (CLASSD(addr)). 7360Sstevel@tonic-gate * If there is a b_cont the resulting IRE_DB_TYPE mblk is placed at the 7370Sstevel@tonic-gate * end of the returned message. 7380Sstevel@tonic-gate * 7390Sstevel@tonic-gate * TCP sends down a message of this type with a connection request packet 7400Sstevel@tonic-gate * chained on. UDP and ICMP send it down to verify that a route exists for 7410Sstevel@tonic-gate * the destination address when they get connected. 7420Sstevel@tonic-gate */ 7430Sstevel@tonic-gate void 7440Sstevel@tonic-gate ip_ire_req(queue_t *q, mblk_t *mp) 7450Sstevel@tonic-gate { 7460Sstevel@tonic-gate ire_t *inire; 7470Sstevel@tonic-gate ire_t *ire; 7480Sstevel@tonic-gate mblk_t *mp1; 7490Sstevel@tonic-gate ire_t *sire = NULL; 7500Sstevel@tonic-gate zoneid_t zoneid = Q_TO_CONN(q)->conn_zoneid; 7513448Sdh155122 ip_stack_t *ipst = CONNQ_TO_IPST(q); 7523448Sdh155122 7533448Sdh155122 ASSERT(q->q_next == NULL); 7540Sstevel@tonic-gate 7550Sstevel@tonic-gate if ((mp->b_wptr - mp->b_rptr) < sizeof (ire_t) || 7560Sstevel@tonic-gate !OK_32PTR(mp->b_rptr)) { 7570Sstevel@tonic-gate freemsg(mp); 7580Sstevel@tonic-gate return; 7590Sstevel@tonic-gate } 7600Sstevel@tonic-gate inire = (ire_t *)mp->b_rptr; 7610Sstevel@tonic-gate /* 7620Sstevel@tonic-gate * Got it, now take our best shot at an IRE. 
7630Sstevel@tonic-gate */ 7640Sstevel@tonic-gate if (inire->ire_ipversion == IPV6_VERSION) { 7650Sstevel@tonic-gate ire = ire_route_lookup_v6(&inire->ire_addr_v6, 0, 0, 0, 7661676Sjpk NULL, &sire, zoneid, NULL, 7673448Sdh155122 (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT), ipst); 7680Sstevel@tonic-gate } else { 7690Sstevel@tonic-gate ASSERT(inire->ire_ipversion == IPV4_VERSION); 7700Sstevel@tonic-gate ire = ire_route_lookup(inire->ire_addr, 0, 0, 0, 7711676Sjpk NULL, &sire, zoneid, NULL, 7723448Sdh155122 (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT), ipst); 7730Sstevel@tonic-gate } 7740Sstevel@tonic-gate 7750Sstevel@tonic-gate /* 7760Sstevel@tonic-gate * We prevent returning IRES with source address INADDR_ANY 7770Sstevel@tonic-gate * as these were temporarily created for sending packets 7780Sstevel@tonic-gate * from endpoints that have conn_unspec_src set. 7790Sstevel@tonic-gate */ 7800Sstevel@tonic-gate if (ire == NULL || 7810Sstevel@tonic-gate (ire->ire_ipversion == IPV4_VERSION && 7820Sstevel@tonic-gate ire->ire_src_addr == INADDR_ANY) || 7830Sstevel@tonic-gate (ire->ire_ipversion == IPV6_VERSION && 7840Sstevel@tonic-gate IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6))) { 7850Sstevel@tonic-gate inire->ire_type = 0; 7860Sstevel@tonic-gate } else { 7870Sstevel@tonic-gate bcopy(ire, inire, sizeof (ire_t)); 7880Sstevel@tonic-gate /* Copy the route metrics from the parent. */ 7890Sstevel@tonic-gate if (sire != NULL) { 7900Sstevel@tonic-gate bcopy(&(sire->ire_uinfo), &(inire->ire_uinfo), 7910Sstevel@tonic-gate sizeof (iulp_t)); 7920Sstevel@tonic-gate } 7930Sstevel@tonic-gate 7940Sstevel@tonic-gate /* 7950Sstevel@tonic-gate * As we don't lookup global policy here, we may not 7960Sstevel@tonic-gate * pass the right size if per-socket policy is not 7970Sstevel@tonic-gate * present. For these cases, path mtu discovery will 7980Sstevel@tonic-gate * do the right thing. 
7990Sstevel@tonic-gate */ 8000Sstevel@tonic-gate inire->ire_ipsec_overhead = conn_ipsec_length(Q_TO_CONN(q)); 8010Sstevel@tonic-gate 8020Sstevel@tonic-gate /* Pass the latest setting of the ip_path_mtu_discovery */ 8033448Sdh155122 inire->ire_frag_flag |= 8043448Sdh155122 (ipst->ips_ip_path_mtu_discovery) ? IPH_DF : 0; 8050Sstevel@tonic-gate } 8060Sstevel@tonic-gate if (ire != NULL) 8070Sstevel@tonic-gate ire_refrele(ire); 8080Sstevel@tonic-gate if (sire != NULL) 8090Sstevel@tonic-gate ire_refrele(sire); 8100Sstevel@tonic-gate mp->b_wptr = &mp->b_rptr[sizeof (ire_t)]; 8110Sstevel@tonic-gate mp->b_datap->db_type = IRE_DB_TYPE; 8120Sstevel@tonic-gate 8130Sstevel@tonic-gate /* Put the IRE_DB_TYPE mblk last in the chain */ 8140Sstevel@tonic-gate mp1 = mp->b_cont; 8150Sstevel@tonic-gate if (mp1 != NULL) { 8160Sstevel@tonic-gate mp->b_cont = NULL; 8170Sstevel@tonic-gate linkb(mp1, mp); 8180Sstevel@tonic-gate mp = mp1; 8190Sstevel@tonic-gate } 8200Sstevel@tonic-gate qreply(q, mp); 8210Sstevel@tonic-gate } 8220Sstevel@tonic-gate 8230Sstevel@tonic-gate /* 8240Sstevel@tonic-gate * Send a packet using the specified IRE. 8250Sstevel@tonic-gate * If ire_src_addr_v6 is all zero then discard the IRE after 8260Sstevel@tonic-gate * the packet has been sent. 
8270Sstevel@tonic-gate */ 8280Sstevel@tonic-gate static void 8290Sstevel@tonic-gate ire_send(queue_t *q, mblk_t *pkt, ire_t *ire) 8300Sstevel@tonic-gate { 8310Sstevel@tonic-gate mblk_t *ipsec_mp; 8320Sstevel@tonic-gate boolean_t is_secure; 8330Sstevel@tonic-gate uint_t ifindex; 8340Sstevel@tonic-gate ill_t *ill; 8352733Snordmark zoneid_t zoneid = ire->ire_zoneid; 8363448Sdh155122 ip_stack_t *ipst = ire->ire_ipst; 8370Sstevel@tonic-gate 8380Sstevel@tonic-gate ASSERT(ire->ire_ipversion == IPV4_VERSION); 8392733Snordmark ASSERT(!(ire->ire_type & IRE_LOCAL)); /* Has different ire_zoneid */ 8400Sstevel@tonic-gate ipsec_mp = pkt; 8410Sstevel@tonic-gate is_secure = (pkt->b_datap->db_type == M_CTL); 8422733Snordmark if (is_secure) { 8432733Snordmark ipsec_out_t *io; 8442733Snordmark 8450Sstevel@tonic-gate pkt = pkt->b_cont; 8462733Snordmark io = (ipsec_out_t *)ipsec_mp->b_rptr; 8472733Snordmark if (io->ipsec_out_type == IPSEC_OUT) 8482733Snordmark zoneid = io->ipsec_out_zoneid; 8492733Snordmark } 8500Sstevel@tonic-gate 8510Sstevel@tonic-gate /* If the packet originated externally then */ 8520Sstevel@tonic-gate if (pkt->b_prev) { 8530Sstevel@tonic-gate ire_refrele(ire); 8540Sstevel@tonic-gate /* 8550Sstevel@tonic-gate * Extract the ifindex from b_prev (set in ip_rput_noire). 
8560Sstevel@tonic-gate * Look up interface to see if it still exists (it could have 8570Sstevel@tonic-gate * been unplumbed by the time the reply came back from ARP) 8580Sstevel@tonic-gate */ 8590Sstevel@tonic-gate ifindex = (uint_t)(uintptr_t)pkt->b_prev; 8600Sstevel@tonic-gate ill = ill_lookup_on_ifindex(ifindex, B_FALSE, 8613448Sdh155122 NULL, NULL, NULL, NULL, ipst); 8620Sstevel@tonic-gate if (ill == NULL) { 8630Sstevel@tonic-gate pkt->b_prev = NULL; 8640Sstevel@tonic-gate pkt->b_next = NULL; 8650Sstevel@tonic-gate freemsg(ipsec_mp); 8660Sstevel@tonic-gate return; 8670Sstevel@tonic-gate } 8680Sstevel@tonic-gate q = ill->ill_rq; 8690Sstevel@tonic-gate pkt->b_prev = NULL; 8700Sstevel@tonic-gate /* 8710Sstevel@tonic-gate * This packet has not gone through IPSEC processing 8720Sstevel@tonic-gate * and hence we should not have any IPSEC message 8730Sstevel@tonic-gate * prepended. 8740Sstevel@tonic-gate */ 8750Sstevel@tonic-gate ASSERT(ipsec_mp == pkt); 8762535Ssangeeta put(q, pkt); 8770Sstevel@tonic-gate ill_refrele(ill); 8780Sstevel@tonic-gate } else if (pkt->b_next) { 8790Sstevel@tonic-gate /* Packets from multicast router */ 8800Sstevel@tonic-gate pkt->b_next = NULL; 8810Sstevel@tonic-gate /* 8820Sstevel@tonic-gate * We never get the IPSEC_OUT while forwarding the 8830Sstevel@tonic-gate * packet for multicast router. 8840Sstevel@tonic-gate */ 8850Sstevel@tonic-gate ASSERT(ipsec_mp == pkt); 8860Sstevel@tonic-gate ip_rput_forward(ire, (ipha_t *)pkt->b_rptr, ipsec_mp, NULL); 8870Sstevel@tonic-gate ire_refrele(ire); 8880Sstevel@tonic-gate } else { 8890Sstevel@tonic-gate /* Locally originated packets */ 8907216Smeem boolean_t delete_ire = B_FALSE; 8910Sstevel@tonic-gate ipha_t *ipha = (ipha_t *)pkt->b_rptr; 8920Sstevel@tonic-gate 8930Sstevel@tonic-gate /* 8947216Smeem * If this IRE shouldn't be kept in the table (because its 8957216Smeem * source address is unspecified), hold a reference to it so 8967216Smeem * we can delete it even after e.g. 
ip_wput_ire() has dropped 8977216Smeem * its reference. 8980Sstevel@tonic-gate */ 8997216Smeem if (!(ire->ire_marks & IRE_MARK_NOADD) && 9007216Smeem ire->ire_src_addr == INADDR_ANY) { 9017216Smeem delete_ire = B_TRUE; 9020Sstevel@tonic-gate IRE_REFHOLD(ire); 9030Sstevel@tonic-gate } 9047216Smeem 9050Sstevel@tonic-gate /* 9060Sstevel@tonic-gate * If we were resolving a router we can not use the 9070Sstevel@tonic-gate * routers IRE for sending the packet (since it would 9080Sstevel@tonic-gate * violate the uniqness of the IP idents) thus we 9090Sstevel@tonic-gate * make another pass through ip_wput to create the IRE_CACHE 9100Sstevel@tonic-gate * for the destination. 9110Sstevel@tonic-gate * When IRE_MARK_NOADD is set, ire_add() is not called. 9120Sstevel@tonic-gate * Thus ip_wput() will never find a ire and result in an 9130Sstevel@tonic-gate * infinite loop. Thus we check whether IRE_MARK_NOADD is 9140Sstevel@tonic-gate * is set. This also implies that IRE_MARK_NOADD can only be 9150Sstevel@tonic-gate * used to send packets to directly connected hosts. 
9160Sstevel@tonic-gate */ 9170Sstevel@tonic-gate if (ipha->ipha_dst != ire->ire_addr && 9180Sstevel@tonic-gate !(ire->ire_marks & IRE_MARK_NOADD)) { 9190Sstevel@tonic-gate ire_refrele(ire); /* Held in ire_add */ 9202733Snordmark if (CONN_Q(q)) { 9212733Snordmark (void) ip_output(Q_TO_CONN(q), ipsec_mp, q, 9222733Snordmark IRE_SEND); 9232733Snordmark } else { 9242733Snordmark (void) ip_output((void *)(uintptr_t)zoneid, 9252733Snordmark ipsec_mp, q, IRE_SEND); 9262733Snordmark } 9270Sstevel@tonic-gate } else { 9280Sstevel@tonic-gate if (is_secure) { 9290Sstevel@tonic-gate ipsec_out_t *oi; 9300Sstevel@tonic-gate ipha_t *ipha; 9310Sstevel@tonic-gate 9320Sstevel@tonic-gate oi = (ipsec_out_t *)ipsec_mp->b_rptr; 9330Sstevel@tonic-gate ipha = (ipha_t *)ipsec_mp->b_cont->b_rptr; 9340Sstevel@tonic-gate if (oi->ipsec_out_proc_begin) { 9350Sstevel@tonic-gate /* 9360Sstevel@tonic-gate * This is the case where 9370Sstevel@tonic-gate * ip_wput_ipsec_out could not find 9380Sstevel@tonic-gate * the IRE and recreated a new one. 9390Sstevel@tonic-gate * As ip_wput_ipsec_out does ire 9400Sstevel@tonic-gate * lookups, ire_refrele for the extra 9410Sstevel@tonic-gate * bump in ire_add. 9420Sstevel@tonic-gate */ 9430Sstevel@tonic-gate ire_refrele(ire); 9440Sstevel@tonic-gate ip_wput_ipsec_out(q, ipsec_mp, ipha, 9450Sstevel@tonic-gate NULL, NULL); 9460Sstevel@tonic-gate } else { 9470Sstevel@tonic-gate /* 9480Sstevel@tonic-gate * IRE_REFRELE will be done in 9490Sstevel@tonic-gate * ip_wput_ire. 9500Sstevel@tonic-gate */ 9510Sstevel@tonic-gate ip_wput_ire(q, ipsec_mp, ire, NULL, 9522733Snordmark IRE_SEND, zoneid); 9530Sstevel@tonic-gate } 9540Sstevel@tonic-gate } else { 9550Sstevel@tonic-gate /* 9560Sstevel@tonic-gate * IRE_REFRELE will be done in ip_wput_ire. 
9570Sstevel@tonic-gate */ 9580Sstevel@tonic-gate ip_wput_ire(q, ipsec_mp, ire, NULL, 9592733Snordmark IRE_SEND, zoneid); 9600Sstevel@tonic-gate } 9610Sstevel@tonic-gate } 9620Sstevel@tonic-gate /* 9630Sstevel@tonic-gate * Special code to support sending a single packet with 9640Sstevel@tonic-gate * conn_unspec_src using an IRE which has no source address. 9650Sstevel@tonic-gate * The IRE is deleted here after sending the packet to avoid 9660Sstevel@tonic-gate * having other code trip on it. But before we delete the 9670Sstevel@tonic-gate * ire, somebody could have looked up this ire. 9680Sstevel@tonic-gate * We prevent returning/using this IRE by the upper layers 9690Sstevel@tonic-gate * by making checks to NULL source address in other places 9700Sstevel@tonic-gate * like e.g ip_ire_append, ip_ire_req and ip_bind_connected. 9717216Smeem * Though this does not completely prevent other threads 9720Sstevel@tonic-gate * from using this ire, this should not cause any problems. 9730Sstevel@tonic-gate */ 9747216Smeem if (delete_ire) { 9757216Smeem ip1dbg(("ire_send: delete IRE\n")); 9767216Smeem ire_delete(ire); 9770Sstevel@tonic-gate ire_refrele(ire); /* Held above */ 9780Sstevel@tonic-gate } 9790Sstevel@tonic-gate } 9800Sstevel@tonic-gate } 9810Sstevel@tonic-gate 9820Sstevel@tonic-gate /* 9830Sstevel@tonic-gate * Send a packet using the specified IRE. 9840Sstevel@tonic-gate * If ire_src_addr_v6 is all zero then discard the IRE after 9850Sstevel@tonic-gate * the packet has been sent. 
9860Sstevel@tonic-gate */ 9870Sstevel@tonic-gate static void 9880Sstevel@tonic-gate ire_send_v6(queue_t *q, mblk_t *pkt, ire_t *ire) 9890Sstevel@tonic-gate { 9900Sstevel@tonic-gate mblk_t *ipsec_mp; 9910Sstevel@tonic-gate boolean_t secure; 9920Sstevel@tonic-gate uint_t ifindex; 9932733Snordmark zoneid_t zoneid = ire->ire_zoneid; 9943448Sdh155122 ip_stack_t *ipst = ire->ire_ipst; 9950Sstevel@tonic-gate 9960Sstevel@tonic-gate ASSERT(ire->ire_ipversion == IPV6_VERSION); 9972733Snordmark ASSERT(!(ire->ire_type & IRE_LOCAL)); /* Has different ire_zoneid */ 9980Sstevel@tonic-gate if (pkt->b_datap->db_type == M_CTL) { 9992733Snordmark ipsec_out_t *io; 10002733Snordmark 10010Sstevel@tonic-gate ipsec_mp = pkt; 10020Sstevel@tonic-gate pkt = pkt->b_cont; 10030Sstevel@tonic-gate secure = B_TRUE; 10042733Snordmark io = (ipsec_out_t *)ipsec_mp->b_rptr; 10052733Snordmark if (io->ipsec_out_type == IPSEC_OUT) 10062733Snordmark zoneid = io->ipsec_out_zoneid; 10070Sstevel@tonic-gate } else { 10080Sstevel@tonic-gate ipsec_mp = pkt; 10090Sstevel@tonic-gate secure = B_FALSE; 10100Sstevel@tonic-gate } 10110Sstevel@tonic-gate 10120Sstevel@tonic-gate /* If the packet originated externally then */ 10130Sstevel@tonic-gate if (pkt->b_prev) { 10140Sstevel@tonic-gate ill_t *ill; 10150Sstevel@tonic-gate /* 10160Sstevel@tonic-gate * Extract the ifindex from b_prev (set in ip_rput_data_v6). 10170Sstevel@tonic-gate * Look up interface to see if it still exists (it could have 10180Sstevel@tonic-gate * been unplumbed by the time the reply came back from the 10192535Ssangeeta * resolver). 
10200Sstevel@tonic-gate */ 10210Sstevel@tonic-gate ifindex = (uint_t)(uintptr_t)pkt->b_prev; 10220Sstevel@tonic-gate ill = ill_lookup_on_ifindex(ifindex, B_TRUE, 10233448Sdh155122 NULL, NULL, NULL, NULL, ipst); 10240Sstevel@tonic-gate if (ill == NULL) { 10250Sstevel@tonic-gate pkt->b_prev = NULL; 10260Sstevel@tonic-gate pkt->b_next = NULL; 10270Sstevel@tonic-gate freemsg(ipsec_mp); 10280Sstevel@tonic-gate ire_refrele(ire); /* Held in ire_add */ 10290Sstevel@tonic-gate return; 10300Sstevel@tonic-gate } 10310Sstevel@tonic-gate q = ill->ill_rq; 10320Sstevel@tonic-gate pkt->b_prev = NULL; 10330Sstevel@tonic-gate /* 10340Sstevel@tonic-gate * This packet has not gone through IPSEC processing 10350Sstevel@tonic-gate * and hence we should not have any IPSEC message 10360Sstevel@tonic-gate * prepended. 10370Sstevel@tonic-gate */ 10380Sstevel@tonic-gate ASSERT(ipsec_mp == pkt); 10390Sstevel@tonic-gate put(q, pkt); 10400Sstevel@tonic-gate ill_refrele(ill); 10410Sstevel@tonic-gate } else if (pkt->b_next) { 10420Sstevel@tonic-gate /* Packets from multicast router */ 10430Sstevel@tonic-gate pkt->b_next = NULL; 10440Sstevel@tonic-gate /* 10450Sstevel@tonic-gate * We never get the IPSEC_OUT while forwarding the 10460Sstevel@tonic-gate * packet for multicast router. 10470Sstevel@tonic-gate */ 10480Sstevel@tonic-gate ASSERT(ipsec_mp == pkt); 10490Sstevel@tonic-gate /* 10500Sstevel@tonic-gate * XXX TODO IPv6. 
10510Sstevel@tonic-gate */ 10520Sstevel@tonic-gate freemsg(pkt); 10530Sstevel@tonic-gate #ifdef XXX 10540Sstevel@tonic-gate ip_rput_forward(ire, (ipha_t *)pkt->b_rptr, pkt, NULL); 10550Sstevel@tonic-gate #endif 10560Sstevel@tonic-gate } else { 10570Sstevel@tonic-gate if (secure) { 10580Sstevel@tonic-gate ipsec_out_t *oi; 10590Sstevel@tonic-gate ip6_t *ip6h; 10600Sstevel@tonic-gate 10610Sstevel@tonic-gate oi = (ipsec_out_t *)ipsec_mp->b_rptr; 10620Sstevel@tonic-gate ip6h = (ip6_t *)ipsec_mp->b_cont->b_rptr; 10630Sstevel@tonic-gate if (oi->ipsec_out_proc_begin) { 10640Sstevel@tonic-gate /* 10650Sstevel@tonic-gate * This is the case where 10660Sstevel@tonic-gate * ip_wput_ipsec_out could not find 10670Sstevel@tonic-gate * the IRE and recreated a new one. 10680Sstevel@tonic-gate */ 10690Sstevel@tonic-gate ip_wput_ipsec_out_v6(q, ipsec_mp, ip6h, 10700Sstevel@tonic-gate NULL, NULL); 10710Sstevel@tonic-gate } else { 10722733Snordmark if (CONN_Q(q)) { 10732733Snordmark (void) ip_output_v6(Q_TO_CONN(q), 10742733Snordmark ipsec_mp, q, IRE_SEND); 10752733Snordmark } else { 10762733Snordmark (void) ip_output_v6( 10772733Snordmark (void *)(uintptr_t)zoneid, 10782733Snordmark ipsec_mp, q, IRE_SEND); 10792733Snordmark } 10800Sstevel@tonic-gate } 10810Sstevel@tonic-gate } else { 10820Sstevel@tonic-gate /* 10830Sstevel@tonic-gate * Send packets through ip_output_v6 so that any 10840Sstevel@tonic-gate * ip6_info header can be processed again. 10850Sstevel@tonic-gate */ 10862733Snordmark if (CONN_Q(q)) { 10872733Snordmark (void) ip_output_v6(Q_TO_CONN(q), ipsec_mp, q, 10882733Snordmark IRE_SEND); 10892733Snordmark } else { 10902733Snordmark (void) ip_output_v6((void *)(uintptr_t)zoneid, 10912733Snordmark ipsec_mp, q, IRE_SEND); 10922733Snordmark } 10930Sstevel@tonic-gate } 10940Sstevel@tonic-gate /* 10950Sstevel@tonic-gate * Special code to support sending a single packet with 10960Sstevel@tonic-gate * conn_unspec_src using an IRE which has no source address. 
10970Sstevel@tonic-gate * The IRE is deleted here after sending the packet to avoid 10980Sstevel@tonic-gate * having other code trip on it. But before we delete the 10990Sstevel@tonic-gate * ire, somebody could have looked up this ire. 11000Sstevel@tonic-gate * We prevent returning/using this IRE by the upper layers 11010Sstevel@tonic-gate * by making checks to NULL source address in other places 11020Sstevel@tonic-gate * like e.g ip_ire_append_v6, ip_ire_req and 11030Sstevel@tonic-gate * ip_bind_connected_v6. Though, this does not completely 11040Sstevel@tonic-gate * prevent other threads from using this ire, this should 11050Sstevel@tonic-gate * not cause any problems. 11060Sstevel@tonic-gate */ 11070Sstevel@tonic-gate if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6)) { 11080Sstevel@tonic-gate ip1dbg(("ire_send_v6: delete IRE\n")); 11090Sstevel@tonic-gate ire_delete(ire); 11100Sstevel@tonic-gate } 11110Sstevel@tonic-gate } 11120Sstevel@tonic-gate ire_refrele(ire); /* Held in ire_add */ 11130Sstevel@tonic-gate } 11140Sstevel@tonic-gate 11150Sstevel@tonic-gate /* 11160Sstevel@tonic-gate * Make sure that IRE bucket does not get too long. 11170Sstevel@tonic-gate * This can cause lock up because ire_cache_lookup() 11180Sstevel@tonic-gate * may take "forever" to finish. 11190Sstevel@tonic-gate * 11205388Sja97890 * We only remove a maximum of cnt IREs each time. This 11215388Sja97890 * should keep the bucket length approximately constant, 11220Sstevel@tonic-gate * depending on cnt. This should be enough to defend 11230Sstevel@tonic-gate * against DoS attack based on creating temporary IREs 11240Sstevel@tonic-gate * (for forwarding and non-TCP traffic). 11250Sstevel@tonic-gate * 11265388Sja97890 * We also pass in the address of the newly created IRE 11275388Sja97890 * as we do not want to remove this straight after adding 11285388Sja97890 * it. New IREs are normally added at the tail of the 11290Sstevel@tonic-gate * bucket. 
This means that we are removing the "oldest" 11305388Sja97890 * temporary IREs added. Only if there are IREs with 11310Sstevel@tonic-gate * the same ire_addr, do we not add it at the tail. Refer 11320Sstevel@tonic-gate * to ire_add_v*(). It should be OK for our purpose. 11330Sstevel@tonic-gate * 11340Sstevel@tonic-gate * For non-temporary cached IREs, we make sure that they 11350Sstevel@tonic-gate * have not been used for some time (defined below), they 11360Sstevel@tonic-gate * are non-local destinations, and there is no one using 11370Sstevel@tonic-gate * them at the moment (refcnt == 1). 11380Sstevel@tonic-gate * 11390Sstevel@tonic-gate * The above means that the IRE bucket length may become 11400Sstevel@tonic-gate * very long, consisting of mostly non-temporary IREs. 11410Sstevel@tonic-gate * This can happen when the hash function does a bad job 11420Sstevel@tonic-gate * so that most TCP connections cluster to a specific bucket. 11430Sstevel@tonic-gate * This "hopefully" should never happen. It can also 11440Sstevel@tonic-gate * happen if most TCP connections have very long lives. 11450Sstevel@tonic-gate * Even with the minimal hash table size of 256, there 11460Sstevel@tonic-gate * has to be a lot of such connections to make the bucket 11470Sstevel@tonic-gate * length unreasonably long. This should probably not 11480Sstevel@tonic-gate * happen either. The third can when this can happen is 11490Sstevel@tonic-gate * when the machine is under attack, such as SYN flooding. 11500Sstevel@tonic-gate * TCP should already have the proper mechanism to protect 11510Sstevel@tonic-gate * that. So we should be safe. 11520Sstevel@tonic-gate * 11530Sstevel@tonic-gate * This function is called by ire_add_then_send() after 11540Sstevel@tonic-gate * a new IRE is added and the packet is sent. 11550Sstevel@tonic-gate * 11560Sstevel@tonic-gate * The idle cutoff interval is set to 60s. It can be 11570Sstevel@tonic-gate * changed using /etc/system. 
11580Sstevel@tonic-gate */ 11590Sstevel@tonic-gate uint32_t ire_idle_cutoff_interval = 60000; 11600Sstevel@tonic-gate 11610Sstevel@tonic-gate static void 11625388Sja97890 ire_cache_cleanup(irb_t *irb, uint32_t threshold, ire_t *ref_ire) 11630Sstevel@tonic-gate { 11640Sstevel@tonic-gate ire_t *ire; 11650Sstevel@tonic-gate clock_t cut_off = drv_usectohz(ire_idle_cutoff_interval * 1000); 11665388Sja97890 int cnt = ip_ire_cleanup_cnt; 11670Sstevel@tonic-gate 11680Sstevel@tonic-gate /* 11695388Sja97890 * Try to remove cnt temporary IREs first. 11700Sstevel@tonic-gate */ 11715388Sja97890 for (ire = irb->irb_ire; cnt > 0 && ire != NULL; ire = ire->ire_next) { 11725388Sja97890 if (ire == ref_ire) 11735388Sja97890 continue; 11745388Sja97890 if (ire->ire_marks & IRE_MARK_CONDEMNED) 11755388Sja97890 continue; 11765388Sja97890 if (ire->ire_marks & IRE_MARK_TEMPORARY) { 11775388Sja97890 ASSERT(ire->ire_type == IRE_CACHE); 11785388Sja97890 ire_delete(ire); 11795388Sja97890 cnt--; 11805388Sja97890 } 11815388Sja97890 } 11825388Sja97890 if (cnt == 0) 11830Sstevel@tonic-gate return; 11840Sstevel@tonic-gate 11855388Sja97890 /* 11865388Sja97890 * If we didn't satisfy our removal target from temporary IREs 11875388Sja97890 * we see how many non-temporary IREs are currently in the bucket. 11885388Sja97890 * If this quantity is above the threshold then we see if there are any 11895388Sja97890 * candidates for removal. We are still limited to removing a maximum 11905388Sja97890 * of cnt IREs. 
11915388Sja97890 */ 11925388Sja97890 if ((irb->irb_ire_cnt - irb->irb_tmp_ire_cnt) > threshold) { 11935388Sja97890 for (ire = irb->irb_ire; cnt > 0 && ire != NULL; 11940Sstevel@tonic-gate ire = ire->ire_next) { 11955388Sja97890 if (ire == ref_ire) 11965388Sja97890 continue; 11975388Sja97890 if (ire->ire_type != IRE_CACHE) 11985388Sja97890 continue; 11990Sstevel@tonic-gate if (ire->ire_marks & IRE_MARK_CONDEMNED) 12000Sstevel@tonic-gate continue; 12015388Sja97890 if ((ire->ire_refcnt == 1) && 12025388Sja97890 (lbolt - ire->ire_last_used_time > cut_off)) { 12030Sstevel@tonic-gate ire_delete(ire); 12040Sstevel@tonic-gate cnt--; 12050Sstevel@tonic-gate } 12060Sstevel@tonic-gate } 12070Sstevel@tonic-gate } 12080Sstevel@tonic-gate } 12090Sstevel@tonic-gate 12100Sstevel@tonic-gate /* 12110Sstevel@tonic-gate * ire_add_then_send is called when a new IRE has been created in order to 12120Sstevel@tonic-gate * route an outgoing packet. Typically, it is called from ip_wput when 12130Sstevel@tonic-gate * a response comes back down from a resolver. We add the IRE, and then 12140Sstevel@tonic-gate * possibly run the packet through ip_wput or ip_rput, as appropriate. 12150Sstevel@tonic-gate * However, we do not add the newly created IRE in the cache when 12160Sstevel@tonic-gate * IRE_MARK_NOADD is set in the IRE. IRE_MARK_NOADD is set at 12174823Sseb * ip_newroute_ipif(). The ires with IRE_MARK_NOADD are ire_refrele'd by 12184823Sseb * ip_wput_ire() and get deleted. 12190Sstevel@tonic-gate * Multirouting support: the packet is silently discarded when the new IRE 12200Sstevel@tonic-gate * holds the RTF_MULTIRT flag, but is not the first IRE to be added with the 12210Sstevel@tonic-gate * RTF_MULTIRT flag for the same destination address. 12220Sstevel@tonic-gate * In this case, we just want to register this additional ire without 12230Sstevel@tonic-gate * sending the packet, as it has already been replicated through 12240Sstevel@tonic-gate * existing multirt routes in ip_wput(). 
12250Sstevel@tonic-gate */ 12260Sstevel@tonic-gate void 12270Sstevel@tonic-gate ire_add_then_send(queue_t *q, ire_t *ire, mblk_t *mp) 12280Sstevel@tonic-gate { 12290Sstevel@tonic-gate irb_t *irb; 12300Sstevel@tonic-gate boolean_t drop = B_FALSE; 12310Sstevel@tonic-gate boolean_t mctl_present; 12320Sstevel@tonic-gate mblk_t *first_mp = NULL; 12338485SPeter.Memishian@Sun.COM mblk_t *data_mp = NULL; 12340Sstevel@tonic-gate ire_t *dst_ire; 12350Sstevel@tonic-gate ipha_t *ipha; 12360Sstevel@tonic-gate ip6_t *ip6h; 12373448Sdh155122 ip_stack_t *ipst = ire->ire_ipst; 12385388Sja97890 int ire_limit; 12390Sstevel@tonic-gate 12400Sstevel@tonic-gate if (mp != NULL) { 12410Sstevel@tonic-gate /* 12420Sstevel@tonic-gate * We first have to retrieve the destination address carried 12430Sstevel@tonic-gate * by the packet. 12440Sstevel@tonic-gate * We can't rely on ire as it can be related to a gateway. 12450Sstevel@tonic-gate * The destination address will help in determining if 12460Sstevel@tonic-gate * other RTF_MULTIRT ires are already registered. 12470Sstevel@tonic-gate * 12480Sstevel@tonic-gate * We first need to know where we are going : v4 or V6. 12490Sstevel@tonic-gate * the ire version is enough, as there is no risk that 12500Sstevel@tonic-gate * we resolve an IPv6 address with an IPv4 ire 12510Sstevel@tonic-gate * or vice versa. 
12520Sstevel@tonic-gate */ 12538485SPeter.Memishian@Sun.COM EXTRACT_PKT_MP(mp, first_mp, mctl_present); 12548485SPeter.Memishian@Sun.COM data_mp = mp; 12558485SPeter.Memishian@Sun.COM mp = first_mp; 12560Sstevel@tonic-gate if (ire->ire_ipversion == IPV4_VERSION) { 12578485SPeter.Memishian@Sun.COM ipha = (ipha_t *)data_mp->b_rptr; 12580Sstevel@tonic-gate dst_ire = ire_cache_lookup(ipha->ipha_dst, 1259*8778SErik.Nordmark@Sun.COM ire->ire_zoneid, msg_getlabel(mp), ipst); 12600Sstevel@tonic-gate } else { 12612535Ssangeeta ASSERT(ire->ire_ipversion == IPV6_VERSION); 12628485SPeter.Memishian@Sun.COM ip6h = (ip6_t *)data_mp->b_rptr; 12630Sstevel@tonic-gate dst_ire = ire_cache_lookup_v6(&ip6h->ip6_dst, 1264*8778SErik.Nordmark@Sun.COM ire->ire_zoneid, msg_getlabel(mp), ipst); 12650Sstevel@tonic-gate } 12660Sstevel@tonic-gate if (dst_ire != NULL) { 12670Sstevel@tonic-gate if (dst_ire->ire_flags & RTF_MULTIRT) { 12680Sstevel@tonic-gate /* 12690Sstevel@tonic-gate * At least one resolved multirt route 12700Sstevel@tonic-gate * already exists for the destination, 12710Sstevel@tonic-gate * don't sent this packet: either drop it 12720Sstevel@tonic-gate * or complete the pending resolution, 12730Sstevel@tonic-gate * depending on the ire. 12740Sstevel@tonic-gate */ 12750Sstevel@tonic-gate drop = B_TRUE; 12760Sstevel@tonic-gate } 12770Sstevel@tonic-gate ip1dbg(("ire_add_then_send: dst_ire %p " 12780Sstevel@tonic-gate "[dst %08x, gw %08x], drop %d\n", 12790Sstevel@tonic-gate (void *)dst_ire, 12800Sstevel@tonic-gate (dst_ire->ire_ipversion == IPV4_VERSION) ? \ 12814714Ssowmini ntohl(dst_ire->ire_addr) : \ 12824714Ssowmini ntohl(V4_PART_OF_V6(dst_ire->ire_addr_v6)), 12830Sstevel@tonic-gate (dst_ire->ire_ipversion == IPV4_VERSION) ? 
\ 12844714Ssowmini ntohl(dst_ire->ire_gateway_addr) : \ 12854714Ssowmini ntohl(V4_PART_OF_V6( 12864714Ssowmini dst_ire->ire_gateway_addr_v6)), 12870Sstevel@tonic-gate drop)); 12880Sstevel@tonic-gate ire_refrele(dst_ire); 12890Sstevel@tonic-gate } 12900Sstevel@tonic-gate } 12910Sstevel@tonic-gate 12920Sstevel@tonic-gate if (!(ire->ire_marks & IRE_MARK_NOADD)) { 12934823Sseb /* Regular packets with cache bound ires are here. */ 12944823Sseb (void) ire_add(&ire, NULL, NULL, NULL, B_FALSE); 12950Sstevel@tonic-gate 12960Sstevel@tonic-gate if (ire == NULL) { 12970Sstevel@tonic-gate mp->b_prev = NULL; 12980Sstevel@tonic-gate mp->b_next = NULL; 12990Sstevel@tonic-gate MULTIRT_DEBUG_UNTAG(mp); 13000Sstevel@tonic-gate freemsg(mp); 13010Sstevel@tonic-gate return; 13020Sstevel@tonic-gate } 13030Sstevel@tonic-gate if (mp == NULL) { 13040Sstevel@tonic-gate ire_refrele(ire); /* Held in ire_add_v4/v6 */ 13050Sstevel@tonic-gate return; 13060Sstevel@tonic-gate } 13070Sstevel@tonic-gate } 13080Sstevel@tonic-gate if (drop) { 13090Sstevel@tonic-gate /* 13100Sstevel@tonic-gate * If we're adding an RTF_MULTIRT ire, the resolution 13110Sstevel@tonic-gate * is over: we just drop the packet. 13120Sstevel@tonic-gate */ 13130Sstevel@tonic-gate if (ire->ire_flags & RTF_MULTIRT) { 13148485SPeter.Memishian@Sun.COM data_mp->b_prev = NULL; 13158485SPeter.Memishian@Sun.COM data_mp->b_next = NULL; 13160Sstevel@tonic-gate MULTIRT_DEBUG_UNTAG(mp); 13170Sstevel@tonic-gate freemsg(mp); 13180Sstevel@tonic-gate } else { 13190Sstevel@tonic-gate /* 13200Sstevel@tonic-gate * Otherwise, we're adding the ire to a gateway 13210Sstevel@tonic-gate * for a multirt route. 13220Sstevel@tonic-gate * Invoke ip_newroute() to complete the resolution 13230Sstevel@tonic-gate * of the route. We will then come back here and 13240Sstevel@tonic-gate * finally drop this packet in the above code. 
13250Sstevel@tonic-gate */ 13260Sstevel@tonic-gate if (ire->ire_ipversion == IPV4_VERSION) { 13270Sstevel@tonic-gate /* 13280Sstevel@tonic-gate * TODO: in order for CGTP to work in non-global 13290Sstevel@tonic-gate * zones, ip_newroute() must create the IRE 13300Sstevel@tonic-gate * cache in the zone indicated by 13310Sstevel@tonic-gate * ire->ire_zoneid. 13320Sstevel@tonic-gate */ 13334823Sseb ip_newroute(q, mp, ipha->ipha_dst, 13342733Snordmark (CONN_Q(q) ? Q_TO_CONN(q) : NULL), 13353448Sdh155122 ire->ire_zoneid, ipst); 13360Sstevel@tonic-gate } else { 13378485SPeter.Memishian@Sun.COM int minlen = sizeof (ip6i_t) + IPV6_HDR_LEN; 13388485SPeter.Memishian@Sun.COM 13392535Ssangeeta ASSERT(ire->ire_ipversion == IPV6_VERSION); 13408485SPeter.Memishian@Sun.COM 13418485SPeter.Memishian@Sun.COM /* 13428485SPeter.Memishian@Sun.COM * If necessary, skip over the ip6i_t to find 13438485SPeter.Memishian@Sun.COM * the header with the actual source address. 13448485SPeter.Memishian@Sun.COM */ 13458485SPeter.Memishian@Sun.COM if (ip6h->ip6_nxt == IPPROTO_RAW) { 13468485SPeter.Memishian@Sun.COM if (MBLKL(data_mp) < minlen && 13478485SPeter.Memishian@Sun.COM pullupmsg(data_mp, -1) == 0) { 13488485SPeter.Memishian@Sun.COM ip1dbg(("ire_add_then_send: " 13498485SPeter.Memishian@Sun.COM "cannot pullupmsg ip6i\n")); 13508485SPeter.Memishian@Sun.COM if (mctl_present) 13518485SPeter.Memishian@Sun.COM freeb(first_mp); 13528485SPeter.Memishian@Sun.COM ire_refrele(ire); 13538485SPeter.Memishian@Sun.COM return; 13548485SPeter.Memishian@Sun.COM } 13558485SPeter.Memishian@Sun.COM ASSERT(MBLKL(data_mp) >= IPV6_HDR_LEN); 13568485SPeter.Memishian@Sun.COM ip6h = (ip6_t *)(data_mp->b_rptr + 13578485SPeter.Memishian@Sun.COM sizeof (ip6i_t)); 13588485SPeter.Memishian@Sun.COM } 13598485SPeter.Memishian@Sun.COM ip_newroute_v6(q, mp, &ip6h->ip6_dst, 13608485SPeter.Memishian@Sun.COM &ip6h->ip6_src, NULL, ire->ire_zoneid, 13618485SPeter.Memishian@Sun.COM ipst); 13620Sstevel@tonic-gate } 
13630Sstevel@tonic-gate } 13640Sstevel@tonic-gate 13650Sstevel@tonic-gate ire_refrele(ire); /* As done by ire_send(). */ 13660Sstevel@tonic-gate return; 13670Sstevel@tonic-gate } 13680Sstevel@tonic-gate /* 13690Sstevel@tonic-gate * Need to remember ire_bucket here as ire_send*() may delete 13700Sstevel@tonic-gate * the ire so we cannot reference it after that. 13710Sstevel@tonic-gate */ 13720Sstevel@tonic-gate irb = ire->ire_bucket; 13735388Sja97890 if (ire->ire_ipversion == IPV4_VERSION) { 13745388Sja97890 ire_send(q, mp, ire); 13755388Sja97890 ire_limit = ip_ire_max_bucket_cnt; 13765388Sja97890 } else { 13770Sstevel@tonic-gate ire_send_v6(q, mp, ire); 13785388Sja97890 ire_limit = ip6_ire_max_bucket_cnt; 13795388Sja97890 } 13805388Sja97890 13815388Sja97890 /* 13825388Sja97890 * irb is NULL if the IRE was not added to the hash. This happens 13835388Sja97890 * when IRE_MARK_NOADD is set and when IREs are returned from 13845388Sja97890 * ire_update_srcif_v4(). 13855388Sja97890 */ 13865388Sja97890 if (irb != NULL) { 13875388Sja97890 IRB_REFHOLD(irb); 13885388Sja97890 if (irb->irb_ire_cnt > ire_limit) 13895388Sja97890 ire_cache_cleanup(irb, ire_limit, ire); 13905388Sja97890 IRB_REFRELE(irb); 13910Sstevel@tonic-gate } 13920Sstevel@tonic-gate } 13930Sstevel@tonic-gate 13940Sstevel@tonic-gate /* 13950Sstevel@tonic-gate * Initialize the ire that is specific to IPv4 part and call 13960Sstevel@tonic-gate * ire_init_common to finish it. 
13970Sstevel@tonic-gate */ 13980Sstevel@tonic-gate ire_t * 13990Sstevel@tonic-gate ire_init(ire_t *ire, uchar_t *addr, uchar_t *mask, uchar_t *src_addr, 14004823Sseb uchar_t *gateway, uint_t *max_fragp, nce_t *src_nce, queue_t *rfq, 14014823Sseb queue_t *stq, ushort_t type, ipif_t *ipif, ipaddr_t cmask, uint32_t phandle, 14024823Sseb uint32_t ihandle, uint32_t flags, const iulp_t *ulp_info, tsol_gc_t *gc, 14034823Sseb tsol_gcgrp_t *gcgrp, ip_stack_t *ipst) 14040Sstevel@tonic-gate { 14055907Sja97890 ASSERT(type != IRE_CACHE || stq != NULL); 14061676Sjpk /* 14071676Sjpk * Reject IRE security attribute creation/initialization 14081676Sjpk * if system is not running in Trusted mode. 14091676Sjpk */ 14101676Sjpk if ((gc != NULL || gcgrp != NULL) && !is_system_labeled()) 14111676Sjpk return (NULL); 14121676Sjpk 14133448Sdh155122 BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_alloced); 14140Sstevel@tonic-gate 14150Sstevel@tonic-gate if (addr != NULL) 14160Sstevel@tonic-gate bcopy(addr, &ire->ire_addr, IP_ADDR_LEN); 14170Sstevel@tonic-gate if (src_addr != NULL) 14180Sstevel@tonic-gate bcopy(src_addr, &ire->ire_src_addr, IP_ADDR_LEN); 14190Sstevel@tonic-gate if (mask != NULL) { 14200Sstevel@tonic-gate bcopy(mask, &ire->ire_mask, IP_ADDR_LEN); 14210Sstevel@tonic-gate ire->ire_masklen = ip_mask_to_plen(ire->ire_mask); 14220Sstevel@tonic-gate } 14230Sstevel@tonic-gate if (gateway != NULL) { 14240Sstevel@tonic-gate bcopy(gateway, &ire->ire_gateway_addr, IP_ADDR_LEN); 14250Sstevel@tonic-gate } 14260Sstevel@tonic-gate 14270Sstevel@tonic-gate if (type == IRE_CACHE) 14280Sstevel@tonic-gate ire->ire_cmask = cmask; 14290Sstevel@tonic-gate 14301676Sjpk /* ire_init_common will free the mblks upon encountering any failure */ 14314823Sseb if (!ire_init_common(ire, max_fragp, src_nce, rfq, stq, type, ipif, 14324823Sseb phandle, ihandle, flags, IPV4_VERSION, ulp_info, gc, gcgrp, ipst)) 14331676Sjpk return (NULL); 14340Sstevel@tonic-gate 14350Sstevel@tonic-gate return (ire); 
14360Sstevel@tonic-gate } 14370Sstevel@tonic-gate 14380Sstevel@tonic-gate /* 14390Sstevel@tonic-gate * Similar to ire_create except that it is called only when 14400Sstevel@tonic-gate * we want to allocate ire as an mblk e.g. we have an external 14410Sstevel@tonic-gate * resolver ARP. 14420Sstevel@tonic-gate */ 14430Sstevel@tonic-gate ire_t * 14440Sstevel@tonic-gate ire_create_mp(uchar_t *addr, uchar_t *mask, uchar_t *src_addr, uchar_t *gateway, 14454823Sseb uint_t max_frag, nce_t *src_nce, queue_t *rfq, queue_t *stq, ushort_t type, 14464823Sseb ipif_t *ipif, ipaddr_t cmask, uint32_t phandle, uint32_t ihandle, 14474823Sseb uint32_t flags, const iulp_t *ulp_info, tsol_gc_t *gc, tsol_gcgrp_t *gcgrp, 14483448Sdh155122 ip_stack_t *ipst) 14490Sstevel@tonic-gate { 14502535Ssangeeta ire_t *ire, *buf; 14510Sstevel@tonic-gate ire_t *ret_ire; 14520Sstevel@tonic-gate mblk_t *mp; 14532535Ssangeeta size_t bufsize; 14542535Ssangeeta frtn_t *frtnp; 14552535Ssangeeta ill_t *ill; 14562535Ssangeeta 14572535Ssangeeta bufsize = sizeof (ire_t) + sizeof (frtn_t); 14582535Ssangeeta buf = kmem_alloc(bufsize, KM_NOSLEEP); 14592535Ssangeeta if (buf == NULL) { 14602535Ssangeeta ip1dbg(("ire_create_mp: alloc failed\n")); 14612535Ssangeeta return (NULL); 14622535Ssangeeta } 14632535Ssangeeta frtnp = (frtn_t *)(buf + 1); 14642535Ssangeeta frtnp->free_arg = (caddr_t)buf; 14652535Ssangeeta frtnp->free_func = ire_freemblk; 14662535Ssangeeta 14672535Ssangeeta /* 14682535Ssangeeta * Allocate the new IRE. The ire created will hold a ref on 14692535Ssangeeta * an nce_t after ire_nce_init, and this ref must either be 14702535Ssangeeta * (a) transferred to the ire_cache entry created when ire_add_v4 14712535Ssangeeta * is called after successful arp resolution, or, 14722535Ssangeeta * (b) released, when arp resolution fails 14732535Ssangeeta * Case (b) is handled in ire_freemblk() which will be called 14742535Ssangeeta * when mp is freed as a result of failed arp. 
14752535Ssangeeta */ 14762535Ssangeeta mp = esballoc((unsigned char *)buf, bufsize, BPRI_MED, frtnp); 14770Sstevel@tonic-gate if (mp == NULL) { 14780Sstevel@tonic-gate ip1dbg(("ire_create_mp: alloc failed\n")); 14792535Ssangeeta kmem_free(buf, bufsize); 14800Sstevel@tonic-gate return (NULL); 14810Sstevel@tonic-gate } 14820Sstevel@tonic-gate ire = (ire_t *)mp->b_rptr; 14830Sstevel@tonic-gate mp->b_wptr = (uchar_t *)&ire[1]; 14840Sstevel@tonic-gate 14850Sstevel@tonic-gate /* Start clean. */ 14860Sstevel@tonic-gate *ire = ire_null; 14870Sstevel@tonic-gate ire->ire_mp = mp; 14880Sstevel@tonic-gate mp->b_datap->db_type = IRE_DB_TYPE; 14892535Ssangeeta ire->ire_marks |= IRE_MARK_UNCACHED; 14900Sstevel@tonic-gate 14914823Sseb ret_ire = ire_init(ire, addr, mask, src_addr, gateway, NULL, src_nce, 14924823Sseb rfq, stq, type, ipif, cmask, phandle, ihandle, flags, ulp_info, gc, 14934823Sseb gcgrp, ipst); 14940Sstevel@tonic-gate 14952741Ssowmini ill = (ill_t *)(stq->q_ptr); 14960Sstevel@tonic-gate if (ret_ire == NULL) { 14973448Sdh155122 /* ire_freemblk needs these set */ 14982741Ssowmini ire->ire_stq_ifindex = ill->ill_phyint->phyint_ifindex; 14997558SSowmini.Varadhan@Sun.COM ire->ire_stackid = ipst->ips_netstack->netstack_stackid; 15003448Sdh155122 ire->ire_ipst = ipst; 15010Sstevel@tonic-gate freeb(ire->ire_mp); 15020Sstevel@tonic-gate return (NULL); 15030Sstevel@tonic-gate } 15042535Ssangeeta ret_ire->ire_stq_ifindex = ill->ill_phyint->phyint_ifindex; 15057558SSowmini.Varadhan@Sun.COM ret_ire->ire_stackid = ipst->ips_netstack->netstack_stackid; 15060Sstevel@tonic-gate ASSERT(ret_ire == ire); 15077558SSowmini.Varadhan@Sun.COM ASSERT(ret_ire->ire_ipst == ipst); 15080Sstevel@tonic-gate /* 15090Sstevel@tonic-gate * ire_max_frag is normally zero here and is atomically set 15100Sstevel@tonic-gate * under the irebucket lock in ire_add_v[46] except for the 15110Sstevel@tonic-gate * case of IRE_MARK_NOADD. 
In that event the the ire_max_frag 15120Sstevel@tonic-gate * is non-zero here. 15130Sstevel@tonic-gate */ 15140Sstevel@tonic-gate ire->ire_max_frag = max_frag; 15150Sstevel@tonic-gate return (ire); 15160Sstevel@tonic-gate } 15170Sstevel@tonic-gate 15180Sstevel@tonic-gate /* 15190Sstevel@tonic-gate * ire_create is called to allocate and initialize a new IRE. 15200Sstevel@tonic-gate * 15210Sstevel@tonic-gate * NOTE : This is called as writer sometimes though not required 15220Sstevel@tonic-gate * by this function. 15230Sstevel@tonic-gate */ 15240Sstevel@tonic-gate ire_t * 15250Sstevel@tonic-gate ire_create(uchar_t *addr, uchar_t *mask, uchar_t *src_addr, uchar_t *gateway, 15264823Sseb uint_t *max_fragp, nce_t *src_nce, queue_t *rfq, queue_t *stq, 15274823Sseb ushort_t type, ipif_t *ipif, ipaddr_t cmask, uint32_t phandle, 15284823Sseb uint32_t ihandle, uint32_t flags, const iulp_t *ulp_info, tsol_gc_t *gc, 15294823Sseb tsol_gcgrp_t *gcgrp, ip_stack_t *ipst) 15300Sstevel@tonic-gate { 15310Sstevel@tonic-gate ire_t *ire; 15320Sstevel@tonic-gate ire_t *ret_ire; 15330Sstevel@tonic-gate 15340Sstevel@tonic-gate ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); 15350Sstevel@tonic-gate if (ire == NULL) { 15360Sstevel@tonic-gate ip1dbg(("ire_create: alloc failed\n")); 15370Sstevel@tonic-gate return (NULL); 15380Sstevel@tonic-gate } 15390Sstevel@tonic-gate *ire = ire_null; 15400Sstevel@tonic-gate 15414823Sseb ret_ire = ire_init(ire, addr, mask, src_addr, gateway, max_fragp, 15424823Sseb src_nce, rfq, stq, type, ipif, cmask, phandle, ihandle, flags, 15434823Sseb ulp_info, gc, gcgrp, ipst); 15440Sstevel@tonic-gate 15450Sstevel@tonic-gate if (ret_ire == NULL) { 15460Sstevel@tonic-gate kmem_cache_free(ire_cache, ire); 15470Sstevel@tonic-gate return (NULL); 15480Sstevel@tonic-gate } 15490Sstevel@tonic-gate ASSERT(ret_ire == ire); 15500Sstevel@tonic-gate return (ire); 15510Sstevel@tonic-gate } 15520Sstevel@tonic-gate 15530Sstevel@tonic-gate /* 15540Sstevel@tonic-gate * Common to IPv4 and 
IPv6 15550Sstevel@tonic-gate */ 15561676Sjpk boolean_t 15574714Ssowmini ire_init_common(ire_t *ire, uint_t *max_fragp, nce_t *src_nce, queue_t *rfq, 15584823Sseb queue_t *stq, ushort_t type, ipif_t *ipif, uint32_t phandle, 15594714Ssowmini uint32_t ihandle, uint32_t flags, uchar_t ipversion, const iulp_t *ulp_info, 15604714Ssowmini tsol_gc_t *gc, tsol_gcgrp_t *gcgrp, ip_stack_t *ipst) 15610Sstevel@tonic-gate { 15620Sstevel@tonic-gate ire->ire_max_fragp = max_fragp; 15633448Sdh155122 ire->ire_frag_flag |= (ipst->ips_ip_path_mtu_discovery) ? IPH_DF : 0; 15640Sstevel@tonic-gate 15651676Sjpk #ifdef DEBUG 15661676Sjpk if (ipif != NULL) { 15670Sstevel@tonic-gate if (ipif->ipif_isv6) 15680Sstevel@tonic-gate ASSERT(ipversion == IPV6_VERSION); 15690Sstevel@tonic-gate else 15700Sstevel@tonic-gate ASSERT(ipversion == IPV4_VERSION); 15710Sstevel@tonic-gate } 15721676Sjpk #endif /* DEBUG */ 15731676Sjpk 15741676Sjpk /* 15751676Sjpk * Create/initialize IRE security attribute only in Trusted mode; 15761676Sjpk * if the passed in gc/gcgrp is non-NULL, we expect that the caller 15771676Sjpk * has held a reference to it and will release it when this routine 15781676Sjpk * returns a failure, otherwise we own the reference. We do this 15791676Sjpk * prior to initializing the rest IRE fields. 15802416Sjarrett * 15812416Sjarrett * Don't allocate ire_gw_secattr for the resolver case to prevent 15822416Sjarrett * memory leak (in case of external resolution failure). We'll 15832416Sjarrett * allocate it after a successful external resolution, in ire_add(). 15842416Sjarrett * Note that ire->ire_mp != NULL here means this ire is headed 15852416Sjarrett * to an external resolver. 
15861676Sjpk */ 15871676Sjpk if (is_system_labeled()) { 15881676Sjpk if ((type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST | 15891676Sjpk IRE_INTERFACE)) != 0) { 15901676Sjpk /* release references on behalf of caller */ 15911676Sjpk if (gc != NULL) 15921676Sjpk GC_REFRELE(gc); 15931676Sjpk if (gcgrp != NULL) 15941676Sjpk GCGRP_REFRELE(gcgrp); 15952416Sjarrett } else if ((ire->ire_mp == NULL) && 15962416Sjarrett tsol_ire_init_gwattr(ire, ipversion, gc, gcgrp) != 0) { 15971676Sjpk return (B_FALSE); 15981676Sjpk } 15991676Sjpk } 16000Sstevel@tonic-gate 16010Sstevel@tonic-gate ire->ire_stq = stq; 16020Sstevel@tonic-gate ire->ire_rfq = rfq; 16030Sstevel@tonic-gate ire->ire_type = type; 16040Sstevel@tonic-gate ire->ire_flags = RTF_UP | flags; 16050Sstevel@tonic-gate ire->ire_ident = TICK_TO_MSEC(lbolt); 16060Sstevel@tonic-gate bcopy(ulp_info, &ire->ire_uinfo, sizeof (iulp_t)); 16070Sstevel@tonic-gate 16080Sstevel@tonic-gate ire->ire_tire_mark = ire->ire_ob_pkt_count + ire->ire_ib_pkt_count; 16090Sstevel@tonic-gate ire->ire_last_used_time = lbolt; 16100Sstevel@tonic-gate ire->ire_create_time = (uint32_t)gethrestime_sec(); 16110Sstevel@tonic-gate 16120Sstevel@tonic-gate /* 16130Sstevel@tonic-gate * If this IRE is an IRE_CACHE, inherit the handles from the 16140Sstevel@tonic-gate * parent IREs. For others in the forwarding table, assign appropriate 16150Sstevel@tonic-gate * new ones. 16160Sstevel@tonic-gate * 16170Sstevel@tonic-gate * The mutex protecting ire_handle is because ire_create is not always 16180Sstevel@tonic-gate * called as a writer. 
16190Sstevel@tonic-gate */ 16200Sstevel@tonic-gate if (ire->ire_type & IRE_OFFSUBNET) { 16213448Sdh155122 mutex_enter(&ipst->ips_ire_handle_lock); 16223448Sdh155122 ire->ire_phandle = (uint32_t)ipst->ips_ire_handle++; 16233448Sdh155122 mutex_exit(&ipst->ips_ire_handle_lock); 16240Sstevel@tonic-gate } else if (ire->ire_type & IRE_INTERFACE) { 16253448Sdh155122 mutex_enter(&ipst->ips_ire_handle_lock); 16263448Sdh155122 ire->ire_ihandle = (uint32_t)ipst->ips_ire_handle++; 16273448Sdh155122 mutex_exit(&ipst->ips_ire_handle_lock); 16280Sstevel@tonic-gate } else if (ire->ire_type == IRE_CACHE) { 16290Sstevel@tonic-gate ire->ire_phandle = phandle; 16300Sstevel@tonic-gate ire->ire_ihandle = ihandle; 16310Sstevel@tonic-gate } 16320Sstevel@tonic-gate ire->ire_ipif = ipif; 16330Sstevel@tonic-gate if (ipif != NULL) { 16340Sstevel@tonic-gate ire->ire_ipif_seqid = ipif->ipif_seqid; 16357880SJonathan.Anderson@Sun.COM ire->ire_ipif_ifindex = 16367880SJonathan.Anderson@Sun.COM ipif->ipif_ill->ill_phyint->phyint_ifindex; 16370Sstevel@tonic-gate ire->ire_zoneid = ipif->ipif_zoneid; 16380Sstevel@tonic-gate } else { 16390Sstevel@tonic-gate ire->ire_zoneid = GLOBAL_ZONEID; 16400Sstevel@tonic-gate } 16410Sstevel@tonic-gate ire->ire_ipversion = ipversion; 16422535Ssangeeta mutex_init(&ire->ire_lock, NULL, MUTEX_DEFAULT, NULL); 16432535Ssangeeta if (ipversion == IPV4_VERSION) { 16444714Ssowmini /* 16454714Ssowmini * IPv6 initializes the ire_nce in ire_add_v6, which expects 16464714Ssowmini * to find the ire_nce to be null when it is called. 16474714Ssowmini */ 16484714Ssowmini if (ire_nce_init(ire, src_nce) != 0) { 16492535Ssangeeta /* some failure occurred. 
propagate error back */ 16502535Ssangeeta return (B_FALSE); 16512535Ssangeeta } 16522535Ssangeeta } 16530Sstevel@tonic-gate ire->ire_refcnt = 1; 16543448Sdh155122 ire->ire_ipst = ipst; /* No netstack_hold */ 16555023Scarlsonj ire->ire_trace_disable = B_FALSE; 16561676Sjpk 16571676Sjpk return (B_TRUE); 16580Sstevel@tonic-gate } 16590Sstevel@tonic-gate 16600Sstevel@tonic-gate /* 16610Sstevel@tonic-gate * This routine is called repeatedly by ipif_up to create broadcast IREs. 16620Sstevel@tonic-gate * It is passed a pointer to a slot in an IRE pointer array into which to 16630Sstevel@tonic-gate * place the pointer to the new IRE, if indeed we create one. If the 16640Sstevel@tonic-gate * IRE corresponding to the address passed in would be a duplicate of an 16650Sstevel@tonic-gate * existing one, we don't create the new one. irep is incremented before 16660Sstevel@tonic-gate * return only if we do create a new IRE. (Always called as writer.) 16670Sstevel@tonic-gate * 16680Sstevel@tonic-gate * Note that with the "match_flags" parameter, we can match on either 16690Sstevel@tonic-gate * a particular logical interface (MATCH_IRE_IPIF) or for all logical 16700Sstevel@tonic-gate * interfaces for a given physical interface (MATCH_IRE_ILL). Currently, 16710Sstevel@tonic-gate * we only create broadcast ire's on a per physical interface basis. If 16720Sstevel@tonic-gate * someone is going to be mucking with logical interfaces, it is important 16730Sstevel@tonic-gate * to call "ipif_check_bcast_ires()" to make sure that any change to a 16740Sstevel@tonic-gate * logical interface will not cause critical broadcast IRE's to be deleted. 
16750Sstevel@tonic-gate */ 16760Sstevel@tonic-gate ire_t ** 16770Sstevel@tonic-gate ire_check_and_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep, 16780Sstevel@tonic-gate int match_flags) 16790Sstevel@tonic-gate { 16800Sstevel@tonic-gate ire_t *ire; 16810Sstevel@tonic-gate uint64_t check_flags = IPIF_DEPRECATED | IPIF_NOLOCAL | IPIF_ANYCAST; 16828485SPeter.Memishian@Sun.COM boolean_t prefer; 16838485SPeter.Memishian@Sun.COM ill_t *ill = ipif->ipif_ill; 16848485SPeter.Memishian@Sun.COM ip_stack_t *ipst = ill->ill_ipst; 16850Sstevel@tonic-gate 16860Sstevel@tonic-gate /* 16870Sstevel@tonic-gate * No broadcast IREs for the LOOPBACK interface 16880Sstevel@tonic-gate * or others such as point to point and IPIF_NOXMIT. 16890Sstevel@tonic-gate */ 16900Sstevel@tonic-gate if (!(ipif->ipif_flags & IPIF_BROADCAST) || 16910Sstevel@tonic-gate (ipif->ipif_flags & IPIF_NOXMIT)) 16920Sstevel@tonic-gate return (irep); 16930Sstevel@tonic-gate 16948485SPeter.Memishian@Sun.COM /* 16958485SPeter.Memishian@Sun.COM * If this new IRE would be a duplicate, only prefer it if one of 16968485SPeter.Memishian@Sun.COM * the following is true: 16978485SPeter.Memishian@Sun.COM * 16988485SPeter.Memishian@Sun.COM * 1. The existing one has IPIF_DEPRECATED|IPIF_LOCAL|IPIF_ANYCAST 16998485SPeter.Memishian@Sun.COM * set and the new one has all of those clear. 17008485SPeter.Memishian@Sun.COM * 17018485SPeter.Memishian@Sun.COM * 2. The existing one corresponds to an underlying ILL in an IPMP 17028485SPeter.Memishian@Sun.COM * group and the new one corresponds to an IPMP group interface. 
17038485SPeter.Memishian@Sun.COM */ 17040Sstevel@tonic-gate if ((ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ipif, 17053448Sdh155122 ipif->ipif_zoneid, NULL, match_flags, ipst)) != NULL) { 17068485SPeter.Memishian@Sun.COM prefer = ((ire->ire_ipif->ipif_flags & check_flags) && 17078485SPeter.Memishian@Sun.COM !(ipif->ipif_flags & check_flags)) || 17088485SPeter.Memishian@Sun.COM (IS_UNDER_IPMP(ire->ire_ipif->ipif_ill) && IS_IPMP(ill)); 17098485SPeter.Memishian@Sun.COM if (!prefer) { 17100Sstevel@tonic-gate ire_refrele(ire); 17110Sstevel@tonic-gate return (irep); 17120Sstevel@tonic-gate } 17138485SPeter.Memishian@Sun.COM 17140Sstevel@tonic-gate /* 17150Sstevel@tonic-gate * Bcast ires exist in pairs. Both have to be deleted, 17160Sstevel@tonic-gate * Since we are exclusive we can make the above assertion. 17170Sstevel@tonic-gate * The 1st has to be refrele'd since it was ctable_lookup'd. 17180Sstevel@tonic-gate */ 17190Sstevel@tonic-gate ASSERT(IAM_WRITER_IPIF(ipif)); 17200Sstevel@tonic-gate ASSERT(ire->ire_next->ire_addr == ire->ire_addr); 17210Sstevel@tonic-gate ire_delete(ire->ire_next); 17220Sstevel@tonic-gate ire_delete(ire); 17230Sstevel@tonic-gate ire_refrele(ire); 17240Sstevel@tonic-gate } 17258485SPeter.Memishian@Sun.COM return (ire_create_bcast(ipif, addr, irep)); 17260Sstevel@tonic-gate } 17270Sstevel@tonic-gate 17280Sstevel@tonic-gate uint_t ip_loopback_mtu = IP_LOOPBACK_MTU; 17290Sstevel@tonic-gate 17300Sstevel@tonic-gate /* 17310Sstevel@tonic-gate * This routine is called from ipif_check_bcast_ires and ire_check_bcast. 17320Sstevel@tonic-gate * It leaves all the verifying and deleting to those routines. So it always 17330Sstevel@tonic-gate * creates 2 bcast ires and chains them into the ire array passed in. 
17340Sstevel@tonic-gate */ 17350Sstevel@tonic-gate ire_t ** 17360Sstevel@tonic-gate ire_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep) 17370Sstevel@tonic-gate { 17383448Sdh155122 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 17398485SPeter.Memishian@Sun.COM ill_t *ill = ipif->ipif_ill; 17408485SPeter.Memishian@Sun.COM 17418485SPeter.Memishian@Sun.COM ASSERT(IAM_WRITER_IPIF(ipif)); 17428485SPeter.Memishian@Sun.COM 17438485SPeter.Memishian@Sun.COM if (IS_IPMP(ill)) { 17448485SPeter.Memishian@Sun.COM /* 17458485SPeter.Memishian@Sun.COM * Broadcast IREs for the IPMP meta-interface use the 17468485SPeter.Memishian@Sun.COM * nominated broadcast interface to send and receive packets. 17478485SPeter.Memishian@Sun.COM * If there's no nominated interface, send the packets down to 17488485SPeter.Memishian@Sun.COM * the IPMP stub driver, which will discard them. If the 17498485SPeter.Memishian@Sun.COM * nominated broadcast interface changes, ill_refresh_bcast() 17508485SPeter.Memishian@Sun.COM * will refresh the broadcast IREs. 
17518485SPeter.Memishian@Sun.COM */ 17528485SPeter.Memishian@Sun.COM if ((ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL) 17538485SPeter.Memishian@Sun.COM ill = ipif->ipif_ill; 17548485SPeter.Memishian@Sun.COM } 17553448Sdh155122 17560Sstevel@tonic-gate *irep++ = ire_create( 17570Sstevel@tonic-gate (uchar_t *)&addr, /* dest addr */ 17580Sstevel@tonic-gate (uchar_t *)&ip_g_all_ones, /* mask */ 17590Sstevel@tonic-gate (uchar_t *)&ipif->ipif_src_addr, /* source addr */ 17600Sstevel@tonic-gate NULL, /* no gateway */ 17610Sstevel@tonic-gate &ipif->ipif_mtu, /* max frag */ 17624714Ssowmini NULL, /* no src nce */ 17638485SPeter.Memishian@Sun.COM ill->ill_rq, /* recv-from queue */ 17648485SPeter.Memishian@Sun.COM ill->ill_wq, /* send-to queue */ 17650Sstevel@tonic-gate IRE_BROADCAST, 17660Sstevel@tonic-gate ipif, 17670Sstevel@tonic-gate 0, 17680Sstevel@tonic-gate 0, 17690Sstevel@tonic-gate 0, 17700Sstevel@tonic-gate 0, 17711676Sjpk &ire_uinfo_null, 17721676Sjpk NULL, 17733448Sdh155122 NULL, 17743448Sdh155122 ipst); 17750Sstevel@tonic-gate 17760Sstevel@tonic-gate *irep++ = ire_create( 17774714Ssowmini (uchar_t *)&addr, /* dest address */ 17784714Ssowmini (uchar_t *)&ip_g_all_ones, /* mask */ 17794714Ssowmini (uchar_t *)&ipif->ipif_src_addr, /* source address */ 17804714Ssowmini NULL, /* no gateway */ 17814714Ssowmini &ip_loopback_mtu, /* max frag size */ 17824714Ssowmini NULL, /* no src_nce */ 17838485SPeter.Memishian@Sun.COM ill->ill_rq, /* recv-from queue */ 17844714Ssowmini NULL, /* no send-to queue */ 17854714Ssowmini IRE_BROADCAST, /* Needed for fanout in wput */ 17864714Ssowmini ipif, 17874714Ssowmini 0, 17884714Ssowmini 0, 17894714Ssowmini 0, 17904714Ssowmini 0, 17914714Ssowmini &ire_uinfo_null, 17924714Ssowmini NULL, 17934714Ssowmini NULL, 17944714Ssowmini ipst); 17950Sstevel@tonic-gate 17960Sstevel@tonic-gate return (irep); 17970Sstevel@tonic-gate } 17980Sstevel@tonic-gate 17990Sstevel@tonic-gate /* 18000Sstevel@tonic-gate * ire_walk routine to delete or update 
any IRE_CACHE that might contain 18010Sstevel@tonic-gate * stale information. 18020Sstevel@tonic-gate * The flags state which entries to delete or update. 18030Sstevel@tonic-gate * Garbage collection is done separately using kmem alloc callbacks to 18040Sstevel@tonic-gate * ip_trash_ire_reclaim. 18050Sstevel@tonic-gate * Used for both IPv4 and IPv6. However, IPv6 only uses FLUSH_MTU_TIME 18060Sstevel@tonic-gate * since other stale information is cleaned up using NUD. 18070Sstevel@tonic-gate */ 18080Sstevel@tonic-gate void 18090Sstevel@tonic-gate ire_expire(ire_t *ire, char *arg) 18100Sstevel@tonic-gate { 18113448Sdh155122 ire_expire_arg_t *ieap = (ire_expire_arg_t *)(uintptr_t)arg; 18123448Sdh155122 ill_t *stq_ill; 18133448Sdh155122 int flush_flags = ieap->iea_flush_flag; 18143448Sdh155122 ip_stack_t *ipst = ieap->iea_ipst; 18150Sstevel@tonic-gate 18160Sstevel@tonic-gate if ((flush_flags & FLUSH_REDIRECT_TIME) && 18173004Sdd193516 (ire->ire_flags & RTF_DYNAMIC)) { 18180Sstevel@tonic-gate /* Make sure we delete the corresponding IRE_CACHE */ 18190Sstevel@tonic-gate ip1dbg(("ire_expire: all redirects\n")); 18203448Sdh155122 ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst); 18210Sstevel@tonic-gate ire_delete(ire); 18223448Sdh155122 atomic_dec_32(&ipst->ips_ip_redirect_cnt); 18230Sstevel@tonic-gate return; 18240Sstevel@tonic-gate } 18250Sstevel@tonic-gate if (ire->ire_type != IRE_CACHE) 18260Sstevel@tonic-gate return; 18270Sstevel@tonic-gate 18280Sstevel@tonic-gate if (flush_flags & FLUSH_ARP_TIME) { 18290Sstevel@tonic-gate /* 18306307Sjprakash * Remove all IRE_CACHE except IPv4 multicast ires. These 18316307Sjprakash * ires will be deleted by ip_trash_ire_reclaim_stack() 18326307Sjprakash * when system runs low in memory. 18336307Sjprakash * Verify that create time is more than ip_ire_arp_interval 18346307Sjprakash * milliseconds ago. 
18350Sstevel@tonic-gate */ 18366307Sjprakash 18376307Sjprakash if (!(ire->ire_ipversion == IPV4_VERSION && 18386307Sjprakash CLASSD(ire->ire_addr)) && NCE_EXPIRED(ire->ire_nce, ipst)) { 18390Sstevel@tonic-gate ire_delete(ire); 18400Sstevel@tonic-gate return; 18410Sstevel@tonic-gate } 18420Sstevel@tonic-gate } 18430Sstevel@tonic-gate 18443448Sdh155122 if (ipst->ips_ip_path_mtu_discovery && (flush_flags & FLUSH_MTU_TIME) && 18450Sstevel@tonic-gate (ire->ire_ipif != NULL)) { 18460Sstevel@tonic-gate /* Increase pmtu if it is less than the interface mtu */ 18470Sstevel@tonic-gate mutex_enter(&ire->ire_lock); 18480Sstevel@tonic-gate /* 18490Sstevel@tonic-gate * If the ipif is a vni (whose mtu is 0, since it's virtual) 18500Sstevel@tonic-gate * get the mtu from the sending interfaces' ipif 18510Sstevel@tonic-gate */ 18520Sstevel@tonic-gate if (IS_VNI(ire->ire_ipif->ipif_ill)) { 18530Sstevel@tonic-gate stq_ill = ire->ire_stq->q_ptr; 18540Sstevel@tonic-gate ire->ire_max_frag = MIN(stq_ill->ill_ipif->ipif_mtu, 18550Sstevel@tonic-gate IP_MAXPACKET); 18560Sstevel@tonic-gate } else { 18570Sstevel@tonic-gate ire->ire_max_frag = MIN(ire->ire_ipif->ipif_mtu, 18580Sstevel@tonic-gate IP_MAXPACKET); 18590Sstevel@tonic-gate } 18600Sstevel@tonic-gate ire->ire_frag_flag |= IPH_DF; 18610Sstevel@tonic-gate mutex_exit(&ire->ire_lock); 18620Sstevel@tonic-gate } 18630Sstevel@tonic-gate } 18640Sstevel@tonic-gate 18650Sstevel@tonic-gate /* 18660Sstevel@tonic-gate * Return any local address. We use this to target ourselves 18670Sstevel@tonic-gate * when the src address was specified as 'default'. 18680Sstevel@tonic-gate * Preference for IRE_LOCAL entries. 
18690Sstevel@tonic-gate */ 18700Sstevel@tonic-gate ire_t * 18713448Sdh155122 ire_lookup_local(zoneid_t zoneid, ip_stack_t *ipst) 18720Sstevel@tonic-gate { 18730Sstevel@tonic-gate ire_t *ire; 18740Sstevel@tonic-gate irb_t *irb; 18750Sstevel@tonic-gate ire_t *maybe = NULL; 18760Sstevel@tonic-gate int i; 18770Sstevel@tonic-gate 18783448Sdh155122 for (i = 0; i < ipst->ips_ip_cache_table_size; i++) { 18793448Sdh155122 irb = &ipst->ips_ip_cache_table[i]; 18800Sstevel@tonic-gate if (irb->irb_ire == NULL) 18810Sstevel@tonic-gate continue; 18820Sstevel@tonic-gate rw_enter(&irb->irb_lock, RW_READER); 18830Sstevel@tonic-gate for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { 18840Sstevel@tonic-gate if ((ire->ire_marks & IRE_MARK_CONDEMNED) || 18851676Sjpk (ire->ire_zoneid != zoneid && 18861676Sjpk ire->ire_zoneid != ALL_ZONES)) 18870Sstevel@tonic-gate continue; 18880Sstevel@tonic-gate switch (ire->ire_type) { 18890Sstevel@tonic-gate case IRE_LOOPBACK: 18900Sstevel@tonic-gate if (maybe == NULL) { 18910Sstevel@tonic-gate IRE_REFHOLD(ire); 18920Sstevel@tonic-gate maybe = ire; 18930Sstevel@tonic-gate } 18940Sstevel@tonic-gate break; 18950Sstevel@tonic-gate case IRE_LOCAL: 18960Sstevel@tonic-gate if (maybe != NULL) { 18970Sstevel@tonic-gate ire_refrele(maybe); 18980Sstevel@tonic-gate } 18990Sstevel@tonic-gate IRE_REFHOLD(ire); 19000Sstevel@tonic-gate rw_exit(&irb->irb_lock); 19010Sstevel@tonic-gate return (ire); 19020Sstevel@tonic-gate } 19030Sstevel@tonic-gate } 19040Sstevel@tonic-gate rw_exit(&irb->irb_lock); 19050Sstevel@tonic-gate } 19060Sstevel@tonic-gate return (maybe); 19070Sstevel@tonic-gate } 19080Sstevel@tonic-gate 19090Sstevel@tonic-gate /* 19100Sstevel@tonic-gate * If the specified IRE is associated with a particular ILL, return 19110Sstevel@tonic-gate * that ILL pointer (May be called as writer.). 19120Sstevel@tonic-gate * 19130Sstevel@tonic-gate * NOTE : This is not a generic function that can be used always. 
19140Sstevel@tonic-gate * This function always returns the ill of the outgoing packets 19150Sstevel@tonic-gate * if this ire is used. 19160Sstevel@tonic-gate */ 19170Sstevel@tonic-gate ill_t * 19181676Sjpk ire_to_ill(const ire_t *ire) 19190Sstevel@tonic-gate { 19200Sstevel@tonic-gate ill_t *ill = NULL; 19210Sstevel@tonic-gate 19220Sstevel@tonic-gate /* 19230Sstevel@tonic-gate * 1) For an IRE_CACHE, ire_ipif is the one where it obtained 19240Sstevel@tonic-gate * the source address from. ire_stq is the one where the 19250Sstevel@tonic-gate * packets will be sent out on. We return that here. 19260Sstevel@tonic-gate * 19270Sstevel@tonic-gate * 2) IRE_BROADCAST normally has a loopback and a non-loopback 19280Sstevel@tonic-gate * copy and they always exist next to each other with loopback 19290Sstevel@tonic-gate * copy being the first one. If we are called on the non-loopback 19300Sstevel@tonic-gate * copy, return the one pointed by ire_stq. If it was called on 19310Sstevel@tonic-gate * a loopback copy, we still return the one pointed by the next 19320Sstevel@tonic-gate * ire's ire_stq pointer i.e the one pointed by the non-loopback 19330Sstevel@tonic-gate * copy. We don't want use ire_ipif as it might represent the 19340Sstevel@tonic-gate * source address (if we borrow source addresses for 19350Sstevel@tonic-gate * IRE_BROADCASTS in the future). 19360Sstevel@tonic-gate * However if an interface is currently coming up, the above 19370Sstevel@tonic-gate * condition may not hold during that period since the ires 19380Sstevel@tonic-gate * are added one at a time. Thus one of the pair could have been 19390Sstevel@tonic-gate * added and the other not yet added. 19402906Snordmark * 3) For many other IREs (e.g., IRE_LOCAL), ire_rfq indicates the ill. 19412906Snordmark * 4) For all others return the ones pointed by ire_ipif->ipif_ill. 19422906Snordmark * That handles IRE_LOOPBACK. 
19430Sstevel@tonic-gate */ 19440Sstevel@tonic-gate 19450Sstevel@tonic-gate if (ire->ire_type == IRE_CACHE) { 19460Sstevel@tonic-gate ill = (ill_t *)ire->ire_stq->q_ptr; 19470Sstevel@tonic-gate } else if (ire->ire_type == IRE_BROADCAST) { 19480Sstevel@tonic-gate if (ire->ire_stq != NULL) { 19490Sstevel@tonic-gate ill = (ill_t *)ire->ire_stq->q_ptr; 19500Sstevel@tonic-gate } else { 19510Sstevel@tonic-gate ire_t *ire_next; 19520Sstevel@tonic-gate 19530Sstevel@tonic-gate ire_next = ire->ire_next; 19540Sstevel@tonic-gate if (ire_next != NULL && 19550Sstevel@tonic-gate ire_next->ire_type == IRE_BROADCAST && 19560Sstevel@tonic-gate ire_next->ire_addr == ire->ire_addr && 19570Sstevel@tonic-gate ire_next->ire_ipif == ire->ire_ipif) { 19580Sstevel@tonic-gate ill = (ill_t *)ire_next->ire_stq->q_ptr; 19590Sstevel@tonic-gate } 19600Sstevel@tonic-gate } 19612906Snordmark } else if (ire->ire_rfq != NULL) { 19622906Snordmark ill = ire->ire_rfq->q_ptr; 19630Sstevel@tonic-gate } else if (ire->ire_ipif != NULL) { 19640Sstevel@tonic-gate ill = ire->ire_ipif->ipif_ill; 19650Sstevel@tonic-gate } 19660Sstevel@tonic-gate return (ill); 19670Sstevel@tonic-gate } 19680Sstevel@tonic-gate 19690Sstevel@tonic-gate /* Arrange to call the specified function for every IRE in the world. 
*/ 19700Sstevel@tonic-gate void 19713448Sdh155122 ire_walk(pfv_t func, void *arg, ip_stack_t *ipst) 19720Sstevel@tonic-gate { 19733448Sdh155122 ire_walk_ipvers(func, arg, 0, ALL_ZONES, ipst); 19740Sstevel@tonic-gate } 19750Sstevel@tonic-gate 19760Sstevel@tonic-gate void 19773448Sdh155122 ire_walk_v4(pfv_t func, void *arg, zoneid_t zoneid, ip_stack_t *ipst) 19780Sstevel@tonic-gate { 19793448Sdh155122 ire_walk_ipvers(func, arg, IPV4_VERSION, zoneid, ipst); 19800Sstevel@tonic-gate } 19810Sstevel@tonic-gate 19820Sstevel@tonic-gate void 19833448Sdh155122 ire_walk_v6(pfv_t func, void *arg, zoneid_t zoneid, ip_stack_t *ipst) 19840Sstevel@tonic-gate { 19853448Sdh155122 ire_walk_ipvers(func, arg, IPV6_VERSION, zoneid, ipst); 19860Sstevel@tonic-gate } 19870Sstevel@tonic-gate 19880Sstevel@tonic-gate /* 19890Sstevel@tonic-gate * Walk a particular version. version == 0 means both v4 and v6. 19900Sstevel@tonic-gate */ 19910Sstevel@tonic-gate static void 19923448Sdh155122 ire_walk_ipvers(pfv_t func, void *arg, uchar_t vers, zoneid_t zoneid, 19933448Sdh155122 ip_stack_t *ipst) 19940Sstevel@tonic-gate { 19950Sstevel@tonic-gate if (vers != IPV6_VERSION) { 19962535Ssangeeta /* 19972535Ssangeeta * ip_forwarding_table variable doesn't matter for IPv4 since 19983448Sdh155122 * ire_walk_ill_tables uses ips_ip_ftable for IPv4. 
19992535Ssangeeta */ 20000Sstevel@tonic-gate ire_walk_ill_tables(0, 0, func, arg, IP_MASK_TABLE_SIZE, 20012535Ssangeeta 0, NULL, 20023448Sdh155122 ipst->ips_ip_cache_table_size, ipst->ips_ip_cache_table, 20033448Sdh155122 NULL, zoneid, ipst); 20040Sstevel@tonic-gate } 20050Sstevel@tonic-gate if (vers != IPV4_VERSION) { 20060Sstevel@tonic-gate ire_walk_ill_tables(0, 0, func, arg, IP6_MASK_TABLE_SIZE, 20073448Sdh155122 ipst->ips_ip6_ftable_hash_size, 20083448Sdh155122 ipst->ips_ip_forwarding_table_v6, 20093448Sdh155122 ipst->ips_ip6_cache_table_size, 20103448Sdh155122 ipst->ips_ip_cache_table_v6, NULL, zoneid, ipst); 20110Sstevel@tonic-gate } 20120Sstevel@tonic-gate } 20130Sstevel@tonic-gate 20140Sstevel@tonic-gate /* 20157216Smeem * Arrange to call the specified function for every IRE that matches the ill. 20160Sstevel@tonic-gate */ 20170Sstevel@tonic-gate void 20181676Sjpk ire_walk_ill(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg, 20190Sstevel@tonic-gate ill_t *ill) 20200Sstevel@tonic-gate { 20217216Smeem uchar_t vers = (ill->ill_isv6 ? 
IPV6_VERSION : IPV4_VERSION); 20227216Smeem 20237216Smeem ire_walk_ill_ipvers(match_flags, ire_type, func, arg, vers, ill); 20240Sstevel@tonic-gate } 20250Sstevel@tonic-gate 20260Sstevel@tonic-gate void 20271676Sjpk ire_walk_ill_v4(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg, 20280Sstevel@tonic-gate ill_t *ill) 20290Sstevel@tonic-gate { 20300Sstevel@tonic-gate ire_walk_ill_ipvers(match_flags, ire_type, func, arg, IPV4_VERSION, 20310Sstevel@tonic-gate ill); 20320Sstevel@tonic-gate } 20330Sstevel@tonic-gate 20340Sstevel@tonic-gate void 20351676Sjpk ire_walk_ill_v6(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg, 20360Sstevel@tonic-gate ill_t *ill) 20370Sstevel@tonic-gate { 20380Sstevel@tonic-gate ire_walk_ill_ipvers(match_flags, ire_type, func, arg, IPV6_VERSION, 20390Sstevel@tonic-gate ill); 20400Sstevel@tonic-gate } 20410Sstevel@tonic-gate 20420Sstevel@tonic-gate /* 20437216Smeem * Walk a particular ill and version. 20440Sstevel@tonic-gate */ 20450Sstevel@tonic-gate static void 20460Sstevel@tonic-gate ire_walk_ill_ipvers(uint_t match_flags, uint_t ire_type, pfv_t func, 20471676Sjpk void *arg, uchar_t vers, ill_t *ill) 20480Sstevel@tonic-gate { 20493448Sdh155122 ip_stack_t *ipst = ill->ill_ipst; 20503448Sdh155122 20517216Smeem if (vers == IPV4_VERSION) { 20520Sstevel@tonic-gate ire_walk_ill_tables(match_flags, ire_type, func, arg, 20532535Ssangeeta IP_MASK_TABLE_SIZE, 0, 20543448Sdh155122 NULL, ipst->ips_ip_cache_table_size, 20553448Sdh155122 ipst->ips_ip_cache_table, ill, ALL_ZONES, ipst); 20567216Smeem } else if (vers == IPV6_VERSION) { 20570Sstevel@tonic-gate ire_walk_ill_tables(match_flags, ire_type, func, arg, 20583448Sdh155122 IP6_MASK_TABLE_SIZE, ipst->ips_ip6_ftable_hash_size, 20593448Sdh155122 ipst->ips_ip_forwarding_table_v6, 20603448Sdh155122 ipst->ips_ip6_cache_table_size, 20613448Sdh155122 ipst->ips_ip_cache_table_v6, ill, ALL_ZONES, ipst); 20620Sstevel@tonic-gate } 20630Sstevel@tonic-gate } 20640Sstevel@tonic-gate 
20652535Ssangeeta boolean_t 20660Sstevel@tonic-gate ire_walk_ill_match(uint_t match_flags, uint_t ire_type, ire_t *ire, 20673448Sdh155122 ill_t *ill, zoneid_t zoneid, ip_stack_t *ipst) 20680Sstevel@tonic-gate { 20690Sstevel@tonic-gate ill_t *ire_stq_ill = NULL; 20700Sstevel@tonic-gate ill_t *ire_ipif_ill = NULL; 20710Sstevel@tonic-gate 20720Sstevel@tonic-gate ASSERT(match_flags != 0 || zoneid != ALL_ZONES); 20730Sstevel@tonic-gate /* 20748485SPeter.Memishian@Sun.COM * MATCH_IRE_ILL: We match both on ill pointed by ire_stq and 20758485SPeter.Memishian@Sun.COM * ire_ipif. Only in the case of IRE_CACHEs can ire_stq and 20768485SPeter.Memishian@Sun.COM * ire_ipif be pointing to different ills. But we want to keep 20778485SPeter.Memishian@Sun.COM * this function generic enough for future use. So, we always 20788485SPeter.Memishian@Sun.COM * try to match on both. The only caller of this function 20798485SPeter.Memishian@Sun.COM * ire_walk_ill_tables, will call "func" after we return from 20808485SPeter.Memishian@Sun.COM * this function. We expect "func" to do the right filtering 20818485SPeter.Memishian@Sun.COM * of ires in this case. 20820Sstevel@tonic-gate */ 20838485SPeter.Memishian@Sun.COM if (match_flags & MATCH_IRE_ILL) { 20840Sstevel@tonic-gate if (ire->ire_stq != NULL) 20858485SPeter.Memishian@Sun.COM ire_stq_ill = ire->ire_stq->q_ptr; 20860Sstevel@tonic-gate if (ire->ire_ipif != NULL) 20870Sstevel@tonic-gate ire_ipif_ill = ire->ire_ipif->ipif_ill; 20880Sstevel@tonic-gate } 20890Sstevel@tonic-gate 20900Sstevel@tonic-gate if (zoneid != ALL_ZONES) { 20910Sstevel@tonic-gate /* 20920Sstevel@tonic-gate * We're walking the IREs for a specific zone. 
The only relevant 20930Sstevel@tonic-gate * IREs are: 20940Sstevel@tonic-gate * - all IREs with a matching ire_zoneid 20950Sstevel@tonic-gate * - all IRE_OFFSUBNETs as they're shared across all zones 20960Sstevel@tonic-gate * - IRE_INTERFACE IREs for interfaces with a usable source addr 20970Sstevel@tonic-gate * with a matching zone 20980Sstevel@tonic-gate * - IRE_DEFAULTs with a gateway reachable from the zone 20990Sstevel@tonic-gate * We should really match on IRE_OFFSUBNETs and IRE_DEFAULTs 21000Sstevel@tonic-gate * using the same rule; but the above rules are consistent with 21010Sstevel@tonic-gate * the behavior of ire_ftable_lookup[_v6]() so that all the 21020Sstevel@tonic-gate * routes that can be matched during lookup are also matched 21030Sstevel@tonic-gate * here. 21040Sstevel@tonic-gate */ 21051676Sjpk if (zoneid != ire->ire_zoneid && ire->ire_zoneid != ALL_ZONES) { 21060Sstevel@tonic-gate /* 21070Sstevel@tonic-gate * Note, IRE_INTERFACE can have the stq as NULL. For 21080Sstevel@tonic-gate * example, if the default multicast route is tied to 21090Sstevel@tonic-gate * the loopback address. 
21100Sstevel@tonic-gate */ 21110Sstevel@tonic-gate if ((ire->ire_type & IRE_INTERFACE) && 21120Sstevel@tonic-gate (ire->ire_stq != NULL)) { 21130Sstevel@tonic-gate ire_stq_ill = (ill_t *)ire->ire_stq->q_ptr; 21140Sstevel@tonic-gate if (ire->ire_ipversion == IPV4_VERSION) { 21150Sstevel@tonic-gate if (!ipif_usesrc_avail(ire_stq_ill, 21160Sstevel@tonic-gate zoneid)) 21170Sstevel@tonic-gate /* No usable src addr in zone */ 21180Sstevel@tonic-gate return (B_FALSE); 21190Sstevel@tonic-gate } else if (ire_stq_ill->ill_usesrc_ifindex 21200Sstevel@tonic-gate != 0) { 21210Sstevel@tonic-gate /* 21220Sstevel@tonic-gate * For IPv6 use ipif_select_source_v6() 21230Sstevel@tonic-gate * so the right scope selection is done 21240Sstevel@tonic-gate */ 21250Sstevel@tonic-gate ipif_t *src_ipif; 21260Sstevel@tonic-gate src_ipif = 21270Sstevel@tonic-gate ipif_select_source_v6(ire_stq_ill, 21288485SPeter.Memishian@Sun.COM &ire->ire_addr_v6, B_FALSE, 21290Sstevel@tonic-gate IPV6_PREFER_SRC_DEFAULT, 21300Sstevel@tonic-gate zoneid); 21310Sstevel@tonic-gate if (src_ipif != NULL) { 21320Sstevel@tonic-gate ipif_refrele(src_ipif); 21330Sstevel@tonic-gate } else { 21340Sstevel@tonic-gate return (B_FALSE); 21350Sstevel@tonic-gate } 21360Sstevel@tonic-gate } else { 21370Sstevel@tonic-gate return (B_FALSE); 21380Sstevel@tonic-gate } 21390Sstevel@tonic-gate 21400Sstevel@tonic-gate } else if (!(ire->ire_type & IRE_OFFSUBNET)) { 21410Sstevel@tonic-gate return (B_FALSE); 21420Sstevel@tonic-gate } 21430Sstevel@tonic-gate } 21440Sstevel@tonic-gate 21450Sstevel@tonic-gate /* 21460Sstevel@tonic-gate * Match all default routes from the global zone, irrespective 21472733Snordmark * of reachability. For a non-global zone only match those 21482733Snordmark * where ire_gateway_addr has a IRE_INTERFACE for the zoneid. 
21490Sstevel@tonic-gate */ 21500Sstevel@tonic-gate if (ire->ire_type == IRE_DEFAULT && zoneid != GLOBAL_ZONEID) { 21510Sstevel@tonic-gate int ire_match_flags = 0; 21520Sstevel@tonic-gate in6_addr_t gw_addr_v6; 21530Sstevel@tonic-gate ire_t *rire; 21540Sstevel@tonic-gate 21552733Snordmark ire_match_flags |= MATCH_IRE_TYPE; 21568485SPeter.Memishian@Sun.COM if (ire->ire_ipif != NULL) 21578485SPeter.Memishian@Sun.COM ire_match_flags |= MATCH_IRE_ILL; 21588485SPeter.Memishian@Sun.COM 21590Sstevel@tonic-gate if (ire->ire_ipversion == IPV4_VERSION) { 21600Sstevel@tonic-gate rire = ire_route_lookup(ire->ire_gateway_addr, 21612733Snordmark 0, 0, IRE_INTERFACE, ire->ire_ipif, NULL, 21623448Sdh155122 zoneid, NULL, ire_match_flags, ipst); 21630Sstevel@tonic-gate } else { 21640Sstevel@tonic-gate ASSERT(ire->ire_ipversion == IPV6_VERSION); 21650Sstevel@tonic-gate mutex_enter(&ire->ire_lock); 21660Sstevel@tonic-gate gw_addr_v6 = ire->ire_gateway_addr_v6; 21670Sstevel@tonic-gate mutex_exit(&ire->ire_lock); 21680Sstevel@tonic-gate rire = ire_route_lookup_v6(&gw_addr_v6, 21692733Snordmark NULL, NULL, IRE_INTERFACE, ire->ire_ipif, 21703448Sdh155122 NULL, zoneid, NULL, ire_match_flags, ipst); 21710Sstevel@tonic-gate } 21720Sstevel@tonic-gate if (rire == NULL) { 21730Sstevel@tonic-gate return (B_FALSE); 21740Sstevel@tonic-gate } 21750Sstevel@tonic-gate ire_refrele(rire); 21760Sstevel@tonic-gate } 21770Sstevel@tonic-gate } 21780Sstevel@tonic-gate 21790Sstevel@tonic-gate if (((!(match_flags & MATCH_IRE_TYPE)) || 21804714Ssowmini (ire->ire_type & ire_type)) && 21810Sstevel@tonic-gate ((!(match_flags & MATCH_IRE_ILL)) || 21828485SPeter.Memishian@Sun.COM (ire_stq_ill == ill || ire_ipif_ill == ill || 21838485SPeter.Memishian@Sun.COM ire_ipif_ill != NULL && IS_IN_SAME_ILLGRP(ire_ipif_ill, ill)))) { 21840Sstevel@tonic-gate return (B_TRUE); 21850Sstevel@tonic-gate } 21860Sstevel@tonic-gate return (B_FALSE); 21870Sstevel@tonic-gate } 21880Sstevel@tonic-gate 21892535Ssangeeta int 
21902535Ssangeeta rtfunc(struct radix_node *rn, void *arg) 21912535Ssangeeta { 21922535Ssangeeta struct rtfuncarg *rtf = arg; 21932535Ssangeeta struct rt_entry *rt; 21942535Ssangeeta irb_t *irb; 21952535Ssangeeta ire_t *ire; 21962535Ssangeeta boolean_t ret; 21972535Ssangeeta 21982535Ssangeeta rt = (struct rt_entry *)rn; 21992535Ssangeeta ASSERT(rt != NULL); 22002535Ssangeeta irb = &rt->rt_irb; 22012535Ssangeeta for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { 22022535Ssangeeta if ((rtf->rt_match_flags != 0) || 22032535Ssangeeta (rtf->rt_zoneid != ALL_ZONES)) { 22042535Ssangeeta ret = ire_walk_ill_match(rtf->rt_match_flags, 22052535Ssangeeta rtf->rt_ire_type, ire, 22063448Sdh155122 rtf->rt_ill, rtf->rt_zoneid, rtf->rt_ipst); 22072535Ssangeeta } else 22082535Ssangeeta ret = B_TRUE; 22092535Ssangeeta if (ret) 22102535Ssangeeta (*rtf->rt_func)(ire, rtf->rt_arg); 22112535Ssangeeta } 22122535Ssangeeta return (0); 22132535Ssangeeta } 22142535Ssangeeta 22150Sstevel@tonic-gate /* 22160Sstevel@tonic-gate * Walk the ftable and the ctable entries that match the ill. 
22170Sstevel@tonic-gate */ 22182535Ssangeeta void 22190Sstevel@tonic-gate ire_walk_ill_tables(uint_t match_flags, uint_t ire_type, pfv_t func, 22201676Sjpk void *arg, size_t ftbl_sz, size_t htbl_sz, irb_t **ipftbl, 22213448Sdh155122 size_t ctbl_sz, irb_t *ipctbl, ill_t *ill, zoneid_t zoneid, 22223448Sdh155122 ip_stack_t *ipst) 22230Sstevel@tonic-gate { 22240Sstevel@tonic-gate irb_t *irb_ptr; 22250Sstevel@tonic-gate irb_t *irb; 22260Sstevel@tonic-gate ire_t *ire; 22270Sstevel@tonic-gate int i, j; 22280Sstevel@tonic-gate boolean_t ret; 22292535Ssangeeta struct rtfuncarg rtfarg; 22300Sstevel@tonic-gate 22318485SPeter.Memishian@Sun.COM ASSERT((!(match_flags & MATCH_IRE_ILL)) || (ill != NULL)); 22320Sstevel@tonic-gate ASSERT(!(match_flags & MATCH_IRE_TYPE) || (ire_type != 0)); 22330Sstevel@tonic-gate /* 22340Sstevel@tonic-gate * Optimize by not looking at the forwarding table if there 22350Sstevel@tonic-gate * is a MATCH_IRE_TYPE specified with no IRE_FORWARDTABLE 22360Sstevel@tonic-gate * specified in ire_type. 
22370Sstevel@tonic-gate */ 22380Sstevel@tonic-gate if (!(match_flags & MATCH_IRE_TYPE) || 22390Sstevel@tonic-gate ((ire_type & IRE_FORWARDTABLE) != 0)) { 22402535Ssangeeta /* knobs such that routine is called only for v6 case */ 22413448Sdh155122 if (ipftbl == ipst->ips_ip_forwarding_table_v6) { 22422535Ssangeeta for (i = (ftbl_sz - 1); i >= 0; i--) { 22432535Ssangeeta if ((irb_ptr = ipftbl[i]) == NULL) 22440Sstevel@tonic-gate continue; 22452535Ssangeeta for (j = 0; j < htbl_sz; j++) { 22462535Ssangeeta irb = &irb_ptr[j]; 22472535Ssangeeta if (irb->irb_ire == NULL) 22482535Ssangeeta continue; 22492535Ssangeeta 22502535Ssangeeta IRB_REFHOLD(irb); 22512535Ssangeeta for (ire = irb->irb_ire; ire != NULL; 22524714Ssowmini ire = ire->ire_next) { 22532535Ssangeeta if (match_flags == 0 && 22542535Ssangeeta zoneid == ALL_ZONES) { 22552535Ssangeeta ret = B_TRUE; 22562535Ssangeeta } else { 22572535Ssangeeta ret = 22582535Ssangeeta ire_walk_ill_match( 22592535Ssangeeta match_flags, 22602535Ssangeeta ire_type, ire, ill, 22613448Sdh155122 zoneid, ipst); 22622535Ssangeeta } 22632535Ssangeeta if (ret) 22642535Ssangeeta (*func)(ire, arg); 22650Sstevel@tonic-gate } 22662535Ssangeeta IRB_REFRELE(irb); 22670Sstevel@tonic-gate } 22680Sstevel@tonic-gate } 22692535Ssangeeta } else { 22702535Ssangeeta (void) memset(&rtfarg, 0, sizeof (rtfarg)); 22712535Ssangeeta rtfarg.rt_func = func; 22722535Ssangeeta rtfarg.rt_arg = arg; 22732535Ssangeeta if (match_flags != 0) { 22742535Ssangeeta rtfarg.rt_match_flags = match_flags; 22752535Ssangeeta } 22762535Ssangeeta rtfarg.rt_ire_type = ire_type; 22772535Ssangeeta rtfarg.rt_ill = ill; 22782535Ssangeeta rtfarg.rt_zoneid = zoneid; 22793448Sdh155122 rtfarg.rt_ipst = ipst; /* No netstack_hold */ 22803448Sdh155122 (void) ipst->ips_ip_ftable->rnh_walktree_mt( 22813448Sdh155122 ipst->ips_ip_ftable, 22823448Sdh155122 rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn); 22830Sstevel@tonic-gate } 22840Sstevel@tonic-gate } 22850Sstevel@tonic-gate 
22860Sstevel@tonic-gate /* 22870Sstevel@tonic-gate * Optimize by not looking at the cache table if there 22880Sstevel@tonic-gate * is a MATCH_IRE_TYPE specified with no IRE_CACHETABLE 22890Sstevel@tonic-gate * specified in ire_type. 22900Sstevel@tonic-gate */ 22910Sstevel@tonic-gate if (!(match_flags & MATCH_IRE_TYPE) || 22920Sstevel@tonic-gate ((ire_type & IRE_CACHETABLE) != 0)) { 22930Sstevel@tonic-gate for (i = 0; i < ctbl_sz; i++) { 22940Sstevel@tonic-gate irb = &ipctbl[i]; 22950Sstevel@tonic-gate if (irb->irb_ire == NULL) 22960Sstevel@tonic-gate continue; 22970Sstevel@tonic-gate IRB_REFHOLD(irb); 22980Sstevel@tonic-gate for (ire = irb->irb_ire; ire != NULL; 22990Sstevel@tonic-gate ire = ire->ire_next) { 23000Sstevel@tonic-gate if (match_flags == 0 && zoneid == ALL_ZONES) { 23010Sstevel@tonic-gate ret = B_TRUE; 23020Sstevel@tonic-gate } else { 23030Sstevel@tonic-gate ret = ire_walk_ill_match( 23040Sstevel@tonic-gate match_flags, ire_type, 23053448Sdh155122 ire, ill, zoneid, ipst); 23060Sstevel@tonic-gate } 23070Sstevel@tonic-gate if (ret) 23080Sstevel@tonic-gate (*func)(ire, arg); 23090Sstevel@tonic-gate } 23100Sstevel@tonic-gate IRB_REFRELE(irb); 23110Sstevel@tonic-gate } 23120Sstevel@tonic-gate } 23130Sstevel@tonic-gate } 23140Sstevel@tonic-gate 23150Sstevel@tonic-gate /* 23160Sstevel@tonic-gate * This function takes a mask and returns 23170Sstevel@tonic-gate * number of bits set in the mask. If no 23180Sstevel@tonic-gate * bit is set it returns 0. 23190Sstevel@tonic-gate * Assumes a contiguous mask. 23200Sstevel@tonic-gate */ 23210Sstevel@tonic-gate int 23220Sstevel@tonic-gate ip_mask_to_plen(ipaddr_t mask) 23230Sstevel@tonic-gate { 23240Sstevel@tonic-gate return (mask == 0 ? 0 : IP_ABITS - (ffs(ntohl(mask)) -1)); 23250Sstevel@tonic-gate } 23260Sstevel@tonic-gate 23270Sstevel@tonic-gate /* 23280Sstevel@tonic-gate * Convert length for a mask to the mask. 
23290Sstevel@tonic-gate */ 23300Sstevel@tonic-gate ipaddr_t 23310Sstevel@tonic-gate ip_plen_to_mask(uint_t masklen) 23320Sstevel@tonic-gate { 23330Sstevel@tonic-gate return (htonl(IP_HOST_MASK << (IP_ABITS - masklen))); 23340Sstevel@tonic-gate } 23350Sstevel@tonic-gate 23360Sstevel@tonic-gate void 23370Sstevel@tonic-gate ire_atomic_end(irb_t *irb_ptr, ire_t *ire) 23380Sstevel@tonic-gate { 23398564SPeter.Memishian@Sun.COM ill_t *stq_ill, *ipif_ill; 23408564SPeter.Memishian@Sun.COM ip_stack_t *ipst = ire->ire_ipst; 23418564SPeter.Memishian@Sun.COM 23428564SPeter.Memishian@Sun.COM stq_ill = ire->ire_stq != NULL ? ire->ire_stq->q_ptr : NULL; 23438564SPeter.Memishian@Sun.COM ipif_ill = ire->ire_ipif != NULL ? ire->ire_ipif->ipif_ill : NULL; 23448564SPeter.Memishian@Sun.COM RELEASE_ILL_LOCKS(ipif_ill, stq_ill); 23450Sstevel@tonic-gate rw_exit(&irb_ptr->irb_lock); 23463448Sdh155122 rw_exit(&ipst->ips_ill_g_usesrc_lock); 23470Sstevel@tonic-gate } 23480Sstevel@tonic-gate 23490Sstevel@tonic-gate /* 23500Sstevel@tonic-gate * ire_add_v[46] atomically make sure that the ipif or ill associated 23510Sstevel@tonic-gate * with the new ire being added is stable and not IPIF_CHANGING or ILL_CHANGING 23520Sstevel@tonic-gate * before adding the ire to the table. This ensures that we don't create 23530Sstevel@tonic-gate * new IRE_CACHEs with stale values for parameters that are passed to 23540Sstevel@tonic-gate * ire_create such as ire_max_frag. Note that ire_create() is passed a pointer 23550Sstevel@tonic-gate * to the ipif_mtu, and not the value. The actual value is derived from the 23560Sstevel@tonic-gate * parent ire or ipif under the bucket lock. 
23570Sstevel@tonic-gate */ 23580Sstevel@tonic-gate int 23590Sstevel@tonic-gate ire_atomic_start(irb_t *irb_ptr, ire_t *ire, queue_t *q, mblk_t *mp, 23600Sstevel@tonic-gate ipsq_func_t func) 23610Sstevel@tonic-gate { 23620Sstevel@tonic-gate ill_t *stq_ill; 23630Sstevel@tonic-gate ill_t *ipif_ill; 23640Sstevel@tonic-gate int error = 0; 23650Sstevel@tonic-gate ill_t *ill = NULL; 23663448Sdh155122 ip_stack_t *ipst = ire->ire_ipst; 23670Sstevel@tonic-gate 23688564SPeter.Memishian@Sun.COM stq_ill = ire->ire_stq != NULL ? ire->ire_stq->q_ptr : NULL; 23698564SPeter.Memishian@Sun.COM ipif_ill = ire->ire_ipif != NULL ? ire->ire_ipif->ipif_ill : NULL; 23700Sstevel@tonic-gate 23710Sstevel@tonic-gate ASSERT((q != NULL && mp != NULL && func != NULL) || 23720Sstevel@tonic-gate (q == NULL && mp == NULL && func == NULL)); 23733448Sdh155122 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER); 23740Sstevel@tonic-gate GRAB_CONN_LOCK(q); 23750Sstevel@tonic-gate rw_enter(&irb_ptr->irb_lock, RW_WRITER); 23768564SPeter.Memishian@Sun.COM GRAB_ILL_LOCKS(ipif_ill, stq_ill); 23770Sstevel@tonic-gate 23780Sstevel@tonic-gate /* 23790Sstevel@tonic-gate * While the IRE is in the process of being added, a user may have 23800Sstevel@tonic-gate * invoked the ifconfig usesrc option on the stq_ill to make it a 23810Sstevel@tonic-gate * usesrc client ILL. Check for this possibility here, if it is true 23820Sstevel@tonic-gate * then we fail adding the IRE_CACHE. Another check is to make sure 23830Sstevel@tonic-gate * that an ipif_ill of an IRE_CACHE being added is not part of a usesrc 23840Sstevel@tonic-gate * group. 
The ill_g_usesrc_lock is released in ire_atomic_end 23850Sstevel@tonic-gate */ 23860Sstevel@tonic-gate if ((ire->ire_type & IRE_CACHE) && 23870Sstevel@tonic-gate (ire->ire_marks & IRE_MARK_USESRC_CHECK)) { 23880Sstevel@tonic-gate if (stq_ill->ill_usesrc_ifindex != 0) { 23890Sstevel@tonic-gate ASSERT(stq_ill->ill_usesrc_grp_next != NULL); 23900Sstevel@tonic-gate if ((ipif_ill->ill_phyint->phyint_ifindex != 23910Sstevel@tonic-gate stq_ill->ill_usesrc_ifindex) || 23920Sstevel@tonic-gate (ipif_ill->ill_usesrc_grp_next == NULL) || 23930Sstevel@tonic-gate (ipif_ill->ill_usesrc_ifindex != 0)) { 23940Sstevel@tonic-gate error = EINVAL; 23950Sstevel@tonic-gate goto done; 23960Sstevel@tonic-gate } 23970Sstevel@tonic-gate } else if (ipif_ill->ill_usesrc_grp_next != NULL) { 23980Sstevel@tonic-gate error = EINVAL; 23990Sstevel@tonic-gate goto done; 24000Sstevel@tonic-gate } 24010Sstevel@tonic-gate } 24020Sstevel@tonic-gate 24030Sstevel@tonic-gate /* 24048485SPeter.Memishian@Sun.COM * Don't allow IRE's to be created on changing ill's. Also, since 24058485SPeter.Memishian@Sun.COM * IPMP flags can be set on an ill without quiescing it, if we're not 24068485SPeter.Memishian@Sun.COM * a writer on stq_ill, check that the flags still allow IRE creation. 
24070Sstevel@tonic-gate */ 24080Sstevel@tonic-gate if ((stq_ill != NULL) && !IAM_WRITER_ILL(stq_ill)) { 24090Sstevel@tonic-gate if (stq_ill->ill_state_flags & ILL_CHANGING) { 24100Sstevel@tonic-gate ill = stq_ill; 24110Sstevel@tonic-gate error = EAGAIN; 24128485SPeter.Memishian@Sun.COM } else if (IS_UNDER_IPMP(stq_ill)) { 24138485SPeter.Memishian@Sun.COM mutex_enter(&stq_ill->ill_phyint->phyint_lock); 24148485SPeter.Memishian@Sun.COM if (!ipmp_ill_is_active(stq_ill) && 24158485SPeter.Memishian@Sun.COM !(ire->ire_marks & IRE_MARK_TESTHIDDEN)) { 24168485SPeter.Memishian@Sun.COM error = EINVAL; 24178485SPeter.Memishian@Sun.COM } 24188485SPeter.Memishian@Sun.COM mutex_exit(&stq_ill->ill_phyint->phyint_lock); 24190Sstevel@tonic-gate } 24208485SPeter.Memishian@Sun.COM if (error != 0) 24218485SPeter.Memishian@Sun.COM goto done; 24220Sstevel@tonic-gate } 24230Sstevel@tonic-gate 24240Sstevel@tonic-gate if ((ipif_ill != NULL) && !IAM_WRITER_ILL(ipif_ill) && 24250Sstevel@tonic-gate (ipif_ill->ill_state_flags & ILL_CHANGING)) { 24260Sstevel@tonic-gate ill = ipif_ill; 24270Sstevel@tonic-gate error = EAGAIN; 24280Sstevel@tonic-gate goto done; 24290Sstevel@tonic-gate } 24300Sstevel@tonic-gate 24310Sstevel@tonic-gate if ((ire->ire_ipif != NULL) && !IAM_WRITER_IPIF(ire->ire_ipif) && 24320Sstevel@tonic-gate (ire->ire_ipif->ipif_state_flags & IPIF_CHANGING)) { 24330Sstevel@tonic-gate ill = ire->ire_ipif->ipif_ill; 24340Sstevel@tonic-gate ASSERT(ill != NULL); 24350Sstevel@tonic-gate error = EAGAIN; 24360Sstevel@tonic-gate goto done; 24370Sstevel@tonic-gate } 24380Sstevel@tonic-gate 24390Sstevel@tonic-gate done: 24400Sstevel@tonic-gate if (error == EAGAIN && ILL_CAN_WAIT(ill, q)) { 24410Sstevel@tonic-gate ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 24420Sstevel@tonic-gate mutex_enter(&ipsq->ipsq_lock); 24438485SPeter.Memishian@Sun.COM mutex_enter(&ipsq->ipsq_xop->ipx_lock); 24440Sstevel@tonic-gate ire_atomic_end(irb_ptr, ire); 24450Sstevel@tonic-gate ipsq_enq(ipsq, q, mp, func, 
NEW_OP, ill); 24468485SPeter.Memishian@Sun.COM mutex_exit(&ipsq->ipsq_xop->ipx_lock); 24470Sstevel@tonic-gate mutex_exit(&ipsq->ipsq_lock); 24480Sstevel@tonic-gate error = EINPROGRESS; 24490Sstevel@tonic-gate } else if (error != 0) { 24500Sstevel@tonic-gate ire_atomic_end(irb_ptr, ire); 24510Sstevel@tonic-gate } 24520Sstevel@tonic-gate 24530Sstevel@tonic-gate RELEASE_CONN_LOCK(q); 24540Sstevel@tonic-gate return (error); 24550Sstevel@tonic-gate } 24560Sstevel@tonic-gate 24570Sstevel@tonic-gate /* 24580Sstevel@tonic-gate * Add a fully initialized IRE to an appropriate table based on 24590Sstevel@tonic-gate * ire_type. 24602535Ssangeeta * 24612535Ssangeeta * allow_unresolved == B_FALSE indicates a legacy code-path call 24622535Ssangeeta * that has prohibited the addition of incomplete ire's. If this 24632535Ssangeeta * parameter is set, and we find an nce that is in a state other 24642535Ssangeeta * than ND_REACHABLE, we fail the add. Note that nce_state could be 24654084Ssowmini * something other than ND_REACHABLE if the nce had just expired and 24664084Ssowmini * the ire_create preceding the ire_add added a new ND_INITIAL nce. 
24670Sstevel@tonic-gate */ 24680Sstevel@tonic-gate int 24692535Ssangeeta ire_add(ire_t **irep, queue_t *q, mblk_t *mp, ipsq_func_t func, 24702535Ssangeeta boolean_t allow_unresolved) 24710Sstevel@tonic-gate { 24720Sstevel@tonic-gate ire_t *ire1; 24730Sstevel@tonic-gate ill_t *stq_ill = NULL; 24740Sstevel@tonic-gate ill_t *ill; 24750Sstevel@tonic-gate ipif_t *ipif = NULL; 24760Sstevel@tonic-gate ill_walk_context_t ctx; 24770Sstevel@tonic-gate ire_t *ire = *irep; 24780Sstevel@tonic-gate int error; 24792416Sjarrett boolean_t ire_is_mblk = B_FALSE; 24802416Sjarrett tsol_gcgrp_t *gcgrp = NULL; 24812416Sjarrett tsol_gcgrp_addr_t ga; 24823448Sdh155122 ip_stack_t *ipst = ire->ire_ipst; 24830Sstevel@tonic-gate 24840Sstevel@tonic-gate /* get ready for the day when original ire is not created as mblk */ 24850Sstevel@tonic-gate if (ire->ire_mp != NULL) { 24862416Sjarrett ire_is_mblk = B_TRUE; 24870Sstevel@tonic-gate /* Copy the ire to a kmem_alloc'ed area */ 24880Sstevel@tonic-gate ire1 = kmem_cache_alloc(ire_cache, KM_NOSLEEP); 24890Sstevel@tonic-gate if (ire1 == NULL) { 24900Sstevel@tonic-gate ip1dbg(("ire_add: alloc failed\n")); 24910Sstevel@tonic-gate ire_delete(ire); 24920Sstevel@tonic-gate *irep = NULL; 24930Sstevel@tonic-gate return (ENOMEM); 24940Sstevel@tonic-gate } 24952535Ssangeeta ire->ire_marks &= ~IRE_MARK_UNCACHED; 24960Sstevel@tonic-gate *ire1 = *ire; 24970Sstevel@tonic-gate ire1->ire_mp = NULL; 24982535Ssangeeta ire1->ire_stq_ifindex = 0; 24990Sstevel@tonic-gate freeb(ire->ire_mp); 25000Sstevel@tonic-gate ire = ire1; 25010Sstevel@tonic-gate } 25020Sstevel@tonic-gate if (ire->ire_stq != NULL) 25038485SPeter.Memishian@Sun.COM stq_ill = ire->ire_stq->q_ptr; 25040Sstevel@tonic-gate 25050Sstevel@tonic-gate if (stq_ill != NULL && ire->ire_type == IRE_CACHE && 25060Sstevel@tonic-gate stq_ill->ill_net_type == IRE_IF_RESOLVER) { 25073448Sdh155122 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 25083448Sdh155122 ill = ILL_START_WALK_ALL(&ctx, ipst); 25090Sstevel@tonic-gate 
for (; ill != NULL; ill = ill_next(&ctx, ill)) { 25100Sstevel@tonic-gate mutex_enter(&ill->ill_lock); 25110Sstevel@tonic-gate if (ill->ill_state_flags & ILL_CONDEMNED) { 25120Sstevel@tonic-gate mutex_exit(&ill->ill_lock); 25130Sstevel@tonic-gate continue; 25140Sstevel@tonic-gate } 25150Sstevel@tonic-gate /* 25160Sstevel@tonic-gate * We need to make sure that the ipif is a valid one 25170Sstevel@tonic-gate * before adding the IRE_CACHE. This happens only 25180Sstevel@tonic-gate * with IRE_CACHE when there is an external resolver. 25190Sstevel@tonic-gate * 25200Sstevel@tonic-gate * We can unplumb a logical interface while the 25210Sstevel@tonic-gate * packet is waiting in ARP with the IRE. Then, 25220Sstevel@tonic-gate * later on when we feed the IRE back, the ipif 25230Sstevel@tonic-gate * has to be re-checked. This can't happen with 25240Sstevel@tonic-gate * NDP currently, as we never queue the IRE with 25250Sstevel@tonic-gate * the packet. We always try to recreate the IRE 25260Sstevel@tonic-gate * when the resolution is completed. But, we do 25270Sstevel@tonic-gate * it for IPv6 also here so that in future if 25280Sstevel@tonic-gate * we have external resolvers, it will work without 25290Sstevel@tonic-gate * any change. 
25300Sstevel@tonic-gate */ 25310Sstevel@tonic-gate ipif = ipif_lookup_seqid(ill, ire->ire_ipif_seqid); 25320Sstevel@tonic-gate if (ipif != NULL) { 25330Sstevel@tonic-gate ipif_refhold_locked(ipif); 25340Sstevel@tonic-gate mutex_exit(&ill->ill_lock); 25350Sstevel@tonic-gate break; 25360Sstevel@tonic-gate } 25370Sstevel@tonic-gate mutex_exit(&ill->ill_lock); 25380Sstevel@tonic-gate } 25393448Sdh155122 rw_exit(&ipst->ips_ill_g_lock); 25400Sstevel@tonic-gate if (ipif == NULL || 25410Sstevel@tonic-gate (ipif->ipif_isv6 && 25428485SPeter.Memishian@Sun.COM !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6) && 25430Sstevel@tonic-gate !IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6, 25440Sstevel@tonic-gate &ipif->ipif_v6src_addr)) || 25450Sstevel@tonic-gate (!ipif->ipif_isv6 && 25460Sstevel@tonic-gate ire->ire_src_addr != ipif->ipif_src_addr) || 25471676Sjpk ire->ire_zoneid != ipif->ipif_zoneid) { 25480Sstevel@tonic-gate if (ipif != NULL) 25490Sstevel@tonic-gate ipif_refrele(ipif); 25500Sstevel@tonic-gate ire->ire_ipif = NULL; 25510Sstevel@tonic-gate ire_delete(ire); 25520Sstevel@tonic-gate *irep = NULL; 25530Sstevel@tonic-gate return (EINVAL); 25540Sstevel@tonic-gate } 25550Sstevel@tonic-gate 25560Sstevel@tonic-gate ASSERT(ill != NULL); 25572416Sjarrett 25582416Sjarrett /* 25592416Sjarrett * Since we didn't attach label security attributes to the 25602416Sjarrett * ire for the resolver case, we need to add it now. (only 25612416Sjarrett * for v4 resolver and v6 xresolv case). 25622416Sjarrett */ 25632416Sjarrett if (is_system_labeled() && ire_is_mblk) { 25642416Sjarrett if (ire->ire_ipversion == IPV4_VERSION) { 25652416Sjarrett ga.ga_af = AF_INET; 25662416Sjarrett IN6_IPADDR_TO_V4MAPPED(ire->ire_gateway_addr != 25672416Sjarrett INADDR_ANY ? ire->ire_gateway_addr : 25682416Sjarrett ire->ire_addr, &ga.ga_addr); 25692416Sjarrett } else { 25702416Sjarrett ga.ga_af = AF_INET6; 25712416Sjarrett ga.ga_addr = IN6_IS_ADDR_UNSPECIFIED( 25722416Sjarrett &ire->ire_gateway_addr_v6) ? 
25732416Sjarrett ire->ire_addr_v6 : 25742416Sjarrett ire->ire_gateway_addr_v6; 25752416Sjarrett } 25762416Sjarrett gcgrp = gcgrp_lookup(&ga, B_FALSE); 25772416Sjarrett error = tsol_ire_init_gwattr(ire, ire->ire_ipversion, 25782416Sjarrett NULL, gcgrp); 25792416Sjarrett if (error != 0) { 25802416Sjarrett if (gcgrp != NULL) { 25812416Sjarrett GCGRP_REFRELE(gcgrp); 25822416Sjarrett gcgrp = NULL; 25832416Sjarrett } 25842416Sjarrett ipif_refrele(ipif); 25852416Sjarrett ire->ire_ipif = NULL; 25862416Sjarrett ire_delete(ire); 25872416Sjarrett *irep = NULL; 25882416Sjarrett return (error); 25892416Sjarrett } 25902416Sjarrett } 25910Sstevel@tonic-gate } 25920Sstevel@tonic-gate 25930Sstevel@tonic-gate /* 25940Sstevel@tonic-gate * In case ire was changed 25950Sstevel@tonic-gate */ 25960Sstevel@tonic-gate *irep = ire; 25974823Sseb if (ire->ire_ipversion == IPV6_VERSION) 25980Sstevel@tonic-gate error = ire_add_v6(irep, q, mp, func); 25994823Sseb else 26004823Sseb error = ire_add_v4(irep, q, mp, func, allow_unresolved); 26010Sstevel@tonic-gate if (ipif != NULL) 26020Sstevel@tonic-gate ipif_refrele(ipif); 26030Sstevel@tonic-gate return (error); 26040Sstevel@tonic-gate } 26050Sstevel@tonic-gate 26060Sstevel@tonic-gate /* 26072416Sjarrett * Add an initialized IRE to an appropriate table based on ire_type. 26080Sstevel@tonic-gate * 26093004Sdd193516 * The forward table contains IRE_PREFIX/IRE_HOST and 26100Sstevel@tonic-gate * IRE_IF_RESOLVER/IRE_IF_NORESOLVER and IRE_DEFAULT. 26110Sstevel@tonic-gate * 26120Sstevel@tonic-gate * The cache table contains IRE_BROADCAST/IRE_LOCAL/IRE_LOOPBACK 26130Sstevel@tonic-gate * and IRE_CACHE. 26140Sstevel@tonic-gate * 26150Sstevel@tonic-gate * NOTE : This function is called as writer though not required 26160Sstevel@tonic-gate * by this function. 
26170Sstevel@tonic-gate */ 26180Sstevel@tonic-gate static int 26192535Ssangeeta ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, 26202535Ssangeeta boolean_t allow_unresolved) 26210Sstevel@tonic-gate { 26220Sstevel@tonic-gate ire_t *ire1; 26230Sstevel@tonic-gate irb_t *irb_ptr; 26240Sstevel@tonic-gate ire_t **irep; 26250Sstevel@tonic-gate int flags; 26260Sstevel@tonic-gate ire_t *pire = NULL; 26270Sstevel@tonic-gate ill_t *stq_ill; 26280Sstevel@tonic-gate ire_t *ire = *ire_p; 26290Sstevel@tonic-gate int error; 26302535Ssangeeta boolean_t need_refrele = B_FALSE; 26312535Ssangeeta nce_t *nce; 26323448Sdh155122 ip_stack_t *ipst = ire->ire_ipst; 26338485SPeter.Memishian@Sun.COM uint_t marks = 0; 26348485SPeter.Memishian@Sun.COM 26358485SPeter.Memishian@Sun.COM /* 26368485SPeter.Memishian@Sun.COM * IREs with source addresses hosted on interfaces that are under IPMP 26378485SPeter.Memishian@Sun.COM * should be hidden so that applications don't accidentally end up 26388485SPeter.Memishian@Sun.COM * sending packets with test addresses as their source addresses, or 26398485SPeter.Memishian@Sun.COM * sending out interfaces that are e.g. IFF_INACTIVE. Hide them here. 26408485SPeter.Memishian@Sun.COM */ 26418485SPeter.Memishian@Sun.COM if (ire->ire_ipif != NULL && IS_UNDER_IPMP(ire->ire_ipif->ipif_ill)) 26428485SPeter.Memishian@Sun.COM marks |= IRE_MARK_TESTHIDDEN; 26430Sstevel@tonic-gate 26440Sstevel@tonic-gate if (ire->ire_ipif != NULL) 26450Sstevel@tonic-gate ASSERT(!MUTEX_HELD(&ire->ire_ipif->ipif_ill->ill_lock)); 26460Sstevel@tonic-gate if (ire->ire_stq != NULL) 26470Sstevel@tonic-gate ASSERT(!MUTEX_HELD( 26480Sstevel@tonic-gate &((ill_t *)(ire->ire_stq->q_ptr))->ill_lock)); 26490Sstevel@tonic-gate ASSERT(ire->ire_ipversion == IPV4_VERSION); 26500Sstevel@tonic-gate ASSERT(ire->ire_mp == NULL); /* Calls should go through ire_add */ 26510Sstevel@tonic-gate 26520Sstevel@tonic-gate /* Find the appropriate list head. 
*/ 26530Sstevel@tonic-gate switch (ire->ire_type) { 26540Sstevel@tonic-gate case IRE_HOST: 26550Sstevel@tonic-gate ire->ire_mask = IP_HOST_MASK; 26560Sstevel@tonic-gate ire->ire_masklen = IP_ABITS; 26578485SPeter.Memishian@Sun.COM ire->ire_marks |= marks; 26580Sstevel@tonic-gate if ((ire->ire_flags & RTF_SETSRC) == 0) 26590Sstevel@tonic-gate ire->ire_src_addr = 0; 26600Sstevel@tonic-gate break; 26610Sstevel@tonic-gate case IRE_CACHE: 26628485SPeter.Memishian@Sun.COM ire->ire_mask = IP_HOST_MASK; 26638485SPeter.Memishian@Sun.COM ire->ire_masklen = IP_ABITS; 26648485SPeter.Memishian@Sun.COM ire->ire_marks |= marks; 26658485SPeter.Memishian@Sun.COM break; 26660Sstevel@tonic-gate case IRE_BROADCAST: 26670Sstevel@tonic-gate case IRE_LOCAL: 26680Sstevel@tonic-gate case IRE_LOOPBACK: 26690Sstevel@tonic-gate ire->ire_mask = IP_HOST_MASK; 26700Sstevel@tonic-gate ire->ire_masklen = IP_ABITS; 26710Sstevel@tonic-gate break; 26720Sstevel@tonic-gate case IRE_PREFIX: 26730Sstevel@tonic-gate case IRE_DEFAULT: 26748485SPeter.Memishian@Sun.COM ire->ire_marks |= marks; 26750Sstevel@tonic-gate if ((ire->ire_flags & RTF_SETSRC) == 0) 26760Sstevel@tonic-gate ire->ire_src_addr = 0; 26770Sstevel@tonic-gate break; 26780Sstevel@tonic-gate case IRE_IF_RESOLVER: 26790Sstevel@tonic-gate case IRE_IF_NORESOLVER: 26808485SPeter.Memishian@Sun.COM ire->ire_marks |= marks; 26810Sstevel@tonic-gate break; 26820Sstevel@tonic-gate default: 26832535Ssangeeta ip0dbg(("ire_add_v4: ire %p has unrecognized IRE type (%d)\n", 26842535Ssangeeta (void *)ire, ire->ire_type)); 26850Sstevel@tonic-gate ire_delete(ire); 26860Sstevel@tonic-gate *ire_p = NULL; 26870Sstevel@tonic-gate return (EINVAL); 26880Sstevel@tonic-gate } 26890Sstevel@tonic-gate 26900Sstevel@tonic-gate /* Make sure the address is properly masked. 
*/ 26910Sstevel@tonic-gate ire->ire_addr &= ire->ire_mask; 26920Sstevel@tonic-gate 26930Sstevel@tonic-gate /* 26940Sstevel@tonic-gate * ip_newroute/ip_newroute_multi are unable to prevent the deletion 26950Sstevel@tonic-gate * of the interface route while adding an IRE_CACHE for an on-link 26960Sstevel@tonic-gate * destination in the IRE_IF_RESOLVER case, since the ire has to 26970Sstevel@tonic-gate * go to ARP and return. We can't do a REFHOLD on the 26980Sstevel@tonic-gate * associated interface ire for fear of ARP freeing the message. 26990Sstevel@tonic-gate * Here we look up the interface ire in the forwarding table and 27000Sstevel@tonic-gate * make sure that the interface route has not been deleted. 27010Sstevel@tonic-gate */ 27020Sstevel@tonic-gate if (ire->ire_type == IRE_CACHE && ire->ire_gateway_addr == 0 && 27030Sstevel@tonic-gate ((ill_t *)ire->ire_stq->q_ptr)->ill_net_type == IRE_IF_RESOLVER) { 27042535Ssangeeta 27050Sstevel@tonic-gate ASSERT(ire->ire_max_fragp == NULL); 27060Sstevel@tonic-gate if (CLASSD(ire->ire_addr) && !(ire->ire_flags & RTF_SETSRC)) { 27070Sstevel@tonic-gate /* 27080Sstevel@tonic-gate * The ihandle that we used in ip_newroute_multi 27090Sstevel@tonic-gate * comes from the interface route corresponding 27100Sstevel@tonic-gate * to ire_ipif. Lookup here to see if it exists 27110Sstevel@tonic-gate * still. 27120Sstevel@tonic-gate * If the ire has a source address assigned using 27130Sstevel@tonic-gate * RTF_SETSRC, ire_ipif is the logical interface holding 27140Sstevel@tonic-gate * this source address, so we can't use it to check for 27150Sstevel@tonic-gate * the existence of the interface route. Instead we rely 27160Sstevel@tonic-gate * on the brute force ihandle search in 27170Sstevel@tonic-gate * ire_ihandle_lookup_onlink() below. 
27180Sstevel@tonic-gate */ 27190Sstevel@tonic-gate pire = ipif_to_ire(ire->ire_ipif); 27200Sstevel@tonic-gate if (pire == NULL) { 27210Sstevel@tonic-gate ire_delete(ire); 27220Sstevel@tonic-gate *ire_p = NULL; 27230Sstevel@tonic-gate return (EINVAL); 27240Sstevel@tonic-gate } else if (pire->ire_ihandle != ire->ire_ihandle) { 27250Sstevel@tonic-gate ire_refrele(pire); 27260Sstevel@tonic-gate ire_delete(ire); 27270Sstevel@tonic-gate *ire_p = NULL; 27280Sstevel@tonic-gate return (EINVAL); 27290Sstevel@tonic-gate } 27300Sstevel@tonic-gate } else { 27310Sstevel@tonic-gate pire = ire_ihandle_lookup_onlink(ire); 27320Sstevel@tonic-gate if (pire == NULL) { 27330Sstevel@tonic-gate ire_delete(ire); 27340Sstevel@tonic-gate *ire_p = NULL; 27350Sstevel@tonic-gate return (EINVAL); 27360Sstevel@tonic-gate } 27370Sstevel@tonic-gate } 27380Sstevel@tonic-gate /* Prevent pire from getting deleted */ 27390Sstevel@tonic-gate IRB_REFHOLD(pire->ire_bucket); 27400Sstevel@tonic-gate /* Has it been removed already ? */ 27410Sstevel@tonic-gate if (pire->ire_marks & IRE_MARK_CONDEMNED) { 27420Sstevel@tonic-gate IRB_REFRELE(pire->ire_bucket); 27430Sstevel@tonic-gate ire_refrele(pire); 27440Sstevel@tonic-gate ire_delete(ire); 27450Sstevel@tonic-gate *ire_p = NULL; 27460Sstevel@tonic-gate return (EINVAL); 27470Sstevel@tonic-gate } 27480Sstevel@tonic-gate } else { 27490Sstevel@tonic-gate ASSERT(ire->ire_max_fragp != NULL); 27500Sstevel@tonic-gate } 27510Sstevel@tonic-gate flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW); 27520Sstevel@tonic-gate 27530Sstevel@tonic-gate if (ire->ire_ipif != NULL) { 27540Sstevel@tonic-gate /* 27550Sstevel@tonic-gate * We use MATCH_IRE_IPIF while adding IRE_CACHES only 27560Sstevel@tonic-gate * for historic reasons and to maintain symmetry with 27570Sstevel@tonic-gate * IPv6 code path. Historically this was used by 27580Sstevel@tonic-gate * multicast code to create multiple IRE_CACHES on 27590Sstevel@tonic-gate * a single ill with different ipifs. 
This was used 27600Sstevel@tonic-gate * so that multicast packets leaving the node had the 27610Sstevel@tonic-gate * right source address. This is no longer needed as 27620Sstevel@tonic-gate * ip_wput initializes the address correctly. 27630Sstevel@tonic-gate */ 27640Sstevel@tonic-gate flags |= MATCH_IRE_IPIF; 27650Sstevel@tonic-gate /* 27668485SPeter.Memishian@Sun.COM * If we are creating a hidden IRE, make sure we search for 27678485SPeter.Memishian@Sun.COM * hidden IREs when searching for duplicates below. 27688485SPeter.Memishian@Sun.COM * Otherwise, we might find an IRE on some other interface 27698485SPeter.Memishian@Sun.COM * that's not marked hidden. 27700Sstevel@tonic-gate */ 27718485SPeter.Memishian@Sun.COM if (ire->ire_marks & IRE_MARK_TESTHIDDEN) 27728485SPeter.Memishian@Sun.COM flags |= MATCH_IRE_MARK_TESTHIDDEN; 27730Sstevel@tonic-gate } 27742535Ssangeeta if ((ire->ire_type & IRE_CACHETABLE) == 0) { 27752535Ssangeeta irb_ptr = ire_get_bucket(ire); 27762535Ssangeeta need_refrele = B_TRUE; 27772535Ssangeeta if (irb_ptr == NULL) { 27782535Ssangeeta /* 27792535Ssangeeta * This assumes that the ire has not added 27802535Ssangeeta * a reference to the ipif. 27812535Ssangeeta */ 27822535Ssangeeta ire->ire_ipif = NULL; 27832535Ssangeeta ire_delete(ire); 27842535Ssangeeta if (pire != NULL) { 27852535Ssangeeta IRB_REFRELE(pire->ire_bucket); 27862535Ssangeeta ire_refrele(pire); 27872535Ssangeeta } 27882535Ssangeeta *ire_p = NULL; 27892535Ssangeeta return (EINVAL); 27902535Ssangeeta } 27912535Ssangeeta } else { 27923448Sdh155122 irb_ptr = &(ipst->ips_ip_cache_table[IRE_ADDR_HASH( 27933448Sdh155122 ire->ire_addr, ipst->ips_ip_cache_table_size)]); 27942535Ssangeeta } 27950Sstevel@tonic-gate 27960Sstevel@tonic-gate /* 27970Sstevel@tonic-gate * Start the atomic add of the ire. Grab the ill locks, 27980Sstevel@tonic-gate * ill_g_usesrc_lock and the bucket lock. 
Check for condemned 27990Sstevel@tonic-gate * 28000Sstevel@tonic-gate * If ipif or ill is changing ire_atomic_start() may queue the 28010Sstevel@tonic-gate * request and return EINPROGRESS. 28023448Sdh155122 * To avoid lock order problems, get the ndp4->ndp_g_lock. 28030Sstevel@tonic-gate */ 28043448Sdh155122 mutex_enter(&ipst->ips_ndp4->ndp_g_lock); 28050Sstevel@tonic-gate error = ire_atomic_start(irb_ptr, ire, q, mp, func); 28060Sstevel@tonic-gate if (error != 0) { 28073448Sdh155122 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 28080Sstevel@tonic-gate /* 28090Sstevel@tonic-gate * We don't know whether it is a valid ipif or not. 28100Sstevel@tonic-gate * So, set it to NULL. This assumes that the ire has not added 28110Sstevel@tonic-gate * a reference to the ipif. 28120Sstevel@tonic-gate */ 28130Sstevel@tonic-gate ire->ire_ipif = NULL; 28140Sstevel@tonic-gate ire_delete(ire); 28150Sstevel@tonic-gate if (pire != NULL) { 28160Sstevel@tonic-gate IRB_REFRELE(pire->ire_bucket); 28170Sstevel@tonic-gate ire_refrele(pire); 28180Sstevel@tonic-gate } 28190Sstevel@tonic-gate *ire_p = NULL; 28202535Ssangeeta if (need_refrele) 28212535Ssangeeta IRB_REFRELE(irb_ptr); 28220Sstevel@tonic-gate return (error); 28230Sstevel@tonic-gate } 28240Sstevel@tonic-gate /* 28250Sstevel@tonic-gate * To avoid creating ires having stale values for the ire_max_frag 28260Sstevel@tonic-gate * we get the latest value atomically here. 
For more details 28270Sstevel@tonic-gate * see the block comment in ip_sioctl_mtu and in DL_NOTE_SDU_CHANGE 28280Sstevel@tonic-gate * in ip_rput_dlpi_writer 28290Sstevel@tonic-gate */ 28300Sstevel@tonic-gate if (ire->ire_max_fragp == NULL) { 28310Sstevel@tonic-gate if (CLASSD(ire->ire_addr)) 28320Sstevel@tonic-gate ire->ire_max_frag = ire->ire_ipif->ipif_mtu; 28330Sstevel@tonic-gate else 28340Sstevel@tonic-gate ire->ire_max_frag = pire->ire_max_frag; 28350Sstevel@tonic-gate } else { 28360Sstevel@tonic-gate uint_t max_frag; 28370Sstevel@tonic-gate 28380Sstevel@tonic-gate max_frag = *ire->ire_max_fragp; 28390Sstevel@tonic-gate ire->ire_max_fragp = NULL; 28400Sstevel@tonic-gate ire->ire_max_frag = max_frag; 28410Sstevel@tonic-gate } 28420Sstevel@tonic-gate /* 28430Sstevel@tonic-gate * Atomically check for duplicate and insert in the table. 28440Sstevel@tonic-gate */ 28450Sstevel@tonic-gate for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { 28460Sstevel@tonic-gate if (ire1->ire_marks & IRE_MARK_CONDEMNED) 28470Sstevel@tonic-gate continue; 28480Sstevel@tonic-gate if (ire->ire_ipif != NULL) { 28490Sstevel@tonic-gate /* 28500Sstevel@tonic-gate * We do MATCH_IRE_ILL implicitly here for IREs 28510Sstevel@tonic-gate * with a non-null ire_ipif, including IRE_CACHEs. 28520Sstevel@tonic-gate * As ire_ipif and ire_stq could point to two 28530Sstevel@tonic-gate * different ills, we can't pass just ire_ipif to 28540Sstevel@tonic-gate * ire_match_args and get a match on both ills. 28550Sstevel@tonic-gate * This is just needed for duplicate checks here and 28560Sstevel@tonic-gate * so we don't add an extra argument to 28570Sstevel@tonic-gate * ire_match_args for this. Do it locally. 28580Sstevel@tonic-gate * 28590Sstevel@tonic-gate * NOTE : Currently there is no part of the code 28600Sstevel@tonic-gate * that asks for both MATH_IRE_IPIF and MATCH_IRE_ILL 28610Sstevel@tonic-gate * match for IRE_CACHEs. 
Thus we don't want to 28620Sstevel@tonic-gate * extend the arguments to ire_match_args. 28630Sstevel@tonic-gate */ 28640Sstevel@tonic-gate if (ire1->ire_stq != ire->ire_stq) 28650Sstevel@tonic-gate continue; 28660Sstevel@tonic-gate /* 28670Sstevel@tonic-gate * Multiroute IRE_CACHEs for a given destination can 28680Sstevel@tonic-gate * have the same ire_ipif, typically if their source 28690Sstevel@tonic-gate * address is forced using RTF_SETSRC, and the same 28700Sstevel@tonic-gate * send-to queue. We differentiate them using the parent 28710Sstevel@tonic-gate * handle. 28720Sstevel@tonic-gate */ 28730Sstevel@tonic-gate if (ire->ire_type == IRE_CACHE && 28740Sstevel@tonic-gate (ire1->ire_flags & RTF_MULTIRT) && 28750Sstevel@tonic-gate (ire->ire_flags & RTF_MULTIRT) && 28760Sstevel@tonic-gate (ire1->ire_phandle != ire->ire_phandle)) 28770Sstevel@tonic-gate continue; 28780Sstevel@tonic-gate } 28790Sstevel@tonic-gate if (ire1->ire_zoneid != ire->ire_zoneid) 28800Sstevel@tonic-gate continue; 28810Sstevel@tonic-gate if (ire_match_args(ire1, ire->ire_addr, ire->ire_mask, 28820Sstevel@tonic-gate ire->ire_gateway_addr, ire->ire_type, ire->ire_ipif, 28837880SJonathan.Anderson@Sun.COM ire->ire_zoneid, 0, NULL, flags, NULL)) { 28840Sstevel@tonic-gate /* 28850Sstevel@tonic-gate * Return the old ire after doing a REFHOLD. 28860Sstevel@tonic-gate * As most of the callers continue to use the IRE 28870Sstevel@tonic-gate * after adding, we return a held ire. This will 28880Sstevel@tonic-gate * avoid a lookup in the caller again. If the callers 28890Sstevel@tonic-gate * don't want to use it, they need to do a REFRELE. 
28900Sstevel@tonic-gate */ 28918485SPeter.Memishian@Sun.COM ip1dbg(("found dup ire existing %p new %p\n", 28920Sstevel@tonic-gate (void *)ire1, (void *)ire)); 28930Sstevel@tonic-gate IRE_REFHOLD(ire1); 28940Sstevel@tonic-gate ire_atomic_end(irb_ptr, ire); 28953448Sdh155122 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 28960Sstevel@tonic-gate ire_delete(ire); 28970Sstevel@tonic-gate if (pire != NULL) { 28980Sstevel@tonic-gate /* 28990Sstevel@tonic-gate * Assert that it is not removed from the 29000Sstevel@tonic-gate * list yet. 29010Sstevel@tonic-gate */ 29020Sstevel@tonic-gate ASSERT(pire->ire_ptpn != NULL); 29030Sstevel@tonic-gate IRB_REFRELE(pire->ire_bucket); 29040Sstevel@tonic-gate ire_refrele(pire); 29050Sstevel@tonic-gate } 29060Sstevel@tonic-gate *ire_p = ire1; 29072535Ssangeeta if (need_refrele) 29082535Ssangeeta IRB_REFRELE(irb_ptr); 29090Sstevel@tonic-gate return (0); 29100Sstevel@tonic-gate } 29110Sstevel@tonic-gate } 29128485SPeter.Memishian@Sun.COM 29132535Ssangeeta if (ire->ire_type & IRE_CACHE) { 29142535Ssangeeta ASSERT(ire->ire_stq != NULL); 29152535Ssangeeta nce = ndp_lookup_v4(ire_to_ill(ire), 29162535Ssangeeta ((ire->ire_gateway_addr != INADDR_ANY) ? 29172535Ssangeeta &ire->ire_gateway_addr : &ire->ire_addr), 29182535Ssangeeta B_TRUE); 29192535Ssangeeta if (nce != NULL) 29202535Ssangeeta mutex_enter(&nce->nce_lock); 29212535Ssangeeta /* 29222535Ssangeeta * if the nce is NCE_F_CONDEMNED, or if it is not ND_REACHABLE 29232535Ssangeeta * and the caller has prohibited the addition of incomplete 29242535Ssangeeta * ire's, we fail the add. Note that nce_state could be 29254084Ssowmini * something other than ND_REACHABLE if the nce had 29264084Ssowmini * just expired and the ire_create preceding the 29274084Ssowmini * ire_add added a new ND_INITIAL nce. 
29282535Ssangeeta */ 29292535Ssangeeta if ((nce == NULL) || 29302535Ssangeeta (nce->nce_flags & NCE_F_CONDEMNED) || 29312535Ssangeeta (!allow_unresolved && 29323397Ssangeeta (nce->nce_state != ND_REACHABLE))) { 29334084Ssowmini if (nce != NULL) { 29344084Ssowmini DTRACE_PROBE1(ire__bad__nce, nce_t *, nce); 29352535Ssangeeta mutex_exit(&nce->nce_lock); 29364084Ssowmini } 29372535Ssangeeta ire_atomic_end(irb_ptr, ire); 29383448Sdh155122 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 29392535Ssangeeta if (nce != NULL) 29402535Ssangeeta NCE_REFRELE(nce); 29412535Ssangeeta DTRACE_PROBE1(ire__no__nce, ire_t *, ire); 29422535Ssangeeta ire_delete(ire); 29432535Ssangeeta if (pire != NULL) { 29442535Ssangeeta IRB_REFRELE(pire->ire_bucket); 29452535Ssangeeta ire_refrele(pire); 29462535Ssangeeta } 29472535Ssangeeta *ire_p = NULL; 29482535Ssangeeta if (need_refrele) 29492535Ssangeeta IRB_REFRELE(irb_ptr); 29502535Ssangeeta return (EINVAL); 29512535Ssangeeta } else { 29522535Ssangeeta ire->ire_nce = nce; 29532535Ssangeeta mutex_exit(&nce->nce_lock); 29542535Ssangeeta /* 29552535Ssangeeta * We are associating this nce to the ire, so 29562535Ssangeeta * change the nce ref taken in ndp_lookup_v4() from 29572535Ssangeeta * NCE_REFHOLD to NCE_REFHOLD_NOTR 29582535Ssangeeta */ 29592535Ssangeeta NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce); 29602535Ssangeeta } 29612535Ssangeeta } 29620Sstevel@tonic-gate /* 29630Sstevel@tonic-gate * Make it easy for ip_wput_ire() to hit multiple broadcast ires by 29648485SPeter.Memishian@Sun.COM * grouping identical addresses together on the hash chain. We do 29658485SPeter.Memishian@Sun.COM * this only for IRE_BROADCASTs as ip_wput_ire is currently interested 29668485SPeter.Memishian@Sun.COM * in such groupings only for broadcasts. 29670Sstevel@tonic-gate * 29680Sstevel@tonic-gate * Find the first entry that matches ire_addr. *irep will be null 29690Sstevel@tonic-gate * if no match. 
29704182Ssowmini * 29714182Ssowmini * Note: the loopback and non-loopback broadcast entries for an 29724182Ssowmini * interface MUST be added before any MULTIRT entries. 29730Sstevel@tonic-gate */ 29740Sstevel@tonic-gate irep = (ire_t **)irb_ptr; 29750Sstevel@tonic-gate while ((ire1 = *irep) != NULL && ire->ire_addr != ire1->ire_addr) 29760Sstevel@tonic-gate irep = &ire1->ire_next; 29770Sstevel@tonic-gate if (ire->ire_type == IRE_BROADCAST && *irep != NULL) { 29780Sstevel@tonic-gate /* 29790Sstevel@tonic-gate * We found some ire (i.e *irep) with a matching addr. We 29808485SPeter.Memishian@Sun.COM * want to group ires with same addr. 29810Sstevel@tonic-gate */ 29824182Ssowmini for (;;) { 29830Sstevel@tonic-gate ire1 = *irep; 29840Sstevel@tonic-gate if ((ire1->ire_next == NULL) || 29850Sstevel@tonic-gate (ire1->ire_next->ire_addr != ire->ire_addr) || 29860Sstevel@tonic-gate (ire1->ire_type != IRE_BROADCAST) || 29874182Ssowmini (ire1->ire_flags & RTF_MULTIRT) || 29888485SPeter.Memishian@Sun.COM (ire1->ire_ipif->ipif_ill->ill_grp == 29898485SPeter.Memishian@Sun.COM ire->ire_ipif->ipif_ill->ill_grp)) 29900Sstevel@tonic-gate break; 29910Sstevel@tonic-gate irep = &ire1->ire_next; 29920Sstevel@tonic-gate } 29930Sstevel@tonic-gate ASSERT(*irep != NULL); 29944182Ssowmini /* 29954182Ssowmini * The ire will be added before *irep, so 29964182Ssowmini * if irep is a MULTIRT ire, just break to 29974182Ssowmini * ire insertion code. 29984182Ssowmini */ 29994182Ssowmini if (((*irep)->ire_flags & RTF_MULTIRT) != 0) 30004182Ssowmini goto insert_ire; 30014182Ssowmini 30020Sstevel@tonic-gate irep = &((*irep)->ire_next); 30030Sstevel@tonic-gate 30040Sstevel@tonic-gate /* 30050Sstevel@tonic-gate * Either we have hit the end of the list or the address 30068485SPeter.Memishian@Sun.COM * did not match. 
30070Sstevel@tonic-gate */ 30080Sstevel@tonic-gate while (*irep != NULL) { 30090Sstevel@tonic-gate ire1 = *irep; 30100Sstevel@tonic-gate if ((ire1->ire_addr != ire->ire_addr) || 30118485SPeter.Memishian@Sun.COM (ire1->ire_type != IRE_BROADCAST)) 30120Sstevel@tonic-gate break; 30138485SPeter.Memishian@Sun.COM if (ire1->ire_ipif == ire->ire_ipif) { 30140Sstevel@tonic-gate irep = &ire1->ire_next; 30150Sstevel@tonic-gate break; 30160Sstevel@tonic-gate } 30170Sstevel@tonic-gate irep = &ire1->ire_next; 30180Sstevel@tonic-gate } 30190Sstevel@tonic-gate } else if (*irep != NULL) { 30200Sstevel@tonic-gate /* 30210Sstevel@tonic-gate * Find the last ire which matches ire_addr. 30220Sstevel@tonic-gate * Needed to do tail insertion among entries with the same 30230Sstevel@tonic-gate * ire_addr. 30240Sstevel@tonic-gate */ 30250Sstevel@tonic-gate while (ire->ire_addr == ire1->ire_addr) { 30260Sstevel@tonic-gate irep = &ire1->ire_next; 30270Sstevel@tonic-gate ire1 = *irep; 30280Sstevel@tonic-gate if (ire1 == NULL) 30290Sstevel@tonic-gate break; 30300Sstevel@tonic-gate } 30310Sstevel@tonic-gate } 30320Sstevel@tonic-gate 30334182Ssowmini insert_ire: 30340Sstevel@tonic-gate /* Insert at *irep */ 30350Sstevel@tonic-gate ire1 = *irep; 30360Sstevel@tonic-gate if (ire1 != NULL) 30370Sstevel@tonic-gate ire1->ire_ptpn = &ire->ire_next; 30380Sstevel@tonic-gate ire->ire_next = ire1; 30390Sstevel@tonic-gate /* Link the new one in. */ 30400Sstevel@tonic-gate ire->ire_ptpn = irep; 30410Sstevel@tonic-gate 30420Sstevel@tonic-gate /* 30430Sstevel@tonic-gate * ire_walk routines de-reference ire_next without holding 30440Sstevel@tonic-gate * a lock. 
Before we point to the new ire, we want to make 30450Sstevel@tonic-gate * sure the store that sets the ire_next of the new ire 30460Sstevel@tonic-gate * reaches global visibility, so that ire_walk routines 30470Sstevel@tonic-gate * don't see a truncated list of ires i.e if the ire_next 30480Sstevel@tonic-gate * of the new ire gets set after we do "*irep = ire" due 30490Sstevel@tonic-gate * to re-ordering, the ire_walk thread will see a NULL 30500Sstevel@tonic-gate * once it accesses the ire_next of the new ire. 30510Sstevel@tonic-gate * membar_producer() makes sure that the following store 30520Sstevel@tonic-gate * happens *after* all of the above stores. 30530Sstevel@tonic-gate */ 30540Sstevel@tonic-gate membar_producer(); 30550Sstevel@tonic-gate *irep = ire; 30560Sstevel@tonic-gate ire->ire_bucket = irb_ptr; 30570Sstevel@tonic-gate /* 30580Sstevel@tonic-gate * We return a bumped up IRE above. Keep it symmetrical 30590Sstevel@tonic-gate * so that the callers will always have to release. This 30600Sstevel@tonic-gate * helps the callers of this function because they continue 30610Sstevel@tonic-gate * to use the IRE after adding and hence they don't have to 30620Sstevel@tonic-gate * lookup again after we return the IRE. 30630Sstevel@tonic-gate * 30640Sstevel@tonic-gate * NOTE : We don't have to use atomics as this is appearing 30650Sstevel@tonic-gate * in the list for the first time and no one else can bump 30660Sstevel@tonic-gate * up the reference count on this yet. 
30670Sstevel@tonic-gate */ 30680Sstevel@tonic-gate IRE_REFHOLD_LOCKED(ire); 30693448Sdh155122 BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_inserted); 30702535Ssangeeta 30710Sstevel@tonic-gate irb_ptr->irb_ire_cnt++; 30722535Ssangeeta if (irb_ptr->irb_marks & IRB_MARK_FTABLE) 30732535Ssangeeta irb_ptr->irb_nire++; 30742535Ssangeeta 30750Sstevel@tonic-gate if (ire->ire_marks & IRE_MARK_TEMPORARY) 30760Sstevel@tonic-gate irb_ptr->irb_tmp_ire_cnt++; 30770Sstevel@tonic-gate 30780Sstevel@tonic-gate if (ire->ire_ipif != NULL) { 30796255Ssowmini DTRACE_PROBE3(ipif__incr__cnt, (ipif_t *), ire->ire_ipif, 30806255Ssowmini (char *), "ire", (void *), ire); 30816379Ssowmini ire->ire_ipif->ipif_ire_cnt++; 30820Sstevel@tonic-gate if (ire->ire_stq != NULL) { 30830Sstevel@tonic-gate stq_ill = (ill_t *)ire->ire_stq->q_ptr; 30846255Ssowmini DTRACE_PROBE3(ill__incr__cnt, (ill_t *), stq_ill, 30856255Ssowmini (char *), "ire", (void *), ire); 30866379Ssowmini stq_ill->ill_ire_cnt++; 30870Sstevel@tonic-gate } 30880Sstevel@tonic-gate } else { 30890Sstevel@tonic-gate ASSERT(ire->ire_stq == NULL); 30900Sstevel@tonic-gate } 30910Sstevel@tonic-gate 30920Sstevel@tonic-gate ire_atomic_end(irb_ptr, ire); 30933448Sdh155122 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 30940Sstevel@tonic-gate 30950Sstevel@tonic-gate if (pire != NULL) { 30960Sstevel@tonic-gate /* Assert that it is not removed from the list yet */ 30970Sstevel@tonic-gate ASSERT(pire->ire_ptpn != NULL); 30980Sstevel@tonic-gate IRB_REFRELE(pire->ire_bucket); 30990Sstevel@tonic-gate ire_refrele(pire); 31000Sstevel@tonic-gate } 31010Sstevel@tonic-gate 31020Sstevel@tonic-gate if (ire->ire_type != IRE_CACHE) { 31030Sstevel@tonic-gate /* 31042535Ssangeeta * For ire's with host mask see if there is an entry 31050Sstevel@tonic-gate * in the cache. If there is one flush the whole cache as 31060Sstevel@tonic-gate * there might be multiple entries due to RTF_MULTIRT (CGTP). 
31070Sstevel@tonic-gate * If no entry is found than there is no need to flush the 31080Sstevel@tonic-gate * cache. 31090Sstevel@tonic-gate */ 31100Sstevel@tonic-gate if (ire->ire_mask == IP_HOST_MASK) { 31110Sstevel@tonic-gate ire_t *lire; 31120Sstevel@tonic-gate lire = ire_ctable_lookup(ire->ire_addr, NULL, IRE_CACHE, 31133448Sdh155122 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 31140Sstevel@tonic-gate if (lire != NULL) { 31150Sstevel@tonic-gate ire_refrele(lire); 31160Sstevel@tonic-gate ire_flush_cache_v4(ire, IRE_FLUSH_ADD); 31170Sstevel@tonic-gate } 31180Sstevel@tonic-gate } else { 31190Sstevel@tonic-gate ire_flush_cache_v4(ire, IRE_FLUSH_ADD); 31200Sstevel@tonic-gate } 31210Sstevel@tonic-gate } 31220Sstevel@tonic-gate /* 31230Sstevel@tonic-gate * We had to delay the fast path probe until the ire is inserted 31240Sstevel@tonic-gate * in the list. Otherwise the fast path ack won't find the ire in 31250Sstevel@tonic-gate * the table. 31260Sstevel@tonic-gate */ 31273425Ssowmini if (ire->ire_type == IRE_CACHE || 31283425Ssowmini (ire->ire_type == IRE_BROADCAST && ire->ire_stq != NULL)) { 31293425Ssowmini ASSERT(ire->ire_nce != NULL); 31304714Ssowmini if (ire->ire_nce->nce_state == ND_REACHABLE) 31314714Ssowmini nce_fastpath(ire->ire_nce); 31323425Ssowmini } 31330Sstevel@tonic-gate if (ire->ire_ipif != NULL) 31340Sstevel@tonic-gate ASSERT(!MUTEX_HELD(&ire->ire_ipif->ipif_ill->ill_lock)); 31350Sstevel@tonic-gate *ire_p = ire; 31362535Ssangeeta if (need_refrele) { 31372535Ssangeeta IRB_REFRELE(irb_ptr); 31382535Ssangeeta } 31390Sstevel@tonic-gate return (0); 31400Sstevel@tonic-gate } 31410Sstevel@tonic-gate 31420Sstevel@tonic-gate /* 31430Sstevel@tonic-gate * IRB_REFRELE is the only caller of the function. ire_unlink calls to 31440Sstevel@tonic-gate * do the final cleanup for this ire. 
31450Sstevel@tonic-gate */ 31460Sstevel@tonic-gate void 31470Sstevel@tonic-gate ire_cleanup(ire_t *ire) 31480Sstevel@tonic-gate { 31490Sstevel@tonic-gate ire_t *ire_next; 31503448Sdh155122 ip_stack_t *ipst = ire->ire_ipst; 31510Sstevel@tonic-gate 31520Sstevel@tonic-gate ASSERT(ire != NULL); 31530Sstevel@tonic-gate 31540Sstevel@tonic-gate while (ire != NULL) { 31550Sstevel@tonic-gate ire_next = ire->ire_next; 31560Sstevel@tonic-gate if (ire->ire_ipversion == IPV4_VERSION) { 31570Sstevel@tonic-gate ire_delete_v4(ire); 31583448Sdh155122 BUMP_IRE_STATS(ipst->ips_ire_stats_v4, 31593448Sdh155122 ire_stats_deleted); 31600Sstevel@tonic-gate } else { 31610Sstevel@tonic-gate ASSERT(ire->ire_ipversion == IPV6_VERSION); 31620Sstevel@tonic-gate ire_delete_v6(ire); 31633448Sdh155122 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, 31643448Sdh155122 ire_stats_deleted); 31650Sstevel@tonic-gate } 31660Sstevel@tonic-gate /* 31670Sstevel@tonic-gate * Now it's really out of the list. Before doing the 31680Sstevel@tonic-gate * REFRELE, set ire_next to NULL as ire_inactive asserts 31690Sstevel@tonic-gate * so. 31700Sstevel@tonic-gate */ 31710Sstevel@tonic-gate ire->ire_next = NULL; 31720Sstevel@tonic-gate IRE_REFRELE_NOTR(ire); 31730Sstevel@tonic-gate ire = ire_next; 31740Sstevel@tonic-gate } 31750Sstevel@tonic-gate } 31760Sstevel@tonic-gate 31770Sstevel@tonic-gate /* 31780Sstevel@tonic-gate * IRB_REFRELE is the only caller of the function. It calls to unlink 31790Sstevel@tonic-gate * all the CONDEMNED ires from this bucket. 
31800Sstevel@tonic-gate */ 31810Sstevel@tonic-gate ire_t * 31820Sstevel@tonic-gate ire_unlink(irb_t *irb) 31830Sstevel@tonic-gate { 31840Sstevel@tonic-gate ire_t *ire; 31850Sstevel@tonic-gate ire_t *ire1; 31860Sstevel@tonic-gate ire_t **ptpn; 31870Sstevel@tonic-gate ire_t *ire_list = NULL; 31880Sstevel@tonic-gate 31890Sstevel@tonic-gate ASSERT(RW_WRITE_HELD(&irb->irb_lock)); 31902535Ssangeeta ASSERT(((irb->irb_marks & IRB_MARK_FTABLE) && irb->irb_refcnt == 1) || 31912535Ssangeeta (irb->irb_refcnt == 0)); 31922535Ssangeeta ASSERT(irb->irb_marks & IRB_MARK_CONDEMNED); 31930Sstevel@tonic-gate ASSERT(irb->irb_ire != NULL); 31940Sstevel@tonic-gate 31950Sstevel@tonic-gate for (ire = irb->irb_ire; ire != NULL; ire = ire1) { 31963448Sdh155122 ip_stack_t *ipst = ire->ire_ipst; 31973448Sdh155122 31980Sstevel@tonic-gate ire1 = ire->ire_next; 31990Sstevel@tonic-gate if (ire->ire_marks & IRE_MARK_CONDEMNED) { 32000Sstevel@tonic-gate ptpn = ire->ire_ptpn; 32010Sstevel@tonic-gate ire1 = ire->ire_next; 32020Sstevel@tonic-gate if (ire1) 32030Sstevel@tonic-gate ire1->ire_ptpn = ptpn; 32040Sstevel@tonic-gate *ptpn = ire1; 32050Sstevel@tonic-gate ire->ire_ptpn = NULL; 32060Sstevel@tonic-gate ire->ire_next = NULL; 32070Sstevel@tonic-gate if (ire->ire_type == IRE_DEFAULT) { 32080Sstevel@tonic-gate /* 32090Sstevel@tonic-gate * IRE is out of the list. We need to adjust 32100Sstevel@tonic-gate * the accounting before the caller drops 32110Sstevel@tonic-gate * the lock. 32120Sstevel@tonic-gate */ 32130Sstevel@tonic-gate if (ire->ire_ipversion == IPV6_VERSION) { 32143448Sdh155122 ASSERT(ipst-> 32153448Sdh155122 ips_ipv6_ire_default_count != 32163448Sdh155122 0); 32173448Sdh155122 ipst->ips_ipv6_ire_default_count--; 32180Sstevel@tonic-gate } 32190Sstevel@tonic-gate } 32200Sstevel@tonic-gate /* 32210Sstevel@tonic-gate * We need to call ire_delete_v4 or ire_delete_v6 32220Sstevel@tonic-gate * to clean up the cache or the redirects pointing at 32230Sstevel@tonic-gate * the default gateway. 
We need to drop the lock 32240Sstevel@tonic-gate * as ire_flush_cache/ire_delete_host_redircts require 32250Sstevel@tonic-gate * so. But we can't drop the lock, as ire_unlink needs 32260Sstevel@tonic-gate * to atomically remove the ires from the list. 32270Sstevel@tonic-gate * So, create a temporary list of CONDEMNED ires 32280Sstevel@tonic-gate * for doing ire_delete_v4/ire_delete_v6 operations 32290Sstevel@tonic-gate * later on. 32300Sstevel@tonic-gate */ 32310Sstevel@tonic-gate ire->ire_next = ire_list; 32320Sstevel@tonic-gate ire_list = ire; 32330Sstevel@tonic-gate } 32340Sstevel@tonic-gate } 32352535Ssangeeta irb->irb_marks &= ~IRB_MARK_CONDEMNED; 32360Sstevel@tonic-gate return (ire_list); 32370Sstevel@tonic-gate } 32380Sstevel@tonic-gate 32390Sstevel@tonic-gate /* 32400Sstevel@tonic-gate * Delete all the cache entries with this 'addr'. When IP gets a gratuitous 32412535Ssangeeta * ARP message on any of its interface queue, it scans the nce table and 32422535Ssangeeta * deletes and calls ndp_delete() for the appropriate nce. This action 32432535Ssangeeta * also deletes all the neighbor/ire cache entries for that address. 32442535Ssangeeta * This function is called from ip_arp_news in ip.c and also for 32452535Ssangeeta * ARP ioctl processing in ip_if.c. ip_ire_clookup_and_delete returns 32462535Ssangeeta * true if it finds a nce entry which is used by ip_arp_news to determine if 32472535Ssangeeta * it needs to do an ire_walk_v4. The return value is also used for the 32482535Ssangeeta * same purpose by ARP IOCTL processing * in ip_if.c when deleting 32492535Ssangeeta * ARP entries. For SIOC*IFARP ioctls in addition to the address, 32502535Ssangeeta * ip_if->ipif_ill also needs to be matched. 
32510Sstevel@tonic-gate */ 32520Sstevel@tonic-gate boolean_t 32533448Sdh155122 ip_ire_clookup_and_delete(ipaddr_t addr, ipif_t *ipif, ip_stack_t *ipst) 32540Sstevel@tonic-gate { 32552535Ssangeeta ill_t *ill; 32562535Ssangeeta nce_t *nce; 32572535Ssangeeta 32582535Ssangeeta ill = (ipif ? ipif->ipif_ill : NULL); 32592535Ssangeeta 32602535Ssangeeta if (ill != NULL) { 32612535Ssangeeta /* 32622535Ssangeeta * clean up the nce (and any relevant ire's) that matches 32632535Ssangeeta * on addr and ill. 32642535Ssangeeta */ 32652535Ssangeeta nce = ndp_lookup_v4(ill, &addr, B_FALSE); 32662535Ssangeeta if (nce != NULL) { 32672535Ssangeeta ndp_delete(nce); 32682535Ssangeeta return (B_TRUE); 32692535Ssangeeta } 32702535Ssangeeta } else { 32712535Ssangeeta /* 32722535Ssangeeta * ill is wildcard. clean up all nce's and 32732535Ssangeeta * ire's that match on addr 32742535Ssangeeta */ 32752535Ssangeeta nce_clookup_t cl; 32762535Ssangeeta 32772535Ssangeeta cl.ncecl_addr = addr; 32782535Ssangeeta cl.ncecl_found = B_FALSE; 32792535Ssangeeta 32803448Sdh155122 ndp_walk_common(ipst->ips_ndp4, NULL, 32812535Ssangeeta (pfi_t)ip_nce_clookup_and_delete, (uchar_t *)&cl, B_TRUE); 32822535Ssangeeta 32832535Ssangeeta /* 32842535Ssangeeta * ncecl_found would be set by ip_nce_clookup_and_delete if 32852535Ssangeeta * we found a matching nce. 
32862535Ssangeeta */ 32872535Ssangeeta return (cl.ncecl_found); 32882535Ssangeeta } 32892535Ssangeeta return (B_FALSE); 32902535Ssangeeta 32912535Ssangeeta } 32922535Ssangeeta 32932535Ssangeeta /* Delete the supplied nce if its nce_addr matches the supplied address */ 32942535Ssangeeta static void 32952535Ssangeeta ip_nce_clookup_and_delete(nce_t *nce, void *arg) 32962535Ssangeeta { 32972535Ssangeeta nce_clookup_t *cl = (nce_clookup_t *)arg; 32982535Ssangeeta ipaddr_t nce_addr; 32992535Ssangeeta 33002535Ssangeeta IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr); 33012535Ssangeeta if (nce_addr == cl->ncecl_addr) { 33022535Ssangeeta cl->ncecl_found = B_TRUE; 33032535Ssangeeta /* clean up the nce (and any relevant ire's) */ 33042535Ssangeeta ndp_delete(nce); 33052535Ssangeeta } 33062535Ssangeeta } 33072535Ssangeeta 33082535Ssangeeta /* 33092535Ssangeeta * Clean up the radix node for this ire. Must be called by IRB_REFRELE 33102535Ssangeeta * when there are no ire's left in the bucket. Returns TRUE if the bucket 33112535Ssangeeta * is deleted and freed. 33122535Ssangeeta */ 33132535Ssangeeta boolean_t 33142535Ssangeeta irb_inactive(irb_t *irb) 33152535Ssangeeta { 33162535Ssangeeta struct rt_entry *rt; 33172535Ssangeeta struct radix_node *rn; 33183448Sdh155122 ip_stack_t *ipst = irb->irb_ipst; 33193448Sdh155122 33203448Sdh155122 ASSERT(irb->irb_ipst != NULL); 33212535Ssangeeta 33222535Ssangeeta rt = IRB2RT(irb); 33232535Ssangeeta rn = (struct radix_node *)rt; 33242535Ssangeeta 33252535Ssangeeta /* first remove it from the radix tree. 
*/ 33263448Sdh155122 RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable); 33272535Ssangeeta rw_enter(&irb->irb_lock, RW_WRITER); 33282535Ssangeeta if (irb->irb_refcnt == 1 && irb->irb_nire == 0) { 33293448Sdh155122 rn = ipst->ips_ip_ftable->rnh_deladdr(rn->rn_key, rn->rn_mask, 33303448Sdh155122 ipst->ips_ip_ftable); 33312535Ssangeeta DTRACE_PROBE1(irb__free, rt_t *, rt); 33322535Ssangeeta ASSERT((void *)rn == (void *)rt); 33332535Ssangeeta Free(rt, rt_entry_cache); 33342535Ssangeeta /* irb_lock is freed */ 33353448Sdh155122 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 33362535Ssangeeta return (B_TRUE); 33372535Ssangeeta } 33382535Ssangeeta rw_exit(&irb->irb_lock); 33393448Sdh155122 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 33402535Ssangeeta return (B_FALSE); 33410Sstevel@tonic-gate } 33420Sstevel@tonic-gate 33430Sstevel@tonic-gate /* 33440Sstevel@tonic-gate * Delete the specified IRE. 33450Sstevel@tonic-gate */ 33460Sstevel@tonic-gate void 33470Sstevel@tonic-gate ire_delete(ire_t *ire) 33480Sstevel@tonic-gate { 33490Sstevel@tonic-gate ire_t *ire1; 33500Sstevel@tonic-gate ire_t **ptpn; 33510Sstevel@tonic-gate irb_t *irb; 33523448Sdh155122 ip_stack_t *ipst = ire->ire_ipst; 33530Sstevel@tonic-gate 33540Sstevel@tonic-gate if ((irb = ire->ire_bucket) == NULL) { 33552535Ssangeeta /* 33562535Ssangeeta * It was never inserted in the list. Should call REFRELE 33572535Ssangeeta * to free this IRE. 33582535Ssangeeta */ 33590Sstevel@tonic-gate IRE_REFRELE_NOTR(ire); 33600Sstevel@tonic-gate return; 33610Sstevel@tonic-gate } 33620Sstevel@tonic-gate 33630Sstevel@tonic-gate rw_enter(&irb->irb_lock, RW_WRITER); 33640Sstevel@tonic-gate 33652535Ssangeeta if (irb->irb_rr_origin == ire) { 33662535Ssangeeta irb->irb_rr_origin = NULL; 33672535Ssangeeta } 33682535Ssangeeta 33690Sstevel@tonic-gate /* 33700Sstevel@tonic-gate * In case of V4 we might still be waiting for fastpath ack. 
33710Sstevel@tonic-gate */ 33723425Ssowmini if (ire->ire_ipversion == IPV4_VERSION && 33733425Ssowmini (ire->ire_type == IRE_CACHE || 33743425Ssowmini (ire->ire_type == IRE_BROADCAST && ire->ire_stq != NULL))) { 33753425Ssowmini ASSERT(ire->ire_nce != NULL); 33763425Ssowmini nce_fastpath_list_delete(ire->ire_nce); 33770Sstevel@tonic-gate } 33780Sstevel@tonic-gate 33790Sstevel@tonic-gate if (ire->ire_ptpn == NULL) { 33800Sstevel@tonic-gate /* 33810Sstevel@tonic-gate * Some other thread has removed us from the list. 33820Sstevel@tonic-gate * It should have done the REFRELE for us. 33830Sstevel@tonic-gate */ 33840Sstevel@tonic-gate rw_exit(&irb->irb_lock); 33850Sstevel@tonic-gate return; 33860Sstevel@tonic-gate } 33870Sstevel@tonic-gate 33885388Sja97890 if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) { 33895388Sja97890 irb->irb_ire_cnt--; 33905388Sja97890 ire->ire_marks |= IRE_MARK_CONDEMNED; 33915388Sja97890 if (ire->ire_marks & IRE_MARK_TEMPORARY) { 33925388Sja97890 irb->irb_tmp_ire_cnt--; 33935388Sja97890 ire->ire_marks &= ~IRE_MARK_TEMPORARY; 33945388Sja97890 } 33955388Sja97890 } 33965388Sja97890 33970Sstevel@tonic-gate if (irb->irb_refcnt != 0) { 33980Sstevel@tonic-gate /* 33990Sstevel@tonic-gate * The last thread to leave this bucket will 34000Sstevel@tonic-gate * delete this ire. 34010Sstevel@tonic-gate */ 34022535Ssangeeta irb->irb_marks |= IRB_MARK_CONDEMNED; 34030Sstevel@tonic-gate rw_exit(&irb->irb_lock); 34040Sstevel@tonic-gate return; 34050Sstevel@tonic-gate } 34060Sstevel@tonic-gate 34070Sstevel@tonic-gate /* 34080Sstevel@tonic-gate * Normally to delete an ire, we walk the bucket. While we 34090Sstevel@tonic-gate * walk the bucket, we normally bump up irb_refcnt and hence 34100Sstevel@tonic-gate * we return from above where we mark CONDEMNED and the ire 34110Sstevel@tonic-gate * gets deleted from ire_unlink. This case is where somebody 34120Sstevel@tonic-gate * knows the ire e.g by doing a lookup, and wants to delete the 34130Sstevel@tonic-gate * IRE. 
irb_refcnt would be 0 in this case if nobody is walking 34140Sstevel@tonic-gate * the bucket. 34150Sstevel@tonic-gate */ 34160Sstevel@tonic-gate ptpn = ire->ire_ptpn; 34170Sstevel@tonic-gate ire1 = ire->ire_next; 34180Sstevel@tonic-gate if (ire1 != NULL) 34190Sstevel@tonic-gate ire1->ire_ptpn = ptpn; 34200Sstevel@tonic-gate ASSERT(ptpn != NULL); 34210Sstevel@tonic-gate *ptpn = ire1; 34220Sstevel@tonic-gate ire->ire_ptpn = NULL; 34230Sstevel@tonic-gate ire->ire_next = NULL; 34240Sstevel@tonic-gate if (ire->ire_ipversion == IPV6_VERSION) { 34253448Sdh155122 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_deleted); 34260Sstevel@tonic-gate } else { 34273448Sdh155122 BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_deleted); 34280Sstevel@tonic-gate } 34290Sstevel@tonic-gate /* 34300Sstevel@tonic-gate * ip_wput/ip_wput_v6 checks this flag to see whether 34310Sstevel@tonic-gate * it should still use the cached ire or not. 34320Sstevel@tonic-gate */ 34330Sstevel@tonic-gate if (ire->ire_type == IRE_DEFAULT) { 34340Sstevel@tonic-gate /* 34350Sstevel@tonic-gate * IRE is out of the list. We need to adjust the 34360Sstevel@tonic-gate * accounting before we drop the lock. 34370Sstevel@tonic-gate */ 34380Sstevel@tonic-gate if (ire->ire_ipversion == IPV6_VERSION) { 34393448Sdh155122 ASSERT(ipst->ips_ipv6_ire_default_count != 0); 34403448Sdh155122 ipst->ips_ipv6_ire_default_count--; 34410Sstevel@tonic-gate } 34420Sstevel@tonic-gate } 34430Sstevel@tonic-gate rw_exit(&irb->irb_lock); 34440Sstevel@tonic-gate 34450Sstevel@tonic-gate if (ire->ire_ipversion == IPV6_VERSION) { 34460Sstevel@tonic-gate ire_delete_v6(ire); 34470Sstevel@tonic-gate } else { 34480Sstevel@tonic-gate ire_delete_v4(ire); 34490Sstevel@tonic-gate } 34500Sstevel@tonic-gate /* 34510Sstevel@tonic-gate * We removed it from the list. Decrement the 34520Sstevel@tonic-gate * reference count. 
34530Sstevel@tonic-gate */ 34540Sstevel@tonic-gate IRE_REFRELE_NOTR(ire); 34550Sstevel@tonic-gate } 34560Sstevel@tonic-gate 34570Sstevel@tonic-gate /* 34580Sstevel@tonic-gate * Delete the specified IRE. 34590Sstevel@tonic-gate * All calls should use ire_delete(). 34600Sstevel@tonic-gate * Sometimes called as writer though not required by this function. 34610Sstevel@tonic-gate * 34620Sstevel@tonic-gate * NOTE : This function is called only if the ire was added 34630Sstevel@tonic-gate * in the list. 34640Sstevel@tonic-gate */ 34650Sstevel@tonic-gate static void 34660Sstevel@tonic-gate ire_delete_v4(ire_t *ire) 34670Sstevel@tonic-gate { 34683448Sdh155122 ip_stack_t *ipst = ire->ire_ipst; 34693448Sdh155122 34700Sstevel@tonic-gate ASSERT(ire->ire_refcnt >= 1); 34710Sstevel@tonic-gate ASSERT(ire->ire_ipversion == IPV4_VERSION); 34720Sstevel@tonic-gate 34730Sstevel@tonic-gate if (ire->ire_type != IRE_CACHE) 34740Sstevel@tonic-gate ire_flush_cache_v4(ire, IRE_FLUSH_DELETE); 34750Sstevel@tonic-gate if (ire->ire_type == IRE_DEFAULT) { 34760Sstevel@tonic-gate /* 34770Sstevel@tonic-gate * when a default gateway is going away 34780Sstevel@tonic-gate * delete all the host redirects pointing at that 34790Sstevel@tonic-gate * gateway. 34800Sstevel@tonic-gate */ 34813448Sdh155122 ire_delete_host_redirects(ire->ire_gateway_addr, ipst); 34820Sstevel@tonic-gate } 34830Sstevel@tonic-gate } 34840Sstevel@tonic-gate 34850Sstevel@tonic-gate /* 34860Sstevel@tonic-gate * IRE_REFRELE/ire_refrele are the only caller of the function. It calls 34870Sstevel@tonic-gate * to free the ire when the reference count goes to zero. 
34880Sstevel@tonic-gate */ 34890Sstevel@tonic-gate void 34900Sstevel@tonic-gate ire_inactive(ire_t *ire) 34910Sstevel@tonic-gate { 34920Sstevel@tonic-gate nce_t *nce; 34930Sstevel@tonic-gate ill_t *ill = NULL; 34940Sstevel@tonic-gate ill_t *stq_ill = NULL; 34950Sstevel@tonic-gate ipif_t *ipif; 34960Sstevel@tonic-gate boolean_t need_wakeup = B_FALSE; 34972535Ssangeeta irb_t *irb; 34983448Sdh155122 ip_stack_t *ipst = ire->ire_ipst; 34990Sstevel@tonic-gate 35000Sstevel@tonic-gate ASSERT(ire->ire_refcnt == 0); 35010Sstevel@tonic-gate ASSERT(ire->ire_ptpn == NULL); 35020Sstevel@tonic-gate ASSERT(ire->ire_next == NULL); 35030Sstevel@tonic-gate 35042535Ssangeeta if (ire->ire_gw_secattr != NULL) { 35052535Ssangeeta ire_gw_secattr_free(ire->ire_gw_secattr); 35062535Ssangeeta ire->ire_gw_secattr = NULL; 35072535Ssangeeta } 35082535Ssangeeta 35092535Ssangeeta if (ire->ire_mp != NULL) { 35102535Ssangeeta ASSERT(ire->ire_bucket == NULL); 35112535Ssangeeta mutex_destroy(&ire->ire_lock); 35123448Sdh155122 BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_freed); 35132535Ssangeeta if (ire->ire_nce != NULL) 35142535Ssangeeta NCE_REFRELE_NOTR(ire->ire_nce); 35152535Ssangeeta freeb(ire->ire_mp); 35162535Ssangeeta return; 35172535Ssangeeta } 35182535Ssangeeta 35190Sstevel@tonic-gate if ((nce = ire->ire_nce) != NULL) { 35200Sstevel@tonic-gate NCE_REFRELE_NOTR(nce); 35210Sstevel@tonic-gate ire->ire_nce = NULL; 35220Sstevel@tonic-gate } 35232535Ssangeeta 35240Sstevel@tonic-gate if (ire->ire_ipif == NULL) 35250Sstevel@tonic-gate goto end; 35260Sstevel@tonic-gate 35270Sstevel@tonic-gate ipif = ire->ire_ipif; 35280Sstevel@tonic-gate ill = ipif->ipif_ill; 35290Sstevel@tonic-gate 35300Sstevel@tonic-gate if (ire->ire_bucket == NULL) { 35310Sstevel@tonic-gate /* The ire was never inserted in the table. */ 35320Sstevel@tonic-gate goto end; 35330Sstevel@tonic-gate } 35340Sstevel@tonic-gate 35350Sstevel@tonic-gate /* 35366379Ssowmini * ipif_ire_cnt on this ipif goes down by 1. 
If the ire_stq is 35374823Sseb * non-null ill_ire_count also goes down by 1. 35380Sstevel@tonic-gate * 35390Sstevel@tonic-gate * The ipif that is associated with an ire is ire->ire_ipif and 35406379Ssowmini * hence when the ire->ire_ipif->ipif_ire_cnt drops to zero we call 35410Sstevel@tonic-gate * ipif_ill_refrele_tail. Usually stq_ill is null or the same as 35428485SPeter.Memishian@Sun.COM * ire->ire_ipif->ipif_ill. So nothing more needs to be done. 35438485SPeter.Memishian@Sun.COM * However, for VNI or IPMP IRE entries, stq_ill can be different. 35448485SPeter.Memishian@Sun.COM * If this is different from ire->ire_ipif->ipif_ill and if the 35458485SPeter.Memishian@Sun.COM * ill_ire_cnt on the stq_ill also has dropped to zero, we call 35464823Sseb * ipif_ill_refrele_tail on the stq_ill. 35470Sstevel@tonic-gate */ 35480Sstevel@tonic-gate if (ire->ire_stq != NULL) 35498485SPeter.Memishian@Sun.COM stq_ill = ire->ire_stq->q_ptr; 35504823Sseb 35514823Sseb if (stq_ill == NULL || stq_ill == ill) { 35520Sstevel@tonic-gate /* Optimize the most common case */ 35530Sstevel@tonic-gate mutex_enter(&ill->ill_lock); 35546379Ssowmini ASSERT(ipif->ipif_ire_cnt != 0); 35556255Ssowmini DTRACE_PROBE3(ipif__decr__cnt, (ipif_t *), ipif, 35566255Ssowmini (char *), "ire", (void *), ire); 35576379Ssowmini ipif->ipif_ire_cnt--; 35586255Ssowmini if (IPIF_DOWN_OK(ipif)) 35590Sstevel@tonic-gate need_wakeup = B_TRUE; 35600Sstevel@tonic-gate if (stq_ill != NULL) { 35616379Ssowmini ASSERT(stq_ill->ill_ire_cnt != 0); 35626255Ssowmini DTRACE_PROBE3(ill__decr__cnt, (ill_t *), stq_ill, 35636255Ssowmini (char *), "ire", (void *), ire); 35646379Ssowmini stq_ill->ill_ire_cnt--; 35656255Ssowmini if (ILL_DOWN_OK(stq_ill)) 35660Sstevel@tonic-gate need_wakeup = B_TRUE; 35670Sstevel@tonic-gate } 35680Sstevel@tonic-gate if (need_wakeup) { 35690Sstevel@tonic-gate /* Drops the ill lock */ 35700Sstevel@tonic-gate ipif_ill_refrele_tail(ill); 35710Sstevel@tonic-gate } else { 35720Sstevel@tonic-gate 
mutex_exit(&ill->ill_lock); 35730Sstevel@tonic-gate } 35740Sstevel@tonic-gate } else { 35750Sstevel@tonic-gate /* 35760Sstevel@tonic-gate * We can't grab all the ill locks at the same time. 35770Sstevel@tonic-gate * It can lead to recursive lock enter in the call to 35780Sstevel@tonic-gate * ipif_ill_refrele_tail and later. Instead do it 1 at 35790Sstevel@tonic-gate * a time. 35800Sstevel@tonic-gate */ 35810Sstevel@tonic-gate mutex_enter(&ill->ill_lock); 35826379Ssowmini ASSERT(ipif->ipif_ire_cnt != 0); 35836255Ssowmini DTRACE_PROBE3(ipif__decr__cnt, (ipif_t *), ipif, 35846255Ssowmini (char *), "ire", (void *), ire); 35856379Ssowmini ipif->ipif_ire_cnt--; 35866255Ssowmini if (IPIF_DOWN_OK(ipif)) { 35870Sstevel@tonic-gate /* Drops the lock */ 35880Sstevel@tonic-gate ipif_ill_refrele_tail(ill); 35890Sstevel@tonic-gate } else { 35900Sstevel@tonic-gate mutex_exit(&ill->ill_lock); 35910Sstevel@tonic-gate } 35920Sstevel@tonic-gate if (stq_ill != NULL) { 35930Sstevel@tonic-gate mutex_enter(&stq_ill->ill_lock); 35946379Ssowmini ASSERT(stq_ill->ill_ire_cnt != 0); 35956255Ssowmini DTRACE_PROBE3(ill__decr__cnt, (ill_t *), stq_ill, 35966255Ssowmini (char *), "ire", (void *), ire); 35976379Ssowmini stq_ill->ill_ire_cnt--; 35986255Ssowmini if (ILL_DOWN_OK(stq_ill)) { 35990Sstevel@tonic-gate /* Drops the ill lock */ 36000Sstevel@tonic-gate ipif_ill_refrele_tail(stq_ill); 36010Sstevel@tonic-gate } else { 36020Sstevel@tonic-gate mutex_exit(&stq_ill->ill_lock); 36030Sstevel@tonic-gate } 36040Sstevel@tonic-gate } 36050Sstevel@tonic-gate } 36060Sstevel@tonic-gate end: 36070Sstevel@tonic-gate /* This should be true for both V4 and V6 */ 36080Sstevel@tonic-gate 36092535Ssangeeta if ((ire->ire_type & IRE_FORWARDTABLE) && 36102535Ssangeeta (ire->ire_ipversion == IPV4_VERSION) && 36112535Ssangeeta ((irb = ire->ire_bucket) != NULL)) { 36122535Ssangeeta rw_enter(&irb->irb_lock, RW_WRITER); 36132535Ssangeeta irb->irb_nire--; 36142535Ssangeeta /* 36152535Ssangeeta * Instead of examining the 
conditions for freeing 36162535Ssangeeta * the radix node here, we do it by calling 36172535Ssangeeta * IRB_REFRELE which is a single point in the code 36182535Ssangeeta * that embeds that logic. Bump up the refcnt to 36192535Ssangeeta * be able to call IRB_REFRELE 36202535Ssangeeta */ 36212535Ssangeeta IRB_REFHOLD_LOCKED(irb); 36222535Ssangeeta rw_exit(&irb->irb_lock); 36232535Ssangeeta IRB_REFRELE(irb); 36242535Ssangeeta } 36250Sstevel@tonic-gate ire->ire_ipif = NULL; 36260Sstevel@tonic-gate 36275023Scarlsonj #ifdef DEBUG 36285023Scarlsonj ire_trace_cleanup(ire); 36290Sstevel@tonic-gate #endif 36300Sstevel@tonic-gate mutex_destroy(&ire->ire_lock); 36310Sstevel@tonic-gate if (ire->ire_ipversion == IPV6_VERSION) { 36323448Sdh155122 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_freed); 36330Sstevel@tonic-gate } else { 36343448Sdh155122 BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_freed); 36350Sstevel@tonic-gate } 36362535Ssangeeta ASSERT(ire->ire_mp == NULL); 36372535Ssangeeta /* Has been allocated out of the cache */ 36382535Ssangeeta kmem_cache_free(ire_cache, ire); 36390Sstevel@tonic-gate } 36400Sstevel@tonic-gate 36410Sstevel@tonic-gate /* 36423004Sdd193516 * ire_walk routine to delete all IRE_CACHE/IRE_HOST types redirect 36433004Sdd193516 * entries that have a given gateway address. 
36440Sstevel@tonic-gate */ 36450Sstevel@tonic-gate void 36460Sstevel@tonic-gate ire_delete_cache_gw(ire_t *ire, char *cp) 36470Sstevel@tonic-gate { 36480Sstevel@tonic-gate ipaddr_t gw_addr; 36490Sstevel@tonic-gate 36503004Sdd193516 if (!(ire->ire_type & IRE_CACHE) && 36513004Sdd193516 !(ire->ire_flags & RTF_DYNAMIC)) 36520Sstevel@tonic-gate return; 36530Sstevel@tonic-gate 36540Sstevel@tonic-gate bcopy(cp, &gw_addr, sizeof (gw_addr)); 36550Sstevel@tonic-gate if (ire->ire_gateway_addr == gw_addr) { 36560Sstevel@tonic-gate ip1dbg(("ire_delete_cache_gw: deleted 0x%x type %d to 0x%x\n", 36574714Ssowmini (int)ntohl(ire->ire_addr), ire->ire_type, 36584714Ssowmini (int)ntohl(ire->ire_gateway_addr))); 36590Sstevel@tonic-gate ire_delete(ire); 36600Sstevel@tonic-gate } 36610Sstevel@tonic-gate } 36620Sstevel@tonic-gate 36630Sstevel@tonic-gate /* 36640Sstevel@tonic-gate * Remove all IRE_CACHE entries that match the ire specified. 36650Sstevel@tonic-gate * 36660Sstevel@tonic-gate * The flag argument indicates if the flush request is due to addition 36670Sstevel@tonic-gate * of new route (IRE_FLUSH_ADD) or deletion of old route (IRE_FLUSH_DELETE). 36680Sstevel@tonic-gate * 36690Sstevel@tonic-gate * This routine takes only the IREs from the forwarding table and flushes 36700Sstevel@tonic-gate * the corresponding entries from the cache table. 36710Sstevel@tonic-gate * 36720Sstevel@tonic-gate * When flushing due to the deletion of an old route, it 36730Sstevel@tonic-gate * just checks the cache handles (ire_phandle and ire_ihandle) and 36740Sstevel@tonic-gate * deletes the ones that match. 36750Sstevel@tonic-gate * 36760Sstevel@tonic-gate * When flushing due to the creation of a new route, it checks 36770Sstevel@tonic-gate * if a cache entry's address matches the one in the IRE and 36780Sstevel@tonic-gate * that the cache entry's parent has a less specific mask than the 36790Sstevel@tonic-gate * one in IRE. 
The destination of such a cache entry could be the 36800Sstevel@tonic-gate * gateway for other cache entries, so we need to flush those as 36810Sstevel@tonic-gate * well by looking for gateway addresses matching the IRE's address. 36820Sstevel@tonic-gate */ 36830Sstevel@tonic-gate void 36840Sstevel@tonic-gate ire_flush_cache_v4(ire_t *ire, int flag) 36850Sstevel@tonic-gate { 36860Sstevel@tonic-gate int i; 36870Sstevel@tonic-gate ire_t *cire; 36880Sstevel@tonic-gate irb_t *irb; 36893448Sdh155122 ip_stack_t *ipst = ire->ire_ipst; 36900Sstevel@tonic-gate 36910Sstevel@tonic-gate if (ire->ire_type & IRE_CACHE) 36924714Ssowmini return; 36930Sstevel@tonic-gate 36940Sstevel@tonic-gate /* 36950Sstevel@tonic-gate * If a default is just created, there is no point 36960Sstevel@tonic-gate * in going through the cache, as there will not be any 36970Sstevel@tonic-gate * cached ires. 36980Sstevel@tonic-gate */ 36990Sstevel@tonic-gate if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) 37000Sstevel@tonic-gate return; 37010Sstevel@tonic-gate if (flag == IRE_FLUSH_ADD) { 37020Sstevel@tonic-gate /* 37030Sstevel@tonic-gate * This selective flush is due to the addition of 37040Sstevel@tonic-gate * new IRE. 
37050Sstevel@tonic-gate */ 37063448Sdh155122 for (i = 0; i < ipst->ips_ip_cache_table_size; i++) { 37073448Sdh155122 irb = &ipst->ips_ip_cache_table[i]; 37080Sstevel@tonic-gate if ((cire = irb->irb_ire) == NULL) 37090Sstevel@tonic-gate continue; 37100Sstevel@tonic-gate IRB_REFHOLD(irb); 37110Sstevel@tonic-gate for (cire = irb->irb_ire; cire != NULL; 37120Sstevel@tonic-gate cire = cire->ire_next) { 37130Sstevel@tonic-gate if (cire->ire_type != IRE_CACHE) 37140Sstevel@tonic-gate continue; 37150Sstevel@tonic-gate /* 37160Sstevel@tonic-gate * If 'cire' belongs to the same subnet 37170Sstevel@tonic-gate * as the new ire being added, and 'cire' 37180Sstevel@tonic-gate * is derived from a prefix that is less 37190Sstevel@tonic-gate * specific than the new ire being added, 37200Sstevel@tonic-gate * we need to flush 'cire'; for instance, 37210Sstevel@tonic-gate * when a new interface comes up. 37220Sstevel@tonic-gate */ 37230Sstevel@tonic-gate if (((cire->ire_addr & ire->ire_mask) == 37240Sstevel@tonic-gate (ire->ire_addr & ire->ire_mask)) && 37250Sstevel@tonic-gate (ip_mask_to_plen(cire->ire_cmask) <= 37260Sstevel@tonic-gate ire->ire_masklen)) { 37270Sstevel@tonic-gate ire_delete(cire); 37280Sstevel@tonic-gate continue; 37290Sstevel@tonic-gate } 37300Sstevel@tonic-gate /* 37310Sstevel@tonic-gate * This is the case when the ire_gateway_addr 37320Sstevel@tonic-gate * of 'cire' belongs to the same subnet as 37330Sstevel@tonic-gate * the new ire being added. 37340Sstevel@tonic-gate * Flushing such ires is sometimes required to 37350Sstevel@tonic-gate * avoid misrouting: say we have a machine with 37360Sstevel@tonic-gate * two interfaces (I1 and I2), a default router 37370Sstevel@tonic-gate * R on the I1 subnet, and a host route to an 37380Sstevel@tonic-gate * off-link destination D with a gateway G on 37390Sstevel@tonic-gate * the I2 subnet. 
37400Sstevel@tonic-gate * Under normal operation, we will have an 37410Sstevel@tonic-gate * on-link cache entry for G and an off-link 37420Sstevel@tonic-gate * cache entry for D with G as ire_gateway_addr, 37430Sstevel@tonic-gate * traffic to D will reach its destination 37440Sstevel@tonic-gate * through gateway G. 37450Sstevel@tonic-gate * If the administrator does 'ifconfig I2 down', 37460Sstevel@tonic-gate * the cache entries for D and G will be 37470Sstevel@tonic-gate * flushed. However, G will now be resolved as 37480Sstevel@tonic-gate * an off-link destination using R (the default 37490Sstevel@tonic-gate * router) as gateway. Then D will also be 37500Sstevel@tonic-gate * resolved as an off-link destination using G 37510Sstevel@tonic-gate * as gateway - this behavior is due to 37520Sstevel@tonic-gate * compatibility reasons, see comment in 37530Sstevel@tonic-gate * ire_ihandle_lookup_offlink(). Traffic to D 37540Sstevel@tonic-gate * will go to the router R and probably won't 37550Sstevel@tonic-gate * reach the destination. 37560Sstevel@tonic-gate * The administrator then does 'ifconfig I2 up'. 37570Sstevel@tonic-gate * Since G is on the I2 subnet, this routine 37580Sstevel@tonic-gate * will flush its cache entry. It must also 37590Sstevel@tonic-gate * flush the cache entry for D, otherwise 37600Sstevel@tonic-gate * traffic will stay misrouted until the IRE 37610Sstevel@tonic-gate * times out. 
37620Sstevel@tonic-gate */ 37630Sstevel@tonic-gate if ((cire->ire_gateway_addr & ire->ire_mask) == 37640Sstevel@tonic-gate (ire->ire_addr & ire->ire_mask)) { 37650Sstevel@tonic-gate ire_delete(cire); 37660Sstevel@tonic-gate continue; 37670Sstevel@tonic-gate } 37680Sstevel@tonic-gate } 37690Sstevel@tonic-gate IRB_REFRELE(irb); 37700Sstevel@tonic-gate } 37710Sstevel@tonic-gate } else { 37720Sstevel@tonic-gate /* 37730Sstevel@tonic-gate * delete the cache entries based on 37740Sstevel@tonic-gate * handle in the IRE as this IRE is 37750Sstevel@tonic-gate * being deleted/changed. 37760Sstevel@tonic-gate */ 37773448Sdh155122 for (i = 0; i < ipst->ips_ip_cache_table_size; i++) { 37783448Sdh155122 irb = &ipst->ips_ip_cache_table[i]; 37790Sstevel@tonic-gate if ((cire = irb->irb_ire) == NULL) 37800Sstevel@tonic-gate continue; 37810Sstevel@tonic-gate IRB_REFHOLD(irb); 37820Sstevel@tonic-gate for (cire = irb->irb_ire; cire != NULL; 37830Sstevel@tonic-gate cire = cire->ire_next) { 37840Sstevel@tonic-gate if (cire->ire_type != IRE_CACHE) 37850Sstevel@tonic-gate continue; 37860Sstevel@tonic-gate if ((cire->ire_phandle == 0 || 37870Sstevel@tonic-gate cire->ire_phandle != ire->ire_phandle) && 37880Sstevel@tonic-gate (cire->ire_ihandle == 0 || 37890Sstevel@tonic-gate cire->ire_ihandle != ire->ire_ihandle)) 37900Sstevel@tonic-gate continue; 37910Sstevel@tonic-gate ire_delete(cire); 37920Sstevel@tonic-gate } 37930Sstevel@tonic-gate IRB_REFRELE(irb); 37940Sstevel@tonic-gate } 37950Sstevel@tonic-gate } 37960Sstevel@tonic-gate } 37970Sstevel@tonic-gate 37980Sstevel@tonic-gate /* 37990Sstevel@tonic-gate * Matches the arguments passed with the values in the ire. 38000Sstevel@tonic-gate * 38010Sstevel@tonic-gate * Note: for match types that match using "ipif" passed in, ipif 38020Sstevel@tonic-gate * must be checked for non-NULL before calling this routine. 
38030Sstevel@tonic-gate */ 38042535Ssangeeta boolean_t 38050Sstevel@tonic-gate ire_match_args(ire_t *ire, ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, 38061676Sjpk int type, const ipif_t *ipif, zoneid_t zoneid, uint32_t ihandle, 38077880SJonathan.Anderson@Sun.COM const ts_label_t *tsl, int match_flags, queue_t *wq) 38080Sstevel@tonic-gate { 38090Sstevel@tonic-gate ill_t *ire_ill = NULL, *dst_ill; 38100Sstevel@tonic-gate ill_t *ipif_ill = NULL; 38110Sstevel@tonic-gate 38120Sstevel@tonic-gate ASSERT(ire->ire_ipversion == IPV4_VERSION); 38130Sstevel@tonic-gate ASSERT((ire->ire_addr & ~ire->ire_mask) == 0); 38148485SPeter.Memishian@Sun.COM ASSERT((!(match_flags & MATCH_IRE_ILL)) || 38150Sstevel@tonic-gate (ipif != NULL && !ipif->ipif_isv6)); 38167880SJonathan.Anderson@Sun.COM ASSERT(!(match_flags & MATCH_IRE_WQ) || wq != NULL); 38170Sstevel@tonic-gate 38180Sstevel@tonic-gate /* 38198485SPeter.Memishian@Sun.COM * If MATCH_IRE_MARK_TESTHIDDEN is set, then only return the IRE if it 38208485SPeter.Memishian@Sun.COM * is in fact hidden, to ensure the caller gets the right one. One 38218485SPeter.Memishian@Sun.COM * exception: if the caller passed MATCH_IRE_IHANDLE, then they 38228485SPeter.Memishian@Sun.COM * already know the identity of the given IRE_INTERFACE entry and 38238485SPeter.Memishian@Sun.COM * there's no point trying to hide it from them. 38240Sstevel@tonic-gate */ 38258485SPeter.Memishian@Sun.COM if (ire->ire_marks & IRE_MARK_TESTHIDDEN) { 38268485SPeter.Memishian@Sun.COM if (match_flags & MATCH_IRE_IHANDLE) 38278485SPeter.Memishian@Sun.COM match_flags |= MATCH_IRE_MARK_TESTHIDDEN; 38288485SPeter.Memishian@Sun.COM 38298485SPeter.Memishian@Sun.COM if (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) 38308485SPeter.Memishian@Sun.COM return (B_FALSE); 38318485SPeter.Memishian@Sun.COM } 38320Sstevel@tonic-gate 38331095Spriyanka /* 38341095Spriyanka * MATCH_IRE_MARK_PRIVATE_ADDR is set when IP_NEXTHOP option 38351095Spriyanka * is used. 
In that case the routing table is bypassed and the 38361095Spriyanka * packets are sent directly to the specified nexthop. The 38371095Spriyanka * IRE_CACHE entry representing this route should be marked 38381095Spriyanka * with IRE_MARK_PRIVATE_ADDR. 38391095Spriyanka */ 38401095Spriyanka 38411095Spriyanka if (!(match_flags & MATCH_IRE_MARK_PRIVATE_ADDR) && 38421095Spriyanka (ire->ire_marks & IRE_MARK_PRIVATE_ADDR)) 38431095Spriyanka return (B_FALSE); 38441095Spriyanka 38451676Sjpk if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid && 38461676Sjpk ire->ire_zoneid != ALL_ZONES) { 38470Sstevel@tonic-gate /* 38480Sstevel@tonic-gate * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid is 38490Sstevel@tonic-gate * valid and does not match that of ire_zoneid, a failure to 38500Sstevel@tonic-gate * match is reported at this point. Otherwise, since some IREs 38510Sstevel@tonic-gate * that are available in the global zone can be used in local 38520Sstevel@tonic-gate * zones, additional checks need to be performed: 38530Sstevel@tonic-gate * 38540Sstevel@tonic-gate * IRE_BROADCAST, IRE_CACHE and IRE_LOOPBACK 38550Sstevel@tonic-gate * entries should never be matched in this situation. 38560Sstevel@tonic-gate * 38570Sstevel@tonic-gate * IRE entries that have an interface associated with them 38580Sstevel@tonic-gate * should in general not match unless they are an IRE_LOCAL 38590Sstevel@tonic-gate * or in the case when MATCH_IRE_DEFAULT has been set in 38600Sstevel@tonic-gate * the caller. In the case of the former, checking of the 38610Sstevel@tonic-gate * other fields supplied should take place. 38620Sstevel@tonic-gate * 38630Sstevel@tonic-gate * In the case where MATCH_IRE_DEFAULT has been set, 38640Sstevel@tonic-gate * all of the ipif's associated with the IRE's ill are 38650Sstevel@tonic-gate * checked to see if there is a matching zoneid. 
If any 38660Sstevel@tonic-gate * one ipif has a matching zoneid, this IRE is a 38670Sstevel@tonic-gate * potential candidate so checking of the other fields 38680Sstevel@tonic-gate * takes place. 38690Sstevel@tonic-gate * 38700Sstevel@tonic-gate * In the case where the IRE_INTERFACE has a usable source 38710Sstevel@tonic-gate * address (indicated by ill_usesrc_ifindex) in the 38720Sstevel@tonic-gate * correct zone then it's permitted to return this IRE 38730Sstevel@tonic-gate */ 38740Sstevel@tonic-gate if (match_flags & MATCH_IRE_ZONEONLY) 38750Sstevel@tonic-gate return (B_FALSE); 38760Sstevel@tonic-gate if (ire->ire_type & (IRE_BROADCAST | IRE_CACHE | IRE_LOOPBACK)) 38770Sstevel@tonic-gate return (B_FALSE); 38780Sstevel@tonic-gate /* 38790Sstevel@tonic-gate * Note, IRE_INTERFACE can have the stq as NULL. For 38800Sstevel@tonic-gate * example, if the default multicast route is tied to 38810Sstevel@tonic-gate * the loopback address. 38820Sstevel@tonic-gate */ 38830Sstevel@tonic-gate if ((ire->ire_type & IRE_INTERFACE) && 38840Sstevel@tonic-gate (ire->ire_stq != NULL)) { 38850Sstevel@tonic-gate dst_ill = (ill_t *)ire->ire_stq->q_ptr; 38860Sstevel@tonic-gate /* 38870Sstevel@tonic-gate * If there is a usable source address in the 38880Sstevel@tonic-gate * zone, then it's ok to return an 38890Sstevel@tonic-gate * IRE_INTERFACE 38900Sstevel@tonic-gate */ 38910Sstevel@tonic-gate if (ipif_usesrc_avail(dst_ill, zoneid)) { 38920Sstevel@tonic-gate ip3dbg(("ire_match_args: dst_ill %p match %d\n", 38930Sstevel@tonic-gate (void *)dst_ill, 38940Sstevel@tonic-gate (ire->ire_addr == (addr & mask)))); 38950Sstevel@tonic-gate } else { 38960Sstevel@tonic-gate ip3dbg(("ire_match_args: src_ipif NULL" 38970Sstevel@tonic-gate " dst_ill %p\n", (void *)dst_ill)); 38980Sstevel@tonic-gate return (B_FALSE); 38990Sstevel@tonic-gate } 39000Sstevel@tonic-gate } 39010Sstevel@tonic-gate if (ire->ire_ipif != NULL && ire->ire_type != IRE_LOCAL && 39020Sstevel@tonic-gate !(ire->ire_type & 
IRE_INTERFACE)) { 39030Sstevel@tonic-gate ipif_t *tipif; 39040Sstevel@tonic-gate 39050Sstevel@tonic-gate if ((match_flags & MATCH_IRE_DEFAULT) == 0) { 39060Sstevel@tonic-gate return (B_FALSE); 39070Sstevel@tonic-gate } 39080Sstevel@tonic-gate mutex_enter(&ire->ire_ipif->ipif_ill->ill_lock); 39090Sstevel@tonic-gate for (tipif = ire->ire_ipif->ipif_ill->ill_ipif; 39100Sstevel@tonic-gate tipif != NULL; tipif = tipif->ipif_next) { 39110Sstevel@tonic-gate if (IPIF_CAN_LOOKUP(tipif) && 39120Sstevel@tonic-gate (tipif->ipif_flags & IPIF_UP) && 39131676Sjpk (tipif->ipif_zoneid == zoneid || 39141676Sjpk tipif->ipif_zoneid == ALL_ZONES)) 39150Sstevel@tonic-gate break; 39160Sstevel@tonic-gate } 39170Sstevel@tonic-gate mutex_exit(&ire->ire_ipif->ipif_ill->ill_lock); 39180Sstevel@tonic-gate if (tipif == NULL) { 39190Sstevel@tonic-gate return (B_FALSE); 39200Sstevel@tonic-gate } 39210Sstevel@tonic-gate } 39220Sstevel@tonic-gate } 39230Sstevel@tonic-gate 39240Sstevel@tonic-gate /* 39258485SPeter.Memishian@Sun.COM * For IRE_CACHE entries, MATCH_IRE_ILL means that somebody wants to 39268485SPeter.Memishian@Sun.COM * send out ire_stq (ire_ipif for IRE_CACHE entries is just the means 39278485SPeter.Memishian@Sun.COM * of getting a source address -- i.e., ire_src_addr == 39288485SPeter.Memishian@Sun.COM * ire->ire_ipif->ipif_src_addr). ire_to_ill() handles this. 39298485SPeter.Memishian@Sun.COM * 39308485SPeter.Memishian@Sun.COM * NOTE: For IPMP, MATCH_IRE_ILL usually matches any ill in the group. 39318485SPeter.Memishian@Sun.COM * However, if MATCH_IRE_MARK_TESTHIDDEN is set (i.e., the IRE is for 39328485SPeter.Memishian@Sun.COM * IPMP test traffic), then the ill must match exactly. 
39330Sstevel@tonic-gate */ 39348485SPeter.Memishian@Sun.COM if (match_flags & MATCH_IRE_ILL) { 39350Sstevel@tonic-gate ire_ill = ire_to_ill(ire); 39360Sstevel@tonic-gate ipif_ill = ipif->ipif_ill; 39370Sstevel@tonic-gate } 39380Sstevel@tonic-gate 39390Sstevel@tonic-gate if ((ire->ire_addr == (addr & mask)) && 39400Sstevel@tonic-gate ((!(match_flags & MATCH_IRE_GW)) || 39414714Ssowmini (ire->ire_gateway_addr == gateway)) && 39420Sstevel@tonic-gate ((!(match_flags & MATCH_IRE_TYPE)) || 39434714Ssowmini (ire->ire_type & type)) && 39440Sstevel@tonic-gate ((!(match_flags & MATCH_IRE_SRC)) || 39454714Ssowmini (ire->ire_src_addr == ipif->ipif_src_addr)) && 39460Sstevel@tonic-gate ((!(match_flags & MATCH_IRE_IPIF)) || 39474714Ssowmini (ire->ire_ipif == ipif)) && 39488485SPeter.Memishian@Sun.COM ((!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) || 39498485SPeter.Memishian@Sun.COM (ire->ire_marks & IRE_MARK_TESTHIDDEN)) && 39501095Spriyanka ((!(match_flags & MATCH_IRE_MARK_PRIVATE_ADDR)) || 39514714Ssowmini (ire->ire_type != IRE_CACHE || 39524714Ssowmini ire->ire_marks & IRE_MARK_PRIVATE_ADDR)) && 39537880SJonathan.Anderson@Sun.COM ((!(match_flags & MATCH_IRE_WQ)) || 39547880SJonathan.Anderson@Sun.COM (ire->ire_stq == wq)) && 39558485SPeter.Memishian@Sun.COM ((!(match_flags & MATCH_IRE_ILL)) || 39568485SPeter.Memishian@Sun.COM (ire_ill == ipif_ill || 39578485SPeter.Memishian@Sun.COM (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN) && 39588485SPeter.Memishian@Sun.COM ire_ill != NULL && IS_IN_SAME_ILLGRP(ipif_ill, ire_ill)))) && 39590Sstevel@tonic-gate ((!(match_flags & MATCH_IRE_IHANDLE)) || 39604714Ssowmini (ire->ire_ihandle == ihandle)) && 39612535Ssangeeta ((!(match_flags & MATCH_IRE_MASK)) || 39624714Ssowmini (ire->ire_mask == mask)) && 39631676Sjpk ((!(match_flags & MATCH_IRE_SECATTR)) || 39644714Ssowmini (!is_system_labeled()) || 39654714Ssowmini (tsol_ire_match_gwattr(ire, tsl) == 0))) { 39660Sstevel@tonic-gate /* We found the matched IRE */ 39670Sstevel@tonic-gate return 
(B_TRUE); 39680Sstevel@tonic-gate } 39690Sstevel@tonic-gate return (B_FALSE); 39700Sstevel@tonic-gate } 39710Sstevel@tonic-gate 39720Sstevel@tonic-gate /* 39730Sstevel@tonic-gate * Lookup for a route in all the tables 39740Sstevel@tonic-gate */ 39750Sstevel@tonic-gate ire_t * 39760Sstevel@tonic-gate ire_route_lookup(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, 39771676Sjpk int type, const ipif_t *ipif, ire_t **pire, zoneid_t zoneid, 39783448Sdh155122 const ts_label_t *tsl, int flags, ip_stack_t *ipst) 39790Sstevel@tonic-gate { 39800Sstevel@tonic-gate ire_t *ire = NULL; 39810Sstevel@tonic-gate 39820Sstevel@tonic-gate /* 39830Sstevel@tonic-gate * ire_match_args() will dereference ipif MATCH_IRE_SRC or 39840Sstevel@tonic-gate * MATCH_IRE_ILL is set. 39850Sstevel@tonic-gate */ 39868485SPeter.Memishian@Sun.COM if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL)) 39870Sstevel@tonic-gate return (NULL); 39880Sstevel@tonic-gate 39890Sstevel@tonic-gate /* 39900Sstevel@tonic-gate * might be asking for a cache lookup, 39910Sstevel@tonic-gate * This is not best way to lookup cache, 39920Sstevel@tonic-gate * user should call ire_cache_lookup directly. 39930Sstevel@tonic-gate * 39940Sstevel@tonic-gate * If MATCH_IRE_TYPE was set, first lookup in the cache table and then 39950Sstevel@tonic-gate * in the forwarding table, if the applicable type flags were set. 
39960Sstevel@tonic-gate */ 39970Sstevel@tonic-gate if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_CACHETABLE) != 0) { 39980Sstevel@tonic-gate ire = ire_ctable_lookup(addr, gateway, type, ipif, zoneid, 39993448Sdh155122 tsl, flags, ipst); 40000Sstevel@tonic-gate if (ire != NULL) 40010Sstevel@tonic-gate return (ire); 40020Sstevel@tonic-gate } 40030Sstevel@tonic-gate if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_FORWARDTABLE) != 0) { 40040Sstevel@tonic-gate ire = ire_ftable_lookup(addr, mask, gateway, type, ipif, pire, 40053448Sdh155122 zoneid, 0, tsl, flags, ipst); 40060Sstevel@tonic-gate } 40070Sstevel@tonic-gate return (ire); 40080Sstevel@tonic-gate } 40090Sstevel@tonic-gate 40100Sstevel@tonic-gate /* 40111676Sjpk * Delete the IRE cache for the gateway and all IRE caches whose 40121676Sjpk * ire_gateway_addr points to this gateway, and allow them to 40131676Sjpk * be created on demand by ip_newroute. 40141676Sjpk */ 40151676Sjpk void 40163448Sdh155122 ire_clookup_delete_cache_gw(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst) 40171676Sjpk { 40181676Sjpk irb_t *irb; 40191676Sjpk ire_t *ire; 40201676Sjpk 40213448Sdh155122 irb = &ipst->ips_ip_cache_table[IRE_ADDR_HASH(addr, 40223448Sdh155122 ipst->ips_ip_cache_table_size)]; 40231676Sjpk IRB_REFHOLD(irb); 40241676Sjpk for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { 40251676Sjpk if (ire->ire_marks & IRE_MARK_CONDEMNED) 40261676Sjpk continue; 40271676Sjpk 40281676Sjpk ASSERT(ire->ire_mask == IP_HOST_MASK); 40291676Sjpk if (ire_match_args(ire, addr, ire->ire_mask, 0, IRE_CACHE, 40307880SJonathan.Anderson@Sun.COM NULL, zoneid, 0, NULL, MATCH_IRE_TYPE, NULL)) { 40311676Sjpk ire_delete(ire); 40321676Sjpk } 40331676Sjpk } 40341676Sjpk IRB_REFRELE(irb); 40351676Sjpk 40363448Sdh155122 ire_walk_v4(ire_delete_cache_gw, &addr, zoneid, ipst); 40371676Sjpk } 40381676Sjpk 40391676Sjpk /* 40400Sstevel@tonic-gate * Looks up cache table for a route. 
40410Sstevel@tonic-gate * specific lookup can be indicated by 40420Sstevel@tonic-gate * passing the MATCH_* flags and the 40430Sstevel@tonic-gate * necessary parameters. 40440Sstevel@tonic-gate */ 40450Sstevel@tonic-gate ire_t * 40461676Sjpk ire_ctable_lookup(ipaddr_t addr, ipaddr_t gateway, int type, const ipif_t *ipif, 40473448Sdh155122 zoneid_t zoneid, const ts_label_t *tsl, int flags, ip_stack_t *ipst) 40480Sstevel@tonic-gate { 40497880SJonathan.Anderson@Sun.COM ire_ctable_args_t margs; 40507880SJonathan.Anderson@Sun.COM 40517880SJonathan.Anderson@Sun.COM margs.ict_addr = &addr; 40527880SJonathan.Anderson@Sun.COM margs.ict_gateway = &gateway; 40537880SJonathan.Anderson@Sun.COM margs.ict_type = type; 40547880SJonathan.Anderson@Sun.COM margs.ict_ipif = ipif; 40557880SJonathan.Anderson@Sun.COM margs.ict_zoneid = zoneid; 40567880SJonathan.Anderson@Sun.COM margs.ict_tsl = tsl; 40577880SJonathan.Anderson@Sun.COM margs.ict_flags = flags; 40587880SJonathan.Anderson@Sun.COM margs.ict_ipst = ipst; 40597880SJonathan.Anderson@Sun.COM margs.ict_wq = NULL; 40607880SJonathan.Anderson@Sun.COM 40617880SJonathan.Anderson@Sun.COM return (ip4_ctable_lookup_impl(&margs)); 40620Sstevel@tonic-gate } 40630Sstevel@tonic-gate 40640Sstevel@tonic-gate /* 40652733Snordmark * Check whether the IRE_LOCAL and the IRE potentially used to transmit 40668485SPeter.Memishian@Sun.COM * (could be an IRE_CACHE, IRE_BROADCAST, or IRE_INTERFACE) are identical 40678485SPeter.Memishian@Sun.COM * or part of the same illgrp. (In the IPMP case, usually the two IREs 40688485SPeter.Memishian@Sun.COM * will both belong to the IPMP ill, but exceptions are possible -- e.g. 40698485SPeter.Memishian@Sun.COM * if IPMP test addresses are on their own subnet.) 
40702733Snordmark */ 40712733Snordmark boolean_t 40728485SPeter.Memishian@Sun.COM ire_local_same_lan(ire_t *ire_local, ire_t *xmit_ire) 40732733Snordmark { 40748485SPeter.Memishian@Sun.COM ill_t *recv_ill, *xmit_ill; 40752733Snordmark 40762906Snordmark ASSERT(ire_local->ire_type & (IRE_LOCAL|IRE_LOOPBACK)); 40772962Snordmark ASSERT(xmit_ire->ire_type & (IRE_CACHETABLE|IRE_INTERFACE)); 40782906Snordmark 40792906Snordmark recv_ill = ire_to_ill(ire_local); 40802906Snordmark xmit_ill = ire_to_ill(xmit_ire); 40812906Snordmark 40822906Snordmark ASSERT(recv_ill != NULL); 40832906Snordmark ASSERT(xmit_ill != NULL); 40842733Snordmark 40858485SPeter.Memishian@Sun.COM return (IS_ON_SAME_LAN(recv_ill, xmit_ill)); 40862733Snordmark } 40872733Snordmark 40882733Snordmark /* 40898485SPeter.Memishian@Sun.COM * Check if the IRE_LOCAL uses the same ill as another route would use. 40902962Snordmark * If there is no alternate route, or the alternate is a REJECT or BLACKHOLE, 40912962Snordmark * then we don't allow this IRE_LOCAL to be used. 
40922733Snordmark */ 40932733Snordmark boolean_t 40942733Snordmark ire_local_ok_across_zones(ire_t *ire_local, zoneid_t zoneid, void *addr, 40953448Sdh155122 const ts_label_t *tsl, ip_stack_t *ipst) 40962733Snordmark { 40972733Snordmark ire_t *alt_ire; 40982733Snordmark boolean_t rval; 40998485SPeter.Memishian@Sun.COM int flags; 41008485SPeter.Memishian@Sun.COM 41018485SPeter.Memishian@Sun.COM flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE; 41022733Snordmark 41032733Snordmark if (ire_local->ire_ipversion == IPV4_VERSION) { 41042733Snordmark alt_ire = ire_ftable_lookup(*((ipaddr_t *)addr), 0, 0, 0, NULL, 41058485SPeter.Memishian@Sun.COM NULL, zoneid, 0, tsl, flags, ipst); 41062733Snordmark } else { 41078485SPeter.Memishian@Sun.COM alt_ire = ire_ftable_lookup_v6(addr, NULL, NULL, 0, NULL, 41088485SPeter.Memishian@Sun.COM NULL, zoneid, 0, tsl, flags, ipst); 41092733Snordmark } 41102733Snordmark 41112733Snordmark if (alt_ire == NULL) 41122733Snordmark return (B_FALSE); 41132733Snordmark 41142962Snordmark if (alt_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 41152962Snordmark ire_refrele(alt_ire); 41162962Snordmark return (B_FALSE); 41172962Snordmark } 41188485SPeter.Memishian@Sun.COM rval = ire_local_same_lan(ire_local, alt_ire); 41192733Snordmark 41202733Snordmark ire_refrele(alt_ire); 41212733Snordmark return (rval); 41222733Snordmark } 41232733Snordmark 41242733Snordmark /* 41258485SPeter.Memishian@Sun.COM * Lookup cache 41262733Snordmark * 41272733Snordmark * In general the zoneid has to match (where ALL_ZONES match all of them). 41282733Snordmark * But for IRE_LOCAL we also need to handle the case where L2 should 41292733Snordmark * conceptually loop back the packet. This is necessary since neither 41302733Snordmark * Ethernet drivers nor Ethernet hardware loops back packets sent to their 41312733Snordmark * own MAC address. 
This loopback is needed when the normal 41322733Snordmark * routes (ignoring IREs with different zoneids) would send out the packet on 41338485SPeter.Memishian@Sun.COM * the same ill as the ill with which this IRE_LOCAL is associated. 41342733Snordmark * 41352733Snordmark * Earlier versions of this code always matched an IRE_LOCAL independently of 41362733Snordmark * the zoneid. We preserve that earlier behavior when 41372733Snordmark * ip_restrict_interzone_loopback is turned off. 41380Sstevel@tonic-gate */ 41390Sstevel@tonic-gate ire_t * 41403448Sdh155122 ire_cache_lookup(ipaddr_t addr, zoneid_t zoneid, const ts_label_t *tsl, 41413448Sdh155122 ip_stack_t *ipst) 41420Sstevel@tonic-gate { 41430Sstevel@tonic-gate irb_t *irb_ptr; 41440Sstevel@tonic-gate ire_t *ire; 41450Sstevel@tonic-gate 41463448Sdh155122 irb_ptr = &ipst->ips_ip_cache_table[IRE_ADDR_HASH(addr, 41474714Ssowmini ipst->ips_ip_cache_table_size)]; 41480Sstevel@tonic-gate rw_enter(&irb_ptr->irb_lock, RW_READER); 41490Sstevel@tonic-gate for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { 41501095Spriyanka if (ire->ire_marks & (IRE_MARK_CONDEMNED | 41518485SPeter.Memishian@Sun.COM IRE_MARK_TESTHIDDEN | IRE_MARK_PRIVATE_ADDR)) { 41520Sstevel@tonic-gate continue; 41531095Spriyanka } 41540Sstevel@tonic-gate if (ire->ire_addr == addr) { 41551676Sjpk /* 41561676Sjpk * Finally, check if the security policy has any 41571676Sjpk * restriction on using this route for the specified 41581676Sjpk * message. 
41591676Sjpk */ 41601676Sjpk if (tsl != NULL && 41611676Sjpk ire->ire_gw_secattr != NULL && 41621676Sjpk tsol_ire_match_gwattr(ire, tsl) != 0) { 41631676Sjpk continue; 41641676Sjpk } 41651676Sjpk 41660Sstevel@tonic-gate if (zoneid == ALL_ZONES || ire->ire_zoneid == zoneid || 41672733Snordmark ire->ire_zoneid == ALL_ZONES) { 41682733Snordmark IRE_REFHOLD(ire); 41692733Snordmark rw_exit(&irb_ptr->irb_lock); 41702733Snordmark return (ire); 41712733Snordmark } 41722733Snordmark 41732733Snordmark if (ire->ire_type == IRE_LOCAL) { 41743448Sdh155122 if (ipst->ips_ip_restrict_interzone_loopback && 41752733Snordmark !ire_local_ok_across_zones(ire, zoneid, 41763448Sdh155122 &addr, tsl, ipst)) 41772733Snordmark continue; 41782733Snordmark 41790Sstevel@tonic-gate IRE_REFHOLD(ire); 41800Sstevel@tonic-gate rw_exit(&irb_ptr->irb_lock); 41810Sstevel@tonic-gate return (ire); 41820Sstevel@tonic-gate } 41830Sstevel@tonic-gate } 41840Sstevel@tonic-gate } 41850Sstevel@tonic-gate rw_exit(&irb_ptr->irb_lock); 41860Sstevel@tonic-gate return (NULL); 41870Sstevel@tonic-gate } 41880Sstevel@tonic-gate 41898275SEric Cheng ire_t * 41908275SEric Cheng ire_cache_lookup_simple(ipaddr_t dst, ip_stack_t *ipst) 41918275SEric Cheng { 41928275SEric Cheng irb_t *irb_ptr; 41938275SEric Cheng ire_t *ire; 41948275SEric Cheng 41958275SEric Cheng /* 41968485SPeter.Memishian@Sun.COM * Look for an ire in the cachetable whose 41978275SEric Cheng * ire_addr matches the destination. 41988275SEric Cheng * Since we are being called by forwarding fastpath 41998275SEric Cheng * no need to check for Trusted Solaris label. 
42008275SEric Cheng */ 42018275SEric Cheng irb_ptr = &ipst->ips_ip_cache_table[IRE_ADDR_HASH( 42028275SEric Cheng dst, ipst->ips_ip_cache_table_size)]; 42038275SEric Cheng rw_enter(&irb_ptr->irb_lock, RW_READER); 42048275SEric Cheng for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { 42058485SPeter.Memishian@Sun.COM if (ire->ire_marks & (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN | 42068485SPeter.Memishian@Sun.COM IRE_MARK_PRIVATE_ADDR)) { 42078275SEric Cheng continue; 42088275SEric Cheng } 42098275SEric Cheng if (ire->ire_addr == dst) { 42108275SEric Cheng IRE_REFHOLD(ire); 42118275SEric Cheng rw_exit(&irb_ptr->irb_lock); 42128275SEric Cheng return (ire); 42138275SEric Cheng } 42148275SEric Cheng } 42158275SEric Cheng rw_exit(&irb_ptr->irb_lock); 42168275SEric Cheng return (NULL); 42178275SEric Cheng } 42188275SEric Cheng 42190Sstevel@tonic-gate /* 42200Sstevel@tonic-gate * Locate the interface ire that is tied to the cache ire 'cire' via 42210Sstevel@tonic-gate * cire->ire_ihandle. 42220Sstevel@tonic-gate * 42230Sstevel@tonic-gate * We are trying to create the cache ire for an offlink destn based 42240Sstevel@tonic-gate * on the cache ire of the gateway in 'cire'. 'pire' is the prefix ire 42250Sstevel@tonic-gate * as found by ip_newroute(). We are called from ip_newroute() in 42260Sstevel@tonic-gate * the IRE_CACHE case. 
42270Sstevel@tonic-gate */ 42280Sstevel@tonic-gate ire_t * 42290Sstevel@tonic-gate ire_ihandle_lookup_offlink(ire_t *cire, ire_t *pire) 42300Sstevel@tonic-gate { 42310Sstevel@tonic-gate ire_t *ire; 42320Sstevel@tonic-gate int match_flags; 42330Sstevel@tonic-gate ipaddr_t gw_addr; 42340Sstevel@tonic-gate ipif_t *gw_ipif; 42353448Sdh155122 ip_stack_t *ipst = cire->ire_ipst; 42360Sstevel@tonic-gate 42370Sstevel@tonic-gate ASSERT(cire != NULL && pire != NULL); 42380Sstevel@tonic-gate 42390Sstevel@tonic-gate /* 42400Sstevel@tonic-gate * We don't need to specify the zoneid to ire_ftable_lookup() below 42410Sstevel@tonic-gate * because the ihandle refers to an ipif which can be in only one zone. 42420Sstevel@tonic-gate */ 42430Sstevel@tonic-gate match_flags = MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK; 42440Sstevel@tonic-gate if (pire->ire_ipif != NULL) 42458485SPeter.Memishian@Sun.COM match_flags |= MATCH_IRE_ILL; 42460Sstevel@tonic-gate /* 42470Sstevel@tonic-gate * We know that the mask of the interface ire equals cire->ire_cmask. 42480Sstevel@tonic-gate * (When ip_newroute() created 'cire' for the gateway it set its 42490Sstevel@tonic-gate * cmask from the interface ire's mask) 42500Sstevel@tonic-gate */ 42510Sstevel@tonic-gate ire = ire_ftable_lookup(cire->ire_addr, cire->ire_cmask, 0, 42520Sstevel@tonic-gate IRE_INTERFACE, pire->ire_ipif, NULL, ALL_ZONES, cire->ire_ihandle, 42533448Sdh155122 NULL, match_flags, ipst); 42540Sstevel@tonic-gate if (ire != NULL) 42550Sstevel@tonic-gate return (ire); 42560Sstevel@tonic-gate /* 42570Sstevel@tonic-gate * If we didn't find an interface ire above, we can't declare failure. 42580Sstevel@tonic-gate * For backwards compatibility, we need to support prefix routes 42590Sstevel@tonic-gate * pointing to next hop gateways that are not on-link. 42600Sstevel@tonic-gate * 42610Sstevel@tonic-gate * Assume we are trying to ping some offlink destn, and we have the 42620Sstevel@tonic-gate * routing table below. 
42630Sstevel@tonic-gate * 42640Sstevel@tonic-gate * Eg. default - gw1 <--- pire (line 1) 42650Sstevel@tonic-gate * gw1 - gw2 (line 2) 42660Sstevel@tonic-gate * gw2 - hme0 (line 3) 42670Sstevel@tonic-gate * 42680Sstevel@tonic-gate * If we already have a cache ire for gw1 in 'cire', the 42690Sstevel@tonic-gate * ire_ftable_lookup above would have failed, since there is no 42700Sstevel@tonic-gate * interface ire to reach gw1. We will fallthru below. 42710Sstevel@tonic-gate * 42720Sstevel@tonic-gate * Here we duplicate the steps that ire_ftable_lookup() did in 42730Sstevel@tonic-gate * getting 'cire' from 'pire', in the MATCH_IRE_RECURSIVE case. 42740Sstevel@tonic-gate * The differences are the following 42750Sstevel@tonic-gate * i. We want the interface ire only, so we call ire_ftable_lookup() 42760Sstevel@tonic-gate * instead of ire_route_lookup() 42770Sstevel@tonic-gate * ii. We look for only prefix routes in the 1st call below. 42780Sstevel@tonic-gate * ii. We want to match on the ihandle in the 2nd call below. 42790Sstevel@tonic-gate */ 42800Sstevel@tonic-gate match_flags = MATCH_IRE_TYPE; 42810Sstevel@tonic-gate if (pire->ire_ipif != NULL) 42828485SPeter.Memishian@Sun.COM match_flags |= MATCH_IRE_ILL; 42830Sstevel@tonic-gate ire = ire_ftable_lookup(pire->ire_gateway_addr, 0, 0, IRE_OFFSUBNET, 42843448Sdh155122 pire->ire_ipif, NULL, ALL_ZONES, 0, NULL, match_flags, ipst); 42850Sstevel@tonic-gate if (ire == NULL) 42860Sstevel@tonic-gate return (NULL); 42870Sstevel@tonic-gate /* 42880Sstevel@tonic-gate * At this point 'ire' corresponds to the entry shown in line 2. 42890Sstevel@tonic-gate * gw_addr is 'gw2' in the example above. 
42900Sstevel@tonic-gate */ 42910Sstevel@tonic-gate gw_addr = ire->ire_gateway_addr; 42920Sstevel@tonic-gate gw_ipif = ire->ire_ipif; 42930Sstevel@tonic-gate ire_refrele(ire); 42940Sstevel@tonic-gate 42950Sstevel@tonic-gate match_flags |= MATCH_IRE_IHANDLE; 42960Sstevel@tonic-gate ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE, 42973448Sdh155122 gw_ipif, NULL, ALL_ZONES, cire->ire_ihandle, NULL, match_flags, 42983448Sdh155122 ipst); 42990Sstevel@tonic-gate return (ire); 43000Sstevel@tonic-gate } 43010Sstevel@tonic-gate 43020Sstevel@tonic-gate /* 43030Sstevel@tonic-gate * Return the IRE_LOOPBACK, IRE_IF_RESOLVER or IRE_IF_NORESOLVER 43040Sstevel@tonic-gate * ire associated with the specified ipif. 43050Sstevel@tonic-gate * 43060Sstevel@tonic-gate * This might occasionally be called when IPIF_UP is not set since 43070Sstevel@tonic-gate * the IP_MULTICAST_IF as well as creating interface routes 43080Sstevel@tonic-gate * allows specifying a down ipif (ipif_lookup* match ipifs that are down). 43090Sstevel@tonic-gate * 43100Sstevel@tonic-gate * Note that if IPIF_NOLOCAL, IPIF_NOXMIT, or IPIF_DEPRECATED is set on 43110Sstevel@tonic-gate * the ipif, this routine might return NULL. 43120Sstevel@tonic-gate */ 43130Sstevel@tonic-gate ire_t * 43141676Sjpk ipif_to_ire(const ipif_t *ipif) 43150Sstevel@tonic-gate { 43160Sstevel@tonic-gate ire_t *ire; 43178485SPeter.Memishian@Sun.COM ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 43188485SPeter.Memishian@Sun.COM uint_t match_flags = MATCH_IRE_TYPE | MATCH_IRE_IPIF | MATCH_IRE_MASK; 43198485SPeter.Memishian@Sun.COM 43208485SPeter.Memishian@Sun.COM /* 43218485SPeter.Memishian@Sun.COM * IRE_INTERFACE entries for ills under IPMP are IRE_MARK_TESTHIDDEN 43228485SPeter.Memishian@Sun.COM * so that they aren't accidentally returned. However, if the 43238485SPeter.Memishian@Sun.COM * caller's ipif is on an ill under IPMP, there's no need to hide 'em. 
43248485SPeter.Memishian@Sun.COM */ 43258485SPeter.Memishian@Sun.COM if (IS_UNDER_IPMP(ipif->ipif_ill)) 43268485SPeter.Memishian@Sun.COM match_flags |= MATCH_IRE_MARK_TESTHIDDEN; 43270Sstevel@tonic-gate 43280Sstevel@tonic-gate ASSERT(!ipif->ipif_isv6); 43290Sstevel@tonic-gate if (ipif->ipif_ire_type == IRE_LOOPBACK) { 43300Sstevel@tonic-gate ire = ire_ctable_lookup(ipif->ipif_lcl_addr, 0, IRE_LOOPBACK, 43313448Sdh155122 ipif, ALL_ZONES, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF), 43323448Sdh155122 ipst); 43330Sstevel@tonic-gate } else if (ipif->ipif_flags & IPIF_POINTOPOINT) { 43340Sstevel@tonic-gate /* In this case we need to lookup destination address. */ 43350Sstevel@tonic-gate ire = ire_ftable_lookup(ipif->ipif_pp_dst_addr, IP_HOST_MASK, 0, 43368485SPeter.Memishian@Sun.COM IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, NULL, match_flags, 43378485SPeter.Memishian@Sun.COM ipst); 43380Sstevel@tonic-gate } else { 43390Sstevel@tonic-gate ire = ire_ftable_lookup(ipif->ipif_subnet, 43400Sstevel@tonic-gate ipif->ipif_net_mask, 0, IRE_INTERFACE, ipif, NULL, 43418485SPeter.Memishian@Sun.COM ALL_ZONES, 0, NULL, match_flags, ipst); 43420Sstevel@tonic-gate } 43430Sstevel@tonic-gate return (ire); 43440Sstevel@tonic-gate } 43450Sstevel@tonic-gate 43460Sstevel@tonic-gate /* 43470Sstevel@tonic-gate * ire_walk function. 43480Sstevel@tonic-gate * Count the number of IRE_CACHE entries in different categories. 
43490Sstevel@tonic-gate */ 43500Sstevel@tonic-gate void 43510Sstevel@tonic-gate ire_cache_count(ire_t *ire, char *arg) 43520Sstevel@tonic-gate { 43530Sstevel@tonic-gate ire_cache_count_t *icc = (ire_cache_count_t *)arg; 43540Sstevel@tonic-gate 43550Sstevel@tonic-gate if (ire->ire_type != IRE_CACHE) 43560Sstevel@tonic-gate return; 43570Sstevel@tonic-gate 43580Sstevel@tonic-gate icc->icc_total++; 43590Sstevel@tonic-gate 43600Sstevel@tonic-gate if (ire->ire_ipversion == IPV6_VERSION) { 43610Sstevel@tonic-gate mutex_enter(&ire->ire_lock); 43620Sstevel@tonic-gate if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6)) { 43630Sstevel@tonic-gate mutex_exit(&ire->ire_lock); 43640Sstevel@tonic-gate icc->icc_onlink++; 43650Sstevel@tonic-gate return; 43660Sstevel@tonic-gate } 43670Sstevel@tonic-gate mutex_exit(&ire->ire_lock); 43680Sstevel@tonic-gate } else { 43690Sstevel@tonic-gate if (ire->ire_gateway_addr == 0) { 43700Sstevel@tonic-gate icc->icc_onlink++; 43710Sstevel@tonic-gate return; 43720Sstevel@tonic-gate } 43730Sstevel@tonic-gate } 43740Sstevel@tonic-gate 43750Sstevel@tonic-gate ASSERT(ire->ire_ipif != NULL); 43760Sstevel@tonic-gate if (ire->ire_max_frag < ire->ire_ipif->ipif_mtu) 43770Sstevel@tonic-gate icc->icc_pmtu++; 43780Sstevel@tonic-gate else if (ire->ire_tire_mark != ire->ire_ob_pkt_count + 43790Sstevel@tonic-gate ire->ire_ib_pkt_count) 43800Sstevel@tonic-gate icc->icc_offlink++; 43810Sstevel@tonic-gate else 43820Sstevel@tonic-gate icc->icc_unused++; 43830Sstevel@tonic-gate } 43840Sstevel@tonic-gate 43850Sstevel@tonic-gate /* 43860Sstevel@tonic-gate * ire_walk function called by ip_trash_ire_reclaim(). 43870Sstevel@tonic-gate * Free a fraction of the IRE_CACHE cache entries. The fractions are 43880Sstevel@tonic-gate * different for different categories of IRE_CACHE entries. 43890Sstevel@tonic-gate * A fraction of zero means to not free any in that category. 43900Sstevel@tonic-gate * Use the hash bucket id plus lbolt as a random number. 
Thus if the fraction 43910Sstevel@tonic-gate * is N then every Nth hash bucket chain will be freed. 43920Sstevel@tonic-gate */ 43930Sstevel@tonic-gate void 43940Sstevel@tonic-gate ire_cache_reclaim(ire_t *ire, char *arg) 43950Sstevel@tonic-gate { 43960Sstevel@tonic-gate ire_cache_reclaim_t *icr = (ire_cache_reclaim_t *)arg; 43970Sstevel@tonic-gate uint_t rand; 43983448Sdh155122 ip_stack_t *ipst = icr->icr_ipst; 43990Sstevel@tonic-gate 44000Sstevel@tonic-gate if (ire->ire_type != IRE_CACHE) 44010Sstevel@tonic-gate return; 44020Sstevel@tonic-gate 44030Sstevel@tonic-gate if (ire->ire_ipversion == IPV6_VERSION) { 44040Sstevel@tonic-gate rand = (uint_t)lbolt + 44053448Sdh155122 IRE_ADDR_HASH_V6(ire->ire_addr_v6, 44064714Ssowmini ipst->ips_ip6_cache_table_size); 44070Sstevel@tonic-gate mutex_enter(&ire->ire_lock); 44080Sstevel@tonic-gate if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6)) { 44090Sstevel@tonic-gate mutex_exit(&ire->ire_lock); 44100Sstevel@tonic-gate if (icr->icr_onlink != 0 && 44110Sstevel@tonic-gate (rand/icr->icr_onlink)*icr->icr_onlink == rand) { 44120Sstevel@tonic-gate ire_delete(ire); 44130Sstevel@tonic-gate return; 44140Sstevel@tonic-gate } 44150Sstevel@tonic-gate goto done; 44160Sstevel@tonic-gate } 44170Sstevel@tonic-gate mutex_exit(&ire->ire_lock); 44180Sstevel@tonic-gate } else { 44190Sstevel@tonic-gate rand = (uint_t)lbolt + 44203448Sdh155122 IRE_ADDR_HASH(ire->ire_addr, ipst->ips_ip_cache_table_size); 44210Sstevel@tonic-gate if (ire->ire_gateway_addr == 0) { 44220Sstevel@tonic-gate if (icr->icr_onlink != 0 && 44230Sstevel@tonic-gate (rand/icr->icr_onlink)*icr->icr_onlink == rand) { 44240Sstevel@tonic-gate ire_delete(ire); 44250Sstevel@tonic-gate return; 44260Sstevel@tonic-gate } 44270Sstevel@tonic-gate goto done; 44280Sstevel@tonic-gate } 44290Sstevel@tonic-gate } 44300Sstevel@tonic-gate /* Not onlink IRE */ 44310Sstevel@tonic-gate ASSERT(ire->ire_ipif != NULL); 44320Sstevel@tonic-gate if (ire->ire_max_frag < ire->ire_ipif->ipif_mtu) { 
44330Sstevel@tonic-gate /* Use ptmu fraction */ 44340Sstevel@tonic-gate if (icr->icr_pmtu != 0 && 44350Sstevel@tonic-gate (rand/icr->icr_pmtu)*icr->icr_pmtu == rand) { 44360Sstevel@tonic-gate ire_delete(ire); 44370Sstevel@tonic-gate return; 44380Sstevel@tonic-gate } 44390Sstevel@tonic-gate } else if (ire->ire_tire_mark != ire->ire_ob_pkt_count + 44400Sstevel@tonic-gate ire->ire_ib_pkt_count) { 44410Sstevel@tonic-gate /* Use offlink fraction */ 44420Sstevel@tonic-gate if (icr->icr_offlink != 0 && 44430Sstevel@tonic-gate (rand/icr->icr_offlink)*icr->icr_offlink == rand) { 44440Sstevel@tonic-gate ire_delete(ire); 44450Sstevel@tonic-gate return; 44460Sstevel@tonic-gate } 44470Sstevel@tonic-gate } else { 44480Sstevel@tonic-gate /* Use unused fraction */ 44490Sstevel@tonic-gate if (icr->icr_unused != 0 && 44500Sstevel@tonic-gate (rand/icr->icr_unused)*icr->icr_unused == rand) { 44510Sstevel@tonic-gate ire_delete(ire); 44520Sstevel@tonic-gate return; 44530Sstevel@tonic-gate } 44540Sstevel@tonic-gate } 44550Sstevel@tonic-gate done: 44560Sstevel@tonic-gate /* 44570Sstevel@tonic-gate * Update tire_mark so that those that haven't been used since this 44580Sstevel@tonic-gate * reclaim will be considered unused next time we reclaim. 
44590Sstevel@tonic-gate */ 44600Sstevel@tonic-gate ire->ire_tire_mark = ire->ire_ob_pkt_count + ire->ire_ib_pkt_count; 44610Sstevel@tonic-gate } 44620Sstevel@tonic-gate 44630Sstevel@tonic-gate static void 44640Sstevel@tonic-gate power2_roundup(uint32_t *value) 44650Sstevel@tonic-gate { 44660Sstevel@tonic-gate int i; 44670Sstevel@tonic-gate 44680Sstevel@tonic-gate for (i = 1; i < 31; i++) { 44690Sstevel@tonic-gate if (*value <= (1 << i)) 44700Sstevel@tonic-gate break; 44710Sstevel@tonic-gate } 44720Sstevel@tonic-gate *value = (1 << i); 44730Sstevel@tonic-gate } 44740Sstevel@tonic-gate 44753448Sdh155122 /* Global init for all zones */ 44760Sstevel@tonic-gate void 44773448Sdh155122 ip_ire_g_init() 44780Sstevel@tonic-gate { 44790Sstevel@tonic-gate /* 44800Sstevel@tonic-gate * Create ire caches, ire_reclaim() 44810Sstevel@tonic-gate * will give IRE_CACHE back to system when needed. 44820Sstevel@tonic-gate * This needs to be done here before anything else, since 44830Sstevel@tonic-gate * ire_add() expects the cache to be created. 44840Sstevel@tonic-gate */ 44850Sstevel@tonic-gate ire_cache = kmem_cache_create("ire_cache", 44864714Ssowmini sizeof (ire_t), 0, ip_ire_constructor, 44874714Ssowmini ip_ire_destructor, ip_trash_ire_reclaim, NULL, NULL, 0); 44880Sstevel@tonic-gate 44893448Sdh155122 rt_entry_cache = kmem_cache_create("rt_entry", 44903448Sdh155122 sizeof (struct rt_entry), 0, NULL, NULL, NULL, NULL, NULL, 0); 44913448Sdh155122 44923448Sdh155122 /* 44933448Sdh155122 * Have radix code setup kmem caches etc. 
44943448Sdh155122 */ 44953448Sdh155122 rn_init(); 44963448Sdh155122 } 44973448Sdh155122 44983448Sdh155122 void 44993448Sdh155122 ip_ire_init(ip_stack_t *ipst) 45003448Sdh155122 { 45013448Sdh155122 int i; 45023448Sdh155122 uint32_t mem_cnt; 45033448Sdh155122 uint32_t cpu_cnt; 45043448Sdh155122 uint32_t min_cnt; 45053448Sdh155122 pgcnt_t mem_avail; 45063448Sdh155122 45073448Sdh155122 /* 45083448Sdh155122 * ip_ire_max_bucket_cnt is sized below based on the memory 45093448Sdh155122 * size and the cpu speed of the machine. This is upper 45103448Sdh155122 * bounded by the compile time value of ip_ire_max_bucket_cnt 45113448Sdh155122 * and is lower bounded by the compile time value of 45123448Sdh155122 * ip_ire_min_bucket_cnt. Similar logic applies to 45133448Sdh155122 * ip6_ire_max_bucket_cnt. 45143448Sdh155122 * 45153448Sdh155122 * We calculate this for each IP Instances in order to use 45163448Sdh155122 * the kmem_avail and ip_ire_{min,max}_bucket_cnt that are 45173448Sdh155122 * in effect when the zone is booted. 
45183448Sdh155122 */ 45193448Sdh155122 mem_avail = kmem_avail(); 45203448Sdh155122 mem_cnt = (mem_avail >> ip_ire_mem_ratio) / 45213448Sdh155122 ip_cache_table_size / sizeof (ire_t); 45223448Sdh155122 cpu_cnt = CPU->cpu_type_info.pi_clock >> ip_ire_cpu_ratio; 45233448Sdh155122 45243448Sdh155122 min_cnt = MIN(cpu_cnt, mem_cnt); 45253448Sdh155122 if (min_cnt < ip_ire_min_bucket_cnt) 45263448Sdh155122 min_cnt = ip_ire_min_bucket_cnt; 45273448Sdh155122 if (ip_ire_max_bucket_cnt > min_cnt) { 45283448Sdh155122 ip_ire_max_bucket_cnt = min_cnt; 45293448Sdh155122 } 45303448Sdh155122 45313448Sdh155122 mem_cnt = (mem_avail >> ip_ire_mem_ratio) / 45323448Sdh155122 ip6_cache_table_size / sizeof (ire_t); 45333448Sdh155122 min_cnt = MIN(cpu_cnt, mem_cnt); 45343448Sdh155122 if (min_cnt < ip6_ire_min_bucket_cnt) 45353448Sdh155122 min_cnt = ip6_ire_min_bucket_cnt; 45363448Sdh155122 if (ip6_ire_max_bucket_cnt > min_cnt) { 45373448Sdh155122 ip6_ire_max_bucket_cnt = min_cnt; 45383448Sdh155122 } 45393448Sdh155122 45403448Sdh155122 mutex_init(&ipst->ips_ire_ft_init_lock, NULL, MUTEX_DEFAULT, 0); 45413448Sdh155122 mutex_init(&ipst->ips_ire_handle_lock, NULL, MUTEX_DEFAULT, NULL); 45423448Sdh155122 45433448Sdh155122 (void) rn_inithead((void **)&ipst->ips_ip_ftable, 32); 45443448Sdh155122 45453448Sdh155122 /* Calculate the IPv4 cache table size. */ 45463448Sdh155122 ipst->ips_ip_cache_table_size = MAX(ip_cache_table_size, 45473448Sdh155122 ((mem_avail >> ip_ire_mem_ratio) / sizeof (ire_t) / 45483448Sdh155122 ip_ire_max_bucket_cnt)); 45493448Sdh155122 if (ipst->ips_ip_cache_table_size > ip_max_cache_table_size) 45503448Sdh155122 ipst->ips_ip_cache_table_size = ip_max_cache_table_size; 45513448Sdh155122 /* 45523448Sdh155122 * Make sure that the table size is always a power of 2. The 45533448Sdh155122 * hash macro IRE_ADDR_HASH() depends on that. 
45543448Sdh155122 */ 45553448Sdh155122 power2_roundup(&ipst->ips_ip_cache_table_size); 45563448Sdh155122 45573448Sdh155122 ipst->ips_ip_cache_table = kmem_zalloc(ipst->ips_ip_cache_table_size * 45583448Sdh155122 sizeof (irb_t), KM_SLEEP); 45593448Sdh155122 45603448Sdh155122 for (i = 0; i < ipst->ips_ip_cache_table_size; i++) { 45613448Sdh155122 rw_init(&ipst->ips_ip_cache_table[i].irb_lock, NULL, 45623448Sdh155122 RW_DEFAULT, NULL); 45633448Sdh155122 } 45643448Sdh155122 45653448Sdh155122 /* Calculate the IPv6 cache table size. */ 45663448Sdh155122 ipst->ips_ip6_cache_table_size = MAX(ip6_cache_table_size, 45673448Sdh155122 ((mem_avail >> ip_ire_mem_ratio) / sizeof (ire_t) / 45683448Sdh155122 ip6_ire_max_bucket_cnt)); 45693448Sdh155122 if (ipst->ips_ip6_cache_table_size > ip6_max_cache_table_size) 45703448Sdh155122 ipst->ips_ip6_cache_table_size = ip6_max_cache_table_size; 45713448Sdh155122 /* 45723448Sdh155122 * Make sure that the table size is always a power of 2. The 45733448Sdh155122 * hash macro IRE_ADDR_HASH_V6() depends on that. 45743448Sdh155122 */ 45753448Sdh155122 power2_roundup(&ipst->ips_ip6_cache_table_size); 45763448Sdh155122 45773448Sdh155122 ipst->ips_ip_cache_table_v6 = kmem_zalloc( 45783448Sdh155122 ipst->ips_ip6_cache_table_size * sizeof (irb_t), KM_SLEEP); 45793448Sdh155122 45803448Sdh155122 for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) { 45813448Sdh155122 rw_init(&ipst->ips_ip_cache_table_v6[i].irb_lock, NULL, 45823448Sdh155122 RW_DEFAULT, NULL); 45833448Sdh155122 } 45843448Sdh155122 45850Sstevel@tonic-gate /* 45860Sstevel@tonic-gate * Make sure that the forwarding table size is a power of 2. 45870Sstevel@tonic-gate * The IRE*_ADDR_HASH() macroes depend on that. 
45880Sstevel@tonic-gate */ 45893448Sdh155122 ipst->ips_ip6_ftable_hash_size = ip6_ftable_hash_size; 45903448Sdh155122 power2_roundup(&ipst->ips_ip6_ftable_hash_size); 45913448Sdh155122 45923448Sdh155122 ipst->ips_ire_handle = 1; 45933448Sdh155122 } 45943448Sdh155122 45953448Sdh155122 void 45963448Sdh155122 ip_ire_g_fini(void) 45973448Sdh155122 { 45983448Sdh155122 kmem_cache_destroy(ire_cache); 45993448Sdh155122 kmem_cache_destroy(rt_entry_cache); 46003448Sdh155122 46013448Sdh155122 rn_fini(); 46020Sstevel@tonic-gate } 46030Sstevel@tonic-gate 46040Sstevel@tonic-gate void 46053448Sdh155122 ip_ire_fini(ip_stack_t *ipst) 46060Sstevel@tonic-gate { 46070Sstevel@tonic-gate int i; 46080Sstevel@tonic-gate 46093448Sdh155122 /* 46103448Sdh155122 * Delete all IREs - assumes that the ill/ipifs have 46113448Sdh155122 * been removed so what remains are just the ftable and IRE_CACHE. 46123448Sdh155122 */ 46133448Sdh155122 ire_walk(ire_delete, NULL, ipst); 46143448Sdh155122 46153448Sdh155122 rn_freehead(ipst->ips_ip_ftable); 46163448Sdh155122 ipst->ips_ip_ftable = NULL; 46173448Sdh155122 46183448Sdh155122 mutex_destroy(&ipst->ips_ire_ft_init_lock); 46193448Sdh155122 mutex_destroy(&ipst->ips_ire_handle_lock); 46203448Sdh155122 46213448Sdh155122 for (i = 0; i < ipst->ips_ip_cache_table_size; i++) { 46223448Sdh155122 ASSERT(ipst->ips_ip_cache_table[i].irb_ire == NULL); 46233448Sdh155122 rw_destroy(&ipst->ips_ip_cache_table[i].irb_lock); 46243448Sdh155122 } 46253448Sdh155122 kmem_free(ipst->ips_ip_cache_table, 46263448Sdh155122 ipst->ips_ip_cache_table_size * sizeof (irb_t)); 46273448Sdh155122 ipst->ips_ip_cache_table = NULL; 46283448Sdh155122 46293448Sdh155122 for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) { 46303448Sdh155122 ASSERT(ipst->ips_ip_cache_table_v6[i].irb_ire == NULL); 46313448Sdh155122 rw_destroy(&ipst->ips_ip_cache_table_v6[i].irb_lock); 46323448Sdh155122 } 46333448Sdh155122 kmem_free(ipst->ips_ip_cache_table_v6, 46343448Sdh155122 ipst->ips_ip6_cache_table_size * 
sizeof (irb_t)); 46353448Sdh155122 ipst->ips_ip_cache_table_v6 = NULL; 46363448Sdh155122 46373448Sdh155122 for (i = 0; i < IP6_MASK_TABLE_SIZE; i++) { 46383448Sdh155122 irb_t *ptr; 46393448Sdh155122 int j; 46403448Sdh155122 46413448Sdh155122 if ((ptr = ipst->ips_ip_forwarding_table_v6[i]) == NULL) 46423448Sdh155122 continue; 46433448Sdh155122 46443448Sdh155122 for (j = 0; j < ipst->ips_ip6_ftable_hash_size; j++) { 46453448Sdh155122 ASSERT(ptr[j].irb_ire == NULL); 46463448Sdh155122 rw_destroy(&ptr[j].irb_lock); 46473448Sdh155122 } 46483448Sdh155122 mi_free(ptr); 46493448Sdh155122 ipst->ips_ip_forwarding_table_v6[i] = NULL; 46503448Sdh155122 } 46510Sstevel@tonic-gate } 46520Sstevel@tonic-gate 46530Sstevel@tonic-gate /* 46540Sstevel@tonic-gate * Check if another multirt route resolution is needed. 46550Sstevel@tonic-gate * B_TRUE is returned is there remain a resolvable route, 46560Sstevel@tonic-gate * or if no route for that dst is resolved yet. 46570Sstevel@tonic-gate * B_FALSE is returned if all routes for that dst are resolved 46580Sstevel@tonic-gate * or if the remaining unresolved routes are actually not 46590Sstevel@tonic-gate * resolvable. 46600Sstevel@tonic-gate * This only works in the global zone. 
46610Sstevel@tonic-gate */ 46620Sstevel@tonic-gate boolean_t 46633448Sdh155122 ire_multirt_need_resolve(ipaddr_t dst, const ts_label_t *tsl, ip_stack_t *ipst) 46640Sstevel@tonic-gate { 46650Sstevel@tonic-gate ire_t *first_fire; 46660Sstevel@tonic-gate ire_t *first_cire; 46670Sstevel@tonic-gate ire_t *fire; 46680Sstevel@tonic-gate ire_t *cire; 46690Sstevel@tonic-gate irb_t *firb; 46700Sstevel@tonic-gate irb_t *cirb; 46710Sstevel@tonic-gate int unres_cnt = 0; 46720Sstevel@tonic-gate boolean_t resolvable = B_FALSE; 46730Sstevel@tonic-gate 46740Sstevel@tonic-gate /* Retrieve the first IRE_HOST that matches the destination */ 46750Sstevel@tonic-gate first_fire = ire_ftable_lookup(dst, IP_HOST_MASK, 0, IRE_HOST, NULL, 46761676Sjpk NULL, ALL_ZONES, 0, tsl, 46773448Sdh155122 MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_SECATTR, ipst); 46780Sstevel@tonic-gate 46790Sstevel@tonic-gate /* No route at all */ 46800Sstevel@tonic-gate if (first_fire == NULL) { 46810Sstevel@tonic-gate return (B_TRUE); 46820Sstevel@tonic-gate } 46830Sstevel@tonic-gate 46840Sstevel@tonic-gate firb = first_fire->ire_bucket; 46850Sstevel@tonic-gate ASSERT(firb != NULL); 46860Sstevel@tonic-gate 46870Sstevel@tonic-gate /* Retrieve the first IRE_CACHE ire for that destination. */ 46883448Sdh155122 first_cire = ire_cache_lookup(dst, GLOBAL_ZONEID, tsl, ipst); 46890Sstevel@tonic-gate 46900Sstevel@tonic-gate /* No resolved route. */ 46910Sstevel@tonic-gate if (first_cire == NULL) { 46920Sstevel@tonic-gate ire_refrele(first_fire); 46930Sstevel@tonic-gate return (B_TRUE); 46940Sstevel@tonic-gate } 46950Sstevel@tonic-gate 46960Sstevel@tonic-gate /* 46970Sstevel@tonic-gate * At least one route is resolved. Here we look through the forward 46980Sstevel@tonic-gate * and cache tables, to compare the number of declared routes 46990Sstevel@tonic-gate * with the number of resolved routes. 
The search for a resolvable 47000Sstevel@tonic-gate * route is performed only if at least one route remains 47010Sstevel@tonic-gate * unresolved. 47020Sstevel@tonic-gate */ 47030Sstevel@tonic-gate cirb = first_cire->ire_bucket; 47040Sstevel@tonic-gate ASSERT(cirb != NULL); 47050Sstevel@tonic-gate 47060Sstevel@tonic-gate /* Count the number of routes to that dest that are declared. */ 47070Sstevel@tonic-gate IRB_REFHOLD(firb); 47080Sstevel@tonic-gate for (fire = first_fire; fire != NULL; fire = fire->ire_next) { 47090Sstevel@tonic-gate if (!(fire->ire_flags & RTF_MULTIRT)) 47100Sstevel@tonic-gate continue; 47110Sstevel@tonic-gate if (fire->ire_addr != dst) 47120Sstevel@tonic-gate continue; 47130Sstevel@tonic-gate unres_cnt++; 47140Sstevel@tonic-gate } 47150Sstevel@tonic-gate IRB_REFRELE(firb); 47160Sstevel@tonic-gate 47170Sstevel@tonic-gate /* Then subtract the number of routes to that dst that are resolved */ 47180Sstevel@tonic-gate IRB_REFHOLD(cirb); 47190Sstevel@tonic-gate for (cire = first_cire; cire != NULL; cire = cire->ire_next) { 47200Sstevel@tonic-gate if (!(cire->ire_flags & RTF_MULTIRT)) 47210Sstevel@tonic-gate continue; 47220Sstevel@tonic-gate if (cire->ire_addr != dst) 47230Sstevel@tonic-gate continue; 47248485SPeter.Memishian@Sun.COM if (cire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_TESTHIDDEN)) 47250Sstevel@tonic-gate continue; 47260Sstevel@tonic-gate unres_cnt--; 47270Sstevel@tonic-gate } 47280Sstevel@tonic-gate IRB_REFRELE(cirb); 47290Sstevel@tonic-gate 47300Sstevel@tonic-gate /* At least one route is unresolved; search for a resolvable route. 
*/ 47310Sstevel@tonic-gate if (unres_cnt > 0) 47320Sstevel@tonic-gate resolvable = ire_multirt_lookup(&first_cire, &first_fire, 47333448Sdh155122 MULTIRT_USESTAMP | MULTIRT_CACHEGW, tsl, ipst); 47340Sstevel@tonic-gate 47350Sstevel@tonic-gate if (first_fire != NULL) 47360Sstevel@tonic-gate ire_refrele(first_fire); 47370Sstevel@tonic-gate 47380Sstevel@tonic-gate if (first_cire != NULL) 47390Sstevel@tonic-gate ire_refrele(first_cire); 47400Sstevel@tonic-gate 47410Sstevel@tonic-gate return (resolvable); 47420Sstevel@tonic-gate } 47430Sstevel@tonic-gate 47440Sstevel@tonic-gate /* 47450Sstevel@tonic-gate * Explore a forward_table bucket, starting from fire_arg. 47460Sstevel@tonic-gate * fire_arg MUST be an IRE_HOST entry. 47470Sstevel@tonic-gate * 47480Sstevel@tonic-gate * Return B_TRUE and update *ire_arg and *fire_arg 47490Sstevel@tonic-gate * if at least one resolvable route is found. *ire_arg 47500Sstevel@tonic-gate * is the IRE entry for *fire_arg's gateway. 47510Sstevel@tonic-gate * 47520Sstevel@tonic-gate * Return B_FALSE otherwise (all routes are resolved or 47530Sstevel@tonic-gate * the remaining unresolved routes are all unresolvable). 47540Sstevel@tonic-gate * 47550Sstevel@tonic-gate * The IRE selection relies on a priority mechanism 47560Sstevel@tonic-gate * driven by the flags passed in by the caller. 47570Sstevel@tonic-gate * The caller, such as ip_newroute_ipif(), can get the most 47580Sstevel@tonic-gate * relevant ire at each stage of a multiple route resolution. 47590Sstevel@tonic-gate * 47600Sstevel@tonic-gate * The rules are: 47610Sstevel@tonic-gate * 47620Sstevel@tonic-gate * - if MULTIRT_CACHEGW is specified in flags, IRE_CACHETABLE 47630Sstevel@tonic-gate * ires are preferred for the gateway. This gives the highest 47640Sstevel@tonic-gate * priority to routes that can be resolved without using 47650Sstevel@tonic-gate * a resolver. 
47660Sstevel@tonic-gate * 47670Sstevel@tonic-gate * - if MULTIRT_CACHEGW is not specified, or if MULTIRT_CACHEGW 47680Sstevel@tonic-gate * is specified but no IRE_CACHETABLE ire entry for the gateway 47690Sstevel@tonic-gate * is found, the following rules apply. 47700Sstevel@tonic-gate * 47710Sstevel@tonic-gate * - if MULTIRT_USESTAMP is specified in flags, IRE_INTERFACE 47720Sstevel@tonic-gate * ires for the gateway, that have not been tried since 47730Sstevel@tonic-gate * a configurable amount of time, are preferred. 47740Sstevel@tonic-gate * This applies when a resolver must be invoked for 47750Sstevel@tonic-gate * a missing route, but we don't want to use the resolver 47760Sstevel@tonic-gate * upon each packet emission. If no such resolver is found, 47770Sstevel@tonic-gate * B_FALSE is returned. 47780Sstevel@tonic-gate * The MULTIRT_USESTAMP flag can be combined with 47790Sstevel@tonic-gate * MULTIRT_CACHEGW. 47800Sstevel@tonic-gate * 47810Sstevel@tonic-gate * - if MULTIRT_USESTAMP is not specified in flags, the first 47820Sstevel@tonic-gate * unresolved but resolvable route is selected. 47830Sstevel@tonic-gate * 47840Sstevel@tonic-gate * - Otherwise, there is no resolvalble route, and 47850Sstevel@tonic-gate * B_FALSE is returned. 47860Sstevel@tonic-gate * 47870Sstevel@tonic-gate * At last, MULTIRT_SETSTAMP can be specified in flags to 47880Sstevel@tonic-gate * request the timestamp of unresolvable routes to 47890Sstevel@tonic-gate * be refreshed. This prevents the useless exploration 47900Sstevel@tonic-gate * of those routes for a while, when MULTIRT_USESTAMP is used. 47910Sstevel@tonic-gate * 47920Sstevel@tonic-gate * This only works in the global zone. 
47930Sstevel@tonic-gate */ 47940Sstevel@tonic-gate boolean_t 47951676Sjpk ire_multirt_lookup(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags, 47963448Sdh155122 const ts_label_t *tsl, ip_stack_t *ipst) 47970Sstevel@tonic-gate { 47980Sstevel@tonic-gate clock_t delta; 47990Sstevel@tonic-gate ire_t *best_fire = NULL; 48000Sstevel@tonic-gate ire_t *best_cire = NULL; 48010Sstevel@tonic-gate ire_t *first_fire; 48020Sstevel@tonic-gate ire_t *first_cire; 48030Sstevel@tonic-gate ire_t *fire; 48040Sstevel@tonic-gate ire_t *cire; 48050Sstevel@tonic-gate irb_t *firb = NULL; 48060Sstevel@tonic-gate irb_t *cirb = NULL; 48070Sstevel@tonic-gate ire_t *gw_ire; 48080Sstevel@tonic-gate boolean_t already_resolved; 48090Sstevel@tonic-gate boolean_t res; 48100Sstevel@tonic-gate ipaddr_t dst; 48110Sstevel@tonic-gate ipaddr_t gw; 48120Sstevel@tonic-gate 48130Sstevel@tonic-gate ip2dbg(("ire_multirt_lookup: *ire_arg %p, *fire_arg %p, flags %04x\n", 48140Sstevel@tonic-gate (void *)*ire_arg, (void *)*fire_arg, flags)); 48150Sstevel@tonic-gate 48160Sstevel@tonic-gate ASSERT(ire_arg != NULL); 48170Sstevel@tonic-gate ASSERT(fire_arg != NULL); 48180Sstevel@tonic-gate 48190Sstevel@tonic-gate /* Not an IRE_HOST ire; give up. */ 48200Sstevel@tonic-gate if ((*fire_arg == NULL) || ((*fire_arg)->ire_type != IRE_HOST)) { 48210Sstevel@tonic-gate return (B_FALSE); 48220Sstevel@tonic-gate } 48230Sstevel@tonic-gate 48240Sstevel@tonic-gate /* This is the first IRE_HOST ire for that destination. 
*/ 48250Sstevel@tonic-gate first_fire = *fire_arg; 48260Sstevel@tonic-gate firb = first_fire->ire_bucket; 48270Sstevel@tonic-gate ASSERT(firb != NULL); 48280Sstevel@tonic-gate 48290Sstevel@tonic-gate dst = first_fire->ire_addr; 48300Sstevel@tonic-gate 48310Sstevel@tonic-gate ip2dbg(("ire_multirt_lookup: dst %08x\n", ntohl(dst))); 48320Sstevel@tonic-gate 48330Sstevel@tonic-gate /* 48340Sstevel@tonic-gate * Retrieve the first IRE_CACHE ire for that destination; 48350Sstevel@tonic-gate * if we don't find one, no route for that dest is 48360Sstevel@tonic-gate * resolved yet. 48370Sstevel@tonic-gate */ 48383448Sdh155122 first_cire = ire_cache_lookup(dst, GLOBAL_ZONEID, tsl, ipst); 48390Sstevel@tonic-gate if (first_cire != NULL) { 48400Sstevel@tonic-gate cirb = first_cire->ire_bucket; 48410Sstevel@tonic-gate } 48420Sstevel@tonic-gate 48430Sstevel@tonic-gate ip2dbg(("ire_multirt_lookup: first_cire %p\n", (void *)first_cire)); 48440Sstevel@tonic-gate 48450Sstevel@tonic-gate /* 48460Sstevel@tonic-gate * Search for a resolvable route, giving the top priority 48470Sstevel@tonic-gate * to routes that can be resolved without any call to the resolver. 48480Sstevel@tonic-gate */ 48490Sstevel@tonic-gate IRB_REFHOLD(firb); 48500Sstevel@tonic-gate 48510Sstevel@tonic-gate if (!CLASSD(dst)) { 48520Sstevel@tonic-gate /* 48530Sstevel@tonic-gate * For all multiroute IRE_HOST ires for that destination, 48540Sstevel@tonic-gate * check if the route via the IRE_HOST's gateway is 48550Sstevel@tonic-gate * resolved yet. 
48560Sstevel@tonic-gate */ 48570Sstevel@tonic-gate for (fire = first_fire; fire != NULL; fire = fire->ire_next) { 48580Sstevel@tonic-gate 48590Sstevel@tonic-gate if (!(fire->ire_flags & RTF_MULTIRT)) 48600Sstevel@tonic-gate continue; 48610Sstevel@tonic-gate if (fire->ire_addr != dst) 48620Sstevel@tonic-gate continue; 48630Sstevel@tonic-gate 48641676Sjpk if (fire->ire_gw_secattr != NULL && 48651676Sjpk tsol_ire_match_gwattr(fire, tsl) != 0) { 48661676Sjpk continue; 48671676Sjpk } 48681676Sjpk 48690Sstevel@tonic-gate gw = fire->ire_gateway_addr; 48700Sstevel@tonic-gate 48710Sstevel@tonic-gate ip2dbg(("ire_multirt_lookup: fire %p, " 48720Sstevel@tonic-gate "ire_addr %08x, ire_gateway_addr %08x\n", 48730Sstevel@tonic-gate (void *)fire, ntohl(fire->ire_addr), ntohl(gw))); 48740Sstevel@tonic-gate 48750Sstevel@tonic-gate already_resolved = B_FALSE; 48760Sstevel@tonic-gate 48770Sstevel@tonic-gate if (first_cire != NULL) { 48780Sstevel@tonic-gate ASSERT(cirb != NULL); 48790Sstevel@tonic-gate 48800Sstevel@tonic-gate IRB_REFHOLD(cirb); 48810Sstevel@tonic-gate /* 48820Sstevel@tonic-gate * For all IRE_CACHE ires for that 48830Sstevel@tonic-gate * destination. 
48840Sstevel@tonic-gate */ 48850Sstevel@tonic-gate for (cire = first_cire; 48860Sstevel@tonic-gate cire != NULL; 48870Sstevel@tonic-gate cire = cire->ire_next) { 48880Sstevel@tonic-gate 48890Sstevel@tonic-gate if (!(cire->ire_flags & RTF_MULTIRT)) 48900Sstevel@tonic-gate continue; 48910Sstevel@tonic-gate if (cire->ire_addr != dst) 48920Sstevel@tonic-gate continue; 48930Sstevel@tonic-gate if (cire->ire_marks & 48940Sstevel@tonic-gate (IRE_MARK_CONDEMNED | 48958485SPeter.Memishian@Sun.COM IRE_MARK_TESTHIDDEN)) 48960Sstevel@tonic-gate continue; 48971676Sjpk 48981676Sjpk if (cire->ire_gw_secattr != NULL && 48991676Sjpk tsol_ire_match_gwattr(cire, 49001676Sjpk tsl) != 0) { 49011676Sjpk continue; 49021676Sjpk } 49031676Sjpk 49040Sstevel@tonic-gate /* 49050Sstevel@tonic-gate * Check if the IRE_CACHE's gateway 49060Sstevel@tonic-gate * matches the IRE_HOST's gateway. 49070Sstevel@tonic-gate */ 49080Sstevel@tonic-gate if (cire->ire_gateway_addr == gw) { 49090Sstevel@tonic-gate already_resolved = B_TRUE; 49100Sstevel@tonic-gate break; 49110Sstevel@tonic-gate } 49120Sstevel@tonic-gate } 49130Sstevel@tonic-gate IRB_REFRELE(cirb); 49140Sstevel@tonic-gate } 49150Sstevel@tonic-gate 49160Sstevel@tonic-gate /* 49170Sstevel@tonic-gate * This route is already resolved; 49180Sstevel@tonic-gate * proceed with next one. 49190Sstevel@tonic-gate */ 49200Sstevel@tonic-gate if (already_resolved) { 49210Sstevel@tonic-gate ip2dbg(("ire_multirt_lookup: found cire %p, " 49220Sstevel@tonic-gate "already resolved\n", (void *)cire)); 49230Sstevel@tonic-gate continue; 49240Sstevel@tonic-gate } 49250Sstevel@tonic-gate 49260Sstevel@tonic-gate /* 49270Sstevel@tonic-gate * The route is unresolved; is it actually 49280Sstevel@tonic-gate * resolvable, i.e. is there a cache or a resolver 49290Sstevel@tonic-gate * for the gateway? 
49300Sstevel@tonic-gate */ 49310Sstevel@tonic-gate gw_ire = ire_route_lookup(gw, 0, 0, 0, NULL, NULL, 49321676Sjpk ALL_ZONES, tsl, 49333448Sdh155122 MATCH_IRE_RECURSIVE | MATCH_IRE_SECATTR, ipst); 49340Sstevel@tonic-gate 49350Sstevel@tonic-gate ip2dbg(("ire_multirt_lookup: looked up gw_ire %p\n", 49360Sstevel@tonic-gate (void *)gw_ire)); 49370Sstevel@tonic-gate 49380Sstevel@tonic-gate /* 49390Sstevel@tonic-gate * If gw_ire is typed IRE_CACHETABLE, 49400Sstevel@tonic-gate * this route can be resolved without any call to the 49410Sstevel@tonic-gate * resolver. If the MULTIRT_CACHEGW flag is set, 49420Sstevel@tonic-gate * give the top priority to this ire and exit the 49430Sstevel@tonic-gate * loop. 49440Sstevel@tonic-gate * This is typically the case when an ARP reply 49450Sstevel@tonic-gate * is processed through ip_wput_nondata(). 49460Sstevel@tonic-gate */ 49470Sstevel@tonic-gate if ((flags & MULTIRT_CACHEGW) && 49480Sstevel@tonic-gate (gw_ire != NULL) && 49490Sstevel@tonic-gate (gw_ire->ire_type & IRE_CACHETABLE)) { 49502535Ssangeeta ASSERT(gw_ire->ire_nce == NULL || 49512535Ssangeeta gw_ire->ire_nce->nce_state == ND_REACHABLE); 49520Sstevel@tonic-gate /* 49530Sstevel@tonic-gate * Release the resolver associated to the 49540Sstevel@tonic-gate * previous candidate best ire, if any. 
49550Sstevel@tonic-gate */ 49560Sstevel@tonic-gate if (best_cire != NULL) { 49570Sstevel@tonic-gate ire_refrele(best_cire); 49580Sstevel@tonic-gate ASSERT(best_fire != NULL); 49590Sstevel@tonic-gate } 49600Sstevel@tonic-gate 49610Sstevel@tonic-gate best_fire = fire; 49620Sstevel@tonic-gate best_cire = gw_ire; 49630Sstevel@tonic-gate 49640Sstevel@tonic-gate ip2dbg(("ire_multirt_lookup: found top prio " 49650Sstevel@tonic-gate "best_fire %p, best_cire %p\n", 49660Sstevel@tonic-gate (void *)best_fire, (void *)best_cire)); 49670Sstevel@tonic-gate break; 49680Sstevel@tonic-gate } 49690Sstevel@tonic-gate 49700Sstevel@tonic-gate /* 49710Sstevel@tonic-gate * Compute the time elapsed since our preceding 49720Sstevel@tonic-gate * attempt to resolve that route. 49730Sstevel@tonic-gate * If the MULTIRT_USESTAMP flag is set, we take that 49740Sstevel@tonic-gate * route into account only if this time interval 49750Sstevel@tonic-gate * exceeds ip_multirt_resolution_interval; 49760Sstevel@tonic-gate * this prevents us from attempting to resolve a 49770Sstevel@tonic-gate * broken route upon each sending of a packet. 
49780Sstevel@tonic-gate */ 49790Sstevel@tonic-gate delta = lbolt - fire->ire_last_used_time; 49800Sstevel@tonic-gate delta = TICK_TO_MSEC(delta); 49810Sstevel@tonic-gate 49823448Sdh155122 res = (boolean_t)((delta > 49834714Ssowmini ipst->ips_ip_multirt_resolution_interval) || 49844714Ssowmini (!(flags & MULTIRT_USESTAMP))); 49850Sstevel@tonic-gate 49860Sstevel@tonic-gate ip2dbg(("ire_multirt_lookup: fire %p, delta %lu, " 49870Sstevel@tonic-gate "res %d\n", 49880Sstevel@tonic-gate (void *)fire, delta, res)); 49890Sstevel@tonic-gate 49900Sstevel@tonic-gate if (res) { 49910Sstevel@tonic-gate /* 49920Sstevel@tonic-gate * We are here if MULTIRT_USESTAMP flag is set 49930Sstevel@tonic-gate * and the resolver for fire's gateway 49940Sstevel@tonic-gate * has not been tried since 49950Sstevel@tonic-gate * ip_multirt_resolution_interval, or if 49960Sstevel@tonic-gate * MULTIRT_USESTAMP is not set but gw_ire did 49970Sstevel@tonic-gate * not fill the conditions for MULTIRT_CACHEGW, 49980Sstevel@tonic-gate * or if neither MULTIRT_USESTAMP nor 49990Sstevel@tonic-gate * MULTIRT_CACHEGW are set. 
50000Sstevel@tonic-gate */ 50010Sstevel@tonic-gate if (gw_ire != NULL) { 50020Sstevel@tonic-gate if (best_fire == NULL) { 50030Sstevel@tonic-gate ASSERT(best_cire == NULL); 50040Sstevel@tonic-gate 50050Sstevel@tonic-gate best_fire = fire; 50060Sstevel@tonic-gate best_cire = gw_ire; 50070Sstevel@tonic-gate 50080Sstevel@tonic-gate ip2dbg(("ire_multirt_lookup:" 50090Sstevel@tonic-gate "found candidate " 50100Sstevel@tonic-gate "best_fire %p, " 50110Sstevel@tonic-gate "best_cire %p\n", 50120Sstevel@tonic-gate (void *)best_fire, 50130Sstevel@tonic-gate (void *)best_cire)); 50140Sstevel@tonic-gate 50150Sstevel@tonic-gate /* 50160Sstevel@tonic-gate * If MULTIRT_CACHEGW is not 50170Sstevel@tonic-gate * set, we ignore the top 50180Sstevel@tonic-gate * priority ires that can 50190Sstevel@tonic-gate * be resolved without any 50200Sstevel@tonic-gate * call to the resolver; 50210Sstevel@tonic-gate * In that case, there is 50220Sstevel@tonic-gate * actually no need 50230Sstevel@tonic-gate * to continue the loop. 50240Sstevel@tonic-gate */ 50250Sstevel@tonic-gate if (!(flags & 50260Sstevel@tonic-gate MULTIRT_CACHEGW)) { 50270Sstevel@tonic-gate break; 50280Sstevel@tonic-gate } 50290Sstevel@tonic-gate continue; 50300Sstevel@tonic-gate } 50310Sstevel@tonic-gate } else { 50320Sstevel@tonic-gate /* 50330Sstevel@tonic-gate * No resolver for the gateway: the 50340Sstevel@tonic-gate * route is not resolvable. 50350Sstevel@tonic-gate * If the MULTIRT_SETSTAMP flag is 50360Sstevel@tonic-gate * set, we stamp the IRE_HOST ire, 50370Sstevel@tonic-gate * so we will not select it again 50380Sstevel@tonic-gate * during this resolution interval. 
50390Sstevel@tonic-gate */ 50400Sstevel@tonic-gate if (flags & MULTIRT_SETSTAMP) 50410Sstevel@tonic-gate fire->ire_last_used_time = 50420Sstevel@tonic-gate lbolt; 50430Sstevel@tonic-gate } 50440Sstevel@tonic-gate } 50450Sstevel@tonic-gate 50460Sstevel@tonic-gate if (gw_ire != NULL) 50470Sstevel@tonic-gate ire_refrele(gw_ire); 50480Sstevel@tonic-gate } 50490Sstevel@tonic-gate } else { /* CLASSD(dst) */ 50500Sstevel@tonic-gate 50510Sstevel@tonic-gate for (fire = first_fire; 50520Sstevel@tonic-gate fire != NULL; 50530Sstevel@tonic-gate fire = fire->ire_next) { 50540Sstevel@tonic-gate 50550Sstevel@tonic-gate if (!(fire->ire_flags & RTF_MULTIRT)) 50560Sstevel@tonic-gate continue; 50570Sstevel@tonic-gate if (fire->ire_addr != dst) 50580Sstevel@tonic-gate continue; 50590Sstevel@tonic-gate 50601676Sjpk if (fire->ire_gw_secattr != NULL && 50611676Sjpk tsol_ire_match_gwattr(fire, tsl) != 0) { 50621676Sjpk continue; 50631676Sjpk } 50641676Sjpk 50650Sstevel@tonic-gate already_resolved = B_FALSE; 50660Sstevel@tonic-gate 50670Sstevel@tonic-gate gw = fire->ire_gateway_addr; 50680Sstevel@tonic-gate 50690Sstevel@tonic-gate gw_ire = ire_ftable_lookup(gw, 0, 0, IRE_INTERFACE, 50701676Sjpk NULL, NULL, ALL_ZONES, 0, tsl, 50711676Sjpk MATCH_IRE_RECURSIVE | MATCH_IRE_TYPE | 50723448Sdh155122 MATCH_IRE_SECATTR, ipst); 50730Sstevel@tonic-gate 50740Sstevel@tonic-gate /* No resolver for the gateway; we skip this ire. */ 50750Sstevel@tonic-gate if (gw_ire == NULL) { 50760Sstevel@tonic-gate continue; 50770Sstevel@tonic-gate } 50782535Ssangeeta ASSERT(gw_ire->ire_nce == NULL || 50792535Ssangeeta gw_ire->ire_nce->nce_state == ND_REACHABLE); 50800Sstevel@tonic-gate 50810Sstevel@tonic-gate if (first_cire != NULL) { 50820Sstevel@tonic-gate 50830Sstevel@tonic-gate IRB_REFHOLD(cirb); 50840Sstevel@tonic-gate /* 50850Sstevel@tonic-gate * For all IRE_CACHE ires for that 50860Sstevel@tonic-gate * destination. 
50870Sstevel@tonic-gate */ 50880Sstevel@tonic-gate for (cire = first_cire; 50890Sstevel@tonic-gate cire != NULL; 50900Sstevel@tonic-gate cire = cire->ire_next) { 50910Sstevel@tonic-gate 50920Sstevel@tonic-gate if (!(cire->ire_flags & RTF_MULTIRT)) 50930Sstevel@tonic-gate continue; 50940Sstevel@tonic-gate if (cire->ire_addr != dst) 50950Sstevel@tonic-gate continue; 50960Sstevel@tonic-gate if (cire->ire_marks & 50970Sstevel@tonic-gate (IRE_MARK_CONDEMNED | 50988485SPeter.Memishian@Sun.COM IRE_MARK_TESTHIDDEN)) 50990Sstevel@tonic-gate continue; 51000Sstevel@tonic-gate 51011676Sjpk if (cire->ire_gw_secattr != NULL && 51021676Sjpk tsol_ire_match_gwattr(cire, 51031676Sjpk tsl) != 0) { 51041676Sjpk continue; 51051676Sjpk } 51061676Sjpk 51070Sstevel@tonic-gate /* 51080Sstevel@tonic-gate * Cache entries are linked to the 51090Sstevel@tonic-gate * parent routes using the parent handle 51100Sstevel@tonic-gate * (ire_phandle). If no cache entry has 51110Sstevel@tonic-gate * the same handle as fire, fire is 51120Sstevel@tonic-gate * still unresolved. 51130Sstevel@tonic-gate */ 51140Sstevel@tonic-gate ASSERT(cire->ire_phandle != 0); 51150Sstevel@tonic-gate if (cire->ire_phandle == 51160Sstevel@tonic-gate fire->ire_phandle) { 51170Sstevel@tonic-gate already_resolved = B_TRUE; 51180Sstevel@tonic-gate break; 51190Sstevel@tonic-gate } 51200Sstevel@tonic-gate } 51210Sstevel@tonic-gate IRB_REFRELE(cirb); 51220Sstevel@tonic-gate } 51230Sstevel@tonic-gate 51240Sstevel@tonic-gate /* 51250Sstevel@tonic-gate * This route is already resolved; proceed with 51260Sstevel@tonic-gate * next one. 51270Sstevel@tonic-gate */ 51280Sstevel@tonic-gate if (already_resolved) { 51290Sstevel@tonic-gate ire_refrele(gw_ire); 51300Sstevel@tonic-gate continue; 51310Sstevel@tonic-gate } 51320Sstevel@tonic-gate 51330Sstevel@tonic-gate /* 51340Sstevel@tonic-gate * Compute the time elapsed since our preceding 51350Sstevel@tonic-gate * attempt to resolve that route. 
51360Sstevel@tonic-gate * If the MULTIRT_USESTAMP flag is set, we take 51370Sstevel@tonic-gate * that route into account only if this time 51380Sstevel@tonic-gate * interval exceeds ip_multirt_resolution_interval; 51390Sstevel@tonic-gate * this prevents us from attempting to resolve a 51400Sstevel@tonic-gate * broken route upon each sending of a packet. 51410Sstevel@tonic-gate */ 51420Sstevel@tonic-gate delta = lbolt - fire->ire_last_used_time; 51430Sstevel@tonic-gate delta = TICK_TO_MSEC(delta); 51440Sstevel@tonic-gate 51453448Sdh155122 res = (boolean_t)((delta > 51464714Ssowmini ipst->ips_ip_multirt_resolution_interval) || 51474714Ssowmini (!(flags & MULTIRT_USESTAMP))); 51480Sstevel@tonic-gate 51490Sstevel@tonic-gate ip3dbg(("ire_multirt_lookup: fire %p, delta %lx, " 51500Sstevel@tonic-gate "flags %04x, res %d\n", 51510Sstevel@tonic-gate (void *)fire, delta, flags, res)); 51520Sstevel@tonic-gate 51530Sstevel@tonic-gate if (res) { 51540Sstevel@tonic-gate if (best_cire != NULL) { 51550Sstevel@tonic-gate /* 51560Sstevel@tonic-gate * Release the resolver associated 51570Sstevel@tonic-gate * to the preceding candidate best 51580Sstevel@tonic-gate * ire, if any. 51590Sstevel@tonic-gate */ 51600Sstevel@tonic-gate ire_refrele(best_cire); 51610Sstevel@tonic-gate ASSERT(best_fire != NULL); 51620Sstevel@tonic-gate } 51630Sstevel@tonic-gate best_fire = fire; 51640Sstevel@tonic-gate best_cire = gw_ire; 51650Sstevel@tonic-gate continue; 51660Sstevel@tonic-gate } 51670Sstevel@tonic-gate 51680Sstevel@tonic-gate ire_refrele(gw_ire); 51690Sstevel@tonic-gate } 51700Sstevel@tonic-gate } 51710Sstevel@tonic-gate 51720Sstevel@tonic-gate if (best_fire != NULL) { 51730Sstevel@tonic-gate IRE_REFHOLD(best_fire); 51740Sstevel@tonic-gate } 51750Sstevel@tonic-gate IRB_REFRELE(firb); 51760Sstevel@tonic-gate 51770Sstevel@tonic-gate /* Release the first IRE_CACHE we initially looked up, if any. 
*/ 51780Sstevel@tonic-gate if (first_cire != NULL) 51790Sstevel@tonic-gate ire_refrele(first_cire); 51800Sstevel@tonic-gate 51810Sstevel@tonic-gate /* Found a resolvable route. */ 51820Sstevel@tonic-gate if (best_fire != NULL) { 51830Sstevel@tonic-gate ASSERT(best_cire != NULL); 51840Sstevel@tonic-gate 51850Sstevel@tonic-gate if (*fire_arg != NULL) 51860Sstevel@tonic-gate ire_refrele(*fire_arg); 51870Sstevel@tonic-gate if (*ire_arg != NULL) 51880Sstevel@tonic-gate ire_refrele(*ire_arg); 51890Sstevel@tonic-gate 51900Sstevel@tonic-gate /* 51910Sstevel@tonic-gate * Update the passed-in arguments with the 51920Sstevel@tonic-gate * resolvable multirt route we found. 51930Sstevel@tonic-gate */ 51940Sstevel@tonic-gate *fire_arg = best_fire; 51950Sstevel@tonic-gate *ire_arg = best_cire; 51960Sstevel@tonic-gate 51970Sstevel@tonic-gate ip2dbg(("ire_multirt_lookup: returning B_TRUE, " 51980Sstevel@tonic-gate "*fire_arg %p, *ire_arg %p\n", 51990Sstevel@tonic-gate (void *)best_fire, (void *)best_cire)); 52000Sstevel@tonic-gate 52010Sstevel@tonic-gate return (B_TRUE); 52020Sstevel@tonic-gate } 52030Sstevel@tonic-gate 52040Sstevel@tonic-gate ASSERT(best_cire == NULL); 52050Sstevel@tonic-gate 52060Sstevel@tonic-gate ip2dbg(("ire_multirt_lookup: returning B_FALSE, *fire_arg %p, " 52070Sstevel@tonic-gate "*ire_arg %p\n", 52080Sstevel@tonic-gate (void *)*fire_arg, (void *)*ire_arg)); 52090Sstevel@tonic-gate 52100Sstevel@tonic-gate /* No resolvable route. */ 52110Sstevel@tonic-gate return (B_FALSE); 52120Sstevel@tonic-gate } 52130Sstevel@tonic-gate 52140Sstevel@tonic-gate /* 52150Sstevel@tonic-gate * IRE iterator for inbound and loopback broadcast processing. 52160Sstevel@tonic-gate * Given an IRE_BROADCAST ire, walk the ires with the same destination 52170Sstevel@tonic-gate * address, but skip over the passed-in ire. Returns the next ire without 52180Sstevel@tonic-gate * a hold - assumes that the caller holds a reference on the IRE bucket. 
52190Sstevel@tonic-gate */ 52200Sstevel@tonic-gate ire_t * 52210Sstevel@tonic-gate ire_get_next_bcast_ire(ire_t *curr, ire_t *ire) 52220Sstevel@tonic-gate { 52230Sstevel@tonic-gate ill_t *ill; 52240Sstevel@tonic-gate 52250Sstevel@tonic-gate if (curr == NULL) { 52260Sstevel@tonic-gate for (curr = ire->ire_bucket->irb_ire; curr != NULL; 52270Sstevel@tonic-gate curr = curr->ire_next) { 52280Sstevel@tonic-gate if (curr->ire_addr == ire->ire_addr) 52290Sstevel@tonic-gate break; 52300Sstevel@tonic-gate } 52310Sstevel@tonic-gate } else { 52320Sstevel@tonic-gate curr = curr->ire_next; 52330Sstevel@tonic-gate } 52340Sstevel@tonic-gate ill = ire_to_ill(ire); 52350Sstevel@tonic-gate for (; curr != NULL; curr = curr->ire_next) { 52360Sstevel@tonic-gate if (curr->ire_addr != ire->ire_addr) { 52370Sstevel@tonic-gate /* 52380Sstevel@tonic-gate * All the IREs to a given destination are contiguous; 52390Sstevel@tonic-gate * break out once the address doesn't match. 52400Sstevel@tonic-gate */ 52410Sstevel@tonic-gate break; 52420Sstevel@tonic-gate } 52430Sstevel@tonic-gate if (curr == ire) { 52440Sstevel@tonic-gate /* skip over the passed-in ire */ 52450Sstevel@tonic-gate continue; 52460Sstevel@tonic-gate } 52470Sstevel@tonic-gate if ((curr->ire_stq != NULL && ire->ire_stq == NULL) || 52480Sstevel@tonic-gate (curr->ire_stq == NULL && ire->ire_stq != NULL)) { 52490Sstevel@tonic-gate /* 52500Sstevel@tonic-gate * If the passed-in ire is loopback, skip over 52510Sstevel@tonic-gate * non-loopback ires and vice versa. 
52520Sstevel@tonic-gate */ 52530Sstevel@tonic-gate continue; 52540Sstevel@tonic-gate } 52550Sstevel@tonic-gate if (ire_to_ill(curr) != ill) { 52560Sstevel@tonic-gate /* skip over IREs going through a different interface */ 52570Sstevel@tonic-gate continue; 52580Sstevel@tonic-gate } 52590Sstevel@tonic-gate if (curr->ire_marks & IRE_MARK_CONDEMNED) { 52600Sstevel@tonic-gate /* skip over deleted IREs */ 52610Sstevel@tonic-gate continue; 52620Sstevel@tonic-gate } 52630Sstevel@tonic-gate return (curr); 52640Sstevel@tonic-gate } 52650Sstevel@tonic-gate return (NULL); 52660Sstevel@tonic-gate } 52670Sstevel@tonic-gate 52685023Scarlsonj #ifdef DEBUG 52690Sstevel@tonic-gate void 52700Sstevel@tonic-gate ire_trace_ref(ire_t *ire) 52710Sstevel@tonic-gate { 52720Sstevel@tonic-gate mutex_enter(&ire->ire_lock); 52735023Scarlsonj if (ire->ire_trace_disable) { 52740Sstevel@tonic-gate mutex_exit(&ire->ire_lock); 52750Sstevel@tonic-gate return; 52760Sstevel@tonic-gate } 52775023Scarlsonj 52785023Scarlsonj if (th_trace_ref(ire, ire->ire_ipst)) { 52795023Scarlsonj mutex_exit(&ire->ire_lock); 52805023Scarlsonj } else { 52815023Scarlsonj ire->ire_trace_disable = B_TRUE; 52825023Scarlsonj mutex_exit(&ire->ire_lock); 52835023Scarlsonj ire_trace_cleanup(ire); 52840Sstevel@tonic-gate } 52850Sstevel@tonic-gate } 52860Sstevel@tonic-gate 52870Sstevel@tonic-gate void 52880Sstevel@tonic-gate ire_untrace_ref(ire_t *ire) 52890Sstevel@tonic-gate { 52900Sstevel@tonic-gate mutex_enter(&ire->ire_lock); 52915023Scarlsonj if (!ire->ire_trace_disable) 52925023Scarlsonj th_trace_unref(ire); 52930Sstevel@tonic-gate mutex_exit(&ire->ire_lock); 52940Sstevel@tonic-gate } 52950Sstevel@tonic-gate 52960Sstevel@tonic-gate static void 52975023Scarlsonj ire_trace_cleanup(const ire_t *ire) 52980Sstevel@tonic-gate { 52995023Scarlsonj th_trace_cleanup(ire, ire->ire_trace_disable); 53000Sstevel@tonic-gate } 53015023Scarlsonj #endif /* DEBUG */ 53022535Ssangeeta 53032535Ssangeeta /* 53042535Ssangeeta * Generate a message 
chain with an arp request to resolve the in_ire. 53052535Ssangeeta * It is assumed that in_ire itself is currently in the ire cache table, 53062535Ssangeeta * so we create a fake_ire filled with enough information about ire_addr etc. 53072535Ssangeeta * to retrieve in_ire when the DL_UNITDATA response from the resolver 53082535Ssangeeta * comes back. The fake_ire itself is created by calling esballoc with 53092535Ssangeeta * the fr_rtnp (free routine) set to ire_freemblk. This routine will be 53102535Ssangeeta * invoked when the mblk containing fake_ire is freed. 53112535Ssangeeta */ 53122535Ssangeeta void 53138485SPeter.Memishian@Sun.COM ire_arpresolve(ire_t *in_ire) 53142535Ssangeeta { 53152535Ssangeeta areq_t *areq; 53162535Ssangeeta ipaddr_t *addrp; 53174714Ssowmini mblk_t *ire_mp, *areq_mp; 53182535Ssangeeta ire_t *ire, *buf; 53192535Ssangeeta size_t bufsize; 53202535Ssangeeta frtn_t *frtnp; 53218485SPeter.Memishian@Sun.COM ill_t *dst_ill; 53228485SPeter.Memishian@Sun.COM ip_stack_t *ipst; 53238485SPeter.Memishian@Sun.COM 53248485SPeter.Memishian@Sun.COM ASSERT(in_ire->ire_nce != NULL); 53258485SPeter.Memishian@Sun.COM 53268485SPeter.Memishian@Sun.COM dst_ill = ire_to_ill(in_ire); 53278485SPeter.Memishian@Sun.COM ipst = dst_ill->ill_ipst; 53282535Ssangeeta 53292535Ssangeeta /* 53302535Ssangeeta * Construct message chain for the resolver 53312535Ssangeeta * of the form: 53322535Ssangeeta * ARP_REQ_MBLK-->IRE_MBLK 53332535Ssangeeta * 53342535Ssangeeta * NOTE : If the response does not 53352535Ssangeeta * come back, ARP frees the packet. For this reason, 53362535Ssangeeta * we can't REFHOLD the bucket of save_ire to prevent 53372535Ssangeeta * deletions. We may not be able to REFRELE the bucket 53382535Ssangeeta * if the response never comes back. Thus, before 53392535Ssangeeta * adding the ire, ire_add_v4 will make sure that the 53402535Ssangeeta * interface route does not get deleted. 
This is the 53412535Ssangeeta * only case unlike ip_newroute_v6, ip_newroute_ipif_v6 53422535Ssangeeta * where we can always prevent deletions because of 53432535Ssangeeta * the synchronous nature of adding IRES i.e 53442535Ssangeeta * ire_add_then_send is called after creating the IRE. 53452535Ssangeeta */ 53462535Ssangeeta 53472535Ssangeeta /* 53488485SPeter.Memishian@Sun.COM * We use esballoc to allocate the second part (IRE_MBLK) 53498485SPeter.Memishian@Sun.COM * of the message chain depicted above. This mblk will be freed 53508485SPeter.Memishian@Sun.COM * by arp when there is a timeout, and otherwise passed to IP 53518485SPeter.Memishian@Sun.COM * and IP will free it after processing the ARP response. 53522535Ssangeeta */ 53532535Ssangeeta 53542535Ssangeeta bufsize = sizeof (ire_t) + sizeof (frtn_t); 53552535Ssangeeta buf = kmem_alloc(bufsize, KM_NOSLEEP); 53562535Ssangeeta if (buf == NULL) { 53578485SPeter.Memishian@Sun.COM ip1dbg(("ire_arpresolve: alloc buffer failed\n")); 53582535Ssangeeta return; 53592535Ssangeeta } 53602535Ssangeeta frtnp = (frtn_t *)(buf + 1); 53612535Ssangeeta frtnp->free_arg = (caddr_t)buf; 53622535Ssangeeta frtnp->free_func = ire_freemblk; 53632535Ssangeeta 53642535Ssangeeta ire_mp = esballoc((unsigned char *)buf, bufsize, BPRI_MED, frtnp); 53652535Ssangeeta if (ire_mp == NULL) { 53662535Ssangeeta ip1dbg(("ire_arpresolve: esballoc failed\n")); 53672535Ssangeeta kmem_free(buf, bufsize); 53682535Ssangeeta return; 53692535Ssangeeta } 53708485SPeter.Memishian@Sun.COM 53714714Ssowmini areq_mp = copyb(dst_ill->ill_resolver_mp); 53724714Ssowmini if (areq_mp == NULL) { 53738485SPeter.Memishian@Sun.COM freemsg(ire_mp); 53742535Ssangeeta return; 53752535Ssangeeta } 53762535Ssangeeta 53772535Ssangeeta ire_mp->b_datap->db_type = IRE_ARPRESOLVE_TYPE; 53782535Ssangeeta ire = (ire_t *)buf; 53792535Ssangeeta /* 53802535Ssangeeta * keep enough info in the fake ire so that we can pull up 53812535Ssangeeta * the incomplete ire (in_ire) after result 
comes back from 53822535Ssangeeta * arp and make it complete. 53832535Ssangeeta */ 53842535Ssangeeta *ire = ire_null; 53852535Ssangeeta ire->ire_u = in_ire->ire_u; 53862535Ssangeeta ire->ire_ipif_seqid = in_ire->ire_ipif_seqid; 53877880SJonathan.Anderson@Sun.COM ire->ire_ipif_ifindex = in_ire->ire_ipif_ifindex; 53882535Ssangeeta ire->ire_ipif = in_ire->ire_ipif; 53898485SPeter.Memishian@Sun.COM ire->ire_stq = dst_ill->ill_wq; 53908485SPeter.Memishian@Sun.COM ire->ire_stq_ifindex = dst_ill->ill_phyint->phyint_ifindex; 53912535Ssangeeta ire->ire_zoneid = in_ire->ire_zoneid; 53927558SSowmini.Varadhan@Sun.COM ire->ire_stackid = ipst->ips_netstack->netstack_stackid; 53933448Sdh155122 ire->ire_ipst = ipst; 53943448Sdh155122 53952535Ssangeeta /* 53962535Ssangeeta * ire_freemblk will be called when ire_mp is freed, both for 53972535Ssangeeta * successful and failed arp resolution. IRE_MARK_UNCACHED will be set 53982535Ssangeeta * when the arp resolution failed. 53992535Ssangeeta */ 54002535Ssangeeta ire->ire_marks |= IRE_MARK_UNCACHED; 54012535Ssangeeta ire->ire_mp = ire_mp; 54022535Ssangeeta ire_mp->b_wptr = (uchar_t *)&ire[1]; 54032535Ssangeeta ire_mp->b_cont = NULL; 54044714Ssowmini linkb(areq_mp, ire_mp); 54052535Ssangeeta 54062535Ssangeeta /* 54072535Ssangeeta * Fill in the source and dest addrs for the resolver. 54082535Ssangeeta * NOTE: this depends on memory layouts imposed by 54092535Ssangeeta * ill_init(). 54102535Ssangeeta */ 54114714Ssowmini areq = (areq_t *)areq_mp->b_rptr; 54122535Ssangeeta addrp = (ipaddr_t *)((char *)areq + areq->areq_sender_addr_offset); 54132535Ssangeeta *addrp = ire->ire_src_addr; 54142535Ssangeeta 54152535Ssangeeta addrp = (ipaddr_t *)((char *)areq + areq->areq_target_addr_offset); 54162535Ssangeeta if (ire->ire_gateway_addr != INADDR_ANY) { 54172535Ssangeeta *addrp = ire->ire_gateway_addr; 54182535Ssangeeta } else { 54192535Ssangeeta *addrp = ire->ire_addr; 54202535Ssangeeta } 54212535Ssangeeta 54222535Ssangeeta /* Up to the resolver. 
*/ 54232535Ssangeeta if (canputnext(dst_ill->ill_rq)) { 54244714Ssowmini putnext(dst_ill->ill_rq, areq_mp); 54252535Ssangeeta } else { 54264714Ssowmini freemsg(areq_mp); 54272535Ssangeeta } 54282535Ssangeeta } 54292535Ssangeeta 54302535Ssangeeta /* 54312535Ssangeeta * Esballoc free function for AR_ENTRY_QUERY request to clean up any 54322535Ssangeeta * unresolved ire_t and/or nce_t structures when ARP resolution fails. 54332535Ssangeeta * 54342535Ssangeeta * This function can be called by ARP via free routine for ire_mp or 54352535Ssangeeta * by IPv4(both host and forwarding path) via ire_delete 54362535Ssangeeta * in case ARP resolution fails. 54372535Ssangeeta * NOTE: Since IP is MT, ARP can call into IP but not vice versa 54382535Ssangeeta * (for IP to talk to ARP, it still has to send AR* messages). 54392535Ssangeeta * 54402535Ssangeeta * Note that the ARP/IP merge should replace the functioanlity by providing 54412535Ssangeeta * direct function calls to clean up unresolved entries in ire/nce lists. 54422535Ssangeeta */ 54432535Ssangeeta void 54442535Ssangeeta ire_freemblk(ire_t *ire_mp) 54452535Ssangeeta { 54462535Ssangeeta nce_t *nce = NULL; 54472535Ssangeeta ill_t *ill; 54483448Sdh155122 ip_stack_t *ipst; 54497558SSowmini.Varadhan@Sun.COM netstack_t *ns = NULL; 54502535Ssangeeta 54512535Ssangeeta ASSERT(ire_mp != NULL); 54522535Ssangeeta 54532535Ssangeeta if ((ire_mp->ire_addr == NULL) && (ire_mp->ire_gateway_addr == NULL)) { 54542535Ssangeeta ip1dbg(("ire_freemblk(0x%p) ire_addr is NULL\n", 54552535Ssangeeta (void *)ire_mp)); 54562535Ssangeeta goto cleanup; 54572535Ssangeeta } 54582535Ssangeeta if ((ire_mp->ire_marks & IRE_MARK_UNCACHED) == 0) { 54592535Ssangeeta goto cleanup; /* everything succeeded. just free and return */ 54602535Ssangeeta } 54612535Ssangeeta 54622535Ssangeeta /* 54632535Ssangeeta * the arp information corresponding to this ire_mp was not 54647558SSowmini.Varadhan@Sun.COM * transferred to an ire_cache entry. 
Need 54652535Ssangeeta * to clean up incomplete ire's and nce, if necessary. 54662535Ssangeeta */ 54672535Ssangeeta ASSERT(ire_mp->ire_stq != NULL); 54682535Ssangeeta ASSERT(ire_mp->ire_stq_ifindex != 0); 54693448Sdh155122 ASSERT(ire_mp->ire_ipst != NULL); 54703448Sdh155122 54717558SSowmini.Varadhan@Sun.COM ns = netstack_find_by_stackid(ire_mp->ire_stackid); 54727558SSowmini.Varadhan@Sun.COM ipst = (ns ? ns->netstack_ip : NULL); 54737558SSowmini.Varadhan@Sun.COM if (ipst == NULL || ipst != ire_mp->ire_ipst) /* Disapeared on us */ 54747558SSowmini.Varadhan@Sun.COM goto cleanup; 54753448Sdh155122 54762535Ssangeeta /* 54772535Ssangeeta * Get any nce's corresponding to this ire_mp. We first have to 54782535Ssangeeta * make sure that the ill is still around. 54792535Ssangeeta */ 54803448Sdh155122 ill = ill_lookup_on_ifindex(ire_mp->ire_stq_ifindex, 54813448Sdh155122 B_FALSE, NULL, NULL, NULL, NULL, ipst); 54822535Ssangeeta if (ill == NULL || (ire_mp->ire_stq != ill->ill_wq) || 54832535Ssangeeta (ill->ill_state_flags & ILL_CONDEMNED)) { 54842535Ssangeeta /* 54852535Ssangeeta * ill went away. no nce to clean up. 54862535Ssangeeta * Note that the ill_state_flags could be set to 54872535Ssangeeta * ILL_CONDEMNED after this point, but if we know 54882535Ssangeeta * that it is CONDEMNED now, we just bail out quickly. 54892535Ssangeeta */ 54902535Ssangeeta if (ill != NULL) 54912535Ssangeeta ill_refrele(ill); 54922535Ssangeeta goto cleanup; 54932535Ssangeeta } 54942535Ssangeeta nce = ndp_lookup_v4(ill, 54952535Ssangeeta ((ire_mp->ire_gateway_addr != INADDR_ANY) ? 54962535Ssangeeta &ire_mp->ire_gateway_addr : &ire_mp->ire_addr), 54972535Ssangeeta B_FALSE); 54982535Ssangeeta ill_refrele(ill); 54992535Ssangeeta 55002535Ssangeeta if ((nce != NULL) && (nce->nce_state != ND_REACHABLE)) { 55012535Ssangeeta /* 55022535Ssangeeta * some incomplete nce was found. 
55032535Ssangeeta */ 55042535Ssangeeta DTRACE_PROBE2(ire__freemblk__arp__resolv__fail, 55052535Ssangeeta nce_t *, nce, ire_t *, ire_mp); 55062535Ssangeeta /* 55072535Ssangeeta * Send the icmp_unreachable messages for the queued mblks in 55082535Ssangeeta * ire->ire_nce->nce_qd_mp, since ARP resolution failed 55092535Ssangeeta * for this ire 55102535Ssangeeta */ 55112535Ssangeeta arp_resolv_failed(nce); 55122535Ssangeeta /* 55132535Ssangeeta * Delete the nce and clean up all ire's pointing at this nce 55142535Ssangeeta * in the cachetable 55152535Ssangeeta */ 55162535Ssangeeta ndp_delete(nce); 55172535Ssangeeta } 55182535Ssangeeta if (nce != NULL) 55192535Ssangeeta NCE_REFRELE(nce); /* release the ref taken by ndp_lookup_v4 */ 55202535Ssangeeta 55212535Ssangeeta cleanup: 55227558SSowmini.Varadhan@Sun.COM if (ns != NULL) 55237558SSowmini.Varadhan@Sun.COM netstack_rele(ns); 55242535Ssangeeta /* 55252535Ssangeeta * Get rid of the ire buffer 55262535Ssangeeta * We call kmem_free here(instead of ire_delete()), since 55272535Ssangeeta * this is the freeb's callback. 55282535Ssangeeta */ 55292535Ssangeeta kmem_free(ire_mp, sizeof (ire_t) + sizeof (frtn_t)); 55302535Ssangeeta } 55312535Ssangeeta 55323772Ssangeeta /* 55334714Ssowmini * find, or create if needed, a neighbor cache entry nce_t for IRE_CACHE and 55344714Ssowmini * non-loopback IRE_BROADCAST ire's. 55354714Ssowmini * 55364714Ssowmini * If a neighbor-cache entry has to be created (i.e., one does not already 55374714Ssowmini * exist in the nce list) the nce_res_mp and nce_state of the neighbor cache 55384714Ssowmini * entry are initialized in ndp_add_v4(). These values are picked from 55394714Ssowmini * the src_nce, if one is passed in. Otherwise (if src_nce == NULL) the 55404714Ssowmini * ire->ire_type and the outgoing interface (ire_to_ill(ire)) values 55414714Ssowmini * determine the {nce_state, nce_res_mp} of the nce_t created. 
All 55424714Ssowmini * IRE_BROADCAST entries have nce_state = ND_REACHABLE, and the nce_res_mp 55434714Ssowmini * is set to the ill_bcast_mp of the outgoing inerface. For unicast ire 55444714Ssowmini * entries, 55454714Ssowmini * - if the outgoing interface is of type IRE_IF_RESOLVER, a newly created 55464714Ssowmini * nce_t will have a null nce_res_mp, and will be in the ND_INITIAL state. 55474714Ssowmini * - if the outgoing interface is a IRE_IF_NORESOLVER interface, no link 55484714Ssowmini * layer resolution is necessary, so that the nce_t will be in the 55494714Ssowmini * ND_REACHABLE state and the nce_res_mp will have a copy of the 55504714Ssowmini * ill_resolver_mp of the outgoing interface. 55514714Ssowmini * 55524714Ssowmini * The link layer information needed for broadcast addresses, and for 55534714Ssowmini * packets sent on IRE_IF_NORESOLVER interfaces is a constant mapping that 55544714Ssowmini * never needs re-verification for the lifetime of the nce_t. These are 55554714Ssowmini * therefore marked NCE_F_PERMANENT, and never allowed to expire via 55564714Ssowmini * NCE_EXPIRED. 55574714Ssowmini * 55584714Ssowmini * IRE_CACHE ire's contain the information for the nexthop (ire_gateway_addr) 55594714Ssowmini * in the case of indirect routes, and for the dst itself (ire_addr) in the 55602535Ssangeeta * case of direct routes, with the nce_res_mp containing a template 55612535Ssangeeta * DL_UNITDATA request. 55622535Ssangeeta * 55632535Ssangeeta * The actual association of the ire_nce to the nce created here is 55642535Ssangeeta * typically done in ire_add_v4 for IRE_CACHE entries. Exceptions 55652535Ssangeeta * to this rule are SO_DONTROUTE ire's (IRE_MARK_NO_ADD), for which 55664823Sseb * the ire_nce assignment is done in ire_add_then_send. 
55672535Ssangeeta */ 55682535Ssangeeta int 55694714Ssowmini ire_nce_init(ire_t *ire, nce_t *src_nce) 55702535Ssangeeta { 55714714Ssowmini in_addr_t addr4; 55722535Ssangeeta int err; 55734714Ssowmini nce_t *nce = NULL; 55742535Ssangeeta ill_t *ire_ill; 55754714Ssowmini uint16_t nce_flags = 0; 55763448Sdh155122 ip_stack_t *ipst; 55772535Ssangeeta 55784714Ssowmini if (ire->ire_stq == NULL) 55792535Ssangeeta return (0); /* no need to create nce for local/loopback */ 55804714Ssowmini 55812535Ssangeeta switch (ire->ire_type) { 55822535Ssangeeta case IRE_CACHE: 55832535Ssangeeta if (ire->ire_gateway_addr != INADDR_ANY) 55842535Ssangeeta addr4 = ire->ire_gateway_addr; /* 'G' route */ 55852535Ssangeeta else 55862535Ssangeeta addr4 = ire->ire_addr; /* direct route */ 55872535Ssangeeta break; 55882535Ssangeeta case IRE_BROADCAST: 55892535Ssangeeta addr4 = ire->ire_addr; 55904714Ssowmini nce_flags |= (NCE_F_PERMANENT|NCE_F_BCAST); 55912535Ssangeeta break; 55922535Ssangeeta default: 55932535Ssangeeta return (0); 55942535Ssangeeta } 55952535Ssangeeta 55962535Ssangeeta /* 55972535Ssangeeta * ire_ipif is picked based on RTF_SETSRC, usesrc etc. 55982535Ssangeeta * rules in ire_forward_src_ipif. We want the dlureq_mp 55992535Ssangeeta * for the outgoing interface, which we get from the ire_stq. 56002535Ssangeeta */ 56012535Ssangeeta ire_ill = ire_to_ill(ire); 56023448Sdh155122 ipst = ire_ill->ill_ipst; 56032535Ssangeeta 56042535Ssangeeta /* 56054714Ssowmini * IRE_IF_NORESOLVER entries never need re-verification and 56064714Ssowmini * do not expire, so we mark them as NCE_F_PERMANENT. 
56072535Ssangeeta */ 56084714Ssowmini if (ire_ill->ill_net_type == IRE_IF_NORESOLVER) 56094714Ssowmini nce_flags |= NCE_F_PERMANENT; 56102535Ssangeeta 56114084Ssowmini retry_nce: 56124714Ssowmini err = ndp_lookup_then_add_v4(ire_ill, &addr4, nce_flags, 56134714Ssowmini &nce, src_nce); 56144714Ssowmini 56154714Ssowmini if (err == EEXIST && NCE_EXPIRED(nce, ipst)) { 56164084Ssowmini /* 56174084Ssowmini * We looked up an expired nce. 56184084Ssowmini * Go back and try to create one again. 56194084Ssowmini */ 56204714Ssowmini ndp_delete(nce); 56214714Ssowmini NCE_REFRELE(nce); 56224714Ssowmini nce = NULL; 56234084Ssowmini goto retry_nce; 56244084Ssowmini } 56254084Ssowmini 56264714Ssowmini ip1dbg(("ire 0x%p addr 0x%lx type 0x%x; found nce 0x%p err %d\n", 56274714Ssowmini (void *)ire, (ulong_t)addr4, ire->ire_type, (void *)nce, err)); 56282535Ssangeeta 56292535Ssangeeta switch (err) { 56302535Ssangeeta case 0: 56312535Ssangeeta case EEXIST: 56322535Ssangeeta /* 56334714Ssowmini * return a pointer to a newly created or existing nce_t; 56342535Ssangeeta * note that the ire-nce mapping is many-one, i.e., 56354714Ssowmini * multiple ire's could point to the same nce_t. 56362535Ssangeeta */ 56372535Ssangeeta break; 56382535Ssangeeta default: 56392535Ssangeeta DTRACE_PROBE2(nce__init__fail, ill_t *, ire_ill, int, err); 56402535Ssangeeta return (EINVAL); 56412535Ssangeeta } 56422535Ssangeeta if (ire->ire_type == IRE_BROADCAST) { 56432535Ssangeeta /* 56442535Ssangeeta * Two bcast ires are created for each interface; 56452535Ssangeeta * 1. loopback copy (which does not have an 56462535Ssangeeta * ire_stq, and therefore has no ire_nce), and, 56472535Ssangeeta * 2. the non-loopback copy, which has the nce_res_mp 56482535Ssangeeta * initialized to a copy of the ill_bcast_mp, and 56492535Ssangeeta * is marked as ND_REACHABLE at this point. 56502535Ssangeeta * This nce does not undergo any further state changes, 56512535Ssangeeta * and exists as long as the interface is plumbed. 
56528485SPeter.Memishian@Sun.COM * Note: the assignment of ire_nce here is a historical 56538485SPeter.Memishian@Sun.COM * artifact of old code that used to inline ire_add(). 56542535Ssangeeta */ 56554714Ssowmini ire->ire_nce = nce; 56562535Ssangeeta /* 56572535Ssangeeta * We are associating this nce to the ire, 56582535Ssangeeta * so change the nce ref taken in 56592535Ssangeeta * ndp_lookup_then_add_v4() from 56602535Ssangeeta * NCE_REFHOLD to NCE_REFHOLD_NOTR 56612535Ssangeeta */ 56622535Ssangeeta NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce); 56632535Ssangeeta } else { 56644084Ssowmini /* 56654084Ssowmini * We are not using this nce_t just yet so release 56664084Ssowmini * the ref taken in ndp_lookup_then_add_v4() 56674084Ssowmini */ 56684714Ssowmini NCE_REFRELE(nce); 56692535Ssangeeta } 56702535Ssangeeta return (0); 56712535Ssangeeta } 56727880SJonathan.Anderson@Sun.COM 56737880SJonathan.Anderson@Sun.COM /* 56747880SJonathan.Anderson@Sun.COM * This is the implementation of the IPv4 IRE cache lookup procedure. 56757880SJonathan.Anderson@Sun.COM * Separating the interface from the implementation allows additional 56767880SJonathan.Anderson@Sun.COM * flexibility when specifying search criteria. 
56777880SJonathan.Anderson@Sun.COM */ 56787880SJonathan.Anderson@Sun.COM static ire_t * 56797880SJonathan.Anderson@Sun.COM ip4_ctable_lookup_impl(ire_ctable_args_t *margs) 56807880SJonathan.Anderson@Sun.COM { 56817880SJonathan.Anderson@Sun.COM irb_t *irb_ptr; 56827880SJonathan.Anderson@Sun.COM ire_t *ire; 56837880SJonathan.Anderson@Sun.COM ip_stack_t *ipst = margs->ict_ipst; 56847880SJonathan.Anderson@Sun.COM 56858485SPeter.Memishian@Sun.COM if ((margs->ict_flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && 56867880SJonathan.Anderson@Sun.COM (margs->ict_ipif == NULL)) { 56877880SJonathan.Anderson@Sun.COM return (NULL); 56887880SJonathan.Anderson@Sun.COM } 56897880SJonathan.Anderson@Sun.COM 56907880SJonathan.Anderson@Sun.COM irb_ptr = &ipst->ips_ip_cache_table[IRE_ADDR_HASH( 56917880SJonathan.Anderson@Sun.COM *((ipaddr_t *)margs->ict_addr), ipst->ips_ip_cache_table_size)]; 56927880SJonathan.Anderson@Sun.COM rw_enter(&irb_ptr->irb_lock, RW_READER); 56937880SJonathan.Anderson@Sun.COM for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { 56947880SJonathan.Anderson@Sun.COM if (ire->ire_marks & IRE_MARK_CONDEMNED) 56957880SJonathan.Anderson@Sun.COM continue; 56967880SJonathan.Anderson@Sun.COM ASSERT(ire->ire_mask == IP_HOST_MASK); 56977880SJonathan.Anderson@Sun.COM if (ire_match_args(ire, *((ipaddr_t *)margs->ict_addr), 56987880SJonathan.Anderson@Sun.COM ire->ire_mask, *((ipaddr_t *)margs->ict_gateway), 56997880SJonathan.Anderson@Sun.COM margs->ict_type, margs->ict_ipif, margs->ict_zoneid, 0, 57007880SJonathan.Anderson@Sun.COM margs->ict_tsl, margs->ict_flags, margs->ict_wq)) { 57017880SJonathan.Anderson@Sun.COM IRE_REFHOLD(ire); 57027880SJonathan.Anderson@Sun.COM rw_exit(&irb_ptr->irb_lock); 57037880SJonathan.Anderson@Sun.COM return (ire); 57047880SJonathan.Anderson@Sun.COM } 57057880SJonathan.Anderson@Sun.COM } 57067880SJonathan.Anderson@Sun.COM 57077880SJonathan.Anderson@Sun.COM rw_exit(&irb_ptr->irb_lock); 57087880SJonathan.Anderson@Sun.COM return (NULL); 
57097880SJonathan.Anderson@Sun.COM } 57107880SJonathan.Anderson@Sun.COM 57117880SJonathan.Anderson@Sun.COM /* 57127880SJonathan.Anderson@Sun.COM * This function locates IRE_CACHE entries which were added by the 57137880SJonathan.Anderson@Sun.COM * ire_forward() path. We can fully specify the IRE we are looking for by 57148485SPeter.Memishian@Sun.COM * providing the ipif (MATCH_IRE_IPIF) *and* the stq (MATCH_IRE_WQ). 57157880SJonathan.Anderson@Sun.COM */ 57167880SJonathan.Anderson@Sun.COM ire_t * 57177880SJonathan.Anderson@Sun.COM ire_arpresolve_lookup(ipaddr_t addr, ipaddr_t gw, ipif_t *ipif, 57187880SJonathan.Anderson@Sun.COM zoneid_t zoneid, ip_stack_t *ipst, queue_t *wq) 57197880SJonathan.Anderson@Sun.COM { 57207880SJonathan.Anderson@Sun.COM ire_ctable_args_t margs; 57217880SJonathan.Anderson@Sun.COM 57227880SJonathan.Anderson@Sun.COM margs.ict_addr = &addr; 57237880SJonathan.Anderson@Sun.COM margs.ict_gateway = &gw; 57247880SJonathan.Anderson@Sun.COM margs.ict_type = IRE_CACHE; 57257880SJonathan.Anderson@Sun.COM margs.ict_ipif = ipif; 57267880SJonathan.Anderson@Sun.COM margs.ict_zoneid = zoneid; 57277880SJonathan.Anderson@Sun.COM margs.ict_tsl = NULL; 57287880SJonathan.Anderson@Sun.COM margs.ict_flags = MATCH_IRE_GW | MATCH_IRE_IPIF | MATCH_IRE_ZONEONLY | 57297880SJonathan.Anderson@Sun.COM MATCH_IRE_TYPE | MATCH_IRE_WQ; 57307880SJonathan.Anderson@Sun.COM margs.ict_ipst = ipst; 57317880SJonathan.Anderson@Sun.COM margs.ict_wq = wq; 57327880SJonathan.Anderson@Sun.COM 57337880SJonathan.Anderson@Sun.COM return (ip4_ctable_lookup_impl(&margs)); 57347880SJonathan.Anderson@Sun.COM } 5735