10Sstevel@tonic-gate /* 20Sstevel@tonic-gate * CDDL HEADER START 30Sstevel@tonic-gate * 40Sstevel@tonic-gate * The contents of this file are subject to the terms of the 51676Sjpk * Common Development and Distribution License (the "License"). 61676Sjpk * You may not use this file except in compliance with the License. 70Sstevel@tonic-gate * 80Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 90Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing. 100Sstevel@tonic-gate * See the License for the specific language governing permissions 110Sstevel@tonic-gate * and limitations under the License. 120Sstevel@tonic-gate * 130Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each 140Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 150Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the 160Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying 170Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner] 180Sstevel@tonic-gate * 190Sstevel@tonic-gate * CDDL HEADER END 200Sstevel@tonic-gate */ 210Sstevel@tonic-gate /* 2211457SErik.Nordmark@Sun.COM * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 230Sstevel@tonic-gate * Use is subject to license terms. 240Sstevel@tonic-gate */ 250Sstevel@tonic-gate /* Copyright (c) 1990 Mentat Inc. */ 260Sstevel@tonic-gate 270Sstevel@tonic-gate /* 280Sstevel@tonic-gate * This file contains routines that manipulate Internet Routing Entries (IREs). 
290Sstevel@tonic-gate */ 300Sstevel@tonic-gate 310Sstevel@tonic-gate #include <sys/types.h> 320Sstevel@tonic-gate #include <sys/stream.h> 330Sstevel@tonic-gate #include <sys/stropts.h> 348485SPeter.Memishian@Sun.COM #include <sys/strsun.h> 358778SErik.Nordmark@Sun.COM #include <sys/strsubr.h> 360Sstevel@tonic-gate #include <sys/ddi.h> 370Sstevel@tonic-gate #include <sys/cmn_err.h> 380Sstevel@tonic-gate #include <sys/policy.h> 390Sstevel@tonic-gate 400Sstevel@tonic-gate #include <sys/systm.h> 410Sstevel@tonic-gate #include <sys/kmem.h> 420Sstevel@tonic-gate #include <sys/param.h> 430Sstevel@tonic-gate #include <sys/socket.h> 440Sstevel@tonic-gate #include <net/if.h> 450Sstevel@tonic-gate #include <net/route.h> 460Sstevel@tonic-gate #include <netinet/in.h> 470Sstevel@tonic-gate #include <net/if_dl.h> 480Sstevel@tonic-gate #include <netinet/ip6.h> 490Sstevel@tonic-gate #include <netinet/icmp6.h> 500Sstevel@tonic-gate 510Sstevel@tonic-gate #include <inet/common.h> 520Sstevel@tonic-gate #include <inet/mi.h> 530Sstevel@tonic-gate #include <inet/ip.h> 540Sstevel@tonic-gate #include <inet/ip6.h> 550Sstevel@tonic-gate #include <inet/ip_ndp.h> 562535Ssangeeta #include <inet/arp.h> 570Sstevel@tonic-gate #include <inet/ip_if.h> 580Sstevel@tonic-gate #include <inet/ip_ire.h> 592535Ssangeeta #include <inet/ip_ftable.h> 600Sstevel@tonic-gate #include <inet/ip_rts.h> 610Sstevel@tonic-gate #include <inet/nd.h> 620Sstevel@tonic-gate 630Sstevel@tonic-gate #include <inet/tcp.h> 640Sstevel@tonic-gate #include <inet/ipclassifier.h> 650Sstevel@tonic-gate #include <sys/zone.h> 663448Sdh155122 #include <sys/cpuvar.h> 673448Sdh155122 681676Sjpk #include <sys/tsol/label.h> 691676Sjpk #include <sys/tsol/tnet.h> 701676Sjpk 712535Ssangeeta struct kmem_cache *rt_entry_cache; 722535Ssangeeta 7311042SErik.Nordmark@Sun.COM typedef struct nce_clookup_s { 7411042SErik.Nordmark@Sun.COM ipaddr_t ncecl_addr; 7511042SErik.Nordmark@Sun.COM boolean_t ncecl_found; 7611042SErik.Nordmark@Sun.COM } 
nce_clookup_t; 7711042SErik.Nordmark@Sun.COM 780Sstevel@tonic-gate /* 790Sstevel@tonic-gate * Synchronization notes: 800Sstevel@tonic-gate * 810Sstevel@tonic-gate * The fields of the ire_t struct are protected in the following way : 820Sstevel@tonic-gate * 830Sstevel@tonic-gate * ire_next/ire_ptpn 840Sstevel@tonic-gate * 8511042SErik.Nordmark@Sun.COM * - bucket lock of the forwarding table in which is ire stored. 860Sstevel@tonic-gate * 8711042SErik.Nordmark@Sun.COM * ire_ill, ire_u *except* ire_gateway_addr[v6], ire_mask, 8811042SErik.Nordmark@Sun.COM * ire_type, ire_create_time, ire_masklen, ire_ipversion, ire_flags, 8911042SErik.Nordmark@Sun.COM * ire_bucket 900Sstevel@tonic-gate * 910Sstevel@tonic-gate * - Set in ire_create_v4/v6 and never changes after that. Thus, 920Sstevel@tonic-gate * we don't need a lock whenever these fields are accessed. 930Sstevel@tonic-gate * 940Sstevel@tonic-gate * - ire_bucket and ire_masklen (also set in ire_create) is set in 9511042SErik.Nordmark@Sun.COM * ire_add before inserting in the bucket and never 960Sstevel@tonic-gate * changes after that. Thus we don't need a lock whenever these 970Sstevel@tonic-gate * fields are accessed. 980Sstevel@tonic-gate * 990Sstevel@tonic-gate * ire_gateway_addr_v4[v6] 1000Sstevel@tonic-gate * 1010Sstevel@tonic-gate * - ire_gateway_addr_v4[v6] is set during ire_create and later modified 1020Sstevel@tonic-gate * by rts_setgwr[v6]. As ire_gateway_addr is a uint32_t, updates to 1030Sstevel@tonic-gate * it assumed to be atomic and hence the other parts of the code 1040Sstevel@tonic-gate * does not use any locks. ire_gateway_addr_v6 updates are not atomic 1050Sstevel@tonic-gate * and hence any access to it uses ire_lock to get/set the right value. 
1060Sstevel@tonic-gate * 10711042SErik.Nordmark@Sun.COM * ire_refcnt, ire_identical_ref 1080Sstevel@tonic-gate * 1090Sstevel@tonic-gate * - Updated atomically using atomic_add_32 1100Sstevel@tonic-gate * 1110Sstevel@tonic-gate * ire_ssthresh, ire_rtt_sd, ire_rtt, ire_ib_pkt_count, ire_ob_pkt_count 1120Sstevel@tonic-gate * 1130Sstevel@tonic-gate * - Assumes that 32 bit writes are atomic. No locks. ire_lock is 1140Sstevel@tonic-gate * used to serialize updates to ire_ssthresh, ire_rtt_sd, ire_rtt. 1150Sstevel@tonic-gate * 11611042SErik.Nordmark@Sun.COM * ire_generation 11711042SErik.Nordmark@Sun.COM * - Under ire_lock 1180Sstevel@tonic-gate * 11911042SErik.Nordmark@Sun.COM * ire_nce_cache 12011042SErik.Nordmark@Sun.COM * - Under ire_lock 1210Sstevel@tonic-gate * 12211042SErik.Nordmark@Sun.COM * ire_dep_parent (To next IRE in recursive lookup chain) 12311042SErik.Nordmark@Sun.COM * - Under ips_ire_dep_lock. Write held when modifying. Read held when 12411042SErik.Nordmark@Sun.COM * walking. We also hold ire_lock when modifying to allow the data path 12511042SErik.Nordmark@Sun.COM * to only acquire ire_lock. 1260Sstevel@tonic-gate * 12711042SErik.Nordmark@Sun.COM * ire_dep_parent_generation (Generation number from ire_dep_parent) 12811042SErik.Nordmark@Sun.COM * - Under ips_ire_dep_lock and/or ire_lock. (A read claim on the dep_lock 12911042SErik.Nordmark@Sun.COM * and ire_lock held when modifying) 1300Sstevel@tonic-gate * 13111042SErik.Nordmark@Sun.COM * ire_dep_children (From parent to first child) 13211042SErik.Nordmark@Sun.COM * ire_dep_sib_next (linked list of siblings) 13311042SErik.Nordmark@Sun.COM * ire_dep_sib_ptpn (linked list of siblings) 13411042SErik.Nordmark@Sun.COM * - Under ips_ire_dep_lock. Write held when modifying. Read held when 13511042SErik.Nordmark@Sun.COM * walking. 
1360Sstevel@tonic-gate * 1370Sstevel@tonic-gate * As we always hold the bucket locks in all the places while accessing 1380Sstevel@tonic-gate * the above values, it is natural to use them for protecting them. 1390Sstevel@tonic-gate * 14011042SErik.Nordmark@Sun.COM * We have a forwarding table for IPv4 and IPv6. The IPv6 forwarding table 1415335Ssowmini * (ip_forwarding_table_v6) is an array of pointers to arrays of irb_t 14211042SErik.Nordmark@Sun.COM * structures. ip_forwarding_table_v6 is allocated dynamically in 1433448Sdh155122 * ire_add_v6. ire_ft_init_lock is used to serialize multiple threads 1440Sstevel@tonic-gate * initializing the same bucket. Once a bucket is initialized, it is never 1453448Sdh155122 * de-allocated. This assumption enables us to access 1463448Sdh155122 * ip_forwarding_table_v6[i] without any locks. 1470Sstevel@tonic-gate * 1485335Ssowmini * The forwarding table for IPv4 is a radix tree whose leaves 1495335Ssowmini * are rt_entry structures containing the irb_t for the rt_dst. The irb_t 1505335Ssowmini * for IPv4 is dynamically allocated and freed. 1515335Ssowmini * 1520Sstevel@tonic-gate * Each irb_t - ire bucket structure has a lock to protect 1530Sstevel@tonic-gate * a bucket and the ires residing in the bucket have a back pointer to 1540Sstevel@tonic-gate * the bucket structure. It also has a reference count for the number 1550Sstevel@tonic-gate * of threads walking the bucket - irb_refcnt which is bumped up 15611042SErik.Nordmark@Sun.COM * using the irb_refhold function. The flags irb_marks can be 15711042SErik.Nordmark@Sun.COM * set to IRB_MARK_CONDEMNED indicating that there are some ires 15811042SErik.Nordmark@Sun.COM * in this bucket that are IRE_IS_CONDEMNED and the 1590Sstevel@tonic-gate * last thread to leave the bucket should delete the ires. Usually 16011042SErik.Nordmark@Sun.COM * this is done by the irb_refrele function which is used to decrement 1615335Ssowmini * the reference count on a bucket. 
See comments above irb_t structure 1625335Ssowmini * definition in ip.h for further details. 1630Sstevel@tonic-gate * 16411042SErik.Nordmark@Sun.COM * The ire_refhold/ire_refrele functions operate on the ire which increments/ 1650Sstevel@tonic-gate * decrements the reference count, ire_refcnt, atomically on the ire. 16611042SErik.Nordmark@Sun.COM * ire_refcnt is modified only using those functions. Operations on the IRE 1670Sstevel@tonic-gate * could be described as follows : 1680Sstevel@tonic-gate * 1690Sstevel@tonic-gate * CREATE an ire with reference count initialized to 1. 1700Sstevel@tonic-gate * 1710Sstevel@tonic-gate * ADDITION of an ire holds the bucket lock, checks for duplicates 17211042SErik.Nordmark@Sun.COM * and then adds the ire. ire_add returns the ire after 1730Sstevel@tonic-gate * bumping up once more i.e the reference count is 2. This is to avoid 1740Sstevel@tonic-gate * an extra lookup in the functions calling ire_add which want to 1750Sstevel@tonic-gate * work with the ire after adding. 1760Sstevel@tonic-gate * 17711042SErik.Nordmark@Sun.COM * LOOKUP of an ire bumps up the reference count using ire_refhold 17811042SErik.Nordmark@Sun.COM * function. It is valid to bump up the reference count of the IRE, 1790Sstevel@tonic-gate * after the lookup has returned an ire. Following are the lookup 1800Sstevel@tonic-gate * functions that return a HELD ire : 1810Sstevel@tonic-gate * 18211042SErik.Nordmark@Sun.COM * ire_ftable_lookup[_v6], ire_lookup_multi_ill[_v6] 1830Sstevel@tonic-gate * 1840Sstevel@tonic-gate * DELETION of an ire holds the bucket lock, removes it from the list 1850Sstevel@tonic-gate * and then decrements the reference count for having removed from the list 18611042SErik.Nordmark@Sun.COM * by using the ire_refrele function. If some other thread has looked up 1870Sstevel@tonic-gate * the ire, the reference count would have been bumped up and hence 1880Sstevel@tonic-gate * this ire will not be freed once deleted. 
It will be freed once the 1890Sstevel@tonic-gate * reference count drops to zero. 1900Sstevel@tonic-gate * 1910Sstevel@tonic-gate * Add and Delete acquires the bucket lock as RW_WRITER, while all the 1920Sstevel@tonic-gate * lookups acquire the bucket lock as RW_READER. 1930Sstevel@tonic-gate * 19411042SErik.Nordmark@Sun.COM * The general rule is to do the ire_refrele in the function 1950Sstevel@tonic-gate * that is passing the ire as an argument. 1960Sstevel@tonic-gate * 1970Sstevel@tonic-gate * In trying to locate ires the following points are to be noted. 1980Sstevel@tonic-gate * 19911042SErik.Nordmark@Sun.COM * IRE_IS_CONDEMNED signifies that the ire has been logically deleted and is 2000Sstevel@tonic-gate * to be ignored when walking the ires using ire_next. 2010Sstevel@tonic-gate * 2020Sstevel@tonic-gate * Zones note: 2030Sstevel@tonic-gate * Walking IREs within a given zone also walks certain ires in other 2040Sstevel@tonic-gate * zones. This is done intentionally. IRE walks with a specified 2050Sstevel@tonic-gate * zoneid are used only when doing informational reports, and 2060Sstevel@tonic-gate * zone users want to see things that they can access. See block 2070Sstevel@tonic-gate * comment in ire_walk_ill_match(). 2080Sstevel@tonic-gate */ 2090Sstevel@tonic-gate 2100Sstevel@tonic-gate /* 2110Sstevel@tonic-gate * The size of the forwarding table. We will make sure that it is a 2120Sstevel@tonic-gate * power of 2 in ip_ire_init(). 
2133448Sdh155122 * Setable in /etc/system 2140Sstevel@tonic-gate */ 2150Sstevel@tonic-gate uint32_t ip6_ftable_hash_size = IP6_FTABLE_HASH_SIZE; 2160Sstevel@tonic-gate 2170Sstevel@tonic-gate struct kmem_cache *ire_cache; 21811042SErik.Nordmark@Sun.COM struct kmem_cache *ncec_cache; 21911042SErik.Nordmark@Sun.COM struct kmem_cache *nce_cache; 22011042SErik.Nordmark@Sun.COM 2210Sstevel@tonic-gate static ire_t ire_null; 2220Sstevel@tonic-gate 22311042SErik.Nordmark@Sun.COM static ire_t *ire_add_v4(ire_t *ire); 2240Sstevel@tonic-gate static void ire_delete_v4(ire_t *ire); 22511042SErik.Nordmark@Sun.COM static void ire_dep_invalidate_children(ire_t *child); 2261676Sjpk static void ire_walk_ipvers(pfv_t func, void *arg, uchar_t vers, 2273448Sdh155122 zoneid_t zoneid, ip_stack_t *); 2280Sstevel@tonic-gate static void ire_walk_ill_ipvers(uint_t match_flags, uint_t ire_type, 2291676Sjpk pfv_t func, void *arg, uchar_t vers, ill_t *ill); 2305023Scarlsonj #ifdef DEBUG 2315023Scarlsonj static void ire_trace_cleanup(const ire_t *); 2320Sstevel@tonic-gate #endif 23311463SSowmini.Varadhan@Sun.COM static void ire_dep_incr_generation_locked(ire_t *); 2340Sstevel@tonic-gate 2350Sstevel@tonic-gate /* 23611042SErik.Nordmark@Sun.COM * Following are the functions to increment/decrement the reference 23711042SErik.Nordmark@Sun.COM * count of the IREs and IRBs (ire bucket). 23811042SErik.Nordmark@Sun.COM * 23911042SErik.Nordmark@Sun.COM * 1) We bump up the reference count of an IRE to make sure that 24011042SErik.Nordmark@Sun.COM * it does not get deleted and freed while we are using it. 24111042SErik.Nordmark@Sun.COM * Typically all the lookup functions hold the bucket lock, 24211042SErik.Nordmark@Sun.COM * and look for the IRE. If it finds an IRE, it bumps up the 24311042SErik.Nordmark@Sun.COM * reference count before dropping the lock. 
Sometimes we *may* want 24411042SErik.Nordmark@Sun.COM * to bump up the reference count after we *looked* up i.e without 24511042SErik.Nordmark@Sun.COM * holding the bucket lock. So, the ire_refhold function does not assert 24611042SErik.Nordmark@Sun.COM * on the bucket lock being held. Any thread trying to delete from 24711042SErik.Nordmark@Sun.COM * the hash bucket can still do so but cannot free the IRE if 24811042SErik.Nordmark@Sun.COM * ire_refcnt is not 0. 24911042SErik.Nordmark@Sun.COM * 25011042SErik.Nordmark@Sun.COM * 2) We bump up the reference count on the bucket where the IRE resides 25111042SErik.Nordmark@Sun.COM * (IRB), when we want to prevent the IREs getting deleted from a given 25211042SErik.Nordmark@Sun.COM * hash bucket. This makes life easier for ire_walk type functions which 25311042SErik.Nordmark@Sun.COM * wants to walk the IRE list, call a function, but needs to drop 25411042SErik.Nordmark@Sun.COM * the bucket lock to prevent recursive rw_enters. While the 25511042SErik.Nordmark@Sun.COM * lock is dropped, the list could be changed by other threads or 25611042SErik.Nordmark@Sun.COM * the same thread could end up deleting the ire or the ire pointed by 25711042SErik.Nordmark@Sun.COM * ire_next. ire_refholding the ire or ire_next is not sufficient as 25811042SErik.Nordmark@Sun.COM * a delete will still remove the ire from the bucket while we have 25911042SErik.Nordmark@Sun.COM * dropped the lock and hence the ire_next would be NULL. Thus, we 26011042SErik.Nordmark@Sun.COM * need a mechanism to prevent deletions from a given bucket. 26111042SErik.Nordmark@Sun.COM * 26211042SErik.Nordmark@Sun.COM * To prevent deletions, we bump up the reference count on the 26311042SErik.Nordmark@Sun.COM * bucket. If the bucket is held, ire_delete just marks both 26411042SErik.Nordmark@Sun.COM * the ire and irb as CONDEMNED. 
When the 26511042SErik.Nordmark@Sun.COM * reference count on the bucket drops to zero, all the CONDEMNED ires 26611042SErik.Nordmark@Sun.COM * are deleted. We don't have to bump up the reference count on the 26711042SErik.Nordmark@Sun.COM * bucket if we are walking the bucket and never have to drop the bucket 26811042SErik.Nordmark@Sun.COM * lock. Note that irb_refhold does not prevent addition of new ires 26911042SErik.Nordmark@Sun.COM * in the list. It is okay because addition of new ires will not cause 27011042SErik.Nordmark@Sun.COM * ire_next to point to freed memory. We do irb_refhold only when 27111042SErik.Nordmark@Sun.COM * all of the 3 conditions are true : 27211042SErik.Nordmark@Sun.COM * 27311042SErik.Nordmark@Sun.COM * 1) The code needs to walk the IRE bucket from start to end. 27411042SErik.Nordmark@Sun.COM * 2) It may have to drop the bucket lock sometimes while doing (1) 27511042SErik.Nordmark@Sun.COM * 3) It does not want any ires to be deleted meanwhile. 27611042SErik.Nordmark@Sun.COM */ 27711042SErik.Nordmark@Sun.COM 27811042SErik.Nordmark@Sun.COM /* 27911042SErik.Nordmark@Sun.COM * Bump up the reference count on the hash bucket - IRB to 28011042SErik.Nordmark@Sun.COM * prevent ires from being deleted in this bucket. 
28111042SErik.Nordmark@Sun.COM */ 28211042SErik.Nordmark@Sun.COM void 28311042SErik.Nordmark@Sun.COM irb_refhold(irb_t *irb) 28411042SErik.Nordmark@Sun.COM { 28511042SErik.Nordmark@Sun.COM rw_enter(&irb->irb_lock, RW_WRITER); 28611042SErik.Nordmark@Sun.COM irb->irb_refcnt++; 28711042SErik.Nordmark@Sun.COM ASSERT(irb->irb_refcnt != 0); 28811042SErik.Nordmark@Sun.COM rw_exit(&irb->irb_lock); 28911042SErik.Nordmark@Sun.COM } 29011042SErik.Nordmark@Sun.COM 29111042SErik.Nordmark@Sun.COM void 29211042SErik.Nordmark@Sun.COM irb_refhold_locked(irb_t *irb) 29311042SErik.Nordmark@Sun.COM { 29411042SErik.Nordmark@Sun.COM ASSERT(RW_WRITE_HELD(&irb->irb_lock)); 29511042SErik.Nordmark@Sun.COM irb->irb_refcnt++; 29611042SErik.Nordmark@Sun.COM ASSERT(irb->irb_refcnt != 0); 29711042SErik.Nordmark@Sun.COM } 29811042SErik.Nordmark@Sun.COM 29911042SErik.Nordmark@Sun.COM /* 30011042SErik.Nordmark@Sun.COM * Note: when IRB_MARK_DYNAMIC is not set the irb_t 30111042SErik.Nordmark@Sun.COM * is statically allocated, so that when the irb_refcnt goes to 0, 30211042SErik.Nordmark@Sun.COM * we simply clean up the ire list and continue. 
30311042SErik.Nordmark@Sun.COM */ 30411042SErik.Nordmark@Sun.COM void 30511042SErik.Nordmark@Sun.COM irb_refrele(irb_t *irb) 30611042SErik.Nordmark@Sun.COM { 30711042SErik.Nordmark@Sun.COM if (irb->irb_marks & IRB_MARK_DYNAMIC) { 30811042SErik.Nordmark@Sun.COM irb_refrele_ftable(irb); 30911042SErik.Nordmark@Sun.COM } else { 31011042SErik.Nordmark@Sun.COM rw_enter(&irb->irb_lock, RW_WRITER); 31111042SErik.Nordmark@Sun.COM ASSERT(irb->irb_refcnt != 0); 31211042SErik.Nordmark@Sun.COM if (--irb->irb_refcnt == 0 && 31311042SErik.Nordmark@Sun.COM (irb->irb_marks & IRB_MARK_CONDEMNED)) { 31411042SErik.Nordmark@Sun.COM ire_t *ire_list; 31511042SErik.Nordmark@Sun.COM 31611042SErik.Nordmark@Sun.COM ire_list = ire_unlink(irb); 31711042SErik.Nordmark@Sun.COM rw_exit(&irb->irb_lock); 31811042SErik.Nordmark@Sun.COM ASSERT(ire_list != NULL); 31911042SErik.Nordmark@Sun.COM ire_cleanup(ire_list); 32011042SErik.Nordmark@Sun.COM } else { 32111042SErik.Nordmark@Sun.COM rw_exit(&irb->irb_lock); 32211042SErik.Nordmark@Sun.COM } 32311042SErik.Nordmark@Sun.COM } 32411042SErik.Nordmark@Sun.COM } 32511042SErik.Nordmark@Sun.COM 32611042SErik.Nordmark@Sun.COM 32711042SErik.Nordmark@Sun.COM /* 32811042SErik.Nordmark@Sun.COM * Bump up the reference count on the IRE. We cannot assert that the 32911042SErik.Nordmark@Sun.COM * bucket lock is being held as it is legal to bump up the reference 33011042SErik.Nordmark@Sun.COM * count after the first lookup has returned the IRE without 33111042SErik.Nordmark@Sun.COM * holding the lock. 
33211042SErik.Nordmark@Sun.COM */ 33311042SErik.Nordmark@Sun.COM void 33411042SErik.Nordmark@Sun.COM ire_refhold(ire_t *ire) 33511042SErik.Nordmark@Sun.COM { 33611042SErik.Nordmark@Sun.COM atomic_add_32(&(ire)->ire_refcnt, 1); 33711042SErik.Nordmark@Sun.COM ASSERT((ire)->ire_refcnt != 0); 33811042SErik.Nordmark@Sun.COM #ifdef DEBUG 33911042SErik.Nordmark@Sun.COM ire_trace_ref(ire); 34011042SErik.Nordmark@Sun.COM #endif 34111042SErik.Nordmark@Sun.COM } 34211042SErik.Nordmark@Sun.COM 34311042SErik.Nordmark@Sun.COM void 34411042SErik.Nordmark@Sun.COM ire_refhold_notr(ire_t *ire) 34511042SErik.Nordmark@Sun.COM { 34611042SErik.Nordmark@Sun.COM atomic_add_32(&(ire)->ire_refcnt, 1); 34711042SErik.Nordmark@Sun.COM ASSERT((ire)->ire_refcnt != 0); 34811042SErik.Nordmark@Sun.COM } 34911042SErik.Nordmark@Sun.COM 35011042SErik.Nordmark@Sun.COM void 35111042SErik.Nordmark@Sun.COM ire_refhold_locked(ire_t *ire) 35211042SErik.Nordmark@Sun.COM { 35311042SErik.Nordmark@Sun.COM #ifdef DEBUG 35411042SErik.Nordmark@Sun.COM ire_trace_ref(ire); 35511042SErik.Nordmark@Sun.COM #endif 35611042SErik.Nordmark@Sun.COM ire->ire_refcnt++; 35711042SErik.Nordmark@Sun.COM } 35811042SErik.Nordmark@Sun.COM 35911042SErik.Nordmark@Sun.COM /* 36011042SErik.Nordmark@Sun.COM * Release a ref on an IRE. 3610Sstevel@tonic-gate * 3620Sstevel@tonic-gate * Must not be called while holding any locks. Otherwise if this is 3630Sstevel@tonic-gate * the last reference to be released there is a chance of recursive mutex 3640Sstevel@tonic-gate * panic due to ire_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 3650Sstevel@tonic-gate * to restart an ioctl. The one exception is when the caller is sure that 3660Sstevel@tonic-gate * this is not the last reference to be released. Eg. if the caller is 3670Sstevel@tonic-gate * sure that the ire has not been deleted and won't be deleted. 
36811042SErik.Nordmark@Sun.COM * 36911042SErik.Nordmark@Sun.COM * In architectures e.g sun4u, where atomic_add_32_nv is just 37011042SErik.Nordmark@Sun.COM * a cas, we need to maintain the right memory barrier semantics 37111042SErik.Nordmark@Sun.COM * as that of mutex_exit i.e all the loads and stores should complete 37211042SErik.Nordmark@Sun.COM * before the cas is executed. membar_exit() does that here. 3730Sstevel@tonic-gate */ 3740Sstevel@tonic-gate void 3750Sstevel@tonic-gate ire_refrele(ire_t *ire) 3760Sstevel@tonic-gate { 37711042SErik.Nordmark@Sun.COM #ifdef DEBUG 37811042SErik.Nordmark@Sun.COM ire_untrace_ref(ire); 37911042SErik.Nordmark@Sun.COM #endif 38011042SErik.Nordmark@Sun.COM ASSERT((ire)->ire_refcnt != 0); 38111042SErik.Nordmark@Sun.COM membar_exit(); 38211042SErik.Nordmark@Sun.COM if (atomic_add_32_nv(&(ire)->ire_refcnt, -1) == 0) 38311042SErik.Nordmark@Sun.COM ire_inactive(ire); 3840Sstevel@tonic-gate } 3850Sstevel@tonic-gate 3860Sstevel@tonic-gate void 3870Sstevel@tonic-gate ire_refrele_notr(ire_t *ire) 3880Sstevel@tonic-gate { 38911042SErik.Nordmark@Sun.COM ASSERT((ire)->ire_refcnt != 0); 39011042SErik.Nordmark@Sun.COM membar_exit(); 39111042SErik.Nordmark@Sun.COM if (atomic_add_32_nv(&(ire)->ire_refcnt, -1) == 0) 39211042SErik.Nordmark@Sun.COM ire_inactive(ire); 3930Sstevel@tonic-gate } 3940Sstevel@tonic-gate 3950Sstevel@tonic-gate /* 3960Sstevel@tonic-gate * This function is associated with the IP_IOC_IRE_DELETE[_NO_REPLY] 39711042SErik.Nordmark@Sun.COM * IOCTL[s]. The NO_REPLY form is used by TCP to tell IP that it is 39811042SErik.Nordmark@Sun.COM * having problems reaching a particular destination. 39911042SErik.Nordmark@Sun.COM * This will make IP consider alternate routes (e.g., when there are 40011042SErik.Nordmark@Sun.COM * muliple default routes), and it will also make IP discard any (potentially) 40111042SErik.Nordmark@Sun.COM * stale redirect. 
40211042SErik.Nordmark@Sun.COM * Management processes may want to use the version that generates a reply. 4030Sstevel@tonic-gate * 40411042SErik.Nordmark@Sun.COM * With the use of NUD like behavior for IPv4/ARP in addition to IPv6 40511042SErik.Nordmark@Sun.COM * this function shouldn't be necessary for IP to recover from a bad redirect, 40611042SErik.Nordmark@Sun.COM * a bad default router (when there are multiple default routers), or 40711042SErik.Nordmark@Sun.COM * a stale ND/ARP entry. But we retain it in any case. 40811042SErik.Nordmark@Sun.COM * For instance, this is helpful when TCP suspects a failure before NUD does. 4090Sstevel@tonic-gate */ 4100Sstevel@tonic-gate int 4110Sstevel@tonic-gate ip_ire_delete(queue_t *q, mblk_t *mp, cred_t *ioc_cr) 4120Sstevel@tonic-gate { 4132535Ssangeeta uchar_t *addr_ucp; 41411042SErik.Nordmark@Sun.COM uint_t ipversion; 41511042SErik.Nordmark@Sun.COM sin_t *sin; 41611042SErik.Nordmark@Sun.COM sin6_t *sin6; 41711042SErik.Nordmark@Sun.COM ipaddr_t v4addr; 41811042SErik.Nordmark@Sun.COM in6_addr_t v6addr; 4192535Ssangeeta ire_t *ire; 4202535Ssangeeta ipid_t *ipid; 4210Sstevel@tonic-gate zoneid_t zoneid; 4223448Sdh155122 ip_stack_t *ipst; 4230Sstevel@tonic-gate 4240Sstevel@tonic-gate ASSERT(q->q_next == NULL); 42511042SErik.Nordmark@Sun.COM zoneid = IPCL_ZONEID(Q_TO_CONN(q)); 4263448Sdh155122 ipst = CONNQ_TO_IPST(q); 4270Sstevel@tonic-gate 4280Sstevel@tonic-gate /* 4290Sstevel@tonic-gate * Check privilege using the ioctl credential; if it is NULL 4300Sstevel@tonic-gate * then this is a kernel message and therefor privileged. 
4310Sstevel@tonic-gate */ 4323448Sdh155122 if (ioc_cr != NULL && secpolicy_ip_config(ioc_cr, B_FALSE) != 0) 4330Sstevel@tonic-gate return (EPERM); 4340Sstevel@tonic-gate 4350Sstevel@tonic-gate ipid = (ipid_t *)mp->b_rptr; 4360Sstevel@tonic-gate 4370Sstevel@tonic-gate addr_ucp = mi_offset_param(mp, ipid->ipid_addr_offset, 4384714Ssowmini ipid->ipid_addr_length); 4390Sstevel@tonic-gate if (addr_ucp == NULL || !OK_32PTR(addr_ucp)) 4400Sstevel@tonic-gate return (EINVAL); 4410Sstevel@tonic-gate switch (ipid->ipid_addr_length) { 44211042SErik.Nordmark@Sun.COM case sizeof (sin_t): 4430Sstevel@tonic-gate /* 4440Sstevel@tonic-gate * got complete (sockaddr) address - increment addr_ucp to point 4450Sstevel@tonic-gate * at the ip_addr field. 4460Sstevel@tonic-gate */ 4470Sstevel@tonic-gate sin = (sin_t *)addr_ucp; 4480Sstevel@tonic-gate addr_ucp = (uchar_t *)&sin->sin_addr.s_addr; 44911042SErik.Nordmark@Sun.COM ipversion = IPV4_VERSION; 4500Sstevel@tonic-gate break; 45111042SErik.Nordmark@Sun.COM case sizeof (sin6_t): 45211042SErik.Nordmark@Sun.COM /* 45311042SErik.Nordmark@Sun.COM * got complete (sockaddr) address - increment addr_ucp to point 45411042SErik.Nordmark@Sun.COM * at the ip_addr field. 45511042SErik.Nordmark@Sun.COM */ 45611042SErik.Nordmark@Sun.COM sin6 = (sin6_t *)addr_ucp; 45711042SErik.Nordmark@Sun.COM addr_ucp = (uchar_t *)&sin6->sin6_addr; 45811042SErik.Nordmark@Sun.COM ipversion = IPV6_VERSION; 45911042SErik.Nordmark@Sun.COM break; 4600Sstevel@tonic-gate default: 4610Sstevel@tonic-gate return (EINVAL); 4620Sstevel@tonic-gate } 46311042SErik.Nordmark@Sun.COM if (ipversion == IPV4_VERSION) { 46411042SErik.Nordmark@Sun.COM /* Extract the destination address. 
*/ 46511042SErik.Nordmark@Sun.COM bcopy(addr_ucp, &v4addr, IP_ADDR_LEN); 46611042SErik.Nordmark@Sun.COM 46711042SErik.Nordmark@Sun.COM ire = ire_ftable_lookup_v4(v4addr, 0, 0, 0, NULL, 46811042SErik.Nordmark@Sun.COM zoneid, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL); 46911042SErik.Nordmark@Sun.COM } else { 47011042SErik.Nordmark@Sun.COM /* Extract the destination address. */ 47111042SErik.Nordmark@Sun.COM bcopy(addr_ucp, &v6addr, IPV6_ADDR_LEN); 47211042SErik.Nordmark@Sun.COM 47311042SErik.Nordmark@Sun.COM ire = ire_ftable_lookup_v6(&v6addr, NULL, NULL, 0, NULL, 47411042SErik.Nordmark@Sun.COM zoneid, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL); 47511042SErik.Nordmark@Sun.COM } 47611042SErik.Nordmark@Sun.COM if (ire != NULL) { 47711042SErik.Nordmark@Sun.COM if (ipversion == IPV4_VERSION) { 47811042SErik.Nordmark@Sun.COM ip_rts_change(RTM_LOSING, ire->ire_addr, 47911042SErik.Nordmark@Sun.COM ire->ire_gateway_addr, ire->ire_mask, 48011042SErik.Nordmark@Sun.COM (Q_TO_CONN(q))->conn_laddr_v4, 0, 0, 0, 48111042SErik.Nordmark@Sun.COM (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA), 48211042SErik.Nordmark@Sun.COM ire->ire_ipst); 4830Sstevel@tonic-gate } 48411042SErik.Nordmark@Sun.COM (void) ire_no_good(ire); 4854714Ssowmini ire_refrele(ire); 4860Sstevel@tonic-gate } 4870Sstevel@tonic-gate return (0); 4880Sstevel@tonic-gate } 4890Sstevel@tonic-gate 4900Sstevel@tonic-gate /* 4910Sstevel@tonic-gate * Initialize the ire that is specific to IPv4 part and call 4920Sstevel@tonic-gate * ire_init_common to finish it. 49311042SErik.Nordmark@Sun.COM * Returns zero or errno. 
4940Sstevel@tonic-gate */ 49511042SErik.Nordmark@Sun.COM int 49611042SErik.Nordmark@Sun.COM ire_init_v4(ire_t *ire, uchar_t *addr, uchar_t *mask, uchar_t *gateway, 49711042SErik.Nordmark@Sun.COM ushort_t type, ill_t *ill, zoneid_t zoneid, uint_t flags, 49811042SErik.Nordmark@Sun.COM tsol_gc_t *gc, ip_stack_t *ipst) 4990Sstevel@tonic-gate { 50011042SErik.Nordmark@Sun.COM int error; 50111042SErik.Nordmark@Sun.COM 5021676Sjpk /* 5031676Sjpk * Reject IRE security attribute creation/initialization 5041676Sjpk * if system is not running in Trusted mode. 5051676Sjpk */ 50611042SErik.Nordmark@Sun.COM if (gc != NULL && !is_system_labeled()) 50711042SErik.Nordmark@Sun.COM return (EINVAL); 5081676Sjpk 5093448Sdh155122 BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_alloced); 5100Sstevel@tonic-gate 5110Sstevel@tonic-gate if (addr != NULL) 5120Sstevel@tonic-gate bcopy(addr, &ire->ire_addr, IP_ADDR_LEN); 51311042SErik.Nordmark@Sun.COM if (gateway != NULL) 5140Sstevel@tonic-gate bcopy(gateway, &ire->ire_gateway_addr, IP_ADDR_LEN); 51511042SErik.Nordmark@Sun.COM 51611042SErik.Nordmark@Sun.COM /* Make sure we don't have stray values in some fields */ 51711042SErik.Nordmark@Sun.COM switch (type) { 51811042SErik.Nordmark@Sun.COM case IRE_LOOPBACK: 51911042SErik.Nordmark@Sun.COM case IRE_HOST: 52011042SErik.Nordmark@Sun.COM case IRE_BROADCAST: 52111042SErik.Nordmark@Sun.COM case IRE_LOCAL: 52211042SErik.Nordmark@Sun.COM case IRE_IF_CLONE: 52311042SErik.Nordmark@Sun.COM ire->ire_mask = IP_HOST_MASK; 52411042SErik.Nordmark@Sun.COM ire->ire_masklen = IPV4_ABITS; 52511042SErik.Nordmark@Sun.COM break; 52611042SErik.Nordmark@Sun.COM case IRE_PREFIX: 52711042SErik.Nordmark@Sun.COM case IRE_DEFAULT: 52811042SErik.Nordmark@Sun.COM case IRE_IF_RESOLVER: 52911042SErik.Nordmark@Sun.COM case IRE_IF_NORESOLVER: 53011042SErik.Nordmark@Sun.COM if (mask != NULL) { 53111042SErik.Nordmark@Sun.COM bcopy(mask, &ire->ire_mask, IP_ADDR_LEN); 53211042SErik.Nordmark@Sun.COM ire->ire_masklen = 
ip_mask_to_plen(ire->ire_mask); 53311042SErik.Nordmark@Sun.COM } 53411042SErik.Nordmark@Sun.COM break; 53511042SErik.Nordmark@Sun.COM case IRE_MULTICAST: 53611042SErik.Nordmark@Sun.COM case IRE_NOROUTE: 53711042SErik.Nordmark@Sun.COM ASSERT(mask == NULL); 53811042SErik.Nordmark@Sun.COM break; 53911042SErik.Nordmark@Sun.COM default: 54011042SErik.Nordmark@Sun.COM ASSERT(0); 54111042SErik.Nordmark@Sun.COM return (EINVAL); 5420Sstevel@tonic-gate } 5430Sstevel@tonic-gate 54411042SErik.Nordmark@Sun.COM error = ire_init_common(ire, type, ill, zoneid, flags, IPV4_VERSION, 54511042SErik.Nordmark@Sun.COM gc, ipst); 54611042SErik.Nordmark@Sun.COM if (error != NULL) 54711042SErik.Nordmark@Sun.COM return (error); 54811042SErik.Nordmark@Sun.COM 54911042SErik.Nordmark@Sun.COM /* Determine which function pointers to use */ 55011042SErik.Nordmark@Sun.COM ire->ire_postfragfn = ip_xmit; /* Common case */ 55111042SErik.Nordmark@Sun.COM 55211042SErik.Nordmark@Sun.COM switch (ire->ire_type) { 55311042SErik.Nordmark@Sun.COM case IRE_LOCAL: 55411042SErik.Nordmark@Sun.COM ire->ire_sendfn = ire_send_local_v4; 55511042SErik.Nordmark@Sun.COM ire->ire_recvfn = ire_recv_local_v4; 55611042SErik.Nordmark@Sun.COM ASSERT(ire->ire_ill != NULL); 55711076SCathy.Zhou@Sun.COM if (ire->ire_ill->ill_flags & ILLF_NOACCEPT) 55811042SErik.Nordmark@Sun.COM ire->ire_recvfn = ire_recv_noaccept_v6; 55911042SErik.Nordmark@Sun.COM break; 56011042SErik.Nordmark@Sun.COM case IRE_LOOPBACK: 56111042SErik.Nordmark@Sun.COM ire->ire_sendfn = ire_send_local_v4; 56211042SErik.Nordmark@Sun.COM ire->ire_recvfn = ire_recv_loopback_v4; 56311042SErik.Nordmark@Sun.COM break; 56411042SErik.Nordmark@Sun.COM case IRE_BROADCAST: 56511042SErik.Nordmark@Sun.COM ire->ire_postfragfn = ip_postfrag_loopcheck; 56611042SErik.Nordmark@Sun.COM ire->ire_sendfn = ire_send_broadcast_v4; 56711042SErik.Nordmark@Sun.COM ire->ire_recvfn = ire_recv_broadcast_v4; 56811042SErik.Nordmark@Sun.COM break; 56911042SErik.Nordmark@Sun.COM case IRE_MULTICAST: 
57011042SErik.Nordmark@Sun.COM ire->ire_postfragfn = ip_postfrag_loopcheck; 57111042SErik.Nordmark@Sun.COM ire->ire_sendfn = ire_send_multicast_v4; 57211042SErik.Nordmark@Sun.COM ire->ire_recvfn = ire_recv_multicast_v4; 57311042SErik.Nordmark@Sun.COM break; 57411042SErik.Nordmark@Sun.COM default: 57511042SErik.Nordmark@Sun.COM /* 57611042SErik.Nordmark@Sun.COM * For IRE_IF_ALL and IRE_OFFLINK we forward received 57711042SErik.Nordmark@Sun.COM * packets by default. 57811042SErik.Nordmark@Sun.COM */ 57911042SErik.Nordmark@Sun.COM ire->ire_sendfn = ire_send_wire_v4; 58011042SErik.Nordmark@Sun.COM ire->ire_recvfn = ire_recv_forward_v4; 58111042SErik.Nordmark@Sun.COM break; 58211042SErik.Nordmark@Sun.COM } 58311042SErik.Nordmark@Sun.COM if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 58411042SErik.Nordmark@Sun.COM ire->ire_sendfn = ire_send_noroute_v4; 58511042SErik.Nordmark@Sun.COM ire->ire_recvfn = ire_recv_noroute_v4; 58611042SErik.Nordmark@Sun.COM } else if (ire->ire_flags & RTF_MULTIRT) { 58711042SErik.Nordmark@Sun.COM ire->ire_postfragfn = ip_postfrag_multirt_v4; 58811042SErik.Nordmark@Sun.COM ire->ire_sendfn = ire_send_multirt_v4; 58911042SErik.Nordmark@Sun.COM /* Multirt receive of broadcast uses ire_recv_broadcast_v4 */ 59011042SErik.Nordmark@Sun.COM if (ire->ire_type != IRE_BROADCAST) 59111042SErik.Nordmark@Sun.COM ire->ire_recvfn = ire_recv_multirt_v4; 59211042SErik.Nordmark@Sun.COM } 59311042SErik.Nordmark@Sun.COM ire->ire_nce_capable = ire_determine_nce_capable(ire); 59411042SErik.Nordmark@Sun.COM return (0); 5950Sstevel@tonic-gate } 5960Sstevel@tonic-gate 5970Sstevel@tonic-gate /* 59811042SErik.Nordmark@Sun.COM * Determine ire_nce_capable 5990Sstevel@tonic-gate */ 60011042SErik.Nordmark@Sun.COM boolean_t 60111042SErik.Nordmark@Sun.COM ire_determine_nce_capable(ire_t *ire) 6020Sstevel@tonic-gate { 60311042SErik.Nordmark@Sun.COM int max_masklen; 60411042SErik.Nordmark@Sun.COM 60511042SErik.Nordmark@Sun.COM if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) 
|| 60611042SErik.Nordmark@Sun.COM (ire->ire_type & IRE_MULTICAST)) 60711042SErik.Nordmark@Sun.COM return (B_TRUE); 60811042SErik.Nordmark@Sun.COM 60911042SErik.Nordmark@Sun.COM if (ire->ire_ipversion == IPV4_VERSION) 61011042SErik.Nordmark@Sun.COM max_masklen = IPV4_ABITS; 61111042SErik.Nordmark@Sun.COM else 61211042SErik.Nordmark@Sun.COM max_masklen = IPV6_ABITS; 61311042SErik.Nordmark@Sun.COM 61411042SErik.Nordmark@Sun.COM if ((ire->ire_type & IRE_ONLINK) && ire->ire_masklen == max_masklen) 61511042SErik.Nordmark@Sun.COM return (B_TRUE); 61611042SErik.Nordmark@Sun.COM return (B_FALSE); 6170Sstevel@tonic-gate } 6180Sstevel@tonic-gate 6190Sstevel@tonic-gate /* 6200Sstevel@tonic-gate * ire_create is called to allocate and initialize a new IRE. 6210Sstevel@tonic-gate * 6220Sstevel@tonic-gate * NOTE : This is called as writer sometimes though not required 6230Sstevel@tonic-gate * by this function. 6240Sstevel@tonic-gate */ 6250Sstevel@tonic-gate ire_t * 62611042SErik.Nordmark@Sun.COM ire_create(uchar_t *addr, uchar_t *mask, uchar_t *gateway, 62711042SErik.Nordmark@Sun.COM ushort_t type, ill_t *ill, zoneid_t zoneid, uint_t flags, tsol_gc_t *gc, 62811042SErik.Nordmark@Sun.COM ip_stack_t *ipst) 6290Sstevel@tonic-gate { 6300Sstevel@tonic-gate ire_t *ire; 63111042SErik.Nordmark@Sun.COM int error; 6320Sstevel@tonic-gate 6330Sstevel@tonic-gate ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); 6340Sstevel@tonic-gate if (ire == NULL) { 63511042SErik.Nordmark@Sun.COM DTRACE_PROBE(kmem__cache__alloc); 6360Sstevel@tonic-gate return (NULL); 6370Sstevel@tonic-gate } 6380Sstevel@tonic-gate *ire = ire_null; 6390Sstevel@tonic-gate 64011042SErik.Nordmark@Sun.COM error = ire_init_v4(ire, addr, mask, gateway, type, ill, zoneid, flags, 64111042SErik.Nordmark@Sun.COM gc, ipst); 64211042SErik.Nordmark@Sun.COM if (error != 0) { 64311042SErik.Nordmark@Sun.COM DTRACE_PROBE2(ire__init, ire_t *, ire, int, error); 6440Sstevel@tonic-gate kmem_cache_free(ire_cache, ire); 6450Sstevel@tonic-gate return 
(NULL); 6460Sstevel@tonic-gate } 6470Sstevel@tonic-gate return (ire); 6480Sstevel@tonic-gate } 6490Sstevel@tonic-gate 6500Sstevel@tonic-gate /* 6510Sstevel@tonic-gate * Common to IPv4 and IPv6 65211042SErik.Nordmark@Sun.COM * Returns zero or errno. 6530Sstevel@tonic-gate */ 65411042SErik.Nordmark@Sun.COM int 65511042SErik.Nordmark@Sun.COM ire_init_common(ire_t *ire, ushort_t type, ill_t *ill, zoneid_t zoneid, 65611042SErik.Nordmark@Sun.COM uint_t flags, uchar_t ipversion, tsol_gc_t *gc, ip_stack_t *ipst) 6570Sstevel@tonic-gate { 65811042SErik.Nordmark@Sun.COM int error; 6590Sstevel@tonic-gate 6601676Sjpk #ifdef DEBUG 66111042SErik.Nordmark@Sun.COM if (ill != NULL) { 66211042SErik.Nordmark@Sun.COM if (ill->ill_isv6) 6630Sstevel@tonic-gate ASSERT(ipversion == IPV6_VERSION); 6640Sstevel@tonic-gate else 6650Sstevel@tonic-gate ASSERT(ipversion == IPV4_VERSION); 6660Sstevel@tonic-gate } 6671676Sjpk #endif /* DEBUG */ 6681676Sjpk 6691676Sjpk /* 6701676Sjpk * Create/initialize IRE security attribute only in Trusted mode; 67111042SErik.Nordmark@Sun.COM * if the passed in gc is non-NULL, we expect that the caller 6721676Sjpk * has held a reference to it and will release it when this routine 6731676Sjpk * returns a failure, otherwise we own the reference. We do this 6741676Sjpk * prior to initializing the rest IRE fields. 
6751676Sjpk */ 6761676Sjpk if (is_system_labeled()) { 6771676Sjpk if ((type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST | 67811042SErik.Nordmark@Sun.COM IRE_IF_ALL | IRE_MULTICAST | IRE_NOROUTE)) != 0) { 6791676Sjpk /* release references on behalf of caller */ 6801676Sjpk if (gc != NULL) 6811676Sjpk GC_REFRELE(gc); 68211042SErik.Nordmark@Sun.COM } else { 68311042SErik.Nordmark@Sun.COM error = tsol_ire_init_gwattr(ire, ipversion, gc); 68411042SErik.Nordmark@Sun.COM if (error != 0) 68511042SErik.Nordmark@Sun.COM return (error); 6861676Sjpk } 6871676Sjpk } 6880Sstevel@tonic-gate 6890Sstevel@tonic-gate ire->ire_type = type; 6900Sstevel@tonic-gate ire->ire_flags = RTF_UP | flags; 6910Sstevel@tonic-gate ire->ire_create_time = (uint32_t)gethrestime_sec(); 69211042SErik.Nordmark@Sun.COM ire->ire_generation = IRE_GENERATION_INITIAL; 6930Sstevel@tonic-gate 6940Sstevel@tonic-gate /* 69511042SErik.Nordmark@Sun.COM * The ill_ire_cnt isn't increased until 69611042SErik.Nordmark@Sun.COM * the IRE is added to ensure that a walker will find 69711042SErik.Nordmark@Sun.COM * all IREs that hold a reference on an ill. 6980Sstevel@tonic-gate * 69911042SErik.Nordmark@Sun.COM * Note that ill_ire_multicast doesn't hold a ref on the ill since 70011042SErik.Nordmark@Sun.COM * ire_add() is not called for the IRE_MULTICAST. 
7010Sstevel@tonic-gate */ 70211042SErik.Nordmark@Sun.COM ire->ire_ill = ill; 70311042SErik.Nordmark@Sun.COM ire->ire_zoneid = zoneid; 7040Sstevel@tonic-gate ire->ire_ipversion = ipversion; 70511042SErik.Nordmark@Sun.COM 7062535Ssangeeta mutex_init(&ire->ire_lock, NULL, MUTEX_DEFAULT, NULL); 7070Sstevel@tonic-gate ire->ire_refcnt = 1; 70811042SErik.Nordmark@Sun.COM ire->ire_identical_ref = 1; /* Number of ire_delete's needed */ 7093448Sdh155122 ire->ire_ipst = ipst; /* No netstack_hold */ 7105023Scarlsonj ire->ire_trace_disable = B_FALSE; 7111676Sjpk 71211042SErik.Nordmark@Sun.COM return (0); 7130Sstevel@tonic-gate } 7140Sstevel@tonic-gate 7150Sstevel@tonic-gate /* 71611042SErik.Nordmark@Sun.COM * This creates an IRE_BROADCAST based on the arguments. 71711042SErik.Nordmark@Sun.COM * A mirror is ire_lookup_bcast(). 7180Sstevel@tonic-gate * 71911042SErik.Nordmark@Sun.COM * Any supression of unneeded ones is done in ire_add_v4. 72011042SErik.Nordmark@Sun.COM * We add one IRE_BROADCAST per address. ire_send_broadcast_v4() 72111042SErik.Nordmark@Sun.COM * takes care of generating a loopback copy of the packet. 
7220Sstevel@tonic-gate */ 7230Sstevel@tonic-gate ire_t ** 72411042SErik.Nordmark@Sun.COM ire_create_bcast(ill_t *ill, ipaddr_t addr, zoneid_t zoneid, ire_t **irep) 7250Sstevel@tonic-gate { 72611042SErik.Nordmark@Sun.COM ip_stack_t *ipst = ill->ill_ipst; 72711042SErik.Nordmark@Sun.COM 72811042SErik.Nordmark@Sun.COM ASSERT(IAM_WRITER_ILL(ill)); 7293448Sdh155122 7300Sstevel@tonic-gate *irep++ = ire_create( 7310Sstevel@tonic-gate (uchar_t *)&addr, /* dest addr */ 7320Sstevel@tonic-gate (uchar_t *)&ip_g_all_ones, /* mask */ 7330Sstevel@tonic-gate NULL, /* no gateway */ 7340Sstevel@tonic-gate IRE_BROADCAST, 73511042SErik.Nordmark@Sun.COM ill, 73611042SErik.Nordmark@Sun.COM zoneid, 73711042SErik.Nordmark@Sun.COM RTF_KERNEL, 7384714Ssowmini NULL, 7394714Ssowmini ipst); 7400Sstevel@tonic-gate 7410Sstevel@tonic-gate return (irep); 7420Sstevel@tonic-gate } 7430Sstevel@tonic-gate 7440Sstevel@tonic-gate /* 74511042SErik.Nordmark@Sun.COM * This looks up an IRE_BROADCAST based on the arguments. 74611042SErik.Nordmark@Sun.COM * Mirrors ire_create_bcast(). 
7470Sstevel@tonic-gate */ 7480Sstevel@tonic-gate ire_t * 74911042SErik.Nordmark@Sun.COM ire_lookup_bcast(ill_t *ill, ipaddr_t addr, zoneid_t zoneid) 7500Sstevel@tonic-gate { 75111042SErik.Nordmark@Sun.COM ire_t *ire; 75211042SErik.Nordmark@Sun.COM int match_args; 75311042SErik.Nordmark@Sun.COM 75411042SErik.Nordmark@Sun.COM match_args = MATCH_IRE_TYPE | MATCH_IRE_ILL | MATCH_IRE_GW | 75511042SErik.Nordmark@Sun.COM MATCH_IRE_MASK | MATCH_IRE_ZONEONLY; 75611042SErik.Nordmark@Sun.COM 75711042SErik.Nordmark@Sun.COM if (IS_UNDER_IPMP(ill)) 75811042SErik.Nordmark@Sun.COM match_args |= MATCH_IRE_TESTHIDDEN; 75911042SErik.Nordmark@Sun.COM 76011042SErik.Nordmark@Sun.COM ire = ire_ftable_lookup_v4( 76111042SErik.Nordmark@Sun.COM addr, /* dest addr */ 76211042SErik.Nordmark@Sun.COM ip_g_all_ones, /* mask */ 76311042SErik.Nordmark@Sun.COM 0, /* no gateway */ 76411042SErik.Nordmark@Sun.COM IRE_BROADCAST, 76511042SErik.Nordmark@Sun.COM ill, 76611042SErik.Nordmark@Sun.COM zoneid, 76711042SErik.Nordmark@Sun.COM NULL, 76811042SErik.Nordmark@Sun.COM match_args, 76911042SErik.Nordmark@Sun.COM 0, 77011042SErik.Nordmark@Sun.COM ill->ill_ipst, 77111042SErik.Nordmark@Sun.COM NULL); 77211042SErik.Nordmark@Sun.COM return (ire); 7730Sstevel@tonic-gate } 7740Sstevel@tonic-gate 7750Sstevel@tonic-gate /* Arrange to call the specified function for every IRE in the world. 
*/ 7760Sstevel@tonic-gate void 7773448Sdh155122 ire_walk(pfv_t func, void *arg, ip_stack_t *ipst) 7780Sstevel@tonic-gate { 7793448Sdh155122 ire_walk_ipvers(func, arg, 0, ALL_ZONES, ipst); 7800Sstevel@tonic-gate } 7810Sstevel@tonic-gate 7820Sstevel@tonic-gate void 7833448Sdh155122 ire_walk_v4(pfv_t func, void *arg, zoneid_t zoneid, ip_stack_t *ipst) 7840Sstevel@tonic-gate { 7853448Sdh155122 ire_walk_ipvers(func, arg, IPV4_VERSION, zoneid, ipst); 7860Sstevel@tonic-gate } 7870Sstevel@tonic-gate 7880Sstevel@tonic-gate void 7893448Sdh155122 ire_walk_v6(pfv_t func, void *arg, zoneid_t zoneid, ip_stack_t *ipst) 7900Sstevel@tonic-gate { 7913448Sdh155122 ire_walk_ipvers(func, arg, IPV6_VERSION, zoneid, ipst); 7920Sstevel@tonic-gate } 7930Sstevel@tonic-gate 7940Sstevel@tonic-gate /* 7950Sstevel@tonic-gate * Walk a particular version. version == 0 means both v4 and v6. 7960Sstevel@tonic-gate */ 7970Sstevel@tonic-gate static void 7983448Sdh155122 ire_walk_ipvers(pfv_t func, void *arg, uchar_t vers, zoneid_t zoneid, 7993448Sdh155122 ip_stack_t *ipst) 8000Sstevel@tonic-gate { 8010Sstevel@tonic-gate if (vers != IPV6_VERSION) { 8022535Ssangeeta /* 8032535Ssangeeta * ip_forwarding_table variable doesn't matter for IPv4 since 8043448Sdh155122 * ire_walk_ill_tables uses ips_ip_ftable for IPv4. 8052535Ssangeeta */ 8060Sstevel@tonic-gate ire_walk_ill_tables(0, 0, func, arg, IP_MASK_TABLE_SIZE, 8072535Ssangeeta 0, NULL, 8083448Sdh155122 NULL, zoneid, ipst); 8090Sstevel@tonic-gate } 8100Sstevel@tonic-gate if (vers != IPV4_VERSION) { 8110Sstevel@tonic-gate ire_walk_ill_tables(0, 0, func, arg, IP6_MASK_TABLE_SIZE, 8123448Sdh155122 ipst->ips_ip6_ftable_hash_size, 8133448Sdh155122 ipst->ips_ip_forwarding_table_v6, 81411042SErik.Nordmark@Sun.COM NULL, zoneid, ipst); 8150Sstevel@tonic-gate } 8160Sstevel@tonic-gate } 8170Sstevel@tonic-gate 8180Sstevel@tonic-gate /* 8197216Smeem * Arrange to call the specified function for every IRE that matches the ill. 
8200Sstevel@tonic-gate */ 8210Sstevel@tonic-gate void 8221676Sjpk ire_walk_ill(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg, 8230Sstevel@tonic-gate ill_t *ill) 8240Sstevel@tonic-gate { 8257216Smeem uchar_t vers = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION); 8267216Smeem 8277216Smeem ire_walk_ill_ipvers(match_flags, ire_type, func, arg, vers, ill); 8280Sstevel@tonic-gate } 8290Sstevel@tonic-gate 8300Sstevel@tonic-gate /* 8317216Smeem * Walk a particular ill and version. 8320Sstevel@tonic-gate */ 8330Sstevel@tonic-gate static void 8340Sstevel@tonic-gate ire_walk_ill_ipvers(uint_t match_flags, uint_t ire_type, pfv_t func, 8351676Sjpk void *arg, uchar_t vers, ill_t *ill) 8360Sstevel@tonic-gate { 8373448Sdh155122 ip_stack_t *ipst = ill->ill_ipst; 8383448Sdh155122 8397216Smeem if (vers == IPV4_VERSION) { 8400Sstevel@tonic-gate ire_walk_ill_tables(match_flags, ire_type, func, arg, 84111042SErik.Nordmark@Sun.COM IP_MASK_TABLE_SIZE, 84211042SErik.Nordmark@Sun.COM 0, NULL, 84311042SErik.Nordmark@Sun.COM ill, ALL_ZONES, ipst); 84411042SErik.Nordmark@Sun.COM } 84511042SErik.Nordmark@Sun.COM if (vers != IPV4_VERSION) { 8460Sstevel@tonic-gate ire_walk_ill_tables(match_flags, ire_type, func, arg, 8473448Sdh155122 IP6_MASK_TABLE_SIZE, ipst->ips_ip6_ftable_hash_size, 8483448Sdh155122 ipst->ips_ip_forwarding_table_v6, 84911042SErik.Nordmark@Sun.COM ill, ALL_ZONES, ipst); 8500Sstevel@tonic-gate } 8510Sstevel@tonic-gate } 8520Sstevel@tonic-gate 85311042SErik.Nordmark@Sun.COM /* 85411042SErik.Nordmark@Sun.COM * Do the specific matching of IREs to shared-IP zones. 85511042SErik.Nordmark@Sun.COM * 85611042SErik.Nordmark@Sun.COM * We have the same logic as in ire_match_args but implemented slightly 85711042SErik.Nordmark@Sun.COM * differently. 
85811042SErik.Nordmark@Sun.COM */ 8592535Ssangeeta boolean_t 8600Sstevel@tonic-gate ire_walk_ill_match(uint_t match_flags, uint_t ire_type, ire_t *ire, 8613448Sdh155122 ill_t *ill, zoneid_t zoneid, ip_stack_t *ipst) 8620Sstevel@tonic-gate { 86311131SErik.Nordmark@Sun.COM ill_t *dst_ill = ire->ire_ill; 8640Sstevel@tonic-gate 8650Sstevel@tonic-gate ASSERT(match_flags != 0 || zoneid != ALL_ZONES); 8660Sstevel@tonic-gate 86711042SErik.Nordmark@Sun.COM if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid && 86811042SErik.Nordmark@Sun.COM ire->ire_zoneid != ALL_ZONES) { 8690Sstevel@tonic-gate /* 8700Sstevel@tonic-gate * We're walking the IREs for a specific zone. The only relevant 8710Sstevel@tonic-gate * IREs are: 8720Sstevel@tonic-gate * - all IREs with a matching ire_zoneid 87311042SErik.Nordmark@Sun.COM * - IRE_IF_ALL IREs for interfaces with a usable source addr 8740Sstevel@tonic-gate * with a matching zone 87511042SErik.Nordmark@Sun.COM * - IRE_OFFLINK with a gateway reachable from the zone 87611042SErik.Nordmark@Sun.COM * Note that ealier we only did the IRE_OFFLINK check for 87711042SErik.Nordmark@Sun.COM * IRE_DEFAULT (and only when we had multiple IRE_DEFAULTs). 8780Sstevel@tonic-gate */ 87911042SErik.Nordmark@Sun.COM if (ire->ire_type & IRE_ONLINK) { 88011042SErik.Nordmark@Sun.COM uint_t ifindex; 88111042SErik.Nordmark@Sun.COM 8820Sstevel@tonic-gate /* 88311042SErik.Nordmark@Sun.COM * Note there is no IRE_INTERFACE on vniN thus 88411042SErik.Nordmark@Sun.COM * can't do an IRE lookup for a matching route. 
8850Sstevel@tonic-gate */ 88611042SErik.Nordmark@Sun.COM ifindex = dst_ill->ill_usesrc_ifindex; 88711042SErik.Nordmark@Sun.COM if (ifindex == 0) 88811042SErik.Nordmark@Sun.COM return (B_FALSE); 88911042SErik.Nordmark@Sun.COM 89011042SErik.Nordmark@Sun.COM /* 89111042SErik.Nordmark@Sun.COM * If there is a usable source address in the 89211042SErik.Nordmark@Sun.COM * zone, then it's ok to return an 89311042SErik.Nordmark@Sun.COM * IRE_INTERFACE 89411042SErik.Nordmark@Sun.COM */ 89511042SErik.Nordmark@Sun.COM if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6, 89611042SErik.Nordmark@Sun.COM zoneid, ipst)) { 89711042SErik.Nordmark@Sun.COM return (B_FALSE); 89811042SErik.Nordmark@Sun.COM } 89911042SErik.Nordmark@Sun.COM } 90011042SErik.Nordmark@Sun.COM if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) { 90111042SErik.Nordmark@Sun.COM ipif_t *tipif; 90211042SErik.Nordmark@Sun.COM 90311042SErik.Nordmark@Sun.COM mutex_enter(&dst_ill->ill_lock); 90411042SErik.Nordmark@Sun.COM for (tipif = dst_ill->ill_ipif; 90511042SErik.Nordmark@Sun.COM tipif != NULL; tipif = tipif->ipif_next) { 90611042SErik.Nordmark@Sun.COM if (!IPIF_IS_CONDEMNED(tipif) && 90711042SErik.Nordmark@Sun.COM (tipif->ipif_flags & IPIF_UP) && 90811042SErik.Nordmark@Sun.COM (tipif->ipif_zoneid == zoneid || 90911042SErik.Nordmark@Sun.COM tipif->ipif_zoneid == ALL_ZONES)) 91011042SErik.Nordmark@Sun.COM break; 91111042SErik.Nordmark@Sun.COM } 91211042SErik.Nordmark@Sun.COM mutex_exit(&dst_ill->ill_lock); 91311042SErik.Nordmark@Sun.COM if (tipif == NULL) { 9140Sstevel@tonic-gate return (B_FALSE); 9150Sstevel@tonic-gate } 9160Sstevel@tonic-gate } 91711131SErik.Nordmark@Sun.COM } 91811131SErik.Nordmark@Sun.COM /* 91911131SErik.Nordmark@Sun.COM * Except for ALL_ZONES, we only match the offlink routes 92011131SErik.Nordmark@Sun.COM * where ire_gateway_addr has an IRE_INTERFACE for the zoneid. 
92111457SErik.Nordmark@Sun.COM * Since we can have leftover routes after the IP addresses have 92211457SErik.Nordmark@Sun.COM * changed, the global zone will also match offlink routes where the 92311457SErik.Nordmark@Sun.COM * gateway is unreachable from any zone. 92411131SErik.Nordmark@Sun.COM */ 92511131SErik.Nordmark@Sun.COM if ((ire->ire_type & IRE_OFFLINK) && zoneid != ALL_ZONES) { 92611131SErik.Nordmark@Sun.COM in6_addr_t gw_addr_v6; 92711457SErik.Nordmark@Sun.COM boolean_t reach; 92811131SErik.Nordmark@Sun.COM 92911131SErik.Nordmark@Sun.COM if (ire->ire_ipversion == IPV4_VERSION) { 93011457SErik.Nordmark@Sun.COM reach = ire_gateway_ok_zone_v4(ire->ire_gateway_addr, 93111457SErik.Nordmark@Sun.COM zoneid, dst_ill, NULL, ipst, B_FALSE); 93211131SErik.Nordmark@Sun.COM } else { 93311131SErik.Nordmark@Sun.COM ASSERT(ire->ire_ipversion == IPV6_VERSION); 93411131SErik.Nordmark@Sun.COM mutex_enter(&ire->ire_lock); 93511131SErik.Nordmark@Sun.COM gw_addr_v6 = ire->ire_gateway_addr_v6; 93611131SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 93711131SErik.Nordmark@Sun.COM 93811457SErik.Nordmark@Sun.COM reach = ire_gateway_ok_zone_v6(&gw_addr_v6, zoneid, 93911457SErik.Nordmark@Sun.COM dst_ill, NULL, ipst, B_FALSE); 94011457SErik.Nordmark@Sun.COM } 94111457SErik.Nordmark@Sun.COM if (!reach) { 94211457SErik.Nordmark@Sun.COM if (zoneid != GLOBAL_ZONEID) 94311131SErik.Nordmark@Sun.COM return (B_FALSE); 94411457SErik.Nordmark@Sun.COM 94511457SErik.Nordmark@Sun.COM /* 94611457SErik.Nordmark@Sun.COM * Check if ALL_ZONES reachable - if not then let the 94711457SErik.Nordmark@Sun.COM * global zone see it. 
94811457SErik.Nordmark@Sun.COM */ 94911457SErik.Nordmark@Sun.COM if (ire->ire_ipversion == IPV4_VERSION) { 95011457SErik.Nordmark@Sun.COM reach = ire_gateway_ok_zone_v4( 95111457SErik.Nordmark@Sun.COM ire->ire_gateway_addr, ALL_ZONES, 95211457SErik.Nordmark@Sun.COM dst_ill, NULL, ipst, B_FALSE); 95311457SErik.Nordmark@Sun.COM } else { 95411457SErik.Nordmark@Sun.COM reach = ire_gateway_ok_zone_v6(&gw_addr_v6, 95511457SErik.Nordmark@Sun.COM ALL_ZONES, dst_ill, NULL, ipst, B_FALSE); 95611457SErik.Nordmark@Sun.COM } 95711457SErik.Nordmark@Sun.COM if (reach) { 95811457SErik.Nordmark@Sun.COM /* 95911457SErik.Nordmark@Sun.COM * Some other zone could see it, hence hide it 96011457SErik.Nordmark@Sun.COM * in the global zone. 96111457SErik.Nordmark@Sun.COM */ 96211457SErik.Nordmark@Sun.COM return (B_FALSE); 96311457SErik.Nordmark@Sun.COM } 9640Sstevel@tonic-gate } 9650Sstevel@tonic-gate } 9660Sstevel@tonic-gate 9670Sstevel@tonic-gate if (((!(match_flags & MATCH_IRE_TYPE)) || 9684714Ssowmini (ire->ire_type & ire_type)) && 9690Sstevel@tonic-gate ((!(match_flags & MATCH_IRE_ILL)) || 97011042SErik.Nordmark@Sun.COM (dst_ill == ill || 97111042SErik.Nordmark@Sun.COM dst_ill != NULL && IS_IN_SAME_ILLGRP(dst_ill, ill)))) { 9720Sstevel@tonic-gate return (B_TRUE); 9730Sstevel@tonic-gate } 9740Sstevel@tonic-gate return (B_FALSE); 9750Sstevel@tonic-gate } 9760Sstevel@tonic-gate 9772535Ssangeeta int 9782535Ssangeeta rtfunc(struct radix_node *rn, void *arg) 9792535Ssangeeta { 9802535Ssangeeta struct rtfuncarg *rtf = arg; 9812535Ssangeeta struct rt_entry *rt; 9822535Ssangeeta irb_t *irb; 9832535Ssangeeta ire_t *ire; 9842535Ssangeeta boolean_t ret; 9852535Ssangeeta 9862535Ssangeeta rt = (struct rt_entry *)rn; 9872535Ssangeeta ASSERT(rt != NULL); 9882535Ssangeeta irb = &rt->rt_irb; 9892535Ssangeeta for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { 9902535Ssangeeta if ((rtf->rt_match_flags != 0) || 9912535Ssangeeta (rtf->rt_zoneid != ALL_ZONES)) { 9922535Ssangeeta ret = 
ire_walk_ill_match(rtf->rt_match_flags, 9932535Ssangeeta rtf->rt_ire_type, ire, 9943448Sdh155122 rtf->rt_ill, rtf->rt_zoneid, rtf->rt_ipst); 99511042SErik.Nordmark@Sun.COM } else { 9962535Ssangeeta ret = B_TRUE; 99711042SErik.Nordmark@Sun.COM } 9982535Ssangeeta if (ret) 9992535Ssangeeta (*rtf->rt_func)(ire, rtf->rt_arg); 10002535Ssangeeta } 10012535Ssangeeta return (0); 10022535Ssangeeta } 10032535Ssangeeta 10040Sstevel@tonic-gate /* 100511042SErik.Nordmark@Sun.COM * Walk the ftable entries that match the ill. 10060Sstevel@tonic-gate */ 10072535Ssangeeta void 10080Sstevel@tonic-gate ire_walk_ill_tables(uint_t match_flags, uint_t ire_type, pfv_t func, 10091676Sjpk void *arg, size_t ftbl_sz, size_t htbl_sz, irb_t **ipftbl, 101011042SErik.Nordmark@Sun.COM ill_t *ill, zoneid_t zoneid, 10113448Sdh155122 ip_stack_t *ipst) 10120Sstevel@tonic-gate { 10130Sstevel@tonic-gate irb_t *irb_ptr; 10140Sstevel@tonic-gate irb_t *irb; 10150Sstevel@tonic-gate ire_t *ire; 10160Sstevel@tonic-gate int i, j; 10170Sstevel@tonic-gate boolean_t ret; 10182535Ssangeeta struct rtfuncarg rtfarg; 10190Sstevel@tonic-gate 10208485SPeter.Memishian@Sun.COM ASSERT((!(match_flags & MATCH_IRE_ILL)) || (ill != NULL)); 10210Sstevel@tonic-gate ASSERT(!(match_flags & MATCH_IRE_TYPE) || (ire_type != 0)); 102211042SErik.Nordmark@Sun.COM 102311042SErik.Nordmark@Sun.COM /* knobs such that routine is called only for v6 case */ 102411042SErik.Nordmark@Sun.COM if (ipftbl == ipst->ips_ip_forwarding_table_v6) { 102511042SErik.Nordmark@Sun.COM for (i = (ftbl_sz - 1); i >= 0; i--) { 102611042SErik.Nordmark@Sun.COM if ((irb_ptr = ipftbl[i]) == NULL) 102711042SErik.Nordmark@Sun.COM continue; 102811042SErik.Nordmark@Sun.COM for (j = 0; j < htbl_sz; j++) { 102911042SErik.Nordmark@Sun.COM irb = &irb_ptr[j]; 103011042SErik.Nordmark@Sun.COM if (irb->irb_ire == NULL) 10310Sstevel@tonic-gate continue; 103211042SErik.Nordmark@Sun.COM 103311042SErik.Nordmark@Sun.COM irb_refhold(irb); 103411042SErik.Nordmark@Sun.COM for (ire = 
irb->irb_ire; ire != NULL; 103511042SErik.Nordmark@Sun.COM ire = ire->ire_next) { 103611042SErik.Nordmark@Sun.COM if (match_flags == 0 && 103711042SErik.Nordmark@Sun.COM zoneid == ALL_ZONES) { 103811042SErik.Nordmark@Sun.COM ret = B_TRUE; 103911042SErik.Nordmark@Sun.COM } else { 104011042SErik.Nordmark@Sun.COM ret = 104111042SErik.Nordmark@Sun.COM ire_walk_ill_match( 104211042SErik.Nordmark@Sun.COM match_flags, 104311042SErik.Nordmark@Sun.COM ire_type, ire, ill, 104411042SErik.Nordmark@Sun.COM zoneid, ipst); 10450Sstevel@tonic-gate } 104611042SErik.Nordmark@Sun.COM if (ret) 104711042SErik.Nordmark@Sun.COM (*func)(ire, arg); 10480Sstevel@tonic-gate } 104911042SErik.Nordmark@Sun.COM irb_refrele(irb); 10500Sstevel@tonic-gate } 105111042SErik.Nordmark@Sun.COM } 105211042SErik.Nordmark@Sun.COM } else { 105311131SErik.Nordmark@Sun.COM bzero(&rtfarg, sizeof (rtfarg)); 105411042SErik.Nordmark@Sun.COM rtfarg.rt_func = func; 105511042SErik.Nordmark@Sun.COM rtfarg.rt_arg = arg; 105611042SErik.Nordmark@Sun.COM if (match_flags != 0) { 105711042SErik.Nordmark@Sun.COM rtfarg.rt_match_flags = match_flags; 10580Sstevel@tonic-gate } 105911042SErik.Nordmark@Sun.COM rtfarg.rt_ire_type = ire_type; 106011042SErik.Nordmark@Sun.COM rtfarg.rt_ill = ill; 106111042SErik.Nordmark@Sun.COM rtfarg.rt_zoneid = zoneid; 106211042SErik.Nordmark@Sun.COM rtfarg.rt_ipst = ipst; /* No netstack_hold */ 106311042SErik.Nordmark@Sun.COM (void) ipst->ips_ip_ftable->rnh_walktree_mt( 106411042SErik.Nordmark@Sun.COM ipst->ips_ip_ftable, 106511042SErik.Nordmark@Sun.COM rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn); 10660Sstevel@tonic-gate } 10670Sstevel@tonic-gate } 10680Sstevel@tonic-gate 10690Sstevel@tonic-gate /* 10700Sstevel@tonic-gate * This function takes a mask and returns 10710Sstevel@tonic-gate * number of bits set in the mask. If no 10720Sstevel@tonic-gate * bit is set it returns 0. 10730Sstevel@tonic-gate * Assumes a contiguous mask. 
10740Sstevel@tonic-gate */ 10750Sstevel@tonic-gate int 10760Sstevel@tonic-gate ip_mask_to_plen(ipaddr_t mask) 10770Sstevel@tonic-gate { 10780Sstevel@tonic-gate return (mask == 0 ? 0 : IP_ABITS - (ffs(ntohl(mask)) -1)); 10790Sstevel@tonic-gate } 10800Sstevel@tonic-gate 10810Sstevel@tonic-gate /* 10820Sstevel@tonic-gate * Convert length for a mask to the mask. 10830Sstevel@tonic-gate */ 10840Sstevel@tonic-gate ipaddr_t 10850Sstevel@tonic-gate ip_plen_to_mask(uint_t masklen) 10860Sstevel@tonic-gate { 108711042SErik.Nordmark@Sun.COM if (masklen == 0) 108811042SErik.Nordmark@Sun.COM return (0); 108911042SErik.Nordmark@Sun.COM 10900Sstevel@tonic-gate return (htonl(IP_HOST_MASK << (IP_ABITS - masklen))); 10910Sstevel@tonic-gate } 10920Sstevel@tonic-gate 10930Sstevel@tonic-gate void 10940Sstevel@tonic-gate ire_atomic_end(irb_t *irb_ptr, ire_t *ire) 10950Sstevel@tonic-gate { 109611042SErik.Nordmark@Sun.COM ill_t *ill; 109711042SErik.Nordmark@Sun.COM 109811042SErik.Nordmark@Sun.COM ill = ire->ire_ill; 109911042SErik.Nordmark@Sun.COM if (ill != NULL) 110011042SErik.Nordmark@Sun.COM mutex_exit(&ill->ill_lock); 11010Sstevel@tonic-gate rw_exit(&irb_ptr->irb_lock); 11020Sstevel@tonic-gate } 11030Sstevel@tonic-gate 11040Sstevel@tonic-gate /* 110511042SErik.Nordmark@Sun.COM * ire_add_v[46] atomically make sure that the ill associated 110611042SErik.Nordmark@Sun.COM * with the new ire is not going away i.e., we check ILL_CONDEMNED. 
11070Sstevel@tonic-gate */ 11080Sstevel@tonic-gate int 110911042SErik.Nordmark@Sun.COM ire_atomic_start(irb_t *irb_ptr, ire_t *ire) 11100Sstevel@tonic-gate { 111111042SErik.Nordmark@Sun.COM ill_t *ill; 111211042SErik.Nordmark@Sun.COM 111311042SErik.Nordmark@Sun.COM ill = ire->ire_ill; 111411042SErik.Nordmark@Sun.COM 111511042SErik.Nordmark@Sun.COM rw_enter(&irb_ptr->irb_lock, RW_WRITER); 111611042SErik.Nordmark@Sun.COM if (ill != NULL) { 111711042SErik.Nordmark@Sun.COM mutex_enter(&ill->ill_lock); 11182416Sjarrett 11192416Sjarrett /* 112011042SErik.Nordmark@Sun.COM * Don't allow IRE's to be created on dying ills. 11212416Sjarrett */ 112211042SErik.Nordmark@Sun.COM if (ill->ill_state_flags & ILL_CONDEMNED) { 112311042SErik.Nordmark@Sun.COM ire_atomic_end(irb_ptr, ire); 112411042SErik.Nordmark@Sun.COM return (ENXIO); 112511042SErik.Nordmark@Sun.COM } 112611042SErik.Nordmark@Sun.COM 112711042SErik.Nordmark@Sun.COM if (IS_UNDER_IPMP(ill)) { 112811042SErik.Nordmark@Sun.COM int error = 0; 112911042SErik.Nordmark@Sun.COM mutex_enter(&ill->ill_phyint->phyint_lock); 113011042SErik.Nordmark@Sun.COM if (!ipmp_ill_is_active(ill) && 113111042SErik.Nordmark@Sun.COM IRE_HIDDEN_TYPE(ire->ire_type) && 113211042SErik.Nordmark@Sun.COM !ire->ire_testhidden) { 113311042SErik.Nordmark@Sun.COM error = EINVAL; 11342416Sjarrett } 113511042SErik.Nordmark@Sun.COM mutex_exit(&ill->ill_phyint->phyint_lock); 11362416Sjarrett if (error != 0) { 113711042SErik.Nordmark@Sun.COM ire_atomic_end(irb_ptr, ire); 11382416Sjarrett return (error); 11392416Sjarrett } 11402416Sjarrett } 114111042SErik.Nordmark@Sun.COM 11420Sstevel@tonic-gate } 114311042SErik.Nordmark@Sun.COM return (0); 11440Sstevel@tonic-gate } 11450Sstevel@tonic-gate 11460Sstevel@tonic-gate /* 114711042SErik.Nordmark@Sun.COM * Add a fully initialized IRE to the forwarding table. 114811042SErik.Nordmark@Sun.COM * This returns NULL on failure, or a held IRE on success. 
114911042SErik.Nordmark@Sun.COM * Normally the returned IRE is the same as the argument. But a different 115011042SErik.Nordmark@Sun.COM * IRE will be returned if the added IRE is deemed identical to an existing 115111042SErik.Nordmark@Sun.COM * one. In that case ire_identical_ref will be increased. 115211042SErik.Nordmark@Sun.COM * The caller always needs to do an ire_refrele() on the returned IRE. 11530Sstevel@tonic-gate */ 115411042SErik.Nordmark@Sun.COM ire_t * 115511042SErik.Nordmark@Sun.COM ire_add(ire_t *ire) 115611042SErik.Nordmark@Sun.COM { 115711042SErik.Nordmark@Sun.COM if (IRE_HIDDEN_TYPE(ire->ire_type) && 115811042SErik.Nordmark@Sun.COM ire->ire_ill != NULL && IS_UNDER_IPMP(ire->ire_ill)) { 115911042SErik.Nordmark@Sun.COM /* 116011042SErik.Nordmark@Sun.COM * IREs hosted on interfaces that are under IPMP 116111042SErik.Nordmark@Sun.COM * should be hidden so that applications don't 116211042SErik.Nordmark@Sun.COM * accidentally end up sending packets with test 116311042SErik.Nordmark@Sun.COM * addresses as their source addresses, or 116411042SErik.Nordmark@Sun.COM * sending out interfaces that are e.g. IFF_INACTIVE. 116511042SErik.Nordmark@Sun.COM * Hide them here. 116611042SErik.Nordmark@Sun.COM */ 116711042SErik.Nordmark@Sun.COM ire->ire_testhidden = B_TRUE; 116811042SErik.Nordmark@Sun.COM } 116911042SErik.Nordmark@Sun.COM 117011042SErik.Nordmark@Sun.COM if (ire->ire_ipversion == IPV6_VERSION) 117111042SErik.Nordmark@Sun.COM return (ire_add_v6(ire)); 117211042SErik.Nordmark@Sun.COM else 117311042SErik.Nordmark@Sun.COM return (ire_add_v4(ire)); 117411042SErik.Nordmark@Sun.COM } 117511042SErik.Nordmark@Sun.COM 117611042SErik.Nordmark@Sun.COM /* 117711042SErik.Nordmark@Sun.COM * Add a fully initialized IPv4 IRE to the forwarding table. 117811042SErik.Nordmark@Sun.COM * This returns NULL on failure, or a held IRE on success. 117911042SErik.Nordmark@Sun.COM * Normally the returned IRE is the same as the argument. 
But a different 118011042SErik.Nordmark@Sun.COM * IRE will be returned if the added IRE is deemed identical to an existing 118111042SErik.Nordmark@Sun.COM * one. In that case ire_identical_ref will be increased. 118211042SErik.Nordmark@Sun.COM * The caller always needs to do an ire_refrele() on the returned IRE. 118311042SErik.Nordmark@Sun.COM */ 118411042SErik.Nordmark@Sun.COM static ire_t * 118511042SErik.Nordmark@Sun.COM ire_add_v4(ire_t *ire) 11860Sstevel@tonic-gate { 11870Sstevel@tonic-gate ire_t *ire1; 11880Sstevel@tonic-gate irb_t *irb_ptr; 11890Sstevel@tonic-gate ire_t **irep; 119011042SErik.Nordmark@Sun.COM int match_flags; 11910Sstevel@tonic-gate int error; 11923448Sdh155122 ip_stack_t *ipst = ire->ire_ipst; 119311042SErik.Nordmark@Sun.COM 119411042SErik.Nordmark@Sun.COM if (ire->ire_ill != NULL) 119511042SErik.Nordmark@Sun.COM ASSERT(!MUTEX_HELD(&ire->ire_ill->ill_lock)); 11960Sstevel@tonic-gate ASSERT(ire->ire_ipversion == IPV4_VERSION); 11970Sstevel@tonic-gate 11980Sstevel@tonic-gate /* Make sure the address is properly masked. */ 11990Sstevel@tonic-gate ire->ire_addr &= ire->ire_mask; 12000Sstevel@tonic-gate 120111042SErik.Nordmark@Sun.COM match_flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW); 120211042SErik.Nordmark@Sun.COM 120311042SErik.Nordmark@Sun.COM if (ire->ire_ill != NULL) { 120411042SErik.Nordmark@Sun.COM match_flags |= MATCH_IRE_ILL; 12050Sstevel@tonic-gate } 120611042SErik.Nordmark@Sun.COM irb_ptr = ire_get_bucket(ire); 120711042SErik.Nordmark@Sun.COM if (irb_ptr == NULL) { 120811042SErik.Nordmark@Sun.COM printf("no bucket for %p\n", (void *)ire); 120911042SErik.Nordmark@Sun.COM ire_delete(ire); 121011042SErik.Nordmark@Sun.COM return (NULL); 12112535Ssangeeta } 12120Sstevel@tonic-gate 12130Sstevel@tonic-gate /* 121411042SErik.Nordmark@Sun.COM * Start the atomic add of the ire. Grab the ill lock, 121511042SErik.Nordmark@Sun.COM * the bucket lock. Check for condemned. 
12160Sstevel@tonic-gate */ 121711042SErik.Nordmark@Sun.COM error = ire_atomic_start(irb_ptr, ire); 12180Sstevel@tonic-gate if (error != 0) { 121911042SErik.Nordmark@Sun.COM printf("no ire_atomic_start for %p\n", (void *)ire); 12200Sstevel@tonic-gate ire_delete(ire); 122111042SErik.Nordmark@Sun.COM irb_refrele(irb_ptr); 122211042SErik.Nordmark@Sun.COM return (NULL); 12230Sstevel@tonic-gate } 12240Sstevel@tonic-gate /* 122511042SErik.Nordmark@Sun.COM * If we are creating a hidden IRE, make sure we search for 122611042SErik.Nordmark@Sun.COM * hidden IREs when searching for duplicates below. 122711042SErik.Nordmark@Sun.COM * Otherwise, we might find an IRE on some other interface 122811042SErik.Nordmark@Sun.COM * that's not marked hidden. 12290Sstevel@tonic-gate */ 123011042SErik.Nordmark@Sun.COM if (ire->ire_testhidden) 123111042SErik.Nordmark@Sun.COM match_flags |= MATCH_IRE_TESTHIDDEN; 123211042SErik.Nordmark@Sun.COM 12330Sstevel@tonic-gate /* 12340Sstevel@tonic-gate * Atomically check for duplicate and insert in the table. 12350Sstevel@tonic-gate */ 12360Sstevel@tonic-gate for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { 123711042SErik.Nordmark@Sun.COM if (IRE_IS_CONDEMNED(ire1)) 12380Sstevel@tonic-gate continue; 123911042SErik.Nordmark@Sun.COM /* 124011042SErik.Nordmark@Sun.COM * Here we need an exact match on zoneid, i.e., 124111042SErik.Nordmark@Sun.COM * ire_match_args doesn't fit. 
124211042SErik.Nordmark@Sun.COM */ 12430Sstevel@tonic-gate if (ire1->ire_zoneid != ire->ire_zoneid) 12440Sstevel@tonic-gate continue; 124511042SErik.Nordmark@Sun.COM 124611042SErik.Nordmark@Sun.COM if (ire1->ire_type != ire->ire_type) 124711042SErik.Nordmark@Sun.COM continue; 124811042SErik.Nordmark@Sun.COM 124911042SErik.Nordmark@Sun.COM /* 125011042SErik.Nordmark@Sun.COM * Note: We do not allow multiple routes that differ only 125111042SErik.Nordmark@Sun.COM * in the gateway security attributes; such routes are 125211042SErik.Nordmark@Sun.COM * considered duplicates. 125311042SErik.Nordmark@Sun.COM * To change that we explicitly have to treat them as 125411042SErik.Nordmark@Sun.COM * different here. 125511042SErik.Nordmark@Sun.COM */ 12560Sstevel@tonic-gate if (ire_match_args(ire1, ire->ire_addr, ire->ire_mask, 125711042SErik.Nordmark@Sun.COM ire->ire_gateway_addr, ire->ire_type, ire->ire_ill, 125811042SErik.Nordmark@Sun.COM ire->ire_zoneid, NULL, match_flags)) { 12590Sstevel@tonic-gate /* 12600Sstevel@tonic-gate * Return the old ire after doing a REFHOLD. 12610Sstevel@tonic-gate * As most of the callers continue to use the IRE 12620Sstevel@tonic-gate * after adding, we return a held ire. This will 12630Sstevel@tonic-gate * avoid a lookup in the caller again. If the callers 12640Sstevel@tonic-gate * don't want to use it, they need to do a REFRELE. 
12650Sstevel@tonic-gate */ 126611042SErik.Nordmark@Sun.COM atomic_add_32(&ire1->ire_identical_ref, 1); 126711042SErik.Nordmark@Sun.COM DTRACE_PROBE2(ire__add__exist, ire_t *, ire1, 126811042SErik.Nordmark@Sun.COM ire_t *, ire); 126911042SErik.Nordmark@Sun.COM ire_refhold(ire1); 12700Sstevel@tonic-gate ire_atomic_end(irb_ptr, ire); 12710Sstevel@tonic-gate ire_delete(ire); 127211042SErik.Nordmark@Sun.COM irb_refrele(irb_ptr); 127311042SErik.Nordmark@Sun.COM return (ire1); 12742535Ssangeeta } 12752535Ssangeeta } 127611042SErik.Nordmark@Sun.COM 12770Sstevel@tonic-gate /* 127811042SErik.Nordmark@Sun.COM * Normally we do head insertion since most things do not care about 127911042SErik.Nordmark@Sun.COM * the order of the IREs in the bucket. Note that ip_cgtp_bcast_add 128011042SErik.Nordmark@Sun.COM * assumes we at least do head insertion so that its IRE_BROADCAST 128111042SErik.Nordmark@Sun.COM * arrive ahead of existing IRE_HOST for the same address. 128211042SErik.Nordmark@Sun.COM * However, due to shared-IP zones (and restrict_interzone_loopback) 128311042SErik.Nordmark@Sun.COM * we can have an IRE_LOCAL as well as IRE_IF_CLONE for the same 128411042SErik.Nordmark@Sun.COM * address. For that reason we do tail insertion for IRE_IF_CLONE. 128511042SErik.Nordmark@Sun.COM * Due to the IRE_BROADCAST on cgtp0, which must be last in the bucket, 128611042SErik.Nordmark@Sun.COM * we do tail insertion of IRE_BROADCASTs that do not have RTF_MULTIRT 128711042SErik.Nordmark@Sun.COM * set. 
12880Sstevel@tonic-gate */ 12890Sstevel@tonic-gate irep = (ire_t **)irb_ptr; 129011042SErik.Nordmark@Sun.COM if ((ire->ire_type & IRE_IF_CLONE) || 129111042SErik.Nordmark@Sun.COM ((ire->ire_type & IRE_BROADCAST) && 129211042SErik.Nordmark@Sun.COM !(ire->ire_flags & RTF_MULTIRT))) { 129311042SErik.Nordmark@Sun.COM while ((ire1 = *irep) != NULL) 12940Sstevel@tonic-gate irep = &ire1->ire_next; 12950Sstevel@tonic-gate } 12960Sstevel@tonic-gate /* Insert at *irep */ 12970Sstevel@tonic-gate ire1 = *irep; 12980Sstevel@tonic-gate if (ire1 != NULL) 12990Sstevel@tonic-gate ire1->ire_ptpn = &ire->ire_next; 13000Sstevel@tonic-gate ire->ire_next = ire1; 13010Sstevel@tonic-gate /* Link the new one in. */ 13020Sstevel@tonic-gate ire->ire_ptpn = irep; 13030Sstevel@tonic-gate 13040Sstevel@tonic-gate /* 13050Sstevel@tonic-gate * ire_walk routines de-reference ire_next without holding 13060Sstevel@tonic-gate * a lock. Before we point to the new ire, we want to make 13070Sstevel@tonic-gate * sure the store that sets the ire_next of the new ire 13080Sstevel@tonic-gate * reaches global visibility, so that ire_walk routines 13090Sstevel@tonic-gate * don't see a truncated list of ires i.e if the ire_next 13100Sstevel@tonic-gate * of the new ire gets set after we do "*irep = ire" due 13110Sstevel@tonic-gate * to re-ordering, the ire_walk thread will see a NULL 13120Sstevel@tonic-gate * once it accesses the ire_next of the new ire. 13130Sstevel@tonic-gate * membar_producer() makes sure that the following store 13140Sstevel@tonic-gate * happens *after* all of the above stores. 13150Sstevel@tonic-gate */ 13160Sstevel@tonic-gate membar_producer(); 13170Sstevel@tonic-gate *irep = ire; 13180Sstevel@tonic-gate ire->ire_bucket = irb_ptr; 13190Sstevel@tonic-gate /* 13200Sstevel@tonic-gate * We return a bumped up IRE above. Keep it symmetrical 13210Sstevel@tonic-gate * so that the callers will always have to release. 
This 13220Sstevel@tonic-gate * helps the callers of this function because they continue 13230Sstevel@tonic-gate * to use the IRE after adding and hence they don't have to 13240Sstevel@tonic-gate * lookup again after we return the IRE. 13250Sstevel@tonic-gate * 13260Sstevel@tonic-gate * NOTE : We don't have to use atomics as this is appearing 13270Sstevel@tonic-gate * in the list for the first time and no one else can bump 13280Sstevel@tonic-gate * up the reference count on this yet. 13290Sstevel@tonic-gate */ 133011042SErik.Nordmark@Sun.COM ire_refhold_locked(ire); 13313448Sdh155122 BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_inserted); 13322535Ssangeeta 13330Sstevel@tonic-gate irb_ptr->irb_ire_cnt++; 133411042SErik.Nordmark@Sun.COM if (irb_ptr->irb_marks & IRB_MARK_DYNAMIC) 13352535Ssangeeta irb_ptr->irb_nire++; 13362535Ssangeeta 133711042SErik.Nordmark@Sun.COM if (ire->ire_ill != NULL) { 133811042SErik.Nordmark@Sun.COM ire->ire_ill->ill_ire_cnt++; 133911042SErik.Nordmark@Sun.COM ASSERT(ire->ire_ill->ill_ire_cnt != 0); /* Wraparound */ 13400Sstevel@tonic-gate } 13410Sstevel@tonic-gate 13420Sstevel@tonic-gate ire_atomic_end(irb_ptr, ire); 134311042SErik.Nordmark@Sun.COM 134411042SErik.Nordmark@Sun.COM /* Make any caching of the IREs be notified or updated */ 134511042SErik.Nordmark@Sun.COM ire_flush_cache_v4(ire, IRE_FLUSH_ADD); 134611042SErik.Nordmark@Sun.COM 134711042SErik.Nordmark@Sun.COM if (ire->ire_ill != NULL) 134811042SErik.Nordmark@Sun.COM ASSERT(!MUTEX_HELD(&ire->ire_ill->ill_lock)); 134911042SErik.Nordmark@Sun.COM irb_refrele(irb_ptr); 135011042SErik.Nordmark@Sun.COM return (ire); 13510Sstevel@tonic-gate } 13520Sstevel@tonic-gate 13530Sstevel@tonic-gate /* 135411042SErik.Nordmark@Sun.COM * irb_refrele is the only caller of the function. ire_unlink calls to 13550Sstevel@tonic-gate * do the final cleanup for this ire. 
13560Sstevel@tonic-gate */ 13570Sstevel@tonic-gate void 13580Sstevel@tonic-gate ire_cleanup(ire_t *ire) 13590Sstevel@tonic-gate { 13600Sstevel@tonic-gate ire_t *ire_next; 13613448Sdh155122 ip_stack_t *ipst = ire->ire_ipst; 13620Sstevel@tonic-gate 13630Sstevel@tonic-gate ASSERT(ire != NULL); 13640Sstevel@tonic-gate 13650Sstevel@tonic-gate while (ire != NULL) { 13660Sstevel@tonic-gate ire_next = ire->ire_next; 13670Sstevel@tonic-gate if (ire->ire_ipversion == IPV4_VERSION) { 13680Sstevel@tonic-gate ire_delete_v4(ire); 13693448Sdh155122 BUMP_IRE_STATS(ipst->ips_ire_stats_v4, 13703448Sdh155122 ire_stats_deleted); 13710Sstevel@tonic-gate } else { 13720Sstevel@tonic-gate ASSERT(ire->ire_ipversion == IPV6_VERSION); 13730Sstevel@tonic-gate ire_delete_v6(ire); 13743448Sdh155122 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, 13753448Sdh155122 ire_stats_deleted); 13760Sstevel@tonic-gate } 13770Sstevel@tonic-gate /* 13780Sstevel@tonic-gate * Now it's really out of the list. Before doing the 13790Sstevel@tonic-gate * REFRELE, set ire_next to NULL as ire_inactive asserts 13800Sstevel@tonic-gate * so. 13810Sstevel@tonic-gate */ 13820Sstevel@tonic-gate ire->ire_next = NULL; 138311042SErik.Nordmark@Sun.COM ire_refrele_notr(ire); 13840Sstevel@tonic-gate ire = ire_next; 13850Sstevel@tonic-gate } 13860Sstevel@tonic-gate } 13870Sstevel@tonic-gate 13880Sstevel@tonic-gate /* 138911042SErik.Nordmark@Sun.COM * irb_refrele is the only caller of the function. It calls to unlink 13900Sstevel@tonic-gate * all the CONDEMNED ires from this bucket. 
13910Sstevel@tonic-gate */ 13920Sstevel@tonic-gate ire_t * 13930Sstevel@tonic-gate ire_unlink(irb_t *irb) 13940Sstevel@tonic-gate { 13950Sstevel@tonic-gate ire_t *ire; 13960Sstevel@tonic-gate ire_t *ire1; 13970Sstevel@tonic-gate ire_t **ptpn; 13980Sstevel@tonic-gate ire_t *ire_list = NULL; 13990Sstevel@tonic-gate 14000Sstevel@tonic-gate ASSERT(RW_WRITE_HELD(&irb->irb_lock)); 140111042SErik.Nordmark@Sun.COM ASSERT(((irb->irb_marks & IRB_MARK_DYNAMIC) && irb->irb_refcnt == 1) || 14022535Ssangeeta (irb->irb_refcnt == 0)); 14032535Ssangeeta ASSERT(irb->irb_marks & IRB_MARK_CONDEMNED); 14040Sstevel@tonic-gate ASSERT(irb->irb_ire != NULL); 14050Sstevel@tonic-gate 14060Sstevel@tonic-gate for (ire = irb->irb_ire; ire != NULL; ire = ire1) { 14070Sstevel@tonic-gate ire1 = ire->ire_next; 140811042SErik.Nordmark@Sun.COM if (IRE_IS_CONDEMNED(ire)) { 14090Sstevel@tonic-gate ptpn = ire->ire_ptpn; 14100Sstevel@tonic-gate ire1 = ire->ire_next; 14110Sstevel@tonic-gate if (ire1) 14120Sstevel@tonic-gate ire1->ire_ptpn = ptpn; 14130Sstevel@tonic-gate *ptpn = ire1; 14140Sstevel@tonic-gate ire->ire_ptpn = NULL; 14150Sstevel@tonic-gate ire->ire_next = NULL; 141611042SErik.Nordmark@Sun.COM 14170Sstevel@tonic-gate /* 141811042SErik.Nordmark@Sun.COM * We need to call ire_delete_v4 or ire_delete_v6 to 141911042SErik.Nordmark@Sun.COM * clean up dependents and the redirects pointing at 14200Sstevel@tonic-gate * the default gateway. We need to drop the lock 14210Sstevel@tonic-gate * as ire_flush_cache/ire_delete_host_redircts require 14220Sstevel@tonic-gate * so. But we can't drop the lock, as ire_unlink needs 14230Sstevel@tonic-gate * to atomically remove the ires from the list. 14240Sstevel@tonic-gate * So, create a temporary list of CONDEMNED ires 14250Sstevel@tonic-gate * for doing ire_delete_v4/ire_delete_v6 operations 14260Sstevel@tonic-gate * later on. 
14270Sstevel@tonic-gate */ 14280Sstevel@tonic-gate ire->ire_next = ire_list; 14290Sstevel@tonic-gate ire_list = ire; 14300Sstevel@tonic-gate } 14310Sstevel@tonic-gate } 14322535Ssangeeta irb->irb_marks &= ~IRB_MARK_CONDEMNED; 14330Sstevel@tonic-gate return (ire_list); 14340Sstevel@tonic-gate } 14350Sstevel@tonic-gate 14360Sstevel@tonic-gate /* 143711042SErik.Nordmark@Sun.COM * Clean up the radix node for this ire. Must be called by irb_refrele 14382535Ssangeeta * when there are no ire's left in the bucket. Returns TRUE if the bucket 14392535Ssangeeta * is deleted and freed. 14402535Ssangeeta */ 14412535Ssangeeta boolean_t 14422535Ssangeeta irb_inactive(irb_t *irb) 14432535Ssangeeta { 14442535Ssangeeta struct rt_entry *rt; 14452535Ssangeeta struct radix_node *rn; 14463448Sdh155122 ip_stack_t *ipst = irb->irb_ipst; 14473448Sdh155122 14483448Sdh155122 ASSERT(irb->irb_ipst != NULL); 14492535Ssangeeta 14502535Ssangeeta rt = IRB2RT(irb); 14512535Ssangeeta rn = (struct radix_node *)rt; 14522535Ssangeeta 14532535Ssangeeta /* first remove it from the radix tree. */ 14543448Sdh155122 RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable); 14552535Ssangeeta rw_enter(&irb->irb_lock, RW_WRITER); 14562535Ssangeeta if (irb->irb_refcnt == 1 && irb->irb_nire == 0) { 14573448Sdh155122 rn = ipst->ips_ip_ftable->rnh_deladdr(rn->rn_key, rn->rn_mask, 14583448Sdh155122 ipst->ips_ip_ftable); 14592535Ssangeeta DTRACE_PROBE1(irb__free, rt_t *, rt); 14602535Ssangeeta ASSERT((void *)rn == (void *)rt); 14612535Ssangeeta Free(rt, rt_entry_cache); 14622535Ssangeeta /* irb_lock is freed */ 14633448Sdh155122 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 14642535Ssangeeta return (B_TRUE); 14652535Ssangeeta } 14662535Ssangeeta rw_exit(&irb->irb_lock); 14673448Sdh155122 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 14682535Ssangeeta return (B_FALSE); 14690Sstevel@tonic-gate } 14700Sstevel@tonic-gate 14710Sstevel@tonic-gate /* 14720Sstevel@tonic-gate * Delete the specified IRE. 
147311042SErik.Nordmark@Sun.COM * We assume that if ire_bucket is not set then ire_ill->ill_ire_cnt was 147411042SErik.Nordmark@Sun.COM * not incremented i.e., that the insertion in the bucket and the increment 147511042SErik.Nordmark@Sun.COM * of that counter is done atomically. 14760Sstevel@tonic-gate */ 14770Sstevel@tonic-gate void 14780Sstevel@tonic-gate ire_delete(ire_t *ire) 14790Sstevel@tonic-gate { 14800Sstevel@tonic-gate ire_t *ire1; 14810Sstevel@tonic-gate ire_t **ptpn; 148211042SErik.Nordmark@Sun.COM irb_t *irb; 14833448Sdh155122 ip_stack_t *ipst = ire->ire_ipst; 14840Sstevel@tonic-gate 14850Sstevel@tonic-gate if ((irb = ire->ire_bucket) == NULL) { 14862535Ssangeeta /* 14872535Ssangeeta * It was never inserted in the list. Should call REFRELE 14882535Ssangeeta * to free this IRE. 14892535Ssangeeta */ 149011463SSowmini.Varadhan@Sun.COM ire_make_condemned(ire); 149111042SErik.Nordmark@Sun.COM ire_refrele_notr(ire); 14920Sstevel@tonic-gate return; 14930Sstevel@tonic-gate } 14940Sstevel@tonic-gate 149511042SErik.Nordmark@Sun.COM /* 149611042SErik.Nordmark@Sun.COM * Move the use counts from an IRE_IF_CLONE to its parent 149711042SErik.Nordmark@Sun.COM * IRE_INTERFACE. 149811042SErik.Nordmark@Sun.COM * We need to do this before acquiring irb_lock. 
149911042SErik.Nordmark@Sun.COM */ 150011042SErik.Nordmark@Sun.COM if (ire->ire_type & IRE_IF_CLONE) { 150111042SErik.Nordmark@Sun.COM ire_t *parent; 150211042SErik.Nordmark@Sun.COM 150311042SErik.Nordmark@Sun.COM rw_enter(&ipst->ips_ire_dep_lock, RW_READER); 150411042SErik.Nordmark@Sun.COM if ((parent = ire->ire_dep_parent) != NULL) { 150511042SErik.Nordmark@Sun.COM parent->ire_ob_pkt_count += ire->ire_ob_pkt_count; 150611042SErik.Nordmark@Sun.COM parent->ire_ib_pkt_count += ire->ire_ib_pkt_count; 150711042SErik.Nordmark@Sun.COM ire->ire_ob_pkt_count = 0; 150811042SErik.Nordmark@Sun.COM ire->ire_ib_pkt_count = 0; 150911042SErik.Nordmark@Sun.COM } 151011042SErik.Nordmark@Sun.COM rw_exit(&ipst->ips_ire_dep_lock); 151111042SErik.Nordmark@Sun.COM } 151211042SErik.Nordmark@Sun.COM 15130Sstevel@tonic-gate rw_enter(&irb->irb_lock, RW_WRITER); 15140Sstevel@tonic-gate if (ire->ire_ptpn == NULL) { 15150Sstevel@tonic-gate /* 15160Sstevel@tonic-gate * Some other thread has removed us from the list. 15170Sstevel@tonic-gate * It should have done the REFRELE for us. 15180Sstevel@tonic-gate */ 15190Sstevel@tonic-gate rw_exit(&irb->irb_lock); 15200Sstevel@tonic-gate return; 15210Sstevel@tonic-gate } 15220Sstevel@tonic-gate 152311042SErik.Nordmark@Sun.COM if (!IRE_IS_CONDEMNED(ire)) { 152411042SErik.Nordmark@Sun.COM /* Is this an IRE representing multiple duplicate entries? 
*/ 152511042SErik.Nordmark@Sun.COM ASSERT(ire->ire_identical_ref >= 1); 152611042SErik.Nordmark@Sun.COM if (atomic_add_32_nv(&ire->ire_identical_ref, -1) != 0) { 152711042SErik.Nordmark@Sun.COM /* Removed one of the identical parties */ 152811042SErik.Nordmark@Sun.COM rw_exit(&irb->irb_lock); 152911042SErik.Nordmark@Sun.COM return; 153011042SErik.Nordmark@Sun.COM } 153111042SErik.Nordmark@Sun.COM 15325388Sja97890 irb->irb_ire_cnt--; 153311042SErik.Nordmark@Sun.COM ire_make_condemned(ire); 15345388Sja97890 } 15355388Sja97890 15360Sstevel@tonic-gate if (irb->irb_refcnt != 0) { 15370Sstevel@tonic-gate /* 15380Sstevel@tonic-gate * The last thread to leave this bucket will 15390Sstevel@tonic-gate * delete this ire. 15400Sstevel@tonic-gate */ 15412535Ssangeeta irb->irb_marks |= IRB_MARK_CONDEMNED; 15420Sstevel@tonic-gate rw_exit(&irb->irb_lock); 15430Sstevel@tonic-gate return; 15440Sstevel@tonic-gate } 15450Sstevel@tonic-gate 15460Sstevel@tonic-gate /* 15470Sstevel@tonic-gate * Normally to delete an ire, we walk the bucket. While we 15480Sstevel@tonic-gate * walk the bucket, we normally bump up irb_refcnt and hence 15490Sstevel@tonic-gate * we return from above where we mark CONDEMNED and the ire 15500Sstevel@tonic-gate * gets deleted from ire_unlink. This case is where somebody 15510Sstevel@tonic-gate * knows the ire e.g by doing a lookup, and wants to delete the 15520Sstevel@tonic-gate * IRE. irb_refcnt would be 0 in this case if nobody is walking 15530Sstevel@tonic-gate * the bucket. 
15540Sstevel@tonic-gate */ 15550Sstevel@tonic-gate ptpn = ire->ire_ptpn; 15560Sstevel@tonic-gate ire1 = ire->ire_next; 15570Sstevel@tonic-gate if (ire1 != NULL) 15580Sstevel@tonic-gate ire1->ire_ptpn = ptpn; 15590Sstevel@tonic-gate ASSERT(ptpn != NULL); 15600Sstevel@tonic-gate *ptpn = ire1; 15610Sstevel@tonic-gate ire->ire_ptpn = NULL; 15620Sstevel@tonic-gate ire->ire_next = NULL; 15630Sstevel@tonic-gate if (ire->ire_ipversion == IPV6_VERSION) { 15643448Sdh155122 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_deleted); 15650Sstevel@tonic-gate } else { 15663448Sdh155122 BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_deleted); 15670Sstevel@tonic-gate } 15680Sstevel@tonic-gate rw_exit(&irb->irb_lock); 15690Sstevel@tonic-gate 157011042SErik.Nordmark@Sun.COM /* Cleanup dependents and related stuff */ 15710Sstevel@tonic-gate if (ire->ire_ipversion == IPV6_VERSION) { 15720Sstevel@tonic-gate ire_delete_v6(ire); 15730Sstevel@tonic-gate } else { 15740Sstevel@tonic-gate ire_delete_v4(ire); 15750Sstevel@tonic-gate } 15760Sstevel@tonic-gate /* 15770Sstevel@tonic-gate * We removed it from the list. Decrement the 15780Sstevel@tonic-gate * reference count. 15790Sstevel@tonic-gate */ 158011042SErik.Nordmark@Sun.COM ire_refrele_notr(ire); 15810Sstevel@tonic-gate } 15820Sstevel@tonic-gate 15830Sstevel@tonic-gate /* 15840Sstevel@tonic-gate * Delete the specified IRE. 15850Sstevel@tonic-gate * All calls should use ire_delete(). 15860Sstevel@tonic-gate * Sometimes called as writer though not required by this function. 15870Sstevel@tonic-gate * 15880Sstevel@tonic-gate * NOTE : This function is called only if the ire was added 15890Sstevel@tonic-gate * in the list. 
15900Sstevel@tonic-gate */ 15910Sstevel@tonic-gate static void 15920Sstevel@tonic-gate ire_delete_v4(ire_t *ire) 15930Sstevel@tonic-gate { 15943448Sdh155122 ip_stack_t *ipst = ire->ire_ipst; 15953448Sdh155122 15960Sstevel@tonic-gate ASSERT(ire->ire_refcnt >= 1); 15970Sstevel@tonic-gate ASSERT(ire->ire_ipversion == IPV4_VERSION); 15980Sstevel@tonic-gate 159911042SErik.Nordmark@Sun.COM ire_flush_cache_v4(ire, IRE_FLUSH_DELETE); 16000Sstevel@tonic-gate if (ire->ire_type == IRE_DEFAULT) { 16010Sstevel@tonic-gate /* 16020Sstevel@tonic-gate * when a default gateway is going away 16030Sstevel@tonic-gate * delete all the host redirects pointing at that 16040Sstevel@tonic-gate * gateway. 16050Sstevel@tonic-gate */ 16063448Sdh155122 ire_delete_host_redirects(ire->ire_gateway_addr, ipst); 16070Sstevel@tonic-gate } 160811042SErik.Nordmark@Sun.COM 160911042SErik.Nordmark@Sun.COM /* 161011042SErik.Nordmark@Sun.COM * If we are deleting an IRE_INTERFACE then we make sure we also 161111042SErik.Nordmark@Sun.COM * delete any IRE_IF_CLONE that has been created from it. 161211042SErik.Nordmark@Sun.COM * Those are always in ire_dep_children. 
161311042SErik.Nordmark@Sun.COM */ 161411042SErik.Nordmark@Sun.COM if ((ire->ire_type & IRE_INTERFACE) && ire->ire_dep_children != NULL) 161511042SErik.Nordmark@Sun.COM ire_dep_delete_if_clone(ire); 161611042SErik.Nordmark@Sun.COM 161711042SErik.Nordmark@Sun.COM /* Remove from parent dependencies and child */ 161811042SErik.Nordmark@Sun.COM rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER); 161911042SErik.Nordmark@Sun.COM if (ire->ire_dep_parent != NULL) 162011042SErik.Nordmark@Sun.COM ire_dep_remove(ire); 162111042SErik.Nordmark@Sun.COM 162211042SErik.Nordmark@Sun.COM while (ire->ire_dep_children != NULL) 162311042SErik.Nordmark@Sun.COM ire_dep_remove(ire->ire_dep_children); 162411042SErik.Nordmark@Sun.COM rw_exit(&ipst->ips_ire_dep_lock); 16250Sstevel@tonic-gate } 16260Sstevel@tonic-gate 16270Sstevel@tonic-gate /* 162811042SErik.Nordmark@Sun.COM * ire_refrele is the only caller of the function. It calls 16290Sstevel@tonic-gate * to free the ire when the reference count goes to zero. 16300Sstevel@tonic-gate */ 16310Sstevel@tonic-gate void 16320Sstevel@tonic-gate ire_inactive(ire_t *ire) 16330Sstevel@tonic-gate { 163411042SErik.Nordmark@Sun.COM ill_t *ill; 16352535Ssangeeta irb_t *irb; 16363448Sdh155122 ip_stack_t *ipst = ire->ire_ipst; 16370Sstevel@tonic-gate 16380Sstevel@tonic-gate ASSERT(ire->ire_refcnt == 0); 16390Sstevel@tonic-gate ASSERT(ire->ire_ptpn == NULL); 16400Sstevel@tonic-gate ASSERT(ire->ire_next == NULL); 16410Sstevel@tonic-gate 164211042SErik.Nordmark@Sun.COM /* Count how many condemned ires for kmem_cache callback */ 164311463SSowmini.Varadhan@Sun.COM ASSERT(IRE_IS_CONDEMNED(ire)); 164411463SSowmini.Varadhan@Sun.COM atomic_add_32(&ipst->ips_num_ire_condemned, -1); 164511042SErik.Nordmark@Sun.COM 16462535Ssangeeta if (ire->ire_gw_secattr != NULL) { 16472535Ssangeeta ire_gw_secattr_free(ire->ire_gw_secattr); 16482535Ssangeeta ire->ire_gw_secattr = NULL; 16492535Ssangeeta } 16502535Ssangeeta 165111042SErik.Nordmark@Sun.COM /* 
165211042SErik.Nordmark@Sun.COM * ire_nce_cache is cleared in ire_delete, and we make sure we don't 165311042SErik.Nordmark@Sun.COM * set it once the ire is marked condemned. 165411042SErik.Nordmark@Sun.COM */ 165511042SErik.Nordmark@Sun.COM ASSERT(ire->ire_nce_cache == NULL); 165611042SErik.Nordmark@Sun.COM 165711042SErik.Nordmark@Sun.COM /* 165811042SErik.Nordmark@Sun.COM * Since any parent would have a refhold on us they would already 165911042SErik.Nordmark@Sun.COM * have been removed. 166011042SErik.Nordmark@Sun.COM */ 166111042SErik.Nordmark@Sun.COM ASSERT(ire->ire_dep_parent == NULL); 166211042SErik.Nordmark@Sun.COM ASSERT(ire->ire_dep_sib_next == NULL); 166311042SErik.Nordmark@Sun.COM ASSERT(ire->ire_dep_sib_ptpn == NULL); 16640Sstevel@tonic-gate 16650Sstevel@tonic-gate /* 166611042SErik.Nordmark@Sun.COM * Since any children would have a refhold on us they should have 166711042SErik.Nordmark@Sun.COM * already been removed. 166811042SErik.Nordmark@Sun.COM */ 166911042SErik.Nordmark@Sun.COM ASSERT(ire->ire_dep_children == NULL); 167011042SErik.Nordmark@Sun.COM 167111042SErik.Nordmark@Sun.COM /* 167211042SErik.Nordmark@Sun.COM * ill_ire_ref is increased when the IRE is inserted in the 167311042SErik.Nordmark@Sun.COM * bucket - not when the IRE is created. 
16740Sstevel@tonic-gate */ 167511042SErik.Nordmark@Sun.COM irb = ire->ire_bucket; 167611042SErik.Nordmark@Sun.COM ill = ire->ire_ill; 167711042SErik.Nordmark@Sun.COM if (irb != NULL && ill != NULL) { 16780Sstevel@tonic-gate mutex_enter(&ill->ill_lock); 167911042SErik.Nordmark@Sun.COM ASSERT(ill->ill_ire_cnt != 0); 168011042SErik.Nordmark@Sun.COM DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill, 16816255Ssowmini (char *), "ire", (void *), ire); 168211042SErik.Nordmark@Sun.COM ill->ill_ire_cnt--; 168311042SErik.Nordmark@Sun.COM if (ILL_DOWN_OK(ill)) { 16840Sstevel@tonic-gate /* Drops the ill lock */ 16850Sstevel@tonic-gate ipif_ill_refrele_tail(ill); 16860Sstevel@tonic-gate } else { 16870Sstevel@tonic-gate mutex_exit(&ill->ill_lock); 16880Sstevel@tonic-gate } 16890Sstevel@tonic-gate } 169011042SErik.Nordmark@Sun.COM ire->ire_ill = NULL; 169111042SErik.Nordmark@Sun.COM 16920Sstevel@tonic-gate /* This should be true for both V4 and V6 */ 169311042SErik.Nordmark@Sun.COM if (irb != NULL && (irb->irb_marks & IRB_MARK_DYNAMIC)) { 16942535Ssangeeta rw_enter(&irb->irb_lock, RW_WRITER); 16952535Ssangeeta irb->irb_nire--; 16962535Ssangeeta /* 16972535Ssangeeta * Instead of examining the conditions for freeing 16982535Ssangeeta * the radix node here, we do it by calling 169911042SErik.Nordmark@Sun.COM * irb_refrele which is a single point in the code 17002535Ssangeeta * that embeds that logic. 
Bump up the refcnt to 170111042SErik.Nordmark@Sun.COM * be able to call irb_refrele 17022535Ssangeeta */ 170311042SErik.Nordmark@Sun.COM irb_refhold_locked(irb); 17042535Ssangeeta rw_exit(&irb->irb_lock); 170511042SErik.Nordmark@Sun.COM irb_refrele(irb); 17062535Ssangeeta } 17070Sstevel@tonic-gate 17085023Scarlsonj #ifdef DEBUG 17095023Scarlsonj ire_trace_cleanup(ire); 17100Sstevel@tonic-gate #endif 17110Sstevel@tonic-gate mutex_destroy(&ire->ire_lock); 17120Sstevel@tonic-gate if (ire->ire_ipversion == IPV6_VERSION) { 17133448Sdh155122 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_freed); 17140Sstevel@tonic-gate } else { 17153448Sdh155122 BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_freed); 17160Sstevel@tonic-gate } 17172535Ssangeeta kmem_cache_free(ire_cache, ire); 17180Sstevel@tonic-gate } 17190Sstevel@tonic-gate 17200Sstevel@tonic-gate /* 172111042SErik.Nordmark@Sun.COM * ire_update_generation is the callback function provided by 172211042SErik.Nordmark@Sun.COM * ire_get_bucket() to update the generation number of any 172311042SErik.Nordmark@Sun.COM * matching shorter route when a new route is added. 172411042SErik.Nordmark@Sun.COM * 172511042SErik.Nordmark@Sun.COM * This fucntion always returns a failure return (B_FALSE) 172611042SErik.Nordmark@Sun.COM * to force the caller (rn_matchaddr_args) 172711042SErik.Nordmark@Sun.COM * to back-track up the tree looking for shorter matches. 
17280Sstevel@tonic-gate */ 172911042SErik.Nordmark@Sun.COM /* ARGSUSED */ 173011042SErik.Nordmark@Sun.COM static boolean_t 173111042SErik.Nordmark@Sun.COM ire_update_generation(struct radix_node *rn, void *arg) 17320Sstevel@tonic-gate { 173311042SErik.Nordmark@Sun.COM struct rt_entry *rt = (struct rt_entry *)rn; 173411042SErik.Nordmark@Sun.COM 173511042SErik.Nordmark@Sun.COM /* We need to handle all in the same bucket */ 173611042SErik.Nordmark@Sun.COM irb_increment_generation(&rt->rt_irb); 173711042SErik.Nordmark@Sun.COM return (B_FALSE); 17380Sstevel@tonic-gate } 17390Sstevel@tonic-gate 17400Sstevel@tonic-gate /* 174111042SErik.Nordmark@Sun.COM * Take care of all the generation numbers in the bucket. 174211042SErik.Nordmark@Sun.COM */ 174311042SErik.Nordmark@Sun.COM void 174411042SErik.Nordmark@Sun.COM irb_increment_generation(irb_t *irb) 174511042SErik.Nordmark@Sun.COM { 174611042SErik.Nordmark@Sun.COM ire_t *ire; 174711463SSowmini.Varadhan@Sun.COM ip_stack_t *ipst; 174811042SErik.Nordmark@Sun.COM 174911042SErik.Nordmark@Sun.COM if (irb == NULL || irb->irb_ire_cnt == 0) 175011042SErik.Nordmark@Sun.COM return; 175111042SErik.Nordmark@Sun.COM 175211463SSowmini.Varadhan@Sun.COM ipst = irb->irb_ipst; 175311463SSowmini.Varadhan@Sun.COM /* 175411463SSowmini.Varadhan@Sun.COM * we cannot do an irb_refhold/irb_refrele here as the caller 175511463SSowmini.Varadhan@Sun.COM * already has the global RADIX_NODE_HEAD_WLOCK, and the irb_refrele 175611463SSowmini.Varadhan@Sun.COM * may result in an attempt to free the irb_t, which also needs 175711463SSowmini.Varadhan@Sun.COM * the RADIX_NODE_HEAD lock. However, since we want to traverse the 175811463SSowmini.Varadhan@Sun.COM * irb_ire list without fear of having a condemned ire removed from 175911463SSowmini.Varadhan@Sun.COM * the list, we acquire the irb_lock as WRITER. 
Moreover, since 176011463SSowmini.Varadhan@Sun.COM * the ire_generation increments are done under the ire_dep_lock, 176111463SSowmini.Varadhan@Sun.COM * acquire the locks in the prescribed lock order first. 176211463SSowmini.Varadhan@Sun.COM */ 176311463SSowmini.Varadhan@Sun.COM rw_enter(&ipst->ips_ire_dep_lock, RW_READER); 176411463SSowmini.Varadhan@Sun.COM rw_enter(&irb->irb_lock, RW_WRITER); 176511042SErik.Nordmark@Sun.COM for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { 176611042SErik.Nordmark@Sun.COM if (!IRE_IS_CONDEMNED(ire)) 176711042SErik.Nordmark@Sun.COM ire_increment_generation(ire); /* Ourselves */ 176811463SSowmini.Varadhan@Sun.COM ire_dep_incr_generation_locked(ire); /* Dependants */ 176911042SErik.Nordmark@Sun.COM } 177011463SSowmini.Varadhan@Sun.COM rw_exit(&irb->irb_lock); 177111463SSowmini.Varadhan@Sun.COM rw_exit(&ipst->ips_ire_dep_lock); 177211042SErik.Nordmark@Sun.COM } 177311042SErik.Nordmark@Sun.COM 177411042SErik.Nordmark@Sun.COM /* 177511042SErik.Nordmark@Sun.COM * When an IRE is added or deleted this routine is called to make sure 177611042SErik.Nordmark@Sun.COM * any caching of IRE information is notified or updated. 17770Sstevel@tonic-gate * 17780Sstevel@tonic-gate * The flag argument indicates if the flush request is due to addition 177911042SErik.Nordmark@Sun.COM * of new route (IRE_FLUSH_ADD), deletion of old route (IRE_FLUSH_DELETE), 178011042SErik.Nordmark@Sun.COM * or a change to ire_gateway_addr (IRE_FLUSH_GWCHANGE). 
17810Sstevel@tonic-gate */ 17820Sstevel@tonic-gate void 17830Sstevel@tonic-gate ire_flush_cache_v4(ire_t *ire, int flag) 17840Sstevel@tonic-gate { 178511042SErik.Nordmark@Sun.COM irb_t *irb = ire->ire_bucket; 178611042SErik.Nordmark@Sun.COM struct rt_entry *rt = IRB2RT(irb); 178711042SErik.Nordmark@Sun.COM ip_stack_t *ipst = ire->ire_ipst; 178811042SErik.Nordmark@Sun.COM 178911042SErik.Nordmark@Sun.COM /* 179011042SErik.Nordmark@Sun.COM * IRE_IF_CLONE ire's don't provide any new information 179111042SErik.Nordmark@Sun.COM * than the parent from which they are cloned, so don't 179211042SErik.Nordmark@Sun.COM * perturb the generation numbers. 179311042SErik.Nordmark@Sun.COM */ 179411042SErik.Nordmark@Sun.COM if (ire->ire_type & IRE_IF_CLONE) 17954714Ssowmini return; 17960Sstevel@tonic-gate 17970Sstevel@tonic-gate /* 179811042SErik.Nordmark@Sun.COM * Ensure that an ire_add during a lookup serializes the updates of the 179911042SErik.Nordmark@Sun.COM * generation numbers under the radix head lock so that the lookup gets 180011042SErik.Nordmark@Sun.COM * either the old ire and old generation number, or a new ire and new 180111042SErik.Nordmark@Sun.COM * generation number. 180211042SErik.Nordmark@Sun.COM */ 180311042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable); 180411042SErik.Nordmark@Sun.COM 180511042SErik.Nordmark@Sun.COM /* 180611042SErik.Nordmark@Sun.COM * If a route was just added, we need to notify everybody that 180711042SErik.Nordmark@Sun.COM * has cached an IRE_NOROUTE since there might now be a better 180811042SErik.Nordmark@Sun.COM * route for them. 
18090Sstevel@tonic-gate */ 181011042SErik.Nordmark@Sun.COM if (flag == IRE_FLUSH_ADD) { 181111042SErik.Nordmark@Sun.COM ire_increment_generation(ipst->ips_ire_reject_v4); 181211042SErik.Nordmark@Sun.COM ire_increment_generation(ipst->ips_ire_blackhole_v4); 181311042SErik.Nordmark@Sun.COM } 181411042SErik.Nordmark@Sun.COM 181511042SErik.Nordmark@Sun.COM /* Adding a default can't otherwise provide a better route */ 181611042SErik.Nordmark@Sun.COM if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) { 181711042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 18180Sstevel@tonic-gate return; 181911042SErik.Nordmark@Sun.COM } 182011042SErik.Nordmark@Sun.COM 182111042SErik.Nordmark@Sun.COM switch (flag) { 182211042SErik.Nordmark@Sun.COM case IRE_FLUSH_DELETE: 182311042SErik.Nordmark@Sun.COM case IRE_FLUSH_GWCHANGE: 18240Sstevel@tonic-gate /* 182511042SErik.Nordmark@Sun.COM * Update ire_generation for all ire_dep_children chains 182611042SErik.Nordmark@Sun.COM * starting with this IRE 18270Sstevel@tonic-gate */ 182811042SErik.Nordmark@Sun.COM ire_dep_incr_generation(ire); 182911042SErik.Nordmark@Sun.COM break; 183011042SErik.Nordmark@Sun.COM case IRE_FLUSH_ADD: 18310Sstevel@tonic-gate /* 183211042SErik.Nordmark@Sun.COM * Update the generation numbers of all shorter matching routes. 183311042SErik.Nordmark@Sun.COM * ire_update_generation takes care of the dependants by 183411042SErik.Nordmark@Sun.COM * using ire_dep_incr_generation. 18350Sstevel@tonic-gate */ 183611042SErik.Nordmark@Sun.COM (void) ipst->ips_ip_ftable->rnh_matchaddr_args(&rt->rt_dst, 183711042SErik.Nordmark@Sun.COM ipst->ips_ip_ftable, ire_update_generation, NULL); 183811042SErik.Nordmark@Sun.COM break; 18390Sstevel@tonic-gate } 184011042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 18410Sstevel@tonic-gate } 18420Sstevel@tonic-gate 18430Sstevel@tonic-gate /* 18440Sstevel@tonic-gate * Matches the arguments passed with the values in the ire. 
18450Sstevel@tonic-gate * 184611042SErik.Nordmark@Sun.COM * Note: for match types that match using "ill" passed in, ill 18470Sstevel@tonic-gate * must be checked for non-NULL before calling this routine. 18480Sstevel@tonic-gate */ 18492535Ssangeeta boolean_t 18500Sstevel@tonic-gate ire_match_args(ire_t *ire, ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, 185111042SErik.Nordmark@Sun.COM int type, const ill_t *ill, zoneid_t zoneid, 185211042SErik.Nordmark@Sun.COM const ts_label_t *tsl, int match_flags) 18530Sstevel@tonic-gate { 18540Sstevel@tonic-gate ill_t *ire_ill = NULL, *dst_ill; 185511042SErik.Nordmark@Sun.COM ip_stack_t *ipst = ire->ire_ipst; 18560Sstevel@tonic-gate 18570Sstevel@tonic-gate ASSERT(ire->ire_ipversion == IPV4_VERSION); 18580Sstevel@tonic-gate ASSERT((ire->ire_addr & ~ire->ire_mask) == 0); 1859*11681SSowmini.Varadhan@Sun.COM ASSERT((!(match_flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL))) || 186011042SErik.Nordmark@Sun.COM (ill != NULL && !ill->ill_isv6)); 18610Sstevel@tonic-gate 18620Sstevel@tonic-gate /* 186311042SErik.Nordmark@Sun.COM * If MATCH_IRE_TESTHIDDEN is set, then only return the IRE if it is 186411042SErik.Nordmark@Sun.COM * in fact hidden, to ensure the caller gets the right one. 18650Sstevel@tonic-gate */ 186611042SErik.Nordmark@Sun.COM if (ire->ire_testhidden) { 186711042SErik.Nordmark@Sun.COM if (!(match_flags & MATCH_IRE_TESTHIDDEN)) 18688485SPeter.Memishian@Sun.COM return (B_FALSE); 18698485SPeter.Memishian@Sun.COM } 18700Sstevel@tonic-gate 18711676Sjpk if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid && 18721676Sjpk ire->ire_zoneid != ALL_ZONES) { 18730Sstevel@tonic-gate /* 187411042SErik.Nordmark@Sun.COM * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid 187511042SErik.Nordmark@Sun.COM * does not match that of ire_zoneid, a failure to 18760Sstevel@tonic-gate * match is reported at this point. 
Otherwise, since some IREs 18770Sstevel@tonic-gate * that are available in the global zone can be used in local 18780Sstevel@tonic-gate * zones, additional checks need to be performed: 18790Sstevel@tonic-gate * 188011042SErik.Nordmark@Sun.COM * IRE_LOOPBACK 18810Sstevel@tonic-gate * entries should never be matched in this situation. 188211042SErik.Nordmark@Sun.COM * Each zone has its own IRE_LOOPBACK. 18830Sstevel@tonic-gate * 188411042SErik.Nordmark@Sun.COM * IRE_LOCAL 188511042SErik.Nordmark@Sun.COM * We allow them for any zoneid. ire_route_recursive 188611042SErik.Nordmark@Sun.COM * does additional checks when 188711042SErik.Nordmark@Sun.COM * ip_restrict_interzone_loopback is set. 18880Sstevel@tonic-gate * 188911042SErik.Nordmark@Sun.COM * If ill_usesrc_ifindex is set 189011042SErik.Nordmark@Sun.COM * Then we check if the zone has a valid source address 189111042SErik.Nordmark@Sun.COM * on the usesrc ill. 18920Sstevel@tonic-gate * 189311042SErik.Nordmark@Sun.COM * If ire_ill is set, then check that the zone has an ipif 189411042SErik.Nordmark@Sun.COM * on that ill. 189511042SErik.Nordmark@Sun.COM * 189611042SErik.Nordmark@Sun.COM * Outside of this function (in ire_round_robin) we check 189711042SErik.Nordmark@Sun.COM * that any IRE_OFFLINK has a gateway that reachable from the 189811042SErik.Nordmark@Sun.COM * zone when we have multiple choices (ECMP). 18990Sstevel@tonic-gate */ 19000Sstevel@tonic-gate if (match_flags & MATCH_IRE_ZONEONLY) 19010Sstevel@tonic-gate return (B_FALSE); 190211042SErik.Nordmark@Sun.COM if (ire->ire_type & IRE_LOOPBACK) 19030Sstevel@tonic-gate return (B_FALSE); 190411042SErik.Nordmark@Sun.COM 190511042SErik.Nordmark@Sun.COM if (ire->ire_type & IRE_LOCAL) 190611042SErik.Nordmark@Sun.COM goto matchit; 190711042SErik.Nordmark@Sun.COM 19080Sstevel@tonic-gate /* 190911042SErik.Nordmark@Sun.COM * The normal case of IRE_ONLINK has a matching zoneid. 
191011042SErik.Nordmark@Sun.COM * Here we handle the case when shared-IP zones have been 191111042SErik.Nordmark@Sun.COM * configured with IP addresses on vniN. In that case it 191211042SErik.Nordmark@Sun.COM * is ok for traffic from a zone to use IRE_ONLINK routes 191311042SErik.Nordmark@Sun.COM * if the ill has a usesrc pointing at vniN 19140Sstevel@tonic-gate */ 191511042SErik.Nordmark@Sun.COM dst_ill = ire->ire_ill; 191611042SErik.Nordmark@Sun.COM if (ire->ire_type & IRE_ONLINK) { 191711042SErik.Nordmark@Sun.COM uint_t ifindex; 191811042SErik.Nordmark@Sun.COM 191911042SErik.Nordmark@Sun.COM /* 192011042SErik.Nordmark@Sun.COM * Note there is no IRE_INTERFACE on vniN thus 192111042SErik.Nordmark@Sun.COM * can't do an IRE lookup for a matching route. 192211042SErik.Nordmark@Sun.COM */ 192311042SErik.Nordmark@Sun.COM ifindex = dst_ill->ill_usesrc_ifindex; 192411042SErik.Nordmark@Sun.COM if (ifindex == 0) 192511042SErik.Nordmark@Sun.COM return (B_FALSE); 192611042SErik.Nordmark@Sun.COM 19270Sstevel@tonic-gate /* 19280Sstevel@tonic-gate * If there is a usable source address in the 192911042SErik.Nordmark@Sun.COM * zone, then it's ok to return this IRE_INTERFACE 19300Sstevel@tonic-gate */ 193111042SErik.Nordmark@Sun.COM if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6, 193211042SErik.Nordmark@Sun.COM zoneid, ipst)) { 193311042SErik.Nordmark@Sun.COM ip3dbg(("ire_match_args: no usrsrc for zone" 19340Sstevel@tonic-gate " dst_ill %p\n", (void *)dst_ill)); 19350Sstevel@tonic-gate return (B_FALSE); 19360Sstevel@tonic-gate } 19370Sstevel@tonic-gate } 193811042SErik.Nordmark@Sun.COM /* 1939*11681SSowmini.Varadhan@Sun.COM * For example, with 194011042SErik.Nordmark@Sun.COM * route add 11.0.0.0 gw1 -ifp bge0 194111042SErik.Nordmark@Sun.COM * route add 11.0.0.0 gw2 -ifp bge1 194211042SErik.Nordmark@Sun.COM * this code would differentiate based on 194311042SErik.Nordmark@Sun.COM * where the sending zone has addresses. 
194411042SErik.Nordmark@Sun.COM * Only if the zone has an address on bge0 can it use the first 194511042SErik.Nordmark@Sun.COM * route. It isn't clear if this behavior is documented 194611042SErik.Nordmark@Sun.COM * anywhere. 194711042SErik.Nordmark@Sun.COM */ 194811042SErik.Nordmark@Sun.COM if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) { 19490Sstevel@tonic-gate ipif_t *tipif; 19500Sstevel@tonic-gate 195111042SErik.Nordmark@Sun.COM mutex_enter(&dst_ill->ill_lock); 195211042SErik.Nordmark@Sun.COM for (tipif = dst_ill->ill_ipif; 19530Sstevel@tonic-gate tipif != NULL; tipif = tipif->ipif_next) { 195411042SErik.Nordmark@Sun.COM if (!IPIF_IS_CONDEMNED(tipif) && 19550Sstevel@tonic-gate (tipif->ipif_flags & IPIF_UP) && 19561676Sjpk (tipif->ipif_zoneid == zoneid || 19571676Sjpk tipif->ipif_zoneid == ALL_ZONES)) 19580Sstevel@tonic-gate break; 19590Sstevel@tonic-gate } 196011042SErik.Nordmark@Sun.COM mutex_exit(&dst_ill->ill_lock); 19610Sstevel@tonic-gate if (tipif == NULL) { 19620Sstevel@tonic-gate return (B_FALSE); 19630Sstevel@tonic-gate } 19640Sstevel@tonic-gate } 19650Sstevel@tonic-gate } 19660Sstevel@tonic-gate 196711042SErik.Nordmark@Sun.COM matchit: 1968*11681SSowmini.Varadhan@Sun.COM ire_ill = ire->ire_ill; 19698485SPeter.Memishian@Sun.COM if (match_flags & MATCH_IRE_ILL) { 197011042SErik.Nordmark@Sun.COM 197111042SErik.Nordmark@Sun.COM /* 197211042SErik.Nordmark@Sun.COM * If asked to match an ill, we *must* match 197311042SErik.Nordmark@Sun.COM * on the ire_ill for ipmp test addresses, or 197411042SErik.Nordmark@Sun.COM * any of the ill in the group for data addresses. 197511042SErik.Nordmark@Sun.COM * If we don't, we may as well fail. 197611042SErik.Nordmark@Sun.COM * However, we need an exception for IRE_LOCALs to ensure 197711042SErik.Nordmark@Sun.COM * we loopback packets even sent to test addresses on different 197811042SErik.Nordmark@Sun.COM * interfaces in the group. 
197911042SErik.Nordmark@Sun.COM */ 198011042SErik.Nordmark@Sun.COM if ((match_flags & MATCH_IRE_TESTHIDDEN) && 198111042SErik.Nordmark@Sun.COM !(ire->ire_type & IRE_LOCAL)) { 198211042SErik.Nordmark@Sun.COM if (ire->ire_ill != ill) 198311042SErik.Nordmark@Sun.COM return (B_FALSE); 198411042SErik.Nordmark@Sun.COM } else { 198511042SErik.Nordmark@Sun.COM match_flags &= ~MATCH_IRE_TESTHIDDEN; 198611042SErik.Nordmark@Sun.COM /* 198711042SErik.Nordmark@Sun.COM * We know that ill is not NULL, but ire_ill could be 198811042SErik.Nordmark@Sun.COM * NULL 198911042SErik.Nordmark@Sun.COM */ 199011042SErik.Nordmark@Sun.COM if (ire_ill == NULL || !IS_ON_SAME_LAN(ill, ire_ill)) 199111042SErik.Nordmark@Sun.COM return (B_FALSE); 199211042SErik.Nordmark@Sun.COM } 19930Sstevel@tonic-gate } 1994*11681SSowmini.Varadhan@Sun.COM if (match_flags & MATCH_IRE_SRC_ILL) { 1995*11681SSowmini.Varadhan@Sun.COM if (ire_ill == NULL) 1996*11681SSowmini.Varadhan@Sun.COM return (B_FALSE); 1997*11681SSowmini.Varadhan@Sun.COM if (!IS_ON_SAME_LAN(ill, ire_ill)) { 1998*11681SSowmini.Varadhan@Sun.COM if (ire_ill->ill_usesrc_ifindex == 0 || 1999*11681SSowmini.Varadhan@Sun.COM (ire_ill->ill_usesrc_ifindex != 2000*11681SSowmini.Varadhan@Sun.COM ill->ill_phyint->phyint_ifindex)) 2001*11681SSowmini.Varadhan@Sun.COM return (B_FALSE); 2002*11681SSowmini.Varadhan@Sun.COM } 2003*11681SSowmini.Varadhan@Sun.COM } 20040Sstevel@tonic-gate 20050Sstevel@tonic-gate if ((ire->ire_addr == (addr & mask)) && 20060Sstevel@tonic-gate ((!(match_flags & MATCH_IRE_GW)) || 20074714Ssowmini (ire->ire_gateway_addr == gateway)) && 200811042SErik.Nordmark@Sun.COM ((!(match_flags & MATCH_IRE_TYPE)) || (ire->ire_type & type)) && 200911042SErik.Nordmark@Sun.COM ((!(match_flags & MATCH_IRE_TESTHIDDEN)) || ire->ire_testhidden) && 201011042SErik.Nordmark@Sun.COM ((!(match_flags & MATCH_IRE_MASK)) || (ire->ire_mask == mask)) && 20111676Sjpk ((!(match_flags & MATCH_IRE_SECATTR)) || 20124714Ssowmini (!is_system_labeled()) || 20134714Ssowmini 
(tsol_ire_match_gwattr(ire, tsl) == 0))) { 20140Sstevel@tonic-gate /* We found the matched IRE */ 20150Sstevel@tonic-gate return (B_TRUE); 20160Sstevel@tonic-gate } 20170Sstevel@tonic-gate return (B_FALSE); 20180Sstevel@tonic-gate } 20190Sstevel@tonic-gate 20200Sstevel@tonic-gate /* 202111042SErik.Nordmark@Sun.COM * Check if the IRE_LOCAL uses the same ill as another route would use. 202211042SErik.Nordmark@Sun.COM * If there is no alternate route, or the alternate is a REJECT or BLACKHOLE, 202311042SErik.Nordmark@Sun.COM * then we don't allow this IRE_LOCAL to be used. 202411042SErik.Nordmark@Sun.COM * We always return an IRE; will be RTF_REJECT if no route available. 20250Sstevel@tonic-gate */ 20260Sstevel@tonic-gate ire_t * 202711042SErik.Nordmark@Sun.COM ire_alt_local(ire_t *ire, zoneid_t zoneid, const ts_label_t *tsl, 202811042SErik.Nordmark@Sun.COM const ill_t *ill, uint_t *generationp) 20290Sstevel@tonic-gate { 203011042SErik.Nordmark@Sun.COM ip_stack_t *ipst = ire->ire_ipst; 203111042SErik.Nordmark@Sun.COM ire_t *alt_ire; 203211042SErik.Nordmark@Sun.COM uint_t ire_type; 203311042SErik.Nordmark@Sun.COM uint_t generation; 203411042SErik.Nordmark@Sun.COM uint_t match_flags; 203511042SErik.Nordmark@Sun.COM 203611042SErik.Nordmark@Sun.COM ASSERT(ire->ire_type & IRE_LOCAL); 203711042SErik.Nordmark@Sun.COM ASSERT(ire->ire_ill != NULL); 20380Sstevel@tonic-gate 20390Sstevel@tonic-gate /* 204011042SErik.Nordmark@Sun.COM * Need to match on everything but local. 204111042SErik.Nordmark@Sun.COM * This might result in the creation of a IRE_IF_CLONE for the 204211042SErik.Nordmark@Sun.COM * same address as the IRE_LOCAL when restrict_interzone_loopback is 204311042SErik.Nordmark@Sun.COM * set. ire_add_*() ensures that the IRE_IF_CLONE are tail inserted 204411042SErik.Nordmark@Sun.COM * to make sure the IRE_LOCAL is always found first. 
20450Sstevel@tonic-gate */ 204611042SErik.Nordmark@Sun.COM ire_type = (IRE_ONLINK | IRE_OFFLINK) & ~(IRE_LOCAL|IRE_LOOPBACK); 204711042SErik.Nordmark@Sun.COM match_flags = MATCH_IRE_TYPE | MATCH_IRE_SECATTR; 204811042SErik.Nordmark@Sun.COM if (ill != NULL) 204911042SErik.Nordmark@Sun.COM match_flags |= MATCH_IRE_ILL; 205011042SErik.Nordmark@Sun.COM 205111042SErik.Nordmark@Sun.COM if (ire->ire_ipversion == IPV4_VERSION) { 205211042SErik.Nordmark@Sun.COM alt_ire = ire_route_recursive_v4(ire->ire_addr, ire_type, 205311457SErik.Nordmark@Sun.COM ill, zoneid, tsl, match_flags, IRR_ALLOCATE, 0, ipst, NULL, 205411457SErik.Nordmark@Sun.COM NULL, &generation); 205511042SErik.Nordmark@Sun.COM } else { 205611042SErik.Nordmark@Sun.COM alt_ire = ire_route_recursive_v6(&ire->ire_addr_v6, ire_type, 205711457SErik.Nordmark@Sun.COM ill, zoneid, tsl, match_flags, IRR_ALLOCATE, 0, ipst, NULL, 205811457SErik.Nordmark@Sun.COM NULL, &generation); 20590Sstevel@tonic-gate } 206011042SErik.Nordmark@Sun.COM ASSERT(alt_ire != NULL); 206111042SErik.Nordmark@Sun.COM 206211042SErik.Nordmark@Sun.COM if (alt_ire->ire_ill == ire->ire_ill) { 206311042SErik.Nordmark@Sun.COM /* Going out the same ILL - ok to send to IRE_LOCAL */ 206411042SErik.Nordmark@Sun.COM ire_refrele(alt_ire); 206511042SErik.Nordmark@Sun.COM } else { 206611042SErik.Nordmark@Sun.COM /* Different ill - ignore IRE_LOCAL */ 206711042SErik.Nordmark@Sun.COM ire_refrele(ire); 206811042SErik.Nordmark@Sun.COM ire = alt_ire; 206911042SErik.Nordmark@Sun.COM if (generationp != NULL) 207011042SErik.Nordmark@Sun.COM *generationp = generation; 20710Sstevel@tonic-gate } 20720Sstevel@tonic-gate return (ire); 20730Sstevel@tonic-gate } 20740Sstevel@tonic-gate 207511042SErik.Nordmark@Sun.COM boolean_t 207611042SErik.Nordmark@Sun.COM ire_find_zoneid(struct radix_node *rn, void *arg) 20771676Sjpk { 207811042SErik.Nordmark@Sun.COM struct rt_entry *rt = (struct rt_entry *)rn; 20791676Sjpk irb_t *irb; 20801676Sjpk ire_t *ire; 
208111042SErik.Nordmark@Sun.COM ire_ftable_args_t *margs = arg; 208211042SErik.Nordmark@Sun.COM 208311042SErik.Nordmark@Sun.COM ASSERT(rt != NULL); 208411042SErik.Nordmark@Sun.COM 208511042SErik.Nordmark@Sun.COM irb = &rt->rt_irb; 208611042SErik.Nordmark@Sun.COM 208711042SErik.Nordmark@Sun.COM if (irb->irb_ire_cnt == 0) 208811042SErik.Nordmark@Sun.COM return (B_FALSE); 208911042SErik.Nordmark@Sun.COM 209011042SErik.Nordmark@Sun.COM rw_enter(&irb->irb_lock, RW_READER); 20911676Sjpk for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { 209211042SErik.Nordmark@Sun.COM if (IRE_IS_CONDEMNED(ire)) 20931676Sjpk continue; 20941676Sjpk 209511131SErik.Nordmark@Sun.COM if (!(ire->ire_type & IRE_INTERFACE)) 209611131SErik.Nordmark@Sun.COM continue; 209711131SErik.Nordmark@Sun.COM 209811042SErik.Nordmark@Sun.COM if (ire->ire_zoneid != ALL_ZONES && 209911042SErik.Nordmark@Sun.COM ire->ire_zoneid != margs->ift_zoneid) 210011042SErik.Nordmark@Sun.COM continue; 210111042SErik.Nordmark@Sun.COM 210211042SErik.Nordmark@Sun.COM if (margs->ift_ill != NULL && margs->ift_ill != ire->ire_ill) 210311042SErik.Nordmark@Sun.COM continue; 210411042SErik.Nordmark@Sun.COM 210511042SErik.Nordmark@Sun.COM if (is_system_labeled() && 210611042SErik.Nordmark@Sun.COM tsol_ire_match_gwattr(ire, margs->ift_tsl) != 0) 210711042SErik.Nordmark@Sun.COM continue; 210811042SErik.Nordmark@Sun.COM 210911042SErik.Nordmark@Sun.COM rw_exit(&irb->irb_lock); 211011042SErik.Nordmark@Sun.COM return (B_TRUE); 21111676Sjpk } 211211042SErik.Nordmark@Sun.COM rw_exit(&irb->irb_lock); 211311042SErik.Nordmark@Sun.COM return (B_FALSE); 21142733Snordmark } 21152733Snordmark 21162733Snordmark /* 211711042SErik.Nordmark@Sun.COM * Check if the zoneid (not ALL_ZONES) has an IRE_INTERFACE for the specified 211811042SErik.Nordmark@Sun.COM * gateway address. If ill is non-NULL we also match on it. 211911042SErik.Nordmark@Sun.COM * The caller must hold a read lock on RADIX_NODE_HEAD if lock_held is set. 
21200Sstevel@tonic-gate */ 212111042SErik.Nordmark@Sun.COM boolean_t 212211042SErik.Nordmark@Sun.COM ire_gateway_ok_zone_v4(ipaddr_t gateway, zoneid_t zoneid, ill_t *ill, 212311042SErik.Nordmark@Sun.COM const ts_label_t *tsl, ip_stack_t *ipst, boolean_t lock_held) 21240Sstevel@tonic-gate { 212511042SErik.Nordmark@Sun.COM struct rt_sockaddr rdst; 212611042SErik.Nordmark@Sun.COM struct rt_entry *rt; 212711042SErik.Nordmark@Sun.COM ire_ftable_args_t margs; 212811042SErik.Nordmark@Sun.COM 212911042SErik.Nordmark@Sun.COM ASSERT(ill == NULL || !ill->ill_isv6); 213011042SErik.Nordmark@Sun.COM if (lock_held) 213111042SErik.Nordmark@Sun.COM ASSERT(RW_READ_HELD(&ipst->ips_ip_ftable->rnh_lock)); 213211042SErik.Nordmark@Sun.COM else 213311042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 213411042SErik.Nordmark@Sun.COM 213511131SErik.Nordmark@Sun.COM bzero(&rdst, sizeof (rdst)); 213611042SErik.Nordmark@Sun.COM rdst.rt_sin_len = sizeof (rdst); 213711042SErik.Nordmark@Sun.COM rdst.rt_sin_family = AF_INET; 213811042SErik.Nordmark@Sun.COM rdst.rt_sin_addr.s_addr = gateway; 21398275SEric Cheng 21408275SEric Cheng /* 214111042SErik.Nordmark@Sun.COM * We only use margs for ill, zoneid, and tsl matching in 214211042SErik.Nordmark@Sun.COM * ire_find_zoneid 21438275SEric Cheng */ 214411131SErik.Nordmark@Sun.COM bzero(&margs, sizeof (margs)); 214511042SErik.Nordmark@Sun.COM margs.ift_ill = ill; 214611042SErik.Nordmark@Sun.COM margs.ift_zoneid = zoneid; 214711042SErik.Nordmark@Sun.COM margs.ift_tsl = tsl; 214811042SErik.Nordmark@Sun.COM rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst, 214911042SErik.Nordmark@Sun.COM ipst->ips_ip_ftable, ire_find_zoneid, (void *)&margs); 215011042SErik.Nordmark@Sun.COM 215111042SErik.Nordmark@Sun.COM if (!lock_held) 215211042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 215311042SErik.Nordmark@Sun.COM 215411042SErik.Nordmark@Sun.COM return (rt != NULL); 21558275SEric Cheng } 21568275SEric 
Cheng 21570Sstevel@tonic-gate /* 215811042SErik.Nordmark@Sun.COM * ire_walk routine to delete a fraction of redirect IREs and IRE_CLONE_IF IREs. 215911042SErik.Nordmark@Sun.COM * The fraction argument tells us what fraction of the IREs to delete. 216011042SErik.Nordmark@Sun.COM * Common for IPv4 and IPv6. 216111042SErik.Nordmark@Sun.COM * Used when memory backpressure. 21620Sstevel@tonic-gate */ 216311042SErik.Nordmark@Sun.COM static void 216411042SErik.Nordmark@Sun.COM ire_delete_reclaim(ire_t *ire, char *arg) 21650Sstevel@tonic-gate { 216611042SErik.Nordmark@Sun.COM ip_stack_t *ipst = ire->ire_ipst; 216711042SErik.Nordmark@Sun.COM uint_t fraction = *(uint_t *)arg; 216811042SErik.Nordmark@Sun.COM uint_t rand; 216911042SErik.Nordmark@Sun.COM 217011042SErik.Nordmark@Sun.COM if ((ire->ire_flags & RTF_DYNAMIC) || 217111042SErik.Nordmark@Sun.COM (ire->ire_type & IRE_IF_CLONE)) { 217211042SErik.Nordmark@Sun.COM 217311042SErik.Nordmark@Sun.COM /* Pick a random number */ 217411066Srafael.vanoni@sun.com rand = (uint_t)ddi_get_lbolt() + 217511042SErik.Nordmark@Sun.COM IRE_ADDR_HASH_V6(ire->ire_addr_v6, 256); 217611042SErik.Nordmark@Sun.COM 217711042SErik.Nordmark@Sun.COM /* Use truncation */ 217811042SErik.Nordmark@Sun.COM if ((rand/fraction)*fraction == rand) { 217911042SErik.Nordmark@Sun.COM IP_STAT(ipst, ip_ire_reclaim_deleted); 218011042SErik.Nordmark@Sun.COM ire_delete(ire); 218111042SErik.Nordmark@Sun.COM } 218211042SErik.Nordmark@Sun.COM } 218311042SErik.Nordmark@Sun.COM 21840Sstevel@tonic-gate } 21850Sstevel@tonic-gate 21860Sstevel@tonic-gate /* 218711042SErik.Nordmark@Sun.COM * kmem_cache callback to free up memory. 21880Sstevel@tonic-gate * 218911042SErik.Nordmark@Sun.COM * Free a fraction (ips_ip_ire_reclaim_fraction) of things IP added dynamically 219011042SErik.Nordmark@Sun.COM * (RTF_DYNAMIC and IRE_IF_CLONE). 
21910Sstevel@tonic-gate */ 219211042SErik.Nordmark@Sun.COM static void 219311042SErik.Nordmark@Sun.COM ip_ire_reclaim_stack(ip_stack_t *ipst) 21940Sstevel@tonic-gate { 219511042SErik.Nordmark@Sun.COM uint_t fraction = ipst->ips_ip_ire_reclaim_fraction; 219611042SErik.Nordmark@Sun.COM 219711042SErik.Nordmark@Sun.COM IP_STAT(ipst, ip_ire_reclaim_calls); 219811042SErik.Nordmark@Sun.COM 219911042SErik.Nordmark@Sun.COM ire_walk(ire_delete_reclaim, &fraction, ipst); 22008485SPeter.Memishian@Sun.COM 22018485SPeter.Memishian@Sun.COM /* 220211042SErik.Nordmark@Sun.COM * Walk all CONNs that can have a reference on an ire, nce or dce. 220311042SErik.Nordmark@Sun.COM * Get them to update any stale references to drop any refholds they 220411042SErik.Nordmark@Sun.COM * have. 22058485SPeter.Memishian@Sun.COM */ 220611042SErik.Nordmark@Sun.COM ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst); 22070Sstevel@tonic-gate } 22080Sstevel@tonic-gate 22090Sstevel@tonic-gate /* 221011042SErik.Nordmark@Sun.COM * Called by the memory allocator subsystem directly, when the system 221111042SErik.Nordmark@Sun.COM * is running low on memory. 
22120Sstevel@tonic-gate */ 221311042SErik.Nordmark@Sun.COM /* ARGSUSED */ 22140Sstevel@tonic-gate void 221511042SErik.Nordmark@Sun.COM ip_ire_reclaim(void *args) 22160Sstevel@tonic-gate { 221711042SErik.Nordmark@Sun.COM netstack_handle_t nh; 221811042SErik.Nordmark@Sun.COM netstack_t *ns; 221911042SErik.Nordmark@Sun.COM 222011042SErik.Nordmark@Sun.COM netstack_next_init(&nh); 222111042SErik.Nordmark@Sun.COM while ((ns = netstack_next(&nh)) != NULL) { 222211042SErik.Nordmark@Sun.COM ip_ire_reclaim_stack(ns->netstack_ip); 222311042SErik.Nordmark@Sun.COM netstack_rele(ns); 22240Sstevel@tonic-gate } 222511042SErik.Nordmark@Sun.COM netstack_next_fini(&nh); 22260Sstevel@tonic-gate } 22270Sstevel@tonic-gate 22280Sstevel@tonic-gate static void 22290Sstevel@tonic-gate power2_roundup(uint32_t *value) 22300Sstevel@tonic-gate { 22310Sstevel@tonic-gate int i; 22320Sstevel@tonic-gate 22330Sstevel@tonic-gate for (i = 1; i < 31; i++) { 22340Sstevel@tonic-gate if (*value <= (1 << i)) 22350Sstevel@tonic-gate break; 22360Sstevel@tonic-gate } 22370Sstevel@tonic-gate *value = (1 << i); 22380Sstevel@tonic-gate } 22390Sstevel@tonic-gate 22403448Sdh155122 /* Global init for all zones */ 22410Sstevel@tonic-gate void 22423448Sdh155122 ip_ire_g_init() 22430Sstevel@tonic-gate { 22440Sstevel@tonic-gate /* 224511042SErik.Nordmark@Sun.COM * Create kmem_caches. ip_ire_reclaim() and ip_nce_reclaim() 224611042SErik.Nordmark@Sun.COM * will give disposable IREs back to system when needed. 22470Sstevel@tonic-gate * This needs to be done here before anything else, since 22480Sstevel@tonic-gate * ire_add() expects the cache to be created. 
22490Sstevel@tonic-gate */ 22500Sstevel@tonic-gate ire_cache = kmem_cache_create("ire_cache", 225111042SErik.Nordmark@Sun.COM sizeof (ire_t), 0, NULL, NULL, 225211042SErik.Nordmark@Sun.COM ip_ire_reclaim, NULL, NULL, 0); 225311042SErik.Nordmark@Sun.COM 225411042SErik.Nordmark@Sun.COM ncec_cache = kmem_cache_create("ncec_cache", 225511042SErik.Nordmark@Sun.COM sizeof (ncec_t), 0, NULL, NULL, 225611042SErik.Nordmark@Sun.COM ip_nce_reclaim, NULL, NULL, 0); 225711042SErik.Nordmark@Sun.COM nce_cache = kmem_cache_create("nce_cache", 225811042SErik.Nordmark@Sun.COM sizeof (nce_t), 0, NULL, NULL, 225911042SErik.Nordmark@Sun.COM NULL, NULL, NULL, 0); 22600Sstevel@tonic-gate 22613448Sdh155122 rt_entry_cache = kmem_cache_create("rt_entry", 22623448Sdh155122 sizeof (struct rt_entry), 0, NULL, NULL, NULL, NULL, NULL, 0); 22633448Sdh155122 22643448Sdh155122 /* 22653448Sdh155122 * Have radix code setup kmem caches etc. 22663448Sdh155122 */ 22673448Sdh155122 rn_init(); 22683448Sdh155122 } 22693448Sdh155122 22703448Sdh155122 void 22713448Sdh155122 ip_ire_init(ip_stack_t *ipst) 22723448Sdh155122 { 227311042SErik.Nordmark@Sun.COM ire_t *ire; 227411042SErik.Nordmark@Sun.COM int error; 22753448Sdh155122 22763448Sdh155122 mutex_init(&ipst->ips_ire_ft_init_lock, NULL, MUTEX_DEFAULT, 0); 22773448Sdh155122 22783448Sdh155122 (void) rn_inithead((void **)&ipst->ips_ip_ftable, 32); 22793448Sdh155122 22800Sstevel@tonic-gate /* 22810Sstevel@tonic-gate * Make sure that the forwarding table size is a power of 2. 22820Sstevel@tonic-gate * The IRE*_ADDR_HASH() macroes depend on that. 22830Sstevel@tonic-gate */ 22843448Sdh155122 ipst->ips_ip6_ftable_hash_size = ip6_ftable_hash_size; 22853448Sdh155122 power2_roundup(&ipst->ips_ip6_ftable_hash_size); 22863448Sdh155122 228711042SErik.Nordmark@Sun.COM /* 228811042SErik.Nordmark@Sun.COM * Allocate/initialize a pair of IRE_NOROUTEs for each of IPv4 and IPv6. 
228911042SErik.Nordmark@Sun.COM * The ire_reject_v* has RTF_REJECT set, and the ire_blackhole_v* has 229011042SErik.Nordmark@Sun.COM * RTF_BLACKHOLE set. We use the latter for transient errors such 229111042SErik.Nordmark@Sun.COM * as memory allocation failures and tripping on IRE_IS_CONDEMNED 229211042SErik.Nordmark@Sun.COM * entries. 229311042SErik.Nordmark@Sun.COM */ 229411042SErik.Nordmark@Sun.COM ire = kmem_cache_alloc(ire_cache, KM_SLEEP); 229511042SErik.Nordmark@Sun.COM *ire = ire_null; 229611042SErik.Nordmark@Sun.COM error = ire_init_v4(ire, 0, 0, 0, IRE_NOROUTE, NULL, ALL_ZONES, 229711042SErik.Nordmark@Sun.COM RTF_REJECT|RTF_UP, NULL, ipst); 229811042SErik.Nordmark@Sun.COM ASSERT(error == 0); 229911042SErik.Nordmark@Sun.COM ipst->ips_ire_reject_v4 = ire; 230011042SErik.Nordmark@Sun.COM 230111042SErik.Nordmark@Sun.COM ire = kmem_cache_alloc(ire_cache, KM_SLEEP); 230211042SErik.Nordmark@Sun.COM *ire = ire_null; 230311042SErik.Nordmark@Sun.COM error = ire_init_v6(ire, 0, 0, 0, IRE_NOROUTE, NULL, ALL_ZONES, 230411042SErik.Nordmark@Sun.COM RTF_REJECT|RTF_UP, NULL, ipst); 230511042SErik.Nordmark@Sun.COM ASSERT(error == 0); 230611042SErik.Nordmark@Sun.COM ipst->ips_ire_reject_v6 = ire; 230711042SErik.Nordmark@Sun.COM 230811042SErik.Nordmark@Sun.COM ire = kmem_cache_alloc(ire_cache, KM_SLEEP); 230911042SErik.Nordmark@Sun.COM *ire = ire_null; 231011042SErik.Nordmark@Sun.COM error = ire_init_v4(ire, 0, 0, 0, IRE_NOROUTE, NULL, ALL_ZONES, 231111042SErik.Nordmark@Sun.COM RTF_BLACKHOLE|RTF_UP, NULL, ipst); 231211042SErik.Nordmark@Sun.COM ASSERT(error == 0); 231311042SErik.Nordmark@Sun.COM ipst->ips_ire_blackhole_v4 = ire; 231411042SErik.Nordmark@Sun.COM 231511042SErik.Nordmark@Sun.COM ire = kmem_cache_alloc(ire_cache, KM_SLEEP); 231611042SErik.Nordmark@Sun.COM *ire = ire_null; 231711042SErik.Nordmark@Sun.COM error = ire_init_v6(ire, 0, 0, 0, IRE_NOROUTE, NULL, ALL_ZONES, 231811042SErik.Nordmark@Sun.COM RTF_BLACKHOLE|RTF_UP, NULL, ipst); 231911042SErik.Nordmark@Sun.COM 
ASSERT(error == 0); 232011042SErik.Nordmark@Sun.COM ipst->ips_ire_blackhole_v6 = ire; 232111042SErik.Nordmark@Sun.COM 232211042SErik.Nordmark@Sun.COM rw_init(&ipst->ips_ip6_ire_head_lock, NULL, RW_DEFAULT, NULL); 232311042SErik.Nordmark@Sun.COM rw_init(&ipst->ips_ire_dep_lock, NULL, RW_DEFAULT, NULL); 23243448Sdh155122 } 23253448Sdh155122 23263448Sdh155122 void 23273448Sdh155122 ip_ire_g_fini(void) 23283448Sdh155122 { 23293448Sdh155122 kmem_cache_destroy(ire_cache); 233011042SErik.Nordmark@Sun.COM kmem_cache_destroy(ncec_cache); 233111042SErik.Nordmark@Sun.COM kmem_cache_destroy(nce_cache); 23323448Sdh155122 kmem_cache_destroy(rt_entry_cache); 23333448Sdh155122 23343448Sdh155122 rn_fini(); 23350Sstevel@tonic-gate } 23360Sstevel@tonic-gate 23370Sstevel@tonic-gate void 23383448Sdh155122 ip_ire_fini(ip_stack_t *ipst) 23390Sstevel@tonic-gate { 23400Sstevel@tonic-gate int i; 23410Sstevel@tonic-gate 234211042SErik.Nordmark@Sun.COM rw_destroy(&ipst->ips_ire_dep_lock); 234311042SErik.Nordmark@Sun.COM rw_destroy(&ipst->ips_ip6_ire_head_lock); 234411042SErik.Nordmark@Sun.COM 234511553SThirumalai.Srinivasan@Sun.COM ire_make_condemned(ipst->ips_ire_reject_v6); 234611042SErik.Nordmark@Sun.COM ire_refrele_notr(ipst->ips_ire_reject_v6); 234711042SErik.Nordmark@Sun.COM ipst->ips_ire_reject_v6 = NULL; 234811553SThirumalai.Srinivasan@Sun.COM 234911553SThirumalai.Srinivasan@Sun.COM ire_make_condemned(ipst->ips_ire_reject_v4); 235011042SErik.Nordmark@Sun.COM ire_refrele_notr(ipst->ips_ire_reject_v4); 235111042SErik.Nordmark@Sun.COM ipst->ips_ire_reject_v4 = NULL; 235211553SThirumalai.Srinivasan@Sun.COM 235311553SThirumalai.Srinivasan@Sun.COM ire_make_condemned(ipst->ips_ire_blackhole_v6); 235411042SErik.Nordmark@Sun.COM ire_refrele_notr(ipst->ips_ire_blackhole_v6); 235511042SErik.Nordmark@Sun.COM ipst->ips_ire_blackhole_v6 = NULL; 235611553SThirumalai.Srinivasan@Sun.COM 235711553SThirumalai.Srinivasan@Sun.COM ire_make_condemned(ipst->ips_ire_blackhole_v4); 
235811042SErik.Nordmark@Sun.COM ire_refrele_notr(ipst->ips_ire_blackhole_v4); 235911042SErik.Nordmark@Sun.COM ipst->ips_ire_blackhole_v4 = NULL; 236011042SErik.Nordmark@Sun.COM 23613448Sdh155122 /* 23623448Sdh155122 * Delete all IREs - assumes that the ill/ipifs have 236311042SErik.Nordmark@Sun.COM * been removed so what remains are just the ftable to handle. 23643448Sdh155122 */ 23653448Sdh155122 ire_walk(ire_delete, NULL, ipst); 23663448Sdh155122 23673448Sdh155122 rn_freehead(ipst->ips_ip_ftable); 23683448Sdh155122 ipst->ips_ip_ftable = NULL; 23693448Sdh155122 23703448Sdh155122 mutex_destroy(&ipst->ips_ire_ft_init_lock); 23713448Sdh155122 23723448Sdh155122 for (i = 0; i < IP6_MASK_TABLE_SIZE; i++) { 23733448Sdh155122 irb_t *ptr; 23743448Sdh155122 int j; 23753448Sdh155122 23763448Sdh155122 if ((ptr = ipst->ips_ip_forwarding_table_v6[i]) == NULL) 23773448Sdh155122 continue; 23783448Sdh155122 23793448Sdh155122 for (j = 0; j < ipst->ips_ip6_ftable_hash_size; j++) { 23803448Sdh155122 ASSERT(ptr[j].irb_ire == NULL); 23813448Sdh155122 rw_destroy(&ptr[j].irb_lock); 23823448Sdh155122 } 23833448Sdh155122 mi_free(ptr); 23843448Sdh155122 ipst->ips_ip_forwarding_table_v6[i] = NULL; 23853448Sdh155122 } 23860Sstevel@tonic-gate } 23870Sstevel@tonic-gate 23885023Scarlsonj #ifdef DEBUG 23890Sstevel@tonic-gate void 23900Sstevel@tonic-gate ire_trace_ref(ire_t *ire) 23910Sstevel@tonic-gate { 23920Sstevel@tonic-gate mutex_enter(&ire->ire_lock); 23935023Scarlsonj if (ire->ire_trace_disable) { 23940Sstevel@tonic-gate mutex_exit(&ire->ire_lock); 23950Sstevel@tonic-gate return; 23960Sstevel@tonic-gate } 23975023Scarlsonj 23985023Scarlsonj if (th_trace_ref(ire, ire->ire_ipst)) { 23995023Scarlsonj mutex_exit(&ire->ire_lock); 24005023Scarlsonj } else { 24015023Scarlsonj ire->ire_trace_disable = B_TRUE; 24025023Scarlsonj mutex_exit(&ire->ire_lock); 24035023Scarlsonj ire_trace_cleanup(ire); 24040Sstevel@tonic-gate } 24050Sstevel@tonic-gate } 24060Sstevel@tonic-gate 24070Sstevel@tonic-gate void 
24080Sstevel@tonic-gate ire_untrace_ref(ire_t *ire) 24090Sstevel@tonic-gate { 24100Sstevel@tonic-gate mutex_enter(&ire->ire_lock); 24115023Scarlsonj if (!ire->ire_trace_disable) 24125023Scarlsonj th_trace_unref(ire); 24130Sstevel@tonic-gate mutex_exit(&ire->ire_lock); 24140Sstevel@tonic-gate } 24150Sstevel@tonic-gate 24160Sstevel@tonic-gate static void 24175023Scarlsonj ire_trace_cleanup(const ire_t *ire) 24180Sstevel@tonic-gate { 24195023Scarlsonj th_trace_cleanup(ire, ire->ire_trace_disable); 24200Sstevel@tonic-gate } 24215023Scarlsonj #endif /* DEBUG */ 24222535Ssangeeta 24232535Ssangeeta /* 242411042SErik.Nordmark@Sun.COM * Find, or create if needed, the nce_t pointer to the neighbor cache 242511042SErik.Nordmark@Sun.COM * entry ncec_t for an IPv4 address. The nce_t will be created on the ill_t 242611042SErik.Nordmark@Sun.COM * in the non-IPMP case, or on the cast-ill in the IPMP bcast/mcast case, or 242711042SErik.Nordmark@Sun.COM * on the next available under-ill (selected by the IPMP rotor) in the 242811042SErik.Nordmark@Sun.COM * unicast IPMP case. 242911042SErik.Nordmark@Sun.COM * 243011042SErik.Nordmark@Sun.COM * If a neighbor-cache entry has to be created (i.e., one does not already 243111042SErik.Nordmark@Sun.COM * exist in the nce list) the ncec_lladdr and ncec_state of the neighbor cache 243211042SErik.Nordmark@Sun.COM * entry are initialized in nce_add_v4(). The broadcast, multicast, and 243311042SErik.Nordmark@Sun.COM * link-layer type determine the contents of {ncec_state, ncec_lladdr} of 243411042SErik.Nordmark@Sun.COM * the ncec_t created. 
 * The ncec_lladdr is non-null for all link types with
 * non-zero ill_phys_addr_length, though the contents may be zero in cases
 * where the link-layer type is not known at the time of creation
 * (e.g., IRE_IF_RESOLVER links)
 *
 * All IRE_BROADCAST entries have ncec_state = ND_REACHABLE, and the
 * ncec_lladdr has the physical broadcast address of the outgoing interface.
 * For unicast ire entries,
 *   - if the outgoing interface is of type IRE_IF_RESOLVER, a newly created
 *     ncec_t has zeroed ncec_lladdr contents and will be in the ND_INITIAL
 *     state.
 *   - if the outgoing interface is an IRE_IF_NORESOLVER interface, no link
 *     layer resolution is necessary, so that the ncec_t will be in the
 *     ND_REACHABLE state
 *
 * The link layer information needed for broadcast addresses, and for
 * packets sent on IRE_IF_NORESOLVER interfaces is a constant mapping that
 * never needs re-verification for the lifetime of the ncec_t. These are
 * therefore marked NCE_F_NONUD.
 *
 * The nce returned will be created such that the nce_ill == ill that
 * is passed in. Note that the nce itself may not have ncec_ill == ill
 * where IPMP links are involved.
245611042SErik.Nordmark@Sun.COM */ 245711042SErik.Nordmark@Sun.COM static nce_t * 245811042SErik.Nordmark@Sun.COM ire_nce_init(ill_t *ill, const void *addr, int ire_type) 245911042SErik.Nordmark@Sun.COM { 246011042SErik.Nordmark@Sun.COM int err; 246111042SErik.Nordmark@Sun.COM nce_t *nce = NULL; 246211042SErik.Nordmark@Sun.COM uint16_t ncec_flags; 246311042SErik.Nordmark@Sun.COM uchar_t *hwaddr; 246411042SErik.Nordmark@Sun.COM boolean_t need_refrele = B_FALSE; 246511042SErik.Nordmark@Sun.COM ill_t *in_ill = ill; 246611042SErik.Nordmark@Sun.COM boolean_t is_unicast; 246711042SErik.Nordmark@Sun.COM uint_t hwaddr_len; 246811042SErik.Nordmark@Sun.COM 246911042SErik.Nordmark@Sun.COM is_unicast = ((ire_type & (IRE_MULTICAST|IRE_BROADCAST)) == 0); 247011042SErik.Nordmark@Sun.COM if (IS_IPMP(ill) || 247111042SErik.Nordmark@Sun.COM ((ire_type & IRE_BROADCAST) && IS_UNDER_IPMP(ill))) { 247211042SErik.Nordmark@Sun.COM if ((ill = ipmp_ill_get_xmit_ill(ill, is_unicast)) == NULL) 247311042SErik.Nordmark@Sun.COM return (NULL); 247411042SErik.Nordmark@Sun.COM need_refrele = B_TRUE; 247511042SErik.Nordmark@Sun.COM } 247611042SErik.Nordmark@Sun.COM ncec_flags = (ill->ill_flags & ILLF_NONUD) ? 
NCE_F_NONUD : 0; 247711042SErik.Nordmark@Sun.COM 247811042SErik.Nordmark@Sun.COM switch (ire_type) { 247911042SErik.Nordmark@Sun.COM case IRE_BROADCAST: 248011042SErik.Nordmark@Sun.COM ASSERT(!ill->ill_isv6); 248111042SErik.Nordmark@Sun.COM ncec_flags |= (NCE_F_BCAST|NCE_F_NONUD); 248211042SErik.Nordmark@Sun.COM break; 248311042SErik.Nordmark@Sun.COM case IRE_MULTICAST: 248411042SErik.Nordmark@Sun.COM ncec_flags |= (NCE_F_MCAST|NCE_F_NONUD); 248511042SErik.Nordmark@Sun.COM break; 248611042SErik.Nordmark@Sun.COM } 248711042SErik.Nordmark@Sun.COM 248811042SErik.Nordmark@Sun.COM if (ill->ill_net_type == IRE_IF_NORESOLVER && is_unicast) { 248911042SErik.Nordmark@Sun.COM hwaddr = ill->ill_dest_addr; 249011042SErik.Nordmark@Sun.COM } else { 249111042SErik.Nordmark@Sun.COM hwaddr = NULL; 249211042SErik.Nordmark@Sun.COM } 249311042SErik.Nordmark@Sun.COM hwaddr_len = ill->ill_phys_addr_length; 249411042SErik.Nordmark@Sun.COM 249511042SErik.Nordmark@Sun.COM retry: 249611042SErik.Nordmark@Sun.COM /* nce_state will be computed by nce_add_common() */ 249711042SErik.Nordmark@Sun.COM if (!ill->ill_isv6) { 249811042SErik.Nordmark@Sun.COM err = nce_lookup_then_add_v4(ill, hwaddr, hwaddr_len, addr, 249911042SErik.Nordmark@Sun.COM ncec_flags, ND_UNCHANGED, &nce); 250011042SErik.Nordmark@Sun.COM } else { 250111042SErik.Nordmark@Sun.COM err = nce_lookup_then_add_v6(ill, hwaddr, hwaddr_len, addr, 250211042SErik.Nordmark@Sun.COM ncec_flags, ND_UNCHANGED, &nce); 250311042SErik.Nordmark@Sun.COM } 250411042SErik.Nordmark@Sun.COM 250511042SErik.Nordmark@Sun.COM switch (err) { 250611042SErik.Nordmark@Sun.COM case 0: 250711042SErik.Nordmark@Sun.COM break; 250811042SErik.Nordmark@Sun.COM case EEXIST: 250911042SErik.Nordmark@Sun.COM /* 251011042SErik.Nordmark@Sun.COM * When subnets change or partially overlap what was once 251111042SErik.Nordmark@Sun.COM * a broadcast address could now be a unicast, or vice versa. 
251211042SErik.Nordmark@Sun.COM */ 251311042SErik.Nordmark@Sun.COM if (((ncec_flags ^ nce->nce_common->ncec_flags) & 251411042SErik.Nordmark@Sun.COM NCE_F_BCAST) != 0) { 251511042SErik.Nordmark@Sun.COM ASSERT(!ill->ill_isv6); 251611042SErik.Nordmark@Sun.COM ncec_delete(nce->nce_common); 251711042SErik.Nordmark@Sun.COM nce_refrele(nce); 251811042SErik.Nordmark@Sun.COM goto retry; 251911042SErik.Nordmark@Sun.COM } 252011042SErik.Nordmark@Sun.COM break; 252111042SErik.Nordmark@Sun.COM default: 252211042SErik.Nordmark@Sun.COM DTRACE_PROBE2(nce__init__fail, ill_t *, ill, int, err); 252311042SErik.Nordmark@Sun.COM if (need_refrele) 252411042SErik.Nordmark@Sun.COM ill_refrele(ill); 252511042SErik.Nordmark@Sun.COM return (NULL); 252611042SErik.Nordmark@Sun.COM } 252711042SErik.Nordmark@Sun.COM /* 252811042SErik.Nordmark@Sun.COM * If the ill was an under-ill of an IPMP group, we need to verify 252911042SErik.Nordmark@Sun.COM * that it is still active so that we select an active interface in 253011042SErik.Nordmark@Sun.COM * the group. However, since ipmp_ill_is_active ASSERTs for 253111042SErik.Nordmark@Sun.COM * IS_UNDER_IPMP(), we first need to verify that the ill is an 253211042SErik.Nordmark@Sun.COM * under-ill, and since this is being done in the data path, the 253311042SErik.Nordmark@Sun.COM * only way to ascertain this is by holding the ill_g_lock. 
253411042SErik.Nordmark@Sun.COM */ 253511042SErik.Nordmark@Sun.COM rw_enter(&ill->ill_ipst->ips_ill_g_lock, RW_READER); 253611042SErik.Nordmark@Sun.COM mutex_enter(&ill->ill_lock); 253711042SErik.Nordmark@Sun.COM mutex_enter(&ill->ill_phyint->phyint_lock); 253811042SErik.Nordmark@Sun.COM if (need_refrele && IS_UNDER_IPMP(ill) && !ipmp_ill_is_active(ill)) { 253911042SErik.Nordmark@Sun.COM /* 254011042SErik.Nordmark@Sun.COM * need_refrele implies that the under ill was selected by 254111042SErik.Nordmark@Sun.COM * ipmp_ill_get_xmit_ill() because either the in_ill was an 254211042SErik.Nordmark@Sun.COM * ipmp_ill, or we are sending a non-unicast packet on 254311042SErik.Nordmark@Sun.COM * an under_ill. However, when we get here, the ill selected by 254411042SErik.Nordmark@Sun.COM * ipmp_ill_get_xmit_ill was pulled out of the active set 254511042SErik.Nordmark@Sun.COM * (for unicast) or cast_ill nomination (for 254611042SErik.Nordmark@Sun.COM * !unicast) after it was picked as the outgoing ill. 254711042SErik.Nordmark@Sun.COM * We have to pick an active interface and/or cast_ill in the 254811042SErik.Nordmark@Sun.COM * group. 
254911042SErik.Nordmark@Sun.COM */ 255011042SErik.Nordmark@Sun.COM mutex_exit(&ill->ill_phyint->phyint_lock); 255111042SErik.Nordmark@Sun.COM nce_delete(nce); 255211042SErik.Nordmark@Sun.COM mutex_exit(&ill->ill_lock); 255311042SErik.Nordmark@Sun.COM rw_exit(&ill->ill_ipst->ips_ill_g_lock); 255411042SErik.Nordmark@Sun.COM nce_refrele(nce); 255511042SErik.Nordmark@Sun.COM ill_refrele(ill); 255611042SErik.Nordmark@Sun.COM if ((ill = ipmp_ill_get_xmit_ill(in_ill, is_unicast)) == NULL) 255711042SErik.Nordmark@Sun.COM return (NULL); 255811042SErik.Nordmark@Sun.COM goto retry; 255911042SErik.Nordmark@Sun.COM } else { 256011042SErik.Nordmark@Sun.COM mutex_exit(&ill->ill_phyint->phyint_lock); 256111042SErik.Nordmark@Sun.COM mutex_exit(&ill->ill_lock); 256211042SErik.Nordmark@Sun.COM rw_exit(&ill->ill_ipst->ips_ill_g_lock); 256311042SErik.Nordmark@Sun.COM } 256411042SErik.Nordmark@Sun.COM done: 256511042SErik.Nordmark@Sun.COM ASSERT(nce->nce_ill == ill); 256611042SErik.Nordmark@Sun.COM if (need_refrele) 256711042SErik.Nordmark@Sun.COM ill_refrele(ill); 256811042SErik.Nordmark@Sun.COM return (nce); 256911042SErik.Nordmark@Sun.COM } 257011042SErik.Nordmark@Sun.COM 257111042SErik.Nordmark@Sun.COM nce_t * 257211042SErik.Nordmark@Sun.COM arp_nce_init(ill_t *ill, in_addr_t addr4, int ire_type) 257311042SErik.Nordmark@Sun.COM { 257411042SErik.Nordmark@Sun.COM return (ire_nce_init(ill, &addr4, ire_type)); 257511042SErik.Nordmark@Sun.COM } 257611042SErik.Nordmark@Sun.COM 257711042SErik.Nordmark@Sun.COM nce_t * 257811042SErik.Nordmark@Sun.COM ndp_nce_init(ill_t *ill, const in6_addr_t *addr6, int ire_type) 257911042SErik.Nordmark@Sun.COM { 258011042SErik.Nordmark@Sun.COM ASSERT((ire_type & IRE_BROADCAST) == 0); 258111042SErik.Nordmark@Sun.COM return (ire_nce_init(ill, addr6, ire_type)); 258211042SErik.Nordmark@Sun.COM } 258311042SErik.Nordmark@Sun.COM 258411042SErik.Nordmark@Sun.COM /* 258511042SErik.Nordmark@Sun.COM * The caller should hold irb_lock as a writer if the ire is in a 
bucket. 258611463SSowmini.Varadhan@Sun.COM * This routine will clear ire_nce_cache, and we make sure that we can never 258711463SSowmini.Varadhan@Sun.COM * set ire_nce_cache after the ire is marked condemned. 258811042SErik.Nordmark@Sun.COM */ 258911042SErik.Nordmark@Sun.COM void 259011042SErik.Nordmark@Sun.COM ire_make_condemned(ire_t *ire) 259111042SErik.Nordmark@Sun.COM { 259211042SErik.Nordmark@Sun.COM ip_stack_t *ipst = ire->ire_ipst; 259311463SSowmini.Varadhan@Sun.COM nce_t *nce; 259411042SErik.Nordmark@Sun.COM 259511042SErik.Nordmark@Sun.COM mutex_enter(&ire->ire_lock); 259611042SErik.Nordmark@Sun.COM ASSERT(ire->ire_bucket == NULL || 259711042SErik.Nordmark@Sun.COM RW_WRITE_HELD(&ire->ire_bucket->irb_lock)); 259811042SErik.Nordmark@Sun.COM ASSERT(!IRE_IS_CONDEMNED(ire)); 259911042SErik.Nordmark@Sun.COM ire->ire_generation = IRE_GENERATION_CONDEMNED; 260011042SErik.Nordmark@Sun.COM /* Count how many condemned ires for kmem_cache callback */ 260111042SErik.Nordmark@Sun.COM atomic_add_32(&ipst->ips_num_ire_condemned, 1); 260211463SSowmini.Varadhan@Sun.COM nce = ire->ire_nce_cache; 260311463SSowmini.Varadhan@Sun.COM ire->ire_nce_cache = NULL; 260411042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 260511463SSowmini.Varadhan@Sun.COM if (nce != NULL) 260611463SSowmini.Varadhan@Sun.COM nce_refrele(nce); 260711042SErik.Nordmark@Sun.COM } 260811042SErik.Nordmark@Sun.COM 260911042SErik.Nordmark@Sun.COM /* 261011042SErik.Nordmark@Sun.COM * Increment the generation avoiding the special condemned value 261111042SErik.Nordmark@Sun.COM */ 261211042SErik.Nordmark@Sun.COM void 261311042SErik.Nordmark@Sun.COM ire_increment_generation(ire_t *ire) 261411042SErik.Nordmark@Sun.COM { 261511042SErik.Nordmark@Sun.COM uint_t generation; 261611042SErik.Nordmark@Sun.COM 261711042SErik.Nordmark@Sun.COM mutex_enter(&ire->ire_lock); 261811042SErik.Nordmark@Sun.COM /* 261911042SErik.Nordmark@Sun.COM * Even though the caller has a hold it can't prevent a concurrent 
262011042SErik.Nordmark@Sun.COM * ire_delete marking the IRE condemned 262111042SErik.Nordmark@Sun.COM */ 262211042SErik.Nordmark@Sun.COM if (!IRE_IS_CONDEMNED(ire)) { 262311042SErik.Nordmark@Sun.COM generation = ire->ire_generation + 1; 262411042SErik.Nordmark@Sun.COM if (generation == IRE_GENERATION_CONDEMNED) 262511042SErik.Nordmark@Sun.COM generation = IRE_GENERATION_INITIAL; 262611042SErik.Nordmark@Sun.COM ASSERT(generation != IRE_GENERATION_VERIFY); 262711042SErik.Nordmark@Sun.COM ire->ire_generation = generation; 262811042SErik.Nordmark@Sun.COM } 262911042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 263011042SErik.Nordmark@Sun.COM } 263111042SErik.Nordmark@Sun.COM 263211042SErik.Nordmark@Sun.COM /* 263311042SErik.Nordmark@Sun.COM * Increment ire_generation on all the IRE_MULTICASTs 263411042SErik.Nordmark@Sun.COM * Used when the default multicast interface (as determined by 263511042SErik.Nordmark@Sun.COM * ill_lookup_multicast) might have changed. 263611042SErik.Nordmark@Sun.COM * 263711042SErik.Nordmark@Sun.COM * That includes the zoneid, IFF_ flags, the IPv6 scope of the address, and 263811042SErik.Nordmark@Sun.COM * ill unplumb. 
26392535Ssangeeta */ 26402535Ssangeeta void 264111042SErik.Nordmark@Sun.COM ire_increment_multicast_generation(ip_stack_t *ipst, boolean_t isv6) 264211042SErik.Nordmark@Sun.COM { 264311042SErik.Nordmark@Sun.COM ill_t *ill; 264411042SErik.Nordmark@Sun.COM ill_walk_context_t ctx; 264511042SErik.Nordmark@Sun.COM 264611042SErik.Nordmark@Sun.COM rw_enter(&ipst->ips_ill_g_lock, RW_READER); 264711042SErik.Nordmark@Sun.COM if (isv6) 264811042SErik.Nordmark@Sun.COM ill = ILL_START_WALK_V6(&ctx, ipst); 264911042SErik.Nordmark@Sun.COM else 265011042SErik.Nordmark@Sun.COM ill = ILL_START_WALK_V4(&ctx, ipst); 265111042SErik.Nordmark@Sun.COM for (; ill != NULL; ill = ill_next(&ctx, ill)) { 265211042SErik.Nordmark@Sun.COM if (ILL_IS_CONDEMNED(ill)) 265311042SErik.Nordmark@Sun.COM continue; 265411042SErik.Nordmark@Sun.COM if (ill->ill_ire_multicast != NULL) 265511042SErik.Nordmark@Sun.COM ire_increment_generation(ill->ill_ire_multicast); 265611042SErik.Nordmark@Sun.COM } 265711042SErik.Nordmark@Sun.COM rw_exit(&ipst->ips_ill_g_lock); 265811042SErik.Nordmark@Sun.COM } 265911042SErik.Nordmark@Sun.COM 266011042SErik.Nordmark@Sun.COM /* 266111042SErik.Nordmark@Sun.COM * Return a held IRE_NOROUTE with RTF_REJECT set 266211042SErik.Nordmark@Sun.COM */ 266311042SErik.Nordmark@Sun.COM ire_t * 266411042SErik.Nordmark@Sun.COM ire_reject(ip_stack_t *ipst, boolean_t isv6) 266511042SErik.Nordmark@Sun.COM { 266611042SErik.Nordmark@Sun.COM ire_t *ire; 266711042SErik.Nordmark@Sun.COM 266811042SErik.Nordmark@Sun.COM if (isv6) 266911042SErik.Nordmark@Sun.COM ire = ipst->ips_ire_reject_v6; 267011042SErik.Nordmark@Sun.COM else 267111042SErik.Nordmark@Sun.COM ire = ipst->ips_ire_reject_v4; 267211042SErik.Nordmark@Sun.COM 267311042SErik.Nordmark@Sun.COM ASSERT(ire->ire_generation != IRE_GENERATION_CONDEMNED); 267411042SErik.Nordmark@Sun.COM ire_refhold(ire); 267511042SErik.Nordmark@Sun.COM return (ire); 267611042SErik.Nordmark@Sun.COM } 267711042SErik.Nordmark@Sun.COM 267811042SErik.Nordmark@Sun.COM /* 
267911042SErik.Nordmark@Sun.COM * Return a held IRE_NOROUTE with RTF_BLACKHOLE set 268011042SErik.Nordmark@Sun.COM */ 268111042SErik.Nordmark@Sun.COM ire_t * 268211042SErik.Nordmark@Sun.COM ire_blackhole(ip_stack_t *ipst, boolean_t isv6) 268311042SErik.Nordmark@Sun.COM { 268411042SErik.Nordmark@Sun.COM ire_t *ire; 268511042SErik.Nordmark@Sun.COM 268611042SErik.Nordmark@Sun.COM if (isv6) 268711042SErik.Nordmark@Sun.COM ire = ipst->ips_ire_blackhole_v6; 268811042SErik.Nordmark@Sun.COM else 268911042SErik.Nordmark@Sun.COM ire = ipst->ips_ire_blackhole_v4; 269011042SErik.Nordmark@Sun.COM 269111042SErik.Nordmark@Sun.COM ASSERT(ire->ire_generation != IRE_GENERATION_CONDEMNED); 269211042SErik.Nordmark@Sun.COM ire_refhold(ire); 269311042SErik.Nordmark@Sun.COM return (ire); 269411042SErik.Nordmark@Sun.COM } 269511042SErik.Nordmark@Sun.COM 269611042SErik.Nordmark@Sun.COM /* 269711042SErik.Nordmark@Sun.COM * Return a held IRE_MULTICAST. 269811042SErik.Nordmark@Sun.COM */ 269911042SErik.Nordmark@Sun.COM ire_t * 270011042SErik.Nordmark@Sun.COM ire_multicast(ill_t *ill) 270111042SErik.Nordmark@Sun.COM { 270211042SErik.Nordmark@Sun.COM ire_t *ire = ill->ill_ire_multicast; 270311042SErik.Nordmark@Sun.COM 270411042SErik.Nordmark@Sun.COM ASSERT(ire == NULL || ire->ire_generation != IRE_GENERATION_CONDEMNED); 270511042SErik.Nordmark@Sun.COM if (ire == NULL) 270611042SErik.Nordmark@Sun.COM ire = ire_blackhole(ill->ill_ipst, ill->ill_isv6); 270711042SErik.Nordmark@Sun.COM else 270811042SErik.Nordmark@Sun.COM ire_refhold(ire); 270911042SErik.Nordmark@Sun.COM return (ire); 271011042SErik.Nordmark@Sun.COM } 271111042SErik.Nordmark@Sun.COM 271211042SErik.Nordmark@Sun.COM /* 271311042SErik.Nordmark@Sun.COM * Given an IRE return its nexthop IRE. The nexthop IRE is an IRE_ONLINK 271411042SErik.Nordmark@Sun.COM * that is an exact match (i.e., a /32 for IPv4 and /128 for IPv6). 271511042SErik.Nordmark@Sun.COM * This can return an RTF_REJECT|RTF_BLACKHOLE. 
271611042SErik.Nordmark@Sun.COM * The returned IRE is held. 271711042SErik.Nordmark@Sun.COM * The assumption is that ip_select_route() has been called and returned the 271811042SErik.Nordmark@Sun.COM * IRE (thus ip_select_route would have set up the ire_dep* information.) 271911042SErik.Nordmark@Sun.COM * If some IRE is deleteted then ire_dep_remove() will have been called and 272011042SErik.Nordmark@Sun.COM * we might not find a nexthop IRE, in which case we return NULL. 272111042SErik.Nordmark@Sun.COM */ 272211042SErik.Nordmark@Sun.COM ire_t * 272311042SErik.Nordmark@Sun.COM ire_nexthop(ire_t *ire) 27242535Ssangeeta { 272511042SErik.Nordmark@Sun.COM ip_stack_t *ipst = ire->ire_ipst; 272611042SErik.Nordmark@Sun.COM 272711042SErik.Nordmark@Sun.COM /* Acquire lock to walk ire_dep_parent */ 272811042SErik.Nordmark@Sun.COM rw_enter(&ipst->ips_ire_dep_lock, RW_READER); 272911042SErik.Nordmark@Sun.COM while (ire != NULL) { 273011042SErik.Nordmark@Sun.COM if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 273111042SErik.Nordmark@Sun.COM goto done; 273211042SErik.Nordmark@Sun.COM } 273311042SErik.Nordmark@Sun.COM /* 273411042SErik.Nordmark@Sun.COM * If we find an IRE_ONLINK we are done. This includes 273511042SErik.Nordmark@Sun.COM * the case of IRE_MULTICAST. 273611042SErik.Nordmark@Sun.COM * Note that in order to send packets we need a host-specific 273711042SErik.Nordmark@Sun.COM * IRE_IF_ALL first in the ire_dep_parent chain. Normally this 273811042SErik.Nordmark@Sun.COM * is done by inserting an IRE_IF_CLONE if the IRE_INTERFACE 273911042SErik.Nordmark@Sun.COM * was not host specific. 274011042SErik.Nordmark@Sun.COM * However, ip_rts_request doesn't want to send packets 274111042SErik.Nordmark@Sun.COM * hence doesn't want to allocate an IRE_IF_CLONE. Yet 274211042SErik.Nordmark@Sun.COM * it needs an IRE_IF_ALL to get to the ill. Thus 274311042SErik.Nordmark@Sun.COM * we return IRE_IF_ALL that are not host specific here. 
274411042SErik.Nordmark@Sun.COM */ 274511042SErik.Nordmark@Sun.COM if (ire->ire_type & IRE_ONLINK) 274611042SErik.Nordmark@Sun.COM goto done; 274711042SErik.Nordmark@Sun.COM ire = ire->ire_dep_parent; 274811042SErik.Nordmark@Sun.COM } 274911042SErik.Nordmark@Sun.COM rw_exit(&ipst->ips_ire_dep_lock); 275011042SErik.Nordmark@Sun.COM return (NULL); 275111042SErik.Nordmark@Sun.COM 275211042SErik.Nordmark@Sun.COM done: 275311042SErik.Nordmark@Sun.COM ire_refhold(ire); 275411042SErik.Nordmark@Sun.COM rw_exit(&ipst->ips_ire_dep_lock); 275511042SErik.Nordmark@Sun.COM return (ire); 275611042SErik.Nordmark@Sun.COM } 275711042SErik.Nordmark@Sun.COM 275811042SErik.Nordmark@Sun.COM /* 275911042SErik.Nordmark@Sun.COM * Find the ill used to send packets. This will be NULL in case 276011042SErik.Nordmark@Sun.COM * of a reject or blackhole. 276111042SErik.Nordmark@Sun.COM * The returned ill is held; caller needs to do ill_refrele when done. 276211042SErik.Nordmark@Sun.COM */ 276311042SErik.Nordmark@Sun.COM ill_t * 276411042SErik.Nordmark@Sun.COM ire_nexthop_ill(ire_t *ire) 276511042SErik.Nordmark@Sun.COM { 276611042SErik.Nordmark@Sun.COM ill_t *ill; 276711042SErik.Nordmark@Sun.COM 276811042SErik.Nordmark@Sun.COM ire = ire_nexthop(ire); 276911042SErik.Nordmark@Sun.COM if (ire == NULL) 277011042SErik.Nordmark@Sun.COM return (NULL); 277111042SErik.Nordmark@Sun.COM 277211042SErik.Nordmark@Sun.COM /* ire_ill can not change for an existing ire */ 277311042SErik.Nordmark@Sun.COM ill = ire->ire_ill; 277411042SErik.Nordmark@Sun.COM if (ill != NULL) 277511042SErik.Nordmark@Sun.COM ill_refhold(ill); 277611042SErik.Nordmark@Sun.COM ire_refrele(ire); 277711042SErik.Nordmark@Sun.COM return (ill); 277811042SErik.Nordmark@Sun.COM } 277911042SErik.Nordmark@Sun.COM 278011042SErik.Nordmark@Sun.COM #ifdef DEBUG 278111042SErik.Nordmark@Sun.COM static boolean_t 278211042SErik.Nordmark@Sun.COM parent_has_child(ire_t *parent, ire_t *child) 278311042SErik.Nordmark@Sun.COM { 278411042SErik.Nordmark@Sun.COM 
ire_t *ire; 278511042SErik.Nordmark@Sun.COM ire_t *prev; 278611042SErik.Nordmark@Sun.COM 278711042SErik.Nordmark@Sun.COM ire = parent->ire_dep_children; 278811042SErik.Nordmark@Sun.COM prev = NULL; 278911042SErik.Nordmark@Sun.COM while (ire != NULL) { 279011042SErik.Nordmark@Sun.COM if (prev == NULL) { 279111042SErik.Nordmark@Sun.COM ASSERT(ire->ire_dep_sib_ptpn == 279211042SErik.Nordmark@Sun.COM &(parent->ire_dep_children)); 279311042SErik.Nordmark@Sun.COM } else { 279411042SErik.Nordmark@Sun.COM ASSERT(ire->ire_dep_sib_ptpn == 279511042SErik.Nordmark@Sun.COM &(prev->ire_dep_sib_next)); 279611042SErik.Nordmark@Sun.COM } 279711042SErik.Nordmark@Sun.COM if (ire == child) 279811042SErik.Nordmark@Sun.COM return (B_TRUE); 279911042SErik.Nordmark@Sun.COM prev = ire; 280011042SErik.Nordmark@Sun.COM ire = ire->ire_dep_sib_next; 280111042SErik.Nordmark@Sun.COM } 280211042SErik.Nordmark@Sun.COM return (B_FALSE); 280311042SErik.Nordmark@Sun.COM } 280411042SErik.Nordmark@Sun.COM 280511042SErik.Nordmark@Sun.COM static void 280611042SErik.Nordmark@Sun.COM ire_dep_verify(ire_t *ire) 280711042SErik.Nordmark@Sun.COM { 280811042SErik.Nordmark@Sun.COM ire_t *parent = ire->ire_dep_parent; 280911042SErik.Nordmark@Sun.COM ire_t *child = ire->ire_dep_children; 281011042SErik.Nordmark@Sun.COM 281111042SErik.Nordmark@Sun.COM ASSERT(ire->ire_ipversion == IPV4_VERSION || 281211042SErik.Nordmark@Sun.COM ire->ire_ipversion == IPV6_VERSION); 281311042SErik.Nordmark@Sun.COM if (parent != NULL) { 281411042SErik.Nordmark@Sun.COM ASSERT(parent->ire_ipversion == IPV4_VERSION || 281511042SErik.Nordmark@Sun.COM parent->ire_ipversion == IPV6_VERSION); 281611042SErik.Nordmark@Sun.COM ASSERT(parent->ire_refcnt >= 1); 281711042SErik.Nordmark@Sun.COM ASSERT(parent_has_child(parent, ire)); 281811042SErik.Nordmark@Sun.COM } 281911042SErik.Nordmark@Sun.COM if (child != NULL) { 282011042SErik.Nordmark@Sun.COM ASSERT(child->ire_ipversion == IPV4_VERSION || 282111042SErik.Nordmark@Sun.COM child->ire_ipversion 
== IPV6_VERSION); 282211042SErik.Nordmark@Sun.COM ASSERT(child->ire_dep_parent == ire); 282311042SErik.Nordmark@Sun.COM ASSERT(child->ire_dep_sib_ptpn != NULL); 282411042SErik.Nordmark@Sun.COM ASSERT(parent_has_child(ire, child)); 282511042SErik.Nordmark@Sun.COM } 282611042SErik.Nordmark@Sun.COM } 282711042SErik.Nordmark@Sun.COM #endif /* DEBUG */ 282811042SErik.Nordmark@Sun.COM 282911042SErik.Nordmark@Sun.COM /* 283011042SErik.Nordmark@Sun.COM * Assumes ire_dep_parent is set. Remove this child from its parent's linkage. 283111042SErik.Nordmark@Sun.COM */ 283211042SErik.Nordmark@Sun.COM void 283311042SErik.Nordmark@Sun.COM ire_dep_remove(ire_t *ire) 283411042SErik.Nordmark@Sun.COM { 283511042SErik.Nordmark@Sun.COM ip_stack_t *ipst = ire->ire_ipst; 283611042SErik.Nordmark@Sun.COM ire_t *parent = ire->ire_dep_parent; 283711042SErik.Nordmark@Sun.COM ire_t *next; 283811042SErik.Nordmark@Sun.COM nce_t *nce; 283911042SErik.Nordmark@Sun.COM 284011042SErik.Nordmark@Sun.COM ASSERT(RW_WRITE_HELD(&ipst->ips_ire_dep_lock)); 284111042SErik.Nordmark@Sun.COM ASSERT(ire->ire_dep_parent != NULL); 284211042SErik.Nordmark@Sun.COM ASSERT(ire->ire_dep_sib_ptpn != NULL); 284311042SErik.Nordmark@Sun.COM 284411042SErik.Nordmark@Sun.COM #ifdef DEBUG 284511042SErik.Nordmark@Sun.COM ire_dep_verify(ire); 284611042SErik.Nordmark@Sun.COM ire_dep_verify(parent); 284711042SErik.Nordmark@Sun.COM #endif 284811042SErik.Nordmark@Sun.COM 284911042SErik.Nordmark@Sun.COM next = ire->ire_dep_sib_next; 285011042SErik.Nordmark@Sun.COM if (next != NULL) 285111042SErik.Nordmark@Sun.COM next->ire_dep_sib_ptpn = ire->ire_dep_sib_ptpn; 285211042SErik.Nordmark@Sun.COM 285311042SErik.Nordmark@Sun.COM ASSERT(*(ire->ire_dep_sib_ptpn) == ire); 285411042SErik.Nordmark@Sun.COM *(ire->ire_dep_sib_ptpn) = ire->ire_dep_sib_next; 285511042SErik.Nordmark@Sun.COM 285611042SErik.Nordmark@Sun.COM ire->ire_dep_sib_ptpn = NULL; 285711042SErik.Nordmark@Sun.COM ire->ire_dep_sib_next = NULL; 285811042SErik.Nordmark@Sun.COM 
285911042SErik.Nordmark@Sun.COM mutex_enter(&ire->ire_lock); 286011042SErik.Nordmark@Sun.COM parent = ire->ire_dep_parent; 286111042SErik.Nordmark@Sun.COM ire->ire_dep_parent = NULL; 286211042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 28632535Ssangeeta 28642535Ssangeeta /* 286511042SErik.Nordmark@Sun.COM * Make sure all our children, grandchildren, etc set 286611042SErik.Nordmark@Sun.COM * ire_dep_parent_generation to IRE_GENERATION_VERIFY since 286711042SErik.Nordmark@Sun.COM * we can no longer guarantee than the children have a current 286811042SErik.Nordmark@Sun.COM * ire_nce_cache and ire_nexthop_ill(). 28692535Ssangeeta */ 287011042SErik.Nordmark@Sun.COM if (ire->ire_dep_children != NULL) 287111042SErik.Nordmark@Sun.COM ire_dep_invalidate_children(ire->ire_dep_children); 28722535Ssangeeta 28732535Ssangeeta /* 287411042SErik.Nordmark@Sun.COM * Since the parent is gone we make sure we clear ire_nce_cache. 287511042SErik.Nordmark@Sun.COM * We can clear it under ire_lock even if the IRE is used 28762535Ssangeeta */ 287711042SErik.Nordmark@Sun.COM mutex_enter(&ire->ire_lock); 287811042SErik.Nordmark@Sun.COM nce = ire->ire_nce_cache; 287911042SErik.Nordmark@Sun.COM ire->ire_nce_cache = NULL; 288011042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 288111042SErik.Nordmark@Sun.COM if (nce != NULL) 288211042SErik.Nordmark@Sun.COM nce_refrele(nce); 288311042SErik.Nordmark@Sun.COM 288411042SErik.Nordmark@Sun.COM #ifdef DEBUG 288511042SErik.Nordmark@Sun.COM ire_dep_verify(ire); 288611042SErik.Nordmark@Sun.COM ire_dep_verify(parent); 288711042SErik.Nordmark@Sun.COM #endif 288811042SErik.Nordmark@Sun.COM 288911042SErik.Nordmark@Sun.COM ire_refrele_notr(parent); 289011042SErik.Nordmark@Sun.COM ire_refrele_notr(ire); 289111042SErik.Nordmark@Sun.COM } 289211042SErik.Nordmark@Sun.COM 289311042SErik.Nordmark@Sun.COM /* 289411042SErik.Nordmark@Sun.COM * Insert the child in the linkage of the parent 289511042SErik.Nordmark@Sun.COM */ 289611042SErik.Nordmark@Sun.COM 
static void 289711042SErik.Nordmark@Sun.COM ire_dep_parent_insert(ire_t *child, ire_t *parent) 289811042SErik.Nordmark@Sun.COM { 289911042SErik.Nordmark@Sun.COM ip_stack_t *ipst = child->ire_ipst; 290011042SErik.Nordmark@Sun.COM ire_t *next; 290111042SErik.Nordmark@Sun.COM 290211042SErik.Nordmark@Sun.COM ASSERT(RW_WRITE_HELD(&ipst->ips_ire_dep_lock)); 290311042SErik.Nordmark@Sun.COM ASSERT(child->ire_dep_parent == NULL); 290411042SErik.Nordmark@Sun.COM 290511042SErik.Nordmark@Sun.COM #ifdef DEBUG 290611042SErik.Nordmark@Sun.COM ire_dep_verify(child); 290711042SErik.Nordmark@Sun.COM ire_dep_verify(parent); 290811042SErik.Nordmark@Sun.COM #endif 290911042SErik.Nordmark@Sun.COM /* No parents => no siblings */ 291011042SErik.Nordmark@Sun.COM ASSERT(child->ire_dep_sib_ptpn == NULL); 291111042SErik.Nordmark@Sun.COM ASSERT(child->ire_dep_sib_next == NULL); 291211042SErik.Nordmark@Sun.COM 291311042SErik.Nordmark@Sun.COM ire_refhold_notr(parent); 291411042SErik.Nordmark@Sun.COM ire_refhold_notr(child); 291511042SErik.Nordmark@Sun.COM 291611042SErik.Nordmark@Sun.COM /* Head insertion */ 291711042SErik.Nordmark@Sun.COM next = parent->ire_dep_children; 291811042SErik.Nordmark@Sun.COM if (next != NULL) { 291911042SErik.Nordmark@Sun.COM ASSERT(next->ire_dep_sib_ptpn == &(parent->ire_dep_children)); 292011042SErik.Nordmark@Sun.COM child->ire_dep_sib_next = next; 292111042SErik.Nordmark@Sun.COM next->ire_dep_sib_ptpn = &(child->ire_dep_sib_next); 292211042SErik.Nordmark@Sun.COM } 292311042SErik.Nordmark@Sun.COM parent->ire_dep_children = child; 292411042SErik.Nordmark@Sun.COM child->ire_dep_sib_ptpn = &(parent->ire_dep_children); 292511042SErik.Nordmark@Sun.COM 292611042SErik.Nordmark@Sun.COM mutex_enter(&child->ire_lock); 292711042SErik.Nordmark@Sun.COM child->ire_dep_parent = parent; 292811042SErik.Nordmark@Sun.COM mutex_exit(&child->ire_lock); 292911042SErik.Nordmark@Sun.COM 293011042SErik.Nordmark@Sun.COM #ifdef DEBUG 293111042SErik.Nordmark@Sun.COM ire_dep_verify(child); 
293211042SErik.Nordmark@Sun.COM ire_dep_verify(parent); 293311042SErik.Nordmark@Sun.COM #endif 293411042SErik.Nordmark@Sun.COM } 293511042SErik.Nordmark@Sun.COM 293611042SErik.Nordmark@Sun.COM 293711042SErik.Nordmark@Sun.COM /* 293811042SErik.Nordmark@Sun.COM * Given count worth of ires and generations, build ire_dep_* relationships 293911042SErik.Nordmark@Sun.COM * from ires[0] to ires[count-1]. Record generations[i+1] in 294011042SErik.Nordmark@Sun.COM * ire_dep_parent_generation for ires[i]. 294111042SErik.Nordmark@Sun.COM * We graft onto an existing parent chain by making sure that we don't 294211042SErik.Nordmark@Sun.COM * touch ire_dep_parent for ires[count-1]. 294311042SErik.Nordmark@Sun.COM * 294411042SErik.Nordmark@Sun.COM * We check for any condemned ire_generation count and return B_FALSE in 294511042SErik.Nordmark@Sun.COM * that case so that the caller can tear it apart. 294611042SErik.Nordmark@Sun.COM * 294711042SErik.Nordmark@Sun.COM * Note that generations[0] is not used. Caller handles that. 294811042SErik.Nordmark@Sun.COM */ 294911042SErik.Nordmark@Sun.COM boolean_t 295011042SErik.Nordmark@Sun.COM ire_dep_build(ire_t *ires[], uint_t generations[], uint_t count) 295111042SErik.Nordmark@Sun.COM { 295211042SErik.Nordmark@Sun.COM ire_t *ire = ires[0]; 295311042SErik.Nordmark@Sun.COM ip_stack_t *ipst; 295411042SErik.Nordmark@Sun.COM uint_t i; 295511042SErik.Nordmark@Sun.COM 295611042SErik.Nordmark@Sun.COM ASSERT(count > 0); 295711042SErik.Nordmark@Sun.COM if (count == 1) { 295811042SErik.Nordmark@Sun.COM /* No work to do */ 295911042SErik.Nordmark@Sun.COM return (B_TRUE); 296011042SErik.Nordmark@Sun.COM } 296111042SErik.Nordmark@Sun.COM ipst = ire->ire_ipst; 296211042SErik.Nordmark@Sun.COM rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER); 296311042SErik.Nordmark@Sun.COM /* 296411042SErik.Nordmark@Sun.COM * Do not remove the linkage for any existing parent chain i.e., 296511042SErik.Nordmark@Sun.COM * ires[count-1] is left alone. 
296611042SErik.Nordmark@Sun.COM */ 296711042SErik.Nordmark@Sun.COM for (i = 0; i < count-1; i++) { 296811042SErik.Nordmark@Sun.COM /* Remove existing parent if we need to change it */ 296911042SErik.Nordmark@Sun.COM if (ires[i]->ire_dep_parent != NULL && 297011042SErik.Nordmark@Sun.COM ires[i]->ire_dep_parent != ires[i+1]) 297111042SErik.Nordmark@Sun.COM ire_dep_remove(ires[i]); 297211042SErik.Nordmark@Sun.COM } 297311042SErik.Nordmark@Sun.COM 297411042SErik.Nordmark@Sun.COM for (i = 0; i < count - 1; i++) { 297511042SErik.Nordmark@Sun.COM ASSERT(ires[i]->ire_ipversion == IPV4_VERSION || 297611042SErik.Nordmark@Sun.COM ires[i]->ire_ipversion == IPV6_VERSION); 297711042SErik.Nordmark@Sun.COM /* Does it need to change? */ 297811042SErik.Nordmark@Sun.COM if (ires[i]->ire_dep_parent != ires[i+1]) 297911042SErik.Nordmark@Sun.COM ire_dep_parent_insert(ires[i], ires[i+1]); 298011042SErik.Nordmark@Sun.COM 298111042SErik.Nordmark@Sun.COM mutex_enter(&ires[i+1]->ire_lock); 298211042SErik.Nordmark@Sun.COM if (IRE_IS_CONDEMNED(ires[i+1])) { 298311042SErik.Nordmark@Sun.COM mutex_exit(&ires[i+1]->ire_lock); 298411042SErik.Nordmark@Sun.COM rw_exit(&ipst->ips_ire_dep_lock); 298511042SErik.Nordmark@Sun.COM return (B_FALSE); 298611042SErik.Nordmark@Sun.COM } 298711042SErik.Nordmark@Sun.COM mutex_exit(&ires[i+1]->ire_lock); 298811042SErik.Nordmark@Sun.COM 298911042SErik.Nordmark@Sun.COM mutex_enter(&ires[i]->ire_lock); 299011042SErik.Nordmark@Sun.COM ires[i]->ire_dep_parent_generation = generations[i+1]; 299111042SErik.Nordmark@Sun.COM mutex_exit(&ires[i]->ire_lock); 299211042SErik.Nordmark@Sun.COM } 299311042SErik.Nordmark@Sun.COM rw_exit(&ipst->ips_ire_dep_lock); 299411042SErik.Nordmark@Sun.COM return (B_TRUE); 299511042SErik.Nordmark@Sun.COM } 299611042SErik.Nordmark@Sun.COM 299711042SErik.Nordmark@Sun.COM /* 299811042SErik.Nordmark@Sun.COM * Given count worth of ires, unbuild ire_dep_* relationships 299911042SErik.Nordmark@Sun.COM * from ires[0] to ires[count-1]. 
300011042SErik.Nordmark@Sun.COM */ 300111042SErik.Nordmark@Sun.COM void 300211042SErik.Nordmark@Sun.COM ire_dep_unbuild(ire_t *ires[], uint_t count) 300311042SErik.Nordmark@Sun.COM { 300411042SErik.Nordmark@Sun.COM ip_stack_t *ipst; 300511042SErik.Nordmark@Sun.COM uint_t i; 300611042SErik.Nordmark@Sun.COM 300711042SErik.Nordmark@Sun.COM if (count == 0) { 300811042SErik.Nordmark@Sun.COM /* No work to do */ 30092535Ssangeeta return; 30102535Ssangeeta } 301111042SErik.Nordmark@Sun.COM ipst = ires[0]->ire_ipst; 301211042SErik.Nordmark@Sun.COM rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER); 301311042SErik.Nordmark@Sun.COM for (i = 0; i < count; i++) { 301411042SErik.Nordmark@Sun.COM ASSERT(ires[i]->ire_ipversion == IPV4_VERSION || 301511042SErik.Nordmark@Sun.COM ires[i]->ire_ipversion == IPV6_VERSION); 301611042SErik.Nordmark@Sun.COM if (ires[i]->ire_dep_parent != NULL) 301711042SErik.Nordmark@Sun.COM ire_dep_remove(ires[i]); 301811042SErik.Nordmark@Sun.COM mutex_enter(&ires[i]->ire_lock); 301911042SErik.Nordmark@Sun.COM ires[i]->ire_dep_parent_generation = IRE_GENERATION_VERIFY; 302011042SErik.Nordmark@Sun.COM mutex_exit(&ires[i]->ire_lock); 302111042SErik.Nordmark@Sun.COM } 302211042SErik.Nordmark@Sun.COM rw_exit(&ipst->ips_ire_dep_lock); 302311042SErik.Nordmark@Sun.COM } 302411042SErik.Nordmark@Sun.COM 302511042SErik.Nordmark@Sun.COM /* 302611042SErik.Nordmark@Sun.COM * Both the forwarding and the outbound code paths can trip on 302711042SErik.Nordmark@Sun.COM * a condemned NCE, in which case we call this function. 302811042SErik.Nordmark@Sun.COM * We have two different behaviors: if the NCE was UNREACHABLE 302911042SErik.Nordmark@Sun.COM * it is an indication that something failed. In that case 303011042SErik.Nordmark@Sun.COM * we see if we should look for a different IRE (for example, 303111042SErik.Nordmark@Sun.COM * delete any matching redirect IRE, or try a different 303211042SErik.Nordmark@Sun.COM * IRE_DEFAULT (ECMP)). 
We mark the ire as bad so a hopefully 303311042SErik.Nordmark@Sun.COM * different IRE will be picked next time we send/forward. 303411042SErik.Nordmark@Sun.COM * 303511042SErik.Nordmark@Sun.COM * If we are called by the output path then fail_if_better is set 303611042SErik.Nordmark@Sun.COM * and we return NULL if there could be a better IRE. This is because the 303711042SErik.Nordmark@Sun.COM * output path retries the IRE lookup. (The input/forward path can not retry.) 303811042SErik.Nordmark@Sun.COM * 303911042SErik.Nordmark@Sun.COM * If the NCE was not unreachable then we pick/allocate a 304011042SErik.Nordmark@Sun.COM * new (most likely ND_INITIAL) NCE and proceed with it. 304111042SErik.Nordmark@Sun.COM * 304211042SErik.Nordmark@Sun.COM * ipha/ip6h are needed for multicast packets; ipha needs to be 304311042SErik.Nordmark@Sun.COM * set for IPv4 and ip6h needs to be set for IPv6 packets. 304411042SErik.Nordmark@Sun.COM */ 304511042SErik.Nordmark@Sun.COM nce_t * 304611042SErik.Nordmark@Sun.COM ire_handle_condemned_nce(nce_t *nce, ire_t *ire, ipha_t *ipha, ip6_t *ip6h, 304711042SErik.Nordmark@Sun.COM boolean_t fail_if_better) 304811042SErik.Nordmark@Sun.COM { 304911042SErik.Nordmark@Sun.COM if (nce->nce_common->ncec_state == ND_UNREACHABLE) { 305011042SErik.Nordmark@Sun.COM if (ire_no_good(ire) && fail_if_better) { 305111042SErik.Nordmark@Sun.COM /* 305211042SErik.Nordmark@Sun.COM * Did some changes, or ECMP likely to exist. 305311042SErik.Nordmark@Sun.COM * Make ip_output look for a different IRE 305411042SErik.Nordmark@Sun.COM */ 305511042SErik.Nordmark@Sun.COM return (NULL); 305611042SErik.Nordmark@Sun.COM } 305711042SErik.Nordmark@Sun.COM } 305811042SErik.Nordmark@Sun.COM if (ire_revalidate_nce(ire) == ENETUNREACH) { 305911042SErik.Nordmark@Sun.COM /* The ire_dep_parent chain went bad, or no memory? 
*/ 306011042SErik.Nordmark@Sun.COM (void) ire_no_good(ire); 306111042SErik.Nordmark@Sun.COM return (NULL); 306211042SErik.Nordmark@Sun.COM } 306311042SErik.Nordmark@Sun.COM if (ire->ire_ipversion == IPV4_VERSION) { 306411042SErik.Nordmark@Sun.COM ASSERT(ipha != NULL); 306511042SErik.Nordmark@Sun.COM nce = ire_to_nce(ire, ipha->ipha_dst, NULL); 306611042SErik.Nordmark@Sun.COM } else { 306711042SErik.Nordmark@Sun.COM ASSERT(ip6h != NULL); 306811042SErik.Nordmark@Sun.COM nce = ire_to_nce(ire, INADDR_ANY, &ip6h->ip6_dst); 30692535Ssangeeta } 30708485SPeter.Memishian@Sun.COM 307111042SErik.Nordmark@Sun.COM if (nce == NULL) 307211042SErik.Nordmark@Sun.COM return (NULL); 307311042SErik.Nordmark@Sun.COM if (nce->nce_is_condemned) { 307411042SErik.Nordmark@Sun.COM nce_refrele(nce); 307511042SErik.Nordmark@Sun.COM return (NULL); 30762535Ssangeeta } 307711042SErik.Nordmark@Sun.COM return (nce); 307811042SErik.Nordmark@Sun.COM } 307911042SErik.Nordmark@Sun.COM 308011042SErik.Nordmark@Sun.COM /* 308111042SErik.Nordmark@Sun.COM * The caller has found that the ire is bad, either due to a reference to an NCE 308211042SErik.Nordmark@Sun.COM * in ND_UNREACHABLE state, or a MULTIRT route whose gateway can't be resolved. 308311042SErik.Nordmark@Sun.COM * We update things so a subsequent attempt to send to the destination 308411042SErik.Nordmark@Sun.COM * is likely to find different IRE, or that a new NCE would be created. 308511042SErik.Nordmark@Sun.COM * 308611042SErik.Nordmark@Sun.COM * Returns B_TRUE if it is likely that a subsequent ire_ftable_lookup would 308711042SErik.Nordmark@Sun.COM * find a different route (either due to having deleted a redirect, or there 308811042SErik.Nordmark@Sun.COM * being ECMP routes.) 308911042SErik.Nordmark@Sun.COM * 309011042SErik.Nordmark@Sun.COM * If we have a redirect (RTF_DYNAMIC) we delete it. 
309111042SErik.Nordmark@Sun.COM * Otherwise we increment ire_badcnt and increment the generation number so 309211042SErik.Nordmark@Sun.COM * that a cached ixa_ire will redo the route selection. ire_badcnt is taken 309311042SErik.Nordmark@Sun.COM * into account in the route selection when we have multiple choices (multiple 309411042SErik.Nordmark@Sun.COM * default routes or ECMP in general). 309511042SErik.Nordmark@Sun.COM * Any time ip_select_route find an ire with a condemned ire_nce_cache 309611042SErik.Nordmark@Sun.COM * (e.g., if no equal cost route to the bad one) ip_select_route will make 309711042SErik.Nordmark@Sun.COM * sure the NCE is revalidated to avoid getting stuck on a 309811042SErik.Nordmark@Sun.COM * NCE_F_CONDMNED ncec that caused ire_no_good to be called. 309911042SErik.Nordmark@Sun.COM */ 310011042SErik.Nordmark@Sun.COM boolean_t 310111042SErik.Nordmark@Sun.COM ire_no_good(ire_t *ire) 310211042SErik.Nordmark@Sun.COM { 310311042SErik.Nordmark@Sun.COM ip_stack_t *ipst = ire->ire_ipst; 310411042SErik.Nordmark@Sun.COM ire_t *ire2; 310511042SErik.Nordmark@Sun.COM nce_t *nce; 310611042SErik.Nordmark@Sun.COM 310711042SErik.Nordmark@Sun.COM if (ire->ire_flags & RTF_DYNAMIC) { 310811042SErik.Nordmark@Sun.COM ire_delete(ire); 310911042SErik.Nordmark@Sun.COM return (B_TRUE); 311011042SErik.Nordmark@Sun.COM } 311111042SErik.Nordmark@Sun.COM if (ire->ire_flags & RTF_INDIRECT) { 311211042SErik.Nordmark@Sun.COM /* Check if next IRE is a redirect */ 311311042SErik.Nordmark@Sun.COM rw_enter(&ipst->ips_ire_dep_lock, RW_READER); 311411042SErik.Nordmark@Sun.COM if (ire->ire_dep_parent != NULL && 311511042SErik.Nordmark@Sun.COM (ire->ire_dep_parent->ire_flags & RTF_DYNAMIC)) { 311611042SErik.Nordmark@Sun.COM ire2 = ire->ire_dep_parent; 311711042SErik.Nordmark@Sun.COM ire_refhold(ire2); 311811042SErik.Nordmark@Sun.COM } else { 311911042SErik.Nordmark@Sun.COM ire2 = NULL; 312011042SErik.Nordmark@Sun.COM } 312111042SErik.Nordmark@Sun.COM 
rw_exit(&ipst->ips_ire_dep_lock); 312211042SErik.Nordmark@Sun.COM if (ire2 != NULL) { 312311042SErik.Nordmark@Sun.COM ire_delete(ire2); 312411042SErik.Nordmark@Sun.COM ire_refrele(ire2); 312511042SErik.Nordmark@Sun.COM return (B_TRUE); 312611042SErik.Nordmark@Sun.COM } 312711042SErik.Nordmark@Sun.COM } 31282535Ssangeeta /* 312911042SErik.Nordmark@Sun.COM * No redirect involved. Increment badcnt so that if we have ECMP 313011042SErik.Nordmark@Sun.COM * routes we are likely to pick a different one for the next packet. 313111042SErik.Nordmark@Sun.COM * 313211042SErik.Nordmark@Sun.COM * If the NCE is unreachable and condemned we should drop the reference 313311042SErik.Nordmark@Sun.COM * to it so that a new NCE can be created. 313411042SErik.Nordmark@Sun.COM * 313511042SErik.Nordmark@Sun.COM * Finally we increment the generation number so that any ixa_ire 313611042SErik.Nordmark@Sun.COM * cache will be revalidated. 31372535Ssangeeta */ 313811042SErik.Nordmark@Sun.COM mutex_enter(&ire->ire_lock); 313911042SErik.Nordmark@Sun.COM ire->ire_badcnt++; 314011066Srafael.vanoni@sun.com ire->ire_last_badcnt = TICK_TO_SEC(ddi_get_lbolt64()); 314111042SErik.Nordmark@Sun.COM nce = ire->ire_nce_cache; 314211042SErik.Nordmark@Sun.COM if (nce != NULL && nce->nce_is_condemned && 314311042SErik.Nordmark@Sun.COM nce->nce_common->ncec_state == ND_UNREACHABLE) 314411042SErik.Nordmark@Sun.COM ire->ire_nce_cache = NULL; 314511042SErik.Nordmark@Sun.COM else 314611042SErik.Nordmark@Sun.COM nce = NULL; 314711042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 314811042SErik.Nordmark@Sun.COM if (nce != NULL) 314911042SErik.Nordmark@Sun.COM nce_refrele(nce); 315011042SErik.Nordmark@Sun.COM 315111042SErik.Nordmark@Sun.COM ire_increment_generation(ire); 315211042SErik.Nordmark@Sun.COM ire_dep_incr_generation(ire); 315311042SErik.Nordmark@Sun.COM 315411042SErik.Nordmark@Sun.COM return (ire->ire_bucket->irb_ire_cnt > 1); 315511042SErik.Nordmark@Sun.COM } 315611042SErik.Nordmark@Sun.COM 
315711042SErik.Nordmark@Sun.COM /* 315811042SErik.Nordmark@Sun.COM * Walk ire_dep_parent chain and validate that ire_dep_parent->ire_generation == 315911042SErik.Nordmark@Sun.COM * ire_dep_parent_generation. 316011042SErik.Nordmark@Sun.COM * If they all match we just return ire_generation from the topmost IRE. 316111042SErik.Nordmark@Sun.COM * Otherwise we propagate the mismatch by setting all ire_dep_parent_generation 316211042SErik.Nordmark@Sun.COM * above the mismatch to IRE_GENERATION_VERIFY and also returning 316311042SErik.Nordmark@Sun.COM * IRE_GENERATION_VERIFY. 316411042SErik.Nordmark@Sun.COM */ 316511042SErik.Nordmark@Sun.COM uint_t 316611042SErik.Nordmark@Sun.COM ire_dep_validate_generations(ire_t *ire) 316711042SErik.Nordmark@Sun.COM { 316811042SErik.Nordmark@Sun.COM ip_stack_t *ipst = ire->ire_ipst; 316911042SErik.Nordmark@Sun.COM uint_t generation; 317011042SErik.Nordmark@Sun.COM ire_t *ire1; 317111042SErik.Nordmark@Sun.COM 317211042SErik.Nordmark@Sun.COM rw_enter(&ipst->ips_ire_dep_lock, RW_READER); 317311042SErik.Nordmark@Sun.COM generation = ire->ire_generation; /* Assuming things match */ 317411042SErik.Nordmark@Sun.COM for (ire1 = ire; ire1 != NULL; ire1 = ire1->ire_dep_parent) { 317511042SErik.Nordmark@Sun.COM ASSERT(ire1->ire_ipversion == IPV4_VERSION || 317611042SErik.Nordmark@Sun.COM ire1->ire_ipversion == IPV6_VERSION); 317711042SErik.Nordmark@Sun.COM if (ire1->ire_dep_parent == NULL) 317811042SErik.Nordmark@Sun.COM break; 317911042SErik.Nordmark@Sun.COM if (ire1->ire_dep_parent_generation != 318011042SErik.Nordmark@Sun.COM ire1->ire_dep_parent->ire_generation) 318111042SErik.Nordmark@Sun.COM goto mismatch; 318211042SErik.Nordmark@Sun.COM } 318311042SErik.Nordmark@Sun.COM rw_exit(&ipst->ips_ire_dep_lock); 318411042SErik.Nordmark@Sun.COM return (generation); 318511042SErik.Nordmark@Sun.COM 318611042SErik.Nordmark@Sun.COM mismatch: 318711042SErik.Nordmark@Sun.COM generation = IRE_GENERATION_VERIFY; 318811042SErik.Nordmark@Sun.COM /* Fill from 
top down to the mismatch with _VERIFY */ 318911042SErik.Nordmark@Sun.COM while (ire != ire1) { 319011042SErik.Nordmark@Sun.COM ASSERT(ire->ire_ipversion == IPV4_VERSION || 319111042SErik.Nordmark@Sun.COM ire->ire_ipversion == IPV6_VERSION); 319211042SErik.Nordmark@Sun.COM mutex_enter(&ire->ire_lock); 319311042SErik.Nordmark@Sun.COM ire->ire_dep_parent_generation = IRE_GENERATION_VERIFY; 319411042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 319511042SErik.Nordmark@Sun.COM ire = ire->ire_dep_parent; 31962535Ssangeeta } 319711042SErik.Nordmark@Sun.COM rw_exit(&ipst->ips_ire_dep_lock); 319811042SErik.Nordmark@Sun.COM return (generation); 319911042SErik.Nordmark@Sun.COM } 320011042SErik.Nordmark@Sun.COM 320111042SErik.Nordmark@Sun.COM /* 320211042SErik.Nordmark@Sun.COM * Used when we need to return an ire with ire_dep_parent, but we 320311042SErik.Nordmark@Sun.COM * know the chain is invalid for instance we didn't create an IRE_IF_CLONE 320411042SErik.Nordmark@Sun.COM * Using IRE_GENERATION_VERIFY means that next time we'll redo the 320511042SErik.Nordmark@Sun.COM * recursive lookup. 
320611042SErik.Nordmark@Sun.COM */ 320711042SErik.Nordmark@Sun.COM void 320811042SErik.Nordmark@Sun.COM ire_dep_invalidate_generations(ire_t *ire) 320911042SErik.Nordmark@Sun.COM { 321011042SErik.Nordmark@Sun.COM ip_stack_t *ipst = ire->ire_ipst; 321111042SErik.Nordmark@Sun.COM 321211042SErik.Nordmark@Sun.COM rw_enter(&ipst->ips_ire_dep_lock, RW_READER); 321311042SErik.Nordmark@Sun.COM while (ire != NULL) { 321411042SErik.Nordmark@Sun.COM ASSERT(ire->ire_ipversion == IPV4_VERSION || 321511042SErik.Nordmark@Sun.COM ire->ire_ipversion == IPV6_VERSION); 321611042SErik.Nordmark@Sun.COM mutex_enter(&ire->ire_lock); 321711042SErik.Nordmark@Sun.COM ire->ire_dep_parent_generation = IRE_GENERATION_VERIFY; 321811042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 321911042SErik.Nordmark@Sun.COM ire = ire->ire_dep_parent; 322011042SErik.Nordmark@Sun.COM } 322111042SErik.Nordmark@Sun.COM rw_exit(&ipst->ips_ire_dep_lock); 322211042SErik.Nordmark@Sun.COM } 322311042SErik.Nordmark@Sun.COM 322411042SErik.Nordmark@Sun.COM /* Set _VERIFY ire_dep_parent_generation for all children recursively */ 322511042SErik.Nordmark@Sun.COM static void 322611042SErik.Nordmark@Sun.COM ire_dep_invalidate_children(ire_t *child) 322711042SErik.Nordmark@Sun.COM { 322811042SErik.Nordmark@Sun.COM ip_stack_t *ipst = child->ire_ipst; 322911042SErik.Nordmark@Sun.COM 323011042SErik.Nordmark@Sun.COM ASSERT(RW_WRITE_HELD(&ipst->ips_ire_dep_lock)); 323111042SErik.Nordmark@Sun.COM /* Depth first */ 323211042SErik.Nordmark@Sun.COM if (child->ire_dep_children != NULL) 323311042SErik.Nordmark@Sun.COM ire_dep_invalidate_children(child->ire_dep_children); 323411042SErik.Nordmark@Sun.COM 323511042SErik.Nordmark@Sun.COM while (child != NULL) { 323611042SErik.Nordmark@Sun.COM mutex_enter(&child->ire_lock); 323711042SErik.Nordmark@Sun.COM child->ire_dep_parent_generation = IRE_GENERATION_VERIFY; 323811042SErik.Nordmark@Sun.COM mutex_exit(&child->ire_lock); 323911042SErik.Nordmark@Sun.COM child = 
child->ire_dep_sib_next; 324011042SErik.Nordmark@Sun.COM } 324111042SErik.Nordmark@Sun.COM } 324211042SErik.Nordmark@Sun.COM 324311042SErik.Nordmark@Sun.COM static void 324411042SErik.Nordmark@Sun.COM ire_dep_increment_children(ire_t *child) 324511042SErik.Nordmark@Sun.COM { 324611042SErik.Nordmark@Sun.COM ip_stack_t *ipst = child->ire_ipst; 324711042SErik.Nordmark@Sun.COM 324811042SErik.Nordmark@Sun.COM ASSERT(RW_READ_HELD(&ipst->ips_ire_dep_lock)); 324911042SErik.Nordmark@Sun.COM /* Depth first */ 325011042SErik.Nordmark@Sun.COM if (child->ire_dep_children != NULL) 325111042SErik.Nordmark@Sun.COM ire_dep_increment_children(child->ire_dep_children); 325211042SErik.Nordmark@Sun.COM 325311042SErik.Nordmark@Sun.COM while (child != NULL) { 325411042SErik.Nordmark@Sun.COM if (!IRE_IS_CONDEMNED(child)) 325511042SErik.Nordmark@Sun.COM ire_increment_generation(child); 325611042SErik.Nordmark@Sun.COM child = child->ire_dep_sib_next; 32572535Ssangeeta } 32582535Ssangeeta } 32592535Ssangeeta 32602535Ssangeeta /* 326111042SErik.Nordmark@Sun.COM * Walk all the children of this ire recursively and increment their 326211042SErik.Nordmark@Sun.COM * generation number. 
32632535Ssangeeta */ 326411463SSowmini.Varadhan@Sun.COM static void 326511463SSowmini.Varadhan@Sun.COM ire_dep_incr_generation_locked(ire_t *parent) 326611463SSowmini.Varadhan@Sun.COM { 326711463SSowmini.Varadhan@Sun.COM ASSERT(RW_READ_HELD(&parent->ire_ipst->ips_ire_dep_lock)); 326811463SSowmini.Varadhan@Sun.COM if (parent->ire_dep_children != NULL) 326911463SSowmini.Varadhan@Sun.COM ire_dep_increment_children(parent->ire_dep_children); 327011463SSowmini.Varadhan@Sun.COM } 327111463SSowmini.Varadhan@Sun.COM 32722535Ssangeeta void 327311042SErik.Nordmark@Sun.COM ire_dep_incr_generation(ire_t *parent) 32742535Ssangeeta { 327511042SErik.Nordmark@Sun.COM ip_stack_t *ipst = parent->ire_ipst; 327611042SErik.Nordmark@Sun.COM 327711042SErik.Nordmark@Sun.COM rw_enter(&ipst->ips_ire_dep_lock, RW_READER); 327811463SSowmini.Varadhan@Sun.COM ire_dep_incr_generation_locked(parent); 327911042SErik.Nordmark@Sun.COM rw_exit(&ipst->ips_ire_dep_lock); 32802535Ssangeeta } 32812535Ssangeeta 32823772Ssangeeta /* 328311042SErik.Nordmark@Sun.COM * Get a new ire_nce_cache for this IRE as well as its nexthop. 328411042SErik.Nordmark@Sun.COM * Returns zero if it succeeds. Can fail due to lack of memory or when 328511042SErik.Nordmark@Sun.COM * the route has become unreachable. Returns ENOMEM and ENETUNREACH in those 328611042SErik.Nordmark@Sun.COM * cases. 328711042SErik.Nordmark@Sun.COM * 328811042SErik.Nordmark@Sun.COM * In the in.mpathd case, the ire will have ire_testhidden 328911042SErik.Nordmark@Sun.COM * set; so we should create the ncec for the underlying ill. 32904714Ssowmini * 329111042SErik.Nordmark@Sun.COM * Note that the error returned by ire_revalidate_nce() is ignored by most 329211042SErik.Nordmark@Sun.COM * callers except ire_handle_condemned_nce(), which handles the ENETUNREACH 329311042SErik.Nordmark@Sun.COM * error to mark potentially bad ire's. 
For all the other callers, an 329411042SErik.Nordmark@Sun.COM * error return could indicate a transient condition like ENOMEM, or could 329511042SErik.Nordmark@Sun.COM * be the result of an interface that is going down/unplumbing. In the former 329611042SErik.Nordmark@Sun.COM * case (transient error), we would leave the old stale ire/ire_nce_cache 329711042SErik.Nordmark@Sun.COM * in place, and possibly use incorrect link-layer information to send packets 329811042SErik.Nordmark@Sun.COM * but would eventually recover. In the latter case (ill down/replumb), 329911042SErik.Nordmark@Sun.COM * ire_revalidate_nce() might return a condemned nce back, but we would then 330011042SErik.Nordmark@Sun.COM * recover in the packet output path. 33012535Ssangeeta */ 33022535Ssangeeta int 330311042SErik.Nordmark@Sun.COM ire_revalidate_nce(ire_t *ire) 33042535Ssangeeta { 330511042SErik.Nordmark@Sun.COM nce_t *nce, *old_nce; 330611042SErik.Nordmark@Sun.COM ire_t *nexthop; 33072535Ssangeeta 33082535Ssangeeta /* 330911042SErik.Nordmark@Sun.COM * For multicast we conceptually have an NCE but we don't store it 331011042SErik.Nordmark@Sun.COM * in ire_nce_cache; when ire_to_nce is called we allocate the nce. 
33112535Ssangeeta */ 331211042SErik.Nordmark@Sun.COM if (ire->ire_type & IRE_MULTICAST) 331311042SErik.Nordmark@Sun.COM return (0); 331411042SErik.Nordmark@Sun.COM 331511042SErik.Nordmark@Sun.COM /* ire_testhidden should only be set on under-interfaces */ 331611042SErik.Nordmark@Sun.COM ASSERT(!ire->ire_testhidden || !IS_IPMP(ire->ire_ill)); 331711042SErik.Nordmark@Sun.COM 331811042SErik.Nordmark@Sun.COM nexthop = ire_nexthop(ire); 331911042SErik.Nordmark@Sun.COM if (nexthop == NULL) { 332011042SErik.Nordmark@Sun.COM /* The route is potentially bad */ 332111042SErik.Nordmark@Sun.COM (void) ire_no_good(ire); 332211042SErik.Nordmark@Sun.COM return (ENETUNREACH); 33234084Ssowmini } 332411042SErik.Nordmark@Sun.COM if (ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) { 332511042SErik.Nordmark@Sun.COM ASSERT(ire->ire_ill != NULL); 332611042SErik.Nordmark@Sun.COM 332711042SErik.Nordmark@Sun.COM if (ire->ire_ipversion == IPV4_VERSION) 332811042SErik.Nordmark@Sun.COM nce = nce_lookup_v4(ire->ire_ill, &ire->ire_addr); 332911042SErik.Nordmark@Sun.COM else 333011042SErik.Nordmark@Sun.COM nce = nce_lookup_v6(ire->ire_ill, &ire->ire_addr_v6); 333111042SErik.Nordmark@Sun.COM } else { 333211042SErik.Nordmark@Sun.COM ASSERT(nexthop->ire_type & IRE_ONLINK); 333311042SErik.Nordmark@Sun.COM if (ire->ire_ipversion == IPV4_VERSION) { 333411042SErik.Nordmark@Sun.COM nce = arp_nce_init(nexthop->ire_ill, nexthop->ire_addr, 333511042SErik.Nordmark@Sun.COM nexthop->ire_type); 333611042SErik.Nordmark@Sun.COM } else { 333711042SErik.Nordmark@Sun.COM nce = ndp_nce_init(nexthop->ire_ill, 333811042SErik.Nordmark@Sun.COM &nexthop->ire_addr_v6, nexthop->ire_type); 333911042SErik.Nordmark@Sun.COM } 33402535Ssangeeta } 334111042SErik.Nordmark@Sun.COM if (nce == NULL) { 33422535Ssangeeta /* 334311042SErik.Nordmark@Sun.COM * Leave the old stale one in place to avoid a NULL 334411042SErik.Nordmark@Sun.COM * ire_nce_cache. 
33452535Ssangeeta */ 334611042SErik.Nordmark@Sun.COM ire_refrele(nexthop); 334711042SErik.Nordmark@Sun.COM return (ENOMEM); 334811042SErik.Nordmark@Sun.COM } 334911042SErik.Nordmark@Sun.COM 335011042SErik.Nordmark@Sun.COM if (nexthop != ire) { 335111042SErik.Nordmark@Sun.COM /* Update the nexthop ire */ 335211042SErik.Nordmark@Sun.COM mutex_enter(&nexthop->ire_lock); 335311042SErik.Nordmark@Sun.COM old_nce = nexthop->ire_nce_cache; 335411042SErik.Nordmark@Sun.COM if (!IRE_IS_CONDEMNED(nexthop)) { 335511042SErik.Nordmark@Sun.COM nce_refhold(nce); 335611042SErik.Nordmark@Sun.COM nexthop->ire_nce_cache = nce; 335711042SErik.Nordmark@Sun.COM } else { 335811042SErik.Nordmark@Sun.COM nexthop->ire_nce_cache = NULL; 335911042SErik.Nordmark@Sun.COM } 336011042SErik.Nordmark@Sun.COM mutex_exit(&nexthop->ire_lock); 336111042SErik.Nordmark@Sun.COM if (old_nce != NULL) 336211042SErik.Nordmark@Sun.COM nce_refrele(old_nce); 336311042SErik.Nordmark@Sun.COM } 336411042SErik.Nordmark@Sun.COM ire_refrele(nexthop); 336511042SErik.Nordmark@Sun.COM 336611042SErik.Nordmark@Sun.COM mutex_enter(&ire->ire_lock); 336711042SErik.Nordmark@Sun.COM old_nce = ire->ire_nce_cache; 336811042SErik.Nordmark@Sun.COM if (!IRE_IS_CONDEMNED(ire)) { 336911042SErik.Nordmark@Sun.COM nce_refhold(nce); 337011042SErik.Nordmark@Sun.COM ire->ire_nce_cache = nce; 33712535Ssangeeta } else { 337211042SErik.Nordmark@Sun.COM ire->ire_nce_cache = NULL; 33732535Ssangeeta } 337411042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 337511042SErik.Nordmark@Sun.COM if (old_nce != NULL) 337611042SErik.Nordmark@Sun.COM nce_refrele(old_nce); 337711042SErik.Nordmark@Sun.COM 337811042SErik.Nordmark@Sun.COM nce_refrele(nce); 33792535Ssangeeta return (0); 33802535Ssangeeta } 33817880SJonathan.Anderson@Sun.COM 33827880SJonathan.Anderson@Sun.COM /* 338311042SErik.Nordmark@Sun.COM * Get a held nce for a given ire. 338411042SErik.Nordmark@Sun.COM * In the common case this is just from ire_nce_cache. 
338511042SErik.Nordmark@Sun.COM * For IRE_MULTICAST this needs to do an explicit lookup since we do not 338611042SErik.Nordmark@Sun.COM * have an IRE_MULTICAST per address. 338711042SErik.Nordmark@Sun.COM * Note that this explicitly returns CONDEMNED NCEs. The caller needs those 338811042SErik.Nordmark@Sun.COM * so they can check whether the NCE went unreachable (as opposed to was 338911042SErik.Nordmark@Sun.COM * condemned for some other reason). 33907880SJonathan.Anderson@Sun.COM */ 339111042SErik.Nordmark@Sun.COM nce_t * 339211042SErik.Nordmark@Sun.COM ire_to_nce(ire_t *ire, ipaddr_t v4nexthop, const in6_addr_t *v6nexthop) 33937880SJonathan.Anderson@Sun.COM { 339411042SErik.Nordmark@Sun.COM nce_t *nce; 339511042SErik.Nordmark@Sun.COM 339611042SErik.Nordmark@Sun.COM if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) 33977880SJonathan.Anderson@Sun.COM return (NULL); 339811042SErik.Nordmark@Sun.COM 339911042SErik.Nordmark@Sun.COM /* ire_testhidden should only be set on under-interfaces */ 340011042SErik.Nordmark@Sun.COM ASSERT(!ire->ire_testhidden || !IS_IPMP(ire->ire_ill)); 340111042SErik.Nordmark@Sun.COM 340211042SErik.Nordmark@Sun.COM mutex_enter(&ire->ire_lock); 340311042SErik.Nordmark@Sun.COM nce = ire->ire_nce_cache; 340411042SErik.Nordmark@Sun.COM if (nce != NULL) { 340511042SErik.Nordmark@Sun.COM nce_refhold(nce); 340611042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 340711042SErik.Nordmark@Sun.COM return (nce); 34087880SJonathan.Anderson@Sun.COM } 340911042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 341011042SErik.Nordmark@Sun.COM 341111042SErik.Nordmark@Sun.COM if (ire->ire_type & IRE_MULTICAST) { 341211042SErik.Nordmark@Sun.COM ASSERT(ire->ire_ill != NULL); 341311042SErik.Nordmark@Sun.COM 341411042SErik.Nordmark@Sun.COM if (ire->ire_ipversion == IPV4_VERSION) { 341511042SErik.Nordmark@Sun.COM ASSERT(v6nexthop == NULL); 341611042SErik.Nordmark@Sun.COM 341711042SErik.Nordmark@Sun.COM nce = arp_nce_init(ire->ire_ill, v4nexthop, 
341811042SErik.Nordmark@Sun.COM ire->ire_type); 341911042SErik.Nordmark@Sun.COM } else { 342011042SErik.Nordmark@Sun.COM ASSERT(v6nexthop != NULL); 342111042SErik.Nordmark@Sun.COM ASSERT(v4nexthop == 0); 342211042SErik.Nordmark@Sun.COM nce = ndp_nce_init(ire->ire_ill, v6nexthop, 342311042SErik.Nordmark@Sun.COM ire->ire_type); 34247880SJonathan.Anderson@Sun.COM } 342511042SErik.Nordmark@Sun.COM return (nce); 34267880SJonathan.Anderson@Sun.COM } 34277880SJonathan.Anderson@Sun.COM return (NULL); 34287880SJonathan.Anderson@Sun.COM } 34297880SJonathan.Anderson@Sun.COM 343011042SErik.Nordmark@Sun.COM nce_t * 343111042SErik.Nordmark@Sun.COM ire_to_nce_pkt(ire_t *ire, mblk_t *mp) 343211042SErik.Nordmark@Sun.COM { 343311042SErik.Nordmark@Sun.COM ipha_t *ipha; 343411042SErik.Nordmark@Sun.COM ip6_t *ip6h; 343511042SErik.Nordmark@Sun.COM 343611042SErik.Nordmark@Sun.COM if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { 343711042SErik.Nordmark@Sun.COM ipha = (ipha_t *)mp->b_rptr; 343811042SErik.Nordmark@Sun.COM return (ire_to_nce(ire, ipha->ipha_dst, NULL)); 343911042SErik.Nordmark@Sun.COM } else { 344011042SErik.Nordmark@Sun.COM ip6h = (ip6_t *)mp->b_rptr; 344111042SErik.Nordmark@Sun.COM return (ire_to_nce(ire, INADDR_ANY, &ip6h->ip6_dst)); 344211042SErik.Nordmark@Sun.COM } 344311042SErik.Nordmark@Sun.COM } 344411042SErik.Nordmark@Sun.COM 34457880SJonathan.Anderson@Sun.COM /* 344611042SErik.Nordmark@Sun.COM * Given an IRE_INTERFACE (that matches more than one address) create 344711042SErik.Nordmark@Sun.COM * and return an IRE_IF_CLONE for the specific address. 344811042SErik.Nordmark@Sun.COM * Return the generation number. 344911042SErik.Nordmark@Sun.COM * Returns NULL is no memory for the IRE. 345011042SErik.Nordmark@Sun.COM * Handles both IPv4 and IPv6. 
34517880SJonathan.Anderson@Sun.COM */ 34527880SJonathan.Anderson@Sun.COM ire_t * 345311042SErik.Nordmark@Sun.COM ire_create_if_clone(ire_t *ire_if, const in6_addr_t *addr, uint_t *generationp) 34547880SJonathan.Anderson@Sun.COM { 345511042SErik.Nordmark@Sun.COM ire_t *ire; 345611042SErik.Nordmark@Sun.COM ire_t *nire; 345711042SErik.Nordmark@Sun.COM 345811042SErik.Nordmark@Sun.COM if (ire_if->ire_ipversion == IPV4_VERSION) { 345911042SErik.Nordmark@Sun.COM ipaddr_t v4addr; 346011042SErik.Nordmark@Sun.COM ipaddr_t mask = IP_HOST_MASK; 346111042SErik.Nordmark@Sun.COM 346211042SErik.Nordmark@Sun.COM ASSERT(IN6_IS_ADDR_V4MAPPED(addr)); 346311042SErik.Nordmark@Sun.COM IN6_V4MAPPED_TO_IPADDR(addr, v4addr); 346411042SErik.Nordmark@Sun.COM 346511042SErik.Nordmark@Sun.COM ire = ire_create( 346611042SErik.Nordmark@Sun.COM (uchar_t *)&v4addr, /* dest address */ 346711042SErik.Nordmark@Sun.COM (uchar_t *)&mask, /* mask */ 346811042SErik.Nordmark@Sun.COM (uchar_t *)&ire_if->ire_gateway_addr, 346911042SErik.Nordmark@Sun.COM IRE_IF_CLONE, /* IRE type */ 347011042SErik.Nordmark@Sun.COM ire_if->ire_ill, 347111042SErik.Nordmark@Sun.COM ire_if->ire_zoneid, 347211042SErik.Nordmark@Sun.COM ire_if->ire_flags | RTF_HOST, 347311042SErik.Nordmark@Sun.COM NULL, /* No security attr for IRE_IF_ALL */ 347411042SErik.Nordmark@Sun.COM ire_if->ire_ipst); 347511042SErik.Nordmark@Sun.COM } else { 347611042SErik.Nordmark@Sun.COM ASSERT(!IN6_IS_ADDR_V4MAPPED(addr)); 347711042SErik.Nordmark@Sun.COM ire = ire_create_v6( 347811042SErik.Nordmark@Sun.COM addr, /* dest address */ 347911042SErik.Nordmark@Sun.COM &ipv6_all_ones, /* mask */ 348011042SErik.Nordmark@Sun.COM &ire_if->ire_gateway_addr_v6, /* gateway addr */ 348111042SErik.Nordmark@Sun.COM IRE_IF_CLONE, /* IRE type */ 348211042SErik.Nordmark@Sun.COM ire_if->ire_ill, 348311042SErik.Nordmark@Sun.COM ire_if->ire_zoneid, 348411042SErik.Nordmark@Sun.COM ire_if->ire_flags | RTF_HOST, 348511042SErik.Nordmark@Sun.COM NULL, /* No security attr for 
IRE_IF_ALL */ 348611042SErik.Nordmark@Sun.COM ire_if->ire_ipst); 348711042SErik.Nordmark@Sun.COM } 348811042SErik.Nordmark@Sun.COM if (ire == NULL) 348911042SErik.Nordmark@Sun.COM return (NULL); 349011042SErik.Nordmark@Sun.COM 349111042SErik.Nordmark@Sun.COM /* Take the metrics, in particular the mtu, from the IRE_IF */ 349211042SErik.Nordmark@Sun.COM ire->ire_metrics = ire_if->ire_metrics; 349311042SErik.Nordmark@Sun.COM 349411042SErik.Nordmark@Sun.COM nire = ire_add(ire); 349511042SErik.Nordmark@Sun.COM if (nire == NULL) /* Some failure */ 349611042SErik.Nordmark@Sun.COM return (NULL); 349711042SErik.Nordmark@Sun.COM 349811042SErik.Nordmark@Sun.COM if (generationp != NULL) 349911042SErik.Nordmark@Sun.COM *generationp = nire->ire_generation; 350011042SErik.Nordmark@Sun.COM 350111042SErik.Nordmark@Sun.COM /* 350211042SErik.Nordmark@Sun.COM * Make sure races don't add a duplicate by 350311042SErik.Nordmark@Sun.COM * catching the case when an identical was returned. 350411042SErik.Nordmark@Sun.COM */ 350511042SErik.Nordmark@Sun.COM if (nire != ire) { 350611042SErik.Nordmark@Sun.COM ASSERT(nire->ire_identical_ref > 1); 350711042SErik.Nordmark@Sun.COM ire_delete(nire); 350811042SErik.Nordmark@Sun.COM } 350911042SErik.Nordmark@Sun.COM return (nire); 35107880SJonathan.Anderson@Sun.COM } 351111042SErik.Nordmark@Sun.COM 351211042SErik.Nordmark@Sun.COM /* 351311042SErik.Nordmark@Sun.COM * The argument is an IRE_INTERFACE. Delete all of IRE_IF_CLONE in the 351411042SErik.Nordmark@Sun.COM * ire_dep_children (just walk the ire_dep_sib_next since they are all 351511042SErik.Nordmark@Sun.COM * immediate children.) 351611042SErik.Nordmark@Sun.COM * Since we hold a lock while we remove them we need to defer the actual 351711042SErik.Nordmark@Sun.COM * calls to ire_delete() until we have dropped the lock. This makes things 351811042SErik.Nordmark@Sun.COM * less efficient since we restart at the top after dropping the lock. 
But 351911042SErik.Nordmark@Sun.COM * we only run when an IRE_INTERFACE is deleted which is infrquent. 352011042SErik.Nordmark@Sun.COM * 352111042SErik.Nordmark@Sun.COM * Note that ire_dep_children can be any mixture of offlink routes and 352211042SErik.Nordmark@Sun.COM * IRE_IF_CLONE entries. 352311042SErik.Nordmark@Sun.COM */ 352411042SErik.Nordmark@Sun.COM void 352511042SErik.Nordmark@Sun.COM ire_dep_delete_if_clone(ire_t *parent) 352611042SErik.Nordmark@Sun.COM { 352711042SErik.Nordmark@Sun.COM ip_stack_t *ipst = parent->ire_ipst; 352811042SErik.Nordmark@Sun.COM ire_t *child, *next; 352911042SErik.Nordmark@Sun.COM 353011042SErik.Nordmark@Sun.COM restart: 353111042SErik.Nordmark@Sun.COM rw_enter(&ipst->ips_ire_dep_lock, RW_READER); 353211042SErik.Nordmark@Sun.COM if (parent->ire_dep_children == NULL) { 353311042SErik.Nordmark@Sun.COM rw_exit(&ipst->ips_ire_dep_lock); 353411042SErik.Nordmark@Sun.COM return; 353511042SErik.Nordmark@Sun.COM } 353611042SErik.Nordmark@Sun.COM child = parent->ire_dep_children; 353711042SErik.Nordmark@Sun.COM while (child != NULL) { 353811042SErik.Nordmark@Sun.COM next = child->ire_dep_sib_next; 353911042SErik.Nordmark@Sun.COM if ((child->ire_type & IRE_IF_CLONE) && 354011042SErik.Nordmark@Sun.COM !IRE_IS_CONDEMNED(child)) { 354111042SErik.Nordmark@Sun.COM ire_refhold(child); 354211042SErik.Nordmark@Sun.COM rw_exit(&ipst->ips_ire_dep_lock); 354311042SErik.Nordmark@Sun.COM ire_delete(child); 354411042SErik.Nordmark@Sun.COM ASSERT(IRE_IS_CONDEMNED(child)); 354511042SErik.Nordmark@Sun.COM ire_refrele(child); 354611042SErik.Nordmark@Sun.COM goto restart; 354711042SErik.Nordmark@Sun.COM } 354811042SErik.Nordmark@Sun.COM child = next; 354911042SErik.Nordmark@Sun.COM } 355011042SErik.Nordmark@Sun.COM rw_exit(&ipst->ips_ire_dep_lock); 355111042SErik.Nordmark@Sun.COM } 355211042SErik.Nordmark@Sun.COM 355311042SErik.Nordmark@Sun.COM /* 355411042SErik.Nordmark@Sun.COM * ire_pref() is used in recursive route-resolution for a destination to 
355511042SErik.Nordmark@Sun.COM * determine the preference of an ire, where "preference" is determined 355611042SErik.Nordmark@Sun.COM * based on the level of indirection to the destination of the ire. 355711042SErik.Nordmark@Sun.COM * A higher preference indicates that fewer lookups are needed to complete 355811042SErik.Nordmark@Sun.COM * recursive route lookup. Thus 355911042SErik.Nordmark@Sun.COM * ire_pref(RTF_INDIRECT) < ire_pref(IRE_IF_RESOLVER) < ire_pref(IRE_PREF_CLONE) 356011042SErik.Nordmark@Sun.COM */ 356111042SErik.Nordmark@Sun.COM int 356211042SErik.Nordmark@Sun.COM ire_pref(ire_t *ire) 356311042SErik.Nordmark@Sun.COM { 356411042SErik.Nordmark@Sun.COM if (ire->ire_flags & RTF_INDIRECT) 356511042SErik.Nordmark@Sun.COM return (1); 356611042SErik.Nordmark@Sun.COM if (ire->ire_type & IRE_OFFLINK) 356711042SErik.Nordmark@Sun.COM return (2); 356811042SErik.Nordmark@Sun.COM if (ire->ire_type & (IRE_IF_RESOLVER|IRE_IF_NORESOLVER)) 356911042SErik.Nordmark@Sun.COM return (3); 357011042SErik.Nordmark@Sun.COM if (ire->ire_type & IRE_IF_CLONE) 357111042SErik.Nordmark@Sun.COM return (4); 357211042SErik.Nordmark@Sun.COM if (ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) 357311042SErik.Nordmark@Sun.COM return (5); 357411042SErik.Nordmark@Sun.COM return (-1); /* unknown ire_type */ 357511042SErik.Nordmark@Sun.COM } 3576*11681SSowmini.Varadhan@Sun.COM 3577*11681SSowmini.Varadhan@Sun.COM /* 3578*11681SSowmini.Varadhan@Sun.COM * In the preferred/strict src multihoming modes, unbound routes (i.e., 3579*11681SSowmini.Varadhan@Sun.COM * ire_t entries with ire_unbound set to B_TRUE) are bound to an interface 3580*11681SSowmini.Varadhan@Sun.COM * by selecting the first available interface that has an interface route for 3581*11681SSowmini.Varadhan@Sun.COM * the ire_gateway. 
If that interface is subsequently brought down, ill_downi() 3582*11681SSowmini.Varadhan@Sun.COM * will call ire_rebind() so that the unbound route can be bound to some other 3583*11681SSowmini.Varadhan@Sun.COM * matching interface thereby preserving the intended reachability information 3584*11681SSowmini.Varadhan@Sun.COM * from the original unbound route. 3585*11681SSowmini.Varadhan@Sun.COM */ 3586*11681SSowmini.Varadhan@Sun.COM void 3587*11681SSowmini.Varadhan@Sun.COM ire_rebind(ire_t *ire) 3588*11681SSowmini.Varadhan@Sun.COM { 3589*11681SSowmini.Varadhan@Sun.COM ire_t *gw_ire, *new_ire; 3590*11681SSowmini.Varadhan@Sun.COM int match_flags = MATCH_IRE_TYPE; 3591*11681SSowmini.Varadhan@Sun.COM ill_t *gw_ill; 3592*11681SSowmini.Varadhan@Sun.COM boolean_t isv6 = (ire->ire_ipversion == IPV6_VERSION); 3593*11681SSowmini.Varadhan@Sun.COM ip_stack_t *ipst = ire->ire_ipst; 3594*11681SSowmini.Varadhan@Sun.COM 3595*11681SSowmini.Varadhan@Sun.COM ASSERT(ire->ire_unbound); 3596*11681SSowmini.Varadhan@Sun.COM again: 3597*11681SSowmini.Varadhan@Sun.COM if (isv6) { 3598*11681SSowmini.Varadhan@Sun.COM gw_ire = ire_ftable_lookup_v6(&ire->ire_gateway_addr_v6, 0, 0, 3599*11681SSowmini.Varadhan@Sun.COM IRE_INTERFACE, NULL, ALL_ZONES, NULL, match_flags, 0, 3600*11681SSowmini.Varadhan@Sun.COM ipst, NULL); 3601*11681SSowmini.Varadhan@Sun.COM } else { 3602*11681SSowmini.Varadhan@Sun.COM gw_ire = ire_ftable_lookup_v4(ire->ire_gateway_addr, 0, 0, 3603*11681SSowmini.Varadhan@Sun.COM IRE_INTERFACE, NULL, ALL_ZONES, NULL, match_flags, 0, 3604*11681SSowmini.Varadhan@Sun.COM ipst, NULL); 3605*11681SSowmini.Varadhan@Sun.COM } 3606*11681SSowmini.Varadhan@Sun.COM if (gw_ire == NULL) { 3607*11681SSowmini.Varadhan@Sun.COM /* see comments in ip_rt_add[_v6]() for IPMP */ 3608*11681SSowmini.Varadhan@Sun.COM if (match_flags & MATCH_IRE_TESTHIDDEN) 3609*11681SSowmini.Varadhan@Sun.COM return; 3610*11681SSowmini.Varadhan@Sun.COM 3611*11681SSowmini.Varadhan@Sun.COM match_flags |= MATCH_IRE_TESTHIDDEN; 
3612*11681SSowmini.Varadhan@Sun.COM goto again; 3613*11681SSowmini.Varadhan@Sun.COM } 3614*11681SSowmini.Varadhan@Sun.COM gw_ill = gw_ire->ire_ill; 3615*11681SSowmini.Varadhan@Sun.COM if (isv6) { 3616*11681SSowmini.Varadhan@Sun.COM new_ire = ire_create_v6(&ire->ire_addr_v6, &ire->ire_mask_v6, 3617*11681SSowmini.Varadhan@Sun.COM &ire->ire_gateway_addr_v6, ire->ire_type, gw_ill, 3618*11681SSowmini.Varadhan@Sun.COM ire->ire_zoneid, ire->ire_flags, NULL, ipst); 3619*11681SSowmini.Varadhan@Sun.COM } else { 3620*11681SSowmini.Varadhan@Sun.COM new_ire = ire_create((uchar_t *)&ire->ire_addr, 3621*11681SSowmini.Varadhan@Sun.COM (uchar_t *)&ire->ire_mask, 3622*11681SSowmini.Varadhan@Sun.COM (uchar_t *)&ire->ire_gateway_addr, ire->ire_type, gw_ill, 3623*11681SSowmini.Varadhan@Sun.COM ire->ire_zoneid, ire->ire_flags, NULL, ipst); 3624*11681SSowmini.Varadhan@Sun.COM } 3625*11681SSowmini.Varadhan@Sun.COM ire_refrele(gw_ire); 3626*11681SSowmini.Varadhan@Sun.COM if (new_ire == NULL) 3627*11681SSowmini.Varadhan@Sun.COM return; 3628*11681SSowmini.Varadhan@Sun.COM new_ire->ire_unbound = B_TRUE; 3629*11681SSowmini.Varadhan@Sun.COM new_ire = ire_add(new_ire); 3630*11681SSowmini.Varadhan@Sun.COM if (new_ire != NULL) 3631*11681SSowmini.Varadhan@Sun.COM ire_refrele(new_ire); 3632*11681SSowmini.Varadhan@Sun.COM } 3633