10Sstevel@tonic-gate /* 20Sstevel@tonic-gate * CDDL HEADER START 30Sstevel@tonic-gate * 40Sstevel@tonic-gate * The contents of this file are subject to the terms of the 51676Sjpk * Common Development and Distribution License (the "License"). 61676Sjpk * You may not use this file except in compliance with the License. 70Sstevel@tonic-gate * 80Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 90Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing. 100Sstevel@tonic-gate * See the License for the specific language governing permissions 110Sstevel@tonic-gate * and limitations under the License. 120Sstevel@tonic-gate * 130Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each 140Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 150Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the 160Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying 170Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner] 180Sstevel@tonic-gate * 190Sstevel@tonic-gate * CDDL HEADER END 200Sstevel@tonic-gate */ 210Sstevel@tonic-gate /* 22*11457SErik.Nordmark@Sun.COM * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 230Sstevel@tonic-gate * Use is subject to license terms. 240Sstevel@tonic-gate */ 250Sstevel@tonic-gate /* Copyright (c) 1990 Mentat Inc. */ 260Sstevel@tonic-gate 270Sstevel@tonic-gate /* 280Sstevel@tonic-gate * This file contains routines that manipulate Internet Routing Entries (IREs). 
290Sstevel@tonic-gate */ 300Sstevel@tonic-gate 310Sstevel@tonic-gate #include <sys/types.h> 320Sstevel@tonic-gate #include <sys/stream.h> 330Sstevel@tonic-gate #include <sys/stropts.h> 348485SPeter.Memishian@Sun.COM #include <sys/strsun.h> 358778SErik.Nordmark@Sun.COM #include <sys/strsubr.h> 360Sstevel@tonic-gate #include <sys/ddi.h> 370Sstevel@tonic-gate #include <sys/cmn_err.h> 380Sstevel@tonic-gate #include <sys/policy.h> 390Sstevel@tonic-gate 400Sstevel@tonic-gate #include <sys/systm.h> 410Sstevel@tonic-gate #include <sys/kmem.h> 420Sstevel@tonic-gate #include <sys/param.h> 430Sstevel@tonic-gate #include <sys/socket.h> 440Sstevel@tonic-gate #include <net/if.h> 450Sstevel@tonic-gate #include <net/route.h> 460Sstevel@tonic-gate #include <netinet/in.h> 470Sstevel@tonic-gate #include <net/if_dl.h> 480Sstevel@tonic-gate #include <netinet/ip6.h> 490Sstevel@tonic-gate #include <netinet/icmp6.h> 500Sstevel@tonic-gate 510Sstevel@tonic-gate #include <inet/common.h> 520Sstevel@tonic-gate #include <inet/mi.h> 530Sstevel@tonic-gate #include <inet/ip.h> 540Sstevel@tonic-gate #include <inet/ip6.h> 550Sstevel@tonic-gate #include <inet/ip_ndp.h> 562535Ssangeeta #include <inet/arp.h> 570Sstevel@tonic-gate #include <inet/ip_if.h> 580Sstevel@tonic-gate #include <inet/ip_ire.h> 592535Ssangeeta #include <inet/ip_ftable.h> 600Sstevel@tonic-gate #include <inet/ip_rts.h> 610Sstevel@tonic-gate #include <inet/nd.h> 620Sstevel@tonic-gate 630Sstevel@tonic-gate #include <inet/tcp.h> 640Sstevel@tonic-gate #include <inet/ipclassifier.h> 650Sstevel@tonic-gate #include <sys/zone.h> 663448Sdh155122 #include <sys/cpuvar.h> 673448Sdh155122 681676Sjpk #include <sys/tsol/label.h> 691676Sjpk #include <sys/tsol/tnet.h> 701676Sjpk 712535Ssangeeta struct kmem_cache *rt_entry_cache; 722535Ssangeeta 7311042SErik.Nordmark@Sun.COM typedef struct nce_clookup_s { 7411042SErik.Nordmark@Sun.COM ipaddr_t ncecl_addr; 7511042SErik.Nordmark@Sun.COM boolean_t ncecl_found; 7611042SErik.Nordmark@Sun.COM } 
nce_clookup_t; 7711042SErik.Nordmark@Sun.COM 780Sstevel@tonic-gate /* 790Sstevel@tonic-gate * Synchronization notes: 800Sstevel@tonic-gate * 810Sstevel@tonic-gate * The fields of the ire_t struct are protected in the following way : 820Sstevel@tonic-gate * 830Sstevel@tonic-gate * ire_next/ire_ptpn 840Sstevel@tonic-gate * 8511042SErik.Nordmark@Sun.COM * - bucket lock of the forwarding table in which is ire stored. 860Sstevel@tonic-gate * 8711042SErik.Nordmark@Sun.COM * ire_ill, ire_u *except* ire_gateway_addr[v6], ire_mask, 8811042SErik.Nordmark@Sun.COM * ire_type, ire_create_time, ire_masklen, ire_ipversion, ire_flags, 8911042SErik.Nordmark@Sun.COM * ire_bucket 900Sstevel@tonic-gate * 910Sstevel@tonic-gate * - Set in ire_create_v4/v6 and never changes after that. Thus, 920Sstevel@tonic-gate * we don't need a lock whenever these fields are accessed. 930Sstevel@tonic-gate * 940Sstevel@tonic-gate * - ire_bucket and ire_masklen (also set in ire_create) is set in 9511042SErik.Nordmark@Sun.COM * ire_add before inserting in the bucket and never 960Sstevel@tonic-gate * changes after that. Thus we don't need a lock whenever these 970Sstevel@tonic-gate * fields are accessed. 980Sstevel@tonic-gate * 990Sstevel@tonic-gate * ire_gateway_addr_v4[v6] 1000Sstevel@tonic-gate * 1010Sstevel@tonic-gate * - ire_gateway_addr_v4[v6] is set during ire_create and later modified 1020Sstevel@tonic-gate * by rts_setgwr[v6]. As ire_gateway_addr is a uint32_t, updates to 1030Sstevel@tonic-gate * it assumed to be atomic and hence the other parts of the code 1040Sstevel@tonic-gate * does not use any locks. ire_gateway_addr_v6 updates are not atomic 1050Sstevel@tonic-gate * and hence any access to it uses ire_lock to get/set the right value. 
1060Sstevel@tonic-gate * 10711042SErik.Nordmark@Sun.COM * ire_refcnt, ire_identical_ref 1080Sstevel@tonic-gate * 1090Sstevel@tonic-gate * - Updated atomically using atomic_add_32 1100Sstevel@tonic-gate * 1110Sstevel@tonic-gate * ire_ssthresh, ire_rtt_sd, ire_rtt, ire_ib_pkt_count, ire_ob_pkt_count 1120Sstevel@tonic-gate * 1130Sstevel@tonic-gate * - Assumes that 32 bit writes are atomic. No locks. ire_lock is 1140Sstevel@tonic-gate * used to serialize updates to ire_ssthresh, ire_rtt_sd, ire_rtt. 1150Sstevel@tonic-gate * 11611042SErik.Nordmark@Sun.COM * ire_generation 11711042SErik.Nordmark@Sun.COM * - Under ire_lock 1180Sstevel@tonic-gate * 11911042SErik.Nordmark@Sun.COM * ire_nce_cache 12011042SErik.Nordmark@Sun.COM * - Under ire_lock 1210Sstevel@tonic-gate * 12211042SErik.Nordmark@Sun.COM * ire_dep_parent (To next IRE in recursive lookup chain) 12311042SErik.Nordmark@Sun.COM * - Under ips_ire_dep_lock. Write held when modifying. Read held when 12411042SErik.Nordmark@Sun.COM * walking. We also hold ire_lock when modifying to allow the data path 12511042SErik.Nordmark@Sun.COM * to only acquire ire_lock. 1260Sstevel@tonic-gate * 12711042SErik.Nordmark@Sun.COM * ire_dep_parent_generation (Generation number from ire_dep_parent) 12811042SErik.Nordmark@Sun.COM * - Under ips_ire_dep_lock and/or ire_lock. (A read claim on the dep_lock 12911042SErik.Nordmark@Sun.COM * and ire_lock held when modifying) 1300Sstevel@tonic-gate * 13111042SErik.Nordmark@Sun.COM * ire_dep_children (From parent to first child) 13211042SErik.Nordmark@Sun.COM * ire_dep_sib_next (linked list of siblings) 13311042SErik.Nordmark@Sun.COM * ire_dep_sib_ptpn (linked list of siblings) 13411042SErik.Nordmark@Sun.COM * - Under ips_ire_dep_lock. Write held when modifying. Read held when 13511042SErik.Nordmark@Sun.COM * walking. 
 *
 * As we always hold the bucket locks in all the places while accessing
 * the above values, it is natural to use them for protecting them.
 *
 * We have a forwarding table for IPv4 and IPv6. The IPv6 forwarding table
 * (ip_forwarding_table_v6) is an array of pointers to arrays of irb_t
 * structures. ip_forwarding_table_v6 is allocated dynamically in
 * ire_add_v6. ire_ft_init_lock is used to serialize multiple threads
 * initializing the same bucket. Once a bucket is initialized, it is never
 * deallocated. This assumption enables us to access
 * ip_forwarding_table_v6[i] without any locks.
 *
 * The forwarding table for IPv4 is a radix tree whose leaves
 * are rt_entry structures containing the irb_t for the rt_dst. The irb_t
 * for IPv4 is dynamically allocated and freed.
 *
 * Each irb_t - ire bucket structure has a lock to protect
 * a bucket and the ires residing in the bucket have a back pointer to
 * the bucket structure. It also has a reference count for the number
 * of threads walking the bucket - irb_refcnt which is bumped up
 * using the irb_refhold function. The flags irb_marks can be
 * set to IRB_MARK_CONDEMNED indicating that there are some ires
 * in this bucket that are IRE_IS_CONDEMNED and the
 * last thread to leave the bucket should delete the ires. Usually
 * this is done by the irb_refrele function which is used to decrement
 * the reference count on a bucket. See comments above irb_t structure
 * definition in ip.h for further details.
 *
 * The ire_refhold/ire_refrele functions operate on the ire which increments/
 * decrements the reference count, ire_refcnt, atomically on the ire.
 * ire_refcnt is modified only using those functions. Operations on the IRE
 * could be described as follows :
 *
 * CREATE an ire with reference count initialized to 1.
 *
 * ADDITION of an ire holds the bucket lock, checks for duplicates
 * and then adds the ire. ire_add returns the ire after
 * bumping up once more i.e the reference count is 2. This is to avoid
 * an extra lookup in the functions calling ire_add which wants to
 * work with the ire after adding.
 *
 * LOOKUP of an ire bumps up the reference count using ire_refhold
 * function. It is valid to bump up the reference count of the IRE,
 * after the lookup has returned an ire. Following are the lookup
 * functions that return an HELD ire :
 *
 * ire_ftable_lookup[_v6], ire_lookup_multi_ill[_v6]
 *
 * DELETION of an ire holds the bucket lock, removes it from the list
 * and then decrements the reference count for having removed from the list
 * by using the ire_refrele function. If some other thread has looked up
 * the ire, the reference count would have been bumped up and hence
 * this ire will not be freed once deleted.
It will be freed once the 1890Sstevel@tonic-gate * reference count drops to zero. 1900Sstevel@tonic-gate * 1910Sstevel@tonic-gate * Add and Delete acquires the bucket lock as RW_WRITER, while all the 1920Sstevel@tonic-gate * lookups acquire the bucket lock as RW_READER. 1930Sstevel@tonic-gate * 19411042SErik.Nordmark@Sun.COM * The general rule is to do the ire_refrele in the function 1950Sstevel@tonic-gate * that is passing the ire as an argument. 1960Sstevel@tonic-gate * 1970Sstevel@tonic-gate * In trying to locate ires the following points are to be noted. 1980Sstevel@tonic-gate * 19911042SErik.Nordmark@Sun.COM * IRE_IS_CONDEMNED signifies that the ire has been logically deleted and is 2000Sstevel@tonic-gate * to be ignored when walking the ires using ire_next. 2010Sstevel@tonic-gate * 2020Sstevel@tonic-gate * Zones note: 2030Sstevel@tonic-gate * Walking IREs within a given zone also walks certain ires in other 2040Sstevel@tonic-gate * zones. This is done intentionally. IRE walks with a specified 2050Sstevel@tonic-gate * zoneid are used only when doing informational reports, and 2060Sstevel@tonic-gate * zone users want to see things that they can access. See block 2070Sstevel@tonic-gate * comment in ire_walk_ill_match(). 2080Sstevel@tonic-gate */ 2090Sstevel@tonic-gate 2100Sstevel@tonic-gate /* 2110Sstevel@tonic-gate * The size of the forwarding table. We will make sure that it is a 2120Sstevel@tonic-gate * power of 2 in ip_ire_init(). 
2133448Sdh155122 * Setable in /etc/system 2140Sstevel@tonic-gate */ 2150Sstevel@tonic-gate uint32_t ip6_ftable_hash_size = IP6_FTABLE_HASH_SIZE; 2160Sstevel@tonic-gate 2170Sstevel@tonic-gate struct kmem_cache *ire_cache; 21811042SErik.Nordmark@Sun.COM struct kmem_cache *ncec_cache; 21911042SErik.Nordmark@Sun.COM struct kmem_cache *nce_cache; 22011042SErik.Nordmark@Sun.COM 2210Sstevel@tonic-gate static ire_t ire_null; 2220Sstevel@tonic-gate 22311042SErik.Nordmark@Sun.COM static ire_t *ire_add_v4(ire_t *ire); 2240Sstevel@tonic-gate static void ire_delete_v4(ire_t *ire); 22511042SErik.Nordmark@Sun.COM static void ire_dep_invalidate_children(ire_t *child); 2261676Sjpk static void ire_walk_ipvers(pfv_t func, void *arg, uchar_t vers, 2273448Sdh155122 zoneid_t zoneid, ip_stack_t *); 2280Sstevel@tonic-gate static void ire_walk_ill_ipvers(uint_t match_flags, uint_t ire_type, 2291676Sjpk pfv_t func, void *arg, uchar_t vers, ill_t *ill); 2305023Scarlsonj #ifdef DEBUG 2315023Scarlsonj static void ire_trace_cleanup(const ire_t *); 2320Sstevel@tonic-gate #endif 2330Sstevel@tonic-gate 2340Sstevel@tonic-gate /* 23511042SErik.Nordmark@Sun.COM * Following are the functions to increment/decrement the reference 23611042SErik.Nordmark@Sun.COM * count of the IREs and IRBs (ire bucket). 23711042SErik.Nordmark@Sun.COM * 23811042SErik.Nordmark@Sun.COM * 1) We bump up the reference count of an IRE to make sure that 23911042SErik.Nordmark@Sun.COM * it does not get deleted and freed while we are using it. 24011042SErik.Nordmark@Sun.COM * Typically all the lookup functions hold the bucket lock, 24111042SErik.Nordmark@Sun.COM * and look for the IRE. If it finds an IRE, it bumps up the 24211042SErik.Nordmark@Sun.COM * reference count before dropping the lock. Sometimes we *may* want 24311042SErik.Nordmark@Sun.COM * to bump up the reference count after we *looked* up i.e without 24411042SErik.Nordmark@Sun.COM * holding the bucket lock. 
So, the ire_refhold function does not assert 24511042SErik.Nordmark@Sun.COM * on the bucket lock being held. Any thread trying to delete from 24611042SErik.Nordmark@Sun.COM * the hash bucket can still do so but cannot free the IRE if 24711042SErik.Nordmark@Sun.COM * ire_refcnt is not 0. 24811042SErik.Nordmark@Sun.COM * 24911042SErik.Nordmark@Sun.COM * 2) We bump up the reference count on the bucket where the IRE resides 25011042SErik.Nordmark@Sun.COM * (IRB), when we want to prevent the IREs getting deleted from a given 25111042SErik.Nordmark@Sun.COM * hash bucket. This makes life easier for ire_walk type functions which 25211042SErik.Nordmark@Sun.COM * wants to walk the IRE list, call a function, but needs to drop 25311042SErik.Nordmark@Sun.COM * the bucket lock to prevent recursive rw_enters. While the 25411042SErik.Nordmark@Sun.COM * lock is dropped, the list could be changed by other threads or 25511042SErik.Nordmark@Sun.COM * the same thread could end up deleting the ire or the ire pointed by 25611042SErik.Nordmark@Sun.COM * ire_next. ire_refholding the ire or ire_next is not sufficient as 25711042SErik.Nordmark@Sun.COM * a delete will still remove the ire from the bucket while we have 25811042SErik.Nordmark@Sun.COM * dropped the lock and hence the ire_next would be NULL. Thus, we 25911042SErik.Nordmark@Sun.COM * need a mechanism to prevent deletions from a given bucket. 26011042SErik.Nordmark@Sun.COM * 26111042SErik.Nordmark@Sun.COM * To prevent deletions, we bump up the reference count on the 26211042SErik.Nordmark@Sun.COM * bucket. If the bucket is held, ire_delete just marks both 26311042SErik.Nordmark@Sun.COM * the ire and irb as CONDEMNED. When the 26411042SErik.Nordmark@Sun.COM * reference count on the bucket drops to zero, all the CONDEMNED ires 26511042SErik.Nordmark@Sun.COM * are deleted. 
We don't have to bump up the reference count on the 26611042SErik.Nordmark@Sun.COM * bucket if we are walking the bucket and never have to drop the bucket 26711042SErik.Nordmark@Sun.COM * lock. Note that irb_refhold does not prevent addition of new ires 26811042SErik.Nordmark@Sun.COM * in the list. It is okay because addition of new ires will not cause 26911042SErik.Nordmark@Sun.COM * ire_next to point to freed memory. We do irb_refhold only when 27011042SErik.Nordmark@Sun.COM * all of the 3 conditions are true : 27111042SErik.Nordmark@Sun.COM * 27211042SErik.Nordmark@Sun.COM * 1) The code needs to walk the IRE bucket from start to end. 27311042SErik.Nordmark@Sun.COM * 2) It may have to drop the bucket lock sometimes while doing (1) 27411042SErik.Nordmark@Sun.COM * 3) It does not want any ires to be deleted meanwhile. 27511042SErik.Nordmark@Sun.COM */ 27611042SErik.Nordmark@Sun.COM 27711042SErik.Nordmark@Sun.COM /* 27811042SErik.Nordmark@Sun.COM * Bump up the reference count on the hash bucket - IRB to 27911042SErik.Nordmark@Sun.COM * prevent ires from being deleted in this bucket. 
28011042SErik.Nordmark@Sun.COM */ 28111042SErik.Nordmark@Sun.COM void 28211042SErik.Nordmark@Sun.COM irb_refhold(irb_t *irb) 28311042SErik.Nordmark@Sun.COM { 28411042SErik.Nordmark@Sun.COM rw_enter(&irb->irb_lock, RW_WRITER); 28511042SErik.Nordmark@Sun.COM irb->irb_refcnt++; 28611042SErik.Nordmark@Sun.COM ASSERT(irb->irb_refcnt != 0); 28711042SErik.Nordmark@Sun.COM rw_exit(&irb->irb_lock); 28811042SErik.Nordmark@Sun.COM } 28911042SErik.Nordmark@Sun.COM 29011042SErik.Nordmark@Sun.COM void 29111042SErik.Nordmark@Sun.COM irb_refhold_locked(irb_t *irb) 29211042SErik.Nordmark@Sun.COM { 29311042SErik.Nordmark@Sun.COM ASSERT(RW_WRITE_HELD(&irb->irb_lock)); 29411042SErik.Nordmark@Sun.COM irb->irb_refcnt++; 29511042SErik.Nordmark@Sun.COM ASSERT(irb->irb_refcnt != 0); 29611042SErik.Nordmark@Sun.COM } 29711042SErik.Nordmark@Sun.COM 29811042SErik.Nordmark@Sun.COM /* 29911042SErik.Nordmark@Sun.COM * Note: when IRB_MARK_DYNAMIC is not set the irb_t 30011042SErik.Nordmark@Sun.COM * is statically allocated, so that when the irb_refcnt goes to 0, 30111042SErik.Nordmark@Sun.COM * we simply clean up the ire list and continue. 
30211042SErik.Nordmark@Sun.COM */ 30311042SErik.Nordmark@Sun.COM void 30411042SErik.Nordmark@Sun.COM irb_refrele(irb_t *irb) 30511042SErik.Nordmark@Sun.COM { 30611042SErik.Nordmark@Sun.COM if (irb->irb_marks & IRB_MARK_DYNAMIC) { 30711042SErik.Nordmark@Sun.COM irb_refrele_ftable(irb); 30811042SErik.Nordmark@Sun.COM } else { 30911042SErik.Nordmark@Sun.COM rw_enter(&irb->irb_lock, RW_WRITER); 31011042SErik.Nordmark@Sun.COM ASSERT(irb->irb_refcnt != 0); 31111042SErik.Nordmark@Sun.COM if (--irb->irb_refcnt == 0 && 31211042SErik.Nordmark@Sun.COM (irb->irb_marks & IRB_MARK_CONDEMNED)) { 31311042SErik.Nordmark@Sun.COM ire_t *ire_list; 31411042SErik.Nordmark@Sun.COM 31511042SErik.Nordmark@Sun.COM ire_list = ire_unlink(irb); 31611042SErik.Nordmark@Sun.COM rw_exit(&irb->irb_lock); 31711042SErik.Nordmark@Sun.COM ASSERT(ire_list != NULL); 31811042SErik.Nordmark@Sun.COM ire_cleanup(ire_list); 31911042SErik.Nordmark@Sun.COM } else { 32011042SErik.Nordmark@Sun.COM rw_exit(&irb->irb_lock); 32111042SErik.Nordmark@Sun.COM } 32211042SErik.Nordmark@Sun.COM } 32311042SErik.Nordmark@Sun.COM } 32411042SErik.Nordmark@Sun.COM 32511042SErik.Nordmark@Sun.COM 32611042SErik.Nordmark@Sun.COM /* 32711042SErik.Nordmark@Sun.COM * Bump up the reference count on the IRE. We cannot assert that the 32811042SErik.Nordmark@Sun.COM * bucket lock is being held as it is legal to bump up the reference 32911042SErik.Nordmark@Sun.COM * count after the first lookup has returned the IRE without 33011042SErik.Nordmark@Sun.COM * holding the lock. 
33111042SErik.Nordmark@Sun.COM */ 33211042SErik.Nordmark@Sun.COM void 33311042SErik.Nordmark@Sun.COM ire_refhold(ire_t *ire) 33411042SErik.Nordmark@Sun.COM { 33511042SErik.Nordmark@Sun.COM atomic_add_32(&(ire)->ire_refcnt, 1); 33611042SErik.Nordmark@Sun.COM ASSERT((ire)->ire_refcnt != 0); 33711042SErik.Nordmark@Sun.COM #ifdef DEBUG 33811042SErik.Nordmark@Sun.COM ire_trace_ref(ire); 33911042SErik.Nordmark@Sun.COM #endif 34011042SErik.Nordmark@Sun.COM } 34111042SErik.Nordmark@Sun.COM 34211042SErik.Nordmark@Sun.COM void 34311042SErik.Nordmark@Sun.COM ire_refhold_notr(ire_t *ire) 34411042SErik.Nordmark@Sun.COM { 34511042SErik.Nordmark@Sun.COM atomic_add_32(&(ire)->ire_refcnt, 1); 34611042SErik.Nordmark@Sun.COM ASSERT((ire)->ire_refcnt != 0); 34711042SErik.Nordmark@Sun.COM } 34811042SErik.Nordmark@Sun.COM 34911042SErik.Nordmark@Sun.COM void 35011042SErik.Nordmark@Sun.COM ire_refhold_locked(ire_t *ire) 35111042SErik.Nordmark@Sun.COM { 35211042SErik.Nordmark@Sun.COM #ifdef DEBUG 35311042SErik.Nordmark@Sun.COM ire_trace_ref(ire); 35411042SErik.Nordmark@Sun.COM #endif 35511042SErik.Nordmark@Sun.COM ire->ire_refcnt++; 35611042SErik.Nordmark@Sun.COM } 35711042SErik.Nordmark@Sun.COM 35811042SErik.Nordmark@Sun.COM /* 35911042SErik.Nordmark@Sun.COM * Release a ref on an IRE. 3600Sstevel@tonic-gate * 3610Sstevel@tonic-gate * Must not be called while holding any locks. Otherwise if this is 3620Sstevel@tonic-gate * the last reference to be released there is a chance of recursive mutex 3630Sstevel@tonic-gate * panic due to ire_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 3640Sstevel@tonic-gate * to restart an ioctl. The one exception is when the caller is sure that 3650Sstevel@tonic-gate * this is not the last reference to be released. Eg. if the caller is 3660Sstevel@tonic-gate * sure that the ire has not been deleted and won't be deleted. 
36711042SErik.Nordmark@Sun.COM * 36811042SErik.Nordmark@Sun.COM * In architectures e.g sun4u, where atomic_add_32_nv is just 36911042SErik.Nordmark@Sun.COM * a cas, we need to maintain the right memory barrier semantics 37011042SErik.Nordmark@Sun.COM * as that of mutex_exit i.e all the loads and stores should complete 37111042SErik.Nordmark@Sun.COM * before the cas is executed. membar_exit() does that here. 3720Sstevel@tonic-gate */ 3730Sstevel@tonic-gate void 3740Sstevel@tonic-gate ire_refrele(ire_t *ire) 3750Sstevel@tonic-gate { 37611042SErik.Nordmark@Sun.COM #ifdef DEBUG 37711042SErik.Nordmark@Sun.COM ire_untrace_ref(ire); 37811042SErik.Nordmark@Sun.COM #endif 37911042SErik.Nordmark@Sun.COM ASSERT((ire)->ire_refcnt != 0); 38011042SErik.Nordmark@Sun.COM membar_exit(); 38111042SErik.Nordmark@Sun.COM if (atomic_add_32_nv(&(ire)->ire_refcnt, -1) == 0) 38211042SErik.Nordmark@Sun.COM ire_inactive(ire); 3830Sstevel@tonic-gate } 3840Sstevel@tonic-gate 3850Sstevel@tonic-gate void 3860Sstevel@tonic-gate ire_refrele_notr(ire_t *ire) 3870Sstevel@tonic-gate { 38811042SErik.Nordmark@Sun.COM ASSERT((ire)->ire_refcnt != 0); 38911042SErik.Nordmark@Sun.COM membar_exit(); 39011042SErik.Nordmark@Sun.COM if (atomic_add_32_nv(&(ire)->ire_refcnt, -1) == 0) 39111042SErik.Nordmark@Sun.COM ire_inactive(ire); 3920Sstevel@tonic-gate } 3930Sstevel@tonic-gate 3940Sstevel@tonic-gate /* 3950Sstevel@tonic-gate * This function is associated with the IP_IOC_IRE_DELETE[_NO_REPLY] 39611042SErik.Nordmark@Sun.COM * IOCTL[s]. The NO_REPLY form is used by TCP to tell IP that it is 39711042SErik.Nordmark@Sun.COM * having problems reaching a particular destination. 39811042SErik.Nordmark@Sun.COM * This will make IP consider alternate routes (e.g., when there are 39911042SErik.Nordmark@Sun.COM * muliple default routes), and it will also make IP discard any (potentially) 40011042SErik.Nordmark@Sun.COM * stale redirect. 
40111042SErik.Nordmark@Sun.COM * Management processes may want to use the version that generates a reply. 4020Sstevel@tonic-gate * 40311042SErik.Nordmark@Sun.COM * With the use of NUD like behavior for IPv4/ARP in addition to IPv6 40411042SErik.Nordmark@Sun.COM * this function shouldn't be necessary for IP to recover from a bad redirect, 40511042SErik.Nordmark@Sun.COM * a bad default router (when there are multiple default routers), or 40611042SErik.Nordmark@Sun.COM * a stale ND/ARP entry. But we retain it in any case. 40711042SErik.Nordmark@Sun.COM * For instance, this is helpful when TCP suspects a failure before NUD does. 4080Sstevel@tonic-gate */ 4090Sstevel@tonic-gate int 4100Sstevel@tonic-gate ip_ire_delete(queue_t *q, mblk_t *mp, cred_t *ioc_cr) 4110Sstevel@tonic-gate { 4122535Ssangeeta uchar_t *addr_ucp; 41311042SErik.Nordmark@Sun.COM uint_t ipversion; 41411042SErik.Nordmark@Sun.COM sin_t *sin; 41511042SErik.Nordmark@Sun.COM sin6_t *sin6; 41611042SErik.Nordmark@Sun.COM ipaddr_t v4addr; 41711042SErik.Nordmark@Sun.COM in6_addr_t v6addr; 4182535Ssangeeta ire_t *ire; 4192535Ssangeeta ipid_t *ipid; 4200Sstevel@tonic-gate zoneid_t zoneid; 4213448Sdh155122 ip_stack_t *ipst; 4220Sstevel@tonic-gate 4230Sstevel@tonic-gate ASSERT(q->q_next == NULL); 42411042SErik.Nordmark@Sun.COM zoneid = IPCL_ZONEID(Q_TO_CONN(q)); 4253448Sdh155122 ipst = CONNQ_TO_IPST(q); 4260Sstevel@tonic-gate 4270Sstevel@tonic-gate /* 4280Sstevel@tonic-gate * Check privilege using the ioctl credential; if it is NULL 4290Sstevel@tonic-gate * then this is a kernel message and therefor privileged. 
4300Sstevel@tonic-gate */ 4313448Sdh155122 if (ioc_cr != NULL && secpolicy_ip_config(ioc_cr, B_FALSE) != 0) 4320Sstevel@tonic-gate return (EPERM); 4330Sstevel@tonic-gate 4340Sstevel@tonic-gate ipid = (ipid_t *)mp->b_rptr; 4350Sstevel@tonic-gate 4360Sstevel@tonic-gate addr_ucp = mi_offset_param(mp, ipid->ipid_addr_offset, 4374714Ssowmini ipid->ipid_addr_length); 4380Sstevel@tonic-gate if (addr_ucp == NULL || !OK_32PTR(addr_ucp)) 4390Sstevel@tonic-gate return (EINVAL); 4400Sstevel@tonic-gate switch (ipid->ipid_addr_length) { 44111042SErik.Nordmark@Sun.COM case sizeof (sin_t): 4420Sstevel@tonic-gate /* 4430Sstevel@tonic-gate * got complete (sockaddr) address - increment addr_ucp to point 4440Sstevel@tonic-gate * at the ip_addr field. 4450Sstevel@tonic-gate */ 4460Sstevel@tonic-gate sin = (sin_t *)addr_ucp; 4470Sstevel@tonic-gate addr_ucp = (uchar_t *)&sin->sin_addr.s_addr; 44811042SErik.Nordmark@Sun.COM ipversion = IPV4_VERSION; 4490Sstevel@tonic-gate break; 45011042SErik.Nordmark@Sun.COM case sizeof (sin6_t): 45111042SErik.Nordmark@Sun.COM /* 45211042SErik.Nordmark@Sun.COM * got complete (sockaddr) address - increment addr_ucp to point 45311042SErik.Nordmark@Sun.COM * at the ip_addr field. 45411042SErik.Nordmark@Sun.COM */ 45511042SErik.Nordmark@Sun.COM sin6 = (sin6_t *)addr_ucp; 45611042SErik.Nordmark@Sun.COM addr_ucp = (uchar_t *)&sin6->sin6_addr; 45711042SErik.Nordmark@Sun.COM ipversion = IPV6_VERSION; 45811042SErik.Nordmark@Sun.COM break; 4590Sstevel@tonic-gate default: 4600Sstevel@tonic-gate return (EINVAL); 4610Sstevel@tonic-gate } 46211042SErik.Nordmark@Sun.COM if (ipversion == IPV4_VERSION) { 46311042SErik.Nordmark@Sun.COM /* Extract the destination address. 
*/ 46411042SErik.Nordmark@Sun.COM bcopy(addr_ucp, &v4addr, IP_ADDR_LEN); 46511042SErik.Nordmark@Sun.COM 46611042SErik.Nordmark@Sun.COM ire = ire_ftable_lookup_v4(v4addr, 0, 0, 0, NULL, 46711042SErik.Nordmark@Sun.COM zoneid, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL); 46811042SErik.Nordmark@Sun.COM } else { 46911042SErik.Nordmark@Sun.COM /* Extract the destination address. */ 47011042SErik.Nordmark@Sun.COM bcopy(addr_ucp, &v6addr, IPV6_ADDR_LEN); 47111042SErik.Nordmark@Sun.COM 47211042SErik.Nordmark@Sun.COM ire = ire_ftable_lookup_v6(&v6addr, NULL, NULL, 0, NULL, 47311042SErik.Nordmark@Sun.COM zoneid, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL); 47411042SErik.Nordmark@Sun.COM } 47511042SErik.Nordmark@Sun.COM if (ire != NULL) { 47611042SErik.Nordmark@Sun.COM if (ipversion == IPV4_VERSION) { 47711042SErik.Nordmark@Sun.COM ip_rts_change(RTM_LOSING, ire->ire_addr, 47811042SErik.Nordmark@Sun.COM ire->ire_gateway_addr, ire->ire_mask, 47911042SErik.Nordmark@Sun.COM (Q_TO_CONN(q))->conn_laddr_v4, 0, 0, 0, 48011042SErik.Nordmark@Sun.COM (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA), 48111042SErik.Nordmark@Sun.COM ire->ire_ipst); 4820Sstevel@tonic-gate } 48311042SErik.Nordmark@Sun.COM (void) ire_no_good(ire); 4844714Ssowmini ire_refrele(ire); 4850Sstevel@tonic-gate } 4860Sstevel@tonic-gate return (0); 4870Sstevel@tonic-gate } 4880Sstevel@tonic-gate 4890Sstevel@tonic-gate /* 4900Sstevel@tonic-gate * Initialize the ire that is specific to IPv4 part and call 4910Sstevel@tonic-gate * ire_init_common to finish it. 49211042SErik.Nordmark@Sun.COM * Returns zero or errno. 
4930Sstevel@tonic-gate */ 49411042SErik.Nordmark@Sun.COM int 49511042SErik.Nordmark@Sun.COM ire_init_v4(ire_t *ire, uchar_t *addr, uchar_t *mask, uchar_t *gateway, 49611042SErik.Nordmark@Sun.COM ushort_t type, ill_t *ill, zoneid_t zoneid, uint_t flags, 49711042SErik.Nordmark@Sun.COM tsol_gc_t *gc, ip_stack_t *ipst) 4980Sstevel@tonic-gate { 49911042SErik.Nordmark@Sun.COM int error; 50011042SErik.Nordmark@Sun.COM 5011676Sjpk /* 5021676Sjpk * Reject IRE security attribute creation/initialization 5031676Sjpk * if system is not running in Trusted mode. 5041676Sjpk */ 50511042SErik.Nordmark@Sun.COM if (gc != NULL && !is_system_labeled()) 50611042SErik.Nordmark@Sun.COM return (EINVAL); 5071676Sjpk 5083448Sdh155122 BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_alloced); 5090Sstevel@tonic-gate 5100Sstevel@tonic-gate if (addr != NULL) 5110Sstevel@tonic-gate bcopy(addr, &ire->ire_addr, IP_ADDR_LEN); 51211042SErik.Nordmark@Sun.COM if (gateway != NULL) 5130Sstevel@tonic-gate bcopy(gateway, &ire->ire_gateway_addr, IP_ADDR_LEN); 51411042SErik.Nordmark@Sun.COM 51511042SErik.Nordmark@Sun.COM /* Make sure we don't have stray values in some fields */ 51611042SErik.Nordmark@Sun.COM switch (type) { 51711042SErik.Nordmark@Sun.COM case IRE_LOOPBACK: 51811042SErik.Nordmark@Sun.COM bcopy(&ire->ire_addr, &ire->ire_gateway_addr, IP_ADDR_LEN); 51911042SErik.Nordmark@Sun.COM /* FALLTHRU */ 52011042SErik.Nordmark@Sun.COM case IRE_HOST: 52111042SErik.Nordmark@Sun.COM case IRE_BROADCAST: 52211042SErik.Nordmark@Sun.COM case IRE_LOCAL: 52311042SErik.Nordmark@Sun.COM case IRE_IF_CLONE: 52411042SErik.Nordmark@Sun.COM ire->ire_mask = IP_HOST_MASK; 52511042SErik.Nordmark@Sun.COM ire->ire_masklen = IPV4_ABITS; 52611042SErik.Nordmark@Sun.COM break; 52711042SErik.Nordmark@Sun.COM case IRE_PREFIX: 52811042SErik.Nordmark@Sun.COM case IRE_DEFAULT: 52911042SErik.Nordmark@Sun.COM case IRE_IF_RESOLVER: 53011042SErik.Nordmark@Sun.COM case IRE_IF_NORESOLVER: 53111042SErik.Nordmark@Sun.COM if (mask != NULL) { 
53211042SErik.Nordmark@Sun.COM bcopy(mask, &ire->ire_mask, IP_ADDR_LEN); 53311042SErik.Nordmark@Sun.COM ire->ire_masklen = ip_mask_to_plen(ire->ire_mask); 53411042SErik.Nordmark@Sun.COM } 53511042SErik.Nordmark@Sun.COM break; 53611042SErik.Nordmark@Sun.COM case IRE_MULTICAST: 53711042SErik.Nordmark@Sun.COM case IRE_NOROUTE: 53811042SErik.Nordmark@Sun.COM ASSERT(mask == NULL); 53911042SErik.Nordmark@Sun.COM break; 54011042SErik.Nordmark@Sun.COM default: 54111042SErik.Nordmark@Sun.COM ASSERT(0); 54211042SErik.Nordmark@Sun.COM return (EINVAL); 5430Sstevel@tonic-gate } 5440Sstevel@tonic-gate 54511042SErik.Nordmark@Sun.COM error = ire_init_common(ire, type, ill, zoneid, flags, IPV4_VERSION, 54611042SErik.Nordmark@Sun.COM gc, ipst); 54711042SErik.Nordmark@Sun.COM if (error != NULL) 54811042SErik.Nordmark@Sun.COM return (error); 54911042SErik.Nordmark@Sun.COM 55011042SErik.Nordmark@Sun.COM /* Determine which function pointers to use */ 55111042SErik.Nordmark@Sun.COM ire->ire_postfragfn = ip_xmit; /* Common case */ 55211042SErik.Nordmark@Sun.COM 55311042SErik.Nordmark@Sun.COM switch (ire->ire_type) { 55411042SErik.Nordmark@Sun.COM case IRE_LOCAL: 55511042SErik.Nordmark@Sun.COM ire->ire_sendfn = ire_send_local_v4; 55611042SErik.Nordmark@Sun.COM ire->ire_recvfn = ire_recv_local_v4; 55711042SErik.Nordmark@Sun.COM ASSERT(ire->ire_ill != NULL); 55811076SCathy.Zhou@Sun.COM if (ire->ire_ill->ill_flags & ILLF_NOACCEPT) 55911042SErik.Nordmark@Sun.COM ire->ire_recvfn = ire_recv_noaccept_v6; 56011042SErik.Nordmark@Sun.COM break; 56111042SErik.Nordmark@Sun.COM case IRE_LOOPBACK: 56211042SErik.Nordmark@Sun.COM ire->ire_sendfn = ire_send_local_v4; 56311042SErik.Nordmark@Sun.COM ire->ire_recvfn = ire_recv_loopback_v4; 56411042SErik.Nordmark@Sun.COM break; 56511042SErik.Nordmark@Sun.COM case IRE_BROADCAST: 56611042SErik.Nordmark@Sun.COM ire->ire_postfragfn = ip_postfrag_loopcheck; 56711042SErik.Nordmark@Sun.COM ire->ire_sendfn = ire_send_broadcast_v4; 56811042SErik.Nordmark@Sun.COM 
ire->ire_recvfn = ire_recv_broadcast_v4; 56911042SErik.Nordmark@Sun.COM break; 57011042SErik.Nordmark@Sun.COM case IRE_MULTICAST: 57111042SErik.Nordmark@Sun.COM ire->ire_postfragfn = ip_postfrag_loopcheck; 57211042SErik.Nordmark@Sun.COM ire->ire_sendfn = ire_send_multicast_v4; 57311042SErik.Nordmark@Sun.COM ire->ire_recvfn = ire_recv_multicast_v4; 57411042SErik.Nordmark@Sun.COM break; 57511042SErik.Nordmark@Sun.COM default: 57611042SErik.Nordmark@Sun.COM /* 57711042SErik.Nordmark@Sun.COM * For IRE_IF_ALL and IRE_OFFLINK we forward received 57811042SErik.Nordmark@Sun.COM * packets by default. 57911042SErik.Nordmark@Sun.COM */ 58011042SErik.Nordmark@Sun.COM ire->ire_sendfn = ire_send_wire_v4; 58111042SErik.Nordmark@Sun.COM ire->ire_recvfn = ire_recv_forward_v4; 58211042SErik.Nordmark@Sun.COM break; 58311042SErik.Nordmark@Sun.COM } 58411042SErik.Nordmark@Sun.COM if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 58511042SErik.Nordmark@Sun.COM ire->ire_sendfn = ire_send_noroute_v4; 58611042SErik.Nordmark@Sun.COM ire->ire_recvfn = ire_recv_noroute_v4; 58711042SErik.Nordmark@Sun.COM } else if (ire->ire_flags & RTF_MULTIRT) { 58811042SErik.Nordmark@Sun.COM ire->ire_postfragfn = ip_postfrag_multirt_v4; 58911042SErik.Nordmark@Sun.COM ire->ire_sendfn = ire_send_multirt_v4; 59011042SErik.Nordmark@Sun.COM /* Multirt receive of broadcast uses ire_recv_broadcast_v4 */ 59111042SErik.Nordmark@Sun.COM if (ire->ire_type != IRE_BROADCAST) 59211042SErik.Nordmark@Sun.COM ire->ire_recvfn = ire_recv_multirt_v4; 59311042SErik.Nordmark@Sun.COM } 59411042SErik.Nordmark@Sun.COM ire->ire_nce_capable = ire_determine_nce_capable(ire); 59511042SErik.Nordmark@Sun.COM return (0); 5960Sstevel@tonic-gate } 5970Sstevel@tonic-gate 5980Sstevel@tonic-gate /* 59911042SErik.Nordmark@Sun.COM * Determine ire_nce_capable 6000Sstevel@tonic-gate */ 60111042SErik.Nordmark@Sun.COM boolean_t 60211042SErik.Nordmark@Sun.COM ire_determine_nce_capable(ire_t *ire) 6030Sstevel@tonic-gate { 
60411042SErik.Nordmark@Sun.COM int max_masklen; 60511042SErik.Nordmark@Sun.COM 60611042SErik.Nordmark@Sun.COM if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || 60711042SErik.Nordmark@Sun.COM (ire->ire_type & IRE_MULTICAST)) 60811042SErik.Nordmark@Sun.COM return (B_TRUE); 60911042SErik.Nordmark@Sun.COM 61011042SErik.Nordmark@Sun.COM if (ire->ire_ipversion == IPV4_VERSION) 61111042SErik.Nordmark@Sun.COM max_masklen = IPV4_ABITS; 61211042SErik.Nordmark@Sun.COM else 61311042SErik.Nordmark@Sun.COM max_masklen = IPV6_ABITS; 61411042SErik.Nordmark@Sun.COM 61511042SErik.Nordmark@Sun.COM if ((ire->ire_type & IRE_ONLINK) && ire->ire_masklen == max_masklen) 61611042SErik.Nordmark@Sun.COM return (B_TRUE); 61711042SErik.Nordmark@Sun.COM return (B_FALSE); 6180Sstevel@tonic-gate } 6190Sstevel@tonic-gate 6200Sstevel@tonic-gate /* 6210Sstevel@tonic-gate * ire_create is called to allocate and initialize a new IRE. 6220Sstevel@tonic-gate * 6230Sstevel@tonic-gate * NOTE : This is called as writer sometimes though not required 6240Sstevel@tonic-gate * by this function. 
6250Sstevel@tonic-gate */ 6260Sstevel@tonic-gate ire_t * 62711042SErik.Nordmark@Sun.COM ire_create(uchar_t *addr, uchar_t *mask, uchar_t *gateway, 62811042SErik.Nordmark@Sun.COM ushort_t type, ill_t *ill, zoneid_t zoneid, uint_t flags, tsol_gc_t *gc, 62911042SErik.Nordmark@Sun.COM ip_stack_t *ipst) 6300Sstevel@tonic-gate { 6310Sstevel@tonic-gate ire_t *ire; 63211042SErik.Nordmark@Sun.COM int error; 6330Sstevel@tonic-gate 6340Sstevel@tonic-gate ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); 6350Sstevel@tonic-gate if (ire == NULL) { 63611042SErik.Nordmark@Sun.COM DTRACE_PROBE(kmem__cache__alloc); 6370Sstevel@tonic-gate return (NULL); 6380Sstevel@tonic-gate } 6390Sstevel@tonic-gate *ire = ire_null; 6400Sstevel@tonic-gate 64111042SErik.Nordmark@Sun.COM error = ire_init_v4(ire, addr, mask, gateway, type, ill, zoneid, flags, 64211042SErik.Nordmark@Sun.COM gc, ipst); 64311042SErik.Nordmark@Sun.COM if (error != 0) { 64411042SErik.Nordmark@Sun.COM DTRACE_PROBE2(ire__init, ire_t *, ire, int, error); 6450Sstevel@tonic-gate kmem_cache_free(ire_cache, ire); 6460Sstevel@tonic-gate return (NULL); 6470Sstevel@tonic-gate } 6480Sstevel@tonic-gate return (ire); 6490Sstevel@tonic-gate } 6500Sstevel@tonic-gate 6510Sstevel@tonic-gate /* 6520Sstevel@tonic-gate * Common to IPv4 and IPv6 65311042SErik.Nordmark@Sun.COM * Returns zero or errno. 
6540Sstevel@tonic-gate */ 65511042SErik.Nordmark@Sun.COM int 65611042SErik.Nordmark@Sun.COM ire_init_common(ire_t *ire, ushort_t type, ill_t *ill, zoneid_t zoneid, 65711042SErik.Nordmark@Sun.COM uint_t flags, uchar_t ipversion, tsol_gc_t *gc, ip_stack_t *ipst) 6580Sstevel@tonic-gate { 65911042SErik.Nordmark@Sun.COM int error; 6600Sstevel@tonic-gate 6611676Sjpk #ifdef DEBUG 66211042SErik.Nordmark@Sun.COM if (ill != NULL) { 66311042SErik.Nordmark@Sun.COM if (ill->ill_isv6) 6640Sstevel@tonic-gate ASSERT(ipversion == IPV6_VERSION); 6650Sstevel@tonic-gate else 6660Sstevel@tonic-gate ASSERT(ipversion == IPV4_VERSION); 6670Sstevel@tonic-gate } 6681676Sjpk #endif /* DEBUG */ 6691676Sjpk 6701676Sjpk /* 6711676Sjpk * Create/initialize IRE security attribute only in Trusted mode; 67211042SErik.Nordmark@Sun.COM * if the passed in gc is non-NULL, we expect that the caller 6731676Sjpk * has held a reference to it and will release it when this routine 6741676Sjpk * returns a failure, otherwise we own the reference. We do this 6751676Sjpk * prior to initializing the rest IRE fields. 
6761676Sjpk */ 6771676Sjpk if (is_system_labeled()) { 6781676Sjpk if ((type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST | 67911042SErik.Nordmark@Sun.COM IRE_IF_ALL | IRE_MULTICAST | IRE_NOROUTE)) != 0) { 6801676Sjpk /* release references on behalf of caller */ 6811676Sjpk if (gc != NULL) 6821676Sjpk GC_REFRELE(gc); 68311042SErik.Nordmark@Sun.COM } else { 68411042SErik.Nordmark@Sun.COM error = tsol_ire_init_gwattr(ire, ipversion, gc); 68511042SErik.Nordmark@Sun.COM if (error != 0) 68611042SErik.Nordmark@Sun.COM return (error); 6871676Sjpk } 6881676Sjpk } 6890Sstevel@tonic-gate 6900Sstevel@tonic-gate ire->ire_type = type; 6910Sstevel@tonic-gate ire->ire_flags = RTF_UP | flags; 6920Sstevel@tonic-gate ire->ire_create_time = (uint32_t)gethrestime_sec(); 69311042SErik.Nordmark@Sun.COM ire->ire_generation = IRE_GENERATION_INITIAL; 6940Sstevel@tonic-gate 6950Sstevel@tonic-gate /* 69611042SErik.Nordmark@Sun.COM * The ill_ire_cnt isn't increased until 69711042SErik.Nordmark@Sun.COM * the IRE is added to ensure that a walker will find 69811042SErik.Nordmark@Sun.COM * all IREs that hold a reference on an ill. 6990Sstevel@tonic-gate * 70011042SErik.Nordmark@Sun.COM * Note that ill_ire_multicast doesn't hold a ref on the ill since 70111042SErik.Nordmark@Sun.COM * ire_add() is not called for the IRE_MULTICAST. 
7020Sstevel@tonic-gate */ 70311042SErik.Nordmark@Sun.COM ire->ire_ill = ill; 70411042SErik.Nordmark@Sun.COM ire->ire_zoneid = zoneid; 7050Sstevel@tonic-gate ire->ire_ipversion = ipversion; 70611042SErik.Nordmark@Sun.COM 7072535Ssangeeta mutex_init(&ire->ire_lock, NULL, MUTEX_DEFAULT, NULL); 7080Sstevel@tonic-gate ire->ire_refcnt = 1; 70911042SErik.Nordmark@Sun.COM ire->ire_identical_ref = 1; /* Number of ire_delete's needed */ 7103448Sdh155122 ire->ire_ipst = ipst; /* No netstack_hold */ 7115023Scarlsonj ire->ire_trace_disable = B_FALSE; 7121676Sjpk 71311042SErik.Nordmark@Sun.COM return (0); 7140Sstevel@tonic-gate } 7150Sstevel@tonic-gate 7160Sstevel@tonic-gate /* 71711042SErik.Nordmark@Sun.COM * This creates an IRE_BROADCAST based on the arguments. 71811042SErik.Nordmark@Sun.COM * A mirror is ire_lookup_bcast(). 7190Sstevel@tonic-gate * 72011042SErik.Nordmark@Sun.COM * Any supression of unneeded ones is done in ire_add_v4. 72111042SErik.Nordmark@Sun.COM * We add one IRE_BROADCAST per address. ire_send_broadcast_v4() 72211042SErik.Nordmark@Sun.COM * takes care of generating a loopback copy of the packet. 
7230Sstevel@tonic-gate */ 7240Sstevel@tonic-gate ire_t ** 72511042SErik.Nordmark@Sun.COM ire_create_bcast(ill_t *ill, ipaddr_t addr, zoneid_t zoneid, ire_t **irep) 7260Sstevel@tonic-gate { 72711042SErik.Nordmark@Sun.COM ip_stack_t *ipst = ill->ill_ipst; 72811042SErik.Nordmark@Sun.COM 72911042SErik.Nordmark@Sun.COM ASSERT(IAM_WRITER_ILL(ill)); 7303448Sdh155122 7310Sstevel@tonic-gate *irep++ = ire_create( 7320Sstevel@tonic-gate (uchar_t *)&addr, /* dest addr */ 7330Sstevel@tonic-gate (uchar_t *)&ip_g_all_ones, /* mask */ 7340Sstevel@tonic-gate NULL, /* no gateway */ 7350Sstevel@tonic-gate IRE_BROADCAST, 73611042SErik.Nordmark@Sun.COM ill, 73711042SErik.Nordmark@Sun.COM zoneid, 73811042SErik.Nordmark@Sun.COM RTF_KERNEL, 7394714Ssowmini NULL, 7404714Ssowmini ipst); 7410Sstevel@tonic-gate 7420Sstevel@tonic-gate return (irep); 7430Sstevel@tonic-gate } 7440Sstevel@tonic-gate 7450Sstevel@tonic-gate /* 74611042SErik.Nordmark@Sun.COM * This looks up an IRE_BROADCAST based on the arguments. 74711042SErik.Nordmark@Sun.COM * Mirrors ire_create_bcast(). 
7480Sstevel@tonic-gate */ 7490Sstevel@tonic-gate ire_t * 75011042SErik.Nordmark@Sun.COM ire_lookup_bcast(ill_t *ill, ipaddr_t addr, zoneid_t zoneid) 7510Sstevel@tonic-gate { 75211042SErik.Nordmark@Sun.COM ire_t *ire; 75311042SErik.Nordmark@Sun.COM int match_args; 75411042SErik.Nordmark@Sun.COM 75511042SErik.Nordmark@Sun.COM match_args = MATCH_IRE_TYPE | MATCH_IRE_ILL | MATCH_IRE_GW | 75611042SErik.Nordmark@Sun.COM MATCH_IRE_MASK | MATCH_IRE_ZONEONLY; 75711042SErik.Nordmark@Sun.COM 75811042SErik.Nordmark@Sun.COM if (IS_UNDER_IPMP(ill)) 75911042SErik.Nordmark@Sun.COM match_args |= MATCH_IRE_TESTHIDDEN; 76011042SErik.Nordmark@Sun.COM 76111042SErik.Nordmark@Sun.COM ire = ire_ftable_lookup_v4( 76211042SErik.Nordmark@Sun.COM addr, /* dest addr */ 76311042SErik.Nordmark@Sun.COM ip_g_all_ones, /* mask */ 76411042SErik.Nordmark@Sun.COM 0, /* no gateway */ 76511042SErik.Nordmark@Sun.COM IRE_BROADCAST, 76611042SErik.Nordmark@Sun.COM ill, 76711042SErik.Nordmark@Sun.COM zoneid, 76811042SErik.Nordmark@Sun.COM NULL, 76911042SErik.Nordmark@Sun.COM match_args, 77011042SErik.Nordmark@Sun.COM 0, 77111042SErik.Nordmark@Sun.COM ill->ill_ipst, 77211042SErik.Nordmark@Sun.COM NULL); 77311042SErik.Nordmark@Sun.COM return (ire); 7740Sstevel@tonic-gate } 7750Sstevel@tonic-gate 7760Sstevel@tonic-gate /* Arrange to call the specified function for every IRE in the world. 
*/ 7770Sstevel@tonic-gate void 7783448Sdh155122 ire_walk(pfv_t func, void *arg, ip_stack_t *ipst) 7790Sstevel@tonic-gate { 7803448Sdh155122 ire_walk_ipvers(func, arg, 0, ALL_ZONES, ipst); 7810Sstevel@tonic-gate } 7820Sstevel@tonic-gate 7830Sstevel@tonic-gate void 7843448Sdh155122 ire_walk_v4(pfv_t func, void *arg, zoneid_t zoneid, ip_stack_t *ipst) 7850Sstevel@tonic-gate { 7863448Sdh155122 ire_walk_ipvers(func, arg, IPV4_VERSION, zoneid, ipst); 7870Sstevel@tonic-gate } 7880Sstevel@tonic-gate 7890Sstevel@tonic-gate void 7903448Sdh155122 ire_walk_v6(pfv_t func, void *arg, zoneid_t zoneid, ip_stack_t *ipst) 7910Sstevel@tonic-gate { 7923448Sdh155122 ire_walk_ipvers(func, arg, IPV6_VERSION, zoneid, ipst); 7930Sstevel@tonic-gate } 7940Sstevel@tonic-gate 7950Sstevel@tonic-gate /* 7960Sstevel@tonic-gate * Walk a particular version. version == 0 means both v4 and v6. 7970Sstevel@tonic-gate */ 7980Sstevel@tonic-gate static void 7993448Sdh155122 ire_walk_ipvers(pfv_t func, void *arg, uchar_t vers, zoneid_t zoneid, 8003448Sdh155122 ip_stack_t *ipst) 8010Sstevel@tonic-gate { 8020Sstevel@tonic-gate if (vers != IPV6_VERSION) { 8032535Ssangeeta /* 8042535Ssangeeta * ip_forwarding_table variable doesn't matter for IPv4 since 8053448Sdh155122 * ire_walk_ill_tables uses ips_ip_ftable for IPv4. 8062535Ssangeeta */ 8070Sstevel@tonic-gate ire_walk_ill_tables(0, 0, func, arg, IP_MASK_TABLE_SIZE, 8082535Ssangeeta 0, NULL, 8093448Sdh155122 NULL, zoneid, ipst); 8100Sstevel@tonic-gate } 8110Sstevel@tonic-gate if (vers != IPV4_VERSION) { 8120Sstevel@tonic-gate ire_walk_ill_tables(0, 0, func, arg, IP6_MASK_TABLE_SIZE, 8133448Sdh155122 ipst->ips_ip6_ftable_hash_size, 8143448Sdh155122 ipst->ips_ip_forwarding_table_v6, 81511042SErik.Nordmark@Sun.COM NULL, zoneid, ipst); 8160Sstevel@tonic-gate } 8170Sstevel@tonic-gate } 8180Sstevel@tonic-gate 8190Sstevel@tonic-gate /* 8207216Smeem * Arrange to call the specified function for every IRE that matches the ill. 
8210Sstevel@tonic-gate */ 8220Sstevel@tonic-gate void 8231676Sjpk ire_walk_ill(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg, 8240Sstevel@tonic-gate ill_t *ill) 8250Sstevel@tonic-gate { 8267216Smeem uchar_t vers = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION); 8277216Smeem 8287216Smeem ire_walk_ill_ipvers(match_flags, ire_type, func, arg, vers, ill); 8290Sstevel@tonic-gate } 8300Sstevel@tonic-gate 8310Sstevel@tonic-gate /* 8327216Smeem * Walk a particular ill and version. 8330Sstevel@tonic-gate */ 8340Sstevel@tonic-gate static void 8350Sstevel@tonic-gate ire_walk_ill_ipvers(uint_t match_flags, uint_t ire_type, pfv_t func, 8361676Sjpk void *arg, uchar_t vers, ill_t *ill) 8370Sstevel@tonic-gate { 8383448Sdh155122 ip_stack_t *ipst = ill->ill_ipst; 8393448Sdh155122 8407216Smeem if (vers == IPV4_VERSION) { 8410Sstevel@tonic-gate ire_walk_ill_tables(match_flags, ire_type, func, arg, 84211042SErik.Nordmark@Sun.COM IP_MASK_TABLE_SIZE, 84311042SErik.Nordmark@Sun.COM 0, NULL, 84411042SErik.Nordmark@Sun.COM ill, ALL_ZONES, ipst); 84511042SErik.Nordmark@Sun.COM } 84611042SErik.Nordmark@Sun.COM if (vers != IPV4_VERSION) { 8470Sstevel@tonic-gate ire_walk_ill_tables(match_flags, ire_type, func, arg, 8483448Sdh155122 IP6_MASK_TABLE_SIZE, ipst->ips_ip6_ftable_hash_size, 8493448Sdh155122 ipst->ips_ip_forwarding_table_v6, 85011042SErik.Nordmark@Sun.COM ill, ALL_ZONES, ipst); 8510Sstevel@tonic-gate } 8520Sstevel@tonic-gate } 8530Sstevel@tonic-gate 85411042SErik.Nordmark@Sun.COM /* 85511042SErik.Nordmark@Sun.COM * Do the specific matching of IREs to shared-IP zones. 85611042SErik.Nordmark@Sun.COM * 85711042SErik.Nordmark@Sun.COM * We have the same logic as in ire_match_args but implemented slightly 85811042SErik.Nordmark@Sun.COM * differently. 
85911042SErik.Nordmark@Sun.COM */ 8602535Ssangeeta boolean_t 8610Sstevel@tonic-gate ire_walk_ill_match(uint_t match_flags, uint_t ire_type, ire_t *ire, 8623448Sdh155122 ill_t *ill, zoneid_t zoneid, ip_stack_t *ipst) 8630Sstevel@tonic-gate { 86411131SErik.Nordmark@Sun.COM ill_t *dst_ill = ire->ire_ill; 8650Sstevel@tonic-gate 8660Sstevel@tonic-gate ASSERT(match_flags != 0 || zoneid != ALL_ZONES); 8670Sstevel@tonic-gate 86811042SErik.Nordmark@Sun.COM if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid && 86911042SErik.Nordmark@Sun.COM ire->ire_zoneid != ALL_ZONES) { 8700Sstevel@tonic-gate /* 8710Sstevel@tonic-gate * We're walking the IREs for a specific zone. The only relevant 8720Sstevel@tonic-gate * IREs are: 8730Sstevel@tonic-gate * - all IREs with a matching ire_zoneid 87411042SErik.Nordmark@Sun.COM * - IRE_IF_ALL IREs for interfaces with a usable source addr 8750Sstevel@tonic-gate * with a matching zone 87611042SErik.Nordmark@Sun.COM * - IRE_OFFLINK with a gateway reachable from the zone 87711042SErik.Nordmark@Sun.COM * Note that ealier we only did the IRE_OFFLINK check for 87811042SErik.Nordmark@Sun.COM * IRE_DEFAULT (and only when we had multiple IRE_DEFAULTs). 8790Sstevel@tonic-gate */ 88011042SErik.Nordmark@Sun.COM if (ire->ire_type & IRE_ONLINK) { 88111042SErik.Nordmark@Sun.COM uint_t ifindex; 88211042SErik.Nordmark@Sun.COM 8830Sstevel@tonic-gate /* 88411042SErik.Nordmark@Sun.COM * Note there is no IRE_INTERFACE on vniN thus 88511042SErik.Nordmark@Sun.COM * can't do an IRE lookup for a matching route. 
8860Sstevel@tonic-gate */ 88711042SErik.Nordmark@Sun.COM ifindex = dst_ill->ill_usesrc_ifindex; 88811042SErik.Nordmark@Sun.COM if (ifindex == 0) 88911042SErik.Nordmark@Sun.COM return (B_FALSE); 89011042SErik.Nordmark@Sun.COM 89111042SErik.Nordmark@Sun.COM /* 89211042SErik.Nordmark@Sun.COM * If there is a usable source address in the 89311042SErik.Nordmark@Sun.COM * zone, then it's ok to return an 89411042SErik.Nordmark@Sun.COM * IRE_INTERFACE 89511042SErik.Nordmark@Sun.COM */ 89611042SErik.Nordmark@Sun.COM if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6, 89711042SErik.Nordmark@Sun.COM zoneid, ipst)) { 89811042SErik.Nordmark@Sun.COM return (B_FALSE); 89911042SErik.Nordmark@Sun.COM } 90011042SErik.Nordmark@Sun.COM } 90111042SErik.Nordmark@Sun.COM if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) { 90211042SErik.Nordmark@Sun.COM ipif_t *tipif; 90311042SErik.Nordmark@Sun.COM 90411042SErik.Nordmark@Sun.COM mutex_enter(&dst_ill->ill_lock); 90511042SErik.Nordmark@Sun.COM for (tipif = dst_ill->ill_ipif; 90611042SErik.Nordmark@Sun.COM tipif != NULL; tipif = tipif->ipif_next) { 90711042SErik.Nordmark@Sun.COM if (!IPIF_IS_CONDEMNED(tipif) && 90811042SErik.Nordmark@Sun.COM (tipif->ipif_flags & IPIF_UP) && 90911042SErik.Nordmark@Sun.COM (tipif->ipif_zoneid == zoneid || 91011042SErik.Nordmark@Sun.COM tipif->ipif_zoneid == ALL_ZONES)) 91111042SErik.Nordmark@Sun.COM break; 91211042SErik.Nordmark@Sun.COM } 91311042SErik.Nordmark@Sun.COM mutex_exit(&dst_ill->ill_lock); 91411042SErik.Nordmark@Sun.COM if (tipif == NULL) { 9150Sstevel@tonic-gate return (B_FALSE); 9160Sstevel@tonic-gate } 9170Sstevel@tonic-gate } 91811131SErik.Nordmark@Sun.COM } 91911131SErik.Nordmark@Sun.COM /* 92011131SErik.Nordmark@Sun.COM * Except for ALL_ZONES, we only match the offlink routes 92111131SErik.Nordmark@Sun.COM * where ire_gateway_addr has an IRE_INTERFACE for the zoneid. 
922*11457SErik.Nordmark@Sun.COM * Since we can have leftover routes after the IP addresses have 923*11457SErik.Nordmark@Sun.COM * changed, the global zone will also match offlink routes where the 924*11457SErik.Nordmark@Sun.COM * gateway is unreachable from any zone. 92511131SErik.Nordmark@Sun.COM */ 92611131SErik.Nordmark@Sun.COM if ((ire->ire_type & IRE_OFFLINK) && zoneid != ALL_ZONES) { 92711131SErik.Nordmark@Sun.COM in6_addr_t gw_addr_v6; 928*11457SErik.Nordmark@Sun.COM boolean_t reach; 92911131SErik.Nordmark@Sun.COM 93011131SErik.Nordmark@Sun.COM if (ire->ire_ipversion == IPV4_VERSION) { 931*11457SErik.Nordmark@Sun.COM reach = ire_gateway_ok_zone_v4(ire->ire_gateway_addr, 932*11457SErik.Nordmark@Sun.COM zoneid, dst_ill, NULL, ipst, B_FALSE); 93311131SErik.Nordmark@Sun.COM } else { 93411131SErik.Nordmark@Sun.COM ASSERT(ire->ire_ipversion == IPV6_VERSION); 93511131SErik.Nordmark@Sun.COM mutex_enter(&ire->ire_lock); 93611131SErik.Nordmark@Sun.COM gw_addr_v6 = ire->ire_gateway_addr_v6; 93711131SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 93811131SErik.Nordmark@Sun.COM 939*11457SErik.Nordmark@Sun.COM reach = ire_gateway_ok_zone_v6(&gw_addr_v6, zoneid, 940*11457SErik.Nordmark@Sun.COM dst_ill, NULL, ipst, B_FALSE); 941*11457SErik.Nordmark@Sun.COM } 942*11457SErik.Nordmark@Sun.COM if (!reach) { 943*11457SErik.Nordmark@Sun.COM if (zoneid != GLOBAL_ZONEID) 94411131SErik.Nordmark@Sun.COM return (B_FALSE); 945*11457SErik.Nordmark@Sun.COM 946*11457SErik.Nordmark@Sun.COM /* 947*11457SErik.Nordmark@Sun.COM * Check if ALL_ZONES reachable - if not then let the 948*11457SErik.Nordmark@Sun.COM * global zone see it. 
949*11457SErik.Nordmark@Sun.COM */ 950*11457SErik.Nordmark@Sun.COM if (ire->ire_ipversion == IPV4_VERSION) { 951*11457SErik.Nordmark@Sun.COM reach = ire_gateway_ok_zone_v4( 952*11457SErik.Nordmark@Sun.COM ire->ire_gateway_addr, ALL_ZONES, 953*11457SErik.Nordmark@Sun.COM dst_ill, NULL, ipst, B_FALSE); 954*11457SErik.Nordmark@Sun.COM } else { 955*11457SErik.Nordmark@Sun.COM reach = ire_gateway_ok_zone_v6(&gw_addr_v6, 956*11457SErik.Nordmark@Sun.COM ALL_ZONES, dst_ill, NULL, ipst, B_FALSE); 957*11457SErik.Nordmark@Sun.COM } 958*11457SErik.Nordmark@Sun.COM if (reach) { 959*11457SErik.Nordmark@Sun.COM /* 960*11457SErik.Nordmark@Sun.COM * Some other zone could see it, hence hide it 961*11457SErik.Nordmark@Sun.COM * in the global zone. 962*11457SErik.Nordmark@Sun.COM */ 963*11457SErik.Nordmark@Sun.COM return (B_FALSE); 964*11457SErik.Nordmark@Sun.COM } 9650Sstevel@tonic-gate } 9660Sstevel@tonic-gate } 9670Sstevel@tonic-gate 9680Sstevel@tonic-gate if (((!(match_flags & MATCH_IRE_TYPE)) || 9694714Ssowmini (ire->ire_type & ire_type)) && 9700Sstevel@tonic-gate ((!(match_flags & MATCH_IRE_ILL)) || 97111042SErik.Nordmark@Sun.COM (dst_ill == ill || 97211042SErik.Nordmark@Sun.COM dst_ill != NULL && IS_IN_SAME_ILLGRP(dst_ill, ill)))) { 9730Sstevel@tonic-gate return (B_TRUE); 9740Sstevel@tonic-gate } 9750Sstevel@tonic-gate return (B_FALSE); 9760Sstevel@tonic-gate } 9770Sstevel@tonic-gate 9782535Ssangeeta int 9792535Ssangeeta rtfunc(struct radix_node *rn, void *arg) 9802535Ssangeeta { 9812535Ssangeeta struct rtfuncarg *rtf = arg; 9822535Ssangeeta struct rt_entry *rt; 9832535Ssangeeta irb_t *irb; 9842535Ssangeeta ire_t *ire; 9852535Ssangeeta boolean_t ret; 9862535Ssangeeta 9872535Ssangeeta rt = (struct rt_entry *)rn; 9882535Ssangeeta ASSERT(rt != NULL); 9892535Ssangeeta irb = &rt->rt_irb; 9902535Ssangeeta for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { 9912535Ssangeeta if ((rtf->rt_match_flags != 0) || 9922535Ssangeeta (rtf->rt_zoneid != ALL_ZONES)) { 9932535Ssangeeta 
ret = ire_walk_ill_match(rtf->rt_match_flags, 9942535Ssangeeta rtf->rt_ire_type, ire, 9953448Sdh155122 rtf->rt_ill, rtf->rt_zoneid, rtf->rt_ipst); 99611042SErik.Nordmark@Sun.COM } else { 9972535Ssangeeta ret = B_TRUE; 99811042SErik.Nordmark@Sun.COM } 9992535Ssangeeta if (ret) 10002535Ssangeeta (*rtf->rt_func)(ire, rtf->rt_arg); 10012535Ssangeeta } 10022535Ssangeeta return (0); 10032535Ssangeeta } 10042535Ssangeeta 10050Sstevel@tonic-gate /* 100611042SErik.Nordmark@Sun.COM * Walk the ftable entries that match the ill. 10070Sstevel@tonic-gate */ 10082535Ssangeeta void 10090Sstevel@tonic-gate ire_walk_ill_tables(uint_t match_flags, uint_t ire_type, pfv_t func, 10101676Sjpk void *arg, size_t ftbl_sz, size_t htbl_sz, irb_t **ipftbl, 101111042SErik.Nordmark@Sun.COM ill_t *ill, zoneid_t zoneid, 10123448Sdh155122 ip_stack_t *ipst) 10130Sstevel@tonic-gate { 10140Sstevel@tonic-gate irb_t *irb_ptr; 10150Sstevel@tonic-gate irb_t *irb; 10160Sstevel@tonic-gate ire_t *ire; 10170Sstevel@tonic-gate int i, j; 10180Sstevel@tonic-gate boolean_t ret; 10192535Ssangeeta struct rtfuncarg rtfarg; 10200Sstevel@tonic-gate 10218485SPeter.Memishian@Sun.COM ASSERT((!(match_flags & MATCH_IRE_ILL)) || (ill != NULL)); 10220Sstevel@tonic-gate ASSERT(!(match_flags & MATCH_IRE_TYPE) || (ire_type != 0)); 102311042SErik.Nordmark@Sun.COM 102411042SErik.Nordmark@Sun.COM /* knobs such that routine is called only for v6 case */ 102511042SErik.Nordmark@Sun.COM if (ipftbl == ipst->ips_ip_forwarding_table_v6) { 102611042SErik.Nordmark@Sun.COM for (i = (ftbl_sz - 1); i >= 0; i--) { 102711042SErik.Nordmark@Sun.COM if ((irb_ptr = ipftbl[i]) == NULL) 102811042SErik.Nordmark@Sun.COM continue; 102911042SErik.Nordmark@Sun.COM for (j = 0; j < htbl_sz; j++) { 103011042SErik.Nordmark@Sun.COM irb = &irb_ptr[j]; 103111042SErik.Nordmark@Sun.COM if (irb->irb_ire == NULL) 10320Sstevel@tonic-gate continue; 103311042SErik.Nordmark@Sun.COM 103411042SErik.Nordmark@Sun.COM irb_refhold(irb); 103511042SErik.Nordmark@Sun.COM for 
(ire = irb->irb_ire; ire != NULL; 103611042SErik.Nordmark@Sun.COM ire = ire->ire_next) { 103711042SErik.Nordmark@Sun.COM if (match_flags == 0 && 103811042SErik.Nordmark@Sun.COM zoneid == ALL_ZONES) { 103911042SErik.Nordmark@Sun.COM ret = B_TRUE; 104011042SErik.Nordmark@Sun.COM } else { 104111042SErik.Nordmark@Sun.COM ret = 104211042SErik.Nordmark@Sun.COM ire_walk_ill_match( 104311042SErik.Nordmark@Sun.COM match_flags, 104411042SErik.Nordmark@Sun.COM ire_type, ire, ill, 104511042SErik.Nordmark@Sun.COM zoneid, ipst); 10460Sstevel@tonic-gate } 104711042SErik.Nordmark@Sun.COM if (ret) 104811042SErik.Nordmark@Sun.COM (*func)(ire, arg); 10490Sstevel@tonic-gate } 105011042SErik.Nordmark@Sun.COM irb_refrele(irb); 10510Sstevel@tonic-gate } 105211042SErik.Nordmark@Sun.COM } 105311042SErik.Nordmark@Sun.COM } else { 105411131SErik.Nordmark@Sun.COM bzero(&rtfarg, sizeof (rtfarg)); 105511042SErik.Nordmark@Sun.COM rtfarg.rt_func = func; 105611042SErik.Nordmark@Sun.COM rtfarg.rt_arg = arg; 105711042SErik.Nordmark@Sun.COM if (match_flags != 0) { 105811042SErik.Nordmark@Sun.COM rtfarg.rt_match_flags = match_flags; 10590Sstevel@tonic-gate } 106011042SErik.Nordmark@Sun.COM rtfarg.rt_ire_type = ire_type; 106111042SErik.Nordmark@Sun.COM rtfarg.rt_ill = ill; 106211042SErik.Nordmark@Sun.COM rtfarg.rt_zoneid = zoneid; 106311042SErik.Nordmark@Sun.COM rtfarg.rt_ipst = ipst; /* No netstack_hold */ 106411042SErik.Nordmark@Sun.COM (void) ipst->ips_ip_ftable->rnh_walktree_mt( 106511042SErik.Nordmark@Sun.COM ipst->ips_ip_ftable, 106611042SErik.Nordmark@Sun.COM rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn); 10670Sstevel@tonic-gate } 10680Sstevel@tonic-gate } 10690Sstevel@tonic-gate 10700Sstevel@tonic-gate /* 10710Sstevel@tonic-gate * This function takes a mask and returns 10720Sstevel@tonic-gate * number of bits set in the mask. If no 10730Sstevel@tonic-gate * bit is set it returns 0. 10740Sstevel@tonic-gate * Assumes a contiguous mask. 
10750Sstevel@tonic-gate */ 10760Sstevel@tonic-gate int 10770Sstevel@tonic-gate ip_mask_to_plen(ipaddr_t mask) 10780Sstevel@tonic-gate { 10790Sstevel@tonic-gate return (mask == 0 ? 0 : IP_ABITS - (ffs(ntohl(mask)) -1)); 10800Sstevel@tonic-gate } 10810Sstevel@tonic-gate 10820Sstevel@tonic-gate /* 10830Sstevel@tonic-gate * Convert length for a mask to the mask. 10840Sstevel@tonic-gate */ 10850Sstevel@tonic-gate ipaddr_t 10860Sstevel@tonic-gate ip_plen_to_mask(uint_t masklen) 10870Sstevel@tonic-gate { 108811042SErik.Nordmark@Sun.COM if (masklen == 0) 108911042SErik.Nordmark@Sun.COM return (0); 109011042SErik.Nordmark@Sun.COM 10910Sstevel@tonic-gate return (htonl(IP_HOST_MASK << (IP_ABITS - masklen))); 10920Sstevel@tonic-gate } 10930Sstevel@tonic-gate 10940Sstevel@tonic-gate void 10950Sstevel@tonic-gate ire_atomic_end(irb_t *irb_ptr, ire_t *ire) 10960Sstevel@tonic-gate { 109711042SErik.Nordmark@Sun.COM ill_t *ill; 109811042SErik.Nordmark@Sun.COM 109911042SErik.Nordmark@Sun.COM ill = ire->ire_ill; 110011042SErik.Nordmark@Sun.COM if (ill != NULL) 110111042SErik.Nordmark@Sun.COM mutex_exit(&ill->ill_lock); 11020Sstevel@tonic-gate rw_exit(&irb_ptr->irb_lock); 11030Sstevel@tonic-gate } 11040Sstevel@tonic-gate 11050Sstevel@tonic-gate /* 110611042SErik.Nordmark@Sun.COM * ire_add_v[46] atomically make sure that the ill associated 110711042SErik.Nordmark@Sun.COM * with the new ire is not going away i.e., we check ILL_CONDEMNED. 
11080Sstevel@tonic-gate */ 11090Sstevel@tonic-gate int 111011042SErik.Nordmark@Sun.COM ire_atomic_start(irb_t *irb_ptr, ire_t *ire) 11110Sstevel@tonic-gate { 111211042SErik.Nordmark@Sun.COM ill_t *ill; 111311042SErik.Nordmark@Sun.COM 111411042SErik.Nordmark@Sun.COM ill = ire->ire_ill; 111511042SErik.Nordmark@Sun.COM 111611042SErik.Nordmark@Sun.COM rw_enter(&irb_ptr->irb_lock, RW_WRITER); 111711042SErik.Nordmark@Sun.COM if (ill != NULL) { 111811042SErik.Nordmark@Sun.COM mutex_enter(&ill->ill_lock); 11192416Sjarrett 11202416Sjarrett /* 112111042SErik.Nordmark@Sun.COM * Don't allow IRE's to be created on dying ills. 11222416Sjarrett */ 112311042SErik.Nordmark@Sun.COM if (ill->ill_state_flags & ILL_CONDEMNED) { 112411042SErik.Nordmark@Sun.COM ire_atomic_end(irb_ptr, ire); 112511042SErik.Nordmark@Sun.COM return (ENXIO); 112611042SErik.Nordmark@Sun.COM } 112711042SErik.Nordmark@Sun.COM 112811042SErik.Nordmark@Sun.COM if (IS_UNDER_IPMP(ill)) { 112911042SErik.Nordmark@Sun.COM int error = 0; 113011042SErik.Nordmark@Sun.COM mutex_enter(&ill->ill_phyint->phyint_lock); 113111042SErik.Nordmark@Sun.COM if (!ipmp_ill_is_active(ill) && 113211042SErik.Nordmark@Sun.COM IRE_HIDDEN_TYPE(ire->ire_type) && 113311042SErik.Nordmark@Sun.COM !ire->ire_testhidden) { 113411042SErik.Nordmark@Sun.COM error = EINVAL; 11352416Sjarrett } 113611042SErik.Nordmark@Sun.COM mutex_exit(&ill->ill_phyint->phyint_lock); 11372416Sjarrett if (error != 0) { 113811042SErik.Nordmark@Sun.COM ire_atomic_end(irb_ptr, ire); 11392416Sjarrett return (error); 11402416Sjarrett } 11412416Sjarrett } 114211042SErik.Nordmark@Sun.COM 11430Sstevel@tonic-gate } 114411042SErik.Nordmark@Sun.COM return (0); 11450Sstevel@tonic-gate } 11460Sstevel@tonic-gate 11470Sstevel@tonic-gate /* 114811042SErik.Nordmark@Sun.COM * Add a fully initialized IRE to the forwarding table. 114911042SErik.Nordmark@Sun.COM * This returns NULL on failure, or a held IRE on success. 
115011042SErik.Nordmark@Sun.COM * Normally the returned IRE is the same as the argument. But a different 115111042SErik.Nordmark@Sun.COM * IRE will be returned if the added IRE is deemed identical to an existing 115211042SErik.Nordmark@Sun.COM * one. In that case ire_identical_ref will be increased. 115311042SErik.Nordmark@Sun.COM * The caller always needs to do an ire_refrele() on the returned IRE. 11540Sstevel@tonic-gate */ 115511042SErik.Nordmark@Sun.COM ire_t * 115611042SErik.Nordmark@Sun.COM ire_add(ire_t *ire) 115711042SErik.Nordmark@Sun.COM { 115811042SErik.Nordmark@Sun.COM if (IRE_HIDDEN_TYPE(ire->ire_type) && 115911042SErik.Nordmark@Sun.COM ire->ire_ill != NULL && IS_UNDER_IPMP(ire->ire_ill)) { 116011042SErik.Nordmark@Sun.COM /* 116111042SErik.Nordmark@Sun.COM * IREs hosted on interfaces that are under IPMP 116211042SErik.Nordmark@Sun.COM * should be hidden so that applications don't 116311042SErik.Nordmark@Sun.COM * accidentally end up sending packets with test 116411042SErik.Nordmark@Sun.COM * addresses as their source addresses, or 116511042SErik.Nordmark@Sun.COM * sending out interfaces that are e.g. IFF_INACTIVE. 116611042SErik.Nordmark@Sun.COM * Hide them here. 116711042SErik.Nordmark@Sun.COM */ 116811042SErik.Nordmark@Sun.COM ire->ire_testhidden = B_TRUE; 116911042SErik.Nordmark@Sun.COM } 117011042SErik.Nordmark@Sun.COM 117111042SErik.Nordmark@Sun.COM if (ire->ire_ipversion == IPV6_VERSION) 117211042SErik.Nordmark@Sun.COM return (ire_add_v6(ire)); 117311042SErik.Nordmark@Sun.COM else 117411042SErik.Nordmark@Sun.COM return (ire_add_v4(ire)); 117511042SErik.Nordmark@Sun.COM } 117611042SErik.Nordmark@Sun.COM 117711042SErik.Nordmark@Sun.COM /* 117811042SErik.Nordmark@Sun.COM * Add a fully initialized IPv4 IRE to the forwarding table. 117911042SErik.Nordmark@Sun.COM * This returns NULL on failure, or a held IRE on success. 118011042SErik.Nordmark@Sun.COM * Normally the returned IRE is the same as the argument. 
But a different 118111042SErik.Nordmark@Sun.COM * IRE will be returned if the added IRE is deemed identical to an existing 118211042SErik.Nordmark@Sun.COM * one. In that case ire_identical_ref will be increased. 118311042SErik.Nordmark@Sun.COM * The caller always needs to do an ire_refrele() on the returned IRE. 118411042SErik.Nordmark@Sun.COM */ 118511042SErik.Nordmark@Sun.COM static ire_t * 118611042SErik.Nordmark@Sun.COM ire_add_v4(ire_t *ire) 11870Sstevel@tonic-gate { 11880Sstevel@tonic-gate ire_t *ire1; 11890Sstevel@tonic-gate irb_t *irb_ptr; 11900Sstevel@tonic-gate ire_t **irep; 119111042SErik.Nordmark@Sun.COM int match_flags; 11920Sstevel@tonic-gate int error; 11933448Sdh155122 ip_stack_t *ipst = ire->ire_ipst; 119411042SErik.Nordmark@Sun.COM 119511042SErik.Nordmark@Sun.COM if (ire->ire_ill != NULL) 119611042SErik.Nordmark@Sun.COM ASSERT(!MUTEX_HELD(&ire->ire_ill->ill_lock)); 11970Sstevel@tonic-gate ASSERT(ire->ire_ipversion == IPV4_VERSION); 11980Sstevel@tonic-gate 11990Sstevel@tonic-gate /* Make sure the address is properly masked. */ 12000Sstevel@tonic-gate ire->ire_addr &= ire->ire_mask; 12010Sstevel@tonic-gate 120211042SErik.Nordmark@Sun.COM match_flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW); 120311042SErik.Nordmark@Sun.COM 120411042SErik.Nordmark@Sun.COM if (ire->ire_ill != NULL) { 120511042SErik.Nordmark@Sun.COM match_flags |= MATCH_IRE_ILL; 12060Sstevel@tonic-gate } 120711042SErik.Nordmark@Sun.COM irb_ptr = ire_get_bucket(ire); 120811042SErik.Nordmark@Sun.COM if (irb_ptr == NULL) { 120911042SErik.Nordmark@Sun.COM printf("no bucket for %p\n", (void *)ire); 121011042SErik.Nordmark@Sun.COM ire_delete(ire); 121111042SErik.Nordmark@Sun.COM return (NULL); 12122535Ssangeeta } 12130Sstevel@tonic-gate 12140Sstevel@tonic-gate /* 121511042SErik.Nordmark@Sun.COM * Start the atomic add of the ire. Grab the ill lock, 121611042SErik.Nordmark@Sun.COM * the bucket lock. Check for condemned. 
12170Sstevel@tonic-gate */ 121811042SErik.Nordmark@Sun.COM error = ire_atomic_start(irb_ptr, ire); 12190Sstevel@tonic-gate if (error != 0) { 122011042SErik.Nordmark@Sun.COM printf("no ire_atomic_start for %p\n", (void *)ire); 12210Sstevel@tonic-gate ire_delete(ire); 122211042SErik.Nordmark@Sun.COM irb_refrele(irb_ptr); 122311042SErik.Nordmark@Sun.COM return (NULL); 12240Sstevel@tonic-gate } 12250Sstevel@tonic-gate /* 122611042SErik.Nordmark@Sun.COM * If we are creating a hidden IRE, make sure we search for 122711042SErik.Nordmark@Sun.COM * hidden IREs when searching for duplicates below. 122811042SErik.Nordmark@Sun.COM * Otherwise, we might find an IRE on some other interface 122911042SErik.Nordmark@Sun.COM * that's not marked hidden. 12300Sstevel@tonic-gate */ 123111042SErik.Nordmark@Sun.COM if (ire->ire_testhidden) 123211042SErik.Nordmark@Sun.COM match_flags |= MATCH_IRE_TESTHIDDEN; 123311042SErik.Nordmark@Sun.COM 12340Sstevel@tonic-gate /* 12350Sstevel@tonic-gate * Atomically check for duplicate and insert in the table. 12360Sstevel@tonic-gate */ 12370Sstevel@tonic-gate for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { 123811042SErik.Nordmark@Sun.COM if (IRE_IS_CONDEMNED(ire1)) 12390Sstevel@tonic-gate continue; 124011042SErik.Nordmark@Sun.COM /* 124111042SErik.Nordmark@Sun.COM * Here we need an exact match on zoneid, i.e., 124211042SErik.Nordmark@Sun.COM * ire_match_args doesn't fit. 
124311042SErik.Nordmark@Sun.COM */ 12440Sstevel@tonic-gate if (ire1->ire_zoneid != ire->ire_zoneid) 12450Sstevel@tonic-gate continue; 124611042SErik.Nordmark@Sun.COM 124711042SErik.Nordmark@Sun.COM if (ire1->ire_type != ire->ire_type) 124811042SErik.Nordmark@Sun.COM continue; 124911042SErik.Nordmark@Sun.COM 125011042SErik.Nordmark@Sun.COM /* 125111042SErik.Nordmark@Sun.COM * Note: We do not allow multiple routes that differ only 125211042SErik.Nordmark@Sun.COM * in the gateway security attributes; such routes are 125311042SErik.Nordmark@Sun.COM * considered duplicates. 125411042SErik.Nordmark@Sun.COM * To change that we explicitly have to treat them as 125511042SErik.Nordmark@Sun.COM * different here. 125611042SErik.Nordmark@Sun.COM */ 12570Sstevel@tonic-gate if (ire_match_args(ire1, ire->ire_addr, ire->ire_mask, 125811042SErik.Nordmark@Sun.COM ire->ire_gateway_addr, ire->ire_type, ire->ire_ill, 125911042SErik.Nordmark@Sun.COM ire->ire_zoneid, NULL, match_flags)) { 12600Sstevel@tonic-gate /* 12610Sstevel@tonic-gate * Return the old ire after doing a REFHOLD. 12620Sstevel@tonic-gate * As most of the callers continue to use the IRE 12630Sstevel@tonic-gate * after adding, we return a held ire. This will 12640Sstevel@tonic-gate * avoid a lookup in the caller again. If the callers 12650Sstevel@tonic-gate * don't want to use it, they need to do a REFRELE. 
12660Sstevel@tonic-gate */ 126711042SErik.Nordmark@Sun.COM atomic_add_32(&ire1->ire_identical_ref, 1); 126811042SErik.Nordmark@Sun.COM DTRACE_PROBE2(ire__add__exist, ire_t *, ire1, 126911042SErik.Nordmark@Sun.COM ire_t *, ire); 127011042SErik.Nordmark@Sun.COM ire_refhold(ire1); 12710Sstevel@tonic-gate ire_atomic_end(irb_ptr, ire); 12720Sstevel@tonic-gate ire_delete(ire); 127311042SErik.Nordmark@Sun.COM irb_refrele(irb_ptr); 127411042SErik.Nordmark@Sun.COM return (ire1); 12752535Ssangeeta } 12762535Ssangeeta } 127711042SErik.Nordmark@Sun.COM 12780Sstevel@tonic-gate /* 127911042SErik.Nordmark@Sun.COM * Normally we do head insertion since most things do not care about 128011042SErik.Nordmark@Sun.COM * the order of the IREs in the bucket. Note that ip_cgtp_bcast_add 128111042SErik.Nordmark@Sun.COM * assumes we at least do head insertion so that its IRE_BROADCAST 128211042SErik.Nordmark@Sun.COM * arrive ahead of existing IRE_HOST for the same address. 128311042SErik.Nordmark@Sun.COM * However, due to shared-IP zones (and restrict_interzone_loopback) 128411042SErik.Nordmark@Sun.COM * we can have an IRE_LOCAL as well as IRE_IF_CLONE for the same 128511042SErik.Nordmark@Sun.COM * address. For that reason we do tail insertion for IRE_IF_CLONE. 128611042SErik.Nordmark@Sun.COM * Due to the IRE_BROADCAST on cgtp0, which must be last in the bucket, 128711042SErik.Nordmark@Sun.COM * we do tail insertion of IRE_BROADCASTs that do not have RTF_MULTIRT 128811042SErik.Nordmark@Sun.COM * set. 
12890Sstevel@tonic-gate */ 12900Sstevel@tonic-gate irep = (ire_t **)irb_ptr; 129111042SErik.Nordmark@Sun.COM if ((ire->ire_type & IRE_IF_CLONE) || 129211042SErik.Nordmark@Sun.COM ((ire->ire_type & IRE_BROADCAST) && 129311042SErik.Nordmark@Sun.COM !(ire->ire_flags & RTF_MULTIRT))) { 129411042SErik.Nordmark@Sun.COM while ((ire1 = *irep) != NULL) 12950Sstevel@tonic-gate irep = &ire1->ire_next; 12960Sstevel@tonic-gate } 12970Sstevel@tonic-gate /* Insert at *irep */ 12980Sstevel@tonic-gate ire1 = *irep; 12990Sstevel@tonic-gate if (ire1 != NULL) 13000Sstevel@tonic-gate ire1->ire_ptpn = &ire->ire_next; 13010Sstevel@tonic-gate ire->ire_next = ire1; 13020Sstevel@tonic-gate /* Link the new one in. */ 13030Sstevel@tonic-gate ire->ire_ptpn = irep; 13040Sstevel@tonic-gate 13050Sstevel@tonic-gate /* 13060Sstevel@tonic-gate * ire_walk routines de-reference ire_next without holding 13070Sstevel@tonic-gate * a lock. Before we point to the new ire, we want to make 13080Sstevel@tonic-gate * sure the store that sets the ire_next of the new ire 13090Sstevel@tonic-gate * reaches global visibility, so that ire_walk routines 13100Sstevel@tonic-gate * don't see a truncated list of ires i.e if the ire_next 13110Sstevel@tonic-gate * of the new ire gets set after we do "*irep = ire" due 13120Sstevel@tonic-gate * to re-ordering, the ire_walk thread will see a NULL 13130Sstevel@tonic-gate * once it accesses the ire_next of the new ire. 13140Sstevel@tonic-gate * membar_producer() makes sure that the following store 13150Sstevel@tonic-gate * happens *after* all of the above stores. 13160Sstevel@tonic-gate */ 13170Sstevel@tonic-gate membar_producer(); 13180Sstevel@tonic-gate *irep = ire; 13190Sstevel@tonic-gate ire->ire_bucket = irb_ptr; 13200Sstevel@tonic-gate /* 13210Sstevel@tonic-gate * We return a bumped up IRE above. Keep it symmetrical 13220Sstevel@tonic-gate * so that the callers will always have to release. 
This 13230Sstevel@tonic-gate * helps the callers of this function because they continue 13240Sstevel@tonic-gate * to use the IRE after adding and hence they don't have to 13250Sstevel@tonic-gate * lookup again after we return the IRE. 13260Sstevel@tonic-gate * 13270Sstevel@tonic-gate * NOTE : We don't have to use atomics as this is appearing 13280Sstevel@tonic-gate * in the list for the first time and no one else can bump 13290Sstevel@tonic-gate * up the reference count on this yet. 13300Sstevel@tonic-gate */ 133111042SErik.Nordmark@Sun.COM ire_refhold_locked(ire); 13323448Sdh155122 BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_inserted); 13332535Ssangeeta 13340Sstevel@tonic-gate irb_ptr->irb_ire_cnt++; 133511042SErik.Nordmark@Sun.COM if (irb_ptr->irb_marks & IRB_MARK_DYNAMIC) 13362535Ssangeeta irb_ptr->irb_nire++; 13372535Ssangeeta 133811042SErik.Nordmark@Sun.COM if (ire->ire_ill != NULL) { 133911042SErik.Nordmark@Sun.COM ire->ire_ill->ill_ire_cnt++; 134011042SErik.Nordmark@Sun.COM ASSERT(ire->ire_ill->ill_ire_cnt != 0); /* Wraparound */ 13410Sstevel@tonic-gate } 13420Sstevel@tonic-gate 13430Sstevel@tonic-gate ire_atomic_end(irb_ptr, ire); 134411042SErik.Nordmark@Sun.COM 134511042SErik.Nordmark@Sun.COM /* Make any caching of the IREs be notified or updated */ 134611042SErik.Nordmark@Sun.COM ire_flush_cache_v4(ire, IRE_FLUSH_ADD); 134711042SErik.Nordmark@Sun.COM 134811042SErik.Nordmark@Sun.COM if (ire->ire_ill != NULL) 134911042SErik.Nordmark@Sun.COM ASSERT(!MUTEX_HELD(&ire->ire_ill->ill_lock)); 135011042SErik.Nordmark@Sun.COM irb_refrele(irb_ptr); 135111042SErik.Nordmark@Sun.COM return (ire); 13520Sstevel@tonic-gate } 13530Sstevel@tonic-gate 13540Sstevel@tonic-gate /* 135511042SErik.Nordmark@Sun.COM * irb_refrele is the only caller of the function. ire_unlink calls to 13560Sstevel@tonic-gate * do the final cleanup for this ire. 
13570Sstevel@tonic-gate */ 13580Sstevel@tonic-gate void 13590Sstevel@tonic-gate ire_cleanup(ire_t *ire) 13600Sstevel@tonic-gate { 13610Sstevel@tonic-gate ire_t *ire_next; 13623448Sdh155122 ip_stack_t *ipst = ire->ire_ipst; 13630Sstevel@tonic-gate 13640Sstevel@tonic-gate ASSERT(ire != NULL); 13650Sstevel@tonic-gate 13660Sstevel@tonic-gate while (ire != NULL) { 13670Sstevel@tonic-gate ire_next = ire->ire_next; 13680Sstevel@tonic-gate if (ire->ire_ipversion == IPV4_VERSION) { 13690Sstevel@tonic-gate ire_delete_v4(ire); 13703448Sdh155122 BUMP_IRE_STATS(ipst->ips_ire_stats_v4, 13713448Sdh155122 ire_stats_deleted); 13720Sstevel@tonic-gate } else { 13730Sstevel@tonic-gate ASSERT(ire->ire_ipversion == IPV6_VERSION); 13740Sstevel@tonic-gate ire_delete_v6(ire); 13753448Sdh155122 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, 13763448Sdh155122 ire_stats_deleted); 13770Sstevel@tonic-gate } 13780Sstevel@tonic-gate /* 13790Sstevel@tonic-gate * Now it's really out of the list. Before doing the 13800Sstevel@tonic-gate * REFRELE, set ire_next to NULL as ire_inactive asserts 13810Sstevel@tonic-gate * so. 13820Sstevel@tonic-gate */ 13830Sstevel@tonic-gate ire->ire_next = NULL; 138411042SErik.Nordmark@Sun.COM ire_refrele_notr(ire); 13850Sstevel@tonic-gate ire = ire_next; 13860Sstevel@tonic-gate } 13870Sstevel@tonic-gate } 13880Sstevel@tonic-gate 13890Sstevel@tonic-gate /* 139011042SErik.Nordmark@Sun.COM * irb_refrele is the only caller of the function. It calls to unlink 13910Sstevel@tonic-gate * all the CONDEMNED ires from this bucket. 
13920Sstevel@tonic-gate */ 13930Sstevel@tonic-gate ire_t * 13940Sstevel@tonic-gate ire_unlink(irb_t *irb) 13950Sstevel@tonic-gate { 13960Sstevel@tonic-gate ire_t *ire; 13970Sstevel@tonic-gate ire_t *ire1; 13980Sstevel@tonic-gate ire_t **ptpn; 13990Sstevel@tonic-gate ire_t *ire_list = NULL; 14000Sstevel@tonic-gate 14010Sstevel@tonic-gate ASSERT(RW_WRITE_HELD(&irb->irb_lock)); 140211042SErik.Nordmark@Sun.COM ASSERT(((irb->irb_marks & IRB_MARK_DYNAMIC) && irb->irb_refcnt == 1) || 14032535Ssangeeta (irb->irb_refcnt == 0)); 14042535Ssangeeta ASSERT(irb->irb_marks & IRB_MARK_CONDEMNED); 14050Sstevel@tonic-gate ASSERT(irb->irb_ire != NULL); 14060Sstevel@tonic-gate 14070Sstevel@tonic-gate for (ire = irb->irb_ire; ire != NULL; ire = ire1) { 14080Sstevel@tonic-gate ire1 = ire->ire_next; 140911042SErik.Nordmark@Sun.COM if (IRE_IS_CONDEMNED(ire)) { 14100Sstevel@tonic-gate ptpn = ire->ire_ptpn; 14110Sstevel@tonic-gate ire1 = ire->ire_next; 14120Sstevel@tonic-gate if (ire1) 14130Sstevel@tonic-gate ire1->ire_ptpn = ptpn; 14140Sstevel@tonic-gate *ptpn = ire1; 14150Sstevel@tonic-gate ire->ire_ptpn = NULL; 14160Sstevel@tonic-gate ire->ire_next = NULL; 141711042SErik.Nordmark@Sun.COM 14180Sstevel@tonic-gate /* 141911042SErik.Nordmark@Sun.COM * We need to call ire_delete_v4 or ire_delete_v6 to 142011042SErik.Nordmark@Sun.COM * clean up dependents and the redirects pointing at 14210Sstevel@tonic-gate * the default gateway. We need to drop the lock 14220Sstevel@tonic-gate * as ire_flush_cache/ire_delete_host_redircts require 14230Sstevel@tonic-gate * so. But we can't drop the lock, as ire_unlink needs 14240Sstevel@tonic-gate * to atomically remove the ires from the list. 14250Sstevel@tonic-gate * So, create a temporary list of CONDEMNED ires 14260Sstevel@tonic-gate * for doing ire_delete_v4/ire_delete_v6 operations 14270Sstevel@tonic-gate * later on. 
14280Sstevel@tonic-gate */ 14290Sstevel@tonic-gate ire->ire_next = ire_list; 14300Sstevel@tonic-gate ire_list = ire; 14310Sstevel@tonic-gate } 14320Sstevel@tonic-gate } 14332535Ssangeeta irb->irb_marks &= ~IRB_MARK_CONDEMNED; 14340Sstevel@tonic-gate return (ire_list); 14350Sstevel@tonic-gate } 14360Sstevel@tonic-gate 14370Sstevel@tonic-gate /* 143811042SErik.Nordmark@Sun.COM * Clean up the radix node for this ire. Must be called by irb_refrele 14392535Ssangeeta * when there are no ire's left in the bucket. Returns TRUE if the bucket 14402535Ssangeeta * is deleted and freed. 14412535Ssangeeta */ 14422535Ssangeeta boolean_t 14432535Ssangeeta irb_inactive(irb_t *irb) 14442535Ssangeeta { 14452535Ssangeeta struct rt_entry *rt; 14462535Ssangeeta struct radix_node *rn; 14473448Sdh155122 ip_stack_t *ipst = irb->irb_ipst; 14483448Sdh155122 14493448Sdh155122 ASSERT(irb->irb_ipst != NULL); 14502535Ssangeeta 14512535Ssangeeta rt = IRB2RT(irb); 14522535Ssangeeta rn = (struct radix_node *)rt; 14532535Ssangeeta 14542535Ssangeeta /* first remove it from the radix tree. */ 14553448Sdh155122 RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable); 14562535Ssangeeta rw_enter(&irb->irb_lock, RW_WRITER); 14572535Ssangeeta if (irb->irb_refcnt == 1 && irb->irb_nire == 0) { 14583448Sdh155122 rn = ipst->ips_ip_ftable->rnh_deladdr(rn->rn_key, rn->rn_mask, 14593448Sdh155122 ipst->ips_ip_ftable); 14602535Ssangeeta DTRACE_PROBE1(irb__free, rt_t *, rt); 14612535Ssangeeta ASSERT((void *)rn == (void *)rt); 14622535Ssangeeta Free(rt, rt_entry_cache); 14632535Ssangeeta /* irb_lock is freed */ 14643448Sdh155122 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 14652535Ssangeeta return (B_TRUE); 14662535Ssangeeta } 14672535Ssangeeta rw_exit(&irb->irb_lock); 14683448Sdh155122 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 14692535Ssangeeta return (B_FALSE); 14700Sstevel@tonic-gate } 14710Sstevel@tonic-gate 14720Sstevel@tonic-gate /* 14730Sstevel@tonic-gate * Delete the specified IRE. 
147411042SErik.Nordmark@Sun.COM * We assume that if ire_bucket is not set then ire_ill->ill_ire_cnt was 147511042SErik.Nordmark@Sun.COM * not incremented i.e., that the insertion in the bucket and the increment 147611042SErik.Nordmark@Sun.COM * of that counter is done atomically. 14770Sstevel@tonic-gate */ 14780Sstevel@tonic-gate void 14790Sstevel@tonic-gate ire_delete(ire_t *ire) 14800Sstevel@tonic-gate { 14810Sstevel@tonic-gate ire_t *ire1; 14820Sstevel@tonic-gate ire_t **ptpn; 148311042SErik.Nordmark@Sun.COM irb_t *irb; 148411042SErik.Nordmark@Sun.COM nce_t *nce; 14853448Sdh155122 ip_stack_t *ipst = ire->ire_ipst; 14860Sstevel@tonic-gate 148711042SErik.Nordmark@Sun.COM /* We can clear ire_nce_cache under ire_lock even if the IRE is used */ 148811042SErik.Nordmark@Sun.COM mutex_enter(&ire->ire_lock); 148911042SErik.Nordmark@Sun.COM nce = ire->ire_nce_cache; 149011042SErik.Nordmark@Sun.COM ire->ire_nce_cache = NULL; 149111042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 149211042SErik.Nordmark@Sun.COM if (nce != NULL) 149311042SErik.Nordmark@Sun.COM nce_refrele(nce); 149411042SErik.Nordmark@Sun.COM 14950Sstevel@tonic-gate if ((irb = ire->ire_bucket) == NULL) { 14962535Ssangeeta /* 14972535Ssangeeta * It was never inserted in the list. Should call REFRELE 14982535Ssangeeta * to free this IRE. 14992535Ssangeeta */ 150011042SErik.Nordmark@Sun.COM ire_refrele_notr(ire); 15010Sstevel@tonic-gate return; 15020Sstevel@tonic-gate } 15030Sstevel@tonic-gate 150411042SErik.Nordmark@Sun.COM /* 150511042SErik.Nordmark@Sun.COM * Move the use counts from an IRE_IF_CLONE to its parent 150611042SErik.Nordmark@Sun.COM * IRE_INTERFACE. 150711042SErik.Nordmark@Sun.COM * We need to do this before acquiring irb_lock. 
150811042SErik.Nordmark@Sun.COM */ 150911042SErik.Nordmark@Sun.COM if (ire->ire_type & IRE_IF_CLONE) { 151011042SErik.Nordmark@Sun.COM ire_t *parent; 151111042SErik.Nordmark@Sun.COM 151211042SErik.Nordmark@Sun.COM rw_enter(&ipst->ips_ire_dep_lock, RW_READER); 151311042SErik.Nordmark@Sun.COM if ((parent = ire->ire_dep_parent) != NULL) { 151411042SErik.Nordmark@Sun.COM parent->ire_ob_pkt_count += ire->ire_ob_pkt_count; 151511042SErik.Nordmark@Sun.COM parent->ire_ib_pkt_count += ire->ire_ib_pkt_count; 151611042SErik.Nordmark@Sun.COM ire->ire_ob_pkt_count = 0; 151711042SErik.Nordmark@Sun.COM ire->ire_ib_pkt_count = 0; 151811042SErik.Nordmark@Sun.COM } 151911042SErik.Nordmark@Sun.COM rw_exit(&ipst->ips_ire_dep_lock); 152011042SErik.Nordmark@Sun.COM } 152111042SErik.Nordmark@Sun.COM 15220Sstevel@tonic-gate rw_enter(&irb->irb_lock, RW_WRITER); 15230Sstevel@tonic-gate if (ire->ire_ptpn == NULL) { 15240Sstevel@tonic-gate /* 15250Sstevel@tonic-gate * Some other thread has removed us from the list. 15260Sstevel@tonic-gate * It should have done the REFRELE for us. 15270Sstevel@tonic-gate */ 15280Sstevel@tonic-gate rw_exit(&irb->irb_lock); 15290Sstevel@tonic-gate return; 15300Sstevel@tonic-gate } 15310Sstevel@tonic-gate 153211042SErik.Nordmark@Sun.COM if (!IRE_IS_CONDEMNED(ire)) { 153311042SErik.Nordmark@Sun.COM /* Is this an IRE representing multiple duplicate entries? 
*/ 153411042SErik.Nordmark@Sun.COM ASSERT(ire->ire_identical_ref >= 1); 153511042SErik.Nordmark@Sun.COM if (atomic_add_32_nv(&ire->ire_identical_ref, -1) != 0) { 153611042SErik.Nordmark@Sun.COM /* Removed one of the identical parties */ 153711042SErik.Nordmark@Sun.COM rw_exit(&irb->irb_lock); 153811042SErik.Nordmark@Sun.COM return; 153911042SErik.Nordmark@Sun.COM } 154011042SErik.Nordmark@Sun.COM 15415388Sja97890 irb->irb_ire_cnt--; 154211042SErik.Nordmark@Sun.COM ire_make_condemned(ire); 15435388Sja97890 } 15445388Sja97890 15450Sstevel@tonic-gate if (irb->irb_refcnt != 0) { 15460Sstevel@tonic-gate /* 15470Sstevel@tonic-gate * The last thread to leave this bucket will 15480Sstevel@tonic-gate * delete this ire. 15490Sstevel@tonic-gate */ 15502535Ssangeeta irb->irb_marks |= IRB_MARK_CONDEMNED; 15510Sstevel@tonic-gate rw_exit(&irb->irb_lock); 15520Sstevel@tonic-gate return; 15530Sstevel@tonic-gate } 15540Sstevel@tonic-gate 15550Sstevel@tonic-gate /* 15560Sstevel@tonic-gate * Normally to delete an ire, we walk the bucket. While we 15570Sstevel@tonic-gate * walk the bucket, we normally bump up irb_refcnt and hence 15580Sstevel@tonic-gate * we return from above where we mark CONDEMNED and the ire 15590Sstevel@tonic-gate * gets deleted from ire_unlink. This case is where somebody 15600Sstevel@tonic-gate * knows the ire e.g by doing a lookup, and wants to delete the 15610Sstevel@tonic-gate * IRE. irb_refcnt would be 0 in this case if nobody is walking 15620Sstevel@tonic-gate * the bucket. 
15630Sstevel@tonic-gate */ 15640Sstevel@tonic-gate ptpn = ire->ire_ptpn; 15650Sstevel@tonic-gate ire1 = ire->ire_next; 15660Sstevel@tonic-gate if (ire1 != NULL) 15670Sstevel@tonic-gate ire1->ire_ptpn = ptpn; 15680Sstevel@tonic-gate ASSERT(ptpn != NULL); 15690Sstevel@tonic-gate *ptpn = ire1; 15700Sstevel@tonic-gate ire->ire_ptpn = NULL; 15710Sstevel@tonic-gate ire->ire_next = NULL; 15720Sstevel@tonic-gate if (ire->ire_ipversion == IPV6_VERSION) { 15733448Sdh155122 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_deleted); 15740Sstevel@tonic-gate } else { 15753448Sdh155122 BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_deleted); 15760Sstevel@tonic-gate } 15770Sstevel@tonic-gate rw_exit(&irb->irb_lock); 15780Sstevel@tonic-gate 157911042SErik.Nordmark@Sun.COM /* Cleanup dependents and related stuff */ 15800Sstevel@tonic-gate if (ire->ire_ipversion == IPV6_VERSION) { 15810Sstevel@tonic-gate ire_delete_v6(ire); 15820Sstevel@tonic-gate } else { 15830Sstevel@tonic-gate ire_delete_v4(ire); 15840Sstevel@tonic-gate } 15850Sstevel@tonic-gate /* 15860Sstevel@tonic-gate * We removed it from the list. Decrement the 15870Sstevel@tonic-gate * reference count. 15880Sstevel@tonic-gate */ 158911042SErik.Nordmark@Sun.COM ire_refrele_notr(ire); 15900Sstevel@tonic-gate } 15910Sstevel@tonic-gate 15920Sstevel@tonic-gate /* 15930Sstevel@tonic-gate * Delete the specified IRE. 15940Sstevel@tonic-gate * All calls should use ire_delete(). 15950Sstevel@tonic-gate * Sometimes called as writer though not required by this function. 15960Sstevel@tonic-gate * 15970Sstevel@tonic-gate * NOTE : This function is called only if the ire was added 15980Sstevel@tonic-gate * in the list. 
15990Sstevel@tonic-gate */ 16000Sstevel@tonic-gate static void 16010Sstevel@tonic-gate ire_delete_v4(ire_t *ire) 16020Sstevel@tonic-gate { 16033448Sdh155122 ip_stack_t *ipst = ire->ire_ipst; 16043448Sdh155122 16050Sstevel@tonic-gate ASSERT(ire->ire_refcnt >= 1); 16060Sstevel@tonic-gate ASSERT(ire->ire_ipversion == IPV4_VERSION); 16070Sstevel@tonic-gate 160811042SErik.Nordmark@Sun.COM ire_flush_cache_v4(ire, IRE_FLUSH_DELETE); 16090Sstevel@tonic-gate if (ire->ire_type == IRE_DEFAULT) { 16100Sstevel@tonic-gate /* 16110Sstevel@tonic-gate * when a default gateway is going away 16120Sstevel@tonic-gate * delete all the host redirects pointing at that 16130Sstevel@tonic-gate * gateway. 16140Sstevel@tonic-gate */ 16153448Sdh155122 ire_delete_host_redirects(ire->ire_gateway_addr, ipst); 16160Sstevel@tonic-gate } 161711042SErik.Nordmark@Sun.COM 161811042SErik.Nordmark@Sun.COM /* 161911042SErik.Nordmark@Sun.COM * If we are deleting an IRE_INTERFACE then we make sure we also 162011042SErik.Nordmark@Sun.COM * delete any IRE_IF_CLONE that has been created from it. 162111042SErik.Nordmark@Sun.COM * Those are always in ire_dep_children. 
162211042SErik.Nordmark@Sun.COM */ 162311042SErik.Nordmark@Sun.COM if ((ire->ire_type & IRE_INTERFACE) && ire->ire_dep_children != NULL) 162411042SErik.Nordmark@Sun.COM ire_dep_delete_if_clone(ire); 162511042SErik.Nordmark@Sun.COM 162611042SErik.Nordmark@Sun.COM /* Remove from parent dependencies and child */ 162711042SErik.Nordmark@Sun.COM rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER); 162811042SErik.Nordmark@Sun.COM if (ire->ire_dep_parent != NULL) 162911042SErik.Nordmark@Sun.COM ire_dep_remove(ire); 163011042SErik.Nordmark@Sun.COM 163111042SErik.Nordmark@Sun.COM while (ire->ire_dep_children != NULL) 163211042SErik.Nordmark@Sun.COM ire_dep_remove(ire->ire_dep_children); 163311042SErik.Nordmark@Sun.COM rw_exit(&ipst->ips_ire_dep_lock); 16340Sstevel@tonic-gate } 16350Sstevel@tonic-gate 16360Sstevel@tonic-gate /* 163711042SErik.Nordmark@Sun.COM * ire_refrele is the only caller of the function. It calls 16380Sstevel@tonic-gate * to free the ire when the reference count goes to zero. 16390Sstevel@tonic-gate */ 16400Sstevel@tonic-gate void 16410Sstevel@tonic-gate ire_inactive(ire_t *ire) 16420Sstevel@tonic-gate { 164311042SErik.Nordmark@Sun.COM ill_t *ill; 16442535Ssangeeta irb_t *irb; 16453448Sdh155122 ip_stack_t *ipst = ire->ire_ipst; 16460Sstevel@tonic-gate 16470Sstevel@tonic-gate ASSERT(ire->ire_refcnt == 0); 16480Sstevel@tonic-gate ASSERT(ire->ire_ptpn == NULL); 16490Sstevel@tonic-gate ASSERT(ire->ire_next == NULL); 16500Sstevel@tonic-gate 165111042SErik.Nordmark@Sun.COM /* Count how many condemned ires for kmem_cache callback */ 165211042SErik.Nordmark@Sun.COM if (IRE_IS_CONDEMNED(ire)) 165311042SErik.Nordmark@Sun.COM atomic_add_32(&ipst->ips_num_ire_condemned, -1); 165411042SErik.Nordmark@Sun.COM 16552535Ssangeeta if (ire->ire_gw_secattr != NULL) { 16562535Ssangeeta ire_gw_secattr_free(ire->ire_gw_secattr); 16572535Ssangeeta ire->ire_gw_secattr = NULL; 16582535Ssangeeta } 16592535Ssangeeta 166011042SErik.Nordmark@Sun.COM /* 166111042SErik.Nordmark@Sun.COM * 
ire_nce_cache is cleared in ire_delete, and we make sure we don't 166211042SErik.Nordmark@Sun.COM * set it once the ire is marked condemned. 166311042SErik.Nordmark@Sun.COM */ 166411042SErik.Nordmark@Sun.COM ASSERT(ire->ire_nce_cache == NULL); 166511042SErik.Nordmark@Sun.COM 166611042SErik.Nordmark@Sun.COM /* 166711042SErik.Nordmark@Sun.COM * Since any parent would have a refhold on us they would already 166811042SErik.Nordmark@Sun.COM * have been removed. 166911042SErik.Nordmark@Sun.COM */ 167011042SErik.Nordmark@Sun.COM ASSERT(ire->ire_dep_parent == NULL); 167111042SErik.Nordmark@Sun.COM ASSERT(ire->ire_dep_sib_next == NULL); 167211042SErik.Nordmark@Sun.COM ASSERT(ire->ire_dep_sib_ptpn == NULL); 16730Sstevel@tonic-gate 16740Sstevel@tonic-gate /* 167511042SErik.Nordmark@Sun.COM * Since any children would have a refhold on us they should have 167611042SErik.Nordmark@Sun.COM * already been removed. 167711042SErik.Nordmark@Sun.COM */ 167811042SErik.Nordmark@Sun.COM ASSERT(ire->ire_dep_children == NULL); 167911042SErik.Nordmark@Sun.COM 168011042SErik.Nordmark@Sun.COM /* 168111042SErik.Nordmark@Sun.COM * ill_ire_ref is increased when the IRE is inserted in the 168211042SErik.Nordmark@Sun.COM * bucket - not when the IRE is created. 
16830Sstevel@tonic-gate */ 168411042SErik.Nordmark@Sun.COM irb = ire->ire_bucket; 168511042SErik.Nordmark@Sun.COM ill = ire->ire_ill; 168611042SErik.Nordmark@Sun.COM if (irb != NULL && ill != NULL) { 16870Sstevel@tonic-gate mutex_enter(&ill->ill_lock); 168811042SErik.Nordmark@Sun.COM ASSERT(ill->ill_ire_cnt != 0); 168911042SErik.Nordmark@Sun.COM DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill, 16906255Ssowmini (char *), "ire", (void *), ire); 169111042SErik.Nordmark@Sun.COM ill->ill_ire_cnt--; 169211042SErik.Nordmark@Sun.COM if (ILL_DOWN_OK(ill)) { 16930Sstevel@tonic-gate /* Drops the ill lock */ 16940Sstevel@tonic-gate ipif_ill_refrele_tail(ill); 16950Sstevel@tonic-gate } else { 16960Sstevel@tonic-gate mutex_exit(&ill->ill_lock); 16970Sstevel@tonic-gate } 16980Sstevel@tonic-gate } 169911042SErik.Nordmark@Sun.COM ire->ire_ill = NULL; 170011042SErik.Nordmark@Sun.COM 17010Sstevel@tonic-gate /* This should be true for both V4 and V6 */ 170211042SErik.Nordmark@Sun.COM if (irb != NULL && (irb->irb_marks & IRB_MARK_DYNAMIC)) { 17032535Ssangeeta rw_enter(&irb->irb_lock, RW_WRITER); 17042535Ssangeeta irb->irb_nire--; 17052535Ssangeeta /* 17062535Ssangeeta * Instead of examining the conditions for freeing 17072535Ssangeeta * the radix node here, we do it by calling 170811042SErik.Nordmark@Sun.COM * irb_refrele which is a single point in the code 17092535Ssangeeta * that embeds that logic. 
Bump up the refcnt to 171011042SErik.Nordmark@Sun.COM * be able to call irb_refrele 17112535Ssangeeta */ 171211042SErik.Nordmark@Sun.COM irb_refhold_locked(irb); 17132535Ssangeeta rw_exit(&irb->irb_lock); 171411042SErik.Nordmark@Sun.COM irb_refrele(irb); 17152535Ssangeeta } 17160Sstevel@tonic-gate 17175023Scarlsonj #ifdef DEBUG 17185023Scarlsonj ire_trace_cleanup(ire); 17190Sstevel@tonic-gate #endif 17200Sstevel@tonic-gate mutex_destroy(&ire->ire_lock); 17210Sstevel@tonic-gate if (ire->ire_ipversion == IPV6_VERSION) { 17223448Sdh155122 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_freed); 17230Sstevel@tonic-gate } else { 17243448Sdh155122 BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_freed); 17250Sstevel@tonic-gate } 17262535Ssangeeta kmem_cache_free(ire_cache, ire); 17270Sstevel@tonic-gate } 17280Sstevel@tonic-gate 17290Sstevel@tonic-gate /* 173011042SErik.Nordmark@Sun.COM * ire_update_generation is the callback function provided by 173111042SErik.Nordmark@Sun.COM * ire_get_bucket() to update the generation number of any 173211042SErik.Nordmark@Sun.COM * matching shorter route when a new route is added. 173311042SErik.Nordmark@Sun.COM * 173411042SErik.Nordmark@Sun.COM * This fucntion always returns a failure return (B_FALSE) 173511042SErik.Nordmark@Sun.COM * to force the caller (rn_matchaddr_args) 173611042SErik.Nordmark@Sun.COM * to back-track up the tree looking for shorter matches. 
17370Sstevel@tonic-gate */ 173811042SErik.Nordmark@Sun.COM /* ARGSUSED */ 173911042SErik.Nordmark@Sun.COM static boolean_t 174011042SErik.Nordmark@Sun.COM ire_update_generation(struct radix_node *rn, void *arg) 17410Sstevel@tonic-gate { 174211042SErik.Nordmark@Sun.COM struct rt_entry *rt = (struct rt_entry *)rn; 174311042SErik.Nordmark@Sun.COM 174411042SErik.Nordmark@Sun.COM /* We need to handle all in the same bucket */ 174511042SErik.Nordmark@Sun.COM irb_increment_generation(&rt->rt_irb); 174611042SErik.Nordmark@Sun.COM return (B_FALSE); 17470Sstevel@tonic-gate } 17480Sstevel@tonic-gate 17490Sstevel@tonic-gate /* 175011042SErik.Nordmark@Sun.COM * Take care of all the generation numbers in the bucket. 175111042SErik.Nordmark@Sun.COM */ 175211042SErik.Nordmark@Sun.COM void 175311042SErik.Nordmark@Sun.COM irb_increment_generation(irb_t *irb) 175411042SErik.Nordmark@Sun.COM { 175511042SErik.Nordmark@Sun.COM ire_t *ire; 175611042SErik.Nordmark@Sun.COM 175711042SErik.Nordmark@Sun.COM if (irb == NULL || irb->irb_ire_cnt == 0) 175811042SErik.Nordmark@Sun.COM return; 175911042SErik.Nordmark@Sun.COM 176011042SErik.Nordmark@Sun.COM irb_refhold(irb); 176111042SErik.Nordmark@Sun.COM for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { 176211042SErik.Nordmark@Sun.COM if (!IRE_IS_CONDEMNED(ire)) 176311042SErik.Nordmark@Sun.COM ire_increment_generation(ire); /* Ourselves */ 176411042SErik.Nordmark@Sun.COM ire_dep_incr_generation(ire); /* Dependants */ 176511042SErik.Nordmark@Sun.COM } 176611042SErik.Nordmark@Sun.COM irb_refrele(irb); 176711042SErik.Nordmark@Sun.COM } 176811042SErik.Nordmark@Sun.COM 176911042SErik.Nordmark@Sun.COM /* 177011042SErik.Nordmark@Sun.COM * When an IRE is added or deleted this routine is called to make sure 177111042SErik.Nordmark@Sun.COM * any caching of IRE information is notified or updated. 
17720Sstevel@tonic-gate * 17730Sstevel@tonic-gate * The flag argument indicates if the flush request is due to addition 177411042SErik.Nordmark@Sun.COM * of new route (IRE_FLUSH_ADD), deletion of old route (IRE_FLUSH_DELETE), 177511042SErik.Nordmark@Sun.COM * or a change to ire_gateway_addr (IRE_FLUSH_GWCHANGE). 17760Sstevel@tonic-gate */ 17770Sstevel@tonic-gate void 17780Sstevel@tonic-gate ire_flush_cache_v4(ire_t *ire, int flag) 17790Sstevel@tonic-gate { 178011042SErik.Nordmark@Sun.COM irb_t *irb = ire->ire_bucket; 178111042SErik.Nordmark@Sun.COM struct rt_entry *rt = IRB2RT(irb); 178211042SErik.Nordmark@Sun.COM ip_stack_t *ipst = ire->ire_ipst; 178311042SErik.Nordmark@Sun.COM 178411042SErik.Nordmark@Sun.COM /* 178511042SErik.Nordmark@Sun.COM * IRE_IF_CLONE ire's don't provide any new information 178611042SErik.Nordmark@Sun.COM * than the parent from which they are cloned, so don't 178711042SErik.Nordmark@Sun.COM * perturb the generation numbers. 178811042SErik.Nordmark@Sun.COM */ 178911042SErik.Nordmark@Sun.COM if (ire->ire_type & IRE_IF_CLONE) 17904714Ssowmini return; 17910Sstevel@tonic-gate 17920Sstevel@tonic-gate /* 179311042SErik.Nordmark@Sun.COM * Ensure that an ire_add during a lookup serializes the updates of the 179411042SErik.Nordmark@Sun.COM * generation numbers under the radix head lock so that the lookup gets 179511042SErik.Nordmark@Sun.COM * either the old ire and old generation number, or a new ire and new 179611042SErik.Nordmark@Sun.COM * generation number. 179711042SErik.Nordmark@Sun.COM */ 179811042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable); 179911042SErik.Nordmark@Sun.COM 180011042SErik.Nordmark@Sun.COM /* 180111042SErik.Nordmark@Sun.COM * If a route was just added, we need to notify everybody that 180211042SErik.Nordmark@Sun.COM * has cached an IRE_NOROUTE since there might now be a better 180311042SErik.Nordmark@Sun.COM * route for them. 
18040Sstevel@tonic-gate */ 180511042SErik.Nordmark@Sun.COM if (flag == IRE_FLUSH_ADD) { 180611042SErik.Nordmark@Sun.COM ire_increment_generation(ipst->ips_ire_reject_v4); 180711042SErik.Nordmark@Sun.COM ire_increment_generation(ipst->ips_ire_blackhole_v4); 180811042SErik.Nordmark@Sun.COM } 180911042SErik.Nordmark@Sun.COM 181011042SErik.Nordmark@Sun.COM /* Adding a default can't otherwise provide a better route */ 181111042SErik.Nordmark@Sun.COM if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) { 181211042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 18130Sstevel@tonic-gate return; 181411042SErik.Nordmark@Sun.COM } 181511042SErik.Nordmark@Sun.COM 181611042SErik.Nordmark@Sun.COM switch (flag) { 181711042SErik.Nordmark@Sun.COM case IRE_FLUSH_DELETE: 181811042SErik.Nordmark@Sun.COM case IRE_FLUSH_GWCHANGE: 18190Sstevel@tonic-gate /* 182011042SErik.Nordmark@Sun.COM * Update ire_generation for all ire_dep_children chains 182111042SErik.Nordmark@Sun.COM * starting with this IRE 18220Sstevel@tonic-gate */ 182311042SErik.Nordmark@Sun.COM ire_dep_incr_generation(ire); 182411042SErik.Nordmark@Sun.COM break; 182511042SErik.Nordmark@Sun.COM case IRE_FLUSH_ADD: 18260Sstevel@tonic-gate /* 182711042SErik.Nordmark@Sun.COM * Update the generation numbers of all shorter matching routes. 182811042SErik.Nordmark@Sun.COM * ire_update_generation takes care of the dependants by 182911042SErik.Nordmark@Sun.COM * using ire_dep_incr_generation. 18300Sstevel@tonic-gate */ 183111042SErik.Nordmark@Sun.COM (void) ipst->ips_ip_ftable->rnh_matchaddr_args(&rt->rt_dst, 183211042SErik.Nordmark@Sun.COM ipst->ips_ip_ftable, ire_update_generation, NULL); 183311042SErik.Nordmark@Sun.COM break; 18340Sstevel@tonic-gate } 183511042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 18360Sstevel@tonic-gate } 18370Sstevel@tonic-gate 18380Sstevel@tonic-gate /* 18390Sstevel@tonic-gate * Matches the arguments passed with the values in the ire. 
18400Sstevel@tonic-gate * 184111042SErik.Nordmark@Sun.COM * Note: for match types that match using "ill" passed in, ill 18420Sstevel@tonic-gate * must be checked for non-NULL before calling this routine. 18430Sstevel@tonic-gate */ 18442535Ssangeeta boolean_t 18450Sstevel@tonic-gate ire_match_args(ire_t *ire, ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, 184611042SErik.Nordmark@Sun.COM int type, const ill_t *ill, zoneid_t zoneid, 184711042SErik.Nordmark@Sun.COM const ts_label_t *tsl, int match_flags) 18480Sstevel@tonic-gate { 18490Sstevel@tonic-gate ill_t *ire_ill = NULL, *dst_ill; 185011042SErik.Nordmark@Sun.COM ip_stack_t *ipst = ire->ire_ipst; 18510Sstevel@tonic-gate 18520Sstevel@tonic-gate ASSERT(ire->ire_ipversion == IPV4_VERSION); 18530Sstevel@tonic-gate ASSERT((ire->ire_addr & ~ire->ire_mask) == 0); 18548485SPeter.Memishian@Sun.COM ASSERT((!(match_flags & MATCH_IRE_ILL)) || 185511042SErik.Nordmark@Sun.COM (ill != NULL && !ill->ill_isv6)); 18560Sstevel@tonic-gate 18570Sstevel@tonic-gate /* 185811042SErik.Nordmark@Sun.COM * If MATCH_IRE_TESTHIDDEN is set, then only return the IRE if it is 185911042SErik.Nordmark@Sun.COM * in fact hidden, to ensure the caller gets the right one. 18600Sstevel@tonic-gate */ 186111042SErik.Nordmark@Sun.COM if (ire->ire_testhidden) { 186211042SErik.Nordmark@Sun.COM if (!(match_flags & MATCH_IRE_TESTHIDDEN)) 18638485SPeter.Memishian@Sun.COM return (B_FALSE); 18648485SPeter.Memishian@Sun.COM } 18650Sstevel@tonic-gate 18661676Sjpk if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid && 18671676Sjpk ire->ire_zoneid != ALL_ZONES) { 18680Sstevel@tonic-gate /* 186911042SErik.Nordmark@Sun.COM * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid 187011042SErik.Nordmark@Sun.COM * does not match that of ire_zoneid, a failure to 18710Sstevel@tonic-gate * match is reported at this point. 
Otherwise, since some IREs 18720Sstevel@tonic-gate * that are available in the global zone can be used in local 18730Sstevel@tonic-gate * zones, additional checks need to be performed: 18740Sstevel@tonic-gate * 187511042SErik.Nordmark@Sun.COM * IRE_LOOPBACK 18760Sstevel@tonic-gate * entries should never be matched in this situation. 187711042SErik.Nordmark@Sun.COM * Each zone has its own IRE_LOOPBACK. 18780Sstevel@tonic-gate * 187911042SErik.Nordmark@Sun.COM * IRE_LOCAL 188011042SErik.Nordmark@Sun.COM * We allow them for any zoneid. ire_route_recursive 188111042SErik.Nordmark@Sun.COM * does additional checks when 188211042SErik.Nordmark@Sun.COM * ip_restrict_interzone_loopback is set. 18830Sstevel@tonic-gate * 188411042SErik.Nordmark@Sun.COM * If ill_usesrc_ifindex is set 188511042SErik.Nordmark@Sun.COM * Then we check if the zone has a valid source address 188611042SErik.Nordmark@Sun.COM * on the usesrc ill. 18870Sstevel@tonic-gate * 188811042SErik.Nordmark@Sun.COM * If ire_ill is set, then check that the zone has an ipif 188911042SErik.Nordmark@Sun.COM * on that ill. 189011042SErik.Nordmark@Sun.COM * 189111042SErik.Nordmark@Sun.COM * Outside of this function (in ire_round_robin) we check 189211042SErik.Nordmark@Sun.COM * that any IRE_OFFLINK has a gateway that reachable from the 189311042SErik.Nordmark@Sun.COM * zone when we have multiple choices (ECMP). 18940Sstevel@tonic-gate */ 18950Sstevel@tonic-gate if (match_flags & MATCH_IRE_ZONEONLY) 18960Sstevel@tonic-gate return (B_FALSE); 189711042SErik.Nordmark@Sun.COM if (ire->ire_type & IRE_LOOPBACK) 18980Sstevel@tonic-gate return (B_FALSE); 189911042SErik.Nordmark@Sun.COM 190011042SErik.Nordmark@Sun.COM if (ire->ire_type & IRE_LOCAL) 190111042SErik.Nordmark@Sun.COM goto matchit; 190211042SErik.Nordmark@Sun.COM 19030Sstevel@tonic-gate /* 190411042SErik.Nordmark@Sun.COM * The normal case of IRE_ONLINK has a matching zoneid. 
190511042SErik.Nordmark@Sun.COM * Here we handle the case when shared-IP zones have been 190611042SErik.Nordmark@Sun.COM * configured with IP addresses on vniN. In that case it 190711042SErik.Nordmark@Sun.COM * is ok for traffic from a zone to use IRE_ONLINK routes 190811042SErik.Nordmark@Sun.COM * if the ill has a usesrc pointing at vniN 19090Sstevel@tonic-gate */ 191011042SErik.Nordmark@Sun.COM dst_ill = ire->ire_ill; 191111042SErik.Nordmark@Sun.COM if (ire->ire_type & IRE_ONLINK) { 191211042SErik.Nordmark@Sun.COM uint_t ifindex; 191311042SErik.Nordmark@Sun.COM 191411042SErik.Nordmark@Sun.COM /* 191511042SErik.Nordmark@Sun.COM * Note there is no IRE_INTERFACE on vniN thus 191611042SErik.Nordmark@Sun.COM * can't do an IRE lookup for a matching route. 191711042SErik.Nordmark@Sun.COM */ 191811042SErik.Nordmark@Sun.COM ifindex = dst_ill->ill_usesrc_ifindex; 191911042SErik.Nordmark@Sun.COM if (ifindex == 0) 192011042SErik.Nordmark@Sun.COM return (B_FALSE); 192111042SErik.Nordmark@Sun.COM 19220Sstevel@tonic-gate /* 19230Sstevel@tonic-gate * If there is a usable source address in the 192411042SErik.Nordmark@Sun.COM * zone, then it's ok to return this IRE_INTERFACE 19250Sstevel@tonic-gate */ 192611042SErik.Nordmark@Sun.COM if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6, 192711042SErik.Nordmark@Sun.COM zoneid, ipst)) { 192811042SErik.Nordmark@Sun.COM ip3dbg(("ire_match_args: no usrsrc for zone" 19290Sstevel@tonic-gate " dst_ill %p\n", (void *)dst_ill)); 19300Sstevel@tonic-gate return (B_FALSE); 19310Sstevel@tonic-gate } 19320Sstevel@tonic-gate } 193311042SErik.Nordmark@Sun.COM /* 193411042SErik.Nordmark@Sun.COM * For exampe, with 193511042SErik.Nordmark@Sun.COM * route add 11.0.0.0 gw1 -ifp bge0 193611042SErik.Nordmark@Sun.COM * route add 11.0.0.0 gw2 -ifp bge1 193711042SErik.Nordmark@Sun.COM * this code would differentiate based on 193811042SErik.Nordmark@Sun.COM * where the sending zone has addresses. 
193911042SErik.Nordmark@Sun.COM * Only if the zone has an address on bge0 can it use the first 194011042SErik.Nordmark@Sun.COM * route. It isn't clear if this behavior is documented 194111042SErik.Nordmark@Sun.COM * anywhere. 194211042SErik.Nordmark@Sun.COM */ 194311042SErik.Nordmark@Sun.COM if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) { 19440Sstevel@tonic-gate ipif_t *tipif; 19450Sstevel@tonic-gate 194611042SErik.Nordmark@Sun.COM mutex_enter(&dst_ill->ill_lock); 194711042SErik.Nordmark@Sun.COM for (tipif = dst_ill->ill_ipif; 19480Sstevel@tonic-gate tipif != NULL; tipif = tipif->ipif_next) { 194911042SErik.Nordmark@Sun.COM if (!IPIF_IS_CONDEMNED(tipif) && 19500Sstevel@tonic-gate (tipif->ipif_flags & IPIF_UP) && 19511676Sjpk (tipif->ipif_zoneid == zoneid || 19521676Sjpk tipif->ipif_zoneid == ALL_ZONES)) 19530Sstevel@tonic-gate break; 19540Sstevel@tonic-gate } 195511042SErik.Nordmark@Sun.COM mutex_exit(&dst_ill->ill_lock); 19560Sstevel@tonic-gate if (tipif == NULL) { 19570Sstevel@tonic-gate return (B_FALSE); 19580Sstevel@tonic-gate } 19590Sstevel@tonic-gate } 19600Sstevel@tonic-gate } 19610Sstevel@tonic-gate 196211042SErik.Nordmark@Sun.COM matchit: 19638485SPeter.Memishian@Sun.COM if (match_flags & MATCH_IRE_ILL) { 196411042SErik.Nordmark@Sun.COM ire_ill = ire->ire_ill; 196511042SErik.Nordmark@Sun.COM 196611042SErik.Nordmark@Sun.COM /* 196711042SErik.Nordmark@Sun.COM * If asked to match an ill, we *must* match 196811042SErik.Nordmark@Sun.COM * on the ire_ill for ipmp test addresses, or 196911042SErik.Nordmark@Sun.COM * any of the ill in the group for data addresses. 197011042SErik.Nordmark@Sun.COM * If we don't, we may as well fail. 197111042SErik.Nordmark@Sun.COM * However, we need an exception for IRE_LOCALs to ensure 197211042SErik.Nordmark@Sun.COM * we loopback packets even sent to test addresses on different 197311042SErik.Nordmark@Sun.COM * interfaces in the group. 
197411042SErik.Nordmark@Sun.COM */ 197511042SErik.Nordmark@Sun.COM if ((match_flags & MATCH_IRE_TESTHIDDEN) && 197611042SErik.Nordmark@Sun.COM !(ire->ire_type & IRE_LOCAL)) { 197711042SErik.Nordmark@Sun.COM if (ire->ire_ill != ill) 197811042SErik.Nordmark@Sun.COM return (B_FALSE); 197911042SErik.Nordmark@Sun.COM } else { 198011042SErik.Nordmark@Sun.COM match_flags &= ~MATCH_IRE_TESTHIDDEN; 198111042SErik.Nordmark@Sun.COM /* 198211042SErik.Nordmark@Sun.COM * We know that ill is not NULL, but ire_ill could be 198311042SErik.Nordmark@Sun.COM * NULL 198411042SErik.Nordmark@Sun.COM */ 198511042SErik.Nordmark@Sun.COM if (ire_ill == NULL || !IS_ON_SAME_LAN(ill, ire_ill)) 198611042SErik.Nordmark@Sun.COM return (B_FALSE); 198711042SErik.Nordmark@Sun.COM } 19880Sstevel@tonic-gate } 19890Sstevel@tonic-gate 19900Sstevel@tonic-gate if ((ire->ire_addr == (addr & mask)) && 19910Sstevel@tonic-gate ((!(match_flags & MATCH_IRE_GW)) || 19924714Ssowmini (ire->ire_gateway_addr == gateway)) && 199311042SErik.Nordmark@Sun.COM ((!(match_flags & MATCH_IRE_TYPE)) || (ire->ire_type & type)) && 199411042SErik.Nordmark@Sun.COM ((!(match_flags & MATCH_IRE_TESTHIDDEN)) || ire->ire_testhidden) && 199511042SErik.Nordmark@Sun.COM ((!(match_flags & MATCH_IRE_MASK)) || (ire->ire_mask == mask)) && 19961676Sjpk ((!(match_flags & MATCH_IRE_SECATTR)) || 19974714Ssowmini (!is_system_labeled()) || 19984714Ssowmini (tsol_ire_match_gwattr(ire, tsl) == 0))) { 19990Sstevel@tonic-gate /* We found the matched IRE */ 20000Sstevel@tonic-gate return (B_TRUE); 20010Sstevel@tonic-gate } 20020Sstevel@tonic-gate return (B_FALSE); 20030Sstevel@tonic-gate } 20040Sstevel@tonic-gate 20050Sstevel@tonic-gate /* 200611042SErik.Nordmark@Sun.COM * Check if the IRE_LOCAL uses the same ill as another route would use. 200711042SErik.Nordmark@Sun.COM * If there is no alternate route, or the alternate is a REJECT or BLACKHOLE, 200811042SErik.Nordmark@Sun.COM * then we don't allow this IRE_LOCAL to be used. 
200911042SErik.Nordmark@Sun.COM * We always return an IRE; will be RTF_REJECT if no route available. 20100Sstevel@tonic-gate */ 20110Sstevel@tonic-gate ire_t * 201211042SErik.Nordmark@Sun.COM ire_alt_local(ire_t *ire, zoneid_t zoneid, const ts_label_t *tsl, 201311042SErik.Nordmark@Sun.COM const ill_t *ill, uint_t *generationp) 20140Sstevel@tonic-gate { 201511042SErik.Nordmark@Sun.COM ip_stack_t *ipst = ire->ire_ipst; 201611042SErik.Nordmark@Sun.COM ire_t *alt_ire; 201711042SErik.Nordmark@Sun.COM uint_t ire_type; 201811042SErik.Nordmark@Sun.COM uint_t generation; 201911042SErik.Nordmark@Sun.COM uint_t match_flags; 202011042SErik.Nordmark@Sun.COM 202111042SErik.Nordmark@Sun.COM ASSERT(ire->ire_type & IRE_LOCAL); 202211042SErik.Nordmark@Sun.COM ASSERT(ire->ire_ill != NULL); 20230Sstevel@tonic-gate 20240Sstevel@tonic-gate /* 202511042SErik.Nordmark@Sun.COM * Need to match on everything but local. 202611042SErik.Nordmark@Sun.COM * This might result in the creation of a IRE_IF_CLONE for the 202711042SErik.Nordmark@Sun.COM * same address as the IRE_LOCAL when restrict_interzone_loopback is 202811042SErik.Nordmark@Sun.COM * set. ire_add_*() ensures that the IRE_IF_CLONE are tail inserted 202911042SErik.Nordmark@Sun.COM * to make sure the IRE_LOCAL is always found first. 
20300Sstevel@tonic-gate */ 203111042SErik.Nordmark@Sun.COM ire_type = (IRE_ONLINK | IRE_OFFLINK) & ~(IRE_LOCAL|IRE_LOOPBACK); 203211042SErik.Nordmark@Sun.COM match_flags = MATCH_IRE_TYPE | MATCH_IRE_SECATTR; 203311042SErik.Nordmark@Sun.COM if (ill != NULL) 203411042SErik.Nordmark@Sun.COM match_flags |= MATCH_IRE_ILL; 203511042SErik.Nordmark@Sun.COM 203611042SErik.Nordmark@Sun.COM if (ire->ire_ipversion == IPV4_VERSION) { 203711042SErik.Nordmark@Sun.COM alt_ire = ire_route_recursive_v4(ire->ire_addr, ire_type, 2038*11457SErik.Nordmark@Sun.COM ill, zoneid, tsl, match_flags, IRR_ALLOCATE, 0, ipst, NULL, 2039*11457SErik.Nordmark@Sun.COM NULL, &generation); 204011042SErik.Nordmark@Sun.COM } else { 204111042SErik.Nordmark@Sun.COM alt_ire = ire_route_recursive_v6(&ire->ire_addr_v6, ire_type, 2042*11457SErik.Nordmark@Sun.COM ill, zoneid, tsl, match_flags, IRR_ALLOCATE, 0, ipst, NULL, 2043*11457SErik.Nordmark@Sun.COM NULL, &generation); 20440Sstevel@tonic-gate } 204511042SErik.Nordmark@Sun.COM ASSERT(alt_ire != NULL); 204611042SErik.Nordmark@Sun.COM 204711042SErik.Nordmark@Sun.COM if (alt_ire->ire_ill == ire->ire_ill) { 204811042SErik.Nordmark@Sun.COM /* Going out the same ILL - ok to send to IRE_LOCAL */ 204911042SErik.Nordmark@Sun.COM ire_refrele(alt_ire); 205011042SErik.Nordmark@Sun.COM } else { 205111042SErik.Nordmark@Sun.COM /* Different ill - ignore IRE_LOCAL */ 205211042SErik.Nordmark@Sun.COM ire_refrele(ire); 205311042SErik.Nordmark@Sun.COM ire = alt_ire; 205411042SErik.Nordmark@Sun.COM if (generationp != NULL) 205511042SErik.Nordmark@Sun.COM *generationp = generation; 20560Sstevel@tonic-gate } 20570Sstevel@tonic-gate return (ire); 20580Sstevel@tonic-gate } 20590Sstevel@tonic-gate 206011042SErik.Nordmark@Sun.COM boolean_t 206111042SErik.Nordmark@Sun.COM ire_find_zoneid(struct radix_node *rn, void *arg) 20621676Sjpk { 206311042SErik.Nordmark@Sun.COM struct rt_entry *rt = (struct rt_entry *)rn; 20641676Sjpk irb_t *irb; 20651676Sjpk ire_t *ire; 
206611042SErik.Nordmark@Sun.COM ire_ftable_args_t *margs = arg; 206711042SErik.Nordmark@Sun.COM 206811042SErik.Nordmark@Sun.COM ASSERT(rt != NULL); 206911042SErik.Nordmark@Sun.COM 207011042SErik.Nordmark@Sun.COM irb = &rt->rt_irb; 207111042SErik.Nordmark@Sun.COM 207211042SErik.Nordmark@Sun.COM if (irb->irb_ire_cnt == 0) 207311042SErik.Nordmark@Sun.COM return (B_FALSE); 207411042SErik.Nordmark@Sun.COM 207511042SErik.Nordmark@Sun.COM rw_enter(&irb->irb_lock, RW_READER); 20761676Sjpk for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { 207711042SErik.Nordmark@Sun.COM if (IRE_IS_CONDEMNED(ire)) 20781676Sjpk continue; 20791676Sjpk 208011131SErik.Nordmark@Sun.COM if (!(ire->ire_type & IRE_INTERFACE)) 208111131SErik.Nordmark@Sun.COM continue; 208211131SErik.Nordmark@Sun.COM 208311042SErik.Nordmark@Sun.COM if (ire->ire_zoneid != ALL_ZONES && 208411042SErik.Nordmark@Sun.COM ire->ire_zoneid != margs->ift_zoneid) 208511042SErik.Nordmark@Sun.COM continue; 208611042SErik.Nordmark@Sun.COM 208711042SErik.Nordmark@Sun.COM if (margs->ift_ill != NULL && margs->ift_ill != ire->ire_ill) 208811042SErik.Nordmark@Sun.COM continue; 208911042SErik.Nordmark@Sun.COM 209011042SErik.Nordmark@Sun.COM if (is_system_labeled() && 209111042SErik.Nordmark@Sun.COM tsol_ire_match_gwattr(ire, margs->ift_tsl) != 0) 209211042SErik.Nordmark@Sun.COM continue; 209311042SErik.Nordmark@Sun.COM 209411042SErik.Nordmark@Sun.COM rw_exit(&irb->irb_lock); 209511042SErik.Nordmark@Sun.COM return (B_TRUE); 20961676Sjpk } 209711042SErik.Nordmark@Sun.COM rw_exit(&irb->irb_lock); 209811042SErik.Nordmark@Sun.COM return (B_FALSE); 20992733Snordmark } 21002733Snordmark 21012733Snordmark /* 210211042SErik.Nordmark@Sun.COM * Check if the zoneid (not ALL_ZONES) has an IRE_INTERFACE for the specified 210311042SErik.Nordmark@Sun.COM * gateway address. If ill is non-NULL we also match on it. 210411042SErik.Nordmark@Sun.COM * The caller must hold a read lock on RADIX_NODE_HEAD if lock_held is set. 
21050Sstevel@tonic-gate */ 210611042SErik.Nordmark@Sun.COM boolean_t 210711042SErik.Nordmark@Sun.COM ire_gateway_ok_zone_v4(ipaddr_t gateway, zoneid_t zoneid, ill_t *ill, 210811042SErik.Nordmark@Sun.COM const ts_label_t *tsl, ip_stack_t *ipst, boolean_t lock_held) 21090Sstevel@tonic-gate { 211011042SErik.Nordmark@Sun.COM struct rt_sockaddr rdst; 211111042SErik.Nordmark@Sun.COM struct rt_entry *rt; 211211042SErik.Nordmark@Sun.COM ire_ftable_args_t margs; 211311042SErik.Nordmark@Sun.COM 211411042SErik.Nordmark@Sun.COM ASSERT(ill == NULL || !ill->ill_isv6); 211511042SErik.Nordmark@Sun.COM if (lock_held) 211611042SErik.Nordmark@Sun.COM ASSERT(RW_READ_HELD(&ipst->ips_ip_ftable->rnh_lock)); 211711042SErik.Nordmark@Sun.COM else 211811042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 211911042SErik.Nordmark@Sun.COM 212011131SErik.Nordmark@Sun.COM bzero(&rdst, sizeof (rdst)); 212111042SErik.Nordmark@Sun.COM rdst.rt_sin_len = sizeof (rdst); 212211042SErik.Nordmark@Sun.COM rdst.rt_sin_family = AF_INET; 212311042SErik.Nordmark@Sun.COM rdst.rt_sin_addr.s_addr = gateway; 21248275SEric Cheng 21258275SEric Cheng /* 212611042SErik.Nordmark@Sun.COM * We only use margs for ill, zoneid, and tsl matching in 212711042SErik.Nordmark@Sun.COM * ire_find_zoneid 21288275SEric Cheng */ 212911131SErik.Nordmark@Sun.COM bzero(&margs, sizeof (margs)); 213011042SErik.Nordmark@Sun.COM margs.ift_ill = ill; 213111042SErik.Nordmark@Sun.COM margs.ift_zoneid = zoneid; 213211042SErik.Nordmark@Sun.COM margs.ift_tsl = tsl; 213311042SErik.Nordmark@Sun.COM rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst, 213411042SErik.Nordmark@Sun.COM ipst->ips_ip_ftable, ire_find_zoneid, (void *)&margs); 213511042SErik.Nordmark@Sun.COM 213611042SErik.Nordmark@Sun.COM if (!lock_held) 213711042SErik.Nordmark@Sun.COM RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 213811042SErik.Nordmark@Sun.COM 213911042SErik.Nordmark@Sun.COM return (rt != NULL); 21408275SEric Cheng } 21418275SEric 
Cheng 21420Sstevel@tonic-gate /* 214311042SErik.Nordmark@Sun.COM * ire_walk routine to delete a fraction of redirect IREs and IRE_CLONE_IF IREs. 214411042SErik.Nordmark@Sun.COM * The fraction argument tells us what fraction of the IREs to delete. 214511042SErik.Nordmark@Sun.COM * Common for IPv4 and IPv6. 214611042SErik.Nordmark@Sun.COM * Used when memory backpressure. 21470Sstevel@tonic-gate */ 214811042SErik.Nordmark@Sun.COM static void 214911042SErik.Nordmark@Sun.COM ire_delete_reclaim(ire_t *ire, char *arg) 21500Sstevel@tonic-gate { 215111042SErik.Nordmark@Sun.COM ip_stack_t *ipst = ire->ire_ipst; 215211042SErik.Nordmark@Sun.COM uint_t fraction = *(uint_t *)arg; 215311042SErik.Nordmark@Sun.COM uint_t rand; 215411042SErik.Nordmark@Sun.COM 215511042SErik.Nordmark@Sun.COM if ((ire->ire_flags & RTF_DYNAMIC) || 215611042SErik.Nordmark@Sun.COM (ire->ire_type & IRE_IF_CLONE)) { 215711042SErik.Nordmark@Sun.COM 215811042SErik.Nordmark@Sun.COM /* Pick a random number */ 215911066Srafael.vanoni@sun.com rand = (uint_t)ddi_get_lbolt() + 216011042SErik.Nordmark@Sun.COM IRE_ADDR_HASH_V6(ire->ire_addr_v6, 256); 216111042SErik.Nordmark@Sun.COM 216211042SErik.Nordmark@Sun.COM /* Use truncation */ 216311042SErik.Nordmark@Sun.COM if ((rand/fraction)*fraction == rand) { 216411042SErik.Nordmark@Sun.COM IP_STAT(ipst, ip_ire_reclaim_deleted); 216511042SErik.Nordmark@Sun.COM ire_delete(ire); 216611042SErik.Nordmark@Sun.COM } 216711042SErik.Nordmark@Sun.COM } 216811042SErik.Nordmark@Sun.COM 21690Sstevel@tonic-gate } 21700Sstevel@tonic-gate 21710Sstevel@tonic-gate /* 217211042SErik.Nordmark@Sun.COM * kmem_cache callback to free up memory. 21730Sstevel@tonic-gate * 217411042SErik.Nordmark@Sun.COM * Free a fraction (ips_ip_ire_reclaim_fraction) of things IP added dynamically 217511042SErik.Nordmark@Sun.COM * (RTF_DYNAMIC and IRE_IF_CLONE). 
21760Sstevel@tonic-gate */ 217711042SErik.Nordmark@Sun.COM static void 217811042SErik.Nordmark@Sun.COM ip_ire_reclaim_stack(ip_stack_t *ipst) 21790Sstevel@tonic-gate { 218011042SErik.Nordmark@Sun.COM uint_t fraction = ipst->ips_ip_ire_reclaim_fraction; 218111042SErik.Nordmark@Sun.COM 218211042SErik.Nordmark@Sun.COM IP_STAT(ipst, ip_ire_reclaim_calls); 218311042SErik.Nordmark@Sun.COM 218411042SErik.Nordmark@Sun.COM ire_walk(ire_delete_reclaim, &fraction, ipst); 21858485SPeter.Memishian@Sun.COM 21868485SPeter.Memishian@Sun.COM /* 218711042SErik.Nordmark@Sun.COM * Walk all CONNs that can have a reference on an ire, nce or dce. 218811042SErik.Nordmark@Sun.COM * Get them to update any stale references to drop any refholds they 218911042SErik.Nordmark@Sun.COM * have. 21908485SPeter.Memishian@Sun.COM */ 219111042SErik.Nordmark@Sun.COM ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst); 21920Sstevel@tonic-gate } 21930Sstevel@tonic-gate 21940Sstevel@tonic-gate /* 219511042SErik.Nordmark@Sun.COM * Called by the memory allocator subsystem directly, when the system 219611042SErik.Nordmark@Sun.COM * is running low on memory. 
21970Sstevel@tonic-gate */ 219811042SErik.Nordmark@Sun.COM /* ARGSUSED */ 21990Sstevel@tonic-gate void 220011042SErik.Nordmark@Sun.COM ip_ire_reclaim(void *args) 22010Sstevel@tonic-gate { 220211042SErik.Nordmark@Sun.COM netstack_handle_t nh; 220311042SErik.Nordmark@Sun.COM netstack_t *ns; 220411042SErik.Nordmark@Sun.COM 220511042SErik.Nordmark@Sun.COM netstack_next_init(&nh); 220611042SErik.Nordmark@Sun.COM while ((ns = netstack_next(&nh)) != NULL) { 220711042SErik.Nordmark@Sun.COM ip_ire_reclaim_stack(ns->netstack_ip); 220811042SErik.Nordmark@Sun.COM netstack_rele(ns); 22090Sstevel@tonic-gate } 221011042SErik.Nordmark@Sun.COM netstack_next_fini(&nh); 22110Sstevel@tonic-gate } 22120Sstevel@tonic-gate 22130Sstevel@tonic-gate static void 22140Sstevel@tonic-gate power2_roundup(uint32_t *value) 22150Sstevel@tonic-gate { 22160Sstevel@tonic-gate int i; 22170Sstevel@tonic-gate 22180Sstevel@tonic-gate for (i = 1; i < 31; i++) { 22190Sstevel@tonic-gate if (*value <= (1 << i)) 22200Sstevel@tonic-gate break; 22210Sstevel@tonic-gate } 22220Sstevel@tonic-gate *value = (1 << i); 22230Sstevel@tonic-gate } 22240Sstevel@tonic-gate 22253448Sdh155122 /* Global init for all zones */ 22260Sstevel@tonic-gate void 22273448Sdh155122 ip_ire_g_init() 22280Sstevel@tonic-gate { 22290Sstevel@tonic-gate /* 223011042SErik.Nordmark@Sun.COM * Create kmem_caches. ip_ire_reclaim() and ip_nce_reclaim() 223111042SErik.Nordmark@Sun.COM * will give disposable IREs back to system when needed. 22320Sstevel@tonic-gate * This needs to be done here before anything else, since 22330Sstevel@tonic-gate * ire_add() expects the cache to be created. 
22340Sstevel@tonic-gate */ 22350Sstevel@tonic-gate ire_cache = kmem_cache_create("ire_cache", 223611042SErik.Nordmark@Sun.COM sizeof (ire_t), 0, NULL, NULL, 223711042SErik.Nordmark@Sun.COM ip_ire_reclaim, NULL, NULL, 0); 223811042SErik.Nordmark@Sun.COM 223911042SErik.Nordmark@Sun.COM ncec_cache = kmem_cache_create("ncec_cache", 224011042SErik.Nordmark@Sun.COM sizeof (ncec_t), 0, NULL, NULL, 224111042SErik.Nordmark@Sun.COM ip_nce_reclaim, NULL, NULL, 0); 224211042SErik.Nordmark@Sun.COM nce_cache = kmem_cache_create("nce_cache", 224311042SErik.Nordmark@Sun.COM sizeof (nce_t), 0, NULL, NULL, 224411042SErik.Nordmark@Sun.COM NULL, NULL, NULL, 0); 22450Sstevel@tonic-gate 22463448Sdh155122 rt_entry_cache = kmem_cache_create("rt_entry", 22473448Sdh155122 sizeof (struct rt_entry), 0, NULL, NULL, NULL, NULL, NULL, 0); 22483448Sdh155122 22493448Sdh155122 /* 22503448Sdh155122 * Have radix code setup kmem caches etc. 22513448Sdh155122 */ 22523448Sdh155122 rn_init(); 22533448Sdh155122 } 22543448Sdh155122 22553448Sdh155122 void 22563448Sdh155122 ip_ire_init(ip_stack_t *ipst) 22573448Sdh155122 { 225811042SErik.Nordmark@Sun.COM ire_t *ire; 225911042SErik.Nordmark@Sun.COM int error; 22603448Sdh155122 22613448Sdh155122 mutex_init(&ipst->ips_ire_ft_init_lock, NULL, MUTEX_DEFAULT, 0); 22623448Sdh155122 22633448Sdh155122 (void) rn_inithead((void **)&ipst->ips_ip_ftable, 32); 22643448Sdh155122 22650Sstevel@tonic-gate /* 22660Sstevel@tonic-gate * Make sure that the forwarding table size is a power of 2. 22670Sstevel@tonic-gate * The IRE*_ADDR_HASH() macroes depend on that. 22680Sstevel@tonic-gate */ 22693448Sdh155122 ipst->ips_ip6_ftable_hash_size = ip6_ftable_hash_size; 22703448Sdh155122 power2_roundup(&ipst->ips_ip6_ftable_hash_size); 22713448Sdh155122 227211042SErik.Nordmark@Sun.COM /* 227311042SErik.Nordmark@Sun.COM * Allocate/initialize a pair of IRE_NOROUTEs for each of IPv4 and IPv6. 
227411042SErik.Nordmark@Sun.COM * The ire_reject_v* has RTF_REJECT set, and the ire_blackhole_v* has 227511042SErik.Nordmark@Sun.COM * RTF_BLACKHOLE set. We use the latter for transient errors such 227611042SErik.Nordmark@Sun.COM * as memory allocation failures and tripping on IRE_IS_CONDEMNED 227711042SErik.Nordmark@Sun.COM * entries. 227811042SErik.Nordmark@Sun.COM */ 227911042SErik.Nordmark@Sun.COM ire = kmem_cache_alloc(ire_cache, KM_SLEEP); 228011042SErik.Nordmark@Sun.COM *ire = ire_null; 228111042SErik.Nordmark@Sun.COM error = ire_init_v4(ire, 0, 0, 0, IRE_NOROUTE, NULL, ALL_ZONES, 228211042SErik.Nordmark@Sun.COM RTF_REJECT|RTF_UP, NULL, ipst); 228311042SErik.Nordmark@Sun.COM ASSERT(error == 0); 228411042SErik.Nordmark@Sun.COM ipst->ips_ire_reject_v4 = ire; 228511042SErik.Nordmark@Sun.COM 228611042SErik.Nordmark@Sun.COM ire = kmem_cache_alloc(ire_cache, KM_SLEEP); 228711042SErik.Nordmark@Sun.COM *ire = ire_null; 228811042SErik.Nordmark@Sun.COM error = ire_init_v6(ire, 0, 0, 0, IRE_NOROUTE, NULL, ALL_ZONES, 228911042SErik.Nordmark@Sun.COM RTF_REJECT|RTF_UP, NULL, ipst); 229011042SErik.Nordmark@Sun.COM ASSERT(error == 0); 229111042SErik.Nordmark@Sun.COM ipst->ips_ire_reject_v6 = ire; 229211042SErik.Nordmark@Sun.COM 229311042SErik.Nordmark@Sun.COM ire = kmem_cache_alloc(ire_cache, KM_SLEEP); 229411042SErik.Nordmark@Sun.COM *ire = ire_null; 229511042SErik.Nordmark@Sun.COM error = ire_init_v4(ire, 0, 0, 0, IRE_NOROUTE, NULL, ALL_ZONES, 229611042SErik.Nordmark@Sun.COM RTF_BLACKHOLE|RTF_UP, NULL, ipst); 229711042SErik.Nordmark@Sun.COM ASSERT(error == 0); 229811042SErik.Nordmark@Sun.COM ipst->ips_ire_blackhole_v4 = ire; 229911042SErik.Nordmark@Sun.COM 230011042SErik.Nordmark@Sun.COM ire = kmem_cache_alloc(ire_cache, KM_SLEEP); 230111042SErik.Nordmark@Sun.COM *ire = ire_null; 230211042SErik.Nordmark@Sun.COM error = ire_init_v6(ire, 0, 0, 0, IRE_NOROUTE, NULL, ALL_ZONES, 230311042SErik.Nordmark@Sun.COM RTF_BLACKHOLE|RTF_UP, NULL, ipst); 230411042SErik.Nordmark@Sun.COM 
ASSERT(error == 0); 230511042SErik.Nordmark@Sun.COM ipst->ips_ire_blackhole_v6 = ire; 230611042SErik.Nordmark@Sun.COM 230711042SErik.Nordmark@Sun.COM rw_init(&ipst->ips_ip6_ire_head_lock, NULL, RW_DEFAULT, NULL); 230811042SErik.Nordmark@Sun.COM rw_init(&ipst->ips_ire_dep_lock, NULL, RW_DEFAULT, NULL); 23093448Sdh155122 } 23103448Sdh155122 23113448Sdh155122 void 23123448Sdh155122 ip_ire_g_fini(void) 23133448Sdh155122 { 23143448Sdh155122 kmem_cache_destroy(ire_cache); 231511042SErik.Nordmark@Sun.COM kmem_cache_destroy(ncec_cache); 231611042SErik.Nordmark@Sun.COM kmem_cache_destroy(nce_cache); 23173448Sdh155122 kmem_cache_destroy(rt_entry_cache); 23183448Sdh155122 23193448Sdh155122 rn_fini(); 23200Sstevel@tonic-gate } 23210Sstevel@tonic-gate 23220Sstevel@tonic-gate void 23233448Sdh155122 ip_ire_fini(ip_stack_t *ipst) 23240Sstevel@tonic-gate { 23250Sstevel@tonic-gate int i; 23260Sstevel@tonic-gate 232711042SErik.Nordmark@Sun.COM rw_destroy(&ipst->ips_ire_dep_lock); 232811042SErik.Nordmark@Sun.COM rw_destroy(&ipst->ips_ip6_ire_head_lock); 232911042SErik.Nordmark@Sun.COM 233011042SErik.Nordmark@Sun.COM ire_refrele_notr(ipst->ips_ire_reject_v6); 233111042SErik.Nordmark@Sun.COM ipst->ips_ire_reject_v6 = NULL; 233211042SErik.Nordmark@Sun.COM ire_refrele_notr(ipst->ips_ire_reject_v4); 233311042SErik.Nordmark@Sun.COM ipst->ips_ire_reject_v4 = NULL; 233411042SErik.Nordmark@Sun.COM ire_refrele_notr(ipst->ips_ire_blackhole_v6); 233511042SErik.Nordmark@Sun.COM ipst->ips_ire_blackhole_v6 = NULL; 233611042SErik.Nordmark@Sun.COM ire_refrele_notr(ipst->ips_ire_blackhole_v4); 233711042SErik.Nordmark@Sun.COM ipst->ips_ire_blackhole_v4 = NULL; 233811042SErik.Nordmark@Sun.COM 23393448Sdh155122 /* 23403448Sdh155122 * Delete all IREs - assumes that the ill/ipifs have 234111042SErik.Nordmark@Sun.COM * been removed so what remains are just the ftable to handle. 
23423448Sdh155122 */ 23433448Sdh155122 ire_walk(ire_delete, NULL, ipst); 23443448Sdh155122 23453448Sdh155122 rn_freehead(ipst->ips_ip_ftable); 23463448Sdh155122 ipst->ips_ip_ftable = NULL; 23473448Sdh155122 23483448Sdh155122 mutex_destroy(&ipst->ips_ire_ft_init_lock); 23493448Sdh155122 23503448Sdh155122 for (i = 0; i < IP6_MASK_TABLE_SIZE; i++) { 23513448Sdh155122 irb_t *ptr; 23523448Sdh155122 int j; 23533448Sdh155122 23543448Sdh155122 if ((ptr = ipst->ips_ip_forwarding_table_v6[i]) == NULL) 23553448Sdh155122 continue; 23563448Sdh155122 23573448Sdh155122 for (j = 0; j < ipst->ips_ip6_ftable_hash_size; j++) { 23583448Sdh155122 ASSERT(ptr[j].irb_ire == NULL); 23593448Sdh155122 rw_destroy(&ptr[j].irb_lock); 23603448Sdh155122 } 23613448Sdh155122 mi_free(ptr); 23623448Sdh155122 ipst->ips_ip_forwarding_table_v6[i] = NULL; 23633448Sdh155122 } 23640Sstevel@tonic-gate } 23650Sstevel@tonic-gate 23665023Scarlsonj #ifdef DEBUG 23670Sstevel@tonic-gate void 23680Sstevel@tonic-gate ire_trace_ref(ire_t *ire) 23690Sstevel@tonic-gate { 23700Sstevel@tonic-gate mutex_enter(&ire->ire_lock); 23715023Scarlsonj if (ire->ire_trace_disable) { 23720Sstevel@tonic-gate mutex_exit(&ire->ire_lock); 23730Sstevel@tonic-gate return; 23740Sstevel@tonic-gate } 23755023Scarlsonj 23765023Scarlsonj if (th_trace_ref(ire, ire->ire_ipst)) { 23775023Scarlsonj mutex_exit(&ire->ire_lock); 23785023Scarlsonj } else { 23795023Scarlsonj ire->ire_trace_disable = B_TRUE; 23805023Scarlsonj mutex_exit(&ire->ire_lock); 23815023Scarlsonj ire_trace_cleanup(ire); 23820Sstevel@tonic-gate } 23830Sstevel@tonic-gate } 23840Sstevel@tonic-gate 23850Sstevel@tonic-gate void 23860Sstevel@tonic-gate ire_untrace_ref(ire_t *ire) 23870Sstevel@tonic-gate { 23880Sstevel@tonic-gate mutex_enter(&ire->ire_lock); 23895023Scarlsonj if (!ire->ire_trace_disable) 23905023Scarlsonj th_trace_unref(ire); 23910Sstevel@tonic-gate mutex_exit(&ire->ire_lock); 23920Sstevel@tonic-gate } 23930Sstevel@tonic-gate 23940Sstevel@tonic-gate static void 
23955023Scarlsonj ire_trace_cleanup(const ire_t *ire) 23960Sstevel@tonic-gate { 23975023Scarlsonj th_trace_cleanup(ire, ire->ire_trace_disable); 23980Sstevel@tonic-gate } 23995023Scarlsonj #endif /* DEBUG */ 24002535Ssangeeta 24012535Ssangeeta /* 240211042SErik.Nordmark@Sun.COM * Find, or create if needed, the nce_t pointer to the neighbor cache 240311042SErik.Nordmark@Sun.COM * entry ncec_t for an IPv4 address. The nce_t will be created on the ill_t 240411042SErik.Nordmark@Sun.COM * in the non-IPMP case, or on the cast-ill in the IPMP bcast/mcast case, or 240511042SErik.Nordmark@Sun.COM * on the next available under-ill (selected by the IPMP rotor) in the 240611042SErik.Nordmark@Sun.COM * unicast IPMP case. 240711042SErik.Nordmark@Sun.COM * 240811042SErik.Nordmark@Sun.COM * If a neighbor-cache entry has to be created (i.e., one does not already 240911042SErik.Nordmark@Sun.COM * exist in the nce list) the ncec_lladdr and ncec_state of the neighbor cache 241011042SErik.Nordmark@Sun.COM * entry are initialized in nce_add_v4(). The broadcast, multicast, and 241111042SErik.Nordmark@Sun.COM * link-layer type determine the contents of {ncec_state, ncec_lladdr} of 241211042SErik.Nordmark@Sun.COM * the ncec_t created. The ncec_lladdr is non-null for all link types with 241311042SErik.Nordmark@Sun.COM * non-zero ill_phys_addr_length, though the contents may be zero in cases 241411042SErik.Nordmark@Sun.COM * where the link-layer type is not known at the time of creation 241511042SErik.Nordmark@Sun.COM * (e.g., IRE_IFRESOLVER links) 241611042SErik.Nordmark@Sun.COM * 241711042SErik.Nordmark@Sun.COM * All IRE_BROADCAST entries have ncec_state = ND_REACHABLE, and the nce_lladr 241811042SErik.Nordmark@Sun.COM * has the physical broadcast address of the outgoing interface. 
 * For unicast ire entries,
 *   - if the outgoing interface is of type IRE_IF_RESOLVER, a newly created
 *     ncec_t will have zeroed ncec_lladdr contents and will be in the
 *     ND_INITIAL state.
 *   - if the outgoing interface is an IRE_IF_NORESOLVER interface, no link
 *     layer resolution is necessary, so the ncec_t will be in the
 *     ND_REACHABLE state.
 *
 * The link layer information needed for broadcast addresses, and for
 * packets sent on IRE_IF_NORESOLVER interfaces, is a constant mapping that
 * never needs re-verification for the lifetime of the ncec_t.  These are
 * therefore marked NCE_F_NONUD.
 *
 * The nce returned will be created such that the nce_ill == ill that
 * is passed in. Note that the nce itself may not have ncec_ill == ill
 * where IPMP links are involved.
*/
static nce_t *
ire_nce_init(ill_t *ill, const void *addr, int ire_type)
{
	int		err;
	nce_t		*nce = NULL;
	uint16_t	ncec_flags;
	uchar_t		*hwaddr;
	boolean_t	need_refrele = B_FALSE;
	ill_t		*in_ill = ill;	/* caller's ill, kept for re-selection */
	boolean_t	is_unicast;
	uint_t		hwaddr_len;

	/* Unicast here means neither the multicast nor broadcast bit is set */
	is_unicast = ((ire_type & (IRE_MULTICAST|IRE_BROADCAST)) == 0);
	if (IS_IPMP(ill) ||
	    ((ire_type & IRE_BROADCAST) && IS_UNDER_IPMP(ill))) {
		/*
		 * Switch to the ill chosen by IPMP. The result is released
		 * with ill_refrele() on every exit path below (tracked via
		 * need_refrele), so it is presumably returned held.
		 */
		if ((ill = ipmp_ill_get_xmit_ill(ill, is_unicast)) == NULL)
			return (NULL);
		need_refrele = B_TRUE;
	}
	/* Inherit "no NUD" from the ill; bcast/mcast force it on below */
	ncec_flags = (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0;

	switch (ire_type) {
	case IRE_BROADCAST:
		ASSERT(!ill->ill_isv6);
		ncec_flags |= (NCE_F_BCAST|NCE_F_NONUD);
		break;
	case IRE_MULTICAST:
		ncec_flags |= (NCE_F_MCAST|NCE_F_NONUD);
		break;
	}

	/*
	 * Only unicast entries on IRE_IF_NORESOLVER links are seeded with a
	 * link-layer address (the ill's ill_dest_addr); everything else is
	 * created with a NULL hwaddr.
	 */
	if (ill->ill_net_type == IRE_IF_NORESOLVER && is_unicast) {
		hwaddr = ill->ill_dest_addr;
	} else {
		hwaddr = NULL;
	}
	hwaddr_len = ill->ill_phys_addr_length;

retry:
	/* nce_state will be computed by nce_add_common() */
	if (!ill->ill_isv6) {
		err = nce_lookup_then_add_v4(ill, hwaddr, hwaddr_len, addr,
		    ncec_flags, ND_UNCHANGED, &nce);
	} else {
		err = nce_lookup_then_add_v6(ill, hwaddr, hwaddr_len, addr,
		    ncec_flags, ND_UNCHANGED, &nce);
	}

	switch (err) {
	case 0:
		break;
	case EEXIST:
		/*
		 * When subnets change or partially overlap what was once
		 * a broadcast address could now be a unicast, or vice versa.
		 */
		/*
		 * The existing entry disagrees with us on NCE_F_BCAST:
		 * delete it, drop our reference and look up/add again.
		 */
		if (((ncec_flags ^ nce->nce_common->ncec_flags) &
		    NCE_F_BCAST) != 0) {
			ASSERT(!ill->ill_isv6);
			ncec_delete(nce->nce_common);
			nce_refrele(nce);
			goto retry;
		}
		break;
	default:
		DTRACE_PROBE2(nce__init__fail, ill_t *, ill, int, err);
		if (need_refrele)
			ill_refrele(ill);
		return (NULL);
	}
	/*
	 * If the ill was an under-ill of an IPMP group, we need to verify
	 * that it is still active so that we select an active interface in
	 * the group. However, since ipmp_ill_is_active ASSERTs for
	 * IS_UNDER_IPMP(), we first need to verify that the ill is an
	 * under-ill, and since this is being done in the data path, the
	 * only way to ascertain this is by holding the ill_g_lock.
	 */
	/* Acquisition order: ips_ill_g_lock, then ill_lock, then phyint_lock */
	rw_enter(&ill->ill_ipst->ips_ill_g_lock, RW_READER);
	mutex_enter(&ill->ill_lock);
	mutex_enter(&ill->ill_phyint->phyint_lock);
	if (need_refrele && IS_UNDER_IPMP(ill) && !ipmp_ill_is_active(ill)) {
		/*
		 * need_refrele implies that the under ill was selected by
		 * ipmp_ill_get_xmit_ill() because either the in_ill was an
		 * ipmp_ill, or we are sending a non-unicast packet on
		 * an under_ill. However, when we get here, the ill selected by
		 * ipmp_ill_get_xmit_ill was pulled out of the active set
		 * (for unicast) or cast_ill nomination (for
		 * !unicast) after it was picked as the outgoing ill.
		 * We have to pick an active interface and/or cast_ill in the
		 * group.
		 */
		mutex_exit(&ill->ill_phyint->phyint_lock);
		/* nce_delete() is called with ill_lock still held */
		nce_delete(nce);
		mutex_exit(&ill->ill_lock);
		rw_exit(&ill->ill_ipst->ips_ill_g_lock);
		nce_refrele(nce);
		ill_refrele(ill);
		/* Re-select starting from the ill the caller passed in */
		if ((ill = ipmp_ill_get_xmit_ill(in_ill, is_unicast)) == NULL)
			return (NULL);
		/*
		 * NOTE(review): ncec_flags, hwaddr and hwaddr_len were
		 * derived from the previously selected ill (which has just
		 * been released) and are not recomputed before the retry --
		 * confirm this is intended.
		 */
		goto retry;
	} else {
		mutex_exit(&ill->ill_phyint->phyint_lock);
		mutex_exit(&ill->ill_lock);
		rw_exit(&ill->ill_ipst->ips_ill_g_lock);
	}
	/* NOTE(review): no goto in this function targets the label below */
done:
	ASSERT(nce->nce_ill == ill);
	if (need_refrele)
		ill_refrele(ill);
	return (nce);
}

/*
 * IPv4 entry point for ire_nce_init(): hands the address over by pointer.
 */
nce_t *
arp_nce_init(ill_t *ill, in_addr_t addr4, int ire_type)
{
	return (ire_nce_init(ill, &addr4, ire_type));
}

/*
 * IPv6 entry point for ire_nce_init(). IRE_BROADCAST is asserted never to
 * be requested here.
 */
nce_t *
ndp_nce_init(ill_t *ill, const in6_addr_t *addr6, int ire_type)
{
	ASSERT((ire_type & IRE_BROADCAST) == 0);
	return (ire_nce_init(ill, addr6, ire_type));
}

/*
 * The caller should hold irb_lock as a writer if the ire is in a
bucket. 256411042SErik.Nordmark@Sun.COM */ 256511042SErik.Nordmark@Sun.COM void 256611042SErik.Nordmark@Sun.COM ire_make_condemned(ire_t *ire) 256711042SErik.Nordmark@Sun.COM { 256811042SErik.Nordmark@Sun.COM ip_stack_t *ipst = ire->ire_ipst; 256911042SErik.Nordmark@Sun.COM 257011042SErik.Nordmark@Sun.COM mutex_enter(&ire->ire_lock); 257111042SErik.Nordmark@Sun.COM ASSERT(ire->ire_bucket == NULL || 257211042SErik.Nordmark@Sun.COM RW_WRITE_HELD(&ire->ire_bucket->irb_lock)); 257311042SErik.Nordmark@Sun.COM ASSERT(!IRE_IS_CONDEMNED(ire)); 257411042SErik.Nordmark@Sun.COM ire->ire_generation = IRE_GENERATION_CONDEMNED; 257511042SErik.Nordmark@Sun.COM /* Count how many condemned ires for kmem_cache callback */ 257611042SErik.Nordmark@Sun.COM atomic_add_32(&ipst->ips_num_ire_condemned, 1); 257711042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 257811042SErik.Nordmark@Sun.COM } 257911042SErik.Nordmark@Sun.COM 258011042SErik.Nordmark@Sun.COM /* 258111042SErik.Nordmark@Sun.COM * Increment the generation avoiding the special condemned value 258211042SErik.Nordmark@Sun.COM */ 258311042SErik.Nordmark@Sun.COM void 258411042SErik.Nordmark@Sun.COM ire_increment_generation(ire_t *ire) 258511042SErik.Nordmark@Sun.COM { 258611042SErik.Nordmark@Sun.COM uint_t generation; 258711042SErik.Nordmark@Sun.COM 258811042SErik.Nordmark@Sun.COM mutex_enter(&ire->ire_lock); 258911042SErik.Nordmark@Sun.COM /* 259011042SErik.Nordmark@Sun.COM * Even though the caller has a hold it can't prevent a concurrent 259111042SErik.Nordmark@Sun.COM * ire_delete marking the IRE condemned 259211042SErik.Nordmark@Sun.COM */ 259311042SErik.Nordmark@Sun.COM if (!IRE_IS_CONDEMNED(ire)) { 259411042SErik.Nordmark@Sun.COM generation = ire->ire_generation + 1; 259511042SErik.Nordmark@Sun.COM if (generation == IRE_GENERATION_CONDEMNED) 259611042SErik.Nordmark@Sun.COM generation = IRE_GENERATION_INITIAL; 259711042SErik.Nordmark@Sun.COM ASSERT(generation != IRE_GENERATION_VERIFY); 259811042SErik.Nordmark@Sun.COM 
ire->ire_generation = generation; 259911042SErik.Nordmark@Sun.COM } 260011042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 260111042SErik.Nordmark@Sun.COM } 260211042SErik.Nordmark@Sun.COM 260311042SErik.Nordmark@Sun.COM /* 260411042SErik.Nordmark@Sun.COM * Increment ire_generation on all the IRE_MULTICASTs 260511042SErik.Nordmark@Sun.COM * Used when the default multicast interface (as determined by 260611042SErik.Nordmark@Sun.COM * ill_lookup_multicast) might have changed. 260711042SErik.Nordmark@Sun.COM * 260811042SErik.Nordmark@Sun.COM * That includes the zoneid, IFF_ flags, the IPv6 scope of the address, and 260911042SErik.Nordmark@Sun.COM * ill unplumb. 26102535Ssangeeta */ 26112535Ssangeeta void 261211042SErik.Nordmark@Sun.COM ire_increment_multicast_generation(ip_stack_t *ipst, boolean_t isv6) 261311042SErik.Nordmark@Sun.COM { 261411042SErik.Nordmark@Sun.COM ill_t *ill; 261511042SErik.Nordmark@Sun.COM ill_walk_context_t ctx; 261611042SErik.Nordmark@Sun.COM 261711042SErik.Nordmark@Sun.COM rw_enter(&ipst->ips_ill_g_lock, RW_READER); 261811042SErik.Nordmark@Sun.COM if (isv6) 261911042SErik.Nordmark@Sun.COM ill = ILL_START_WALK_V6(&ctx, ipst); 262011042SErik.Nordmark@Sun.COM else 262111042SErik.Nordmark@Sun.COM ill = ILL_START_WALK_V4(&ctx, ipst); 262211042SErik.Nordmark@Sun.COM for (; ill != NULL; ill = ill_next(&ctx, ill)) { 262311042SErik.Nordmark@Sun.COM if (ILL_IS_CONDEMNED(ill)) 262411042SErik.Nordmark@Sun.COM continue; 262511042SErik.Nordmark@Sun.COM if (ill->ill_ire_multicast != NULL) 262611042SErik.Nordmark@Sun.COM ire_increment_generation(ill->ill_ire_multicast); 262711042SErik.Nordmark@Sun.COM } 262811042SErik.Nordmark@Sun.COM rw_exit(&ipst->ips_ill_g_lock); 262911042SErik.Nordmark@Sun.COM } 263011042SErik.Nordmark@Sun.COM 263111042SErik.Nordmark@Sun.COM /* 263211042SErik.Nordmark@Sun.COM * Return a held IRE_NOROUTE with RTF_REJECT set 263311042SErik.Nordmark@Sun.COM */ 263411042SErik.Nordmark@Sun.COM ire_t * 263511042SErik.Nordmark@Sun.COM 
ire_reject(ip_stack_t *ipst, boolean_t isv6) 263611042SErik.Nordmark@Sun.COM { 263711042SErik.Nordmark@Sun.COM ire_t *ire; 263811042SErik.Nordmark@Sun.COM 263911042SErik.Nordmark@Sun.COM if (isv6) 264011042SErik.Nordmark@Sun.COM ire = ipst->ips_ire_reject_v6; 264111042SErik.Nordmark@Sun.COM else 264211042SErik.Nordmark@Sun.COM ire = ipst->ips_ire_reject_v4; 264311042SErik.Nordmark@Sun.COM 264411042SErik.Nordmark@Sun.COM ASSERT(ire->ire_generation != IRE_GENERATION_CONDEMNED); 264511042SErik.Nordmark@Sun.COM ire_refhold(ire); 264611042SErik.Nordmark@Sun.COM return (ire); 264711042SErik.Nordmark@Sun.COM } 264811042SErik.Nordmark@Sun.COM 264911042SErik.Nordmark@Sun.COM /* 265011042SErik.Nordmark@Sun.COM * Return a held IRE_NOROUTE with RTF_BLACKHOLE set 265111042SErik.Nordmark@Sun.COM */ 265211042SErik.Nordmark@Sun.COM ire_t * 265311042SErik.Nordmark@Sun.COM ire_blackhole(ip_stack_t *ipst, boolean_t isv6) 265411042SErik.Nordmark@Sun.COM { 265511042SErik.Nordmark@Sun.COM ire_t *ire; 265611042SErik.Nordmark@Sun.COM 265711042SErik.Nordmark@Sun.COM if (isv6) 265811042SErik.Nordmark@Sun.COM ire = ipst->ips_ire_blackhole_v6; 265911042SErik.Nordmark@Sun.COM else 266011042SErik.Nordmark@Sun.COM ire = ipst->ips_ire_blackhole_v4; 266111042SErik.Nordmark@Sun.COM 266211042SErik.Nordmark@Sun.COM ASSERT(ire->ire_generation != IRE_GENERATION_CONDEMNED); 266311042SErik.Nordmark@Sun.COM ire_refhold(ire); 266411042SErik.Nordmark@Sun.COM return (ire); 266511042SErik.Nordmark@Sun.COM } 266611042SErik.Nordmark@Sun.COM 266711042SErik.Nordmark@Sun.COM /* 266811042SErik.Nordmark@Sun.COM * Return a held IRE_MULTICAST. 
266911042SErik.Nordmark@Sun.COM */ 267011042SErik.Nordmark@Sun.COM ire_t * 267111042SErik.Nordmark@Sun.COM ire_multicast(ill_t *ill) 267211042SErik.Nordmark@Sun.COM { 267311042SErik.Nordmark@Sun.COM ire_t *ire = ill->ill_ire_multicast; 267411042SErik.Nordmark@Sun.COM 267511042SErik.Nordmark@Sun.COM ASSERT(ire == NULL || ire->ire_generation != IRE_GENERATION_CONDEMNED); 267611042SErik.Nordmark@Sun.COM if (ire == NULL) 267711042SErik.Nordmark@Sun.COM ire = ire_blackhole(ill->ill_ipst, ill->ill_isv6); 267811042SErik.Nordmark@Sun.COM else 267911042SErik.Nordmark@Sun.COM ire_refhold(ire); 268011042SErik.Nordmark@Sun.COM return (ire); 268111042SErik.Nordmark@Sun.COM } 268211042SErik.Nordmark@Sun.COM 268311042SErik.Nordmark@Sun.COM /* 268411042SErik.Nordmark@Sun.COM * Given an IRE return its nexthop IRE. The nexthop IRE is an IRE_ONLINK 268511042SErik.Nordmark@Sun.COM * that is an exact match (i.e., a /32 for IPv4 and /128 for IPv6). 268611042SErik.Nordmark@Sun.COM * This can return an RTF_REJECT|RTF_BLACKHOLE. 268711042SErik.Nordmark@Sun.COM * The returned IRE is held. 268811042SErik.Nordmark@Sun.COM * The assumption is that ip_select_route() has been called and returned the 268911042SErik.Nordmark@Sun.COM * IRE (thus ip_select_route would have set up the ire_dep* information.) 269011042SErik.Nordmark@Sun.COM * If some IRE is deleteted then ire_dep_remove() will have been called and 269111042SErik.Nordmark@Sun.COM * we might not find a nexthop IRE, in which case we return NULL. 
269211042SErik.Nordmark@Sun.COM */ 269311042SErik.Nordmark@Sun.COM ire_t * 269411042SErik.Nordmark@Sun.COM ire_nexthop(ire_t *ire) 26952535Ssangeeta { 269611042SErik.Nordmark@Sun.COM ip_stack_t *ipst = ire->ire_ipst; 269711042SErik.Nordmark@Sun.COM 269811042SErik.Nordmark@Sun.COM /* Acquire lock to walk ire_dep_parent */ 269911042SErik.Nordmark@Sun.COM rw_enter(&ipst->ips_ire_dep_lock, RW_READER); 270011042SErik.Nordmark@Sun.COM while (ire != NULL) { 270111042SErik.Nordmark@Sun.COM if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 270211042SErik.Nordmark@Sun.COM goto done; 270311042SErik.Nordmark@Sun.COM } 270411042SErik.Nordmark@Sun.COM /* 270511042SErik.Nordmark@Sun.COM * If we find an IRE_ONLINK we are done. This includes 270611042SErik.Nordmark@Sun.COM * the case of IRE_MULTICAST. 270711042SErik.Nordmark@Sun.COM * Note that in order to send packets we need a host-specific 270811042SErik.Nordmark@Sun.COM * IRE_IF_ALL first in the ire_dep_parent chain. Normally this 270911042SErik.Nordmark@Sun.COM * is done by inserting an IRE_IF_CLONE if the IRE_INTERFACE 271011042SErik.Nordmark@Sun.COM * was not host specific. 271111042SErik.Nordmark@Sun.COM * However, ip_rts_request doesn't want to send packets 271211042SErik.Nordmark@Sun.COM * hence doesn't want to allocate an IRE_IF_CLONE. Yet 271311042SErik.Nordmark@Sun.COM * it needs an IRE_IF_ALL to get to the ill. Thus 271411042SErik.Nordmark@Sun.COM * we return IRE_IF_ALL that are not host specific here. 
271511042SErik.Nordmark@Sun.COM */ 271611042SErik.Nordmark@Sun.COM if (ire->ire_type & IRE_ONLINK) 271711042SErik.Nordmark@Sun.COM goto done; 271811042SErik.Nordmark@Sun.COM ire = ire->ire_dep_parent; 271911042SErik.Nordmark@Sun.COM } 272011042SErik.Nordmark@Sun.COM rw_exit(&ipst->ips_ire_dep_lock); 272111042SErik.Nordmark@Sun.COM return (NULL); 272211042SErik.Nordmark@Sun.COM 272311042SErik.Nordmark@Sun.COM done: 272411042SErik.Nordmark@Sun.COM ire_refhold(ire); 272511042SErik.Nordmark@Sun.COM rw_exit(&ipst->ips_ire_dep_lock); 272611042SErik.Nordmark@Sun.COM return (ire); 272711042SErik.Nordmark@Sun.COM } 272811042SErik.Nordmark@Sun.COM 272911042SErik.Nordmark@Sun.COM /* 273011042SErik.Nordmark@Sun.COM * Find the ill used to send packets. This will be NULL in case 273111042SErik.Nordmark@Sun.COM * of a reject or blackhole. 273211042SErik.Nordmark@Sun.COM * The returned ill is held; caller needs to do ill_refrele when done. 273311042SErik.Nordmark@Sun.COM */ 273411042SErik.Nordmark@Sun.COM ill_t * 273511042SErik.Nordmark@Sun.COM ire_nexthop_ill(ire_t *ire) 273611042SErik.Nordmark@Sun.COM { 273711042SErik.Nordmark@Sun.COM ill_t *ill; 273811042SErik.Nordmark@Sun.COM 273911042SErik.Nordmark@Sun.COM ire = ire_nexthop(ire); 274011042SErik.Nordmark@Sun.COM if (ire == NULL) 274111042SErik.Nordmark@Sun.COM return (NULL); 274211042SErik.Nordmark@Sun.COM 274311042SErik.Nordmark@Sun.COM /* ire_ill can not change for an existing ire */ 274411042SErik.Nordmark@Sun.COM ill = ire->ire_ill; 274511042SErik.Nordmark@Sun.COM if (ill != NULL) 274611042SErik.Nordmark@Sun.COM ill_refhold(ill); 274711042SErik.Nordmark@Sun.COM ire_refrele(ire); 274811042SErik.Nordmark@Sun.COM return (ill); 274911042SErik.Nordmark@Sun.COM } 275011042SErik.Nordmark@Sun.COM 275111042SErik.Nordmark@Sun.COM #ifdef DEBUG 275211042SErik.Nordmark@Sun.COM static boolean_t 275311042SErik.Nordmark@Sun.COM parent_has_child(ire_t *parent, ire_t *child) 275411042SErik.Nordmark@Sun.COM { 275511042SErik.Nordmark@Sun.COM 
ire_t *ire; 275611042SErik.Nordmark@Sun.COM ire_t *prev; 275711042SErik.Nordmark@Sun.COM 275811042SErik.Nordmark@Sun.COM ire = parent->ire_dep_children; 275911042SErik.Nordmark@Sun.COM prev = NULL; 276011042SErik.Nordmark@Sun.COM while (ire != NULL) { 276111042SErik.Nordmark@Sun.COM if (prev == NULL) { 276211042SErik.Nordmark@Sun.COM ASSERT(ire->ire_dep_sib_ptpn == 276311042SErik.Nordmark@Sun.COM &(parent->ire_dep_children)); 276411042SErik.Nordmark@Sun.COM } else { 276511042SErik.Nordmark@Sun.COM ASSERT(ire->ire_dep_sib_ptpn == 276611042SErik.Nordmark@Sun.COM &(prev->ire_dep_sib_next)); 276711042SErik.Nordmark@Sun.COM } 276811042SErik.Nordmark@Sun.COM if (ire == child) 276911042SErik.Nordmark@Sun.COM return (B_TRUE); 277011042SErik.Nordmark@Sun.COM prev = ire; 277111042SErik.Nordmark@Sun.COM ire = ire->ire_dep_sib_next; 277211042SErik.Nordmark@Sun.COM } 277311042SErik.Nordmark@Sun.COM return (B_FALSE); 277411042SErik.Nordmark@Sun.COM } 277511042SErik.Nordmark@Sun.COM 277611042SErik.Nordmark@Sun.COM static void 277711042SErik.Nordmark@Sun.COM ire_dep_verify(ire_t *ire) 277811042SErik.Nordmark@Sun.COM { 277911042SErik.Nordmark@Sun.COM ire_t *parent = ire->ire_dep_parent; 278011042SErik.Nordmark@Sun.COM ire_t *child = ire->ire_dep_children; 278111042SErik.Nordmark@Sun.COM 278211042SErik.Nordmark@Sun.COM ASSERT(ire->ire_ipversion == IPV4_VERSION || 278311042SErik.Nordmark@Sun.COM ire->ire_ipversion == IPV6_VERSION); 278411042SErik.Nordmark@Sun.COM if (parent != NULL) { 278511042SErik.Nordmark@Sun.COM ASSERT(parent->ire_ipversion == IPV4_VERSION || 278611042SErik.Nordmark@Sun.COM parent->ire_ipversion == IPV6_VERSION); 278711042SErik.Nordmark@Sun.COM ASSERT(parent->ire_refcnt >= 1); 278811042SErik.Nordmark@Sun.COM ASSERT(parent_has_child(parent, ire)); 278911042SErik.Nordmark@Sun.COM } 279011042SErik.Nordmark@Sun.COM if (child != NULL) { 279111042SErik.Nordmark@Sun.COM ASSERT(child->ire_ipversion == IPV4_VERSION || 279211042SErik.Nordmark@Sun.COM child->ire_ipversion 
== IPV6_VERSION); 279311042SErik.Nordmark@Sun.COM ASSERT(child->ire_dep_parent == ire); 279411042SErik.Nordmark@Sun.COM ASSERT(child->ire_dep_sib_ptpn != NULL); 279511042SErik.Nordmark@Sun.COM ASSERT(parent_has_child(ire, child)); 279611042SErik.Nordmark@Sun.COM } 279711042SErik.Nordmark@Sun.COM } 279811042SErik.Nordmark@Sun.COM #endif /* DEBUG */ 279911042SErik.Nordmark@Sun.COM 280011042SErik.Nordmark@Sun.COM /* 280111042SErik.Nordmark@Sun.COM * Assumes ire_dep_parent is set. Remove this child from its parent's linkage. 280211042SErik.Nordmark@Sun.COM */ 280311042SErik.Nordmark@Sun.COM void 280411042SErik.Nordmark@Sun.COM ire_dep_remove(ire_t *ire) 280511042SErik.Nordmark@Sun.COM { 280611042SErik.Nordmark@Sun.COM ip_stack_t *ipst = ire->ire_ipst; 280711042SErik.Nordmark@Sun.COM ire_t *parent = ire->ire_dep_parent; 280811042SErik.Nordmark@Sun.COM ire_t *next; 280911042SErik.Nordmark@Sun.COM nce_t *nce; 281011042SErik.Nordmark@Sun.COM 281111042SErik.Nordmark@Sun.COM ASSERT(RW_WRITE_HELD(&ipst->ips_ire_dep_lock)); 281211042SErik.Nordmark@Sun.COM ASSERT(ire->ire_dep_parent != NULL); 281311042SErik.Nordmark@Sun.COM ASSERT(ire->ire_dep_sib_ptpn != NULL); 281411042SErik.Nordmark@Sun.COM 281511042SErik.Nordmark@Sun.COM #ifdef DEBUG 281611042SErik.Nordmark@Sun.COM ire_dep_verify(ire); 281711042SErik.Nordmark@Sun.COM ire_dep_verify(parent); 281811042SErik.Nordmark@Sun.COM #endif 281911042SErik.Nordmark@Sun.COM 282011042SErik.Nordmark@Sun.COM next = ire->ire_dep_sib_next; 282111042SErik.Nordmark@Sun.COM if (next != NULL) 282211042SErik.Nordmark@Sun.COM next->ire_dep_sib_ptpn = ire->ire_dep_sib_ptpn; 282311042SErik.Nordmark@Sun.COM 282411042SErik.Nordmark@Sun.COM ASSERT(*(ire->ire_dep_sib_ptpn) == ire); 282511042SErik.Nordmark@Sun.COM *(ire->ire_dep_sib_ptpn) = ire->ire_dep_sib_next; 282611042SErik.Nordmark@Sun.COM 282711042SErik.Nordmark@Sun.COM ire->ire_dep_sib_ptpn = NULL; 282811042SErik.Nordmark@Sun.COM ire->ire_dep_sib_next = NULL; 282911042SErik.Nordmark@Sun.COM 
283011042SErik.Nordmark@Sun.COM mutex_enter(&ire->ire_lock); 283111042SErik.Nordmark@Sun.COM parent = ire->ire_dep_parent; 283211042SErik.Nordmark@Sun.COM ire->ire_dep_parent = NULL; 283311042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 28342535Ssangeeta 28352535Ssangeeta /* 283611042SErik.Nordmark@Sun.COM * Make sure all our children, grandchildren, etc set 283711042SErik.Nordmark@Sun.COM * ire_dep_parent_generation to IRE_GENERATION_VERIFY since 283811042SErik.Nordmark@Sun.COM * we can no longer guarantee than the children have a current 283911042SErik.Nordmark@Sun.COM * ire_nce_cache and ire_nexthop_ill(). 28402535Ssangeeta */ 284111042SErik.Nordmark@Sun.COM if (ire->ire_dep_children != NULL) 284211042SErik.Nordmark@Sun.COM ire_dep_invalidate_children(ire->ire_dep_children); 28432535Ssangeeta 28442535Ssangeeta /* 284511042SErik.Nordmark@Sun.COM * Since the parent is gone we make sure we clear ire_nce_cache. 284611042SErik.Nordmark@Sun.COM * We can clear it under ire_lock even if the IRE is used 28472535Ssangeeta */ 284811042SErik.Nordmark@Sun.COM mutex_enter(&ire->ire_lock); 284911042SErik.Nordmark@Sun.COM nce = ire->ire_nce_cache; 285011042SErik.Nordmark@Sun.COM ire->ire_nce_cache = NULL; 285111042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 285211042SErik.Nordmark@Sun.COM if (nce != NULL) 285311042SErik.Nordmark@Sun.COM nce_refrele(nce); 285411042SErik.Nordmark@Sun.COM 285511042SErik.Nordmark@Sun.COM #ifdef DEBUG 285611042SErik.Nordmark@Sun.COM ire_dep_verify(ire); 285711042SErik.Nordmark@Sun.COM ire_dep_verify(parent); 285811042SErik.Nordmark@Sun.COM #endif 285911042SErik.Nordmark@Sun.COM 286011042SErik.Nordmark@Sun.COM ire_refrele_notr(parent); 286111042SErik.Nordmark@Sun.COM ire_refrele_notr(ire); 286211042SErik.Nordmark@Sun.COM } 286311042SErik.Nordmark@Sun.COM 286411042SErik.Nordmark@Sun.COM /* 286511042SErik.Nordmark@Sun.COM * Insert the child in the linkage of the parent 286611042SErik.Nordmark@Sun.COM */ 286711042SErik.Nordmark@Sun.COM 
static void 286811042SErik.Nordmark@Sun.COM ire_dep_parent_insert(ire_t *child, ire_t *parent) 286911042SErik.Nordmark@Sun.COM { 287011042SErik.Nordmark@Sun.COM ip_stack_t *ipst = child->ire_ipst; 287111042SErik.Nordmark@Sun.COM ire_t *next; 287211042SErik.Nordmark@Sun.COM 287311042SErik.Nordmark@Sun.COM ASSERT(RW_WRITE_HELD(&ipst->ips_ire_dep_lock)); 287411042SErik.Nordmark@Sun.COM ASSERT(child->ire_dep_parent == NULL); 287511042SErik.Nordmark@Sun.COM 287611042SErik.Nordmark@Sun.COM #ifdef DEBUG 287711042SErik.Nordmark@Sun.COM ire_dep_verify(child); 287811042SErik.Nordmark@Sun.COM ire_dep_verify(parent); 287911042SErik.Nordmark@Sun.COM #endif 288011042SErik.Nordmark@Sun.COM /* No parents => no siblings */ 288111042SErik.Nordmark@Sun.COM ASSERT(child->ire_dep_sib_ptpn == NULL); 288211042SErik.Nordmark@Sun.COM ASSERT(child->ire_dep_sib_next == NULL); 288311042SErik.Nordmark@Sun.COM 288411042SErik.Nordmark@Sun.COM ire_refhold_notr(parent); 288511042SErik.Nordmark@Sun.COM ire_refhold_notr(child); 288611042SErik.Nordmark@Sun.COM 288711042SErik.Nordmark@Sun.COM /* Head insertion */ 288811042SErik.Nordmark@Sun.COM next = parent->ire_dep_children; 288911042SErik.Nordmark@Sun.COM if (next != NULL) { 289011042SErik.Nordmark@Sun.COM ASSERT(next->ire_dep_sib_ptpn == &(parent->ire_dep_children)); 289111042SErik.Nordmark@Sun.COM child->ire_dep_sib_next = next; 289211042SErik.Nordmark@Sun.COM next->ire_dep_sib_ptpn = &(child->ire_dep_sib_next); 289311042SErik.Nordmark@Sun.COM } 289411042SErik.Nordmark@Sun.COM parent->ire_dep_children = child; 289511042SErik.Nordmark@Sun.COM child->ire_dep_sib_ptpn = &(parent->ire_dep_children); 289611042SErik.Nordmark@Sun.COM 289711042SErik.Nordmark@Sun.COM mutex_enter(&child->ire_lock); 289811042SErik.Nordmark@Sun.COM child->ire_dep_parent = parent; 289911042SErik.Nordmark@Sun.COM mutex_exit(&child->ire_lock); 290011042SErik.Nordmark@Sun.COM 290111042SErik.Nordmark@Sun.COM #ifdef DEBUG 290211042SErik.Nordmark@Sun.COM ire_dep_verify(child); 
290311042SErik.Nordmark@Sun.COM ire_dep_verify(parent); 290411042SErik.Nordmark@Sun.COM #endif 290511042SErik.Nordmark@Sun.COM } 290611042SErik.Nordmark@Sun.COM 290711042SErik.Nordmark@Sun.COM 290811042SErik.Nordmark@Sun.COM /* 290911042SErik.Nordmark@Sun.COM * Given count worth of ires and generations, build ire_dep_* relationships 291011042SErik.Nordmark@Sun.COM * from ires[0] to ires[count-1]. Record generations[i+1] in 291111042SErik.Nordmark@Sun.COM * ire_dep_parent_generation for ires[i]. 291211042SErik.Nordmark@Sun.COM * We graft onto an existing parent chain by making sure that we don't 291311042SErik.Nordmark@Sun.COM * touch ire_dep_parent for ires[count-1]. 291411042SErik.Nordmark@Sun.COM * 291511042SErik.Nordmark@Sun.COM * We check for any condemned ire_generation count and return B_FALSE in 291611042SErik.Nordmark@Sun.COM * that case so that the caller can tear it apart. 291711042SErik.Nordmark@Sun.COM * 291811042SErik.Nordmark@Sun.COM * Note that generations[0] is not used. Caller handles that. 291911042SErik.Nordmark@Sun.COM */ 292011042SErik.Nordmark@Sun.COM boolean_t 292111042SErik.Nordmark@Sun.COM ire_dep_build(ire_t *ires[], uint_t generations[], uint_t count) 292211042SErik.Nordmark@Sun.COM { 292311042SErik.Nordmark@Sun.COM ire_t *ire = ires[0]; 292411042SErik.Nordmark@Sun.COM ip_stack_t *ipst; 292511042SErik.Nordmark@Sun.COM uint_t i; 292611042SErik.Nordmark@Sun.COM 292711042SErik.Nordmark@Sun.COM ASSERT(count > 0); 292811042SErik.Nordmark@Sun.COM if (count == 1) { 292911042SErik.Nordmark@Sun.COM /* No work to do */ 293011042SErik.Nordmark@Sun.COM return (B_TRUE); 293111042SErik.Nordmark@Sun.COM } 293211042SErik.Nordmark@Sun.COM ipst = ire->ire_ipst; 293311042SErik.Nordmark@Sun.COM rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER); 293411042SErik.Nordmark@Sun.COM /* 293511042SErik.Nordmark@Sun.COM * Do not remove the linkage for any existing parent chain i.e., 293611042SErik.Nordmark@Sun.COM * ires[count-1] is left alone. 
293711042SErik.Nordmark@Sun.COM */ 293811042SErik.Nordmark@Sun.COM for (i = 0; i < count-1; i++) { 293911042SErik.Nordmark@Sun.COM /* Remove existing parent if we need to change it */ 294011042SErik.Nordmark@Sun.COM if (ires[i]->ire_dep_parent != NULL && 294111042SErik.Nordmark@Sun.COM ires[i]->ire_dep_parent != ires[i+1]) 294211042SErik.Nordmark@Sun.COM ire_dep_remove(ires[i]); 294311042SErik.Nordmark@Sun.COM } 294411042SErik.Nordmark@Sun.COM 294511042SErik.Nordmark@Sun.COM for (i = 0; i < count - 1; i++) { 294611042SErik.Nordmark@Sun.COM ASSERT(ires[i]->ire_ipversion == IPV4_VERSION || 294711042SErik.Nordmark@Sun.COM ires[i]->ire_ipversion == IPV6_VERSION); 294811042SErik.Nordmark@Sun.COM /* Does it need to change? */ 294911042SErik.Nordmark@Sun.COM if (ires[i]->ire_dep_parent != ires[i+1]) 295011042SErik.Nordmark@Sun.COM ire_dep_parent_insert(ires[i], ires[i+1]); 295111042SErik.Nordmark@Sun.COM 295211042SErik.Nordmark@Sun.COM mutex_enter(&ires[i+1]->ire_lock); 295311042SErik.Nordmark@Sun.COM if (IRE_IS_CONDEMNED(ires[i+1])) { 295411042SErik.Nordmark@Sun.COM mutex_exit(&ires[i+1]->ire_lock); 295511042SErik.Nordmark@Sun.COM rw_exit(&ipst->ips_ire_dep_lock); 295611042SErik.Nordmark@Sun.COM return (B_FALSE); 295711042SErik.Nordmark@Sun.COM } 295811042SErik.Nordmark@Sun.COM mutex_exit(&ires[i+1]->ire_lock); 295911042SErik.Nordmark@Sun.COM 296011042SErik.Nordmark@Sun.COM mutex_enter(&ires[i]->ire_lock); 296111042SErik.Nordmark@Sun.COM ires[i]->ire_dep_parent_generation = generations[i+1]; 296211042SErik.Nordmark@Sun.COM mutex_exit(&ires[i]->ire_lock); 296311042SErik.Nordmark@Sun.COM } 296411042SErik.Nordmark@Sun.COM rw_exit(&ipst->ips_ire_dep_lock); 296511042SErik.Nordmark@Sun.COM return (B_TRUE); 296611042SErik.Nordmark@Sun.COM } 296711042SErik.Nordmark@Sun.COM 296811042SErik.Nordmark@Sun.COM /* 296911042SErik.Nordmark@Sun.COM * Given count worth of ires, unbuild ire_dep_* relationships 297011042SErik.Nordmark@Sun.COM * from ires[0] to ires[count-1]. 
297111042SErik.Nordmark@Sun.COM */ 297211042SErik.Nordmark@Sun.COM void 297311042SErik.Nordmark@Sun.COM ire_dep_unbuild(ire_t *ires[], uint_t count) 297411042SErik.Nordmark@Sun.COM { 297511042SErik.Nordmark@Sun.COM ip_stack_t *ipst; 297611042SErik.Nordmark@Sun.COM uint_t i; 297711042SErik.Nordmark@Sun.COM 297811042SErik.Nordmark@Sun.COM if (count == 0) { 297911042SErik.Nordmark@Sun.COM /* No work to do */ 29802535Ssangeeta return; 29812535Ssangeeta } 298211042SErik.Nordmark@Sun.COM ipst = ires[0]->ire_ipst; 298311042SErik.Nordmark@Sun.COM rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER); 298411042SErik.Nordmark@Sun.COM for (i = 0; i < count; i++) { 298511042SErik.Nordmark@Sun.COM ASSERT(ires[i]->ire_ipversion == IPV4_VERSION || 298611042SErik.Nordmark@Sun.COM ires[i]->ire_ipversion == IPV6_VERSION); 298711042SErik.Nordmark@Sun.COM if (ires[i]->ire_dep_parent != NULL) 298811042SErik.Nordmark@Sun.COM ire_dep_remove(ires[i]); 298911042SErik.Nordmark@Sun.COM mutex_enter(&ires[i]->ire_lock); 299011042SErik.Nordmark@Sun.COM ires[i]->ire_dep_parent_generation = IRE_GENERATION_VERIFY; 299111042SErik.Nordmark@Sun.COM mutex_exit(&ires[i]->ire_lock); 299211042SErik.Nordmark@Sun.COM } 299311042SErik.Nordmark@Sun.COM rw_exit(&ipst->ips_ire_dep_lock); 299411042SErik.Nordmark@Sun.COM } 299511042SErik.Nordmark@Sun.COM 299611042SErik.Nordmark@Sun.COM /* 299711042SErik.Nordmark@Sun.COM * Both the forwarding and the outbound code paths can trip on 299811042SErik.Nordmark@Sun.COM * a condemned NCE, in which case we call this function. 299911042SErik.Nordmark@Sun.COM * We have two different behaviors: if the NCE was UNREACHABLE 300011042SErik.Nordmark@Sun.COM * it is an indication that something failed. In that case 300111042SErik.Nordmark@Sun.COM * we see if we should look for a different IRE (for example, 300211042SErik.Nordmark@Sun.COM * delete any matching redirect IRE, or try a different 300311042SErik.Nordmark@Sun.COM * IRE_DEFAULT (ECMP)). 
We mark the ire as bad so a hopefully 300411042SErik.Nordmark@Sun.COM * different IRE will be picked next time we send/forward. 300511042SErik.Nordmark@Sun.COM * 300611042SErik.Nordmark@Sun.COM * If we are called by the output path then fail_if_better is set 300711042SErik.Nordmark@Sun.COM * and we return NULL if there could be a better IRE. This is because the 300811042SErik.Nordmark@Sun.COM * output path retries the IRE lookup. (The input/forward path can not retry.) 300911042SErik.Nordmark@Sun.COM * 301011042SErik.Nordmark@Sun.COM * If the NCE was not unreachable then we pick/allocate a 301111042SErik.Nordmark@Sun.COM * new (most likely ND_INITIAL) NCE and proceed with it. 301211042SErik.Nordmark@Sun.COM * 301311042SErik.Nordmark@Sun.COM * ipha/ip6h are needed for multicast packets; ipha needs to be 301411042SErik.Nordmark@Sun.COM * set for IPv4 and ip6h needs to be set for IPv6 packets. 301511042SErik.Nordmark@Sun.COM */ 301611042SErik.Nordmark@Sun.COM nce_t * 301711042SErik.Nordmark@Sun.COM ire_handle_condemned_nce(nce_t *nce, ire_t *ire, ipha_t *ipha, ip6_t *ip6h, 301811042SErik.Nordmark@Sun.COM boolean_t fail_if_better) 301911042SErik.Nordmark@Sun.COM { 302011042SErik.Nordmark@Sun.COM if (nce->nce_common->ncec_state == ND_UNREACHABLE) { 302111042SErik.Nordmark@Sun.COM if (ire_no_good(ire) && fail_if_better) { 302211042SErik.Nordmark@Sun.COM /* 302311042SErik.Nordmark@Sun.COM * Did some changes, or ECMP likely to exist. 302411042SErik.Nordmark@Sun.COM * Make ip_output look for a different IRE 302511042SErik.Nordmark@Sun.COM */ 302611042SErik.Nordmark@Sun.COM return (NULL); 302711042SErik.Nordmark@Sun.COM } 302811042SErik.Nordmark@Sun.COM } 302911042SErik.Nordmark@Sun.COM if (ire_revalidate_nce(ire) == ENETUNREACH) { 303011042SErik.Nordmark@Sun.COM /* The ire_dep_parent chain went bad, or no memory? 
*/ 303111042SErik.Nordmark@Sun.COM (void) ire_no_good(ire); 303211042SErik.Nordmark@Sun.COM return (NULL); 303311042SErik.Nordmark@Sun.COM } 303411042SErik.Nordmark@Sun.COM if (ire->ire_ipversion == IPV4_VERSION) { 303511042SErik.Nordmark@Sun.COM ASSERT(ipha != NULL); 303611042SErik.Nordmark@Sun.COM nce = ire_to_nce(ire, ipha->ipha_dst, NULL); 303711042SErik.Nordmark@Sun.COM } else { 303811042SErik.Nordmark@Sun.COM ASSERT(ip6h != NULL); 303911042SErik.Nordmark@Sun.COM nce = ire_to_nce(ire, INADDR_ANY, &ip6h->ip6_dst); 30402535Ssangeeta } 30418485SPeter.Memishian@Sun.COM 304211042SErik.Nordmark@Sun.COM if (nce == NULL) 304311042SErik.Nordmark@Sun.COM return (NULL); 304411042SErik.Nordmark@Sun.COM if (nce->nce_is_condemned) { 304511042SErik.Nordmark@Sun.COM nce_refrele(nce); 304611042SErik.Nordmark@Sun.COM return (NULL); 30472535Ssangeeta } 304811042SErik.Nordmark@Sun.COM return (nce); 304911042SErik.Nordmark@Sun.COM } 305011042SErik.Nordmark@Sun.COM 305111042SErik.Nordmark@Sun.COM /* 305211042SErik.Nordmark@Sun.COM * The caller has found that the ire is bad, either due to a reference to an NCE 305311042SErik.Nordmark@Sun.COM * in ND_UNREACHABLE state, or a MULTIRT route whose gateway can't be resolved. 305411042SErik.Nordmark@Sun.COM * We update things so a subsequent attempt to send to the destination 305511042SErik.Nordmark@Sun.COM * is likely to find different IRE, or that a new NCE would be created. 305611042SErik.Nordmark@Sun.COM * 305711042SErik.Nordmark@Sun.COM * Returns B_TRUE if it is likely that a subsequent ire_ftable_lookup would 305811042SErik.Nordmark@Sun.COM * find a different route (either due to having deleted a redirect, or there 305911042SErik.Nordmark@Sun.COM * being ECMP routes.) 306011042SErik.Nordmark@Sun.COM * 306111042SErik.Nordmark@Sun.COM * If we have a redirect (RTF_DYNAMIC) we delete it. 
306211042SErik.Nordmark@Sun.COM * Otherwise we increment ire_badcnt and increment the generation number so 306311042SErik.Nordmark@Sun.COM * that a cached ixa_ire will redo the route selection. ire_badcnt is taken 306411042SErik.Nordmark@Sun.COM * into account in the route selection when we have multiple choices (multiple 306511042SErik.Nordmark@Sun.COM * default routes or ECMP in general). 306611042SErik.Nordmark@Sun.COM * Any time ip_select_route find an ire with a condemned ire_nce_cache 306711042SErik.Nordmark@Sun.COM * (e.g., if no equal cost route to the bad one) ip_select_route will make 306811042SErik.Nordmark@Sun.COM * sure the NCE is revalidated to avoid getting stuck on a 306911042SErik.Nordmark@Sun.COM * NCE_F_CONDMNED ncec that caused ire_no_good to be called. 307011042SErik.Nordmark@Sun.COM */ 307111042SErik.Nordmark@Sun.COM boolean_t 307211042SErik.Nordmark@Sun.COM ire_no_good(ire_t *ire) 307311042SErik.Nordmark@Sun.COM { 307411042SErik.Nordmark@Sun.COM ip_stack_t *ipst = ire->ire_ipst; 307511042SErik.Nordmark@Sun.COM ire_t *ire2; 307611042SErik.Nordmark@Sun.COM nce_t *nce; 307711042SErik.Nordmark@Sun.COM 307811042SErik.Nordmark@Sun.COM if (ire->ire_flags & RTF_DYNAMIC) { 307911042SErik.Nordmark@Sun.COM ire_delete(ire); 308011042SErik.Nordmark@Sun.COM return (B_TRUE); 308111042SErik.Nordmark@Sun.COM } 308211042SErik.Nordmark@Sun.COM if (ire->ire_flags & RTF_INDIRECT) { 308311042SErik.Nordmark@Sun.COM /* Check if next IRE is a redirect */ 308411042SErik.Nordmark@Sun.COM rw_enter(&ipst->ips_ire_dep_lock, RW_READER); 308511042SErik.Nordmark@Sun.COM if (ire->ire_dep_parent != NULL && 308611042SErik.Nordmark@Sun.COM (ire->ire_dep_parent->ire_flags & RTF_DYNAMIC)) { 308711042SErik.Nordmark@Sun.COM ire2 = ire->ire_dep_parent; 308811042SErik.Nordmark@Sun.COM ire_refhold(ire2); 308911042SErik.Nordmark@Sun.COM } else { 309011042SErik.Nordmark@Sun.COM ire2 = NULL; 309111042SErik.Nordmark@Sun.COM } 309211042SErik.Nordmark@Sun.COM 
rw_exit(&ipst->ips_ire_dep_lock); 309311042SErik.Nordmark@Sun.COM if (ire2 != NULL) { 309411042SErik.Nordmark@Sun.COM ire_delete(ire2); 309511042SErik.Nordmark@Sun.COM ire_refrele(ire2); 309611042SErik.Nordmark@Sun.COM return (B_TRUE); 309711042SErik.Nordmark@Sun.COM } 309811042SErik.Nordmark@Sun.COM } 30992535Ssangeeta /* 310011042SErik.Nordmark@Sun.COM * No redirect involved. Increment badcnt so that if we have ECMP 310111042SErik.Nordmark@Sun.COM * routes we are likely to pick a different one for the next packet. 310211042SErik.Nordmark@Sun.COM * 310311042SErik.Nordmark@Sun.COM * If the NCE is unreachable and condemned we should drop the reference 310411042SErik.Nordmark@Sun.COM * to it so that a new NCE can be created. 310511042SErik.Nordmark@Sun.COM * 310611042SErik.Nordmark@Sun.COM * Finally we increment the generation number so that any ixa_ire 310711042SErik.Nordmark@Sun.COM * cache will be revalidated. 31082535Ssangeeta */ 310911042SErik.Nordmark@Sun.COM mutex_enter(&ire->ire_lock); 311011042SErik.Nordmark@Sun.COM ire->ire_badcnt++; 311111066Srafael.vanoni@sun.com ire->ire_last_badcnt = TICK_TO_SEC(ddi_get_lbolt64()); 311211042SErik.Nordmark@Sun.COM nce = ire->ire_nce_cache; 311311042SErik.Nordmark@Sun.COM if (nce != NULL && nce->nce_is_condemned && 311411042SErik.Nordmark@Sun.COM nce->nce_common->ncec_state == ND_UNREACHABLE) 311511042SErik.Nordmark@Sun.COM ire->ire_nce_cache = NULL; 311611042SErik.Nordmark@Sun.COM else 311711042SErik.Nordmark@Sun.COM nce = NULL; 311811042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 311911042SErik.Nordmark@Sun.COM if (nce != NULL) 312011042SErik.Nordmark@Sun.COM nce_refrele(nce); 312111042SErik.Nordmark@Sun.COM 312211042SErik.Nordmark@Sun.COM ire_increment_generation(ire); 312311042SErik.Nordmark@Sun.COM ire_dep_incr_generation(ire); 312411042SErik.Nordmark@Sun.COM 312511042SErik.Nordmark@Sun.COM return (ire->ire_bucket->irb_ire_cnt > 1); 312611042SErik.Nordmark@Sun.COM } 312711042SErik.Nordmark@Sun.COM 
312811042SErik.Nordmark@Sun.COM /* 312911042SErik.Nordmark@Sun.COM * Walk ire_dep_parent chain and validate that ire_dep_parent->ire_generation == 313011042SErik.Nordmark@Sun.COM * ire_dep_parent_generation. 313111042SErik.Nordmark@Sun.COM * If they all match we just return ire_generation from the topmost IRE. 313211042SErik.Nordmark@Sun.COM * Otherwise we propagate the mismatch by setting all ire_dep_parent_generation 313311042SErik.Nordmark@Sun.COM * above the mismatch to IRE_GENERATION_VERIFY and also returning 313411042SErik.Nordmark@Sun.COM * IRE_GENERATION_VERIFY. 313511042SErik.Nordmark@Sun.COM */ 313611042SErik.Nordmark@Sun.COM uint_t 313711042SErik.Nordmark@Sun.COM ire_dep_validate_generations(ire_t *ire) 313811042SErik.Nordmark@Sun.COM { 313911042SErik.Nordmark@Sun.COM ip_stack_t *ipst = ire->ire_ipst; 314011042SErik.Nordmark@Sun.COM uint_t generation; 314111042SErik.Nordmark@Sun.COM ire_t *ire1; 314211042SErik.Nordmark@Sun.COM 314311042SErik.Nordmark@Sun.COM rw_enter(&ipst->ips_ire_dep_lock, RW_READER); 314411042SErik.Nordmark@Sun.COM generation = ire->ire_generation; /* Assuming things match */ 314511042SErik.Nordmark@Sun.COM for (ire1 = ire; ire1 != NULL; ire1 = ire1->ire_dep_parent) { 314611042SErik.Nordmark@Sun.COM ASSERT(ire1->ire_ipversion == IPV4_VERSION || 314711042SErik.Nordmark@Sun.COM ire1->ire_ipversion == IPV6_VERSION); 314811042SErik.Nordmark@Sun.COM if (ire1->ire_dep_parent == NULL) 314911042SErik.Nordmark@Sun.COM break; 315011042SErik.Nordmark@Sun.COM if (ire1->ire_dep_parent_generation != 315111042SErik.Nordmark@Sun.COM ire1->ire_dep_parent->ire_generation) 315211042SErik.Nordmark@Sun.COM goto mismatch; 315311042SErik.Nordmark@Sun.COM } 315411042SErik.Nordmark@Sun.COM rw_exit(&ipst->ips_ire_dep_lock); 315511042SErik.Nordmark@Sun.COM return (generation); 315611042SErik.Nordmark@Sun.COM 315711042SErik.Nordmark@Sun.COM mismatch: 315811042SErik.Nordmark@Sun.COM generation = IRE_GENERATION_VERIFY; 315911042SErik.Nordmark@Sun.COM /* Fill from 
top down to the mismatch with _VERIFY */ 316011042SErik.Nordmark@Sun.COM while (ire != ire1) { 316111042SErik.Nordmark@Sun.COM ASSERT(ire->ire_ipversion == IPV4_VERSION || 316211042SErik.Nordmark@Sun.COM ire->ire_ipversion == IPV6_VERSION); 316311042SErik.Nordmark@Sun.COM mutex_enter(&ire->ire_lock); 316411042SErik.Nordmark@Sun.COM ire->ire_dep_parent_generation = IRE_GENERATION_VERIFY; 316511042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 316611042SErik.Nordmark@Sun.COM ire = ire->ire_dep_parent; 31672535Ssangeeta } 316811042SErik.Nordmark@Sun.COM rw_exit(&ipst->ips_ire_dep_lock); 316911042SErik.Nordmark@Sun.COM return (generation); 317011042SErik.Nordmark@Sun.COM } 317111042SErik.Nordmark@Sun.COM 317211042SErik.Nordmark@Sun.COM /* 317311042SErik.Nordmark@Sun.COM * Used when we need to return an ire with ire_dep_parent, but we 317411042SErik.Nordmark@Sun.COM * know the chain is invalid for instance we didn't create an IRE_IF_CLONE 317511042SErik.Nordmark@Sun.COM * Using IRE_GENERATION_VERIFY means that next time we'll redo the 317611042SErik.Nordmark@Sun.COM * recursive lookup. 
317711042SErik.Nordmark@Sun.COM */ 317811042SErik.Nordmark@Sun.COM void 317911042SErik.Nordmark@Sun.COM ire_dep_invalidate_generations(ire_t *ire) 318011042SErik.Nordmark@Sun.COM { 318111042SErik.Nordmark@Sun.COM ip_stack_t *ipst = ire->ire_ipst; 318211042SErik.Nordmark@Sun.COM 318311042SErik.Nordmark@Sun.COM rw_enter(&ipst->ips_ire_dep_lock, RW_READER); 318411042SErik.Nordmark@Sun.COM while (ire != NULL) { 318511042SErik.Nordmark@Sun.COM ASSERT(ire->ire_ipversion == IPV4_VERSION || 318611042SErik.Nordmark@Sun.COM ire->ire_ipversion == IPV6_VERSION); 318711042SErik.Nordmark@Sun.COM mutex_enter(&ire->ire_lock); 318811042SErik.Nordmark@Sun.COM ire->ire_dep_parent_generation = IRE_GENERATION_VERIFY; 318911042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 319011042SErik.Nordmark@Sun.COM ire = ire->ire_dep_parent; 319111042SErik.Nordmark@Sun.COM } 319211042SErik.Nordmark@Sun.COM rw_exit(&ipst->ips_ire_dep_lock); 319311042SErik.Nordmark@Sun.COM } 319411042SErik.Nordmark@Sun.COM 319511042SErik.Nordmark@Sun.COM /* Set _VERIFY ire_dep_parent_generation for all children recursively */ 319611042SErik.Nordmark@Sun.COM static void 319711042SErik.Nordmark@Sun.COM ire_dep_invalidate_children(ire_t *child) 319811042SErik.Nordmark@Sun.COM { 319911042SErik.Nordmark@Sun.COM ip_stack_t *ipst = child->ire_ipst; 320011042SErik.Nordmark@Sun.COM 320111042SErik.Nordmark@Sun.COM ASSERT(RW_WRITE_HELD(&ipst->ips_ire_dep_lock)); 320211042SErik.Nordmark@Sun.COM /* Depth first */ 320311042SErik.Nordmark@Sun.COM if (child->ire_dep_children != NULL) 320411042SErik.Nordmark@Sun.COM ire_dep_invalidate_children(child->ire_dep_children); 320511042SErik.Nordmark@Sun.COM 320611042SErik.Nordmark@Sun.COM while (child != NULL) { 320711042SErik.Nordmark@Sun.COM mutex_enter(&child->ire_lock); 320811042SErik.Nordmark@Sun.COM child->ire_dep_parent_generation = IRE_GENERATION_VERIFY; 320911042SErik.Nordmark@Sun.COM mutex_exit(&child->ire_lock); 321011042SErik.Nordmark@Sun.COM child = 
child->ire_dep_sib_next; 321111042SErik.Nordmark@Sun.COM } 321211042SErik.Nordmark@Sun.COM } 321311042SErik.Nordmark@Sun.COM 321411042SErik.Nordmark@Sun.COM static void 321511042SErik.Nordmark@Sun.COM ire_dep_increment_children(ire_t *child) 321611042SErik.Nordmark@Sun.COM { 321711042SErik.Nordmark@Sun.COM ip_stack_t *ipst = child->ire_ipst; 321811042SErik.Nordmark@Sun.COM 321911042SErik.Nordmark@Sun.COM ASSERT(RW_READ_HELD(&ipst->ips_ire_dep_lock)); 322011042SErik.Nordmark@Sun.COM /* Depth first */ 322111042SErik.Nordmark@Sun.COM if (child->ire_dep_children != NULL) 322211042SErik.Nordmark@Sun.COM ire_dep_increment_children(child->ire_dep_children); 322311042SErik.Nordmark@Sun.COM 322411042SErik.Nordmark@Sun.COM while (child != NULL) { 322511042SErik.Nordmark@Sun.COM if (!IRE_IS_CONDEMNED(child)) 322611042SErik.Nordmark@Sun.COM ire_increment_generation(child); 322711042SErik.Nordmark@Sun.COM child = child->ire_dep_sib_next; 32282535Ssangeeta } 32292535Ssangeeta } 32302535Ssangeeta 32312535Ssangeeta /* 323211042SErik.Nordmark@Sun.COM * Walk all the children of this ire recursively and increment their 323311042SErik.Nordmark@Sun.COM * generation number. 32342535Ssangeeta */ 32352535Ssangeeta void 323611042SErik.Nordmark@Sun.COM ire_dep_incr_generation(ire_t *parent) 32372535Ssangeeta { 323811042SErik.Nordmark@Sun.COM ip_stack_t *ipst = parent->ire_ipst; 323911042SErik.Nordmark@Sun.COM 324011042SErik.Nordmark@Sun.COM rw_enter(&ipst->ips_ire_dep_lock, RW_READER); 324111042SErik.Nordmark@Sun.COM if (parent->ire_dep_children != NULL) 324211042SErik.Nordmark@Sun.COM ire_dep_increment_children(parent->ire_dep_children); 324311042SErik.Nordmark@Sun.COM rw_exit(&ipst->ips_ire_dep_lock); 32442535Ssangeeta } 32452535Ssangeeta 32463772Ssangeeta /* 324711042SErik.Nordmark@Sun.COM * Get a new ire_nce_cache for this IRE as well as its nexthop. 324811042SErik.Nordmark@Sun.COM * Returns zero if it succeeds. 
Can fail due to lack of memory or when 324911042SErik.Nordmark@Sun.COM * the route has become unreachable. Returns ENOMEM and ENETUNREACH in those 325011042SErik.Nordmark@Sun.COM * cases. 325111042SErik.Nordmark@Sun.COM * 325211042SErik.Nordmark@Sun.COM * In the in.mpathd case, the ire will have ire_testhidden 325311042SErik.Nordmark@Sun.COM * set; so we should create the ncec for the underlying ill. 32544714Ssowmini * 325511042SErik.Nordmark@Sun.COM * Note that the error returned by ire_revalidate_nce() is ignored by most 325611042SErik.Nordmark@Sun.COM * callers except ire_handle_condemned_nce(), which handles the ENETUNREACH 325711042SErik.Nordmark@Sun.COM * error to mark potentially bad ire's. For all the other callers, an 325811042SErik.Nordmark@Sun.COM * error return could indicate a transient condition like ENOMEM, or could 325911042SErik.Nordmark@Sun.COM * be the result of an interface that is going down/unplumbing. In the former 326011042SErik.Nordmark@Sun.COM * case (transient error), we would leave the old stale ire/ire_nce_cache 326111042SErik.Nordmark@Sun.COM * in place, and possibly use incorrect link-layer information to send packets 326211042SErik.Nordmark@Sun.COM * but would eventually recover. In the latter case (ill down/replumb), 326311042SErik.Nordmark@Sun.COM * ire_revalidate_nce() might return a condemned nce back, but we would then 326411042SErik.Nordmark@Sun.COM * recover in the packet output path. 32652535Ssangeeta */ 32662535Ssangeeta int 326711042SErik.Nordmark@Sun.COM ire_revalidate_nce(ire_t *ire) 32682535Ssangeeta { 326911042SErik.Nordmark@Sun.COM nce_t *nce, *old_nce; 327011042SErik.Nordmark@Sun.COM ire_t *nexthop; 32712535Ssangeeta 32722535Ssangeeta /* 327311042SErik.Nordmark@Sun.COM * For multicast we conceptually have an NCE but we don't store it 327411042SErik.Nordmark@Sun.COM * in ire_nce_cache; when ire_to_nce is called we allocate the nce. 
32752535Ssangeeta */ 327611042SErik.Nordmark@Sun.COM if (ire->ire_type & IRE_MULTICAST) 327711042SErik.Nordmark@Sun.COM return (0); 327811042SErik.Nordmark@Sun.COM 327911042SErik.Nordmark@Sun.COM /* ire_testhidden should only be set on under-interfaces */ 328011042SErik.Nordmark@Sun.COM ASSERT(!ire->ire_testhidden || !IS_IPMP(ire->ire_ill)); 328111042SErik.Nordmark@Sun.COM 328211042SErik.Nordmark@Sun.COM nexthop = ire_nexthop(ire); 328311042SErik.Nordmark@Sun.COM if (nexthop == NULL) { 328411042SErik.Nordmark@Sun.COM /* The route is potentially bad */ 328511042SErik.Nordmark@Sun.COM (void) ire_no_good(ire); 328611042SErik.Nordmark@Sun.COM return (ENETUNREACH); 32874084Ssowmini } 328811042SErik.Nordmark@Sun.COM if (ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) { 328911042SErik.Nordmark@Sun.COM ASSERT(ire->ire_ill != NULL); 329011042SErik.Nordmark@Sun.COM 329111042SErik.Nordmark@Sun.COM if (ire->ire_ipversion == IPV4_VERSION) 329211042SErik.Nordmark@Sun.COM nce = nce_lookup_v4(ire->ire_ill, &ire->ire_addr); 329311042SErik.Nordmark@Sun.COM else 329411042SErik.Nordmark@Sun.COM nce = nce_lookup_v6(ire->ire_ill, &ire->ire_addr_v6); 329511042SErik.Nordmark@Sun.COM } else { 329611042SErik.Nordmark@Sun.COM ASSERT(nexthop->ire_type & IRE_ONLINK); 329711042SErik.Nordmark@Sun.COM if (ire->ire_ipversion == IPV4_VERSION) { 329811042SErik.Nordmark@Sun.COM nce = arp_nce_init(nexthop->ire_ill, nexthop->ire_addr, 329911042SErik.Nordmark@Sun.COM nexthop->ire_type); 330011042SErik.Nordmark@Sun.COM } else { 330111042SErik.Nordmark@Sun.COM nce = ndp_nce_init(nexthop->ire_ill, 330211042SErik.Nordmark@Sun.COM &nexthop->ire_addr_v6, nexthop->ire_type); 330311042SErik.Nordmark@Sun.COM } 33042535Ssangeeta } 330511042SErik.Nordmark@Sun.COM if (nce == NULL) { 33062535Ssangeeta /* 330711042SErik.Nordmark@Sun.COM * Leave the old stale one in place to avoid a NULL 330811042SErik.Nordmark@Sun.COM * ire_nce_cache. 
33092535Ssangeeta */ 331011042SErik.Nordmark@Sun.COM ire_refrele(nexthop); 331111042SErik.Nordmark@Sun.COM return (ENOMEM); 331211042SErik.Nordmark@Sun.COM } 331311042SErik.Nordmark@Sun.COM 331411042SErik.Nordmark@Sun.COM if (nexthop != ire) { 331511042SErik.Nordmark@Sun.COM /* Update the nexthop ire */ 331611042SErik.Nordmark@Sun.COM mutex_enter(&nexthop->ire_lock); 331711042SErik.Nordmark@Sun.COM old_nce = nexthop->ire_nce_cache; 331811042SErik.Nordmark@Sun.COM if (!IRE_IS_CONDEMNED(nexthop)) { 331911042SErik.Nordmark@Sun.COM nce_refhold(nce); 332011042SErik.Nordmark@Sun.COM nexthop->ire_nce_cache = nce; 332111042SErik.Nordmark@Sun.COM } else { 332211042SErik.Nordmark@Sun.COM nexthop->ire_nce_cache = NULL; 332311042SErik.Nordmark@Sun.COM } 332411042SErik.Nordmark@Sun.COM mutex_exit(&nexthop->ire_lock); 332511042SErik.Nordmark@Sun.COM if (old_nce != NULL) 332611042SErik.Nordmark@Sun.COM nce_refrele(old_nce); 332711042SErik.Nordmark@Sun.COM } 332811042SErik.Nordmark@Sun.COM ire_refrele(nexthop); 332911042SErik.Nordmark@Sun.COM 333011042SErik.Nordmark@Sun.COM mutex_enter(&ire->ire_lock); 333111042SErik.Nordmark@Sun.COM old_nce = ire->ire_nce_cache; 333211042SErik.Nordmark@Sun.COM if (!IRE_IS_CONDEMNED(ire)) { 333311042SErik.Nordmark@Sun.COM nce_refhold(nce); 333411042SErik.Nordmark@Sun.COM ire->ire_nce_cache = nce; 33352535Ssangeeta } else { 333611042SErik.Nordmark@Sun.COM ire->ire_nce_cache = NULL; 33372535Ssangeeta } 333811042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 333911042SErik.Nordmark@Sun.COM if (old_nce != NULL) 334011042SErik.Nordmark@Sun.COM nce_refrele(old_nce); 334111042SErik.Nordmark@Sun.COM 334211042SErik.Nordmark@Sun.COM nce_refrele(nce); 33432535Ssangeeta return (0); 33442535Ssangeeta } 33457880SJonathan.Anderson@Sun.COM 33467880SJonathan.Anderson@Sun.COM /* 334711042SErik.Nordmark@Sun.COM * Get a held nce for a given ire. 334811042SErik.Nordmark@Sun.COM * In the common case this is just from ire_nce_cache. 
334911042SErik.Nordmark@Sun.COM * For IRE_MULTICAST this needs to do an explicit lookup since we do not 335011042SErik.Nordmark@Sun.COM * have an IRE_MULTICAST per address. 335111042SErik.Nordmark@Sun.COM * Note that this explicitly returns CONDEMNED NCEs. The caller needs those 335211042SErik.Nordmark@Sun.COM * so they can check whether the NCE went unreachable (as opposed to was 335311042SErik.Nordmark@Sun.COM * condemned for some other reason). 33547880SJonathan.Anderson@Sun.COM */ 335511042SErik.Nordmark@Sun.COM nce_t * 335611042SErik.Nordmark@Sun.COM ire_to_nce(ire_t *ire, ipaddr_t v4nexthop, const in6_addr_t *v6nexthop) 33577880SJonathan.Anderson@Sun.COM { 335811042SErik.Nordmark@Sun.COM nce_t *nce; 335911042SErik.Nordmark@Sun.COM 336011042SErik.Nordmark@Sun.COM if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) 33617880SJonathan.Anderson@Sun.COM return (NULL); 336211042SErik.Nordmark@Sun.COM 336311042SErik.Nordmark@Sun.COM /* ire_testhidden should only be set on under-interfaces */ 336411042SErik.Nordmark@Sun.COM ASSERT(!ire->ire_testhidden || !IS_IPMP(ire->ire_ill)); 336511042SErik.Nordmark@Sun.COM 336611042SErik.Nordmark@Sun.COM mutex_enter(&ire->ire_lock); 336711042SErik.Nordmark@Sun.COM nce = ire->ire_nce_cache; 336811042SErik.Nordmark@Sun.COM if (nce != NULL) { 336911042SErik.Nordmark@Sun.COM nce_refhold(nce); 337011042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 337111042SErik.Nordmark@Sun.COM return (nce); 33727880SJonathan.Anderson@Sun.COM } 337311042SErik.Nordmark@Sun.COM mutex_exit(&ire->ire_lock); 337411042SErik.Nordmark@Sun.COM 337511042SErik.Nordmark@Sun.COM if (ire->ire_type & IRE_MULTICAST) { 337611042SErik.Nordmark@Sun.COM ASSERT(ire->ire_ill != NULL); 337711042SErik.Nordmark@Sun.COM 337811042SErik.Nordmark@Sun.COM if (ire->ire_ipversion == IPV4_VERSION) { 337911042SErik.Nordmark@Sun.COM ASSERT(v6nexthop == NULL); 338011042SErik.Nordmark@Sun.COM 338111042SErik.Nordmark@Sun.COM nce = arp_nce_init(ire->ire_ill, v4nexthop, 
338211042SErik.Nordmark@Sun.COM ire->ire_type); 338311042SErik.Nordmark@Sun.COM } else { 338411042SErik.Nordmark@Sun.COM ASSERT(v6nexthop != NULL); 338511042SErik.Nordmark@Sun.COM ASSERT(v4nexthop == 0); 338611042SErik.Nordmark@Sun.COM nce = ndp_nce_init(ire->ire_ill, v6nexthop, 338711042SErik.Nordmark@Sun.COM ire->ire_type); 33887880SJonathan.Anderson@Sun.COM } 338911042SErik.Nordmark@Sun.COM return (nce); 33907880SJonathan.Anderson@Sun.COM } 33917880SJonathan.Anderson@Sun.COM return (NULL); 33927880SJonathan.Anderson@Sun.COM } 33937880SJonathan.Anderson@Sun.COM 339411042SErik.Nordmark@Sun.COM nce_t * 339511042SErik.Nordmark@Sun.COM ire_to_nce_pkt(ire_t *ire, mblk_t *mp) 339611042SErik.Nordmark@Sun.COM { 339711042SErik.Nordmark@Sun.COM ipha_t *ipha; 339811042SErik.Nordmark@Sun.COM ip6_t *ip6h; 339911042SErik.Nordmark@Sun.COM 340011042SErik.Nordmark@Sun.COM if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { 340111042SErik.Nordmark@Sun.COM ipha = (ipha_t *)mp->b_rptr; 340211042SErik.Nordmark@Sun.COM return (ire_to_nce(ire, ipha->ipha_dst, NULL)); 340311042SErik.Nordmark@Sun.COM } else { 340411042SErik.Nordmark@Sun.COM ip6h = (ip6_t *)mp->b_rptr; 340511042SErik.Nordmark@Sun.COM return (ire_to_nce(ire, INADDR_ANY, &ip6h->ip6_dst)); 340611042SErik.Nordmark@Sun.COM } 340711042SErik.Nordmark@Sun.COM } 340811042SErik.Nordmark@Sun.COM 34097880SJonathan.Anderson@Sun.COM /* 341011042SErik.Nordmark@Sun.COM * Given an IRE_INTERFACE (that matches more than one address) create 341111042SErik.Nordmark@Sun.COM * and return an IRE_IF_CLONE for the specific address. 341211042SErik.Nordmark@Sun.COM * Return the generation number. 341311042SErik.Nordmark@Sun.COM * Returns NULL is no memory for the IRE. 341411042SErik.Nordmark@Sun.COM * Handles both IPv4 and IPv6. 
34157880SJonathan.Anderson@Sun.COM */ 34167880SJonathan.Anderson@Sun.COM ire_t * 341711042SErik.Nordmark@Sun.COM ire_create_if_clone(ire_t *ire_if, const in6_addr_t *addr, uint_t *generationp) 34187880SJonathan.Anderson@Sun.COM { 341911042SErik.Nordmark@Sun.COM ire_t *ire; 342011042SErik.Nordmark@Sun.COM ire_t *nire; 342111042SErik.Nordmark@Sun.COM 342211042SErik.Nordmark@Sun.COM if (ire_if->ire_ipversion == IPV4_VERSION) { 342311042SErik.Nordmark@Sun.COM ipaddr_t v4addr; 342411042SErik.Nordmark@Sun.COM ipaddr_t mask = IP_HOST_MASK; 342511042SErik.Nordmark@Sun.COM 342611042SErik.Nordmark@Sun.COM ASSERT(IN6_IS_ADDR_V4MAPPED(addr)); 342711042SErik.Nordmark@Sun.COM IN6_V4MAPPED_TO_IPADDR(addr, v4addr); 342811042SErik.Nordmark@Sun.COM 342911042SErik.Nordmark@Sun.COM ire = ire_create( 343011042SErik.Nordmark@Sun.COM (uchar_t *)&v4addr, /* dest address */ 343111042SErik.Nordmark@Sun.COM (uchar_t *)&mask, /* mask */ 343211042SErik.Nordmark@Sun.COM (uchar_t *)&ire_if->ire_gateway_addr, 343311042SErik.Nordmark@Sun.COM IRE_IF_CLONE, /* IRE type */ 343411042SErik.Nordmark@Sun.COM ire_if->ire_ill, 343511042SErik.Nordmark@Sun.COM ire_if->ire_zoneid, 343611042SErik.Nordmark@Sun.COM ire_if->ire_flags | RTF_HOST, 343711042SErik.Nordmark@Sun.COM NULL, /* No security attr for IRE_IF_ALL */ 343811042SErik.Nordmark@Sun.COM ire_if->ire_ipst); 343911042SErik.Nordmark@Sun.COM } else { 344011042SErik.Nordmark@Sun.COM ASSERT(!IN6_IS_ADDR_V4MAPPED(addr)); 344111042SErik.Nordmark@Sun.COM ire = ire_create_v6( 344211042SErik.Nordmark@Sun.COM addr, /* dest address */ 344311042SErik.Nordmark@Sun.COM &ipv6_all_ones, /* mask */ 344411042SErik.Nordmark@Sun.COM &ire_if->ire_gateway_addr_v6, /* gateway addr */ 344511042SErik.Nordmark@Sun.COM IRE_IF_CLONE, /* IRE type */ 344611042SErik.Nordmark@Sun.COM ire_if->ire_ill, 344711042SErik.Nordmark@Sun.COM ire_if->ire_zoneid, 344811042SErik.Nordmark@Sun.COM ire_if->ire_flags | RTF_HOST, 344911042SErik.Nordmark@Sun.COM NULL, /* No security attr for 
IRE_IF_ALL */ 345011042SErik.Nordmark@Sun.COM ire_if->ire_ipst); 345111042SErik.Nordmark@Sun.COM } 345211042SErik.Nordmark@Sun.COM if (ire == NULL) 345311042SErik.Nordmark@Sun.COM return (NULL); 345411042SErik.Nordmark@Sun.COM 345511042SErik.Nordmark@Sun.COM /* Take the metrics, in particular the mtu, from the IRE_IF */ 345611042SErik.Nordmark@Sun.COM ire->ire_metrics = ire_if->ire_metrics; 345711042SErik.Nordmark@Sun.COM 345811042SErik.Nordmark@Sun.COM nire = ire_add(ire); 345911042SErik.Nordmark@Sun.COM if (nire == NULL) /* Some failure */ 346011042SErik.Nordmark@Sun.COM return (NULL); 346111042SErik.Nordmark@Sun.COM 346211042SErik.Nordmark@Sun.COM if (generationp != NULL) 346311042SErik.Nordmark@Sun.COM *generationp = nire->ire_generation; 346411042SErik.Nordmark@Sun.COM 346511042SErik.Nordmark@Sun.COM /* 346611042SErik.Nordmark@Sun.COM * Make sure races don't add a duplicate by 346711042SErik.Nordmark@Sun.COM * catching the case when an identical was returned. 346811042SErik.Nordmark@Sun.COM */ 346911042SErik.Nordmark@Sun.COM if (nire != ire) { 347011042SErik.Nordmark@Sun.COM ASSERT(nire->ire_identical_ref > 1); 347111042SErik.Nordmark@Sun.COM ire_delete(nire); 347211042SErik.Nordmark@Sun.COM } 347311042SErik.Nordmark@Sun.COM return (nire); 34747880SJonathan.Anderson@Sun.COM } 347511042SErik.Nordmark@Sun.COM 347611042SErik.Nordmark@Sun.COM /* 347711042SErik.Nordmark@Sun.COM * The argument is an IRE_INTERFACE. Delete all of IRE_IF_CLONE in the 347811042SErik.Nordmark@Sun.COM * ire_dep_children (just walk the ire_dep_sib_next since they are all 347911042SErik.Nordmark@Sun.COM * immediate children.) 348011042SErik.Nordmark@Sun.COM * Since we hold a lock while we remove them we need to defer the actual 348111042SErik.Nordmark@Sun.COM * calls to ire_delete() until we have dropped the lock. This makes things 348211042SErik.Nordmark@Sun.COM * less efficient since we restart at the top after dropping the lock. 
But 348311042SErik.Nordmark@Sun.COM * we only run when an IRE_INTERFACE is deleted which is infrquent. 348411042SErik.Nordmark@Sun.COM * 348511042SErik.Nordmark@Sun.COM * Note that ire_dep_children can be any mixture of offlink routes and 348611042SErik.Nordmark@Sun.COM * IRE_IF_CLONE entries. 348711042SErik.Nordmark@Sun.COM */ 348811042SErik.Nordmark@Sun.COM void 348911042SErik.Nordmark@Sun.COM ire_dep_delete_if_clone(ire_t *parent) 349011042SErik.Nordmark@Sun.COM { 349111042SErik.Nordmark@Sun.COM ip_stack_t *ipst = parent->ire_ipst; 349211042SErik.Nordmark@Sun.COM ire_t *child, *next; 349311042SErik.Nordmark@Sun.COM 349411042SErik.Nordmark@Sun.COM restart: 349511042SErik.Nordmark@Sun.COM rw_enter(&ipst->ips_ire_dep_lock, RW_READER); 349611042SErik.Nordmark@Sun.COM if (parent->ire_dep_children == NULL) { 349711042SErik.Nordmark@Sun.COM rw_exit(&ipst->ips_ire_dep_lock); 349811042SErik.Nordmark@Sun.COM return; 349911042SErik.Nordmark@Sun.COM } 350011042SErik.Nordmark@Sun.COM child = parent->ire_dep_children; 350111042SErik.Nordmark@Sun.COM while (child != NULL) { 350211042SErik.Nordmark@Sun.COM next = child->ire_dep_sib_next; 350311042SErik.Nordmark@Sun.COM if ((child->ire_type & IRE_IF_CLONE) && 350411042SErik.Nordmark@Sun.COM !IRE_IS_CONDEMNED(child)) { 350511042SErik.Nordmark@Sun.COM ire_refhold(child); 350611042SErik.Nordmark@Sun.COM rw_exit(&ipst->ips_ire_dep_lock); 350711042SErik.Nordmark@Sun.COM ire_delete(child); 350811042SErik.Nordmark@Sun.COM ASSERT(IRE_IS_CONDEMNED(child)); 350911042SErik.Nordmark@Sun.COM ire_refrele(child); 351011042SErik.Nordmark@Sun.COM goto restart; 351111042SErik.Nordmark@Sun.COM } 351211042SErik.Nordmark@Sun.COM child = next; 351311042SErik.Nordmark@Sun.COM } 351411042SErik.Nordmark@Sun.COM rw_exit(&ipst->ips_ire_dep_lock); 351511042SErik.Nordmark@Sun.COM } 351611042SErik.Nordmark@Sun.COM 351711042SErik.Nordmark@Sun.COM /* 351811042SErik.Nordmark@Sun.COM * ire_pref() is used in recursive route-resolution for a destination to 
351911042SErik.Nordmark@Sun.COM * determine the preference of an ire, where "preference" is determined 352011042SErik.Nordmark@Sun.COM * based on the level of indirection to the destination of the ire. 352111042SErik.Nordmark@Sun.COM * A higher preference indicates that fewer lookups are needed to complete 352211042SErik.Nordmark@Sun.COM * recursive route lookup. Thus 352311042SErik.Nordmark@Sun.COM * ire_pref(RTF_INDIRECT) < ire_pref(IRE_IF_RESOLVER) < ire_pref(IRE_PREF_CLONE) 352411042SErik.Nordmark@Sun.COM */ 352511042SErik.Nordmark@Sun.COM int 352611042SErik.Nordmark@Sun.COM ire_pref(ire_t *ire) 352711042SErik.Nordmark@Sun.COM { 352811042SErik.Nordmark@Sun.COM if (ire->ire_flags & RTF_INDIRECT) 352911042SErik.Nordmark@Sun.COM return (1); 353011042SErik.Nordmark@Sun.COM if (ire->ire_type & IRE_OFFLINK) 353111042SErik.Nordmark@Sun.COM return (2); 353211042SErik.Nordmark@Sun.COM if (ire->ire_type & (IRE_IF_RESOLVER|IRE_IF_NORESOLVER)) 353311042SErik.Nordmark@Sun.COM return (3); 353411042SErik.Nordmark@Sun.COM if (ire->ire_type & IRE_IF_CLONE) 353511042SErik.Nordmark@Sun.COM return (4); 353611042SErik.Nordmark@Sun.COM if (ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) 353711042SErik.Nordmark@Sun.COM return (5); 353811042SErik.Nordmark@Sun.COM return (-1); /* unknown ire_type */ 353911042SErik.Nordmark@Sun.COM } 3540