/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident   "%Z%%M% %I% %E% SMI"

/*
 * IP interface to squeues.
 *
 * IP creates an squeue instance for each CPU. The squeue pointer is saved in
 * the cpu_squeue field of the cpu structure. Each squeue is associated with a
 * connection instance (conn_t).
 *
 * For CPUs available at system startup time the squeue creation and
 * association with a CPU happens at MP initialization time. For CPUs added
 * during dynamic reconfiguration, the initialization happens when the new CPU
 * is configured in the system. The squeue is chosen using the IP_SQUEUE_GET
 * macro which will either return a per-CPU squeue or a random squeue based on
 * the ip_squeue_fanout variable.
 *
 * There are two modes of associating a connection with squeues. The first mode
 * associates each connection with the CPU that creates the connection (either
 * during open time or during accept time). The second mode associates each
 * connection with a random CPU, effectively distributing load over all CPUs
 * and all squeues in the system. The mode is controlled by the
 * ip_squeue_fanout variable.
 *
 * NOTE: The fact that there is an association between each connection and
 * squeue and squeue and CPU does not mean that each connection is always
 * processed on this CPU and on this CPU only. Any thread calling
 * squeue_enter() may process the connection on whatever CPU it is scheduled
 * on. The squeue to CPU binding is only relevant for the worker thread.
 *
 * The list of all created squeues is kept in the squeue_set structure. This
 * list is used when ip_squeue_fanout is set and the load is distributed
 * across all squeues.
 *
 * INTERFACE:
 *
 * squeue_t *ip_squeue_get(hint)
 *
 *      Find an squeue based on the 'hint' value. The hint is used as an index
 *      in the array of IP squeues available. The way hint is computed may
 *      affect the effectiveness of the squeue distribution. Currently squeues
 *      are assigned in round-robin fashion using lbolt as a hint.
 *
 *
 * DR Notes
 * ========
 *
 * ip_squeue_init() registers a call-back function with the CPU DR
 * subsystem using register_cpu_setup_func(). The call-back function does two
 * things:
 *
 *      o When the CPU is going off-line or unconfigured, the worker thread is
 *        unbound from the CPU. This allows the CPU unconfig code to move it to
 *        another CPU.
 *
 *      o When the CPU is going online, it creates a new squeue for this CPU if
 *        necessary and binds the squeue worker thread to this CPU.
 *
 * TUNABLES:
 *
 * ip_squeue_bind: if set to 1 each squeue worker thread is bound to the CPU
 *      associated with an squeue instance.
 *
 * ip_squeue_profile: if set to 1 squeue profiling is enabled. NOTE: squeue.c
 *      should be compiled with SQUEUE_PROFILE enabled for this variable to
 *      have an impact.
 *
 * ip_squeue_fanout: if set to 1 use ip_squeue_get() to find an squeue,
 *      otherwise get it from CPU->cpu_squeue.
 *
 * ip_squeue_bind, ip_squeue_profile and ip_squeue_fanout can be accessed and
 *      changed using ndd on /dev/tcp or /dev/ip.
 *
 * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 *      created. This is the time the squeue code waits before waking up the
 *      worker thread after queuing a request.
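 *
 * For example, the fanout tunable can be inspected and changed with ndd
 * (illustrative invocations only; see ndd(1M)):
 *
 *      # ndd -get /dev/ip ip_squeue_fanout
 *      # ndd -set /dev/ip ip_squeue_fanout 1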
 */

#include <sys/types.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/cpuvar.h>

#include <sys/cmn_err.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip_if.h>
#include <inet/mi.h>
#include <inet/nd.h>
#include <inet/ipclassifier.h>
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/sunddi.h>
#include <sys/ddi.h>
#include <sys/squeue_impl.h>


/*
 * We allow multiple NICs to bind to the same CPU but want to preserve 1 <-> 1
 * mapping between squeue and NIC (or Rx ring) for performance reasons so
 * each squeue can uniquely own a NIC or a Rx ring and do polling
 * (PSARC 2004/630). So we allow up to MAX_SQUEUES_PER_CPU squeues per CPU.
 * We start by creating MIN_SQUEUES_PER_CPU squeues per CPU but more squeues
 * can be created dynamically as needed.
 */
#define MAX_SQUEUES_PER_CPU     32
#define MIN_SQUEUES_PER_CPU     1
uint_t  ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;

#define IP_NUM_SOFT_RINGS       2
uint_t  ip_soft_rings_cnt = IP_NUM_SOFT_RINGS;

/*
 * List of all created squeue sets. The size is protected by cpu_lock
 */
squeue_set_t    **sqset_global_list;
uint_t          sqset_global_size;

int ip_squeue_bind = B_TRUE;
int ip_squeue_profile = B_TRUE;
static void (*ip_squeue_create_callback)(squeue_t *) = NULL;

/*
 * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 * created. This is the time the squeue code waits before waking up the worker
 * thread after queuing a request.
 */
uint_t ip_squeue_worker_wait = 10;

static squeue_set_t *ip_squeue_set_create(cpu_t *, boolean_t);
static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);

static void ip_squeue_set_bind(squeue_set_t *);
static void ip_squeue_set_unbind(squeue_set_t *);
static squeue_t *ip_find_unused_squeue(squeue_set_t *, cpu_t *, boolean_t);

#define CPU_ISON(c) (c != NULL && CPU_ACTIVE(c) && (c->cpu_flags & CPU_EXISTS))

/*
 * Create squeue set containing ip_squeues_per_cpu number of squeues
 * for this CPU and bind them all to the CPU.
 */
static squeue_set_t *
ip_squeue_set_create(cpu_t *cp, boolean_t reuse)
{
    int i;
    squeue_set_t *sqs;
    squeue_t *sqp;
    char sqname[64];
    processorid_t id = cp->cpu_id;

    if (reuse) {
        int i;

        /*
         * We may already have an squeue created for this CPU. Try to
         * find one and reuse it if possible.
         */
        for (i = 0; i < sqset_global_size; i++) {
            sqs = sqset_global_list[i];
            if (id == sqs->sqs_bind)
                return (sqs);
        }
    }

    sqs = kmem_zalloc(sizeof (squeue_set_t) +
        (sizeof (squeue_t *) * MAX_SQUEUES_PER_CPU), KM_SLEEP);
    mutex_init(&sqs->sqs_lock, NULL, MUTEX_DEFAULT, NULL);
    sqs->sqs_list = (squeue_t **)&sqs[1];
    sqs->sqs_max_size = MAX_SQUEUES_PER_CPU;
    sqs->sqs_bind = id;

    for (i = 0; i < ip_squeues_per_cpu; i++) {
        bzero(sqname, sizeof (sqname));

        (void) snprintf(sqname, sizeof (sqname),
            "ip_squeue_cpu_%d/%d/%d", cp->cpu_seqid,
            cp->cpu_id, i);

        sqp = squeue_create(sqname, id, ip_squeue_worker_wait,
            minclsyspri);

        /*
         * The first squeue in each squeue_set is the DEFAULT
         * squeue.
         */
        sqp->sq_state |= SQS_DEFAULT;

        ASSERT(sqp != NULL);

        squeue_profile_enable(sqp);
        sqs->sqs_list[sqs->sqs_size++] = sqp;

        if (ip_squeue_create_callback != NULL)
            ip_squeue_create_callback(sqp);
    }

    if (ip_squeue_bind && cpu_is_online(cp))
        ip_squeue_set_bind(sqs);

    sqset_global_list[sqset_global_size++] = sqs;
    ASSERT(sqset_global_size <= NCPU);
    return (sqs);
}

/*
 * Initialize IP squeues.
 */
void
ip_squeue_init(void (*callback)(squeue_t *))
{
    int i;

    ASSERT(sqset_global_list == NULL);

    if (ip_squeues_per_cpu < MIN_SQUEUES_PER_CPU)
        ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;
    else if (ip_squeues_per_cpu > MAX_SQUEUES_PER_CPU)
        ip_squeues_per_cpu = MAX_SQUEUES_PER_CPU;

    ip_squeue_create_callback = callback;
    squeue_init();
    sqset_global_list =
        kmem_zalloc(sizeof (squeue_set_t *) * NCPU, KM_SLEEP);
    sqset_global_size = 0;
    mutex_enter(&cpu_lock);

    /* Create squeue for each active CPU available */
    for (i = 0; i < NCPU; i++) {
        cpu_t *cp = cpu[i];
        if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL) {
            cp->cpu_squeue_set = ip_squeue_set_create(cp, B_FALSE);
        }
    }

    register_cpu_setup_func(ip_squeue_cpu_setup, NULL);

    mutex_exit(&cpu_lock);

    if (ip_squeue_profile)
        squeue_profile_start();
}

/*
 * Get squeue_t structure based on index.
 * Since the squeue list can only grow, no need to grab any lock.
 */
squeue_t *
ip_squeue_random(uint_t index)
{
    squeue_set_t *sqs;

    sqs = sqset_global_list[index % sqset_global_size];
    return (sqs->sqs_list[index % sqs->sqs_size]);
}

/* ARGSUSED */
void
ip_squeue_clean(void *arg1, mblk_t *mp, void *arg2)
{
    squeue_t *sqp = arg2;
    ill_rx_ring_t *ring = sqp->sq_rx_ring;
    ill_t *ill;

    ASSERT(sqp != NULL);

    if (ring == NULL) {
        return;
    }

    /*
     * Clean up squeue
     */
    mutex_enter(&sqp->sq_lock);
    sqp->sq_state &= ~(SQS_ILL_BOUND|SQS_POLL_CAPAB);
    sqp->sq_rx_ring = NULL;
    mutex_exit(&sqp->sq_lock);

    ill = ring->rr_ill;
    if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
        ASSERT(ring->rr_handle != NULL);
        ill->ill_dls_capab->ill_dls_unbind(ring->rr_handle);
    }

    /*
     * Cleanup the ring
     */

    ring->rr_blank = NULL;
    ring->rr_handle = NULL;
    ring->rr_sqp = NULL;

    /*
     * Signal ill that cleanup is done
     */
    mutex_enter(&ill->ill_lock);
    ring->rr_ring_state = ILL_RING_FREE;
    cv_signal(&ill->ill_cv);
    mutex_exit(&ill->ill_lock);
}

typedef struct ip_taskq_arg {
    ill_t           *ip_taskq_ill;
    ill_rx_ring_t   *ip_taskq_ill_rx_ring;
    cpu_t           *ip_taskq_cpu;
} ip_taskq_arg_t;

/*
 * Do a Rx ring to squeue binding. Find a unique squeue that is not
 * managing a receive ring. If no such squeue exists, dynamically
 * create a new one in the squeue set.
 *
 * The function runs via the system taskq. The ill passed as an
 * argument can't go away since we hold a ref. The lock order is
 * ill_lock -> sqs_lock -> sq_lock.
 *
 * If we are binding an Rx ring to a squeue attached to an offline CPU,
 * there is no need to check for that case because squeues are never
 * destroyed once created.
 */
/* ARGSUSED */
static void
ip_squeue_extend(void *arg)
{
    ip_taskq_arg_t *sq_arg = (ip_taskq_arg_t *)arg;
    ill_t *ill = sq_arg->ip_taskq_ill;
    ill_rx_ring_t *ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
    cpu_t *intr_cpu = sq_arg->ip_taskq_cpu;
    squeue_set_t *sqs;
    squeue_t *sqp = NULL;

    ASSERT(ill != NULL);
    ASSERT(ill_rx_ring != NULL);
    kmem_free(arg, sizeof (ip_taskq_arg_t));

    /*
     * Make sure the CPU that originally took the interrupt still
     * exists.
     */
    if (!CPU_ISON(intr_cpu))
        intr_cpu = CPU;

    sqs = intr_cpu->cpu_squeue_set;

    /*
     * If this ill represents link aggregation, then there might be
     * multiple NICs trying to register themselves at the same time
     * and in order to ensure that test and assignment of free rings
     * is sequential, we need to hold the ill_lock.
     */
    mutex_enter(&ill->ill_lock);
    sqp = ip_find_unused_squeue(sqs, intr_cpu, B_FALSE);
    if (sqp == NULL) {
        /*
         * We hit the max limit of squeues allowed per CPU.
         * Assign this rx_ring to the DEFAULT squeue of the
         * interrupted CPU, but the squeue will not manage
         * the ring. Also print a warning.
         */
        cmn_err(CE_NOTE, "ip_squeue_extend: CPU/sqset = %d/%p already "
            "has max number of squeues. System performance might "
            "become suboptimal\n", sqs->sqs_bind, (void *)sqs);

        /* the first squeue in the list is the default squeue */
        sqp = sqs->sqs_list[0];
        ASSERT(sqp != NULL);
        ill_rx_ring->rr_sqp = sqp;
        ill_rx_ring->rr_ring_state = ILL_RING_INUSE;

        mutex_exit(&ill->ill_lock);
        ill_waiter_dcr(ill);
        return;
    }

    ASSERT(MUTEX_HELD(&sqp->sq_lock));
    sqp->sq_rx_ring = ill_rx_ring;
    ill_rx_ring->rr_sqp = sqp;
    ill_rx_ring->rr_ring_state = ILL_RING_INUSE;

    sqp->sq_state |= (SQS_ILL_BOUND|SQS_POLL_CAPAB);
    mutex_exit(&sqp->sq_lock);

    mutex_exit(&ill->ill_lock);

    /* ill_waiter_dcr will also signal any waiters on ill_ring_state */
    ill_waiter_dcr(ill);
}

/*
 * Set up squeue/CPU affinity for the soft rings of an ill. The function
 * runs via the system taskq once the first interrupt has told us which
 * CPU is taking interrupts for the NIC. For each soft ring, find (or
 * create) an unused squeue, preferably on a CPU sharing the core with
 * the interrupted CPU, bind the ring to it and give the soft ring worker
 * thread the same CPU affinity.
 *
 * The ill passed as an argument can't go away since we hold a ref. The
 * lock order is ill_lock -> sqs_lock -> sq_lock.
 */
/* ARGSUSED */
static void
ip_squeue_soft_ring_affinity(void *arg)
{
    ip_taskq_arg_t *sq_arg = (ip_taskq_arg_t *)arg;
    ill_t *ill = sq_arg->ip_taskq_ill;
    ill_dls_capab_t *ill_soft_ring = ill->ill_dls_capab;
    ill_rx_ring_t *ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
    cpu_t *intr_cpu = sq_arg->ip_taskq_cpu;
    cpu_t *bind_cpu;
    int cpu_id = intr_cpu->cpu_id;
    int min_cpu_id, max_cpu_id;
    boolean_t enough_uniq_cpus = B_FALSE;
    boolean_t enough_cpus = B_FALSE;
    squeue_set_t *sqs, *last_sqs;
    squeue_t *sqp = NULL;
    int i, j;

    ASSERT(ill != NULL);
    kmem_free(arg, sizeof (ip_taskq_arg_t));

    /*
     * Make sure the CPU that originally took the interrupt still
     * exists.
     */
    if (!CPU_ISON(intr_cpu)) {
        intr_cpu = CPU;
        cpu_id = intr_cpu->cpu_id;
    }

    /*
     * If this ill represents link aggregation, then there might be
     * multiple NICs trying to register themselves at the same time
     * and in order to ensure that test and assignment of free rings
     * is sequential, we need to hold the ill_lock.
     */
    mutex_enter(&ill->ill_lock);

    if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
        mutex_exit(&ill->ill_lock);
        return;
    }
    /*
     * We need to fanout the interrupts from the NIC. We do that by
     * telling the driver underneath to create soft rings and use
     * worker threads (if the driver advertised the SOFT_RING capability).
     * It is still a big performance win if we can fanout to the
     * threads on the same core that is taking interrupts.
     *
     * Since we don't know the interrupt to CPU binding, we don't
     * assign any squeues or affinity to worker threads in the NIC.
     * At the time of the first interrupt, we know which CPU is
     * taking interrupts and try to find other threads on the same
     * core. Assuming ip_threads_per_cpu is correct and cpus are
     * numbered sequentially for each core (XXX need something better
     * than this in future), find the lowest number and highest
     * number thread for that core.
     *
     * If we have one more thread per core than the number of soft rings,
     * then don't assign any worker threads to the H/W thread (cpu)
     * taking interrupts (capability negotiation tries to ensure this).
     *
     * If the number of threads per core is the same as the number of
     * soft rings, then assign the worker affinity and squeue to
     * the same cpu.
     *
     * Otherwise, just fanout to higher number CPUs starting from
     * the interrupted CPU.
     */
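    /*
     * Illustrative example (numbers assumed for illustration only): with
     * ip_threads_per_cpu == 4 and the interrupt taken on cpu_id == 6,
     * min_cpu_id is (6 / 4) * 4 == 4 and max_cpu_id is 8, so CPUs 4
     * through 7 are the sibling candidates for soft ring fanout.
     */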

    min_cpu_id = (cpu_id / ip_threads_per_cpu) * ip_threads_per_cpu;
    max_cpu_id = min_cpu_id + ip_threads_per_cpu;

    cmn_err(CE_CONT, "soft_ring_affinity: min/max/intr = %d/%d/%d\n",
        min_cpu_id, max_cpu_id, (int)intr_cpu->cpu_id);

    /*
     * Quickly check if there are enough CPUs present for fanout and
     * also that max_cpu_id does not exceed the id of the highest CPU
     * that owns a squeue set. We use the cpu_id stored in the last
     * squeue_set to get an idea. The scheme is by no means perfect
     * since it doesn't take into account CPU DR operations and the
     * fact that interrupts themselves might change. An ideal scenario
     * would be to ensure that interrupts run on cpus by themselves
     * and worker threads never have affinity to those CPUs. If an
     * interrupt moves to a CPU which has a worker thread, the affinity
     * should be changed. Probably callbacks similar to CPU offline
     * are needed to make it work perfectly.
     */
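    /*
     * enough_uniq_cpus: strictly more sibling CPUs than soft rings, so
     * the interrupted CPU itself can be skipped during fanout.
     * enough_cpus: exactly enough sibling CPUs to cover the soft rings,
     * so the interrupted CPU has to be used as well.
     */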
    last_sqs = sqset_global_list[sqset_global_size - 1];
    if (ip_threads_per_cpu <= ncpus && max_cpu_id <= last_sqs->sqs_bind) {
        if ((max_cpu_id - min_cpu_id) >
            ill_soft_ring->ill_dls_soft_ring_cnt)
            enough_uniq_cpus = B_TRUE;
        else if ((max_cpu_id - min_cpu_id) >=
            ill_soft_ring->ill_dls_soft_ring_cnt)
            enough_cpus = B_TRUE;
    }

    j = 0;
    for (i = 0; i < (ill_soft_ring->ill_dls_soft_ring_cnt + j); i++) {
        if (enough_uniq_cpus) {
            if ((min_cpu_id + i) == cpu_id) {
                j++;
                continue;
            }
            bind_cpu = cpu[min_cpu_id + i];
        } else if (enough_cpus) {
            bind_cpu = cpu[min_cpu_id + i];
        } else {
            /* bind_cpu = cpu[(cpu_id + i) % last_sqs->sqs_bind]; */
            bind_cpu = cpu[(cpu_id + i) % ncpus];
        }

        /*
         * Check if the CPU actually exists and is active. If not,
         * use the interrupted CPU. ip_find_unused_squeue() will
         * find the right CPU to fanout anyway.
         */
        if (!CPU_ISON(bind_cpu))
            bind_cpu = intr_cpu;

        sqs = bind_cpu->cpu_squeue_set;
        ASSERT(sqs != NULL);
        ill_rx_ring = &ill_soft_ring->ill_ring_tbl[i - j];

        sqp = ip_find_unused_squeue(sqs, bind_cpu, B_TRUE);
        if (sqp == NULL) {
            /*
             * We hit the max limit of squeues allowed per CPU.
             * Assign this rx_ring to the DEFAULT squeue of the
             * interrupted CPU, but the squeue will not manage
             * the ring. Also print a warning.
             */
            cmn_err(CE_NOTE, "ip_squeue_soft_ring: CPU/sqset = "
                "%d/%p already has max number of squeues. System "
                "performance might become suboptimal\n",
                sqs->sqs_bind, (void *)sqs);

            /* the first squeue in the list is the default squeue */
            sqp = intr_cpu->cpu_squeue_set->sqs_list[0];
            ASSERT(sqp != NULL);

            ill_rx_ring->rr_sqp = sqp;
            ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
            continue;
        }
        ASSERT(MUTEX_HELD(&sqp->sq_lock));
        ill_rx_ring->rr_sqp = sqp;
        sqp->sq_rx_ring = ill_rx_ring;
        ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
        sqp->sq_state |= SQS_ILL_BOUND;

        /* assign affinity to soft ring */
        if (ip_squeue_bind && (sqp->sq_state & SQS_BOUND)) {
            ill_soft_ring->ill_dls_bind(ill_rx_ring->rr_handle,
                sqp->sq_bind);
        }
        mutex_exit(&sqp->sq_lock);

        cmn_err(CE_CONT, "soft_ring_affinity: ring = %d, bind = %d\n",
            i - j, sqp->sq_bind);
    }
    mutex_exit(&ill->ill_lock);

    ill_soft_ring->ill_dls_change_status(ill_soft_ring->ill_tx_handle,
        SOFT_RING_SRC_HASH);

    mutex_enter(&ill->ill_lock);
    ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
    mutex_exit(&ill->ill_lock);

    /* ill_waiter_dcr will also signal any waiters on ill_ring_state */
    ill_waiter_dcr(ill);
}

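/*
 * Receive-side entry point used while no soft ring affinity has been set up
 * for this ill yet. On the first interrupt, mark the ill with
 * ILL_SOFT_RING_ASSIGN and dispatch a taskq job that runs
 * ip_squeue_soft_ring_affinity(); the packet chain itself is always handed
 * on to ip_input(). If the taskq dispatch fails or no ill reference can be
 * obtained, the flag is cleared so the assignment is retried on a later
 * interrupt.
 */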
void
ip_soft_ring_assignment(ill_t *ill, ill_rx_ring_t *ip_ring,
    mblk_t *mp_chain, size_t hdrlen)
{
    ip_taskq_arg_t *taskq_arg;
    boolean_t refheld;

    ASSERT(servicing_interrupt());
    ASSERT(ip_ring == NULL);

    mutex_enter(&ill->ill_lock);
    if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
        taskq_arg = (ip_taskq_arg_t *)
            kmem_zalloc(sizeof (ip_taskq_arg_t), KM_NOSLEEP);

        if (taskq_arg == NULL) {
            mutex_exit(&ill->ill_lock);
            goto out;
        }

        taskq_arg->ip_taskq_ill = ill;
        taskq_arg->ip_taskq_ill_rx_ring = ip_ring;
        taskq_arg->ip_taskq_cpu = CPU;

        /*
         * Set the ILL_SOFT_RING_ASSIGN flag so that the next
         * interrupt does not schedule another task for calling
         * ip_squeue_soft_ring_affinity().
         */
        ill->ill_state_flags |= ILL_SOFT_RING_ASSIGN;
    } else {
        mutex_exit(&ill->ill_lock);
        goto out;
    }
    mutex_exit(&ill->ill_lock);
    refheld = ill_waiter_inc(ill);
    if (refheld) {
        if (taskq_dispatch(system_taskq,
            ip_squeue_soft_ring_affinity, taskq_arg, TQ_NOSLEEP))
            goto out;

        /* release ref on ill if taskq dispatch fails */
        ill_waiter_dcr(ill);
    }
    /*
     * Clear ILL_SOFT_RING_ASSIGN so that the affinity assignment
     * can be tried again on a later interrupt.
     */
    mutex_enter(&ill->ill_lock);
    ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
    mutex_exit(&ill->ill_lock);
    kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));

out:
    ip_input(ill, ip_ring, mp_chain, hdrlen);
}

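/*
 * Return an squeue from the given squeue_set that is not already managing
 * an Rx ring, creating a new one if necessary and allowed. If 'fanout' is
 * set and every squeue of the passed set is already managing a NIC, the
 * squeue_set with the fewest squeues in the system is used instead. Returns
 * NULL if the set has already reached its maximum number of squeues;
 * otherwise the chosen squeue is returned with its sq_lock held.
 */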
static squeue_t *
ip_find_unused_squeue(squeue_set_t *sqs, cpu_t *bind_cpu, boolean_t fanout)
{
    int i;
    squeue_set_t *best_sqs = NULL;
    squeue_set_t *curr_sqs = NULL;
    int min_sq = 0;
    squeue_t *sqp = NULL;
    char sqname[64];

    /*
     * If fanout is set and the passed squeue_set already has some
     * squeues which are managing the NICs, try to find squeues on
     * an unused CPU.
     */
    if (sqs->sqs_size > 1 && fanout) {
        /*
         * First check to see if any squeue on the CPU passed
         * is managing a NIC.
         */
        for (i = 0; i < sqs->sqs_size; i++) {
            mutex_enter(&sqs->sqs_list[i]->sq_lock);
            if ((sqs->sqs_list[i]->sq_state & SQS_ILL_BOUND) &&
                !(sqs->sqs_list[i]->sq_state & SQS_DEFAULT)) {
                mutex_exit(&sqs->sqs_list[i]->sq_lock);
                break;
            }
            mutex_exit(&sqs->sqs_list[i]->sq_lock);
        }
        if (i != sqs->sqs_size) {
            best_sqs = sqset_global_list[sqset_global_size - 1];
            min_sq = best_sqs->sqs_size;

            for (i = sqset_global_size - 2; i >= 0; i--) {
                curr_sqs = sqset_global_list[i];
                if (curr_sqs->sqs_size < min_sq) {
                    best_sqs = curr_sqs;
                    min_sq = curr_sqs->sqs_size;
                }
            }

            ASSERT(best_sqs != NULL);
            sqs = best_sqs;
            bind_cpu = cpu[sqs->sqs_bind];
        }
    }

    mutex_enter(&sqs->sqs_lock);

    for (i = 0; i < sqs->sqs_size; i++) {
        mutex_enter(&sqs->sqs_list[i]->sq_lock);
        if ((sqs->sqs_list[i]->sq_state &
            (SQS_DEFAULT|SQS_ILL_BOUND)) == 0) {
            sqp = sqs->sqs_list[i];
            break;
        }
        mutex_exit(&sqs->sqs_list[i]->sq_lock);
    }

    if (sqp == NULL) {
        /* Need to create a new squeue */
        if (sqs->sqs_size == sqs->sqs_max_size) {
            /*
             * Reached the max limit of squeues
             * we can allocate on this CPU.
             */
            mutex_exit(&sqs->sqs_lock);
            return (NULL);
        }

        bzero(sqname, sizeof (sqname));
        (void) snprintf(sqname, sizeof (sqname),
            "ip_squeue_cpu_%d/%d/%d", bind_cpu->cpu_seqid,
            bind_cpu->cpu_id, sqs->sqs_size);

        sqp = squeue_create(sqname, bind_cpu->cpu_id,
            ip_squeue_worker_wait, minclsyspri);

        ASSERT(sqp != NULL);

        squeue_profile_enable(sqp);
        sqs->sqs_list[sqs->sqs_size++] = sqp;

        if (ip_squeue_create_callback != NULL)
            ip_squeue_create_callback(sqp);

        mutex_enter(&cpu_lock);
        if (ip_squeue_bind && cpu_is_online(bind_cpu)) {
            squeue_bind(sqp, -1);
        }
        mutex_exit(&cpu_lock);

        mutex_enter(&sqp->sq_lock);
    }

    mutex_exit(&sqs->sqs_lock);
    ASSERT(sqp != NULL);
    return (sqp);
}

/*
 * Find the squeue assigned to manage this Rx ring. If the Rx ring is not
 * owned by a squeue yet, do the assignment. When the NIC registers its
 * Rx rings with IP, we don't know where the interrupts will land and
 * hence we need to wait till this point to do the assignment.
 */
squeue_t *
ip_squeue_get(ill_rx_ring_t *ill_rx_ring)
{
    squeue_t *sqp;
    ill_t *ill;
    int interrupt;
    ip_taskq_arg_t *taskq_arg;
    boolean_t refheld;

    if (ill_rx_ring == NULL)
        return (IP_SQUEUE_GET(lbolt));

    sqp = ill_rx_ring->rr_sqp;
    /*
     * Do a quick check. If it's not NULL, we are done.
     * Squeues are never destroyed so at worst we will bind
     * this connection to a suboptimal squeue.
     *
     * This is the fast path case.
     */
    if (sqp != NULL)
        return (sqp);

    ill = ill_rx_ring->rr_ill;
    ASSERT(ill != NULL);

    interrupt = servicing_interrupt();
    taskq_arg = (ip_taskq_arg_t *)kmem_zalloc(sizeof (ip_taskq_arg_t),
        KM_NOSLEEP);

    mutex_enter(&ill->ill_lock);
    if (!interrupt || ill_rx_ring->rr_ring_state != ILL_RING_INUSE ||
        taskq_arg == NULL) {
        /*
         * Do the ring to squeue binding only if we are in interrupt
         * context and there is no one else trying the bind already.
         */
        mutex_exit(&ill->ill_lock);
        if (taskq_arg != NULL)
            kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
        return (IP_SQUEUE_GET(lbolt));
    }

    /*
     * No sqp assigned yet. Can't really do that in interrupt
     * context. Assign the default sqp to this connection and
     * trigger creation of new sqp and binding it to this ring
     * via taskq. Need to make sure ill stays around.
     */
    taskq_arg->ip_taskq_ill = ill;
    taskq_arg->ip_taskq_ill_rx_ring = ill_rx_ring;
    taskq_arg->ip_taskq_cpu = CPU;
    ill_rx_ring->rr_ring_state = ILL_RING_INPROC;
    mutex_exit(&ill->ill_lock);
    refheld = ill_waiter_inc(ill);
    if (refheld) {
        if (taskq_dispatch(system_taskq, ip_squeue_extend,
            taskq_arg, TQ_NOSLEEP) != NULL) {
            return (IP_SQUEUE_GET(lbolt));
        }
    }
    /*
     * The ill is closing and we could not get a reference on the ill OR
     * taskq_dispatch failed probably due to memory allocation failure.
     * We will try again next time.
     */
    mutex_enter(&ill->ill_lock);
    ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
    mutex_exit(&ill->ill_lock);
    kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
    if (refheld)
        ill_waiter_dcr(ill);

    return (IP_SQUEUE_GET(lbolt));
}

/*
 * NDD hooks for setting ip_squeue_xxx tuneables.
 */

/* ARGSUSED */
int
ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value,
    caddr_t addr, cred_t *cr)
{
    int *bind_enabled = (int *)addr;
    long new_value;
    int i;

    if (ddi_strtol(value, NULL, 10, &new_value) != 0)
        return (EINVAL);

    if (ip_squeue_bind == new_value)
        return (0);

    *bind_enabled = new_value;
    mutex_enter(&cpu_lock);
    if (new_value == 0) {
        for (i = 0; i < sqset_global_size; i++)
            ip_squeue_set_unbind(sqset_global_list[i]);
    } else {
        for (i = 0; i < sqset_global_size; i++)
            ip_squeue_set_bind(sqset_global_list[i]);
    }

    mutex_exit(&cpu_lock);
    return (0);
}

/*
 * Set squeue profiling.
 * 0 means "disable"
 * 1 means "enable"
 * 2 means "enable and reset"
 */
/* ARGSUSED */
int
ip_squeue_profile_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
    cred_t *cr)
{
    int *profile_enabled = (int *)cp;
    long new_value;
    squeue_set_t *sqs;

    if (ddi_strtol(value, NULL, 10, &new_value) != 0)
        return (EINVAL);

    if (new_value == 0)
        squeue_profile_stop();
    else if (new_value == 1)
        squeue_profile_start();
    else if (new_value == 2) {
        int i, j;

        squeue_profile_stop();
        mutex_enter(&cpu_lock);
        for (i = 0; i < sqset_global_size; i++) {
            sqs = sqset_global_list[i];
            for (j = 0; j < sqs->sqs_size; j++) {
                squeue_profile_reset(sqs->sqs_list[j]);
            }
        }
        mutex_exit(&cpu_lock);

        new_value = 1;
        squeue_profile_start();
    }
    *profile_enabled = new_value;

    return (0);
}

/*
 * Reconfiguration callback
 */

/* ARGSUSED */
static int
ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg)
{
    cpu_t *cp = cpu[id];

    ASSERT(MUTEX_HELD(&cpu_lock));
    switch (what) {
    case CPU_CONFIG:
        /*
         * A new CPU is added. Create an squeue for it but do not bind
         * it yet.
         */
        if (cp->cpu_squeue_set == NULL)
            cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
        break;
    case CPU_ON:
    case CPU_INIT:
    case CPU_CPUPART_IN:
        if (cp->cpu_squeue_set == NULL) {
            cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
        }
        if (ip_squeue_bind)
            ip_squeue_set_bind(cp->cpu_squeue_set);
        break;
    case CPU_UNCONFIG:
    case CPU_OFF:
    case CPU_CPUPART_OUT:
        ASSERT((cp->cpu_squeue_set != NULL) ||
            (cp->cpu_flags & CPU_OFFLINE));

        if (cp->cpu_squeue_set != NULL) {
            ip_squeue_set_unbind(cp->cpu_squeue_set);
        }
        break;
    default:
        break;
    }
    return (0);
}

/* ARGSUSED */
static void
ip_squeue_set_bind(squeue_set_t *sqs)
{
    int i;
    squeue_t *sqp;

    if (!ip_squeue_bind)
        return;

    mutex_enter(&sqs->sqs_lock);
    for (i = 0; i < sqs->sqs_size; i++) {
        sqp = sqs->sqs_list[i];
        if (sqp->sq_state & SQS_BOUND)
            continue;
        squeue_bind(sqp, -1);
    }
    mutex_exit(&sqs->sqs_lock);
}

static void
ip_squeue_set_unbind(squeue_set_t *sqs)
{
    int i;
    squeue_t *sqp;

    mutex_enter(&sqs->sqs_lock);
    for (i = 0; i < sqs->sqs_size; i++) {
        sqp = sqs->sqs_list[i];

        /*
         * CPU is going offline. Remove the thread affinity
         * for any soft ring threads the squeue is managing.
         */
        if (sqp->sq_state & SQS_ILL_BOUND) {
            ill_rx_ring_t *ring = sqp->sq_rx_ring;
            ill_t *ill = ring->rr_ill;

            if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
                ASSERT(ring->rr_handle != NULL);
                ill->ill_dls_capab->ill_dls_unbind(
                    ring->rr_handle);
            }
        }
        if (!(sqp->sq_state & SQS_BOUND))
            continue;
        squeue_unbind(sqp);
    }
    mutex_exit(&sqs->sqs_lock);
}